From 1a7de64ff4a769e77a4be6bf12343feb10ed05be Mon Sep 17 00:00:00 2001
From: seanshpark <saehie.park@gmail.com>
Date: Thu, 8 Dec 2022 13:31:29 +0900
Subject: [PATCH 001/567] [mlir] Revise GetMlirOpNameFromOpCode for
 custom/while

Revise GetMlirOpNameFromOpCode so that the custom and while ops fall through
to the default name derived from EnumNameBuiltinOperator, since the tfl
dialect supports them.
---
 tensorflow/compiler/mlir/lite/flatbuffer_operator.cc | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/tensorflow/compiler/mlir/lite/flatbuffer_operator.cc b/tensorflow/compiler/mlir/lite/flatbuffer_operator.cc
index 3cb1edc466abce..9c050193775b68 100644
--- a/tensorflow/compiler/mlir/lite/flatbuffer_operator.cc
+++ b/tensorflow/compiler/mlir/lite/flatbuffer_operator.cc
@@ -65,15 +65,9 @@ StatusOr<mlir::StringAttr> GetPaddingAttr(TfLitePadding pad_params,
 std::string mlir::GetMlirOpNameFromOpCode(
     const tflite::OperatorCodeT& op_code) {
   auto builtin_code = tflite::GetBuiltinCode(&op_code);
-  if (builtin_code == tflite::BuiltinOperator_CUSTOM) {
-    return std::string("tfl.custom");
-  }
   if (builtin_code == tflite::BuiltinOperator_IF) {
     return std::string("tf.If");
   }
-  if (builtin_code == tflite::BuiltinOperator_WHILE) {
-    return std::string("tfl.while");
-  }
 
   llvm::StringRef op_name(tflite::EnumNameBuiltinOperator(builtin_code));
   return llvm::Twine("tfl.", op_name.lower()).str();

From 450ff271f086150a9c1cb595a364b24c0c1bae56 Mon Sep 17 00:00:00 2001
From: DongHak Park <donghak.park@samsung.com>
Date: Thu, 2 Feb 2023 15:25:38 +0900
Subject: [PATCH 002/567] Typo Error

in tensorflow/lite/schema/schema_utils.cc
in tensorflow/lite/python/lite.py
in tensorflow/compiler/mlir/lite/transforms/legalize_jax_random.cc

Fix trivial typos in the comments of the files listed above.
---
 .../compiler/mlir/lite/transforms/legalize_jax_random.cc      | 2 +-
 tensorflow/lite/python/lite.py                                | 2 +-
 tensorflow/lite/schema/schema_utils.cc                        | 4 ++--
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/tensorflow/compiler/mlir/lite/transforms/legalize_jax_random.cc b/tensorflow/compiler/mlir/lite/transforms/legalize_jax_random.cc
index db99805c92e0b3..3492e375a33d88 100644
--- a/tensorflow/compiler/mlir/lite/transforms/legalize_jax_random.cc
+++ b/tensorflow/compiler/mlir/lite/transforms/legalize_jax_random.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-// The full pipline of converting jax random include 2 steps.
+// The full pipeline of converting jax random include 2 steps.
 // 1. Rename the jax random functions to tflite wrapped functions with the aid
 //    of "jax.named_call". For example, in the dumped hlo, the
 //    jax.random.uniform will have name "tfl_wrapped_jax_random_uniform".
diff --git a/tensorflow/lite/python/lite.py b/tensorflow/lite/python/lite.py
index 2f112d1dd8933a..c7ba41519f1e96 100644
--- a/tensorflow/lite/python/lite.py
+++ b/tensorflow/lite/python/lite.py
@@ -1624,7 +1624,7 @@ def _convert_concrete_functions_to_saved_model(self, output_dir):
 
     # Without the provided trackable obj, it is not able to serialize the given
     # concrete functions as a saved model format. Also when trackable obj is
-    # a function, use the original concrete function conversion pipline.
+    # a function, use the original concrete function conversion pipeline.
     if not self._trackable_obj or isinstance(
         self._trackable_obj,
         (_function.ConcreteFunction, _def_function.Function),
diff --git a/tensorflow/lite/schema/schema_utils.cc b/tensorflow/lite/schema/schema_utils.cc
index fc19290b862777..285873de24d84e 100644
--- a/tensorflow/lite/schema/schema_utils.cc
+++ b/tensorflow/lite/schema/schema_utils.cc
@@ -21,7 +21,7 @@ limitations under the License.
 namespace tflite {
 
 // The following GetBuiltinCode methods are the utility methods for reading
-// builtin operatore code, ensuring compatibility issues between v3 and v3a
+// builtin operator code, ensuring compatibility issues between v3 and v3a
 // schema. Always the maximum value of the two fields always will be the correct
 // value as follows:
 //
@@ -29,7 +29,7 @@ namespace tflite {
 //
 // The `builtin_code` field is not available in the v3 models. Flatbuffer
 // library will feed zero value, which is the default value in the v3a schema.
-// The actual builtin operatore code value will exist in the
+// The actual builtin operator code value will exist in the
 // `deprecated_builtin_code` field. At the same time, it implies that
 // `deprecated_builtin_code` >= `builtin_code` and the maximum value of the two
 // fields will be same with `deprecated_builtin_code'.

From 3a979e53b401508e7cab3147d348ff3577b108a8 Mon Sep 17 00:00:00 2001
From: Kun-Lu <kun.lu@ibm.com>
Date: Mon, 27 Mar 2023 16:05:19 -0400
Subject: [PATCH 003/567] Add byte-swapping for TFLite String TensorType on
 s390x

Signed-off-by: Kun-Lu <kun.lu@ibm.com>
---
 .../mlir/lite/flatbuffer_to_string.cc         |  3 +-
 tensorflow/lite/core/model_builder.cc         | 37 +++++++++++++------
 tensorflow/lite/core/model_builder.h          | 17 ++++++---
 .../model_modifier/embedder_main.cc           |  7 ++--
 .../mini_benchmark/validator_test.cc          |  2 +-
 tensorflow/lite/tools/flatbuffer_utils.py     | 23 +++++++++++-
 6 files changed, 66 insertions(+), 23 deletions(-)

diff --git a/tensorflow/compiler/mlir/lite/flatbuffer_to_string.cc b/tensorflow/compiler/mlir/lite/flatbuffer_to_string.cc
index a51c4eaabf2c20..39c1c995050e9b 100644
--- a/tensorflow/compiler/mlir/lite/flatbuffer_to_string.cc
+++ b/tensorflow/compiler/mlir/lite/flatbuffer_to_string.cc
@@ -141,7 +141,8 @@ int main(int argc, char** argv) {
   std::string serialized_model;
   if (tflite::ReadAndVerify(argv[1], &serialized_model)) return 1;
 #if FLATBUFFERS_LITTLEENDIAN == 0
-  tflite::FlatBufferModel::ByteSwapSerializedModel(&serialized_model);
+  if (std::string(argv[1]) == "-")
+    tflite::FlatBufferModel::ByteSwapSerializedModel(&serialized_model, true);
 #endif
   tflite::ToString(serialized_model);
   return 0;
diff --git a/tensorflow/lite/core/model_builder.cc b/tensorflow/lite/core/model_builder.cc
index 3684892bfb04d7..0b3239f78cd9e7 100644
--- a/tensorflow/lite/core/model_builder.cc
+++ b/tensorflow/lite/core/model_builder.cc
@@ -107,16 +107,25 @@ std::unique_ptr<FlatBufferModel> FlatBufferModel::VerifyAndBuildFromBuffer(
 
 #if FLATBUFFERS_LITTLEENDIAN == 0
 
-void FlatBufferModel::ByteSwapSerializedModel(std::string* serialized_model) {
+void FlatBufferModel::ByteSwapSerializedModel(std::string* serialized_model,
+                                              bool from_big_endian) {
   const uint8_t* buffer =
       reinterpret_cast<const uint8_t*>(serialized_model->c_str());
   const tflite::Model* input_model = tflite::GetModel(buffer);
-  ByteSwapTFLiteModel(input_model);
+  ByteSwapTFLiteModel(input_model, from_big_endian);
 }
 
 void FlatBufferModel::ByteSwapBuffer(int8_t tensor_type, size_t buffer_size,
-                                     uint8_t* buffer) {
+                                     uint8_t* buffer, bool from_big_endian) {
   switch (tensor_type) {
+    case tflite::TensorType_STRING: {
+      auto bp = reinterpret_cast<int32_t*>(buffer);
+      int num_of_strings = 
+          from_big_endian ? bp[0] : flatbuffers::EndianSwap(bp[0]);
+      for (int i = 0; i < num_of_strings + 2; i++)
+        bp[i] = flatbuffers::EndianSwap(bp[i]);
+      break;
+    }
     // 16-bit types
     case tflite::TensorType_FLOAT16:
     case tflite::TensorType_INT16:
@@ -151,7 +160,8 @@ void FlatBufferModel::ByteSwapBuffer(int8_t tensor_type, size_t buffer_size,
   }
 }
 
-void FlatBufferModel::ByteSwapTFLiteModel(const tflite::Model* tfl_model) {
+void FlatBufferModel::ByteSwapTFLiteModel(const tflite::Model* tfl_model,
+                                          bool from_big_endian) {
   bool buffer_swapped[tfl_model->buffers()->size()] = {};
   for (size_t subgraph_idx = 0; subgraph_idx < tfl_model->subgraphs()->size();
        subgraph_idx++) {
@@ -167,7 +177,7 @@ void FlatBufferModel::ByteSwapTFLiteModel(const tflite::Model* tfl_model) {
         if (!buffer_ || !buffer_->data()) continue;
         auto* buffer = buffer_->data();
         uint8_t* buff_ = const_cast<uint8_t*>(buffer->data());
-        ByteSwapBuffer(tensor->type(), buffer->size(), buff_);
+        ByteSwapBuffer(tensor->type(), buffer->size(), buff_, from_big_endian);
         buffer_swapped[tensor->buffer()] = true;
       }
     }
@@ -175,21 +185,25 @@ void FlatBufferModel::ByteSwapTFLiteModel(const tflite::Model* tfl_model) {
 }
 
 std::unique_ptr<FlatBufferModel> FlatBufferModel::ByteConvertModel(
-    std::unique_ptr<FlatBufferModel> model, ErrorReporter* error_reporter) {
+    std::unique_ptr<FlatBufferModel> model, ErrorReporter* error_reporter,
+    bool from_big_endian) {
   if (model == nullptr) return model;
   auto tfl_model = model->GetModel();
   if (tfl_model->subgraphs()->size() == 0) return model;
   if (tfl_model->subgraphs()->Get(0)->tensors()->size() == 0) return model;
-  return ByteSwapFlatBufferModel(std::move(model), error_reporter);
+  if (tfl_model->buffers()->size() < 2) return model;
+  return ByteSwapFlatBufferModel(std::move(model), error_reporter,
+                                 from_big_endian);
 }
 
 std::unique_ptr<FlatBufferModel> FlatBufferModel::ByteSwapFlatBufferModel(
-    std::unique_ptr<FlatBufferModel> model, ErrorReporter* error_reporter) {
+    std::unique_ptr<FlatBufferModel> model, ErrorReporter* error_reporter,
+    bool from_big_endian) {
   FlatBufferModel* modelp = model.release();
   auto tflite_model = modelp->GetModel();
   auto copied_model = std::make_unique<tflite::ModelT>();
   tflite_model->UnPackTo(copied_model.get(), nullptr);
-  ByteSwapTFLiteModelT(copied_model.get());
+  ByteSwapTFLiteModelT(copied_model.get(), from_big_endian);
   std::unique_ptr<flatbuffers::FlatBufferBuilder> builder(
       new flatbuffers::FlatBufferBuilder());
   auto packed_model = tflite::Model::Pack(*builder, copied_model.get());
@@ -200,7 +214,8 @@ std::unique_ptr<FlatBufferModel> FlatBufferModel::ByteSwapFlatBufferModel(
       builder_->GetSize(), error_reporter);
 }
 
-void FlatBufferModel::ByteSwapTFLiteModelT(tflite::ModelT* tfl_modelt) {
+void FlatBufferModel::ByteSwapTFLiteModelT(tflite::ModelT* tfl_modelt,
+                                           bool from_big_endian) {
   size_t bytes_per_elem = 0;
   bool buffer_swapped[tfl_modelt->buffers.size()] = {};
   for (size_t subgraph_idx = 0; subgraph_idx < tfl_modelt->subgraphs.size();
@@ -213,7 +228,7 @@ void FlatBufferModel::ByteSwapTFLiteModelT(tflite::ModelT* tfl_modelt) {
         const auto* buffer = &(tfl_modelt->buffers[tensor->buffer].get()->data);
         if (buffer && buffer->data()) {
           uint8_t* buff_ = const_cast<uint8_t*>(buffer->data());
-          ByteSwapBuffer(tensor->type, buffer->size(), buff_);
+          ByteSwapBuffer(tensor->type, buffer->size(), buff_, from_big_endian);
           buffer_swapped[tensor->buffer] = true;
         }
       }
diff --git a/tensorflow/lite/core/model_builder.h b/tensorflow/lite/core/model_builder.h
index bcf5248d03cc47..a7202f133d5475 100644
--- a/tensorflow/lite/core/model_builder.h
+++ b/tensorflow/lite/core/model_builder.h
@@ -158,28 +158,33 @@ class FlatBufferModel {
 #if FLATBUFFERS_LITTLEENDIAN == 0
   /// Byte swap a constant buffer in place.
   static void ByteSwapBuffer(int8_t tensor_type, size_t buffer_size,
-                             uint8_t* buffer);
+                             uint8_t* buffer, bool from_big_endian = true);
 
   /// Byte swap the buffers field of a TFLite Model instance in place.
-  static void ByteSwapTFLiteModel(const tflite::Model* tfl_model);
+  static void ByteSwapTFLiteModel(const tflite::Model* tfl_model,
+                                  bool from_big_endian = true);
 
   /// Byte swap the buffers field of a TFLite ModelT instance in place.
-  static void ByteSwapTFLiteModelT(tflite::ModelT* tfl_modelt);
+  static void ByteSwapTFLiteModelT(tflite::ModelT* tfl_modelt,
+                                   bool from_big_endian = true);
 
   /// Convert the TFLite buffers field between LE and BE format in a
   /// FlatBufferModel which is not empty and return the converted instance.
   static std::unique_ptr<FlatBufferModel> ByteConvertModel(
       std::unique_ptr<FlatBufferModel> model,
-      ErrorReporter* error_reporter = DefaultErrorReporter());
+      ErrorReporter* error_reporter = DefaultErrorReporter(),
+      bool from_big_endian = false);
 
   /// Byte Swap the TFLite buffers field in a FlatBufferModel and return the
   /// swapped instance.
   static std::unique_ptr<FlatBufferModel> ByteSwapFlatBufferModel(
       std::unique_ptr<FlatBufferModel> model,
-      ErrorReporter* error_reporter = DefaultErrorReporter());
+      ErrorReporter* error_reporter = DefaultErrorReporter(),
+      bool from_big_endian = false);
 
   /// Byte Swap the serialized String of a TFLite model in place.
-  static void ByteSwapSerializedModel(std::string* serialized_model);
+  static void ByteSwapSerializedModel(std::string* serialized_model,
+                                      bool from_big_endian = true);
 #endif
 
   // Releases memory or unmaps mmaped memory.
diff --git a/tensorflow/lite/experimental/acceleration/mini_benchmark/model_modifier/embedder_main.cc b/tensorflow/lite/experimental/acceleration/mini_benchmark/model_modifier/embedder_main.cc
index 0287f3d1dabf7c..86608c7191f07f 100644
--- a/tensorflow/lite/experimental/acceleration/mini_benchmark/model_modifier/embedder_main.cc
+++ b/tensorflow/lite/experimental/acceleration/mini_benchmark/model_modifier/embedder_main.cc
@@ -73,7 +73,7 @@ int RunEmbedder(const EmbedderOptions& options) {
     return 3;
   }
 #if FLATBUFFERS_LITTLEENDIAN == 0
-  tflite::FlatBufferModel::ByteSwapSerializedModel(&main_model_contents);
+  tflite::FlatBufferModel::ByteSwapSerializedModel(&main_model_contents, false);
 #endif
   const Model* main_model =
       flatbuffers::GetRoot<Model>(main_model_contents.data());
@@ -87,7 +87,8 @@ int RunEmbedder(const EmbedderOptions& options) {
     return 4;
   }
 #if FLATBUFFERS_LITTLEENDIAN == 0
-  tflite::FlatBufferModel::ByteSwapSerializedModel(&metrics_model_contents);
+  tflite::FlatBufferModel::ByteSwapSerializedModel(&metrics_model_contents,
+                                                   false);
 #endif
   const Model* metrics_model =
       flatbuffers::GetRoot<Model>(metrics_model_contents.data());
@@ -134,7 +135,7 @@ int RunEmbedder(const EmbedderOptions& options) {
     return 7;
   }
 #if FLATBUFFERS_LITTLEENDIAN == 0
-  tflite::FlatBufferModel::ByteSwapSerializedModel(&binary);
+  tflite::FlatBufferModel::ByteSwapSerializedModel(&binary, true);
 #endif
   f << binary;
   f.close();
diff --git a/tensorflow/lite/experimental/acceleration/mini_benchmark/validator_test.cc b/tensorflow/lite/experimental/acceleration/mini_benchmark/validator_test.cc
index d51b84dd94ea9c..e6d756ce4244d8 100644
--- a/tensorflow/lite/experimental/acceleration/mini_benchmark/validator_test.cc
+++ b/tensorflow/lite/experimental/acceleration/mini_benchmark/validator_test.cc
@@ -121,7 +121,7 @@ TEST_F(ValidatorTest, HappyPathOnCpuWithCustomValidation) {
       reinterpret_cast<const char*>(model_with_input.GetBufferPointer()),
       model_with_input.GetSize());
 #if FLATBUFFERS_LITTLEENDIAN == 0
-  tflite::FlatBufferModel::ByteSwapSerializedModel(&serialized_str);
+  tflite::FlatBufferModel::ByteSwapSerializedModel(&serialized_str, true);
 #endif
   std::string model_path = MiniBenchmarkTestHelper::DumpToTempFile(
       "mobilenet_quant_with_input.tflite",
diff --git a/tensorflow/lite/tools/flatbuffer_utils.py b/tensorflow/lite/tools/flatbuffer_utils.py
index 6c8dcf0a031fce..48da3ff04827d1 100644
--- a/tensorflow/lite/tools/flatbuffer_utils.py
+++ b/tensorflow/lite/tools/flatbuffer_utils.py
@@ -297,6 +297,23 @@ def byte_swap_buffer_content(buffer, chunksize, from_endiness, to_endiness):
   )
 
 
+def byte_swap_string_content(buffer, from_endiness, to_endiness):
+  """Helper function for byte-swapping the string buffer.
+
+  Args:
+    buffer: TFLite string buffer of from_endiness format.
+    from_endiness: The original endianness format of the string buffer.
+    to_endiness: The destined endianness format of the string buffer.
+  """
+  num_of_strings = int.from_bytes(buffer.data[0:4], from_endiness)
+  string_content = bytearray(buffer.data[4*(num_of_strings+2):])
+  prefix_data = b''.join([int.from_bytes(
+    buffer.data[i:i+4], from_endiness).to_bytes(
+      4, to_endiness) for i in range(
+        0, (num_of_strings+1)*4+1, 4)])
+  buffer.data = prefix_data + string_content
+
+
 def byte_swap_tflite_model_obj(model, from_endiness, to_endiness):
   """Byte swaps the buffers field in a TFLite model.
 
@@ -334,7 +351,11 @@ def byte_swap_tflite_model_obj(model, from_endiness, to_endiness):
           and tensor.buffer not in buffer_swapped
           and model.buffers[tensor.buffer].data is not None
       ):
-        if tensor.type in types_of_16_bits:
+        if tensor.type == schema_fb.TensorType.STRING:
+          byte_swap_string_content(
+            model.buffers[tensor.buffer], from_endiness, to_endiness
+          )
+        elif tensor.type in types_of_16_bits:
           byte_swap_buffer_content(
               model.buffers[tensor.buffer], 2, from_endiness, to_endiness
           )

From 058656893d223956c9867c99634fbb223655f4e3 Mon Sep 17 00:00:00 2001
From: Feiyue Chen <Feiyue.Chen@verisilicon.com>
Date: Thu, 13 Apr 2023 10:31:07 +0800
Subject: [PATCH 004/567] get data ranges for missing types

---
 tensorflow/lite/tools/utils.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tensorflow/lite/tools/utils.cc b/tensorflow/lite/tools/utils.cc
index 0b43de6d779d74..846f76471f2ce1 100644
--- a/tensorflow/lite/tools/utils.cc
+++ b/tensorflow/lite/tools/utils.cc
@@ -153,8 +153,8 @@ void GetDataRangesForType(TfLiteType type, float* low_range,
       type == kTfLiteFloat64) {
     *low_range = -0.5f;
     *high_range = 0.5f;
-  } else if (type == kTfLiteInt64 || type == kTfLiteInt64 ||
-             type == kTfLiteInt64 || type == kTfLiteInt64) {
+  } else if (type == kTfLiteInt64 || type == kTfLiteUInt64 ||
+             type == kTfLiteInt32 || type == kTfLiteUInt32) {
     *low_range = 0;
     *high_range = 99;
   } else if (type == kTfLiteUInt8) {

From 30699a7f6c4d810dc26eabaec6c8ae732a1c2e12 Mon Sep 17 00:00:00 2001
From: pjpratik <118897289+pjpratik@users.noreply.github.com>
Date: Tue, 23 May 2023 22:04:44 +0530
Subject: [PATCH 005/567] Update dataset.py to be compatible with TF 2.x

The functions used in dataset.py are deprecated in TensorFlow 2.x. This commit updates the code to be compatible with the latest TF versions by replacing the deprecated functions with their TensorFlow 2.x equivalents.

Thanks.
---
 tensorflow/lite/tutorials/dataset.py | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/tensorflow/lite/tutorials/dataset.py b/tensorflow/lite/tutorials/dataset.py
index 92a669b4460ec5..bae3d17d473f31 100644
--- a/tensorflow/lite/tutorials/dataset.py
+++ b/tensorflow/lite/tutorials/dataset.py
@@ -36,7 +36,7 @@ def read32(bytestream):
 
 def check_image_file_header(filename):
   """Validate that filename corresponds to images for the MNIST dataset."""
-  with tf.gfile.Open(filename, 'rb') as f:
+  with tf.io.gfile.GFile(filename, 'rb') as f:
     magic = read32(f)
     read32(f)  # num_images, unused
     rows = read32(f)
@@ -63,17 +63,17 @@ def check_labels_file_header(filename):
 def download(directory, filename):
   """Download (and unzip) a file from the MNIST dataset if not already done."""
   filepath = os.path.join(directory, filename)
-  if tf.gfile.Exists(filepath):
+  if tf.io.gfile.exists(filepath):
     return filepath
-  if not tf.gfile.Exists(directory):
-    tf.gfile.MakeDirs(directory)
+  if not tf.io.gfile.exists(directory):
+    tf.io.gfile.makedirs(directory)
   # CVDF mirror of http://yann.lecun.com/exdb/mnist/
   url = 'https://storage.googleapis.com/cvdf-datasets/mnist/' + filename + '.gz'
   _, zipped_filepath = tempfile.mkstemp(suffix='.gz')
   print('Downloading %s to %s' % (url, zipped_filepath))
   urllib.request.urlretrieve(url, zipped_filepath)
   with gzip.open(zipped_filepath, 'rb') as f_in, \
-      tf.gfile.Open(filepath, 'wb') as f_out:
+      tf.io.gfile.GFile(filepath, 'wb') as f_out:
     shutil.copyfileobj(f_in, f_out)
   os.remove(zipped_filepath)
   return filepath
@@ -90,15 +90,15 @@ def dataset(directory, images_file, labels_file):
 
   def decode_image(image):
     # Normalize from [0, 255] to [0.0, 1.0]
-    image = tf.decode_raw(image, tf.uint8)
+    image = tf.io.decode_raw(image, tf.uint8)
     image = tf.cast(image, tf.float32)
     image = tf.reshape(image, [784])
     return image / 255.0
 
   def decode_label(label):
-    label = tf.decode_raw(label, tf.uint8)  # tf.string -> [tf.uint8]
+    label = tf.io.decode_raw(label, tf.uint8)  # tf.string -> [tf.uint8]
     label = tf.reshape(label, [])  # label is a scalar
-    return tf.to_int32(label)
+    return tf.cast(label,tf.int32)
 
   images = tf.data.FixedLengthRecordDataset(
       images_file, 28 * 28, header_bytes=16).map(decode_image)

From 4fa0913dfc9629a3b42c930938a4729440d23896 Mon Sep 17 00:00:00 2001
From: Kun-Lu <kun.lu@ibm.com>
Date: Wed, 7 Jun 2023 09:33:29 -0400
Subject: [PATCH 006/567] Remove the trailing space

Signed-off-by: Kun-Lu <kun.lu@ibm.com>
---
 tensorflow/lite/core/model_builder.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/lite/core/model_builder.cc b/tensorflow/lite/core/model_builder.cc
index 0b3239f78cd9e7..a4df232fb791e6 100644
--- a/tensorflow/lite/core/model_builder.cc
+++ b/tensorflow/lite/core/model_builder.cc
@@ -120,7 +120,7 @@ void FlatBufferModel::ByteSwapBuffer(int8_t tensor_type, size_t buffer_size,
   switch (tensor_type) {
     case tflite::TensorType_STRING: {
       auto bp = reinterpret_cast<int32_t*>(buffer);
-      int num_of_strings = 
+      int num_of_strings =
           from_big_endian ? bp[0] : flatbuffers::EndianSwap(bp[0]);
       for (int i = 0; i < num_of_strings + 2; i++)
         bp[i] = flatbuffers::EndianSwap(bp[i]);

From aa2a1ced77c5e47fb7e4cb2f80b768bfc7c63e6b Mon Sep 17 00:00:00 2001
From: Mike Corrigan <corrigan@motorola.com>
Date: Fri, 7 Oct 2022 22:30:51 -0500
Subject: [PATCH 007/567] Fix GPU compile errors (OpenCL and OpenGL)

Fix number of parameters in call to CLArguments::Init().
Add missing linkopts for EGL, math, and dlopen.

Signed-off-by: Mike Corrigan <corrigan@gmail.com>
---
 tensorflow/lite/delegates/gpu/cl/BUILD          |  1 +
 .../lite/delegates/gpu/cl/cl_arguments_test.cc  |  5 +++--
 tensorflow/lite/delegates/gpu/cl/testing/BUILD  |  7 +++++++
 tensorflow/lite/delegates/gpu/gl/BUILD          |  4 ++++
 tensorflow/lite/delegates/gpu/gl/kernels/BUILD  | 17 +++++++++++++----
 5 files changed, 28 insertions(+), 6 deletions(-)

diff --git a/tensorflow/lite/delegates/gpu/cl/BUILD b/tensorflow/lite/delegates/gpu/cl/BUILD
index d710059af90886..e9423c5b5dafcb 100644
--- a/tensorflow/lite/delegates/gpu/cl/BUILD
+++ b/tensorflow/lite/delegates/gpu/cl/BUILD
@@ -126,6 +126,7 @@ cc_test(
     deps = [
         ":buffer",
         ":cl_arguments",
+        ":cl_test",
         ":gpu_object",
         "//tensorflow/lite/delegates/gpu/common:gpu_info",
         "@com_google_absl//absl/strings",
diff --git a/tensorflow/lite/delegates/gpu/cl/cl_arguments_test.cc b/tensorflow/lite/delegates/gpu/cl/cl_arguments_test.cc
index 2a46c202286dd5..c9da770940968f 100644
--- a/tensorflow/lite/delegates/gpu/cl/cl_arguments_test.cc
+++ b/tensorflow/lite/delegates/gpu/cl/cl_arguments_test.cc
@@ -21,6 +21,7 @@ limitations under the License.
 #include <gtest/gtest.h>
 #include "absl/strings/match.h"
 #include "tensorflow/lite/delegates/gpu/cl/buffer.h"
+#include "tensorflow/lite/delegates/gpu/cl/cl_test.h"
 #include "tensorflow/lite/delegates/gpu/cl/gpu_object.h"
 #include "tensorflow/lite/delegates/gpu/common/gpu_info.h"
 
@@ -45,7 +46,7 @@ __kernel void main_function($0) {
 
   CLArguments cl_args;
   GpuInfo gpu_info;
-  ASSERT_OK(cl_args.Init(gpu_info, {}, nullptr, &args, &sample_code));
+  ASSERT_OK(cl_args.Init(gpu_info, nullptr, &args, &sample_code));
   EXPECT_TRUE(absl::StrContains(sample_code, "value = weights_buffer[id];"));
   EXPECT_TRUE(
       absl::StrContains(sample_code, "__global float4* weights_buffer"));
@@ -67,7 +68,7 @@ TEST(CLArgumentsTest, TestNoSelector) {
 )";
   CLArguments cl_args;
   GpuInfo gpu_info;
-  EXPECT_FALSE(cl_args.Init(gpu_info, {}, nullptr, &args, &sample_code).ok());
+  EXPECT_FALSE(cl_args.Init(gpu_info, nullptr, &args, &sample_code).ok());
 }
 
 }  // namespace cl
diff --git a/tensorflow/lite/delegates/gpu/cl/testing/BUILD b/tensorflow/lite/delegates/gpu/cl/testing/BUILD
index f5c26496875348..ef5daedddf5186 100644
--- a/tensorflow/lite/delegates/gpu/cl/testing/BUILD
+++ b/tensorflow/lite/delegates/gpu/cl/testing/BUILD
@@ -42,6 +42,13 @@ cc_test(
 cc_binary(
     name = "internal_api_samples",
     srcs = ["internal_api_samples.cc"],
+    linkopts = select({
+        "//tensorflow:android": [
+           "-lEGL",
+           "-lGLESv3",
+        ],
+        "//conditions:default": []
+    }),
     tags = [
         "nobuilder",
         "notap",
diff --git a/tensorflow/lite/delegates/gpu/gl/BUILD b/tensorflow/lite/delegates/gpu/gl/BUILD
index ff4c1663c0eec8..20e37c67b29f16 100644
--- a/tensorflow/lite/delegates/gpu/gl/BUILD
+++ b/tensorflow/lite/delegates/gpu/gl/BUILD
@@ -213,6 +213,7 @@ cc_test(
     linkopts = [
         "-lEGL",
         "-lGLESv2",
+        "-lm",
     ],
     tags = tf_gpu_tests_tags() + [
         "local",
@@ -446,6 +447,9 @@ cc_library(
 cc_test(
     name = "serialization_test",
     srcs = ["serialization_test.cc"],
+    linkopts = [
+        "-lm",
+    ],
     tags = [
         "local",
         "nobuilder",
diff --git a/tensorflow/lite/delegates/gpu/gl/kernels/BUILD b/tensorflow/lite/delegates/gpu/gl/kernels/BUILD
index c648b75cd0fd27..3abd79f5729431 100644
--- a/tensorflow/lite/delegates/gpu/gl/kernels/BUILD
+++ b/tensorflow/lite/delegates/gpu/gl/kernels/BUILD
@@ -45,6 +45,7 @@ cc_test(
     ],
     deps = [
         ":converter",
+        ":test_util",
         "//tensorflow/lite/delegates/gpu/common:convert",
         "//tensorflow/lite/delegates/gpu/common:shape",
         "//tensorflow/lite/delegates/gpu/common:status",
@@ -717,10 +718,18 @@ cc_library(
     testonly = 1,
     srcs = ["test_util.cc"],
     hdrs = ["test_util.h"],
-    linkopts = [
-        "-lEGL",
-        "-lGLESv3",
-    ],
+    linkopts = select({
+        "//tensorflow:android": [
+            "-lEGL",
+            "-lGLESv3",
+            "-ldl",
+            "-lm",
+        ],
+        "//conditions:default": [
+            "-lEGL",
+            "-lGLESv3",
+        ],
+    }),
     deps = [
         "//tensorflow/lite/delegates/gpu/common:model",
         "//tensorflow/lite/delegates/gpu/common:operations",

From c55a3fa55c1e2a26e7d5aba3637ddbde7cf4a4ee Mon Sep 17 00:00:00 2001
From: Mike Corrigan <corrigan@motorola.com>
Date: Thu, 8 Jun 2023 11:49:31 -0500
Subject: [PATCH 008/567] Refactor ReLU clip to activation_max

Refactor ReLUAttributes.clip to activation_max to
match TfLite activation naming conventions.

Signed-off-by: Mike Corrigan <corrigan@gmail.com>
---
 .../lite/delegates/gpu/common/model_builder.cc       | 12 ++++++------
 .../delegates/gpu/common/model_builder_helper.cc     |  2 +-
 tensorflow/lite/delegates/gpu/common/operations.h    | 12 ++++++------
 tensorflow/lite/delegates/gpu/common/tasks/relu.cc   |  8 ++++----
 .../delegates/gpu/common/tasks/relu_test_util.cc     |  8 ++++----
 tensorflow/lite/delegates/gpu/gl/kernels/relu.cc     |  8 ++++----
 .../lite/delegates/gpu/gl/kernels/relu_test.cc       |  8 ++++----
 7 files changed, 29 insertions(+), 29 deletions(-)

diff --git a/tensorflow/lite/delegates/gpu/common/model_builder.cc b/tensorflow/lite/delegates/gpu/common/model_builder.cc
index 9702d8f4a828a8..481df15c42d04f 100644
--- a/tensorflow/lite/delegates/gpu/common/model_builder.cc
+++ b/tensorflow/lite/delegates/gpu/common/model_builder.cc
@@ -427,8 +427,8 @@ class ClampOperationsParser : public TFLiteOperationParser {
     // We replace clamp(...) with sequence of elementwise ops:
     // substaction -> usual relu with alpha = 0.0 -> addition.
     // node_sub = v0 = v - a // add op (add -a)
-    // node_relu = v1 = clamp(v0, 0.0, clip); // relu op alpha = 0.0,
-    // clip = b - a;
+    // node_relu = v1 = clamp(v0, 0.0, activation_max); // relu op alpha = 0.0,
+    // activation_max = b - a;
     // node_add = v2 = v1 + a // add op (add a)
     Node* node_sub = graph->NewNode();
     Node* node_relu = graph->NewNode();
@@ -441,7 +441,7 @@ class ClampOperationsParser : public TFLiteOperationParser {
 
     ReLUAttributes relu_attr;
     relu_attr.alpha = 0.0f;
-    relu_attr.clip = clamp_b_ - clamp_a_;
+    relu_attr.activation_max = clamp_b_ - clamp_a_;
     node_relu->operation.type = ToString(OperationType::RELU);
     node_relu->operation.attributes = relu_attr;
 
@@ -1937,7 +1937,7 @@ class QuantizeOperationParser : public TFLiteOperationParser {
 
 class ReLUOperationParser : public TFLiteOperationParser {
  public:
-  explicit ReLUOperationParser(int clip) : clip_(clip) {}
+  explicit ReLUOperationParser(int activation_max) : activation_max_(activation_max) {}
 
   absl::Status IsSupported(const TfLiteContext* context,
                            const TfLiteNode* tflite_node,
@@ -1957,13 +1957,13 @@ class ReLUOperationParser : public TFLiteOperationParser {
     const TfLiteLeakyReluParams* tf_options;
     auto status = RetrieveBuiltinData(tflite_node, &tf_options);
     attr.alpha = status.ok() ? tf_options->alpha : 0;
-    attr.clip = clip_;
+    attr.activation_max = activation_max_;
     node->operation.attributes = attr;
     return reader->AddOutputs(node);
   }
 
  private:
-  const int clip_;
+  const int activation_max_;
 };
 
 class ResamplerOperationParser : public TFLiteOperationParser {
diff --git a/tensorflow/lite/delegates/gpu/common/model_builder_helper.cc b/tensorflow/lite/delegates/gpu/common/model_builder_helper.cc
index c825d2be691d5f..313bf7decfbe6b 100644
--- a/tensorflow/lite/delegates/gpu/common/model_builder_helper.cc
+++ b/tensorflow/lite/delegates/gpu/common/model_builder_helper.cc
@@ -397,7 +397,7 @@ absl::Status MaybeFuseActivation(TfLiteFusedActivation fused_activation,
     case kTfLiteActReluN1To1:
     case kTfLiteActRelu6: {
       ReLUAttributes attr;
-      attr.clip = fused_activation == kTfLiteActRelu
+      attr.activation_max = fused_activation == kTfLiteActRelu
                       ? 0.0f
                       : (fused_activation == kTfLiteActReluN1To1 ? 1.0f : 6.0f);
       Node* activation_node;
diff --git a/tensorflow/lite/delegates/gpu/common/operations.h b/tensorflow/lite/delegates/gpu/common/operations.h
index 37d8dfa0fcdc6f..71d6d1aa0f6025 100644
--- a/tensorflow/lite/delegates/gpu/common/operations.h
+++ b/tensorflow/lite/delegates/gpu/common/operations.h
@@ -392,16 +392,16 @@ Padding3D CalculateSamePadding(const BHWDC& input,
 
 // f(x):= {
 //   if x < 0  : x -> alpha * x
-//   if x >= 0 : x -> min(clip, x)
+//   if x >= 0 : x -> min(activation_max, x)
 // }
 //
 // Examples:
-//   - ReLU: clip = 0, alpha = 0
-//   - ReLU6: clip = 6, alpha = 0
-//   - Leaky ReLU: clip = 0, alpha = a
+//   - ReLU: activation_max = 0, alpha = 0
+//   - ReLU6: activation_max = 6, alpha = 0
+//   - Leaky ReLU: activation_max = 0, alpha = a
 struct ReLUAttributes {
-  // clip <= 0 mean it is not set.
-  float clip = 0;
+  // activation_max <= 0 mean it is not set.
+  float activation_max = 0;
 
   float alpha = 0;
 };
diff --git a/tensorflow/lite/delegates/gpu/common/tasks/relu.cc b/tensorflow/lite/delegates/gpu/common/tasks/relu.cc
index 9afc2c7508eabb..a3b7e008a08aa5 100644
--- a/tensorflow/lite/delegates/gpu/common/tasks/relu.cc
+++ b/tensorflow/lite/delegates/gpu/common/tasks/relu.cc
@@ -37,14 +37,14 @@ ElementwiseDescriptor CreateReLU(const ReLUAttributes& attr,
   } else {
     min_func = "INIT_FLT4(0.0f)";
   }
-  if (attr.clip != 0.0f) {
+  if (attr.activation_max != 0.0f) {
     if (precision == CalculationsPrecision::F32) {
-      result.args.AddFloat("clip", attr.clip);
+      result.args.AddFloat("activation_max", attr.activation_max);
     } else {
-      result.args.AddHalf("clip", half(attr.clip));
+      result.args.AddHalf("activation_max", half(attr.activation_max));
     }
     result.code = absl::StrCat("out_value = clamp(in_value, " + min_func +
-                               ", INIT_FLT4(args.clip));");
+                               ", INIT_FLT4(args.activation_max));");
   } else {
     result.code = absl::StrCat("out_value = max(in_value, ", min_func, ");");
   }
diff --git a/tensorflow/lite/delegates/gpu/common/tasks/relu_test_util.cc b/tensorflow/lite/delegates/gpu/common/tasks/relu_test_util.cc
index 7c87bbb7823c0b..fefd5d3cf5759e 100644
--- a/tensorflow/lite/delegates/gpu/common/tasks/relu_test_util.cc
+++ b/tensorflow/lite/delegates/gpu/common/tasks/relu_test_util.cc
@@ -33,7 +33,7 @@ absl::Status ReLUNoClipNoAlphaTest(TestExecutionEnvironment* env) {
 
   ReLUAttributes attr;
   attr.alpha = 0.0f;
-  attr.clip = 0.0f;
+  attr.activation_max = 0.0f;
 
   for (auto precision : env->GetSupportedPrecisions()) {
     auto data_type = DeduceDataTypeFromPrecision(precision);
@@ -62,7 +62,7 @@ absl::Status ReLUClipTest(TestExecutionEnvironment* env) {
 
   ReLUAttributes attr;
   attr.alpha = 0.0f;
-  attr.clip = 0.9f;
+  attr.activation_max = 0.9f;
 
   for (auto precision : env->GetSupportedPrecisions()) {
     auto data_type = DeduceDataTypeFromPrecision(precision);
@@ -91,7 +91,7 @@ absl::Status ReLUAlphaTest(TestExecutionEnvironment* env) {
 
   ReLUAttributes attr;
   attr.alpha = 0.5f;
-  attr.clip = 0.0f;
+  attr.activation_max = 0.0f;
 
   for (auto precision : env->GetSupportedPrecisions()) {
     auto data_type = DeduceDataTypeFromPrecision(precision);
@@ -120,7 +120,7 @@ absl::Status ReLUAlphaClipTest(TestExecutionEnvironment* env) {
 
   ReLUAttributes attr;
   attr.alpha = 0.5f;
-  attr.clip = 0.5f;
+  attr.activation_max = 0.5f;
 
   for (auto precision : env->GetSupportedPrecisions()) {
     auto data_type = DeduceDataTypeFromPrecision(precision);
diff --git a/tensorflow/lite/delegates/gpu/gl/kernels/relu.cc b/tensorflow/lite/delegates/gpu/gl/kernels/relu.cc
index 6d05ea894d4cfe..20051ae4195c85 100644
--- a/tensorflow/lite/delegates/gpu/gl/kernels/relu.cc
+++ b/tensorflow/lite/delegates/gpu/gl/kernels/relu.cc
@@ -39,7 +39,7 @@ class ReLU : public NodeShader {
   absl::Status GenerateCode(const GenerationContext& ctx,
                             GeneratedCode* generated_code) const final {
     const auto& attr = std::any_cast<const ReLUAttributes&>(ctx.op_attr);
-    // clamp(value, min(0, alpha * value), clip)
+    // clamp(value, min(0, alpha * value), activation_max)
     std::vector<Variable> params;
     std::string min;
     if (attr.alpha == 0) {
@@ -49,11 +49,11 @@ class ReLU : public NodeShader {
       params.push_back({"alpha", attr.alpha});
     }
     std::string code;
-    if (attr.clip == 0) {
+    if (attr.activation_max == 0) {
       code = "value_0 = max(value_0, " + min + ");";
     } else {
-      code = "value_0 = clamp(value_0, " + min + ", vec4($clip$));";
-      params.push_back({"clip", attr.clip});
+      code = "value_0 = clamp(value_0, " + min + ", vec4($activation_max$));";
+      params.push_back({"activation_max", attr.activation_max});
     }
     *generated_code = {
         /*parameters=*/std::move(params),
diff --git a/tensorflow/lite/delegates/gpu/gl/kernels/relu_test.cc b/tensorflow/lite/delegates/gpu/gl/kernels/relu_test.cc
index 0c2d2536fd654e..a1c54bfe69b761 100644
--- a/tensorflow/lite/delegates/gpu/gl/kernels/relu_test.cc
+++ b/tensorflow/lite/delegates/gpu/gl/kernels/relu_test.cc
@@ -45,7 +45,7 @@ class ReluTest : public ::testing::Test {
 TEST_F(ReluTest, Smoke) {
   OperationType op_type = OperationType::RELU;
   ReLUAttributes attr;
-  attr.clip = 0;
+  attr.activation_max = 0;
   attr.alpha = 0;
   SingleOpModel model({ToString(op_type), attr}, {GetTensorRef(0)},
                       {GetTensorRef(1)});
@@ -58,7 +58,7 @@ TEST_F(ReluTest, Smoke) {
 TEST_F(ReluTest, ClipOnly) {
   OperationType op_type = OperationType::RELU;
   ReLUAttributes attr;
-  attr.clip = 6;
+  attr.activation_max = 6;
   attr.alpha = 0;
   SingleOpModel model({ToString(op_type), attr}, {GetTensorRef(0)},
                       {GetTensorRef(1)});
@@ -71,7 +71,7 @@ TEST_F(ReluTest, ClipOnly) {
 TEST_F(ReluTest, AlphaOnly) {
   OperationType op_type = OperationType::RELU;
   ReLUAttributes attr;
-  attr.clip = 0;
+  attr.activation_max = 0;
   attr.alpha = 0.5;
   SingleOpModel model({ToString(op_type), attr}, {GetTensorRef(0)},
                       {GetTensorRef(1)});
@@ -84,7 +84,7 @@ TEST_F(ReluTest, AlphaOnly) {
 TEST_F(ReluTest, ClipAndAlpha) {
   OperationType op_type = OperationType::RELU;
   ReLUAttributes attr;
-  attr.clip = 6;
+  attr.activation_max = 6;
   attr.alpha = 0.5;
   SingleOpModel model({ToString(op_type), attr}, {GetTensorRef(0)},
                       {GetTensorRef(1)});

From 532124e333e9bcfe80209a1ddd437998760246ac Mon Sep 17 00:00:00 2001
From: Mike Corrigan <corrigan@gmail.com>
Date: Mon, 10 Oct 2022 21:15:02 -0500
Subject: [PATCH 009/567] Fix ReLUN1To1 fused activation for OpenCL and OpenGL

The ReLUN1To1 op was repurposed from the ReLU1 op.
However it was not fully reimplemented. As a result
it clipped to a minimum of 0 instead of -1.

Add a min clip attribute, activation_min, to the ReLU
implementation. Set to -1 for ReLUN1To1 and 0 for
all other ReLUs.

Use the ReLUOperationParser for ReLUN1To1 instead of
the ClampOperationsParser.

Signed-off-by: Mike Corrigan <corrigan@gmail.com>
---
 .../delegates/gpu/cl/kernels/relu_test.cc     |  21 +++
 .../delegates/gpu/common/model_builder.cc     |  13 +-
 .../gpu/common/model_builder_helper.cc        |   1 +
 .../lite/delegates/gpu/common/operations.h    |  17 ++-
 .../lite/delegates/gpu/common/tasks/relu.cc   |   7 +-
 .../gpu/common/tasks/relu_test_util.cc        | 125 ++++++++++++++++++
 .../gpu/common/tasks/relu_test_util.h         |   4 +
 .../lite/delegates/gpu/gl/kernels/relu.cc     |   3 +-
 .../delegates/gpu/gl/kernels/relu_test.cc     |  56 ++++++++
 9 files changed, 235 insertions(+), 12 deletions(-)

diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/relu_test.cc b/tensorflow/lite/delegates/gpu/cl/kernels/relu_test.cc
index 10a5c565c2437f..36984468fad976 100644
--- a/tensorflow/lite/delegates/gpu/cl/kernels/relu_test.cc
+++ b/tensorflow/lite/delegates/gpu/cl/kernels/relu_test.cc
@@ -46,6 +46,27 @@ TEST_F(OpenCLOperationTest, ReLUAlphaClip) {
   ASSERT_TRUE(status.ok()) << status.message();
 }
 
+
+TEST_F(OpenCLOperationTest, ReLULN1NoClipNoAlpha) {
+  auto status = ReLUN1NoClipNoAlphaTest(&exec_env_);
+  ASSERT_TRUE(status.ok()) << status;
+}
+
+TEST_F(OpenCLOperationTest, ReLUN1Clip) {
+  auto status = ReLUN1ClipTest(&exec_env_);
+  ASSERT_TRUE(status.ok()) << status;
+}
+
+TEST_F(OpenCLOperationTest, ReLULN1Alpha) {
+  auto status = ReLUN1AlphaTest(&exec_env_);
+  ASSERT_TRUE(status.ok()) << status;
+}
+
+TEST_F(OpenCLOperationTest, ReLUN1AlphaClip) {
+  auto status = ReLUN1AlphaClipTest(&exec_env_);
+  ASSERT_TRUE(status.ok()) << status;
+}
+
 }  // namespace cl
 }  // namespace gpu
 }  // namespace tflite
diff --git a/tensorflow/lite/delegates/gpu/common/model_builder.cc b/tensorflow/lite/delegates/gpu/common/model_builder.cc
index 481df15c42d04f..3b15bb9043db59 100644
--- a/tensorflow/lite/delegates/gpu/common/model_builder.cc
+++ b/tensorflow/lite/delegates/gpu/common/model_builder.cc
@@ -1937,7 +1937,8 @@ class QuantizeOperationParser : public TFLiteOperationParser {
 
 class ReLUOperationParser : public TFLiteOperationParser {
  public:
-  explicit ReLUOperationParser(int activation_max) : activation_max_(activation_max) {}
+  explicit ReLUOperationParser(int activation_min, int activation_max)
+    : activation_min_(activation_min), activation_max_(activation_max) {}
 
   absl::Status IsSupported(const TfLiteContext* context,
                            const TfLiteNode* tflite_node,
@@ -1957,12 +1958,14 @@ class ReLUOperationParser : public TFLiteOperationParser {
     const TfLiteLeakyReluParams* tf_options;
     auto status = RetrieveBuiltinData(tflite_node, &tf_options);
     attr.alpha = status.ok() ? tf_options->alpha : 0;
+    attr.activation_min = activation_min_;
     attr.activation_max = activation_max_;
     node->operation.attributes = attr;
     return reader->AddOutputs(node);
   }
 
  private:
+  const int activation_min_;
   const int activation_max_;
 };
 
@@ -3190,13 +3193,13 @@ std::unique_ptr<TFLiteOperationParser> NewOperationParser(
       }
       break;
     case kTfLiteBuiltinRelu:
-      return std::make_unique<ReLUOperationParser>(0);
+      return std::make_unique<ReLUOperationParser>(0, 0);
     case kTfLiteBuiltinRelu6:
-      return std::make_unique<ReLUOperationParser>(6);
+      return std::make_unique<ReLUOperationParser>(0, 6);
     case kTfLiteBuiltinReluN1To1:
-      return std::make_unique<ClampOperationsParser>(-1.0, 1.0);
+      return std::make_unique<ReLUOperationParser>(-1.0, 1.0);
     case kTfLiteBuiltinLeakyRelu:
-      return std::make_unique<ReLUOperationParser>(0);
+      return std::make_unique<ReLUOperationParser>(0, 0);
     case kTfLiteBuiltinPrelu:
       return std::make_unique<PReLUOperationParser>();
     case kTfLiteBuiltinReshape:
diff --git a/tensorflow/lite/delegates/gpu/common/model_builder_helper.cc b/tensorflow/lite/delegates/gpu/common/model_builder_helper.cc
index 313bf7decfbe6b..1772bc5d257907 100644
--- a/tensorflow/lite/delegates/gpu/common/model_builder_helper.cc
+++ b/tensorflow/lite/delegates/gpu/common/model_builder_helper.cc
@@ -400,6 +400,7 @@ absl::Status MaybeFuseActivation(TfLiteFusedActivation fused_activation,
       attr.activation_max = fused_activation == kTfLiteActRelu
                       ? 0.0f
                       : (fused_activation == kTfLiteActReluN1To1 ? 1.0f : 6.0f);
+      attr.activation_min = fused_activation == kTfLiteActReluN1To1 ? -1.0f : 0.0f;
       Node* activation_node;
       RETURN_IF_ERROR(
           NewPassthroughNode(graph, node, outputs[0], &activation_node));
diff --git a/tensorflow/lite/delegates/gpu/common/operations.h b/tensorflow/lite/delegates/gpu/common/operations.h
index 71d6d1aa0f6025..ac384bbe801ea2 100644
--- a/tensorflow/lite/delegates/gpu/common/operations.h
+++ b/tensorflow/lite/delegates/gpu/common/operations.h
@@ -391,18 +391,25 @@ Padding3D CalculateSamePadding(const BHWDC& input,
                                const DepthwiseConvolution3DAttributes& attr);
 
 // f(x):= {
-//   if x < 0  : x -> alpha * x
-//   if x >= 0 : x -> min(activation_max, x)
+//   lo = (alpha == 0) ? activation_min : min(alpha * x, 0)
+//   hi = (activation_max != 0) ? activation_max : +infinity
+//   x -> clamp(x, lo, hi)
+//   (activation_min is ignored when alpha != 0)
 // }
 //
 // Examples:
-//   - ReLU: activation_max = 0, alpha = 0
-//   - ReLU6: activation_max = 6, alpha = 0
-//   - Leaky ReLU: activation_max = 0, alpha = a
+//   - ReLU: activation_min = 0, activation_max = 0, alpha = 0
+//   - ReLU6: activation_min = 0, activation_max = 6, alpha = 0
+//   - Leaky ReLU: activation_min = 0, activation_max = 0, alpha = a
+//   - ReLUN1To1: activation_min = -1, activation_max = 1, alpha = 0
 struct ReLUAttributes {
+  // activation_min must be < activation_max
+  float activation_min = 0;
+
   // activation_max <= 0 mean it is not set.
   float activation_max = 0;
 
+  // alpha must be <= 1
   float alpha = 0;
 };
 
diff --git a/tensorflow/lite/delegates/gpu/common/tasks/relu.cc b/tensorflow/lite/delegates/gpu/common/tasks/relu.cc
index a3b7e008a08aa5..861ba496cc2b72 100644
--- a/tensorflow/lite/delegates/gpu/common/tasks/relu.cc
+++ b/tensorflow/lite/delegates/gpu/common/tasks/relu.cc
@@ -35,7 +35,12 @@ ElementwiseDescriptor CreateReLU(const ReLUAttributes& attr,
       result.args.AddHalf("alpha", half(attr.alpha));
     }
   } else {
-    min_func = "INIT_FLT4(0.0f)";
+    min_func = "INIT_FLT4(args.activation_min)";
+    if (precision == CalculationsPrecision::F32) {
+      result.args.AddFloat("activation_min", attr.activation_min);
+    } else {
+      result.args.AddHalf("activation_min", half(attr.activation_min));
+    }
   }
   if (attr.activation_max != 0.0f) {
     if (precision == CalculationsPrecision::F32) {
diff --git a/tensorflow/lite/delegates/gpu/common/tasks/relu_test_util.cc b/tensorflow/lite/delegates/gpu/common/tasks/relu_test_util.cc
index fefd5d3cf5759e..bd62c91ee26a7d 100644
--- a/tensorflow/lite/delegates/gpu/common/tasks/relu_test_util.cc
+++ b/tensorflow/lite/delegates/gpu/common/tasks/relu_test_util.cc
@@ -142,5 +142,130 @@ absl::Status ReLUAlphaClipTest(TestExecutionEnvironment* env) {
   return absl::OkStatus();
 }
 
+absl::Status ReLUN1NoClipNoAlphaTest(TestExecutionEnvironment* env) {
+  TensorFloat32 src_tensor;
+  src_tensor.shape = BHWC(1, 2, 1, 4);
+  src_tensor.data = {-12.0f, -1.0f, -0.5f, 0.0f, 0.8f, -0.6f, 1.0f, 3.2f};
+
+  ReLUAttributes attr;
+  attr.alpha = 0.0f;
+  attr.activation_min = -1.0f;
+  attr.activation_max = 0.0f;
+
+  for (auto precision : env->GetSupportedPrecisions()) {
+    auto data_type = DeduceDataTypeFromPrecision(precision);
+    for (auto storage : env->GetSupportedStorages(data_type)) {
+      const float eps = precision == CalculationsPrecision::F32 ? 1e-6f : 1e-3f;
+      OperationDef op_def;
+      op_def.precision = precision;
+      auto data_type = DeduceDataTypeFromPrecision(precision);
+      op_def.src_tensors.push_back({data_type, storage, Layout::HWC});
+      op_def.dst_tensors.push_back({data_type, storage, Layout::HWC});
+      TensorFloat32 dst_tensor;
+      GPUOperation operation = CreateReLU(op_def, attr);
+      RETURN_IF_ERROR(env->ExecuteGPUOperation(
+          src_tensor, absl::make_unique<GPUOperation>(std::move(operation)),
+          BHWC(1, 2, 1, 4), &dst_tensor));
+      RETURN_IF_ERROR(
+          PointWiseNear({-1.0f, -1.0f, -0.5f, 0.0f, 0.8f, -0.6f, 1.0f, 3.2f},
+              dst_tensor.data, eps));
+    }
+  }
+  return absl::OkStatus();
+}
+
+absl::Status ReLUN1ClipTest(TestExecutionEnvironment* env) {
+  TensorFloat32 src_tensor;
+  src_tensor.shape = BHWC(1, 2, 1, 4);
+  src_tensor.data = {-12.0f, -1.0f, -0.5f, 0.0f, 0.8f, -0.6f, 1.0f, 3.2f};
+
+  ReLUAttributes attr;
+  attr.alpha = 0.0f;
+  attr.activation_min = -1.0f;
+  attr.activation_max = 1.0f;
+
+  for (auto precision : env->GetSupportedPrecisions()) {
+    auto data_type = DeduceDataTypeFromPrecision(precision);
+    for (auto storage : env->GetSupportedStorages(data_type)) {
+      const float eps = precision == CalculationsPrecision::F32 ? 1e-6f : 1e-3f;
+      OperationDef op_def;
+      op_def.precision = precision;
+      auto data_type = DeduceDataTypeFromPrecision(precision);
+      op_def.src_tensors.push_back({data_type, storage, Layout::HWC});
+      op_def.dst_tensors.push_back({data_type, storage, Layout::HWC});
+      TensorFloat32 dst_tensor;
+      GPUOperation operation = CreateReLU(op_def, attr);
+      RETURN_IF_ERROR(env->ExecuteGPUOperation(
+          src_tensor, absl::make_unique<GPUOperation>(std::move(operation)),
+          BHWC(1, 2, 1, 4), &dst_tensor));
+      RETURN_IF_ERROR(PointWiseNear({-1.0f, -1.0f, -0.5f, 0.0f, 0.8f, -0.6f, 1.0f, 1.0f},
+          dst_tensor.data, eps));
+    }
+  }
+  return absl::OkStatus();
+}
+
+absl::Status ReLUN1AlphaTest(TestExecutionEnvironment* env) {
+  TensorFloat32 src_tensor;
+  src_tensor.shape = BHWC(1, 2, 1, 4);
+  src_tensor.data = {-12.0f, -1.0f, -0.5f, 0.0f, 0.8f, -0.6f, 1.0f, 3.2f};
+
+  ReLUAttributes attr;
+  attr.alpha = 1.0f;
+  attr.activation_min = -1.0f; // activation_min ignored if alpha != 0
+  attr.activation_max = 0.0f;
+
+  for (auto precision : env->GetSupportedPrecisions()) {
+    auto data_type = DeduceDataTypeFromPrecision(precision);
+    for (auto storage : env->GetSupportedStorages(data_type)) {
+      const float eps = precision == CalculationsPrecision::F32 ? 1e-6f : 1e-3f;
+      OperationDef op_def;
+      op_def.precision = precision;
+      auto data_type = DeduceDataTypeFromPrecision(precision);
+      op_def.src_tensors.push_back({data_type, storage, Layout::HWC});
+      op_def.dst_tensors.push_back({data_type, storage, Layout::HWC});
+      TensorFloat32 dst_tensor;
+      GPUOperation operation = CreateReLU(op_def, attr);
+      RETURN_IF_ERROR(env->ExecuteGPUOperation(
+          src_tensor, absl::make_unique<GPUOperation>(std::move(operation)),
+          BHWC(1, 2, 1, 4), &dst_tensor));
+      RETURN_IF_ERROR(PointWiseNear({-12.0f, -1.0f, -0.5f, 0.0f, 0.8f, -0.6f, 1.0f, 3.2f},
+          dst_tensor.data, eps));
+    }
+  }
+  return absl::OkStatus();
+}
+
+absl::Status ReLUN1AlphaClipTest(TestExecutionEnvironment* env) {
+  TensorFloat32 src_tensor;
+  src_tensor.shape = BHWC(1, 2, 1, 4);
+  src_tensor.data = {-12.0f, -1.0f, -0.5f, 0.0f, 0.8f, -0.6f, 1.0f, 3.2f};
+
+  ReLUAttributes attr;
+  attr.alpha = 1.0f;
+  attr.activation_min = -1.0f; // activation_min ignored if alpha != 0
+  attr.activation_max = 3.0f;
+
+  for (auto precision : env->GetSupportedPrecisions()) {
+    auto data_type = DeduceDataTypeFromPrecision(precision);
+    for (auto storage : env->GetSupportedStorages(data_type)) {
+      const float eps = precision == CalculationsPrecision::F32 ? 1e-6f : 1e-3f;
+      OperationDef op_def;
+      op_def.precision = precision;
+      auto data_type = DeduceDataTypeFromPrecision(precision);
+      op_def.src_tensors.push_back({data_type, storage, Layout::HWC});
+      op_def.dst_tensors.push_back({data_type, storage, Layout::HWC});
+      TensorFloat32 dst_tensor;
+      GPUOperation operation = CreateReLU(op_def, attr);
+      RETURN_IF_ERROR(env->ExecuteGPUOperation(
+          src_tensor, absl::make_unique<GPUOperation>(std::move(operation)),
+          BHWC(1, 2, 1, 4), &dst_tensor));
+      RETURN_IF_ERROR(PointWiseNear({-12.0f, -1.0f, -0.5f, 0.0f, 0.8f, -0.6f, 1.0f, 3.0f},
+          dst_tensor.data, eps));
+    }
+  }
+  return absl::OkStatus();
+}
+
 }  // namespace gpu
 }  // namespace tflite
diff --git a/tensorflow/lite/delegates/gpu/common/tasks/relu_test_util.h b/tensorflow/lite/delegates/gpu/common/tasks/relu_test_util.h
index 92ed2eea5cbcd8..8c0a7023048962 100644
--- a/tensorflow/lite/delegates/gpu/common/tasks/relu_test_util.h
+++ b/tensorflow/lite/delegates/gpu/common/tasks/relu_test_util.h
@@ -26,6 +26,10 @@ absl::Status ReLUNoClipNoAlphaTest(TestExecutionEnvironment* env);
 absl::Status ReLUClipTest(TestExecutionEnvironment* env);
 absl::Status ReLUAlphaTest(TestExecutionEnvironment* env);
 absl::Status ReLUAlphaClipTest(TestExecutionEnvironment* env);
+absl::Status ReLUN1NoClipNoAlphaTest(TestExecutionEnvironment* env);
+absl::Status ReLUN1ClipTest(TestExecutionEnvironment* env);
+absl::Status ReLUN1AlphaTest(TestExecutionEnvironment* env);
+absl::Status ReLUN1AlphaClipTest(TestExecutionEnvironment* env);
 
 }  // namespace gpu
 }  // namespace tflite
diff --git a/tensorflow/lite/delegates/gpu/gl/kernels/relu.cc b/tensorflow/lite/delegates/gpu/gl/kernels/relu.cc
index 20051ae4195c85..5bfd0517df68eb 100644
--- a/tensorflow/lite/delegates/gpu/gl/kernels/relu.cc
+++ b/tensorflow/lite/delegates/gpu/gl/kernels/relu.cc
@@ -43,7 +43,8 @@ class ReLU : public NodeShader {
     std::vector<Variable> params;
     std::string min;
     if (attr.alpha == 0) {
-      min = "vec4(0.0)";
+      min = "vec4($activation_min$)";
+      params.push_back({"activation_min", attr.activation_min});
     } else {
       min = "min($alpha$ * value_0, 0.0)";
       params.push_back({"alpha", attr.alpha});
diff --git a/tensorflow/lite/delegates/gpu/gl/kernels/relu_test.cc b/tensorflow/lite/delegates/gpu/gl/kernels/relu_test.cc
index a1c54bfe69b761..d32fd7489fa50c 100644
--- a/tensorflow/lite/delegates/gpu/gl/kernels/relu_test.cc
+++ b/tensorflow/lite/delegates/gpu/gl/kernels/relu_test.cc
@@ -94,6 +94,62 @@ TEST_F(ReluTest, ClipAndAlpha) {
               Pointwise(FloatNear(1e-6), {-3.0, 0.0, 2.0, 6.0}));
 }
 
+TEST_F(ReluTest, ReLUN1Smoke) {
+  OperationType op_type = OperationType::RELU;
+  ReLUAttributes attr;
+  attr.activation_min = -1;
+  attr.activation_max = 0;
+  attr.alpha = 0;
+  SingleOpModel model({ToString(op_type), attr}, {GetTensorRef(0)},
+                      {GetTensorRef(1)});
+  ASSERT_TRUE(model.PopulateTensor(0, {-12.0f, -0.5f, 0.8f, 3.2f}));
+  ASSERT_OK(model.Invoke(*NewReLUNodeShader()));
+  EXPECT_THAT(model.GetOutput(0),
+              Pointwise(FloatNear(1e-6), {-1.0f, -0.5f, 0.8f, 3.2f}) );
+}
+
+TEST_F(ReluTest, ReLUN1ClipOnly) {
+  OperationType op_type = OperationType::RELU;
+  ReLUAttributes attr;
+  attr.activation_min = -1;
+  attr.activation_max = 1;
+  attr.alpha = 0;
+  SingleOpModel model({ToString(op_type), attr}, {GetTensorRef(0)},
+                      {GetTensorRef(1)});
+  ASSERT_TRUE(model.PopulateTensor(0, {-12.0f, -0.5f, 0.8f, 3.2f}));
+  ASSERT_OK(model.Invoke(*NewReLUNodeShader()));
+  EXPECT_THAT(model.GetOutput(0),
+              Pointwise(FloatNear(1e-6), {-1.0f, -0.5f, 0.8f, 1.0f}));
+}
+
+TEST_F(ReluTest, ReLUN1AlphaOnly) {
+  OperationType op_type = OperationType::RELU;
+  ReLUAttributes attr;
+  attr.activation_min = -1; // activation_min ignored if alpha != 0
+  attr.activation_max = 0;
+  attr.alpha = 0.5;
+  SingleOpModel model({ToString(op_type), attr}, {GetTensorRef(0)},
+                      {GetTensorRef(1)});
+  ASSERT_TRUE(model.PopulateTensor(0, {-6.0, 0.0, 2.0, 8.0}));
+  ASSERT_OK(model.Invoke(*NewReLUNodeShader()));
+  EXPECT_THAT(model.GetOutput(0),
+              Pointwise(FloatNear(1e-6), {-3.0, 0.0, 2.0, 8.0}));
+}
+
+TEST_F(ReluTest, ReLUN1ClipAndAlpha) {
+  OperationType op_type = OperationType::RELU;
+  ReLUAttributes attr;
+  attr.activation_min = -1; // activation_min ignored if alpha != 0
+  attr.activation_max = 6;
+  attr.alpha = 0.5;
+  SingleOpModel model({ToString(op_type), attr}, {GetTensorRef(0)},
+                      {GetTensorRef(1)});
+  ASSERT_TRUE(model.PopulateTensor(0, {-6.0, 0.0, 2.0, 8.0}));
+  ASSERT_OK(model.Invoke(*NewReLUNodeShader()));
+  EXPECT_THAT(model.GetOutput(0),
+              Pointwise(FloatNear(1e-6), {-3.0, 0.0, 2.0, 6.0}));
+}
+
 }  // namespace
 }  // namespace gl
 }  // namespace gpu

From 34cb72d9d6fc3c0bc585048b3f419cf097f1c355 Mon Sep 17 00:00:00 2001
From: Kun-Lu <kun.lu@ibm.com>
Date: Wed, 12 Jul 2023 08:55:40 -0400
Subject: [PATCH 010/567] Add comments on using "-"

Signed-off-by: Kun-Lu <kun.lu@ibm.com>
---
 tensorflow/compiler/mlir/lite/flatbuffer_to_string.cc | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tensorflow/compiler/mlir/lite/flatbuffer_to_string.cc b/tensorflow/compiler/mlir/lite/flatbuffer_to_string.cc
index 39c1c995050e9b..b3e7e8e633e0da 100644
--- a/tensorflow/compiler/mlir/lite/flatbuffer_to_string.cc
+++ b/tensorflow/compiler/mlir/lite/flatbuffer_to_string.cc
@@ -141,6 +141,8 @@ int main(int argc, char** argv) {
   std::string serialized_model;
   if (tflite::ReadAndVerify(argv[1], &serialized_model)) return 1;
 #if FLATBUFFERS_LITTLEENDIAN == 0
+  // If the flatbuffer model comes from stdin, convert its tensor content from
+  // BE to LE to ensure the output text string is the same as on LE platforms.
   if (std::string(argv[1]) == "-")
     tflite::FlatBufferModel::ByteSwapSerializedModel(&serialized_model, true);
 #endif

From 2ee38c91ba5e8fd27d89c0a15838d720944e7d87 Mon Sep 17 00:00:00 2001
From: Harshavardhan Bellamkonda <harshavardhan_b@srmap.edu.in>
Date: Fri, 18 Aug 2023 20:27:38 +0530
Subject: [PATCH 011/567] Add CPU and GPU behavior notice and example to
 tf.nn.embedding_lookup function

---
 tensorflow/python/ops/embedding_ops.py | 28 ++++++++++++++++++++++++++
 1 file changed, 28 insertions(+)

diff --git a/tensorflow/python/ops/embedding_ops.py b/tensorflow/python/ops/embedding_ops.py
index 2fbd2643da45d1..3fbc582ead68f2 100644
--- a/tensorflow/python/ops/embedding_ops.py
+++ b/tensorflow/python/ops/embedding_ops.py
@@ -322,6 +322,34 @@ def embedding_lookup(
   Raises:
     ValueError: If `params` is empty.
   """
+  """
+    **Behavior Difference between CPU and GPU**
+
+    Please note that when using `tf.nn.embedding_lookup` on a GPU, if an out-of-bound index is encountered, a value of 0 will be stored in the corresponding output value. 
+    On the other hand, when using `tf.nn.embedding_lookup` on a CPU, an error will be returned if an out-of-bound index is found.
+
+    This behavior difference can impact the results of your computation, especially when dealing with indices that may go beyond the bounds of the tensor. 
+    Make sure to be mindful of this distinction when using the `tf.nn.embedding_lookup` function in your computations.
+
+    **Usage Example**
+
+    Here's an example demonstrating how to use `tf.nn.embedding_lookup`:
+
+    ```python
+    import tensorflow as tf
+
+    # Example embedding matrix and indices
+    embedding_matrix = tf.constant([[0.1, 0.2], [0.3, 0.4], [0.5, 0.6]])
+    indices = tf.constant([1, 0, 2])
+
+    # Perform embedding lookup
+    embeddings = tf.nn.embedding_lookup(embedding_matrix, indices)
+
+    # Print the result
+    print("Embeddings:")
+    print(embeddings.numpy())
+    ```
+    """
 
   return _embedding_lookup_and_transform(
       params=params,

From 616aa8414f58f5477e6350bf53372c9a47042765 Mon Sep 17 00:00:00 2001
From: Harshavardhan Bellamkonda <harshavardhan_b@srmap.edu.in>
Date: Tue, 22 Aug 2023 20:47:32 +0530
Subject: [PATCH 012/567] Updated the file with necessary changes

---
 tensorflow/python/ops/embedding_ops.py | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/tensorflow/python/ops/embedding_ops.py b/tensorflow/python/ops/embedding_ops.py
index 3fbc582ead68f2..37be47304ea8a2 100644
--- a/tensorflow/python/ops/embedding_ops.py
+++ b/tensorflow/python/ops/embedding_ops.py
@@ -322,14 +322,20 @@ def embedding_lookup(
   Raises:
     ValueError: If `params` is empty.
   """
+  
+  
   """
     **Behavior Difference between CPU and GPU**
 
-    Please note that when using `tf.nn.embedding_lookup` on a GPU, if an out-of-bound index is encountered, a value of 0 will be stored in the corresponding output value. 
-    On the other hand, when using `tf.nn.embedding_lookup` on a CPU, an error will be returned if an out-of-bound index is found.
+    Please note that when using `tf.nn.embedding_lookup` on a GPU, if an out-of-bound 
+    index is encountered, a value of 0 will be stored in the corresponding output value. 
+    On the other hand, when using `tf.nn.embedding_lookup` on a CPU, an error will be 
+    returned if an out-of-bound index is found.
 
-    This behavior difference can impact the results of your computation, especially when dealing with indices that may go beyond the bounds of the tensor. 
-    Make sure to be mindful of this distinction when using the `tf.nn.embedding_lookup` function in your computations.
+    This behavior difference can impact the results of your computation, especially when 
+    dealing with indices that may go beyond the bounds of the tensor. 
+    Make sure to be mindful of this distinction when using the `tf.nn.embedding_lookup` 
+    function in your computations.
 
     **Usage Example**
 

From a94b7e777a2c2621aa80e7e15ae9e45c6bf2b5cd Mon Sep 17 00:00:00 2001
From: weihanmines <wei.han3@amd.com>
Date: Fri, 11 Aug 2023 19:14:45 +0000
Subject: [PATCH 013/567] put rocm config back in bazelrc

---
 .bazelrc | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/.bazelrc b/.bazelrc
index aad8e480ec4741..f341501b528b69 100644
--- a/.bazelrc
+++ b/.bazelrc
@@ -39,6 +39,7 @@
 #     tpu:          Build TF with TPU support
 #     cuda:         Build with CUDA support.
 #     cuda_clang    Build with CUDA Clang support.
+#     rocm:         Build with AMD GPU support (rocm)
 #     mkl:          Enable full mkl support.
 #     tensorrt:     Enable Tensorrt support.
 #     noaws:        Disable AWS S3 storage support
@@ -274,6 +275,11 @@ build:tpu --define=with_tpu_support=true
 
 build:tensorrt --repo_env TF_NEED_TENSORRT=1
 
+build:rocm --crosstool_top=@local_config_rocm//crosstool:toolchain
+build:rocm --define=using_rocm_hipcc=true
+build:rocm --define=tensorflow_mkldnn_contraction_kernel=0
+build:rocm --repo_env TF_NEED_ROCM=1
+
 # Options to disable default on features
 build:noaws --define=no_aws_support=true
 build:nogcp --define=no_gcp_support=true
@@ -579,4 +585,5 @@ build:release_gpu_windows --config=release_gpu_base
 build:android --config=no_tfrt
 build:macos   --config=no_tfrt
 build:windows --config=no_tfrt
+build:rocm --config=no_tfrt
 build:no_tfrt --deleted_packages=tensorflow/compiler/mlir/tfrt,tensorflow/compiler/mlir/tfrt/benchmarks,tensorflow/compiler/mlir/tfrt/ir,tensorflow/compiler/mlir/tfrt/ir/mlrt,tensorflow/compiler/mlir/tfrt/jit/python_binding,tensorflow/compiler/mlir/tfrt/jit/transforms,tensorflow/compiler/mlir/tfrt/python_tests,tensorflow/compiler/mlir/tfrt/tests,tensorflow/compiler/mlir/tfrt/tests/mlrt,tensorflow/compiler/mlir/tfrt/tests/ir,tensorflow/compiler/mlir/tfrt/tests/analysis,tensorflow/compiler/mlir/tfrt/tests/jit,tensorflow/compiler/mlir/tfrt/tests/lhlo_to_tfrt,tensorflow/compiler/mlir/tfrt/tests/lhlo_to_jitrt,tensorflow/compiler/mlir/tfrt/tests/tf_to_corert,tensorflow/compiler/mlir/tfrt/tests/tf_to_tfrt_data,tensorflow/compiler/mlir/tfrt/tests/saved_model,tensorflow/compiler/mlir/tfrt/transforms/lhlo_gpu_to_tfrt_gpu,tensorflow/compiler/mlir/tfrt/transforms/mlrt,tensorflow/core/runtime_fallback,tensorflow/core/runtime_fallback/conversion,tensorflow/core/runtime_fallback/kernel,tensorflow/core/runtime_fallback/opdefs,tensorflow/core/runtime_fallback/runtime,tensorflow/core/runtime_fallback/util,tensorflow/core/runtime_fallback/test,tensorflow/core/runtime_fallback/test/gpu,tensorflow/core/runtime_fallback/test/saved_model,tensorflow/core/runtime_fallback/test/testdata,tensorflow/core/tfrt/stubs,tensorflow/core/tfrt/tfrt_session,tensorflow/core/tfrt/mlrt,tensorflow/core/tfrt/mlrt/attribute,tensorflow/core/tfrt/mlrt/kernel,tensorflow/core/tfrt/mlrt/bytecode,tensorflow/core/tfrt/mlrt/interpreter,tensorflow/compiler/mlir/tfrt/translate/mlrt,tensorflow/compiler/mlir/tfrt/translate/mlrt/testdata,tensorflow/core/tfrt/gpu,tensorflow/core/tfrt/run_handler_thread_pool,tensorflow/core/tfrt/runtime,tensorflow/core/tfrt/saved_model,tensorflow/core/tfrt/graph_executor,tensorflow/core/tfrt/saved_model/tests,tensorflow/core/tfrt/tpu,tensorflow/core/tfrt/utils,tensorflow/core/tfrt/utils/debug,tensorflow/core/tfrt/saved_model/python,tensorflow/core/tfrt/graph_executor/python,tensorflow/core/tfrt/saved_model/utils

From 22430372eb3b29f5bf336295d214365e5f62f70c Mon Sep 17 00:00:00 2001
From: Akash Patel <17132214+acxz@users.noreply.github.com>
Date: Sat, 9 Sep 2023 14:46:01 -0500
Subject: [PATCH 014/567] fix hipcc path

---
 third_party/gpus/rocm_configure.bzl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/third_party/gpus/rocm_configure.bzl b/third_party/gpus/rocm_configure.bzl
index 432f8391424037..f7f05391dfaaa5 100644
--- a/third_party/gpus/rocm_configure.bzl
+++ b/third_party/gpus/rocm_configure.bzl
@@ -751,7 +751,7 @@ def _create_local_rocm_repository(repository_ctx):
         tpl_paths["crosstool:clang/bin/crosstool_wrapper_driver_rocm"],
         {
             "%{cpu_compiler}": str(cc),
-            "%{hipcc_path}": rocm_config.rocm_toolkit_path + "/hip/bin/hipcc",
+            "%{hipcc_path}": rocm_config.rocm_toolkit_path + "/bin/hipcc",
             "%{hipcc_env}": _hipcc_env(repository_ctx),
             "%{rocr_runtime_path}": rocm_config.rocm_toolkit_path + "/lib",
             "%{rocr_runtime_library}": "hsa-runtime64",

From b92b789a8ef69e6385d0814b188684270776d5b2 Mon Sep 17 00:00:00 2001
From: Joyce Brum <joycebrum@google.com>
Date: Tue, 12 Sep 2023 14:49:29 +0000
Subject: [PATCH 015/567] [StepSecurity] ci: Harden GitHub Actions

Signed-off-by: Joyce Brum <joycebrum@google.com>
---
 .github/workflows/arm-cd.yml                    | 3 +++
 .github/workflows/arm-ci-extended-cpp.yml       | 3 +++
 .github/workflows/arm-ci-extended.yml           | 3 +++
 .github/workflows/arm-ci.yml                    | 3 +++
 .github/workflows/release-branch-cherrypick.yml | 3 +++
 .github/workflows/sigbuild-docker-branch.yml    | 3 +++
 .github/workflows/sigbuild-docker.yml           | 3 +++
 .github/workflows/stale-issues.yml              | 3 +++
 .github/workflows/trusted-partners.yml          | 3 +++
 .github/workflows/update-rbe.yml                | 3 +++
 10 files changed, 30 insertions(+)

diff --git a/.github/workflows/arm-cd.yml b/.github/workflows/arm-cd.yml
index d01ccbf458d5b7..15433f8f14be32 100644
--- a/.github/workflows/arm-cd.yml
+++ b/.github/workflows/arm-cd.yml
@@ -24,6 +24,9 @@ on:
   schedule:
     - cron: '0 8 * * *'
 
+permissions:
+  contents: read
+
 jobs:
   build:
     if: github.repository == 'tensorflow/tensorflow' # Don't do this in forks
diff --git a/.github/workflows/arm-ci-extended-cpp.yml b/.github/workflows/arm-ci-extended-cpp.yml
index 7f75dbff659dad..e648297d37e789 100644
--- a/.github/workflows/arm-ci-extended-cpp.yml
+++ b/.github/workflows/arm-ci-extended-cpp.yml
@@ -22,6 +22,9 @@ on:
   schedule:
     - cron: '0 2 * * *'
 
+permissions:
+  contents: read
+
 jobs:
   build:
     if: github.repository == 'tensorflow/tensorflow' # Don't do this in forks
diff --git a/.github/workflows/arm-ci-extended.yml b/.github/workflows/arm-ci-extended.yml
index 7abfbd6f8cd030..01ce70ba82ecfa 100644
--- a/.github/workflows/arm-ci-extended.yml
+++ b/.github/workflows/arm-ci-extended.yml
@@ -22,6 +22,9 @@ on:
   schedule:
     - cron: '0 4 * * *'
 
+permissions:
+  contents: read
+
 jobs:
   build:
     if: github.repository == 'tensorflow/tensorflow' # Don't do this in forks
diff --git a/.github/workflows/arm-ci.yml b/.github/workflows/arm-ci.yml
index 73462a673dab03..96467ebaeb35a9 100644
--- a/.github/workflows/arm-ci.yml
+++ b/.github/workflows/arm-ci.yml
@@ -26,6 +26,9 @@ on:
       - master
       - r2.**
 
+permissions:
+  contents: read
+
 jobs:
   build:
     # Don't do this in forks, and if labeled, only for 'kokoro:force-run'
diff --git a/.github/workflows/release-branch-cherrypick.yml b/.github/workflows/release-branch-cherrypick.yml
index 5ff69e468057c4..87b33bb4a0fd8c 100644
--- a/.github/workflows/release-branch-cherrypick.yml
+++ b/.github/workflows/release-branch-cherrypick.yml
@@ -35,6 +35,9 @@ on:
         required: true
         type: string
 
+permissions:
+  contents: read
+
 jobs:
   cherrypick:
     name: Cherrypick to ${{ github.event.inputs.release_branch}} - ${{ github.event.inputs.git_commit }}
diff --git a/.github/workflows/sigbuild-docker-branch.yml b/.github/workflows/sigbuild-docker-branch.yml
index d48ef98608e758..2a1ba68891123d 100644
--- a/.github/workflows/sigbuild-docker-branch.yml
+++ b/.github/workflows/sigbuild-docker-branch.yml
@@ -25,6 +25,9 @@ on:
     branches:
       - "r[1-9].[0-9]+"
 
+permissions:
+  contents: read
+
 jobs:
   docker:
     if: github.repository == 'tensorflow/tensorflow' # Don't do this in forks
diff --git a/.github/workflows/sigbuild-docker.yml b/.github/workflows/sigbuild-docker.yml
index a7f9477929389d..810e9ec55a5317 100644
--- a/.github/workflows/sigbuild-docker.yml
+++ b/.github/workflows/sigbuild-docker.yml
@@ -28,6 +28,9 @@ on:
     branches:
       - master
 
+permissions:
+  contents: read
+
 jobs:
   docker:
     if: github.repository == 'tensorflow/tensorflow' # Don't do this in forks
diff --git a/.github/workflows/stale-issues.yml b/.github/workflows/stale-issues.yml
index cb8d8639a11ffb..84118acca683fd 100644
--- a/.github/workflows/stale-issues.yml
+++ b/.github/workflows/stale-issues.yml
@@ -18,6 +18,9 @@ on:
   schedule:
     - cron: "30 1 * * *"
 
+permissions:
+  contents: read
+
 jobs:
   close-issues:
     # Don't do this in forks
diff --git a/.github/workflows/trusted-partners.yml b/.github/workflows/trusted-partners.yml
index 0b64aad201cc13..f6f40518fe5edf 100644
--- a/.github/workflows/trusted-partners.yml
+++ b/.github/workflows/trusted-partners.yml
@@ -17,6 +17,9 @@ name: Trusted Partner PR
 on:
   pull_request_target:
 
+permissions:
+  contents: read
+
 jobs:
   assign-partner-tags:
     runs-on: ubuntu-latest
diff --git a/.github/workflows/update-rbe.yml b/.github/workflows/update-rbe.yml
index a07b6f9e38e5a0..ca22041782fa4f 100644
--- a/.github/workflows/update-rbe.yml
+++ b/.github/workflows/update-rbe.yml
@@ -20,6 +20,9 @@ name: Update RBE Configs
 on:
   workflow_dispatch:
 
+permissions:
+  contents: read
+
 jobs:
   rbe:
     name: Update RBE Configs

From 56803aae482fe077b6fb94e2f4ba96d9ab885de7 Mon Sep 17 00:00:00 2001
From: Joyce Brum <joycebrum@google.com>
Date: Tue, 12 Sep 2023 16:40:29 +0000
Subject: [PATCH 016/567] set permissions on workflows

Signed-off-by: Joyce Brum <joycebrum@google.com>
---
 .github/workflows/cffconvert.yml                | 3 +++
 .github/workflows/issue-on-pr-rollback.yml      | 6 ++++++
 .github/workflows/pylint-presubmit.yml          | 3 +++
 .github/workflows/sigbuild-docker-presubmit.yml | 6 ++++++
 4 files changed, 18 insertions(+)

diff --git a/.github/workflows/cffconvert.yml b/.github/workflows/cffconvert.yml
index 21ac759f3ef656..49e240867ec257 100644
--- a/.github/workflows/cffconvert.yml
+++ b/.github/workflows/cffconvert.yml
@@ -20,6 +20,9 @@ on:
     paths:
       - CITATION.cff
 
+permissions:
+  contents: read
+
 jobs:
   validate:
     if: github.repository == 'tensorflow/tensorflow' # Don't do this in forks
diff --git a/.github/workflows/issue-on-pr-rollback.yml b/.github/workflows/issue-on-pr-rollback.yml
index fa76923a2ba770..459b99b4bee501 100644
--- a/.github/workflows/issue-on-pr-rollback.yml
+++ b/.github/workflows/issue-on-pr-rollback.yml
@@ -18,10 +18,16 @@ on:
   push:
     branches:
       - master
+      
+permissions: {}
 
 jobs:
   create-issue-on-pr-rollback:
     runs-on: ubuntu-latest
+    permissions:
+      contents: read
+      issues: write
+      pull-requests: read
     if: |
       github.repository == 'tensorflow/tensorflow' &&
       startsWith(github.event.head_commit.message, 'Rollback of PR #')
diff --git a/.github/workflows/pylint-presubmit.yml b/.github/workflows/pylint-presubmit.yml
index e97f34472d8356..53e2253ad4d4a8 100644
--- a/.github/workflows/pylint-presubmit.yml
+++ b/.github/workflows/pylint-presubmit.yml
@@ -19,6 +19,9 @@ on:
     paths:
       - '**.py'
 
+permissions:
+  contents: read
+
 jobs:
   build:
     name: PyLint
diff --git a/.github/workflows/sigbuild-docker-presubmit.yml b/.github/workflows/sigbuild-docker-presubmit.yml
index 6450359835fafe..eed3af5b5cf380 100644
--- a/.github/workflows/sigbuild-docker-presubmit.yml
+++ b/.github/workflows/sigbuild-docker-presubmit.yml
@@ -23,6 +23,9 @@ on:
       - 'tensorflow/tools/tf_sig_build_dockerfiles/**'
       - '!tensorflow/tools/tf_sig_build_dockerfiles/README.md'
 
+permissions:
+  contents: read
+
 jobs:
   docker:
     if: github.repository == 'tensorflow/tensorflow' # Don't do this in forks
@@ -30,6 +33,9 @@ jobs:
     strategy:
       matrix:
         python-version: [python3.9, python3.10, python3.11]
+    permissions:
+      contents: read
+      pull-requests: write
     steps:
       -
         name: Checkout

From 9f729bcaa23bb337e4005ffd220bd2b27e9cfe33 Mon Sep 17 00:00:00 2001
From: Joyce <joycebrum@google.com>
Date: Tue, 12 Sep 2023 13:45:56 -0300
Subject: [PATCH 017/567] set update-nightly.yml permissions

Signed-off-by: Joyce <joycebrum@google.com>
---
 .github/workflows/update-nightly.yml | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/.github/workflows/update-nightly.yml b/.github/workflows/update-nightly.yml
index 60372fddd272bf..282f030a31ff9f 100644
--- a/.github/workflows/update-nightly.yml
+++ b/.github/workflows/update-nightly.yml
@@ -18,10 +18,15 @@ on:
   schedule:
     - cron: 0 4 * * *  # 4am UTC is 9pm PDT and 8pm PST
 name: Set nightly branch to master HEAD
+
+permissions: {}
+
 jobs:
   master-to-nightly:
     if: github.repository == 'tensorflow/tensorflow' # Don't do this in forks
     runs-on: ubuntu-latest
+    permissions:
+      contents: write
     steps:
     - uses: zofrex/mirror-branch@a8809f0b42f9dfe9b2c5c2162a46327c23d15266 # v1.0.3
       name: Set nightly branch to master HEAD

From 17d8892757be72019201e79981d311fd2a12b31e Mon Sep 17 00:00:00 2001
From: William Muir <wamuir@gmail.com>
Date: Sat, 16 Sep 2023 22:47:34 -0500
Subject: [PATCH 018/567] Strip `external/local_tsl` prefix during tar of tsl c
 headers

---
 tensorflow/tools/lib_package/BUILD | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/tools/lib_package/BUILD b/tensorflow/tools/lib_package/BUILD
index 578d2e3f2f5bfe..1c6af3884bc9ce 100644
--- a/tensorflow/tools/lib_package/BUILD
+++ b/tensorflow/tools/lib_package/BUILD
@@ -73,7 +73,7 @@ pkg_tar(
         "//tensorflow/c:headers",
     ],
     package_dir = "include/",
-    strip_prefix = "/",
+    strip_prefix = "/external/local_tsl",
     # Mark as "manual" till
     # https://github.com/bazelbuild/bazel/issues/2352
     # and https://github.com/bazelbuild/bazel/issues/1580

From 29abb6ac7121bed646f3cd832511372626e7916c Mon Sep 17 00:00:00 2001
From: Fadi Arafeh <fadi.arafeh@arm.com>
Date: Fri, 15 Sep 2023 13:10:50 +0000
Subject: [PATCH 019/567] Update oneDNN to v3.2.1 for aarch64 and add fp32-bf16
 JIT reorder patch

---
 tensorflow/workspace2.bzl                     |   13 +-
 third_party/mkl_dnn/mkldnn_acl.BUILD          |    9 +-
 .../onednn_acl_depthwise_convolution.patch    |  356 --
 .../onednn_acl_fixed_format_kernels.patch     | 1166 -----
 .../onednn_acl_fp32_bf16_reorder.patch        |  111 +
 .../mkl_dnn/onednn_acl_remove_winograd.patch  |  326 --
 third_party/mkl_dnn/onednn_acl_reorder.patch  |   17 +-
 .../mkl_dnn/onednn_acl_reorder_padded.patch   |  858 ----
 .../mkl_dnn/onednn_acl_reorder_update.patch   | 4193 -----------------
 .../onednn_acl_thread_local_scheduler.patch   |   65 +-
 .../mkl_dnn/onednn_acl_threadcap.patch        |   12 +-
 .../onednn_acl_threadpool_scheduler.patch     |   45 -
 .../xla/third_party/mkl_dnn/mkldnn_acl.BUILD  |    9 +-
 .../tsl/third_party/mkl_dnn/mkldnn_acl.BUILD  |    9 +-
 .../third_party/tsl/tsl/mkl/build_defs.bzl    |    2 +-
 .../xla/third_party/tsl/workspace2.bzl        |   19 +-
 16 files changed, 188 insertions(+), 7022 deletions(-)
 delete mode 100644 third_party/mkl_dnn/onednn_acl_depthwise_convolution.patch
 delete mode 100644 third_party/mkl_dnn/onednn_acl_fixed_format_kernels.patch
 create mode 100644 third_party/mkl_dnn/onednn_acl_fp32_bf16_reorder.patch
 delete mode 100644 third_party/mkl_dnn/onednn_acl_remove_winograd.patch
 delete mode 100644 third_party/mkl_dnn/onednn_acl_reorder_padded.patch
 delete mode 100644 third_party/mkl_dnn/onednn_acl_reorder_update.patch
 delete mode 100644 third_party/mkl_dnn/onednn_acl_threadpool_scheduler.patch

diff --git a/tensorflow/workspace2.bzl b/tensorflow/workspace2.bzl
index fd6071f091e1b6..22d25f101488aa 100644
--- a/tensorflow/workspace2.bzl
+++ b/tensorflow/workspace2.bzl
@@ -205,18 +205,13 @@ def _tf_repositories():
         build_file = "//third_party/mkl_dnn:mkldnn_acl.BUILD",
         patch_file = [
             "//third_party/mkl_dnn:onednn_acl_threadcap.patch",
-            "//third_party/mkl_dnn:onednn_acl_remove_winograd.patch",
-            "//third_party/mkl_dnn:onednn_acl_fixed_format_kernels.patch",
-            "//third_party/mkl_dnn:onednn_acl_depthwise_convolution.patch",
-            "//third_party/mkl_dnn:onednn_acl_threadpool_scheduler.patch",
-            "//third_party/mkl_dnn:onednn_acl_reorder_padded.patch",
-            "//third_party/mkl_dnn:onednn_acl_reorder_update.patch",
             "//third_party/mkl_dnn:onednn_acl_reorder.patch",
             "//third_party/mkl_dnn:onednn_acl_thread_local_scheduler.patch",
+            "//third_party/mkl_dnn:onednn_acl_fp32_bf16_reorder.patch",
         ],
-        sha256 = "a50993aa6265b799b040fe745e0010502f9f7103cc53a9525d59646aef006633",
-        strip_prefix = "oneDNN-2.7.3",
-        urls = tf_mirror_urls("https://github.com/oneapi-src/oneDNN/archive/v2.7.3.tar.gz"),
+        sha256 = "2f76b407ef8893cca71340f88cd800019a1f14f8ac1bbdbb89a84be1370b52e3",
+        strip_prefix = "oneDNN-3.2.1",
+        urls = tf_mirror_urls("https://github.com/oneapi-src/oneDNN/archive/refs/tags/v3.2.1.tar.gz"),
     )
 
     tf_http_archive(
diff --git a/third_party/mkl_dnn/mkldnn_acl.BUILD b/third_party/mkl_dnn/mkldnn_acl.BUILD
index a1085427ec08da..0653bcb5523941 100644
--- a/third_party/mkl_dnn/mkldnn_acl.BUILD
+++ b/third_party/mkl_dnn/mkldnn_acl.BUILD
@@ -27,6 +27,7 @@ _DNNL_RUNTIME_THREADPOOL = {
     "#cmakedefine DNNL_SYCL_HIP": "#undef DNNL_SYCL_HIP",
     "#cmakedefine DNNL_ENABLE_STACK_CHECKER": "#undef DNNL_ENABLE_STACK_CHECKER",
     "#cmakedefine DNNL_EXPERIMENTAL": "#undef DNNL_EXPERIMENTAL",
+    "#cmakedefine ONEDNN_BUILD_GRAPH": "#undef ONEDNN_BUILD_GRAPH",
     "#cmakedefine01 BUILD_TRAINING": "#define BUILD_TRAINING 1",
     "#cmakedefine01 BUILD_INFERENCE": "#define BUILD_INFERENCE 0",
     "#cmakedefine01 BUILD_PRIMITIVE_ALL": "#define BUILD_PRIMITIVE_ALL 1",
@@ -74,6 +75,7 @@ _DNNL_RUNTIME_OMP = {
     "#cmakedefine DNNL_SYCL_HIP": "#undef DNNL_SYCL_HIP",
     "#cmakedefine DNNL_ENABLE_STACK_CHECKER": "#undef DNNL_ENABLE_STACK_CHECKER",
     "#cmakedefine DNNL_EXPERIMENTAL": "#undef DNNL_EXPERIMENTAL",
+    "#cmakedefine ONEDNN_BUILD_GRAPH": "#undef ONEDNN_BUILD_GRAPH",
     "#cmakedefine01 BUILD_TRAINING": "#define BUILD_TRAINING 1",
     "#cmakedefine01 BUILD_INFERENCE": "#define BUILD_INFERENCE 0",
     "#cmakedefine01 BUILD_PRIMITIVE_ALL": "#define BUILD_PRIMITIVE_ALL 1",
@@ -124,9 +126,9 @@ expand_template(
     name = "dnnl_version_h",
     out = "include/oneapi/dnnl/dnnl_version.h",
     substitutions = {
-        "@DNNL_VERSION_MAJOR@": "2",
-        "@DNNL_VERSION_MINOR@": "7",
-        "@DNNL_VERSION_PATCH@": "3",
+        "@DNNL_VERSION_MAJOR@": "3",
+        "@DNNL_VERSION_MINOR@": "2",
+        "@DNNL_VERSION_PATCH@": "1",
         "@DNNL_VERSION_HASH@": "N/A",
     },
     template = "include/oneapi/dnnl/dnnl_version.h.in",
@@ -142,6 +144,7 @@ cc_library(
         ],
         exclude = [
             "src/cpu/x64/**",
+            "src/cpu/rv64/**",
         ],
     ),
     copts = select({
diff --git a/third_party/mkl_dnn/onednn_acl_depthwise_convolution.patch b/third_party/mkl_dnn/onednn_acl_depthwise_convolution.patch
deleted file mode 100644
index 950077665fb4b7..00000000000000
--- a/third_party/mkl_dnn/onednn_acl_depthwise_convolution.patch
+++ /dev/null
@@ -1,356 +0,0 @@
- *******************************************************************************
- Copyright 2023 Arm Limited and affiliates.
- SPDX-License-Identifier: Apache-2.0
-
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
-
-     http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
- *******************************************************************************
-diff --git a/src/cpu/aarch64/acl_convolution_utils.cpp b/src/cpu/aarch64/acl_convolution_utils.cpp
-index 6b57374643..85e45ace9d 100644
---- a/src/cpu/aarch64/acl_convolution_utils.cpp
-+++ b/src/cpu/aarch64/acl_convolution_utils.cpp
-@@ -48,11 +48,14 @@ status_t acl_init_conf(acl_conv_conf_t &acp, memory_desc_t &src_md,
-     if (!is_fwd) return status::unimplemented;
- 
-     const int ndims = src_d.ndims();
-+    const bool is_depthwise = wei_d.ndims() == 5 && wei_d.dims()[1] == 1
-+            && wei_d.dims()[2] == 1;
- 
--    ACL_CHECK_SUPPORT(ndims != 4, " only supports 2 spatial dimensions");
-+    ACL_CHECK_SUPPORT(
-+            ndims != 4 && !is_depthwise, " only supports 2 spatial dimensions");
- 
-     const int with_groups = wei_d.ndims() == src_d.ndims() + 1;
--    ACL_CHECK_SUPPORT(with_groups, " does not support groups");
-+    ACL_CHECK_SUPPORT(with_groups && !is_depthwise, " does not support groups");
- 
-     ACL_CHECK_SUPPORT(src_d.data_type() != data_type::f32
-                     || wei_d.data_type() != data_type::f32
-@@ -108,7 +111,8 @@ status_t acl_init_conf(acl_conv_conf_t &acp, memory_desc_t &src_md,
- 
-     acp.with_bias = cd.bias_desc.format_kind != format_kind::undef;
- 
--    if (wei_d.format_kind() != format_kind::any) return status::unimplemented;
-+    if (wei_d.format_kind() != format_kind::any && !is_depthwise)
-+        return status::unimplemented;
- 
-     auto src_tag = memory_desc_matches_one_of_tag(
-             src_md, format_tag::nhwc, format_tag::nchw);
-@@ -138,8 +142,12 @@ status_t acl_init_conf(acl_conv_conf_t &acp, memory_desc_t &src_md,
-             || src_tag != dst_tag)
-         return status::unimplemented;
- 
--    // Set weights to initially be the same as src
--    CHECK(memory_desc_init_by_tag(weights_md, src_tag));
-+    if (is_depthwise) {
-+        CHECK(memory_desc_init_by_tag(weights_md, format_tag::hwigo));
-+    } else {
-+        // Set weights to initially be the same as src
-+        CHECK(memory_desc_init_by_tag(weights_md, src_tag));
-+    }
- 
-     // Bias is just 1D, set to be the obvious format
-     if (acp.with_bias && bias_md.format_kind == format_kind::any)
-@@ -166,6 +174,11 @@ status_t acl_init_conf(acl_conv_conf_t &acp, memory_desc_t &src_md,
-             1,
-             acl_data_type,
-             acl_layout);
-+    if(is_depthwise) {
-+       // We need to set that values are not constant so that we
-+       // we can update them in-place in ACL
-+      acp.wei_tensor_info.set_are_values_constant(false);
-+    }
- 
-     acp.dst_tensor_info = arm_compute::TensorInfo(
-             is_nhwc ? arm_compute::TensorShape(oc, ow, oh, mb) :
-@@ -185,6 +198,11 @@ status_t acl_init_conf(acl_conv_conf_t &acp, memory_desc_t &src_md,
-     // Are we allowed to cast down to bf16 or not?
-     acp.fast_math
-             = one_of(attr.fpmath_mode_, fpmath_mode::bf16, fpmath_mode::any);
-+    if (is_depthwise) {
-+        // There is no support for fixed format kernels for depthwise convolution
-+        // in ACL so we are going to use weight format that we set up earlier
-+        return status::success;
-+    }
- 
-     // WeightFormat::ANY tells ACL we can handle any format
-     acp.weights_info = arm_compute::WeightsInfo(
-@@ -252,6 +270,7 @@ status_t init_conf_gemm(acl_conv_conf_t &acp, memory_desc_t &src_md,
-         memory_desc_t &weights_md, memory_desc_t &dst_md,
-         memory_desc_t &bias_md, const convolution_desc_t &cd,
-         const primitive_attr_t &attr) {
-+    if (weights_md.ndims != 4) return status::unimplemented;
- 
-     // General Compute Library checks, memory tags are also set there
-     CHECK(acl_init_conf(acp, src_md, weights_md, dst_md, bias_md, cd, attr));
-@@ -277,6 +296,7 @@ status_t init_conf_indirect_gemm(acl_conv_conf_t &acp, memory_desc_t &src_md,
-         memory_desc_t &weights_md, memory_desc_t &dst_md,
-         memory_desc_t &bias_md, const convolution_desc_t &cd,
-         const primitive_attr_t &attr) {
-+    if (weights_md.ndims != 4) return status::unimplemented;
- 
-     // Indirect is slower for small convolution kernels
-     if (weights_md.dims[2] == 1 && weights_md.dims[3] == 1)
-@@ -314,6 +334,22 @@ status_t init_conf_indirect_gemm(acl_conv_conf_t &acp, memory_desc_t &src_md,
-     return status::success;
- }
- 
-+status_t init_conf_depthwise(acl_conv_conf_t &acp, memory_desc_t &src_md,
-+        memory_desc_t &weights_md, memory_desc_t &dst_md,
-+        memory_desc_t &bias_md, const convolution_desc_t &cd,
-+        const primitive_attr_t &attr) {
-+    if (weights_md.ndims != 5) return status::unimplemented;
-+
-+    CHECK(acl_init_conf(acp, src_md, weights_md, dst_md, bias_md, cd, attr));
-+
-+    ACL_CHECK_VALID(arm_compute::NEDepthwiseConvolutionLayer::validate(
-+            &acp.src_tensor_info, &acp.wei_tensor_info,
-+            acp.with_bias ? &acp.bia_tensor_info : nullptr,
-+            &acp.dst_tensor_info, acp.padstride_info));
-+
-+    return status::success;
-+}
-+
- } // namespace acl_convolution_utils
- 
- } // namespace aarch64
-diff --git a/src/cpu/aarch64/acl_convolution_utils.hpp b/src/cpu/aarch64/acl_convolution_utils.hpp
-index e3d40a5e75..1ded5826c4 100644
---- a/src/cpu/aarch64/acl_convolution_utils.hpp
-+++ b/src/cpu/aarch64/acl_convolution_utils.hpp
-@@ -66,6 +66,11 @@ status_t init_conf_indirect_gemm(acl_conv_conf_t &acp, memory_desc_t &src_md,
-         memory_desc_t &bias_md, const convolution_desc_t &cd,
-         const primitive_attr_t &attr);
- 
-+status_t init_conf_depthwise(acl_conv_conf_t &acp, memory_desc_t &src_md,
-+        memory_desc_t &weights_md, memory_desc_t &dst_md,
-+        memory_desc_t &bias_md, const convolution_desc_t &cd,
-+        const primitive_attr_t &attr);
-+
- } // namespace acl_convolution_utils
- 
- template <typename conv_obj_t, typename conv_pd_t, typename src_data_t,
-diff --git a/src/cpu/aarch64/acl_depthwise_convolution.cpp b/src/cpu/aarch64/acl_depthwise_convolution.cpp
-new file mode 100644
-index 0000000000..70ae6bceea
---- /dev/null
-+++ b/src/cpu/aarch64/acl_depthwise_convolution.cpp
-@@ -0,0 +1,42 @@
-+/*******************************************************************************
-+* Copyright 2023 Arm Ltd. and affiliates
-+*
-+* Licensed under the Apache License, Version 2.0 (the "License");
-+* you may not use this file except in compliance with the License.
-+* You may obtain a copy of the License at
-+*
-+*     http://www.apache.org/licenses/LICENSE-2.0
-+*
-+* Unless required by applicable law or agreed to in writing, software
-+* distributed under the License is distributed on an "AS IS" BASIS,
-+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-+* See the License for the specific language governing permissions and
-+* limitations under the License.
-+*******************************************************************************/
-+
-+#include "cpu/aarch64/acl_depthwise_convolution.hpp"
-+
-+namespace dnnl {
-+namespace impl {
-+namespace cpu {
-+namespace aarch64 {
-+
-+status_t acl_depthwise_convolution_fwd_t::execute_forward(
-+        const exec_ctx_t &ctx) const {
-+    std::lock_guard<std::mutex> _lock {this->mtx};
-+
-+    auto *acl_resource
-+            = ctx.get_resource_mapper()
-+                      ->get<acl_depthwise_convolution_resource_t>(this);
-+    acl_obj_t<arm_compute::NEDepthwiseConvolutionLayer> &acl_depthwise_obj
-+            = acl_resource->get_acl_obj();
-+
-+    return execute_forward_conv_acl<
-+            acl_obj_t<arm_compute::NEDepthwiseConvolutionLayer>, pd_t, data_t>(
-+            ctx, acl_depthwise_obj, pd());
-+}
-+
-+} // namespace aarch64
-+} // namespace cpu
-+} // namespace impl
-+} // namespace dnnl
-diff --git a/src/cpu/aarch64/acl_depthwise_convolution.hpp b/src/cpu/aarch64/acl_depthwise_convolution.hpp
-new file mode 100644
-index 0000000000..3e3d02cf41
---- /dev/null
-+++ b/src/cpu/aarch64/acl_depthwise_convolution.hpp
-@@ -0,0 +1,141 @@
-+/*******************************************************************************
-+* Copyright 2023 Arm Ltd. and affiliates
-+*
-+* Licensed under the Apache License, Version 2.0 (the "License");
-+* you may not use this file except in compliance with the License.
-+* You may obtain a copy of the License at
-+*
-+*     http://www.apache.org/licenses/LICENSE-2.0
-+*
-+* Unless required by applicable law or agreed to in writing, software
-+* distributed under the License is distributed on an "AS IS" BASIS,
-+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-+* See the License for the specific language governing permissions and
-+* limitations under the License.
-+*******************************************************************************/
-+
-+#ifndef CPU_AARCH64_ACL_DEPTHWISE_CONVOLUTION_HPP
-+#define CPU_AARCH64_ACL_DEPTHWISE_CONVOLUTION_HPP
-+
-+#include "cpu/aarch64/acl_convolution_utils.hpp"
-+#include "cpu/cpu_convolution_pd.hpp"
-+
-+namespace dnnl {
-+namespace impl {
-+namespace cpu {
-+namespace aarch64 {
-+
-+struct acl_depthwise_convolution_resource_t : public resource_t {
-+    acl_depthwise_convolution_resource_t()
-+        : acl_obj_(utils::make_unique<
-+                acl_obj_t<arm_compute::NEDepthwiseConvolutionLayer>>()) {}
-+
-+    status_t configure(const acl_conv_conf_t &acp) {
-+        if (!acl_obj_) return status::out_of_memory;
-+
-+        acl_obj_->src_tensor.allocator()->init(acp.src_tensor_info);
-+        acl_obj_->wei_tensor.allocator()->init(acp.wei_tensor_info);
-+        acl_obj_->dst_tensor.allocator()->init(acp.dst_tensor_info);
-+        acl_obj_->bia_tensor.allocator()->init(acp.bia_tensor_info);
-+
-+        // clang-format off
-+        acl_obj_->conv.configure(
-+            &acl_obj_->src_tensor,
-+            &acl_obj_->wei_tensor,
-+            acp.with_bias ? &acl_obj_->bia_tensor : nullptr,
-+            &acl_obj_->dst_tensor,
-+            acp.padstride_info,
-+            1, // depth multiplier default value
-+            acp.act_info);
-+
-+        // clang-format on
-+        return status::success;
-+    }
-+
-+    acl_obj_t<arm_compute::NEDepthwiseConvolutionLayer> &get_acl_obj() const {
-+        return *acl_obj_;
-+    }
-+
-+    DNNL_DISALLOW_COPY_AND_ASSIGN(acl_depthwise_convolution_resource_t);
-+
-+private:
-+    std::unique_ptr<acl_obj_t<arm_compute::NEDepthwiseConvolutionLayer>>
-+            acl_obj_;
-+};
-+
-+struct acl_depthwise_convolution_fwd_t : public primitive_t {
-+
-+    struct pd_t : public cpu_convolution_fwd_pd_t {
-+        pd_t(const convolution_desc_t *adesc, const primitive_attr_t *attr,
-+                const typename pd_t::base_class *hint_fwd_pd)
-+            : cpu_convolution_fwd_pd_t(adesc, attr, hint_fwd_pd), acp_() {}
-+
-+        DECLARE_COMMON_PD_T("depthwise_convolution:acl",
-+                acl_depthwise_convolution_fwd_t, USE_GLOBAL_SCRATCHPAD);
-+
-+        status_t init(engine_t *engine) {
-+            using namespace data_type;
-+
-+            const bool is_fp16_ok = expect_data_types(f16, f16, f16, f16, undef)
-+                    && attr()->has_default_values(
-+                            primitive_attr_t::skip_mask_t::post_ops, f16);
-+            const bool is_fp32_ok = expect_data_types(f32, f32, f32, f32, undef)
-+                    && attr()->has_default_values(
-+                            primitive_attr_t::skip_mask_t::post_ops, f32);
-+            bool ok = is_fwd()
-+                    && set_default_alg_kind(alg_kind::convolution_direct)
-+                    && utils::one_of(true, is_fp16_ok, is_fp32_ok)
-+                    && !has_zero_dim_memory();
-+            if (!ok) return status::unimplemented;
-+
-+            CHECK(acl_convolution_utils::init_conf_depthwise(acp_, src_md_,
-+                    weights_md_, dst_md_, bias_md_, *desc(), *attr()));
-+
-+            CHECK(post_ops.init(
-+                    engine, attr_.post_ops_, dst_md_, acp_.act_info));
-+            acp_.use_dst_acc = post_ops.has_sum();
-+
-+            return status::success;
-+        }
-+
-+        acl_conv_conf_t acp_;
-+
-+        acl_post_ops_t post_ops;
-+    };
-+
-+    acl_depthwise_convolution_fwd_t(const pd_t *apd) : primitive_t(apd) {}
-+
-+    status_t create_resource(
-+            engine_t *engine, resource_mapper_t &mapper) const override {
-+        if (mapper.has_resource(this)) return status::success;
-+
-+        auto r = utils::make_unique<acl_depthwise_convolution_resource_t>();
-+        if (!r) return status::out_of_memory;
-+
-+        CHECK(r->configure(pd()->acp_));
-+        mapper.add(this, std::move(r));
-+
-+        CHECK(pd()->post_ops.create_resource(engine, mapper));
-+
-+        return status::success;
-+    }
-+
-+    typedef typename prec_traits<data_type::f32>::type data_t;
-+
-+    status_t execute(const exec_ctx_t &ctx) const override {
-+        return execute_forward(ctx);
-+    }
-+
-+private:
-+    mutable std::mutex mtx;
-+    status_t execute_forward(const exec_ctx_t &ctx) const;
-+
-+    const pd_t *pd() const { return (const pd_t *)primitive_t::pd().get(); }
-+};
-+
-+} // namespace aarch64
-+} // namespace cpu
-+} // namespace impl
-+} // namespace dnnl
-+
-+#endif // CPU_AARCH64_ACL_DEPTHWISE_CONVOLUTION_HPP
-diff --git a/src/cpu/cpu_convolution_list.cpp b/src/cpu/cpu_convolution_list.cpp
-index 094c73aa36..80385432d8 100644
---- a/src/cpu/cpu_convolution_list.cpp
-+++ b/src/cpu/cpu_convolution_list.cpp
-@@ -63,6 +63,7 @@ using namespace dnnl::impl::cpu::x64;
- #include "cpu/aarch64/jit_sve_512_x8s8s32x_convolution.hpp"
- #include "cpu/aarch64/jit_uni_dw_convolution.hpp"
- #if DNNL_AARCH64 && DNNL_AARCH64_USE_ACL
-+#include "cpu/aarch64/acl_depthwise_convolution.hpp"
- #include "cpu/aarch64/acl_gemm_convolution.hpp"
- #include "cpu/aarch64/acl_indirect_gemm_convolution.hpp"
- #endif
-@@ -102,6 +103,7 @@ const std::map<pk_dt_impl_key_t, std::vector<impl_list_item_t>> &impl_list_map()
-             CPU_INSTANCE_AARCH64(jit_sve_512_dw_convolution_fwd_t)
-             CPU_INSTANCE_AARCH64(jit_sve_512_1x1_convolution_fwd_f32_t)
-             CPU_INSTANCE_AARCH64(jit_sve_512_convolution_fwd_t<f32>)
-+            CPU_INSTANCE_AARCH64_ACL(acl_depthwise_convolution_fwd_t)
-             CPU_INSTANCE_AARCH64_ACL(acl_indirect_gemm_convolution_fwd_t)
-             CPU_INSTANCE_AARCH64_ACL(acl_gemm_convolution_fwd_t<f32>)
-             CPU_INSTANCE(gemm_convolution_fwd_t)
diff --git a/third_party/mkl_dnn/onednn_acl_fixed_format_kernels.patch b/third_party/mkl_dnn/onednn_acl_fixed_format_kernels.patch
deleted file mode 100644
index 5d918564fb1515..00000000000000
--- a/third_party/mkl_dnn/onednn_acl_fixed_format_kernels.patch
+++ /dev/null
@@ -1,1166 +0,0 @@
- *******************************************************************************
- Copyright 2023 Arm Limited and affiliates.
- SPDX-License-Identifier: Apache-2.0
-
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
-
-     http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
- *******************************************************************************
-diff --git a/src/common/matmul_pd.hpp b/src/common/matmul_pd.hpp
-index 4330ad938b..df16c5fcca 100644
---- a/src/common/matmul_pd.hpp
-+++ b/src/common/matmul_pd.hpp
-@@ -159,6 +159,19 @@ protected:
- 
-         return true;
-     }
-+
-+    // All implementations that do not support sparse inputs/outputs should
-+    // call this function.
-+    bool is_dense_data() {
-+#ifdef DNNL_EXPERIMENTAL_SPARSE
-+        for (auto md : {&src_md_, &weights_md_, &bias_md_, &dst_md_}) {
-+            if (memory_desc_wrapper(md).format_kind() == format_kind::sparse)
-+                return false;
-+        }
-+#endif
-+        return true;
-+    }
-+
- };
- 
- } // namespace impl
-diff --git a/src/cpu/aarch64/acl_convolution_utils.cpp b/src/cpu/aarch64/acl_convolution_utils.cpp
-index 37f8ecbc06..6b57374643 100644
---- a/src/cpu/aarch64/acl_convolution_utils.cpp
-+++ b/src/cpu/aarch64/acl_convolution_utils.cpp
-@@ -41,25 +41,23 @@ status_t acl_init_conf(acl_conv_conf_t &acp, memory_desc_t &src_md,
-     const memory_desc_wrapper dst_d(&dst_md);
-     const memory_desc_wrapper bia_d(&bias_md);
- 
--    auto math_mode = get_fpmath_mode();
--    acp.fast_math = one_of(math_mode, fpmath_mode::bf16, fpmath_mode::any);
--
-     // Compute Library currently supports forward propagation only
-     const prop_kind_t prop_kind = cd.prop_kind;
-     const bool is_fwd = (prop_kind == dnnl_forward_training)
-             || (prop_kind == dnnl_forward_inference);
-     if (!is_fwd) return status::unimplemented;
- 
--    const int with_groups = wei_d.ndims() == src_d.ndims() + 1;
-     const int ndims = src_d.ndims();
--    const bool is_1d = ndims == 3;
--    const bool is_3d = ndims == 5;
--    bool is_nspc;
- 
--    // Compute Library unsupported shape scenarios
--    if (one_of(true, is_3d, is_1d, with_groups)) {
--        return status::unimplemented;
--    }
-+    ACL_CHECK_SUPPORT(ndims != 4, " only supports 2 spatial dimensions");
-+
-+    const int with_groups = wei_d.ndims() == src_d.ndims() + 1;
-+    ACL_CHECK_SUPPORT(with_groups, " does not support groups");
-+
-+    ACL_CHECK_SUPPORT(src_d.data_type() != data_type::f32
-+                    || wei_d.data_type() != data_type::f32
-+                    || dst_d.data_type() != data_type::f32,
-+            " src, dst and wei must be fp32");
- 
-     // batch size
-     const int mb = src_d.dims()[0];
-@@ -110,108 +108,143 @@ status_t acl_init_conf(acl_conv_conf_t &acp, memory_desc_t &src_md,
- 
-     acp.with_bias = cd.bias_desc.format_kind != format_kind::undef;
- 
--    auto set_or_check_tags = [&](format_tag_t desired_src_tag,
--                                     format_tag_t desired_dst_tag) -> status_t {
--        using namespace format_tag;
--        auto src_tag = any, dst_tag = any;
--
--        if (src_d.format_kind() == format_kind::any) {
--            CHECK(memory_desc_init_by_tag(src_md, desired_src_tag));
--            src_tag = desired_src_tag;
--        } else {
--            src_tag = memory_desc_matches_one_of_tag(src_md, nhwc, nchw);
--        }
--
--        if (dst_d.format_kind() == format_kind::any) {
--            CHECK(memory_desc_init_by_tag(dst_md, desired_dst_tag));
--            dst_tag = desired_dst_tag;
--        } else {
--            dst_tag = memory_desc_matches_one_of_tag(dst_md, nhwc, nchw);
--        }
--
--        if (acp.with_bias && bias_md.format_kind == format_kind::any)
--            CHECK(memory_desc_init_by_tag(bias_md, x));
--
--        is_nspc = utils::one_of(src_tag, nhwc);
--
--        memory_desc_t want_wei_md = weights_md;
--        auto wei_tag = is_nspc ? ohwi : oihw;
--        CHECK(memory_desc_init_by_tag(want_wei_md, wei_tag));
--
--        // Compute Library does not support mismatching layouts
--        if ((src_tag != wei_tag) || (src_tag != dst_tag))
--            return status::unimplemented;
-+    if (wei_d.format_kind() != format_kind::any) return status::unimplemented;
-+
-+    auto src_tag = memory_desc_matches_one_of_tag(
-+            src_md, format_tag::nhwc, format_tag::nchw);
-+    auto dst_tag = memory_desc_matches_one_of_tag(
-+            dst_md, format_tag::nhwc, format_tag::nchw);
-+
-+    // We want src and dst to match, preferrably both to be NHWC
-+    if (src_d.format_kind() == format_kind::any
-+            && dst_d.format_kind() == format_kind::any) {
-+        CHECK(memory_desc_init_by_tag(src_md, format_tag::nhwc));
-+        CHECK(memory_desc_init_by_tag(dst_md, format_tag::nhwc));
-+    } else if (src_d.format_kind() == format_kind::any
-+            && dst_tag != format_tag::undef) {
-+        CHECK(memory_desc_init_by_tag(src_md, dst_tag));
-+    } else if (dst_d.format_kind() == format_kind::any
-+            && src_tag != format_tag::undef) {
-+        CHECK(memory_desc_init_by_tag(dst_md, src_tag));
-+    }
- 
--        if (weights_md.format_kind == format_kind::any) {
--            weights_md = want_wei_md;
--        }
--        return (want_wei_md == weights_md) ? status::success
--                                           : status::unimplemented;
--    };
-+    // Recompute tags after potentially running memory desc init
-+    src_tag = memory_desc_matches_one_of_tag(
-+            src_md, format_tag::nhwc, format_tag::nchw);
-+    dst_tag = memory_desc_matches_one_of_tag(
-+            dst_md, format_tag::nhwc, format_tag::nchw);
- 
--    auto default_dat_tag = format_tag::nhwc;
--    if (set_or_check_tags(default_dat_tag, default_dat_tag) != status::success)
-+    if (src_tag == format_tag::undef || dst_tag == format_tag::undef
-+            || src_tag != dst_tag)
-         return status::unimplemented;
- 
--    const auto acl_layout = is_nspc ? arm_compute::DataLayout::NHWC
--                                    : arm_compute::DataLayout::NCHW;
-+    // Set weights to initially be the same as src
-+    CHECK(memory_desc_init_by_tag(weights_md, src_tag));
- 
--    // For convolutions, int8 datatypes imply quantized types in ACL
--    acp.is_int8 = utils::one_of(src_d.data_type(), s8, u8)
--            && wei_d.data_type() == s8;
-+    // Bias is just 1D, set to be the obvious format
-+    if (acp.with_bias && bias_md.format_kind == format_kind::any)
-+        CHECK(memory_desc_init_by_tag(bias_md, format_tag::x));
- 
--    auto acl_src_data_t
--            = acl_utils::get_acl_data_t(src_d.data_type(), acp.is_int8);
--    auto acl_wei_data_t
--            = acl_utils::get_acl_data_t(wei_d.data_type(), acp.is_int8);
--    auto acl_dst_data_t
--            = acl_utils::get_acl_data_t(dst_d.data_type(), acp.is_int8);
--    auto acl_bia_data_t
--            = acl_utils::get_acl_data_t(bia_d.data_type(), acp.is_int8);
-+    bool is_nhwc = src_tag == format_tag::nhwc;
-+    // The layouts have to match (although we may later modify the weights)
-+    const auto acl_layout = is_nhwc ? arm_compute::DataLayout::NHWC
-+                                    : arm_compute::DataLayout::NCHW;
- 
--    if (acl_bia_data_t == arm_compute::DataType::UNKNOWN)
--        acl_bia_data_t = arm_compute::DataType::F32;
-+    auto acl_data_type = arm_compute::DataType::F32;
- 
-     // clang-format off
--    acp.src_info = arm_compute::TensorInfo(
--            is_nspc ? arm_compute::TensorShape(ic, iw, ih, mb) :
-+    acp.src_tensor_info = arm_compute::TensorInfo(
-+            is_nhwc ? arm_compute::TensorShape(ic, iw, ih, mb) :
-             arm_compute::TensorShape(iw, ih, ic, mb),
-             1,
--            acl_src_data_t,
-+            acl_data_type,
-             acl_layout);
- 
--    acp.wei_info = arm_compute::TensorInfo(
--            is_nspc ? arm_compute::TensorShape(ic, kw, kh, oc) :
-+    acp.wei_tensor_info = arm_compute::TensorInfo(
-+            is_nhwc ? arm_compute::TensorShape(ic, kw, kh, oc) :
-             arm_compute::TensorShape(kw, kh, ic, oc),
-             1,
--            acl_wei_data_t,
-+            acl_data_type,
-             acl_layout);
- 
--    acp.dst_info = arm_compute::TensorInfo(
--            is_nspc ? arm_compute::TensorShape(oc, ow, oh, mb) :
-+    acp.dst_tensor_info = arm_compute::TensorInfo(
-+            is_nhwc ? arm_compute::TensorShape(oc, ow, oh, mb) :
-             arm_compute::TensorShape(ow, oh, oc, mb),
-             1,
--            acl_dst_data_t,
-+            acl_data_type,
-             acl_layout);
- 
--    acp.bia_info = arm_compute::TensorInfo(
-+    acp.bia_tensor_info = arm_compute::TensorInfo(
-             acp.with_bias ? arm_compute::TensorShape(oc)
-                           : arm_compute::TensorShape(),
-             1,
--            acl_bia_data_t,
-+            acl_data_type,
-             acl_layout);
-     // clang-format on
- 
--    // Add quantization info to tensors
--    if (acp.is_int8) {
--        const float *scales = attr.output_scales_.scales_;
--        acp.src_info.set_quantization_info(arm_compute::QuantizationInfo(1, 0));
--        acp.bia_info.set_quantization_info(arm_compute::QuantizationInfo(1, 0));
--        acp.wei_info.set_quantization_info(arm_compute::QuantizationInfo(1, 0));
--        acp.dst_info.set_quantization_info(
--                arm_compute::QuantizationInfo(1.0f / scales[0], 0));
-+    // Are we allowed to cast down to bf16 or not?
-+    acp.fast_math
-+            = one_of(attr.fpmath_mode_, fpmath_mode::bf16, fpmath_mode::any);
-+
-+    // WeightFormat::ANY tells ACL we can handle any format
-+    acp.weights_info = arm_compute::WeightsInfo(
-+            false, kw, kh, oc, false, arm_compute::WeightFormat::ANY);
-+
-+    // Get the format that the ACL kernel will expect the weights to be
-+    // in (if a kernel exists). Note that these are referred to as fixed format
-+    // kernels, because they require one specific weights format
-+    arm_compute::WeightFormat expected_weight_format;
-+    ACL_CHECK_VALID(arm_compute::NEGEMMConvolutionLayer::has_opt_impl(
-+            expected_weight_format, &acp.src_tensor_info, &acp.wei_tensor_info,
-+            acp.with_bias ? &acp.bia_tensor_info : nullptr,
-+            &acp.dst_tensor_info, acp.padstride_info, acp.weights_info,
-+            acp.dilation_info, acp.act_info, acp.fast_math));
-+
-+    // Set weights info to the one returned by has_opt_impl
-+    acp.weights_info.set_weight_format(expected_weight_format);
-+
-+    // has_opt_impl may return a non fast math kernel, even if we requested one
-+    acp.fast_math
-+            = arm_compute::is_fixed_format_fast_math(expected_weight_format);
-+
-+    // Map OIHW used in ACL WeightFormat to the logical dimensions of the memory descriptor
-+    dim_t O_dim = 0;
-+    dim_t I_dim = 1;
-+    dim_t H_dim = 2;
-+    dim_t W_dim = 3;
-+
-+    if (!is_nhwc) {
-+        // We can try to support NCHW by swapping IHW around, note that this
-+        // requires weights_md.dims[I_dim] % block_by != 0 (see next block)
-+        O_dim = 0;
-+        I_dim = 3;
-+        H_dim = 1;
-+        W_dim = 2;
-     }
- 
-+    // We can't currently support nchw and block_by != 1. If this is the case,
-+    // try a non fast math kernel, which currently have no blocking
-+    int block_by = arm_compute::block_by(acp.weights_info.weight_format());
-+    if (!is_nhwc && weights_md.dims[I_dim] % block_by != 0 && acp.fast_math) {
-+        acp.fast_math = false;
-+        acp.weights_info.set_weight_format(arm_compute::WeightFormat::ANY);
-+        ACL_CHECK_VALID(arm_compute::NEGEMMConvolutionLayer::has_opt_impl(
-+                expected_weight_format, &acp.src_tensor_info,
-+                &acp.wei_tensor_info,
-+                acp.with_bias ? &acp.bia_tensor_info : nullptr,
-+                &acp.dst_tensor_info, acp.padstride_info, acp.weights_info,
-+                acp.dilation_info, acp.act_info, acp.fast_math));
-+        acp.weights_info.set_weight_format(expected_weight_format);
-+        block_by = arm_compute::block_by(expected_weight_format);
-+        // This shouldn't happen, because non-fastmath have no blocking, but
-+        // guard against it because it would silently return incorrect results
-+        if (weights_md.dims[I_dim] % block_by != 0)
-+            return status::unimplemented;
-+    }
-+
-+    acl_utils::reorder_to_weight_format(acp.wei_tensor_info, weights_md,
-+            expected_weight_format, I_dim, O_dim, {W_dim, H_dim}, {});
-+
-     return status::success;
- }
- 
-@@ -226,10 +259,10 @@ status_t init_conf_gemm(acl_conv_conf_t &acp, memory_desc_t &src_md,
-     // clang-format off
-     // Validate convolution manually to check for return status
-     ACL_CHECK_VALID(arm_compute::NEGEMMConvolutionLayer::validate(
--        &acp.src_info,
--        &acp.wei_info,
--        acp.with_bias ? &acp.bia_info : nullptr,
--        &acp.dst_info,
-+        &acp.src_tensor_info,
-+        &acp.wei_tensor_info,
-+        acp.with_bias ? &acp.bia_tensor_info : nullptr,
-+        &acp.dst_tensor_info,
-         acp.padstride_info,
-         acp.weights_info,
-         acp.dilation_info,
-@@ -244,28 +277,38 @@ status_t init_conf_indirect_gemm(acl_conv_conf_t &acp, memory_desc_t &src_md,
-         memory_desc_t &weights_md, memory_desc_t &dst_md,
-         memory_desc_t &bias_md, const convolution_desc_t &cd,
-         const primitive_attr_t &attr) {
--    // Indirect convolution results in slowdown for low thread count or 1x1
--    // kernels, so fall back to GEMM-based convolution in these cases
--    if (one_of(true, weights_md.dims[2] == 1, // kh
--                weights_md.dims[3] == 1, // kw
--                dnnl_get_max_threads() < 28)) {
-+
-+    // Indirect is slower for small convolution kernels
-+    if (weights_md.dims[2] == 1 && weights_md.dims[3] == 1)
-         return status::unimplemented;
--    }
- 
-     CHECK(acl_init_conf(acp, src_md, weights_md, dst_md, bias_md, cd, attr));
- 
-+    // Indirect is slower than gemm for low thread counts, except for fast math
-+    if (dnnl_get_max_threads() < 28 && !acp.fast_math)
-+        return status::unimplemented;
-+
-+    // If we do not need to pad input channels for fast math mode then it would
-+    // be faster to run convolution with im2row instead of using indirect kernel
-+    int block_by = arm_compute::block_by(acp.weights_info.weight_format());
-+    int ic = src_md.dims[1];
-+    if (acp.fast_math && ic % block_by == 0) return status::unimplemented;
-+
-+    // TODO: remove this once NEGEMMConv2d::validate allows src and weights to mismatch
-+    acp.wei_tensor_info.set_data_layout(arm_compute::DataLayout::NHWC);
-+
-     // clang-format off
-     // NOTE: indirect convolution method supports only nhwc layout.
-     ACL_CHECK_VALID(arm_compute::NEGEMMConv2d::validate(
--        &acp.src_info,
--        &acp.wei_info,
--        acp.with_bias ? &acp.bia_info : nullptr,
--        &acp.dst_info,
-+        &acp.src_tensor_info,
-+        &acp.wei_tensor_info,
-+        acp.with_bias ? &acp.bia_tensor_info : nullptr,
-+        &acp.dst_tensor_info,
-         arm_compute::Conv2dInfo(acp.padstride_info,
-                                 acp.dilation_info,
-                                 acp.act_info,
-                                 acp.fast_math,
--                                1)));
-+                                1, {}, acp.weights_info)));
-     // clang-format on
- 
-     return status::success;
-diff --git a/src/cpu/aarch64/acl_convolution_utils.hpp b/src/cpu/aarch64/acl_convolution_utils.hpp
-index 0398ab06b9..e3d40a5e75 100644
---- a/src/cpu/aarch64/acl_convolution_utils.hpp
-+++ b/src/cpu/aarch64/acl_convolution_utils.hpp
-@@ -38,17 +38,17 @@ struct acl_obj_t {
- 
- struct acl_conv_conf_t {
-     bool with_bias;
--    bool is_int8;
-     bool fast_math;
-     // If this is true, the result of the convolution goes into a temporarily
-     // allocated ACL tensor to be accumulated into the oneDNN dst during postops
-     bool use_dst_acc;
--    arm_compute::TensorInfo src_info;
--    arm_compute::TensorInfo wei_info;
--    arm_compute::TensorInfo bia_info;
--    arm_compute::TensorInfo dst_info;
-+    arm_compute::TensorInfo src_tensor_info;
-+    arm_compute::TensorInfo wei_tensor_info;
-+    arm_compute::TensorInfo bia_tensor_info;
-+    arm_compute::TensorInfo dst_tensor_info;
-     arm_compute::PadStrideInfo padstride_info;
-     arm_compute::Size2D dilation_info;
-+    // Additional information about the weights not included in wei_tensor_info
-     arm_compute::WeightsInfo weights_info;
-     // Note: this will default to not enabled, and will do nothing
-     arm_compute::ActivationLayerInfo act_info;
-diff --git a/src/cpu/aarch64/acl_gemm_convolution.hpp b/src/cpu/aarch64/acl_gemm_convolution.hpp
-index 485db954ea..da58e4f610 100644
---- a/src/cpu/aarch64/acl_gemm_convolution.hpp
-+++ b/src/cpu/aarch64/acl_gemm_convolution.hpp
-@@ -1,5 +1,5 @@
- /*******************************************************************************
--* Copyright 2020-2022 Arm Ltd. and affiliates
-+* Copyright 2020-2023 Arm Ltd. and affiliates
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
-@@ -36,10 +36,10 @@ struct acl_resource_t : public resource_t {
-         if (!acl_obj_) return status::out_of_memory;
- 
-         // Init Compute Library tensors based on info from descriptor
--        acl_obj_->src_tensor.allocator()->init(acp.src_info);
--        acl_obj_->wei_tensor.allocator()->init(acp.wei_info);
--        acl_obj_->dst_tensor.allocator()->init(acp.dst_info);
--        acl_obj_->bia_tensor.allocator()->init(acp.bia_info);
-+        acl_obj_->src_tensor.allocator()->init(acp.src_tensor_info);
-+        acl_obj_->wei_tensor.allocator()->init(acp.wei_tensor_info);
-+        acl_obj_->dst_tensor.allocator()->init(acp.dst_tensor_info);
-+        acl_obj_->bia_tensor.allocator()->init(acp.bia_tensor_info);
- 
-         acl_obj_->conv.configure(&acl_obj_->src_tensor, &acl_obj_->wei_tensor,
-                 acp.with_bias ? &acl_obj_->bia_tensor : nullptr,
-diff --git a/src/cpu/aarch64/acl_indirect_gemm_convolution.hpp b/src/cpu/aarch64/acl_indirect_gemm_convolution.hpp
-index bcf031a771..b7c8dce894 100644
---- a/src/cpu/aarch64/acl_indirect_gemm_convolution.hpp
-+++ b/src/cpu/aarch64/acl_indirect_gemm_convolution.hpp
-@@ -1,5 +1,5 @@
- /*******************************************************************************
--* Copyright 2021-2022 Arm Ltd. and affiliates
-+* Copyright 2021-2023 Arm Ltd. and affiliates
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
-@@ -35,10 +35,10 @@ struct acl_indirect_gemm_resource_t : public resource_t {
-         if (!acl_obj_) return status::out_of_memory;
- 
-         // Init Compute Library tensors based on info from descriptor
--        acl_obj_->src_tensor.allocator()->init(acp.src_info);
--        acl_obj_->wei_tensor.allocator()->init(acp.wei_info);
--        acl_obj_->dst_tensor.allocator()->init(acp.dst_info);
--        acl_obj_->bia_tensor.allocator()->init(acp.bia_info);
-+        acl_obj_->src_tensor.allocator()->init(acp.src_tensor_info);
-+        acl_obj_->wei_tensor.allocator()->init(acp.wei_tensor_info);
-+        acl_obj_->dst_tensor.allocator()->init(acp.dst_tensor_info);
-+        acl_obj_->bia_tensor.allocator()->init(acp.bia_tensor_info);
- 
-         // clang-format off
-         acl_obj_->conv.configure(
-@@ -50,7 +50,9 @@ struct acl_indirect_gemm_resource_t : public resource_t {
-                                     acp.dilation_info,
-                                     acp.act_info,
-                                     acp.fast_math,
--                                    1));
-+                                    1,
-+                                    {},
-+                                    acp.weights_info));
-         // clang-format on
- 
-         return status::success;
-diff --git a/src/cpu/aarch64/acl_inner_product.hpp b/src/cpu/aarch64/acl_inner_product.hpp
-index c5e507085f..a27df640fb 100644
---- a/src/cpu/aarch64/acl_inner_product.hpp
-+++ b/src/cpu/aarch64/acl_inner_product.hpp
-@@ -40,11 +40,13 @@ struct acl_ip_conf_t {
-     // If this is true, the result of the inner product goes into a temporarily
-     // allocated ACL tensor to be accumulated into the oneDNN dst during postops
-     bool use_dst_acc;
--    arm_compute::TensorInfo src_info;
--    arm_compute::TensorInfo wei_info;
--    arm_compute::TensorInfo bia_info;
--    arm_compute::TensorInfo dst_info;
-+    arm_compute::TensorInfo src_tensor_info;
-+    arm_compute::TensorInfo wei_tensor_info;
-+    arm_compute::TensorInfo bia_tensor_info;
-+    arm_compute::TensorInfo dst_tensor_info;
-     arm_compute::FullyConnectedLayerInfo fc_info;
-+    // Additional information about the weights not included in wei_tensor_info
-+    arm_compute::WeightsInfo weights_info;
- };
- struct acl_ip_resource_t : public resource_t {
-     acl_ip_resource_t() : acl_ip_obj_(utils::make_unique<acl_ip_obj_t>()) {}
-@@ -53,10 +55,10 @@ struct acl_ip_resource_t : public resource_t {
-         if (!acl_ip_obj_) return status::out_of_memory;
- 
-         // Init Compute Library tensors based on info from descriptor
--        acl_ip_obj_->src_tensor.allocator()->init(aip.src_info);
--        acl_ip_obj_->wei_tensor.allocator()->init(aip.wei_info);
--        acl_ip_obj_->dst_tensor.allocator()->init(aip.dst_info);
--        acl_ip_obj_->bia_tensor.allocator()->init(aip.bia_info);
-+        acl_ip_obj_->src_tensor.allocator()->init(aip.src_tensor_info);
-+        acl_ip_obj_->wei_tensor.allocator()->init(aip.wei_tensor_info);
-+        acl_ip_obj_->dst_tensor.allocator()->init(aip.dst_tensor_info);
-+        acl_ip_obj_->bia_tensor.allocator()->init(aip.bia_tensor_info);
- 
-         // clang-format off
-         acl_ip_obj_->fc.configure(
-@@ -64,7 +66,8 @@ struct acl_ip_resource_t : public resource_t {
-             &acl_ip_obj_->wei_tensor,
-             aip.with_bias ? &acl_ip_obj_->bia_tensor : nullptr,
-             &acl_ip_obj_->dst_tensor,
--            aip.fc_info);
-+            aip.fc_info,
-+            aip.weights_info);
-         // clang-format on
- 
-         return status::success;
-@@ -89,12 +92,16 @@ struct acl_inner_product_fwd_t : public primitive_t {
-         DECLARE_COMMON_PD_T("acl", acl_inner_product_fwd_t);
- 
-         status_t init(engine_t *engine) {
--            const bool ok = is_fwd() && !has_zero_dim_memory()
--                    && expect_data_types(data_type::f32, data_type::f32,
--                            data_type::f32, data_type::f32, data_type::f32)
-+            using namespace data_type;
-+            const bool is_fp16_ok = expect_data_types(f16, f16, f16, f16, undef)
-+                && attr()->has_default_values(
-+                        primitive_attr_t::skip_mask_t::post_ops, f16);
-+            const bool is_fp32_ok = expect_data_types(f32, f32, f32, f32, undef)
-                     && attr()->has_default_values(
--                            primitive_attr_t::skip_mask_t::post_ops,
--                            data_type::f32)
-+                            primitive_attr_t::skip_mask_t::post_ops, f32);
-+            const bool ok = is_fwd() && !has_zero_dim_memory()
-+                    && utils::one_of(true, is_fp16_ok, is_fp32_ok)
-+                    && weights_md_.format_kind == format_kind::any
-                     && set_default_params() == status::success;
- 
-             if (!ok) return status::unimplemented;
-@@ -121,88 +128,46 @@ struct acl_inner_product_fwd_t : public primitive_t {
-             ACL_CHECK_SUPPORT(
-                     !(is_2d || is_4d), "ACL supports only 2d or 4d cases");
- 
--            // batch size
--            const int n = src_md()->dims[0];
--
--            // input and output channels
--            const int ic = src_md()->dims[1];
--            const int oc = dst_md()->dims[1];
--
--            // source spatial dimensions
--            const int ih = is_4d ? src_md()->dims[ndims - 2] : 0;
--            const int iw = is_4d ? src_md()->dims[ndims - 1] : 0;
--
--            // weights spatial dimensions
--            const int kh = is_4d ? weights_md()->dims[ndims - 2] : 0;
--            const int kw = is_4d ? weights_md()->dims[ndims - 1] : 0;
--
--            // Only NCHW or NHWC derivatives supported by ACL kernels
-             using namespace format_tag;
--            auto src_tag = memory_desc_matches_one_of_tag(
--                    src_md_, nhwc, nchw, nc, cn);
--            auto wei_tag = memory_desc_matches_one_of_tag(
--                    weights_md_, ohwi, oihw, oi, io);
--            auto dst_tag = memory_desc_matches_one_of_tag(dst_md_, nc, cn);
-+            auto src_tag
-+                    = memory_desc_matches_one_of_tag(src_md_, nhwc, nchw, nc);
-+            auto dst_tag = memory_desc_matches_one_of_tag(dst_md_, nc);
- 
-             ACL_CHECK_SUPPORT(
--                    utils::one_of(format_tag::undef, src_tag, wei_tag, dst_tag),
-+                    utils::one_of(format_tag::undef, src_tag, dst_tag),
-                     "unsupported memory layout");
- 
-             ACL_CHECK_SUPPORT(is_2d && src_tag != dst_tag,
-                     "for src and dst layouts must match");
- 
--            arm_compute::TensorShape src_shape, wei_shape;
--            if (is_2d) {
--                src_shape = (src_tag == nc) ? arm_compute::TensorShape(ic, n)
--                                            : arm_compute::TensorShape(n, ic);
--
--                wei_shape = (wei_tag == io) ? arm_compute::TensorShape(oc, ic)
--                                            : arm_compute::TensorShape(ic, oc);
--            }
--            if (is_4d) {
--                src_shape = (src_tag == nhwc)
--                        ? arm_compute::TensorShape(ic, iw, ih, n)
--                        : arm_compute::TensorShape(iw, ih, ic, n);
--
--                // ACL requires the weights to be in 2D flattened shape
--                const int flattened_ic = is_4d ? ic * kh * kw : ic;
--                wei_shape = arm_compute::TensorShape(flattened_ic, oc);
--            }
--
--            arm_compute::DataLayout src_layout = (src_tag == nhwc)
--                    ? arm_compute::DataLayout::NHWC
--                    : arm_compute::DataLayout::NCHW;
-+            const dim_t ic_total = IC_total();
-+            const dim_t n = MB();
-+            const dim_t oc = OC();
- 
--            arm_compute::DataLayout wei_layout = (wei_tag == ohwi)
--                    ? arm_compute::DataLayout::NHWC
--                    : arm_compute::DataLayout::NCHW;
-+            aip.src_tensor_info = arm_compute::TensorInfo(
-+                    arm_compute::TensorShape(ic_total, n), 1,
-+                    acl_utils::get_acl_data_t(src_md()->data_type));
- 
--            aip.src_info = arm_compute::TensorInfo(
--                    src_shape, 1, arm_compute::DataType::F32, src_layout);
-+            // ACL requires the weights to be in 2D flattened shape
-+            aip.wei_tensor_info = arm_compute::TensorInfo(
-+                    arm_compute::TensorShape(oc, ic_total), 1,
-+                    acl_utils::get_acl_data_t(weights_md(0)->data_type));
- 
--            aip.wei_info = arm_compute::TensorInfo(
--                    wei_shape, 1, arm_compute::DataType::F32, wei_layout);
--
--            aip.dst_info
--                    = arm_compute::TensorInfo(arm_compute::TensorShape(oc, n),
--                            1, arm_compute::DataType::F32);
-+            auto acl_dst_data_t
-+                    = acl_utils::get_acl_data_t(dst_md()->data_type);
-+            aip.dst_tensor_info = arm_compute::TensorInfo(
-+                    arm_compute::TensorShape(oc, n), 1, acl_dst_data_t);
- 
-             aip.with_bias = desc()->bias_desc.format_kind != format_kind::undef;
--            aip.bia_info = arm_compute::TensorInfo(aip.with_bias
-+            auto acl_bia_data_t = aip.with_bias
-+                    ? acl_utils::get_acl_data_t(weights_md(1)->data_type)
-+                    : acl_dst_data_t;
-+            aip.bia_tensor_info = arm_compute::TensorInfo(aip.with_bias
-                             ? arm_compute::TensorShape(oc)
-                             : arm_compute::TensorShape(),
-                     1, arm_compute::DataType::F32);
- 
--            aip.fc_info.weights_trained_layout = wei_layout;
--            if (is_2d && wei_tag != src_tag) {
--                // weights are already transposed
--                aip.fc_info.transpose_weights = false;
--
--                if (desc()->prop_kind == dnnl_forward_training) {
--                    aip.wei_info.set_are_values_constant(false);
--                    aip.fc_info.are_weights_reshaped = true;
--                }
--            }
-+            aip.fc_info.transpose_weights = false;
- 
-             // Fast math mode
-             auto math_mode = get_fpmath_mode();
-@@ -214,15 +179,103 @@ struct acl_inner_product_fwd_t : public primitive_t {
-                     aip.fc_info.activation_info));
-             aip.use_dst_acc = post_ops.has_sum();
- 
-+            // WeightFormat::ANY tells ACL we can handle any format
-+            aip.weights_info = arm_compute::WeightsInfo(false, 1, 1, ic_total,
-+                    false, arm_compute::WeightFormat::ANY);
-+
-+            // Get the format that the ACL kernel will expect the weights to be
-+            // in (if a kernel exists) Note that these are referred to as fixed
-+            // format kernels, because they require one specific weights format
-+            arm_compute::WeightFormat expected_weight_format;
-+            ACL_CHECK_VALID(arm_compute::NEFullyConnectedLayer::has_opt_impl(
-+                    expected_weight_format, &aip.src_tensor_info,
-+                    &aip.wei_tensor_info,
-+                    aip.with_bias ? &aip.bia_tensor_info : nullptr,
-+                    &aip.dst_tensor_info, aip.fc_info, aip.weights_info));
-+
-+            // Set weights info to the one returned by has_opt_impl
-+            aip.weights_info.set_weight_format(expected_weight_format);
-+
-+            // has_opt_impl may return a non fast math kernel, even if requested
-+            aip.fc_info.enable_fast_math
-+                    = arm_compute::is_fixed_format_fast_math(
-+                            expected_weight_format);
-+
-+            // Inner product is the same as the matmul n x (chw) * (ihw) x o
-+            // (note that the src c and weights i both correspond to the input
-+            // channel). ACL FullyConnectedLayer assumes the chw dimensions of
-+            // src and ihw dimensions of weights are collapsed, so we need to
-+            // make sure that they have the same layout. Given that weights are
-+            // more often fixed, (so reorders can be hoisted) it makes sense to
-+            // reorder the weights to fit the src.
-+
-+            // For 4D tensors we need to:
-+            // - reorder the ihw of the weights to match the src chw
-+            // - collapse ihw
-+            // - pad the collapsed ihw
-+            // But there is not yet a way to express this collapse+pad as a
-+            // reorder. So we try to reorder the weights to match the src,
-+            // implicitly collapse ihw in our definition of the weights
-+            // TensorInfo and hope that the inner_dim has zero padding
-+            // (weights_md_.dims[inner_dim] % block_by == 0). If it does, we
-+            // fall back to a kernel without blocking (currently this is
-+            // equivalent to non-fastmath).
-+
-+            // 2D just works because we just pad the only dimension.
-+
-+            // o_dim is always the first logical dimension (oihw, ohwi, oi)
-+            dim_t o_dim = 0;
-+            dim_t inner_dim;
-+            // Rest of logical dimensions in order of innermost to outermost
-+            std::vector<dim_t> remaining_dims = {};
-+
-+            if (src_tag == nchw) {
-+                inner_dim = 3; // w
-+                remaining_dims = {2, 1}; // h, i
-+            } else if (src_tag == nhwc) {
-+                inner_dim = 1; // i
-+                remaining_dims = {3, 2}; // w, h
-+            } else { // Only remaining case is 2D (nc)
-+                inner_dim = 1; // i
-+                remaining_dims = {}; // No other dimensions for 2D
-+            }
-+
-+            // Fallback
-+            int block_by = arm_compute::block_by(expected_weight_format);
-+            if (is_4d && weights_md_.dims[inner_dim] % block_by != 0
-+                    && aip.fc_info.enable_fast_math) {
-+                aip.fc_info.enable_fast_math = false;
-+                aip.weights_info.set_weight_format(
-+                        arm_compute::WeightFormat::ANY);
-+                ACL_CHECK_VALID(
-+                        arm_compute::NEFullyConnectedLayer::has_opt_impl(
-+                                expected_weight_format, &aip.src_tensor_info,
-+                                &aip.wei_tensor_info,
-+                                aip.with_bias ? &aip.bia_tensor_info : nullptr,
-+                                &aip.dst_tensor_info, aip.fc_info,
-+                                aip.weights_info));
-+                aip.weights_info.set_weight_format(expected_weight_format);
-+                block_by = arm_compute::block_by(expected_weight_format);
-+                if (weights_md_.dims[inner_dim] % block_by != 0)
-+                    return status::unimplemented;
-+            }
-+
-+            acl_utils::reorder_to_weight_format(aip.wei_tensor_info,
-+                    weights_md_, expected_weight_format, inner_dim, o_dim,
-+                    remaining_dims, {});
-+
-             // clang-format off
-+
-             // Validate fully connected layer manually to check for return status
-             ACL_CHECK_VALID(arm_compute::NEFullyConnectedLayer::validate(
--                &aip.src_info,
--                &aip.wei_info,
--                aip.with_bias ? &aip.bia_info : nullptr,
--                &aip.dst_info,
--                aip.fc_info));
-+                &aip.src_tensor_info,
-+                &aip.wei_tensor_info,
-+                aip.with_bias ? &aip.bia_tensor_info : nullptr,
-+                &aip.dst_tensor_info,
-+                aip.fc_info,
-+                aip.weights_info));
-             // clang-format on
-+
-             return status::success;
-         }
-     }; // pd_t
-diff --git a/src/cpu/aarch64/acl_utils.cpp b/src/cpu/aarch64/acl_utils.cpp
-index 79ea775d6d..5792fd4911 100644
---- a/src/cpu/aarch64/acl_utils.cpp
-+++ b/src/cpu/aarch64/acl_utils.cpp
-@@ -1,5 +1,5 @@
- /*******************************************************************************
--* Copyright 2021-2022 Arm Ltd. and affiliates
-+* Copyright 2021-2023 Arm Ltd. and affiliates
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
-@@ -261,6 +261,75 @@ int reorder_dimensions_by_stride(std::vector<memory_desc_t *> permuted_mds,
-     return reordered_dims;
- }
- 
-+void reorder_to_weight_format(arm_compute::TensorInfo &info, memory_desc_t &md,
-+        arm_compute::WeightFormat wf, dim_t I_dim, dim_t O_dim,
-+        std::vector<dim_t> spatial_dims, std::vector<dim_t> batch_dims) {
-+
-+    md.format_kind = format_kind::blocked;
-+    md.format_desc.blocking = blocking_desc_t {};
-+    const int interleaved_by = arm_compute::interleave_by(wf);
-+    const int block_by = arm_compute::block_by(wf);
-+
-+    // I dimension becomes densest (apart from blocking)
-+    md.format_desc.blocking.strides[I_dim] = interleaved_by * block_by;
-+    md.padded_dims[I_dim] = utils::rnd_up(md.dims[I_dim], block_by);
-+
-+    // Then any spatial dimensions (e.g. HW)
-+    dim_t ldb = interleaved_by * md.padded_dims[I_dim];
-+    for (dim_t sd : spatial_dims) {
-+        md.format_desc.blocking.strides[sd] = ldb;
-+        ldb *= md.padded_dims[sd];
-+    }
-+
-+    // O dim (which was the innermost) becomes the outermost (apart from batching)
-+    md.format_desc.blocking.strides[O_dim] = ldb;
-+    md.padded_dims[O_dim] = utils::rnd_up(md.dims[O_dim], interleaved_by);
-+
-+    // Update the batch dimensions, starting with stride of the innermost batch
-+    const dim_t innermost_batch_stride
-+            = md.padded_dims[I_dim] * md.padded_dims[O_dim];
-+    dim_t batch_stride = innermost_batch_stride;
-+    for (dim_t bd : batch_dims) {
-+        md.format_desc.blocking.strides[bd] = batch_stride;
-+        batch_stride *= md.padded_dims[bd];
-+    }
-+
-+    // Weights can only be blocked if they are also interleaved
-+    if (interleaved_by > 1) {
-+        md.format_desc.blocking.inner_nblks = 1 + (block_by > 1);
-+
-+        md.format_desc.blocking.inner_idxs[0] = O_dim;
-+        md.format_desc.blocking.inner_blks[0] = interleaved_by;
-+        if (block_by > 1) {
-+            md.format_desc.blocking.inner_idxs[1] = I_dim;
-+            md.format_desc.blocking.inner_blks[1] = block_by;
-+        }
-+    }
-+
-+    if (arm_compute::is_fixed_format_fast_math(wf)) {
-+        md.data_type = dnnl_bf16;
-+        info.set_data_type(arm_compute::DataType::BFLOAT16);
-+    }
-+
-+    // The data layout is now determined by the manually set strides
-+    info.set_data_layout(arm_compute::DataLayout::UNKNOWN);
-+
-+    // x is ignored in fixed format kernels
-+    // y is the leading dimension of b (ldb) in the GEMM d = a*b + c
-+    //   This is the stride of O_dim in the md
-+    // z is the batch dimension (not strictly needed if there's only 1 batch)
-+    //   i.e. how much do I need to stride to get to the next matmul (ignoring
-+    //   the interleaving). Note that we use the innermost_batch_stride
-+    //   because all the batched dimensions are collapsed (as required by ACL).
-+    arm_compute::Strides new_strides_in_bytes = info.strides_in_bytes();
-+    new_strides_in_bytes.set(1, ldb * info.element_size());
-+    new_strides_in_bytes.set(2, innermost_batch_stride * info.element_size());
-+
-+    info.init(info.tensor_shape(), info.num_channels(), info.data_type(),
-+            new_strides_in_bytes, info.offset_first_element_in_bytes(),
-+            memory_desc_wrapper(md).size());
-+}
-+
- } // namespace acl_utils
- 
- } // namespace aarch64
-diff --git a/src/cpu/aarch64/acl_utils.hpp b/src/cpu/aarch64/acl_utils.hpp
-index 28693bb167..d9affe1c8f 100644
---- a/src/cpu/aarch64/acl_utils.hpp
-+++ b/src/cpu/aarch64/acl_utils.hpp
-@@ -1,5 +1,5 @@
- /*******************************************************************************
--* Copyright 2021-2022 Arm Ltd. and affiliates
-+* Copyright 2021-2023 Arm Ltd. and affiliates
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
-@@ -74,6 +74,28 @@ status_t insert_singleton_dimension(arm_compute::TensorInfo &ti, size_t dim_i);
- int reorder_dimensions_by_stride(std::vector<memory_desc_t *> permuted_mds,
-         std::vector<const memory_desc_t *> mds);
- 
-+// Reorder a memory_desc_t and set the strides on a arm_compute::TensorInfo to
-+// match an arm_compute::WeightFormat. You are required to specify how various
-+// logical dimensions in oneDNN correspond to logical dimensions in arm_compute.
-+// info  TensorInfo where the strides will be changed to match the reordering
-+// md    memory descriptor where the stride and padded dimensions will be
-+//       changed or reordering
-+// wf    Describes the memory format/layout of the weights
-+// I_dim The logical dimension of md corresponding to the input channel of
-+//       a convolution or the K dimension in a matmul
-+// O_dim The logical dimension of md corresponding to the output channel of a
-+//       convolution or the N dimension in a matmul
-+// spatial_dims The logical dimensions of md corresponding to the spatial
-+//              dimensions of the weights (H, W, D for example). These will be
-+//              the next densest after the inner blocks and the input channel.
-+// batch_dims The logical dimensions of md related to the batch in a batched
-+//            matmul, ordered from innermost to outermost. ACL calls these
-+//            the multi_stride_b. These will become the outermost (least dense)
-+//            dimensions and will be collapsed.
-+void reorder_to_weight_format(arm_compute::TensorInfo &info, memory_desc_t &md,
-+        arm_compute::WeightFormat wf, dim_t I_dim, dim_t O_dim,
-+        std::vector<dim_t> spatial_dims, std::vector<dim_t> batch_dims = {});
-+
- // Logs a custom 'info' line describing an unsupported case
- #define LOG_ACL_UNSUPPORTED(msg) \
-     do { \
-diff --git a/src/cpu/aarch64/matmul/acl_matmul.cpp b/src/cpu/aarch64/matmul/acl_matmul.cpp
-index dce220fb6e..ca1c7eb47e 100644
---- a/src/cpu/aarch64/matmul/acl_matmul.cpp
-+++ b/src/cpu/aarch64/matmul/acl_matmul.cpp
-@@ -1,5 +1,5 @@
- /*******************************************************************************
--* Copyright 2021-2022 Arm Ltd. and affiliates
-+* Copyright 2021-2023 Arm Ltd. and affiliates
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
-@@ -31,36 +31,19 @@ status_t acl_matmul_t::execute_forward(const exec_ctx_t &ctx) const {
-     auto wei_base = CTX_IN_MEM(const data_t *, DNNL_ARG_WEIGHTS);
- 
-     bool is_transA = pd()->amp_.is_transA;
--    bool is_transB = pd()->amp_.is_transB;
-     bool use_dst_acc = pd()->amp_.use_dst_acc;
- 
-     std::lock_guard<std::mutex> _lock {this->mtx};
-     auto *acl_resource = ctx.get_resource_mapper()->get<acl_resource_t>(this);
-     acl_matmul_obj_t &acl_obj = acl_resource->get_acl_obj();
-     // Run transpose kernel
--    if (is_transA && !is_transB) {
-+    if (is_transA) {
-         acl_obj.src_tensor.allocator()->allocate();
-         acl_obj.src_acc_tensor.allocator()->import_memory(
-                 const_cast<data_t *>(src_base));
-         acl_obj.transA.run();
-         acl_obj.wei_tensor.allocator()->import_memory(
-                 const_cast<data_t *>(wei_base));
--    } else if (is_transB && !is_transA) {
--        acl_obj.wei_tensor.allocator()->allocate();
--        acl_obj.wei_acc_tensor.allocator()->import_memory(
--                const_cast<data_t *>(wei_base));
--        acl_obj.transB.run();
--        acl_obj.src_tensor.allocator()->import_memory(
--                const_cast<data_t *>(src_base));
--    } else if (is_transA && is_transB) {
--        acl_obj.src_tensor.allocator()->allocate();
--        acl_obj.src_acc_tensor.allocator()->import_memory(
--                const_cast<data_t *>(src_base));
--        acl_obj.wei_tensor.allocator()->allocate();
--        acl_obj.wei_acc_tensor.allocator()->import_memory(
--                const_cast<data_t *>(wei_base));
--        acl_obj.transA.run();
--        acl_obj.transB.run();
-     } else {
-         acl_obj.src_tensor.allocator()->import_memory(
-                 const_cast<data_t *>(src_base));
-@@ -69,7 +52,7 @@ status_t acl_matmul_t::execute_forward(const exec_ctx_t &ctx) const {
-     }
- 
-     if (use_dst_acc) {
--        // Put the result in a new tensor, it will be accumalated to the dst
-+        // Put the result in a new tensor, it will be accumulated to the dst
-         // during the post ops
-         acl_obj.dst_tensor.allocator()->allocate();
-     } else {
-@@ -82,7 +65,6 @@ status_t acl_matmul_t::execute_forward(const exec_ctx_t &ctx) const {
-     acl_obj.src_tensor.allocator()->free();
-     acl_obj.wei_tensor.allocator()->free();
-     if (is_transA) acl_obj.src_acc_tensor.allocator()->free();
--    if (is_transB) acl_obj.wei_acc_tensor.allocator()->free();
- 
-     void *dst = acl_obj.dst_tensor.buffer();
-     pd()->post_ops.execute(ctx, dst);
-diff --git a/src/cpu/aarch64/matmul/acl_matmul.hpp b/src/cpu/aarch64/matmul/acl_matmul.hpp
-index cdc942e995..832b1dbb68 100644
---- a/src/cpu/aarch64/matmul/acl_matmul.hpp
-+++ b/src/cpu/aarch64/matmul/acl_matmul.hpp
-@@ -32,20 +32,15 @@ struct acl_resource_t : public resource_t {
- 
-     status_t configure(const acl_matmul_conf_t &amp) {
-         if (!acl_obj_) return status::out_of_memory;
--        acl_obj_->src_tensor.allocator()->init(amp.src_info);
--        acl_obj_->wei_tensor.allocator()->init(amp.wei_info);
--        acl_obj_->dst_tensor.allocator()->init(amp.dst_info);
-+        acl_obj_->src_tensor.allocator()->init(amp.src_tensor_info);
-+        acl_obj_->wei_tensor.allocator()->init(amp.wei_tensor_info);
-+        acl_obj_->dst_tensor.allocator()->init(amp.dst_tensor_info);
-         // Configure transpose kernel for src, wei or both
-         if (amp.is_transA) {
-             acl_obj_->src_acc_tensor.allocator()->init(amp.src_acc_info);
-             acl_obj_->transA.configure(
-                     &acl_obj_->src_acc_tensor, &acl_obj_->src_tensor);
-         }
--        if (amp.is_transB) {
--            acl_obj_->wei_acc_tensor.allocator()->init(amp.wei_acc_info);
--            acl_obj_->transB.configure(
--                    &acl_obj_->wei_acc_tensor, &acl_obj_->wei_tensor);
--        }
-         // Configure GEMM
-         acl_obj_->gemm.configure(&acl_obj_->src_tensor, &acl_obj_->wei_tensor,
-                 nullptr, &acl_obj_->dst_tensor, amp.alpha, 0.0f, amp.gemm_info);
-@@ -72,12 +67,20 @@ struct acl_matmul_t : public primitive_t {
- 
-         status_t init(engine_t *engine) {
-             using smask_t = primitive_attr_t::skip_mask_t;
--            bool ok = src_md()->data_type == data_type::f32
--                    && weights_md()->data_type == data_type::f32
--                    && desc()->accum_data_type == data_type::f32
--                    && dst_md()->data_type == data_type::f32
--                    && platform::has_data_type_support(data_type::f32)
-+            const bool is_fp32_ok
-+                    = utils::everyone_is(data_type::f32, src_md()->data_type,
-+                              weights_md()->data_type, dst_md()->data_type,
-+                              desc()->accum_data_type)
-+                    && platform::has_data_type_support(data_type::f32);
-+            const bool is_fp16_ok
-+                    = utils::everyone_is(data_type::f16, src_md()->data_type,
-+                              weights_md()->data_type, dst_md()->data_type)
-+                    && platform::has_data_type_support(data_type::f16);
-+            bool ok = is_dense_data()
-+                    && utils::one_of(true, is_fp32_ok, is_fp16_ok)
-                     && !has_zero_dim_memory()
-+                    && weights_md_.format_kind == format_kind::any
-+                    && set_default_formats()
-                     && attr()->has_default_values(
-                             smask_t::oscale | smask_t::post_ops)
-                     && attr_oscale_ok() && !has_runtime_dims_or_strides();
-@@ -92,9 +95,9 @@ struct acl_matmul_t : public primitive_t {
-             amp_.use_dst_acc = post_ops.has_sum();
- 
-             // Validate ACL GEMM
--            ACL_CHECK_VALID(arm_compute::NEGEMM::validate(&amp_.src_info,
--                    &amp_.wei_info, nullptr, &amp_.dst_info, amp_.alpha, 0.0f,
--                    amp_.gemm_info));
-+            ACL_CHECK_VALID(arm_compute::NEGEMM::validate(&amp_.src_tensor_info,
-+                    &amp_.wei_tensor_info, nullptr, &amp_.dst_tensor_info,
-+                    amp_.alpha, 0.0f, amp_.gemm_info));
- 
-             return status::success;
-         }
-diff --git a/src/cpu/aarch64/matmul/acl_matmul_utils.cpp b/src/cpu/aarch64/matmul/acl_matmul_utils.cpp
-index 679baec3a4..30bc2c1443 100644
---- a/src/cpu/aarch64/matmul/acl_matmul_utils.cpp
-+++ b/src/cpu/aarch64/matmul/acl_matmul_utils.cpp
-@@ -41,6 +41,7 @@ status_t init_conf_matmul(acl_matmul_conf_t &amp, memory_desc_t &src_md,
-     const dim_t src_batch = helper.src_batch();
-     const dim_t wei_batch = helper.wei_batch();
- 
-+    // We can only broadcast on one of src or wei at once
-     // ACL supports broadcast for 3D shapes, and 4D shapes
-     // for e.g when ab in abcd is 1x1
-     bool batch_ok = IMPLICATION(src_batch > 1, wei_batch == 1)
-@@ -53,44 +54,33 @@ status_t init_conf_matmul(acl_matmul_conf_t &amp, memory_desc_t &src_md,
-     bool with_bias = md.bias_desc.format_kind != format_kind::undef;
-     ACL_CHECK_SUPPORT(with_bias, "ACL does not support bias for matmul");
- 
-+    // The two innermost dimensions can be transposed, but the batch dimensions
-+    // must be the outermost
-     using namespace format_tag;
-     auto src_tag = memory_desc_matches_one_of_tag(
-             src_md, abcd, abdc, abc, acb, ab, ba);
--    auto wei_tag = memory_desc_matches_one_of_tag(
--            wei_md, abcd, abdc, abc, acb, ab, ba);
--    auto dst_tag
--            = memory_desc_matches_one_of_tag(dst_md, abcd, abc, acb, ab, ba);
--    ACL_CHECK_SUPPORT(
--            utils::one_of(format_tag::undef, src_tag, wei_tag, dst_tag),
-+    auto dst_tag = memory_desc_matches_one_of_tag(dst_md, abcd, abc, ab, ba);
-+    ACL_CHECK_SUPPORT(utils::one_of(format_tag::undef, src_tag, dst_tag),
-             "Format tag is undefined");
- 
--    // Transpose A (src) or B (wei)
-+    // Transpose A (src)
-     amp.is_transA = helper.transA() == 'T';
--    amp.is_transB = helper.transB() == 'T';
-+
-+    auto acl_src_data_t = acl_utils::get_acl_data_t(src_md.data_type);
-+    auto acl_wei_data_t = acl_utils::get_acl_data_t(wei_md.data_type);
-+    auto acl_dst_data_t = acl_utils::get_acl_data_t(dst_md.data_type);
-+
-     if (amp.is_transA)
-         amp.src_acc_info = arm_compute::TensorInfo(
-                 arm_compute::TensorShape(M, K, 1, src_batch), 1,
--                arm_compute::DataType::F32);
--    if (amp.is_transB)
--        amp.wei_acc_info = arm_compute::TensorInfo(
--                arm_compute::TensorShape(K, N, wei_batch), 1,
--                arm_compute::DataType::F32);
--
--    amp.src_info = arm_compute::TensorInfo(
--            arm_compute::TensorShape(K, M, 1, src_batch), 1,
--            arm_compute::DataType::F32);
--    amp.wei_info
--            = arm_compute::TensorInfo(arm_compute::TensorShape(N, K, wei_batch),
--                    1, arm_compute::DataType::F32);
--    amp.dst_info = arm_compute::TensorInfo(
--            arm_compute::TensorShape(N, M, 1, dst_batch), 1,
--            arm_compute::DataType::F32);
--
--    // Fast-math mode
--    auto math_mode = get_fpmath_mode();
--    bool is_fastmath_enabled
--            = utils::one_of(math_mode, fpmath_mode::bf16, fpmath_mode::any);
--    amp.gemm_info.set_fast_math(is_fastmath_enabled);
-+                acl_src_data_t);
-+
-+    amp.src_tensor_info = arm_compute::TensorInfo(
-+            arm_compute::TensorShape(K, M, 1, src_batch), 1, acl_src_data_t);
-+    amp.wei_tensor_info = arm_compute::TensorInfo(
-+            arm_compute::TensorShape(N, K, wei_batch), 1, acl_wei_data_t);
-+    amp.dst_tensor_info = arm_compute::TensorInfo(
-+            arm_compute::TensorShape(N, M, 1, dst_batch), 1, acl_dst_data_t);
- 
-     // Set alpha (output scaling)
-     amp.alpha = attr.output_scales_.scales_[0];
-@@ -98,10 +88,45 @@ status_t init_conf_matmul(acl_matmul_conf_t &amp, memory_desc_t &src_md,
-     // Validate ACL transpose
-     if (amp.is_transA)
-         ACL_CHECK_VALID(arm_compute::NETranspose::validate(
--                &amp.src_acc_info, &amp.src_info));
--    if (amp.is_transB)
--        ACL_CHECK_VALID(arm_compute::NETranspose::validate(
--                &amp.wei_acc_info, &amp.wei_info));
-+                &amp.src_acc_info, &amp.src_tensor_info));
-+
-+    bool is_fastmath_enabled = utils::one_of(
-+            attr.fpmath_mode_, fpmath_mode::bf16, fpmath_mode::any);
-+    amp.gemm_info.set_fast_math(is_fastmath_enabled);
-+
-+    amp.gemm_info.set_fixed_format(true);
-+
-+    // WeightFormat::ANY tells ACL we can handle any format
-+    amp.gemm_info.set_weight_format(arm_compute::WeightFormat::ANY);
-+
-+    // Get the format that the ACL kernel will expect the weights to be
-+    // in (if a kernel exists). Note that these are referred to as fixed format
-+    // kernels, because they require one specific weights format
-+    arm_compute::WeightFormat expected_weight_format;
-+    ACL_CHECK_VALID(arm_compute::NEGEMM::has_opt_impl(expected_weight_format,
-+            &amp.src_tensor_info, &amp.wei_tensor_info, nullptr,
-+            &amp.dst_tensor_info, amp.alpha, 0.0f, amp.gemm_info));
-+
-+    // Set gemm weights info to the one returned by has_opt_impl
-+    amp.gemm_info.set_weight_format(expected_weight_format);
-+
-+    // has_opt_impl may return a non fast math kernel, even if we requested one
-+    amp.gemm_info.set_fast_math(
-+            arm_compute::is_fixed_format_fast_math(expected_weight_format));
-+
-+    // Logical dimension indices
-+    dim_t innermost_dim = wei_md.ndims - 1;
-+    dim_t N_dim = innermost_dim;
-+    dim_t K_dim = innermost_dim - 1;
-+
-+    // The logical indices of dimensions related to the batch, ordered from
-+    // innermost to outermost
-+    std::vector<dim_t> batch_dims = {};
-+    for (dim_t i = K_dim - 1; i >= 0; --i)
-+        batch_dims.push_back(i);
-+
-+    acl_utils::reorder_to_weight_format(amp.wei_tensor_info, wei_md,
-+            expected_weight_format, K_dim, N_dim, {}, batch_dims);
- 
-     return status::success;
- }
-diff --git a/src/cpu/aarch64/matmul/acl_matmul_utils.hpp b/src/cpu/aarch64/matmul/acl_matmul_utils.hpp
-index 0a5ee6a987..67bb2e78eb 100644
---- a/src/cpu/aarch64/matmul/acl_matmul_utils.hpp
-+++ b/src/cpu/aarch64/matmul/acl_matmul_utils.hpp
-@@ -1,5 +1,5 @@
- /*******************************************************************************
--* Copyright 2021-2022 Arm Ltd. and affiliates
-+* Copyright 2021-2023 Arm Ltd. and affiliates
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
-@@ -29,25 +29,21 @@ namespace aarch64 {
- struct acl_matmul_obj_t {
-     arm_compute::NEGEMM gemm;
-     arm_compute::NETranspose transA;
--    arm_compute::NETranspose transB;
-     arm_compute::Tensor src_tensor;
-     arm_compute::Tensor src_acc_tensor;
-     arm_compute::Tensor wei_tensor;
--    arm_compute::Tensor wei_acc_tensor;
-     arm_compute::Tensor dst_tensor;
- };
- 
- struct acl_matmul_conf_t {
-     bool is_transA;
--    bool is_transB;
-     // If this is true, the result of the matmul goes into a temporarily
-     // allocated ACL tensor to be accumulated into the oneDNN dst during postops
-     bool use_dst_acc;
--    arm_compute::TensorInfo src_info;
-+    arm_compute::TensorInfo src_tensor_info;
-     arm_compute::TensorInfo src_acc_info;
--    arm_compute::TensorInfo wei_info;
--    arm_compute::TensorInfo wei_acc_info;
--    arm_compute::TensorInfo dst_info;
-+    arm_compute::TensorInfo wei_tensor_info;
-+    arm_compute::TensorInfo dst_tensor_info;
-     arm_compute::GEMMInfo gemm_info;
-     float alpha;
- };
diff --git a/third_party/mkl_dnn/onednn_acl_fp32_bf16_reorder.patch b/third_party/mkl_dnn/onednn_acl_fp32_bf16_reorder.patch
new file mode 100644
index 00000000000000..202902a1894a86
--- /dev/null
+++ b/third_party/mkl_dnn/onednn_acl_fp32_bf16_reorder.patch
@@ -0,0 +1,111 @@
+ *******************************************************************************
+ Copyright 2023 Arm Limited and affiliates.
+ SPDX-License-Identifier: Apache-2.0
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+ *******************************************************************************
+diff --git a/src/cpu/aarch64/cpu_isa_traits.hpp b/src/cpu/aarch64/cpu_isa_traits.hpp
+index 4a43b24c5..1a5cfe590 100644
+--- a/src/cpu/aarch64/cpu_isa_traits.hpp
++++ b/src/cpu/aarch64/cpu_isa_traits.hpp
+@@ -1,6 +1,7 @@
+ /*******************************************************************************
+ * Copyright 2018-2023 Intel Corporation
+ * Copyright 2020-2023 FUJITSU LIMITED
++* Copyright 2023 Arm Ltd. and affiliates
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+@@ -211,10 +212,10 @@ static inline bool mayiuse_atomic() {
+     return cpu().isAtomicSupported();
+ }
+ 
+-inline bool isa_has_bf16(cpu_isa_t isa) {
+-    return false;
++static inline bool mayiuse_bf16() {
++    using namespace Xbyak_aarch64::util;
++    return cpu().isBf16Supported();
+ }
+-
+ } // namespace
+ 
+ /* whatever is required to generate string literals... */
+diff --git a/src/cpu/aarch64/jit_uni_reorder.cpp b/src/cpu/aarch64/jit_uni_reorder.cpp
+index 6bd259ec2..5541bb702 100644
+--- a/src/cpu/aarch64/jit_uni_reorder.cpp
++++ b/src/cpu/aarch64/jit_uni_reorder.cpp
+@@ -1,7 +1,7 @@
+ /*******************************************************************************
+ * Copyright 2018-2023 Intel Corporation
+ * Copyright 2020-2023 FUJITSU LIMITED
+-* Copyright 2022 Arm Ltd. and affiliates
++* Copyright 2022-2023 Arm Ltd. and affiliates
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+@@ -163,11 +163,11 @@ struct jit_uni_reorder_kernel_f32_t : public kernel_t, public jit_generator {
+ 
+         bool ok = true && p.ndims > 0
+                 && utils::one_of(p.itype, f32, s32, data_type::s8, u8)
+-                && utils::one_of(p.otype, f32, s32, data_type::s8, u8)
++                && utils::one_of(p.otype, f32, bf16, s32, data_type::s8, u8)
+                 && utils::everyone_is(0, p.ioff, p.ooff) /* do we need this? */
+                 && utils::one_of(p.beta, 0.f, 1.f) /* anything else? */
+-                && simple_impl_desc_init(p, nullptr)
+-                && prb_has_small_strides(p);
++                && simple_impl_desc_init(p, nullptr) && prb_has_small_strides(p)
++                && ((p.otype != bf16) || (p.itype == f32 && mayiuse_bf16()));
+ 
+         return ok;
+     }
+@@ -648,6 +648,9 @@ struct jit_uni_reorder_kernel_f32_t : public kernel_t, public jit_generator {
+                         cvt_v_s32_u8(startIdx, regNum);
+                     if (idt == data_type::s8) cvt_v_s8_u8(startIdx, regNum);
+                     break;
++                case bf16:
++                    if (idt == f32) cvt_v_f32_bf16(startIdx, regNum);
++                    break;
+                 default: assert(!"unreachable");
+             }
+         };
+@@ -1677,6 +1680,10 @@ struct jit_uni_reorder_kernel_f32_t : public kernel_t, public jit_generator {
+         UNROLL_INST(fcvtzs, VReg4S, tmp, tmp);
+     }
+ 
++    void cvt_v_f32_bf16(const size_t startIdx, const size_t regNum) {
++        UNROLL_INST2(bfcvtn, VReg4H(i), VReg4S(i));
++    }
++
+     void cvt_z_s8_s32(const size_t startIdx, const size_t regNum) {
+         cvt_z_b_s(startIdx, regNum);
+         UNROLL_INST(sxtb, ZRegS, tmp, P_ALL_ONE / T_m, tmp);
+diff --git a/src/cpu/reorder/cpu_reorder_regular_f32_bf16.cpp b/src/cpu/reorder/cpu_reorder_regular_f32_bf16.cpp
+index ba5499ba9..d4e21d316 100644
+--- a/src/cpu/reorder/cpu_reorder_regular_f32_bf16.cpp
++++ b/src/cpu/reorder/cpu_reorder_regular_f32_bf16.cpp
+@@ -1,5 +1,6 @@
+ /*******************************************************************************
+ * Copyright 2020-2022 Intel Corporation
++* Copyright 2023 Arm Ltd. and affiliates
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+@@ -34,6 +35,8 @@ const impl_list_map_t &regular_f32_bf16_impl_list_map() {
+             DNNL_NON_X64_ONLY(REG_SR_BIDIR(f32, any, bf16, nChw16c))
+             DNNL_NON_X64_ONLY(REG_SR_BIDIR(f32, any, bf16, nCdhw16c))
+ 
++            DNNL_AARCH64_ONLY(CPU_REORDER_INSTANCE(aarch64::jit_uni_reorder_t))
++
+             DNNL_NON_X64_ONLY(REG_SR(f32, oihw, bf16, OIhw8i16o2i, fmt_order::keep))
+             DNNL_NON_X64_ONLY(REG_SR(f32, goihw, bf16, gOIhw8i16o2i, fmt_order::keep))
+             DNNL_NON_X64_ONLY(REG_SR(f32, oihw, bf16, OIhw8o16i2o, fmt_order::keep))
diff --git a/third_party/mkl_dnn/onednn_acl_remove_winograd.patch b/third_party/mkl_dnn/onednn_acl_remove_winograd.patch
deleted file mode 100644
index 18abcc8f54e922..00000000000000
--- a/third_party/mkl_dnn/onednn_acl_remove_winograd.patch
+++ /dev/null
@@ -1,326 +0,0 @@
- *******************************************************************************
- Copyright 2023 Arm Limited and affiliates.
- SPDX-License-Identifier: Apache-2.0
-
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
-
-     http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
- *******************************************************************************
-diff --git a/src/cpu/aarch64/acl_convolution_utils.cpp b/src/cpu/aarch64/acl_convolution_utils.cpp
-index c46d697575..37f8ecbc06 100644
---- a/src/cpu/aarch64/acl_convolution_utils.cpp
-+++ b/src/cpu/aarch64/acl_convolution_utils.cpp
-@@ -271,54 +271,6 @@ status_t init_conf_indirect_gemm(acl_conv_conf_t &acp, memory_desc_t &src_md,
-     return status::success;
- }
- 
--status_t init_conf_wino(acl_conv_conf_t &acp, memory_desc_t &src_md,
--        memory_desc_t &weights_md, memory_desc_t &dst_md,
--        memory_desc_t &bias_md, const convolution_desc_t &cd,
--        const primitive_attr_t &attr) {
--
--    // Under these conditions, fallback to faster GEMM-based convolution
--    // unless the user explicitly specifies Winograd algorithm
--    // clang-format off
--    if (one_of(true, src_md.dims[2] > 112, // ih
--                src_md.dims[3] > 112, // iw
--                src_md.dims[1] < 64, // ic
--                dst_md.dims[1] < 64, // oc
--                dnnl_get_max_threads() > 28)
--            && cd.alg_kind == alg_kind::convolution_auto) {
--        return status::unimplemented;
--    }
--    // clang-format on
--
--    // General Compute Library checks, memory tags are also set there
--    CHECK(acl_init_conf(acp, src_md, weights_md, dst_md, bias_md, cd, attr));
--
--    const bool shape_ok
--            // only unit strides allowed
--            = (acp.padstride_info.stride() == std::pair<uint, uint> {1, 1})
--            // Note: Compute Library supports arbitrary padding for wino kernels
--            // but we only allow small padding to be consistent with oneDNN
--            && (acp.padstride_info.pad().first <= 1) // padding left/right
--            && (acp.padstride_info.pad().second <= 1) // padding top/bottom
--            // only non-dilated convolutions allowed
--            && (acp.dilation_info == arm_compute::Size2D(1, 1));
--
--    ACL_CHECK_SUPPORT(!shape_ok, "shape not supported by winograd kernels");
--
--    // clang-format off
--    // Validate convolution manually to check for return status
--    ACL_CHECK_VALID(arm_compute::NEWinogradConvolutionLayer::validate(
--        &acp.src_info,
--        &acp.wei_info,
--        acp.with_bias ? &acp.bia_info : nullptr,
--        &acp.dst_info,
--        acp.padstride_info,
--        acp.act_info,
--        true)); // enable_fast_math flag in ACL Winograd
--    // clang-format on
--
--    return status::success;
--}
--
- } // namespace acl_convolution_utils
- 
- } // namespace aarch64
-diff --git a/src/cpu/aarch64/acl_convolution_utils.hpp b/src/cpu/aarch64/acl_convolution_utils.hpp
-index 3e56245faf..0398ab06b9 100644
---- a/src/cpu/aarch64/acl_convolution_utils.hpp
-+++ b/src/cpu/aarch64/acl_convolution_utils.hpp
-@@ -66,11 +66,6 @@ status_t init_conf_indirect_gemm(acl_conv_conf_t &acp, memory_desc_t &src_md,
-         memory_desc_t &bias_md, const convolution_desc_t &cd,
-         const primitive_attr_t &attr);
- 
--status_t init_conf_wino(acl_conv_conf_t &acp, memory_desc_t &src_md,
--        memory_desc_t &weights_md, memory_desc_t &dst_md,
--        memory_desc_t &bias_md, const convolution_desc_t &cd,
--        const primitive_attr_t &attr);
--
- } // namespace acl_convolution_utils
- 
- template <typename conv_obj_t, typename conv_pd_t, typename src_data_t,
-diff --git a/src/cpu/aarch64/acl_winograd_convolution.cpp b/src/cpu/aarch64/acl_winograd_convolution.cpp
-deleted file mode 100644
-index b4eb5989fb..0000000000
---- a/src/cpu/aarch64/acl_winograd_convolution.cpp
-+++ /dev/null
-@@ -1,43 +0,0 @@
--/*******************************************************************************
--* Copyright 2020-2022 Arm Ltd. and affiliates
--*
--* Licensed under the Apache License, Version 2.0 (the "License");
--* you may not use this file except in compliance with the License.
--* You may obtain a copy of the License at
--*
--*     http://www.apache.org/licenses/LICENSE-2.0
--*
--* Unless required by applicable law or agreed to in writing, software
--* distributed under the License is distributed on an "AS IS" BASIS,
--* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
--* See the License for the specific language governing permissions and
--* limitations under the License.
--*******************************************************************************/
--
--#include "cpu/aarch64/acl_winograd_convolution.hpp"
--
--namespace dnnl {
--namespace impl {
--namespace cpu {
--namespace aarch64 {
--
--status_t acl_wino_convolution_fwd_t::execute_forward(
--        const exec_ctx_t &ctx) const {
--    // Lock here is needed because resource_mapper does not support
--    // concurrent multithreaded access.
--    std::lock_guard<std::mutex> _lock {this->mtx};
--    // Retrieve primitive resource and configured Compute Library objects
--    auto *acl_resource
--            = ctx.get_resource_mapper()->get<acl_wino_resource_t>(this);
--    acl_obj_t<arm_compute::NEWinogradConvolutionLayer> &acl_wino_obj
--            = acl_resource->get_acl_obj();
--
--    return execute_forward_conv_acl<
--            acl_obj_t<arm_compute::NEWinogradConvolutionLayer>, pd_t, data_t>(
--            ctx, acl_wino_obj, pd());
--}
--
--} // namespace aarch64
--} // namespace cpu
--} // namespace impl
--} // namespace dnnl
-diff --git a/src/cpu/aarch64/acl_winograd_convolution.hpp b/src/cpu/aarch64/acl_winograd_convolution.hpp
-deleted file mode 100644
-index 215635fe3f..0000000000
---- a/src/cpu/aarch64/acl_winograd_convolution.hpp
-+++ /dev/null
-@@ -1,146 +0,0 @@
--/*******************************************************************************
--* Copyright 2020-2022 Arm Ltd. and affiliates
--*
--* Licensed under the Apache License, Version 2.0 (the "License");
--* you may not use this file except in compliance with the License.
--* You may obtain a copy of the License at
--*
--*     http://www.apache.org/licenses/LICENSE-2.0
--*
--* Unless required by applicable law or agreed to in writing, software
--* distributed under the License is distributed on an "AS IS" BASIS,
--* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
--* See the License for the specific language governing permissions and
--* limitations under the License.
--*******************************************************************************/
--
--#ifndef CPU_AARCH64_ACL_WINOGRAD_CONVOLUTION_HPP
--#define CPU_AARCH64_ACL_WINOGRAD_CONVOLUTION_HPP
--
--#include "cpu/cpu_convolution_pd.hpp"
--
--#include "cpu/aarch64/acl_convolution_utils.hpp"
--
--namespace dnnl {
--namespace impl {
--namespace cpu {
--namespace aarch64 {
--
--struct acl_wino_resource_t : public resource_t {
--    acl_wino_resource_t()
--        : acl_wino_obj_(utils::make_unique<
--                acl_obj_t<arm_compute::NEWinogradConvolutionLayer>>()) {}
--
--    status_t configure(const acl_conv_conf_t &acp) {
--        if (!acl_wino_obj_) return status::out_of_memory;
--
--        // Init Compute Library tensors based on info from descriptor
--        acl_wino_obj_->src_tensor.allocator()->init(acp.src_info);
--        acl_wino_obj_->wei_tensor.allocator()->init(acp.wei_info);
--        acl_wino_obj_->dst_tensor.allocator()->init(acp.dst_info);
--        acl_wino_obj_->bia_tensor.allocator()->init(acp.bia_info);
--
--        // clang-format off
--        acl_wino_obj_->conv.configure(
--            &acl_wino_obj_->src_tensor,
--            &acl_wino_obj_->wei_tensor,
--            acp.with_bias ? &acl_wino_obj_->bia_tensor : nullptr,
--            &acl_wino_obj_->dst_tensor,
--            acp.padstride_info,
--            acp.act_info,
--            true); // to support 5x5, 7x7 filter shapes in addition to 3x3
--        // clang-format on
--
--        return status::success;
--    }
--
--    acl_obj_t<arm_compute::NEWinogradConvolutionLayer> &get_acl_obj() const {
--        return *acl_wino_obj_;
--    }
--
--    DNNL_DISALLOW_COPY_AND_ASSIGN(acl_wino_resource_t);
--
--private:
--    std::unique_ptr<acl_obj_t<arm_compute::NEWinogradConvolutionLayer>>
--            acl_wino_obj_;
--}; // acl_wino_resource_t
--
--struct acl_wino_convolution_fwd_t : public primitive_t {
--    struct pd_t : public cpu_convolution_fwd_pd_t {
--        pd_t(const convolution_desc_t *adesc, const primitive_attr_t *attr,
--                const typename pd_t::base_class *hint_fwd_pd)
--            : cpu_convolution_fwd_pd_t(adesc, attr, hint_fwd_pd)
--            , acp_()
--            , post_ops() {}
--
--        DECLARE_COMMON_PD_T(
--                "wino:acl", acl_wino_convolution_fwd_t, USE_GLOBAL_SCRATCHPAD);
--
--        status_t init(engine_t *engine) {
--            bool ok = is_fwd()
--                    && utils::one_of(desc()->alg_kind,
--                            alg_kind::convolution_auto,
--                            alg_kind::convolution_winograd)
--                    && expect_data_types(data_type::f32, data_type::f32,
--                            data_type::f32, data_type::f32, data_type::f32)
--                    && attr()->has_default_values(
--                            primitive_attr_t::skip_mask_t::post_ops,
--                            data_type::f32)
--                    && !has_zero_dim_memory();
--            if (!ok) return status::unimplemented;
--
--            CHECK(acl_convolution_utils::init_conf_wino(acp_, src_md_,
--                    weights_md_, dst_md_, bias_md_, *desc(), *attr()));
--
--            set_default_alg_kind(alg_kind::convolution_winograd);
--
--            CHECK(post_ops.init(
--                    engine, attr_.post_ops_, dst_md_, acp_.act_info));
--            acp_.use_dst_acc = post_ops.has_sum();
--
--            return status::success;
--        }
--
--        acl_conv_conf_t acp_;
--        acl_post_ops_t post_ops;
--    };
--
--    acl_wino_convolution_fwd_t(const pd_t *apd) : primitive_t(apd) {}
--
--    status_t create_resource(
--            engine_t *engine, resource_mapper_t &mapper) const override {
--        if (mapper.has_resource(this)) return status::success;
--
--        auto r = utils::make_unique<acl_wino_resource_t>();
--        if (!r) return status::out_of_memory;
--
--        // Configure the resource based on information from primitive descriptor
--        CHECK(r->configure(pd()->acp_));
--        mapper.add(this, std::move(r));
--
--        CHECK(pd()->post_ops.create_resource(engine, mapper));
--
--        return status::success;
--    }
--
--    ~acl_wino_convolution_fwd_t() {}
--
--    typedef typename prec_traits<data_type::f32>::type data_t;
--
--    status_t execute(const exec_ctx_t &ctx) const override {
--        return execute_forward(ctx);
--    }
--
--private:
--    // To guard the const execute_forward(), the mutex must be 'mutable'
--    mutable std::mutex mtx;
--    status_t execute_forward(const exec_ctx_t &ctx) const;
--    const pd_t *pd() const { return (const pd_t *)primitive_t::pd().get(); }
--}; // acl_wino_convolution_fwd_t
--
--} // namespace aarch64
--} // namespace cpu
--} // namespace impl
--} // namespace dnnl
--
--#endif // CPU_AARCH64_ACL_WINOGRAD_CONVOLUTION_HPP
-diff --git a/src/cpu/cpu_convolution_list.cpp b/src/cpu/cpu_convolution_list.cpp
-index 4142dbc7e7..094c73aa36 100644
---- a/src/cpu/cpu_convolution_list.cpp
-+++ b/src/cpu/cpu_convolution_list.cpp
-@@ -65,7 +65,6 @@ using namespace dnnl::impl::cpu::x64;
- #if DNNL_AARCH64 && DNNL_AARCH64_USE_ACL
- #include "cpu/aarch64/acl_gemm_convolution.hpp"
- #include "cpu/aarch64/acl_indirect_gemm_convolution.hpp"
--#include "cpu/aarch64/acl_winograd_convolution.hpp"
- #endif
- using namespace dnnl::impl::cpu::aarch64;
- #endif
-@@ -100,7 +99,6 @@ const std::map<pk_dt_impl_key_t, std::vector<impl_list_item_t>> &impl_list_map()
-             CPU_INSTANCE_SSE41(jit_sse41_1x1_convolution_fwd_t)
-             CPU_INSTANCE_AVX2(jit_avx2_convolution_fwd_t)
-             CPU_INSTANCE_SSE41(jit_sse41_convolution_fwd_t)
--            CPU_INSTANCE_AARCH64_ACL(acl_wino_convolution_fwd_t)
-             CPU_INSTANCE_AARCH64(jit_sve_512_dw_convolution_fwd_t)
-             CPU_INSTANCE_AARCH64(jit_sve_512_1x1_convolution_fwd_f32_t)
-             CPU_INSTANCE_AARCH64(jit_sve_512_convolution_fwd_t<f32>)
-diff --git a/tests/gtests/test_iface_wino_convolution.cpp b/tests/gtests/test_iface_wino_convolution.cpp
-index 03861b1de4..2235ceae36 100644
---- a/tests/gtests/test_iface_wino_convolution.cpp
-+++ b/tests/gtests/test_iface_wino_convolution.cpp
-@@ -59,9 +59,6 @@ protected:
-         input_f16.wino_supported = is_gpu;
-         input_int8.wino_supported = is_cpu && has_avx512_core;
-         input_f32.backward_supported = is_cpu && impl::dnnl_thr_syncable();
--#elif DNNL_AARCH64 && DNNL_AARCH64_USE_ACL
--        const bool is_cpu = get_test_engine_kind() == engine::kind::cpu;
--        input_f32.wino_supported = is_cpu;
- #endif
- 
- #else
diff --git a/third_party/mkl_dnn/onednn_acl_reorder.patch b/third_party/mkl_dnn/onednn_acl_reorder.patch
index 7241aca4eefc88..5da6756c70a275 100644
--- a/third_party/mkl_dnn/onednn_acl_reorder.patch
+++ b/third_party/mkl_dnn/onednn_acl_reorder.patch
@@ -16,7 +16,7 @@
  *******************************************************************************
 diff --git a/src/cpu/aarch64/acl_reorder.cpp b/src/cpu/aarch64/acl_reorder.cpp
 new file mode 100644
-index 0000000000..061751b555
+index 000000000..061751b55
 --- /dev/null
 +++ b/src/cpu/aarch64/acl_reorder.cpp
 @@ -0,0 +1,52 @@
@@ -341,10 +341,10 @@ index 0000000000..edbc38914d
 +
 +#endif // CPU_AARCH64_ACL_REORDER_HPP
 diff --git a/src/cpu/reorder/cpu_reorder_regular_f32_f32.cpp b/src/cpu/reorder/cpu_reorder_regular_f32_f32.cpp
-index bccd2f75f4..5e5ea331ba 100644
+index a4150b619..f4d6b4de3 100644
 --- a/src/cpu/reorder/cpu_reorder_regular_f32_f32.cpp
 +++ b/src/cpu/reorder/cpu_reorder_regular_f32_f32.cpp
-@@ -15,6 +15,7 @@
+@@ -16,6 +16,7 @@
  *******************************************************************************/
  
  #include "cpu/reorder/cpu_reorder.hpp"
@@ -352,19 +352,20 @@ index bccd2f75f4..5e5ea331ba 100644
  
  namespace dnnl {
  namespace impl {
-@@ -27,6 +28,7 @@ const impl_list_map_t &regular_f32_f32_impl_list_map() {
+@@ -28,6 +29,7 @@ const impl_list_map_t &regular_f32_f32_impl_list_map() {
          // f32 -> f32
          {{f32, f32, 0}, {
              REG_FAST_DIRECT_COPY_F32_F32
 +            DNNL_AARCH64_ONLY(CPU_REORDER_INSTANCE(aarch64::acl_reorder_fwd_t))
  
+             DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64::brgemm_matmul_matrix_B_reorder_t))
              DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64::jit_blk_reorder_t))
-             DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64::jit_uni_reorder_t))
-@@ -64,6 +66,7 @@ const impl_list_map_t &regular_f32_f32_impl_list_map() {
+@@ -69,6 +71,8 @@ const impl_list_map_t &regular_f32_f32_impl_list_map() {
              nullptr,
          }},
          {{f32, f32, 4}, {
++
 +            DNNL_AARCH64_ONLY(CPU_REORDER_INSTANCE(aarch64::acl_reorder_fwd_t))
-             DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64::wino_reorder_t<f32, f32>))
- 
              CPU_REORDER_INSTANCE(rnn_weights_reorder_t<f32, f32>)
+ 
+             REG_FAST_DIRECT_COPY_F32_F32
diff --git a/third_party/mkl_dnn/onednn_acl_reorder_padded.patch b/third_party/mkl_dnn/onednn_acl_reorder_padded.patch
deleted file mode 100644
index f290f21ec87e9b..00000000000000
--- a/third_party/mkl_dnn/onednn_acl_reorder_padded.patch
+++ /dev/null
@@ -1,858 +0,0 @@
- *******************************************************************************
- Copyright 2022 Arm Limited and affiliates.
- SPDX-License-Identifier: Apache-2.0
-
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
-
-     http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
- *******************************************************************************
-
-diff --git a/src/cpu/aarch64/jit_uni_reorder.cpp b/src/cpu/aarch64/jit_uni_reorder.cpp
-index 24d6220cf..a6cefaa20 100644
---- a/src/cpu/aarch64/jit_uni_reorder.cpp
-+++ b/src/cpu/aarch64/jit_uni_reorder.cpp
-@@ -1,6 +1,7 @@
- /*******************************************************************************
- * Copyright 2018-2021 Intel Corporation
- * Copyright 2020-2021 FUJITSU LIMITED
-+* Copyright 2022 Arm Ltd. and affiliates
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
-@@ -54,6 +55,35 @@ namespace aarch64 {
- 
- namespace tr {
- 
-+static bool prb_has_small_strides(const prb_t &prb) {
-+    constexpr ptrdiff_t max_stride = (1LL << 31) - 1;
-+    for (int d = 0; d < prb.ndims; ++d) {
-+        const ptrdiff_t cms = max_stride / prb.nodes[d].n;
-+        const bool small_strides = true
-+                && prb.nodes[d].is < cms / (int)data_type_size(prb.itype)
-+                && prb.nodes[d].os < cms / (int)data_type_size(prb.otype);
-+        if (!small_strides) return false;
-+    }
-+    return true;
-+}
-+
-+static bool prb_tail_friendly(const prb_t &prb) {
-+    /* find optimal ndims to makes it easier to
-+     * identify the blk_chunk in the loop*/
-+    int ndims = prb.full_ndims - prb.ndims;
-+
-+    int n = prb.nodes[0].is;
-+    for (int d = 1; d < prb.ndims; ++d) {
-+        if (d != prb.blk_chunk_idx) n *= prb.nodes[d].n;
-+    }
-+    if (prb.ip_tail > 0
-+            && ((ndims == 0 && n != 1)
-+                    || (ndims > 0 && prb.ndims > prb.blk_chunk_idx)))
-+        return false;
-+
-+    return true;
-+}
-+
- /** Minimal reasonable/desirable kernel size.
-  * The constant might be used to determine how a problem should be split
-  * between kernel and threading driver. */
-@@ -121,18 +151,10 @@ struct jit_uni_reorder_kernel_f32_t : public kernel_t, public jit_generator {
-                 && utils::one_of(p.otype, f32, s32, data_type::s8, u8)
-                 && utils::everyone_is(0, p.ioff, p.ooff) /* do we need this? */
-                 && utils::one_of(p.beta, 0.f, 1.f) /* anything else? */
--                && simple_impl_desc_init(p, nullptr);
-+                && simple_impl_desc_init(p, nullptr) && prb_has_small_strides(p)
-+                && prb_tail_friendly(p);
-         if (!ok) return false;
- 
--        const ptrdiff_t max_stride = (1LL << 31) - 1;
--        for (int d = 0; d < p.ndims; ++d) {
--            const ptrdiff_t cms = max_stride / p.nodes[d].n;
--            bool strides_ok = true
--                    && p.nodes[d].is < cms / (int)data_type_size(p.itype)
--                    && p.nodes[d].os < cms / (int)data_type_size(p.otype);
--            if (!strides_ok) return false;
--        }
--
-         return true;
-     }
- 
-@@ -153,6 +175,13 @@ struct jit_uni_reorder_kernel_f32_t : public kernel_t, public jit_generator {
-         return (int)prb_.nodes[d].ss;
-     }
- 
-+    int blk_cnt() {
-+        assert(prb_.blk_chunk_idx < prb_.full_ndims);
-+        return (int)prb_.nodes[prb_.blk_chunk_idx].n - 1;
-+    }
-+    int op_padding() { return prb_.op_tail ? prb_.iblock - prb_.op_tail : 0; }
-+    int ip_padding() { return prb_.ip_tail ? prb_.oblock - prb_.ip_tail : 0; }
-+
-     void step(int off, int prev_i_off, int prev_o_off, int prev_s_off,
-             int &i_off, int &o_off, int &s_off, int step_size = 1) {
-         i_off = prev_i_off;
-@@ -385,6 +414,7 @@ struct jit_uni_reorder_kernel_f32_t : public kernel_t, public jit_generator {
-                                 prb_.otype, u8, data_type::s8, s32, f32)))
-                 && utils::everyone_is(8, n(0), n(1))
-                 && utils::everyone_is(1, os(0), is(1))
-+                && utils::everyone_is(0, prb_.ip_tail, prb_.op_tail)
-                 && prb_.scale_type == scale_type_t::NONE && prb_.beta == 0.f;
-     }
- 
-@@ -405,17 +435,14 @@ struct jit_uni_reorder_kernel_f32_t : public kernel_t, public jit_generator {
-     bool process_direct_copy(int len) {
-         using namespace data_type;
- 
--        const int simd_w = cpu_isa_traits<isa>::vlen == 16
--                ? cpu_isa_traits<isa>::vlen / itype_sz /* use 128-bit VReg */
--                : cpu_isa_traits<isa>::vlen / itype_sz
--                        / 2; /* use lower half of 512-bit ZReg */
--
-+        const int simd_w = cpu_isa_traits<isa>::vlen / itype_sz;
-         bool can_do = true && mayiuse(isa)
-                 && utils::everyone_is(1, os(0), is(0))
-                 && (false || prb_.itype == prb_.otype
-                         || (prb_.itype == s32 && prb_.otype == f32)
-                         || (prb_.itype == f32 && prb_.otype == s32))
-                 && len % simd_w == 0 && n(0) % len == 0
-+                && prb_.ip_tail % simd_w == 0 && prb_.op_tail % simd_w == 0
-                 && prb_.scale_type == scale_type_t::NONE && prb_.beta == 0.f;
-         if (!can_do) return false;
- 
-@@ -511,7 +538,8 @@ struct jit_uni_reorder_kernel_f32_t : public kernel_t, public jit_generator {
-     }
- 
-     void process_unroll_generic_step(int reg_unroll, const int *i_off,
--            const int *o_off, const int *s_off) {
-+            const int *o_off, const int *s_off, const int *ip_padding,
-+            const bool h_padded) {
-         using namespace data_type;
- 
-         auto cvt2ps
-@@ -571,6 +599,8 @@ struct jit_uni_reorder_kernel_f32_t : public kernel_t, public jit_generator {
-         for (int ur = 1; ur < reg_unroll; ++ur)
-             if (o_off[ur] != o_off[ur - 1] + 1) can_store_xmm = false;
-         const int ur_step = can_store_xmm ? 4 : 1;
-+        const int load_tail_step
-+                = !can_load_xmm && can_store_xmm ? ur_step : load_step;
- 
-         const bool interim_f32 = false
-                 || utils::one_of(f32, prb_.itype, prb_.otype)
-@@ -579,55 +609,85 @@ struct jit_uni_reorder_kernel_f32_t : public kernel_t, public jit_generator {
-         const bool need_saturation
-                 = (utils::one_of(prb_.otype, u8, data_type::s8, s32)
-                         && interim_f32);
--
--        if (!can_load_xmm && can_store_xmm) {
--            assert(ur_step == 4);
--            /* load with stride */
--            for (int ur = 0; ur < reg_unroll; ur += ur_step) {
--
-+        if (h_padded) {
-+            for (int ur = 0; ur < reg_unroll; ur += load_tail_step) {
-+                if (itype_sz == 4)
-+                    movi(VReg4S(ur), 0);
-+                else if (itype_sz == 2)
-+                    movi(VReg8H(ur), 0);
-+                else
-+                    movi(VReg16B(ur), 0);
-                 /* x_tmp_vec = X_TMP_0 - X_TMP_4
-                  Do not use X_TMP_? as the last arg. */
--                for (int r = 0; r < ur_step; ++r) {
--                    add_imm(x_tmp_vec[r], x_ptr_in_off,
--                            i_off[ur + r] * itype_sz, X_DEFAULT_ADDR);
-+                for (int r = 0; r < load_tail_step; ++r) {
-+                    if (ip_padding[ur + r] == 0) {
-+                        add_imm(x_tmp_vec[r], x_ptr_in_off,
-+                                i_off[ur + r] * itype_sz, X_DEFAULT_ADDR);
-+                    }
-                 }
- 
--                for (int r = 0; r < ur_step; ++r) {
--                    if (itype_sz == 4)
--                        ld1(VReg4S(ur)[r], ptr(x_tmp_vec[r]));
--                    else if (itype_sz == 2)
--                        ld1(VReg8H(ur)[r], ptr(x_tmp_vec[r]));
--                    else
--                        ld1(VReg16B(ur)[r], ptr(x_tmp_vec[r]));
-+                for (int r = 0; r < load_tail_step; ++r) {
-+                    if (ip_padding[ur + r] == 0) {
-+                        if (itype_sz == 4)
-+                            ld1(VReg4S(ur)[r], ptr(x_tmp_vec[r]));
-+                        else if (itype_sz == 2)
-+                            ld1(VReg8H(ur)[r], ptr(x_tmp_vec[r]));
-+                        else
-+                            ld1(VReg16B(ur)[r], ptr(x_tmp_vec[r]));
-+                    }
-                 }
-             }
-         } else {
--            int ur = 0;
--            int tmp_ur = 0;
--            while (ur < reg_unroll) {
--                int count = 0;
-+            if (!can_load_xmm && can_store_xmm) {
-+                assert(ur_step == 4);
-+                /* load with stride */
-+                for (int ur = 0; ur < reg_unroll; ur += ur_step) {
- 
--                do {
--                    add_imm(x_tmp_vec[count++], x_ptr_in_off,
--                            i_off[ur] * itype_sz, X_DEFAULT_ADDR);
--                    ur += load_step;
--                } while (ur < reg_unroll && count < x_tmp_vec_size);
-+                    /* x_tmp_vec = X_TMP_0 - X_TMP_4
-+                 Do not use X_TMP_? as the last arg. */
-+                    for (int r = 0; r < ur_step; ++r) {
-+                        add_imm(x_tmp_vec[r], x_ptr_in_off,
-+                                i_off[ur + r] * itype_sz, X_DEFAULT_ADDR);
-+                    }
- 
--                for (int i = 0; i < count; i++) {
-+                    for (int r = 0; r < ur_step; ++r) {
-+                        if (itype_sz == 4)
-+                            ld1(VReg4S(ur)[r], ptr(x_tmp_vec[r]));
-+                        else if (itype_sz == 2)
-+                            ld1(VReg8H(ur)[r], ptr(x_tmp_vec[r]));
-+                        else
-+                            ld1(VReg16B(ur)[r], ptr(x_tmp_vec[r]));
-+                    }
-+                }
-+            } else {
-+                int ur = 0;
-+                int tmp_ur = 0;
-+                while (ur < reg_unroll) {
-+                    int count = 0;
-+
-+                    do {
-+                        add_imm(x_tmp_vec[count++], x_ptr_in_off,
-+                                i_off[ur] * itype_sz, X_DEFAULT_ADDR);
-+                        ur += load_step;
-+                    } while (ur < reg_unroll && count < x_tmp_vec_size);
-+
-+                    for (int i = 0; i < count; i++) {
- 
--                    switch (load_step * itype_sz) {
--                        case 16: ldr(QReg(tmp_ur), ptr(x_tmp_vec[i])); break;
--                        case 8: ldr(DReg(tmp_ur), ptr(x_tmp_vec[i])); break;
--                        case 4: ldr(SReg(tmp_ur), ptr(x_tmp_vec[i])); break;
--                        case 2: ldr(HReg(tmp_ur), ptr(x_tmp_vec[i])); break;
--                        case 1: ldr(BReg(tmp_ur), ptr(x_tmp_vec[i])); break;
--                        default: assert(!"unreachable");
-+                        switch (load_step * itype_sz) {
-+                            case 16:
-+                                ldr(QReg(tmp_ur), ptr(x_tmp_vec[i]));
-+                                break;
-+                            case 8: ldr(DReg(tmp_ur), ptr(x_tmp_vec[i])); break;
-+                            case 4: ldr(SReg(tmp_ur), ptr(x_tmp_vec[i])); break;
-+                            case 2: ldr(HReg(tmp_ur), ptr(x_tmp_vec[i])); break;
-+                            case 1: ldr(BReg(tmp_ur), ptr(x_tmp_vec[i])); break;
-+                            default: assert(!"unreachable");
-+                        }
-+                        tmp_ur += load_step;
-                     }
--                    tmp_ur += load_step;
-                 }
-             }
-         }
--
-         /* xmm[:] <-- (f32)xmm[:] */
-         if (interim_f32) {
-             const int cvt_step = nstl::max(load_step, ur_step);
-@@ -708,7 +768,8 @@ struct jit_uni_reorder_kernel_f32_t : public kernel_t, public jit_generator {
-                         if (s_off[r] != s_off[r - 1] + 0)
-                             scale_load_type = scale_load_type_t::load;
- 
--                    if (scale_load_type == scale_load_type_t::bcast) {
-+                    if (scale_load_type == scale_load_type_t::bcast
-+                            && !h_padded) {
-                         VReg4S v(xmm_scale.getIdx());
-                         VReg4S v_dst(ur);
-                         add_imm(X_TMP_0, x_ptr_scale_off, s_off[ur] * stype_sz,
-@@ -724,7 +785,8 @@ struct jit_uni_reorder_kernel_f32_t : public kernel_t, public jit_generator {
-                         if (s_off[r] != s_off[r - 1] + 1)
-                             scale_load_type = scale_load_type_t::gather;
- 
--                    if (scale_load_type == scale_load_type_t::load) {
-+                    if (scale_load_type == scale_load_type_t::load
-+                            && !h_padded) {
-                         uint32_t idx = xmm_scale.getIdx();
-                         VReg4S v_dst(ur);
-                         add_imm(X_TMP_0, x_ptr_scale_off, s_off[ur] * stype_sz,
-@@ -739,14 +801,18 @@ struct jit_uni_reorder_kernel_f32_t : public kernel_t, public jit_generator {
-                     // so gather the scale factors one by one
-                     /*ur_step is 1 or 4. */
-                     for (int r = ur; r < ur + ur_step; ++r) {
--                        /* x_tmp_vec = X_TMP_0 - X_TMP_4
-+                        if (ip_padding[r] == 0 || !h_padded) {
-+                            /* x_tmp_vec = X_TMP_0 - X_TMP_4
-                          Do not use X_TMP_? as the last arg. */
--                        add_imm(x_tmp_vec[r - ur], x_ptr_scale_off,
--                                s_off[r] * stype_sz, X_DEFAULT_ADDR);
-+                            add_imm(x_tmp_vec[r - ur], x_ptr_scale_off,
-+                                    s_off[r] * stype_sz, X_DEFAULT_ADDR);
-+                        }
-                     }
-                     for (int r = ur; r < ur + ur_step; ++r) {
--                        VReg4S v(xmm_scale.getIdx());
--                        ld1(v[r - ur], ptr(x_tmp_vec[r - ur]));
-+                        if (ip_padding[r] == 0 || !h_padded) {
-+                            VReg4S v(xmm_scale.getIdx());
-+                            ld1(v[r - ur], ptr(x_tmp_vec[r - ur]));
-+                        }
-                     }
-                     fmul(VReg4S(ur), VReg4S(ur), xmm_scale);
-                 }
-@@ -925,7 +991,15 @@ struct jit_uni_reorder_kernel_f32_t : public kernel_t, public jit_generator {
-         }
-     }
- 
--    void process_unroll_generic(int len) {
-+    void comp_padding_flag(int ndims, int off, int len, int &i_tail) {
-+        const int ip_without_padding
-+                = ndims == 0 ? len - ip_padding() : prb_.ip_tail;
-+        if ((ndims == 0 && off >= ip_without_padding)
-+                || (ndims > 0 && (off % prb_.oblock) >= ip_without_padding))
-+            i_tail = 1;
-+    }
-+
-+    void process_unroll_generic(const int ndims, int len, const bool h_padded) {
-         const int blk = 8;
- 
-         int i_off[2 * blk] = {0};
-@@ -936,22 +1010,37 @@ struct jit_uni_reorder_kernel_f32_t : public kernel_t, public jit_generator {
- 
-         for (int off = 0; off < len; off += blk) {
-             const int reg_unroll = nstl::min(off + blk, len) - off;
-+            int ip_padding[blk] = {0};
- 
--            /* compute offsets */
-+            /* compute offsets and tail*/
-             for (int ur = off != 0 ? 0 : 1; ur < reg_unroll; ++ur) {
-                 const int ur_c = curr * blk + ur;
-                 const int ur_p = (ur_c - 1 + 2 * blk) % (2 * blk); // prev ur
-                 step(off + ur, i_off[ur_p], o_off[ur_p], s_off[ur_p],
-                         i_off[ur_c], o_off[ur_c], s_off[ur_c]);
-+                if (h_padded)
-+                    comp_padding_flag(ndims, off + ur, len, ip_padding[ur]);
-             }
--
-             process_unroll_generic_step(reg_unroll, i_off + curr * blk,
--                    o_off + curr * blk, s_off + curr * blk);
-+                    o_off + curr * blk, s_off + curr * blk, ip_padding,
-+                    h_padded);
- 
-             curr = 1 - curr;
-         }
-     }
- 
-+    void compute_ker(
-+            const int ndims, const int len_unroll, const bool h_padded) {
-+        bool optimized = false;
-+        optimized = optimized
-+                || (process_direct_copy<sve_256>(len_unroll) && !h_padded);
-+        optimized = optimized
-+                || (process_direct_copy<asimd>(len_unroll) && !h_padded);
-+        optimized
-+                = optimized || (process_unroll_tr8x8(len_unroll) && !h_padded);
-+        if (!optimized) process_unroll_generic(ndims, len_unroll, h_padded);
-+    }
-+
-     void loop_begin(Label &l, XReg reg_cnt, int len) {
-         mov(reg_cnt, len);
-         L(l);
-@@ -985,6 +1074,28 @@ struct jit_uni_reorder_kernel_f32_t : public kernel_t, public jit_generator {
-         }
-     }
- 
-+    void compute_blk_ker(const int len_unroll) {
-+        int omp_ndims = prb_.full_ndims - prb_.ndims;
-+        Label no_last_blk, end_label;
-+
-+        if (prb_.ip_tail > 0 && prb_.op_tail == 0) {
-+            if (omp_ndims == 0) {
-+                cmp(reg_last_loop_cnt, 1);
-+                bne(no_last_blk);
-+                compute_ker(omp_ndims, len_unroll, true);
-+            } else {
-+                cmp(reg_blk_chunks, blk_cnt());
-+                bne(no_last_blk);
-+                compute_ker(omp_ndims, len_unroll, true);
-+            }
-+            b(end_label);
-+        }
-+
-+        L(no_last_blk);
-+        compute_ker(omp_ndims, len_unroll, false);
-+        L(end_label);
-+    }
-+
-     bool simple_impl() {
-         simple_impl_desc_t d;
-         if (!simple_impl_desc_init(prb_, &d)) return false;
-@@ -1013,11 +1124,7 @@ struct jit_uni_reorder_kernel_f32_t : public kernel_t, public jit_generator {
-         if (n_jit_loops > 0)
-             loop_begin(l_loop[0], reg_cnt[0], n(nfu + 0) / ldu);
- 
--        bool optimized = false;
--        optimized = optimized || process_direct_copy<sve_512>(d.len_unroll);
--        optimized = optimized || process_direct_copy<asimd>(d.len_unroll);
--        optimized = optimized || process_unroll_tr8x8(d.len_unroll);
--        if (!optimized) process_unroll_generic(d.len_unroll);
-+        compute_blk_ker(d.len_unroll);
- 
-         if (n_jit_loops > 0)
-             loop_end(l_loop[0], reg_cnt[0], n(nfu + 0) / ldu, is(nfu + 0) * ldu,
-@@ -1236,9 +1343,13 @@ struct jit_uni_reorder_kernel_f32_t : public kernel_t, public jit_generator {
-         }
-         add_imm(X_TMP_0, abi_param1, PARAM(in), X_TMP_2);
-         add_imm(X_TMP_1, abi_param1, PARAM(out), X_TMP_2);
-+        add_imm(reg_blk, abi_param1, PARAM(blk_chunks), reg_blk);
-         ldr(reg_ptr_in, ptr(X_TMP_0));
-         ldr(reg_ptr_out, ptr(X_TMP_1));
-+        ldr(reg_blk_chunks, ptr(reg_blk));
-+
- #undef PARAM
-+        mov_imm(reg_last_loop_cnt, 1);
- 
-         mov(x_ptr_in_off, XReg(reg_ptr_in.getIdx()));
-         mov(x_ptr_out_off, XReg(reg_ptr_out.getIdx()));
-@@ -1282,6 +1393,10 @@ private:
-     XReg reg_off_out = x9;
-     XReg reg_off_scale = x10;
- 
-+    XReg reg_blk = x11;
-+    XReg reg_blk_chunks = x12;
-+    XReg reg_last_loop_cnt = x11;
-+
-     XReg reg_tmp = x0;
- 
-     VReg4S xmm_scale = v15.s;
-@@ -1416,10 +1531,16 @@ static void prb_thread_kernel_balance(
-     for (int d = 0; d < prb.ndims; ++d)
-         sz_total *= prb.nodes[d].n;
- 
-+    /* The general expression for sz_drv_thr can be written as
-+     * sz_drv_min = C0 + FC * (nthr > 1 ? 1 : 0) + VC * (nthr - 1)
-+     * where FC and VC are fixed and variable costs respectively.
-+     * Though for now, the below heuristic seems to be good enough */
-+    const size_t sz_drv_thr = (nthr > 1) ? 16 * nthr : 1;
-+
-     /* sz_drv_min is the minimal size for the parallel
-      * driver required for good parallelization */
-     const size_t sz_drv_min
--            = nstl::min<size_t>(16 * nthr, utils::div_up(sz_total, 1024));
-+            = nstl::min<size_t>(sz_drv_thr, utils::div_up(sz_total, 1024));
- 
-     /* kdims -- # of dimensions processed by a kernel
-      * sz_ker_cur -- product of the dimension processed by a kernel
-@@ -1440,7 +1561,8 @@ static void prb_thread_kernel_balance(
-      * (less than tr::ker_prb_size_min). In that case try to split the
-      * innermost driver dimension into two, to increase sz_ker_cur. */
-     bool want_borrow_ker_from_drv = true && kdims < prb.ndims
--            && sz_ker_cur < tr::ker_prb_size_min && sz_drv_cur > sz_drv_min;
-+            && sz_ker_cur < tr::ker_prb_size_min && sz_drv_cur > sz_drv_min
-+            && kdims != prb.blk_chunk_idx;
-     if (want_borrow_ker_from_drv) {
-         /* sz_want_borrow is the minimal sz, so that:
-          *  o) sz_ker_cur * sz_want_borrow >= tr::ker_prb_size_min
-@@ -1464,7 +1586,7 @@ static void prb_thread_kernel_balance(
-      * try to split the outermost kernel dimension into two, to increase
-      * sz_drv_cur. */
-     bool want_borrow_drv_from_ker = true && sz_ker_cur > tr::ker_prb_size_min
--            && sz_drv_cur < sz_drv_min;
-+            && sz_drv_cur < sz_drv_min && kdims != prb.blk_chunk_idx;
-     if (want_borrow_drv_from_ker) {
-         size_t sz_want_borrow = utils::div_up(sz_drv_min, sz_drv_cur);
-         for (; prb.nodes[kdims - 1].n % sz_want_borrow; ++sz_want_borrow)
-@@ -1518,6 +1640,8 @@ status_t jit_uni_reorder_t::pd_t::create(reorder_pd_t **reorder_pd,
-         prb_dump(prb);
-     });
- 
-+    CHECK(prb_check_blk(prb, *dst_md));
-+
-     int ndims_ker_max;
-     int nthr = dnnl_get_max_threads();
-     prb_thread_kernel_balance(prb, ndims_ker_max, nthr);
-@@ -1552,7 +1676,7 @@ status_t jit_uni_reorder_t::pd_t::create(reorder_pd_t **reorder_pd,
- 
- void jit_uni_reorder_t::omp_driver_0d(
-         int off, const char *in, char *out, const float *scale) const {
--    tr::call_param_t c {in, out, scale};
-+    tr::call_param_t c {in, out, scale, 0};
-     (*kernel_)(&c);
- }
- 
-@@ -1564,6 +1688,7 @@ void jit_uni_reorder_t::omp_driver_1d(int ithr, int nthr, int off,
-         c.in = in + d0 * ns[0].is * data_type_size(pd()->prb_.itype);
-         c.out = out + d0 * ns[0].os * data_type_size(pd()->prb_.otype);
-         c.scale = scale + d0 * ns[0].ss;
-+        c.blk_chunks = d0;
-         (*kernel_)(&c);
-     });
- }
-@@ -1571,6 +1696,7 @@ void jit_uni_reorder_t::omp_driver_1d(int ithr, int nthr, int off,
- void jit_uni_reorder_t::omp_driver_2d(int ithr, int nthr, int off,
-         const char *in, char *out, const float *scale) const {
-     const tr::node_t *ns = pd()->prb_.nodes + off;
-+    const int blk_idx_off = pd()->prb_.blk_chunk_idx - off;
-     for_nd(ithr, nthr, (ptrdiff_t)ns[1].n, (ptrdiff_t)ns[0].n,
-             [&](ptrdiff_t d1, ptrdiff_t d0) {
-                 auto c = tr::call_param_t();
-@@ -1581,6 +1707,7 @@ void jit_uni_reorder_t::omp_driver_2d(int ithr, int nthr, int off,
-                         + (d0 * ns[0].os + d1 * ns[1].os)
-                                 * data_type_size(pd()->prb_.otype);
-                 c.scale = scale + d0 * ns[0].ss + d1 * ns[1].ss;
-+                c.blk_chunks = utils::pick(blk_idx_off, d0, d1);
-                 (*kernel_)(&c);
-             });
- }
-@@ -1588,6 +1715,7 @@ void jit_uni_reorder_t::omp_driver_2d(int ithr, int nthr, int off,
- void jit_uni_reorder_t::omp_driver_3d(int ithr, int nthr, int off,
-         const char *in, char *out, const float *scale) const {
-     const tr::node_t *ns = pd()->prb_.nodes + off;
-+    const int blk_idx_off = pd()->prb_.blk_chunk_idx - off;
-     for_nd(ithr, nthr, (ptrdiff_t)ns[2].n, (ptrdiff_t)ns[1].n,
-             (ptrdiff_t)ns[0].n, [&](ptrdiff_t d2, ptrdiff_t d1, ptrdiff_t d0) {
-                 auto c = tr::call_param_t();
-@@ -1598,6 +1726,7 @@ void jit_uni_reorder_t::omp_driver_3d(int ithr, int nthr, int off,
-                         + (d0 * ns[0].os + d1 * ns[1].os + d2 * ns[2].os)
-                                 * data_type_size(pd()->prb_.otype);
-                 c.scale = scale + d0 * ns[0].ss + d1 * ns[1].ss + d2 * ns[2].ss;
-+                c.blk_chunks = utils::pick(blk_idx_off, d0, d1, d2);
-                 (*kernel_)(&c);
-             });
- }
-@@ -1605,6 +1734,7 @@ void jit_uni_reorder_t::omp_driver_3d(int ithr, int nthr, int off,
- void jit_uni_reorder_t::omp_driver_4d(int ithr, int nthr, int off,
-         const char *in, char *out, const float *scale) const {
-     const tr::node_t *ns = pd()->prb_.nodes + off;
-+    const int blk_idx_off = pd()->prb_.blk_chunk_idx - off;
-     for_nd(ithr, nthr, (ptrdiff_t)ns[3].n, (ptrdiff_t)ns[2].n,
-             (ptrdiff_t)ns[1].n, (ptrdiff_t)ns[0].n,
-             [&](ptrdiff_t d3, ptrdiff_t d2, ptrdiff_t d1, ptrdiff_t d0) {
-@@ -1619,6 +1749,7 @@ void jit_uni_reorder_t::omp_driver_4d(int ithr, int nthr, int off,
-                                 * data_type_size(pd()->prb_.otype);
-                 c.scale = scale + d0 * ns[0].ss + d1 * ns[1].ss + d2 * ns[2].ss
-                         + d3 * ns[3].ss;
-+                c.blk_chunks = utils::pick(blk_idx_off, d0, d1, d2, d3);
-                 (*kernel_)(&c);
-             });
- }
-diff --git a/src/cpu/aarch64/jit_uni_reorder.hpp b/src/cpu/aarch64/jit_uni_reorder.hpp
-index 88762756c..2fb6f0f89 100644
---- a/src/cpu/aarch64/jit_uni_reorder.hpp
-+++ b/src/cpu/aarch64/jit_uni_reorder.hpp
-@@ -1,6 +1,7 @@
- /*******************************************************************************
- * Copyright 2018-2020 Intel Corporation
- * Copyright 2020 FUJITSU LIMITED
-+* Copyright 2022 Arm Ltd. and affiliates
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
-@@ -52,11 +53,19 @@ struct prb_t {
-     ptrdiff_t ooff;
-     scale_type_t scale_type;
-     float beta;
-+    int full_ndims;
-+    int ip_tail;
-+    int op_tail;
-+    int iblock;
-+    int oblock;
-+    int blk_chunk_idx;
- };
- 
- status_t prb_init(prb_t &prb, const memory_desc_t &imd,
-         const memory_desc_t &omd, const primitive_attr_t *attr);
- 
-+status_t prb_check_blk(prb_t &prb, const memory_desc_t &imd);
-+
- /** sorts the problem nodes so that output strides come in ascending order */
- void prb_normalize(prb_t &p);
- 
-@@ -82,6 +91,7 @@ struct call_param_t {
-     const void *in;
-     void *out;
-     const float *scale;
-+    size_t blk_chunks;
- };
- 
- struct kernel_t {
-diff --git a/src/cpu/aarch64/jit_uni_reorder_utils.cpp b/src/cpu/aarch64/jit_uni_reorder_utils.cpp
-index 3d6e424e3..7123811f8 100644
---- a/src/cpu/aarch64/jit_uni_reorder_utils.cpp
-+++ b/src/cpu/aarch64/jit_uni_reorder_utils.cpp
-@@ -1,6 +1,7 @@
- /*******************************************************************************
- * Copyright 2018-2021 Intel Corporation
- * Copyright 2020 FUJITSU LIMITED
-+* Copyright 2022 Arm Ltd. and affiliates
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
-@@ -15,7 +16,8 @@
- * limitations under the License.
- *******************************************************************************/
- 
--#include <assert.h>
-+#include <cassert>
-+#include <set>
- 
- #include "common/c_types_map.hpp"
- #include "common/dnnl_thread.hpp"
-@@ -46,8 +48,65 @@ struct layout_desc_t {
-     strides_t strides;
- };
- 
--status_t cvt_mem_desc_to_layout_desc(
--        const memory_desc_t &md_, layout_desc_t &ld, const dims_t &blocks) {
-+static status_t compute_blk_and_tail(
-+        const memory_desc_t &md_, const int idx, int &blk, int &tail) {
-+    const auto md = memory_desc_wrapper(md_);
-+    const auto &bd = md.blocking_desc();
-+    if (tail == 0) return status::success;
-+
-+    const std::set<dim_t> unique_inner_idxs(
-+            bd.inner_idxs, bd.inner_idxs + bd.inner_nblks);
-+    std::set<dim_t> dims_with_multiple_blks;
-+    for (dim_t dim : unique_inner_idxs) {
-+        if (std::count(bd.inner_idxs, bd.inner_idxs + bd.inner_nblks, dim) > 1)
-+            dims_with_multiple_blks.insert(dim);
-+    }
-+
-+    // Dims that have a tail and have multiple blocks are not supported by the jit kernel yet.
-+    // For example:
-+    // src_tag = abcd
-+    // dst_tag = ABcd16b16a4b
-+    // 16x15x3x3
-+    // In this case, 'b' dim has two blocks and has a tail. It is not a supported case.
-+    if (dims_with_multiple_blks.find(idx) != dims_with_multiple_blks.end())
-+        return status::unimplemented;
-+
-+    // Only supports inconsistent padding in single and double blocks
-+    // and the total block size <= 256
-+    for (int iblk = bd.inner_nblks - 1; iblk > 0; --iblk) {
-+        if (bd.inner_idxs[iblk] == idx) break;
-+        blk *= bd.inner_blks[iblk];
-+        tail *= bd.inner_blks[iblk];
-+    }
-+    if (unique_inner_idxs.size() > 2 || blk > 256) return status::unimplemented;
-+
-+    return status::success;
-+}
-+
-+static status_t compute_chunk_idx(const prb_t &p, const memory_desc_t &imd_,
-+        const memory_desc_t &omd_, const int blk_idx, int &chunk_idx) {
-+    const auto imd = memory_desc_wrapper(imd_);
-+    const auto omd = memory_desc_wrapper(omd_);
-+    const auto &ibd = imd.blocking_desc();
-+    const auto &obd = omd.blocking_desc();
-+    if (p.ip_tail == 0 && p.op_tail == 0) return status::success;
-+
-+    const ptrdiff_t is
-+            = ibd.strides[blk_idx] * obd.inner_blks[obd.inner_idxs[blk_idx]];
-+    const ptrdiff_t os = obd.strides[blk_idx];
-+
-+    for (int i = blk_idx; i < omd.ndims(); ++i) {
-+        if (p.nodes[i].os == os && p.nodes[i].is == is) {
-+            chunk_idx = i;
-+            return status::success;
-+        }
-+    }
-+
-+    return status::invalid_arguments;
-+}
-+
-+status_t cvt_mem_desc_to_layout_desc(const memory_desc_t &md_,
-+        layout_desc_t &ld, const dims_t &blocks, const dims_t &ext_padding) {
-     const auto md = memory_desc_wrapper(md_);
- 
-     bool ok = true && md.is_blocking_desc() && md.extra().flags == 0;
-@@ -75,7 +134,7 @@ status_t cvt_mem_desc_to_layout_desc(
-                 stride *= bd.inner_blks[iblk];
-             }
-         }
--        P(d, md.padded_dims()[d] / blocks[d], bd.strides[d]);
-+        P(d, (md.padded_dims()[d] + ext_padding[d]) / blocks[d], bd.strides[d]);
- 
-         // TODO: NOW: revisit, do we need a reverse?
-         // TODO: NOW: consider using strides instead of block sizes in md
-@@ -98,7 +157,8 @@ status_t prb_init(prb_t &p, const memory_desc_t &imd, const memory_desc_t &omd,
- 
-     auto check_post_ops = [](const primitive_attr_t *attr) {
-         const auto &po = attr->post_ops_;
--        return po.len() == 0 || (po.len() == 1 && po.entry_[0].is_sum(false));
-+        return po.len() == 0
-+                || (po.len() == 1 && po.contain(primitive_kind::sum, 0));
-     };
- 
-     bool ok = im_d.is_blocking_desc() && om_d.is_blocking_desc()
-@@ -110,26 +170,58 @@ status_t prb_init(prb_t &p, const memory_desc_t &imd, const memory_desc_t &omd,
-             && check_post_ops(attr);
-     if (!ok) return unimplemented;
- 
--    dims_t iblocks, oblocks;
-+    dims_t iblocks, oblocks, ip_padding, op_padding;
-     im_d.compute_blocks(iblocks);
-     om_d.compute_blocks(oblocks);
-+    utils::array_set(ip_padding, 0, im_d.ndims());
-+    utils::array_set(op_padding, 0, om_d.ndims());
-+
-+    /* padding_dim consistency check
-+     * only supports inconsitent padding for src
-+     * TODO: Add inconsistent padding support for dst */
-+    int ip_tail = 0;
-+    int op_tail = 0;
-+    int iblk_w_tail = 1;
-+    int oblk_w_tail = 1;
-+    int blk_idx = 0;
- 
--    /* padding_dim consistency check */
-     for (int d = 0; d < im_d.ndims(); ++d) {
--        const auto pdim = im_d.padded_dims()[d];
--        bool ok = true && pdim == om_d.padded_dims()[d]
--                && pdim % iblocks[d] == 0 && pdim % oblocks[d] == 0;
--        if (!ok) return unimplemented;
-+        const int ip_tmp_dim = im_d.padded_dims()[d];
-+        const int op_tmp_dim = om_d.padded_dims()[d];
-+        const int ip_tmp_tail = ip_tmp_dim % oblocks[d];
-+        const int op_tmp_tail = op_tmp_dim % iblocks[d];
-+
-+        const bool pdim_consistent = ip_tmp_dim == op_tmp_dim
-+                && ip_tmp_tail == 0 && op_tmp_tail == 0;
-+        const bool pdim_tail = ip_tmp_tail > 0
-+                && (ip_tmp_dim + oblocks[d] - ip_tmp_tail) == op_tmp_dim
-+                && op_tmp_tail == 0 && ip_tail == 0;
-+        if (!pdim_consistent && !pdim_tail) return status::unimplemented;
-+        if (pdim_tail) {
-+            blk_idx = d;
-+            ip_tail = ip_tmp_tail;
-+            op_tail = op_tmp_tail;
-+            iblk_w_tail = iblocks[d];
-+            oblk_w_tail = oblocks[d];
-+            ip_padding[d] = oblocks[d] - ip_tmp_tail;
-+            op_padding[d] = iblocks[d] - op_tmp_tail;
-+        }
-     }
-+    CHECK(compute_blk_and_tail(omd, blk_idx, oblk_w_tail, ip_tail));
- 
-     layout_desc_t ild, old;
--    status_t status = cvt_mem_desc_to_layout_desc(imd, ild, iblocks);
-+    status_t status
-+            = cvt_mem_desc_to_layout_desc(imd, ild, iblocks, ip_padding);
-     if (status != success) return status;
--    status = cvt_mem_desc_to_layout_desc(omd, old, oblocks);
-+    status = cvt_mem_desc_to_layout_desc(omd, old, oblocks, op_padding);
-     if (status != success) return status;
- 
-     p.itype = ild.dt;
-     p.otype = old.dt;
-+    p.ip_tail = ip_tail;
-+    p.op_tail = op_tail;
-+    p.iblock = iblk_w_tail;
-+    p.oblock = oblk_w_tail;
- 
-     p.scale_type = attr->output_scales_.has_default_values()
-             ? scale_type_t::NONE
-@@ -156,7 +248,6 @@ status_t prb_init(prb_t &p, const memory_desc_t &imd, const memory_desc_t &omd,
- 
-     while (i_pos < ild.ndims && o_pos < old.ndims) {
-         assert(ild.id[i_pos] == old.id[o_pos]);
--        if (ild.id[i_pos] != old.id[o_pos]) return runtime_error;
- 
-         assert(ndims < max_ndims);
-         if (ndims == max_ndims) return runtime_error;
-@@ -191,7 +282,12 @@ status_t prb_init(prb_t &p, const memory_desc_t &imd, const memory_desc_t &omd,
-             ild.dims[i_pos] = factor;
-         }
-     }
-+    int blk_chunk_idx = ndims;
-+    CHECK(compute_chunk_idx(p, imd, omd, blk_idx, blk_chunk_idx));
-+
-     p.ndims = ndims;
-+    p.full_ndims = ndims;
-+    p.blk_chunk_idx = blk_chunk_idx;
- 
-     p.ioff = memory_desc_wrapper(imd).offset0();
-     p.ooff = memory_desc_wrapper(omd).offset0();
-@@ -211,8 +307,28 @@ void prb_normalize(prb_t &p) {
-                             && p.nodes[j].n < p.nodes[min_pos].n);
-             if (new_min) min_pos = j;
-         }
--        if (min_pos != d) nstl::swap(p.nodes[d], p.nodes[min_pos]);
-+        if (min_pos != d) {
-+            nstl::swap(p.nodes[d], p.nodes[min_pos]);
-+            if (p.blk_chunk_idx == min_pos || p.blk_chunk_idx == d)
-+                p.blk_chunk_idx = p.blk_chunk_idx == min_pos ? d : min_pos;
-+        }
-+    }
-+}
-+
-+status_t prb_check_blk(prb_t &p, const memory_desc_t &md_) {
-+    const auto md = memory_desc_wrapper(md_);
-+    const auto &bd = md.blocking_desc();
-+    if (p.ip_tail == 0) return status::success;
-+
-+    // Check if the inner blocks and p.nodes[blk].n in the firsti nblks
-+    // is equivalent in reverse order when has tail in block layout.
-+    const int nblk = bd.inner_nblks;
-+    for (int iblk = 0; iblk < nblk; ++iblk) {
-+        if (bd.inner_blks[nblk - iblk - 1]
-+                != static_cast<ptrdiff_t>(p.nodes[iblk].n))
-+            return status::unimplemented;
-     }
-+    return status::success;
- }
- 
- void prb_simplify(prb_t &p) {
-@@ -225,18 +341,29 @@ void prb_simplify(prb_t &p) {
-     for (int d = 0; d < p.ndims - 1; ++d) {
-         auto &this_node = p.nodes[d + 0];
-         auto &next_node = p.nodes[d + 1];
-+        const bool skip_blk_idx = (p.ip_tail > 0 || p.op_tail > 0)
-+                && (p.blk_chunk_idx == d || p.blk_chunk_idx == d + 1);
-         const bool fold = false
--                || next_node.n == (size_t)1 // trivial case, just drop next node
-+                || (next_node.n == static_cast<size_t>(1)
-+                        && !skip_blk_idx) // trivial case, just drop next node
-                 || (true // or real folding if possible
--                        && next_node.is == (ptrdiff_t)this_node.n * this_node.is
--                        && next_node.os == (ptrdiff_t)this_node.n * this_node.os
-+                        && !skip_blk_idx
-+                        && next_node.is
-+                                == static_cast<ptrdiff_t>(
-+                                        this_node.n * this_node.is)
-+                        && next_node.os
-+                                == static_cast<ptrdiff_t>(
-+                                        this_node.n * this_node.os)
-                         && next_node.ss
--                                == (ptrdiff_t)this_node.n * this_node.ss);
-+                                == static_cast<ptrdiff_t>(
-+                                        this_node.n * this_node.ss));
-         if (fold) {
-             this_node.n *= next_node.n;
-             for (int j = d + 2; j < p.ndims; ++j)
-                 p.nodes[j - 1] = p.nodes[j];
-+            if (d < p.blk_chunk_idx) --p.blk_chunk_idx;
-             --p.ndims;
-+            --p.full_ndims;
-             --d; // make another try
-         }
-     }
-@@ -251,6 +378,8 @@ void prb_node_split(prb_t &p, int dim, size_t n1) {
-     assert(p.nodes[dim].n % n1 == 0);
- 
-     p.ndims += 1;
-+    p.full_ndims += 1;
-+    if (dim < p.blk_chunk_idx) p.blk_chunk_idx += 1;
- 
-     for (int d = p.ndims; d > dim + 1; --d)
-         p.nodes[d] = p.nodes[d - 1];
diff --git a/third_party/mkl_dnn/onednn_acl_reorder_update.patch b/third_party/mkl_dnn/onednn_acl_reorder_update.patch
deleted file mode 100644
index 3ac5a62906ff4c..00000000000000
--- a/third_party/mkl_dnn/onednn_acl_reorder_update.patch
+++ /dev/null
@@ -1,4193 +0,0 @@
-From b84c533dad4db495a92fc6d390a7db5ebd938a88 Mon Sep 17 00:00:00 2001
-From: Kentaro Kawakami <kawakami.k@fujitsu.com>
-Date: Tue, 1 Nov 2022 09:33:41 +0900
-Subject: [PATCH] cpu: aarch64: reorder: support jit-ed blk_reorder
-
----
- src/cpu/aarch64/jit_generator.hpp             |   20 +
- src/cpu/aarch64/jit_uni_reorder.cpp           | 2315 +++++++++++++----
- src/cpu/aarch64/jit_uni_reorder.hpp           |  183 +-
- src/cpu/aarch64/jit_uni_reorder_utils.cpp     |  482 ++--
- .../reorder/cpu_reorder_regular_f32_f32.cpp   |    6 +
- .../reorder/cpu_reorder_regular_f32_s32.cpp   |    2 +
- .../reorder/cpu_reorder_regular_f32_s8.cpp    |    2 +
- .../reorder/cpu_reorder_regular_f32_u8.cpp    |    2 +
- src/cpu/reorder/cpu_reorder_regular_s32.cpp   |    2 +
- src/cpu/reorder/cpu_reorder_regular_s8.cpp    |    2 +
- src/cpu/reorder/cpu_reorder_regular_u8.cpp    |    2 +
- 11 files changed, 2272 insertions(+), 746 deletions(-)
-
-diff --git a/src/cpu/aarch64/jit_generator.hpp b/src/cpu/aarch64/jit_generator.hpp
-index dd781a622e1..12de9fa8c01 100644
---- a/src/cpu/aarch64/jit_generator.hpp
-+++ b/src/cpu/aarch64/jit_generator.hpp
-@@ -435,6 +435,26 @@ class jit_generator : public Xbyak_aarch64::CodeGenerator, public c_compatible {
-                 Xbyak_aarch64::ZRegD(z3.getIdx()));
-     }
- 
-+    void uni_ld1rw(const Xbyak_aarch64::VReg4S &dst,
-+            const Xbyak_aarch64::XReg &base, const int64_t off) {
-+        if (off == 0) {
-+            ld1r(dst, ptr(base));
-+        } else {
-+            add_imm(X_DEFAULT_ADDR, base, off, X_TMP_0);
-+            ld1r(dst, ptr(X_DEFAULT_ADDR));
-+        }
-+    }
-+
-+    void uni_ld1rw(const Xbyak_aarch64::ZRegS &dst,
-+            const Xbyak_aarch64::XReg &base, const int64_t off) {
-+        if (-32 <= off && off < 32) {
-+            ld1rw(dst, P_ALL_ONE / Xbyak_aarch64::T_z, ptr(base, (int)off));
-+        } else {
-+            add_imm(X_DEFAULT_ADDR, base, off, X_TMP_0);
-+            ld1rw(dst, P_ALL_ONE / Xbyak_aarch64::T_z, ptr(X_DEFAULT_ADDR));
-+        }
-+    }
-+
-     void uni_ldr(
-             const Xbyak_aarch64::VReg &dst, const Xbyak_aarch64::XReg &addr) {
-         ldr(Xbyak_aarch64::QReg(dst.getIdx()), ptr(addr));
-diff --git a/src/cpu/aarch64/jit_uni_reorder.cpp b/src/cpu/aarch64/jit_uni_reorder.cpp
-index a6cefaa20e8..a708da808c0 100644
---- a/src/cpu/aarch64/jit_uni_reorder.cpp
-+++ b/src/cpu/aarch64/jit_uni_reorder.cpp
-@@ -1,6 +1,6 @@
- /*******************************************************************************
--* Copyright 2018-2021 Intel Corporation
--* Copyright 2020-2021 FUJITSU LIMITED
-+* Copyright 2018-2022 Intel Corporation
-+* Copyright 2020-2022 FUJITSU LIMITED
- * Copyright 2022 Arm Ltd. and affiliates
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
-@@ -19,19 +19,21 @@
- #include <assert.h>
- #include <numeric>
- 
--#include "dnnl_debug.h"
-+#include "oneapi/dnnl/dnnl_debug.h"
- 
- #include "common/c_types_map.hpp"
-+#include "common/dnnl_thread.hpp"
- #include "common/memory_desc_wrapper.hpp"
- #include "common/nstl.hpp"
- #include "common/primitive.hpp"
- #include "common/type_helpers.hpp"
- #include "common/utils.hpp"
- 
--#include "cpu/aarch64/jit_uni_reorder.hpp"
- #include "cpu/cpu_primitive.hpp"
- #include "cpu/reorder/cpu_reorder_pd.hpp"
- 
-+#include "cpu/aarch64/jit_uni_reorder.hpp"
-+
- #include "cpu/aarch64/jit_generator.hpp"
- 
- // #define TR_DEBUG
-@@ -67,23 +69,6 @@ static bool prb_has_small_strides(const prb_t &prb) {
-     return true;
- }
- 
--static bool prb_tail_friendly(const prb_t &prb) {
--    /* find optimal ndims to makes it easier to
--     * identify the blk_chunk in the loop*/
--    int ndims = prb.full_ndims - prb.ndims;
--
--    int n = prb.nodes[0].is;
--    for (int d = 1; d < prb.ndims; ++d) {
--        if (d != prb.blk_chunk_idx) n *= prb.nodes[d].n;
--    }
--    if (prb.ip_tail > 0
--            && ((ndims == 0 && n != 1)
--                    || (ndims > 0 && prb.ndims > prb.blk_chunk_idx)))
--        return false;
--
--    return true;
--}
--
- /** Minimal reasonable/desirable kernel size.
-  * The constant might be used to determine how a problem should be split
-  * between kernel and threading driver. */
-@@ -96,6 +81,9 @@ struct jit_uni_reorder_kernel_f32_t : public kernel_t, public jit_generator {
-     void operator()(const call_param_t *c) const override {
-         jit_generator::operator()(c);
-     }
-+    void operator()(const tail_call_param_t *c) const override {
-+        jit_generator::operator()(c);
-+    }
- 
-     status_t create_kernel() override { return jit_generator::create_kernel(); }
- 
-@@ -105,30 +93,53 @@ struct jit_uni_reorder_kernel_f32_t : public kernel_t, public jit_generator {
-     };
- 
-     struct simple_impl_desc_t {
--        int ndims_full_unroll;
--        int len_last_dim_unroll;
--        int len_unroll;
-+        int ndims_full_unroll = 0;
-+        int len_last_dim_unroll = 0;
-+        int tail_len_unroll = 0;
-+        int len_unroll = 0;
-     };
- 
-+#define PARAM(x) \
-+    abi_param1, \
-+            prb_.is_tail_present ? offsetof(tail_call_param_t, base_params) \
-+                    + offsetof(call_param_t, x) \
-+                                 : offsetof(call_param_t, x)
-+#define TAIL_PARAM(x) abi_param1, offsetof(tail_call_param_t, x)
-+
-     static bool simple_impl_desc_init(
-             const prb_t &prb, simple_impl_desc_t *desc) {
-         const int ndims = prb.ndims;
- 
-         int ndims_full_unroll = 0;
-         int len_last_dim_unroll = 1;
-+        int tail_len_unroll = 0;
-         int len_unroll = 1;
- 
--        for (int d = 0; d < ndims; ++d) {
--            auto &node = prb.nodes[d];
--            if (len_unroll * node.n <= len_unroll_max) {
--                ndims_full_unroll++;
--                len_unroll *= node.n;
--            } else {
--                len_last_dim_unroll = len_unroll_max / len_unroll;
--                while (node.n % len_last_dim_unroll)
--                    --len_last_dim_unroll;
--                len_unroll *= len_last_dim_unroll;
--                break;
-+        // It is responsible for finding as many values
-+        // as kernel can unroll. If tail is present then
-+        // kernel will unroll only last node (possible improvement).
-+        // If there is no tail kernel can unroll a few nodes without any loops etc.
-+        // ndims_full_unroll - how many nodes will be unrolled
-+        // len_last_dim_unroll - what piece of last unrolled node will be unrolled
-+        if (prb.is_tail_present) {
-+            ndims_full_unroll = 1;
-+            len_unroll = prb.nodes[0].n;
-+            tail_len_unroll = prb.nodes[0].is_zero_pad_needed
-+                    ? 0
-+                    : static_cast<int>(prb.nodes[0].tail_size);
-+        } else {
-+            for (int d = 0; d < ndims; ++d) {
-+                const auto &node = prb.nodes[d];
-+                if (len_unroll * node.n <= len_unroll_max) {
-+                    ndims_full_unroll++;
-+                    len_unroll *= node.n;
-+                } else {
-+                    len_last_dim_unroll = len_unroll_max / len_unroll;
-+                    while (node.n % len_last_dim_unroll)
-+                        --len_last_dim_unroll;
-+                    len_unroll *= len_last_dim_unroll;
-+                    break;
-+                }
-             }
-         }
- 
-@@ -137,6 +148,7 @@ struct jit_uni_reorder_kernel_f32_t : public kernel_t, public jit_generator {
-         if (desc) {
-             desc->ndims_full_unroll = ndims_full_unroll;
-             desc->len_last_dim_unroll = len_last_dim_unroll;
-+            desc->tail_len_unroll = tail_len_unroll;
-             desc->len_unroll = len_unroll;
-         }
- 
-@@ -151,62 +163,69 @@ struct jit_uni_reorder_kernel_f32_t : public kernel_t, public jit_generator {
-                 && utils::one_of(p.otype, f32, s32, data_type::s8, u8)
-                 && utils::everyone_is(0, p.ioff, p.ooff) /* do we need this? */
-                 && utils::one_of(p.beta, 0.f, 1.f) /* anything else? */
--                && simple_impl_desc_init(p, nullptr) && prb_has_small_strides(p)
--                && prb_tail_friendly(p);
--        if (!ok) return false;
-+                && simple_impl_desc_init(p, nullptr)
-+                && prb_has_small_strides(p);
- 
--        return true;
-+        return ok;
-     }
- 
--    int n(int d) {
--        assert(d < prb_.ndims);
--        return (int)prb_.nodes[d].n;
--    }
--    int is(int d) {
--        assert(d < prb_.ndims);
--        return (int)prb_.nodes[d].is;
--    }
--    int os(int d) {
--        assert(d < prb_.ndims);
--        return (int)prb_.nodes[d].os;
-+    XReg o_addr(int o_off, bool with_type_multiplier = true) {
-+        if (o_off) {
-+            add_imm(X_DEFAULT_ADDR, x_ptr_out_off,
-+                    o_off * (with_type_multiplier ? otype_sz_ : 1), X_TMP_0);
-+            return X_DEFAULT_ADDR;
-+        }
-+
-+        return x_ptr_out_off;
-     }
--    int ss(int d) {
--        assert(d < prb_.ndims);
--        return (int)prb_.nodes[d].ss;
-+
-+    XReg c_addr(int c_off) {
-+        if (c_off) {
-+            add_imm(X_DEFAULT_ADDR, x_ptr_comp_off, c_off, X_TMP_0);
-+            return X_DEFAULT_ADDR;
-+        }
-+
-+        return x_ptr_comp_off;
-     }
- 
--    int blk_cnt() {
--        assert(prb_.blk_chunk_idx < prb_.full_ndims);
--        return (int)prb_.nodes[prb_.blk_chunk_idx].n - 1;
-+    XReg data_chunk_addr(int node_id) {
-+        add_imm(X_DEFAULT_ADDR, abi_param1,
-+                offsetof(tail_call_param_t, curr_data_chunks)
-+                        + sizeof(int64_t) * (node_id),
-+                X_TMP_0);
-+        return X_DEFAULT_ADDR;
-     }
--    int op_padding() { return prb_.op_tail ? prb_.iblock - prb_.op_tail : 0; }
--    int ip_padding() { return prb_.ip_tail ? prb_.oblock - prb_.ip_tail : 0; }
- 
-     void step(int off, int prev_i_off, int prev_o_off, int prev_s_off,
--            int &i_off, int &o_off, int &s_off, int step_size = 1) {
-+            int prev_c_off, int &i_off, int &o_off, int &s_off, int &c_off,
-+            int step_size = 1) {
-         i_off = prev_i_off;
-         o_off = prev_o_off;
-         s_off = prev_s_off;
-+        c_off = prev_c_off;
- 
-         if (off == 0) return;
- 
-         int start_dim = 0, dims_prod = 1;
-         for (; start_dim < prb_.ndims && dims_prod != step_size; ++start_dim)
--            dims_prod *= n(start_dim);
-+            dims_prod *= prb_.n(start_dim);
-         assert(start_dim < prb_.ndims);
-         off /= step_size;
- 
--        for (int d = start_dim; d < prb_.ndims; ++d) {
--            i_off += is(d);
--            o_off += os(d);
--            s_off += ss(d);
-+        for (int dim_id = start_dim; dim_id < prb_.ndims; ++dim_id) {
-+            i_off += prb_.is(dim_id);
-+            o_off += prb_.os(dim_id);
-+            s_off += prb_.ss(dim_id);
-+            c_off += prb_.cs(dim_id);
-+
-+            if (off % prb_.n(dim_id)) break;
- 
--            if (off % n(d)) break;
-+            i_off += -prb_.n(dim_id) * prb_.is(dim_id);
-+            o_off += -prb_.n(dim_id) * prb_.os(dim_id);
-+            s_off += -prb_.n(dim_id) * prb_.ss(dim_id);
-+            c_off += -prb_.n(dim_id) * prb_.cs(dim_id);
- 
--            i_off += -n(d) * is(d);
--            o_off += -n(d) * os(d);
--            s_off += -n(d) * ss(d);
--            off /= n(d);
-+            off /= prb_.n(dim_id);
- 
-             if (off == 0) break; /* FIXME: is it really required? */
-         }
-@@ -215,8 +234,8 @@ struct jit_uni_reorder_kernel_f32_t : public kernel_t, public jit_generator {
-     void step(int off, int prev_i_off, int prev_o_off, int &i_off, int &o_off,
-             int step_size = 1) {
-         int dummy = 0;
--        step(off, prev_i_off, prev_o_off, dummy, i_off, o_off, dummy,
--                step_size);
-+        step(off, prev_i_off, prev_o_off, dummy, dummy, i_off, o_off, dummy,
-+                dummy, step_size);
-     }
- 
-     void tr8x8_sve256(int i_off, int o_off) {
-@@ -278,40 +297,36 @@ struct jit_uni_reorder_kernel_f32_t : public kernel_t, public jit_generator {
-                         && interim_f32);
-         const uint64_t sveLen = get_sve_length();
- 
--        add_imm(X_TMP_0, XReg(x_ptr_in_off), i_off * itype_sz, X_DEFAULT_ADDR);
--        add_imm(X_TMP_1, X_TMP_0, is(0) * itype_sz, X_DEFAULT_ADDR);
--        add_imm(X_TMP_2, X_TMP_1, is(0) * itype_sz, X_DEFAULT_ADDR);
--        add_imm(X_TMP_3, X_TMP_2, is(0) * itype_sz, X_DEFAULT_ADDR);
--
--        if (unroll * itype_sz == 32)
--            for (uint32_t i = 0; i < 4; i++)
--                ld1w(ZRegS {i}, p_lsb_256 / T_z, ptr(x_tmp_vec[i]));
--        else if (unroll * itype_sz == 16)
--            for (uint32_t i = 0; i < 4; i++)
--                ldr(QReg {i}, ptr(x_tmp_vec[i]));
--        else if (unroll * itype_sz == 8)
--            for (uint32_t i = 0; i < 4; i++)
--                ldr(DReg {i}, ptr(x_tmp_vec[i]));
--
--        add_imm(X_TMP_0, X_TMP_3, is(0) * itype_sz, X_DEFAULT_ADDR);
--        add_imm(X_TMP_1, X_TMP_0, is(0) * itype_sz, X_DEFAULT_ADDR);
--        add_imm(X_TMP_2, X_TMP_1, is(0) * itype_sz, X_DEFAULT_ADDR);
--        add_imm(X_TMP_3, X_TMP_2, is(0) * itype_sz, X_DEFAULT_ADDR);
--
--        if (unroll * itype_sz == 32)
--            for (uint32_t i = 0; i < 4; i++)
--                ld1w(ZRegS {4 + i}, p_lsb_256 / T_z, ptr(x_tmp_vec[i]));
--        else if (unroll * itype_sz == 16)
--            for (uint32_t i = 0; i < 4; i++)
--                ldr(QReg {4 + i}, ptr(x_tmp_vec[i]));
--        else if (unroll * itype_sz == 8)
--            for (uint32_t i = 0; i < 4; i++)
--                ldr(DReg {4 + i}, ptr(x_tmp_vec[i]));
-+        PReg p_size(DUMMY_IDX);
-+        switch (unroll * itype_sz_) {
-+            case 32: p_size = p_lsb_256; break;
-+            case 16: p_size = p_lsb_128; break;
-+            case 8: p_size = p_lsb_64; break;
-+            default: assert(!"unreachable");
-+        }
-+
-+        const int node_0_input_stride = prb_.is(0);
-+        add_imm(X_TMP_0, XReg(x_ptr_in_off), itype_sz_ * i_off, X_DEFAULT_ADDR);
-+        for (int i = 1; i < unroll / 2; i++) {
-+            add_imm(x_tmp_vec[i], x_tmp_vec[i - 1],
-+                    itype_sz_ * node_0_input_stride, X_DEFAULT_ADDR);
-+        }
-+
-+        for (uint32_t i = 0; i < unroll / 2; i++)
-+            ld1w(ZRegS {i}, p_size / T_z, ptr(x_tmp_vec[i]));
-+
-+        for (int i = 0; i < unroll / 2; i++) {
-+            add_imm(x_tmp_vec[i], x_tmp_vec[(i + 3) % 4],
-+                    itype_sz_ * node_0_input_stride, X_DEFAULT_ADDR);
-+        }
-+
-+        for (uint32_t i = 0; i < unroll / 2; i++)
-+            ld1w(ZRegS {4 + i}, p_size / T_z, ptr(x_tmp_vec[i]));
- 
-         if (interim_f32) cvt2ps(0, unroll, prb_.itype);
- 
- #if 0
--        /* Deubg code */
-+        /* Debug code */
-         index(z0.s, 0, 1);
-         mov(z0.s, P_NOT_256/T_m, 0);
-         mov(z_tmp_vec[0].s, 16);
-@@ -348,9 +363,9 @@ struct jit_uni_reorder_kernel_f32_t : public kernel_t, public jit_generator {
-         for (uint32_t i = 0; i < unroll / 2; i++) {
-             ZRegB z {unroll / 2 + i};
-             ZRegB z_tmp = z_tmp_vec[unroll / 2 + i].b;
--            /* Move bit 128-255 to 0-127. */
--            ext(z, z, 16);
-             /* Move bit 0-127 to 128-255. */
-+            ext(z, z, 16);
-+            /* Move bit 128-255 to 0-127. */
-             ext(z_tmp, z_tmp, sveLen - 16);
-         }
- 
-@@ -363,65 +378,64 @@ struct jit_uni_reorder_kernel_f32_t : public kernel_t, public jit_generator {
-         }
- 
-         if (need_saturation) {
--            init_saturate_f32(ymm_zero, ymm_saturation_ubound, reg_tmp,
-+            init_saturate_f32(ymm_zero_, ymm_saturation_ubound_, reg_tmp_,
-                     interim_f32 ? f32 : prb_.itype, prb_.otype);
-             for (int i = 0; i < unroll; i++)
--                saturate_f32(ZRegS(i), ymm_zero, ymm_saturation_ubound,
--                        prb_.otype, p_all);
-+                saturate_f32(ZRegS(i), ymm_zero_, ymm_saturation_ubound_,
-+                        prb_.otype, P_ALL_ONE);
-         }
- 
-         if (prb_.otype != f32)
-             cvt2odt(0, unroll, prb_.otype, interim_f32 ? f32 : prb_.itype);
- 
--        add_imm(X_TMP_0, XReg(x_ptr_out_off), o_off * otype_sz, X_DEFAULT_ADDR);
--        add_imm(X_TMP_1, X_TMP_0, os(1) * otype_sz, X_DEFAULT_ADDR);
--        add_imm(X_TMP_2, X_TMP_1, os(1) * otype_sz, X_DEFAULT_ADDR);
--        add_imm(X_TMP_3, X_TMP_2, os(1) * otype_sz, X_DEFAULT_ADDR);
--
--        if (unroll * otype_sz == 32)
--            for (uint32_t i = 0; i < 4; i++)
--                st1w(ZRegS {i}, p_lsb_256 / T_z, ptr(x_tmp_vec[i]));
--        else if (unroll * otype_sz == 16)
--            for (uint32_t i = 0; i < 4; i++)
--                str(QReg {i}, ptr(x_tmp_vec[i]));
--        else if (unroll * otype_sz == 8)
--            for (uint32_t i = 0; i < 4; i++)
--                str(DReg {i}, ptr(x_tmp_vec[i]));
--
--        add_imm(X_TMP_0, X_TMP_3, os(1) * otype_sz, X_DEFAULT_ADDR);
--        add_imm(X_TMP_1, X_TMP_0, os(1) * otype_sz, X_DEFAULT_ADDR);
--        add_imm(X_TMP_2, X_TMP_1, os(1) * otype_sz, X_DEFAULT_ADDR);
--        add_imm(X_TMP_3, X_TMP_2, os(1) * otype_sz, X_DEFAULT_ADDR);
--
--        if (unroll * otype_sz == 32)
--            for (uint32_t i = 0; i < 4; i++)
--                st1w(ZRegS {4 + i}, p_lsb_256 / T_z, ptr(x_tmp_vec[i]));
--        else if (unroll * otype_sz == 16)
--            for (uint32_t i = 0; i < 4; i++)
--                str(QReg {4 + i}, ptr(x_tmp_vec[i]));
--        else if (unroll * otype_sz == 8)
--            for (uint32_t i = 0; i < 4; i++)
--                str(DReg {4 + i}, ptr(x_tmp_vec[i]));
-+        const int node_1_output_stride = prb_.os(1);
-+
-+        switch (unroll * otype_sz_) {
-+            case 32: p_size = p_lsb_256; break;
-+            case 16: p_size = p_lsb_128; break;
-+            case 8: p_size = p_lsb_64; break;
-+            default: assert(!"unreachable");
-+        }
-+
-+        add_imm(X_TMP_0, XReg(x_ptr_out_off), otype_sz_ * o_off,
-+                X_DEFAULT_ADDR);
-+        for (int i = 1; i < unroll / 2; i++) {
-+            add_imm(x_tmp_vec[i], x_tmp_vec[i - 1],
-+                    otype_sz_ * node_1_output_stride, X_DEFAULT_ADDR);
-+        }
-+
-+        for (uint32_t i = 0; i < 4; i++)
-+            st1w(ZRegS {i}, p_size / T_z, ptr(x_tmp_vec[i]));
-+
-+        for (int i = 0; i < unroll / 2; i++) {
-+            add_imm(x_tmp_vec[i], x_tmp_vec[(i + 3) % 4],
-+                    otype_sz_ * node_1_output_stride, X_DEFAULT_ADDR);
-+        }
-+
-+        for (uint32_t i = 0; i < unroll / 2; i++)
-+            st1w(ZRegS {4 + i}, p_size / T_z, ptr(x_tmp_vec[i]));
-     }
- 
-     bool can_do_tr8x8() {
-         using namespace data_type;
- 
--        return get_sve_length() >= Xbyak_aarch64::util::SVE_256
--                && prb_.ndims >= 2
-+        static constexpr int desirable_node_size = 8;
-+        static constexpr int desirable_stride = 1;
-+
-+        return mayiuse(sve_256) && prb_.ndims >= 2
-                 && ((utils::one_of(prb_.itype, u8, data_type::s8, s32, f32)
-                         && utils::one_of(
-                                 prb_.otype, u8, data_type::s8, s32, f32)))
--                && utils::everyone_is(8, n(0), n(1))
--                && utils::everyone_is(1, os(0), is(1))
--                && utils::everyone_is(0, prb_.ip_tail, prb_.op_tail)
-+                && utils::everyone_is(desirable_node_size, prb_.n(0), prb_.n(1))
-+                && utils::everyone_is(desirable_stride, prb_.os(0), prb_.is(1))
-+                && !prb_.is_tail_present
-                 && prb_.scale_type == scale_type_t::NONE && prb_.beta == 0.f;
-     }
- 
--    bool process_unroll_tr8x8(int len) {
-+    bool process_unroll_tr8x8(const int ndims, const int len) {
-         if (!can_do_tr8x8()) return false;
- 
--        const int step_size = n(0) * n(1);
-+        const int step_size = prb_.n(0) * prb_.n(1);
-         int i_off = 0, o_off = 0;
-         for (int off = 0; off < len; off += step_size) {
-             step(off, i_off, o_off, i_off, o_off, step_size);
-@@ -432,23 +446,56 @@ struct jit_uni_reorder_kernel_f32_t : public kernel_t, public jit_generator {
-     }
- 
-     template <cpu_isa_t isa>
--    bool process_direct_copy(int len) {
-+    bool process_direct_copy(const int ndims, const int len) {
-         using namespace data_type;
- 
--        const int simd_w = cpu_isa_traits<isa>::vlen / itype_sz;
--        bool can_do = true && mayiuse(isa)
--                && utils::everyone_is(1, os(0), is(0))
--                && (false || prb_.itype == prb_.otype
-+        static constexpr int desirable_stride = 1;
-+        using TRegS =
-+                typename utils::conditional<isa == asimd, VReg4S, ZRegS>::type;
-+        const int simd_w = cpu_isa_traits<isa>::vlen / itype_sz_;
-+
-+        // TODO: support tail_processing for direct copy
-+
-+        const bool do_src_zp = prb_.req_src_zp;
-+        const bool do_dst_zp = prb_.req_dst_zp;
-+        const bool zp_applicable = IMPLICATION(
-+                (do_src_zp || do_dst_zp), utils::one_of(prb_.itype, s32, f32));
-+        const bool can_do = true && mayiuse(isa)
-+                && compensation_needed_ == false
-+                && utils::everyone_is(desirable_stride, prb_.os(0), prb_.is(0))
-+                && (false || (prb_.itype == prb_.otype ? zp_applicable : false)
-                         || (prb_.itype == s32 && prb_.otype == f32)
-                         || (prb_.itype == f32 && prb_.otype == s32))
--                && len % simd_w == 0 && n(0) % len == 0
--                && prb_.ip_tail % simd_w == 0 && prb_.op_tail % simd_w == 0
-+                && len % simd_w == 0 && prb_.n(0) % len == 0
-+                && !prb_.is_tail_present
-                 && prb_.scale_type == scale_type_t::NONE && prb_.beta == 0.f;
-         if (!can_do) return false;
- 
-+        static constexpr int vmm_zp_last_idx = 15;
-+        const auto vmm_src_zp
-+                = TRegS(do_dst_zp ? vmm_zp_last_idx - 1 : vmm_zp_last_idx);
-+        if (do_src_zp) {
-+            uni_ld1rw(vmm_src_zp, PARAM(src_zp));
-+            uni_scvtf(vmm_src_zp, vmm_src_zp);
-+        }
-+        const auto vmm_dst_zp = TRegS(vmm_zp_last_idx);
-+        if (do_dst_zp) {
-+            uni_ld1rw(vmm_dst_zp, PARAM(dst_zp));
-+            uni_scvtf(vmm_dst_zp, vmm_dst_zp);
-+        }
-+
-+        const auto apply_zp_ps = [&](const TRegS vmm) {
-+            if (do_src_zp) fsub(vmm, vmm, vmm_src_zp);
-+            if (do_dst_zp) fadd(vmm, vmm, vmm_dst_zp);
-+        };
-+
-         for (int off = 0; off < len;) {
--            const int unroll
-+            // TODO: we need extra reg for proper saturation if otype == s32
-+            int unroll
-                     = nstl::min(16 - (prb_.otype == s32), (len - off) / simd_w);
-+            unroll = (do_src_zp || do_dst_zp)
-+                    ? nstl::min(unroll, 16 - do_src_zp - do_dst_zp)
-+                    : unroll;
- 
-             int ur = 0;
-             int tmp_ur = 0;
-@@ -458,14 +505,11 @@ struct jit_uni_reorder_kernel_f32_t : public kernel_t, public jit_generator {
- 
-                 do {
-                     add_imm(x_tmp_vec[count++], x_ptr_in_off,
--                            (off + ur * simd_w) * itype_sz, X_DEFAULT_ADDR);
-+                            (off + ur * simd_w) * itype_sz_, X_DEFAULT_ADDR);
-                     ur++;
-                 } while (ur < unroll && count < x_tmp_vec_size);
- 
-                 for (int i = 0; i < count; i++) {
--                    /*                    if (vlen == 64)
--                        ldr(ZReg(tmp_ur + i), ptr(x_tmp_vec[i]));
--                        else */
-                     if (vlen == 64 || vlen == 32)
-                         ld1w(ZRegS(tmp_ur + i), p_lsb_256 / T_z,
-                                 ptr(x_tmp_vec[i]));
-@@ -478,33 +522,28 @@ struct jit_uni_reorder_kernel_f32_t : public kernel_t, public jit_generator {
-             }
- 
-             if (prb_.itype != prb_.otype) {
--                const int vlen = cpu_isa_traits<isa>::vlen;
-                 for (int ur = 0; ur < unroll; ++ur) {
-+                    TRegS r(ur);
-                     if (prb_.itype == s32 && prb_.otype == f32) {
--                        if (vlen == 64 || vlen == 32) {
--                            ZRegS r(ur);
--                            /* MSB side 256 bits are ignored. */
--                            scvtf(r, p_all / T_m, r);
--                        } else if (vlen == 16) {
--                            VReg4S r(ur);
--                            scvtf(r, r);
--                        } else
--                            assert(!"unreachable");
-+                        uni_scvtf(r, r);
-                     } else if (prb_.itype == f32 && prb_.otype == s32) {
--                        /* Out of order can be expected. */
--                        if (vlen == 64 || vlen == 32) {
--                            ZRegS r(ur);
--                            frinti(r, p_all / T_m, r);
--                            fcvtzs(r, p_all / T_m, r);
--                        } else if (vlen == 16) {
--                            VReg4S r(ur);
--                            frinti(r, r);
--                            fcvtzs(r, r);
--                        } else
--                            assert(!"unreachable");
-+                        uni_frinti(r, r);
-+                        uni_fcvtzs(r, r);
-                     } else
-                         assert(!"unreachable");
-                 }
-+            } else if (do_src_zp || do_dst_zp) {
-+                for (int ur = 0; ur < unroll; ++ur) {
-+                    const auto vmm = TRegS(ur);
-+                    if (prb_.otype == f32) {
-+                        apply_zp_ps(vmm);
-+                    } else if (prb_.otype == s32) {
-+                        uni_scvtf(vmm, vmm);
-+                        apply_zp_ps(vmm);
-+                        uni_frinti(vmm, vmm);
-+                        uni_fcvtzs(vmm, vmm);
-+                    }
-+                }
-             }
- 
-             ur = 0;
-@@ -515,7 +554,7 @@ struct jit_uni_reorder_kernel_f32_t : public kernel_t, public jit_generator {
- 
-                 do {
-                     add_imm(x_tmp_vec[count++], x_ptr_out_off,
--                            (off + ur * simd_w) * otype_sz, X_DEFAULT_ADDR);
-+                            (off + ur * simd_w) * otype_sz_, X_DEFAULT_ADDR);
-                     ur++;
-                 } while (ur < unroll && count < x_tmp_vec_size);
- 
-@@ -538,8 +577,8 @@ struct jit_uni_reorder_kernel_f32_t : public kernel_t, public jit_generator {
-     }
- 
-     void process_unroll_generic_step(int reg_unroll, const int *i_off,
--            const int *o_off, const int *s_off, const int *ip_padding,
--            const bool h_padded) {
-+            const int *o_off, const int *s_off, const int *c_off,
-+            const int *zero_padding, const bool tail_processing) {
-         using namespace data_type;
- 
-         auto cvt2ps
-@@ -588,76 +627,84 @@ struct jit_uni_reorder_kernel_f32_t : public kernel_t, public jit_generator {
-             }
-         };
- 
-+        auto load_bytes_addr = [=](const int ur, const int r) {
-+            add_imm(x_tmp_vec[r], x_ptr_in_off, i_off[ur + r] * itype_sz_,
-+                    X_DEFAULT_ADDR);
-+        };
-+        auto load_bytes = [=](const int ur, int size, int r) {
-+            switch (size) {
-+                case 4: ld1(VReg4S(ur)[r], ptr(x_tmp_vec[r])); break;
-+                case 2: ld1(VReg8H(ur)[r], ptr(x_tmp_vec[r])); break;
-+                case 1: ld1(VReg16B(ur)[r], ptr(x_tmp_vec[r])); break;
-+                default: assert(!"unreachable");
-+            }
-+        };
-+
-+        auto store = [=](const XReg &addr, const VReg ymm, int size) {
-+            const uint32_t xmm = ymm.getIdx();
-+            switch (size) {
-+                case 16: str(QReg(xmm), ptr(addr)); break;
-+                case 8: str(DReg(xmm), ptr(addr)); break;
-+                case 4: str(SReg(xmm), ptr(addr)); break;
-+                case 2: str(HReg(xmm), ptr(addr)); break;
-+                case 1: str(BReg(xmm), ptr(addr)); break;
-+                default: assert(!"unreachable");
-+            }
-+        };
-+
-         /* check whether loading 4 values at once is possible */
--        bool can_load_xmm = reg_unroll % 4 == 0;
-+        static constexpr int xmm_vlen = 4;
-+        bool can_load_xmm = reg_unroll % xmm_vlen == 0;
-         for (int ur = 1; ur < reg_unroll; ++ur)
--            if (i_off[ur] != i_off[ur - 1] + 1) can_load_xmm = false;
--        const int load_step = can_load_xmm ? 4 : 1;
-+            if (i_off[ur] != i_off[ur - 1] + 1) {
-+                can_load_xmm = false;
-+                break;
-+            }
-+        const int load_step = can_load_xmm ? xmm_vlen : 1;
- 
-         /* check whether storing 4 values at once is possible */
--        bool can_store_xmm = reg_unroll % 4 == 0;
-+        bool can_store_xmm = reg_unroll % xmm_vlen == 0;
-         for (int ur = 1; ur < reg_unroll; ++ur)
--            if (o_off[ur] != o_off[ur - 1] + 1) can_store_xmm = false;
-+            if (o_off[ur] != o_off[ur - 1] + 1) {
-+                can_store_xmm = false;
-+                break;
-+            }
-         const int ur_step = can_store_xmm ? 4 : 1;
-         const int load_tail_step
-                 = !can_load_xmm && can_store_xmm ? ur_step : load_step;
- 
--        const bool interim_f32 = false
--                || utils::one_of(f32, prb_.itype, prb_.otype)
--                || prb_.scale_type != scale_type_t::NONE || prb_.beta != 0.f;
-+        const bool interim_f32 = interim_f32_needed();
- 
-         const bool need_saturation
-                 = (utils::one_of(prb_.otype, u8, data_type::s8, s32)
-                         && interim_f32);
--        if (h_padded) {
-+
-+        std::vector<int> store_masks;
-+        if (tail_processing) {
-             for (int ur = 0; ur < reg_unroll; ur += load_tail_step) {
--                if (itype_sz == 4)
--                    movi(VReg4S(ur), 0);
--                else if (itype_sz == 2)
--                    movi(VReg8H(ur), 0);
--                else
--                    movi(VReg16B(ur), 0);
--                /* x_tmp_vec = X_TMP_0 - X_TMP_4
--                 Do not use X_TMP_? as the last arg. */
--                for (int r = 0; r < load_tail_step; ++r) {
--                    if (ip_padding[ur + r] == 0) {
--                        add_imm(x_tmp_vec[r], x_ptr_in_off,
--                                i_off[ur + r] * itype_sz, X_DEFAULT_ADDR);
--                    }
--                }
-+                uni_clear(VReg(ur));
-+                store_masks.push_back(0);
- 
-                 for (int r = 0; r < load_tail_step; ++r) {
--                    if (ip_padding[ur + r] == 0) {
--                        if (itype_sz == 4)
--                            ld1(VReg4S(ur)[r], ptr(x_tmp_vec[r]));
--                        else if (itype_sz == 2)
--                            ld1(VReg8H(ur)[r], ptr(x_tmp_vec[r]));
--                        else
--                            ld1(VReg16B(ur)[r], ptr(x_tmp_vec[r]));
-+                    if (zero_padding[ur + r] == 0) {
-+                        store_masks.back() += 1 << r;
-+                        load_bytes_addr(ur, r);
-                     }
-                 }
-+
-+                for (int r = 0; r < load_tail_step; ++r)
-+                    if (zero_padding[ur + r] == 0) load_bytes(ur, itype_sz_, r);
-             }
-         } else {
-             if (!can_load_xmm && can_store_xmm) {
--                assert(ur_step == 4);
-+                assert(ur_step == xmm_vlen);
-                 /* load with stride */
-                 for (int ur = 0; ur < reg_unroll; ur += ur_step) {
--
--                    /* x_tmp_vec = X_TMP_0 - X_TMP_4
--                 Do not use X_TMP_? as the last arg. */
-                     for (int r = 0; r < ur_step; ++r) {
--                        add_imm(x_tmp_vec[r], x_ptr_in_off,
--                                i_off[ur + r] * itype_sz, X_DEFAULT_ADDR);
--                    }
--
--                    for (int r = 0; r < ur_step; ++r) {
--                        if (itype_sz == 4)
--                            ld1(VReg4S(ur)[r], ptr(x_tmp_vec[r]));
--                        else if (itype_sz == 2)
--                            ld1(VReg8H(ur)[r], ptr(x_tmp_vec[r]));
--                        else
--                            ld1(VReg16B(ur)[r], ptr(x_tmp_vec[r]));
-+                        load_bytes_addr(ur, r);
-                     }
-+                    for (int r = 0; r < ur_step; ++r)
-+                        load_bytes(ur, itype_sz_, r);
-                 }
-             } else {
-                 int ur = 0;
-@@ -667,13 +714,13 @@ struct jit_uni_reorder_kernel_f32_t : public kernel_t, public jit_generator {
- 
-                     do {
-                         add_imm(x_tmp_vec[count++], x_ptr_in_off,
--                                i_off[ur] * itype_sz, X_DEFAULT_ADDR);
-+                                i_off[ur] * itype_sz_, X_DEFAULT_ADDR);
-                         ur += load_step;
-                     } while (ur < reg_unroll && count < x_tmp_vec_size);
- 
-                     for (int i = 0; i < count; i++) {
- 
--                        switch (load_step * itype_sz) {
-+                        switch (load_step * itype_sz_) {
-                             case 16:
-                                 ldr(QReg(tmp_ur), ptr(x_tmp_vec[i]));
-                                 break;
-@@ -688,6 +735,7 @@ struct jit_uni_reorder_kernel_f32_t : public kernel_t, public jit_generator {
-                 }
-             }
-         }
-+
-         /* xmm[:] <-- (f32)xmm[:] */
-         if (interim_f32) {
-             const int cvt_step = nstl::max(load_step, ur_step);
-@@ -702,30 +750,32 @@ struct jit_uni_reorder_kernel_f32_t : public kernel_t, public jit_generator {
-             if (fast_return) {
-                 if (prb_.scale_type == scale_type_t::COMMON)
-                     for (int ur = 0; ur < reg_unroll; ur += load_step)
--                        fmul(VReg4S(ur), VReg4S(ur), xmm_scale);
-+                        fmul(VReg4S(ur), VReg4S(ur), xmm_scale_);
-                 if (prb_.otype != f32) {
--                    init_saturate_f32(xmm_zero, xmm_saturation_ubound, reg_tmp,
--                            interim_f32 ? f32 : prb_.itype, prb_.otype);
--                    for (int ur = 0; ur < reg_unroll; ur += load_step)
-+                    init_saturate_f32(xmm_zero_, xmm_saturation_ubound_,
-+                            reg_tmp_, interim_f32 ? f32 : prb_.itype,
-+                            prb_.otype);
-+                    for (int ur = 0; ur < reg_unroll; ur += load_step) {
-                         if (need_saturation)
--                            saturate_f32(VReg4S(ur), xmm_zero,
--                                    xmm_saturation_ubound, prb_.otype, p_all);
-+                            saturate_f32(VReg4S(ur), xmm_zero_,
-+                                    xmm_saturation_ubound_, prb_.otype,
-+                                    P_ALL_ONE);
-+                    }
- 
-                     for (int ur = 0; ur < reg_unroll; ur += load_step)
-                         cvt2odt(ur, 1, prb_.otype,
-                                 interim_f32 ? f32 : prb_.itype);
-                 }
--                /* load_step is 1 or 4. */
-                 for (int ur = 0; ur < reg_unroll; ur += load_step) {
-                     for (int r = 0; r < load_step; ++r) {
-                         add_imm(x_tmp_vec[r], x_ptr_out_off,
--                                o_off[ur + r] * otype_sz, X_DEFAULT_ADDR);
-+                                o_off[ur + r] * otype_sz_, X_DEFAULT_ADDR);
-                     }
- 
-                     for (int r = 0; r < load_step; ++r) {
--                        if (otype_sz == 4)
-+                        if (otype_sz_ == 4)
-                             st1(VReg4S(ur)[r], ptr(x_tmp_vec[r]));
--                        else if (otype_sz == 2)
-+                        else if (otype_sz_ == 2)
-                             st1(VReg8H(ur)[r], ptr(x_tmp_vec[r]));
-                         else
-                             st1(VReg16B(ur)[r], ptr(x_tmp_vec[r]));
-@@ -735,7 +785,7 @@ struct jit_uni_reorder_kernel_f32_t : public kernel_t, public jit_generator {
-             }
- 
-             /* scatter elements of xmm into 4 xmms */
--            if (itype_sz == 4 || interim_f32) {
-+            if (itype_sz_ == 4 || interim_f32) {
-                 for (int ur = 0; ur < reg_unroll; ur += load_step)
-                     for (int r = 1; r < load_step; ++r) {
-                         VReg4S v(ur);
-@@ -747,7 +797,18 @@ struct jit_uni_reorder_kernel_f32_t : public kernel_t, public jit_generator {
-                 for (int ur = 0; ur < reg_unroll; ur += load_step)
-                     for (int r = 1; r < load_step; ++r)
-                         ext(VReg16B(ur + r), VReg16B(ur), VReg16B(ur),
--                                itype_sz * r);
-+                                itype_sz_ * r);
-+            }
-+        }
-+
-+        /* src zero point application */
-+        if (prb_.req_src_zp) {
-+            for (int ur = 0; ur < reg_unroll; ur += ur_step) {
-+                const auto xmm = VReg4S(ur);
-+                if (interim_f32)
-+                    fsub(xmm, xmm, xmm_src_zp_);
-+                else
-+                    sub(xmm, xmm, xmm_src_zp_);
-             }
-         }
- 
-@@ -756,7 +817,7 @@ struct jit_uni_reorder_kernel_f32_t : public kernel_t, public jit_generator {
-             /* xmm <-- scale * xmm[:] */
-             if (prb_.scale_type == scale_type_t::COMMON) {
-                 for (int ur = 0; ur < reg_unroll; ur += ur_step)
--                    fmul(VReg4S(ur), VReg4S(ur), xmm_scale);
-+                    fmul(VReg4S(ur), VReg4S(ur), xmm_scale_);
-             } else if (prb_.scale_type == scale_type_t::MANY) {
-                 enum class scale_load_type_t { bcast, load, gather };
- 
-@@ -769,13 +830,12 @@ struct jit_uni_reorder_kernel_f32_t : public kernel_t, public jit_generator {
-                             scale_load_type = scale_load_type_t::load;
- 
-                     if (scale_load_type == scale_load_type_t::bcast
--                            && !h_padded) {
--                        VReg4S v(xmm_scale.getIdx());
-+                            && !tail_processing) {
-+                        VReg4S v(xmm_scale_.getIdx());
-                         VReg4S v_dst(ur);
--                        add_imm(X_TMP_0, x_ptr_scale_off, s_off[ur] * stype_sz,
-+                        add_imm(X_TMP_0, x_ptr_scale_off, s_off[ur] * stype_sz_,
-                                 X_DEFAULT_ADDR);
--                        ldr(W_TMP_0, ptr(X_TMP_0));
--                        dup(v, W_TMP_0);
-+                        ld1r(v, ptr(X_TMP_0));
-                         fmul(v_dst, v_dst, v);
-                         continue;
-                     }
-@@ -786,10 +846,10 @@ struct jit_uni_reorder_kernel_f32_t : public kernel_t, public jit_generator {
-                             scale_load_type = scale_load_type_t::gather;
- 
-                     if (scale_load_type == scale_load_type_t::load
--                            && !h_padded) {
--                        uint32_t idx = xmm_scale.getIdx();
-+                            && !tail_processing) {
-+                        uint32_t idx = xmm_scale_.getIdx();
-                         VReg4S v_dst(ur);
--                        add_imm(X_TMP_0, x_ptr_scale_off, s_off[ur] * stype_sz,
-+                        add_imm(X_TMP_0, x_ptr_scale_off, s_off[ur] * stype_sz_,
-                                 X_DEFAULT_ADDR);
- 
-                         ldr(QReg {idx}, ptr(X_TMP_0));
-@@ -799,22 +859,15 @@ struct jit_uni_reorder_kernel_f32_t : public kernel_t, public jit_generator {
- 
-                     // load doesn't work as well
-                     // so gather the scale factors one by one
--                    /*ur_step is 1 or 4. */
--                    for (int r = ur; r < ur + ur_step; ++r) {
--                        if (ip_padding[r] == 0 || !h_padded) {
--                            /* x_tmp_vec = X_TMP_0 - X_TMP_4
--                         Do not use X_TMP_? as the last arg. */
-+                    for (int r = ur; r < ur + ur_step; ++r)
-+                        if (zero_padding[r] == 0 || !tail_processing) {
-                             add_imm(x_tmp_vec[r - ur], x_ptr_scale_off,
--                                    s_off[r] * stype_sz, X_DEFAULT_ADDR);
--                        }
--                    }
--                    for (int r = ur; r < ur + ur_step; ++r) {
--                        if (ip_padding[r] == 0 || !h_padded) {
--                            VReg4S v(xmm_scale.getIdx());
--                            ld1(v[r - ur], ptr(x_tmp_vec[r - ur]));
-+                                    s_off[r] * stype_sz_, X_DEFAULT_ADDR);
-                         }
--                    }
--                    fmul(VReg4S(ur), VReg4S(ur), xmm_scale);
-+                    for (int r = ur; r < ur + ur_step; ++r)
-+                        if (zero_padding[r] == 0 || !tail_processing)
-+                            ld1(xmm_scale_[r - ur], ptr(x_tmp_vec[r - ur]));
-+                    fmul(VReg4S(ur), VReg4S(ur), xmm_scale_);
-                 }
-             }
- 
-@@ -829,7 +882,7 @@ struct jit_uni_reorder_kernel_f32_t : public kernel_t, public jit_generator {
- 
-                     do {
-                         add_imm(x_tmp_vec[count++], x_ptr_out_off,
--                                o_off[ur] * otype_sz, X_DEFAULT_ADDR);
-+                                o_off[ur] * otype_sz_, X_DEFAULT_ADDR);
-                         ur += ur_step;
-                     } while (ur < reg_unroll && count < x_tmp_vec_size);
- 
-@@ -873,7 +926,7 @@ struct jit_uni_reorder_kernel_f32_t : public kernel_t, public jit_generator {
-             if (prb_.scale_type == scale_type_t::COMMON) {
-                 for (int ur = 0; ur < reg_unroll; ur += ur_step) {
-                     VReg4S tmp(ur);
--                    fmul(tmp, tmp, VReg4S(xmm_scale.getIdx()));
-+                    fmul(tmp, tmp, VReg4S(xmm_scale_.getIdx()));
-                 }
-             } else if (prb_.scale_type == scale_type_t::MANY) {
-                 int ur = 0;
-@@ -883,7 +936,7 @@ struct jit_uni_reorder_kernel_f32_t : public kernel_t, public jit_generator {
- 
-                     do {
-                         add_imm(x_tmp_vec[count++], x_ptr_scale_off,
--                                s_off[ur] * stype_sz, X_DEFAULT_ADDR);
-+                                s_off[ur] * stype_sz_, X_DEFAULT_ADDR);
-                         ur += ur_step;
-                     } while (ur < reg_unroll && count < x_tmp_vec_size);
- 
-@@ -908,7 +961,7 @@ struct jit_uni_reorder_kernel_f32_t : public kernel_t, public jit_generator {
- 
-                     do {
-                         add_imm(x_tmp_vec[count++], x_ptr_out_off,
--                                o_off[ur] * otype_sz, X_DEFAULT_ADDR);
-+                                o_off[ur] * otype_sz_, X_DEFAULT_ADDR);
-                         ur += ur_step;
-                     } while (ur < reg_unroll && count < (x_tmp_vec_size / 2));
- 
-@@ -951,94 +1004,272 @@ struct jit_uni_reorder_kernel_f32_t : public kernel_t, public jit_generator {
-             }
-         }
- 
--        if (need_saturation) {
--            init_saturate_f32(
--                    xmm_zero, xmm_saturation_ubound, reg_tmp, f32, prb_.otype);
-+        /* dst zero point application */
-+        if (prb_.req_dst_zp) {
-             for (int ur = 0; ur < reg_unroll; ur += ur_step) {
--                saturate_f32(VReg4S(ur), xmm_zero, xmm_saturation_ubound,
--                        prb_.otype, p_all);
-+                const auto xmm = VReg4S(ur);
-+                if (interim_f32)
-+                    fadd(xmm, xmm, xmm_dst_zp_);
-+                else
-+                    add(xmm, xmm, xmm_dst_zp_);
-             }
-         }
- 
--        for (int ur = 0; ur < reg_unroll; ur += ur_step) {
--            if (prb_.otype != f32)
--                cvt2odt(ur, 1, prb_.otype, interim_f32 ? f32 : prb_.itype);
-+        /* adjust scale application */
-+        if (prb_.scale_adjust != 1.f) {
-+            dup(xmm_tmp_, reg_scale_adjust_);
-+            for (int ur = 0; ur < reg_unroll; ur += ur_step) {
-+                fmul(VReg4S(ur), VReg4S(ur), xmm_tmp_);
-+            }
-+        }
-+
-+        if (need_saturation) {
-+            init_saturate_f32(xmm_zero_, xmm_saturation_ubound_, reg_tmp_, f32,
-+                    prb_.otype);
-+            for (int ur = 0; ur < reg_unroll; ur += ur_step) {
-+                saturate_f32(VReg4S(ur), xmm_zero_, xmm_saturation_ubound_,
-+                        prb_.otype, P_ALL_ONE);
-+            }
-+
-+            // reset back xmm_zero_ if needed.
-+            if (compensation_needed_ && (prb_.req_src_zp || prb_.req_dst_zp))
-+                uni_clear(VReg(xmm_zero_.getIdx()));
-         }
- 
--        int ur = 0;
--        int tmp_ur = 0;
--        while (ur < reg_unroll) {
--            int count = 0;
-+        if (compensation_needed_) {
-+            const uint32_t xmm_begin = 9;
-+            const uint32_t xmm_end = 11;
-+            uint32_t xmm_id = xmm_begin;
-+            const auto get_temp_xmm = [&] {
-+                const Xbyak_aarch64::VReg temp {xmm_id++};
-+
-+                if (xmm_id > xmm_end) { xmm_id = xmm_begin; }
-+
-+                return temp;
-+            };
-+            if (can_store_xmm) {
-+                enum class comp_load_type_t { bcast, load, gather };
-+
-+                for (int ur = 0; ur < reg_unroll; ur += ur_step) {
-+
-+                    bool all_ip_padding_one = true;
-+                    bool all_ip_padding_zero = true;
-+                    for (int r = ur; r < ur + ur_step; r++) {
-+                        if (zero_padding[r] != 1)
-+                            all_ip_padding_one = false;
-+                        else
-+                            all_ip_padding_zero = false;
-+                    }
-+                    if (all_ip_padding_one) continue;
-+
-+                    comp_load_type_t comp_load_type = comp_load_type_t::bcast;
-+
-+                    for (int r = ur + 1; r < ur + ur_step; ++r)
-+                        if (c_off[r] != c_off[r - 1] + 0) {
-+                            comp_load_type = comp_load_type_t::load;
-+                            break;
-+                        }
- 
--            do {
--                add_imm(x_tmp_vec[count++], x_ptr_out_off, o_off[ur] * otype_sz,
--                        X_DEFAULT_ADDR);
--                ur += ur_step;
--            } while (ur < reg_unroll && count < x_tmp_vec_size);
-+                    if (comp_load_type == comp_load_type_t::bcast
-+                            && all_ip_padding_zero) {
-+                        const auto reduction_xmm = get_temp_xmm().s4;
-+                        const auto xmm_reorder_result = VReg4S(ur);
-+                        frinti(reduction_xmm, xmm_reorder_result);
-+                        addv(SReg(reduction_xmm.getIdx()), reduction_xmm);
-+                        const auto comp_addr = c_addr(c_off[ur]);
-+                        const auto xmm_tmp_ = get_temp_xmm().s4;
-+                        ldr(SReg(xmm_tmp_.getIdx()), ptr(comp_addr));
-+                        add(xmm_tmp_, xmm_tmp_, reduction_xmm);
-+                        str(SReg(xmm_tmp_.getIdx()), ptr(comp_addr));
-+                        continue;
-+                    }
-+
-+                    if (comp_load_type == comp_load_type_t::load)
-+                        for (int r = ur + 1; r < ur + ur_step; ++r)
-+                            if (c_off[r] != c_off[r - 1] + 1) {
-+                                comp_load_type = comp_load_type_t::gather;
-+                                break;
-+                            }
-+
-+                    if (comp_load_type == comp_load_type_t::load
-+                            && all_ip_padding_zero) {
-+                        const auto xmm_reorder_result_dq = get_temp_xmm().s4;
-+                        const auto xmm_reorder_result = VReg4S(ur);
-+                        const auto comp_addr = c_addr(c_off[ur]);
-+                        frinti(xmm_reorder_result_dq, xmm_reorder_result);
-+                        const auto xmm_tmp_ = get_temp_xmm().s4;
-+                        ldr(SReg(xmm_tmp_.getIdx()), ptr(comp_addr));
-+                        add(xmm_reorder_result_dq, xmm_reorder_result_dq,
-+                                xmm_tmp_);
-+                        str(SReg(xmm_tmp_.getIdx()), ptr(comp_addr));
-+                        continue;
-+                    }
- 
--            for (int i = 0; i < count; i++) {
-+                    const auto xmm_reorder_result_dq = get_temp_xmm().s4;
-+                    const auto xmm_reorder_result = VReg4S(ur);
-+                    frinti(xmm_reorder_result_dq, xmm_reorder_result);
- 
--                switch (ur_step * otype_sz) {
--                    case 16: str(QReg(tmp_ur), ptr(x_tmp_vec[i])); break;
--                    case 8: str(DReg(tmp_ur), ptr(x_tmp_vec[i])); break;
--                    case 4: str(SReg(tmp_ur), ptr(x_tmp_vec[i])); break;
--                    case 2: str(HReg(tmp_ur), ptr(x_tmp_vec[i])); break;
--                    case 1: str(BReg(tmp_ur), ptr(x_tmp_vec[i])); break;
--                    default: assert(!"unreachable");
-+                    for (int r = ur; r < ur + ur_step; ++r) {
-+                        if (zero_padding[r] == 0 || !tail_processing) {
-+                            mov(W_TMP_0, xmm_reorder_result_dq[r]);
-+                            const auto comp_addr = c_addr(c_off[ur]);
-+                            str(W_TMP_0, ptr(comp_addr));
-+                        }
-+                    }
-+                }
-+            } else {
-+                for (int ur = 0; ur < reg_unroll; ur += ur_step) {
-+                    if (zero_padding[ur] == 0 || !tail_processing) {
-+                        const auto xmm_reorder_result_dq = get_temp_xmm().s4;
-+                        const auto xmm_reorder_result = VReg4S(ur);
-+                        const auto comp_addr = c_addr(c_off[ur]);
-+                        frinti(xmm_reorder_result_dq, xmm_reorder_result);
-+                        const auto xmm_tmp_ = get_temp_xmm().s4;
-+                        ldr(SReg(xmm_tmp_.getIdx()), ptr(comp_addr));
-+                        add(xmm_reorder_result_dq, xmm_reorder_result_dq,
-+                                xmm_tmp_);
-+                        str(SReg(xmm_tmp_.getIdx()), ptr(comp_addr));
-+                    }
-                 }
--                tmp_ur += ur_step;
-             }
-         }
-+
-+        for (int ur = 0; ur < reg_unroll; ur += ur_step) {
-+            if (prb_.req_src_zp || prb_.req_dst_zp) {
-+                const bool use_store_masks = !store_masks.empty();
-+                if (use_store_masks) {
-+                    const auto mask = (~store_masks[ur / ur_step]) & 0xF;
-+                    switch (mask) {
-+                        case 0x0:
-+                            /* Do nothing */
-+                            break;
-+                        case 0x1: ins(VReg4S(ur)[0], xmm_zero_[0]); break;
-+                        case 0x2: ins(VReg4S(ur)[1], xmm_zero_[1]); break;
-+                        case 0x3:
-+                            ins(VReg2D(ur)[0], VReg2D(xmm_zero_.getIdx())[0]);
-+                            break;
-+                        case 0x4: ins(VReg4S(ur)[2], xmm_zero_[2]); break;
-+                        case 0x5:
-+                            ins(VReg4S(ur)[0], xmm_zero_[0]);
-+                            ins(VReg4S(ur)[2], xmm_zero_[2]);
-+                            break;
-+                        case 0x6:
-+                            ins(VReg4S(ur)[1], xmm_zero_[1]);
-+                            ins(VReg4S(ur)[2], xmm_zero_[2]);
-+                            break;
-+                        case 0x7:
-+                            ins(VReg2D(ur)[0], VReg2D(xmm_zero_.getIdx())[0]);
-+                            ins(VReg4S(ur)[2], xmm_zero_[2]);
-+                            break;
-+                        case 0x8: ins(VReg4S(ur)[3], xmm_zero_[3]); break;
-+                        case 0x9:
-+                            ins(VReg4S(ur)[0], xmm_zero_[0]);
-+                            ins(VReg4S(ur)[3], xmm_zero_[3]);
-+                            break;
-+                        case 0xa:
-+                            ins(VReg4S(ur)[1], xmm_zero_[1]);
-+                            ins(VReg4S(ur)[3], xmm_zero_[3]);
-+                            break;
-+                        case 0xb:
-+                            ins(VReg2D(ur)[0], VReg2D(xmm_zero_.getIdx())[0]);
-+                            ins(VReg4S(ur)[3], xmm_zero_[3]);
-+                            break;
-+                        case 0xc:
-+                            ins(VReg2D(ur)[1], VReg2D(xmm_zero_.getIdx())[1]);
-+                            break;
-+                        case 0xd:
-+                            ins(VReg4S(ur)[0], xmm_zero_[0]);
-+                            ins(VReg2D(ur)[1], VReg2D(xmm_zero_.getIdx())[1]);
-+                            break;
-+                        case 0xe:
-+                            ins(VReg4S(ur)[1], xmm_zero_[1]);
-+                            ins(VReg2D(ur)[1], VReg2D(xmm_zero_.getIdx())[1]);
-+                            break;
-+                        case 0xf: movi(VReg16B(ur), 0); break;
-+                        default: assert(!"unreachable");
-+                    }
-+                }
-+            }
-+            if (prb_.otype != f32)
-+                cvt2odt(ur, 1, prb_.otype, interim_f32 ? f32 : prb_.itype);
-+
-+            store(o_addr(o_off[ur]), VReg(ur), ur_step * otype_sz_);
-+        }
-     }
- 
--    void comp_padding_flag(int ndims, int off, int len, int &i_tail) {
--        const int ip_without_padding
--                = ndims == 0 ? len - ip_padding() : prb_.ip_tail;
--        if ((ndims == 0 && off >= ip_without_padding)
--                || (ndims > 0 && (off % prb_.oblock) >= ip_without_padding))
--            i_tail = 1;
-+    bool interim_f32_needed() {
-+        using namespace data_type;
-+
-+        return utils::one_of(f32, prb_.itype, prb_.otype)
-+                || prb_.scale_type != scale_type_t::NONE || prb_.beta != 0.f
-+                || ((prb_.req_src_zp || prb_.req_dst_zp)
-+                                ? !(prb_.itype == s32 && prb_.otype == s32)
-+                                : false)
-+                || (prb_.itype != f32 && compensation_needed_)
-+                || prb_.scale_adjust != 1.f;
-     }
- 
--    void process_unroll_generic(const int ndims, int len, const bool h_padded) {
-+    void process_unroll_generic(
-+            const int ndims, int len, const bool tail_processing) {
-+        assert(IMPLICATION(prb_.nodes[0].tail_size > 0,
-+                len == static_cast<int>(prb_.nodes[0].n)
-+                        || len == static_cast<int>(prb_.nodes[0].tail_size)));
-+
-         const int blk = 8;
- 
-         int i_off[2 * blk] = {0};
-         int o_off[2 * blk] = {0};
-         int s_off[2 * blk] = {0};
-+        int c_off[2 * blk] = {0};
- 
-         int curr = 0; // will switch between 0 and 1
- 
-+        const bool interim_f32 = interim_f32_needed();
-+
-+        if (prb_.req_src_zp) {
-+            add_imm(X_DEFAULT_ADDR, PARAM(src_zp), X_TMP_0);
-+            ld1r(xmm_src_zp_, ptr(X_DEFAULT_ADDR));
-+            if (interim_f32) scvtf(xmm_src_zp_, xmm_src_zp_);
-+        }
-+        if (prb_.req_dst_zp) {
-+            add_imm(X_DEFAULT_ADDR, PARAM(dst_zp), X_TMP_0);
-+            ld1r(xmm_dst_zp_, ptr(X_DEFAULT_ADDR));
-+            if (interim_f32) scvtf(xmm_dst_zp_, xmm_dst_zp_);
-+        }
-+
-         for (int off = 0; off < len; off += blk) {
-             const int reg_unroll = nstl::min(off + blk, len) - off;
--            int ip_padding[blk] = {0};
-+            int zero_padding[blk] = {0};
-+            const auto curr_blk = curr * blk;
- 
-             /* compute offsets and tail*/
-             for (int ur = off != 0 ? 0 : 1; ur < reg_unroll; ++ur) {
--                const int ur_c = curr * blk + ur;
-+                const int ur_c = curr_blk + ur;
-                 const int ur_p = (ur_c - 1 + 2 * blk) % (2 * blk); // prev ur
-+                const bool is_tail
-+                        = off + ur >= static_cast<int>(prb_.nodes[0].tail_size);
-                 step(off + ur, i_off[ur_p], o_off[ur_p], s_off[ur_p],
--                        i_off[ur_c], o_off[ur_c], s_off[ur_c]);
--                if (h_padded)
--                    comp_padding_flag(ndims, off + ur, len, ip_padding[ur]);
-+                        c_off[ur_p], i_off[ur_c], o_off[ur_c], s_off[ur_c],
-+                        c_off[ur_c]);
-+                if (tail_processing && is_tail) zero_padding[ur] = 1;
-             }
--            process_unroll_generic_step(reg_unroll, i_off + curr * blk,
--                    o_off + curr * blk, s_off + curr * blk, ip_padding,
--                    h_padded);
-+
-+            process_unroll_generic_step(reg_unroll, i_off + curr_blk,
-+                    o_off + curr_blk, s_off + curr_blk, c_off + curr_blk,
-+                    zero_padding, tail_processing);
- 
-             curr = 1 - curr;
-         }
-     }
- 
-     void compute_ker(
--            const int ndims, const int len_unroll, const bool h_padded) {
-+            const int ndims, const int len_unroll, const bool tail_processing) {
-         bool optimized = false;
--        optimized = optimized
--                || (process_direct_copy<sve_256>(len_unroll) && !h_padded);
--        optimized = optimized
--                || (process_direct_copy<asimd>(len_unroll) && !h_padded);
--        optimized
--                = optimized || (process_unroll_tr8x8(len_unroll) && !h_padded);
--        if (!optimized) process_unroll_generic(ndims, len_unroll, h_padded);
-+        optimized = optimized || process_direct_copy<sve_256>(ndims, len_unroll)
-+                || process_direct_copy<asimd>(ndims, len_unroll)
-+                || process_unroll_tr8x8(ndims, len_unroll);
-+        if (!optimized)
-+            process_unroll_generic(ndims, len_unroll, tail_processing);
-     }
- 
-     void loop_begin(Label &l, XReg reg_cnt, int len) {
-@@ -1046,97 +1277,287 @@ struct jit_uni_reorder_kernel_f32_t : public kernel_t, public jit_generator {
-         L(l);
-     }
- 
-+    void check_if_this_is_last_chunk(const XReg reg_curr_chunk, int node_id) {
-+        // Chunks are backwards numered i.e:
-+        // [0] -> [node_size]
-+        // [1] -> [node_size - 1]
-+        // ...
-+        // [node_size - 1] -> [1]
-+
-+        // It is done like this, because it is easier to decrement counter
-+        // and check if it is equal to zero than increment and check
-+        // if it is equal to node_size.
-+        static constexpr int64_t last_chunk = 1;
-+        cmp(reg_curr_chunk, last_chunk);
-+    }
-+
-+    void zero_dst_memory(const int bytes_to_zeroing) {
-+        static constexpr int num_of_bytes_in_xmm = 128 / 8;
-+
-+        const int xmms_to_zeroing
-+                = std::div(bytes_to_zeroing, num_of_bytes_in_xmm).quot;
-+        const int tail_to_zeroing
-+                = std::div(bytes_to_zeroing, num_of_bytes_in_xmm).rem;
-+
-+        movi(xmm_tmp_, 0);
-+
-+        if (xmms_to_zeroing > 0) {
-+            Label loop;
-+
-+            mov(reg_tmp_, xmms_to_zeroing);
-+            L(loop);
-+            str(QReg(xmm_tmp_.getIdx()), ptr(o_addr(0)));
-+            add_imm(reg_off_out_, reg_off_out_, num_of_bytes_in_xmm, X_TMP_0);
-+            add_imm(x_ptr_out_off, x_ptr_out_off, num_of_bytes_in_xmm, X_TMP_0);
-+            subs(reg_tmp_, reg_tmp_, 1);
-+            mov(X_TMP_0, 32);
-+            b(NE, loop);
-+        }
-+
-+        if (tail_to_zeroing) mov_imm(W_TMP_0, 0);
-+
-+        for (int i = 0; i < tail_to_zeroing; i++)
-+            strb(W_TMP_0, ptr(o_addr(i, false)));
-+
-+        // Restore dst offset to initial value
-+        if (xmms_to_zeroing > 0) {
-+            sub_imm(reg_off_out_, reg_off_out_,
-+                    num_of_bytes_in_xmm * xmms_to_zeroing, X_TMP_0);
-+            sub_imm(x_ptr_out_off, x_ptr_out_off,
-+                    num_of_bytes_in_xmm * xmms_to_zeroing, X_TMP_0);
-+        }
-+    }
-+
-+    void finalize_tail_loop(int i_step, int o_step, int s_step, int c_step,
-+            const int curr_node_id) {
-+        static constexpr int empty_chunk_info = -1;
-+
-+        mov(reg_tmp_, empty_chunk_info);
-+        str(reg_tmp_, ptr(data_chunk_addr(curr_node_id)));
-+
-+        const int padded_area = prb_.nodes[curr_node_id].n
-+                - prb_.nodes[curr_node_id].tail_size;
-+
-+        if (prb_.nodes[curr_node_id].is_zero_pad_needed) {
-+            int num_of_zero_padded_values = padded_area;
-+            for (int i = curr_node_id - 1; i >= 0; i--) {
-+                num_of_zero_padded_values *= prb_.nodes[i].n;
-+            }
-+
-+            const int bytes_to_zeroing = num_of_zero_padded_values * otype_sz_;
-+            zero_dst_memory(bytes_to_zeroing);
-+        }
-+
-+        // This function is called by loop_end. At the end
-+        // of loop_end is section that is responsible for
-+        // restoring offset values. Restoring is based on
-+        // len value which is equal to prb.nodes[x].n.
-+        // If fill_zero_padded_area is called then it means
-+        // offsets were shifted prb.nodes[x].tail_size times.
-+        // Therefore, this function has to shift offsets by
-+        // zero pad area.
-+        add_imm(reg_off_in_, reg_off_in_, padded_area * i_step * itype_sz_,
-+                X_TMP_0);
-+        add_imm(reg_off_out_, reg_off_out_, padded_area * o_step * otype_sz_,
-+                X_TMP_0);
-+        add_imm(x_ptr_in_off, x_ptr_in_off, padded_area * i_step * itype_sz_,
-+                X_TMP_0);
-+        add_imm(x_ptr_out_off, x_ptr_out_off, padded_area * o_step * otype_sz_,
-+                X_TMP_0);
-+        if (prb_.scale_type == scale_type_t::MANY) {
-+            add_imm(reg_off_scale_, reg_off_scale_,
-+                    padded_area * s_step * stype_sz_, X_TMP_0);
-+            add_imm(x_ptr_scale_off, x_ptr_scale_off,
-+                    padded_area * s_step * stype_sz_, X_TMP_0);
-+        }
-+        if (compensation_needed_) {
-+            add_imm(reg_off_comp_, reg_off_comp_,
-+                    padded_area * c_step * sizeof(int32_t), X_TMP_0);
-+            add_imm(x_ptr_comp_off, x_ptr_comp_off,
-+                    padded_area * c_step * sizeof(int32_t), X_TMP_0);
-+        }
-+    }
-+
-     void loop_end(Label &l, XReg reg_cnt, int len, int i_step, int o_step,
--            int s_step) {
--        add_imm(reg_off_in, reg_off_in, i_step * itype_sz, X_TMP_0);
--        add_imm(reg_off_out, reg_off_out, o_step * otype_sz, X_TMP_0);
--        add_imm(x_ptr_in_off, x_ptr_in_off, i_step * itype_sz, X_TMP_0);
--        add_imm(x_ptr_out_off, x_ptr_out_off, o_step * otype_sz, X_TMP_0);
-+            int s_step, int c_step, const int curr_node_id) {
-+        add_imm(reg_off_in_, reg_off_in_, i_step * itype_sz_, X_TMP_0);
-+        add_imm(reg_off_out_, reg_off_out_, o_step * otype_sz_, X_TMP_0);
-+        add_imm(x_ptr_in_off, x_ptr_in_off, i_step * itype_sz_, X_TMP_0);
-+        add_imm(x_ptr_out_off, x_ptr_out_off, o_step * otype_sz_, X_TMP_0);
- 
-         if (prb_.scale_type == scale_type_t::MANY) {
--            add_imm(reg_off_scale, reg_off_scale, s_step * stype_sz, X_TMP_0);
--            add_imm(x_ptr_scale_off, x_ptr_scale_off, s_step * stype_sz,
-+            add_imm(reg_off_scale_, reg_off_scale_, s_step * stype_sz_,
-+                    X_TMP_0);
-+            add_imm(x_ptr_scale_off, x_ptr_scale_off, s_step * stype_sz_,
-                     X_TMP_0);
-         }
-+
-+        if (compensation_needed_) {
-+            add_imm(reg_off_comp_, reg_off_comp_, c_step * sizeof(int32_t),
-+                    X_TMP_0);
-+            add_imm(x_ptr_comp_off, x_ptr_comp_off, c_step * sizeof(int32_t),
-+                    X_TMP_0);
-+        }
-+
-         subs(reg_cnt, reg_cnt, 1);
-         b(NE, l);
- 
--        sub_imm(reg_off_in, reg_off_in, len * i_step * itype_sz, X_TMP_0);
--        sub_imm(reg_off_out, reg_off_out, len * o_step * otype_sz, X_TMP_0);
--        sub_imm(x_ptr_in_off, x_ptr_in_off, len * i_step * itype_sz, X_TMP_0);
--        sub_imm(x_ptr_out_off, x_ptr_out_off, len * o_step * otype_sz, X_TMP_0);
-+        if (prb_.tail(curr_node_id) != 0) {
-+            Label if_end;
-+
-+            // On the stack should be an information if node
-+            // was processed with tail or not.
-+            ldr(reg_tmp_, post_ptr(X_SP, reg_tmp_.getBit() / 8));
-+
-+            cmp(reg_tmp_, with_tail_info_);
-+            b(NE, if_end);
-+            finalize_tail_loop(i_step, o_step, s_step, c_step, curr_node_id);
-+            L(if_end);
-+        }
-+
-+        // Restore offset to initial values. It means before
-+        // loop execution.
-+        sub_imm(reg_off_in_, reg_off_in_, len * i_step * itype_sz_, X_TMP_0);
-+        sub_imm(reg_off_out_, reg_off_out_, len * o_step * otype_sz_, X_TMP_0);
-+        sub_imm(x_ptr_in_off, x_ptr_in_off, len * i_step * itype_sz_, X_TMP_0);
-+        sub_imm(x_ptr_out_off, x_ptr_out_off, len * o_step * otype_sz_,
-+                X_TMP_0);
- 
-         if (prb_.scale_type == scale_type_t::MANY) {
--            sub_imm(reg_off_scale, reg_off_scale, len * s_step * stype_sz,
-+            sub_imm(reg_off_scale_, reg_off_scale_, len * s_step * stype_sz_,
-                     X_TMP_0);
--            sub_imm(x_ptr_scale_off, x_ptr_scale_off, len * s_step * stype_sz,
-+            sub_imm(x_ptr_scale_off, x_ptr_scale_off, len * s_step * stype_sz_,
-                     X_TMP_0);
-         }
-+        if (compensation_needed_) {
-+            sub_imm(reg_off_comp_, reg_off_comp_,
-+                    len * c_step * sizeof(int32_t), X_TMP_0);
-+            sub_imm(x_ptr_comp_off, x_ptr_comp_off,
-+                    len * c_step * sizeof(int32_t), X_TMP_0);
-+        }
-     }
- 
--    void compute_blk_ker(const int len_unroll) {
-+    void compute_blk_ker(const simple_impl_desc_t &desc) {
-+        static constexpr bool with_tail_processing = true;
-+        Label no_last_chunk, end_label;
-         int omp_ndims = prb_.full_ndims - prb_.ndims;
--        Label no_last_blk, end_label;
- 
--        if (prb_.ip_tail > 0 && prb_.op_tail == 0) {
--            if (omp_ndims == 0) {
--                cmp(reg_last_loop_cnt, 1);
--                bne(no_last_blk);
--                compute_ker(omp_ndims, len_unroll, true);
--            } else {
--                cmp(reg_blk_chunks, blk_cnt());
--                bne(no_last_blk);
--                compute_ker(omp_ndims, len_unroll, true);
-+        if (prb_.nodes[0].tail_size > 0) {
-+            if (!prb_.nodes[0].is_parent_empty()) {
-+                const int parent_node_id = prb_.nodes[0].parent_node_id;
-+                ldr(reg_tmp_, ptr(data_chunk_addr(parent_node_id)));
-+                check_if_this_is_last_chunk(reg_tmp_, parent_node_id);
-+                b(NE, no_last_chunk);
-             }
-+
-+            const int len_unroll = desc.tail_len_unroll > 0
-+                    ? desc.tail_len_unroll
-+                    : desc.len_unroll;
-+            compute_ker(omp_ndims, len_unroll, with_tail_processing);
-             b(end_label);
-         }
- 
--        L(no_last_blk);
--        compute_ker(omp_ndims, len_unroll, false);
-+        L(no_last_chunk);
-+        compute_ker(omp_ndims, desc.len_unroll, !with_tail_processing);
-         L(end_label);
-     }
- 
-+    void create_loops(const simple_impl_desc_t &desc,
-+            const std::array<const XReg, 3> &reg_cnt, int jit_loop) {
-+        assert(jit_loop <= ndims_jit_loop_max);
-+
-+        if (jit_loop > 0) {
-+            const int nfu = desc.ndims_full_unroll;
-+            const int unroll_factor
-+                    = jit_loop == 1 ? desc.len_last_dim_unroll : 1;
-+            const int curr_node_id = nfu + (jit_loop - 1);
-+            const int parent_node_id = prb_.nodes[curr_node_id].parent_node_id;
-+            const int tail_size = prb_.tail(curr_node_id) / unroll_factor;
-+            const int node_size = prb_.n(curr_node_id) / unroll_factor;
-+            const XReg reg_loop_cnt = reg_cnt[jit_loop - 1];
-+            const bool curr_node_has_tail = prb_.tail(curr_node_id) != 0;
-+            Label loop, if_no_tail, if_end;
-+
-+            if (curr_node_has_tail) {
-+                const size_t reg_bytes = reg_tmp_.getBit() / 8;
-+                if (prb_.nodes[curr_node_id].is_parent_empty()) {
-+                    mov(reg_loop_cnt, tail_size);
-+                    // Put info that node is being processed with tail.
-+                    mov(reg_tmp_, with_tail_info_);
-+                    str(reg_tmp_, pre_ptr(X_SP, -reg_bytes));
-+                } else {
-+                    ldr(reg_tmp_, ptr(data_chunk_addr(parent_node_id)));
-+                    check_if_this_is_last_chunk(reg_tmp_, parent_node_id);
-+                    b(NE, if_no_tail);
-+                    mov(reg_loop_cnt, tail_size);
-+                    // Put info that node is being processed with tail.
-+                    mov(reg_tmp_, with_tail_info_);
-+                    str(reg_tmp_, pre_ptr(X_SP, -reg_bytes));
-+                    b(if_end);
-+
-+                    L(if_no_tail);
-+                    mov(reg_loop_cnt, node_size);
-+                    // Put info that node is being processed without tail.
-+                    mov(reg_tmp_, without_tail_info_);
-+                    str(reg_tmp_, pre_ptr(X_SP, -reg_bytes));
-+                    L(if_end);
-+                }
-+            }
-+
-+            if (prb_.is_tail_in_one_of_child_nodes(curr_node_id)) {
-+                if (!curr_node_has_tail) {
-+                    mov(reg_loop_cnt, node_size);
-+                    str(reg_loop_cnt, ptr(data_chunk_addr(curr_node_id)));
-+                }
-+                L(loop);
-+                if (!prb_.nodes[curr_node_id].is_parent_empty()) {
-+                    Label if_no_tail_in_child_node;
-+                    ldr(reg_tmp_, ptr(data_chunk_addr(parent_node_id)));
-+                    check_if_this_is_last_chunk(reg_tmp_, parent_node_id);
-+                    b(NE, if_no_tail_in_child_node);
-+                    str(reg_loop_cnt, ptr(data_chunk_addr(curr_node_id)));
-+                    L(if_no_tail_in_child_node);
-+                } else {
-+                    str(reg_loop_cnt, ptr(data_chunk_addr(curr_node_id)));
-+                }
-+            } else if (curr_node_has_tail) {
-+                L(loop);
-+            } else {
-+                loop_begin(loop, reg_loop_cnt, node_size);
-+            }
-+            create_loops(desc, reg_cnt, jit_loop - 1);
-+
-+            loop_end(loop, reg_loop_cnt, node_size,
-+                    prb_.is(curr_node_id) * unroll_factor,
-+                    prb_.os(curr_node_id) * unroll_factor,
-+                    prb_.ss(curr_node_id) * unroll_factor,
-+                    prb_.cs(curr_node_id) * unroll_factor, curr_node_id);
-+        } else {
-+            compute_blk_ker(desc);
-+        }
-+    }
-+
-     bool simple_impl() {
-         simple_impl_desc_t d;
-         if (!simple_impl_desc_init(prb_, &d)) return false;
- 
--        const int nfu = d.ndims_full_unroll;
--        const int ldu = d.len_last_dim_unroll;
--        const int n_jit_loops = prb_.ndims - d.ndims_full_unroll;
--        assert(n_jit_loops <= ndims_jit_loop_max);
--
--        eor(reg_off_in, reg_off_in, reg_off_in);
--        eor(reg_off_out, reg_off_out, reg_off_out);
--        mov(x_ptr_in_off, XReg(reg_ptr_in.getIdx()));
--        mov(x_ptr_out_off, XReg(reg_ptr_out.getIdx()));
-+        eor(reg_off_in_, reg_off_in_, reg_off_in_);
-+        eor(reg_off_out_, reg_off_out_, reg_off_out_);
-+        mov(x_ptr_in_off, reg_ptr_in_);
-+        mov(x_ptr_out_off, reg_ptr_out_);
-         if (prb_.scale_type == scale_type_t::MANY) {
--            eor(reg_off_scale, reg_off_scale, reg_off_scale);
--            mov(x_ptr_scale_off, XReg(reg_ptr_scale.getIdx()));
-+            mov(reg_off_scale_, 0);
-+            mov(x_ptr_scale_off, reg_ptr_scale_);
-+        }
-+        if (compensation_needed_) {
-+            eor(reg_off_comp_, reg_off_comp_, reg_off_comp_);
-+            mov(x_ptr_comp_off, reg_off_comp_);
-         }
- 
--        Label l_loop[3];
--        XReg reg_cnt[3] = {x15, x14, x13};
--
--        if (n_jit_loops > 2) loop_begin(l_loop[2], reg_cnt[2], n(nfu + 2));
--
--        if (n_jit_loops > 1) loop_begin(l_loop[1], reg_cnt[1], n(nfu + 1));
--
--        if (n_jit_loops > 0)
--            loop_begin(l_loop[0], reg_cnt[0], n(nfu + 0) / ldu);
--
--        compute_blk_ker(d.len_unroll);
--
--        if (n_jit_loops > 0)
--            loop_end(l_loop[0], reg_cnt[0], n(nfu + 0) / ldu, is(nfu + 0) * ldu,
--                    os(nfu + 0) * ldu, ss(nfu + 0) * ldu);
--
--        if (n_jit_loops > 1)
--            loop_end(l_loop[1], reg_cnt[1], n(nfu + 1), is(nfu + 1),
--                    os(nfu + 1), ss(nfu + 1));
-+        std::array<const XReg, 3> reg_cnt({{x15, x14, x13}});
- 
--        if (n_jit_loops > 2)
--            loop_end(l_loop[2], reg_cnt[2], n(nfu + 2), is(nfu + 2),
--                    os(nfu + 2), ss(nfu + 2));
-+        const int n_jit_loops = prb_.ndims - d.ndims_full_unroll;
-+        create_loops(d, reg_cnt, n_jit_loops);
- 
-         return true;
-     }
-@@ -1156,7 +1577,7 @@ struct jit_uni_reorder_kernel_f32_t : public kernel_t, public jit_generator {
-         inst(__VA_ARGS__);
- 
-     void cvt_z_s32_f32(const size_t startIdx, const size_t regNum) {
--        UNROLL_INST(scvtf, ZRegS, tmp, p_all / T_m, tmp);
-+        UNROLL_INST(scvtf, ZRegS, tmp, P_ALL_ONE / T_m, tmp);
-     }
- 
-     void cvt_v_s32_f32(const size_t startIdx, const size_t regNum) {
-@@ -1164,8 +1585,8 @@ struct jit_uni_reorder_kernel_f32_t : public kernel_t, public jit_generator {
-     }
- 
-     void cvt_z_f32_s32(const size_t startIdx, const size_t regNum) {
--        UNROLL_INST(frinti, ZRegS, tmp, p_all / T_m, tmp);
--        UNROLL_INST(fcvtzs, ZRegS, tmp, p_all / T_m, tmp);
-+        UNROLL_INST(frinti, ZRegS, tmp, P_ALL_ONE / T_m, tmp);
-+        UNROLL_INST(fcvtzs, ZRegS, tmp, P_ALL_ONE / T_m, tmp);
-     }
- 
-     void cvt_v_f32_s32(const size_t startIdx, const size_t regNum) {
-@@ -1175,7 +1596,7 @@ struct jit_uni_reorder_kernel_f32_t : public kernel_t, public jit_generator {
- 
-     void cvt_z_s8_s32(const size_t startIdx, const size_t regNum) {
-         cvt_z_b_s(startIdx, regNum);
--        UNROLL_INST(sxtb, ZRegS, tmp, p_all / T_m, tmp);
-+        UNROLL_INST(sxtb, ZRegS, tmp, P_ALL_ONE / T_m, tmp);
-     }
- 
-     void cvt_v_s8_s32(const size_t startIdx, const size_t regNum) {
-@@ -1214,7 +1635,7 @@ struct jit_uni_reorder_kernel_f32_t : public kernel_t, public jit_generator {
- 
-     void cvt_z_u8_s32(const size_t startIdx, const size_t regNum) {
-         cvt_z_b_s(startIdx, regNum);
--        UNROLL_INST(uxtb, ZRegS, tmp, p_all / T_m, tmp);
-+        UNROLL_INST(uxtb, ZRegS, tmp, P_ALL_ONE / T_m, tmp);
-     }
- 
-     void cvt_v_u8_s32(const size_t startIdx, const size_t regNum) {
-@@ -1285,7 +1706,7 @@ struct jit_uni_reorder_kernel_f32_t : public kernel_t, public jit_generator {
- 
-         dupm(z_tmp7.s, 255);
-         UNROLL_INST2(smax, ZRegS(i), 0);
--        UNROLL_INST2(smin, ZRegS(i), p_all / T_m, z_tmp7.s);
-+        UNROLL_INST2(smin, ZRegS(i), P_ALL_ONE / T_m, z_tmp7.s);
-         UNROLL_INST(uzp1, ZRegH, tmp, tmp, tmp);
-         UNROLL_INST(uzp1, ZRegB, tmp, tmp, tmp);
-         UNROLL_INST2(mov, ZRegB(i), P_NOT_128 / T_m, 0);
-@@ -1320,107 +1741,514 @@ struct jit_uni_reorder_kernel_f32_t : public kernel_t, public jit_generator {
- #undef UNROLL_INST
- #undef UNROLL_INST
- 
--    jit_uni_reorder_kernel_f32_t(const desc_t &desc) : kernel_t(desc) {
--        itype_sz = data_type_size(prb_.itype);
--        otype_sz = data_type_size(prb_.otype);
--        stype_sz = sizeof(float);
-+    jit_uni_reorder_kernel_f32_t(const desc_t &desc)
-+        : kernel_t(desc), isa_(get_max_cpu_isa()) {
-+        assert(!utils::one_of(isa_, isa_undef, isa_all));
-+        itype_sz_ = data_type_size(prb_.itype);
-+        otype_sz_ = data_type_size(prb_.otype);
-+        stype_sz_ = sizeof(float);
-     }
- 
-     void generate() override {
-         using namespace Xbyak_aarch64::util;
-         uint64_t sveLen = get_sve_length();
-+        Label end_of_kernel;
- 
-         preamble();
--#define PARAM(x) offsetof(call_param_t, x)
-+
-         if (prb_.scale_type == scale_type_t::COMMON) {
--            add_imm(X_DEFAULT_ADDR, abi_param1, PARAM(scale), X_TMP_1);
-+            add_imm(X_DEFAULT_ADDR, PARAM(scale), X_TMP_1);
-             ldr(X_TMP_0, ptr(X_DEFAULT_ADDR));
--            ldr(W_TMP_1, ptr(X_TMP_0));
--            dup(xmm_scale, W_TMP_1);
-+            ld1r(xmm_scale_, ptr(X_TMP_0));
-         } else if (prb_.scale_type == scale_type_t::MANY) {
--            add_imm(X_DEFAULT_ADDR, abi_param1, PARAM(scale), X_TMP_0);
--            ldr(reg_ptr_scale, ptr(X_DEFAULT_ADDR));
-+            add_imm(X_DEFAULT_ADDR, PARAM(scale), X_TMP_0);
-+            ldr(reg_ptr_scale_, ptr(X_DEFAULT_ADDR));
-         }
--        add_imm(X_TMP_0, abi_param1, PARAM(in), X_TMP_2);
--        add_imm(X_TMP_1, abi_param1, PARAM(out), X_TMP_2);
--        add_imm(reg_blk, abi_param1, PARAM(blk_chunks), reg_blk);
--        ldr(reg_ptr_in, ptr(X_TMP_0));
--        ldr(reg_ptr_out, ptr(X_TMP_1));
--        ldr(reg_blk_chunks, ptr(reg_blk));
--
--#undef PARAM
--        mov_imm(reg_last_loop_cnt, 1);
-+        if (compensation_needed_) {
-+            add_imm(X_DEFAULT_ADDR, PARAM(compensation_scratch), X_TMP_0);
-+            ldr(reg_ptr_comp_, ptr(X_DEFAULT_ADDR));
-+        }
-+        if (prb_.scale_adjust == 0.5f) { mov(reg_scale_adjust_, 0x3f000000); }
-+        add_imm(X_TMP_0, PARAM(in), X_TMP_2);
-+        add_imm(X_TMP_1, PARAM(out), X_TMP_2);
-+        ldr(reg_ptr_in_, ptr(X_TMP_0));
-+        ldr(reg_ptr_out_, ptr(X_TMP_1));
- 
--        mov(x_ptr_in_off, XReg(reg_ptr_in.getIdx()));
--        mov(x_ptr_out_off, XReg(reg_ptr_out.getIdx()));
--        mov(x_ptr_scale_off, XReg(reg_ptr_scale.getIdx()));
-+        mov(x_ptr_in_off, reg_ptr_in_);
-+        mov(x_ptr_out_off, reg_ptr_out_);
-+        mov(x_ptr_scale_off, reg_ptr_scale_);
-+        mov(x_ptr_comp_off, reg_ptr_comp_);
- 
-         if (sveLen) { /* SVE is available. */
-             ptrue(p_lsb_256.b, VL32);
--            ptrue(p_all.b);
-+            ptrue(p_lsb_128.b, VL16);
-+            ptrue(p_lsb_64.b, VL8);
-         }
- 
--        if (can_do_tr8x8()) {
--            dup(ymm_zero, 0);
--
--            if (prb_.itype == data_type::u8 && prb_.otype == data_type::s8) {
--                mov_imm(reg_tmp, 0x7f7f7f7f7f7f7f7f);
--                mov(VReg4S(ymm_8x127b.getIdx())[0], WReg(reg_tmp.getIdx()));
-+        bool is_tail_in_drv_dims = false;
-+        for (int i = prb_.ndims; i < prb_.full_ndims; i++)
-+            if (prb_.nodes[i].tail_size > 0) {
-+                is_tail_in_drv_dims = true;
-+                break;
-             }
--        } else if (mayiuse(sve_512)) {
--            movi(xmm_zero, 0);
- 
--            if (prb_.itype == data_type::u8 && prb_.otype == data_type::s8) {
--                mov(WReg(reg_tmp.getIdx()), 0x7f7f7f7f);
--                mov(xmm_4x127b[0], WReg(reg_tmp.getIdx()));
-+        if (is_tail_in_drv_dims) {
-+            Label reorder_kernel;
-+            add_imm(X_DEFAULT_ADDR, TAIL_PARAM(skip_kernel_execution), X_TMP_0);
-+            ldr(reg_tmp_, ptr(X_DEFAULT_ADDR));
-+            cmp(reg_tmp_, static_cast<int64_t>(true));
-+            b(EQ, end_of_kernel);
-+
-+            add_imm(X_DEFAULT_ADDR, TAIL_PARAM(zeroing_data), X_TMP_0);
-+            ldr(reg_tmp_, ptr(X_DEFAULT_ADDR));
-+            cmp(reg_tmp_, static_cast<int64_t>(false));
-+            b(EQ, reorder_kernel);
-+            // If zeroing data is set then all dst memory
-+            // will be zeroed and nothing more will be done.
-+            int bytes_to_zeroing = otype_sz_;
-+            for (int i = 0; i < prb_.ndims; i++) {
-+                bytes_to_zeroing *= prb_.nodes[i].n;
-             }
-+            eor(reg_off_out_, reg_off_out_, reg_off_out_);
-+            mov(x_ptr_out_off, reg_ptr_out_);
-+            zero_dst_memory(bytes_to_zeroing);
-+            b(end_of_kernel);
-+            L(reorder_kernel);
-+        }
-+
-+        if (can_do_tr8x8()) {
-+            dup(ymm_zero_, 0);
-+        } else {
-+            movi(xmm_zero_, 0);
-         }
- 
-         impl();
-+
-+        L(end_of_kernel);
-         postamble();
-     }
- 
-+    ~jit_uni_reorder_kernel_f32_t() override = default;
-+
-+#undef TAIL_PARAM
-+#undef PARAM
-+
- private:
--    int itype_sz;
--    int otype_sz;
--    int stype_sz;
-+    static constexpr int64_t with_tail_info_ = static_cast<int64_t>(true);
-+    static constexpr int64_t without_tail_info_ = static_cast<int64_t>(false);
-+
-+    int itype_sz_;
-+    int otype_sz_;
-+    int stype_sz_;
- 
--    XReg reg_ptr_in = x6;
--    XReg reg_ptr_out = x2;
--    XReg reg_ptr_scale = abi_not_param1;
-+    const cpu_isa_t isa_;
- 
--    XReg reg_off_in = x8;
--    XReg reg_off_out = x9;
--    XReg reg_off_scale = x10;
-+    const XReg reg_ptr_in_ = x6;
-+    const XReg reg_ptr_out_ = x2;
-+    const XReg reg_ptr_scale_ = abi_not_param1;
-+    const XReg reg_ptr_comp_ = x3;
-+    const WReg &reg_scale_adjust_ = w5;
- 
--    XReg reg_blk = x11;
--    XReg reg_blk_chunks = x12;
--    XReg reg_last_loop_cnt = x11;
-+    const XReg reg_off_in_ = x8;
-+    const XReg reg_off_out_ = x9;
-+    const XReg reg_off_scale_ = x10;
-+    const XReg reg_off_comp_ = x11;
- 
--    XReg reg_tmp = x0;
-+    XReg reg_tmp_ = x12;
- 
--    VReg4S xmm_scale = v15.s;
--    VReg4S xmm_zero = v14.s;
--    VReg4S xmm_4x127b = v13.s; // TODO: unite with ymm_zero
--    ZRegS ymm_zero = z14.s;
--    ZRegS ymm_8x127b = z13.s;
--    VReg4S xmm_tmp = v12.s;
--    VReg4S xmm_saturation_ubound = v12.s;
--    ZRegS ymm_saturation_ubound = z12.s;
-+    VReg4S xmm_scale_ = v15.s;
-+    VReg4S xmm_zero_ = v14.s;
-+    ZRegS ymm_zero_ = z14.s;
-+    VReg4S xmm_tmp_ = v12.s;
-+    const VReg4S xmm_src_zp_ = v9.s;
-+    const VReg4S xmm_dst_zp_ = v11.s;
-+    VReg4S xmm_saturation_ubound_ = v12.s;
-+    ZRegS ymm_saturation_ubound_ = z12.s;
- 
-     /* Note: x22 - x28 are already used as temporal registgers
-        in jit_generator.hpp.
--       x_ptr_(in|out|scale)_off keeps (base + offset) address. */
-+       x_ptr_(in|out|scale|comp)_off keeps (base + offset) address. */
-     XReg x_ptr_in_off = x16;
-     XReg x_ptr_out_off = x18;
-     XReg x_ptr_scale_off = x20;
-+    XReg x_ptr_comp_off = x17;
- 
-     /* Caution: Chose predicate registers not used by x64's implementation. */
-     PReg p_lsb_256 = p7;
--    PReg p_all = p6;
-+    PReg p_lsb_128 = p6;
-+    PReg p_lsb_64 = p4;
-     PReg p_tmp0 = p5;
- 
-     const std::vector<uint32_t> tmp_vec_idx = {20, 21, 22, 23, 24, 25, 26, 27};
-+    VReg v_tmp0 = v20;
-+    ZReg z_tmp0 = z20;
-+    ZReg z_tmp1 = z21;
-+    ZReg z_tmp2 = z22;
-+    ZReg z_tmp3 = z23;
-+    ZReg z_tmp4 = z24;
-+    ZReg z_tmp5 = z25;
-+    ZReg z_tmp6 = z26;
-+    ZReg z_tmp7 = z27;
-+    VReg v_tmp7 = v27;
-+
-+    const std::vector<ZReg> z_tmp_vec
-+            = {z_tmp0, z_tmp1, z_tmp2, z_tmp3, z_tmp4, z_tmp5, z_tmp6, z_tmp7};
-+    constexpr static int z_tmp_vec_size = 8;
-+};
-+
-+// Seperate class for no unroll/threading burden
-+struct jit_single_blk_kernel_t : public jit_generator {
-+    DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_single_blk_kernel)
-+    static bool applicable(const prb_t &p) {
-+        using namespace data_type;
-+
-+        bool ok = p.ndims >= 2 && mayiuse(sve_256)
-+                && p.scale_type == scale_type_t::NONE
-+                && utils::one_of(p.itype, f32) && utils::one_of(p.otype, f32)
-+                && utils::everyone_is(0, p.ioff, p.ooff) && p.beta == 0.f
-+                && prb_has_small_strides(p);
-+        if (!ok) return false;
-+
-+        int64_t n0 = p.nodes[0].n;
-+        auto i0 = p.nodes[0].is;
-+        auto o0 = p.nodes[0].os;
-+        int64_t n1 = p.nodes[1].n;
-+        auto i1 = p.nodes[1].is;
-+        auto o1 = p.nodes[1].os;
-+
-+        /*
-+         * for a transpose of plain to 8c case, nodes would be like:
-+         *     n    is   os
-+         *     m    1    8
-+         *     8    m    1
-+         * or
-+         *     8    m    1
-+         *     m    1    8
-+         */
-+        ok = (utils::one_of(n0, 8, 16) || utils::one_of(n1, 8, 16))
-+                && ((i0 == 1 && o1 == 1 && n0 == i1 && o0 == n1)
-+                        || (o0 == 1 && i1 == 1 && n0 == o1 && i0 == n1));
-+        if (!ok) return false;
-+
-+        // Do not handle transpose of dimensions other than last 2
-+        for (int i = 2; i < p.ndims; ++i) {
-+            if (p.nodes[i].is != p.nodes[i].os) {
-+                ok = false;
-+                break;
-+            }
-+        }
-+
-+        return ok;
-+    }
-+
-+    jit_single_blk_kernel_t(const tr::prb_t &prb)
-+        : jit_generator()
-+        , prb_(prb)
-+        , itype_sz_(data_type_size(prb_.itype))
-+        , otype_sz_(data_type_size(prb_.otype))
-+        , block_sz(prb.nodes[0].n) {}
-+
-+    void generate() override {
-+        auto input_stride
-+                = prb_.nodes[0].is != 1 ? prb_.nodes[0].is : prb_.nodes[1].is;
-+        auto output_stride
-+                = prb_.nodes[0].os != 1 ? prb_.nodes[0].os : prb_.nodes[1].os;
-+
-+        Label tail_processing;
-+
-+        const auto load_zp = [&](const ZRegS ymm_zp, const XReg reg_zp) {
-+            dup(ymm_zp, WReg(reg_zp.getIdx()));
-+            scvtf(ymm_zp, P_ALL_ONE / T_m, ymm_zp);
-+        };
-+
-+        preamble();
-+
-+        if (prb_.req_src_zp) load_zp(ymm_src_zp, reg_src_zp);
-+
-+        if (prb_.req_dst_zp) load_zp(ymm_dst_zp, reg_dst_zp);
-+
-+        cmp(reg_ptr_tail, true);
-+        b(EQ, tail_processing);
-+
-+        if (block_sz == 8) {
-+            gen_ker8x8(0, 0, input_stride, output_stride, 8, 8);
-+            block_sz = 8;
-+        } else if (block_sz == 16) {
-+            gen_ker16x16_in_8x8(input_stride, output_stride);
-+            block_sz = 16;
-+        } else {
-+            assert(!"unimplemented");
-+        }
-+
-+        postamble();
-+
-+        L(tail_processing);
-+
-+        if (block_sz == 8) {
-+            auto i_tail = input_stride % 8 != 0 ? input_stride % 8 : 8;
-+            auto o_tail = output_stride % 8 != 0 ? output_stride % 8 : 8;
-+            if (i_tail != o_tail) {
-+                auto t_mask = i_tail == 8 ? o_tail : i_tail;
-+                gen_setmask(t_mask);
-+                gen_ker8x8(0, 0, input_stride, output_stride, i_tail, o_tail);
-+            }
-+        } else if (block_sz == 16) {
-+            auto i_tail = input_stride % 16 != 0 ? input_stride % 16 : 16;
-+            auto o_tail = output_stride % 16 != 0 ? output_stride % 16 : 16;
-+            if (i_tail != o_tail) {
-+                auto t_mask = i_tail == 16 ? o_tail : i_tail;
-+                t_mask %= 8;
-+                if (t_mask != 0) gen_setmask(t_mask);
-+                gen_ker16x16_in_8x8(
-+                        input_stride, output_stride, i_tail, o_tail);
-+            }
-+        } else {
-+            assert(!"unimplemented");
-+        }
-+
-+        postamble();
-+    }
-+
-+    void gen_loadu(const ZRegS ymm, const XReg &addr, int size) {
-+        QReg xmm(ymm.getIdx());
-+        switch (size) {
-+            case 32: ld1w(ymm, p_lsb_256 / T_z, ptr(addr)); break;
-+            case 16: ldr(xmm, ptr(addr)); break;
-+            default: assert(!"unreachable");
-+        }
-+    }
-+
-+    void gen_storeu(const XReg &addr, const ZRegS ymm, int size) {
-+        QReg xmm(ymm.getIdx());
-+        switch (size) {
-+            case 32: st1w(ymm, p_lsb_256, ptr(addr)); break;
-+            case 16: str(xmm, ptr(addr)); break;
-+            default: assert(!"unreachable");
-+        }
-+    }
-+
-+    void gen_maskloadu(
-+            const ZRegS ymm, const XReg &addr, const PReg mask, int size) {
-+        switch (size) {
-+            case 32:
-+            case 16: ld1w(ymm, mask / T_z, ptr(addr)); break;
-+            default: assert(!"unreachable");
-+        }
-+    }
-+
-+    void gen_maskstoreu(
-+            const XReg &addr, const ZRegS ymm, const PReg mask, int size) {
-+        switch (size) {
-+            case 32:
-+            case 16: st1w(ymm, mask, ptr(addr)); break;
-+            default: assert(!"unreachable");
-+        }
-+    }
-+
-+    // Register allocation xmm0~11
-+    void gen_transpose_8x8() {
-+        const uint64_t sveLen = get_sve_length();
-+        constexpr int lane = 8;
-+
-+#if 0
-+        /* Debug code
-+	   z0:   7,  6,  5,  4,  3,  2,  1,  0
-+	   z1:  15, 14, 13, 12, 11, 10,  9,  8
-+	   ...
-+	   z17: 63, 62, 61, 60, 59, 58, 57, 56
-+	*/
-+	ptrue(P_ALL_ONE.b);
-+	ptrue(P_TMP.s, VL8);
-+	not_(P_TMP.b, P_ALL_ONE/T_z, P_TMP.b);
-+        index(z0.s, 0, 1);
-+        mov(z0.s, P_TMP/T_m, 0);
-+        mov(z_tmp_vec[0].s, 8);
-+        mov(z_tmp_vec[0].s, P_TMP/T_m, 0);
-+        for(uint32_t i=1; i<lane; i++)
-+          add(ZRegS{i}, ZRegS{i-1}, z_tmp_vec[0].s);
-+#endif
-+
-+        ptrue(P_TMP.s, VL4);
-+
-+        /* 1st turn */
-+        for (uint32_t i = 0; i < lane / 2; i++) {
-+            trn1(z_tmp_vec[i].s, ZRegS {2 * i}, ZRegS {2 * i + 1});
-+            trn2(z_tmp_vec[lane / 2 + i].s, ZRegS {2 * i}, ZRegS {2 * i + 1});
-+        }
-+
-+        /* 2nd turn */
-+        trn1(z4.d, z_tmp_vec[0].d, z_tmp_vec[1].d);
-+        trn1(z5.d, z_tmp_vec[4].d, z_tmp_vec[5].d);
-+        trn2(z6.d, z_tmp_vec[0].d, z_tmp_vec[1].d);
-+        trn2(z7.d, z_tmp_vec[4].d, z_tmp_vec[5].d);
-+        trn1(z_tmp_vec[0].d, z_tmp_vec[2].d, z_tmp_vec[3].d);
-+        trn1(z_tmp_vec[1].d, z_tmp_vec[6].d, z_tmp_vec[7].d);
-+        trn2(z_tmp_vec[2].d, z_tmp_vec[2].d, z_tmp_vec[3].d);
-+        trn2(z_tmp_vec[3].d, z_tmp_vec[6].d, z_tmp_vec[7].d);
-+
-+        /* 3rd turn */
-+        for (uint32_t i = 0; i < lane / 2; i++) {
-+            mov(ZRegD {i}, ZRegD {lane / 2 + i});
-+            mov(z_tmp_vec[lane / 2 + i].d, z_tmp_vec[i].d);
-+        }
-+
-+        /* 4th turn */
-+        for (uint32_t i = 0; i < lane / 2; i++) {
-+            ZRegB z {lane / 2 + i};
-+            ZRegB z_tmp = z_tmp_vec[lane / 2 + i].b;
-+            /* Move bit 0-127 to 128-255. */
-+            ext(z, z, 16);
-+            /* Move bit 128-255 to 0-127. */
-+            ext(z_tmp, z_tmp, sveLen - 16);
-+        }
-+
-+        /* 5th turn */
-+        for (uint32_t i = 0; i < lane / 2; i++) {
-+            ZRegS z0 {i};
-+            ZRegS z1 {lane / 2 + i};
-+            sel(z0, P_TMP, z0, z_tmp_vec[lane / 2 + i].s);
-+            sel(z1, P_TMP, z1, z_tmp_vec[i].s);
-+        }
-+    }
-+
-+    // keep order nchw -> nChw()C
-+    // or nChw()C -> nchw
-+    void gen_setmask(int mask) {
-+        mov_imm(x_tmp_0, 0);
-+        mov_imm(x_tmp_1, mask);
-+        whilelt(p_mask.s, x_tmp_0, x_tmp_1);
-+    }
-+
-+    // TODO: Mark parameter with type information
-+    // XXX: !
-+    // offset in byte offset
-+    // stride in element number
-+    //
-+    // Gen specific 8x8 transform respect to certain tail condition
-+    void gen_tr8x8(int i_off, int o_off, int input_stride, int output_stride,
-+            int in_tail, int out_tail) {
-+        constexpr int lane = 8;
-+
-+        if (in_tail == 0 || out_tail == 0) return;
-+
-+        for (int i = 0; i < out_tail; ++i) {
-+            if (in_tail != lane) {
-+                add_imm(x_addr, reg_ptr_in_,
-+                        i_off + i * input_stride * itype_sz_, x_tmp_0);
-+                gen_maskloadu(ZRegS(i), x_addr, p_mask, lane * itype_sz_);
-+            } else {
-+                add_imm(x_addr, reg_ptr_in_,
-+                        i_off + i * input_stride * itype_sz_, x_tmp_0);
-+                gen_loadu(ZRegS(i), x_addr, lane * itype_sz_);
-+            }
-+            if (prb_.req_src_zp) { fsub(ZRegS(i), ZRegS(i), ymm_src_zp); }
-+        }
-+
-+        gen_transpose_8x8();
-+
-+        for (int i = 0; i < in_tail; ++i) {
-+            if (prb_.req_dst_zp) { fadd(ZRegS(i), ZRegS(i), ymm_dst_zp); }
-+            if (out_tail == lane) {
-+                add_imm(x_addr, reg_ptr_out_,
-+                        o_off + i * output_stride * otype_sz_, x_tmp_0);
-+                gen_storeu(x_addr, ZRegS(i), lane * otype_sz_);
-+            } else {
-+                add_imm(x_addr, reg_ptr_out_,
-+                        o_off + i * output_stride * otype_sz_, x_tmp_0);
-+                gen_maskstoreu(x_addr, ZRegS(i), p_mask, lane * otype_sz_);
-+            }
-+        }
-+    }
-+
-+    // tail: 0 ~ 8
-+    // support: either in_tail or out_tail is not 8, but not both
-+    void gen_ker8x8(int i_off, int o_off, int input_stride, int output_stride,
-+            int in_tail, int out_tail) {
-+        gen_tr8x8(i_off, o_off, input_stride, output_stride, in_tail, out_tail);
-+    }
-+
-+    void gen_ker16x16_in_8x8(int input_stride, int output_stride) {
-+        const auto lane = 16;
-+        const auto sub_lane = lane / 2;
-+        gen_tr8x8(0, 0, input_stride, output_stride, sub_lane, sub_lane);
-+        gen_tr8x8(input_stride * sub_lane * itype_sz_, sub_lane * otype_sz_,
-+                input_stride, output_stride, sub_lane, sub_lane);
-+        gen_tr8x8(sub_lane * itype_sz_, output_stride * sub_lane * otype_sz_,
-+                input_stride, output_stride, sub_lane, sub_lane);
-+        gen_tr8x8((input_stride * sub_lane + sub_lane) * itype_sz_,
-+                (output_stride * sub_lane + sub_lane) * otype_sz_, input_stride,
-+                output_stride, sub_lane, sub_lane);
-+    }
-+
-+    // tail can be 1 ~ 16, using avx2 for now
-+    void gen_ker16x16_in_8x8(
-+            int input_stride, int output_stride, int in_tail, int out_tail) {
-+        constexpr auto lane = 16;
-+        constexpr auto sub_lane = lane / 2;
-+        auto tail = in_tail != lane ? in_tail : out_tail;
-+
-+        const auto l_tail = tail < sub_lane ? tail : sub_lane;
-+        const auto u_tail = tail < sub_lane ? 0 : tail - sub_lane;
-+
-+        if (tail == in_tail) {
-+            gen_tr8x8(0, 0, input_stride, output_stride, l_tail, sub_lane);
-+            gen_tr8x8(input_stride * sub_lane * itype_sz_, sub_lane * otype_sz_,
-+                    input_stride, output_stride, l_tail, sub_lane);
-+            gen_tr8x8(sub_lane * itype_sz_,
-+                    output_stride * sub_lane * otype_sz_, input_stride,
-+                    output_stride, u_tail, sub_lane);
-+            gen_tr8x8(itype_sz_ * (input_stride * sub_lane + sub_lane),
-+                    otype_sz_ * (output_stride * sub_lane + sub_lane),
-+                    input_stride, output_stride, u_tail, sub_lane);
-+        } else {
-+            gen_tr8x8(0, 0, input_stride, output_stride, sub_lane, l_tail);
-+            gen_tr8x8(input_stride * sub_lane * itype_sz_, sub_lane * otype_sz_,
-+                    input_stride, output_stride, sub_lane, u_tail);
-+            gen_tr8x8(sub_lane * itype_sz_,
-+                    output_stride * sub_lane * itype_sz_, input_stride,
-+                    output_stride, sub_lane, l_tail);
-+            gen_tr8x8(itype_sz_ * (input_stride * sub_lane + sub_lane),
-+                    otype_sz_ * (output_stride * sub_lane + sub_lane),
-+                    input_stride, output_stride, sub_lane, u_tail);
-+        }
-+    }
-+
-+private:
-+    // 6 ~ 12
-+    constexpr static int xmm_save_for_windows = 0;
-+    constexpr static int xmm_save_start_from = 6;
-+    constexpr static int xmm_width = 16;
-+
-+    void preamble() { ptrue(p_lsb_256.b, VL32); }
-+
-+    void postamble() { ret(); }
-+
-+    const prb_t &prb_;
-+
-+    int itype_sz_;
-+    int otype_sz_;
-+    int block_sz;
-+
-+    XReg reg_ptr_in_ = abi_param1;
-+    XReg reg_ptr_out_ = abi_param2;
-+    XReg reg_ptr_tail = abi_param3;
-+    XReg reg_src_zp = abi_param4;
-+    XReg reg_dst_zp = abi_param5;
-+
-+    XReg x_addr = x10;
-+    XReg x_tmp_0 = x11;
-+    XReg x_tmp_1 = x12;
-+
-+    /* Avoid P_TMP(p7) in jit_generator.hpp. */
-+    PReg p_lsb_256 = p6;
-+    PReg p_mask = p5;
-+
-+    ZRegS ymm_tmp = z0.s;
-+    ZRegS ymm_src_zp = z14.s;
-+    ZRegS ymm_dst_zp = z15.s;
-+
-+    const std::vector<uint32_t> tmp_vec_idx = {20, 21, 22, 23, 24, 25, 26, 27};
-+    VReg v_tmp0 = v20;
-     ZReg z_tmp0 = z20;
-     ZReg z_tmp1 = z21;
-     ZReg z_tmp2 = z22;
-@@ -1472,15 +2300,31 @@ kernel_t *kernel_t::create(const kernel_t::desc_t &desc) {
- 
-     return nullptr;
- }
-+
- } // namespace tr
- 
- static void prb_block_for_cache(tr::prb_t &prb) {
-     /* If strides for 0th and 1st nodes are cache friendly
-      * then one can altogether do away with blocking ! */
--    const bool cache_blocking_needed = false
--            || (prb.nodes[0].is % 64 == 0 && prb.nodes[0].n > 16)
--            || (prb.ndims > 1 && prb.nodes[1].is % 64 == 0
--                    && prb.nodes[1].n > 16);
-+    static constexpr int num_elems_thr = 16;
-+    const bool stride_cache_friendly
-+            = ((prb.nodes[0].is % 64 == 0 && prb.nodes[0].n > num_elems_thr)
-+                      || (prb.ndims > 1 && prb.nodes[1].is % num_elems_thr == 0
-+                              && prb.nodes[1].n > num_elems_thr))
-+            && !prb.is_tail_present;
-+
-+    // performance improvement for shapes with large inner-most dimension
-+    const size_t L1_cache_sz
-+            = size_t(3) * platform::get_per_core_cache_size(1) / 4;
-+    const size_t itype_sz_ = data_type_size(prb.itype);
-+    const size_t inner_block_sz = prb.nodes[0].n * itype_sz_;
-+    const bool requires_inner_blocking = inner_block_sz > L1_cache_sz
-+            // 'is_tail_present' is not supported for cache_blocking when
-+            // asymmetric_comp is executed.
-+            && IMPLICATION(prb.req_asymmetric_comp, !prb.is_tail_present);
-+
-+    const bool cache_blocking_needed
-+            = stride_cache_friendly || requires_inner_blocking;
-     if (!cache_blocking_needed) return;
- 
-     int unit_input_stride_idx = -1;
-@@ -1496,28 +2340,58 @@ static void prb_block_for_cache(tr::prb_t &prb) {
-         const auto output_stride = prb.nodes[unit_input_stride_idx].os;
-         const auto num_elems = prb.nodes[unit_input_stride_idx].n;
- 
--        const bool split_needed = (num_elems > 16) && (num_elems % 16 == 0);
-+        const bool split_needed = (num_elems > num_elems_thr)
-+                && (num_elems % num_elems_thr == 0);
-         const int move_location = (output_stride % 4 != 0) ? 0 : 1;
--        if (split_needed) prb_node_split(prb, unit_input_stride_idx, 16);
-+        if (split_needed)
-+            prb_node_split(prb, unit_input_stride_idx, num_elems_thr);
- 
-         /* Because of cache-unfriendly nature of unit-output stride node, let
-          * us move unit-input stride node on or near front! */
--        prb_node_move(prb, unit_input_stride_idx, move_location);
-+        if (unit_input_stride_idx != move_location)
-+            prb_node_move(prb, unit_input_stride_idx, move_location);
-     }
- 
-     /* Potentially, split the node with os=1 in two and pull in the node with
-      * is=1 between them for better cache reuse:
-      * [n0:is0:1][n1:1:os1] --> [16n0:is0:1][n1:1:os1][n0/16:is0*16:16] */
-     if (prb.ndims >= 2 && prb.nodes[0].os == 1 && prb.nodes[1].is == 1) {
--        const auto input_stride = prb.nodes[0].is;
-         const auto num_elems = prb.nodes[0].n;
- 
--        const bool split_needed = true && (num_elems > 16)
--                && (num_elems % 16 == 0) && (input_stride >= 256)
--                && (input_stride % 64 == 0);
-+        const bool split_needed = (num_elems > num_elems_thr)
-+                && (num_elems % num_elems_thr == 0);
-         if (split_needed) {
--            prb_node_split(prb, 0, 16);
-+            prb_node_split(prb, 0, num_elems_thr);
-             prb_node_move(prb, 1, 2);
-+
-+            // Update node information
-+            prb_node_dependency(prb);
-+
-+            // heuristics - looping over the unrolled dims should maximize reuse
-+            // of the already cached data; observation is choosing the smallest
-+            // dim from the remaining (from 2 up to ndims) gives good results
-+            constexpr int new_position = 2;
-+            const auto dim_beg_it = std::begin(prb.nodes);
-+            const auto dim_two_it = dim_beg_it + new_position;
-+            const auto dim_last_it = dim_beg_it + prb.ndims;
-+            const auto min_n_node_it = std::min_element(dim_two_it, dim_last_it,
-+                    [](const tr::node_t &lhs, const tr::node_t &rhs) {
-+                        return lhs.n < rhs.n;
-+                    });
-+            const auto min_idx = std::distance(dim_beg_it, min_n_node_it);
-+            // check if min_idx node is parent of node with tail processing which
-+            // is currently unsupported (i.e. tail processing can only be handled
-+            // at the inner-most dimension)
-+            bool inner_block_has_tail = false;
-+            for (int idx = min_idx - 1; idx >= new_position; idx--) {
-+                if (prb.nodes[idx].parent_node_id == min_idx) {
-+                    inner_block_has_tail = true;
-+                    break;
-+                }
-+            }
-+
-+            if (min_idx > new_position && (!inner_block_has_tail))
-+                prb_node_move(prb, min_idx, new_position);
-         }
-     }
- }
-@@ -1527,73 +2401,76 @@ static void prb_block_for_cache(tr::prb_t &prb) {
-  * parallel driver and the kernel. */
- static void prb_thread_kernel_balance(
-         tr::prb_t &prb, int &ndims_ker_max, int nthr) {
--    size_t sz_total = 1;
-+    size_t size_total = 1;
-     for (int d = 0; d < prb.ndims; ++d)
--        sz_total *= prb.nodes[d].n;
-+        size_total *= prb.nodes[d].n;
- 
--    /* The general expression for sz_drv_thr can be written as
--     * sz_drv_min = C0 + FC * (nthr > 1 ? 1 : 0) + VC * (nthr - 1)
-+    /* The general expression for size_drv_thr can be written as
-+     * size_drv_min = C0 + FC * (nthr > 1 ? 1 : 0) + VC * (nthr - 1)
-      * where FC and VC are fixed and variable costs respectively.
-      * Though for now, the below heuristic seems to be good enough */
--    const size_t sz_drv_thr = (nthr > 1) ? 16 * nthr : 1;
-+    const size_t size_drv_thr = (nthr > 1) ? 16 * nthr : 1;
- 
--    /* sz_drv_min is the minimal size for the parallel
-+    /* size_drv_min is the minimal size for the parallel
-      * driver required for good parallelization */
--    const size_t sz_drv_min
--            = nstl::min<size_t>(sz_drv_thr, utils::div_up(sz_total, 1024));
-+    const size_t size_drv_min
-+            = nstl::min<size_t>(size_drv_thr, utils::div_up(size_total, 1024));
- 
-     /* kdims -- # of dimensions processed by a kernel
--     * sz_ker_cur -- product of the dimension processed by a kernel
--     * sz_drv_cur -- product of the dimension processed by a driver */
-+     * size_ker_cur -- product of the dimension processed by a kernel
-+     * size_drv_cur -- product of the dimension processed by a driver */
- 
-     int kdims = prb.ndims;
--    size_t sz_drv_cur = 1;
--    for (; kdims > 1 && sz_drv_cur < sz_drv_min; --kdims)
--        sz_drv_cur *= prb.nodes[kdims - 1].n;
-+    size_t size_drv_cur = 1;
-+    for (; kdims > 1 && size_drv_cur < size_drv_min; --kdims)
-+        size_drv_cur *= prb.nodes[kdims - 1].n;
- 
--    size_t sz_ker_cur = 1;
-+    size_t size_ker_cur = 1;
-     for (int d = 0; d < kdims; ++d)
--        sz_ker_cur *= prb.nodes[d].n;
-+        size_ker_cur *= prb.nodes[d].n;
- 
--    /* Initially kdims is chosen so that sz_drv_cur >= sz_drv_min.
-+    /* Initially kdims is chosen so that size_drv_cur >= size_drv_min.
-      *
--     * It might happen that for chosen kdims the sz_ker_cur is too small
-+     * It might happen that for chosen kdims the size_ker_cur is too small
-      * (less than tr::ker_prb_size_min). In that case try to split the
--     * innermost driver dimension into two, to increase sz_ker_cur. */
--    bool want_borrow_ker_from_drv = true && kdims < prb.ndims
--            && sz_ker_cur < tr::ker_prb_size_min && sz_drv_cur > sz_drv_min
--            && kdims != prb.blk_chunk_idx;
-+     * innermost driver dimension into two, to increase size_ker_cur. */
-+    const bool want_borrow_ker_from_drv = kdims < prb.ndims
-+            && size_ker_cur < tr::ker_prb_size_min
-+            && size_drv_cur > size_drv_min;
-     if (want_borrow_ker_from_drv) {
--        /* sz_want_borrow is the minimal sz, so that:
--         *  o) sz_ker_cur * sz_want_borrow >= tr::ker_prb_size_min
-+        /* size_want_borrow is the minimal size, so that:
-+         *  o) size_ker_cur * size_want_borrow >= tr::ker_prb_size_min
-          *  o) current innermost driver dimension is divisible by
--         *     sz_want_borrow (so that we can evenly split that
-+         *     size_want_borrow (so that we can evenly split that
-          *     dimension into two)
-          *
--         *  In the worst case the minimal sz_want_borrow is equal
-+         *  In the worst case the minimal size_want_borrow is equal
-          *  to the innermost driver dimension itself. In that case
-          *  we will sacrifice it in favor of kernel (is it fine?). */
--        size_t sz_want_borrow = utils::div_up(tr::ker_prb_size_min, sz_ker_cur);
--        for (; prb.nodes[kdims].n % sz_want_borrow; ++sz_want_borrow)
-+        size_t size_want_borrow
-+                = utils::div_up(tr::ker_prb_size_min, size_ker_cur);
-+        for (; prb.nodes[kdims].n % size_want_borrow; ++size_want_borrow)
-             ;
--        if (sz_want_borrow != prb.nodes[kdims].n)
--            prb_node_split(prb, kdims, sz_want_borrow);
-+
-+        if (size_want_borrow != prb.nodes[kdims].n)
-+            prb_node_split(prb, kdims, size_want_borrow);
-         kdims += 1;
-     }
- 
-     /* On the other hand it might happen that for chosen kdims
--     * the sz_drv_cur is too small (less than sz_drv_min). In that case
-+     * the size_drv_cur is too small (less than size_drv_min). In that case
-      * try to split the outermost kernel dimension into two, to increase
--     * sz_drv_cur. */
--    bool want_borrow_drv_from_ker = true && sz_ker_cur > tr::ker_prb_size_min
--            && sz_drv_cur < sz_drv_min && kdims != prb.blk_chunk_idx;
-+     * size_drv_cur. */
-+    const bool want_borrow_drv_from_ker = size_ker_cur > tr::ker_prb_size_min
-+            && size_drv_cur < size_drv_min;
-     if (want_borrow_drv_from_ker) {
--        size_t sz_want_borrow = utils::div_up(sz_drv_min, sz_drv_cur);
--        for (; prb.nodes[kdims - 1].n % sz_want_borrow; ++sz_want_borrow)
-+        size_t size_want_borrow = utils::div_up(size_drv_min, size_drv_cur);
-+        for (; prb.nodes[kdims - 1].n % size_want_borrow; ++size_want_borrow)
-             ;
--        if (sz_want_borrow != prb.nodes[kdims - 1].n)
-+
-+        if (size_want_borrow != prb.nodes[kdims - 1].n)
-             prb_node_split(
--                    prb, kdims - 1, prb.nodes[kdims - 1].n / sz_want_borrow);
-+                    prb, kdims - 1, prb.nodes[kdims - 1].n / size_want_borrow);
-     }
- 
-     ndims_ker_max = kdims;
-@@ -1607,6 +2484,33 @@ static void prb_thread_kernel_balance(
-     }
- }
- 
-+status_t jit_uni_reorder_t::pd_t::init(
-+        engine_t *engine, engine_t *src_engine, engine_t *dst_engine) {
-+    CHECK(cpu_reorder_pd_t::init(engine, src_engine, dst_engine));
-+
-+    const bool compensation_needed
-+            = prb_.req_s8s8_comp || prb_.req_asymmetric_comp;
-+    if (compensation_needed) init_scratchpad();
-+
-+    return status::success;
-+}
-+
-+void jit_uni_reorder_t::pd_t::init_scratchpad() {
-+    const memory_desc_wrapper od(dst_md());
-+    const auto G = with_groups_ ? od.padded_dims()[0] : 1;
-+    const auto N = od.padded_dims()[with_groups_ ? 1 : 0];
-+    static constexpr int cache_line_size = 16;
-+    const auto wspace_per_thr_size
-+            = utils::rnd_up(G * N, cache_line_size) * sizeof(int32_t);
-+
-+    auto scratchpad = scratchpad_registry().registrar();
-+    const auto compensation_reduce_size = wspace_per_thr_size * nthr_;
-+
-+    // Every thread gets its own scratchpad space for each N
-+    scratchpad.template book<int32_t>(memory_tracking::names::key_reorder_space,
-+            compensation_reduce_size);
-+}
-+
- status_t jit_uni_reorder_t::pd_t::create(reorder_pd_t **reorder_pd,
-         engine_t *engine, const primitive_attr_t *attr, engine_t *src_engine,
-         const memory_desc_t *src_md, engine_t *dst_engine,
-@@ -1616,36 +2520,18 @@ status_t jit_uni_reorder_t::pd_t::create(reorder_pd_t **reorder_pd,
-     status_t prb_init_status = prb_init(prb, *src_md, *dst_md, attr);
-     if (prb_init_status != status::success) return prb_init_status;
- 
--    DEBUG({
--        printf("init : ");
--        prb_dump(prb);
--    });
--    // Sort the prb array in increasing sizes of the output stride
--    prb_normalize(prb);
--    DEBUG({
--        printf("norm : ");
--        prb_dump(prb);
--    });
--    /* Combine the variables, which appear together on both
--             * sides of the reorder */
--    prb_simplify(prb);
--    DEBUG({
--        printf("smpl : ");
--        prb_dump(prb);
--    });
--
-     prb_block_for_cache(prb);
-     DEBUG({
-         printf("cache: ");
-         prb_dump(prb);
-     });
- 
--    CHECK(prb_check_blk(prb, *dst_md));
--
--    int ndims_ker_max;
-+    int ndims_ker_max {};
-     int nthr = dnnl_get_max_threads();
-     prb_thread_kernel_balance(prb, ndims_ker_max, nthr);
- 
-+    if (prb.is_tail_present) prb_node_dependency(prb);
-+
-     tr::kernel_t::desc_t ker_desc;
-     status_t ker_init_status
-             = tr::kernel_t::desc_init(ker_desc, prb, ndims_ker_max);
-@@ -1663,99 +2549,191 @@ status_t jit_uni_reorder_t::pd_t::create(reorder_pd_t **reorder_pd,
-     auto _pd = new pd_t(
-             attr, src_engine->kind(), src_md, dst_engine->kind(), dst_md);
-     if (_pd == nullptr) return status::out_of_memory;
-+
-+    _pd->nthr_ = nthr;
-+    _pd->prb_ = prb;
-+    _pd->with_groups_
-+            = prb.compensation_mask == tr::prb_t::comp_mask_with_groups;
-     if (_pd->init(engine, src_engine, dst_engine) != status::success) {
-         delete _pd;
-         return status::unimplemented;
-     }
--    _pd->prb_ = prb;
-     _pd->ker_desc_ = ker_desc;
-     _pd->init_scratchpad_md();
--    _pd->nthr_ = nthr;
-+
-     return safe_ptr_assign(*reorder_pd, _pd);
- }
- 
--void jit_uni_reorder_t::omp_driver_0d(
--        int off, const char *in, char *out, const float *scale) const {
--    tr::call_param_t c {in, out, scale, 0};
--    (*kernel_)(&c);
-+void jit_uni_reorder_t::omp_driver_0d(int off, const char *in, char *out,
-+        const float *scale, int src_zp, int dst_zp,
-+        int32_t *compensation_scratch) const {
-+    const tr::prb_t &prb = pd()->prb_;
-+
-+    tr::call_param_t base_params;
-+    base_params.in = in;
-+    base_params.out = out;
-+    base_params.scale = scale;
-+    base_params.src_zp = src_zp;
-+    base_params.dst_zp = dst_zp;
-+    base_params.compensation_scratch = compensation_scratch;
-+
-+    if (prb.is_tail_present) {
-+        tr::tail_call_param_t tail_params;
-+        tail_params.base_params = base_params;
-+
-+        static constexpr int omp_ndims = 0;
-+        fill_curr_data_chunks(prb, off, nullptr, omp_ndims, tail_params);
-+        (*kernel_)(&tail_params);
-+    } else {
-+        (*kernel_)(&base_params);
-+    }
- }
- 
- void jit_uni_reorder_t::omp_driver_1d(int ithr, int nthr, int off,
--        const char *in, char *out, const float *scale) const {
--    const tr::node_t *ns = pd()->prb_.nodes + off;
-+        const char *in, char *out, const float *scale, int src_zp, int dst_zp,
-+        int32_t *compensation_scratch) const {
-+    const tr::prb_t &prb = pd()->prb_;
-+    const tr::node_t *ns = prb.nodes + off;
-     for_nd(ithr, nthr, (ptrdiff_t)ns[0].n, [&](ptrdiff_t d0) {
--        auto c = tr::call_param_t();
--        c.in = in + d0 * ns[0].is * data_type_size(pd()->prb_.itype);
--        c.out = out + d0 * ns[0].os * data_type_size(pd()->prb_.otype);
--        c.scale = scale + d0 * ns[0].ss;
--        c.blk_chunks = d0;
--        (*kernel_)(&c);
-+        tr::call_param_t base_params;
-+        base_params.in = in + d0 * ns[0].is * data_type_size(prb.itype);
-+        base_params.out = out + d0 * ns[0].os * data_type_size(prb.otype);
-+        base_params.scale = scale + d0 * ns[0].ss;
-+        base_params.src_zp = src_zp;
-+        base_params.dst_zp = dst_zp;
-+        base_params.compensation_scratch = compensation_scratch + d0 * ns[0].cs;
-+
-+        if (prb.is_tail_present) {
-+            tr::tail_call_param_t tail_params;
-+            tail_params.base_params = base_params;
-+
-+            static constexpr int omp_ndims = 1;
-+            const ptrdiff_t omp_data_chunks[omp_ndims] = {d0};
-+            fill_curr_data_chunks(
-+                    prb, off, omp_data_chunks, omp_ndims, tail_params);
-+            (*kernel_)(&tail_params);
-+        } else {
-+            (*kernel_)(&base_params);
-+        }
-     });
- }
- 
- void jit_uni_reorder_t::omp_driver_2d(int ithr, int nthr, int off,
--        const char *in, char *out, const float *scale) const {
--    const tr::node_t *ns = pd()->prb_.nodes + off;
--    const int blk_idx_off = pd()->prb_.blk_chunk_idx - off;
-+        const char *in, char *out, const float *scale, int src_zp, int dst_zp,
-+        int32_t *compensation_scratch) const {
-+    const tr::prb_t &prb = pd()->prb_;
-+    const tr::node_t *ns = prb.nodes + off;
-     for_nd(ithr, nthr, (ptrdiff_t)ns[1].n, (ptrdiff_t)ns[0].n,
-             [&](ptrdiff_t d1, ptrdiff_t d0) {
--                auto c = tr::call_param_t();
--                c.in = in
-+                tr::call_param_t base_params;
-+                base_params.in = in
-                         + (d0 * ns[0].is + d1 * ns[1].is)
--                                * data_type_size(pd()->prb_.itype);
--                c.out = out
-+                                * data_type_size(prb.itype);
-+                base_params.out = out
-                         + (d0 * ns[0].os + d1 * ns[1].os)
--                                * data_type_size(pd()->prb_.otype);
--                c.scale = scale + d0 * ns[0].ss + d1 * ns[1].ss;
--                c.blk_chunks = utils::pick(blk_idx_off, d0, d1);
--                (*kernel_)(&c);
-+                                * data_type_size(prb.otype);
-+                base_params.scale = scale + d0 * ns[0].ss + d1 * ns[1].ss;
-+                base_params.src_zp = src_zp;
-+                base_params.dst_zp = dst_zp;
-+                base_params.compensation_scratch
-+                        = compensation_scratch + d0 * ns[0].cs + d1 * ns[1].cs;
-+
-+                if (prb.is_tail_present) {
-+                    tr::tail_call_param_t tail_params;
-+                    tail_params.base_params = base_params;
-+
-+                    static constexpr int omp_ndims = 2;
-+                    const ptrdiff_t omp_data_chunks[omp_ndims] = {d0, d1};
-+                    fill_curr_data_chunks(
-+                            prb, off, omp_data_chunks, omp_ndims, tail_params);
-+
-+                    (*kernel_)(&tail_params);
-+                } else {
-+                    (*kernel_)(&base_params);
-+                }
-             });
- }
- 
- void jit_uni_reorder_t::omp_driver_3d(int ithr, int nthr, int off,
--        const char *in, char *out, const float *scale) const {
--    const tr::node_t *ns = pd()->prb_.nodes + off;
--    const int blk_idx_off = pd()->prb_.blk_chunk_idx - off;
-+        const char *in, char *out, const float *scale, int src_zp, int dst_zp,
-+        int32_t *compensation_scratch) const {
-+    const tr::prb_t &prb = pd()->prb_;
-+    const tr::node_t *ns = prb.nodes + off;
-     for_nd(ithr, nthr, (ptrdiff_t)ns[2].n, (ptrdiff_t)ns[1].n,
-             (ptrdiff_t)ns[0].n, [&](ptrdiff_t d2, ptrdiff_t d1, ptrdiff_t d0) {
--                auto c = tr::call_param_t();
--                c.in = in
-+                tr::call_param_t base_params;
-+                base_params.in = in
-                         + (d0 * ns[0].is + d1 * ns[1].is + d2 * ns[2].is)
--                                * data_type_size(pd()->prb_.itype);
--                c.out = out
-+                                * data_type_size(prb.itype);
-+                base_params.out = out
-                         + (d0 * ns[0].os + d1 * ns[1].os + d2 * ns[2].os)
--                                * data_type_size(pd()->prb_.otype);
--                c.scale = scale + d0 * ns[0].ss + d1 * ns[1].ss + d2 * ns[2].ss;
--                c.blk_chunks = utils::pick(blk_idx_off, d0, d1, d2);
--                (*kernel_)(&c);
-+                                * data_type_size(prb.otype);
-+                base_params.scale
-+                        = scale + d0 * ns[0].ss + d1 * ns[1].ss + d2 * ns[2].ss;
-+                base_params.src_zp = src_zp;
-+                base_params.dst_zp = dst_zp;
-+                base_params.compensation_scratch = compensation_scratch
-+                        + d0 * ns[0].cs + d1 * ns[1].cs + d2 * ns[2].cs;
-+
-+                if (prb.is_tail_present) {
-+                    tr::tail_call_param_t tail_params;
-+                    tail_params.base_params = base_params;
-+
-+                    static constexpr int omp_ndims = 3;
-+                    const ptrdiff_t omp_data_chunks[omp_ndims] = {d0, d1, d2};
-+                    fill_curr_data_chunks(
-+                            prb, off, omp_data_chunks, omp_ndims, tail_params);
-+                    (*kernel_)(&tail_params);
-+                } else {
-+                    (*kernel_)(&base_params);
-+                }
-             });
- }
- 
- void jit_uni_reorder_t::omp_driver_4d(int ithr, int nthr, int off,
--        const char *in, char *out, const float *scale) const {
--    const tr::node_t *ns = pd()->prb_.nodes + off;
--    const int blk_idx_off = pd()->prb_.blk_chunk_idx - off;
-+        const char *in, char *out, const float *scale, int src_zp, int dst_zp,
-+        int32_t *compensation_scratch) const {
-+    const tr::prb_t &prb = pd()->prb_;
-+    const tr::node_t *ns = prb.nodes + off;
-     for_nd(ithr, nthr, (ptrdiff_t)ns[3].n, (ptrdiff_t)ns[2].n,
-             (ptrdiff_t)ns[1].n, (ptrdiff_t)ns[0].n,
-             [&](ptrdiff_t d3, ptrdiff_t d2, ptrdiff_t d1, ptrdiff_t d0) {
--                auto c = tr::call_param_t();
--                c.in = in
-+                tr::call_param_t base_params;
-+                base_params.in = in
-                         + (d0 * ns[0].is + d1 * ns[1].is + d2 * ns[2].is
-                                   + d3 * ns[3].is)
--                                * data_type_size(pd()->prb_.itype);
--                c.out = out
-+                                * data_type_size(prb.itype);
-+                base_params.out = out
-                         + (d0 * ns[0].os + d1 * ns[1].os + d2 * ns[2].os
-                                   + d3 * ns[3].os)
--                                * data_type_size(pd()->prb_.otype);
--                c.scale = scale + d0 * ns[0].ss + d1 * ns[1].ss + d2 * ns[2].ss
--                        + d3 * ns[3].ss;
--                c.blk_chunks = utils::pick(blk_idx_off, d0, d1, d2, d3);
--                (*kernel_)(&c);
-+                                * data_type_size(prb.otype);
-+                base_params.scale = scale + d0 * ns[0].ss + d1 * ns[1].ss
-+                        + d2 * ns[2].ss + d3 * ns[3].ss;
-+                base_params.src_zp = src_zp;
-+                base_params.dst_zp = dst_zp;
-+                base_params.compensation_scratch = compensation_scratch
-+                        + d0 * ns[0].cs + d1 * ns[1].cs + d2 * ns[2].cs
-+                        + d3 * ns[3].cs;
-+
-+                if (prb.is_tail_present) {
-+                    tr::tail_call_param_t tail_params;
-+                    tail_params.base_params = base_params;
-+
-+                    static constexpr int omp_ndims = 4;
-+                    const ptrdiff_t omp_data_chunks[omp_ndims]
-+                            = {d0, d1, d2, d3};
-+                    fill_curr_data_chunks(
-+                            prb, off, omp_data_chunks, omp_ndims, tail_params);
-+                    (*kernel_)(&tail_params);
-+                } else {
-+                    (*kernel_)(&base_params);
-+                }
-             });
- }
- 
--void jit_uni_reorder_t::omp_driver(
--        const char *in, char *out, const float *scale) const {
-+void jit_uni_reorder_t::omp_driver(const char *in, char *out,
-+        const float *scale, int src_zp, int dst_zp,
-+        const memory_tracking::grantor_t &scratchpad) const {
-     in += pd()->prb_.ioff * data_type_size(pd()->prb_.itype);
-     out += pd()->prb_.ooff * data_type_size(pd()->prb_.otype);
- 
-@@ -1770,29 +2748,153 @@ void jit_uni_reorder_t::omp_driver(
- 
-     int ndims = pd()->prb_.ndims;
-     int ndims_ker = pd()->ker_desc_.prb.ndims;
-+    const bool req_s8s8_comp = pd()->prb_.req_s8s8_comp;
-+    const bool req_asymmetric_comp = pd()->prb_.req_asymmetric_comp;
-+    const bool req_compensation = req_s8s8_comp || req_asymmetric_comp;
-     assert(ndims - ndims_ker <= ndims_driver_max);
- 
-+    int32_t *compensation_reduce_scratch = scratchpad.template get<int32_t>(
-+            memory_tracking::names::key_reorder_space);
-+
-+    const memory_desc_wrapper od(pd()->dst_md());
-+    const auto G = pd()->with_groups_ ? od.padded_dims()[0] : 1;
-+    const auto N = od.padded_dims()[pd()->with_groups_ ? 1 : 0];
-+    static constexpr int cache_line_size = 16;
-+    const auto wspace_per_thr_size = utils::rnd_up(G * N, cache_line_size);
-+    const auto wspace_per_thr_bytes = wspace_per_thr_size * sizeof(int32_t);
-+
-     if (ndims - ndims_ker == 0) {
--        omp_driver_0d(ndims_ker, in, out, scale);
-+        if (req_compensation)
-+            std::memset(compensation_reduce_scratch, 0, wspace_per_thr_bytes);
-+
-+        omp_driver_0d(ndims_ker, in, out, scale, src_zp, dst_zp,
-+                compensation_reduce_scratch);
-     } else {
-         parallel(pd()->nthr_, [&](const int ithr, const int nthr) {
-+            int32_t *compensation_scratch = nullptr;
-+            if (req_compensation) {
-+                compensation_scratch = &compensation_reduce_scratch[ithr
-+                        * wspace_per_thr_size];
-+                std::memset(compensation_scratch, 0, wspace_per_thr_bytes);
-+            }
-+
-             switch (ndims - ndims_ker) {
-                 case 1:
--                    omp_driver_1d(ithr, nthr, ndims_ker, in, out, scale);
-+                    omp_driver_1d(ithr, nthr, ndims_ker, in, out, scale, src_zp,
-+                            dst_zp, compensation_scratch);
-                     break;
-                 case 2:
--                    omp_driver_2d(ithr, nthr, ndims_ker, in, out, scale);
-+                    omp_driver_2d(ithr, nthr, ndims_ker, in, out, scale, src_zp,
-+                            dst_zp, compensation_scratch);
-                     break;
-                 case 3:
--                    omp_driver_3d(ithr, nthr, ndims_ker, in, out, scale);
-+                    omp_driver_3d(ithr, nthr, ndims_ker, in, out, scale, src_zp,
-+                            dst_zp, compensation_scratch);
-                     break;
-                 case 4:
--                    omp_driver_4d(ithr, nthr, ndims_ker, in, out, scale);
-+                    omp_driver_4d(ithr, nthr, ndims_ker, in, out, scale, src_zp,
-+                            dst_zp, compensation_scratch);
-                     break;
-                 default: assert(!"unimplemented");
-             }
-         });
-     }
-+
-+    // Reduction of intermediate compensation results to the final output
-+    if (req_compensation) {
-+        const int nthr = ndims - ndims_ker == 0 ? 1 : pd()->nthr_;
-+        reduce_compensation(
-+                out, compensation_reduce_scratch, nthr, wspace_per_thr_size);
-+    }
-+}
-+
-+void jit_uni_reorder_t::reduce_compensation(char *out,
-+        const int32_t *compensation_reduce_scratch, const int nthr,
-+        const dim_t wspace_per_thr_size) const {
-+
-+    const memory_desc_wrapper od(pd()->dst_md());
-+    const size_t offset = od.size() - od.additional_buffer_size();
-+
-+    static constexpr auto comp_dt_size = sizeof(int32_t);
-+    static constexpr int32_t comp_s8s8_shift = 128;
-+
-+    // Note: We do not need to explicitly zero-out compensation buffer, as the
-+    // per_thread buffers are already zeroed out in the padded area.
-+    const auto G = pd()->with_groups_ ? od.padded_dims()[0] : 1;
-+    const auto N = od.padded_dims()[pd()->with_groups_ ? 1 : 0];
-+    const auto GN = G * N;
-+    const bool req_s8s8_comp = pd()->prb_.req_s8s8_comp;
-+    const bool req_asymmetric_comp = pd()->prb_.req_asymmetric_comp;
-+    const size_t zp_offset
-+            = offset + (pd()->prb_.req_s8s8_comp ? GN * comp_dt_size : 0);
-+
-+    parallel_nd(GN, [&](int idx) {
-+        int32_t acc = 0;
-+        for (int ithr = 0; ithr < nthr; ithr++) {
-+            acc -= compensation_reduce_scratch[ithr * wspace_per_thr_size
-+                    + idx];
-+        }
-+        if (req_s8s8_comp) {
-+            int32_t *out_comp = reinterpret_cast<int32_t *>(&out[offset]);
-+            out_comp[idx] = comp_s8s8_shift * acc;
-+        }
-+        if (req_asymmetric_comp) {
-+            int32_t *out_asym_comp
-+                    = reinterpret_cast<int32_t *>(&out[zp_offset]);
-+            out_asym_comp[idx] = acc;
-+        }
-+    });
-+}
-+
-+void jit_uni_reorder_t::fill_curr_data_chunks(const tr::prb_t &prb,
-+        const int off, const ptrdiff_t *omp_data_chunks, const int omp_ndims,
-+        tr::tail_call_param_t &c) const {
-+    // Chunks are backwards numered i.e:
-+    // [0] -> [node_size]
-+    // [1] -> [node_size - 1]
-+    // ...
-+    // [node_size - 1] -> [1]
-+
-+    // It is done like this, because it is easier to decrement counter
-+    // and check if it is equal to zero than increment and check
-+    // if it is equal to node_size in jit kernel.
-+
-+    static constexpr int64_t empty_chunk_info = -1;
-+    static constexpr int64_t last_chunk = 1;
-+
-+    for (int curr_node_id = prb.ndims - 1; curr_node_id >= 0; curr_node_id--) {
-+        const int parent_node_id = prb.nodes[curr_node_id].parent_node_id;
-+        const bool is_drv_processing_this_node
-+                = curr_node_id >= off && curr_node_id <= off + omp_ndims - 1;
-+        const bool is_tail_processing
-+                = prb.is_tail_in_one_of_child_nodes(curr_node_id)
-+                || prb.nodes[curr_node_id].tail_size > 0;
-+
-+        if (is_drv_processing_this_node && is_tail_processing) {
-+            const int inner_idx = curr_node_id - off;
-+            assert(inner_idx < omp_ndims);
-+            const int64_t node_size = prb.nodes[curr_node_id].tail_size > 0
-+                    ? prb.nodes[curr_node_id].tail_size
-+                    : prb.nodes[curr_node_id].n;
-+            const int64_t data_chunk = node_size - omp_data_chunks[inner_idx];
-+
-+            if (!prb.nodes[curr_node_id].is_parent_empty()) {
-+                const bool is_parent_chunk_last
-+                        = c.curr_data_chunks[parent_node_id] == last_chunk;
-+                c.curr_data_chunks[curr_node_id]
-+                        = is_parent_chunk_last ? data_chunk : empty_chunk_info;
-+                c.zeroing_data = static_cast<int64_t>(
-+                        is_parent_chunk_last && data_chunk <= 0);
-+            } else {
-+                c.curr_data_chunks[curr_node_id] = data_chunk;
-+                c.zeroing_data = static_cast<int64_t>(data_chunk <= 0);
-+            }
-+            c.skip_kernel_execution = static_cast<int64_t>(c.zeroing_data
-+                    && !prb.nodes[curr_node_id].is_zero_pad_needed);
-+            if (c.zeroing_data || c.skip_kernel_execution) break;
-+        } else
-+            c.curr_data_chunks[curr_node_id] = empty_chunk_info;
-+    }
- }
- 
- status_t jit_uni_reorder_t::init(engine_t *engine) {
-@@ -1801,13 +2903,98 @@ status_t jit_uni_reorder_t::init(engine_t *engine) {
- }
- 
- status_t jit_uni_reorder_t::execute(const exec_ctx_t &ctx) const {
--    status_t status = status::success;
-     auto in = CTX_IN_MEM(const char *, DNNL_ARG_FROM);
--    auto out = CTX_OUT_CLEAN_MEM(char *, DNNL_ARG_TO, status);
--    CHECK(status);
-+    auto out = CTX_OUT_MEM(char *, DNNL_ARG_TO);
-     DEFINE_SCALES_BUFFER(scales);
-+    DEFINE_ZERO_POINT_VALUE(src_zp, DNNL_ARG_FROM);
-+    DEFINE_ZERO_POINT_VALUE(dst_zp, DNNL_ARG_TO);
-+    const auto &scratchpad = ctx.get_scratchpad_grantor();
-+
-+    omp_driver(in, out, scales, src_zp, dst_zp, scratchpad);
-+
-+    return status::success;
-+}
-+
-+status_t jit_blk_reorder_t::pd_t::create(reorder_pd_t **reorder_pd,
-+        engine_t *engine, const primitive_attr_t *attr, engine_t *src_engine,
-+        const memory_desc_t *src_md, engine_t *dst_engine,
-+        const memory_desc_t *dst_md) {
-+    auto prb = tr::prb_t();
-+
-+    status_t prb_init_status = prb_init(prb, *src_md, *dst_md, attr);
-+    if (prb_init_status != status::success) return prb_init_status;
-+    // only uni_reorder supports tail processing now
-+    // TODO: Add tail processing support in blk_reorder
-+    if (prb.is_tail_present) return status::unimplemented;
-+
-+    prb_tile_normalize(prb);
-+    DEBUG({
-+        printf("tile : ");
-+        prb_dump(prb);
-+    });
-+
-+    if (!tr::jit_single_blk_kernel_t::applicable(prb)) {
-+        return status::unimplemented;
-+    }
- 
--    omp_driver(in, out, scales);
-+    auto _pd = new pd_t(
-+            attr, src_engine->kind(), src_md, dst_engine->kind(), dst_md);
-+    if (_pd == nullptr) return status::out_of_memory;
-+    _pd->prb_ = prb;
-+    if (_pd->init(engine, src_engine, dst_engine) != status::success) {
-+        delete _pd;
-+        return status::unimplemented;
-+    }
-+    _pd->init_scratchpad_md();
-+
-+    return safe_ptr_assign(*reorder_pd, _pd);
-+}
-+
-+void jit_blk_reorder_t::pd_t::prb_tile_normalize(tr::prb_t &p) {
-+    if (!utils::one_of(p.nodes[0].n, 8ul, 16ul)
-+            && utils::one_of(p.nodes[1].n, 8ul, 16ul)) {
-+        nstl::swap(p.nodes[0], p.nodes[1]);
-+    }
-+}
-+
-+jit_blk_reorder_t::jit_blk_reorder_t(const pd_t *apd) : primitive_t(apd) {}
-+jit_blk_reorder_t::~jit_blk_reorder_t() = default;
-+
-+status_t jit_blk_reorder_t::init(engine_t *engine) {
-+    kernel_ = utils::make_unique<tr::jit_single_blk_kernel_t>(pd()->prb_);
-+    return kernel_->create_kernel();
-+}
-+
-+status_t jit_blk_reorder_t::execute(const exec_ctx_t &ctx) const {
-+    const auto in = CTX_IN_MEM(const char *, DNNL_ARG_FROM);
-+    auto out = CTX_OUT_MEM(char *, DNNL_ARG_TO);
-+    DEFINE_ZERO_POINT_VALUE(src_zp, DNNL_ARG_FROM);
-+    DEFINE_ZERO_POINT_VALUE(dst_zp, DNNL_ARG_TO);
-+
-+    // kernel handle 2-dimension tiles, a tail is possible
-+    auto &prb = this->pd()->prb_;
-+    ptrdiff_t BH = 1;
-+    for (int i = 2; i < prb.ndims; ++i) {
-+        BH *= prb.nodes[i].n;
-+    }
-+
-+    auto block_sz = prb.n(0);
-+    auto n1 = prb.n(1);
-+    auto i1 = prb.is(1);
-+    auto o1 = prb.os(1);
-+    auto FL = (n1 + block_sz - 1) / block_sz;
-+    auto bh_stride = BH == 1 ? 0 : prb.is(2);
-+
-+    auto itype_sz_ = data_type_size(pd()->prb_.itype);
-+    auto otype_sz_ = data_type_size(pd()->prb_.otype);
-+
-+    parallel_nd(BH, FL, [&](dim_t bh, dim_t fl) {
-+        auto fl_b = fl * block_sz;
-+        auto bh_b = bh_stride * bh;
-+        auto *i = in + (bh_b + fl_b * i1) * itype_sz_;
-+        auto *o = out + (bh_b + fl_b * o1) * otype_sz_;
-+        (*kernel_)(i, o, n1 - fl_b < block_sz, src_zp, dst_zp);
-+    });
- 
-     return status::success;
- }
-diff --git a/src/cpu/aarch64/jit_uni_reorder.hpp b/src/cpu/aarch64/jit_uni_reorder.hpp
-index 2fb6f0f89f3..bf400430ba5 100644
---- a/src/cpu/aarch64/jit_uni_reorder.hpp
-+++ b/src/cpu/aarch64/jit_uni_reorder.hpp
-@@ -1,6 +1,6 @@
- /*******************************************************************************
--* Copyright 2018-2020 Intel Corporation
--* Copyright 2020 FUJITSU LIMITED
-+* Copyright 2018-2022 Intel Corporation
-+* Copyright 2020-2022 FUJITSU LIMITED
- * Copyright 2022 Arm Ltd. and affiliates
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
-@@ -36,15 +36,76 @@ namespace tr {
- constexpr int max_ndims = DNNL_MAX_NDIMS;
- 
- struct node_t {
--    size_t n;
--    ptrdiff_t is; // input stride
--    ptrdiff_t os; // output stride
--    ptrdiff_t ss; // scale stride
-+    static constexpr int64_t empty_field = -1;
-+
-+    size_t n = 0;
-+    size_t tail_size = 0;
-+    int dim_id = empty_field;
-+    int parent_node_id = empty_field;
-+    bool is_zero_pad_needed = false;
-+    ptrdiff_t is = 0; // input stride
-+    ptrdiff_t os = 0; // output stride
-+    ptrdiff_t ss = 0; // scale stride
-+    ptrdiff_t cs = 0; // compensation stride
-+
-+    bool is_dim_id_empty() const { return dim_id == empty_field; }
-+    bool is_parent_empty() const { return parent_node_id == empty_field; }
- };
- 
- enum class scale_type_t { NONE, COMMON, MANY };
- 
- struct prb_t {
-+    /* The compensation mask value indicates how big an additional buffer should be.
-+     * Possible values for reorder:
-+     *     1) standard compensation = 1 = 0b01
-+     *     2) asymmetric compensation = 2 = 0b10
-+     *     3) compensation if tensor contains group = 3 = 0b11 */
-+    static constexpr int invalid_comp_mask = 0;
-+    static constexpr int standard_comp_mask = 0b1;
-+    static constexpr int asymmetric_comp_mask = 0b10;
-+    static constexpr int comp_mask_with_groups
-+            = standard_comp_mask + asymmetric_comp_mask;
-+
-+    bool is_tail_in_one_of_child_nodes(int parent_node_id) const {
-+        for (int i = parent_node_id; i >= 0; i--) {
-+            if (nodes[i].parent_node_id == parent_node_id) {
-+                if (nodes[i].tail_size != 0)
-+                    return true;
-+                else
-+                    parent_node_id = i;
-+            }
-+        }
-+
-+        return false;
-+    }
-+
-+    int tail(int d) const {
-+        assert(d < ndims);
-+        return static_cast<int>(nodes[d].tail_size);
-+    }
-+
-+    int n(int d) const {
-+        assert(d < ndims);
-+        return static_cast<int>(nodes[d].n);
-+    }
-+    int is(int d) const {
-+        assert(d < ndims);
-+        return static_cast<int>(nodes[d].is);
-+    }
-+    int os(int d) const {
-+        assert(d < ndims);
-+        return static_cast<int>(nodes[d].os);
-+    }
-+    int ss(int d) const {
-+        assert(d < ndims);
-+        return static_cast<int>(nodes[d].ss);
-+    }
-+
-+    int cs(int d) const {
-+        assert(d < ndims);
-+        return static_cast<int>(nodes[d].cs);
-+    }
-+
-     data_type_t itype;
-     data_type_t otype;
-     int ndims;
-@@ -54,21 +115,24 @@ struct prb_t {
-     scale_type_t scale_type;
-     float beta;
-     int full_ndims;
--    int ip_tail;
--    int op_tail;
--    int iblock;
--    int oblock;
--    int blk_chunk_idx;
-+    bool is_tail_present = false;
-+    float scale_adjust = 1.f;
-+    int compensation_mask = invalid_comp_mask;
-+    bool req_s8s8_comp = false;
-+    bool req_asymmetric_comp = false;
-+    bool req_src_zp = false;
-+    bool req_dst_zp = false;
- };
- 
- status_t prb_init(prb_t &prb, const memory_desc_t &imd,
-         const memory_desc_t &omd, const primitive_attr_t *attr);
- 
--status_t prb_check_blk(prb_t &prb, const memory_desc_t &imd);
--
- /** sorts the problem nodes so that output strides come in ascending order */
- void prb_normalize(prb_t &p);
- 
-+/** fill parent node info for blocked nodes */
-+void prb_node_dependency(prb_t &p);
-+
- /** folds nodes together if possible */
- void prb_simplify(prb_t &p);
- 
-@@ -88,10 +152,24 @@ void prb_node_move(prb_t &p, int d0, int d1);
- void prb_dump(const prb_t &p);
- 
- struct call_param_t {
--    const void *in;
--    void *out;
--    const float *scale;
--    size_t blk_chunks;
-+    const void *in = nullptr;
-+    void *out = nullptr;
-+    const float *scale = nullptr;
-+    int32_t src_zp = 0;
-+    int32_t dst_zp = 0;
-+    int32_t *compensation_scratch = nullptr;
-+};
-+
-+// The additional structure is needed because
-+// using a data structure with tail processing
-+// data for non-tail cases reduces kernel
-+// performance. This is because there is too
-+// much data that has to be transferred to the kernel.
-+struct tail_call_param_t {
-+    call_param_t base_params;
-+    int64_t curr_data_chunks[DNNL_MAX_NDIMS] = {-1};
-+    int64_t zeroing_data = static_cast<int64_t>(false);
-+    int64_t skip_kernel_execution = static_cast<int64_t>(false);
- };
- 
- struct kernel_t {
-@@ -100,8 +178,12 @@ struct kernel_t {
-         prb_t prb;
-     };
- 
--    kernel_t(const desc_t &desc) : desc_(desc) {}
-+    kernel_t(const desc_t &desc)
-+        : desc_(desc)
-+        , compensation_needed_(
-+                  desc.prb.req_s8s8_comp || desc.prb.req_asymmetric_comp) {}
-     virtual void operator()(const call_param_t *c) const = 0;
-+    virtual void operator()(const tail_call_param_t *c) const = 0;
-     virtual status_t create_kernel() = 0;
-     virtual ~kernel_t() {}
- 
-@@ -119,10 +201,13 @@ struct kernel_t {
- protected:
-     const desc_t desc_;
-     const prb_t &prb_ = desc_.prb;
-+    bool compensation_needed_ = false;
- };
- 
- /* TODO: add trans_t class */
- 
-+struct jit_single_blk_kernel_t;
-+
- } // namespace tr
- 
- struct jit_uni_reorder_t : public primitive_t {
-@@ -135,8 +220,13 @@ struct jit_uni_reorder_t : public primitive_t {
-         tr::prb_t prb_;
-         tr::kernel_t::desc_t ker_desc_;
-         int nthr_;
-+        bool with_groups_ = false;
-+
-+        status_t init(
-+                engine_t *engine, engine_t *src_engine, engine_t *dst_engine);
- 
-     private:
-+        void init_scratchpad();
-         static status_t create(reorder_pd_t **reorder_pd, engine_t *engine,
-                 const primitive_attr_t *attr, engine_t *src_engine,
-                 const memory_desc_t *src_md, engine_t *dst_engine,
-@@ -151,23 +241,66 @@ struct jit_uni_reorder_t : public primitive_t {
-     enum { ndims_driver_max = 4 };
- 
- private:
--    void omp_driver_0d(
--            int off, const char *in, char *out, const float *scale) const;
-+    void omp_driver_0d(int off, const char *in, char *out, const float *scale,
-+            int src_zp, int dst_zp, int32_t *compensation_scratch) const;
-     void omp_driver_1d(int ithr, int nthr, int off, const char *in, char *out,
--            const float *scale) const;
-+            const float *scale, int src_zp, int dst_zp,
-+            int32_t *compensation_scratch) const;
-     void omp_driver_2d(int ithr, int nthr, int off, const char *in, char *out,
--            const float *scale) const;
-+            const float *scale, int src_zp, int dst_zp,
-+            int32_t *compensation_scratch) const;
-     void omp_driver_3d(int ithr, int nthr, int off, const char *in, char *out,
--            const float *scale) const;
-+            const float *scale, int src_zp, int dst_zp,
-+            int32_t *compensation_scratch) const;
-     void omp_driver_4d(int ithr, int nthr, int off, const char *in, char *out,
--            const float *scale) const;
-+            const float *scale, int src_zp, int dst_zp,
-+            int32_t *compensation_scratch) const;
-+
-+    void omp_driver(const char *in, char *out, const float *scale, int src_zp,
-+            int dst_zp, const memory_tracking::grantor_t &scratchpad) const;
- 
--    void omp_driver(const char *in, char *out, const float *scale) const;
-+    void fill_curr_data_chunks(const tr::prb_t &prb, const int off,
-+            const ptrdiff_t *omp_data_chunks, const int omp_ndims,
-+            tr::tail_call_param_t &c) const;
-+
-+    void reduce_compensation(char *out,
-+            const int32_t *compensation_reduce_scratch, const int nthr,
-+            const dim_t wspace_per_thr_size) const;
- 
-     const pd_t *pd() const { return (const pd_t *)primitive_t::pd().get(); }
-     std::unique_ptr<tr::kernel_t> kernel_;
- };
- 
-+struct jit_blk_reorder_t : public primitive_t {
-+    using primitive_t::primitive_t;
-+    struct pd_t : public cpu_reorder_pd_t {
-+        using cpu_reorder_pd_t::cpu_reorder_pd_t;
-+        DECLARE_COMMON_PD_T("jit:blk", jit_blk_reorder_t);
-+
-+        tr::prb_t prb_;
-+
-+    private:
-+        static status_t create(reorder_pd_t **reorder_pd, engine_t *engine,
-+                const primitive_attr_t *attr, engine_t *src_engine,
-+                const memory_desc_t *src_md, engine_t *dst_engine,
-+                const memory_desc_t *dst_md);
-+
-+        // Swap last two nodes, put block 4, 8, 16 nodes to first
-+        static void prb_tile_normalize(tr::prb_t &p);
-+        friend dnnl::impl::impl_list_item_t;
-+    };
-+
-+    status_t init(engine_t *engine) override;
-+    status_t execute(const exec_ctx_t &ctx) const override;
-+
-+    jit_blk_reorder_t(const pd_t *apd);
-+    ~jit_blk_reorder_t();
-+
-+private:
-+    const pd_t *pd() const { return (const pd_t *)primitive_t::pd().get(); }
-+    std::unique_ptr<tr::jit_single_blk_kernel_t> kernel_;
-+};
-+
- } // namespace aarch64
- } // namespace cpu
- } // namespace impl
-diff --git a/src/cpu/aarch64/jit_uni_reorder_utils.cpp b/src/cpu/aarch64/jit_uni_reorder_utils.cpp
-index 7123811f827..28f36a7e2e7 100644
---- a/src/cpu/aarch64/jit_uni_reorder_utils.cpp
-+++ b/src/cpu/aarch64/jit_uni_reorder_utils.cpp
-@@ -1,6 +1,6 @@
- /*******************************************************************************
--* Copyright 2018-2021 Intel Corporation
--* Copyright 2020 FUJITSU LIMITED
-+* Copyright 2018-2022 Intel Corporation
-+* Copyright 2020-2022 FUJITSU LIMITED
- * Copyright 2022 Arm Ltd. and affiliates
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
-@@ -25,10 +25,21 @@
- #include "common/nstl.hpp"
- #include "common/type_helpers.hpp"
- #include "common/utils.hpp"
--#include "dnnl_debug.h"
-+#include "oneapi/dnnl/dnnl_debug.h"
- 
- #include "cpu/aarch64/jit_uni_reorder.hpp"
- 
-+// #define TR_DEBUG
-+#if defined(TR_DEBUG)
-+#define DEBUg(...) \
-+    do { \
-+        __VA_ARGS__ \
-+    } while (0)
-+#else
-+#define DEBUg(...)
-+#endif
-+#define DEBUG(...) DEBUg(__VA_ARGS__)
-+
- using namespace dnnl::impl::types;
- using namespace dnnl::impl::status;
- 
-@@ -41,87 +52,45 @@ namespace tr {
- 
- /** ad-hoc structure to describe blocked memory layout */
- struct layout_desc_t {
-+    layout_desc_t()
-+        : dt(dnnl_data_type_undef)
-+        , ndims(0)
-+        , id {-1}
-+        , dims {0}
-+        , tails {0}
-+        , is_blk {false}
-+        , strides {0} {}
-     data_type_t dt;
-     int ndims;
-     dims_t id;
-     dims_t dims;
-+    dims_t tails;
-+    bool is_blk[DNNL_MAX_NDIMS];
-     strides_t strides;
- };
- 
--static status_t compute_blk_and_tail(
--        const memory_desc_t &md_, const int idx, int &blk, int &tail) {
--    const auto md = memory_desc_wrapper(md_);
--    const auto &bd = md.blocking_desc();
--    if (tail == 0) return status::success;
--
--    const std::set<dim_t> unique_inner_idxs(
--            bd.inner_idxs, bd.inner_idxs + bd.inner_nblks);
--    std::set<dim_t> dims_with_multiple_blks;
--    for (dim_t dim : unique_inner_idxs) {
--        if (std::count(bd.inner_idxs, bd.inner_idxs + bd.inner_nblks, dim) > 1)
--            dims_with_multiple_blks.insert(dim);
--    }
--
--    // Dims that have a tail and have multiple blocks are not supported by the jit kernel yet.
--    // For example:
--    // src_tag = abcd
--    // dst_tag = ABcd16b16a4b
--    // 16x15x3x3
--    // In this case, 'b' dim has two blocks and has a tail. It is not a supported case.
--    if (dims_with_multiple_blks.find(idx) != dims_with_multiple_blks.end())
--        return status::unimplemented;
--
--    // Only supports inconsistent padding in single and double blocks
--    // and the total block size <= 256
--    for (int iblk = bd.inner_nblks - 1; iblk > 0; --iblk) {
--        if (bd.inner_idxs[iblk] == idx) break;
--        blk *= bd.inner_blks[iblk];
--        tail *= bd.inner_blks[iblk];
--    }
--    if (unique_inner_idxs.size() > 2 || blk > 256) return status::unimplemented;
--
--    return status::success;
--}
--
--static status_t compute_chunk_idx(const prb_t &p, const memory_desc_t &imd_,
--        const memory_desc_t &omd_, const int blk_idx, int &chunk_idx) {
--    const auto imd = memory_desc_wrapper(imd_);
--    const auto omd = memory_desc_wrapper(omd_);
--    const auto &ibd = imd.blocking_desc();
--    const auto &obd = omd.blocking_desc();
--    if (p.ip_tail == 0 && p.op_tail == 0) return status::success;
--
--    const ptrdiff_t is
--            = ibd.strides[blk_idx] * obd.inner_blks[obd.inner_idxs[blk_idx]];
--    const ptrdiff_t os = obd.strides[blk_idx];
--
--    for (int i = blk_idx; i < omd.ndims(); ++i) {
--        if (p.nodes[i].os == os && p.nodes[i].is == is) {
--            chunk_idx = i;
--            return status::success;
--        }
--    }
--
--    return status::invalid_arguments;
--}
--
- status_t cvt_mem_desc_to_layout_desc(const memory_desc_t &md_,
--        layout_desc_t &ld, const dims_t &blocks, const dims_t &ext_padding) {
-+        layout_desc_t &ld, const dims_t &blocks, const dims_t &external_padding,
-+        const dims_t &tails) {
-+    static constexpr bool it_is_blk = true;
-+
-     const auto md = memory_desc_wrapper(md_);
- 
--    bool ok = true && md.is_blocking_desc() && md.extra().flags == 0;
--    if (!ok) return invalid_arguments;
-+    if (!md.is_blocking_desc()) return invalid_arguments;
- 
-     const auto &bd = md.blocking_desc();
- 
-     ld.ndims = 0;
-     ld.dt = md.data_type();
- 
--    auto P = [&ld](int id, int dim, ptrdiff_t stride) {
-+    auto add_dim = [&ld](int id, dim_t dim, dim_t tail, bool is_blk,
-+                           ptrdiff_t stride) {
-         assert((size_t)ld.ndims < sizeof(ld.dims) / sizeof(ld.dims[0]));
-         ld.id[ld.ndims] = id;
-         ld.dims[ld.ndims] = dim;
-         ld.strides[ld.ndims] = stride;
-+        ld.tails[ld.ndims] = tail;
-+        ld.is_blk[ld.ndims] = is_blk;
-         ++ld.ndims;
-     };
- 
-@@ -129,12 +98,27 @@ status_t cvt_mem_desc_to_layout_desc(const memory_desc_t &md_,
-         const int ld_ndims_start = ld.ndims;
-         if (blocks[d] != 1) {
-             stride_t stride = 1;
-+            int tail = tails[d];
-             for (int iblk = bd.inner_nblks - 1; iblk >= 0; --iblk) {
--                if (bd.inner_idxs[iblk] == d) P(d, bd.inner_blks[iblk], stride);
-+                if (bd.inner_idxs[iblk] == d) {
-+                    const dim_t inner_tail = tail % bd.inner_blks[iblk];
-+                    add_dim(d, bd.inner_blks[iblk], inner_tail, it_is_blk,
-+                            stride);
-+                    tail = utils::div_up(tail, bd.inner_blks[iblk]);
-+                }
-                 stride *= bd.inner_blks[iblk];
-             }
-         }
--        P(d, (md.padded_dims()[d] + ext_padding[d]) / blocks[d], bd.strides[d]);
-+
-+        const dim_t dim_with_external_padding
-+                = (md.padded_dims()[d] + external_padding[d]) / blocks[d];
-+        const dim_t padded_dim = md.padded_dims()[d] / blocks[d];
-+        const dim_t tail = dim_with_external_padding != padded_dim
-+                ? dim_with_external_padding
-+                        - (dim_with_external_padding - padded_dim)
-+                : 0;
-+
-+        add_dim(d, dim_with_external_padding, tail, !it_is_blk, bd.strides[d]);
- 
-         // TODO: NOW: revisit, do we need a reverse?
-         // TODO: NOW: consider using strides instead of block sizes in md
-@@ -144,12 +128,70 @@ status_t cvt_mem_desc_to_layout_desc(const memory_desc_t &md_,
-             const int idx1 = ld.ndims - 1 - ld_d;
-             nstl::swap(ld.dims[idx0], ld.dims[idx1]);
-             nstl::swap(ld.strides[idx0], ld.strides[idx1]);
-+            nstl::swap(ld.tails[idx0], ld.tails[idx1]);
-+            nstl::swap(ld.is_blk[idx0], ld.is_blk[idx1]);
-         }
-     }
- 
-     return success;
- }
- 
-+static bool is_with_groups(const memory_desc_t &dst_md) {
-+    using namespace memory_extra_flags;
-+    auto dst_d = memory_desc_wrapper(dst_md);
-+    const int grp_bit = 1 << 1;
-+    auto check_flag_and_mask = [&](int flag, int mask) {
-+        return (dst_d.extra().flags & flag) && (mask & grp_bit);
-+    };
-+
-+    return check_flag_and_mask(
-+                   compensation_conv_s8s8, dst_d.extra().compensation_mask)
-+            || check_flag_and_mask(compensation_conv_asymmetric_src,
-+                    dst_d.extra().asymm_compensation_mask);
-+}
-+
-+static inline int get_next_parent_node(node_t *nodes, int ndims, int cur_node) {
-+    const int cur_id = nodes[cur_node].dim_id;
-+    for (int d = cur_node + 1; d < ndims; ++d) {
-+        if (nodes[d].dim_id == cur_id) return d;
-+    }
-+    return -1;
-+}
-+
-+static void prb_set_compensation_strides(prb_t &p) {
-+
-+    auto require_n_stride = [&](int cur_node) -> bool {
-+        const int parent = get_next_parent_node(p.nodes, p.ndims, cur_node);
-+        if (parent < 0) return false;
-+
-+        const size_t p_n = p.nodes[parent].n;
-+
-+        // if 'parent_node.n' is larger than 1, then cur_node stride
-+        // is 'cur_node.n'
-+        return p_n > size_t(1);
-+    };
-+
-+    const auto compensation_needed = p.req_s8s8_comp || p.req_asymmetric_comp;
-+    if (!compensation_needed) return;
-+    int mask = p.compensation_mask;
-+    ptrdiff_t cs = 1;
-+    for (int d = 0; d < p.ndims; ++d) {
-+        if (mask & (1 << p.nodes[d].dim_id)) {
-+
-+            // correct cases when 'cs' exceeds output stride
-+            if (cs > p.nodes[d].os) cs = p.nodes[d].os;
-+
-+            p.nodes[d].cs = cs;
-+            const bool n_stride = require_n_stride(d);
-+            if (p.nodes[d].tail_size > 0 && (!p.nodes[d].is_zero_pad_needed)
-+                    && (!n_stride))
-+                cs *= p.nodes[d].tail_size;
-+            else
-+                cs *= p.nodes[d].n;
-+        }
-+    }
-+}
-+
- status_t prb_init(prb_t &p, const memory_desc_t &imd, const memory_desc_t &omd,
-         const primitive_attr_t *attr) {
-     auto im_d = memory_desc_wrapper(imd);
-@@ -157,8 +199,7 @@ status_t prb_init(prb_t &p, const memory_desc_t &imd, const memory_desc_t &omd,
- 
-     auto check_post_ops = [](const primitive_attr_t *attr) {
-         const auto &po = attr->post_ops_;
--        return po.len() == 0
--                || (po.len() == 1 && po.contain(primitive_kind::sum, 0));
-+        return po.len() == 0 || (po.len() == 1 && po.entry_[0].is_sum(false));
-     };
- 
-     bool ok = im_d.is_blocking_desc() && om_d.is_blocking_desc()
-@@ -166,81 +207,129 @@ status_t prb_init(prb_t &p, const memory_desc_t &imd, const memory_desc_t &omd,
-             && !om_d.has_runtime_dims_or_strides() && !om_d.has_zero_dim()
-             && attr->has_default_values(
-                     primitive_attr_t::skip_mask_t::oscale_runtime
-+                    | primitive_attr_t::skip_mask_t::zero_points_runtime
-                     | primitive_attr_t::skip_mask_t::post_ops)
-             && check_post_ops(attr);
-     if (!ok) return unimplemented;
- 
--    dims_t iblocks, oblocks, ip_padding, op_padding;
-+    bool is_tail_present = false;
-+    dims_t iblocks, oblocks, i_tails, o_tails, i_paddings, o_paddings;
-     im_d.compute_blocks(iblocks);
-     om_d.compute_blocks(oblocks);
--    utils::array_set(ip_padding, 0, im_d.ndims());
--    utils::array_set(op_padding, 0, om_d.ndims());
--
--    /* padding_dim consistency check
--     * only supports inconsitent padding for src
--     * TODO: Add inconsistent padding support for dst */
--    int ip_tail = 0;
--    int op_tail = 0;
--    int iblk_w_tail = 1;
--    int oblk_w_tail = 1;
--    int blk_idx = 0;
-+
-+    for (int d = 0; d < om_d.ndims(); ++d) {
-+        const auto dim = om_d.dims()[d];
-+        const auto pdim = om_d.padded_dims()[d];
-+        const auto cblock = oblocks[d];
-+        // do not allow excess pdim other than required for rounding-up of dim.
-+        if (utils::rnd_up(dim, cblock) != pdim) return unimplemented;
-+    }
-+
-+    utils::array_set(i_tails, 0, im_d.ndims());
-+    utils::array_set(o_tails, 0, om_d.ndims());
-+    utils::array_set(i_paddings, 0, im_d.ndims());
-+    utils::array_set(o_paddings, 0, om_d.ndims());
- 
-     for (int d = 0; d < im_d.ndims(); ++d) {
--        const int ip_tmp_dim = im_d.padded_dims()[d];
--        const int op_tmp_dim = om_d.padded_dims()[d];
--        const int ip_tmp_tail = ip_tmp_dim % oblocks[d];
--        const int op_tmp_tail = op_tmp_dim % iblocks[d];
--
--        const bool pdim_consistent = ip_tmp_dim == op_tmp_dim
--                && ip_tmp_tail == 0 && op_tmp_tail == 0;
--        const bool pdim_tail = ip_tmp_tail > 0
--                && (ip_tmp_dim + oblocks[d] - ip_tmp_tail) == op_tmp_dim
--                && op_tmp_tail == 0 && ip_tail == 0;
--        if (!pdim_consistent && !pdim_tail) return status::unimplemented;
--        if (pdim_tail) {
--            blk_idx = d;
--            ip_tail = ip_tmp_tail;
--            op_tail = op_tmp_tail;
--            iblk_w_tail = iblocks[d];
--            oblk_w_tail = oblocks[d];
--            ip_padding[d] = oblocks[d] - ip_tmp_tail;
--            op_padding[d] = iblocks[d] - op_tmp_tail;
-+        const dim_t i_dim = im_d.dims()[d];
-+        const dim_t o_dim = om_d.dims()[d];
-+        const dim_t i_tail = i_dim % iblocks[d];
-+        const dim_t o_tail = o_dim % oblocks[d];
-+
-+        if (o_tail > 0) {
-+            is_tail_present = true;
-+            o_tails[d] = o_tail;
-+            o_paddings[d] = oblocks[d] - o_tail;
-+        }
-+
-+        if (i_tail > 0) {
-+            is_tail_present = true;
-+            i_tails[d] = i_tail;
-+            i_paddings[d] = iblocks[d] - i_tail;
-         }
-     }
--    CHECK(compute_blk_and_tail(omd, blk_idx, oblk_w_tail, ip_tail));
- 
-+    // To compute input layout description we need to pass output paddings
-+    // which will be used to compute input dims rounded up to multiple of
-+    // output dims. Analogous applies to output layout description.
-+    // This is demanded by the algorithm of nodes creation.
-+    // Example:
-+    // input:
-+    //  format: abc
-+    //  size: 77, 15, 3
-+    //  o_padding: 3, 17, 0
-+    //  returns ild: 80, 32, 3
-+    // output:
-+    //  format: ABc16b16a2b
-+    //  size: 77, 15, 3
-+    //  i_padding: 0, 0, 0
-+    //  returns old: 5, 16, 1, 16, 2, 3
-     layout_desc_t ild, old;
--    status_t status
--            = cvt_mem_desc_to_layout_desc(imd, ild, iblocks, ip_padding);
--    if (status != success) return status;
--    status = cvt_mem_desc_to_layout_desc(omd, old, oblocks, op_padding);
--    if (status != success) return status;
-+    CHECK(cvt_mem_desc_to_layout_desc(imd, ild, iblocks, o_paddings, i_tails));
-+    CHECK(cvt_mem_desc_to_layout_desc(omd, old, oblocks, i_paddings, o_tails));
- 
-     p.itype = ild.dt;
-     p.otype = old.dt;
--    p.ip_tail = ip_tail;
--    p.op_tail = op_tail;
--    p.iblock = iblk_w_tail;
--    p.oblock = oblk_w_tail;
--
-+    p.is_tail_present = is_tail_present;
-+    p.req_src_zp = !attr->zero_points_.has_default_values(DNNL_ARG_SRC);
-+    p.req_dst_zp = !attr->zero_points_.has_default_values(DNNL_ARG_DST);
-     p.scale_type = attr->output_scales_.has_default_values()
-             ? scale_type_t::NONE
-             : (attr->output_scales_.mask_ == 0 ? scale_type_t::COMMON
-                                                : scale_type_t::MANY);
-+    p.scale_adjust = (om_d.extra().flags & memory_extra_flags::scale_adjust)
-+            ? om_d.extra().scale_adjust
-+            : 1.f;
-+    p.req_s8s8_comp
-+            = om_d.extra().flags & memory_extra_flags::compensation_conv_s8s8;
-+    p.req_asymmetric_comp = om_d.extra().flags
-+            & memory_extra_flags::compensation_conv_asymmetric_src;
-+
-+    const bool with_groups = is_with_groups(omd);
-+
-+    auto mask_ok = [&](bool check, int mask) {
-+        return IMPLICATION(check, mask == (with_groups ? 0x3 : 0x1));
-+    };
-+
-+    if (!mask_ok(p.req_s8s8_comp, om_d.extra().compensation_mask)
-+            || !mask_ok(p.req_asymmetric_comp,
-+                    om_d.extra().asymm_compensation_mask))
-+        return status::unimplemented;
- 
--    ptrdiff_t ss[max_ndims] = {0};
-+    ptrdiff_t ss[max_ndims] = {0}; // scales strides
-     if (p.scale_type == scale_type_t::MANY) {
--        ptrdiff_t last_ss = 1;
-+        const int mask = attr->output_scales_.mask_;
-+        ptrdiff_t dense_stride = 1;
-+        ptrdiff_t last_stride = 1;
-         for (int d = old.ndims - 1; d >= 0; --d) {
-             assert((d == 0 || old.id[d - 1] <= old.id[d])
-                     && "logical dimensions should be in ascending order");
--            if (attr->output_scales_.mask_ & (1 << old.id[d])) {
--                ss[d] = last_ss;
--                last_ss *= old.dims[d];
-+            if (mask & (1 << old.id[d])) {
-+                if ((d + 1) < old.ndims && old.id[d + 1] != old.id[d]
-+                        && (mask & (1 << old.id[d + 1]))) {
-+                    dense_stride = dense_stride * imd.dims[old.id[d + 1]];
-+                    last_stride = dense_stride;
-+                }
-+                ss[d] = last_stride;
-+                last_stride *= old.dims[d];
-             }
-         }
-     }
- 
-+    const auto compensation_needed = p.req_s8s8_comp || p.req_asymmetric_comp;
-+    if (compensation_needed) {
-+        p.compensation_mask = p.req_s8s8_comp
-+                ? om_d.extra().compensation_mask
-+                : (p.req_asymmetric_comp ? om_d.extra().asymm_compensation_mask
-+                                         : tr::prb_t::invalid_comp_mask);
-+
-+        if (p.compensation_mask == tr::prb_t::asymmetric_comp_mask)
-+            return unimplemented;
-+
-+        assert(p.compensation_mask == tr::prb_t::standard_comp_mask
-+                || p.compensation_mask == tr::prb_t::comp_mask_with_groups);
-+    }
-+
-     int ndims = 0;
- 
-     int i_pos = 0; /* state for input  -- current dimension */
-@@ -254,6 +343,10 @@ status_t prb_init(prb_t &p, const memory_desc_t &imd, const memory_desc_t &omd,
- 
-         if (ild.dims[i_pos] == old.dims[o_pos]) {
-             p.nodes[ndims].n = ild.dims[i_pos];
-+            p.nodes[ndims].dim_id = old.id[o_pos];
-+            p.nodes[ndims].tail_size = old.tails[o_pos];
-+            p.nodes[ndims].is_zero_pad_needed
-+                    = old.is_blk[o_pos] && old.tails[o_pos] > 0;
-             p.nodes[ndims].is = ild.strides[i_pos];
-             p.nodes[ndims].os = old.strides[o_pos];
-             p.nodes[ndims].ss = ss[o_pos];
-@@ -261,19 +354,45 @@ status_t prb_init(prb_t &p, const memory_desc_t &imd, const memory_desc_t &omd,
-             ++i_pos;
-             ++o_pos;
-         } else if (ild.dims[i_pos] < old.dims[o_pos]) {
--            assert(old.dims[o_pos] % ild.dims[i_pos] == 0);
--            int factor = old.dims[o_pos] / ild.dims[i_pos];
-+            // old must be divisible by ild or we will not be
-+            // able to create valid nodes. The problem appears
-+            // when stag=Acdb48a and dtag=Acdb32a for example.
-+            if (ild.dims[i_pos] == 0 || old.dims[o_pos] % ild.dims[i_pos] != 0)
-+                return status::unimplemented;
-+
-+            dim_t factor = old.dims[o_pos] / ild.dims[i_pos];
-+
-+            const size_t tail_of_upper_dim
-+                    = utils::div_up(old.tails[o_pos], factor) == ild.dims[i_pos]
-+                    ? 0
-+                    : utils::div_up(old.tails[o_pos], factor);
-+            const size_t tail_of_lower_dim = old.tails[o_pos] % factor;
-+
-             p.nodes[ndims].n = ild.dims[i_pos];
-+            p.nodes[ndims].dim_id = old.id[o_pos];
-+            p.nodes[ndims].tail_size = tail_of_upper_dim;
-+            p.nodes[ndims].is_zero_pad_needed
-+                    = old.is_blk[o_pos] && tail_of_upper_dim > 0;
-             p.nodes[ndims].is = ild.strides[i_pos];
-             p.nodes[ndims].os = old.strides[o_pos] * factor;
-             p.nodes[ndims].ss = ss[o_pos] * factor;
-             ++ndims;
-             ++i_pos;
-             old.dims[o_pos] = factor;
-+            old.tails[o_pos] = tail_of_lower_dim;
-         } else if (ild.dims[i_pos] > old.dims[o_pos]) {
--            assert(ild.dims[i_pos] % old.dims[o_pos] == 0);
--            int factor = ild.dims[i_pos] / old.dims[o_pos];
-+            // ild must be divisible by old or we will not be
-+            // able to create valid nodes. The problem appears
-+            // when stag=Acdb32a and dtag=Acdb48a for example.
-+            if (old.dims[o_pos] == 0 || ild.dims[i_pos] % old.dims[o_pos] != 0)
-+                return status::unimplemented;
-+
-+            dim_t factor = ild.dims[i_pos] / old.dims[o_pos];
-             p.nodes[ndims].n = old.dims[o_pos];
-+            p.nodes[ndims].dim_id = old.id[o_pos];
-+            p.nodes[ndims].tail_size = old.tails[o_pos];
-+            p.nodes[ndims].is_zero_pad_needed
-+                    = old.is_blk[o_pos] && old.tails[o_pos] > 0;
-             p.nodes[ndims].is = ild.strides[i_pos] * factor;
-             p.nodes[ndims].os = old.strides[o_pos];
-             p.nodes[ndims].ss = ss[o_pos];
-@@ -282,12 +401,9 @@ status_t prb_init(prb_t &p, const memory_desc_t &imd, const memory_desc_t &omd,
-             ild.dims[i_pos] = factor;
-         }
-     }
--    int blk_chunk_idx = ndims;
--    CHECK(compute_chunk_idx(p, imd, omd, blk_idx, blk_chunk_idx));
- 
-     p.ndims = ndims;
-     p.full_ndims = ndims;
--    p.blk_chunk_idx = blk_chunk_idx;
- 
-     p.ioff = memory_desc_wrapper(imd).offset0();
-     p.ooff = memory_desc_wrapper(omd).offset0();
-@@ -295,6 +411,28 @@ status_t prb_init(prb_t &p, const memory_desc_t &imd, const memory_desc_t &omd,
-     const int sum_idx = attr->post_ops_.find(primitive_kind::sum);
-     p.beta = sum_idx == -1 ? 0.f : attr->post_ops_.entry_[sum_idx].sum.scale;
- 
-+    DEBUG({
-+        printf("init : ");
-+        prb_dump(prb);
-+    });
-+    // Sort the prb array in increasing sizes of the output stride
-+    prb_normalize(p);
-+    DEBUG({
-+        printf("norm : ");
-+        prb_dump(prb);
-+    });
-+
-+    // compensation strides require prb_normalized
-+    prb_set_compensation_strides(p);
-+
-+    /* Combine the variables, which appear together on both
-+             * sides of the reorder */
-+    prb_simplify(p);
-+    DEBUG({
-+        printf("smpl : ");
-+        prb_dump(prb);
-+    });
-+
-     return success;
- }
- 
-@@ -307,28 +445,23 @@ void prb_normalize(prb_t &p) {
-                             && p.nodes[j].n < p.nodes[min_pos].n);
-             if (new_min) min_pos = j;
-         }
--        if (min_pos != d) {
--            nstl::swap(p.nodes[d], p.nodes[min_pos]);
--            if (p.blk_chunk_idx == min_pos || p.blk_chunk_idx == d)
--                p.blk_chunk_idx = p.blk_chunk_idx == min_pos ? d : min_pos;
--        }
-+        if (min_pos != d) { nstl::swap(p.nodes[d], p.nodes[min_pos]); }
-     }
- }
- 
--status_t prb_check_blk(prb_t &p, const memory_desc_t &md_) {
--    const auto md = memory_desc_wrapper(md_);
--    const auto &bd = md.blocking_desc();
--    if (p.ip_tail == 0) return status::success;
--
--    // Check if the inner blocks and p.nodes[blk].n in the firsti nblks
--    // is equivalent in reverse order when has tail in block layout.
--    const int nblk = bd.inner_nblks;
--    for (int iblk = 0; iblk < nblk; ++iblk) {
--        if (bd.inner_blks[nblk - iblk - 1]
--                != static_cast<ptrdiff_t>(p.nodes[iblk].n))
--            return status::unimplemented;
-+void prb_node_dependency(prb_t &prb) {
-+    for (int i = 0; i < prb.ndims; i++) {
-+        tr::node_t &node = prb.nodes[i];
-+        node.parent_node_id = node_t::empty_field;
-+        for (int j = i + 1; j < prb.ndims; j++) {
-+            const tr::node_t &potential_parent_node = prb.nodes[j];
-+            if (!potential_parent_node.is_dim_id_empty()
-+                    && potential_parent_node.dim_id == node.dim_id) {
-+                node.parent_node_id = j;
-+                break;
-+            }
-+        }
-     }
--    return status::success;
- }
- 
- void prb_simplify(prb_t &p) {
-@@ -338,16 +471,25 @@ void prb_simplify(prb_t &p) {
- #pragma GCC diagnostic push
- #pragma GCC diagnostic ignored "-Warray-bounds"
- #endif
-+
-+    const auto skip_dim_combining = [&p](const int node_id) -> bool {
-+        return (p.is_tail_in_one_of_child_nodes(node_id)
-+                       && p.nodes[node_id].n > 1)
-+                || p.nodes[node_id].tail_size > 0;
-+    };
-+
-+    if (p.is_tail_present) prb_node_dependency(p);
-+
-     for (int d = 0; d < p.ndims - 1; ++d) {
-         auto &this_node = p.nodes[d + 0];
-         auto &next_node = p.nodes[d + 1];
--        const bool skip_blk_idx = (p.ip_tail > 0 || p.op_tail > 0)
--                && (p.blk_chunk_idx == d || p.blk_chunk_idx == d + 1);
-+        const bool skip_dims_combining
-+                = skip_dim_combining(d) || skip_dim_combining(d + 1);
-         const bool fold = false
-                 || (next_node.n == static_cast<size_t>(1)
--                        && !skip_blk_idx) // trivial case, just drop next node
-+                        && !skip_dims_combining) // trivial case, just drop next node
-                 || (true // or real folding if possible
--                        && !skip_blk_idx
-+                        && !skip_dims_combining
-                         && next_node.is
-                                 == static_cast<ptrdiff_t>(
-                                         this_node.n * this_node.is)
-@@ -356,15 +498,20 @@ void prb_simplify(prb_t &p) {
-                                         this_node.n * this_node.os)
-                         && next_node.ss
-                                 == static_cast<ptrdiff_t>(
--                                        this_node.n * this_node.ss));
-+                                        this_node.n * this_node.ss)
-+                        && next_node.cs
-+                                == static_cast<ptrdiff_t>(
-+                                        this_node.n * this_node.cs));
-         if (fold) {
-             this_node.n *= next_node.n;
-+            this_node.dim_id = node_t::empty_field;
-+            this_node.is_zero_pad_needed = false;
-             for (int j = d + 2; j < p.ndims; ++j)
-                 p.nodes[j - 1] = p.nodes[j];
--            if (d < p.blk_chunk_idx) --p.blk_chunk_idx;
-             --p.ndims;
-             --p.full_ndims;
-             --d; // make another try
-+            if (p.is_tail_present) prb_node_dependency(p);
-         }
-     }
- #if defined(__GNUC__) && __GNUC__ >= 4
-@@ -372,24 +519,42 @@ void prb_simplify(prb_t &p) {
- #endif
- }
- 
--void prb_node_split(prb_t &p, int dim, size_t n1) {
-+void prb_node_split(prb_t &p, int dim, size_t new_node_size) {
-     assert(dim < p.ndims);
-     assert(p.ndims < max_ndims);
--    assert(p.nodes[dim].n % n1 == 0);
-+    assert(p.nodes[dim].n % new_node_size == 0);
- 
-     p.ndims += 1;
-     p.full_ndims += 1;
--    if (dim < p.blk_chunk_idx) p.blk_chunk_idx += 1;
- 
-     for (int d = p.ndims; d > dim + 1; --d)
-         p.nodes[d] = p.nodes[d - 1];
- 
--    p.nodes[dim + 1].n = p.nodes[dim].n / n1;
--    p.nodes[dim + 1].is = p.nodes[dim].is * n1;
--    p.nodes[dim + 1].os = p.nodes[dim].os * n1;
--    p.nodes[dim + 1].ss = p.nodes[dim].ss * n1;
--
--    p.nodes[dim].n = n1;
-+    const size_t upper_node_size = p.nodes[dim].n / new_node_size;
-+    const size_t lower_node_size = new_node_size;
-+    p.nodes[dim + 1].n = upper_node_size;
-+    p.nodes[dim].n = lower_node_size;
-+
-+    const bool is_tail = p.nodes[dim].tail_size > 0;
-+    const size_t upper_node_tail
-+            = utils::div_up(p.nodes[dim].tail_size, lower_node_size)
-+                    == upper_node_size
-+            ? 0
-+            : utils::div_up(p.nodes[dim].tail_size, lower_node_size);
-+    const size_t lower_node_tail = p.nodes[dim].tail_size % lower_node_size;
-+    p.nodes[dim].tail_size = is_tail ? lower_node_tail : 0;
-+    p.nodes[dim + 1].tail_size = is_tail ? upper_node_tail : 0;
-+
-+    p.nodes[dim + 1].is_zero_pad_needed
-+            = p.nodes[dim].is_zero_pad_needed && p.nodes[dim + 1].tail_size > 0;
-+    p.nodes[dim].is_zero_pad_needed
-+            = p.nodes[dim].is_zero_pad_needed && p.nodes[dim].tail_size > 0;
-+
-+    p.nodes[dim + 1].dim_id = p.nodes[dim].dim_id;
-+    p.nodes[dim + 1].is = p.nodes[dim].is * lower_node_size;
-+    p.nodes[dim + 1].os = p.nodes[dim].os * lower_node_size;
-+    p.nodes[dim + 1].ss = p.nodes[dim].ss * lower_node_size;
-+    p.nodes[dim + 1].cs = p.nodes[dim].cs * lower_node_size;
- }
- 
- void prb_node_swap(prb_t &p, int d0, int d1) {
-@@ -425,8 +590,11 @@ void prb_dump(const prb_t &p) {
-     printf("@@@ type:%s:%s ndims:%d ", dnnl_dt2str(p.itype),
-             dnnl_dt2str(p.otype), p.ndims);
-     for (int d = 0; d < p.ndims; ++d)
--        printf("[%zu:%td:%td:%td]", p.nodes[d].n, p.nodes[d].is, p.nodes[d].os,
--                p.nodes[d].ss);
-+        printf("[%zu:%zu:%d:%d:%s:%td:%td:%td:%td]", p.nodes[d].n,
-+                p.nodes[d].tail_size, p.nodes[d].dim_id,
-+                p.nodes[d].parent_node_id,
-+                p.nodes[d].is_zero_pad_needed ? "true" : "false", p.nodes[d].is,
-+                p.nodes[d].os, p.nodes[d].ss, p.nodes[d].cs);
-     printf(" off:%zu:%zu\n", p.ioff, p.ooff);
- }
- 
-diff --git a/src/cpu/reorder/cpu_reorder_regular_f32_f32.cpp b/src/cpu/reorder/cpu_reorder_regular_f32_f32.cpp
-index f51e3c22414..fdefec8a049 100644
---- a/src/cpu/reorder/cpu_reorder_regular_f32_f32.cpp
-+++ b/src/cpu/reorder/cpu_reorder_regular_f32_f32.cpp
-@@ -1,5 +1,6 @@
- /*******************************************************************************
- * Copyright 2020-2022 Intel Corporation
-+* Copyright 2022 FUJITSU LIMITED
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
-@@ -32,6 +33,7 @@ const impl_list_map_t &regular_f32_f32_impl_list_map() {
-             DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64::jit_blk_reorder_t))
-             DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64::jit_uni_reorder_t))
- 
-+            DNNL_AARCH64_ONLY(CPU_REORDER_INSTANCE(aarch64::jit_blk_reorder_t))
-             DNNL_AARCH64_ONLY(CPU_REORDER_INSTANCE(aarch64::jit_uni_reorder_t))
-             REG_SR(f32, any, f32, any, fmt_order::any, spec::reference)
- 
-@@ -44,6 +46,7 @@ const impl_list_map_t &regular_f32_f32_impl_list_map() {
-             DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64::jit_blk_reorder_t))
-             DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64::jit_uni_reorder_t))
- 
-+            DNNL_AARCH64_ONLY(CPU_REORDER_INSTANCE(aarch64::jit_blk_reorder_t))
-             DNNL_AARCH64_ONLY(CPU_REORDER_INSTANCE(aarch64::jit_uni_reorder_t))
-             DNNL_NON_X64_ONLY(REG_SR_BIDIR(f32, any, f32, nCw16c))
-             DNNL_NON_X64_ONLY(REG_SR_BIDIR(f32, any, f32, nCw8c))
-@@ -75,6 +78,7 @@ const impl_list_map_t &regular_f32_f32_impl_list_map() {
-             DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64::jit_blk_reorder_t))
-             DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64::jit_uni_reorder_t))
- 
-+            DNNL_AARCH64_ONLY(CPU_REORDER_INSTANCE(aarch64::jit_blk_reorder_t))
-             DNNL_AARCH64_ONLY(CPU_REORDER_INSTANCE(aarch64::jit_uni_reorder_t))
- 
-             DNNL_NON_X64_ONLY(REG_SR_BIDIR(f32, any, f32, nChw16c))
-@@ -123,6 +127,7 @@ const impl_list_map_t &regular_f32_f32_impl_list_map() {
-             DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64::jit_blk_reorder_t))
-             DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64::jit_uni_reorder_t))
- 
-+            DNNL_AARCH64_ONLY(CPU_REORDER_INSTANCE(aarch64::jit_blk_reorder_t))
-             DNNL_AARCH64_ONLY(CPU_REORDER_INSTANCE(aarch64::jit_uni_reorder_t))
- 
-             DNNL_NON_X64_ONLY(REG_SR_BIDIR(f32, any, f32, nCdhw16c))
-@@ -171,6 +176,7 @@ const impl_list_map_t &regular_f32_f32_impl_list_map() {
-             DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64::jit_blk_reorder_t))
-             DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64::jit_uni_reorder_t))
- 
-+            DNNL_AARCH64_ONLY(CPU_REORDER_INSTANCE(aarch64::jit_blk_reorder_t))
-             DNNL_AARCH64_ONLY(CPU_REORDER_INSTANCE(aarch64::jit_uni_reorder_t))
- 
- 
-diff --git a/src/cpu/reorder/cpu_reorder_regular_f32_s32.cpp b/src/cpu/reorder/cpu_reorder_regular_f32_s32.cpp
-index fadbee0ecf8..b1881df80e0 100644
---- a/src/cpu/reorder/cpu_reorder_regular_f32_s32.cpp
-+++ b/src/cpu/reorder/cpu_reorder_regular_f32_s32.cpp
-@@ -1,5 +1,6 @@
- /*******************************************************************************
- * Copyright 2020-2022 Intel Corporation
-+* Copyright 2022 FUJITSU LIMITED
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
-@@ -31,6 +32,7 @@ const impl_list_map_t &regular_f32_s32_impl_list_map() {
-             DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64::jit_blk_reorder_t))
-             DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64::jit_uni_reorder_t))
- 
-+            DNNL_AARCH64_ONLY(CPU_REORDER_INSTANCE(aarch64::jit_blk_reorder_t))
-             DNNL_AARCH64_ONLY(CPU_REORDER_INSTANCE(aarch64::jit_uni_reorder_t))
-             DNNL_NON_X64_ONLY(REG_SR_BIDIR(f32, any, s32, nChw16c))
-             REG_SR(f32, any, s32, any, fmt_order::any, spec::reference)
-diff --git a/src/cpu/reorder/cpu_reorder_regular_f32_s8.cpp b/src/cpu/reorder/cpu_reorder_regular_f32_s8.cpp
-index b83d47b2d6f..6bd305c7b41 100644
---- a/src/cpu/reorder/cpu_reorder_regular_f32_s8.cpp
-+++ b/src/cpu/reorder/cpu_reorder_regular_f32_s8.cpp
-@@ -1,5 +1,6 @@
- /*******************************************************************************
- * Copyright 2020-2022 Intel Corporation
-+* Copyright 2022 FUJITSU LIMITED
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
-@@ -35,6 +36,7 @@ const impl_list_map_t &regular_f32_s8_impl_list_map() {
-             DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64::jit_blk_reorder_t))
-             DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64::jit_uni_reorder_t))
- 
-+            DNNL_AARCH64_ONLY(CPU_REORDER_INSTANCE(aarch64::jit_blk_reorder_t))
-             DNNL_AARCH64_ONLY(CPU_REORDER_INSTANCE(aarch64::jit_uni_reorder_t))
- 
-             DNNL_NON_X64_ONLY(REG_SR_BIDIR(f32, any, s8, nChw16c))
-diff --git a/src/cpu/reorder/cpu_reorder_regular_f32_u8.cpp b/src/cpu/reorder/cpu_reorder_regular_f32_u8.cpp
-index 4bae84307e6..d306c3abeb8 100644
---- a/src/cpu/reorder/cpu_reorder_regular_f32_u8.cpp
-+++ b/src/cpu/reorder/cpu_reorder_regular_f32_u8.cpp
-@@ -1,5 +1,6 @@
- /*******************************************************************************
- * Copyright 2020-2022 Intel Corporation
-+* Copyright 2022 FUJITSU LIMITED
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
-@@ -33,6 +34,7 @@ const impl_list_map_t &regular_f32_u8_impl_list_map() {
-             DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64::jit_blk_reorder_t))
-             DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64::jit_uni_reorder_t))
- 
-+            DNNL_AARCH64_ONLY(CPU_REORDER_INSTANCE(aarch64::jit_blk_reorder_t))
-             DNNL_AARCH64_ONLY(CPU_REORDER_INSTANCE(aarch64::jit_uni_reorder_t))
-             DNNL_NON_X64_ONLY(REG_SR_BIDIR(f32, any, u8, nChw16c))
-             REG_SR(f32, any, u8, any, fmt_order::any, spec::reference)
-diff --git a/src/cpu/reorder/cpu_reorder_regular_s32.cpp b/src/cpu/reorder/cpu_reorder_regular_s32.cpp
-index 54d65661791..a8197402b0a 100644
---- a/src/cpu/reorder/cpu_reorder_regular_s32.cpp
-+++ b/src/cpu/reorder/cpu_reorder_regular_s32.cpp
-@@ -1,5 +1,6 @@
- /*******************************************************************************
- * Copyright 2020-2022 Intel Corporation
-+* Copyright 2022 FUJITSU LIMITED
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
-@@ -34,6 +35,7 @@ const impl_list_map_t &regular_s32_impl_list_map() {
-             DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64::jit_blk_reorder_t))
-             DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64::jit_uni_reorder_t))
- 
-+            DNNL_AARCH64_ONLY(CPU_REORDER_INSTANCE(aarch64::jit_blk_reorder_t))
-             DNNL_AARCH64_ONLY(CPU_REORDER_INSTANCE(aarch64::jit_uni_reorder_t))
- 
-             DNNL_NON_X64_ONLY(REG_SR_BIDIR(s32, any, f32, nChw16c))
-diff --git a/src/cpu/reorder/cpu_reorder_regular_s8.cpp b/src/cpu/reorder/cpu_reorder_regular_s8.cpp
-index f57d01e2009..ce18dc5caf1 100644
---- a/src/cpu/reorder/cpu_reorder_regular_s8.cpp
-+++ b/src/cpu/reorder/cpu_reorder_regular_s8.cpp
-@@ -1,5 +1,6 @@
- /*******************************************************************************
- * Copyright 2020-2022 Intel Corporation
-+* Copyright 2022 FUJITSU LIMITED
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
-@@ -41,6 +42,7 @@ const impl_list_map_t &regular_s8_impl_list_map() {
-             DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64::jit_blk_reorder_t))
-             DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64::jit_uni_reorder_t))
- 
-+            DNNL_AARCH64_ONLY(CPU_REORDER_INSTANCE(aarch64::jit_blk_reorder_t))
-             DNNL_AARCH64_ONLY(CPU_REORDER_INSTANCE(aarch64::jit_uni_reorder_t))
- 
-             DNNL_NON_X64_ONLY(REG_SR_BIDIR(s8, any, f32, nChw16c))
-diff --git a/src/cpu/reorder/cpu_reorder_regular_u8.cpp b/src/cpu/reorder/cpu_reorder_regular_u8.cpp
-index 73d731c3b15..87a58872262 100644
---- a/src/cpu/reorder/cpu_reorder_regular_u8.cpp
-+++ b/src/cpu/reorder/cpu_reorder_regular_u8.cpp
-@@ -1,5 +1,6 @@
- /*******************************************************************************
- * Copyright 2020-2022 Intel Corporation
-+* Copyright 2022 FUJITSU LIMITED
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
-@@ -35,6 +36,7 @@ const impl_list_map_t &regular_u8_impl_list_map() {
-             DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64::jit_blk_reorder_t))
-             DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64::jit_uni_reorder_t))
- 
-+            DNNL_AARCH64_ONLY(CPU_REORDER_INSTANCE(aarch64::jit_blk_reorder_t))
-             DNNL_AARCH64_ONLY(CPU_REORDER_INSTANCE(aarch64::jit_uni_reorder_t))
- 
-             DNNL_NON_X64_ONLY(REG_SR_BIDIR(u8, any, f32, nChw16c))
diff --git a/third_party/mkl_dnn/onednn_acl_thread_local_scheduler.patch b/third_party/mkl_dnn/onednn_acl_thread_local_scheduler.patch
index 11d6725f92eba8..9583308396dd1d 100644
--- a/third_party/mkl_dnn/onednn_acl_thread_local_scheduler.patch
+++ b/third_party/mkl_dnn/onednn_acl_thread_local_scheduler.patch
@@ -15,19 +15,19 @@
  limitations under the License.
  *******************************************************************************
 diff --git a/src/cpu/aarch64/acl_thread.cpp b/src/cpu/aarch64/acl_thread.cpp
-index d7d83badcb..1a7bcd74ed 100644
+index fd2c76d01..bd7bed837 100644
 --- a/src/cpu/aarch64/acl_thread.cpp
 +++ b/src/cpu/aarch64/acl_thread.cpp
-@@ -41,14 +41,17 @@ void acl_thread_bind() {
+@@ -55,14 +55,17 @@ void acl_set_benchmark_scheduler_default() {
  #endif
-
+ 
  #if DNNL_CPU_THREADING_RUNTIME == DNNL_RUNTIME_THREADPOOL
--void acl_set_custom_scheduler() {
+-void acl_set_tp_scheduler() {
 -    static std::once_flag flag_once;
 -    // Create threadpool scheduler
 -    std::shared_ptr<arm_compute::IScheduler> threadpool_scheduler
 -            = std::make_unique<ThreadpoolScheduler>();
-+void acl_set_custom_scheduler(int intra_threads = 0) {
++void acl_set_tp_scheduler(int intra_threads = 0) {
 +    static thread_local std::once_flag flag_once;
      // set CUSTOM scheduler in ACL
      std::call_once(flag_once,
@@ -40,59 +40,58 @@ index d7d83badcb..1a7bcd74ed 100644
 +
 +                    arm_compute::Scheduler::set(threadpool_scheduler); });
  }
-
+ 
  void acl_set_threadpool_num_threads() {
+@@ -102,14 +105,6 @@ void set_acl_threading() {
+         acl_set_benchmark_scheduler_default();
+     }
+ #endif
+-#if DNNL_CPU_THREADING_RUNTIME == DNNL_RUNTIME_THREADPOOL
+-    if (verbose_has_profile_externals()) {
+-        acl_set_tp_benchmark_scheduler();
+-    } else {
+-        acl_set_tp_scheduler();
+-    }
+-
+-#endif
+ }
+ 
+ } // namespace acl_thread_utils
 diff --git a/src/cpu/aarch64/acl_thread.hpp b/src/cpu/aarch64/acl_thread.hpp
-index 46dde5eb05..13b3910515 100644
+index f073376e6..654a2aa5d 100644
 --- a/src/cpu/aarch64/acl_thread.hpp
 +++ b/src/cpu/aarch64/acl_thread.hpp
-@@ -34,7 +34,7 @@ void acl_thread_bind();
-
+@@ -40,7 +40,7 @@ void acl_set_benchmark_scheduler_default();
+ 
  #if DNNL_CPU_THREADING_RUNTIME == DNNL_RUNTIME_THREADPOOL
  // Retrieve threadpool size during primitive execution and set ThreadpoolScheduler num_threads
--void acl_set_custom_scheduler();
-+void acl_set_custom_scheduler(int intra_threads);
+-void acl_set_tp_scheduler();
++void acl_set_tp_scheduler(int intra_threads);
  void acl_set_threadpool_num_threads();
- #endif
-
+ // Swap BenchmarkScheduler for custom scheduler builds (i.e. ThreadPoolScheduler) for DNNL_VERBOSE=profile,profile_externals
+ void acl_set_tp_benchmark_scheduler();
 diff --git a/src/cpu/aarch64/acl_threadpool_scheduler.cpp b/src/cpu/aarch64/acl_threadpool_scheduler.cpp
-index 418d7f30f9..7eb8a052b0 100644
+index 439ca862e..6656c37a5 100644
 --- a/src/cpu/aarch64/acl_threadpool_scheduler.cpp
 +++ b/src/cpu/aarch64/acl_threadpool_scheduler.cpp
 @@ -102,8 +102,6 @@ void ThreadpoolScheduler::schedule_op(ICPPKernel *kernel, const Hints &hints,
  void ThreadpoolScheduler::run_workloads(
          std::vector<arm_compute::IScheduler::Workload> &workloads) {
-
+ 
 -    arm_compute::lock_guard<std::mutex> lock(this->_run_workloads_mutex);
 -
      const unsigned int num_threads
              = std::min(static_cast<unsigned int>(_num_threads),
                      static_cast<unsigned int>(workloads.size()));
 diff --git a/src/cpu/cpu_engine.cpp b/src/cpu/cpu_engine.cpp
-index 4ee70a405c..e9211f42e0 100644
+index 0bfec3871..7207b2b60 100644
 --- a/src/cpu/cpu_engine.cpp
 +++ b/src/cpu/cpu_engine.cpp
 @@ -47,6 +47,7 @@ status_t cpu_engine_t::create_stream(stream_t **stream, unsigned flags) {
  #if DNNL_CPU_RUNTIME == DNNL_RUNTIME_THREADPOOL
  status_t cpu_engine_t::create_stream(stream_t **stream,
          dnnl::threadpool_interop::threadpool_iface *threadpool) {
-+    dnnl::impl::cpu::aarch64::acl_thread_utils::acl_set_custom_scheduler(threadpool->get_num_threads());
++    dnnl::impl::cpu::aarch64::acl_thread_utils::acl_set_tp_scheduler(threadpool->get_num_threads());
      return safe_ptr_assign<stream_t>(
              *stream, new cpu_stream_t(this, threadpool));
  }
-diff --git a/src/cpu/cpu_engine.hpp b/src/cpu/cpu_engine.hpp
-index 7aa077e4ef..2938650963 100644
---- a/src/cpu/cpu_engine.hpp
-+++ b/src/cpu/cpu_engine.hpp
-@@ -175,11 +175,6 @@ public:
-         // dnnl_get_max_threads() == OMP_NUM_THREADS
-         dnnl::impl::cpu::aarch64::acl_thread_utils::acl_thread_bind();
- #endif
--
--#if DNNL_CPU_THREADING_RUNTIME == DNNL_RUNTIME_THREADPOOL
--        // Set ACL scheduler for threadpool runtime
--        dnnl::impl::cpu::aarch64::acl_thread_utils::acl_set_custom_scheduler();
--#endif
- #endif
-         return status::success;
-     };
diff --git a/third_party/mkl_dnn/onednn_acl_threadcap.patch b/third_party/mkl_dnn/onednn_acl_threadcap.patch
index fb190861936f89..3a33af153e917c 100644
--- a/third_party/mkl_dnn/onednn_acl_threadcap.patch
+++ b/third_party/mkl_dnn/onednn_acl_threadcap.patch
@@ -1,5 +1,5 @@
  *******************************************************************************
- Copyright 2022 Arm Limited and affiliates.
+ Copyright 2023 Arm Limited and affiliates.
  SPDX-License-Identifier: Apache-2.0
 
  Licensed under the Apache License, Version 2.0 (the "License");
@@ -15,19 +15,19 @@
  limitations under the License.
  *******************************************************************************
 diff --git a/src/cpu/aarch64/acl_thread.cpp b/src/cpu/aarch64/acl_thread.cpp
-index d7d83badc..5a263b8d5 100644
+index fd2c76d01..2d7c76d48 100644
 --- a/src/cpu/aarch64/acl_thread.cpp
 +++ b/src/cpu/aarch64/acl_thread.cpp
 @@ -17,6 +17,8 @@
-
+ #include "cpu/aarch64/acl_thread.hpp"
  #if DNNL_CPU_THREADING_RUNTIME == DNNL_RUNTIME_THREADPOOL
  #include "cpu/aarch64/acl_threadpool_scheduler.hpp"
 +#elif DNNL_CPU_THREADING_RUNTIME == DNNL_RUNTIME_OMP
 +#include <thread>
  #endif
-
- namespace dnnl {
-@@ -29,9 +31,10 @@ namespace acl_thread_utils {
+ #include "cpu/aarch64/acl_benchmark_scheduler.hpp"
+ 
+@@ -30,9 +32,10 @@ namespace acl_thread_utils {
  #if DNNL_CPU_THREADING_RUNTIME == DNNL_RUNTIME_OMP
  void acl_thread_bind() {
      static std::once_flag flag_once;
diff --git a/third_party/mkl_dnn/onednn_acl_threadpool_scheduler.patch b/third_party/mkl_dnn/onednn_acl_threadpool_scheduler.patch
deleted file mode 100644
index 0e0cb39e82f1bb..00000000000000
--- a/third_party/mkl_dnn/onednn_acl_threadpool_scheduler.patch
+++ /dev/null
@@ -1,45 +0,0 @@
- *******************************************************************************
- Copyright 2023 Arm Limited and affiliates.
- SPDX-License-Identifier: Apache-2.0
-
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
-
-     http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
- *******************************************************************************
-
-diff --git a/src/cpu/aarch64/acl_threadpool_scheduler.cpp b/src/cpu/aarch64/acl_threadpool_scheduler.cpp
-index 418d7f30f..439ca862e 100644
---- a/src/cpu/aarch64/acl_threadpool_scheduler.cpp
-+++ b/src/cpu/aarch64/acl_threadpool_scheduler.cpp
-@@ -1,5 +1,5 @@
- /*******************************************************************************
--* Copyright 2022 Arm Ltd. and affiliates
-+* Copyright 2022-2023 Arm Ltd. and affiliates
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
-@@ -117,14 +117,14 @@ void ThreadpoolScheduler::run_workloads(
-     if (is_async) b.init(num_threads);
-     tp->parallel_for(num_threads, [&](int ithr, int nthr) {
-         bool is_main = get_active_threadpool() == tp;
--        if (is_main) activate_threadpool(tp);
-+        if (!is_main) activate_threadpool(tp);
-         // Make ThreadInfo local to avoid race conditions
-         ThreadInfo info;
-         info.cpu_info = &cpu_info();
-         info.num_threads = nthr;
-         info.thread_id = ithr;
-         process_workloads(workloads, feeder, info);
--        if (is_main) deactivate_threadpool();
-+        if (!is_main) deactivate_threadpool();
-         if (is_async) b.notify();
-     });
-     if (is_async) b.wait();
diff --git a/third_party/xla/third_party/mkl_dnn/mkldnn_acl.BUILD b/third_party/xla/third_party/mkl_dnn/mkldnn_acl.BUILD
index 106d86a64fa916..d453ee83240f68 100644
--- a/third_party/xla/third_party/mkl_dnn/mkldnn_acl.BUILD
+++ b/third_party/xla/third_party/mkl_dnn/mkldnn_acl.BUILD
@@ -27,6 +27,7 @@ _DNNL_RUNTIME_THREADPOOL = {
     "#cmakedefine DNNL_SYCL_HIP": "#undef DNNL_SYCL_HIP",
     "#cmakedefine DNNL_ENABLE_STACK_CHECKER": "#undef DNNL_ENABLE_STACK_CHECKER",
     "#cmakedefine DNNL_EXPERIMENTAL": "#undef DNNL_EXPERIMENTAL",
+    "#cmakedefine ONEDNN_BUILD_GRAPH": "#undef ONEDNN_BUILD_GRAPH",
     "#cmakedefine01 BUILD_TRAINING": "#define BUILD_TRAINING 1",
     "#cmakedefine01 BUILD_INFERENCE": "#define BUILD_INFERENCE 0",
     "#cmakedefine01 BUILD_PRIMITIVE_ALL": "#define BUILD_PRIMITIVE_ALL 1",
@@ -74,6 +75,7 @@ _DNNL_RUNTIME_OMP = {
     "#cmakedefine DNNL_SYCL_HIP": "#undef DNNL_SYCL_HIP",
     "#cmakedefine DNNL_ENABLE_STACK_CHECKER": "#undef DNNL_ENABLE_STACK_CHECKER",
     "#cmakedefine DNNL_EXPERIMENTAL": "#undef DNNL_EXPERIMENTAL",
+    "#cmakedefine ONEDNN_BUILD_GRAPH": "#undef ONEDNN_BUILD_GRAPH",
     "#cmakedefine01 BUILD_TRAINING": "#define BUILD_TRAINING 1",
     "#cmakedefine01 BUILD_INFERENCE": "#define BUILD_INFERENCE 0",
     "#cmakedefine01 BUILD_PRIMITIVE_ALL": "#define BUILD_PRIMITIVE_ALL 1",
@@ -124,9 +126,9 @@ expand_template(
     name = "dnnl_version_h",
     out = "include/oneapi/dnnl/dnnl_version.h",
     substitutions = {
-        "@DNNL_VERSION_MAJOR@": "2",
-        "@DNNL_VERSION_MINOR@": "7",
-        "@DNNL_VERSION_PATCH@": "3",
+        "@DNNL_VERSION_MAJOR@": "3",
+        "@DNNL_VERSION_MINOR@": "2",
+        "@DNNL_VERSION_PATCH@": "1",
         "@DNNL_VERSION_HASH@": "N/A",
     },
     template = "include/oneapi/dnnl/dnnl_version.h.in",
@@ -142,6 +144,7 @@ cc_library(
         ],
         exclude = [
             "src/cpu/x64/**",
+            "src/cpu/rv64/**",
         ],
     ),
     copts = select({
diff --git a/third_party/xla/third_party/tsl/third_party/mkl_dnn/mkldnn_acl.BUILD b/third_party/xla/third_party/tsl/third_party/mkl_dnn/mkldnn_acl.BUILD
index d07e0e378c1671..fab95348cdc8b0 100644
--- a/third_party/xla/third_party/tsl/third_party/mkl_dnn/mkldnn_acl.BUILD
+++ b/third_party/xla/third_party/tsl/third_party/mkl_dnn/mkldnn_acl.BUILD
@@ -27,6 +27,7 @@ _DNNL_RUNTIME_THREADPOOL = {
     "#cmakedefine DNNL_SYCL_HIP": "#undef DNNL_SYCL_HIP",
     "#cmakedefine DNNL_ENABLE_STACK_CHECKER": "#undef DNNL_ENABLE_STACK_CHECKER",
     "#cmakedefine DNNL_EXPERIMENTAL": "#undef DNNL_EXPERIMENTAL",
+    "#cmakedefine ONEDNN_BUILD_GRAPH": "#undef ONEDNN_BUILD_GRAPH",
     "#cmakedefine01 BUILD_TRAINING": "#define BUILD_TRAINING 1",
     "#cmakedefine01 BUILD_INFERENCE": "#define BUILD_INFERENCE 0",
     "#cmakedefine01 BUILD_PRIMITIVE_ALL": "#define BUILD_PRIMITIVE_ALL 1",
@@ -74,6 +75,7 @@ _DNNL_RUNTIME_OMP = {
     "#cmakedefine DNNL_SYCL_HIP": "#undef DNNL_SYCL_HIP",
     "#cmakedefine DNNL_ENABLE_STACK_CHECKER": "#undef DNNL_ENABLE_STACK_CHECKER",
     "#cmakedefine DNNL_EXPERIMENTAL": "#undef DNNL_EXPERIMENTAL",
+    "#cmakedefine ONEDNN_BUILD_GRAPH": "#undef ONEDNN_BUILD_GRAPH",
     "#cmakedefine01 BUILD_TRAINING": "#define BUILD_TRAINING 1",
     "#cmakedefine01 BUILD_INFERENCE": "#define BUILD_INFERENCE 0",
     "#cmakedefine01 BUILD_PRIMITIVE_ALL": "#define BUILD_PRIMITIVE_ALL 1",
@@ -124,9 +126,9 @@ expand_template(
     name = "dnnl_version_h",
     out = "include/oneapi/dnnl/dnnl_version.h",
     substitutions = {
-        "@DNNL_VERSION_MAJOR@": "2",
-        "@DNNL_VERSION_MINOR@": "7",
-        "@DNNL_VERSION_PATCH@": "3",
+        "@DNNL_VERSION_MAJOR@": "3",
+        "@DNNL_VERSION_MINOR@": "2",
+        "@DNNL_VERSION_PATCH@": "1",
         "@DNNL_VERSION_HASH@": "N/A",
     },
     template = "include/oneapi/dnnl/dnnl_version.h.in",
@@ -142,6 +144,7 @@ cc_library(
         ],
         exclude = [
             "src/cpu/x64/**",
+            "src/cpu/rv64/**",
         ],
     ),
     copts = select({
diff --git a/third_party/xla/third_party/tsl/tsl/mkl/build_defs.bzl b/third_party/xla/third_party/tsl/tsl/mkl/build_defs.bzl
index 18cc7235166a32..90030a39744c00 100644
--- a/third_party/xla/third_party/tsl/tsl/mkl/build_defs.bzl
+++ b/third_party/xla/third_party/tsl/tsl/mkl/build_defs.bzl
@@ -115,7 +115,7 @@ def onednn_v3_define():
       An empty list of all other cases (include ARM builds).
     """
     return select({
-        "@local_tsl//tsl/mkl:build_with_mkl_aarch64": [],
+        "@local_tsl//tsl/mkl:build_with_mkl_aarch64": ["-DENABLE_ONEDNN_V3"],
         "@local_tsl//tsl:linux_x86_64": ["-DENABLE_ONEDNN_V3"],
         "@local_tsl//tsl:windows": ["-DENABLE_ONEDNN_V3"],
         "//conditions:default": [],
diff --git a/third_party/xla/third_party/tsl/workspace2.bzl b/third_party/xla/third_party/tsl/workspace2.bzl
index 261d456e9beda8..cfd8ab0f9f0472 100644
--- a/third_party/xla/third_party/tsl/workspace2.bzl
+++ b/third_party/xla/third_party/tsl/workspace2.bzl
@@ -166,19 +166,14 @@ def _tf_repositories():
         name = "mkl_dnn_acl_compatible",
         build_file = "//tensorflow/third_party/mkl_dnn:mkldnn_acl.BUILD",
         patch_file = [
-            "//tensorflow/third_party/mkl_dnn:onednn_acl_threadcap.patch",
-            "//tensorflow/third_party/mkl_dnn:onednn_acl_remove_winograd.patch",
-            "//tensorflow/third_party/mkl_dnn:onednn_acl_fixed_format_kernels.patch",
-            "//tensorflow/third_party/mkl_dnn:onednn_acl_depthwise_convolution.patch",
-            "//tensorflow/third_party/mkl_dnn:onednn_acl_threadpool_scheduler.patch",
-            "//tensorflow/third_party/mkl_dnn:onednn_acl_reorder_padded.patch",
-            "//tensorflow/third_party/mkl_dnn:onednn_acl_reorder_update.patch",
-            "//tensorflow/third_party/mkl_dnn:onednn_acl_reorder.patch",
-            "//tensorflow/third_party/mkl_dnn:onednn_acl_thread_local_scheduler.patch",
+            "//third_party/mkl_dnn:onednn_acl_threadcap.patch",
+            "//third_party/mkl_dnn:onednn_acl_reorder.patch",
+            "//third_party/mkl_dnn:onednn_acl_thread_local_scheduler.patch",
+            "//third_party/mkl_dnn:onednn_acl_fp32_bf16_reorder.patch",
         ],
-        sha256 = "a50993aa6265b799b040fe745e0010502f9f7103cc53a9525d59646aef006633",
-        strip_prefix = "oneDNN-2.7.3",
-        urls = tf_mirror_urls("https://github.com/oneapi-src/oneDNN/archive/v2.7.3.tar.gz"),
+        sha256 = "2f76b407ef8893cca71340f88cd800019a1f14f8ac1bbdbb89a84be1370b52e3",
+        strip_prefix = "oneDNN-3.2.1",
+        urls = tf_mirror_urls("https://github.com/oneapi-src/oneDNN/archive/refs/tags/v3.2.1.tar.gz"),
     )
 
     tf_http_archive(

From cf3edc5aa450c8e7a95a93338e06a16e6d207e58 Mon Sep 17 00:00:00 2001
From: Moritz Firsching <firsching@google.com>
Date: Tue, 19 Sep 2023 16:11:04 +0200
Subject: [PATCH 020/567] nicer link to OpenSSF scorecard

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index a40d3d357f19e9..dbe5e9a1c1b233 100644
--- a/README.md
+++ b/README.md
@@ -6,7 +6,7 @@
 [![PyPI](https://badge.fury.io/py/tensorflow.svg)](https://badge.fury.io/py/tensorflow)
 [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.4724125.svg)](https://doi.org/10.5281/zenodo.4724125)
 [![CII Best Practices](https://bestpractices.coreinfrastructure.org/projects/1486/badge)](https://bestpractices.coreinfrastructure.org/projects/1486)
-[![OpenSSF Scorecard](https://api.securityscorecards.dev/projects/github.com/tensorflow/tensorflow/badge)](https://api.securityscorecards.dev/projects/github.com/tensorflow/tensorflow)
+[![OpenSSF Scorecard](https://api.securityscorecards.dev/projects/github.com/tensorflow/tensorflow/badge)](https://securityscorecards.dev/viewer/?uri=github.com/tensorflow/tensorflow)
 [![Fuzzing Status](https://oss-fuzz-build-logs.storage.googleapis.com/badges/tensorflow.svg)](https://bugs.chromium.org/p/oss-fuzz/issues/list?sort=-opened&can=1&q=proj:tensorflow)
 [![Fuzzing Status](https://oss-fuzz-build-logs.storage.googleapis.com/badges/tensorflow-py.svg)](https://bugs.chromium.org/p/oss-fuzz/issues/list?sort=-opened&can=1&q=proj:tensorflow-py)
 [![OSSRank](https://shields.io/endpoint?url=https://ossrank.com/shield/44)](https://ossrank.com/p/44)

From 4e101ec87686515d10830da52b735c87ecbaab95 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 20 Sep 2023 02:02:05 -0700
Subject: [PATCH 021/567] compat: Update forward compatibility horizon to
 2023-09-20

PiperOrigin-RevId: 566892239
---
 tensorflow/python/compat/compat.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/python/compat/compat.py b/tensorflow/python/compat/compat.py
index 495482568ac319..0be4fb3a98e8b1 100644
--- a/tensorflow/python/compat/compat.py
+++ b/tensorflow/python/compat/compat.py
@@ -29,7 +29,7 @@
 # This value changes every day with an automatic CL. It can be modified in code
 # via `forward_compatibility_horizon()` or with the environment variable
 # TF_FORWARD_COMPATIBILITY_DELTA_DAYS, which is added to the compatibility date.
-_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2023, 9, 19)
+_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2023, 9, 20)
 _FORWARD_COMPATIBILITY_DELTA_DAYS_VAR_NAME = "TF_FORWARD_COMPATIBILITY_DELTA_DAYS"
 _FORWARD_COMPATIBILITY_DATE_NUMBER = None
 

From 79be3fa6129625e695913c99ad0d7592c733fd27 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 20 Sep 2023 02:02:10 -0700
Subject: [PATCH 022/567] Update GraphDef version to 1625.

PiperOrigin-RevId: 566892275
---
 tensorflow/core/public/version.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/core/public/version.h b/tensorflow/core/public/version.h
index 6736c45852074b..05991cc0e70419 100644
--- a/tensorflow/core/public/version.h
+++ b/tensorflow/core/public/version.h
@@ -108,7 +108,7 @@ limitations under the License.
 
 #define TF_GRAPH_DEF_VERSION_MIN_PRODUCER 0
 #define TF_GRAPH_DEF_VERSION_MIN_CONSUMER 0
-#define TF_GRAPH_DEF_VERSION 1624  // Updated: 2023/9/19
+#define TF_GRAPH_DEF_VERSION 1625  // Updated: 2023/9/20
 
 // Checkpoint compatibility versions (the versions field in SavedSliceMeta).
 //

From f12ce2113e361dac536d95d6f1b396048259a3c3 Mon Sep 17 00:00:00 2001
From: Alan Kelly <alankelly@google.com>
Date: Wed, 20 Sep 2023 02:12:01 -0700
Subject: [PATCH 023/567] Conv3D doesn't have to transpose weights

PiperOrigin-RevId: 566894491
---
 tensorflow/lite/kernels/conv3d.cc             | 63 +++----------------
 .../internal/optimized/optimized_ops.h        | 34 ++++------
 2 files changed, 21 insertions(+), 76 deletions(-)

diff --git a/tensorflow/lite/kernels/conv3d.cc b/tensorflow/lite/kernels/conv3d.cc
index 9f775038ca8760..cf9eb2a8d4d89c 100644
--- a/tensorflow/lite/kernels/conv3d.cc
+++ b/tensorflow/lite/kernels/conv3d.cc
@@ -46,17 +46,14 @@ static constexpr size_t kMaxIm2colBufferSizeMobile = 1024 * 1024 * 1024;  // 1GB
 struct OpData {
   Padding3DValues padding;
   int im2col_tensor_id = kTensorNotAllocated;
-  int transposed_filter_tensor_id = kTensorNotAllocated;
 
   bool need_im2col = false;
-  bool need_transposed_filter = false;
 
   // Disable im2col if the temporary im2col tensor requires too much memory
   // (i.e. >= kMaxIm2colBufferSizeMobile).
   bool im2col_oversized = false;
 
   int32_t im2col_index;
-  int32_t transposed_filter_index;
 };
 
 void* Init(TfLiteContext* context, const char* buffer, size_t length) {
@@ -83,16 +80,12 @@ TfLiteStatus AllocateTemporaryTensorsIfRequired(
 
   opdata->need_im2col = (kernel_type == kGenericOptimized) &&
                         (need_dilated_im2col || need_non_dilated_im2col);
-  // TODO(b/183455632): Add transposing logic in converter so constant folding
-  // might work on constant filter tensor.
-  opdata->need_transposed_filter = (kernel_type == kGenericOptimized);
 
   // On mobile platforms, the generic optimized kernel will not be used if the
   // temporary im2col tensor requires too much memory.
   if (IsMobilePlatform() && opdata->need_im2col &&
       im2col_bytes >= kMaxIm2colBufferSizeMobile) {
     opdata->need_im2col = false;
-    opdata->need_transposed_filter = false;
     opdata->im2col_oversized = true;
   }
 
@@ -104,15 +97,6 @@ TfLiteStatus AllocateTemporaryTensorsIfRequired(
     opdata->im2col_index = temporaries_count++;
   }
 
-  if (opdata->need_transposed_filter) {
-    if (opdata->transposed_filter_tensor_id == kTensorNotAllocated) {
-      TF_LITE_ENSURE_OK(
-          context, context->AddTensors(context, 1,
-                                       &opdata->transposed_filter_tensor_id));
-    }
-    opdata->transposed_filter_index = temporaries_count++;
-  }
-
   TfLiteIntArrayFree(node->temporaries);
   node->temporaries = TfLiteIntArrayCreate(temporaries_count);
   return kTfLiteOk;
@@ -212,25 +196,6 @@ TfLiteStatus Prepare(KernelType kernel_type, TfLiteContext* context,
                       context->ResizeTensor(context, im2col, im2col_size));
   }
 
-  if (opdata->need_transposed_filter) {
-    TfLiteIntArray* transposed_filter_size = TfLiteIntArrayCreate(5);
-    transposed_filter_size->data[0] = filter->dims->data[4];
-    transposed_filter_size->data[1] = filter->dims->data[0];
-    transposed_filter_size->data[2] = filter->dims->data[1];
-    transposed_filter_size->data[3] = filter->dims->data[2];
-    transposed_filter_size->data[4] = filter->dims->data[3];
-
-    TfLiteTensor* transposed_filter;
-    node->temporaries->data[opdata->transposed_filter_index] =
-        opdata->transposed_filter_tensor_id;
-    TF_LITE_ENSURE_OK(context, GetTemporarySafe(context, node,
-                                                opdata->transposed_filter_index,
-                                                &transposed_filter));
-    transposed_filter->type = filter->type;
-    transposed_filter->allocation_type = kTfLiteArenaRw;
-    TF_LITE_ENSURE_OK(context, context->ResizeTensor(context, transposed_filter,
-                                                     transposed_filter_size));
-  }
   return kTfLiteOk;
 }
 
@@ -239,11 +204,11 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   return Prepare(kernel_type, context, node);
 }
 
-void EvalFloat(KernelType kernel_type, TfLiteContext* context, TfLiteNode* node,
-               TfLiteConv3DParams* params, OpData* opdata,
-               const TfLiteTensor* input, const TfLiteTensor* filter,
-               const TfLiteTensor* bias, TfLiteTensor* im2col,
-               TfLiteTensor* tranposed_filter, TfLiteTensor* output) {
+TfLiteStatus EvalFloat(KernelType kernel_type, TfLiteContext* context,
+                       TfLiteNode* node, TfLiteConv3DParams* params,
+                       OpData* opdata, const TfLiteTensor* input,
+                       const TfLiteTensor* filter, const TfLiteTensor* bias,
+                       TfLiteTensor* im2col, TfLiteTensor* output) {
   float output_activation_min, output_activation_max;
   CalculateActivationRange(params->activation, &output_activation_min,
                            &output_activation_max);
@@ -265,19 +230,17 @@ void EvalFloat(KernelType kernel_type, TfLiteContext* context, TfLiteNode* node,
                             GetTensorData<float>(filter), GetTensorShape(bias),
                             GetTensorData<float>(bias), GetTensorShape(output),
                             GetTensorData<float>(output));
-      break;
+      return kTfLiteOk;
     }
     case kGenericOptimized: {
-      optimized_ops::Conv3D(
+      return optimized_ops::Conv3D(
           runtime_params, GetTensorShape(input), GetTensorData<float>(input),
           GetTensorShape(filter), GetTensorData<float>(filter),
           GetTensorShape(bias), GetTensorData<float>(bias),
           GetTensorShape(output), GetTensorData<float>(output),
           GetTensorShape(im2col), GetTensorData<float>(im2col),
-          GetTensorShape(tranposed_filter),
-          GetTensorData<float>(tranposed_filter),
           CpuBackendContext::GetFromContext(context));
-    } break;
+    }
   }
 }
 
@@ -297,11 +260,6 @@ TfLiteStatus Eval(KernelType kernel_type, TfLiteContext* context,
   TfLiteTensor* im2col = opdata->need_im2col
                              ? &context->tensors[opdata->im2col_tensor_id]
                              : nullptr;
-  TfLiteTensor* transposed_filter =
-      opdata->need_transposed_filter
-          ? &context->tensors[opdata->transposed_filter_tensor_id]
-          : nullptr;
-
   // Fallback to reference execution path when im2col is needed but disabled.
   if (opdata->im2col_oversized) {
     kernel_type = kReference;
@@ -309,9 +267,8 @@ TfLiteStatus Eval(KernelType kernel_type, TfLiteContext* context,
 
   switch (input->type) {
     case kTfLiteFloat32:
-      EvalFloat(kernel_type, context, node, params, opdata, input, filter, bias,
-                im2col, transposed_filter, output);
-      break;
+      return EvalFloat(kernel_type, context, node, params, opdata, input,
+                       filter, bias, im2col, output);
     default:
       TF_LITE_KERNEL_LOG(context, "Type %s currently not supported.",
                          TfLiteTypeGetName(input->type));
diff --git a/tensorflow/lite/kernels/internal/optimized/optimized_ops.h b/tensorflow/lite/kernels/internal/optimized/optimized_ops.h
index 847d870065cc4d..5609719398fcee 100644
--- a/tensorflow/lite/kernels/internal/optimized/optimized_ops.h
+++ b/tensorflow/lite/kernels/internal/optimized/optimized_ops.h
@@ -7957,15 +7957,13 @@ inline void ArgMax(const RuntimeShape& input1_shape, const T1* input1_data,
   ArgMax(input1_shape, input1_data, input2_data, output_shape, output_data);
 }
 
-inline void Conv3D(const Conv3DParams& params, const RuntimeShape& input_shape,
-                   const float* input_data, const RuntimeShape& filter_shape,
-                   const float* filter_data, const RuntimeShape& bias_shape,
-                   const float* bias_data, const RuntimeShape& output_shape,
-                   float* output_data, const RuntimeShape& im2col_shape,
-                   float* im2col_data,
-                   const RuntimeShape& transposed_filter_shape,
-                   float* transposed_filter_data,
-                   CpuBackendContext* cpu_backend_context) {
+inline TfLiteStatus Conv3D(
+    const Conv3DParams& params, const RuntimeShape& input_shape,
+    const float* input_data, const RuntimeShape& filter_shape,
+    const float* filter_data, const RuntimeShape& bias_shape,
+    const float* bias_data, const RuntimeShape& output_shape,
+    float* output_data, const RuntimeShape& im2col_shape, float* im2col_data,
+    CpuBackendContext* cpu_backend_context) {
   const int stride_depth = params.stride_depth;
   const int stride_height = params.stride_height;
   const int stride_width = params.stride_width;
@@ -8012,24 +8010,13 @@ inline void Conv3D(const Conv3DParams& params, const RuntimeShape& input_shape,
     gemm_input_shape = &input_shape;
   }
 
-  // Transpose the filter tensor.
-  TransposeParams transpose_params;
-  transpose_params.perm_count = 5;
-  transpose_params.perm[0] = 4;
-  transpose_params.perm[1] = 0;
-  transpose_params.perm[2] = 1;
-  transpose_params.perm[3] = 2;
-  transpose_params.perm[4] = 3;
-  Transpose<float, 5>(transpose_params, filter_shape, filter_data,
-                      transposed_filter_shape, transposed_filter_data);
-
   const int gemm_input_dims = gemm_input_shape->DimensionsCount();
   int m = FlatSizeSkipDim(*gemm_input_shape, gemm_input_dims - 1);
   int n = output_shape.Dims(4);
   int k = gemm_input_shape->Dims(gemm_input_dims - 1);
 
   cpu_backend_gemm::MatrixParams<float> lhs_params;
-  lhs_params.order = cpu_backend_gemm::Order::kRowMajor;
+  lhs_params.order = cpu_backend_gemm::Order::kColMajor;
   lhs_params.rows = n;
   lhs_params.cols = k;
   cpu_backend_gemm::MatrixParams<float> rhs_params;
@@ -8044,9 +8031,10 @@ inline void Conv3D(const Conv3DParams& params, const RuntimeShape& input_shape,
   gemm_params.bias = bias_data;
   gemm_params.clamp_min = output_activation_min;
   gemm_params.clamp_max = output_activation_max;
-  cpu_backend_gemm::Gemm(lhs_params, transposed_filter_data, rhs_params,
-                         gemm_input_data, dst_params, output_data, gemm_params,
+  cpu_backend_gemm::Gemm(lhs_params, filter_data, rhs_params, gemm_input_data,
+                         dst_params, output_data, gemm_params,
                          cpu_backend_context);
+  return kTfLiteOk;
 }
 
 // Returns in 'im_data' (assumed to be zero-initialized) image patch in storage

From fea0c082192a4063faf3b2f54a48b52cbb602487 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tam=C3=A1s=20Danyluk?= <tdanyluk@google.com>
Date: Wed, 20 Sep 2023 02:49:10 -0700
Subject: [PATCH 024/567] [XLA:GPU] Fix boundary checks for split-K GEMM
 fusions

Without this fix, we get CUDA_ERROR_ILLEGAL_ADDRESS on the added test case.

PiperOrigin-RevId: 566902695
---
 .../xla/xla/service/gpu/ir_emitter_triton.cc  | 30 ++++++++++++-----
 .../xla/service/gpu/ir_emitter_triton_test.cc | 32 +++++++++++++++++++
 2 files changed, 54 insertions(+), 8 deletions(-)

diff --git a/third_party/xla/xla/service/gpu/ir_emitter_triton.cc b/third_party/xla/xla/service/gpu/ir_emitter_triton.cc
index bce462fea8bc5c..d431f1349c294f 100644
--- a/third_party/xla/xla/service/gpu/ir_emitter_triton.cc
+++ b/third_party/xla/xla/service/gpu/ir_emitter_triton.cc
@@ -432,9 +432,16 @@ Value EmitConstant(ImplicitLocOpBuilder& b, const HloInstruction& constant) {
 }
 
 struct DimProperties {
+  DimProperties(int64_t index, Value offset, int block_size, int split_value)
+      : index(index),
+        offset(offset),
+        block_size(block_size),
+        split_value(split_value) {}
+
   int64_t index;
   Value offset;
   int block_size;
+  int split_value;
 };
 
 Value EmitBroadcast(ImplicitLocOpBuilder& b,
@@ -1075,7 +1082,7 @@ class MatMulEmitterHelper {
         // is logically split, major part is addressed using pid_batch.
         count /= *dims_.lhs_noncontracting_split;
       }
-      if (count % properties.block_size != 0) {
+      if (count % (properties.block_size * properties.split_value) != 0) {
         boundary_checks.push_back(bounds.size());
       }
       bounds.push_back(Cst64(count));
@@ -1269,18 +1276,24 @@ Status EmitMatMul(mlir::OpBuilder builder, absl::string_view libdevice_path,
 
   Side lhs{TritonFusionAnalysis::Scope::LHS,
            /*tiled_dims=*/
-           {{dims.lhs_noncontracting_dim_idx, pid_m_offset, block_m},
-            {dims.lhs_contracting_dim_idx, pid_k_offset, block_k}},
+           {DimProperties(dims.lhs_noncontracting_dim_idx, pid_m_offset,
+                          block_m, /*split_value=*/1),
+            DimProperties(dims.lhs_contracting_dim_idx, pid_k_offset, block_k,
+                          split_k)},
            dims.lhs_batch_dim_idx};
   Side rhs{TritonFusionAnalysis::Scope::RHS,
            /*tiled_dims=*/
-           {{dims.rhs_contracting_dim_idx, pid_k_offset, block_k},
-            {dims.rhs_noncontracting_dim_idx, pid_n_offset, block_n}},
+           {DimProperties(dims.rhs_contracting_dim_idx, pid_k_offset, block_k,
+                          split_k),
+            DimProperties(dims.rhs_noncontracting_dim_idx, pid_n_offset,
+                          block_n, /*split_value=*/1)},
            dims.rhs_batch_dim_idx};
   Side out{TritonFusionAnalysis::Scope::OUTPUT,
            /*tiled_dims=*/
-           {{dims.out_lhs_noncontracting_dim_idx, pid_m_offset, block_m},
-            {dims.out_rhs_noncontracting_dim_idx, pid_n_offset, block_n}},
+           {DimProperties(dims.out_lhs_noncontracting_dim_idx, pid_m_offset,
+                          block_m, /*split_value=*/1),
+            DimProperties(dims.out_rhs_noncontracting_dim_idx, pid_n_offset,
+                          block_n, /*split_value=*/1)},
            dims.out_batch_dim_idx};
 
   auto body_builder = [&](mlir::OpBuilder&, mlir::Location, Value ki,
@@ -1516,7 +1529,8 @@ Status EmitSoftMax(mlir::OpBuilder builder, absl::string_view libdevice_path,
       b, make_tensor_pointer(fn.getArgument(0)), boundary_checks);
   // Dimension 0 is the reduced one by construction and it's the only one
   // present in the tile shapes.
-  std::vector<DimProperties> tiled_dims = {{0, row_index, block_row}};
+  std::vector<DimProperties> tiled_dims = {
+      DimProperties(0, row_index, block_row, /*split_value=*/1)};
   TF_ASSIGN_OR_RETURN(
       Value result,
       EmitScope(b, libdevice_path, &analysis,
diff --git a/third_party/xla/xla/service/gpu/ir_emitter_triton_test.cc b/third_party/xla/xla/service/gpu/ir_emitter_triton_test.cc
index 2ffe9356e39bd9..65f6906475d8e2 100644
--- a/third_party/xla/xla/service/gpu/ir_emitter_triton_test.cc
+++ b/third_party/xla/xla/service/gpu/ir_emitter_triton_test.cc
@@ -235,6 +235,38 @@ ENTRY entry {
   EXPECT_GT(result.shmem_bytes, dev_info.shared_memory_per_block);
 }
 
+TEST_F(TritonGemmTest, WorksWhenKIsDivisibleByBlockKButNotByBlockKTimesSplitK) {
+  // The condition mentioned in the test name is fulfilled by
+  // GemmKey(16, 64, 256, 8, 1, 4), which was part of the default configs for
+  // Ampere at the time of the addition of this test case.
+  constexpr absl::string_view kHloText = R"(
+HloModule extracted
+
+ENTRY e {
+  a = f16[16,5120]{1,0} parameter(0)
+  b = s8[5120,10240]{1,0} parameter(1)
+  converted_b = f16[5120,10240]{1,0} convert(b)
+  ROOT r = f16[16,10240]{1,0} dot(a, converted_b), lhs_contracting_dims={1}, rhs_contracting_dims={0}
+}
+)";
+
+  // This check tests if Triton is used at all plus it runs TritonAutotuner,
+  // which verifies if the generated kernels can run without errors such as
+  // CUDA_ERROR_ILLEGAL_ADDRESS.
+  MatchOptimizedHlo(kHloText, R"(
+; CHECK: ENTRY
+; CHECK-NEXT: parameter
+; CHECK-NEXT: parameter
+; CHECK-NEXT: fusion(
+; CHECK-SAME: kind=kCustom
+; CHECK-SAME: "block_m":
+  )");
+
+  // Not doing a comparison here, because the input matrices are quite big.
+  // If I reduce their size then they can no longer trigger the error, that I
+  // want to avoid with this test case.
+}
+
 TEST_F(TritonGemmTest, MultipleDims) {
   const std::string hlo_text = R"(
 HloModule t

From 879ffa3aca8e6bdb3eb9ae1a3081dc7d37ce0d23 Mon Sep 17 00:00:00 2001
From: Fadi Arafeh <fadi.arafeh@arm.com>
Date: Mon, 18 Sep 2023 18:40:30 +0100
Subject: [PATCH 025/567] BF16 capability detection for Ubuntu 20.04

This adds a patch to oneDNN for BF16 capability detection
for Ubuntu 20.04 on aarch64.
The contents of the patch are fully authored by @kawakami-k;
the source used for this patch is available here:
oneapi-src/oneDNN#1670
---
 tensorflow/workspace2.bzl                     |  1 +
 ...capability_detection_for_ubuntu20.04.patch | 50 +++++++++++++++++++
 .../xla/third_party/tsl/workspace2.bzl        |  1 +
 3 files changed, 52 insertions(+)
 create mode 100644 third_party/mkl_dnn/onednn_acl_bf16_capability_detection_for_ubuntu20.04.patch

diff --git a/tensorflow/workspace2.bzl b/tensorflow/workspace2.bzl
index 22d25f101488aa..6c30a25d0397f3 100644
--- a/tensorflow/workspace2.bzl
+++ b/tensorflow/workspace2.bzl
@@ -208,6 +208,7 @@ def _tf_repositories():
             "//third_party/mkl_dnn:onednn_acl_reorder.patch",
             "//third_party/mkl_dnn:onednn_acl_thread_local_scheduler.patch",
             "//third_party/mkl_dnn:onednn_acl_fp32_bf16_reorder.patch",
+            "//third_party/mkl_dnn:onednn_acl_bf16_capability_detection_for_ubuntu20.04.patch",
         ],
         sha256 = "2f76b407ef8893cca71340f88cd800019a1f14f8ac1bbdbb89a84be1370b52e3",
         strip_prefix = "oneDNN-3.2.1",
diff --git a/third_party/mkl_dnn/onednn_acl_bf16_capability_detection_for_ubuntu20.04.patch b/third_party/mkl_dnn/onednn_acl_bf16_capability_detection_for_ubuntu20.04.patch
new file mode 100644
index 00000000000000..6d6f0c0eaabb13
--- /dev/null
+++ b/third_party/mkl_dnn/onednn_acl_bf16_capability_detection_for_ubuntu20.04.patch
@@ -0,0 +1,50 @@
+From 9a9430c7db870b78c6402d786a67921af4a66334 Mon Sep 17 00:00:00 2001
+From: Kentaro Kawakami <kawakami.k@fujitsu.com>
+Date: Fri, 26 May 2023 10:58:36 +0900
+Subject: [PATCH] cpu: aarch64: xbyak_aarch64: BF16 capability detection for
+ Ubuntu 20.04
+
+---
+ .../aarch64/xbyak_aarch64/src/util_impl_linux.h   | 15 ++++++++++++---
+ 1 file changed, 12 insertions(+), 3 deletions(-)
+
+diff --git a/src/cpu/aarch64/xbyak_aarch64/src/util_impl_linux.h b/src/cpu/aarch64/xbyak_aarch64/src/util_impl_linux.h
+index 743843bae50..3db37e972d1 100644
+--- a/src/cpu/aarch64/xbyak_aarch64/src/util_impl_linux.h
++++ b/src/cpu/aarch64/xbyak_aarch64/src/util_impl_linux.h
+@@ -39,6 +39,13 @@
+ #include <asm/hwcap.h>
+ #endif
+ 
++/* Linux kernel used in Ubuntu 20.04 does not have HWCAP2_BF16 definition. */
++#ifdef AT_HWCAP2
++#ifndef HWCAP2_BF16
++#define HWCAP2_BF16 (1UL << 14)
++#endif
++#endif
++
+ namespace Xbyak_aarch64 {
+ namespace util {
+ #define XBYAK_AARCH64_ERROR_ fprintf(stderr, "%s, %d, Error occurrs during read cache infomation.\n", __FILE__, __LINE__);
+@@ -383,7 +390,7 @@ class CpuInfoLinux : public CpuInfo {
+   }
+ 
+   void setHwCap() {
+-    unsigned long hwcap = getauxval(AT_HWCAP);
++    const unsigned long hwcap = getauxval(AT_HWCAP);
+     if (hwcap & HWCAP_ATOMICS)
+       type_ |= (Type)XBYAK_AARCH64_HWCAP_ATOMIC;
+ 
+@@ -391,8 +398,10 @@ class CpuInfoLinux : public CpuInfo {
+       type_ |= (Type)XBYAK_AARCH64_HWCAP_FP;
+     if (hwcap & HWCAP_ASIMD)
+       type_ |= (Type)XBYAK_AARCH64_HWCAP_ADVSIMD;
+-#ifdef HWCAP2_BF16
+-    if (hwcap & HWCAP2_BF16)
++
++#ifdef AT_HWCAP2
++    const unsigned long hwcap2 = getauxval(AT_HWCAP2);
++    if (hwcap2 & HWCAP2_BF16)
+       type_ |= (Type)XBYAK_AARCH64_HWCAP_BF16;
+ #endif
+ 
diff --git a/third_party/xla/third_party/tsl/workspace2.bzl b/third_party/xla/third_party/tsl/workspace2.bzl
index cfd8ab0f9f0472..06b25e92a4cf87 100644
--- a/third_party/xla/third_party/tsl/workspace2.bzl
+++ b/third_party/xla/third_party/tsl/workspace2.bzl
@@ -170,6 +170,7 @@ def _tf_repositories():
             "//third_party/mkl_dnn:onednn_acl_reorder.patch",
             "//third_party/mkl_dnn:onednn_acl_thread_local_scheduler.patch",
             "//third_party/mkl_dnn:onednn_acl_fp32_bf16_reorder.patch",
+            "//third_party/mkl_dnn:onednn_acl_bf16_capability_detection_for_ubuntu20.04.patch",
         ],
         sha256 = "2f76b407ef8893cca71340f88cd800019a1f14f8ac1bbdbb89a84be1370b52e3",
         strip_prefix = "oneDNN-3.2.1",

From f3578fae9a3195dd6ba58cd896a08869c5783ff2 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 20 Sep 2023 05:11:52 -0700
Subject: [PATCH 026/567] Initialize creation_pass_id in GPU Compiler

This sets the creation pass id and the logical creation pass id to -1
for all ops in the input HLO. -1 is the special value which identifies
ops present in the input HLO.

By setting -1 for input ops, we will be able to differentiate ops
originating from the input from ops generated by optimization passes.

PiperOrigin-RevId: 566929969
---
 third_party/xla/xla/service/gpu/gpu_compiler.cc | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/third_party/xla/xla/service/gpu/gpu_compiler.cc b/third_party/xla/xla/service/gpu/gpu_compiler.cc
index 9caa2ccdbbde36..bad85be4677da6 100644
--- a/third_party/xla/xla/service/gpu/gpu_compiler.cc
+++ b/third_party/xla/xla/service/gpu/gpu_compiler.cc
@@ -293,6 +293,15 @@ void AddHloVerifier(HloPassPipeline* pipeline, HloVerifierOpts&& opts = {},
                                                "hlo verifier");
   }
 }
+
+void SetInstructionMetadata(HloModule* module) {
+  for (HloComputation* computation : module->computations()) {
+    for (HloInstruction* instruction : computation->instructions()) {
+      instruction->set_creation_pass_id(-1);
+      instruction->set_logical_creation_pass_id(-1);
+    }
+  }
+}
 }  // namespace
 
 // Runs optimization passes on the given HLO module.
@@ -344,6 +353,8 @@ Status GpuCompiler::OptimizeHloModule(HloModule* hlo_module,
   layout_insensitive_algsimp_opts
       .set_enable_unconditional_reduce_of_concat_replacement(false);
 
+  SetInstructionMetadata(hlo_module);
+
   HloPassPipeline pre_spmd_pipeline("pre-spmd-partitioner");
   // Run some IR cleanup passes before running the SPMD partitioning
   // passes.

From 1b7a86988e88103ba5567f478f6c9425b3766e01 Mon Sep 17 00:00:00 2001
From: Oleg Shyshkov <shyshkov@google.com>
Date: Wed, 20 Sep 2023 05:22:54 -0700
Subject: [PATCH 027/567] [XLA:GPU] Priority fusion: don't allow partial
 fusions.

Only consider cases where a producer can be fused with all of its consumers. Otherwise, fusion is not beneficial.

PiperOrigin-RevId: 566932055
---
 third_party/xla/xla/service/gpu/BUILD         |  1 +
 .../xla/xla/service/gpu/priority_fusion.cc    | 26 +++++++------------
 2 files changed, 10 insertions(+), 17 deletions(-)

diff --git a/third_party/xla/xla/service/gpu/BUILD b/third_party/xla/xla/service/gpu/BUILD
index 7c3efff98a8496..46de10211d6ccb 100644
--- a/third_party/xla/xla/service/gpu/BUILD
+++ b/third_party/xla/xla/service/gpu/BUILD
@@ -1960,6 +1960,7 @@ cc_library(
         "//xla/service:hlo_cost_analysis",
         "//xla/service:hlo_pass",
         "//xla/service:instruction_fusion",
+        "@com_google_absl//absl/algorithm:container",
         "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/container:flat_hash_set",
         "@com_google_absl//absl/container:inlined_vector",
diff --git a/third_party/xla/xla/service/gpu/priority_fusion.cc b/third_party/xla/xla/service/gpu/priority_fusion.cc
index dfbc6b5cddbcec..1245f5a83458b2 100644
--- a/third_party/xla/xla/service/gpu/priority_fusion.cc
+++ b/third_party/xla/xla/service/gpu/priority_fusion.cc
@@ -25,6 +25,7 @@ limitations under the License.
 #include <utility>
 #include <vector>
 
+#include "absl/algorithm/container.h"
 #include "absl/container/flat_hash_map.h"
 #include "absl/container/flat_hash_set.h"
 #include "absl/container/inlined_vector.h"
@@ -114,7 +115,7 @@ class GpuPriorityFusionQueue : public FusionQueue {
       if (priority < 0) {
         continue;
       }
-      current_consumers_ = GetFusibleUsers(current_producer_);
+      current_consumers_ = current_producer_->users();
     }
 
     auto next_consumer = current_consumers_.back();
@@ -230,31 +231,22 @@ class GpuPriorityFusionQueue : public FusionQueue {
   // Returns the priority of the producer based on its current operands and
   // users.
   Priority CalculateProducerPriority(HloInstruction* producer) {
-    std::vector<HloInstruction*> fusible_users = GetFusibleUsers(producer);
-
-    // Don't bother computing cost for non-fusible ops.
-    if (fusible_users.empty()) {
+    // Don't fuse if we can't fuse in all users.
+    if (!CanFuseWithAllUsers(producer)) {
       return std::numeric_limits<Priority>::min();
     }
 
     GpuPerformanceModel::RunTimes run_times =
         GpuPerformanceModel::EstimateRunTimes(producer, &cost_analysis_,
-                                              fusible_users);
+                                              producer->users());
     return absl::ToInt64Nanoseconds(run_times.time_unfused -
                                     run_times.time_fused);
   }
 
-  std::vector<HloInstruction*> GetFusibleUsers(HloInstruction* producer) const {
-    std::vector<HloInstruction*> fusible_users;
-    for (auto user : producer->users()) {
-      int64_t operand_index = user->operand_index(producer);
-
-      if (can_fuse_(user, operand_index)) {
-        fusible_users.push_back(user);
-      }
-    }
-
-    return fusible_users;
+  bool CanFuseWithAllUsers(HloInstruction* producer) const {
+    return absl::c_all_of(producer->users(), [&](HloInstruction* user) {
+      return can_fuse_(user, user->operand_index(producer));
+    });
   }
 
   // Store computation for cost analysis.

From 40567bcecebfaca299cd1bc1ac8019bddb2f4db6 Mon Sep 17 00:00:00 2001
From: Rahul Batra <rahbatra@amd.com>
Date: Tue, 27 Sep 2022 16:03:26 +0000
Subject: [PATCH 028/567] [ROCm]: Updates for rocm_dnn header dependency

---
 tensorflow/core/kernels/BUILD  | 2 ++
 tensorflow/core/platform/BUILD | 4 +++-
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD
index 37aa3cbf625cde..c258e5c5a0ff7e 100644
--- a/tensorflow/core/kernels/BUILD
+++ b/tensorflow/core/kernels/BUILD
@@ -2397,6 +2397,8 @@ tf_kernel_library(
         ":conv_ops_gpu_hdrs",
     ]) + if_cuda([
         "@local_config_cuda//cuda:cudnn_header",
+    ])+ if_rocm([
+	    "//tensorflow/core/platform:stream_executor",
     ]),
 )
 
diff --git a/tensorflow/core/platform/BUILD b/tensorflow/core/platform/BUILD
index 28f25dedc66fb9..229277ad00c0d8 100644
--- a/tensorflow/core/platform/BUILD
+++ b/tensorflow/core/platform/BUILD
@@ -1109,7 +1109,9 @@ tf_cuda_library(
         "@local_xla//xla/stream_executor/host:host_platform_id",
         "@local_xla//xla/stream_executor/platform:dso_loader",
         "@local_xla//xla/stream_executor/rocm:rocm_platform_id",
-    ],
+    ] + if_rocm_is_configured([
+        "@local_xla//xla/stream_executor/rocm:miopen_plugin",
+    ]),
 )
 
 # Like stream_executor library, but compiles without --config=cuda

From 9ff97264b103695012907913ebce1d0a3d3b0930 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 20 Sep 2023 07:39:32 -0700
Subject: [PATCH 029/567] Integrate LLVM at llvm/llvm-project@2baf4a06ef06

Updates LLVM usage to match
[2baf4a06ef06](https://github.com/llvm/llvm-project/commit/2baf4a06ef06)

PiperOrigin-RevId: 566960191
---
 third_party/llvm/workspace.bzl | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/third_party/llvm/workspace.bzl b/third_party/llvm/workspace.bzl
index 9a1d9a278d1d19..a1abdc50e811c6 100644
--- a/third_party/llvm/workspace.bzl
+++ b/third_party/llvm/workspace.bzl
@@ -4,8 +4,8 @@ load("//third_party:repo.bzl", "tf_http_archive")
 
 def repo(name):
     """Imports LLVM."""
-    LLVM_COMMIT = "bbe3ee061f6d4698988b7c7aae0d607f0f2e3db5"
-    LLVM_SHA256 = "25488abfd0967154c37b0472eea7d55b1d1c5854207b0a0aabf80cfd8de5d629"
+    LLVM_COMMIT = "2baf4a06ef06c51c2ef09f981f204983b0f8082c"
+    LLVM_SHA256 = "0ce881f09d65b27810160d02842d42259209506fac98f3f9389059c8b8429d69"
 
     tf_http_archive(
         name = name,

From 0d09ec0c1e2cce294b338e8f2e0e52ce9ccdc5e3 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 20 Sep 2023 08:07:13 -0700
Subject: [PATCH 030/567] Move missed bazel config to LLVM17 toolchain

PiperOrigin-RevId: 566966919
---
 ci/official/bazelrcs/cpu.bazelrc  | 16 ++++++++--------
 ci/official/bazelrcs/cuda.bazelrc | 22 +++++++++++-----------
 2 files changed, 19 insertions(+), 19 deletions(-)

diff --git a/ci/official/bazelrcs/cpu.bazelrc b/ci/official/bazelrcs/cpu.bazelrc
index 43c951e3532466..096893bdc096c0 100644
--- a/ci/official/bazelrcs/cpu.bazelrc
+++ b/ci/official/bazelrcs/cpu.bazelrc
@@ -49,7 +49,7 @@ build --linkopt="-lm"
 build --copt=-Wno-gnu-offsetof-extensions
 
 # Use the NVCC toolchain to compile for manylinux2014
-build --crosstool_top="@sigbuild-r2.14-clang_config_cuda//crosstool:toolchain"
+build --crosstool_top="@sigbuild-r2.14-clang17_config_cuda//crosstool:toolchain"
 
 # Test-related settings below this point.
 test --build_tests_only --keep_going --test_output=errors --verbose_failures=true
@@ -88,14 +88,14 @@ build:rbe --spawn_strategy=remote,worker,standalone,local
 build:rbe --remote_download_toplevel
 build:rbe --action_env=PATH="/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/go/bin"
 build:rbe --linkopt=-lrt --host_linkopt=-lrt --linkopt=-lm --host_linkopt=-lm  # Unclear why this is here
-build:rbe --host_crosstool_top="@sigbuild-r2.14-clang_config_cuda//crosstool:toolchain"
-build:rbe --crosstool_top="@sigbuild-r2.14-clang_config_cuda//crosstool:toolchain"
-build:rbe --extra_toolchains="@sigbuild-r2.14-clang_config_cuda//crosstool:toolchain-linux-x86_64"
-build:rbe --extra_execution_platforms="@sigbuild-r2.14-clang_config_platform//:platform"
-build:rbe --host_platform="@sigbuild-r2.14-clang_config_platform//:platform"
-build:rbe --platforms="@sigbuild-r2.14-clang_config_platform//:platform"
+build:rbe --host_crosstool_top="@sigbuild-r2.14-clang17_config_cuda//crosstool:toolchain"
+build:rbe --crosstool_top="@sigbuild-r2.14-clang17_config_cuda//crosstool:toolchain"
+build:rbe --extra_toolchains="@sigbuild-r2.14-clang17_config_cuda//crosstool:toolchain-linux-x86_64"
+build:rbe --extra_execution_platforms="@sigbuild-r2.14-clang17_config_platform//:platform"
+build:rbe --host_platform="@sigbuild-r2.14-clang17_config_platform//:platform"
+build:rbe --platforms="@sigbuild-r2.14-clang17_config_platform//:platform"
 # Python config is the same across all containers because the binary is the same
-build:rbe --repo_env=TF_PYTHON_CONFIG_REPO="@sigbuild-r2.14-clang_config_python"
+build:rbe --repo_env=TF_PYTHON_CONFIG_REPO="@sigbuild-r2.14-clang17_config_python"
 build:rbe --remote_instance_name=projects/tensorflow-testing/instances/default_instance
 
 # For continuous builds
diff --git a/ci/official/bazelrcs/cuda.bazelrc b/ci/official/bazelrcs/cuda.bazelrc
index 4d6df84931675d..14f4c4b96d0b6e 100644
--- a/ci/official/bazelrcs/cuda.bazelrc
+++ b/ci/official/bazelrcs/cuda.bazelrc
@@ -111,24 +111,24 @@ build:rbe --spawn_strategy=remote,worker,standalone,local
 build:rbe --remote_download_toplevel
 build:rbe --action_env=PATH="/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/go/bin"
 build:rbe --linkopt=-lrt --host_linkopt=-lrt --linkopt=-lm --host_linkopt=-lm  # Unclear why this is here
-build:rbe --host_crosstool_top="@sigbuild-r2.14-clang_config_cuda//crosstool:toolchain"
-build:rbe --crosstool_top="@sigbuild-r2.14-clang_config_cuda//crosstool:toolchain"
-build:rbe --extra_toolchains="@sigbuild-r2.14-clang_config_cuda//crosstool:toolchain-linux-x86_64"
-build:rbe --extra_execution_platforms="@sigbuild-r2.14-clang_config_platform//:platform"
-build:rbe --host_platform="@sigbuild-r2.14-clang_config_platform//:platform"
-build:rbe --platforms="@sigbuild-r2.14-clang_config_platform//:platform"
+build:rbe --host_crosstool_top="@sigbuild-r2.14-clang17_config_cuda//crosstool:toolchain"
+build:rbe --crosstool_top="@sigbuild-r2.14-clang17_config_cuda//crosstool:toolchain"
+build:rbe --extra_toolchains="@sigbuild-r2.14-clang17_config_cuda//crosstool:toolchain-linux-x86_64"
+build:rbe --extra_execution_platforms="@sigbuild-r2.14-clang17_config_platform//:platform"
+build:rbe --host_platform="@sigbuild-r2.14-clang17_config_platform//:platform"
+build:rbe --platforms="@sigbuild-r2.14-clang17_config_platform//:platform"
 # Python config is the same across all containers because the binary is the same
-build:rbe --repo_env=TF_PYTHON_CONFIG_REPO="@sigbuild-r2.14-clang_config_python"
+build:rbe --repo_env=TF_PYTHON_CONFIG_REPO="@sigbuild-r2.14-clang17_config_python"
 build:rbe --remote_instance_name=projects/tensorflow-testing/instances/default_instance
 build:rbe --project_id="tensorflow-testing"
 
 # For Remote build execution -- GPU configuration
 build:rbe --repo_env=REMOTE_GPU_TESTING=1
 test:rbe --test_env=LD_LIBRARY_PATH="/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64"
-build:rbe --repo_env=TF_CUDA_CONFIG_REPO="@sigbuild-r2.14-clang_config_cuda"
-build:rbe --repo_env=TF_TENSORRT_CONFIG_REPO="@sigbuild-r2.14-clang_config_tensorrt"
-build:rbe --repo_env=TF_NCCL_CONFIG_REPO="@sigbuild-r2.14-clang_config_nccl"
-build:rbe --repo_env=TF_PYTHON_CONFIG_REPO="@sigbuild-r2.14-clang_config_python"
+build:rbe --repo_env=TF_CUDA_CONFIG_REPO="@sigbuild-r2.14-clang17_config_cuda"
+build:rbe --repo_env=TF_TENSORRT_CONFIG_REPO="@sigbuild-r2.14-clang17_config_tensorrt"
+build:rbe --repo_env=TF_NCCL_CONFIG_REPO="@sigbuild-r2.14-clang17_config_nccl"
+build:rbe --repo_env=TF_PYTHON_CONFIG_REPO="@sigbuild-r2.14-clang17_config_python"
 
 # For continuous builds
 test:pycpp_filters --test_tag_filters=-no_oss,-oss_excluded,-oss_serial,-benchmark-test,-v1only,gpu,-no_gpu,-no_gpu_presubmit,-no_cuda11

From a8866e20fafb66161e403493f188a5639a023028 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 20 Sep 2023 08:23:00 -0700
Subject: [PATCH 031/567] Update TFRT dependency to use revision
 http://github.com/tensorflow/runtime/commit/752d6d83d403986227dffe42beb5014843cf2ddb.

PiperOrigin-RevId: 566971528
---
 third_party/tf_runtime/workspace.bzl                          | 4 ++--
 third_party/xla/third_party/tf_runtime/workspace.bzl          | 4 ++--
 .../xla/third_party/tsl/third_party/tf_runtime/workspace.bzl  | 4 ++--
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/third_party/tf_runtime/workspace.bzl b/third_party/tf_runtime/workspace.bzl
index 6abf8996baba2f..ab7501924e78b2 100644
--- a/third_party/tf_runtime/workspace.bzl
+++ b/third_party/tf_runtime/workspace.bzl
@@ -6,8 +6,8 @@ def repo():
     """Imports TFRT."""
 
     # Attention: tools parse and update these lines.
-    TFRT_COMMIT = "e20500b4e4cb784965a31f67b062cf4982ab1136"
-    TFRT_SHA256 = "36abbebdb50b97391b5c608e9cd606ee998256dbe7a865723ad208b57bc04d78"
+    TFRT_COMMIT = "752d6d83d403986227dffe42beb5014843cf2ddb"
+    TFRT_SHA256 = "818d9b3951c1da81a937a24c3875bfae38664ceb37909e5438bbebf708279a24"
 
     tf_http_archive(
         name = "tf_runtime",
diff --git a/third_party/xla/third_party/tf_runtime/workspace.bzl b/third_party/xla/third_party/tf_runtime/workspace.bzl
index 6abf8996baba2f..ab7501924e78b2 100644
--- a/third_party/xla/third_party/tf_runtime/workspace.bzl
+++ b/third_party/xla/third_party/tf_runtime/workspace.bzl
@@ -6,8 +6,8 @@ def repo():
     """Imports TFRT."""
 
     # Attention: tools parse and update these lines.
-    TFRT_COMMIT = "e20500b4e4cb784965a31f67b062cf4982ab1136"
-    TFRT_SHA256 = "36abbebdb50b97391b5c608e9cd606ee998256dbe7a865723ad208b57bc04d78"
+    TFRT_COMMIT = "752d6d83d403986227dffe42beb5014843cf2ddb"
+    TFRT_SHA256 = "818d9b3951c1da81a937a24c3875bfae38664ceb37909e5438bbebf708279a24"
 
     tf_http_archive(
         name = "tf_runtime",
diff --git a/third_party/xla/third_party/tsl/third_party/tf_runtime/workspace.bzl b/third_party/xla/third_party/tsl/third_party/tf_runtime/workspace.bzl
index 6abf8996baba2f..ab7501924e78b2 100644
--- a/third_party/xla/third_party/tsl/third_party/tf_runtime/workspace.bzl
+++ b/third_party/xla/third_party/tsl/third_party/tf_runtime/workspace.bzl
@@ -6,8 +6,8 @@ def repo():
     """Imports TFRT."""
 
     # Attention: tools parse and update these lines.
-    TFRT_COMMIT = "e20500b4e4cb784965a31f67b062cf4982ab1136"
-    TFRT_SHA256 = "36abbebdb50b97391b5c608e9cd606ee998256dbe7a865723ad208b57bc04d78"
+    TFRT_COMMIT = "752d6d83d403986227dffe42beb5014843cf2ddb"
+    TFRT_SHA256 = "818d9b3951c1da81a937a24c3875bfae38664ceb37909e5438bbebf708279a24"
 
     tf_http_archive(
         name = "tf_runtime",

From 69923fd52754c1c501e45f8cd87a7d93a74bcc4c Mon Sep 17 00:00:00 2001
From: Reed Wanderman-Milne <reedwm@google.com>
Date: Wed, 20 Sep 2023 08:24:32 -0700
Subject: [PATCH 032/567] Fix layout logic in IrArray::Index::Linearize.

The logic previously assumed the layout was major-to-minor. In practice, I believe this assumption is correct in all places where Linearize is called, because LayoutNormalization converts most instructions to be major-to-minor. But in general, IrArray supports arbitrary layouts and so we should obey the given layout in IrArray::Index::Linearize as well. I am working on a change which calls IrArray::Index::Linearize in another place, and it requires Linearize to correctly handle non-major-to-minor layouts.

Also fix a comment incorrectly stating multidim was major-to-minor.

PiperOrigin-RevId: 566971940
---
 .../service/gpu/tests/slice_to_dynamic.hlo    |  6 +++---
 .../xla/xla/service/llvm_ir/ir_array.cc       | 20 ++++++++++---------
 .../xla/xla/service/llvm_ir/ir_array.h        |  6 +++---
 3 files changed, 17 insertions(+), 15 deletions(-)

diff --git a/third_party/xla/xla/service/gpu/tests/slice_to_dynamic.hlo b/third_party/xla/xla/service/gpu/tests/slice_to_dynamic.hlo
index f5621261cb7504..54aa009ef75a93 100644
--- a/third_party/xla/xla/service/gpu/tests/slice_to_dynamic.hlo
+++ b/third_party/xla/xla/service/gpu/tests/slice_to_dynamic.hlo
@@ -42,9 +42,9 @@
 // CHECK:       custom_call.in_bounds-true:                       ; preds = %[[VAL_11]]
 // CHECK:         %[[VAL_34:.*]] = mul nuw nsw i32 %[[VAL_22]], 1
 // CHECK:         %[[VAL_35:.*]] = add nuw nsw i32 0, %[[VAL_34]]
-// CHECK:         %[[VAL_36:.*]] = mul nuw nsw i32 %[[VAL_25]], 2
+// CHECK:         %[[VAL_36:.*]] = mul nuw nsw i32 %[[VAL_24]], 2
 // CHECK:         %[[VAL_37:.*]] = add nuw nsw i32 %[[VAL_35]], %[[VAL_36]]
-// CHECK:         %[[VAL_38:.*]] = mul nuw nsw i32 %[[VAL_24]], 4
+// CHECK:         %[[VAL_38:.*]] = mul nuw nsw i32 %[[VAL_25]], 4
 // CHECK:         %[[VAL_39:.*]] = add nuw nsw i32 %[[VAL_37]], %[[VAL_38]]
 // CHECK:         %[[VAL_40:.*]] = icmp ult i32 %[[VAL_39]], %[[VAL_15]]
 // CHECK:         br i1 %[[VAL_40]], label %[[VAL_41:.*]], label %[[VAL_29]]
@@ -59,7 +59,7 @@
 // CHECK:         %[[VAL_47:.*]] = mul i32 %[[VAL_44]], %[[VAL_0]]
 // CHECK:         %[[VAL_48:.*]] = udiv i32 %[[VAL_39]], %[[VAL_47]]
 // CHECK:         %[[VAL_49:.*]] = getelementptr inbounds [2 x [2 x [2 x i32]]], ptr %[[VAL_50:.*]], i32 0, i32 %[[VAL_48]], i32 %[[VAL_46]], i32 %[[VAL_43]]
-// CHECK:         %[[VAL_51:.*]] = load i32, ptr %[[VAL_49]], align 4, !invariant.load
+// CHECK:         %[[VAL_51:.*]] = load i32, ptr %[[VAL_49]], align 4, !invariant.load !4
 // CHECK:         %[[VAL_52:.*]] = getelementptr inbounds i32, ptr %[[VAL_31]], i32 %[[VAL_19]]
 // CHECK:         store i32 %[[VAL_51]], ptr %[[VAL_52]], align 4
 // CHECK:         br label %[[VAL_29]]
diff --git a/third_party/xla/xla/service/llvm_ir/ir_array.cc b/third_party/xla/xla/service/llvm_ir/ir_array.cc
index 7708db56edb585..36794b4b757e8f 100644
--- a/third_party/xla/xla/service/llvm_ir/ir_array.cc
+++ b/third_party/xla/xla/service/llvm_ir/ir_array.cc
@@ -442,14 +442,15 @@ llvm::Value* IrArray::Index::Linearize(absl::Span<const int64_t> dimensions,
   CHECK_EQ(size(), dimensions.size());
   llvm::Value* logical_linear_index = GetConstantWithIndexType(0);
   int64_t multiplier = 1;
-  for (ssize_t i = size() - 1; i >= 0; --i) {
-    llvm::Value* addend =
-        builder->CreateMul((*this)[i], GetConstantWithIndexType(multiplier), "",
-                           /*HasNUW=*/true, /*HasNSW=*/true);
+  for (ssize_t i = 0; i < size(); ++i) {
+    int64_t dimension = layout_.minor_to_major(i);
+    llvm::Value* addend = builder->CreateMul(
+        (*this)[dimension], GetConstantWithIndexType(multiplier), "",
+        /*HasNUW=*/true, /*HasNSW=*/true);
     addend = builder->CreateZExtOrTrunc(addend, index_type_);
     logical_linear_index = builder->CreateAdd(logical_linear_index, addend, "",
                                               /*HasNUW=*/true, /*HasNSW=*/true);
-    multiplier *= dimensions[i];
+    multiplier *= dimensions[dimension];
   }
   return logical_linear_index;
 }
@@ -462,14 +463,15 @@ llvm::Value* IrArray::Index::Linearize(
   CHECK_EQ(size(), dynamic_dims.size());
   llvm::Value* logical_linear_index = GetConstantWithIndexType(0);
   llvm::Value* multiplier = GetConstantWithIndexType(1);
-  for (ssize_t i = size() - 1; i >= 0; --i) {
-    llvm::Value* addend = builder->CreateMul((*this)[i], multiplier, "",
+  for (ssize_t i = 0; i < size(); ++i) {
+    int64_t dimension = layout_.minor_to_major(i);
+    llvm::Value* addend = builder->CreateMul((*this)[dimension], multiplier, "",
                                              /*HasNUW=*/true, /*HasNSW=*/true);
     addend = builder->CreateZExtOrTrunc(addend, index_type_);
     logical_linear_index = builder->CreateAdd(logical_linear_index, addend, "",
                                               /*HasNUW=*/true, /*HasNSW=*/true);
-    if (i) {
-      multiplier = builder->CreateMul(multiplier, dynamic_dims[i],
+    if (i < size() - 1) {
+      multiplier = builder->CreateMul(multiplier, dynamic_dims[dimension],
                                       /*Name=*/"multiplier");
     }
   }
diff --git a/third_party/xla/xla/service/llvm_ir/ir_array.h b/third_party/xla/xla/service/llvm_ir/ir_array.h
index b67c1536b20524..fdebb5fbaf17b6 100644
--- a/third_party/xla/xla/service/llvm_ir/ir_array.h
+++ b/third_party/xla/xla/service/llvm_ir/ir_array.h
@@ -42,9 +42,9 @@ namespace llvm_ir {
 // are supported.
 class IrArray {
  public:
-  // A multidimensional index into an IrArray. All the runtime indices
-  // (multidim) and dimensions (Shape::dimensions(), absl::Span<const int64_t>)
-  // are major-first.
+  // A multidimensional index into an IrArray. The order of the runtime indices
+  // (multidim) corresponds to the order of dimensions in the Shape passed to
+  // the constructor.
   //
   // This may also keep a linear index and the layout and dimensions it was
   // emitted for; if the shape where this `Index` is used matches, the linear

From 5f7dd0c0e3df973e1aff926fec11f08df7fcf859 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 20 Sep 2023 08:38:03 -0700
Subject: [PATCH 033/567] Internal, fix build error.

PiperOrigin-RevId: 566975236
---
 .../xla/third_party/tsl/tsl/platform/windows/subprocess.cc    | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/third_party/xla/third_party/tsl/tsl/platform/windows/subprocess.cc b/third_party/xla/third_party/tsl/tsl/platform/windows/subprocess.cc
index a28a30c1cf7510..e31432a0047575 100644
--- a/third_party/xla/third_party/tsl/tsl/platform/windows/subprocess.cc
+++ b/third_party/xla/third_party/tsl/tsl/platform/windows/subprocess.cc
@@ -95,9 +95,9 @@ DWORD WINAPI OutputThreadFunction(LPVOID param) {
 
 SubProcess::SubProcess(int nfds)
     : running_(false),
+      win_pi_(nullptr),
       exec_path_(nullptr),
-      exec_argv_(nullptr),
-      win_pi_(nullptr) {
+      exec_argv_(nullptr) {
   // The input 'nfds' parameter is currently ignored and the internal constant
   // 'kNFds' is used to support the 3 channels (stdin, stdout, stderr).
   for (int i = 0; i < kNFds; i++) {

From 95060833cffbc61eecea769d07bb05914f50d502 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 20 Sep 2023 08:38:21 -0700
Subject: [PATCH 034/567] Update Docker image tags and digests in Kokoro and
 Bazel

This changes the Kokoro job config back to its original state (using the latest-python3.X Docker image tags) and updates the Bazel remote toolchain config to the latest Docker containers. These are the containers created in https://github.com/tensorflow/tensorflow/actions/runs/6239795702/job/16938502214 (the latest tags also refer to them).

PiperOrigin-RevId: 566975309
---
 tensorflow/tools/toolchains/remote_config/configs.bzl     | 8 ++++----
 third_party/xla/.kokoro/linux/cpu/build_cpu.cfg           | 4 +---
 third_party/xla/.kokoro/linux/gpu/build_gpu.cfg           | 3 +--
 .../xla/third_party/tsl/.kokoro/linux/cpu/build_cpu.cfg   | 4 +---
 .../tsl/tools/toolchains/remote_config/configs.bzl        | 8 ++++----
 .../xla/tools/toolchains/remote_config/configs.bzl        | 8 ++++----
 6 files changed, 15 insertions(+), 20 deletions(-)

diff --git a/tensorflow/tools/toolchains/remote_config/configs.bzl b/tensorflow/tools/toolchains/remote_config/configs.bzl
index a7db31e6276f2a..0ef444f89f17d5 100644
--- a/tensorflow/tools/toolchains/remote_config/configs.bzl
+++ b/tensorflow/tools/toolchains/remote_config/configs.bzl
@@ -659,10 +659,10 @@ def initialize_rbe_configs():
 
     sigbuild_tf_configs(
         name_container_map = {
-            "sigbuild-r2.14-clang17": "docker://gcr.io/tensorflow-sigs/build@sha256:77b26125af4e2c6d7955a94d8d75f6dbb1e35a33db5bdaa915806110527ab927",
-            "sigbuild-r2.14-clang17-python3.9": "docker://gcr.io/tensorflow-sigs/build@sha256:77b26125af4e2c6d7955a94d8d75f6dbb1e35a33db5bdaa915806110527ab927",
-            "sigbuild-r2.14-clang17-python3.10": "docker://gcr.io/tensorflow-sigs/build@sha256:fa47f1bc501983fb57e7af0e04f3c45051e42129640ef4d4a10e829d255f11ac",
-            "sigbuild-r2.14-clang17-python3.11": "docker://gcr.io/tensorflow-sigs/build@sha256:6935af1dd34f2f1d663ce1a6c63b3e96595ac9fefdf1e587a9bc53f2bfbf0c47",
+            "sigbuild-r2.14-clang17": "docker://gcr.io/tensorflow-sigs/build@sha256:edf0324686ffbf488ed0a6b7d3a29ecc6f7b7ad9c04b2e1a527a6a47c0dc1b4b",
+            "sigbuild-r2.14-clang17-python3.9": "docker://gcr.io/tensorflow-sigs/build@sha256:edf0324686ffbf488ed0a6b7d3a29ecc6f7b7ad9c04b2e1a527a6a47c0dc1b4b",
+            "sigbuild-r2.14-clang17-python3.10": "docker://gcr.io/tensorflow-sigs/build@sha256:5b06acad335d5c24aa2f63d39d9de230affd04aa982dda0242eeb893b9dae363",
+            "sigbuild-r2.14-clang17-python3.11": "docker://gcr.io/tensorflow-sigs/build@sha256:80d991d5cf0ac710b568c71c7790270691749afa19de49d7c1a121ba3f92cd58",
         },
         # Unclear why LIBC is set to 2.19 here, and yet manylinux2010 is 2.12
         # and manylinux2014 is 2.17.
diff --git a/third_party/xla/.kokoro/linux/cpu/build_cpu.cfg b/third_party/xla/.kokoro/linux/cpu/build_cpu.cfg
index 6f74611c651f3b..a58a2f918320be 100644
--- a/third_party/xla/.kokoro/linux/cpu/build_cpu.cfg
+++ b/third_party/xla/.kokoro/linux/cpu/build_cpu.cfg
@@ -1,7 +1,5 @@
 build_file: "xla/.kokoro/linux/build.sh"
 env_vars: {
   key: "DOCKER_IMAGE"
-  # TODO(b/296975791): Change the tag back to `latest-python3.9` after the LLVM-17 update
-  value: "gcr.io/tensorflow-sigs/build:565341047-python3.9"
-
+  value: "gcr.io/tensorflow-sigs/build:latest-python3.9"
 }
diff --git a/third_party/xla/.kokoro/linux/gpu/build_gpu.cfg b/third_party/xla/.kokoro/linux/gpu/build_gpu.cfg
index 697e639d0659f1..a58a2f918320be 100644
--- a/third_party/xla/.kokoro/linux/gpu/build_gpu.cfg
+++ b/third_party/xla/.kokoro/linux/gpu/build_gpu.cfg
@@ -1,6 +1,5 @@
 build_file: "xla/.kokoro/linux/build.sh"
 env_vars: {
   key: "DOCKER_IMAGE"
-  # TODO(b/296975791): Change the tag back to `latest-python3.9` after the LLVM-17 update
-  value: "gcr.io/tensorflow-sigs/build:565341047-python3.9"
+  value: "gcr.io/tensorflow-sigs/build:latest-python3.9"
 }
diff --git a/third_party/xla/third_party/tsl/.kokoro/linux/cpu/build_cpu.cfg b/third_party/xla/third_party/tsl/.kokoro/linux/cpu/build_cpu.cfg
index fe035133997d6c..8e105be39e67c0 100644
--- a/third_party/xla/third_party/tsl/.kokoro/linux/cpu/build_cpu.cfg
+++ b/third_party/xla/third_party/tsl/.kokoro/linux/cpu/build_cpu.cfg
@@ -1,7 +1,5 @@
 build_file: "tsl/.kokoro/linux/build.sh"
 env_vars: {
   key: "DOCKER_IMAGE"
-  # TODO(b/296975791): Change the tag back to `latest-python3.9` after the LLVM-17 update
-  value: "gcr.io/tensorflow-sigs/build:565341047-python3.9"
-
+  value: "gcr.io/tensorflow-sigs/build:latest-python3.9"
 }
\ No newline at end of file
diff --git a/third_party/xla/third_party/tsl/tools/toolchains/remote_config/configs.bzl b/third_party/xla/third_party/tsl/tools/toolchains/remote_config/configs.bzl
index 4a6c6376453c76..d1e467c45c91b6 100644
--- a/third_party/xla/third_party/tsl/tools/toolchains/remote_config/configs.bzl
+++ b/third_party/xla/third_party/tsl/tools/toolchains/remote_config/configs.bzl
@@ -659,10 +659,10 @@ def initialize_rbe_configs():
 
     sigbuild_tf_configs(
         name_container_map = {
-            "sigbuild-r2.14-clang17": "docker://gcr.io/tensorflow-sigs/build@sha256:77b26125af4e2c6d7955a94d8d75f6dbb1e35a33db5bdaa915806110527ab927",
-            "sigbuild-r2.14-clang17-python3.9": "docker://gcr.io/tensorflow-sigs/build@sha256:77b26125af4e2c6d7955a94d8d75f6dbb1e35a33db5bdaa915806110527ab927",
-            "sigbuild-r2.14-clang17-python3.10": "docker://gcr.io/tensorflow-sigs/build@sha256:fa47f1bc501983fb57e7af0e04f3c45051e42129640ef4d4a10e829d255f11ac",
-            "sigbuild-r2.14-clang17-python3.11": "docker://gcr.io/tensorflow-sigs/build@sha256:6935af1dd34f2f1d663ce1a6c63b3e96595ac9fefdf1e587a9bc53f2bfbf0c47",
+            "sigbuild-r2.14-clang17": "docker://gcr.io/tensorflow-sigs/build@sha256:edf0324686ffbf488ed0a6b7d3a29ecc6f7b7ad9c04b2e1a527a6a47c0dc1b4b",
+            "sigbuild-r2.14-clang17-python3.9": "docker://gcr.io/tensorflow-sigs/build@sha256:edf0324686ffbf488ed0a6b7d3a29ecc6f7b7ad9c04b2e1a527a6a47c0dc1b4b",
+            "sigbuild-r2.14-clang17-python3.10": "docker://gcr.io/tensorflow-sigs/build@sha256:5b06acad335d5c24aa2f63d39d9de230affd04aa982dda0242eeb893b9dae363",
+            "sigbuild-r2.14-clang17-python3.11": "docker://gcr.io/tensorflow-sigs/build@sha256:80d991d5cf0ac710b568c71c7790270691749afa19de49d7c1a121ba3f92cd58",
         },
         # Unclear why LIBC is set to 2.19 here, and yet manylinux2010 is 2.12
         # and manylinux2014 is 2.17.
diff --git a/third_party/xla/tools/toolchains/remote_config/configs.bzl b/third_party/xla/tools/toolchains/remote_config/configs.bzl
index 4a6c6376453c76..d1e467c45c91b6 100644
--- a/third_party/xla/tools/toolchains/remote_config/configs.bzl
+++ b/third_party/xla/tools/toolchains/remote_config/configs.bzl
@@ -659,10 +659,10 @@ def initialize_rbe_configs():
 
     sigbuild_tf_configs(
         name_container_map = {
-            "sigbuild-r2.14-clang17": "docker://gcr.io/tensorflow-sigs/build@sha256:77b26125af4e2c6d7955a94d8d75f6dbb1e35a33db5bdaa915806110527ab927",
-            "sigbuild-r2.14-clang17-python3.9": "docker://gcr.io/tensorflow-sigs/build@sha256:77b26125af4e2c6d7955a94d8d75f6dbb1e35a33db5bdaa915806110527ab927",
-            "sigbuild-r2.14-clang17-python3.10": "docker://gcr.io/tensorflow-sigs/build@sha256:fa47f1bc501983fb57e7af0e04f3c45051e42129640ef4d4a10e829d255f11ac",
-            "sigbuild-r2.14-clang17-python3.11": "docker://gcr.io/tensorflow-sigs/build@sha256:6935af1dd34f2f1d663ce1a6c63b3e96595ac9fefdf1e587a9bc53f2bfbf0c47",
+            "sigbuild-r2.14-clang17": "docker://gcr.io/tensorflow-sigs/build@sha256:edf0324686ffbf488ed0a6b7d3a29ecc6f7b7ad9c04b2e1a527a6a47c0dc1b4b",
+            "sigbuild-r2.14-clang17-python3.9": "docker://gcr.io/tensorflow-sigs/build@sha256:edf0324686ffbf488ed0a6b7d3a29ecc6f7b7ad9c04b2e1a527a6a47c0dc1b4b",
+            "sigbuild-r2.14-clang17-python3.10": "docker://gcr.io/tensorflow-sigs/build@sha256:5b06acad335d5c24aa2f63d39d9de230affd04aa982dda0242eeb893b9dae363",
+            "sigbuild-r2.14-clang17-python3.11": "docker://gcr.io/tensorflow-sigs/build@sha256:80d991d5cf0ac710b568c71c7790270691749afa19de49d7c1a121ba3f92cd58",
         },
         # Unclear why LIBC is set to 2.19 here, and yet manylinux2010 is 2.12
         # and manylinux2014 is 2.17.

From 1f9a25d2f55aeab6c57ece9173f27fb1af3ffda2 Mon Sep 17 00:00:00 2001
From: Jake Harmon <jakeharmon@google.com>
Date: Wed, 20 Sep 2023 09:15:26 -0700
Subject: [PATCH 035/567] Restore TSL/XLA headers in tensorflow/include

The headers were previously in tensorflow/include/tensorflow/tsl and tensorflow/include/tensorflow/compiler/xla, but they can now be found in tensorflow/include/{tsl,xla}. C projects built against the TF wheel should now build unless they directly reference one of these headers.

PiperOrigin-RevId: 566984872
---
 .../tools/pip_package/build_pip_package.sh       |  9 +++++++--
 tensorflow/tools/pip_package/setup.py            | 16 +++++++++++-----
 2 files changed, 18 insertions(+), 7 deletions(-)

diff --git a/tensorflow/tools/pip_package/build_pip_package.sh b/tensorflow/tools/pip_package/build_pip_package.sh
index b202fe0ee5f83b..d83f2096f277a1 100755
--- a/tensorflow/tools/pip_package/build_pip_package.sh
+++ b/tensorflow/tools/pip_package/build_pip_package.sh
@@ -254,10 +254,15 @@ function prepare_src() {
     fi
   fi
 
+  # Move headers from TSL/XLA into tensorflow so that InstallHeaders can move
+  # them back into tensorflow/include
+  cp -rL bazel-bin/tensorflow/tools/pip_package/build_pip_package.runfiles/local_tsl/tsl/ ${TMPDIR}/tensorflow
+  cp -rL bazel-bin/tensorflow/tools/pip_package/build_pip_package.runfiles/local_xla/xla/ ${TMPDIR}/tensorflow/compiler
   # Move vendored files into proper locations
   # This is required because TSL/XLA don't publish their own wheels
-  cp -r bazel-bin/external/local_tsl/tsl/ ${TMPDIR}/tensorflow/tsl
-  cp -r bazel-bin/external/local_xla/xla/ ${TMPDIR}/tensorflow/compiler/xla
+  # TODO(jakeharmon): These two copy statements may no longer be necessary
+  cp -rL bazel-bin/external/local_tsl/tsl/ ${TMPDIR}/tensorflow
+  cp -rL bazel-bin/external/local_xla/xla/ ${TMPDIR}/tensorflow/compiler
   # Fix the proto stubs
   if is_macos; then
     find ${TMPDIR}/tensorflow/ -name "*.py" -type f -exec sed -i '' 's/from tsl\./from tensorflow.tsl./' {} \;
diff --git a/tensorflow/tools/pip_package/setup.py b/tensorflow/tools/pip_package/setup.py
index cfc65d92635a3c..755405fbf15eea 100644
--- a/tensorflow/tools/pip_package/setup.py
+++ b/tensorflow/tools/pip_package/setup.py
@@ -261,13 +261,19 @@ def mkdir_and_copy_file(self, header):
     # symlink within the directory hierarchy.
     # NOTE(keveman): Figure out how to customize bdist_wheel package so
     # we can do the symlink.
-    external_header_locations = [
-        'tensorflow/include/external/eigen_archive/',
-        'tensorflow/include/external/com_google_absl/',
-    ]
+    # pylint: disable=line-too-long
+    external_header_locations = {
+        '/tensorflow/include/external/eigen_archive': '',
+        '/tensorflow/include/external/com_google_absl': '',
+        '/tensorflow/include/tensorflow/compiler/xla': '/tensorflow/include/xla',
+        '/tensorflow/include/tensorflow/tsl': '/tensorflow/include/tsl',
+    }
+    # pylint: enable=line-too-long
+
     for location in external_header_locations:
       if location in install_dir:
-        extra_dir = install_dir.replace(location, '')
+        extra_dir = install_dir.replace(location,
+                                        external_header_locations[location])
         if not os.path.exists(extra_dir):
           self.mkpath(extra_dir)
         self.copy_file(header, extra_dir)

From 1e035a5f6055c0531f35b1ec8508a54fcf2737ca Mon Sep 17 00:00:00 2001
From: Mason Chang <masonchang@google.com>
Date: Wed, 20 Sep 2023 09:25:25 -0700
Subject: [PATCH 036/567] Proposal to create a cluster Bridge API for TF:

v2 - Used when the bridge is called via the Function API, which roughly aligns to TF2.

v1 - Used when the bridge is called via the Session API, which roughly aligns to TF1. Infeed / outfeed ops are not supported on this path.

After this change, existing bridge.cc logic gets pulled out into here.

PiperOrigin-RevId: 566987673
---
 tensorflow/compiler/mlir/tf2xla/api/v1/BUILD  | 23 ++++++++++
 .../compiler/mlir/tf2xla/api/v1/cluster_tf.cc | 34 +++++++++++++++
 .../compiler/mlir/tf2xla/api/v1/cluster_tf.h  | 43 +++++++++++++++++++
 .../mlir/tf2xla/api/v1/cluster_tf_test.cc     | 37 ++++++++++++++++
 tensorflow/compiler/mlir/tf2xla/api/v2/BUILD  | 24 +++++++++++
 .../compiler/mlir/tf2xla/api/v2/cluster_tf.cc | 36 ++++++++++++++++
 .../compiler/mlir/tf2xla/api/v2/cluster_tf.h  | 43 +++++++++++++++++++
 .../mlir/tf2xla/api/v2/cluster_tf_test.cc     | 38 ++++++++++++++++
 8 files changed, 278 insertions(+)
 create mode 100644 tensorflow/compiler/mlir/tf2xla/api/v1/cluster_tf.cc
 create mode 100644 tensorflow/compiler/mlir/tf2xla/api/v1/cluster_tf.h
 create mode 100644 tensorflow/compiler/mlir/tf2xla/api/v1/cluster_tf_test.cc
 create mode 100644 tensorflow/compiler/mlir/tf2xla/api/v2/cluster_tf.cc
 create mode 100644 tensorflow/compiler/mlir/tf2xla/api/v2/cluster_tf.h
 create mode 100644 tensorflow/compiler/mlir/tf2xla/api/v2/cluster_tf_test.cc

diff --git a/tensorflow/compiler/mlir/tf2xla/api/v1/BUILD b/tensorflow/compiler/mlir/tf2xla/api/v1/BUILD
index 827fb33a7874f5..08b5e4d485a6e4 100644
--- a/tensorflow/compiler/mlir/tf2xla/api/v1/BUILD
+++ b/tensorflow/compiler/mlir/tf2xla/api/v1/BUILD
@@ -143,3 +143,26 @@ tf_cc_test(
         "@local_xla//xla/client:client_library",
     ],
 )
+
+cc_library(
+    name = "cluster_tf",
+    srcs = ["cluster_tf.cc"],
+    hdrs = ["cluster_tf.h"],
+    deps = [
+        "//tensorflow/core:lib_proto_parsing",
+        "//tensorflow/core/platform:status",
+        "@llvm-project//mlir:IR",
+        "@local_tsl//tsl/platform:status",
+    ],
+)
+
+tf_cc_test(
+    name = "cluster_tf_test",
+    srcs = ["cluster_tf_test.cc"],
+    deps = [
+        ":cluster_tf",
+        "@com_google_googletest//:gtest_main",
+        "@llvm-project//mlir:IR",
+        "@local_tsl//tsl/lib/core:status_test_util",
+    ],
+)
diff --git a/tensorflow/compiler/mlir/tf2xla/api/v1/cluster_tf.cc b/tensorflow/compiler/mlir/tf2xla/api/v1/cluster_tf.cc
new file mode 100644
index 00000000000000..2994242fb06730
--- /dev/null
+++ b/tensorflow/compiler/mlir/tf2xla/api/v1/cluster_tf.cc
@@ -0,0 +1,34 @@
+/* Copyright 2023 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/mlir/tf2xla/api/v1/cluster_tf.h"
+
+#include "mlir/IR/BuiltinOps.h"  // from @llvm-project
+#include "tensorflow/core/platform/status.h"
+#include "tsl/platform/status.h"
+
+namespace tensorflow {
+namespace tf2xla {
+namespace v1 {
+
+using mlir::ModuleOp;
+
+tensorflow::Status RunSessionTf2xlaClusteringBridge(ModuleOp module) {
+  return tsl::OkStatus();
+}
+
+}  // namespace v1
+}  // namespace tf2xla
+}  // namespace tensorflow
diff --git a/tensorflow/compiler/mlir/tf2xla/api/v1/cluster_tf.h b/tensorflow/compiler/mlir/tf2xla/api/v1/cluster_tf.h
new file mode 100644
index 00000000000000..02a68e741bab10
--- /dev/null
+++ b/tensorflow/compiler/mlir/tf2xla/api/v1/cluster_tf.h
@@ -0,0 +1,43 @@
+/* Copyright 2023 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_MLIR_TF2XLA_API_V1_CLUSTER_TF_H_
+#define TENSORFLOW_COMPILER_MLIR_TF2XLA_API_V1_CLUSTER_TF_H_
+
+#include "mlir/IR/BuiltinOps.h"  // from @llvm-project
+#include "tensorflow/core/lib/core/status.h"
+
+namespace tensorflow {
+namespace tf2xla {
+namespace v1 {
+
+// Run all the passes involved in transforming the graph before execution so
+// that it is suitable for targeting devices when called via the TF1 Session
+// API.
+// These transformations take as input a Tensorflow Graph as an MLIR Module
+// and transform the module in place to cluster the given ops for compilation
+// that is compatible with the given device_type. The MLIR should be in the TF
+// Executor Dialect for graph nodes and edges. Individual ops inside a node
+// should be in the Tensorflow Dialect. The output MLIR is in the TF Executor
+// Dialect.  The input MLIR should not have infeed and outfeed ops, which are
+// unsupported via this API.
+// Returns OkStatus if passed, otherwise an error.
+tensorflow::Status RunSessionTf2xlaClusteringBridge(mlir::ModuleOp module);
+
+}  // namespace v1
+}  // namespace tf2xla
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_COMPILER_MLIR_TF2XLA_API_V1_CLUSTER_TF_H_
diff --git a/tensorflow/compiler/mlir/tf2xla/api/v1/cluster_tf_test.cc b/tensorflow/compiler/mlir/tf2xla/api/v1/cluster_tf_test.cc
new file mode 100644
index 00000000000000..f5d2e9a9e947fd
--- /dev/null
+++ b/tensorflow/compiler/mlir/tf2xla/api/v1/cluster_tf_test.cc
@@ -0,0 +1,37 @@
+/* Copyright 2023 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/mlir/tf2xla/api/v1/cluster_tf.h"
+
+#include <gtest/gtest.h>
+#include "mlir/IR/BuiltinOps.h"  // from @llvm-project
+#include "tsl/lib/core/status_test_util.h"
+
+namespace tensorflow {
+namespace tf2xla {
+namespace v1 {
+namespace {
+
+using mlir::ModuleOp;
+
+TEST(SessionTf2xlaClusteringBridge, ClustersTf) {
+  // Exercise the bridge with a default-constructed (null) module handle.
+  TF_ASSERT_OK(RunSessionTf2xlaClusteringBridge(ModuleOp()));
+}
+
+}  // namespace
+}  // namespace v1
+}  // namespace tf2xla
+}  // namespace tensorflow
diff --git a/tensorflow/compiler/mlir/tf2xla/api/v2/BUILD b/tensorflow/compiler/mlir/tf2xla/api/v2/BUILD
index e71842b3ff866c..acd65e27f78a34 100644
--- a/tensorflow/compiler/mlir/tf2xla/api/v2/BUILD
+++ b/tensorflow/compiler/mlir/tf2xla/api/v2/BUILD
@@ -89,3 +89,27 @@ tf_proto_library(
     srcs = ["device_type.proto"],
     cc_api_version = 2,
 )
+
+cc_library(
+    name = "cluster_tf",
+    srcs = ["cluster_tf.cc"],
+    hdrs = ["cluster_tf.h"],
+    deps = [
+        ":device_type_proto_cc",
+        "//tensorflow/core:lib_proto_parsing",
+        "//tensorflow/core/platform:status",
+        "@llvm-project//mlir:IR",
+        "@local_tsl//tsl/platform:status",
+    ],
+)
+
+tf_cc_test(
+    name = "cluster_tf_test",
+    srcs = ["cluster_tf_test.cc"],
+    deps = [
+        ":cluster_tf",
+        "@com_google_googletest//:gtest_main",
+        "@llvm-project//mlir:IR",
+        "@local_tsl//tsl/lib/core:status_test_util",
+    ],
+)
diff --git a/tensorflow/compiler/mlir/tf2xla/api/v2/cluster_tf.cc b/tensorflow/compiler/mlir/tf2xla/api/v2/cluster_tf.cc
new file mode 100644
index 00000000000000..4e0c220a47589d
--- /dev/null
+++ b/tensorflow/compiler/mlir/tf2xla/api/v2/cluster_tf.cc
@@ -0,0 +1,36 @@
+/* Copyright 2023 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/mlir/tf2xla/api/v2/cluster_tf.h"
+
+#include "mlir/IR/BuiltinOps.h"  // from @llvm-project
+#include "tensorflow/compiler/mlir/tf2xla/api/v2/device_type.pb.h"
+#include "tensorflow/core/platform/status.h"
+#include "tsl/platform/status.h"
+
+namespace tensorflow {
+namespace tf2xla {
+namespace v2 {
+
+using mlir::ModuleOp;
+
+tensorflow::Status RunFunctionTf2xlaClusteringBridge(ModuleOp module,
+                                                     DeviceType device_type) {
+  return tsl::OkStatus();  // Placeholder: no passes are run on `module` yet.
+}
+
+}  // namespace v2
+}  // namespace tf2xla
+}  // namespace tensorflow
diff --git a/tensorflow/compiler/mlir/tf2xla/api/v2/cluster_tf.h b/tensorflow/compiler/mlir/tf2xla/api/v2/cluster_tf.h
new file mode 100644
index 00000000000000..e25ae4ed973ce0
--- /dev/null
+++ b/tensorflow/compiler/mlir/tf2xla/api/v2/cluster_tf.h
@@ -0,0 +1,43 @@
+/* Copyright 2023 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_MLIR_TF2XLA_API_V2_CLUSTER_TF_H_
+#define TENSORFLOW_COMPILER_MLIR_TF2XLA_API_V2_CLUSTER_TF_H_
+
+#include "mlir/IR/BuiltinOps.h"  // from @llvm-project
+#include "tensorflow/compiler/mlir/tf2xla/api/v2/device_type.pb.h"
+#include "tensorflow/core/lib/core/status.h"
+
+namespace tensorflow {
+namespace tf2xla {
+namespace v2 {
+
+// Run all the passes involved in transforming the graph before execution so
+// that it is suitable for targeting devices when called with the TF 2 Function
+// API. Users that need clustering with the Session API should use the v1 Bridge
+// API. These transformations take as input a TensorFlow Graph as an MLIR Module
+// and transform the module in place to cluster the given ops for compilation
+// that is compatible with the given device_type. The MLIR should be in the TF
+// Executor Dialect for graph nodes and edges. Individual ops inside a node
+// should be in the TensorFlow Dialect. The output MLIR is in the TF Executor
+// Dialect. Returns OkStatus if passed, otherwise an error.
+tensorflow::Status RunFunctionTf2xlaClusteringBridge(mlir::ModuleOp module,
+                                                     DeviceType device_type);
+
+}  // namespace v2
+}  // namespace tf2xla
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_COMPILER_MLIR_TF2XLA_API_V2_CLUSTER_TF_H_
diff --git a/tensorflow/compiler/mlir/tf2xla/api/v2/cluster_tf_test.cc b/tensorflow/compiler/mlir/tf2xla/api/v2/cluster_tf_test.cc
new file mode 100644
index 00000000000000..994bf2bd69e470
--- /dev/null
+++ b/tensorflow/compiler/mlir/tf2xla/api/v2/cluster_tf_test.cc
@@ -0,0 +1,38 @@
+/* Copyright 2023 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/mlir/tf2xla/api/v2/cluster_tf.h"
+
+#include <gtest/gtest.h>
+#include "mlir/IR/BuiltinOps.h"  // from @llvm-project
+#include "tsl/lib/core/status_test_util.h"
+
+namespace tensorflow {
+namespace tf2xla {
+namespace v2 {
+namespace {
+
+using mlir::ModuleOp;
+
+TEST(FunctionTf2xlaClusteringBridgeTest, ClustersTf) {
+  // Exercise the bridge with a default-constructed (null) module handle.
+  const DeviceType device_type = DeviceType::XLA_TPU_JIT;
+  TF_ASSERT_OK(RunFunctionTf2xlaClusteringBridge(ModuleOp(), device_type));
+}
+
+}  // namespace
+}  // namespace v2
+}  // namespace tf2xla
+}  // namespace tensorflow

From ab54126289a5ee3976bc4de6d93032cf6b750ffa Mon Sep 17 00:00:00 2001
From: Rahul Joshi <jurahul@google.com>
Date: Wed, 20 Sep 2023 09:28:34 -0700
Subject: [PATCH 037/567] [NFC] Log NCCL calls for AllToAll

PiperOrigin-RevId: 566988443
---
 .../xla/service/gpu/nccl_all_to_all_thunk.cc  | 26 +++++++++++++++++++
 1 file changed, 26 insertions(+)

diff --git a/third_party/xla/xla/service/gpu/nccl_all_to_all_thunk.cc b/third_party/xla/xla/service/gpu/nccl_all_to_all_thunk.cc
index abaa47a29fd1fa..2dcf4ac8fe79f2 100644
--- a/third_party/xla/xla/service/gpu/nccl_all_to_all_thunk.cc
+++ b/third_party/xla/xla/service/gpu/nccl_all_to_all_thunk.cc
@@ -20,6 +20,7 @@ limitations under the License.
 #include <utility>
 #include <vector>
 
+#include "absl/strings/str_format.h"
 #include "xla/service/gpu/ir_emission_utils.h"
 #include "xla/service/gpu/nccl_collective_thunk.h"
 #include "xla/shape_util.h"
@@ -138,9 +139,21 @@ Status RunAllToAll(bool has_split_dimension,
                                                 buffer.element_type);
 
       for (int rank = 0; rank < num_participants; ++rank) {
+        VLOG(3) << absl::StreamFormat(
+            "Calling ncclSend(sendbuff=%p, count=%d, peer=%d, "
+            "comm=%p, stream=%p)",
+            send_buffer + rank * chunk_bytes, chunk_elements, rank,
+            static_cast<const void*>(comm), gpu_stream);
         XLA_CUDA_RETURN_IF_ERROR(ncclSend(send_buffer + rank * chunk_bytes,
                                           chunk_elements, dtype, rank, comm,
                                           gpu_stream));
+
+        VLOG(3) << absl::StreamFormat(
+            "Calling ncclRecv(recvbuff=%p, count=%d, peer=%d, "
+            "comm=%p, stream=%p)",
+            recv_buffer + rank * chunk_bytes, chunk_elements, rank,
+            static_cast<const void*>(comm), gpu_stream);
+
         XLA_CUDA_RETURN_IF_ERROR(ncclRecv(recv_buffer + rank * chunk_bytes,
                                           chunk_elements, dtype, rank, comm,
                                           gpu_stream));
@@ -164,8 +177,21 @@ Status RunAllToAll(bool has_split_dimension,
       int64_t element_count =
           buffer.element_count * dtype_and_multiplier.second;
 
+      VLOG(3) << absl::StreamFormat(
+          "Calling ncclSend(sendbuff=%p, count=%d, peer=%d, "
+          "comm=%p, stream=%p)",
+          send_buffer, element_count, i, static_cast<const void*>(comm),
+          gpu_stream);
+
       XLA_CUDA_RETURN_IF_ERROR(ncclSend(send_buffer, element_count, dtype,
                                         /*rank=*/i, comm, gpu_stream));
+
+      VLOG(3) << absl::StreamFormat(
+          "Calling ncclRecv(recvbuff=%p, count=%d, peer=%d, "
+          "comm=%p, stream=%p)",
+          recv_buffer, element_count, i, static_cast<const void*>(comm),
+          gpu_stream);
+
       XLA_CUDA_RETURN_IF_ERROR(ncclRecv(recv_buffer, element_count, dtype,
                                         /*rank=*/i, comm, gpu_stream));
     }

From 105a1a38382f2574222072d86bf4dbc8ae2fb7ae Mon Sep 17 00:00:00 2001
From: Kuangyuan Chen <chky@google.com>
Date: Wed, 20 Sep 2023 09:43:39 -0700
Subject: [PATCH 038/567] Add tf_mlrt.await_all_control op if there is any
 unused future.

PiperOrigin-RevId: 566992749
---
 .../mlir/tfrt/tests/mlrt/tf_to_mlrt.mlir      | 19 +++++++++++
 .../mlir/tfrt/transforms/mlrt/tf_to_mlrt.cc   | 34 +++++++++++++++++++
 2 files changed, 53 insertions(+)

diff --git a/tensorflow/compiler/mlir/tfrt/tests/mlrt/tf_to_mlrt.mlir b/tensorflow/compiler/mlir/tfrt/tests/mlrt/tf_to_mlrt.mlir
index 12a562bf6b6962..b468f8f88f9b1f 100644
--- a/tensorflow/compiler/mlir/tfrt/tests/mlrt/tf_to_mlrt.mlir
+++ b/tensorflow/compiler/mlir/tfrt/tests/mlrt/tf_to_mlrt.mlir
@@ -417,3 +417,22 @@ func.func @case_test(%arg0: tensor<i32>, %arg1: tensor<f32>,  %arg2: tensor<f32>
   %0 = "tf.Case"(%arg0, %arg1, %arg2) {_lower_using_switch_merge = true, branches = [@branch0, @branch1], is_stateless = true} : (tensor<i32>, tensor<f32>, tensor<f32>) -> tensor<f32>
   func.return %0 : tensor<f32>
 }
+
+// -----
+
+// Test that an await op is added for unused futures.
+
+// CHECK-LABEL: func @unused_future_arg
+// CHECK-SAME: ({{%.*}}: !tf_mlrt.tensor, [[unused:%.*]]: !mlrt.future)
+func.func @unused_future_arg(%x: tensor<i32>, %unused: !mlrt.future) -> tensor<i32> {
+  // CHECK: mlrt.await_all_control [[unused]]
+  return %x : tensor<i32>
+}
+
+// CHECK-LABEL: func @unused_future
+func.func @unused_future(%x: tensor<i32>) -> tensor<i32> {
+  // CHECK: [[unused:%.*]] = tf_mlrt.async_executeop
+  %unused = "tf.TestAsyncIdentity"(%x) {__op_key = 0: i32, T = i32} : (tensor<i32>) -> tensor<i32>
+  // CHECK: mlrt.await_all_control [[unused]]
+  return %x : tensor<i32>
+}
diff --git a/tensorflow/compiler/mlir/tfrt/transforms/mlrt/tf_to_mlrt.cc b/tensorflow/compiler/mlir/tfrt/transforms/mlrt/tf_to_mlrt.cc
index 71b0d7d8403d12..603167cb0f5ef8 100644
--- a/tensorflow/compiler/mlir/tfrt/transforms/mlrt/tf_to_mlrt.cc
+++ b/tensorflow/compiler/mlir/tfrt/transforms/mlrt/tf_to_mlrt.cc
@@ -1015,6 +1015,40 @@ class TfToMlrtConversionPass
       op->replaceAllUsesWith(op->getOperands());
       op->erase();
     });
+
+    AddAwaitOpToUnusedFutures(module);
+  }
+
+  void AddAwaitOpToUnusedFutures(mlir::ModuleOp module) {
+    // For every function with a body, inserts one AwaitAllControlOp before
+    // the entry block's terminator covering all futures that have no uses.
+    for (auto func : module.getOps<mlir::func::FuncOp>()) {
+      // Declarations have no entry block, so front() below would be invalid.
+      if (func.isExternal()) continue;
+      llvm::SmallVector<mlir::Value> unused_futures;
+
+      auto is_unused_future = [](mlir::Value result) {
+        return llvm::isa<::mlrt::compiler::FutureType>(result.getType()) &&
+               result.use_empty();
+      };
+
+      for (auto arg : func.getArguments()) {
+        if (is_unused_future(arg)) unused_futures.push_back(arg);
+      }
+      // Only results of ops directly in the entry block are inspected.
+      for (auto &op : func.getBody().front()) {
+        for (mlir::Value result : op.getResults()) {
+          if (is_unused_future(result)) unused_futures.push_back(result);
+        }
+      }
+
+      if (!unused_futures.empty()) {
+        auto builder =
+            mlir::OpBuilder::atBlockTerminator(&func.getBody().front());
+        builder.create<::mlrt::compiler::AwaitAllControlOp>(func.getLoc(),
+                                                            unused_futures);
+      }
+    }
   }
 
   mlir::LogicalResult PostProcessFunctionSignature(

From d03c477d727b93b71ac1710885c6c918d7754361 Mon Sep 17 00:00:00 2001
From: Raviteja Gorijala <gorijala@google.com>
Date: Wed, 20 Sep 2023 09:51:53 -0700
Subject: [PATCH 039/567] Add licenses and notices for third party libraries

PiperOrigin-RevId: 566995194
---
 .../tools/pip_package/THIRD_PARTY_NOTICES.txt | 9346 +++++++++++++++++
 1 file changed, 9346 insertions(+)
 create mode 100644 tensorflow/tools/pip_package/THIRD_PARTY_NOTICES.txt

diff --git a/tensorflow/tools/pip_package/THIRD_PARTY_NOTICES.txt b/tensorflow/tools/pip_package/THIRD_PARTY_NOTICES.txt
new file mode 100644
index 00000000000000..c0ecfe99bcefff
--- /dev/null
+++ b/tensorflow/tools/pip_package/THIRD_PARTY_NOTICES.txt
@@ -0,0 +1,9346 @@
+--------------------------------------------------------------------------------
+== Astunparse
+
+
+LICENSE
+=======
+
+Copyright (c) 2014, Simon Percivall
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+
+* Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+
+* Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
+
+* Neither the name of AST Unparser nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+PYTHON SOFTWARE FOUNDATION LICENSE VERSION 2
+--------------------------------------------
+
+1. This LICENSE AGREEMENT is between the Python Software Foundation
+("PSF"), and the Individual or Organization ("Licensee") accessing and
+otherwise using this software ("Python") in source or binary form and
+its associated documentation.
+
+2. Subject to the terms and conditions of this License Agreement, PSF hereby
+grants Licensee a nonexclusive, royalty-free, world-wide license to reproduce,
+analyze, test, perform and/or display publicly, prepare derivative works,
+distribute, and otherwise use Python alone or in any derivative version,
+provided, however, that PSF's License Agreement and PSF's notice of copyright,
+i.e., "Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010,
+2011, 2012, 2013, 2014 Python Software Foundation; All Rights Reserved" are retained
+in Python alone or in any derivative version prepared by Licensee.
+
+3. In the event Licensee prepares a derivative work that is based on
+or incorporates Python or any part thereof, and wants to make
+the derivative work available to others as provided herein, then
+Licensee hereby agrees to include in any such work a brief summary of
+the changes made to Python.
+
+4. PSF is making Python available to Licensee on an "AS IS"
+basis.  PSF MAKES NO REPRESENTATIONS OR WARRANTIES, EXPRESS OR
+IMPLIED.  BY WAY OF EXAMPLE, BUT NOT LIMITATION, PSF MAKES NO AND
+DISCLAIMS ANY REPRESENTATION OR WARRANTY OF MERCHANTABILITY OR FITNESS
+FOR ANY PARTICULAR PURPOSE OR THAT THE USE OF PYTHON WILL NOT
+INFRINGE ANY THIRD PARTY RIGHTS.
+
+5. PSF SHALL NOT BE LIABLE TO LICENSEE OR ANY OTHER USERS OF PYTHON
+FOR ANY INCIDENTAL, SPECIAL, OR CONSEQUENTIAL DAMAGES OR LOSS AS
+A RESULT OF MODIFYING, DISTRIBUTING, OR OTHERWISE USING PYTHON,
+OR ANY DERIVATIVE THEREOF, EVEN IF ADVISED OF THE POSSIBILITY THEREOF.
+
+6. This License Agreement will automatically terminate upon a material
+breach of its terms and conditions.
+
+7. Nothing in this License Agreement shall be deemed to create any
+relationship of agency, partnership, or joint venture between PSF and
+Licensee.  This License Agreement does not grant permission to use PSF
+trademarks or trade name in a trademark sense to endorse or promote
+products or services of Licensee, or any third party.
+
+8. By copying, installing or otherwise using Python, Licensee
+agrees to be bound by the terms and conditions of this License
+Agreement.
+
+--------------------------------------------------------------------------------
+
+--------------------------------------------------------------------------------
+== AbslPy
+
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright [yyyy] [name of copyright owner]
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+
+--------------------------------------------------------------------------------
+
+--------------------------------------------------------------------------------
+== Boring SSL
+
+BoringSSL is a fork of OpenSSL. As such, large parts of it fall under OpenSSL
+licensing. Files that are completely new have a Google copyright and an ISC
+license. This license is reproduced at the bottom of this file.
+
+Contributors to BoringSSL are required to follow the CLA rules for Chromium:
+https://cla.developers.google.com/clas
+
+Files in third_party/ have their own licenses, as described therein. The MIT
+license, for third_party/fiat, which, unlike other third_party directories, is
+compiled into non-test libraries, is included below.
+
+The OpenSSL toolkit stays under a dual license, i.e. both the conditions of the
+OpenSSL License and the original SSLeay license apply to the toolkit. See below
+for the actual license texts. Actually both licenses are BSD-style Open Source
+licenses. In case of any license issues related to OpenSSL please contact
+openssl-core@openssl.org.
+
+The following are Google-internal bug numbers where explicit permission from
+some authors is recorded for use of their work. (This is purely for our own
+record keeping.)
+  27287199
+  27287880
+  27287883
+  263291445
+
+  OpenSSL License
+  ---------------
+
+/* ====================================================================
+ * Copyright (c) 1998-2011 The OpenSSL Project.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer. 
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ *    software must display the following acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ *    endorse or promote products derived from this software without
+ *    prior written permission. For written permission, please contact
+ *    openssl-core@openssl.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ *    nor may "OpenSSL" appear in their names without prior written
+ *    permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ *    acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit (http://www.openssl.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ *
+ * This product includes cryptographic software written by Eric Young
+ * (eay@cryptsoft.com).  This product includes software written by Tim
+ * Hudson (tjh@cryptsoft.com).
+ *
+ */
+
+ Original SSLeay License
+ -----------------------
+
+/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
+ * All rights reserved.
+ *
+ * This package is an SSL implementation written
+ * by Eric Young (eay@cryptsoft.com).
+ * The implementation was written so as to conform with Netscapes SSL.
+ * 
+ * This library is free for commercial and non-commercial use as long as
+ * the following conditions are aheared to.  The following conditions
+ * apply to all code found in this distribution, be it the RC4, RSA,
+ * lhash, DES, etc., code; not just the SSL code.  The SSL documentation
+ * included with this distribution is covered by the same copyright terms
+ * except that the holder is Tim Hudson (tjh@cryptsoft.com).
+ * 
+ * Copyright remains Eric Young's, and as such any Copyright notices in
+ * the code are not to be removed.
+ * If this package is used in a product, Eric Young should be given attribution
+ * as the author of the parts of the library used.
+ * This can be in the form of a textual message at program startup or
+ * in documentation (online or textual) provided with the package.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *    "This product includes cryptographic software written by
+ *     Eric Young (eay@cryptsoft.com)"
+ *    The word 'cryptographic' can be left out if the rouines from the library
+ *    being used are not cryptographic related :-).
+ * 4. If you include any Windows specific code (or a derivative thereof) from 
+ *    the apps directory (application code) you must include an acknowledgement:
+ *    "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
+ * 
+ * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ * 
+ * The licence and distribution terms for any publically available version or
+ * derivative of this code cannot be changed.  i.e. this code cannot simply be
+ * copied and put under another distribution licence
+ * [including the GNU Public Licence.]
+ */
+
+
+ISC license used for completely new code in BoringSSL:
+
+/* Copyright (c) 2015, Google Inc.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+ * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+ * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+ * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
+
+
+The code in third_party/fiat carries the MIT license:
+
+Copyright (c) 2015-2016 the fiat-crypto authors (see
+https://github.com/mit-plv/fiat-crypto/blob/master/AUTHORS).
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
+
+Licenses for support code
+-------------------------
+
+Parts of the TLS test suite are under the Go license. This code is not included
+in BoringSSL (i.e. libcrypto and libssl) when compiled, however, so
+distributing code linked against BoringSSL does not trigger this license:
+
+Copyright (c) 2009 The Go Authors. All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   * Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+   * Redistributions in binary form must reproduce the above
+copyright notice, this list of conditions and the following disclaimer
+in the documentation and/or other materials provided with the
+distribution.
+   * Neither the name of Google Inc. nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+BoringSSL uses the Chromium test infrastructure to run a continuous build,
+trybots etc. The scripts which manage this, and the script for generating build
+metadata, are under the Chromium license. Distributing code linked against
+BoringSSL does not trigger this license.
+
+Copyright 2015 The Chromium Authors. All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   * Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+   * Redistributions in binary form must reproduce the above
+copyright notice, this list of conditions and the following disclaimer
+in the documentation and/or other materials provided with the
+distribution.
+   * Neither the name of Google Inc. nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+--------------------------------------------------------------------------------
+
+--------------------------------------------------------------------------------
+== cudnn-frontend
+
+/*
+ * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */ 
+
+--------------------------------------------------------------------------------
+
+--------------------------------------------------------------------------------
+== Curl
+
+Files: src/...
+
+COPYRIGHT AND PERMISSION NOTICE
+
+Copyright (c) 1996 - 2014, Daniel Stenberg, <daniel@haxx.se>.
+
+All rights reserved.
+
+Permission to use, copy, modify, and distribute this software for any purpose
+with or without fee is hereby granted, provided that the above copyright
+notice and this permission notice appear in all copies.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS. IN
+NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
+DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE
+OR OTHER DEALINGS IN THE SOFTWARE.
+
+Except as contained in this notice, the name of a copyright holder shall not
+be used in advertising or otherwise to promote the sale, use or other dealings
+in this Software without prior written authorization of the copyright holder.
+
+------------------
+
+Files: src/lib/inet_ntop.c, src/lib/inet_pton.c
+
+Copyright (C) 1996-2019  Internet Software Consortium.
+
+Permission to use, copy, modify, and distribute this software for any
+purpose with or without fee is hereby granted, provided that the above
+copyright notice and this permission notice appear in all copies.
+
+THE SOFTWARE IS PROVIDED "AS IS" AND INTERNET SOFTWARE CONSORTIUM
+DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL
+INTERNET SOFTWARE CONSORTIUM BE LIABLE FOR ANY SPECIAL, DIRECT,
+INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING
+FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT,
+NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION
+WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+
+------------------
+
+Files: src/lib/security.c, src/lib/krb5.c: unused in Google
+
+This source code was modified by Martin Hedenfalk <mhe@stacken.kth.se> for
+use in Curl. His latest changes were done 2000-09-18.
+
+It has since been patched and modified a lot by Daniel Stenberg
+<daniel@haxx.se> to make it better applied to curl conditions, and to make
+it not use globals, pollute name space and more. This source code awaits a
+rewrite to work around the paragraph 2 in the BSD licenses as explained
+below.
+
+Copyright (c) 1998, 1999, 2017 Kungliga Tekniska Högskolan
+(Royal Institute of Technology, Stockholm, Sweden).
+
+Copyright (C) 2001 - 2019, Daniel Stenberg, <daniel@haxx.se>, et al.
+
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the Institute nor the names of its contributors
+   may be used to endorse or promote products derived from this software
+   without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE INSTITUTE AND CONTRIBUTORS ``AS IS'' AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED.  IN NO EVENT SHALL THE INSTITUTE OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+SUCH DAMAGE.
+
+------------------
+
+Files: src/LICENSES/...
+
+BSD-4-Clause (University of California-Specific)
+
+Copyright [various years] The Regents of the University of California. All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+
+1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
+
+3. All advertising materials mentioning features or use of this software must display the following acknowledgement: This product includes software developed by the University of California, Berkeley and its contributors.
+
+4. Neither the name of the University nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+--------------------------------------------------------------------------------
+
+--------------------------------------------------------------------------------
+== Dlpack
+
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "{}"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright 2017 by Contributors
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+
+--------------------------------------------------------------------------------
+
+--------------------------------------------------------------------------------
+== double_conversion
+
+Copyright 2006-2011, the V8 project authors. All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above
+      copyright notice, this list of conditions and the following
+      disclaimer in the documentation and/or other materials provided
+      with the distribution.
+    * Neither the name of Google Inc. nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+--------------------------------------------------------------------------------
+
+--------------------------------------------------------------------------------
+== Eigen
+
+Eigen 3.3.90
+The corresponding source for this library is available at
+https://third-party-mirror.googlesource.com/eigen/+/20f4d829087ef6ed6a502859c6fd093065e806d0
+
+Eigen is primarily MPL2 licensed. See COPYING.MPL2 and these links:
+  http://www.mozilla.org/MPL/2.0/
+  http://www.mozilla.org/MPL/2.0/FAQ.html
+
+Some files contain third-party code under BSD, whence
+the other COPYING.* files here.
+
+If you want to guarantee that the Eigen code that you are #including
+is licensed under the MPL2 and possibly more permissive licenses (like
+BSD), #define this preprocessor symbol: EIGEN_MPL2_ONLY 
+For example, with most compilers, you could add this to your project
+      CXXFLAGS: -DEIGEN_MPL2_ONLY 
+This will cause a compilation error to be generated if you #include
+any code that is covered by more restrictive licences than MPL2.
+
+----------------------------------------------------------------------
+Following applies to:
+./test/sparseqr.cpp
+./test/half_float.cpp
+./test/zerosized.cpp
+./test/nesting_ops.cpp
+./test/sizeoverflow.cpp
+./test/swap.cpp
+./test/product_mmtr.cpp
+./test/stdvector_overload.cpp
+./test/product_symm.cpp
+./test/sparse_block.cpp
+./test/eigen2support.cpp
+./test/upperbidiagonalization.cpp
+./test/numext.cpp
+./test/adjoint.cpp
+./test/AnnoyingScalar.h
+./test/mpl2only.cpp
+./test/stddeque.cpp
+./test/householder.cpp
+./test/product_small.cpp
+./test/product_syrk.cpp
+./test/inplace_decomposition.cpp
+./test/vectorwiseop.cpp
+./test/meta.cpp
+./test/stdvector.cpp
+./test/sparseLM.cpp
+./test/diagonalmatrices.cpp
+./test/stdlist_overload.cpp
+./test/block.cpp
+./test/cholmod_support.cpp
+./test/basicstuff.cpp
+./test/triangular.cpp
+./test/product.h
+./test/vectorization_logic.cpp
+./test/dontalign.cpp
+./test/first_aligned.cpp
+./test/mapped_matrix.cpp
+./test/umfpack_support.cpp
+./test/product_selfadjoint.cpp
+./test/smallvectors.cpp
+./test/corners.cpp
+./test/product_trsolve.cpp
+./test/determinant.cpp
+./test/stdlist.cpp
+./test/unalignedcount.cpp
+./test/qr.cpp
+./test/svd_common.h
+./test/ref.cpp
+./test/symbolic_index.cpp
+./test/geo_transformations.cpp
+./test/geo_eulerangles.cpp
+./test/eigensolver_selfadjoint.cpp
+./test/stddeque_overload.cpp
+./test/jacobisvd.cpp
+./test/nullary.cpp
+./test/inverse.cpp
+./test/integer_types.cpp
+./test/metis_support.cpp
+./test/exceptions.cpp
+./test/packetmath.cpp
+./test/schur_complex.cpp
+./test/type_alias.cpp
+./test/unalignedassert.cpp
+./test/geo_quaternion.cpp
+./test/lu.cpp
+./test/qr_fullpivoting.cpp
+./test/denseLM.cpp
+./test/linearstructure.cpp
+./test/rand.cpp
+./test/conservative_resize.cpp
+./test/eigensolver_generalized_real.cpp
+./test/pastix_support.cpp
+./test/sparse_solver.h
+./test/num_dimensions.cpp
+./test/simplicial_cholesky.cpp
+./test/hessenberg.cpp
+./test/array_reverse.cpp
+./test/special_numbers.cpp
+./test/array_for_matrix.cpp
+./test/product_large.cpp
+./test/resize.cpp
+./test/sparse_solvers.cpp
+./test/selfadjoint.cpp
+./test/schur_real.cpp
+./test/sparse_basic.cpp
+./test/conjugate_gradient.cpp
+./test/real_qz.cpp
+./test/bandmatrix.cpp
+./test/dense_storage.cpp
+./test/permutationmatrices.cpp
+./test/array_cwise.cpp
+./test/qr_colpivoting.cpp
+./test/array_replicate.cpp
+./test/rvalue_types.cpp
+./test/stable_norm.cpp
+./test/geo_homogeneous.cpp
+./test/main.h
+./test/eigensolver_complex.cpp
+./test/product_trmm.cpp
+./test/bicgstab.cpp
+./test/redux.cpp
+./test/klu_support.cpp
+./test/geo_alignedbox.cpp
+./test/is_same_dense.cpp
+./test/sparse_permutations.cpp
+./test/sparse_vector.cpp
+./test/diagonal.cpp
+./test/sparse.h
+./test/mapstride.cpp
+./test/visitor.cpp
+./test/geo_hyperplane.cpp
+./test/bdcsvd.cpp
+./test/product_trmv.cpp
+./test/nestbyvalue.cpp
+./test/array_of_string.cpp
+./test/superlu_support.cpp
+./test/sizeof.cpp
+./test/boostmultiprec.cpp
+./test/commainitializer.cpp
+./test/constructor.cpp
+./test/mixingtypes.cpp
+./test/miscmatrices.cpp
+./test/mapstaticmethods.cpp
+./test/product_notemporary.cpp
+./test/initializer_list_construction.cpp
+./test/incomplete_cholesky.cpp
+./test/geo_parametrizedline.cpp
+./test/indexed_view.cpp
+./test/qtvector.cpp
+./test/sparselu.cpp
+./test/sparse_product.cpp
+./test/dynalloc.cpp
+./test/fastmath.cpp
+./test/prec_inverse_4x4.cpp
+./test/umeyama.cpp
+./test/reshape.cpp
+./test/product_extra.cpp
+./test/jacobi.cpp
+./test/sparse_ref.cpp
+./test/nomalloc.cpp
+./test/spqr_support.cpp
+./test/lscg.cpp
+./test/cholesky.cpp
+./test/eigensolver_generic.cpp
+./test/geo_orthomethods.cpp
+./test/svd_fill.h
+./test/stl_iterators.cpp
+./Eigen/src/MetisSupport/MetisSupport.h
+./Eigen/src/CholmodSupport/CholmodSupport.h
+./Eigen/src/QR/CompleteOrthogonalDecomposition.h
+./Eigen/src/QR/FullPivHouseholderQR.h
+./Eigen/src/QR/HouseholderQR.h
+./Eigen/src/QR/ColPivHouseholderQR.h
+./Eigen/src/plugins/CommonCwiseUnaryOps.h
+./Eigen/src/plugins/BlockMethods.h
+./Eigen/src/plugins/CommonCwiseBinaryOps.h
+./Eigen/src/plugins/MatrixCwiseUnaryOps.h
+./Eigen/src/plugins/IndexedViewMethods.h
+./Eigen/src/plugins/MatrixCwiseBinaryOps.h
+./Eigen/src/SVD/UpperBidiagonalization.h
+./Eigen/src/SVD/SVDBase.h
+./Eigen/src/SVD/BDCSVD.h
+./Eigen/src/SVD/JacobiSVD.h
+./Eigen/src/SparseLU/SparseLU_relax_snode.h
+./Eigen/src/SparseLU/SparseLU_column_dfs.h
+./Eigen/src/SparseLU/SparseLU_SupernodalMatrix.h
+./Eigen/src/SparseLU/SparseLU_pivotL.h
+./Eigen/src/SparseLU/SparseLU.h
+./Eigen/src/SparseLU/SparseLU_pruneL.h
+./Eigen/src/SparseLU/SparseLU_copy_to_ucol.h
+./Eigen/src/SparseLU/SparseLU_heap_relax_snode.h
+./Eigen/src/SparseLU/SparseLU_kernel_bmod.h
+./Eigen/src/SparseLU/SparseLU_panel_dfs.h
+./Eigen/src/SparseLU/SparseLU_panel_bmod.h
+./Eigen/src/SparseLU/SparseLU_Structs.h
+./Eigen/src/SparseLU/SparseLUImpl.h
+./Eigen/src/SparseLU/SparseLU_Memory.h
+./Eigen/src/SparseLU/SparseLU_column_bmod.h
+./Eigen/src/SparseLU/SparseLU_gemm_kernel.h
+./Eigen/src/SparseLU/SparseLU_Utils.h
+./Eigen/src/OrderingMethods/Eigen_Colamd.h
+./Eigen/src/OrderingMethods/Ordering.h
+./Eigen/src/OrderingMethods/Amd.h
+./Eigen/src/UmfPackSupport/UmfPackSupport.h
+./Eigen/src/Geometry/Umeyama.h
+./Eigen/src/Geometry/Transform.h
+./Eigen/src/Geometry/OrthoMethods.h
+./Eigen/src/Geometry/Hyperplane.h
+./Eigen/src/Geometry/Homogeneous.h
+./Eigen/src/Geometry/RotationBase.h
+./Eigen/src/Geometry/EulerAngles.h
+./Eigen/src/Geometry/Translation.h
+./Eigen/src/Geometry/Rotation2D.h
+./Eigen/src/Geometry/Scaling.h
+./Eigen/src/Geometry/AlignedBox.h
+./Eigen/src/Geometry/ParametrizedLine.h
+./Eigen/src/Geometry/Quaternion.h
+./Eigen/src/Geometry/AngleAxis.h
+./Eigen/src/Geometry/arch/Geometry_SSE.h
+./Eigen/src/KLUSupport/KLUSupport.h
+./Eigen/src/misc/Kernel.h
+./Eigen/src/misc/RealSvd2x2.h
+./Eigen/src/misc/Image.h
+./Eigen/src/StlSupport/details.h
+./Eigen/src/StlSupport/StdList.h
+./Eigen/src/StlSupport/StdDeque.h
+./Eigen/src/StlSupport/StdVector.h
+./Eigen/src/SparseQR/SparseQR.h
+./Eigen/src/SuperLUSupport/SuperLUSupport.h
+./Eigen/src/Householder/Householder.h
+./Eigen/src/Householder/HouseholderSequence.h
+./Eigen/src/Householder/BlockHouseholder.h
+./Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h
+./Eigen/src/Eigenvalues/EigenSolver.h
+./Eigen/src/Eigenvalues/GeneralizedEigenSolver.h
+./Eigen/src/Eigenvalues/Tridiagonalization.h
+./Eigen/src/Eigenvalues/HessenbergDecomposition.h
+./Eigen/src/Eigenvalues/RealQZ.h
+./Eigen/src/Eigenvalues/RealSchur.h
+./Eigen/src/Eigenvalues/ComplexSchur.h
+./Eigen/src/Eigenvalues/ComplexEigenSolver.h
+./Eigen/src/Eigenvalues/MatrixBaseEigenvalues.h
+./Eigen/src/Eigenvalues/GeneralizedSelfAdjointEigenSolver.h
+./Eigen/src/SparseCholesky/SimplicialCholesky.h
+./Eigen/src/SparseCholesky/SimplicialCholesky_impl.h
+./Eigen/src/Cholesky/LLT.h
+./Eigen/src/Cholesky/LDLT.h
+./Eigen/src/Jacobi/Jacobi.h
+./Eigen/src/PaStiXSupport/PaStiXSupport.h
+./Eigen/src/SPQRSupport/SuiteSparseQRSupport.h
+./Eigen/src/LU/Determinant.h
+./Eigen/src/LU/InverseImpl.h
+./Eigen/src/LU/PartialPivLU.h
+./Eigen/src/LU/arch/Inverse_SSE.h
+./Eigen/src/LU/FullPivLU.h
+./Eigen/src/Core/Map.h
+./Eigen/src/Core/VectorwiseOp.h
+./Eigen/src/Core/VectorBlock.h
+./Eigen/src/Core/Array.h
+./Eigen/src/Core/Assign.h
+./Eigen/src/Core/Dot.h
+./Eigen/src/Core/NestByValue.h
+./Eigen/src/Core/CoreEvaluators.h
+./Eigen/src/Core/ReturnByValue.h
+./Eigen/src/Core/SelfCwiseBinaryOp.h
+./Eigen/src/Core/GlobalFunctions.h
+./Eigen/src/Core/Transpositions.h
+./Eigen/src/Core/Fuzzy.h
+./Eigen/src/Core/NoAlias.h
+./Eigen/src/Core/CwiseNullaryOp.h
+./Eigen/src/Core/NumTraits.h
+./Eigen/src/Core/IndexedView.h
+./Eigen/src/Core/ArrayWrapper.h
+./Eigen/src/Core/util/SymbolicIndex.h
+./Eigen/src/Core/util/BlasUtil.h
+./Eigen/src/Core/util/Constants.h
+./Eigen/src/Core/util/IntegralConstant.h
+./Eigen/src/Core/util/ReshapedHelper.h
+./Eigen/src/Core/util/StaticAssert.h
+./Eigen/src/Core/util/IndexedViewHelper.h
+./Eigen/src/Core/util/ConfigureVectorization.h
+./Eigen/src/Core/util/ForwardDeclarations.h
+./Eigen/src/Core/util/Meta.h
+./Eigen/src/Core/util/XprHelper.h
+./Eigen/src/Core/util/Macros.h
+./Eigen/src/Core/util/Memory.h
+./Eigen/src/Core/Product.h
+./Eigen/src/Core/Replicate.h
+./Eigen/src/Core/ArrayBase.h
+./Eigen/src/Core/functors/NullaryFunctors.h
+./Eigen/src/Core/functors/StlFunctors.h
+./Eigen/src/Core/functors/AssignmentFunctors.h
+./Eigen/src/Core/functors/UnaryFunctors.h
+./Eigen/src/Core/functors/TernaryFunctors.h
+./Eigen/src/Core/functors/BinaryFunctors.h
+./Eigen/src/Core/Redux.h
+./Eigen/src/Core/EigenBase.h
+./Eigen/src/Core/SolverBase.h
+./Eigen/src/Core/ProductEvaluators.h
+./Eigen/src/Core/Block.h
+./Eigen/src/Core/SolveTriangular.h
+./Eigen/src/Core/ArithmeticSequence.h
+./Eigen/src/Core/MatrixBase.h
+./Eigen/src/Core/PlainObjectBase.h
+./Eigen/src/Core/Transpose.h
+./Eigen/src/Core/IO.h
+./Eigen/src/Core/MathFunctions.h
+./Eigen/src/Core/Stride.h
+./Eigen/src/Core/MathFunctionsImpl.h
+./Eigen/src/Core/StableNorm.h
+./Eigen/src/Core/DiagonalProduct.h
+./Eigen/src/Core/products/GeneralMatrixMatrix.h
+./Eigen/src/Core/products/GeneralMatrixVector.h
+./Eigen/src/Core/products/SelfadjointMatrixVector.h
+./Eigen/src/Core/products/GeneralBlockPanelKernel.h
+./Eigen/src/Core/products/TriangularSolverMatrix.h
+./Eigen/src/Core/products/SelfadjointMatrixMatrix.h
+./Eigen/src/Core/products/Parallelizer.h
+./Eigen/src/Core/products/SelfadjointRank2Update.h
+./Eigen/src/Core/products/TriangularMatrixMatrix.h
+./Eigen/src/Core/products/TriangularMatrixVector.h
+./Eigen/src/Core/products/SelfadjointProduct.h
+./Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h
+./Eigen/src/Core/products/TriangularSolverVector.h
+./Eigen/src/Core/CwiseUnaryView.h
+./Eigen/src/Core/CommaInitializer.h
+./Eigen/src/Core/DenseStorage.h
+./Eigen/src/Core/DenseBase.h
+./Eigen/src/Core/PartialReduxEvaluator.h
+./Eigen/src/Core/CoreIterators.h
+./Eigen/src/Core/PermutationMatrix.h
+./Eigen/src/Core/CwiseTernaryOp.h
+./Eigen/src/Core/Reverse.h
+./Eigen/src/Core/Reshaped.h
+./Eigen/src/Core/Inverse.h
+./Eigen/src/Core/TriangularMatrix.h
+./Eigen/src/Core/BooleanRedux.h
+./Eigen/src/Core/ForceAlignedAccess.h
+./Eigen/src/Core/Ref.h
+./Eigen/src/Core/StlIterators.h
+./Eigen/src/Core/BandMatrix.h
+./Eigen/src/Core/ConditionEstimator.h
+./Eigen/src/Core/Diagonal.h
+./Eigen/src/Core/DiagonalMatrix.h
+./Eigen/src/Core/AssignEvaluator.h
+./Eigen/src/Core/CwiseBinaryOp.h
+./Eigen/src/Core/Visitor.h
+./Eigen/src/Core/GenericPacketMath.h
+./Eigen/src/Core/SelfAdjointView.h
+./Eigen/src/Core/Random.h
+./Eigen/src/Core/Solve.h
+./Eigen/src/Core/arch/AltiVec/MathFunctions.h
+./Eigen/src/Core/arch/AltiVec/PacketMath.h
+./Eigen/src/Core/arch/AltiVec/Complex.h
+./Eigen/src/Core/arch/MSA/MathFunctions.h
+./Eigen/src/Core/arch/MSA/Complex.h
+./Eigen/src/Core/arch/MSA/PacketMath.h
+./Eigen/src/Core/arch/GPU/Half.h
+./Eigen/src/Core/arch/GPU/PacketMathHalf.h
+./Eigen/src/Core/arch/GPU/MathFunctions.h
+./Eigen/src/Core/arch/GPU/PacketMath.h
+./Eigen/src/Core/arch/GPU/TypeCasting.h
+./Eigen/src/Core/arch/NEON/MathFunctions.h
+./Eigen/src/Core/arch/NEON/Complex.h
+./Eigen/src/Core/arch/NEON/PacketMath.h
+./Eigen/src/Core/arch/NEON/TypeCasting.h
+./Eigen/src/Core/arch/AVX/MathFunctions.h
+./Eigen/src/Core/arch/AVX/TypeCasting.h
+./Eigen/src/Core/arch/AVX/Complex.h
+./Eigen/src/Core/arch/AVX/PacketMath.h
+./Eigen/src/Core/arch/SYCL/InteropHeaders.h
+./Eigen/src/Core/arch/SYCL/PacketMath.h
+./Eigen/src/Core/arch/SYCL/TypeCasting.h
+./Eigen/src/Core/arch/SYCL/MathFunctions.h
+./Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h
+./Eigen/src/Core/arch/Default/ConjHelper.h
+./Eigen/src/Core/arch/Default/Settings.h
+./Eigen/src/Core/arch/AVX512/MathFunctions.h
+./Eigen/src/Core/arch/AVX512/PacketMath.h
+./Eigen/src/Core/arch/AVX512/Complex.h
+./Eigen/src/Core/arch/SSE/PacketMath.h
+./Eigen/src/Core/arch/SSE/Complex.h
+./Eigen/src/Core/arch/SSE/TypeCasting.h
+./Eigen/src/Core/arch/SSE/MathFunctions.h
+./Eigen/src/Core/arch/ZVector/MathFunctions.h
+./Eigen/src/Core/arch/ZVector/PacketMath.h
+./Eigen/src/Core/arch/ZVector/Complex.h
+./Eigen/src/Core/arch/CUDA/Complex.h
+./Eigen/src/Core/Swap.h
+./Eigen/src/Core/MapBase.h
+./Eigen/src/Core/GeneralProduct.h
+./Eigen/src/Core/Matrix.h
+./Eigen/src/Core/Select.h
+./Eigen/src/Core/CwiseUnaryOp.h
+./Eigen/src/Core/DenseCoeffsBase.h
+./Eigen/src/SparseCore/SparseCwiseUnaryOp.h
+./Eigen/src/SparseCore/TriangularSolver.h
+./Eigen/src/SparseCore/SparseView.h
+./Eigen/src/SparseCore/SparseSolverBase.h
+./Eigen/src/SparseCore/SparseTranspose.h
+./Eigen/src/SparseCore/SparseDenseProduct.h
+./Eigen/src/SparseCore/SparseMap.h
+./Eigen/src/SparseCore/SparseProduct.h
+./Eigen/src/SparseCore/SparseUtil.h
+./Eigen/src/SparseCore/SparsePermutation.h
+./Eigen/src/SparseCore/SparseTriangularView.h
+./Eigen/src/SparseCore/SparseSelfAdjointView.h
+./Eigen/src/SparseCore/SparseMatrixBase.h
+./Eigen/src/SparseCore/AmbiVector.h
+./Eigen/src/SparseCore/SparseAssign.h
+./Eigen/src/SparseCore/SparseRedux.h
+./Eigen/src/SparseCore/SparseDot.h
+./Eigen/src/SparseCore/SparseCwiseBinaryOp.h
+./Eigen/src/SparseCore/SparseCompressedBase.h
+./Eigen/src/SparseCore/SparseSparseProductWithPruning.h
+./Eigen/src/SparseCore/SparseColEtree.h
+./Eigen/src/SparseCore/SparseRef.h
+./Eigen/src/SparseCore/CompressedStorage.h
+./Eigen/src/SparseCore/MappedSparseMatrix.h
+./Eigen/src/SparseCore/SparseDiagonalProduct.h
+./Eigen/src/SparseCore/SparseFuzzy.h
+./Eigen/src/SparseCore/ConservativeSparseSparseProduct.h
+./Eigen/src/SparseCore/SparseMatrix.h
+./Eigen/src/SparseCore/SparseVector.h
+./Eigen/src/SparseCore/SparseBlock.h
+./Eigen/src/IterativeLinearSolvers/SolveWithGuess.h
+./Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h
+./Eigen/src/IterativeLinearSolvers/BiCGSTAB.h
+./Eigen/src/IterativeLinearSolvers/ConjugateGradient.h
+./Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h
+./Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h
+./Eigen/src/IterativeLinearSolvers/IncompleteLUT.h
+./Eigen/src/IterativeLinearSolvers/LeastSquareConjugateGradient.h
+./unsupported/Eigen/src/Eigenvalues/ArpackSelfAdjointEigenSolver.h
+./unsupported/Eigen/src/SpecialFunctions/arch/GPU/GpuSpecialFunctions.h
+./unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsHalf.h
+./unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h
+./unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsFunctors.h
+./unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsArrayAPI.h
+./unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsPacketMath.h
+./unsupported/Eigen/src/Polynomials/Companion.h
+./unsupported/Eigen/src/Polynomials/PolynomialUtils.h
+./unsupported/Eigen/src/Polynomials/PolynomialSolver.h
+./unsupported/Eigen/src/Splines/Spline.h
+./unsupported/Eigen/src/Splines/SplineFwd.h
+./unsupported/Eigen/src/Splines/SplineFitting.h
+./unsupported/Eigen/src/BVH/KdBVH.h
+./unsupported/Eigen/src/BVH/BVAlgorithms.h
+./unsupported/Eigen/src/AutoDiff/AutoDiffJacobian.h
+./unsupported/Eigen/src/AutoDiff/AutoDiffVector.h
+./unsupported/Eigen/src/AutoDiff/AutoDiffScalar.h
+./unsupported/Eigen/src/MatrixFunctions/MatrixSquareRoot.h
+./unsupported/Eigen/src/MatrixFunctions/MatrixPower.h
+./unsupported/Eigen/src/MatrixFunctions/MatrixExponential.h
+./unsupported/Eigen/src/MatrixFunctions/MatrixLogarithm.h
+./unsupported/Eigen/src/MatrixFunctions/StemFunction.h
+./unsupported/Eigen/src/MatrixFunctions/MatrixFunction.h
+./unsupported/Eigen/src/Skyline/SkylineStorage.h
+./unsupported/Eigen/src/Skyline/SkylineMatrixBase.h
+./unsupported/Eigen/src/Skyline/SkylineMatrix.h
+./unsupported/Eigen/src/Skyline/SkylineInplaceLU.h
+./unsupported/Eigen/src/Skyline/SkylineProduct.h
+./unsupported/Eigen/src/Skyline/SkylineUtil.h
+./unsupported/Eigen/src/FFT/ei_kissfft_impl.h
+./unsupported/Eigen/src/FFT/ei_fftw_impl.h
+./unsupported/Eigen/src/LevenbergMarquardt/LevenbergMarquardt.h
+./unsupported/Eigen/src/NonLinearOptimization/HybridNonLinearSolver.h
+./unsupported/Eigen/src/NonLinearOptimization/LevenbergMarquardt.h
+./unsupported/Eigen/src/KroneckerProduct/KroneckerTensorProduct.h
+./unsupported/Eigen/src/NumericalDiff/NumericalDiff.h
+./unsupported/Eigen/src/IterativeSolvers/IncompleteLU.h
+./unsupported/Eigen/src/IterativeSolvers/MINRES.h
+./unsupported/Eigen/src/IterativeSolvers/DGMRES.h
+./unsupported/Eigen/src/IterativeSolvers/Scaling.h
+./unsupported/Eigen/src/IterativeSolvers/GMRES.h
+./unsupported/Eigen/src/MoreVectorization/MathFunctions.h
+./unsupported/Eigen/src/EulerAngles/EulerAngles.h
+./unsupported/Eigen/src/EulerAngles/EulerSystem.h
+./unsupported/Eigen/src/SparseExtra/BlockOfDynamicSparseMatrix.h
+./unsupported/Eigen/src/SparseExtra/DynamicSparseMatrix.h
+./unsupported/Eigen/src/SparseExtra/BlockSparseMatrix.h
+./unsupported/Eigen/src/SparseExtra/RandomSetter.h
+./unsupported/Eigen/src/SparseExtra/MatrixMarketIterator.h
+./unsupported/Eigen/src/SparseExtra/MarketIO.h
+./unsupported/Eigen/CXX11/src/TensorSymmetry/StaticSymmetry.h
+./unsupported/Eigen/CXX11/src/TensorSymmetry/Symmetry.h
+./unsupported/Eigen/CXX11/src/TensorSymmetry/DynamicSymmetry.h
+./unsupported/Eigen/CXX11/src/TensorSymmetry/util/TemplateGroupTheory.h
+./unsupported/Eigen/CXX11/src/util/EmulateCXX11Meta.h
+./unsupported/Eigen/CXX11/src/util/CXX11Meta.h
+./unsupported/Eigen/CXX11/src/util/MaxSizeVector.h
+./unsupported/Eigen/CXX11/src/util/EmulateArray.h
+./unsupported/Eigen/CXX11/src/util/CXX11Workarounds.h
+./unsupported/Eigen/CXX11/src/ThreadPool/ThreadYield.h
+./unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h
+./unsupported/Eigen/CXX11/src/ThreadPool/RunQueue.h
+./unsupported/Eigen/CXX11/src/ThreadPool/ThreadCancel.h
+./unsupported/Eigen/CXX11/src/ThreadPool/ThreadPoolInterface.h
+./unsupported/Eigen/CXX11/src/ThreadPool/ThreadLocal.h
+./unsupported/Eigen/CXX11/src/ThreadPool/Barrier.h
+./unsupported/Eigen/CXX11/src/ThreadPool/EventCount.h
+./unsupported/Eigen/CXX11/src/ThreadPool/ThreadEnvironment.h
+./unsupported/Eigen/CXX11/src/Tensor/TensorRef.h
+./unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h
+./unsupported/Eigen/CXX11/src/Tensor/TensorSyclRun.h
+./unsupported/Eigen/CXX11/src/Tensor/TensorSyclTuple.h
+./unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h
+./unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h
+./unsupported/Eigen/CXX11/src/Tensor/TensorTrace.h
+./unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h
+./unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h
+./unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
+./unsupported/Eigen/CXX11/src/Tensor/TensorSyclPlaceHolderExpr.h
+./unsupported/Eigen/CXX11/src/Tensor/TensorSyclExprConstructor.h
+./unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h
+./unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
+./unsupported/Eigen/CXX11/src/Tensor/TensorSyclConvertToDeviceExpression.h
+./unsupported/Eigen/CXX11/src/Tensor/Tensor.h
+./unsupported/Eigen/CXX11/src/Tensor/TensorDeviceGpu.h
+./unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h
+./unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h
+./unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h
+./unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h
+./unsupported/Eigen/CXX11/src/Tensor/TensorScan.h
+./unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h
+./unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h
+./unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h
+./unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h
+./unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h
+./unsupported/Eigen/CXX11/src/Tensor/TensorArgMaxSycl.h
+./unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h
+./unsupported/Eigen/CXX11/src/Tensor/TensorBase.h
+./unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h
+./unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
+./unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h
+./unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h
+./unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h
+./unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h
+./unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h
+./unsupported/Eigen/CXX11/src/Tensor/TensorIO.h
+./unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
+./unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h
+./unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h
+./unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h
+./unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h
+./unsupported/Eigen/CXX11/src/Tensor/TensorSyclFunctors.h
+./unsupported/Eigen/CXX11/src/Tensor/TensorMap.h
+./unsupported/Eigen/CXX11/src/Tensor/TensorSycl.h
+./unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractFunctors.h
+./unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractAccessor.h
+./unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h
+./unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h
+./unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h
+./unsupported/Eigen/CXX11/src/Tensor/TensorInitializer.h
+./unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h
+./unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h
+./unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaUndefines.h
+./unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h
+./unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h
+./unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h
+./unsupported/Eigen/CXX11/src/Tensor/TensorGlobalFunctions.h
+./unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h
+./unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h
+./unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h
+./unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h
+./unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h
+./unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h
+./unsupported/Eigen/CXX11/src/Tensor/TensorSyclLeafCount.h
+./unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h
+./unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h
+./unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h
+./unsupported/Eigen/CXX11/src/Tensor/TensorContractionGpu.h
+./unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h
+./unsupported/Eigen/CXX11/src/Tensor/TensorDimensionList.h
+./unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h
+./unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h
+./unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h
+./unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h
+./unsupported/Eigen/CXX11/src/FixedPoint/MatMatProduct.h
+./unsupported/Eigen/CXX11/src/FixedPoint/MatMatProductNEON.h
+./unsupported/Eigen/CXX11/src/FixedPoint/MatVecProduct.h
+./unsupported/Eigen/CXX11/src/FixedPoint/FixedPointTypes.h
+./unsupported/Eigen/CXX11/src/FixedPoint/MatMatProductAVX2.h
+./unsupported/bench/bench_svd.cpp
+./unsupported/test/cxx11_tensor_image_patch_sycl.cpp
+./unsupported/test/cxx11_tensor_expr.cpp
+./unsupported/test/FFTW.cpp
+./unsupported/test/cxx11_tensor_reverse_sycl.cpp
+./unsupported/test/cxx11_tensor_comparisons.cpp
+./unsupported/test/cxx11_tensor_intdiv.cpp
+./unsupported/test/autodiff.cpp
+./unsupported/test/cxx11_tensor_executor.cpp
+./unsupported/test/cxx11_tensor_reduction.cpp
+./unsupported/test/cxx11_tensor_device_sycl.cpp
+./unsupported/test/minres.cpp
+./unsupported/test/cxx11_tensor_striding.cpp
+./unsupported/test/cxx11_tensor_chipping.cpp
+./unsupported/test/cxx11_tensor_convolution_sycl.cpp
+./unsupported/test/openglsupport.cpp
+./unsupported/test/cxx11_tensor_ifft.cpp
+./unsupported/test/polynomialutils.cpp
+./unsupported/test/cxx11_tensor_block_access.cpp
+./unsupported/test/cxx11_tensor_block_eval.cpp
+./unsupported/test/cxx11_tensor_block_io.cpp
+./unsupported/test/cxx11_tensor_morphing.cpp
+./unsupported/test/cxx11_tensor_casts.cpp
+./unsupported/test/cxx11_tensor_shuffling_sycl.cpp
+./unsupported/test/cxx11_tensor_morphing_sycl.cpp
+./unsupported/test/forward_adolc.cpp
+./unsupported/test/cxx11_tensor_layout_swap.cpp
+./unsupported/test/cxx11_tensor_move.cpp
+./unsupported/test/EulerAngles.cpp
+./unsupported/test/cxx11_tensor_trace.cpp
+./unsupported/test/alignedvector3.cpp
+./unsupported/test/cxx11_tensor_lvalue.cpp
+./unsupported/test/cxx11_tensor_argmax.cpp
+./unsupported/test/cxx11_tensor_broadcast_sycl.cpp
+./unsupported/test/autodiff_scalar.cpp
+./unsupported/test/sparse_extra.cpp
+./unsupported/test/cxx11_tensor_of_strings.cpp
+./unsupported/test/cxx11_tensor_empty.cpp
+./unsupported/test/cxx11_tensor_patch.cpp
+./unsupported/test/cxx11_tensor_sycl.cpp
+./unsupported/test/cxx11_tensor_forced_eval_sycl.cpp
+./unsupported/test/cxx11_tensor_inflation_sycl.cpp
+./unsupported/test/BVH.cpp
+./unsupported/test/cxx11_tensor_generator.cpp
+./unsupported/test/cxx11_meta.cpp
+./unsupported/test/matrix_functions.h
+./unsupported/test/kronecker_product.cpp
+./unsupported/test/matrix_function.cpp
+./unsupported/test/cxx11_tensor_thread_pool.cpp
+./unsupported/test/cxx11_non_blocking_thread_pool.cpp
+./unsupported/test/cxx11_tensor_fft.cpp
+./unsupported/test/cxx11_tensor_assign.cpp
+./unsupported/test/cxx11_tensor_simple.cpp
+./unsupported/test/cxx11_tensor_of_complex.cpp
+./unsupported/test/cxx11_tensor_inflation.cpp
+./unsupported/test/cxx11_tensor_map.cpp
+./unsupported/test/cxx11_tensor_shuffling.cpp
+./unsupported/test/cxx11_tensor_padding.cpp
+./unsupported/test/cxx11_tensor_argmax_sycl.cpp
+./unsupported/test/matrix_square_root.cpp
+./unsupported/test/dgmres.cpp
+./unsupported/test/cxx11_tensor_custom_op_sycl.cpp
+./unsupported/test/cxx11_tensor_reduction_sycl.cpp
+./unsupported/test/cxx11_runqueue.cpp
+./unsupported/test/cxx11_tensor_const.cpp
+./unsupported/test/matrix_power.cpp
+./unsupported/test/cxx11_tensor_contraction.cpp
+./unsupported/test/cxx11_tensor_random.cpp
+./unsupported/test/cxx11_tensor_volume_patch_sycl.cpp
+./unsupported/test/cxx11_tensor_contract_sycl.cpp
+./unsupported/test/cxx11_tensor_math.cpp
+./unsupported/test/splines.cpp
+./unsupported/test/cxx11_tensor_ref.cpp
+./unsupported/test/cxx11_tensor_concatenation_sycl.cpp
+./unsupported/test/gmres.cpp
+./unsupported/test/cxx11_tensor_fixed_size.cpp
+./unsupported/test/cxx11_tensor_custom_op.cpp
+./unsupported/test/cxx11_tensor_generator_sycl.cpp
+./unsupported/test/cxx11_tensor_uint128.cpp
+./unsupported/test/cxx11_tensor_builtins_sycl.cpp
+./unsupported/test/polynomialsolver.cpp
+./unsupported/test/cxx11_tensor_concatenation.cpp
+./unsupported/test/cxx11_tensor_broadcasting.cpp
+./unsupported/test/cxx11_tensor_convolution.cpp
+./unsupported/test/cxx11_tensor_forced_eval.cpp
+./unsupported/test/levenberg_marquardt.cpp
+./unsupported/test/cxx11_tensor_reverse.cpp
+./unsupported/test/cxx11_tensor_notification.cpp
+./unsupported/test/cxx11_tensor_patch_sycl.cpp
+./unsupported/test/cxx11_tensor_image_patch.cpp
+./unsupported/test/cxx11_tensor_scan.cpp
+./unsupported/test/cxx11_tensor_padding_sycl.cpp
+./unsupported/test/cxx11_tensor_index_list.cpp
+./unsupported/test/cxx11_tensor_io.cpp
+./unsupported/test/cxx11_tensor_mixed_indices.cpp
+./unsupported/test/cxx11_tensor_striding_sycl.cpp
+./unsupported/test/cxx11_tensor_of_const_values.cpp
+./unsupported/test/cxx11_tensor_symmetry.cpp
+./unsupported/test/cxx11_tensor_custom_index.cpp
+./unsupported/test/cxx11_tensor_chipping_sycl.cpp
+./unsupported/test/cxx11_tensor_roundings.cpp
+./unsupported/test/matrix_exponential.cpp
+./unsupported/test/cxx11_eventcount.cpp
+./unsupported/test/special_functions.cpp
+./unsupported/test/cxx11_tensor_dimension.cpp
+./unsupported/test/cxx11_tensor_layout_swap_sycl.cpp
+./lapack/eigenvalues.cpp
+./lapack/single.cpp
+./lapack/svd.cpp
+./lapack/complex_single.cpp
+./lapack/lu.cpp
+./lapack/double.cpp
+./lapack/complex_double.cpp
+./lapack/cholesky.cpp
+./lapack/lapack_common.h
+./blas/level2_impl.h
+./blas/PackedTriangularMatrixVector.h
+./blas/level3_impl.h
+./blas/complex_double.cpp
+./blas/common.h
+./blas/GeneralRank1Update.h
+./blas/double.cpp
+./blas/complex_single.cpp
+./blas/Rank2Update.h
+./blas/level1_impl.h
+./blas/level2_real_impl.h
+./blas/level1_real_impl.h
+./blas/single.cpp
+./blas/PackedSelfadjointProduct.h
+./blas/BandTriangularSolver.h
+./blas/level2_cplx_impl.h
+./blas/PackedTriangularSolverVector.h
+./blas/level1_cplx_impl.h
+./bench/analyze-blocking-sizes.cpp
+./bench/BenchTimer.h
+./bench/spbench/spbenchsolver.h
+./bench/spbench/spbenchstyle.h
+./bench/benchFFT.cpp
+./bench/eig33.cpp
+./bench/benchmark-blocking-sizes.cpp
+./demos/opengl/quaternion_demo.cpp
+./demos/opengl/camera.h
+./demos/opengl/gpuhelper.cpp
+./demos/opengl/gpuhelper.h
+./demos/opengl/icosphere.cpp
+./demos/opengl/quaternion_demo.h
+./demos/opengl/trackball.h
+./demos/opengl/icosphere.h
+./demos/opengl/camera.cpp
+./demos/opengl/trackball.cpp
+./demos/mix_eigen_and_c/binary_library.h
+./demos/mix_eigen_and_c/binary_library.cpp
+./demos/mandelbrot/mandelbrot.cpp
+./demos/mandelbrot/mandelbrot.h
+
+Mozilla Public License Version 2.0
+==================================
+
+1. Definitions
+--------------
+
+1.1. "Contributor"
+    means each individual or legal entity that creates, contributes to
+    the creation of, or owns Covered Software.
+
+1.2. "Contributor Version"
+    means the combination of the Contributions of others (if any) used
+    by a Contributor and that particular Contributor's Contribution.
+
+1.3. "Contribution"
+    means Covered Software of a particular Contributor.
+
+1.4. "Covered Software"
+    means Source Code Form to which the initial Contributor has attached
+    the notice in Exhibit A, the Executable Form of such Source Code
+    Form, and Modifications of such Source Code Form, in each case
+    including portions thereof.
+
+1.5. "Incompatible With Secondary Licenses"
+    means
+
+    (a) that the initial Contributor has attached the notice described
+        in Exhibit B to the Covered Software; or
+
+    (b) that the Covered Software was made available under the terms of
+        version 1.1 or earlier of the License, but not also under the
+        terms of a Secondary License.
+
+1.6. "Executable Form"
+    means any form of the work other than Source Code Form.
+
+1.7. "Larger Work"
+    means a work that combines Covered Software with other material, in 
+    a separate file or files, that is not Covered Software.
+
+1.8. "License"
+    means this document.
+
+1.9. "Licensable"
+    means having the right to grant, to the maximum extent possible,
+    whether at the time of the initial grant or subsequently, any and
+    all of the rights conveyed by this License.
+
+1.10. "Modifications"
+    means any of the following:
+
+    (a) any file in Source Code Form that results from an addition to,
+        deletion from, or modification of the contents of Covered
+        Software; or
+
+    (b) any new file in Source Code Form that contains any Covered
+        Software.
+
+1.11. "Patent Claims" of a Contributor
+    means any patent claim(s), including without limitation, method,
+    process, and apparatus claims, in any patent Licensable by such
+    Contributor that would be infringed, but for the grant of the
+    License, by the making, using, selling, offering for sale, having
+    made, import, or transfer of either its Contributions or its
+    Contributor Version.
+
+1.12. "Secondary License"
+    means either the GNU General Public License, Version 2.0, the GNU
+    Lesser General Public License, Version 2.1, the GNU Affero General
+    Public License, Version 3.0, or any later versions of those
+    licenses.
+
+1.13. "Source Code Form"
+    means the form of the work preferred for making modifications.
+
+1.14. "You" (or "Your")
+    means an individual or a legal entity exercising rights under this
+    License. For legal entities, "You" includes any entity that
+    controls, is controlled by, or is under common control with You. For
+    purposes of this definition, "control" means (a) the power, direct
+    or indirect, to cause the direction or management of such entity,
+    whether by contract or otherwise, or (b) ownership of more than
+    fifty percent (50%) of the outstanding shares or beneficial
+    ownership of such entity.
+
+2. License Grants and Conditions
+--------------------------------
+
+2.1. Grants
+
+Each Contributor hereby grants You a world-wide, royalty-free,
+non-exclusive license:
+
+(a) under intellectual property rights (other than patent or trademark)
+    Licensable by such Contributor to use, reproduce, make available,
+    modify, display, perform, distribute, and otherwise exploit its
+    Contributions, either on an unmodified basis, with Modifications, or
+    as part of a Larger Work; and
+
+(b) under Patent Claims of such Contributor to make, use, sell, offer
+    for sale, have made, import, and otherwise transfer either its
+    Contributions or its Contributor Version.
+
+2.2. Effective Date
+
+The licenses granted in Section 2.1 with respect to any Contribution
+become effective for each Contribution on the date the Contributor first
+distributes such Contribution.
+
+2.3. Limitations on Grant Scope
+
+The licenses granted in this Section 2 are the only rights granted under
+this License. No additional rights or licenses will be implied from the
+distribution or licensing of Covered Software under this License.
+Notwithstanding Section 2.1(b) above, no patent license is granted by a
+Contributor:
+
+(a) for any code that a Contributor has removed from Covered Software;
+    or
+
+(b) for infringements caused by: (i) Your and any other third party's
+    modifications of Covered Software, or (ii) the combination of its
+    Contributions with other software (except as part of its Contributor
+    Version); or
+
+(c) under Patent Claims infringed by Covered Software in the absence of
+    its Contributions.
+
+This License does not grant any rights in the trademarks, service marks,
+or logos of any Contributor (except as may be necessary to comply with
+the notice requirements in Section 3.4).
+
+2.4. Subsequent Licenses
+
+No Contributor makes additional grants as a result of Your choice to
+distribute the Covered Software under a subsequent version of this
+License (see Section 10.2) or under the terms of a Secondary License (if
+permitted under the terms of Section 3.3).
+
+2.5. Representation
+
+Each Contributor represents that the Contributor believes its
+Contributions are its original creation(s) or it has sufficient rights
+to grant the rights to its Contributions conveyed by this License.
+
+2.6. Fair Use
+
+This License is not intended to limit any rights You have under
+applicable copyright doctrines of fair use, fair dealing, or other
+equivalents.
+
+2.7. Conditions
+
+Sections 3.1, 3.2, 3.3, and 3.4 are conditions of the licenses granted
+in Section 2.1.
+
+3. Responsibilities
+-------------------
+
+3.1. Distribution of Source Form
+
+All distribution of Covered Software in Source Code Form, including any
+Modifications that You create or to which You contribute, must be under
+the terms of this License. You must inform recipients that the Source
+Code Form of the Covered Software is governed by the terms of this
+License, and how they can obtain a copy of this License. You may not
+attempt to alter or restrict the recipients' rights in the Source Code
+Form.
+
+3.2. Distribution of Executable Form
+
+If You distribute Covered Software in Executable Form then:
+
+(a) such Covered Software must also be made available in Source Code
+    Form, as described in Section 3.1, and You must inform recipients of
+    the Executable Form how they can obtain a copy of such Source Code
+    Form by reasonable means in a timely manner, at a charge no more
+    than the cost of distribution to the recipient; and
+
+(b) You may distribute such Executable Form under the terms of this
+    License, or sublicense it under different terms, provided that the
+    license for the Executable Form does not attempt to limit or alter
+    the recipients' rights in the Source Code Form under this License.
+
+3.3. Distribution of a Larger Work
+
+You may create and distribute a Larger Work under terms of Your choice,
+provided that You also comply with the requirements of this License for
+the Covered Software. If the Larger Work is a combination of Covered
+Software with a work governed by one or more Secondary Licenses, and the
+Covered Software is not Incompatible With Secondary Licenses, this
+License permits You to additionally distribute such Covered Software
+under the terms of such Secondary License(s), so that the recipient of
+the Larger Work may, at their option, further distribute the Covered
+Software under the terms of either this License or such Secondary
+License(s).
+
+3.4. Notices
+
+You may not remove or alter the substance of any license notices
+(including copyright notices, patent notices, disclaimers of warranty,
+or limitations of liability) contained within the Source Code Form of
+the Covered Software, except that You may alter any license notices to
+the extent required to remedy known factual inaccuracies.
+
+3.5. Application of Additional Terms
+
+You may choose to offer, and to charge a fee for, warranty, support,
+indemnity or liability obligations to one or more recipients of Covered
+Software. However, You may do so only on Your own behalf, and not on
+behalf of any Contributor. You must make it absolutely clear that any
+such warranty, support, indemnity, or liability obligation is offered by
+You alone, and You hereby agree to indemnify every Contributor for any
+liability incurred by such Contributor as a result of warranty, support,
+indemnity or liability terms You offer. You may include additional
+disclaimers of warranty and limitations of liability specific to any
+jurisdiction.
+
+4. Inability to Comply Due to Statute or Regulation
+---------------------------------------------------
+
+If it is impossible for You to comply with any of the terms of this
+License with respect to some or all of the Covered Software due to
+statute, judicial order, or regulation then You must: (a) comply with
+the terms of this License to the maximum extent possible; and (b)
+describe the limitations and the code they affect. Such description must
+be placed in a text file included with all distributions of the Covered
+Software under this License. Except to the extent prohibited by statute
+or regulation, such description must be sufficiently detailed for a
+recipient of ordinary skill to be able to understand it.
+
+5. Termination
+--------------
+
+5.1. The rights granted under this License will terminate automatically
+if You fail to comply with any of its terms. However, if You become
+compliant, then the rights granted under this License from a particular
+Contributor are reinstated (a) provisionally, unless and until such
+Contributor explicitly and finally terminates Your grants, and (b) on an
+ongoing basis, if such Contributor fails to notify You of the
+non-compliance by some reasonable means prior to 60 days after You have
+come back into compliance. Moreover, Your grants from a particular
+Contributor are reinstated on an ongoing basis if such Contributor
+notifies You of the non-compliance by some reasonable means, this is the
+first time You have received notice of non-compliance with this License
+from such Contributor, and You become compliant prior to 30 days after
+Your receipt of the notice.
+
+5.2. If You initiate litigation against any entity by asserting a patent
+infringement claim (excluding declaratory judgment actions,
+counter-claims, and cross-claims) alleging that a Contributor Version
+directly or indirectly infringes any patent, then the rights granted to
+You by any and all Contributors for the Covered Software under Section
+2.1 of this License shall terminate.
+
+5.3. In the event of termination under Sections 5.1 or 5.2 above, all
+end user license agreements (excluding distributors and resellers) which
+have been validly granted by You or Your distributors under this License
+prior to termination shall survive termination.
+
+************************************************************************
+*                                                                      *
+*  6. Disclaimer of Warranty                                           *
+*  -------------------------                                           *
+*                                                                      *
+*  Covered Software is provided under this License on an "as is"       *
+*  basis, without warranty of any kind, either expressed, implied, or  *
+*  statutory, including, without limitation, warranties that the       *
+*  Covered Software is free of defects, merchantable, fit for a        *
+*  particular purpose or non-infringing. The entire risk as to the     *
+*  quality and performance of the Covered Software is with You.        *
+*  Should any Covered Software prove defective in any respect, You     *
+*  (not any Contributor) assume the cost of any necessary servicing,   *
+*  repair, or correction. This disclaimer of warranty constitutes an   *
+*  essential part of this License. No use of any Covered Software is   *
+*  authorized under this License except under this disclaimer.         *
+*                                                                      *
+************************************************************************
+
+************************************************************************
+*                                                                      *
+*  7. Limitation of Liability                                          *
+*  --------------------------                                          *
+*                                                                      *
+*  Under no circumstances and under no legal theory, whether tort      *
+*  (including negligence), contract, or otherwise, shall any           *
+*  Contributor, or anyone who distributes Covered Software as          *
+*  permitted above, be liable to You for any direct, indirect,         *
+*  special, incidental, or consequential damages of any character      *
+*  including, without limitation, damages for lost profits, loss of    *
+*  goodwill, work stoppage, computer failure or malfunction, or any    *
+*  and all other commercial damages or losses, even if such party      *
+*  shall have been informed of the possibility of such damages. This   *
+*  limitation of liability shall not apply to liability for death or   *
+*  personal injury resulting from such party's negligence to the       *
+*  extent applicable law prohibits such limitation. Some               *
+*  jurisdictions do not allow the exclusion or limitation of           *
+*  incidental or consequential damages, so this exclusion and          *
+*  limitation may not apply to You.                                    *
+*                                                                      *
+************************************************************************
+
+8. Litigation
+-------------
+
+Any litigation relating to this License may be brought only in the
+courts of a jurisdiction where the defendant maintains its principal
+place of business and such litigation shall be governed by laws of that
+jurisdiction, without reference to its conflict-of-law provisions.
+Nothing in this Section shall prevent a party's ability to bring
+cross-claims or counter-claims.
+
+9. Miscellaneous
+----------------
+
+This License represents the complete agreement concerning the subject
+matter hereof. If any provision of this License is held to be
+unenforceable, such provision shall be reformed only to the extent
+necessary to make it enforceable. Any law or regulation which provides
+that the language of a contract shall be construed against the drafter
+shall not be used to construe this License against a Contributor.
+
+10. Versions of the License
+---------------------------
+
+10.1. New Versions
+
+Mozilla Foundation is the license steward. Except as provided in Section
+10.3, no one other than the license steward has the right to modify or
+publish new versions of this License. Each version will be given a
+distinguishing version number.
+
+10.2. Effect of New Versions
+
+You may distribute the Covered Software under the terms of the version
+of the License under which You originally received the Covered Software,
+or under the terms of any subsequent version published by the license
+steward.
+
+10.3. Modified Versions
+
+If you create software not governed by this License, and you want to
+create a new license for such software, you may create and use a
+modified version of this License if you rename the license and remove
+any references to the name of the license steward (except to note that
+such modified license differs from this License).
+
+10.4. Distributing Source Code Form that is Incompatible With Secondary
+Licenses
+
+If You choose to distribute Source Code Form that is Incompatible With
+Secondary Licenses under the terms of this version of the License, the
+notice described in Exhibit B of this License must be attached.
+
+Exhibit A - Source Code Form License Notice
+-------------------------------------------
+
+  This Source Code Form is subject to the terms of the Mozilla Public
+  License, v. 2.0. If a copy of the MPL was not distributed with this
+  file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+If it is not possible or desirable to put the notice in a particular
+file, then You may include the notice in a location (such as a LICENSE
+file in a relevant directory) where a recipient would be likely to look
+for such a notice.
+
+You may add additional accurate notices of copyright ownership.
+
+Exhibit B - "Incompatible With Secondary Licenses" Notice
+---------------------------------------------------------
+
+  This Source Code Form is "Incompatible With Secondary Licenses", as
+  defined by the Mozilla Public License, v. 2.0.
+
+----------------------------------------------------------------------
+Following applies to:
+./doc/UsingIntelMKL.dox
+./doc/UsingIntelMKL.dox
+./Eigen/src/Eigenvalues/ComplexSchur_MKL.h
+./Eigen/src/Eigenvalues/ComplexSchur_MKL.h
+./Eigen/src/Eigenvalues/SelfAdjointEigenSolver_MKL.h
+./Eigen/src/Eigenvalues/SelfAdjointEigenSolver_MKL.h
+./Eigen/src/Eigenvalues/RealSchur_MKL.h
+./Eigen/src/Eigenvalues/RealSchur_MKL.h
+./Eigen/src/LU/arch/Inverse_SSE.h
+./Eigen/src/LU/arch/Inverse_SSE.h
+./Eigen/src/LU/PartialPivLU_MKL.h
+./Eigen/src/LU/PartialPivLU_MKL.h
+./Eigen/src/QR/HouseholderQR_MKL.h
+./Eigen/src/QR/HouseholderQR_MKL.h
+./Eigen/src/QR/ColPivHouseholderQR_MKL.h
+./Eigen/src/QR/ColPivHouseholderQR_MKL.h
+./Eigen/src/SVD/JacobiSVD_MKL.h
+./Eigen/src/SVD/JacobiSVD_MKL.h
+./Eigen/src/PardisoSupport/PardisoSupport.h
+./Eigen/src/PardisoSupport/PardisoSupport.h
+./Eigen/src/Core/Assign_MKL.h
+./Eigen/src/Core/Assign_MKL.h
+./Eigen/src/Core/products/SelfadjointMatrixVector_MKL.h
+./Eigen/src/Core/products/SelfadjointMatrixVector_MKL.h
+./Eigen/src/Core/products/GeneralMatrixVector_MKL.h
+./Eigen/src/Core/products/GeneralMatrixVector_MKL.h
+./Eigen/src/Core/products/SelfadjointMatrixMatrix_MKL.h
+./Eigen/src/Core/products/SelfadjointMatrixMatrix_MKL.h
+./Eigen/src/Core/products/TriangularMatrixMatrix_MKL.h
+./Eigen/src/Core/products/TriangularMatrixMatrix_MKL.h
+./Eigen/src/Core/products/GeneralMatrixMatrix_MKL.h
+./Eigen/src/Core/products/GeneralMatrixMatrix_MKL.h
+./Eigen/src/Core/products/TriangularMatrixVector_MKL.h
+./Eigen/src/Core/products/TriangularMatrixVector_MKL.h
+./Eigen/src/Core/products/GeneralMatrixMatrixTriangular_MKL.h
+./Eigen/src/Core/products/GeneralMatrixMatrixTriangular_MKL.h
+./Eigen/src/Core/products/TriangularSolverMatrix_MKL.h
+./Eigen/src/Core/products/TriangularSolverMatrix_MKL.h
+./Eigen/src/Core/util/MKL_support.h
+./Eigen/src/Core/util/MKL_support.h
+./Eigen/src/Cholesky/LLT_MKL.h
+./Eigen/src/Cholesky/LLT_MKL.h
+
+/*
+ Copyright (c) 2011, Intel Corporation. All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the
+   distribution.
+ * Neither the name of Intel Corporation nor the names of its
+   contributors may be used to endorse or promote products derived
+   from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+----------------------------------------------------------------------
+Following applies to:
+./unsupported/Eigen/src/LevenbergMarquardt/LevenbergMarquardt.h
+./unsupported/Eigen/src/LevenbergMarquardt/LMcovar.h
+./unsupported/Eigen/src/LevenbergMarquardt/LMonestep.h
+./unsupported/Eigen/src/LevenbergMarquardt/LMpar.h
+./unsupported/Eigen/src/LevenbergMarquardt/LMqrsolv.h
+
+Minpack Copyright Notice (1999) University of Chicago.  All rights
+reserved
+
+Redistribution and use in source and binary forms, with or
+without modification, are permitted provided that the
+following conditions are met:
+
+1. Redistributions of source code must retain the above
+copyright notice, this list of conditions and the following
+disclaimer.
+
+2. Redistributions in binary form must reproduce the above
+copyright notice, this list of conditions and the following
+disclaimer in the documentation and/or other materials
+provided with the distribution.
+
+3. The end-user documentation included with the
+redistribution, if any, must include the following
+acknowledgment:
+
+   "This product includes software developed by the
+   University of Chicago, as Operator of Argonne National
+   Laboratory."
+
+Alternately, this acknowledgment may appear in the software
+itself, if and wherever such third-party acknowledgments
+normally appear.
+
+4. WARRANTY DISCLAIMER. THE SOFTWARE IS SUPPLIED "AS IS"
+WITHOUT WARRANTY OF ANY KIND. THE COPYRIGHT HOLDER, THE
+UNITED STATES, THE UNITED STATES DEPARTMENT OF ENERGY, AND
+THEIR EMPLOYEES: (1) DISCLAIM ANY WARRANTIES, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO ANY IMPLIED WARRANTIES
+OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, TITLE
+OR NON-INFRINGEMENT, (2) DO NOT ASSUME ANY LEGAL LIABILITY
+OR RESPONSIBILITY FOR THE ACCURACY, COMPLETENESS, OR
+USEFULNESS OF THE SOFTWARE, (3) DO NOT REPRESENT THAT USE OF
+THE SOFTWARE WOULD NOT INFRINGE PRIVATELY OWNED RIGHTS, (4)
+DO NOT WARRANT THAT THE SOFTWARE WILL FUNCTION
+UNINTERRUPTED, THAT IT IS ERROR-FREE OR THAT ANY ERRORS WILL
+BE CORRECTED.
+
+5. LIMITATION OF LIABILITY. IN NO EVENT WILL THE COPYRIGHT
+HOLDER, THE UNITED STATES, THE UNITED STATES DEPARTMENT OF
+ENERGY, OR THEIR EMPLOYEES: BE LIABLE FOR ANY INDIRECT,
+INCIDENTAL, CONSEQUENTIAL, SPECIAL OR PUNITIVE DAMAGES OF
+ANY KIND OR NATURE, INCLUDING BUT NOT LIMITED TO LOSS OF
+PROFITS OR LOSS OF DATA, FOR ANY REASON WHATSOEVER, WHETHER
+SUCH LIABILITY IS ASSERTED ON THE BASIS OF CONTRACT, TORT
+(INCLUDING NEGLIGENCE OR STRICT LIABILITY), OR OTHERWISE,
+EVEN IF ANY OF SAID PARTIES HAS BEEN WARNED OF THE
+POSSIBILITY OF SUCH LOSS OR DAMAGES.
+
+
+Copyright (c) 1992-2013 The University of Tennessee and The University
+                        of Tennessee Research Foundation.  All rights
+                        reserved.
+Copyright (c) 2000-2013 The University of California Berkeley. All
+                        rights reserved.
+Copyright (c) 2006-2013 The University of Colorado Denver.  All rights
+                        reserved.
+
+Following applies to:
+./lapack/*.c
+
+$COPYRIGHT$
+
+Additional copyrights may follow
+
+$HEADER$
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+- Redistributions of source code must retain the above copyright
+  notice, this list of conditions and the following disclaimer.
+
+- Redistributions in binary form must reproduce the above copyright
+  notice, this list of conditions and the following disclaimer listed
+  in this license in the documentation and/or other materials
+  provided with the distribution.
+
+- Neither the name of the copyright holders nor the names of its
+  contributors may be used to endorse or promote products derived from
+  this software without specific prior written permission.
+
+The copyright holders provide no reassurances that the source code
+provided does not infringe any patent, copyright, or any other
+intellectual property rights of third parties.  The copyright holders
+disclaim any liability to any recipient for claims brought against
+recipient by any third party for infringement of that party's
+intellectual property rights.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+----------------------------------------------------------------------
+Following applies to:
+
+./cmake/FindComputeCpp.cmake
+
+
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright [yyyy] [name of copyright owner]
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+
+--------------------------------------------------------------------------------
+
+--------------------------------------------------------------------------------
+== Farmhash
+
+// Copyright (c) 2014 Google, Inc.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+--------------------------------------------------------------------------------
+
+--------------------------------------------------------------------------------
+== Flatbuffers
+
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright [yyyy] [name of copyright owner]
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+
+--------------------------------------------------------------------------------
+
+--------------------------------------------------------------------------------
+== Gast
+
+Copyright (c) 2016, Serge Guelton
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+	Redistributions of source code must retain the above copyright notice, this
+	list of conditions and the following disclaimer.
+
+	Redistributions in binary form must reproduce the above copyright notice,
+	this list of conditions and the following disclaimer in the documentation
+	and/or other materials provided with the distribution.
+
+	Neither the name of HPCProject, Serge Guelton nor the names of its
+	contributors may be used to endorse or promote products derived from this
+	software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+--------------------------------------------------------------------------------
+
+--------------------------------------------------------------------------------
+== Gemmlowp
+
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright [yyyy] [name of copyright owner]
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+
+--------------------------------------------------------------------------------
+
+--------------------------------------------------------------------------------
+== google-pasta
+
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright [yyyy] [name of copyright owner]
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+
+--------------------------------------------------------------------------------
+
+--------------------------------------------------------------------------------
+== grpc
+
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright [yyyy] [name of copyright owner]
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+
+--------------------------------------------------------------------------------
+
+--------------------------------------------------------------------------------
+== grpcio
+
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright [yyyy] [name of copyright owner]
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+
+-----------------------------------------------------------
+
+BSD 3-Clause License
+
+Copyright 2016, Google Inc.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+1. Redistributions of source code must retain the above copyright notice,
+this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright notice,
+this list of conditions and the following disclaimer in the documentation
+and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from this
+software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+THE POSSIBILITY OF SUCH DAMAGE.
+
+-----------------------------------------------------------
+
+Mozilla Public License Version 2.0
+==================================
+
+1. Definitions
+--------------
+
+1.1. "Contributor"
+    means each individual or legal entity that creates, contributes to
+    the creation of, or owns Covered Software.
+
+1.2. "Contributor Version"
+    means the combination of the Contributions of others (if any) used
+    by a Contributor and that particular Contributor's Contribution.
+
+1.3. "Contribution"
+    means Covered Software of a particular Contributor.
+
+1.4. "Covered Software"
+    means Source Code Form to which the initial Contributor has attached
+    the notice in Exhibit A, the Executable Form of such Source Code
+    Form, and Modifications of such Source Code Form, in each case
+    including portions thereof.
+
+1.5. "Incompatible With Secondary Licenses"
+    means
+
+    (a) that the initial Contributor has attached the notice described
+        in Exhibit B to the Covered Software; or
+
+    (b) that the Covered Software was made available under the terms of
+        version 1.1 or earlier of the License, but not also under the
+        terms of a Secondary License.
+
+1.6. "Executable Form"
+    means any form of the work other than Source Code Form.
+
+1.7. "Larger Work"
+    means a work that combines Covered Software with other material, in 
+    a separate file or files, that is not Covered Software.
+
+1.8. "License"
+    means this document.
+
+1.9. "Licensable"
+    means having the right to grant, to the maximum extent possible,
+    whether at the time of the initial grant or subsequently, any and
+    all of the rights conveyed by this License.
+
+1.10. "Modifications"
+    means any of the following:
+
+    (a) any file in Source Code Form that results from an addition to,
+        deletion from, or modification of the contents of Covered
+        Software; or
+
+    (b) any new file in Source Code Form that contains any Covered
+        Software.
+
+1.11. "Patent Claims" of a Contributor
+    means any patent claim(s), including without limitation, method,
+    process, and apparatus claims, in any patent Licensable by such
+    Contributor that would be infringed, but for the grant of the
+    License, by the making, using, selling, offering for sale, having
+    made, import, or transfer of either its Contributions or its
+    Contributor Version.
+
+1.12. "Secondary License"
+    means either the GNU General Public License, Version 2.0, the GNU
+    Lesser General Public License, Version 2.1, the GNU Affero General
+    Public License, Version 3.0, or any later versions of those
+    licenses.
+
+1.13. "Source Code Form"
+    means the form of the work preferred for making modifications.
+
+1.14. "You" (or "Your")
+    means an individual or a legal entity exercising rights under this
+    License. For legal entities, "You" includes any entity that
+    controls, is controlled by, or is under common control with You. For
+    purposes of this definition, "control" means (a) the power, direct
+    or indirect, to cause the direction or management of such entity,
+    whether by contract or otherwise, or (b) ownership of more than
+    fifty percent (50%) of the outstanding shares or beneficial
+    ownership of such entity.
+
+2. License Grants and Conditions
+--------------------------------
+
+2.1. Grants
+
+Each Contributor hereby grants You a world-wide, royalty-free,
+non-exclusive license:
+
+(a) under intellectual property rights (other than patent or trademark)
+    Licensable by such Contributor to use, reproduce, make available,
+    modify, display, perform, distribute, and otherwise exploit its
+    Contributions, either on an unmodified basis, with Modifications, or
+    as part of a Larger Work; and
+
+(b) under Patent Claims of such Contributor to make, use, sell, offer
+    for sale, have made, import, and otherwise transfer either its
+    Contributions or its Contributor Version.
+
+2.2. Effective Date
+
+The licenses granted in Section 2.1 with respect to any Contribution
+become effective for each Contribution on the date the Contributor first
+distributes such Contribution.
+
+2.3. Limitations on Grant Scope
+
+The licenses granted in this Section 2 are the only rights granted under
+this License. No additional rights or licenses will be implied from the
+distribution or licensing of Covered Software under this License.
+Notwithstanding Section 2.1(b) above, no patent license is granted by a
+Contributor:
+
+(a) for any code that a Contributor has removed from Covered Software;
+    or
+
+(b) for infringements caused by: (i) Your and any other third party's
+    modifications of Covered Software, or (ii) the combination of its
+    Contributions with other software (except as part of its Contributor
+    Version); or
+
+(c) under Patent Claims infringed by Covered Software in the absence of
+    its Contributions.
+
+This License does not grant any rights in the trademarks, service marks,
+or logos of any Contributor (except as may be necessary to comply with
+the notice requirements in Section 3.4).
+
+2.4. Subsequent Licenses
+
+No Contributor makes additional grants as a result of Your choice to
+distribute the Covered Software under a subsequent version of this
+License (see Section 10.2) or under the terms of a Secondary License (if
+permitted under the terms of Section 3.3).
+
+2.5. Representation
+
+Each Contributor represents that the Contributor believes its
+Contributions are its original creation(s) or it has sufficient rights
+to grant the rights to its Contributions conveyed by this License.
+
+2.6. Fair Use
+
+This License is not intended to limit any rights You have under
+applicable copyright doctrines of fair use, fair dealing, or other
+equivalents.
+
+2.7. Conditions
+
+Sections 3.1, 3.2, 3.3, and 3.4 are conditions of the licenses granted
+in Section 2.1.
+
+3. Responsibilities
+-------------------
+
+3.1. Distribution of Source Form
+
+All distribution of Covered Software in Source Code Form, including any
+Modifications that You create or to which You contribute, must be under
+the terms of this License. You must inform recipients that the Source
+Code Form of the Covered Software is governed by the terms of this
+License, and how they can obtain a copy of this License. You may not
+attempt to alter or restrict the recipients' rights in the Source Code
+Form.
+
+3.2. Distribution of Executable Form
+
+If You distribute Covered Software in Executable Form then:
+
+(a) such Covered Software must also be made available in Source Code
+    Form, as described in Section 3.1, and You must inform recipients of
+    the Executable Form how they can obtain a copy of such Source Code
+    Form by reasonable means in a timely manner, at a charge no more
+    than the cost of distribution to the recipient; and
+
+(b) You may distribute such Executable Form under the terms of this
+    License, or sublicense it under different terms, provided that the
+    license for the Executable Form does not attempt to limit or alter
+    the recipients' rights in the Source Code Form under this License.
+
+3.3. Distribution of a Larger Work
+
+You may create and distribute a Larger Work under terms of Your choice,
+provided that You also comply with the requirements of this License for
+the Covered Software. If the Larger Work is a combination of Covered
+Software with a work governed by one or more Secondary Licenses, and the
+Covered Software is not Incompatible With Secondary Licenses, this
+License permits You to additionally distribute such Covered Software
+under the terms of such Secondary License(s), so that the recipient of
+the Larger Work may, at their option, further distribute the Covered
+Software under the terms of either this License or such Secondary
+License(s).
+
+3.4. Notices
+
+You may not remove or alter the substance of any license notices
+(including copyright notices, patent notices, disclaimers of warranty,
+or limitations of liability) contained within the Source Code Form of
+the Covered Software, except that You may alter any license notices to
+the extent required to remedy known factual inaccuracies.
+
+3.5. Application of Additional Terms
+
+You may choose to offer, and to charge a fee for, warranty, support,
+indemnity or liability obligations to one or more recipients of Covered
+Software. However, You may do so only on Your own behalf, and not on
+behalf of any Contributor. You must make it absolutely clear that any
+such warranty, support, indemnity, or liability obligation is offered by
+You alone, and You hereby agree to indemnify every Contributor for any
+liability incurred by such Contributor as a result of warranty, support,
+indemnity or liability terms You offer. You may include additional
+disclaimers of warranty and limitations of liability specific to any
+jurisdiction.
+
+4. Inability to Comply Due to Statute or Regulation
+---------------------------------------------------
+
+If it is impossible for You to comply with any of the terms of this
+License with respect to some or all of the Covered Software due to
+statute, judicial order, or regulation then You must: (a) comply with
+the terms of this License to the maximum extent possible; and (b)
+describe the limitations and the code they affect. Such description must
+be placed in a text file included with all distributions of the Covered
+Software under this License. Except to the extent prohibited by statute
+or regulation, such description must be sufficiently detailed for a
+recipient of ordinary skill to be able to understand it.
+
+5. Termination
+--------------
+
+5.1. The rights granted under this License will terminate automatically
+if You fail to comply with any of its terms. However, if You become
+compliant, then the rights granted under this License from a particular
+Contributor are reinstated (a) provisionally, unless and until such
+Contributor explicitly and finally terminates Your grants, and (b) on an
+ongoing basis, if such Contributor fails to notify You of the
+non-compliance by some reasonable means prior to 60 days after You have
+come back into compliance. Moreover, Your grants from a particular
+Contributor are reinstated on an ongoing basis if such Contributor
+notifies You of the non-compliance by some reasonable means, this is the
+first time You have received notice of non-compliance with this License
+from such Contributor, and You become compliant prior to 30 days after
+Your receipt of the notice.
+
+5.2. If You initiate litigation against any entity by asserting a patent
+infringement claim (excluding declaratory judgment actions,
+counter-claims, and cross-claims) alleging that a Contributor Version
+directly or indirectly infringes any patent, then the rights granted to
+You by any and all Contributors for the Covered Software under Section
+2.1 of this License shall terminate.
+
+5.3. In the event of termination under Sections 5.1 or 5.2 above, all
+end user license agreements (excluding distributors and resellers) which
+have been validly granted by You or Your distributors under this License
+prior to termination shall survive termination.
+
+************************************************************************
+*                                                                      *
+*  6. Disclaimer of Warranty                                           *
+*  -------------------------                                           *
+*                                                                      *
+*  Covered Software is provided under this License on an "as is"       *
+*  basis, without warranty of any kind, either expressed, implied, or  *
+*  statutory, including, without limitation, warranties that the       *
+*  Covered Software is free of defects, merchantable, fit for a        *
+*  particular purpose or non-infringing. The entire risk as to the     *
+*  quality and performance of the Covered Software is with You.        *
+*  Should any Covered Software prove defective in any respect, You     *
+*  (not any Contributor) assume the cost of any necessary servicing,   *
+*  repair, or correction. This disclaimer of warranty constitutes an   *
+*  essential part of this License. No use of any Covered Software is   *
+*  authorized under this License except under this disclaimer.         *
+*                                                                      *
+************************************************************************
+
+************************************************************************
+*                                                                      *
+*  7. Limitation of Liability                                          *
+*  --------------------------                                          *
+*                                                                      *
+*  Under no circumstances and under no legal theory, whether tort      *
+*  (including negligence), contract, or otherwise, shall any           *
+*  Contributor, or anyone who distributes Covered Software as          *
+*  permitted above, be liable to You for any direct, indirect,         *
+*  special, incidental, or consequential damages of any character      *
+*  including, without limitation, damages for lost profits, loss of    *
+*  goodwill, work stoppage, computer failure or malfunction, or any    *
+*  and all other commercial damages or losses, even if such party      *
+*  shall have been informed of the possibility of such damages. This   *
+*  limitation of liability shall not apply to liability for death or   *
+*  personal injury resulting from such party's negligence to the       *
+*  extent applicable law prohibits such limitation. Some               *
+*  jurisdictions do not allow the exclusion or limitation of           *
+*  incidental or consequential damages, so this exclusion and          *
+*  limitation may not apply to You.                                    *
+*                                                                      *
+************************************************************************
+
+8. Litigation
+-------------
+
+Any litigation relating to this License may be brought only in the
+courts of a jurisdiction where the defendant maintains its principal
+place of business and such litigation shall be governed by laws of that
+jurisdiction, without reference to its conflict-of-law provisions.
+Nothing in this Section shall prevent a party's ability to bring
+cross-claims or counter-claims.
+
+9. Miscellaneous
+----------------
+
+This License represents the complete agreement concerning the subject
+matter hereof. If any provision of this License is held to be
+unenforceable, such provision shall be reformed only to the extent
+necessary to make it enforceable. Any law or regulation which provides
+that the language of a contract shall be construed against the drafter
+shall not be used to construe this License against a Contributor.
+
+10. Versions of the License
+---------------------------
+
+10.1. New Versions
+
+Mozilla Foundation is the license steward. Except as provided in Section
+10.3, no one other than the license steward has the right to modify or
+publish new versions of this License. Each version will be given a
+distinguishing version number.
+
+10.2. Effect of New Versions
+
+You may distribute the Covered Software under the terms of the version
+of the License under which You originally received the Covered Software,
+or under the terms of any subsequent version published by the license
+steward.
+
+10.3. Modified Versions
+
+If you create software not governed by this License, and you want to
+create a new license for such software, you may create and use a
+modified version of this License if you rename the license and remove
+any references to the name of the license steward (except to note that
+such modified license differs from this License).
+
+10.4. Distributing Source Code Form that is Incompatible With Secondary
+Licenses
+
+If You choose to distribute Source Code Form that is Incompatible With
+Secondary Licenses under the terms of this version of the License, the
+notice described in Exhibit B of this License must be attached.
+
+Exhibit A - Source Code Form License Notice
+-------------------------------------------
+
+  This Source Code Form is subject to the terms of the Mozilla Public
+  License, v. 2.0. If a copy of the MPL was not distributed with this
+  file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+If it is not possible or desirable to put the notice in a particular
+file, then You may include the notice in a location (such as a LICENSE
+file in a relevant directory) where a recipient would be likely to look
+for such a notice.
+
+You may add additional accurate notices of copyright ownership.
+
+Exhibit B - "Incompatible With Secondary Licenses" Notice
+---------------------------------------------------------
+
+  This Source Code Form is "Incompatible With Secondary Licenses", as
+  defined by the Mozilla Public License, v. 2.0.
+
+--------------------------------------------------------------------------------
+
+--------------------------------------------------------------------------------
+== H5py
+
+Copyright Notice and Statement for the h5py Project
+===================================================
+
+    Copyright (c) 2008-2013 Andrew Collette and contributors
+    http://www.h5py.org
+    All rights reserved.
+
+    Redistribution and use in source and binary forms, with or without
+    modification, are permitted provided that the following conditions are
+    met:
+
+    a. Redistributions of source code must retain the above copyright
+       notice, this list of conditions and the following disclaimer.
+
+    b. Redistributions in binary form must reproduce the above copyright
+       notice, this list of conditions and the following disclaimer in the
+       documentation and/or other materials provided with the
+       distribution.
+
+    c. Neither the name of the author nor the names of contributors may 
+       be used to endorse or promote products derived from this software 
+       without specific prior written permission.
+
+    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+    A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+    OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+    SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+    LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+    DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+    THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+    OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+-------------------------------------------------------------------------------
+
+HDF5 (Hierarchical Data Format 5) Software Library and Utilities
+Copyright 2006-2007 by The HDF Group (THG).
+
+NCSA HDF5 (Hierarchical Data Format 5) Software Library and Utilities
+Copyright 1998-2006 by the Board of Trustees of the University of Illinois.
+
+All rights reserved.
+
+Contributors: National Center for Supercomputing Applications (NCSA)
+at the University of Illinois, Fortner Software, Unidata Program
+Center (netCDF), The Independent JPEG Group (JPEG), Jean-loup Gailly
+and Mark Adler (gzip), and Digital Equipment Corporation (DEC).
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted for any purpose (including commercial
+purposes) provided that the following conditions are met:
+
+   1. Redistributions of source code must retain the above copyright
+notice, this list of conditions, and the following disclaimer.
+   2. Redistributions in binary form must reproduce the above
+copyright notice, this list of conditions, and the following
+disclaimer in the documentation and/or materials provided with the
+distribution.
+   3. In addition, redistributions of modified forms of the source or
+binary code must carry prominent notices stating that the original
+code was changed and the date of the change.
+   4. All publications or advertising materials mentioning features or
+use of this software are asked, but not required, to acknowledge that
+it was developed by The HDF Group and by the National Center for
+Supercomputing Applications at the University of Illinois at
+Urbana-Champaign and credit the contributors.
+   5. Neither the name of The HDF Group, the name of the University,
+nor the name of any Contributor may be used to endorse or promote
+products derived from this software without specific prior written
+permission from THG, the University, or the Contributor, respectively.
+
+DISCLAIMER: THIS SOFTWARE IS PROVIDED BY THE HDF GROUP (THG) AND THE
+CONTRIBUTORS "AS IS" WITH NO WARRANTY OF ANY KIND, EITHER EXPRESSED OR
+IMPLIED. In no event shall THG or the Contributors be liable for any
+damages suffered by the users arising out of the use of this software,
+even if advised of the possibility of such damage.
+
+Portions of HDF5 were developed with support from the University of
+California, Lawrence Livermore National Laboratory (UC LLNL). The
+following statement applies to those portions of the product and must
+be retained in any redistribution of source code, binaries,
+documentation, and/or accompanying materials:
+
+This work was partially produced at the University of California,
+Lawrence Livermore National Laboratory (UC LLNL) under contract
+no. W-7405-ENG-48 (Contract 48) between the U.S. Department of Energy
+(DOE) and The Regents of the University of California (University) for
+the operation of UC LLNL.
+
+DISCLAIMER: This work was prepared as an account of work sponsored by
+an agency of the United States Government. Neither the United States
+Government nor the University of California nor any of their
+employees, makes any warranty, express or implied, or assumes any
+liability or responsibility for the accuracy, completeness, or
+usefulness of any information, apparatus, product, or process
+disclosed, or represents that its use would not infringe privately-
+owned rights. Reference herein to any specific commercial products,
+process, or service by trade name, trademark, manufacturer, or
+otherwise, does not necessarily constitute or imply its endorsement,
+recommendation, or favoring by the United States Government or the
+University of California. The views and opinions of authors expressed
+herein do not necessarily state or reflect those of the United States
+Government or the University of California, and shall not be used for
+advertising or product endorsement purposes.
+
+-------------------------------------------------------------------------------
+
+Copyright Notice and Statement for PyTables Software Library and Utilities:
+
+Copyright (c) 2002, 2003, 2004  Francesc Altet
+Copyright (c) 2005, 2006, 2007  Carabos Coop. V.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+a. Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+b. Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the
+   distribution.
+
+c. Neither the name of the Carabos Coop. V. nor the names of its
+   contributors may be used to endorse or promote products derived
+   from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+--------------------------------------------------------------------------------
+
+--------------------------------------------------------------------------------
+== Highwayhash
+
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright [yyyy] [name of copyright owner]
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+
+--------------------------------------------------------------------------------
+
+--------------------------------------------------------------------------------
+== icu
+
+UNICODE, INC. LICENSE AGREEMENT - DATA FILES AND SOFTWARE
+
+See Terms of Use <https://www.unicode.org/copyright.html>
+for definitions of Unicode Inc.’s Data Files and Software.
+
+NOTICE TO USER: Carefully read the following legal agreement.
+BY DOWNLOADING, INSTALLING, COPYING OR OTHERWISE USING UNICODE INC.'S
+DATA FILES ("DATA FILES"), AND/OR SOFTWARE ("SOFTWARE"),
+YOU UNEQUIVOCALLY ACCEPT, AND AGREE TO BE BOUND BY, ALL OF THE
+TERMS AND CONDITIONS OF THIS AGREEMENT.
+IF YOU DO NOT AGREE, DO NOT DOWNLOAD, INSTALL, COPY, DISTRIBUTE OR USE
+THE DATA FILES OR SOFTWARE.
+
+COPYRIGHT AND PERMISSION NOTICE
+
+Copyright © 1991-2023 Unicode, Inc. All rights reserved.
+Distributed under the Terms of Use in https://www.unicode.org/copyright.html.
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of the Unicode data files and any associated documentation
+(the "Data Files") or Unicode software and any associated documentation
+(the "Software") to deal in the Data Files or Software
+without restriction, including without limitation the rights to use,
+copy, modify, merge, publish, distribute, and/or sell copies of
+the Data Files or Software, and to permit persons to whom the Data Files
+or Software are furnished to do so, provided that either
+(a) this copyright and permission notice appear with all copies
+of the Data Files or Software, or
+(b) this copyright and permission notice appear in associated
+Documentation.
+
+THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF
+ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT OF THIRD PARTY RIGHTS.
+IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS
+NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL
+DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
+DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
+TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
+PERFORMANCE OF THE DATA FILES OR SOFTWARE.
+
+Except as contained in this notice, the name of a copyright holder
+shall not be used in advertising or otherwise to promote the sale,
+use or other dealings in these Data Files or Software without prior
+written authorization of the copyright holder.
+
+----------------------------------------------------------------------
+
+Third-Party Software Licenses
+
+This section contains third-party software notices and/or additional
+terms for licensed third-party software components included within ICU
+libraries.
+
+----------------------------------------------------------------------
+
+ICU License - ICU 1.8.1 to ICU 57.1
+
+COPYRIGHT AND PERMISSION NOTICE
+
+Copyright (c) 1995-2016 International Business Machines Corporation and others
+All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, and/or sell copies of the Software, and to permit persons
+to whom the Software is furnished to do so, provided that the above
+copyright notice(s) and this permission notice appear in all copies of
+the Software and that both the above copyright notice(s) and this
+permission notice appear in supporting documentation.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT
+OF THIRD PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+HOLDERS INCLUDED IN THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY
+SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER
+RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF
+CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+
+Except as contained in this notice, the name of a copyright holder
+shall not be used in advertising or otherwise to promote the sale, use
+or other dealings in this Software without prior written authorization
+of the copyright holder.
+
+All trademarks and registered trademarks mentioned herein are the
+property of their respective owners.
+
+----------------------------------------------------------------------
+
+Chinese/Japanese Word Break Dictionary Data (cjdict.txt)
+
+ #     The Google Chrome software developed by Google is licensed under
+ # the BSD license. Other software included in this distribution is
+ # provided under other licenses, as set forth below.
+ #
+ #  The BSD License
+ #  http://opensource.org/licenses/bsd-license.php
+ #  Copyright (C) 2006-2008, Google Inc.
+ #
+ #  All rights reserved.
+ #
+ #  Redistribution and use in source and binary forms, with or without
+ # modification, are permitted provided that the following conditions are met:
+ #
+ #  Redistributions of source code must retain the above copyright notice,
+ # this list of conditions and the following disclaimer.
+ #  Redistributions in binary form must reproduce the above
+ # copyright notice, this list of conditions and the following
+ # disclaimer in the documentation and/or other materials provided with
+ # the distribution.
+ #  Neither the name of  Google Inc. nor the names of its
+ # contributors may be used to endorse or promote products derived from
+ # this software without specific prior written permission.
+ #
+ #
+ #  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
+ # CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
+ # INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ # MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ # LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ # BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ #
+ #
+ #  The word list in cjdict.txt are generated by combining three word lists
+ # listed below with further processing for compound word breaking. The
+ # frequency is generated with an iterative training against Google web
+ # corpora.
+ #
+ #  * Libtabe (Chinese)
+ #    - https://sourceforge.net/project/?group_id=1519
+ #    - Its license terms and conditions are shown below.
+ #
+ #  * IPADIC (Japanese)
+ #    - http://chasen.aist-nara.ac.jp/chasen/distribution.html
+ #    - Its license terms and conditions are shown below.
+ #
+ #  ---------COPYING.libtabe ---- BEGIN--------------------
+ #
+ #  /*
+ #   * Copyright (c) 1999 TaBE Project.
+ #   * Copyright (c) 1999 Pai-Hsiang Hsiao.
+ #   * All rights reserved.
+ #   *
+ #   * Redistribution and use in source and binary forms, with or without
+ #   * modification, are permitted provided that the following conditions
+ #   * are met:
+ #   *
+ #   * . Redistributions of source code must retain the above copyright
+ #   *   notice, this list of conditions and the following disclaimer.
+ #   * . Redistributions in binary form must reproduce the above copyright
+ #   *   notice, this list of conditions and the following disclaimer in
+ #   *   the documentation and/or other materials provided with the
+ #   *   distribution.
+ #   * . Neither the name of the TaBE Project nor the names of its
+ #   *   contributors may be used to endorse or promote products derived
+ #   *   from this software without specific prior written permission.
+ #   *
+ #   * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ #   * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ #   * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ #   * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ #   * REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ #   * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ #   * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ #   * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ #   * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ #   * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ #   * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ #   * OF THE POSSIBILITY OF SUCH DAMAGE.
+ #   */
+ #
+ #  /*
+ #   * Copyright (c) 1999 Computer Systems and Communication Lab,
+ #   *                    Institute of Information Science, Academia
+ #   *                    Sinica. All rights reserved.
+ #   *
+ #   * Redistribution and use in source and binary forms, with or without
+ #   * modification, are permitted provided that the following conditions
+ #   * are met:
+ #   *
+ #   * . Redistributions of source code must retain the above copyright
+ #   *   notice, this list of conditions and the following disclaimer.
+ #   * . Redistributions in binary form must reproduce the above copyright
+ #   *   notice, this list of conditions and the following disclaimer in
+ #   *   the documentation and/or other materials provided with the
+ #   *   distribution.
+ #   * . Neither the name of the Computer Systems and Communication Lab
+ #   *   nor the names of its contributors may be used to endorse or
+ #   *   promote products derived from this software without specific
+ #   *   prior written permission.
+ #   *
+ #   * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ #   * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ #   * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ #   * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ #   * REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ #   * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ #   * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ #   * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ #   * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ #   * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ #   * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ #   * OF THE POSSIBILITY OF SUCH DAMAGE.
+ #   */
+ #
+ #  Copyright 1996 Chih-Hao Tsai @ Beckman Institute,
+ #      University of Illinois
+ #  c-tsai4@uiuc.edu  http://casper.beckman.uiuc.edu/~c-tsai4
+ #
+ #  ---------------COPYING.libtabe-----END--------------------------------
+ #
+ #
+ #  ---------------COPYING.ipadic-----BEGIN-------------------------------
+ #
+ #  Copyright 2000, 2001, 2002, 2003 Nara Institute of Science
+ #  and Technology.  All Rights Reserved.
+ #
+ #  Use, reproduction, and distribution of this software is permitted.
+ #  Any copy of this software, whether in its original form or modified,
+ #  must include both the above copyright notice and the following
+ #  paragraphs.
+ #
+ #  Nara Institute of Science and Technology (NAIST),
+ #  the copyright holders, disclaims all warranties with regard to this
+ #  software, including all implied warranties of merchantability and
+ #  fitness, in no event shall NAIST be liable for
+ #  any special, indirect or consequential damages or any damages
+ #  whatsoever resulting from loss of use, data or profits, whether in an
+ #  action of contract, negligence or other tortuous action, arising out
+ #  of or in connection with the use or performance of this software.
+ #
+ #  A large portion of the dictionary entries
+ #  originate from ICOT Free Software.  The following conditions for ICOT
+ #  Free Software applies to the current dictionary as well.
+ #
+ #  Each User may also freely distribute the Program, whether in its
+ #  original form or modified, to any third party or parties, PROVIDED
+ #  that the provisions of Section 3 ("NO WARRANTY") will ALWAYS appear
+ #  on, or be attached to, the Program, which is distributed substantially
+ #  in the same form as set out herein and that such intended
+ #  distribution, if actually made, will neither violate or otherwise
+ #  contravene any of the laws and regulations of the countries having
+ #  jurisdiction over the User or the intended distribution itself.
+ #
+ #  NO WARRANTY
+ #
+ #  The program was produced on an experimental basis in the course of the
+ #  research and development conducted during the project and is provided
+ #  to users as so produced on an experimental basis.  Accordingly, the
+ #  program is provided without any warranty whatsoever, whether express,
+ #  implied, statutory or otherwise.  The term "warranty" used herein
+ #  includes, but is not limited to, any warranty of the quality,
+ #  performance, merchantability and fitness for a particular purpose of
+ #  the program and the nonexistence of any infringement or violation of
+ #  any right of any third party.
+ #
+ #  Each user of the program will agree and understand, and be deemed to
+ #  have agreed and understood, that there is no warranty whatsoever for
+ #  the program and, accordingly, the entire risk arising from or
+ #  otherwise connected with the program is assumed by the user.
+ #
+ #  Therefore, neither ICOT, the copyright holder, or any other
+ #  organization that participated in or was otherwise related to the
+ #  development of the program and their respective officials, directors,
+ #  officers and other employees shall be held liable for any and all
+ #  damages, including, without limitation, general, special, incidental
+ #  and consequential damages, arising out of or otherwise in connection
+ #  with the use or inability to use the program or any product, material
+ #  or result produced or otherwise obtained by using the program,
+ #  regardless of whether they have been advised of, or otherwise had
+ #  knowledge of, the possibility of such damages at any time during the
+ #  project or thereafter.  Each user will be deemed to have agreed to the
+ #  foregoing by his or her commencement of use of the program.  The term
+ #  "use" as used herein includes, but is not limited to, the use,
+ #  modification, copying and distribution of the program and the
+ #  production of secondary products from the program.
+ #
+ #  In the case where the program, whether in its original form or
+ #  modified, was distributed or delivered to or received by a user from
+ #  any person, organization or entity other than ICOT, unless it makes or
+ #  grants independently of ICOT any specific warranty to the user in
+ #  writing, such person, organization or entity, will also be exempted
+ #  from and not be held liable to the user for any such damages as noted
+ #  above as far as the program is concerned.
+ #
+ #  ---------------COPYING.ipadic-----END----------------------------------
+
+----------------------------------------------------------------------
+
+Lao Word Break Dictionary Data (laodict.txt)
+
+ # Copyright (C) 2016 and later: Unicode, Inc. and others.
+ # License & terms of use: http://www.unicode.org/copyright.html
+ # Copyright (c) 2015 International Business Machines Corporation
+ # and others. All Rights Reserved.
+ #
+ # Project: https://github.com/rober42539/lao-dictionary
+ # Dictionary: https://github.com/rober42539/lao-dictionary/laodict.txt
+ # License: https://github.com/rober42539/lao-dictionary/LICENSE.txt
+ #          (copied below)
+ #
+ # This file is derived from the above dictionary version of Nov 22, 2020
+ #  ----------------------------------------------------------------------
+ #  Copyright (C) 2013 Brian Eugene Wilson, Robert Martin Campbell.
+ #  All rights reserved.
+ #
+ #  Redistribution and use in source and binary forms, with or without
+ #  modification, are permitted provided that the following conditions are met:
+ #
+ #  Redistributions of source code must retain the above copyright notice, this
+ #  list of conditions and the following disclaimer. Redistributions in binary
+ #  form must reproduce the above copyright notice, this list of conditions and
+ #  the following disclaimer in the documentation and/or other materials
+ #  provided with the distribution.
+ #
+ # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ # FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ # COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
+ # INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ # (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ # HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ # STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ # OF THE POSSIBILITY OF SUCH DAMAGE.
+ #  --------------------------------------------------------------------------
+
+----------------------------------------------------------------------
+
+Burmese Word Break Dictionary Data (burmesedict.txt)
+
+ #  Copyright (c) 2014 International Business Machines Corporation
+ #  and others. All Rights Reserved.
+ #
+ #  This list is part of a project hosted at:
+ #    github.com/kanyawtech/myanmar-karen-word-lists
+ #
+ #  --------------------------------------------------------------------------
+ #  Copyright (c) 2013, LeRoy Benjamin Sharon
+ #  All rights reserved.
+ #
+ #  Redistribution and use in source and binary forms, with or without
+ #  modification, are permitted provided that the following conditions
+ #  are met: Redistributions of source code must retain the above
+ #  copyright notice, this list of conditions and the following
+ #  disclaimer.  Redistributions in binary form must reproduce the
+ #  above copyright notice, this list of conditions and the following
+ #  disclaimer in the documentation and/or other materials provided
+ #  with the distribution.
+ #
+ #    Neither the name Myanmar Karen Word Lists, nor the names of its
+ #    contributors may be used to endorse or promote products derived
+ #    from this software without specific prior written permission.
+ #
+ #  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
+ #  CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
+ #  INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ #  MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ #  DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS
+ #  BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ #  EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+ #  TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ #  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ #  ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
+ #  TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF
+ #  THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ #  SUCH DAMAGE.
+ #  --------------------------------------------------------------------------
+
+----------------------------------------------------------------------
+
+Time Zone Database
+
+  ICU uses the public domain data and code derived from Time Zone
+Database for its time zone support. The ownership of the TZ database
+is explained in BCP 175: Procedure for Maintaining the Time Zone
+Database section 7.
+
+ # 7.  Database Ownership
+ #
+ #    The TZ database itself is not an IETF Contribution or an IETF
+ #    document.  Rather it is a pre-existing and regularly updated work
+ #    that is in the public domain, and is intended to remain in the
+ #    public domain.  Therefore, BCPs 78 [RFC5378] and 79 [RFC3979] do
+ #    not apply to the TZ Database or contributions that individuals make
+ #    to it.  Should any claims be made and substantiated against the TZ
+ #    Database, the organization that is providing the IANA
+ #    Considerations defined in this RFC, under the memorandum of
+ #    understanding with the IETF, currently ICANN, may act in accordance
+ #    with all competent court orders.  No ownership claims will be made
+ #    by ICANN or the IETF Trust on the database or the code.  Any person
+ #    making a contribution to the database or code waives all rights to
+ #    future claims in that contribution or in the TZ Database.
+
+----------------------------------------------------------------------
+
+punycode.cpp:
+
+Disclaimer and license
+
+    Regarding this entire document or any portion of it (including
+    the pseudocode and C code), the author makes no guarantees and
+    is not responsible for any damage resulting from its use.  The
+    author grants irrevocable permission to anyone to use, modify,
+    and distribute it in any way that does not diminish the rights
+    of anyone else to use, modify, and distribute it, provided that
+    redistributed derivative works do not contain misleading author or
+    version information.  Derivative works need not be licensed under
+    similar terms.
+
+----------------------------------------------------------------------
+
+Google double-conversion
+
+Copyright 2006-2011, the V8 project authors. All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above
+      copyright notice, this list of conditions and the following
+      disclaimer in the documentation and/or other materials provided
+      with the distribution.
+    * Neither the name of Google Inc. nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+--------------------------------------------------------------------------------
+
+--------------------------------------------------------------------------------
+== libjpeg_turbo
+
+For a summary of these license terms for the main libjpeg-turbo code, see
+LICENSE.md.
+
+libjpeg-turbo license
+---------------------
+    This license covers the TurboJPEG API library and associated programs.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+- Redistributions of source code must retain the above copyright notice,
+  this list of conditions and the following disclaimer.
+- Redistributions in binary form must reproduce the above copyright notice,
+  this list of conditions and the following disclaimer in the documentation
+  and/or other materials provided with the distribution.
+- Neither the name of the libjpeg-turbo Project nor the names of its
+  contributors may be used to endorse or promote products derived from this
+  software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS",
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+
+
+libjpeg license, Independent JPEG Group
+---------------------------------------
+    This license applies to the libjpeg API library and associated programs
+    (any code inherited from libjpeg, and any modifications to that code.)
+
+The authors make NO WARRANTY or representation, either express or implied,
+with respect to this software, its quality, accuracy, merchantability, or
+fitness for a particular purpose.  This software is provided "AS IS", and you,
+its user, assume the entire risk as to its quality and accuracy.
+
+This software is copyright (C) 1991-2016, Thomas G. Lane, Guido Vollbeding.
+All Rights Reserved except as specified below.
+
+Permission is hereby granted to use, copy, modify, and distribute this
+software (or portions thereof) for any purpose, without fee, subject to these
+conditions:
+(1) If any part of the source code for this software is distributed, then this
+README file must be included, with this copyright and no-warranty notice
+unaltered; and any additions, deletions, or changes to the original files
+must be clearly indicated in accompanying documentation.
+(2) If only executable code is distributed, then the accompanying
+documentation must state that "this software is based in part on the work of
+the Independent JPEG Group".
+(3) Permission for use of this software is granted only if the user accepts
+full responsibility for any undesirable consequences; the authors accept
+NO LIABILITY for damages of any kind.
+
+These conditions apply to any software derived from or based on the IJG code,
+not just to the unmodified library.  If you use our work, you ought to
+acknowledge us.
+
+Permission is NOT granted for the use of any IJG author's name or company name
+in advertising or publicity relating to this software or products derived from
+it.  This software may be referred to only as "the Independent JPEG Group's
+software".
+
+We specifically permit and encourage the use of this software as the basis of
+commercial products, provided that all warranty or liability claims are
+assumed by the product vendor.
+
+
+The Unix configuration script "configure" was produced with GNU Autoconf.
+It is copyright by the Free Software Foundation but is freely distributable.
+The same holds for its supporting scripts (config.guess, config.sub,
+ltmain.sh).  Another support script, install-sh, is copyright by X Consortium
+but is also freely distributable.
+
+The IJG distribution formerly included code to read and write GIF files.
+To avoid entanglement with the Unisys LZW patent (now expired), GIF reading
+support has been removed altogether, and the GIF writer has been simplified
+to produce "uncompressed GIFs".  This technique does not use the LZW
+algorithm; the resulting GIF files are larger than usual, but are readable
+by all standard GIF decoders.
+
+We are required to state that
+    "The Graphics Interchange Format(c) is the Copyright property of
+    CompuServe Incorporated.  GIF(sm) is a Service Mark property of
+    CompuServe Incorporated."
+
+
+zlib License
+------------
+    This license is a subset of the other two, and it covers the libjpeg-turbo
+    SIMD extensions.
+
+This software is provided 'as-is', without any express or implied
+warranty.  In no event will the authors be held liable for any damages
+arising from the use of this software.
+
+Permission is granted to anyone to use this software for any purpose,
+including commercial applications, and to alter it and redistribute it
+freely, subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not
+   claim that you wrote the original software. If you use this software
+   in a product, an acknowledgment in the product documentation would be
+   appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be
+   misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+
+
+"THE BEER-WARE LICENSE" (Revision 42)
+-------------------------------------
+    This license covers //third_party/libjpeg_turbo/src/md5/md5hl.c.
+
+<phk@FreeBSD.org> wrote this file.  As long as you retain this notice you
+can do whatever you want with this stuff. If we meet some day, and you think
+this stuff is worth it, you can buy me a beer in return.   Poul-Henning Kamp
+
+
+MIT License
+-----------
+    This license covers //third_party/libjpeg_turbo/src/doc/html/dynsections.js.
+
+MIT License Copyright (c) 1997-2020 by Dimitri van Heesch
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is furnished
+to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice (including the next
+paragraph) shall be included in all copies or substantial portions of the
+Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
+FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS
+OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF
+OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+--------------------------------------------------------------------------------
+
+--------------------------------------------------------------------------------
+== llvm
+
+Copied from llvm-project/llvm/LICENSE.TXT:
+==============================================================================
+The LLVM Project is under the Apache License v2.0 with LLVM Exceptions:
+==============================================================================
+
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+    1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+    2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+    3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+    4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+    5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+    6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+    7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+    8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+    9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+    END OF TERMS AND CONDITIONS
+
+    APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+    Copyright [yyyy] [name of copyright owner]
+
+    Licensed under the Apache License, Version 2.0 (the "License");
+    you may not use this file except in compliance with the License.
+    You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software
+    distributed under the License is distributed on an "AS IS" BASIS,
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and
+    limitations under the License.
+
+
+---- LLVM Exceptions to the Apache 2.0 License ----
+
+As an exception, if, as a result of your compiling your source code, portions
+of this Software are embedded into an Object form of such source code, you
+may redistribute such embedded portions in such Object form without complying
+with the conditions of Sections 4(a), 4(b) and 4(d) of the License.
+
+In addition, if you combine or link compiled forms of this Software with
+software that is licensed under the GPLv2 ("Combined Software") and if a
+court of competent jurisdiction determines that the patent provision (Section
+3), the indemnity provision (Section 9) or other Section of the License
+conflicts with the conditions of the GPLv2, you may retroactively and
+prospectively choose to deem waived or otherwise exclude such Section(s) of
+the License, but only in their entirety and only with respect to the Combined
+Software.
+
+==============================================================================
+Software from third parties included in the LLVM Project:
+==============================================================================
+The LLVM Project contains third party software which is under different license
+terms. All such code will be identified clearly using at least one of two
+mechanisms:
+1) It will be in a separate directory tree with its own `LICENSE.txt` or
+   `LICENSE` file at the top containing the specific license and restrictions
+   which apply to that software, or
+2) It will contain specific license and restriction terms at the top of every
+   file.
+
+==============================================================================
+Legacy LLVM License (https://llvm.org/docs/DeveloperPolicy.html#legacy):
+==============================================================================
+University of Illinois/NCSA
+Open Source License
+
+Copyright (c) 2003-2019 University of Illinois at Urbana-Champaign.
+All rights reserved.
+
+Developed by:
+
+    LLVM Team
+
+    University of Illinois at Urbana-Champaign
+
+    http://llvm.org
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of
+this software and associated documentation files (the "Software"), to deal with
+the Software without restriction, including without limitation the rights to
+use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
+of the Software, and to permit persons to whom the Software is furnished to do
+so, subject to the following conditions:
+
+    * Redistributions of source code must retain the above copyright notice,
+      this list of conditions and the following disclaimers.
+
+    * Redistributions in binary form must reproduce the above copyright notice,
+      this list of conditions and the following disclaimers in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the names of the LLVM Team, University of Illinois at
+      Urbana-Champaign, nor the names of its contributors may be used to
+      endorse or promote products derived from this Software without specific
+      prior written permission.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
+FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
+SOFTWARE.
+
+==============================================================================
+==============================================================================
+Copied from llvm-project/libcxx/src/include/ryu/common.h:
+
+Boost Software License - Version 1.0 - August 17th, 2003
+
+Permission is hereby granted, free of charge, to any person or organization
+obtaining a copy of the software and accompanying documentation covered by
+this license (the "Software") to use, reproduce, display, distribute,
+execute, and transmit the Software, and to prepare derivative works of the
+Software, and to permit third-parties to whom the Software is furnished to
+do so, all subject to the following:
+
+The copyright notices in the Software and this entire statement, including
+the above license grant, this restriction and the following disclaimer,
+must be included in all copies of the Software, in whole or in part, and
+all derivative works of the Software, unless such copies or derivative
+works are solely in the form of machine-executable object code generated by
+a source language processor.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
+SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
+FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
+ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE.
+
+==============================================================================
+==============================================================================
+Copied from llvm-project/llvm/utils/unittest/googletest/LICENSE.TXT and
+llvm-project/llvm/utils/unittest/googlemock/LICENSE.txt:
+
+Copyright 2008, Google Inc.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+    * Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above
+copyright notice, this list of conditions and the following disclaimer
+in the documentation and/or other materials provided with the
+distribution.
+    * Neither the name of Google Inc. nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+==============================================================================
+==============================================================================
+Copied from llvm-project/llvm/lib/Support/COPYRIGHT.regex:
+$OpenBSD: COPYRIGHT,v 1.3 2003/06/02 20:18:36 millert Exp $
+
+Copyright 1992, 1993, 1994 Henry Spencer.  All rights reserved.
+This software is not subject to any license of the American Telephone
+and Telegraph Company or of the Regents of the University of California.
+
+Permission is granted to anyone to use this software for any purpose on
+any computer system, and to alter it and redistribute it, subject
+to the following restrictions:
+
+1. The author is not responsible for the consequences of use of this
+   software, no matter how awful, even if they arise from flaws in it.
+
+2. The origin of this software must not be misrepresented, either by
+   explicit claim or by omission.  Since few users ever read sources,
+   credits must appear in the documentation.
+
+3. Altered versions must be plainly marked as such, and must not be
+   misrepresented as being the original software.  Since few users
+   ever read sources, credits must appear in the documentation.
+
+4. This notice may not be removed or altered.
+
+=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
+/*-
+ * Copyright (c) 1994
+ *	The Regents of the University of California.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ *	@(#)COPYRIGHT	8.1 (Berkeley) 3/16/94
+ */
+
+==============================================================================
+==============================================================================
+License for third_party/llvm/llvm-project/llvm/cmake/config.guess:
+
+                    GNU GENERAL PUBLIC LICENSE
+                       Version 2, June 1991
+
+ Copyright (C) 1989, 1991 Free Software Foundation, Inc.,
+ 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+                            Preamble
+
+  The licenses for most software are designed to take away your
+freedom to share and change it.  By contrast, the GNU General Public
+License is intended to guarantee your freedom to share and change free
+software--to make sure the software is free for all its users.  This
+General Public License applies to most of the Free Software
+Foundation's software and to any other program whose authors commit to
+using it.  (Some other Free Software Foundation software is covered by
+the GNU Lesser General Public License instead.)  You can apply it to
+your programs, too.
+
+  When we speak of free software, we are referring to freedom, not
+price.  Our General Public Licenses are designed to make sure that you
+have the freedom to distribute copies of free software (and charge for
+this service if you wish), that you receive source code or can get it
+if you want it, that you can change the software or use pieces of it
+in new free programs; and that you know you can do these things.
+
+  To protect your rights, we need to make restrictions that forbid
+anyone to deny you these rights or to ask you to surrender the rights.
+These restrictions translate to certain responsibilities for you if you
+distribute copies of the software, or if you modify it.
+
+  For example, if you distribute copies of such a program, whether
+gratis or for a fee, you must give the recipients all the rights that
+you have.  You must make sure that they, too, receive or can get the
+source code.  And you must show them these terms so they know their
+rights.
+
+  We protect your rights with two steps: (1) copyright the software, and
+(2) offer you this license which gives you legal permission to copy,
+distribute and/or modify the software.
+
+  Also, for each author's protection and ours, we want to make certain
+that everyone understands that there is no warranty for this free
+software.  If the software is modified by someone else and passed on, we
+want its recipients to know that what they have is not the original, so
+that any problems introduced by others will not reflect on the original
+authors' reputations.
+
+  Finally, any free program is threatened constantly by software
+patents.  We wish to avoid the danger that redistributors of a free
+program will individually obtain patent licenses, in effect making the
+program proprietary.  To prevent this, we have made it clear that any
+patent must be licensed for everyone's free use or not licensed at all.
+
+  The precise terms and conditions for copying, distribution and
+modification follow.
+
+   TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
+
+  0. This License applies to any program or other work which contains
+a notice placed by the copyright holder saying it may be distributed
+under the terms of this General Public License.  The "Program", below,
+refers to any such program or work, and a "work based on the Program"
+means either the Program or any derivative work under copyright law:
+that is to say, a work containing the Program or a portion of it,
+either verbatim or with modifications and/or translated into another
+language.  (Hereinafter, translation is included without limitation in
+the term "modification".)  Each licensee is addressed as "you".
+
+Activities other than copying, distribution and modification are not
+covered by this License; they are outside its scope.  The act of
+running the Program is not restricted, and the output from the Program
+is covered only if its contents constitute a work based on the
+Program (independent of having been made by running the Program).
+Whether that is true depends on what the Program does.
+
+  1. You may copy and distribute verbatim copies of the Program's
+source code as you receive it, in any medium, provided that you
+conspicuously and appropriately publish on each copy an appropriate
+copyright notice and disclaimer of warranty; keep intact all the
+notices that refer to this License and to the absence of any warranty;
+and give any other recipients of the Program a copy of this License
+along with the Program.
+
+You may charge a fee for the physical act of transferring a copy, and
+you may at your option offer warranty protection in exchange for a fee.
+
+  2. You may modify your copy or copies of the Program or any portion
+of it, thus forming a work based on the Program, and copy and
+distribute such modifications or work under the terms of Section 1
+above, provided that you also meet all of these conditions:
+
+    a) You must cause the modified files to carry prominent notices
+    stating that you changed the files and the date of any change.
+
+    b) You must cause any work that you distribute or publish, that in
+    whole or in part contains or is derived from the Program or any
+    part thereof, to be licensed as a whole at no charge to all third
+    parties under the terms of this License.
+
+    c) If the modified program normally reads commands interactively
+    when run, you must cause it, when started running for such
+    interactive use in the most ordinary way, to print or display an
+    announcement including an appropriate copyright notice and a
+    notice that there is no warranty (or else, saying that you provide
+    a warranty) and that users may redistribute the program under
+    these conditions, and telling the user how to view a copy of this
+    License.  (Exception: if the Program itself is interactive but
+    does not normally print such an announcement, your work based on
+    the Program is not required to print an announcement.)
+
+These requirements apply to the modified work as a whole.  If
+identifiable sections of that work are not derived from the Program,
+and can be reasonably considered independent and separate works in
+themselves, then this License, and its terms, do not apply to those
+sections when you distribute them as separate works.  But when you
+distribute the same sections as part of a whole which is a work based
+on the Program, the distribution of the whole must be on the terms of
+this License, whose permissions for other licensees extend to the
+entire whole, and thus to each and every part regardless of who wrote it.
+
+Thus, it is not the intent of this section to claim rights or contest
+your rights to work written entirely by you; rather, the intent is to
+exercise the right to control the distribution of derivative or
+collective works based on the Program.
+
+In addition, mere aggregation of another work not based on the Program
+with the Program (or with a work based on the Program) on a volume of
+a storage or distribution medium does not bring the other work under
+the scope of this License.
+
+  3. You may copy and distribute the Program (or a work based on it,
+under Section 2) in object code or executable form under the terms of
+Sections 1 and 2 above provided that you also do one of the following:
+
+    a) Accompany it with the complete corresponding machine-readable
+    source code, which must be distributed under the terms of Sections
+    1 and 2 above on a medium customarily used for software interchange; or,
+
+    b) Accompany it with a written offer, valid for at least three
+    years, to give any third party, for a charge no more than your
+    cost of physically performing source distribution, a complete
+    machine-readable copy of the corresponding source code, to be
+    distributed under the terms of Sections 1 and 2 above on a medium
+    customarily used for software interchange; or,
+
+    c) Accompany it with the information you received as to the offer
+    to distribute corresponding source code.  (This alternative is
+    allowed only for noncommercial distribution and only if you
+    received the program in object code or executable form with such
+    an offer, in accord with Subsection b above.)
+
+The source code for a work means the preferred form of the work for
+making modifications to it.  For an executable work, complete source
+code means all the source code for all modules it contains, plus any
+associated interface definition files, plus the scripts used to
+control compilation and installation of the executable.  However, as a
+special exception, the source code distributed need not include
+anything that is normally distributed (in either source or binary
+form) with the major components (compiler, kernel, and so on) of the
+operating system on which the executable runs, unless that component
+itself accompanies the executable.
+
+If distribution of executable or object code is made by offering
+access to copy from a designated place, then offering equivalent
+access to copy the source code from the same place counts as
+distribution of the source code, even though third parties are not
+compelled to copy the source along with the object code.
+
+  4. You may not copy, modify, sublicense, or distribute the Program
+except as expressly provided under this License.  Any attempt
+otherwise to copy, modify, sublicense or distribute the Program is
+void, and will automatically terminate your rights under this License.
+However, parties who have received copies, or rights, from you under
+this License will not have their licenses terminated so long as such
+parties remain in full compliance.
+
+  5. You are not required to accept this License, since you have not
+signed it.  However, nothing else grants you permission to modify or
+distribute the Program or its derivative works.  These actions are
+prohibited by law if you do not accept this License.  Therefore, by
+modifying or distributing the Program (or any work based on the
+Program), you indicate your acceptance of this License to do so, and
+all its terms and conditions for copying, distributing or modifying
+the Program or works based on it.
+
+  6. Each time you redistribute the Program (or any work based on the
+Program), the recipient automatically receives a license from the
+original licensor to copy, distribute or modify the Program subject to
+these terms and conditions.  You may not impose any further
+restrictions on the recipients' exercise of the rights granted herein.
+You are not responsible for enforcing compliance by third parties to
+this License.
+
+  7. If, as a consequence of a court judgment or allegation of patent
+infringement or for any other reason (not limited to patent issues),
+conditions are imposed on you (whether by court order, agreement or
+otherwise) that contradict the conditions of this License, they do not
+excuse you from the conditions of this License.  If you cannot
+distribute so as to satisfy simultaneously your obligations under this
+License and any other pertinent obligations, then as a consequence you
+may not distribute the Program at all.  For example, if a patent
+license would not permit royalty-free redistribution of the Program by
+all those who receive copies directly or indirectly through you, then
+the only way you could satisfy both it and this License would be to
+refrain entirely from distribution of the Program.
+
+If any portion of this section is held invalid or unenforceable under
+any particular circumstance, the balance of the section is intended to
+apply and the section as a whole is intended to apply in other
+circumstances.
+
+It is not the purpose of this section to induce you to infringe any
+patents or other property right claims or to contest validity of any
+such claims; this section has the sole purpose of protecting the
+integrity of the free software distribution system, which is
+implemented by public license practices.  Many people have made
+generous contributions to the wide range of software distributed
+through that system in reliance on consistent application of that
+system; it is up to the author/donor to decide if he or she is willing
+to distribute software through any other system and a licensee cannot
+impose that choice.
+
+This section is intended to make thoroughly clear what is believed to
+be a consequence of the rest of this License.
+
+  8. If the distribution and/or use of the Program is restricted in
+certain countries either by patents or by copyrighted interfaces, the
+original copyright holder who places the Program under this License
+may add an explicit geographical distribution limitation excluding
+those countries, so that distribution is permitted only in or among
+countries not thus excluded.  In such case, this License incorporates
+the limitation as if written in the body of this License.
+
+  9. The Free Software Foundation may publish revised and/or new versions
+of the General Public License from time to time.  Such new versions will
+be similar in spirit to the present version, but may differ in detail to
+address new problems or concerns.
+
+Each version is given a distinguishing version number.  If the Program
+specifies a version number of this License which applies to it and "any
+later version", you have the option of following the terms and conditions
+either of that version or of any later version published by the Free
+Software Foundation.  If the Program does not specify a version number of
+this License, you may choose any version ever published by the Free Software
+Foundation.
+
+  10. If you wish to incorporate parts of the Program into other free
+programs whose distribution conditions are different, write to the author
+to ask for permission.  For software which is copyrighted by the Free
+Software Foundation, write to the Free Software Foundation; we sometimes
+make exceptions for this.  Our decision will be guided by the two goals
+of preserving the free status of all derivatives of our free software and
+of promoting the sharing and reuse of software generally.
+
+                            NO WARRANTY
+
+  11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
+FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW.  EXCEPT WHEN
+OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES
+PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED
+OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.  THE ENTIRE RISK AS
+TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU.  SHOULD THE
+PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING,
+REPAIR OR CORRECTION.
+
+  12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
+WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR
+REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES,
+INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING
+OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED
+TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY
+YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER
+PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGES.
+
+Autoconf Exception
+
+As a special exception, the Free Software Foundation gives unlimited
+permission to copy, distribute and modify the configure scripts that are the
+output of Autoconf. You need not follow the terms of the GNU General Public
+License when using or distributing such scripts, even though portions of the
+text of Autoconf appear in them. The GNU General Public License (GPL) does
+govern all other use of the material that constitutes the Autoconf program.
+
+Certain portions of the Autoconf source text are designed to be copied (in
+certain cases, depending on the input) into the output of Autoconf. We call
+these the "data" portions. The rest of the Autoconf source text consists of
+comments plus executable code that decides which of the data portions to
+output in any given case. We call these comments and executable code the "non-
+data" portions. Autoconf never copies any of the non-data portions into its
+output.
+
+This special exception to the GPL applies to versions of Autoconf released by
+the Free Software Foundation. When you make and distribute a modified version
+of Autoconf, you may extend this special exception to the GPL to apply to your
+modified version as well, *unless* your modified version has the potential to
+copy into its output some of the text that was the non-data portion of the
+version that you started with. (In other words, unless your change moves or
+copies text from the non-data portions to the data portions.) If your
+modification has such potential, you must delete any notice of this special
+exception to the GPL from your modified version.
+
+                     END OF TERMS AND CONDITIONS
+
+
+==============================================================================
+==============================================================================                                                                                
+Copied from llvm-project/polly/lib/External/isl/LICENSE:
+
+MIT License (MIT)
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of
+this software and associated documentation files (the "Software"), to deal in
+the Software without restriction, including without limitation the rights to
+use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
+of the Software, and to permit persons to whom the Software is furnished to do
+so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
+==============================================================================
+==============================================================================                                                                                
+Copied from llvm-project/llgo/third_party/gotools/LICENSE:
+
+Copyright (c) 2009 The Go Authors. All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   * Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+   * Redistributions in binary form must reproduce the above
+copyright notice, this list of conditions and the following disclaimer
+in the documentation and/or other materials provided with the
+distribution.
+   * Neither the name of Google Inc. nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+==============================================================================
+==============================================================================                                                                                
+Copied from llvm-project/llgo/third_party/gofrontend/libffi/LICENSE:
+
+libffi - Copyright (c) 1996-2014  Anthony Green, Red Hat, Inc and others.
+See source files for details.
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+``Software''), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED ``AS IS'', WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+==============================================================================
+==============================================================================                                                                                
+Copied from llvm-project/lldb/third_party/Python/module/six/LICENSE:
+
+Copyright (c) 2010-2015 Benjamin Peterson
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of
+this software and associated documentation files (the "Software"), to deal in
+the Software without restriction, including without limitation the rights to
+use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
+the Software, and to permit persons to whom the Software is furnished to do so,
+subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
+FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
+IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+==============================================================================
+==============================================================================                                                                                
+Copied from llvm-project/lldb/third_party/Python/module/pexpect-4.6/LICENSE and
+llvm-project/lldb/third_party/Python/module/ptyprocess-0.6.0/LICENSE:
+
+ISC LICENSE
+
+    This license is approved by the OSI and FSF as GPL-compatible.
+        http://opensource.org/licenses/isc-license.txt
+
+    Copyright (c) 2013-2014, Pexpect development team
+    Copyright (c) 2012, Noah Spurrier <noah@noah.org>
+
+    Permission to use, copy, modify, and/or distribute this software for any
+    purpose with or without fee is hereby granted, provided that the above
+    copyright notice and this permission notice appear in all copies.
+    
+    THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+    WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+    MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+    ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+    WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+    ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+    OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+
+==============================================================================
+==============================================================================                                                                                
+Copied from
+llvm-project/clang-tools-extra/clangd/clients/clangd-vscode/LICENSE:
+
+The MIT License (MIT)
+
+Copyright (c) 2019 The LLVM Developers
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
+==============================================================================
+==============================================================================                                                                                
+Copied from llvm-project/llvm/include/llvm/Support/LICENSE.TXT:
+
+LLVM System Interface Library
+-------------------------------------------------------------------------------
+The LLVM System Interface Library is licensed under the Illinois Open Source
+License and has the following additional copyright:
+
+Copyright (C) 2004 eXtensible Systems, Inc.
+
+==============================================================================
+==============================================================================                                                                                
+Copied from llvm-project/llvm/test/YAMLParser/LICENSE.txt:
+
+Copyright (c) 2006 Kirill Simonov
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of
+this software and associated documentation files (the "Software"), to deal in
+the Software without restriction, including without limitation the rights to
+use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
+of the Software, and to permit persons to whom the Software is furnished to do
+so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
+==============================================================================
+==============================================================================                                                                                
+Copied from llvm-project/clang-tools-extra/clang-tidy/cert/LICENSE.TXT:
+
+------------------------------------------------------------------------------
+clang-tidy CERT Files
+------------------------------------------------------------------------------
+All clang-tidy files are licensed under the same terms as the rest of the LLVM
+project with the following additions:
+
+Any file referencing a CERT Secure Coding guideline:
+Please allow this letter to serve as confirmation that open source projects on
+http://llvm.org are permitted to link via hypertext to the CERT(R) secure coding
+guidelines available at https://www.securecoding.cert.org.
+
+The foregoing is permitted by the Terms of Use as follows:
+"Linking to the Service
+Because we update many of our Web documents regularly, we would prefer that you
+link to our Web pages whenever possible rather than reproduce them. It is not
+necessary to request permission to make referential hypertext links to The
+Service."
+http://www.sei.cmu.edu/legal/ip/index.cfm.
+
+Please allow this letter to also confirm that no formal permission is required
+to reproduce the title of the content being linked to, nor to reproduce any
+de Minimis description of such content.
+
+==============================================================================
+==============================================================================                                                                                
+Copied from llvm-project/clang-tools-extra/clang-tidy/hicpp/LICENSE.TXT:
+
+------------------------------------------------------------------------------
+clang-tidy High-Integrity C++ Files
+------------------------------------------------------------------------------
+All clang-tidy files are licensed under the same terms as the rest of the LLVM
+project with the following additions:
+
+Any file referencing a High-Integrity C++ Coding guideline:
+
+HIC++ Coding Standard as created by PRQA.
+
+Please see http://www.codingstandard.com/section/conditions-of-use/ for more
+information.
+
+==============================================================================
+==============================================================================
+Copied from llvm-project/polly/lib/External/isl/interface/cpp.cc:
+
+Copyright 2016, 2017 Tobias Grosser. All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+   1. Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above
+      copyright notice, this list of conditions and the following
+      disclaimer in the documentation and/or other materials provided
+      with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY TOBIAS GROSSER ''AS IS'' AND ANY
+EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SVEN VERDOOLAEGE OR
+CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
+OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+The views and conclusions contained in the software and documentation
+are those of the authors and should not be interpreted as
+representing official policies, either expressed or implied, of
+Tobias Grosser.
+
+==============================================================================
+==============================================================================
+Copied from llvm-project/llvm/lib/Support/BLAKE3/LICENSE:
+
+Creative Commons Legal Code
+
+CC0 1.0 Universal
+
+    CREATIVE COMMONS CORPORATION IS NOT A LAW FIRM AND DOES NOT PROVIDE
+    LEGAL SERVICES. DISTRIBUTION OF THIS DOCUMENT DOES NOT CREATE AN
+    ATTORNEY-CLIENT RELATIONSHIP. CREATIVE COMMONS PROVIDES THIS
+    INFORMATION ON AN "AS-IS" BASIS. CREATIVE COMMONS MAKES NO WARRANTIES
+    REGARDING THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS
+    PROVIDED HEREUNDER, AND DISCLAIMS LIABILITY FOR DAMAGES RESULTING FROM
+    THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS PROVIDED
+    HEREUNDER.
+
+Statement of Purpose
+
+The laws of most jurisdictions throughout the world automatically confer
+exclusive Copyright and Related Rights (defined below) upon the creator
+and subsequent owner(s) (each and all, an "owner") of an original work of
+authorship and/or a database (each, a "Work").
+
+Certain owners wish to permanently relinquish those rights to a Work for
+the purpose of contributing to a commons of creative, cultural and
+scientific works ("Commons") that the public can reliably and without fear
+of later claims of infringement build upon, modify, incorporate in other
+works, reuse and redistribute as freely as possible in any form whatsoever
+and for any purposes, including without limitation commercial purposes.
+These owners may contribute to the Commons to promote the ideal of a free
+culture and the further production of creative, cultural and scientific
+works, or to gain reputation or greater distribution for their Work in
+part through the use and efforts of others.
+
+For these and/or other purposes and motivations, and without any
+expectation of additional consideration or compensation, the person
+associating CC0 with a Work (the "Affirmer"), to the extent that he or she
+is an owner of Copyright and Related Rights in the Work, voluntarily
+elects to apply CC0 to the Work and publicly distribute the Work under its
+terms, with knowledge of his or her Copyright and Related Rights in the
+Work and the meaning and intended legal effect of CC0 on those rights.
+
+1. Copyright and Related Rights. A Work made available under CC0 may be
+protected by copyright and related or neighboring rights ("Copyright and
+Related Rights"). Copyright and Related Rights include, but are not
+limited to, the following:
+
+  i. the right to reproduce, adapt, distribute, perform, display,
+     communicate, and translate a Work;
+ ii. moral rights retained by the original author(s) and/or performer(s);
+iii. publicity and privacy rights pertaining to a person's image or
+     likeness depicted in a Work;
+ iv. rights protecting against unfair competition in regards to a Work,
+     subject to the limitations in paragraph 4(a), below;
+  v. rights protecting the extraction, dissemination, use and reuse of data
+     in a Work;
+ vi. database rights (such as those arising under Directive 96/9/EC of the
+     European Parliament and of the Council of 11 March 1996 on the legal
+     protection of databases, and under any national implementation
+     thereof, including any amended or successor version of such
+     directive); and
+vii. other similar, equivalent or corresponding rights throughout the
+     world based on applicable law or treaty, and any national
+     implementations thereof.
+
+2. Waiver. To the greatest extent permitted by, but not in contravention
+of, applicable law, Affirmer hereby overtly, fully, permanently,
+irrevocably and unconditionally waives, abandons, and surrenders all of
+Affirmer's Copyright and Related Rights and associated claims and causes
+of action, whether now known or unknown (including existing as well as
+future claims and causes of action), in the Work (i) in all territories
+worldwide, (ii) for the maximum duration provided by applicable law or
+treaty (including future time extensions), (iii) in any current or future
+medium and for any number of copies, and (iv) for any purpose whatsoever,
+including without limitation commercial, advertising or promotional
+purposes (the "Waiver"). Affirmer makes the Waiver for the benefit of each
+member of the public at large and to the detriment of Affirmer's heirs and
+successors, fully intending that such Waiver shall not be subject to
+revocation, rescission, cancellation, termination, or any other legal or
+equitable action to disrupt the quiet enjoyment of the Work by the public
+as contemplated by Affirmer's express Statement of Purpose.
+
+3. Public License Fallback. Should any part of the Waiver for any reason
+be judged legally invalid or ineffective under applicable law, then the
+Waiver shall be preserved to the maximum extent permitted taking into
+account Affirmer's express Statement of Purpose. In addition, to the
+extent the Waiver is so judged Affirmer hereby grants to each affected
+person a royalty-free, non transferable, non sublicensable, non exclusive,
+irrevocable and unconditional license to exercise Affirmer's Copyright and
+Related Rights in the Work (i) in all territories worldwide, (ii) for the
+maximum duration provided by applicable law or treaty (including future
+time extensions), (iii) in any current or future medium and for any number
+of copies, and (iv) for any purpose whatsoever, including without
+limitation commercial, advertising or promotional purposes (the
+"License"). The License shall be deemed effective as of the date CC0 was
+applied by Affirmer to the Work. Should any part of the License for any
+reason be judged legally invalid or ineffective under applicable law, such
+partial invalidity or ineffectiveness shall not invalidate the remainder
+of the License, and in such case Affirmer hereby affirms that he or she
+will not (i) exercise any of his or her remaining Copyright and Related
+Rights in the Work or (ii) assert any associated claims and causes of
+action with respect to the Work, in either case contrary to Affirmer's
+express Statement of Purpose.
+
+4. Limitations and Disclaimers.
+
+ a. No trademark or patent rights held by Affirmer are waived, abandoned,
+    surrendered, licensed or otherwise affected by this document.
+ b. Affirmer offers the Work as-is and makes no representations or
+    warranties of any kind concerning the Work, express, implied,
+    statutory or otherwise, including without limitation warranties of
+    title, merchantability, fitness for a particular purpose, non
+    infringement, or the absence of latent or other defects, accuracy, or
+    the present or absence of errors, whether or not discoverable, all to
+    the greatest extent permissible under applicable law.
+ c. Affirmer disclaims responsibility for clearing rights of other persons
+    that may apply to the Work or any use thereof, including without
+    limitation any person's Copyright and Related Rights in the Work.
+    Further, Affirmer disclaims responsibility for obtaining any necessary
+    consents, permissions or other rights required for any use of the
+    Work.
+ d. Affirmer understands and acknowledges that Creative Commons is not a
+    party to this document and has no duty or obligation with respect to
+    this CC0 or use of the Work.
+
+
+==============================================================================
+==============================================================================
+Copied from llvm-project/llvm/lib/Support/UnicodeNameToCodepointGenerated.cpp:
+
+UNICODE, INC. LICENSE AGREEMENT - DATA FILES AND SOFTWARE
+
+See Terms of Use <https://www.unicode.org/copyright.html>
+for definitions of Unicode Inc.’s Data Files and Software.
+
+NOTICE TO USER: Carefully read the following legal agreement.
+BY DOWNLOADING, INSTALLING, COPYING OR OTHERWISE USING UNICODE INC.'S
+DATA FILES ("DATA FILES"), AND/OR SOFTWARE ("SOFTWARE"),
+YOU UNEQUIVOCALLY ACCEPT, AND AGREE TO BE BOUND BY, ALL OF THE
+TERMS AND CONDITIONS OF THIS AGREEMENT.
+IF YOU DO NOT AGREE, DO NOT DOWNLOAD, INSTALL, COPY, DISTRIBUTE OR USE
+THE DATA FILES OR SOFTWARE.
+
+COPYRIGHT AND PERMISSION NOTICE
+
+Copyright © 1991-2022 Unicode, Inc. All rights reserved.
+Distributed under the Terms of Use in https://www.unicode.org/copyright.html.
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of the Unicode data files and any associated documentation
+(the "Data Files") or Unicode software and any associated documentation
+(the "Software") to deal in the Data Files or Software
+without restriction, including without limitation the rights to use,
+copy, modify, merge, publish, distribute, and/or sell copies of
+the Data Files or Software, and to permit persons to whom the Data Files
+or Software are furnished to do so, provided that either
+(a) this copyright and permission notice appear with all copies
+of the Data Files or Software, or
+(b) this copyright and permission notice appear in associated
+Documentation.
+
+THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF
+ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT OF THIRD PARTY RIGHTS.
+IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS
+NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL
+DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
+DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
+TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
+PERFORMANCE OF THE DATA FILES OR SOFTWARE.
+
+Except as contained in this notice, the name of a copyright holder
+shall not be used in advertising or otherwise to promote the sale,
+use or other dealings in these Data Files or Software without prior
+written authorization of the copyright holder.
+
+
+==============================================================================
+==============================================================================
+Copied from llvm-project/llvm/include/llvm/Support/ConvertUTF.h:
+
+Copyright © 1991-2015 Unicode, Inc. All rights reserved.
+Distributed under the Terms of Use in
+http://www.unicode.org/copyright.html.
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of the Unicode data files and any associated documentation
+(the "Data Files") or Unicode software and any associated documentation
+(the "Software") to deal in the Data Files or Software
+without restriction, including without limitation the rights to use,
+copy, modify, merge, publish, distribute, and/or sell copies of
+the Data Files or Software, and to permit persons to whom the Data Files
+or Software are furnished to do so, provided that
+(a) this copyright and permission notice appear with all copies
+of the Data Files or Software,
+(b) this copyright and permission notice appear in associated
+documentation, and
+(c) there is clear notice in each modified Data File or in the Software
+as well as in the documentation associated with the Data File(s) or
+Software that the data or software has been modified.
+
+THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF
+ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT OF THIRD PARTY RIGHTS.
+IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS
+NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL
+DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
+DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
+TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
+PERFORMANCE OF THE DATA FILES OR SOFTWARE.
+
+Except as contained in this notice, the name of a copyright holder
+shall not be used in advertising or otherwise to promote the sale,
+use or other dealings in these Data Files or Software without prior
+written authorization of the copyright holder.
+
+
+==============================================================================
+==============================================================================
+Copied from docker_kokoro/dockerfiles/scripts/google_packages/deb_packages/copyright:
+
+Files: libcxx/utils/google-benchmark/*
+License: Apache 2.0
+ 
+                                  Apache License
+                            Version 2.0, January 2004
+                         http://www.apache.org/licenses/
+ 
+    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+ 
+    1. Definitions.
+ 
+       "License" shall mean the terms and conditions for use, reproduction,
+       and distribution as defined by Sections 1 through 9 of this document.
+ 
+       "Licensor" shall mean the copyright owner or entity authorized by
+       the copyright owner that is granting the License.
+ 
+       "Legal Entity" shall mean the union of the acting entity and all
+       other entities that control, are controlled by, or are under common
+       control with that entity. For the purposes of this definition,
+       "control" means (i) the power, direct or indirect, to cause the
+       direction or management of such entity, whether by contract or
+       otherwise, or (ii) ownership of fifty percent (50%) or more of the
+       outstanding shares, or (iii) beneficial ownership of such entity.
+ 
+       "You" (or "Your") shall mean an individual or Legal Entity
+       exercising permissions granted by this License.
+ 
+       "Source" form shall mean the preferred form for making modifications,
+       including but not limited to software source code, documentation
+       source, and configuration files.
+ 
+       "Object" form shall mean any form resulting from mechanical
+       transformation or translation of a Source form, including but
+       not limited to compiled object code, generated documentation,
+       and conversions to other media types.
+ 
+       "Work" shall mean the work of authorship, whether in Source or
+       Object form, made available under the License, as indicated by a
+       copyright notice that is included in or attached to the work
+       (an example is provided in the Appendix below).
+ 
+       "Derivative Works" shall mean any work, whether in Source or Object
+       form, that is based on (or derived from) the Work and for which the
+       editorial revisions, annotations, elaborations, or other modifications
+       represent, as a whole, an original work of authorship. For the purposes
+       of this License, Derivative Works shall not include works that remain
+       separable from, or merely link (or bind by name) to the interfaces of,
+       the Work and Derivative Works thereof.
+ 
+       "Contribution" shall mean any work of authorship, including
+       the original version of the Work and any modifications or additions
+       to that Work or Derivative Works thereof, that is intentionally
+       submitted to Licensor for inclusion in the Work by the copyright owner
+       or by an individual or Legal Entity authorized to submit on behalf of
+       the copyright owner. For the purposes of this definition, "submitted"
+       means any form of electronic, verbal, or written communication sent
+       to the Licensor or its representatives, including but not limited to
+       communication on electronic mailing lists, source code control systems,
+       and issue tracking systems that are managed by, or on behalf of, the
+       Licensor for the purpose of discussing and improving the Work, but
+       excluding communication that is conspicuously marked or otherwise
+       designated in writing by the copyright owner as "Not a Contribution."
+ 
+       "Contributor" shall mean Licensor and any individual or Legal Entity
+       on behalf of whom a Contribution has been received by Licensor and
+       subsequently incorporated within the Work.
+ 
+    2. Grant of Copyright License. Subject to the terms and conditions of
+       this License, each Contributor hereby grants to You a perpetual,
+       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+       copyright license to reproduce, prepare Derivative Works of,
+       publicly display, publicly perform, sublicense, and distribute the
+       Work and such Derivative Works in Source or Object form.
+ 
+    3. Grant of Patent License. Subject to the terms and conditions of
+       this License, each Contributor hereby grants to You a perpetual,
+       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+       (except as stated in this section) patent license to make, have made,
+       use, offer to sell, sell, import, and otherwise transfer the Work,
+       where such license applies only to those patent claims licensable
+       by such Contributor that are necessarily infringed by their
+       Contribution(s) alone or by combination of their Contribution(s)
+       with the Work to which such Contribution(s) was submitted. If You
+       institute patent litigation against any entity (including a
+       cross-claim or counterclaim in a lawsuit) alleging that the Work
+       or a Contribution incorporated within the Work constitutes direct
+       or contributory patent infringement, then any patent licenses
+       granted to You under this License for that Work shall terminate
+       as of the date such litigation is filed.
+ 
+    4. Redistribution. You may reproduce and distribute copies of the
+       Work or Derivative Works thereof in any medium, with or without
+       modifications, and in Source or Object form, provided that You
+       meet the following conditions:
+ 
+       (a) You must give any other recipients of the Work or
+           Derivative Works a copy of this License; and
+ 
+       (b) You must cause any modified files to carry prominent notices
+           stating that You changed the files; and
+ 
+       (c) You must retain, in the Source form of any Derivative Works
+           that You distribute, all copyright, patent, trademark, and
+           attribution notices from the Source form of the Work,
+           excluding those notices that do not pertain to any part of
+           the Derivative Works; and
+ 
+       (d) If the Work includes a "NOTICE" text file as part of its
+           distribution, then any Derivative Works that You distribute must
+           include a readable copy of the attribution notices contained
+           within such NOTICE file, excluding those notices that do not
+           pertain to any part of the Derivative Works, in at least one
+           of the following places: within a NOTICE text file distributed
+           as part of the Derivative Works; within the Source form or
+           documentation, if provided along with the Derivative Works; or,
+           within a display generated by the Derivative Works, if and
+           wherever such third-party notices normally appear. The contents
+           of the NOTICE file are for informational purposes only and
+           do not modify the License. You may add Your own attribution
+           notices within Derivative Works that You distribute, alongside
+           or as an addendum to the NOTICE text from the Work, provided
+           that such additional attribution notices cannot be construed
+           as modifying the License.
+ 
+       You may add Your own copyright statement to Your modifications and
+       may provide additional or different license terms and conditions
+       for use, reproduction, or distribution of Your modifications, or
+       for any such Derivative Works as a whole, provided Your use,
+       reproduction, and distribution of the Work otherwise complies with
+       the conditions stated in this License.
+ 
+    5. Submission of Contributions. Unless You explicitly state otherwise,
+       any Contribution intentionally submitted for inclusion in the Work
+       by You to the Licensor shall be under the terms and conditions of
+       this License, without any additional terms or conditions.
+       Notwithstanding the above, nothing herein shall supersede or modify
+       the terms of any separate license agreement you may have executed
+       with Licensor regarding such Contributions.
+ 
+    6. Trademarks. This License does not grant permission to use the trade
+       names, trademarks, service marks, or product names of the Licensor,
+       except as required for reasonable and customary use in describing the
+       origin of the Work and reproducing the content of the NOTICE file.
+ 
+    7. Disclaimer of Warranty. Unless required by applicable law or
+       agreed to in writing, Licensor provides the Work (and each
+       Contributor provides its Contributions) on an "AS IS" BASIS,
+       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+       implied, including, without limitation, any warranties or conditions
+       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+       PARTICULAR PURPOSE. You are solely responsible for determining the
+       appropriateness of using or redistributing the Work and assume any
+       risks associated with Your exercise of permissions under this License.
+ 
+    8. Limitation of Liability. In no event and under no legal theory,
+       whether in tort (including negligence), contract, or otherwise,
+       unless required by applicable law (such as deliberate and grossly
+       negligent acts) or agreed to in writing, shall any Contributor be
+       liable to You for damages, including any direct, indirect, special,
+       incidental, or consequential damages of any character arising as a
+       result of this License or out of the use or inability to use the
+       Work (including but not limited to damages for loss of goodwill,
+       work stoppage, computer failure or malfunction, or any and all
+       other commercial damages or losses), even if such Contributor
+       has been advised of the possibility of such damages.
+ 
+    9. Accepting Warranty or Additional Liability. While redistributing
+       the Work or Derivative Works thereof, You may choose to offer,
+       and charge a fee for, acceptance of support, warranty, indemnity,
+       or other liability obligations and/or rights consistent with this
+       License. However, in accepting such obligations, You may act only
+       on Your own behalf and on Your sole responsibility, not on behalf
+       of any other Contributor, and only if You agree to indemnify,
+       defend, and hold each Contributor harmless for any liability
+       incurred by, or claims asserted against, such Contributor by reason
+       of your accepting any such warranty or additional liability.
+ 
+    END OF TERMS AND CONDITIONS
+ 
+    APPENDIX: How to apply the Apache License to your work.
+ 
+       To apply the Apache License to your work, attach the following
+       boilerplate notice, with the fields enclosed by brackets "[]"
+       replaced with your own identifying information. (Don't include
+       the brackets!)  The text should be enclosed in the appropriate
+       comment syntax for the file format. We also recommend that a
+       file or class name and description of purpose be included on the
+       same "printed page" as the copyright notice for easier
+       identification within third-party archives.
+ 
+    Copyright [yyyy] [name of copyright owner]
+ 
+    Licensed under the Apache License, Version 2.0 (the "License");
+    you may not use this file except in compliance with the License.
+    You may obtain a copy of the License at
+ 
+        http://www.apache.org/licenses/LICENSE-2.0
+ 
+    Unless required by applicable law or agreed to in writing, software
+    distributed under the License is distributed on an "AS IS" BASIS,
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and
+    limitations under the License.
+
+--------------------------------------------------------------------------------
+
+--------------------------------------------------------------------------------
+== ml_dtypes
+
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright [yyyy] [name of copyright owner]
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+
+--------------------------------------------------------------------------------
+
+--------------------------------------------------------------------------------
+== nccl
+
+ Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+  * Redistributions of source code must retain the above copyright
+    notice, this list of conditions and the following disclaimer.
+  * Redistributions in binary form must reproduce the above copyright
+    notice, this list of conditions and the following disclaimer in the
+    documentation and/or other materials provided with the distribution.
+  * Neither the name of NVIDIA CORPORATION, Lawrence Berkeley National
+    Laboratory, the U.S. Department of Energy, nor the names of their
+    contributors may be used to endorse or promote products derived
+    from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+ The U.S. Department of Energy funded the development of this software
+ under subcontract 7078610 with Lawrence Berkeley National Laboratory.
+
+
+This code also includes files from the NVIDIA Tools Extension SDK project.
+
+See:
+
+   https://github.com/NVIDIA/NVTX
+
+for more information and license details.
+
+================================================================================
+NVTX LICENSE.txt - verbatim copy of https://llvm.org/LICENSE.txt
+================================================================================
+
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+    1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+    2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+    3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+    4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+    5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+    6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+    7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+    8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+    9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+    END OF TERMS AND CONDITIONS
+
+    APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+    Copyright [yyyy] [name of copyright owner]
+
+    Licensed under the Apache License, Version 2.0 (the "License");
+    you may not use this file except in compliance with the License.
+    You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software
+    distributed under the License is distributed on an "AS IS" BASIS,
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and
+    limitations under the License.
+
+
+--- LLVM Exceptions to the Apache 2.0 License ----
+
+As an exception, if, as a result of your compiling your source code, portions
+of this Software are embedded into an Object form of such source code, you
+may redistribute such embedded portions in such Object form without complying
+with the conditions of Sections 4(a), 4(b) and 4(d) of the License.
+
+In addition, if you combine or link compiled forms of this Software with
+software that is licensed under the GPLv2 ("Combined Software") and if a
+court of competent jurisdiction determines that the patent provision (Section
+3), the indemnity provision (Section 9) or other Section of the License
+conflicts with the conditions of the GPLv2, you may retroactively and
+prospectively choose to deem waived or otherwise exclude such Section(s) of
+the License, but only in their entirety and only with respect to the Combined
+Software.
+
+--------------------------------------------------------------------------------
+
+--------------------------------------------------------------------------------
+== nsync
+
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright [yyyy] [name of copyright owner]
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+
+--------------------------------------------------------------------------------
+
+--------------------------------------------------------------------------------
+== numpy
+
+Copyright (c) 2005-2019, NumPy Developers.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+    * Redistributions of source code must retain the above copyright
+       notice, this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above
+       copyright notice, this list of conditions and the following
+       disclaimer in the documentation and/or other materials provided
+       with the distribution.
+
+    * Neither the name of the NumPy Developers nor the names of any
+       contributors may be used to endorse or promote products derived
+       from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+numpy/core/src/multiarray/dragon4.c:
+------------------------------------
+
+/*
+ * Copyright (c) 2014 Ryan Juckett
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+/*
+ * This file contains a modified version of Ryan Juckett's Dragon4
+ * implementation, obtained from http://www.ryanjuckett.com,
+ * which has been ported from C++ to C and which has
+ * modifications specific to printing floats in numpy.
+ *
+ * Ryan Juckett's original code was under the Zlib license; he gave numpy
+ * permission to include it under the MIT license instead.
+ */
+
+numpy/core/src/npymath/ieee754.c.src:
+-------------------------------------
+/*
+ * nextafter code taken from BSD math lib, the code contains the following
+ * notice:
+ *
+ * ====================================================
+ * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
+ *
+ * Developed at SunPro, a Sun Microsystems, Inc. business.
+ * Permission to use, copy, modify, and distribute this
+ * software is freely granted, provided that this notice
+ * is preserved.
+ * ====================================================
+ */
+
+
+numpy/core/src/npymath/npy_math_complex.c.src:
+----------------------------------------------
+
+ Most of the code is taken from the msun library in FreeBSD (HEAD @ 4th
+ October 2013), under the following license:
+
+ Copyright (c) 2007, 2011 David Schultz <das@FreeBSD.ORG>
+ Copyright (c) 2012 Stephen Montgomery-Smith <stephen@FreeBSD.ORG>
+ All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ 1. Redistributions of source code must retain the above copyright
+    notice, this list of conditions and the following disclaimer.
+ 2. Redistributions in binary form must reproduce the above copyright
+    notice, this list of conditions and the following disclaimer in the
+    documentation and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ SUCH DAMAGE.
+
+
+numpy/doc/sphinxext/:
+--------------------
+
+-------------------------------------------------------------------------------
+    The files
+    - numpydoc.py
+    - docscrape.py
+    - docscrape_sphinx.py
+    - phantom_import.py
+    have the following license:
+
+Copyright (C) 2008 Stefan van der Walt <stefan@mentat.za.net>, Pauli Virtanen <pav@iki.fi>
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+ 1. Redistributions of source code must retain the above copyright
+    notice, this list of conditions and the following disclaimer.
+ 2. Redistributions in binary form must reproduce the above copyright
+    notice, this list of conditions and the following disclaimer in
+    the documentation and/or other materials provided with the
+    distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT,
+INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
+IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+
+-------------------------------------------------------------------------------
+    The files
+    - compiler_unparse.py
+    - comment_eater.py
+    - traitsdoc.py
+    have the following license:
+
+This software is OSI Certified Open Source Software.
+OSI Certified is a certification mark of the Open Source Initiative.
+
+Copyright (c) 2006, Enthought, Inc.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice, this
+   list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright notice,
+   this list of conditions and the following disclaimer in the documentation
+   and/or other materials provided with the distribution.
+ * Neither the name of Enthought, Inc. nor the names of its contributors may
+   be used to endorse or promote products derived from this software without
+   specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+-------------------------------------------------------------------------------
+    The file
+    - plot_directive.py
+    originates from Matplotlib (http://matplotlib.sf.net/) which has
+    the following license:
+
+Copyright (c) 2002-2008 John D. Hunter; All Rights Reserved.
+
+1. This LICENSE AGREEMENT is between John D. Hunter (“JDH”), and the Individual or Organization (“Licensee”) accessing and otherwise using matplotlib software in source or binary form and its associated documentation.
+
+2. Subject to the terms and conditions of this License Agreement, JDH hereby grants Licensee a nonexclusive, royalty-free, world-wide license to reproduce, analyze, test, perform and/or display publicly, prepare derivative works, distribute, and otherwise use matplotlib 0.98.3 alone or in any derivative version, provided, however, that JDH’s License Agreement and JDH’s notice of copyright, i.e., “Copyright (c) 2002-2008 John D. Hunter; All Rights Reserved” are retained in matplotlib 0.98.3 alone or in any derivative version prepared by Licensee.
+
+3. In the event Licensee prepares a derivative work that is based on or incorporates matplotlib 0.98.3 or any part thereof, and wants to make the derivative work available to others as provided herein, then Licensee hereby agrees to include in any such work a brief summary of the changes made to matplotlib 0.98.3.
+
+4. JDH is making matplotlib 0.98.3 available to Licensee on an “AS IS” basis. JDH MAKES NO REPRESENTATIONS OR WARRANTIES, EXPRESS OR IMPLIED. BY WAY OF EXAMPLE, BUT NOT LIMITATION, JDH MAKES NO AND DISCLAIMS ANY REPRESENTATION OR WARRANTY OF MERCHANTABILITY OR FITNESS FOR ANY PARTICULAR PURPOSE OR THAT THE USE OF MATPLOTLIB 0.98.3 WILL NOT INFRINGE ANY THIRD PARTY RIGHTS.
+
+5. JDH SHALL NOT BE LIABLE TO LICENSEE OR ANY OTHER USERS OF MATPLOTLIB 0.98.3 FOR ANY INCIDENTAL, SPECIAL, OR CONSEQUENTIAL DAMAGES OR LOSS AS A RESULT OF MODIFYING, DISTRIBUTING, OR OTHERWISE USING MATPLOTLIB 0.98.3, OR ANY DERIVATIVE THEREOF, EVEN IF ADVISED OF THE POSSIBILITY THEREOF.
+
+6. This License Agreement will automatically terminate upon a material breach of its terms and conditions.
+
+7. Nothing in this License Agreement shall be deemed to create any relationship of agency, partnership, or joint venture between JDH and Licensee. This License Agreement does not grant permission to use JDH trademarks or trade name in a trademark sense to endorse or promote products or services of Licensee, or any third party.
+
+8. By copying, installing or otherwise using matplotlib 0.98.3, Licensee agrees to be bound by the terms and conditions of this License Agreement.
+
+
+numpy/linalg/lapack_lite/:
+--------------------------
+
+Copyright (c) 1992-2013 The University of Tennessee and The University
+                        of Tennessee Research Foundation.  All rights
+                        reserved.
+Copyright (c) 2000-2013 The University of California Berkeley. All
+                        rights reserved.
+Copyright (c) 2006-2013 The University of Colorado Denver.  All rights
+                        reserved.
+
+$COPYRIGHT$
+
+Additional copyrights may follow
+
+$HEADER$
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+- Redistributions of source code must retain the above copyright
+  notice, this list of conditions and the following disclaimer.
+
+- Redistributions in binary form must reproduce the above copyright
+  notice, this list of conditions and the following disclaimer listed
+  in this license in the documentation and/or other materials
+  provided with the distribution.
+
+- Neither the name of the copyright holders nor the names of its
+  contributors may be used to endorse or promote products derived from
+  this software without specific prior written permission.
+
+The copyright holders provide no reassurances that the source code
+provided does not infringe any patent, copyright, or any other
+intellectual property rights of third parties.  The copyright holders
+disclaim any liability to any recipient for claims brought against
+recipient by any third party for infringement of that parties
+intellectual property rights.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+numpy/random:
+-------------
+
+**This software is dual-licensed under The University of Illinois/NCSA
+Open Source License (NCSA) and The 3-Clause BSD License**
+
+# NCSA Open Source License
+**Copyright (c) 2019 Kevin Sheppard. All rights reserved.**
+
+Developed by: Kevin Sheppard (<kevin.sheppard@economics.ox.ac.uk>,
+<kevin.k.sheppard@gmail.com>)
+[http://www.kevinsheppard.com](http://www.kevinsheppard.com)
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of
+this software and associated documentation files (the "Software"), to deal with
+the Software without restriction, including without limitation the rights to
+use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
+of the Software, and to permit persons to whom the Software is furnished to do
+so, subject to the following conditions:
+
+Redistributions of source code must retain the above copyright notice, this
+list of conditions and the following disclaimers.
+
+Redistributions in binary form must reproduce the above copyright notice, this
+list of conditions and the following disclaimers in the documentation and/or
+other materials provided with the distribution.
+
+Neither the names of Kevin Sheppard, nor the names of any contributors may be
+used to endorse or promote products derived from this Software without specific
+prior written permission.
+
+**THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH
+THE SOFTWARE.**
+
+
+# 3-Clause BSD License
+**Copyright (c) 2019 Kevin Sheppard. All rights reserved.**
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+1. Redistributions of source code must retain the above copyright notice,
+   this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright notice,
+   this list of conditions and the following disclaimer in the documentation
+   and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its contributors
+   may be used to endorse or promote products derived from this software
+   without specific prior written permission.
+
+**THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+THE POSSIBILITY OF SUCH DAMAGE.**
+
+# Components
+
+Many parts of this module have been derived from original sources, 
+often the algorithm's designer. Component licenses are located with 
+the component code.
+
+
+numpy/random/src/mt19937:
+-------------------------
+
+Copyright (c) 2003-2005, Jean-Sebastien Roy (js@jeannot.org)
+
+The rk_random and rk_seed functions algorithms and the original design of
+the Mersenne Twister RNG:
+
+  Copyright (C) 1997 - 2002, Makoto Matsumoto and Takuji Nishimura,
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+  1. Redistributions of source code must retain the above copyright
+  notice, this list of conditions and the following disclaimer.
+
+  2. Redistributions in binary form must reproduce the above copyright
+  notice, this list of conditions and the following disclaimer in the
+  documentation and/or other materials provided with the distribution.
+
+  3. The names of its contributors may not be used to endorse or promote
+  products derived from this software without specific prior written
+  permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER
+OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+Original algorithm for the implementation of rk_interval function from
+Richard J. Wagner's implementation of the Mersenne Twister RNG, optimised by
+Magnus Jonsson.
+
+Constants used in the rk_double implementation by Isaku Wada.
+
+Permission is hereby granted, free of charge, to any person obtaining a
+copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice shall be included
+in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+
+numpy/random/src/pcg64/:
+------------------------
+
+/*
+ * PCG64 Random Number Generation for C.
+ *
+ * Copyright 2014 Melissa O'Neill <oneill@pcg-random.org>
+ * Copyright 2015 Robert Kern <robert.kern@gmail.com>
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * For additional information about the PCG random number generation scheme,
+ * including its license and other licensing options, visit
+ *
+ *     http://www.pcg-random.org
+ *
+ * Relicensed MIT in May 2019
+ *
+ * The MIT License
+ *
+ * PCG Random Number Generation for C.
+ *
+ * Copyright 2014 Melissa O'Neill <oneill@pcg-random.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
+ * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+ * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
+ * IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+Full text of Apache license:
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright [yyyy] [name of copyright owner]
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+
+
+numpy/testing/_private/parameterized.py:
+----------------------------------------
+
+tl;dr: all code is licensed under simplified BSD, unless stated otherwise.
+
+Unless stated otherwise in the source files, all code is copyright 2010 David
+Wolever <david@wolever.net>. All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+   1. Redistributions of source code must retain the above copyright notice,
+   this list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright notice,
+   this list of conditions and the following disclaimer in the documentation
+   and/or other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY <COPYRIGHT HOLDER> ``AS IS'' AND ANY EXPRESS OR
+IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
+EVENT SHALL <COPYRIGHT HOLDER> OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
+INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
+OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+The views and conclusions contained in the software and documentation are those
+of the authors and should not be interpreted as representing official policies,
+either expressed or implied, of David Wolever.
+
+
+numpy/tools/npy_tempita/:
+-------------------------
+
+Copyright (c) 2008 Ian Bicking and Contributors
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+
+numpy/core/include/numpy/libdivide/*:
+-------------------------------------
+  zlib License
+  ------------
+
+  Copyright (C) 2010 - 2019 ridiculous_fish, <libdivide@ridiculousfish.com>
+  Copyright (C) 2016 - 2019 Kim Walisch, <kim.walisch@gmail.com>
+
+  This software is provided 'as-is', without any express or implied
+  warranty.  In no event will the authors be held liable for any damages
+  arising from the use of this software.
+
+  Permission is granted to anyone to use this software for any purpose,
+  including commercial applications, and to alter it and redistribute it
+  freely, subject to the following restrictions:
+
+  1. The origin of this software must not be misrepresented; you must not
+     claim that you wrote the original software. If you use this software
+     in a product, an acknowledgment in the product documentation would be
+     appreciated but is not required.
+  2. Altered source versions must be plainly marked as such, and must not be
+     misrepresented as being the original software.
+  3. This notice may not be removed or altered from any source distribution.
+
+--------------------------------------------------------------------------------
+
+--------------------------------------------------------------------------------
+== onednn
+
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   ============================================================================
+
+   Copyright 2016-2023 Intel Corporation
+   Copyright 2018 YANDEX LLC
+   Copyright 2019-2023 FUJITSU LIMITED
+   Copyright 2020-2023 Arm Ltd. and affiliates
+   Copyright 2020-2022 Codeplay Software Limited
+   Copyright 2021 Alanna Tempest
+   Copyright 2022-2023 IBM Corporation
+   Copyright 2023 KNS Group LLC (YADRO)
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+
+   This distribution includes third party software ("third party programs").
+   This third party software, even if included with the distribution of
+   the Intel software, may be governed by separate license terms, including
+   without limitation, third party license terms, other Intel software license
+   terms, and open source software license terms. These separate license terms
+   govern your use of the third party programs as set forth in the
+   "THIRD-PARTY-PROGRAMS" file.
+
+--------------------------------------------------------------------------------
+
+--------------------------------------------------------------------------------
+== opt_einsum
+
+The MIT License (MIT)
+
+Copyright (c) 2014 Daniel Smith
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
+--------------------------------------------------------------------------------
+
+--------------------------------------------------------------------------------
+== png
+
+COPYRIGHT NOTICE, DISCLAIMER, and LICENSE
+=========================================
+
+PNG Reference Library License version 2
+---------------------------------------
+
+ * Copyright (c) 1995-2022 The PNG Reference Library Authors.
+ * Copyright (c) 2018-2022 Cosmin Truta.
+ * Copyright (c) 2000-2002, 2004, 2006-2018 Glenn Randers-Pehrson.
+ * Copyright (c) 1996-1997 Andreas Dilger.
+ * Copyright (c) 1995-1996 Guy Eric Schalnat, Group 42, Inc.
+
+The software is supplied "as is", without warranty of any kind,
+express or implied, including, without limitation, the warranties
+of merchantability, fitness for a particular purpose, title, and
+non-infringement.  In no event shall the Copyright owners, or
+anyone distributing the software, be liable for any damages or
+other liability, whether in contract, tort or otherwise, arising
+from, out of, or in connection with the software, or the use or
+other dealings in the software, even if advised of the possibility
+of such damage.
+
+Permission is hereby granted to use, copy, modify, and distribute
+this software, or portions hereof, for any purpose, without fee,
+subject to the following restrictions:
+
+ 1. The origin of this software must not be misrepresented; you
+    must not claim that you wrote the original software.  If you
+    use this software in a product, an acknowledgment in the product
+    documentation would be appreciated, but is not required.
+
+ 2. Altered source versions must be plainly marked as such, and must
+    not be misrepresented as being the original software.
+
+ 3. This Copyright notice may not be removed or altered from any
+    source or altered source distribution.
+
+
+PNG Reference Library License version 1 (for libpng 0.5 through 1.6.35)
+-----------------------------------------------------------------------
+
+libpng versions 1.0.7, July 1, 2000, through 1.6.35, July 15, 2018 are
+Copyright (c) 2000-2002, 2004, 2006-2018 Glenn Randers-Pehrson, are
+derived from libpng-1.0.6, and are distributed according to the same
+disclaimer and license as libpng-1.0.6 with the following individuals
+added to the list of Contributing Authors:
+
+    Simon-Pierre Cadieux
+    Eric S. Raymond
+    Mans Rullgard
+    Cosmin Truta
+    Gilles Vollant
+    James Yu
+    Mandar Sahastrabuddhe
+    Google Inc.
+    Vadim Barkov
+
+and with the following additions to the disclaimer:
+
+    There is no warranty against interference with your enjoyment of
+    the library or against infringement.  There is no warranty that our
+    efforts or the library will fulfill any of your particular purposes
+    or needs.  This library is provided with all faults, and the entire
+    risk of satisfactory quality, performance, accuracy, and effort is
+    with the user.
+
+Some files in the "contrib" directory and some configure-generated
+files that are distributed with libpng have other copyright owners, and
+are released under other open source licenses.
+
+libpng versions 0.97, January 1998, through 1.0.6, March 20, 2000, are
+Copyright (c) 1998-2000 Glenn Randers-Pehrson, are derived from
+libpng-0.96, and are distributed according to the same disclaimer and
+license as libpng-0.96, with the following individuals added to the
+list of Contributing Authors:
+
+    Tom Lane
+    Glenn Randers-Pehrson
+    Willem van Schaik
+
+libpng versions 0.89, June 1996, through 0.96, May 1997, are
+Copyright (c) 1996-1997 Andreas Dilger, are derived from libpng-0.88,
+and are distributed according to the same disclaimer and license as
+libpng-0.88, with the following individuals added to the list of
+Contributing Authors:
+
+    John Bowler
+    Kevin Bracey
+    Sam Bushell
+    Magnus Holmgren
+    Greg Roelofs
+    Tom Tanner
+
+Some files in the "scripts" directory have other copyright owners,
+but are released under this license.
+
+libpng versions 0.5, May 1995, through 0.88, January 1996, are
+Copyright (c) 1995-1996 Guy Eric Schalnat, Group 42, Inc.
+
+For the purposes of this copyright and license, "Contributing Authors"
+is defined as the following set of individuals:
+
+    Andreas Dilger
+    Dave Martindale
+    Guy Eric Schalnat
+    Paul Schmidt
+    Tim Wegner
+
+The PNG Reference Library is supplied "AS IS".  The Contributing
+Authors and Group 42, Inc. disclaim all warranties, expressed or
+implied, including, without limitation, the warranties of
+merchantability and of fitness for any purpose.  The Contributing
+Authors and Group 42, Inc. assume no liability for direct, indirect,
+incidental, special, exemplary, or consequential damages, which may
+result from the use of the PNG Reference Library, even if advised of
+the possibility of such damage.
+
+Permission is hereby granted to use, copy, modify, and distribute this
+source code, or portions hereof, for any purpose, without fee, subject
+to the following restrictions:
+
+ 1. The origin of this source code must not be misrepresented.
+
+ 2. Altered versions must be plainly marked as such and must not
+    be misrepresented as being the original source.
+
+ 3. This Copyright notice may not be removed or altered from any
+    source or altered source distribution.
+
+The Contributing Authors and Group 42, Inc. specifically permit,
+without fee, and encourage the use of this source code as a component
+to supporting the PNG file format in commercial products.  If you use
+this source code in a product, acknowledgment is not required but would
+be appreciated.
+
+--------------------------------------------------------------------------------
+
+--------------------------------------------------------------------------------
+== protobuf
+
+Copyright 2008, Google Inc.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+    * Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above
+copyright notice, this list of conditions and the following disclaimer
+in the documentation and/or other materials provided with the
+distribution.
+    * Neither the name of Google Inc. nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+The above license applies to all files in this directory and
+subdirectories, with exceptions noted below.
+
+Code generated by the Protocol Buffer compiler is owned by the owner
+of the input file used when generating it.  This code is not
+standalone and requires a support library to be linked with it.  This
+support library is itself covered by the above license.
+
+===========================================================================
+rust/utf8.rs
+===========================================================================
+Permission is hereby granted, free of charge, to any
+person obtaining a copy of this software and associated
+documentation files (the "Software"), to deal in the
+Software without restriction, including without
+limitation the rights to use, copy, modify, merge,
+publish, distribute, sublicense, and/or sell copies of
+the Software, and to permit persons to whom the Software
+is furnished to do so, subject to the following
+conditions:
+
+The above copyright notice and this permission notice
+shall be included in all copies or substantial portions
+of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF
+ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED
+TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
+PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT
+SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR
+IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE.
+
+--------------------------------------------------------------------------------
+
+--------------------------------------------------------------------------------
+== pybind11
+
+Copyright (c) 2016 Wenzel Jakob <wenzel.jakob@epfl.ch>, All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+1. Redistributions of source code must retain the above copyright notice, this
+   list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright notice,
+   this list of conditions and the following disclaimer in the documentation
+   and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its contributors
+   may be used to endorse or promote products derived from this software
+   without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+Please also refer to the file .github/CONTRIBUTING.md, which clarifies licensing of
+external contributions to this project including patches, pull requests, etc.
+
+--------------------------------------------------------------------------------
+
+--------------------------------------------------------------------------------
+== Python
+
+A. HISTORY OF THE SOFTWARE
+==========================
+
+Python was created in the early 1990s by Guido van Rossum at Stichting
+Mathematisch Centrum (CWI, see https://www.cwi.nl) in the Netherlands
+as a successor of a language called ABC.  Guido remains Python's
+principal author, although it includes many contributions from others.
+
+In 1995, Guido continued his work on Python at the Corporation for
+National Research Initiatives (CNRI, see https://www.cnri.reston.va.us)
+in Reston, Virginia where he released several versions of the
+software.
+
+In May 2000, Guido and the Python core development team moved to
+BeOpen.com to form the BeOpen PythonLabs team.  In October of the same
+year, the PythonLabs team moved to Digital Creations, which became
+Zope Corporation.  In 2001, the Python Software Foundation (PSF, see
+https://www.python.org/psf/) was formed, a non-profit organization
+created specifically to own Python-related Intellectual Property.
+Zope Corporation was a sponsoring member of the PSF.
+
+All Python releases are Open Source (see https://opensource.org for
+the Open Source Definition).  Historically, most, but not all, Python
+releases have also been GPL-compatible; the table below summarizes
+the various releases.
+
+    Release         Derived     Year        Owner       GPL-
+                    from                                compatible? (1)
+
+    0.9.0 thru 1.2              1991-1995   CWI         yes
+    1.3 thru 1.5.2  1.2         1995-1999   CNRI        yes
+    1.6             1.5.2       2000        CNRI        no
+    2.0             1.6         2000        BeOpen.com  no
+    1.6.1           1.6         2001        CNRI        yes (2)
+    2.1             2.0+1.6.1   2001        PSF         no
+    2.0.1           2.0+1.6.1   2001        PSF         yes
+    2.1.1           2.1+2.0.1   2001        PSF         yes
+    2.1.2           2.1.1       2002        PSF         yes
+    2.1.3           2.1.2       2002        PSF         yes
+    2.2 and above   2.1.1       2001-now    PSF         yes
+
+Footnotes:
+
+(1) GPL-compatible doesn't mean that we're distributing Python under
+    the GPL.  All Python licenses, unlike the GPL, let you distribute
+    a modified version without making your changes open source.  The
+    GPL-compatible licenses make it possible to combine Python with
+    other software that is released under the GPL; the others don't.
+
+(2) According to Richard Stallman, 1.6.1 is not GPL-compatible,
+    because its license has a choice of law clause.  According to
+    CNRI, however, Stallman's lawyer has told CNRI's lawyer that 1.6.1
+    is "not incompatible" with the GPL.
+
+Thanks to the many outside volunteers who have worked under Guido's
+direction to make these releases possible.
+
+
+B. TERMS AND CONDITIONS FOR ACCESSING OR OTHERWISE USING PYTHON
+===============================================================
+
+Python software and documentation are licensed under the
+Python Software Foundation License Version 2.
+
+Starting with Python 3.8.6, examples, recipes, and other code in
+the documentation are dual licensed under the PSF License Version 2
+and the Zero-Clause BSD license.
+
+Some software incorporated into Python is under different licenses.
+The licenses are listed with code falling under that license.
+
+
+PYTHON SOFTWARE FOUNDATION LICENSE VERSION 2
+--------------------------------------------
+
+1. This LICENSE AGREEMENT is between the Python Software Foundation
+("PSF"), and the Individual or Organization ("Licensee") accessing and
+otherwise using this software ("Python") in source or binary form and
+its associated documentation.
+
+2. Subject to the terms and conditions of this License Agreement, PSF hereby
+grants Licensee a nonexclusive, royalty-free, world-wide license to reproduce,
+analyze, test, perform and/or display publicly, prepare derivative works,
+distribute, and otherwise use Python alone or in any derivative version,
+provided, however, that PSF's License Agreement and PSF's notice of copyright,
+i.e., "Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010,
+2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023 Python Software Foundation;
+All Rights Reserved" are retained in Python alone or in any derivative version
+prepared by Licensee.
+
+3. In the event Licensee prepares a derivative work that is based on
+or incorporates Python or any part thereof, and wants to make
+the derivative work available to others as provided herein, then
+Licensee hereby agrees to include in any such work a brief summary of
+the changes made to Python.
+
+4. PSF is making Python available to Licensee on an "AS IS"
+basis.  PSF MAKES NO REPRESENTATIONS OR WARRANTIES, EXPRESS OR
+IMPLIED.  BY WAY OF EXAMPLE, BUT NOT LIMITATION, PSF MAKES NO AND
+DISCLAIMS ANY REPRESENTATION OR WARRANTY OF MERCHANTABILITY OR FITNESS
+FOR ANY PARTICULAR PURPOSE OR THAT THE USE OF PYTHON WILL NOT
+INFRINGE ANY THIRD PARTY RIGHTS.
+
+5. PSF SHALL NOT BE LIABLE TO LICENSEE OR ANY OTHER USERS OF PYTHON
+FOR ANY INCIDENTAL, SPECIAL, OR CONSEQUENTIAL DAMAGES OR LOSS AS
+A RESULT OF MODIFYING, DISTRIBUTING, OR OTHERWISE USING PYTHON,
+OR ANY DERIVATIVE THEREOF, EVEN IF ADVISED OF THE POSSIBILITY THEREOF.
+
+6. This License Agreement will automatically terminate upon a material
+breach of its terms and conditions.
+
+7. Nothing in this License Agreement shall be deemed to create any
+relationship of agency, partnership, or joint venture between PSF and
+Licensee.  This License Agreement does not grant permission to use PSF
+trademarks or trade name in a trademark sense to endorse or promote
+products or services of Licensee, or any third party.
+
+8. By copying, installing or otherwise using Python, Licensee
+agrees to be bound by the terms and conditions of this License
+Agreement.
+
+
+BEOPEN.COM LICENSE AGREEMENT FOR PYTHON 2.0
+-------------------------------------------
+
+BEOPEN PYTHON OPEN SOURCE LICENSE AGREEMENT VERSION 1
+
+1. This LICENSE AGREEMENT is between BeOpen.com ("BeOpen"), having an
+office at 160 Saratoga Avenue, Santa Clara, CA 95051, and the
+Individual or Organization ("Licensee") accessing and otherwise using
+this software in source or binary form and its associated
+documentation ("the Software").
+
+2. Subject to the terms and conditions of this BeOpen Python License
+Agreement, BeOpen hereby grants Licensee a non-exclusive,
+royalty-free, world-wide license to reproduce, analyze, test, perform
+and/or display publicly, prepare derivative works, distribute, and
+otherwise use the Software alone or in any derivative version,
+provided, however, that the BeOpen Python License is retained in the
+Software, alone or in any derivative version prepared by Licensee.
+
+3. BeOpen is making the Software available to Licensee on an "AS IS"
+basis.  BEOPEN MAKES NO REPRESENTATIONS OR WARRANTIES, EXPRESS OR
+IMPLIED.  BY WAY OF EXAMPLE, BUT NOT LIMITATION, BEOPEN MAKES NO AND
+DISCLAIMS ANY REPRESENTATION OR WARRANTY OF MERCHANTABILITY OR FITNESS
+FOR ANY PARTICULAR PURPOSE OR THAT THE USE OF THE SOFTWARE WILL NOT
+INFRINGE ANY THIRD PARTY RIGHTS.
+
+4. BEOPEN SHALL NOT BE LIABLE TO LICENSEE OR ANY OTHER USERS OF THE
+SOFTWARE FOR ANY INCIDENTAL, SPECIAL, OR CONSEQUENTIAL DAMAGES OR LOSS
+AS A RESULT OF USING, MODIFYING OR DISTRIBUTING THE SOFTWARE, OR ANY
+DERIVATIVE THEREOF, EVEN IF ADVISED OF THE POSSIBILITY THEREOF.
+
+5. This License Agreement will automatically terminate upon a material
+breach of its terms and conditions.
+
+6. This License Agreement shall be governed by and interpreted in all
+respects by the law of the State of California, excluding conflict of
+law provisions.  Nothing in this License Agreement shall be deemed to
+create any relationship of agency, partnership, or joint venture
+between BeOpen and Licensee.  This License Agreement does not grant
+permission to use BeOpen trademarks or trade names in a trademark
+sense to endorse or promote products or services of Licensee, or any
+third party.  As an exception, the "BeOpen Python" logos available at
+http://www.pythonlabs.com/logos.html may be used according to the
+permissions granted on that web page.
+
+7. By copying, installing or otherwise using the software, Licensee
+agrees to be bound by the terms and conditions of this License
+Agreement.
+
+
+CNRI LICENSE AGREEMENT FOR PYTHON 1.6.1
+---------------------------------------
+
+1. This LICENSE AGREEMENT is between the Corporation for National
+Research Initiatives, having an office at 1895 Preston White Drive,
+Reston, VA 20191 ("CNRI"), and the Individual or Organization
+("Licensee") accessing and otherwise using Python 1.6.1 software in
+source or binary form and its associated documentation.
+
+2. Subject to the terms and conditions of this License Agreement, CNRI
+hereby grants Licensee a nonexclusive, royalty-free, world-wide
+license to reproduce, analyze, test, perform and/or display publicly,
+prepare derivative works, distribute, and otherwise use Python 1.6.1
+alone or in any derivative version, provided, however, that CNRI's
+License Agreement and CNRI's notice of copyright, i.e., "Copyright (c)
+1995-2001 Corporation for National Research Initiatives; All Rights
+Reserved" are retained in Python 1.6.1 alone or in any derivative
+version prepared by Licensee.  Alternately, in lieu of CNRI's License
+Agreement, Licensee may substitute the following text (omitting the
+quotes): "Python 1.6.1 is made available subject to the terms and
+conditions in CNRI's License Agreement.  This Agreement together with
+Python 1.6.1 may be located on the internet using the following
+unique, persistent identifier (known as a handle): 1895.22/1013.  This
+Agreement may also be obtained from a proxy server on the internet
+using the following URL: http://hdl.handle.net/1895.22/1013".
+
+3. In the event Licensee prepares a derivative work that is based on
+or incorporates Python 1.6.1 or any part thereof, and wants to make
+the derivative work available to others as provided herein, then
+Licensee hereby agrees to include in any such work a brief summary of
+the changes made to Python 1.6.1.
+
+4. CNRI is making Python 1.6.1 available to Licensee on an "AS IS"
+basis.  CNRI MAKES NO REPRESENTATIONS OR WARRANTIES, EXPRESS OR
+IMPLIED.  BY WAY OF EXAMPLE, BUT NOT LIMITATION, CNRI MAKES NO AND
+DISCLAIMS ANY REPRESENTATION OR WARRANTY OF MERCHANTABILITY OR FITNESS
+FOR ANY PARTICULAR PURPOSE OR THAT THE USE OF PYTHON 1.6.1 WILL NOT
+INFRINGE ANY THIRD PARTY RIGHTS.
+
+5. CNRI SHALL NOT BE LIABLE TO LICENSEE OR ANY OTHER USERS OF PYTHON
+1.6.1 FOR ANY INCIDENTAL, SPECIAL, OR CONSEQUENTIAL DAMAGES OR LOSS AS
+A RESULT OF MODIFYING, DISTRIBUTING, OR OTHERWISE USING PYTHON 1.6.1,
+OR ANY DERIVATIVE THEREOF, EVEN IF ADVISED OF THE POSSIBILITY THEREOF.
+
+6. This License Agreement will automatically terminate upon a material
+breach of its terms and conditions.
+
+7. This License Agreement shall be governed by the federal
+intellectual property law of the United States, including without
+limitation the federal copyright law, and, to the extent such
+U.S. federal law does not apply, by the law of the Commonwealth of
+Virginia, excluding Virginia's conflict of law provisions.
+Notwithstanding the foregoing, with regard to derivative works based
+on Python 1.6.1 that incorporate non-separable material that was
+previously distributed under the GNU General Public License (GPL), the
+law of the Commonwealth of Virginia shall govern this License
+Agreement only as to issues arising under or with respect to
+Paragraphs 4, 5, and 7 of this License Agreement.  Nothing in this
+License Agreement shall be deemed to create any relationship of
+agency, partnership, or joint venture between CNRI and Licensee.  This
+License Agreement does not grant permission to use CNRI trademarks or
+trade name in a trademark sense to endorse or promote products or
+services of Licensee, or any third party.
+
+8. By clicking on the "ACCEPT" button where indicated, or by copying,
+installing or otherwise using Python 1.6.1, Licensee agrees to be
+bound by the terms and conditions of this License Agreement.
+
+        ACCEPT
+
+
+CWI LICENSE AGREEMENT FOR PYTHON 0.9.0 THROUGH 1.2
+--------------------------------------------------
+
+Copyright (c) 1991 - 1995, Stichting Mathematisch Centrum Amsterdam,
+The Netherlands.  All rights reserved.
+
+Permission to use, copy, modify, and distribute this software and its
+documentation for any purpose and without fee is hereby granted,
+provided that the above copyright notice appear in all copies and that
+both that copyright notice and this permission notice appear in
+supporting documentation, and that the name of Stichting Mathematisch
+Centrum or CWI not be used in advertising or publicity pertaining to
+distribution of the software without specific, written prior
+permission.
+
+STICHTING MATHEMATISCH CENTRUM DISCLAIMS ALL WARRANTIES WITH REGARD TO
+THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
+FITNESS, IN NO EVENT SHALL STICHTING MATHEMATISCH CENTRUM BE LIABLE
+FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
+OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+
+ZERO-CLAUSE BSD LICENSE FOR CODE IN THE PYTHON DOCUMENTATION
+----------------------------------------------------------------------
+
+Permission to use, copy, modify, and/or distribute this software for any
+purpose with or without fee is hereby granted.
+
+THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH
+REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY
+AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT,
+INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
+LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR
+OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
+PERFORMANCE OF THIS SOFTWARE.
+
+--------------------------------------------------------------------------------
+
+--------------------------------------------------------------------------------
+== re2
+
+// Copyright (c) 2009 The RE2 Authors. All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//    * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//    * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//    * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+--------------------------------------------------------------------------------
+
+--------------------------------------------------------------------------------
+== setuptools
+
+Copyright (C) 2016 Jason R Coombs <jaraco@jaraco.com>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of
+this software and associated documentation files (the "Software"), to deal in
+the Software without restriction, including without limitation the rights to
+use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
+of the Software, and to permit persons to whom the Software is furnished to do
+so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
+--------------------------------------------------------------------------------
+
+--------------------------------------------------------------------------------
+== six
+
+Copyright (c) 2010-2020 Benjamin Peterson
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of
+this software and associated documentation files (the "Software"), to deal in
+the Software without restriction, including without limitation the rights to
+use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
+the Software, and to permit persons to whom the Software is furnished to do so,
+subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
+FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
+IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+--------------------------------------------------------------------------------
+
+--------------------------------------------------------------------------------
+== snappy
+
+Copyright (c) 2011-2017, Andres Moreira <andres@andresmoreira.com>
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    * Neither the name of the authors nor the
+      names of its contributors may be used to endorse or promote products
+      derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL ANDRES MOREIRA BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+--------------------------------------------------------------------------------
+
+--------------------------------------------------------------------------------
+== stablehlo
+
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright [yyyy] [name of copyright owner]
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+
+--------------------------------------------------------------------------------
+
+--------------------------------------------------------------------------------
+== termcolor
+
+Copyright (c) 2008-2011 Volvox Development Team
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+
+--------------------------------------------------------------------------------
+
+--------------------------------------------------------------------------------
+== typing_extensions
+
+A. HISTORY OF THE SOFTWARE
+==========================
+
+Python was created in the early 1990s by Guido van Rossum at Stichting
+Mathematisch Centrum (CWI, see http://www.cwi.nl) in the Netherlands
+as a successor of a language called ABC.  Guido remains Python's
+principal author, although it includes many contributions from others.
+
+In 1995, Guido continued his work on Python at the Corporation for
+National Research Initiatives (CNRI, see http://www.cnri.reston.va.us)
+in Reston, Virginia where he released several versions of the
+software.
+
+In May 2000, Guido and the Python core development team moved to
+BeOpen.com to form the BeOpen PythonLabs team.  In October of the same
+year, the PythonLabs team moved to Digital Creations, which became
+Zope Corporation.  In 2001, the Python Software Foundation (PSF, see
+https://www.python.org/psf/) was formed, a non-profit organization
+created specifically to own Python-related Intellectual Property.
+Zope Corporation was a sponsoring member of the PSF.
+
+All Python releases are Open Source (see http://www.opensource.org for
+the Open Source Definition).  Historically, most, but not all, Python
+releases have also been GPL-compatible; the table below summarizes
+the various releases.
+
+    Release         Derived     Year        Owner       GPL-
+                    from                                compatible? (1)
+
+    0.9.0 thru 1.2              1991-1995   CWI         yes
+    1.3 thru 1.5.2  1.2         1995-1999   CNRI        yes
+    1.6             1.5.2       2000        CNRI        no
+    2.0             1.6         2000        BeOpen.com  no
+    1.6.1           1.6         2001        CNRI        yes (2)
+    2.1             2.0+1.6.1   2001        PSF         no
+    2.0.1           2.0+1.6.1   2001        PSF         yes
+    2.1.1           2.1+2.0.1   2001        PSF         yes
+    2.1.2           2.1.1       2002        PSF         yes
+    2.1.3           2.1.2       2002        PSF         yes
+    2.2 and above   2.1.1       2001-now    PSF         yes
+
+Footnotes:
+
+(1) GPL-compatible doesn't mean that we're distributing Python under
+    the GPL.  All Python licenses, unlike the GPL, let you distribute
+    a modified version without making your changes open source.  The
+    GPL-compatible licenses make it possible to combine Python with
+    other software that is released under the GPL; the others don't.
+
+(2) According to Richard Stallman, 1.6.1 is not GPL-compatible,
+    because its license has a choice of law clause.  According to
+    CNRI, however, Stallman's lawyer has told CNRI's lawyer that 1.6.1
+    is "not incompatible" with the GPL.
+
+Thanks to the many outside volunteers who have worked under Guido's
+direction to make these releases possible.
+
+
+B. TERMS AND CONDITIONS FOR ACCESSING OR OTHERWISE USING PYTHON
+===============================================================
+
+PYTHON SOFTWARE FOUNDATION LICENSE VERSION 2
+--------------------------------------------
+
+1. This LICENSE AGREEMENT is between the Python Software Foundation
+("PSF"), and the Individual or Organization ("Licensee") accessing and
+otherwise using this software ("Python") in source or binary form and
+its associated documentation.
+
+2. Subject to the terms and conditions of this License Agreement, PSF hereby
+grants Licensee a nonexclusive, royalty-free, world-wide license to reproduce,
+analyze, test, perform and/or display publicly, prepare derivative works,
+distribute, and otherwise use Python alone or in any derivative version,
+provided, however, that PSF's License Agreement and PSF's notice of copyright,
+i.e., "Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010,
+2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022 Python Software Foundation;
+All Rights Reserved" are retained in Python alone or in any derivative version
+prepared by Licensee.
+
+3. In the event Licensee prepares a derivative work that is based on
+or incorporates Python or any part thereof, and wants to make
+the derivative work available to others as provided herein, then
+Licensee hereby agrees to include in any such work a brief summary of
+the changes made to Python.
+
+4. PSF is making Python available to Licensee on an "AS IS"
+basis.  PSF MAKES NO REPRESENTATIONS OR WARRANTIES, EXPRESS OR
+IMPLIED.  BY WAY OF EXAMPLE, BUT NOT LIMITATION, PSF MAKES NO AND
+DISCLAIMS ANY REPRESENTATION OR WARRANTY OF MERCHANTABILITY OR FITNESS
+FOR ANY PARTICULAR PURPOSE OR THAT THE USE OF PYTHON WILL NOT
+INFRINGE ANY THIRD PARTY RIGHTS.
+
+5. PSF SHALL NOT BE LIABLE TO LICENSEE OR ANY OTHER USERS OF PYTHON
+FOR ANY INCIDENTAL, SPECIAL, OR CONSEQUENTIAL DAMAGES OR LOSS AS
+A RESULT OF MODIFYING, DISTRIBUTING, OR OTHERWISE USING PYTHON,
+OR ANY DERIVATIVE THEREOF, EVEN IF ADVISED OF THE POSSIBILITY THEREOF.
+
+6. This License Agreement will automatically terminate upon a material
+breach of its terms and conditions.
+
+7. Nothing in this License Agreement shall be deemed to create any
+relationship of agency, partnership, or joint venture between PSF and
+Licensee.  This License Agreement does not grant permission to use PSF
+trademarks or trade name in a trademark sense to endorse or promote
+products or services of Licensee, or any third party.
+
+8. By copying, installing or otherwise using Python, Licensee
+agrees to be bound by the terms and conditions of this License
+Agreement.
+
+
+BEOPEN.COM LICENSE AGREEMENT FOR PYTHON 2.0
+-------------------------------------------
+
+BEOPEN PYTHON OPEN SOURCE LICENSE AGREEMENT VERSION 1
+
+1. This LICENSE AGREEMENT is between BeOpen.com ("BeOpen"), having an
+office at 160 Saratoga Avenue, Santa Clara, CA 95051, and the
+Individual or Organization ("Licensee") accessing and otherwise using
+this software in source or binary form and its associated
+documentation ("the Software").
+
+2. Subject to the terms and conditions of this BeOpen Python License
+Agreement, BeOpen hereby grants Licensee a non-exclusive,
+royalty-free, world-wide license to reproduce, analyze, test, perform
+and/or display publicly, prepare derivative works, distribute, and
+otherwise use the Software alone or in any derivative version,
+provided, however, that the BeOpen Python License is retained in the
+Software, alone or in any derivative version prepared by Licensee.
+
+3. BeOpen is making the Software available to Licensee on an "AS IS"
+basis.  BEOPEN MAKES NO REPRESENTATIONS OR WARRANTIES, EXPRESS OR
+IMPLIED.  BY WAY OF EXAMPLE, BUT NOT LIMITATION, BEOPEN MAKES NO AND
+DISCLAIMS ANY REPRESENTATION OR WARRANTY OF MERCHANTABILITY OR FITNESS
+FOR ANY PARTICULAR PURPOSE OR THAT THE USE OF THE SOFTWARE WILL NOT
+INFRINGE ANY THIRD PARTY RIGHTS.
+
+4. BEOPEN SHALL NOT BE LIABLE TO LICENSEE OR ANY OTHER USERS OF THE
+SOFTWARE FOR ANY INCIDENTAL, SPECIAL, OR CONSEQUENTIAL DAMAGES OR LOSS
+AS A RESULT OF USING, MODIFYING OR DISTRIBUTING THE SOFTWARE, OR ANY
+DERIVATIVE THEREOF, EVEN IF ADVISED OF THE POSSIBILITY THEREOF.
+
+5. This License Agreement will automatically terminate upon a material
+breach of its terms and conditions.
+
+6. This License Agreement shall be governed by and interpreted in all
+respects by the law of the State of California, excluding conflict of
+law provisions.  Nothing in this License Agreement shall be deemed to
+create any relationship of agency, partnership, or joint venture
+between BeOpen and Licensee.  This License Agreement does not grant
+permission to use BeOpen trademarks or trade names in a trademark
+sense to endorse or promote products or services of Licensee, or any
+third party.  As an exception, the "BeOpen Python" logos available at
+http://www.pythonlabs.com/logos.html may be used according to the
+permissions granted on that web page.
+
+7. By copying, installing or otherwise using the software, Licensee
+agrees to be bound by the terms and conditions of this License
+Agreement.
+
+
+CNRI LICENSE AGREEMENT FOR PYTHON 1.6.1
+---------------------------------------
+
+1. This LICENSE AGREEMENT is between the Corporation for National
+Research Initiatives, having an office at 1895 Preston White Drive,
+Reston, VA 20191 ("CNRI"), and the Individual or Organization
+("Licensee") accessing and otherwise using Python 1.6.1 software in
+source or binary form and its associated documentation.
+
+2. Subject to the terms and conditions of this License Agreement, CNRI
+hereby grants Licensee a nonexclusive, royalty-free, world-wide
+license to reproduce, analyze, test, perform and/or display publicly,
+prepare derivative works, distribute, and otherwise use Python 1.6.1
+alone or in any derivative version, provided, however, that CNRI's
+License Agreement and CNRI's notice of copyright, i.e., "Copyright (c)
+1995-2001 Corporation for National Research Initiatives; All Rights
+Reserved" are retained in Python 1.6.1 alone or in any derivative
+version prepared by Licensee.  Alternately, in lieu of CNRI's License
+Agreement, Licensee may substitute the following text (omitting the
+quotes): "Python 1.6.1 is made available subject to the terms and
+conditions in CNRI's License Agreement.  This Agreement together with
+Python 1.6.1 may be located on the internet using the following
+unique, persistent identifier (known as a handle): 1895.22/1013.  This
+Agreement may also be obtained from a proxy server on the internet
+using the following URL: http://hdl.handle.net/1895.22/1013".
+
+3. In the event Licensee prepares a derivative work that is based on
+or incorporates Python 1.6.1 or any part thereof, and wants to make
+the derivative work available to others as provided herein, then
+Licensee hereby agrees to include in any such work a brief summary of
+the changes made to Python 1.6.1.
+
+4. CNRI is making Python 1.6.1 available to Licensee on an "AS IS"
+basis.  CNRI MAKES NO REPRESENTATIONS OR WARRANTIES, EXPRESS OR
+IMPLIED.  BY WAY OF EXAMPLE, BUT NOT LIMITATION, CNRI MAKES NO AND
+DISCLAIMS ANY REPRESENTATION OR WARRANTY OF MERCHANTABILITY OR FITNESS
+FOR ANY PARTICULAR PURPOSE OR THAT THE USE OF PYTHON 1.6.1 WILL NOT
+INFRINGE ANY THIRD PARTY RIGHTS.
+
+5. CNRI SHALL NOT BE LIABLE TO LICENSEE OR ANY OTHER USERS OF PYTHON
+1.6.1 FOR ANY INCIDENTAL, SPECIAL, OR CONSEQUENTIAL DAMAGES OR LOSS AS
+A RESULT OF MODIFYING, DISTRIBUTING, OR OTHERWISE USING PYTHON 1.6.1,
+OR ANY DERIVATIVE THEREOF, EVEN IF ADVISED OF THE POSSIBILITY THEREOF.
+
+6. This License Agreement will automatically terminate upon a material
+breach of its terms and conditions.
+
+7. This License Agreement shall be governed by the federal
+intellectual property law of the United States, including without
+limitation the federal copyright law, and, to the extent such
+U.S. federal law does not apply, by the law of the Commonwealth of
+Virginia, excluding Virginia's conflict of law provisions.
+Notwithstanding the foregoing, with regard to derivative works based
+on Python 1.6.1 that incorporate non-separable material that was
+previously distributed under the GNU General Public License (GPL), the
+law of the Commonwealth of Virginia shall govern this License
+Agreement only as to issues arising under or with respect to
+Paragraphs 4, 5, and 7 of this License Agreement.  Nothing in this
+License Agreement shall be deemed to create any relationship of
+agency, partnership, or joint venture between CNRI and Licensee.  This
+License Agreement does not grant permission to use CNRI trademarks or
+trade name in a trademark sense to endorse or promote products or
+services of Licensee, or any third party.
+
+8. By clicking on the "ACCEPT" button where indicated, or by copying,
+installing or otherwise using Python 1.6.1, Licensee agrees to be
+bound by the terms and conditions of this License Agreement.
+
+        ACCEPT
+
+
+CWI LICENSE AGREEMENT FOR PYTHON 0.9.0 THROUGH 1.2
+--------------------------------------------------
+
+Copyright (c) 1991 - 1995, Stichting Mathematisch Centrum Amsterdam,
+The Netherlands.  All rights reserved.
+
+Permission to use, copy, modify, and distribute this software and its
+documentation for any purpose and without fee is hereby granted,
+provided that the above copyright notice appear in all copies and that
+both that copyright notice and this permission notice appear in
+supporting documentation, and that the name of Stichting Mathematisch
+Centrum or CWI not be used in advertising or publicity pertaining to
+distribution of the software without specific, written prior
+permission.
+
+STICHTING MATHEMATISCH CENTRUM DISCLAIMS ALL WARRANTIES WITH REGARD TO
+THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
+FITNESS, IN NO EVENT SHALL STICHTING MATHEMATISCH CENTRUM BE LIABLE
+FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
+OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+
+--------------------------------------------------------------------------------
+
+--------------------------------------------------------------------------------
+== upb
+
+Copyright (c) 2009-2021, Google LLC
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    * Neither the name of Google LLC nor the names of any other
+      contributors may be used to endorse or promote products
+      derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY GOOGLE INC. ``AS IS'' AND ANY EXPRESS OR IMPLIED
+WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
+EVENT SHALL GOOGLE LLC BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+
+--------------------------------------------------------------------------------
+
+--------------------------------------------------------------------------------
+== wrapt
+
+Copyright (c) 2013-2023, Graham Dumpleton
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+* Redistributions of source code must retain the above copyright notice, this
+  list of conditions and the following disclaimer.
+
+* Redistributions in binary form must reproduce the above copyright notice,
+  this list of conditions and the following disclaimer in the documentation
+  and/or other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+
+--------------------------------------------------------------------------------
+
+--------------------------------------------------------------------------------
+== zlib
+
+ (C) 1995-2017 Jean-loup Gailly and Mark Adler
+
+  This software is provided 'as-is', without any express or implied
+  warranty.  In no event will the authors be held liable for any damages
+  arising from the use of this software.
+
+  Permission is granted to anyone to use this software for any purpose,
+  including commercial applications, and to alter it and redistribute it
+  freely, subject to the following restrictions:
+
+  1. The origin of this software must not be misrepresented; you must not
+     claim that you wrote the original software. If you use this software
+     in a product, an acknowledgment in the product documentation would be
+     appreciated but is not required.
+  2. Altered source versions must be plainly marked as such, and must not be
+     misrepresented as being the original software.
+  3. This notice may not be removed or altered from any source distribution.
+
+  Jean-loup Gailly        Mark Adler
+  jloup@gzip.org          madler@alumni.caltech.edu
+
+If you use the zlib library in a product, we would appreciate *not* receiving
+lengthy legal documents to sign.  The sources are provided for free but without
+warranty of any kind.  The library has been entirely written by Jean-loup
+Gailly and Mark Adler; it does not include third-party code.
+
+If you redistribute modified sources, we would appreciate that you include in
+the file ChangeLog history information documenting your changes.  Please read
+the FAQ for more information on the distribution of modified source versions.
+
+--------------------------------------------------------------------------------
+
+--------------------------------------------------------------------------------
+== gif
+
+The GIFLIB distribution is Copyright (c) 1997  Eric S. Raymond
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+
+--------------------------------------------------------------------------------
+
+--------------------------------------------------------------------------------
+== sqlite
+
+All of the code and documentation in SQLite has been dedicated to the public
+domain by the authors. All code authors, and representatives of the companies
+they work for, have signed affidavits dedicating their contributions to the
+public domain and originals of those signed affidavits are stored in a firesafe
+at the main offices of Hwaci. Anyone is free to copy, modify, publish, use,
+compile, sell, or distribute the original SQLite code, either in source code form
+or as a compiled binary, for any purpose, commercial or non-commercial, and by
+any means.
+
+The previous paragraph applies to the deliverable code and documentation in
+SQLite - those parts of the SQLite library that you actually bundle and ship
+with a larger application. Some scripts used as part of the build process (for
+example the "configure" scripts generated by autoconf) might fall under other
+open-source licenses. Nothing from these build scripts ever reaches the final
+deliverable SQLite library, however, and so the licenses associated with those
+scripts should not be a factor in assessing your rights to copy and use the
+SQLite library.
+
+All of the deliverable code in SQLite has been written from scratch. No code has
+been taken from other projects or from the open internet. Every line of code can
+be traced back to its original author, and all of those authors have public
+domain dedications on file. So the SQLite code base is clean and is
+uncontaminated with licensed code from other projects.
+  
+--------------------------------------------------------------------------------
+
+--------------------------------------------------------------------------------
+== triton
+
+/* 
+* Copyright 2018-2020 Philippe Tillet
+* Copyright 2020-2022 OpenAI
+* 
+* Permission is hereby granted, free of charge, to any person obtaining 
+* a copy of this software and associated documentation files 
+* (the "Software"), to deal in the Software without restriction, 
+* including without limitation the rights to use, copy, modify, merge, 
+* publish, distribute, sublicense, and/or sell copies of the Software, 
+* and to permit persons to whom the Software is furnished to do so, 
+* subject to the following conditions:
+* 
+* The above copyright notice and this permission notice shall be 
+* included in all copies or substantial portions of the Software.
+* 
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
+* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 
+* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 
+* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 
+* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+*/
+
+
+Copyright (c) <year> <owner>. All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification,
+are permitted provided that the following conditions are met:
+
+1. Redistributions of source code must retain the above copyright notice,
+this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright notice,
+this list of conditions and the following disclaimer in the documentation
+and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its contributors
+may be used to endorse or promote products derived from this software without
+specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+MIT License Copyright (c) 2020 Da Yan @ HKUST
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is furnished
+to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice (including the next
+paragraph) shall be included in all copies or substantial portions of the
+Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
+FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS
+OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF
+OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+--------------------------------------------------------------------------------
\ No newline at end of file

From 879a5e37eb350f556debec3cd5c9b6dcb8809318 Mon Sep 17 00:00:00 2001
From: Peter Hawkins <phawkins@google.com>
Date: Wed, 20 Sep 2023 09:55:15 -0700
Subject: [PATCH 040/567] Rename tsl/cuda/BUILD to tsl/cuda/BUILD.bazel.

Unify the _stub and _lib targets in tsl/cuda under a single unsuffixed name, and update users to point directly to the merged target. This also allows us to remove some .bzl macros.

PiperOrigin-RevId: 566996155
---
 tensorflow/core/BUILD                         |  2 +-
 tensorflow/core/common_runtime/BUILD          |  2 +-
 tensorflow/core/profiler/backends/gpu/BUILD   | 12 +--
 tensorflow/python/BUILD                       |  6 +-
 tensorflow/tensorflow.bzl                     |  2 +-
 .../tsl/tsl/cuda/{BUILD => BUILD.bazel}       | 79 +++----------------
 .../xla/third_party/tsl/tsl/platform/BUILD    |  8 --
 third_party/xla/third_party/tsl/tsl/tsl.bzl   |  2 +-
 .../xla/xla/backends/profiler/gpu/BUILD       | 20 ++---
 .../experiments/sm_bandwidth_benchmark/BUILD  |  2 +-
 third_party/xla/xla/stream_executor/BUILD     |  4 +-
 .../xla/xla/stream_executor/build_defs.bzl    |  6 --
 .../xla/xla/stream_executor/cuda/BUILD        | 40 +++++-----
 13 files changed, 52 insertions(+), 133 deletions(-)
 rename third_party/xla/third_party/tsl/tsl/cuda/{BUILD => BUILD.bazel} (74%)

diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD
index 48b5e4da4fb2ba..757e75829b4503 100644
--- a/tensorflow/core/BUILD
+++ b/tensorflow/core/BUILD
@@ -1164,7 +1164,7 @@ alias(
 
 alias(
     name = "cuda",
-    actual = "@local_tsl//tsl/platform:cuda",
+    actual = "@local_tsl//tsl/cuda:cudart",
     visibility = ["//visibility:public"],
 )
 
diff --git a/tensorflow/core/common_runtime/BUILD b/tensorflow/core/common_runtime/BUILD
index 496e1a367b484b..4b7c5eb8b7c29d 100644
--- a/tensorflow/core/common_runtime/BUILD
+++ b/tensorflow/core/common_runtime/BUILD
@@ -2740,7 +2740,7 @@ tf_cuda_cc_test(
         "@com_google_absl//absl/memory",
         "@com_google_absl//absl/strings",
         "@eigen_archive//:eigen3",
-    ] + if_cuda(["@local_tsl//tsl/platform:cuda"]),
+    ] + if_cuda(["@local_tsl//tsl/cuda:cudart"]),
 )
 
 # This is identical to :common_runtime_direct_session_test with the addition of
diff --git a/tensorflow/core/profiler/backends/gpu/BUILD b/tensorflow/core/profiler/backends/gpu/BUILD
index 874440391a6580..674c90237a11de 100644
--- a/tensorflow/core/profiler/backends/gpu/BUILD
+++ b/tensorflow/core/profiler/backends/gpu/BUILD
@@ -1,5 +1,4 @@
 load("@local_config_cuda//cuda:build_defs.bzl", "cuda_library", "if_cuda")
-load("@local_config_rocm//rocm:build_defs.bzl", "if_rocm")
 load(
     "//tensorflow:tensorflow.bzl",
     "tf_copts",
@@ -10,11 +9,8 @@ load(
     "//tensorflow/core/platform:build_config_root.bzl",
     "tf_cuda_tests_tags",
 )
-load(
-    "@local_xla//xla/stream_executor:build_defs.bzl",
-    "tf_additional_cupti_deps",
-)
 load("//tensorflow/core/profiler/builds:build_config.bzl", "tf_profiler_copts")
+load("@local_config_rocm//rocm:build_defs.bzl", "if_rocm")
 load(
     "@local_tsl//tsl/platform/default:cuda_build_defs.bzl",
     "if_cuda_is_configured",
@@ -77,7 +73,7 @@ tf_cuda_library(
         "//tensorflow/core:lib",
         "//tensorflow/core:platform_base",
         "@local_xla//xla/backends/profiler/gpu:cupti_interface",
-    ] + tf_additional_cupti_deps(),
+    ] + if_cuda(["@local_tsl//tsl/cuda:cupti"]),
 )
 
 tf_cuda_library(
@@ -146,7 +142,7 @@ tf_cuda_library(
     deps = [
         ":cupti_interface",
         "@local_xla//xla/backends/profiler/gpu:cupti_wrapper",
-    ] + tf_additional_cupti_deps(),
+    ] + if_cuda(["@local_tsl//tsl/cuda:cupti"]),
 )
 
 tf_cuda_library(
@@ -219,7 +215,7 @@ tf_cuda_library(
         "@com_google_absl//absl/strings",
         "@local_tsl//tsl/profiler/utils:parse_annotation",
         "@local_xla//xla/backends/profiler/gpu:cupti_collector",
-    ] + tf_additional_cupti_deps(),
+    ] + if_cuda(["@local_tsl//tsl/cuda:cupti"]),
 )
 
 cc_library(
diff --git a/tensorflow/python/BUILD b/tensorflow/python/BUILD
index 70cb50b408d61d..2fea59e32f411e 100644
--- a/tensorflow/python/BUILD
+++ b/tensorflow/python/BUILD
@@ -846,9 +846,9 @@ pywrap_tensorflow_macro(
         # nvidia standalone wheels into pywrap_tensorflow_internal. We might be
         # able to remove this in the future, as these stubs should already
         # be brought in via other dependencies.
-        "@local_tsl//tsl/cuda:cudnn_stub",
-        "@local_tsl//tsl/cuda:cufft_stub",
-        "@local_tsl//tsl/cuda:nccl_rpath_stub",
+        "@local_tsl//tsl/cuda:cudnn",
+        "@local_tsl//tsl/cuda:cufft",
+        "@local_tsl//tsl/cuda:nccl_rpath",
     ])) + if_xla_available([
         "//tensorflow/compiler/aot:tfcompile_lib",
     ]) + tf_monitoring_python_deps() + tf_additional_plugin_deps() + tf_additional_profiler_deps() + tf_additional_binary_deps(),
diff --git a/tensorflow/tensorflow.bzl b/tensorflow/tensorflow.bzl
index 202ea7cac66796..58ac889dd8997f 100644
--- a/tensorflow/tensorflow.bzl
+++ b/tensorflow/tensorflow.bzl
@@ -1912,7 +1912,7 @@ def tf_gpu_kernel_library(
         hdrs = hdrs,
         copts = copts,
         deps = deps + if_cuda([
-            clean_dep("@local_tsl//tsl/cuda:cudart_stub"),
+            clean_dep("@local_tsl//tsl/cuda:cudart"),
         ]) + if_cuda_or_rocm([
             clean_dep("//tensorflow/core:gpu_lib"),
         ]),
diff --git a/third_party/xla/third_party/tsl/tsl/cuda/BUILD b/third_party/xla/third_party/tsl/tsl/cuda/BUILD.bazel
similarity index 74%
rename from third_party/xla/third_party/tsl/tsl/cuda/BUILD
rename to third_party/xla/third_party/tsl/tsl/cuda/BUILD.bazel
index efcf0bf7dd12ae..ecb7bdb2cfffa7 100644
--- a/third_party/xla/third_party/tsl/tsl/cuda/BUILD
+++ b/third_party/xla/third_party/tsl/tsl/cuda/BUILD.bazel
@@ -17,12 +17,11 @@ load(
 
 package(
     default_visibility = ["//visibility:public"],
-    # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
     licenses = ["notice"],
 )
 
 cc_library(
-    name = "cublas_stub",
+    name = "cublas",
     srcs = if_cuda_is_configured(["cublas_stub.cc"]),
     linkopts = if_cuda_is_configured(cuda_rpath_flags(
         "nvidia/cublas/lib",
@@ -36,17 +35,8 @@ cc_library(
     ]),
 )
 
-alias(
-    name = "cublas_lib",
-    actual = select({
-        "//tsl:oss": ":cublas_stub",
-        "//conditions:default": "@local_config_cuda//cuda:cublas",
-    }),
-    visibility = ["//visibility:public"],
-)
-
 cc_library(
-    name = "cublas_lt_stub",
+    name = "cublas_lt",
     srcs = if_cuda_is_configured(["cublasLt_stub.cc"]),
     textual_hdrs = glob(["cublasLt_*.inc"]),
     visibility = ["//visibility:public"],
@@ -57,17 +47,8 @@ cc_library(
     ]),
 )
 
-alias(
-    name = "cublas_lt_lib",
-    actual = select({
-        "//tsl:oss": ":cublas_lt_stub",
-        "//conditions:default": "@local_config_cuda//cuda:cublasLt",
-    }),
-    visibility = ["//visibility:public"],
-)
-
 cc_library(
-    name = "cuda_stub",
+    name = "cuda",
     srcs = if_cuda_is_configured(["cuda_stub.cc"]),
     textual_hdrs = glob(["cuda_*.inc"]),
     visibility = ["//visibility:public"],
@@ -79,7 +60,7 @@ cc_library(
 )
 
 cc_library(
-    name = "cudart_stub",
+    name = "cudart",
     srcs = select({
         # include dynamic loading implementation only when if_cuda_is_configured and build dynamically
         "//tsl:is_cuda_enabled_and_oss": ["cudart_stub.cc"],
@@ -93,7 +74,7 @@ cc_library(
     visibility = ["//visibility:public"],
     deps = select({
         "//tsl:is_cuda_enabled_and_oss": [
-            ":cuda_stub",
+            ":cuda",
             "//tsl/platform:dso_loader",
             "//tsl/platform:env",
             "@local_config_cuda//cuda:cuda_headers",
@@ -103,7 +84,7 @@ cc_library(
 )
 
 cc_library(
-    name = "cudnn_stub",
+    name = "cudnn",
     srcs = if_cuda_is_configured(["cudnn_stub.cc"]),
     linkopts = if_cuda_is_configured(cuda_rpath_flags("nvidia/cudnn/lib")),
     textual_hdrs = glob(["cudnn_*.inc"]),
@@ -117,26 +98,17 @@ cc_library(
 )
 
 cc_library(
-    name = "nccl_rpath_stub",
+    name = "nccl_rpath",
     linkopts = if_cuda_is_configured(cuda_rpath_flags("nvidia/nccl/lib")),
     visibility = ["//visibility:public"],
 )
 
 cc_library(
-    name = "tensorrt_rpath_stub",
+    name = "tensorrt_rpath",
     linkopts = if_cuda_is_configured(cuda_rpath_flags("tensorrt")),
     visibility = ["//visibility:public"],
 )
 
-alias(
-    name = "cudnn_lib",
-    actual = select({
-        "//tsl:oss": ":cudnn_stub",
-        "//conditions:default": "@local_config_cuda//cuda:cudnn",
-    }),
-    visibility = ["//visibility:public"],
-)
-
 cc_library(
     name = "cudnn_version",
     srcs = ["cudnn_version.cc"],
@@ -159,7 +131,7 @@ tsl_cc_test(
 )
 
 cc_library(
-    name = "cufft_stub",
+    name = "cufft",
     srcs = if_cuda_is_configured(["cufft_stub.cc"]),
     linkopts = if_cuda_is_configured(cuda_rpath_flags("nvidia/cufft/lib")),
     textual_hdrs = glob(["cufft_*.inc"]),
@@ -171,17 +143,8 @@ cc_library(
     ]),
 )
 
-alias(
-    name = "cufft_lib",
-    actual = select({
-        "//tsl:oss": ":cufft_stub",
-        "//conditions:default": "@local_config_cuda//cuda:cufft",
-    }),
-    visibility = ["//visibility:public"],
-)
-
 cc_library(
-    name = "cupti_stub",
+    name = "cupti",
     srcs = if_cuda_is_configured(["cupti_stub.cc"]),
     data = if_cuda_is_configured(["@local_config_cuda//cuda:cupti_dsos"]),
     linkopts = if_cuda_is_configured(cuda_rpath_flags("nvidia/cuda_cupti/lib")),
@@ -196,7 +159,7 @@ cc_library(
 )
 
 cc_library(
-    name = "cusolver_stub",
+    name = "cusolver",
     srcs = if_cuda_is_configured(["cusolver_stub.cc"]),
     linkopts = if_cuda_is_configured(cuda_rpath_flags("nvidia/cusolver/lib")),
     textual_hdrs = glob(["cusolver_dense_*.inc"]),
@@ -208,17 +171,8 @@ cc_library(
     ]),
 )
 
-alias(
-    name = "cusolver_lib",
-    actual = select({
-        "//tsl:oss": ":cusolver_stub",
-        "//conditions:default": "@local_config_cuda//cuda:cusolver",
-    }),
-    visibility = ["//visibility:public"],
-)
-
 cc_library(
-    name = "cusparse_stub",
+    name = "cusparse",
     srcs = if_cuda_is_configured(["cusparse_stub.cc"]),
     linkopts = if_cuda_is_configured(cuda_rpath_flags("nvidia/cusparse/lib")),
     textual_hdrs = glob(["cusparse_*.inc"]),
@@ -229,12 +183,3 @@ cc_library(
         "//tsl/platform:env",
     ]),
 )
-
-alias(
-    name = "cusparse_lib",
-    actual = select({
-        "//tsl:oss": ":cusparse_stub",
-        "//conditions:default": "@local_config_cuda//cuda:cusparse",
-    }),
-    visibility = ["//visibility:public"],
-)
diff --git a/third_party/xla/third_party/tsl/tsl/platform/BUILD b/third_party/xla/third_party/tsl/tsl/platform/BUILD
index 2574e65bf2fd1c..22212210dccb6a 100644
--- a/third_party/xla/third_party/tsl/tsl/platform/BUILD
+++ b/third_party/xla/third_party/tsl/tsl/platform/BUILD
@@ -1102,14 +1102,6 @@ cc_library(
     ],
 )
 
-cc_library(
-    name = "cuda",
-    visibility = ["//visibility:public"],
-    deps = [
-        "@local_config_cuda//cuda:cudart_static",
-    ],
-)
-
 cc_library(
     name = "dso_loader",
     hdrs = ["dso_loader.h"],
diff --git a/third_party/xla/third_party/tsl/tsl/tsl.bzl b/third_party/xla/third_party/tsl/tsl/tsl.bzl
index 4bd78c3f771e55..adc0cbf00c39c4 100644
--- a/third_party/xla/third_party/tsl/tsl/tsl.bzl
+++ b/third_party/xla/third_party/tsl/tsl/tsl.bzl
@@ -321,7 +321,7 @@ def tsl_gpu_library(deps = None, cuda_deps = None, copts = tsl_copts(), **kwargs
         kwargs.pop("default_copts", None)
     cc_library(
         deps = deps + if_cuda([
-            clean_dep("//tsl/cuda:cudart_stub"),
+            clean_dep("//tsl/cuda:cudart"),
             "@local_config_cuda//cuda:cuda_headers",
         ]) + if_rocm_is_configured([
             "@local_config_rocm//rocm:rocm_headers",
diff --git a/third_party/xla/xla/backends/profiler/gpu/BUILD b/third_party/xla/xla/backends/profiler/gpu/BUILD
index 1ec816c76f8784..09de99f682ba6f 100644
--- a/third_party/xla/xla/backends/profiler/gpu/BUILD
+++ b/third_party/xla/xla/backends/profiler/gpu/BUILD
@@ -1,4 +1,8 @@
 load("@local_config_cuda//cuda:build_defs.bzl", "cuda_library", "if_cuda")
+load(
+    "//xla:xla.bzl",
+    "xla_cc_test",
+)
 load("@local_config_rocm//rocm:build_defs.bzl", "if_rocm")
 load(
     "@local_tsl//tsl:tsl.bzl",
@@ -13,19 +17,11 @@ load(
     "@local_tsl//tsl/platform:build_config_root.bzl",
     "tf_cuda_tests_tags",
 )
-load(
-    "//xla/stream_executor:build_defs.bzl",
-    "tf_additional_cupti_deps",
-)
-load("@local_tsl//tsl/profiler/builds:build_config.bzl", "tf_profiler_copts")
 load(
     "@local_tsl//tsl/platform/default:cuda_build_defs.bzl",
     "if_cuda_is_configured",
 )
-load(
-    "//xla:xla.bzl",
-    "xla_cc_test",
-)
+load("@local_tsl//tsl/profiler/builds:build_config.bzl", "tf_profiler_copts")
 
 package(
     default_visibility = ["//visibility:public"],
@@ -74,7 +70,7 @@ tsl_gpu_library(
     deps = [
         "@local_tsl//tsl/platform:macros",
         "@local_tsl//tsl/platform:types",
-    ] + tf_additional_cupti_deps(),
+    ] + if_cuda(["@local_tsl//tsl/cuda:cupti"]),
 )
 
 tsl_gpu_library(
@@ -169,7 +165,7 @@ tsl_gpu_library(
     visibility = ["//visibility:public"],
     deps = [
         ":cupti_interface",
-    ] + tf_additional_cupti_deps(),
+    ] + if_cuda(["@local_tsl//tsl/cuda:cupti"]),
 )
 
 tsl_gpu_library(
@@ -287,7 +283,7 @@ tsl_gpu_library(
         "@local_tsl//tsl/profiler/utils:xplane_builder",
         "@local_tsl//tsl/profiler/utils:xplane_schema",
         "@local_tsl//tsl/profiler/utils:xplane_utils",
-    ] + tf_additional_cupti_deps(),
+    ] + if_cuda(["@local_tsl//tsl/cuda:cupti"]),
 )
 
 cc_library(
diff --git a/third_party/xla/xla/experiments/sm_bandwidth_benchmark/BUILD b/third_party/xla/xla/experiments/sm_bandwidth_benchmark/BUILD
index 5128f72a982e91..003c091632c73f 100644
--- a/third_party/xla/xla/experiments/sm_bandwidth_benchmark/BUILD
+++ b/third_party/xla/xla/experiments/sm_bandwidth_benchmark/BUILD
@@ -33,6 +33,6 @@ xla_cc_test(
         ":sm_bw_utils",
         "@com_google_googletest//:gtest_main",
     ] + if_cuda([
-        "@local_tsl//tsl/platform:cuda",
+        "@local_tsl//tsl/cuda:cudart",
     ]),
 )
diff --git a/third_party/xla/xla/stream_executor/BUILD b/third_party/xla/xla/stream_executor/BUILD
index 2c8f6005e824ba..6760fa8d6fa477 100644
--- a/third_party/xla/xla/stream_executor/BUILD
+++ b/third_party/xla/xla/stream_executor/BUILD
@@ -6,7 +6,7 @@
 
 load("//xla:xla.bzl", "xla_cc_test")
 load("//xla/stream_executor:build_defs.bzl", "stream_executor_friends")
-load("@local_tsl//tsl:tsl.bzl", "set_external_visibility", "transitive_hdrs", "tsl_gpu_library")
+load("@local_tsl//tsl:tsl.bzl", "set_external_visibility", "transitive_hdrs")
 load("@local_tsl//tsl:tsl.default.bzl", "filegroup")
 load("@local_tsl//tsl/platform:build_config.bzl", "tf_proto_library")
 load("@local_tsl//tsl/platform:build_config_root.bzl", "if_static")
@@ -433,7 +433,7 @@ cc_library(
 )
 
 # It implements :stream_executor_pimpl_header
-tsl_gpu_library(
+cc_library(
     name = "stream_executor_pimpl",
     srcs = [
         "stream.cc",
diff --git a/third_party/xla/xla/stream_executor/build_defs.bzl b/third_party/xla/xla/stream_executor/build_defs.bzl
index a52f22c066f565..c76ff0bdea1074 100644
--- a/third_party/xla/xla/stream_executor/build_defs.bzl
+++ b/third_party/xla/xla/stream_executor/build_defs.bzl
@@ -7,12 +7,6 @@ def stream_executor_friends():
 def tf_additional_cuda_platform_deps():
     return []
 
-def tf_additional_cuda_driver_deps():
-    return ["@local_tsl//tsl/cuda:cuda_stub"]
-
-def tf_additional_cupti_deps():
-    return ["@local_xla//xla/stream_executor/cuda:cupti_stub"]
-
 def tf_additional_cudnn_plugin_deps():
     return []
 
diff --git a/third_party/xla/xla/stream_executor/cuda/BUILD b/third_party/xla/xla/stream_executor/cuda/BUILD
index 9ac7571e2dc8b7..b008a8a17559d1 100644
--- a/third_party/xla/xla/stream_executor/cuda/BUILD
+++ b/third_party/xla/xla/stream_executor/cuda/BUILD
@@ -8,7 +8,6 @@ load(
 load(
     "//xla/stream_executor:build_defs.bzl",
     "stream_executor_friends",
-    "tf_additional_cuda_driver_deps",
     "tf_additional_cuda_platform_deps",
     "tf_additional_cudnn_plugin_copts",
     "tf_additional_cudnn_plugin_deps",
@@ -127,18 +126,15 @@ cc_library(
         "//xla/stream_executor/gpu:gpu_driver_header",
         "//xla/stream_executor/platform",
         "//xla/stream_executor/platform:dso_loader",
+        "@local_tsl//tsl/cuda",
+        "@local_tsl//tsl/cuda:cudart",
         "@local_tsl//tsl/platform:env",
         "@local_tsl//tsl/platform:static_threadlocal",
-    ] + tf_additional_cuda_driver_deps()) + select({
-        # include dynamic loading implementation only when if_cuda_is_configured and build dynamically
-        "@local_tsl//tsl:is_cuda_enabled_and_oss": ["cudart_stub"],
-        "//conditions:default": ["@local_tsl//tsl/platform:cuda"],
-    }) + [
         "@com_google_absl//absl/base:core_headers",
         "@com_google_absl//absl/memory",
         "@com_google_absl//absl/strings:str_format",
         "@com_google_absl//absl/synchronization",
-    ],
+    ]),
 )
 
 xla_cc_test(
@@ -191,7 +187,7 @@ xla_cc_test(
 
 alias(
     name = "cudart_stub",
-    actual = "@local_tsl//tsl/cuda:cudart_stub",
+    actual = "@local_tsl//tsl/cuda:cudart",
     visibility = ["//visibility:public"],
 )
 
@@ -236,13 +232,13 @@ cc_library(
 
 alias(
     name = "cublas_stub",
-    actual = "@local_tsl//tsl/cuda:cublas_stub",
+    actual = "@local_tsl//tsl/cuda:cublas",
     visibility = ["//visibility:public"],
 )
 
 alias(
     name = "cublas_lib",
-    actual = "@local_tsl//tsl/cuda:cublas_lib",
+    actual = "@local_tsl//tsl/cuda:cublas",
     visibility = ["//visibility:public"],
 )
 
@@ -264,13 +260,13 @@ cc_library(
 
 alias(
     name = "cublas_lt_stub",
-    actual = "@local_tsl//tsl/cuda:cublas_lt_stub",
+    actual = "@local_tsl//tsl/cuda:cublas_lt",
     visibility = ["//visibility:public"],
 )
 
 alias(
     name = "cublas_lt_lib",
-    actual = "@local_tsl//tsl/cuda:cublas_lt_lib",
+    actual = "@local_tsl//tsl/cuda:cublas_lt",
     visibility = ["//visibility:public"],
 )
 
@@ -334,13 +330,13 @@ cc_library(
 
 alias(
     name = "cufft_stub",
-    actual = "@local_tsl//tsl/cuda:cufft_stub",
+    actual = "@local_tsl//tsl/cuda:cufft",
     visibility = ["//visibility:public"],
 )
 
 alias(
     name = "cufft_lib",
-    actual = "@local_tsl//tsl/cuda:cufft_lib",
+    actual = "@local_tsl//tsl/cuda:cufft",
     visibility = ["//visibility:public"],
 )
 
@@ -370,13 +366,13 @@ cc_library(
 
 alias(
     name = "cudnn_stub",
-    actual = "@local_tsl//tsl/cuda:cudnn_stub",
+    actual = "@local_tsl//tsl/cuda:cudnn",
     visibility = ["//visibility:public"],
 )
 
 alias(
     name = "cudnn_lib",
-    actual = "@local_tsl//tsl/cuda:cudnn_lib",
+    actual = "@local_tsl//tsl/cuda:cudnn",
     visibility = ["//visibility:public"],
 )
 
@@ -432,37 +428,37 @@ cc_library(
 
 alias(
     name = "cupti_stub",
-    actual = "@local_tsl//tsl/cuda:cupti_stub",
+    actual = "@local_tsl//tsl/cuda:cupti",
     visibility = ["//visibility:public"],
 )
 
 alias(
     name = "cusolver_stub",
-    actual = "@local_tsl//tsl/cuda:cusolver_stub",
+    actual = "@local_tsl//tsl/cuda:cusolver",
     visibility = ["//visibility:public"],
 )
 
 alias(
     name = "cusolver_lib",
-    actual = "@local_tsl//tsl/cuda:cusolver_lib",
+    actual = "@local_tsl//tsl/cuda:cusolver",
     visibility = ["//visibility:public"],
 )
 
 alias(
     name = "cusparse_stub",
-    actual = "@local_tsl//tsl/cuda:cusparse_stub",
+    actual = "@local_tsl//tsl/cuda:cusparse",
     visibility = ["//visibility:public"],
 )
 
 alias(
     name = "cusparse_lib",
-    actual = "@local_tsl//tsl/cuda:cusparse_lib",
+    actual = "@local_tsl//tsl/cuda:cusparse",
     visibility = ["//visibility:public"],
 )
 
 alias(
     name = "tensorrt_rpath_stub",
-    actual = "@local_tsl//tsl/cuda:tensorrt_rpath_stub",
+    actual = "@local_tsl//tsl/cuda:tensorrt_rpath",
     visibility = ["//visibility:public"],
 )
 

From 8ddb2e6879fedfa6928bdbcb97c2e665464338ef Mon Sep 17 00:00:00 2001
From: Peter Hawkins <phawkins@google.com>
Date: Wed, 20 Sep 2023 11:20:47 -0700
Subject: [PATCH 041/567] Refer to CUDA stubs directly from TSL, rather than
 using an alias defined in xla/stream_executor.

Remove the aliases in xla/stream_executor.

PiperOrigin-RevId: 567025507
---
 tensorflow/core/util/BUILD                    |  22 ++--
 third_party/xla/xla/service/gpu/BUILD         |   2 +-
 .../xla/xla/stream_executor/cuda/BUILD        | 108 ++----------------
 3 files changed, 21 insertions(+), 111 deletions(-)

diff --git a/tensorflow/core/util/BUILD b/tensorflow/core/util/BUILD
index fbd116d1d2e9a9..aa86cb6a0ff568 100644
--- a/tensorflow/core/util/BUILD
+++ b/tensorflow/core/util/BUILD
@@ -1,11 +1,4 @@
-load(
-    "//tensorflow/core/platform:build_config.bzl",
-    "tf_proto_library",
-)
-load(
-    "//tensorflow/core/platform:rules_cc.bzl",
-    "cc_library",
-)
+load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda")
 load(
     "//tensorflow:tensorflow.bzl",
     "check_deps",
@@ -19,11 +12,18 @@ load(
     "tf_mkl_kernel_library",
 )
 load("//tensorflow:tensorflow.default.bzl", "filegroup", "get_compatible_with_portable", "tf_version_info_genrule")
+load(
+    "//tensorflow/core/platform:build_config.bzl",
+    "tf_proto_library",
+)
 load(
     "//tensorflow/core/platform:build_config_root.bzl",
     "if_static",
 )
-load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda")
+load(
+    "//tensorflow/core/platform:rules_cc.bzl",
+    "cc_library",
+)
 load(
     "@local_config_rocm//rocm:build_defs.bzl",
     "if_rocm",
@@ -739,8 +739,8 @@ tf_kernel_library(
     deps = [
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
+        "@local_tsl//tsl/cuda:cusolver",
         "@local_xla//xla/stream_executor/cuda:cublas_plugin",
-        "@local_xla//xla/stream_executor/cuda:cusolver_lib",
     ],
 )
 
@@ -778,9 +778,9 @@ tf_kernel_library(
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
     ] + if_cuda([
-        "@local_xla//xla/stream_executor/cuda:cusparse_lib",
         "@local_xla//xla/stream_executor/cuda:cuda_blas_utils",
         "@local_xla//xla/stream_executor:data_type",
+        "@local_tsl//tsl/cuda:cusparse",
         "@local_config_cuda//cuda:cub_headers",
     ]) + if_rocm([
         "@local_xla//xla/stream_executor/rocm:rocsolver_wrapper",
diff --git a/third_party/xla/xla/service/gpu/BUILD b/third_party/xla/xla/service/gpu/BUILD
index 46de10211d6ccb..374ba47563aa10 100644
--- a/third_party/xla/xla/service/gpu/BUILD
+++ b/third_party/xla/xla/service/gpu/BUILD
@@ -1867,7 +1867,7 @@ cc_library(
         "@local_tsl//tsl/platform:status",
     ] + if_cuda_is_configured([
         "@local_config_cuda//cuda:cuda_headers",
-        "//xla/stream_executor/cuda:cusolver_lib",
+        "@local_tsl//tsl/cuda:cusolver",
     ]) + if_rocm_is_configured([
         "@local_config_rocm//rocm:rocm_headers",
         "//xla/stream_executor/rocm:rocblas_wrapper",
diff --git a/third_party/xla/xla/stream_executor/cuda/BUILD b/third_party/xla/xla/stream_executor/cuda/BUILD
index b008a8a17559d1..22d30bf5fd23f9 100644
--- a/third_party/xla/xla/stream_executor/cuda/BUILD
+++ b/third_party/xla/xla/stream_executor/cuda/BUILD
@@ -185,12 +185,6 @@ xla_cc_test(
     ],
 )
 
-alias(
-    name = "cudart_stub",
-    actual = "@local_tsl//tsl/cuda:cudart",
-    visibility = ["//visibility:public"],
-)
-
 # The activation library is tightly coupled to the executor library.
 # TODO(leary) split up cuda_gpu_executor.cc so that this can stand alone.
 cc_library(
@@ -230,18 +224,6 @@ cc_library(
     ]),
 )
 
-alias(
-    name = "cublas_stub",
-    actual = "@local_tsl//tsl/cuda:cublas",
-    visibility = ["//visibility:public"],
-)
-
-alias(
-    name = "cublas_lib",
-    actual = "@local_tsl//tsl/cuda:cublas",
-    visibility = ["//visibility:public"],
-)
-
 cc_library(
     name = "cublas_lt_header",
     hdrs = if_cuda_is_configured([
@@ -258,18 +240,6 @@ cc_library(
     ]) + ["@local_tsl//tsl/platform:errors"],
 )
 
-alias(
-    name = "cublas_lt_stub",
-    actual = "@local_tsl//tsl/cuda:cublas_lt",
-    visibility = ["//visibility:public"],
-)
-
-alias(
-    name = "cublas_lt_lib",
-    actual = "@local_tsl//tsl/cuda:cublas_lt",
-    visibility = ["//visibility:public"],
-)
-
 cc_library(
     name = "cublas_plugin",
     srcs = if_cuda_is_configured([
@@ -282,8 +252,6 @@ cc_library(
     ]),
     visibility = ["//visibility:public"],
     deps = if_cuda_is_configured([
-        ":cublas_lib",
-        ":cublas_lt_lib",
         ":cuda_activation",
         ":cuda_blas_utils",
         ":cuda_gpu_executor",
@@ -308,6 +276,8 @@ cc_library(
         "//xla/stream_executor/gpu:gpu_timer",
         "//xla/stream_executor/gpu:gpu_types_header",
         "//xla/stream_executor/platform",
+        "@local_tsl//tsl/cuda:cublas",
+        "@local_tsl//tsl/cuda:cublas_lt",
         "@local_tsl//tsl/platform:tensor_float_32_hdr_lib",
     ]) + if_static([
         "@local_tsl//tsl/platform:tensor_float_32_utils",
@@ -321,25 +291,13 @@ cc_library(
     hdrs = if_cuda_is_configured(["cuda_blas_utils.h"]),
     visibility = ["//visibility:public"],
     deps = if_cuda_is_configured([
-        ":cublas_lib",
+        "@local_tsl//tsl/cuda:cublas",
         "@com_google_absl//absl/strings",
         "@local_config_cuda//cuda:cuda_headers",
         "//xla/stream_executor:stream_executor_headers",
     ]) + ["@local_tsl//tsl/platform:errors"],
 )
 
-alias(
-    name = "cufft_stub",
-    actual = "@local_tsl//tsl/cuda:cufft",
-    visibility = ["//visibility:public"],
-)
-
-alias(
-    name = "cufft_lib",
-    actual = "@local_tsl//tsl/cuda:cufft",
-    visibility = ["//visibility:public"],
-)
-
 cc_library(
     name = "cufft_plugin",
     srcs = if_cuda_is_configured(["cuda_fft.cc"]),
@@ -351,7 +309,6 @@ cc_library(
         ":cuda_platform_id",
         ":cuda_stream",
         ":cuda_helpers",
-        ":cufft_lib",
         "@local_config_cuda//cuda:cuda_headers",
         "//xla/stream_executor:event",
         "//xla/stream_executor:fft",
@@ -360,22 +317,11 @@ cc_library(
         "//xla/stream_executor/gpu:gpu_helpers_header",
         "//xla/stream_executor/platform",
         "//xla/stream_executor/platform:dso_loader",
+        "@local_tsl//tsl/cuda:cufft",
     ]) + ["@local_tsl//tsl/platform:errors"],
     alwayslink = True,
 )
 
-alias(
-    name = "cudnn_stub",
-    actual = "@local_tsl//tsl/cuda:cudnn",
-    visibility = ["//visibility:public"],
-)
-
-alias(
-    name = "cudnn_lib",
-    actual = "@local_tsl//tsl/cuda:cudnn",
-    visibility = ["//visibility:public"],
-)
-
 cc_library(
     name = "cuda_dnn_headers",
     textual_hdrs = ["cuda_dnn.h"],
@@ -400,7 +346,6 @@ cc_library(
         ":cuda_gpu_executor",
         ":cuda_platform_id",
         ":cuda_stream",
-        ":cudnn_lib",
         "@com_google_absl//absl/base:core_headers",
         "@com_google_absl//absl/memory",
         "@com_google_absl//absl/strings",
@@ -408,6 +353,7 @@ cc_library(
         "@eigen_archive//:eigen3",
         "@local_config_cuda//cuda:cuda_headers",
         "@local_config_cuda//cuda:cudnn_header",
+        "@local_tsl//tsl/cuda:cudnn",
         "@local_tsl//tsl/cuda:cudnn_version",
         "@local_tsl//tsl/platform:tensor_float_32_utils",
         "//xla/stream_executor:dnn",
@@ -426,42 +372,6 @@ cc_library(
     alwayslink = True,
 )
 
-alias(
-    name = "cupti_stub",
-    actual = "@local_tsl//tsl/cuda:cupti",
-    visibility = ["//visibility:public"],
-)
-
-alias(
-    name = "cusolver_stub",
-    actual = "@local_tsl//tsl/cuda:cusolver",
-    visibility = ["//visibility:public"],
-)
-
-alias(
-    name = "cusolver_lib",
-    actual = "@local_tsl//tsl/cuda:cusolver",
-    visibility = ["//visibility:public"],
-)
-
-alias(
-    name = "cusparse_stub",
-    actual = "@local_tsl//tsl/cuda:cusparse",
-    visibility = ["//visibility:public"],
-)
-
-alias(
-    name = "cusparse_lib",
-    actual = "@local_tsl//tsl/cuda:cusparse",
-    visibility = ["//visibility:public"],
-)
-
-alias(
-    name = "tensorrt_rpath_stub",
-    actual = "@local_tsl//tsl/cuda:tensorrt_rpath",
-    visibility = ["//visibility:public"],
-)
-
 cc_library(
     name = "cuda_kernel",
     srcs = if_cuda_is_configured(["cuda_kernel.cc"]),
@@ -579,9 +489,9 @@ cc_library(
         ":cuda_platform",
         ":cudnn_plugin",
         ":cufft_plugin",
-        ":cusolver_lib",
-        ":cusparse_lib",
-        ":tensorrt_rpath_stub",
+        "@local_tsl//tsl/cuda:cusolver",
+        "@local_tsl//tsl/cuda:cusparse",
+        "@local_tsl//tsl/cuda:tensorrt_rpath",
     ],
     alwayslink = 1,
 )
@@ -609,7 +519,7 @@ cc_library(
             ],
         }),
         [
-            ":cudart_stub",
+            "@local_tsl//tsl/cuda:cudart",
         ] + select({
             "@local_tsl//tsl:macos": ["IOKit"],
             "//conditions:default": [],

From 9770efd3b28709716705ffd11af55fa1c7a60637 Mon Sep 17 00:00:00 2001
From: Chao <cchen104@amd.com>
Date: Wed, 20 Sep 2023 11:37:18 -0700
Subject: [PATCH 042/567] PR #5749: [ROCm] gpu command buffer for ROCm

Imported from GitHub PR https://github.com/openxla/xla/pull/5749

ROCm enable command buffer https://github.com/openxla/xla/commit/ec74b3b29d549ba97f499751651684800ab55aa6

@ezhulenev @akuegel Thanks in advance
Copybara import of the project:

--
aca5aa0e853b201c94420f41c22f5b837dd97f76 by Chao Chen <cchen104@amd.com>:

ROCm adds gpu command buffer

Merging this change closes #5749

PiperOrigin-RevId: 567031199
---
 third_party/xla/xla/stream_executor/rocm/BUILD         |  1 +
 .../xla/xla/stream_executor/rocm/rocm_gpu_executor.cc  | 10 ++++++++++
 2 files changed, 11 insertions(+)

diff --git a/third_party/xla/xla/stream_executor/rocm/BUILD b/third_party/xla/xla/stream_executor/rocm/BUILD
index 5353a5ec9ad721..a4c1bf64976212 100644
--- a/third_party/xla/xla/stream_executor/rocm/BUILD
+++ b/third_party/xla/xla/stream_executor/rocm/BUILD
@@ -115,6 +115,7 @@ cc_library(
         "//xla/stream_executor/gpu:gpu_activation_header",
         "//xla/stream_executor/gpu:gpu_event",
         "//xla/stream_executor/gpu:gpu_kernel_header",
+        "//xla/stream_executor/gpu:gpu_command_buffer",
         "//xla/stream_executor/gpu:gpu_stream",
         "//xla/stream_executor/gpu:gpu_timer",
         "//xla/stream_executor/platform",
diff --git a/third_party/xla/xla/stream_executor/rocm/rocm_gpu_executor.cc b/third_party/xla/xla/stream_executor/rocm/rocm_gpu_executor.cc
index 20024780b067c4..5ad9feb70e5652 100644
--- a/third_party/xla/xla/stream_executor/rocm/rocm_gpu_executor.cc
+++ b/third_party/xla/xla/stream_executor/rocm/rocm_gpu_executor.cc
@@ -15,6 +15,8 @@ limitations under the License.
 
 #include <unistd.h>
 
+#include <memory>
+
 #include "absl/base/casts.h"
 #include "absl/functional/any_invocable.h"
 #include "absl/strings/ascii.h"
@@ -22,9 +24,11 @@ limitations under the License.
 #include "absl/strings/str_cat.h"
 #include "absl/strings/str_format.h"
 #include "absl/strings/str_join.h"
+#include "xla/stream_executor/gpu/gpu_command_buffer.h"
 #include "xla/stream_executor/gpu/gpu_driver.h"
 #include "xla/stream_executor/gpu/gpu_event.h"
 #include "xla/stream_executor/gpu/gpu_executor.h"
+#include "xla/stream_executor/gpu/gpu_kernel.h"
 #include "xla/stream_executor/gpu/gpu_stream.h"
 #include "xla/stream_executor/gpu/gpu_timer.h"
 #include "xla/stream_executor/kernel_cache_config.h"
@@ -715,6 +719,12 @@ GpuExecutor::GetStreamImplementation() {
   return std::unique_ptr<internal::StreamInterface>(new GpuStream(this));
 }
 
+tsl::StatusOr<std::unique_ptr<internal::CommandBufferInterface>>
+GpuExecutor::GetCommandBufferImplementation() {
+  return std::unique_ptr<internal::CommandBufferInterface>(
+      new GpuCommandBuffer());
+}
+
 void* GpuExecutor::GpuContextHack() { return context_; }
 
 GpuContext* GpuExecutor::gpu_context() { return context_; }

From f17f1a7b0c0abc9b1e0b11a461f71685b51aae2d Mon Sep 17 00:00:00 2001
From: Ziyin Huang <ziyinh@google.com>
Date: Wed, 20 Sep 2023 12:06:51 -0700
Subject: [PATCH 043/567] add id shuffling to the sparse core preprocess ops.

PiperOrigin-RevId: 567040448
---
 tensorflow/core/tpu/kernels/BUILD             |  2 -
 .../tpu/kernels/sparse_core_preprocess_ops.cc | 56 ++++++++++---------
 .../tpu/kernels/sparse_core_preprocess_ops.h  |  6 ++
 3 files changed, 37 insertions(+), 27 deletions(-)

diff --git a/tensorflow/core/tpu/kernels/BUILD b/tensorflow/core/tpu/kernels/BUILD
index 0c600b45d37dbd..e75f17f57625d7 100644
--- a/tensorflow/core/tpu/kernels/BUILD
+++ b/tensorflow/core/tpu/kernels/BUILD
@@ -142,10 +142,8 @@ cc_library(
         ":sparse_core_ops_stats_handler",
         ":sparse_core_ops_utils",
         "//tensorflow/core:framework",
-        "//tensorflow/core:lib",
         "//tensorflow/core:lib_proto_parsing",
         "//tensorflow/core:protos_all_cc",
-        "//tensorflow/core/platform:errors",
         "//tensorflow/core/platform:status",
         "@com_google_absl//absl/algorithm:container",
         "@com_google_absl//absl/log",
diff --git a/tensorflow/core/tpu/kernels/sparse_core_preprocess_ops.cc b/tensorflow/core/tpu/kernels/sparse_core_preprocess_ops.cc
index df3364f1ec8407..7181edfe33d7b2 100644
--- a/tensorflow/core/tpu/kernels/sparse_core_preprocess_ops.cc
+++ b/tensorflow/core/tpu/kernels/sparse_core_preprocess_ops.cc
@@ -356,8 +356,7 @@ void GetMinibatchesInCsrWithPhysicalReplicaOp::Compute(OpKernelContext* ctx) {
 
   const int num_physical_replica = num_replica_ * num_sc_per_chip_;
 
-  size_t xla_pad_size = stream_executor::tpu::OpsApiFn()
-                            ->TpuUtil_GetXlaPadSizeFromTpuTopologyFn();
+  size_t xla_pad_size = 8;
 
   OP_REQUIRES(ctx, sample_count_ % num_sc_per_chip_ == 0,
               absl::InvalidArgumentError(
@@ -646,12 +645,11 @@ void GetMinibatchSplitsWithPhysicalReplicaOp::Compute(OpKernelContext* ctx) {
                                                (kMaxDivisions + 1));
   std::vector<int32_t> record_total_unique_id_counter(num_physical_replica *
                                                       (kMaxDivisions + 1));
-  // Array which keeps track of the index of each physical replica.
-  std::vector<int32> per_physical_replica_index(num_physical_replica);
 
-  // Accumulated sum of the id count for each physical replica.
-  std::vector<int32> physical_replica_id_count((num_physical_replica + 1) *
-                                               num_sc_per_chip_);
+  // Array which keeps track of the index of each physical replica and each
+  // bucket.
+  std::vector<int32_t> per_physical_replica_bucket_index(num_physical_replica *
+                                                         kMaxDivisions);
 
   // Id counts for each sc input.
   std::vector<int32_t> per_sc_id_count(num_sc_per_chip_, 0);
@@ -696,10 +694,15 @@ void GetMinibatchSplitsWithPhysicalReplicaOp::Compute(OpKernelContext* ctx) {
       if (row_id != previous_row_id || col_id != previous_col_id) {
         dedup_ids_index_mapping[id_array_index] = id_array_index;
         gains_after_dedup[id_array_index] = *(gains_ptr + id_array_index);
-        int32 replica_id = col_id % num_physical_replica;
-        int32 bucket_id = col_id / division_size + 1;
+        int32_t replica_id = col_id % num_physical_replica;
+        int32_t bucket_id;
+        if (allow_id_shuffling_for_minibatching_) {
+          bucket_id = CalculateBucketIdWithHashing(col_id, kMaxDivisions);
+        } else {
+          bucket_id = std::min(col_id / division_size, kMaxDivisions - 1);
+        }
         uint32_t id_counter_index =
-            replica_id * (kMaxDivisions + 1) + bucket_id;
+            replica_id * (kMaxDivisions + 1) + bucket_id + 1;
         record_total_id_counter[id_counter_index]++;
         if (col_id != previous_col_id)
           record_total_unique_id_counter[id_counter_index]++;
@@ -770,11 +773,6 @@ void GetMinibatchSplitsWithPhysicalReplicaOp::Compute(OpKernelContext* ctx) {
       this_max_ids = std::max(this_max_ids, record_id_counter[kMaxDivisions]);
       this_max_uniques =
           std::max(this_max_uniques, record_unique_id_counter[kMaxDivisions]);
-      physical_replica_id_count[sc_id * (num_physical_replica + 1) +
-                                replica_id + 1] =
-          physical_replica_id_count[sc_id * (num_physical_replica + 1) +
-                                    replica_id] +
-          id_counter[kMaxDivisions];
       per_sc_id_count[sc_id] += id_counter[kMaxDivisions];
 
       for (int level = 0; level < max_division_level; ++level) {
@@ -868,11 +866,9 @@ void GetMinibatchSplitsWithPhysicalReplicaOp::Compute(OpKernelContext* ctx) {
       sorted_col_ids_tensor->flat<int32_t>().data();
   float* sorted_gains_tensor_ptr = sorted_gains_tensor->flat<float>().data();
 
-  int32_t previous_index = 0;
-
   for (int sc_id = 0; sc_id < num_sc_per_chip_; ++sc_id) {
-    memset(per_physical_replica_index.data(), 0,
-           num_physical_replica * sizeof(int32));
+    memset(per_physical_replica_bucket_index.data(), 0,
+           num_physical_replica * kMaxDivisions * sizeof(int32_t));
     for (uint64_t item : col_ids_index_list[sc_id]) {
       uint32_t id_array_index = item & 0xffffffff;
       // Skip deduped ids.
@@ -881,20 +877,30 @@ void GetMinibatchSplitsWithPhysicalReplicaOp::Compute(OpKernelContext* ctx) {
         continue;
       }
       int32_t col_id = item >> 32;
+      std::string col_id_str = std::to_string(col_id);
       int32_t replica_id = col_id % num_physical_replica;
-      int32_t main_index =
-          per_physical_replica_index[replica_id] + previous_index +
-          physical_replica_id_count[sc_id * (num_physical_replica + 1) +
-                                    replica_id];
+      int32_t bucket_id;
+      int32_t main_index;
+      if (allow_id_shuffling_for_minibatching_) {
+        bucket_id = CalculateBucketIdWithHashing(col_id, kMaxDivisions);
+      } else {
+        bucket_id = std::min(col_id / division_size, kMaxDivisions - 1);
+      }
+      main_index =
+          per_physical_replica_bucket_index[replica_id * kMaxDivisions +
+                                            bucket_id] +
+          *(id_counts_tensor_ptr +
+            (sc_id * num_physical_replica + replica_id) * kMaxDivisions +
+            bucket_id);
+      ++per_physical_replica_bucket_index[replica_id * kMaxDivisions +
+                                          bucket_id];
       *(sorted_row_ids_tensor_ptr + main_index) =
           *(row_ids_ptr + id_array_index) % per_sc_sample_count;
       *(sorted_col_ids_tensor_ptr + main_index) = col_id / num_physical_replica;
       // Use the updated gains instead.
       *(sorted_gains_tensor_ptr + main_index) =
           gains_after_dedup[id_array_index];
-      per_physical_replica_index[replica_id]++;
     }
-    previous_index += per_sc_id_count[sc_id];
   }
 
   sprase_core_ops_stats_handler_->Record(
diff --git a/tensorflow/core/tpu/kernels/sparse_core_preprocess_ops.h b/tensorflow/core/tpu/kernels/sparse_core_preprocess_ops.h
index b1f2e8802eac9a..0ecf6214f1f4a5 100644
--- a/tensorflow/core/tpu/kernels/sparse_core_preprocess_ops.h
+++ b/tensorflow/core/tpu/kernels/sparse_core_preprocess_ops.h
@@ -112,11 +112,17 @@ class GetMinibatchSplitsWithPhysicalReplicaOp : public OpKernel {
                                  tstring program_key,
                                  int64_t max_ids_per_partition,
                                  int64_t max_unique_ids_per_partition) {}
+  virtual inline int32_t CalculateBucketIdWithHashing(int32_t col_id,
+                                                      int32_t num_buckets) {
+    // TODO(pineapplejuice233): Add a proper hashing function here.
+    return col_id % num_buckets;
+  }
 
   std::string device_name_;
   std::string table_name_;
   std::unique_ptr<SparseCoreOpsStatsHandler> sprase_core_ops_stats_handler_;
   bool allow_id_dropping_for_minibatching_ = false;
+  bool allow_id_shuffling_for_minibatching_ = false;
 
  private:
   int num_replica_ = 1;

From 6dc6334a711da5b8135fe6da1445a0b259ab69ac Mon Sep 17 00:00:00 2001
From: Mason Chang <masonchang@google.com>
Date: Wed, 20 Sep 2023 12:10:30 -0700
Subject: [PATCH 044/567] Rollback of commit
 a9075649062815e7859544728a0c3e0af9311a0c

Pass along the `shape` param from `tf.compat.v1.get_variable` to its underlying variable creator. At this point, we have already checked that either `shape` is compatible w/ `initial_value`, or that it is not specified (`None`). Therefore it should alway...

PiperOrigin-RevId: 567041608
---
 .../feature_column/feature_column_test.py     | 28 ++++++-------------
 .../feature_column/feature_column_v2_test.py  | 12 +++-----
 tensorflow/python/ops/variable_scope.py       | 17 ++++-------
 3 files changed, 17 insertions(+), 40 deletions(-)

diff --git a/tensorflow/python/feature_column/feature_column_test.py b/tensorflow/python/feature_column/feature_column_test.py
index f8fbd8db7e3b15..3cde6c5657edc1 100644
--- a/tensorflow/python/feature_column/feature_column_test.py
+++ b/tensorflow/python/feature_column/feature_column_test.py
@@ -4924,19 +4924,16 @@ def test_get_dense_tensor(self, use_safe_embedding_lookup,
       )
 
       def _initializer(shape, dtype, partition_info=None):
-        self.assertEqual(dtypes.float32, dtype)
         if partition_variables:
-          assert partition_info is not None
           self.assertEqual([vocabulary_size, embedding_dimension],
                            partition_info.full_shape)
           self.assertAllEqual((2, embedding_dimension), shape)
-          return array_ops.slice(
-              embedding_values, partition_info.var_offset, shape
-          )
         else:
           self.assertAllEqual((vocabulary_size, embedding_dimension), shape)
           self.assertIsNone(partition_info)
-          return embedding_values
+
+        self.assertEqual(dtypes.float32, dtype)
+        return embedding_values
 
       # Expected lookup result, using combiner='mean'.
       expected_lookups = (
@@ -4979,12 +4976,7 @@ def _initializer(shape, dtype, partition_info=None):
       for v in global_vars:
         self.assertIsInstance(v, variables_lib.Variable)
       with _initialized_session():
-        if partition_variables:
-          self.assertAllEqual(
-              embedding_values, array_ops.concat(global_vars, axis=0)
-          )
-        else:
-          self.assertAllEqual(embedding_values, global_vars[0])
+        self.assertAllEqual(embedding_values, global_vars[0])
         self.assertAllEqual(expected_lookups, self.evaluate(embedding_lookup))
 
       if use_safe_embedding_lookup:
@@ -5791,19 +5783,16 @@ def test_get_dense_tensor(self, use_safe_embedding_lookup,
       )
 
       def _initializer(shape, dtype, partition_info=None):
-        self.assertEqual(dtypes.float32, dtype)
         if partition_variables:
-          assert partition_info is not None
           self.assertEqual([vocabulary_size, embedding_dimension],
                            partition_info.full_shape)
           self.assertAllEqual((2, embedding_dimension), shape)
-          return array_ops.slice(
-              embedding_values, partition_info.var_offset, shape
-          )
         else:
           self.assertAllEqual((vocabulary_size, embedding_dimension), shape)
           self.assertIsNone(partition_info)
-          return embedding_values
+
+        self.assertEqual(dtypes.float32, dtype)
+        return embedding_values
 
       # Expected lookup result, using combiner='mean'.
       expected_lookups_a = (
@@ -5853,11 +5842,10 @@ def _initializer(shape, dtype, partition_info=None):
         self.assertCountEqual(('vars/embedding_weights/part_0:0',
                                'vars/embedding_weights/part_1:0'),
                               tuple([v.name for v in global_vars]))
-        embedding_var = array_ops.concat(global_vars, axis=0)
       else:
         self.assertCountEqual(('vars/embedding_weights:0',),
                               tuple([v.name for v in global_vars]))
-        embedding_var = global_vars[0]
+      embedding_var = global_vars[0]
 
       self.evaluate(variables_lib.global_variables_initializer())
       self.evaluate(lookup_ops.tables_initializer())
diff --git a/tensorflow/python/feature_column/feature_column_v2_test.py b/tensorflow/python/feature_column/feature_column_v2_test.py
index cb98bf60c03184..21dcbb4452d6c7 100644
--- a/tensorflow/python/feature_column/feature_column_v2_test.py
+++ b/tensorflow/python/feature_column/feature_column_v2_test.py
@@ -5762,19 +5762,16 @@ def test_get_dense_tensor(self, use_safe_embedding_lookup,
       )
 
       def _initializer(shape, dtype, partition_info=None):
-        self.assertEqual(dtypes.float32, dtype)
         if partition_variables:
-          assert partition_info is not None
           self.assertEqual([vocabulary_size, embedding_dimension],
                            partition_info.full_shape)
           self.assertAllEqual((2, embedding_dimension), shape)
-          return array_ops.slice(
-              embedding_values, partition_info.var_offset, shape
-          )
         else:
           self.assertAllEqual((vocabulary_size, embedding_dimension), shape)
           self.assertIsNone(partition_info)
-          return embedding_values
+
+        self.assertEqual(dtypes.float32, dtype)
+        return embedding_values
 
       # Expected lookup result, using combiner='mean'.
       expected_lookups_a = (
@@ -5824,11 +5821,10 @@ def _initializer(shape, dtype, partition_info=None):
         self.assertCountEqual(('vars/aaa_bbb_shared_embedding/part_0:0',
                                'vars/aaa_bbb_shared_embedding/part_1:0'),
                               tuple([v.name for v in global_vars]))
-        embedding_var = array_ops.concat(global_vars, axis=0)
       else:
         self.assertCountEqual(('vars/aaa_bbb_shared_embedding:0',),
                               tuple([v.name for v in global_vars]))
-        embedding_var = global_vars[0]
+      embedding_var = global_vars[0]
 
       self.evaluate(variables_lib.global_variables_initializer())
       self.evaluate(lookup_ops.tables_initializer())
diff --git a/tensorflow/python/ops/variable_scope.py b/tensorflow/python/ops/variable_scope.py
index 622f70732adaba..33dd0438fa2f2f 100644
--- a/tensorflow/python/ops/variable_scope.py
+++ b/tensorflow/python/ops/variable_scope.py
@@ -807,8 +807,7 @@ def _get_partitioned_variable(self,
             use_resource=use_resource,
             constraint=constraint,
             synchronization=synchronization,
-            aggregation=aggregation,
-        )
+            aggregation=aggregation)
 
       # pylint: disable=protected-access
       var._set_save_slice_info(
@@ -881,8 +880,7 @@ def _get_single_variable(self,
       raise ValueError("If initializer is a constant, do not specify shape.")
 
     dtype = dtypes.as_dtype(dtype)
-    if shape is not None:
-      shape = tensor_shape.as_shape(shape)
+    shape = tensor_shape.as_shape(shape)
 
     if name in self._vars:
       # Here we handle the case when returning an existing variable.
@@ -903,9 +901,7 @@ def _get_single_variable(self,
         raise ValueError("%s Originally defined at:\n\n%s" %
                          (err_msg, "".join(traceback.format_list(tb))))
       found_var = self._vars[name]
-      if shape is not None and not shape.is_compatible_with(
-          found_var.get_shape()
-      ):
+      if not shape.is_compatible_with(found_var.get_shape()):
         raise ValueError("Trying to share variable %s, but specified shape %s"
                          " and found shape %s." %
                          (name, shape, found_var.get_shape()))
@@ -925,7 +921,6 @@ def _get_single_variable(self,
 
     # Create the tensor to initialize the variable with default value.
     if initializer is None:
-      assert shape is not None
       initializer, initializing_from_value = self._get_default_initializer(
           name=name, shape=shape, dtype=dtype)
     # Enter an init scope when creating the initializer.
@@ -937,7 +932,7 @@ def _get_single_variable(self,
         # Instantiate initializer if provided initializer is a type object.
         if tf_inspect.isclass(initializer):
           initializer = initializer()
-        if shape is not None and shape.is_fully_defined():
+        if shape.is_fully_defined():
           if "partition_info" in tf_inspect.getargspec(initializer).args:
             init_val = functools.partial(initializer,
                                          shape.as_list(),
@@ -972,9 +967,7 @@ def _get_single_variable(self,
         constraint=constraint,
         use_resource=use_resource,
         synchronization=synchronization,
-        aggregation=aggregation,
-        shape=shape,
-    )
+        aggregation=aggregation)
     if context.executing_eagerly() and self._store_eager_variables:
       if collections:
         ops.add_to_collections(collections, v)

From 04c9a3483f02e6d04bb90032df5812065e5d4143 Mon Sep 17 00:00:00 2001
From: Ilia Sergachev <sergachev@google.com>
Date: Wed, 20 Sep 2023 12:27:57 -0700
Subject: [PATCH 045/567] [XLA:GPU] Triton GEMM: fix fusion traversal logic.

PiperOrigin-RevId: 567046240
---
 .../xla/service/gpu/gemm_rewriter_triton.cc   | 83 +++++++++++--------
 .../service/gpu/gemm_rewriter_triton_test.cc  | 24 +++++-
 2 files changed, 70 insertions(+), 37 deletions(-)

diff --git a/third_party/xla/xla/service/gpu/gemm_rewriter_triton.cc b/third_party/xla/xla/service/gpu/gemm_rewriter_triton.cc
index 70090a5070ce2d..8f1e0399ddb13a 100644
--- a/third_party/xla/xla/service/gpu/gemm_rewriter_triton.cc
+++ b/third_party/xla/xla/service/gpu/gemm_rewriter_triton.cc
@@ -1234,49 +1234,60 @@ void FusionContext::TryToFuseWithInputsRecursively(
         old_to_new_mapping,
     std::vector<HloInstruction*>& fusion_inputs,
     HloComputation::Builder& builder) {
-  absl::flat_hash_set<const HloInstruction*> visited;
-  std::stack<HloInstruction*> to_fuse;
-  // Instructions at the edge of 'to_fuse' that can either get fused too or
-  // become parameters of the fusion. Used to track the number of parameters
-  // of the fusion.
+  // Instructions at the fusion edge that can either get fused too or
+  // become parameters of the fusion. Used to track the number of parameters.
   absl::flat_hash_set<const HloInstruction*> inputs;
-  auto try_fuse_one = [&](HloInstruction& hlo) {
+  // Traverse all connected instructions that could be fused, analyze them and
+  // collect ones that will be fused.
+  absl::flat_hash_set<const HloInstruction*> to_fuse_set;
+  std::list<HloInstruction*> to_fuse_list;
+  absl::flat_hash_set<const HloInstruction*> enqueued;
+  std::queue<HloInstruction*> to_visit;
+  to_visit.push(&root);
+  while (!to_visit.empty()) {
+    HloInstruction* hlo = to_visit.front();
+    to_visit.pop();
+    // Limit the total number of fusion parameters.
+    if (inputs.size() >= TritonFusionAnalysis::kMaxParameterPerScope &&
+        NumAddedParameters(*hlo) > 0) {
+      continue;
+    }
     const DimOrderUpdatesOrError result = AnalyzeForFusion(
-        hlo, /*as_input=*/true, old_to_new_mapping, gpu_version);
-    if (!std::holds_alternative<DimOrderUpdates>(result)) {
-      return false;
+        *hlo, /*as_input=*/true, old_to_new_mapping, gpu_version);
+    if (!std::holds_alternative<DimOrderUpdates>(result) ||
+        !MergeUpdates(std::get<DimOrderUpdates>(result))) {
+      continue;
     }
-
-    if (!MergeUpdates(std::get<DimOrderUpdates>(result))) {
-      return false;
+    if (hlo->opcode() != HloOpcode::kParameter) {
+      inputs.erase(hlo);
     }
-    to_fuse.push(&hlo);
-    if (hlo.opcode() != HloOpcode::kParameter) {
-      inputs.erase(&hlo);
+    inputs.insert(hlo->operands().cbegin(), hlo->operands().cend());
+    to_fuse_set.insert(hlo);
+    to_fuse_list.push_back(hlo);
+    for (HloInstruction* operand : hlo->operands()) {
+      if (enqueued.insert(operand).second) {
+        to_visit.push(operand);
+      }
     }
-    inputs.insert(hlo.operands().cbegin(), hlo.operands().cend());
-    return true;
-  };
-  try_fuse_one(root);
-  visited.insert(&root);
-  while (!to_fuse.empty()) {
-    bool top_is_ready_to_fuse = true;
-    HloInstruction* hlo = to_fuse.top();
-    for (HloInstruction* operand : hlo->mutable_operands()) {
-      if (visited.insert(operand).second) {
-        // Stop adding new parameters.
-        if (inputs.size() >= TritonFusionAnalysis::kMaxParameterPerScope &&
-            NumAddedParameters(*operand) > 0) {
-          continue;
-        }
-        if (try_fuse_one(*operand)) {
-          top_is_ready_to_fuse = false;
+  }
+  // Repeatedly find instructions that have no operands still queued to be
+  // fused and fuse them.
+  while (!to_fuse_list.empty()) {
+    for (auto it = to_fuse_list.begin(); it != to_fuse_list.end();) {
+      bool ready_to_fuse = true;
+      for (const HloInstruction* operand : (*it)->operands()) {
+        if (to_fuse_set.contains(operand)) {
+          ready_to_fuse = false;
+          break;
         }
       }
-    }
-    if (top_is_ready_to_fuse) {
-      Fuse(*hlo, old_to_new_mapping, fusion_inputs, builder);
-      to_fuse.pop();
+      if (ready_to_fuse) {
+        Fuse(**it, old_to_new_mapping, fusion_inputs, builder);
+        to_fuse_set.erase(*it);
+        it = to_fuse_list.erase(it);
+      } else {
+        ++it;
+      }
     }
   }
 }
diff --git a/third_party/xla/xla/service/gpu/gemm_rewriter_triton_test.cc b/third_party/xla/xla/service/gpu/gemm_rewriter_triton_test.cc
index 4164a295948e69..b91abc7f617ffe 100644
--- a/third_party/xla/xla/service/gpu/gemm_rewriter_triton_test.cc
+++ b/third_party/xla/xla/service/gpu/gemm_rewriter_triton_test.cc
@@ -1521,6 +1521,28 @@ ENTRY e {
   EXPECT_FALSE(GemmRewriterTriton(cc).Run(module.get()).value());
 }
 
+TEST_F(GemmRewriterTritonTest, MultipleUsesAreHandled) {
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<VerifiedHloModule> module,
+                          ParseAndReturnVerifiedModule(R"(
+ENTRY e {
+  c = f32[] constant(1)
+  b = f32[6,8] broadcast(c), dimensions={}
+  p0 = f32[6,8] parameter(0)
+  a1 = f32[6,8] add(p0, b)
+  e = f32[6,8] exponential(a1)
+  a2 = f32[6,8] add(e, b)
+  d = f32[6,8] divide(b, a2)
+  p2 = f16[8,6] parameter(1)
+  cv = f32[8,6] convert(p2)
+  ROOT r = f32[6,6] dot(d, cv),
+    lhs_contracting_dims={1}, rhs_contracting_dims={0}
+})"));
+  const se::CudaComputeCapability cc{se::CudaComputeCapability::AMPERE, 0};
+  EXPECT_TRUE(GemmRewriterTriton(cc).Run(module.get()).value());
+  EXPECT_THAT(module->entry_computation()->root_instruction(),
+              GmockMatch(m::Fusion(m::Parameter(), m::Parameter())));
+}
+
 class GemmRewriterTritonLevel2Test : public GemmRewriterTritonTest {
  public:
   DebugOptions GetDebugOptionsForTest() override {
@@ -1568,7 +1590,7 @@ ENTRY e {
   EXPECT_TRUE(GemmRewriterTriton(gpu_version_).Run(module.get()).value());
   EXPECT_THAT(
       module->entry_computation()->root_instruction(),
-      GmockMatch(m::Fusion(m::Parameter(), m::Transpose(), m::Parameter())));
+      GmockMatch(m::Fusion(m::Transpose(), m::Parameter(), m::Parameter())));
 }
 
 TEST_F(GemmRewriterTritonLevel2Test, DoNotFuseTooManyParameters) {

From 0e3480236cec19ea558cd93dd017013e5cfee1b3 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 20 Sep 2023 12:39:26 -0700
Subject: [PATCH 046/567] include THIRD_PARTY_NOTICES.txt in the wheel.

PiperOrigin-RevId: 567049176
---
 tensorflow/tools/pip_package/BUILD                | 1 +
 tensorflow/tools/pip_package/MANIFEST.in          | 1 +
 tensorflow/tools/pip_package/build_pip_package.sh | 1 +
 3 files changed, 3 insertions(+)

diff --git a/tensorflow/tools/pip_package/BUILD b/tensorflow/tools/pip_package/BUILD
index 5afeb1ce7f62bb..4d7b9194ab80e6 100644
--- a/tensorflow/tools/pip_package/BUILD
+++ b/tensorflow/tools/pip_package/BUILD
@@ -98,6 +98,7 @@ DYNAMIC_LOADED_KERNELS = [
 COMMON_PIP_DEPS = [
     ":licenses",
     "MANIFEST.in",
+    "THIRD_PARTY_NOTICES.txt",
     "README",
     "setup.py",
     ":included_headers",
diff --git a/tensorflow/tools/pip_package/MANIFEST.in b/tensorflow/tools/pip_package/MANIFEST.in
index 921e4a60f1794f..d9d6779e161c0e 100644
--- a/tensorflow/tools/pip_package/MANIFEST.in
+++ b/tensorflow/tools/pip_package/MANIFEST.in
@@ -1,5 +1,6 @@
 include LICENSE
 include README
+include tensorflow/THIRD_PARTY_NOTICES.txt
 recursive-include * *.py
 recursive-include * *.pyd
 recursive-include * *.pyi
diff --git a/tensorflow/tools/pip_package/build_pip_package.sh b/tensorflow/tools/pip_package/build_pip_package.sh
index d83f2096f277a1..deff63b8d15557 100755
--- a/tensorflow/tools/pip_package/build_pip_package.sh
+++ b/tensorflow/tools/pip_package/build_pip_package.sh
@@ -277,6 +277,7 @@ function prepare_src() {
   mkdir -p ${TMPDIR}/third_party
   cp -R $RUNFILES/third_party/eigen3 ${TMPDIR}/third_party
   cp -LR $RUNFILES/../local_config_cuda/cuda/_virtual_includes/cuda_headers_virtual/third_party/gpus ${TMPDIR}/third_party
+  cp $RUNFILES/tensorflow/tools/pip_package/THIRD_PARTY_NOTICES.txt "${TMPDIR}/tensorflow"
 
   reorganize_includes "${TMPDIR}"
 

From 5b534b09c7ec440176d3ac85f8c0976217c0830b Mon Sep 17 00:00:00 2001
From: Mason Chang <masonchang@google.com>
Date: Wed, 20 Sep 2023 13:21:45 -0700
Subject: [PATCH 047/567] Update metrics name from v0 to v1

PiperOrigin-RevId: 567061192
---
 .../compiler/mlir/tf2xla/api/v1/compile_mlir_util_test.cc     | 2 +-
 .../compiler/mlir/tf2xla/internal/legalize_tf_to_hlo_test.cc  | 4 ++--
 tensorflow/compiler/mlir/tf2xla/transforms/xla_legalize_tf.cc | 4 ++--
 .../compiler/mlir/tf2xla/transforms/xla_legalize_tf_test.cc   | 4 ++--
 4 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/tensorflow/compiler/mlir/tf2xla/api/v1/compile_mlir_util_test.cc b/tensorflow/compiler/mlir/tf2xla/api/v1/compile_mlir_util_test.cc
index d2834844bd6a61..1485028adf33fc 100644
--- a/tensorflow/compiler/mlir/tf2xla/api/v1/compile_mlir_util_test.cc
+++ b/tensorflow/compiler/mlir/tf2xla/api/v1/compile_mlir_util_test.cc
@@ -70,7 +70,7 @@ TEST(LegalizeMlirTest, FailsLegalizesModule) {
     }
   })";
   CellReader<int64_t> count(
-      "/tensorflow/core/tf2xla/v0/mlir_failed_xla_legalize_tf_pass_count");
+      "/tensorflow/core/tf2xla/v1/mlir_failed_xla_legalize_tf_pass_count");
 
   std::vector<tensorflow::TensorShape> arg_shapes;
   XlaCompilationResult compilation_result;
diff --git a/tensorflow/compiler/mlir/tf2xla/internal/legalize_tf_to_hlo_test.cc b/tensorflow/compiler/mlir/tf2xla/internal/legalize_tf_to_hlo_test.cc
index b6a3463234f5f0..67de8464a9c587 100644
--- a/tensorflow/compiler/mlir/tf2xla/internal/legalize_tf_to_hlo_test.cc
+++ b/tensorflow/compiler/mlir/tf2xla/internal/legalize_tf_to_hlo_test.cc
@@ -47,9 +47,9 @@ using tpu::ShardingAndIndex;
 using tpu::TPUCompileMetadataProto;
 
 static constexpr char kMlirLegalizeCount[] =
-    "/tensorflow/core/tf2xla/v0/mlir_failed_xla_legalize_tf_count";
+    "/tensorflow/core/tf2xla/v1/mlir_failed_xla_legalize_tf_count";
 static constexpr char kMlirLegalizeErrors[] =
-    "/tensorflow/core/tf2xla/v0/mlir_failed_xla_legalize_tf_pass_count";
+    "/tensorflow/core/tf2xla/v1/mlir_failed_xla_legalize_tf_pass_count";
 static constexpr char kBridgeStatusCounter[] =
     "/tensorflow/core/tf2xla/api/v2/phase2_compilation_status";
 constexpr char kMlirCombinedMlirSuccess[] = "kMlirCombinedMlirSuccess";
diff --git a/tensorflow/compiler/mlir/tf2xla/transforms/xla_legalize_tf.cc b/tensorflow/compiler/mlir/tf2xla/transforms/xla_legalize_tf.cc
index 32ff6ad83c6f42..2cb569635226d1 100644
--- a/tensorflow/compiler/mlir/tf2xla/transforms/xla_legalize_tf.cc
+++ b/tensorflow/compiler/mlir/tf2xla/transforms/xla_legalize_tf.cc
@@ -55,11 +55,11 @@ namespace {
 #include "tensorflow/compiler/mlir/tf2xla/transforms/xla_legalize_tf_passes.h.inc"
 
 auto *mlir_legalization_count = tensorflow::monitoring::Counter<1>::New(
-    "/tensorflow/core/tf2xla/v0/mlir_failed_xla_legalize_tf_count",
+    "/tensorflow/core/tf2xla/v1/mlir_failed_xla_legalize_tf_count",
     "Counts the attempts of legalization of ops", "op_name");
 
 auto *mlir_failed_legalization_count = tensorflow::monitoring::Counter<2>::New(
-    "/tensorflow/core/tf2xla/v0/mlir_failed_xla_legalize_tf_pass_count",
+    "/tensorflow/core/tf2xla/v1/mlir_failed_xla_legalize_tf_pass_count",
     "Counts the failure of legalization of ops", "op_name", "legality");
 
 class LegalizeTF : public impl::LegalizeTFBase<LegalizeTF> {
diff --git a/tensorflow/compiler/mlir/tf2xla/transforms/xla_legalize_tf_test.cc b/tensorflow/compiler/mlir/tf2xla/transforms/xla_legalize_tf_test.cc
index b9085c12110d72..39eadcb93fcfce 100644
--- a/tensorflow/compiler/mlir/tf2xla/transforms/xla_legalize_tf_test.cc
+++ b/tensorflow/compiler/mlir/tf2xla/transforms/xla_legalize_tf_test.cc
@@ -88,7 +88,7 @@ TEST(XlaLegalizeTest, IllegalOp) {
     }
   })";
   CellReader<int64_t> legalize_failure_count(
-      "/tensorflow/core/tf2xla/v0/mlir_failed_xla_legalize_tf_pass_count");
+      "/tensorflow/core/tf2xla/v1/mlir_failed_xla_legalize_tf_pass_count");
 
   auto status = BuildAndRunPipeline(kMlirIllegalOpStr, legalizeTFPasses());
 
@@ -103,7 +103,7 @@ TEST(XlaLegalizeTest, LegalOp) {
      %0:2 = "tf.InfeedDequeueTuple"() : () -> (tensor<3x3xf32>, tensor<4x?xf32>) func.return %0#0, %0#1 : tensor<3x3xf32>, tensor<4x?xf32>
    })";
   CellReader<int64_t> legalize_failure_count(
-      "/tensorflow/core/tf2xla/v0/mlir_failed_xla_legalize_tf_pass_count");
+      "/tensorflow/core/tf2xla/v1/mlir_failed_xla_legalize_tf_pass_count");
 
   auto status = BuildAndRunPipeline(kMlirLegalOpStr, legalizeTFPasses());
 

From 9258e1aeb9fdbdd8566847f31a7c64243ba6d599 Mon Sep 17 00:00:00 2001
From: Kanglan Tang <kanglan@google.com>
Date: Wed, 20 Sep 2023 13:25:39 -0700
Subject: [PATCH 048/567] Update tpu build flags in .bazelrc

PiperOrigin-RevId: 567062395
---
 .bazelrc                                 | 5 ++++-
 third_party/xla/.bazelrc                 | 5 ++++-
 third_party/xla/third_party/tsl/.bazelrc | 5 ++++-
 3 files changed, 12 insertions(+), 3 deletions(-)

diff --git a/.bazelrc b/.bazelrc
index 35ff0dc1440b7f..8fb09a849a8b57 100644
--- a/.bazelrc
+++ b/.bazelrc
@@ -267,8 +267,11 @@ build:dbg --cxxopt -DTF_LITE_DISABLE_X86_NEON
 # AWS SDK must be compiled in release mode. see: https://github.com/tensorflow/tensorflow/issues/37498
 build:dbg --copt -DDEBUG_BUILD
 
-# Config to build TPU backend
+# Config to build TF TPU
 build:tpu --define=with_tpu_support=true
+build:tpu --define=framework_shared_object=true
+build:tpu --copt=-DLIBTPU_ON_GCE
+build:tpu --define=enable_mlir_bridge=true
 
 build:tensorrt --repo_env TF_NEED_TENSORRT=1
 
diff --git a/third_party/xla/.bazelrc b/third_party/xla/.bazelrc
index 35ff0dc1440b7f..8fb09a849a8b57 100644
--- a/third_party/xla/.bazelrc
+++ b/third_party/xla/.bazelrc
@@ -267,8 +267,11 @@ build:dbg --cxxopt -DTF_LITE_DISABLE_X86_NEON
 # AWS SDK must be compiled in release mode. see: https://github.com/tensorflow/tensorflow/issues/37498
 build:dbg --copt -DDEBUG_BUILD
 
-# Config to build TPU backend
+# Config to build TF TPU
 build:tpu --define=with_tpu_support=true
+build:tpu --define=framework_shared_object=true
+build:tpu --copt=-DLIBTPU_ON_GCE
+build:tpu --define=enable_mlir_bridge=true
 
 build:tensorrt --repo_env TF_NEED_TENSORRT=1
 
diff --git a/third_party/xla/third_party/tsl/.bazelrc b/third_party/xla/third_party/tsl/.bazelrc
index 35ff0dc1440b7f..8fb09a849a8b57 100644
--- a/third_party/xla/third_party/tsl/.bazelrc
+++ b/third_party/xla/third_party/tsl/.bazelrc
@@ -267,8 +267,11 @@ build:dbg --cxxopt -DTF_LITE_DISABLE_X86_NEON
 # AWS SDK must be compiled in release mode. see: https://github.com/tensorflow/tensorflow/issues/37498
 build:dbg --copt -DDEBUG_BUILD
 
-# Config to build TPU backend
+# Config to build TF TPU
 build:tpu --define=with_tpu_support=true
+build:tpu --define=framework_shared_object=true
+build:tpu --copt=-DLIBTPU_ON_GCE
+build:tpu --define=enable_mlir_bridge=true
 
 build:tensorrt --repo_env TF_NEED_TENSORRT=1
 

From 9d138e582de5f9de67afc7608ac9258a7d6670f1 Mon Sep 17 00:00:00 2001
From: Michael Hudgins <michaelhudgins@google.com>
Date: Wed, 20 Sep 2023 13:36:53 -0700
Subject: [PATCH 049/567] Modify ARM64 docker images to build under TF repo

PiperOrigin-RevId: 567065712
---
 ci/official/containers/linux_arm64/build.sh | 43 ++++++++++++++++-----
 1 file changed, 33 insertions(+), 10 deletions(-)

diff --git a/ci/official/containers/linux_arm64/build.sh b/ci/official/containers/linux_arm64/build.sh
index f6477b3c9040c9..dc30a354f20f0d 100755
--- a/ci/official/containers/linux_arm64/build.sh
+++ b/ci/official/containers/linux_arm64/build.sh
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # Copyright 2023 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -16,14 +16,37 @@
 
 set -e
 
-export LANG=C
+is_continuous_or_release() {
+  [[ "$KOKORO_JOB_TYPE" == "CONTINUOUS_INTEGRATION" ]] || [[ "${KOKORO_JOB_TYPE}" == "RELEASE" ]]
+}
 
-release_tag=2.15
+# Move into the directory of the script
+cd "$(dirname "$0")"
 
-docker build --pull \
-        --tag=linaro/tensorflow-arm64-build:latest-multipython \
-        --tag=linaro/tensorflow-arm64-build:${release_tag}-multipython .
-mkdir -p tagdir-multipython
-echo linaro/tensorflow-arm64-build:latest-multipython > tagdir-multipython/.docker-tag
-mkdir -p tagdir-${release_tag}-multipython
-echo linaro/tensorflow-arm64-build:${release_tag}-multipython > tagdir-${release_tag}-multipython/.docker-tag
+if is_continuous_or_release; then
+  # A continuous job is the only one to publish to latest
+  TAG="latest-multi-python"
+else
+  # If it is a change, grab a good tag for iterative builds
+  if [[ -z "${KOKORO_GITHUB_PULL_REQUEST_NUMBER}" ]]; then
+    TAG=$(head -n 1 "$KOKORO_PIPER_DIR/presubmit_request.txt" | cut -d" " -f2)
+  else
+    TAG="pr-${KOKORO_GITHUB_PULL_REQUEST_NUMBER}"
+  fi
+fi
+
+# IMAGE="gcr.io/tensorflow-sigs/build-arm64:$TAG-$PYVER"
+IMAGE="gcr.io/tensorflow-sigs/build-arm64:$TAG"
+docker pull "$IMAGE" || true
+
+gcloud auth configure-docker
+
+# TODO(michaelhudgins): align with sig build and make it so not every python is
+# being included in a single image
+# --build-arg "PYTHON_VERSION=$PYVER" \
+DOCKER_BUILDKIT=1 docker build \
+  --cache-from "$IMAGE" \
+  --target=devel \
+  -t "$IMAGE"  .
+
+docker push "$IMAGE"

From 2ebb1b5b001d730a7077d8d60ff459443a54fb7c Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 20 Sep 2023 13:45:15 -0700
Subject: [PATCH 050/567] Remove fallback to the old xla builder bridge and
 rely on the combined bridge's call.

PiperOrigin-RevId: 567068282
---
 .../mlir/tf2xla/api/v2/legalize_tf.cc         | 55 +------------------
 1 file changed, 1 insertion(+), 54 deletions(-)

diff --git a/tensorflow/compiler/mlir/tf2xla/api/v2/legalize_tf.cc b/tensorflow/compiler/mlir/tf2xla/api/v2/legalize_tf.cc
index 6fe12137d78966..ae7bfa8076d917 100644
--- a/tensorflow/compiler/mlir/tf2xla/api/v2/legalize_tf.cc
+++ b/tensorflow/compiler/mlir/tf2xla/api/v2/legalize_tf.cc
@@ -134,60 +134,7 @@ tsl::StatusOr<tensorflow::XlaCompilationResult> LegalizeMlirToHlo(
                           combined_bridge_status.status().ToString())
       .IgnoreError();
 
-  Status old_bridge_status = tf2xla::v1::CompileTensorflowGraphToHlo(
-      computation, metadata, use_tuple_args, shape_determination_fns,
-      arg_shapes, arg_core_mapping, per_core_arg_shapes, client,
-      compilation_result.get());
-
-  // Record filter/failure stats only if the old bridge succeeds. This removes
-  // noise from invalid inputs.
-  if (!old_bridge_status.ok()) {
-    // If the old bridge failed for this input as well. Mark the input as
-    // invalid. This might be incorrect in case of old bridge bugs but that
-    // should be rare.
-    if (filtered_graph) {
-      IncrementTfMlirBridgeSecondPhaseCounter(
-          MlirBridgeSecondPhaseMetric ::kOldBridgeMlirFilteredFailure);
-    } else {
-      IncrementTfMlirBridgeSecondPhaseCounter(
-          MlirBridgeSecondPhaseMetric ::kOldBridgeWithFallbackModeFailure);
-    }
-    if (!old_bridge_status.ok()) {
-      tsl::error_logging::Log(kBridgeComponent, "TFXLA_API_V2_OLD_BRIDGE",
-                              mlir_bridge_status.status().ToString())
-          .IgnoreError();
-    }
-    return old_bridge_status;
-  }
-
-  if (VLOG_IS_ON(2)) {
-    TF_ASSIGN_OR_RETURN(
-        auto hlo_module_config,
-        xla::HloModule::CreateModuleConfigFromProto(
-            compilation_result->computation->proto(), xla::DebugOptions()));
-
-    TF_ASSIGN_OR_RETURN(
-        std::unique_ptr<xla::HloModule> hlo_module,
-        xla::HloModule::CreateFromProto(
-            compilation_result->computation->proto(), hlo_module_config));
-
-    std::string all_computations;
-    for (auto computation : hlo_module->computations()) {
-      all_computations += computation->ToString() + "\n\n";
-    }
-
-    tensorflow::DumpRawStringToFile("legalize_tf_fallback_hlo",
-                                    all_computations);
-  }
-
-  if (filtered_graph) {
-    IncrementTfMlirBridgeSecondPhaseCounter(
-        MlirBridgeSecondPhaseMetric ::kOldBridgeMlirFilteredSuccess);
-  } else {
-    IncrementTfMlirBridgeSecondPhaseCounter(
-        MlirBridgeSecondPhaseMetric ::kOldBridgeWithFallbackModeSuccess);
-  }
-  return *compilation_result;
+  return combined_bridge_status.status();
 }
 
 };  // namespace v2

From 03c58a3fa7229078dd4fab969dc18dc2c8a7265a Mon Sep 17 00:00:00 2001
From: Eugene Zhulenev <ezhulenev@google.com>
Date: Wed, 20 Sep 2023 13:46:41 -0700
Subject: [PATCH 051/567] [stream_executor] Remove unused gpu_launch_dim.h

PiperOrigin-RevId: 567068762
---
 third_party/xla/xla/stream_executor/BUILD     |  7 +-----
 .../xla/xla/stream_executor/gpu_launch_dim.h  | 23 -------------------
 2 files changed, 1 insertion(+), 29 deletions(-)
 delete mode 100644 third_party/xla/xla/stream_executor/gpu_launch_dim.h

diff --git a/third_party/xla/xla/stream_executor/BUILD b/third_party/xla/xla/stream_executor/BUILD
index 6760fa8d6fa477..14348bd2a6e9e5 100644
--- a/third_party/xla/xla/stream_executor/BUILD
+++ b/third_party/xla/xla/stream_executor/BUILD
@@ -36,7 +36,6 @@ cc_library(
         "event.h",
         "executor_cache.h",
         "fft.h",
-        "gpu_launch_dim.h",
         "kernel.h",
         "kernel_cache_config.h",
         "kernel_spec.h",
@@ -86,10 +85,7 @@ transitive_hdrs(
 
 cc_library(
     name = "launch_dim",
-    hdrs = [
-        "gpu_launch_dim.h",
-        "launch_dim.h",
-    ],
+    hdrs = ["launch_dim.h"],
     visibility = ["//visibility:public"],
     deps = ["@com_google_absl//absl/strings"],
 )
@@ -731,7 +727,6 @@ cc_library(
         "event.h",
         "executor_cache.h",
         "fft.h",
-        "gpu_launch_dim.h",
         "kernel.h",
         "kernel_cache_config.h",
         "kernel_spec.h",
diff --git a/third_party/xla/xla/stream_executor/gpu_launch_dim.h b/third_party/xla/xla/stream_executor/gpu_launch_dim.h
deleted file mode 100644
index 8e5d23598da2bb..00000000000000
--- a/third_party/xla/xla/stream_executor/gpu_launch_dim.h
+++ /dev/null
@@ -1,23 +0,0 @@
-/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef XLA_STREAM_EXECUTOR_GPU_LAUNCH_DIM_H_
-#define XLA_STREAM_EXECUTOR_GPU_LAUNCH_DIM_H_
-
-// TODO(rspringer): Temporary redirection until all users - including gcudacc -
-// are using the new file.
-#include "xla/stream_executor/launch_dim.h"
-
-#endif  // XLA_STREAM_EXECUTOR_GPU_LAUNCH_DIM_H_

From ea0637ac525b4b1e1794373014f1159a7e13ebcb Mon Sep 17 00:00:00 2001
From: Bing Hu <binghu@google.com>
Date: Wed, 20 Sep 2023 14:05:44 -0700
Subject: [PATCH 052/567] Mesh creation API should also support taking a list
 of DeviceSpec as input

PiperOrigin-RevId: 567074638
---
 tensorflow/dtensor/python/mesh_util.py        | 14 ++++++-----
 tensorflow/dtensor/python/tests/BUILD         |  4 ++++
 .../dtensor/python/tests/mesh_util_test.py    | 23 +++++++++++++++++--
 3 files changed, 33 insertions(+), 8 deletions(-)

diff --git a/tensorflow/dtensor/python/mesh_util.py b/tensorflow/dtensor/python/mesh_util.py
index 9650d6f23bd7bb..12e86e321e1210 100644
--- a/tensorflow/dtensor/python/mesh_util.py
+++ b/tensorflow/dtensor/python/mesh_util.py
@@ -44,23 +44,25 @@ def _print_context(num_global_devices: int, num_clients: int, client_id: int,
 
 
 def _make_device_specs(
-    devices: Optional[List[str]] = None,
+    devices: Optional[List[Union[tf_device.DeviceSpec, str]]] = None,
     device_type: Optional[str] = None
 ) -> Tuple[List[tf_device.DeviceSpec], str]:
-  """Makes device specs from local devices names or number of global devices."""
+  """Makes device specs for all local devices or from a provided list."""
 
   if devices is None:
     if device_type is None:
       device_type = 'CPU'
     devices = config.local_devices(device_type)
   else:
-    devices = [tf_device.DeviceSpec.from_string(d) for d in devices]
+    if isinstance(devices[0], str):
+      devices = [tf_device.DeviceSpec.from_string(d) for d in devices]
     if device_type is None:
       device_type = devices[0].device_type
 
     if device_type.upper() != devices[0].device_type.upper():
       raise ValueError(
-          f'Conflicting devices {str(devices)} and device_type {device_type}')
+          f'Conflicting devices {str(devices)} and device_type {device_type}'
+      )
 
   return devices, device_type
 
@@ -69,7 +71,7 @@ def _make_device_specs(
 def create_mesh(
     mesh_dims: Optional[Union[List[Tuple[str, int]], Dict[str, int]]] = None,
     mesh_name: str = '',
-    devices: Optional[List[str]] = None,
+    devices: Optional[List[Union[tf_device.DeviceSpec, str]]] = None,
     device_type: Optional[str] = None,
     use_xla_spmd: bool = layout.USE_XLA_SPMD,
 ) -> layout.Mesh:
@@ -137,7 +139,7 @@ def create_mesh(
 def create_distributed_mesh(
     mesh_dims: Union[List[Tuple[str, int]], Dict[str, int]],
     mesh_name: str = '',
-    local_devices: Optional[List[str]] = None,
+    local_devices: Optional[List[Union[tf_device.DeviceSpec, str]]] = None,
     device_type: Optional[str] = None,
     use_xla_spmd: bool = layout.USE_XLA_SPMD,
 ) -> layout.Mesh:
diff --git a/tensorflow/dtensor/python/tests/BUILD b/tensorflow/dtensor/python/tests/BUILD
index f2e71d18824455..801bbb267be40d 100644
--- a/tensorflow/dtensor/python/tests/BUILD
+++ b/tensorflow/dtensor/python/tests/BUILD
@@ -69,6 +69,9 @@ pytype_strict_library(
     ],
 )
 
+# TODO(b/301286466): Investigate why python annotation type mismatch is not captured by the type
+# strict BUILD rules.
+
 dtensor_test(
     name = "array_ops_test",
     srcs = ["array_ops_test.py"],
@@ -301,6 +304,7 @@ dtensor_test(
         "//tensorflow/dtensor/python:tpu_util",
         "//tensorflow/python/eager:context",
         "//tensorflow/python/framework:config",
+        "//tensorflow/python/framework:device",
         "//tensorflow/python/platform:client_testlib",
         "@absl_py//absl/testing:parameterized",
     ],
diff --git a/tensorflow/dtensor/python/tests/mesh_util_test.py b/tensorflow/dtensor/python/tests/mesh_util_test.py
index 8f51e2e2011928..b8bc9960915648 100644
--- a/tensorflow/dtensor/python/tests/mesh_util_test.py
+++ b/tensorflow/dtensor/python/tests/mesh_util_test.py
@@ -24,6 +24,7 @@
 from tensorflow.dtensor.python.tests import test_util
 from tensorflow.python.eager import context
 from tensorflow.python.framework import config as tf_config
+from tensorflow.python.framework import device as tf_device
 from tensorflow.python.platform import test
 
 
@@ -65,7 +66,7 @@ def test_tpu_2d_mesh_creation(self, use_xla_spmd):
                            reason='Test requires exactly 2 cores',
                            unless_device_count_equals_to=2)
     devices = test_util.list_local_logical_devices('TPU')
-    self.assertEqual(len(devices), 2)
+    self.assertLen(devices, 2)
     mesh = mesh_util.create_mesh([('x', 2), ('y', 1)],
                                  device_type='TPU',
                                  use_xla_spmd=use_xla_spmd)
@@ -80,13 +81,31 @@ def test_tpu_2d_mesh_creation_with_devices(self):
                            reason='Test requires at least 2 cores',
                            unless_device_count_equals_to=2)
     devices = test_util.list_local_logical_devices('TPU')
-    self.assertEqual(len(devices), 2)
+    self.assertLen(devices, 2)
     mesh = mesh_util.create_mesh([('x', 2), ('y', 1)],
                                  devices=['/device:tpu:0', '/device:tpu:1'])
     self.assertEqual(mesh.num_local_devices(), 2)
     self.assertEqual(mesh.size, 2)
     self.assertAllEqual(mesh.dim_names, ['x', 'y'])
 
+  def test_tpu_2d_mesh_creation_with_device_specs(self):
+    self.skipForDeviceType(['CPU', 'GPU'], reason='Test is intended for TPUs.')
+    self.skipForDeviceType(['TPU'],
+                           reason='Test requires at least 2 cores',
+                           unless_device_count_equals_to=2)
+    devices = test_util.list_local_logical_devices('TPU')
+    self.assertLen(devices, 2)
+    mesh = mesh_util.create_mesh(
+        [('x', 2), ('y', 1)],
+        devices=[
+            tf_device.DeviceSpec.from_string('/tpu:0'),
+            tf_device.DeviceSpec.from_string('/tpu:1'),
+        ],
+    )
+    self.assertEqual(mesh.num_local_devices(), 2)
+    self.assertEqual(mesh.size, 2)
+    self.assertAllEqual(mesh.dim_names, ['x', 'y'])
+
   def test_single_client_mesh_creation(self):
     self.skipForDeviceType(['GPU', 'TPU'], reason='Test is intended for CPUs')
     num_devices = len(test_util.list_local_logical_devices('CPU'))

From 8fcaa1bb3130fdcc2f859820c4de49f029e355b2 Mon Sep 17 00:00:00 2001
From: Ziyin Huang <ziyinh@google.com>
Date: Wed, 20 Sep 2023 14:15:42 -0700
Subject: [PATCH 053/567] Roll back the accidental change for the xla pad size
 in GetMinibatchOp.

PiperOrigin-RevId: 567077638
---
 tensorflow/core/tpu/kernels/sparse_core_preprocess_ops.cc | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tensorflow/core/tpu/kernels/sparse_core_preprocess_ops.cc b/tensorflow/core/tpu/kernels/sparse_core_preprocess_ops.cc
index 7181edfe33d7b2..d4cd8035fdc7dd 100644
--- a/tensorflow/core/tpu/kernels/sparse_core_preprocess_ops.cc
+++ b/tensorflow/core/tpu/kernels/sparse_core_preprocess_ops.cc
@@ -356,7 +356,8 @@ void GetMinibatchesInCsrWithPhysicalReplicaOp::Compute(OpKernelContext* ctx) {
 
   const int num_physical_replica = num_replica_ * num_sc_per_chip_;
 
-  size_t xla_pad_size = 8;
+  size_t xla_pad_size = stream_executor::tpu::OpsApiFn()
+                            ->TpuUtil_GetXlaPadSizeFromTpuTopologyFn();
 
   OP_REQUIRES(ctx, sample_count_ % num_sc_per_chip_ == 0,
               absl::InvalidArgumentError(

From e2624e25d0c9465f631389f4c3375d7a3df5ea47 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 20 Sep 2023 14:24:40 -0700
Subject: [PATCH 054/567] Support storing bfloat16 in tensor_content format.

PiperOrigin-RevId: 567079988
---
 tensorflow/python/framework/tensor_util.py      | 1 +
 tensorflow/python/framework/tensor_util_test.py | 5 +----
 2 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/tensorflow/python/framework/tensor_util.py b/tensorflow/python/framework/tensor_util.py
index b494f52410ee2a..59fbeb3429c68d 100644
--- a/tensorflow/python/framework/tensor_util.py
+++ b/tensorflow/python/framework/tensor_util.py
@@ -300,6 +300,7 @@ def _FlattenToStrings(nested_strings):
         dtypes.uint64,
         dtypes.float8_e5m2,
         dtypes.float8_e4m3fn,
+        dtypes.bfloat16
         # int4/uint4 intentionally not listed, since their binary representation
         # is implementation-dependent.
     ]
diff --git a/tensorflow/python/framework/tensor_util_test.py b/tensorflow/python/framework/tensor_util_test.py
index 88c3541fcf22fb..14619c12f48824 100644
--- a/tensorflow/python/framework/tensor_util_test.py
+++ b/tensorflow/python/framework/tensor_util_test.py
@@ -254,8 +254,6 @@ def testHalf(self):
   def testBfloat16(self):
     test_type = dtypes.bfloat16.as_numpy_dtype
     t = tensor_util.make_tensor_proto(np.array([10.0, 20.0], dtype=test_type))
-    # 10.0: 16672 = 010000010(130) 0100000: (1+0/2+1/4) * 2^(130-127)
-    # 20.0: 16800 = 010000011(131) 0100000: (1+0/2+1/4) * 2^(131-127)
     self.assertProtoEquals("""
       dtype: DT_BFLOAT16
       tensor_shape {
@@ -263,8 +261,7 @@ def testBfloat16(self):
           size: 2
         }
       }
-      half_val: 16672
-      half_val: 16800
+      tensor_content: "\x20\x41\x5C\x32\x34\x30\x41"
       """, t)
 
     a = tensor_util.MakeNdarray(t)

From caf5b4d890095df814d8b39d2ac9717aa32ecf1a Mon Sep 17 00:00:00 2001
From: Edward Schwartz <schwartzedward@google.com>
Date: Wed, 20 Sep 2023 14:44:36 -0700
Subject: [PATCH 055/567] Remove obsolete bincount code since forward
 compatibility window has expired

PiperOrigin-RevId: 567085770
---
 tensorflow/python/ops/BUILD                |  3 ---
 tensorflow/python/ops/bincount_ops.py      | 30 ----------------------
 tensorflow/python/ops/bincount_ops_test.py | 10 --------
 3 files changed, 43 deletions(-)

diff --git a/tensorflow/python/ops/BUILD b/tensorflow/python/ops/BUILD
index e278893370aa4a..36a41014cba5ca 100644
--- a/tensorflow/python/ops/BUILD
+++ b/tensorflow/python/ops/BUILD
@@ -1438,8 +1438,6 @@ py_strict_library(
         ":array_ops",
         ":math_ops",
         ":math_ops_gen",
-        "//tensorflow/python/compat",
-        "//tensorflow/python/framework:constant_op",
         "//tensorflow/python/framework:dtypes",
         "//tensorflow/python/framework:ops",
         "//tensorflow/python/framework:tensor",
@@ -1459,7 +1457,6 @@ cuda_py_strict_test(
         ":bincount_ops",
         ":count_ops_gen",
         ":sparse_ops",
-        "//tensorflow/python/compat",
         "//tensorflow/python/framework:config",
         "//tensorflow/python/framework:errors",
         "//tensorflow/python/framework:ops",
diff --git a/tensorflow/python/ops/bincount_ops.py b/tensorflow/python/ops/bincount_ops.py
index 361481a975ca2e..7d19fc3a4cbffd 100644
--- a/tensorflow/python/ops/bincount_ops.py
+++ b/tensorflow/python/ops/bincount_ops.py
@@ -14,8 +14,6 @@
 # ==============================================================================
 """bincount ops."""
 
-from tensorflow.python.compat import compat
-from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor
@@ -132,34 +130,6 @@ def bincount(arr,
   """
   name = "bincount" if name is None else name
   with ops.name_scope(name):
-    # TODO(b/255381064) Remove the following block which uses older kernels for
-    # certain cases once the forward compatibility window expries (and remove
-    # the imports in this file and dependencies in the BUILD file for compat
-    # and constant_op which are only required for this block.)
-    if (
-        not compat.forward_compatible(2023, 9, 10)
-        and not binary_output
-        and axis is None
-    ):
-      arr = ops.convert_to_tensor(arr, name="arr", dtype=dtypes.int32)
-      array_is_nonempty = math_ops.reduce_prod(array_ops.shape(arr)) > 0
-      output_size = math_ops.cast(array_is_nonempty, dtypes.int32) * (
-          math_ops.reduce_max(arr) + 1)
-      if minlength is not None:
-        minlength = ops.convert_to_tensor(
-            minlength, name="minlength", dtype=dtypes.int32)
-        output_size = gen_math_ops.maximum(minlength, output_size)
-      if maxlength is not None:
-        maxlength = ops.convert_to_tensor(
-            maxlength, name="maxlength", dtype=dtypes.int32)
-        output_size = gen_math_ops.minimum(maxlength, output_size)
-      if weights is not None:
-        weights = ops.convert_to_tensor(weights, name="weights")
-        return gen_math_ops.unsorted_segment_sum(weights, arr, output_size)
-      weights = constant_op.constant([], dtype)
-      arr = array_ops.reshape(arr, [-1])
-      return gen_math_ops.bincount(arr, output_size, weights)
-
     arr = tensor_conversion.convert_to_tensor_v2_with_dispatch(arr, name="arr")
     if weights is not None:
       weights = tensor_conversion.convert_to_tensor_v2_with_dispatch(
diff --git a/tensorflow/python/ops/bincount_ops_test.py b/tensorflow/python/ops/bincount_ops_test.py
index a5873215e9a130..4e14b1b8a45ef8 100644
--- a/tensorflow/python/ops/bincount_ops_test.py
+++ b/tensorflow/python/ops/bincount_ops_test.py
@@ -17,7 +17,6 @@
 from absl.testing import parameterized
 import numpy as np
 
-from tensorflow.python.compat import compat
 from tensorflow.python.framework import config as tf_config
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
@@ -325,15 +324,6 @@ def test_weights(
           "b/263004039 The DenseBincount GPU kernel does not support weights."
           " unsorted_segment_sum should be used instead on GPU."
       )
-    # TODO(b/255381064) Remove the following block which uses older kernels for
-    # certain cases once the forward compatibility window expries (and remove
-    # the imports in this file and dependencies in the BUILD file for compat
-    # which is only required for this block.)
-    if not compat.forward_compatible(2023, 9, 10):
-      self.skipTest(
-          "b/255381064 tests with weights will pass once forward comptibiliy"
-          " window expires"
-      )
     if axis == -1:
       expected = _adjust_expected_rank2(expected, minlength, maxlength)
     else:

From de064617a4e8c07a2f4646447e84dfa45a66c80d Mon Sep 17 00:00:00 2001
From: Berkin Ilbeyi <berkin@google.com>
Date: Wed, 20 Sep 2023 14:54:30 -0700
Subject: [PATCH 056/567] [XLA] Fix DCHECK in ShapeUtil::ByteSizeOf that failed
 when shape didn't have a layout

PiperOrigin-RevId: 567088606
---
 third_party/xla/xla/shape_util.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/third_party/xla/xla/shape_util.cc b/third_party/xla/xla/shape_util.cc
index 0bd89ce9fdaaa6..4f0fc50bb52206 100644
--- a/third_party/xla/xla/shape_util.cc
+++ b/third_party/xla/xla/shape_util.cc
@@ -876,7 +876,7 @@ Shape ShapeUtil::PrependMajorDimension(int64_t bound, Shape shape) {
 
 /* static */ int64_t ShapeUtil::ByteSizeOf(const Shape& shape,
                                            int64_t pointer_size) {
-  TF_DCHECK_OK(ValidateShape(shape));
+  TF_DCHECK_OK(ValidateShapeWithOptionalLayout(shape));
   if (shape.element_type() == TUPLE) {
     return ByteSizeOfTupleIndexTable(shape, pointer_size);
   } else if (shape.IsArray()) {
@@ -900,7 +900,7 @@ Shape ShapeUtil::PrependMajorDimension(int64_t bound, Shape shape) {
 }
 
 /* static */ int64_t ShapeUtil::ByteSizeOfElements(const Shape& shape) {
-  TF_DCHECK_OK(ValidateShape(shape));
+  TF_DCHECK_OK(ValidateShapeWithOptionalLayout(shape));
   int64_t allocated_element_count;
 
   CHECK(LayoutUtil::IsDenseArray(shape)) << shape.ShortDebugString();

From 7974a0b3afd2b53e92491a1bdea3b54294e0f6e7 Mon Sep 17 00:00:00 2001
From: Anlun Xu <anlunx@google.com>
Date: Wed, 20 Sep 2023 15:46:33 -0700
Subject: [PATCH 057/567] [xla:gpu] NFC: Print node type instead of node
 pointer in error messages

PiperOrigin-RevId: 567102686
---
 .../xla/xla/stream_executor/gpu/gpu_graph.cc  | 70 ++++++++++++++-----
 1 file changed, 54 insertions(+), 16 deletions(-)

diff --git a/third_party/xla/xla/stream_executor/gpu/gpu_graph.cc b/third_party/xla/xla/stream_executor/gpu/gpu_graph.cc
index 70135573f266b7..c96ca81dccd6b3 100644
--- a/third_party/xla/xla/stream_executor/gpu/gpu_graph.cc
+++ b/third_party/xla/xla/stream_executor/gpu/gpu_graph.cc
@@ -20,7 +20,6 @@ limitations under the License.
 #include <cstdint>
 #include <cstdlib>
 #include <cstring>
-#include <sstream>
 #include <string>
 
 #include "absl/strings/str_cat.h"
@@ -95,6 +94,39 @@ tsl::StatusOr<std::string> GraphExecUpdateResultToString(
   return tsl::errors::Internal("Unexpected value for GraphExecUpdateResult");
 }
 
+tsl::StatusOr<std::string> GraphNodeTypeToString(
+    GpuDriver::GraphNodeType node_type) {
+  switch (node_type) {
+    case GpuDriver::GraphNodeType::kKernel:
+      return "kKernel";
+    case GpuDriver::GraphNodeType::kMemcpy:
+      return "kMemcpy";
+    case GpuDriver::GraphNodeType::kMemset:
+      return "kMemset";
+    case GpuDriver::GraphNodeType::kHost:
+      return "kHost";
+    case GpuDriver::GraphNodeType::kGraph:
+      return "kGraph";
+    case GpuDriver::GraphNodeType::kEmpty:
+      return "kEmpty";
+    case GpuDriver::GraphNodeType::kWaitEvent:
+      return "kWaitEvent";
+    case GpuDriver::GraphNodeType::kEventRecord:
+      return "kEventRecord";
+    case GpuDriver::GraphNodeType::kExtSemasSignal:
+      return "kExtSemasSignal";
+    case GpuDriver::GraphNodeType::kExtSemasWait:
+      return "kExtSemasWait";
+    case GpuDriver::GraphNodeType::kMemAlloc:
+      return "kMemAlloc";
+    case GpuDriver::GraphNodeType::kMemFree:
+      return "kMemFree";
+    case GpuDriver::GraphNodeType::kBatchMemOp:
+      return "kBatchMemOp";
+  }
+  return tsl::errors::Internal("Unexpected value for GraphNodeType");
+}
+
 tsl::StatusOr<OwnedGpuGraphExec::UpdateResult> OwnedGpuGraphExec::Update(
     OwnedGpuGraph graph) {
   VLOG(3) << "Update gpu graph exec with a new graph after " << num_launches_
@@ -138,21 +170,27 @@ tsl::StatusOr<OwnedGpuGraphExec::UpdateResult> OwnedGpuGraphExec::Update(
 
     TF_ASSIGN_OR_RETURN(std::string result_str,
                         GraphExecUpdateResultToString(result.result));
-    GpuGraphNodeHandle error_from_node = result.error_from_node;
-    std::ostringstream error_from_node_ptr;
-    error_from_node_ptr << reinterpret_cast<void*>(error_from_node);
-    std::string error_from_node_str = error_from_node_ptr.str();
-
-    GpuGraphNodeHandle error_node = result.error_node;
-    std::ostringstream error_node_ptr;
-    error_node_ptr << reinterpret_cast<void*>(error_node);
-    std::string error_node_str = error_node_ptr.str();
-
-    return tsl::errors::Internal(
-        absl::StrCat("Failed to update gpu graph: ", "Graph update result=",
-                     result_str, ", Error node handle=", error_node_str,
-                     ", Error from node handle=", error_from_node_str, ": "),
-        st.message());
+    std::string error_message = absl::StrCat(
+        "Failed to update gpu graph: Graph update result=", result_str);
+
+    if (result.error_node) {
+      TF_ASSIGN_OR_RETURN(GpuDriver::GraphNodeType node_type,
+                          GpuDriver::GraphNodeGetType(result.error_node));
+      TF_ASSIGN_OR_RETURN(std::string node_type_str,
+                          GraphNodeTypeToString(node_type));
+      absl::StrAppend(&error_message, ", Error node name=", node_type_str);
+    }
+
+    if (result.error_from_node) {
+      TF_ASSIGN_OR_RETURN(GpuDriver::GraphNodeType node_type,
+                          GpuDriver::GraphNodeGetType(result.error_from_node));
+      TF_ASSIGN_OR_RETURN(std::string node_type_str,
+                          GraphNodeTypeToString(node_type));
+      absl::StrAppend(&error_message, ", Error from node name=", node_type_str);
+    }
+
+    absl::StrAppend(&error_message, ": ", st.message());
+    return tsl::errors::Internal(error_message);
   }
 
   return UpdateResult::kSuccess;

From c2519c4aa372eecc6a26dfd4dedee866f3f0f2e3 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 20 Sep 2023 16:03:35 -0700
Subject: [PATCH 058/567] Pass along the shape param from
 tf.compat.v1.get_variable to its underlying variable creator. At this point,
 we have already checked that either shape is compatible w/ initial_value, or
 that it is not specified (None). Therefore it should always be safe to pass
 it. Having it can help subsequent nested variable creators save some cycles
 tracing the initializer function when all they need to know is the shape.

PiperOrigin-RevId: 567107017
---
 .../feature_column/feature_column_test.py     | 28 +++++++++++++------
 .../feature_column/feature_column_v2_test.py  | 12 +++++---
 tensorflow/python/ops/variable_scope.py       | 17 +++++++----
 3 files changed, 40 insertions(+), 17 deletions(-)

diff --git a/tensorflow/python/feature_column/feature_column_test.py b/tensorflow/python/feature_column/feature_column_test.py
index 3cde6c5657edc1..f8fbd8db7e3b15 100644
--- a/tensorflow/python/feature_column/feature_column_test.py
+++ b/tensorflow/python/feature_column/feature_column_test.py
@@ -4924,16 +4924,19 @@ def test_get_dense_tensor(self, use_safe_embedding_lookup,
       )
 
       def _initializer(shape, dtype, partition_info=None):
+        self.assertEqual(dtypes.float32, dtype)
         if partition_variables:
+          assert partition_info is not None
           self.assertEqual([vocabulary_size, embedding_dimension],
                            partition_info.full_shape)
           self.assertAllEqual((2, embedding_dimension), shape)
+          return array_ops.slice(
+              embedding_values, partition_info.var_offset, shape
+          )
         else:
           self.assertAllEqual((vocabulary_size, embedding_dimension), shape)
           self.assertIsNone(partition_info)
-
-        self.assertEqual(dtypes.float32, dtype)
-        return embedding_values
+          return embedding_values
 
       # Expected lookup result, using combiner='mean'.
       expected_lookups = (
@@ -4976,7 +4979,12 @@ def _initializer(shape, dtype, partition_info=None):
       for v in global_vars:
         self.assertIsInstance(v, variables_lib.Variable)
       with _initialized_session():
-        self.assertAllEqual(embedding_values, global_vars[0])
+        if partition_variables:
+          self.assertAllEqual(
+              embedding_values, array_ops.concat(global_vars, axis=0)
+          )
+        else:
+          self.assertAllEqual(embedding_values, global_vars[0])
         self.assertAllEqual(expected_lookups, self.evaluate(embedding_lookup))
 
       if use_safe_embedding_lookup:
@@ -5783,16 +5791,19 @@ def test_get_dense_tensor(self, use_safe_embedding_lookup,
       )
 
       def _initializer(shape, dtype, partition_info=None):
+        self.assertEqual(dtypes.float32, dtype)
         if partition_variables:
+          assert partition_info is not None
           self.assertEqual([vocabulary_size, embedding_dimension],
                            partition_info.full_shape)
           self.assertAllEqual((2, embedding_dimension), shape)
+          return array_ops.slice(
+              embedding_values, partition_info.var_offset, shape
+          )
         else:
           self.assertAllEqual((vocabulary_size, embedding_dimension), shape)
           self.assertIsNone(partition_info)
-
-        self.assertEqual(dtypes.float32, dtype)
-        return embedding_values
+          return embedding_values
 
       # Expected lookup result, using combiner='mean'.
       expected_lookups_a = (
@@ -5842,10 +5853,11 @@ def _initializer(shape, dtype, partition_info=None):
         self.assertCountEqual(('vars/embedding_weights/part_0:0',
                                'vars/embedding_weights/part_1:0'),
                               tuple([v.name for v in global_vars]))
+        embedding_var = array_ops.concat(global_vars, axis=0)
       else:
         self.assertCountEqual(('vars/embedding_weights:0',),
                               tuple([v.name for v in global_vars]))
-      embedding_var = global_vars[0]
+        embedding_var = global_vars[0]
 
       self.evaluate(variables_lib.global_variables_initializer())
       self.evaluate(lookup_ops.tables_initializer())
diff --git a/tensorflow/python/feature_column/feature_column_v2_test.py b/tensorflow/python/feature_column/feature_column_v2_test.py
index 21dcbb4452d6c7..cb98bf60c03184 100644
--- a/tensorflow/python/feature_column/feature_column_v2_test.py
+++ b/tensorflow/python/feature_column/feature_column_v2_test.py
@@ -5762,16 +5762,19 @@ def test_get_dense_tensor(self, use_safe_embedding_lookup,
       )
 
       def _initializer(shape, dtype, partition_info=None):
+        self.assertEqual(dtypes.float32, dtype)
         if partition_variables:
+          assert partition_info is not None
           self.assertEqual([vocabulary_size, embedding_dimension],
                            partition_info.full_shape)
           self.assertAllEqual((2, embedding_dimension), shape)
+          return array_ops.slice(
+              embedding_values, partition_info.var_offset, shape
+          )
         else:
           self.assertAllEqual((vocabulary_size, embedding_dimension), shape)
           self.assertIsNone(partition_info)
-
-        self.assertEqual(dtypes.float32, dtype)
-        return embedding_values
+          return embedding_values
 
       # Expected lookup result, using combiner='mean'.
       expected_lookups_a = (
@@ -5821,10 +5824,11 @@ def _initializer(shape, dtype, partition_info=None):
         self.assertCountEqual(('vars/aaa_bbb_shared_embedding/part_0:0',
                                'vars/aaa_bbb_shared_embedding/part_1:0'),
                               tuple([v.name for v in global_vars]))
+        embedding_var = array_ops.concat(global_vars, axis=0)
       else:
         self.assertCountEqual(('vars/aaa_bbb_shared_embedding:0',),
                               tuple([v.name for v in global_vars]))
-      embedding_var = global_vars[0]
+        embedding_var = global_vars[0]
 
       self.evaluate(variables_lib.global_variables_initializer())
       self.evaluate(lookup_ops.tables_initializer())
diff --git a/tensorflow/python/ops/variable_scope.py b/tensorflow/python/ops/variable_scope.py
index 33dd0438fa2f2f..622f70732adaba 100644
--- a/tensorflow/python/ops/variable_scope.py
+++ b/tensorflow/python/ops/variable_scope.py
@@ -807,7 +807,8 @@ def _get_partitioned_variable(self,
             use_resource=use_resource,
             constraint=constraint,
             synchronization=synchronization,
-            aggregation=aggregation)
+            aggregation=aggregation,
+        )
 
       # pylint: disable=protected-access
       var._set_save_slice_info(
@@ -880,7 +881,8 @@ def _get_single_variable(self,
       raise ValueError("If initializer is a constant, do not specify shape.")
 
     dtype = dtypes.as_dtype(dtype)
-    shape = tensor_shape.as_shape(shape)
+    if shape is not None:
+      shape = tensor_shape.as_shape(shape)
 
     if name in self._vars:
       # Here we handle the case when returning an existing variable.
@@ -901,7 +903,9 @@ def _get_single_variable(self,
         raise ValueError("%s Originally defined at:\n\n%s" %
                          (err_msg, "".join(traceback.format_list(tb))))
       found_var = self._vars[name]
-      if not shape.is_compatible_with(found_var.get_shape()):
+      if shape is not None and not shape.is_compatible_with(
+          found_var.get_shape()
+      ):
         raise ValueError("Trying to share variable %s, but specified shape %s"
                          " and found shape %s." %
                          (name, shape, found_var.get_shape()))
@@ -921,6 +925,7 @@ def _get_single_variable(self,
 
     # Create the tensor to initialize the variable with default value.
     if initializer is None:
+      assert shape is not None
       initializer, initializing_from_value = self._get_default_initializer(
           name=name, shape=shape, dtype=dtype)
     # Enter an init scope when creating the initializer.
@@ -932,7 +937,7 @@ def _get_single_variable(self,
         # Instantiate initializer if provided initializer is a type object.
         if tf_inspect.isclass(initializer):
           initializer = initializer()
-        if shape.is_fully_defined():
+        if shape is not None and shape.is_fully_defined():
           if "partition_info" in tf_inspect.getargspec(initializer).args:
             init_val = functools.partial(initializer,
                                          shape.as_list(),
@@ -967,7 +972,9 @@ def _get_single_variable(self,
         constraint=constraint,
         use_resource=use_resource,
         synchronization=synchronization,
-        aggregation=aggregation)
+        aggregation=aggregation,
+        shape=shape,
+    )
     if context.executing_eagerly() and self._store_eager_variables:
       if collections:
         ops.add_to_collections(collections, v)

From af50677da4f39028b85fe42df41cf1ebab68bea5 Mon Sep 17 00:00:00 2001
From: William Muir <wamuir@gmail.com>
Date: Wed, 20 Sep 2023 18:09:57 -0500
Subject: [PATCH 059/567] Strip `external/local_tsl` prefix during zip of tsl
 protos

---
 tensorflow/tools/lib_package/BUILD | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/tensorflow/tools/lib_package/BUILD b/tensorflow/tools/lib_package/BUILD
index 578d2e3f2f5bfe..d30b414d5ddbc2 100644
--- a/tensorflow/tools/lib_package/BUILD
+++ b/tensorflow/tools/lib_package/BUILD
@@ -2,18 +2,17 @@
 # This includes the C API, Java API, and protocol buffer files.
 
 load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda")
-load("@rules_pkg//pkg:tar.bzl", "pkg_tar")
+load("@rules_pkg//:pkg.bzl", "pkg_tar", "pkg_zip")
 load("//tensorflow:tensorflow.bzl", "VERSION", "VERSION_MAJOR", "if_macos")
 load("//tensorflow/core/platform:build_config_root.bzl", "tf_additional_license_deps")
 load("//third_party/mkl:build_defs.bzl", "if_enable_mkl", "if_mkl")
 
 package(default_visibility = ["//visibility:private"])
 
-genrule(
+pkg_zip(
     name = "libtensorflow_proto",
     srcs = ["//tensorflow/core:protos_all_proto_srcs"],
-    outs = ["libtensorflow_proto.zip"],
-    cmd = "zip $@ $(SRCS)",
+    strip_prefix = "/external/local_tsl",
 )
 
 pkg_tar(

From 5e3184466b9f28423f5538eceec6b61274e0ade8 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 20 Sep 2023 16:09:01 -0700
Subject: [PATCH 060/567] Exercise new version of TF/XLA bridge in
 legalize_tf_quant_test

PiperOrigin-RevId: 567108435
---
 .../mlir/quantization/stablehlo/BUILD         |  13 +-
 .../passes/bridge/legalize_tf_quant_test.cc   | 121 ++++++++----------
 tensorflow/compiler/mlir/tf2xla/api/v2/BUILD  |   5 +-
 tensorflow/core/tpu/kernels/BUILD             |   1 +
 4 files changed, 66 insertions(+), 74 deletions(-)

diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/BUILD b/tensorflow/compiler/mlir/quantization/stablehlo/BUILD
index e3c574600d5c52..a4618d6de03082 100644
--- a/tensorflow/compiler/mlir/quantization/stablehlo/BUILD
+++ b/tensorflow/compiler/mlir/quantization/stablehlo/BUILD
@@ -198,15 +198,20 @@ tf_cc_test(
     tags = ["no_oss"],
     deps = [
         "//tensorflow/compiler/jit",
-        "//tensorflow/compiler/mlir/tf2xla:compile_mlir_util",
+        "//tensorflow/compiler/mlir/tf2xla/api/v2:legalize_tf",
         "//tensorflow/compiler/tf2xla:xla_helpers",
         "//tensorflow/core:framework",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
+        "//tensorflow/core/protobuf:for_core_protos_cc",
+        "//tensorflow/core/protobuf/tpu:compile_metadata_proto_cc",
+        "//tensorflow/core/tpu/kernels:tpu_compile_op_support",
         "@com_google_googletest//:gtest",
-        "@local_tsl//tsl/platform:statusor",
-        "@local_xla//xla/hlo/ir:hlo",
-        "@local_xla//xla/service:hlo_proto_cc",
+        "@llvm-project//llvm:Support",
+        "@llvm-project//mlir:Pass",
+        "@local_xla//xla:shape_util",
+        "@local_xla//xla/client:client_library",
+        "@local_xla//xla/stream_executor",
     ],
 )
 
diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/passes/bridge/legalize_tf_quant_test.cc b/tensorflow/compiler/mlir/quantization/stablehlo/passes/bridge/legalize_tf_quant_test.cc
index 369d0a235041f7..1fd1a0b6bab721 100644
--- a/tensorflow/compiler/mlir/quantization/stablehlo/passes/bridge/legalize_tf_quant_test.cc
+++ b/tensorflow/compiler/mlir/quantization/stablehlo/passes/bridge/legalize_tf_quant_test.cc
@@ -16,22 +16,53 @@ limitations under the License.
 #include <memory>
 #include <vector>
 
-#include <gmock/gmock.h>
 #include <gtest/gtest.h>
-#include "tensorflow/compiler/mlir/tf2xla/api/v1/compile_mlir_util.h"
-#include "tensorflow/compiler/tf2xla/xla_helpers.h"
-#include "xla/hlo/ir/hlo_opcode.h"
-#include "xla/service/hlo.pb.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/StringRef.h"
+#include "mlir/Pass/Pass.h"  // from @llvm-project
+#include "tensorflow/compiler/mlir/tf2xla/api/v2/legalize_tf.h"
+#include "xla/client/client_library.h"
+#include "xla/shape.h"
+#include "xla/stream_executor/multi_platform_manager.h"
+#include "xla/stream_executor/platform.h"
 #include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/protobuf/config.pb.h"
+#include "tensorflow/core/protobuf/tpu/compile_metadata.pb.h"
+#include "tensorflow/core/tpu/kernels/tpu_compile_op_support.h"
 #include "tsl/lib/core/status_test_util.h"
-#include "tsl/platform/statusor.h"
 
-namespace tensorflow {
-namespace tf2xla {
-namespace v2 {
+namespace mlir::quant::stablehlo {
+namespace {
+
+class LegalizeTFQuantTest : public ::testing::Test {
+ protected:
+  void TestBridgeLowering(llvm::StringRef mlir_module_string,
+                          llvm::ArrayRef<tensorflow::TensorShape> arg_shapes) {
+    tensorflow::tpu::MlirToHloArgs mlir_to_hlo_args;
+    mlir_to_hlo_args.rollout_state =
+        tensorflow::ConfigProto::Experimental::MLIR_BRIDGE_ROLLOUT_UNSPECIFIED;
+    mlir_to_hlo_args.mlir_module = mlir_module_string;
+    tensorflow::se::Platform* platform =
+        tensorflow::se::MultiPlatformManager::PlatformWithName("Host").value();
+    auto client =
+        xla::ClientLibrary::GetOrCreateCompileOnlyClient(platform).value();
+    tensorflow::tpu::TPUCompileMetadataProto metadata_proto;
+    bool use_tuple_args = true;
+    std::vector<tensorflow::tpu::ShardingAndIndex> arg_core_mapping;
+    std::vector<std::vector<xla::Shape>> per_core_arg_shapes;
+    std::vector<std::unique_ptr<mlir::Pass>> custom_legalization_passes;
+
+    TF_EXPECT_OK(tensorflow::tf2xla::v2::LegalizeMlirToHlo(
+                     mlir_to_hlo_args, metadata_proto, use_tuple_args,
+                     /*device_type=*/"XLA_TPU_JIT", custom_legalization_passes,
+                     /*shape_determination_fns=*/{}, arg_shapes,
+                     &arg_core_mapping, &per_core_arg_shapes, client)
+                     .status());
+  }
+};
 
-TEST(LegalizeTFQuantTest, LegalizesModuleWithTFUniformQuantization) {
-  constexpr char legalization[] = R"mlir(
+TEST_F(LegalizeTFQuantTest, LegalizesModuleWithTFUniformQuantization) {
+  constexpr char mlir_module_string[] = R"mlir(
   module attributes {tf.versions = {bad_consumers = [], min_consumer = 0 : i32, producer = 268 : i32}} {
     func.func @main(%arg0 : tensor<1xf32>) -> tensor<1xf32> {
       %scales = "tf.Const"() { value = dense<1.0> : tensor<f32> } : () -> tensor<f32>
@@ -48,47 +79,12 @@ TEST(LegalizeTFQuantTest, LegalizesModuleWithTFUniformQuantization) {
   })mlir";
 
   std::vector<tensorflow::TensorShape> arg_shapes = {{1}};
-  XlaCompilationResult compilation_result;
-
-  TF_ASSERT_OK(CompileSerializedMlirToXlaHlo(
-                   legalization, arg_shapes, /*device_type=*/"XLA_TPU_JIT",
-                   /*use_tuple_args=*/true, /*enable_op_fallback=*/true,
-                   /*shape_determination_fns=*/{}, &compilation_result)
-                   .status());
-
-  const xla::HloModuleProto& hlo_module =
-      compilation_result.computation->proto();
-  for (const xla::HloComputationProto computation : hlo_module.computations()) {
-    for (const xla::HloInstructionProto instruction :
-         computation.instructions()) {
-      TF_ASSERT_OK_AND_ASSIGN(xla::HloOpcode opcode,
-                              xla::StringToHloOpcode(instruction.opcode()));
-      switch (opcode) {
-        case xla::HloOpcode::kConstant:
-        case xla::HloOpcode::kDivide:
-        case xla::HloOpcode::kAdd:
-        case xla::HloOpcode::kFloor:
-        case xla::HloOpcode::kConvert:
-        case xla::HloOpcode::kMaximum:
-        case xla::HloOpcode::kMinimum:
-        case xla::HloOpcode::kSubtract:
-        case xla::HloOpcode::kParameter:
-        case xla::HloOpcode::kTuple:
-        case xla::HloOpcode::kGetTupleElement:
-        case xla::HloOpcode::kBroadcast:
-        case xla::HloOpcode::kClamp:
-        case xla::HloOpcode::kRoundNearestEven:
-          break;
-        default:
-          ADD_FAILURE() << "Failed to compile TF uniform quantized ops "
-                        << "(unexpected opcode: " << opcode << ")";
-      }
-    }
-  }
+
+  TestBridgeLowering(mlir_module_string, arg_shapes);
 }
 
-TEST(LegalizeTFQuantTest, LegalizesModuleWithDequantize) {
-  constexpr char legalization[] = R"mlir(
+TEST_F(LegalizeTFQuantTest, LegalizesModuleWithDequantize) {
+  constexpr char mlir_module_string[] = R"mlir(
   module attributes {tf.versions = {bad_consumers = [], min_consumer = 0 : i32, producer = 268 : i32}} {
     func.func @main(%arg0: tensor<1x!tf_type.qint8>) -> tensor<1xf32> {
       %min_range = "tf.Const"() { value = dense<1.0> : tensor<f32> } : () -> tensor<f32>
@@ -97,19 +93,13 @@ TEST(LegalizeTFQuantTest, LegalizesModuleWithDequantize) {
       func.return %0 : tensor<1xf32>
     }
   })mlir";
-
   std::vector<tensorflow::TensorShape> arg_shapes = {{1}};
-  XlaCompilationResult compilation_result;
 
-  TF_EXPECT_OK(CompileSerializedMlirToXlaHlo(
-                   legalization, arg_shapes, /*device_type=*/"XLA_CPU_JIT",
-                   /*use_tuple_args=*/true, /*enable_op_fallback=*/true,
-                   /*shape_determination_fns=*/{}, &compilation_result)
-                   .status());
+  TestBridgeLowering(mlir_module_string, arg_shapes);
 }
 
-TEST(LegalizeTFQuantTest, LegalizesModuleWithClipByValue) {
-  constexpr char legalization[] = R"mlir(
+TEST_F(LegalizeTFQuantTest, LegalizesModuleWithClipByValue) {
+  constexpr char mlir_module_string[] = R"mlir(
   module attributes {tf.versions = {bad_consumers = [], min_consumer = 0 : i32, producer = 268 : i32}} {
     func.func @main(%arg0 : tensor<2x2xf32>) -> tensor<2x2xf32> {
       %max = "tf.Const"() { value = dense<12.0> : tensor<f32> } : () -> tensor<f32>
@@ -137,17 +127,10 @@ TEST(LegalizeTFQuantTest, LegalizesModuleWithClipByValue) {
       func.return %2 : tensor<2x2xf32>
     }
   })mlir";
-
   std::vector<tensorflow::TensorShape> arg_shapes = {{2, 2}};
-  XlaCompilationResult compilation_result;
 
-  TF_EXPECT_OK(CompileSerializedMlirToXlaHlo(
-                   legalization, arg_shapes, /*device_type=*/"XLA_TPU_JIT",
-                   /*use_tuple_args=*/true, /*enable_op_fallback=*/true,
-                   /*shape_determination_fns=*/{}, &compilation_result)
-                   .status());
+  TestBridgeLowering(mlir_module_string, arg_shapes);
 }
 
-}  // namespace v2
-}  // namespace tf2xla
-}  // namespace tensorflow
+}  // namespace
+}  // namespace mlir::quant::stablehlo
diff --git a/tensorflow/compiler/mlir/tf2xla/api/v2/BUILD b/tensorflow/compiler/mlir/tf2xla/api/v2/BUILD
index acd65e27f78a34..0ff72d68d418c6 100644
--- a/tensorflow/compiler/mlir/tf2xla/api/v2/BUILD
+++ b/tensorflow/compiler/mlir/tf2xla/api/v2/BUILD
@@ -11,7 +11,10 @@ package(
 )
 
 # Please reach out to tf-bridge-team@ before using the TF2XLA bridge.
-package_group(name = "tf2xla_users")
+package_group(
+    name = "tf2xla_users",
+    packages = ["//tensorflow/compiler/mlir/quantization/stablehlo/..."],
+)
 
 cc_library(
     name = "legalize_tf",
diff --git a/tensorflow/core/tpu/kernels/BUILD b/tensorflow/core/tpu/kernels/BUILD
index e75f17f57625d7..b7ac39d708bb39 100644
--- a/tensorflow/core/tpu/kernels/BUILD
+++ b/tensorflow/core/tpu/kernels/BUILD
@@ -32,6 +32,7 @@ load(
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
     default_visibility = [
+        "//tensorflow/compiler/mlir/quantization:__subpackages__",
         "//tensorflow/compiler/mlir/tf2xla:__subpackages__",
         "//tensorflow/compiler/xrt/kernels:__subpackages__",
         "//tensorflow/core/tpu:__subpackages__",

From c4e5c7c6b4227ea5aedc62ae5ef3870f04e6bd74 Mon Sep 17 00:00:00 2001
From: Mason Chang <masonchang@google.com>
Date: Wed, 20 Sep 2023 16:19:35 -0700
Subject: [PATCH 061/567] Move CreateTFRegionControlFlowToFunctional to graph
 export as it doesn't really do any Bridge logic.

PiperOrigin-RevId: 567111139
---
 tensorflow/compiler/mlir/tensorflow/transforms/bridge.cc | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/bridge.cc b/tensorflow/compiler/mlir/tensorflow/transforms/bridge.cc
index b94e65f75e8a41..dfe07231999791 100644
--- a/tensorflow/compiler/mlir/tensorflow/transforms/bridge.cc
+++ b/tensorflow/compiler/mlir/tensorflow/transforms/bridge.cc
@@ -268,7 +268,6 @@ void CreateTPUBridgePipelineImpl(
           ->tf_mlir_enable_tpu_variable_runtime_reformatting_pass) {
     pm.addPass(CreateTPUVariableRuntimeReformattingPass());
   }
-  pm.addPass(TF::CreateTFRegionControlFlowToFunctional());
 }
 }  // namespace
 
@@ -298,6 +297,7 @@ void CreateTPUBridgePipelineV1(OpPassManager &pm) {
   pm.addPass(tf_executor::CreateTFExecutorTPUV1IslandOutliningPass());
   OpPassManager &nested_module = pm.nest<ModuleOp>();
   CreateTPUBridgePipelineImpl(nested_module);
+
   pm.addPass(tf_executor::CreateTFExecutorTPUV1IslandInliningPass());
   // There are cases where we don't consume all compilation and replication
   // attributes like we do for the V2 pipeline, so we need to convert them from
@@ -373,6 +373,7 @@ void AddGraphExportLoweringPasses(OpPassManager &pm) {
     pm.addPass(CreateBreakUpIslandsPass());
   };
 
+  pm.addPass(TF::CreateTFRegionControlFlowToFunctional());
   add_pass(CreateFunctionalToExecutorDialectConversionPass());
   add_pass(TFDevice::CreateReplicateToIslandPass(/*legacy_graph_export=*/true));
   add_pass(TFDevice::CreateReplicaIDToDeviceOrdinalPass());
@@ -391,6 +392,8 @@ void AddGraphExportLoweringPasses(OpPassManager &pm) {
 }
 
 void AddGraphExportLoweringPassesV2(OpPassManager &pm) {
+  pm.addPass(TF::CreateTFRegionControlFlowToFunctional());
+
   // First, we need to convert from functional, to executor dialect.
   pm.addNestedPass<func::FuncOp>(
       CreateFunctionalToExecutorDialectConversionPass());
@@ -520,7 +523,6 @@ void CreateTFXLABridgePipeline(OpPassManager &pm) {
 
   pm.addNestedPass<func::FuncOp>(createCSEPass());
   pm.addPass(createSymbolDCEPass());
-  pm.addPass(TF::CreateTFRegionControlFlowToFunctional());
 }
 
 tensorflow::Status RunTFXLABridge(ModuleOp module,

From aca5dfa4e11a4d18085b776ecc2bf741ee958569 Mon Sep 17 00:00:00 2001
From: Jian Cai <jiancai@google.com>
Date: Wed, 20 Sep 2023 16:35:59 -0700
Subject: [PATCH 062/567] Do not create dependencies among instances of an op
 with TF_RandomGeneratorSideEffect trait

This makes the MLIR side effect modelling of such ops consistent with ACD.

PiperOrigin-RevId: 567115291
---
 .../compiler/mlir/tensorflow/ir/tf_generated_ops.td   |  2 +-
 tensorflow/compiler/mlir/tensorflow/ir/tf_op_base.td  | 11 ++++++++++-
 tensorflow/compiler/mlir/tensorflow/ir/tf_ops_n_z.cc  | 10 ----------
 .../tensorflow/tests/side-effect-analysis-test.mlir   |  2 +-
 4 files changed, 12 insertions(+), 13 deletions(-)

diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td b/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td
index 6ccb126fd9686e..ecd8c234eff40d 100644
--- a/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td
+++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td
@@ -12306,7 +12306,7 @@ The generated values will have mean 0 and standard deviation 1.
   TF_DerivedResultTypeAttr dtype = TF_DerivedResultTypeAttr<0>;
 }
 
-def TF_RandomUniformOp : TF_Op<"RandomUniform", [DeclareOpInterfaceMethods<TF_GetResourceInstanceInterface>, TF_CannotDuplicate, TF_RandomGeneratorSideEffect]> {
+def TF_RandomUniformOp : TF_Op<"RandomUniform", [TF_CannotDuplicate, TF_RandomGeneratorSideEffect]> {
   let summary = "Outputs random values from a uniform distribution.";
 
   let description = [{
diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_op_base.td b/tensorflow/compiler/mlir/tensorflow/ir/tf_op_base.td
index 55758e0ecec2be..b1805ce7167bb9 100644
--- a/tensorflow/compiler/mlir/tensorflow/ir/tf_op_base.td
+++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_op_base.td
@@ -253,7 +253,16 @@ def TF_RecvSideEffect : MemoryEffects<[MemWrite<TF_RecvResource>]>;
 def TF_XlaHostComputeSideEffect : MemoryEffects<[MemWrite<TF_XlaHostComputeResource>]>;
 
 def TF_WriteTrainingPredictions : MemoryEffects<[MemWrite<TF_WriteTrainingPredictionsResource>]>;
-def TF_RandomGeneratorSideEffect : MemoryEffects<[MemWrite<TF_RandomGeneratorResource>]>;
+
+// The state among all RNG ops is shared in a single global variable. When an
+// RNG op runs, it uses the global variable to produce a random number, then
+// updates the global variable. This would be nondeterministic if the RNG ops
+// ran in a nondeterministic order, but XLA scheduler is deterministic so the
+// order is always deterministic. However, we cannot mark such ops as Pure as
+// that may lead to incorrect optimization, e.g. two instances of the same op
+// with the same constant input may end up returning
+// the same value, even though they should not.
+def TF_RandomGeneratorSideEffect : MemoryEffects<[MemRead<TF_RandomGeneratorResource>]>;
 
 // Special effect for keeping `CollectiveReduce` ops in order.
 def TF_CollectiveReduceOrderingEffect : MemoryEffects<[MemWrite<TF_CollectiveReduceOrderingResource>]>;
diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_n_z.cc b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_n_z.cc
index d3994a6cf43645..e304bb2a24d9b5 100644
--- a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_n_z.cc
+++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_n_z.cc
@@ -668,16 +668,6 @@ LogicalResult RandomUniformOp::verify() {
   return success();
 }
 
-std::optional<std::string> RandomUniformOp::GetResourceInstanceStr() {
-  // We do not create dependencies among the ops. XLA will run the ops in a
-  // deterministic order. However, we cannot mark the op as Pure as that may
-  // lead to incorrect optimization, e.g. two ops with the same constant input
-  // may end up returning the same value, even though they should have returned
-  // different values.
-  static unsigned counter = 0;
-  return std::to_string(counter++);
-}
-
 //===----------------------------------------------------------------------===//
 // RangeOp
 //===----------------------------------------------------------------------===//
diff --git a/tensorflow/compiler/mlir/tensorflow/tests/side-effect-analysis-test.mlir b/tensorflow/compiler/mlir/tensorflow/tests/side-effect-analysis-test.mlir
index c57e07b5e3f74e..00b140dbf2afb9 100644
--- a/tensorflow/compiler/mlir/tensorflow/tests/side-effect-analysis-test.mlir
+++ b/tensorflow/compiler/mlir/tensorflow/tests/side-effect-analysis-test.mlir
@@ -2909,7 +2909,7 @@ func.func @tpu_execute_effect(
 
 // -----
 
-// Tests that we don't create dependencies between any two `RandomUniform` ops.
+// Tests that we don't create dependencies between any two instances of an op with `TF_RandomGeneratorSideEffect` trait.
 func.func @random_uniform_ordering_effect() -> (tensor<3xf32>) {
   // expected-remark@above {{ID: 9}}
   %graph = tf_executor.graph {

From 6b14d69788b63fbc9fbc6677098a96c2aca3b594 Mon Sep 17 00:00:00 2001
From: Junwhan Ahn <junwhan@google.com>
Date: Wed, 20 Sep 2023 16:44:09 -0700
Subject: [PATCH 063/567] Make `PjRtMemorySpace::id()` unique within a client

The current API semantics requires memory space ids to be unique only within a memory kind. This is inconsistent with `PjRtDevice::id()` (unique within a client regardless of device types) and cumbersome (since `(memory_kind, id)` is needed to uniquely identify a memory space). This CL changes the contract to require implementations to guarantee ids to be unique within a client.

PiperOrigin-RevId: 567117355
---
 third_party/xla/xla/pjrt/pjrt_client.h | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/third_party/xla/xla/pjrt/pjrt_client.h b/third_party/xla/xla/pjrt/pjrt_client.h
index 1541851ad0e4ff..1ac2461e8fc136 100644
--- a/third_party/xla/xla/pjrt/pjrt_client.h
+++ b/third_party/xla/xla/pjrt/pjrt_client.h
@@ -74,8 +74,7 @@ class PjRtMemorySpace {
   // The devices that this memory space is attached to.
   virtual absl::Span<PjRtDevice* const> devices() const = 0;
 
-  // The ID of this memory space. IDs are unique among memory spaces of this
-  // type.
+  // The ID of this memory space. IDs are globally unique across all hosts.
   virtual int id() const = 0;
 
   // A platform-dependent string that uniquely identifies the kind of the

From 86766a45c0d3a06342e2e34b5d5332131a6390c7 Mon Sep 17 00:00:00 2001
From: Subhankar Shah <subhankarshah@google.com>
Date: Wed, 20 Sep 2023 16:57:37 -0700
Subject: [PATCH 064/567] [MemorySpaceAssignment] Move code related to
 MemorySpaceAssignment (memory_space_assignment* files) in a separate folder.

PiperOrigin-RevId: 567120948
---
 tensorflow/core/BUILD                         |   2 +-
 third_party/xla/xla/service/BUILD             | 140 +--------------
 .../xla/xla/service/buffer_assignment.h       |   2 +-
 third_party/xla/xla/service/heap_simulator.cc |   2 +-
 third_party/xla/xla/service/heap_simulator.h  |   2 +-
 .../xla/service/memory_space_assignment/BUILD | 160 ++++++++++++++++++
 .../memory_space_assignment.cc                |   8 +-
 .../memory_space_assignment.h                 |  10 +-
 .../memory_space_assignment.proto             |   0
 ...mory_space_assignment_best_fit_repacker.cc |   2 +-
 ...emory_space_assignment_best_fit_repacker.h |   8 +-
 ...space_assignment_best_fit_repacker_test.cc |   2 +-
 .../memory_space_assignment_repacking.h       |   6 +-
 .../memory_space_assignment_test.cc           |  10 +-
 .../memory_space_assignment_tuning_utils.cc   |   4 +-
 .../memory_space_assignment_tuning_utils.h    |   6 +-
 .../memory_space_assignment_utils.cc          |   2 +-
 .../memory_space_assignment_utils.h           |   6 +-
 third_party/xla/xla/xla.bzl                   |   4 +-
 19 files changed, 200 insertions(+), 176 deletions(-)
 create mode 100644 third_party/xla/xla/service/memory_space_assignment/BUILD
 rename third_party/xla/xla/service/{ => memory_space_assignment}/memory_space_assignment.cc (99%)
 rename third_party/xla/xla/service/{ => memory_space_assignment}/memory_space_assignment.h (99%)
 rename third_party/xla/xla/service/{ => memory_space_assignment}/memory_space_assignment.proto (100%)
 rename third_party/xla/xla/service/{ => memory_space_assignment}/memory_space_assignment_best_fit_repacker.cc (97%)
 rename third_party/xla/xla/service/{ => memory_space_assignment}/memory_space_assignment_best_fit_repacker.h (79%)
 rename third_party/xla/xla/service/{ => memory_space_assignment}/memory_space_assignment_best_fit_repacker_test.cc (97%)
 rename third_party/xla/xla/service/{ => memory_space_assignment}/memory_space_assignment_repacking.h (95%)
 rename third_party/xla/xla/service/{ => memory_space_assignment}/memory_space_assignment_test.cc (99%)
 rename third_party/xla/xla/service/{ => memory_space_assignment}/memory_space_assignment_tuning_utils.cc (92%)
 rename third_party/xla/xla/service/{ => memory_space_assignment}/memory_space_assignment_tuning_utils.h (83%)
 rename third_party/xla/xla/service/{ => memory_space_assignment}/memory_space_assignment_utils.cc (97%)
 rename third_party/xla/xla/service/{ => memory_space_assignment}/memory_space_assignment_utils.h (84%)

diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD
index 757e75829b4503..09bc9919bb9b4e 100644
--- a/tensorflow/core/BUILD
+++ b/tensorflow/core/BUILD
@@ -1470,7 +1470,7 @@ cc_library(
         "@local_xla//xla:xla_proto_cc_impl",
         "@local_xla//xla:xla_data_proto_cc_impl",
         "@local_xla//xla/service:hlo_proto_cc_impl",
-        "@local_xla//xla/service:memory_space_assignment_proto_cc_impl",
+        "@local_xla//xla/service/memory_space_assignment:memory_space_assignment_proto_cc_impl",
         "@local_xla//xla/service/gpu:backend_configs_cc_impl",
         "@local_xla//xla/service/gpu:hlo_op_profile_proto_cc_impl",
     ] + tf_protos_grappler_impl() + tf_monitoring_framework_deps(),
diff --git a/third_party/xla/xla/service/BUILD b/third_party/xla/xla/service/BUILD
index 5c07e6f9a7efec..934cbf95b298be 100644
--- a/third_party/xla/xla/service/BUILD
+++ b/third_party/xla/xla/service/BUILD
@@ -1624,7 +1624,6 @@ cc_library(
         ":hlo_proto_cc",
         ":hlo_value",
         ":logical_buffer",
-        ":memory_space_assignment",
         ":tuple_points_to_analysis",
         "//xla:shape_util",
         "//xla:status_macros",
@@ -1633,6 +1632,7 @@ cc_library(
         "//xla:util",
         "//xla/hlo/ir:hlo",
         "//xla/hlo/utils:hlo_live_range",
+        "//xla/service/memory_space_assignment",
         "@com_google_absl//absl/algorithm:container",
         "@com_google_absl//absl/container:btree",
         "@com_google_absl//absl/container:flat_hash_map",
@@ -1735,7 +1735,6 @@ cc_library(
         ":hlo_dataflow_analysis",
         ":hlo_ordering",
         ":hlo_proto_cc",
-        ":memory_space_assignment_repacking",
         ":tuple_points_to_analysis",
         "//xla:comparison_util",
         "//xla:status",
@@ -1743,6 +1742,7 @@ cc_library(
         "//xla:util",
         "//xla/hlo/ir:hlo",
         "//xla/hlo/utils:hlo_live_range",
+        "//xla/service/memory_space_assignment:memory_space_assignment_repacking",
         "@com_google_absl//absl/algorithm:container",
         "@com_google_absl//absl/container:btree",
         "@com_google_absl//absl/container:flat_hash_map",
@@ -4591,134 +4591,6 @@ xla_cc_test(
     ],
 )
 
-cc_library(
-    name = "memory_space_assignment_utils",
-    srcs = ["memory_space_assignment_utils.cc"],
-    hdrs = ["memory_space_assignment_utils.h"],
-    visibility = ["//visibility:public"],
-    deps = [
-        ":heap_simulator",
-        "//xla/hlo/ir:hlo",
-    ],
-)
-
-cc_library(
-    name = "memory_space_assignment_tuning_utils",
-    srcs = ["memory_space_assignment_tuning_utils.cc"],
-    hdrs = ["memory_space_assignment_tuning_utils.h"],
-    visibility = ["//visibility:public"],
-    deps = [
-        ":heap_simulator",
-        ":memory_space_assignment_utils",
-        "//xla/hlo/ir:hlo",
-    ],
-)
-
-cc_library(
-    name = "memory_space_assignment_repacking",
-    hdrs = ["memory_space_assignment_repacking.h"],
-    visibility = ["//visibility:public"],
-    deps = [
-        "//xla:statusor",
-        "//xla:types",
-        "@com_google_absl//absl/strings",
-        "@com_google_absl//absl/types:span",
-    ],
-)
-
-cc_library(
-    name = "memory_space_assignment_best_fit_repacker",
-    srcs = ["memory_space_assignment_best_fit_repacker.cc"],
-    hdrs = ["memory_space_assignment_best_fit_repacker.h"],
-    visibility = ["//visibility:public"],
-    deps = [
-        ":heap_simulator",
-        ":memory_space_assignment_repacking",
-    ],
-)
-
-xla_cc_test(
-    name = "memory_space_assignment_best_fit_repacker_test",
-    srcs = ["memory_space_assignment_best_fit_repacker_test.cc"],
-    deps = [
-        ":memory_space_assignment_best_fit_repacker",
-        "//xla/tests:xla_internal_test_main",
-        "@local_tsl//tsl/platform:test",
-    ],
-)
-
-cc_library(
-    name = "memory_space_assignment",
-    srcs = ["memory_space_assignment.cc"],
-    hdrs = ["memory_space_assignment.h"],
-    visibility = ["//visibility:public"],
-    deps = [
-        ":heap_simulator",
-        ":hlo_cost_analysis",
-        ":hlo_proto_cc",
-        ":hlo_value",
-        ":memory_space_assignment_proto_cc",
-        ":memory_space_assignment_repacking",
-        ":memory_space_assignment_tuning_utils",
-        ":memory_space_assignment_utils",
-        ":tuple_util",
-        "//xla:debug_options_flags",
-        "//xla:shape_util",
-        "//xla:status",
-        "//xla:statusor",
-        "//xla:util",
-        "//xla/hlo/ir:hlo",
-        "//xla/hlo/utils:hlo_live_range",
-        "@com_google_absl//absl/algorithm:container",
-        "@com_google_absl//absl/container:btree",
-        "@com_google_absl//absl/container:flat_hash_map",
-        "@com_google_absl//absl/functional:function_ref",
-        "@com_google_absl//absl/memory",
-        "@com_google_absl//absl/strings",
-        "@com_google_absl//absl/types:span",
-        "@local_tsl//tsl/platform:casts",
-        "@local_tsl//tsl/platform:errors",
-        "@local_tsl//tsl/platform:logging",
-        "@local_tsl//tsl/platform:statusor",
-    ],
-)
-
-xla_cc_test(
-    name = "memory_space_assignment_test",
-    srcs = ["memory_space_assignment_test.cc"],
-    deps = [
-        ":heap_simulator",
-        ":hlo_cost_analysis",
-        ":hlo_value",
-        ":instruction_hoister",
-        ":memory_space_assignment",
-        ":memory_space_assignment_proto_cc",
-        ":memory_space_assignment_repacking",
-        "//xla:shape_util",
-        "//xla:status",
-        "//xla:util",
-        "//xla:xla_data_proto_cc",
-        "//xla/hlo/ir:hlo",
-        "//xla/hlo/utils:hlo_matchers",
-        "//xla/tests:hlo_test_base",
-        "//xla/tests:xla_internal_test_main",
-        "@com_google_absl//absl/algorithm:container",
-        "@com_google_absl//absl/container:flat_hash_map",
-        "@com_google_absl//absl/container:flat_hash_set",
-        "@com_google_absl//absl/functional:any_invocable",
-        "@com_google_absl//absl/log",
-        "@com_google_absl//absl/status:statusor",
-        "@com_google_absl//absl/strings",
-        "@com_google_absl//absl/types:span",
-        "@com_google_googletest//:gtest",
-        "@local_tsl//tsl/lib/core:status_test_util",
-        "@local_tsl//tsl/platform:errors",
-        "@local_tsl//tsl/platform:status",
-        "@local_tsl//tsl/platform:statusor",
-        "@local_tsl//tsl/platform:test",
-    ],
-)
-
 cc_library(
     name = "memory_space_propagation",
     srcs = ["memory_space_propagation.cc"],
@@ -7251,11 +7123,3 @@ xla_cc_test(
         "@local_tsl//tsl/platform:test",
     ]),
 )
-
-tf_proto_library(
-    name = "memory_space_assignment_proto",
-    srcs = ["memory_space_assignment.proto"],
-    cc_api_version = 2,
-    make_default_target_header_only = True,
-    visibility = ["//visibility:public"],
-)
diff --git a/third_party/xla/xla/service/buffer_assignment.h b/third_party/xla/xla/service/buffer_assignment.h
index c77ffc4d895213..8c43b47fe9ded5 100644
--- a/third_party/xla/xla/service/buffer_assignment.h
+++ b/third_party/xla/xla/service/buffer_assignment.h
@@ -34,7 +34,7 @@ limitations under the License.
 #include "xla/service/hlo_alias_analysis.h"
 #include "xla/service/hlo_dataflow_analysis.h"
 #include "xla/service/logical_buffer.h"
-#include "xla/service/memory_space_assignment.h"
+#include "xla/service/memory_space_assignment/memory_space_assignment.h"
 #include "xla/service/tuple_points_to_analysis.h"
 #include "xla/statusor.h"
 #include "xla/types.h"
diff --git a/third_party/xla/xla/service/heap_simulator.cc b/third_party/xla/xla/service/heap_simulator.cc
index 44d7b87a804245..d5e099d587397d 100644
--- a/third_party/xla/xla/service/heap_simulator.cc
+++ b/third_party/xla/xla/service/heap_simulator.cc
@@ -43,7 +43,7 @@ limitations under the License.
 #include "xla/hlo/ir/hlo_schedule.h"
 #include "xla/hlo/utils/hlo_live_range.h"
 #include "xla/map_util.h"
-#include "xla/service/memory_space_assignment_repacking.h"
+#include "xla/service/memory_space_assignment/memory_space_assignment_repacking.h"
 #include "xla/status.h"
 #include "xla/util.h"
 
diff --git a/third_party/xla/xla/service/heap_simulator.h b/third_party/xla/xla/service/heap_simulator.h
index 755c735608ebf8..94cb22fd5e272b 100644
--- a/third_party/xla/xla/service/heap_simulator.h
+++ b/third_party/xla/xla/service/heap_simulator.h
@@ -46,7 +46,7 @@ limitations under the License.
 #include "xla/service/hlo_buffer.h"
 #include "xla/service/hlo_dataflow_analysis.h"
 #include "xla/service/hlo_ordering.h"
-#include "xla/service/memory_space_assignment_repacking.h"
+#include "xla/service/memory_space_assignment/memory_space_assignment_repacking.h"
 #include "xla/service/tuple_points_to_analysis.h"
 #include "xla/statusor.h"
 
diff --git a/third_party/xla/xla/service/memory_space_assignment/BUILD b/third_party/xla/xla/service/memory_space_assignment/BUILD
new file mode 100644
index 00000000000000..48ff95b0a13f4e
--- /dev/null
+++ b/third_party/xla/xla/service/memory_space_assignment/BUILD
@@ -0,0 +1,160 @@
+# Description:
+#   Memory Space Assignment service implementation.
+
+load(
+    "//xla:xla.bzl",
+    "xla_cc_test",
+)
+load(
+    "@local_tsl//tsl/platform:build_config.bzl",
+    "tf_proto_library",
+)
+load("@local_tsl//tsl/platform:rules_cc.bzl", "cc_library")
+
+package(
+    default_visibility = ["//visibility:public"],
+    licenses = ["notice"],
+)
+
+package_group(
+    name = "friends",
+    includes = [
+        "//xla:friends",
+    ],
+)
+
+tf_proto_library(
+    name = "memory_space_assignment_proto",
+    srcs = ["memory_space_assignment.proto"],
+    cc_api_version = 2,
+    make_default_target_header_only = True,
+    visibility = ["//visibility:public"],
+)
+
+cc_library(
+    name = "memory_space_assignment",
+    srcs = ["memory_space_assignment.cc"],
+    hdrs = ["memory_space_assignment.h"],
+    visibility = ["//visibility:public"],
+    deps = [
+        ":memory_space_assignment_proto_cc",
+        ":memory_space_assignment_repacking",
+        ":memory_space_assignment_tuning_utils",
+        ":memory_space_assignment_utils",
+        "//xla:debug_options_flags",
+        "//xla:shape_util",
+        "//xla:status",
+        "//xla:statusor",
+        "//xla:util",
+        "//xla/hlo/ir:hlo",
+        "//xla/hlo/utils:hlo_live_range",
+        "//xla/service:heap_simulator",
+        "//xla/service:hlo_cost_analysis",
+        "//xla/service:hlo_proto_cc",
+        "//xla/service:hlo_value",
+        "//xla/service:tuple_util",
+        "@com_google_absl//absl/algorithm:container",
+        "@com_google_absl//absl/container:btree",
+        "@com_google_absl//absl/container:flat_hash_map",
+        "@com_google_absl//absl/functional:function_ref",
+        "@com_google_absl//absl/memory",
+        "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/types:span",
+        "@local_tsl//tsl/platform:casts",
+        "@local_tsl//tsl/platform:errors",
+        "@local_tsl//tsl/platform:logging",
+        "@local_tsl//tsl/platform:statusor",
+    ],
+)
+
+xla_cc_test(
+    name = "memory_space_assignment_test",
+    srcs = ["memory_space_assignment_test.cc"],
+    deps = [
+        ":memory_space_assignment",
+        ":memory_space_assignment_proto_cc",
+        ":memory_space_assignment_repacking",
+        "//xla:shape_util",
+        "//xla:status",
+        "//xla:util",
+        "//xla:xla_data_proto_cc",
+        "//xla/hlo/ir:hlo",
+        "//xla/hlo/utils:hlo_matchers",
+        "//xla/service:heap_simulator",
+        "//xla/service:hlo_cost_analysis",
+        "//xla/service:hlo_value",
+        "//xla/service:instruction_hoister",
+        "//xla/tests:hlo_test_base",
+        "//xla/tests:xla_internal_test_main",
+        "@com_google_absl//absl/algorithm:container",
+        "@com_google_absl//absl/container:flat_hash_map",
+        "@com_google_absl//absl/container:flat_hash_set",
+        "@com_google_absl//absl/functional:any_invocable",
+        "@com_google_absl//absl/log",
+        "@com_google_absl//absl/status:statusor",
+        "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/types:span",
+        "@com_google_googletest//:gtest",
+        "@local_tsl//tsl/lib/core:status_test_util",
+        "@local_tsl//tsl/platform:errors",
+        "@local_tsl//tsl/platform:status",
+        "@local_tsl//tsl/platform:statusor",
+        "@local_tsl//tsl/platform:test",
+    ],
+)
+
+cc_library(
+    name = "memory_space_assignment_repacking",
+    hdrs = ["memory_space_assignment_repacking.h"],
+    visibility = ["//visibility:public"],
+    deps = [
+        "//xla:statusor",
+        "//xla:types",
+        "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/types:span",
+    ],
+)
+
+cc_library(
+    name = "memory_space_assignment_best_fit_repacker",
+    srcs = ["memory_space_assignment_best_fit_repacker.cc"],
+    hdrs = ["memory_space_assignment_best_fit_repacker.h"],
+    visibility = ["//visibility:public"],
+    deps = [
+        ":memory_space_assignment_repacking",
+        "//xla/service:heap_simulator",
+    ],
+)
+
+cc_library(
+    name = "memory_space_assignment_utils",
+    srcs = ["memory_space_assignment_utils.cc"],
+    hdrs = ["memory_space_assignment_utils.h"],
+    visibility = ["//visibility:public"],
+    deps = [
+        "//xla/hlo/ir:hlo",
+        "//xla/service:heap_simulator",
+    ],
+)
+
+cc_library(
+    name = "memory_space_assignment_tuning_utils",
+    srcs = ["memory_space_assignment_tuning_utils.cc"],
+    hdrs = ["memory_space_assignment_tuning_utils.h"],
+    visibility = ["//visibility:public"],
+    deps = [
+        ":memory_space_assignment_utils",
+        "//xla/hlo/ir:hlo",
+        "//xla/service:heap_simulator",
+    ],
+)
+
+xla_cc_test(
+    name = "memory_space_assignment_best_fit_repacker_test",
+    srcs = ["memory_space_assignment_best_fit_repacker_test.cc"],
+    deps = [
+        ":memory_space_assignment_best_fit_repacker",
+        "//xla/tests:xla_internal_test_main",
+        "@local_tsl//tsl/platform:test",
+    ],
+)
diff --git a/third_party/xla/xla/service/memory_space_assignment.cc b/third_party/xla/xla/service/memory_space_assignment/memory_space_assignment.cc
similarity index 99%
rename from third_party/xla/xla/service/memory_space_assignment.cc
rename to third_party/xla/xla/service/memory_space_assignment/memory_space_assignment.cc
index 2a28441e9961d8..317162b005a125 100644
--- a/third_party/xla/xla/service/memory_space_assignment.cc
+++ b/third_party/xla/xla/service/memory_space_assignment/memory_space_assignment.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "xla/service/memory_space_assignment.h"
+#include "xla/service/memory_space_assignment/memory_space_assignment.h"
 
 #include <algorithm>
 #include <cmath>
@@ -46,9 +46,9 @@ limitations under the License.
 #include "xla/hlo/utils/hlo_live_range.h"
 #include "xla/service/heap_simulator.h"
 #include "xla/service/hlo_value.h"
-#include "xla/service/memory_space_assignment_repacking.h"
-#include "xla/service/memory_space_assignment_tuning_utils.h"
-#include "xla/service/memory_space_assignment_utils.h"
+#include "xla/service/memory_space_assignment/memory_space_assignment_repacking.h"
+#include "xla/service/memory_space_assignment/memory_space_assignment_tuning_utils.h"
+#include "xla/service/memory_space_assignment/memory_space_assignment_utils.h"
 #include "xla/service/tuple_util.h"
 #include "xla/shape.h"
 #include "xla/shape_util.h"
diff --git a/third_party/xla/xla/service/memory_space_assignment.h b/third_party/xla/xla/service/memory_space_assignment/memory_space_assignment.h
similarity index 99%
rename from third_party/xla/xla/service/memory_space_assignment.h
rename to third_party/xla/xla/service/memory_space_assignment/memory_space_assignment.h
index b39e14170e9ca8..e988d598332b6c 100644
--- a/third_party/xla/xla/service/memory_space_assignment.h
+++ b/third_party/xla/xla/service/memory_space_assignment/memory_space_assignment.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef XLA_SERVICE_MEMORY_SPACE_ASSIGNMENT_H_
-#define XLA_SERVICE_MEMORY_SPACE_ASSIGNMENT_H_
+#ifndef XLA_SERVICE_MEMORY_SPACE_ASSIGNMENT_MEMORY_SPACE_ASSIGNMENT_H_
+#define XLA_SERVICE_MEMORY_SPACE_ASSIGNMENT_MEMORY_SPACE_ASSIGNMENT_H_
 
 #include <cstdint>
 #include <functional>
@@ -42,8 +42,8 @@ limitations under the License.
 #include "xla/service/hlo.pb.h"
 #include "xla/service/hlo_cost_analysis.h"
 #include "xla/service/hlo_value.h"
-#include "xla/service/memory_space_assignment.pb.h"
-#include "xla/service/memory_space_assignment_repacking.h"
+#include "xla/service/memory_space_assignment/memory_space_assignment.pb.h"
+#include "xla/service/memory_space_assignment/memory_space_assignment_repacking.h"
 #include "xla/shape.h"
 #include "xla/statusor.h"
 
@@ -2667,4 +2667,4 @@ class AlternateMemoryBestFitHeap
 }  // namespace memory_space_assignment
 }  // namespace xla
 
-#endif  // XLA_SERVICE_MEMORY_SPACE_ASSIGNMENT_H_
+#endif  // XLA_SERVICE_MEMORY_SPACE_ASSIGNMENT_MEMORY_SPACE_ASSIGNMENT_H_
diff --git a/third_party/xla/xla/service/memory_space_assignment.proto b/third_party/xla/xla/service/memory_space_assignment/memory_space_assignment.proto
similarity index 100%
rename from third_party/xla/xla/service/memory_space_assignment.proto
rename to third_party/xla/xla/service/memory_space_assignment/memory_space_assignment.proto
diff --git a/third_party/xla/xla/service/memory_space_assignment_best_fit_repacker.cc b/third_party/xla/xla/service/memory_space_assignment/memory_space_assignment_best_fit_repacker.cc
similarity index 97%
rename from third_party/xla/xla/service/memory_space_assignment_best_fit_repacker.cc
rename to third_party/xla/xla/service/memory_space_assignment/memory_space_assignment_best_fit_repacker.cc
index c5354e2d6bbc8b..6eaf744f5a9e84 100644
--- a/third_party/xla/xla/service/memory_space_assignment_best_fit_repacker.cc
+++ b/third_party/xla/xla/service/memory_space_assignment/memory_space_assignment_best_fit_repacker.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "xla/service/memory_space_assignment_best_fit_repacker.h"
+#include "xla/service/memory_space_assignment/memory_space_assignment_best_fit_repacker.h"
 
 #include <algorithm>
 #include <functional>
diff --git a/third_party/xla/xla/service/memory_space_assignment_best_fit_repacker.h b/third_party/xla/xla/service/memory_space_assignment/memory_space_assignment_best_fit_repacker.h
similarity index 79%
rename from third_party/xla/xla/service/memory_space_assignment_best_fit_repacker.h
rename to third_party/xla/xla/service/memory_space_assignment/memory_space_assignment_best_fit_repacker.h
index 71742efd428701..220c19350b5925 100644
--- a/third_party/xla/xla/service/memory_space_assignment_best_fit_repacker.h
+++ b/third_party/xla/xla/service/memory_space_assignment/memory_space_assignment_best_fit_repacker.h
@@ -13,11 +13,11 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef XLA_SERVICE_MEMORY_SPACE_ASSIGNMENT_BEST_FIT_REPACKER_H_
-#define XLA_SERVICE_MEMORY_SPACE_ASSIGNMENT_BEST_FIT_REPACKER_H_
+#ifndef XLA_SERVICE_MEMORY_SPACE_ASSIGNMENT_MEMORY_SPACE_ASSIGNMENT_BEST_FIT_REPACKER_H_
+#define XLA_SERVICE_MEMORY_SPACE_ASSIGNMENT_MEMORY_SPACE_ASSIGNMENT_BEST_FIT_REPACKER_H_
 
 #include "xla/service/heap_simulator.h"
-#include "xla/service/memory_space_assignment_repacking.h"
+#include "xla/service/memory_space_assignment/memory_space_assignment_repacking.h"
 
 namespace xla {
 
@@ -41,4 +41,4 @@ class MemorySpaceAssignmentBestFitRepacker
 
 }  // namespace xla
 
-#endif  // XLA_SERVICE_MEMORY_SPACE_ASSIGNMENT_BEST_FIT_REPACKER_H_
+#endif  // XLA_SERVICE_MEMORY_SPACE_ASSIGNMENT_MEMORY_SPACE_ASSIGNMENT_BEST_FIT_REPACKER_H_
diff --git a/third_party/xla/xla/service/memory_space_assignment_best_fit_repacker_test.cc b/third_party/xla/xla/service/memory_space_assignment/memory_space_assignment_best_fit_repacker_test.cc
similarity index 97%
rename from third_party/xla/xla/service/memory_space_assignment_best_fit_repacker_test.cc
rename to third_party/xla/xla/service/memory_space_assignment/memory_space_assignment_best_fit_repacker_test.cc
index 60e79cdfeba059..52b87ffebb3c28 100644
--- a/third_party/xla/xla/service/memory_space_assignment_best_fit_repacker_test.cc
+++ b/third_party/xla/xla/service/memory_space_assignment/memory_space_assignment_best_fit_repacker_test.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "xla/service/memory_space_assignment_best_fit_repacker.h"
+#include "xla/service/memory_space_assignment/memory_space_assignment_best_fit_repacker.h"
 
 #include "tsl/platform/test.h"
 
diff --git a/third_party/xla/xla/service/memory_space_assignment_repacking.h b/third_party/xla/xla/service/memory_space_assignment/memory_space_assignment_repacking.h
similarity index 95%
rename from third_party/xla/xla/service/memory_space_assignment_repacking.h
rename to third_party/xla/xla/service/memory_space_assignment/memory_space_assignment_repacking.h
index a5d34e844bf35b..9d0be634ddda01 100644
--- a/third_party/xla/xla/service/memory_space_assignment_repacking.h
+++ b/third_party/xla/xla/service/memory_space_assignment/memory_space_assignment_repacking.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef XLA_SERVICE_MEMORY_SPACE_ASSIGNMENT_REPACKING_H_
-#define XLA_SERVICE_MEMORY_SPACE_ASSIGNMENT_REPACKING_H_
+#ifndef XLA_SERVICE_MEMORY_SPACE_ASSIGNMENT_MEMORY_SPACE_ASSIGNMENT_REPACKING_H_
+#define XLA_SERVICE_MEMORY_SPACE_ASSIGNMENT_MEMORY_SPACE_ASSIGNMENT_REPACKING_H_
 
 #include <cstdint>
 #include <optional>
@@ -138,4 +138,4 @@ class MemorySpaceAssignmentRepacker {
 
 }  // namespace xla
 
-#endif  // XLA_SERVICE_MEMORY_SPACE_ASSIGNMENT_REPACKING_H_
+#endif  // XLA_SERVICE_MEMORY_SPACE_ASSIGNMENT_MEMORY_SPACE_ASSIGNMENT_REPACKING_H_
diff --git a/third_party/xla/xla/service/memory_space_assignment_test.cc b/third_party/xla/xla/service/memory_space_assignment/memory_space_assignment_test.cc
similarity index 99%
rename from third_party/xla/xla/service/memory_space_assignment_test.cc
rename to third_party/xla/xla/service/memory_space_assignment/memory_space_assignment_test.cc
index ecb93020c529f5..4032ec52bc7f36 100644
--- a/third_party/xla/xla/service/memory_space_assignment_test.cc
+++ b/third_party/xla/xla/service/memory_space_assignment/memory_space_assignment_test.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "xla/service/memory_space_assignment.h"
+#include "xla/service/memory_space_assignment/memory_space_assignment.h"
 
 #include <algorithm>
 #include <cstdint>
@@ -53,8 +53,8 @@ limitations under the License.
 #include "xla/service/hlo_cost_analysis.h"
 #include "xla/service/hlo_value.h"
 #include "xla/service/instruction_hoister.h"
-#include "xla/service/memory_space_assignment.pb.h"
-#include "xla/service/memory_space_assignment_repacking.h"
+#include "xla/service/memory_space_assignment/memory_space_assignment.pb.h"
+#include "xla/service/memory_space_assignment/memory_space_assignment_repacking.h"
 #include "xla/shape.h"
 #include "xla/shape_util.h"
 #include "xla/status.h"
@@ -11678,8 +11678,8 @@ class MockRepacker : public MemorySpaceAssignmentRepacker {
 // - With repacking, we are able to prefetch p4.
 // - When repacking occurs, we expect p2 and p3 to have been allocated chunks.
 //   We are only proposing slices for f32[32, 16] and not f32[16,16]; thus, we
-//   expect slicing metdata to be attached to the repacking block for p2 but not
-//   p3.
+//   expect slicing metadata to be attached to the repacking block for p2 but
+//   not p3.
 // - We make the repacker assign the first slice (in time) of p2 the larger
 //   offset. After MSA, we check to make sure the fist slice is using the
 //   larger slicing parameters
diff --git a/third_party/xla/xla/service/memory_space_assignment_tuning_utils.cc b/third_party/xla/xla/service/memory_space_assignment/memory_space_assignment_tuning_utils.cc
similarity index 92%
rename from third_party/xla/xla/service/memory_space_assignment_tuning_utils.cc
rename to third_party/xla/xla/service/memory_space_assignment/memory_space_assignment_tuning_utils.cc
index 32db6becff9dba..2e48eaeb0f670f 100644
--- a/third_party/xla/xla/service/memory_space_assignment_tuning_utils.cc
+++ b/third_party/xla/xla/service/memory_space_assignment/memory_space_assignment_tuning_utils.cc
@@ -13,9 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "xla/service/memory_space_assignment_tuning_utils.h"
+#include "xla/service/memory_space_assignment/memory_space_assignment_tuning_utils.h"
 
-#include "xla/service/memory_space_assignment_utils.h"
+#include "xla/service/memory_space_assignment/memory_space_assignment_utils.h"
 namespace xla {
 
 namespace memory_space_assignment {
diff --git a/third_party/xla/xla/service/memory_space_assignment_tuning_utils.h b/third_party/xla/xla/service/memory_space_assignment/memory_space_assignment_tuning_utils.h
similarity index 83%
rename from third_party/xla/xla/service/memory_space_assignment_tuning_utils.h
rename to third_party/xla/xla/service/memory_space_assignment/memory_space_assignment_tuning_utils.h
index 749b4445e4be9a..ce6230982bd5b0 100644
--- a/third_party/xla/xla/service/memory_space_assignment_tuning_utils.h
+++ b/third_party/xla/xla/service/memory_space_assignment/memory_space_assignment_tuning_utils.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef XLA_SERVICE_MEMORY_SPACE_ASSIGNMENT_TUNING_UTILS_H_
-#define XLA_SERVICE_MEMORY_SPACE_ASSIGNMENT_TUNING_UTILS_H_
+#ifndef XLA_SERVICE_MEMORY_SPACE_ASSIGNMENT_MEMORY_SPACE_ASSIGNMENT_TUNING_UTILS_H_
+#define XLA_SERVICE_MEMORY_SPACE_ASSIGNMENT_MEMORY_SPACE_ASSIGNMENT_TUNING_UTILS_H_
 
 #include "xla/hlo/ir/hlo_module.h"
 #include "xla/service/heap_simulator.h"
@@ -35,4 +35,4 @@ void CustomizeSortedBufferInterval(
 }  // namespace memory_space_assignment
 }  // namespace xla
 
-#endif  // XLA_SERVICE_MEMORY_SPACE_ASSIGNMENT_TUNING_UTILS_H_
+#endif  // XLA_SERVICE_MEMORY_SPACE_ASSIGNMENT_MEMORY_SPACE_ASSIGNMENT_TUNING_UTILS_H_
diff --git a/third_party/xla/xla/service/memory_space_assignment_utils.cc b/third_party/xla/xla/service/memory_space_assignment/memory_space_assignment_utils.cc
similarity index 97%
rename from third_party/xla/xla/service/memory_space_assignment_utils.cc
rename to third_party/xla/xla/service/memory_space_assignment/memory_space_assignment_utils.cc
index c9e84955c19bc9..03f1621dfbaea9 100644
--- a/third_party/xla/xla/service/memory_space_assignment_utils.cc
+++ b/third_party/xla/xla/service/memory_space_assignment/memory_space_assignment_utils.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "xla/service/memory_space_assignment_utils.h"
+#include "xla/service/memory_space_assignment/memory_space_assignment_utils.h"
 
 #include "xla/hlo/ir/hlo_casting_utils.h"
 #include "xla/hlo/ir/hlo_instructions.h"
diff --git a/third_party/xla/xla/service/memory_space_assignment_utils.h b/third_party/xla/xla/service/memory_space_assignment/memory_space_assignment_utils.h
similarity index 84%
rename from third_party/xla/xla/service/memory_space_assignment_utils.h
rename to third_party/xla/xla/service/memory_space_assignment/memory_space_assignment_utils.h
index 10b486983ee253..e4cf6f45b51517 100644
--- a/third_party/xla/xla/service/memory_space_assignment_utils.h
+++ b/third_party/xla/xla/service/memory_space_assignment/memory_space_assignment_utils.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef XLA_SERVICE_MEMORY_SPACE_ASSIGNMENT_UTILS_H_
-#define XLA_SERVICE_MEMORY_SPACE_ASSIGNMENT_UTILS_H_
+#ifndef XLA_SERVICE_MEMORY_SPACE_ASSIGNMENT_MEMORY_SPACE_ASSIGNMENT_UTILS_H_
+#define XLA_SERVICE_MEMORY_SPACE_ASSIGNMENT_MEMORY_SPACE_ASSIGNMENT_UTILS_H_
 
 #include "xla/service/heap_simulator.h"
 
@@ -35,4 +35,4 @@ class MemorySpaceAssignmentUtils {
 
 }  // namespace xla
 
-#endif  // XLA_SERVICE_MEMORY_SPACE_ASSIGNMENT_UTILS_H_
+#endif  // XLA_SERVICE_MEMORY_SPACE_ASSIGNMENT_MEMORY_SPACE_ASSIGNMENT_UTILS_H_
diff --git a/third_party/xla/xla/xla.bzl b/third_party/xla/xla/xla.bzl
index 79a9ea075a7080..0cfaa67192e898 100644
--- a/third_party/xla/xla/xla.bzl
+++ b/third_party/xla/xla/xla.bzl
@@ -59,7 +59,7 @@ def xla_cc_binary(deps = None, copts = tsl_copts(), **kwargs):
         "//xla:xla_proto_cc_impl",
         "//xla:xla_data_proto_cc_impl",
         "//xla/service:hlo_proto_cc_impl",
-        "//xla/service:memory_space_assignment_proto_cc_impl",
+        "//xla/service/memory_space_assignment:memory_space_assignment_proto_cc_impl",
         "//xla/service/gpu:backend_configs_cc_impl",
         "//xla/service/gpu:hlo_op_profile_proto_cc_impl",
         "//xla/stream_executor:dnn_proto_cc_impl",
@@ -91,7 +91,7 @@ def xla_cc_test(
                        clean_dep("//xla:xla_proto_cc_impl"),
                        clean_dep("//xla:xla_data_proto_cc_impl"),
                        clean_dep("//xla/service:hlo_proto_cc_impl"),
-                       clean_dep("//xla/service:memory_space_assignment_proto_cc_impl"),
+                       clean_dep("//xla/service/memory_space_assignment:memory_space_assignment_proto_cc_impl"),
                        clean_dep("//xla/service/gpu:backend_configs_cc_impl"),
                        clean_dep("//xla/service/gpu:hlo_op_profile_proto_cc_impl"),
                        clean_dep("//xla/stream_executor:dnn_proto_cc_impl"),

From 44378dfdc0b10dea9a44c86b3d7210a83c304419 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 20 Sep 2023 17:07:13 -0700
Subject: [PATCH 065/567] Allow `std::nullopt` in `xla::ifrt::MemoryKind`
 constructor.

PiperOrigin-RevId: 567123142
---
 third_party/xla/xla/python/ifrt/memory.cc      | 3 +++
 third_party/xla/xla/python/ifrt/memory_test.cc | 7 +++++++
 2 files changed, 10 insertions(+)

diff --git a/third_party/xla/xla/python/ifrt/memory.cc b/third_party/xla/xla/python/ifrt/memory.cc
index 05821e488f61f6..27a7ad8d6d551e 100644
--- a/third_party/xla/xla/python/ifrt/memory.cc
+++ b/third_party/xla/xla/python/ifrt/memory.cc
@@ -39,6 +39,9 @@ struct MemoryKindsSet {
 
 MemoryKind::MemoryKind(std::optional<absl::string_view> memory_kind) {
   static auto* const global_set = new MemoryKindsSet();
+  if (!memory_kind.has_value()) {
+    return;
+  }
   absl::MutexLock lock(&global_set->mu);
   auto it = global_set->memory_kinds_set.find(*memory_kind);
   if (it == global_set->memory_kinds_set.end()) {
diff --git a/third_party/xla/xla/python/ifrt/memory_test.cc b/third_party/xla/xla/python/ifrt/memory_test.cc
index 0e7e291bd9f945..bb223471c44776 100644
--- a/third_party/xla/xla/python/ifrt/memory_test.cc
+++ b/third_party/xla/xla/python/ifrt/memory_test.cc
@@ -16,6 +16,7 @@ limitations under the License.
 #include "xla/python/ifrt/memory.h"
 
 #include <memory>
+#include <optional>
 #include <string>
 
 #include <gmock/gmock.h>
@@ -72,6 +73,12 @@ TEST(MemoryKindTest, MemorySafety) {
   EXPECT_THAT(memory_kind.memory_kind(), Optional(absl::string_view("abc")));
 }
 
+TEST(MemoryKindTest, EqualityForUnspecifiedAndNullopt) {
+  MemoryKind memory_kind1;
+  MemoryKind memory_kind2(std::nullopt);
+  EXPECT_EQ(memory_kind1, memory_kind2);
+}
+
 }  // namespace
 }  // namespace ifrt
 }  // namespace xla

From e6da72d8c1c7969e409488b843fe66cdf85e341f Mon Sep 17 00:00:00 2001
From: Edward Schwartz <schwartzedward@google.com>
Date: Wed, 20 Sep 2023 17:54:35 -0700
Subject: [PATCH 066/567] 1) Fix: * test_dense_input_ragged_weights_fails, *
 test_sparse_input_ragged_weights_fails.

sparse_ops.sparse_bincount is intended to handle inputs of dense tensors (including non-TensorFlow types that can be converted to dense tensors) and sparse tensors only. Ragged tensor inputs are expected to use other software via dispatch. `weights` is not considered by dispatch. Previously, if `weights` was not a sparse tensor, it was assumed to be something that can be converted to a dense tensor (e.g. a numpy array or Python list), but this conversion fails for ragged tensors. This CL converts `weights` only if it is not any type of composite tensor, allowing error checking to detect any mismatch in the type of tensor between the input and `weights`.

2) Enable running ragged bincount tests.

PiperOrigin-RevId: 567132514
---
 tensorflow/python/ops/BUILD                              | 1 +
 tensorflow/python/ops/ragged/BUILD                       | 1 +
 tensorflow/python/ops/ragged/ragged_bincount_ops_test.py | 4 ++++
 tensorflow/python/ops/sparse_ops.py                      | 7 ++++++-
 4 files changed, 12 insertions(+), 1 deletion(-)

diff --git a/tensorflow/python/ops/BUILD b/tensorflow/python/ops/BUILD
index 36a41014cba5ca..00679cbf1d36ec 100644
--- a/tensorflow/python/ops/BUILD
+++ b/tensorflow/python/ops/BUILD
@@ -2601,6 +2601,7 @@ py_strict_library(
         ":math_ops_gen",
         ":sparse_ops_gen",
         ":special_math_ops",
+        "//tensorflow/python/framework:composite_tensor",
         "//tensorflow/python/framework:constant_op",
         "//tensorflow/python/framework:dtypes",
         "//tensorflow/python/framework:ops",
diff --git a/tensorflow/python/ops/ragged/BUILD b/tensorflow/python/ops/ragged/BUILD
index 27b6f311519c82..ccb6c9dcd63826 100644
--- a/tensorflow/python/ops/ragged/BUILD
+++ b/tensorflow/python/ops/ragged/BUILD
@@ -165,6 +165,7 @@ cuda_py_strict_test(
         "//tensorflow/python/framework:test_lib",
         "//tensorflow/python/ops:bincount_ops",
         "//tensorflow/python/ops:sparse_ops",
+        "//tensorflow/python/platform:client_testlib",
         "@absl_py//absl/testing:parameterized",
     ],
 )
diff --git a/tensorflow/python/ops/ragged/ragged_bincount_ops_test.py b/tensorflow/python/ops/ragged/ragged_bincount_ops_test.py
index ec7fa539c83a75..bda3dc32c6eedd 100644
--- a/tensorflow/python/ops/ragged/ragged_bincount_ops_test.py
+++ b/tensorflow/python/ops/ragged/ragged_bincount_ops_test.py
@@ -24,6 +24,7 @@
 from tensorflow.python.ops import sparse_ops
 from tensorflow.python.ops.ragged import ragged_factory_ops
 from tensorflow.python.ops.ragged import ragged_tensor
+from tensorflow.python.platform import test
 
 
 def _ragged_factory(x):
@@ -551,3 +552,6 @@ def test_ragged_input_different_shape_fails(self):
     with self.assertRaisesRegex(errors.InvalidArgumentError,
                                 "must have the same row splits"):
       self.evaluate(sparse_ops.sparse_bincount(x, weights=weights, axis=-1))
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/ops/sparse_ops.py b/tensorflow/python/ops/sparse_ops.py
index db11f005f94d39..82cee2d96e0866 100644
--- a/tensorflow/python/ops/sparse_ops.py
+++ b/tensorflow/python/ops/sparse_ops.py
@@ -25,6 +25,7 @@
 
 import numpy as np
 
+from tensorflow.python.framework import composite_tensor
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
@@ -3351,7 +3352,11 @@ def sparse_bincount(values,
       values = tensor_conversion.convert_to_tensor_v2_with_dispatch(
           values, name="values")
     if weights is not None:
-      if not isinstance(weights, sparse_tensor.SparseTensor):
+      # Note that `weights` is not used for dispatch and if there is a type
+      # mismatch between `values` and `weights`, `weights` can be a RaggedTensor
+      # (or potentially some other kind of CompositeTensor) where conversion
+      # to a dense tensor fails.
+      if not isinstance(weights, composite_tensor.CompositeTensor):
         weights = tensor_conversion.convert_to_tensor_v2_with_dispatch(
             weights, name="weights")
 

From 21249b3f3a157d38b25ae954a13b0ef3057e06e6 Mon Sep 17 00:00:00 2001
From: Eugene Zhulenev <ezhulenev@google.com>
Date: Wed, 20 Sep 2023 18:04:37 -0700
Subject: [PATCH 067/567] [stream_executor] NFC: Restrict visibility of
 stream_executor_internal target

https://github.com/openxla/xla/issues/5761

PiperOrigin-RevId: 567134621
---
 third_party/xla/xla/stream_executor/BUILD     | 65 +++++++++++--------
 .../xla/xla/stream_executor/build_defs.bzl    |  3 +
 .../stream_executor_internal.h                | 17 ++---
 3 files changed, 47 insertions(+), 38 deletions(-)

diff --git a/third_party/xla/xla/stream_executor/BUILD b/third_party/xla/xla/stream_executor/BUILD
index 14348bd2a6e9e5..f4b13f734c8301 100644
--- a/third_party/xla/xla/stream_executor/BUILD
+++ b/third_party/xla/xla/stream_executor/BUILD
@@ -5,7 +5,7 @@
 # do not link against restricted binary blobs.
 
 load("//xla:xla.bzl", "xla_cc_test")
-load("//xla/stream_executor:build_defs.bzl", "stream_executor_friends")
+load("//xla/stream_executor:build_defs.bzl", "stream_executor_friends", "stream_executor_internal")
 load("@local_tsl//tsl:tsl.bzl", "set_external_visibility", "transitive_hdrs")
 load("@local_tsl//tsl:tsl.default.bzl", "filegroup")
 load("@local_tsl//tsl/platform:build_config.bzl", "tf_proto_library")
@@ -17,15 +17,51 @@ package(
     licenses = ["notice"],
 )
 
+# StreamExecutor clients that can depend on targets defined in stream_executor package.
 package_group(
     name = "friends",
     packages = stream_executor_friends(),
 )
 
+# StreamExecutor platform-dependent implementations. We restrict visibility of all internal
+# implementation interfaces to internal users (everything in `stream_executor::internal` namespace).
+package_group(
+    name = "internal",
+    packages = stream_executor_internal(),
+)
+
+#===--------------------------------------------------------------------------------------------===#
+# StreamExecutor platform-dependent implementation details
+#===--------------------------------------------------------------------------------------------===#
+
+# Only platform-dependent StreamExecutor implementations (e.g. StreamExecutor for GPUs) and targets
+# defined by StreamExecutor itself (e.g. `event`, `kernel`, etc.) can depend on internal
+# implementation details (interfaces that define platform-specific API).
+#
+# External clients of StreamExecutor should depend on `stream_executor` target (links StreamExecutor
+# implementation in static build configuration), or a header only `stream_executor_headers`.
+
+cc_library(
+    name = "stream_executor_internal",
+    hdrs = ["stream_executor_internal.h"],
+    visibility = ["//visibility:public"],
+    deps = [
+        ":stream_executor_headers",
+        "//xla/stream_executor/platform",
+        "@com_google_absl//absl/functional:any_invocable",
+        "@com_google_absl//absl/status",
+        "@local_tsl//tsl/platform:status",
+        "@local_tsl//tsl/platform:statusor",
+    ],
+)
+
+#===--------------------------------------------------------------------------------------------===#
+
 # The stream_executor_headers target does not prescribe an implementation.
 cc_library(
     name = "stream_executor_headers",
     textual_hdrs = [
+        "allocator_stats.h",
         "blas.h",
         "command_buffer.h",
         "device_description.h",
@@ -368,33 +404,6 @@ cc_library(
     ],
 )
 
-cc_library(
-    name = "stream_executor_internal",
-    hdrs = [
-        "stream_executor_internal.h",
-    ],
-    visibility = ["//visibility:public"],
-    deps = [
-        ":allocator_stats",
-        ":device_description",
-        ":device_memory",
-        ":device_options",
-        ":kernel",
-        ":kernel_cache_config",
-        ":kernel_spec",
-        ":launch_dim",
-        ":module_spec",
-        ":plugin_registry",
-        "//xla/stream_executor/platform",
-        "@com_google_absl//absl/functional:any_invocable",
-        "@com_google_absl//absl/status",
-        "@com_google_absl//absl/types:optional",
-        "@local_tsl//tsl/platform:errors",
-        "@local_tsl//tsl/platform:status",
-        "@local_tsl//tsl/platform:statusor",
-    ],
-)
-
 cc_library(
     name = "stream_executor_pimpl_header",
     hdrs = [
diff --git a/third_party/xla/xla/stream_executor/build_defs.bzl b/third_party/xla/xla/stream_executor/build_defs.bzl
index c76ff0bdea1074..8e75a55324a063 100644
--- a/third_party/xla/xla/stream_executor/build_defs.bzl
+++ b/third_party/xla/xla/stream_executor/build_defs.bzl
@@ -4,6 +4,9 @@ load("@local_config_rocm//rocm:build_defs.bzl", "if_rocm_is_configured")
 def stream_executor_friends():
     return ["//..."]
 
+def stream_executor_internal():
+    return ["//..."]
+
 def tf_additional_cuda_platform_deps():
     return []
 
diff --git a/third_party/xla/xla/stream_executor/stream_executor_internal.h b/third_party/xla/xla/stream_executor/stream_executor_internal.h
index afb32653eaa222..9acd28dc84deb5 100644
--- a/third_party/xla/xla/stream_executor/stream_executor_internal.h
+++ b/third_party/xla/xla/stream_executor/stream_executor_internal.h
@@ -31,7 +31,6 @@ limitations under the License.
 
 #include "absl/functional/any_invocable.h"
 #include "absl/status/status.h"
-#include "absl/types/optional.h"
 #include "xla/stream_executor/allocator_stats.h"
 #include "xla/stream_executor/device_description.h"
 #include "xla/stream_executor/device_memory.h"
@@ -46,7 +45,6 @@ limitations under the License.
 #include "xla/stream_executor/platform/port.h"
 #include "xla/stream_executor/plugin_registry.h"
 #include "xla/stream_executor/trace_listener.h"
-#include "tsl/platform/errors.h"
 #include "tsl/platform/status.h"
 #include "tsl/platform/statusor.h"
 
@@ -186,21 +184,21 @@ class StreamExecutorInterface {
 
   virtual tsl::Status GetKernel(const MultiKernelLoaderSpec& spec,
                                 KernelBase* kernel) {
-    return tsl::errors::Unimplemented("Not Implemented");
+    return absl::UnimplementedError("Not Implemented");
   }
   virtual bool UnloadModule(ModuleHandle module_handle) { return false; }
   virtual tsl::Status LoadModule(const MultiModuleLoaderSpec& spec,
                                  ModuleHandle* module_handle) {
-    return tsl::errors::Unimplemented("Not Implemented");
+    return absl::UnimplementedError("Not Implemented");
   }
   virtual tsl::StatusOr<std::shared_ptr<DeviceMemoryBase>>
   CreateOrShareConstant(Stream* stream, const std::vector<uint8_t>& content) {
-    return tsl::errors::Unimplemented("Not Implemented");
+    return absl::UnimplementedError("Not Implemented");
   }
   virtual tsl::Status Launch(Stream* stream, const ThreadDim& thread_dims,
                              const BlockDim& block_dims, const KernelBase& k,
                              const KernelArgsArrayBase& args) {
-    return tsl::errors::Unimplemented("Not Implemented");
+    return absl::UnimplementedError("Not Implemented");
   }
 
   // Releases any state associated with the kernel.
@@ -262,8 +260,7 @@ class StreamExecutorInterface {
   virtual tsl::Status WaitForEvent(Stream* stream, Event* event) = 0;
   virtual tsl::Status WaitForEventOnExternalStream(std::intptr_t stream,
                                                    Event* event) {
-    return tsl::Status(
-        absl::StatusCode::kUnimplemented,
+    return absl::UnimplementedError(
         "WaitForEventOnExternalStream not supported on this executor.");
   }
   virtual Event::Status PollForEventStatus(Event* event) = 0;
@@ -272,8 +269,8 @@ class StreamExecutorInterface {
   virtual bool CreateStreamDependency(Stream* dependent, Stream* other) = 0;
   virtual tsl::Status BlockHostUntilDone(Stream* stream) = 0;
   virtual tsl::Status GetStatus(Stream* stream) {
-    return tsl::Status(absl::StatusCode::kUnimplemented,
-                       "GetStatus is not supported on this executor.");
+    return absl::UnimplementedError(
+        "GetStatus is not supported on this executor.");
   }
   virtual tsl::Status EnablePeerAccessTo(StreamExecutorInterface* other) = 0;
   virtual bool CanEnablePeerAccessTo(StreamExecutorInterface* other) = 0;

From cb83963df2ed9192a596c234eb4ac7f18879477b Mon Sep 17 00:00:00 2001
From: Fangrui Song <maskray@google.com>
Date: Wed, 20 Sep 2023 18:36:01 -0700
Subject: [PATCH 068/567] Integrate LLVM at llvm/llvm-project@afd7db48c55c

Updates LLVM usage to match
[afd7db48c55c](https://github.com/llvm/llvm-project/commit/afd7db48c55c)

PiperOrigin-RevId: 567141554
---
 third_party/llvm/generated.patch | 1668 ++++++++++++++++++++++++++++++
 third_party/llvm/workspace.bzl   |    4 +-
 2 files changed, 1670 insertions(+), 2 deletions(-)

diff --git a/third_party/llvm/generated.patch b/third_party/llvm/generated.patch
index 509398da979e83..450540fcebd5c7 100644
--- a/third_party/llvm/generated.patch
+++ b/third_party/llvm/generated.patch
@@ -1 +1,1669 @@
 Auto generated patch. Do not edit or delete it, even if empty.
+diff -ruN --strip-trailing-cr a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
+--- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
++++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
+@@ -271,10 +271,7 @@
+   bool tryToSimplifyUncondBranchWithICmpInIt(ICmpInst *ICI,
+                                              IRBuilder<> &Builder);
+ 
+-  bool hoistCommonCodeFromSuccessors(BasicBlock *BB, bool EqTermsOnly);
+-  bool hoistSuccIdenticalTerminatorToSwitchOrIf(
+-      Instruction *TI, Instruction *I1,
+-      SmallVectorImpl<Instruction *> &OtherSuccTIs);
++  bool HoistThenElseCodeToIf(BranchInst *BI, bool EqTermsOnly);
+   bool SpeculativelyExecuteBB(BranchInst *BI, BasicBlock *ThenBB);
+   bool SimplifyTerminatorOnSelect(Instruction *OldTerm, Value *Cond,
+                                   BasicBlock *TrueBB, BasicBlock *FalseBB,
+@@ -1411,9 +1408,8 @@
+ }
+ 
+ // If we would need to insert a select that uses the value of this invoke
+-// (comments in hoistSuccIdenticalTerminatorToSwitchOrIf explain why we would
+-// need to do this), we can't hoist the invoke, as there is nowhere to put the
+-// select in this case.
++// (comments in HoistThenElseCodeToIf explain why we would need to do this), we
++// can't hoist the invoke, as there is nowhere to put the select in this case.
+ static bool isSafeToHoistInvoke(BasicBlock *BB1, BasicBlock *BB2,
+                                 Instruction *I1, Instruction *I2) {
+   for (BasicBlock *Succ : successors(BB1)) {
+@@ -1428,9 +1424,9 @@
+   return true;
+ }
+ 
+-// Get interesting characteristics of instructions that
+-// `hoistCommonCodeFromSuccessors` didn't hoist. They restrict what kind of
+-// instructions can be reordered across.
++// Get interesting characteristics of instructions that `HoistThenElseCodeToIf`
++// didn't hoist. They restrict what kind of instructions can be reordered
++// across.
+ enum SkipFlags {
+   SkipReadMem = 1,
+   SkipSideEffect = 2,
+@@ -1488,7 +1484,7 @@
+ 
+ static bool passingValueIsAlwaysUndefined(Value *V, Instruction *I, bool PtrValueMayBeModified = false);
+ 
+-/// Helper function for hoistCommonCodeFromSuccessors. Return true if identical
++/// Helper function for HoistThenElseCodeToIf. Return true if identical
+ /// instructions \p I1 and \p I2 can and should be hoisted.
+ static bool shouldHoistCommonInstructions(Instruction *I1, Instruction *I2,
+                                           const TargetTransformInfo &TTI) {
+@@ -1519,51 +1515,62 @@
+   return true;
+ }
+ 
+-/// Hoist any common code in the successor blocks up into the block. This
+-/// function guarantees that BB dominates all successors. If EqTermsOnly is
+-/// given, only perform hoisting in case both blocks only contain a terminator.
+-/// In that case, only the original BI will be replaced and selects for PHIs are
+-/// added.
+-bool SimplifyCFGOpt::hoistCommonCodeFromSuccessors(BasicBlock *BB,
+-                                                   bool EqTermsOnly) {
++/// Given a conditional branch that goes to BB1 and BB2, hoist any common code
++/// in the two blocks up into the branch block. The caller of this function
++/// guarantees that BI's block dominates BB1 and BB2. If EqTermsOnly is given,
++/// only perform hoisting in case both blocks only contain a terminator. In that
++/// case, only the original BI will be replaced and selects for PHIs are added.
++bool SimplifyCFGOpt::HoistThenElseCodeToIf(BranchInst *BI, bool EqTermsOnly) {
+   // This does very trivial matching, with limited scanning, to find identical
+-  // instructions in the two blocks. In particular, we don't want to get into
+-  // O(N1*N2*...) situations here where Ni are the sizes of these successors. As
++  // instructions in the two blocks.  In particular, we don't want to get into
++  // O(M*N) situations here where M and N are the sizes of BB1 and BB2.  As
+   // such, we currently just scan for obviously identical instructions in an
+   // identical order, possibly separated by the same number of non-identical
+   // instructions.
+-  unsigned int SuccSize = succ_size(BB);
+-  if (SuccSize < 2)
+-    return false;
++  BasicBlock *BB1 = BI->getSuccessor(0); // The true destination.
++  BasicBlock *BB2 = BI->getSuccessor(1); // The false destination
+ 
+   // If either of the blocks has it's address taken, then we can't do this fold,
+   // because the code we'd hoist would no longer run when we jump into the block
+   // by it's address.
+-  for (auto *Succ : successors(BB))
+-    if (Succ->hasAddressTaken() || !Succ->getSinglePredecessor())
+-      return false;
++  if (BB1->hasAddressTaken() || BB2->hasAddressTaken())
++    return false;
+ 
+-  auto *TI = BB->getTerminator();
++  BasicBlock::iterator BB1_Itr = BB1->begin();
++  BasicBlock::iterator BB2_Itr = BB2->begin();
+ 
+-  // The second of pair is a SkipFlags bitmask.
+-  using SuccIterPair = std::pair<BasicBlock::iterator, unsigned>;
+-  SmallVector<SuccIterPair, 8> SuccIterPairs;
+-  for (auto *Succ : successors(BB)) {
+-    BasicBlock::iterator SuccItr = Succ->begin();
+-    if (isa<PHINode>(*SuccItr))
+-      return false;
+-    SuccIterPairs.push_back(SuccIterPair(SuccItr, 0));
++  Instruction *I1 = &*BB1_Itr++, *I2 = &*BB2_Itr++;
++  // Skip debug info if it is not identical.
++  DbgInfoIntrinsic *DBI1 = dyn_cast<DbgInfoIntrinsic>(I1);
++  DbgInfoIntrinsic *DBI2 = dyn_cast<DbgInfoIntrinsic>(I2);
++  if (!DBI1 || !DBI2 || !DBI1->isIdenticalToWhenDefined(DBI2)) {
++    while (isa<DbgInfoIntrinsic>(I1))
++      I1 = &*BB1_Itr++;
++    while (isa<DbgInfoIntrinsic>(I2))
++      I2 = &*BB2_Itr++;
+   }
++  if (isa<PHINode>(I1))
++    return false;
++
++  BasicBlock *BIParent = BI->getParent();
++
++  bool Changed = false;
++
++  auto _ = make_scope_exit([&]() {
++    if (Changed)
++      ++NumHoistCommonCode;
++  });
+ 
+   // Check if only hoisting terminators is allowed. This does not add new
+   // instructions to the hoist location.
+   if (EqTermsOnly) {
+     // Skip any debug intrinsics, as they are free to hoist.
+-    for (auto &SuccIter : make_first_range(SuccIterPairs)) {
+-      auto *INonDbg = &*skipDebugIntrinsics(SuccIter);
+-      if (!INonDbg->isTerminator())
+-        return false;
+-    }
++    auto *I1NonDbg = &*skipDebugIntrinsics(I1->getIterator());
++    auto *I2NonDbg = &*skipDebugIntrinsics(I2->getIterator());
++    if (!I1NonDbg->isIdenticalToWhenDefined(I2NonDbg))
++      return false;
++    if (!I1NonDbg->isTerminator())
++      return false;
+     // Now we know that we only need to hoist debug intrinsics and the
+     // terminator. Let the loop below handle those 2 cases.
+   }
+@@ -1572,234 +1579,154 @@
+   // many instructions we skip, serving as a compilation time control as well as
+   // preventing excessive increase of life ranges.
+   unsigned NumSkipped = 0;
+-  // If we find an unreachable instruction at the beginning of a basic block, we
+-  // can still hoist instructions from the rest of the basic blocks.
+-  if (SuccIterPairs.size() > 2) {
+-    erase_if(SuccIterPairs,
+-             [](const auto &Pair) { return isa<UnreachableInst>(Pair.first); });
+-    if (SuccIterPairs.size() < 2)
+-      return false;
+-  }
+ 
+-  bool Changed = false;
++  // Record any skipped instuctions that may read memory, write memory or have
++  // side effects, or have implicit control flow.
++  unsigned SkipFlagsBB1 = 0;
++  unsigned SkipFlagsBB2 = 0;
+ 
+   for (;;) {
+-    auto *SuccIterPairBegin = SuccIterPairs.begin();
+-    auto &BB1ItrPair = *SuccIterPairBegin++;
+-    auto OtherSuccIterPairRange =
+-        iterator_range(SuccIterPairBegin, SuccIterPairs.end());
+-    auto OtherSuccIterRange = make_first_range(OtherSuccIterPairRange);
+-
+-    Instruction *I1 = &*BB1ItrPair.first;
+-    auto *BB1 = I1->getParent();
+-
+-    // Skip debug info if it is not identical.
+-    bool AllDbgInstsAreIdentical = all_of(OtherSuccIterRange, [I1](auto &Iter) {
+-      Instruction *I2 = &*Iter;
+-      return I1->isIdenticalToWhenDefined(I2);
+-    });
+-    if (!AllDbgInstsAreIdentical) {
+-      while (isa<DbgInfoIntrinsic>(I1))
+-        I1 = &*++BB1ItrPair.first;
+-      for (auto &SuccIter : OtherSuccIterRange) {
+-        Instruction *I2 = &*SuccIter;
+-        while (isa<DbgInfoIntrinsic>(I2))
+-          I2 = &*++SuccIter;
+-      }
+-    }
+-
+-    bool AllInstsAreIdentical = true;
+-    bool HasTerminator = I1->isTerminator();
+-    for (auto &SuccIter : OtherSuccIterRange) {
+-      Instruction *I2 = &*SuccIter;
+-      HasTerminator |= I2->isTerminator();
+-      if (AllInstsAreIdentical && !I1->isIdenticalToWhenDefined(I2))
+-        AllInstsAreIdentical = false;
+-    }
+-
+     // If we are hoisting the terminator instruction, don't move one (making a
+     // broken BB), instead clone it, and remove BI.
+-    if (HasTerminator) {
++    if (I1->isTerminator() || I2->isTerminator()) {
+       // If any instructions remain in the block, we cannot hoist terminators.
+-      if (NumSkipped || SuccSize != SuccIterPairs.size() ||
+-          !AllInstsAreIdentical)
++      if (NumSkipped || !I1->isIdenticalToWhenDefined(I2))
+         return Changed;
+-      SmallVector<Instruction *, 8> Insts;
+-      for (auto &SuccIter : OtherSuccIterRange)
+-        Insts.push_back(&*SuccIter);
+-      return hoistSuccIdenticalTerminatorToSwitchOrIf(TI, I1, Insts) || Changed;
+-    }
+-
+-    if (AllInstsAreIdentical) {
+-      unsigned SkipFlagsBB1 = BB1ItrPair.second;
+-      AllInstsAreIdentical =
+-          isSafeToHoistInstr(I1, SkipFlagsBB1) &&
+-          all_of(OtherSuccIterPairRange, [=](const auto &Pair) {
+-            Instruction *I2 = &*Pair.first;
+-            unsigned SkipFlagsBB2 = Pair.second;
+-            // Even if the instructions are identical, it may not
+-            // be safe to hoist them if we have skipped over
+-            // instructions with side effects or their operands
+-            // weren't hoisted.
+-            return isSafeToHoistInstr(I2, SkipFlagsBB2) &&
+-                   shouldHoistCommonInstructions(I1, I2, TTI);
+-          });
+-    }
+-
+-    if (AllInstsAreIdentical) {
+-      BB1ItrPair.first++;
+-      if (isa<DbgInfoIntrinsic>(I1)) {
++      goto HoistTerminator;
++    }
++
++    if (I1->isIdenticalToWhenDefined(I2) &&
++        // Even if the instructions are identical, it may not be safe to hoist
++        // them if we have skipped over instructions with side effects or their
++        // operands weren't hoisted.
++        isSafeToHoistInstr(I1, SkipFlagsBB1) &&
++        isSafeToHoistInstr(I2, SkipFlagsBB2) &&
++        shouldHoistCommonInstructions(I1, I2, TTI)) {
++      if (isa<DbgInfoIntrinsic>(I1) || isa<DbgInfoIntrinsic>(I2)) {
++        assert(isa<DbgInfoIntrinsic>(I1) && isa<DbgInfoIntrinsic>(I2));
+         // The debug location is an integral part of a debug info intrinsic
+         // and can't be separated from it or replaced.  Instead of attempting
+         // to merge locations, simply hoist both copies of the intrinsic.
+-        I1->moveBeforePreserving(TI);
+-        for (auto &SuccIter : OtherSuccIterRange) {
+-          auto *I2 = &*SuccIter++;
+-          assert(isa<DbgInfoIntrinsic>(I2));
+-          I2->moveBeforePreserving(TI);
+-        }
++        I1->moveBeforePreserving(BI);
++        I2->moveBeforePreserving(BI);
++        Changed = true;
+       } else {
+         // For a normal instruction, we just move one to right before the
+         // branch, then replace all uses of the other with the first.  Finally,
+         // we remove the now redundant second instruction.
+-        I1->moveBeforePreserving(TI);
+-        BB->splice(TI->getIterator(), BB1, I1->getIterator());
+-        for (auto &SuccIter : OtherSuccIterRange) {
+-          Instruction *I2 = &*SuccIter++;
+-          assert(I2 != I1);
+-          if (!I2->use_empty())
+-            I2->replaceAllUsesWith(I1);
+-          I1->andIRFlags(I2);
+-          combineMetadataForCSE(I1, I2, true);
+-          // I1 and I2 are being combined into a single instruction.  Its debug
+-          // location is the merged locations of the original instructions.
+-          I1->applyMergedLocation(I1->getDebugLoc(), I2->getDebugLoc());
+-          I2->eraseFromParent();
+-        }
++        I1->moveBeforePreserving(BI);
++        if (!I2->use_empty())
++          I2->replaceAllUsesWith(I1);
++        I1->andIRFlags(I2);
++        combineMetadataForCSE(I1, I2, true);
++
++        // I1 and I2 are being combined into a single instruction.  Its debug
++        // location is the merged locations of the original instructions.
++        I1->applyMergedLocation(I1->getDebugLoc(), I2->getDebugLoc());
++
++        I2->eraseFromParent();
+       }
+-      if (!Changed)
+-        NumHoistCommonCode += SuccIterPairs.size();
+       Changed = true;
+-      NumHoistCommonInstrs += SuccIterPairs.size();
++      ++NumHoistCommonInstrs;
+     } else {
+       if (NumSkipped >= HoistCommonSkipLimit)
+         return Changed;
+       // We are about to skip over a pair of non-identical instructions. Record
+       // if any have characteristics that would prevent reordering instructions
+       // across them.
+-      for (auto &SuccIterPair : SuccIterPairs) {
+-        Instruction *I = &*SuccIterPair.first++;
+-        SuccIterPair.second |= skippedInstrFlags(I);
+-      }
++      SkipFlagsBB1 |= skippedInstrFlags(I1);
++      SkipFlagsBB2 |= skippedInstrFlags(I2);
+       ++NumSkipped;
+     }
+-  }
+-}
+-
+-bool SimplifyCFGOpt::hoistSuccIdenticalTerminatorToSwitchOrIf(
+-    Instruction *TI, Instruction *I1,
+-    SmallVectorImpl<Instruction *> &OtherSuccTIs) {
+-
+-  auto *BI = dyn_cast<BranchInst>(TI);
+-
+-  bool Changed = false;
+-  BasicBlock *TIParent = TI->getParent();
+-  BasicBlock *BB1 = I1->getParent();
+ 
+-  // Use only for an if statement.
+-  auto *I2 = *OtherSuccTIs.begin();
+-  auto *BB2 = I2->getParent();
+-  if (BI) {
+-    assert(OtherSuccTIs.size() == 1);
+-    assert(BI->getSuccessor(0) == I1->getParent());
+-    assert(BI->getSuccessor(1) == I2->getParent());
++    I1 = &*BB1_Itr++;
++    I2 = &*BB2_Itr++;
++    // Skip debug info if it is not identical.
++    DbgInfoIntrinsic *DBI1 = dyn_cast<DbgInfoIntrinsic>(I1);
++    DbgInfoIntrinsic *DBI2 = dyn_cast<DbgInfoIntrinsic>(I2);
++    if (!DBI1 || !DBI2 || !DBI1->isIdenticalToWhenDefined(DBI2)) {
++      while (isa<DbgInfoIntrinsic>(I1))
++        I1 = &*BB1_Itr++;
++      while (isa<DbgInfoIntrinsic>(I2))
++        I2 = &*BB2_Itr++;
++    }
+   }
+ 
+-  // In the case of an if statement, we try to hoist an invoke.
++  return Changed;
++
++HoistTerminator:
++  // It may not be possible to hoist an invoke.
+   // FIXME: Can we define a safety predicate for CallBr?
+-  // FIXME: Test case llvm/test/Transforms/SimplifyCFG/2009-06-15-InvokeCrash.ll
+-  // removed in 4c923b3b3fd0ac1edebf0603265ca3ba51724937 commit?
+-  if (isa<InvokeInst>(I1) && (!BI || !isSafeToHoistInvoke(BB1, BB2, I1, I2)))
+-    return false;
++  if (isa<InvokeInst>(I1) && !isSafeToHoistInvoke(BB1, BB2, I1, I2))
++    return Changed;
+ 
+   // TODO: callbr hoisting currently disabled pending further study.
+   if (isa<CallBrInst>(I1))
+-    return false;
++    return Changed;
+ 
+   for (BasicBlock *Succ : successors(BB1)) {
+     for (PHINode &PN : Succ->phis()) {
+       Value *BB1V = PN.getIncomingValueForBlock(BB1);
+-      for (Instruction *OtherSuccTI : OtherSuccTIs) {
+-        Value *BB2V = PN.getIncomingValueForBlock(OtherSuccTI->getParent());
+-        if (BB1V == BB2V)
+-          continue;
++      Value *BB2V = PN.getIncomingValueForBlock(BB2);
++      if (BB1V == BB2V)
++        continue;
+ 
+-        // In the case of an if statement, check for
+-        // passingValueIsAlwaysUndefined here because we would rather eliminate
+-        // undefined control flow then converting it to a select.
+-        if (!BI || passingValueIsAlwaysUndefined(BB1V, &PN) ||
+-            passingValueIsAlwaysUndefined(BB2V, &PN))
+-          return false;
+-      }
++      // Check for passingValueIsAlwaysUndefined here because we would rather
++      // eliminate undefined control flow then converting it to a select.
++      if (passingValueIsAlwaysUndefined(BB1V, &PN) ||
++          passingValueIsAlwaysUndefined(BB2V, &PN))
++        return Changed;
+     }
+   }
+ 
+   // Okay, it is safe to hoist the terminator.
+   Instruction *NT = I1->clone();
+-  NT->insertInto(TIParent, TI->getIterator());
++  NT->insertInto(BIParent, BI->getIterator());
+   if (!NT->getType()->isVoidTy()) {
+     I1->replaceAllUsesWith(NT);
+-    for (Instruction *OtherSuccTI : OtherSuccTIs)
+-      OtherSuccTI->replaceAllUsesWith(NT);
++    I2->replaceAllUsesWith(NT);
+     NT->takeName(I1);
+   }
+   Changed = true;
+-  NumHoistCommonInstrs += OtherSuccTIs.size() + 1;
++  ++NumHoistCommonInstrs;
+ 
+   // Ensure terminator gets a debug location, even an unknown one, in case
+   // it involves inlinable calls.
+-  SmallVector<DILocation *, 4> Locs;
+-  Locs.push_back(I1->getDebugLoc());
+-  for (auto *OtherSuccTI : OtherSuccTIs)
+-    Locs.push_back(OtherSuccTI->getDebugLoc());
+-  NT->setDebugLoc(DILocation::getMergedLocations(Locs));
++  NT->applyMergedLocation(I1->getDebugLoc(), I2->getDebugLoc());
+ 
+   // PHIs created below will adopt NT's merged DebugLoc.
+   IRBuilder<NoFolder> Builder(NT);
+ 
+-  // In the case of an if statement, hoisting one of the terminators from our
+-  // successor is a great thing. Unfortunately, the successors of the if/else
+-  // blocks may have PHI nodes in them.  If they do, all PHI entries for BB1/BB2
+-  // must agree for all PHI nodes, so we insert select instruction to compute
+-  // the final result.
+-  if (BI) {
+-    std::map<std::pair<Value *, Value *>, SelectInst *> InsertedSelects;
+-    for (BasicBlock *Succ : successors(BB1)) {
+-      for (PHINode &PN : Succ->phis()) {
+-        Value *BB1V = PN.getIncomingValueForBlock(BB1);
+-        Value *BB2V = PN.getIncomingValueForBlock(BB2);
+-        if (BB1V == BB2V)
+-          continue;
+-
+-        // These values do not agree.  Insert a select instruction before NT
+-        // that determines the right value.
+-        SelectInst *&SI = InsertedSelects[std::make_pair(BB1V, BB2V)];
+-        if (!SI) {
+-          // Propagate fast-math-flags from phi node to its replacement select.
+-          IRBuilder<>::FastMathFlagGuard FMFGuard(Builder);
+-          if (isa<FPMathOperator>(PN))
+-            Builder.setFastMathFlags(PN.getFastMathFlags());
+-
+-          SI = cast<SelectInst>(Builder.CreateSelect(
+-              BI->getCondition(), BB1V, BB2V,
+-              BB1V->getName() + "." + BB2V->getName(), BI));
+-        }
++  // Hoisting one of the terminators from our successor is a great thing.
++  // Unfortunately, the successors of the if/else blocks may have PHI nodes in
++  // them.  If they do, all PHI entries for BB1/BB2 must agree for all PHI
++  // nodes, so we insert select instruction to compute the final result.
++  std::map<std::pair<Value *, Value *>, SelectInst *> InsertedSelects;
++  for (BasicBlock *Succ : successors(BB1)) {
++    for (PHINode &PN : Succ->phis()) {
++      Value *BB1V = PN.getIncomingValueForBlock(BB1);
++      Value *BB2V = PN.getIncomingValueForBlock(BB2);
++      if (BB1V == BB2V)
++        continue;
+ 
+-        // Make the PHI node use the select for all incoming values for BB1/BB2
+-        for (unsigned i = 0, e = PN.getNumIncomingValues(); i != e; ++i)
+-          if (PN.getIncomingBlock(i) == BB1 || PN.getIncomingBlock(i) == BB2)
+-            PN.setIncomingValue(i, SI);
+-      }
++      // These values do not agree.  Insert a select instruction before NT
++      // that determines the right value.
++      SelectInst *&SI = InsertedSelects[std::make_pair(BB1V, BB2V)];
++      if (!SI) {
++        // Propagate fast-math-flags from phi node to its replacement select.
++        IRBuilder<>::FastMathFlagGuard FMFGuard(Builder);
++        if (isa<FPMathOperator>(PN))
++          Builder.setFastMathFlags(PN.getFastMathFlags());
++
++        SI = cast<SelectInst>(
++            Builder.CreateSelect(BI->getCondition(), BB1V, BB2V,
++                                 BB1V->getName() + "." + BB2V->getName(), BI));
++      }
++
++      // Make the PHI node use the select for all incoming values for BB1/BB2
++      for (unsigned i = 0, e = PN.getNumIncomingValues(); i != e; ++i)
++        if (PN.getIncomingBlock(i) == BB1 || PN.getIncomingBlock(i) == BB2)
++          PN.setIncomingValue(i, SI);
+     }
+   }
+ 
+@@ -1807,16 +1734,16 @@
+ 
+   // Update any PHI nodes in our new successors.
+   for (BasicBlock *Succ : successors(BB1)) {
+-    AddPredecessorToBlock(Succ, TIParent, BB1);
++    AddPredecessorToBlock(Succ, BIParent, BB1);
+     if (DTU)
+-      Updates.push_back({DominatorTree::Insert, TIParent, Succ});
++      Updates.push_back({DominatorTree::Insert, BIParent, Succ});
+   }
+ 
+   if (DTU)
+-    for (BasicBlock *Succ : successors(TI))
+-      Updates.push_back({DominatorTree::Delete, TIParent, Succ});
++    for (BasicBlock *Succ : successors(BI))
++      Updates.push_back({DominatorTree::Delete, BIParent, Succ});
+ 
+-  EraseTerminatorAndDCECond(TI);
++  EraseTerminatorAndDCECond(BI);
+   if (DTU)
+     DTU->applyUpdates(Updates);
+   return Changed;
+@@ -2850,8 +2777,8 @@
+     Value *OrigV = PN.getIncomingValueForBlock(BB);
+     Value *ThenV = PN.getIncomingValueForBlock(ThenBB);
+ 
+-    // FIXME: Try to remove some of the duplication with
+-    // hoistCommonCodeFromSuccessors. Skip PHIs which are trivial.
++    // FIXME: Try to remove some of the duplication with HoistThenElseCodeToIf.
++    // Skip PHIs which are trivial.
+     if (ThenV == OrigV)
+       continue;
+ 
+@@ -6888,10 +6815,6 @@
+   if (ReduceSwitchRange(SI, Builder, DL, TTI))
+     return requestResimplify();
+ 
+-  if (HoistCommon &&
+-      hoistCommonCodeFromSuccessors(SI->getParent(), !Options.HoistCommonInsts))
+-    return requestResimplify();
+-
+   return false;
+ }
+ 
+@@ -7158,8 +7081,7 @@
+   // can hoist it up to the branching block.
+   if (BI->getSuccessor(0)->getSinglePredecessor()) {
+     if (BI->getSuccessor(1)->getSinglePredecessor()) {
+-      if (HoistCommon && hoistCommonCodeFromSuccessors(
+-                             BI->getParent(), !Options.HoistCommonInsts))
++      if (HoistCommon && HoistThenElseCodeToIf(BI, !Options.HoistCommonInsts))
+         return requestResimplify();
+     } else {
+       // If Successor #1 has multiple preds, we may be able to conditionally
+diff -ruN --strip-trailing-cr a/llvm/test/CodeGen/AArch64/patchable-function-entry-bti.ll b/llvm/test/CodeGen/AArch64/patchable-function-entry-bti.ll
+--- a/llvm/test/CodeGen/AArch64/patchable-function-entry-bti.ll
++++ b/llvm/test/CodeGen/AArch64/patchable-function-entry-bti.ll
+@@ -70,7 +70,7 @@
+     i64 4, label %sw.bb4
+   ]
+ sw.bb0:
+-  call void asm sideeffect "nop", ""()
++  call void asm sideeffect "", ""()
+   ret void
+ sw.bb1:
+   call void asm sideeffect "", ""()
+diff -ruN --strip-trailing-cr a/llvm/test/Transforms/SimplifyCFG/HoistCode.ll b/llvm/test/Transforms/SimplifyCFG/HoistCode.ll
+--- a/llvm/test/Transforms/SimplifyCFG/HoistCode.ll
++++ b/llvm/test/Transforms/SimplifyCFG/HoistCode.ll
+@@ -19,9 +19,21 @@
+ 
+ define void @foo_switch(i64 %C, ptr %P) {
+ ; CHECK-LABEL: @foo_switch(
+-; CHECK-NEXT:  common.ret:
+-; CHECK-NEXT:    store i32 7, ptr [[P:%.*]], align 4
++; CHECK-NEXT:    switch i64 [[C:%.*]], label [[BB0:%.*]] [
++; CHECK-NEXT:    i64 1, label [[BB1:%.*]]
++; CHECK-NEXT:    i64 2, label [[BB2:%.*]]
++; CHECK-NEXT:    ]
++; CHECK:       common.ret:
+ ; CHECK-NEXT:    ret void
++; CHECK:       bb0:
++; CHECK-NEXT:    store i32 7, ptr [[P:%.*]], align 4
++; CHECK-NEXT:    br label [[COMMON_RET:%.*]]
++; CHECK:       bb1:
++; CHECK-NEXT:    store i32 7, ptr [[P]], align 4
++; CHECK-NEXT:    br label [[COMMON_RET]]
++; CHECK:       bb2:
++; CHECK-NEXT:    store i32 7, ptr [[P]], align 4
++; CHECK-NEXT:    br label [[COMMON_RET]]
+ ;
+   switch i64 %C, label %bb0 [
+   i64 1, label %bb1
+diff -ruN --strip-trailing-cr a/llvm/test/Transforms/SimplifyCFG/hoist-common-code.ll b/llvm/test/Transforms/SimplifyCFG/hoist-common-code.ll
+--- a/llvm/test/Transforms/SimplifyCFG/hoist-common-code.ll
++++ b/llvm/test/Transforms/SimplifyCFG/hoist-common-code.ll
+@@ -26,11 +26,27 @@
+ 
+ define void @test_switch(i64 %i, ptr %Q) {
+ ; CHECK-LABEL: @test_switch(
+-; CHECK-NEXT:  common.ret:
++; CHECK-NEXT:    switch i64 [[I:%.*]], label [[BB0:%.*]] [
++; CHECK-NEXT:    i64 1, label [[BB1:%.*]]
++; CHECK-NEXT:    i64 2, label [[BB2:%.*]]
++; CHECK-NEXT:    ]
++; CHECK:       common.ret:
++; CHECK-NEXT:    ret void
++; CHECK:       bb0:
+ ; CHECK-NEXT:    store i32 1, ptr [[Q:%.*]], align 4
+ ; CHECK-NEXT:    [[A:%.*]] = load i32, ptr [[Q]], align 4
+ ; CHECK-NEXT:    call void @bar(i32 [[A]])
+-; CHECK-NEXT:    ret void
++; CHECK-NEXT:    br label [[COMMON_RET:%.*]]
++; CHECK:       bb1:
++; CHECK-NEXT:    store i32 1, ptr [[Q]], align 4
++; CHECK-NEXT:    [[B:%.*]] = load i32, ptr [[Q]], align 4
++; CHECK-NEXT:    call void @bar(i32 [[B]])
++; CHECK-NEXT:    br label [[COMMON_RET]]
++; CHECK:       bb2:
++; CHECK-NEXT:    store i32 1, ptr [[Q]], align 4
++; CHECK-NEXT:    [[C:%.*]] = load i32, ptr [[Q]], align 4
++; CHECK-NEXT:    call void @bar(i32 [[C]])
++; CHECK-NEXT:    br label [[COMMON_RET]]
+ ;
+   switch i64 %i, label %bb0 [
+   i64 1, label %bb1
+@@ -53,41 +69,25 @@
+   ret void
+ }
+ 
+-; We ensure that we examine all instructions during each iteration to confirm the presence of a terminating one.
+-define void @test_switch_reach_terminator(i64 %i, ptr %p) {
+-; CHECK-LABEL: @test_switch_reach_terminator(
+-; CHECK-NEXT:    switch i64 [[I:%.*]], label [[BB0:%.*]] [
+-; CHECK-NEXT:    i64 1, label [[BB1:%.*]]
+-; CHECK-NEXT:    i64 2, label [[COMMON_RET:%.*]]
+-; CHECK-NEXT:    ]
+-; CHECK:       common.ret:
+-; CHECK-NEXT:    ret void
+-; CHECK:       bb0:
+-; CHECK-NEXT:    store i32 1, ptr [[P:%.*]], align 4
+-; CHECK-NEXT:    br label [[COMMON_RET]]
+-; CHECK:       bb1:
+-; CHECK-NEXT:    store i32 2, ptr [[P]], align 4
+-; CHECK-NEXT:    br label [[COMMON_RET]]
+-;
+-  switch i64 %i, label %bb0 [
+-  i64 1, label %bb1
+-  i64 2, label %bb2
+-  ]
+-bb0:              ; preds = %0
+-  store i32 1, ptr %p
+-  ret void
+-bb1:              ; preds = %0
+-  store i32 2, ptr %p
+-  ret void
+-bb2:              ; preds = %0
+-  ret void
+-}
+-
+ define i1 @common_instr_on_switch(i64 %a, i64 %b, i64 %c) unnamed_addr {
+ ; CHECK-LABEL: @common_instr_on_switch(
+ ; CHECK-NEXT:  start:
++; CHECK-NEXT:    switch i64 [[A:%.*]], label [[BB0:%.*]] [
++; CHECK-NEXT:    i64 1, label [[BB1:%.*]]
++; CHECK-NEXT:    i64 2, label [[BB2:%.*]]
++; CHECK-NEXT:    ]
++; CHECK:       bb0:
+ ; CHECK-NEXT:    [[TMP0:%.*]] = icmp eq i64 [[B:%.*]], [[C:%.*]]
+-; CHECK-NEXT:    ret i1 [[TMP0]]
++; CHECK-NEXT:    br label [[EXIT:%.*]]
++; CHECK:       bb1:
++; CHECK-NEXT:    [[TMP1:%.*]] = icmp eq i64 [[B]], [[C]]
++; CHECK-NEXT:    br label [[EXIT]]
++; CHECK:       bb2:
++; CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i64 [[B]], [[C]]
++; CHECK-NEXT:    br label [[EXIT]]
++; CHECK:       exit:
++; CHECK-NEXT:    [[RESULT:%.*]] = phi i1 [ [[TMP0]], [[BB0]] ], [ [[TMP1]], [[BB1]] ], [ [[TMP2]], [[BB2]] ]
++; CHECK-NEXT:    ret i1 [[RESULT]]
+ ;
+ start:
+   switch i64 %a, label %bb0 [
+diff -ruN --strip-trailing-cr a/llvm/test/Transforms/SimplifyCFG/hoist-common-code-with-unreachable.ll b/llvm/test/Transforms/SimplifyCFG/hoist-common-code-with-unreachable.ll
+--- a/llvm/test/Transforms/SimplifyCFG/hoist-common-code-with-unreachable.ll
++++ b/llvm/test/Transforms/SimplifyCFG/hoist-common-code-with-unreachable.ll
+@@ -4,8 +4,25 @@
+ define i1 @common_instr_with_unreachable(i64 %a, i64 %b, i64 %c) {
+ ; CHECK-LABEL: @common_instr_with_unreachable(
+ ; CHECK-NEXT:  start:
++; CHECK-NEXT:    switch i64 [[A:%.*]], label [[UNREACHABLE:%.*]] [
++; CHECK-NEXT:    i64 0, label [[BB0:%.*]]
++; CHECK-NEXT:    i64 1, label [[BB1:%.*]]
++; CHECK-NEXT:    i64 2, label [[BB2:%.*]]
++; CHECK-NEXT:    ]
++; CHECK:       unreachable:
++; CHECK-NEXT:    unreachable
++; CHECK:       bb0:
+ ; CHECK-NEXT:    [[TMP0:%.*]] = icmp eq i64 [[B:%.*]], [[C:%.*]]
+-; CHECK-NEXT:    ret i1 [[TMP0]]
++; CHECK-NEXT:    br label [[EXIT:%.*]]
++; CHECK:       bb1:
++; CHECK-NEXT:    [[TMP1:%.*]] = icmp eq i64 [[B]], [[C]]
++; CHECK-NEXT:    br label [[EXIT]]
++; CHECK:       bb2:
++; CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i64 [[B]], [[C]]
++; CHECK-NEXT:    br label [[EXIT]]
++; CHECK:       exit:
++; CHECK-NEXT:    [[RESULT:%.*]] = phi i1 [ [[TMP0]], [[BB0]] ], [ [[TMP1]], [[BB1]] ], [ [[TMP2]], [[BB2]] ]
++; CHECK-NEXT:    ret i1 [[RESULT]]
+ ;
+ start:
+   switch i64 %a, label %unreachable [
+@@ -37,90 +54,43 @@
+ define i1 @common_instr_with_unreachable_2(i64 %a, i64 %b, i64 %c) {
+ ; CHECK-LABEL: @common_instr_with_unreachable_2(
+ ; CHECK-NEXT:  start:
+-; CHECK-NEXT:    [[TMP0:%.*]] = icmp eq i64 [[B:%.*]], [[C:%.*]]
+-; CHECK-NEXT:    ret i1 [[TMP0]]
+-;
+-start:
+-  switch i64 %a, label %bb1 [
+-  i64 0, label %bb0
+-  i64 1, label %unreachable
+-  i64 2, label %bb2
+-  ]
+-
+-unreachable:
+-  unreachable
+-
+-bb0:                                              ; preds = %start
+-  %0 = icmp eq i64 %b, %c
+-  br label %exit
+-
+-bb1:                                              ; preds = %start
+-  %1 = icmp eq i64 %b, %c
+-  br label %exit
+-
+-bb2:                                              ; preds = %start
+-  %2 = icmp eq i64 %b, %c
+-  br label %exit
+-
+-exit:                                             ; preds = %bb2, %bb1, %bb0
+-  %result = phi i1 [ %0, %bb0 ], [ %1, %bb1 ], [ %2, %bb2 ]
+-  ret i1 %result
+-}
+-
+-declare void @no_return()
+-declare void @foo()
+-
+-define i1 @not_only_unreachable(i64 %a, i64 %b, i64 %c) {
+-; CHECK-LABEL: @not_only_unreachable(
+-; CHECK-NEXT:  start:
+-; CHECK-NEXT:    switch i64 [[A:%.*]], label [[UNREACHABLE:%.*]] [
++; CHECK-NEXT:    switch i64 [[A:%.*]], label [[BB1:%.*]] [
+ ; CHECK-NEXT:    i64 0, label [[BB0:%.*]]
+-; CHECK-NEXT:    i64 1, label [[BB1:%.*]]
+ ; CHECK-NEXT:    i64 2, label [[BB2:%.*]]
+ ; CHECK-NEXT:    ]
+-; CHECK:       unreachable:
+-; CHECK-NEXT:    call void @no_return()
+-; CHECK-NEXT:    unreachable
+ ; CHECK:       bb0:
+ ; CHECK-NEXT:    [[TMP0:%.*]] = icmp eq i64 [[B:%.*]], [[C:%.*]]
+-; CHECK-NEXT:    call void @foo()
+ ; CHECK-NEXT:    br label [[EXIT:%.*]]
+ ; CHECK:       bb1:
+ ; CHECK-NEXT:    [[TMP1:%.*]] = icmp eq i64 [[B]], [[C]]
+-; CHECK-NEXT:    call void @foo()
+ ; CHECK-NEXT:    br label [[EXIT]]
+ ; CHECK:       bb2:
+ ; CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i64 [[B]], [[C]]
+-; CHECK-NEXT:    call void @foo()
+ ; CHECK-NEXT:    br label [[EXIT]]
+ ; CHECK:       exit:
+ ; CHECK-NEXT:    [[RESULT:%.*]] = phi i1 [ [[TMP0]], [[BB0]] ], [ [[TMP1]], [[BB1]] ], [ [[TMP2]], [[BB2]] ]
+ ; CHECK-NEXT:    ret i1 [[RESULT]]
+ ;
+ start:
+-  switch i64 %a, label %unreachable [
++  switch i64 %a, label %bb1 [
+   i64 0, label %bb0
+-  i64 1, label %bb1
++  i64 1, label %unreachable
+   i64 2, label %bb2
+   ]
+ 
+ unreachable:
+-  call void @no_return()
+   unreachable
+ 
+ bb0:                                              ; preds = %start
+   %0 = icmp eq i64 %b, %c
+-  call void @foo()
+   br label %exit
+ 
+ bb1:                                              ; preds = %start
+   %1 = icmp eq i64 %b, %c
+-  call void @foo()
+   br label %exit
+ 
+ bb2:                                              ; preds = %start
+   %2 = icmp eq i64 %b, %c
+-  call void @foo()
+   br label %exit
+ 
+ exit:                                             ; preds = %bb2, %bb1, %bb0
+diff -ruN --strip-trailing-cr a/llvm/test/Transforms/SimplifyCFG/hoist-common-skip.ll b/llvm/test/Transforms/SimplifyCFG/hoist-common-skip.ll
+--- a/llvm/test/Transforms/SimplifyCFG/hoist-common-skip.ll
++++ b/llvm/test/Transforms/SimplifyCFG/hoist-common-skip.ll
+@@ -48,68 +48,6 @@
+   ret void
+ }
+ 
+-define void @f0_switch(i64 %i, ptr nocapture noundef %d, ptr nocapture noundef readonly %m,  ptr nocapture noundef readonly %b) {
+-; CHECK-LABEL: @f0_switch(
+-; CHECK-NEXT:  entry:
+-; CHECK-NEXT:    [[TMP0:%.*]] = load i16, ptr [[B:%.*]], align 2
+-; CHECK-NEXT:    [[TMP1:%.*]] = load i16, ptr [[M:%.*]], align 2
+-; CHECK-NEXT:    switch i64 [[I:%.*]], label [[BB0:%.*]] [
+-; CHECK-NEXT:    i64 1, label [[BB1:%.*]]
+-; CHECK-NEXT:    i64 2, label [[BB2:%.*]]
+-; CHECK-NEXT:    ]
+-; CHECK:       bb0:
+-; CHECK-NEXT:    [[ADD:%.*]] = add nsw i16 [[TMP0]], 1
+-; CHECK-NEXT:    [[U:%.*]] = add i16 [[ADD]], [[TMP1]]
+-; CHECK-NEXT:    br label [[END:%.*]]
+-; CHECK:       bb1:
+-; CHECK-NEXT:    [[SUB:%.*]] = sub nsw i16 [[TMP0]], 1
+-; CHECK-NEXT:    [[TMP2:%.*]] = add i16 [[SUB]], 3
+-; CHECK-NEXT:    [[V:%.*]] = add i16 [[SUB]], [[TMP2]]
+-; CHECK-NEXT:    br label [[END]]
+-; CHECK:       bb2:
+-; CHECK-NEXT:    [[SUB2:%.*]] = sub nsw i16 [[TMP0]], 1
+-; CHECK-NEXT:    [[TMP3:%.*]] = add i16 [[SUB2]], 3
+-; CHECK-NEXT:    [[W:%.*]] = add i16 [[SUB2]], [[TMP3]]
+-; CHECK-NEXT:    br label [[END]]
+-; CHECK:       end:
+-; CHECK-NEXT:    [[UV:%.*]] = phi i16 [ [[U]], [[BB0]] ], [ [[V]], [[BB1]] ], [ [[W]], [[BB2]] ]
+-; CHECK-NEXT:    store i16 [[UV]], ptr [[D:%.*]], align 2
+-; CHECK-NEXT:    ret void
+-;
+-entry:
+-  switch i64 %i, label %bb0 [
+-  i64 1, label %bb1
+-  i64 2, label %bb2
+-  ]
+-
+-bb0:
+-  %0 = load i16, ptr %b, align 2
+-  %add = add nsw i16 %0, 1
+-  %1 = load i16, ptr %m, align 2
+-  %u = add i16 %add, %1
+-  br label %end
+-
+-bb1:
+-  %2 = load i16, ptr %b, align 2
+-  %sub = sub nsw i16 %2, 1
+-  %3 = load i16, ptr %m, align 2
+-  %4 = add i16 %sub, 3
+-  %v = add i16 %sub, %4
+-  br label %end
+-
+-bb2:
+-  %5 = load i16, ptr %b, align 2
+-  %sub2 = sub nsw i16 %5, 1
+-  %6 = load i16, ptr %m, align 2
+-  %7 = add i16 %sub2, 3
+-  %w = add i16 %sub2, %7
+-  br label %end
+-
+-end:
+-  %uv = phi i16 [ %u, %bb0 ], [ %v, %bb1 ], [ %w, %bb2 ]
+-  store i16 %uv, ptr %d, align 2
+-  ret void
+-}
+ 
+ ;; Check some instructions (e.g. add) can be reordered across instructions with side
+ ;; effects, while others (e.g. load) can't.
+@@ -159,70 +97,6 @@
+   ret void
+ }
+ 
+-define void @f2_switch(i64 %i, ptr nocapture noundef %d, ptr nocapture noundef readonly %m, ptr nocapture noundef readonly %b) {
+-; CHECK-LABEL: @f2_switch(
+-; CHECK-NEXT:  entry:
+-; CHECK-NEXT:    [[TMP0:%.*]] = load i16, ptr [[B:%.*]], align 2
+-; CHECK-NEXT:    [[ADD_0:%.*]] = add nsw i16 [[TMP0]], 1
+-; CHECK-NEXT:    switch i64 [[I:%.*]], label [[BB0:%.*]] [
+-; CHECK-NEXT:    i64 1, label [[BB1:%.*]]
+-; CHECK-NEXT:    i64 2, label [[BB2:%.*]]
+-; CHECK-NEXT:    ]
+-; CHECK:       bb0:
+-; CHECK-NEXT:    call void @side_effects0()
+-; CHECK-NEXT:    [[TMP1:%.*]] = load i16, ptr [[M:%.*]], align 2
+-; CHECK-NEXT:    [[U:%.*]] = add i16 [[ADD_0]], [[TMP1]]
+-; CHECK-NEXT:    br label [[END:%.*]]
+-; CHECK:       bb1:
+-; CHECK-NEXT:    call void @no_side_effects0()
+-; CHECK-NEXT:    [[TMP2:%.*]] = load i16, ptr [[M]], align 2
+-; CHECK-NEXT:    [[V:%.*]] = add i16 [[ADD_0]], [[TMP2]]
+-; CHECK-NEXT:    br label [[END]]
+-; CHECK:       bb2:
+-; CHECK-NEXT:    call void @no_side_effects0()
+-; CHECK-NEXT:    [[TMP3:%.*]] = load i16, ptr [[M]], align 2
+-; CHECK-NEXT:    [[W:%.*]] = add i16 [[ADD_0]], [[TMP3]]
+-; CHECK-NEXT:    br label [[END]]
+-; CHECK:       end:
+-; CHECK-NEXT:    [[UV:%.*]] = phi i16 [ [[U]], [[BB0]] ], [ [[V]], [[BB1]] ], [ [[W]], [[BB2]] ]
+-; CHECK-NEXT:    store i16 [[UV]], ptr [[D:%.*]], align 2
+-; CHECK-NEXT:    ret void
+-;
+-entry:
+-  switch i64 %i, label %bb0 [
+-  i64 1, label %bb1
+-  i64 2, label %bb2
+-  ]
+-
+-bb0:
+-  %0 = load i16, ptr %b, align 2
+-  call void @side_effects0()
+-  %add.0 = add nsw i16 %0, 1
+-  %1 = load i16, ptr %m, align 2
+-  %u = add i16 %add.0, %1
+-  br label %end
+-
+-bb1:
+-  %2 = load i16, ptr %b, align 2
+-  call void @no_side_effects0()
+-  %add.1 = add nsw i16 %2, 1
+-  %3 = load i16, ptr %m, align 2
+-  %v = add i16 %add.1, %3
+-  br label %end
+-
+-bb2:
+-  %4 = load i16, ptr %b, align 2
+-  call void @no_side_effects0()
+-  %add.2 = add nsw i16 %4, 1
+-  %5 = load i16, ptr %m, align 2
+-  %w = add i16 %add.2, %5
+-  br label %end
+-
+-end:
+-  %uv = phi i16 [ %u, %bb0 ], [ %v, %bb1 ], [ %w, %bb2 ]
+-  store i16 %uv, ptr %d, align 2
+-  ret void
+-}
+ 
+ ;; Check indeed it was the side effects that prevented hoisting the load
+ ;; in the previous test.
+@@ -269,67 +143,6 @@
+   ret void
+ }
+ 
+-define void @f3_switch(i64 %i, ptr nocapture noundef %d, ptr nocapture noundef readonly %m, ptr nocapture noundef readonly %b) {
+-; CHECK-LABEL: @f3_switch(
+-; CHECK-NEXT:  entry:
+-; CHECK-NEXT:    [[TMP0:%.*]] = load i16, ptr [[B:%.*]], align 2
+-; CHECK-NEXT:    [[ADD_0:%.*]] = add nsw i16 [[TMP0]], 1
+-; CHECK-NEXT:    [[TMP1:%.*]] = load i16, ptr [[M:%.*]], align 2
+-; CHECK-NEXT:    [[U:%.*]] = add i16 [[ADD_0]], [[TMP1]]
+-; CHECK-NEXT:    switch i64 [[I:%.*]], label [[BB0:%.*]] [
+-; CHECK-NEXT:    i64 1, label [[BB1:%.*]]
+-; CHECK-NEXT:    i64 2, label [[BB2:%.*]]
+-; CHECK-NEXT:    ]
+-; CHECK:       bb0:
+-; CHECK-NEXT:    call void @no_side_effects0()
+-; CHECK-NEXT:    br label [[END:%.*]]
+-; CHECK:       bb1:
+-; CHECK-NEXT:    call void @no_side_effects1()
+-; CHECK-NEXT:    br label [[END]]
+-; CHECK:       bb2:
+-; CHECK-NEXT:    call void @no_side_effects1()
+-; CHECK-NEXT:    br label [[END]]
+-; CHECK:       end:
+-; CHECK-NEXT:    [[UV:%.*]] = phi i16 [ [[U]], [[BB0]] ], [ [[U]], [[BB1]] ], [ [[U]], [[BB2]] ]
+-; CHECK-NEXT:    store i16 [[UV]], ptr [[D:%.*]], align 2
+-; CHECK-NEXT:    ret void
+-;
+-entry:
+-  switch i64 %i, label %bb0 [
+-  i64 1, label %bb1
+-  i64 2, label %bb2
+-  ]
+-
+-bb0:
+-  %0 = load i16, ptr %b, align 2
+-  call void @no_side_effects0()
+-  %add.0 = add nsw i16 %0, 1
+-  %1 = load i16, ptr %m, align 2
+-  %u = add i16 %add.0, %1
+-  br label %end
+-
+-bb1:
+-  %2 = load i16, ptr %b, align 2
+-  call void @no_side_effects1()
+-  %add.1 = add nsw i16 %2, 1
+-  %3 = load i16, ptr %m, align 2
+-  %v = add i16 %add.1, %3
+-  br label %end
+-
+-bb2:
+-  %4 = load i16, ptr %b, align 2
+-  call void @no_side_effects1()
+-  %add.2 = add nsw i16 %4, 1
+-  %5 = load i16, ptr %m, align 2
+-  %w = add i16 %add.2, %5
+-  br label %end
+-
+-end:
+-  %uv = phi i16 [ %u, %bb0 ], [ %v, %bb1 ], [ %w, %bb2 ]
+-  store i16 %uv, ptr %d, align 2
+-  ret void
+-}
+-
+ ;; Check some instructions (e.g. sdiv) are not speculatively executed.
+ 
+ ;; Division by non-zero constant OK to speculate ...
+@@ -373,63 +186,6 @@
+   ret void
+ }
+ 
+-define void @f4_switch(i64 %i, ptr nocapture noundef %d, ptr nocapture noundef readonly %m, ptr nocapture noundef readonly %b) {
+-; CHECK-LABEL: @f4_switch(
+-; CHECK-NEXT:  entry:
+-; CHECK-NEXT:    [[TMP0:%.*]] = load i16, ptr [[B:%.*]], align 2
+-; CHECK-NEXT:    [[DIV_0:%.*]] = sdiv i16 [[TMP0]], 2
+-; CHECK-NEXT:    [[U:%.*]] = add i16 [[DIV_0]], [[TMP0]]
+-; CHECK-NEXT:    switch i64 [[I:%.*]], label [[BB0:%.*]] [
+-; CHECK-NEXT:    i64 1, label [[BB1:%.*]]
+-; CHECK-NEXT:    i64 2, label [[BB2:%.*]]
+-; CHECK-NEXT:    ]
+-; CHECK:       bb0:
+-; CHECK-NEXT:    call void @side_effects0()
+-; CHECK-NEXT:    br label [[IF_END:%.*]]
+-; CHECK:       bb1:
+-; CHECK-NEXT:    call void @side_effects1()
+-; CHECK-NEXT:    br label [[IF_END]]
+-; CHECK:       bb2:
+-; CHECK-NEXT:    call void @side_effects1()
+-; CHECK-NEXT:    br label [[IF_END]]
+-; CHECK:       if.end:
+-; CHECK-NEXT:    [[UV:%.*]] = phi i16 [ [[U]], [[BB0]] ], [ [[U]], [[BB1]] ], [ [[U]], [[BB2]] ]
+-; CHECK-NEXT:    store i16 [[UV]], ptr [[D:%.*]], align 2
+-; CHECK-NEXT:    ret void
+-;
+-entry:
+-  switch i64 %i, label %bb0 [
+-  i64 1, label %bb1
+-  i64 2, label %bb2
+-  ]
+-
+-bb0:
+-  %0 = load i16, ptr %b, align 2
+-  call void @side_effects0()
+-  %div.0 = sdiv i16 %0, 2
+-  %u = add i16 %div.0, %0
+-  br label %if.end
+-
+-bb1:
+-  %1 = load i16, ptr %b, align 2
+-  call void @side_effects1()
+-  %div.1 = sdiv i16 %1, 2
+-  %v = add i16 %div.1, %1
+-  br label %if.end
+-
+-bb2:
+-  %2 = load i16, ptr %b, align 2
+-  call void @side_effects1()
+-  %div.2 = sdiv i16 %2, 2
+-  %w = add i16 %div.2, %2
+-  br label %if.end
+-
+-if.end:
+-  %uv = phi i16 [ %u, %bb0 ], [ %v, %bb1 ], [ %w, %bb2 ]
+-  store i16 %uv, ptr %d, align 2
+-  ret void
+-}
+-
+ ;; ... but not a general division ...
+ define void @f5(i1 %c, ptr nocapture noundef %d, ptr nocapture noundef readonly %m, ptr nocapture noundef readonly %b) {
+ ; CHECK-LABEL: @f5(
+@@ -474,67 +230,6 @@
+   ret void
+ }
+ 
+-define void @f5_switch(i64 %i, ptr nocapture noundef %d, ptr nocapture noundef readonly %m, ptr nocapture noundef readonly %b) {
+-; CHECK-LABEL: @f5_switch(
+-; CHECK-NEXT:  entry:
+-; CHECK-NEXT:    [[TMP0:%.*]] = load i16, ptr [[B:%.*]], align 2
+-; CHECK-NEXT:    switch i64 [[I:%.*]], label [[BB0:%.*]] [
+-; CHECK-NEXT:    i64 1, label [[BB1:%.*]]
+-; CHECK-NEXT:    i64 2, label [[BB2:%.*]]
+-; CHECK-NEXT:    ]
+-; CHECK:       bb0:
+-; CHECK-NEXT:    call void @side_effects0()
+-; CHECK-NEXT:    [[DIV_0:%.*]] = sdiv i16 211, [[TMP0]]
+-; CHECK-NEXT:    [[U:%.*]] = add i16 [[DIV_0]], [[TMP0]]
+-; CHECK-NEXT:    br label [[END:%.*]]
+-; CHECK:       bb1:
+-; CHECK-NEXT:    call void @side_effects1()
+-; CHECK-NEXT:    [[DIV_1:%.*]] = sdiv i16 211, [[TMP0]]
+-; CHECK-NEXT:    [[V:%.*]] = add i16 [[DIV_1]], [[TMP0]]
+-; CHECK-NEXT:    br label [[END]]
+-; CHECK:       bb2:
+-; CHECK-NEXT:    call void @side_effects1()
+-; CHECK-NEXT:    [[DIV_2:%.*]] = sdiv i16 211, [[TMP0]]
+-; CHECK-NEXT:    [[W:%.*]] = add i16 [[DIV_2]], [[TMP0]]
+-; CHECK-NEXT:    br label [[END]]
+-; CHECK:       end:
+-; CHECK-NEXT:    [[UV:%.*]] = phi i16 [ [[U]], [[BB0]] ], [ [[V]], [[BB1]] ], [ [[W]], [[BB2]] ]
+-; CHECK-NEXT:    store i16 [[UV]], ptr [[D:%.*]], align 2
+-; CHECK-NEXT:    ret void
+-;
+-entry:
+-  switch i64 %i, label %bb0 [
+-  i64 1, label %bb1
+-  i64 2, label %bb2
+-  ]
+-
+-bb0:
+-  %0 = load i16, ptr %b, align 2
+-  call void @side_effects0()
+-  %div.0 = sdiv i16 211, %0
+-  %u = add i16 %div.0, %0
+-  br label %end
+-
+-bb1:
+-  %1 = load i16, ptr %b, align 2
+-  call void @side_effects1()
+-  %div.1 = sdiv i16 211, %1
+-  %v = add i16 %div.1, %1
+-  br label %end
+-
+-bb2:
+-  %2 = load i16, ptr %b, align 2
+-  call void @side_effects1()
+-  %div.2 = sdiv i16 211, %2
+-  %w = add i16 %div.2, %2
+-  br label %end
+-
+-end:
+-  %uv = phi i16 [ %u, %bb0 ], [ %v, %bb1 ], [ %w, %bb2 ]
+-  store i16 %uv, ptr %d, align 2
+-  ret void
+-}
+-
+ ;; ... and it's also OK to hoist the division when there's no speculation happening.
+ define void @f6(i1 %c, ptr nocapture noundef %d, ptr nocapture noundef readonly %m, ptr nocapture noundef readonly %b) {
+ ; CHECK-LABEL: @f6(
+@@ -576,63 +271,6 @@
+   ret void
+ }
+ 
+-define void @f6_switch(i64 %i, ptr nocapture noundef %d, ptr nocapture noundef readonly %m, ptr nocapture noundef readonly %b) {
+-; CHECK-LABEL: @f6_switch(
+-; CHECK-NEXT:  entry:
+-; CHECK-NEXT:    [[TMP0:%.*]] = load i16, ptr [[B:%.*]], align 2
+-; CHECK-NEXT:    [[DIV_0:%.*]] = sdiv i16 211, [[TMP0]]
+-; CHECK-NEXT:    [[U:%.*]] = add i16 [[DIV_0]], [[TMP0]]
+-; CHECK-NEXT:    switch i64 [[I:%.*]], label [[BB0:%.*]] [
+-; CHECK-NEXT:    i64 1, label [[BB1:%.*]]
+-; CHECK-NEXT:    i64 2, label [[BB2:%.*]]
+-; CHECK-NEXT:    ]
+-; CHECK:       bb0:
+-; CHECK-NEXT:    call void @no_side_effects0()
+-; CHECK-NEXT:    br label [[END:%.*]]
+-; CHECK:       bb1:
+-; CHECK-NEXT:    call void @no_side_effects1()
+-; CHECK-NEXT:    br label [[END]]
+-; CHECK:       bb2:
+-; CHECK-NEXT:    call void @no_side_effects1()
+-; CHECK-NEXT:    br label [[END]]
+-; CHECK:       end:
+-; CHECK-NEXT:    [[UV:%.*]] = phi i16 [ [[U]], [[BB0]] ], [ [[U]], [[BB1]] ], [ [[U]], [[BB2]] ]
+-; CHECK-NEXT:    store i16 [[UV]], ptr [[D:%.*]], align 2
+-; CHECK-NEXT:    ret void
+-;
+-entry:
+-  switch i64 %i, label %bb0 [
+-  i64 1, label %bb1
+-  i64 2, label %bb2
+-  ]
+-
+-bb0:
+-  %0 = load i16, ptr %b, align 2
+-  call void @no_side_effects0()
+-  %div.0 = sdiv i16 211, %0
+-  %u = add i16 %div.0, %0
+-  br label %end
+-
+-bb1:
+-  %1 = load i16, ptr %b, align 2
+-  call void @no_side_effects1()
+-  %div.1 = sdiv i16 211, %1
+-  %v = add i16 %div.1, %1
+-  br label %end
+-
+-bb2:
+-  %2 = load i16, ptr %b, align 2
+-  call void @no_side_effects1()
+-  %div.2 = sdiv i16 211, %2
+-  %w = add i16 %div.2, %2
+-  br label %end
+-
+-end:
+-  %uv = phi i16 [ %u, %bb0 ], [ %v, %bb1 ], [ %w, %bb2 ]
+-  store i16 %uv, ptr %d, align 2
+-  ret void
+-}
+-
+ ;; No reorder of store over a load.
+ define i16 @f7(i1 %c, ptr %a, ptr %b) {
+ ; CHECK-LABEL: @f7(
+@@ -668,55 +306,6 @@
+   ret i16 %v
+ }
+ 
+-define i16 @f7_switch(i64 %i, ptr %a, ptr %b) {
+-; CHECK-LABEL: @f7_switch(
+-; CHECK-NEXT:  entry:
+-; CHECK-NEXT:    switch i64 [[I:%.*]], label [[BB0:%.*]] [
+-; CHECK-NEXT:    i64 1, label [[BB1:%.*]]
+-; CHECK-NEXT:    i64 2, label [[BB2:%.*]]
+-; CHECK-NEXT:    ]
+-; CHECK:       bb0:
+-; CHECK-NEXT:    [[VA:%.*]] = load i16, ptr [[A:%.*]], align 2
+-; CHECK-NEXT:    store i16 0, ptr [[B:%.*]], align 2
+-; CHECK-NEXT:    br label [[END:%.*]]
+-; CHECK:       bb1:
+-; CHECK-NEXT:    [[VB:%.*]] = load i16, ptr [[B]], align 2
+-; CHECK-NEXT:    store i16 0, ptr [[B]], align 2
+-; CHECK-NEXT:    br label [[END]]
+-; CHECK:       bb2:
+-; CHECK-NEXT:    [[VC:%.*]] = load i16, ptr [[B]], align 2
+-; CHECK-NEXT:    store i16 0, ptr [[B]], align 2
+-; CHECK-NEXT:    br label [[END]]
+-; CHECK:       end:
+-; CHECK-NEXT:    [[V:%.*]] = phi i16 [ [[VA]], [[BB0]] ], [ [[VB]], [[BB1]] ], [ [[VC]], [[BB2]] ]
+-; CHECK-NEXT:    ret i16 [[V]]
+-;
+-entry:
+-  switch i64 %i, label %bb0 [
+-  i64 1, label %bb1
+-  i64 2, label %bb2
+-  ]
+-
+-bb0:
+-  %va = load i16, ptr %a, align 2
+-  store i16 0, ptr %b, align 2
+-  br label %end
+-
+-bb1:
+-  %vb = load i16, ptr %b, align 2
+-  store i16 0, ptr %b, align 2
+-  br label %end
+-
+-bb2:
+-  %vc = load i16, ptr %b, align 2
+-  store i16 0, ptr %b, align 2
+-  br label %end
+-
+-end:
+-  %v = phi i16 [ %va, %bb0 ], [ %vb, %bb1 ], [ %vc, %bb2 ]
+-  ret i16 %v
+-}
+-
+ ;; Can reorder load over another load
+ define i16 @f8(i1 %cond, ptr %a, ptr %b, ptr %c) {
+ ; CHECK-LABEL: @f8(
+@@ -757,59 +346,6 @@
+   ret i16 %w
+ }
+ 
+-define i16 @f8_switch(i64 %i, ptr %a, ptr %b, ptr %c) {
+-; CHECK-LABEL: @f8_switch(
+-; CHECK-NEXT:  entry:
+-; CHECK-NEXT:    [[C_0:%.*]] = load i16, ptr [[C:%.*]], align 2
+-; CHECK-NEXT:    switch i64 [[I:%.*]], label [[BB0:%.*]] [
+-; CHECK-NEXT:    i64 1, label [[BB1:%.*]]
+-; CHECK-NEXT:    i64 2, label [[BB2:%.*]]
+-; CHECK-NEXT:    ]
+-; CHECK:       bb0:
+-; CHECK-NEXT:    [[VA:%.*]] = load i16, ptr [[A:%.*]], align 2
+-; CHECK-NEXT:    br label [[END:%.*]]
+-; CHECK:       bb1:
+-; CHECK-NEXT:    [[VB:%.*]] = load i16, ptr [[B:%.*]], align 2
+-; CHECK-NEXT:    br label [[END]]
+-; CHECK:       bb2:
+-; CHECK-NEXT:    [[VC:%.*]] = load i16, ptr [[B]], align 2
+-; CHECK-NEXT:    br label [[END]]
+-; CHECK:       end:
+-; CHECK-NEXT:    [[V:%.*]] = phi i16 [ [[VA]], [[BB0]] ], [ [[VB]], [[BB1]] ], [ [[VC]], [[BB2]] ]
+-; CHECK-NEXT:    [[U:%.*]] = phi i16 [ [[C_0]], [[BB0]] ], [ [[C_0]], [[BB1]] ], [ [[C_0]], [[BB2]] ]
+-; CHECK-NEXT:    [[W:%.*]] = add i16 [[V]], [[U]]
+-; CHECK-NEXT:    ret i16 [[W]]
+-;
+-entry:
+-  switch i64 %i, label %bb0 [
+-  i64 1, label %bb1
+-  i64 2, label %bb2
+-  ]
+-
+-bb0:
+-  %va = load i16, ptr %a, align 2
+-  %c.0 = load i16, ptr %c
+-  br label %end
+-
+-bb1:
+-  %vb = load i16, ptr %b, align 2
+-  %c.1 = load i16, ptr %c
+-  br label %end
+-
+-bb2:
+-  %vc = load i16, ptr %b, align 2
+-  %c.2 = load i16, ptr %c
+-  br label %end
+-
+-end:
+-  %v = phi i16 [ %va, %bb0 ], [ %vb, %bb1 ], [ %vc, %bb2 ]
+-  %u = phi i16 [ %c.0, %bb0 ], [ %c.1, %bb1 ], [ %c.2, %bb2 ]
+-
+-  %w = add i16 %v, %u
+-
+-  ret i16 %w
+-}
+-
+ ;; Currently won't reorder volatile and non-volatile loads.
+ define i16 @f9(i1 %cond, ptr %a, ptr %b, ptr %c) {
+ ; CHECK-LABEL: @f9(
+@@ -851,61 +387,6 @@
+   ret i16 %w
+ }
+ 
+-define i16 @f9_switch(i64 %i, ptr %a, ptr %b, ptr %c) {
+-; CHECK-LABEL: @f9_switch(
+-; CHECK-NEXT:  entry:
+-; CHECK-NEXT:    switch i64 [[I:%.*]], label [[BB0:%.*]] [
+-; CHECK-NEXT:    i64 1, label [[BB1:%.*]]
+-; CHECK-NEXT:    i64 2, label [[BB2:%.*]]
+-; CHECK-NEXT:    ]
+-; CHECK:       bb0:
+-; CHECK-NEXT:    [[VA:%.*]] = load volatile i16, ptr [[A:%.*]], align 2
+-; CHECK-NEXT:    [[C_0:%.*]] = load i16, ptr [[C:%.*]], align 2
+-; CHECK-NEXT:    br label [[END:%.*]]
+-; CHECK:       bb1:
+-; CHECK-NEXT:    [[VB:%.*]] = load i16, ptr [[B:%.*]], align 2
+-; CHECK-NEXT:    [[C_1:%.*]] = load i16, ptr [[C]], align 2
+-; CHECK-NEXT:    br label [[END]]
+-; CHECK:       bb2:
+-; CHECK-NEXT:    [[VC:%.*]] = load i16, ptr [[B]], align 2
+-; CHECK-NEXT:    [[C_2:%.*]] = load i16, ptr [[C]], align 2
+-; CHECK-NEXT:    br label [[END]]
+-; CHECK:       end:
+-; CHECK-NEXT:    [[V:%.*]] = phi i16 [ [[VA]], [[BB0]] ], [ [[VB]], [[BB1]] ], [ [[VC]], [[BB2]] ]
+-; CHECK-NEXT:    [[U:%.*]] = phi i16 [ [[C_0]], [[BB0]] ], [ [[C_1]], [[BB1]] ], [ [[C_2]], [[BB2]] ]
+-; CHECK-NEXT:    [[W:%.*]] = add i16 [[V]], [[U]]
+-; CHECK-NEXT:    ret i16 [[W]]
+-;
+-entry:
+-  switch i64 %i, label %bb0 [
+-  i64 1, label %bb1
+-  i64 2, label %bb2
+-  ]
+-
+-bb0:
+-  %va = load volatile i16, ptr %a, align 2
+-  %c.0 = load i16, ptr %c
+-  br label %end
+-
+-bb1:
+-  %vb = load i16, ptr %b, align 2
+-  %c.1 = load i16, ptr %c
+-  br label %end
+-
+-bb2:
+-  %vc = load i16, ptr %b, align 2
+-  %c.2 = load i16, ptr %c
+-  br label %end
+-
+-end:
+-  %v = phi i16 [ %va, %bb0 ], [ %vb, %bb1 ], [ %vc, %bb2 ]
+-  %u = phi i16 [ %c.0, %bb0 ], [ %c.1, %bb1 ], [ %c.2, %bb2 ]
+-
+-  %w = add i16 %v, %u
+-
+-  ret i16 %w
+-}
+-
+ ;; Don't hoist stacksaves across inalloca allocas
+ define void @f10(i1 %cond) {
+ ; CHECK-LABEL: @f10(
+@@ -953,79 +434,6 @@
+   br label %end
+ 
+ end:
+-  call void @llvm.stackrestore(ptr %ss)
+-  ret void
+-}
+-
+-define void @f10_switch(i64 %i) {
+-; CHECK-LABEL: @f10_switch(
+-; CHECK-NEXT:    [[SS:%.*]] = call ptr @llvm.stacksave.p0()
+-; CHECK-NEXT:    switch i64 [[I:%.*]], label [[BB0:%.*]] [
+-; CHECK-NEXT:    i64 1, label [[BB1:%.*]]
+-; CHECK-NEXT:    i64 2, label [[BB2:%.*]]
+-; CHECK-NEXT:    ]
+-; CHECK:       bb0:
+-; CHECK-NEXT:    [[I1:%.*]] = alloca inalloca i32, align 4
+-; CHECK-NEXT:    [[SS2:%.*]] = call ptr @llvm.stacksave.p0()
+-; CHECK-NEXT:    [[I2:%.*]] = alloca inalloca i64, align 8
+-; CHECK-NEXT:    call void @inalloca_i64(ptr inalloca(i64) [[I2]])
+-; CHECK-NEXT:    call void @llvm.stackrestore.p0(ptr [[SS2]])
+-; CHECK-NEXT:    call void @inalloca_i32(ptr inalloca(i32) [[I1]])
+-; CHECK-NEXT:    br label [[END:%.*]]
+-; CHECK:       bb1:
+-; CHECK-NEXT:    [[I3:%.*]] = alloca inalloca i64, align 8
+-; CHECK-NEXT:    [[SS3:%.*]] = call ptr @llvm.stacksave.p0()
+-; CHECK-NEXT:    [[I4:%.*]] = alloca inalloca i64, align 8
+-; CHECK-NEXT:    [[TMP1:%.*]] = call ptr @inalloca_i64(ptr inalloca(i64) [[I4]])
+-; CHECK-NEXT:    call void @llvm.stackrestore.p0(ptr [[SS3]])
+-; CHECK-NEXT:    [[TMP2:%.*]] = call ptr @inalloca_i64(ptr inalloca(i64) [[I3]])
+-; CHECK-NEXT:    br label [[END]]
+-; CHECK:       bb2:
+-; CHECK-NEXT:    [[I5:%.*]] = alloca inalloca i64, align 8
+-; CHECK-NEXT:    [[SS4:%.*]] = call ptr @llvm.stacksave.p0()
+-; CHECK-NEXT:    [[I6:%.*]] = alloca inalloca i64, align 8
+-; CHECK-NEXT:    [[TMP3:%.*]] = call ptr @inalloca_i64(ptr inalloca(i64) [[I6]])
+-; CHECK-NEXT:    call void @llvm.stackrestore.p0(ptr [[SS4]])
+-; CHECK-NEXT:    [[TMP4:%.*]] = call ptr @inalloca_i64(ptr inalloca(i64) [[I5]])
+-; CHECK-NEXT:    br label [[END]]
+-; CHECK:       end:
+-; CHECK-NEXT:    call void @llvm.stackrestore.p0(ptr [[SS]])
+-; CHECK-NEXT:    ret void
+-;
+-  %ss = call ptr @llvm.stacksave()
+-  switch i64 %i, label %bb0 [
+-  i64 1, label %bb1
+-  i64 2, label %bb2
+-  ]
+-
+-bb0:
+-  %i1 = alloca inalloca i32
+-  %ss2 = call ptr @llvm.stacksave()
+-  %i2 = alloca inalloca i64
+-  call void @inalloca_i64(ptr inalloca(i64) %i2)
+-  call void @llvm.stackrestore(ptr %ss2)
+-  call void @inalloca_i32(ptr inalloca(i32) %i1)
+-  br label %end
+-
+-bb1:
+-  %i3 = alloca inalloca i64
+-  %ss3 = call ptr @llvm.stacksave()
+-  %i4 = alloca inalloca i64
+-  call ptr @inalloca_i64(ptr inalloca(i64) %i4)
+-  call void @llvm.stackrestore(ptr %ss3)
+-  call ptr @inalloca_i64(ptr inalloca(i64) %i3)
+-  br label %end
+-
+-bb2:
+-  %i5 = alloca inalloca i64
+-  %ss4 = call ptr @llvm.stacksave()
+-  %i6 = alloca inalloca i64
+-  call ptr @inalloca_i64(ptr inalloca(i64) %i6)
+-  call void @llvm.stackrestore(ptr %ss4)
+-  call ptr @inalloca_i64(ptr inalloca(i64) %i5)
+-  br label %end
+-
+-end:
+   call void @llvm.stackrestore(ptr %ss)
+   ret void
+ }
+diff -ruN --strip-trailing-cr a/llvm/test/Transforms/SimplifyCFG/hoist-with-metadata.ll b/llvm/test/Transforms/SimplifyCFG/hoist-with-metadata.ll
+--- a/llvm/test/Transforms/SimplifyCFG/hoist-with-metadata.ll
++++ b/llvm/test/Transforms/SimplifyCFG/hoist-with-metadata.ll
+@@ -21,8 +21,20 @@
+ 
+ define void @hoist_range_switch(i64 %i, ptr %p) {
+ ; CHECK-LABEL: @hoist_range_switch(
+-; CHECK-NEXT:  out:
++; CHECK-NEXT:    switch i64 [[I:%.*]], label [[BB0:%.*]] [
++; CHECK-NEXT:    i64 1, label [[BB1:%.*]]
++; CHECK-NEXT:    i64 2, label [[BB2:%.*]]
++; CHECK-NEXT:    ]
++; CHECK:       bb0:
+ ; CHECK-NEXT:    [[T:%.*]] = load i8, ptr [[P:%.*]], align 1, !range [[RNG1:![0-9]+]]
++; CHECK-NEXT:    br label [[OUT:%.*]]
++; CHECK:       bb1:
++; CHECK-NEXT:    [[E:%.*]] = load i8, ptr [[P]], align 1, !range [[RNG2:![0-9]+]]
++; CHECK-NEXT:    br label [[OUT]]
++; CHECK:       bb2:
++; CHECK-NEXT:    [[F:%.*]] = load i8, ptr [[P]], align 1, !range [[RNG3:![0-9]+]]
++; CHECK-NEXT:    br label [[OUT]]
++; CHECK:       out:
+ ; CHECK-NEXT:    ret void
+ ;
+   switch i64 %i, label %bb0 [
+@@ -45,7 +57,7 @@
+ define void @hoist_both_noundef(i1 %c, ptr %p) {
+ ; CHECK-LABEL: @hoist_both_noundef(
+ ; CHECK-NEXT:  if:
+-; CHECK-NEXT:    [[T:%.*]] = load i8, ptr [[P:%.*]], align 1, !noundef !2
++; CHECK-NEXT:    [[T:%.*]] = load i8, ptr [[P:%.*]], align 1, !noundef !4
+ ; CHECK-NEXT:    ret void
+ ;
+ if:
+@@ -66,8 +78,20 @@
+ 
+ define void @hoist_both_noundef_switch(i64 %i, ptr %p) {
+ ; CHECK-LABEL: @hoist_both_noundef_switch(
+-; CHECK-NEXT:  out:
+-; CHECK-NEXT:    [[T:%.*]] = load i8, ptr [[P:%.*]], align 1, !noundef !2
++; CHECK-NEXT:    switch i64 [[I:%.*]], label [[BB0:%.*]] [
++; CHECK-NEXT:    i64 1, label [[BB1:%.*]]
++; CHECK-NEXT:    i64 2, label [[BB2:%.*]]
++; CHECK-NEXT:    ]
++; CHECK:       bb0:
++; CHECK-NEXT:    [[T:%.*]] = load i8, ptr [[P:%.*]], align 1, !noundef !4
++; CHECK-NEXT:    br label [[OUT:%.*]]
++; CHECK:       bb1:
++; CHECK-NEXT:    [[E:%.*]] = load i8, ptr [[P]], align 1, !noundef !4
++; CHECK-NEXT:    br label [[OUT]]
++; CHECK:       bb2:
++; CHECK-NEXT:    [[F:%.*]] = load i8, ptr [[P]], align 1, !noundef !4
++; CHECK-NEXT:    br label [[OUT]]
++; CHECK:       out:
+ ; CHECK-NEXT:    ret void
+ ;
+   switch i64 %i, label %bb0 [
+@@ -110,8 +134,20 @@
+ 
+ define void @hoist_one_noundef_switch(i64 %i, ptr %p) {
+ ; CHECK-LABEL: @hoist_one_noundef_switch(
+-; CHECK-NEXT:  out:
+-; CHECK-NEXT:    [[T:%.*]] = load i8, ptr [[P:%.*]], align 1
++; CHECK-NEXT:    switch i64 [[I:%.*]], label [[BB0:%.*]] [
++; CHECK-NEXT:    i64 1, label [[BB1:%.*]]
++; CHECK-NEXT:    i64 2, label [[BB2:%.*]]
++; CHECK-NEXT:    ]
++; CHECK:       bb0:
++; CHECK-NEXT:    [[T:%.*]] = load i8, ptr [[P:%.*]], align 1, !noundef !4
++; CHECK-NEXT:    br label [[OUT:%.*]]
++; CHECK:       bb1:
++; CHECK-NEXT:    [[E:%.*]] = load i8, ptr [[P]], align 1
++; CHECK-NEXT:    br label [[OUT]]
++; CHECK:       bb2:
++; CHECK-NEXT:    [[F:%.*]] = load i8, ptr [[P]], align 1, !noundef !4
++; CHECK-NEXT:    br label [[OUT]]
++; CHECK:       out:
+ ; CHECK-NEXT:    ret void
+ ;
+   switch i64 %i, label %bb0 [
+@@ -134,7 +170,7 @@
+ define void @hoist_dereferenceable(i1 %c, ptr %p) {
+ ; CHECK-LABEL: @hoist_dereferenceable(
+ ; CHECK-NEXT:  if:
+-; CHECK-NEXT:    [[T:%.*]] = load ptr, ptr [[P:%.*]], align 8, !dereferenceable !3
++; CHECK-NEXT:    [[T:%.*]] = load ptr, ptr [[P:%.*]], align 8, !dereferenceable !5
+ ; CHECK-NEXT:    ret void
+ ;
+ if:
+@@ -151,8 +187,20 @@
+ 
+ define void @hoist_dereferenceable_switch(i64 %i, ptr %p) {
+ ; CHECK-LABEL: @hoist_dereferenceable_switch(
+-; CHECK-NEXT:  out:
+-; CHECK-NEXT:    [[T:%.*]] = load ptr, ptr [[P:%.*]], align 8, !dereferenceable !3
++; CHECK-NEXT:    switch i64 [[I:%.*]], label [[BB0:%.*]] [
++; CHECK-NEXT:    i64 1, label [[BB1:%.*]]
++; CHECK-NEXT:    i64 2, label [[BB2:%.*]]
++; CHECK-NEXT:    ]
++; CHECK:       bb0:
++; CHECK-NEXT:    [[T:%.*]] = load ptr, ptr [[P:%.*]], align 8, !dereferenceable !5
++; CHECK-NEXT:    br label [[OUT:%.*]]
++; CHECK:       bb1:
++; CHECK-NEXT:    [[E:%.*]] = load ptr, ptr [[P]], align 8, !dereferenceable !6
++; CHECK-NEXT:    br label [[OUT]]
++; CHECK:       bb2:
++; CHECK-NEXT:    [[F:%.*]] = load ptr, ptr [[P]], align 8, !dereferenceable !7
++; CHECK-NEXT:    br label [[OUT]]
++; CHECK:       out:
+ ; CHECK-NEXT:    ret void
+ ;
+   switch i64 %i, label %bb0 [
+@@ -175,7 +223,7 @@
+ define void @hoist_dereferenceable_or_null(i1 %c, ptr %p) {
+ ; CHECK-LABEL: @hoist_dereferenceable_or_null(
+ ; CHECK-NEXT:  if:
+-; CHECK-NEXT:    [[T:%.*]] = load ptr, ptr [[P:%.*]], align 8, !dereferenceable_or_null !3
++; CHECK-NEXT:    [[T:%.*]] = load ptr, ptr [[P:%.*]], align 8, !dereferenceable_or_null !5
+ ; CHECK-NEXT:    ret void
+ ;
+ if:
+@@ -192,8 +240,20 @@
+ 
+ define void @hoist_dereferenceable_or_null_switch(i64 %i, ptr %p) {
+ ; CHECK-LABEL: @hoist_dereferenceable_or_null_switch(
+-; CHECK-NEXT:  out:
+-; CHECK-NEXT:    [[T:%.*]] = load ptr, ptr [[P:%.*]], align 8, !dereferenceable_or_null !3
++; CHECK-NEXT:    switch i64 [[I:%.*]], label [[BB0:%.*]] [
++; CHECK-NEXT:    i64 1, label [[BB1:%.*]]
++; CHECK-NEXT:    i64 2, label [[BB2:%.*]]
++; CHECK-NEXT:    ]
++; CHECK:       bb0:
++; CHECK-NEXT:    [[T:%.*]] = load ptr, ptr [[P:%.*]], align 8, !dereferenceable_or_null !6
++; CHECK-NEXT:    br label [[OUT:%.*]]
++; CHECK:       bb1:
++; CHECK-NEXT:    [[E:%.*]] = load ptr, ptr [[P]], align 8, !dereferenceable_or_null !5
++; CHECK-NEXT:    br label [[OUT]]
++; CHECK:       bb2:
++; CHECK-NEXT:    [[F:%.*]] = load ptr, ptr [[P]], align 8, !dereferenceable_or_null !7
++; CHECK-NEXT:    br label [[OUT]]
++; CHECK:       out:
+ ; CHECK-NEXT:    ret void
+ ;
+   switch i64 %i, label %bb0 [
+@@ -217,7 +277,7 @@
+ define i32 @speculate_range(i1 %c, ptr dereferenceable(8) align 8 %p) {
+ ; CHECK-LABEL: @speculate_range(
+ ; CHECK-NEXT:  entry:
+-; CHECK-NEXT:    [[V:%.*]] = load i32, ptr [[P:%.*]], align 4, !range [[RNG4:![0-9]+]]
++; CHECK-NEXT:    [[V:%.*]] = load i32, ptr [[P:%.*]], align 4, !range [[RNG8:![0-9]+]]
+ ; CHECK-NEXT:    [[SPEC_SELECT:%.*]] = select i1 [[C:%.*]], i32 [[V]], i32 0
+ ; CHECK-NEXT:    ret i32 [[SPEC_SELECT]]
+ ;
+@@ -238,7 +298,7 @@
+ define ptr @speculate_nonnull(i1 %c, ptr dereferenceable(8) align 8 %p) {
+ ; CHECK-LABEL: @speculate_nonnull(
+ ; CHECK-NEXT:  entry:
+-; CHECK-NEXT:    [[V:%.*]] = load ptr, ptr [[P:%.*]], align 8, !nonnull !2
++; CHECK-NEXT:    [[V:%.*]] = load ptr, ptr [[P:%.*]], align 8, !nonnull !4
+ ; CHECK-NEXT:    [[SPEC_SELECT:%.*]] = select i1 [[C:%.*]], ptr [[V]], ptr null
+ ; CHECK-NEXT:    ret ptr [[SPEC_SELECT]]
+ ;
+@@ -259,7 +319,7 @@
+ define ptr @speculate_align(i1 %c, ptr dereferenceable(8) align 8 %p) {
+ ; CHECK-LABEL: @speculate_align(
+ ; CHECK-NEXT:  entry:
+-; CHECK-NEXT:    [[V:%.*]] = load ptr, ptr [[P:%.*]], align 8, !align !5
++; CHECK-NEXT:    [[V:%.*]] = load ptr, ptr [[P:%.*]], align 8, !align !9
+ ; CHECK-NEXT:    [[SPEC_SELECT:%.*]] = select i1 [[C:%.*]], ptr [[V]], ptr null
+ ; CHECK-NEXT:    ret ptr [[SPEC_SELECT]]
+ ;
+@@ -278,7 +338,7 @@
+ define void @hoist_fpmath(i1 %c, double %x) {
+ ; CHECK-LABEL: @hoist_fpmath(
+ ; CHECK-NEXT:  if:
+-; CHECK-NEXT:    [[T:%.*]] = fadd double [[X:%.*]], 1.000000e+00, !fpmath !6
++; CHECK-NEXT:    [[T:%.*]] = fadd double [[X:%.*]], 1.000000e+00, !fpmath !10
+ ; CHECK-NEXT:    ret void
+ ;
+ if:
+@@ -295,8 +355,20 @@
+ 
+ define void @hoist_fpmath_switch(i64 %i, double %x) {
+ ; CHECK-LABEL: @hoist_fpmath_switch(
+-; CHECK-NEXT:  out:
+-; CHECK-NEXT:    [[T:%.*]] = fadd double [[X:%.*]], 1.000000e+00, !fpmath !6
++; CHECK-NEXT:    switch i64 [[I:%.*]], label [[BB0:%.*]] [
++; CHECK-NEXT:    i64 1, label [[BB1:%.*]]
++; CHECK-NEXT:    i64 2, label [[BB2:%.*]]
++; CHECK-NEXT:    ]
++; CHECK:       bb0:
++; CHECK-NEXT:    [[T:%.*]] = fadd double [[X:%.*]], 1.000000e+00, !fpmath !10
++; CHECK-NEXT:    br label [[OUT:%.*]]
++; CHECK:       bb1:
++; CHECK-NEXT:    [[E:%.*]] = fadd double [[X]], 1.000000e+00, !fpmath !11
++; CHECK-NEXT:    br label [[OUT]]
++; CHECK:       bb2:
++; CHECK-NEXT:    [[F:%.*]] = fadd double [[X]], 1.000000e+00, !fpmath !12
++; CHECK-NEXT:    br label [[OUT]]
++; CHECK:       out:
+ ; CHECK-NEXT:    ret void
+ ;
+   switch i64 %i, label %bb0 [
+@@ -322,10 +394,16 @@
+ !3 = !{ i8 7, i8 9 }
+ ;.
+ ; CHECK: [[RNG0]] = !{i8 0, i8 1, i8 3, i8 5}
+-; CHECK: [[RNG1]] = !{i8 0, i8 1, i8 3, i8 5, i8 7, i8 9}
+-; CHECK: [[META2:![0-9]+]] = !{}
+-; CHECK: [[META3:![0-9]+]] = !{i64 10}
+-; CHECK: [[RNG4]] = !{i32 0, i32 10}
+-; CHECK: [[META5:![0-9]+]] = !{i64 4}
+-; CHECK: [[META6:![0-9]+]] = !{float 2.500000e+00}
++; CHECK: [[RNG1]] = !{i8 0, i8 1}
++; CHECK: [[RNG2]] = !{i8 3, i8 5}
++; CHECK: [[RNG3]] = !{i8 7, i8 9}
++; CHECK: [[META4:![0-9]+]] = !{}
++; CHECK: [[META5:![0-9]+]] = !{i64 10}
++; CHECK: [[META6:![0-9]+]] = !{i64 20}
++; CHECK: [[META7:![0-9]+]] = !{i64 30}
++; CHECK: [[RNG8]] = !{i32 0, i32 10}
++; CHECK: [[META9:![0-9]+]] = !{i64 4}
++; CHECK: [[META10:![0-9]+]] = !{float 2.500000e+00}
++; CHECK: [[META11:![0-9]+]] = !{float 5.000000e+00}
++; CHECK: [[META12:![0-9]+]] = !{float 7.500000e+00}
+ ;.
diff --git a/third_party/llvm/workspace.bzl b/third_party/llvm/workspace.bzl
index a1abdc50e811c6..6b039a5a6e9efe 100644
--- a/third_party/llvm/workspace.bzl
+++ b/third_party/llvm/workspace.bzl
@@ -4,8 +4,8 @@ load("//third_party:repo.bzl", "tf_http_archive")
 
 def repo(name):
     """Imports LLVM."""
-    LLVM_COMMIT = "2baf4a06ef06c51c2ef09f981f204983b0f8082c"
-    LLVM_SHA256 = "0ce881f09d65b27810160d02842d42259209506fac98f3f9389059c8b8429d69"
+    LLVM_COMMIT = "afd7db48c55cb87566758e961f1ebac8af16b8bc"
+    LLVM_SHA256 = "64f1436eb824ee7f6125ae06c7c337c8edfa8763767f38d7fd218ca02b0311c3"
 
     tf_http_archive(
         name = name,

From 463c8cd5de83752f9785f5eced74d1adaac7ddc1 Mon Sep 17 00:00:00 2001
From: Eugene Zhulenev <ezhulenev@google.com>
Date: Wed, 20 Sep 2023 19:07:39 -0700
Subject: [PATCH 069/567] [stream_executor] NFC: Remove platform/logging.h from
 StreamExecutor

https://github.com/openxla/xla/issues/5761

PiperOrigin-RevId: 567148608
---
 third_party/xla/xla/stream_executor/BUILD     |  5 ++++
 .../xla/xla/stream_executor/cuda/cuda_blas.cc |  2 +-
 .../stream_executor/cuda/cuda_diagnostics.cc  |  6 ++--
 .../xla/xla/stream_executor/cuda/cuda_dnn.cc  |  2 +-
 .../xla/stream_executor/cuda/cuda_driver.cc   |  2 +-
 .../xla/xla/stream_executor/cuda/cuda_fft.cc  |  2 +-
 .../xla/xla/stream_executor/device_options.h  |  2 +-
 third_party/xla/xla/stream_executor/dnn.h     |  2 +-
 third_party/xla/xla/stream_executor/gpu/BUILD |  1 +
 .../xla/xla/stream_executor/gpu/gpu_kernel.h  |  2 +-
 third_party/xla/xla/stream_executor/kernel.cc |  2 +-
 .../xla/xla/stream_executor/kernel_spec.h     |  2 +-
 .../xla/xla/stream_executor/module_spec.h     |  2 +-
 .../xla/xla/stream_executor/platform.cc       |  2 +-
 .../xla/xla/stream_executor/platform/BUILD    |  1 -
 .../xla/stream_executor/platform/logging.h    | 30 -------------------
 .../xla/xla/stream_executor/rocm/rocm_blas.cc |  2 +-
 .../stream_executor/rocm/rocm_diagnostics.cc  |  2 +-
 .../xla/xla/stream_executor/rocm/rocm_dnn.cc  |  2 +-
 .../xla/stream_executor/rocm/rocm_driver.cc   |  2 +-
 .../xla/xla/stream_executor/rocm/rocm_fft.cc  |  5 ++--
 .../stream_executor/rocm/rocm_gpu_executor.cc |  4 +--
 third_party/xla/xla/stream_executor/stream.cc |  8 ++---
 .../temporary_memory_manager.cc               |  2 +-
 24 files changed, 32 insertions(+), 60 deletions(-)
 delete mode 100644 third_party/xla/xla/stream_executor/platform/logging.h

diff --git a/third_party/xla/xla/stream_executor/BUILD b/third_party/xla/xla/stream_executor/BUILD
index f4b13f734c8301..5d40427a31f392 100644
--- a/third_party/xla/xla/stream_executor/BUILD
+++ b/third_party/xla/xla/stream_executor/BUILD
@@ -177,6 +177,7 @@ cc_library(
         "//xla/stream_executor/platform",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/synchronization",
+        "@local_tsl//tsl/platform:logging",
     ],
 )
 
@@ -193,6 +194,7 @@ cc_library(
     deps = [
         "//xla/stream_executor/platform",
         "@com_google_absl//absl/types:span",
+        "@local_tsl//tsl/platform:logging",
     ],
 )
 
@@ -249,6 +251,7 @@ cc_library(
     deps = [
         "//xla/stream_executor/platform",
         "@com_google_absl//absl/strings",
+        "@local_tsl//tsl/platform:logging",
     ],
 )
 
@@ -281,6 +284,7 @@ cc_library(
         "//xla/stream_executor/platform",
         "@com_google_absl//absl/strings",
         "@local_tsl//tsl/platform:errors",
+        "@local_tsl//tsl/platform:logging",
         "@local_tsl//tsl/platform:status",
         "@local_tsl//tsl/platform:statusor",
     ],
@@ -699,6 +703,7 @@ cc_library(
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/strings:str_format",
         "@com_google_absl//absl/synchronization",
+        "@local_tsl//tsl/platform:logging",
         "@local_tsl//tsl/platform:status",
         "@local_tsl//tsl/platform:statusor",
     ],
diff --git a/third_party/xla/xla/stream_executor/cuda/cuda_blas.cc b/third_party/xla/xla/stream_executor/cuda/cuda_blas.cc
index f9a8ceef02bbf2..d9d5451eb912a7 100644
--- a/third_party/xla/xla/stream_executor/cuda/cuda_blas.cc
+++ b/third_party/xla/xla/stream_executor/cuda/cuda_blas.cc
@@ -37,11 +37,11 @@ limitations under the License.
 #include "xla/stream_executor/gpu/gpu_types.h"
 #include "xla/stream_executor/numeric_options.h"
 #include "xla/stream_executor/platform/initialize.h"
-#include "xla/stream_executor/platform/logging.h"
 #include "xla/stream_executor/platform/port.h"
 #include "xla/stream_executor/plugin_registry.h"
 #include "xla/stream_executor/scratch_allocator.h"
 #include "xla/stream_executor/stream_executor.h"
+#include "tsl/platform/logging.h"
 #include "tsl/platform/status.h"
 #include "tsl/platform/tensor_float_32_utils.h"
 
diff --git a/third_party/xla/xla/stream_executor/cuda/cuda_diagnostics.cc b/third_party/xla/xla/stream_executor/cuda/cuda_diagnostics.cc
index 4a5395f7a8c8e1..ad1aa615f4cce5 100644
--- a/third_party/xla/xla/stream_executor/cuda/cuda_diagnostics.cc
+++ b/third_party/xla/xla/stream_executor/cuda/cuda_diagnostics.cc
@@ -45,8 +45,8 @@ limitations under the License.
 #include "absl/strings/str_format.h"
 #include "absl/strings/str_split.h"
 #include "absl/strings/strip.h"
-#include "xla/stream_executor/platform/logging.h"
 #include "tsl/platform/host_info.h"
+#include "tsl/platform/logging.h"
 #include "tsl/platform/status.h"
 
 namespace stream_executor {
@@ -245,8 +245,8 @@ tsl::StatusOr<DriverVersion> Diagnostician::FindDsoVersion() {
 #if !defined(PLATFORM_WINDOWS) && !defined(ANDROID_TEGRA)
   // Callback used when iterating through DSOs. Looks for the driver-interfacing
   // DSO and yields its version number into the callback data, when found.
-  auto iterate_phdr =
-      [](struct dl_phdr_info *info, size_t size, void *data) -> int {
+  auto iterate_phdr = [](struct dl_phdr_info *info, size_t size,
+                         void *data) -> int {
     if (strstr(info->dlpi_name, "libcuda.so.1")) {
       VLOG(1) << "found DLL info with name: " << info->dlpi_name;
       char resolved_path[PATH_MAX] = {0};
diff --git a/third_party/xla/xla/stream_executor/cuda/cuda_dnn.cc b/third_party/xla/xla/stream_executor/cuda/cuda_dnn.cc
index 3a174e92377600..c6496ca684ab85 100644
--- a/third_party/xla/xla/stream_executor/cuda/cuda_dnn.cc
+++ b/third_party/xla/xla/stream_executor/cuda/cuda_dnn.cc
@@ -46,7 +46,6 @@ limitations under the License.
 #include "xla/stream_executor/gpu/gpu_timer.h"
 #include "xla/stream_executor/numeric_options.h"
 #include "xla/stream_executor/platform/initialize.h"
-#include "xla/stream_executor/platform/logging.h"
 #include "xla/stream_executor/plugin_registry.h"
 #include "xla/stream_executor/scratch_allocator.h"
 #include "xla/stream_executor/stream.h"
@@ -54,6 +53,7 @@ limitations under the License.
 #include "xla/stream_executor/stream_executor_pimpl.h"
 #include "tsl/cuda/cudnn_version.h"
 #include "tsl/platform/errors.h"
+#include "tsl/platform/logging.h"
 #include "tsl/platform/tensor_float_32_utils.h"
 #include "tsl/util/env_var.h"
 
diff --git a/third_party/xla/xla/stream_executor/cuda/cuda_driver.cc b/third_party/xla/xla/stream_executor/cuda/cuda_driver.cc
index b1e217caab9bfd..bf180cb86486d1 100644
--- a/third_party/xla/xla/stream_executor/cuda/cuda_driver.cc
+++ b/third_party/xla/xla/stream_executor/cuda/cuda_driver.cc
@@ -40,10 +40,10 @@ limitations under the License.
 #include "third_party/gpus/cuda/include/driver_types.h"
 #include "xla/stream_executor/cuda/cuda_diagnostics.h"
 #include "xla/stream_executor/gpu/gpu_driver.h"
-#include "xla/stream_executor/platform/logging.h"
 #include "xla/stream_executor/platform/port.h"
 #include "tsl/platform/env.h"
 #include "tsl/platform/errors.h"
+#include "tsl/platform/logging.h"
 #include "tsl/platform/stacktrace.h"
 #include "tsl/platform/static_threadlocal.h"
 #include "tsl/platform/status.h"
diff --git a/third_party/xla/xla/stream_executor/cuda/cuda_fft.cc b/third_party/xla/xla/stream_executor/cuda/cuda_fft.cc
index 8e27874af32f04..f7eec72b14e3a5 100644
--- a/third_party/xla/xla/stream_executor/cuda/cuda_fft.cc
+++ b/third_party/xla/xla/stream_executor/cuda/cuda_fft.cc
@@ -27,12 +27,12 @@ limitations under the License.
 #include "xla/stream_executor/cuda/cuda_stream.h"
 #include "xla/stream_executor/device_memory.h"
 #include "xla/stream_executor/platform/initialize.h"
-#include "xla/stream_executor/platform/logging.h"
 #include "xla/stream_executor/platform/port.h"
 #include "xla/stream_executor/plugin_registry.h"
 #include "xla/stream_executor/stream.h"
 #include "xla/stream_executor/stream_executor_internal.h"
 #include "tsl/platform/errors.h"
+#include "tsl/platform/logging.h"
 #include "tsl/platform/status.h"
 
 namespace stream_executor {
diff --git a/third_party/xla/xla/stream_executor/device_options.h b/third_party/xla/xla/stream_executor/device_options.h
index d89c2e96b3e86b..5fc86b25e1c97a 100644
--- a/third_party/xla/xla/stream_executor/device_options.h
+++ b/third_party/xla/xla/stream_executor/device_options.h
@@ -22,8 +22,8 @@ limitations under the License.
 #include <map>
 
 #include "absl/strings/str_join.h"
-#include "xla/stream_executor/platform/logging.h"
 #include "xla/stream_executor/platform/port.h"
+#include "tsl/platform/logging.h"
 
 namespace stream_executor {
 
diff --git a/third_party/xla/xla/stream_executor/dnn.h b/third_party/xla/xla/stream_executor/dnn.h
index b07a6a1819fef6..1cf598e7f37141 100644
--- a/third_party/xla/xla/stream_executor/dnn.h
+++ b/third_party/xla/xla/stream_executor/dnn.h
@@ -43,8 +43,8 @@ limitations under the License.
 #include "xla/stream_executor/device_memory.h"
 #include "xla/stream_executor/dnn.pb.h"
 #include "xla/stream_executor/numeric_options.h"
-#include "xla/stream_executor/platform/logging.h"
 #include "xla/stream_executor/platform/port.h"
+#include "tsl/platform/logging.h"
 #include "tsl/platform/status.h"
 #include "tsl/platform/statusor.h"
 
diff --git a/third_party/xla/xla/stream_executor/gpu/BUILD b/third_party/xla/xla/stream_executor/gpu/BUILD
index 0f19b4852f5235..a790045a3e42f0 100644
--- a/third_party/xla/xla/stream_executor/gpu/BUILD
+++ b/third_party/xla/xla/stream_executor/gpu/BUILD
@@ -199,6 +199,7 @@ cc_library(
         "//xla/stream_executor:stream_executor_internal",
         "//xla/stream_executor:stream_executor_pimpl_header",
         "//xla/stream_executor/platform",
+        "@local_tsl//tsl/platform:logging",
     ],
 )
 
diff --git a/third_party/xla/xla/stream_executor/gpu/gpu_kernel.h b/third_party/xla/xla/stream_executor/gpu/gpu_kernel.h
index e8eb1f1c9eee31..b7fa33c87e5ace 100644
--- a/third_party/xla/xla/stream_executor/gpu/gpu_kernel.h
+++ b/third_party/xla/xla/stream_executor/gpu/gpu_kernel.h
@@ -24,9 +24,9 @@ limitations under the License.
 
 #include "xla/stream_executor/gpu/gpu_driver.h"
 #include "xla/stream_executor/kernel_cache_config.h"
-#include "xla/stream_executor/platform/logging.h"
 #include "xla/stream_executor/platform/port.h"
 #include "xla/stream_executor/stream_executor_internal.h"
+#include "tsl/platform/logging.h"
 
 namespace stream_executor {
 namespace gpu {
diff --git a/third_party/xla/xla/stream_executor/kernel.cc b/third_party/xla/xla/stream_executor/kernel.cc
index b4a9f9aa7e1618..d09f25873c8555 100644
--- a/third_party/xla/xla/stream_executor/kernel.cc
+++ b/third_party/xla/xla/stream_executor/kernel.cc
@@ -22,10 +22,10 @@ limitations under the License.
 #include "absl/strings/string_view.h"
 #include "absl/strings/strip.h"
 #include "xla/stream_executor/platform.h"
-#include "xla/stream_executor/platform/logging.h"
 #include "xla/stream_executor/platform/port.h"
 #include "xla/stream_executor/stream_executor.h"
 #include "tsl/platform/demangle.h"
+#include "tsl/platform/logging.h"
 
 namespace stream_executor {
 
diff --git a/third_party/xla/xla/stream_executor/kernel_spec.h b/third_party/xla/xla/stream_executor/kernel_spec.h
index 8b33a5fe8dd7f7..70ee8b607faa00 100644
--- a/third_party/xla/xla/stream_executor/kernel_spec.h
+++ b/third_party/xla/xla/stream_executor/kernel_spec.h
@@ -53,8 +53,8 @@ limitations under the License.
 
 #include "absl/strings/string_view.h"
 #include "absl/synchronization/mutex.h"
-#include "xla/stream_executor/platform/logging.h"
 #include "xla/stream_executor/platform/port.h"
+#include "tsl/platform/logging.h"
 
 namespace stream_executor {
 
diff --git a/third_party/xla/xla/stream_executor/module_spec.h b/third_party/xla/xla/stream_executor/module_spec.h
index c797df1559e18c..fe34a49626ecc9 100644
--- a/third_party/xla/xla/stream_executor/module_spec.h
+++ b/third_party/xla/xla/stream_executor/module_spec.h
@@ -19,8 +19,8 @@ limitations under the License.
 #include <cstdint>
 
 #include "absl/types/span.h"
-#include "xla/stream_executor/platform/logging.h"
 #include "xla/stream_executor/platform/port.h"
+#include "tsl/platform/logging.h"
 
 namespace stream_executor {
 
diff --git a/third_party/xla/xla/stream_executor/platform.cc b/third_party/xla/xla/stream_executor/platform.cc
index 04ddc16a602bc8..9f2c5c0978f128 100644
--- a/third_party/xla/xla/stream_executor/platform.cc
+++ b/third_party/xla/xla/stream_executor/platform.cc
@@ -16,10 +16,10 @@ limitations under the License.
 #include "xla/stream_executor/platform.h"
 
 #include "absl/strings/str_cat.h"
-#include "xla/stream_executor/platform/logging.h"
 #include "xla/stream_executor/platform/port.h"
 #include "xla/stream_executor/stream_executor_pimpl.h"
 #include "tsl/platform/errors.h"
+#include "tsl/platform/logging.h"
 
 namespace stream_executor {
 
diff --git a/third_party/xla/xla/stream_executor/platform/BUILD b/third_party/xla/xla/stream_executor/platform/BUILD
index 85a9641f51ce7d..084de6c15f4ebb 100644
--- a/third_party/xla/xla/stream_executor/platform/BUILD
+++ b/third_party/xla/xla/stream_executor/platform/BUILD
@@ -17,7 +17,6 @@ cc_library(
     name = "platform",
     textual_hdrs = [
         "initialize.h",
-        "logging.h",
         "platform.h",
         "port.h",
     ],
diff --git a/third_party/xla/xla/stream_executor/platform/logging.h b/third_party/xla/xla/stream_executor/platform/logging.h
deleted file mode 100644
index 53121fc4165fe7..00000000000000
--- a/third_party/xla/xla/stream_executor/platform/logging.h
+++ /dev/null
@@ -1,30 +0,0 @@
-/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef XLA_STREAM_EXECUTOR_PLATFORM_LOGGING_H_
-#define XLA_STREAM_EXECUTOR_PLATFORM_LOGGING_H_
-
-#include "xla/stream_executor/platform/port.h"
-#include "tsl/platform/logging.h"
-
-#if !(defined(PLATFORM_GOOGLE) || defined(PLATFORM_GOOGLE_ANDROID) || \
-      defined(PLATFORM_GOOGLE_IOS) || defined(GOOGLE_LOGGING) ||      \
-      defined(__EMSCRIPTEN__) || defined(PLATFORM_CHROMIUMOS))
-
-#define PCHECK(invocation) CHECK(invocation)
-
-#endif
-
-#endif  // XLA_STREAM_EXECUTOR_PLATFORM_LOGGING_H_
diff --git a/third_party/xla/xla/stream_executor/rocm/rocm_blas.cc b/third_party/xla/xla/stream_executor/rocm/rocm_blas.cc
index aa9929d77ceaa5..b426cd810ba407 100644
--- a/third_party/xla/xla/stream_executor/rocm/rocm_blas.cc
+++ b/third_party/xla/xla/stream_executor/rocm/rocm_blas.cc
@@ -34,12 +34,12 @@ limitations under the License.
 #include "xla/stream_executor/gpu/gpu_timer.h"
 #include "xla/stream_executor/platform/dso_loader.h"
 #include "xla/stream_executor/platform/initialize.h"
-#include "xla/stream_executor/platform/logging.h"
 #include "xla/stream_executor/platform/port.h"
 #include "xla/stream_executor/plugin_registry.h"
 #include "xla/stream_executor/rocm/rocm_platform_id.h"
 #include "xla/stream_executor/scratch_allocator.h"
 #include "xla/stream_executor/stream_executor.h"
+#include "tsl/platform/logging.h"
 #include "tsl/util/determinism.h"
 using tsl::OpDeterminismRequired;
 
diff --git a/third_party/xla/xla/stream_executor/rocm/rocm_diagnostics.cc b/third_party/xla/xla/stream_executor/rocm/rocm_diagnostics.cc
index 555ce1cb7a293b..91ef8cf92c0113 100644
--- a/third_party/xla/xla/stream_executor/rocm/rocm_diagnostics.cc
+++ b/third_party/xla/xla/stream_executor/rocm/rocm_diagnostics.cc
@@ -35,9 +35,9 @@ limitations under the License.
 #include "absl/strings/str_format.h"
 #include "absl/strings/str_split.h"
 #include "absl/strings/strip.h"
-#include "xla/stream_executor/platform/logging.h"
 #include "tsl/platform/errors.h"
 #include "tsl/platform/host_info.h"
+#include "tsl/platform/logging.h"
 
 namespace stream_executor {
 namespace rocm {
diff --git a/third_party/xla/xla/stream_executor/rocm/rocm_dnn.cc b/third_party/xla/xla/stream_executor/rocm/rocm_dnn.cc
index dfdb95b690e2e1..b987cdc6b55988 100644
--- a/third_party/xla/xla/stream_executor/rocm/rocm_dnn.cc
+++ b/third_party/xla/xla/stream_executor/rocm/rocm_dnn.cc
@@ -36,7 +36,6 @@ limitations under the License.
 #include "xla/stream_executor/gpu/gpu_timer.h"
 #include "xla/stream_executor/platform/dso_loader.h"
 #include "xla/stream_executor/platform/initialize.h"
-#include "xla/stream_executor/platform/logging.h"
 #include "xla/stream_executor/plugin_registry.h"
 #include "xla/stream_executor/rocm/rocm_diagnostics.h"
 #include "xla/stream_executor/rocm/rocm_platform_id.h"
@@ -46,6 +45,7 @@ limitations under the License.
 #include "tsl/platform/env.h"
 #include "tsl/platform/errors.h"
 #include "tsl/platform/hash.h"
+#include "tsl/platform/logging.h"
 #include "tsl/util/determinism.h"
 #include "tsl/util/env_var.h"
 
diff --git a/third_party/xla/xla/stream_executor/rocm/rocm_driver.cc b/third_party/xla/xla/stream_executor/rocm/rocm_driver.cc
index 752fc93fbd93b2..bde91ca30582c7 100644
--- a/third_party/xla/xla/stream_executor/rocm/rocm_driver.cc
+++ b/third_party/xla/xla/stream_executor/rocm/rocm_driver.cc
@@ -31,11 +31,11 @@ limitations under the License.
 #include "absl/synchronization/notification.h"
 #include "xla/stream_executor/gpu/gpu_diagnostics.h"
 #include "xla/stream_executor/gpu/gpu_driver.h"
-#include "xla/stream_executor/platform/logging.h"
 #include "xla/stream_executor/platform/port.h"
 #include "xla/stream_executor/rocm/rocm_driver_wrapper.h"
 #include "tsl/platform/env.h"
 #include "tsl/platform/errors.h"
+#include "tsl/platform/logging.h"
 #include "tsl/platform/numbers.h"
 #include "tsl/platform/stacktrace.h"
 #include "tsl/platform/static_threadlocal.h"
diff --git a/third_party/xla/xla/stream_executor/rocm/rocm_fft.cc b/third_party/xla/xla/stream_executor/rocm/rocm_fft.cc
index 3131b74858248f..5dc336e996e30c 100644
--- a/third_party/xla/xla/stream_executor/rocm/rocm_fft.cc
+++ b/third_party/xla/xla/stream_executor/rocm/rocm_fft.cc
@@ -24,12 +24,12 @@ limitations under the License.
 #include "xla/stream_executor/gpu/gpu_stream.h"
 #include "xla/stream_executor/platform/dso_loader.h"
 #include "xla/stream_executor/platform/initialize.h"
-#include "xla/stream_executor/platform/logging.h"
 #include "xla/stream_executor/platform/port.h"
 #include "xla/stream_executor/plugin_registry.h"
 #include "xla/stream_executor/rocm/rocm_platform_id.h"
 #include "xla/stream_executor/stream_executor_internal.h"
 #include "tsl/platform/env.h"
+#include "tsl/platform/logging.h"
 
 namespace stream_executor {
 namespace gpu {
@@ -533,8 +533,7 @@ bool ROCMFft::DoFftInternal(Stream *stream, fft::Plan *plan, FuncT hipfftExec,
     if (allocator) {
       auto allocated = allocator->AllocateBytes(input.size());
       if (allocated.ok()) {
-        if (stream->ThenMemcpy(&allocated.value(), input, input.size())
-                .ok()) {
+        if (stream->ThenMemcpy(&allocated.value(), input, input.size()).ok()) {
           input_maybe_copy = DeviceMemory<InputT>(allocated.value());
         } else {
           LOG(ERROR) << "failed to copy input buffer for rocFFT.";
diff --git a/third_party/xla/xla/stream_executor/rocm/rocm_gpu_executor.cc b/third_party/xla/xla/stream_executor/rocm/rocm_gpu_executor.cc
index 5ad9feb70e5652..9c0dd98791a6a5 100644
--- a/third_party/xla/xla/stream_executor/rocm/rocm_gpu_executor.cc
+++ b/third_party/xla/xla/stream_executor/rocm/rocm_gpu_executor.cc
@@ -35,7 +35,6 @@ limitations under the License.
 #include "xla/stream_executor/platform.h"
 #include "xla/stream_executor/platform/dso_loader.h"
 #include "xla/stream_executor/platform/initialize.h"
-#include "xla/stream_executor/platform/logging.h"
 #include "xla/stream_executor/platform/port.h"
 #include "xla/stream_executor/plugin_registry.h"
 #include "xla/stream_executor/rocm/rocm_diagnostics.h"
@@ -45,6 +44,7 @@ limitations under the License.
 #include "xla/stream_executor/stream_executor_pimpl.h"
 #include "tsl/platform/env.h"
 #include "tsl/platform/errors.h"
+#include "tsl/platform/logging.h"
 
 #ifdef PLATFORMS_GPUS_ROCM_DYNAMIC_LIBROCM_DYNAMIC_LIBROCM_H_
 #error \
@@ -182,7 +182,7 @@ tsl::Status GpuExecutor::Init(int device_ordinal,
 //                 would return /usr/bin.
 static string GetBinaryDir(bool strip_exe) {
   char exe_path[PATH_MAX] = {0};
-  PCHECK(readlink("/proc/self/exe", exe_path, sizeof(exe_path) - 1) != -1);
+  CHECK_NE(readlink("/proc/self/exe", exe_path, sizeof(exe_path) - 1), -1);
   // Make sure it's null-terminated:
   exe_path[sizeof(exe_path) - 1] = 0;
 
diff --git a/third_party/xla/xla/stream_executor/stream.cc b/third_party/xla/xla/stream_executor/stream.cc
index fe1b4158ec6e5a..d8cfa7bb793cd4 100644
--- a/third_party/xla/xla/stream_executor/stream.cc
+++ b/third_party/xla/xla/stream_executor/stream.cc
@@ -28,10 +28,10 @@ limitations under the License.
 #include "xla/stream_executor/blas.h"
 #include "xla/stream_executor/numeric_options.h"
 #include "xla/stream_executor/platform.h"
-#include "xla/stream_executor/platform/logging.h"
 #include "xla/stream_executor/platform/port.h"
 #include "xla/stream_executor/stream_executor_internal.h"
 #include "xla/stream_executor/stream_executor_pimpl.h"
+#include "tsl/platform/logging.h"
 #include "tsl/platform/stacktrace.h"
 
 namespace stream_executor {
@@ -238,10 +238,8 @@ std::string CallStr(const char *function_name, Stream *stream,
 
 // Use this macro to avoid having to type every parameter twice to log
 // it with VLOG and CallStr.
-#define PARAM(parameter)                \
-  {                                     \
-#parameter, ToVlogString(parameter) \
-  }
+#define PARAM(parameter) \
+  { #parameter, ToVlogString(parameter) }
 
 // Use this macro to avoid having to type out the name of each
 // function and to save some boilerplate. Intended to be used like this:
diff --git a/third_party/xla/xla/stream_executor/temporary_memory_manager.cc b/third_party/xla/xla/stream_executor/temporary_memory_manager.cc
index c0b6bd2f53d1d8..690f64225611a9 100644
--- a/third_party/xla/xla/stream_executor/temporary_memory_manager.cc
+++ b/third_party/xla/xla/stream_executor/temporary_memory_manager.cc
@@ -19,9 +19,9 @@ limitations under the License.
 
 #include "absl/strings/str_cat.h"
 #include "absl/strings/str_format.h"
-#include "xla/stream_executor/platform/logging.h"
 #include "xla/stream_executor/stream.h"
 #include "xla/stream_executor/stream_executor_pimpl.h"
+#include "tsl/platform/logging.h"
 
 namespace stream_executor {
 namespace internal {

From 67b2bb20ea256517cc514b83cae36aaa79f93fc4 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 20 Sep 2023 19:22:42 -0700
Subject: [PATCH 070/567] Update TFRT dependency to use revision
 http://github.com/tensorflow/runtime/commit/6a3e3ece9c01f3e5742a297c73357d463a2fe151.

PiperOrigin-RevId: 567151719
---
 third_party/tf_runtime/workspace.bzl                          | 4 ++--
 third_party/xla/third_party/tf_runtime/workspace.bzl          | 4 ++--
 .../xla/third_party/tsl/third_party/tf_runtime/workspace.bzl  | 4 ++--
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/third_party/tf_runtime/workspace.bzl b/third_party/tf_runtime/workspace.bzl
index ab7501924e78b2..a788df8889a6a7 100644
--- a/third_party/tf_runtime/workspace.bzl
+++ b/third_party/tf_runtime/workspace.bzl
@@ -6,8 +6,8 @@ def repo():
     """Imports TFRT."""
 
     # Attention: tools parse and update these lines.
-    TFRT_COMMIT = "752d6d83d403986227dffe42beb5014843cf2ddb"
-    TFRT_SHA256 = "818d9b3951c1da81a937a24c3875bfae38664ceb37909e5438bbebf708279a24"
+    TFRT_COMMIT = "6a3e3ece9c01f3e5742a297c73357d463a2fe151"
+    TFRT_SHA256 = "24477cd3e9ac93a1010542de308341f1c112df974f9510c18e4d4efb2de78e59"
 
     tf_http_archive(
         name = "tf_runtime",
diff --git a/third_party/xla/third_party/tf_runtime/workspace.bzl b/third_party/xla/third_party/tf_runtime/workspace.bzl
index ab7501924e78b2..a788df8889a6a7 100644
--- a/third_party/xla/third_party/tf_runtime/workspace.bzl
+++ b/third_party/xla/third_party/tf_runtime/workspace.bzl
@@ -6,8 +6,8 @@ def repo():
     """Imports TFRT."""
 
     # Attention: tools parse and update these lines.
-    TFRT_COMMIT = "752d6d83d403986227dffe42beb5014843cf2ddb"
-    TFRT_SHA256 = "818d9b3951c1da81a937a24c3875bfae38664ceb37909e5438bbebf708279a24"
+    TFRT_COMMIT = "6a3e3ece9c01f3e5742a297c73357d463a2fe151"
+    TFRT_SHA256 = "24477cd3e9ac93a1010542de308341f1c112df974f9510c18e4d4efb2de78e59"
 
     tf_http_archive(
         name = "tf_runtime",
diff --git a/third_party/xla/third_party/tsl/third_party/tf_runtime/workspace.bzl b/third_party/xla/third_party/tsl/third_party/tf_runtime/workspace.bzl
index ab7501924e78b2..a788df8889a6a7 100644
--- a/third_party/xla/third_party/tsl/third_party/tf_runtime/workspace.bzl
+++ b/third_party/xla/third_party/tsl/third_party/tf_runtime/workspace.bzl
@@ -6,8 +6,8 @@ def repo():
     """Imports TFRT."""
 
     # Attention: tools parse and update these lines.
-    TFRT_COMMIT = "752d6d83d403986227dffe42beb5014843cf2ddb"
-    TFRT_SHA256 = "818d9b3951c1da81a937a24c3875bfae38664ceb37909e5438bbebf708279a24"
+    TFRT_COMMIT = "6a3e3ece9c01f3e5742a297c73357d463a2fe151"
+    TFRT_SHA256 = "24477cd3e9ac93a1010542de308341f1c112df974f9510c18e4d4efb2de78e59"
 
     tf_http_archive(
         name = "tf_runtime",

From 413c90e89c1e6d29eef4e4dc0e3b3fbb647addb1 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 20 Sep 2023 19:38:28 -0700
Subject: [PATCH 071/567] Update the logic of `PjRtArray::Reshard` after
 `PjRtBuffer::CopyToMemorySpace` was introduced. Users should use
 `PjRtBuffer::CopyToMemorySpace` instead of `PjRtBuffer::CopyToDevice` when
 memories are supported, since the semantics of the latter one is to always
 copy to the default memory space of the device.

PiperOrigin-RevId: 567154400
---
 .../xla/xla/python/pjrt_ifrt/pjrt_array.cc    | 54 +++++++++++++------
 third_party/xla/xla/python/xla_client.py      |  2 +-
 2 files changed, 39 insertions(+), 17 deletions(-)

diff --git a/third_party/xla/xla/python/pjrt_ifrt/pjrt_array.cc b/third_party/xla/xla/python/pjrt_ifrt/pjrt_array.cc
index 8678ff06b79f5a..205c376246ad38 100644
--- a/third_party/xla/xla/python/pjrt_ifrt/pjrt_array.cc
+++ b/third_party/xla/xla/python/pjrt_ifrt/pjrt_array.cc
@@ -22,6 +22,8 @@ limitations under the License.
 #include <vector>
 
 #include "absl/container/inlined_vector.h"
+#include "absl/log/check.h"
+#include "absl/strings/match.h"
 #include "absl/strings/str_join.h"
 #include "xla/literal.h"
 #include "xla/pjrt/pjrt_client.h"
@@ -340,16 +342,29 @@ StatusOr<tsl::RCReference<Array>> PjRtArray::Reshard(
   // permits device changes and nothing else.
   PjRtBuffers buffers;
   buffers.reserve(pjrt_buffers_.size());
+  // TODO(yueshengys): Add an on-demand canonicalization when all users
+  // (e.g., ifrt proxy) support memories.
+  bool new_sharding_has_memory_kind =
+      new_sharding->memory_kind().memory_kind().has_value();
+  // TODO(yueshengys): Remove the check on PjRt C API after `CopyToMemorySpace`
+  // is supported.
+  CHECK_GT(new_sharding->devices().size(), 0);
+  bool using_c_api = absl::StrContains(
+      new_sharding->devices().front()->client()->platform_version(),
+      "PJRT C API");
   for (int i = 0; i < pjrt_buffers_.size(); ++i) {
+    bool devices_equal =
+        pjrt_buffers_[i]->device() == new_sharding->devices()[i];
+    bool memories_supported =
+        !using_c_api && pjrt_buffers_[i]->memory_space() != nullptr;
     bool memory_kind_equal =
-        !new_sharding->memory_kind().memory_kind().has_value() ||
-        pjrt_buffers_[i]->memory_space() == nullptr ||
+        new_sharding_has_memory_kind && memories_supported &&
         pjrt_buffers_[i]->memory_space()->memory_space_kind() ==
             new_sharding->memory_kind().memory_kind();
-    bool devices_equal =
-        pjrt_buffers_[i]->device() == new_sharding->devices()[i];
 
-    if (devices_equal && memory_kind_equal) {
+    // No need for data transfer.
+    if (devices_equal && (!new_sharding_has_memory_kind ||
+                          !memories_supported || memory_kind_equal)) {
       switch (semantics) {
         case ArrayCopySemantics::kAlwaysCopy:
           // TODO(hyeontaek): kAlwaysCopy should clone the buffer, but the PjRt
@@ -373,25 +388,32 @@ StatusOr<tsl::RCReference<Array>> PjRtArray::Reshard(
             "first fetched to the host and then sent to the destination "
             "device.");
       }
-      if (!devices_equal && memory_kind_equal) {
+      // Use `PjRtBuffer::CopyToMemorySpace` instead of
+      // `PjRtBuffer::CopyToDevice` when memories are supported, because the
+      // semantics of the latter is to always copy to the default memory space
+      // of the device.
+      if (new_sharding_has_memory_kind && memories_supported) {
         TF_ASSIGN_OR_RETURN(
-            std::unique_ptr<xla::PjRtBuffer> copied_buffer,
-            pjrt_buffers_[i]->CopyToDevice(new_sharding->devices()[i]));
+            auto memory_space,
+            GetMemorySpaceFromMemoryKind(new_sharding->devices()[i],
+                                         new_sharding->memory_kind()));
+        TF_ASSIGN_OR_RETURN(std::unique_ptr<PjRtBuffer> copied_buffer,
+                            pjrt_buffers_[i]->CopyToMemorySpace(memory_space));
         if (semantics == ArrayCopySemantics::kDonateInput) {
+          if (!memory_kind_equal) {
+            return Unimplemented(
+                "Donation across different memory kinds is not implemented.");
+          }
           pjrt_buffers_[i] = nullptr;
         }
         buffers.push_back(std::shared_ptr<PjRtBuffer>(copied_buffer.release()));
       } else {
-        // memory_kind is not equal and devices can be equal or not equal.
+        // Use `PjRtBuffer::CopyToDevice` when memories are not supported.
         TF_ASSIGN_OR_RETURN(
-            auto memory_space,
-            GetMemorySpaceFromMemoryKind(new_sharding->devices()[i],
-                                         new_sharding->memory_kind()));
-        TF_ASSIGN_OR_RETURN(std::unique_ptr<PjRtBuffer> copied_buffer,
-                            pjrt_buffers_[i]->CopyToMemorySpace(memory_space));
+            std::unique_ptr<xla::PjRtBuffer> copied_buffer,
+            pjrt_buffers_[i]->CopyToDevice(new_sharding->devices()[i]));
         if (semantics == ArrayCopySemantics::kDonateInput) {
-          return Unimplemented(
-              "Donation across different memory kinds is not implemented.");
+          pjrt_buffers_[i] = nullptr;
         }
         buffers.push_back(std::shared_ptr<PjRtBuffer>(copied_buffer.release()));
       }
diff --git a/third_party/xla/xla/python/xla_client.py b/third_party/xla/xla/python/xla_client.py
index 3de4938fdb1204..0c19224f787919 100644
--- a/third_party/xla/xla/python/xla_client.py
+++ b/third_party/xla/xla/python/xla_client.py
@@ -44,7 +44,7 @@
 
 # Just an internal arbitrary increasing number to help with backward-compatible
 # changes. In JAX, reference this via jax._src.lib.xla_extension_version.
-_version = 195
+_version = 196
 
 # Version number for MLIR:Python components.
 mlir_api_version = 54

From 84c4dd54b92e2af00b4366f87b0bad7ad78a6c9d Mon Sep 17 00:00:00 2001
From: Eugene Zhulenev <ezhulenev@google.com>
Date: Wed, 20 Sep 2023 19:57:58 -0700
Subject: [PATCH 072/567] [stream_executor] NFC: Document exported device
 allocator headers

Issue: #5761
PiperOrigin-RevId: 567157790
---
 third_party/xla/xla/stream_executor/BUILD | 25 +++++++++++++++--------
 1 file changed, 16 insertions(+), 9 deletions(-)

diff --git a/third_party/xla/xla/stream_executor/BUILD b/third_party/xla/xla/stream_executor/BUILD
index 5d40427a31f392..f2c8cffb029550 100644
--- a/third_party/xla/xla/stream_executor/BUILD
+++ b/third_party/xla/xla/stream_executor/BUILD
@@ -55,6 +55,22 @@ cc_library(
     ],
 )
 
+#===--------------------------------------------------------------------------------------------===#
+# Exporting headers for Tensorflow
+#===--------------------------------------------------------------------------------------------===#
+
+# Tensorflow device memory allocators are aliases for StreamExecutor allocators; we export
+# headers for Tensorflow to build shared libraries in OSS.
+
+filegroup(
+    name = "device_mem_allocator_headers",
+    srcs = [
+        "device_host_allocator.h",
+        "device_mem_allocator.h",
+    ],
+    visibility = ["//visibility:public"],
+)
+
 #===--------------------------------------------------------------------------------------------===#
 
 # The stream_executor_headers target does not prescribe an implementation.
@@ -318,15 +334,6 @@ cc_library(
     ],
 )
 
-filegroup(
-    name = "device_mem_allocator_headers",
-    srcs = [
-        "device_host_allocator.h",
-        "device_mem_allocator.h",
-    ],
-    visibility = ["//visibility:public"],
-)
-
 cc_library(
     name = "executor_cache",
     srcs = [

From 9f8e4fa28ad556e1de4b3222c4e2f58bb331f774 Mon Sep 17 00:00:00 2001
From: Jieying Luo <jieying@google.com>
Date: Wed, 20 Sep 2023 21:10:11 -0700
Subject: [PATCH 073/567] Fix parsing visible_devices option in GPU plugin.

PiperOrigin-RevId: 567173464
---
 third_party/xla/xla/pjrt/c/pjrt_c_api_gpu_internal.cc | 2 +-
 third_party/xla/xla/pjrt/c/pjrt_c_api_gpu_test.cc     | 3 ++-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/third_party/xla/xla/pjrt/c/pjrt_c_api_gpu_internal.cc b/third_party/xla/xla/pjrt/c/pjrt_c_api_gpu_internal.cc
index 1cbc0dffe49148..75a14f42118b4d 100644
--- a/third_party/xla/xla/pjrt/c/pjrt_c_api_gpu_internal.cc
+++ b/third_party/xla/xla/pjrt/c/pjrt_c_api_gpu_internal.cc
@@ -86,7 +86,7 @@ PJRT_Error* PJRT_Client_Create(PJRT_Client_Create_Args* args) {
   if (auto it = create_options.find("visible_devices");
       it != create_options.end()) {
     const auto& vec = std::get<std::vector<int64_t>>(it->second);
-    visible_devices->insert(vec.begin(), vec.end());
+    visible_devices.emplace(vec.begin(), vec.end());
   }
   int node_id = 0;
   if (auto it = create_options.find("node_id"); it != create_options.end()) {
diff --git a/third_party/xla/xla/pjrt/c/pjrt_c_api_gpu_test.cc b/third_party/xla/xla/pjrt/c/pjrt_c_api_gpu_test.cc
index 0f3c089acb2c66..63102691defd0c 100644
--- a/third_party/xla/xla/pjrt/c/pjrt_c_api_gpu_test.cc
+++ b/third_party/xla/xla/pjrt/c/pjrt_c_api_gpu_test.cc
@@ -166,13 +166,14 @@ TEST(PjrtCApiGpuKVStoreTest, CreateClientWithKVCallback) {
   }
 }
 
-TEST(PjrtCApiGpuAllocatorTest, ValidAllocatorOptionsParsing) {
+TEST(PjrtCApiGpuAllocatorTest, ValidOptionsParsing) {
   auto api = GetPjrtApi();
   std::vector<std::string> allocator_options = {"default", "platform", "bfc",
                                                 "cuda_async"};
   for (const std::string& allocator_option : allocator_options) {
     absl::flat_hash_map<std::string, xla::PjRtValueType> options = {
         {"allocator", allocator_option},
+        {"visible_devices", xla::PjRtValueType(std::vector<int64_t>{0, 1})},
     };
     if (allocator_option == "bfc" || allocator_option == "cuda_async") {
       options["memory_fraction"] = 0.5f;

From b1995d4f73516a8330a0355b8b06193f44bc8e1f Mon Sep 17 00:00:00 2001
From: Jieying Luo <jieying@google.com>
Date: Wed, 20 Sep 2023 21:10:48 -0700
Subject: [PATCH 074/567] [PJRT C API] Fix
 PjRtCApiClient::ExecutableFingerprint.

When the C API returns an empty fingerprint (a null pointer or a zero size),
PjRtCApiClient::ExecutableFingerprint now returns std::nullopt instead of an
empty string.

PiperOrigin-RevId: 567173579
---
 third_party/xla/xla/pjrt/pjrt_c_api_client.cc |  4 ++++
 .../xla/xla/pjrt/pjrt_c_api_client_test.cc    | 20 +++++++++++++++++++
 2 files changed, 24 insertions(+)

diff --git a/third_party/xla/xla/pjrt/pjrt_c_api_client.cc b/third_party/xla/xla/pjrt/pjrt_c_api_client.cc
index a1a13020341e96..e6940affd26482 100644
--- a/third_party/xla/xla/pjrt/pjrt_c_api_client.cc
+++ b/third_party/xla/xla/pjrt/pjrt_c_api_client.cc
@@ -318,6 +318,10 @@ StatusOr<std::optional<std::string>> PjRtCApiClient::ExecutableFingerprint(
     xla::Status s = ::pjrt::PjrtErrorToStatus(error.get(), c_api_);
     return s;
   }
+  if (args.executable_fingerprint == nullptr ||
+      args.executable_fingerprint_size == 0) {
+    return {std::nullopt};
+  }
   std::string fingerprint = std::string(args.executable_fingerprint,
                                         args.executable_fingerprint_size);
   return {fingerprint};
diff --git a/third_party/xla/xla/pjrt/pjrt_c_api_client_test.cc b/third_party/xla/xla/pjrt/pjrt_c_api_client_test.cc
index 7ce67c622f0640..d4477696f18d7a 100644
--- a/third_party/xla/xla/pjrt/pjrt_c_api_client_test.cc
+++ b/third_party/xla/xla/pjrt/pjrt_c_api_client_test.cc
@@ -99,5 +99,25 @@ TEST(PjRtCApiClientTest, PlatformId) {
   EXPECT_EQ(client->platform_id(), xla::CpuId());
 }
 
+TEST(PjRtCApiClientTest, EmptyExecutableFingerprint) {
+  SetUpCpuPjRtApi();
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<PjRtClient> client,
+                          GetCApiClient("cpu"));
+  Shape shape = ShapeUtil::MakeShapeWithType<float>({4});
+  XlaBuilder builder("sum");
+  auto inp_0 = Parameter(&builder, 0, shape, "input0");
+  auto inp_1 = Parameter(&builder, 1, shape, "input1");
+  auto sum = Add(inp_0, inp_1);
+  builder.SetUpAlias({}, 0, {});
+  auto computation = builder.Build(sum).value();
+  std::unique_ptr<PjRtLoadedExecutable> executable =
+      client->Compile(computation, CompileOptions()).value();
+
+  TF_ASSERT_OK_AND_ASSIGN(std::optional<std::string> fingerprint,
+                          client->ExecutableFingerprint(*executable));
+
+  EXPECT_FALSE(fingerprint.has_value());
+}
+
 }  // namespace
 }  // namespace xla

From 88c8f44c2860956460a4a1722a3d2e020533a6d9 Mon Sep 17 00:00:00 2001
From: "Jiyoun (Jen) Ha" <jiyounha@google.com>
Date: Wed, 20 Sep 2023 22:30:49 -0700
Subject: [PATCH 075/567] (2/N) Refactor stablehlo bridge namespaces to
 mlir::quant::stablehlo.

PiperOrigin-RevId: 567187656
---
 .../mlir/lite/stablehlo/transforms/transforms.cc  |  2 +-
 tensorflow/compiler/mlir/python/mlir.cc           |  2 +-
 .../passes/bridge/convert_mhlo_quant_to_int.cc    |  8 +++-----
 .../passes/bridge/convert_tf_quant_ops_to_mhlo.cc | 15 ++++++---------
 .../bridge/convert_tf_quant_to_mhlo_int_test.cc   |  6 ++----
 .../passes/bridge/convert_tf_quant_types.cc       |  8 +++-----
 .../passes/bridge/convert_tf_quant_types_test.cc  |  8 +++-----
 .../stablehlo/passes/bridge/passes.cc             |  4 ++--
 .../quantization/stablehlo/passes/bridge/passes.h |  4 ++--
 .../stablehlo/passes/bridge/passes.td             |  8 ++++----
 .../passes/bridge/verify_quant_legalization.cc    |  6 ++----
 .../stablehlo/tools/stablehlo_quant_opt.cc        |  2 +-
 .../quantization/tensorflow/quantize_passes.cc    |  3 ++-
 .../tensorflow/quantize_preprocess.cc             |  2 +-
 .../mlir/tf2xla/api/v1/compile_mlir_util.cc       |  4 ++--
 tensorflow/compiler/mlir/tf_mlir_opt_main.cc      |  2 +-
 16 files changed, 36 insertions(+), 48 deletions(-)

diff --git a/tensorflow/compiler/mlir/lite/stablehlo/transforms/transforms.cc b/tensorflow/compiler/mlir/lite/stablehlo/transforms/transforms.cc
index 3b4e3a58bec738..562afd53f6f76c 100644
--- a/tensorflow/compiler/mlir/lite/stablehlo/transforms/transforms.cc
+++ b/tensorflow/compiler/mlir/lite/stablehlo/transforms/transforms.cc
@@ -67,7 +67,7 @@ void AddTFToStablehloPasses(OpPassManager& pm, bool skip_resize,
 
   // Legalizes TF UniformQuantized types into MHLO.
   pm.addNestedPass<func::FuncOp>(
-      mlir::stablehlo::CreateConvertTFQuantOpsToMHLOPass());
+      mlir::quant::stablehlo::CreateConvertTFQuantOpsToMHLOPass());
   pm.addPass(mlir::createCanonicalizerPass());
 
   // TF -> StableHLO legalization.
diff --git a/tensorflow/compiler/mlir/python/mlir.cc b/tensorflow/compiler/mlir/python/mlir.cc
index 2fc6dcdd84cf24..4841c0ad85714f 100644
--- a/tensorflow/compiler/mlir/python/mlir.cc
+++ b/tensorflow/compiler/mlir/python/mlir.cc
@@ -97,7 +97,7 @@ static void RegisterPasses() {
     // passes.
     mlir::mhlo::registerTfXlaPasses();
     mlir::mhlo::registerLegalizeTFPass();
-    mlir::stablehlo::registerBridgePasses();
+    mlir::quant::stablehlo::registerBridgePasses();
     mlir::tosa::registerLegalizeTosaPasses();
     mlir::tosa::registerTFtoTOSALegalizationPipeline();
     mlir::tosa::registerTFLtoTOSALegalizationPipeline();
diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/passes/bridge/convert_mhlo_quant_to_int.cc b/tensorflow/compiler/mlir/quantization/stablehlo/passes/bridge/convert_mhlo_quant_to_int.cc
index 4a26d6d70398b5..255c69573f08eb 100644
--- a/tensorflow/compiler/mlir/quantization/stablehlo/passes/bridge/convert_mhlo_quant_to_int.cc
+++ b/tensorflow/compiler/mlir/quantization/stablehlo/passes/bridge/convert_mhlo_quant_to_int.cc
@@ -46,8 +46,7 @@ limitations under the License.
 #include "xla/mlir_hlo/mhlo/IR/hlo_ops.h"
 #include "xla/mlir_hlo/mhlo/transforms/rewriters.h"
 
-namespace mlir {
-namespace stablehlo {
+namespace mlir::quant::stablehlo {
 namespace {
 
 #define GEN_PASS_DEF_CONVERTMHLOQUANTTOINT
@@ -735,12 +734,11 @@ void ConvertMHLOQuantToInt::runOnOperation() {
   }
 }
 
-}  // end namespace
+}  // namespace
 
 std::unique_ptr<OperationPass<func::FuncOp>> createConvertMHLOQuantToIntPass(
     bool legalize_chlo) {
   return std::make_unique<ConvertMHLOQuantToInt>(legalize_chlo);
 }
 
-}  // end namespace stablehlo
-}  // end namespace mlir
+}  // namespace mlir::quant::stablehlo
diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/passes/bridge/convert_tf_quant_ops_to_mhlo.cc b/tensorflow/compiler/mlir/quantization/stablehlo/passes/bridge/convert_tf_quant_ops_to_mhlo.cc
index 28b72634516b59..a24db896d79f1d 100644
--- a/tensorflow/compiler/mlir/quantization/stablehlo/passes/bridge/convert_tf_quant_ops_to_mhlo.cc
+++ b/tensorflow/compiler/mlir/quantization/stablehlo/passes/bridge/convert_tf_quant_ops_to_mhlo.cc
@@ -58,9 +58,7 @@ limitations under the License.
 #include "tensorflow/core/util/quantization/uniform_quant_ops_attr.pb.h"
 #include "tensorflow/core/util/quantization/uniform_quant_ops_params.h"
 
-// TODO: b/289560952 - Move to mlir::quant::stablehlo namespace.
-namespace mlir {
-namespace stablehlo {
+namespace mlir::quant::stablehlo {
 namespace {
 
 using quant::tensorflow::GetDenseAttrFromTensorProtoAttr;
@@ -138,7 +136,7 @@ FailureOr<Value> CreateConstantOrConvertOp(Operation *op, Value operand,
 }
 
 xla::ConvolutionDimensionNumbers ConvertConvolutionDimensionNumbers(
-    const tensorflow::UniformQuantizedConvolutionDimensionNumbersAttr
+    const ::tensorflow::UniformQuantizedConvolutionDimensionNumbersAttr
         &dnums_input) {
   xla::ConvolutionDimensionNumbers dnums;
   dnums.set_input_batch_dimension(dnums_input.input_batch_dimension());
@@ -207,11 +205,11 @@ FailureOr<ElementsAttr> ConvertPaddingAttr(
       const int64_t stride =
           op.getWindowStridesAttr()[i].template cast<IntegerAttr>().getInt();
       const int64_t lhs_size_dilated =
-          tensorflow::UniformQuantizedConvolutionParams::DilatedSize(
+          ::tensorflow::UniformQuantizedConvolutionParams::DilatedSize(
               lhs_shape.getDimSize(dnums.input_spatial_dimensions(i)),
               op.getLhsDilationAttr()[i].template cast<IntegerAttr>().getInt());
       const int64_t rhs_size_dilated =
-          tensorflow::UniformQuantizedConvolutionParams::DilatedSize(
+          ::tensorflow::UniformQuantizedConvolutionParams::DilatedSize(
               rhs_shape.getDimSize(dnums.kernel_spatial_dimensions(i)),
               op.getRhsDilationAttr()[i].template cast<IntegerAttr>().getInt());
 
@@ -238,7 +236,7 @@ FailureOr<SmallVector<NamedAttribute>> ConvertToMhloConvolutionOpAttrs(
     UniformQuantizedConvolutionOp op, PatternRewriter &rewriter) {
   // TODO(b/261005147): Update the lowering logic after migration to mhlo
   // ConvolutionDimensionNumbers.
-  tensorflow::UniformQuantizedConvolutionDimensionNumbersAttr dnums_input;
+  ::tensorflow::UniformQuantizedConvolutionDimensionNumbersAttr dnums_input;
   if (!dnums_input.ParseFromString(std::string(op.getDimensionNumbers()))) {
     return op->emitError("Parse dimension_numbers failed.");
   }
@@ -766,5 +764,4 @@ CreateConvertTFQuantOpsToMHLOPass() {
   return std::make_unique<ConvertTFQuantOpsToMHLO>();
 }
 
-}  // namespace stablehlo
-}  // namespace mlir
+}  // namespace mlir::quant::stablehlo
diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/passes/bridge/convert_tf_quant_to_mhlo_int_test.cc b/tensorflow/compiler/mlir/quantization/stablehlo/passes/bridge/convert_tf_quant_to_mhlo_int_test.cc
index e78ebbfb0ef961..5e80a9ce8d894e 100644
--- a/tensorflow/compiler/mlir/quantization/stablehlo/passes/bridge/convert_tf_quant_to_mhlo_int_test.cc
+++ b/tensorflow/compiler/mlir/quantization/stablehlo/passes/bridge/convert_tf_quant_to_mhlo_int_test.cc
@@ -35,8 +35,7 @@ limitations under the License.
 #include "xla/statusor.h"
 #include "xla/tests/literal_test_util.h"
 
-namespace mlir {
-namespace stablehlo {
+namespace mlir::quant::stablehlo {
 namespace {
 
 class ConvertTfQuantToMhloIntTest : public ::testing::Test {
@@ -194,5 +193,4 @@ func.func @main(%input: tensor<1x2xf32>, %filter: tensor<2x3xf32>) -> tensor<1x3
 }
 
 }  // namespace
-}  // namespace stablehlo
-}  // end namespace mlir
+}  // namespace mlir::quant::stablehlo
diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/passes/bridge/convert_tf_quant_types.cc b/tensorflow/compiler/mlir/quantization/stablehlo/passes/bridge/convert_tf_quant_types.cc
index a60df89d58ed59..65192fc1117673 100644
--- a/tensorflow/compiler/mlir/quantization/stablehlo/passes/bridge/convert_tf_quant_types.cc
+++ b/tensorflow/compiler/mlir/quantization/stablehlo/passes/bridge/convert_tf_quant_types.cc
@@ -43,8 +43,7 @@ limitations under the License.
 #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h"
 #include "tensorflow/core/lib/monitoring/counter.h"
 
-namespace mlir {
-namespace stablehlo {
+namespace mlir::quant::stablehlo {
 namespace {
 
 using quant::tensorflow::GetDenseAttrFromTensorProtoAttr;
@@ -56,7 +55,7 @@ using quant::tensorflow::IsTFUniformQuantizedOp;
 #include "tensorflow/compiler/mlir/quantization/stablehlo/passes/bridge/passes.h.inc"
 
 // TODO: b/290366702 - Temporarily added metrics for debugging.
-auto *mlir_tf_quant_op_count = tensorflow::monitoring::Counter<1>::New(
+auto *mlir_tf_quant_op_count = ::tensorflow::monitoring::Counter<1>::New(
     "/tensorflow/core/tf2xla/tf_quant_op_count" /*metric_name*/,
     "Counts the number of ops that has qint types" /*metric description*/,
     "op_name" /*metric label*/);
@@ -326,5 +325,4 @@ std::unique_ptr<OperationPass<func::FuncOp>> CreateConvertTFQuantTypesPass() {
   return std::make_unique<ConvertTFQuantTypes>();
 }
 
-}  // namespace stablehlo
-}  // namespace mlir
+}  // namespace mlir::quant::stablehlo
diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/passes/bridge/convert_tf_quant_types_test.cc b/tensorflow/compiler/mlir/quantization/stablehlo/passes/bridge/convert_tf_quant_types_test.cc
index cc46304650b267..856bbd49930341 100644
--- a/tensorflow/compiler/mlir/quantization/stablehlo/passes/bridge/convert_tf_quant_types_test.cc
+++ b/tensorflow/compiler/mlir/quantization/stablehlo/passes/bridge/convert_tf_quant_types_test.cc
@@ -30,8 +30,7 @@ limitations under the License.
 #include "tensorflow/core/lib/monitoring/cell_reader.h"
 #include "tsl/platform/statusor.h"
 
-namespace mlir {
-namespace stablehlo {
+namespace mlir::quant::stablehlo {
 namespace {
 
 using ::mlir::DialectRegistry;
@@ -54,7 +53,7 @@ class LegalizeTfTypesTest : public ::testing::Test {
 
     pm_ = std::make_unique<mlir::PassManager>(&context_);
     pm_->addNestedPass<mlir::func::FuncOp>(
-        mlir::stablehlo::CreateConvertTFQuantTypesPass());
+        quant::stablehlo::CreateConvertTFQuantTypesPass());
   }
   mlir::LogicalResult Run() { return pm_->run(module_.get()); }
 
@@ -105,5 +104,4 @@ TEST_F(LegalizeTfTypesTest, RecordsStreamzNoQuantOps) {
 }
 
 }  // namespace
-}  // namespace stablehlo
-}  // namespace mlir
+}  // namespace mlir::quant::stablehlo
diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/passes/bridge/passes.cc b/tensorflow/compiler/mlir/quantization/stablehlo/passes/bridge/passes.cc
index 1aae78d6340645..aab45eae763e33 100644
--- a/tensorflow/compiler/mlir/quantization/stablehlo/passes/bridge/passes.cc
+++ b/tensorflow/compiler/mlir/quantization/stablehlo/passes/bridge/passes.cc
@@ -21,7 +21,7 @@ limitations under the License.
 #include "mlir/Transforms/Passes.h"  // from @llvm-project
 #include "xla/mlir_hlo/mhlo/transforms/passes.h"
 
-namespace mlir::stablehlo {
+namespace mlir::quant::stablehlo {
 
 void AddQuantizationLoweringPasses(mlir::OpPassManager& pm) {
   // These passes are grouped together and must run in this specific order.
@@ -35,4 +35,4 @@ void AddQuantizationLoweringPasses(mlir::OpPassManager& pm) {
   pm.addNestedPass<mlir::func::FuncOp>(CreateVerifyQuantLegalizationPass());
 }
 
-}  // namespace mlir::stablehlo
+}  // namespace mlir::quant::stablehlo
diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/passes/bridge/passes.h b/tensorflow/compiler/mlir/quantization/stablehlo/passes/bridge/passes.h
index d03ef5837c670c..983339d813c7a0 100644
--- a/tensorflow/compiler/mlir/quantization/stablehlo/passes/bridge/passes.h
+++ b/tensorflow/compiler/mlir/quantization/stablehlo/passes/bridge/passes.h
@@ -22,7 +22,7 @@ limitations under the License.
 #include "mlir/Dialect/Func/IR/FuncOps.h"  // from @llvm-project
 #include "mlir/Pass/Pass.h"  // from @llvm-project
 
-namespace mlir::stablehlo {
+namespace mlir::quant::stablehlo {
 
 // Legalizes from MHLO quantized ops with MHLO quant types to MHLO primitive ops
 // like int ops.
@@ -58,6 +58,6 @@ void AddQuantizationLoweringPasses(mlir::OpPassManager &pm);
 #define GEN_PASS_DECL_CONVERTTFQUANTTYPES
 #define GEN_PASS_DECL_VERIFYQUANTLEGALIZATION
 #include "tensorflow/compiler/mlir/quantization/stablehlo/passes/bridge/passes.h.inc"
-}  // namespace mlir::stablehlo
+}  // namespace mlir::quant::stablehlo
 
 #endif  // TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_PASSES_BRIDGE_PASSES_H_
diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/passes/bridge/passes.td b/tensorflow/compiler/mlir/quantization/stablehlo/passes/bridge/passes.td
index cf0cfddd982979..af616129ccf785 100644
--- a/tensorflow/compiler/mlir/quantization/stablehlo/passes/bridge/passes.td
+++ b/tensorflow/compiler/mlir/quantization/stablehlo/passes/bridge/passes.td
@@ -29,7 +29,7 @@ def ConvertMHLOQuantToInt : Pass<"convert-mhlo-quant-to-int", "mlir::func::FuncO
         "Legalizes intermediate chlo ops to hlo">
     ];
 
-  let constructor = "mlir::stablehlo::createConvertMHLOQuantToIntPass()";
+  let constructor = "mlir::quant::stablehlo::createConvertMHLOQuantToIntPass()";
   let dependentDialects = ["chlo::ChloDialect", "mhlo::MhloDialect",
                            "quant::QuantizationDialect",
                            "shape::ShapeDialect",
@@ -43,7 +43,7 @@ def ConvertTFQuantOpsToMHLO : Pass<"quant-convert-tf-quant-ops-to-mhlo", "mlir::
     Convert TF Quant ops to MHLO quant ops.
   }];
 
-  let constructor = "mlir::stablehlo::CreateConvertTFQuantOpsToMHLOPass()";
+  let constructor = "mlir::quant::stablehlo::CreateConvertTFQuantOpsToMHLOPass()";
   let dependentDialects = ["TF::TensorFlowDialect", "chlo::ChloDialect",
                            "mhlo::MhloDialect", "tf_type::TFTypeDialect",
                            "quant::QuantizationDialect"];
@@ -58,7 +58,7 @@ def ConvertTFQuantTypes : Pass<"convert-tf-quant-types", "mlir::func::FuncOp"> {
     tf.Cast around the ops so that they are still valid.
   }];
 
-  let constructor = "mlir::stablehlo::CreateConvertTFQuantTypesPass()";
+  let constructor = "mlir::quant::stablehlo::CreateConvertTFQuantTypesPass()";
   let dependentDialects = ["TF::TensorFlowDialect", "tf_type::TFTypeDialect"];
 }
 
@@ -70,7 +70,7 @@ def VerifyQuantLegalization : Pass<"verify-quant-legalization", "mlir::func::Fun
     and reports an error about which op failed to legalize. This pass
     does not transform any ops and is checking.}];
 
-  let constructor = "mlir::stablehlo::CreateVerifyQuantLegalizationPass()";
+  let constructor = "mlir::quant::stablehlo::CreateVerifyQuantLegalizationPass()";
   let dependentDialects =  ["tf_type::TFTypeDialect",
                             "quant::QuantizationDialect"];
 
diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/passes/bridge/verify_quant_legalization.cc b/tensorflow/compiler/mlir/quantization/stablehlo/passes/bridge/verify_quant_legalization.cc
index c8833ebd8d37d9..361d98c7775abe 100644
--- a/tensorflow/compiler/mlir/quantization/stablehlo/passes/bridge/verify_quant_legalization.cc
+++ b/tensorflow/compiler/mlir/quantization/stablehlo/passes/bridge/verify_quant_legalization.cc
@@ -36,8 +36,7 @@ limitations under the License.
 #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h"
 #include "xla/mlir_hlo/mhlo/IR/hlo_ops.h"
 
-namespace mlir {
-namespace stablehlo {
+namespace mlir::quant::stablehlo {
 namespace {
 
 using quant::tensorflow::IsTFQintType;
@@ -90,5 +89,4 @@ CreateVerifyQuantLegalizationPass() {
   return std::make_unique<VerifyQuantLegalization>();
 }
 
-}  // namespace stablehlo
-}  // namespace mlir
+}  // namespace mlir::quant::stablehlo
diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/tools/stablehlo_quant_opt.cc b/tensorflow/compiler/mlir/quantization/stablehlo/tools/stablehlo_quant_opt.cc
index c72a75d1acbc22..d05bdb48643458 100644
--- a/tensorflow/compiler/mlir/quantization/stablehlo/tools/stablehlo_quant_opt.cc
+++ b/tensorflow/compiler/mlir/quantization/stablehlo/tools/stablehlo_quant_opt.cc
@@ -34,7 +34,7 @@ int main(int argc, char **argv) {
 
   mlir::registerAllPasses();
   mlir::registerTensorFlowPasses();
-  mlir::stablehlo::registerBridgePasses();
+  mlir::quant::stablehlo::registerBridgePasses();
 
   mlir::DialectRegistry registry;
   registry.insert<mlir::scf::SCFDialect, mlir::TF::TensorFlowDialect,
diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/quantize_passes.cc b/tensorflow/compiler/mlir/quantization/tensorflow/quantize_passes.cc
index 5a6adf4359ba3d..7d988f497474b3 100644
--- a/tensorflow/compiler/mlir/quantization/tensorflow/quantize_passes.cc
+++ b/tensorflow/compiler/mlir/quantization/tensorflow/quantize_passes.cc
@@ -40,7 +40,8 @@ void AddStablehloQuantToIntPasses(mlir::PassManager &pm) {
   // StableHLO -> MHLO legalization.
   pm.addPass(mlir::mhlo::createStablehloLegalizeToHloPass());
   pm.addNestedPass<mlir::func::FuncOp>(
-      mlir::stablehlo::createConvertMHLOQuantToIntPass(/*legalize_chlo=*/true));
+      mlir::quant::stablehlo::createConvertMHLOQuantToIntPass(
+          /*legalize_chlo=*/true));
   pm.addNestedPass<mlir::func::FuncOp>(mlir::createCanonicalizerPass());
   pm.addPass(mlir::createSymbolDCEPass());
   // MHLO -> StableHLO legalization.
diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/quantize_preprocess.cc b/tensorflow/compiler/mlir/quantization/tensorflow/quantize_preprocess.cc
index 3954d90b48820f..c766a3c87ca75e 100644
--- a/tensorflow/compiler/mlir/quantization/tensorflow/quantize_preprocess.cc
+++ b/tensorflow/compiler/mlir/quantization/tensorflow/quantize_preprocess.cc
@@ -92,7 +92,7 @@ void AddTFToStablehloPasses(mlir::PassManager& pm) {
   // Legalizes TF UniformQuantized types into MHLO. Part of the official
   // TF/XLA bridge component.
   pm.addNestedPass<mlir::func::FuncOp>(
-      mlir::stablehlo::CreateConvertTFQuantOpsToMHLOPass());
+      mlir::quant::stablehlo::CreateConvertTFQuantOpsToMHLOPass());
   pm.addPass(mlir::createCanonicalizerPass());
 
   // TF -> StableHLO legalization.
diff --git a/tensorflow/compiler/mlir/tf2xla/api/v1/compile_mlir_util.cc b/tensorflow/compiler/mlir/tf2xla/api/v1/compile_mlir_util.cc
index 24dbf9c9d23a48..0c1c3b95b5f8d3 100644
--- a/tensorflow/compiler/mlir/tf2xla/api/v1/compile_mlir_util.cc
+++ b/tensorflow/compiler/mlir/tf2xla/api/v1/compile_mlir_util.cc
@@ -340,7 +340,7 @@ void AddLegalizationPasses(mlir::OpPassManager& pm, bool legalize_chlo,
                            bool lower_to_xla_hlo) {
   if (lower_to_xla_hlo) {
     // Lower TF quant ops and types to MHLO int.
-    mlir::stablehlo::AddQuantizationLoweringPasses(pm);
+    mlir::quant::stablehlo::AddQuantizationLoweringPasses(pm);
 
     pm.addPass(mlir::mhlo::createLegalizeTFPass(
         legalize_chlo,
@@ -449,7 +449,7 @@ void CreateConvertMlirToXlaHloPipeline(
 
   pm.addNestedPass<mlir::func::FuncOp>(mlir::TF::CreateLowerQuantizedPass());
   pm.addNestedPass<mlir::func::FuncOp>(
-      mlir::stablehlo::CreateConvertTFQuantTypesPass());
+      mlir::quant::stablehlo::CreateConvertTFQuantTypesPass());
 
   if (lower_to_xla_hlo) {
     for (auto& target_pass : custom_legalization_passes) {
diff --git a/tensorflow/compiler/mlir/tf_mlir_opt_main.cc b/tensorflow/compiler/mlir/tf_mlir_opt_main.cc
index ed8106175e8ae0..433a05f396bf2f 100644
--- a/tensorflow/compiler/mlir/tf_mlir_opt_main.cc
+++ b/tensorflow/compiler/mlir/tf_mlir_opt_main.cc
@@ -53,7 +53,7 @@ int main(int argc, char **argv) {
   // These are in compiler/mlir/tf2xla and not part of the above MHLO passes.
   mlir::mhlo::registerLegalizeTfPasses();
   mlir::mhlo::registerTfXlaPasses();
-  mlir::stablehlo::registerBridgePasses();
+  mlir::quant::stablehlo::registerBridgePasses();
   mlir::tosa::registerLegalizeTosaPasses();
   mlir::tosa::registerTFtoTOSALegalizationPipeline();
   mlir::tosa::registerTFLtoTOSALegalizationPipeline();

From dd689216576d6af7fb6e49453909458e5dde519f Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 20 Sep 2023 22:55:09 -0700
Subject: [PATCH 076/567] Internal Code Change

PiperOrigin-RevId: 567191671
---
 tensorflow/core/common_runtime/eager/BUILD | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/tensorflow/core/common_runtime/eager/BUILD b/tensorflow/core/common_runtime/eager/BUILD
index ef3e32c49a6529..b3afde2fda2770 100644
--- a/tensorflow/core/common_runtime/eager/BUILD
+++ b/tensorflow/core/common_runtime/eager/BUILD
@@ -17,10 +17,7 @@ load(
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
-    default_visibility = [
-        "//tensorflow:internal",
-        "//tensorflow_models:__subpackages__",
-    ],
+    default_visibility = ["//tensorflow:internal"],
     licenses = ["notice"],
 )
 

From 6da396b975ccf5385564cf13d13795072d0411c4 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 21 Sep 2023 02:02:01 -0700
Subject: [PATCH 077/567] compat: Update forward compatibility horizon to
 2023-09-21

PiperOrigin-RevId: 567232445
---
 tensorflow/python/compat/compat.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/python/compat/compat.py b/tensorflow/python/compat/compat.py
index 0be4fb3a98e8b1..6e494480e54ba7 100644
--- a/tensorflow/python/compat/compat.py
+++ b/tensorflow/python/compat/compat.py
@@ -29,7 +29,7 @@
 # This value changes every day with an automatic CL. It can be modified in code
 # via `forward_compatibility_horizon()` or with the environment variable
 # TF_FORWARD_COMPATIBILITY_DELTA_DAYS, which is added to the compatibility date.
-_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2023, 9, 20)
+_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2023, 9, 21)
 _FORWARD_COMPATIBILITY_DELTA_DAYS_VAR_NAME = "TF_FORWARD_COMPATIBILITY_DELTA_DAYS"
 _FORWARD_COMPATIBILITY_DATE_NUMBER = None
 

From 7f1050a6976d11bfb0bb37bdfc82350c0a238faa Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 21 Sep 2023 02:02:19 -0700
Subject: [PATCH 078/567] Update GraphDef version to 1626.

PiperOrigin-RevId: 567232525
---
 tensorflow/core/public/version.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/core/public/version.h b/tensorflow/core/public/version.h
index 05991cc0e70419..7c76122b9da32f 100644
--- a/tensorflow/core/public/version.h
+++ b/tensorflow/core/public/version.h
@@ -108,7 +108,7 @@ limitations under the License.
 
 #define TF_GRAPH_DEF_VERSION_MIN_PRODUCER 0
 #define TF_GRAPH_DEF_VERSION_MIN_CONSUMER 0
-#define TF_GRAPH_DEF_VERSION 1625  // Updated: 2023/9/20
+#define TF_GRAPH_DEF_VERSION 1626  // Updated: 2023/9/21
 
 // Checkpoint compatibility versions (the versions field in SavedSliceMeta).
 //

From 0c7098d3552a12037d3fede9784952de732675c2 Mon Sep 17 00:00:00 2001
From: George Necula <necula@google.com>
Date: Thu, 21 Sep 2023 05:02:22 -0700
Subject: [PATCH 079/567] Improve shape refinement to not require inlining.

PiperOrigin-RevId: 567273141
---
 .../compiler/tests/xla_call_module_test.py    |   82 ++
 third_party/stablehlo/temporary.patch         | 1180 ++++++++++++++++-
 .../xla/third_party/stablehlo/temporary.patch | 1180 ++++++++++++++++-
 .../xla/python/refine_polymorphic_shapes.cc   |    4 -
 4 files changed, 2410 insertions(+), 36 deletions(-)

diff --git a/tensorflow/compiler/tests/xla_call_module_test.py b/tensorflow/compiler/tests/xla_call_module_test.py
index a24930a7b8c846..ee651b107c5b8e 100644
--- a/tensorflow/compiler/tests/xla_call_module_test.py
+++ b/tensorflow/compiler/tests/xla_call_module_test.py
@@ -259,6 +259,88 @@ def f(x):  # x: f32[2, b]
 
     self._assertOpOutputMatchesExpected(f, (x,), (np.sin(x),))
 
+  def test_poly_with_inner_token(self):
+    # The inner functions pass tokens through
+    x = np.arange(12, dtype=np.float32).reshape((3, 4))
+
+    def f(x):  # x : f32[b0, b1]
+      # 1 + sin(x)
+      module, version = serialize("""
+module @jit_f.0 attributes {jax.uses_shape_polymorphism = true} {
+  func.func public @main(%arg0: tensor<?x?xf32>) -> tensor<?x?xf32> {
+    %0 = stablehlo.get_dimension_size %arg0, dim = 0 : (tensor<?x?xf32>) -> tensor<i32>
+    %1 = stablehlo.get_dimension_size %arg0, dim = 1 : (tensor<?x?xf32>) -> tensor<i32>
+    %2 = stablehlo.constant dense<> : tensor<0xi1>
+    %3:2 = call @_wrapped_main(%0, %1, %2, %arg0) : (tensor<i32>, tensor<i32>, tensor<0xi1>, tensor<?x?xf32>) -> (tensor<0xi1>, tensor<?x?xf32>)
+    return %3#1 : tensor<?x?xf32>
+  }
+
+  func.func private @_wrapped_main(%arg0: tensor<i32>, %arg1: tensor<i32>, %arg2: tensor<0xi1> {jax.token = true}, %arg3: tensor<?x?xf32>) -> (tensor<0xi1> {jax.token = true}, tensor<?x?xf32>) {
+    %0 = stablehlo.create_token : !stablehlo.token
+    %1 = stablehlo.sine %arg3 : tensor<?x?xf32>
+    %2 = stablehlo.constant dense<1.000000e+00> : tensor<f32>
+    %3 = stablehlo.reshape %arg0 : (tensor<i32>) -> tensor<1xi32>
+    %4 = stablehlo.reshape %arg1 : (tensor<i32>) -> tensor<1xi32>
+    %5 = stablehlo.concatenate %3, %4, dim = 0 : (tensor<1xi32>, tensor<1xi32>) -> tensor<2xi32>
+    %6 = stablehlo.dynamic_broadcast_in_dim %2, %5, dims = [] : (tensor<f32>, tensor<2xi32>) -> tensor<?x?xf32>
+    %7 = stablehlo.add %1, %6 : tensor<?x?xf32>
+    %8 = stablehlo.constant dense<> : tensor<0xi1>
+    return %8, %7 : tensor<0xi1>, tensor<?x?xf32>
+  }
+}
+""")
+      return xla.call_module(
+          [x],
+          version=version,
+          module=module,
+          Tout=[x.dtype],
+          Sout=[x.shape],
+          has_token_input_output=False,
+          platforms=[self.testing_platform()],
+      )
+
+    self._assertOpOutputMatchesExpected(f, (x,), (1. + np.sin(x),))
+
+  def test_poly_with_inner_prefix_token(self):
+    # Sometimes inner functions take a token as first argument
+    x = np.arange(12, dtype=np.float32).reshape((3, 4))
+
+    def f(x):  # x : f32[b0, b1]
+      # 1 + sin(x)
+      module, version = serialize("""
+module @jit_f.0 attributes {jax.uses_shape_polymorphism = true} {
+  func.func public @main(%arg0: tensor<?x?xf32>) -> tensor<?x?xf32> {
+    %0 = stablehlo.get_dimension_size %arg0, dim = 0 : (tensor<?x?xf32>) -> tensor<i32>
+    %1 = stablehlo.get_dimension_size %arg0, dim = 1 : (tensor<?x?xf32>) -> tensor<i32>
+    %2 = stablehlo.create_token : !stablehlo.token
+    %3:2 = call @_wrapped_main(%2, %0, %1, %arg0) : (!stablehlo.token, tensor<i32>, tensor<i32>, tensor<?x?xf32>) -> (!stablehlo.token, tensor<?x?xf32>)
+    return %3#1 : tensor<?x?xf32>
+  }
+
+  func.func private @_wrapped_main(%arg_token: !stablehlo.token, %arg0: tensor<i32>, %arg1: tensor<i32>, %arg3: tensor<?x?xf32>) -> (!stablehlo.token, tensor<?x?xf32>) {
+    %1 = stablehlo.sine %arg3 : tensor<?x?xf32>
+    %2 = stablehlo.constant dense<1.000000e+00> : tensor<f32>
+    %3 = stablehlo.reshape %arg0 : (tensor<i32>) -> tensor<1xi32>
+    %4 = stablehlo.reshape %arg1 : (tensor<i32>) -> tensor<1xi32>
+    %5 = stablehlo.concatenate %3, %4, dim = 0 : (tensor<1xi32>, tensor<1xi32>) -> tensor<2xi32>
+    %6 = stablehlo.dynamic_broadcast_in_dim %2, %5, dims = [] : (tensor<f32>, tensor<2xi32>) -> tensor<?x?xf32>
+    %7 = stablehlo.add %1, %6 : tensor<?x?xf32>
+    return %arg_token, %7 : !stablehlo.token, tensor<?x?xf32>
+  }
+}
+""")
+      return xla.call_module(
+          [x],
+          version=version,
+          module=module,
+          Tout=[x.dtype],
+          Sout=[x.shape],
+          has_token_input_output=False,
+          platforms=[self.testing_platform()],
+      )
+
+    self._assertOpOutputMatchesExpected(f, (x,), (1. + np.sin(x),))
+
   def test_wrong_actual_args_errors(self):
     x = np.arange(6, dtype=np.float32).reshape((3, 2))
     y = np.arange(6, dtype=np.int32).reshape((2, 3))
diff --git a/third_party/stablehlo/temporary.patch b/third_party/stablehlo/temporary.patch
index 4c4228163a6f04..0cb078c3b89795 100644
--- a/third_party/stablehlo/temporary.patch
+++ b/third_party/stablehlo/temporary.patch
@@ -1426,7 +1426,353 @@ diff --ruN a/stablehlo/stablehlo/tests/stablehlo_canonicalize_dynamism.mlir b/st
 diff --ruN a/stablehlo/stablehlo/tests/stablehlo_refine_shapes.mlir b/stablehlo/stablehlo/tests/stablehlo_refine_shapes.mlir
 --- stablehlo/stablehlo/tests/stablehlo_refine_shapes.mlir
 +++ stablehlo/stablehlo/tests/stablehlo_refine_shapes.mlir
-@@ -607,12 +607,55 @@
+@@ -31,6 +31,7 @@
+ 
+ // -----
+ 
++// CHECK-LABEL: module @has_main
+ module @has_main {
+   // CHECK: main
+   func.func @main(%arg0: tensor<4xf32>) -> tensor<*xi32> {
+@@ -38,17 +39,11 @@
+     %0 = stablehlo.bitcast_convert %arg0 : (tensor<4xf32>) -> tensor<*xi32>
+     func.return %0 : tensor<*xi32>
+   }
+-
+-  // CHECK: helper
+-  func.func @helper(%arg0: tensor<4xf32>) -> tensor<*xi32> {
+-    // CHECK: stablehlo.bitcast_convert{{.*}} -> tensor<*xi32>
+-    %0 = stablehlo.bitcast_convert %arg0 : (tensor<4xf32>) -> tensor<*xi32>
+-    func.return %0 : tensor<*xi32>
+-  }
+-}
+-
+-// -----
+-
++}
++
++// -----
++
++// CHECK-LABEL: func @error_unsupported_operation
+ func.func @error_unsupported_operation(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>) -> index {
+   // CHECK: stablehlo.add{{.*}} -> tensor<?xf32>
+   %0 = stablehlo.add %arg0, %arg1 : (tensor<4xf32>, tensor<4xf32>) -> tensor<?xf32>
+@@ -472,11 +467,312 @@
+ 
+ // -----
+ 
+-// CHECK-LABEL: func @refine_bitcast_convert_same_bitwidth
+-func.func @refine_bitcast_convert_same_bitwidth(%arg0 : tensor<4xf32>) -> tensor<*xi32> {
++// CHECK-LABEL: func @refine_bitcast_convert_same_bitwidth_unranked_result
++func.func @refine_bitcast_convert_same_bitwidth_unranked_result(%arg0 : tensor<4xf32>) -> tensor<*xi32> {
+   // CHECK: stablehlo.bitcast_convert{{.*}} -> tensor<4xi32>
+   %0 = stablehlo.bitcast_convert %arg0 : (tensor<4xf32>) -> tensor<*xi32>
+   func.return %0 : tensor<*xi32>
++}
++
++// -----
++
++// CHECK-LABEL: func @refine_bitcast_convert_same_bitwidth
++func.func @refine_bitcast_convert_same_bitwidth() -> tensor<?x?x0xf32> {
++  %0 = stablehlo.constant dense<[3, 5, 0]> : tensor<3xi32>
++  %21 = stablehlo.dynamic_iota %0, dim = 0 : (tensor<3xi32>) -> tensor<?x?x0xui32>
++  // CHECK: stablehlo.bitcast_convert{{.*}} -> tensor<3x5x0xf32>
++  %48 = stablehlo.bitcast_convert %21 : (tensor<?x?x0xui32>) -> tensor<?x?x0xf32>
++  return %48 : tensor<?x?x0xf32>
++}
++
++// -----
++
++// CHECK-LABEL: module @refine_call
++module @refine_call {
++  func.func @main(%arg1: tensor<4xf32>) -> tensor<?xf32> {
++    %0 = stablehlo.bitcast_convert %arg1 : (tensor<4xf32>) -> tensor<?xf32>
++    %1 = stablehlo.constant dense<4> : tensor<i32>
++    // CHECK: refine_call_callee{{.*}}-> tensor<4xf32>
++    %2 = call @refine_call_callee(%1, %0) : (tensor<i32>, tensor<?xf32>) -> tensor<?xf32>
++    return %2 : tensor<?xf32>
++  }
++  // CHECK: refine_call_callee(%arg0: tensor<4xf32>) -> tensor<4xf32>
++  func.func @refine_call_callee(%arg0: tensor<i32>, %arg1: tensor<?xf32>) -> tensor<?xf32> {
++    // CHECK: stablehlo.constant dense<4>
++    %0 = stablehlo.reshape %arg0 : (tensor<i32>) -> tensor<1xi32>
++    %1 = stablehlo.dynamic_iota %0, dim = 0 : (tensor<1xi32>) -> tensor<?xf32>
++    return %1 : tensor<?xf32>
++  }
++}
++
++// -----
++
++// CHECK-LABEL: module @refine_call_dimension_arguments
++module @refine_call_dimension_arguments {
++  func.func public @main(%arg0: tensor<i32>) -> tensor<i32> {
++    // CHECK: [[RESULT:%.*]] = call @callee
++    // CHECK: return [[RESULT]]
++    %0 = stablehlo.constant dense<3> : tensor<i32>
++    %1 = call @callee(%0, %0, %arg0) : (tensor<i32>, tensor<i32>, tensor<i32>) -> tensor<i32>
++    return %1 : tensor<i32>
++  }
++  // %arg0 and %arg1 are dimension arguments
++  // CHECK: @callee([[ARG0:%.*]]: tensor<i32>) -> tensor<i32>
++  func.func private @callee(%arg0: tensor<i32>, %arg1: tensor<i32>, %arg2: tensor<i32>) -> tensor<i32> {
++    // CHECK: [[RESULT0:%.*]] = stablehlo.constant dense<6>
++    // CHECK: [[RESULT1:%.*]] = stablehlo.add [[RESULT0]], [[ARG0]]
++    // CHECK: return [[RESULT1]]
++    %0 = stablehlo.add %arg0, %arg1: tensor<i32>
++    %1 = stablehlo.add %0, %arg2: tensor<i32>
++    return %1 : tensor<i32>
++  }
++}
++
++// -----
++
++// CHECK-LABEL: module @refine_call_prefix_token_and_dimension_arguments
++module @refine_call_prefix_token_and_dimension_arguments {
++  func.func public @main(%arg0: tensor<i32>) -> tensor<i32> {
++    // CHECK: [[RESULT:%.*]] = call @callee
++    // CHECK: return [[RESULT]]
++    %0 = stablehlo.constant dense<3> : tensor<i32>
++    %token = stablehlo.create_token : !stablehlo.token
++    %1 = call @callee(%token, %0, %0, %arg0) : (!stablehlo.token, tensor<i32>, tensor<i32>, tensor<i32>) -> tensor<i32>
++    return %1 : tensor<i32>
++  }
++  // %arg0 and %arg1 are dimension arguments
++  // CHECK: @callee([[ARG_TOKEN:%.*]]: !stablehlo.token, [[ARG0:%.*]]: tensor<i32>
++  func.func private @callee(%arg_token: !stablehlo.token, %arg0: tensor<i32>, %arg1: tensor<i32>, %arg2: tensor<i32>) -> tensor<i32> {
++    // CHECK: [[RESULT0:%.*]] = stablehlo.constant dense<6>
++    // CHECK: [[RESULT1:%.*]] = stablehlo.add [[RESULT0]], [[ARG0]]
++    // CHECK: return [[RESULT1]]
++    %0 = stablehlo.add %arg0, %arg1: tensor<i32>
++    %1 = stablehlo.add %0, %arg2: tensor<i32>
++    return %1 : tensor<i32>
++  }
++}
++
++// -----
++
++// CHECK-LABEL: module @refine_call_dimension_arguments_followed_by_token
++module @refine_call_dimension_arguments_followed_by_token {
++  func.func public @main(%arg0: tensor<i32>) -> tensor<i32> {
++    // CHECK: [[RESULT:%.*]] = call @callee
++    // CHECK: return [[RESULT]]
++    %0 = stablehlo.constant dense<3> : tensor<i32>
++    %token = stablehlo.create_token : !stablehlo.token
++    %1 = call @callee(%0, %0, %token, %arg0) : (tensor<i32>, tensor<i32>, !stablehlo.token, tensor<i32>) -> tensor<i32>
++    return %1 : tensor<i32>
++  }
++  // %arg0 and %arg1 are dimension arguments
++  // CHECK: @callee([[ARG_TOKEN:%.*]]: !stablehlo.token, [[ARG0:%.*]]: tensor<i32>
++  func.func private @callee(%arg0: tensor<i32>, %arg1: tensor<i32>, %arg_token: !stablehlo.token, %arg2: tensor<i32>) -> tensor<i32> {
++    // CHECK: [[RESULT0:%.*]] = stablehlo.constant dense<6>
++    // CHECK: [[RESULT1:%.*]] = stablehlo.add [[RESULT0]], [[ARG0]]
++    // CHECK: return [[RESULT1]]
++    %0 = stablehlo.add %arg0, %arg1: tensor<i32>
++    %1 = stablehlo.add %0, %arg2: tensor<i32>
++    return %1 : tensor<i32>
++  }
++}
++
++// -----
++
++// CHECK-LABEL: module @refine_multiple_call_with_same_context
++module @refine_multiple_call_with_same_context {
++  func.func @main(%arg1: tensor<4xf32>) -> tensor<?xf32> {
++    %0 = stablehlo.bitcast_convert %arg1 : (tensor<4xf32>) -> tensor<?xf32>
++    %arg0_new = "stablehlo.get_dimension_size"(%0) {dimension = 0 : i64} : (tensor<?xf32>) -> tensor<i32>
++    // CHECK: refine_call_callee{{.*}}-> tensor<4xf32>
++    %1 = call @refine_call_callee(%arg0_new, %0) : (tensor<i32>, tensor<?xf32>) -> tensor<?xf32>
++    // CHECK: refine_call_callee{{.*}}-> tensor<4xf32>
++    %2 = call @refine_call_callee(%arg0_new, %1) : (tensor<i32>, tensor<?xf32>) -> tensor<?xf32>
++    return %2 : tensor<?xf32>
++  }
++  // CHECK: refine_call_callee{{.*}}-> tensor<4xf32>
++  func.func @refine_call_callee(%arg0: tensor<i32>, %arg1: tensor<?xf32>) -> tensor<?xf32> {
++    return %arg1 : tensor<?xf32>
++  }
++}
++
++// -----
++
++// CHECK-LABEL: module @refine_multiple_call_constant_function
++module @refine_multiple_call_constant_function {
++  func.func @main(%arg0: tensor<5xf32>) -> tensor<i32> {
++    // CHECK: [[RESULT0:%.*]] = stablehlo.constant dense<16>
++    // CHECK: return [[RESULT0]]
++    %0 = stablehlo.constant dense<4> : tensor<i32>
++    %1 = call @refine_call_callee(%0, %arg0) : (tensor<i32>, tensor<5xf32>) -> tensor<i32>
++    %2 = call @refine_call_callee(%0, %arg0) : (tensor<i32>, tensor<5xf32>) -> tensor<i32>
++    %3 = stablehlo.add %1, %2: tensor<i32>
++    return %3 : tensor<i32>
++  }
++  func.func @refine_call_callee(%arg0: tensor<i32>, %arg1: tensor<5xf32>) -> tensor<i32> {
++    // CHECK: [[RESULT1:%.*]] = stablehlo.constant dense<8>
++    // CHECK: return [[RESULT1]]
++    %0 = stablehlo.add %arg0, %arg0: tensor<i32>
++    return %0 : tensor<i32>
++  }
++}
++
++// -----
++
++module @refine_call_multiple_with_different_number_dimension_arguments {
++  func.func @main(%arg1: tensor<4xf32>) -> tensor<?xf32> {
++    %0 = stablehlo.bitcast_convert %arg1 : (tensor<4xf32>) -> tensor<?xf32>
++    %arg0_new = "stablehlo.get_dimension_size"(%0) {dimension = 0 : i64} : (tensor<?xf32>) -> tensor<i32>
++    %1 = call @refine_call_callee(%arg0_new, %0) : (tensor<i32>, tensor<?xf32>) -> tensor<?xf32>
++    // Ensure that the first argument is not a constant at the second call site
++    %arg0_different_f32 = stablehlo.bitcast_convert %arg0_new : (tensor<i32>) -> tensor<f32>
++    %arg0_different_i32 = stablehlo.bitcast_convert %arg0_different_f32 : (tensor<f32>) -> tensor<i32>
++    // expected-error@+1{{incorrect number of operands for callee}}
++    %2 = call @refine_call_callee(%arg0_different_i32, %1) : (tensor<i32>, tensor<?xf32>) -> tensor<?xf32>
++    return %2 : tensor<?xf32>
++  }
++  // expected-error@+1{{Function refine_call_callee has already been refined with a different refinement context. Previous context had 1 and now we have 2 non-dimension arguments}}
++  func.func @refine_call_callee(%arg0: tensor<i32>, %arg1: tensor<?xf32>) -> tensor<?xf32> {
++    return %arg1 : tensor<?xf32>
++  }
++}
++
++// -----
++
++module @refine_call_multiple_different_dimension_arguments {
++  func.func @main(%arg1: tensor<4xf32>) -> tensor<?xf32> {
++    %0 = stablehlo.bitcast_convert %arg1 : (tensor<4xf32>) -> tensor<?xf32>
++    %arg0_new = "stablehlo.get_dimension_size"(%0) {dimension = 0 : i64} : (tensor<?xf32>) -> tensor<i32>
++    %1 = call @refine_call_callee(%arg0_new, %0) : (tensor<i32>, tensor<?xf32>) -> tensor<?xf32>
++    %arg0_different = stablehlo.add %arg0_new, %arg0_new : tensor<i32>
++    // expected-error@+1{{incorrect number of operands for callee}}
++    %2 = call @refine_call_callee(%arg0_different, %1) : (tensor<i32>, tensor<?xf32>) -> tensor<?xf32>
++    return %2 : tensor<?xf32>
++  }
++  // expected-error@+1{{Function refine_call_callee has already been refined with a different refinement context.}}
++  func.func @refine_call_callee(%arg0: tensor<i32>, %arg1: tensor<?xf32>) -> tensor<?xf32> {
++    return %arg1 : tensor<?xf32>
++  }
++}
++
++// -----
++
++module @refine_call_multiple_different_non_dimension_arguments {
++  func.func @main(%arg1: tensor<4xf32>) -> tensor<?xf32> {
++    %0 = stablehlo.bitcast_convert %arg1 : (tensor<4xf32>) -> tensor<?xf32>
++    %arg0_new = "stablehlo.get_dimension_size"(%0) {dimension = 0 : i64} : (tensor<?xf32>) -> tensor<i32>
++    %1 = call @refine_call_callee(%arg0_new, %0) : (tensor<i32>, tensor<?xf32>) -> tensor<?xf32>
++    %2 = stablehlo.constant dense<[1., 2.]> : tensor<2xf32>
++    %3 = stablehlo.concatenate %1, %2, dim = 0 : (tensor<?xf32>, tensor<2xf32>) -> tensor<?xf32>
++    // expected-error@+1{{incorrect number of operands for callee}}
++    %4 = call @refine_call_callee(%arg0_new, %3) : (tensor<i32>, tensor<?xf32>) -> tensor<?xf32>
++    return %4 : tensor<?xf32>
++  }
++  // expected-error@+1{{Function refine_call_callee has already been refined with a different refinement context.}}
++  func.func @refine_call_callee(%arg0: tensor<i32>, %arg1: tensor<?xf32>) -> tensor<?xf32> {
++    return %arg1 : tensor<?xf32>
++  }
++}
++
++// -----
++
++module @refine_call_recursive {
++  func.func @main() -> tensor<i32> {
++    %0 = stablehlo.constant dense<3> : tensor<i32>
++    %1 = call @refine_call_callee(%0) : (tensor<i32>) -> tensor<i32>
++    return %1 : tensor<i32>
++  }
++  // expected-error@+1{{Function refine_call_callee is being refined recursively}}
++  func.func @refine_call_callee(%arg0: tensor<i32>) -> tensor<i32> {
++    // expected-error@+1{{incorrect number of operands}}
++    %0 = call @refine_call_callee(%arg0) : (tensor<i32>) -> tensor<i32>
++    return %0 : tensor<i32>
++  }
++}
++
++// -----
++
++module @refine_call_main_argument_unranked {
++  // expected-error@+1{{main must be refined with static shape arguments}}
++  func.func public @main(%arg0: tensor<*xi32>) -> tensor<*xi32> {
++    %2 = call @callee(%arg0) : (tensor<*xi32>) -> tensor<*xi32>
++    return %2 : tensor<*xi32>
++  }
++  func.func private @callee(%arg0: tensor<*xi32>) -> tensor<*xi32> {
++    return %arg0 : tensor<*xi32>
++  }
++}
++
++// -----
++
++module @refine_call_main_argument_dynamic_shape {
++  // expected-error@+1{{main must be refined with static shape arguments}}
++  func.func public @main(%arg0: tensor<?xi32>) -> tensor<?xi32> {
++    %2 = call @callee(%arg0) : (tensor<?xi32>) -> tensor<?xi32>
++    return %2 : tensor<?xi32>
++  }
++  func.func private @callee(%arg0: tensor<?xi32>) -> tensor<?xi32> {
++    return %arg0 : tensor<?xi32>
++  }
++}
++
++// -----
++
++module @refine_call_callee_argument_unranked {
++  func.func public @main(%arg0: tensor<1xi64>) -> tensor<*xi32> {
++    %1 = stablehlo.dynamic_iota %arg0, dim = 0 : (tensor<1xi64>) -> tensor<*xi32>
++    %2 = call @callee(%1) : (tensor<*xi32>) -> tensor<*xi32>
++    return %2 : tensor<*xi32>
++  }
++  // expected-error@+1{{callee must be refined with static shape arguments}}
++  func.func private @callee(%arg0: tensor<*xi32>) -> tensor<*xi32> {
++    return %arg0 : tensor<*xi32>
++  }
++}
++
++// -----
++
++module @refine_call_callee_argument_dynamic_shape {
++  func.func public @main(%arg0: tensor<1xi64>) -> tensor<?xi32> {
++    %1 = stablehlo.dynamic_iota %arg0, dim = 0 : (tensor<1xi64>) -> tensor<?xi32>
++    %2 = call @callee(%1) : (tensor<?xi32>) -> tensor<?xi32>
++    return %2 : tensor<?xi32>
++  }
++  // expected-error@+1{{callee must be refined with static shape arguments}}
++  func.func private @callee(%arg0: tensor<?xi32>) -> tensor<?xi32> {
++    return %arg0 : tensor<?xi32>
++  }
++}
++
++// -----
++
++// CHECK-LABEL: module @refine_call_dimension_argument_non_scalar
++// The non-scalar constant is not folded into the callee
++module @refine_call_dimension_argument_non_scalar {
++  func.func public @main() -> tensor<4xi32> {
++    // CHECK: dense<[1, 2, 3, 4]> : tensor<4xi32>
++    %0 = stablehlo.constant dense<[1, 2, 3, 4]> : tensor<4xi32>
++    %1 = call @callee(%0) : (tensor<4xi32>) -> tensor<4xi32>
++    return %1 : tensor<4xi32>
++  }
++  func.func private @callee(%arg0: tensor<4xi32>) -> tensor<4xi32> {
++    // CHECK: return %arg0 : tensor<4xi32>
++    return %arg0 : tensor<4xi32>
++  }
++}
++
++// -----
++
++// CHECK-LABEL: module @refine_call_dimension_argument_not_integer
++module @refine_call_dimension_argument_not_integer {
++  func.func public @main() -> tensor<f32> {
++    %0 = stablehlo.constant dense<3.> : tensor<f32>
++    // CHECK: call @callee({{.*}}) : (tensor<f32>) -> tensor<f32>
++    %2 = call @callee(%0) : (tensor<f32>) -> tensor<f32>
++    return %2 : tensor<f32>
++  }
++  func.func private @callee(%arg0: tensor<f32>) -> tensor<f32> {
++    return %arg0 : tensor<f32>
++  }
+ }
+ 
+ // -----
+@@ -607,12 +903,55 @@
  
  // -----
  
@@ -1631,7 +1977,74 @@ diff --ruN a/stablehlo/stablehlo/transforms/StablehloCanonicalizeDynamism.cpp b/
 diff --ruN a/stablehlo/stablehlo/transforms/StablehloRefineShapes.cpp b/stablehlo/stablehlo/transforms/StablehloRefineShapes.cpp
 --- stablehlo/stablehlo/transforms/StablehloRefineShapes.cpp
 +++ stablehlo/stablehlo/transforms/StablehloRefineShapes.cpp
-@@ -43,6 +43,7 @@
+@@ -11,9 +11,48 @@
+ See the License for the specific language governing permissions and
+ limitations under the License.
+ ==============================================================================*/
+-
++/*
++This shape refinement pass was designed to resolve the dynamic shapes in
++a StableHLO module produced by JAX serialization with shape polymorphism.
++Such a module has the following properties:
++
++  * it contains a "main" function with statically-shaped arguments;
++    the result types may be dynamically shaped.
++  * all the dynamic shapes depend only on the input shapes (no shape
++    dependency on the input array contents). We refer to the operations that
++    depend transitively only on the input shapes (e.g., as given by
++    `stablehlo.get_dimension_size`) as `dimension` operations.
++    All dimension values can be resolved to constants through inter-procedural
++    constant folding.
++  * intermediate functions may take a number of token arguments (of type
++    !stablehlo.token) at the start of the argument list, followed by some
++    dimension arguments (integer scalars).
++  * some intermediate functions may return dimension values.
++    E.g., the `floordiv` operation on dimension values may be implemented
++    using intermediate functions. These constant functions need to be
++    constant-folded.
++  * all the dynamic shapes can be resolved through shape inference from the
++    dimension values. The dimension values themselves do not depend on the
++    result of shape inference.
++
++
++For each intermediate function we compute a refinement context, including
++the values of the dimension arguments and the static shapes of the other
++arguments. We compute the refinement context when we encounter a function call,
++and then we refine the callee recursively. We abort in the presence of
++recursive calls.
++We also abort if a function is called with multiple distinct refinement
++contexts.
++
++After refinement, all operations should have static shapes, all calls to
++constant functions are replaced with constants, and all dimension arguments
++for intermediate functions are dropped and are replaced with constants.
++*/
++#include <algorithm>
+ #include <cstdint>
+ #include <memory>
++#include <optional>
++#include <set>
+ #include <string>
+ #include <utility>
+ 
+@@ -24,8 +63,10 @@
+ #include "llvm/ADT/SmallSet.h"
+ #include "llvm/ADT/SmallVector.h"
+ #include "llvm/ADT/StringRef.h"
++#include "llvm/Support/Debug.h"
+ #include "llvm/Support/ErrorHandling.h"
+ #include "llvm/Support/FormatVariadic.h"
++#include "llvm/Support/ScopedPrinter.h"
+ #include "mlir/Dialect/Func/IR/FuncOps.h"
+ #include "mlir/IR/BuiltinAttributes.h"
+ #include "mlir/IR/BuiltinOps.h"
+@@ -39,10 +80,13 @@
+ #include "mlir/IR/Types.h"
+ #include "mlir/IR/Value.h"
+ #include "mlir/Interfaces/InferTypeOpInterface.h"
++#include "mlir/Support/DebugStringHelper.h"
+ #include "mlir/Support/LogicalResult.h"
++#include "mlir/Support/LLVM.h"
  #include "mlir/Transforms/GreedyPatternRewriteDriver.h"
  #include "stablehlo/dialect/Base.h"
  #include "stablehlo/dialect/ChloOps.h"
@@ -1639,7 +2052,407 @@ diff --ruN a/stablehlo/stablehlo/transforms/StablehloRefineShapes.cpp b/stablehl
  #include "stablehlo/dialect/StablehloOps.h"
  #include "stablehlo/dialect/TypeInference.h"
  #include "stablehlo/transforms/Passes.h"
-@@ -844,12 +845,97 @@
+@@ -50,10 +94,144 @@
+ namespace mlir {
+ namespace stablehlo {
+ 
++#define DEBUG_TYPE "stablehlo-refine-shapes"
++
+ #define GEN_PASS_DEF_STABLEHLOREFINESHAPESPASS
+ #include "stablehlo/transforms/Passes.h.inc"
+ 
+ namespace {
++
++// Per-module state for shape refinement.
++class RefineShapeState {
++ public:
++  // Validates that we are not attempting to refine a function with a different
++  // context than previously, and are not attempting recursive refinement.
++  // Returns failure() if validation fails. On success, returns a boolean
++  // that specifies whether the function has already been refined.
++  FailureOr<bool> validateFunctionRefinement(
++      func::FuncOp func, SmallVector<APSInt> dimensionArguments,
++      SmallVector<Type> nonDimensionArgumentTypes) {
++    StringRef funcName = func.getName();
++    auto found = refinementContexts.find(func);
++    if (found == refinementContexts.end()) {
++      return false;  // not already refined.
++    }
++    auto prevDimensionArguments = std::get<0>(found->second);
++    auto prevNonDimensionArgumentTypes = std::get<1>(found->second);
++    // Since we refine until fixed point, we will refine a call to a function
++    // both for the original function and for the refined one. In the latter
++    // case, we should have empty dimensionArguments but the same
++    // nonDimensionArgumentTypes.
++    if (prevNonDimensionArgumentTypes != nonDimensionArgumentTypes ||
++        (!dimensionArguments.empty() &&
++         prevDimensionArguments != dimensionArguments)) {
++      emitDifferentRefinementContextError(
++          func, /*dimensionArguments=*/dimensionArguments,
++          /*nonDimensionArgumentTypes=*/nonDimensionArgumentTypes,
++          /*prevDimensionArguments=*/prevDimensionArguments,
++          /*prevNonDimensionArgumentShapes=*/prevNonDimensionArgumentTypes);
++      return failure();
++    }
++    for (auto funcOnStack : functionsBeingRefined) {
++      if (funcOnStack == funcName) {
++        func.emitOpError() << "Function " << funcName
++                           << " is being refined recursively\n";
++        return failure();
++      }
++    }
++    return true;  // already refined.
++  }
++
++  // Updates the state to signal the starting of a function refinement.
++  // Callers must call `finishFunctionRefinement` when done.
++  void startFunctionRefinement(func::FuncOp func,
++                               SmallVector<APSInt> dimensionArguments,
++                               SmallVector<Type> nonDimensionArgumentTypes) {
++    StringRef funcName = func.getName();
++    functionsBeingRefined.push_back(funcName);
++    refinementContexts[func] =
++        std::make_tuple(dimensionArguments, nonDimensionArgumentTypes);
++  }
++
++  // Updates the state to signal the end of a function refinement.
++  LogicalResult finishFunctionRefinement(func::FuncOp func) {
++    if (func.getName() !=
++        functionsBeingRefined[functionsBeingRefined.size() - 1]) {
++      func.emitOpError() << "Expected to find " << func.getName()
++                         << " at the top of the stack";
++      return failure();
++    }
++    functionsBeingRefined.pop_back();
++    return success();
++  }
++
++ private:
++  // Maps refined functions to the refinement context: the values of dimension
++  // arguments and the types of non-dimension arguments. A function is added
++  // here when we start refining it.
++  DenseMap<func::FuncOp, std::tuple<SmallVector<APSInt>, SmallVector<Type>>>
++      refinementContexts;
++
++  // A stack of functions that are in the process of being refined, the current
++  // one is last.
++  SmallVector<llvm::StringRef> functionsBeingRefined;
++
++  void emitDifferentRefinementContextError(
++      func::FuncOp func, SmallVector<APSInt> dimensionArguments,
++      SmallVector<Type> nonDimensionArgumentTypes,
++      SmallVector<APSInt> prevDimensionArguments,
++      SmallVector<Type> prevNonDimensionArgumentShapes) {
++    InFlightDiagnostic msg = func.emitOpError();
++    msg << "Function " << func.getName()
++        << " has already been refined with a different "
++           "refinement context. ";
++    int countShowNonDimensionArguments =
++        std::min(prevNonDimensionArgumentShapes.size(),
++                 nonDimensionArgumentTypes.size());
++    if (prevNonDimensionArgumentShapes.size() !=
++        nonDimensionArgumentTypes.size()) {
++      msg << "Previous context had " << prevNonDimensionArgumentShapes.size()
++          << " and now we have " << nonDimensionArgumentTypes.size()
++          << " non-dimension arguments. ";
++    }
++    msg << "The differences among the first " << countShowNonDimensionArguments
++        << " non-dimension argument types are: ";
++    for (auto i = 0; i < countShowNonDimensionArguments; ++i) {
++      if (prevNonDimensionArgumentShapes[i] != nonDimensionArgumentTypes[i]) {
++        msg << "Non-dimension argument[" << i << "] previously had type "
++            << debugString(prevNonDimensionArgumentShapes[i])
++            << " and now has type " << debugString(nonDimensionArgumentTypes[i])
++            << ". ";
++      }
++    }
++    int countShowDimensionArguments =
++        std::min(prevDimensionArguments.size(), dimensionArguments.size());
++    if (prevDimensionArguments.size() != dimensionArguments.size()) {
++      msg << "Previous context had " << prevDimensionArguments.size()
++          << " and now we have " << dimensionArguments.size()
++          << " dimension arguments. ";
++    }
++    msg << "The differences among the first " << countShowDimensionArguments
++        << " dimension arguments are: ";
++    for (auto i = 0; i < countShowDimensionArguments; ++i) {
++      if (prevDimensionArguments[i] != dimensionArguments[i]) {
++        msg << "Dimension argument[" << i << "] previously was "
++            << prevDimensionArguments[i].getSExtValue() << " and now is "
++            << dimensionArguments[i].getSExtValue() << ". ";
++      }
++    }
++  }
++};
++
++// Refines a function.
++// Returns a LogicalResult: failure() if we encounter an error. Whether the
++// function had already been refined with the same refinement context is
++// reported by `RefineShapeState::validateFunctionRefinement`, not here.
++LogicalResult refineFunction(func::FuncOp func, MLIRContext* context,
++                             RefineShapeState* state,
++                             size_t nrPrefixTokenArguments,
++                             SmallVector<APSInt> dimensionArguments,
++                             SmallVector<Type> nonDimensionArgumentTypes);
+ 
+ // DenseElementsAttr can be constructed from ArrayRef<APInt> but not from
+ // ArrayRef<APSInt>. This helper bridges the gap.
+@@ -424,11 +602,10 @@
+       diag << "refineValues failed for " << types << ": expected "
+            << values.size() << " types, got " << types.size();
+     });
+-
+-  // Check whether `types` contain any new information with respect to existing
+-  // return types. Even if just a single dimension size out of an entire tensor
+-  // type got updated, using `inferMostSpecificType` ensures that we don't
+-  // miss that.
++  // Check whether `types` contain any new information with respect to
++  // existing return types. Even if just a single dimension size out of an
++  // entire tensor type got updated, using `inferMostSpecificType` ensures
++  // that we don't miss that.
+   bool needsRefinement = false;
+   SmallVector<Type> refinedTypes;
+   for (auto it : llvm::zip(values.getTypes(), types)) {
+@@ -468,11 +645,13 @@
+ 
+       // Simply changing operand type of `func.return` won't work because
+       // that won't update the FunctionType of the enclosing `func.func`.
+-      // Nonetheless, we still want to support these ops because they are widely
+-      // used in StableHLO programs (although the plan of record is to replace
+-      // `func.return` ops in StableHLO programs with `stablehlo.return`:
+-      // https://github.com/openxla/stablehlo/issues/425).
++      // Nonetheless, we still want to support these ops because they are
++      // widely used in StableHLO programs (although the plan of record is to
++      // replace `func.return` ops in StableHLO programs with
++      // `stablehlo.return`: https://github.com/openxla/stablehlo/issues/425).
+       if (isa<func::ReturnOp>(user)) continue;
++
++      if (isa<func::CallOp>(user)) continue;
+ 
+       // Unlike in TensorFlow's type inference pass, here we work only with
+       // allowlisted ops to focus our support on well-defined semantics of
+@@ -489,7 +668,8 @@
+     value.setType(refinedType);
+ 
+     // Special case: for `func.return`, guard the refinement with a cast
+-    // and leave propagation of the refined return type to a dedicated pattern.
++    // and leave propagation of the refined return type to a dedicated
++    // pattern.
+     auto isFuncReturn = [](OpOperand& use) -> bool {
+       return isa<func::ReturnOp>(use.getOwner());
+     };
+@@ -505,8 +685,8 @@
+ 
+ // Refines the return types of the given operation using the given types.
+ // This function also signals PatternRewriter that it needs to visit all the
+-// users of this op if any updates to its results have happened during execution
+-// of the function.
++// users of this op if any updates to its results have happened during
++// execution of the function.
+ LogicalResult refineReturnTypes(PatternRewriter& rewriter, Operation* op,
+                                 ArrayRef<Type> types) {
+   if (failed(refineValues(rewriter, op, op->getResults(), types)))
+@@ -528,12 +708,12 @@
+ //      traversal, and only then we apply the refinements. If there are other
+ //      types, then the corresponding refinements must be completely empty.
+ //   2) Encodings are not supported. In principle, TypeExtensions should be
+-//      supportable, but this needs careful thinking through. Given that no one
+-//      asked for support for bounded dynamism in this pass yet, this is left
+-//      for future work.
++//      supportable, but this needs careful thinking through. Given that no
++//      one asked for support for bounded dynamism in this pass yet, this is
++//      left for future work.
+ // This function also signals PatternRewriter that it needs to visit all the
+-// users of this op if any updates to its results have happened during execution
+-// of the function.
++// users of this op if any updates to its results have happened during
++// execution of the function.
+ LogicalResult refineReturnTypes(PatternRewriter& rewriter, Operation* op,
+                                 ArrayRef<ShapedTypeComponents> refinements) {
+   SmallVector<Type> flattenedTypes;
+@@ -623,8 +803,8 @@
+ 
+ // Refines the return type of the given operation using the given shape.
+ // This function also signals PatternRewriter that it needs to visit all the
+-// users of this op if any updates to its results have happened during execution
+-// of the function.
++// users of this op if any updates to its results have happened during
++// execution of the function.
+ template <typename OpType>
+ LogicalResult refineReturnShape(PatternRewriter& rewriter, OpType op,
+                                 ArrayRef<int64_t> shape) {
+@@ -633,8 +813,8 @@
+ 
+ // Refines the return type of the given operation using the given shape.
+ // This function also signals PatternRewriter that it needs to visit all the
+-// users of this op if any updates to its results have happened during execution
+-// of the function.
++// users of this op if any updates to its results have happened during
++// execution of the function.
+ template <typename OpType>
+ LogicalResult refineReturnShape(PatternRewriter& rewriter, OpType op,
+                                 Value shapeValue) {
+@@ -647,6 +827,52 @@
+   return refineReturnShape(rewriter, op, shape);
+ }
+ 
++// Dimension arguments are leading scalar constant arguments, optionally
++// preceded by some stablehlo.token arguments.
++SmallVector<APSInt> getDimensionArguments(func::CallOp callOp,
++                                          size_t* nrPrefixTokenArguments) {
++  *nrPrefixTokenArguments = 0;
++  SmallVector<Value> operands = callOp.getOperands();
++  SmallVector<APSInt> dimensionArguments;
++  for (size_t i = 0; i < operands.size(); ++i) {
++    if (i == *nrPrefixTokenArguments && isa<TokenType>(operands[i].getType())) {
++      (*nrPrefixTokenArguments)++;
++      continue;
++    }
++    RankedTensorType operandType =
++        dyn_cast<RankedTensorType>(operands[i].getType());
++    if (!operandType || operandType.getRank() != 0 ||
++        !operandType.getElementType().template isa<IntegerType>())
++      break;
++    SmallVector<APSInt> operand_int;
++    if (failed(hlo::matchInts(operands[i], operand_int))) {
++      break;
++    }
++    dimensionArguments.push_back(operand_int[0]);
++  }
++  return dimensionArguments;
++}
++
++std::optional<SmallVector<DenseIntElementsAttr>> isConstantFunction(
++    func::FuncOp func) {
++  LLVM_DEBUG(llvm::dbgs() << "check if " << func.getName()
++                          << " is a constant function\n");
++  SmallVector<DenseIntElementsAttr> returnedConstants;
++  func::ReturnOp ret = *func.getOps<func::ReturnOp>().begin();
++  bool isConstant = llvm::all_of(ret->getOperands(), [&](auto returnVal) {
++    DenseIntElementsAttr attr;
++    Operation* return_operand_def = returnVal.getDefiningOp();
++    if (return_operand_def &&
++        matchPattern(return_operand_def, m_Constant(&attr))) {
++      returnedConstants.push_back(attr);
++      return true;
++    }
++    return false;
++  });
++  if (isConstant) return returnedConstants;
++  return std::nullopt;
++}
++
+ struct RefineAllGatherOpPattern : public OpRewritePattern<AllGatherOp> {
+   using OpRewritePattern::OpRewritePattern;
+   LogicalResult matchAndRewrite(AllGatherOp op,
+@@ -655,9 +881,9 @@
+     if (!operandType.hasRank())
+       return rewriter.notifyMatchFailure(op, "expected ranked operand type");
+ 
+-    // This represents the cross_replica_and_partition process grouping strategy
+-    // that requires num_partitions to compute shardCount. Since we don't know
+-    // num_partitions at this point, we error out.
++    // This represents the cross_replica_and_partition process grouping
++    // strategy that requires num_partitions to compute shardCount. Since we
++    // don't know num_partitions at this point, we error out.
+     if (op.getChannelHandle() && !op.getUseGlobalDeviceIds())
+       return rewriter.notifyMatchFailure(op, "unsupported strategy");
+     DenseIntElementsAttr replicaGroups = op.getReplicaGroups();
+@@ -678,12 +904,11 @@
+     auto operandType = op.getOperand().getType();
+     if (!operandType.hasRank())
+       return rewriter.notifyMatchFailure(op, "expected ranked operand type");
+-
++    auto resultType = op.getType();
+     // If bit widths of the operand and the result are different, then
+     // operand and result shapes have different ranks.
+     // This complicates the logic quite a bit and is not needed to pass the
+     // current tests, so we leave this for future work.
+-    auto resultType = op.getType();
+     auto getBitWidthFn = [](ShapedType type) {
+       auto elementType = type.getElementType();
+       if (auto complexType = elementType.dyn_cast<ComplexType>())
+@@ -694,8 +919,77 @@
+     if (getBitWidthFn(operandType) != getBitWidthFn(resultType))
+       return rewriter.notifyMatchFailure(op, "unsupported bit width");
+ 
+-    return refineReturnShape(rewriter, op, operandType.getShape());
+-  }
++    auto res = refineReturnShape(rewriter, op, operandType.getShape());
++    if (failed(res)) return failure();
++    if (op.getOperand().getType() == op.getResult().getType()) {
++      LLVM_DEBUG({ llvm::dbgs() << "    ** remove no-op bitcast convert\n"; });
++      rewriter.replaceOp(op, op.getOperand());
++    }
++    return success();
++  }
++};
++
++struct RefineCallOpPattern : public OpRewritePattern<func::CallOp> {
++  using OpRewritePattern::OpRewritePattern;
++
++  RefineCallOpPattern(MLIRContext* context, RefineShapeState* state)
++      : OpRewritePattern<func::CallOp>(context), _state(state) {}
++
++  LogicalResult matchAndRewrite(func::CallOp op,
++                                PatternRewriter& rewriter) const override {
++    LLVM_DEBUG({ llvm::dbgs() << "refineCallOp " << debugString(op) << "\n"; });
++
++    // We have a number of prefix token arguments, then the dimension arguments
++    size_t nrPrefixTokenArguments = 0;
++    SmallVector<APSInt> dimensionArguments =
++        getDimensionArguments(op, &nrPrefixTokenArguments);
++    SmallVector<Type> nonDimensionArgumentTypes;
++    SmallVector<Value> nonDimensionArguments;
++    SmallVector<Value> operands = op.getOperands();
++    for (size_t i = 0; i < operands.size(); ++i) {
++      // Skip the dimension arguments.
++      if (i >= nrPrefixTokenArguments &&
++          i < nrPrefixTokenArguments + dimensionArguments.size()) {
++        continue;
++      }
++      nonDimensionArgumentTypes.push_back(operands[i].getType());
++      nonDimensionArguments.push_back(operands[i]);
++    }
++    FlatSymbolRefAttr calleeName = op.getCalleeAttr();
++    const SymbolTable symbolTable(op->getParentOfType<ModuleOp>());
++    func::FuncOp callee = dyn_cast<func::FuncOp>(
++        symbolTable.lookupNearestSymbolFrom(op, calleeName.getAttr()));
++    if (!callee)
++      return rewriter.notifyMatchFailure(
++          op, "cannot find callee in the current scope");
++    if (failed(refineFunction(callee, rewriter.getContext(), _state,
++                              nrPrefixTokenArguments, dimensionArguments,
++                              nonDimensionArgumentTypes)))
++      return failure();
++
++    // Is the callee a constant function in this refinement context?
++    std::optional<SmallVector<DenseIntElementsAttr>> constantAttrs =
++        isConstantFunction(callee);
++    if (constantAttrs.has_value()) {
++      SmallVector<Value> constants;
++      for (auto constAttr : constantAttrs.value()) {
++        constants.push_back(
++            rewriter.create<ConstantOp>(op.getLoc(), constAttr));
++      }
++      rewriter.replaceOp(op, constants);
++      return success();
++    }
++    if (!dimensionArguments.empty()) {
++      // Drop the dimension arguments, but only if necessary, or else we
++      // will end up trying to refine the new CallOp forever.
++      op = rewriter.replaceOpWithNewOp<func::CallOp>(
++          op, op.getResultTypes(), callee.getSymName(), nonDimensionArguments);
++    }
++    return refineReturnTypes(rewriter, op, callee.getResultTypes());
++  }
++
++ private:
++  RefineShapeState* _state;
+ };
+ 
+ struct RefineConvertOpPattern : public OpRewritePattern<ConvertOp> {
+@@ -844,12 +1138,98 @@
    }
  };
  
@@ -1712,8 +2525,9 @@ diff --ruN a/stablehlo/stablehlo/transforms/StablehloRefineShapes.cpp b/stablehl
 +      return rewriter.notifyMatchFailure(op, "expected constant output_shape");
 +
 +    // We only need to refine the shape of `output` (the second result).
-+    // The shape of `output_state` (the first result) is determined by the shape
-+    // of `initial_state`, so we ignore it and provide an empty refinement.
++    // The shape of `output_state` (the first result) is determined by the
++    // shape of `initial_state`, so we ignore it and provide an empty
++    // refinement.
 +    return refineReturnTypes(rewriter, op, {{initialStateType}, {outputShape}});
 +  }
 +};
@@ -1737,15 +2551,349 @@ diff --ruN a/stablehlo/stablehlo/transforms/StablehloRefineShapes.cpp b/stablehl
    }
  };
  
-@@ -1181,7 +1267,10 @@
-     patterns.add<RefineDynamicConvOpPattern>(&getContext());
-     patterns.add<RefineDynamicIotaOpPattern>(&getContext());
-     patterns.add<RefineDynamicPadOpPattern>(&getContext());
-+    patterns.add<RefineDynamicReduceWindowOpPattern>(&getContext());
-     patterns.add<RefineDynamicReshapeOpPattern>(&getContext());
-+    patterns.add<RefineDynamicRngBitGeneratorOpPattern>(&getContext());
-+    patterns.add<RefineDynamicTopKOpPattern>(&getContext());
-     patterns.add<RefineInferTypeOpInterfacePattern>(&getContext());
-     patterns.add<RefineRealDynamicSliceOpPattern>(&getContext());
-     patterns.add<RefineReduceScatterOpPattern>(&getContext());
+@@ -865,11 +1245,11 @@
+     if (!isa<chlo::ChloDialect, StablehloDialect>(op->getDialect()))
+       return rewriter.notifyMatchFailure(op, "unsupported dialect");
+ 
+-    // For the ops that implement InferTypeOpInterface, we reinfer their return
+-    // types and see what happens.
+-    // Operands of these ops might have been refined elsewhere (e.g. someone
+-    // might have updated argument types of a function) or earlier during this
+-    // pass, and this might enable refinement opportunities downstream.
++    // For the ops that implement InferTypeOpInterface, we reinfer their
++    // return types and see what happens. Operands of these ops might have
++    // been refined elsewhere (e.g. someone might have updated argument types
++    // of a function) or earlier during this pass, and this might enable
++    // refinement opportunities downstream.
+     SmallVector<Type> inferredReturnTypes;
+     if (failed(op.inferReturnTypes(getContext(), /*location=*/{},
+                                    op->getOperands(), op->getAttrDictionary(),
+@@ -925,8 +1305,8 @@
+           sliceSizesAttr.size(),
+           RankedTensorType::get({}, startIndicesElementType));
+ 
+-      // RealDynamicSliceOp can take tensors of integer or index element types.
+-      // DynamicSliceOp::slice_sizes only supports i64 element type.
++      // RealDynamicSliceOp can take tensors of integer or index element
++      // types. DynamicSliceOp::slice_sizes only supports i64 element type.
+       // Adapt accordingly in order to be compatible with inferDynamicSliceOp.
+       SmallVector<int64_t> sliceSizes;
+       for (auto element : sliceSizesAttr.getValues<APInt>()) {
+@@ -956,9 +1336,9 @@
+     if (!operandType.hasRank())
+       return rewriter.notifyMatchFailure(op, "expected ranked operand type");
+ 
+-    // This represents the cross_replica_and_partition process grouping strategy
+-    // that requires num_partitions to compute shardCount. Since we don't know
+-    // num_partitions at this point, we error out.
++    // This represents the cross_replica_and_partition process grouping
++    // strategy that requires num_partitions to compute shardCount. Since we
++    // don't know num_partitions at this point, we error out.
+     if (op.getChannelHandle() && !op.getUseGlobalDeviceIds())
+       return rewriter.notifyMatchFailure(op, "unsupported strategy");
+     DenseIntElementsAttr replicaGroups = op.getReplicaGroups();
+@@ -998,9 +1378,9 @@
+                                 PatternRewriter& rewriter) const override {
+     // Push the potentially refined operand types into the nested regions.
+     // This can lead to refinements of the return types of the body (but not
+-    // of the cond since it always returns tensor<i1>), but the key insight here
+-    // is that the enclosing while op doesn't care about these refinements
+-    // (because its return types are equal to its operand types).
++    // of the cond since it always returns tensor<i1>), but the key insight
++    // here is that the enclosing while op doesn't care about these
++    // refinements (because its return types are equal to its operand types).
+     // If we end up with incompatibilities between while's return types and
+     // body's return types, the verifier will tell us about that. This means
+     // that the original program wasn't well-formed. TODO(burmako): Implement
+@@ -1050,8 +1430,8 @@
+       if (failed(mostSpecificType) || destType == *mostSpecificType) continue;
+ 
+       // If the source type of the cast is more specific than the target type,
+-      // then we conclude that the cast is redundant (i.e. needs to be removed)
+-      // and that the return type of the function needs an update.
++      // then we conclude that the cast is redundant (i.e. needs to be
++      // removed) and that the return type of the function needs an update.
+       needsUpdate = true;
+       updatedResultTypes[i] = sourceType;
+ 
+@@ -1066,9 +1446,6 @@
+     for (auto cast : castsToReplace)
+       rewriter.replaceOp(cast, cast->getOperands());
+ 
+-    // If the type of the enclosing `func.func` needs an update, we simply
+-    // call setType. We can afford this simplicity because our algorithm
+-    // currently supports only one function per module.
+     auto func = cast<func::FuncOp>(op->getParentOp());
+     func.setType(
+         rewriter.getFunctionType(func.getArgumentTypes(), updatedResultTypes));
+@@ -1100,22 +1477,186 @@
+   }
+ };
+ 
++LogicalResult applyRewritePatterns(func::FuncOp func, MLIRContext* context,
++                                   RefineShapeState* state) {
++  // TODO(#1048): Find out why .maxIterations = 1 no longer works.
++  // There have been recent refactors to applyPatternsAndFoldGreedily
++  // upstream, and that might be the reason.
++  GreedyRewriteConfig config;
++  config.useTopDownTraversal = true;
++  config.enableRegionSimplification = true;
++  config.maxIterations = 2;
++  config.maxNumRewrites = GreedyRewriteConfig::kNoLimit;
++  config.strictMode = GreedyRewriteStrictness::AnyOp;
++
++  RewritePatternSet patterns(context);
++  patterns.add<EvalAddOpPattern>(context);
++  patterns.add<EvalAndOpPattern>(context);
++  patterns.add<EvalBroadcastInDimOpPattern>(context);
++  patterns.add<EvalClampOpPattern>(context);
++  patterns.add<EvalCompareOpPattern>(context);
++  patterns.add<EvalConcatenateOpPattern>(context);
++  patterns.add<EvalConvertOpPattern>(context);
++  patterns.add<EvalDivOpPattern>(context);
++  patterns.add<EvalGetDimensionSizeOpPattern>(context);
++  patterns.add<EvalMaxOpPattern>(context);
++  patterns.add<EvalMinOpPattern>(context);
++  patterns.add<EvalMulOpPattern>(context);
++  patterns.add<EvalRemOpPattern>(context);
++  patterns.add<EvalReshapeOpPattern>(context);
++  patterns.add<EvalSelectOpPattern>(context);
++  patterns.add<EvalSignOpPattern>(context);
++  patterns.add<EvalSliceOpPattern>(context);
++  patterns.add<EvalSubtractOpPattern>(context);
++  patterns.add<RefineAllGatherOpPattern>(context);
++  patterns.add<RefineBitcastConvertOpPattern>(context);
++  patterns.add<RefineCallOpPattern>(context, state);
++  patterns.add<RefineConvertOpPattern>(context);
++  patterns.add<RefineConvolutionOpPattern>(context);
++  patterns.add<RefineCustomCallOpPattern>(context);
++  patterns.add<RefineDotGeneralOpPattern>(context);
++  patterns.add<RefineDynamicBroadcastInDimOpPattern>(context);
++  patterns.add<RefineDynamicConvOpPattern>(context);
++  patterns.add<RefineDynamicIotaOpPattern>(context);
++  patterns.add<RefineDynamicPadOpPattern>(context);
++  patterns.add<RefineDynamicReduceWindowOpPattern>(context);
++  patterns.add<RefineDynamicReshapeOpPattern>(context);
++  patterns.add<RefineDynamicRngBitGeneratorOpPattern>(context);
++  patterns.add<RefineDynamicTopKOpPattern>(context);
++  patterns.add<RefineInferTypeOpInterfacePattern>(context);
++  patterns.add<RefineRealDynamicSliceOpPattern>(context);
++  patterns.add<RefineReduceScatterOpPattern>(context);
++  patterns.add<RefineRngOpPattern>(context);
++  patterns.add<RefineUniformQuantizeOpPattern>(context);
++  patterns.add<RefineWhileOpPattern>(context);
++  patterns.add<UpdateFunctionTypePattern>(context);
++  patterns.add<UpdateRegionTypePattern>(context);
++  if (failed(applyPatternsAndFoldGreedily(func, std::move(patterns), config))) {
++    func.emitOpError() << "applyPatternsAndFoldGreedily failed";
++    return failure();
++  }
++  return success();
++}
++
++LogicalResult refineFunction(func::FuncOp func, MLIRContext* context,
++                             RefineShapeState* state,
++                             size_t nrPrefixTokenArguments,
++                             SmallVector<APSInt> dimensionArguments,
++                             SmallVector<Type> nonDimensionArgumentTypes) {
++  // The nonDimensionArgumentTypes include the prefix token arguments.
++  LLVM_DEBUG({
++    llvm::dbgs() << "refineFunction " << func.getName() << ": initial type "
++                 << debugString(func.getFunctionType()) << "\n";
++    llvm::dbgs() << "   has " << nrPrefixTokenArguments << " prefix tokens\n";
++    for (size_t i = 0; i < dimensionArguments.size(); ++i) {
++      llvm::dbgs() << "   with dimension arg[" << i
++                   << "] = " << dimensionArguments[i] << "\n";
++    }
++  });
++  // Check that the argument types have static shapes.
++  for (size_t i = 0; i < nonDimensionArgumentTypes.size(); ++i) {
++    if (i < nrPrefixTokenArguments) continue;
++    auto argType = nonDimensionArgumentTypes[i];
++    if (isa<TokenType>(argType)) continue;
++    auto argRankedTensorType = dyn_cast<RankedTensorType>(argType);
++    if (!argRankedTensorType || !argRankedTensorType.hasStaticShape()) {
++      func.emitOpError() << func.getName()
++                         << " must be refined with static shape arguments. "
++                         << "Found argument of type " << debugString(argType);
++      return failure();
++    }
++  }
++  auto alreadyRefined = state->validateFunctionRefinement(
++      func, dimensionArguments, nonDimensionArgumentTypes);
++  if (failed(alreadyRefined)) {
++    return failure();
++  }
++  if (*alreadyRefined) {
++    LLVM_DEBUG({
++      llvm::dbgs() << "refineFunction " << func.getName()
++                   << ": skipping, already refined\n";
++    });
++    return success();
++  }
++  state->startFunctionRefinement(func, dimensionArguments,
++                                 nonDimensionArgumentTypes);
++  // Only one block per function is supported at the moment.
++  // At the StableHLO level, functions are expected to only have one block,
++  // so supporting more is out of scope for this pass.
++  if (!func.getRegion().hasOneBlock()) {
++    func.emitOpError() << "must have exactly one block";
++    return failure();
++  }
++
++  // Replace all dimension arguments with constants and remove those arguments.
++  // Wrap non-dimension arguments with bitcast_convert.
++  OpBuilder op_builder(func.getRegion());
++  op_builder.setInsertionPointToStart(&func.getRegion().front());
++  size_t firstNonDimensionArg =
++      nrPrefixTokenArguments + dimensionArguments.size();
++  for (size_t i = 0; i < func.getNumArguments(); ++i) {
++    BlockArgument arg = func.getArgument(i);
++    Type argType = arg.getType();
++    if (i < nrPrefixTokenArguments) {
++      continue;
++    }
++    if (i < firstNonDimensionArg) {
++      ShapedType argShapedType = dyn_cast<ShapedType>(argType);
++      if (!argShapedType) {
++        func.emitOpError() << "dimension arguments must have shaped types";
++        return failure();
++      }
++      // We will drop the dimension arguments, replace them with constants.
++      auto replacement_op = op_builder.create<stablehlo::ConstantOp>(
++          arg.getLoc(), argType,
++          getTensorAttr(argShapedType,
++                        dimensionArguments[i - nrPrefixTokenArguments]));
++      arg.replaceAllUsesWith(replacement_op);
++    } else {
++      int nonDimensionArgumentIndex =
++          nrPrefixTokenArguments + i - firstNonDimensionArg;
++      Type refinedType = nonDimensionArgumentTypes[nonDimensionArgumentIndex];
++      if (refinedType != argType) {
++        // We add BitcastConvertOp as the only uses of the non-dimension
++        // arguments to ensure the module stays valid after we set the argument
++        // type.
++        auto replacement_op = op_builder.create<stablehlo::BitcastConvertOp>(
++            arg.getLoc(), argType, arg);
++        arg.replaceAllUsesExcept(replacement_op->getResult(0), replacement_op);
++        arg.setType(refinedType);
++      }
++    }
++  }
++  BitVector argIndices(func.getNumArguments());
++  argIndices.set(nrPrefixTokenArguments, firstNonDimensionArg);
++  func.eraseArguments(argIndices);
++  func.setType(op_builder.getFunctionType(nonDimensionArgumentTypes,
++                                          func.getResultTypes()));
++  LLVM_DEBUG({
++    llvm::dbgs() << "refineFunction " << func.getName() << ": set type to "
++                 << func.getFunctionType() << "\n";
++  });
++  if (failed(applyRewritePatterns(func, context, state))) return failure();
++  LLVM_DEBUG({
++    llvm::dbgs() << "refineFunction " << func.getName() << ": end with type "
++                 << debugString(func.getFunctionType()) << "\n";
++  });
++  if (failed(state->finishFunctionRefinement(func))) return failure();
++  return success();
++}
++
+ struct StablehloRefineShapesPass
+     : public impl::StablehloRefineShapesPassBase<StablehloRefineShapesPass> {
+   using StablehloRefineShapesPassBase::StablehloRefineShapesPassBase;
+ 
+   void runOnOperation() override {
+-    // Only one function per module is supported at the moment to avoid the need
+-    // to think about iterative type inference algorithms.
+-    // Current use cases are served well by inlining multiple functions into
+-    // a single function, so we leave native support for multiple functions to
+-    // future work.
+     // To enable modules that contain CustomCallOp::called_computations,
+     // we allow multiple functions, in which case we only refine the main
+     // function called "main", assuming that the called computations will have
+     // static shapes. Lifting this assumption and expanding refinement to
+     // multiple functions is left for future work.
+     ModuleOp module = getOperation();
++    RefineShapeState state;
+     auto funcs = llvm::to_vector(module.getOps<func::FuncOp>());
+     if (funcs.empty()) return;
+     func::FuncOp func;
+@@ -1130,70 +1671,14 @@
+           << " function to clearly identify which function will be refined";
+       return signalPassFailure();
+     }
+-
+-    // Similarly, only one block per function is supported at the moment.
+-    // At the StableHLO level, functions are expected to only have one block,
+-    // so supporting more is out of scope for this pass.
+-    if (!func.getRegion().hasOneBlock()) {
+-      func.emitOpError() << "must have exactly one block";
++    SmallVector<APSInt> emptyDimensionArguments;
++    SmallVector<Type> nonDimensionArgumentTypes;
++    for (auto arg : func.getArguments())
++      nonDimensionArgumentTypes.push_back(arg.getType());
++    if (failed(refineFunction(func, &getContext(), &state, 0,
++                              emptyDimensionArguments,
++                              nonDimensionArgumentTypes)))
+       return signalPassFailure();
+-    }
+-
+-    // The algorithm behind this pass consists of a single traversal of the
+-    // function. This is sufficient because we only support one function per
+-    // program at the moment.
+-    // TODO(#1048): Find out why .maxIterations = 1 no longer works.
+-    // There have been recent refactors to applyPatternsAndFoldGreedily
+-    // upstream, and that might be the reason.
+-    GreedyRewriteConfig config;
+-    config.useTopDownTraversal = true;
+-    config.enableRegionSimplification = true;
+-    config.maxIterations = 2;
+-    config.maxNumRewrites = GreedyRewriteConfig::kNoLimit;
+-    config.strictMode = GreedyRewriteStrictness::AnyOp;
+-
+-    RewritePatternSet patterns(&getContext());
+-    patterns.add<EvalAddOpPattern>(&getContext());
+-    patterns.add<EvalAndOpPattern>(&getContext());
+-    patterns.add<EvalBroadcastInDimOpPattern>(&getContext());
+-    patterns.add<EvalClampOpPattern>(&getContext());
+-    patterns.add<EvalCompareOpPattern>(&getContext());
+-    patterns.add<EvalConcatenateOpPattern>(&getContext());
+-    patterns.add<EvalConvertOpPattern>(&getContext());
+-    patterns.add<EvalDivOpPattern>(&getContext());
+-    patterns.add<EvalGetDimensionSizeOpPattern>(&getContext());
+-    patterns.add<EvalMaxOpPattern>(&getContext());
+-    patterns.add<EvalMinOpPattern>(&getContext());
+-    patterns.add<EvalMulOpPattern>(&getContext());
+-    patterns.add<EvalRemOpPattern>(&getContext());
+-    patterns.add<EvalReshapeOpPattern>(&getContext());
+-    patterns.add<EvalSelectOpPattern>(&getContext());
+-    patterns.add<EvalSignOpPattern>(&getContext());
+-    patterns.add<EvalSliceOpPattern>(&getContext());
+-    patterns.add<EvalSubtractOpPattern>(&getContext());
+-    patterns.add<RefineAllGatherOpPattern>(&getContext());
+-    patterns.add<RefineBitcastConvertOpPattern>(&getContext());
+-    patterns.add<RefineConvertOpPattern>(&getContext());
+-    patterns.add<RefineConvolutionOpPattern>(&getContext());
+-    patterns.add<RefineCustomCallOpPattern>(&getContext());
+-    patterns.add<RefineDotGeneralOpPattern>(&getContext());
+-    patterns.add<RefineDynamicBroadcastInDimOpPattern>(&getContext());
+-    patterns.add<RefineDynamicConvOpPattern>(&getContext());
+-    patterns.add<RefineDynamicIotaOpPattern>(&getContext());
+-    patterns.add<RefineDynamicPadOpPattern>(&getContext());
+-    patterns.add<RefineDynamicReshapeOpPattern>(&getContext());
+-    patterns.add<RefineInferTypeOpInterfacePattern>(&getContext());
+-    patterns.add<RefineRealDynamicSliceOpPattern>(&getContext());
+-    patterns.add<RefineReduceScatterOpPattern>(&getContext());
+-    patterns.add<RefineRngOpPattern>(&getContext());
+-    patterns.add<RefineUniformQuantizeOpPattern>(&getContext());
+-    patterns.add<RefineWhileOpPattern>(&getContext());
+-    patterns.add<UpdateFunctionTypePattern>(&getContext());
+-    patterns.add<UpdateRegionTypePattern>(&getContext());
+-    if (failed(
+-            applyPatternsAndFoldGreedily(func, std::move(patterns), config))) {
+-      return signalPassFailure();
+-    }
+   }
+ };
+ 
 
diff --git a/third_party/xla/third_party/stablehlo/temporary.patch b/third_party/xla/third_party/stablehlo/temporary.patch
index 4c4228163a6f04..0cb078c3b89795 100644
--- a/third_party/xla/third_party/stablehlo/temporary.patch
+++ b/third_party/xla/third_party/stablehlo/temporary.patch
@@ -1426,7 +1426,353 @@ diff --ruN a/stablehlo/stablehlo/tests/stablehlo_canonicalize_dynamism.mlir b/st
 diff --ruN a/stablehlo/stablehlo/tests/stablehlo_refine_shapes.mlir b/stablehlo/stablehlo/tests/stablehlo_refine_shapes.mlir
 --- stablehlo/stablehlo/tests/stablehlo_refine_shapes.mlir
 +++ stablehlo/stablehlo/tests/stablehlo_refine_shapes.mlir
-@@ -607,12 +607,55 @@
+@@ -31,6 +31,7 @@
+ 
+ // -----
+ 
++// CHECK-LABEL: module @has_main
+ module @has_main {
+   // CHECK: main
+   func.func @main(%arg0: tensor<4xf32>) -> tensor<*xi32> {
+@@ -38,17 +39,11 @@
+     %0 = stablehlo.bitcast_convert %arg0 : (tensor<4xf32>) -> tensor<*xi32>
+     func.return %0 : tensor<*xi32>
+   }
+-
+-  // CHECK: helper
+-  func.func @helper(%arg0: tensor<4xf32>) -> tensor<*xi32> {
+-    // CHECK: stablehlo.bitcast_convert{{.*}} -> tensor<*xi32>
+-    %0 = stablehlo.bitcast_convert %arg0 : (tensor<4xf32>) -> tensor<*xi32>
+-    func.return %0 : tensor<*xi32>
+-  }
+-}
+-
+-// -----
+-
++}
++
++// -----
++
++// CHECK-LABEL: func @error_unsupported_operation
+ func.func @error_unsupported_operation(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>) -> index {
+   // CHECK: stablehlo.add{{.*}} -> tensor<?xf32>
+   %0 = stablehlo.add %arg0, %arg1 : (tensor<4xf32>, tensor<4xf32>) -> tensor<?xf32>
+@@ -472,11 +467,312 @@
+ 
+ // -----
+ 
+-// CHECK-LABEL: func @refine_bitcast_convert_same_bitwidth
+-func.func @refine_bitcast_convert_same_bitwidth(%arg0 : tensor<4xf32>) -> tensor<*xi32> {
++// CHECK-LABEL: func @refine_bitcast_convert_same_bitwidth_unranked_result
++func.func @refine_bitcast_convert_same_bitwidth_unranked_result(%arg0 : tensor<4xf32>) -> tensor<*xi32> {
+   // CHECK: stablehlo.bitcast_convert{{.*}} -> tensor<4xi32>
+   %0 = stablehlo.bitcast_convert %arg0 : (tensor<4xf32>) -> tensor<*xi32>
+   func.return %0 : tensor<*xi32>
++}
++
++// -----
++
++// CHECK-LABEL: func @refine_bitcast_convert_same_bitwidth
++func.func @refine_bitcast_convert_same_bitwidth() -> tensor<?x?x0xf32> {
++  %0 = stablehlo.constant dense<[3, 5, 0]> : tensor<3xi32>
++  %21 = stablehlo.dynamic_iota %0, dim = 0 : (tensor<3xi32>) -> tensor<?x?x0xui32>
++  // CHECK: stablehlo.bitcast_convert{{.*}} -> tensor<3x5x0xf32>
++  %48 = stablehlo.bitcast_convert %21 : (tensor<?x?x0xui32>) -> tensor<?x?x0xf32>
++  return %48 : tensor<?x?x0xf32>
++}
++
++// -----
++
++// CHECK-LABEL: module @refine_call
++module @refine_call {
++  func.func @main(%arg1: tensor<4xf32>) -> tensor<?xf32> {
++    %0 = stablehlo.bitcast_convert %arg1 : (tensor<4xf32>) -> tensor<?xf32>
++    %1 = stablehlo.constant dense<4> : tensor<i32>
++    // CHECK: refine_call_callee{{.*}}-> tensor<4xf32>
++    %2 = call @refine_call_callee(%1, %0) : (tensor<i32>, tensor<?xf32>) -> tensor<?xf32>
++    return %2 : tensor<?xf32>
++  }
++  // CHECK: refine_call_callee(%arg0: tensor<4xf32>) -> tensor<4xf32>
++  func.func @refine_call_callee(%arg0: tensor<i32>, %arg1: tensor<?xf32>) -> tensor<?xf32> {
++    // CHECK: stablehlo.constant dense<4>
++    %0 = stablehlo.reshape %arg0 : (tensor<i32>) -> tensor<1xi32>
++    %1 = stablehlo.dynamic_iota %0, dim = 0 : (tensor<1xi32>) -> tensor<?xf32>
++    return %1 : tensor<?xf32>
++  }
++}
++
++// -----
++
++// CHECK-LABEL: module @refine_call_dimension_arguments
++module @refine_call_dimension_arguments {
++  func.func public @main(%arg0: tensor<i32>) -> tensor<i32> {
++    // CHECK: [[RESULT:%.*]] = call @callee
++    // CHECK: return [[RESULT]]
++    %0 = stablehlo.constant dense<3> : tensor<i32>
++    %1 = call @callee(%0, %0, %arg0) : (tensor<i32>, tensor<i32>, tensor<i32>) -> tensor<i32>
++    return %1 : tensor<i32>
++  }
++  // %arg0 and %arg1 are dimension arguments
++  // CHECK: @callee([[ARG0:%.*]]: tensor<i32>) -> tensor<i32>
++  func.func private @callee(%arg0: tensor<i32>, %arg1: tensor<i32>, %arg2: tensor<i32>) -> tensor<i32> {
++    // CHECK: [[RESULT0:%.*]] = stablehlo.constant dense<6>
++    // CHECK: [[RESULT1:%.*]] = stablehlo.add [[RESULT0]], [[ARG0]]
++    // CHECK: return [[RESULT1]]
++    %0 = stablehlo.add %arg0, %arg1: tensor<i32>
++    %1 = stablehlo.add %0, %arg2: tensor<i32>
++    return %1 : tensor<i32>
++  }
++}
++
++// -----
++
++// CHECK-LABEL: module @refine_call_prefix_token_and_dimension_arguments
++module @refine_call_prefix_token_and_dimension_arguments {
++  func.func public @main(%arg0: tensor<i32>) -> tensor<i32> {
++    // CHECK: [[RESULT:%.*]] = call @callee
++    // CHECK: return [[RESULT]]
++    %0 = stablehlo.constant dense<3> : tensor<i32>
++    %token = stablehlo.create_token : !stablehlo.token
++    %1 = call @callee(%token, %0, %0, %arg0) : (!stablehlo.token, tensor<i32>, tensor<i32>, tensor<i32>) -> tensor<i32>
++    return %1 : tensor<i32>
++  }
++  // %arg0 and %arg1 are dimension arguments
++  // CHECK: @callee([[ARG_TOKEN:%.*]]: !stablehlo.token, [[ARG0:%.*]]: tensor<i32>
++  func.func private @callee(%arg_token: !stablehlo.token, %arg0: tensor<i32>, %arg1: tensor<i32>, %arg2: tensor<i32>) -> tensor<i32> {
++    // CHECK: [[RESULT0:%.*]] = stablehlo.constant dense<6>
++    // CHECK: [[RESULT1:%.*]] = stablehlo.add [[RESULT0]], [[ARG0]]
++    // CHECK: return [[RESULT1]]
++    %0 = stablehlo.add %arg0, %arg1: tensor<i32>
++    %1 = stablehlo.add %0, %arg2: tensor<i32>
++    return %1 : tensor<i32>
++  }
++}
++
++// -----
++
++// CHECK-LABEL: module @refine_call_dimension_arguments_followed_by_token
++module @refine_call_dimension_arguments_followed_by_token {
++  func.func public @main(%arg0: tensor<i32>) -> tensor<i32> {
++    // CHECK: [[RESULT:%.*]] = call @callee
++    // CHECK: return [[RESULT]]
++    %0 = stablehlo.constant dense<3> : tensor<i32>
++    %token = stablehlo.create_token : !stablehlo.token
++    %1 = call @callee(%0, %0, %token, %arg0) : (tensor<i32>, tensor<i32>, !stablehlo.token, tensor<i32>) -> tensor<i32>
++    return %1 : tensor<i32>
++  }
++  // %arg0 and %arg1 are dimension arguments
++  // CHECK: @callee([[ARG_TOKEN:%.*]]: !stablehlo.token, [[ARG0:%.*]]: tensor<i32>
++  func.func private @callee(%arg0: tensor<i32>, %arg1: tensor<i32>, %arg_token: !stablehlo.token, %arg2: tensor<i32>) -> tensor<i32> {
++    // CHECK: [[RESULT0:%.*]] = stablehlo.constant dense<6>
++    // CHECK: [[RESULT1:%.*]] = stablehlo.add [[RESULT0]], [[ARG0]]
++    // CHECK: return [[RESULT1]]
++    %0 = stablehlo.add %arg0, %arg1: tensor<i32>
++    %1 = stablehlo.add %0, %arg2: tensor<i32>
++    return %1 : tensor<i32>
++  }
++}
++
++// -----
++
++// CHECK-LABEL: module @refine_multiple_call_with_same_context
++module @refine_multiple_call_with_same_context {
++  func.func @main(%arg1: tensor<4xf32>) -> tensor<?xf32> {
++    %0 = stablehlo.bitcast_convert %arg1 : (tensor<4xf32>) -> tensor<?xf32>
++    %arg0_new = "stablehlo.get_dimension_size"(%0) {dimension = 0 : i64} : (tensor<?xf32>) -> tensor<i32>
++    // CHECK: refine_call_callee{{.*}}-> tensor<4xf32>
++    %1 = call @refine_call_callee(%arg0_new, %0) : (tensor<i32>, tensor<?xf32>) -> tensor<?xf32>
++    // CHECK: refine_call_callee{{.*}}-> tensor<4xf32>
++    %2 = call @refine_call_callee(%arg0_new, %1) : (tensor<i32>, tensor<?xf32>) -> tensor<?xf32>
++    return %2 : tensor<?xf32>
++  }
++  // CHECK: refine_call_callee{{.*}}-> tensor<4xf32>
++  func.func @refine_call_callee(%arg0: tensor<i32>, %arg1: tensor<?xf32>) -> tensor<?xf32> {
++    return %arg1 : tensor<?xf32>
++  }
++}
++
++// -----
++
++// CHECK-LABEL: module @refine_multiple_call_constant_function
++module @refine_multiple_call_constant_function {
++  func.func @main(%arg0: tensor<5xf32>) -> tensor<i32> {
++    // CHECK: [[RESULT0:%.*]] = stablehlo.constant dense<16>
++    // CHECK: return [[RESULT0]]
++    %0 = stablehlo.constant dense<4> : tensor<i32>
++    %1 = call @refine_call_callee(%0, %arg0) : (tensor<i32>, tensor<5xf32>) -> tensor<i32>
++    %2 = call @refine_call_callee(%0, %arg0) : (tensor<i32>, tensor<5xf32>) -> tensor<i32>
++    %3 = stablehlo.add %1, %2: tensor<i32>
++    return %3 : tensor<i32>
++  }
++  func.func @refine_call_callee(%arg0: tensor<i32>, %arg1: tensor<5xf32>) -> tensor<i32> {
++    // CHECK: [[RESULT1:%.*]] = stablehlo.constant dense<8>
++    // CHECK: return [[RESULT1]]
++    %0 = stablehlo.add %arg0, %arg0: tensor<i32>
++    return %0 : tensor<i32>
++  }
++}
++
++// -----
++
++module @refine_call_multiple_with_different_number_dimension_arguments {
++  func.func @main(%arg1: tensor<4xf32>) -> tensor<?xf32> {
++    %0 = stablehlo.bitcast_convert %arg1 : (tensor<4xf32>) -> tensor<?xf32>
++    %arg0_new = "stablehlo.get_dimension_size"(%0) {dimension = 0 : i64} : (tensor<?xf32>) -> tensor<i32>
++    %1 = call @refine_call_callee(%arg0_new, %0) : (tensor<i32>, tensor<?xf32>) -> tensor<?xf32>
++    // Ensure that the first argument is not a constant at the second call site
++    %arg0_different_f32 = stablehlo.bitcast_convert %arg0_new : (tensor<i32>) -> tensor<f32>
++    %arg0_different_i32 = stablehlo.bitcast_convert %arg0_different_f32 : (tensor<f32>) -> tensor<i32>
++    // expected-error@+1{{incorrect number of operands for callee}}
++    %2 = call @refine_call_callee(%arg0_different_i32, %1) : (tensor<i32>, tensor<?xf32>) -> tensor<?xf32>
++    return %2 : tensor<?xf32>
++  }
++  // expected-error@+1{{Function refine_call_callee has already been refined with a different refinement context. Previous context had 1 and now we have 2 non-dimension arguments}}
++  func.func @refine_call_callee(%arg0: tensor<i32>, %arg1: tensor<?xf32>) -> tensor<?xf32> {
++    return %arg1 : tensor<?xf32>
++  }
++}
++
++// -----
++
++module @refine_call_multiple_different_dimension_arguments {
++  func.func @main(%arg1: tensor<4xf32>) -> tensor<?xf32> {
++    %0 = stablehlo.bitcast_convert %arg1 : (tensor<4xf32>) -> tensor<?xf32>
++    %arg0_new = "stablehlo.get_dimension_size"(%0) {dimension = 0 : i64} : (tensor<?xf32>) -> tensor<i32>
++    %1 = call @refine_call_callee(%arg0_new, %0) : (tensor<i32>, tensor<?xf32>) -> tensor<?xf32>
++    %arg0_different = stablehlo.add %arg0_new, %arg0_new : tensor<i32>
++    // expected-error@+1{{incorrect number of operands for callee}}
++    %2 = call @refine_call_callee(%arg0_different, %1) : (tensor<i32>, tensor<?xf32>) -> tensor<?xf32>
++    return %2 : tensor<?xf32>
++  }
++  // expected-error@+1{{Function refine_call_callee has already been refined with a different refinement context.}}
++  func.func @refine_call_callee(%arg0: tensor<i32>, %arg1: tensor<?xf32>) -> tensor<?xf32> {
++    return %arg1 : tensor<?xf32>
++  }
++}
++
++// -----
++
++module @refine_call_multiple_different_non_dimension_arguments {
++  func.func @main(%arg1: tensor<4xf32>) -> tensor<?xf32> {
++    %0 = stablehlo.bitcast_convert %arg1 : (tensor<4xf32>) -> tensor<?xf32>
++    %arg0_new = "stablehlo.get_dimension_size"(%0) {dimension = 0 : i64} : (tensor<?xf32>) -> tensor<i32>
++    %1 = call @refine_call_callee(%arg0_new, %0) : (tensor<i32>, tensor<?xf32>) -> tensor<?xf32>
++    %2 = stablehlo.constant dense<[1., 2.]> : tensor<2xf32>
++    %3 = stablehlo.concatenate %1, %2, dim = 0 : (tensor<?xf32>, tensor<2xf32>) -> tensor<?xf32>
++    // expected-error@+1{{incorrect number of operands for callee}}
++    %4 = call @refine_call_callee(%arg0_new, %3) : (tensor<i32>, tensor<?xf32>) -> tensor<?xf32>
++    return %4 : tensor<?xf32>
++  }
++  // expected-error@+1{{Function refine_call_callee has already been refined with a different refinement context.}}
++  func.func @refine_call_callee(%arg0: tensor<i32>, %arg1: tensor<?xf32>) -> tensor<?xf32> {
++    return %arg1 : tensor<?xf32>
++  }
++}
++
++// -----
++
++module @refine_call_recursive {
++  func.func @main() -> tensor<i32> {
++    %0 = stablehlo.constant dense<3> : tensor<i32>
++    %1 = call @refine_call_callee(%0) : (tensor<i32>) -> tensor<i32>
++    return %1 : tensor<i32>
++  }
++  // expected-error@+1{{Function refine_call_callee is being refined recursively}}
++  func.func @refine_call_callee(%arg0: tensor<i32>) -> tensor<i32> {
++    // expected-error@+1{{incorrect number of operands}}
++    %0 = call @refine_call_callee(%arg0) : (tensor<i32>) -> tensor<i32>
++    return %0 : tensor<i32>
++  }
++}
++
++// -----
++
++module @refine_call_main_argument_unranked {
++  // expected-error@+1{{main must be refined with static shape arguments}}
++  func.func public @main(%arg0: tensor<*xi32>) -> tensor<*xi32> {
++    %2 = call @callee(%arg0) : (tensor<*xi32>) -> tensor<*xi32>
++    return %2 : tensor<*xi32>
++  }
++  func.func private @callee(%arg0: tensor<*xi32>) -> tensor<*xi32> {
++    return %arg0 : tensor<*xi32>
++  }
++}
++
++// -----
++
++module @refine_call_main_argument_dynamic_shape {
++  // expected-error@+1{{main must be refined with static shape arguments}}
++  func.func public @main(%arg0: tensor<?xi32>) -> tensor<?xi32> {
++    %2 = call @callee(%arg0) : (tensor<?xi32>) -> tensor<?xi32>
++    return %2 : tensor<?xi32>
++  }
++  func.func private @callee(%arg0: tensor<?xi32>) -> tensor<?xi32> {
++    return %arg0 : tensor<?xi32>
++  }
++}
++
++// -----
++
++module @refine_call_callee_argument_unranked {
++  func.func public @main(%arg0: tensor<1xi64>) -> tensor<*xi32> {
++    %1 = stablehlo.dynamic_iota %arg0, dim = 0 : (tensor<1xi64>) -> tensor<*xi32>
++    %2 = call @callee(%1) : (tensor<*xi32>) -> tensor<*xi32>
++    return %2 : tensor<*xi32>
++  }
++  // expected-error@+1{{callee must be refined with static shape arguments}}
++  func.func private @callee(%arg0: tensor<*xi32>) -> tensor<*xi32> {
++    return %arg0 : tensor<*xi32>
++  }
++}
++
++// -----
++
++module @refine_call_callee_argument_dynamic_shape {
++  func.func public @main(%arg0: tensor<1xi64>) -> tensor<?xi32> {
++    %1 = stablehlo.dynamic_iota %arg0, dim = 0 : (tensor<1xi64>) -> tensor<?xi32>
++    %2 = call @callee(%1) : (tensor<?xi32>) -> tensor<?xi32>
++    return %2 : tensor<?xi32>
++  }
++  // expected-error@+1{{callee must be refined with static shape arguments}}
++  func.func private @callee(%arg0: tensor<?xi32>) -> tensor<?xi32> {
++    return %arg0 : tensor<?xi32>
++  }
++}
++
++// -----
++
++// CHECK-LABEL: module @refine_call_dimension_argument_non_scalar
++// The non-scalar constant is not folded into the callee
++module @refine_call_dimension_argument_non_scalar {
++  func.func public @main() -> tensor<4xi32> {
++    // CHECK: dense<[1, 2, 3, 4]> : tensor<4xi32>
++    %0 = stablehlo.constant dense<[1, 2, 3, 4]> : tensor<4xi32>
++    %1 = call @callee(%0) : (tensor<4xi32>) -> tensor<4xi32>
++    return %1 : tensor<4xi32>
++  }
++  func.func private @callee(%arg0: tensor<4xi32>) -> tensor<4xi32> {
++    // CHECK: return %arg0 : tensor<4xi32>
++    return %arg0 : tensor<4xi32>
++  }
++}
++
++// -----
++
++// CHECK-LABEL: module @refine_call_dimension_argument_not_integer
++module @refine_call_dimension_argument_not_integer {
++  func.func public @main() -> tensor<f32> {
++    %0 = stablehlo.constant dense<3.> : tensor<f32>
++    // CHECK: call @callee({{.*}}) : (tensor<f32>) -> tensor<f32>
++    %2 = call @callee(%0) : (tensor<f32>) -> tensor<f32>
++    return %2 : tensor<f32>
++  }
++  func.func private @callee(%arg0: tensor<f32>) -> tensor<f32> {
++    return %arg0 : tensor<f32>
++  }
+ }
+ 
+ // -----
+@@ -607,12 +903,55 @@
  
  // -----
  
@@ -1631,7 +1977,74 @@ diff --ruN a/stablehlo/stablehlo/transforms/StablehloCanonicalizeDynamism.cpp b/
 diff --ruN a/stablehlo/stablehlo/transforms/StablehloRefineShapes.cpp b/stablehlo/stablehlo/transforms/StablehloRefineShapes.cpp
 --- stablehlo/stablehlo/transforms/StablehloRefineShapes.cpp
 +++ stablehlo/stablehlo/transforms/StablehloRefineShapes.cpp
-@@ -43,6 +43,7 @@
+@@ -11,9 +11,48 @@
+ See the License for the specific language governing permissions and
+ limitations under the License.
+ ==============================================================================*/
+-
++/*
++This shape refinement pass was designed to resolve the dynamic shapes in
++a StableHLO module produced by JAX serialization with shape polymorphism.
++Such a module has the following properties:
++
++  * it contains a "main" function with statically-shaped arguments;
++    the result types may be dynamically shaped.
++  * all the dynamic shapes depend only on the input shapes (no shape
++    dependency on the input array contents). We refer to the operations that
++    depend transitively only on the input shapes (e.g., as given by
++    `stablehlo.get_dimension_size`) as `dimension` operations.
++    All dimension values can be resolved to constants through inter-procedural
++    constant folding.
++  * intermediate functions may take a number of token arguments (of type
++    !stablehlo.token) at the start of the argument list, followed by some
++    dimension arguments (integer scalars).
++  * some intermediate functions may return dimension values.
++    E.g., the `floordiv` operation on dimension values may be implemented
++    using intermediate functions. These constant functions need to be
++    constant-folded.
++  * All the dynamic shapes can be resolved through shape inference from the
++    dimension values. The dimension values themselves do not depend on the
++    result of shape inference.
++
++
++For each intermediate function we compute a refinement context, including
++the values of the dimension arguments and the static shapes of the other
++arguments. We compute the refinement context when we encounter a function call,
++and then we refine the callee recursively. We abort in the presence of
++recursive calls.
++We also abort if a function is called with multiple distinct refinement
++contexts.
++
++After refinement, all operations should have static shapes, all calls to
++constant functions are replaced with constants, and all dimension arguments
++for intermediate functions are dropped and are replaced with constants.
++*/
++#include <algorithm>
+ #include <cstdint>
+ #include <memory>
++#include <optional>
++#include <set>
+ #include <string>
+ #include <utility>
+ 
+@@ -24,8 +63,10 @@
+ #include "llvm/ADT/SmallSet.h"
+ #include "llvm/ADT/SmallVector.h"
+ #include "llvm/ADT/StringRef.h"
++#include "llvm/Support/Debug.h"
+ #include "llvm/Support/ErrorHandling.h"
+ #include "llvm/Support/FormatVariadic.h"
++#include "llvm/Support/ScopedPrinter.h"
+ #include "mlir/Dialect/Func/IR/FuncOps.h"
+ #include "mlir/IR/BuiltinAttributes.h"
+ #include "mlir/IR/BuiltinOps.h"
+@@ -39,10 +80,13 @@
+ #include "mlir/IR/Types.h"
+ #include "mlir/IR/Value.h"
+ #include "mlir/Interfaces/InferTypeOpInterface.h"
++#include "mlir/Support/DebugStringHelper.h"
+ #include "mlir/Support/LogicalResult.h"
++#include "mlir/Support/LLVM.h"
  #include "mlir/Transforms/GreedyPatternRewriteDriver.h"
  #include "stablehlo/dialect/Base.h"
  #include "stablehlo/dialect/ChloOps.h"
@@ -1639,7 +2052,407 @@ diff --ruN a/stablehlo/stablehlo/transforms/StablehloRefineShapes.cpp b/stablehl
  #include "stablehlo/dialect/StablehloOps.h"
  #include "stablehlo/dialect/TypeInference.h"
  #include "stablehlo/transforms/Passes.h"
-@@ -844,12 +845,97 @@
+@@ -50,10 +94,144 @@
+ namespace mlir {
+ namespace stablehlo {
+ 
++#define DEBUG_TYPE "stablehlo-refine-shapes"
++
+ #define GEN_PASS_DEF_STABLEHLOREFINESHAPESPASS
+ #include "stablehlo/transforms/Passes.h.inc"
+ 
+ namespace {
++
++// Per-module state for shape refinement.
++class RefineShapeState {
++ public:
++  // Validates that we are not attempting to refine a function with a different
++  // context than previously, and are not attempting recursive refinement.
++  // Returns failure() if validation fails. On success, returns a boolean
++  // that specifies whether the function has already been refined.
++  FailureOr<bool> validateFunctionRefinement(
++      func::FuncOp func, SmallVector<APSInt> dimensionArguments,
++      SmallVector<Type> nonDimensionArgumentTypes) {
++    StringRef funcName = func.getName();
++    auto found = refinementContexts.find(func);
++    if (found == refinementContexts.end()) {
++      return false;  // not already refined.
++    }
++    auto prevDimensionArguments = std::get<0>(found->second);
++    auto prevNonDimensionArgumentTypes = std::get<1>(found->second);
++    // Since we refine until fixed point, we will refine a call to a function
++    // both for the original function and for the refined one. In the latter
++    // case, we should have empty dimensionArguments but the same
++    // nonDimensionArgumentTypes.
++    if (prevNonDimensionArgumentTypes != nonDimensionArgumentTypes ||
++        (!dimensionArguments.empty() &&
++         prevDimensionArguments != dimensionArguments)) {
++      emitDifferentRefinementContextError(
++          func, /*dimensionArguments=*/dimensionArguments,
++          /*nonDimensionArgumentTypes=*/nonDimensionArgumentTypes,
++          /*prevDimensionArguments=*/prevDimensionArguments,
++          /*prevNonDimensionArgumentShapes=*/prevNonDimensionArgumentTypes);
++      return failure();
++    }
++    for (auto funcOnStack : functionsBeingRefined) {
++      if (funcOnStack == funcName) {
++        func.emitOpError() << "Function " << funcName
++                           << " is being refined recursively\n";
++        return failure();
++      }
++    }
++    return true;  // already refined.
++  }
++
++  // Updates the state to signal the starting of a function refinement.
++  // Callers must call `finishFunctionRefinement` when done.
++  void startFunctionRefinement(func::FuncOp func,
++                               SmallVector<APSInt> dimensionArguments,
++                               SmallVector<Type> nonDimensionArgumentTypes) {
++    StringRef funcName = func.getName();
++    functionsBeingRefined.push_back(funcName);
++    refinementContexts[func] =
++        std::make_tuple(dimensionArguments, nonDimensionArgumentTypes);
++  }
++
++  // Updates the state to signal the end of a function refinement.
++  LogicalResult finishFunctionRefinement(func::FuncOp func) {
++    if (func.getName() !=
++        functionsBeingRefined[functionsBeingRefined.size() - 1]) {
++      func.emitOpError() << "Expected to find " << func.getName()
++                         << " at the top of the stack";
++      return failure();
++    }
++    functionsBeingRefined.pop_back();
++    return success();
++  }
++
++ private:
++  // Maps refined functions to the refinement context: the values of dimension
++  // arguments and the types of non-dimension arguments. A function is added
++  // here when we start refining it.
++  DenseMap<func::FuncOp, std::tuple<SmallVector<APSInt>, SmallVector<Type>>>
++      refinementContexts;
++
++  // A stack of functions that are in the process of being refined, the current
++  // one is last.
++  SmallVector<llvm::StringRef> functionsBeingRefined;
++
++  void emitDifferentRefinementContextError(
++      func::FuncOp func, SmallVector<APSInt> dimensionArguments,
++      SmallVector<Type> nonDimensionArgumentTypes,
++      SmallVector<APSInt> prevDimensionArguments,
++      SmallVector<Type> prevNonDimensionArgumentShapes) {
++    InFlightDiagnostic msg = func.emitOpError();
++    msg << "Function " << func.getName()
++        << " has already been refined with a different "
++           "refinement context. ";
++    int countShowNonDimensionArguments =
++        std::min(prevNonDimensionArgumentShapes.size(),
++                 nonDimensionArgumentTypes.size());
++    if (prevNonDimensionArgumentShapes.size() !=
++        nonDimensionArgumentTypes.size()) {
++      msg << "Previous context had " << prevNonDimensionArgumentShapes.size()
++          << " and now we have " << nonDimensionArgumentTypes.size()
++          << " non-dimension arguments. ";
++    }
++    msg << "The differences among the first " << countShowNonDimensionArguments
++        << " non-dimension argument types are: ";
++    for (auto i = 0; i < countShowNonDimensionArguments; ++i) {
++      if (prevNonDimensionArgumentShapes[i] != nonDimensionArgumentTypes[i]) {
++        msg << "Non-dimension argument[" << i << "] previously had type "
++            << debugString(prevNonDimensionArgumentShapes[i])
++            << " and now has type " << debugString(nonDimensionArgumentTypes[i])
++            << ". ";
++      }
++    }
++    int countShowDimensionArguments =
++        std::min(prevDimensionArguments.size(), dimensionArguments.size());
++    if (prevDimensionArguments.size() != dimensionArguments.size()) {
++      msg << "Previous context had " << prevDimensionArguments.size()
++          << " and now we have " << dimensionArguments.size()
++          << " dimension arguments. ";
++    }
++    msg << "The differences among the first " << countShowDimensionArguments
++        << " dimension arguments are: ";
++    for (auto i = 0; i < countShowDimensionArguments; ++i) {
++      if (prevDimensionArguments[i] != dimensionArguments[i]) {
++        msg << "Dimension argument[" << i << "] previously was "
++            << prevDimensionArguments[i].getSExtValue() << " and now is "
++            << dimensionArguments[i].getSExtValue() << ". ";
++      }
++    }
++  }
++};
++
++// Refines a function.
++// The LogicalResult does not tell callers whether the function had already
++// been processed with the same refinement context or was refined for the
++// first time. Returns failure() if we encounter an error.
++LogicalResult refineFunction(func::FuncOp func, MLIRContext* context,
++                             RefineShapeState* state,
++                             size_t nrPrefixTokenArguments,
++                             SmallVector<APSInt> dimensionArguments,
++                             SmallVector<Type> nonDimensionArgumentTypes);
+ 
+ // DenseElementsAttr can be constructed from ArrayRef<APInt> but not from
+ // ArrayRef<APSInt>. This helper bridges the gap.
+@@ -424,11 +602,10 @@
+       diag << "refineValues failed for " << types << ": expected "
+            << values.size() << " types, got " << types.size();
+     });
+-
+-  // Check whether `types` contain any new information with respect to existing
+-  // return types. Even if just a single dimension size out of an entire tensor
+-  // type got updated, using `inferMostSpecificType` ensures that we don't
+-  // miss that.
++  // Check whether `types` contain any new information with respect to
++  // existing return types. Even if just a single dimension size out of an
++  // entire tensor type got updated, using `inferMostSpecificType` ensures
++  // that we don't miss that.
+   bool needsRefinement = false;
+   SmallVector<Type> refinedTypes;
+   for (auto it : llvm::zip(values.getTypes(), types)) {
+@@ -468,11 +645,13 @@
+ 
+       // Simply changing operand type of `func.return` won't work because
+       // that won't update the FunctionType of the enclosing `func.func`.
+-      // Nonetheless, we still want to support these ops because they are widely
+-      // used in StableHLO programs (although the plan of record is to replace
+-      // `func.return` ops in StableHLO programs with `stablehlo.return`:
+-      // https://github.com/openxla/stablehlo/issues/425).
++      // Nonetheless, we still want to support these ops because they are
++      // widely used in StableHLO programs (although the plan of record is to
++      // replace `func.return` ops in StableHLO programs with
++      // `stablehlo.return`: https://github.com/openxla/stablehlo/issues/425).
+       if (isa<func::ReturnOp>(user)) continue;
++
++      if (isa<func::CallOp>(user)) continue;
+ 
+       // Unlike in TensorFlow's type inference pass, here we work only with
+       // allowlisted ops to focus our support on well-defined semantics of
+@@ -489,7 +668,8 @@
+     value.setType(refinedType);
+ 
+     // Special case: for `func.return`, guard the refinement with a cast
+-    // and leave propagation of the refined return type to a dedicated pattern.
++    // and leave propagation of the refined return type to a dedicated
++    // pattern.
+     auto isFuncReturn = [](OpOperand& use) -> bool {
+       return isa<func::ReturnOp>(use.getOwner());
+     };
+@@ -505,8 +685,8 @@
+ 
+ // Refines the return types of the given operation using the given types.
+ // This function also signals PatternRewriter that it needs to visit all the
+-// users of this op if any updates to its results have happened during execution
+-// of the function.
++// users of this op if any updates to its results have happened during
++// execution of the function.
+ LogicalResult refineReturnTypes(PatternRewriter& rewriter, Operation* op,
+                                 ArrayRef<Type> types) {
+   if (failed(refineValues(rewriter, op, op->getResults(), types)))
+@@ -528,12 +708,12 @@
+ //      traversal, and only then we apply the refinements. If there are other
+ //      types, then the corresponding refinements must be completely empty.
+ //   2) Encodings are not supported. In principle, TypeExtensions should be
+-//      supportable, but this needs careful thinking through. Given that no one
+-//      asked for support for bounded dynamism in this pass yet, this is left
+-//      for future work.
++//      supportable, but this needs careful thinking through. Given that no
++//      one asked for support for bounded dynamism in this pass yet, this is
++//      left for future work.
+ // This function also signals PatternRewriter that it needs to visit all the
+-// users of this op if any updates to its results have happened during execution
+-// of the function.
++// users of this op if any updates to its results have happened during
++// execution of the function.
+ LogicalResult refineReturnTypes(PatternRewriter& rewriter, Operation* op,
+                                 ArrayRef<ShapedTypeComponents> refinements) {
+   SmallVector<Type> flattenedTypes;
+@@ -623,8 +803,8 @@
+ 
+ // Refines the return type of the given operation using the given shape.
+ // This function also signals PatternRewriter that it needs to visit all the
+-// users of this op if any updates to its results have happened during execution
+-// of the function.
++// users of this op if any updates to its results have happened during
++// execution of the function.
+ template <typename OpType>
+ LogicalResult refineReturnShape(PatternRewriter& rewriter, OpType op,
+                                 ArrayRef<int64_t> shape) {
+@@ -633,8 +813,8 @@
+ 
+ // Refines the return type of the given operation using the given shape.
+ // This function also signals PatternRewriter that it needs to visit all the
+-// users of this op if any updates to its results have happened during execution
+-// of the function.
++// users of this op if any updates to its results have happened during
++// execution of the function.
+ template <typename OpType>
+ LogicalResult refineReturnShape(PatternRewriter& rewriter, OpType op,
+                                 Value shapeValue) {
+@@ -647,6 +827,52 @@
+   return refineReturnShape(rewriter, op, shape);
+ }
+ 
++// Dimension arguments are leading scalar constant arguments, optionally
++// preceded by some stablehlo.token arguments.
++SmallVector<APSInt> getDimensionArguments(func::CallOp callOp,
++                                          size_t* nrPrefixTokenArguments) {
++  *nrPrefixTokenArguments = 0;
++  SmallVector<Value> operands = callOp.getOperands();
++  SmallVector<APSInt> dimensionArguments;
++  for (size_t i = 0; i < operands.size(); ++i) {
++    if (i == *nrPrefixTokenArguments && isa<TokenType>(operands[i].getType())) {
++      (*nrPrefixTokenArguments)++;
++      continue;
++    }
++    RankedTensorType operandType =
++        dyn_cast<RankedTensorType>(operands[i].getType());
++    if (!operandType || operandType.getRank() != 0 ||
++        !operandType.getElementType().template isa<IntegerType>())
++      break;
++    SmallVector<APSInt> operand_int;
++    if (failed(hlo::matchInts(operands[i], operand_int))) {
++      break;
++    }
++    dimensionArguments.push_back(operand_int[0]);
++  }
++  return dimensionArguments;
++}
++
++std::optional<SmallVector<DenseIntElementsAttr>> isConstantFunction(
++    func::FuncOp func) {
++  LLVM_DEBUG(llvm::dbgs() << "check if " << func.getName()
++                          << " is a constant function\n");
++  SmallVector<DenseIntElementsAttr> returnedConstants;
++  func::ReturnOp ret = *func.getOps<func::ReturnOp>().begin();
++  bool isConstant = llvm::all_of(ret->getOperands(), [&](auto returnVal) {
++    DenseIntElementsAttr attr;
++    Operation* return_operand_def = returnVal.getDefiningOp();
++    if (return_operand_def &&
++        matchPattern(return_operand_def, m_Constant(&attr))) {
++      returnedConstants.push_back(attr);
++      return true;
++    }
++    return false;
++  });
++  if (isConstant) return returnedConstants;
++  return std::nullopt;
++}
++
+ struct RefineAllGatherOpPattern : public OpRewritePattern<AllGatherOp> {
+   using OpRewritePattern::OpRewritePattern;
+   LogicalResult matchAndRewrite(AllGatherOp op,
+@@ -655,9 +881,9 @@
+     if (!operandType.hasRank())
+       return rewriter.notifyMatchFailure(op, "expected ranked operand type");
+ 
+-    // This represents the cross_replica_and_partition process grouping strategy
+-    // that requires num_partitions to compute shardCount. Since we don't know
+-    // num_partitions at this point, we error out.
++    // This represents the cross_replica_and_partition process grouping
++    // strategy that requires num_partitions to compute shardCount. Since we
++    // don't know num_partitions at this point, we error out.
+     if (op.getChannelHandle() && !op.getUseGlobalDeviceIds())
+       return rewriter.notifyMatchFailure(op, "unsupported strategy");
+     DenseIntElementsAttr replicaGroups = op.getReplicaGroups();
+@@ -678,12 +904,11 @@
+     auto operandType = op.getOperand().getType();
+     if (!operandType.hasRank())
+       return rewriter.notifyMatchFailure(op, "expected ranked operand type");
+-
++    auto resultType = op.getType();
+     // If bit widths of the operand and the result are different, then
+     // operand and result shapes have different ranks.
+     // This complicates the logic quite a bit and is not needed to pass the
+     // current tests, so we leave this for future work.
+-    auto resultType = op.getType();
+     auto getBitWidthFn = [](ShapedType type) {
+       auto elementType = type.getElementType();
+       if (auto complexType = elementType.dyn_cast<ComplexType>())
+@@ -694,8 +919,77 @@
+     if (getBitWidthFn(operandType) != getBitWidthFn(resultType))
+       return rewriter.notifyMatchFailure(op, "unsupported bit width");
+ 
+-    return refineReturnShape(rewriter, op, operandType.getShape());
+-  }
++    auto res = refineReturnShape(rewriter, op, operandType.getShape());
++    if (failed(res)) return failure();
++    if (op.getOperand().getType() == op.getResult().getType()) {
++      LLVM_DEBUG({ llvm::dbgs() << "    ** remove no-op bitcast convert\n"; });
++      rewriter.replaceOp(op, op.getOperand());
++    }
++    return success();
++  }
++};
++
++struct RefineCallOpPattern : public OpRewritePattern<func::CallOp> {
++  using OpRewritePattern::OpRewritePattern;
++
++  RefineCallOpPattern(MLIRContext* context, RefineShapeState* state)
++      : OpRewritePattern<func::CallOp>(context), _state(state) {}
++
++  LogicalResult matchAndRewrite(func::CallOp op,
++                                PatternRewriter& rewriter) const override {
++    LLVM_DEBUG({ llvm::dbgs() << "refineCallOp " << debugString(op) << "\n"; });
++
++    // We have a number of prefix token arguments, then the dimension arguments
++    size_t nrPrefixTokenArguments = 0;
++    SmallVector<APSInt> dimensionArguments =
++        getDimensionArguments(op, &nrPrefixTokenArguments);
++    SmallVector<Type> nonDimensionArgumentTypes;
++    SmallVector<Value> nonDimensionArguments;
++    SmallVector<Value> operands = op.getOperands();
++    for (size_t i = 0; i < operands.size(); ++i) {
++      // Skip the dimension arguments.
++      if (i >= nrPrefixTokenArguments &&
++          i < nrPrefixTokenArguments + dimensionArguments.size()) {
++        continue;
++      }
++      nonDimensionArgumentTypes.push_back(operands[i].getType());
++      nonDimensionArguments.push_back(operands[i]);
++    }
++    FlatSymbolRefAttr calleeName = op.getCalleeAttr();
++    const SymbolTable symbolTable(op->getParentOfType<ModuleOp>());
++    func::FuncOp callee = dyn_cast<func::FuncOp>(
++        symbolTable.lookupNearestSymbolFrom(op, calleeName.getAttr()));
++    if (!callee)
++      return rewriter.notifyMatchFailure(
++          op, "cannot find callee in the current scope");
++    if (failed(refineFunction(callee, rewriter.getContext(), _state,
++                              nrPrefixTokenArguments, dimensionArguments,
++                              nonDimensionArgumentTypes)))
++      return failure();
++
++    // Is the callee a constant function in this refinement context?
++    std::optional<SmallVector<DenseIntElementsAttr>> constantAttrs =
++        isConstantFunction(callee);
++    if (constantAttrs.has_value()) {
++      SmallVector<Value> constants;
++      for (auto constAttr : constantAttrs.value()) {
++        constants.push_back(
++            rewriter.create<ConstantOp>(op.getLoc(), constAttr));
++      }
++      rewriter.replaceOp(op, constants);
++      return success();
++    }
++    if (!dimensionArguments.empty()) {
++      // Drop the dimension arguments, but only if necessary, or else we
++      // will end up trying to refine the new CallOp forever.
++      op = rewriter.replaceOpWithNewOp<func::CallOp>(
++          op, op.getResultTypes(), callee.getSymName(), nonDimensionArguments);
++    }
++    return refineReturnTypes(rewriter, op, callee.getResultTypes());
++  }
++
++ private:
++  RefineShapeState* _state;
+ };
+ 
+ struct RefineConvertOpPattern : public OpRewritePattern<ConvertOp> {
+@@ -844,12 +1138,98 @@
    }
  };
  
@@ -1712,8 +2525,9 @@ diff --ruN a/stablehlo/stablehlo/transforms/StablehloRefineShapes.cpp b/stablehl
 +      return rewriter.notifyMatchFailure(op, "expected constant output_shape");
 +
 +    // We only need to refine the shape of `output` (the second result).
-+    // The shape of `output_state` (the first result) is determined by the shape
-+    // of `initial_state`, so we ignore it and provide an empty refinement.
++    // The shape of `output_state` (the first result) is determined by the
++    // shape of `initial_state`, so we ignore it and provide an empty
++    // refinement.
 +    return refineReturnTypes(rewriter, op, {{initialStateType}, {outputShape}});
 +  }
 +};
@@ -1737,15 +2551,349 @@ diff --ruN a/stablehlo/stablehlo/transforms/StablehloRefineShapes.cpp b/stablehl
    }
  };
  
-@@ -1181,7 +1267,10 @@
-     patterns.add<RefineDynamicConvOpPattern>(&getContext());
-     patterns.add<RefineDynamicIotaOpPattern>(&getContext());
-     patterns.add<RefineDynamicPadOpPattern>(&getContext());
-+    patterns.add<RefineDynamicReduceWindowOpPattern>(&getContext());
-     patterns.add<RefineDynamicReshapeOpPattern>(&getContext());
-+    patterns.add<RefineDynamicRngBitGeneratorOpPattern>(&getContext());
-+    patterns.add<RefineDynamicTopKOpPattern>(&getContext());
-     patterns.add<RefineInferTypeOpInterfacePattern>(&getContext());
-     patterns.add<RefineRealDynamicSliceOpPattern>(&getContext());
-     patterns.add<RefineReduceScatterOpPattern>(&getContext());
+@@ -865,11 +1245,11 @@
+     if (!isa<chlo::ChloDialect, StablehloDialect>(op->getDialect()))
+       return rewriter.notifyMatchFailure(op, "unsupported dialect");
+ 
+-    // For the ops that implement InferTypeOpInterface, we reinfer their return
+-    // types and see what happens.
+-    // Operands of these ops might have been refined elsewhere (e.g. someone
+-    // might have updated argument types of a function) or earlier during this
+-    // pass, and this might enable refinement opportunities downstream.
++    // For the ops that implement InferTypeOpInterface, we reinfer their
++    // return types and see what happens. Operands of these ops might have
++    // been refined elsewhere (e.g. someone might have updated argument types
++    // of a function) or earlier during this pass, and this might enable
++    // refinement opportunities downstream.
+     SmallVector<Type> inferredReturnTypes;
+     if (failed(op.inferReturnTypes(getContext(), /*location=*/{},
+                                    op->getOperands(), op->getAttrDictionary(),
+@@ -925,8 +1305,8 @@
+           sliceSizesAttr.size(),
+           RankedTensorType::get({}, startIndicesElementType));
+ 
+-      // RealDynamicSliceOp can take tensors of integer or index element types.
+-      // DynamicSliceOp::slice_sizes only supports i64 element type.
++      // RealDynamicSliceOp can take tensors of integer or index element
++      // types. DynamicSliceOp::slice_sizes only supports i64 element type.
+       // Adapt accordingly in order to be compatible with inferDynamicSliceOp.
+       SmallVector<int64_t> sliceSizes;
+       for (auto element : sliceSizesAttr.getValues<APInt>()) {
+@@ -956,9 +1336,9 @@
+     if (!operandType.hasRank())
+       return rewriter.notifyMatchFailure(op, "expected ranked operand type");
+ 
+-    // This represents the cross_replica_and_partition process grouping strategy
+-    // that requires num_partitions to compute shardCount. Since we don't know
+-    // num_partitions at this point, we error out.
++    // This represents the cross_replica_and_partition process grouping
++    // strategy that requires num_partitions to compute shardCount. Since we
++    // don't know num_partitions at this point, we error out.
+     if (op.getChannelHandle() && !op.getUseGlobalDeviceIds())
+       return rewriter.notifyMatchFailure(op, "unsupported strategy");
+     DenseIntElementsAttr replicaGroups = op.getReplicaGroups();
+@@ -998,9 +1378,9 @@
+                                 PatternRewriter& rewriter) const override {
+     // Push the potentially refined operand types into the nested regions.
+     // This can lead to refinements of the return types of the body (but not
+-    // of the cond since it always returns tensor<i1>), but the key insight here
+-    // is that the enclosing while op doesn't care about these refinements
+-    // (because its return types are equal to its operand types).
++    // of the cond since it always returns tensor<i1>), but the key insight
++    // here is that the enclosing while op doesn't care about these
++    // refinements (because its return types are equal to its operand types).
+     // If we end up with incompatibilities between while's return types and
+     // body's return types, the verifier will tell us about that. This means
+     // that the original program wasn't well-formed. TODO(burmako): Implement
+@@ -1050,8 +1430,8 @@
+       if (failed(mostSpecificType) || destType == *mostSpecificType) continue;
+ 
+       // If the source type of the cast is more specific than the target type,
+-      // then we conclude that the cast is redundant (i.e. needs to be removed)
+-      // and that the return type of the function needs an update.
++      // then we conclude that the cast is redundant (i.e. needs to be
++      // removed) and that the return type of the function needs an update.
+       needsUpdate = true;
+       updatedResultTypes[i] = sourceType;
+ 
+@@ -1066,9 +1446,6 @@
+     for (auto cast : castsToReplace)
+       rewriter.replaceOp(cast, cast->getOperands());
+ 
+-    // If the type of the enclosing `func.func` needs an update, we simply
+-    // call setType. We can afford this simplicity because our algorithm
+-    // currently supports only one function per module.
+     auto func = cast<func::FuncOp>(op->getParentOp());
+     func.setType(
+         rewriter.getFunctionType(func.getArgumentTypes(), updatedResultTypes));
+@@ -1100,22 +1477,186 @@
+   }
+ };
+ 
++LogicalResult applyRewritePatterns(func::FuncOp func, MLIRContext* context,
++                                   RefineShapeState* state) {
++  // TODO(#1048): Find out why .maxIterations = 1 no longer works.
++  // There have been recent refactors to applyPatternsAndFoldGreedily
++  // upstream, and that might be the reason.
++  GreedyRewriteConfig config;
++  config.useTopDownTraversal = true;
++  config.enableRegionSimplification = true;
++  config.maxIterations = 2;
++  config.maxNumRewrites = GreedyRewriteConfig::kNoLimit;
++  config.strictMode = GreedyRewriteStrictness::AnyOp;
++
++  RewritePatternSet patterns(context);
++  patterns.add<EvalAddOpPattern>(context);
++  patterns.add<EvalAndOpPattern>(context);
++  patterns.add<EvalBroadcastInDimOpPattern>(context);
++  patterns.add<EvalClampOpPattern>(context);
++  patterns.add<EvalCompareOpPattern>(context);
++  patterns.add<EvalConcatenateOpPattern>(context);
++  patterns.add<EvalConvertOpPattern>(context);
++  patterns.add<EvalDivOpPattern>(context);
++  patterns.add<EvalGetDimensionSizeOpPattern>(context);
++  patterns.add<EvalMaxOpPattern>(context);
++  patterns.add<EvalMinOpPattern>(context);
++  patterns.add<EvalMulOpPattern>(context);
++  patterns.add<EvalRemOpPattern>(context);
++  patterns.add<EvalReshapeOpPattern>(context);
++  patterns.add<EvalSelectOpPattern>(context);
++  patterns.add<EvalSignOpPattern>(context);
++  patterns.add<EvalSliceOpPattern>(context);
++  patterns.add<EvalSubtractOpPattern>(context);
++  patterns.add<RefineAllGatherOpPattern>(context);
++  patterns.add<RefineBitcastConvertOpPattern>(context);
++  patterns.add<RefineCallOpPattern>(context, state);
++  patterns.add<RefineConvertOpPattern>(context);
++  patterns.add<RefineConvolutionOpPattern>(context);
++  patterns.add<RefineCustomCallOpPattern>(context);
++  patterns.add<RefineDotGeneralOpPattern>(context);
++  patterns.add<RefineDynamicBroadcastInDimOpPattern>(context);
++  patterns.add<RefineDynamicConvOpPattern>(context);
++  patterns.add<RefineDynamicIotaOpPattern>(context);
++  patterns.add<RefineDynamicPadOpPattern>(context);
++  patterns.add<RefineDynamicReduceWindowOpPattern>(context);
++  patterns.add<RefineDynamicReshapeOpPattern>(context);
++  patterns.add<RefineDynamicRngBitGeneratorOpPattern>(context);
++  patterns.add<RefineDynamicTopKOpPattern>(context);
++  patterns.add<RefineInferTypeOpInterfacePattern>(context);
++  patterns.add<RefineRealDynamicSliceOpPattern>(context);
++  patterns.add<RefineReduceScatterOpPattern>(context);
++  patterns.add<RefineRngOpPattern>(context);
++  patterns.add<RefineUniformQuantizeOpPattern>(context);
++  patterns.add<RefineWhileOpPattern>(context);
++  patterns.add<UpdateFunctionTypePattern>(context);
++  patterns.add<UpdateRegionTypePattern>(context);
++  if (failed(applyPatternsAndFoldGreedily(func, std::move(patterns), config))) {
++    func.emitOpError() << "applyPatternsAndFoldGreedily failed";
++    return failure();
++  }
++  return success();
++}
++
++LogicalResult refineFunction(func::FuncOp func, MLIRContext* context,
++                             RefineShapeState* state,
++                             size_t nrPrefixTokenArguments,
++                             SmallVector<APSInt> dimensionArguments,
++                             SmallVector<Type> nonDimensionArgumentTypes) {
++  // The nonDimensionArgumentTypes include the prefix token arguments.
++  LLVM_DEBUG({
++    llvm::dbgs() << "refineFunction " << func.getName() << ": initial type "
++                 << debugString(func.getFunctionType()) << "\n";
++    llvm::dbgs() << "   has " << nrPrefixTokenArguments << " prefix tokens\n";
++    for (size_t i = 0; i < dimensionArguments.size(); ++i) {
++      llvm::dbgs() << "   with dimension arg[" << i
++                   << "] = " << dimensionArguments[i] << "\n";
++    }
++  });
++  // Check that the argument types have static shapes.
++  for (size_t i = 0; i < nonDimensionArgumentTypes.size(); ++i) {
++    if (i < nrPrefixTokenArguments) continue;
++    auto argType = nonDimensionArgumentTypes[i];
++    if (isa<TokenType>(argType)) continue;
++    auto argRankedTensorType = dyn_cast<RankedTensorType>(argType);
++    if (!argRankedTensorType || !argRankedTensorType.hasStaticShape()) {
++      func.emitOpError() << func.getName()
++                         << " must be refined with static shape arguments. "
++                         << "Found argument of type " << debugString(argType);
++      return failure();
++    }
++  }
++  auto alreadyRefined = state->validateFunctionRefinement(
++      func, dimensionArguments, nonDimensionArgumentTypes);
++  if (failed(alreadyRefined)) {
++    return failure();
++  }
++  if (*alreadyRefined) {
++    LLVM_DEBUG({
++      llvm::dbgs() << "refineFunction " << func.getName()
++                   << ": skipping, already refined\n";
++    });
++    return success();
++  }
++  state->startFunctionRefinement(func, dimensionArguments,
++                                 nonDimensionArgumentTypes);
++  // Only one block per function is supported at the moment.
++  // At the StableHLO level, functions are expected to only have one block,
++  // so supporting more is out of scope for this pass.
++  if (!func.getRegion().hasOneBlock()) {
++    func.emitOpError() << "must have exactly one block";
++    return failure();
++  }
++
++  // Replace all dimension arguments with constants and remove those arguments.
++  // Wrap non-dimension arguments with bitcast_convert.
++  OpBuilder op_builder(func.getRegion());
++  op_builder.setInsertionPointToStart(&func.getRegion().front());
++  size_t firstNonDimensionArg =
++      nrPrefixTokenArguments + dimensionArguments.size();
++  for (size_t i = 0; i < func.getNumArguments(); ++i) {
++    BlockArgument arg = func.getArgument(i);
++    Type argType = arg.getType();
++    if (i < nrPrefixTokenArguments) {
++      continue;
++    }
++    if (i < firstNonDimensionArg) {
++      ShapedType argShapedType = dyn_cast<ShapedType>(argType);
++      if (!argShapedType) {
++        func.emitOpError() << "dimension arguments must have shaped types";
++        return failure();
++      }
++      // We will drop the dimension arguments, replace them with constants.
++      auto replacement_op = op_builder.create<stablehlo::ConstantOp>(
++          arg.getLoc(), argType,
++          getTensorAttr(argShapedType,
++                        dimensionArguments[i - nrPrefixTokenArguments]));
++      arg.replaceAllUsesWith(replacement_op);
++    } else {
++      int nonDimensionArgumentIndex =
++          nrPrefixTokenArguments + i - firstNonDimensionArg;
++      Type refinedType = nonDimensionArgumentTypes[nonDimensionArgumentIndex];
++      if (refinedType != argType) {
++        // We add BitcastConvertOp as the only uses of the non-dimension
++        // arguments to ensure the module stays valid after we set the argument
++        // type.
++        auto replacement_op = op_builder.create<stablehlo::BitcastConvertOp>(
++            arg.getLoc(), argType, arg);
++        arg.replaceAllUsesExcept(replacement_op->getResult(0), replacement_op);
++        arg.setType(refinedType);
++      }
++    }
++  }
++  BitVector argIndices(func.getNumArguments());
++  argIndices.set(nrPrefixTokenArguments, firstNonDimensionArg);
++  func.eraseArguments(argIndices);
++  func.setType(op_builder.getFunctionType(nonDimensionArgumentTypes,
++                                          func.getResultTypes()));
++  LLVM_DEBUG({
++    llvm::dbgs() << "refineFunction " << func.getName() << ": set type to "
++                 << func.getFunctionType() << "\n";
++  });
++  if (failed(applyRewritePatterns(func, context, state))) return failure();
++  LLVM_DEBUG({
++    llvm::dbgs() << "refineFunction " << func.getName() << ": end with type "
++                 << debugString(func.getFunctionType()) << "\n";
++  });
++  if (failed(state->finishFunctionRefinement(func))) return failure();
++  return success();
++}
++
+ struct StablehloRefineShapesPass
+     : public impl::StablehloRefineShapesPassBase<StablehloRefineShapesPass> {
+   using StablehloRefineShapesPassBase::StablehloRefineShapesPassBase;
+ 
+   void runOnOperation() override {
+-    // Only one function per module is supported at the moment to avoid the need
+-    // to think about iterative type inference algorithms.
+-    // Current use cases are served well by inlining multiple functions into
+-    // a single function, so we leave native support for multiple functions to
+-    // future work.
+     // To enable modules that contain CustomCallOp::called_computations,
+     // we allow multiple functions, in which case we only refine the main
+     // function called "main", assuming that the called computations will have
+     // static shapes. Lifting this assumption and expanding refinement to
+     // multiple functions is left for future work.
+     ModuleOp module = getOperation();
++    RefineShapeState state;
+     auto funcs = llvm::to_vector(module.getOps<func::FuncOp>());
+     if (funcs.empty()) return;
+     func::FuncOp func;
+@@ -1130,70 +1671,14 @@
+           << " function to clearly identify which function will be refined";
+       return signalPassFailure();
+     }
+-
+-    // Similarly, only one block per function is supported at the moment.
+-    // At the StableHLO level, functions are expected to only have one block,
+-    // so supporting more is out of scope for this pass.
+-    if (!func.getRegion().hasOneBlock()) {
+-      func.emitOpError() << "must have exactly one block";
++    SmallVector<APSInt> emptyDimensionArguments;
++    SmallVector<Type> nonDimensionArgumentTypes;
++    for (auto arg : func.getArguments())
++      nonDimensionArgumentTypes.push_back(arg.getType());
++    if (failed(refineFunction(func, &getContext(), &state, 0,
++                              emptyDimensionArguments,
++                              nonDimensionArgumentTypes)))
+       return signalPassFailure();
+-    }
+-
+-    // The algorithm behind this pass consists of a single traversal of the
+-    // function. This is sufficient because we only support one function per
+-    // program at the moment.
+-    // TODO(#1048): Find out why .maxIterations = 1 no longer works.
+-    // There have been recent refactors to applyPatternsAndFoldGreedily
+-    // upstream, and that might be the reason.
+-    GreedyRewriteConfig config;
+-    config.useTopDownTraversal = true;
+-    config.enableRegionSimplification = true;
+-    config.maxIterations = 2;
+-    config.maxNumRewrites = GreedyRewriteConfig::kNoLimit;
+-    config.strictMode = GreedyRewriteStrictness::AnyOp;
+-
+-    RewritePatternSet patterns(&getContext());
+-    patterns.add<EvalAddOpPattern>(&getContext());
+-    patterns.add<EvalAndOpPattern>(&getContext());
+-    patterns.add<EvalBroadcastInDimOpPattern>(&getContext());
+-    patterns.add<EvalClampOpPattern>(&getContext());
+-    patterns.add<EvalCompareOpPattern>(&getContext());
+-    patterns.add<EvalConcatenateOpPattern>(&getContext());
+-    patterns.add<EvalConvertOpPattern>(&getContext());
+-    patterns.add<EvalDivOpPattern>(&getContext());
+-    patterns.add<EvalGetDimensionSizeOpPattern>(&getContext());
+-    patterns.add<EvalMaxOpPattern>(&getContext());
+-    patterns.add<EvalMinOpPattern>(&getContext());
+-    patterns.add<EvalMulOpPattern>(&getContext());
+-    patterns.add<EvalRemOpPattern>(&getContext());
+-    patterns.add<EvalReshapeOpPattern>(&getContext());
+-    patterns.add<EvalSelectOpPattern>(&getContext());
+-    patterns.add<EvalSignOpPattern>(&getContext());
+-    patterns.add<EvalSliceOpPattern>(&getContext());
+-    patterns.add<EvalSubtractOpPattern>(&getContext());
+-    patterns.add<RefineAllGatherOpPattern>(&getContext());
+-    patterns.add<RefineBitcastConvertOpPattern>(&getContext());
+-    patterns.add<RefineConvertOpPattern>(&getContext());
+-    patterns.add<RefineConvolutionOpPattern>(&getContext());
+-    patterns.add<RefineCustomCallOpPattern>(&getContext());
+-    patterns.add<RefineDotGeneralOpPattern>(&getContext());
+-    patterns.add<RefineDynamicBroadcastInDimOpPattern>(&getContext());
+-    patterns.add<RefineDynamicConvOpPattern>(&getContext());
+-    patterns.add<RefineDynamicIotaOpPattern>(&getContext());
+-    patterns.add<RefineDynamicPadOpPattern>(&getContext());
+-    patterns.add<RefineDynamicReshapeOpPattern>(&getContext());
+-    patterns.add<RefineInferTypeOpInterfacePattern>(&getContext());
+-    patterns.add<RefineRealDynamicSliceOpPattern>(&getContext());
+-    patterns.add<RefineReduceScatterOpPattern>(&getContext());
+-    patterns.add<RefineRngOpPattern>(&getContext());
+-    patterns.add<RefineUniformQuantizeOpPattern>(&getContext());
+-    patterns.add<RefineWhileOpPattern>(&getContext());
+-    patterns.add<UpdateFunctionTypePattern>(&getContext());
+-    patterns.add<UpdateRegionTypePattern>(&getContext());
+-    if (failed(
+-            applyPatternsAndFoldGreedily(func, std::move(patterns), config))) {
+-      return signalPassFailure();
+-    }
+   }
+ };
+ 
 
diff --git a/third_party/xla/xla/python/refine_polymorphic_shapes.cc b/third_party/xla/xla/python/refine_polymorphic_shapes.cc
index 063b15aba58062..44ce5b20e39d9e 100644
--- a/third_party/xla/xla/python/refine_polymorphic_shapes.cc
+++ b/third_party/xla/xla/python/refine_polymorphic_shapes.cc
@@ -251,10 +251,6 @@ absl::Status RefinePolymorphicShapes(mlir::ModuleOp module,
     pm.enableIRPrinting(print_before, print_after, /*printModuleScope=*/true,
                         /*printAfterOnlyOnChange=*/true);
   }
-
-  // TODO(necula): we should not need the inliner.
-  pm.addPass(mlir::createInlinerPass());
-  pm.addPass(mlir::createCSEPass());
   pm.addPass(mlir::stablehlo::createStablehloRefineShapesPass());
   pm.addNestedPass<mlir::func::FuncOp>(
       mlir::stablehlo::createStablehloCanonicalizeDynamismPass());

From de0cbb9bb70515f75bde5ff87b7d470096e7aa49 Mon Sep 17 00:00:00 2001
From: Adrian Kuegel <akuegel@google.com>
Date: Thu, 21 Sep 2023 05:11:23 -0700
Subject: [PATCH 080/567] [XLA:GPU] Clean up Target util.

We have some differences between Triton codegen and other fusion codegen,
namely for Remainder/Fmod and Cbrt. Unify that.

- Remove two unused math functions.
- Add mapping from kRemainder to kFmod.
- Use kCbrt device function in elemental_ir_emitter.

PiperOrigin-RevId: 567274915
---
 third_party/xla/xla/service/gpu/elemental_ir_emitter.cc | 6 ++++++
 third_party/xla/xla/service/gpu/elemental_ir_emitter.h  | 3 +++
 third_party/xla/xla/service/gpu/target_util.cc          | 8 ++------
 third_party/xla/xla/service/gpu/target_util.h           | 4 +---
 4 files changed, 12 insertions(+), 9 deletions(-)

diff --git a/third_party/xla/xla/service/gpu/elemental_ir_emitter.cc b/third_party/xla/xla/service/gpu/elemental_ir_emitter.cc
index 352d1746459ce8..6955d3050e869a 100644
--- a/third_party/xla/xla/service/gpu/elemental_ir_emitter.cc
+++ b/third_party/xla/xla/service/gpu/elemental_ir_emitter.cc
@@ -326,6 +326,12 @@ StatusOr<llvm::Value*> GpuElementalIrEmitter::EmitComplexAbs(
                             {prim_type, prim_type}, prim_type);
 }
 
+StatusOr<llvm::Value*> GpuElementalIrEmitter::EmitCbrt(PrimitiveType prim_type,
+                                                       llvm::Value* value) {
+  return EmitDeviceMathCall(TargetDeviceFunctionID::kCbrt, {value}, {prim_type},
+                            prim_type);
+}
+
 llvm::Value* GpuElementalIrEmitter::EmitThreadId() {
   llvm::Value* block_id = IntCast(
       EmitCallToTargetIntrinsic(TargetIntrinsicID::kBlockIdx, {}, {}, b()),
diff --git a/third_party/xla/xla/service/gpu/elemental_ir_emitter.h b/third_party/xla/xla/service/gpu/elemental_ir_emitter.h
index 2e3e3da57575dd..f97861ba2e7afc 100644
--- a/third_party/xla/xla/service/gpu/elemental_ir_emitter.h
+++ b/third_party/xla/xla/service/gpu/elemental_ir_emitter.h
@@ -87,6 +87,9 @@ class GpuElementalIrEmitter : public ElementalIrEmitter {
   StatusOr<llvm::Value*> EmitComplexAbs(PrimitiveType prim_type,
                                         llvm::Value* value) override;
 
+  StatusOr<llvm::Value*> EmitCbrt(PrimitiveType prim_type,
+                                  llvm::Value* value) override;
+
   StatusOr<std::vector<llvm::Value*>> EmitThreadLocalCall(
       const HloComputation& callee, absl::Span<llvm::Value* const> parameters,
       absl::string_view, bool /*is_reducer*/) override;
diff --git a/third_party/xla/xla/service/gpu/target_util.cc b/third_party/xla/xla/service/gpu/target_util.cc
index 97ca6388804521..12b12b3f19faec 100644
--- a/third_party/xla/xla/service/gpu/target_util.cc
+++ b/third_party/xla/xla/service/gpu/target_util.cc
@@ -127,9 +127,6 @@ struct TargetDeviceFunction GetDeviceFunctionRoot(
     case TargetDeviceFunctionID::kCos: {
       return {"__nv_cos", "__ocml_cos"};
     }
-    case TargetDeviceFunctionID::kErfcinv: {
-      return {"__nv_erfcinv", "__ocml_erfcinv"};
-    }
     case TargetDeviceFunctionID::kExp: {
       return {"__nv_exp", "__ocml_exp"};
     }
@@ -151,9 +148,6 @@ struct TargetDeviceFunction GetDeviceFunctionRoot(
     case TargetDeviceFunctionID::kPow: {
       return {"__nv_pow", "__ocml_pow"};
     }
-    case TargetDeviceFunctionID::kRound: {
-      return {"__nv_round", "__ocml_round"};
-    }
     case TargetDeviceFunctionID::kRsqrt: {
       return {"__nv_rsqrt", "__ocml_rsqrt"};
     }
@@ -192,6 +186,8 @@ StatusOr<TargetDeviceFunctionID> GetTargetDeviceFunctionID(HloOpcode op) {
       return TargetDeviceFunctionID::kLog1p;
     case HloOpcode::kPower:
       return TargetDeviceFunctionID::kPow;
+    case HloOpcode::kRemainder:
+      return TargetDeviceFunctionID::kFmod;
     case HloOpcode::kRsqrt:
       return TargetDeviceFunctionID::kRsqrt;
     case HloOpcode::kSin:
diff --git a/third_party/xla/xla/service/gpu/target_util.h b/third_party/xla/xla/service/gpu/target_util.h
index 981e316ae1b68a..19b00307b3fea1 100644
--- a/third_party/xla/xla/service/gpu/target_util.h
+++ b/third_party/xla/xla/service/gpu/target_util.h
@@ -48,8 +48,8 @@ enum class TargetIntrinsicID {
 // Enumeration to get target specific device math function.
 enum class TargetDeviceFunctionID {
   kAtan2 = 0,
+  kCbrt,
   kCos,
-  kErfcinv,
   kExp,
   kExpm1,
   kFmod,
@@ -57,13 +57,11 @@ enum class TargetDeviceFunctionID {
   kLog,
   kLog1p,
   kPow,
-  kRound,
   kRsqrt,
   kSin,
   kSqrt,
   kTan,
   kTanh,
-  kCbrt,
 };
 
 // HLO opcode -> TargetDeviceFunctionID mapping.

From e20ace297d609e5f182f51a5daf8fe1d116bfbe5 Mon Sep 17 00:00:00 2001
From: Andrew Goodbody <andrew.goodbody@linaro.org>
Date: Thu, 21 Sep 2023 13:21:29 +0100
Subject: [PATCH 081/567] Fix permission denied on cp of headers

Remove the duplication of copies that can result in permission
denied
---
 tensorflow/tools/pip_package/build_pip_package.sh | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/tensorflow/tools/pip_package/build_pip_package.sh b/tensorflow/tools/pip_package/build_pip_package.sh
index deff63b8d15557..a77aa82a21c5aa 100755
--- a/tensorflow/tools/pip_package/build_pip_package.sh
+++ b/tensorflow/tools/pip_package/build_pip_package.sh
@@ -254,13 +254,8 @@ function prepare_src() {
     fi
   fi
 
-  # Move headers from TSL/XLA into tensorflow so that InstallHeaders can move
-  # them back into tensorflow/include
-  cp -rL bazel-bin/tensorflow/tools/pip_package/build_pip_package.runfiles/local_tsl/tsl/ ${TMPDIR}/tensorflow
-  cp -rL bazel-bin/tensorflow/tools/pip_package/build_pip_package.runfiles/local_xla/xla/ ${TMPDIR}/tensorflow/compiler
   # Move vendored files into proper locations
   # This is required because TSL/XLA don't publish their own wheels
-  # TODO(jakeharmon): These two copy statements may no longer be necessary
   cp -rL bazel-bin/external/local_tsl/tsl/ ${TMPDIR}/tensorflow
   cp -rL bazel-bin/external/local_xla/xla/ ${TMPDIR}/tensorflow/compiler
   # Fix the proto stubs

From 4511d6f62cc70359482145d0feea2ae583b18b2f Mon Sep 17 00:00:00 2001
From: Ilia Sergachev <sergachev@google.com>
Date: Thu, 21 Sep 2023 06:51:56 -0700
Subject: [PATCH 082/567] [XLA:GPU] Avoid unnecessary autotuning of tiny Triton
 GEMM fusions.

PiperOrigin-RevId: 567293724
---
 third_party/xla/xla/service/gpu/BUILD         |  8 +++
 .../xla/xla/service/gpu/triton_autotuner.cc   | 17 +++--
 .../xla/xla/service/gpu/triton_autotuner.h    |  6 +-
 .../xla/service/gpu/triton_autotuner_test.cc  | 71 ++++++++++++++++---
 4 files changed, 85 insertions(+), 17 deletions(-)

diff --git a/third_party/xla/xla/service/gpu/BUILD b/third_party/xla/xla/service/gpu/BUILD
index 374ba47563aa10..8646507b0eef2f 100644
--- a/third_party/xla/xla/service/gpu/BUILD
+++ b/third_party/xla/xla/service/gpu/BUILD
@@ -673,6 +673,7 @@ cc_library(
         "//xla:util",
         "//xla:xla_proto_cc",
         "//xla/hlo/ir:hlo",
+        "//xla/hlo/utils:hlo_query",
         "//xla/service:dump",
         "//xla/service:executable",
         "//xla/service:float_normalization",
@@ -710,7 +711,9 @@ xla_test(
         ":gemm_rewriter_triton",
         ":triton_autotuner",
         "//xla:autotuning_proto_cc",
+        "//xla:error_spec",
         "//xla:shape_util",
+        "//xla:xla_data_proto_cc",
         "//xla:xla_proto_cc",
         "//xla/hlo/ir:hlo",
         "//xla/service:hlo_pass_pipeline",
@@ -721,10 +724,15 @@ xla_test(
         "//xla/tests:test_utils",
         "//xla/tests:verified_hlo_module",
         "//xla/tests:xla_internal_test_main",  # fixdeps: keep
+        "@com_google_absl//absl/log",
+        "@com_google_absl//absl/log:check",
         "@com_google_absl//absl/strings",
         "@com_google_googletest//:gtest",
         "@local_tsl//tsl/lib/core:status_test_util",
+        "@local_tsl//tsl/platform:env",
+        "@local_tsl//tsl/platform:errors",
         "@local_tsl//tsl/platform:platform_port",
+        "@local_tsl//tsl/platform:statusor",
     ],
 )
 
diff --git a/third_party/xla/xla/service/gpu/triton_autotuner.cc b/third_party/xla/xla/service/gpu/triton_autotuner.cc
index 7e56199473a547..96ed7527326acb 100644
--- a/third_party/xla/xla/service/gpu/triton_autotuner.cc
+++ b/third_party/xla/xla/service/gpu/triton_autotuner.cc
@@ -45,6 +45,7 @@ limitations under the License.
 #include "xla/hlo/ir/hlo_instructions.h"
 #include "xla/hlo/ir/hlo_module.h"
 #include "xla/hlo/ir/hlo_opcode.h"
+#include "xla/hlo/utils/hlo_query.h"
 #include "xla/service/dump.h"
 #include "xla/service/executable.h"
 #include "xla/service/float_normalization.h"
@@ -212,14 +213,12 @@ class GemmConfigSetCollector : public ConstDfsHloVisitorWithDefault {
 
  private:
   GemmConfigSet GetGemmConfigSet(const HloFusionInstruction* fusion) {
-    const HloComputation& fusion_computation =
-        *fusion->called_computations().at(0);
-    const HloInstruction& fusion_root = *fusion_computation.root_instruction();
     const DebugOptions& debug_options =
         fusion->GetModule()->config().debug_options();
     se::StreamExecutor* stream_exec = config_.GetExecutor();
     return {GetPossibleMatmulAutotuneConfigs(
-        fusion_root,
+        *Cast<HloDotInstruction>(hlo_query::GetFirstInstructionWithOpcode(
+            *fusion->called_computations().at(0), HloOpcode::kDot)),
         stream_exec->GetDeviceDescription().cuda_compute_capability(),
         debug_options, config_.ExhaustiveTilingSearch())};
   }
@@ -739,9 +738,15 @@ Status Autotune(const AutotuneConfig& config, AutotunerCompileUtil& util,
 }  // anonymous namespace
 
 std::vector<AutotuneResult::TritonGemmKey> GetPossibleMatmulAutotuneConfigs(
-    const HloInstruction& instr,
+    const HloDotInstruction& dot,
     const se::CudaComputeCapability compute_capability,
     const DebugOptions& debug_options, bool exhaustive_tiling_search) {
+  // Avoid autotuning tiny fusions.
+  constexpr int kMinGemmElements = 32 * 32;
+  if (ShapeUtil::ElementsIn(dot.operand(0)->shape()) <= kMinGemmElements &&
+      ShapeUtil::ElementsIn(dot.operand(1)->shape()) <= kMinGemmElements) {
+    return {GemmKey(32, 32, 32, 1, 1, 4)};
+  }
   // Split-K optimization enables more even utilization of a GPU in cases
   // where tiling just the non-contracting dimensions of a GEMM does not create
   // a sufficient number of thread block programs to occupy all available cores.
@@ -756,7 +761,7 @@ std::vector<AutotuneResult::TritonGemmKey> GetPossibleMatmulAutotuneConfigs(
       debug_options.xla_gpu_enable_split_k_autotuning()
           ? std::max<int64_t>(1L, kSufficientNumberOfTiles * kMaxTileSize *
                                       kMaxTileSize /
-                                      ShapeUtil::ElementsIn(instr.shape()))
+                                      ShapeUtil::ElementsIn(dot.shape()))
           : 1;
   return exhaustive_tiling_search
              ? GetExhaustiveMatmulAutotuneConfigs(compute_capability,
diff --git a/third_party/xla/xla/service/gpu/triton_autotuner.h b/third_party/xla/xla/service/gpu/triton_autotuner.h
index 912c8149aea5fd..bff2f170e6e7b2 100644
--- a/third_party/xla/xla/service/gpu/triton_autotuner.h
+++ b/third_party/xla/xla/service/gpu/triton_autotuner.h
@@ -21,9 +21,13 @@ limitations under the License.
 #include "absl/strings/string_view.h"
 #include "xla/autotuning.pb.h"
 #include "xla/hlo/ir/hlo_computation.h"
+#include "xla/hlo/ir/hlo_instructions.h"
 #include "xla/hlo/ir/hlo_module.h"
 #include "xla/service/gpu/autotuner_util.h"
 #include "xla/service/hlo_pass_interface.h"
+#include "xla/statusor.h"
+#include "xla/stream_executor/device_description.h"
+#include "xla/xla.pb.h"
 #include "tsl/platform/threadpool.h"
 
 namespace xla {
@@ -51,7 +55,7 @@ class TritonAutotuner : public HloModulePass {
 // TODO(b/266210099): have a way to generate/load these dynamically.
 // Returns a list of possible tilings for a GEMM performed in Triton.
 std::vector<AutotuneResult::TritonGemmKey> GetPossibleMatmulAutotuneConfigs(
-    const HloInstruction& instr, se::CudaComputeCapability compute_capability,
+    const HloDotInstruction& dot, se::CudaComputeCapability compute_capability,
     const DebugOptions& debug_options, bool exhaustive_tiling_search = false);
 
 }  // namespace gpu
diff --git a/third_party/xla/xla/service/gpu/triton_autotuner_test.cc b/third_party/xla/xla/service/gpu/triton_autotuner_test.cc
index 7a22da6e421b26..217f030519719d 100644
--- a/third_party/xla/xla/service/gpu/triton_autotuner_test.cc
+++ b/third_party/xla/xla/service/gpu/triton_autotuner_test.cc
@@ -21,10 +21,16 @@ limitations under the License.
 #include <utility>
 #include <vector>
 
+#include <gmock/gmock.h>
 #include <gtest/gtest.h>
+#include "absl/log/check.h"
+#include "absl/log/log.h"
 #include "absl/strings/string_view.h"
 #include "xla/autotuning.pb.h"
+#include "xla/error_spec.h"
+#include "xla/hlo/ir/hlo_casting_utils.h"
 #include "xla/hlo/ir/hlo_instruction.h"
+#include "xla/hlo/ir/hlo_instructions.h"
 #include "xla/hlo/ir/hlo_module.h"
 #include "xla/hlo/ir/hlo_opcode.h"
 #include "xla/service/gpu/autotuner_util.h"
@@ -39,8 +45,13 @@ limitations under the License.
 #include "xla/tests/test_utils.h"
 #include "xla/tests/verified_hlo_module.h"
 #include "xla/xla.pb.h"
+#include "xla/xla_data.pb.h"
 #include "tsl/lib/core/status_test_util.h"
 #include "tsl/platform/cpu_info.h"
+#include "tsl/platform/env.h"
+#include "tsl/platform/errors.h"
+#include "tsl/platform/statusor.h"
+#include "tsl/platform/threadpool.h"
 
 namespace xla {
 namespace gpu {
@@ -222,12 +233,20 @@ class TritonAutotunerTestWithMorePreciseReduction : public TritonAutotunerTest {
 };
 
 TEST_F(TritonAutotunerTest, VoltaUsesNoMoreThanTwoStages) {
+  std::unique_ptr<VerifiedHloModule> module = ParseAndReturnVerifiedModule(R"(
+ENTRY e {
+  p0 = f32[1024,1024] parameter(0)
+  p1 = f32[1024,1024] parameter(1)
+  ROOT r = f32[1024,1024] dot(p0, p1),
+    lhs_contracting_dims={1}, rhs_contracting_dims={0}
+})")
+                                                  .value();
   const se::CudaComputeCapability compute_capability{
       se::CudaComputeCapability::VOLTA, /*minor=*/0};
   const std::vector<AutotuneResult::TritonGemmKey> configs =
       GetPossibleMatmulAutotuneConfigs(
-          *HloInstruction::CreateParameter(
-              0, ShapeUtil::MakeShape(F32, {1024, 1024}), ""),
+          *Cast<HloDotInstruction>(
+              module->entry_computation()->root_instruction()),
           compute_capability, GetDebugOptionsForTest());
   EXPECT_FALSE(std::any_of(configs.begin(), configs.end(),
                            [](const AutotuneResult::TritonGemmKey& key) {
@@ -236,12 +255,20 @@ TEST_F(TritonAutotunerTest, VoltaUsesNoMoreThanTwoStages) {
 }
 
 TEST_F(TritonAutotunerTest, AmpereUsesMoreThanTwoStages) {
+  std::unique_ptr<VerifiedHloModule> module = ParseAndReturnVerifiedModule(R"(
+ENTRY e {
+  p0 = f32[1024,1024] parameter(0)
+  p1 = f32[1024,1024] parameter(1)
+  ROOT r = f32[1024,1024] dot(p0, p1),
+    lhs_contracting_dims={1}, rhs_contracting_dims={0}
+})")
+                                                  .value();
   const se::CudaComputeCapability compute_capability{
       se::CudaComputeCapability::AMPERE, /*minor=*/0};
   const std::vector<AutotuneResult::TritonGemmKey> configs =
       GetPossibleMatmulAutotuneConfigs(
-          *HloInstruction::CreateParameter(
-              0, ShapeUtil::MakeShape(F32, {1024, 1024}), ""),
+          *Cast<HloDotInstruction>(
+              module->entry_computation()->root_instruction()),
           compute_capability, GetDebugOptionsForTest());
   EXPECT_TRUE(std::any_of(configs.begin(), configs.end(),
                           [](const AutotuneResult::TritonGemmKey& key) {
@@ -250,12 +277,20 @@ TEST_F(TritonAutotunerTest, AmpereUsesMoreThanTwoStages) {
 }
 
 TEST_F(TritonAutotunerTest, SmallOutputCanUseLargeSplitK) {
+  std::unique_ptr<VerifiedHloModule> module = ParseAndReturnVerifiedModule(R"(
+ENTRY e {
+  p0 = f32[1024,1024] parameter(0)
+  p1 = f32[1024,1024] parameter(1)
+  ROOT r = f32[1024,1024] dot(p0, p1),
+    lhs_contracting_dims={1}, rhs_contracting_dims={0}
+})")
+                                                  .value();
   const se::CudaComputeCapability compute_capability{
       se::CudaComputeCapability::AMPERE, /*minor=*/0};
   const std::vector<AutotuneResult::TritonGemmKey> configs =
       GetPossibleMatmulAutotuneConfigs(
-          *HloInstruction::CreateParameter(
-              0, ShapeUtil::MakeShape(F32, {1024, 1024}), ""),
+          *Cast<HloDotInstruction>(
+              module->entry_computation()->root_instruction()),
           compute_capability, GetDebugOptionsForTest());
   EXPECT_TRUE(std::any_of(configs.begin(), configs.end(),
                           [](const AutotuneResult::TritonGemmKey& key) {
@@ -264,12 +299,20 @@ TEST_F(TritonAutotunerTest, SmallOutputCanUseLargeSplitK) {
 }
 
 TEST_F(TritonAutotunerTest, LargeOutputDoesNotUseLargeSplitK) {
+  std::unique_ptr<VerifiedHloModule> module = ParseAndReturnVerifiedModule(R"(
+ENTRY e {
+  p0 = f32[20480,20480] parameter(0)
+  p1 = f32[20480,20480] parameter(1)
+  ROOT r = f32[20480,20480] dot(p0, p1),
+    lhs_contracting_dims={1}, rhs_contracting_dims={0}
+})")
+                                                  .value();
   const se::CudaComputeCapability compute_capability{
       se::CudaComputeCapability::AMPERE, /*minor=*/0};
   const std::vector<AutotuneResult::TritonGemmKey> configs =
       GetPossibleMatmulAutotuneConfigs(
-          *HloInstruction::CreateParameter(
-              0, ShapeUtil::MakeShape(F32, {20480, 20480}), ""),
+          *Cast<HloDotInstruction>(
+              module->entry_computation()->root_instruction()),
           compute_capability, GetDebugOptionsForTest());
   EXPECT_FALSE(std::any_of(configs.begin(), configs.end(),
                            [](const AutotuneResult::TritonGemmKey& key) {
@@ -507,12 +550,20 @@ class TritonAutotunerDisableSplitK : public TritonAutotunerTest {
 };
 
 TEST_F(TritonAutotunerDisableSplitK, SplitKIsDisabled) {
+  std::unique_ptr<VerifiedHloModule> module = ParseAndReturnVerifiedModule(R"(
+ENTRY e {
+  p0 = f32[1024,1024] parameter(0)
+  p1 = f32[1024,1024] parameter(1)
+  ROOT r = f32[1024,1024] dot(p0, p1),
+    lhs_contracting_dims={1}, rhs_contracting_dims={0}
+})")
+                                                  .value();
   const se::CudaComputeCapability compute_capability{
       se::CudaComputeCapability::AMPERE, /*minor=*/0};
   const std::vector<AutotuneResult::TritonGemmKey> configs =
       GetPossibleMatmulAutotuneConfigs(
-          *HloInstruction::CreateParameter(
-              0, ShapeUtil::MakeShape(F32, {1024, 1024}), ""),
+          *Cast<HloDotInstruction>(
+              module->entry_computation()->root_instruction()),
           compute_capability, GetDebugOptionsForTest());
   EXPECT_TRUE(std::all_of(configs.begin(), configs.end(),
                           [](const AutotuneResult::TritonGemmKey& key) {

From d95348224dc97baddfad977ef834cc2331f17787 Mon Sep 17 00:00:00 2001
From: Yash Katariya <yashkatariya@google.com>
Date: Thu, 21 Sep 2023 07:32:12 -0700
Subject: [PATCH 083/567] Add python 3.12 to JAX docker containers

PiperOrigin-RevId: 567302141
---
 ...11.8-cudnn8.6-ubuntu20.04-manylinux2014-multipython |  2 ++
 ....0.1-cudnn8.8-ubuntu20.04-manylinux2014-multipython |  2 ++
 .../tools/ci_build/install/build_and_install_python.sh |  4 +++-
 .../install/install_pip_packages_by_version.sh         | 10 +++++-----
 tensorflow/tools/toolchains/remote_config/configs.bzl  |  8 ++++----
 .../tools/toolchains/remote_config/containers.bzl      |  4 ++--
 .../tsl/tools/toolchains/remote_config/configs.bzl     |  8 ++++----
 .../tsl/tools/toolchains/remote_config/containers.bzl  |  4 ++--
 .../xla/tools/toolchains/remote_config/configs.bzl     |  8 ++++----
 .../xla/tools/toolchains/remote_config/containers.bzl  |  4 ++--
 10 files changed, 30 insertions(+), 24 deletions(-)

diff --git a/tensorflow/tools/ci_build/Dockerfile.rbe.cuda11.8-cudnn8.6-ubuntu20.04-manylinux2014-multipython b/tensorflow/tools/ci_build/Dockerfile.rbe.cuda11.8-cudnn8.6-ubuntu20.04-manylinux2014-multipython
index 6f1888f23741ee..83223fde6e478c 100644
--- a/tensorflow/tools/ci_build/Dockerfile.rbe.cuda11.8-cudnn8.6-ubuntu20.04-manylinux2014-multipython
+++ b/tensorflow/tools/ci_build/Dockerfile.rbe.cuda11.8-cudnn8.6-ubuntu20.04-manylinux2014-multipython
@@ -38,9 +38,11 @@ COPY install/build_and_install_python.sh /install/
 RUN /install/build_and_install_python.sh "3.9.4"
 RUN /install/build_and_install_python.sh "3.10.0"
 RUN /install/build_and_install_python.sh "3.11.0"
+RUN /install/build_and_install_python.sh "3.12.0rc3"
 
 COPY install/install_pip_packages_by_version.sh /install/
 # https://github.com/numpy/numpy/issues/22623 for `SETUPTOOLS_USE_DISTUTILS`.
 RUN SETUPTOOLS_USE_DISTUTILS=stdlib /install/install_pip_packages_by_version.sh "/usr/local/bin/pip3.9" "jax"
 RUN SETUPTOOLS_USE_DISTUTILS=stdlib /install/install_pip_packages_by_version.sh "/usr/local/bin/pip3.10" "jax"
 RUN SETUPTOOLS_USE_DISTUTILS=stdlib /install/install_pip_packages_by_version.sh "/usr/local/bin/pip3.11" "jax"
+RUN SETUPTOOLS_USE_DISTUTILS=stdlib /install/install_pip_packages_by_version.sh "/usr/local/bin/pip3.12" "jax"
diff --git a/tensorflow/tools/ci_build/Dockerfile.rbe.cuda12.0.1-cudnn8.8-ubuntu20.04-manylinux2014-multipython b/tensorflow/tools/ci_build/Dockerfile.rbe.cuda12.0.1-cudnn8.8-ubuntu20.04-manylinux2014-multipython
index 2dc4b0611c9c05..ee1a272ffedf42 100644
--- a/tensorflow/tools/ci_build/Dockerfile.rbe.cuda12.0.1-cudnn8.8-ubuntu20.04-manylinux2014-multipython
+++ b/tensorflow/tools/ci_build/Dockerfile.rbe.cuda12.0.1-cudnn8.8-ubuntu20.04-manylinux2014-multipython
@@ -75,9 +75,11 @@ COPY install/build_and_install_python.sh /install/
 RUN /install/build_and_install_python.sh "3.9.4"
 RUN /install/build_and_install_python.sh "3.10.0"
 RUN /install/build_and_install_python.sh "3.11.0"
+RUN /install/build_and_install_python.sh "3.12.0rc3"
 
 COPY install/install_pip_packages_by_version.sh /install/
 # https://github.com/numpy/numpy/issues/22623 for `SETUPTOOLS_USE_DISTUTILS`.
 RUN SETUPTOOLS_USE_DISTUTILS=stdlib /install/install_pip_packages_by_version.sh "/usr/local/bin/pip3.9" "jax"
 RUN SETUPTOOLS_USE_DISTUTILS=stdlib /install/install_pip_packages_by_version.sh "/usr/local/bin/pip3.10" "jax"
 RUN SETUPTOOLS_USE_DISTUTILS=stdlib /install/install_pip_packages_by_version.sh "/usr/local/bin/pip3.11" "jax"
+RUN SETUPTOOLS_USE_DISTUTILS=stdlib /install/install_pip_packages_by_version.sh "/usr/local/bin/pip3.12" "jax"
diff --git a/tensorflow/tools/ci_build/install/build_and_install_python.sh b/tensorflow/tools/ci_build/install/build_and_install_python.sh
index fb8b6298542733..6055ea82536ada 100755
--- a/tensorflow/tools/ci_build/install/build_and_install_python.sh
+++ b/tensorflow/tools/ci_build/install/build_and_install_python.sh
@@ -15,11 +15,13 @@
 # ==============================================================================
 
 VERSION="$1"
+NO_RC_VERSION="${VERSION%rc*}"
+
 shift
 
 mkdir /build
 cd /build
-wget "https://www.python.org/ftp/python/${VERSION}/Python-${VERSION}.tgz"
+wget "https://www.python.org/ftp/python/${NO_RC_VERSION}/Python-${VERSION}.tgz"
 tar xvzf "Python-${VERSION}.tgz"
 cd "Python-${VERSION}"
 ./configure --enable-optimizations "$@"
diff --git a/tensorflow/tools/ci_build/install/install_pip_packages_by_version.sh b/tensorflow/tools/ci_build/install/install_pip_packages_by_version.sh
index dfc2a7272c0234..e743ad8f63fadc 100755
--- a/tensorflow/tools/ci_build/install/install_pip_packages_by_version.sh
+++ b/tensorflow/tools/ci_build/install/install_pip_packages_by_version.sh
@@ -27,8 +27,7 @@ rm "get-pip.py"
 PYTHON_VERSION=$(echo ${PIP##*.})  # only the last number, eg. 10
 
 JAX_PACKAGES=(
-  # https://github.com/numpy/numpy/issues/22623
-  "setuptools<=65.5.1"
+  "setuptools"
   "wheel"
   "cloudpickle"
   "colorama>=0.4.4"
@@ -40,9 +39,8 @@ JAX_PACKAGES=(
   "six"
   "opt-einsum"
   "auditwheel"
-  "msgpack"
   "typing_extensions"
-  "ml_dtypes>=0.2.0"
+  "ml_dtypes>=0.3.0"
   "importlib_metadata>=4.6"
 )
 
@@ -94,7 +92,9 @@ fi
 if [[ "$2" == "jax" ]]; then
   # Special casing by version of Python
   # E.g., numpy supports py3.11 only from 1.23.4
-  if [[ ${PYTHON_VERSION} -eq 11 ]]; then
+  if [[ ${PYTHON_VERSION} -eq 12 ]]; then
+    "${PIP_INSTALL[@]}" "numpy==1.26.0" "scipy==1.11.2"
+  elif [[ ${PYTHON_VERSION} -eq 11 ]]; then
     "${PIP_INSTALL[@]}" "numpy==1.23.4" "scipy==1.9.2"
   else
     "${PIP_INSTALL[@]}" "numpy==1.22.4" "scipy==1.7.3"
diff --git a/tensorflow/tools/toolchains/remote_config/configs.bzl b/tensorflow/tools/toolchains/remote_config/configs.bzl
index 0ef444f89f17d5..a1fd875f4bd5e5 100644
--- a/tensorflow/tools/toolchains/remote_config/configs.bzl
+++ b/tensorflow/tools/toolchains/remote_config/configs.bzl
@@ -210,7 +210,7 @@ def initialize_rbe_configs():
         cuda_version = "11.8",
         cudnn_version = "8.6",
         os = "ubuntu20.04-manylinux2014-multipython",
-        python_versions = ["3.7", "3.8", "3.9", "3.10", "3.11"],
+        python_versions = ["3.9", "3.10", "3.11", "3.12"],
         tensorrt_install_path = "/usr",
         tensorrt_version = "8.4",
         sysroot = "/dt9",
@@ -224,7 +224,7 @@ def initialize_rbe_configs():
         cuda_version = "11.8",
         cudnn_version = "8.6",
         os = "ubuntu20.04-manylinux2014-multipython",
-        python_versions = ["3.7", "3.8", "3.9", "3.10", "3.11"],
+        python_versions = ["3.9", "3.10", "3.11", "3.12"],
         tensorrt_install_path = "/usr",
         tensorrt_version = "8.4",
         python_install_path = "/usr/local",
@@ -236,7 +236,7 @@ def initialize_rbe_configs():
         cuda_version = "12.0.1",
         cudnn_version = "8.8",
         os = "ubuntu20.04-manylinux2014-multipython",
-        python_versions = ["3.8", "3.9", "3.10", "3.11"],
+        python_versions = ["3.9", "3.10", "3.11", "3.12"],
         sysroot = "/dt9",
         python_install_path = "/usr/local",
     )
@@ -248,7 +248,7 @@ def initialize_rbe_configs():
         cuda_version = "12.0.1",
         cudnn_version = "8.8",
         os = "ubuntu20.04-manylinux2014-multipython",
-        python_versions = ["3.8", "3.9", "3.10", "3.11"],
+        python_versions = ["3.9", "3.10", "3.11", "3.12"],
         python_install_path = "/usr/local",
     )
 
diff --git a/tensorflow/tools/toolchains/remote_config/containers.bzl b/tensorflow/tools/toolchains/remote_config/containers.bzl
index b219c01f77d177..830b05b0c444b6 100644
--- a/tensorflow/tools/toolchains/remote_config/containers.bzl
+++ b/tensorflow/tools/toolchains/remote_config/containers.bzl
@@ -7,8 +7,8 @@ container_digests = {
     # JAX manylinux2014 configs.
     "cuda11.1-cudnn8-ubuntu20.04-manylinux2014-multipython": "sha256:011034978c5f1e5dcecc816b3b964faafc42b243001d9cd09ff7cfe4a6a0f4b9",
     "cuda11.4-cudnn8.2-ubuntu20.04-manylinux2014-multipython": "sha256:d17894a1349a12baea1732cb133f65f08754ed97d0a6647efe23c916a9ab8f1c",
-    "cuda11.8-cudnn8.6-ubuntu20.04-manylinux2014-multipython": "sha256:74a055cf4d996cf0ad280a9d929f0740eeb10de7696e2c42991ec719544ac656",
-    "cuda12.0.1-cudnn8.8-ubuntu20.04-manylinux2014-multipython": "sha256:5ed0933b22fce5073091deaeae98183461737b87a2e44c579e67ea4ee04b61d5",
+    "cuda11.8-cudnn8.6-ubuntu20.04-manylinux2014-multipython": "sha256:c973a5dd1b335b83f5cc65ab2d1f12e12c0cc5d310a2d9bf676fcdb52cf08285",
+    "cuda12.0.1-cudnn8.8-ubuntu20.04-manylinux2014-multipython": "sha256:2551b1587bdd0b63a4dd329eba6416cd07acb25496dde411c376609ce4f076f0",
     # ROCM, probably not all of them still in use
     "rocm-ubuntu18.04-manylinux2010-multipython": "sha256:6e953a09b145df338bcb03e9e36f99b291140c29b72d0a048fb6c5905ccad5eb",
     "rocm-ubuntu20.04-manylinux2014-multipython": "sha256:906faec7765fe5dd067f2b092b5d5f220c1fedde725fb42c83d031b4d6f32204",
diff --git a/third_party/xla/third_party/tsl/tools/toolchains/remote_config/configs.bzl b/third_party/xla/third_party/tsl/tools/toolchains/remote_config/configs.bzl
index d1e467c45c91b6..2453dc746feefb 100644
--- a/third_party/xla/third_party/tsl/tools/toolchains/remote_config/configs.bzl
+++ b/third_party/xla/third_party/tsl/tools/toolchains/remote_config/configs.bzl
@@ -210,7 +210,7 @@ def initialize_rbe_configs():
         cuda_version = "11.8",
         cudnn_version = "8.6",
         os = "ubuntu20.04-manylinux2014-multipython",
-        python_versions = ["3.7", "3.8", "3.9", "3.10", "3.11"],
+        python_versions = ["3.9", "3.10", "3.11", "3.12"],
         tensorrt_install_path = "/usr",
         tensorrt_version = "8.4",
         sysroot = "/dt9",
@@ -224,7 +224,7 @@ def initialize_rbe_configs():
         cuda_version = "11.8",
         cudnn_version = "8.6",
         os = "ubuntu20.04-manylinux2014-multipython",
-        python_versions = ["3.7", "3.8", "3.9", "3.10", "3.11"],
+        python_versions = ["3.9", "3.10", "3.11", "3.12"],
         tensorrt_install_path = "/usr",
         tensorrt_version = "8.4",
         python_install_path = "/usr/local",
@@ -236,7 +236,7 @@ def initialize_rbe_configs():
         cuda_version = "12.0.1",
         cudnn_version = "8.8",
         os = "ubuntu20.04-manylinux2014-multipython",
-        python_versions = ["3.8", "3.9", "3.10", "3.11"],
+        python_versions = ["3.9", "3.10", "3.11", "3.12"],
         sysroot = "/dt9",
         python_install_path = "/usr/local",
     )
@@ -248,7 +248,7 @@ def initialize_rbe_configs():
         cuda_version = "12.0.1",
         cudnn_version = "8.8",
         os = "ubuntu20.04-manylinux2014-multipython",
-        python_versions = ["3.8", "3.9", "3.10", "3.11"],
+        python_versions = ["3.9", "3.10", "3.11", "3.12"],
         python_install_path = "/usr/local",
     )
 
diff --git a/third_party/xla/third_party/tsl/tools/toolchains/remote_config/containers.bzl b/third_party/xla/third_party/tsl/tools/toolchains/remote_config/containers.bzl
index b219c01f77d177..830b05b0c444b6 100644
--- a/third_party/xla/third_party/tsl/tools/toolchains/remote_config/containers.bzl
+++ b/third_party/xla/third_party/tsl/tools/toolchains/remote_config/containers.bzl
@@ -7,8 +7,8 @@ container_digests = {
     # JAX manylinux2014 configs.
     "cuda11.1-cudnn8-ubuntu20.04-manylinux2014-multipython": "sha256:011034978c5f1e5dcecc816b3b964faafc42b243001d9cd09ff7cfe4a6a0f4b9",
     "cuda11.4-cudnn8.2-ubuntu20.04-manylinux2014-multipython": "sha256:d17894a1349a12baea1732cb133f65f08754ed97d0a6647efe23c916a9ab8f1c",
-    "cuda11.8-cudnn8.6-ubuntu20.04-manylinux2014-multipython": "sha256:74a055cf4d996cf0ad280a9d929f0740eeb10de7696e2c42991ec719544ac656",
-    "cuda12.0.1-cudnn8.8-ubuntu20.04-manylinux2014-multipython": "sha256:5ed0933b22fce5073091deaeae98183461737b87a2e44c579e67ea4ee04b61d5",
+    "cuda11.8-cudnn8.6-ubuntu20.04-manylinux2014-multipython": "sha256:c973a5dd1b335b83f5cc65ab2d1f12e12c0cc5d310a2d9bf676fcdb52cf08285",
+    "cuda12.0.1-cudnn8.8-ubuntu20.04-manylinux2014-multipython": "sha256:2551b1587bdd0b63a4dd329eba6416cd07acb25496dde411c376609ce4f076f0",
     # ROCM, probably not all of them still in use
     "rocm-ubuntu18.04-manylinux2010-multipython": "sha256:6e953a09b145df338bcb03e9e36f99b291140c29b72d0a048fb6c5905ccad5eb",
     "rocm-ubuntu20.04-manylinux2014-multipython": "sha256:906faec7765fe5dd067f2b092b5d5f220c1fedde725fb42c83d031b4d6f32204",
diff --git a/third_party/xla/tools/toolchains/remote_config/configs.bzl b/third_party/xla/tools/toolchains/remote_config/configs.bzl
index d1e467c45c91b6..2453dc746feefb 100644
--- a/third_party/xla/tools/toolchains/remote_config/configs.bzl
+++ b/third_party/xla/tools/toolchains/remote_config/configs.bzl
@@ -210,7 +210,7 @@ def initialize_rbe_configs():
         cuda_version = "11.8",
         cudnn_version = "8.6",
         os = "ubuntu20.04-manylinux2014-multipython",
-        python_versions = ["3.7", "3.8", "3.9", "3.10", "3.11"],
+        python_versions = ["3.9", "3.10", "3.11", "3.12"],
         tensorrt_install_path = "/usr",
         tensorrt_version = "8.4",
         sysroot = "/dt9",
@@ -224,7 +224,7 @@ def initialize_rbe_configs():
         cuda_version = "11.8",
         cudnn_version = "8.6",
         os = "ubuntu20.04-manylinux2014-multipython",
-        python_versions = ["3.7", "3.8", "3.9", "3.10", "3.11"],
+        python_versions = ["3.9", "3.10", "3.11", "3.12"],
         tensorrt_install_path = "/usr",
         tensorrt_version = "8.4",
         python_install_path = "/usr/local",
@@ -236,7 +236,7 @@ def initialize_rbe_configs():
         cuda_version = "12.0.1",
         cudnn_version = "8.8",
         os = "ubuntu20.04-manylinux2014-multipython",
-        python_versions = ["3.8", "3.9", "3.10", "3.11"],
+        python_versions = ["3.9", "3.10", "3.11", "3.12"],
         sysroot = "/dt9",
         python_install_path = "/usr/local",
     )
@@ -248,7 +248,7 @@ def initialize_rbe_configs():
         cuda_version = "12.0.1",
         cudnn_version = "8.8",
         os = "ubuntu20.04-manylinux2014-multipython",
-        python_versions = ["3.8", "3.9", "3.10", "3.11"],
+        python_versions = ["3.9", "3.10", "3.11", "3.12"],
         python_install_path = "/usr/local",
     )
 
diff --git a/third_party/xla/tools/toolchains/remote_config/containers.bzl b/third_party/xla/tools/toolchains/remote_config/containers.bzl
index b219c01f77d177..830b05b0c444b6 100644
--- a/third_party/xla/tools/toolchains/remote_config/containers.bzl
+++ b/third_party/xla/tools/toolchains/remote_config/containers.bzl
@@ -7,8 +7,8 @@ container_digests = {
     # JAX manylinux2014 configs.
     "cuda11.1-cudnn8-ubuntu20.04-manylinux2014-multipython": "sha256:011034978c5f1e5dcecc816b3b964faafc42b243001d9cd09ff7cfe4a6a0f4b9",
     "cuda11.4-cudnn8.2-ubuntu20.04-manylinux2014-multipython": "sha256:d17894a1349a12baea1732cb133f65f08754ed97d0a6647efe23c916a9ab8f1c",
-    "cuda11.8-cudnn8.6-ubuntu20.04-manylinux2014-multipython": "sha256:74a055cf4d996cf0ad280a9d929f0740eeb10de7696e2c42991ec719544ac656",
-    "cuda12.0.1-cudnn8.8-ubuntu20.04-manylinux2014-multipython": "sha256:5ed0933b22fce5073091deaeae98183461737b87a2e44c579e67ea4ee04b61d5",
+    "cuda11.8-cudnn8.6-ubuntu20.04-manylinux2014-multipython": "sha256:c973a5dd1b335b83f5cc65ab2d1f12e12c0cc5d310a2d9bf676fcdb52cf08285",
+    "cuda12.0.1-cudnn8.8-ubuntu20.04-manylinux2014-multipython": "sha256:2551b1587bdd0b63a4dd329eba6416cd07acb25496dde411c376609ce4f076f0",
     # ROCM, probably not all of them still in use
     "rocm-ubuntu18.04-manylinux2010-multipython": "sha256:6e953a09b145df338bcb03e9e36f99b291140c29b72d0a048fb6c5905ccad5eb",
     "rocm-ubuntu20.04-manylinux2014-multipython": "sha256:906faec7765fe5dd067f2b092b5d5f220c1fedde725fb42c83d031b4d6f32204",

From c2655123109b494679367639a4ba890cc9017c78 Mon Sep 17 00:00:00 2001
From: Alan Kelly <alankelly@google.com>
Date: Thu, 21 Sep 2023 07:43:51 -0700
Subject: [PATCH 084/567] Merge same size types for PACK to reduce binary size

PiperOrigin-RevId: 567304689
---
 tensorflow/lite/kernels/pack.cc | 27 +++++++--------------------
 1 file changed, 7 insertions(+), 20 deletions(-)

diff --git a/tensorflow/lite/kernels/pack.cc b/tensorflow/lite/kernels/pack.cc
index e4bc806a577805..5bb1a618d446c5 100644
--- a/tensorflow/lite/kernels/pack.cc
+++ b/tensorflow/lite/kernels/pack.cc
@@ -116,34 +116,21 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
   TF_LITE_ENSURE_OK(context,
                     GetOutputSafe(context, node, kOutputTensor, &output));
   switch (output->type) {
-    case kTfLiteFloat32: {
-      return PackImpl<float>(context, node, output, data->values_count,
-                             data->axis);
-    }
-    case kTfLiteUInt8: {
-      return PackImpl<uint8_t>(context, node, output, data->values_count,
-                               data->axis);
-    }
-    case kTfLiteUInt32: {
-      return PackImpl<uint32_t>(context, node, output, data->values_count,
-                                data->axis);
-    }
-    case kTfLiteInt8: {
+    case kTfLiteInt8:
+    case kTfLiteUInt8:
       return PackImpl<int8_t>(context, node, output, data->values_count,
                               data->axis);
-    }
-    case kTfLiteInt16: {
+    case kTfLiteInt16:
       return PackImpl<int16_t>(context, node, output, data->values_count,
                                data->axis);
-    }
-    case kTfLiteInt32: {
+    case kTfLiteFloat32:
+    case kTfLiteInt32:
+    case kTfLiteUInt32:
       return PackImpl<int32_t>(context, node, output, data->values_count,
                                data->axis);
-    }
-    case kTfLiteInt64: {
+    case kTfLiteInt64:
       return PackImpl<int64_t>(context, node, output, data->values_count,
                                data->axis);
-    }
     default: {
       TF_LITE_KERNEL_LOG(context, "Type '%s' is not supported by pack.",
                          TfLiteTypeGetName(output->type));

From 4f4958cc106a8f0db94e0d2750984a641c0197a7 Mon Sep 17 00:00:00 2001
From: Neel Kovelamudi <nkovela@google.com>
Date: Thu, 21 Sep 2023 08:36:09 -0700
Subject: [PATCH 085/567] Pass along the shape param from
 tf.compat.v1.get_variable to its underlying variable creator. At this point,
 we have already checked that either shape is compatible w/ initial_value, or
 that it is not specified (None). Therefore it should always be safe to pass
 it. Having it can help subsequent nested variable creators save some cycles
 tracing the initializer function when all they need to know is the shape.

PiperOrigin-RevId: 567317207
---
 .../feature_column/feature_column_test.py     | 28 ++++++-------------
 .../feature_column/feature_column_v2_test.py  | 12 +++-----
 tensorflow/python/ops/variable_scope.py       | 17 ++++-------
 3 files changed, 17 insertions(+), 40 deletions(-)

diff --git a/tensorflow/python/feature_column/feature_column_test.py b/tensorflow/python/feature_column/feature_column_test.py
index f8fbd8db7e3b15..3cde6c5657edc1 100644
--- a/tensorflow/python/feature_column/feature_column_test.py
+++ b/tensorflow/python/feature_column/feature_column_test.py
@@ -4924,19 +4924,16 @@ def test_get_dense_tensor(self, use_safe_embedding_lookup,
       )
 
       def _initializer(shape, dtype, partition_info=None):
-        self.assertEqual(dtypes.float32, dtype)
         if partition_variables:
-          assert partition_info is not None
           self.assertEqual([vocabulary_size, embedding_dimension],
                            partition_info.full_shape)
           self.assertAllEqual((2, embedding_dimension), shape)
-          return array_ops.slice(
-              embedding_values, partition_info.var_offset, shape
-          )
         else:
           self.assertAllEqual((vocabulary_size, embedding_dimension), shape)
           self.assertIsNone(partition_info)
-          return embedding_values
+
+        self.assertEqual(dtypes.float32, dtype)
+        return embedding_values
 
       # Expected lookup result, using combiner='mean'.
       expected_lookups = (
@@ -4979,12 +4976,7 @@ def _initializer(shape, dtype, partition_info=None):
       for v in global_vars:
         self.assertIsInstance(v, variables_lib.Variable)
       with _initialized_session():
-        if partition_variables:
-          self.assertAllEqual(
-              embedding_values, array_ops.concat(global_vars, axis=0)
-          )
-        else:
-          self.assertAllEqual(embedding_values, global_vars[0])
+        self.assertAllEqual(embedding_values, global_vars[0])
         self.assertAllEqual(expected_lookups, self.evaluate(embedding_lookup))
 
       if use_safe_embedding_lookup:
@@ -5791,19 +5783,16 @@ def test_get_dense_tensor(self, use_safe_embedding_lookup,
       )
 
       def _initializer(shape, dtype, partition_info=None):
-        self.assertEqual(dtypes.float32, dtype)
         if partition_variables:
-          assert partition_info is not None
           self.assertEqual([vocabulary_size, embedding_dimension],
                            partition_info.full_shape)
           self.assertAllEqual((2, embedding_dimension), shape)
-          return array_ops.slice(
-              embedding_values, partition_info.var_offset, shape
-          )
         else:
           self.assertAllEqual((vocabulary_size, embedding_dimension), shape)
           self.assertIsNone(partition_info)
-          return embedding_values
+
+        self.assertEqual(dtypes.float32, dtype)
+        return embedding_values
 
       # Expected lookup result, using combiner='mean'.
       expected_lookups_a = (
@@ -5853,11 +5842,10 @@ def _initializer(shape, dtype, partition_info=None):
         self.assertCountEqual(('vars/embedding_weights/part_0:0',
                                'vars/embedding_weights/part_1:0'),
                               tuple([v.name for v in global_vars]))
-        embedding_var = array_ops.concat(global_vars, axis=0)
       else:
         self.assertCountEqual(('vars/embedding_weights:0',),
                               tuple([v.name for v in global_vars]))
-        embedding_var = global_vars[0]
+      embedding_var = global_vars[0]
 
       self.evaluate(variables_lib.global_variables_initializer())
       self.evaluate(lookup_ops.tables_initializer())
diff --git a/tensorflow/python/feature_column/feature_column_v2_test.py b/tensorflow/python/feature_column/feature_column_v2_test.py
index cb98bf60c03184..21dcbb4452d6c7 100644
--- a/tensorflow/python/feature_column/feature_column_v2_test.py
+++ b/tensorflow/python/feature_column/feature_column_v2_test.py
@@ -5762,19 +5762,16 @@ def test_get_dense_tensor(self, use_safe_embedding_lookup,
       )
 
       def _initializer(shape, dtype, partition_info=None):
-        self.assertEqual(dtypes.float32, dtype)
         if partition_variables:
-          assert partition_info is not None
           self.assertEqual([vocabulary_size, embedding_dimension],
                            partition_info.full_shape)
           self.assertAllEqual((2, embedding_dimension), shape)
-          return array_ops.slice(
-              embedding_values, partition_info.var_offset, shape
-          )
         else:
           self.assertAllEqual((vocabulary_size, embedding_dimension), shape)
           self.assertIsNone(partition_info)
-          return embedding_values
+
+        self.assertEqual(dtypes.float32, dtype)
+        return embedding_values
 
       # Expected lookup result, using combiner='mean'.
       expected_lookups_a = (
@@ -5824,11 +5821,10 @@ def _initializer(shape, dtype, partition_info=None):
         self.assertCountEqual(('vars/aaa_bbb_shared_embedding/part_0:0',
                                'vars/aaa_bbb_shared_embedding/part_1:0'),
                               tuple([v.name for v in global_vars]))
-        embedding_var = array_ops.concat(global_vars, axis=0)
       else:
         self.assertCountEqual(('vars/aaa_bbb_shared_embedding:0',),
                               tuple([v.name for v in global_vars]))
-        embedding_var = global_vars[0]
+      embedding_var = global_vars[0]
 
       self.evaluate(variables_lib.global_variables_initializer())
       self.evaluate(lookup_ops.tables_initializer())
diff --git a/tensorflow/python/ops/variable_scope.py b/tensorflow/python/ops/variable_scope.py
index 622f70732adaba..33dd0438fa2f2f 100644
--- a/tensorflow/python/ops/variable_scope.py
+++ b/tensorflow/python/ops/variable_scope.py
@@ -807,8 +807,7 @@ def _get_partitioned_variable(self,
             use_resource=use_resource,
             constraint=constraint,
             synchronization=synchronization,
-            aggregation=aggregation,
-        )
+            aggregation=aggregation)
 
       # pylint: disable=protected-access
       var._set_save_slice_info(
@@ -881,8 +880,7 @@ def _get_single_variable(self,
       raise ValueError("If initializer is a constant, do not specify shape.")
 
     dtype = dtypes.as_dtype(dtype)
-    if shape is not None:
-      shape = tensor_shape.as_shape(shape)
+    shape = tensor_shape.as_shape(shape)
 
     if name in self._vars:
       # Here we handle the case when returning an existing variable.
@@ -903,9 +901,7 @@ def _get_single_variable(self,
         raise ValueError("%s Originally defined at:\n\n%s" %
                          (err_msg, "".join(traceback.format_list(tb))))
       found_var = self._vars[name]
-      if shape is not None and not shape.is_compatible_with(
-          found_var.get_shape()
-      ):
+      if not shape.is_compatible_with(found_var.get_shape()):
         raise ValueError("Trying to share variable %s, but specified shape %s"
                          " and found shape %s." %
                          (name, shape, found_var.get_shape()))
@@ -925,7 +921,6 @@ def _get_single_variable(self,
 
     # Create the tensor to initialize the variable with default value.
     if initializer is None:
-      assert shape is not None
       initializer, initializing_from_value = self._get_default_initializer(
           name=name, shape=shape, dtype=dtype)
     # Enter an init scope when creating the initializer.
@@ -937,7 +932,7 @@ def _get_single_variable(self,
         # Instantiate initializer if provided initializer is a type object.
         if tf_inspect.isclass(initializer):
           initializer = initializer()
-        if shape is not None and shape.is_fully_defined():
+        if shape.is_fully_defined():
           if "partition_info" in tf_inspect.getargspec(initializer).args:
             init_val = functools.partial(initializer,
                                          shape.as_list(),
@@ -972,9 +967,7 @@ def _get_single_variable(self,
         constraint=constraint,
         use_resource=use_resource,
         synchronization=synchronization,
-        aggregation=aggregation,
-        shape=shape,
-    )
+        aggregation=aggregation)
     if context.executing_eagerly() and self._store_eager_variables:
       if collections:
         ops.add_to_collections(collections, v)

From 060f5118478e4c0cc3fb5482470ff9c0101c752d Mon Sep 17 00:00:00 2001
From: George Necula <necula@google.com>
Date: Thu, 21 Sep 2023 09:13:10 -0700
Subject: [PATCH 086/567] Improve shape refinement to not require inlining.

PiperOrigin-RevId: 567326536
---
 .../compiler/tests/xla_call_module_test.py    |   82 --
 third_party/stablehlo/temporary.patch         | 1180 +----------------
 .../xla/third_party/stablehlo/temporary.patch | 1180 +----------------
 .../xla/python/refine_polymorphic_shapes.cc   |    4 +
 4 files changed, 36 insertions(+), 2410 deletions(-)

diff --git a/tensorflow/compiler/tests/xla_call_module_test.py b/tensorflow/compiler/tests/xla_call_module_test.py
index ee651b107c5b8e..a24930a7b8c846 100644
--- a/tensorflow/compiler/tests/xla_call_module_test.py
+++ b/tensorflow/compiler/tests/xla_call_module_test.py
@@ -259,88 +259,6 @@ def f(x):  # x: f32[2, b]
 
     self._assertOpOutputMatchesExpected(f, (x,), (np.sin(x),))
 
-  def test_poly_with_inner_token(self):
-    # The inner functions pass tokens through
-    x = np.arange(12, dtype=np.float32).reshape((3, 4))
-
-    def f(x):  # x : f32[b0, b1]
-      # 1 + sin(x)
-      module, version = serialize("""
-module @jit_f.0 attributes {jax.uses_shape_polymorphism = true} {
-  func.func public @main(%arg0: tensor<?x?xf32>) -> tensor<?x?xf32> {
-    %0 = stablehlo.get_dimension_size %arg0, dim = 0 : (tensor<?x?xf32>) -> tensor<i32>
-    %1 = stablehlo.get_dimension_size %arg0, dim = 1 : (tensor<?x?xf32>) -> tensor<i32>
-    %2 = stablehlo.constant dense<> : tensor<0xi1>
-    %3:2 = call @_wrapped_main(%0, %1, %2, %arg0) : (tensor<i32>, tensor<i32>, tensor<0xi1>, tensor<?x?xf32>) -> (tensor<0xi1>, tensor<?x?xf32>)
-    return %3#1 : tensor<?x?xf32>
-  }
-
-  func.func private @_wrapped_main(%arg0: tensor<i32>, %arg1: tensor<i32>, %arg2: tensor<0xi1> {jax.token = true}, %arg3: tensor<?x?xf32>) -> (tensor<0xi1> {jax.token = true}, tensor<?x?xf32>) {
-    %0 = stablehlo.create_token : !stablehlo.token
-    %1 = stablehlo.sine %arg3 : tensor<?x?xf32>
-    %2 = stablehlo.constant dense<1.000000e+00> : tensor<f32>
-    %3 = stablehlo.reshape %arg0 : (tensor<i32>) -> tensor<1xi32>
-    %4 = stablehlo.reshape %arg1 : (tensor<i32>) -> tensor<1xi32>
-    %5 = stablehlo.concatenate %3, %4, dim = 0 : (tensor<1xi32>, tensor<1xi32>) -> tensor<2xi32>
-    %6 = stablehlo.dynamic_broadcast_in_dim %2, %5, dims = [] : (tensor<f32>, tensor<2xi32>) -> tensor<?x?xf32>
-    %7 = stablehlo.add %1, %6 : tensor<?x?xf32>
-    %8 = stablehlo.constant dense<> : tensor<0xi1>
-    return %8, %7 : tensor<0xi1>, tensor<?x?xf32>
-  }
-}
-""")
-      return xla.call_module(
-          [x],
-          version=version,
-          module=module,
-          Tout=[x.dtype],
-          Sout=[x.shape],
-          has_token_input_output=False,
-          platforms=[self.testing_platform()],
-      )
-
-    self._assertOpOutputMatchesExpected(f, (x,), (1. + np.sin(x),))
-
-  def test_poly_with_inner_prefix_token(self):
-    # Sometimes inner functions take a token as first argument
-    x = np.arange(12, dtype=np.float32).reshape((3, 4))
-
-    def f(x):  # x : f32[b0, b1]
-      # 1 + sin(x)
-      module, version = serialize("""
-module @jit_f.0 attributes {jax.uses_shape_polymorphism = true} {
-  func.func public @main(%arg0: tensor<?x?xf32>) -> tensor<?x?xf32> {
-    %0 = stablehlo.get_dimension_size %arg0, dim = 0 : (tensor<?x?xf32>) -> tensor<i32>
-    %1 = stablehlo.get_dimension_size %arg0, dim = 1 : (tensor<?x?xf32>) -> tensor<i32>
-    %2 = stablehlo.create_token : !stablehlo.token
-    %3:2 = call @_wrapped_main(%2, %0, %1, %arg0) : (!stablehlo.token, tensor<i32>, tensor<i32>, tensor<?x?xf32>) -> (!stablehlo.token, tensor<?x?xf32>)
-    return %3#1 : tensor<?x?xf32>
-  }
-
-  func.func private @_wrapped_main(%arg_token: !stablehlo.token, %arg0: tensor<i32>, %arg1: tensor<i32>, %arg3: tensor<?x?xf32>) -> (!stablehlo.token, tensor<?x?xf32>) {
-    %1 = stablehlo.sine %arg3 : tensor<?x?xf32>
-    %2 = stablehlo.constant dense<1.000000e+00> : tensor<f32>
-    %3 = stablehlo.reshape %arg0 : (tensor<i32>) -> tensor<1xi32>
-    %4 = stablehlo.reshape %arg1 : (tensor<i32>) -> tensor<1xi32>
-    %5 = stablehlo.concatenate %3, %4, dim = 0 : (tensor<1xi32>, tensor<1xi32>) -> tensor<2xi32>
-    %6 = stablehlo.dynamic_broadcast_in_dim %2, %5, dims = [] : (tensor<f32>, tensor<2xi32>) -> tensor<?x?xf32>
-    %7 = stablehlo.add %1, %6 : tensor<?x?xf32>
-    return %arg_token, %7 : !stablehlo.token, tensor<?x?xf32>
-  }
-}
-""")
-      return xla.call_module(
-          [x],
-          version=version,
-          module=module,
-          Tout=[x.dtype],
-          Sout=[x.shape],
-          has_token_input_output=False,
-          platforms=[self.testing_platform()],
-      )
-
-    self._assertOpOutputMatchesExpected(f, (x,), (1. + np.sin(x),))
-
   def test_wrong_actual_args_errors(self):
     x = np.arange(6, dtype=np.float32).reshape((3, 2))
     y = np.arange(6, dtype=np.int32).reshape((2, 3))
diff --git a/third_party/stablehlo/temporary.patch b/third_party/stablehlo/temporary.patch
index 0cb078c3b89795..4c4228163a6f04 100644
--- a/third_party/stablehlo/temporary.patch
+++ b/third_party/stablehlo/temporary.patch
@@ -1426,353 +1426,7 @@ diff --ruN a/stablehlo/stablehlo/tests/stablehlo_canonicalize_dynamism.mlir b/st
 diff --ruN a/stablehlo/stablehlo/tests/stablehlo_refine_shapes.mlir b/stablehlo/stablehlo/tests/stablehlo_refine_shapes.mlir
 --- stablehlo/stablehlo/tests/stablehlo_refine_shapes.mlir
 +++ stablehlo/stablehlo/tests/stablehlo_refine_shapes.mlir
-@@ -31,6 +31,7 @@
- 
- // -----
- 
-+// CHECK-LABEL: module @has_main
- module @has_main {
-   // CHECK: main
-   func.func @main(%arg0: tensor<4xf32>) -> tensor<*xi32> {
-@@ -38,17 +39,11 @@
-     %0 = stablehlo.bitcast_convert %arg0 : (tensor<4xf32>) -> tensor<*xi32>
-     func.return %0 : tensor<*xi32>
-   }
--
--  // CHECK: helper
--  func.func @helper(%arg0: tensor<4xf32>) -> tensor<*xi32> {
--    // CHECK: stablehlo.bitcast_convert{{.*}} -> tensor<*xi32>
--    %0 = stablehlo.bitcast_convert %arg0 : (tensor<4xf32>) -> tensor<*xi32>
--    func.return %0 : tensor<*xi32>
--  }
--}
--
--// -----
--
-+}
-+
-+// -----
-+
-+// CHECK-LABEL: func @error_unsupported_operation
- func.func @error_unsupported_operation(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>) -> index {
-   // CHECK: stablehlo.add{{.*}} -> tensor<?xf32>
-   %0 = stablehlo.add %arg0, %arg1 : (tensor<4xf32>, tensor<4xf32>) -> tensor<?xf32>
-@@ -472,11 +467,312 @@
- 
- // -----
- 
--// CHECK-LABEL: func @refine_bitcast_convert_same_bitwidth
--func.func @refine_bitcast_convert_same_bitwidth(%arg0 : tensor<4xf32>) -> tensor<*xi32> {
-+// CHECK-LABEL: func @refine_bitcast_convert_same_bitwidth_unranked_result
-+func.func @refine_bitcast_convert_same_bitwidth_unranked_result(%arg0 : tensor<4xf32>) -> tensor<*xi32> {
-   // CHECK: stablehlo.bitcast_convert{{.*}} -> tensor<4xi32>
-   %0 = stablehlo.bitcast_convert %arg0 : (tensor<4xf32>) -> tensor<*xi32>
-   func.return %0 : tensor<*xi32>
-+}
-+
-+// -----
-+
-+// CHECK-LABEL: func @refine_bitcast_convert_same_bitwidth
-+func.func @refine_bitcast_convert_same_bitwidth() -> tensor<?x?x0xf32> {
-+  %0 = stablehlo.constant dense<[3, 5, 0]> : tensor<3xi32>
-+  %21 = stablehlo.dynamic_iota %0, dim = 0 : (tensor<3xi32>) -> tensor<?x?x0xui32>
-+  // CHECK: stablehlo.bitcast_convert{{.*}} -> tensor<3x5x0xf32>
-+  %48 = stablehlo.bitcast_convert %21 : (tensor<?x?x0xui32>) -> tensor<?x?x0xf32>
-+  return %48 : tensor<?x?x0xf32>
-+}
-+
-+// -----
-+
-+// CHECK-LABEL: module @refine_call
-+module @refine_call {
-+  func.func @main(%arg1: tensor<4xf32>) -> tensor<?xf32> {
-+    %0 = stablehlo.bitcast_convert %arg1 : (tensor<4xf32>) -> tensor<?xf32>
-+    %1 = stablehlo.constant dense<4> : tensor<i32>
-+    // CHECK: refine_call_callee{{.*}}-> tensor<4xf32>
-+    %2 = call @refine_call_callee(%1, %0) : (tensor<i32>, tensor<?xf32>) -> tensor<?xf32>
-+    return %2 : tensor<?xf32>
-+  }
-+  // CHECK: refine_call_callee(%arg0: tensor<4xf32>) -> tensor<4xf32>
-+  func.func @refine_call_callee(%arg0: tensor<i32>, %arg1: tensor<?xf32>) -> tensor<?xf32> {
-+    // CHECK: stablehlo.constant dense<4>
-+    %0 = stablehlo.reshape %arg0 : (tensor<i32>) -> tensor<1xi32>
-+    %1 = stablehlo.dynamic_iota %0, dim = 0 : (tensor<1xi32>) -> tensor<?xf32>
-+    return %1 : tensor<?xf32>
-+  }
-+}
-+
-+// -----
-+
-+// CHECK-LABEL: module @refine_call_dimension_arguments
-+module @refine_call_dimension_arguments {
-+  func.func public @main(%arg0: tensor<i32>) -> tensor<i32> {
-+    // CHECK: [[RESULT:%.*]] = call @callee
-+    // CHECK: return [[RESULT]]
-+    %0 = stablehlo.constant dense<3> : tensor<i32>
-+    %1 = call @callee(%0, %0, %arg0) : (tensor<i32>, tensor<i32>, tensor<i32>) -> tensor<i32>
-+    return %1 : tensor<i32>
-+  }
-+  // %arg0 and %arg1 are dimension arguments
-+  // CHECK: @callee([[ARG0:%.*]]: tensor<i32>) -> tensor<i32>
-+  func.func private @callee(%arg0: tensor<i32>, %arg1: tensor<i32>, %arg2: tensor<i32>) -> tensor<i32> {
-+    // CHECK: [[RESULT0:%.*]] = stablehlo.constant dense<6>
-+    // CHECK: [[RESULT1:%.*]] = stablehlo.add [[RESULT0]], [[ARG0]]
-+    // CHECK: return [[RESULT1]]
-+    %0 = stablehlo.add %arg0, %arg1: tensor<i32>
-+    %1 = stablehlo.add %0, %arg2: tensor<i32>
-+    return %1 : tensor<i32>
-+  }
-+}
-+
-+// -----
-+
-+// CHECK-LABEL: module @refine_call_prefix_token_and_dimension_arguments
-+module @refine_call_prefix_token_and_dimension_arguments {
-+  func.func public @main(%arg0: tensor<i32>) -> tensor<i32> {
-+    // CHECK: [[RESULT:%.*]] = call @callee
-+    // CHECK: return [[RESULT]]
-+    %0 = stablehlo.constant dense<3> : tensor<i32>
-+    %token = stablehlo.create_token : !stablehlo.token
-+    %1 = call @callee(%token, %0, %0, %arg0) : (!stablehlo.token, tensor<i32>, tensor<i32>, tensor<i32>) -> tensor<i32>
-+    return %1 : tensor<i32>
-+  }
-+  // %arg0 and %arg1 are dimension arguments
-+  // CHECK: @callee([[ARG_TOKEN:%.*]]: !stablehlo.token, [[ARG0:%.*]]: tensor<i32>
-+  func.func private @callee(%arg_token: !stablehlo.token, %arg0: tensor<i32>, %arg1: tensor<i32>, %arg2: tensor<i32>) -> tensor<i32> {
-+    // CHECK: [[RESULT0:%.*]] = stablehlo.constant dense<6>
-+    // CHECK: [[RESULT1:%.*]] = stablehlo.add [[RESULT0]], [[ARG0]]
-+    // CHECK: return [[RESULT1]]
-+    %0 = stablehlo.add %arg0, %arg1: tensor<i32>
-+    %1 = stablehlo.add %0, %arg2: tensor<i32>
-+    return %1 : tensor<i32>
-+  }
-+}
-+
-+// -----
-+
-+// CHECK-LABEL: module @refine_call_dimension_arguments_followed_by_token
-+module @refine_call_dimension_arguments_followed_by_token {
-+  func.func public @main(%arg0: tensor<i32>) -> tensor<i32> {
-+    // CHECK: [[RESULT:%.*]] = call @callee
-+    // CHECK: return [[RESULT]]
-+    %0 = stablehlo.constant dense<3> : tensor<i32>
-+    %token = stablehlo.create_token : !stablehlo.token
-+    %1 = call @callee(%0, %0, %token, %arg0) : (tensor<i32>, tensor<i32>, !stablehlo.token, tensor<i32>) -> tensor<i32>
-+    return %1 : tensor<i32>
-+  }
-+  // %arg0 and %arg1 are dimension arguments
-+  // CHECK: @callee([[ARG_TOKEN:%.*]]: !stablehlo.token, [[ARG0:%.*]]: tensor<i32>
-+  func.func private @callee(%arg0: tensor<i32>, %arg1: tensor<i32>, %arg_token: !stablehlo.token, %arg2: tensor<i32>) -> tensor<i32> {
-+    // CHECK: [[RESULT0:%.*]] = stablehlo.constant dense<6>
-+    // CHECK: [[RESULT1:%.*]] = stablehlo.add [[RESULT0]], [[ARG0]]
-+    // CHECK: return [[RESULT1]]
-+    %0 = stablehlo.add %arg0, %arg1: tensor<i32>
-+    %1 = stablehlo.add %0, %arg2: tensor<i32>
-+    return %1 : tensor<i32>
-+  }
-+}
-+
-+// -----
-+
-+// CHECK-LABEL: module @refine_multiple_call_with_same_context
-+module @refine_multiple_call_with_same_context {
-+  func.func @main(%arg1: tensor<4xf32>) -> tensor<?xf32> {
-+    %0 = stablehlo.bitcast_convert %arg1 : (tensor<4xf32>) -> tensor<?xf32>
-+    %arg0_new = "stablehlo.get_dimension_size"(%0) {dimension = 0 : i64} : (tensor<?xf32>) -> tensor<i32>
-+    // CHECK: refine_call_callee{{.*}}-> tensor<4xf32>
-+    %1 = call @refine_call_callee(%arg0_new, %0) : (tensor<i32>, tensor<?xf32>) -> tensor<?xf32>
-+    // CHECK: refine_call_callee{{.*}}-> tensor<4xf32>
-+    %2 = call @refine_call_callee(%arg0_new, %1) : (tensor<i32>, tensor<?xf32>) -> tensor<?xf32>
-+    return %2 : tensor<?xf32>
-+  }
-+  // CHECK: refine_call_callee{{.*}}-> tensor<4xf32>
-+  func.func @refine_call_callee(%arg0: tensor<i32>, %arg1: tensor<?xf32>) -> tensor<?xf32> {
-+    return %arg1 : tensor<?xf32>
-+  }
-+}
-+
-+// -----
-+
-+// CHECK-LABEL: module @refine_multiple_call_constant_function
-+module @refine_multiple_call_constant_function {
-+  func.func @main(%arg0: tensor<5xf32>) -> tensor<i32> {
-+    // CHECK: [[RESULT0:%.*]] = stablehlo.constant dense<16>
-+    // CHECK: return [[RESULT0]]
-+    %0 = stablehlo.constant dense<4> : tensor<i32>
-+    %1 = call @refine_call_callee(%0, %arg0) : (tensor<i32>, tensor<5xf32>) -> tensor<i32>
-+    %2 = call @refine_call_callee(%0, %arg0) : (tensor<i32>, tensor<5xf32>) -> tensor<i32>
-+    %3 = stablehlo.add %1, %2: tensor<i32>
-+    return %3 : tensor<i32>
-+  }
-+  func.func @refine_call_callee(%arg0: tensor<i32>, %arg1: tensor<5xf32>) -> tensor<i32> {
-+    // CHECK: [[RESULT1:%.*]] = stablehlo.constant dense<8>
-+    // CHECK: return [[RESULT1]]
-+    %0 = stablehlo.add %arg0, %arg0: tensor<i32>
-+    return %0 : tensor<i32>
-+  }
-+}
-+
-+// -----
-+
-+module @refine_call_multiple_with_different_number_dimension_arguments {
-+  func.func @main(%arg1: tensor<4xf32>) -> tensor<?xf32> {
-+    %0 = stablehlo.bitcast_convert %arg1 : (tensor<4xf32>) -> tensor<?xf32>
-+    %arg0_new = "stablehlo.get_dimension_size"(%0) {dimension = 0 : i64} : (tensor<?xf32>) -> tensor<i32>
-+    %1 = call @refine_call_callee(%arg0_new, %0) : (tensor<i32>, tensor<?xf32>) -> tensor<?xf32>
-+    // Ensure that the first argument is not a constant at the second call site
-+    %arg0_different_f32 = stablehlo.bitcast_convert %arg0_new : (tensor<i32>) -> tensor<f32>
-+    %arg0_different_i32 = stablehlo.bitcast_convert %arg0_different_f32 : (tensor<f32>) -> tensor<i32>
-+    // expected-error@+1{{incorrect number of operands for callee}}
-+    %2 = call @refine_call_callee(%arg0_different_i32, %1) : (tensor<i32>, tensor<?xf32>) -> tensor<?xf32>
-+    return %2 : tensor<?xf32>
-+  }
-+  // expected-error@+1{{Function refine_call_callee has already been refined with a different refinement context. Previous context had 1 and now we have 2 non-dimension arguments}}
-+  func.func @refine_call_callee(%arg0: tensor<i32>, %arg1: tensor<?xf32>) -> tensor<?xf32> {
-+    return %arg1 : tensor<?xf32>
-+  }
-+}
-+
-+// -----
-+
-+module @refine_call_multiple_different_dimension_arguments {
-+  func.func @main(%arg1: tensor<4xf32>) -> tensor<?xf32> {
-+    %0 = stablehlo.bitcast_convert %arg1 : (tensor<4xf32>) -> tensor<?xf32>
-+    %arg0_new = "stablehlo.get_dimension_size"(%0) {dimension = 0 : i64} : (tensor<?xf32>) -> tensor<i32>
-+    %1 = call @refine_call_callee(%arg0_new, %0) : (tensor<i32>, tensor<?xf32>) -> tensor<?xf32>
-+    %arg0_different = stablehlo.add %arg0_new, %arg0_new : tensor<i32>
-+    // expected-error@+1{{incorrect number of operands for callee}}
-+    %2 = call @refine_call_callee(%arg0_different, %1) : (tensor<i32>, tensor<?xf32>) -> tensor<?xf32>
-+    return %2 : tensor<?xf32>
-+  }
-+  // expected-error@+1{{Function refine_call_callee has already been refined with a different refinement context.}}
-+  func.func @refine_call_callee(%arg0: tensor<i32>, %arg1: tensor<?xf32>) -> tensor<?xf32> {
-+    return %arg1 : tensor<?xf32>
-+  }
-+}
-+
-+// -----
-+
-+module @refine_call_multiple_different_non_dimension_arguments {
-+  func.func @main(%arg1: tensor<4xf32>) -> tensor<?xf32> {
-+    %0 = stablehlo.bitcast_convert %arg1 : (tensor<4xf32>) -> tensor<?xf32>
-+    %arg0_new = "stablehlo.get_dimension_size"(%0) {dimension = 0 : i64} : (tensor<?xf32>) -> tensor<i32>
-+    %1 = call @refine_call_callee(%arg0_new, %0) : (tensor<i32>, tensor<?xf32>) -> tensor<?xf32>
-+    %2 = stablehlo.constant dense<[1., 2.]> : tensor<2xf32>
-+    %3 = stablehlo.concatenate %1, %2, dim = 0 : (tensor<?xf32>, tensor<2xf32>) -> tensor<?xf32>
-+    // expected-error@+1{{incorrect number of operands for callee}}
-+    %4 = call @refine_call_callee(%arg0_new, %3) : (tensor<i32>, tensor<?xf32>) -> tensor<?xf32>
-+    return %4 : tensor<?xf32>
-+  }
-+  // expected-error@+1{{Function refine_call_callee has already been refined with a different refinement context.}}
-+  func.func @refine_call_callee(%arg0: tensor<i32>, %arg1: tensor<?xf32>) -> tensor<?xf32> {
-+    return %arg1 : tensor<?xf32>
-+  }
-+}
-+
-+// -----
-+
-+module @refine_call_recursive {
-+  func.func @main() -> tensor<i32> {
-+    %0 = stablehlo.constant dense<3> : tensor<i32>
-+    %1 = call @refine_call_callee(%0) : (tensor<i32>) -> tensor<i32>
-+    return %1 : tensor<i32>
-+  }
-+  // expected-error@+1{{Function refine_call_callee is being refined recursively}}
-+  func.func @refine_call_callee(%arg0: tensor<i32>) -> tensor<i32> {
-+    // expected-error@+1{{incorrect number of operands}}
-+    %0 = call @refine_call_callee(%arg0) : (tensor<i32>) -> tensor<i32>
-+    return %0 : tensor<i32>
-+  }
-+}
-+
-+// -----
-+
-+module @refine_call_main_argument_unranked {
-+  // expected-error@+1{{main must be refined with static shape arguments}}
-+  func.func public @main(%arg0: tensor<*xi32>) -> tensor<*xi32> {
-+    %2 = call @callee(%arg0) : (tensor<*xi32>) -> tensor<*xi32>
-+    return %2 : tensor<*xi32>
-+  }
-+  func.func private @callee(%arg0: tensor<*xi32>) -> tensor<*xi32> {
-+    return %arg0 : tensor<*xi32>
-+  }
-+}
-+
-+// -----
-+
-+module @refine_call_main_argument_dynamic_shape {
-+  // expected-error@+1{{main must be refined with static shape arguments}}
-+  func.func public @main(%arg0: tensor<?xi32>) -> tensor<?xi32> {
-+    %2 = call @callee(%arg0) : (tensor<?xi32>) -> tensor<?xi32>
-+    return %2 : tensor<?xi32>
-+  }
-+  func.func private @callee(%arg0: tensor<?xi32>) -> tensor<?xi32> {
-+    return %arg0 : tensor<?xi32>
-+  }
-+}
-+
-+// -----
-+
-+module @refine_call_callee_argument_unranked {
-+  func.func public @main(%arg0: tensor<1xi64>) -> tensor<*xi32> {
-+    %1 = stablehlo.dynamic_iota %arg0, dim = 0 : (tensor<1xi64>) -> tensor<*xi32>
-+    %2 = call @callee(%1) : (tensor<*xi32>) -> tensor<*xi32>
-+    return %2 : tensor<*xi32>
-+  }
-+  // expected-error@+1{{callee must be refined with static shape arguments}}
-+  func.func private @callee(%arg0: tensor<*xi32>) -> tensor<*xi32> {
-+    return %arg0 : tensor<*xi32>
-+  }
-+}
-+
-+// -----
-+
-+module @refine_call_callee_argument_dynamic_shape {
-+  func.func public @main(%arg0: tensor<1xi64>) -> tensor<?xi32> {
-+    %1 = stablehlo.dynamic_iota %arg0, dim = 0 : (tensor<1xi64>) -> tensor<?xi32>
-+    %2 = call @callee(%1) : (tensor<?xi32>) -> tensor<?xi32>
-+    return %2 : tensor<?xi32>
-+  }
-+  // expected-error@+1{{callee must be refined with static shape arguments}}
-+  func.func private @callee(%arg0: tensor<?xi32>) -> tensor<?xi32> {
-+    return %arg0 : tensor<?xi32>
-+  }
-+}
-+
-+// -----
-+
-+// CHECK-LABEL: module @refine_call_dimension_argument_non_scalar
-+// The non-scalar constant is not folded into the callee
-+module @refine_call_dimension_argument_non_scalar {
-+  func.func public @main() -> tensor<4xi32> {
-+    // CHECK: dense<[1, 2, 3, 4]> : tensor<4xi32>
-+    %0 = stablehlo.constant dense<[1, 2, 3, 4]> : tensor<4xi32>
-+    %1 = call @callee(%0) : (tensor<4xi32>) -> tensor<4xi32>
-+    return %1 : tensor<4xi32>
-+  }
-+  func.func private @callee(%arg0: tensor<4xi32>) -> tensor<4xi32> {
-+    // CHECK: return %arg0 : tensor<4xi32>
-+    return %arg0 : tensor<4xi32>
-+  }
-+}
-+
-+// -----
-+
-+// CHECK-LABEL: module @refine_call_dimension_argument_not_integer
-+module @refine_call_dimension_argument_not_integer {
-+  func.func public @main() -> tensor<f32> {
-+    %0 = stablehlo.constant dense<3.> : tensor<f32>
-+    // CHECK: call @callee({{.*}}) : (tensor<f32>) -> tensor<f32>
-+    %2 = call @callee(%0) : (tensor<f32>) -> tensor<f32>
-+    return %2 : tensor<f32>
-+  }
-+  func.func private @callee(%arg0: tensor<f32>) -> tensor<f32> {
-+    return %arg0 : tensor<f32>
-+  }
- }
- 
- // -----
-@@ -607,12 +903,55 @@
+@@ -607,12 +607,55 @@
  
  // -----
  
@@ -1977,74 +1631,7 @@ diff --ruN a/stablehlo/stablehlo/transforms/StablehloCanonicalizeDynamism.cpp b/
 diff --ruN a/stablehlo/stablehlo/transforms/StablehloRefineShapes.cpp b/stablehlo/stablehlo/transforms/StablehloRefineShapes.cpp
 --- stablehlo/stablehlo/transforms/StablehloRefineShapes.cpp
 +++ stablehlo/stablehlo/transforms/StablehloRefineShapes.cpp
-@@ -11,9 +11,48 @@
- See the License for the specific language governing permissions and
- limitations under the License.
- ==============================================================================*/
--
-+/*
-+This shape refinement pass was designed to resolve the dynamic shapes in
-+a StableHLO module produced by JAX serialization with shape polymorphism.
-+Such a module has the following properties:
-+
-+  * it contains a "main" function with statically-shaped arguments;
-+    the result types may be dynamically shaped.
-+  * all the dynamic shapes depend only on the input shapes (no shape
-+    dependency on the input array contents). We refer to the operations that
-+    depend transitively only on the input shapes (e.g., as given by
-+    `stablehlo.get_dimension_size`) as `dimension` operations.
-+    All dimension values can be resolved to constants through inter-procedural
-+    constant folding.
-+  * intermediate functions may take a number of token arguments (of type
-+    !stablehlo.token) at the start of the argument list, followed by some
-+    dimension arguments (integer scalars).
-+  * some intermediate functions may return dimension values.
-+    E.g., the `floordiv` operation on dimension values may be implemented
-+    using intermediate functions. These constant functions need to be
-+    constant-folded.
-+  * All the dynamic shapes can be resolved through shape inference from the
-+    dimension values. The dimension values themselves do not depend on the
-+    result of shape inference.
-+
-+
-+For each intermediate function we compute a refinement context, including
-+the values of the dimension arguments and the static shapes of the other
-+arguments. We compute the refinement context when we encounter a function call,
-+and then we refine the callee recursively. We abort in the presence of
-+recursive calls.
-+We also abort if a function is called with multiple distinct refinement
-+contexts.
-+
-+After refinement, all operations should have static shapes, all calls to
-+constant functions are replaced with constants, and all dimension arguments
-+for intermediate functions are dropped and are replaced with constants.
-+*/
-+#include <algorithm>
- #include <cstdint>
- #include <memory>
-+#include <optional>
-+#include <set>
- #include <string>
- #include <utility>
- 
-@@ -24,8 +63,10 @@
- #include "llvm/ADT/SmallSet.h"
- #include "llvm/ADT/SmallVector.h"
- #include "llvm/ADT/StringRef.h"
-+#include "llvm/Support/Debug.h"
- #include "llvm/Support/ErrorHandling.h"
- #include "llvm/Support/FormatVariadic.h"
-+#include "llvm/Support/ScopedPrinter.h"
- #include "mlir/Dialect/Func/IR/FuncOps.h"
- #include "mlir/IR/BuiltinAttributes.h"
- #include "mlir/IR/BuiltinOps.h"
-@@ -39,10 +80,13 @@
- #include "mlir/IR/Types.h"
- #include "mlir/IR/Value.h"
- #include "mlir/Interfaces/InferTypeOpInterface.h"
-+#include "mlir/Support/DebugStringHelper.h"
- #include "mlir/Support/LogicalResult.h"
-+#include "mlir/Support/LLVM.h"
+@@ -43,6 +43,7 @@
  #include "mlir/Transforms/GreedyPatternRewriteDriver.h"
  #include "stablehlo/dialect/Base.h"
  #include "stablehlo/dialect/ChloOps.h"
@@ -2052,407 +1639,7 @@ diff --ruN a/stablehlo/stablehlo/transforms/StablehloRefineShapes.cpp b/stablehl
  #include "stablehlo/dialect/StablehloOps.h"
  #include "stablehlo/dialect/TypeInference.h"
  #include "stablehlo/transforms/Passes.h"
-@@ -50,10 +94,144 @@
- namespace mlir {
- namespace stablehlo {
- 
-+#define DEBUG_TYPE "stablehlo-refine-shapes"
-+
- #define GEN_PASS_DEF_STABLEHLOREFINESHAPESPASS
- #include "stablehlo/transforms/Passes.h.inc"
- 
- namespace {
-+
-+// Per-module state for shape refinement.
-+class RefineShapeState {
-+ public:
-+  // Validates that we are not attempting to refine a function with a different
-+  // context than previously, and are not attempting recursive refinement.
-+  // Returns failure() if validation fails. On success, returns a boolean
-+  // that specifies whether the function has already been refined.
-+  FailureOr<bool> validateFunctionRefinement(
-+      func::FuncOp func, SmallVector<APSInt> dimensionArguments,
-+      SmallVector<Type> nonDimensionArgumentTypes) {
-+    StringRef funcName = func.getName();
-+    auto found = refinementContexts.find(func);
-+    if (found == refinementContexts.end()) {
-+      return false;  // not already refined.
-+    }
-+    auto prevDimensionArguments = std::get<0>(found->second);
-+    auto prevNonDimensionArgumentTypes = std::get<1>(found->second);
-+    // Since we refine until fixed point, we will refine a call to a function
-+    // both for the original function and for the refined one. In the latter
-+    // case, we should have empty dimensionArguments but the same
-+    // nonDimensionArgumentTypes.
-+    if (prevNonDimensionArgumentTypes != nonDimensionArgumentTypes ||
-+        (!dimensionArguments.empty() &&
-+         prevDimensionArguments != dimensionArguments)) {
-+      emitDifferentRefinementContextError(
-+          func, /*dimensionArguments=*/dimensionArguments,
-+          /*nonDimensionArgumentTypes=*/nonDimensionArgumentTypes,
-+          /*prevDimensionArguments=*/prevDimensionArguments,
-+          /*prevNonDimensionArgumentShapes=*/prevNonDimensionArgumentTypes);
-+      return failure();
-+    }
-+    for (auto funcOnStack : functionsBeingRefined) {
-+      if (funcOnStack == funcName) {
-+        func.emitOpError() << "Function " << funcName
-+                           << " is being refined recursively\n";
-+        return failure();
-+      }
-+    }
-+    return true;  // already refined.
-+  }
-+
-+  // Updates the state to signal the starting of a function refinement.
-+  // Callers must call `finishFunctionRefinement` when done.
-+  void startFunctionRefinement(func::FuncOp func,
-+                               SmallVector<APSInt> dimensionArguments,
-+                               SmallVector<Type> nonDimensionArgumentTypes) {
-+    StringRef funcName = func.getName();
-+    functionsBeingRefined.push_back(funcName);
-+    refinementContexts[func] =
-+        std::make_tuple(dimensionArguments, nonDimensionArgumentTypes);
-+  }
-+
-+  // Updates the state to signal the starting of a function refinement.
-+  LogicalResult finishFunctionRefinement(func::FuncOp func) {
-+    if (func.getName() !=
-+        functionsBeingRefined[functionsBeingRefined.size() - 1]) {
-+      func.emitOpError() << "Expected to find " << func.getName()
-+                         << " at the top of the stack";
-+      return failure();
-+    }
-+    functionsBeingRefined.pop_back();
-+    return success();
-+  }
-+
-+ private:
-+  // Maps refined functions to the refinement context: the values of dimension
-+  // arguments and the types of non-dimension arguments. A function is added
-+  // here when we start refining it.
-+  DenseMap<func::FuncOp, std::tuple<SmallVector<APSInt>, SmallVector<Type>>>
-+      refinementContexts;
-+
-+  // A stack of functions that are in the process of being refined, the current
-+  // one is last.
-+  SmallVector<llvm::StringRef> functionsBeingRefined;
-+
-+  void emitDifferentRefinementContextError(
-+      func::FuncOp func, SmallVector<APSInt> dimensionArguments,
-+      SmallVector<Type> nonDimensionArgumentTypes,
-+      SmallVector<APSInt> prevDimensionArguments,
-+      SmallVector<Type> prevNonDimensionArgumentShapes) {
-+    InFlightDiagnostic msg = func.emitOpError();
-+    msg << "Function " << func.getName()
-+        << " has already been refined with a different "
-+           "refinement context. ";
-+    int countShowNonDimensionArguments =
-+        std::min(prevNonDimensionArgumentShapes.size(),
-+                 nonDimensionArgumentTypes.size());
-+    if (prevNonDimensionArgumentShapes.size() !=
-+        nonDimensionArgumentTypes.size()) {
-+      msg << "Previous context had " << prevNonDimensionArgumentShapes.size()
-+          << " and now we have " << nonDimensionArgumentTypes.size()
-+          << " non-dimension arguments. ";
-+    }
-+    msg << "The differences among the first " << countShowNonDimensionArguments
-+        << " non-dimension argument types are: ";
-+    for (auto i = 0; i < countShowNonDimensionArguments; ++i) {
-+      if (prevNonDimensionArgumentShapes[i] != nonDimensionArgumentTypes[i]) {
-+        msg << "Non-dimension argument[" << i << "] previously had type "
-+            << debugString(prevNonDimensionArgumentShapes[i])
-+            << " and now has type " << debugString(nonDimensionArgumentTypes[i])
-+            << ". ";
-+      }
-+    }
-+    int countShowDimensionArguments =
-+        std::min(prevDimensionArguments.size(), dimensionArguments.size());
-+    if (prevDimensionArguments.size() != dimensionArguments.size()) {
-+      msg << "Previous context had " << prevDimensionArguments.size()
-+          << " and now we have " << dimensionArguments.size()
-+          << " dimension arguments. ";
-+    }
-+    msg << "The differences among the first " << countShowDimensionArguments
-+        << " dimension arguments are: ";
-+    for (auto i = 0; i < countShowDimensionArguments; ++i) {
-+      if (prevDimensionArguments[i] != dimensionArguments[i]) {
-+        msg << "Dimension argument[" << i << "] previously was "
-+            << prevDimensionArguments[i].getSExtValue() << " and now is "
-+            << dimensionArguments[i].getSExtValue() << ". ";
-+      }
-+    }
-+  }
-+};
-+
-+// Refines a function.
-+// Returns `true` if the function had already been processed with the same
-+// refinement context and `false` if this is the first time we refined the
-+// function. Returns failure() if we encounter an error.
-+LogicalResult refineFunction(func::FuncOp func, MLIRContext* context,
-+                             RefineShapeState* state,
-+                             size_t nrPrefixTokenArguments,
-+                             SmallVector<APSInt> dimensionArguments,
-+                             SmallVector<Type> nonDimensionArgumentTypes);
- 
- // DenseElementsAttr can be constructed from ArrayRef<APInt> but not from
- // ArrayRef<APSInt>. This helper bridges the gap.
-@@ -424,11 +602,10 @@
-       diag << "refineValues failed for " << types << ": expected "
-            << values.size() << " types, got " << types.size();
-     });
--
--  // Check whether `types` contain any new information with respect to existing
--  // return types. Even if just a single dimension size out of an entire tensor
--  // type got updated, using `inferMostSpecificType` ensures that we don't
--  // miss that.
-+  // Check whether `types` contain any new information with respect to
-+  // existing return types. Even if just a single dimension size out of an
-+  // entire tensor type got updated, using `inferMostSpecificType` ensures
-+  // that we don't miss that.
-   bool needsRefinement = false;
-   SmallVector<Type> refinedTypes;
-   for (auto it : llvm::zip(values.getTypes(), types)) {
-@@ -468,11 +645,13 @@
- 
-       // Simply changing operand type of `func.return` won't work because
-       // that won't update the FunctionType of the enclosing `func.func`.
--      // Nonetheless, we still want to support these ops because they are widely
--      // used in StableHLO programs (although the plan of record is to replace
--      // `func.return` ops in StableHLO programs with `stablehlo.return`:
--      // https://github.com/openxla/stablehlo/issues/425).
-+      // Nonetheless, we still want to support these ops because they are
-+      // widely used in StableHLO programs (although the plan of record is to
-+      // replace `func.return` ops in StableHLO programs with
-+      // `stablehlo.return`: https://github.com/openxla/stablehlo/issues/425).
-       if (isa<func::ReturnOp>(user)) continue;
-+
-+      if (isa<func::CallOp>(user)) continue;
- 
-       // Unlike in TensorFlow's type inference pass, here we work only with
-       // allowlisted ops to focus our support on well-defined semantics of
-@@ -489,7 +668,8 @@
-     value.setType(refinedType);
- 
-     // Special case: for `func.return`, guard the refinement with a cast
--    // and leave propagation of the refined return type to a dedicated pattern.
-+    // and leave propagation of the refined return type to a dedicated
-+    // pattern.
-     auto isFuncReturn = [](OpOperand& use) -> bool {
-       return isa<func::ReturnOp>(use.getOwner());
-     };
-@@ -505,8 +685,8 @@
- 
- // Refines the return types of the given operation using the given types.
- // This function also signals PatternRewriter that it needs to visit all the
--// users of this op if any updates to its results have happened during execution
--// of the function.
-+// users of this op if any updates to its results have happened during
-+// execution of the function.
- LogicalResult refineReturnTypes(PatternRewriter& rewriter, Operation* op,
-                                 ArrayRef<Type> types) {
-   if (failed(refineValues(rewriter, op, op->getResults(), types)))
-@@ -528,12 +708,12 @@
- //      traversal, and only then we apply the refinements. If there are other
- //      types, then the corresponding refinements must be completely empty.
- //   2) Encodings are not supported. In principle, TypeExtensions should be
--//      supportable, but this needs careful thinking through. Given that no one
--//      asked for support for bounded dynamism in this pass yet, this is left
--//      for future work.
-+//      supportable, but this needs careful thinking through. Given that no
-+//      one asked for support for bounded dynamism in this pass yet, this is
-+//      left for future work.
- // This function also signals PatternRewriter that it needs to visit all the
--// users of this op if any updates to its results have happened during execution
--// of the function.
-+// users of this op if any updates to its results have happened during
-+// execution of the function.
- LogicalResult refineReturnTypes(PatternRewriter& rewriter, Operation* op,
-                                 ArrayRef<ShapedTypeComponents> refinements) {
-   SmallVector<Type> flattenedTypes;
-@@ -623,8 +803,8 @@
- 
- // Refines the return type of the given operation using the given shape.
- // This function also signals PatternRewriter that it needs to visit all the
--// users of this op if any updates to its results have happened during execution
--// of the function.
-+// users of this op if any updates to its results have happened during
-+// execution of the function.
- template <typename OpType>
- LogicalResult refineReturnShape(PatternRewriter& rewriter, OpType op,
-                                 ArrayRef<int64_t> shape) {
-@@ -633,8 +813,8 @@
- 
- // Refines the return type of the given operation using the given shape.
- // This function also signals PatternRewriter that it needs to visit all the
--// users of this op if any updates to its results have happened during execution
--// of the function.
-+// users of this op if any updates to its results have happened during
-+// execution of the function.
- template <typename OpType>
- LogicalResult refineReturnShape(PatternRewriter& rewriter, OpType op,
-                                 Value shapeValue) {
-@@ -647,6 +827,52 @@
-   return refineReturnShape(rewriter, op, shape);
- }
- 
-+// Dimension arguments are leading scalar constant arguments, optionally
-+// preceeded by some stablehlo.token arguments.
-+SmallVector<APSInt> getDimensionArguments(func::CallOp callOp,
-+                                          size_t* nrPrefixTokenArguments) {
-+  *nrPrefixTokenArguments = 0;
-+  SmallVector<Value> operands = callOp.getOperands();
-+  SmallVector<APSInt> dimensionArguments;
-+  for (size_t i = 0; i < operands.size(); ++i) {
-+    if (i == *nrPrefixTokenArguments && isa<TokenType>(operands[i].getType())) {
-+      (*nrPrefixTokenArguments)++;
-+      continue;
-+    }
-+    RankedTensorType operandType =
-+        dyn_cast<RankedTensorType>(operands[i].getType());
-+    if (!operandType || operandType.getRank() != 0 ||
-+        !operandType.getElementType().template isa<IntegerType>())
-+      break;
-+    SmallVector<APSInt> operand_int;
-+    if (failed(hlo::matchInts(operands[i], operand_int))) {
-+      break;
-+    }
-+    dimensionArguments.push_back(operand_int[0]);
-+  }
-+  return dimensionArguments;
-+}
-+
-+std::optional<SmallVector<DenseIntElementsAttr>> isConstantFunction(
-+    func::FuncOp func) {
-+  LLVM_DEBUG(llvm::dbgs() << "check if " << func.getName()
-+                          << " is a constant function\n");
-+  SmallVector<DenseIntElementsAttr> returnedConstants;
-+  func::ReturnOp ret = *func.getOps<func::ReturnOp>().begin();
-+  bool isConstant = llvm::all_of(ret->getOperands(), [&](auto returnVal) {
-+    DenseIntElementsAttr attr;
-+    Operation* return_operand_def = returnVal.getDefiningOp();
-+    if (return_operand_def &&
-+        matchPattern(return_operand_def, m_Constant(&attr))) {
-+      returnedConstants.push_back(attr);
-+      return true;
-+    }
-+    return false;
-+  });
-+  if (isConstant) return returnedConstants;
-+  return std::nullopt;
-+}
-+
- struct RefineAllGatherOpPattern : public OpRewritePattern<AllGatherOp> {
-   using OpRewritePattern::OpRewritePattern;
-   LogicalResult matchAndRewrite(AllGatherOp op,
-@@ -655,9 +881,9 @@
-     if (!operandType.hasRank())
-       return rewriter.notifyMatchFailure(op, "expected ranked operand type");
- 
--    // This represents the cross_replica_and_partition process grouping strategy
--    // that requires num_partitions to compute shardCount. Since we don't know
--    // num_partitions at this point, we error out.
-+    // This represents the cross_replica_and_partition process grouping
-+    // strategy that requires num_partitions to compute shardCount. Since we
-+    // don't know num_partitions at this point, we error out.
-     if (op.getChannelHandle() && !op.getUseGlobalDeviceIds())
-       return rewriter.notifyMatchFailure(op, "unsupported strategy");
-     DenseIntElementsAttr replicaGroups = op.getReplicaGroups();
-@@ -678,12 +904,11 @@
-     auto operandType = op.getOperand().getType();
-     if (!operandType.hasRank())
-       return rewriter.notifyMatchFailure(op, "expected ranked operand type");
--
-+    auto resultType = op.getType();
-     // If bit widths of the operand and the result are different, then
-     // operand and result shapes have different ranks.
-     // This complicates the logic quite a bit and is not needed to pass the
-     // current tests, so we leave this for future work.
--    auto resultType = op.getType();
-     auto getBitWidthFn = [](ShapedType type) {
-       auto elementType = type.getElementType();
-       if (auto complexType = elementType.dyn_cast<ComplexType>())
-@@ -694,8 +919,77 @@
-     if (getBitWidthFn(operandType) != getBitWidthFn(resultType))
-       return rewriter.notifyMatchFailure(op, "unsupported bit width");
- 
--    return refineReturnShape(rewriter, op, operandType.getShape());
--  }
-+    auto res = refineReturnShape(rewriter, op, operandType.getShape());
-+    if (failed(res)) return failure();
-+    if (op.getOperand().getType() == op.getResult().getType()) {
-+      LLVM_DEBUG({ llvm::dbgs() << "    ** remove no-op bitcast convert\n"; });
-+      rewriter.replaceOp(op, op.getOperand());
-+    }
-+    return success();
-+  }
-+};
-+
-+struct RefineCallOpPattern : public OpRewritePattern<func::CallOp> {
-+  using OpRewritePattern::OpRewritePattern;
-+
-+  RefineCallOpPattern(MLIRContext* context, RefineShapeState* state)
-+      : OpRewritePattern<func::CallOp>(context), _state(state) {}
-+
-+  LogicalResult matchAndRewrite(func::CallOp op,
-+                                PatternRewriter& rewriter) const override {
-+    LLVM_DEBUG({ llvm::dbgs() << "refineCallOp " << debugString(op) << "\n"; });
-+
-+    // We have a number of prefix token arguments, then the dimension arguments
-+    size_t nrPrefixTokenArguments = 0;
-+    SmallVector<APSInt> dimensionArguments =
-+        getDimensionArguments(op, &nrPrefixTokenArguments);
-+    SmallVector<Type> nonDimensionArgumentTypes;
-+    SmallVector<Value> nonDimensionArguments;
-+    SmallVector<Value> operands = op.getOperands();
-+    for (size_t i = 0; i < operands.size(); ++i) {
-+      // Skip the dimension arguments.
-+      if (i >= nrPrefixTokenArguments &&
-+          i < nrPrefixTokenArguments + dimensionArguments.size()) {
-+        continue;
-+      }
-+      nonDimensionArgumentTypes.push_back(operands[i].getType());
-+      nonDimensionArguments.push_back(operands[i]);
-+    }
-+    FlatSymbolRefAttr calleeName = op.getCalleeAttr();
-+    const SymbolTable symbolTable(op->getParentOfType<ModuleOp>());
-+    func::FuncOp callee = dyn_cast<func::FuncOp>(
-+        symbolTable.lookupNearestSymbolFrom(op, calleeName.getAttr()));
-+    if (!callee)
-+      return rewriter.notifyMatchFailure(
-+          op, "cannot find callee in the current scope");
-+    if (failed(refineFunction(callee, rewriter.getContext(), _state,
-+                              nrPrefixTokenArguments, dimensionArguments,
-+                              nonDimensionArgumentTypes)))
-+      return failure();
-+
-+    // Is the callee a constant function in this refinement context?
-+    std::optional<SmallVector<DenseIntElementsAttr>> constantAttrs =
-+        isConstantFunction(callee);
-+    if (constantAttrs.has_value()) {
-+      SmallVector<Value> constants;
-+      for (auto constAttr : constantAttrs.value()) {
-+        constants.push_back(
-+            rewriter.create<ConstantOp>(op.getLoc(), constAttr));
-+      }
-+      rewriter.replaceOp(op, constants);
-+      return success();
-+    }
-+    if (!dimensionArguments.empty()) {
-+      // Drop the dimension arguments, but only if necessary, or else we
-+      // will end up trying to refine the new CallOp forever.
-+      op = rewriter.replaceOpWithNewOp<func::CallOp>(
-+          op, op.getResultTypes(), callee.getSymName(), nonDimensionArguments);
-+    }
-+    return refineReturnTypes(rewriter, op, callee.getResultTypes());
-+  }
-+
-+ private:
-+  RefineShapeState* _state;
- };
- 
- struct RefineConvertOpPattern : public OpRewritePattern<ConvertOp> {
-@@ -844,12 +1138,98 @@
+@@ -844,12 +845,97 @@
    }
  };
  
@@ -2525,9 +1712,8 @@ diff --ruN a/stablehlo/stablehlo/transforms/StablehloRefineShapes.cpp b/stablehl
 +      return rewriter.notifyMatchFailure(op, "expected constant output_shape");
 +
 +    // We only need to refine the shape of `output` (the second result).
-+    // The shape of `output_state` (the first result) is determined by the
-+    // shape of `initial_state`, so we ignore it and provide an empty
-+    // refinement.
++    // The shape of `output_state` (the first result) is determined by the shape
++    // of `initial_state`, so we ignore it and provide an empty refinement.
 +    return refineReturnTypes(rewriter, op, {{initialStateType}, {outputShape}});
 +  }
 +};
@@ -2551,349 +1737,15 @@ diff --ruN a/stablehlo/stablehlo/transforms/StablehloRefineShapes.cpp b/stablehl
    }
  };
  
-@@ -865,11 +1245,11 @@
-     if (!isa<chlo::ChloDialect, StablehloDialect>(op->getDialect()))
-       return rewriter.notifyMatchFailure(op, "unsupported dialect");
- 
--    // For the ops that implement InferTypeOpInterface, we reinfer their return
--    // types and see what happens.
--    // Operands of these ops might have been refined elsewhere (e.g. someone
--    // might have updated argument types of a function) or earlier during this
--    // pass, and this might enable refinement opportunities downstream.
-+    // For the ops that implement InferTypeOpInterface, we reinfer their
-+    // return types and see what happens. Operands of these ops might have
-+    // been refined elsewhere (e.g. someone might have updated argument types
-+    // of a function) or earlier during this pass, and this might enable
-+    // refinement opportunities downstream.
-     SmallVector<Type> inferredReturnTypes;
-     if (failed(op.inferReturnTypes(getContext(), /*location=*/{},
-                                    op->getOperands(), op->getAttrDictionary(),
-@@ -925,8 +1305,8 @@
-           sliceSizesAttr.size(),
-           RankedTensorType::get({}, startIndicesElementType));
- 
--      // RealDynamicSliceOp can take tensors of integer or index element types.
--      // DynamicSliceOp::slice_sizes only supports i64 element type.
-+      // RealDynamicSliceOp can take tensors of integer or index element
-+      // types. DynamicSliceOp::slice_sizes only supports i64 element type.
-       // Adapt accordingly in order to be compatible with inferDynamicSliceOp.
-       SmallVector<int64_t> sliceSizes;
-       for (auto element : sliceSizesAttr.getValues<APInt>()) {
-@@ -956,9 +1336,9 @@
-     if (!operandType.hasRank())
-       return rewriter.notifyMatchFailure(op, "expected ranked operand type");
- 
--    // This represents the cross_replica_and_partition process grouping strategy
--    // that requires num_partitions to compute shardCount. Since we don't know
--    // num_partitions at this point, we error out.
-+    // This represents the cross_replica_and_partition process grouping
-+    // strategy that requires num_partitions to compute shardCount. Since we
-+    // don't know num_partitions at this point, we error out.
-     if (op.getChannelHandle() && !op.getUseGlobalDeviceIds())
-       return rewriter.notifyMatchFailure(op, "unsupported strategy");
-     DenseIntElementsAttr replicaGroups = op.getReplicaGroups();
-@@ -998,9 +1378,9 @@
-                                 PatternRewriter& rewriter) const override {
-     // Push the potentially refined operand types into the nested regions.
-     // This can lead to refinements of the return types of the body (but not
--    // of the cond since it always returns tensor<i1>), but the key insight here
--    // is that the enclosing while op doesn't care about these refinements
--    // (because its return types are equal to its operand types).
-+    // of the cond since it always returns tensor<i1>), but the key insight
-+    // here is that the enclosing while op doesn't care about these
-+    // refinements (because its return types are equal to its operand types).
-     // If we end up with incompatibilities between while's return types and
-     // body's return types, the verifier will tell us about that. This means
-     // that the original program wasn't well-formed. TODO(burmako): Implement
-@@ -1050,8 +1430,8 @@
-       if (failed(mostSpecificType) || destType == *mostSpecificType) continue;
- 
-       // If the source type of the cast is more specific than the target type,
--      // then we conclude that the cast is redundant (i.e. needs to be removed)
--      // and that the return type of the function needs an update.
-+      // then we conclude that the cast is redundant (i.e. needs to be
-+      // removed) and that the return type of the function needs an update.
-       needsUpdate = true;
-       updatedResultTypes[i] = sourceType;
- 
-@@ -1066,9 +1446,6 @@
-     for (auto cast : castsToReplace)
-       rewriter.replaceOp(cast, cast->getOperands());
- 
--    // If the type of the enclosing `func.func` needs an update, we simply
--    // call setType. We can afford this simplicity because our algorithm
--    // currently supports only one function per module.
-     auto func = cast<func::FuncOp>(op->getParentOp());
-     func.setType(
-         rewriter.getFunctionType(func.getArgumentTypes(), updatedResultTypes));
-@@ -1100,22 +1477,186 @@
-   }
- };
- 
-+LogicalResult applyRewritePatterns(func::FuncOp func, MLIRContext* context,
-+                                   RefineShapeState* state) {
-+  // TODO(#1048): Find out why .maxIterations = 1 no longer works.
-+  // There have been recent refactors to applyPatternsAndFoldGreedily
-+  // upstream, and that might be the reason.
-+  GreedyRewriteConfig config;
-+  config.useTopDownTraversal = true;
-+  config.enableRegionSimplification = true;
-+  config.maxIterations = 2;
-+  config.maxNumRewrites = GreedyRewriteConfig::kNoLimit;
-+  config.strictMode = GreedyRewriteStrictness::AnyOp;
-+
-+  RewritePatternSet patterns(context);
-+  patterns.add<EvalAddOpPattern>(context);
-+  patterns.add<EvalAndOpPattern>(context);
-+  patterns.add<EvalBroadcastInDimOpPattern>(context);
-+  patterns.add<EvalClampOpPattern>(context);
-+  patterns.add<EvalCompareOpPattern>(context);
-+  patterns.add<EvalConcatenateOpPattern>(context);
-+  patterns.add<EvalConvertOpPattern>(context);
-+  patterns.add<EvalDivOpPattern>(context);
-+  patterns.add<EvalGetDimensionSizeOpPattern>(context);
-+  patterns.add<EvalMaxOpPattern>(context);
-+  patterns.add<EvalMinOpPattern>(context);
-+  patterns.add<EvalMulOpPattern>(context);
-+  patterns.add<EvalRemOpPattern>(context);
-+  patterns.add<EvalReshapeOpPattern>(context);
-+  patterns.add<EvalSelectOpPattern>(context);
-+  patterns.add<EvalSignOpPattern>(context);
-+  patterns.add<EvalSliceOpPattern>(context);
-+  patterns.add<EvalSubtractOpPattern>(context);
-+  patterns.add<RefineAllGatherOpPattern>(context);
-+  patterns.add<RefineBitcastConvertOpPattern>(context);
-+  patterns.add<RefineCallOpPattern>(context, state);
-+  patterns.add<RefineConvertOpPattern>(context);
-+  patterns.add<RefineConvolutionOpPattern>(context);
-+  patterns.add<RefineCustomCallOpPattern>(context);
-+  patterns.add<RefineDotGeneralOpPattern>(context);
-+  patterns.add<RefineDynamicBroadcastInDimOpPattern>(context);
-+  patterns.add<RefineDynamicConvOpPattern>(context);
-+  patterns.add<RefineDynamicIotaOpPattern>(context);
-+  patterns.add<RefineDynamicPadOpPattern>(context);
-+  patterns.add<RefineDynamicReduceWindowOpPattern>(context);
-+  patterns.add<RefineDynamicReshapeOpPattern>(context);
-+  patterns.add<RefineDynamicRngBitGeneratorOpPattern>(context);
-+  patterns.add<RefineDynamicTopKOpPattern>(context);
-+  patterns.add<RefineInferTypeOpInterfacePattern>(context);
-+  patterns.add<RefineRealDynamicSliceOpPattern>(context);
-+  patterns.add<RefineReduceScatterOpPattern>(context);
-+  patterns.add<RefineRngOpPattern>(context);
-+  patterns.add<RefineUniformQuantizeOpPattern>(context);
-+  patterns.add<RefineWhileOpPattern>(context);
-+  patterns.add<UpdateFunctionTypePattern>(context);
-+  patterns.add<UpdateRegionTypePattern>(context);
-+  if (failed(applyPatternsAndFoldGreedily(func, std::move(patterns), config))) {
-+    func.emitOpError() << "applyPatternsAndFoldGreedily failed";
-+    return failure();
-+  }
-+  return success();
-+}
-+
-+LogicalResult refineFunction(func::FuncOp func, MLIRContext* context,
-+                             RefineShapeState* state,
-+                             size_t nrPrefixTokenArguments,
-+                             SmallVector<APSInt> dimensionArguments,
-+                             SmallVector<Type> nonDimensionArgumentTypes) {
-+  // The nonDimensionArgumentTypes include the prefix token arguments.
-+  LLVM_DEBUG({
-+    llvm::dbgs() << "refineFunction " << func.getName() << ": initial type "
-+                 << debugString(func.getFunctionType()) << "\n";
-+    llvm::dbgs() << "   has " << nrPrefixTokenArguments << " prefix tokens\n";
-+    for (size_t i = 0; i < dimensionArguments.size(); ++i) {
-+      llvm::dbgs() << "   with dimension arg[" << i
-+                   << "] = " << dimensionArguments[i] << "\n";
-+    }
-+  });
-+  // Check that the argument types have static shapes.
-+  for (size_t i = 0; i < nonDimensionArgumentTypes.size(); ++i) {
-+    if (i < nrPrefixTokenArguments) continue;
-+    auto argType = nonDimensionArgumentTypes[i];
-+    if (isa<TokenType>(argType)) continue;
-+    auto argRankedTensorType = dyn_cast<RankedTensorType>(argType);
-+    if (!argRankedTensorType || !argRankedTensorType.hasStaticShape()) {
-+      func.emitOpError() << func.getName()
-+                         << " must be refined with static shape arguments. "
-+                         << "Found argument of type " << debugString(argType);
-+      return failure();
-+    }
-+  }
-+  auto alreadyRefined = state->validateFunctionRefinement(
-+      func, dimensionArguments, nonDimensionArgumentTypes);
-+  if (failed(alreadyRefined)) {
-+    return failure();
-+  }
-+  if (*alreadyRefined) {
-+    LLVM_DEBUG({
-+      llvm::dbgs() << "refineFunction " << func.getName()
-+                   << ": skipping, already refined\n";
-+    });
-+    return success();
-+  }
-+  state->startFunctionRefinement(func, dimensionArguments,
-+                                 nonDimensionArgumentTypes);
-+  // Only one block per function is supported at the moment.
-+  // At the StableHLO level, functions are expected to only have one block,
-+  // so supporting more is out of scope for this pass.
-+  if (!func.getRegion().hasOneBlock()) {
-+    func.emitOpError() << "must have exactly one block";
-+    return failure();
-+  }
-+
-+  // Replace all dimension arguments with constants and remove those arguments.
-+  // Wrap non-dimension arguments with bitcast_convert.
-+  OpBuilder op_builder(func.getRegion());
-+  op_builder.setInsertionPointToStart(&func.getRegion().front());
-+  size_t firstNonDimensionArg =
-+      nrPrefixTokenArguments + dimensionArguments.size();
-+  for (size_t i = 0; i < func.getNumArguments(); ++i) {
-+    BlockArgument arg = func.getArgument(i);
-+    Type argType = arg.getType();
-+    if (i < nrPrefixTokenArguments) {
-+      continue;
-+    }
-+    if (i < firstNonDimensionArg) {
-+      ShapedType argShapedType = dyn_cast<ShapedType>(argType);
-+      if (!argShapedType) {
-+        func.emitOpError() << "dimension arguments must have shaped types";
-+        return failure();
-+      }
-+      // We will drop the dimension arguments, replace them with constants.
-+      auto replacement_op = op_builder.create<stablehlo::ConstantOp>(
-+          arg.getLoc(), argType,
-+          getTensorAttr(argShapedType,
-+                        dimensionArguments[i - nrPrefixTokenArguments]));
-+      arg.replaceAllUsesWith(replacement_op);
-+    } else {
-+      int nonDimensionArgumentIndex =
-+          nrPrefixTokenArguments + i - firstNonDimensionArg;
-+      Type refinedType = nonDimensionArgumentTypes[nonDimensionArgumentIndex];
-+      if (refinedType != argType) {
-+        // We add BitcastConvertOp as the only uses of the non-dimension
-+        // arguments to ensure the module stays valid after we set the argument
-+        // type.
-+        auto replacement_op = op_builder.create<stablehlo::BitcastConvertOp>(
-+            arg.getLoc(), argType, arg);
-+        arg.replaceAllUsesExcept(replacement_op->getResult(0), replacement_op);
-+        arg.setType(refinedType);
-+      }
-+    }
-+  }
-+  BitVector argIndices(func.getNumArguments());
-+  argIndices.set(nrPrefixTokenArguments, firstNonDimensionArg);
-+  func.eraseArguments(argIndices);
-+  func.setType(op_builder.getFunctionType(nonDimensionArgumentTypes,
-+                                          func.getResultTypes()));
-+  LLVM_DEBUG({
-+    llvm::dbgs() << "refineFunction " << func.getName() << ": set type to "
-+                 << func.getFunctionType() << "\n";
-+  });
-+  if (failed(applyRewritePatterns(func, context, state))) return failure();
-+  LLVM_DEBUG({
-+    llvm::dbgs() << "refineFunction " << func.getName() << ": end with type "
-+                 << debugString(func.getFunctionType()) << "\n";
-+  });
-+  if (failed(state->finishFunctionRefinement(func))) return failure();
-+  return success();
-+}
-+
- struct StablehloRefineShapesPass
-     : public impl::StablehloRefineShapesPassBase<StablehloRefineShapesPass> {
-   using StablehloRefineShapesPassBase::StablehloRefineShapesPassBase;
- 
-   void runOnOperation() override {
--    // Only one function per module is supported at the moment to avoid the need
--    // to think about iterative type inference algorithms.
--    // Current use cases are served well by inlining multiple functions into
--    // a single function, so we leave native support for multiple functions to
--    // future work.
-     // To enable modules that contain CustomCallOp::called_computations,
-     // we allow multiple functions, in which case we only refine the main
-     // function called "main", assuming that the called computations will have
-     // static shapes. Lifting this assumption and expanding refinement to
-     // multiple functions is left for future work.
-     ModuleOp module = getOperation();
-+    RefineShapeState state;
-     auto funcs = llvm::to_vector(module.getOps<func::FuncOp>());
-     if (funcs.empty()) return;
-     func::FuncOp func;
-@@ -1130,70 +1671,14 @@
-           << " function to clearly identify which function will be refined";
-       return signalPassFailure();
-     }
--
--    // Similarly, only one block per function is supported at the moment.
--    // At the StableHLO level, functions are expected to only have one block,
--    // so supporting more is out of scope for this pass.
--    if (!func.getRegion().hasOneBlock()) {
--      func.emitOpError() << "must have exactly one block";
-+    SmallVector<APSInt> emptyDimensionArguments;
-+    SmallVector<Type> nonDimensionArgumentTypes;
-+    for (auto arg : func.getArguments())
-+      nonDimensionArgumentTypes.push_back(arg.getType());
-+    if (failed(refineFunction(func, &getContext(), &state, 0,
-+                              emptyDimensionArguments,
-+                              nonDimensionArgumentTypes)))
-       return signalPassFailure();
--    }
--
--    // The algorithm behind this pass consists of a single traversal of the
--    // function. This is sufficient because we only support one function per
--    // program at the moment.
--    // TODO(#1048): Find out why .maxIterations = 1 no longer works.
--    // There have been recent refactors to applyPatternsAndFoldGreedily
--    // upstream, and that might be the reason.
--    GreedyRewriteConfig config;
--    config.useTopDownTraversal = true;
--    config.enableRegionSimplification = true;
--    config.maxIterations = 2;
--    config.maxNumRewrites = GreedyRewriteConfig::kNoLimit;
--    config.strictMode = GreedyRewriteStrictness::AnyOp;
--
--    RewritePatternSet patterns(&getContext());
--    patterns.add<EvalAddOpPattern>(&getContext());
--    patterns.add<EvalAndOpPattern>(&getContext());
--    patterns.add<EvalBroadcastInDimOpPattern>(&getContext());
--    patterns.add<EvalClampOpPattern>(&getContext());
--    patterns.add<EvalCompareOpPattern>(&getContext());
--    patterns.add<EvalConcatenateOpPattern>(&getContext());
--    patterns.add<EvalConvertOpPattern>(&getContext());
--    patterns.add<EvalDivOpPattern>(&getContext());
--    patterns.add<EvalGetDimensionSizeOpPattern>(&getContext());
--    patterns.add<EvalMaxOpPattern>(&getContext());
--    patterns.add<EvalMinOpPattern>(&getContext());
--    patterns.add<EvalMulOpPattern>(&getContext());
--    patterns.add<EvalRemOpPattern>(&getContext());
--    patterns.add<EvalReshapeOpPattern>(&getContext());
--    patterns.add<EvalSelectOpPattern>(&getContext());
--    patterns.add<EvalSignOpPattern>(&getContext());
--    patterns.add<EvalSliceOpPattern>(&getContext());
--    patterns.add<EvalSubtractOpPattern>(&getContext());
--    patterns.add<RefineAllGatherOpPattern>(&getContext());
--    patterns.add<RefineBitcastConvertOpPattern>(&getContext());
--    patterns.add<RefineConvertOpPattern>(&getContext());
--    patterns.add<RefineConvolutionOpPattern>(&getContext());
--    patterns.add<RefineCustomCallOpPattern>(&getContext());
--    patterns.add<RefineDotGeneralOpPattern>(&getContext());
--    patterns.add<RefineDynamicBroadcastInDimOpPattern>(&getContext());
--    patterns.add<RefineDynamicConvOpPattern>(&getContext());
--    patterns.add<RefineDynamicIotaOpPattern>(&getContext());
--    patterns.add<RefineDynamicPadOpPattern>(&getContext());
--    patterns.add<RefineDynamicReshapeOpPattern>(&getContext());
--    patterns.add<RefineInferTypeOpInterfacePattern>(&getContext());
--    patterns.add<RefineRealDynamicSliceOpPattern>(&getContext());
--    patterns.add<RefineReduceScatterOpPattern>(&getContext());
--    patterns.add<RefineRngOpPattern>(&getContext());
--    patterns.add<RefineUniformQuantizeOpPattern>(&getContext());
--    patterns.add<RefineWhileOpPattern>(&getContext());
--    patterns.add<UpdateFunctionTypePattern>(&getContext());
--    patterns.add<UpdateRegionTypePattern>(&getContext());
--    if (failed(
--            applyPatternsAndFoldGreedily(func, std::move(patterns), config))) {
--      return signalPassFailure();
--    }
-   }
- };
- 
+@@ -1181,7 +1267,10 @@
+     patterns.add<RefineDynamicConvOpPattern>(&getContext());
+     patterns.add<RefineDynamicIotaOpPattern>(&getContext());
+     patterns.add<RefineDynamicPadOpPattern>(&getContext());
++    patterns.add<RefineDynamicReduceWindowOpPattern>(&getContext());
+     patterns.add<RefineDynamicReshapeOpPattern>(&getContext());
++    patterns.add<RefineDynamicRngBitGeneratorOpPattern>(&getContext());
++    patterns.add<RefineDynamicTopKOpPattern>(&getContext());
+     patterns.add<RefineInferTypeOpInterfacePattern>(&getContext());
+     patterns.add<RefineRealDynamicSliceOpPattern>(&getContext());
+     patterns.add<RefineReduceScatterOpPattern>(&getContext());
 
diff --git a/third_party/xla/third_party/stablehlo/temporary.patch b/third_party/xla/third_party/stablehlo/temporary.patch
index 0cb078c3b89795..4c4228163a6f04 100644
--- a/third_party/xla/third_party/stablehlo/temporary.patch
+++ b/third_party/xla/third_party/stablehlo/temporary.patch
@@ -1426,353 +1426,7 @@ diff --ruN a/stablehlo/stablehlo/tests/stablehlo_canonicalize_dynamism.mlir b/st
 diff --ruN a/stablehlo/stablehlo/tests/stablehlo_refine_shapes.mlir b/stablehlo/stablehlo/tests/stablehlo_refine_shapes.mlir
 --- stablehlo/stablehlo/tests/stablehlo_refine_shapes.mlir
 +++ stablehlo/stablehlo/tests/stablehlo_refine_shapes.mlir
-@@ -31,6 +31,7 @@
- 
- // -----
- 
-+// CHECK-LABEL: module @has_main
- module @has_main {
-   // CHECK: main
-   func.func @main(%arg0: tensor<4xf32>) -> tensor<*xi32> {
-@@ -38,17 +39,11 @@
-     %0 = stablehlo.bitcast_convert %arg0 : (tensor<4xf32>) -> tensor<*xi32>
-     func.return %0 : tensor<*xi32>
-   }
--
--  // CHECK: helper
--  func.func @helper(%arg0: tensor<4xf32>) -> tensor<*xi32> {
--    // CHECK: stablehlo.bitcast_convert{{.*}} -> tensor<*xi32>
--    %0 = stablehlo.bitcast_convert %arg0 : (tensor<4xf32>) -> tensor<*xi32>
--    func.return %0 : tensor<*xi32>
--  }
--}
--
--// -----
--
-+}
-+
-+// -----
-+
-+// CHECK-LABEL: func @error_unsupported_operation
- func.func @error_unsupported_operation(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>) -> index {
-   // CHECK: stablehlo.add{{.*}} -> tensor<?xf32>
-   %0 = stablehlo.add %arg0, %arg1 : (tensor<4xf32>, tensor<4xf32>) -> tensor<?xf32>
-@@ -472,11 +467,312 @@
- 
- // -----
- 
--// CHECK-LABEL: func @refine_bitcast_convert_same_bitwidth
--func.func @refine_bitcast_convert_same_bitwidth(%arg0 : tensor<4xf32>) -> tensor<*xi32> {
-+// CHECK-LABEL: func @refine_bitcast_convert_same_bitwidth_unranked_result
-+func.func @refine_bitcast_convert_same_bitwidth_unranked_result(%arg0 : tensor<4xf32>) -> tensor<*xi32> {
-   // CHECK: stablehlo.bitcast_convert{{.*}} -> tensor<4xi32>
-   %0 = stablehlo.bitcast_convert %arg0 : (tensor<4xf32>) -> tensor<*xi32>
-   func.return %0 : tensor<*xi32>
-+}
-+
-+// -----
-+
-+// CHECK-LABEL: func @refine_bitcast_convert_same_bitwidth
-+func.func @refine_bitcast_convert_same_bitwidth() -> tensor<?x?x0xf32> {
-+  %0 = stablehlo.constant dense<[3, 5, 0]> : tensor<3xi32>
-+  %21 = stablehlo.dynamic_iota %0, dim = 0 : (tensor<3xi32>) -> tensor<?x?x0xui32>
-+  // CHECK: stablehlo.bitcast_convert{{.*}} -> tensor<3x5x0xf32>
-+  %48 = stablehlo.bitcast_convert %21 : (tensor<?x?x0xui32>) -> tensor<?x?x0xf32>
-+  return %48 : tensor<?x?x0xf32>
-+}
-+
-+// -----
-+
-+// CHECK-LABEL: module @refine_call
-+module @refine_call {
-+  func.func @main(%arg1: tensor<4xf32>) -> tensor<?xf32> {
-+    %0 = stablehlo.bitcast_convert %arg1 : (tensor<4xf32>) -> tensor<?xf32>
-+    %1 = stablehlo.constant dense<4> : tensor<i32>
-+    // CHECK: refine_call_callee{{.*}}-> tensor<4xf32>
-+    %2 = call @refine_call_callee(%1, %0) : (tensor<i32>, tensor<?xf32>) -> tensor<?xf32>
-+    return %2 : tensor<?xf32>
-+  }
-+  // CHECK: refine_call_callee(%arg0: tensor<4xf32>) -> tensor<4xf32>
-+  func.func @refine_call_callee(%arg0: tensor<i32>, %arg1: tensor<?xf32>) -> tensor<?xf32> {
-+    // CHECK: stablehlo.constant dense<4>
-+    %0 = stablehlo.reshape %arg0 : (tensor<i32>) -> tensor<1xi32>
-+    %1 = stablehlo.dynamic_iota %0, dim = 0 : (tensor<1xi32>) -> tensor<?xf32>
-+    return %1 : tensor<?xf32>
-+  }
-+}
-+
-+// -----
-+
-+// CHECK-LABEL: module @refine_call_dimension_arguments
-+module @refine_call_dimension_arguments {
-+  func.func public @main(%arg0: tensor<i32>) -> tensor<i32> {
-+    // CHECK: [[RESULT:%.*]] = call @callee
-+    // CHECK: return [[RESULT]]
-+    %0 = stablehlo.constant dense<3> : tensor<i32>
-+    %1 = call @callee(%0, %0, %arg0) : (tensor<i32>, tensor<i32>, tensor<i32>) -> tensor<i32>
-+    return %1 : tensor<i32>
-+  }
-+  // %arg0 and %arg1 are dimension arguments
-+  // CHECK: @callee([[ARG0:%.*]]: tensor<i32>) -> tensor<i32>
-+  func.func private @callee(%arg0: tensor<i32>, %arg1: tensor<i32>, %arg2: tensor<i32>) -> tensor<i32> {
-+    // CHECK: [[RESULT0:%.*]] = stablehlo.constant dense<6>
-+    // CHECK: [[RESULT1:%.*]] = stablehlo.add [[RESULT0]], [[ARG0]]
-+    // CHECK: return [[RESULT1]]
-+    %0 = stablehlo.add %arg0, %arg1: tensor<i32>
-+    %1 = stablehlo.add %0, %arg2: tensor<i32>
-+    return %1 : tensor<i32>
-+  }
-+}
-+
-+// -----
-+
-+// CHECK-LABEL: module @refine_call_prefix_token_and_dimension_arguments
-+module @refine_call_prefix_token_and_dimension_arguments {
-+  func.func public @main(%arg0: tensor<i32>) -> tensor<i32> {
-+    // CHECK: [[RESULT:%.*]] = call @callee
-+    // CHECK: return [[RESULT]]
-+    %0 = stablehlo.constant dense<3> : tensor<i32>
-+    %token = stablehlo.create_token : !stablehlo.token
-+    %1 = call @callee(%token, %0, %0, %arg0) : (!stablehlo.token, tensor<i32>, tensor<i32>, tensor<i32>) -> tensor<i32>
-+    return %1 : tensor<i32>
-+  }
-+  // %arg0 and %arg1 are dimension arguments
-+  // CHECK: @callee([[ARG_TOKEN:%.*]]: !stablehlo.token, [[ARG0:%.*]]: tensor<i32>
-+  func.func private @callee(%arg_token: !stablehlo.token, %arg0: tensor<i32>, %arg1: tensor<i32>, %arg2: tensor<i32>) -> tensor<i32> {
-+    // CHECK: [[RESULT0:%.*]] = stablehlo.constant dense<6>
-+    // CHECK: [[RESULT1:%.*]] = stablehlo.add [[RESULT0]], [[ARG0]]
-+    // CHECK: return [[RESULT1]]
-+    %0 = stablehlo.add %arg0, %arg1: tensor<i32>
-+    %1 = stablehlo.add %0, %arg2: tensor<i32>
-+    return %1 : tensor<i32>
-+  }
-+}
-+
-+// -----
-+
-+// CHECK-LABEL: module @refine_call_dimension_arguments_followed_by_token
-+module @refine_call_dimension_arguments_followed_by_token {
-+  func.func public @main(%arg0: tensor<i32>) -> tensor<i32> {
-+    // CHECK: [[RESULT:%.*]] = call @callee
-+    // CHECK: return [[RESULT]]
-+    %0 = stablehlo.constant dense<3> : tensor<i32>
-+    %token = stablehlo.create_token : !stablehlo.token
-+    %1 = call @callee(%0, %0, %token, %arg0) : (tensor<i32>, tensor<i32>, !stablehlo.token, tensor<i32>) -> tensor<i32>
-+    return %1 : tensor<i32>
-+  }
-+  // %arg0 and %arg1 are dimension arguments
-+  // CHECK: @callee([[ARG_TOKEN:%.*]]: !stablehlo.token, [[ARG0:%.*]]: tensor<i32>
-+  func.func private @callee(%arg0: tensor<i32>, %arg1: tensor<i32>, %arg_token: !stablehlo.token, %arg2: tensor<i32>) -> tensor<i32> {
-+    // CHECK: [[RESULT0:%.*]] = stablehlo.constant dense<6>
-+    // CHECK: [[RESULT1:%.*]] = stablehlo.add [[RESULT0]], [[ARG0]]
-+    // CHECK: return [[RESULT1]]
-+    %0 = stablehlo.add %arg0, %arg1: tensor<i32>
-+    %1 = stablehlo.add %0, %arg2: tensor<i32>
-+    return %1 : tensor<i32>
-+  }
-+}
-+
-+// -----
-+
-+// CHECK-LABEL: module @refine_multiple_call_with_same_context
-+module @refine_multiple_call_with_same_context {
-+  func.func @main(%arg1: tensor<4xf32>) -> tensor<?xf32> {
-+    %0 = stablehlo.bitcast_convert %arg1 : (tensor<4xf32>) -> tensor<?xf32>
-+    %arg0_new = "stablehlo.get_dimension_size"(%0) {dimension = 0 : i64} : (tensor<?xf32>) -> tensor<i32>
-+    // CHECK: refine_call_callee{{.*}}-> tensor<4xf32>
-+    %1 = call @refine_call_callee(%arg0_new, %0) : (tensor<i32>, tensor<?xf32>) -> tensor<?xf32>
-+    // CHECK: refine_call_callee{{.*}}-> tensor<4xf32>
-+    %2 = call @refine_call_callee(%arg0_new, %1) : (tensor<i32>, tensor<?xf32>) -> tensor<?xf32>
-+    return %2 : tensor<?xf32>
-+  }
-+  // CHECK: refine_call_callee{{.*}}-> tensor<4xf32>
-+  func.func @refine_call_callee(%arg0: tensor<i32>, %arg1: tensor<?xf32>) -> tensor<?xf32> {
-+    return %arg1 : tensor<?xf32>
-+  }
-+}
-+
-+// -----
-+
-+// CHECK-LABEL: module @refine_multiple_call_constant_function
-+module @refine_multiple_call_constant_function {
-+  func.func @main(%arg0: tensor<5xf32>) -> tensor<i32> {
-+    // CHECK: [[RESULT0:%.*]] = stablehlo.constant dense<16>
-+    // CHECK: return [[RESULT0]]
-+    %0 = stablehlo.constant dense<4> : tensor<i32>
-+    %1 = call @refine_call_callee(%0, %arg0) : (tensor<i32>, tensor<5xf32>) -> tensor<i32>
-+    %2 = call @refine_call_callee(%0, %arg0) : (tensor<i32>, tensor<5xf32>) -> tensor<i32>
-+    %3 = stablehlo.add %1, %2: tensor<i32>
-+    return %3 : tensor<i32>
-+  }
-+  func.func @refine_call_callee(%arg0: tensor<i32>, %arg1: tensor<5xf32>) -> tensor<i32> {
-+    // CHECK: [[RESULT1:%.*]] = stablehlo.constant dense<8>
-+    // CHECK: return [[RESULT1]]
-+    %0 = stablehlo.add %arg0, %arg0: tensor<i32>
-+    return %0 : tensor<i32>
-+  }
-+}
-+
-+// -----
-+
-+module @refine_call_multiple_with_different_number_dimension_arguments {
-+  func.func @main(%arg1: tensor<4xf32>) -> tensor<?xf32> {
-+    %0 = stablehlo.bitcast_convert %arg1 : (tensor<4xf32>) -> tensor<?xf32>
-+    %arg0_new = "stablehlo.get_dimension_size"(%0) {dimension = 0 : i64} : (tensor<?xf32>) -> tensor<i32>
-+    %1 = call @refine_call_callee(%arg0_new, %0) : (tensor<i32>, tensor<?xf32>) -> tensor<?xf32>
-+    // Ensure that the first argument is not a constant at the second call site
-+    %arg0_different_f32 = stablehlo.bitcast_convert %arg0_new : (tensor<i32>) -> tensor<f32>
-+    %arg0_different_i32 = stablehlo.bitcast_convert %arg0_different_f32 : (tensor<f32>) -> tensor<i32>
-+    // expected-error@+1{{incorrect number of operands for callee}}
-+    %2 = call @refine_call_callee(%arg0_different_i32, %1) : (tensor<i32>, tensor<?xf32>) -> tensor<?xf32>
-+    return %2 : tensor<?xf32>
-+  }
-+  // expected-error@+1{{Function refine_call_callee has already been refined with a different refinement context. Previous context had 1 and now we have 2 non-dimension arguments}}
-+  func.func @refine_call_callee(%arg0: tensor<i32>, %arg1: tensor<?xf32>) -> tensor<?xf32> {
-+    return %arg1 : tensor<?xf32>
-+  }
-+}
-+
-+// -----
-+
-+module @refine_call_multiple_different_dimension_arguments {
-+  func.func @main(%arg1: tensor<4xf32>) -> tensor<?xf32> {
-+    %0 = stablehlo.bitcast_convert %arg1 : (tensor<4xf32>) -> tensor<?xf32>
-+    %arg0_new = "stablehlo.get_dimension_size"(%0) {dimension = 0 : i64} : (tensor<?xf32>) -> tensor<i32>
-+    %1 = call @refine_call_callee(%arg0_new, %0) : (tensor<i32>, tensor<?xf32>) -> tensor<?xf32>
-+    %arg0_different = stablehlo.add %arg0_new, %arg0_new : tensor<i32>
-+    // expected-error@+1{{incorrect number of operands for callee}}
-+    %2 = call @refine_call_callee(%arg0_different, %1) : (tensor<i32>, tensor<?xf32>) -> tensor<?xf32>
-+    return %2 : tensor<?xf32>
-+  }
-+  // expected-error@+1{{Function refine_call_callee has already been refined with a different refinement context.}}
-+  func.func @refine_call_callee(%arg0: tensor<i32>, %arg1: tensor<?xf32>) -> tensor<?xf32> {
-+    return %arg1 : tensor<?xf32>
-+  }
-+}
-+
-+// -----
-+
-+module @refine_call_multiple_different_non_dimension_arguments {
-+  func.func @main(%arg1: tensor<4xf32>) -> tensor<?xf32> {
-+    %0 = stablehlo.bitcast_convert %arg1 : (tensor<4xf32>) -> tensor<?xf32>
-+    %arg0_new = "stablehlo.get_dimension_size"(%0) {dimension = 0 : i64} : (tensor<?xf32>) -> tensor<i32>
-+    %1 = call @refine_call_callee(%arg0_new, %0) : (tensor<i32>, tensor<?xf32>) -> tensor<?xf32>
-+    %2 = stablehlo.constant dense<[1., 2.]> : tensor<2xf32>
-+    %3 = stablehlo.concatenate %1, %2, dim = 0 : (tensor<?xf32>, tensor<2xf32>) -> tensor<?xf32>
-+    // expected-error@+1{{incorrect number of operands for callee}}
-+    %4 = call @refine_call_callee(%arg0_new, %3) : (tensor<i32>, tensor<?xf32>) -> tensor<?xf32>
-+    return %4 : tensor<?xf32>
-+  }
-+  // expected-error@+1{{Function refine_call_callee has already been refined with a different refinement context.}}
-+  func.func @refine_call_callee(%arg0: tensor<i32>, %arg1: tensor<?xf32>) -> tensor<?xf32> {
-+    return %arg1 : tensor<?xf32>
-+  }
-+}
-+
-+// -----
-+
-+module @refine_call_recursive {
-+  func.func @main() -> tensor<i32> {
-+    %0 = stablehlo.constant dense<3> : tensor<i32>
-+    %1 = call @refine_call_callee(%0) : (tensor<i32>) -> tensor<i32>
-+    return %1 : tensor<i32>
-+  }
-+  // expected-error@+1{{Function refine_call_callee is being refined recursively}}
-+  func.func @refine_call_callee(%arg0: tensor<i32>) -> tensor<i32> {
-+    // expected-error@+1{{incorrect number of operands}}
-+    %0 = call @refine_call_callee(%arg0) : (tensor<i32>) -> tensor<i32>
-+    return %0 : tensor<i32>
-+  }
-+}
-+
-+// -----
-+
-+module @refine_call_main_argument_unranked {
-+  // expected-error@+1{{main must be refined with static shape arguments}}
-+  func.func public @main(%arg0: tensor<*xi32>) -> tensor<*xi32> {
-+    %2 = call @callee(%arg0) : (tensor<*xi32>) -> tensor<*xi32>
-+    return %2 : tensor<*xi32>
-+  }
-+  func.func private @callee(%arg0: tensor<*xi32>) -> tensor<*xi32> {
-+    return %arg0 : tensor<*xi32>
-+  }
-+}
-+
-+// -----
-+
-+module @refine_call_main_argument_dynamic_shape {
-+  // expected-error@+1{{main must be refined with static shape arguments}}
-+  func.func public @main(%arg0: tensor<?xi32>) -> tensor<?xi32> {
-+    %2 = call @callee(%arg0) : (tensor<?xi32>) -> tensor<?xi32>
-+    return %2 : tensor<?xi32>
-+  }
-+  func.func private @callee(%arg0: tensor<?xi32>) -> tensor<?xi32> {
-+    return %arg0 : tensor<?xi32>
-+  }
-+}
-+
-+// -----
-+
-+module @refine_call_callee_argument_unranked {
-+  func.func public @main(%arg0: tensor<1xi64>) -> tensor<*xi32> {
-+    %1 = stablehlo.dynamic_iota %arg0, dim = 0 : (tensor<1xi64>) -> tensor<*xi32>
-+    %2 = call @callee(%1) : (tensor<*xi32>) -> tensor<*xi32>
-+    return %2 : tensor<*xi32>
-+  }
-+  // expected-error@+1{{callee must be refined with static shape arguments}}
-+  func.func private @callee(%arg0: tensor<*xi32>) -> tensor<*xi32> {
-+    return %arg0 : tensor<*xi32>
-+  }
-+}
-+
-+// -----
-+
-+module @refine_call_callee_argument_dynamic_shape {
-+  func.func public @main(%arg0: tensor<1xi64>) -> tensor<?xi32> {
-+    %1 = stablehlo.dynamic_iota %arg0, dim = 0 : (tensor<1xi64>) -> tensor<?xi32>
-+    %2 = call @callee(%1) : (tensor<?xi32>) -> tensor<?xi32>
-+    return %2 : tensor<?xi32>
-+  }
-+  // expected-error@+1{{callee must be refined with static shape arguments}}
-+  func.func private @callee(%arg0: tensor<?xi32>) -> tensor<?xi32> {
-+    return %arg0 : tensor<?xi32>
-+  }
-+}
-+
-+// -----
-+
-+// CHECK-LABEL: module @refine_call_dimension_argument_non_scalar
-+// The non-scalar constant is not folded into the callee
-+module @refine_call_dimension_argument_non_scalar {
-+  func.func public @main() -> tensor<4xi32> {
-+    // CHECK: dense<[1, 2, 3, 4]> : tensor<4xi32>
-+    %0 = stablehlo.constant dense<[1, 2, 3, 4]> : tensor<4xi32>
-+    %1 = call @callee(%0) : (tensor<4xi32>) -> tensor<4xi32>
-+    return %1 : tensor<4xi32>
-+  }
-+  func.func private @callee(%arg0: tensor<4xi32>) -> tensor<4xi32> {
-+    // CHECK: return %arg0 : tensor<4xi32>
-+    return %arg0 : tensor<4xi32>
-+  }
-+}
-+
-+// -----
-+
-+// CHECK-LABEL: module @refine_call_dimension_argument_not_integer
-+module @refine_call_dimension_argument_not_integer {
-+  func.func public @main() -> tensor<f32> {
-+    %0 = stablehlo.constant dense<3.> : tensor<f32>
-+    // CHECK: call @callee({{.*}}) : (tensor<f32>) -> tensor<f32>
-+    %2 = call @callee(%0) : (tensor<f32>) -> tensor<f32>
-+    return %2 : tensor<f32>
-+  }
-+  func.func private @callee(%arg0: tensor<f32>) -> tensor<f32> {
-+    return %arg0 : tensor<f32>
-+  }
- }
- 
- // -----
-@@ -607,12 +903,55 @@
+@@ -607,12 +607,55 @@
  
  // -----
  
@@ -1977,74 +1631,7 @@ diff --ruN a/stablehlo/stablehlo/transforms/StablehloCanonicalizeDynamism.cpp b/
 diff --ruN a/stablehlo/stablehlo/transforms/StablehloRefineShapes.cpp b/stablehlo/stablehlo/transforms/StablehloRefineShapes.cpp
 --- stablehlo/stablehlo/transforms/StablehloRefineShapes.cpp
 +++ stablehlo/stablehlo/transforms/StablehloRefineShapes.cpp
-@@ -11,9 +11,48 @@
- See the License for the specific language governing permissions and
- limitations under the License.
- ==============================================================================*/
--
-+/*
-+This shape refinement pass was designed to resolve the dynamic shapes in
-+a StableHLO module produced by JAX serialization with shape polymorphism.
-+Such a module has the following properties:
-+
-+  * it contains a "main" function with statically-shaped arguments;
-+    the result types may be dynamically shaped.
-+  * all the dynamic shapes depend only on the input shapes (no shape
-+    dependency on the input array contents). We refer to the operations that
-+    depend transitively only on the input shapes (e.g., as given by
-+    `stablehlo.get_dimension_size`) as `dimension` operations.
-+    All dimension values can be resolved to constants through inter-procedural
-+    constant folding.
-+  * intermediate functions may take a number of token arguments (of type
-+    !stablehlo.token) at the start of the argument list, followed by some
-+    dimension arguments (integer scalars).
-+  * some intermediate functions may return dimension values.
-+    E.g., the `floordiv` operation on dimension values may be implemented
-+    using intermediate functions. These constant functions need to be
-+    constant-folded.
-+  * All the dynamic shapes can be resolved through shape inference from the
-+    dimension values. The dimension values themselves do not depend on the
-+    result of shape inference.
-+
-+
-+For each intermediate function we compute a refinement context, including
-+the values of the dimension arguments and the static shapes of the other
-+arguments. We compute the refinement context when we encounter a function call,
-+and then we refine the callee recursively. We abort in the presence of
-+recursive calls.
-+We also abort if a function is called with multiple distinct refinement
-+contexts.
-+
-+After refinement, all operations should have static shapes, all calls to
-+constant functions are replaced with constants, and all dimension arguments
-+for intermediate functions are dropped and are replaced with constants.
-+*/
-+#include <algorithm>
- #include <cstdint>
- #include <memory>
-+#include <optional>
-+#include <set>
- #include <string>
- #include <utility>
- 
-@@ -24,8 +63,10 @@
- #include "llvm/ADT/SmallSet.h"
- #include "llvm/ADT/SmallVector.h"
- #include "llvm/ADT/StringRef.h"
-+#include "llvm/Support/Debug.h"
- #include "llvm/Support/ErrorHandling.h"
- #include "llvm/Support/FormatVariadic.h"
-+#include "llvm/Support/ScopedPrinter.h"
- #include "mlir/Dialect/Func/IR/FuncOps.h"
- #include "mlir/IR/BuiltinAttributes.h"
- #include "mlir/IR/BuiltinOps.h"
-@@ -39,10 +80,13 @@
- #include "mlir/IR/Types.h"
- #include "mlir/IR/Value.h"
- #include "mlir/Interfaces/InferTypeOpInterface.h"
-+#include "mlir/Support/DebugStringHelper.h"
- #include "mlir/Support/LogicalResult.h"
-+#include "mlir/Support/LLVM.h"
+@@ -43,6 +43,7 @@
  #include "mlir/Transforms/GreedyPatternRewriteDriver.h"
  #include "stablehlo/dialect/Base.h"
  #include "stablehlo/dialect/ChloOps.h"
@@ -2052,407 +1639,7 @@ diff --ruN a/stablehlo/stablehlo/transforms/StablehloRefineShapes.cpp b/stablehl
  #include "stablehlo/dialect/StablehloOps.h"
  #include "stablehlo/dialect/TypeInference.h"
  #include "stablehlo/transforms/Passes.h"
-@@ -50,10 +94,144 @@
- namespace mlir {
- namespace stablehlo {
- 
-+#define DEBUG_TYPE "stablehlo-refine-shapes"
-+
- #define GEN_PASS_DEF_STABLEHLOREFINESHAPESPASS
- #include "stablehlo/transforms/Passes.h.inc"
- 
- namespace {
-+
-+// Per-module state for shape refinement.
-+class RefineShapeState {
-+ public:
-+  // Validates that we are not attempting to refine a function with a different
-+  // context than previously, and are not attempting recursive refinement.
-+  // Returns failure() if validation fails. On success, returns a boolean
-+  // that specifies whether the function has already been refined.
-+  FailureOr<bool> validateFunctionRefinement(
-+      func::FuncOp func, SmallVector<APSInt> dimensionArguments,
-+      SmallVector<Type> nonDimensionArgumentTypes) {
-+    StringRef funcName = func.getName();
-+    auto found = refinementContexts.find(func);
-+    if (found == refinementContexts.end()) {
-+      return false;  // not already refined.
-+    }
-+    auto prevDimensionArguments = std::get<0>(found->second);
-+    auto prevNonDimensionArgumentTypes = std::get<1>(found->second);
-+    // Since we refine until fixed point, we will refine a call to a function
-+    // both for the original function and for the refined one. In the latter
-+    // case, we should have empty dimensionArguments but the same
-+    // nonDimensionArgumentTypes.
-+    if (prevNonDimensionArgumentTypes != nonDimensionArgumentTypes ||
-+        (!dimensionArguments.empty() &&
-+         prevDimensionArguments != dimensionArguments)) {
-+      emitDifferentRefinementContextError(
-+          func, /*dimensionArguments=*/dimensionArguments,
-+          /*nonDimensionArgumentTypes=*/nonDimensionArgumentTypes,
-+          /*prevDimensionArguments=*/prevDimensionArguments,
-+          /*prevNonDimensionArgumentShapes=*/prevNonDimensionArgumentTypes);
-+      return failure();
-+    }
-+    for (auto funcOnStack : functionsBeingRefined) {
-+      if (funcOnStack == funcName) {
-+        func.emitOpError() << "Function " << funcName
-+                           << " is being refined recursively\n";
-+        return failure();
-+      }
-+    }
-+    return true;  // already refined.
-+  }
-+
-+  // Updates the state to signal the starting of a function refinement.
-+  // Callers must call `finishFunctionRefinement` when done.
-+  void startFunctionRefinement(func::FuncOp func,
-+                               SmallVector<APSInt> dimensionArguments,
-+                               SmallVector<Type> nonDimensionArgumentTypes) {
-+    StringRef funcName = func.getName();
-+    functionsBeingRefined.push_back(funcName);
-+    refinementContexts[func] =
-+        std::make_tuple(dimensionArguments, nonDimensionArgumentTypes);
-+  }
-+
-+  // Updates the state to signal the starting of a function refinement.
-+  LogicalResult finishFunctionRefinement(func::FuncOp func) {
-+    if (func.getName() !=
-+        functionsBeingRefined[functionsBeingRefined.size() - 1]) {
-+      func.emitOpError() << "Expected to find " << func.getName()
-+                         << " at the top of the stack";
-+      return failure();
-+    }
-+    functionsBeingRefined.pop_back();
-+    return success();
-+  }
-+
-+ private:
-+  // Maps refined functions to the refinement context: the values of dimension
-+  // arguments and the types of non-dimension arguments. A function is added
-+  // here when we start refining it.
-+  DenseMap<func::FuncOp, std::tuple<SmallVector<APSInt>, SmallVector<Type>>>
-+      refinementContexts;
-+
-+  // A stack of functions that are in the process of being refined, the current
-+  // one is last.
-+  SmallVector<llvm::StringRef> functionsBeingRefined;
-+
-+  void emitDifferentRefinementContextError(
-+      func::FuncOp func, SmallVector<APSInt> dimensionArguments,
-+      SmallVector<Type> nonDimensionArgumentTypes,
-+      SmallVector<APSInt> prevDimensionArguments,
-+      SmallVector<Type> prevNonDimensionArgumentShapes) {
-+    InFlightDiagnostic msg = func.emitOpError();
-+    msg << "Function " << func.getName()
-+        << " has already been refined with a different "
-+           "refinement context. ";
-+    int countShowNonDimensionArguments =
-+        std::min(prevNonDimensionArgumentShapes.size(),
-+                 nonDimensionArgumentTypes.size());
-+    if (prevNonDimensionArgumentShapes.size() !=
-+        nonDimensionArgumentTypes.size()) {
-+      msg << "Previous context had " << prevNonDimensionArgumentShapes.size()
-+          << " and now we have " << nonDimensionArgumentTypes.size()
-+          << " non-dimension arguments. ";
-+    }
-+    msg << "The differences among the first " << countShowNonDimensionArguments
-+        << " non-dimension argument types are: ";
-+    for (auto i = 0; i < countShowNonDimensionArguments; ++i) {
-+      if (prevNonDimensionArgumentShapes[i] != nonDimensionArgumentTypes[i]) {
-+        msg << "Non-dimension argument[" << i << "] previously had type "
-+            << debugString(prevNonDimensionArgumentShapes[i])
-+            << " and now has type " << debugString(nonDimensionArgumentTypes[i])
-+            << ". ";
-+      }
-+    }
-+    int countShowDimensionArguments =
-+        std::min(prevDimensionArguments.size(), dimensionArguments.size());
-+    if (prevDimensionArguments.size() != dimensionArguments.size()) {
-+      msg << "Previous context had " << prevDimensionArguments.size()
-+          << " and now we have " << dimensionArguments.size()
-+          << " dimension arguments. ";
-+    }
-+    msg << "The differences among the first " << countShowDimensionArguments
-+        << " dimension arguments are: ";
-+    for (auto i = 0; i < countShowDimensionArguments; ++i) {
-+      if (prevDimensionArguments[i] != dimensionArguments[i]) {
-+        msg << "Dimension argument[" << i << "] previously was "
-+            << prevDimensionArguments[i].getSExtValue() << " and now is "
-+            << dimensionArguments[i].getSExtValue() << ". ";
-+      }
-+    }
-+  }
-+};
-+
-+// Refines a function.
-+// Returns `true` if the function had already been processed with the same
-+// refinement context and `false` if this is the first time we refined the
-+// function. Returns failure() if we encounter an error.
-+LogicalResult refineFunction(func::FuncOp func, MLIRContext* context,
-+                             RefineShapeState* state,
-+                             size_t nrPrefixTokenArguments,
-+                             SmallVector<APSInt> dimensionArguments,
-+                             SmallVector<Type> nonDimensionArgumentTypes);
- 
- // DenseElementsAttr can be constructed from ArrayRef<APInt> but not from
- // ArrayRef<APSInt>. This helper bridges the gap.
-@@ -424,11 +602,10 @@
-       diag << "refineValues failed for " << types << ": expected "
-            << values.size() << " types, got " << types.size();
-     });
--
--  // Check whether `types` contain any new information with respect to existing
--  // return types. Even if just a single dimension size out of an entire tensor
--  // type got updated, using `inferMostSpecificType` ensures that we don't
--  // miss that.
-+  // Check whether `types` contain any new information with respect to
-+  // existing return types. Even if just a single dimension size out of an
-+  // entire tensor type got updated, using `inferMostSpecificType` ensures
-+  // that we don't miss that.
-   bool needsRefinement = false;
-   SmallVector<Type> refinedTypes;
-   for (auto it : llvm::zip(values.getTypes(), types)) {
-@@ -468,11 +645,13 @@
- 
-       // Simply changing operand type of `func.return` won't work because
-       // that won't update the FunctionType of the enclosing `func.func`.
--      // Nonetheless, we still want to support these ops because they are widely
--      // used in StableHLO programs (although the plan of record is to replace
--      // `func.return` ops in StableHLO programs with `stablehlo.return`:
--      // https://github.com/openxla/stablehlo/issues/425).
-+      // Nonetheless, we still want to support these ops because they are
-+      // widely used in StableHLO programs (although the plan of record is to
-+      // replace `func.return` ops in StableHLO programs with
-+      // `stablehlo.return`: https://github.com/openxla/stablehlo/issues/425).
-       if (isa<func::ReturnOp>(user)) continue;
-+
-+      if (isa<func::CallOp>(user)) continue;
- 
-       // Unlike in TensorFlow's type inference pass, here we work only with
-       // allowlisted ops to focus our support on well-defined semantics of
-@@ -489,7 +668,8 @@
-     value.setType(refinedType);
- 
-     // Special case: for `func.return`, guard the refinement with a cast
--    // and leave propagation of the refined return type to a dedicated pattern.
-+    // and leave propagation of the refined return type to a dedicated
-+    // pattern.
-     auto isFuncReturn = [](OpOperand& use) -> bool {
-       return isa<func::ReturnOp>(use.getOwner());
-     };
-@@ -505,8 +685,8 @@
- 
- // Refines the return types of the given operation using the given types.
- // This function also signals PatternRewriter that it needs to visit all the
--// users of this op if any updates to its results have happened during execution
--// of the function.
-+// users of this op if any updates to its results have happened during
-+// execution of the function.
- LogicalResult refineReturnTypes(PatternRewriter& rewriter, Operation* op,
-                                 ArrayRef<Type> types) {
-   if (failed(refineValues(rewriter, op, op->getResults(), types)))
-@@ -528,12 +708,12 @@
- //      traversal, and only then we apply the refinements. If there are other
- //      types, then the corresponding refinements must be completely empty.
- //   2) Encodings are not supported. In principle, TypeExtensions should be
--//      supportable, but this needs careful thinking through. Given that no one
--//      asked for support for bounded dynamism in this pass yet, this is left
--//      for future work.
-+//      supportable, but this needs careful thinking through. Given that no
-+//      one asked for support for bounded dynamism in this pass yet, this is
-+//      left for future work.
- // This function also signals PatternRewriter that it needs to visit all the
--// users of this op if any updates to its results have happened during execution
--// of the function.
-+// users of this op if any updates to its results have happened during
-+// execution of the function.
- LogicalResult refineReturnTypes(PatternRewriter& rewriter, Operation* op,
-                                 ArrayRef<ShapedTypeComponents> refinements) {
-   SmallVector<Type> flattenedTypes;
-@@ -623,8 +803,8 @@
- 
- // Refines the return type of the given operation using the given shape.
- // This function also signals PatternRewriter that it needs to visit all the
--// users of this op if any updates to its results have happened during execution
--// of the function.
-+// users of this op if any updates to its results have happened during
-+// execution of the function.
- template <typename OpType>
- LogicalResult refineReturnShape(PatternRewriter& rewriter, OpType op,
-                                 ArrayRef<int64_t> shape) {
-@@ -633,8 +813,8 @@
- 
- // Refines the return type of the given operation using the given shape.
- // This function also signals PatternRewriter that it needs to visit all the
--// users of this op if any updates to its results have happened during execution
--// of the function.
-+// users of this op if any updates to its results have happened during
-+// execution of the function.
- template <typename OpType>
- LogicalResult refineReturnShape(PatternRewriter& rewriter, OpType op,
-                                 Value shapeValue) {
-@@ -647,6 +827,52 @@
-   return refineReturnShape(rewriter, op, shape);
- }
- 
-+// Dimension arguments are leading scalar constant arguments, optionally
-+// preceeded by some stablehlo.token arguments.
-+SmallVector<APSInt> getDimensionArguments(func::CallOp callOp,
-+                                          size_t* nrPrefixTokenArguments) {
-+  *nrPrefixTokenArguments = 0;
-+  SmallVector<Value> operands = callOp.getOperands();
-+  SmallVector<APSInt> dimensionArguments;
-+  for (size_t i = 0; i < operands.size(); ++i) {
-+    if (i == *nrPrefixTokenArguments && isa<TokenType>(operands[i].getType())) {
-+      (*nrPrefixTokenArguments)++;
-+      continue;
-+    }
-+    RankedTensorType operandType =
-+        dyn_cast<RankedTensorType>(operands[i].getType());
-+    if (!operandType || operandType.getRank() != 0 ||
-+        !operandType.getElementType().template isa<IntegerType>())
-+      break;
-+    SmallVector<APSInt> operand_int;
-+    if (failed(hlo::matchInts(operands[i], operand_int))) {
-+      break;
-+    }
-+    dimensionArguments.push_back(operand_int[0]);
-+  }
-+  return dimensionArguments;
-+}
-+
-+std::optional<SmallVector<DenseIntElementsAttr>> isConstantFunction(
-+    func::FuncOp func) {
-+  LLVM_DEBUG(llvm::dbgs() << "check if " << func.getName()
-+                          << " is a constant function\n");
-+  SmallVector<DenseIntElementsAttr> returnedConstants;
-+  func::ReturnOp ret = *func.getOps<func::ReturnOp>().begin();
-+  bool isConstant = llvm::all_of(ret->getOperands(), [&](auto returnVal) {
-+    DenseIntElementsAttr attr;
-+    Operation* return_operand_def = returnVal.getDefiningOp();
-+    if (return_operand_def &&
-+        matchPattern(return_operand_def, m_Constant(&attr))) {
-+      returnedConstants.push_back(attr);
-+      return true;
-+    }
-+    return false;
-+  });
-+  if (isConstant) return returnedConstants;
-+  return std::nullopt;
-+}
-+
- struct RefineAllGatherOpPattern : public OpRewritePattern<AllGatherOp> {
-   using OpRewritePattern::OpRewritePattern;
-   LogicalResult matchAndRewrite(AllGatherOp op,
-@@ -655,9 +881,9 @@
-     if (!operandType.hasRank())
-       return rewriter.notifyMatchFailure(op, "expected ranked operand type");
- 
--    // This represents the cross_replica_and_partition process grouping strategy
--    // that requires num_partitions to compute shardCount. Since we don't know
--    // num_partitions at this point, we error out.
-+    // This represents the cross_replica_and_partition process grouping
-+    // strategy that requires num_partitions to compute shardCount. Since we
-+    // don't know num_partitions at this point, we error out.
-     if (op.getChannelHandle() && !op.getUseGlobalDeviceIds())
-       return rewriter.notifyMatchFailure(op, "unsupported strategy");
-     DenseIntElementsAttr replicaGroups = op.getReplicaGroups();
-@@ -678,12 +904,11 @@
-     auto operandType = op.getOperand().getType();
-     if (!operandType.hasRank())
-       return rewriter.notifyMatchFailure(op, "expected ranked operand type");
--
-+    auto resultType = op.getType();
-     // If bit widths of the operand and the result are different, then
-     // operand and result shapes have different ranks.
-     // This complicates the logic quite a bit and is not needed to pass the
-     // current tests, so we leave this for future work.
--    auto resultType = op.getType();
-     auto getBitWidthFn = [](ShapedType type) {
-       auto elementType = type.getElementType();
-       if (auto complexType = elementType.dyn_cast<ComplexType>())
-@@ -694,8 +919,77 @@
-     if (getBitWidthFn(operandType) != getBitWidthFn(resultType))
-       return rewriter.notifyMatchFailure(op, "unsupported bit width");
- 
--    return refineReturnShape(rewriter, op, operandType.getShape());
--  }
-+    auto res = refineReturnShape(rewriter, op, operandType.getShape());
-+    if (failed(res)) return failure();
-+    if (op.getOperand().getType() == op.getResult().getType()) {
-+      LLVM_DEBUG({ llvm::dbgs() << "    ** remove no-op bitcast convert\n"; });
-+      rewriter.replaceOp(op, op.getOperand());
-+    }
-+    return success();
-+  }
-+};
-+
-+struct RefineCallOpPattern : public OpRewritePattern<func::CallOp> {
-+  using OpRewritePattern::OpRewritePattern;
-+
-+  RefineCallOpPattern(MLIRContext* context, RefineShapeState* state)
-+      : OpRewritePattern<func::CallOp>(context), _state(state) {}
-+
-+  LogicalResult matchAndRewrite(func::CallOp op,
-+                                PatternRewriter& rewriter) const override {
-+    LLVM_DEBUG({ llvm::dbgs() << "refineCallOp " << debugString(op) << "\n"; });
-+
-+    // We have a number of prefix token arguments, then the dimension arguments
-+    size_t nrPrefixTokenArguments = 0;
-+    SmallVector<APSInt> dimensionArguments =
-+        getDimensionArguments(op, &nrPrefixTokenArguments);
-+    SmallVector<Type> nonDimensionArgumentTypes;
-+    SmallVector<Value> nonDimensionArguments;
-+    SmallVector<Value> operands = op.getOperands();
-+    for (size_t i = 0; i < operands.size(); ++i) {
-+      // Skip the dimension arguments.
-+      if (i >= nrPrefixTokenArguments &&
-+          i < nrPrefixTokenArguments + dimensionArguments.size()) {
-+        continue;
-+      }
-+      nonDimensionArgumentTypes.push_back(operands[i].getType());
-+      nonDimensionArguments.push_back(operands[i]);
-+    }
-+    FlatSymbolRefAttr calleeName = op.getCalleeAttr();
-+    const SymbolTable symbolTable(op->getParentOfType<ModuleOp>());
-+    func::FuncOp callee = dyn_cast<func::FuncOp>(
-+        symbolTable.lookupNearestSymbolFrom(op, calleeName.getAttr()));
-+    if (!callee)
-+      return rewriter.notifyMatchFailure(
-+          op, "cannot find callee in the current scope");
-+    if (failed(refineFunction(callee, rewriter.getContext(), _state,
-+                              nrPrefixTokenArguments, dimensionArguments,
-+                              nonDimensionArgumentTypes)))
-+      return failure();
-+
-+    // Is the callee a constant function in this refinement context?
-+    std::optional<SmallVector<DenseIntElementsAttr>> constantAttrs =
-+        isConstantFunction(callee);
-+    if (constantAttrs.has_value()) {
-+      SmallVector<Value> constants;
-+      for (auto constAttr : constantAttrs.value()) {
-+        constants.push_back(
-+            rewriter.create<ConstantOp>(op.getLoc(), constAttr));
-+      }
-+      rewriter.replaceOp(op, constants);
-+      return success();
-+    }
-+    if (!dimensionArguments.empty()) {
-+      // Drop the dimension arguments, but only if necessary, or else we
-+      // will end up trying to refine the new CallOp forever.
-+      op = rewriter.replaceOpWithNewOp<func::CallOp>(
-+          op, op.getResultTypes(), callee.getSymName(), nonDimensionArguments);
-+    }
-+    return refineReturnTypes(rewriter, op, callee.getResultTypes());
-+  }
-+
-+ private:
-+  RefineShapeState* _state;
- };
- 
- struct RefineConvertOpPattern : public OpRewritePattern<ConvertOp> {
-@@ -844,12 +1138,98 @@
+@@ -844,12 +845,97 @@
    }
  };
  
@@ -2525,9 +1712,8 @@ diff --ruN a/stablehlo/stablehlo/transforms/StablehloRefineShapes.cpp b/stablehl
 +      return rewriter.notifyMatchFailure(op, "expected constant output_shape");
 +
 +    // We only need to refine the shape of `output` (the second result).
-+    // The shape of `output_state` (the first result) is determined by the
-+    // shape of `initial_state`, so we ignore it and provide an empty
-+    // refinement.
++    // The shape of `output_state` (the first result) is determined by the shape
++    // of `initial_state`, so we ignore it and provide an empty refinement.
 +    return refineReturnTypes(rewriter, op, {{initialStateType}, {outputShape}});
 +  }
 +};
@@ -2551,349 +1737,15 @@ diff --ruN a/stablehlo/stablehlo/transforms/StablehloRefineShapes.cpp b/stablehl
    }
  };
  
-@@ -865,11 +1245,11 @@
-     if (!isa<chlo::ChloDialect, StablehloDialect>(op->getDialect()))
-       return rewriter.notifyMatchFailure(op, "unsupported dialect");
- 
--    // For the ops that implement InferTypeOpInterface, we reinfer their return
--    // types and see what happens.
--    // Operands of these ops might have been refined elsewhere (e.g. someone
--    // might have updated argument types of a function) or earlier during this
--    // pass, and this might enable refinement opportunities downstream.
-+    // For the ops that implement InferTypeOpInterface, we reinfer their
-+    // return types and see what happens. Operands of these ops might have
-+    // been refined elsewhere (e.g. someone might have updated argument types
-+    // of a function) or earlier during this pass, and this might enable
-+    // refinement opportunities downstream.
-     SmallVector<Type> inferredReturnTypes;
-     if (failed(op.inferReturnTypes(getContext(), /*location=*/{},
-                                    op->getOperands(), op->getAttrDictionary(),
-@@ -925,8 +1305,8 @@
-           sliceSizesAttr.size(),
-           RankedTensorType::get({}, startIndicesElementType));
- 
--      // RealDynamicSliceOp can take tensors of integer or index element types.
--      // DynamicSliceOp::slice_sizes only supports i64 element type.
-+      // RealDynamicSliceOp can take tensors of integer or index element
-+      // types. DynamicSliceOp::slice_sizes only supports i64 element type.
-       // Adapt accordingly in order to be compatible with inferDynamicSliceOp.
-       SmallVector<int64_t> sliceSizes;
-       for (auto element : sliceSizesAttr.getValues<APInt>()) {
-@@ -956,9 +1336,9 @@
-     if (!operandType.hasRank())
-       return rewriter.notifyMatchFailure(op, "expected ranked operand type");
- 
--    // This represents the cross_replica_and_partition process grouping strategy
--    // that requires num_partitions to compute shardCount. Since we don't know
--    // num_partitions at this point, we error out.
-+    // This represents the cross_replica_and_partition process grouping
-+    // strategy that requires num_partitions to compute shardCount. Since we
-+    // don't know num_partitions at this point, we error out.
-     if (op.getChannelHandle() && !op.getUseGlobalDeviceIds())
-       return rewriter.notifyMatchFailure(op, "unsupported strategy");
-     DenseIntElementsAttr replicaGroups = op.getReplicaGroups();
-@@ -998,9 +1378,9 @@
-                                 PatternRewriter& rewriter) const override {
-     // Push the potentially refined operand types into the nested regions.
-     // This can lead to refinements of the return types of the body (but not
--    // of the cond since it always returns tensor<i1>), but the key insight here
--    // is that the enclosing while op doesn't care about these refinements
--    // (because its return types are equal to its operand types).
-+    // of the cond since it always returns tensor<i1>), but the key insight
-+    // here is that the enclosing while op doesn't care about these
-+    // refinements (because its return types are equal to its operand types).
-     // If we end up with incompatibilities between while's return types and
-     // body's return types, the verifier will tell us about that. This means
-     // that the original program wasn't well-formed. TODO(burmako): Implement
-@@ -1050,8 +1430,8 @@
-       if (failed(mostSpecificType) || destType == *mostSpecificType) continue;
- 
-       // If the source type of the cast is more specific than the target type,
--      // then we conclude that the cast is redundant (i.e. needs to be removed)
--      // and that the return type of the function needs an update.
-+      // then we conclude that the cast is redundant (i.e. needs to be
-+      // removed) and that the return type of the function needs an update.
-       needsUpdate = true;
-       updatedResultTypes[i] = sourceType;
- 
-@@ -1066,9 +1446,6 @@
-     for (auto cast : castsToReplace)
-       rewriter.replaceOp(cast, cast->getOperands());
- 
--    // If the type of the enclosing `func.func` needs an update, we simply
--    // call setType. We can afford this simplicity because our algorithm
--    // currently supports only one function per module.
-     auto func = cast<func::FuncOp>(op->getParentOp());
-     func.setType(
-         rewriter.getFunctionType(func.getArgumentTypes(), updatedResultTypes));
-@@ -1100,22 +1477,186 @@
-   }
- };
- 
-+LogicalResult applyRewritePatterns(func::FuncOp func, MLIRContext* context,
-+                                   RefineShapeState* state) {
-+  // TODO(#1048): Find out why .maxIterations = 1 no longer works.
-+  // There have been recent refactors to applyPatternsAndFoldGreedily
-+  // upstream, and that might be the reason.
-+  GreedyRewriteConfig config;
-+  config.useTopDownTraversal = true;
-+  config.enableRegionSimplification = true;
-+  config.maxIterations = 2;
-+  config.maxNumRewrites = GreedyRewriteConfig::kNoLimit;
-+  config.strictMode = GreedyRewriteStrictness::AnyOp;
-+
-+  RewritePatternSet patterns(context);
-+  patterns.add<EvalAddOpPattern>(context);
-+  patterns.add<EvalAndOpPattern>(context);
-+  patterns.add<EvalBroadcastInDimOpPattern>(context);
-+  patterns.add<EvalClampOpPattern>(context);
-+  patterns.add<EvalCompareOpPattern>(context);
-+  patterns.add<EvalConcatenateOpPattern>(context);
-+  patterns.add<EvalConvertOpPattern>(context);
-+  patterns.add<EvalDivOpPattern>(context);
-+  patterns.add<EvalGetDimensionSizeOpPattern>(context);
-+  patterns.add<EvalMaxOpPattern>(context);
-+  patterns.add<EvalMinOpPattern>(context);
-+  patterns.add<EvalMulOpPattern>(context);
-+  patterns.add<EvalRemOpPattern>(context);
-+  patterns.add<EvalReshapeOpPattern>(context);
-+  patterns.add<EvalSelectOpPattern>(context);
-+  patterns.add<EvalSignOpPattern>(context);
-+  patterns.add<EvalSliceOpPattern>(context);
-+  patterns.add<EvalSubtractOpPattern>(context);
-+  patterns.add<RefineAllGatherOpPattern>(context);
-+  patterns.add<RefineBitcastConvertOpPattern>(context);
-+  patterns.add<RefineCallOpPattern>(context, state);
-+  patterns.add<RefineConvertOpPattern>(context);
-+  patterns.add<RefineConvolutionOpPattern>(context);
-+  patterns.add<RefineCustomCallOpPattern>(context);
-+  patterns.add<RefineDotGeneralOpPattern>(context);
-+  patterns.add<RefineDynamicBroadcastInDimOpPattern>(context);
-+  patterns.add<RefineDynamicConvOpPattern>(context);
-+  patterns.add<RefineDynamicIotaOpPattern>(context);
-+  patterns.add<RefineDynamicPadOpPattern>(context);
-+  patterns.add<RefineDynamicReduceWindowOpPattern>(context);
-+  patterns.add<RefineDynamicReshapeOpPattern>(context);
-+  patterns.add<RefineDynamicRngBitGeneratorOpPattern>(context);
-+  patterns.add<RefineDynamicTopKOpPattern>(context);
-+  patterns.add<RefineInferTypeOpInterfacePattern>(context);
-+  patterns.add<RefineRealDynamicSliceOpPattern>(context);
-+  patterns.add<RefineReduceScatterOpPattern>(context);
-+  patterns.add<RefineRngOpPattern>(context);
-+  patterns.add<RefineUniformQuantizeOpPattern>(context);
-+  patterns.add<RefineWhileOpPattern>(context);
-+  patterns.add<UpdateFunctionTypePattern>(context);
-+  patterns.add<UpdateRegionTypePattern>(context);
-+  if (failed(applyPatternsAndFoldGreedily(func, std::move(patterns), config))) {
-+    func.emitOpError() << "applyPatternsAndFoldGreedily failed";
-+    return failure();
-+  }
-+  return success();
-+}
-+
-+LogicalResult refineFunction(func::FuncOp func, MLIRContext* context,
-+                             RefineShapeState* state,
-+                             size_t nrPrefixTokenArguments,
-+                             SmallVector<APSInt> dimensionArguments,
-+                             SmallVector<Type> nonDimensionArgumentTypes) {
-+  // The nonDimensionArgumentTypes include the prefix token arguments.
-+  LLVM_DEBUG({
-+    llvm::dbgs() << "refineFunction " << func.getName() << ": initial type "
-+                 << debugString(func.getFunctionType()) << "\n";
-+    llvm::dbgs() << "   has " << nrPrefixTokenArguments << " prefix tokens\n";
-+    for (size_t i = 0; i < dimensionArguments.size(); ++i) {
-+      llvm::dbgs() << "   with dimension arg[" << i
-+                   << "] = " << dimensionArguments[i] << "\n";
-+    }
-+  });
-+  // Check that the argument types have static shapes.
-+  for (size_t i = 0; i < nonDimensionArgumentTypes.size(); ++i) {
-+    if (i < nrPrefixTokenArguments) continue;
-+    auto argType = nonDimensionArgumentTypes[i];
-+    if (isa<TokenType>(argType)) continue;
-+    auto argRankedTensorType = dyn_cast<RankedTensorType>(argType);
-+    if (!argRankedTensorType || !argRankedTensorType.hasStaticShape()) {
-+      func.emitOpError() << func.getName()
-+                         << " must be refined with static shape arguments. "
-+                         << "Found argument of type " << debugString(argType);
-+      return failure();
-+    }
-+  }
-+  auto alreadyRefined = state->validateFunctionRefinement(
-+      func, dimensionArguments, nonDimensionArgumentTypes);
-+  if (failed(alreadyRefined)) {
-+    return failure();
-+  }
-+  if (*alreadyRefined) {
-+    LLVM_DEBUG({
-+      llvm::dbgs() << "refineFunction " << func.getName()
-+                   << ": skipping, already refined\n";
-+    });
-+    return success();
-+  }
-+  state->startFunctionRefinement(func, dimensionArguments,
-+                                 nonDimensionArgumentTypes);
-+  // Only one block per function is supported at the moment.
-+  // At the StableHLO level, functions are expected to only have one block,
-+  // so supporting more is out of scope for this pass.
-+  if (!func.getRegion().hasOneBlock()) {
-+    func.emitOpError() << "must have exactly one block";
-+    return failure();
-+  }
-+
-+  // Replace all dimension arguments with constants and remove those arguments.
-+  // Wrap non-dimension arguments with bitcast_convert.
-+  OpBuilder op_builder(func.getRegion());
-+  op_builder.setInsertionPointToStart(&func.getRegion().front());
-+  size_t firstNonDimensionArg =
-+      nrPrefixTokenArguments + dimensionArguments.size();
-+  for (size_t i = 0; i < func.getNumArguments(); ++i) {
-+    BlockArgument arg = func.getArgument(i);
-+    Type argType = arg.getType();
-+    if (i < nrPrefixTokenArguments) {
-+      continue;
-+    }
-+    if (i < firstNonDimensionArg) {
-+      ShapedType argShapedType = dyn_cast<ShapedType>(argType);
-+      if (!argShapedType) {
-+        func.emitOpError() << "dimension arguments must have shaped types";
-+        return failure();
-+      }
-+      // We will drop the dimension arguments, replace them with constants.
-+      auto replacement_op = op_builder.create<stablehlo::ConstantOp>(
-+          arg.getLoc(), argType,
-+          getTensorAttr(argShapedType,
-+                        dimensionArguments[i - nrPrefixTokenArguments]));
-+      arg.replaceAllUsesWith(replacement_op);
-+    } else {
-+      int nonDimensionArgumentIndex =
-+          nrPrefixTokenArguments + i - firstNonDimensionArg;
-+      Type refinedType = nonDimensionArgumentTypes[nonDimensionArgumentIndex];
-+      if (refinedType != argType) {
-+        // We add BitcastConvertOp as the only uses of the non-dimension
-+        // arguments to ensure the module stays valid after we set the argument
-+        // type.
-+        auto replacement_op = op_builder.create<stablehlo::BitcastConvertOp>(
-+            arg.getLoc(), argType, arg);
-+        arg.replaceAllUsesExcept(replacement_op->getResult(0), replacement_op);
-+        arg.setType(refinedType);
-+      }
-+    }
-+  }
-+  BitVector argIndices(func.getNumArguments());
-+  argIndices.set(nrPrefixTokenArguments, firstNonDimensionArg);
-+  func.eraseArguments(argIndices);
-+  func.setType(op_builder.getFunctionType(nonDimensionArgumentTypes,
-+                                          func.getResultTypes()));
-+  LLVM_DEBUG({
-+    llvm::dbgs() << "refineFunction " << func.getName() << ": set type to "
-+                 << func.getFunctionType() << "\n";
-+  });
-+  if (failed(applyRewritePatterns(func, context, state))) return failure();
-+  LLVM_DEBUG({
-+    llvm::dbgs() << "refineFunction " << func.getName() << ": end with type "
-+                 << debugString(func.getFunctionType()) << "\n";
-+  });
-+  if (failed(state->finishFunctionRefinement(func))) return failure();
-+  return success();
-+}
-+
- struct StablehloRefineShapesPass
-     : public impl::StablehloRefineShapesPassBase<StablehloRefineShapesPass> {
-   using StablehloRefineShapesPassBase::StablehloRefineShapesPassBase;
- 
-   void runOnOperation() override {
--    // Only one function per module is supported at the moment to avoid the need
--    // to think about iterative type inference algorithms.
--    // Current use cases are served well by inlining multiple functions into
--    // a single function, so we leave native support for multiple functions to
--    // future work.
-     // To enable modules that contain CustomCallOp::called_computations,
-     // we allow multiple functions, in which case we only refine the main
-     // function called "main", assuming that the called computations will have
-     // static shapes. Lifting this assumption and expanding refinement to
-     // multiple functions is left for future work.
-     ModuleOp module = getOperation();
-+    RefineShapeState state;
-     auto funcs = llvm::to_vector(module.getOps<func::FuncOp>());
-     if (funcs.empty()) return;
-     func::FuncOp func;
-@@ -1130,70 +1671,14 @@
-           << " function to clearly identify which function will be refined";
-       return signalPassFailure();
-     }
--
--    // Similarly, only one block per function is supported at the moment.
--    // At the StableHLO level, functions are expected to only have one block,
--    // so supporting more is out of scope for this pass.
--    if (!func.getRegion().hasOneBlock()) {
--      func.emitOpError() << "must have exactly one block";
-+    SmallVector<APSInt> emptyDimensionArguments;
-+    SmallVector<Type> nonDimensionArgumentTypes;
-+    for (auto arg : func.getArguments())
-+      nonDimensionArgumentTypes.push_back(arg.getType());
-+    if (failed(refineFunction(func, &getContext(), &state, 0,
-+                              emptyDimensionArguments,
-+                              nonDimensionArgumentTypes)))
-       return signalPassFailure();
--    }
--
--    // The algorithm behind this pass consists of a single traversal of the
--    // function. This is sufficient because we only support one function per
--    // program at the moment.
--    // TODO(#1048): Find out why .maxIterations = 1 no longer works.
--    // There have been recent refactors to applyPatternsAndFoldGreedily
--    // upstream, and that might be the reason.
--    GreedyRewriteConfig config;
--    config.useTopDownTraversal = true;
--    config.enableRegionSimplification = true;
--    config.maxIterations = 2;
--    config.maxNumRewrites = GreedyRewriteConfig::kNoLimit;
--    config.strictMode = GreedyRewriteStrictness::AnyOp;
--
--    RewritePatternSet patterns(&getContext());
--    patterns.add<EvalAddOpPattern>(&getContext());
--    patterns.add<EvalAndOpPattern>(&getContext());
--    patterns.add<EvalBroadcastInDimOpPattern>(&getContext());
--    patterns.add<EvalClampOpPattern>(&getContext());
--    patterns.add<EvalCompareOpPattern>(&getContext());
--    patterns.add<EvalConcatenateOpPattern>(&getContext());
--    patterns.add<EvalConvertOpPattern>(&getContext());
--    patterns.add<EvalDivOpPattern>(&getContext());
--    patterns.add<EvalGetDimensionSizeOpPattern>(&getContext());
--    patterns.add<EvalMaxOpPattern>(&getContext());
--    patterns.add<EvalMinOpPattern>(&getContext());
--    patterns.add<EvalMulOpPattern>(&getContext());
--    patterns.add<EvalRemOpPattern>(&getContext());
--    patterns.add<EvalReshapeOpPattern>(&getContext());
--    patterns.add<EvalSelectOpPattern>(&getContext());
--    patterns.add<EvalSignOpPattern>(&getContext());
--    patterns.add<EvalSliceOpPattern>(&getContext());
--    patterns.add<EvalSubtractOpPattern>(&getContext());
--    patterns.add<RefineAllGatherOpPattern>(&getContext());
--    patterns.add<RefineBitcastConvertOpPattern>(&getContext());
--    patterns.add<RefineConvertOpPattern>(&getContext());
--    patterns.add<RefineConvolutionOpPattern>(&getContext());
--    patterns.add<RefineCustomCallOpPattern>(&getContext());
--    patterns.add<RefineDotGeneralOpPattern>(&getContext());
--    patterns.add<RefineDynamicBroadcastInDimOpPattern>(&getContext());
--    patterns.add<RefineDynamicConvOpPattern>(&getContext());
--    patterns.add<RefineDynamicIotaOpPattern>(&getContext());
--    patterns.add<RefineDynamicPadOpPattern>(&getContext());
--    patterns.add<RefineDynamicReshapeOpPattern>(&getContext());
--    patterns.add<RefineInferTypeOpInterfacePattern>(&getContext());
--    patterns.add<RefineRealDynamicSliceOpPattern>(&getContext());
--    patterns.add<RefineReduceScatterOpPattern>(&getContext());
--    patterns.add<RefineRngOpPattern>(&getContext());
--    patterns.add<RefineUniformQuantizeOpPattern>(&getContext());
--    patterns.add<RefineWhileOpPattern>(&getContext());
--    patterns.add<UpdateFunctionTypePattern>(&getContext());
--    patterns.add<UpdateRegionTypePattern>(&getContext());
--    if (failed(
--            applyPatternsAndFoldGreedily(func, std::move(patterns), config))) {
--      return signalPassFailure();
--    }
-   }
- };
- 
+@@ -1181,7 +1267,10 @@
+     patterns.add<RefineDynamicConvOpPattern>(&getContext());
+     patterns.add<RefineDynamicIotaOpPattern>(&getContext());
+     patterns.add<RefineDynamicPadOpPattern>(&getContext());
++    patterns.add<RefineDynamicReduceWindowOpPattern>(&getContext());
+     patterns.add<RefineDynamicReshapeOpPattern>(&getContext());
++    patterns.add<RefineDynamicRngBitGeneratorOpPattern>(&getContext());
++    patterns.add<RefineDynamicTopKOpPattern>(&getContext());
+     patterns.add<RefineInferTypeOpInterfacePattern>(&getContext());
+     patterns.add<RefineRealDynamicSliceOpPattern>(&getContext());
+     patterns.add<RefineReduceScatterOpPattern>(&getContext());
 
diff --git a/third_party/xla/xla/python/refine_polymorphic_shapes.cc b/third_party/xla/xla/python/refine_polymorphic_shapes.cc
index 44ce5b20e39d9e..063b15aba58062 100644
--- a/third_party/xla/xla/python/refine_polymorphic_shapes.cc
+++ b/third_party/xla/xla/python/refine_polymorphic_shapes.cc
@@ -251,6 +251,10 @@ absl::Status RefinePolymorphicShapes(mlir::ModuleOp module,
     pm.enableIRPrinting(print_before, print_after, /*printModuleScope=*/true,
                         /*printAfterOnlyOnChange=*/true);
   }
+
+  // TODO(necula): we should not need the inliner.
+  pm.addPass(mlir::createInlinerPass());
+  pm.addPass(mlir::createCSEPass());
   pm.addPass(mlir::stablehlo::createStablehloRefineShapesPass());
   pm.addNestedPass<mlir::func::FuncOp>(
       mlir::stablehlo::createStablehloCanonicalizeDynamismPass());

From 9e03bbf6d0b9073b0d4ee66eba95bf73c3c36d73 Mon Sep 17 00:00:00 2001
From: Ilia Sergachev <sergachev@google.com>
Date: Thu, 21 Sep 2023 09:23:34 -0700
Subject: [PATCH 087/567] [XLA:GPU] Trigger Triton GEMM fusions also on kCopy
 input operations.

PiperOrigin-RevId: 567329486
---
 .../xla/service/gpu/gemm_rewriter_triton.cc   | 30 ++++++++++++-------
 .../xla/service/gpu/ir_emitter_triton_test.cc | 22 ++++++++++++++
 2 files changed, 41 insertions(+), 11 deletions(-)

diff --git a/third_party/xla/xla/service/gpu/gemm_rewriter_triton.cc b/third_party/xla/xla/service/gpu/gemm_rewriter_triton.cc
index 8f1e0399ddb13a..8f237801bfa4cf 100644
--- a/third_party/xla/xla/service/gpu/gemm_rewriter_triton.cc
+++ b/third_party/xla/xla/service/gpu/gemm_rewriter_triton.cc
@@ -73,6 +73,16 @@ limitations under the License.
 namespace xla {
 namespace gpu {
 
+int GetFusionLevel(const HloInstruction& hlo, const GpuVersion gpu_version) {
+  int level =
+      hlo.GetModule()->config().debug_options().xla_gpu_triton_fusion_level();
+  if (!std::get<se::CudaComputeCapability>(gpu_version)
+           .IsAtLeast(se::CudaComputeCapability::AMPERE)) {
+    level = std::min(level, 1);
+  }
+  return level;
+}
+
 bool HasDivisibleSuffixAllowingSplit(const absl::Span<int64_t const> span,
                                      const int64_t divisor) {
   CHECK_GE(divisor, 1);
@@ -1082,12 +1092,6 @@ DimOrderUpdatesOrError FusionContext::AnalyzeForFusion(
     absl::flat_hash_map<const HloInstruction*, HloInstruction*>&
         old_to_new_mapping,
     const GpuVersion gpu_version) const {
-  int fusion_level =
-      hlo.GetModule()->config().debug_options().xla_gpu_triton_fusion_level();
-  if (!std::get<se::CudaComputeCapability>(gpu_version)
-           .IsAtLeast(se::CudaComputeCapability::AMPERE)) {
-    fusion_level = std::min(fusion_level, 1);
-  }
   if (hlo.opcode() == HloOpcode::kTuple ||
       hlo.opcode() == HloOpcode::kGetTupleElement) {
     return "Unsupported instruction.";
@@ -1108,7 +1112,7 @@ DimOrderUpdatesOrError FusionContext::AnalyzeForFusion(
     return "Unsupported output data type.";
   }
   if (as_input) {
-    if (fusion_level < 2) {
+    if (GetFusionLevel(hlo, gpu_version) < 2) {
       if (hlo.opcode() == HloOpcode::kConvert) {
         if (FusionDecision decision =
                 RequireTritonFusibleConvert(&hlo, gpu_version);
@@ -1124,7 +1128,7 @@ DimOrderUpdatesOrError FusionContext::AnalyzeForFusion(
       }
     }
   } else {
-    if (fusion_level < 2) {
+    if (GetFusionLevel(hlo, gpu_version) < 2) {
       return "Skipping fusing outputs at low fusion levels.";
     }
     for (const HloInstruction* operand : hlo.operands()) {
@@ -1374,10 +1378,14 @@ StatusOr<FusionDecision> FuseDot(HloInstruction& dot,
   if (dot.GetModule()->config().debug_options().xla_gpu_triton_gemm_any()) {
     return FusionDecision{};
   }
+
+  absl::flat_hash_set<HloOpcode> triggers{
+      HloOpcode::kConvert, HloOpcode::kSlice, HloOpcode::kTranspose};
+  if (GetFusionLevel(dot, gpu_version) >= 2) {
+    triggers.insert(HloOpcode::kCopy);
+  }
   for (const auto& iter : old_to_new_mapping) {
-    if (iter.second->opcode() == HloOpcode::kConvert ||
-        iter.second->opcode() == HloOpcode::kSlice ||
-        iter.second->opcode() == HloOpcode::kTranspose) {
+    if (triggers.contains(iter.second->opcode())) {
       return FusionDecision{};
     }
   }
diff --git a/third_party/xla/xla/service/gpu/ir_emitter_triton_test.cc b/third_party/xla/xla/service/gpu/ir_emitter_triton_test.cc
index 65f6906475d8e2..af6776ed1c0231 100644
--- a/third_party/xla/xla/service/gpu/ir_emitter_triton_test.cc
+++ b/third_party/xla/xla/service/gpu/ir_emitter_triton_test.cc
@@ -1086,6 +1086,28 @@ ENTRY e {
   EXPECT_TRUE(RunAndCompare(kHloText, ErrorSpec{/*aabs=*/2e-3, /*arel=*/2e-3}));
 }
 
+TEST_F(TritonGemmLevel2Test, FuseTransposeWithoutMixedTypes) {
+  const std::string kHloText = R"(
+ENTRY e {
+  p1 = f16[150,32,60]{2,1,0} parameter(1)
+  p0 = f16[75,2,26,60]{3,2,1,0} parameter(0)
+  t = f16[75,2,60,26]{3,2,1,0} transpose(p0), dimensions={0,1,3,2}
+  r = f16[150,60,26]{2,1,0} reshape(t)
+  ROOT tmp_4 = f16[150,32,26]{2,1,0} dot(p1, r),
+    lhs_batch_dims={0}, lhs_contracting_dims={2},
+    rhs_batch_dims={0}, rhs_contracting_dims={1}
+})";
+
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          GetOptimizedModule(kHloText));
+  EXPECT_THAT(
+      module->entry_computation()->root_instruction(),
+      GmockMatch(m::Fusion(m::Parameter(), m::Parameter())
+                     .WithFusionKind(HloInstruction::FusionKind::kCustom)));
+
+  EXPECT_TRUE(RunAndCompare(kHloText, ErrorSpec{/*aabs=*/1e-3, /*arel=*/1e-3}));
+}
+
 TEST_F(TritonGemmTest, SineOutputIsNotFused) {
   const std::string kHloText = R"(
 HloModule m

From 17d45cfb8980fc6798926b2968f57d4213c6d5c8 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 21 Sep 2023 09:26:22 -0700
Subject: [PATCH 088/567] Disable two compiler warnings in the bazelrc

This adds `-Wno-error=unused-command-line-argument` and `-Wno-gnu-offsetof-extensions` to the `rbe_linux_cpu` config.

We have these flags already in the `release_cpu_linux` config,
but some RBE builds don't use that config. So I'm adding
them to `rbe_linux_cpu` as well.

Ideally there would be a config that applies to all Clang builds but not to the
old GCC builds, but no such config exists. So for now I don't see a good option
other than having the flags in two places (without a major restructuring of the bazelrc file).

PiperOrigin-RevId: 567330212
---
 .bazelrc                                 | 7 +++++++
 third_party/xla/.bazelrc                 | 7 +++++++
 third_party/xla/third_party/tsl/.bazelrc | 7 +++++++
 3 files changed, 21 insertions(+)

diff --git a/.bazelrc b/.bazelrc
index 8fb09a849a8b57..475a1c5378cdfe 100644
--- a/.bazelrc
+++ b/.bazelrc
@@ -463,6 +463,13 @@ build:rbe_linux_cpu --extra_toolchains="@sigbuild-r2.14-clang17_config_cuda//cro
 build:rbe_linux_cpu --extra_execution_platforms="@sigbuild-r2.14-clang17_config_platform//:platform"
 build:rbe_linux_cpu --host_platform="@sigbuild-r2.14-clang17_config_platform//:platform"
 build:rbe_linux_cpu --platforms="@sigbuild-r2.14-clang17_config_platform//:platform"
+# This is needed for all Clang17 builds but must not be present in GCC builds.
+build:rbe_linux_cpu --copt=-Wno-error=unused-command-line-argument
+# This was added in clang-16 by https://reviews.llvm.org/D133574.
+# Can be removed once upb is updated, since a type definition is used within
+# offsetof in the current version of upb.
+# See https://github.com/protocolbuffers/upb/blob/9effcbcb27f0a665f9f345030188c0b291e32482/upb/upb.c#L183.
+build:rbe_linux_cpu --copt=-Wno-gnu-offsetof-extensions
 # Python config is the same across all containers because the binary is the same
 build:rbe_linux_cpu --repo_env=TF_PYTHON_CONFIG_REPO="@sigbuild-r2.14-clang17_config_python"
 build:rbe_linux_cpu --python_path="/usr/bin/python3"
diff --git a/third_party/xla/.bazelrc b/third_party/xla/.bazelrc
index 8fb09a849a8b57..475a1c5378cdfe 100644
--- a/third_party/xla/.bazelrc
+++ b/third_party/xla/.bazelrc
@@ -463,6 +463,13 @@ build:rbe_linux_cpu --extra_toolchains="@sigbuild-r2.14-clang17_config_cuda//cro
 build:rbe_linux_cpu --extra_execution_platforms="@sigbuild-r2.14-clang17_config_platform//:platform"
 build:rbe_linux_cpu --host_platform="@sigbuild-r2.14-clang17_config_platform//:platform"
 build:rbe_linux_cpu --platforms="@sigbuild-r2.14-clang17_config_platform//:platform"
+# This is needed for all Clang17 builds but must not be present in GCC builds.
+build:rbe_linux_cpu --copt=-Wno-error=unused-command-line-argument
+# This was added in clang-16 by https://reviews.llvm.org/D133574.
+# Can be removed once upb is updated, since a type definition is used within
+# offsetof in the current version of upb.
+# See https://github.com/protocolbuffers/upb/blob/9effcbcb27f0a665f9f345030188c0b291e32482/upb/upb.c#L183.
+build:rbe_linux_cpu --copt=-Wno-gnu-offsetof-extensions
 # Python config is the same across all containers because the binary is the same
 build:rbe_linux_cpu --repo_env=TF_PYTHON_CONFIG_REPO="@sigbuild-r2.14-clang17_config_python"
 build:rbe_linux_cpu --python_path="/usr/bin/python3"
diff --git a/third_party/xla/third_party/tsl/.bazelrc b/third_party/xla/third_party/tsl/.bazelrc
index 8fb09a849a8b57..475a1c5378cdfe 100644
--- a/third_party/xla/third_party/tsl/.bazelrc
+++ b/third_party/xla/third_party/tsl/.bazelrc
@@ -463,6 +463,13 @@ build:rbe_linux_cpu --extra_toolchains="@sigbuild-r2.14-clang17_config_cuda//cro
 build:rbe_linux_cpu --extra_execution_platforms="@sigbuild-r2.14-clang17_config_platform//:platform"
 build:rbe_linux_cpu --host_platform="@sigbuild-r2.14-clang17_config_platform//:platform"
 build:rbe_linux_cpu --platforms="@sigbuild-r2.14-clang17_config_platform//:platform"
+# This is needed for all Clang17 builds but must not be present in GCC builds.
+build:rbe_linux_cpu --copt=-Wno-error=unused-command-line-argument
+# This was added in clang-16 by https://reviews.llvm.org/D133574.
+# Can be removed once upb is updated, since a type definition is used within
+# offsetof in the current version of upb.
+# See https://github.com/protocolbuffers/upb/blob/9effcbcb27f0a665f9f345030188c0b291e32482/upb/upb.c#L183.
+build:rbe_linux_cpu --copt=-Wno-gnu-offsetof-extensions
 # Python config is the same across all containers because the binary is the same
 build:rbe_linux_cpu --repo_env=TF_PYTHON_CONFIG_REPO="@sigbuild-r2.14-clang17_config_python"
 build:rbe_linux_cpu --python_path="/usr/bin/python3"

From 91f89fd2abb71f41522e47a2910b63da50651973 Mon Sep 17 00:00:00 2001
From: Wilsin Gosti <wilsin@google.com>
Date: Thu, 21 Sep 2023 09:33:37 -0700
Subject: [PATCH 089/567] #tf.data The change of default memory cap for tf.data
 should have been rolled out with an experiment. Revert it for now; it will be
 re-rolled out with an experiment.

PiperOrigin-RevId: 567332040
---
 tensorflow/core/framework/model.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/core/framework/model.h b/tensorflow/core/framework/model.h
index 5dfa6f579abfdc..727af779489b1b 100644
--- a/tensorflow/core/framework/model.h
+++ b/tensorflow/core/framework/model.h
@@ -65,7 +65,7 @@ constexpr char kMaxBufferedElements[] = "max_buffered_elements";
 constexpr char kModelInputTimeKey[] = "model_input_time";
 
 // Default share of available RAM that can be used by model's internal buffers.
-constexpr double kRamBudgetShare = 0.9;
+constexpr double kRamBudgetShare = 0.5;
 
 // Weight of the latest processing time used in computing the exponential moving
 // average of processing time per element.

From 6c34dcf9aa7970fcfbba4508e9437a6f17517e24 Mon Sep 17 00:00:00 2001
From: Quentin Khan <qkhan@google.com>
Date: Thu, 21 Sep 2023 09:51:03 -0700
Subject: [PATCH 090/567] Legalize MHLO Pad operation to TFLite

PiperOrigin-RevId: 567337044
---
 tensorflow/compiler/mlir/lite/stablehlo/BUILD |   1 +
 .../stablehlo/tests/tfl_legalize_hlo_pad.mlir | 175 ++++++++++++++
 .../lite/stablehlo/transforms/legalize_hlo.cc |   3 +
 .../transforms/legalize_hlo_conversions/BUILD |  21 ++
 .../legalize_hlo_conversions/pad.cc           |  82 +++++++
 .../transforms/legalize_hlo_conversions/pad.h |  36 +++
 .../legalize_hlo_conversions/util.cc          | 220 +++++++++++++++++-
 .../legalize_hlo_conversions/util.h           | 139 ++++++++++-
 .../transforms/legalize_hlo_patterns.td       |   1 +
 .../transforms/tflite_legalize_hlo.cc         |   5 +-
 10 files changed, 675 insertions(+), 8 deletions(-)
 create mode 100644 tensorflow/compiler/mlir/lite/stablehlo/tests/tfl_legalize_hlo_pad.mlir
 create mode 100644 tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/pad.cc
 create mode 100644 tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/pad.h

diff --git a/tensorflow/compiler/mlir/lite/stablehlo/BUILD b/tensorflow/compiler/mlir/lite/stablehlo/BUILD
index 4907ef34d37cd8..b65d7480a6ece3 100644
--- a/tensorflow/compiler/mlir/lite/stablehlo/BUILD
+++ b/tensorflow/compiler/mlir/lite/stablehlo/BUILD
@@ -500,6 +500,7 @@ cc_library(
     deps = [
         ":passes_inc_gen",
         "//tensorflow/compiler/mlir/lite:tensorflow_lite",
+        "//tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions:pad",
         "//tensorflow/compiler/mlir/tensorflow",
         "@llvm-project//mlir:ArithDialect",
         "@llvm-project//mlir:FuncDialect",
diff --git a/tensorflow/compiler/mlir/lite/stablehlo/tests/tfl_legalize_hlo_pad.mlir b/tensorflow/compiler/mlir/lite/stablehlo/tests/tfl_legalize_hlo_pad.mlir
new file mode 100644
index 00000000000000..b72b4296c000ff
--- /dev/null
+++ b/tensorflow/compiler/mlir/lite/stablehlo/tests/tfl_legalize_hlo_pad.mlir
@@ -0,0 +1,175 @@
+// RUN: odml-to-stablehlo-opt %s -tfl-legalize-hlo -split-input-file | FileCheck %s --dump-input=fail
+
+func.func @mhlo_pad_test__noop(%input: tensor<5x7xf32>, %padding_value: tensor<f32>) -> tensor<5x7xf32> {
+  %0 = "mhlo.pad"(%input, %padding_value) {
+    edge_padding_low = dense<[0, 0]> : tensor<2xi64>,
+    edge_padding_high = dense<[0, 0]> : tensor<2xi64>,
+    interior_padding = dense<[0, 0]> : tensor<2xi64>
+  } : (tensor<5x7xf32>, tensor<f32>) -> tensor<5x7xf32>
+  func.return %0: tensor<5x7xf32>
+
+// CHECK-LABEL: mhlo_pad_test__noop
+// CHECK: return %arg0 : tensor<5x7xf32>
+}
+
+func.func @mhlo_pad_test__pad_all(%input: tensor<5x7xf32>, %padding_value: tensor<f32>) -> tensor<9x10xf32> {
+  %0 = "mhlo.pad"(%input, %padding_value) {
+    edge_padding_low = dense<[3, 2]> : tensor<2xi64>,
+    edge_padding_high = dense<[1, 1]> : tensor<2xi64>,
+    interior_padding = dense<[0, 0]> : tensor<2xi64>
+  } : (tensor<5x7xf32>, tensor<f32>) -> tensor<9x10xf32>
+  func.return %0: tensor<9x10xf32>
+
+// CHECK-LABEL: mhlo_pad_test__pad_all
+// CHECK: %cst = arith.constant dense<{{\[}}[3, 1], [2, 1]]> : tensor<2x2xi64>
+// CHECK: %0 = "tfl.padv2"(%arg0, %cst, %arg1) : (tensor<5x7xf32>, tensor<2x2xi64>, tensor<f32>) -> tensor<9x10xf32>
+// CHECK: return %0 : tensor<9x10xf32>
+}
+
+func.func @mhlo_pad_test__crop_all(%input: tensor<5x7xf32>, %padding_value: tensor<f32>) -> tensor<3x5xf32> {
+  %0 = "mhlo.pad"(%input, %padding_value) {
+    edge_padding_low = dense<[-1, -1]> : tensor<2xi64>,
+    edge_padding_high = dense<[-1, -1]> : tensor<2xi64>,
+    interior_padding = dense<[0, 0]> : tensor<2xi64>
+  } : (tensor<5x7xf32>, tensor<f32>) -> tensor<3x5xf32>
+  func.return %0: tensor<3x5xf32>
+
+// CHECK-LABEL: mhlo_pad_test__crop_all
+// CHECK: %cst = arith.constant dense<1> : tensor<2xi64>
+// CHECK: %cst_0 = arith.constant dense<-1> : tensor<2xi64>
+// CHECK: %cst_1 = arith.constant dense<1> : tensor<2xi64>
+// CHECK: %0 = "tfl.strided_slice"(%arg0, %cst, %cst_0, %cst_1) {begin_mask = 0 : i32, ellipsis_mask = 0 : i32, end_mask = 0 : i32, new_axis_mask = 0 : i32, offset = false, shrink_axis_mask = 0 : i32} : (tensor<5x7xf32>, tensor<2xi64>, tensor<2xi64>, tensor<2xi64>) -> tensor<3x5xf32>
+// CHECK: return %0 : tensor<3x5xf32>
+}
+
+func.func @mhlo_pad_test__interior_pad_all(%input: tensor<5x7xf32>, %padding_value: tensor<f32>) -> tensor<9x13xf32> {
+  %0 = "mhlo.pad"(%input, %padding_value) {
+    edge_padding_low = dense<[0, 0]> : tensor<2xi64>,
+    edge_padding_high = dense<[0, 0]> : tensor<2xi64>,
+    interior_padding = dense<[1, 1]> : tensor<2xi64>
+  } : (tensor<5x7xf32>, tensor<f32>) -> tensor<9x13xf32>
+  func.return %0: tensor<9x13xf32>
+
+// CHECK-LABEL: mhlo_pad_test__interior_pad_all
+// CHECK: %cst = arith.constant dense<2> : tensor<2xi32>
+// CHECK: %0 = "tfl.dilate"(%arg0, %cst, %arg1) : (tensor<5x7xf32>, tensor<2xi32>, tensor<f32>) -> tensor<9x13xf32>
+// CHECK: return %0 : tensor<9x13xf32>
+}
+
+func.func @mhlo_pad_test__pad_and_crop(%input: tensor<5x7xf32>, %padding_value: tensor<f32>) -> tensor<5x7xf32> {
+  %0 = "mhlo.pad"(%input, %padding_value) {
+    edge_padding_low = dense<[-1, 1]> : tensor<2xi64>,
+    edge_padding_high = dense<[1, -1]> : tensor<2xi64>,
+    interior_padding = dense<[0, 0]> : tensor<2xi64>
+  } : (tensor<5x7xf32>, tensor<f32>) -> tensor<5x7xf32>
+  func.return %0: tensor<5x7xf32>
+
+// CHECK-LABEL: mhlo_pad_test__pad_and_crop
+// CHECK: %cst = arith.constant dense<{{\[}}[0, 1], [1, 0]]> : tensor<2x2xi64>
+// CHECK: %0 = "tfl.padv2"(%arg0, %cst, %arg1) : (tensor<5x7xf32>, tensor<2x2xi64>, tensor<f32>) -> tensor<6x8xf32>
+// CHECK: %cst_0 = arith.constant dense<[1, 0]> : tensor<2xi64>
+// CHECK: %cst_1 = arith.constant dense<[0, -1]> : tensor<2xi64>
+// CHECK: %cst_2 = arith.constant dense<1> : tensor<2xi64>
+// CHECK: %1 = "tfl.strided_slice"(%0, %cst_0, %cst_1, %cst_2) {begin_mask = 2 : i32, ellipsis_mask = 0 : i32, end_mask = 1 : i32, new_axis_mask = 0 : i32, offset = false, shrink_axis_mask = 0 : i32} : (tensor<6x8xf32>, tensor<2xi64>, tensor<2xi64>, tensor<2xi64>) -> tensor<5x7xf32>
+// CHECK: return %1 : tensor<5x7xf32>
+}
+
+func.func @mhlo_pad_test__pad_and_crop_and_interior_pad(%input: tensor<5x7xf32>, %padding_value: tensor<f32>) -> tensor<13x25xf32> {
+  %0 = "mhlo.pad"(%input, %padding_value) {
+    edge_padding_low = dense<[-1, 1]> : tensor<2xi64>,
+    edge_padding_high = dense<[1, -1]> : tensor<2xi64>,
+    interior_padding = dense<[2, 3]> : tensor<2xi64>
+  } : (tensor<5x7xf32>, tensor<f32>) -> tensor<13x25xf32>
+  func.return %0: tensor<13x25xf32>
+
+// CHECK-LABEL: mhlo_pad_test__pad_and_crop_and_interior_pad
+// CHECK: %cst = arith.constant dense<[3, 4]> : tensor<2xi32>
+// CHECK: %0 = "tfl.dilate"(%arg0, %cst, %arg1) : (tensor<5x7xf32>, tensor<2xi32>, tensor<f32>) -> tensor<13x25xf32>
+// CHECK: %cst_0 = arith.constant dense<{{\[}}[0, 1], [1, 0]]> : tensor<2x2xi64>
+// CHECK: %1 = "tfl.padv2"(%0, %cst_0, %arg1) : (tensor<13x25xf32>, tensor<2x2xi64>, tensor<f32>) -> tensor<14x26xf32>
+// CHECK: %cst_1 = arith.constant dense<[1, 0]> : tensor<2xi64>
+// CHECK: %cst_2 = arith.constant dense<[0, -1]> : tensor<2xi64>
+// CHECK: %cst_3 = arith.constant dense<1> : tensor<2xi64>
+// CHECK: %2 = "tfl.strided_slice"(%1, %cst_1, %cst_2, %cst_3) {begin_mask = 2 : i32, ellipsis_mask = 0 : i32, end_mask = 1 : i32, new_axis_mask = 0 : i32, offset = false, shrink_axis_mask = 0 : i32} : (tensor<14x26xf32>, tensor<2xi64>, tensor<2xi64>, tensor<2xi64>) -> tensor<13x25xf32>
+// CHECK: return %2 : tensor<13x25xf32>
+}
+
+func.func @mhlo_pad_test__pad_all_unknown_shape(%input: tensor<?x?x?x?xf32>, %padding_value: tensor<f32>) -> tensor<?x?x?x?xf32> {
+  %0 = "mhlo.pad"(%input, %padding_value) {
+    edge_padding_low = dense<[1, 1, 1, 1]> : tensor<4xi64>,
+    edge_padding_high = dense<[1, 1, 1, 1]> : tensor<4xi64>,
+    interior_padding = dense<[0, 0, 0, 0]> : tensor<4xi64>
+  } : (tensor<?x?x?x?xf32>, tensor<f32>) -> tensor<?x?x?x?xf32>
+  func.return %0: tensor<?x?x?x?xf32>
+
+// CHECK-LABEL: mhlo_pad_test__pad_all_unknown_shape
+// CHECK: %cst = arith.constant dense<1> : tensor<4x2xi64>
+// CHECK: %0 = "tfl.padv2"(%arg0, %cst, %arg1) : (tensor<?x?x?x?xf32>, tensor<4x2xi64>, tensor<f32>) -> tensor<?x?x?x?xf32>
+// CHECK: return %0 : tensor<?x?x?x?xf32>
+}
+
+func.func @mhlo_pad_test__crop_all_unknown_shape(%input: tensor<?x?x?x?xf32>, %padding_value: tensor<f32>) -> tensor<?x?x?x?xf32> {
+  %0 = "mhlo.pad"(%input, %padding_value) {
+    edge_padding_low = dense<[-1, -1, -1, -1]> : tensor<4xi64>,
+    edge_padding_high = dense<[-1, -1, -1, -1]> : tensor<4xi64>,
+    interior_padding = dense<[0, 0, 0, 0]> : tensor<4xi64>
+  } : (tensor<?x?x?x?xf32>, tensor<f32>) -> tensor<?x?x?x?xf32>
+  func.return %0: tensor<?x?x?x?xf32>
+
+// CHECK-LABEL: mhlo_pad_test__crop_all_unknown_shape
+// CHECK: %cst = arith.constant dense<1> : tensor<4xi64>
+// CHECK: %cst_0 = arith.constant dense<-1> : tensor<4xi64>
+// CHECK: %cst_1 = arith.constant dense<1> : tensor<4xi64>
+// CHECK: %0 = "tfl.strided_slice"(%arg0, %cst, %cst_0, %cst_1) {begin_mask = 0 : i32, ellipsis_mask = 0 : i32, end_mask = 0 : i32, new_axis_mask = 0 : i32, offset = false, shrink_axis_mask = 0 : i32} : (tensor<?x?x?x?xf32>, tensor<4xi64>, tensor<4xi64>, tensor<4xi64>) -> tensor<?x?x?x?xf32>
+// CHECK: return %0 : tensor<?x?x?x?xf32>
+}
+
+func.func @mhlo_pad_test__pad_all_unknown_dim0(%input: tensor<?x2x3x4xf32>, %padding_value: tensor<f32>) -> tensor<?x4x5x6xf32> {
+  %0 = "mhlo.pad"(%input, %padding_value) {
+    edge_padding_low = dense<[1, 1, 1, 1]> : tensor<4xi64>,
+    edge_padding_high = dense<[1, 1, 1, 1]> : tensor<4xi64>,
+    interior_padding = dense<[0, 0, 0, 0]> : tensor<4xi64>
+  } : (tensor<?x2x3x4xf32>, tensor<f32>) -> tensor<?x4x5x6xf32>
+  func.return %0: tensor<?x4x5x6xf32>
+
+// CHECK-LABEL: mhlo_pad_test__pad_all_unknown_dim0
+// CHECK: %cst = arith.constant dense<1> : tensor<4x2xi64>
+// CHECK: %0 = "tfl.padv2"(%arg0, %cst, %arg1) : (tensor<?x2x3x4xf32>, tensor<4x2xi64>, tensor<f32>) -> tensor<?x4x5x6xf32>
+// CHECK: return %0 : tensor<?x4x5x6xf32>
+}
+
+func.func @mhlo_pad_test__crop_all_unknown_dim0(%input: tensor<?x2x3x4xf32>, %padding_value: tensor<f32>) -> tensor<?x0x1x2xf32> {
+  %0 = "mhlo.pad"(%input, %padding_value) {
+    edge_padding_low = dense<[-1, -1, -1, -1]> : tensor<4xi64>,
+    edge_padding_high = dense<[-1, -1, -1, -1]> : tensor<4xi64>,
+    interior_padding = dense<[0, 0, 0, 0]> : tensor<4xi64>
+  } : (tensor<?x2x3x4xf32>, tensor<f32>) -> tensor<?x0x1x2xf32>
+  func.return %0: tensor<?x0x1x2xf32>
+
+// CHECK-LABEL: mhlo_pad_test__crop_all_unknown_dim0
+// CHECK: %cst = arith.constant dense<1> : tensor<4xi64>
+// CHECK: %cst_0 = arith.constant dense<-1> : tensor<4xi64>
+// CHECK: %cst_1 = arith.constant dense<1> : tensor<4xi64>
+// CHECK: %0 = "tfl.strided_slice"(%arg0, %cst, %cst_0, %cst_1) {begin_mask = 0 : i32, ellipsis_mask = 0 : i32, end_mask = 0 : i32, new_axis_mask = 0 : i32, offset = false, shrink_axis_mask = 0 : i32} : (tensor<?x2x3x4xf32>, tensor<4xi64>, tensor<4xi64>, tensor<4xi64>) -> tensor<?x0x1x2xf32>
+// CHECK: return %0 : tensor<?x0x1x2xf32>
+}
+
+func.func @mhlo_pad_test__pad_and_crop_and_interior_pad_unknown_dim0(%input: tensor<?x2x3x4xf32>, %padding_value: tensor<f32>) -> tensor<?x3x8x15xf32> {
+  %0 = "mhlo.pad"(%input, %padding_value) {
+    edge_padding_low = dense<[-2, -1, 0, 1]> : tensor<4xi64>,
+    edge_padding_high = dense<[1, 0, -1, -2]> : tensor<4xi64>,
+    interior_padding = dense<[1, 2, 3, 4]> : tensor<4xi64>
+  } : (tensor<?x2x3x4xf32>, tensor<f32>) -> tensor<?x3x8x15xf32>
+  func.return %0: tensor<?x3x8x15xf32>
+
+// CHECK-LABEL: mhlo_pad_test__pad_and_crop_and_interior_pad_unknown_dim0
+// CHECK: %cst = arith.constant dense<[2, 3, 4, 5]> : tensor<4xi32>
+// CHECK: %0 = "tfl.dilate"(%arg0, %cst, %arg1) : (tensor<?x2x3x4xf32>, tensor<4xi32>, tensor<f32>) -> tensor<?x4x9x16xf32>
+// CHECK: %cst_0 = arith.constant dense<{{\[}}[0, 1], [0, 0], [0, 0], [1, 0]]> : tensor<4x2xi64>
+// CHECK: %1 = "tfl.padv2"(%0, %cst_0, %arg1) : (tensor<?x4x9x16xf32>, tensor<4x2xi64>, tensor<f32>) -> tensor<?x4x9x17xf32>
+// CHECK: %cst_1 = arith.constant dense<[2, 1, 0, 0]> : tensor<4xi64>
+// CHECK: %cst_2 = arith.constant dense<[0, 0, -1, -2]> : tensor<4xi64>
+// CHECK: %cst_3 = arith.constant dense<1> : tensor<4xi64>
+// CHECK: %2 = "tfl.strided_slice"(%1, %cst_1, %cst_2, %cst_3) {begin_mask = 12 : i32, ellipsis_mask = 0 : i32, end_mask = 3 : i32, new_axis_mask = 0 : i32, offset = false, shrink_axis_mask = 0 : i32} : (tensor<?x4x9x17xf32>, tensor<4xi64>, tensor<4xi64>, tensor<4xi64>) -> tensor<?x3x8x15xf32>
+// CHECK: return %2 : tensor<?x3x8x15xf32>
+}
diff --git a/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo.cc b/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo.cc
index ca6db32e17ac42..d424a1e9c3137b 100644
--- a/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo.cc
+++ b/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo.cc
@@ -3337,6 +3337,9 @@ class ConvertIfOp : public OpConversionPattern<mhlo::IfOp> {
 };
 
 // Converts mhlo.pad to tf.PadV2
+// TODO: b/301438955 - This is redundant with the MHLO -> TFLite
+// legalization and covers fewer use cases. We need to check with DarwiNN that
+// this can be removed without breaking their workflow.
 Value ConvertPadOp(PatternRewriter& rewriter, Operation* old_op) {
   auto pad_op = cast<mhlo::PadOp>(old_op);
   mlir::Location loc = pad_op.getLoc();
diff --git a/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/BUILD b/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/BUILD
index c5c1e7cade6a8f..c397605fe682ec 100644
--- a/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/BUILD
+++ b/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/BUILD
@@ -18,8 +18,11 @@ cc_library(
         "util.h",
     ],
     deps = [
+        "//tensorflow/compiler/mlir/lite:tensorflow_lite",
         "//tensorflow/compiler/mlir/tensorflow",
+        "@com_google_absl//absl/algorithm:container",
         "@llvm-project//llvm:Support",
+        "@llvm-project//mlir:ArithDialect",
         "@llvm-project//mlir:IR",
         "@llvm-project//mlir:Support",
         "@llvm-project//mlir:TransformUtils",
@@ -45,3 +48,21 @@ cc_library(
         "@local_xla//xla/mlir_hlo",
     ],
 )
+
+cc_library(
+    name = "pad",
+    srcs = [
+        "pad.cc",
+    ],
+    hdrs = [
+        "pad.h",
+    ],
+    deps = [
+        ":util",
+        "@llvm-project//llvm:Support",
+        "@llvm-project//mlir:IR",
+        "@llvm-project//mlir:Support",
+        "@llvm-project//mlir:TransformUtils",
+        "@local_xla//xla/mlir_hlo",
+    ],
+)
diff --git a/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/pad.cc b/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/pad.cc
new file mode 100644
index 00000000000000..9fd1fcb8402c51
--- /dev/null
+++ b/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/pad.cc
@@ -0,0 +1,82 @@
+/* Copyright 2023 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/pad.h"
+
+#include <cstdint>
+
+#include "llvm/ADT/SmallVector.h"
+#include "mlir/IR/BuiltinAttributes.h"  // from @llvm-project
+#include "mlir/Support/LLVM.h"  // from @llvm-project
+#include "mlir/Support/LogicalResult.h"  // from @llvm-project
+#include "mlir/Transforms/DialectConversion.h"  // from @llvm-project
+#include "tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/util.h"
+#include "xla/mlir_hlo/mhlo/IR/hlo_ops.h"
+
+namespace mlir {
+namespace odml {
+
+ConversionState BuildConversionState(mhlo::PadOp mhlo_pad,
+                                     ConversionPatternRewriter& rewriter) {
+  ConversionState state{
+      /*.hlo_op=*/mhlo_pad.getOperation(),
+      /*.rewriter=*/rewriter,
+      /*.last_tf_op=*/nullptr,
+  };
+  return state;
+}
+
+// Converts the given StableHLO Pad operation to a chain of TFLite operations.
+//
+// StableHLO Pad allows dilating, padding and cropping its input, in that order.
+// This can be implemented in TFLite as a sequence of these operations. Note
+// that not every operation is always needed: if there is no dilation
+// (resp. pad, crop) we do not need to add it to the chain.
+//
+// TFLite does not provide a crop operation; the StridedSlice one is used
+// instead.
+LogicalResult ConvertPadOp::matchAndRewrite(
+    mhlo::PadOp mhlo_pad, OpAdaptor adaptor,
+    ConversionPatternRewriter& rewriter) const {
+  // We don't need to match the pad op as we always know how to convert it.
+  ConversionState state = BuildConversionState(mhlo_pad, rewriter);
+
+  // Dilate when any interior padding value is different from 0.
+  AddDilateOpIfRequired(state, mhlo_pad.getInteriorPadding(),
+                        mhlo_pad.getPaddingValue(),
+                        /*is_padding=*/true);
+  // Pad when padding has positive values.
+  AddPadOpIfRequired(state, mhlo_pad.getEdgePaddingLow(),
+                     mhlo_pad.getEdgePaddingHigh(), mhlo_pad.getPaddingValue());
+  // Crop when padding has negative values.
+  //
+  // Note that there is no crop operation in TFLite so we use the StridedSlice
+  // operation instead.
+  const DenseElementsAttr strides_data = CreateDenseElementsAttr(
+      state.rewriter,
+      llvm::SmallVector<int64_t, 6>(state.GetOperandShape().size(), 1));
+  AddStridedSliceOpIfRequired(state, mhlo_pad.getEdgePaddingLow(),
+                              mhlo_pad.getEdgePaddingHigh(), strides_data);
+
+  if (state.last_tf_op) {
+    rewriter.replaceOp(mhlo_pad, state.last_tf_op);
+  } else {  // No TFLite op was added: forward the input unchanged.
+    rewriter.replaceOp(mhlo_pad, mhlo_pad.getOperand());
+  }
+  return success();
+}
+
+}  // namespace odml
+}  // namespace mlir
diff --git a/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/pad.h b/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/pad.h
new file mode 100644
index 00000000000000..c0fa5017b69236
--- /dev/null
+++ b/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/pad.h
@@ -0,0 +1,36 @@
+/* Copyright 2023 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_MLIR_LITE_STABLEHLO_TRANSFORMS_LEGALIZE_HLO_CONVERSIONS_PAD_H_
+#define TENSORFLOW_COMPILER_MLIR_LITE_STABLEHLO_TRANSFORMS_LEGALIZE_HLO_CONVERSIONS_PAD_H_
+
+#include "mlir/Transforms/DialectConversion.h"  // from @llvm-project
+#include "xla/mlir_hlo/mhlo/IR/hlo_ops.h"
+
+namespace mlir {
+namespace odml {
+
+class ConvertPadOp : public OpConversionPattern<mhlo::PadOp> {
+ public:
+  using OpConversionPattern::OpConversionPattern;
+
+  LogicalResult matchAndRewrite(
+      mhlo::PadOp mhlo_pad, OpAdaptor adaptor,
+      ConversionPatternRewriter& rewriter) const final;
+};
+
+}  // namespace odml
+}  // namespace mlir
+#endif  // TENSORFLOW_COMPILER_MLIR_LITE_STABLEHLO_TRANSFORMS_LEGALIZE_HLO_CONVERSIONS_PAD_H_
diff --git a/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/util.cc b/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/util.cc
index 2cd9a689f8e780..3588b68bc1b182 100644
--- a/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/util.cc
+++ b/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/util.cc
@@ -15,10 +15,11 @@ limitations under the License.
 
 #include "tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/util.h"
 
-#include <stdint.h>
-
 #include <cassert>
+#include <cstddef>
+#include <cstdint>
 
+#include "absl/algorithm/container.h"
 #include "llvm/ADT/SmallVector.h"
 #include "mlir/IR/Block.h"  // from @llvm-project
 #include "mlir/IR/BuiltinAttributes.h"  // from @llvm-project
@@ -28,9 +29,11 @@ limitations under the License.
 #include "mlir/IR/Operation.h"  // from @llvm-project
 #include "mlir/IR/Region.h"  // from @llvm-project
 #include "mlir/IR/Types.h"  // from @llvm-project
+#include "mlir/IR/Value.h"  // from @llvm-project
 #include "mlir/Support/LLVM.h"  // from @llvm-project
 #include "mlir/Support/LogicalResult.h"  // from @llvm-project
 #include "mlir/Transforms/DialectConversion.h"  // from @llvm-project
+#include "tensorflow/compiler/mlir/lite/ir/tfl_ops.h"  // IWYU pragma: keep
 #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h"
 #include "xla/mlir_hlo/mhlo/IR/hlo_ops.h"
 
@@ -158,5 +161,218 @@ LogicalResult MatchBinaryReduceFunction<void>(mlir::Region& function) {
   return success();
 }
 
+Value ConversionState::GetOperand() const {
+  if (last_tf_op) {
+    return last_tf_op->getResult(0);
+  }
+  return hlo_op->getOperand(0);
+}
+
+TensorType ConversionState::GetOperandTensorType() const {
+  if (last_tf_op) {
+    return last_tf_op->getResult(0).getType().cast<TensorType>();
+  }
+  return hlo_op->getOperand(0).getType().cast<TensorType>();
+}
+
+llvm::ArrayRef<int64_t> ConversionState::GetOperandShape() const {
+  return GetOperandTensorType().getShape();
+}
+
+namespace {
+
+// Gets the dilation data for TFLite Dilate.
+//
+// Depending on the definition of the op we are trying to legalize, a dilation
+// can be either seen as interior padding or as a scaling factor where:
+//
+//     scaling_factor = interior_padding + 1
+//
+// The is_padding parameter is used to take this difference into account.
+llvm::SmallVector<int32_t, 6> GetDilateData(const DenseElementsAttr& dilation,
+                                            const bool is_padding) {
+  llvm::SmallVector<int32_t, 6> data;
+  for (const auto& v : dilation.getValues<APInt>()) {
+    data.push_back(v.getSExtValue() + static_cast<int32_t>(is_padding));
+  }
+  return data;
+}
+
+}  // namespace
+
+void AddDilateOpIfRequired(ConversionState& state,
+                           const DenseElementsAttr& dilation,
+                           const Value padding_value, const bool is_padding) {
+  const auto dilate_data = GetDilateData(dilation, is_padding);
+  if (absl::c_any_of(dilate_data, IsNot(1))) {
+    const TensorType output_type = state.ComputeResultTensorType(
+        [](int i, const auto& shape, const auto& dilate_data) {
+          if (shape[i] < 0) {
+            return shape[i];
+          }
+          return shape[i] + (shape[i] - 1) * (dilate_data[i] - 1);
+        },
+        dilate_data);
+
+    auto dilate_tensor = AddConstantTensor(state, dilate_data);
+    auto tfl_dilate = state.rewriter.create<TFL::DilateOp>(
+        state.hlo_op->getLoc(), output_type, state.GetOperand(), dilate_tensor,
+        padding_value);
+
+    state.last_tf_op = tfl_dilate;
+  }
+}
+
+namespace {
+
+// Gets the pad data for TFLite PadV2.
+//
+// StableHLO Pad allows negative values for cropping. This function replaces
+// them with 0 and interleaves the (low, high) values of each dimension.
+llvm::SmallVector<int64_t, 12> GetPadData(
+    const DenseElementsAttr& edge_padding_low,
+    const DenseElementsAttr& edge_padding_high) {
+  llvm::SmallVector<int64_t, 12> data;
+  auto low_values = edge_padding_low.getValues<APInt>();
+  auto high_values = edge_padding_high.getValues<APInt>();
+  for (int i = 0; i < edge_padding_low.getNumElements(); ++i) {
+    const int64_t pad_low = low_values[i].getSExtValue();
+    const int64_t pad_high = high_values[i].getSExtValue();
+    data.push_back(pad_low < 0 ? 0 : pad_low);
+    data.push_back(pad_high < 0 ? 0 : pad_high);
+  }
+  return data;
+}
+
+template <class Container>
+void AddPadOpIfRequiredImpl(ConversionState& state, const Container& pad_data,
+                            const Value padding_value) {
+  if (absl::c_any_of(pad_data, IsNot(0))) {
+    const TensorType output_type = state.ComputeResultTensorType(
+        [](int i, const auto& shape, const auto& pad) {
+          if (shape[i] < 0) {
+            return shape[i];
+          }
+          return shape[i] + pad[2 * i] + pad[2 * i + 1];
+        },
+        pad_data);
+
+    auto pad_tensor = AddConstantTensor(
+        state, pad_data,
+        {static_cast<int64_t>(state.GetOperandShape().size()), 2});
+    auto tfl_pad = state.rewriter.create<TFL::PadV2Op>(
+        state.hlo_op->getLoc(), output_type, state.GetOperand(), pad_tensor,
+        padding_value);
+
+    state.last_tf_op = tfl_pad;
+  }
+}
+
+}  // namespace
+
+void AddPadOpIfRequired(ConversionState& state,
+                        const DenseElementsAttr& edge_padding_low,
+                        const DenseElementsAttr& edge_padding_high,
+                        const Value padding_value) {
+  AddPadOpIfRequiredImpl(state, GetPadData(edge_padding_low, edge_padding_high),
+                         padding_value);
+}
+
+namespace {
+
+// Holds the data needed to generate a TFLite StridedSlice operation.
+struct StridedSliceData {
+  llvm::SmallVector<int64_t, 6> low;
+  llvm::SmallVector<int64_t, 6> high;
+  llvm::SmallVector<int64_t, 6> strides;
+  int32_t begin_mask = 0;
+  int32_t end_mask = 0;
+
+  void resize(const size_t size) {
+    low.resize(size);
+    high.resize(size);
+    strides.resize(size);
+  }
+};
+
+// Updates the strided slice data with the given values for the `i`th element.
+//
+// Warning: this expects the data internal buffers to have at least i+1
+// elements.
+void AppendDataDim(StridedSliceData& data, const int i, const APInt& low,
+                   const APInt& high, const APInt& stride) {
+  const int64_t pad_low = low.getSExtValue();
+  const int64_t pad_high = high.getSExtValue();
+  if (pad_low >= 0) {
+    data.begin_mask |= 1 << i;
+    data.low[i] = 0;
+  } else {
+    data.low[i] = -pad_low;
+  }
+  if (pad_high >= 0) {
+    data.end_mask |= 1 << i;
+    data.high[i] = 0;
+  } else {
+    data.high[i] = pad_high;
+  }
+  data.strides[i] = stride.getSExtValue();
+}
+
+// Gets the data needed to generate a TFLite StridedSlice operation.
+StridedSliceData GetStridedSliceData(const DenseElementsAttr& edge_padding_low,
+                                     const DenseElementsAttr& edge_padding_high,
+                                     const DenseElementsAttr& strides) {
+  StridedSliceData data;
+  data.resize(edge_padding_low.getNumElements());
+  const auto low_values = edge_padding_low.getValues<APInt>();
+  const auto high_values = edge_padding_high.getValues<APInt>();
+  const auto stride_values = strides.getValues<APInt>();
+  for (int i = 0; i < edge_padding_low.getNumElements(); ++i) {
+    AppendDataDim(data, i, low_values[i], high_values[i], stride_values[i]);
+  }
+  return data;
+}
+
+void AddStridedSliceOpIfRequiredImpl(
+    ConversionState& state, const StridedSliceData& strided_slice_data) {
+  if (absl::c_any_of(strided_slice_data.low, IsNot(0)) ||
+      absl::c_any_of(strided_slice_data.high, IsNot(0)) ||
+      absl::c_any_of(strided_slice_data.strides, IsNot(1))) {
+    const TensorType output_type = state.ComputeResultTensorType(
+        [](int i, const auto& shape, const auto& high, const auto& low,
+           const auto& strides) {
+          if (shape[i] < 0) {
+            return shape[i];
+          }
+          return (shape[i] + high[i] - low[i]) / strides[i];
+        },
+        strided_slice_data.high, strided_slice_data.low,
+        strided_slice_data.strides);
+
+    auto crop_begin_tensor = AddConstantTensor(state, strided_slice_data.low);
+    auto crop_end_tensor = AddConstantTensor(state, strided_slice_data.high);
+    auto crop_strides_tensor =
+        AddConstantTensor(state, strided_slice_data.strides);
+    auto tfl_crop = state.rewriter.create<TFL::StridedSliceOp>(
+        state.hlo_op->getLoc(), output_type, state.GetOperand(),
+        crop_begin_tensor, crop_end_tensor, crop_strides_tensor,
+        strided_slice_data.begin_mask, strided_slice_data.end_mask, 0, 0, 0,
+        false);
+
+    state.last_tf_op = tfl_crop;
+  }
+}
+
+}  // namespace
+
+void AddStridedSliceOpIfRequired(ConversionState& state,
+                                 const DenseElementsAttr& edge_padding_low,
+                                 const DenseElementsAttr& edge_padding_high,
+                                 const DenseElementsAttr& strides) {
+  StridedSliceData strided_slice_data =
+      GetStridedSliceData(edge_padding_low, edge_padding_high, strides);
+  AddStridedSliceOpIfRequiredImpl(state, strided_slice_data);
+}
+
 }  // namespace odml
 }  // namespace mlir
diff --git a/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/util.h b/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/util.h
index 72485597d6c4ff..8d63bfa1435324 100644
--- a/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/util.h
+++ b/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/util.h
@@ -16,12 +16,12 @@ limitations under the License.
 #ifndef TENSORFLOW_COMPILER_MLIR_LITE_STABLEHLO_TRANSFORMS_LEGALIZE_HLO_CONVERSIONS_UTIL_H_
 #define TENSORFLOW_COMPILER_MLIR_LITE_STABLEHLO_TRANSFORMS_LEGALIZE_HLO_CONVERSIONS_UTIL_H_
 
-#include <stdint.h>
-
-#include <cassert>
+#include <cstdint>
 
 #include "llvm/ADT/SmallVector.h"
+#include "mlir/Dialect/Arith/IR/Arith.h"  // from @llvm-project
 #include "mlir/IR/Block.h"  // from @llvm-project
+#include "mlir/IR/Builders.h"  // from @llvm-project
 #include "mlir/IR/BuiltinAttributes.h"  // from @llvm-project
 #include "mlir/IR/BuiltinTypeInterfaces.h"  // from @llvm-project
 #include "mlir/IR/BuiltinTypes.h"  // from @llvm-project
@@ -30,7 +30,6 @@ limitations under the License.
 #include "mlir/IR/Types.h"  // from @llvm-project
 #include "mlir/Support/LogicalResult.h"  // from @llvm-project
 #include "mlir/Transforms/DialectConversion.h"  // from @llvm-project
-#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h"
 #include "xla/mlir_hlo/mhlo/IR/hlo_ops.h"
 
 namespace mlir {
@@ -112,6 +111,138 @@ LogicalResult MatchBinaryReduceFunction(mlir::Region& function) {
 // scatter like ops.
 template <>
 LogicalResult MatchBinaryReduceFunction<void>(mlir::Region& function);
+
+// Concentrates the data needed to substitute StableHLO operations with TFLite
+// ones.
+struct ConversionState {
+  Operation* hlo_op;
+  ConversionPatternRewriter& rewriter;
+  Operation* last_tf_op;
+
+  // Returns the main operand of a NEW op to add to the conversion chain.
+  //
+  // This is generally the result of the last op that was added to the chain.
+  Value GetOperand() const;
+
+  // Returns the type of the operand of a NEW op to add to the conversion chain.
+  //
+  // This is generally the type of the result of the last op that was added to
+  // the chain.
+  TensorType GetOperandTensorType() const;
+
+  llvm::ArrayRef<int64_t> GetOperandShape() const;
+
+  // Computes a new shape from the current operand shape.
+  //
+  // - The args are containers that are indexable using operator[].
+  // - The callback must be callable and have a signature that is:
+  //      `int64_t (int idx, shape, decltype(args)...)`
+  //
+  // The callback is called for each element of the operand shape with the
+  // index of the current loop iteration, the shape and args.
+  template <class F, class... Containers>
+  llvm::SmallVector<int64_t, 6> ComputeResultShape(F&& callback,
+                                                   Containers&&... args) const {
+    llvm::ArrayRef<int64_t> shape = GetOperandShape();
+    llvm::SmallVector<int64_t, 6> res;
+    for (int i = 0; i < shape.size(); ++i) {
+      if (shape[i] < 0) {
+        res.push_back(shape[i]);
+      } else {
+        res.push_back(callback(i, shape, args...));
+      }
+    }
+    return res;
+  }
+
+  template <class F, class... Containers>
+  TensorType ComputeResultTensorType(F&& callback, Containers&&... args) const {
+    const llvm::SmallVector<int64_t, 6> shape = ComputeResultShape(
+        static_cast<F&&>(callback), static_cast<Containers&&>(args)...);
+    return GetOperandTensorType().cloneWith(
+        shape, GetOperandTensorType().getElementType());
+  }
+};
+
+// Gets the Type associated to type T from the builder.
+template <class T>
+Type GetElementType(OpBuilder& builder);
+
+#define GET_ELEMENT_TYPE_SPECIALISATION(TYPE, NAME)       \
+  template <>                                             \
+  inline Type GetElementType<TYPE>(OpBuilder & builder) { \
+    return builder.get##NAME##Type();                     \
+  }
+
+GET_ELEMENT_TYPE_SPECIALISATION(int32_t, I32);
+GET_ELEMENT_TYPE_SPECIALISATION(int64_t, I64);
+
+// Create a DenseElementsAttr from given shape and data.
+template <class Data, class Shape = llvm::SmallVector<int64_t, 6>>
+DenseElementsAttr CreateDenseElementsAttr(OpBuilder& builder, const Data& data,
+                                          const Shape& shape = Shape()) {
+  llvm::SmallVector<int64_t, 6> attr_shape(shape.begin(), shape.end());
+  if (attr_shape.empty()) {
+    attr_shape.push_back(static_cast<int64_t>(data.size()));
+  }
+  const Type attr_type = GetElementType<typename Data::value_type>(builder);
+  return DenseElementsAttr::get(RankedTensorType::get(attr_shape, attr_type),
+                                ArrayRef<typename Data::value_type>(data));
+}
+
+// Adds a constant tensor to the conversion chain.
+template <class Data, class Shape = llvm::SmallVector<int64_t, 6>>
+auto AddConstantTensor(ConversionState& state, const Data& data,
+                       const Shape& shape = Shape()) {
+  const DenseElementsAttr attr =
+      CreateDenseElementsAttr(state.rewriter, data, shape);
+  return state.rewriter.create<arith::ConstantOp>(state.hlo_op->getLoc(), attr);
+}
+
+// Builds a callable object that checks that its argument is not the given
+// `value`.
+template <class T>
+auto IsNot(T value) {
+  return [value](auto v) { return v != value; };
+}
+
+// Adds a TFLite Dilate operation to the conversion chain.
+//
+// If the given parameters would end with the identity operation, this does not
+// add anything to the chain.
+//
+// Depending on the definition of the op we are trying to legalize, a dilation
+// can be either seen as interior padding or as a scaling factor where:
+//
+//     scaling_factor = interior_padding + 1
+//
+// The is_padding parameter is used to take this difference into account.
+void AddDilateOpIfRequired(ConversionState& state,
+                           const DenseElementsAttr& dilation,
+                           Value padding_value, bool is_padding);
+
+// Adds a TFLite PadV2 operation to the conversion chain.
+//
+// If the given parameters would end with the identity operation, this does not
+// add anything to the chain.
+void AddPadOpIfRequired(ConversionState& state,
+                        const DenseElementsAttr& edge_padding_low,
+                        const DenseElementsAttr& edge_padding_high,
+                        Value padding_value);
+
+// Adds a TFLite StridedSlice operation to the conversion chain.
+//
+// This overload is used to legalize a crop operation in TFLite. As such, the
+// begin and end specifications of the strided slice are computed from the
+// negative values in the padding parameters.
+//
+// If the given parameters would end with the identity operation, this does not
+// add anything to the chain.
+void AddStridedSliceOpIfRequired(ConversionState& state,
+                                 const DenseElementsAttr& edge_padding_low,
+                                 const DenseElementsAttr& edge_padding_high,
+                                 const DenseElementsAttr& strides);
+
 }  // namespace odml
 }  // namespace mlir
 
diff --git a/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_patterns.td b/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_patterns.td
index ea0ab47b25ccd8..5030777490c5d8 100644
--- a/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_patterns.td
+++ b/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_patterns.td
@@ -304,6 +304,7 @@ def : Pat<(MHLO_DotGeneralOp:$old_value
                $dot_dimension_numbers, $precision_config),
           (ConvertDotGeneralOp $old_value)>;
 
+
 def IsZero : Constraint<CPred<
   "$0.isSplat() && $0.getSplatValue<APInt>() == 0">>;
 def ConvertPadOp : NativeCodeCall<
diff --git a/tensorflow/compiler/mlir/lite/stablehlo/transforms/tflite_legalize_hlo.cc b/tensorflow/compiler/mlir/lite/stablehlo/transforms/tflite_legalize_hlo.cc
index e3c52f9d529532..53a89ac16ee0b6 100644
--- a/tensorflow/compiler/mlir/lite/stablehlo/transforms/tflite_legalize_hlo.cc
+++ b/tensorflow/compiler/mlir/lite/stablehlo/transforms/tflite_legalize_hlo.cc
@@ -27,6 +27,7 @@ limitations under the License.
 #include "mlir/Support/TypeID.h"  // from @llvm-project
 #include "mlir/Transforms/DialectConversion.h"  // from @llvm-project
 #include "tensorflow/compiler/mlir/lite/ir/tfl_ops.h"  // IWYU pragma: keep
+#include "tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/pad.h"
 #include "tensorflow/compiler/mlir/lite/transforms/passes.h"  // IWYU pragma: keep
 #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h"  // IWYU pragma: keep
 #include "xla/mlir_hlo/mhlo/IR/hlo_ops.h"
@@ -52,13 +53,13 @@ void LegalizeHloToTfLitePass::runOnOperation() {
   MLIRContext& context = getContext();
   RewritePatternSet patterns(&getContext());
   // Add new conversion patterns here.
-  // patterns.add<>(&context);
+  patterns.add<odml::ConvertPadOp>(&context);
 
   ConversionTarget target(context);
   target.addLegalDialect<TFL::TensorFlowLiteDialect, mhlo::MhloDialect>();
   target.addLegalOp<func::CallOp, func::ConstantOp, arith::ConstantOp>();
   // Converted MHLO ops should be marked illegal here.
-  // target.addIllegalOp<>();
+  target.addIllegalOp<mhlo::PadOp>();
   if (failed(applyPartialConversion(getOperation(), target,
                                     std::move(patterns)))) {
     getOperation().emitError("mhlo to TFLite legalization failed.");

From fad9aa2ac91f765c164746263151668b8230535e Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 21 Sep 2023 10:04:52 -0700
Subject: [PATCH 091/567] Internal change only.

PiperOrigin-RevId: 567341035
---
 .../xla/third_party/tsl/tsl/platform/default/logging.h        | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/third_party/xla/third_party/tsl/tsl/platform/default/logging.h b/third_party/xla/third_party/tsl/tsl/platform/default/logging.h
index 0b934012c1f3cb..3dba4b3b8653b7 100644
--- a/third_party/xla/third_party/tsl/tsl/platform/default/logging.h
+++ b/third_party/xla/third_party/tsl/tsl/platform/default/logging.h
@@ -583,6 +583,10 @@ class TFLogEntry {
   std::string ToString() const { return message_; }
   absl::string_view text_message() const { return message_; }
 
+  // Returns the same result as `text_message`, as there is no prefix in this
+  // implementation.
+  absl::string_view text_message_with_prefix() const { return message_; }
+
  private:
   const absl::LogSeverity severity_;
   const std::string fname_;

From 7e711e5754d30c78201885e9ae0ad2eaebb109b3 Mon Sep 17 00:00:00 2001
From: Eugene Zhulenev <ezhulenev@google.com>
Date: Thu, 21 Sep 2023 11:04:49 -0700
Subject: [PATCH 092/567] [stream_executor] NFC: Add dnn dependency to stream
 executor impl

https://github.com/openxla/xla/issues/5761

PiperOrigin-RevId: 567359965
---
 third_party/xla/xla/stream_executor/BUILD | 1 +
 1 file changed, 1 insertion(+)

diff --git a/third_party/xla/xla/stream_executor/BUILD b/third_party/xla/xla/stream_executor/BUILD
index f2c8cffb029550..677b085f83a7db 100644
--- a/third_party/xla/xla/stream_executor/BUILD
+++ b/third_party/xla/xla/stream_executor/BUILD
@@ -781,6 +781,7 @@ cc_library(
     deps = [
         ":device_description",
         ":device_memory",
+        ":dnn",
         ":dnn_proto_cc",
         ":event",
         ":kernel",

From 0c637e6d2eb9223355693e543c28fc4a705ce3e1 Mon Sep 17 00:00:00 2001
From: Raviteja Gorijala <gorijala@google.com>
Date: Thu, 21 Sep 2023 11:16:14 -0700
Subject: [PATCH 093/567] Update release branch string for Apple Silicon MacOS
 builds

PiperOrigin-RevId: 567363555
---
 tensorflow/tools/ci_build/update_version.py | 33 ++++++++++++++++++++-
 1 file changed, 32 insertions(+), 1 deletion(-)

diff --git a/tensorflow/tools/ci_build/update_version.py b/tensorflow/tools/ci_build/update_version.py
index 513891192d8d2d..ea7fd29e58745f 100755
--- a/tensorflow/tools/ci_build/update_version.py
+++ b/tensorflow/tools/ci_build/update_version.py
@@ -36,7 +36,22 @@
 SETUP_PY = "%s/tools/pip_package/setup.py" % TF_SRC_DIR
 README_MD = "./README.md"
 TENSORFLOW_BZL = "%s/tensorflow.bzl" % TF_SRC_DIR
-RELEVANT_FILES = [TF_SRC_DIR, VERSION_H, SETUP_PY, README_MD]
+TF_MAC_ARM64_CI_BUILD = (
+    "%s/tools/ci_build/osx/arm64/tensorflow_as_build_release.Jenkinsfile"
+    % TF_SRC_DIR
+)
+TF_MAC_ARM64_CI_TEST = (
+    "%s/tools/ci_build/osx/arm64/tensorflow_as_test_release.Jenkinsfile"
+    % TF_SRC_DIR
+)
+RELEVANT_FILES = [
+    TF_SRC_DIR,
+    VERSION_H,
+    SETUP_PY,
+    README_MD,
+    TF_MAC_ARM64_CI_BUILD,
+    TF_MAC_ARM64_CI_TEST
+]
 
 # Version type parameters.
 NIGHTLY_VERSION = 1
@@ -221,6 +236,20 @@ def update_tensorflow_bzl(old_version, new_version):
                          'VERSION = "%s"' % new_mmp, TENSORFLOW_BZL)
 
 
+def update_m1_builds(old_version, new_version):
+  """Update M1 builds."""
+  replace_string_in_line(
+      "RELEASE_BRANCH = 'r%s.%s'" % (old_version.major, old_version.minor),
+      "RELEASE_BRANCH = 'r%s.%s'" % (new_version.major, new_version.minor),
+      TF_MAC_ARM64_CI_BUILD,
+  )
+  replace_string_in_line(
+      "RELEASE_BRANCH = 'r%s.%s'" % (old_version.major, old_version.minor),
+      "RELEASE_BRANCH = 'r%s.%s'" % (new_version.major, new_version.minor),
+      TF_MAC_ARM64_CI_TEST,
+  )
+
+
 def major_minor_change(old_version, new_version):
   """Check if a major or minor change occurred."""
   major_mismatch = old_version.major != new_version.major
@@ -301,6 +330,8 @@ def main():
                             NIGHTLY_VERSION)
   else:
     new_version = Version.parse_from_string(args.version, REGULAR_VERSION)
+    # Update Apple Silicon release CI files for release builds only
+    update_m1_builds(old_version, new_version)
 
   update_version_h(old_version, new_version)
   update_setup_dot_py(old_version, new_version)

From da4b96f223fee680d305d79e0ca35c6c7810ee9c Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 21 Sep 2023 11:16:50 -0700
Subject: [PATCH 094/567] Free GPU memory Streamz Metric.

PiperOrigin-RevId: 567363752
---
 tensorflow/core/tfrt/saved_model/BUILD        |  2 ++
 .../core/tfrt/saved_model/saved_model.cc      |  9 +++++++-
 .../core/tfrt/saved_model/saved_model_util.cc | 23 +++++++++++++++++++
 .../core/tfrt/saved_model/saved_model_util.h  |  4 ++++
 4 files changed, 37 insertions(+), 1 deletion(-)

diff --git a/tensorflow/core/tfrt/saved_model/BUILD b/tensorflow/core/tfrt/saved_model/BUILD
index fc06ddd3911993..5f1d8cc73c1aff 100644
--- a/tensorflow/core/tfrt/saved_model/BUILD
+++ b/tensorflow/core/tfrt/saved_model/BUILD
@@ -269,6 +269,8 @@ cc_library(
         "@llvm-project//llvm:Support",
         "@llvm-project//mlir:IR",
         "@local_tsl//tsl/platform:protobuf",
+        "@local_xla//xla/stream_executor",
+        "@local_xla//xla/stream_executor/gpu:gpu_init",
         "@tf_runtime//:bef",
         "@tf_runtime//:hostcontext",
         "@tf_runtime//:init_tfrt_dialects",
diff --git a/tensorflow/core/tfrt/saved_model/saved_model.cc b/tensorflow/core/tfrt/saved_model/saved_model.cc
index 81eb443bb3bb0d..e9134850d72cd4 100644
--- a/tensorflow/core/tfrt/saved_model/saved_model.cc
+++ b/tensorflow/core/tfrt/saved_model/saved_model.cc
@@ -752,12 +752,19 @@ tensorflow::Status SavedModelImpl::Run(
   DCHECK(runner_table);
   DCHECK(resource_array);
 
-  return GraphExecutionRunOnFunction(
+  auto status = GraphExecutionRunOnFunction(
       options_.graph_execution_options, run_options, name, *symbol_uids, func,
       loaded_executable, inputs, outputs, resource_context,
       client_graph_resource_context, runner_table, resource_array, runtime(),
       *fallback_state_, fallback_state_->process_function_library_runtime(),
       &req_deadline_tracker_, /*stream_callback_id=*/std::nullopt);
+
+  if (options_.graph_execution_options.compile_options.device_target ==
+      TfrtDeviceInfraTarget::kGpu) {
+    RecordFreeGpuMemory();
+  }
+
+  return status;
 }
 
 struct SavedModelImpl::JoinedSignature {
diff --git a/tensorflow/core/tfrt/saved_model/saved_model_util.cc b/tensorflow/core/tfrt/saved_model/saved_model_util.cc
index f7c4463ce06213..dc3671c57cab11 100644
--- a/tensorflow/core/tfrt/saved_model/saved_model_util.cc
+++ b/tensorflow/core/tfrt/saved_model/saved_model_util.cc
@@ -47,6 +47,9 @@ limitations under the License.
 #include "tensorflow/compiler/mlir/tfrt/transforms/gpu_passes.h"
 #include "tensorflow/compiler/mlir/tfrt/translate/import_model.h"
 #include "tensorflow/compiler/mlir/tfrt/translate/tfrt_compile_options.h"
+#include "xla/stream_executor/gpu/gpu_init.h"
+#include "xla/stream_executor/platform.h"
+#include "xla/stream_executor/stream_executor_pimpl.h"
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/lib/monitoring/gauge.h"
 #include "tensorflow/core/protobuf/meta_graph.pb.h"
@@ -78,6 +81,10 @@ auto* saved_model_grappler_time_seconds =
         "/tensorflow/tfrt/saved_model/grappler_time",
         "Record the grappler time for the savedmodel.", "model_name");
 
+auto* free_gpu_memory = tensorflow::monitoring::Gauge<int64_t, 1>::New(
+    "/tensorflow/tfrt/saved_model/free_gpu_memory",
+    "Record the free GPU memory.", "gpu_id");
+
 std::vector<std::string> FindNamesForValidSignatures(
     const tensorflow::MetaGraphDef& meta_graph_def) {
   std::vector<std::string> valid_signature_names;
@@ -278,5 +285,21 @@ void RegisterTFRTDialectsForAoT(mlir::DialectRegistry& registry) {
   tensorflow::RegisterGpuDialects(&registry);
 }
 
+void RecordFreeGpuMemory() {
+  se::Platform* gpu_manager = se::GPUMachineManager();
+  if (gpu_manager == nullptr || gpu_manager->VisibleDeviceCount() <= 0) return;
+  // The query must run outside DCHECK, which skips its argument in opt builds.
+  for (int i = 0; i < gpu_manager->VisibleDeviceCount(); ++i) {
+    se::StreamExecutor* se = gpu_manager->ExecutorForDevice(i).value();
+    int64_t free_memory = 0, total_memory = 0;
+    if (!se->DeviceMemoryUsage(&free_memory, &total_memory)) continue;
+    free_gpu_memory->GetCell(std::to_string(i))->Set(free_memory);
+  }
+}
+
+int64_t GetFreeGpuMemory(int gpu_id) {
+  return free_gpu_memory->GetCell(std::to_string(gpu_id))->value();
+}
+
 }  // namespace tfrt_stub
 }  // namespace tensorflow
diff --git a/tensorflow/core/tfrt/saved_model/saved_model_util.h b/tensorflow/core/tfrt/saved_model/saved_model_util.h
index 4b4608cdef1fbf..00fe1f87df7f9c 100644
--- a/tensorflow/core/tfrt/saved_model/saved_model_util.h
+++ b/tensorflow/core/tfrt/saved_model/saved_model_util.h
@@ -131,6 +131,10 @@ absl::Status DeserializeAoTMlirModule(
 
 void RegisterTFRTDialectsForAoT(mlir::DialectRegistry& registry);
 
+void RecordFreeGpuMemory();
+
+int64_t GetFreeGpuMemory(int gpu_id);
+
 }  // namespace tfrt_stub
 }  // namespace tensorflow
 

From 4280d60e8bce6bf7e20ab6bdd0170024314314d8 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 21 Sep 2023 11:29:02 -0700
Subject: [PATCH 095/567] Add support for the kOptimizationBarrier HLO in
 auto-sharding. We treat the op as an elementwise op.

PiperOrigin-RevId: 567367690
---
 .../xla/xla/hlo/experimental/auto_sharding/auto_sharding.cc      | 1 +
 1 file changed, 1 insertion(+)

diff --git a/third_party/xla/xla/hlo/experimental/auto_sharding/auto_sharding.cc b/third_party/xla/xla/hlo/experimental/auto_sharding/auto_sharding.cc
index 8d2fda21ec47b8..e0b07bb3d98348 100644
--- a/third_party/xla/xla/hlo/experimental/auto_sharding/auto_sharding.cc
+++ b/third_party/xla/xla/hlo/experimental/auto_sharding/auto_sharding.cc
@@ -1893,6 +1893,7 @@ BuildStrategyAndCost(const HloInstructionSequence& sequence,
       case HloOpcode::kCbrt:
       case HloOpcode::kTan:
       case HloOpcode::kTanh:
+      case HloOpcode::kOptimizationBarrier:
       // Binary elementwise operations
       case HloOpcode::kAdd:
       case HloOpcode::kAtan2:

From 7cdbe3889e8fcfec86ed3378f4fc57d312a00a95 Mon Sep 17 00:00:00 2001
From: Ilia Sergachev <sergachev@google.com>
Date: Thu, 21 Sep 2023 11:43:02 -0700
Subject: [PATCH 096/567] [XLA:GPU] Move dot dimension merger before padding
 for cuBLAS to enable more Triton fusions.

Inside the padding pass for cuBLAS the Triton GEMM rewriter tells whether a dot should be handled by Triton, if so padding is skipped and Triton is used later. Merging dot dimensions before this makes Triton handle more of them.

PiperOrigin-RevId: 567372057
---
 third_party/xla/xla/service/gpu/BUILD         | 18 +++++++------
 .../xla/xla/service/gpu/gpu_compiler.cc       |  3 ---
 .../xla/xla/service/gpu/nvptx_compiler.cc     |  4 ++-
 .../xla/service/gpu/nvptx_compiler_test.cc    | 26 +++++++++++++++++--
 4 files changed, 37 insertions(+), 14 deletions(-)

diff --git a/third_party/xla/xla/service/gpu/BUILD b/third_party/xla/xla/service/gpu/BUILD
index 8646507b0eef2f..abba4f41bd9c97 100644
--- a/third_party/xla/xla/service/gpu/BUILD
+++ b/third_party/xla/xla/service/gpu/BUILD
@@ -2669,7 +2669,6 @@ cc_library(
         "//xla/service:convolution_pred_expander",
         "//xla/service:copy_insertion",
         "//xla/service:dot_decomposer",
-        "//xla/service:dot_dimension_merger",
         "//xla/service:dot_merger",
         "//xla/service:dump",
         "//xla/service:dynamic_dimension_simplifier",
@@ -2853,6 +2852,7 @@ cc_library(
         "//xla/service:convert_mover",
         "//xla/service:dump",
         "//xla/hlo/ir:hlo",
+        "//xla/service:dot_dimension_merger",
         "//xla/service:float_normalization",
         "//xla/service:float_support",
         "//xla/service:hlo_constant_folding",
@@ -2890,21 +2890,23 @@ xla_cc_test(
     srcs = if_gpu_is_configured([
         "nvptx_compiler_test.cc",
     ]),
-    tags = tf_cuda_tests_tags() + [
-        "no_rocm",
-        "nomsan",  # Pulls in precompiled NVIDIA libraries which cause false
-        # positives in msan.
+    tags = [
+        "nomsan",  # Pulls in precompiled NVIDIA libraries which cause false positives in msan.
+        "requires-gpu-sm70",
     ],
     deps = [
         ":nvptx_compiler_impl",
-        "//xla:status_macros",
+        "//xla:statusor",
         "//xla:util",
         "//xla/hlo/ir:hlo",
+        "//xla/service:backend",
         "//xla/service:buffer_assignment",
         "//xla/service:gpu_plugin",
-        "//xla/service:hlo_parser",
         "//xla/tests:hlo_test_base",
-        "//xla/tests:xla_internal_test_main",  # build_cleaner: keep
+        "//xla/tests:xla_internal_test_main",
+        "@com_google_absl//absl/strings",
+        "@com_google_googletest//:gtest",
+        "@local_tsl//tsl/platform:statusor",
     ],
 )
 
diff --git a/third_party/xla/xla/service/gpu/gpu_compiler.cc b/third_party/xla/xla/service/gpu/gpu_compiler.cc
index bad85be4677da6..1d5c7c106906be 100644
--- a/third_party/xla/xla/service/gpu/gpu_compiler.cc
+++ b/third_party/xla/xla/service/gpu/gpu_compiler.cc
@@ -75,7 +75,6 @@ limitations under the License.
 #include "xla/service/convolution_pred_expander.h"
 #include "xla/service/copy_insertion.h"
 #include "xla/service/dot_decomposer.h"
-#include "xla/service/dot_dimension_merger.h"
 #include "xla/service/dot_merger.h"
 #include "xla/service/dump.h"
 #include "xla/service/dynamic_dimension_simplifier.h"
@@ -903,8 +902,6 @@ Status GpuCompiler::OptimizeHloPostLayoutAssignment(
   {
     HloPassPipeline pipeline("hlo normalization");
 
-    pipeline.AddPass<DotDimensionMerger>();
-
     // The LayoutAssignment pass may leave behind kCopy instructions which are
     // duplicate or NOPs, so remove them with algebraic simplification and CSE.
     AlgebraicSimplifierOptions options;
diff --git a/third_party/xla/xla/service/gpu/nvptx_compiler.cc b/third_party/xla/xla/service/gpu/nvptx_compiler.cc
index 01cf4fed6a3e1f..8dd3b2fc8d74c6 100644
--- a/third_party/xla/xla/service/gpu/nvptx_compiler.cc
+++ b/third_party/xla/xla/service/gpu/nvptx_compiler.cc
@@ -22,7 +22,6 @@ limitations under the License.
 #include <string>
 #include <tuple>
 #include <utility>
-#include <variant>
 #include <vector>
 
 #include "absl/base/call_once.h"
@@ -33,6 +32,7 @@ limitations under the License.
 #include "xla/service/algebraic_simplifier.h"
 #include "xla/service/call_inliner.h"
 #include "xla/service/convert_mover.h"
+#include "xla/service/dot_dimension_merger.h"
 #include "xla/service/dump.h"
 #include "xla/service/float_normalization.h"
 #include "xla/service/float_support.h"
@@ -244,6 +244,8 @@ Status NVPTXCompiler::OptimizeHloPostLayoutAssignment(
     TF_RETURN_IF_ERROR(mha_fusion_pipeline.Run(hlo_module).status());
   }
 
+  pre_pipeline.AddPass<DotDimensionMerger>();
+
   for (const CublasPaddingRequirement& requirement :
        CublasPaddingRequirements) {
     if (cuda_compute_capability.IsAtLeast(requirement.min_compute_capability)) {
diff --git a/third_party/xla/xla/service/gpu/nvptx_compiler_test.cc b/third_party/xla/xla/service/gpu/nvptx_compiler_test.cc
index 264f2df97cf5db..9a76f025c9341b 100644
--- a/third_party/xla/xla/service/gpu/nvptx_compiler_test.cc
+++ b/third_party/xla/xla/service/gpu/nvptx_compiler_test.cc
@@ -17,12 +17,15 @@ limitations under the License.
 
 #include <memory>
 
+#include <gtest/gtest.h>
+#include "absl/strings/string_view.h"
 #include "xla/hlo/ir/hlo_instruction.h"
+#include "xla/service/backend.h"
 #include "xla/service/buffer_assignment.h"
-#include "xla/service/hlo_parser.h"
-#include "xla/status_macros.h"
+#include "xla/statusor.h"
 #include "xla/tests/hlo_test_base.h"
 #include "xla/util.h"
+#include "tsl/platform/statusor.h"
 
 namespace xla {
 namespace gpu {
@@ -93,5 +96,24 @@ ENTRY entry {
       all_reduce, {1}, all_reduce->operand(1), {}));
 }
 
+TEST_F(NVPTXCompilerTest,
+       DotDimensionAreSortedBeforePaddingForCublasEnablingTritonFusion) {
+  MatchOptimizedHlo(R"(
+ENTRY e {
+ p0 = f16[11,22,33,44] parameter(0)
+ p1 = s8[11,22,33,44] parameter(1)
+ p1c = f16[11,22,33,44] convert(p1)
+ ROOT d = f16[11,22,44,44] dot(p0, p1c),
+  lhs_batch_dims={0,1}, lhs_contracting_dims={2},
+  rhs_batch_dims={0,1}, rhs_contracting_dims={2}
+})",
+                    R"(
+; CHECK: ENTRY
+; CHECK-NEXT: parameter
+; CHECK-NEXT: parameter
+; CHECK-NEXT: __triton_gemm
+  )");
+}
+
 }  // namespace gpu
 }  // namespace xla

From dad0396ad15f178b58a50d612aec47af7ffea15f Mon Sep 17 00:00:00 2001
From: Eugene Zhulenev <ezhulenev@google.com>
Date: Thu, 21 Sep 2023 11:58:24 -0700
Subject: [PATCH 097/567] [stream_executor] NFC: Remove unused timer target

https://github.com/openxla/xla/issues/5761

PiperOrigin-RevId: 567376822
---
 .../c/experimental/stream_executor/BUILD      |  1 -
 third_party/xla/xla/service/BUILD             |  1 -
 third_party/xla/xla/stream_executor/BUILD     | 34 -------------------
 3 files changed, 36 deletions(-)

diff --git a/tensorflow/c/experimental/stream_executor/BUILD b/tensorflow/c/experimental/stream_executor/BUILD
index 84df975cffa367..39b81d93fb1723 100644
--- a/tensorflow/c/experimental/stream_executor/BUILD
+++ b/tensorflow/c/experimental/stream_executor/BUILD
@@ -49,7 +49,6 @@ cc_library(
         "@local_xla//xla/stream_executor:multi_platform_manager",
         "@local_xla//xla/stream_executor:platform",
         "@local_xla//xla/stream_executor:stream_executor_pimpl",
-        "@local_xla//xla/stream_executor:timer",
     ],
 )
 
diff --git a/third_party/xla/xla/service/BUILD b/third_party/xla/xla/service/BUILD
index 934cbf95b298be..04003d8f96c093 100644
--- a/third_party/xla/xla/service/BUILD
+++ b/third_party/xla/xla/service/BUILD
@@ -1448,7 +1448,6 @@ cc_library(
         "//xla/stream_executor",
         "//xla/stream_executor:device_description",
         "//xla/stream_executor:device_memory_allocator",
-        "//xla/stream_executor:timer",
         "@com_google_absl//absl/log:check",
         "@com_google_absl//absl/memory",
         "@com_google_absl//absl/strings:str_format",
diff --git a/third_party/xla/xla/stream_executor/BUILD b/third_party/xla/xla/stream_executor/BUILD
index 677b085f83a7db..15e3642d4afd89 100644
--- a/third_party/xla/xla/stream_executor/BUILD
+++ b/third_party/xla/xla/stream_executor/BUILD
@@ -557,40 +557,6 @@ cc_library(
     ],
 )
 
-cc_library(
-    name = "timer",
-    srcs = [
-        "device_description.h",
-        "kernel_cache_config.h",
-    ],
-    hdrs = [
-        "blas.h",
-        "kernel.h",
-        "stream.h",
-        "stream_executor.h",
-    ],
-    visibility = ["//visibility:public"],
-    deps = [
-        ":data_type",
-        ":device_description",
-        ":device_description_proto_cc",
-        ":kernel_cache_config",
-        ":platform",
-        ":stream_executor_headers",
-        ":stream_executor_pimpl_header",
-        "//xla/stream_executor/platform",
-        "@com_google_absl//absl/algorithm:container",
-        "@com_google_absl//absl/base:core_headers",
-        "@com_google_absl//absl/functional:any_invocable",
-        "@com_google_absl//absl/strings",
-        "@com_google_absl//absl/synchronization",
-        "@com_google_absl//absl/types:span",
-        "@local_tsl//tsl/platform:logging",
-        "@local_tsl//tsl/platform:statusor",
-        "@local_tsl//tsl/protobuf:dnn_proto_cc",
-    ],
-)
-
 cc_library(
     name = "blas",
     srcs = ["blas.cc"],

From 0b836ba91ec31f7b64b22471bc0e3a03090c5ce5 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 21 Sep 2023 12:15:29 -0700
Subject: [PATCH 098/567] Implement Zero-Point-Offset Calculation for Lowering
 Dot/DotGeneral Ops in ConvertMhloQuantToInt Pass

Currently this implementation supports DotGeneral and Dot (which is a special case of DotGeneral).

It supports static shapes only. Dynamic shapes support can be added later if needed.

PiperOrigin-RevId: 567382083
---
 .../mlir/quantization/stablehlo/BUILD         |   1 +
 .../bridge/convert_mhlo_quant_to_int.cc       | 299 +++++++++++++-
 .../bridge/convert-mhlo-quant-to-int.mlir     | 384 ++++++++++++++++--
 3 files changed, 646 insertions(+), 38 deletions(-)

diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/BUILD b/tensorflow/compiler/mlir/quantization/stablehlo/BUILD
index a4618d6de03082..77cb6b3f38ad06 100644
--- a/tensorflow/compiler/mlir/quantization/stablehlo/BUILD
+++ b/tensorflow/compiler/mlir/quantization/stablehlo/BUILD
@@ -142,6 +142,7 @@ cc_library(
         "//tensorflow/core/framework:numeric_types",
         "//tensorflow/core/util/quantization:uniform_quant_ops_attr_proto_cc",
         "//tensorflow/core/util/quantization:uniform_quant_ops_params",
+        "@com_google_absl//absl/algorithm:container",
         "@com_google_absl//absl/log",
         "@com_google_absl//absl/strings",
         "@llvm-project//llvm:Support",
diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/passes/bridge/convert_mhlo_quant_to_int.cc b/tensorflow/compiler/mlir/quantization/stablehlo/passes/bridge/convert_mhlo_quant_to_int.cc
index 255c69573f08eb..0a081269b4db4e 100644
--- a/tensorflow/compiler/mlir/quantization/stablehlo/passes/bridge/convert_mhlo_quant_to_int.cc
+++ b/tensorflow/compiler/mlir/quantization/stablehlo/passes/bridge/convert_mhlo_quant_to_int.cc
@@ -21,6 +21,7 @@ limitations under the License.
 #include <string_view>
 #include <utility>
 
+#include "absl/algorithm/container.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/Support/Casting.h"
@@ -29,11 +30,14 @@ limitations under the License.
 #include "mlir/Dialect/Quant/QuantTypes.h"  // from @llvm-project
 #include "mlir/Dialect/Shape/IR/Shape.h"  // from @llvm-project
 #include "mlir/Dialect/SparseTensor/IR/SparseTensor.h"  // from @llvm-project
+#include "mlir/IR/Attributes.h"  // from @llvm-project
 #include "mlir/IR/BuiltinAttributes.h"  // from @llvm-project
+#include "mlir/IR/BuiltinTypeInterfaces.h"  // from @llvm-project
 #include "mlir/IR/BuiltinTypes.h"  // from @llvm-project
 #include "mlir/IR/MLIRContext.h"  // from @llvm-project
 #include "mlir/IR/OperationSupport.h"  // from @llvm-project
 #include "mlir/IR/PatternMatch.h"  // from @llvm-project
+#include "mlir/IR/Region.h"  // from @llvm-project
 #include "mlir/IR/TypeUtilities.h"  // from @llvm-project
 #include "mlir/Pass/Pass.h"  // from @llvm-project
 #include "mlir/Support/LLVM.h"  // from @llvm-project
@@ -591,6 +595,256 @@ LogicalResult matchAndRewriteDotLikeOp(OpType &op, OpAdaptorType &adaptor,
   return success();
 }
 
+Value CreateZeroPointPartialOffset(OpBuilder &builder, Location loc,
+                                   Value tensor, const int64_t other_tensor_zp,
+                                   ArrayRef<int64_t> contracting_dims) {
+  // This function calculates part of the zero-point offset by using
+  // mhlo::Reduce to sum over the contracting dims of the tensor, and then
+  // multiplying the result by the zero point (zp) of the other tensor.
+  auto output_element_type = builder.getI32Type();
+
+  // Calculate the output tensor shape. This is input tensor dims minus
+  // contracting dims.
+  auto ranked_tensor = tensor.getType().dyn_cast<RankedTensorType>();
+  llvm::SmallVector<int64_t> output_dims;
+  for (int64_t i = 0; i < ranked_tensor.getRank(); ++i) {
+    if (absl::c_count(contracting_dims, i) == 0) {
+      output_dims.push_back(ranked_tensor.getDimSize(i));
+    }
+  }
+
+  // Convert input tensor to output type since mhlo::Reduce only supports same
+  // element type for input/output.
+  tensor = builder.create<mhlo::ConvertOp>(
+      loc, tensor.getType().dyn_cast<TensorType>().clone(output_element_type),
+      tensor);
+  auto reducer_tensor_type = RankedTensorType::get({}, output_element_type);
+
+  // Initial value for the reduced tensor. This is set to 0.
+  Value init_values = builder.create<mhlo::ConstantOp>(
+      loc, DenseIntElementsAttr::get(reducer_tensor_type, {0}));
+  mhlo::ReduceOp reduce = builder.create<mhlo::ReduceOp>(
+      loc, RankedTensorType::get(output_dims, output_element_type), tensor,
+      init_values, builder.getI64TensorAttr(contracting_dims));
+  // Define reducer function to compute sum.
+  Region &region = reduce.getBody();
+  Block &block = region.emplaceBlock();
+  block.addArgument(reducer_tensor_type, loc);
+  block.addArgument(reducer_tensor_type, loc);
+  auto *firstArgument = block.args_begin();
+  auto secondArgument = block.args_rbegin();
+  {
+    OpBuilder::InsertionGuard guard(builder);
+    builder.setInsertionPointToStart(&block);
+    Value sum =
+        builder.create<mhlo::AddOp>(loc, *firstArgument, *secondArgument);
+    builder.create<mhlo::ReturnOp>(loc, sum);
+  }
+  Value zp = builder.create<mhlo::ConstantOp>(
+      loc, builder.getI32IntegerAttr(other_tensor_zp));
+  Value mul_op = builder.create<chlo::BroadcastMulOp>(loc, reduce.getResult(0),
+                                                      zp, nullptr);
+  return mul_op;
+}
+
+llvm::SmallVector<int64_t> CalculateBroadcastDims(
+    Value zp_contribution, llvm::ArrayRef<int64_t> contracting_dims,
+    llvm::ArrayRef<int64_t> batching_dims, int64_t non_batching_starting_idx) {
+  // This function calculates the dims for broadcasting from the
+  // zero-point-offset tensor to the final output tensor.
+  auto zp_contribution_rank =
+      zp_contribution.getType().dyn_cast<ShapedType>().getRank();
+  llvm::SmallVector<int64_t> broadcast_dims;
+  broadcast_dims.resize(zp_contribution_rank, 0);
+  // The result tensor has batching dims first, then LHS result dims, then
+  // RHS result dims, so the non-batching result dim index doesn't start at 0.
+  // The arg non_batching_starting_idx is used to distinguish LHS from RHS.
+  int64_t result_batching_idx = 0;
+  int64_t result_non_batching_idx = non_batching_starting_idx;
+  for (int64_t idx = 0, original_idx = 0; idx < zp_contribution_rank;
+       ++idx, ++original_idx) {
+    // zp_contribution no longer has the tensor's contracting dims. The
+    // following skips them to recover the index in the original tensor.
+    while (absl::c_count(contracting_dims, original_idx) != 0) {
+      original_idx++;
+    }
+    if (absl::c_count(batching_dims, original_idx) == 0) {
+      broadcast_dims[idx] = result_non_batching_idx++;
+    } else {
+      broadcast_dims[idx] = result_batching_idx++;
+    }
+  }
+  return broadcast_dims;
+}
+
+Value CalculateZeroPointOffset(OpBuilder &builder, Location loc, Value lhs,
+                               Value rhs, int64_t lhs_zp, int64_t rhs_zp,
+                               mhlo::DotDimensionNumbersAttr dims) {
+  // According to StableHLO spec, the output tensor has dims in the following
+  // order:
+  //   batching dims, LHS result dims, RHS result dims
+  // where LHS/RHS result dims are any dims that are neither batching dims nor
+  // contracting dims.
+  llvm::SmallVector<int64_t> output_dims;
+  mlir::ShapedType lhs_shape = lhs.getType().cast<mlir::ShapedType>();
+  mlir::ShapedType rhs_shape = rhs.getType().cast<mlir::ShapedType>();
+  for (int64_t i = 0; i < lhs_shape.getRank(); ++i) {
+    if (absl::c_count(dims.getLhsBatchingDimensions(), i) != 0) {
+      output_dims.push_back(lhs_shape.getDimSize(i));
+    }
+  }
+  for (int64_t i = 0; i < lhs_shape.getRank(); ++i) {
+    if (absl::c_count(dims.getLhsContractingDimensions(), i) == 0 &&
+        absl::c_count(dims.getLhsBatchingDimensions(), i) == 0) {
+      output_dims.push_back(lhs_shape.getDimSize(i));
+    }
+  }
+  for (int64_t i = 0; i < rhs_shape.getRank(); ++i) {
+    if (absl::c_count(dims.getRhsContractingDimensions(), i) == 0 &&
+        absl::c_count(dims.getRhsBatchingDimensions(), i) == 0) {
+      output_dims.push_back(rhs_shape.getDimSize(i));
+    }
+  }
+  auto output_element_type = builder.getI32Type();
+  auto output_tensor_type =
+      RankedTensorType::get(output_dims, output_element_type);
+
+  Value result = builder.create<mhlo::ConstantOp>(
+      loc, DenseIntElementsAttr::get(output_tensor_type, {0}));
+
+  // Calculate LHS contribution when RHS zp is non-zero.
+  if (rhs_zp != 0) {
+    Value lhs_zp_contribution = CreateZeroPointPartialOffset(
+        builder, loc, lhs, rhs_zp, dims.getLhsContractingDimensions());
+    // Broadcast lhs ZP contribution to result tensor shape.
+    llvm::SmallVector<int64_t> broadcast_dims = CalculateBroadcastDims(
+        lhs_zp_contribution, dims.getLhsContractingDimensions(),
+        dims.getLhsBatchingDimensions(),
+        dims.getLhsBatchingDimensions().size());
+    lhs_zp_contribution = builder.create<mhlo::BroadcastInDimOp>(
+        loc, output_tensor_type, lhs_zp_contribution,
+        DenseIntElementsAttr::get(
+            RankedTensorType::get({static_cast<int64_t>(broadcast_dims.size())},
+                                  builder.getI64Type()),
+            broadcast_dims));
+    result = builder.create<mhlo::AddOp>(loc, result, lhs_zp_contribution);
+  }
+  // Calculate RHS contribution when LHS zp is non-zero.
+  if (lhs_zp != 0) {
+    Value rhs_zp_contribution = CreateZeroPointPartialOffset(
+        builder, loc, rhs, lhs_zp, dims.getRhsContractingDimensions());
+    // Broadcast rhs ZP contribution to result tensor shape.
+    llvm::SmallVector<int64_t> broadcast_dims = CalculateBroadcastDims(
+        rhs_zp_contribution, dims.getRhsContractingDimensions(),
+        dims.getRhsBatchingDimensions(),
+        lhs_shape.getRank() - dims.getLhsContractingDimensions().size());
+
+    rhs_zp_contribution = builder.create<mhlo::BroadcastInDimOp>(
+        loc, output_tensor_type, rhs_zp_contribution,
+        DenseIntElementsAttr::get(
+            RankedTensorType::get({static_cast<int64_t>(broadcast_dims.size())},
+                                  builder.getI64Type()),
+            broadcast_dims));
+    result = builder.create<mhlo::AddOp>(loc, result, rhs_zp_contribution);
+  }
+
+  if (lhs_zp != 0 && rhs_zp != 0) {
+    // Contribution from LHS_ZP * RHS_ZP.
+    // This is multiplied by the product of all contracting dimension sizes.
+    int32_t contracting_dim_total = 1;
+    for (const int64_t rhs_idx : dims.getRhsContractingDimensions()) {
+      contracting_dim_total *= rhs_shape.getDimSize(rhs_idx);
+    }
+    const int32_t zp_constant_offset = static_cast<int32_t>(lhs_zp) *
+                                       static_cast<int32_t>(rhs_zp) *
+                                       contracting_dim_total;
+    auto zp_offset_value = builder.create<mhlo::ConstantOp>(
+        loc, builder.getI32IntegerAttr(zp_constant_offset));
+    result = builder.create<chlo::BroadcastSubOp>(loc, result, zp_offset_value,
+                                                  nullptr);
+  }
+  return result;
+}
+
+template <typename DotOp, typename DotOpAdaptor>
+LogicalResult RewriteDotGeneralOp(DotOp op, DotOpAdaptor adaptor,
+                                  ArrayRef<NamedAttribute> attrs,
+                                  const mhlo::DotDimensionNumbersAttr &dims,
+                                  ConversionPatternRewriter &rewriter) {
+  // Lowers uniform-quantized (UQ) Dot/DotGeneral ops to an int DotGeneral.
+  // Assumes that operands and results are static-shape tensors of UQ types.
+  auto lhs_element_quant_type =
+      getElementTypeOrSelf(op.getLhs().getType())
+          .template dyn_cast<quant::UniformQuantizedType>();
+  auto rhs_element_quant_type =
+      getElementTypeOrSelf(op.getRhs().getType())
+          .template dyn_cast<quant::UniformQuantizedType>();
+  auto res_element_quant_type =
+      getElementTypeOrSelf(op.getResult())
+          .template dyn_cast<quant::UniformQuantizedType>();
+  Value lhs = adaptor.getLhs();
+  Value rhs = adaptor.getRhs();
+  auto res_int32_tensor_type =
+      op.getResult().getType().clone(rewriter.getI32Type());
+
+  // Dot result
+  //  = dot((lhs - zp_l) * scale_l, (rhs - zp_r) * scale_r) / scale_res
+  //      + zp_res
+  //  = dot(lhs - zp_l, rhs - zp_r) * scale_l * scale_r / scale_res + zp_res
+  //  = dot(lhs, rhs) * combined_scale + combined_zp
+  // where (sum() and the element count K are over the contracting dims):
+  //   zp_offset = zp_l*sum(rhs) + zp_r*sum(lhs) - zp_l*zp_r*K
+  //   combined_scale = scale_l * scale_r / scale_res
+  //   combined_zp = zp_res - zp_offset * combined_scale
+  SmallVector<Value, 2> operands{lhs, rhs};
+  Value res_i32 = rewriter.create<mhlo::DotGeneralOp>(
+      op->getLoc(), res_int32_tensor_type, operands, attrs);
+
+  Value zp_offset = CalculateZeroPointOffset(
+      rewriter, op->getLoc(), lhs, rhs, lhs_element_quant_type.getZeroPoint(),
+      rhs_element_quant_type.getZeroPoint(), dims);
+
+  // Multiply dot result and zp_offset by combined_scale only if it is not 1.0.
+  float combined_scale_fp = lhs_element_quant_type.getScale() *
+                            rhs_element_quant_type.getScale() /
+                            res_element_quant_type.getScale();
+  if (combined_scale_fp != 1.0f) {
+    Value combined_scale = rewriter.create<mhlo::ConstantOp>(
+        op->getLoc(), rewriter.getF32FloatAttr(combined_scale_fp));
+
+    auto res_float32_tensor_type =
+        op.getResult().getType().clone(rewriter.getF32Type());
+    Value res_f32 = rewriter.create<mhlo::ConvertOp>(
+        op->getLoc(), res_float32_tensor_type, res_i32);
+    res_f32 = rewriter.create<chlo::BroadcastMulOp>(
+        op->getLoc(), res_float32_tensor_type, res_f32, combined_scale,
+        nullptr);
+    res_i32 = rewriter.create<mhlo::ConvertOp>(op->getLoc(),
+                                               res_int32_tensor_type, res_f32);
+
+    auto zp_offset_float32_tensor_type =
+        zp_offset.getType().dyn_cast<TensorType>().clone(rewriter.getF32Type());
+    zp_offset = rewriter.create<mhlo::ConvertOp>(
+        op->getLoc(), zp_offset_float32_tensor_type, zp_offset);
+    zp_offset = rewriter.create<chlo::BroadcastMulOp>(
+        op->getLoc(), zp_offset_float32_tensor_type, zp_offset, combined_scale,
+        nullptr);
+    zp_offset = rewriter.create<mhlo::ConvertOp>(
+        op->getLoc(),
+        zp_offset_float32_tensor_type.clone(rewriter.getI32Type()), zp_offset);
+  }
+
+  Value res_zp = rewriter.create<mhlo::ConstantOp>(
+      op->getLoc(),
+      rewriter.getI32IntegerAttr(res_element_quant_type.getZeroPoint()));
+  Value combined_zp = rewriter.create<chlo::BroadcastSubOp>(
+      op->getLoc(), res_int32_tensor_type, res_zp, zp_offset, nullptr);
+
+  rewriter.replaceOpWithNewOp<mhlo::AddOp>(op, res_int32_tensor_type, res_i32,
+                                           combined_zp);
+  return success();
+}
+
 class ConvertUniformQuantizedDotOp : public OpConversionPattern<mhlo::DotOp> {
  public:
   using OpConversionPattern::OpConversionPattern;
@@ -598,7 +852,32 @@ class ConvertUniformQuantizedDotOp : public OpConversionPattern<mhlo::DotOp> {
   LogicalResult matchAndRewrite(
       mhlo::DotOp op, mhlo::DotOpAdaptor adaptor,
       ConversionPatternRewriter &rewriter) const override {
-    return matchAndRewriteDotLikeOp(op, adaptor, rewriter);
+    // Use matchAndRewriteDotLikeOp for DotHybrid and dynamic shapes.
+    if (!op.getLhs()
+             .getType()
+             .getElementType()
+             .isa<quant::UniformQuantizedType>() ||
+        !op.getRhs()
+             .getType()
+             .getElementType()
+             .isa<quant::UniformQuantizedType>() ||
+        !op.getLhs().getType().cast<ShapedType>().hasStaticShape() ||
+        !op.getRhs().getType().cast<ShapedType>().hasStaticShape() ||
+        !op.getResult().getType().cast<ShapedType>().hasStaticShape()) {
+      return matchAndRewriteDotLikeOp(op, adaptor, rewriter);
+    }
+
+    // DotOp is a special case of DotGeneralOp, where LHS and RHS are both
+    // rank-2 tensors and have contracting dims of 1 and 0 respectively.
+    auto dims = mhlo::DotDimensionNumbersAttr::get(
+        rewriter.getContext(), /*lhsBatchingDimensions=*/{},
+        /*rhsBatchingDimensions=*/{}, /*lhsContractingDimensions=*/{1},
+        /*rhsContractingDimensions=*/{0});
+    llvm::SmallVector<mlir::NamedAttribute> attrs(op->getAttrs());
+    attrs.push_back(
+        {StringAttr::get(rewriter.getContext(), "dot_dimension_numbers"),
+         dims});
+    return RewriteDotGeneralOp(op, adaptor, attrs, dims, rewriter);
   }
 };
 
@@ -610,7 +889,23 @@ class ConvertUniformQuantizedDotGeneralOp
   LogicalResult matchAndRewrite(
       mhlo::DotGeneralOp op, mhlo::DotGeneralOpAdaptor adaptor,
       ConversionPatternRewriter &rewriter) const override {
-    return matchAndRewriteDotLikeOp(op, adaptor, rewriter);
+    // Use matchAndRewriteDotLikeOp for DotHybridGeneral case and dynamic
+    // shapes.
+    if (!op.getLhs()
+             .getType()
+             .getElementType()
+             .isa<quant::UniformQuantizedType>() ||
+        !op.getRhs()
+             .getType()
+             .getElementType()
+             .isa<quant::UniformQuantizedType>() ||
+        !op.getLhs().getType().cast<ShapedType>().hasStaticShape() ||
+        !op.getRhs().getType().cast<ShapedType>().hasStaticShape() ||
+        !op.getResult().getType().cast<ShapedType>().hasStaticShape()) {
+      return matchAndRewriteDotLikeOp(op, adaptor, rewriter);
+    }
+    return RewriteDotGeneralOp(op, adaptor, op->getAttrs(),
+                               op.getDotDimensionNumbers(), rewriter);
   }
 };
 
diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/tests/bridge/convert-mhlo-quant-to-int.mlir b/tensorflow/compiler/mlir/quantization/stablehlo/tests/bridge/convert-mhlo-quant-to-int.mlir
index 863c020e96b09e..9e00c80fb5aab5 100644
--- a/tensorflow/compiler/mlir/quantization/stablehlo/tests/bridge/convert-mhlo-quant-to-int.mlir
+++ b/tensorflow/compiler/mlir/quantization/stablehlo/tests/bridge/convert-mhlo-quant-to-int.mlir
@@ -303,8 +303,8 @@ func.func @uniform_quantize_requantize_merged_zp_zero_and_dequantize(%arg0: tens
 
 // -----
 
-// CHECK-LABEL: func @uniform_quantize_dot_dequantize
-func.func @uniform_quantize_dot_dequantize(%arg0: tensor<?x?xf32>, %arg1: tensor<?x?xf32>) -> tensor<?x?xf32> {
+// CHECK-LABEL: func @uniform_quantize_dot_dequantize_dynamic
+func.func @uniform_quantize_dot_dequantize_dynamic(%arg0: tensor<?x?xf32>, %arg1: tensor<?x?xf32>) -> tensor<?x?xf32> {
   %0 = mhlo.uniform_quantize %arg0 : (tensor<?x?xf32>) -> tensor<?x?x!quant.uniform<i8:f32, 2.000000e+00:3>>
   %1 = mhlo.uniform_quantize %arg1 : (tensor<?x?xf32>) -> tensor<?x?x!quant.uniform<i8:f32, 1.000000e+00:3>>
 
@@ -327,8 +327,8 @@ func.func @uniform_quantize_dot_dequantize(%arg0: tensor<?x?xf32>, %arg1: tensor
 
 // -----
 
-// CHECK-LABEL: func @uniform_quantize_dot_int4
-func.func @uniform_quantize_dot_int4(%arg0: tensor<?x?xf32>, %arg1: tensor<?x?xf32>) {
+// CHECK-LABEL: func @uniform_quantize_dot_dynamic_int4
+func.func @uniform_quantize_dot_dynamic_int4(%arg0: tensor<?x?xf32>, %arg1: tensor<?x?xf32>) {
   %0 = mhlo.uniform_quantize %arg0 : (tensor<?x?xf32>) -> tensor<?x?x!quant.uniform<i4:f32, 1.000000e+00:3>>
   %1 = mhlo.uniform_quantize %arg1 : (tensor<?x?xf32>) -> tensor<?x?x!quant.uniform<i4:f32, 1.000000e+00:3>>
 
@@ -344,51 +344,363 @@ func.func @uniform_quantize_dot_int4(%arg0: tensor<?x?xf32>, %arg1: tensor<?x?xf
 
 // -----
 
+// CHECK-LABEL: func @uniform_quantize_dot_dequantize
+func.func @uniform_quantize_dot_dequantize(%arg0: tensor<2x2xf32>, %arg1: tensor<2x2xf32>) -> tensor<2x2xf32> {
+  %0 = mhlo.uniform_quantize %arg0 : (tensor<2x2xf32>) -> tensor<2x2x!quant.uniform<i8:f32, 2.000000e+00:3>>
+  %1 = mhlo.uniform_quantize %arg1 : (tensor<2x2xf32>) -> tensor<2x2x!quant.uniform<i8:f32, 1.000000e+00:3>>
+
+  // CHECK: "mhlo.dot_general"
+  // CHECK-SAME: lhs_contracting_dimensions = [1]
+  // CHECK-SAME: rhs_contracting_dimensions = [0]
+  // CHECK-SAME: (tensor<2x2xi8>, tensor<2x2xi8>) -> tensor<2x2xi32>
+  %2 = "mhlo.dot" (%0, %1) : (tensor<2x2x!quant.uniform<i8:f32, 2.000000e+00:3>>, tensor<2x2x!quant.uniform<i8:f32, 1.000000e+00:3>>) -> tensor<2x2x!quant.uniform<i8:f32, 1.000000e+00:3>>
+  %3 = mhlo.uniform_dequantize %2 : (tensor<2x2x!quant.uniform<i8:f32, 1.000000e+00:3>>) -> tensor<2x2xf32>
+  return %3 : tensor<2x2xf32>
+}
+
+// -----
+
+// CHECK-LABEL: func @uniform_quantize_dot_int4
+func.func @uniform_quantize_dot_int4(%arg0: tensor<2x2xf32>, %arg1: tensor<2x2xf32>) {
+  %0 = mhlo.uniform_quantize %arg0 : (tensor<2x2xf32>) -> tensor<2x2x!quant.uniform<i4:f32, 1.000000e+00:3>>
+  %1 = mhlo.uniform_quantize %arg1 : (tensor<2x2xf32>) -> tensor<2x2x!quant.uniform<i4:f32, 1.000000e+00:3>>
+
+  // CHECK: "mhlo.dot_general"
+  // CHECK-SAME: lhs_contracting_dimensions = [1]
+  // CHECK-SAME: rhs_contracting_dimensions = [0]
+  // CHECK-SAME: (tensor<2x2xi4>, tensor<2x2xi4>) -> tensor<2x2xi32>
+  %2 = "mhlo.dot" (%0, %1): (tensor<2x2x!quant.uniform<i4:f32, 1.000000e+00:3>>, tensor<2x2x!quant.uniform<i4:f32, 1.000000e+00:3>>) -> tensor<2x2x!quant.uniform<i4:f32, 1.000000e+00:3>>
+  return
+}
+
+// -----
+
 // CHECK-LABEL: func @uniform_quantize_dot_general_dequantize
-func.func @uniform_quantize_dot_general_dequantize(%arg0: tensor<?x?xf32>, %arg1: tensor<?x?xf32>) -> tensor<?x?xf32> {
-  %0 = mhlo.uniform_quantize %arg0 : (tensor<?x?xf32>) -> tensor<?x?x!quant.uniform<i8:f32, 2.000000e+00:3>>
-  %1 = mhlo.uniform_quantize %arg1 : (tensor<?x?xf32>) -> tensor<?x?x!quant.uniform<i8:f32, 1.000000e+00:3>>
+func.func @uniform_quantize_dot_general_dequantize(
+    %arg0: tensor<2x5x6xf32>, %arg1: tensor<6x8x2xf32>) -> tensor<2x5x8xf32> {
+  %0 = mhlo.uniform_quantize %arg0 : (tensor<2x5x6xf32>)
+      -> tensor<2x5x6x!quant.uniform<i8:f32, 2.000000e+00:3>>
+  %1 = mhlo.uniform_quantize %arg1 : (tensor<6x8x2xf32>)
+      -> tensor<6x8x2x!quant.uniform<i8:f32, 1.000000e+00:5>>
+
+  // CHECK: %[[DOT_RES:.*]] = "mhlo.dot_general"
+  // CHECK-SAME: lhs_batching_dimensions = [0]
+  // CHECK-SAME: rhs_batching_dimensions = [2]
+  // CHECK-SAME: lhs_contracting_dimensions = [2]
+  // CHECK-SAME: rhs_contracting_dimensions = [0]
+
+  // CHECK: %[[ZP_TOTAL_1:.*]] = mhlo.constant dense<0> : tensor<2x5x8xi32>
+
+  // Zero point offset contribution from LHS tensor * RHS ZP.
+
+  // CHECK: %[[LHS_I32:.*]] = mhlo.convert %[[LHS:.*]] : (tensor<2x5x6xi8>)
+  // CHECK-SAME: -> tensor<2x5x6xi32>
+  // CHECK: %[[LHS_REDUCE_INIT:.*]] = mhlo.constant dense<0> : tensor<i32>
+  // CHECK: %[[LHS_REDUCE:.*]] = mhlo.reduce(%[[LHS_I32]] init: %[[LHS_REDUCE_INIT]])
+  // CHECK-SAME: applies mhlo.add across dimensions = [2]
+  // CHECK-SAME: (tensor<2x5x6xi32>, tensor<i32>)
+  // CHECK-SAME: -> tensor<2x5xi32>
+  // CHECK: %[[RHS_ZP:.*]] = mhlo.constant dense<5> : tensor<i32>
+  // CHECK: %[[LHS_ZP_CONTRIB:.*]] = chlo.broadcast_multiply
+  // CHECK-SAME: %[[LHS_REDUCE]], %[[RHS_ZP]] :
+  // CHECK-SAME: (tensor<2x5xi32>, tensor<i32>) -> tensor<2x5xi32>
+  // CHECK: %[[LHS_ZP_BCAST:.*]] = "mhlo.broadcast_in_dim"(%[[LHS_ZP_CONTRIB]])
+  // CHECK-SAME: broadcast_dimensions = dense<[0, 1]>
+  // CHECK-SAME: (tensor<2x5xi32>) -> tensor<2x5x8xi32>
+  // CHECK: %[[ZP_TOTAL_2:.*]] = mhlo.add %[[ZP_TOTAL_1]], %[[LHS_ZP_BCAST]]
+
+  // Zero point offset contribution from RHS tensor * LHS ZP.
+
+  // CHECK: %[[RHS_I32:.*]] = mhlo.convert %[[RHS:.*]] : (tensor<6x8x2xi8>)
+  // CHECK-SAME: -> tensor<6x8x2xi32>
+  // CHECK: %[[RHS_REDUCE_INIT:.*]] = mhlo.constant dense<0> : tensor<i32>
+  // CHECK: %[[RHS_REDUCE:.*]] = mhlo.reduce(%[[RHS_I32]] init: %[[RHS_REDUCE_INIT]])
+  // CHECK-SAME: applies mhlo.add across dimensions = [0]
+  // CHECK-SAME: (tensor<6x8x2xi32>, tensor<i32>)
+  // CHECK-SAME: -> tensor<8x2xi32>
+  // CHECK: %[[RHS_ZP:.*]] = mhlo.constant dense<3> : tensor<i32>
+  // CHECK: %[[RHS_ZP_CONTRIB:.*]] = chlo.broadcast_multiply
+  // CHECK-SAME: %[[RHS_REDUCE]], %[[RHS_ZP]] :
+  // CHECK-SAME: (tensor<8x2xi32>, tensor<i32>) -> tensor<8x2xi32>
+  // CHECK: %[[RHS_ZP_BCAST:.*]] = "mhlo.broadcast_in_dim"(%[[RHS_ZP_CONTRIB]])
+  // CHECK-SAME: broadcast_dimensions = dense<[2, 0]>
+  // CHECK-SAME: (tensor<8x2xi32>) -> tensor<2x5x8xi32>
+  // CHECK: %[[ZP_TOTAL_3:.*]] = mhlo.add %[[ZP_TOTAL_2]], %[[RHS_ZP_BCAST]]
+
+  // Zero point offset contribution from LHS ZP * RHS ZP.
+
+  // CHECK: %[[ZPS:.*]] = mhlo.constant dense<90> : tensor<i32>
+  // CHECK: %[[ZP_TOTAL_4:.*]] = chlo.broadcast_subtract %[[ZP_TOTAL_3]], %[[ZPS]]
+  // CHECK-SAME: (tensor<2x5x8xi32>, tensor<i32>) -> tensor<2x5x8xi32>
+
+  // Combine dot result with zero point offset and output final result.
+
+  // CHECK: %[[COMBINED_SCALE:.*]] = mhlo.constant dense<5.000000e-01> : tensor<f32>
+  // CHECK: %[[RES_FP:.*]] = mhlo.convert %[[DOT_RES]]
+  // CHECK-SAME: (tensor<2x5x8xi32>) -> tensor<2x5x8xf32>
+  // CHECK: %[[RES_FP_1:.*]] = chlo.broadcast_multiply
+  // CHECK-SAME: %[[RES_FP:.*]], %[[COMBINED_SCALE]]
+  // CHECK: %[[RES_INT:.*]] = mhlo.convert %[[RES_FP_1]]
+  // CHECK-SAME: (tensor<2x5x8xf32>) -> tensor<2x5x8xi32>
+
+  // CHECK: %[[ZP_TOTAL_5:.*]] = mhlo.convert %[[ZP_TOTAL_4]]
+  // CHECK-SAME: (tensor<2x5x8xi32>) -> tensor<2x5x8xf32>
+  // CHECK: %[[ZP_TOTAL_6:.*]] = chlo.broadcast_multiply
+  // CHECK-SAME: %[[ZP_TOTAL_5:.*]], %[[COMBINED_SCALE]]
+  // CHECK: %[[ZP_TOTAL_7:.*]] = mhlo.convert %[[ZP_TOTAL_6]]
+  // CHECK-SAME: (tensor<2x5x8xf32>) -> tensor<2x5x8xi32>
+
+  // CHECK: %[[RES_ZP:.*]] = mhlo.constant dense<7> : tensor<i32>
+  // CHECK: %[[ZP_TOTAL_8:.*]] = chlo.broadcast_subtract %[[RES_ZP]], %[[ZP_TOTAL_7]]
+  // CHECK-SAME: (tensor<i32>, tensor<2x5x8xi32>) -> tensor<2x5x8xi32>
+  // CHECK: mhlo.add %[[RES_INT]], %[[ZP_TOTAL_8]]
 
-  // CHECK: %[[VAL1:.*]] = mhlo.convert %[[VAL0:.*]] : (tensor<?x?xi8>) -> tensor<?x?xf32>
-  // CHECK: %[[VAL3:.*]] = chlo.broadcast_subtract %[[VAL1]], %[[VAL2:.*]] : (tensor<?x?xf32>, tensor<f32>) -> tensor<?x?xf32>
-  // CHECK: %[[VAL5:.*]] = mhlo.convert %[[VAL4:.*]] : (tensor<?x?xi8>) -> tensor<?x?xf32>
-  // CHECK: %[[VAL7:.*]] = chlo.broadcast_subtract %[[VAL5]], %[[VAL6:.*]] : (tensor<?x?xf32>, tensor<f32>) -> tensor<?x?xf32>
-  // CHECK: %[[VAL8:.*]] = "mhlo.dot_general"(%[[VAL3]], %[[VAL7]]) {dot_dimension_numbers = #mhlo.dot<lhs_contracting_dimensions = [1], rhs_contracting_dimensions = [0]>} : (tensor<?x?xf32>, tensor<?x?xf32>) -> tensor<?x?xf32>
-  // CHECK: %[[VAL10:.*]] = chlo.broadcast_multiply %[[VAL8]], %[[VAL9:.*]] : (tensor<?x?xf32>, tensor<f32>) -> tensor<?x?xf32>
-  // CHECK: %[[VAL12:.*]] = chlo.broadcast_add %[[VAL10]], %[[VAL11:.*]] : (tensor<?x?xf32>, tensor<f32>) -> tensor<?x?xf32>
-  // CHECK: %[[VAL13:.*]] = mhlo.floor %[[VAL12]] : tensor<?x?xf32>
-  // CHECK: %[[VAL15:.*]] = chlo.broadcast_add %[[VAL13]], %[[VAL14:.*]] : (tensor<?x?xf32>, tensor<f32>) -> tensor<?x?xf32>
-  // CHECK: %[[VAL16:.*]] = mhlo.convert %[[VAL15]] : (tensor<?x?xf32>) -> tensor<?x?xi32>
-  // CHECK: %[[VAL19:.*]] = mhlo.clamp %[[VAL17:.*]], %[[VAL16]], %[[VAL18:.*]] : (tensor<i32>, tensor<?x?xi32>, tensor<i32>) -> tensor<?x?xi32>
-  // CHECK: %[[VAL20:.*]] = mhlo.convert %[[VAL19]] : (tensor<?x?xi32>) -> tensor<?x?xi8>
   %2 = "mhlo.dot_general" (%0, %1) {
     dot_dimension_numbers = #mhlo.dot<
-      lhs_contracting_dimensions = [1],
+      lhs_batching_dimensions = [0],
+      rhs_batching_dimensions = [2],
+      lhs_contracting_dimensions = [2],
       rhs_contracting_dimensions = [0]
-    >} : (tensor<?x?x!quant.uniform<i8:f32, 2.000000e+00:3>>, tensor<?x?x!quant.uniform<i8:f32, 1.000000e+00:3>>) -> tensor<?x?x!quant.uniform<i8:f32, 1.000000e+00:3>>
-  %3 = mhlo.uniform_dequantize %2 : (tensor<?x?x!quant.uniform<i8:f32, 1.000000e+00:3>>) -> tensor<?x?xf32>
-  return %3 : tensor<?x?xf32>
+    >} : (
+      tensor<2x5x6x!quant.uniform<i8:f32, 2.000000e+00:3>>,
+      tensor<6x8x2x!quant.uniform<i8:f32, 1.000000e+00:5>>
+    ) -> tensor<2x5x8x!quant.uniform<i8:f32, 4.000000e+00:7>>
+  %3 = mhlo.uniform_dequantize %2 : (
+    tensor<2x5x8x!quant.uniform<i8:f32, 4.000000e+00:7>>
+  ) -> tensor<2x5x8xf32>
+  return %3 : tensor<2x5x8xf32>
 }
 
 // -----
 
-// CHECK-LABEL: func @uniform_quantize_dot_general_int4
-func.func @uniform_quantize_dot_general_int4(%arg0: tensor<?x?xf32>, %arg1: tensor<?x?xf32>) {
-  %0 = mhlo.uniform_quantize %arg0 : (tensor<?x?xf32>) -> tensor<?x?x!quant.uniform<i4:f32, 1.000000e+00:3>>
-  %1 = mhlo.uniform_quantize %arg1 : (tensor<?x?xf32>) -> tensor<?x?x!quant.uniform<i4:f32, 1.000000e+00:3>>
+// CHECK-LABEL: func @uniform_quantize_dot_general_dequantize_multiple_batching_dims
+func.func @uniform_quantize_dot_general_dequantize_multiple_batching_dims(
+    %arg0: tensor<2x5x3x7x6xf32>, %arg1: tensor<6x2x7x8x3xf32>) -> tensor<2x3x5x8xf32> {
+  %0 = mhlo.uniform_quantize %arg0 : (tensor<2x5x3x7x6xf32>)
+      -> tensor<2x5x3x7x6x!quant.uniform<i8:f32, 2.000000e+00:3>>
+  %1 = mhlo.uniform_quantize %arg1 : (tensor<6x2x7x8x3xf32>)
+      -> tensor<6x2x7x8x3x!quant.uniform<i8:f32, 1.000000e+00:5>>
+
+  // CHECK: %[[DOT_RES:.*]] = "mhlo.dot_general"
+  // CHECK-SAME: lhs_batching_dimensions = [0, 2]
+  // CHECK-SAME: rhs_batching_dimensions = [1, 4]
+  // CHECK-SAME: lhs_contracting_dimensions = [4, 3]
+  // CHECK-SAME: rhs_contracting_dimensions = [0, 2]>}
+
+  // CHECK: %[[ZP_TOTAL_1:.*]] = mhlo.constant dense<0> : tensor<2x3x5x8xi32>
+
+  // Zero point offset contribution from LHS tensor * RHS ZP.
+
+  // CHECK: %[[LHS_I32:.*]] = mhlo.convert %[[LHS:.*]] : (tensor<2x5x3x7x6xi8>)
+  // CHECK-SAME: -> tensor<2x5x3x7x6xi32>
+  // CHECK: %[[LHS_REDUCE_INIT:.*]] = mhlo.constant dense<0> : tensor<i32>
+  // CHECK: %[[LHS_REDUCE:.*]] = mhlo.reduce(%[[LHS_I32]] init: %[[LHS_REDUCE_INIT]])
+  // CHECK-SAME: applies mhlo.add across dimensions = [4, 3]
+  // CHECK-SAME: (tensor<2x5x3x7x6xi32>, tensor<i32>)
+  // CHECK-SAME: -> tensor<2x5x3xi32>
+  // CHECK: %[[RHS_ZP:.*]] = mhlo.constant dense<5> : tensor<i32>
+  // CHECK: %[[LHS_ZP_CONTRIB:.*]] = chlo.broadcast_multiply
+  // CHECK-SAME: %[[LHS_REDUCE]], %[[RHS_ZP]] :
+  // CHECK-SAME: (tensor<2x5x3xi32>, tensor<i32>) -> tensor<2x5x3xi32>
+  // CHECK: %[[LHS_ZP_BCAST:.*]] = "mhlo.broadcast_in_dim"(%[[LHS_ZP_CONTRIB]])
+  // CHECK-SAME: broadcast_dimensions = dense<[0, 2, 1]>
+  // CHECK-SAME: (tensor<2x5x3xi32>) -> tensor<2x3x5x8xi32>
+  // CHECK: %[[ZP_TOTAL_2:.*]] = mhlo.add %[[ZP_TOTAL_1]], %[[LHS_ZP_BCAST]]
+
+  // Zero point offset contribution from RHS tensor * LHS ZP.
+
+  // CHECK: %[[RHS_I32:.*]] = mhlo.convert %[[RHS:.*]] : (tensor<6x2x7x8x3xi8>)
+  // CHECK-SAME: -> tensor<6x2x7x8x3xi32>
+  // CHECK: %[[RHS_REDUCE_INIT:.*]] = mhlo.constant dense<0> : tensor<i32>
+  // CHECK: %[[RHS_REDUCE:.*]] = mhlo.reduce(%[[RHS_I32]] init: %[[RHS_REDUCE_INIT]])
+  // CHECK-SAME: applies mhlo.add across dimensions = [0, 2]
+  // CHECK-SAME: (tensor<6x2x7x8x3xi32>, tensor<i32>)
+  // CHECK-SAME: -> tensor<2x8x3xi32>
+  // CHECK: %[[RHS_ZP:.*]] = mhlo.constant dense<3> : tensor<i32>
+  // CHECK: %[[RHS_ZP_CONTRIB:.*]] = chlo.broadcast_multiply
+  // CHECK-SAME: %[[RHS_REDUCE]], %[[RHS_ZP]] :
+  // CHECK-SAME: (tensor<2x8x3xi32>, tensor<i32>) -> tensor<2x8x3xi32>
+  // CHECK: %[[RHS_ZP_BCAST:.*]] = "mhlo.broadcast_in_dim"(%[[RHS_ZP_CONTRIB]])
+  // CHECK-SAME: broadcast_dimensions = dense<[0, 3, 1]>
+  // CHECK-SAME: (tensor<2x8x3xi32>) -> tensor<2x3x5x8xi32>
+  // CHECK: %[[ZP_TOTAL_3:.*]] = mhlo.add %[[ZP_TOTAL_2]], %[[RHS_ZP_BCAST]]
+
+  // Zero point offset contribution from LHS ZP * RHS ZP.
+
+  // CHECK: %[[ZPS:.*]] = mhlo.constant dense<630> : tensor<i32>
+  // CHECK: %[[ZP_TOTAL_4:.*]] = chlo.broadcast_subtract %[[ZP_TOTAL_3]], %[[ZPS]]
+  // CHECK-SAME: (tensor<2x3x5x8xi32>, tensor<i32>) -> tensor<2x3x5x8xi32>
+
+  // Combine dot result with zero point offset and output final result.
+
+  // CHECK: %[[COMBINED_SCALE:.*]] = mhlo.constant dense<5.000000e-01> : tensor<f32>
+  // CHECK: %[[RES_FP:.*]] = mhlo.convert %[[DOT_RES]]
+  // CHECK-SAME: (tensor<2x3x5x8xi32>) -> tensor<2x3x5x8xf32>
+  // CHECK: %[[RES_FP_1:.*]] = chlo.broadcast_multiply
+  // CHECK-SAME: %[[RES_FP:.*]], %[[COMBINED_SCALE]]
+  // CHECK: %[[RES_INT:.*]] = mhlo.convert %[[RES_FP_1]]
+  // CHECK-SAME: (tensor<2x3x5x8xf32>) -> tensor<2x3x5x8xi32>
+
+  // CHECK: %[[ZP_TOTAL_5:.*]] = mhlo.convert %[[ZP_TOTAL_4]]
+  // CHECK-SAME: (tensor<2x3x5x8xi32>) -> tensor<2x3x5x8xf32>
+  // CHECK: %[[ZP_TOTAL_6:.*]] = chlo.broadcast_multiply
+  // CHECK-SAME: %[[ZP_TOTAL_5:.*]], %[[COMBINED_SCALE]]
+  // CHECK: %[[ZP_TOTAL_7:.*]] = mhlo.convert %[[ZP_TOTAL_6]]
+  // CHECK-SAME: (tensor<2x3x5x8xf32>) -> tensor<2x3x5x8xi32>
+
+  // CHECK: %[[RES_ZP:.*]] = mhlo.constant dense<7> : tensor<i32>
+  // CHECK: %[[ZP_TOTAL_8:.*]] = chlo.broadcast_subtract %[[RES_ZP]], %[[ZP_TOTAL_7]]
+  // CHECK-SAME: (tensor<i32>, tensor<2x3x5x8xi32>) -> tensor<2x3x5x8xi32>
+  // CHECK: mhlo.add %[[RES_INT]], %[[ZP_TOTAL_8]]
+  
+  %2 = "mhlo.dot_general" (%0, %1) {
+    dot_dimension_numbers = #mhlo.dot<
+      lhs_batching_dimensions = [0, 2],
+      rhs_batching_dimensions = [1, 4],
+      lhs_contracting_dimensions = [4, 3],
+      rhs_contracting_dimensions = [0, 2]
+    >} : (
+      tensor<2x5x3x7x6x!quant.uniform<i8:f32, 2.000000e+00:3>>,
+      tensor<6x2x7x8x3x!quant.uniform<i8:f32, 1.000000e+00:5>>
+    ) -> tensor<2x3x5x8x!quant.uniform<i8:f32, 4.000000e+00:7>>
+  %3 = mhlo.uniform_dequantize %2 : (
+    tensor<2x3x5x8x!quant.uniform<i8:f32, 4.000000e+00:7>>
+  ) -> tensor<2x3x5x8xf32>
+  return %3 : tensor<2x3x5x8xf32>
+}
+
+// -----
+
+// CHECK-LABEL: func @uniform_quantize_dot_general_dequantize_rhs_zero_zp
+func.func @uniform_quantize_dot_general_dequantize_rhs_zero_zp(
+    %arg0: tensor<2x5x6xf32>, %arg1: tensor<6x8x2xf32>) -> tensor<2x5x8xf32> {
+  %0 = mhlo.uniform_quantize %arg0 : (tensor<2x5x6xf32>)
+      -> tensor<2x5x6x!quant.uniform<i8:f32, 2.000000e+00:3>>
+  %1 = mhlo.uniform_quantize %arg1 : (tensor<6x8x2xf32>)
+      -> tensor<6x8x2x!quant.uniform<i8:f32, 1.000000e+00:0>>
+
+  // CHECK: %[[DOT_RES:.*]] = "mhlo.dot_general"
+  // CHECK-SAME: lhs_batching_dimensions = [0]
+  // CHECK-SAME: rhs_batching_dimensions = [2]
+  // CHECK-SAME: lhs_contracting_dimensions = [2]
+  // CHECK-SAME: rhs_contracting_dimensions = [0]
+
+  // CHECK: %[[ZP_TOTAL_1:.*]] = mhlo.constant dense<0> : tensor<2x5x8xi32>
+
+  // Zero point offset contribution from LHS tensor * RHS ZP is 0 and skipped.
+
+  // Zero point offset contribution from RHS tensor * LHS ZP.
+
+  // CHECK: %[[RHS_I32:.*]] = mhlo.convert %[[RHS:.*]] : (tensor<6x8x2xi8>)
+  // CHECK-SAME: -> tensor<6x8x2xi32>
+  // CHECK: %[[RHS_REDUCE_INIT:.*]] = mhlo.constant dense<0> : tensor<i32>
+  // CHECK: %[[RHS_REDUCE:.*]] = mhlo.reduce(%[[RHS_I32]] init: %[[RHS_REDUCE_INIT]])
+  // CHECK-SAME: applies mhlo.add across dimensions = [0]
+  // CHECK-SAME: (tensor<6x8x2xi32>, tensor<i32>)
+  // CHECK-SAME: -> tensor<8x2xi32>
+  // CHECK: %[[RHS_ZP:.*]] = mhlo.constant dense<3> : tensor<i32>
+  // CHECK: %[[RHS_ZP_CONTRIB:.*]] = chlo.broadcast_multiply
+  // CHECK-SAME: %[[RHS_REDUCE]], %[[RHS_ZP]] :
+  // CHECK-SAME: (tensor<8x2xi32>, tensor<i32>) -> tensor<8x2xi32>
+  // CHECK: %[[RHS_ZP_BCAST:.*]] = "mhlo.broadcast_in_dim"(%[[RHS_ZP_CONTRIB]])
+  // CHECK-SAME: broadcast_dimensions = dense<[2, 0]>
+  // CHECK-SAME: (tensor<8x2xi32>) -> tensor<2x5x8xi32>
+  // CHECK: %[[ZP_TOTAL_2:.*]] = mhlo.add %[[ZP_TOTAL_1]], %[[RHS_ZP_BCAST]]
+
+  // Zero point offset contribution from LHS ZP * RHS ZP is 0 and skipped.
+
+  // Combine dot result with zero point offset and output final result.
+
+  // CHECK: %[[COMBINED_SCALE:.*]] = mhlo.constant dense<5.000000e-01> : tensor<f32>
+  // CHECK: %[[RES_FP:.*]] = mhlo.convert %[[DOT_RES]]
+  // CHECK-SAME: (tensor<2x5x8xi32>) -> tensor<2x5x8xf32>
+  // CHECK: %[[RES_FP_1:.*]] = chlo.broadcast_multiply
+  // CHECK-SAME: %[[RES_FP]], %[[COMBINED_SCALE]]
+  // CHECK: %[[RES_INT:.*]] = mhlo.convert %[[RES_FP_1]]
+  // CHECK-SAME: (tensor<2x5x8xf32>) -> tensor<2x5x8xi32>
+
+  // CHECK: %[[ZP_TOTAL_3:.*]] = mhlo.convert %[[ZP_TOTAL_2]]
+  // CHECK-SAME: (tensor<2x5x8xi32>) -> tensor<2x5x8xf32>
+  // CHECK: %[[ZP_TOTAL_4:.*]] = chlo.broadcast_multiply
+  // CHECK-SAME: %[[ZP_TOTAL_3]], %[[COMBINED_SCALE]]
+  // CHECK: %[[ZP_TOTAL_5:.*]] = mhlo.convert %[[ZP_TOTAL_4]]
+  // CHECK-SAME: (tensor<2x5x8xf32>) -> tensor<2x5x8xi32>
+
+  // CHECK: %[[RES_ZP:.*]] = mhlo.constant dense<7> : tensor<i32>
+  // CHECK: %[[ZP_TOTAL_6:.*]] = chlo.broadcast_subtract %[[RES_ZP]], %[[ZP_TOTAL_5]]
+  // CHECK-SAME: (tensor<i32>, tensor<2x5x8xi32>) -> tensor<2x5x8xi32>
+  // CHECK: mhlo.add %[[RES_INT]], %[[ZP_TOTAL_6]]
 
-  // CHECK: %[[VAL2:.*]] = "mhlo.dot_general"(%[[VAL0:.*]], %[[VAL1:.*]]) {dot_dimension_numbers = #mhlo.dot<lhs_contracting_dimensions = [1], rhs_contracting_dimensions = [0]>} : (tensor<?x?xf32>, tensor<?x?xf32>) -> tensor<?x?xf32>
-  // CHECK: %[[VAL4:.*]] = mhlo.convert %[[VAL3:.*]] : (tensor<?x?xf32>) -> tensor<?x?xi32>
-  // CHECK-DAG: %[[VAL5:.*]] = mhlo.constant dense<-8> : tensor<i32>
-  // CHECK-DAG: %[[VAL6:.*]] = mhlo.constant dense<7> : tensor<i32>
-  // CHECK: %[[VAL7:.*]] = mhlo.clamp %[[VAL5]], %[[VAL4]], %[[VAL6]] : (tensor<i32>, tensor<?x?xi32>, tensor<i32>) -> tensor<?x?xi32>
-  // CHECK: %[[VAL8:.*]] = mhlo.convert %[[VAL7]] : (tensor<?x?xi32>) -> tensor<?x?xi4>
   %2 = "mhlo.dot_general" (%0, %1) {
     dot_dimension_numbers = #mhlo.dot<
-      lhs_contracting_dimensions = [1],
+      lhs_batching_dimensions = [0],
+      rhs_batching_dimensions = [2],
+      lhs_contracting_dimensions = [2],
       rhs_contracting_dimensions = [0]
-    >} : (tensor<?x?x!quant.uniform<i4:f32, 1.000000e+00:3>>, tensor<?x?x!quant.uniform<i4:f32, 1.000000e+00:3>>) -> tensor<?x?x!quant.uniform<i4:f32, 1.000000e+00:3>>
-  return
+    >} : (
+      tensor<2x5x6x!quant.uniform<i8:f32, 2.000000e+00:3>>,
+      tensor<6x8x2x!quant.uniform<i8:f32, 1.000000e+00:0>>
+    ) -> tensor<2x5x8x!quant.uniform<i8:f32, 4.000000e+00:7>>
+  %3 = mhlo.uniform_dequantize %2 : (
+    tensor<2x5x8x!quant.uniform<i8:f32, 4.000000e+00:7>>
+  ) -> tensor<2x5x8xf32>
+  return %3 : tensor<2x5x8xf32>
+}
+
+// -----
+
+// CHECK-LABEL: func @uniform_quantize_dot_general_dequantize_zero_zp
+func.func @uniform_quantize_dot_general_dequantize_zero_zp(
+    %arg0: tensor<2x5x6xf32>, %arg1: tensor<6x8x2xf32>) -> tensor<2x5x8xf32> {
+  %0 = mhlo.uniform_quantize %arg0 : (tensor<2x5x6xf32>)
+      -> tensor<2x5x6x!quant.uniform<i8:f32, 2.000000e+00:0>>
+  %1 = mhlo.uniform_quantize %arg1 : (tensor<6x8x2xf32>)
+      -> tensor<6x8x2x!quant.uniform<i8:f32, 3.000000e+00:0>>
+
+  // CHECK: %[[DOT_RES:.*]] = "mhlo.dot_general"
+  // CHECK-SAME: lhs_batching_dimensions = [0]
+  // CHECK-SAME: rhs_batching_dimensions = [2]
+  // CHECK-SAME: lhs_contracting_dimensions = [2]
+  // CHECK-SAME: rhs_contracting_dimensions = [0]
+
+  // Both LHS/RHS have zero zp. No zp contribution.
+
+  // CHECK-DAG: %[[ZP_CONTRIB:.*]] = mhlo.constant dense<0> : tensor<2x5x8xi32>
+
+  // CHECK-DAG: %[[COMBINED_SCALE:.*]] = mhlo.constant dense<1.500000e+00> : tensor<f32>
+  // CHECK: %[[RES_FP:.*]] = mhlo.convert %[[DOT_RES]] :
+  // CHECK-SAME: (tensor<2x5x8xi32>) -> tensor<2x5x8xf32>
+  // CHECK: %[[RES_FP_1:.*]] = chlo.broadcast_multiply
+  // CHECK-SAME: %[[RES_FP]], %[[COMBINED_SCALE]]
+  // CHECK: %[[RES_INT:.*]] = mhlo.convert %[[RES_FP_1]]
+  // CHECK-SAME: (tensor<2x5x8xf32>) -> tensor<2x5x8xi32>
+
+  // CHECK: %[[ZP_TOTAL_1:.*]] = mhlo.convert %[[ZP_CONTRIB]]
+  // CHECK-SAME: (tensor<2x5x8xi32>) -> tensor<2x5x8xf32>
+  // CHECK: %[[ZP_TOTAL_2:.*]] = chlo.broadcast_multiply
+  // CHECK-SAME: %[[ZP_TOTAL_1]], %[[COMBINED_SCALE]]
+  // CHECK: %[[ZP_TOTAL_3:.*]] = mhlo.convert %[[ZP_TOTAL_2]]
+  // CHECK-SAME: (tensor<2x5x8xf32>) -> tensor<2x5x8xi32>
+
+  // CHECK: %[[RES_ZP:.*]] = mhlo.constant dense<7> : tensor<i32>
+  // CHECK: %[[ZP_TOTAL_4:.*]] = chlo.broadcast_subtract %[[RES_ZP]], %[[ZP_TOTAL_3]]
+  // CHECK-SAME: (tensor<i32>, tensor<2x5x8xi32>) -> tensor<2x5x8xi32>
+  // CHECK: mhlo.add %[[RES_INT]], %[[ZP_TOTAL_4]]
+
+  %2 = "mhlo.dot_general" (%0, %1) {
+    dot_dimension_numbers = #mhlo.dot<
+      lhs_batching_dimensions = [0],
+      rhs_batching_dimensions = [2],
+      lhs_contracting_dimensions = [2],
+      rhs_contracting_dimensions = [0]
+    >} : (
+      tensor<2x5x6x!quant.uniform<i8:f32, 2.000000e+00:0>>,
+      tensor<6x8x2x!quant.uniform<i8:f32, 3.000000e+00:0>>
+    ) -> tensor<2x5x8x!quant.uniform<i8:f32, 4.000000e+00:7>>
+  %3 = mhlo.uniform_dequantize %2 : (
+    tensor<2x5x8x!quant.uniform<i8:f32, 4.000000e+00:7>>
+  ) -> tensor<2x5x8xf32>
+  return %3 : tensor<2x5x8xf32>
 }
 
 // -----

From 5192adf96920853d699b9bb3ce405438934d3191 Mon Sep 17 00:00:00 2001
From: Ilia Sergachev <sergachev@google.com>
Date: Thu, 21 Sep 2023 12:22:59 -0700
Subject: [PATCH 099/567] [XLA:GPU] Trigger Triton GEMM fusions also on kCopy
 input operations.

PiperOrigin-RevId: 567384050
---
 .../xla/service/gpu/gemm_rewriter_triton.cc   | 30 +++++++------------
 .../xla/service/gpu/ir_emitter_triton_test.cc | 22 --------------
 2 files changed, 11 insertions(+), 41 deletions(-)

diff --git a/third_party/xla/xla/service/gpu/gemm_rewriter_triton.cc b/third_party/xla/xla/service/gpu/gemm_rewriter_triton.cc
index 8f237801bfa4cf..8f1e0399ddb13a 100644
--- a/third_party/xla/xla/service/gpu/gemm_rewriter_triton.cc
+++ b/third_party/xla/xla/service/gpu/gemm_rewriter_triton.cc
@@ -73,16 +73,6 @@ limitations under the License.
 namespace xla {
 namespace gpu {
 
-int GetFusionLevel(const HloInstruction& hlo, const GpuVersion gpu_version) {
-  int level =
-      hlo.GetModule()->config().debug_options().xla_gpu_triton_fusion_level();
-  if (!std::get<se::CudaComputeCapability>(gpu_version)
-           .IsAtLeast(se::CudaComputeCapability::AMPERE)) {
-    level = std::min(level, 1);
-  }
-  return level;
-}
-
 bool HasDivisibleSuffixAllowingSplit(const absl::Span<int64_t const> span,
                                      const int64_t divisor) {
   CHECK_GE(divisor, 1);
@@ -1092,6 +1082,12 @@ DimOrderUpdatesOrError FusionContext::AnalyzeForFusion(
     absl::flat_hash_map<const HloInstruction*, HloInstruction*>&
         old_to_new_mapping,
     const GpuVersion gpu_version) const {
+  int fusion_level =
+      hlo.GetModule()->config().debug_options().xla_gpu_triton_fusion_level();
+  if (!std::get<se::CudaComputeCapability>(gpu_version)
+           .IsAtLeast(se::CudaComputeCapability::AMPERE)) {
+    fusion_level = std::min(fusion_level, 1);
+  }
   if (hlo.opcode() == HloOpcode::kTuple ||
       hlo.opcode() == HloOpcode::kGetTupleElement) {
     return "Unsupported instruction.";
@@ -1112,7 +1108,7 @@ DimOrderUpdatesOrError FusionContext::AnalyzeForFusion(
     return "Unsupported output data type.";
   }
   if (as_input) {
-    if (GetFusionLevel(hlo, gpu_version) < 2) {
+    if (fusion_level < 2) {
       if (hlo.opcode() == HloOpcode::kConvert) {
         if (FusionDecision decision =
                 RequireTritonFusibleConvert(&hlo, gpu_version);
@@ -1128,7 +1124,7 @@ DimOrderUpdatesOrError FusionContext::AnalyzeForFusion(
       }
     }
   } else {
-    if (GetFusionLevel(hlo, gpu_version) < 2) {
+    if (fusion_level < 2) {
       return "Skipping fusing outputs at low fusion levels.";
     }
     for (const HloInstruction* operand : hlo.operands()) {
@@ -1378,14 +1374,10 @@ StatusOr<FusionDecision> FuseDot(HloInstruction& dot,
   if (dot.GetModule()->config().debug_options().xla_gpu_triton_gemm_any()) {
     return FusionDecision{};
   }
-
-  absl::flat_hash_set<HloOpcode> triggers{
-      HloOpcode::kConvert, HloOpcode::kSlice, HloOpcode::kTranspose};
-  if (GetFusionLevel(dot, gpu_version) >= 2) {
-    triggers.insert(HloOpcode::kCopy);
-  }
   for (const auto& iter : old_to_new_mapping) {
-    if (triggers.contains(iter.second->opcode())) {
+    if (iter.second->opcode() == HloOpcode::kConvert ||
+        iter.second->opcode() == HloOpcode::kSlice ||
+        iter.second->opcode() == HloOpcode::kTranspose) {
       return FusionDecision{};
     }
   }
diff --git a/third_party/xla/xla/service/gpu/ir_emitter_triton_test.cc b/third_party/xla/xla/service/gpu/ir_emitter_triton_test.cc
index af6776ed1c0231..65f6906475d8e2 100644
--- a/third_party/xla/xla/service/gpu/ir_emitter_triton_test.cc
+++ b/third_party/xla/xla/service/gpu/ir_emitter_triton_test.cc
@@ -1086,28 +1086,6 @@ ENTRY e {
   EXPECT_TRUE(RunAndCompare(kHloText, ErrorSpec{/*aabs=*/2e-3, /*arel=*/2e-3}));
 }
 
-TEST_F(TritonGemmLevel2Test, FuseTransposeWithoutMixedTypes) {
-  const std::string kHloText = R"(
-ENTRY e {
-  p1 = f16[150,32,60]{2,1,0} parameter(1)
-  p0 = f16[75,2,26,60]{3,2,1,0} parameter(0)
-  t = f16[75,2,60,26]{3,2,1,0} transpose(p0), dimensions={0,1,3,2}
-  r = f16[150,60,26]{2,1,0} reshape(t)
-  ROOT tmp_4 = f16[150,32,26]{2,1,0} dot(p1, r),
-    lhs_batch_dims={0}, lhs_contracting_dims={2},
-    rhs_batch_dims={0}, rhs_contracting_dims={1}
-})";
-
-  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
-                          GetOptimizedModule(kHloText));
-  EXPECT_THAT(
-      module->entry_computation()->root_instruction(),
-      GmockMatch(m::Fusion(m::Parameter(), m::Parameter())
-                     .WithFusionKind(HloInstruction::FusionKind::kCustom)));
-
-  EXPECT_TRUE(RunAndCompare(kHloText, ErrorSpec{/*aabs=*/1e-3, /*arel=*/1e-3}));
-}
-
 TEST_F(TritonGemmTest, SineOutputIsNotFused) {
   const std::string kHloText = R"(
 HloModule m

From 712acc54fb2dab395e6cd80316206d53647ae9bf Mon Sep 17 00:00:00 2001
From: Luke Boyer <lukeboyer@google.com>
Date: Thu, 21 Sep 2023 12:45:11 -0700
Subject: [PATCH 100/567] Add helper to parse element type from variant. Useful
 when ops don't support the helper query for retrieving underlying type.

PiperOrigin-RevId: 567390216
---
 .../mlir/lite/transforms/legalize_tensorlist.cc | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/tensorflow/compiler/mlir/lite/transforms/legalize_tensorlist.cc b/tensorflow/compiler/mlir/lite/transforms/legalize_tensorlist.cc
index 498009806090bc..820021284b833b 100644
--- a/tensorflow/compiler/mlir/lite/transforms/legalize_tensorlist.cc
+++ b/tensorflow/compiler/mlir/lite/transforms/legalize_tensorlist.cc
@@ -18,14 +18,17 @@ limitations under the License.
 #include <utility>
 #include <vector>
 
+#include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/Support/Casting.h"
 #include "llvm/Support/raw_ostream.h"
 #include "mlir/IR/Builders.h"  // from @llvm-project
 #include "mlir/IR/BuiltinAttributes.h"  // from @llvm-project
+#include "mlir/IR/BuiltinTypes.h"  // from @llvm-project
 #include "mlir/IR/MLIRContext.h"  // from @llvm-project
 #include "mlir/IR/Operation.h"  // from @llvm-project
 #include "mlir/IR/PatternMatch.h"  // from @llvm-project
+#include "mlir/IR/TypeUtilities.h"  // from @llvm-project
 #include "mlir/IR/Visitors.h"  // from @llvm-project
 #include "mlir/Pass/Pass.h"  // from @llvm-project
 #include "mlir/Transforms/DialectConversion.h"  // from @llvm-project
@@ -35,6 +38,7 @@ limitations under the License.
 #include "tensorflow/compiler/mlir/lite/utils/convert_type.h"
 #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h"
 #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops_n_z.h"
+#include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h"
 #include "tensorflow/lite/schema/schema_generated.h"
 
 namespace {
@@ -53,6 +57,19 @@ mlir::TFL::ConstBytesAttr CreateListReserveOptions(
   return SerializeOptionsToBytes(context, options);
 }
 
+std::optional<mlir::Type> GetSingularVariantBaseType(mlir::Value val) {
+  auto val_t =
+      mlir::getElementTypeOrSelf(val).dyn_cast_or_null<mlir::TF::VariantType>();
+  if (!val_t) {
+    return std::nullopt;
+  }
+  llvm::ArrayRef<mlir::TensorType> subtypes = val_t.getSubtypes();
+  if (subtypes.size() != 1) {
+    return std::nullopt;
+  }
+  return subtypes[0].getElementType();
+}  // NOLINT: TODO(b/257472333) This function will be used in child changes.
+
 }  // namespace
 
 // Create an `mlir::TFL::ConstBytesAttr` which encodes the options

From b893c461189c8a44ed850c10d5ba34b6a21704c2 Mon Sep 17 00:00:00 2001
From: Eugene Zhulenev <ezhulenev@google.com>
Date: Thu, 21 Sep 2023 13:13:43 -0700
Subject: [PATCH 101/567] [xla:gpu] Depend on stream_executor implementation in
 hlo_graph_dumper

PiperOrigin-RevId: 567398096
---
 third_party/xla/xla/service/BUILD | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/third_party/xla/xla/service/BUILD b/third_party/xla/xla/service/BUILD
index 04003d8f96c093..65c42edbbfcaad 100644
--- a/third_party/xla/xla/service/BUILD
+++ b/third_party/xla/xla/service/BUILD
@@ -5272,7 +5272,7 @@ cc_library(
         "//xla/hlo/ir:hlo",
         "//xla/service/gpu:backend_configs_cc",
         "//xla/service/gpu:cublas_cudnn",
-        "//xla/stream_executor:stream_executor_headers",
+        "//xla/stream_executor",
         "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/container:flat_hash_set",
         "@com_google_absl//absl/strings",

From d844b5b0311a1718060ccd1c98720e189d038136 Mon Sep 17 00:00:00 2001
From: Fiona Lang <flang@google.com>
Date: Thu, 21 Sep 2023 13:23:19 -0700
Subject: [PATCH 102/567] Create a python/ops/distributions init file and
 delete the distributions import in the python init file.

PiperOrigin-RevId: 567400961
---
 tensorflow/python/BUILD                         |  2 +-
 tensorflow/python/__init__.py                   |  4 ----
 tensorflow/python/modules_with_exports.py       |  3 +++
 tensorflow/python/ops/distributions/BUILD       | 11 ++++++++++-
 tensorflow/python/ops/distributions/__init__.py | 16 ++++++++++++++++
 5 files changed, 30 insertions(+), 6 deletions(-)

diff --git a/tensorflow/python/BUILD b/tensorflow/python/BUILD
index 2fea59e32f411e..4f7b1427e37ce1 100644
--- a/tensorflow/python/BUILD
+++ b/tensorflow/python/BUILD
@@ -83,7 +83,6 @@ py_strict_library(
         "//tensorflow/python/estimator:estimator_py",
         "//tensorflow/python/ops:gradient_checker_v2",
         "//tensorflow/python/ops:stateful_random_ops",
-        "//tensorflow/python/ops/distributions",
         "//tensorflow/python/ops/structured:structured_ops",
         "//tensorflow/python/tpu:tpu_estimator",
         "//tensorflow/python/tpu:tpu_noestimator",
@@ -411,6 +410,7 @@ py_strict_library(
         "//tensorflow/python/ops:tpu_ops_gen",
         "//tensorflow/python/ops:uniform_quant_ops_gen",
         "//tensorflow/python/ops:while_v2",
+        "//tensorflow/python/ops/distributions",
         "//tensorflow/python/ops/linalg",
         "//tensorflow/python/ops/linalg/sparse:sparse_py",
         "//tensorflow/python/ops/losses",
diff --git a/tensorflow/python/__init__.py b/tensorflow/python/__init__.py
index cc24e60c698a5f..00ad7e3138798d 100644
--- a/tensorflow/python/__init__.py
+++ b/tensorflow/python/__init__.py
@@ -48,7 +48,3 @@
 # Expose symbols minus dunders, unless they are allowlisted above.
 # This is necessary to export our dunders.
 __all__ = [s for s in dir() if s in _exported_dunders or not s.startswith('_')]
-
-# TODO(b/296442875): remove this when we remove the tf.distribution package.
-# This import is needed for tf.compat.v1.distributions.
-from tensorflow.python.ops.distributions import distributions
diff --git a/tensorflow/python/modules_with_exports.py b/tensorflow/python/modules_with_exports.py
index 6e59fc7f94424a..77bb4573e761c0 100644
--- a/tensorflow/python/modules_with_exports.py
+++ b/tensorflow/python/modules_with_exports.py
@@ -42,6 +42,9 @@
 # Data
 from tensorflow.python import data
 
+# Distributions
+from tensorflow.python.ops import distributions
+
 # TensorFlow Debugger (tfdbg).
 from tensorflow.python.debug.lib import check_numerics_callback
 from tensorflow.python.debug.lib import dumping_callback
diff --git a/tensorflow/python/ops/distributions/BUILD b/tensorflow/python/ops/distributions/BUILD
index 90790b9c071839..34cd63cb08ac8b 100644
--- a/tensorflow/python/ops/distributions/BUILD
+++ b/tensorflow/python/ops/distributions/BUILD
@@ -8,7 +8,7 @@ package(
 
 py_strict_library(
     name = "distributions",
-    srcs = ["distributions.py"],
+    srcs = ["__init__.py"],
     deprecation = ("TensorFlow Distributions has migrated to " +
                    "TensorFlow Probability " +
                    "(https://github.com/tensorflow/probability). " +
@@ -17,6 +17,15 @@ py_strict_library(
                    "early 2019. You should update all usage of " +
                    "`tf.distributions` to `tfp.distributions`."),
     srcs_version = "PY3",
+    deps = [
+        ":distributions_py",
+    ],
+)
+
+py_strict_library(
+    name = "distributions_py",
+    srcs = ["distributions.py"],
+    srcs_version = "PY3",
     deps = [
         ":bernoulli",
         ":beta",
diff --git a/tensorflow/python/ops/distributions/__init__.py b/tensorflow/python/ops/distributions/__init__.py
index e69de29bb2d1d6..8365f2485eeb95 100644
--- a/tensorflow/python/ops/distributions/__init__.py
+++ b/tensorflow/python/ops/distributions/__init__.py
@@ -0,0 +1,16 @@
+# Copyright 2023 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Core module for TensorFlow distribution objects and helpers."""
+from tensorflow.python.ops.distributions import distributions

From 307cfab7f3192db349d028b87a382a5b0328c647 Mon Sep 17 00:00:00 2001
From: Luke Boyer <lukeboyer@google.com>
Date: Thu, 21 Sep 2023 14:12:25 -0700
Subject: [PATCH 103/567] TensorListLength kernel implementation.

PiperOrigin-RevId: 567415547
---
 tensorflow/lite/kernels/variants/BUILD        | 15 ++++
 .../variants/list_kernels/list_length.cc      | 74 +++++++++++++++++++
 .../variants/list_kernels/list_length_test.cc | 68 +++++++++++++++++
 .../lite/kernels/variants/list_ops_lib.h      |  2 +
 .../variants/list_ops_subgraph_test.cc        | 34 +++++++++
 .../variants/list_ops_subgraph_test_util.cc   | 58 +++++++++++++++
 .../variants/list_ops_subgraph_test_util.h    |  3 +
 .../kernels/variants/register_list_ops.cc     |  1 +
 8 files changed, 255 insertions(+)
 create mode 100644 tensorflow/lite/kernels/variants/list_kernels/list_length.cc
 create mode 100644 tensorflow/lite/kernels/variants/list_kernels/list_length_test.cc

diff --git a/tensorflow/lite/kernels/variants/BUILD b/tensorflow/lite/kernels/variants/BUILD
index 27abbe16721a1e..55fd1a30ac1c5a 100644
--- a/tensorflow/lite/kernels/variants/BUILD
+++ b/tensorflow/lite/kernels/variants/BUILD
@@ -127,6 +127,21 @@ cc_test(
     ],
 )
 
+cc_test(
+    name = "list_length_test",
+    srcs = ["list_kernels/list_length_test.cc"],
+    deps = [
+        ":list_ops_lib",
+        ":tensor_array",
+        ":test_util",
+        "//tensorflow/lite/core/c:c_api_types",
+        "//tensorflow/lite/core/c:common",
+        "//tensorflow/lite/kernels:test_util",
+        "//tensorflow/lite/schema:schema_fbs",
+        "@com_google_googletest//:gtest_main",
+    ],
+)
+
 cc_library(
     name = "list_ops_util",
     srcs = ["list_ops_util.cc"],
diff --git a/tensorflow/lite/kernels/variants/list_kernels/list_length.cc b/tensorflow/lite/kernels/variants/list_kernels/list_length.cc
new file mode 100644
index 00000000000000..612e626fa5d6f4
--- /dev/null
+++ b/tensorflow/lite/kernels/variants/list_kernels/list_length.cc
@@ -0,0 +1,74 @@
+/* Copyright 2023 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/core/c/c_api_types.h"
+#include "tensorflow/lite/core/c/common.h"
+#include "tensorflow/lite/kernels/kernel_util.h"
+#include "tensorflow/lite/kernels/variants/list_ops_lib.h"
+#include "tensorflow/lite/kernels/variants/tensor_array.h"
+
+namespace tflite {
+namespace variants {
+namespace ops {
+namespace list_length {
+namespace {
+
+using ::tflite::variants::TensorArray;
+
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+  TF_LITE_ENSURE_EQ(context, NumInputs(node), 1);
+  TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
+
+  const TfLiteTensor* list_input;
+  TF_LITE_ENSURE_OK(context, GetInputSafe(context, node, 0, &list_input));
+  TF_LITE_ENSURE_TYPES_EQ(context, list_input->type, kTfLiteVariant);
+
+  TfLiteTensor* output;
+  TF_LITE_ENSURE_OK(context, GetOutputSafe(context, node, 0, &output));
+  TF_LITE_ENSURE_TYPES_EQ(context, output->type, kTfLiteInt32);
+  TF_LITE_ENSURE_EQ(context, output->dims->size, 0);
+
+  output->allocation_type = kTfLiteArenaRw;
+
+  return kTfLiteOk;
+}
+
+TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
+  const TfLiteTensor* list_input;
+  TF_LITE_ENSURE_OK(context, GetInputSafe(context, node, 0, &list_input));
+  TF_LITE_ENSURE_EQ(context, list_input->allocation_type, kTfLiteVariantObject);
+  const TensorArray* const input_arr =
+      reinterpret_cast<TensorArray*>(list_input->data.data);
+
+  TfLiteTensor* output;
+  TF_LITE_ENSURE_OK(context, GetOutputSafe(context, node, 0, &output));
+
+  const int length = input_arr->NumElements();
+  output->data.i32[0] = length;
+
+  return kTfLiteOk;
+}
+}  // namespace
+}  // namespace list_length
+
+TfLiteRegistration* Register_LIST_LENGTH() {
+  static TfLiteRegistration r = {nullptr, nullptr, list_length::Prepare,
+                                 list_length::Eval};
+  return &r;
+}
+
+}  // namespace ops
+}  // namespace variants
+}  // namespace tflite
diff --git a/tensorflow/lite/kernels/variants/list_kernels/list_length_test.cc b/tensorflow/lite/kernels/variants/list_kernels/list_length_test.cc
new file mode 100644
index 00000000000000..ac1f49a9e52f41
--- /dev/null
+++ b/tensorflow/lite/kernels/variants/list_kernels/list_length_test.cc
@@ -0,0 +1,68 @@
+/* Copyright 2023 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <vector>
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+#include "tensorflow/lite/core/c/c_api_types.h"
+#include "tensorflow/lite/core/c/common.h"
+#include "tensorflow/lite/kernels/test_util.h"
+#include "tensorflow/lite/kernels/variants/list_kernels/test_util.h"
+#include "tensorflow/lite/kernels/variants/list_ops_lib.h"
+#include "tensorflow/lite/schema/schema_generated.h"
+
+namespace tflite {
+namespace variants {
+namespace ops {
+namespace {
+
+class ListLengthModel : public ListOpModel {
+ public:
+  ListLengthModel() {
+    list_input_ = AddInput({TensorType_VARIANT, {}});
+    length_output_ = AddOutput({TensorType_INT32, {}});
+    SetCustomOp("ListLength", {}, Register_LIST_LENGTH);
+    BuildInterpreter({{}});
+  }
+  const TfLiteTensor* GetOutputTensor() {
+    return interpreter_->tensor(length_output_);
+  }
+  int list_input_;
+  int length_output_;
+};
+
+class ListLengthTest : public ::testing::TestWithParam<int> {};
+
+TEST_P(ListLengthTest, OutputIsListLength) {
+  const int length = GetParam();
+  ListLengthModel m;
+  m.PopulateListTensor(m.list_input_, {2, 2}, length, kTfLiteInt32);
+
+  ASSERT_EQ(m.Invoke(), kTfLiteOk);
+
+  const TfLiteTensor* const output = m.GetOutputTensor();
+  ASSERT_EQ(output->type, kTfLiteInt32);
+  ASSERT_EQ(output->allocation_type, kTfLiteArenaRw);
+  ASSERT_THAT(output, DimsAre({}));
+  ASSERT_EQ(output->data.i32[0], length);
+}
+
+INSTANTIATE_TEST_SUITE_P(ListLengthTests, ListLengthTest,
+                         testing::Values(0, 1, 5, 10, 100));
+
+}  // namespace
+}  // namespace ops
+}  // namespace variants
+}  // namespace tflite
diff --git a/tensorflow/lite/kernels/variants/list_ops_lib.h b/tensorflow/lite/kernels/variants/list_ops_lib.h
index eae6e88a87a1f3..4bf1b6038661ed 100644
--- a/tensorflow/lite/kernels/variants/list_ops_lib.h
+++ b/tensorflow/lite/kernels/variants/list_ops_lib.h
@@ -39,6 +39,8 @@ TfLiteRegistration* Register_LIST_FROM_TENSOR();
 
 TfLiteRegistration* Register_LIST_GET_ITEM();
 
+TfLiteRegistration* Register_LIST_LENGTH();
+
 }  // namespace ops
 }  // namespace variants
 }  // namespace tflite
diff --git a/tensorflow/lite/kernels/variants/list_ops_subgraph_test.cc b/tensorflow/lite/kernels/variants/list_ops_subgraph_test.cc
index 888574cc87212f..ee12e45ea3871d 100644
--- a/tensorflow/lite/kernels/variants/list_ops_subgraph_test.cc
+++ b/tensorflow/lite/kernels/variants/list_ops_subgraph_test.cc
@@ -287,5 +287,39 @@ TEST_F(WhileIncrementListOpsTest,
   }
 }
 
+class ListReserveLengthSubgraphTest
+    : public ListOpsSubgraphTest,
+      public ::testing::WithParamInterface<int> {};
+
+TEST_P(ListReserveLengthSubgraphTest, InterpreterOutputsListLength) {
+  const int length = GetParam();
+
+  builder_.BuildReserveLengthSubgraph(&interpreter_.primary_subgraph());
+
+  ASSERT_EQ(interpreter_.ResizeInputTensor(0, {1}), kTfLiteOk);
+  ASSERT_EQ(interpreter_.ResizeInputTensor(1, {}), kTfLiteOk);
+  ASSERT_EQ(interpreter_.AllocateTensors(), kTfLiteOk);
+
+  TfLiteTensor* element_shape = interpreter_.input_tensor(0);
+  element_shape->data.i32[0] = 2;
+
+  TfLiteTensor* num_elements = interpreter_.input_tensor(1);
+  num_elements->data.i32[0] = length;
+
+  ASSERT_EQ(interpreter_.Invoke(), kTfLiteOk);
+
+  TfLiteTensor* output = interpreter_.output_tensor(0);
+  ASSERT_EQ(output->type, kTfLiteInt32);
+  ASSERT_EQ(output->allocation_type, kTfLiteArenaRw);
+  ASSERT_THAT(output, DimsAre({}));
+  ASSERT_TRUE(output->data.data != nullptr);
+
+  ASSERT_EQ(output->data.i32[0], length);
+}
+
+INSTANTIATE_TEST_SUITE_P(ListOpsSubgraphParamTests,
+                         ListReserveLengthSubgraphTest,
+                         testing::Values(0, 1, 2, 5, 10));
+
 }  // namespace
 }  // namespace tflite
diff --git a/tensorflow/lite/kernels/variants/list_ops_subgraph_test_util.cc b/tensorflow/lite/kernels/variants/list_ops_subgraph_test_util.cc
index 6090741e631eb5..536e1446588a78 100644
--- a/tensorflow/lite/kernels/variants/list_ops_subgraph_test_util.cc
+++ b/tensorflow/lite/kernels/variants/list_ops_subgraph_test_util.cc
@@ -35,6 +35,7 @@ limitations under the License.
 
 using ::tflite::subgraph_test_util::SetupTensor;
 using ::tflite::variants::detail::ListReserveOptions;
+using ::tflite::variants::ops::Register_LIST_LENGTH;
 using ::tflite::variants::ops::Register_LIST_RESERVE;
 using ::tflite::variants::ops::Register_LIST_SET_ITEM;
 using ::tflite::variants::ops::Register_LIST_STACK;
@@ -344,4 +345,61 @@ void ListOpsSubgraphBuilder::BuildSetItemAndIncrementSubgraph(
   TF_LITE_ASSERT_EQ(add_stat, kTfLiteOk);
 }
 
+void ListOpsSubgraphBuilder::BuildReserveLengthSubgraph(Subgraph* subgraph) {
+  constexpr int kElementShape = 0;
+  constexpr int kNumElements = 1;
+  constexpr int kReserveOut = 2;
+  constexpr int kLengthOut = 3;
+  constexpr int kTensorCount = 4;
+  // kElementShape(0) --> +-------------+
+  //                      | ListReserve |
+  // kNumElements(1)  --> +-------------+ --> kReserveOut(2)
+  //                                                |
+  //                                          +------------+
+  //                                          | ListLength |
+  //                                          +------------+ --> kLengthOut(3)
+
+  int first_new_tensor_index;
+  TF_LITE_ASSERT_EQ(subgraph->AddTensors(kTensorCount, &first_new_tensor_index),
+                    kTfLiteOk);
+  TF_LITE_ASSERT_EQ(first_new_tensor_index, 0);
+
+  TF_LITE_ASSERT_EQ(subgraph->SetOutputs({kLengthOut}), kTfLiteOk);
+  SetupTensor(subgraph, kLengthOut, kTfLiteInt32);
+
+  TF_LITE_ASSERT_EQ(subgraph->SetInputs({kElementShape, kNumElements}),
+                    kTfLiteOk);
+  SetupTensor(subgraph, kElementShape, kTfLiteInt32);
+  SetupTensor(subgraph, kNumElements, kTfLiteInt32);
+  SetupTensor(subgraph, kReserveOut, kTfLiteVariant);
+
+  TfLiteRegistration* reserve_reg = Register_LIST_RESERVE();
+  reserve_reg->builtin_code = BuiltinOperator_CUSTOM;
+  reserve_reg->custom_name = "ListReserve";
+
+  ListReserveOptions* options = RequestReserveOptions(TensorType_INT32);
+
+  int reserve_node_index;
+  TfLiteStatus stat = subgraph->AddNodeWithParameters(
+      {kElementShape, kNumElements}, {kReserveOut},
+      /*intermediates=*/{}, reinterpret_cast<const char*>(options),
+      sizeof(ListReserveOptions),
+      /*builtin_data=*/nullptr, reserve_reg, &reserve_node_index);
+
+  TF_LITE_ASSERT_EQ(stat, kTfLiteOk);
+
+  TfLiteRegistration* length_reg = Register_LIST_LENGTH();
+  length_reg->builtin_code = BuiltinOperator_CUSTOM;
+  length_reg->custom_name = "ListLength";
+
+  int length_node_index;
+  stat = subgraph->AddNodeWithParameters(
+      {kReserveOut}, {kLengthOut},
+      /*intermediates=*/{}, /*init_data=*/nullptr,
+      /*init_data_size=*/0,
+      /*builtin_data=*/nullptr, length_reg, &length_node_index);
+
+  TF_LITE_ASSERT_EQ(stat, kTfLiteOk);
+}
+
 }  // namespace tflite
diff --git a/tensorflow/lite/kernels/variants/list_ops_subgraph_test_util.h b/tensorflow/lite/kernels/variants/list_ops_subgraph_test_util.h
index d4e469e6e53526..720a1461bc392d 100644
--- a/tensorflow/lite/kernels/variants/list_ops_subgraph_test_util.h
+++ b/tensorflow/lite/kernels/variants/list_ops_subgraph_test_util.h
@@ -56,6 +56,9 @@ class ListOpsSubgraphBuilder {
   // the given int by 1.
   void BuildSetItemAndIncrementSubgraph(Subgraph* subgraph);
 
+  // Populates the given Subgraph with a "ListReserve" and "ListLength" op.
+  void BuildReserveLengthSubgraph(Subgraph* subgraph);
+
  private:
   // Creates a constant tensor in given Subgraphs at given indice with
   // corresponding data.
diff --git a/tensorflow/lite/kernels/variants/register_list_ops.cc b/tensorflow/lite/kernels/variants/register_list_ops.cc
index 606ef9afc9a1c0..6c9f205377b0c5 100644
--- a/tensorflow/lite/kernels/variants/register_list_ops.cc
+++ b/tensorflow/lite/kernels/variants/register_list_ops.cc
@@ -27,6 +27,7 @@ void RegisterListOps(MutableOpResolver* resolver) {
   resolver->AddCustom("TensorListSetItem", Register_LIST_SET_ITEM());
   resolver->AddCustom("TensorListFromTensor", Register_LIST_FROM_TENSOR());
   resolver->AddCustom("TensorListGetItem", Register_LIST_GET_ITEM());
+  resolver->AddCustom("TensorListLength", Register_LIST_LENGTH());
 }
 
 }  // namespace ops

From 8c3316dfd30e0ce8bfe323d94d35c0b2edf60b98 Mon Sep 17 00:00:00 2001
From: Derek Murray <mrry@google.com>
Date: Thu, 21 Sep 2023 14:16:59 -0700
Subject: [PATCH 104/567] Replicate small constants so they don't need to be
 sent to their successors. A small constant is replicated to each of its
 successors' devices. The maximum size of a constant to be replicated is 16
 elements.

This pass is disabled by default and can be enabled with the flag replicate_small_constants.

PiperOrigin-RevId: 567416836
---
 tensorflow/core/common_runtime/BUILD          |  50 ++-
 .../replicate_constants_pass.cc               | 189 ++++++++++
 .../common_runtime/replicate_constants_pass.h |  50 +++
 .../replicate_constants_pass_test.cc          | 334 ++++++++++++++++++
 tensorflow/core/config/flag_defs.h            |   4 +
 tensorflow/core/config/flags_api_wrapper.cc   |   1 +
 tensorflow/python/flags_pybind.pyi            |   1 +
 7 files changed, 620 insertions(+), 9 deletions(-)
 create mode 100644 tensorflow/core/common_runtime/replicate_constants_pass.cc
 create mode 100644 tensorflow/core/common_runtime/replicate_constants_pass.h
 create mode 100644 tensorflow/core/common_runtime/replicate_constants_pass_test.cc

diff --git a/tensorflow/core/common_runtime/BUILD b/tensorflow/core/common_runtime/BUILD
index 4b7c5eb8b7c29d..1db3c0d12ed9f5 100644
--- a/tensorflow/core/common_runtime/BUILD
+++ b/tensorflow/core/common_runtime/BUILD
@@ -1,7 +1,9 @@
+load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda")
 load(
     "//tensorflow:tensorflow.bzl",
     "if_google",
     "if_libtpu",
+    "if_macos",
     "if_oss",
     "if_zendnn",
     "tf_cc_test",
@@ -21,25 +23,24 @@ load(
     "tf_protos_all",
     "tf_protos_grappler",
 )
-load(
-    "//tensorflow/core/platform:rules_cc.bzl",
-    "cc_library",
-)
 load(
     "//tensorflow/core/platform:build_config_root.bzl",
     "if_static",
     "tf_cuda_tests_tags",
 )
-load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda")
 load(
-    "//third_party/mkl:build_defs.bzl",
-    "if_mkl",
-    "if_mkl_ml",
+    "//tensorflow/core/platform:rules_cc.bzl",
+    "cc_library",
 )
 load(
     "//tensorflow/security/fuzzing:tf_fuzzing.bzl",
     "tf_cc_fuzz_test",
 )
+load(
+    "//third_party/mkl:build_defs.bzl",
+    "if_mkl",
+    "if_mkl_ml",
+)
 
 default_package_visibility = [
     "//tensorflow:internal",
@@ -299,6 +300,7 @@ filegroup(
         "renamed_device.h",
         "rendezvous_mgr.h",
         "rendezvous_util.h",
+        "replicate_constants_pass.h",
         "replicate_per_replica_nodes.h",
         "ring_alg.h",
         "ring_gatherer.h",
@@ -1132,6 +1134,31 @@ cc_library(
     alwayslink = 1,
 )
 
+cc_library(
+    name = "replicate_constants_pass",
+    srcs = ["replicate_constants_pass.cc"],
+    hdrs = ["replicate_constants_pass.h"],
+    copts = tf_copts(),
+    deps = [
+        ":optimization_registry",
+        "//tensorflow/core:core_cpu_base",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:portable_gif_internal",
+        "//tensorflow/core/config:flag_defs",
+        "//tensorflow/core/config:flags",
+        "//tensorflow/core/framework:node_def_util",
+        "//tensorflow/core/framework:tensor_proto_cc",
+        "//tensorflow/core/framework:tensor_shape_proto_cc",
+        "@com_google_absl//absl/container:btree",
+        "@com_google_absl//absl/status",
+        "@com_google_absl//absl/strings",
+        "@local_tsl//tsl/platform:errors",
+        "@local_tsl//tsl/platform:status",
+        "@local_tsl//tsl/platform:statusor",
+    ],
+    alwayslink = 1,
+)
+
 cc_library(
     name = "local_device",
     srcs = ["local_device.cc"],
@@ -1948,7 +1975,10 @@ tf_cuda_library(
         ":step_stats_collector",
         ":threadpool_device",
         ":threadpool_device_factory",
-    ] + if_zendnn([":zen_layout_pass"]),
+    ] + if_zendnn([":zen_layout_pass"]) + if_macos(
+        [],
+        [":replicate_constants_pass"],
+    ),
 )
 
 tf_cuda_library(
@@ -2312,6 +2342,7 @@ tf_cc_tests(
         "optimization_registry_test.cc",
         "pending_counts_test.cc",
         "placer_inspection_required_ops_utils_test.cc",
+        "replicate_constants_pass_test.cc",
         "session_test.cc",
         "threadpool_device_test.cc",
     ],
@@ -2342,6 +2373,7 @@ tf_cc_tests(
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
         "//tensorflow/core:testlib",
+        "//tensorflow/core/config:flag_defs",
         "//tensorflow/core/kernels:ops_util",
         "//tensorflow/core/nccl:collective_communicator",
         "//tensorflow/core/platform:regexp",
diff --git a/tensorflow/core/common_runtime/replicate_constants_pass.cc b/tensorflow/core/common_runtime/replicate_constants_pass.cc
new file mode 100644
index 00000000000000..73f96d66f940bb
--- /dev/null
+++ b/tensorflow/core/common_runtime/replicate_constants_pass.cc
@@ -0,0 +1,189 @@
+/* Copyright 2023 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/common_runtime/replicate_constants_pass.h"
+
+#include <algorithm>
+#include <cstdint>
+#include <limits>
+#include <string>
+#include <vector>
+
+#include "absl/container/btree_map.h"
+#include "absl/status/status.h"
+#include "absl/strings/str_cat.h"
+#include "tensorflow/core/common_runtime/optimization_registry.h"
+#include "tensorflow/core/config/flag_defs.h"
+#include "tensorflow/core/config/flags.h"
+#include "tensorflow/core/framework/node_def_util.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor.pb.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/framework/tensor_shape.pb.h"
+#include "tensorflow/core/graph/graph.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/util/device_name_utils.h"
+#include "tensorflow/core/util/dump_graph.h"
+#include "tsl/platform/errors.h"
+#include "tsl/platform/status.h"
+#include "tsl/platform/statusor.h"
+
+namespace tensorflow {
+namespace {
+
+// Maximum size constant to replicate.
+constexpr int64_t kMaxSize = 16;
+
+// Rename `node` to <original-name>/replicate/_<unique-index> via NewName().
+void SetUniqueName(Graph* graph, Node* node) {
+  node->set_name(graph->NewName(absl::StrCat(node->name(), "/replicate")));
+}
+
+// Returns true if at least one of `node`'s outgoing edges is a control edge.
+bool HasControlOut(Node* node) {
+  for (const Edge* out_edge : node->out_edges()) {
+    if (out_edge->IsControlEdge()) return true;
+  }
+  return false;
+}
+
+// Returns true if `node`'s assigned device name parses with type "CPU".
+bool HasCpuDevice(const Node* node) {
+  DeviceNameUtils::ParsedName parsed;
+  const bool parsed_ok =
+      DeviceNameUtils::ParseFullName(node->assigned_device_name(), &parsed);
+  return parsed_ok && parsed.type == "CPU";
+}
+
+// Get the CPU device on dst's host. Fails (Aborted) if dst has no device.
+Status GetDestinationCpuDevice(const Node* dst, std::string* device) {
+  if (!dst->has_assigned_device_name())
+    return absl::AbortedError(
+        absl::StrCat("Node name: ", dst->name(), " has no assigned device."));
+  return DeviceNameUtils::DeviceNameToCpuDeviceName(dst->assigned_device_name(),
+                                                    device);
+}
+
+// Collect the out edges of `node` into `device_to_edges`, keyed by the CPU
+// device on each successor's host. Propagates GetDestinationCpuDevice errors.
+Status GetSuccessorEdges(
+    Node* node,
+    absl::btree_map<std::string, std::vector<const Edge*>>& device_to_edges) {
+  for (const Edge* edge : node->out_edges()) {
+    std::string device;
+    TF_RETURN_IF_ERROR(GetDestinationCpuDevice(edge->dst(), &device));
+    // operator[] default-constructs the vector on first use, so no separate
+    // count()/insert() is needed before appending.
+    device_to_edges[device].push_back(edge);
+  }
+  return OkStatus();
+}
+
+// Replaces `node` with one copy per key of `device_to_edges`. Each copy is
+// assigned to that device and takes over the out edges recorded for it.
+void ReplicateToEachDevice(
+    Graph* graph, Node* node,
+    absl::btree_map<std::string, std::vector<const Edge*>>& device_to_edges) {
+  for (const auto& device_and_edges : device_to_edges) {
+    Node* replica = graph->CopyNode(node);
+    SetUniqueName(graph, replica);
+    replica->set_assigned_device_name(device_and_edges.first);
+    // Feed this device's successors from the replica.
+    for (const Edge* out : device_and_edges.second) {
+      graph->AddEdge(replica, out->src_output(), out->dst(), out->dst_input());
+    }
+    // Each predecessor of `node` gets a control edge to the replica.
+    for (Node* pred : node->in_nodes()) {
+      graph->AddControlEdge(pred, replica, true);
+    }
+  }
+  graph->RemoveNode(node);
+}
+
+}  // namespace
+
+Status ReplicateConstantsPass::Run(
+    const GraphOptimizationPassOptions& options) {
+  if (!flags::Global().replicate_small_constants.value()) {
+    VLOG(1) << "replicate_constants_pass not enabled";
+    return OkStatus();
+  }
+  VLOG(1) << "replicate_constants_pass will replicate constants with "
+             "number-of-elements <= "
+          << kMaxSize;
+
+  Graph* graph = options.graph->get();
+  if (VLOG_IS_ON(1)) {
+    VLOG(1) << DumpGraphToFile("before_replicate_constants_pass", *graph,
+                               options.flib_def);
+  }
+  int64_t min_skipped = std::numeric_limits<int64_t>::max();
+  int64_t max_skipped = std::numeric_limits<int64_t>::min();
+  for (Node* node : graph->nodes()) {
+    if (!node->IsConstant()) continue;
+
+    // For performance, skip when there is at most one successor.
+    if (node->out_edges().size() <= 1) continue;
+
+    // Skip if the constant has a control successor. Replicating constants with
+    // control successors would require replicating these control edges, which
+    // could result in even more message passing.
+    if (HasControlOut(node)) continue;
+
+    // Skip if the constant is too large; track the skipped sizes for logging.
+    const TensorProto* value = nullptr;
+    TF_RETURN_IF_ERROR(GetNodeAttr(node->attrs(), "value", &value));
+    TF_ASSIGN_OR_RETURN(TensorShape shape,
+                        TensorShape::BuildTensorShape(value->tensor_shape()));
+    if (shape.num_elements() > kMaxSize) {
+      min_skipped = std::min(min_skipped, shape.num_elements());
+      max_skipped = std::max(max_skipped, shape.num_elements());
+      continue;
+    }
+
+    // Skip if there is no assigned device.
+    if (!node->has_assigned_device_name()) continue;
+
+    // Skip when the original constant is not on a CPU, because it is not clear
+    // whether replicating from non-CPU to CPU is valid.
+    if (!HasCpuDevice(node)) continue;
+
+    // Collect successor edges, grouped by each successor's host CPU device.
+    absl::btree_map<std::string, std::vector<const Edge*>> device_to_edges;
+    TF_RETURN_IF_ERROR(GetSuccessorEdges(node, device_to_edges));
+
+    // Skip if all successors map to the same host CPU device.
+    if (device_to_edges.size() <= 1) continue;
+
+    // Replicate the constant to each of those CPU devices.
+    ReplicateToEachDevice(graph, node, device_to_edges);
+  }
+  if (min_skipped != std::numeric_limits<int64_t>::max()) {
+    VLOG(1) << "replicate_constants_pass skipped replicating constants with "
+               "number of elements in the range "
+            << min_skipped << " to " << max_skipped << ".";
+  }
+
+  if (VLOG_IS_ON(1)) {
+    VLOG(1) << DumpGraphToFile("after_replicate_constants_pass", *graph,
+                               options.flib_def);
+  }
+  return OkStatus();
+}
+
+REGISTER_OPTIMIZATION(OptimizationPassRegistry::POST_PLACEMENT, 3,
+                      ReplicateConstantsPass);
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/replicate_constants_pass.h b/tensorflow/core/common_runtime/replicate_constants_pass.h
new file mode 100644
index 00000000000000..b7b2f0fe98c0d2
--- /dev/null
+++ b/tensorflow/core/common_runtime/replicate_constants_pass.h
@@ -0,0 +1,50 @@
+/* Copyright 2023 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_REPLICATE_CONSTANTS_PASS_H_
+#define TENSORFLOW_CORE_COMMON_RUNTIME_REPLICATE_CONSTANTS_PASS_H_
+
+#include "tensorflow/core/common_runtime/optimization_registry.h"
+
+// Small constants are replicated to the hosts of their successors. This pass
+// only applies when there are multiple successors.
+//
+// For example, the graph:
+//   C -> {Op0, Op1, Op2, Op3}
+//   C's assigned_device is /job:tpu_host_worker/replica:0/task:0/device:CPU:0
+//   Op0's assigned_device is /job:tpu_host_worker/replica:0/task:0/device:TPU:0
+//   Op1's assigned_device is /job:tpu_host_worker/replica:0/task:0/device:TPU:1
+//   Op2's assigned_device is /job:tpu_host_worker/replica:0/task:1/device:TPU:0
+//   Op3's assigned_device is /job:tpu_host_worker/replica:0/task:1/device:TPU:1
+// is rewritten to:
+//   C0 -> {Op0, Op1}
+//   C1 -> {Op2, Op3}
+//   C0's assigned_device is /job:tpu_host_worker/replica:0/task:0/device:CPU:0
+//   C1's assigned_device is /job:tpu_host_worker/replica:0/task:1/device:CPU:0
+//   Op0's assigned_device is /job:tpu_host_worker/replica:0/task:0/device:TPU:0
+//   Op1's assigned_device is /job:tpu_host_worker/replica:0/task:0/device:TPU:1
+//   Op2's assigned_device is /job:tpu_host_worker/replica:0/task:1/device:TPU:0
+//   Op3's assigned_device is /job:tpu_host_worker/replica:0/task:1/device:TPU:1
+
+namespace tensorflow {
+
+class ReplicateConstantsPass : public GraphOptimizationPass {
+ public:
+  Status Run(const GraphOptimizationPassOptions& options) override;
+};
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_COMMON_RUNTIME_REPLICATE_CONSTANTS_PASS_H_
diff --git a/tensorflow/core/common_runtime/replicate_constants_pass_test.cc b/tensorflow/core/common_runtime/replicate_constants_pass_test.cc
new file mode 100644
index 00000000000000..dae22012bfba8b
--- /dev/null
+++ b/tensorflow/core/common_runtime/replicate_constants_pass_test.cc
@@ -0,0 +1,334 @@
+/* Copyright 2023 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/common_runtime/replicate_constants_pass.h"
+
+#include <memory>
+#include <string>
+
+#include "tensorflow/cc/framework/ops.h"
+#include "tensorflow/cc/framework/scope.h"
+#include "tensorflow/cc/ops/const_op.h"
+#include "tensorflow/cc/ops/math_ops.h"
+#include "tensorflow/core/common_runtime/optimization_registry.h"
+#include "tensorflow/core/config/flag_defs.h"
+#include "tensorflow/core/config/flags.h"
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/graph/graph.h"
+#include "tensorflow/core/platform/test.h"
+#include "tsl/lib/core/status_test_util.h"
+#include "tsl/platform/status.h"
+#include "tsl/platform/test.h"
+
+namespace tensorflow {
+
+const char kCpu0[] = "/job:tpu_host_worker/replica:0/task:0/device:CPU:0";
+const char kCpu1[] = "/job:tpu_host_worker/replica:0/task:1/device:CPU:0";
+const char kTpu00[] = "/job:tpu_host_worker/replica:0/task:0/device:TPU:0";
+const char kTpu01[] = "/job:tpu_host_worker/replica:0/task:0/device:TPU:1";
+const char kTpu10[] = "/job:tpu_host_worker/replica:0/task:1/device:TPU:0";
+const char kTpu11[] = "/job:tpu_host_worker/replica:0/task:1/device:TPU:1";
+
+// Return the node with name `name`; CHECK-fails if no such node exists.
+Node* GetNode(const Graph& graph, const std::string& name) {
+  for (Node* node : graph.nodes()) {
+    if (node->name() == name) return node;
+  }
+  CHECK(false) << "Unknown node name: " << name;
+  return nullptr;
+}
+
+// Return the first predecessor of `node`; CHECK-fails if there is none.
+Node* GetPredecessor(Node* node) {
+  auto it = node->in_nodes().begin();
+  CHECK(it != node->in_nodes().end())
+      << "No predecessor for " << node->name() << "\n";
+  return *it;
+}
+
+// Return true if `dst` is one of the out nodes of `src`.
+bool IsEdge(Node* src, Node* dst) {
+  for (Node* successor : src->out_nodes()) {
+    if (successor == dst) return true;
+  }
+  return false;
+}
+
+// Test that a small constant is replicated to each successor's device.
+TEST(ReplicateConstantsPassTest, TestSmallConstant) {
+  std::unique_ptr<Graph> graph(new Graph(OpRegistry::Global()));
+  {
+    Scope scope = Scope::NewRootScope().ExitOnError();
+    Output const0 =
+        ops::Const(scope.WithOpName("const"), 1.0f, TensorShape({}));
+    ops::Negate dst0(scope.WithOpName("dst0"), const0);
+    ops::Negate dst1(scope.WithOpName("dst1"), const0);
+    ops::Negate dst2(scope.WithOpName("dst2"), const0);
+    TF_CHECK_OK(scope.ToGraph(graph.get()));
+  }
+  GetNode(*graph, "const")->set_assigned_device_name(kCpu0);
+  GetNode(*graph, "dst0")->set_assigned_device_name(kCpu0);
+  GetNode(*graph, "dst1")->set_assigned_device_name(kCpu1);
+  GetNode(*graph, "dst2")->set_assigned_device_name(kCpu1);
+
+  // Enable the pass.
+  flags::Global().replicate_small_constants.reset(true);
+
+  GraphDef before;
+  graph->ToGraphDef(&before);
+  GraphOptimizationPassOptions options;
+  options.graph = &graph;
+  ReplicateConstantsPass pass;
+  TF_ASSERT_OK(pass.Run(options));
+  GraphDef actual;
+  graph->ToGraphDef(&actual);
+
+  Node* dst0 = GetNode(*graph, "dst0");
+  Node* dst1 = GetNode(*graph, "dst1");
+  Node* dst2 = GetNode(*graph, "dst2");
+  EXPECT_EQ(dst0->assigned_device_name(),
+            GetPredecessor(dst0)->assigned_device_name());
+  EXPECT_EQ(dst1->assigned_device_name(),
+            GetPredecessor(dst1)->assigned_device_name());
+  EXPECT_EQ(dst2->assigned_device_name(),
+            GetPredecessor(dst2)->assigned_device_name());
+}
+
+// Test that a large constant is ignored.
+TEST(ReplicateConstantsPassTest, TestLargeConstant) {
+  std::unique_ptr<Graph> graph(new Graph(OpRegistry::Global()));
+  {
+    Scope scope = Scope::NewRootScope().ExitOnError();
+    Output const0 =
+        ops::Const(scope.WithOpName("const"),
+                   {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16});
+    ops::Negate dst0(scope.WithOpName("dst0"), const0);
+    ops::Negate dst1(scope.WithOpName("dst1"), const0);
+    ops::Negate dst2(scope.WithOpName("dst2"), const0);
+    TF_CHECK_OK(scope.ToGraph(graph.get()));
+  }
+  GetNode(*graph, "const")->set_assigned_device_name(kCpu0);
+  GetNode(*graph, "dst0")->set_assigned_device_name(kCpu0);
+  GetNode(*graph, "dst1")->set_assigned_device_name(kCpu1);
+  GetNode(*graph, "dst2")->set_assigned_device_name(kCpu1);
+
+  // Enable the pass.
+  flags::Global().replicate_small_constants.reset(true);
+
+  GraphDef before;
+  graph->ToGraphDef(&before);
+  GraphOptimizationPassOptions options;
+  options.graph = &graph;
+  ReplicateConstantsPass pass;
+  TF_ASSERT_OK(pass.Run(options));
+  GraphDef actual;
+  graph->ToGraphDef(&actual);
+
+  Node* dst0 = GetNode(*graph, "dst0");
+  Node* dst1 = GetNode(*graph, "dst1");
+  Node* dst2 = GetNode(*graph, "dst2");
+  EXPECT_EQ(dst0->assigned_device_name(),
+            GetPredecessor(dst0)->assigned_device_name());
+  EXPECT_NE(dst1->assigned_device_name(),
+            GetPredecessor(dst1)->assigned_device_name());
+  EXPECT_NE(dst2->assigned_device_name(),
+            GetPredecessor(dst2)->assigned_device_name());
+}
+
+// Test that a constant with a control successor is ignored.
+TEST(ReplicateConstantsPassTest, TestControlOut) {
+  std::unique_ptr<Graph> graph(new Graph(OpRegistry::Global()));
+  {
+    Scope scope = Scope::NewRootScope().ExitOnError();
+    Output const0 =
+        ops::Const(scope.WithOpName("const0"), 1.0f, TensorShape({}));
+    Output ctrl_succ =
+        ops::Const(scope.WithOpName("ctrl_succ"), 1.0f, TensorShape({}));
+    ops::Negate dst0(scope.WithOpName("dst0"), const0);
+    ops::Negate dst1(scope.WithOpName("dst1"), const0);
+    ops::Negate dst2(scope.WithOpName("dst2"), const0);
+    TF_CHECK_OK(scope.ToGraph(graph.get()));
+  }
+  GetNode(*graph, "const0")->set_assigned_device_name(kCpu0);
+  GetNode(*graph, "ctrl_succ")->set_assigned_device_name(kCpu0);
+  GetNode(*graph, "dst0")->set_assigned_device_name(kCpu0);
+  GetNode(*graph, "dst1")->set_assigned_device_name(kCpu1);
+  GetNode(*graph, "dst2")->set_assigned_device_name(kCpu1);
+  graph->AddControlEdge(GetNode(*graph, "const0"),
+                        GetNode(*graph, "ctrl_succ"));
+
+  // Enable the pass.
+  flags::Global().replicate_small_constants.reset(true);
+
+  GraphDef before;
+  graph->ToGraphDef(&before);
+  GraphOptimizationPassOptions options;
+  options.graph = &graph;
+  ReplicateConstantsPass pass;
+  TF_ASSERT_OK(pass.Run(options));
+  GraphDef actual;
+  graph->ToGraphDef(&actual);
+
+  Node* dst0 = GetNode(*graph, "dst0");
+  Node* dst1 = GetNode(*graph, "dst1");
+  Node* dst2 = GetNode(*graph, "dst2");
+  EXPECT_EQ(dst0->assigned_device_name(),
+            GetPredecessor(dst0)->assigned_device_name());
+  EXPECT_NE(dst1->assigned_device_name(),
+            GetPredecessor(dst1)->assigned_device_name());
+  EXPECT_NE(dst2->assigned_device_name(),
+            GetPredecessor(dst2)->assigned_device_name());
+}
+
+// Test that a constant on a TPU is ignored.
+TEST(ReplicateConstantsPassTest, TestTpuConst) {
+  std::unique_ptr<Graph> graph(new Graph(OpRegistry::Global()));
+  {
+    Scope scope = Scope::NewRootScope().ExitOnError();
+    Output const0 =
+        ops::Const(scope.WithOpName("const0"), 1.0f, TensorShape({}));
+    ops::Negate dst0(scope.WithOpName("dst0"), const0);
+    ops::Negate dst1(scope.WithOpName("dst1"), const0);
+    ops::Negate dst2(scope.WithOpName("dst2"), const0);
+    TF_CHECK_OK(scope.ToGraph(graph.get()));
+  }
+  GetNode(*graph, "const0")->set_assigned_device_name(kTpu00);
+  GetNode(*graph, "dst0")->set_assigned_device_name(kTpu00);
+  GetNode(*graph, "dst1")->set_assigned_device_name(kTpu10);
+  GetNode(*graph, "dst2")->set_assigned_device_name(kTpu10);
+
+  // Enable the pass.
+  flags::Global().replicate_small_constants.reset(true);
+
+  GraphDef before;
+  graph->ToGraphDef(&before);
+  GraphOptimizationPassOptions options;
+  options.graph = &graph;
+  ReplicateConstantsPass pass;
+  TF_ASSERT_OK(pass.Run(options));
+  GraphDef actual;
+  graph->ToGraphDef(&actual);
+
+  Node* dst0 = GetNode(*graph, "dst0");
+  Node* dst1 = GetNode(*graph, "dst1");
+  Node* dst2 = GetNode(*graph, "dst2");
+  EXPECT_EQ(dst0->assigned_device_name(),
+            GetPredecessor(dst0)->assigned_device_name());
+  EXPECT_NE(dst1->assigned_device_name(),
+            GetPredecessor(dst1)->assigned_device_name());
+  EXPECT_NE(dst2->assigned_device_name(),
+            GetPredecessor(dst2)->assigned_device_name());
+}
+
+// Test that when a small and a large constant feed the same successors, only
+// the small constant is replicated.
+TEST(ReplicateConstantsPassTest, TestSmallAndLargeConstants) {
+  std::unique_ptr<Graph> graph(new Graph(OpRegistry::Global()));
+  {
+    Scope scope = Scope::NewRootScope().ExitOnError();
+    Output small = ops::Const(scope.WithOpName("small"), 1.0f, TensorShape({}));
+    Output large =
+        ops::Const(scope.WithOpName("large"),
+                   {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f,
+                    10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f, 16.0f});
+    ops::Add dst0(scope.WithOpName("dst0"), small, large);
+    ops::Add dst1(scope.WithOpName("dst1"), small, large);
+    ops::Add dst2(scope.WithOpName("dst2"), small, large);
+    TF_CHECK_OK(scope.ToGraph(graph.get()));
+  }
+  GetNode(*graph, "small")->set_assigned_device_name(kCpu0);
+  GetNode(*graph, "large")->set_assigned_device_name(kCpu0);
+  GetNode(*graph, "dst0")->set_assigned_device_name(kCpu0);
+  GetNode(*graph, "dst1")->set_assigned_device_name(kCpu1);
+  GetNode(*graph, "dst2")->set_assigned_device_name(kCpu1);
+
+  // Enable the pass.
+  flags::Global().replicate_small_constants.reset(true);
+
+  GraphOptimizationPassOptions options;
+  options.graph = &graph;
+  ReplicateConstantsPass pass;
+  TF_ASSERT_OK(pass.Run(options));
+
+  // `small` now has one replica per successor device; `large` is untouched and
+  // still feeds every successor.
+  Node* small0 = GetNode(*graph, "small/replicate/_0");
+  Node* small1 = GetNode(*graph, "small/replicate/_1");
+  Node* large = GetNode(*graph, "large");
+  Node* dst0 = GetNode(*graph, "dst0");
+  Node* dst1 = GetNode(*graph, "dst1");
+  Node* dst2 = GetNode(*graph, "dst2");
+  EXPECT_EQ(small0->assigned_device_name(), kCpu0);
+  EXPECT_EQ(small1->assigned_device_name(), kCpu1);
+  EXPECT_EQ(large->assigned_device_name(), kCpu0);
+  EXPECT_EQ(dst0->assigned_device_name(), kCpu0);
+  EXPECT_EQ(dst1->assigned_device_name(), kCpu1);
+  EXPECT_EQ(dst2->assigned_device_name(), kCpu1);
+  EXPECT_TRUE(IsEdge(small0, dst0));
+  EXPECT_TRUE(IsEdge(large, dst0));
+  EXPECT_TRUE(IsEdge(small1, dst1));
+  EXPECT_TRUE(IsEdge(large, dst1));
+  EXPECT_TRUE(IsEdge(small1, dst2));
+  EXPECT_TRUE(IsEdge(large, dst2));
+}
+
+// Test that a constant at a CPU with TPU successors is replicated to the
+// TPUs' host CPUs.
+TEST(ReplicateConstantsPassTest, TestTpuDestinations) {
+  std::unique_ptr<Graph> graph(new Graph(OpRegistry::Global()));
+  {
+    Scope scope = Scope::NewRootScope().ExitOnError();
+    Output const0 =
+        ops::Const(scope.WithOpName("const"), 1.0f, TensorShape({}));
+    ops::Negate dst00(scope.WithOpName("dst00"), const0);
+    ops::Negate dst01(scope.WithOpName("dst01"), const0);
+    ops::Negate dst10(scope.WithOpName("dst10"), const0);
+    ops::Negate dst11(scope.WithOpName("dst11"), const0);
+    TF_CHECK_OK(scope.ToGraph(graph.get()));
+  }
+  GetNode(*graph, "const")->set_assigned_device_name(kCpu0);
+  GetNode(*graph, "dst00")->set_assigned_device_name(kTpu00);
+  GetNode(*graph, "dst01")->set_assigned_device_name(kTpu01);
+  GetNode(*graph, "dst10")->set_assigned_device_name(kTpu10);
+  GetNode(*graph, "dst11")->set_assigned_device_name(kTpu11);
+
+  // Enable the pass.
+  flags::Global().replicate_small_constants.reset(true);
+
+  GraphDef before;
+  graph->ToGraphDef(&before);
+  GraphOptimizationPassOptions options;
+  options.graph = &graph;
+  ReplicateConstantsPass pass;
+  TF_ASSERT_OK(pass.Run(options));
+  GraphDef actual;
+  graph->ToGraphDef(&actual);
+
+  Node* const0 = GetNode(*graph, "const/replicate/_0");
+  Node* const1 = GetNode(*graph, "const/replicate/_1");
+  Node* dst00 = GetNode(*graph, "dst00");
+  Node* dst01 = GetNode(*graph, "dst01");
+  Node* dst10 = GetNode(*graph, "dst10");
+  Node* dst11 = GetNode(*graph, "dst11");
+  EXPECT_EQ(const0->assigned_device_name(), kCpu0);
+  EXPECT_EQ(const1->assigned_device_name(), kCpu1);
+  EXPECT_TRUE(IsEdge(const0, dst00));
+  EXPECT_TRUE(IsEdge(const0, dst01));
+  EXPECT_TRUE(IsEdge(const1, dst10));
+  EXPECT_TRUE(IsEdge(const1, dst11));
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/config/flag_defs.h b/tensorflow/core/config/flag_defs.h
index 4ab5fb4750de46..89ea6a9b73bbf1 100644
--- a/tensorflow/core/config/flag_defs.h
+++ b/tensorflow/core/config/flag_defs.h
@@ -49,6 +49,10 @@ class Flags {
   TF_DECLARE_FLAG(more_stack_traces, false,
                   "Enable experimental code that preserves and propagates "
                   "graph node stack traces in C++.");
+  TF_DECLARE_FLAG(replicate_small_constants, false,
+                  "Enable a graph optimization pass that replicates each small "
+                  "constant to its successors' devices. This can decrease "
+                  "message passing.");
   // LINT.ThenChange(//tensorflow/core/config/flags_api_wrapper.cc)
 };
 
diff --git a/tensorflow/core/config/flags_api_wrapper.cc b/tensorflow/core/config/flags_api_wrapper.cc
index 58074fb06257d1..974581e931f7ec 100644
--- a/tensorflow/core/config/flags_api_wrapper.cc
+++ b/tensorflow/core/config/flags_api_wrapper.cc
@@ -51,5 +51,6 @@ PYBIND11_MODULE(flags_pybind, m) {
   TF_PY_DECLARE_FLAG(saved_model_fingerprinting);
   TF_PY_DECLARE_FLAG(tf_shape_default_int64);
   TF_PY_DECLARE_FLAG(more_stack_traces);
+  TF_PY_DECLARE_FLAG(replicate_small_constants);
   // LINT.ThenChange(//tensorflow/core/config/flag_defs.h)
 };
diff --git a/tensorflow/python/flags_pybind.pyi b/tensorflow/python/flags_pybind.pyi
index 34b0a0c5666eb8..90aa0a7d76114b 100644
--- a/tensorflow/python/flags_pybind.pyi
+++ b/tensorflow/python/flags_pybind.pyi
@@ -24,6 +24,7 @@ class Flags:
     graph_building_optimization: Flag
     more_stack_traces: Flag
     op_building_optimization: Flag
+    replicate_small_constants: Flag
     saved_model_fingerprinting: Flag
     test_only_experiment_1: Flag
     test_only_experiment_2: Flag

From 44f18cf198b4315e55e9dfd84e35ee748e629286 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 21 Sep 2023 14:27:34 -0700
Subject: [PATCH 105/567] [XLA] Add WithSharding in pattern matcher and modify
 tests to conform to the new pattern matching format

-Add WithSharding implementation for HloInstructionPattern to match with the sharding.

PiperOrigin-RevId: 567419758
---
 third_party/xla/xla/service/gpu/BUILD         |  7 +-
 .../gpu/all_reduce_blueconnect_test.cc        |  3 -
 .../gpu/auto_sharding_gpu_compiler_test.cc    | 10 ++-
 .../gpu/scatter_slice_simplifier_test.cc      | 82 ++++++++++---------
 third_party/xla/xla/service/pattern_matcher.h | 55 +++++++++++++
 .../xla/xla/service/pattern_matcher_test.cc   | 25 ++++++
 6 files changed, 137 insertions(+), 45 deletions(-)

diff --git a/third_party/xla/xla/service/gpu/BUILD b/third_party/xla/xla/service/gpu/BUILD
index abba4f41bd9c97..c6f2f7d1ba3a08 100644
--- a/third_party/xla/xla/service/gpu/BUILD
+++ b/third_party/xla/xla/service/gpu/BUILD
@@ -2781,6 +2781,8 @@ xla_cc_test(
     deps = [
         "//xla/hlo/utils:hlo_matchers",
         "//xla/service:gpu_plugin",
+        "//xla/service:pattern_matcher",
+        "//xla/service:pattern_matcher_gmock",
         "//xla/tests:hlo_test_base",
         "//xla/tests:xla_internal_test_main",
     ],
@@ -3013,7 +3015,6 @@ xla_cc_test(
         ":all_reduce_blueconnect",
         "//xla:shape_util",
         "//xla/hlo/ir:hlo",
-        "//xla/hlo/utils:hlo_matchers",
         "//xla/service:pattern_matcher",
         "//xla/service:pattern_matcher_gmock",
         "//xla/tests:hlo_test_base",
@@ -4314,7 +4315,9 @@ xla_cc_test(
     srcs = ["scatter_slice_simplifier_test.cc"],
     deps = [
         ":scatter_slice_simplifier",
-        "//xla/hlo/utils:hlo_matchers",
+        "//xla:shape_util",
+        "//xla/service:pattern_matcher",
+        "//xla/service:pattern_matcher_gmock",
         "//xla/tests:hlo_test_base",
         "//xla/tests:xla_internal_test_main",
     ],
diff --git a/third_party/xla/xla/service/gpu/all_reduce_blueconnect_test.cc b/third_party/xla/xla/service/gpu/all_reduce_blueconnect_test.cc
index 2a7f471bc5883a..755d3413d335d6 100644
--- a/third_party/xla/xla/service/gpu/all_reduce_blueconnect_test.cc
+++ b/third_party/xla/xla/service/gpu/all_reduce_blueconnect_test.cc
@@ -20,7 +20,6 @@ limitations under the License.
 #include "xla/hlo/ir/hlo_computation.h"
 #include "xla/hlo/ir/hlo_instruction.h"
 #include "xla/hlo/ir/hlo_module.h"
-#include "xla/hlo/utils/hlo_matchers.h"
 #include "xla/service/pattern_matcher.h"
 #include "xla/service/pattern_matcher_gmock.h"
 #include "xla/shape.h"
@@ -31,9 +30,7 @@ limitations under the License.
 namespace xla {
 namespace {
 
-using ::testing::AllOf;
 using ::tsl::testing::IsOkAndHolds;
-namespace op = xla::testing::opcode_matchers;
 namespace m = ::xla::match;
 
 using AllReduceBlueConnectTest = HloTestBase;
diff --git a/third_party/xla/xla/service/gpu/auto_sharding_gpu_compiler_test.cc b/third_party/xla/xla/service/gpu/auto_sharding_gpu_compiler_test.cc
index 2c3f2a5396f76c..c410370b9700cb 100644
--- a/third_party/xla/xla/service/gpu/auto_sharding_gpu_compiler_test.cc
+++ b/third_party/xla/xla/service/gpu/auto_sharding_gpu_compiler_test.cc
@@ -16,6 +16,8 @@ limitations under the License.
 #include <memory>
 
 #include "xla/hlo/utils/hlo_matchers.h"
+#include "xla/service/pattern_matcher.h"
+#include "xla/service/pattern_matcher_gmock.h"
 #include "xla/tests/hlo_test_base.h"
 
 namespace xla {
@@ -23,6 +25,7 @@ namespace gpu {
 namespace {
 
 namespace op = xla::testing::opcode_matchers;
+namespace m = ::xla::match;
 
 class AutoShardingTest : public HloTestBase {
  protected:
@@ -34,7 +37,7 @@ ENTRY matmul {
   ROOT root = f32[32,128]{1,0} dot(parameter.1, parameter.2), lhs_contracting_dims={1}, rhs_contracting_dims={0}
 })";
   std::unique_ptr<HloModule> CompileMatMul(bool use_autosharding,
-                                       int num_partitions) {
+                                           int num_partitions) {
     HloModuleConfig config;
     config.set_use_spmd_partitioning(true);
     config.set_use_auto_spmd_partitioning(use_autosharding);
@@ -57,14 +60,15 @@ TEST_F(AutoShardingTest, MatMulWithAutosharding) {
   auto compiled_module = CompileMatMul(true, 4);
   auto* instruction = FindInstruction(compiled_module.get(), "param");
   VLOG(2) << instruction->ToString();
-  EXPECT_THAT(instruction, op::Sharding("{devices=[4,1]0,1,2,3}"));
+  EXPECT_THAT(instruction,
+              GmockMatch(m::Op().WithSharding("{devices=[4,1]0,1,2,3}")));
 }
 
 TEST_F(AutoShardingTest, MatMulWithoutAutosharding) {
   auto compiled_module = CompileMatMul(false, 4);
   auto* instruction = FindInstruction(compiled_module.get(), "param");
   VLOG(2) << instruction->ToString();
-  EXPECT_THAT(instruction, op::Sharding("{replicated}"));
+  EXPECT_THAT(instruction, GmockMatch(m::Op().WithSharding("{replicated}")));
 }
 
 }  // namespace
diff --git a/third_party/xla/xla/service/gpu/scatter_slice_simplifier_test.cc b/third_party/xla/xla/service/gpu/scatter_slice_simplifier_test.cc
index 8762c78783fdbf..89b43e74acf643 100644
--- a/third_party/xla/xla/service/gpu/scatter_slice_simplifier_test.cc
+++ b/third_party/xla/xla/service/gpu/scatter_slice_simplifier_test.cc
@@ -15,14 +15,16 @@ limitations under the License.
 
 #include "xla/service/gpu/scatter_slice_simplifier.h"
 
-#include "xla/hlo/utils/hlo_matchers.h"
+#include "xla/service/pattern_matcher.h"
+#include "xla/service/pattern_matcher_gmock.h"
+#include "xla/shape.h"
+#include "xla/shape_util.h"
 #include "xla/tests/hlo_test_base.h"
 
 namespace xla {
 namespace {
 
-namespace op = xla::testing::opcode_matchers;
-using ::testing::AllOf;
+namespace m = ::xla::match;
 
 using ScatterSliceSimplifierTest = HloTestBase;
 
@@ -48,9 +50,9 @@ ENTRY main {
   ScatterSliceSimplifier test_pass;
   ASSERT_TRUE(RunHloPass(&test_pass, module.get()).value());
   EXPECT_THAT(module->entry_computation()->root_instruction(),
-              AllOf(op::Shape("f32[8]"),
-                    op::Scatter(op::Slice(op::Constant()), op::Parameter(0),
-                                op::Parameter(1))));
+              GmockMatch(m::Scatter(m::Slice(m::Constant()), m::Parameter(0),
+                                    m::Parameter(1))
+                             .WithShape(F32, {8})));
 }
 
 TEST_F(ScatterSliceSimplifierTest, Scatter3D) {
@@ -75,9 +77,9 @@ ENTRY main {
   ScatterSliceSimplifier test_pass;
   ASSERT_TRUE(RunHloPass(&test_pass, module.get()).value());
   EXPECT_THAT(module->entry_computation()->root_instruction(),
-              AllOf(op::Shape("f32[4, 4, 4]"),
-                    op::Scatter(op::Slice(op::Constant()), op::Parameter(0),
-                                op::Parameter(1))));
+              GmockMatch(m::Scatter(m::Slice(m::Constant()), m::Parameter(0),
+                                    m::Parameter(1))
+                             .WithShape(F32, {4, 4, 4})));
 }
 
 TEST_F(ScatterSliceSimplifierTest, ScatterMultiOutput) {
@@ -112,12 +114,15 @@ ENTRY main {
   ScatterSliceSimplifier test_pass;
   ASSERT_TRUE(RunHloPass(&test_pass, module.get()).value());
   auto expected_scatter =
-      op::Scatter(op::Slice(op::Constant()), op::Slice(op::Constant()),
-                  op::Parameter(0), op::Parameter(1), op::Parameter(2));
+      m::Scatter(m::Slice(m::Constant()), m::Slice(m::Constant()),
+                 m::Parameter(0), m::Parameter(1), m::Parameter(2));
+
+  Shape expected_shape = ShapeUtil::MakeTupleShape(
+      {ShapeUtil::MakeShape(F32, {8}), ShapeUtil::MakeShape(F16, {8})});
   EXPECT_THAT(module->entry_computation()->root_instruction(),
-              AllOf(op::Shape("(f32[8], f16[8])"),
-                    op::Tuple(op::GetTupleElement(expected_scatter),
-                              op::GetTupleElement(expected_scatter))));
+              GmockMatch(m::Tuple(m::GetTupleElement(expected_scatter),
+                                  m::GetTupleElement(expected_scatter))
+                             .WithShapeEqualTo(&expected_shape)));
 }
 
 TEST_F(ScatterSliceSimplifierTest, NotMatching) {
@@ -211,13 +216,16 @@ ENTRY main {
                     .value();
   ScatterSliceSimplifier test_pass;
   ASSERT_TRUE(RunHloPass(&test_pass, module.get()).value());
-  auto expected_scatter = op::Scatter(op::Slice(op::Constant()),
-                                      op::Parameter(0), op::Parameter(1));
-  EXPECT_THAT(module->entry_computation()->root_instruction(),
-              AllOf(op::Shape("(f32[8], f32[8])"),
-                    op::Tuple(op::Abs(expected_scatter),
-                              op::Maximum(expected_scatter,
-                                          op::Slice(op::Constant())))));
+  auto expected_scatter =
+      m::Scatter(m::Slice(m::Constant()), m::Parameter(0), m::Parameter(1));
+
+  Shape expected_shape = ShapeUtil::MakeTupleShape(
+      {ShapeUtil::MakeShape(F32, {8}), ShapeUtil::MakeShape(F32, {8})});
+  EXPECT_THAT(
+      module->entry_computation()->root_instruction(),
+      GmockMatch(m::Tuple(m::Abs(expected_scatter),
+                          m::Maximum(expected_scatter, m::Slice(m::Constant())))
+                     .WithShapeEqualTo(&expected_shape)));
 }
 
 TEST_F(ScatterSliceSimplifierTest, IntermediaryChain) {
@@ -244,12 +252,12 @@ ENTRY main {
                     .value();
   ScatterSliceSimplifier test_pass;
   ASSERT_TRUE(RunHloPass(&test_pass, module.get()).value());
-  auto expected_scatter = op::Scatter(op::Slice(op::Constant()),
-                                      op::Parameter(0), op::Parameter(1));
-  EXPECT_THAT(
-      module->entry_computation()->root_instruction(),
-      AllOf(op::Shape("f32[8]"), op::Add(op::Abs(expected_scatter),
-                                         op::Exp(op::Abs(expected_scatter)))));
+  auto expected_scatter =
+      m::Scatter(m::Slice(m::Constant()), m::Parameter(0), m::Parameter(1));
+  EXPECT_THAT(module->entry_computation()->root_instruction(),
+              GmockMatch(m::Add(m::Abs(expected_scatter),
+                                m::Exp(m::Abs(expected_scatter)))
+                             .WithShape(F32, {8})));
 }
 
 TEST_F(ScatterSliceSimplifierTest, DiamondShape) {
@@ -283,12 +291,12 @@ ENTRY main {
   ScatterSliceSimplifier test_pass;
   ASSERT_TRUE(RunHloPass(&test_pass, module.get()).value());
   auto expected_scatter =
-      op::Scatter(op::Slice(op::Constant()), op::Slice(op::Constant()),
-                  op::Parameter(0), op::Parameter(1), op::Parameter(2));
+      m::Scatter(m::Slice(m::Constant()), m::Slice(m::Constant()),
+                 m::Parameter(0), m::Parameter(1), m::Parameter(2));
   EXPECT_THAT(module->entry_computation()->root_instruction(),
-              AllOf(op::Shape("f32[8]"),
-                    op::Add(op::GetTupleElement(expected_scatter),
-                            op::GetTupleElement(expected_scatter))));
+              GmockMatch(m::Add(m::GetTupleElement(expected_scatter),
+                                m::GetTupleElement(expected_scatter))
+                             .WithShape(F32, {8})));
 }
 
 TEST_F(ScatterSliceSimplifierTest, ElementwiseSelect) {
@@ -314,12 +322,12 @@ ENTRY main {
                     .value();
   ScatterSliceSimplifier test_pass;
   ASSERT_TRUE(RunHloPass(&test_pass, module.get()).value());
-  auto expected_scatter = op::Scatter(op::Slice(op::Constant()),
-                                      op::Parameter(0), op::Parameter(1));
+  auto expected_scatter =
+      m::Scatter(m::Slice(m::Constant()), m::Parameter(0), m::Parameter(1));
   EXPECT_THAT(module->entry_computation()->root_instruction(),
-              AllOf(op::Shape("f32[8]"),
-                    op::Select(op::Slice(op::Parameter(2)), expected_scatter,
-                               op::Slice(op::Constant()))));
+              GmockMatch(m::Select(m::Slice(m::Parameter(2)), expected_scatter,
+                                   m::Slice(m::Constant()))
+                             .WithShape(F32, {8})));
 }
 
 }  // namespace
diff --git a/third_party/xla/xla/service/pattern_matcher.h b/third_party/xla/xla/service/pattern_matcher.h
index 6ff8faf666e346..041d500006ad53 100644
--- a/third_party/xla/xla/service/pattern_matcher.h
+++ b/third_party/xla/xla/service/pattern_matcher.h
@@ -38,6 +38,7 @@ limitations under the License.
 #include "xla/hlo/ir/hlo_instruction.h"
 #include "xla/hlo/ir/hlo_instructions.h"
 #include "xla/hlo/ir/hlo_opcode.h"
+#include "xla/hlo/ir/hlo_sharding.h"
 #include "xla/layout_util.h"
 #include "xla/literal_util.h"
 #include "xla/service/hlo_parser.h"
@@ -2086,6 +2087,55 @@ class HloInstructionReplicaGroupsImpl {
   std::vector<std::vector<int64_t>> replica_groups_;
 };
 
+class HloInstructionShardingImpl {
+ public:
+  explicit HloInstructionShardingImpl(
+      const std::optional<HloSharding>& sharding)
+      : sharding_(sharding) {}
+
+  bool Match(const ::xla::HloInstruction* inst, MatchOption option) const {
+    return MatchImpl(inst, option);
+  }
+
+  bool Match(::xla::HloInstruction* inst, MatchOption option) const {
+    return MatchImpl(inst, option);
+  }
+
+  void DescribeTo(std::ostream* os, int64_t indent = 0) const {
+    if (sharding_.has_value()) {
+      *os << "with sharding " << sharding_->ToString();
+    } else {
+      *os << "with no sharding";
+    }
+  }
+
+ private:
+  template <typename HloInstructionType>
+  bool MatchImpl(HloInstructionType* inst, MatchOption option) const {
+    if (!sharding_.has_value()) {
+      if (!inst->has_sharding()) {
+        return true;
+      }
+      EXPLAIN << "HloInstruction is expected to have no sharding.";
+      return false;
+    }
+    if (inst->has_sharding()) {
+      if (inst->sharding() == sharding_.value()) {
+        return true;
+      }
+      EXPLAIN << "sharding " << inst->sharding().ToString()
+              << " don't match expected " << sharding_->ToString();
+      return false;
+    } else {
+      EXPLAIN << "HloInstruction has no sharding. Expected: "
+              << sharding_->ToString();
+      return false;
+    }
+  }
+
+  std::optional<HloSharding> sharding_;
+};
+
 // Matches a constant scalar or effective scalar, optionally with a given value.
 template <typename ScalarTy>
 class HloConstantScalarImpl {
@@ -2398,6 +2448,11 @@ class HloInstructionPattern {
         HloInstructionReplicaGroupsImpl(std::move(replica_groups)));
   }
 
+  auto WithSharding(absl::string_view sharding) const {
+    return AppendImpl(
+        HloInstructionShardingImpl(ParseSharding(sharding).value()));
+  }
+
   void DescribeTo(std::ostream* os, int64_t indent = 0) const {
     impl_.DescribeTo(os, indent);
   }
diff --git a/third_party/xla/xla/service/pattern_matcher_test.cc b/third_party/xla/xla/service/pattern_matcher_test.cc
index 811ac0d720be5a..4511aa99cd08ea 100644
--- a/third_party/xla/xla/service/pattern_matcher_test.cc
+++ b/third_party/xla/xla/service/pattern_matcher_test.cc
@@ -1428,5 +1428,30 @@ TEST_F(PatternMatcherTest, TestWithReplicaGroups) {
       "replica_groups={{0,1},{2,3}}, to_apply=add");
 }
 
+TEST_F(PatternMatcherTest, TestWithSharding) {
+  constexpr char kModuleStr[] = R"(
+    HloModule test_module
+    ENTRY test {
+      p0 = f32[5,7,11,13]{3,2,1,0} parameter(0),
+        sharding={devices=[1,2,2,1]0,1,2,3},
+        metadata={op_name="test"}
+      ROOT copy = f32[5,7,11,13]{3,2,1,0} copy(p0)
+    })";
+  TF_ASSERT_OK_AND_ASSIGN(auto hlo_module,
+                          ParseAndReturnVerifiedModule(kModuleStr));
+  auto* instruction = FindInstruction(hlo_module.get(), "p0");
+  EXPECT_TRUE(
+      Match(instruction, m::Op().WithSharding("{devices=[1,2,2,1]0,1,2,3}")));
+  EXPECT_FALSE(
+      Match(instruction, m::Op().WithSharding("{devices=[2,2,1,1]0,1,2,3}")));
+  EXPECT_DESC_AND_EXPLANATION(
+      instruction, m::Op().WithSharding("{devices=[2,2,1,1]0,1,2,3}"),
+      "an HloInstruction with sharding {devices=[2,2,1,1]0,1,2,3}",
+      "sharding {devices=[1,2,2,1]0,1,2,3} don't match expected "
+      "{devices=[2,2,1,1]0,1,2,3}\n"
+      "in p0 = f32[5,7,11,13]{3,2,1,0} parameter(0), "
+      "sharding={devices=[1,2,2,1]0,1,2,3}");
+}
+
 }  // namespace
 }  // namespace xla

From fb01efc24a76eb53e808737ee882888489bdf3f9 Mon Sep 17 00:00:00 2001
From: David Majnemer <majnemer@google.com>
Date: Thu, 21 Sep 2023 14:34:31 -0700
Subject: [PATCH 106/567] [XLA] Hand-roll the evaluator's sort

This allows us to not crash if the strict weak ordering requirement is not met. Also, try to detect when comparators that violate strict weak ordering are passed in.

PiperOrigin-RevId: 567421978
---
 third_party/xla/xla/BUILD                     |   2 +-
 third_party/xla/xla/comparison_util.h         |  33 +-
 third_party/xla/xla/hlo/evaluator/BUILD       |  14 +-
 .../xla/xla/hlo/evaluator/hlo_evaluator.cc    | 389 +++++++++++-------
 4 files changed, 264 insertions(+), 174 deletions(-)

diff --git a/third_party/xla/xla/BUILD b/third_party/xla/xla/BUILD
index b7ea0daba41c4e..184e312a72ede1 100644
--- a/third_party/xla/xla/BUILD
+++ b/third_party/xla/xla/BUILD
@@ -144,7 +144,7 @@ cc_library(
         ":xla_data_proto_cc",
         "@com_google_absl//absl/base:core_headers",
         "@com_google_absl//absl/container:flat_hash_map",
-        "@com_google_absl//absl/meta:type_traits",
+        "@com_google_absl//absl/log:check",
         "@com_google_absl//absl/strings",
         "@local_tsl//tsl/platform:float8",
         "@local_tsl//tsl/platform:logging",
diff --git a/third_party/xla/xla/comparison_util.h b/third_party/xla/xla/comparison_util.h
index e0e5dec1c9eec7..364e503ff7c0c1 100644
--- a/third_party/xla/xla/comparison_util.h
+++ b/third_party/xla/xla/comparison_util.h
@@ -18,15 +18,15 @@ limitations under the License.
 
 #include <cstdint>
 #include <functional>
-#include <limits>
 #include <optional>
 #include <ostream>
 #include <string>
 
-#include "absl/meta/type_traits.h"
+#include "absl/log/check.h"
 #include "absl/strings/string_view.h"
 #include "xla/primitive_util.h"
 #include "xla/statusor.h"
+#include "xla/types.h"
 #include "xla/util.h"
 #include "xla/xla_data.pb.h"
 #include "tsl/platform/logging.h"  // IWYU pragma: keep
@@ -185,28 +185,19 @@ class Comparison {
     }
   }
 
-  // Applies the comparison from this Comparison's direction and ordering for
-  // integral types.
-  template <typename T,
-            absl::enable_if_t<std::numeric_limits<T>::is_integer, int> = 0>
-  inline bool Compare(const T a, const T b) const {
-    DCHECK(primitive_util::IsCanonicalRepresentation<T>(primitive_type_));
-    return GetComparator<T>()(a, b);
-  }
-
-  // Applies the comparison from this Comparison's direction and ordering
-  // for floating point types.
-  template <typename T,
-            absl::enable_if_t<!std::numeric_limits<T>::is_integer, int> = 0>
+  template <typename T>
   inline bool Compare(const T a, const T b) const {
     DCHECK(primitive_util::IsCanonicalRepresentation<T>(primitive_type_));
-    if (IsTotalOrder()) {
-      //  -NaN < -Inf < -Finite < -0 < +0 < +Finite < +Inf < +NaN
-      // Reference:
-      // https://www.tensorflow.org/xla/operation_semantics#element-wise_comparison_operations
-      using R = SignedIntegerTypeForSizeType<sizeof(T)>;
-      return GetComparator<R>()(ToSignMagnitude(a), ToSignMagnitude(b));
+    if constexpr (is_specialized_floating_point_v<T>) {
+      if (IsTotalOrder()) {
+        //  -NaN < -Inf < -Finite < -0 < +0 < +Finite < +Inf < +NaN
+        // Reference:
+        // https://www.tensorflow.org/xla/operation_semantics#element-wise_comparison_operations
+        using R = SignedIntegerTypeForSizeType<sizeof(T)>;
+        return GetComparator<R>()(ToSignMagnitude(a), ToSignMagnitude(b));
+      }
     }
+    // Applies the comparison from this Comparison's direction and ordering.
     return GetComparator<T>()(a, b);
   }
 
diff --git a/third_party/xla/xla/hlo/evaluator/BUILD b/third_party/xla/xla/hlo/evaluator/BUILD
index 93e99dc8b357ac..7b3af5b3239d54 100644
--- a/third_party/xla/xla/hlo/evaluator/BUILD
+++ b/third_party/xla/xla/hlo/evaluator/BUILD
@@ -43,20 +43,23 @@ cc_library(
     visibility = ["//visibility:public"],
     deps = [
         "//xla:array2d",
+        "//xla:comparison_util",
         "//xla:literal",
         "//xla:literal_util",
         "//xla:shape_util",
+        "//xla:status",
         "//xla:status_macros",
         "//xla:statusor",
         "//xla:types",
         "//xla:util",
-        "//xla:window_util",
         "//xla:xla_data_proto_cc",
         "//xla/hlo/ir:hlo",
         "//xla/hlo/utils:hlo_query",
         "//xla/service:call_graph",
         "//xla/service:compilation_environments",
         "//xla/service:dynamic_dimension_inference",
+        "//xla/service:hlo_module_config",
+        "//xla/service:logical_buffer",
         "//xla/service:pattern_matcher",
         "//xla/service:shape_inference",
         "//xla/service:tuple_points_to_analysis",
@@ -68,13 +71,20 @@ cc_library(
         "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/container:inlined_vector",
         "@com_google_absl//absl/container:node_hash_map",
+        "@com_google_absl//absl/functional:function_ref",
+        "@com_google_absl//absl/memory",
+        "@com_google_absl//absl/numeric:bits",
         "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/strings:cord",
+        "@com_google_absl//absl/strings:str_format",
         "@com_google_absl//absl/types:span",
+        "@eigen_archive//:eigen3",
         "@local_tsl//tsl/lib/core:bitmap",
+        "@local_tsl//tsl/platform:env",
         "@local_tsl//tsl/platform:errors",
         "@local_tsl//tsl/platform:float8",
         "@local_tsl//tsl/platform:logging",
-        "@local_tsl//tsl/platform:protobuf",
+        "@local_tsl//tsl/platform:platform_port",
         "@local_tsl//tsl/platform:status",
         "@local_tsl//tsl/platform:statusor",
         "@local_tsl//tsl/platform:types",
diff --git a/third_party/xla/xla/hlo/evaluator/hlo_evaluator.cc b/third_party/xla/xla/hlo/evaluator/hlo_evaluator.cc
index c0530ba523e58e..29117bcfcaa82e 100644
--- a/third_party/xla/xla/hlo/evaluator/hlo_evaluator.cc
+++ b/third_party/xla/xla/hlo/evaluator/hlo_evaluator.cc
@@ -18,15 +18,18 @@ limitations under the License.
 #include <atomic>
 #include <cmath>
 #include <complex>
+#include <cstddef>
 #include <cstdint>
 #include <cstdlib>
 #include <cstring>
 #include <functional>
 #include <iterator>
+#include <limits>
 #include <memory>
+#include <numeric>
 #include <optional>
+#include <random>
 #include <string>
-#include <type_traits>
 #include <utility>
 #include <variant>
 #include <vector>
@@ -34,40 +37,54 @@ limitations under the License.
 #include "absl/algorithm/container.h"
 #include "absl/base/internal/endian.h"
 #include "absl/cleanup/cleanup.h"
+#include "absl/container/flat_hash_map.h"
 #include "absl/container/inlined_vector.h"
-#include "absl/strings/match.h"
+#include "absl/functional/function_ref.h"
+#include "absl/memory/memory.h"
+#include "absl/numeric/bits.h"
+#include "absl/strings/cord.h"
 #include "absl/strings/str_cat.h"
+#include "absl/strings/str_format.h"
 #include "absl/strings/string_view.h"
 #include "absl/types/span.h"
+#include "Eigen/Core"  // from @eigen_archive
+#include "xla/array2d.h"
+#include "xla/comparison_util.h"
 #include "xla/hlo/evaluator/hlo_evaluator_typed_visitor.h"
+#include "xla/hlo/ir/dfs_hlo_visitor_with_default.h"
 #include "xla/hlo/ir/hlo_casting_utils.h"
+#include "xla/hlo/ir/hlo_clone_context.h"
 #include "xla/hlo/ir/hlo_instruction.h"
+#include "xla/hlo/ir/hlo_instructions.h"
 #include "xla/hlo/ir/hlo_opcode.h"
 #include "xla/hlo/utils/hlo_query.h"
 #include "xla/index_util.h"
+#include "xla/layout.h"
 #include "xla/layout_util.h"
 #include "xla/literal.h"
 #include "xla/literal_util.h"
-#include "xla/map_util.h"
 #include "xla/primitive_util.h"
+#include "xla/service/call_graph.h"
 #include "xla/service/compilation_environments.h"
 #include "xla/service/cpu/runtime_single_threaded_matmul.h"
+#include "xla/service/hlo_module_config.h"
+#include "xla/service/logical_buffer.h"
 #include "xla/service/pattern_matcher.h"
 #include "xla/service/shape_inference.h"
 #include "xla/service/tuple_points_to_analysis.h"
 #include "xla/shape.h"
 #include "xla/shape_util.h"
+#include "xla/status.h"
 #include "xla/status_macros.h"
 #include "xla/statusor.h"
 #include "xla/types.h"
 #include "xla/util.h"
-#include "xla/window_util.h"
 #include "xla/xla_data.pb.h"
-#include "tsl/lib/core/bitmap.h"
+#include "tsl/platform/cpu_info.h"
+#include "tsl/platform/env.h"
 #include "tsl/platform/errors.h"
 #include "tsl/platform/float8.h"
 #include "tsl/platform/logging.h"
-#include "tsl/platform/protobuf.h"
 #include "tsl/platform/status.h"
 #include "tsl/platform/statusor.h"
 #include "tsl/platform/types.h"
@@ -79,114 +96,52 @@ namespace {
 using primitive_util::NativeTypeOf;
 
 template <typename OperandT>
-StatusOr<Literal> Compare(const Shape& shape, ComparisonDirection direction,
+StatusOr<Literal> Compare(const Shape& shape, Comparison comparison,
                           LiteralSlice lhs_literal, LiteralSlice rhs_literal) {
-  std::function<bool(OperandT, OperandT)> compare_op;
-  switch (direction) {
+  auto populate = [&](auto compare_op) -> StatusOr<Literal> {
+    Literal result(shape);
+    TF_RETURN_IF_ERROR(result.PopulateParallel<bool>(
+        [&](absl::Span<const int64_t> multi_index, int /*thread_id*/) {
+          auto lhs = lhs_literal.Get<OperandT>(multi_index);
+          auto rhs = rhs_literal.Get<OperandT>(multi_index);
+          if constexpr (is_specialized_floating_point_v<OperandT>) {
+            if (comparison.IsTotalOrder()) {
+              return compare_op(ToSignMagnitude(lhs), ToSignMagnitude(rhs));
+            }
+          }
+          return compare_op(lhs, rhs);
+        }));
+    return std::move(result);
+  };
+  switch (comparison.GetDirection()) {
     case ComparisonDirection::kEq:
-      compare_op = [](OperandT lhs_el, OperandT rhs_el) {
-        return lhs_el == rhs_el;
-      };
-      break;
+      return populate([](auto lhs, auto rhs) { return lhs == rhs; });
     case ComparisonDirection::kNe:
-      compare_op = [](OperandT lhs_el, OperandT rhs_el) {
-        return lhs_el != rhs_el;
-      };
-      break;
+      return populate([](auto lhs, auto rhs) { return lhs != rhs; });
     case ComparisonDirection::kGe:
-      compare_op = [](OperandT lhs_el, OperandT rhs_el) {
-        return lhs_el >= rhs_el;
-      };
+      if constexpr (!is_complex_v<OperandT>) {
+        return populate([](auto lhs, auto rhs) { return lhs >= rhs; });
+      }
       break;
     case ComparisonDirection::kGt:
-      compare_op = [](OperandT lhs_el, OperandT rhs_el) {
-        return lhs_el > rhs_el;
-      };
+      if constexpr (!is_complex_v<OperandT>) {
+        return populate([](auto lhs, auto rhs) { return lhs > rhs; });
+      }
       break;
     case ComparisonDirection::kLe:
-      compare_op = [](OperandT lhs_el, OperandT rhs_el) {
-        return lhs_el <= rhs_el;
-      };
+      if constexpr (!is_complex_v<OperandT>) {
+        return populate([](auto lhs, auto rhs) { return lhs <= rhs; });
+      }
       break;
     case ComparisonDirection::kLt:
-      compare_op = [](OperandT lhs_el, OperandT rhs_el) {
-        return lhs_el < rhs_el;
-      };
-      break;
-  }
-
-  Literal result(shape);
-  TF_RETURN_IF_ERROR(
-      result.Populate<bool>([&](absl::Span<const int64_t> multi_index) {
-        return compare_op(lhs_literal.Get<OperandT>(multi_index),
-                          rhs_literal.Get<OperandT>(multi_index));
-      }));
-
-  return std::move(result);
-}
-
-template <>
-StatusOr<Literal> Compare<complex64>(const Shape& shape,
-                                     ComparisonDirection direction,
-                                     LiteralSlice lhs_literal,
-                                     LiteralSlice rhs_literal) {
-  std::function<bool(complex64, complex64)> compare_op;
-  switch (direction) {
-    case ComparisonDirection::kEq:
-      compare_op = [](complex64 lhs_el, complex64 rhs_el) {
-        return lhs_el == rhs_el;
-      };
-      break;
-    case ComparisonDirection::kNe:
-      compare_op = [](complex64 lhs_el, complex64 rhs_el) {
-        return lhs_el != rhs_el;
-      };
+      if constexpr (!is_complex_v<OperandT>) {
+        return populate([](auto lhs, auto rhs) { return lhs < rhs; });
+      }
       break;
-    default:
-      LOG(FATAL) << "unhandled direction for conversion to Comparison: "
-                 << ComparisonDirectionToString(direction);
   }
 
-  Literal result(shape);
-  TF_RETURN_IF_ERROR(
-      result.Populate<bool>([&](absl::Span<const int64_t> multi_index) {
-        return compare_op(lhs_literal.Get<complex64>(multi_index),
-                          rhs_literal.Get<complex64>(multi_index));
-      }));
-
-  return std::move(result);
-}
-
-template <>
-StatusOr<Literal> Compare<complex128>(const Shape& shape,
-                                      ComparisonDirection direction,
-                                      LiteralSlice lhs_literal,
-                                      LiteralSlice rhs_literal) {
-  std::function<bool(complex128, complex128)> compare_op;
-  switch (direction) {
-    case ComparisonDirection::kEq:
-      compare_op = [](complex128 lhs_el, complex128 rhs_el) {
-        return lhs_el == rhs_el;
-      };
-      break;
-    case ComparisonDirection::kNe:
-      compare_op = [](complex128 lhs_el, complex128 rhs_el) {
-        return lhs_el != rhs_el;
-      };
-      break;
-    default:
-      LOG(FATAL) << "unhandled direction for conversion to Comparison: "
-                 << ComparisonDirectionToString(direction);
-  }
-
-  Literal result(shape);
-  TF_RETURN_IF_ERROR(
-      result.Populate<bool>([&](absl::Span<const int64_t> multi_index) {
-        return compare_op(lhs_literal.Get<complex128>(multi_index),
-                          rhs_literal.Get<complex128>(multi_index));
-      }));
-
-  return std::move(result);
+  LOG(FATAL) << "unhandled direction for conversion to Comparison: "
+             << comparison.ToString();
 }
 
 std::optional<bool> GetInstructionStaticValueAsBool(
@@ -1572,30 +1527,32 @@ Status HloEvaluator::HandleComplex(const HloInstruction* complex) {
 
 Status HloEvaluator::HandleCompare(const HloInstruction* compare) {
   ComparisonDirection direction = compare->comparison_direction();
+  ComparisonOrder order = compare->comparison_order();
   auto lhs = compare->operand(0);
   auto rhs = compare->operand(1);
   DCHECK(ShapeUtil::SameDimensions(compare->shape(), rhs->shape()) &&
          ShapeUtil::SameDimensions(lhs->shape(), rhs->shape()));
 
   TF_RET_CHECK(lhs->shape().element_type() == rhs->shape().element_type());
+  auto element_type = lhs->shape().element_type();
+  Comparison comparison(direction, element_type, order);
 
   const Literal& lhs_literal = GetEvaluatedLiteralFor(lhs);
   const Literal& rhs_literal = GetEvaluatedLiteralFor(rhs);
-
   // Note here we switch on the operand's type.
   return primitive_util::PrimitiveTypeSwitch<Status>(
       [&](auto primitive_type_constant) -> Status {
         if constexpr (primitive_util::IsArrayType(primitive_type_constant)) {
           using NativeT = primitive_util::NativeTypeOf<primitive_type_constant>;
           TF_ASSIGN_OR_RETURN(evaluated_[compare],
-                              Compare<NativeT>(compare->shape(), direction,
+                              Compare<NativeT>(compare->shape(), comparison,
                                                lhs_literal, rhs_literal));
           return OkStatus();
         }
         LOG(FATAL) << "HandleCompare: unknown primitive type: "
-                   << PrimitiveType_Name(lhs->shape().element_type());
+                   << PrimitiveType_Name(element_type);
       },
-      lhs->shape().element_type());
+      element_type);
 }
 
 Status HloEvaluator::HandleTuple(const HloInstruction* tuple) {
@@ -3969,8 +3926,180 @@ Status HloEvaluator::HandleSort(const HloInstruction* sort) {
       << "Unexpected out-of-bound sort dimension " << sort_dim
       << " accessing increment of size " << increment.size();
   increment[sort_dim] = sort_dim_elements;
-  std::unique_ptr<HloEvaluator> embedded_evaluator =
-      CreateEmbedded(max_loop_iterations_);
+
+  auto comparator = [sort](absl::Span<const Literal> literals_to_sort,
+                           int64_t a, int64_t b,
+                           HloEvaluator* embedded_evaluator) -> StatusOr<bool> {
+    absl::InlinedVector<Literal, 8> literals;
+    literals.reserve(2 * sort->operand_count());
+    for (int64_t i = 0; i < sort->operand_count(); ++i) {
+      literals.push_back(
+          LiteralUtil::GetScalarLiteral(literals_to_sort[i], {a}));
+      literals.push_back(
+          LiteralUtil::GetScalarLiteral(literals_to_sort[i], {b}));
+    }
+    absl::InlinedVector<const Literal*, 8> literal_ptrs;
+    absl::c_transform(literals, std::back_inserter(literal_ptrs),
+                      [](const Literal& literal) { return &literal; });
+
+    TF_ASSIGN_OR_RETURN(
+        auto computed_result,
+        embedded_evaluator->Evaluate(*sort->to_apply(), literal_ptrs));
+    // Clear visit states so that we can use the evaluator again
+    // on the same computation.
+    embedded_evaluator->ResetVisitStates();
+    return computed_result.Get<bool>({});
+  };
+  auto less_than = [&comparator](
+                       absl::Span<const Literal> literals_to_sort, int64_t a,
+                       int64_t b,
+                       HloEvaluator* embedded_evaluator) -> StatusOr<bool> {
+    TF_ASSIGN_OR_RETURN(bool a_is_smaller,
+                        comparator(literals_to_sort, a, b, embedded_evaluator));
+#ifndef NDEBUG
+    // Let's see if the comparator violates strict weak ordering.
+    // N.B. This does not test transitivity.
+    TF_ASSIGN_OR_RETURN(bool b_is_smaller,
+                        comparator(literals_to_sort, b, a, embedded_evaluator));
+    TF_RET_CHECK(!(b_is_smaller && a_is_smaller));
+    TF_ASSIGN_OR_RETURN(bool b_is_reflexive,
+                        comparator(literals_to_sort, b, b, embedded_evaluator));
+    TF_RET_CHECK(!b_is_reflexive);
+    TF_ASSIGN_OR_RETURN(bool a_is_reflexive,
+                        comparator(literals_to_sort, a, a, embedded_evaluator));
+    TF_RET_CHECK(!a_is_reflexive);
+#endif
+    return a_is_smaller;
+  };
+  std::function<Status(absl::Span<const Literal>, absl::Span<int64_t>,
+                       absl::Span<int64_t>, absl::Span<int64_t>,
+                       std::vector<int64_t>&, HloEvaluator*)>
+      merge = [&](absl::Span<const Literal> literals_to_sort,
+                  absl::Span<int64_t> lhs, absl::Span<int64_t> rhs,
+                  absl::Span<int64_t> output, std::vector<int64_t>& tmp,
+                  HloEvaluator* embedded_evaluator) -> Status {
+    tmp.clear();
+    tmp.reserve(output.size());
+    // Keep picking between elements.
+    while (!lhs.empty() && !rhs.empty()) {
+      // If rhs < lhs, pick rhs. Otherwise, pick lhs. This should ensure
+      // stability as lhs comes first in the array.
+      TF_ASSIGN_OR_RETURN(bool rhs_is_smaller,
+                          less_than(literals_to_sort, rhs.front(), lhs.front(),
+                                    embedded_evaluator));
+      if (rhs_is_smaller) {
+        tmp.push_back(rhs.front());
+        rhs.remove_prefix(1);
+      } else {
+        tmp.push_back(lhs.front());
+        lhs.remove_prefix(1);
+      }
+    }
+    // At least one of the two input arrays is now empty; we need to copy
+    // the remaining elements.
+    absl::c_copy(lhs, std::back_inserter(tmp));
+    absl::c_copy(rhs, std::back_inserter(tmp));
+    absl::c_copy(tmp, output.begin());
+    return OkStatus();
+  };
+  auto* env = tsl::Env::Default();
+  const int max_parallelism = tsl::port::MaxParallelism();
+  constexpr size_t kMinElementsPerThread{1024};
+  const size_t useful_parallelism = std::min<size_t>(
+      sort_dim_elements / kMinElementsPerThread, max_parallelism);
+  const size_t work_per_thread = useful_parallelism > 1
+                                     ? sort_dim_elements / useful_parallelism
+                                     : std::numeric_limits<size_t>::max();
+  std::function<Status(absl::Span<const Literal>, absl::Span<int64_t>,
+                       std::vector<int64_t>*, HloEvaluator*)>
+      mergesort = [&merge, &mergesort, &less_than, this, env, work_per_thread](
+                      absl::Span<const Literal> literals_to_sort,
+                      absl::Span<int64_t> to_sort,
+                      std::vector<int64_t>* scratch,
+                      HloEvaluator* embedded_evaluator) -> Status {
+    // Base case: inputs with 0 or 1 elements are already sorted.
+    if (to_sort.size() < 2) {
+      return OkStatus();
+    }
+    size_t halfway = to_sort.size() / 2;
+    auto lhs = to_sort.subspan(/*pos=*/0, halfway);
+    auto rhs = to_sort.subspan(/*pos=*/halfway);
+
+    // Allocate an evaluator if we never got one; we will reuse an
+    // evaluator so long as we are not moving it between threads.
+    std::unique_ptr<HloEvaluator> thread_local_embedded_evaluator;
+    if (embedded_evaluator == nullptr) {
+      thread_local_embedded_evaluator = CreateEmbedded(max_loop_iterations_);
+      embedded_evaluator = thread_local_embedded_evaluator.get();
+    }
+
+    constexpr size_t kMinElementsForMergesort{9};
+    if (to_sort.size() >= kMinElementsForMergesort) {
+      std::unique_ptr<std::vector<int64_t>> thread_local_scratch;
+      if (!scratch) {
+        thread_local_scratch = std::make_unique<std::vector<int64_t>>();
+        scratch = thread_local_scratch.get();
+      }
+      // Overlap sorting the LHS with the RHS if we have enough work to
+      // do. The recursive call to `mergesort(rhs)` will potentially
+      // create more threads.
+      Status lhs_status;
+      if (to_sort.size() >= work_per_thread) {
+        std::unique_ptr<tsl::Thread> thread = absl::WrapUnique(env->StartThread(
+            tsl::ThreadOptions(), "XLA_mergesort",
+            [literals_to_sort, lhs, &mergesort, &lhs_status] {
+              lhs_status = mergesort(literals_to_sort, lhs, nullptr, nullptr);
+            }));
+        TF_RETURN_IF_ERROR(
+            mergesort(literals_to_sort, rhs, scratch, embedded_evaluator));
+        // Here, `thread` will run its destructor ensuring that it is done
+        // sorting `lhs`.
+        thread.reset();
+      } else {
+        TF_RETURN_IF_ERROR(
+            mergesort(literals_to_sort, rhs, scratch, embedded_evaluator));
+        lhs_status =
+            mergesort(literals_to_sort, lhs, scratch, embedded_evaluator);
+      }
+      TF_RETURN_IF_ERROR(lhs_status);
+      TF_RETURN_IF_ERROR(merge(literals_to_sort, lhs, rhs, to_sort, *scratch,
+                               embedded_evaluator));
+    } else {
+      // Do an insertion sort. Values to the left of `i` are sorted.
+      // Any values larger than `i` will be moved past it. Binary
+      // search in [0, i) looking for the smallest value larger than `i`
+      // which we will call `ub`. By induction, [ub, i) are all larger
+      // than `i`.
+      for (auto i = to_sort.begin(); i != to_sort.end(); ++i) {
+        auto len = i - to_sort.begin();
+        auto ub = to_sort.begin();
+        auto needle = *i;
+        while (len != 0) {
+          auto half_len = len / 2;
+          auto midpoint = ub + half_len;
+          TF_ASSIGN_OR_RETURN(bool is_smaller,
+                              less_than(literals_to_sort, needle, *midpoint,
+                                        embedded_evaluator));
+          if (is_smaller) {
+            // Our needle is smaller than the midpoint, we need to shrink
+            // the range by trimming the rightmost portion of it. We can't
+            // exclude the midpoint value yet.
+            len = half_len;
+          } else {
+            // Our needle is at least as big as the midpoint but we want
+            // something larger, we can exclude the midpoint.
+            ub = midpoint + 1;
+            len -= half_len + 1;
+          }
+        }
+        // Shift values larger than `i` to the right by 1 and insert `i`
+        // in the new gap. Now the sorted range is [0, i].
+        std::rotate(ub, i, i + 1);
+      }
+    }
+    return OkStatus();
+  };
+
   // Iterate through each dimension except 'sort_dim'.
   TF_RETURN_IF_ERROR(ShapeUtil::ForEachIndexWithStatus(
       key_shape, zero_base, key_shape.dimensions(), increment,
@@ -3991,49 +4120,9 @@ Status HloEvaluator::HandleSort(const HloInstruction* sort) {
         }
         std::vector<int64_t> indices_to_sort(sort_dim_elements);
         std::iota(indices_to_sort.begin(), indices_to_sort.end(), 0);
-        Status compare_status = OkStatus();
-        auto comparator = [sort, &compare_status,
-                           embedded_evaluator = embedded_evaluator.get(),
-                           &literals_to_sort](int64_t a, int64_t b) {
-          std::vector<Literal> literals;
-          literals.reserve(2 * sort->operand_count());
-          for (int64_t i = 0; i < sort->operand_count(); ++i) {
-            literals.push_back(
-                LiteralUtil::GetScalarLiteral(literals_to_sort[i], {a}));
-            literals.push_back(
-                LiteralUtil::GetScalarLiteral(literals_to_sort[i], {b}));
-          }
-          std::vector<const Literal*> literal_ptrs;
-          absl::c_transform(literals, std::back_inserter(literal_ptrs),
-                            [](const Literal& literal) { return &literal; });
-
-          auto computed_result =
-              embedded_evaluator->Evaluate(*sort->to_apply(), literal_ptrs);
-          // Clear visit states so that we can use the evaluator again
-          // on the same computation.
-          embedded_evaluator->ResetVisitStates();
-          if (!computed_result.ok()) {
-            compare_status = computed_result.status();
-            return false;
-          }
-          return computed_result.value().Get<bool>({});
-        };
-        if (!indices_to_sort.empty()) {
-          // Smoke test of the comparator - it should not be reflexive.
-          const int64_t a = indices_to_sort[0];
-          TF_RET_CHECK(!comparator(a, a))
-              << "Invalid sort comparator - does not satisfy the strict weak "
-                 "ordering requirement";
-        }
-        if (Cast<HloSortInstruction>(sort)->is_stable()) {
-          std::stable_sort(indices_to_sort.begin(), indices_to_sort.end(),
-                           comparator);
-        } else {
-          std::sort(indices_to_sort.begin(), indices_to_sort.end(), comparator);
-        }
-        if (!compare_status.ok()) {
-          return compare_status;
-        }
+        TF_RETURN_IF_ERROR(mergesort(literals_to_sort,
+                                     absl::MakeSpan(indices_to_sort), nullptr,
+                                     nullptr));
         std::vector<int64_t> slice_dimensions(rank, 1);
         slice_dimensions[sort_dim] = sort_dim_elements;
         std::vector<int64_t> start_indices(rank, 0);

From f65dd98d9b5820a19dc170f2b10ff35c463e588e Mon Sep 17 00:00:00 2001
From: Yu Feng <feyu@google.com>
Date: Thu, 21 Sep 2023 14:49:19 -0700
Subject: [PATCH 107/567] Allow additional coordinator arguments to the TPU
 test.

PiperOrigin-RevId: 567425791
---
 tensorflow/python/tpu/tpu_test_wrapper.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/python/tpu/tpu_test_wrapper.py b/tensorflow/python/tpu/tpu_test_wrapper.py
index 3076f53de7a76a..534707655c7844 100644
--- a/tensorflow/python/tpu/tpu_test_wrapper.py
+++ b/tensorflow/python/tpu/tpu_test_wrapper.py
@@ -213,7 +213,7 @@ def run_user_main(wrapped_test_module):
     os.environ['TEST_TOTAL_SHARDS'] = saved_total_shards
   maybe_define_flags()
   # Parse remaining flags.
-  FLAGS(unparsed)
+  FLAGS(unparsed, known_only=True)
   set_random_test_dir()
 
   move_test_classes_into_scope(user_module)

From 8bebcb801b6be745c39f6cbfee180cce5e4e9523 Mon Sep 17 00:00:00 2001
From: Kuangyuan Chen <chky@google.com>
Date: Thu, 21 Sep 2023 14:49:28 -0700
Subject: [PATCH 108/567] Decouple traceme context id from step id in tfrt
 threadpool interface.

PiperOrigin-RevId: 567425831
---
 tensorflow/core/tfrt/graph_executor/BUILD        |  1 +
 .../core/tfrt/graph_executor/graph_executor.cc   |  9 +++++----
 .../core/tfrt/runtime/work_queue_interface.h     | 16 ++++++++--------
 3 files changed, 14 insertions(+), 12 deletions(-)

diff --git a/tensorflow/core/tfrt/graph_executor/BUILD b/tensorflow/core/tfrt/graph_executor/BUILD
index 7c43ffe7ebb429..7cc355fa1b2e8b 100644
--- a/tensorflow/core/tfrt/graph_executor/BUILD
+++ b/tensorflow/core/tfrt/graph_executor/BUILD
@@ -122,6 +122,7 @@ cc_library(
         "@llvm-project//mlir:IR",
         "@local_tsl//tsl/platform:refcount",
         "@local_tsl//tsl/platform:status",
+        "@local_tsl//tsl/profiler/lib:traceme",
         "@tf_runtime//:basic_kernels_alwayslink",
         "@tf_runtime//:bef",
         "@tf_runtime//:befexecutor",
diff --git a/tensorflow/core/tfrt/graph_executor/graph_executor.cc b/tensorflow/core/tfrt/graph_executor/graph_executor.cc
index 9045e8078c5169..2b8430f7af2af8 100644
--- a/tensorflow/core/tfrt/graph_executor/graph_executor.cc
+++ b/tensorflow/core/tfrt/graph_executor/graph_executor.cc
@@ -84,6 +84,7 @@ limitations under the License.
 #include "tsl/platform/errors.h"
 #include "tsl/platform/refcount.h"
 #include "tsl/platform/statusor.h"
+#include "tsl/profiler/lib/traceme.h"
 #include "tfrt/bef_converter/mlir_to_bef.h"  // from @tf_runtime
 #include "tfrt/core_runtime/core_runtime.h"  // from @tf_runtime
 #include "tfrt/host_context/async_dispatch.h"  // from @tf_runtime
@@ -292,8 +293,9 @@ tensorflow::Status GraphExecutionRunOnFunction(
                         process_function_library_runtime, cost_recorder));
 
   int64_t request_id = request_info->tfrt_request_context->id();
-  tensorflow::profiler::TraceMeProducer traceme(
-      // To TraceMeConsumers in RunHandlerThreadPool::WorkerLoop.
+  // The top level traceme root for this request. The thread pool used later
+  // will add TraceMeProducer and TraceMeConsumer to connect async tasks.
+  tsl::profiler::TraceMe traceme(
       [request_id, signature_name, &options, symbol_uids] {
         return tensorflow::profiler::TraceMeEncode(
             "TfrtModelRun",
@@ -304,8 +306,7 @@ tensorflow::Status GraphExecutionRunOnFunction(
                                        options.model_metadata.version())},
              {"tf_symbol_uid", symbol_uids.tf_symbol_uid},
              {"tfrt_symbol_uid", symbol_uids.tfrt_symbol_uid}});
-      },
-      tensorflow::profiler::ContextType::kTfrtExecutor, request_id);
+      });
 
   // Only configure timer when the deadline is set.
   if (run_options.deadline.has_value()) {
diff --git a/tensorflow/core/tfrt/runtime/work_queue_interface.h b/tensorflow/core/tfrt/runtime/work_queue_interface.h
index b5d3f27477a764..08c9f786496796 100644
--- a/tensorflow/core/tfrt/runtime/work_queue_interface.h
+++ b/tensorflow/core/tfrt/runtime/work_queue_interface.h
@@ -87,16 +87,16 @@ template <typename Callable>
 tfrt::TaskFunction WrapWork(int64_t id, absl::string_view name,
                             Callable&& work) {
   tensorflow::Context context(tensorflow::ContextKind::kThread);
-  return tfrt::TaskFunction([id, name = std::string(name),
+  tensorflow::profiler::TraceMeProducer producer(
+      [&]() { return absl::StrCat("producer_", name); },
+      tensorflow::profiler::ContextType::kTfrtExecutor);
+  return tfrt::TaskFunction([traceme_id = producer.GetContextId(),
+                             name = std::string(name),
                              context = std::move(context),
                              work = std::forward<Callable>(work)]() mutable {
-    // From TraceMeProducer in the function that launches graph execution, eg.
-    // SavedModelImpl::Run().
-    tensorflow::profiler::TraceMeConsumer activity(
-        [&]() {
-          return tensorflow::profiler::TraceMeEncode(name, {{"id", id}});
-        },
-        tensorflow::profiler::ContextType::kTfrtExecutor, id,
+    tensorflow::profiler::TraceMeConsumer consumer(
+        [&]() { return absl::StrCat("consumer_", name); },
+        tensorflow::profiler::ContextType::kTfrtExecutor, traceme_id,
         tensorflow::profiler::TraceMeLevel::kInfo);
     tensorflow::WithContext wc(context);
     std::forward<Callable>(work)();

From c9b1800ce365003aff34665ff6564a0e5554de35 Mon Sep 17 00:00:00 2001
From: Penporn Koanantakool <penporn@google.com>
Date: Thu, 21 Sep 2023 15:02:09 -0700
Subject: [PATCH 109/567] Temporarily disabling XLA CPU oneDNN Dot op rewriter
 because it causes performance regression in JAX.

PiperOrigin-RevId: 567429452
---
 third_party/xla/xla/service/cpu/cpu_compiler.cc | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/third_party/xla/xla/service/cpu/cpu_compiler.cc b/third_party/xla/xla/service/cpu/cpu_compiler.cc
index 5f8153fe13faea..c6c78592825e58 100644
--- a/third_party/xla/xla/service/cpu/cpu_compiler.cc
+++ b/third_party/xla/xla/service/cpu/cpu_compiler.cc
@@ -698,7 +698,8 @@ Status CpuCompiler::RunHloPassesThroughLayoutAssn(
 #if defined(INTEL_MKL) && defined(ENABLE_ONEDNN_V3)
   // AOT compiled code runs in single thread.
   if (!is_aot_compile) {
-    pipeline.AddPass<OneDnnRewriter>();
+    // Temporarily disabling oneDNN rewriter because it causes JAX regression.
+    // pipeline.AddPass<OneDnnRewriter>();
   }
 #endif  // INTEL_MKL && ENABLE_ONEDNN_V3
 

From 24171a1a4aa7823e1092176ecde449b65d9bc484 Mon Sep 17 00:00:00 2001
From: Eugene Zhulenev <ezhulenev@google.com>
Date: Thu, 21 Sep 2023 15:06:56 -0700
Subject: [PATCH 110/567] [stream_executor] NFC: Prepare header filegroups for
 defining StreamExecutor API

PiperOrigin-RevId: 567430767
---
 third_party/xla/xla/stream_executor/BUILD | 57 +++++++++++++++++++++++
 1 file changed, 57 insertions(+)

diff --git a/third_party/xla/xla/stream_executor/BUILD b/third_party/xla/xla/stream_executor/BUILD
index 15e3642d4afd89..3254a4ead38063 100644
--- a/third_party/xla/xla/stream_executor/BUILD
+++ b/third_party/xla/xla/stream_executor/BUILD
@@ -30,6 +30,63 @@ package_group(
     packages = stream_executor_internal(),
 )
 
+#===--------------------------------------------------------------------------------------------===#
+# StreamExecutor public API
+#===--------------------------------------------------------------------------------------------===#
+
+# We bundle headers into filegroups for internal use only (we re-export the same set of headers
+# from multiple targets), and all external clients should depend on one of the public `cc_library`
+# targets that have dependencies required for compiling headers (e.g. absl dependencies). These
+# filegroups roughly correspond to "StreamExecutor components" that are available to the clients.
+
+# These are the headers that constitute StreamExecutor public API. Clients should not depend on
+# this filegroup directly, but instead depend on a `stream_executor_headers` target if they need
+# a header-only dependency (this is a very rare exception when we are building dynamic libraries
+# for open source projects, e.g. Tensorflow; internally at Google we almost always link statically),
+# or usually on a `stream_executor` target that will also link implementation.
+
+filegroup(
+    name = "stream_executor_public_headers",
+    srcs = [
+        "allocator_stats.h",
+        "command_buffer.h",
+        "device_description.h",
+        "device_memory.h",
+        "device_memory_allocator.h",
+        "device_options.h",
+        "event.h",
+        "executor_cache.h",
+        "kernel.h",
+        "kernel_cache_config.h",
+        "kernel_spec.h",
+        "launch_dim.h",
+        "module_spec.h",
+        "numeric_options.h",
+        "platform.h",
+        "plugin.h",
+        "plugin_registry.h",
+        "stream.h",
+        "stream_executor.h",
+        "stream_executor_internal.h",
+        "stream_executor_pimpl.h",
+        "temporary_device_memory.h",
+        "temporary_memory_manager.h",
+        "trace_listener.h",
+    ],
+    visibility = ["//visibility:public"],
+)
+
+# These are the headers for default StreamExecutor plugins.
+filegroup(
+    name = "stream_executor_plugin_headers",
+    srcs = [
+        "blas.h",
+        "dnn.h",
+        "fft.h",
+    ],
+    visibility = ["//visibility:public"],
+)
+
 #===--------------------------------------------------------------------------------------------===#
 # StreamExecutor platform-dependent implementation details
 #===--------------------------------------------------------------------------------------------===#

From 8369f9479d79be57fd87c3b1f00ea1eed00dcd29 Mon Sep 17 00:00:00 2001
From: Fangrui Song <maskray@google.com>
Date: Thu, 21 Sep 2023 16:25:05 -0700
Subject: [PATCH 111/567] Integrate LLVM at llvm/llvm-project@ebefe83c092e

Updates LLVM usage to match
[ebefe83c092e](https://github.com/llvm/llvm-project/commit/ebefe83c092e)

PiperOrigin-RevId: 567450893
---
 .../compiler/mlir/tools/kernel_gen/BUILD      | 20 ++++++++++---------
 .../mlir/tools/kernel_gen/hlo_to_kernel.cc    |  6 +++++-
 .../kernel_gen/tf_framework_c_interface.cc    |  6 +++++-
 third_party/llvm/workspace.bzl                |  4 ++--
 4 files changed, 23 insertions(+), 13 deletions(-)

diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/BUILD b/tensorflow/compiler/mlir/tools/kernel_gen/BUILD
index 5ac0ad29c6eeae..395b79491460e8 100644
--- a/tensorflow/compiler/mlir/tools/kernel_gen/BUILD
+++ b/tensorflow/compiler/mlir/tools/kernel_gen/BUILD
@@ -1,24 +1,24 @@
-load(
-    "@local_xla//xla/stream_executor:build_defs.bzl",
-    "if_gpu_is_configured",
-)
 load(
     "//tensorflow:tensorflow.bzl",
     "check_deps",
     "tf_cc_binary",
 )
 load(
-    "@local_tsl//tsl/platform/default:cuda_build_defs.bzl",
-    "if_cuda_is_configured",
+    "@local_xla//xla/stream_executor:build_defs.bzl",
+    "if_gpu_is_configured",
+)
+load(
+    "//tensorflow/core/platform:build_config.bzl",
+    "if_llvm_system_z_available",
+    "tf_proto_library",
 )
 load(
     "@local_config_rocm//rocm:build_defs.bzl",
     "if_rocm_is_configured",
 )
 load(
-    "//tensorflow/core/platform:build_config.bzl",
-    "if_llvm_system_z_available",
-    "tf_proto_library",
+    "@local_tsl//tsl/platform/default:cuda_build_defs.bzl",
+    "if_cuda_is_configured",
 )
 
 package(
@@ -122,6 +122,7 @@ tf_cc_binary(
         "@llvm-project//llvm:X86Disassembler",  # fixdeps: keep
         "@llvm-project//mlir:ExecutionEngineUtils",
         "@llvm-project//mlir:LLVMToLLVMIRTranslation",
+        "@llvm-project//mlir:MemRefTransforms",
         "@llvm-project//mlir:Pass",
         "@llvm-project//mlir:ToLLVMIRTranslation",
     ] + if_llvm_system_z_available([
@@ -165,6 +166,7 @@ cc_library(
         "@llvm-project//llvm:Support",
         "@llvm-project//mlir:ExecutionEngine",
         "@llvm-project//mlir:ExecutionEngineUtils",
+        "@llvm-project//mlir:MemRefTransforms",
         "@llvm-project//mlir:Parser",
         "@llvm-project//mlir:mlir_runner_utils",
         "@local_xla//xla/stream_executor:stream_executor_headers",
diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/hlo_to_kernel.cc b/tensorflow/compiler/mlir/tools/kernel_gen/hlo_to_kernel.cc
index 31e77feaf41c4a..b5537741529d06 100644
--- a/tensorflow/compiler/mlir/tools/kernel_gen/hlo_to_kernel.cc
+++ b/tensorflow/compiler/mlir/tools/kernel_gen/hlo_to_kernel.cc
@@ -35,6 +35,7 @@
 #include "llvm/Support/TargetSelect.h"
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/TargetParser/Host.h"
+#include "mlir/Dialect/MemRef/Transforms/AllocationOpInterfaceImpl.h"  // from @llvm-project
 #include "mlir/ExecutionEngine/OptUtils.h"  // from @llvm-project
 #include "mlir/Pass/PassManager.h"  // from @llvm-project
 #include "mlir/Target/LLVMIR/Dialect/LLVMIR/LLVMToLLVMIRTranslation.h"  // from @llvm-project
@@ -127,7 +128,10 @@ Status Run(llvm::StringRef input_file, llvm::StringRef output_file,
       ReadFileToString(Env::Default(), input_file.str(), &hlo_code));
 
   // Compile.
-  mlir::MLIRContext context;
+  mlir::DialectRegistry registry;
+  mlir::memref::registerAllocationOpInterfaceExternalModels(registry);
+  mlir::MLIRContext context(registry);
+
   llvm::SourceMgr source_mgr;
   mlir::SourceMgrDiagnosticHandler source_mgr_handler(source_mgr, &context);
 
diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/tf_framework_c_interface.cc b/tensorflow/compiler/mlir/tools/kernel_gen/tf_framework_c_interface.cc
index 021c1566c78b98..34cbb4069ffeac 100644
--- a/tensorflow/compiler/mlir/tools/kernel_gen/tf_framework_c_interface.cc
+++ b/tensorflow/compiler/mlir/tools/kernel_gen/tf_framework_c_interface.cc
@@ -21,6 +21,7 @@ limitations under the License.
 
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/Support/TargetSelect.h"
+#include "mlir/Dialect/MemRef/Transforms/AllocationOpInterfaceImpl.h"  // from @llvm-project
 #include "mlir/ExecutionEngine/ExecutionEngine.h"  // from @llvm-project
 #include "mlir/ExecutionEngine/OptUtils.h"  // from @llvm-project
 #include "mlir/Parser/Parser.h"  // from @llvm-project
@@ -185,7 +186,10 @@ llvm::Expected<std::unique_ptr<ExecutionEngine>> Compile(
   }
 
   // Create the kernel.
-  mlir::MLIRContext context;
+  mlir::DialectRegistry registry;
+  mlir::memref::registerAllocationOpInterfaceExternalModels(registry);
+  mlir::MLIRContext context(registry);
+
   mlir::OwningOpRef<mlir::ModuleOp> module;
 
   if (item.result_module().empty()) {
diff --git a/third_party/llvm/workspace.bzl b/third_party/llvm/workspace.bzl
index 6b039a5a6e9efe..aaec90a65c08e6 100644
--- a/third_party/llvm/workspace.bzl
+++ b/third_party/llvm/workspace.bzl
@@ -4,8 +4,8 @@ load("//third_party:repo.bzl", "tf_http_archive")
 
 def repo(name):
     """Imports LLVM."""
-    LLVM_COMMIT = "afd7db48c55cb87566758e961f1ebac8af16b8bc"
-    LLVM_SHA256 = "64f1436eb824ee7f6125ae06c7c337c8edfa8763767f38d7fd218ca02b0311c3"
+    LLVM_COMMIT = "ebefe83c092e41d243829ab812bb650674e2f3d2"
+    LLVM_SHA256 = "0cac9b05231cd3f0f4efb29fad98ef9b6eb9f01c9c99d016a93764033a603426"
 
     tf_http_archive(
         name = name,

From 216a9cea0dff4ce4b12089eff24488babb5a3f4a Mon Sep 17 00:00:00 2001
From: Yang Chen <yangchen@google.com>
Date: Thu, 21 Sep 2023 16:40:27 -0700
Subject: [PATCH 112/567] #tf-data-service Use native proto casters to simplify
 pybind code.

This way we don't need to define the Python version of
`DataServiceMetadata`.

PiperOrigin-RevId: 567454478
---
 tensorflow/python/data/experimental/service/BUILD |  1 +
 .../experimental/service/_pywrap_server_lib.pyi   |  7 -------
 .../experimental/service/server_lib_wrapper.cc    | 15 ++++-----------
 3 files changed, 5 insertions(+), 18 deletions(-)

diff --git a/tensorflow/python/data/experimental/service/BUILD b/tensorflow/python/data/experimental/service/BUILD
index 40394834021318..312d9607056906 100644
--- a/tensorflow/python/data/experimental/service/BUILD
+++ b/tensorflow/python/data/experimental/service/BUILD
@@ -22,6 +22,7 @@ tf_python_pybind_extension(
         "//third_party/python_runtime:headers",
         "@com_google_absl//absl/strings",
         "@pybind11",
+        "@pybind11_protobuf//pybind11_protobuf:native_proto_caster",
     ],
 )
 
diff --git a/tensorflow/python/data/experimental/service/_pywrap_server_lib.pyi b/tensorflow/python/data/experimental/service/_pywrap_server_lib.pyi
index 63a57f2ed08b3c..d39c6ac8225da8 100644
--- a/tensorflow/python/data/experimental/service/_pywrap_server_lib.pyi
+++ b/tensorflow/python/data/experimental/service/_pywrap_server_lib.pyi
@@ -15,13 +15,6 @@
 
 from typing import Any
 
-class DataServiceMetadata:
-    def __init__(self) -> None: ...
-    @property
-    def compression(self) -> Any: ...
-    @property
-    def element_spec(self) -> bytes: ...
-
 class DispatchGrpcDataServer:
     def __init__(self, *args, **kwargs) -> None: ...
     def bound_port(self) -> int: ...
diff --git a/tensorflow/python/data/experimental/service/server_lib_wrapper.cc b/tensorflow/python/data/experimental/service/server_lib_wrapper.cc
index 13315eaa137de4..a3157997249423 100644
--- a/tensorflow/python/data/experimental/service/server_lib_wrapper.cc
+++ b/tensorflow/python/data/experimental/service/server_lib_wrapper.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-
+#include <cstdint>
 #include <memory>
 #include <string>
 #include <vector>
@@ -26,6 +26,7 @@ limitations under the License.
 #include "pybind11/pybind11.h"  // from @pybind11
 #include "pybind11/pytypes.h"  // from @pybind11
 #include "pybind11/stl.h"  // from @pybind11
+#include "pybind11_protobuf/native_proto_caster.h"  // from @pybind11_protobuf
 #include "tensorflow/core/data/service/common.pb.h"
 #include "tensorflow/core/data/service/dispatcher_client.h"
 #include "tensorflow/core/data/service/grpc_util.h"
@@ -40,6 +41,8 @@ limitations under the License.
 namespace py = pybind11;
 
 PYBIND11_MODULE(_pywrap_server_lib, m) {
+  pybind11_protobuf::ImportNativeProtoCasters();
+
   py::class_<tensorflow::data::DispatchGrpcDataServer>(m,
                                                        "DispatchGrpcDataServer")
       .def("start", &tensorflow::data::DispatchGrpcDataServer::Start)
@@ -147,16 +150,6 @@ PYBIND11_MODULE(_pywrap_server_lib, m) {
       },
       py::return_value_policy::reference);
 
-  py::class_<tensorflow::data::DataServiceMetadata> data_service_metadata(
-      m, "DataServiceMetadata");
-  data_service_metadata.def(py::init<>())
-      .def_property_readonly(
-          "element_spec",
-          [](const tensorflow::data::DataServiceMetadata& data_service_metadata)
-              -> py::bytes { return data_service_metadata.element_spec(); })
-      .def_property_readonly(
-          "compression", &tensorflow::data::DataServiceMetadata::compression)
-      .def("__repr__", &tensorflow::data::DataServiceMetadata::DebugString);
   py::class_<tensorflow::data::SnapshotTaskProgressWrapper>
       snapshot_task_progress_wrapper(m, "SnapshotTaskProgressWrapper");
   snapshot_task_progress_wrapper.def(py::init<>())

From 2247d487e8d276ca774a52d51930ed3857c433d4 Mon Sep 17 00:00:00 2001
From: Eugene Zhulenev <ezhulenev@google.com>
Date: Thu, 21 Sep 2023 16:50:49 -0700
Subject: [PATCH 113/567] [stream_executor] NFC: Remove stream_executor_headers
 dependencies outside of StreamExecutor

`stream_executor_headers` is an implementation detail of StreamExecutor, all external clients should depend on a regular `stream_executor` target (unless there is a reason to depend on header only target).

And it makes absolutely no sense to depend on both targets.

PiperOrigin-RevId: 567456924
---
 .../compiler/mlir/tools/kernel_gen/BUILD      |  4 +--
 tensorflow/compiler/tf2xla/BUILD              |  2 +-
 tensorflow/compiler/tf2xla/kernels/BUILD      |  2 +-
 tensorflow/compiler/xrt/kernels/BUILD         |  4 +--
 tensorflow/core/common_runtime/gpu/BUILD      | 10 +++---
 tensorflow/core/kernels/BUILD                 |  2 +-
 tensorflow/core/tpu/kernels/BUILD             |  4 +--
 tensorflow/core/util/autotune_maps/BUILD      | 12 +++----
 third_party/xla/xla/service/BUILD             |  2 +-
 third_party/xla/xla/service/gpu/BUILD         | 32 ++++++++-----------
 third_party/xla/xla/service/gpu/runtime/BUILD | 16 +++++-----
 third_party/xla/xla/service/gpu/tests/BUILD   |  3 +-
 third_party/xla/xla/stream_executor/BUILD     |  1 +
 13 files changed, 44 insertions(+), 50 deletions(-)

diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/BUILD b/tensorflow/compiler/mlir/tools/kernel_gen/BUILD
index 395b79491460e8..e6ce181074de7f 100644
--- a/tensorflow/compiler/mlir/tools/kernel_gen/BUILD
+++ b/tensorflow/compiler/mlir/tools/kernel_gen/BUILD
@@ -169,7 +169,7 @@ cc_library(
         "@llvm-project//mlir:MemRefTransforms",
         "@llvm-project//mlir:Parser",
         "@llvm-project//mlir:mlir_runner_utils",
-        "@local_xla//xla/stream_executor:stream_executor_headers",
+        "@local_xla//xla/stream_executor",
     ],
 )
 
@@ -204,7 +204,7 @@ cc_library(
         "@com_google_absl//absl/strings",
         "@llvm-project//mlir:mlir_runner_utils",
         "@local_tsl//tsl/platform:hash",
-        "@local_xla//xla/stream_executor:stream_executor_headers",
+        "@local_xla//xla/stream_executor",
     ] + if_cuda_is_configured([
         "@local_config_cuda//cuda:cuda_headers",
         "@local_xla//xla/stream_executor/cuda:stream_executor_cuda",
diff --git a/tensorflow/compiler/tf2xla/BUILD b/tensorflow/compiler/tf2xla/BUILD
index c0d81ef159c287..44ecb3ee0e2b27 100644
--- a/tensorflow/compiler/tf2xla/BUILD
+++ b/tensorflow/compiler/tf2xla/BUILD
@@ -694,7 +694,7 @@ cc_library(
         "@local_xla//xla/hlo/ir:hlo",
         "@local_xla//xla/service:computation_placer_hdr",
         "@local_xla//xla/service/gpu:gpu_executable_run_options",
-        "@local_xla//xla/stream_executor:stream_executor_headers",
+        "@local_xla//xla/stream_executor",
         "@local_xla//xla/translate/mhlo_to_hlo:layout_util",
     ],
     alwayslink = 1,
diff --git a/tensorflow/compiler/tf2xla/kernels/BUILD b/tensorflow/compiler/tf2xla/kernels/BUILD
index 4b2af30072bb2f..a3b082939c1dc8 100644
--- a/tensorflow/compiler/tf2xla/kernels/BUILD
+++ b/tensorflow/compiler/tf2xla/kernels/BUILD
@@ -216,7 +216,7 @@ tf_cuda_library(
         "@local_xla//xla/service:custom_call_status",
         "@local_xla//xla/service:custom_call_target_registry",
         "@local_xla//xla/service:hlo_proto_cc",
-        "@local_xla//xla/stream_executor:stream_executor_headers",
+        "@local_xla//xla/stream_executor",
         "@local_xla//xla/stream_executor/gpu:gpu_executor_header",
         "@local_xla//xla/stream_executor/gpu:gpu_stream_header",
         "@local_xla//xla/stream_executor/gpu:gpu_types_header",
diff --git a/tensorflow/compiler/xrt/kernels/BUILD b/tensorflow/compiler/xrt/kernels/BUILD
index fb9995240dd897..e4c4075a392c3a 100644
--- a/tensorflow/compiler/xrt/kernels/BUILD
+++ b/tensorflow/compiler/xrt/kernels/BUILD
@@ -97,7 +97,7 @@ cc_library(
         "@local_xla//xla/service:computation_placer",
         "@local_xla//xla/service:dump",
         "@local_xla//xla/service:hlo_proto_cc",
-        "@local_xla//xla/stream_executor:stream_executor_headers",
+        "@local_xla//xla/stream_executor",
         "@local_xla//xla/stream_executor/tpu:tpu_api",
     ],
     alwayslink = 1,
@@ -140,7 +140,7 @@ cc_library(
         "@local_xla//xla/service:compiler",
         "@local_xla//xla/service:computation_placer",
         "@local_xla//xla/service/gpu:gpu_executable_run_options",
-        "@local_xla//xla/stream_executor:stream_executor_headers",
+        "@local_xla//xla/stream_executor",
     ],
     alwayslink = 1,
 )
diff --git a/tensorflow/core/common_runtime/gpu/BUILD b/tensorflow/core/common_runtime/gpu/BUILD
index 7b91cc850489a9..4652e0051cd8fd 100644
--- a/tensorflow/core/common_runtime/gpu/BUILD
+++ b/tensorflow/core/common_runtime/gpu/BUILD
@@ -14,15 +14,15 @@ load(
     "filegroup",
     "tf_cuda_cc_test",
 )
-load(
-    "//tensorflow/core/platform:rules_cc.bzl",
-    "cc_library",
-)
 load(
     "//tensorflow/core/platform:build_config_root.bzl",
     "if_static",
     "tf_cuda_tests_tags",
 )
+load(
+    "//tensorflow/core/platform:rules_cc.bzl",
+    "cc_library",
+)
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
@@ -282,8 +282,8 @@ tf_cuda_library(
         "//tensorflow/core/platform:stream_executor",
         "//tensorflow/core/profiler/lib:traceme",
         "@com_google_absl//absl/strings:str_format",
+        "@local_xla//xla/stream_executor",
         "@local_xla//xla/stream_executor:platform",
-        "@local_xla//xla/stream_executor:stream_executor_headers",
     ],
 )
 
diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD
index 1612d7c1a707be..35957cc529c019 100644
--- a/tensorflow/core/kernels/BUILD
+++ b/tensorflow/core/kernels/BUILD
@@ -4078,7 +4078,7 @@ tf_kernel_library(
         "@local_xla//xla/stream_executor/cuda:cudnn_plugin",
         "@local_xla//xla/stream_executor/gpu:gpu_asm_opts",
         "@local_xla//xla/stream_executor:tf_allocator_adapter",
-        "@local_xla//xla/stream_executor:stream_executor_headers",
+        "@local_xla//xla/stream_executor",
         "//tensorflow/core/platform:stream_executor",
     ]) + if_cuda_or_rocm([
         ":gpu_utils",
diff --git a/tensorflow/core/tpu/kernels/BUILD b/tensorflow/core/tpu/kernels/BUILD
index b7ac39d708bb39..4ac173f9c5b655 100644
--- a/tensorflow/core/tpu/kernels/BUILD
+++ b/tensorflow/core/tpu/kernels/BUILD
@@ -201,7 +201,7 @@ tf_kernel_library(
         "@local_tsl//tsl/platform:tstring",
         "@local_tsl//tsl/protobuf:error_codes_proto_impl_cc",
         "@local_xla//xla:util",
-        "@local_xla//xla/stream_executor:stream_executor_headers",
+        "@local_xla//xla/stream_executor",
         "@local_xla//xla/stream_executor/tpu:proto_helper",
         "@local_xla//xla/stream_executor/tpu:status_helper",
         "@local_xla//xla/stream_executor/tpu:tpu_api",
@@ -238,7 +238,7 @@ tf_kernel_library(
         "//tensorflow/core/tpu:tpu_defs",
         "@com_google_absl//absl/cleanup",
         "@local_xla//xla:util",
-        "@local_xla//xla/stream_executor:stream_executor_headers",
+        "@local_xla//xla/stream_executor",
         "@local_xla//xla/stream_executor/tpu:proto_helper",
         "@local_xla//xla/stream_executor/tpu:status_helper",
         "@local_xla//xla/stream_executor/tpu:tpu_api",
diff --git a/tensorflow/core/util/autotune_maps/BUILD b/tensorflow/core/util/autotune_maps/BUILD
index cf0b8eda2abb06..4be1a30bf6d410 100644
--- a/tensorflow/core/util/autotune_maps/BUILD
+++ b/tensorflow/core/util/autotune_maps/BUILD
@@ -3,6 +3,11 @@
 # and later restore them.
 
 # Placeholder: load py_proto_library
+load(
+    "//tensorflow:tensorflow.bzl",
+    "tf_cuda_library",
+    "tf_cuda_only_cc_test",
+)
 
 # TODO(ruochengw): Currently only supports contrib's fused_conv2d_bias_activation_op.
 # We plan to add more ops and move fused_conv2d_bias_activation_op back into core library.
@@ -14,11 +19,6 @@ load(
     "//tensorflow/core/platform:rules_cc.bzl",
     "cc_library",
 )
-load(
-    "//tensorflow:tensorflow.bzl",
-    "tf_cuda_library",
-    "tf_cuda_only_cc_test",
-)
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
@@ -136,9 +136,9 @@ tf_cuda_library(
         "//tensorflow/core/platform:status",
         "//tensorflow/core/platform:stream_executor",
         "@local_xla//xla:status_macros",
+        "@local_xla//xla/stream_executor",
         "@local_xla//xla/stream_executor:dnn_proto_cc",
         "@local_xla//xla/stream_executor:lazy_op_runner",
-        "@local_xla//xla/stream_executor:stream_executor_headers",
         "@local_xla//xla/stream_executor/gpu:gpu_init",
     ],
 )
diff --git a/third_party/xla/xla/service/BUILD b/third_party/xla/xla/service/BUILD
index 65c42edbbfcaad..2aee4c3a9ec378 100644
--- a/third_party/xla/xla/service/BUILD
+++ b/third_party/xla/xla/service/BUILD
@@ -3846,7 +3846,7 @@ cc_library(
         "//xla:status",
         "//xla:statusor",
         "//xla:xla_data_proto_cc",
-        "//xla/stream_executor:stream_executor_headers",
+        "//xla/stream_executor",
         "@com_google_absl//absl/container:flat_hash_map",
         "@local_tsl//tsl/platform:status",
     ],
diff --git a/third_party/xla/xla/service/gpu/BUILD b/third_party/xla/xla/service/gpu/BUILD
index c6f2f7d1ba3a08..2427757038d855 100644
--- a/third_party/xla/xla/service/gpu/BUILD
+++ b/third_party/xla/xla/service/gpu/BUILD
@@ -255,8 +255,8 @@ cc_library(
     visibility = ["//visibility:public"],
     deps = [
         ":gpu_types",
+        "//xla/stream_executor",
         "//xla/stream_executor:device_description_proto_cc",
-        "//xla/stream_executor:stream_executor_headers",
     ],
 )
 
@@ -308,7 +308,7 @@ cc_library(
         ":gpu_executable",
         "//xla/service:buffer_assignment",
         "//xla/service:name_uniquer",
-        "//xla/stream_executor:stream_executor_headers",
+        "//xla/stream_executor",
         "@llvm-project//llvm:ir_headers",
         "@llvm-project//mlir:IR",
     ],
@@ -1177,7 +1177,6 @@ cc_library(
         "@local_tsl//tsl/platform:logging",
         "//xla/stream_executor",
         "//xla/stream_executor:device_memory",
-        "//xla/stream_executor:stream_executor_headers",
         "//xla/stream_executor/gpu:gpu_asm_opts",
     ]) + ["@local_tsl//tsl/platform:status"],
 )
@@ -1205,7 +1204,6 @@ cc_library(
         "@local_tsl//tsl/platform:logging",
         "//xla/stream_executor",
         "//xla/stream_executor:device_memory",
-        "//xla/stream_executor:stream_executor_headers",
         "//xla/stream_executor/gpu:gpu_asm_opts",
     ]) + ["@local_tsl//tsl/platform:status"],
 )
@@ -1225,7 +1223,6 @@ cc_library(
         "//xla/service:buffer_assignment",
         "//xla/stream_executor",
         "//xla/stream_executor:scratch_allocator",
-        "//xla/stream_executor:stream_executor_headers",
         "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/strings:str_format",
@@ -1376,8 +1373,8 @@ cc_library(
         ":thunk",
         "//xla:status",
         "//xla/service:buffer_assignment",
+        "//xla/stream_executor",
         "//xla/stream_executor:device_memory",
-        "//xla/stream_executor:stream_executor_headers",
         "@local_tsl//tsl/platform:logging",
     ],
 )
@@ -1400,7 +1397,7 @@ cc_library(
         "//xla/service:buffer_assignment",
         "//xla:status",
         "//xla/stream_executor:device_memory",
-        "//xla/stream_executor:stream_executor_headers",
+        "//xla/stream_executor",
         "@local_tsl//tsl/platform:logging",
         "@local_tsl//tsl/platform:statusor",
     ]) + if_cuda_is_configured([
@@ -1556,7 +1553,7 @@ cc_library(
         "//xla/hlo/ir:hlo",
         "//xla/mlir_hlo",
         "//xla/mlir_hlo:lhlo_gpu",
-        "//xla/stream_executor:stream_executor_headers",
+        "//xla/stream_executor",
         "@com_google_absl//absl/algorithm:container",
         "@com_google_absl//absl/types:span",
         "@local_tsl//tsl/platform:statusor",
@@ -2181,7 +2178,7 @@ cc_library(
         "//xla:shape_util",
         "//xla:window_util",
         "//xla/hlo/ir:hlo",
-        "//xla/stream_executor:stream_executor_headers",
+        "//xla/stream_executor",
         "@local_tsl//tsl/platform:status",
     ],
 )
@@ -2198,8 +2195,8 @@ xla_cc_test(
         "//xla/service:hlo_parser",
         "//xla/service:pattern_matcher",
         "//xla/service:pattern_matcher_gmock",
+        "//xla/stream_executor",
         "//xla/stream_executor:device_description",
-        "//xla/stream_executor:stream_executor_headers",
         "//xla/tests:hlo_test_base",
         "//xla/tests:verified_hlo_module",
         "//xla/tests:xla_internal_test_main",
@@ -2225,7 +2222,7 @@ cc_library(
         "//xla:window_util",
         "//xla/hlo/ir:hlo",
         "//xla/service:hlo_pass",
-        "//xla/stream_executor:stream_executor_headers",
+        "//xla/stream_executor",
         "@com_google_absl//absl/functional:bind_front",
         "@local_tsl//tsl/platform:status",
     ],
@@ -2262,7 +2259,7 @@ cc_library(
         "//xla/client:xla_builder",
         "//xla/hlo/ir:hlo",
         "//xla/service:hlo_pass",
-        "//xla/stream_executor:stream_executor_headers",
+        "//xla/stream_executor",
     ],
 )
 
@@ -2405,7 +2402,6 @@ cc_library(
         "//xla/service:generic_transfer_manager",
         "//xla/service:transfer_manager",
         "//xla/stream_executor",
-        "//xla/stream_executor:stream_executor_headers",
         "//xla/stream_executor/cuda:cuda_platform_id",
         "//xla/stream_executor/host:host_platform_id",
         "//xla/stream_executor/rocm:rocm_platform_id",
@@ -2729,7 +2725,6 @@ cc_library(
         "//xla/service/spmd:collective_permute_motion",
         "//xla/service/spmd:stateful_rng_spmd_partitioner",
         "//xla/stream_executor:device_description_proto_cc_impl",
-        "//xla/stream_executor:stream_executor_headers",
         "//xla/stream_executor",
         "//xla/stream_executor/cuda:cuda_platform_id",
         "//xla/translate/hlo_to_mhlo:hlo_utils",
@@ -2869,7 +2864,7 @@ cc_library(
         "//xla/service:tuple_simplifier",
         "//xla/service/gpu/llvm_gpu_backend",
         "//xla/service/llvm_ir:llvm_util",
-        "//xla/stream_executor:stream_executor_headers",
+        "//xla/stream_executor",
         "//xla/stream_executor/cuda:cuda_diagnostics",
         "//xla/stream_executor/cuda:cuda_platform_id",
         "//xla/stream_executor/gpu:asm_compiler",
@@ -3394,7 +3389,7 @@ xla_cc_test(
         "//xla/hlo/ir:hlo",
         "//xla/service:hlo_runner",
         "//xla/service:platform_util",
-        "//xla/stream_executor:stream_executor_headers",
+        "//xla/stream_executor",
         "@com_google_absl//absl/log",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/strings:str_format",
@@ -3435,7 +3430,6 @@ cc_library(
         "//xla:util",
         "//xla/service:hlo_module_config",
         "//xla/stream_executor",
-        "//xla/stream_executor:stream_executor_headers",
         "//xla/stream_executor/gpu:asm_compiler",
         "@local_tsl//tsl/platform:statusor",
     ]),
@@ -3589,8 +3583,8 @@ cc_library(
         "//xla/service:hlo_creation_utils",
         "//xla/service:hlo_pass",
         "//xla/service:pattern_matcher",
+        "//xla/stream_executor",
         "//xla/stream_executor:dnn_proto_cc",
-        "//xla/stream_executor:stream_executor_headers",
         "@com_google_absl//absl/log",
         "@com_google_absl//absl/log:check",
         "@com_google_absl//absl/strings",
@@ -4200,7 +4194,7 @@ cc_library(
         "//xla:types",
         "//xla:util",
         "//xla/stream_executor:device_memory",
-        "//xla/stream_executor:stream_executor_headers",
+        "//xla/stream_executor",
         "//xla/stream_executor/gpu:asm_compiler",
         "//xla/stream_executor/gpu:gpu_asm_opts",
     ]) + if_rocm_is_configured([
diff --git a/third_party/xla/xla/service/gpu/runtime/BUILD b/third_party/xla/xla/service/gpu/runtime/BUILD
index 8eef320f5fa5b0..168a6e04b071d4 100644
--- a/third_party/xla/xla/service/gpu/runtime/BUILD
+++ b/third_party/xla/xla/service/gpu/runtime/BUILD
@@ -1,14 +1,14 @@
-load("@local_tsl//tsl/platform/default:cuda_build_defs.bzl", "if_cuda_is_configured")
-load("//xla:xla.bzl", "xla_cc_test")
 load("@local_config_cuda//cuda:build_defs.bzl", "cuda_library")
-load(
-    "@local_tsl//tsl/platform:build_config_root.bzl",
-    "tf_cuda_tests_tags",
-)
+load("//xla:xla.bzl", "xla_cc_test")
 load(
     "@local_config_rocm//rocm:build_defs.bzl",
     "if_rocm_is_configured",
 )
+load(
+    "@local_tsl//tsl/platform:build_config_root.bzl",
+    "tf_cuda_tests_tags",
+)
+load("@local_tsl//tsl/platform/default:cuda_build_defs.bzl", "if_cuda_is_configured")
 
 package(
     default_visibility = ["//visibility:public"],
@@ -202,7 +202,7 @@ cc_library(
         "//xla/service/gpu:buffer_allocations",
         "//xla/service/gpu:non_atomically_upgradeable_rw_lock",
         "//xla/service/gpu:thunk",
-        "//xla/stream_executor:stream_executor_headers",
+        "//xla/stream_executor",
         "//xla/stream_executor/gpu:gpu_stream",
         "@com_google_absl//absl/container:inlined_vector",
         "@local_tsl//tsl/protobuf:dnn_proto_cc",
@@ -241,8 +241,8 @@ cc_library(
         ":topk_kernel_cuda",
         "//xla:shape_util",
         "//xla:xla_data_proto_cc",
+        "//xla/stream_executor",  # build_cleaner: keep
         "//xla/stream_executor:platform",
-        "//xla/stream_executor:stream_executor_headers",  # build_cleaner: keep
         "//xla/stream_executor/gpu:gpu_stream_header",
         "//xla/stream_executor/gpu:gpu_types_header",
         "@com_google_absl//absl/numeric:bits",
diff --git a/third_party/xla/xla/service/gpu/tests/BUILD b/third_party/xla/xla/service/gpu/tests/BUILD
index 8fe908e22901c3..854766e620a00f 100644
--- a/third_party/xla/xla/service/gpu/tests/BUILD
+++ b/third_party/xla/xla/service/gpu/tests/BUILD
@@ -58,7 +58,7 @@ cc_library(
         "//xla:types",
         "//xla/service:gpu_plugin",
         "//xla/service/gpu:gpu_executable",
-        "//xla/stream_executor:stream_executor_headers",
+        "//xla/stream_executor",
         "//xla/tests:filecheck",
         "//xla/tests:llvm_irgen_test_base",
         "//xla/tests:verified_hlo_module",
@@ -848,7 +848,6 @@ xla_cc_test(
         "//xla/stream_executor/gpu:asm_compiler",
         "//xla/service/gpu:gpu_asm_opts_util",
         "//xla/stream_executor",
-        "//xla/stream_executor:stream_executor_headers",
         "//xla/service/gpu:stream_executor_util",
         "//xla/stream_executor:device_memory",
     ]),
diff --git a/third_party/xla/xla/stream_executor/BUILD b/third_party/xla/xla/stream_executor/BUILD
index 3254a4ead38063..afbc0451827152 100644
--- a/third_party/xla/xla/stream_executor/BUILD
+++ b/third_party/xla/xla/stream_executor/BUILD
@@ -781,6 +781,7 @@ cc_library(
         "plugin.h",
         "plugin_registry.h",
         "stream.h",
+        "numeric_options.h",
         "stream_executor.h",
         "stream_executor_internal.h",
         "stream_executor_pimpl.h",

From 34ad3d985a520364be3a3aa51d894dfe482964a0 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 21 Sep 2023 17:02:21 -0700
Subject: [PATCH 114/567] Update TFRT dependency to use revision
 http://github.com/tensorflow/runtime/commit/73bfccd957234f8cc02c69dc50078288b6d4db8c.

PiperOrigin-RevId: 567459578
---
 third_party/tf_runtime/workspace.bzl                          | 4 ++--
 third_party/xla/third_party/tf_runtime/workspace.bzl          | 4 ++--
 .../xla/third_party/tsl/third_party/tf_runtime/workspace.bzl  | 4 ++--
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/third_party/tf_runtime/workspace.bzl b/third_party/tf_runtime/workspace.bzl
index a788df8889a6a7..5b50da65a5d4de 100644
--- a/third_party/tf_runtime/workspace.bzl
+++ b/third_party/tf_runtime/workspace.bzl
@@ -6,8 +6,8 @@ def repo():
     """Imports TFRT."""
 
     # Attention: tools parse and update these lines.
-    TFRT_COMMIT = "6a3e3ece9c01f3e5742a297c73357d463a2fe151"
-    TFRT_SHA256 = "24477cd3e9ac93a1010542de308341f1c112df974f9510c18e4d4efb2de78e59"
+    TFRT_COMMIT = "73bfccd957234f8cc02c69dc50078288b6d4db8c"
+    TFRT_SHA256 = "2ae79e0aa046864afc9b43a23ee077c52e2f91a192e62f6ca8e82a68a206f116"
 
     tf_http_archive(
         name = "tf_runtime",
diff --git a/third_party/xla/third_party/tf_runtime/workspace.bzl b/third_party/xla/third_party/tf_runtime/workspace.bzl
index a788df8889a6a7..5b50da65a5d4de 100644
--- a/third_party/xla/third_party/tf_runtime/workspace.bzl
+++ b/third_party/xla/third_party/tf_runtime/workspace.bzl
@@ -6,8 +6,8 @@ def repo():
     """Imports TFRT."""
 
     # Attention: tools parse and update these lines.
-    TFRT_COMMIT = "6a3e3ece9c01f3e5742a297c73357d463a2fe151"
-    TFRT_SHA256 = "24477cd3e9ac93a1010542de308341f1c112df974f9510c18e4d4efb2de78e59"
+    TFRT_COMMIT = "73bfccd957234f8cc02c69dc50078288b6d4db8c"
+    TFRT_SHA256 = "2ae79e0aa046864afc9b43a23ee077c52e2f91a192e62f6ca8e82a68a206f116"
 
     tf_http_archive(
         name = "tf_runtime",
diff --git a/third_party/xla/third_party/tsl/third_party/tf_runtime/workspace.bzl b/third_party/xla/third_party/tsl/third_party/tf_runtime/workspace.bzl
index a788df8889a6a7..5b50da65a5d4de 100644
--- a/third_party/xla/third_party/tsl/third_party/tf_runtime/workspace.bzl
+++ b/third_party/xla/third_party/tsl/third_party/tf_runtime/workspace.bzl
@@ -6,8 +6,8 @@ def repo():
     """Imports TFRT."""
 
     # Attention: tools parse and update these lines.
-    TFRT_COMMIT = "6a3e3ece9c01f3e5742a297c73357d463a2fe151"
-    TFRT_SHA256 = "24477cd3e9ac93a1010542de308341f1c112df974f9510c18e4d4efb2de78e59"
+    TFRT_COMMIT = "73bfccd957234f8cc02c69dc50078288b6d4db8c"
+    TFRT_SHA256 = "2ae79e0aa046864afc9b43a23ee077c52e2f91a192e62f6ca8e82a68a206f116"
 
     tf_http_archive(
         name = "tf_runtime",

From 83b712c02c2fccff2667e706794c722edb0bce3b Mon Sep 17 00:00:00 2001
From: Praveen Narayanan <pravnar@google.com>
Date: Thu, 21 Sep 2023 17:16:12 -0700
Subject: [PATCH 115/567] Fixes to avoid deadlocks with collectives in the
 pipelining while loop

PiperOrigin-RevId: 567462558
---
 .../tests/embedding_pipelining.mlir           |  89 -------------
 .../transforms/embedding_pipelining.cc        | 119 ++----------------
 2 files changed, 11 insertions(+), 197 deletions(-)

diff --git a/tensorflow/compiler/mlir/tensorflow/tests/embedding_pipelining.mlir b/tensorflow/compiler/mlir/tensorflow/tests/embedding_pipelining.mlir
index ead04ab9abd579..fdb64b900e7a8d 100644
--- a/tensorflow/compiler/mlir/tensorflow/tests/embedding_pipelining.mlir
+++ b/tensorflow/compiler/mlir/tensorflow/tests/embedding_pipelining.mlir
@@ -555,95 +555,6 @@ module {
   }
 }
 
-// -----
-// This test verifies the handling of CollectiveGatherV2 ops.
-module {
-  func.func @main() {
-    %cst_main = "tf.Const"() {value = dense<1> : tensor<i32>} : () -> tensor<i32>
-    %0 = "tf.While"(%cst_main) {body = @while_body, cond = @while_cond, is_stateless = false} : (tensor<i32>) -> (tensor<i32>)
-    return
-  }
-  func.func private @while_body(%arg0: tensor<i32>) -> (tensor<i32>) {
-    // Verify the overall pipelining control flow and supporting functions.
-    // The order of these functions is also significant.
-    // CHECK: {{.*StatefulPartitionedCall.* f = @while_cond.*}}
-    // CHECK: {{.*StatefulPartitionedCall.* f = @non_tpu.*}}
-    // CHECK: {{.*StatefulPartitionedCall.* f = @start_step_0.*}}
-    // CHECK: {{.*StatefulPartitionedCall.* f = @while_cond.*}}
-    // CHECK: {{.*StatefulPartitionedCall.* f = @non_tpu.*}}
-    // CHECK: {{.*StatefulPartitionedCall.* f = @start_step_1.*}}
-    // CHECK: {{.*StatefulPartitionedCall.* f = @while_cond.*}}
-    // CHECK: {{.*tf.While.* body = @new_while_body.* cond = @new_while_cond.*}}
-    // CHECK: {{.*StatefulPartitionedCall.* f = @finish_step_nm2.*}}
-    // CHECK: {{.*StatefulPartitionedCall.* f = @finish_step_nm1.*}}
-    // CHECK: return
-    // metadata ops
-    "tf.TPUReplicateMetadata"() {_has_manual_control_dependencies = true, _replication_info = "repl_info", num_replicas = 2 : i64} : () -> ()
-    %comp_res = "tf.TPUCompilationResult"() {_tpu_compilation_status = "repl_info"} : () -> tensor<!tf_type.string>
-
-    // forward_ops
-    %v_0 = "tf.StatefulPartitionedCall"(){_collective_manager_ids = [], _read_only_resource_inputs = [], config = "", config_proto = "blah", device = "/job:tpu_host_worker/replica:0/task:0/device:CPU:0", executor_type = "", f = @helper_task0}:() -> (tensor<?xi64>)
-    %v_1 = "tf.StatefulPartitionedCall"(){_collective_manager_ids = [], _read_only_resource_inputs = [], config = "", config_proto = "blah", device = "/job:tpu_host_worker/replica:0/task:2/device:CPU:0", executor_type = "", f = @helper_task1}:() -> (tensor<?xi64>)
-    %a_0 = "tf.Identity"(%v_0) {_embedding_pipelining = "forward", _replication_info = "repl_info"}: (tensor<?xi64>) -> tensor<?xi64>
-    %a_1 = "tf.Identity"(%v_1) {_embedding_pipelining = "forward", _replication_info = "repl_info"}: (tensor<?xi64>) -> tensor<?xi64>
-    %res_f = "tf.Const"() {_embedding_pipelining = "forward", _replication_info = "repl_info", value = dense<2> : tensor<i32>} : () -> tensor<i32>
-
-    // core_tpu ops:
-    %res_t = "tf.Identity"(%res_f) {_replication_info = "repl_info"} : (tensor<i32>) -> tensor<i32>
-
-    // backward_ops
-    %res_b = "tf.Identity"(%res_t) {_embedding_pipelining = "backward", _replication_info = "repl_info"} : (tensor<i32>) -> tensor<i32>
-
-    // non_tpu_ops
-    %res_n = "tf.Identity"(%arg0) : (tensor<i32>) -> tensor<i32>
-
-    return %res_n : tensor<i32>
-  }
-  func.func private @helper_task0() -> tensor<?xi64> {
-    %grpsz_0 = "tf.Const"() {device = "/job:tpu_host_worker/replica:0/task:0/device:CPU:0", value = dense<2> : tensor<i32>} : () -> tensor<i32>
-    %grpky_0 = "tf.Const"() {device = "/job:tpu_host_worker/replica:0/task:0/device:CPU:0", value = dense<0> : tensor<i32>} : () -> tensor<i32>
-    %cgi_0 = "tf.Const"() {device = "/job:tpu_host_worker/replica:0/task:0/device:CPU:0", value = dense<2> : tensor<i32>} : () -> tensor<i32>
-    %gid64_0 = "tf.GlobalIterId"() {device = "/job:tpu_host_worker/replica:0/task:0/device:CPU:0"} : () -> tensor<*xi64>
-    %gid_0 = "tf.Cast"(%gid64_0) {Truncate = false, device = "/job:tpu_host_worker/replica:0/task:0/device:CPU:0"} : (tensor<*xi64>) -> tensor<*xi32>
-    %cg_0 = "tf.CollectiveGatherV2"(%cgi_0, %grpsz_0, %grpky_0, %gid_0) {communication_hint = "auto", device = "/job:tpu_host_worker/replica:0/task:0/device:CPU:0", is_stateless = true, timeout_seconds = 0.000000e+00 : f32} : (tensor<i32>, tensor<i32>, tensor<i32>, tensor<*xi32>) -> tensor<?xi64>
-    return %cg_0 : tensor<?xi64>
-  }
-  func.func private @helper_task1() -> tensor<?xi64> {
-    %grpsz_1 = "tf.Const"() {device = "/job:tpu_host_worker/replica:0/task:1/device:CPU:0", value = dense<2> : tensor<i32>} : () -> tensor<i32>
-    %grpky_1 = "tf.Const"() {device = "/job:tpu_host_worker/replica:0/task:1/device:CPU:0", value = dense<0> : tensor<i32>} : () -> tensor<i32>
-    %cgi_1 = "tf.Const"() {device = "/job:tpu_host_worker/replica:0/task:1/device:CPU:0", value = dense<2> : tensor<i32>} : () -> tensor<i32>
-    %gid64_1 = "tf.GlobalIterId"() {device = "/job:tpu_host_worker/replica:0/task:1/device:CPU:0"} : () -> tensor<*xi64>
-    %gid_1 = "tf.Cast"(%gid64_1) {Truncate = false, device = "/job:tpu_host_worker/replica:0/task:1/device:CPU:0"} : (tensor<*xi64>) -> tensor<*xi32>
-    %cg_1 = "tf.CollectiveGatherV2"(%cgi_1, %grpsz_1, %grpky_1, %gid_1) {communication_hint = "auto", device = "/job:tpu_host_worker/replica:0/task:1/device:CPU:0", is_stateless = true, timeout_seconds = 0.000000e+00 : f32} : (tensor<i32>, tensor<i32>, tensor<i32>, tensor<*xi32>) -> tensor<?xi64>
-    return %cg_1 : tensor<?xi64>
-  }
-  func.func private @while_cond(%arg0: tensor<i32>) -> tensor<i1> {
-    %0 = "tf.Less"(%arg0, %arg0) : (tensor<i32>, tensor<i32>) -> tensor<i1>
-    return %0 : tensor<i1>
-  }
-  // Generated functions for control flow ops (if, while, switch)
-
-  //
-  // CHECK: func.func private @start_step_0
-  // CHECK: tf.GlobalIterId
-  // CHECK: tf.AddV2
-  // CHECK: tf.CollectiveGatherV2
-  // CHECK: return
-
-  //
-  // CHECK: func.func private @start_step_1
-  // CHECK: tf.GlobalIterId
-  // CHECK: tf.AddV2
-  // CHECK: tf.CollectiveGatherV2
-  // CHECK: return
-
-  //
-  // CHECK: func.func private @new_while_body
-  // CHECK: {{.*StatefulPartitionedCall.* f = @helper_task0.*}}
-  // CHECK: {{.*StatefulPartitionedCall.* f = @helper_task1.*}}
-  // CHECK: return
-}
-
 // -----
 // This test verifies that for ops with multiple TPU -> backward edges, we
 // create input/output ops for all of them.
diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/embedding_pipelining.cc b/tensorflow/compiler/mlir/tensorflow/transforms/embedding_pipelining.cc
index 3463e17beed1c0..3257cc2e6bbe43 100644
--- a/tensorflow/compiler/mlir/tensorflow/transforms/embedding_pipelining.cc
+++ b/tensorflow/compiler/mlir/tensorflow/transforms/embedding_pipelining.cc
@@ -375,106 +375,16 @@ struct Inliner : public InlinerInterface {
     return LogicalResult::success();
   }
 
-  bool HasCollectiveGathers(func::FuncOp func) {
-    return !func.getRegion().getOps<TF::CollectiveGatherV2Op>().empty();
-  }
-
-  LogicalResult PatchCollectiveGatherInstanceKey(func::FuncOp func) {
-    // We're expecting the original model to have a single CollectiveGatherV2Op
-    // with the instance_key set to the global_iter_id (see GlobalIterIdOp).
-    // That collective gets split into 3 copies in the start_step_0,
-    // start_step_1 and new_while_body functions. At this point, however, we're
-    // expecting one gather per device and we want them all to have the same
-    // instance key. To make sure the instance keys are unique among the three
-    // functions them we replace the original instance key (global_iter_id) as:
-    //   new_instance_key = global_iter_id + c
-    // where c = 0, 1, or 2 depending on which function it's being replaced in.
-    // Note, also, that the following code assumes this inlining pass is call
-    // for start_step_0 and start_step_1 before new_while_body.
-    //
-    // Verify our assumption that we have just 3 collectives (per device).
-    const int32_t max_collectives = 3;
-    static int32_t offset_value = -2;
-    bool has_gathers = false;
-    for (auto gather_op : func.getRegion().getOps<TF::CollectiveGatherV2Op>()) {
-      if (offset_value >= max_collectives) {
-        gather_op->emitError()
-            << "Expected to find only " << max_collectives
-            << " CollectiveGatherV2 ops but found " << offset_value;
-        return LogicalResult::failure();
-      }
-      has_gathers = true;
-      Value orig_instance_key = gather_op->getOperand(3);
-      auto loc = gather_op->getLoc();
-      builder.setInsertionPoint(gather_op);
-      auto offset = builder.create<TF::ConstOp>(
-          loc, builder.getI32IntegerAttr(offset_value));
-      auto new_instance_key = builder.create<TF::AddV2Op>(
-          loc, orig_instance_key, offset->getResult(0));
-      gather_op->setOperand(3, new_instance_key->getResult(0));
-      std::vector<std::string> attr_names = {
-          TF::kReplicationInfoAttr.str(), "_xla_compile_device_type",
-          kEmbeddingPipelining, "_xla_outside_compilation", "device"};
-      for (const auto& attr_name : attr_names) {
-        if (!gather_op->hasAttr(attr_name)) continue;
-        offset->setAttr(attr_name, gather_op->getAttr(attr_name));
-        new_instance_key->setAttr(attr_name, gather_op->getAttr(attr_name));
-      }
-    }
-    // Make the next function to get inlined use a different offset.
-    if (has_gathers) ++offset_value;
-    return LogicalResult::success();
-  }
-
-  LogicalResult PatchCollectiveGatherOps(func::FuncOp func) {
-    // We currently expect the gathers to be in nested functions. Check the
-    // functions called from this function to see if they have gather ops. If
-    // so, then inline that function so we can locally modify the instance keys
-    // for the gathers.
-    llvm::SetVector<Operation*> ops_to_erase;
-    for (auto caller :
-         func.getRegion().getOps<TF::StatefulPartitionedCallOp>()) {
-      auto callee_op = symbol_table.lookup(caller.getF());
-      if (callee_op == nullptr) {
-        func.emitError() << "Symbol not found in SymbolTable: "
-                         << caller.getF();
-        return LogicalResult::failure();
-      }
-      func::FuncOp callee = llvm::dyn_cast<func::FuncOp>(callee_op);
-      // If the function called here doesn't have gathers then ignore it.
-      if (!HasCollectiveGathers(callee)) continue;
-
-      // Do the inlining.
-      VLOG(1) << "Nested inlining " << caller.getF().str();
-      auto& src_region = callee.getRegion();
-      auto result = inlineCall(*this, caller, callee, &src_region, true);
-      if (failed(result)) {
-        func.emitError("CollectiveGather Inlining failed");
-        return result;
-      }
-      ops_to_erase.insert(caller);
-    }
-    // If we didn't find nested gathers, we're done.
-    if (ops_to_erase.empty()) return LogicalResult::success();
-
-    for (auto op : ops_to_erase) op->erase();
-
-    // Ok, now we need to update the instance keys. We're expecting one gather
-    // per device and we should give them all the same instance key.
-    return PatchCollectiveGatherInstanceKey(func);
-    return LogicalResult::success();
-  }
-
   // Find any StatefulPartitionedCalls and inline their contents in this func.
   LogicalResult InlineCallsInFunc(func::FuncOp func,
-                                  bool patch_gathers = false) {
+                                  bool inline_all_funcs = false) {
     llvm::SetVector<Operation*> ops_to_erase;
     for (auto caller :
          func.getRegion().getOps<TF::StatefulPartitionedCallOp>()) {
-      if (!caller->hasAttr(kEmbeddingPipeliningInlineAttr)) {
+      if (!inline_all_funcs &&
+          !caller->hasAttr(kEmbeddingPipeliningInlineAttr)) {
         continue;
       }
-      VLOG(1) << "Inlining " << caller.getF().str();
       Operation* symbol = symbol_table.lookup(caller.getF());
       if (symbol == nullptr) {
         func.emitError() << "Symbol not found in SymbolTable: "
@@ -497,11 +407,6 @@ struct Inliner : public InlinerInterface {
     }
     for (auto op : ops_to_erase) op->erase();
 
-    if (patch_gathers) {
-      auto result = PatchCollectiveGatherOps(func);
-      if (failed(result)) return result;
-    }
-
     auto result = UnifyReplicationInfo(func);
     if (failed(result)) return result;
 
@@ -633,7 +538,7 @@ void GatherOpsForExtraction(mlir::SetVector<Operation*>* operations,
   // Walk the input and output dependencies of the Ops in `operations` to form
   // the closer of Ops needed to evaluate 'operations'. Input dependencies are
   // walked if 'predecessors' is true and output dependencies are walked if
-  // 'successors' is true. In either case, if a discovered Op is in the
+  // 'successors' is true. In either case, if a discoverd Op is in the
   // 'ops_to_avoid' set, then the dependency walking is terminated.
   llvm::SetVector<Operation*> ops_to_process(*operations);
   llvm::SetVector<Operation*> new_ops;
@@ -1258,7 +1163,7 @@ LogicalResult ExtractOpsAsFunc(
       if (!ops.contains(defining_op)) inputs.insert(operand);
     }
   }
-  // Find the output edges to form the set of results of the new function call.
+  // Find the output edges to form the set of resutls of the new function call.
   llvm::SetVector<OpResult> results;
   for (Operation* op : ops) {
     for (auto result : op->getResults()) {
@@ -1316,7 +1221,7 @@ int FindReturnIndex(Value val) {
 }
 
 // Skip the assertions because they currently create problematic dependencies.
-constexpr bool kDoAssertions = false;
+constexpr bool kDoAssertions = true;
 
 void AddAssertion(OpBuilder& builder, Location& loc, Value cond,
                   const std::string& message) {
@@ -1404,8 +1309,7 @@ LogicalResult StartStep0(OpBuilder& builder, Location& loc,
   func_builder.create<func::ReturnOp>(loc, results);
 
   // Inline any StatefulPartitionCall Ops.
-  auto result = Inliner(builder, symbol_table)
-                    .InlineCallsInFunc(then_func, /*patch_gathers=*/true);
+  auto result = Inliner(builder, symbol_table).InlineCallsInFunc(then_func);
   if (failed(result)) return result;
 
   builder.restoreInsertionPoint(insertion_point);
@@ -1463,8 +1367,7 @@ LogicalResult StartStep1(OpBuilder& builder, Location& loc,
   func_builder.create<func::ReturnOp>(loc, new_forward->getResults());
 
   // Inline any StatefulPartitionCall Ops.
-  auto result = Inliner(builder, symbol_table)
-                    .InlineCallsInFunc(then_func, /*patch_gathers=*/true);
+  auto result = Inliner(builder, symbol_table).InlineCallsInFunc(then_func);
   if (failed(result)) return result;
 
   builder.restoreInsertionPoint(insertion_point);
@@ -2159,8 +2062,8 @@ void EmbeddingPipeliningPass::runOnOperation() {
   //
   // Finish step i-1
   //
-  // Second, add all the inputs to core_tpu(). These all come from the while
-  // loop operands, sc_forward() or non_tpu() and need to be pulled from the
+  // Second, add all the inputs to core_tpu(). Thesse all come from the while
+  // loop opernads, sc_forward() or non_tpu() and need to be pulled from the
   // "i-1" (or "1") version of the inputs.
   std::vector<Value> t_operands;
   result = MakeCoreTPUOperands(core_tpu_caller, non_tpu_caller, forward_caller,
@@ -2299,7 +2202,7 @@ void EmbeddingPipeliningPass::runOnOperation() {
                                *orig_while_op->getParentRegion());
 
   // Inline the new while body.
-  result = Inliner(builder, symbol_table).InlineCallsInFunc(body);
+  result = Inliner(builder, symbol_table).InlineCallsInFunc(body, false);
   if (failed(result)) return signalPassFailure();
 
   // Erase original while op and temporary functions. Note, we use the non_tpu

From e70006f20cb98ac17876fadf3ab9aa6d17149aeb Mon Sep 17 00:00:00 2001
From: Jake Harmon <jakeharmon@google.com>
Date: Thu, 21 Sep 2023 19:14:36 -0700
Subject: [PATCH 116/567] Restore TSL/XLA headers in tensorflow/include (again)

The last attempt at this had two problems:
1) Some files were duplicated, which caused permissions errors when copied over (fixed by #61937). I still need both copies to get the right files, so added -n (no-clobber) to prevent overwriting.
2) cp -r should be cp -R to not cause problems on Mac

PiperOrigin-RevId: 567484051
---
 tensorflow/tools/pip_package/build_pip_package.sh | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/tensorflow/tools/pip_package/build_pip_package.sh b/tensorflow/tools/pip_package/build_pip_package.sh
index a77aa82a21c5aa..07f755ac6137cc 100755
--- a/tensorflow/tools/pip_package/build_pip_package.sh
+++ b/tensorflow/tools/pip_package/build_pip_package.sh
@@ -256,8 +256,11 @@ function prepare_src() {
 
   # Move vendored files into proper locations
   # This is required because TSL/XLA don't publish their own wheels
-  cp -rL bazel-bin/external/local_tsl/tsl/ ${TMPDIR}/tensorflow
-  cp -rL bazel-bin/external/local_xla/xla/ ${TMPDIR}/tensorflow/compiler
+  # We copy from bazel-bin/tensorflow instead of bazel-bin/external to copy
+  # headers from TSL/XLA into tensorflow so that InstallHeaders can move
+  # them back into tensorflow/include
+  cp -RLn bazel-bin/tensorflow/tools/pip_package/build_pip_package.runfiles/local_tsl/tsl/ ${TMPDIR}/tensorflow
+  cp -RLn bazel-bin/tensorflow/tools/pip_package/build_pip_package.runfiles/local_xla/xla/ ${TMPDIR}/tensorflow/compiler
   # Fix the proto stubs
   if is_macos; then
     find ${TMPDIR}/tensorflow/ -name "*.py" -type f -exec sed -i '' 's/from tsl\./from tensorflow.tsl./' {} \;

From 7e47da962783934779c7611f4566407e26929d4c Mon Sep 17 00:00:00 2001
From: Eugene Zhulenev <ezhulenev@google.com>
Date: Thu, 21 Sep 2023 21:08:13 -0700
Subject: [PATCH 117/567] [stream_executor] NFC: Clean up kernel_spec headers
 and warnings

Remove unused OpenCL kernel loaders.

https://github.com/openxla/xla/issues/5761

PiperOrigin-RevId: 567504087
---
 third_party/xla/xla/service/gpu/BUILD         |   1 -
 third_party/xla/xla/stream_executor/BUILD     |  48 ++++---
 .../stream_executor/cuda/cuda_gpu_executor.cc |  14 +-
 .../xla/xla/stream_executor/kernel_spec.cc    |  90 +++++--------
 .../xla/xla/stream_executor/kernel_spec.h     | 120 ++++--------------
 5 files changed, 99 insertions(+), 174 deletions(-)

diff --git a/third_party/xla/xla/service/gpu/BUILD b/third_party/xla/xla/service/gpu/BUILD
index 2427757038d855..9f2ce2858edc4d 100644
--- a/third_party/xla/xla/service/gpu/BUILD
+++ b/third_party/xla/xla/service/gpu/BUILD
@@ -3197,7 +3197,6 @@ cc_library(
         "//xla/hlo/ir:hlo",
         "//xla/service:hlo_module_config",
         "//xla/stream_executor",
-        "//xla/stream_executor:kernel_spec",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/types:span",
         "@local_tsl//tsl/platform:errors",
diff --git a/third_party/xla/xla/stream_executor/BUILD b/third_party/xla/xla/stream_executor/BUILD
index afbc0451827152..bae9ae09f248b3 100644
--- a/third_party/xla/xla/stream_executor/BUILD
+++ b/third_party/xla/xla/stream_executor/BUILD
@@ -88,12 +88,12 @@ filegroup(
 )
 
 #===--------------------------------------------------------------------------------------------===#
-# StreamExecutor platform-dependent implementation details
+# StreamExecutor platform-dependent interfaces
 #===--------------------------------------------------------------------------------------------===#
 
 # Only platform-dependent StreamExecutor implementations (e.g. StreamExecutor for GPUs) and targets
 # defined by StreamExecutor itself (e.g. `event`, `kernel`, etc.) can depend on internal
-# implementation details (interfaces that define platform-specific API).
+# interfaces (interfaces that define platform-specific API).
 #
 # External clients of StreamExecutor should depend on `stream_executor` target (links StreamExecutor
 # implementation in static build configuration), or a header only `stream_executor_headers`.
@@ -128,6 +128,30 @@ filegroup(
     visibility = ["//visibility:public"],
 )
 
+#===--------------------------------------------------------------------------------------------===#
+# StreamExecutor implementation
+#===--------------------------------------------------------------------------------------------===#
+
+# Targets that implement StreamExecutor APIs are private, and should not be used outside of
+# `stream_executor` package. Clients should depend on `stream_executor` (headers and
+# implementation) or `stream_executor_headers` (only headers, if there is a reason not to link
+# implementation) if they want to use StreamExecutor.
+
+cc_library(
+    name = "kernel_spec",
+    srcs = ["kernel_spec.cc"],
+    hdrs = ["kernel_spec.h"],
+    visibility = ["//visibility:public"],
+    deps = [
+        "//xla/stream_executor/platform",
+        "@com_google_absl//absl/log",
+        "@com_google_absl//absl/log:check",
+        "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/synchronization",
+        "@local_tsl//tsl/platform:logging",
+    ],
+)
+
 #===--------------------------------------------------------------------------------------------===#
 
 # The stream_executor_headers target does not prescribe an implementation.
@@ -173,6 +197,7 @@ cc_library(
         "@com_google_absl//absl/algorithm:container",
         "@com_google_absl//absl/base:core_headers",
         "@com_google_absl//absl/functional:any_invocable",
+        "@com_google_absl//absl/log:check",
         "@com_google_absl//absl/memory",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/synchronization",
@@ -241,19 +266,6 @@ cc_library(
     ],
 )
 
-cc_library(
-    name = "kernel_spec",
-    srcs = ["kernel_spec.cc"],
-    hdrs = ["kernel_spec.h"],
-    visibility = ["//visibility:public"],
-    deps = [
-        "//xla/stream_executor/platform",
-        "@com_google_absl//absl/strings",
-        "@com_google_absl//absl/synchronization",
-        "@local_tsl//tsl/platform:logging",
-    ],
-)
-
 cc_library(
     name = "kernel_cache_config",
     hdrs = ["kernel_cache_config.h"],
@@ -355,6 +367,7 @@ cc_library(
         ":plugin",
         ":stream_executor_headers",
         "//xla/stream_executor/platform",
+        "@com_google_absl//absl/log:check",
         "@com_google_absl//absl/strings",
         "@local_tsl//tsl/platform:errors",
         "@local_tsl//tsl/platform:logging",
@@ -436,6 +449,7 @@ cc_library(
         "@com_google_absl//absl/algorithm:container",
         "@com_google_absl//absl/base:core_headers",
         "@com_google_absl//absl/functional:any_invocable",
+        "@com_google_absl//absl/log:check",
         "@com_google_absl//absl/memory",
         "@com_google_absl//absl/status",
         "@com_google_absl//absl/strings",
@@ -598,6 +612,7 @@ cc_library(
         "@com_google_absl//absl/algorithm:container",
         "@com_google_absl//absl/base:core_headers",
         "@com_google_absl//absl/functional:any_invocable",
+        "@com_google_absl//absl/log:check",
         "@com_google_absl//absl/memory",
         "@com_google_absl//absl/status",
         "@com_google_absl//absl/strings",
@@ -792,6 +807,7 @@ cc_library(
     visibility = ["//visibility:public"],
     deps = [
         ":stream_executor_headers",
+        "@com_google_absl//absl/log:check",
         "@local_tsl//tsl/platform:errors",
         "@local_tsl//tsl/platform:logging",
         "@local_tsl//tsl/platform:status",
@@ -847,6 +863,7 @@ cc_library(
         ":stream_executor_headers",
         "//xla/stream_executor/platform",
         "@com_google_absl//absl/container:inlined_vector",
+        "@com_google_absl//absl/log:check",
         "@local_tsl//tsl/platform:statusor",
     ],
 )
@@ -884,6 +901,7 @@ cc_library(
         ":device_memory_allocator",
         ":platform",
         ":stream_executor_headers",
+        "@com_google_absl//absl/log:check",
         "@com_google_absl//absl/synchronization",
         "@local_tsl//tsl/framework:allocator",
         "@local_tsl//tsl/platform:errors",
diff --git a/third_party/xla/xla/stream_executor/cuda/cuda_gpu_executor.cc b/third_party/xla/xla/stream_executor/cuda/cuda_gpu_executor.cc
index 992b9a53990a68..0d882033ee56e5 100644
--- a/third_party/xla/xla/stream_executor/cuda/cuda_gpu_executor.cc
+++ b/third_party/xla/xla/stream_executor/cuda/cuda_gpu_executor.cc
@@ -183,18 +183,18 @@ tsl::Status GpuExecutor::GetKernel(const MultiKernelLoaderSpec& spec,
                                    KernelBase* kernel) {
   GpuKernel* cuda_kernel = AsGpuKernel(kernel);
   CUmodule module;
-  const std::string* kernelname;
+  const std::string* kernel_name;
 
   VLOG(3) << "GetKernel on kernel " << kernel << " : " << kernel->name();
 
   if (spec.has_cuda_cubin_in_memory()) {
     absl::MutexLock lock{&in_memory_modules_mu_};
-    kernelname = &spec.cuda_cubin_in_memory().kernelname();
+    kernel_name = &spec.cuda_cubin_in_memory().kernel_name();
     const char* cubin = spec.cuda_cubin_in_memory().bytes();
     TF_RETURN_IF_ERROR(LoadModuleFromCuBin(cubin, &module));
     kernel_to_gpu_binary_[kernel] = cubin;
   } else if (spec.has_cuda_ptx_in_memory()) {
-    kernelname = &spec.cuda_ptx_in_memory().kernelname();
+    kernel_name = &spec.cuda_ptx_in_memory().kernel_name();
 
     if (cc_major_ == 0 && cc_minor_ == 0) {
       return tsl::errors::Internal("Compute capability not set");
@@ -205,7 +205,7 @@ tsl::Status GpuExecutor::GetKernel(const MultiKernelLoaderSpec& spec,
       ptx = spec.cuda_ptx_in_memory().default_text();
     }
     if (ptx == nullptr) {
-      LOG(FATAL) << "Loader spec has no ptx for kernel " << *kernelname;
+      LOG(FATAL) << "Loader spec has no ptx for kernel " << *kernel_name;
     }
 
     absl::MutexLock lock{&in_memory_modules_mu_};
@@ -214,9 +214,9 @@ tsl::Status GpuExecutor::GetKernel(const MultiKernelLoaderSpec& spec,
   } else {
     return tsl::errors::Internal("No method of loading CUDA kernel provided");
   }
-  VLOG(2) << "getting function " << *kernelname << " from module " << module;
+  VLOG(2) << "getting function " << *kernel_name << " from module " << module;
   TF_RETURN_IF_ERROR(GpuDriver::GetModuleFunction(
-      context_, module, kernelname->c_str(), cuda_kernel->gpu_function_ptr()));
+      context_, module, kernel_name->c_str(), cuda_kernel->gpu_function_ptr()));
 
   // We have to trust the kernel loader spec arity because there doesn't appear
   // to be a way to reflect on the number of expected arguments w/the CUDA API.
@@ -225,7 +225,7 @@ tsl::Status GpuExecutor::GetKernel(const MultiKernelLoaderSpec& spec,
   KernelMetadata kernel_metadata;
   TF_RETURN_IF_ERROR(GetKernelMetadata(cuda_kernel, &kernel_metadata));
   kernel->set_metadata(kernel_metadata);
-  kernel->set_name(*kernelname);
+  kernel->set_name(*kernel_name);
   return ::tsl::OkStatus();
 }
 
diff --git a/third_party/xla/xla/stream_executor/kernel_spec.cc b/third_party/xla/xla/stream_executor/kernel_spec.cc
index a8efbb398fc485..f0fabf44d5399e 100644
--- a/third_party/xla/xla/stream_executor/kernel_spec.cc
+++ b/third_party/xla/xla/stream_executor/kernel_spec.cc
@@ -15,28 +15,37 @@ limitations under the License.
 
 #include "xla/stream_executor/kernel_spec.h"
 
+#include <cstddef>
+#include <cstdint>
+#include <initializer_list>
+#include <string>
+#include <tuple>
+
+#include "absl/log/check.h"
+#include "absl/log/log.h"
 #include "absl/strings/string_view.h"
+#include "absl/synchronization/mutex.h"
 
 namespace stream_executor {
 
-KernelLoaderSpec::KernelLoaderSpec(absl::string_view kernelname)
-    : kernelname_(std::string(kernelname)) {}
+KernelLoaderSpec::KernelLoaderSpec(absl::string_view kernel_name)
+    : kernel_name_(std::string(kernel_name)) {}
 
 OnDiskKernelLoaderSpec::OnDiskKernelLoaderSpec(absl::string_view filename,
-                                               absl::string_view kernelname)
-    : KernelLoaderSpec(kernelname), filename_(std::string(filename)) {}
+                                               absl::string_view kernel_name)
+    : KernelLoaderSpec(kernel_name), filename_(std::string(filename)) {}
 
 CudaPtxOnDisk::CudaPtxOnDisk(absl::string_view filename,
-                             absl::string_view kernelname)
-    : OnDiskKernelLoaderSpec(filename, kernelname) {}
+                             absl::string_view kernel_name)
+    : OnDiskKernelLoaderSpec(filename, kernel_name) {}
 
 CudaCubinOnDisk::CudaCubinOnDisk(absl::string_view filename,
-                                 absl::string_view kernelname)
-    : OnDiskKernelLoaderSpec(filename, kernelname) {}
+                                 absl::string_view kernel_name)
+    : OnDiskKernelLoaderSpec(filename, kernel_name) {}
 
 CudaCubinInMemory::CudaCubinInMemory(const char *bytes,
-                                     absl::string_view kernelname)
-    : KernelLoaderSpec(kernelname), bytes_(bytes) {}
+                                     absl::string_view kernel_name)
+    : KernelLoaderSpec(kernel_name), bytes_(bytes) {}
 
 bool CompareComputeCapability(const std::tuple<int, int> &lhs,
                               const std::tuple<int, int> &rhs) {
@@ -157,91 +166,58 @@ const char *CudaPtxInMemory::original_text(int compute_capability_major,
   return ptx_iter->second;
 }
 
-OpenCLTextOnDisk::OpenCLTextOnDisk(absl::string_view filename,
-                                   absl::string_view kernelname)
-    : OnDiskKernelLoaderSpec(filename, kernelname) {}
-
-OpenCLTextInMemory::OpenCLTextInMemory(absl::string_view text,
-                                       absl::string_view kernelname)
-    : KernelLoaderSpec(kernelname), text_(text) {}
-
-OpenCLBinaryOnDisk::OpenCLBinaryOnDisk(absl::string_view filename,
-                                       absl::string_view kernelname)
-    : OnDiskKernelLoaderSpec(filename, kernelname) {}
-
-MultiKernelLoaderSpec *MultiKernelLoaderSpec::AddOpenCLTextOnDisk(
-    absl::string_view filename, absl::string_view kernelname) {
-  CHECK(ocl_text_on_disk_ == nullptr);
-  ocl_text_on_disk_.reset(new OpenCLTextOnDisk{filename, kernelname});
-  return this;
-}
-
-MultiKernelLoaderSpec *MultiKernelLoaderSpec::AddOpenCLBinaryOnDisk(
-    absl::string_view filename, absl::string_view kernelname) {
-  CHECK(ocl_binary_on_disk_ == nullptr);
-  ocl_binary_on_disk_.reset(new OpenCLBinaryOnDisk{filename, kernelname});
-  return this;
-}
-
-MultiKernelLoaderSpec *MultiKernelLoaderSpec::AddOpenCLTextInMemory(
-    absl::string_view filename, absl::string_view kernelname) {
-  CHECK(ocl_text_in_memory_ == nullptr);
-  ocl_text_in_memory_.reset(new OpenCLTextInMemory{filename, kernelname});
-  return this;
-}
-
 MultiKernelLoaderSpec *MultiKernelLoaderSpec::AddCudaPtxOnDisk(
-    absl::string_view filename, absl::string_view kernelname) {
+    absl::string_view filename, absl::string_view kernel_name) {
   CHECK(cuda_ptx_on_disk_ == nullptr);
-  cuda_ptx_on_disk_.reset(new CudaPtxOnDisk{filename, kernelname});
+  cuda_ptx_on_disk_.reset(new CudaPtxOnDisk{filename, kernel_name});
   return this;
 }
 
 MultiKernelLoaderSpec *MultiKernelLoaderSpec::AddCudaCubinInMemory(
-    const char *bytes, absl::string_view kernelname) {
+    const char *bytes, absl::string_view kernel_name) {
   CHECK(cuda_cubin_in_memory_ == nullptr);
-  cuda_cubin_in_memory_.reset(new CudaCubinInMemory{bytes, kernelname});
+  cuda_cubin_in_memory_.reset(new CudaCubinInMemory{bytes, kernel_name});
   return this;
 }
 
 MultiKernelLoaderSpec *MultiKernelLoaderSpec::AddCudaCubinOnDisk(
-    absl::string_view filename, absl::string_view kernelname) {
+    absl::string_view filename, absl::string_view kernel_name) {
   CHECK(cuda_cubin_on_disk_ == nullptr);
-  cuda_cubin_on_disk_.reset(new CudaCubinOnDisk{filename, kernelname});
+  cuda_cubin_on_disk_.reset(new CudaCubinOnDisk{filename, kernel_name});
   return this;
 }
 
 MultiKernelLoaderSpec *MultiKernelLoaderSpec::AddCudaPtxInMemory(
-    absl::string_view ptx, absl::string_view kernelname) {
+    absl::string_view ptx, absl::string_view kernel_name) {
   CHECK(cuda_ptx_in_memory_ == nullptr);
   cuda_ptx_in_memory_.reset(
-      new CudaPtxInMemory{ptx, kernelname, false /* ptx_compressed */});
+      new CudaPtxInMemory{ptx, kernel_name, false /* ptx_compressed */});
   return this;
 }
 
 MultiKernelLoaderSpec *MultiKernelLoaderSpec::AddCudaCompressedPtxInMemory(
-    absl::string_view ptx, absl::string_view kernelname) {
+    absl::string_view ptx, absl::string_view kernel_name) {
   CHECK(cuda_ptx_in_memory_ == nullptr);
   cuda_ptx_in_memory_.reset(
-      new CudaPtxInMemory{ptx, kernelname, true /* ptx_compressed */});
+      new CudaPtxInMemory{ptx, kernel_name, true /* ptx_compressed */});
   return this;
 }
 
 MultiKernelLoaderSpec *MultiKernelLoaderSpec::AddCudaPtxInMemory(
     std::initializer_list<CudaPtxInMemory::PtxSpec> spec_list,
-    absl::string_view kernelname) {
+    absl::string_view kernel_name) {
   CHECK(cuda_ptx_in_memory_ == nullptr);
   cuda_ptx_in_memory_.reset(
-      new CudaPtxInMemory{spec_list, kernelname, false /* ptx_compressed */});
+      new CudaPtxInMemory{spec_list, kernel_name, false /* ptx_compressed */});
   return this;
 }
 
 MultiKernelLoaderSpec *MultiKernelLoaderSpec::AddCudaCompressedPtxInMemory(
     std::initializer_list<CudaPtxInMemory::PtxSpec> spec_list,
-    absl::string_view kernelname) {
+    absl::string_view kernel_name) {
   CHECK(cuda_ptx_in_memory_ == nullptr);
   cuda_ptx_in_memory_.reset(
-      new CudaPtxInMemory{spec_list, kernelname, true /* ptx_compressed */});
+      new CudaPtxInMemory{spec_list, kernel_name, true /* ptx_compressed */});
   return this;
 }
 
diff --git a/third_party/xla/xla/stream_executor/kernel_spec.h b/third_party/xla/xla/stream_executor/kernel_spec.h
index 70ee8b607faa00..4d28b5b6c72dca 100644
--- a/third_party/xla/xla/stream_executor/kernel_spec.h
+++ b/third_party/xla/xla/stream_executor/kernel_spec.h
@@ -26,19 +26,16 @@ limitations under the License.
 //  static const MultiKernelLoaderSpec &SaxpySpec() {
 //    static auto *mkls =
 //        (new MultiKernelLoaderSpec{4 /* = arity */})
-//            ->AddCudaPtxOnDisk(ptx_file_path, ptx_kernelname)
-//            ->AddOpenCLTextOnDisk(opencl_text_file_path, ocl_kernelname);
+//            ->AddCudaPtxOnDisk(ptx_file_path, ptx_kernel_name);
 //    };
 //
 //    return *mkls;
 //  }
 //
 // This lazily instantiates an object that describes how to load CUDA PTX
-// present on disk that implements saxpy for the for the CUDA platform, or
-// OpenCL text present on disk that implements saxpy for an OpenCL-based
-// platform. The CudaPtxOnDisk and OpenCLTextOnDisk objects are subtypes of
-// KernelLoaderSpec -- KernelLoaderSpec describes how to load a kernel for
-// subsequent launching on a single platform.
+// present on disk that implements saxpy for the CUDA platform. The
+// CudaPtxOnDisk object is a subtype of KernelLoaderSpec -- KernelLoaderSpec
+// describes how to load a kernel for subsequent launching on a single platform.
 //
 // For the loader functionality that accepts these KernelLoaderSpecs in order
 // to grab the kernel appropriately, see StreamExecutor::GetKernel().
@@ -48,9 +45,13 @@ limitations under the License.
 
 #include <stddef.h>
 
+#include <initializer_list>
 #include <map>
 #include <memory>
+#include <string>
+#include <tuple>
 
+#include "absl/log/check.h"
 #include "absl/strings/string_view.h"
 #include "absl/synchronization/mutex.h"
 #include "xla/stream_executor/platform/port.h"
@@ -73,15 +74,15 @@ class KernelLoaderSpec {
   virtual ~KernelLoaderSpec() {}
 
   // Returns the kernel name to load out of the program.
-  const std::string &kernelname() const { return kernelname_; }
+  const std::string &kernel_name() const { return kernel_name_; }
 
  protected:
-  explicit KernelLoaderSpec(absl::string_view kernelname);
+  explicit KernelLoaderSpec(absl::string_view kernel_name);
 
  private:
   // The kernel name that should be loaded out of the program description given
   // above.
-  std::string kernelname_;
+  std::string kernel_name_;
 
   SE_DISALLOW_COPY_AND_ASSIGN(KernelLoaderSpec);
 };
@@ -102,7 +103,7 @@ class OnDiskKernelLoaderSpec : public KernelLoaderSpec {
 
  protected:
   OnDiskKernelLoaderSpec(absl::string_view filename,
-                         absl::string_view kernelname);
+                         absl::string_view kernel_name);
 
   std::string filename_;
 
@@ -113,7 +114,7 @@ class OnDiskKernelLoaderSpec : public KernelLoaderSpec {
 // Kernel loader specification for PTX text that resides on disk.
 class CudaPtxOnDisk : public OnDiskKernelLoaderSpec {
  public:
-  CudaPtxOnDisk(absl::string_view filename, absl::string_view kernelname);
+  CudaPtxOnDisk(absl::string_view filename, absl::string_view kernel_name);
   ~CudaPtxOnDisk() override {}
 
   const char *CanonicalSuffix() const override { return ".ptx"; }
@@ -125,7 +126,7 @@ class CudaPtxOnDisk : public OnDiskKernelLoaderSpec {
 // Kernel loader specification for CUBIN binary that resides on disk.
 class CudaCubinOnDisk : public OnDiskKernelLoaderSpec {
  public:
-  CudaCubinOnDisk(absl::string_view filename, absl::string_view kernelname);
+  CudaCubinOnDisk(absl::string_view filename, absl::string_view kernel_name);
   ~CudaCubinOnDisk() override {}
 
   const std::string &filename() const { return filename_; }
@@ -153,7 +154,7 @@ class CudaPtxInMemory : public KernelLoaderSpec {
   //
   // Warning: the string backing the provided absl::string_view ptx must outlive
   // this instance.
-  CudaPtxInMemory(absl::string_view ptx, absl::string_view kernelname,
+  CudaPtxInMemory(absl::string_view ptx, absl::string_view kernel_name,
                   bool ptx_compressed = false);
 
   // Multiple-PTX-version constructor. Adds each item in spec_list to this
@@ -215,50 +216,10 @@ class CudaPtxInMemory : public KernelLoaderSpec {
   SE_DISALLOW_COPY_AND_ASSIGN(CudaPtxInMemory);
 };
 
-// Kernel loader specification for OpenCL text that resides on disk.
-class OpenCLTextOnDisk : public OnDiskKernelLoaderSpec {
- public:
-  OpenCLTextOnDisk(absl::string_view filename, absl::string_view kernelname);
-  ~OpenCLTextOnDisk() override {}
-
-  const char *CanonicalSuffix() const override { return ".ocl"; }
-
- private:
-  SE_DISALLOW_COPY_AND_ASSIGN(OpenCLTextOnDisk);
-};
-
-// Kernel loader specification for OpenCL binary that resides on disk.
-class OpenCLBinaryOnDisk : public OnDiskKernelLoaderSpec {
- public:
-  OpenCLBinaryOnDisk(absl::string_view filename, absl::string_view kernelname);
-  ~OpenCLBinaryOnDisk() override {}
-
-  const char *CanonicalSuffix() const override { return ".aocx"; }
-
- private:
-  SE_DISALLOW_COPY_AND_ASSIGN(OpenCLBinaryOnDisk);
-};
-
-// Kernel loader specification for OpenCL text that resides in memory.
-class OpenCLTextInMemory : public KernelLoaderSpec {
- public:
-  OpenCLTextInMemory(absl::string_view text, absl::string_view kernelname);
-  ~OpenCLTextInMemory() override {}
-
-  // Returns the OpenCL text contents.
-  const std::string &text() const { return text_; }
-
- private:
-  // OpenCL translation unit text contents in memory.
-  std::string text_;
-
-  SE_DISALLOW_COPY_AND_ASSIGN(OpenCLTextInMemory);
-};
-
 // Kernel loader specification for a CUBIN blob that resides in memory.
 class CudaCubinInMemory : public KernelLoaderSpec {
  public:
-  CudaCubinInMemory(const char *bytes, absl::string_view kernelname);
+  CudaCubinInMemory(const char *bytes, absl::string_view kernel_name);
   ~CudaCubinInMemory() override {}
 
   const char *bytes() const { return bytes_; }
@@ -285,9 +246,6 @@ class MultiKernelLoaderSpec {
     return cuda_cubin_in_memory_ != nullptr;
   }
   bool has_cuda_ptx_in_memory() const { return cuda_ptx_in_memory_ != nullptr; }
-  bool has_ocl_text_on_disk() const { return ocl_text_on_disk_ != nullptr; }
-  bool has_ocl_binary_on_disk() const { return ocl_binary_on_disk_ != nullptr; }
-  bool has_ocl_text_in_memory() const { return ocl_text_in_memory_ != nullptr; }
 
   // Accessors for platform variant kernel load specifications.
   // Precondition: corresponding has_* is true.
@@ -307,49 +265,29 @@ class MultiKernelLoaderSpec {
     CHECK(has_cuda_ptx_in_memory());
     return *cuda_ptx_in_memory_;
   }
-  const OpenCLTextOnDisk &ocl_text_on_disk() const {
-    CHECK(has_ocl_text_on_disk());
-    return *ocl_text_on_disk_;
-  }
-  const OpenCLBinaryOnDisk &ocl_binary_on_disk() const {
-    CHECK(has_ocl_binary_on_disk());
-    return *ocl_binary_on_disk_;
-  }
-  const OpenCLTextInMemory &ocl_text_in_memory() const {
-    CHECK(has_ocl_text_in_memory());
-    return *ocl_text_in_memory_;
-  }
-
   // Builder-pattern-like methods for use in initializing a
   // MultiKernelLoaderSpec. Each of these should be used at most once for a
   // single MultiKernelLoaderSpec object. See file comment for example usage.
   //
-  // Note that the kernelname parameter must be consistent with the kernel in
-  // the PTX or OpenCL being loaded. Also be aware that in CUDA C++ the kernel
-  // name may be mangled by the compiler if it is not declared in an
-  // extern "C" scope.
-  MultiKernelLoaderSpec *AddOpenCLTextOnDisk(absl::string_view filename,
-                                             absl::string_view kernelname);
-  MultiKernelLoaderSpec *AddOpenCLBinaryOnDisk(absl::string_view filename,
-                                               absl::string_view kernelname);
-  MultiKernelLoaderSpec *AddOpenCLTextInMemory(absl::string_view ocl_text,
-                                               absl::string_view kernelname);
+  // Note that the kernel_name parameter must be consistent with the kernel in
+  // the PTX being loaded. Also be aware that in CUDA C++ the kernel name may be
+  // mangled by the compiler if it is not declared in an extern "C" scope.
   MultiKernelLoaderSpec *AddCudaPtxOnDisk(absl::string_view filename,
-                                          absl::string_view kernelname);
+                                          absl::string_view kernel_name);
   MultiKernelLoaderSpec *AddCudaCubinOnDisk(absl::string_view filename,
-                                            absl::string_view kernelname);
+                                            absl::string_view kernel_name);
   MultiKernelLoaderSpec *AddCudaCubinInMemory(const char *cubin_bytes,
-                                              absl::string_view kernelname);
+                                              absl::string_view kernel_name);
   MultiKernelLoaderSpec *AddCudaPtxInMemory(absl::string_view ptx,
-                                            absl::string_view kernelname);
+                                            absl::string_view kernel_name);
   MultiKernelLoaderSpec *AddCudaCompressedPtxInMemory(
-      absl::string_view ptx, absl::string_view kernelname);
+      absl::string_view ptx, absl::string_view kernel_name);
   MultiKernelLoaderSpec *AddCudaPtxInMemory(
       std::initializer_list<CudaPtxInMemory::PtxSpec> spec_list,
-      absl::string_view kernelname);
+      absl::string_view kernel_name);
   MultiKernelLoaderSpec *AddCudaCompressedPtxInMemory(
       std::initializer_list<CudaPtxInMemory::PtxSpec> spec_list,
-      absl::string_view kernelname);
+      absl::string_view kernel_name);
 
  private:
   std::unique_ptr<CudaPtxOnDisk>
@@ -360,12 +298,6 @@ class MultiKernelLoaderSpec {
       cuda_cubin_in_memory_;  // Binary CUDA program in memory.
   std::unique_ptr<CudaPtxInMemory>
       cuda_ptx_in_memory_;  // PTX text that resides in memory.
-  std::unique_ptr<OpenCLTextOnDisk>
-      ocl_text_on_disk_;  // OpenCL text that resides on disk.
-  std::unique_ptr<OpenCLBinaryOnDisk>
-      ocl_binary_on_disk_;  // OpenCL binary that resides on disk.
-  std::unique_ptr<OpenCLTextInMemory>
-      ocl_text_in_memory_;  // OpenCL text that resides in memory.
 
   // Number of parameters that the kernel takes. (This is nicer to have in a
   // constexpr than having to determine it from the types via template

From 1518a980e1e0af880a7de819106a522c72f76036 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 21 Sep 2023 21:36:56 -0700
Subject: [PATCH 118/567] Internal change only.

PiperOrigin-RevId: 567508075
---
 third_party/xla/third_party/tsl/tsl/platform/windows/env.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/third_party/xla/third_party/tsl/tsl/platform/windows/env.cc b/third_party/xla/third_party/tsl/tsl/platform/windows/env.cc
index 4dd129aadabb91..13fb4515d5a9fd 100644
--- a/third_party/xla/third_party/tsl/tsl/platform/windows/env.cc
+++ b/third_party/xla/third_party/tsl/tsl/platform/windows/env.cc
@@ -35,7 +35,7 @@ limitations under the License.
 #include "tsl/platform/windows/windows_file_system.h"
 #include "tsl/protobuf/error_codes.pb.h"
 
-#pragma comment(lib, "Shlwapi.lib")
+#pragma comment(lib, "shlwapi.lib")
 
 namespace tsl {
 

From 9adcd215fc5da2c8f2c2270c47bcfc373fb65e7a Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 21 Sep 2023 21:57:33 -0700
Subject: [PATCH 119/567] Internal change only.

PiperOrigin-RevId: 567511750
---
 .../tsl/tsl/profiler/backends/cpu/annotation_stack.cc  |  9 ++++++++-
 .../tsl/tsl/profiler/backends/cpu/traceme_recorder.cc  | 10 +++++++++-
 2 files changed, 17 insertions(+), 2 deletions(-)

diff --git a/third_party/xla/third_party/tsl/tsl/profiler/backends/cpu/annotation_stack.cc b/third_party/xla/third_party/tsl/tsl/profiler/backends/cpu/annotation_stack.cc
index 529ef95387a6b6..97b4c5daeb373e 100644
--- a/third_party/xla/third_party/tsl/tsl/profiler/backends/cpu/annotation_stack.cc
+++ b/third_party/xla/third_party/tsl/tsl/profiler/backends/cpu/annotation_stack.cc
@@ -23,7 +23,14 @@ namespace tsl {
 namespace profiler {
 namespace internal {
 
-std::atomic<int> g_annotation_enabled(0);
+#ifdef _WIN32
+#define DECL_DLL_EXPORT __declspec(dllexport)
+#else
+#define DECL_DLL_EXPORT
+#endif
+// DLL imported variables cannot be initialized on Windows. This file is
+// included only on DLL exports.
+DECL_DLL_EXPORT std::atomic<int> g_annotation_enabled(0);
 
 // g_annotation_enabled implementation must be lock-free for faster execution of
 // the ScopedAnnotation API. This can be commented (if compilation is failing)
diff --git a/third_party/xla/third_party/tsl/tsl/profiler/backends/cpu/traceme_recorder.cc b/third_party/xla/third_party/tsl/tsl/profiler/backends/cpu/traceme_recorder.cc
index 7cd2d658bcdc49..3d4c7f6c289217 100644
--- a/third_party/xla/third_party/tsl/tsl/profiler/backends/cpu/traceme_recorder.cc
+++ b/third_party/xla/third_party/tsl/tsl/profiler/backends/cpu/traceme_recorder.cc
@@ -34,7 +34,15 @@ namespace tsl {
 namespace profiler {
 namespace internal {
 
-std::atomic<int> g_trace_level(TraceMeRecorder::kTracingDisabled);
+#ifdef _WIN32
+#define DECL_DLL_EXPORT __declspec(dllexport)
+#else
+#define DECL_DLL_EXPORT
+#endif
+// DLL imported variables cannot be initialized on Windows. This file is
+// included only on DLL exports.
+DECL_DLL_EXPORT std::atomic<int> g_trace_level(
+    TraceMeRecorder::kTracingDisabled);
 
 // g_trace_level implementation must be lock-free for faster execution of the
 // TraceMe API. This can be commented (if compilation is failing) but execution

From e24946f338a92e7ee731d7fe8d765b544054821d Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 21 Sep 2023 22:16:36 -0700
Subject: [PATCH 120/567] Internal Code Change

PiperOrigin-RevId: 567515434
---
 tensorflow/cc/saved_model/BUILD | 2 --
 1 file changed, 2 deletions(-)

diff --git a/tensorflow/cc/saved_model/BUILD b/tensorflow/cc/saved_model/BUILD
index 026ac1683fe971..d0f2fb7324eb6e 100644
--- a/tensorflow/cc/saved_model/BUILD
+++ b/tensorflow/cc/saved_model/BUILD
@@ -388,7 +388,6 @@ cc_library(
     visibility = [
         "//tensorflow:__pkg__",
         "//tensorflow/python:__pkg__",
-        "//tensorflow/security/fuzzing/cc/ops:__pkg__",  # TODO(b/261455394): Remove.
     ],
     deps = [
         "//tensorflow/core:protos_all_cc",
@@ -493,7 +492,6 @@ cc_library(
     visibility = [
         "//tensorflow:__pkg__",
         "//tensorflow/python:__pkg__",
-        "//tensorflow/security/fuzzing/cc/ops:__pkg__",  # TODO(b/261455394): Remove.
     ],
     deps = [
         ":constants",

From a851496e9c8b75448cb3052352d14674ee972237 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 21 Sep 2023 22:19:52 -0700
Subject: [PATCH 121/567] Internal Code Change

PiperOrigin-RevId: 567516040
---
 third_party/xla/xla/translate/hlo_to_mhlo/tests/BUILD | 1 +
 1 file changed, 1 insertion(+)

diff --git a/third_party/xla/xla/translate/hlo_to_mhlo/tests/BUILD b/third_party/xla/xla/translate/hlo_to_mhlo/tests/BUILD
index dc9d7e34e2390a..0a9bd841a9663c 100644
--- a/third_party/xla/xla/translate/hlo_to_mhlo/tests/BUILD
+++ b/third_party/xla/xla/translate/hlo_to_mhlo/tests/BUILD
@@ -3,6 +3,7 @@ load("//xla:glob_lit_test.bzl", "glob_lit_tests")
 
 package(
     default_visibility = ["//visibility:public"],
+    # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
     licenses = ["notice"],
 )
 

From 19c8aba6902f54cab505a17f76a9214bf615ab3d Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 21 Sep 2023 22:36:26 -0700
Subject: [PATCH 122/567] Internal change only.

PiperOrigin-RevId: 567519140
---
 .../tsl/tsl/platform/windows/subprocess.cc      | 17 +++++++++++------
 1 file changed, 11 insertions(+), 6 deletions(-)

diff --git a/third_party/xla/third_party/tsl/tsl/platform/windows/subprocess.cc b/third_party/xla/third_party/tsl/tsl/platform/windows/subprocess.cc
index e31432a0047575..c0c948bbc1814a 100644
--- a/third_party/xla/third_party/tsl/tsl/platform/windows/subprocess.cc
+++ b/third_party/xla/third_party/tsl/tsl/platform/windows/subprocess.cc
@@ -110,9 +110,11 @@ SubProcess::~SubProcess() {
   mutex_lock procLock(proc_mu_);
   mutex_lock dataLock(data_mu_);
   if (win_pi_) {
-    CloseHandle(reinterpret_cast<PROCESS_INFORMATION*>(win_pi_)->hProcess);
-    CloseHandle(reinterpret_cast<PROCESS_INFORMATION*>(win_pi_)->hThread);
-    delete win_pi_;
+    auto* pi = reinterpret_cast<PROCESS_INFORMATION*>(win_pi_);
+    CloseHandle(pi->hProcess);
+    CloseHandle(pi->hThread);
+    delete pi;
+    win_pi_ = nullptr;
   }
   running_ = false;
   FreeArgs();
@@ -364,9 +366,11 @@ int SubProcess::Communicate(const string* stdin_input, string* stdout_output,
   // Lock data_mu_ but not proc_mu_ while communicating with the child process
   // in order for Kill() to be able to terminate the child from another thread.
   data_mu_.lock();
-  if (!IsProcessFinished(
-          reinterpret_cast<PROCESS_INFORMATION*>(win_pi_)->hProcess) ||
-      (parent_pipe_[CHAN_STDOUT] != nullptr) ||
+  proc_mu_.lock();
+  bool process_finished = IsProcessFinished(
+      reinterpret_cast<PROCESS_INFORMATION*>(win_pi_)->hProcess);
+  proc_mu_.unlock();
+  if (!process_finished || (parent_pipe_[CHAN_STDOUT] != nullptr) ||
       (parent_pipe_[CHAN_STDERR] != nullptr)) {
     if (parent_pipe_[CHAN_STDIN] != nullptr) {
       if (stdin_input) {
@@ -422,6 +426,7 @@ int SubProcess::Communicate(const string* stdin_input, string* stdout_output,
     if (wait_result != WAIT_OBJECT_0) {
       LOG(ERROR) << "Waiting on the io threads failed! result: " << wait_result
                  << std::endl;
+      data_mu_.unlock();
       return -1;
     }
 

From acd860ca55cf3773ca2a1674dcfac5024a878bc0 Mon Sep 17 00:00:00 2001
From: Jake Harmon <jakeharmon@google.com>
Date: Thu, 21 Sep 2023 22:41:24 -0700
Subject: [PATCH 123/567] Add ml_dtypes headers to tensorflow/include

PiperOrigin-RevId: 567520259
---
 tensorflow/tools/pip_package/setup.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/tensorflow/tools/pip_package/setup.py b/tensorflow/tools/pip_package/setup.py
index 755405fbf15eea..80b6e28ed1fc1c 100644
--- a/tensorflow/tools/pip_package/setup.py
+++ b/tensorflow/tools/pip_package/setup.py
@@ -265,6 +265,7 @@ def mkdir_and_copy_file(self, header):
     external_header_locations = {
         '/tensorflow/include/external/eigen_archive': '',
         '/tensorflow/include/external/com_google_absl': '',
+        '/tensorflow/include/external/ml_dtypes': '/ml_dtypes',
         '/tensorflow/include/tensorflow/compiler/xla': '/tensorflow/include/xla',
         '/tensorflow/include/tensorflow/tsl': '/tensorflow/include/tsl',
     }
@@ -345,7 +346,8 @@ def find_files(pattern, root):
     list(find_files('*', 'third_party/gpus')) +
     list(find_files('*.h', 'tensorflow/include/external/com_google_absl')) +
     list(find_files('*.inc', 'tensorflow/include/external/com_google_absl')) +
-    list(find_files('*', 'tensorflow/include/external/eigen_archive')))
+    list(find_files('*', 'tensorflow/include/external/eigen_archive')) +
+    list(find_files('*.h', 'tensorflow/include/external/ml_dtypes')))
 
 # Quite a lot of setup() options are different if this is a collaborator package
 # build. We explicitly list the differences here, then unpack the dict as

From b73d62ac5c0585f3ddd34769bec51e12afb8389c Mon Sep 17 00:00:00 2001
From: Eugene Zhulenev <ezhulenev@google.com>
Date: Thu, 21 Sep 2023 22:42:11 -0700
Subject: [PATCH 124/567] [stream_executor] NFC: Clean up
 xla/stream_executor:executor_cache dependencies and fix warnings

PiperOrigin-RevId: 567520429
---
 .../c/experimental/stream_executor/BUILD      |  5 +-
 .../stream_executor/stream_executor.cc        |  3 +-
 .../stream_executor_internal.h                |  1 +
 .../xla/service/gpu/runtime/collectives.cc    |  1 -
 .../xla/xla/service/gpu/runtime/collectives.h |  1 -
 third_party/xla/xla/stream_executor/BUILD     | 51 ++-------------
 .../xla/xla/stream_executor/executor_cache.cc | 65 ++++++++++---------
 .../xla/xla/stream_executor/executor_cache.h  | 27 +++++---
 8 files changed, 63 insertions(+), 91 deletions(-)

diff --git a/tensorflow/c/experimental/stream_executor/BUILD b/tensorflow/c/experimental/stream_executor/BUILD
index 39b81d93fb1723..5cf55e03aa778e 100644
--- a/tensorflow/c/experimental/stream_executor/BUILD
+++ b/tensorflow/c/experimental/stream_executor/BUILD
@@ -45,10 +45,9 @@ cc_library(
         "//tensorflow/core/common_runtime/device:device_utils",
         "//tensorflow/core/platform:strcat",
         "@com_google_absl//absl/functional:any_invocable",
-        "@local_xla//xla/stream_executor:executor_cache",
+        "@local_xla//xla/stream_executor",
         "@local_xla//xla/stream_executor:multi_platform_manager",
         "@local_xla//xla/stream_executor:platform",
-        "@local_xla//xla/stream_executor:stream_executor_pimpl",
     ],
 )
 
@@ -66,7 +65,7 @@ cc_library(
         "//tensorflow/c:c_api_macros",
         "//tensorflow/c:tf_status",
         "//tensorflow/c:tf_status_helper",
-        "@local_xla//xla/stream_executor:executor_cache",
+        "@local_xla//xla/stream_executor",
     ],
 )
 
diff --git a/tensorflow/c/experimental/stream_executor/stream_executor.cc b/tensorflow/c/experimental/stream_executor/stream_executor.cc
index 1e2ceee4656049..3fcd255a2248ab 100644
--- a/tensorflow/c/experimental/stream_executor/stream_executor.cc
+++ b/tensorflow/c/experimental/stream_executor/stream_executor.cc
@@ -33,8 +33,7 @@ limitations under the License.
 #include "xla/stream_executor/multi_platform_manager.h"
 #include "xla/stream_executor/platform.h"
 #include "xla/stream_executor/stream.h"
-#include "xla/stream_executor/stream_executor_internal.h"
-#include "xla/stream_executor/stream_executor_pimpl.h"
+#include "xla/stream_executor/stream_executor.h"
 #include "tensorflow/core/common_runtime/device/device_utils.h"
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/errors.h"
diff --git a/tensorflow/c/experimental/stream_executor/stream_executor_internal.h b/tensorflow/c/experimental/stream_executor/stream_executor_internal.h
index 14028fc7d6eec3..6792a61f7bb8cc 100644
--- a/tensorflow/c/experimental/stream_executor/stream_executor_internal.h
+++ b/tensorflow/c/experimental/stream_executor/stream_executor_internal.h
@@ -22,6 +22,7 @@ limitations under the License.
 #include "tensorflow/c/tf_status_helper.h"
 #include "xla/stream_executor/executor_cache.h"
 #include "xla/stream_executor/platform.h"
+#include "xla/stream_executor/stream_executor.h"
 
 namespace stream_executor {
 
diff --git a/third_party/xla/xla/service/gpu/runtime/collectives.cc b/third_party/xla/xla/service/gpu/runtime/collectives.cc
index 1c702fee5ce5c9..2b28efbe470b24 100644
--- a/third_party/xla/xla/service/gpu/runtime/collectives.cc
+++ b/third_party/xla/xla/service/gpu/runtime/collectives.cc
@@ -35,7 +35,6 @@ limitations under the License.
 #include "xla/service/gpu/runtime/support.h"
 #include "xla/service/gpu/thunk.h"
 #include "xla/service/service_executable_run_options.h"
-#include "xla/stream_executor/stream.h"
 
 namespace xla {
 namespace gpu {
diff --git a/third_party/xla/xla/service/gpu/runtime/collectives.h b/third_party/xla/xla/service/gpu/runtime/collectives.h
index ca0e2b0203573d..23bd700cc151fc 100644
--- a/third_party/xla/xla/service/gpu/runtime/collectives.h
+++ b/third_party/xla/xla/service/gpu/runtime/collectives.h
@@ -23,7 +23,6 @@ limitations under the License.
 #include "xla/runtime/custom_call_registry.h"
 #include "xla/service/gpu/nccl_collective_thunk.h"
 #include "xla/stream_executor/event.h"
-#include "xla/stream_executor/stream.h"
 
 namespace xla {
 namespace gpu {
diff --git a/third_party/xla/xla/stream_executor/BUILD b/third_party/xla/xla/stream_executor/BUILD
index bae9ae09f248b3..6783578a4bc150 100644
--- a/third_party/xla/xla/stream_executor/BUILD
+++ b/third_party/xla/xla/stream_executor/BUILD
@@ -196,6 +196,7 @@ cc_library(
         "//xla/stream_executor/platform",
         "@com_google_absl//absl/algorithm:container",
         "@com_google_absl//absl/base:core_headers",
+        "@com_google_absl//absl/container:node_hash_map",
         "@com_google_absl//absl/functional:any_invocable",
         "@com_google_absl//absl/log:check",
         "@com_google_absl//absl/memory",
@@ -406,63 +407,25 @@ cc_library(
 
 cc_library(
     name = "executor_cache",
-    srcs = [
-        "device_description.h",
-        "device_memory.h",
-        "device_options.h",
-        "event.h",
-        "executor_cache.cc",
-        "launch_dim.h",
-        "plugin.h",
-        "plugin_registry.h",
-        "stream_executor_pimpl.h",
-        "temporary_device_memory.h",
-        "temporary_memory_manager.h",
-    ],
-    hdrs = [
-        "blas.h",
-        "executor_cache.h",
-        "fft.h",
-        "kernel.h",
-        "kernel_cache_config.h",
-        "kernel_spec.h",
-        "platform.h",
-        "stream.h",
-        "stream_executor_internal.h",
-        "trace_listener.h",
-    ],
+    srcs = ["executor_cache.cc"],
+    hdrs = ["executor_cache.h"],
     visibility = ["//visibility:public"],
     deps = [
-        ":allocator_stats",
-        ":data_type",
-        ":device_description",
-        ":device_description_proto_cc",
-        ":device_memory",
-        ":device_options",
-        ":fft",
-        ":kernel_cache_config",
-        ":kernel_spec",
-        ":launch_dim",
-        ":plugin",
-        ":stream_executor_headers",
+        ":platform",
+        ":stream_executor_pimpl_header",
         "//xla/stream_executor/platform",
-        "@com_google_absl//absl/algorithm:container",
         "@com_google_absl//absl/base:core_headers",
+        "@com_google_absl//absl/container:node_hash_map",
         "@com_google_absl//absl/functional:any_invocable",
+        "@com_google_absl//absl/log",
         "@com_google_absl//absl/log:check",
         "@com_google_absl//absl/memory",
         "@com_google_absl//absl/status",
-        "@com_google_absl//absl/strings",
         "@com_google_absl//absl/strings:str_format",
         "@com_google_absl//absl/synchronization",
         "@com_google_absl//absl/types:optional",
         "@com_google_absl//absl/types:span",
-        "@local_tsl//tsl/platform:env",
-        "@local_tsl//tsl/platform:errors",
-        "@local_tsl//tsl/platform:logging",
-        "@local_tsl//tsl/platform:status",
         "@local_tsl//tsl/platform:statusor",
-        "@local_tsl//tsl/protobuf:dnn_proto_cc",
     ],
 )
 
diff --git a/third_party/xla/xla/stream_executor/executor_cache.cc b/third_party/xla/xla/stream_executor/executor_cache.cc
index f12af19b72f6b2..10b3627f7aee3a 100644
--- a/third_party/xla/xla/stream_executor/executor_cache.cc
+++ b/third_party/xla/xla/stream_executor/executor_cache.cc
@@ -16,20 +16,27 @@ limitations under the License.
 #include "xla/stream_executor/executor_cache.h"
 
 #include <memory>
+#include <utility>
 
+#include "absl/log/log.h"
+#include "absl/status/status.h"
 #include "absl/strings/str_format.h"
 #include "absl/synchronization/mutex.h"
+#include "xla/stream_executor/platform.h"
+#include "xla/stream_executor/stream_executor_pimpl.h"
+#include "tsl/platform/statusor.h"
 
 namespace stream_executor {
 
+ExecutorCache::ExecutorCache() = default;
+ExecutorCache::~ExecutorCache() { DestroyAllExecutors(); }
+
 tsl::StatusOr<StreamExecutor*> ExecutorCache::GetOrCreate(
-    const StreamExecutorConfig& config,
-    const std::function<ExecutorFactory>& factory) {
+    const StreamExecutorConfig& config, const ExecutorFactory& factory) {
   // In the fast path case, the cache already has an entry and we can just
   // return after Get() which only takes a shared lock and not a unique lock.
   // If we need to create, we take a unique lock on cache_.
-  auto fast_result = Get(config);
-  if (fast_result.ok()) {
+  if (auto fast_result = Get(config); fast_result.ok()) {
     return fast_result;
   }
 
@@ -38,7 +45,7 @@ tsl::StatusOr<StreamExecutor*> ExecutorCache::GetOrCreate(
     absl::MutexLock lock{&mutex_};
     entry = &cache_[config.ordinal];
     // Release the map lock; the address of 'entry' is stable because
-    // std::map guarantees reference stability.
+    // absl::node_hash_map guarantees reference stability.
   }
 
   // Acquire the per-Entry mutex without holding the map mutex. Initializing
@@ -70,47 +77,43 @@ tsl::StatusOr<StreamExecutor*> ExecutorCache::Get(
   {
     absl::ReaderMutexLock lock{&mutex_};
 
-    {
-      if (config.gpu_stream) {
-        // Need to iterate through all stored executors.
-        for (auto& [ordinal, e] : cache_) {
-          absl::ReaderMutexLock l{&e.configurations_mutex};
-          for (auto& [c, executor] : e.configurations) {
-            if (executor->FindAllocatedStream(config.gpu_stream)) {
-              return executor.get();
-            }
+    // If gpu stream is not nullptr we have to find StreamExecutor that owns it,
+    // and return NOT_FOUND error if we can't find it.
+    if (config.gpu_stream) {
+      for (auto& [ordinal, e] : cache_) {
+        absl::ReaderMutexLock l{&e.configurations_mutex};
+        for (auto& [c, executor] : e.configurations) {
+          if (executor->FindAllocatedStream(config.gpu_stream)) {
+            return executor.get();
           }
         }
-        return tsl::Status(
-            absl::StatusCode::kNotFound,
-            absl::StrFormat("No executors own stream %p", config.gpu_stream));
       }
+      return absl::NotFoundError(
+          absl::StrFormat("No executors own stream %p", config.gpu_stream));
     }
 
-    auto it = cache_.find(config.ordinal);
-    if (it != cache_.end()) {
+    if (auto it = cache_.find(config.ordinal); it != cache_.end()) {
       entry = &it->second;
     } else {
-      return tsl::Status(
-          absl::StatusCode::kNotFound,
-          absl::StrFormat("No executors registered for ordinal %d",
-                          config.ordinal));
+      return absl::NotFoundError(absl::StrFormat(
+          "No executors registered for ordinal %d", config.ordinal));
     }
   }
+
   absl::ReaderMutexLock lock{&entry->configurations_mutex};
   if (entry->configurations.empty()) {
-    return tsl::Status(absl::StatusCode::kNotFound,
-                       absl::StrFormat("No executors registered for ordinal %d",
-                                       config.ordinal));
+    return absl::NotFoundError(absl::StrFormat(
+        "No executors registered for ordinal %d", config.ordinal));
   }
-  for (const auto& iter : entry->configurations) {
-    if (iter.first.device_options == config.device_options) {
+
+  for (auto& [entry_config, entry_executor] : entry->configurations) {
+    if (entry_config.device_options == config.device_options) {
       VLOG(2) << "hit in cache for device ordinal " << config.ordinal;
-      return iter.second.get();
+      return entry_executor.get();
     }
   }
-  return tsl::Status(absl::StatusCode::kNotFound,
-                     "No executor found with a matching config.");
+
+  return absl::NotFoundError("No executor found with a matching config.");
 }
 
 void ExecutorCache::DestroyAllExecutors() {
diff --git a/third_party/xla/xla/stream_executor/executor_cache.h b/third_party/xla/xla/stream_executor/executor_cache.h
index cef0d3d565614d..d845b8eabb5ce3 100644
--- a/third_party/xla/xla/stream_executor/executor_cache.h
+++ b/third_party/xla/xla/stream_executor/executor_cache.h
@@ -17,28 +17,37 @@ limitations under the License.
 #define XLA_STREAM_EXECUTOR_EXECUTOR_CACHE_H_
 
 #include <functional>
-#include <map>
+#include <memory>
+#include <utility>
+#include <vector>
 
+#include "absl/base/thread_annotations.h"
+#include "absl/container/node_hash_map.h"
 #include "absl/synchronization/mutex.h"
-#include "xla/stream_executor/stream_executor_pimpl.h"
-#include "tsl/platform/status.h"
+#include "xla/stream_executor/platform.h"
+#include "xla/stream_executor/platform/port.h"
 #include "tsl/platform/statusor.h"
 
 namespace stream_executor {
 
+// Forward declare.
+class StreamExecutor;
+
 // Utility class to allow Platform objects to manage cached StreamExecutors.
 // Thread-safe.
 class ExecutorCache {
  public:
-  ExecutorCache() {}
+  using ExecutorFactory =
+      std::function<tsl::StatusOr<std::unique_ptr<StreamExecutor>>()>;
+
+  ExecutorCache();
+  ~ExecutorCache();
 
   // Looks up 'config' in the cache. Returns a pointer to the existing executor,
   // if already present, or creates it using 'factory', if it does not.
   // Factories may be executed concurrently for different device ordinals.
-  typedef tsl::StatusOr<std::unique_ptr<StreamExecutor>> ExecutorFactory();
-  tsl::StatusOr<StreamExecutor*> GetOrCreate(
-      const StreamExecutorConfig& config,
-      const std::function<ExecutorFactory>& factory);
+  tsl::StatusOr<StreamExecutor*> GetOrCreate(const StreamExecutorConfig& config,
+                                             const ExecutorFactory& factory);
 
   // Returns a pointer to the described executor (if one with a matching config
   // has been created), or a NOT_FOUND status.
@@ -70,7 +79,7 @@ class ExecutorCache {
   // We key off of ordinal (instead of just looking up all fields in the
   // StreamExecutorConfig) for a slight improvement in lookup time.
   absl::Mutex mutex_;
-  std::map<int, Entry> cache_ ABSL_GUARDED_BY(mutex_);
+  absl::node_hash_map<int, Entry> cache_ ABSL_GUARDED_BY(mutex_);
 
   SE_DISALLOW_COPY_AND_ASSIGN(ExecutorCache);
 };

From 41154a69d36384c90d75043a8052c045e037b084 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 21 Sep 2023 23:09:51 -0700
Subject: [PATCH 125/567] Internal Code Change

PiperOrigin-RevId: 567527024
---
 tensorflow/core/common_runtime/device/BUILD | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/tensorflow/core/common_runtime/device/BUILD b/tensorflow/core/common_runtime/device/BUILD
index 3cddb136424737..12eb86fd0bd391 100644
--- a/tensorflow/core/common_runtime/device/BUILD
+++ b/tensorflow/core/common_runtime/device/BUILD
@@ -20,10 +20,7 @@ load(
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
-    default_visibility = [
-        "//tensorflow:internal",
-        "//tensorflow_models:__subpackages__",
-    ],
+    default_visibility = ["//tensorflow:internal"],
     # features = ["-parse_headers"],
     features = ["-layering_check"],
     licenses = ["notice"],

From 2635d85c1cac2562197f5aedfeba4274c0454e9c Mon Sep 17 00:00:00 2001
From: Marcello Maggioni <maggioni@google.com>
Date: Fri, 22 Sep 2023 00:36:41 -0700
Subject: [PATCH 126/567] [XLA] Fix big memory allocation and compile time
 slowdown for WhileLoopInvariantCodeMotion.

The pass allocates in some cases the input for the while-cond computation, but it really only
needs one of the tuple elements of the parameter, not the whole tuple.

Allocating the whole tuple (because the inputs to the loop can be big) can result in very large
memory allocations and significant time spent on memory allocation/initialization.

PiperOrigin-RevId: 567543997
---
 third_party/xla/xla/service/BUILD             |  1 +
 .../xla/xla/service/while_loop_analysis.cc    | 21 ++++++++++++++++---
 2 files changed, 19 insertions(+), 3 deletions(-)

diff --git a/third_party/xla/xla/service/BUILD b/third_party/xla/xla/service/BUILD
index 2aee4c3a9ec378..b32c8fb37c070f 100644
--- a/third_party/xla/xla/service/BUILD
+++ b/third_party/xla/xla/service/BUILD
@@ -3209,6 +3209,7 @@ cc_library(
         "//xla/hlo/ir:hlo",
         "//xla/hlo/ir:hlo_reachability",
         "@com_google_absl//absl/base",
+        "@com_google_absl//absl/container:flat_hash_map",
     ],
 )
 
diff --git a/third_party/xla/xla/service/while_loop_analysis.cc b/third_party/xla/xla/service/while_loop_analysis.cc
index 7a881be0474fe8..dae16d93928085 100644
--- a/third_party/xla/xla/service/while_loop_analysis.cc
+++ b/third_party/xla/xla/service/while_loop_analysis.cc
@@ -16,6 +16,7 @@ limitations under the License.
 #include "xla/service/while_loop_analysis.h"
 
 #include "absl/base/casts.h"
+#include "absl/container/flat_hash_map.h"
 #include "xla/hlo/evaluator/hlo_evaluator.h"
 #include "xla/hlo/ir/hlo_instruction.h"
 #include "xla/hlo/ir/hlo_module.h"
@@ -614,15 +615,29 @@ optional<int64_t> ComputeWhileLoopTripCountUpperBound(
             << while_body_indvar->ToString();
     return nullopt;
   }
+  // Create a new while cond computation accessing only the single parameter
+  // extracted by the GTE above to avoid excessive memory allocation for the
+  // evaluator.
+  absl::flat_hash_map<const HloInstruction*, std::unique_ptr<HloInstruction>>
+      replacements;
+  auto new_param = HloInstruction::CreateParameter(
+      0, ShapeUtil::MakeTupleShape({cond_gte->shape()}), "temp");
+  replacements[cond_gte] =
+      HloInstruction::CreateGetTupleElement(new_param.get(), 0);
+  replacements[while_cond_param] = std::move(new_param);
+  auto new_module = std::make_unique<HloModule>("temp_mod", HloModuleConfig{});
+  auto* new_computation = new_module->AddEmbeddedComputation(
+      while_cond->CloneWithReplacements(&replacements));
 
   // We have a constant. Evaluate the condition on this constant.
   HloEvaluator evaluator(/*max_loop_iterations=*/0);
-  Literal fake_input = Literal::CreateFromShape(while_cond_param->shape());
+  Literal fake_input = Literal::CreateFromShape(
+      new_computation->parameter_instruction(0)->shape());
   TF_CHECK_OK(fake_input.CopyFrom(while_body_indvar->literal(),
-                                  /*dest_shape_index=*/{indvar_index},
+                                  /*dest_shape_index=*/{0},
                                   /*src_shape_index=*/{}));
   StatusOr<Literal> eval_result =
-      evaluator.Evaluate(*while_cond, {std::move(fake_input)});
+      evaluator.Evaluate(*new_computation, {std::move(fake_input)});
 
   if (!eval_result.ok()) {
     VLOG(2) << "Couldn't evaluate while loop condition.";

From aeb240d9029c006ef7c66f308bee82854202e1fb Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 22 Sep 2023 02:01:48 -0700
Subject: [PATCH 127/567] compat: Update forward compatibility horizon to
 2023-09-22

PiperOrigin-RevId: 567558968
---
 tensorflow/python/compat/compat.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/python/compat/compat.py b/tensorflow/python/compat/compat.py
index 6e494480e54ba7..4145a22481f391 100644
--- a/tensorflow/python/compat/compat.py
+++ b/tensorflow/python/compat/compat.py
@@ -29,7 +29,7 @@
 # This value changes every day with an automatic CL. It can be modified in code
 # via `forward_compatibility_horizon()` or with the environment variable
 # TF_FORWARD_COMPATIBILITY_DELTA_DAYS, which is added to the compatibility date.
-_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2023, 9, 21)
+_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2023, 9, 22)
 _FORWARD_COMPATIBILITY_DELTA_DAYS_VAR_NAME = "TF_FORWARD_COMPATIBILITY_DELTA_DAYS"
 _FORWARD_COMPATIBILITY_DATE_NUMBER = None
 

From a5ff20d0a39f1f6638448076c4eb01613d127d73 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 22 Sep 2023 02:01:50 -0700
Subject: [PATCH 128/567] Update GraphDef version to 1627.

PiperOrigin-RevId: 567558973
---
 tensorflow/core/public/version.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/core/public/version.h b/tensorflow/core/public/version.h
index 7c76122b9da32f..470b11e06459b2 100644
--- a/tensorflow/core/public/version.h
+++ b/tensorflow/core/public/version.h
@@ -108,7 +108,7 @@ limitations under the License.
 
 #define TF_GRAPH_DEF_VERSION_MIN_PRODUCER 0
 #define TF_GRAPH_DEF_VERSION_MIN_CONSUMER 0
-#define TF_GRAPH_DEF_VERSION 1626  // Updated: 2023/9/21
+#define TF_GRAPH_DEF_VERSION 1627  // Updated: 2023/9/22
 
 // Checkpoint compatibility versions (the versions field in SavedSliceMeta).
 //

From bdbcdcea22474ea8278b61ecf688365634d9c9f3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tam=C3=A1s=20Danyluk?= <tdanyluk@google.com>
Date: Fri, 22 Sep 2023 02:59:02 -0700
Subject: [PATCH 129/567] [XLA:GPU] Fix upcasting of dot operands for triton
 GEMM fusions

PiperOrigin-RevId: 567569309
---
 third_party/xla/xla/service/gpu/BUILD         |  8 +++
 .../xla/xla/service/gpu/float_support_test.cc | 52 +++++++++++++++++--
 .../xla/xla/service/gpu/gpu_float_support.cc  | 24 +++++++++
 .../xla/xla/service/gpu/gpu_float_support.h   |  6 +++
 4 files changed, 85 insertions(+), 5 deletions(-)

diff --git a/third_party/xla/xla/service/gpu/BUILD b/third_party/xla/xla/service/gpu/BUILD
index 9f2ce2858edc4d..a18d39d3e56812 100644
--- a/third_party/xla/xla/service/gpu/BUILD
+++ b/third_party/xla/xla/service/gpu/BUILD
@@ -2432,7 +2432,10 @@ cc_library(
     hdrs = ["gpu_float_support.h"],
     visibility = ["//visibility:public"],
     deps = [
+        "//xla:xla_data_proto_cc",
+        "//xla/hlo/ir:hlo",
         "//xla/service:float_support",
+        "@com_google_absl//absl/log:check",
     ],
 )
 
@@ -3666,8 +3669,13 @@ xla_test(
         "gpu",
     ],
     deps = [
+        "//xla:error_spec",
+        "//xla:xla_proto_cc",
+        "//xla/stream_executor:device_description",
         "//xla/tests:hlo_test_base",
         "//xla/tests:xla_internal_test_main",
+        "@com_google_absl//absl/strings",
+        "@com_google_googletest//:gtest",
     ],
 )
 
diff --git a/third_party/xla/xla/service/gpu/float_support_test.cc b/third_party/xla/xla/service/gpu/float_support_test.cc
index 0f10626606e1ac..3f0bf03a2b1487 100644
--- a/third_party/xla/xla/service/gpu/float_support_test.cc
+++ b/third_party/xla/xla/service/gpu/float_support_test.cc
@@ -13,18 +13,59 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include <string>
-
+#include <gtest/gtest.h>
+#include "absl/strings/string_view.h"
+#include "xla/error_spec.h"
+#include "xla/stream_executor/device_description.h"
 #include "xla/tests/hlo_test_base.h"
+#include "xla/xla.pb.h"
 
 namespace xla {
 namespace gpu {
 namespace {
 
-using FloatSupportTest = HloTestBase;
+using FloatSupportTestWithCublas = HloTestBase;
+
+class FloatSupportTestWithTriton : public HloTestBase {
+ public:
+  se::CudaComputeCapability GetCudaComputeCapability() {
+    return backend()
+        .default_stream_executor()
+        ->GetDeviceDescription()
+        .cuda_compute_capability();
+  }
+
+  DebugOptions GetDebugOptionsForTest() override {
+    DebugOptions debug_options = HloTestBase::GetDebugOptionsForTest();
+    debug_options.set_xla_gpu_triton_gemm_any(true);
+    return debug_options;
+  }
+};
+
+TEST_F(FloatSupportTestWithCublas, MixedTypeDotIsNotUpcasted) {
+  constexpr absl::string_view kHloText = R"(
+ENTRY e {
+  p0 = bf16[32,32] parameter(0)
+  p1 = bf16[32,32] parameter(1)
+  ROOT d = f32[32,32] dot(p0, p1),
+    lhs_contracting_dims={1}, rhs_contracting_dims={0}
+})";
+
+  MatchOptimizedHlo(kHloText, R"(
+; CHECK-NOT: convert
+; CHECK: __cublas
+)");
+
+  EXPECT_TRUE(RunAndCompare(kHloText, ErrorSpec{1e-6, 1e-6}));
+}
+
+TEST_F(FloatSupportTestWithTriton, MixedTypeDotWithBF16IsNotUpcasted) {
+  if (!GetCudaComputeCapability().IsAtLeast(
+          se::CudaComputeCapability::AMPERE)) {
+    GTEST_SKIP() << "No BF16 before Ampere.";
+  }
 
-TEST_F(FloatSupportTest, MixedTypeDotIsNotUpcasted) {
-  const std::string kHloText = R"(
+  constexpr absl::string_view kHloText = R"(
 ENTRY e {
   p0 = bf16[32,32] parameter(0)
   p1 = bf16[32,32] parameter(1)
@@ -34,6 +75,7 @@ ENTRY e {
 
   MatchOptimizedHlo(kHloText, R"(
 ; CHECK-NOT: convert
+; CHECK: __triton
 )");
 
   EXPECT_TRUE(RunAndCompare(kHloText, ErrorSpec{1e-6, 1e-6}));
diff --git a/third_party/xla/xla/service/gpu/gpu_float_support.cc b/third_party/xla/xla/service/gpu/gpu_float_support.cc
index f99e40f0d93d81..7652bed27c61fb 100644
--- a/third_party/xla/xla/service/gpu/gpu_float_support.cc
+++ b/third_party/xla/xla/service/gpu/gpu_float_support.cc
@@ -15,9 +15,33 @@ limitations under the License.
 
 #include "xla/service/gpu/gpu_float_support.h"
 
+#include "absl/log/check.h"
+#include "xla/hlo/ir/hlo_instruction.h"
+#include "xla/hlo/ir/hlo_opcode.h"
+#include "xla/service/float_support.h"
+#include "xla/xla_data.pb.h"
+
 namespace xla {
 namespace gpu {
 
+bool GpuFloatSupport::SupportsMixedPrecisions(const HloInstruction& hlo) const {
+  if (FloatSupport::SupportsMixedPrecisions(hlo)) return true;
+
+  switch (hlo.opcode()) {
+    // Handled by Triton GEMM or cuBLAS.
+    case HloOpcode::kDot: {
+      CHECK_EQ(hlo.operand_count(), 2);
+      const PrimitiveType lhs_type = hlo.operand(0)->shape().element_type();
+      const PrimitiveType rhs_type = hlo.operand(1)->shape().element_type();
+      const PrimitiveType result_type = hlo.shape().element_type();
+      return (lhs_type == F16 && rhs_type == F16 && result_type == F32) ||
+             (lhs_type == BF16 && rhs_type == BF16 && result_type == F32);
+    }
+    default:
+      return false;
+  }
+}
+
 bool GpuFloatSupport::IsSupported(const HloInstruction& hlo) const {
   switch (hlo.opcode()) {
     // Collective ops.
diff --git a/third_party/xla/xla/service/gpu/gpu_float_support.h b/third_party/xla/xla/service/gpu/gpu_float_support.h
index 51a54020d38420..c9e0e2ac0c48e7 100644
--- a/third_party/xla/xla/service/gpu/gpu_float_support.h
+++ b/third_party/xla/xla/service/gpu/gpu_float_support.h
@@ -16,7 +16,11 @@ limitations under the License.
 #ifndef XLA_SERVICE_GPU_GPU_FLOAT_SUPPORT_H_
 #define XLA_SERVICE_GPU_GPU_FLOAT_SUPPORT_H_
 
+#include <cstdint>
+
+#include "xla/hlo/ir/hlo_instruction.h"
 #include "xla/service/float_support.h"
+#include "xla/xla_data.pb.h"
 
 namespace xla {
 namespace gpu {
@@ -36,6 +40,8 @@ class GpuFloatSupport : public FloatSupport {
     return FloatSupport::SupportsLowPrecisionOutput(hlo) || IsSupported(hlo);
   }
 
+  bool SupportsMixedPrecisions(const HloInstruction& hlo) const override;
+
  private:
   bool IsSupported(const HloInstruction& hlo) const;
 };

From 8b0ec1ea93faa08eadbead5b96cc02720cb3569a Mon Sep 17 00:00:00 2001
From: sushreebarsa <84765720+sushreebarsa@users.noreply.github.com>
Date: Fri, 22 Sep 2023 16:10:04 +0530
Subject: [PATCH 130/567] Fixed typos in TF doc

Several typos fixed in the TF documentation.
---
 tensorflow/python/debug/lib/debug_events_reader.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tensorflow/python/debug/lib/debug_events_reader.py b/tensorflow/python/debug/lib/debug_events_reader.py
index c6234afef04da8..706823b799b14a 100644
--- a/tensorflow/python/debug/lib/debug_events_reader.py
+++ b/tensorflow/python/debug/lib/debug_events_reader.py
@@ -324,7 +324,7 @@ class BaseDigest:
           for the case of all digests of the same kind coming from the same
           file.
        2. A tuple of a file index and a byte offset. This applies to case
-          in which the same type of debugger data may come from multple files,
+          in which the same type of debugger data may come from multiple files,
           e.g., graph execution traces.
   """
 
@@ -1306,7 +1306,7 @@ def execution_to_tensor_values(self, execution):
     """Read the full tensor values from an Execution or ExecutionDigest.
 
     Args:
-      execution: An `ExecutionDigest` or `ExeuctionDigest` object.
+      execution: An `Execution` or `ExecutionDigest` object.
 
     Returns:
       A list of numpy arrays representing the output tensor values of the

From 9cdc613d71a61e5b93f2690f7d522dd341c49948 Mon Sep 17 00:00:00 2001
From: sushreebarsa <84765720+sushreebarsa@users.noreply.github.com>
Date: Fri, 22 Sep 2023 16:15:49 +0530
Subject: [PATCH 131/567] Fixing typos

---
 tensorflow/python/debug/lib/dumping_callback.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/python/debug/lib/dumping_callback.py b/tensorflow/python/debug/lib/dumping_callback.py
index 9f9afad5b599d4..8abfc242c91f3e 100644
--- a/tensorflow/python/debug/lib/dumping_callback.py
+++ b/tensorflow/python/debug/lib/dumping_callback.py
@@ -594,7 +594,7 @@ def _lookup_tensor_name(self, tensor):
       tensor: The graph tensor to look up the name for.
 
     Returns:
-      Name of the orignal instrumented tensor as known to the debugger.
+      Name of the original instrumented tensor as known to the debugger.
     """
     return self._tensor_aliases.get(tensor.name, tensor.name)
 

From 91de7cef66b4bb75f33a3e79b7e9e39805216953 Mon Sep 17 00:00:00 2001
From: sushreebarsa <84765720+sushreebarsa@users.noreply.github.com>
Date: Fri, 22 Sep 2023 16:18:51 +0530
Subject: [PATCH 132/567] Update debug_events_writer_test.py

---
 tensorflow/python/debug/lib/debug_events_writer_test.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/python/debug/lib/debug_events_writer_test.py b/tensorflow/python/debug/lib/debug_events_writer_test.py
index 1c2f11dff72f83..e7e6a71bf650b1 100644
--- a/tensorflow/python/debug/lib/debug_events_writer_test.py
+++ b/tensorflow/python/debug/lib/debug_events_writer_test.py
@@ -809,7 +809,7 @@ def testDebuggedGraphToJonsWithNameAndInnerOuterGraphIds(self):
       ("EmptyList", []),
       ("None", None),
   )
-  def testGraphOpDigestWithNoOutpusReturnsNumOutputsZero(
+  def testGraphOpDigestWithNoOutputsReturnsNumOutputsZero(
       self, output_tensor_ids):
     op_creation_digest = debug_events_reader.GraphOpCreationDigest(
         1234,

From fb36fd261aeb2c7a11ebe3fe1f533685b4c030dc Mon Sep 17 00:00:00 2001
From: sushreebarsa <84765720+sushreebarsa@users.noreply.github.com>
Date: Fri, 22 Sep 2023 16:20:33 +0530
Subject: [PATCH 133/567] Update debugger_cli_common.py

---
 tensorflow/python/debug/cli/debugger_cli_common.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/python/debug/cli/debugger_cli_common.py b/tensorflow/python/debug/cli/debugger_cli_common.py
index 48696e5fdb7a01..a4a184fa3d2628 100644
--- a/tensorflow/python/debug/cli/debugger_cli_common.py
+++ b/tensorflow/python/debug/cli/debugger_cli_common.py
@@ -813,7 +813,7 @@ def _get_help_for_command_prefix(self, cmd_prefix):
         aliases.
 
     Returns:
-      A list of str as the help information fo cmd_prefix. If the cmd_prefix
+      A list of str as the help information for cmd_prefix. If the cmd_prefix
         does not exist, the returned list of str will indicate that.
     """
     lines = []

From 899323ee1aadbf5d2cb4cfa598eb25224862a86d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tam=C3=A1s=20Danyluk?= <tdanyluk@google.com>
Date: Fri, 22 Sep 2023 03:45:50 -0700
Subject: [PATCH 134/567] [XLA:GPU][NFC] Move out some constants in
 OptimizeHloPostLayoutAssignment

This is a preparation for a later CL.

PiperOrigin-RevId: 567577255
---
 third_party/xla/xla/service/gpu/BUILD         |  1 +
 .../xla/xla/service/gpu/gpu_compiler.cc       | 98 +++++++++----------
 2 files changed, 48 insertions(+), 51 deletions(-)

diff --git a/third_party/xla/xla/service/gpu/BUILD b/third_party/xla/xla/service/gpu/BUILD
index a18d39d3e56812..f6ba7b004e3988 100644
--- a/third_party/xla/xla/service/gpu/BUILD
+++ b/third_party/xla/xla/service/gpu/BUILD
@@ -2677,6 +2677,7 @@ cc_library(
         "//xla/service:executable",
         "//xla/service:flatten_call_graph",
         "//xla/service:float_normalization",
+        "//xla/service:float_support",
         "//xla/service:gather_expander",
         "//xla/service:gather_simplifier",
         "//xla/service:hlo_computation_deduplicator",
diff --git a/third_party/xla/xla/service/gpu/gpu_compiler.cc b/third_party/xla/xla/service/gpu/gpu_compiler.cc
index 1d5c7c106906be..66dd3fd2ac57bc 100644
--- a/third_party/xla/xla/service/gpu/gpu_compiler.cc
+++ b/third_party/xla/xla/service/gpu/gpu_compiler.cc
@@ -84,10 +84,12 @@ limitations under the License.
 #include "xla/service/executable.h"
 #include "xla/service/flatten_call_graph.h"
 #include "xla/service/float_normalization.h"
+#include "xla/service/float_support.h"
 #include "xla/service/gather_expander.h"
 #include "xla/service/gather_simplifier.h"
 #include "xla/service/gpu/alias_passthrough_params.h"
 #include "xla/service/gpu/all_reduce_blueconnect.h"
+#include "xla/service/gpu/autotuner_util.h"
 #include "xla/service/gpu/compile_module_to_llvm_ir.h"
 #include "xla/service/gpu/conv_layout_normalization.h"
 #include "xla/service/gpu/copy_fusion.h"
@@ -897,22 +899,53 @@ Status GpuCompiler::OptimizeHloPostLayoutAssignment(
     const CompileOptions& options, const GpuTargetConfig& gpu_target_config,
     const AutotuneResults* autotune_results,
     tsl::thread::ThreadPool* thread_pool) {
+  // Constants:
   const DebugOptions& debug_options = hlo_module->config().debug_options();
+  const GpuVersion gpu_version =
+      gpu_target_config.gpu_device_info.compute_capability;
+  const se::CudaComputeCapability* const cuda_cc =
+      std::get_if<se::CudaComputeCapability>(&gpu_version);
+  const AlgebraicSimplifierOptions simplifier_options = [&] {
+    AlgebraicSimplifierOptions opts;
+    opts.set_supports_non_canonical_dots(false);
+    opts.set_is_layout_sensitive(true);
+    opts.set_enable_conv_operand_swap(false);
+    // "slow" minmax means we propagate nan.
+    opts.set_minmax_propagate_nan(!debug_options.xla_gpu_enable_fast_min_max());
+    opts.set_enable_unconditional_reduce_of_concat_replacement(false);
+    return opts;
+  }();
+  TF_ASSIGN_OR_RETURN(AutotuneConfig autotune_config,
+                      GetAutotuneConfig(stream_exec, debug_options, options,
+                                        gpu_target_config, autotune_results));
+  // Lambdas and related constants:
+  const GpuFloatSupport bf16_support(BF16);
+  const GpuFloatSupport f8e5m2_support(F8E5M2);
+  const GpuFloatSupport f8e4m3fn_support(F8E4M3FN);
+  const FloatSupport f8e4m3b11fnuz_support(F8E4M3B11FNUZ);
+  const FloatSupport f8e5m2fnuz_support(F8E5M2FNUZ);
+  const FloatSupport f8e4m3fnuz_support(F8E4M3FNUZ);
+  auto add_float_normalization = [&](HloPassPipeline& pipeline) {
+    auto& sub_pipeline =
+        pipeline.AddPass<HloPassPipeline>("float_normalization");
+    sub_pipeline.AddPass<FloatNormalization>(&bf16_support);
+    sub_pipeline.AddPass<FloatNormalization>(&f8e5m2_support);
+    sub_pipeline.AddPass<FloatNormalization>(&f8e4m3fn_support);
+    sub_pipeline.AddPass<FloatNormalization>(&f8e4m3b11fnuz_support);
+    sub_pipeline.AddPass<FloatNormalization>(&f8e5m2fnuz_support);
+    sub_pipeline.AddPass<FloatNormalization>(&f8e4m3fnuz_support);
+    // Remove `f32 -> bf16 -> f32` casts inserted by bf16 normalization.
+    if (debug_options.xla_gpu_simplify_all_fp_conversions()) {
+      sub_pipeline.AddPass<SimplifyFPConversions>();
+    }
+  };
 
   {
     HloPassPipeline pipeline("hlo normalization");
 
     // The LayoutAssignment pass may leave behind kCopy instructions which are
     // duplicate or NOPs, so remove them with algebraic simplification and CSE.
-    AlgebraicSimplifierOptions options;
-    options.set_supports_non_canonical_dots(false);
-    options.set_is_layout_sensitive(true);
-    options.set_enable_conv_operand_swap(false);
-    // "slow" minmax means we propagate nan.
-    options.set_minmax_propagate_nan(
-        !debug_options.xla_gpu_enable_fast_min_max());
-    options.set_enable_unconditional_reduce_of_concat_replacement(false);
-    pipeline.AddPass<HloPassFix<AlgebraicSimplifier>>(options);
+    pipeline.AddPass<HloPassFix<AlgebraicSimplifier>>(simplifier_options);
 
     // GemmRewriter assumes that all transposes are folded into gemms, but,
     // since commit 7d529df, this is not always true at this point.
@@ -927,9 +960,6 @@ Status GpuCompiler::OptimizeHloPostLayoutAssignment(
     pipeline.AddPass<HloPassFix<MoveCopyToUsers>>();
 
     // Rewrite GEMMs into custom calls.
-    GpuVersion gpu_version =
-        gpu_target_config.gpu_device_info.compute_capability;
-    const auto* cuda_cc = std::get_if<se::CudaComputeCapability>(&gpu_version);
     if (debug_options.xla_gpu_enable_triton_gemm() && cuda_cc != nullptr &&
         cuda_cc->IsAtLeast(se::CudaComputeCapability::VOLTA)) {
       pipeline.AddPass<GemmRewriterTriton>(gpu_version);
@@ -941,7 +971,7 @@ Status GpuCompiler::OptimizeHloPostLayoutAssignment(
 
     if (debug_options.xla_gpu_normalize_layouts()) {
       pipeline.AddPass<LayoutNormalization>(&NormalizeLayoutForGpuCustomCalls);
-      pipeline.AddPass<HloPassFix<AlgebraicSimplifier>>(options);
+      pipeline.AddPass<HloPassFix<AlgebraicSimplifier>>(simplifier_options);
     }
     pipeline.AddPass<BroadcastCanonicalizer>();
 
@@ -954,7 +984,7 @@ Status GpuCompiler::OptimizeHloPostLayoutAssignment(
     if (debug_options.xla_gpu_enable_triton_softmax_fusion() &&
         cuda_cc != nullptr &&
         cuda_cc->IsAtLeast(se::CudaComputeCapability::VOLTA)) {
-      pipeline.AddPass<HloPassFix<AlgebraicSimplifier>>(options);
+      pipeline.AddPass<HloPassFix<AlgebraicSimplifier>>(simplifier_options);
       pipeline.AddPass<SoftmaxRewriterTriton>(gpu_version);
     }
 
@@ -981,34 +1011,10 @@ Status GpuCompiler::OptimizeHloPostLayoutAssignment(
         return RequiresCollectiveScheduleLinearizer(module, stream_exec);
       });
 
-  GpuFloatSupport bf16_support(BF16);
-  GpuFloatSupport f8e5m2_support(F8E5M2);
-  GpuFloatSupport f8e4m3fn_support(F8E4M3FN);
-  FloatSupport f8e4m3b11fnuz_support(F8E4M3B11FNUZ);
-  FloatSupport f8e5m2fnuz_support(F8E5M2FNUZ);
-  FloatSupport f8e4m3fnuz_support(F8E4M3FNUZ);
-
-  auto add_float_normalization = [&](HloPassPipeline& pipeline) {
-    auto& sub_pipeline =
-        pipeline.AddPass<HloPassPipeline>("float_normalization");
-    sub_pipeline.AddPass<FloatNormalization>(&bf16_support);
-    sub_pipeline.AddPass<FloatNormalization>(&f8e5m2_support);
-    sub_pipeline.AddPass<FloatNormalization>(&f8e4m3fn_support);
-    sub_pipeline.AddPass<FloatNormalization>(&f8e4m3b11fnuz_support);
-    sub_pipeline.AddPass<FloatNormalization>(&f8e5m2fnuz_support);
-    sub_pipeline.AddPass<FloatNormalization>(&f8e4m3fnuz_support);
-    // Remove `f32 -> bf16 -> f32` casts inserted by bf16 normalization.
-    if (debug_options.xla_gpu_simplify_all_fp_conversions()) {
-      sub_pipeline.AddPass<SimplifyFPConversions>();
-    }
-  };
   // Triton compilation needs normalized operations on bf16 (i.e. converted to
   // f32).
   add_float_normalization(pipeline);
 
-  TF_ASSIGN_OR_RETURN(AutotuneConfig autotune_config,
-                      GetAutotuneConfig(stream_exec, debug_options, options,
-                                        gpu_target_config, autotune_results));
   TF_RETURN_IF_ERROR(
       AddAutotuningPasses(&pipeline, hlo_module, autotune_config, thread_pool));
 
@@ -1019,19 +1025,9 @@ Status GpuCompiler::OptimizeHloPostLayoutAssignment(
   // Clean up new_tuple described above.
   pipeline.AddPass<TupleSimplifier>();
 
-  {
-    // The LayoutAssignment pass may leave behind kCopy instructions which are
-    // duplicate or NOPs, so remove them with algebraic simplification and CSE.
-    AlgebraicSimplifierOptions options;
-    options.set_supports_non_canonical_dots(false);
-    options.set_is_layout_sensitive(true);
-    options.set_enable_conv_operand_swap(false);
-    // "slow" minmax means we propagate nan.
-    options.set_minmax_propagate_nan(
-        !hlo_module->config().debug_options().xla_gpu_enable_fast_min_max());
-    options.set_enable_unconditional_reduce_of_concat_replacement(false);
-    pipeline.AddPass<HloPassFix<AlgebraicSimplifier>>(options);
-  }
+  // The LayoutAssignment pass may leave behind kCopy instructions which are
+  // duplicate or NOPs, so remove them with algebraic simplification and CSE.
+  pipeline.AddPass<HloPassFix<AlgebraicSimplifier>>(simplifier_options);
 
   // Since this CSE runs after collective schedule linearizer which inserts
   // control dependencies, ignore these control deps when replacing instructions

From c51351f28b39b3c74946127cb96d501d6d4a8169 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 22 Sep 2023 03:48:13 -0700
Subject: [PATCH 135/567] Integrate LLVM at llvm/llvm-project@3b0f812b9af4

Updates LLVM usage to match
[3b0f812b9af4](https://github.com/llvm/llvm-project/commit/3b0f812b9af4)

PiperOrigin-RevId: 567577565
---
 third_party/llvm/generated.patch | 1717 +-----------------------------
 third_party/llvm/workspace.bzl   |    4 +-
 2 files changed, 53 insertions(+), 1668 deletions(-)

diff --git a/third_party/llvm/generated.patch b/third_party/llvm/generated.patch
index 450540fcebd5c7..409e0541024c8e 100644
--- a/third_party/llvm/generated.patch
+++ b/third_party/llvm/generated.patch
@@ -1,1669 +1,54 @@
 Auto generated patch. Do not edit or delete it, even if empty.
-diff -ruN --strip-trailing-cr a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
---- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
-+++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
-@@ -271,10 +271,7 @@
-   bool tryToSimplifyUncondBranchWithICmpInIt(ICmpInst *ICI,
-                                              IRBuilder<> &Builder);
- 
--  bool hoistCommonCodeFromSuccessors(BasicBlock *BB, bool EqTermsOnly);
--  bool hoistSuccIdenticalTerminatorToSwitchOrIf(
--      Instruction *TI, Instruction *I1,
--      SmallVectorImpl<Instruction *> &OtherSuccTIs);
-+  bool HoistThenElseCodeToIf(BranchInst *BI, bool EqTermsOnly);
-   bool SpeculativelyExecuteBB(BranchInst *BI, BasicBlock *ThenBB);
-   bool SimplifyTerminatorOnSelect(Instruction *OldTerm, Value *Cond,
-                                   BasicBlock *TrueBB, BasicBlock *FalseBB,
-@@ -1411,9 +1408,8 @@
- }
- 
- // If we would need to insert a select that uses the value of this invoke
--// (comments in hoistSuccIdenticalTerminatorToSwitchOrIf explain why we would
--// need to do this), we can't hoist the invoke, as there is nowhere to put the
--// select in this case.
-+// (comments in HoistThenElseCodeToIf explain why we would need to do this), we
-+// can't hoist the invoke, as there is nowhere to put the select in this case.
- static bool isSafeToHoistInvoke(BasicBlock *BB1, BasicBlock *BB2,
-                                 Instruction *I1, Instruction *I2) {
-   for (BasicBlock *Succ : successors(BB1)) {
-@@ -1428,9 +1424,9 @@
-   return true;
- }
- 
--// Get interesting characteristics of instructions that
--// `hoistCommonCodeFromSuccessors` didn't hoist. They restrict what kind of
--// instructions can be reordered across.
-+// Get interesting characteristics of instructions that `HoistThenElseCodeToIf`
-+// didn't hoist. They restrict what kind of instructions can be reordered
-+// across.
- enum SkipFlags {
-   SkipReadMem = 1,
-   SkipSideEffect = 2,
-@@ -1488,7 +1484,7 @@
- 
- static bool passingValueIsAlwaysUndefined(Value *V, Instruction *I, bool PtrValueMayBeModified = false);
- 
--/// Helper function for hoistCommonCodeFromSuccessors. Return true if identical
-+/// Helper function for HoistThenElseCodeToIf. Return true if identical
- /// instructions \p I1 and \p I2 can and should be hoisted.
- static bool shouldHoistCommonInstructions(Instruction *I1, Instruction *I2,
-                                           const TargetTransformInfo &TTI) {
-@@ -1519,51 +1515,62 @@
-   return true;
- }
- 
--/// Hoist any common code in the successor blocks up into the block. This
--/// function guarantees that BB dominates all successors. If EqTermsOnly is
--/// given, only perform hoisting in case both blocks only contain a terminator.
--/// In that case, only the original BI will be replaced and selects for PHIs are
--/// added.
--bool SimplifyCFGOpt::hoistCommonCodeFromSuccessors(BasicBlock *BB,
--                                                   bool EqTermsOnly) {
-+/// Given a conditional branch that goes to BB1 and BB2, hoist any common code
-+/// in the two blocks up into the branch block. The caller of this function
-+/// guarantees that BI's block dominates BB1 and BB2. If EqTermsOnly is given,
-+/// only perform hoisting in case both blocks only contain a terminator. In that
-+/// case, only the original BI will be replaced and selects for PHIs are added.
-+bool SimplifyCFGOpt::HoistThenElseCodeToIf(BranchInst *BI, bool EqTermsOnly) {
-   // This does very trivial matching, with limited scanning, to find identical
--  // instructions in the two blocks. In particular, we don't want to get into
--  // O(N1*N2*...) situations here where Ni are the sizes of these successors. As
-+  // instructions in the two blocks.  In particular, we don't want to get into
-+  // O(M*N) situations here where M and N are the sizes of BB1 and BB2.  As
-   // such, we currently just scan for obviously identical instructions in an
-   // identical order, possibly separated by the same number of non-identical
-   // instructions.
--  unsigned int SuccSize = succ_size(BB);
--  if (SuccSize < 2)
--    return false;
-+  BasicBlock *BB1 = BI->getSuccessor(0); // The true destination.
-+  BasicBlock *BB2 = BI->getSuccessor(1); // The false destination
- 
-   // If either of the blocks has it's address taken, then we can't do this fold,
-   // because the code we'd hoist would no longer run when we jump into the block
-   // by it's address.
--  for (auto *Succ : successors(BB))
--    if (Succ->hasAddressTaken() || !Succ->getSinglePredecessor())
--      return false;
-+  if (BB1->hasAddressTaken() || BB2->hasAddressTaken())
-+    return false;
- 
--  auto *TI = BB->getTerminator();
-+  BasicBlock::iterator BB1_Itr = BB1->begin();
-+  BasicBlock::iterator BB2_Itr = BB2->begin();
- 
--  // The second of pair is a SkipFlags bitmask.
--  using SuccIterPair = std::pair<BasicBlock::iterator, unsigned>;
--  SmallVector<SuccIterPair, 8> SuccIterPairs;
--  for (auto *Succ : successors(BB)) {
--    BasicBlock::iterator SuccItr = Succ->begin();
--    if (isa<PHINode>(*SuccItr))
--      return false;
--    SuccIterPairs.push_back(SuccIterPair(SuccItr, 0));
-+  Instruction *I1 = &*BB1_Itr++, *I2 = &*BB2_Itr++;
-+  // Skip debug info if it is not identical.
-+  DbgInfoIntrinsic *DBI1 = dyn_cast<DbgInfoIntrinsic>(I1);
-+  DbgInfoIntrinsic *DBI2 = dyn_cast<DbgInfoIntrinsic>(I2);
-+  if (!DBI1 || !DBI2 || !DBI1->isIdenticalToWhenDefined(DBI2)) {
-+    while (isa<DbgInfoIntrinsic>(I1))
-+      I1 = &*BB1_Itr++;
-+    while (isa<DbgInfoIntrinsic>(I2))
-+      I2 = &*BB2_Itr++;
-   }
-+  if (isa<PHINode>(I1))
-+    return false;
-+
-+  BasicBlock *BIParent = BI->getParent();
-+
-+  bool Changed = false;
-+
-+  auto _ = make_scope_exit([&]() {
-+    if (Changed)
-+      ++NumHoistCommonCode;
-+  });
- 
-   // Check if only hoisting terminators is allowed. This does not add new
-   // instructions to the hoist location.
-   if (EqTermsOnly) {
-     // Skip any debug intrinsics, as they are free to hoist.
--    for (auto &SuccIter : make_first_range(SuccIterPairs)) {
--      auto *INonDbg = &*skipDebugIntrinsics(SuccIter);
--      if (!INonDbg->isTerminator())
--        return false;
--    }
-+    auto *I1NonDbg = &*skipDebugIntrinsics(I1->getIterator());
-+    auto *I2NonDbg = &*skipDebugIntrinsics(I2->getIterator());
-+    if (!I1NonDbg->isIdenticalToWhenDefined(I2NonDbg))
-+      return false;
-+    if (!I1NonDbg->isTerminator())
-+      return false;
-     // Now we know that we only need to hoist debug intrinsics and the
-     // terminator. Let the loop below handle those 2 cases.
-   }
-@@ -1572,234 +1579,154 @@
-   // many instructions we skip, serving as a compilation time control as well as
-   // preventing excessive increase of life ranges.
-   unsigned NumSkipped = 0;
--  // If we find an unreachable instruction at the beginning of a basic block, we
--  // can still hoist instructions from the rest of the basic blocks.
--  if (SuccIterPairs.size() > 2) {
--    erase_if(SuccIterPairs,
--             [](const auto &Pair) { return isa<UnreachableInst>(Pair.first); });
--    if (SuccIterPairs.size() < 2)
--      return false;
--  }
- 
--  bool Changed = false;
-+  // Record any skipped instuctions that may read memory, write memory or have
-+  // side effects, or have implicit control flow.
-+  unsigned SkipFlagsBB1 = 0;
-+  unsigned SkipFlagsBB2 = 0;
- 
+diff -ruN --strip-trailing-cr a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
++++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+@@ -5700,7 +5700,7 @@
+     if (OpOpcode == ISD::TRUNCATE) {
+       SDValue OpOp = N1.getOperand(0);
+       if (OpOp.getValueType() == VT) {
+-        if (OpOp.getOpcode() == ISD::AssertZext) {
++        if (OpOp.getOpcode() == ISD::AssertZext && N1->hasOneUse()) {
+           APInt HiBits = APInt::getBitsSetFrom(VT.getScalarSizeInBits(),
+                                                N1.getScalarValueSizeInBits());
+           if (MaskedValueIsZero(OpOp, HiBits)) {
+diff -ruN --strip-trailing-cr a/llvm/lib/Target/X86/X86ISelLoweringCall.cpp b/llvm/lib/Target/X86/X86ISelLoweringCall.cpp
+--- a/llvm/lib/Target/X86/X86ISelLoweringCall.cpp
++++ b/llvm/lib/Target/X86/X86ISelLoweringCall.cpp
+@@ -2645,8 +2645,7 @@
    for (;;) {
--    auto *SuccIterPairBegin = SuccIterPairs.begin();
--    auto &BB1ItrPair = *SuccIterPairBegin++;
--    auto OtherSuccIterPairRange =
--        iterator_range(SuccIterPairBegin, SuccIterPairs.end());
--    auto OtherSuccIterRange = make_first_range(OtherSuccIterPairRange);
--
--    Instruction *I1 = &*BB1ItrPair.first;
--    auto *BB1 = I1->getParent();
--
--    // Skip debug info if it is not identical.
--    bool AllDbgInstsAreIdentical = all_of(OtherSuccIterRange, [I1](auto &Iter) {
--      Instruction *I2 = &*Iter;
--      return I1->isIdenticalToWhenDefined(I2);
--    });
--    if (!AllDbgInstsAreIdentical) {
--      while (isa<DbgInfoIntrinsic>(I1))
--        I1 = &*++BB1ItrPair.first;
--      for (auto &SuccIter : OtherSuccIterRange) {
--        Instruction *I2 = &*SuccIter;
--        while (isa<DbgInfoIntrinsic>(I2))
--          I2 = &*++SuccIter;
--      }
--    }
--
--    bool AllInstsAreIdentical = true;
--    bool HasTerminator = I1->isTerminator();
--    for (auto &SuccIter : OtherSuccIterRange) {
--      Instruction *I2 = &*SuccIter;
--      HasTerminator |= I2->isTerminator();
--      if (AllInstsAreIdentical && !I1->isIdenticalToWhenDefined(I2))
--        AllInstsAreIdentical = false;
--    }
--
-     // If we are hoisting the terminator instruction, don't move one (making a
-     // broken BB), instead clone it, and remove BI.
--    if (HasTerminator) {
-+    if (I1->isTerminator() || I2->isTerminator()) {
-       // If any instructions remain in the block, we cannot hoist terminators.
--      if (NumSkipped || SuccSize != SuccIterPairs.size() ||
--          !AllInstsAreIdentical)
-+      if (NumSkipped || !I1->isIdenticalToWhenDefined(I2))
-         return Changed;
--      SmallVector<Instruction *, 8> Insts;
--      for (auto &SuccIter : OtherSuccIterRange)
--        Insts.push_back(&*SuccIter);
--      return hoistSuccIdenticalTerminatorToSwitchOrIf(TI, I1, Insts) || Changed;
--    }
--
--    if (AllInstsAreIdentical) {
--      unsigned SkipFlagsBB1 = BB1ItrPair.second;
--      AllInstsAreIdentical =
--          isSafeToHoistInstr(I1, SkipFlagsBB1) &&
--          all_of(OtherSuccIterPairRange, [=](const auto &Pair) {
--            Instruction *I2 = &*Pair.first;
--            unsigned SkipFlagsBB2 = Pair.second;
--            // Even if the instructions are identical, it may not
--            // be safe to hoist them if we have skipped over
--            // instructions with side effects or their operands
--            // weren't hoisted.
--            return isSafeToHoistInstr(I2, SkipFlagsBB2) &&
--                   shouldHoistCommonInstructions(I1, I2, TTI);
--          });
--    }
--
--    if (AllInstsAreIdentical) {
--      BB1ItrPair.first++;
--      if (isa<DbgInfoIntrinsic>(I1)) {
-+      goto HoistTerminator;
-+    }
-+
-+    if (I1->isIdenticalToWhenDefined(I2) &&
-+        // Even if the instructions are identical, it may not be safe to hoist
-+        // them if we have skipped over instructions with side effects or their
-+        // operands weren't hoisted.
-+        isSafeToHoistInstr(I1, SkipFlagsBB1) &&
-+        isSafeToHoistInstr(I2, SkipFlagsBB2) &&
-+        shouldHoistCommonInstructions(I1, I2, TTI)) {
-+      if (isa<DbgInfoIntrinsic>(I1) || isa<DbgInfoIntrinsic>(I2)) {
-+        assert(isa<DbgInfoIntrinsic>(I1) && isa<DbgInfoIntrinsic>(I2));
-         // The debug location is an integral part of a debug info intrinsic
-         // and can't be separated from it or replaced.  Instead of attempting
-         // to merge locations, simply hoist both copies of the intrinsic.
--        I1->moveBeforePreserving(TI);
--        for (auto &SuccIter : OtherSuccIterRange) {
--          auto *I2 = &*SuccIter++;
--          assert(isa<DbgInfoIntrinsic>(I2));
--          I2->moveBeforePreserving(TI);
--        }
-+        I1->moveBeforePreserving(BI);
-+        I2->moveBeforePreserving(BI);
-+        Changed = true;
-       } else {
-         // For a normal instruction, we just move one to right before the
-         // branch, then replace all uses of the other with the first.  Finally,
-         // we remove the now redundant second instruction.
--        I1->moveBeforePreserving(TI);
--        BB->splice(TI->getIterator(), BB1, I1->getIterator());
--        for (auto &SuccIter : OtherSuccIterRange) {
--          Instruction *I2 = &*SuccIter++;
--          assert(I2 != I1);
--          if (!I2->use_empty())
--            I2->replaceAllUsesWith(I1);
--          I1->andIRFlags(I2);
--          combineMetadataForCSE(I1, I2, true);
--          // I1 and I2 are being combined into a single instruction.  Its debug
--          // location is the merged locations of the original instructions.
--          I1->applyMergedLocation(I1->getDebugLoc(), I2->getDebugLoc());
--          I2->eraseFromParent();
--        }
-+        I1->moveBeforePreserving(BI);
-+        if (!I2->use_empty())
-+          I2->replaceAllUsesWith(I1);
-+        I1->andIRFlags(I2);
-+        combineMetadataForCSE(I1, I2, true);
-+
-+        // I1 and I2 are being combined into a single instruction.  Its debug
-+        // location is the merged locations of the original instructions.
-+        I1->applyMergedLocation(I1->getDebugLoc(), I2->getDebugLoc());
-+
-+        I2->eraseFromParent();
-       }
--      if (!Changed)
--        NumHoistCommonCode += SuccIterPairs.size();
-       Changed = true;
--      NumHoistCommonInstrs += SuccIterPairs.size();
-+      ++NumHoistCommonInstrs;
-     } else {
-       if (NumSkipped >= HoistCommonSkipLimit)
-         return Changed;
-       // We are about to skip over a pair of non-identical instructions. Record
-       // if any have characteristics that would prevent reordering instructions
-       // across them.
--      for (auto &SuccIterPair : SuccIterPairs) {
--        Instruction *I = &*SuccIterPair.first++;
--        SuccIterPair.second |= skippedInstrFlags(I);
--      }
-+      SkipFlagsBB1 |= skippedInstrFlags(I1);
-+      SkipFlagsBB2 |= skippedInstrFlags(I2);
-       ++NumSkipped;
-     }
--  }
--}
--
--bool SimplifyCFGOpt::hoistSuccIdenticalTerminatorToSwitchOrIf(
--    Instruction *TI, Instruction *I1,
--    SmallVectorImpl<Instruction *> &OtherSuccTIs) {
--
--  auto *BI = dyn_cast<BranchInst>(TI);
--
--  bool Changed = false;
--  BasicBlock *TIParent = TI->getParent();
--  BasicBlock *BB1 = I1->getParent();
- 
--  // Use only for an if statement.
--  auto *I2 = *OtherSuccTIs.begin();
--  auto *BB2 = I2->getParent();
--  if (BI) {
--    assert(OtherSuccTIs.size() == 1);
--    assert(BI->getSuccessor(0) == I1->getParent());
--    assert(BI->getSuccessor(1) == I2->getParent());
-+    I1 = &*BB1_Itr++;
-+    I2 = &*BB2_Itr++;
-+    // Skip debug info if it is not identical.
-+    DbgInfoIntrinsic *DBI1 = dyn_cast<DbgInfoIntrinsic>(I1);
-+    DbgInfoIntrinsic *DBI2 = dyn_cast<DbgInfoIntrinsic>(I2);
-+    if (!DBI1 || !DBI2 || !DBI1->isIdenticalToWhenDefined(DBI2)) {
-+      while (isa<DbgInfoIntrinsic>(I1))
-+        I1 = &*BB1_Itr++;
-+      while (isa<DbgInfoIntrinsic>(I2))
-+        I2 = &*BB2_Itr++;
-+    }
-   }
- 
--  // In the case of an if statement, we try to hoist an invoke.
-+  return Changed;
-+
-+HoistTerminator:
-+  // It may not be possible to hoist an invoke.
-   // FIXME: Can we define a safety predicate for CallBr?
--  // FIXME: Test case llvm/test/Transforms/SimplifyCFG/2009-06-15-InvokeCrash.ll
--  // removed in 4c923b3b3fd0ac1edebf0603265ca3ba51724937 commit?
--  if (isa<InvokeInst>(I1) && (!BI || !isSafeToHoistInvoke(BB1, BB2, I1, I2)))
--    return false;
-+  if (isa<InvokeInst>(I1) && !isSafeToHoistInvoke(BB1, BB2, I1, I2))
-+    return Changed;
- 
-   // TODO: callbr hoisting currently disabled pending further study.
-   if (isa<CallBrInst>(I1))
--    return false;
-+    return Changed;
- 
-   for (BasicBlock *Succ : successors(BB1)) {
-     for (PHINode &PN : Succ->phis()) {
-       Value *BB1V = PN.getIncomingValueForBlock(BB1);
--      for (Instruction *OtherSuccTI : OtherSuccTIs) {
--        Value *BB2V = PN.getIncomingValueForBlock(OtherSuccTI->getParent());
--        if (BB1V == BB2V)
--          continue;
-+      Value *BB2V = PN.getIncomingValueForBlock(BB2);
-+      if (BB1V == BB2V)
-+        continue;
- 
--        // In the case of an if statement, check for
--        // passingValueIsAlwaysUndefined here because we would rather eliminate
--        // undefined control flow then converting it to a select.
--        if (!BI || passingValueIsAlwaysUndefined(BB1V, &PN) ||
--            passingValueIsAlwaysUndefined(BB2V, &PN))
--          return false;
--      }
-+      // Check for passingValueIsAlwaysUndefined here because we would rather
-+      // eliminate undefined control flow then converting it to a select.
-+      if (passingValueIsAlwaysUndefined(BB1V, &PN) ||
-+          passingValueIsAlwaysUndefined(BB2V, &PN))
-+        return Changed;
-     }
-   }
- 
-   // Okay, it is safe to hoist the terminator.
-   Instruction *NT = I1->clone();
--  NT->insertInto(TIParent, TI->getIterator());
-+  NT->insertInto(BIParent, BI->getIterator());
-   if (!NT->getType()->isVoidTy()) {
-     I1->replaceAllUsesWith(NT);
--    for (Instruction *OtherSuccTI : OtherSuccTIs)
--      OtherSuccTI->replaceAllUsesWith(NT);
-+    I2->replaceAllUsesWith(NT);
-     NT->takeName(I1);
-   }
-   Changed = true;
--  NumHoistCommonInstrs += OtherSuccTIs.size() + 1;
-+  ++NumHoistCommonInstrs;
- 
-   // Ensure terminator gets a debug location, even an unknown one, in case
-   // it involves inlinable calls.
--  SmallVector<DILocation *, 4> Locs;
--  Locs.push_back(I1->getDebugLoc());
--  for (auto *OtherSuccTI : OtherSuccTIs)
--    Locs.push_back(OtherSuccTI->getDebugLoc());
--  NT->setDebugLoc(DILocation::getMergedLocations(Locs));
-+  NT->applyMergedLocation(I1->getDebugLoc(), I2->getDebugLoc());
- 
-   // PHIs created below will adopt NT's merged DebugLoc.
-   IRBuilder<NoFolder> Builder(NT);
- 
--  // In the case of an if statement, hoisting one of the terminators from our
--  // successor is a great thing. Unfortunately, the successors of the if/else
--  // blocks may have PHI nodes in them.  If they do, all PHI entries for BB1/BB2
--  // must agree for all PHI nodes, so we insert select instruction to compute
--  // the final result.
--  if (BI) {
--    std::map<std::pair<Value *, Value *>, SelectInst *> InsertedSelects;
--    for (BasicBlock *Succ : successors(BB1)) {
--      for (PHINode &PN : Succ->phis()) {
--        Value *BB1V = PN.getIncomingValueForBlock(BB1);
--        Value *BB2V = PN.getIncomingValueForBlock(BB2);
--        if (BB1V == BB2V)
--          continue;
--
--        // These values do not agree.  Insert a select instruction before NT
--        // that determines the right value.
--        SelectInst *&SI = InsertedSelects[std::make_pair(BB1V, BB2V)];
--        if (!SI) {
--          // Propagate fast-math-flags from phi node to its replacement select.
--          IRBuilder<>::FastMathFlagGuard FMFGuard(Builder);
--          if (isa<FPMathOperator>(PN))
--            Builder.setFastMathFlags(PN.getFastMathFlags());
--
--          SI = cast<SelectInst>(Builder.CreateSelect(
--              BI->getCondition(), BB1V, BB2V,
--              BB1V->getName() + "." + BB2V->getName(), BI));
--        }
-+  // Hoisting one of the terminators from our successor is a great thing.
-+  // Unfortunately, the successors of the if/else blocks may have PHI nodes in
-+  // them.  If they do, all PHI entries for BB1/BB2 must agree for all PHI
-+  // nodes, so we insert select instruction to compute the final result.
-+  std::map<std::pair<Value *, Value *>, SelectInst *> InsertedSelects;
-+  for (BasicBlock *Succ : successors(BB1)) {
-+    for (PHINode &PN : Succ->phis()) {
-+      Value *BB1V = PN.getIncomingValueForBlock(BB1);
-+      Value *BB2V = PN.getIncomingValueForBlock(BB2);
-+      if (BB1V == BB2V)
-+        continue;
- 
--        // Make the PHI node use the select for all incoming values for BB1/BB2
--        for (unsigned i = 0, e = PN.getNumIncomingValues(); i != e; ++i)
--          if (PN.getIncomingBlock(i) == BB1 || PN.getIncomingBlock(i) == BB2)
--            PN.setIncomingValue(i, SI);
--      }
-+      // These values do not agree.  Insert a select instruction before NT
-+      // that determines the right value.
-+      SelectInst *&SI = InsertedSelects[std::make_pair(BB1V, BB2V)];
-+      if (!SI) {
-+        // Propagate fast-math-flags from phi node to its replacement select.
-+        IRBuilder<>::FastMathFlagGuard FMFGuard(Builder);
-+        if (isa<FPMathOperator>(PN))
-+          Builder.setFastMathFlags(PN.getFastMathFlags());
-+
-+        SI = cast<SelectInst>(
-+            Builder.CreateSelect(BI->getCondition(), BB1V, BB2V,
-+                                 BB1V->getName() + "." + BB2V->getName(), BI));
-+      }
-+
-+      // Make the PHI node use the select for all incoming values for BB1/BB2
-+      for (unsigned i = 0, e = PN.getNumIncomingValues(); i != e; ++i)
-+        if (PN.getIncomingBlock(i) == BB1 || PN.getIncomingBlock(i) == BB2)
-+          PN.setIncomingValue(i, SI);
-     }
-   }
- 
-@@ -1807,16 +1734,16 @@
- 
-   // Update any PHI nodes in our new successors.
-   for (BasicBlock *Succ : successors(BB1)) {
--    AddPredecessorToBlock(Succ, TIParent, BB1);
-+    AddPredecessorToBlock(Succ, BIParent, BB1);
-     if (DTU)
--      Updates.push_back({DominatorTree::Insert, TIParent, Succ});
-+      Updates.push_back({DominatorTree::Insert, BIParent, Succ});
-   }
- 
-   if (DTU)
--    for (BasicBlock *Succ : successors(TI))
--      Updates.push_back({DominatorTree::Delete, TIParent, Succ});
-+    for (BasicBlock *Succ : successors(BI))
-+      Updates.push_back({DominatorTree::Delete, BIParent, Succ});
- 
--  EraseTerminatorAndDCECond(TI);
-+  EraseTerminatorAndDCECond(BI);
-   if (DTU)
-     DTU->applyUpdates(Updates);
-   return Changed;
-@@ -2850,8 +2777,8 @@
-     Value *OrigV = PN.getIncomingValueForBlock(BB);
-     Value *ThenV = PN.getIncomingValueForBlock(ThenBB);
- 
--    // FIXME: Try to remove some of the duplication with
--    // hoistCommonCodeFromSuccessors. Skip PHIs which are trivial.
-+    // FIXME: Try to remove some of the duplication with HoistThenElseCodeToIf.
-+    // Skip PHIs which are trivial.
-     if (ThenV == OrigV)
+     // Look through nodes that don't alter the bits of the incoming value.
+     unsigned Op = Arg.getOpcode();
+-    if (Op == ISD::ZERO_EXTEND || Op == ISD::ANY_EXTEND || Op == ISD::BITCAST ||
+-        Op == ISD::AssertZext) {
++    if (Op == ISD::ZERO_EXTEND || Op == ISD::ANY_EXTEND || Op == ISD::BITCAST) {
+       Arg = Arg.getOperand(0);
        continue;
- 
-@@ -6888,10 +6815,6 @@
-   if (ReduceSwitchRange(SI, Builder, DL, TTI))
-     return requestResimplify();
- 
--  if (HoistCommon &&
--      hoistCommonCodeFromSuccessors(SI->getParent(), !Options.HoistCommonInsts))
--    return requestResimplify();
--
-   return false;
- }
- 
-@@ -7158,8 +7081,7 @@
-   // can hoist it up to the branching block.
-   if (BI->getSuccessor(0)->getSinglePredecessor()) {
-     if (BI->getSuccessor(1)->getSinglePredecessor()) {
--      if (HoistCommon && hoistCommonCodeFromSuccessors(
--                             BI->getParent(), !Options.HoistCommonInsts))
-+      if (HoistCommon && HoistThenElseCodeToIf(BI, !Options.HoistCommonInsts))
-         return requestResimplify();
-     } else {
-       // If Successor #1 has multiple preds, we may be able to conditionally
-diff -ruN --strip-trailing-cr a/llvm/test/CodeGen/AArch64/patchable-function-entry-bti.ll b/llvm/test/CodeGen/AArch64/patchable-function-entry-bti.ll
---- a/llvm/test/CodeGen/AArch64/patchable-function-entry-bti.ll
-+++ b/llvm/test/CodeGen/AArch64/patchable-function-entry-bti.ll
-@@ -70,7 +70,7 @@
-     i64 4, label %sw.bb4
-   ]
- sw.bb0:
--  call void asm sideeffect "nop", ""()
-+  call void asm sideeffect "", ""()
-   ret void
- sw.bb1:
-   call void asm sideeffect "", ""()
-diff -ruN --strip-trailing-cr a/llvm/test/Transforms/SimplifyCFG/HoistCode.ll b/llvm/test/Transforms/SimplifyCFG/HoistCode.ll
---- a/llvm/test/Transforms/SimplifyCFG/HoistCode.ll
-+++ b/llvm/test/Transforms/SimplifyCFG/HoistCode.ll
-@@ -19,9 +19,21 @@
- 
- define void @foo_switch(i64 %C, ptr %P) {
- ; CHECK-LABEL: @foo_switch(
--; CHECK-NEXT:  common.ret:
--; CHECK-NEXT:    store i32 7, ptr [[P:%.*]], align 4
-+; CHECK-NEXT:    switch i64 [[C:%.*]], label [[BB0:%.*]] [
-+; CHECK-NEXT:    i64 1, label [[BB1:%.*]]
-+; CHECK-NEXT:    i64 2, label [[BB2:%.*]]
-+; CHECK-NEXT:    ]
-+; CHECK:       common.ret:
- ; CHECK-NEXT:    ret void
-+; CHECK:       bb0:
-+; CHECK-NEXT:    store i32 7, ptr [[P:%.*]], align 4
-+; CHECK-NEXT:    br label [[COMMON_RET:%.*]]
-+; CHECK:       bb1:
-+; CHECK-NEXT:    store i32 7, ptr [[P]], align 4
-+; CHECK-NEXT:    br label [[COMMON_RET]]
-+; CHECK:       bb2:
-+; CHECK-NEXT:    store i32 7, ptr [[P]], align 4
-+; CHECK-NEXT:    br label [[COMMON_RET]]
- ;
-   switch i64 %C, label %bb0 [
-   i64 1, label %bb1
-diff -ruN --strip-trailing-cr a/llvm/test/Transforms/SimplifyCFG/hoist-common-code.ll b/llvm/test/Transforms/SimplifyCFG/hoist-common-code.ll
---- a/llvm/test/Transforms/SimplifyCFG/hoist-common-code.ll
-+++ b/llvm/test/Transforms/SimplifyCFG/hoist-common-code.ll
-@@ -26,11 +26,27 @@
- 
- define void @test_switch(i64 %i, ptr %Q) {
- ; CHECK-LABEL: @test_switch(
--; CHECK-NEXT:  common.ret:
-+; CHECK-NEXT:    switch i64 [[I:%.*]], label [[BB0:%.*]] [
-+; CHECK-NEXT:    i64 1, label [[BB1:%.*]]
-+; CHECK-NEXT:    i64 2, label [[BB2:%.*]]
-+; CHECK-NEXT:    ]
-+; CHECK:       common.ret:
-+; CHECK-NEXT:    ret void
-+; CHECK:       bb0:
- ; CHECK-NEXT:    store i32 1, ptr [[Q:%.*]], align 4
- ; CHECK-NEXT:    [[A:%.*]] = load i32, ptr [[Q]], align 4
- ; CHECK-NEXT:    call void @bar(i32 [[A]])
--; CHECK-NEXT:    ret void
-+; CHECK-NEXT:    br label [[COMMON_RET:%.*]]
-+; CHECK:       bb1:
-+; CHECK-NEXT:    store i32 1, ptr [[Q]], align 4
-+; CHECK-NEXT:    [[B:%.*]] = load i32, ptr [[Q]], align 4
-+; CHECK-NEXT:    call void @bar(i32 [[B]])
-+; CHECK-NEXT:    br label [[COMMON_RET]]
-+; CHECK:       bb2:
-+; CHECK-NEXT:    store i32 1, ptr [[Q]], align 4
-+; CHECK-NEXT:    [[C:%.*]] = load i32, ptr [[Q]], align 4
-+; CHECK-NEXT:    call void @bar(i32 [[C]])
-+; CHECK-NEXT:    br label [[COMMON_RET]]
- ;
-   switch i64 %i, label %bb0 [
-   i64 1, label %bb1
-@@ -53,41 +69,25 @@
-   ret void
- }
- 
--; We ensure that we examine all instructions during each iteration to confirm the presence of a terminating one.
--define void @test_switch_reach_terminator(i64 %i, ptr %p) {
--; CHECK-LABEL: @test_switch_reach_terminator(
--; CHECK-NEXT:    switch i64 [[I:%.*]], label [[BB0:%.*]] [
--; CHECK-NEXT:    i64 1, label [[BB1:%.*]]
--; CHECK-NEXT:    i64 2, label [[COMMON_RET:%.*]]
--; CHECK-NEXT:    ]
--; CHECK:       common.ret:
--; CHECK-NEXT:    ret void
--; CHECK:       bb0:
--; CHECK-NEXT:    store i32 1, ptr [[P:%.*]], align 4
--; CHECK-NEXT:    br label [[COMMON_RET]]
--; CHECK:       bb1:
--; CHECK-NEXT:    store i32 2, ptr [[P]], align 4
--; CHECK-NEXT:    br label [[COMMON_RET]]
--;
--  switch i64 %i, label %bb0 [
--  i64 1, label %bb1
--  i64 2, label %bb2
--  ]
--bb0:              ; preds = %0
--  store i32 1, ptr %p
--  ret void
--bb1:              ; preds = %0
--  store i32 2, ptr %p
--  ret void
--bb2:              ; preds = %0
--  ret void
--}
--
- define i1 @common_instr_on_switch(i64 %a, i64 %b, i64 %c) unnamed_addr {
- ; CHECK-LABEL: @common_instr_on_switch(
- ; CHECK-NEXT:  start:
-+; CHECK-NEXT:    switch i64 [[A:%.*]], label [[BB0:%.*]] [
-+; CHECK-NEXT:    i64 1, label [[BB1:%.*]]
-+; CHECK-NEXT:    i64 2, label [[BB2:%.*]]
-+; CHECK-NEXT:    ]
-+; CHECK:       bb0:
- ; CHECK-NEXT:    [[TMP0:%.*]] = icmp eq i64 [[B:%.*]], [[C:%.*]]
--; CHECK-NEXT:    ret i1 [[TMP0]]
-+; CHECK-NEXT:    br label [[EXIT:%.*]]
-+; CHECK:       bb1:
-+; CHECK-NEXT:    [[TMP1:%.*]] = icmp eq i64 [[B]], [[C]]
-+; CHECK-NEXT:    br label [[EXIT]]
-+; CHECK:       bb2:
-+; CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i64 [[B]], [[C]]
-+; CHECK-NEXT:    br label [[EXIT]]
-+; CHECK:       exit:
-+; CHECK-NEXT:    [[RESULT:%.*]] = phi i1 [ [[TMP0]], [[BB0]] ], [ [[TMP1]], [[BB1]] ], [ [[TMP2]], [[BB2]] ]
-+; CHECK-NEXT:    ret i1 [[RESULT]]
- ;
- start:
-   switch i64 %a, label %bb0 [
-diff -ruN --strip-trailing-cr a/llvm/test/Transforms/SimplifyCFG/hoist-common-code-with-unreachable.ll b/llvm/test/Transforms/SimplifyCFG/hoist-common-code-with-unreachable.ll
---- a/llvm/test/Transforms/SimplifyCFG/hoist-common-code-with-unreachable.ll
-+++ b/llvm/test/Transforms/SimplifyCFG/hoist-common-code-with-unreachable.ll
-@@ -4,8 +4,25 @@
- define i1 @common_instr_with_unreachable(i64 %a, i64 %b, i64 %c) {
- ; CHECK-LABEL: @common_instr_with_unreachable(
- ; CHECK-NEXT:  start:
-+; CHECK-NEXT:    switch i64 [[A:%.*]], label [[UNREACHABLE:%.*]] [
-+; CHECK-NEXT:    i64 0, label [[BB0:%.*]]
-+; CHECK-NEXT:    i64 1, label [[BB1:%.*]]
-+; CHECK-NEXT:    i64 2, label [[BB2:%.*]]
-+; CHECK-NEXT:    ]
-+; CHECK:       unreachable:
-+; CHECK-NEXT:    unreachable
-+; CHECK:       bb0:
- ; CHECK-NEXT:    [[TMP0:%.*]] = icmp eq i64 [[B:%.*]], [[C:%.*]]
--; CHECK-NEXT:    ret i1 [[TMP0]]
-+; CHECK-NEXT:    br label [[EXIT:%.*]]
-+; CHECK:       bb1:
-+; CHECK-NEXT:    [[TMP1:%.*]] = icmp eq i64 [[B]], [[C]]
-+; CHECK-NEXT:    br label [[EXIT]]
-+; CHECK:       bb2:
-+; CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i64 [[B]], [[C]]
-+; CHECK-NEXT:    br label [[EXIT]]
-+; CHECK:       exit:
-+; CHECK-NEXT:    [[RESULT:%.*]] = phi i1 [ [[TMP0]], [[BB0]] ], [ [[TMP1]], [[BB1]] ], [ [[TMP2]], [[BB2]] ]
-+; CHECK-NEXT:    ret i1 [[RESULT]]
- ;
- start:
-   switch i64 %a, label %unreachable [
-@@ -37,90 +54,43 @@
- define i1 @common_instr_with_unreachable_2(i64 %a, i64 %b, i64 %c) {
- ; CHECK-LABEL: @common_instr_with_unreachable_2(
- ; CHECK-NEXT:  start:
--; CHECK-NEXT:    [[TMP0:%.*]] = icmp eq i64 [[B:%.*]], [[C:%.*]]
--; CHECK-NEXT:    ret i1 [[TMP0]]
--;
--start:
--  switch i64 %a, label %bb1 [
--  i64 0, label %bb0
--  i64 1, label %unreachable
--  i64 2, label %bb2
--  ]
--
--unreachable:
--  unreachable
--
--bb0:                                              ; preds = %start
--  %0 = icmp eq i64 %b, %c
--  br label %exit
--
--bb1:                                              ; preds = %start
--  %1 = icmp eq i64 %b, %c
--  br label %exit
--
--bb2:                                              ; preds = %start
--  %2 = icmp eq i64 %b, %c
--  br label %exit
--
--exit:                                             ; preds = %bb2, %bb1, %bb0
--  %result = phi i1 [ %0, %bb0 ], [ %1, %bb1 ], [ %2, %bb2 ]
--  ret i1 %result
--}
--
--declare void @no_return()
--declare void @foo()
--
--define i1 @not_only_unreachable(i64 %a, i64 %b, i64 %c) {
--; CHECK-LABEL: @not_only_unreachable(
--; CHECK-NEXT:  start:
--; CHECK-NEXT:    switch i64 [[A:%.*]], label [[UNREACHABLE:%.*]] [
-+; CHECK-NEXT:    switch i64 [[A:%.*]], label [[BB1:%.*]] [
- ; CHECK-NEXT:    i64 0, label [[BB0:%.*]]
--; CHECK-NEXT:    i64 1, label [[BB1:%.*]]
- ; CHECK-NEXT:    i64 2, label [[BB2:%.*]]
- ; CHECK-NEXT:    ]
--; CHECK:       unreachable:
--; CHECK-NEXT:    call void @no_return()
--; CHECK-NEXT:    unreachable
- ; CHECK:       bb0:
- ; CHECK-NEXT:    [[TMP0:%.*]] = icmp eq i64 [[B:%.*]], [[C:%.*]]
--; CHECK-NEXT:    call void @foo()
- ; CHECK-NEXT:    br label [[EXIT:%.*]]
- ; CHECK:       bb1:
- ; CHECK-NEXT:    [[TMP1:%.*]] = icmp eq i64 [[B]], [[C]]
--; CHECK-NEXT:    call void @foo()
- ; CHECK-NEXT:    br label [[EXIT]]
- ; CHECK:       bb2:
- ; CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i64 [[B]], [[C]]
--; CHECK-NEXT:    call void @foo()
- ; CHECK-NEXT:    br label [[EXIT]]
- ; CHECK:       exit:
- ; CHECK-NEXT:    [[RESULT:%.*]] = phi i1 [ [[TMP0]], [[BB0]] ], [ [[TMP1]], [[BB1]] ], [ [[TMP2]], [[BB2]] ]
- ; CHECK-NEXT:    ret i1 [[RESULT]]
- ;
- start:
--  switch i64 %a, label %unreachable [
-+  switch i64 %a, label %bb1 [
-   i64 0, label %bb0
--  i64 1, label %bb1
-+  i64 1, label %unreachable
-   i64 2, label %bb2
-   ]
- 
- unreachable:
--  call void @no_return()
-   unreachable
- 
- bb0:                                              ; preds = %start
-   %0 = icmp eq i64 %b, %c
--  call void @foo()
-   br label %exit
- 
- bb1:                                              ; preds = %start
-   %1 = icmp eq i64 %b, %c
--  call void @foo()
-   br label %exit
- 
- bb2:                                              ; preds = %start
-   %2 = icmp eq i64 %b, %c
--  call void @foo()
-   br label %exit
- 
- exit:                                             ; preds = %bb2, %bb1, %bb0
-diff -ruN --strip-trailing-cr a/llvm/test/Transforms/SimplifyCFG/hoist-common-skip.ll b/llvm/test/Transforms/SimplifyCFG/hoist-common-skip.ll
---- a/llvm/test/Transforms/SimplifyCFG/hoist-common-skip.ll
-+++ b/llvm/test/Transforms/SimplifyCFG/hoist-common-skip.ll
-@@ -48,68 +48,6 @@
-   ret void
- }
- 
--define void @f0_switch(i64 %i, ptr nocapture noundef %d, ptr nocapture noundef readonly %m,  ptr nocapture noundef readonly %b) {
--; CHECK-LABEL: @f0_switch(
--; CHECK-NEXT:  entry:
--; CHECK-NEXT:    [[TMP0:%.*]] = load i16, ptr [[B:%.*]], align 2
--; CHECK-NEXT:    [[TMP1:%.*]] = load i16, ptr [[M:%.*]], align 2
--; CHECK-NEXT:    switch i64 [[I:%.*]], label [[BB0:%.*]] [
--; CHECK-NEXT:    i64 1, label [[BB1:%.*]]
--; CHECK-NEXT:    i64 2, label [[BB2:%.*]]
--; CHECK-NEXT:    ]
--; CHECK:       bb0:
--; CHECK-NEXT:    [[ADD:%.*]] = add nsw i16 [[TMP0]], 1
--; CHECK-NEXT:    [[U:%.*]] = add i16 [[ADD]], [[TMP1]]
--; CHECK-NEXT:    br label [[END:%.*]]
--; CHECK:       bb1:
--; CHECK-NEXT:    [[SUB:%.*]] = sub nsw i16 [[TMP0]], 1
--; CHECK-NEXT:    [[TMP2:%.*]] = add i16 [[SUB]], 3
--; CHECK-NEXT:    [[V:%.*]] = add i16 [[SUB]], [[TMP2]]
--; CHECK-NEXT:    br label [[END]]
--; CHECK:       bb2:
--; CHECK-NEXT:    [[SUB2:%.*]] = sub nsw i16 [[TMP0]], 1
--; CHECK-NEXT:    [[TMP3:%.*]] = add i16 [[SUB2]], 3
--; CHECK-NEXT:    [[W:%.*]] = add i16 [[SUB2]], [[TMP3]]
--; CHECK-NEXT:    br label [[END]]
--; CHECK:       end:
--; CHECK-NEXT:    [[UV:%.*]] = phi i16 [ [[U]], [[BB0]] ], [ [[V]], [[BB1]] ], [ [[W]], [[BB2]] ]
--; CHECK-NEXT:    store i16 [[UV]], ptr [[D:%.*]], align 2
--; CHECK-NEXT:    ret void
--;
--entry:
--  switch i64 %i, label %bb0 [
--  i64 1, label %bb1
--  i64 2, label %bb2
--  ]
--
--bb0:
--  %0 = load i16, ptr %b, align 2
--  %add = add nsw i16 %0, 1
--  %1 = load i16, ptr %m, align 2
--  %u = add i16 %add, %1
--  br label %end
--
--bb1:
--  %2 = load i16, ptr %b, align 2
--  %sub = sub nsw i16 %2, 1
--  %3 = load i16, ptr %m, align 2
--  %4 = add i16 %sub, 3
--  %v = add i16 %sub, %4
--  br label %end
--
--bb2:
--  %5 = load i16, ptr %b, align 2
--  %sub2 = sub nsw i16 %5, 1
--  %6 = load i16, ptr %m, align 2
--  %7 = add i16 %sub2, 3
--  %w = add i16 %sub2, %7
--  br label %end
--
--end:
--  %uv = phi i16 [ %u, %bb0 ], [ %v, %bb1 ], [ %w, %bb2 ]
--  store i16 %uv, ptr %d, align 2
--  ret void
--}
- 
- ;; Check some instructions (e.g. add) can be reordered across instructions with side
- ;; effects, while others (e.g. load) can't.
-@@ -159,70 +97,6 @@
-   ret void
- }
- 
--define void @f2_switch(i64 %i, ptr nocapture noundef %d, ptr nocapture noundef readonly %m, ptr nocapture noundef readonly %b) {
--; CHECK-LABEL: @f2_switch(
--; CHECK-NEXT:  entry:
--; CHECK-NEXT:    [[TMP0:%.*]] = load i16, ptr [[B:%.*]], align 2
--; CHECK-NEXT:    [[ADD_0:%.*]] = add nsw i16 [[TMP0]], 1
--; CHECK-NEXT:    switch i64 [[I:%.*]], label [[BB0:%.*]] [
--; CHECK-NEXT:    i64 1, label [[BB1:%.*]]
--; CHECK-NEXT:    i64 2, label [[BB2:%.*]]
--; CHECK-NEXT:    ]
--; CHECK:       bb0:
--; CHECK-NEXT:    call void @side_effects0()
--; CHECK-NEXT:    [[TMP1:%.*]] = load i16, ptr [[M:%.*]], align 2
--; CHECK-NEXT:    [[U:%.*]] = add i16 [[ADD_0]], [[TMP1]]
--; CHECK-NEXT:    br label [[END:%.*]]
--; CHECK:       bb1:
--; CHECK-NEXT:    call void @no_side_effects0()
--; CHECK-NEXT:    [[TMP2:%.*]] = load i16, ptr [[M]], align 2
--; CHECK-NEXT:    [[V:%.*]] = add i16 [[ADD_0]], [[TMP2]]
--; CHECK-NEXT:    br label [[END]]
--; CHECK:       bb2:
--; CHECK-NEXT:    call void @no_side_effects0()
--; CHECK-NEXT:    [[TMP3:%.*]] = load i16, ptr [[M]], align 2
--; CHECK-NEXT:    [[W:%.*]] = add i16 [[ADD_0]], [[TMP3]]
--; CHECK-NEXT:    br label [[END]]
--; CHECK:       end:
--; CHECK-NEXT:    [[UV:%.*]] = phi i16 [ [[U]], [[BB0]] ], [ [[V]], [[BB1]] ], [ [[W]], [[BB2]] ]
--; CHECK-NEXT:    store i16 [[UV]], ptr [[D:%.*]], align 2
--; CHECK-NEXT:    ret void
--;
--entry:
--  switch i64 %i, label %bb0 [
--  i64 1, label %bb1
--  i64 2, label %bb2
--  ]
--
--bb0:
--  %0 = load i16, ptr %b, align 2
--  call void @side_effects0()
--  %add.0 = add nsw i16 %0, 1
--  %1 = load i16, ptr %m, align 2
--  %u = add i16 %add.0, %1
--  br label %end
--
--bb1:
--  %2 = load i16, ptr %b, align 2
--  call void @no_side_effects0()
--  %add.1 = add nsw i16 %2, 1
--  %3 = load i16, ptr %m, align 2
--  %v = add i16 %add.1, %3
--  br label %end
--
--bb2:
--  %4 = load i16, ptr %b, align 2
--  call void @no_side_effects0()
--  %add.2 = add nsw i16 %4, 1
--  %5 = load i16, ptr %m, align 2
--  %w = add i16 %add.2, %5
--  br label %end
--
--end:
--  %uv = phi i16 [ %u, %bb0 ], [ %v, %bb1 ], [ %w, %bb2 ]
--  store i16 %uv, ptr %d, align 2
--  ret void
--}
- 
- ;; Check indeed it was the side effects that prevented hoisting the load
- ;; in the previous test.
-@@ -269,67 +143,6 @@
-   ret void
- }
- 
--define void @f3_switch(i64 %i, ptr nocapture noundef %d, ptr nocapture noundef readonly %m, ptr nocapture noundef readonly %b) {
--; CHECK-LABEL: @f3_switch(
--; CHECK-NEXT:  entry:
--; CHECK-NEXT:    [[TMP0:%.*]] = load i16, ptr [[B:%.*]], align 2
--; CHECK-NEXT:    [[ADD_0:%.*]] = add nsw i16 [[TMP0]], 1
--; CHECK-NEXT:    [[TMP1:%.*]] = load i16, ptr [[M:%.*]], align 2
--; CHECK-NEXT:    [[U:%.*]] = add i16 [[ADD_0]], [[TMP1]]
--; CHECK-NEXT:    switch i64 [[I:%.*]], label [[BB0:%.*]] [
--; CHECK-NEXT:    i64 1, label [[BB1:%.*]]
--; CHECK-NEXT:    i64 2, label [[BB2:%.*]]
--; CHECK-NEXT:    ]
--; CHECK:       bb0:
--; CHECK-NEXT:    call void @no_side_effects0()
--; CHECK-NEXT:    br label [[END:%.*]]
--; CHECK:       bb1:
--; CHECK-NEXT:    call void @no_side_effects1()
--; CHECK-NEXT:    br label [[END]]
--; CHECK:       bb2:
--; CHECK-NEXT:    call void @no_side_effects1()
--; CHECK-NEXT:    br label [[END]]
--; CHECK:       end:
--; CHECK-NEXT:    [[UV:%.*]] = phi i16 [ [[U]], [[BB0]] ], [ [[U]], [[BB1]] ], [ [[U]], [[BB2]] ]
--; CHECK-NEXT:    store i16 [[UV]], ptr [[D:%.*]], align 2
--; CHECK-NEXT:    ret void
--;
--entry:
--  switch i64 %i, label %bb0 [
--  i64 1, label %bb1
--  i64 2, label %bb2
--  ]
--
--bb0:
--  %0 = load i16, ptr %b, align 2
--  call void @no_side_effects0()
--  %add.0 = add nsw i16 %0, 1
--  %1 = load i16, ptr %m, align 2
--  %u = add i16 %add.0, %1
--  br label %end
--
--bb1:
--  %2 = load i16, ptr %b, align 2
--  call void @no_side_effects1()
--  %add.1 = add nsw i16 %2, 1
--  %3 = load i16, ptr %m, align 2
--  %v = add i16 %add.1, %3
--  br label %end
--
--bb2:
--  %4 = load i16, ptr %b, align 2
--  call void @no_side_effects1()
--  %add.2 = add nsw i16 %4, 1
--  %5 = load i16, ptr %m, align 2
--  %w = add i16 %add.2, %5
--  br label %end
--
--end:
--  %uv = phi i16 [ %u, %bb0 ], [ %v, %bb1 ], [ %w, %bb2 ]
--  store i16 %uv, ptr %d, align 2
--  ret void
--}
--
- ;; Check some instructions (e.g. sdiv) are not speculatively executed.
- 
- ;; Division by non-zero constant OK to speculate ...
-@@ -373,63 +186,6 @@
-   ret void
- }
- 
--define void @f4_switch(i64 %i, ptr nocapture noundef %d, ptr nocapture noundef readonly %m, ptr nocapture noundef readonly %b) {
--; CHECK-LABEL: @f4_switch(
--; CHECK-NEXT:  entry:
--; CHECK-NEXT:    [[TMP0:%.*]] = load i16, ptr [[B:%.*]], align 2
--; CHECK-NEXT:    [[DIV_0:%.*]] = sdiv i16 [[TMP0]], 2
--; CHECK-NEXT:    [[U:%.*]] = add i16 [[DIV_0]], [[TMP0]]
--; CHECK-NEXT:    switch i64 [[I:%.*]], label [[BB0:%.*]] [
--; CHECK-NEXT:    i64 1, label [[BB1:%.*]]
--; CHECK-NEXT:    i64 2, label [[BB2:%.*]]
--; CHECK-NEXT:    ]
--; CHECK:       bb0:
--; CHECK-NEXT:    call void @side_effects0()
--; CHECK-NEXT:    br label [[IF_END:%.*]]
--; CHECK:       bb1:
--; CHECK-NEXT:    call void @side_effects1()
--; CHECK-NEXT:    br label [[IF_END]]
--; CHECK:       bb2:
--; CHECK-NEXT:    call void @side_effects1()
--; CHECK-NEXT:    br label [[IF_END]]
--; CHECK:       if.end:
--; CHECK-NEXT:    [[UV:%.*]] = phi i16 [ [[U]], [[BB0]] ], [ [[U]], [[BB1]] ], [ [[U]], [[BB2]] ]
--; CHECK-NEXT:    store i16 [[UV]], ptr [[D:%.*]], align 2
--; CHECK-NEXT:    ret void
--;
--entry:
--  switch i64 %i, label %bb0 [
--  i64 1, label %bb1
--  i64 2, label %bb2
--  ]
--
--bb0:
--  %0 = load i16, ptr %b, align 2
--  call void @side_effects0()
--  %div.0 = sdiv i16 %0, 2
--  %u = add i16 %div.0, %0
--  br label %if.end
--
--bb1:
--  %1 = load i16, ptr %b, align 2
--  call void @side_effects1()
--  %div.1 = sdiv i16 %1, 2
--  %v = add i16 %div.1, %1
--  br label %if.end
--
--bb2:
--  %2 = load i16, ptr %b, align 2
--  call void @side_effects1()
--  %div.2 = sdiv i16 %2, 2
--  %w = add i16 %div.2, %2
--  br label %if.end
--
--if.end:
--  %uv = phi i16 [ %u, %bb0 ], [ %v, %bb1 ], [ %w, %bb2 ]
--  store i16 %uv, ptr %d, align 2
--  ret void
--}
--
- ;; ... but not a general division ...
- define void @f5(i1 %c, ptr nocapture noundef %d, ptr nocapture noundef readonly %m, ptr nocapture noundef readonly %b) {
- ; CHECK-LABEL: @f5(
-@@ -474,67 +230,6 @@
-   ret void
- }
- 
--define void @f5_switch(i64 %i, ptr nocapture noundef %d, ptr nocapture noundef readonly %m, ptr nocapture noundef readonly %b) {
--; CHECK-LABEL: @f5_switch(
--; CHECK-NEXT:  entry:
--; CHECK-NEXT:    [[TMP0:%.*]] = load i16, ptr [[B:%.*]], align 2
--; CHECK-NEXT:    switch i64 [[I:%.*]], label [[BB0:%.*]] [
--; CHECK-NEXT:    i64 1, label [[BB1:%.*]]
--; CHECK-NEXT:    i64 2, label [[BB2:%.*]]
--; CHECK-NEXT:    ]
--; CHECK:       bb0:
--; CHECK-NEXT:    call void @side_effects0()
--; CHECK-NEXT:    [[DIV_0:%.*]] = sdiv i16 211, [[TMP0]]
--; CHECK-NEXT:    [[U:%.*]] = add i16 [[DIV_0]], [[TMP0]]
--; CHECK-NEXT:    br label [[END:%.*]]
--; CHECK:       bb1:
--; CHECK-NEXT:    call void @side_effects1()
--; CHECK-NEXT:    [[DIV_1:%.*]] = sdiv i16 211, [[TMP0]]
--; CHECK-NEXT:    [[V:%.*]] = add i16 [[DIV_1]], [[TMP0]]
--; CHECK-NEXT:    br label [[END]]
--; CHECK:       bb2:
--; CHECK-NEXT:    call void @side_effects1()
--; CHECK-NEXT:    [[DIV_2:%.*]] = sdiv i16 211, [[TMP0]]
--; CHECK-NEXT:    [[W:%.*]] = add i16 [[DIV_2]], [[TMP0]]
--; CHECK-NEXT:    br label [[END]]
--; CHECK:       end:
--; CHECK-NEXT:    [[UV:%.*]] = phi i16 [ [[U]], [[BB0]] ], [ [[V]], [[BB1]] ], [ [[W]], [[BB2]] ]
--; CHECK-NEXT:    store i16 [[UV]], ptr [[D:%.*]], align 2
--; CHECK-NEXT:    ret void
--;
--entry:
--  switch i64 %i, label %bb0 [
--  i64 1, label %bb1
--  i64 2, label %bb2
--  ]
--
--bb0:
--  %0 = load i16, ptr %b, align 2
--  call void @side_effects0()
--  %div.0 = sdiv i16 211, %0
--  %u = add i16 %div.0, %0
--  br label %end
--
--bb1:
--  %1 = load i16, ptr %b, align 2
--  call void @side_effects1()
--  %div.1 = sdiv i16 211, %1
--  %v = add i16 %div.1, %1
--  br label %end
--
--bb2:
--  %2 = load i16, ptr %b, align 2
--  call void @side_effects1()
--  %div.2 = sdiv i16 211, %2
--  %w = add i16 %div.2, %2
--  br label %end
--
--end:
--  %uv = phi i16 [ %u, %bb0 ], [ %v, %bb1 ], [ %w, %bb2 ]
--  store i16 %uv, ptr %d, align 2
--  ret void
--}
--
- ;; ... and it's also OK to hoist the division when there's no speculation happening.
- define void @f6(i1 %c, ptr nocapture noundef %d, ptr nocapture noundef readonly %m, ptr nocapture noundef readonly %b) {
- ; CHECK-LABEL: @f6(
-@@ -576,63 +271,6 @@
-   ret void
- }
- 
--define void @f6_switch(i64 %i, ptr nocapture noundef %d, ptr nocapture noundef readonly %m, ptr nocapture noundef readonly %b) {
--; CHECK-LABEL: @f6_switch(
--; CHECK-NEXT:  entry:
--; CHECK-NEXT:    [[TMP0:%.*]] = load i16, ptr [[B:%.*]], align 2
--; CHECK-NEXT:    [[DIV_0:%.*]] = sdiv i16 211, [[TMP0]]
--; CHECK-NEXT:    [[U:%.*]] = add i16 [[DIV_0]], [[TMP0]]
--; CHECK-NEXT:    switch i64 [[I:%.*]], label [[BB0:%.*]] [
--; CHECK-NEXT:    i64 1, label [[BB1:%.*]]
--; CHECK-NEXT:    i64 2, label [[BB2:%.*]]
--; CHECK-NEXT:    ]
--; CHECK:       bb0:
--; CHECK-NEXT:    call void @no_side_effects0()
--; CHECK-NEXT:    br label [[END:%.*]]
--; CHECK:       bb1:
--; CHECK-NEXT:    call void @no_side_effects1()
--; CHECK-NEXT:    br label [[END]]
--; CHECK:       bb2:
--; CHECK-NEXT:    call void @no_side_effects1()
--; CHECK-NEXT:    br label [[END]]
--; CHECK:       end:
--; CHECK-NEXT:    [[UV:%.*]] = phi i16 [ [[U]], [[BB0]] ], [ [[U]], [[BB1]] ], [ [[U]], [[BB2]] ]
--; CHECK-NEXT:    store i16 [[UV]], ptr [[D:%.*]], align 2
--; CHECK-NEXT:    ret void
--;
--entry:
--  switch i64 %i, label %bb0 [
--  i64 1, label %bb1
--  i64 2, label %bb2
--  ]
--
--bb0:
--  %0 = load i16, ptr %b, align 2
--  call void @no_side_effects0()
--  %div.0 = sdiv i16 211, %0
--  %u = add i16 %div.0, %0
--  br label %end
--
--bb1:
--  %1 = load i16, ptr %b, align 2
--  call void @no_side_effects1()
--  %div.1 = sdiv i16 211, %1
--  %v = add i16 %div.1, %1
--  br label %end
--
--bb2:
--  %2 = load i16, ptr %b, align 2
--  call void @no_side_effects1()
--  %div.2 = sdiv i16 211, %2
--  %w = add i16 %div.2, %2
--  br label %end
--
--end:
--  %uv = phi i16 [ %u, %bb0 ], [ %v, %bb1 ], [ %w, %bb2 ]
--  store i16 %uv, ptr %d, align 2
--  ret void
--}
--
- ;; No reorder of store over a load.
- define i16 @f7(i1 %c, ptr %a, ptr %b) {
- ; CHECK-LABEL: @f7(
-@@ -668,55 +306,6 @@
-   ret i16 %v
- }
- 
--define i16 @f7_switch(i64 %i, ptr %a, ptr %b) {
--; CHECK-LABEL: @f7_switch(
--; CHECK-NEXT:  entry:
--; CHECK-NEXT:    switch i64 [[I:%.*]], label [[BB0:%.*]] [
--; CHECK-NEXT:    i64 1, label [[BB1:%.*]]
--; CHECK-NEXT:    i64 2, label [[BB2:%.*]]
--; CHECK-NEXT:    ]
--; CHECK:       bb0:
--; CHECK-NEXT:    [[VA:%.*]] = load i16, ptr [[A:%.*]], align 2
--; CHECK-NEXT:    store i16 0, ptr [[B:%.*]], align 2
--; CHECK-NEXT:    br label [[END:%.*]]
--; CHECK:       bb1:
--; CHECK-NEXT:    [[VB:%.*]] = load i16, ptr [[B]], align 2
--; CHECK-NEXT:    store i16 0, ptr [[B]], align 2
--; CHECK-NEXT:    br label [[END]]
--; CHECK:       bb2:
--; CHECK-NEXT:    [[VC:%.*]] = load i16, ptr [[B]], align 2
--; CHECK-NEXT:    store i16 0, ptr [[B]], align 2
--; CHECK-NEXT:    br label [[END]]
--; CHECK:       end:
--; CHECK-NEXT:    [[V:%.*]] = phi i16 [ [[VA]], [[BB0]] ], [ [[VB]], [[BB1]] ], [ [[VC]], [[BB2]] ]
--; CHECK-NEXT:    ret i16 [[V]]
--;
--entry:
--  switch i64 %i, label %bb0 [
--  i64 1, label %bb1
--  i64 2, label %bb2
--  ]
--
--bb0:
--  %va = load i16, ptr %a, align 2
--  store i16 0, ptr %b, align 2
--  br label %end
--
--bb1:
--  %vb = load i16, ptr %b, align 2
--  store i16 0, ptr %b, align 2
--  br label %end
--
--bb2:
--  %vc = load i16, ptr %b, align 2
--  store i16 0, ptr %b, align 2
--  br label %end
--
--end:
--  %v = phi i16 [ %va, %bb0 ], [ %vb, %bb1 ], [ %vc, %bb2 ]
--  ret i16 %v
--}
--
- ;; Can reorder load over another load
- define i16 @f8(i1 %cond, ptr %a, ptr %b, ptr %c) {
- ; CHECK-LABEL: @f8(
-@@ -757,59 +346,6 @@
-   ret i16 %w
- }
- 
--define i16 @f8_switch(i64 %i, ptr %a, ptr %b, ptr %c) {
--; CHECK-LABEL: @f8_switch(
--; CHECK-NEXT:  entry:
--; CHECK-NEXT:    [[C_0:%.*]] = load i16, ptr [[C:%.*]], align 2
--; CHECK-NEXT:    switch i64 [[I:%.*]], label [[BB0:%.*]] [
--; CHECK-NEXT:    i64 1, label [[BB1:%.*]]
--; CHECK-NEXT:    i64 2, label [[BB2:%.*]]
--; CHECK-NEXT:    ]
--; CHECK:       bb0:
--; CHECK-NEXT:    [[VA:%.*]] = load i16, ptr [[A:%.*]], align 2
--; CHECK-NEXT:    br label [[END:%.*]]
--; CHECK:       bb1:
--; CHECK-NEXT:    [[VB:%.*]] = load i16, ptr [[B:%.*]], align 2
--; CHECK-NEXT:    br label [[END]]
--; CHECK:       bb2:
--; CHECK-NEXT:    [[VC:%.*]] = load i16, ptr [[B]], align 2
--; CHECK-NEXT:    br label [[END]]
--; CHECK:       end:
--; CHECK-NEXT:    [[V:%.*]] = phi i16 [ [[VA]], [[BB0]] ], [ [[VB]], [[BB1]] ], [ [[VC]], [[BB2]] ]
--; CHECK-NEXT:    [[U:%.*]] = phi i16 [ [[C_0]], [[BB0]] ], [ [[C_0]], [[BB1]] ], [ [[C_0]], [[BB2]] ]
--; CHECK-NEXT:    [[W:%.*]] = add i16 [[V]], [[U]]
--; CHECK-NEXT:    ret i16 [[W]]
--;
--entry:
--  switch i64 %i, label %bb0 [
--  i64 1, label %bb1
--  i64 2, label %bb2
--  ]
--
--bb0:
--  %va = load i16, ptr %a, align 2
--  %c.0 = load i16, ptr %c
--  br label %end
--
--bb1:
--  %vb = load i16, ptr %b, align 2
--  %c.1 = load i16, ptr %c
--  br label %end
--
--bb2:
--  %vc = load i16, ptr %b, align 2
--  %c.2 = load i16, ptr %c
--  br label %end
--
--end:
--  %v = phi i16 [ %va, %bb0 ], [ %vb, %bb1 ], [ %vc, %bb2 ]
--  %u = phi i16 [ %c.0, %bb0 ], [ %c.1, %bb1 ], [ %c.2, %bb2 ]
--
--  %w = add i16 %v, %u
--
--  ret i16 %w
--}
--
- ;; Currently won't reorder volatile and non-volatile loads.
- define i16 @f9(i1 %cond, ptr %a, ptr %b, ptr %c) {
- ; CHECK-LABEL: @f9(
-@@ -851,61 +387,6 @@
-   ret i16 %w
- }
- 
--define i16 @f9_switch(i64 %i, ptr %a, ptr %b, ptr %c) {
--; CHECK-LABEL: @f9_switch(
--; CHECK-NEXT:  entry:
--; CHECK-NEXT:    switch i64 [[I:%.*]], label [[BB0:%.*]] [
--; CHECK-NEXT:    i64 1, label [[BB1:%.*]]
--; CHECK-NEXT:    i64 2, label [[BB2:%.*]]
--; CHECK-NEXT:    ]
--; CHECK:       bb0:
--; CHECK-NEXT:    [[VA:%.*]] = load volatile i16, ptr [[A:%.*]], align 2
--; CHECK-NEXT:    [[C_0:%.*]] = load i16, ptr [[C:%.*]], align 2
--; CHECK-NEXT:    br label [[END:%.*]]
--; CHECK:       bb1:
--; CHECK-NEXT:    [[VB:%.*]] = load i16, ptr [[B:%.*]], align 2
--; CHECK-NEXT:    [[C_1:%.*]] = load i16, ptr [[C]], align 2
--; CHECK-NEXT:    br label [[END]]
--; CHECK:       bb2:
--; CHECK-NEXT:    [[VC:%.*]] = load i16, ptr [[B]], align 2
--; CHECK-NEXT:    [[C_2:%.*]] = load i16, ptr [[C]], align 2
--; CHECK-NEXT:    br label [[END]]
--; CHECK:       end:
--; CHECK-NEXT:    [[V:%.*]] = phi i16 [ [[VA]], [[BB0]] ], [ [[VB]], [[BB1]] ], [ [[VC]], [[BB2]] ]
--; CHECK-NEXT:    [[U:%.*]] = phi i16 [ [[C_0]], [[BB0]] ], [ [[C_1]], [[BB1]] ], [ [[C_2]], [[BB2]] ]
--; CHECK-NEXT:    [[W:%.*]] = add i16 [[V]], [[U]]
--; CHECK-NEXT:    ret i16 [[W]]
--;
--entry:
--  switch i64 %i, label %bb0 [
--  i64 1, label %bb1
--  i64 2, label %bb2
--  ]
--
--bb0:
--  %va = load volatile i16, ptr %a, align 2
--  %c.0 = load i16, ptr %c
--  br label %end
--
--bb1:
--  %vb = load i16, ptr %b, align 2
--  %c.1 = load i16, ptr %c
--  br label %end
--
--bb2:
--  %vc = load i16, ptr %b, align 2
--  %c.2 = load i16, ptr %c
--  br label %end
--
--end:
--  %v = phi i16 [ %va, %bb0 ], [ %vb, %bb1 ], [ %vc, %bb2 ]
--  %u = phi i16 [ %c.0, %bb0 ], [ %c.1, %bb1 ], [ %c.2, %bb2 ]
--
--  %w = add i16 %v, %u
--
--  ret i16 %w
--}
--
- ;; Don't hoist stacksaves across inalloca allocas
- define void @f10(i1 %cond) {
- ; CHECK-LABEL: @f10(
-@@ -953,79 +434,6 @@
-   br label %end
- 
- end:
--  call void @llvm.stackrestore(ptr %ss)
--  ret void
--}
--
--define void @f10_switch(i64 %i) {
--; CHECK-LABEL: @f10_switch(
--; CHECK-NEXT:    [[SS:%.*]] = call ptr @llvm.stacksave.p0()
--; CHECK-NEXT:    switch i64 [[I:%.*]], label [[BB0:%.*]] [
--; CHECK-NEXT:    i64 1, label [[BB1:%.*]]
--; CHECK-NEXT:    i64 2, label [[BB2:%.*]]
--; CHECK-NEXT:    ]
--; CHECK:       bb0:
--; CHECK-NEXT:    [[I1:%.*]] = alloca inalloca i32, align 4
--; CHECK-NEXT:    [[SS2:%.*]] = call ptr @llvm.stacksave.p0()
--; CHECK-NEXT:    [[I2:%.*]] = alloca inalloca i64, align 8
--; CHECK-NEXT:    call void @inalloca_i64(ptr inalloca(i64) [[I2]])
--; CHECK-NEXT:    call void @llvm.stackrestore.p0(ptr [[SS2]])
--; CHECK-NEXT:    call void @inalloca_i32(ptr inalloca(i32) [[I1]])
--; CHECK-NEXT:    br label [[END:%.*]]
--; CHECK:       bb1:
--; CHECK-NEXT:    [[I3:%.*]] = alloca inalloca i64, align 8
--; CHECK-NEXT:    [[SS3:%.*]] = call ptr @llvm.stacksave.p0()
--; CHECK-NEXT:    [[I4:%.*]] = alloca inalloca i64, align 8
--; CHECK-NEXT:    [[TMP1:%.*]] = call ptr @inalloca_i64(ptr inalloca(i64) [[I4]])
--; CHECK-NEXT:    call void @llvm.stackrestore.p0(ptr [[SS3]])
--; CHECK-NEXT:    [[TMP2:%.*]] = call ptr @inalloca_i64(ptr inalloca(i64) [[I3]])
--; CHECK-NEXT:    br label [[END]]
--; CHECK:       bb2:
--; CHECK-NEXT:    [[I5:%.*]] = alloca inalloca i64, align 8
--; CHECK-NEXT:    [[SS4:%.*]] = call ptr @llvm.stacksave.p0()
--; CHECK-NEXT:    [[I6:%.*]] = alloca inalloca i64, align 8
--; CHECK-NEXT:    [[TMP3:%.*]] = call ptr @inalloca_i64(ptr inalloca(i64) [[I6]])
--; CHECK-NEXT:    call void @llvm.stackrestore.p0(ptr [[SS4]])
--; CHECK-NEXT:    [[TMP4:%.*]] = call ptr @inalloca_i64(ptr inalloca(i64) [[I5]])
--; CHECK-NEXT:    br label [[END]]
--; CHECK:       end:
--; CHECK-NEXT:    call void @llvm.stackrestore.p0(ptr [[SS]])
--; CHECK-NEXT:    ret void
--;
--  %ss = call ptr @llvm.stacksave()
--  switch i64 %i, label %bb0 [
--  i64 1, label %bb1
--  i64 2, label %bb2
--  ]
--
--bb0:
--  %i1 = alloca inalloca i32
--  %ss2 = call ptr @llvm.stacksave()
--  %i2 = alloca inalloca i64
--  call void @inalloca_i64(ptr inalloca(i64) %i2)
--  call void @llvm.stackrestore(ptr %ss2)
--  call void @inalloca_i32(ptr inalloca(i32) %i1)
--  br label %end
--
--bb1:
--  %i3 = alloca inalloca i64
--  %ss3 = call ptr @llvm.stacksave()
--  %i4 = alloca inalloca i64
--  call ptr @inalloca_i64(ptr inalloca(i64) %i4)
--  call void @llvm.stackrestore(ptr %ss3)
--  call ptr @inalloca_i64(ptr inalloca(i64) %i3)
--  br label %end
--
--bb2:
--  %i5 = alloca inalloca i64
--  %ss4 = call ptr @llvm.stacksave()
--  %i6 = alloca inalloca i64
--  call ptr @inalloca_i64(ptr inalloca(i64) %i6)
--  call void @llvm.stackrestore(ptr %ss4)
--  call ptr @inalloca_i64(ptr inalloca(i64) %i5)
--  br label %end
--
--end:
-   call void @llvm.stackrestore(ptr %ss)
-   ret void
- }
-diff -ruN --strip-trailing-cr a/llvm/test/Transforms/SimplifyCFG/hoist-with-metadata.ll b/llvm/test/Transforms/SimplifyCFG/hoist-with-metadata.ll
---- a/llvm/test/Transforms/SimplifyCFG/hoist-with-metadata.ll
-+++ b/llvm/test/Transforms/SimplifyCFG/hoist-with-metadata.ll
-@@ -21,8 +21,20 @@
- 
- define void @hoist_range_switch(i64 %i, ptr %p) {
- ; CHECK-LABEL: @hoist_range_switch(
--; CHECK-NEXT:  out:
-+; CHECK-NEXT:    switch i64 [[I:%.*]], label [[BB0:%.*]] [
-+; CHECK-NEXT:    i64 1, label [[BB1:%.*]]
-+; CHECK-NEXT:    i64 2, label [[BB2:%.*]]
-+; CHECK-NEXT:    ]
-+; CHECK:       bb0:
- ; CHECK-NEXT:    [[T:%.*]] = load i8, ptr [[P:%.*]], align 1, !range [[RNG1:![0-9]+]]
-+; CHECK-NEXT:    br label [[OUT:%.*]]
-+; CHECK:       bb1:
-+; CHECK-NEXT:    [[E:%.*]] = load i8, ptr [[P]], align 1, !range [[RNG2:![0-9]+]]
-+; CHECK-NEXT:    br label [[OUT]]
-+; CHECK:       bb2:
-+; CHECK-NEXT:    [[F:%.*]] = load i8, ptr [[P]], align 1, !range [[RNG3:![0-9]+]]
-+; CHECK-NEXT:    br label [[OUT]]
-+; CHECK:       out:
- ; CHECK-NEXT:    ret void
- ;
-   switch i64 %i, label %bb0 [
-@@ -45,7 +57,7 @@
- define void @hoist_both_noundef(i1 %c, ptr %p) {
- ; CHECK-LABEL: @hoist_both_noundef(
- ; CHECK-NEXT:  if:
--; CHECK-NEXT:    [[T:%.*]] = load i8, ptr [[P:%.*]], align 1, !noundef !2
-+; CHECK-NEXT:    [[T:%.*]] = load i8, ptr [[P:%.*]], align 1, !noundef !4
- ; CHECK-NEXT:    ret void
- ;
- if:
-@@ -66,8 +78,20 @@
- 
- define void @hoist_both_noundef_switch(i64 %i, ptr %p) {
- ; CHECK-LABEL: @hoist_both_noundef_switch(
--; CHECK-NEXT:  out:
--; CHECK-NEXT:    [[T:%.*]] = load i8, ptr [[P:%.*]], align 1, !noundef !2
-+; CHECK-NEXT:    switch i64 [[I:%.*]], label [[BB0:%.*]] [
-+; CHECK-NEXT:    i64 1, label [[BB1:%.*]]
-+; CHECK-NEXT:    i64 2, label [[BB2:%.*]]
-+; CHECK-NEXT:    ]
-+; CHECK:       bb0:
-+; CHECK-NEXT:    [[T:%.*]] = load i8, ptr [[P:%.*]], align 1, !noundef !4
-+; CHECK-NEXT:    br label [[OUT:%.*]]
-+; CHECK:       bb1:
-+; CHECK-NEXT:    [[E:%.*]] = load i8, ptr [[P]], align 1, !noundef !4
-+; CHECK-NEXT:    br label [[OUT]]
-+; CHECK:       bb2:
-+; CHECK-NEXT:    [[F:%.*]] = load i8, ptr [[P]], align 1, !noundef !4
-+; CHECK-NEXT:    br label [[OUT]]
-+; CHECK:       out:
- ; CHECK-NEXT:    ret void
- ;
-   switch i64 %i, label %bb0 [
-@@ -110,8 +134,20 @@
- 
- define void @hoist_one_noundef_switch(i64 %i, ptr %p) {
- ; CHECK-LABEL: @hoist_one_noundef_switch(
--; CHECK-NEXT:  out:
--; CHECK-NEXT:    [[T:%.*]] = load i8, ptr [[P:%.*]], align 1
-+; CHECK-NEXT:    switch i64 [[I:%.*]], label [[BB0:%.*]] [
-+; CHECK-NEXT:    i64 1, label [[BB1:%.*]]
-+; CHECK-NEXT:    i64 2, label [[BB2:%.*]]
-+; CHECK-NEXT:    ]
-+; CHECK:       bb0:
-+; CHECK-NEXT:    [[T:%.*]] = load i8, ptr [[P:%.*]], align 1, !noundef !4
-+; CHECK-NEXT:    br label [[OUT:%.*]]
-+; CHECK:       bb1:
-+; CHECK-NEXT:    [[E:%.*]] = load i8, ptr [[P]], align 1
-+; CHECK-NEXT:    br label [[OUT]]
-+; CHECK:       bb2:
-+; CHECK-NEXT:    [[F:%.*]] = load i8, ptr [[P]], align 1, !noundef !4
-+; CHECK-NEXT:    br label [[OUT]]
-+; CHECK:       out:
- ; CHECK-NEXT:    ret void
- ;
-   switch i64 %i, label %bb0 [
-@@ -134,7 +170,7 @@
- define void @hoist_dereferenceable(i1 %c, ptr %p) {
- ; CHECK-LABEL: @hoist_dereferenceable(
- ; CHECK-NEXT:  if:
--; CHECK-NEXT:    [[T:%.*]] = load ptr, ptr [[P:%.*]], align 8, !dereferenceable !3
-+; CHECK-NEXT:    [[T:%.*]] = load ptr, ptr [[P:%.*]], align 8, !dereferenceable !5
- ; CHECK-NEXT:    ret void
- ;
- if:
-@@ -151,8 +187,20 @@
- 
- define void @hoist_dereferenceable_switch(i64 %i, ptr %p) {
- ; CHECK-LABEL: @hoist_dereferenceable_switch(
--; CHECK-NEXT:  out:
--; CHECK-NEXT:    [[T:%.*]] = load ptr, ptr [[P:%.*]], align 8, !dereferenceable !3
-+; CHECK-NEXT:    switch i64 [[I:%.*]], label [[BB0:%.*]] [
-+; CHECK-NEXT:    i64 1, label [[BB1:%.*]]
-+; CHECK-NEXT:    i64 2, label [[BB2:%.*]]
-+; CHECK-NEXT:    ]
-+; CHECK:       bb0:
-+; CHECK-NEXT:    [[T:%.*]] = load ptr, ptr [[P:%.*]], align 8, !dereferenceable !5
-+; CHECK-NEXT:    br label [[OUT:%.*]]
-+; CHECK:       bb1:
-+; CHECK-NEXT:    [[E:%.*]] = load ptr, ptr [[P]], align 8, !dereferenceable !6
-+; CHECK-NEXT:    br label [[OUT]]
-+; CHECK:       bb2:
-+; CHECK-NEXT:    [[F:%.*]] = load ptr, ptr [[P]], align 8, !dereferenceable !7
-+; CHECK-NEXT:    br label [[OUT]]
-+; CHECK:       out:
- ; CHECK-NEXT:    ret void
- ;
-   switch i64 %i, label %bb0 [
-@@ -175,7 +223,7 @@
- define void @hoist_dereferenceable_or_null(i1 %c, ptr %p) {
- ; CHECK-LABEL: @hoist_dereferenceable_or_null(
- ; CHECK-NEXT:  if:
--; CHECK-NEXT:    [[T:%.*]] = load ptr, ptr [[P:%.*]], align 8, !dereferenceable_or_null !3
-+; CHECK-NEXT:    [[T:%.*]] = load ptr, ptr [[P:%.*]], align 8, !dereferenceable_or_null !5
- ; CHECK-NEXT:    ret void
- ;
- if:
-@@ -192,8 +240,20 @@
- 
- define void @hoist_dereferenceable_or_null_switch(i64 %i, ptr %p) {
- ; CHECK-LABEL: @hoist_dereferenceable_or_null_switch(
--; CHECK-NEXT:  out:
--; CHECK-NEXT:    [[T:%.*]] = load ptr, ptr [[P:%.*]], align 8, !dereferenceable_or_null !3
-+; CHECK-NEXT:    switch i64 [[I:%.*]], label [[BB0:%.*]] [
-+; CHECK-NEXT:    i64 1, label [[BB1:%.*]]
-+; CHECK-NEXT:    i64 2, label [[BB2:%.*]]
-+; CHECK-NEXT:    ]
-+; CHECK:       bb0:
-+; CHECK-NEXT:    [[T:%.*]] = load ptr, ptr [[P:%.*]], align 8, !dereferenceable_or_null !6
-+; CHECK-NEXT:    br label [[OUT:%.*]]
-+; CHECK:       bb1:
-+; CHECK-NEXT:    [[E:%.*]] = load ptr, ptr [[P]], align 8, !dereferenceable_or_null !5
-+; CHECK-NEXT:    br label [[OUT]]
-+; CHECK:       bb2:
-+; CHECK-NEXT:    [[F:%.*]] = load ptr, ptr [[P]], align 8, !dereferenceable_or_null !7
-+; CHECK-NEXT:    br label [[OUT]]
-+; CHECK:       out:
- ; CHECK-NEXT:    ret void
- ;
-   switch i64 %i, label %bb0 [
-@@ -217,7 +277,7 @@
- define i32 @speculate_range(i1 %c, ptr dereferenceable(8) align 8 %p) {
- ; CHECK-LABEL: @speculate_range(
- ; CHECK-NEXT:  entry:
--; CHECK-NEXT:    [[V:%.*]] = load i32, ptr [[P:%.*]], align 4, !range [[RNG4:![0-9]+]]
-+; CHECK-NEXT:    [[V:%.*]] = load i32, ptr [[P:%.*]], align 4, !range [[RNG8:![0-9]+]]
- ; CHECK-NEXT:    [[SPEC_SELECT:%.*]] = select i1 [[C:%.*]], i32 [[V]], i32 0
- ; CHECK-NEXT:    ret i32 [[SPEC_SELECT]]
- ;
-@@ -238,7 +298,7 @@
- define ptr @speculate_nonnull(i1 %c, ptr dereferenceable(8) align 8 %p) {
- ; CHECK-LABEL: @speculate_nonnull(
- ; CHECK-NEXT:  entry:
--; CHECK-NEXT:    [[V:%.*]] = load ptr, ptr [[P:%.*]], align 8, !nonnull !2
-+; CHECK-NEXT:    [[V:%.*]] = load ptr, ptr [[P:%.*]], align 8, !nonnull !4
- ; CHECK-NEXT:    [[SPEC_SELECT:%.*]] = select i1 [[C:%.*]], ptr [[V]], ptr null
- ; CHECK-NEXT:    ret ptr [[SPEC_SELECT]]
- ;
-@@ -259,7 +319,7 @@
- define ptr @speculate_align(i1 %c, ptr dereferenceable(8) align 8 %p) {
- ; CHECK-LABEL: @speculate_align(
- ; CHECK-NEXT:  entry:
--; CHECK-NEXT:    [[V:%.*]] = load ptr, ptr [[P:%.*]], align 8, !align !5
-+; CHECK-NEXT:    [[V:%.*]] = load ptr, ptr [[P:%.*]], align 8, !align !9
- ; CHECK-NEXT:    [[SPEC_SELECT:%.*]] = select i1 [[C:%.*]], ptr [[V]], ptr null
- ; CHECK-NEXT:    ret ptr [[SPEC_SELECT]]
- ;
-@@ -278,7 +338,7 @@
- define void @hoist_fpmath(i1 %c, double %x) {
- ; CHECK-LABEL: @hoist_fpmath(
- ; CHECK-NEXT:  if:
--; CHECK-NEXT:    [[T:%.*]] = fadd double [[X:%.*]], 1.000000e+00, !fpmath !6
-+; CHECK-NEXT:    [[T:%.*]] = fadd double [[X:%.*]], 1.000000e+00, !fpmath !10
- ; CHECK-NEXT:    ret void
- ;
- if:
-@@ -295,8 +355,20 @@
- 
- define void @hoist_fpmath_switch(i64 %i, double %x) {
- ; CHECK-LABEL: @hoist_fpmath_switch(
--; CHECK-NEXT:  out:
--; CHECK-NEXT:    [[T:%.*]] = fadd double [[X:%.*]], 1.000000e+00, !fpmath !6
-+; CHECK-NEXT:    switch i64 [[I:%.*]], label [[BB0:%.*]] [
-+; CHECK-NEXT:    i64 1, label [[BB1:%.*]]
-+; CHECK-NEXT:    i64 2, label [[BB2:%.*]]
-+; CHECK-NEXT:    ]
-+; CHECK:       bb0:
-+; CHECK-NEXT:    [[T:%.*]] = fadd double [[X:%.*]], 1.000000e+00, !fpmath !10
-+; CHECK-NEXT:    br label [[OUT:%.*]]
-+; CHECK:       bb1:
-+; CHECK-NEXT:    [[E:%.*]] = fadd double [[X]], 1.000000e+00, !fpmath !11
-+; CHECK-NEXT:    br label [[OUT]]
-+; CHECK:       bb2:
-+; CHECK-NEXT:    [[F:%.*]] = fadd double [[X]], 1.000000e+00, !fpmath !12
-+; CHECK-NEXT:    br label [[OUT]]
-+; CHECK:       out:
- ; CHECK-NEXT:    ret void
- ;
-   switch i64 %i, label %bb0 [
-@@ -322,10 +394,16 @@
- !3 = !{ i8 7, i8 9 }
- ;.
- ; CHECK: [[RNG0]] = !{i8 0, i8 1, i8 3, i8 5}
--; CHECK: [[RNG1]] = !{i8 0, i8 1, i8 3, i8 5, i8 7, i8 9}
--; CHECK: [[META2:![0-9]+]] = !{}
--; CHECK: [[META3:![0-9]+]] = !{i64 10}
--; CHECK: [[RNG4]] = !{i32 0, i32 10}
--; CHECK: [[META5:![0-9]+]] = !{i64 4}
--; CHECK: [[META6:![0-9]+]] = !{float 2.500000e+00}
-+; CHECK: [[RNG1]] = !{i8 0, i8 1}
-+; CHECK: [[RNG2]] = !{i8 3, i8 5}
-+; CHECK: [[RNG3]] = !{i8 7, i8 9}
-+; CHECK: [[META4:![0-9]+]] = !{}
-+; CHECK: [[META5:![0-9]+]] = !{i64 10}
-+; CHECK: [[META6:![0-9]+]] = !{i64 20}
-+; CHECK: [[META7:![0-9]+]] = !{i64 30}
-+; CHECK: [[RNG8]] = !{i32 0, i32 10}
-+; CHECK: [[META9:![0-9]+]] = !{i64 4}
-+; CHECK: [[META10:![0-9]+]] = !{float 2.500000e+00}
-+; CHECK: [[META11:![0-9]+]] = !{float 5.000000e+00}
-+; CHECK: [[META12:![0-9]+]] = !{float 7.500000e+00}
- ;.
+     }
+diff -ruN --strip-trailing-cr a/llvm/test/CodeGen/AArch64/setcc_knownbits.ll b/llvm/test/CodeGen/AArch64/setcc_knownbits.ll
+--- a/llvm/test/CodeGen/AArch64/setcc_knownbits.ll
++++ b/llvm/test/CodeGen/AArch64/setcc_knownbits.ll
+@@ -4,6 +4,8 @@
+ define i1 @load_bv_v4i8(i1 zeroext %a) {
+ ; CHECK-LABEL: load_bv_v4i8:
+ ; CHECK:       // %bb.0:
++; CHECK-NEXT:    cmp w0, #0
++; CHECK-NEXT:    cset w0, ne
+ ; CHECK-NEXT:    ret
+   %b = zext i1 %a to i32
+   %c = icmp eq i32 %b, 1
+diff -ruN --strip-trailing-cr a/llvm/test/CodeGen/RISCV/rvv/fold-vp-fadd-and-vp-fmul.ll b/llvm/test/CodeGen/RISCV/rvv/fold-vp-fadd-and-vp-fmul.ll
+--- a/llvm/test/CodeGen/RISCV/rvv/fold-vp-fadd-and-vp-fmul.ll
++++ b/llvm/test/CodeGen/RISCV/rvv/fold-vp-fadd-and-vp-fmul.ll
+@@ -62,9 +62,9 @@
+ ; CHECK-LABEL: fma_reassociate:
+ ; CHECK:       # %bb.0:
+ ; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
+-; CHECK-NEXT:    vfmadd.vv v11, v10, v12, v0.t
+-; CHECK-NEXT:    vfmadd.vv v9, v8, v11, v0.t
+-; CHECK-NEXT:    vmv.v.v v8, v9
++; CHECK-NEXT:    vfmadd.vv v9, v8, v12, v0.t
++; CHECK-NEXT:    vfmadd.vv v11, v10, v9, v0.t
++; CHECK-NEXT:    vmv.v.v v8, v11
+ ; CHECK-NEXT:    ret
+   %1 = call fast <vscale x 1 x double> @llvm.vp.fmul.nxv1f64(<vscale x 1 x double> %a, <vscale x 1 x double> %b, <vscale x 1 x i1> %m, i32 %vl)
+   %2 = call fast <vscale x 1 x double> @llvm.vp.fmul.nxv1f64(<vscale x 1 x double> %c, <vscale x 1 x double> %d, <vscale x 1 x i1> %m, i32 %vl)
diff --git a/third_party/llvm/workspace.bzl b/third_party/llvm/workspace.bzl
index aaec90a65c08e6..5abb4304cc53f3 100644
--- a/third_party/llvm/workspace.bzl
+++ b/third_party/llvm/workspace.bzl
@@ -4,8 +4,8 @@ load("//third_party:repo.bzl", "tf_http_archive")
 
 def repo(name):
     """Imports LLVM."""
-    LLVM_COMMIT = "ebefe83c092e41d243829ab812bb650674e2f3d2"
-    LLVM_SHA256 = "0cac9b05231cd3f0f4efb29fad98ef9b6eb9f01c9c99d016a93764033a603426"
+    LLVM_COMMIT = "3b0f812b9af459c4f857e4a7ffffa01f7a21446e"
+    LLVM_SHA256 = "47f8c81275437fb4ebcf0286125d683fb946cbddf0b725dc61d786141b32bc08"
 
     tf_http_archive(
         name = name,

From 89b9d8d33903f0b448f543af7cc4e6030bd4e44d Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 22 Sep 2023 04:28:51 -0700
Subject: [PATCH 136/567] Update TFRT dependency to use revision
 http://github.com/tensorflow/runtime/commit/e2e1f9f197af799cbf558eacd26221695a7971ce.

PiperOrigin-RevId: 567584745
---
 third_party/tf_runtime/workspace.bzl                          | 4 ++--
 third_party/xla/third_party/tf_runtime/workspace.bzl          | 4 ++--
 .../xla/third_party/tsl/third_party/tf_runtime/workspace.bzl  | 4 ++--
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/third_party/tf_runtime/workspace.bzl b/third_party/tf_runtime/workspace.bzl
index 5b50da65a5d4de..3671380535a7b8 100644
--- a/third_party/tf_runtime/workspace.bzl
+++ b/third_party/tf_runtime/workspace.bzl
@@ -6,8 +6,8 @@ def repo():
     """Imports TFRT."""
 
     # Attention: tools parse and update these lines.
-    TFRT_COMMIT = "73bfccd957234f8cc02c69dc50078288b6d4db8c"
-    TFRT_SHA256 = "2ae79e0aa046864afc9b43a23ee077c52e2f91a192e62f6ca8e82a68a206f116"
+    TFRT_COMMIT = "e2e1f9f197af799cbf558eacd26221695a7971ce"
+    TFRT_SHA256 = "ef6ca7d0ab5fce018a8ae64500de96dffd838d1ed4bef514d797ba2b99bd8908"
 
     tf_http_archive(
         name = "tf_runtime",
diff --git a/third_party/xla/third_party/tf_runtime/workspace.bzl b/third_party/xla/third_party/tf_runtime/workspace.bzl
index 5b50da65a5d4de..3671380535a7b8 100644
--- a/third_party/xla/third_party/tf_runtime/workspace.bzl
+++ b/third_party/xla/third_party/tf_runtime/workspace.bzl
@@ -6,8 +6,8 @@ def repo():
     """Imports TFRT."""
 
     # Attention: tools parse and update these lines.
-    TFRT_COMMIT = "73bfccd957234f8cc02c69dc50078288b6d4db8c"
-    TFRT_SHA256 = "2ae79e0aa046864afc9b43a23ee077c52e2f91a192e62f6ca8e82a68a206f116"
+    TFRT_COMMIT = "e2e1f9f197af799cbf558eacd26221695a7971ce"
+    TFRT_SHA256 = "ef6ca7d0ab5fce018a8ae64500de96dffd838d1ed4bef514d797ba2b99bd8908"
 
     tf_http_archive(
         name = "tf_runtime",
diff --git a/third_party/xla/third_party/tsl/third_party/tf_runtime/workspace.bzl b/third_party/xla/third_party/tsl/third_party/tf_runtime/workspace.bzl
index 5b50da65a5d4de..3671380535a7b8 100644
--- a/third_party/xla/third_party/tsl/third_party/tf_runtime/workspace.bzl
+++ b/third_party/xla/third_party/tsl/third_party/tf_runtime/workspace.bzl
@@ -6,8 +6,8 @@ def repo():
     """Imports TFRT."""
 
     # Attention: tools parse and update these lines.
-    TFRT_COMMIT = "73bfccd957234f8cc02c69dc50078288b6d4db8c"
-    TFRT_SHA256 = "2ae79e0aa046864afc9b43a23ee077c52e2f91a192e62f6ca8e82a68a206f116"
+    TFRT_COMMIT = "e2e1f9f197af799cbf558eacd26221695a7971ce"
+    TFRT_SHA256 = "ef6ca7d0ab5fce018a8ae64500de96dffd838d1ed4bef514d797ba2b99bd8908"
 
     tf_http_archive(
         name = "tf_runtime",

From 422d09c0badb4bc56fb554a8a50e7e7346a2c972 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tam=C3=A1s=20Danyluk?= <tdanyluk@google.com>
Date: Fri, 22 Sep 2023 04:45:43 -0700
Subject: [PATCH 137/567] [XLA:GPU][NFC] Always profile the cuBLAS version of
 GEMMs in TritonAutotuner

Even if no validation is requested.
This is a preparation for a later CL.

PiperOrigin-RevId: 567587347
---
 .../xla/xla/service/gpu/triton_autotuner.cc   | 50 ++++++++++---------
 1 file changed, 26 insertions(+), 24 deletions(-)

diff --git a/third_party/xla/xla/service/gpu/triton_autotuner.cc b/third_party/xla/xla/service/gpu/triton_autotuner.cc
index 96ed7527326acb..fd5d77258cb907 100644
--- a/third_party/xla/xla/service/gpu/triton_autotuner.cc
+++ b/third_party/xla/xla/service/gpu/triton_autotuner.cc
@@ -165,7 +165,7 @@ struct ExecutableCandidate {
 // This contains all alternative executables related to one fusion.
 struct ExecutableSet {
   std::vector<ExecutableCandidate> candidates;
-  // This is nullptr iff correctness check is disabled.
+  // Not nullptr.
   std::unique_ptr<Executable> reference;
 };
 
@@ -397,9 +397,8 @@ CompileMany(const AutotuneConfig& config, AutotunerCompileUtil& util,
     const GemmConfigSet& gemm_config_set = key_value.second;
     config_count += gemm_config_set.configs.size();
   }
-  if (config.should_check_correctness()) {
-    config_count += gemm_config_sets.size();
-  }
+  // The cuBLAS configs:
+  config_count += gemm_config_sets.size();
 
   std::atomic<int> done_count = 0;
   std::atomic<int> good_count = 0;
@@ -490,14 +489,12 @@ CompileMany(const AutotuneConfig& config, AutotunerCompileUtil& util,
         });
       }
 
-      if (config.should_check_correctness()) {
-        thread_pool->Schedule([&, fusion] {
-          StatusOr<bool> has_executable = compile_reference_executable(fusion);
-          TF_CHECK_OK(has_executable.status());
-          log(has_executable.value());
-          counter.DecrementCount();
-        });
-      }
+      thread_pool->Schedule([&, fusion] {
+        StatusOr<bool> has_executable = compile_reference_executable(fusion);
+        TF_CHECK_OK(has_executable.status());
+        log(has_executable.value());
+        counter.DecrementCount();
+      });
     }
     counter.Wait();
   } else {
@@ -521,11 +518,9 @@ CompileMany(const AutotuneConfig& config, AutotunerCompileUtil& util,
         log(has_executable);
       }
 
-      if (config.should_check_correctness()) {
-        TF_ASSIGN_OR_RETURN(bool has_executable,
-                            compile_reference_executable(fusion));
-        log(has_executable);
-      }
+      TF_ASSIGN_OR_RETURN(bool has_executable,
+                          compile_reference_executable(fusion));
+      log(has_executable);
     }
   }
 
@@ -534,9 +529,9 @@ CompileMany(const AutotuneConfig& config, AutotunerCompileUtil& util,
   return executable_sets;
 }
 
-// Runs matmul fusion contents without Triton - with cuBLAS, to generate
-// a reference output.
-StatusOr<ScopedShapedBuffer> RunMatmulWithCublas(
+// Runs matmul fusion contents without Triton - with cuBLAS, to measure time and
+// generate a reference output.
+StatusOr<ProfilingOutput> RunMatmulWithCublas(
     AutotunerCompileUtil& util, se::Stream* stream, Executable& executable,
     absl::Span<se::DeviceMemoryBase const> input_buffers,
     absl::Span<Shape const> input_shapes) {
@@ -544,7 +539,7 @@ StatusOr<ScopedShapedBuffer> RunMatmulWithCublas(
       std::optional<ProfilingOutput> output,
       util.ProfileExecutable(&executable, stream, input_buffers, input_shapes));
   TF_RET_CHECK(output.has_value());
-  return std::move(output->output);
+  return std::move(output.value());
 }
 
 StatusOr<AutotuneResult> Execute(const AutotuneConfig& config,
@@ -569,7 +564,6 @@ StatusOr<AutotuneResult> Execute(const AutotuneConfig& config,
       se::RedzoneAllocator rz_allocator,
       AutotunerUtil::CreateRedzoneAllocator(config, debug_opts));
 
-  std::optional<ScopedShapedBuffer> reference_buffer;
   const HloInstruction& root = *fusion_computation->root_instruction();
   BufferComparator comparator(root.shape(),
                               fusion_computation->parent()->config());
@@ -588,13 +582,21 @@ StatusOr<AutotuneResult> Execute(const AutotuneConfig& config,
     input_shapes.push_back(param->shape());
   }
 
-  if (config.should_check_correctness()) {
+  // Run with cuBLAS.
+  std::optional<ScopedShapedBuffer> reference_buffer;
+  absl::Duration cublas_duration;
+  {
     TF_RET_CHECK(executable_set.reference != nullptr);
     TF_ASSIGN_OR_RETURN(
-        reference_buffer,
+        ProfilingOutput output,
         RunMatmulWithCublas(util, stream, *executable_set.reference, inputs,
                             input_shapes));
+    if (config.should_check_correctness()) {
+      reference_buffer = std::move(output.output);
+    }
+    cublas_duration = output.duration;
   }
+  VLOG(3) << "Running with cuBLAS took: " << cublas_duration;
 
   const int log_every_n = GetLogEveryN();
   int64_t executable_count =

From 515d795bae6b706719cc301655957c94d621d3d5 Mon Sep 17 00:00:00 2001
From: Ilia Sergachev <sergachev@google.com>
Date: Fri, 22 Sep 2023 05:31:23 -0700
Subject: [PATCH 138/567] [XLA:GPU] Re-add logging of Triton IR.

cl/560999729 replaced logging with dumps but dumping is not enabled on autotuning variants and thus does not replace logging for cases when autotuned variants crash.

PiperOrigin-RevId: 567594816
---
 third_party/xla/xla/service/gpu/ir_emitter_triton.cc | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/third_party/xla/xla/service/gpu/ir_emitter_triton.cc b/third_party/xla/xla/service/gpu/ir_emitter_triton.cc
index d431f1349c294f..70457113f8be7b 100644
--- a/third_party/xla/xla/service/gpu/ir_emitter_triton.cc
+++ b/third_party/xla/xla/service/gpu/ir_emitter_triton.cc
@@ -1661,6 +1661,8 @@ StatusOr<TritonWrapperResult> TritonWrapper(
                                 device_info.shared_memory_per_block_optin));
 
   b.create<mt::ReturnOp>(loc);
+
+  VLOG(6) << llvm_ir::DumpToString(*triton_module);
   if (DumpingEnabledForHloModule(*hlo_computation->parent())) {
     DumpToFileInDirOrStdout(*hlo_computation->parent(), "triton_ir", "ttir",
                             llvm_ir::DumpToString(*triton_module));

From fad56f6ce87d25e05b43bae1c32d1fff86a95dd0 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 22 Sep 2023 05:56:34 -0700
Subject: [PATCH 139/567] Remove clang16 toolchain and rename clang17 toolchain

The upgrade to LLVM-17 is finished, so we can remove the old clang16 based
toolchain and rename the LLVM17 toolchain.

PiperOrigin-RevId: 567598888
---
 .bazelrc                                      | 24 +++++-----
 ci/official/bazelrcs/cpu.bazelrc              | 16 +++----
 ci/official/bazelrcs/cuda.bazelrc             | 24 +++++-----
 .../toolchains/remote_config/configs.bzl      | 47 ++-----------------
 third_party/xla/.bazelrc                      | 24 +++++-----
 third_party/xla/third_party/tsl/.bazelrc      | 24 +++++-----
 .../toolchains/remote_config/configs.bzl      | 47 ++-----------------
 .../toolchains/remote_config/configs.bzl      | 47 ++-----------------
 8 files changed, 68 insertions(+), 185 deletions(-)

diff --git a/.bazelrc b/.bazelrc
index 475a1c5378cdfe..98eff1ea726464 100644
--- a/.bazelrc
+++ b/.bazelrc
@@ -248,7 +248,7 @@ build:cuda_clang_official --action_env=CUDA_TOOLKIT_PATH="/usr/local/cuda-12.2"
 build:cuda_clang_official --action_env=GCC_HOST_COMPILER_PATH="/dt9/usr/bin/gcc"
 build:cuda_clang_official --action_env=CLANG_CUDA_COMPILER_PATH="/usr/lib/llvm-17/bin/clang"
 build:cuda_clang_official --action_env=LD_LIBRARY_PATH="/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64"
-build:cuda_clang_official --crosstool_top="@sigbuild-r2.14-clang17_config_cuda//crosstool:toolchain"
+build:cuda_clang_official --crosstool_top="@sigbuild-r2.14-clang_config_cuda//crosstool:toolchain"
 
 # Debug config
 build:dbg -c dbg
@@ -457,12 +457,12 @@ build:rbe_linux --host_linkopt=-lm
 
 build:rbe_linux_cpu --config=rbe_linux
 # Linux cpu and cuda builds share the same toolchain now.
-build:rbe_linux_cpu --host_crosstool_top="@sigbuild-r2.14-clang17_config_cuda//crosstool:toolchain"
-build:rbe_linux_cpu --crosstool_top="@sigbuild-r2.14-clang17_config_cuda//crosstool:toolchain"
-build:rbe_linux_cpu --extra_toolchains="@sigbuild-r2.14-clang17_config_cuda//crosstool:toolchain-linux-x86_64"
-build:rbe_linux_cpu --extra_execution_platforms="@sigbuild-r2.14-clang17_config_platform//:platform"
-build:rbe_linux_cpu --host_platform="@sigbuild-r2.14-clang17_config_platform//:platform"
-build:rbe_linux_cpu --platforms="@sigbuild-r2.14-clang17_config_platform//:platform"
+build:rbe_linux_cpu --host_crosstool_top="@sigbuild-r2.14-clang_config_cuda//crosstool:toolchain"
+build:rbe_linux_cpu --crosstool_top="@sigbuild-r2.14-clang_config_cuda//crosstool:toolchain"
+build:rbe_linux_cpu --extra_toolchains="@sigbuild-r2.14-clang_config_cuda//crosstool:toolchain-linux-x86_64"
+build:rbe_linux_cpu --extra_execution_platforms="@sigbuild-r2.14-clang_config_platform//:platform"
+build:rbe_linux_cpu --host_platform="@sigbuild-r2.14-clang_config_platform//:platform"
+build:rbe_linux_cpu --platforms="@sigbuild-r2.14-clang_config_platform//:platform"
 # This is needed for all Clang17 builds but must not be present in GCC builds.
 build:rbe_linux_cpu --copt=-Wno-error=unused-command-line-argument
 # This was added in clang-16 by https://reviews.llvm.org/D133574.
@@ -471,7 +471,7 @@ build:rbe_linux_cpu --copt=-Wno-error=unused-command-line-argument
 # See https://github.com/protocolbuffers/upb/blob/9effcbcb27f0a665f9f345030188c0b291e32482/upb/upb.c#L183.
 build:rbe_linux_cpu --copt=-Wno-gnu-offsetof-extensions
 # Python config is the same across all containers because the binary is the same
-build:rbe_linux_cpu --repo_env=TF_PYTHON_CONFIG_REPO="@sigbuild-r2.14-clang17_config_python"
+build:rbe_linux_cpu --repo_env=TF_PYTHON_CONFIG_REPO="@sigbuild-r2.14-clang_config_python"
 build:rbe_linux_cpu --python_path="/usr/bin/python3"
 # These you may need to change for your own GCP project.
 common:rbe_linux_cpu --remote_instance_name=projects/tensorflow-testing/instances/default_instance
@@ -494,9 +494,9 @@ build:rbe_linux_cuda --config=cuda_clang_official
 build:rbe_linux_cuda --config=rbe_linux_cpu
 # For Remote build execution -- GPU configuration
 build:rbe_linux_cuda --repo_env=REMOTE_GPU_TESTING=1
-build:rbe_linux_cuda --repo_env=TF_CUDA_CONFIG_REPO="@sigbuild-r2.14-clang17_config_cuda"
-build:rbe_linux_cuda --repo_env=TF_TENSORRT_CONFIG_REPO="@sigbuild-r2.14-clang17_config_tensorrt"
-build:rbe_linux_cuda --repo_env=TF_NCCL_CONFIG_REPO="@sigbuild-r2.14-clang17_config_nccl"
+build:rbe_linux_cuda --repo_env=TF_CUDA_CONFIG_REPO="@sigbuild-r2.14-clang_config_cuda"
+build:rbe_linux_cuda --repo_env=TF_TENSORRT_CONFIG_REPO="@sigbuild-r2.14-clang_config_tensorrt"
+build:rbe_linux_cuda --repo_env=TF_NCCL_CONFIG_REPO="@sigbuild-r2.14-clang_config_nccl"
 test:rbe_linux_cuda --test_env=LD_LIBRARY_PATH="/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64"
 
 # TODO(kanglan): Remove rbe_win and rbe_win_py3* after b/289091160 is fixed
@@ -558,7 +558,7 @@ test:release_base --test_size_filters=small,medium
 # Target the AVX instruction set
 build:release_cpu_linux --config=avx_linux
 # Use the Clang toolchain to compile
-build:release_cpu_linux --crosstool_top="@sigbuild-r2.14-clang17_config_cuda//crosstool:toolchain"
+build:release_cpu_linux --crosstool_top="@sigbuild-r2.14-clang_config_cuda//crosstool:toolchain"
 # Disable clang extention that rejects type definitions within offsetof.
 # This was added in clang-16 by https://reviews.llvm.org/D133574.
 # Can be removed once upb is updated, since a type definition is used within
diff --git a/ci/official/bazelrcs/cpu.bazelrc b/ci/official/bazelrcs/cpu.bazelrc
index 096893bdc096c0..43c951e3532466 100644
--- a/ci/official/bazelrcs/cpu.bazelrc
+++ b/ci/official/bazelrcs/cpu.bazelrc
@@ -49,7 +49,7 @@ build --linkopt="-lm"
 build --copt=-Wno-gnu-offsetof-extensions
 
 # Use the NVCC toolchain to compile for manylinux2014
-build --crosstool_top="@sigbuild-r2.14-clang17_config_cuda//crosstool:toolchain"
+build --crosstool_top="@sigbuild-r2.14-clang_config_cuda//crosstool:toolchain"
 
 # Test-related settings below this point.
 test --build_tests_only --keep_going --test_output=errors --verbose_failures=true
@@ -88,14 +88,14 @@ build:rbe --spawn_strategy=remote,worker,standalone,local
 build:rbe --remote_download_toplevel
 build:rbe --action_env=PATH="/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/go/bin"
 build:rbe --linkopt=-lrt --host_linkopt=-lrt --linkopt=-lm --host_linkopt=-lm  # Unclear why this is here
-build:rbe --host_crosstool_top="@sigbuild-r2.14-clang17_config_cuda//crosstool:toolchain"
-build:rbe --crosstool_top="@sigbuild-r2.14-clang17_config_cuda//crosstool:toolchain"
-build:rbe --extra_toolchains="@sigbuild-r2.14-clang17_config_cuda//crosstool:toolchain-linux-x86_64"
-build:rbe --extra_execution_platforms="@sigbuild-r2.14-clang17_config_platform//:platform"
-build:rbe --host_platform="@sigbuild-r2.14-clang17_config_platform//:platform"
-build:rbe --platforms="@sigbuild-r2.14-clang17_config_platform//:platform"
+build:rbe --host_crosstool_top="@sigbuild-r2.14-clang_config_cuda//crosstool:toolchain"
+build:rbe --crosstool_top="@sigbuild-r2.14-clang_config_cuda//crosstool:toolchain"
+build:rbe --extra_toolchains="@sigbuild-r2.14-clang_config_cuda//crosstool:toolchain-linux-x86_64"
+build:rbe --extra_execution_platforms="@sigbuild-r2.14-clang_config_platform//:platform"
+build:rbe --host_platform="@sigbuild-r2.14-clang_config_platform//:platform"
+build:rbe --platforms="@sigbuild-r2.14-clang_config_platform//:platform"
 # Python config is the same across all containers because the binary is the same
-build:rbe --repo_env=TF_PYTHON_CONFIG_REPO="@sigbuild-r2.14-clang17_config_python"
+build:rbe --repo_env=TF_PYTHON_CONFIG_REPO="@sigbuild-r2.14-clang_config_python"
 build:rbe --remote_instance_name=projects/tensorflow-testing/instances/default_instance
 
 # For continuous builds
diff --git a/ci/official/bazelrcs/cuda.bazelrc b/ci/official/bazelrcs/cuda.bazelrc
index 14f4c4b96d0b6e..1bb260687d35fa 100644
--- a/ci/official/bazelrcs/cuda.bazelrc
+++ b/ci/official/bazelrcs/cuda.bazelrc
@@ -60,7 +60,7 @@ build --action_env=GCC_HOST_COMPILER_PATH="/dt9/usr/bin/gcc"
 build --action_env=CLANG_CUDA_COMPILER_PATH="/usr/lib/llvm-17/bin/clang"
 build --action_env=TF_CUDA_CLANG="1"
 build --action_env=LD_LIBRARY_PATH="/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64:/usr/local/tensorrt/lib"
-build --crosstool_top="@sigbuild-r2.14-clang17_config_cuda//crosstool:toolchain"
+build --crosstool_top="@sigbuild-r2.14-clang_config_cuda//crosstool:toolchain"
 
 # CUDA: Enable TensorRT optimizations
 # https://developer.nvidia.com/tensorrt
@@ -111,24 +111,24 @@ build:rbe --spawn_strategy=remote,worker,standalone,local
 build:rbe --remote_download_toplevel
 build:rbe --action_env=PATH="/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/go/bin"
 build:rbe --linkopt=-lrt --host_linkopt=-lrt --linkopt=-lm --host_linkopt=-lm  # Unclear why this is here
-build:rbe --host_crosstool_top="@sigbuild-r2.14-clang17_config_cuda//crosstool:toolchain"
-build:rbe --crosstool_top="@sigbuild-r2.14-clang17_config_cuda//crosstool:toolchain"
-build:rbe --extra_toolchains="@sigbuild-r2.14-clang17_config_cuda//crosstool:toolchain-linux-x86_64"
-build:rbe --extra_execution_platforms="@sigbuild-r2.14-clang17_config_platform//:platform"
-build:rbe --host_platform="@sigbuild-r2.14-clang17_config_platform//:platform"
-build:rbe --platforms="@sigbuild-r2.14-clang17_config_platform//:platform"
+build:rbe --host_crosstool_top="@sigbuild-r2.14-clang_config_cuda//crosstool:toolchain"
+build:rbe --crosstool_top="@sigbuild-r2.14-clang_config_cuda//crosstool:toolchain"
+build:rbe --extra_toolchains="@sigbuild-r2.14-clang_config_cuda//crosstool:toolchain-linux-x86_64"
+build:rbe --extra_execution_platforms="@sigbuild-r2.14-clang_config_platform//:platform"
+build:rbe --host_platform="@sigbuild-r2.14-clang_config_platform//:platform"
+build:rbe --platforms="@sigbuild-r2.14-clang_config_platform//:platform"
 # Python config is the same across all containers because the binary is the same
-build:rbe --repo_env=TF_PYTHON_CONFIG_REPO="@sigbuild-r2.14-clang17_config_python"
+build:rbe --repo_env=TF_PYTHON_CONFIG_REPO="@sigbuild-r2.14-clang_config_python"
 build:rbe --remote_instance_name=projects/tensorflow-testing/instances/default_instance
 build:rbe --project_id="tensorflow-testing"
 
 # For Remote build execution -- GPU configuration
 build:rbe --repo_env=REMOTE_GPU_TESTING=1
 test:rbe --test_env=LD_LIBRARY_PATH="/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64"
-build:rbe --repo_env=TF_CUDA_CONFIG_REPO="@sigbuild-r2.14-clang17_config_cuda"
-build:rbe --repo_env=TF_TENSORRT_CONFIG_REPO="@sigbuild-r2.14-clang17_config_tensorrt"
-build:rbe --repo_env=TF_NCCL_CONFIG_REPO="@sigbuild-r2.14-clang17_config_nccl"
-build:rbe --repo_env=TF_PYTHON_CONFIG_REPO="@sigbuild-r2.14-clang17_config_python"
+build:rbe --repo_env=TF_CUDA_CONFIG_REPO="@sigbuild-r2.14-clang_config_cuda"
+build:rbe --repo_env=TF_TENSORRT_CONFIG_REPO="@sigbuild-r2.14-clang_config_tensorrt"
+build:rbe --repo_env=TF_NCCL_CONFIG_REPO="@sigbuild-r2.14-clang_config_nccl"
+build:rbe --repo_env=TF_PYTHON_CONFIG_REPO="@sigbuild-r2.14-clang_config_python"
 
 # For continuous builds
 test:pycpp_filters --test_tag_filters=-no_oss,-oss_excluded,-oss_serial,-benchmark-test,-v1only,gpu,-no_gpu,-no_gpu_presubmit,-no_cuda11
diff --git a/tensorflow/tools/toolchains/remote_config/configs.bzl b/tensorflow/tools/toolchains/remote_config/configs.bzl
index a1fd875f4bd5e5..6e37be2447c0ec 100644
--- a/tensorflow/tools/toolchains/remote_config/configs.bzl
+++ b/tensorflow/tools/toolchains/remote_config/configs.bzl
@@ -620,49 +620,10 @@ def initialize_rbe_configs():
 
     sigbuild_tf_configs(
         name_container_map = {
-            "sigbuild-r2.14-clang": "docker://gcr.io/tensorflow-sigs/build@sha256:c03809a6b4008b430bf241efce78cdcd92c7bc41d11d0ba57216e97d813ac282",
-            "sigbuild-r2.14-clang-python3.8": "docker://gcr.io/tensorflow-sigs/build@sha256:c46d275e5bc760b7af465dc063629b234cfa34aabf0c7fe30581effc0b99648a",
-            "sigbuild-r2.14-clang-python3.9": "docker://gcr.io/tensorflow-sigs/build@sha256:c03809a6b4008b430bf241efce78cdcd92c7bc41d11d0ba57216e97d813ac282",
-            "sigbuild-r2.14-clang-python3.10": "docker://gcr.io/tensorflow-sigs/build@sha256:06b3a97ef247dbb00a9c6d8315e4e035d891ae2f18088de254f15d6ecedadfb9",
-            "sigbuild-r2.14-clang-python3.11": "docker://gcr.io/tensorflow-sigs/build@sha256:00dc9e13130727dcdeb54ca77423e317a79aae84d5783c05b38b7bbdf753f0f6",
-        },
-        # Unclear why LIBC is set to 2.19 here, and yet manylinux2010 is 2.12
-        # and manylinux2014 is 2.17.
-        env = {
-            "ABI_LIBC_VERSION": "glibc_2.19",
-            "ABI_VERSION": "gcc",
-            "BAZEL_COMPILER": "/usr/lib/llvm-16/bin/clang",
-            "BAZEL_HOST_SYSTEM": "i686-unknown-linux-gnu",
-            "BAZEL_TARGET_CPU": "k8",
-            "BAZEL_TARGET_LIBC": "glibc_2.19",
-            "BAZEL_TARGET_SYSTEM": "x86_64-unknown-linux-gnu",
-            "CC": "/usr/lib/llvm-16/bin/clang",
-            "CC_TOOLCHAIN_NAME": "linux_gnu_x86",
-            "CLEAR_CACHE": "1",
-            "CUDNN_INSTALL_PATH": "/usr/lib/x86_64-linux-gnu",
-            "CLANG_CUDA_COMPILER_PATH": "/usr/lib/llvm-16/bin/clang",
-            "HOST_CXX_COMPILER": "/usr/lib/llvm-16/bin/clang",
-            "HOST_C_COMPILER": "/usr/lib/llvm-16/bin/clang",
-            "PYTHON_BIN_PATH": "/usr/bin/python3",
-            "TENSORRT_INSTALL_PATH": "/usr/lib/x86_64-linux-gnu",
-            "TF_CUDA_CLANG": "1",
-            "TF_CUDA_COMPUTE_CAPABILITIES": "3.5,6.0",
-            "TF_CUDA_VERSION": "11.2",
-            "TF_CUDNN_VERSION": "8.1",
-            "TF_ENABLE_XLA": "1",
-            "TF_NEED_CUDA": "1",
-            "TF_NEED_TENSORRT": "1",
-            "TF_SYSROOT": "/dt9",
-            "TF_TENSORRT_VERSION": "7.2",
-        },
-    )
-
-    sigbuild_tf_configs(
-        name_container_map = {
-            "sigbuild-r2.14-clang17": "docker://gcr.io/tensorflow-sigs/build@sha256:edf0324686ffbf488ed0a6b7d3a29ecc6f7b7ad9c04b2e1a527a6a47c0dc1b4b",
-            "sigbuild-r2.14-clang17-python3.9": "docker://gcr.io/tensorflow-sigs/build@sha256:edf0324686ffbf488ed0a6b7d3a29ecc6f7b7ad9c04b2e1a527a6a47c0dc1b4b",
-            "sigbuild-r2.14-clang17-python3.10": "docker://gcr.io/tensorflow-sigs/build@sha256:5b06acad335d5c24aa2f63d39d9de230affd04aa982dda0242eeb893b9dae363",
-            "sigbuild-r2.14-clang17-python3.11": "docker://gcr.io/tensorflow-sigs/build@sha256:80d991d5cf0ac710b568c71c7790270691749afa19de49d7c1a121ba3f92cd58",
+            "sigbuild-r2.14-clang": "docker://gcr.io/tensorflow-sigs/build@sha256:edf0324686ffbf488ed0a6b7d3a29ecc6f7b7ad9c04b2e1a527a6a47c0dc1b4b",
+            "sigbuild-r2.14-clang-python3.9": "docker://gcr.io/tensorflow-sigs/build@sha256:edf0324686ffbf488ed0a6b7d3a29ecc6f7b7ad9c04b2e1a527a6a47c0dc1b4b",
+            "sigbuild-r2.14-clang-python3.10": "docker://gcr.io/tensorflow-sigs/build@sha256:5b06acad335d5c24aa2f63d39d9de230affd04aa982dda0242eeb893b9dae363",
+            "sigbuild-r2.14-clang-python3.11": "docker://gcr.io/tensorflow-sigs/build@sha256:80d991d5cf0ac710b568c71c7790270691749afa19de49d7c1a121ba3f92cd58",
         },
         # Unclear why LIBC is set to 2.19 here, and yet manylinux2010 is 2.12
         # and manylinux2014 is 2.17.
diff --git a/third_party/xla/.bazelrc b/third_party/xla/.bazelrc
index 475a1c5378cdfe..98eff1ea726464 100644
--- a/third_party/xla/.bazelrc
+++ b/third_party/xla/.bazelrc
@@ -248,7 +248,7 @@ build:cuda_clang_official --action_env=CUDA_TOOLKIT_PATH="/usr/local/cuda-12.2"
 build:cuda_clang_official --action_env=GCC_HOST_COMPILER_PATH="/dt9/usr/bin/gcc"
 build:cuda_clang_official --action_env=CLANG_CUDA_COMPILER_PATH="/usr/lib/llvm-17/bin/clang"
 build:cuda_clang_official --action_env=LD_LIBRARY_PATH="/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64"
-build:cuda_clang_official --crosstool_top="@sigbuild-r2.14-clang17_config_cuda//crosstool:toolchain"
+build:cuda_clang_official --crosstool_top="@sigbuild-r2.14-clang_config_cuda//crosstool:toolchain"
 
 # Debug config
 build:dbg -c dbg
@@ -457,12 +457,12 @@ build:rbe_linux --host_linkopt=-lm
 
 build:rbe_linux_cpu --config=rbe_linux
 # Linux cpu and cuda builds share the same toolchain now.
-build:rbe_linux_cpu --host_crosstool_top="@sigbuild-r2.14-clang17_config_cuda//crosstool:toolchain"
-build:rbe_linux_cpu --crosstool_top="@sigbuild-r2.14-clang17_config_cuda//crosstool:toolchain"
-build:rbe_linux_cpu --extra_toolchains="@sigbuild-r2.14-clang17_config_cuda//crosstool:toolchain-linux-x86_64"
-build:rbe_linux_cpu --extra_execution_platforms="@sigbuild-r2.14-clang17_config_platform//:platform"
-build:rbe_linux_cpu --host_platform="@sigbuild-r2.14-clang17_config_platform//:platform"
-build:rbe_linux_cpu --platforms="@sigbuild-r2.14-clang17_config_platform//:platform"
+build:rbe_linux_cpu --host_crosstool_top="@sigbuild-r2.14-clang_config_cuda//crosstool:toolchain"
+build:rbe_linux_cpu --crosstool_top="@sigbuild-r2.14-clang_config_cuda//crosstool:toolchain"
+build:rbe_linux_cpu --extra_toolchains="@sigbuild-r2.14-clang_config_cuda//crosstool:toolchain-linux-x86_64"
+build:rbe_linux_cpu --extra_execution_platforms="@sigbuild-r2.14-clang_config_platform//:platform"
+build:rbe_linux_cpu --host_platform="@sigbuild-r2.14-clang_config_platform//:platform"
+build:rbe_linux_cpu --platforms="@sigbuild-r2.14-clang_config_platform//:platform"
 # This is needed for all Clang17 builds but must not be present in GCC builds.
 build:rbe_linux_cpu --copt=-Wno-error=unused-command-line-argument
 # This was added in clang-16 by https://reviews.llvm.org/D133574.
@@ -471,7 +471,7 @@ build:rbe_linux_cpu --copt=-Wno-error=unused-command-line-argument
 # See https://github.com/protocolbuffers/upb/blob/9effcbcb27f0a665f9f345030188c0b291e32482/upb/upb.c#L183.
 build:rbe_linux_cpu --copt=-Wno-gnu-offsetof-extensions
 # Python config is the same across all containers because the binary is the same
-build:rbe_linux_cpu --repo_env=TF_PYTHON_CONFIG_REPO="@sigbuild-r2.14-clang17_config_python"
+build:rbe_linux_cpu --repo_env=TF_PYTHON_CONFIG_REPO="@sigbuild-r2.14-clang_config_python"
 build:rbe_linux_cpu --python_path="/usr/bin/python3"
 # These you may need to change for your own GCP project.
 common:rbe_linux_cpu --remote_instance_name=projects/tensorflow-testing/instances/default_instance
@@ -494,9 +494,9 @@ build:rbe_linux_cuda --config=cuda_clang_official
 build:rbe_linux_cuda --config=rbe_linux_cpu
 # For Remote build execution -- GPU configuration
 build:rbe_linux_cuda --repo_env=REMOTE_GPU_TESTING=1
-build:rbe_linux_cuda --repo_env=TF_CUDA_CONFIG_REPO="@sigbuild-r2.14-clang17_config_cuda"
-build:rbe_linux_cuda --repo_env=TF_TENSORRT_CONFIG_REPO="@sigbuild-r2.14-clang17_config_tensorrt"
-build:rbe_linux_cuda --repo_env=TF_NCCL_CONFIG_REPO="@sigbuild-r2.14-clang17_config_nccl"
+build:rbe_linux_cuda --repo_env=TF_CUDA_CONFIG_REPO="@sigbuild-r2.14-clang_config_cuda"
+build:rbe_linux_cuda --repo_env=TF_TENSORRT_CONFIG_REPO="@sigbuild-r2.14-clang_config_tensorrt"
+build:rbe_linux_cuda --repo_env=TF_NCCL_CONFIG_REPO="@sigbuild-r2.14-clang_config_nccl"
 test:rbe_linux_cuda --test_env=LD_LIBRARY_PATH="/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64"
 
 # TODO(kanglan): Remove rbe_win and rbe_win_py3* after b/289091160 is fixed
@@ -558,7 +558,7 @@ test:release_base --test_size_filters=small,medium
 # Target the AVX instruction set
 build:release_cpu_linux --config=avx_linux
 # Use the Clang toolchain to compile
-build:release_cpu_linux --crosstool_top="@sigbuild-r2.14-clang17_config_cuda//crosstool:toolchain"
+build:release_cpu_linux --crosstool_top="@sigbuild-r2.14-clang_config_cuda//crosstool:toolchain"
 # Disable clang extention that rejects type definitions within offsetof.
 # This was added in clang-16 by https://reviews.llvm.org/D133574.
 # Can be removed once upb is updated, since a type definition is used within
diff --git a/third_party/xla/third_party/tsl/.bazelrc b/third_party/xla/third_party/tsl/.bazelrc
index 475a1c5378cdfe..98eff1ea726464 100644
--- a/third_party/xla/third_party/tsl/.bazelrc
+++ b/third_party/xla/third_party/tsl/.bazelrc
@@ -248,7 +248,7 @@ build:cuda_clang_official --action_env=CUDA_TOOLKIT_PATH="/usr/local/cuda-12.2"
 build:cuda_clang_official --action_env=GCC_HOST_COMPILER_PATH="/dt9/usr/bin/gcc"
 build:cuda_clang_official --action_env=CLANG_CUDA_COMPILER_PATH="/usr/lib/llvm-17/bin/clang"
 build:cuda_clang_official --action_env=LD_LIBRARY_PATH="/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64"
-build:cuda_clang_official --crosstool_top="@sigbuild-r2.14-clang17_config_cuda//crosstool:toolchain"
+build:cuda_clang_official --crosstool_top="@sigbuild-r2.14-clang_config_cuda//crosstool:toolchain"
 
 # Debug config
 build:dbg -c dbg
@@ -457,12 +457,12 @@ build:rbe_linux --host_linkopt=-lm
 
 build:rbe_linux_cpu --config=rbe_linux
 # Linux cpu and cuda builds share the same toolchain now.
-build:rbe_linux_cpu --host_crosstool_top="@sigbuild-r2.14-clang17_config_cuda//crosstool:toolchain"
-build:rbe_linux_cpu --crosstool_top="@sigbuild-r2.14-clang17_config_cuda//crosstool:toolchain"
-build:rbe_linux_cpu --extra_toolchains="@sigbuild-r2.14-clang17_config_cuda//crosstool:toolchain-linux-x86_64"
-build:rbe_linux_cpu --extra_execution_platforms="@sigbuild-r2.14-clang17_config_platform//:platform"
-build:rbe_linux_cpu --host_platform="@sigbuild-r2.14-clang17_config_platform//:platform"
-build:rbe_linux_cpu --platforms="@sigbuild-r2.14-clang17_config_platform//:platform"
+build:rbe_linux_cpu --host_crosstool_top="@sigbuild-r2.14-clang_config_cuda//crosstool:toolchain"
+build:rbe_linux_cpu --crosstool_top="@sigbuild-r2.14-clang_config_cuda//crosstool:toolchain"
+build:rbe_linux_cpu --extra_toolchains="@sigbuild-r2.14-clang_config_cuda//crosstool:toolchain-linux-x86_64"
+build:rbe_linux_cpu --extra_execution_platforms="@sigbuild-r2.14-clang_config_platform//:platform"
+build:rbe_linux_cpu --host_platform="@sigbuild-r2.14-clang_config_platform//:platform"
+build:rbe_linux_cpu --platforms="@sigbuild-r2.14-clang_config_platform//:platform"
 # This is needed for all Clang17 builds but must not be present in GCC builds.
 build:rbe_linux_cpu --copt=-Wno-error=unused-command-line-argument
 # This was added in clang-16 by https://reviews.llvm.org/D133574.
@@ -471,7 +471,7 @@ build:rbe_linux_cpu --copt=-Wno-error=unused-command-line-argument
 # See https://github.com/protocolbuffers/upb/blob/9effcbcb27f0a665f9f345030188c0b291e32482/upb/upb.c#L183.
 build:rbe_linux_cpu --copt=-Wno-gnu-offsetof-extensions
 # Python config is the same across all containers because the binary is the same
-build:rbe_linux_cpu --repo_env=TF_PYTHON_CONFIG_REPO="@sigbuild-r2.14-clang17_config_python"
+build:rbe_linux_cpu --repo_env=TF_PYTHON_CONFIG_REPO="@sigbuild-r2.14-clang_config_python"
 build:rbe_linux_cpu --python_path="/usr/bin/python3"
 # These you may need to change for your own GCP project.
 common:rbe_linux_cpu --remote_instance_name=projects/tensorflow-testing/instances/default_instance
@@ -494,9 +494,9 @@ build:rbe_linux_cuda --config=cuda_clang_official
 build:rbe_linux_cuda --config=rbe_linux_cpu
 # For Remote build execution -- GPU configuration
 build:rbe_linux_cuda --repo_env=REMOTE_GPU_TESTING=1
-build:rbe_linux_cuda --repo_env=TF_CUDA_CONFIG_REPO="@sigbuild-r2.14-clang17_config_cuda"
-build:rbe_linux_cuda --repo_env=TF_TENSORRT_CONFIG_REPO="@sigbuild-r2.14-clang17_config_tensorrt"
-build:rbe_linux_cuda --repo_env=TF_NCCL_CONFIG_REPO="@sigbuild-r2.14-clang17_config_nccl"
+build:rbe_linux_cuda --repo_env=TF_CUDA_CONFIG_REPO="@sigbuild-r2.14-clang_config_cuda"
+build:rbe_linux_cuda --repo_env=TF_TENSORRT_CONFIG_REPO="@sigbuild-r2.14-clang_config_tensorrt"
+build:rbe_linux_cuda --repo_env=TF_NCCL_CONFIG_REPO="@sigbuild-r2.14-clang_config_nccl"
 test:rbe_linux_cuda --test_env=LD_LIBRARY_PATH="/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64"
 
 # TODO(kanglan): Remove rbe_win and rbe_win_py3* after b/289091160 is fixed
@@ -558,7 +558,7 @@ test:release_base --test_size_filters=small,medium
 # Target the AVX instruction set
 build:release_cpu_linux --config=avx_linux
 # Use the Clang toolchain to compile
-build:release_cpu_linux --crosstool_top="@sigbuild-r2.14-clang17_config_cuda//crosstool:toolchain"
+build:release_cpu_linux --crosstool_top="@sigbuild-r2.14-clang_config_cuda//crosstool:toolchain"
 # Disable clang extention that rejects type definitions within offsetof.
 # This was added in clang-16 by https://reviews.llvm.org/D133574.
 # Can be removed once upb is updated, since a type definition is used within
diff --git a/third_party/xla/third_party/tsl/tools/toolchains/remote_config/configs.bzl b/third_party/xla/third_party/tsl/tools/toolchains/remote_config/configs.bzl
index 2453dc746feefb..666e51de5ffb71 100644
--- a/third_party/xla/third_party/tsl/tools/toolchains/remote_config/configs.bzl
+++ b/third_party/xla/third_party/tsl/tools/toolchains/remote_config/configs.bzl
@@ -620,49 +620,10 @@ def initialize_rbe_configs():
 
     sigbuild_tf_configs(
         name_container_map = {
-            "sigbuild-r2.14-clang": "docker://gcr.io/tensorflow-sigs/build@sha256:c03809a6b4008b430bf241efce78cdcd92c7bc41d11d0ba57216e97d813ac282",
-            "sigbuild-r2.14-clang-python3.8": "docker://gcr.io/tensorflow-sigs/build@sha256:c46d275e5bc760b7af465dc063629b234cfa34aabf0c7fe30581effc0b99648a",
-            "sigbuild-r2.14-clang-python3.9": "docker://gcr.io/tensorflow-sigs/build@sha256:c03809a6b4008b430bf241efce78cdcd92c7bc41d11d0ba57216e97d813ac282",
-            "sigbuild-r2.14-clang-python3.10": "docker://gcr.io/tensorflow-sigs/build@sha256:06b3a97ef247dbb00a9c6d8315e4e035d891ae2f18088de254f15d6ecedadfb9",
-            "sigbuild-r2.14-clang-python3.11": "docker://gcr.io/tensorflow-sigs/build@sha256:00dc9e13130727dcdeb54ca77423e317a79aae84d5783c05b38b7bbdf753f0f6",
-        },
-        # Unclear why LIBC is set to 2.19 here, and yet manylinux2010 is 2.12
-        # and manylinux2014 is 2.17.
-        env = {
-            "ABI_LIBC_VERSION": "glibc_2.19",
-            "ABI_VERSION": "gcc",
-            "BAZEL_COMPILER": "/usr/lib/llvm-16/bin/clang",
-            "BAZEL_HOST_SYSTEM": "i686-unknown-linux-gnu",
-            "BAZEL_TARGET_CPU": "k8",
-            "BAZEL_TARGET_LIBC": "glibc_2.19",
-            "BAZEL_TARGET_SYSTEM": "x86_64-unknown-linux-gnu",
-            "CC": "/usr/lib/llvm-16/bin/clang",
-            "CC_TOOLCHAIN_NAME": "linux_gnu_x86",
-            "CLEAR_CACHE": "1",
-            "CUDNN_INSTALL_PATH": "/usr/lib/x86_64-linux-gnu",
-            "CLANG_CUDA_COMPILER_PATH": "/usr/lib/llvm-16/bin/clang",
-            "HOST_CXX_COMPILER": "/usr/lib/llvm-16/bin/clang",
-            "HOST_C_COMPILER": "/usr/lib/llvm-16/bin/clang",
-            "PYTHON_BIN_PATH": "/usr/bin/python3",
-            "TENSORRT_INSTALL_PATH": "/usr/lib/x86_64-linux-gnu",
-            "TF_CUDA_CLANG": "1",
-            "TF_CUDA_COMPUTE_CAPABILITIES": "3.5,6.0",
-            "TF_CUDA_VERSION": "11.2",
-            "TF_CUDNN_VERSION": "8.1",
-            "TF_ENABLE_XLA": "1",
-            "TF_NEED_CUDA": "1",
-            "TF_NEED_TENSORRT": "1",
-            "TF_SYSROOT": "/dt9",
-            "TF_TENSORRT_VERSION": "7.2",
-        },
-    )
-
-    sigbuild_tf_configs(
-        name_container_map = {
-            "sigbuild-r2.14-clang17": "docker://gcr.io/tensorflow-sigs/build@sha256:edf0324686ffbf488ed0a6b7d3a29ecc6f7b7ad9c04b2e1a527a6a47c0dc1b4b",
-            "sigbuild-r2.14-clang17-python3.9": "docker://gcr.io/tensorflow-sigs/build@sha256:edf0324686ffbf488ed0a6b7d3a29ecc6f7b7ad9c04b2e1a527a6a47c0dc1b4b",
-            "sigbuild-r2.14-clang17-python3.10": "docker://gcr.io/tensorflow-sigs/build@sha256:5b06acad335d5c24aa2f63d39d9de230affd04aa982dda0242eeb893b9dae363",
-            "sigbuild-r2.14-clang17-python3.11": "docker://gcr.io/tensorflow-sigs/build@sha256:80d991d5cf0ac710b568c71c7790270691749afa19de49d7c1a121ba3f92cd58",
+            "sigbuild-r2.14-clang": "docker://gcr.io/tensorflow-sigs/build@sha256:edf0324686ffbf488ed0a6b7d3a29ecc6f7b7ad9c04b2e1a527a6a47c0dc1b4b",
+            "sigbuild-r2.14-clang-python3.9": "docker://gcr.io/tensorflow-sigs/build@sha256:edf0324686ffbf488ed0a6b7d3a29ecc6f7b7ad9c04b2e1a527a6a47c0dc1b4b",
+            "sigbuild-r2.14-clang-python3.10": "docker://gcr.io/tensorflow-sigs/build@sha256:5b06acad335d5c24aa2f63d39d9de230affd04aa982dda0242eeb893b9dae363",
+            "sigbuild-r2.14-clang-python3.11": "docker://gcr.io/tensorflow-sigs/build@sha256:80d991d5cf0ac710b568c71c7790270691749afa19de49d7c1a121ba3f92cd58",
         },
         # Unclear why LIBC is set to 2.19 here, and yet manylinux2010 is 2.12
         # and manylinux2014 is 2.17.
diff --git a/third_party/xla/tools/toolchains/remote_config/configs.bzl b/third_party/xla/tools/toolchains/remote_config/configs.bzl
index 2453dc746feefb..666e51de5ffb71 100644
--- a/third_party/xla/tools/toolchains/remote_config/configs.bzl
+++ b/third_party/xla/tools/toolchains/remote_config/configs.bzl
@@ -620,49 +620,10 @@ def initialize_rbe_configs():
 
     sigbuild_tf_configs(
         name_container_map = {
-            "sigbuild-r2.14-clang": "docker://gcr.io/tensorflow-sigs/build@sha256:c03809a6b4008b430bf241efce78cdcd92c7bc41d11d0ba57216e97d813ac282",
-            "sigbuild-r2.14-clang-python3.8": "docker://gcr.io/tensorflow-sigs/build@sha256:c46d275e5bc760b7af465dc063629b234cfa34aabf0c7fe30581effc0b99648a",
-            "sigbuild-r2.14-clang-python3.9": "docker://gcr.io/tensorflow-sigs/build@sha256:c03809a6b4008b430bf241efce78cdcd92c7bc41d11d0ba57216e97d813ac282",
-            "sigbuild-r2.14-clang-python3.10": "docker://gcr.io/tensorflow-sigs/build@sha256:06b3a97ef247dbb00a9c6d8315e4e035d891ae2f18088de254f15d6ecedadfb9",
-            "sigbuild-r2.14-clang-python3.11": "docker://gcr.io/tensorflow-sigs/build@sha256:00dc9e13130727dcdeb54ca77423e317a79aae84d5783c05b38b7bbdf753f0f6",
-        },
-        # Unclear why LIBC is set to 2.19 here, and yet manylinux2010 is 2.12
-        # and manylinux2014 is 2.17.
-        env = {
-            "ABI_LIBC_VERSION": "glibc_2.19",
-            "ABI_VERSION": "gcc",
-            "BAZEL_COMPILER": "/usr/lib/llvm-16/bin/clang",
-            "BAZEL_HOST_SYSTEM": "i686-unknown-linux-gnu",
-            "BAZEL_TARGET_CPU": "k8",
-            "BAZEL_TARGET_LIBC": "glibc_2.19",
-            "BAZEL_TARGET_SYSTEM": "x86_64-unknown-linux-gnu",
-            "CC": "/usr/lib/llvm-16/bin/clang",
-            "CC_TOOLCHAIN_NAME": "linux_gnu_x86",
-            "CLEAR_CACHE": "1",
-            "CUDNN_INSTALL_PATH": "/usr/lib/x86_64-linux-gnu",
-            "CLANG_CUDA_COMPILER_PATH": "/usr/lib/llvm-16/bin/clang",
-            "HOST_CXX_COMPILER": "/usr/lib/llvm-16/bin/clang",
-            "HOST_C_COMPILER": "/usr/lib/llvm-16/bin/clang",
-            "PYTHON_BIN_PATH": "/usr/bin/python3",
-            "TENSORRT_INSTALL_PATH": "/usr/lib/x86_64-linux-gnu",
-            "TF_CUDA_CLANG": "1",
-            "TF_CUDA_COMPUTE_CAPABILITIES": "3.5,6.0",
-            "TF_CUDA_VERSION": "11.2",
-            "TF_CUDNN_VERSION": "8.1",
-            "TF_ENABLE_XLA": "1",
-            "TF_NEED_CUDA": "1",
-            "TF_NEED_TENSORRT": "1",
-            "TF_SYSROOT": "/dt9",
-            "TF_TENSORRT_VERSION": "7.2",
-        },
-    )
-
-    sigbuild_tf_configs(
-        name_container_map = {
-            "sigbuild-r2.14-clang17": "docker://gcr.io/tensorflow-sigs/build@sha256:edf0324686ffbf488ed0a6b7d3a29ecc6f7b7ad9c04b2e1a527a6a47c0dc1b4b",
-            "sigbuild-r2.14-clang17-python3.9": "docker://gcr.io/tensorflow-sigs/build@sha256:edf0324686ffbf488ed0a6b7d3a29ecc6f7b7ad9c04b2e1a527a6a47c0dc1b4b",
-            "sigbuild-r2.14-clang17-python3.10": "docker://gcr.io/tensorflow-sigs/build@sha256:5b06acad335d5c24aa2f63d39d9de230affd04aa982dda0242eeb893b9dae363",
-            "sigbuild-r2.14-clang17-python3.11": "docker://gcr.io/tensorflow-sigs/build@sha256:80d991d5cf0ac710b568c71c7790270691749afa19de49d7c1a121ba3f92cd58",
+            "sigbuild-r2.14-clang": "docker://gcr.io/tensorflow-sigs/build@sha256:edf0324686ffbf488ed0a6b7d3a29ecc6f7b7ad9c04b2e1a527a6a47c0dc1b4b",
+            "sigbuild-r2.14-clang-python3.9": "docker://gcr.io/tensorflow-sigs/build@sha256:edf0324686ffbf488ed0a6b7d3a29ecc6f7b7ad9c04b2e1a527a6a47c0dc1b4b",
+            "sigbuild-r2.14-clang-python3.10": "docker://gcr.io/tensorflow-sigs/build@sha256:5b06acad335d5c24aa2f63d39d9de230affd04aa982dda0242eeb893b9dae363",
+            "sigbuild-r2.14-clang-python3.11": "docker://gcr.io/tensorflow-sigs/build@sha256:80d991d5cf0ac710b568c71c7790270691749afa19de49d7c1a121ba3f92cd58",
         },
         # Unclear why LIBC is set to 2.19 here, and yet manylinux2010 is 2.12
         # and manylinux2014 is 2.17.

From 16c23ebd42c6487544881fd361fc0cd45d2abc25 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 22 Sep 2023 06:25:38 -0700
Subject: [PATCH 140/567] Add fusion pattern for `jax.nn.log_softmax`

PiperOrigin-RevId: 567604558
---
 .../compiler/mlir/lite/tests/optimize.mlir    | 65 +++++++++++++
 .../compiler/mlir/lite/transforms/optimize.cc | 91 ++++++++++++++++++-
 2 files changed, 152 insertions(+), 4 deletions(-)

diff --git a/tensorflow/compiler/mlir/lite/tests/optimize.mlir b/tensorflow/compiler/mlir/lite/tests/optimize.mlir
index 3fa644f382e31a..26369cd1f13e1e 100644
--- a/tensorflow/compiler/mlir/lite/tests/optimize.mlir
+++ b/tensorflow/compiler/mlir/lite/tests/optimize.mlir
@@ -3718,3 +3718,68 @@ func.func @FuseTransposeAfterBatchMatmul(%arg0: tensor<4x1024xf32>, %arg1: tenso
   func.return %1 : tensor<8x4xf32>
   // CHECK: return %[[RES0]] : tensor<8x4xf32>
 }
+
+// CHECK-LABEL: fuseLogSoftmax
+func.func @fuseLogSoftmax(%arg0: tensor<10x10xf32>) -> tensor<10x10xf32> {
+  %0 = arith.constant dense<1> : tensor<1xi32>
+  %1 = "tfl.reduce_max"(%arg0, %0) {keep_dims = true} : (tensor<10x10xf32>, tensor<1xi32>) -> tensor<10x1xf32>
+  %2 = tfl.sub(%arg0, %1) {fused_activation_function = "NONE"} : (tensor<10x10xf32>, tensor<10x1xf32>) -> tensor<10x10xf32>
+  %3 = "tfl.exp"(%2) : (tensor<10x10xf32>) -> tensor<10x10xf32>
+  %4 = "tfl.sum"(%3, %0) {keep_dims = true} : (tensor<10x10xf32>, tensor<1xi32>) -> tensor<10x1xf32>
+  %5 = "tfl.log"(%4) : (tensor<10x1xf32>) -> tensor<10x1xf32>
+  %6 = tfl.sub(%2, %5) {fused_activation_function = "NONE"} : (tensor<10x10xf32>, tensor<10x1xf32>) -> tensor<10x10xf32>
+  return %6 : tensor<10x10xf32>
+ // CHECK: "tfl.log_softmax"(%arg0) : (tensor<10x10xf32>) -> tensor<10x10xf32>
+}
+
+// CHECK-LABEL: fuseLogSoftmaxAxisNegativeOne
+func.func @fuseLogSoftmaxAxisNegativeOne(%arg0: tensor<10x10xf32>) -> tensor<10x10xf32> {
+  %0 = arith.constant dense<-1> : tensor<1xi32>
+  %1 = "tfl.reduce_max"(%arg0, %0) {keep_dims = true} : (tensor<10x10xf32>, tensor<1xi32>) -> tensor<10x1xf32>
+  %2 = tfl.sub(%arg0, %1) {fused_activation_function = "NONE"} : (tensor<10x10xf32>, tensor<10x1xf32>) -> tensor<10x10xf32>
+  %3 = "tfl.exp"(%2) : (tensor<10x10xf32>) -> tensor<10x10xf32>
+  %4 = "tfl.sum"(%3, %0) {keep_dims = true} : (tensor<10x10xf32>, tensor<1xi32>) -> tensor<10x1xf32>
+  %5 = "tfl.log"(%4) : (tensor<10x1xf32>) -> tensor<10x1xf32>
+  %6 = tfl.sub(%2, %5) {fused_activation_function = "NONE"} : (tensor<10x10xf32>, tensor<10x1xf32>) -> tensor<10x10xf32>
+  return %6 : tensor<10x10xf32>
+ // CHECK: "tfl.log_softmax"(%arg0) : (tensor<10x10xf32>) -> tensor<10x10xf32>
+}
+
+// CHECK-LABEL: fuseLogSoftmaxFusedActivationFunction
+func.func @fuseLogSoftmaxFusedActivationFunction(%arg0: tensor<10x10xf32>) -> tensor<10x10xf32> {
+  %0 = arith.constant dense<1> : tensor<1xi32>
+  %1 = "tfl.reduce_max"(%arg0, %0) {keep_dims = true} : (tensor<10x10xf32>, tensor<1xi32>) -> tensor<10x1xf32>
+  %2 = tfl.sub(%arg0, %1) {fused_activation_function = "RELU"} : (tensor<10x10xf32>, tensor<10x1xf32>) -> tensor<10x10xf32>
+  %3 = "tfl.exp"(%2) : (tensor<10x10xf32>) -> tensor<10x10xf32>
+  %4 = "tfl.sum"(%3, %0) {keep_dims = true} : (tensor<10x10xf32>, tensor<1xi32>) -> tensor<10x1xf32>
+  %5 = "tfl.log"(%4) : (tensor<10x1xf32>) -> tensor<10x1xf32>
+  %6 = tfl.sub(%2, %5) {fused_activation_function = "RELU"} : (tensor<10x10xf32>, tensor<10x1xf32>) -> tensor<10x10xf32>
+  return %6 : tensor<10x10xf32>
+ // CHECK-NOT: "tfl.log_softmax"(%arg0) : (tensor<10x10xf32>) -> tensor<10x10xf32>
+}
+
+// CHECK-LABEL: fuseLogSoftmax1D
+func.func @fuseLogSoftmax1D(%arg0: tensor<10xf32>) -> tensor<10xf32> {
+  %0 = arith.constant dense<0> : tensor<1xi32>
+  %1 = "tfl.reduce_max"(%arg0, %0) {keep_dims = true} : (tensor<10xf32>, tensor<1xi32>) -> tensor<1xf32>
+  %2 = tfl.sub(%arg0, %1) {fused_activation_function = "NONE"} : (tensor<10xf32>, tensor<1xf32>) -> tensor<10xf32>
+  %3 = "tfl.exp"(%2) : (tensor<10xf32>) -> tensor<10xf32>
+  %4 = "tfl.sum"(%3, %0) {keep_dims = true} : (tensor<10xf32>, tensor<1xi32>) -> tensor<1xf32>
+  %5 = "tfl.log"(%4) : (tensor<1xf32>) -> tensor<1xf32>
+  %6 = tfl.sub(%2, %5) {fused_activation_function = "NONE"} : (tensor<10xf32>, tensor<1xf32>) -> tensor<10xf32>
+  return %6 : tensor<10xf32>
+ // CHECK: "tfl.log_softmax"(%arg0) : (tensor<10xf32>) -> tensor<10xf32>
+}
+
+// CHECK-LABEL: fuseLogSoftmaxNotLastAxis
+func.func @fuseLogSoftmaxNotLastAxis(%arg0: tensor<10x10x10xf32>) -> tensor<10x10x10xf32> {
+  %0 = arith.constant dense<1> : tensor<1xi32>
+  %1 = "tfl.reduce_max"(%arg0, %0) {keep_dims = true} : (tensor<10x10x10xf32>, tensor<1xi32>) -> tensor<10x1x10xf32>
+  %2 = tfl.sub(%arg0, %1) {fused_activation_function = "NONE"} : (tensor<10x10x10xf32>, tensor<10x1x10xf32>) -> tensor<10x10x10xf32>
+  %3 = "tfl.exp"(%2) : (tensor<10x10x10xf32>) -> tensor<10x10x10xf32>
+  %4 = "tfl.sum"(%3, %0) {keep_dims = true} : (tensor<10x10x10xf32>, tensor<1xi32>) -> tensor<10x1x10xf32>
+  %5 = "tfl.log"(%4) : (tensor<10x1x10xf32>) -> tensor<10x1x10xf32>
+  %6 = tfl.sub(%2, %5) {fused_activation_function = "NONE"} : (tensor<10x10x10xf32>, tensor<10x1x10xf32>) -> tensor<10x10x10xf32>
+  return %6 : tensor<10x10x10xf32>
+ // CHECK-NOT: "tfl.log_softmax"(%arg0) : (tensor<10x10x10xf32>) -> tensor<10x10x10xf32>
+}
diff --git a/tensorflow/compiler/mlir/lite/transforms/optimize.cc b/tensorflow/compiler/mlir/lite/transforms/optimize.cc
index d09a92e0608347..64bff681053f6e 100644
--- a/tensorflow/compiler/mlir/lite/transforms/optimize.cc
+++ b/tensorflow/compiler/mlir/lite/transforms/optimize.cc
@@ -2203,6 +2203,88 @@ struct FuseTransposeReshapeIntoBatchMatmul
   }
 };
 
+struct FuseLogSoftmax : public OpRewritePattern<TFL::SubOp> {
+  using OpRewritePattern::OpRewritePattern;
+  LogicalResult matchAndRewrite(TFL::SubOp sub_op,
+                                PatternRewriter &rewriter) const override {
+    if (sub_op.getFusedActivationFunction() != "NONE") {
+      return failure();
+    }
+    auto log_op = dyn_cast_or_null<TFL::LogOp>(sub_op.getRhs().getDefiningOp());
+    if (!log_op || !log_op->hasOneUse()) {
+      return failure();
+    }
+    auto sum_op = dyn_cast_or_null<TFL::SumOp>(log_op.getX().getDefiningOp());
+    if (!sum_op || !sum_op.getKeepDims() ||
+        !isSupportedAxis(
+            sum_op.getAxes(),
+            sum_op.getOperand(0).getType().cast<ShapedType>().getRank())) {
+      return failure();
+    }
+    if (!sum_op->hasOneUse()) {
+      return failure();
+    }
+    auto exp_op =
+        dyn_cast_or_null<TFL::ExpOp>(sum_op.getInput().getDefiningOp());
+    if (!exp_op || !exp_op->hasOneUse()) {
+      return failure();
+    }
+
+    auto parent_sub_op =
+        dyn_cast_or_null<TFL::SubOp>(sub_op.getLhs().getDefiningOp());
+    if (!parent_sub_op || parent_sub_op != dyn_cast_or_null<TFL::SubOp>(
+                                               exp_op.getX().getDefiningOp())) {
+      return failure();
+    }
+    if (std::distance(parent_sub_op->getUses().begin(),
+                      parent_sub_op->getUses().end()) != 2) {
+      return failure();
+    }
+
+    auto reduce_max_op = dyn_cast_or_null<TFL::ReduceMaxOp>(
+        parent_sub_op.getRhs().getDefiningOp());
+    if (!reduce_max_op || !reduce_max_op->hasOneUse() ||
+        !reduce_max_op.getKeepDims() ||
+        !isSupportedAxis(reduce_max_op.getAxes(), reduce_max_op.getOperand(0)
+                                                      .getType()
+                                                      .cast<ShapedType>()
+                                                      .getRank())) {
+      return failure();
+    }
+
+    if (reduce_max_op.getInput() != parent_sub_op.getLhs()) {
+      return failure();
+    }
+
+    rewriter.replaceOpWithNewOp<TFL::LogSoftmaxOp>(sub_op, sub_op.getType(),
+                                                   parent_sub_op.getLhs());
+    return success();
+  }
+
+  // The TFL_LogSoftmaxOp implementation only works on the last axis, so we
+  // check that both TFL_ReduceMaxOp and TFL_SumOp use the last axis.
+  bool isSupportedAxis(mlir::Value value, int64_t rank) const {
+    auto const_op =
+        dyn_cast_or_null<mlir::arith::ConstantOp>(value.getDefiningOp());
+    if (!const_op) {
+      return false;
+    }
+    auto axes = dyn_cast<DenseIntElementsAttr>(const_op.getValueAttr());
+    if (!axes || axes.getNumElements() != 1) {
+      return false;
+    }
+    auto axes_elem_ty = axes.getType().getElementType();
+    if (!axes_elem_ty.isInteger(32) && !axes_elem_ty.isInteger(64)) {
+      return false;
+    }
+    const int64_t axis = (*axes.begin()).getSExtValue();
+    if (axis != rank - 1 && axis != -1) {
+      return false;
+    }
+    return true;
+  }
+};
+
 // Adds canonicalization patterns to the list of patterns.
 void AddCanonicalizationPatterns(MLIRContext *context,
                                  RewritePatternSet *patterns) {
@@ -2244,10 +2326,11 @@ void OptimizePass::runOnOperation() {
   RewritePatternSet phase_2_patterns(&getContext());
   TFL::populateWithGenerated(phase_2_patterns);
   phase_2_patterns.add<
-      ScalarizeSplatConstantForAdd, ScalarizeSplatConstantForSub,
-      ScalarizeSplatConstantForMul, ScalarizeSplatConstantForDiv,
-      FuseFullyConnectedAndAdd, FuseAddAndFullyConnected,
-      FuseFullyConnectedAndMul, FuseFullyConnectedAndReluX<TFL::ReluOp, kRelu>,
+      FuseLogSoftmax, ScalarizeSplatConstantForAdd,
+      ScalarizeSplatConstantForSub, ScalarizeSplatConstantForMul,
+      ScalarizeSplatConstantForDiv, FuseFullyConnectedAndAdd,
+      FuseAddAndFullyConnected, FuseFullyConnectedAndMul,
+      FuseFullyConnectedAndReluX<TFL::ReluOp, kRelu>,
       FuseFullyConnectedAndReluX<TFL::Relu6Op, kRelu6>,
       FuseFullyConnectedAndReluX<TFL::Relu1Op, kRelu1>,
       FuseBinaryOpToFollowingConv2D, FuseBinaryOpToFollowingDepthwiseConv2D,

From 8c87d6780b91eeb010a611e37599ac3f01f32f08 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tam=C3=A1s=20Danyluk?= <tdanyluk@google.com>
Date: Fri, 22 Sep 2023 06:31:59 -0700
Subject: [PATCH 141/567] [XLA:GPU][NFC] Separate TritonAutotuner from other
 autotuning passes

This is a preparation for a later CL.

PiperOrigin-RevId: 567605682
---
 .../xla/xla/service/gpu/amdgpu_compiler.cc       |  3 +--
 .../xla/xla/service/gpu/amdgpu_compiler.h        |  7 ++++---
 third_party/xla/xla/service/gpu/gpu_compiler.cc  |  6 ++++--
 third_party/xla/xla/service/gpu/gpu_compiler.h   | 16 +++++++++++-----
 .../xla/xla/service/gpu/nvptx_compiler.cc        |  7 ++++++-
 third_party/xla/xla/service/gpu/nvptx_compiler.h | 12 +++++++++---
 6 files changed, 35 insertions(+), 16 deletions(-)

diff --git a/third_party/xla/xla/service/gpu/amdgpu_compiler.cc b/third_party/xla/xla/service/gpu/amdgpu_compiler.cc
index 723ef0df23da3e..74bd6c6c01d699 100644
--- a/third_party/xla/xla/service/gpu/amdgpu_compiler.cc
+++ b/third_party/xla/xla/service/gpu/amdgpu_compiler.cc
@@ -149,7 +149,7 @@ bool AMDGPUCompiler::RequiresCollectiveScheduleLinearizer(
   return false;
 }
 
-Status AMDGPUCompiler::AddAutotuningPasses(
+Status AMDGPUCompiler::AddConvAndGemmAutotuningPasses(
     HloPassPipeline* pipeline, HloModule* hlo_module,
     AutotuneConfig& autotune_config, tsl::thread::ThreadPool* thread_pool) {
   if (GpuConvAlgorithmPicker::IsEnabled(hlo_module)) {
@@ -157,7 +157,6 @@ Status AMDGPUCompiler::AddAutotuningPasses(
   }
   // TODO:
   // pipeline->AddPass<GemmAlgorithmPicker>(autotune_config);
-  // pipeline->AddPass<TritonAutotuner>(autotune_config, thread_pool);
   return OkStatus();
 }
 
diff --git a/third_party/xla/xla/service/gpu/amdgpu_compiler.h b/third_party/xla/xla/service/gpu/amdgpu_compiler.h
index 37edfe78e0b063..c05d256d2e32ea 100644
--- a/third_party/xla/xla/service/gpu/amdgpu_compiler.h
+++ b/third_party/xla/xla/service/gpu/amdgpu_compiler.h
@@ -47,9 +47,10 @@ class AMDGPUCompiler : public GpuCompiler {
   bool RequiresCollectiveScheduleLinearizer(
       const HloModule* module, se::StreamExecutor* stream_exec) override;
 
-  Status AddAutotuningPasses(HloPassPipeline* pipeline, HloModule* hlo_module,
-                             AutotuneConfig& autotune_config,
-                             tsl::thread::ThreadPool* thread_pool) override;
+  Status AddConvAndGemmAutotuningPasses(
+      HloPassPipeline* pipeline, HloModule* hlo_module,
+      AutotuneConfig& autotune_config,
+      tsl::thread::ThreadPool* thread_pool) override;
 
   Status LoadAutotuneResultsFromFile(
       const DebugOptions& debug_options) override;
diff --git a/third_party/xla/xla/service/gpu/gpu_compiler.cc b/third_party/xla/xla/service/gpu/gpu_compiler.cc
index 66dd3fd2ac57bc..ec61161d3b5ef0 100644
--- a/third_party/xla/xla/service/gpu/gpu_compiler.cc
+++ b/third_party/xla/xla/service/gpu/gpu_compiler.cc
@@ -1015,8 +1015,10 @@ Status GpuCompiler::OptimizeHloPostLayoutAssignment(
   // f32).
   add_float_normalization(pipeline);
 
-  TF_RETURN_IF_ERROR(
-      AddAutotuningPasses(&pipeline, hlo_module, autotune_config, thread_pool));
+  TF_RETURN_IF_ERROR(AddConvAndGemmAutotuningPasses(
+      &pipeline, hlo_module, autotune_config, thread_pool));
+  TF_RETURN_IF_ERROR(AddTritonGemmAutotuningPasses(
+      &pipeline, hlo_module, autotune_config, thread_pool));
 
   // The Triton autotuner can insert new bf16 reductions that need to be
   // normalized again.
diff --git a/third_party/xla/xla/service/gpu/gpu_compiler.h b/third_party/xla/xla/service/gpu/gpu_compiler.h
index 42d219c6c63d99..2a34ee7b7f86e5 100644
--- a/third_party/xla/xla/service/gpu/gpu_compiler.h
+++ b/third_party/xla/xla/service/gpu/gpu_compiler.h
@@ -193,11 +193,17 @@ class GpuCompiler : public LLVMCompiler {
     return false;
   }
 
-  // Add autotuning passes for convolution, gemm and triton.
-  virtual Status AddAutotuningPasses(HloPassPipeline* pipeline,
-                                     HloModule* hlo_module,
-                                     AutotuneConfig& autotune_config,
-                                     tsl::thread::ThreadPool* thread_pool) {
+  // Add autotuning passes for convolution and gemm (except triton).
+  virtual Status AddConvAndGemmAutotuningPasses(
+      HloPassPipeline* pipeline, HloModule* hlo_module,
+      AutotuneConfig& autotune_config, tsl::thread::ThreadPool* thread_pool) {
+    return OkStatus();
+  }
+
+  // Add autotuning passes for triton gemm.
+  virtual Status AddTritonGemmAutotuningPasses(
+      HloPassPipeline* pipeline, HloModule* hlo_module,
+      AutotuneConfig& autotune_config, tsl::thread::ThreadPool* thread_pool) {
     return OkStatus();
   }
 
diff --git a/third_party/xla/xla/service/gpu/nvptx_compiler.cc b/third_party/xla/xla/service/gpu/nvptx_compiler.cc
index 8dd3b2fc8d74c6..5ee7f942e30fed 100644
--- a/third_party/xla/xla/service/gpu/nvptx_compiler.cc
+++ b/third_party/xla/xla/service/gpu/nvptx_compiler.cc
@@ -292,14 +292,19 @@ bool NVPTXCompiler::RequiresCollectiveScheduleLinearizer(
   return false;
 }
 
-Status NVPTXCompiler::AddAutotuningPasses(
+Status NVPTXCompiler::AddConvAndGemmAutotuningPasses(
     HloPassPipeline* pipeline, HloModule* hlo_module,
     AutotuneConfig& autotune_config, tsl::thread::ThreadPool* thread_pool) {
   if (GpuConvAlgorithmPicker::IsEnabled(hlo_module)) {
     pipeline->AddPass<GpuConvAlgorithmPicker>(autotune_config);
   }
   pipeline->AddPass<GemmAlgorithmPicker>(autotune_config);
+  return OkStatus();
+}
 
+Status NVPTXCompiler::AddTritonGemmAutotuningPasses(
+    HloPassPipeline* pipeline, HloModule* hlo_module,
+    AutotuneConfig& autotune_config, tsl::thread::ThreadPool* thread_pool) {
   pipeline->AddPass<TritonAutotuner>(autotune_config, thread_pool);
   return OkStatus();
 }
diff --git a/third_party/xla/xla/service/gpu/nvptx_compiler.h b/third_party/xla/xla/service/gpu/nvptx_compiler.h
index cdc4b409e11030..debc4539dfe161 100644
--- a/third_party/xla/xla/service/gpu/nvptx_compiler.h
+++ b/third_party/xla/xla/service/gpu/nvptx_compiler.h
@@ -53,9 +53,15 @@ class NVPTXCompiler : public GpuCompiler {
   bool RequiresCollectiveScheduleLinearizer(
       const HloModule* module, se::StreamExecutor* stream_exec) override;
 
-  Status AddAutotuningPasses(HloPassPipeline* pipeline, HloModule* hlo_module,
-                             AutotuneConfig& autotune_config,
-                             tsl::thread::ThreadPool* thread_pool) override;
+  Status AddConvAndGemmAutotuningPasses(
+      HloPassPipeline* pipeline, HloModule* hlo_module,
+      AutotuneConfig& autotune_config,
+      tsl::thread::ThreadPool* thread_pool) override;
+
+  Status AddTritonGemmAutotuningPasses(
+      HloPassPipeline* pipeline, HloModule* hlo_module,
+      AutotuneConfig& autotune_config,
+      tsl::thread::ThreadPool* thread_pool) override;
 
   Status LoadAutotuneResultsFromFile(
       const DebugOptions& debug_options) override;

From ff8fd53f5cf138c582fdfa906570cce66ba615c2 Mon Sep 17 00:00:00 2001
From: Ilia Sergachev <sergachev@google.com>
Date: Fri, 22 Sep 2023 07:45:58 -0700
Subject: [PATCH 142/567] [XLA:GPU] Fix and cover with tests type conversions
 in Triton emitters.

PiperOrigin-RevId: 567619889
---
 third_party/xla/xla/service/gpu/BUILD         |   4 +
 .../xla/xla/service/gpu/ir_emitter_triton.cc  | 138 ++++++++++--------
 .../ir_emitter_triton_parametrized_test.cc    | 119 ++++++++++++---
 3 files changed, 181 insertions(+), 80 deletions(-)

diff --git a/third_party/xla/xla/service/gpu/BUILD b/third_party/xla/xla/service/gpu/BUILD
index f6ba7b004e3988..17eb3ead3fd0a4 100644
--- a/third_party/xla/xla/service/gpu/BUILD
+++ b/third_party/xla/xla/service/gpu/BUILD
@@ -626,7 +626,10 @@ xla_test(
         "//xla:comparison_util",
         "//xla:error_spec",
         "//xla:xla_data_proto_cc",
+        "//xla:xla_proto_cc",
         "//xla/hlo/ir:hlo",
+        "//xla/service:pattern_matcher",
+        "//xla/service:pattern_matcher_gmock",
         "//xla/service/gpu/tests:gpu_codegen_test",
         "//xla/stream_executor:device_description",
         "//xla/stream_executor/cuda:cublas_plugin",
@@ -634,6 +637,7 @@ xla_test(
         "@com_google_absl//absl/base:core_headers",
         "@com_google_absl//absl/strings",
         "@com_google_googletest//:gtest",
+        "@local_tsl//tsl/platform:statusor",
         "@local_tsl//tsl/platform:tensor_float_32_hdr_lib",
     ],
 )
diff --git a/third_party/xla/xla/service/gpu/ir_emitter_triton.cc b/third_party/xla/xla/service/gpu/ir_emitter_triton.cc
index 70457113f8be7b..9b7fbf675625ae 100644
--- a/third_party/xla/xla/service/gpu/ir_emitter_triton.cc
+++ b/third_party/xla/xla/service/gpu/ir_emitter_triton.cc
@@ -153,7 +153,7 @@ Type TritonType(mlir::OpBuilder b, PrimitiveType t) {
     case S16:
       return b.getI16Type();
     case PRED:
-      // Treat PRED as S8.
+      return b.getI1Type();
     case S8:
       return b.getI8Type();
     default:
@@ -162,6 +162,67 @@ Type TritonType(mlir::OpBuilder b, PrimitiveType t) {
   }
 }
 
+Type StorageType(mlir::OpBuilder b, Type t) {
+  if (t.isInteger(1)) {
+    return b.getI8Type();
+  }
+  return t;
+}
+
+// Get the value of the scalar constant's literal in a C++ type.
+template <typename T>
+T ScalarConstantValue(const HloInstruction& instr, PrimitiveType dst_type) {
+  CHECK(hlo_query::IsScalarConstant(&instr));
+  StatusOr<Literal> converted = instr.literal().Convert(dst_type);
+  TF_CHECK_OK(converted.status());
+  return converted.value().GetFirstElement<T>();
+}
+
+// Create a scalar constant.
+template <typename T>
+ma::ConstantOp CreateConst(ImplicitLocOpBuilder b, Type type, T value) {
+  if (type.isa<mlir::IntegerType>()) {
+    return b.create<ma::ConstantOp>(b.getIntegerAttr(type, value));
+  }
+  if (type.isa<mlir::FloatType>()) {
+    return b.create<ma::ConstantOp>(
+        b.getFloatAttr(type, static_cast<double>(value)));
+  }
+  LOG(FATAL) << "Constant type not supported: " << llvm_ir::DumpToString(type);
+}
+
+// Create a tensor constant.
+template <typename T>
+ma::ConstantOp CreateConst(ImplicitLocOpBuilder& b, Type type, T value,
+                           ArrayRef<int64_t> shape) {
+  auto tensor_type = mlir::RankedTensorType::get(shape, type);
+  if (auto int_type = type.dyn_cast<mlir::IntegerType>()) {
+    return b.create<ma::ConstantOp>(mlir::DenseElementsAttr::get(
+        tensor_type, mlir::APInt(int_type.getIntOrFloatBitWidth(), value)));
+  }
+  if (auto float_type = type.dyn_cast<mlir::FloatType>()) {
+    return b.create<ma::ConstantOp>(mlir::DenseElementsAttr::get(
+        tensor_type, b.getFloatAttr(type, static_cast<double>(value))));
+  }
+  LOG(FATAL) << "Constant type not supported: " << llvm_ir::DumpToString(type);
+}
+
+Value ZerosLike(ImplicitLocOpBuilder& b, Value x) {
+  if (auto src_shaped_ty = x.getType().dyn_cast<mlir::ShapedType>()) {
+    Type src_ty = src_shaped_ty.getElementType();
+    return CreateConst(b, src_ty, 0, src_shaped_ty.getShape());
+  }
+  return CreateConst(b, x.getType(), 0);
+}
+
+Value OnesLike(ImplicitLocOpBuilder& b, Value x) {
+  if (auto src_shaped_ty = x.getType().dyn_cast<mlir::ShapedType>()) {
+    Type src_ty = src_shaped_ty.getElementType();
+    return CreateConst(b, src_ty, 1, src_shaped_ty.getShape());
+  }
+  return CreateConst(b, x.getType(), 1);
+}
+
 // Triton type conversions.
 Value Cast(ImplicitLocOpBuilder& b, Value value, Type dst_element_ty) {
   Type src_ty = value.getType();
@@ -185,7 +246,7 @@ Value Cast(ImplicitLocOpBuilder& b, Value value, Type dst_element_ty) {
     return b.create<ma::TruncFOp>(dst_ty, Cast(b, value, b.getF32Type()));
   }
 
-  // Float <=> float
+  // float => float
   auto src_fp_element_ty = src_element_ty.dyn_cast<mlir::FloatType>();
   auto dst_fp_element_ty = dst_element_ty.dyn_cast<mlir::FloatType>();
   if (src_fp_element_ty && dst_fp_element_ty) {
@@ -196,6 +257,15 @@ Value Cast(ImplicitLocOpBuilder& b, Value value, Type dst_element_ty) {
       return b.create<ma::ExtFOp>(dst_ty, value);
     }
   }
+  // int => int
+  if (src_element_ty.isa<mlir::IntegerType>() &&
+      dst_element_ty.isa<mlir::IntegerType>()) {
+    if (src_element_ty.getIntOrFloatBitWidth() <
+        dst_element_ty.getIntOrFloatBitWidth()) {
+      return b.create<ma::ExtSIOp>(dst_ty, value);
+    }
+    return b.create<ma::TruncIOp>(dst_ty, value);
+  }
   // int => float
   if (src_element_ty.isa<mlir::IntegerType>() && dst_fp_element_ty) {
     // TODO(b/266862493): Support unsigned integer types.
@@ -207,6 +277,10 @@ Value Cast(ImplicitLocOpBuilder& b, Value value, Type dst_element_ty) {
   // float => int
   if (src_fp_element_ty && dst_element_ty.isa<mlir::IntegerType>()) {
     // TODO(b/266862493): Support unsigned integer types.
+    if (dst_element_ty.isInteger(1)) {
+      return b.create<ma::CmpFOp>(ma::CmpFPredicate::UNE, value,
+                                  ZerosLike(b, value));
+    }
     return b.create<ma::FPToSIOp>(dst_ty, value);
   }
 
@@ -215,44 +289,6 @@ Value Cast(ImplicitLocOpBuilder& b, Value value, Type dst_element_ty) {
              << llvm_ir::DumpToString(dst_element_ty);
 }
 
-// Get the value of the scalar constant's literal in a C++ type.
-template <typename T>
-T ScalarConstantValue(const HloInstruction& instr, PrimitiveType dst_type) {
-  CHECK(hlo_query::IsScalarConstant(&instr));
-  StatusOr<Literal> converted = instr.literal().Convert(dst_type);
-  TF_CHECK_OK(converted.status());
-  return converted.value().GetFirstElement<T>();
-}
-
-// Create a scalar constant.
-template <typename T>
-ma::ConstantOp CreateConst(ImplicitLocOpBuilder b, Type type, T value) {
-  if (type.isa<mlir::IntegerType>()) {
-    return b.create<ma::ConstantOp>(b.getIntegerAttr(type, value));
-  }
-  if (type.isa<mlir::FloatType>()) {
-    return b.create<ma::ConstantOp>(
-        b.getFloatAttr(type, static_cast<double>(value)));
-  }
-  LOG(FATAL) << "Constant type not supported: " << llvm_ir::DumpToString(type);
-}
-
-// Create a tensor constant.
-template <typename T>
-ma::ConstantOp CreateConst(ImplicitLocOpBuilder& b, Type type, T value,
-                           ArrayRef<int64_t> shape) {
-  auto tensor_type = mlir::RankedTensorType::get(shape, type);
-  if (auto int_type = type.dyn_cast<mlir::IntegerType>()) {
-    return b.create<ma::ConstantOp>(mlir::DenseElementsAttr::get(
-        tensor_type, mlir::APInt(int_type.getIntOrFloatBitWidth(), value)));
-  }
-  if (auto float_type = type.dyn_cast<mlir::FloatType>()) {
-    return b.create<ma::ConstantOp>(mlir::DenseElementsAttr::get(
-        tensor_type, b.getFloatAttr(type, static_cast<double>(value))));
-  }
-  LOG(FATAL) << "Constant type not supported: " << llvm_ir::DumpToString(type);
-}
-
 Value Subtract(ImplicitLocOpBuilder& b, ValueRange values) {
   if (mlir::getElementTypeOrSelf(values[0]).isa<mlir::IntegerType>()) {
     return b.create<ma::SubIOp>(values[0], values[1]);
@@ -287,22 +323,6 @@ Value Minimum(ImplicitLocOpBuilder& b, ValueRange values) {
   return b.create<ma::SelectOp>(cmp, values[0], values[1]);
 }
 
-Value ZerosLike(ImplicitLocOpBuilder& b, Value x) {
-  if (auto src_shaped_ty = x.getType().dyn_cast<mlir::ShapedType>()) {
-    Type src_ty = src_shaped_ty.getElementType();
-    return CreateConst(b, src_ty, 0, src_shaped_ty.getShape());
-  }
-  return CreateConst(b, x.getType(), 0);
-}
-
-Value OnesLike(ImplicitLocOpBuilder& b, Value x) {
-  if (auto src_shaped_ty = x.getType().dyn_cast<mlir::ShapedType>()) {
-    Type src_ty = src_shaped_ty.getElementType();
-    return CreateConst(b, src_ty, 1, src_shaped_ty.getShape());
-  }
-  return CreateConst(b, x.getType(), 1);
-}
-
 // TODO(b/269489810): Contribute nicer builders to Triton, so we don't need to
 // define these utilities.
 Value Splat(ImplicitLocOpBuilder& b, Value value, ArrayRef<int64_t> shape) {
@@ -1633,13 +1653,15 @@ StatusOr<TritonWrapperResult> TritonWrapper(
   SmallVector<Type> fn_arg_types;
   for (HloInstruction* p : hlo_computation->parameter_instructions()) {
     fn_arg_types.push_back(mt::PointerType::get(
-        TritonType(b, p->shape().element_type()), mn::kGlobalMemorySpace));
+        StorageType(b, TritonType(b, p->shape().element_type())),
+        mn::kGlobalMemorySpace));
   }
 
   for (const ShapeUtil::IndexedShape& s :
        ShapeUtil::GetLeafShapes(hlo_computation->root_instruction()->shape())) {
     fn_arg_types.push_back(mt::PointerType::get(
-        TritonType(b, s.shape.element_type()), mn::kGlobalMemorySpace));
+        StorageType(b, TritonType(b, s.shape.element_type())),
+        mn::kGlobalMemorySpace));
   }
 
   auto fn = b.create<mt::FuncOp>(loc, fn_name,
diff --git a/third_party/xla/xla/service/gpu/ir_emitter_triton_parametrized_test.cc b/third_party/xla/xla/service/gpu/ir_emitter_triton_parametrized_test.cc
index b68ce63f6a2096..832c7a295b981a 100644
--- a/third_party/xla/xla/service/gpu/ir_emitter_triton_parametrized_test.cc
+++ b/third_party/xla/xla/service/gpu/ir_emitter_triton_parametrized_test.cc
@@ -14,28 +14,40 @@ limitations under the License.
 ==============================================================================*/
 
 #include <algorithm>
+#include <array>
+#include <memory>
 #include <string>
 #include <tuple>
 #include <vector>
 
+#include <gmock/gmock.h>
 #include <gtest/gtest.h>
 #include "absl/base/optimization.h"
+#include "absl/strings/str_cat.h"
 #include "absl/strings/str_replace.h"
 #include "absl/strings/substitute.h"
 #include "xla/comparison_util.h"
 #include "xla/error_spec.h"
+#include "xla/hlo/ir/hlo_instruction.h"
+#include "xla/hlo/ir/hlo_module.h"
 #include "xla/hlo/ir/hlo_opcode.h"
 #include "xla/primitive_util.h"
 #include "xla/service/gpu/gemm_rewriter_triton.h"
 #include "xla/service/gpu/tests/gpu_codegen_test.h"
+#include "xla/service/pattern_matcher.h"
+#include "xla/service/pattern_matcher_gmock.h"
 #include "xla/stream_executor/device_description.h"
+#include "xla/xla.pb.h"
 #include "xla/xla_data.pb.h"
+#include "tsl/platform/statusor.h"
 #include "tsl/platform/tensor_float_32_utils.h"
 
 namespace xla {
 namespace gpu {
 namespace {
 
+namespace m = ::xla::match;
+
 struct MixTypeParams {
   PrimitiveType lhs_ty;
   PrimitiveType rhs_ty;
@@ -143,6 +155,19 @@ class NoTF32Test : public GpuCodegenTest {
     tsl::enable_tensor_float_32_execution(tf32_state_);
   }
 
+  DebugOptions GetDebugOptionsForTest() override {
+    DebugOptions debug_options = GpuCodegenTest::GetDebugOptionsForTest();
+    debug_options.set_xla_gpu_triton_gemm_any(true);
+    return debug_options;
+  }
+
+  se::CudaComputeCapability GetCudaComputeCapability() {
+    return backend()
+        .default_stream_executor()
+        ->GetDeviceDescription()
+        .cuda_compute_capability();
+  }
+
  private:
   bool tf32_state_;
 };
@@ -503,25 +528,17 @@ INSTANTIATE_TEST_SUITE_P(
 
 class SelectTest : public NoTF32Test,
                    public ::testing::WithParamInterface<
-                       std::tuple<PrimitiveType, PrimitiveType>> {
- public:
-  se::CudaComputeCapability GetCudaComputeCapability() {
-    return backend()
-        .default_stream_executor()
-        ->GetDeviceDescription()
-        .cuda_compute_capability();
-  }
-};
+                       std::tuple<PrimitiveType, PrimitiveType>> {};
 
 TEST_P(SelectTest, SelectFusionExecutesCorrectly) {
-  PrimitiveType data_type1;
-  PrimitiveType data_type2;
+  PrimitiveType data_type1, data_type2;
   std::tie(data_type1, data_type2) = GetParam();
-
-  if ((data_type1 == BF16 || data_type2 == BF16) &&
-      !GetCudaComputeCapability().IsAtLeast(
-          se::CudaComputeCapability::AMPERE)) {
-    GTEST_SKIP() << "No BF16 before Ampere.";
+  for (const PrimitiveType type : {data_type1, data_type2}) {
+    if (!IsTritonSupportedDataType(type, GetCudaComputeCapability())) {
+      GTEST_SKIP() << absl::Substitute(
+          "Unsupported data type: $0",
+          primitive_util::LowercasePrimitiveTypeName(type));
+    }
   }
 
   const std::string hHloTestTemplate = R"(
@@ -613,7 +630,7 @@ ENTRY e {
       /*run_hlo_passes=*/false));
 }
 
-std::string SelectTestParamsToString(
+std::string TwoPrimitiveTypesToString(
     const ::testing::TestParamInfo<std::tuple<PrimitiveType, PrimitiveType>>&
         data) {
   PrimitiveType data_type1;
@@ -624,17 +641,26 @@ std::string SelectTestParamsToString(
                       primitive_util::LowercasePrimitiveTypeName(data_type2));
 }
 
+// BF16: depending on the GPU generation.
+constexpr std::array<PrimitiveType, 7> kSupportedDataTypes{PRED, S8,  S16, S32,
+                                                           F16,  F32, BF16};
+
 INSTANTIATE_TEST_SUITE_P(
     SelectTestSuite, SelectTest,
-    ::testing::Combine(::testing::Values(PRED, S8, S16, S32, F16, BF16, F32),
+    ::testing::Combine(::testing::ValuesIn(kSupportedDataTypes),
                        ::testing::Values(F16, BF16, F32)),
-    SelectTestParamsToString);
+    TwoPrimitiveTypesToString);
 
 class ConstantTest : public NoTF32Test,
                      public ::testing::WithParamInterface<PrimitiveType> {};
 
 TEST_P(ConstantTest, ConstantFusionExecutesCorrectly) {
-  PrimitiveType data_type = GetParam();
+  const PrimitiveType data_type = GetParam();
+  if (!IsTritonSupportedDataType(data_type, GetCudaComputeCapability())) {
+    GTEST_SKIP() << absl::Substitute(
+        "Unsupported data type: $0",
+        primitive_util::LowercasePrimitiveTypeName(data_type));
+  }
 
   const std::string hHloTestTemplate = R"(
 HloModule m, is_scheduled=true
@@ -716,8 +742,57 @@ ENTRY e {
       /*run_hlo_passes=*/false));
 }
 
-INSTANTIATE_TEST_SUITE_P(ConstantTestSuite, ConstantTest,
-                         ::testing::Values(PRED, S8, S16, S32, F16, F32));
+INSTANTIATE_TEST_SUITE_P(
+    ConstantTestSuite, ConstantTest, ::testing::ValuesIn(kSupportedDataTypes),
+    [](const ::testing::TestParamInfo<PrimitiveType> type) {
+      return primitive_util::LowercasePrimitiveTypeName(type.param);
+    });
+
+class ConvertTest : public NoTF32Test,
+                    public ::testing::WithParamInterface<
+                        std::tuple<PrimitiveType, PrimitiveType>> {};
+
+TEST_P(ConvertTest, ConvertFusionExecutesCorrectly) {
+  PrimitiveType data_type1, data_type2;
+  std::tie(data_type1, data_type2) = GetParam();
+  for (const PrimitiveType type : {data_type1, data_type2}) {
+    if (!IsTritonSupportedDataType(type, GetCudaComputeCapability())) {
+      GTEST_SKIP() << absl::Substitute(
+          "Unsupported data type: $0",
+          primitive_util::LowercasePrimitiveTypeName(type));
+    }
+  }
+
+  const std::string hlo_text = absl::Substitute(
+      R"(
+t {
+  p0 = $0[2,2] parameter(0)
+  p0c = $1[2,2] convert(p0)
+  p0cc = f32[2,2] convert(p0c)
+  p1 = f32[2,2] parameter(1)
+  ROOT r = f32[2,2] dot(p0cc, p1),
+    lhs_contracting_dims={1}, rhs_contracting_dims={0}
+}
+
+ENTRY e{
+  p0 = $0[2,2] parameter(0)
+  p1 = f32[2,2] parameter(1)
+  ROOT r = f32[2,2] fusion(p0, p1), kind=kCustom, calls=t,
+    backend_config={"kind":"__triton_gemm"}
+})",
+      primitive_util::LowercasePrimitiveTypeName(data_type1),
+      primitive_util::LowercasePrimitiveTypeName(data_type2));
+
+  MatchOptimizedHlo(hlo_text, R"(
+CHECK: block_m
+  )");
+}
+
+INSTANTIATE_TEST_SUITE_P(
+    ConvertTestSuite, ConvertTest,
+    ::testing::Combine(::testing::ValuesIn(kSupportedDataTypes),
+                       ::testing::ValuesIn(kSupportedDataTypes)),
+    TwoPrimitiveTypesToString);
 
 class TritonSoftmaxTest : public GpuCodegenTest,
                           public ::testing::WithParamInterface<PrimitiveType> {

From be8bc81b566e89b6f9958d2ca0d84bb250ebcfeb Mon Sep 17 00:00:00 2001
From: Sergei Lebedev <slebedev@google.com>
Date: Fri, 22 Sep 2023 07:51:57 -0700
Subject: [PATCH 143/567] Added a few missing members to jax_jit.JitState

PiperOrigin-RevId: 567621107
---
 third_party/xla/xla/python/xla_extension/jax_jit.pyi | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/third_party/xla/xla/python/xla_extension/jax_jit.pyi b/third_party/xla/xla/python/xla_extension/jax_jit.pyi
index 4c7cd95f446fab..e495b5fe8db9a2 100644
--- a/third_party/xla/xla/python/xla_extension/jax_jit.pyi
+++ b/third_party/xla/xla/python/xla_extension/jax_jit.pyi
@@ -26,8 +26,10 @@ CompiledFunction = xla_extension.CompiledFunction
 class JitState:
   disable_jit: Optional[bool]
   enable_x64: Optional[bool]
-  extra_jit_context: Any
-  post_hook: Optional[Callable]
+  enable_memories: Optional[bool]
+  default_device: Optional[Any]
+  extra_jit_context: Optional[Any]
+  post_hook: Optional[Callable[..., Any]]
 
 def global_state() -> JitState: ...
 def thread_local_state() -> JitState: ...

From b698c540108c00752a4668765bb5b2af762519cc Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 22 Sep 2023 08:28:12 -0700
Subject: [PATCH 144/567] Integrate LLVM at llvm/llvm-project@058222b23166

Updates LLVM usage to match
[058222b23166](https://github.com/llvm/llvm-project/commit/058222b23166)

PiperOrigin-RevId: 567628854
---
 third_party/llvm/generated.patch              | 53 -------------------
 third_party/llvm/workspace.bzl                |  4 +-
 .../xla/xla/mlir_hlo/gml_st/IR/gml_st_ops.td  |  6 +--
 .../cpu_tiling/fusion_planning_for_cpu.cc     |  4 +-
 .../transform_elementwise_for_cpu.cc          | 10 ++--
 .../cpu_tiling/transform_reduce_for_cpu.cc    |  2 +-
 .../gml_st/transforms/fusion/fusion.cc        |  4 +-
 .../transforms/scalarization/scalarization.cc | 11 ++--
 .../xla/xla/mlir_hlo/thlo/IR/thlo_ops.cc      | 29 +++++-----
 .../xla/xla/mlir_hlo/thlo/IR/thlo_ops.td      | 30 +++++------
 .../tools/mlir_interpreter/dialects/vector.cc |  4 +-
 11 files changed, 50 insertions(+), 107 deletions(-)

diff --git a/third_party/llvm/generated.patch b/third_party/llvm/generated.patch
index 409e0541024c8e..509398da979e83 100644
--- a/third_party/llvm/generated.patch
+++ b/third_party/llvm/generated.patch
@@ -1,54 +1 @@
 Auto generated patch. Do not edit or delete it, even if empty.
-diff -ruN --strip-trailing-cr a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
---- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
-+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
-@@ -5700,7 +5700,7 @@
-     if (OpOpcode == ISD::TRUNCATE) {
-       SDValue OpOp = N1.getOperand(0);
-       if (OpOp.getValueType() == VT) {
--        if (OpOp.getOpcode() == ISD::AssertZext) {
-+        if (OpOp.getOpcode() == ISD::AssertZext && N1->hasOneUse()) {
-           APInt HiBits = APInt::getBitsSetFrom(VT.getScalarSizeInBits(),
-                                                N1.getScalarValueSizeInBits());
-           if (MaskedValueIsZero(OpOp, HiBits)) {
-diff -ruN --strip-trailing-cr a/llvm/lib/Target/X86/X86ISelLoweringCall.cpp b/llvm/lib/Target/X86/X86ISelLoweringCall.cpp
---- a/llvm/lib/Target/X86/X86ISelLoweringCall.cpp
-+++ b/llvm/lib/Target/X86/X86ISelLoweringCall.cpp
-@@ -2645,8 +2645,7 @@
-   for (;;) {
-     // Look through nodes that don't alter the bits of the incoming value.
-     unsigned Op = Arg.getOpcode();
--    if (Op == ISD::ZERO_EXTEND || Op == ISD::ANY_EXTEND || Op == ISD::BITCAST ||
--        Op == ISD::AssertZext) {
-+    if (Op == ISD::ZERO_EXTEND || Op == ISD::ANY_EXTEND || Op == ISD::BITCAST) {
-       Arg = Arg.getOperand(0);
-       continue;
-     }
-diff -ruN --strip-trailing-cr a/llvm/test/CodeGen/AArch64/setcc_knownbits.ll b/llvm/test/CodeGen/AArch64/setcc_knownbits.ll
---- a/llvm/test/CodeGen/AArch64/setcc_knownbits.ll
-+++ b/llvm/test/CodeGen/AArch64/setcc_knownbits.ll
-@@ -4,6 +4,8 @@
- define i1 @load_bv_v4i8(i1 zeroext %a) {
- ; CHECK-LABEL: load_bv_v4i8:
- ; CHECK:       // %bb.0:
-+; CHECK-NEXT:    cmp w0, #0
-+; CHECK-NEXT:    cset w0, ne
- ; CHECK-NEXT:    ret
-   %b = zext i1 %a to i32
-   %c = icmp eq i32 %b, 1
-diff -ruN --strip-trailing-cr a/llvm/test/CodeGen/RISCV/rvv/fold-vp-fadd-and-vp-fmul.ll b/llvm/test/CodeGen/RISCV/rvv/fold-vp-fadd-and-vp-fmul.ll
---- a/llvm/test/CodeGen/RISCV/rvv/fold-vp-fadd-and-vp-fmul.ll
-+++ b/llvm/test/CodeGen/RISCV/rvv/fold-vp-fadd-and-vp-fmul.ll
-@@ -62,9 +62,9 @@
- ; CHECK-LABEL: fma_reassociate:
- ; CHECK:       # %bb.0:
- ; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
--; CHECK-NEXT:    vfmadd.vv v11, v10, v12, v0.t
--; CHECK-NEXT:    vfmadd.vv v9, v8, v11, v0.t
--; CHECK-NEXT:    vmv.v.v v8, v9
-+; CHECK-NEXT:    vfmadd.vv v9, v8, v12, v0.t
-+; CHECK-NEXT:    vfmadd.vv v11, v10, v9, v0.t
-+; CHECK-NEXT:    vmv.v.v v8, v11
- ; CHECK-NEXT:    ret
-   %1 = call fast <vscale x 1 x double> @llvm.vp.fmul.nxv1f64(<vscale x 1 x double> %a, <vscale x 1 x double> %b, <vscale x 1 x i1> %m, i32 %vl)
-   %2 = call fast <vscale x 1 x double> @llvm.vp.fmul.nxv1f64(<vscale x 1 x double> %c, <vscale x 1 x double> %d, <vscale x 1 x i1> %m, i32 %vl)
diff --git a/third_party/llvm/workspace.bzl b/third_party/llvm/workspace.bzl
index 5abb4304cc53f3..288b1748010f35 100644
--- a/third_party/llvm/workspace.bzl
+++ b/third_party/llvm/workspace.bzl
@@ -4,8 +4,8 @@ load("//third_party:repo.bzl", "tf_http_archive")
 
 def repo(name):
     """Imports LLVM."""
-    LLVM_COMMIT = "3b0f812b9af459c4f857e4a7ffffa01f7a21446e"
-    LLVM_SHA256 = "47f8c81275437fb4ebcf0286125d683fb946cbddf0b725dc61d786141b32bc08"
+    LLVM_COMMIT = "058222b2316615194c089f2bc68d11341f39d26e"
+    LLVM_SHA256 = "99d3c38eb11dee8f00bd74b69152d961ab73cf4488842f6120e81342eeb94a3b"
 
     tf_http_archive(
         name = name,
diff --git a/third_party/xla/xla/mlir_hlo/gml_st/IR/gml_st_ops.td b/third_party/xla/xla/mlir_hlo/gml_st/IR/gml_st_ops.td
index 8a78bc5102ed45..b19d067027d752 100644
--- a/third_party/xla/xla/mlir_hlo/gml_st/IR/gml_st_ops.td
+++ b/third_party/xla/xla/mlir_hlo/gml_st/IR/gml_st_ops.td
@@ -51,10 +51,8 @@ def GMLST_FusionOp : GMLST_Op<"fusion", [
     YieldOp getTerminator();
 
     // Implement method necessary for DestinationStyleOpInterface.
-    std::pair<int64_t, int64_t> getDpsInitsPositionRange() {
-      int64_t numOperands = this->getNumOperands();
-      int64_t numInits = this->getInits().size();
-      return {numOperands - numInits, numOperands};
+    mlir::MutableOperandRange getDpsInitsMutable() {
+      return getInitsMutable();
     }
   }];
 }
diff --git a/third_party/xla/xla/mlir_hlo/gml_st/transforms/cpu_tiling/fusion_planning_for_cpu.cc b/third_party/xla/xla/mlir_hlo/gml_st/transforms/cpu_tiling/fusion_planning_for_cpu.cc
index 73a5a3a9ee45d2..a6d127c6eb0de4 100644
--- a/third_party/xla/xla/mlir_hlo/gml_st/transforms/cpu_tiling/fusion_planning_for_cpu.cc
+++ b/third_party/xla/xla/mlir_hlo/gml_st/transforms/cpu_tiling/fusion_planning_for_cpu.cc
@@ -69,8 +69,8 @@ bool allowedToFuse(Operation* consumerOp, Operation* producerOp) {
     auto dstStyleOp = dyn_cast<DestinationStyleOpInterface>(consumerOp);
     if (!dstStyleOp) return false;
 
-    if (llvm::any_of(dstStyleOp.getDpsInitOperands(), [&](OpOperand* operand) {
-          return operand->get().getDefiningOp() == producerOp;
+    if (llvm::any_of(dstStyleOp.getDpsInits(), [&](Value operand) {
+          return operand.getDefiningOp() == producerOp;
         }))
       return true;
   }
diff --git a/third_party/xla/xla/mlir_hlo/gml_st/transforms/cpu_tiling/transform_elementwise_for_cpu.cc b/third_party/xla/xla/mlir_hlo/gml_st/transforms/cpu_tiling/transform_elementwise_for_cpu.cc
index 309d2b02f592c5..b9a058d25904db 100644
--- a/third_party/xla/xla/mlir_hlo/gml_st/transforms/cpu_tiling/transform_elementwise_for_cpu.cc
+++ b/third_party/xla/xla/mlir_hlo/gml_st/transforms/cpu_tiling/transform_elementwise_for_cpu.cc
@@ -65,7 +65,9 @@ Operation *findRootElementwiseOp(Operation *op, FusionFilterFn fusionFilterFn) {
       if (hasLabel(owner, kTransformedLabel)) continue;
       if (hasLabel(owner, kFusionPlanningLabel)) continue;
       if (auto dpsOp = dyn_cast<DestinationStyleOpInterface>(owner)) {
-        if (llvm::is_contained(dpsOp.getDpsInitOperands(), &use)) continue;
+        SmallVector<OpOperand *> opOperands = llvm::to_vector(llvm::map_range(
+            dpsOp.getDpsInitsMutable(), [](OpOperand &o) { return &o; }));
+        if (llvm::is_contained(opOperands, &use)) continue;
       }
       curOp = owner;
       rootOp = curOp;
@@ -205,9 +207,9 @@ FusionCluster findElementwiseCluster(Operation *rootOp,
   // Add tensor.empty ops to the cluster.
   for (auto *op : resultOps) {
     if (auto dpsOp = dyn_cast<DestinationStyleOpInterface>(op)) {
-      for (auto &operand : dpsOp.getDpsInitOperands()) {
-        if (auto emptyOp = dyn_cast_or_null<tensor::EmptyOp>(
-                operand->get().getDefiningOp()))
+      for (auto operand : dpsOp.getDpsInits()) {
+        if (auto emptyOp =
+                dyn_cast_or_null<tensor::EmptyOp>(operand.getDefiningOp()))
           fusionCluster.operations.insert(emptyOp);
       }
     }
diff --git a/third_party/xla/xla/mlir_hlo/gml_st/transforms/cpu_tiling/transform_reduce_for_cpu.cc b/third_party/xla/xla/mlir_hlo/gml_st/transforms/cpu_tiling/transform_reduce_for_cpu.cc
index 9910cd9c3c4b23..766c33a4573fa1 100644
--- a/third_party/xla/xla/mlir_hlo/gml_st/transforms/cpu_tiling/transform_reduce_for_cpu.cc
+++ b/third_party/xla/xla/mlir_hlo/gml_st/transforms/cpu_tiling/transform_reduce_for_cpu.cc
@@ -74,7 +74,7 @@ LogicalResult validateOp(linalg::ReduceOp reduceOp, PatternRewriter &rewriter,
     return rewriter.notifyMatchFailure(
         reduceOp, "expects 1 reduction dimension element. 0 or > 1 received.");
   }
-  OpOperandVector operands = reduceOp.getDpsInputOperands();
+  SmallVector<OpOperand *> operands = reduceOp.getDpsInputOperands();
   if (operands.size() != 1) {
     return rewriter.notifyMatchFailure(reduceOp,
                                        "expects 1 operand. 0 or > 1 received.");
diff --git a/third_party/xla/xla/mlir_hlo/gml_st/transforms/fusion/fusion.cc b/third_party/xla/xla/mlir_hlo/gml_st/transforms/fusion/fusion.cc
index f0723f7b00e142..57e6b3a5cc42d0 100644
--- a/third_party/xla/xla/mlir_hlo/gml_st/transforms/fusion/fusion.cc
+++ b/third_party/xla/xla/mlir_hlo/gml_st/transforms/fusion/fusion.cc
@@ -510,8 +510,8 @@ SmallVector<Value> getRootOpInitOperands(PatternRewriter& rewriter,
 
   SmallVector<Value> initOperands;
 
-  for (auto* operand : dstStyleOp.getDpsInitOperands()) {
-    initOperands.push_back(getTiedSourceOp(rewriter, operand, fusionCluster));
+  for (OpOperand& operand : dstStyleOp.getDpsInitsMutable()) {
+    initOperands.push_back(getTiedSourceOp(rewriter, &operand, fusionCluster));
   }
 
   return initOperands;
diff --git a/third_party/xla/xla/mlir_hlo/gml_st/transforms/scalarization/scalarization.cc b/third_party/xla/xla/mlir_hlo/gml_st/transforms/scalarization/scalarization.cc
index c8115b6889e1b0..8bc17d17b4b3c5 100644
--- a/third_party/xla/xla/mlir_hlo/gml_st/transforms/scalarization/scalarization.cc
+++ b/third_party/xla/xla/mlir_hlo/gml_st/transforms/scalarization/scalarization.cc
@@ -505,10 +505,13 @@ LogicalResult scalarizeLinalgOp(LinalgOp linalgOp, PatternRewriter &rewriter) {
   if (isa<linalg::FillOp>(linalgOp)) {
     if (llvm::all_of(linalgOp->getUses(), [&](OpOperand &use) {
           Operation *user = use.getOwner();
-          return isa<DestinationStyleOpInterface>(user) &&
-                 llvm::is_contained(cast<DestinationStyleOpInterface>(user)
-                                        .getDpsInitOperands(),
-                                    &use);
+          if (auto dpsOp = dyn_cast<DestinationStyleOpInterface>(user)) {
+            SmallVector<OpOperand *> opOperands = llvm::to_vector(
+                llvm::map_range(dpsOp.getDpsInitsMutable(),
+                                [](OpOperand &o) { return &o; }));
+            return llvm::is_contained(opOperands, &use);
+          }
+          return false;
         }))
       return failure();
   }
diff --git a/third_party/xla/xla/mlir_hlo/thlo/IR/thlo_ops.cc b/third_party/xla/xla/mlir_hlo/thlo/IR/thlo_ops.cc
index 4d09db0186585c..d56141388076da 100644
--- a/third_party/xla/xla/mlir_hlo/thlo/IR/thlo_ops.cc
+++ b/third_party/xla/xla/mlir_hlo/thlo/IR/thlo_ops.cc
@@ -185,18 +185,17 @@ SmallVector<Range> getIterationDomainForTensor(OpBuilder &b, Location loc,
 static void getDstStyleOpEffectsImpl(
     SmallVectorImpl<SideEffects::EffectInstance<MemoryEffects::Effect>>
         &effects,
-    ValueRange results, const OpOperandVector &inputOperands,
-    const OpOperandVector &outputOperands) {
-  for (auto *operand : inputOperands) {
-    if (!operand->get().getType().isa<MemRefType>()) continue;
-    effects.emplace_back(MemoryEffects::Read::get(), operand->get(),
+    ValueRange results, ValueRange inputOperands, ValueRange outputOperands) {
+  for (auto operand : inputOperands) {
+    if (!operand.getType().isa<MemRefType>()) continue;
+    effects.emplace_back(MemoryEffects::Read::get(), operand,
                          SideEffects::DefaultResource::get());
   }
-  for (auto *operand : outputOperands) {
-    if (!operand->get().getType().isa<MemRefType>()) continue;
-    effects.emplace_back(MemoryEffects::Read::get(), operand->get(),
+  for (auto operand : outputOperands) {
+    if (!operand.getType().isa<MemRefType>()) continue;
+    effects.emplace_back(MemoryEffects::Read::get(), operand,
                          SideEffects::DefaultResource::get());
-    effects.emplace_back(MemoryEffects::Write::get(), operand->get(),
+    effects.emplace_back(MemoryEffects::Write::get(), operand,
                          SideEffects::DefaultResource::get());
   }
 }
@@ -557,7 +556,7 @@ void ConcatenateOp::getEffects(
     SmallVectorImpl<SideEffects::EffectInstance<MemoryEffects::Effect>>
         &effects) {
   getDstStyleOpEffectsImpl(effects, getOperation()->getResults(),
-                           getDpsInputOperands(), getDpsInitOperands());
+                           getDpsInputs(), getDpsInits());
 }
 
 //===----------------------------------------------------------------------===//
@@ -713,7 +712,7 @@ void DynamicBroadcastInDimOp::getEffects(
     SmallVectorImpl<SideEffects::EffectInstance<MemoryEffects::Effect>>
         &effects) {
   getDstStyleOpEffectsImpl(effects, getOperation()->getResults(),
-                           getDpsInputOperands(), getDpsInitOperands());
+                           getDpsInputs(), getDpsInits());
 }
 
 //===----------------------------------------------------------------------===//
@@ -879,7 +878,7 @@ void ScatterOp::getEffects(
     SmallVectorImpl<SideEffects::EffectInstance<MemoryEffects::Effect>>
         &effects) {
   getDstStyleOpEffectsImpl(effects, getOperation()->getResults(),
-                           getDpsInputOperands(), getDpsInitOperands());
+                           getDpsInputs(), getDpsInits());
 }
 
 //===----------------------------------------------------------------------===//
@@ -981,7 +980,7 @@ void GatherOp::getEffects(
     SmallVectorImpl<SideEffects::EffectInstance<MemoryEffects::Effect>>
         &effects) {
   getDstStyleOpEffectsImpl(effects, getOperation()->getResults(),
-                           getDpsInputOperands(), getDpsInitOperands());
+                           getDpsInputs(), getDpsInits());
 }
 
 //===----------------------------------------------------------------------===//
@@ -1231,7 +1230,7 @@ void SortOp::getEffects(
     SmallVectorImpl<SideEffects::EffectInstance<MemoryEffects::Effect>>
         &effects) {
   getDstStyleOpEffectsImpl(effects, getOperation()->getResults(),
-                           getDpsInputOperands(), getDpsInitOperands());
+                           getDpsInputs(), getDpsInits());
 }
 
 //===----------------------------------------------------------------------===//
@@ -1354,7 +1353,7 @@ void ReverseOp::getEffects(
     SmallVectorImpl<SideEffects::EffectInstance<MemoryEffects::Effect>>
         &effects) {
   getDstStyleOpEffectsImpl(effects, getOperation()->getResults(),
-                           getDpsInputOperands(), getDpsInitOperands());
+                           getDpsInputs(), getDpsInits());
 }
 
 }  // namespace thlo
diff --git a/third_party/xla/xla/mlir_hlo/thlo/IR/thlo_ops.td b/third_party/xla/xla/mlir_hlo/thlo/IR/thlo_ops.td
index aec9816e8a2864..5346d413842e51 100644
--- a/third_party/xla/xla/mlir_hlo/thlo/IR/thlo_ops.td
+++ b/third_party/xla/xla/mlir_hlo/thlo/IR/thlo_ops.td
@@ -83,9 +83,8 @@ def THLO_ConcatenateOp : THLO_DstStyleOp<"concatenate", [
 
   let extraClassDeclaration = [{
     // Implement method necessary for DestinationStyleOpInterface.
-    std::pair<int64_t, int64_t> getDpsInitsPositionRange() {
-      int64_t getNumOperands = this->getNumOperands();
-      return {getNumOperands - 1, getNumOperands};
+    mlir::MutableOperandRange getDpsInitsMutable() {
+      return getInitMutable();
     }
   }];
 }
@@ -130,9 +129,8 @@ def THLO_DynamicBroadcastInDimOp : THLO_DstStyleOp<"dynamic_broadcast_in_dim", [
 
   let extraClassDeclaration = [{
     // Implement method necessary for DestinationStyleOpInterface.
-    std::pair<int64_t, int64_t> getDpsInitsPositionRange() {
-      int64_t getNumOperands = this->getNumOperands();
-      return {getNumOperands - 1, getNumOperands};
+    mlir::MutableOperandRange getDpsInitsMutable() {
+      return getInitMutable();
     }
   }];
 }
@@ -176,9 +174,8 @@ def THLO_GatherOp : THLO_DstStyleOp<"gather", [
 
   let extraClassDeclaration = [{
     // Implement method necessary for DestinationStyleOpInterface.
-    std::pair<int64_t, int64_t> getDpsInitsPositionRange() {
-      int64_t getNumOperands = this->getNumOperands();
-      return {getNumOperands - 1, getNumOperands};
+    mlir::MutableOperandRange getDpsInitsMutable() {
+      return getInitMutable();
     }
   }];
 }
@@ -240,9 +237,8 @@ def THLO_ScatterOp : THLO_DstStyleOp<"scatter", [
     int64_t getIndicesCount() { return getIndices().getType().getDimSize(0); }
 
     // Implement method necessary for DestinationStyleOpInterface.
-    std::pair<int64_t, int64_t> getDpsInitsPositionRange() {
-      int64_t getNumOperands = this->getNumOperands();
-      return {getNumOperands - 1, getNumOperands};
+    mlir::MutableOperandRange getDpsInitsMutable() {
+      return getInitMutable();
     }
   }];
 }
@@ -298,9 +294,8 @@ def THLO_SortOp : THLO_DstStyleOp<"sort", [
 
   let extraClassDeclaration = [{
     // Implement method necessary for DestinationStyleOpInterface.
-    std::pair<int64_t, int64_t> getDpsInitsPositionRange() {
-      int64_t getNumOperands = this->getNumOperands();
-      return {getNumOperands - getInits().size(), getNumOperands};
+    mlir::MutableOperandRange getDpsInitsMutable() {
+      return getInitsMutable();
     }
   }];
 }
@@ -334,9 +329,8 @@ def THLO_ReverseOp : THLO_DstStyleOp<"reverse", [
 
   let extraClassDeclaration = [{
     // Implement method necessary for DestinationStyleOpInterface.
-    std::pair<int64_t, int64_t> getDpsInitsPositionRange() {
-      int64_t getNumOperands = this->getNumOperands();
-      return {getNumOperands - 1, getNumOperands};
+    mlir::MutableOperandRange getDpsInitsMutable() {
+      return getInitMutable();
     }
   }];
 }
diff --git a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/vector.cc b/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/vector.cc
index 6d8563ff6e684d..47cf0267bc8605 100644
--- a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/vector.cc
+++ b/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/vector.cc
@@ -284,7 +284,7 @@ InterpreterValue extract(InterpreterState& state, vector::ExtractOp extract,
                          const InterpreterValue& vector) {
   auto result = vector;
   auto& resultView = result.view();
-  for (int64_t offset : extract.getPosition()) {
+  for (int64_t offset : extract.getStaticPosition()) {
     state.checkSuccess(resultView.slice(0, offset), "index out of bounds");
   }
   return resultView.rank() == 0 ? result.extractElement({}) : result;
@@ -374,7 +374,7 @@ InterpreterValue insert(InterpreterState& state, vector::InsertOp insert,
   auto result = dst.clone();
   auto resultSlice = result;
   auto& resultSliceView = resultSlice.view();
-  for (int64_t offset : insert.getPosition()) {
+  for (int64_t offset : insert.getStaticPosition()) {
     state.checkSuccess(resultSliceView.slice(0, offset), "index out of bounds");
   }
   resultSlice.fill([&](auto indices) { return src.extractElement(indices); });

From 655db6e4db11c3bc310e46145d7bb2292ea3063c Mon Sep 17 00:00:00 2001
From: Hyeontaek Lim <hyeontaek@google.com>
Date: Fri, 22 Sep 2023 08:36:32 -0700
Subject: [PATCH 145/567] Clarify that PjRtClient and PjRtDevice memory_spaces
 are in no particular order

PiperOrigin-RevId: 567630629
---
 third_party/xla/xla/pjrt/pjrt_client.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/third_party/xla/xla/pjrt/pjrt_client.h b/third_party/xla/xla/pjrt/pjrt_client.h
index 1ac2461e8fc136..5038a3e69dd399 100644
--- a/third_party/xla/xla/pjrt/pjrt_client.h
+++ b/third_party/xla/xla/pjrt/pjrt_client.h
@@ -171,6 +171,7 @@ class PjRtDevice {
   }
 
   // Returns all memory spaces attached to this device.
+  // The memory spaces are in no particular order.
   virtual absl::Span<PjRtMemorySpace* const> memory_spaces() const = 0;
 
   // Returns the default memory space attached to this device.
@@ -485,6 +486,7 @@ class PjRtClient {
       int local_hardware_id) const = 0;
 
   // Return all memory spaces owned by the client.
+  // The memory spaces are in no particular order.
   virtual absl::Span<PjRtMemorySpace* const> memory_spaces() const = 0;
 
   // Return an ID that identifies the platform (CPU/GPU/TPU).

From 5b7a8b07e6ead1db358fa0a5cbff930a83bbedfd Mon Sep 17 00:00:00 2001
From: Peter Hawkins <phawkins@google.com>
Date: Fri, 22 Sep 2023 09:01:16 -0700
Subject: [PATCH 146/567] Generate CUDA stubs using implib.so, rather than by
 writing C++ stubs.

We load CUDA libraries lazily using dlopen()/dlsym() primarily to comply with the manylinux rules for Python wheels, which require that libraries in a wheel only link directly against an allowlist of libraries. In order to access CUDA using dlopen()/dlsym() without changing any of our CUDA-using code, TSL contains stub implementations of CUDA APIs that, when invoked, load the relevant library, obtain the requested symbol using dlsym(), and call into that symbol.

The current CUDA stub libraries were constructed using a tool based on clang inside Google, which parses the CUDA headers and generates stub code for each API. It is therefore difficult to update the stubs without access to that tool. Each stub generated by the tool is a C++ function, which is very verbose. Further, a number of manual edits are required to the generated code, making maintenance tedious.

However, there is a better way. implib.so
(https://github.com/yugr/Implib.so) is a tool for automatically generating stubs from a .so file. This tool is considerably simpler because it generates a stub using assembly language, in which it turns out we do not need to know the type signature of the function being called. A completely generic trampoline function will do, with little or no bespoke knowledge of each function. We can adapt it to solve our CUDA stub problem.

implib-gen.py, which is the tool implib.so provides, isn't perfect for our needs, because it requires access to the .so file at stub generation time, which we don't have in our Bazel build. Instead, we can split it into two phases:

* get_symbols.py, which, given a .so file, extracts a list of public symbols that should be present in a stub. That list of symbols is checked into the TSL tree.
* make_stub.py, which, given a list of symbols, generates trampolines for each function.

This change switches the TSL CUDA build to use make_stub.py to generate stubs from the list of symbols at Bazel build time, allowing us to delete over 130k lines of autogenerated C++ stub code.

PiperOrigin-RevId: 567635940
---
 tensorflow/opensource_only.files              |    3 +
 tensorflow/workspace2.bzl                     |    2 +
 third_party/implib_so/BUILD                   |   21 +
 third_party/implib_so/get_symbols.py          |   38 +
 third_party/implib_so/implib_so.BUILD         |   20 +
 third_party/implib_so/make_stub.py            |   68 +
 third_party/implib_so/workspace.bzl           |   13 +
 third_party/xla/opensource_only.files         |    3 +
 third_party/xla/third_party/implib_so/BUILD   |   23 +
 .../xla/third_party/implib_so/get_symbols.py  |   38 +
 .../xla/third_party/implib_so/implib_so.BUILD |   20 +
 .../xla/third_party/implib_so/make_stub.py    |   68 +
 .../xla/third_party/implib_so/workspace.bzl   |   13 +
 .../xla/third_party/tsl/opensource_only.files |    4 +
 .../tsl/third_party/implib_so/BUILD           |   23 +
 .../tsl/third_party/implib_so/get_symbols.py  |   38 +
 .../tsl/third_party/implib_so/implib_so.BUILD |   20 +
 .../tsl/third_party/implib_so/make_stub.py    |   68 +
 .../tsl/third_party/implib_so/workspace.bzl   |   13 +
 .../xla/third_party/tsl/tsl/cuda/BUILD.bazel  |  137 +-
 .../third_party/tsl/tsl/cuda/cublas.symbols   |  736 ++
 .../third_party/tsl/tsl/cuda/cublasLt.symbols |  234 +
 .../tsl/tsl/cuda/cublasLt_11_0.inc            |  390 -
 .../third_party/tsl/tsl/cuda/cublasLt_stub.cc |   44 +-
 .../third_party/tsl/tsl/cuda/cublas_10_0.inc  | 4898 ----------
 .../third_party/tsl/tsl/cuda/cublas_10_1.inc  | 5023 ----------
 .../third_party/tsl/tsl/cuda/cublas_10_2.inc  | 5023 ----------
 .../third_party/tsl/tsl/cuda/cublas_11_0.inc  | 5197 -----------
 .../third_party/tsl/tsl/cuda/cublas_stub.cc   |  222 +-
 .../xla/third_party/tsl/tsl/cuda/cuda.symbols |  583 ++
 .../third_party/tsl/tsl/cuda/cuda_10_0.inc    | 2133 -----
 .../third_party/tsl/tsl/cuda/cuda_10_1.inc    | 2166 -----
 .../third_party/tsl/tsl/cuda/cuda_10_2.inc    | 2328 -----
 .../third_party/tsl/tsl/cuda/cuda_11_0.inc    | 2943 ------
 .../third_party/tsl/tsl/cuda/cuda_11_2.inc    | 2816 ------
 .../third_party/tsl/tsl/cuda/cuda_12_0.inc    | 3323 -------
 .../tsl/tsl/cuda/cuda_runtime_10_0.inc        | 1846 ----
 .../tsl/tsl/cuda/cuda_runtime_10_1.inc        | 1854 ----
 .../tsl/tsl/cuda/cuda_runtime_10_2.inc        | 1907 ----
 .../tsl/tsl/cuda/cuda_runtime_11_0.inc        | 2639 ------
 .../tsl/tsl/cuda/cuda_runtime_11_2.inc        | 2259 -----
 .../tsl/tsl/cuda/cuda_runtime_11_8.inc        | 2771 ------
 .../tsl/tsl/cuda/cuda_runtime_12_0.inc        | 2676 ------
 .../tsl/tsl/cuda/cuda_runtime_9_0.inc         | 1421 ---
 .../xla/third_party/tsl/tsl/cuda/cuda_stub.cc |   48 +-
 .../third_party/tsl/tsl/cuda/cudart.symbols   |  399 +
 .../third_party/tsl/tsl/cuda/cudart_stub.cc   |  120 +-
 .../third_party/tsl/tsl/cuda/cudnn.symbols    |  268 +
 .../third_party/tsl/tsl/cuda/cudnn_6_0.inc    | 1825 ----
 .../third_party/tsl/tsl/cuda/cudnn_7_0.inc    | 2027 ----
 .../third_party/tsl/tsl/cuda/cudnn_7_1.inc    | 2361 -----
 .../third_party/tsl/tsl/cuda/cudnn_7_3.inc    | 2585 ------
 .../third_party/tsl/tsl/cuda/cudnn_7_4.inc    | 2726 ------
 .../third_party/tsl/tsl/cuda/cudnn_7_6.inc    | 3257 -------
 .../third_party/tsl/tsl/cuda/cudnn_8_0.inc    | 3213 -------
 .../third_party/tsl/tsl/cuda/cudnn_stub.cc    |   49 +-
 .../third_party/tsl/tsl/cuda/cufft.symbols    |   58 +
 .../third_party/tsl/tsl/cuda/cufft_10_0.inc   |  361 -
 .../third_party/tsl/tsl/cuda/cufft_9_0.inc    |  307 -
 .../third_party/tsl/tsl/cuda/cufft_stub.cc    |   35 +-
 .../third_party/tsl/tsl/cuda/cupti.symbols    |  148 +
 .../third_party/tsl/tsl/cuda/cupti_10_0.inc   |  763 --
 .../third_party/tsl/tsl/cuda/cupti_10_1.inc   |  763 --
 .../third_party/tsl/tsl/cuda/cupti_10_2.inc   |  763 --
 .../third_party/tsl/tsl/cuda/cupti_11_0.inc   |  763 --
 .../third_party/tsl/tsl/cuda/cupti_12_0.inc   |  744 --
 .../third_party/tsl/tsl/cuda/cupti_9_0.inc    |  763 --
 .../third_party/tsl/tsl/cuda/cupti_stub.cc    |   47 +-
 .../third_party/tsl/tsl/cuda/curand_10_0.inc  |  268 -
 .../third_party/tsl/tsl/cuda/curand_10_1.inc  |  268 -
 .../third_party/tsl/tsl/cuda/curand_10_2.inc  |  268 -
 .../third_party/tsl/tsl/cuda/curand_11_0.inc  |  268 -
 .../third_party/tsl/tsl/cuda/curand_9_0.inc   |  268 -
 .../third_party/tsl/tsl/cuda/cusolver.symbols |  926 ++
 .../tsl/tsl/cuda/cusolver_dense_10_0.inc      | 2283 -----
 .../tsl/tsl/cuda/cusolver_dense_10_1.inc      | 3139 -------
 .../tsl/tsl/cuda/cusolver_dense_10_2.inc      | 3667 --------
 .../tsl/tsl/cuda/cusolver_dense_11_0.inc      | 5149 ----------
 .../third_party/tsl/tsl/cuda/cusolver_stub.cc |   40 +-
 .../third_party/tsl/tsl/cuda/cusparse.symbols |  421 +
 .../tsl/tsl/cuda/cusparse_10_0.inc            | 7832 ----------------
 .../tsl/tsl/cuda/cusparse_10_1.inc            | 8262 -----------------
 .../tsl/tsl/cuda/cusparse_10_2.inc            | 8262 -----------------
 .../tsl/tsl/cuda/cusparse_11_0.inc            | 7025 --------------
 .../tsl/tsl/cuda/cusparse_12_0.inc            | 6080 ------------
 .../third_party/tsl/tsl/cuda/cusparse_9_0.inc | 7152 --------------
 .../third_party/tsl/tsl/cuda/cusparse_stub.cc |   47 +-
 .../xla/third_party/tsl/tsl/cuda/stub.bzl     |   26 +
 .../xla/third_party/tsl/workspace2.bzl        |   42 +-
 89 files changed, 4836 insertions(+), 137315 deletions(-)
 create mode 100644 third_party/implib_so/BUILD
 create mode 100644 third_party/implib_so/get_symbols.py
 create mode 100644 third_party/implib_so/implib_so.BUILD
 create mode 100644 third_party/implib_so/make_stub.py
 create mode 100644 third_party/implib_so/workspace.bzl
 create mode 100644 third_party/xla/third_party/implib_so/BUILD
 create mode 100644 third_party/xla/third_party/implib_so/get_symbols.py
 create mode 100644 third_party/xla/third_party/implib_so/implib_so.BUILD
 create mode 100644 third_party/xla/third_party/implib_so/make_stub.py
 create mode 100644 third_party/xla/third_party/implib_so/workspace.bzl
 create mode 100644 third_party/xla/third_party/tsl/third_party/implib_so/BUILD
 create mode 100644 third_party/xla/third_party/tsl/third_party/implib_so/get_symbols.py
 create mode 100644 third_party/xla/third_party/tsl/third_party/implib_so/implib_so.BUILD
 create mode 100644 third_party/xla/third_party/tsl/third_party/implib_so/make_stub.py
 create mode 100644 third_party/xla/third_party/tsl/third_party/implib_so/workspace.bzl
 create mode 100644 third_party/xla/third_party/tsl/tsl/cuda/cublas.symbols
 create mode 100644 third_party/xla/third_party/tsl/tsl/cuda/cublasLt.symbols
 delete mode 100644 third_party/xla/third_party/tsl/tsl/cuda/cublasLt_11_0.inc
 delete mode 100644 third_party/xla/third_party/tsl/tsl/cuda/cublas_10_0.inc
 delete mode 100644 third_party/xla/third_party/tsl/tsl/cuda/cublas_10_1.inc
 delete mode 100644 third_party/xla/third_party/tsl/tsl/cuda/cublas_10_2.inc
 delete mode 100644 third_party/xla/third_party/tsl/tsl/cuda/cublas_11_0.inc
 create mode 100644 third_party/xla/third_party/tsl/tsl/cuda/cuda.symbols
 delete mode 100644 third_party/xla/third_party/tsl/tsl/cuda/cuda_10_0.inc
 delete mode 100644 third_party/xla/third_party/tsl/tsl/cuda/cuda_10_1.inc
 delete mode 100644 third_party/xla/third_party/tsl/tsl/cuda/cuda_10_2.inc
 delete mode 100644 third_party/xla/third_party/tsl/tsl/cuda/cuda_11_0.inc
 delete mode 100644 third_party/xla/third_party/tsl/tsl/cuda/cuda_11_2.inc
 delete mode 100644 third_party/xla/third_party/tsl/tsl/cuda/cuda_12_0.inc
 delete mode 100644 third_party/xla/third_party/tsl/tsl/cuda/cuda_runtime_10_0.inc
 delete mode 100644 third_party/xla/third_party/tsl/tsl/cuda/cuda_runtime_10_1.inc
 delete mode 100644 third_party/xla/third_party/tsl/tsl/cuda/cuda_runtime_10_2.inc
 delete mode 100644 third_party/xla/third_party/tsl/tsl/cuda/cuda_runtime_11_0.inc
 delete mode 100644 third_party/xla/third_party/tsl/tsl/cuda/cuda_runtime_11_2.inc
 delete mode 100644 third_party/xla/third_party/tsl/tsl/cuda/cuda_runtime_11_8.inc
 delete mode 100644 third_party/xla/third_party/tsl/tsl/cuda/cuda_runtime_12_0.inc
 delete mode 100644 third_party/xla/third_party/tsl/tsl/cuda/cuda_runtime_9_0.inc
 create mode 100644 third_party/xla/third_party/tsl/tsl/cuda/cudart.symbols
 create mode 100644 third_party/xla/third_party/tsl/tsl/cuda/cudnn.symbols
 delete mode 100644 third_party/xla/third_party/tsl/tsl/cuda/cudnn_6_0.inc
 delete mode 100644 third_party/xla/third_party/tsl/tsl/cuda/cudnn_7_0.inc
 delete mode 100644 third_party/xla/third_party/tsl/tsl/cuda/cudnn_7_1.inc
 delete mode 100644 third_party/xla/third_party/tsl/tsl/cuda/cudnn_7_3.inc
 delete mode 100644 third_party/xla/third_party/tsl/tsl/cuda/cudnn_7_4.inc
 delete mode 100644 third_party/xla/third_party/tsl/tsl/cuda/cudnn_7_6.inc
 delete mode 100644 third_party/xla/third_party/tsl/tsl/cuda/cudnn_8_0.inc
 create mode 100644 third_party/xla/third_party/tsl/tsl/cuda/cufft.symbols
 delete mode 100644 third_party/xla/third_party/tsl/tsl/cuda/cufft_10_0.inc
 delete mode 100644 third_party/xla/third_party/tsl/tsl/cuda/cufft_9_0.inc
 create mode 100644 third_party/xla/third_party/tsl/tsl/cuda/cupti.symbols
 delete mode 100644 third_party/xla/third_party/tsl/tsl/cuda/cupti_10_0.inc
 delete mode 100644 third_party/xla/third_party/tsl/tsl/cuda/cupti_10_1.inc
 delete mode 100644 third_party/xla/third_party/tsl/tsl/cuda/cupti_10_2.inc
 delete mode 100644 third_party/xla/third_party/tsl/tsl/cuda/cupti_11_0.inc
 delete mode 100644 third_party/xla/third_party/tsl/tsl/cuda/cupti_12_0.inc
 delete mode 100644 third_party/xla/third_party/tsl/tsl/cuda/cupti_9_0.inc
 delete mode 100644 third_party/xla/third_party/tsl/tsl/cuda/curand_10_0.inc
 delete mode 100644 third_party/xla/third_party/tsl/tsl/cuda/curand_10_1.inc
 delete mode 100644 third_party/xla/third_party/tsl/tsl/cuda/curand_10_2.inc
 delete mode 100644 third_party/xla/third_party/tsl/tsl/cuda/curand_11_0.inc
 delete mode 100644 third_party/xla/third_party/tsl/tsl/cuda/curand_9_0.inc
 create mode 100644 third_party/xla/third_party/tsl/tsl/cuda/cusolver.symbols
 delete mode 100644 third_party/xla/third_party/tsl/tsl/cuda/cusolver_dense_10_0.inc
 delete mode 100644 third_party/xla/third_party/tsl/tsl/cuda/cusolver_dense_10_1.inc
 delete mode 100644 third_party/xla/third_party/tsl/tsl/cuda/cusolver_dense_10_2.inc
 delete mode 100644 third_party/xla/third_party/tsl/tsl/cuda/cusolver_dense_11_0.inc
 create mode 100644 third_party/xla/third_party/tsl/tsl/cuda/cusparse.symbols
 delete mode 100644 third_party/xla/third_party/tsl/tsl/cuda/cusparse_10_0.inc
 delete mode 100644 third_party/xla/third_party/tsl/tsl/cuda/cusparse_10_1.inc
 delete mode 100644 third_party/xla/third_party/tsl/tsl/cuda/cusparse_10_2.inc
 delete mode 100644 third_party/xla/third_party/tsl/tsl/cuda/cusparse_11_0.inc
 delete mode 100644 third_party/xla/third_party/tsl/tsl/cuda/cusparse_12_0.inc
 delete mode 100644 third_party/xla/third_party/tsl/tsl/cuda/cusparse_9_0.inc
 create mode 100644 third_party/xla/third_party/tsl/tsl/cuda/stub.bzl

diff --git a/tensorflow/opensource_only.files b/tensorflow/opensource_only.files
index 9a0bc808b1ea60..907e661369c704 100644
--- a/tensorflow/opensource_only.files
+++ b/tensorflow/opensource_only.files
@@ -259,6 +259,9 @@ tf_staging/third_party/gpus/rocm/rocm_config.h.tpl:
 tf_staging/third_party/gpus/rocm_configure.bzl:
 tf_staging/third_party/grpc/BUILD:
 tf_staging/third_party/icu/udata.patch:
+tf_staging/third_party/implib_so/BUILD:
+tf_staging/third_party/implib_so/get_symbols.py:
+tf_staging/third_party/implib_so/make_stub.py:
 tf_staging/third_party/linenoise.BUILD:
 tf_staging/third_party/llvm_openmp/BUILD:
 tf_staging/third_party/llvm_openmp/cmake_vars.bzl:
diff --git a/tensorflow/workspace2.bzl b/tensorflow/workspace2.bzl
index 50aaf181c2beb0..d0a6b6f2ff4aad 100644
--- a/tensorflow/workspace2.bzl
+++ b/tensorflow/workspace2.bzl
@@ -31,6 +31,7 @@ load("//third_party/gemmlowp:workspace.bzl", gemmlowp = "repo")
 load("//third_party/hexagon:workspace.bzl", hexagon_nn = "repo")
 load("//third_party/highwayhash:workspace.bzl", highwayhash = "repo")
 load("//third_party/hwloc:workspace.bzl", hwloc = "repo")
+load("//third_party/implib_so:workspace.bzl", implib_so = "repo")
 load("//third_party/icu:workspace.bzl", icu = "repo")
 load("//third_party/jpeg:workspace.bzl", jpeg = "repo")
 load("//third_party/libprotobuf_mutator:workspace.bzl", libprotobuf_mutator = "repo")
@@ -72,6 +73,7 @@ def _initialize_third_party():
     highwayhash()
     hwloc()
     icu()
+    implib_so()
     jpeg()
     kissfft()
     libprotobuf_mutator()
diff --git a/third_party/implib_so/BUILD b/third_party/implib_so/BUILD
new file mode 100644
index 00000000000000..ca6976cd8d3425
--- /dev/null
+++ b/third_party/implib_so/BUILD
@@ -0,0 +1,21 @@
+package(default_visibility = ["//visibility:public"])
+
+licenses(["notice"])  # MIT
+
+py_binary(
+    name = "get_symbols",
+    srcs = ["get_symbols.py"],
+    deps = [
+        "@bazel_tools//tools/python/runfiles",
+        "@implib_so//:implib_gen_lib",
+    ],
+)
+
+py_binary(
+    name = "make_stub",
+    srcs = ["make_stub.py"],
+    deps = [
+        "@bazel_tools//tools/python/runfiles",
+        "@implib_so//:implib_gen_lib",
+    ],
+)
diff --git a/third_party/implib_so/get_symbols.py b/third_party/implib_so/get_symbols.py
new file mode 100644
index 00000000000000..9625052f7b69b6
--- /dev/null
+++ b/third_party/implib_so/get_symbols.py
@@ -0,0 +1,38 @@
+"""Given a .so file, lists symbols that should be included in a stub.
+
+Example usage (a single command, wrapped here to fit the line limit):
+$ bazel run -c opt @local_tsl//third_party/implib_so:get_symbols
+/usr/local/cuda/lib64/libcudart.so > third_party/tsl/tsl/cuda/cudart.symbols
+"""
+
+import argparse
+import importlib
+
+# We can't import implib-gen directly because it has a dash in its name.
+implib = importlib.import_module('implib-gen')
+
+
+def _is_exported_function(s):
+  return (
+      s['Bind'] != 'LOCAL'
+      and s['Type'] == 'FUNC'
+      and s['Ndx'] != 'UND'
+      and s['Name'] not in ['', '_init', '_fini']
+      and s['Default']
+  )
+
+
+def main():
+  parser = argparse.ArgumentParser(
+      description='Extracts a list of symbols from a shared library'
+  )
+  parser.add_argument('library', help='Path to the .so file.')
+  args = parser.parse_args()
+  syms = implib.collect_syms(args.library)
+  funs = [s['Name'] for s in syms if _is_exported_function(s)]
+  for f in sorted(funs):
+    print(f)
+
+
+if __name__ == '__main__':
+  main()
diff --git a/third_party/implib_so/implib_so.BUILD b/third_party/implib_so/implib_so.BUILD
new file mode 100644
index 00000000000000..bbfb2898eb12dd
--- /dev/null
+++ b/third_party/implib_so/implib_so.BUILD
@@ -0,0 +1,20 @@
+# Description:
+#   Implib.so is a simple equivalent of Windows DLL import libraries for POSIX
+#   shared libraries.
+
+package(default_visibility = ["//visibility:public"])
+
+licenses(["notice"])  # MIT
+
+exports_files([
+    "LICENSE.txt",
+])
+
+py_library(
+    name = "implib_gen_lib",
+    srcs = ["implib-gen.py"],
+    data = glob([
+        "arch/**/*.S.tpl",
+        "arch/**/*.ini",
+    ]),
+)
diff --git a/third_party/implib_so/make_stub.py b/third_party/implib_so/make_stub.py
new file mode 100644
index 00000000000000..f0e1fe564c0c17
--- /dev/null
+++ b/third_party/implib_so/make_stub.py
@@ -0,0 +1,68 @@
+"""Given a list of symbols, generates a stub."""
+
+import argparse
+import configparser
+import os
+import string
+
+from bazel_tools.tools.python.runfiles import runfiles
+
+r = runfiles.Create()
+
+
+def main():
+  """Writes <lib>.tramp.S and <lib>.inc into --outdir for a symbols file."""
+  parser = argparse.ArgumentParser(
+      description='Generates stubs for CUDA libraries.'
+  )
+  parser.add_argument('symbols', help='File containing a list of symbols.')
+  parser.add_argument(
+      '--outdir', '-o', help='Path to create wrapper at', default='.'
+  )
+  parser.add_argument(
+      '--target',
+      help='Target platform name, e.g. x86_64, aarch64.',
+      required=True,
+  )
+  args = parser.parse_args()
+
+  arch_dir = f'implib_so/arch/{args.target}'
+  config_path = r.Rlocation(f'{arch_dir}/config.ini')
+  table_path = r.Rlocation(f'{arch_dir}/table.S.tpl')
+  trampoline_path = r.Rlocation(f'{arch_dir}/trampoline.S.tpl')
+
+  cfg = configparser.ConfigParser(inline_comment_prefixes=';')
+  cfg.read(config_path)
+  ptr_size = int(cfg['Arch']['PointerSize'])
+
+  with open(args.symbols, 'r') as f:
+    # Skip blank lines so no empty symbol name reaches the templates below.
+    funs = [s for s in map(str.strip, f) if s]
+
+  # Generate assembly code, containing a table for the resolved symbols and the
+  # trampolines.
+  lib_name, _ = os.path.splitext(os.path.basename(args.symbols))
+
+  with open(os.path.join(args.outdir, f'{lib_name}.tramp.S'), 'w') as f:
+    with open(table_path, 'r') as t:
+      table_text = string.Template(t.read()).substitute(
+          lib_suffix=lib_name, table_size=ptr_size * (len(funs) + 1)
+      )
+    f.write(table_text)
+
+    with open(trampoline_path, 'r') as t:
+      tramp_tpl = string.Template(t.read())
+
+    for i, name in enumerate(funs):
+      tramp_text = tramp_tpl.substitute(
+          lib_suffix=lib_name, sym=name, offset=i * ptr_size, number=i
+      )
+      f.write(tramp_text)
+
+  # Generates a list of symbols, formatted as a list of C++ strings.
+  with open(os.path.join(args.outdir, f'{lib_name}.inc'), 'w') as f:
+    f.write(''.join(f'  "{name}",\n' for name in funs))
+
+
+if __name__ == '__main__':
+  main()
diff --git a/third_party/implib_so/workspace.bzl b/third_party/implib_so/workspace.bzl
new file mode 100644
index 00000000000000..01dad3b169f402
--- /dev/null
+++ b/third_party/implib_so/workspace.bzl
@@ -0,0 +1,13 @@
+"""Implib.so is a simple equivalent of Windows DLL import libraries for POSIX
+shared libraries."""
+
+load("//third_party:repo.bzl", "tf_http_archive", "tf_mirror_urls")
+
+def repo():
+    tf_http_archive(
+        name = "implib_so",
+        strip_prefix = "Implib.so-5fb84c2a750434b9df1da67d67b749eb929598f1",
+        sha256 = "10de0a616df24849f2a883747784c115f209708960e44556f5ce384de6f103e8",
+        urls = tf_mirror_urls("https://github.com/yugr/Implib.so/archive/5fb84c2a750434b9df1da67d67b749eb929598f1.tar.gz"),
+        build_file = "//third_party/implib_so:implib_so.BUILD",
+    )
diff --git a/third_party/xla/opensource_only.files b/third_party/xla/opensource_only.files
index 690bac46ceb990..0425cd3bc0d34a 100644
--- a/third_party/xla/opensource_only.files
+++ b/third_party/xla/opensource_only.files
@@ -56,6 +56,9 @@ third_party/gpus/rocm/build_defs.bzl.tpl:
 third_party/gpus/rocm/rocm_config.h.tpl:
 third_party/gpus/rocm_configure.bzl:
 third_party/grpc/BUILD:
+third_party/implib_so/BUILD:
+third_party/implib_so/get_symbols.py:
+third_party/implib_so/make_stub.py:
 third_party/llvm_openmp/BUILD:
 third_party/llvm_openmp/cmake_vars.bzl:
 third_party/llvm_openmp/expand_cmake_vars:.py
diff --git a/third_party/xla/third_party/implib_so/BUILD b/third_party/xla/third_party/implib_so/BUILD
new file mode 100644
index 00000000000000..8401d6152b88ab
--- /dev/null
+++ b/third_party/xla/third_party/implib_so/BUILD
@@ -0,0 +1,23 @@
+package(default_visibility = ["//visibility:public"])
+
+licenses(["notice"])  # MIT
+
+py_binary(
+    name = "get_symbols",
+    srcs = ["get_symbols.py"],
+    visibility = ["//visibility:public"],
+    deps = [
+        "@bazel_tools//tools/python/runfiles",
+        "@implib_so//:implib_gen_lib",
+    ],
+)
+
+py_binary(
+    name = "make_stub",
+    srcs = ["make_stub.py"],
+    visibility = ["//visibility:public"],
+    deps = [
+        "@bazel_tools//tools/python/runfiles",
+        "@implib_so//:implib_gen_lib",
+    ],
+)
diff --git a/third_party/xla/third_party/implib_so/get_symbols.py b/third_party/xla/third_party/implib_so/get_symbols.py
new file mode 100644
index 00000000000000..9625052f7b69b6
--- /dev/null
+++ b/third_party/xla/third_party/implib_so/get_symbols.py
@@ -0,0 +1,38 @@
+"""Given a .so file, lists symbols that should be included in a stub.
+
+Example usage (a single command, wrapped here to fit the line limit):
+$ bazel run -c opt @local_tsl//third_party/implib_so:get_symbols
+/usr/local/cuda/lib64/libcudart.so > third_party/tsl/tsl/cuda/cudart.symbols
+"""
+
+import argparse
+import importlib
+
+# We can't import implib-gen directly because it has a dash in its name.
+implib = importlib.import_module('implib-gen')
+
+
+def _is_exported_function(s):
+  return (
+      s['Bind'] != 'LOCAL'
+      and s['Type'] == 'FUNC'
+      and s['Ndx'] != 'UND'
+      and s['Name'] not in ['', '_init', '_fini']
+      and s['Default']
+  )
+
+
+def main():
+  parser = argparse.ArgumentParser(
+      description='Extracts a list of symbols from a shared library'
+  )
+  parser.add_argument('library', help='Path to the .so file.')
+  args = parser.parse_args()
+  syms = implib.collect_syms(args.library)
+  funs = [s['Name'] for s in syms if _is_exported_function(s)]
+  for f in sorted(funs):
+    print(f)
+
+
+if __name__ == '__main__':
+  main()
diff --git a/third_party/xla/third_party/implib_so/implib_so.BUILD b/third_party/xla/third_party/implib_so/implib_so.BUILD
new file mode 100644
index 00000000000000..bbfb2898eb12dd
--- /dev/null
+++ b/third_party/xla/third_party/implib_so/implib_so.BUILD
@@ -0,0 +1,20 @@
+# Description:
+#   Implib.so is a simple equivalent of Windows DLL import libraries for POSIX
+#   shared libraries.
+
+package(default_visibility = ["//visibility:public"])
+
+licenses(["notice"])  # MIT
+
+exports_files([
+    "LICENSE.txt",
+])
+
+py_library(
+    name = "implib_gen_lib",
+    srcs = ["implib-gen.py"],
+    data = glob([
+        "arch/**/*.S.tpl",
+        "arch/**/*.ini",
+    ]),
+)
diff --git a/third_party/xla/third_party/implib_so/make_stub.py b/third_party/xla/third_party/implib_so/make_stub.py
new file mode 100644
index 00000000000000..f0e1fe564c0c17
--- /dev/null
+++ b/third_party/xla/third_party/implib_so/make_stub.py
@@ -0,0 +1,68 @@
+"""Given a list of symbols, generates a stub."""
+
+import argparse
+import configparser
+import os
+import string
+
+from bazel_tools.tools.python.runfiles import runfiles
+
+r = runfiles.Create()
+
+
+def main():
+  """Writes <lib>.tramp.S and <lib>.inc into --outdir for a symbols file."""
+  parser = argparse.ArgumentParser(
+      description='Generates stubs for CUDA libraries.'
+  )
+  parser.add_argument('symbols', help='File containing a list of symbols.')
+  parser.add_argument(
+      '--outdir', '-o', help='Path to create wrapper at', default='.'
+  )
+  parser.add_argument(
+      '--target',
+      help='Target platform name, e.g. x86_64, aarch64.',
+      required=True,
+  )
+  args = parser.parse_args()
+
+  arch_dir = f'implib_so/arch/{args.target}'
+  config_path = r.Rlocation(f'{arch_dir}/config.ini')
+  table_path = r.Rlocation(f'{arch_dir}/table.S.tpl')
+  trampoline_path = r.Rlocation(f'{arch_dir}/trampoline.S.tpl')
+
+  cfg = configparser.ConfigParser(inline_comment_prefixes=';')
+  cfg.read(config_path)
+  ptr_size = int(cfg['Arch']['PointerSize'])
+
+  with open(args.symbols, 'r') as f:
+    # Skip blank lines so no empty symbol name reaches the templates below.
+    funs = [s for s in map(str.strip, f) if s]
+
+  # Generate assembly code, containing a table for the resolved symbols and the
+  # trampolines.
+  lib_name, _ = os.path.splitext(os.path.basename(args.symbols))
+
+  with open(os.path.join(args.outdir, f'{lib_name}.tramp.S'), 'w') as f:
+    with open(table_path, 'r') as t:
+      table_text = string.Template(t.read()).substitute(
+          lib_suffix=lib_name, table_size=ptr_size * (len(funs) + 1)
+      )
+    f.write(table_text)
+
+    with open(trampoline_path, 'r') as t:
+      tramp_tpl = string.Template(t.read())
+
+    for i, name in enumerate(funs):
+      tramp_text = tramp_tpl.substitute(
+          lib_suffix=lib_name, sym=name, offset=i * ptr_size, number=i
+      )
+      f.write(tramp_text)
+
+  # Generates a list of symbols, formatted as a list of C++ strings.
+  with open(os.path.join(args.outdir, f'{lib_name}.inc'), 'w') as f:
+    f.write(''.join(f'  "{name}",\n' for name in funs))
+
+
+if __name__ == '__main__':
+  main()
diff --git a/third_party/xla/third_party/implib_so/workspace.bzl b/third_party/xla/third_party/implib_so/workspace.bzl
new file mode 100644
index 00000000000000..01dad3b169f402
--- /dev/null
+++ b/third_party/xla/third_party/implib_so/workspace.bzl
@@ -0,0 +1,13 @@
+"""Implib.so is a simple equivalent of Windows DLL import libraries for POSIX
+shared libraries."""
+
+load("//third_party:repo.bzl", "tf_http_archive", "tf_mirror_urls")
+
+def repo():
+    tf_http_archive(
+        name = "implib_so",
+        strip_prefix = "Implib.so-5fb84c2a750434b9df1da67d67b749eb929598f1",
+        sha256 = "10de0a616df24849f2a883747784c115f209708960e44556f5ce384de6f103e8",
+        urls = tf_mirror_urls("https://github.com/yugr/Implib.so/archive/5fb84c2a750434b9df1da67d67b749eb929598f1.tar.gz"),
+        build_file = "//third_party/implib_so:implib_so.BUILD",
+    )
diff --git a/third_party/xla/third_party/tsl/opensource_only.files b/third_party/xla/third_party/tsl/opensource_only.files
index 4be161df5e2059..4a5a1e2a7c9f79 100644
--- a/third_party/xla/third_party/tsl/opensource_only.files
+++ b/third_party/xla/third_party/tsl/opensource_only.files
@@ -53,6 +53,9 @@ third_party/gpus/rocm/build_defs.bzl.tpl:
 third_party/gpus/rocm/rocm_config.h.tpl:
 third_party/gpus/rocm_configure.bzl:
 third_party/grpc/BUILD:
+third_party/implib_so/BUILD:
+third_party/implib_so/get_symbols.py:
+third_party/implib_so/make_stub.py:
 third_party/llvm_openmp/BUILD:
 third_party/llvm_openmp/cmake_vars.bzl:
 third_party/llvm_openmp/expand_cmake_vars:.py
@@ -145,6 +148,7 @@ tools/toolchains/win/bazel_211/BUILD:
 tools/toolchains/win/tf_win_05022023/BUILD:
 tools/toolchains/win_1803/py38/BUILD:
 tools/toolchains/win_1803/py39/BUILD:
+tsl/cuda/stub.bzl:
 tsl/mkl/BUILD:
 tsl/mkl/LICENSE:
 tsl/mkl/MKL_LICENSE:
diff --git a/third_party/xla/third_party/tsl/third_party/implib_so/BUILD b/third_party/xla/third_party/tsl/third_party/implib_so/BUILD
new file mode 100644
index 00000000000000..8401d6152b88ab
--- /dev/null
+++ b/third_party/xla/third_party/tsl/third_party/implib_so/BUILD
@@ -0,0 +1,23 @@
+package(default_visibility = ["//visibility:public"])
+
+licenses(["notice"])  # MIT
+
+py_binary(
+    name = "get_symbols",
+    srcs = ["get_symbols.py"],
+    visibility = ["//visibility:public"],
+    deps = [
+        "@bazel_tools//tools/python/runfiles",
+        "@implib_so//:implib_gen_lib",
+    ],
+)
+
+py_binary(
+    name = "make_stub",
+    srcs = ["make_stub.py"],
+    visibility = ["//visibility:public"],
+    deps = [
+        "@bazel_tools//tools/python/runfiles",
+        "@implib_so//:implib_gen_lib",
+    ],
+)
diff --git a/third_party/xla/third_party/tsl/third_party/implib_so/get_symbols.py b/third_party/xla/third_party/tsl/third_party/implib_so/get_symbols.py
new file mode 100644
index 00000000000000..9625052f7b69b6
--- /dev/null
+++ b/third_party/xla/third_party/tsl/third_party/implib_so/get_symbols.py
@@ -0,0 +1,38 @@
+"""Given a .so file, lists symbols that should be included in a stub.
+
+Example usage (a single command, wrapped here to fit the line limit):
+$ bazel run -c opt @local_tsl//third_party/implib_so:get_symbols
+/usr/local/cuda/lib64/libcudart.so > third_party/tsl/tsl/cuda/cudart.symbols
+"""
+
+import argparse
+import importlib
+
+# We can't import implib-gen directly because it has a dash in its name.
+implib = importlib.import_module('implib-gen')
+
+
+def _is_exported_function(s):
+  return (
+      s['Bind'] != 'LOCAL'
+      and s['Type'] == 'FUNC'
+      and s['Ndx'] != 'UND'
+      and s['Name'] not in ['', '_init', '_fini']
+      and s['Default']
+  )
+
+
+def main():
+  parser = argparse.ArgumentParser(
+      description='Extracts a list of symbols from a shared library'
+  )
+  parser.add_argument('library', help='Path to the .so file.')
+  args = parser.parse_args()
+  syms = implib.collect_syms(args.library)
+  funs = [s['Name'] for s in syms if _is_exported_function(s)]
+  for f in sorted(funs):
+    print(f)
+
+
+if __name__ == '__main__':
+  main()
diff --git a/third_party/xla/third_party/tsl/third_party/implib_so/implib_so.BUILD b/third_party/xla/third_party/tsl/third_party/implib_so/implib_so.BUILD
new file mode 100644
index 00000000000000..bbfb2898eb12dd
--- /dev/null
+++ b/third_party/xla/third_party/tsl/third_party/implib_so/implib_so.BUILD
@@ -0,0 +1,20 @@
+# Description:
+#   Implib.so is a simple equivalent of Windows DLL import libraries for POSIX
+#   shared libraries.
+
+package(default_visibility = ["//visibility:public"])
+
+licenses(["notice"])  # MIT
+
+exports_files([
+    "LICENSE.txt",
+])
+
+py_library(
+    name = "implib_gen_lib",
+    srcs = ["implib-gen.py"],
+    data = glob([
+        "arch/**/*.S.tpl",
+        "arch/**/*.ini",
+    ]),
+)
diff --git a/third_party/xla/third_party/tsl/third_party/implib_so/make_stub.py b/third_party/xla/third_party/tsl/third_party/implib_so/make_stub.py
new file mode 100644
index 00000000000000..f0e1fe564c0c17
--- /dev/null
+++ b/third_party/xla/third_party/tsl/third_party/implib_so/make_stub.py
@@ -0,0 +1,68 @@
+"""Given a list of symbols, generates a stub."""
+
+import argparse
+import configparser
+import os
+import string
+
+from bazel_tools.tools.python.runfiles import runfiles
+
+r = runfiles.Create()
+
+
+def main():
+  """Writes <lib>.tramp.S and <lib>.inc into --outdir for a symbols file."""
+  parser = argparse.ArgumentParser(
+      description='Generates stubs for CUDA libraries.'
+  )
+  parser.add_argument('symbols', help='File containing a list of symbols.')
+  parser.add_argument(
+      '--outdir', '-o', help='Path to create wrapper at', default='.'
+  )
+  parser.add_argument(
+      '--target',
+      help='Target platform name, e.g. x86_64, aarch64.',
+      required=True,
+  )
+  args = parser.parse_args()
+
+  arch_dir = f'implib_so/arch/{args.target}'
+  config_path = r.Rlocation(f'{arch_dir}/config.ini')
+  table_path = r.Rlocation(f'{arch_dir}/table.S.tpl')
+  trampoline_path = r.Rlocation(f'{arch_dir}/trampoline.S.tpl')
+
+  cfg = configparser.ConfigParser(inline_comment_prefixes=';')
+  cfg.read(config_path)
+  ptr_size = int(cfg['Arch']['PointerSize'])
+
+  with open(args.symbols, 'r') as f:
+    # Skip blank lines so no empty symbol name reaches the templates below.
+    funs = [s for s in map(str.strip, f) if s]
+
+  # Generate assembly code, containing a table for the resolved symbols and the
+  # trampolines.
+  lib_name, _ = os.path.splitext(os.path.basename(args.symbols))
+
+  with open(os.path.join(args.outdir, f'{lib_name}.tramp.S'), 'w') as f:
+    with open(table_path, 'r') as t:
+      table_text = string.Template(t.read()).substitute(
+          lib_suffix=lib_name, table_size=ptr_size * (len(funs) + 1)
+      )
+    f.write(table_text)
+
+    with open(trampoline_path, 'r') as t:
+      tramp_tpl = string.Template(t.read())
+
+    for i, name in enumerate(funs):
+      tramp_text = tramp_tpl.substitute(
+          lib_suffix=lib_name, sym=name, offset=i * ptr_size, number=i
+      )
+      f.write(tramp_text)
+
+  # Generates a list of symbols, formatted as a list of C++ strings.
+  with open(os.path.join(args.outdir, f'{lib_name}.inc'), 'w') as f:
+    f.write(''.join(f'  "{name}",\n' for name in funs))
+
+
+if __name__ == '__main__':
+  main()
diff --git a/third_party/xla/third_party/tsl/third_party/implib_so/workspace.bzl b/third_party/xla/third_party/tsl/third_party/implib_so/workspace.bzl
new file mode 100644
index 00000000000000..01dad3b169f402
--- /dev/null
+++ b/third_party/xla/third_party/tsl/third_party/implib_so/workspace.bzl
@@ -0,0 +1,13 @@
+"""Implib.so is a simple equivalent of Windows DLL import libraries for POSIX
+shared libraries."""
+
+load("//third_party:repo.bzl", "tf_http_archive", "tf_mirror_urls")
+
+def repo():
+    tf_http_archive(
+        name = "implib_so",
+        strip_prefix = "Implib.so-5fb84c2a750434b9df1da67d67b749eb929598f1",
+        sha256 = "10de0a616df24849f2a883747784c115f209708960e44556f5ce384de6f103e8",
+        urls = tf_mirror_urls("https://github.com/yugr/Implib.so/archive/5fb84c2a750434b9df1da67d67b749eb929598f1.tar.gz"),
+        build_file = "//third_party/implib_so:implib_so.BUILD",
+    )
diff --git a/third_party/xla/third_party/tsl/tsl/cuda/BUILD.bazel b/third_party/xla/third_party/tsl/tsl/cuda/BUILD.bazel
index ecb7bdb2cfffa7..71bed13a8bae96 100644
--- a/third_party/xla/third_party/tsl/tsl/cuda/BUILD.bazel
+++ b/third_party/xla/third_party/tsl/tsl/cuda/BUILD.bazel
@@ -1,6 +1,7 @@
 # Description:
 #   Stubs for dynamically loading CUDA.
 
+load("//tsl/cuda:stub.bzl", "cuda_stub")
 load(
     "//tsl/platform:build_config.bzl",
     "tsl_cc_test",
@@ -20,25 +21,48 @@ package(
     licenses = ["notice"],
 )
 
+cuda_stub(
+    name = "cublas",
+    srcs = ["cublas.symbols"],
+)
+
 cc_library(
     name = "cublas",
-    srcs = if_cuda_is_configured(["cublas_stub.cc"]),
+    srcs = if_cuda_is_configured([
+        "cublas_stub.cc",
+        "cublas.tramp.S",
+    ]),
     linkopts = if_cuda_is_configured(cuda_rpath_flags(
         "nvidia/cublas/lib",
     )),
-    textual_hdrs = glob(["cublas_*.inc"]),
+    local_defines = [
+        "IMPLIB_EXPORT_SHIMS=1",
+    ],
+    textual_hdrs = ["cublas.inc"],
     visibility = ["//visibility:public"],
     deps = if_cuda_is_configured([
+        "@com_google_absl//absl/container:flat_hash_set",
         "@local_config_cuda//cuda:cuda_headers",
         "//tsl/platform:dso_loader",
         "//tsl/platform:env",
     ]),
 )
 
+cuda_stub(
+    name = "cublasLt",
+    srcs = ["cublasLt.symbols"],
+)
+
 cc_library(
     name = "cublas_lt",
-    srcs = if_cuda_is_configured(["cublasLt_stub.cc"]),
-    textual_hdrs = glob(["cublasLt_*.inc"]),
+    srcs = if_cuda_is_configured([
+        "cublasLt_stub.cc",
+        "cublasLt.tramp.S",
+    ]),
+    local_defines = [
+        "IMPLIB_EXPORT_SHIMS=1",
+    ],
+    textual_hdrs = ["cublasLt.inc"],
     visibility = ["//visibility:public"],
     deps = if_cuda_is_configured([
         "@local_config_cuda//cuda:cuda_headers",
@@ -47,10 +71,21 @@ cc_library(
     ]),
 )
 
+cuda_stub(
+    name = "cuda",
+    srcs = ["cuda.symbols"],
+)
+
 cc_library(
     name = "cuda",
-    srcs = if_cuda_is_configured(["cuda_stub.cc"]),
-    textual_hdrs = glob(["cuda_*.inc"]),
+    srcs = if_cuda_is_configured([
+        "cuda_stub.cc",
+        "cuda.tramp.S",
+    ]),
+    local_defines = [
+        "IMPLIB_EXPORT_SHIMS=1",
+    ],
+    textual_hdrs = ["cuda.inc"],
     visibility = ["//visibility:public"],
     deps = if_cuda_is_configured([
         "@local_config_cuda//cuda:cuda_headers",
@@ -59,18 +94,29 @@ cc_library(
     ]),
 )
 
+cuda_stub(
+    name = "cudart",
+    srcs = ["cudart.symbols"],
+)
+
 cc_library(
     name = "cudart",
     srcs = select({
         # include dynamic loading implementation only when if_cuda_is_configured and build dynamically
-        "//tsl:is_cuda_enabled_and_oss": ["cudart_stub.cc"],
+        "//tsl:is_cuda_enabled_and_oss": [
+            "cudart.tramp.S",
+            "cudart_stub.cc",
+        ],
         "//conditions:default": [],
     }),
     linkopts = select({
         "//tsl:is_cuda_enabled_and_oss": cuda_rpath_flags("nvidia/cuda_runtime/lib"),
         "//conditions:default": [],
     }),
-    textual_hdrs = glob(["cuda_runtime_*.inc"]),
+    local_defines = [
+        "IMPLIB_EXPORT_SHIMS=1",
+    ],
+    textual_hdrs = ["cudart.inc"],
     visibility = ["//visibility:public"],
     deps = select({
         "//tsl:is_cuda_enabled_and_oss": [
@@ -83,11 +129,22 @@ cc_library(
     }),
 )
 
+cuda_stub(
+    name = "cudnn",
+    srcs = ["cudnn.symbols"],
+)
+
 cc_library(
     name = "cudnn",
-    srcs = if_cuda_is_configured(["cudnn_stub.cc"]),
+    srcs = if_cuda_is_configured([
+        "cudnn_stub.cc",
+        "cudnn.tramp.S",
+    ]),
     linkopts = if_cuda_is_configured(cuda_rpath_flags("nvidia/cudnn/lib")),
-    textual_hdrs = glob(["cudnn_*.inc"]),
+    local_defines = [
+        "IMPLIB_EXPORT_SHIMS=1",
+    ],
+    textual_hdrs = ["cudnn.inc"],
     visibility = ["//visibility:public"],
     deps = if_cuda_is_configured([
         ":cudnn_version",
@@ -130,11 +187,22 @@ tsl_cc_test(
     ],
 )
 
+cuda_stub(
+    name = "cufft",
+    srcs = ["cufft.symbols"],
+)
+
 cc_library(
     name = "cufft",
-    srcs = if_cuda_is_configured(["cufft_stub.cc"]),
+    srcs = if_cuda_is_configured([
+        "cufft_stub.cc",
+        "cufft.tramp.S",
+    ]),
     linkopts = if_cuda_is_configured(cuda_rpath_flags("nvidia/cufft/lib")),
-    textual_hdrs = glob(["cufft_*.inc"]),
+    local_defines = [
+        "IMPLIB_EXPORT_SHIMS=1",
+    ],
+    textual_hdrs = ["cufft.inc"],
     visibility = ["//visibility:public"],
     deps = if_cuda_is_configured([
         "@local_config_cuda//cuda:cuda_headers",
@@ -143,12 +211,23 @@ cc_library(
     ]),
 )
 
+cuda_stub(
+    name = "cupti",
+    srcs = ["cupti.symbols"],
+)
+
 cc_library(
     name = "cupti",
-    srcs = if_cuda_is_configured(["cupti_stub.cc"]),
+    srcs = if_cuda_is_configured([
+        "cupti_stub.cc",
+        "cupti.tramp.S",
+    ]),
     data = if_cuda_is_configured(["@local_config_cuda//cuda:cupti_dsos"]),
     linkopts = if_cuda_is_configured(cuda_rpath_flags("nvidia/cuda_cupti/lib")),
-    textual_hdrs = glob(["cupti_*.inc"]),
+    local_defines = [
+        "IMPLIB_EXPORT_SHIMS=1",
+    ],
+    textual_hdrs = ["cupti.inc"],
     visibility = ["//visibility:public"],
     deps = if_cuda_is_configured([
         "@local_config_cuda//cuda:cuda_headers",
@@ -158,11 +237,22 @@ cc_library(
     ]),
 )
 
+cuda_stub(
+    name = "cusolver",
+    srcs = ["cusolver.symbols"],
+)
+
 cc_library(
     name = "cusolver",
-    srcs = if_cuda_is_configured(["cusolver_stub.cc"]),
+    srcs = if_cuda_is_configured([
+        "cusolver_stub.cc",
+        "cusolver.tramp.S",
+    ]),
     linkopts = if_cuda_is_configured(cuda_rpath_flags("nvidia/cusolver/lib")),
-    textual_hdrs = glob(["cusolver_dense_*.inc"]),
+    local_defines = [
+        "IMPLIB_EXPORT_SHIMS=1",
+    ],
+    textual_hdrs = ["cusolver.inc"],
     visibility = ["//visibility:public"],
     deps = if_cuda_is_configured([
         "@local_config_cuda//cuda:cuda_headers",
@@ -171,11 +261,22 @@ cc_library(
     ]),
 )
 
+cuda_stub(
+    name = "cusparse",
+    srcs = ["cusparse.symbols"],
+)
+
 cc_library(
     name = "cusparse",
-    srcs = if_cuda_is_configured(["cusparse_stub.cc"]),
+    srcs = if_cuda_is_configured([
+        "cusparse_stub.cc",
+        "cusparse.tramp.S",
+    ]),
     linkopts = if_cuda_is_configured(cuda_rpath_flags("nvidia/cusparse/lib")),
-    textual_hdrs = glob(["cusparse_*.inc"]),
+    local_defines = [
+        "IMPLIB_EXPORT_SHIMS=1",
+    ],
+    textual_hdrs = ["cusparse.inc"],
     visibility = ["//visibility:public"],
     deps = if_cuda_is_configured([
         "@local_config_cuda//cuda:cuda_headers",
diff --git a/third_party/xla/third_party/tsl/tsl/cuda/cublas.symbols b/third_party/xla/third_party/tsl/tsl/cuda/cublas.symbols
new file mode 100644
index 00000000000000..bd93675eb5f299
--- /dev/null
+++ b/third_party/xla/third_party/tsl/tsl/cuda/cublas.symbols
@@ -0,0 +1,736 @@
+cublasAlloc
+cublasAsumEx
+cublasAsumEx_64
+cublasAxpyEx
+cublasAxpyEx_64
+cublasCaxpy
+cublasCaxpy_v2
+cublasCaxpy_v2_64
+cublasCbdmm
+cublasCcopy
+cublasCcopy_v2
+cublasCcopy_v2_64
+cublasCdgmm
+cublasCdgmm_64
+cublasCdotc
+cublasCdotc_v2
+cublasCdotc_v2_64
+cublasCdotu
+cublasCdotu_v2
+cublasCdotu_v2_64
+cublasCgbmv
+cublasCgbmv_v2
+cublasCgbmv_v2_64
+cublasCgeam
+cublasCgeam_64
+cublasCgelsBatched
+cublasCgemm
+cublasCgemm3m
+cublasCgemm3mBatched
+cublasCgemm3mBatched_64
+cublasCgemm3mEx
+cublasCgemm3mEx_64
+cublasCgemm3mStridedBatched
+cublasCgemm3mStridedBatched_64
+cublasCgemm3m_64
+cublasCgemmBatched
+cublasCgemmBatched_64
+cublasCgemmEx
+cublasCgemmEx_64
+cublasCgemmStridedBatched
+cublasCgemmStridedBatched_64
+cublasCgemm_v2
+cublasCgemm_v2_64
+cublasCgemv
+cublasCgemvBatched
+cublasCgemvBatched_64
+cublasCgemvStridedBatched
+cublasCgemvStridedBatched_64
+cublasCgemv_v2
+cublasCgemv_v2_64
+cublasCgeqrfBatched
+cublasCgerc
+cublasCgerc_v2
+cublasCgerc_v2_64
+cublasCgeru
+cublasCgeru_v2
+cublasCgeru_v2_64
+cublasCgetrfBatched
+cublasCgetriBatched
+cublasCgetrsBatched
+cublasChbmv
+cublasChbmv_v2
+cublasChbmv_v2_64
+cublasChemm
+cublasChemm_v2
+cublasChemm_v2_64
+cublasChemv
+cublasChemv_v2
+cublasChemv_v2_64
+cublasCher
+cublasCher2
+cublasCher2_v2
+cublasCher2_v2_64
+cublasCher2k
+cublasCher2k_v2
+cublasCher2k_v2_64
+cublasCher_v2
+cublasCher_v2_64
+cublasCherk
+cublasCherk3mEx
+cublasCherk3mEx_64
+cublasCherkEx
+cublasCherkEx_64
+cublasCherk_v2
+cublasCherk_v2_64
+cublasCherkx
+cublasCherkx_64
+cublasChpmv
+cublasChpmv_v2
+cublasChpmv_v2_64
+cublasChpr
+cublasChpr2
+cublasChpr2_v2
+cublasChpr2_v2_64
+cublasChpr_v2
+cublasChpr_v2_64
+cublasCmatinvBatched
+cublasCopyEx
+cublasCopyEx_64
+cublasCreate_v2
+cublasCrot
+cublasCrot_v2
+cublasCrot_v2_64
+cublasCrotg
+cublasCrotg_v2
+cublasCscal
+cublasCscal_v2
+cublasCscal_v2_64
+cublasCsrot
+cublasCsrot_v2
+cublasCsrot_v2_64
+cublasCsscal
+cublasCsscal_v2
+cublasCsscal_v2_64
+cublasCswap
+cublasCswap_v2
+cublasCswap_v2_64
+cublasCsymm
+cublasCsymm_v2
+cublasCsymm_v2_64
+cublasCsymv_v2
+cublasCsymv_v2_64
+cublasCsyr2_v2
+cublasCsyr2_v2_64
+cublasCsyr2k
+cublasCsyr2k_v2
+cublasCsyr2k_v2_64
+cublasCsyr_v2
+cublasCsyr_v2_64
+cublasCsyrk
+cublasCsyrk3mEx
+cublasCsyrk3mEx_64
+cublasCsyrkEx
+cublasCsyrkEx_64
+cublasCsyrk_v2
+cublasCsyrk_v2_64
+cublasCsyrkx
+cublasCsyrkx_64
+cublasCtbmv
+cublasCtbmv_v2
+cublasCtbmv_v2_64
+cublasCtbsv
+cublasCtbsv_v2
+cublasCtbsv_v2_64
+cublasCtpmv
+cublasCtpmv_v2
+cublasCtpmv_v2_64
+cublasCtpsv
+cublasCtpsv_v2
+cublasCtpsv_v2_64
+cublasCtpttr
+cublasCtrmm
+cublasCtrmm_v2
+cublasCtrmm_v2_64
+cublasCtrmv
+cublasCtrmv_v2
+cublasCtrmv_v2_64
+cublasCtrsm
+cublasCtrsmBatched
+cublasCtrsmBatched_64
+cublasCtrsm_v2
+cublasCtrsm_v2_64
+cublasCtrsv
+cublasCtrsv_v2
+cublasCtrsv_v2_64
+cublasCtrttp
+cublasDasum
+cublasDasum_v2
+cublasDasum_v2_64
+cublasDaxpy
+cublasDaxpy_v2
+cublasDaxpy_v2_64
+cublasDbdmm
+cublasDcopy
+cublasDcopy_v2
+cublasDcopy_v2_64
+cublasDdgmm
+cublasDdgmm_64
+cublasDdot
+cublasDdot_v2
+cublasDdot_v2_64
+cublasDestroy_v2
+cublasDgbmv
+cublasDgbmv_v2
+cublasDgbmv_v2_64
+cublasDgeam
+cublasDgeam_64
+cublasDgelsBatched
+cublasDgemm
+cublasDgemmBatched
+cublasDgemmBatched_64
+cublasDgemmStridedBatched
+cublasDgemmStridedBatched_64
+cublasDgemm_v2
+cublasDgemm_v2_64
+cublasDgemv
+cublasDgemvBatched
+cublasDgemvBatched_64
+cublasDgemvStridedBatched
+cublasDgemvStridedBatched_64
+cublasDgemv_v2
+cublasDgemv_v2_64
+cublasDgeqrfBatched
+cublasDger
+cublasDger_v2
+cublasDger_v2_64
+cublasDgetrfBatched
+cublasDgetriBatched
+cublasDgetrsBatched
+cublasDmatinvBatched
+cublasDnrm2
+cublasDnrm2_v2
+cublasDnrm2_v2_64
+cublasDotEx
+cublasDotEx_64
+cublasDotcEx
+cublasDotcEx_64
+cublasDrot
+cublasDrot_v2
+cublasDrot_v2_64
+cublasDrotg
+cublasDrotg_v2
+cublasDrotm
+cublasDrotm_v2
+cublasDrotm_v2_64
+cublasDrotmg
+cublasDrotmg_v2
+cublasDsbmv
+cublasDsbmv_v2
+cublasDsbmv_v2_64
+cublasDscal
+cublasDscal_v2
+cublasDscal_v2_64
+cublasDspmv
+cublasDspmv_v2
+cublasDspmv_v2_64
+cublasDspr
+cublasDspr2
+cublasDspr2_v2
+cublasDspr2_v2_64
+cublasDspr_v2
+cublasDspr_v2_64
+cublasDswap
+cublasDswap_v2
+cublasDswap_v2_64
+cublasDsymm
+cublasDsymm_v2
+cublasDsymm_v2_64
+cublasDsymv
+cublasDsymv_v2
+cublasDsymv_v2_64
+cublasDsyr
+cublasDsyr2
+cublasDsyr2_v2
+cublasDsyr2_v2_64
+cublasDsyr2k
+cublasDsyr2k_v2
+cublasDsyr2k_v2_64
+cublasDsyr_v2
+cublasDsyr_v2_64
+cublasDsyrk
+cublasDsyrk_v2
+cublasDsyrk_v2_64
+cublasDsyrkx
+cublasDsyrkx_64
+cublasDtbmv
+cublasDtbmv_v2
+cublasDtbmv_v2_64
+cublasDtbsv
+cublasDtbsv_v2
+cublasDtbsv_v2_64
+cublasDtpmv
+cublasDtpmv_v2
+cublasDtpmv_v2_64
+cublasDtpsv
+cublasDtpsv_v2
+cublasDtpsv_v2_64
+cublasDtpttr
+cublasDtrmm
+cublasDtrmm_v2
+cublasDtrmm_v2_64
+cublasDtrmv
+cublasDtrmv_v2
+cublasDtrmv_v2_64
+cublasDtrsm
+cublasDtrsmBatched
+cublasDtrsmBatched_64
+cublasDtrsm_v2
+cublasDtrsm_v2_64
+cublasDtrsv
+cublasDtrsv_v2
+cublasDtrsv_v2_64
+cublasDtrttp
+cublasDzasum
+cublasDzasum_v2
+cublasDzasum_v2_64
+cublasDznrm2
+cublasDznrm2_v2
+cublasDznrm2_v2_64
+cublasFree
+cublasGemmBatchedEx
+cublasGemmBatchedEx_64
+cublasGemmEx
+cublasGemmEx_64
+cublasGemmStridedBatchedEx
+cublasGemmStridedBatchedEx_64
+cublasGetAtomicsMode
+cublasGetBackdoor
+cublasGetCudartVersion
+cublasGetError
+cublasGetLoggerCallback
+cublasGetMathMode
+cublasGetMatrix
+cublasGetMatrixAsync
+cublasGetMatrixAsync_64
+cublasGetMatrix_64
+cublasGetPointerMode_v2
+cublasGetProperty
+cublasGetSmCountTarget
+cublasGetStatusName
+cublasGetStatusString
+cublasGetStream_v2
+cublasGetVector
+cublasGetVectorAsync
+cublasGetVectorAsync_64
+cublasGetVector_64
+cublasGetVersion
+cublasGetVersion_v2
+cublasHSHgemvBatched
+cublasHSHgemvBatched_64
+cublasHSHgemvStridedBatched
+cublasHSHgemvStridedBatched_64
+cublasHSSgemvBatched
+cublasHSSgemvBatched_64
+cublasHSSgemvStridedBatched
+cublasHSSgemvStridedBatched_64
+cublasHgemm
+cublasHgemmBatched
+cublasHgemmBatched_64
+cublasHgemmStridedBatched
+cublasHgemmStridedBatched_64
+cublasHgemm_64
+cublasIamaxEx
+cublasIamaxEx_64
+cublasIaminEx
+cublasIaminEx_64
+cublasIcamax
+cublasIcamax_v2
+cublasIcamax_v2_64
+cublasIcamin
+cublasIcamin_v2
+cublasIcamin_v2_64
+cublasIdamax
+cublasIdamax_v2
+cublasIdamax_v2_64
+cublasIdamin
+cublasIdamin_v2
+cublasIdamin_v2_64
+cublasInit
+cublasIsamax
+cublasIsamax_v2
+cublasIsamax_v2_64
+cublasIsamin
+cublasIsamin_v2
+cublasIsamin_v2_64
+cublasIzamax
+cublasIzamax_v2
+cublasIzamax_v2_64
+cublasIzamin
+cublasIzamin_v2
+cublasIzamin_v2_64
+cublasLoggerConfigure
+cublasNrm2Ex
+cublasNrm2Ex_64
+cublasRotEx
+cublasRotEx_64
+cublasRotgEx
+cublasRotmEx
+cublasRotmEx_64
+cublasRotmgEx
+cublasSasum
+cublasSasum_v2
+cublasSasum_v2_64
+cublasSaxpy
+cublasSaxpy_v2
+cublasSaxpy_v2_64
+cublasSbdmm
+cublasScalEx
+cublasScalEx_64
+cublasScasum
+cublasScasum_v2
+cublasScasum_v2_64
+cublasScnrm2
+cublasScnrm2_v2
+cublasScnrm2_v2_64
+cublasScopy
+cublasScopy_v2
+cublasScopy_v2_64
+cublasSdgmm
+cublasSdgmm_64
+cublasSdot
+cublasSdot_v2
+cublasSdot_v2_64
+cublasSetAtomicsMode
+cublasSetBackdoor
+cublasSetBackdoorEx
+cublasSetKernelStream
+cublasSetLoggerCallback
+cublasSetMathMode
+cublasSetMatrix
+cublasSetMatrixAsync
+cublasSetMatrixAsync_64
+cublasSetMatrix_64
+cublasSetPointerMode_v2
+cublasSetSmCountTarget
+cublasSetStream_v2
+cublasSetVector
+cublasSetVectorAsync
+cublasSetVectorAsync_64
+cublasSetVector_64
+cublasSetWorkspace_v2
+cublasSgbmv
+cublasSgbmv_v2
+cublasSgbmv_v2_64
+cublasSgeam
+cublasSgeam_64
+cublasSgelsBatched
+cublasSgemm
+cublasSgemmBatched
+cublasSgemmBatched_64
+cublasSgemmEx
+cublasSgemmEx_64
+cublasSgemmStridedBatched
+cublasSgemmStridedBatched_64
+cublasSgemm_v2
+cublasSgemm_v2_64
+cublasSgemv
+cublasSgemvBatched
+cublasSgemvBatched_64
+cublasSgemvStridedBatched
+cublasSgemvStridedBatched_64
+cublasSgemv_v2
+cublasSgemv_v2_64
+cublasSgeqrfBatched
+cublasSger
+cublasSger_v2
+cublasSger_v2_64
+cublasSgetrfBatched
+cublasSgetriBatched
+cublasSgetrsBatched
+cublasShutdown
+cublasSmatinvBatched
+cublasSnrm2
+cublasSnrm2_v2
+cublasSnrm2_v2_64
+cublasSrot
+cublasSrot_v2
+cublasSrot_v2_64
+cublasSrotg
+cublasSrotg_v2
+cublasSrotm
+cublasSrotm_v2
+cublasSrotm_v2_64
+cublasSrotmg
+cublasSrotmg_v2
+cublasSsbmv
+cublasSsbmv_v2
+cublasSsbmv_v2_64
+cublasSscal
+cublasSscal_v2
+cublasSscal_v2_64
+cublasSspmv
+cublasSspmv_v2
+cublasSspmv_v2_64
+cublasSspr
+cublasSspr2
+cublasSspr2_v2
+cublasSspr2_v2_64
+cublasSspr_v2
+cublasSspr_v2_64
+cublasSswap
+cublasSswap_v2
+cublasSswap_v2_64
+cublasSsymm
+cublasSsymm_v2
+cublasSsymm_v2_64
+cublasSsymv
+cublasSsymv_v2
+cublasSsymv_v2_64
+cublasSsyr
+cublasSsyr2
+cublasSsyr2_v2
+cublasSsyr2_v2_64
+cublasSsyr2k
+cublasSsyr2k_v2
+cublasSsyr2k_v2_64
+cublasSsyr_v2
+cublasSsyr_v2_64
+cublasSsyrk
+cublasSsyrk_v2
+cublasSsyrk_v2_64
+cublasSsyrkx
+cublasSsyrkx_64
+cublasStbmv
+cublasStbmv_v2
+cublasStbmv_v2_64
+cublasStbsv
+cublasStbsv_v2
+cublasStbsv_v2_64
+cublasStpmv
+cublasStpmv_v2
+cublasStpmv_v2_64
+cublasStpsv
+cublasStpsv_v2
+cublasStpsv_v2_64
+cublasStpttr
+cublasStrmm
+cublasStrmm_v2
+cublasStrmm_v2_64
+cublasStrmv
+cublasStrmv_v2
+cublasStrmv_v2_64
+cublasStrsm
+cublasStrsmBatched
+cublasStrsmBatched_64
+cublasStrsm_v2
+cublasStrsm_v2_64
+cublasStrsv
+cublasStrsv_v2
+cublasStrsv_v2_64
+cublasStrttp
+cublasSwapEx
+cublasSwapEx_64
+cublasTSSgemvBatched
+cublasTSSgemvBatched_64
+cublasTSSgemvStridedBatched
+cublasTSSgemvStridedBatched_64
+cublasTSTgemvBatched
+cublasTSTgemvBatched_64
+cublasTSTgemvStridedBatched
+cublasTSTgemvStridedBatched_64
+cublasUint8gemmBias
+cublasXerbla
+cublasXtCgemm
+cublasXtChemm
+cublasXtCher2k
+cublasXtCherk
+cublasXtCherkx
+cublasXtCreate
+cublasXtCspmm
+cublasXtCsymm
+cublasXtCsyr2k
+cublasXtCsyrk
+cublasXtCsyrkx
+cublasXtCtrmm
+cublasXtCtrsm
+cublasXtDestroy
+cublasXtDeviceSelect
+cublasXtDgemm
+cublasXtDspmm
+cublasXtDsymm
+cublasXtDsyr2k
+cublasXtDsyrk
+cublasXtDsyrkx
+cublasXtDtrmm
+cublasXtDtrsm
+cublasXtGetBlockDim
+cublasXtGetNumBoards
+cublasXtGetPinningMemMode
+cublasXtMaxBoards
+cublasXtSetBlockDim
+cublasXtSetCpuRatio
+cublasXtSetCpuRoutine
+cublasXtSetPinningMemMode
+cublasXtSgemm
+cublasXtSspmm
+cublasXtSsymm
+cublasXtSsyr2k
+cublasXtSsyrk
+cublasXtSsyrkx
+cublasXtStrmm
+cublasXtStrsm
+cublasXtZgemm
+cublasXtZhemm
+cublasXtZher2k
+cublasXtZherk
+cublasXtZherkx
+cublasXtZspmm
+cublasXtZsymm
+cublasXtZsyr2k
+cublasXtZsyrk
+cublasXtZsyrkx
+cublasXtZtrmm
+cublasXtZtrsm
+cublasZaxpy
+cublasZaxpy_v2
+cublasZaxpy_v2_64
+cublasZbdmm
+cublasZcopy
+cublasZcopy_v2
+cublasZcopy_v2_64
+cublasZdgmm
+cublasZdgmm_64
+cublasZdotc
+cublasZdotc_v2
+cublasZdotc_v2_64
+cublasZdotu
+cublasZdotu_v2
+cublasZdotu_v2_64
+cublasZdrot
+cublasZdrot_v2
+cublasZdrot_v2_64
+cublasZdscal
+cublasZdscal_v2
+cublasZdscal_v2_64
+cublasZgbmv
+cublasZgbmv_v2
+cublasZgbmv_v2_64
+cublasZgeam
+cublasZgeam_64
+cublasZgelsBatched
+cublasZgemm
+cublasZgemm3m
+cublasZgemm3m_64
+cublasZgemmBatched
+cublasZgemmBatched_64
+cublasZgemmStridedBatched
+cublasZgemmStridedBatched_64
+cublasZgemm_v2
+cublasZgemm_v2_64
+cublasZgemv
+cublasZgemvBatched
+cublasZgemvBatched_64
+cublasZgemvStridedBatched
+cublasZgemvStridedBatched_64
+cublasZgemv_v2
+cublasZgemv_v2_64
+cublasZgeqrfBatched
+cublasZgerc
+cublasZgerc_v2
+cublasZgerc_v2_64
+cublasZgeru
+cublasZgeru_v2
+cublasZgeru_v2_64
+cublasZgetrfBatched
+cublasZgetriBatched
+cublasZgetrsBatched
+cublasZhbmv
+cublasZhbmv_v2
+cublasZhbmv_v2_64
+cublasZhemm
+cublasZhemm_v2
+cublasZhemm_v2_64
+cublasZhemv
+cublasZhemv_v2
+cublasZhemv_v2_64
+cublasZher
+cublasZher2
+cublasZher2_v2
+cublasZher2_v2_64
+cublasZher2k
+cublasZher2k_v2
+cublasZher2k_v2_64
+cublasZher_v2
+cublasZher_v2_64
+cublasZherk
+cublasZherk_v2
+cublasZherk_v2_64
+cublasZherkx
+cublasZherkx_64
+cublasZhpmv
+cublasZhpmv_v2
+cublasZhpmv_v2_64
+cublasZhpr
+cublasZhpr2
+cublasZhpr2_v2
+cublasZhpr2_v2_64
+cublasZhpr_v2
+cublasZhpr_v2_64
+cublasZmatinvBatched
+cublasZrot
+cublasZrot_v2
+cublasZrot_v2_64
+cublasZrotg
+cublasZrotg_v2
+cublasZscal
+cublasZscal_v2
+cublasZscal_v2_64
+cublasZswap
+cublasZswap_v2
+cublasZswap_v2_64
+cublasZsymm
+cublasZsymm_v2
+cublasZsymm_v2_64
+cublasZsymv_v2
+cublasZsymv_v2_64
+cublasZsyr2_v2
+cublasZsyr2_v2_64
+cublasZsyr2k
+cublasZsyr2k_v2
+cublasZsyr2k_v2_64
+cublasZsyr_v2
+cublasZsyr_v2_64
+cublasZsyrk
+cublasZsyrk_v2
+cublasZsyrk_v2_64
+cublasZsyrkx
+cublasZsyrkx_64
+cublasZtbmv
+cublasZtbmv_v2
+cublasZtbmv_v2_64
+cublasZtbsv
+cublasZtbsv_v2
+cublasZtbsv_v2_64
+cublasZtpmv
+cublasZtpmv_v2
+cublasZtpmv_v2_64
+cublasZtpsv
+cublasZtpsv_v2
+cublasZtpsv_v2_64
+cublasZtpttr
+cublasZtrmm
+cublasZtrmm_v2
+cublasZtrmm_v2_64
+cublasZtrmv
+cublasZtrmv_v2
+cublasZtrmv_v2_64
+cublasZtrsm
+cublasZtrsmBatched
+cublasZtrsmBatched_64
+cublasZtrsm_v2
+cublasZtrsm_v2_64
+cublasZtrsv
+cublasZtrsv_v2
+cublasZtrsv_v2_64
+cublasZtrttp
diff --git a/third_party/xla/third_party/tsl/tsl/cuda/cublasLt.symbols b/third_party/xla/third_party/tsl/tsl/cuda/cublasLt.symbols
new file mode 100644
index 00000000000000..7f93cfcb3ad49f
--- /dev/null
+++ b/third_party/xla/third_party/tsl/tsl/cuda/cublasLt.symbols
@@ -0,0 +1,234 @@
+cublasLtACCMatmul
+cublasLtACCMatmulAlgoCapGetAttribute
+cublasLtACCMatmulAlgoCheck
+cublasLtACCMatmulAlgoGetHeuristic
+cublasLtACCMatmulAlgoGetIds
+cublasLtACCMatmulAlgoInit
+cublasLtAlgoCharacteristicGetAttribute
+cublasLtBIIMatmul
+cublasLtBIIMatmulAlgoCapGetAttribute
+cublasLtBIIMatmulAlgoCheck
+cublasLtBIIMatmulAlgoGetHeuristic
+cublasLtBIIMatmulAlgoGetIds
+cublasLtBIIMatmulAlgoInit
+cublasLtBSBMatmul
+cublasLtBSBMatmulAlgoCapGetAttribute
+cublasLtBSBMatmulAlgoCheck
+cublasLtBSBMatmulAlgoGetHeuristic
+cublasLtBSBMatmulAlgoGetIds
+cublasLtBSBMatmulAlgoInit
+cublasLtBSSMatmul
+cublasLtBSSMatmulAlgoCapGetAttribute
+cublasLtBSSMatmulAlgoCheck
+cublasLtBSSMatmulAlgoGetHeuristic
+cublasLtBSSMatmulAlgoGetIds
+cublasLtBSSMatmulAlgoInit
+cublasLtCCCMatmul
+cublasLtCCCMatmulAlgoCapGetAttribute
+cublasLtCCCMatmulAlgoCheck
+cublasLtCCCMatmulAlgoGetHeuristic
+cublasLtCCCMatmulAlgoGetIds
+cublasLtCCCMatmulAlgoInit
+cublasLtCreate
+cublasLtCtxInit
+cublasLtDDDMatmul
+cublasLtDDDMatmulAlgoCapGetAttribute
+cublasLtDDDMatmulAlgoCheck
+cublasLtDDDMatmulAlgoGetHeuristic
+cublasLtDDDMatmulAlgoGetIds
+cublasLtDDDMatmulAlgoInit
+cublasLtDestroy
+cublasLtE4m3E4m3Fp32Bf16Bf16MatmulAlgoCapGetAttribute
+cublasLtE4m3E4m3Fp32Bf16Bf16MatmulAlgoCheck
+cublasLtE4m3E4m3Fp32Bf16Bf16MatmulAlgoInit
+cublasLtE4m3E4m3Fp32Bf16E4m3MatmulAlgoCapGetAttribute
+cublasLtE4m3E4m3Fp32Bf16E4m3MatmulAlgoCheck
+cublasLtE4m3E4m3Fp32Bf16E4m3MatmulAlgoInit
+cublasLtE4m3E4m3Fp32Fp16E4m3MatmulAlgoCapGetAttribute
+cublasLtE4m3E4m3Fp32Fp16E4m3MatmulAlgoCheck
+cublasLtE4m3E4m3Fp32Fp16E4m3MatmulAlgoInit
+cublasLtE4m3E4m3Fp32Fp16Fp16MatmulAlgoCapGetAttribute
+cublasLtE4m3E4m3Fp32Fp16Fp16MatmulAlgoCheck
+cublasLtE4m3E4m3Fp32Fp16Fp16MatmulAlgoInit
+cublasLtE4m3E4m3Fp32Fp32Fp32MatmulAlgoCapGetAttribute
+cublasLtE4m3E4m3Fp32Fp32Fp32MatmulAlgoCheck
+cublasLtE4m3E4m3Fp32Fp32Fp32MatmulAlgoInit
+cublasLtE4m3E5m2Fp32Bf16Bf16MatmulAlgoCapGetAttribute
+cublasLtE4m3E5m2Fp32Bf16Bf16MatmulAlgoCheck
+cublasLtE4m3E5m2Fp32Bf16Bf16MatmulAlgoInit
+cublasLtE4m3E5m2Fp32Bf16E4m3MatmulAlgoCapGetAttribute
+cublasLtE4m3E5m2Fp32Bf16E4m3MatmulAlgoCheck
+cublasLtE4m3E5m2Fp32Bf16E4m3MatmulAlgoInit
+cublasLtE4m3E5m2Fp32Bf16E5m2MatmulAlgoCapGetAttribute
+cublasLtE4m3E5m2Fp32Bf16E5m2MatmulAlgoCheck
+cublasLtE4m3E5m2Fp32Bf16E5m2MatmulAlgoInit
+cublasLtE4m3E5m2Fp32Fp16E4m3MatmulAlgoCapGetAttribute
+cublasLtE4m3E5m2Fp32Fp16E4m3MatmulAlgoCheck
+cublasLtE4m3E5m2Fp32Fp16E4m3MatmulAlgoInit
+cublasLtE4m3E5m2Fp32Fp16E5m2MatmulAlgoCapGetAttribute
+cublasLtE4m3E5m2Fp32Fp16E5m2MatmulAlgoCheck
+cublasLtE4m3E5m2Fp32Fp16E5m2MatmulAlgoInit
+cublasLtE4m3E5m2Fp32Fp16Fp16MatmulAlgoCapGetAttribute
+cublasLtE4m3E5m2Fp32Fp16Fp16MatmulAlgoCheck
+cublasLtE4m3E5m2Fp32Fp16Fp16MatmulAlgoInit
+cublasLtE4m3E5m2Fp32Fp32Fp32MatmulAlgoCapGetAttribute
+cublasLtE4m3E5m2Fp32Fp32Fp32MatmulAlgoCheck
+cublasLtE4m3E5m2Fp32Fp32Fp32MatmulAlgoInit
+cublasLtE5m2E4m3Fp32Bf16Bf16MatmulAlgoCapGetAttribute
+cublasLtE5m2E4m3Fp32Bf16Bf16MatmulAlgoCheck
+cublasLtE5m2E4m3Fp32Bf16Bf16MatmulAlgoInit
+cublasLtE5m2E4m3Fp32Bf16E4m3MatmulAlgoCapGetAttribute
+cublasLtE5m2E4m3Fp32Bf16E4m3MatmulAlgoCheck
+cublasLtE5m2E4m3Fp32Bf16E4m3MatmulAlgoInit
+cublasLtE5m2E4m3Fp32Bf16E5m2MatmulAlgoCapGetAttribute
+cublasLtE5m2E4m3Fp32Bf16E5m2MatmulAlgoCheck
+cublasLtE5m2E4m3Fp32Bf16E5m2MatmulAlgoInit
+cublasLtE5m2E4m3Fp32Fp16E4m3MatmulAlgoCapGetAttribute
+cublasLtE5m2E4m3Fp32Fp16E4m3MatmulAlgoCheck
+cublasLtE5m2E4m3Fp32Fp16E4m3MatmulAlgoInit
+cublasLtE5m2E4m3Fp32Fp16E5m2MatmulAlgoCapGetAttribute
+cublasLtE5m2E4m3Fp32Fp16E5m2MatmulAlgoCheck
+cublasLtE5m2E4m3Fp32Fp16E5m2MatmulAlgoInit
+cublasLtE5m2E4m3Fp32Fp16Fp16MatmulAlgoCapGetAttribute
+cublasLtE5m2E4m3Fp32Fp16Fp16MatmulAlgoCheck
+cublasLtE5m2E4m3Fp32Fp16Fp16MatmulAlgoInit
+cublasLtE5m2E4m3Fp32Fp32Fp32MatmulAlgoCapGetAttribute
+cublasLtE5m2E4m3Fp32Fp32Fp32MatmulAlgoCheck
+cublasLtE5m2E4m3Fp32Fp32Fp32MatmulAlgoInit
+cublasLtGetCudartVersion
+cublasLtGetProperty
+cublasLtGetStatusName
+cublasLtGetStatusString
+cublasLtGetVersion
+cublasLtHHHMatmul
+cublasLtHHHMatmulAlgoCapGetAttribute
+cublasLtHHHMatmulAlgoCheck
+cublasLtHHHMatmulAlgoGetHeuristic
+cublasLtHHHMatmulAlgoGetIds
+cublasLtHHHMatmulAlgoInit
+cublasLtHSHMatmul
+cublasLtHSHMatmulAlgoCapGetAttribute
+cublasLtHSHMatmulAlgoCheck
+cublasLtHSHMatmulAlgoGetHeuristic
+cublasLtHSHMatmulAlgoGetIds
+cublasLtHSHMatmulAlgoInit
+cublasLtHSSMatmul
+cublasLtHSSMatmulAlgoCapGetAttribute
+cublasLtHSSMatmulAlgoCheck
+cublasLtHSSMatmulAlgoGetHeuristic
+cublasLtHSSMatmulAlgoGetIds
+cublasLtHSSMatmulAlgoInit
+cublasLtHeuristicLutSerializeEntry
+cublasLtHeuristicsCacheGetCapacity
+cublasLtHeuristicsCacheSetCapacity
+cublasLtKCCMatmul
+cublasLtKCCMatmulAlgoCapGetAttribute
+cublasLtKCCMatmulAlgoCheck
+cublasLtKCCMatmulAlgoGetHeuristic
+cublasLtKCCMatmulAlgoGetIds
+cublasLtKCCMatmulAlgoInit
+cublasLtKCKMatmul
+cublasLtKCKMatmulAlgoCapGetAttribute
+cublasLtKCKMatmulAlgoCheck
+cublasLtKCKMatmulAlgoGetHeuristic
+cublasLtKCKMatmulAlgoGetIds
+cublasLtKCKMatmulAlgoInit
+cublasLtLegacyGemmACC
+cublasLtLegacyGemmBII
+cublasLtLegacyGemmBSS
+cublasLtLegacyGemmCCC
+cublasLtLegacyGemmDDD
+cublasLtLegacyGemmHHH
+cublasLtLegacyGemmHSH
+cublasLtLegacyGemmHSS
+cublasLtLegacyGemmSSS
+cublasLtLegacyGemmTSS
+cublasLtLegacyGemmTST
+cublasLtLegacyGemmUtilizationCCC
+cublasLtLegacyGemmUtilizationDDD
+cublasLtLegacyGemmUtilizationZZZ
+cublasLtLegacyGemmZZZ
+cublasLtLoggerForceDisable
+cublasLtLoggerOpenFile
+cublasLtLoggerSetCallback
+cublasLtLoggerSetFile
+cublasLtLoggerSetLevel
+cublasLtLoggerSetMask
+cublasLtMatmul
+cublasLtMatmulAlgoCapGetAttribute
+cublasLtMatmulAlgoCheck
+cublasLtMatmulAlgoConfigGetAttribute
+cublasLtMatmulAlgoConfigGetAttributeRange
+cublasLtMatmulAlgoConfigSetAttribute
+cublasLtMatmulAlgoGetHeuristic
+cublasLtMatmulAlgoGetIds
+cublasLtMatmulAlgoInit
+cublasLtMatmulDescCreate
+cublasLtMatmulDescDestroy
+cublasLtMatmulDescGetAttribute
+cublasLtMatmulDescInit_internal
+cublasLtMatmulDescSetAttribute
+cublasLtMatmulPreferenceCreate
+cublasLtMatmulPreferenceDestroy
+cublasLtMatmulPreferenceGetAttribute
+cublasLtMatmulPreferenceInit_internal
+cublasLtMatmulPreferenceSetAttribute
+cublasLtMatrixLayoutCreate
+cublasLtMatrixLayoutDestroy
+cublasLtMatrixLayoutGetAttribute
+cublasLtMatrixLayoutInit_internal
+cublasLtMatrixLayoutSetAttribute
+cublasLtMatrixTransform
+cublasLtMatrixTransformDescCreate
+cublasLtMatrixTransformDescDestroy
+cublasLtMatrixTransformDescGetAttribute
+cublasLtMatrixTransformDescInit_internal
+cublasLtMatrixTransformDescSetAttribute
+cublasLtSSSMatmul
+cublasLtSSSMatmulAlgoCapGetAttribute
+cublasLtSSSMatmulAlgoCheck
+cublasLtSSSMatmulAlgoGetHeuristic
+cublasLtSSSMatmulAlgoGetIds
+cublasLtSSSMatmulAlgoInit
+cublasLtShutdownCtx
+cublasLtTSSMatmul
+cublasLtTSSMatmulAlgoCapGetAttribute
+cublasLtTSSMatmulAlgoCheck
+cublasLtTSSMatmulAlgoGetHeuristic
+cublasLtTSSMatmulAlgoGetIds
+cublasLtTSSMatmulAlgoInit
+cublasLtTSTMatmul
+cublasLtTSTMatmulAlgoCapGetAttribute
+cublasLtTSTMatmulAlgoCheck
+cublasLtTSTMatmulAlgoGetHeuristic
+cublasLtTSTMatmulAlgoGetIds
+cublasLtTSTMatmulAlgoInit
+cublasLtVCCMatmul
+cublasLtVCCMatmulAlgoCapGetAttribute
+cublasLtVCCMatmulAlgoCheck
+cublasLtVCCMatmulAlgoGetHeuristic
+cublasLtVCCMatmulAlgoGetIds
+cublasLtVCCMatmulAlgoInit
+cublasLtVCVMatmul
+cublasLtVCVMatmulAlgoCapGetAttribute
+cublasLtVCVMatmulAlgoCheck
+cublasLtVCVMatmulAlgoGetHeuristic
+cublasLtVCVMatmulAlgoGetIds
+cublasLtVCVMatmulAlgoInit
+cublasLtZZZMatmul
+cublasLtZZZMatmulAlgoCapGetAttribute
+cublasLtZZZMatmulAlgoCheck
+cublasLtZZZMatmulAlgoGetHeuristic
+cublasLtZZZMatmulAlgoGetIds
+cublasLtZZZMatmulAlgoInit
+cublasLt_for_cublas_BII
+cublasLt_for_cublas_BSS
+cublasLt_for_cublas_CCC
+cublasLt_for_cublas_DDD
+cublasLt_for_cublas_HHH
+cublasLt_for_cublas_HSH
+cublasLt_for_cublas_HSS
+cublasLt_for_cublas_SSS
+cublasLt_for_cublas_TSS
+cublasLt_for_cublas_TST
+cublasLt_for_cublas_ZZZ
diff --git a/third_party/xla/third_party/tsl/tsl/cuda/cublasLt_11_0.inc b/third_party/xla/third_party/tsl/tsl/cuda/cublasLt_11_0.inc
deleted file mode 100644
index 5645753c56bcf4..00000000000000
--- a/third_party/xla/third_party/tsl/tsl/cuda/cublasLt_11_0.inc
+++ /dev/null
@@ -1,390 +0,0 @@
-// Auto-generated, do not edit.
-
-extern "C" {
-
-cublasStatus_t CUBLASWINAPI cublasLtCreate(cublasLtHandle_t *lightHandle) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(cublasLtHandle_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasLtCreate");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(lightHandle);
-}
-
-cublasStatus_t CUBLASWINAPI cublasLtDestroy(cublasLtHandle_t lightHandle) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(cublasLtHandle_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasLtDestroy");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(lightHandle);
-}
-
-size_t CUBLASWINAPI cublasLtGetVersion(void) {
-  using FuncPtr = size_t(CUBLASWINAPI *)();
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasLtGetVersion");
-  if (!func_ptr) return 0;
-  return func_ptr();
-}
-
-size_t CUBLASWINAPI cublasLtGetCudartVersion(void) {
-  using FuncPtr = size_t(CUBLASWINAPI *)();
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasLtGetCudartVersion");
-  if (!func_ptr) return 0;
-  return func_ptr();
-}
-
-cublasStatus_t CUBLASWINAPI cublasLtGetProperty(libraryPropertyType type,
-                                                int *value) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(libraryPropertyType, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasLtGetProperty");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(type, value);
-}
-
-cublasStatus_t CUBLASWINAPI cublasLtMatmul(
-    cublasLtHandle_t lightHandle, cublasLtMatmulDesc_t computeDesc,
-    const void *alpha, /* host or device pointer */
-    const void *A, cublasLtMatrixLayout_t Adesc, const void *B,
-    cublasLtMatrixLayout_t Bdesc, const void *beta, /* host or device pointer */
-    const void *C, cublasLtMatrixLayout_t Cdesc, void *D,
-    cublasLtMatrixLayout_t Ddesc, const cublasLtMatmulAlgo_t *algo,
-    void *workspace, size_t workspaceSizeInBytes, cudaStream_t stream) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasLtHandle_t, cublasLtMatmulDesc_t, const void *, const void *,
-      cublasLtMatrixLayout_t, const void *, cublasLtMatrixLayout_t,
-      const void *, const void *, cublasLtMatrixLayout_t, void *,
-      cublasLtMatrixLayout_t, const cublasLtMatmulAlgo_t *, void *, size_t,
-      cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasLtMatmul");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(lightHandle, computeDesc, alpha, A, Adesc, B, Bdesc, beta, C,
-                  Cdesc, D, Ddesc, algo, workspace, workspaceSizeInBytes,
-                  stream);
-}
-
-cublasStatus_t CUBLASWINAPI cublasLtMatrixTransform(
-    cublasLtHandle_t lightHandle, cublasLtMatrixTransformDesc_t transformDesc,
-    const void *alpha, /* host or device pointer */
-    const void *A, cublasLtMatrixLayout_t Adesc,
-    const void *beta, /* host or device pointer */
-    const void *B, cublasLtMatrixLayout_t Bdesc, void *C,
-    cublasLtMatrixLayout_t Cdesc, cudaStream_t stream) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasLtHandle_t, cublasLtMatrixTransformDesc_t, const void *,
-      const void *, cublasLtMatrixLayout_t, const void *, const void *,
-      cublasLtMatrixLayout_t, void *, cublasLtMatrixLayout_t, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasLtMatrixTransform");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(lightHandle, transformDesc, alpha, A, Adesc, beta, B, Bdesc,
-                  C, Cdesc, stream);
-}
-
-cublasStatus_t CUBLASWINAPI cublasLtMatrixLayoutInit_internal(  //
-    cublasLtMatrixLayout_t matLayout, size_t size, cudaDataType type,
-    uint64_t rows, uint64_t cols, int64_t ld) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(  //
-      cublasLtMatrixLayout_t, size_t, cudaDataType, uint64_t, uint64_t,
-      int64_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cublasLtMatrixLayoutInit_internal");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(matLayout, size, type, rows, cols, ld);
-}
-
-cublasStatus_t CUBLASWINAPI cublasLtMatrixLayoutCreate(  //
-    cublasLtMatrixLayout_t *matLayout, cudaDataType type, uint64_t rows,
-    uint64_t cols, int64_t ld) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(  //
-      cublasLtMatrixLayout_t *, cudaDataType, uint64_t, uint64_t, int64_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasLtMatrixLayoutCreate");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(matLayout, type, rows, cols, ld);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasLtMatrixLayoutDestroy(cublasLtMatrixLayout_t matLayout) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(cublasLtMatrixLayout_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasLtMatrixLayoutDestroy");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(matLayout);
-}
-
-cublasStatus_t CUBLASWINAPI cublasLtMatrixLayoutSetAttribute(  //
-    cublasLtMatrixLayout_t matLayout, cublasLtMatrixLayoutAttribute_t attr,
-    const void *buf, size_t sizeInBytes) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(  //
-      cublasLtMatrixLayout_t, cublasLtMatrixLayoutAttribute_t, const void *,
-      size_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cublasLtMatrixLayoutSetAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(matLayout, attr, buf, sizeInBytes);
-}
-
-cublasStatus_t CUBLASWINAPI cublasLtMatrixLayoutGetAttribute(  //
-    cublasLtMatrixLayout_t matLayout, cublasLtMatrixLayoutAttribute_t attr,
-    void *buf, size_t sizeInBytes, size_t *sizeWritten) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(  //
-      cublasLtMatrixLayout_t, cublasLtMatrixLayoutAttribute_t, void *, size_t,
-      size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cublasLtMatrixLayoutGetAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(matLayout, attr, buf, sizeInBytes, sizeWritten);
-}
-
-cublasStatus_t CUBLASWINAPI cublasLtMatmulDescInit_internal(  //
-    cublasLtMatmulDesc_t matmulDesc, size_t size,
-    cublasComputeType_t computeType, cudaDataType_t scaleType) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(  //
-      cublasLtMatmulDesc_t, size_t, cublasComputeType_t, cudaDataType_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasLtMatmulDescInit_internal");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(matmulDesc, size, computeType, scaleType);
-}
-
-cublasStatus_t CUBLASWINAPI cublasLtMatmulDescCreate(
-    cublasLtMatmulDesc_t *matmulDesc, cublasComputeType_t computeType,
-    cudaDataType_t scaleType) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasLtMatmulDesc_t *, cublasComputeType_t, cudaDataType_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasLtMatmulDescCreate");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(matmulDesc, computeType, scaleType);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasLtMatmulDescDestroy(cublasLtMatmulDesc_t matmulDesc) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(cublasLtMatmulDesc_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasLtMatmulDescDestroy");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(matmulDesc);
-}
-
-cublasStatus_t CUBLASWINAPI cublasLtMatmulDescSetAttribute(  //
-    cublasLtMatmulDesc_t matmulDesc, cublasLtMatmulDescAttributes_t attr,
-    const void *buf, size_t sizeInBytes) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(  //
-      cublasLtMatmulDesc_t, cublasLtMatmulDescAttributes_t, const void *,
-      size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasLtMatmulDescSetAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(matmulDesc, attr, buf, sizeInBytes);
-}
-
-cublasStatus_t CUBLASWINAPI cublasLtMatmulDescGetAttribute(  //
-    cublasLtMatmulDesc_t matmulDesc, cublasLtMatmulDescAttributes_t attr,
-    void *buf, size_t sizeInBytes, size_t *sizeWritten) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(  //
-      cublasLtMatmulDesc_t, cublasLtMatmulDescAttributes_t, void *, size_t,
-      size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasLtMatmulDescGetAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(matmulDesc, attr, buf, sizeInBytes, sizeWritten);
-}
-
-cublasStatus_t CUBLASWINAPI cublasLtMatrixTransformDescInit_internal(
-    cublasLtMatrixTransformDesc_t transformDesc, size_t size,
-    cudaDataType scaleType) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(cublasLtMatrixTransformDesc_t,
-                                                 size_t, cudaDataType);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cublasLtMatrixTransformDescInit_internal");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(transformDesc, size, scaleType);
-}
-
-cublasStatus_t CUBLASWINAPI cublasLtMatrixTransformDescCreate(
-    cublasLtMatrixTransformDesc_t *transformDesc, cudaDataType scaleType) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasLtMatrixTransformDesc_t *, cudaDataType);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cublasLtMatrixTransformDescCreate");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(transformDesc, scaleType);
-}
-
-cublasStatus_t CUBLASWINAPI cublasLtMatrixTransformDescDestroy(
-    cublasLtMatrixTransformDesc_t transformDesc) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(cublasLtMatrixTransformDesc_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cublasLtMatrixTransformDescDestroy");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(transformDesc);
-}
-
-cublasStatus_t CUBLASWINAPI cublasLtMatrixTransformDescSetAttribute(  //
-    cublasLtMatrixTransformDesc_t transformDesc,
-    cublasLtMatrixTransformDescAttributes_t attr, const void *buf,
-    size_t sizeInBytes) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(  //
-      cublasLtMatrixTransformDesc_t, cublasLtMatrixTransformDescAttributes_t,
-      const void *, size_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cublasLtMatrixTransformDescSetAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(transformDesc, attr, buf, sizeInBytes);
-}
-
-cublasStatus_t CUBLASWINAPI cublasLtMatrixTransformDescGetAttribute(  //
-    cublasLtMatrixTransformDesc_t transformDesc,
-    cublasLtMatrixTransformDescAttributes_t attr, void *buf, size_t sizeInBytes,
-    size_t *sizeWritten) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(  //
-      cublasLtMatrixTransformDesc_t, cublasLtMatrixTransformDescAttributes_t,
-      void *, size_t, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cublasLtMatrixTransformDescGetAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(transformDesc, attr, buf, sizeInBytes, sizeWritten);
-}
-
-cublasStatus_t CUBLASWINAPI cublasLtMatmulPreferenceInit_internal(
-    cublasLtMatmulPreference_t pref, size_t size) {
-  using FuncPtr =
-      cublasStatus_t(CUBLASWINAPI *)(cublasLtMatmulPreference_t, size_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cublasLtMatmulPreferenceInit_internal");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pref, size);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasLtMatmulPreferenceCreate(cublasLtMatmulPreference_t *pref) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(cublasLtMatmulPreference_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasLtMatmulPreferenceCreate");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pref);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasLtMatmulPreferenceDestroy(cublasLtMatmulPreference_t pref) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(cublasLtMatmulPreference_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasLtMatmulPreferenceDestroy");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pref);
-}
-
-cublasStatus_t CUBLASWINAPI cublasLtMatmulPreferenceSetAttribute(  //
-    cublasLtMatmulPreference_t pref, cublasLtMatmulPreferenceAttributes_t attr,
-    const void *buf, size_t sizeInBytes) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(  //
-      cublasLtMatmulPreference_t, cublasLtMatmulPreferenceAttributes_t,
-      const void *, size_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cublasLtMatmulPreferenceSetAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pref, attr, buf, sizeInBytes);
-}
-
-cublasStatus_t CUBLASWINAPI cublasLtMatmulPreferenceGetAttribute(  //
-    cublasLtMatmulPreference_t pref, cublasLtMatmulPreferenceAttributes_t attr,
-    void *buf, size_t sizeInBytes, size_t *sizeWritten) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(  //
-      cublasLtMatmulPreference_t, cublasLtMatmulPreferenceAttributes_t, void *,
-      size_t, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cublasLtMatmulPreferenceGetAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pref, attr, buf, sizeInBytes, sizeWritten);
-}
-
-cublasStatus_t CUBLASWINAPI cublasLtMatmulAlgoGetHeuristic(
-    cublasLtHandle_t lightHandle, cublasLtMatmulDesc_t operationDesc,
-    cublasLtMatrixLayout_t Adesc, cublasLtMatrixLayout_t Bdesc,
-    cublasLtMatrixLayout_t Cdesc, cublasLtMatrixLayout_t Ddesc,
-    cublasLtMatmulPreference_t preference, int requestedAlgoCount,
-    cublasLtMatmulHeuristicResult_t heuristicResultsArray[],
-    int *returnAlgoCount) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasLtHandle_t, cublasLtMatmulDesc_t, cublasLtMatrixLayout_t,
-      cublasLtMatrixLayout_t, cublasLtMatrixLayout_t, cublasLtMatrixLayout_t,
-      cublasLtMatmulPreference_t, int, cublasLtMatmulHeuristicResult_t[],
-      int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasLtMatmulAlgoGetHeuristic");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(lightHandle, operationDesc, Adesc, Bdesc, Cdesc, Ddesc,
-                  preference, requestedAlgoCount, heuristicResultsArray,
-                  returnAlgoCount);
-}
-
-cublasStatus_t CUBLASWINAPI cublasLtMatmulAlgoGetIds(
-    cublasLtHandle_t lightHandle, cublasComputeType_t computeType,
-    cudaDataType_t scaleType, cudaDataType_t Atype, cudaDataType_t Btype,
-    cudaDataType_t Ctype, cudaDataType_t Dtype, int requestedAlgoCount,
-    int algoIdsArray[], int *returnAlgoCount) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasLtHandle_t, cublasComputeType_t, cudaDataType_t, cudaDataType_t,
-      cudaDataType_t, cudaDataType_t, cudaDataType_t, int, int[], int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasLtMatmulAlgoGetIds");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(lightHandle, computeType, scaleType, Atype, Btype, Ctype,
-                  Dtype, requestedAlgoCount, algoIdsArray, returnAlgoCount);
-}
-
-cublasStatus_t CUBLASWINAPI cublasLtMatmulAlgoInit(
-    cublasLtHandle_t lightHandle, cublasComputeType_t computeType,
-    cudaDataType_t scaleType, cudaDataType_t Atype, cudaDataType_t Btype,
-    cudaDataType_t Ctype, cudaDataType_t Dtype, int algoId,
-    cublasLtMatmulAlgo_t *algo) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasLtHandle_t, cublasComputeType_t, cudaDataType_t, cudaDataType_t,
-      cudaDataType_t, cudaDataType_t, cudaDataType_t, int,
-      cublasLtMatmulAlgo_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasLtMatmulAlgoInit");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(lightHandle, computeType, scaleType, Atype, Btype, Ctype,
-                  Dtype, algoId, algo);
-}
-
-cublasStatus_t CUBLASWINAPI cublasLtMatmulAlgoCheck(  //
-    cublasLtHandle_t lightHandle, cublasLtMatmulDesc_t operationDesc,
-    cublasLtMatrixLayout_t Adesc, cublasLtMatrixLayout_t Bdesc,
-    cublasLtMatrixLayout_t Cdesc, cublasLtMatrixLayout_t Ddesc,
-    const cublasLtMatmulAlgo_t *algo,  ///< may point to result->algo
-    cublasLtMatmulHeuristicResult_t *result) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(  //
-      cublasLtHandle_t, cublasLtMatmulDesc_t, cublasLtMatrixLayout_t,
-      cublasLtMatrixLayout_t, cublasLtMatrixLayout_t, cublasLtMatrixLayout_t,
-      const cublasLtMatmulAlgo_t *,  ///< may point to result->algo
-      cublasLtMatmulHeuristicResult_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasLtMatmulAlgoCheck");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(lightHandle, operationDesc, Adesc, Bdesc, Cdesc, Ddesc, algo,
-                  result);
-}
-
-cublasStatus_t CUBLASWINAPI cublasLtMatmulAlgoCapGetAttribute(
-    const cublasLtMatmulAlgo_t *algo, cublasLtMatmulAlgoCapAttributes_t attr,
-    void *buf, size_t sizeInBytes, size_t *sizeWritten) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      const cublasLtMatmulAlgo_t *, cublasLtMatmulAlgoCapAttributes_t, void *,
-      size_t, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cublasLtMatmulAlgoCapGetAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(algo, attr, buf, sizeInBytes, sizeWritten);
-}
-
-cublasStatus_t CUBLASWINAPI cublasLtMatmulAlgoConfigSetAttribute(
-    cublasLtMatmulAlgo_t *algo, cublasLtMatmulAlgoConfigAttributes_t attr,
-    const void *buf, size_t sizeInBytes) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasLtMatmulAlgo_t *, cublasLtMatmulAlgoConfigAttributes_t,
-      const void *, size_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cublasLtMatmulAlgoConfigSetAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(algo, attr, buf, sizeInBytes);
-}
-
-cublasStatus_t CUBLASWINAPI cublasLtMatmulAlgoConfigGetAttribute(
-    const cublasLtMatmulAlgo_t *algo, cublasLtMatmulAlgoConfigAttributes_t attr,
-    void *buf, size_t sizeInBytes, size_t *sizeWritten) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      const cublasLtMatmulAlgo_t *, cublasLtMatmulAlgoConfigAttributes_t,
-      void *, size_t, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cublasLtMatmulAlgoConfigGetAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(algo, attr, buf, sizeInBytes, sizeWritten);
-}
-
-}  // extern "C"
diff --git a/third_party/xla/third_party/tsl/tsl/cuda/cublasLt_stub.cc b/third_party/xla/third_party/tsl/tsl/cuda/cublasLt_stub.cc
index f7e62f704d2ded..df4e73bebc126c 100644
--- a/third_party/xla/third_party/tsl/tsl/cuda/cublasLt_stub.cc
+++ b/third_party/xla/third_party/tsl/tsl/cuda/cublasLt_stub.cc
@@ -22,38 +22,48 @@ limitations under the License.
 namespace {
 // Returns DSO handle or null if loading the DSO fails.
 void* GetDsoHandle() {
-#ifdef PLATFORM_GOOGLE
-  return nullptr;
-#else
   static auto handle = []() -> void* {
-    auto handle_or =
-        tsl::internal::DsoLoader::GetCublasLtDsoHandle();
+    auto handle_or = tsl::internal::DsoLoader::GetCublasLtDsoHandle();
     if (!handle_or.ok()) return nullptr;
     return handle_or.value();
   }();
   return handle;
-#endif
 }
 
-template <typename T>
-T LoadSymbol(const char* symbol_name) {
+void* LoadSymbol(const char* symbol_name) {
   void* symbol = nullptr;
   if (auto handle = GetDsoHandle()) {
     tsl::Env::Default()
         ->GetSymbolFromLibrary(handle, symbol_name, &symbol)
         .IgnoreError();
   }
-  return reinterpret_cast<T>(symbol);
+  return symbol;
 }
 
-void LogFatalSymbolNotFound(const char* symbol_name) {
-  LOG(FATAL) << symbol_name << " symbol not found.";
-}
+const char* kSymbols[] = {
+#include "tsl/cuda/cublasLt.inc"
+};
+
+constexpr size_t kNumSymbols = sizeof(kSymbols) / sizeof(const char*);
 
-cublasStatus_t GetSymbolNotFoundError() { return CUBLAS_STATUS_INTERNAL_ERROR; }
 }  // namespace
 
-// We only use cublasLt from CUDA 11.0 onward.
-#if CUDA_VERSION >= 11000
-#include "tsl/cuda/cublasLt_11_0.inc"
-#endif
+extern "C" {
+
+static cublasStatus_t GetSymbolNotFoundError() {
+  return CUBLAS_STATUS_INTERNAL_ERROR;
+}
+
+extern void* _cublasLt_tramp_table[];
+
+void _cublasLt_tramp_resolve(int i) {
+  CHECK_LE(0, i);
+  CHECK_LT(i, kNumSymbols);
+  void* p = LoadSymbol(kSymbols[i]);
+  if (!p) {
+    p = reinterpret_cast<void*>(&GetSymbolNotFoundError);
+  }
+  _cublasLt_tramp_table[i] = p;
+}
+
+}  // extern "C"
diff --git a/third_party/xla/third_party/tsl/tsl/cuda/cublas_10_0.inc b/third_party/xla/third_party/tsl/tsl/cuda/cublas_10_0.inc
deleted file mode 100644
index c24fd44c4f2613..00000000000000
--- a/third_party/xla/third_party/tsl/tsl/cuda/cublas_10_0.inc
+++ /dev/null
@@ -1,4898 +0,0 @@
-// Auto-generated, do not edit.
-
-extern "C" {
-
-cublasStatus_t CUBLASWINAPI cublasCreate_v2(cublasHandle_t *handle) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCreate_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle);
-}
-
-cublasStatus_t CUBLASWINAPI cublasDestroy_v2(cublasHandle_t handle) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDestroy_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle);
-}
-
-cublasStatus_t CUBLASWINAPI cublasGetVersion_v2(cublasHandle_t handle,
-                                                int *version) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasGetVersion_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, version);
-}
-
-cublasStatus_t CUBLASWINAPI cublasGetProperty(libraryPropertyType type,
-                                              int *value) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(libraryPropertyType, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasGetProperty");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(type, value);
-}
-
-cublasStatus_t CUBLASWINAPI cublasSetStream_v2(cublasHandle_t handle,
-                                               cudaStream_t streamId) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSetStream_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, streamId);
-}
-
-cublasStatus_t CUBLASWINAPI cublasGetStream_v2(cublasHandle_t handle,
-                                               cudaStream_t *streamId) {
-  using FuncPtr =
-      cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, cudaStream_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasGetStream_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, streamId);
-}
-
-cublasStatus_t CUBLASWINAPI cublasGetPointerMode_v2(cublasHandle_t handle,
-                                                    cublasPointerMode_t *mode) {
-  using FuncPtr =
-      cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, cublasPointerMode_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasGetPointerMode_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, mode);
-}
-
-cublasStatus_t CUBLASWINAPI cublasSetPointerMode_v2(cublasHandle_t handle,
-                                                    cublasPointerMode_t mode) {
-  using FuncPtr =
-      cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, cublasPointerMode_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSetPointerMode_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, mode);
-}
-
-cublasStatus_t CUBLASWINAPI cublasGetAtomicsMode(cublasHandle_t handle,
-                                                 cublasAtomicsMode_t *mode) {
-  using FuncPtr =
-      cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, cublasAtomicsMode_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasGetAtomicsMode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, mode);
-}
-
-cublasStatus_t CUBLASWINAPI cublasSetAtomicsMode(cublasHandle_t handle,
-                                                 cublasAtomicsMode_t mode) {
-  using FuncPtr =
-      cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, cublasAtomicsMode_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSetAtomicsMode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, mode);
-}
-
-cublasStatus_t CUBLASWINAPI cublasGetMathMode(cublasHandle_t handle,
-                                              cublasMath_t *mode) {
-  using FuncPtr =
-      cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, cublasMath_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasGetMathMode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, mode);
-}
-
-cublasStatus_t CUBLASWINAPI cublasSetMathMode(cublasHandle_t handle,
-                                              cublasMath_t mode) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, cublasMath_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSetMathMode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, mode);
-}
-
-cublasStatus_t CUBLASWINAPI cublasLoggerConfigure(int logIsOn, int logToStdOut,
-                                                  int logToStdErr,
-                                                  const char *logFileName) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(int, int, int, const char *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasLoggerConfigure");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(logIsOn, logToStdOut, logToStdErr, logFileName);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasSetLoggerCallback(cublasLogCallback userCallback) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(cublasLogCallback);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSetLoggerCallback");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(userCallback);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasGetLoggerCallback(cublasLogCallback *userCallback) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(cublasLogCallback *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasGetLoggerCallback");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(userCallback);
-}
-
-cublasStatus_t CUBLASWINAPI cublasSetVector(int n, int elemSize, const void *x,
-                                            int incx, void *devicePtr,
-                                            int incy) {
-  using FuncPtr =
-      cublasStatus_t(CUBLASWINAPI *)(int, int, const void *, int, void *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSetVector");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(n, elemSize, x, incx, devicePtr, incy);
-}
-
-cublasStatus_t CUBLASWINAPI cublasGetVector(int n, int elemSize, const void *x,
-                                            int incx, void *y, int incy) {
-  using FuncPtr =
-      cublasStatus_t(CUBLASWINAPI *)(int, int, const void *, int, void *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasGetVector");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(n, elemSize, x, incx, y, incy);
-}
-
-cublasStatus_t CUBLASWINAPI cublasSetMatrix(int rows, int cols, int elemSize,
-                                            const void *A, int lda, void *B,
-                                            int ldb) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(int, int, int, const void *,
-                                                 int, void *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSetMatrix");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(rows, cols, elemSize, A, lda, B, ldb);
-}
-
-cublasStatus_t CUBLASWINAPI cublasGetMatrix(int rows, int cols, int elemSize,
-                                            const void *A, int lda, void *B,
-                                            int ldb) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(int, int, int, const void *,
-                                                 int, void *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasGetMatrix");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(rows, cols, elemSize, A, lda, B, ldb);
-}
-
-cublasStatus_t CUBLASWINAPI cublasSetVectorAsync(int n, int elemSize,
-                                                 const void *hostPtr, int incx,
-                                                 void *devicePtr, int incy,
-                                                 cudaStream_t stream) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(int, int, const void *, int,
-                                                 void *, int, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSetVectorAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(n, elemSize, hostPtr, incx, devicePtr, incy, stream);
-}
-
-cublasStatus_t CUBLASWINAPI cublasGetVectorAsync(int n, int elemSize,
-                                                 const void *devicePtr,
-                                                 int incx, void *hostPtr,
-                                                 int incy,
-                                                 cudaStream_t stream) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(int, int, const void *, int,
-                                                 void *, int, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasGetVectorAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(n, elemSize, devicePtr, incx, hostPtr, incy, stream);
-}
-
-cublasStatus_t CUBLASWINAPI cublasSetMatrixAsync(int rows, int cols,
-                                                 int elemSize, const void *A,
-                                                 int lda, void *B, int ldb,
-                                                 cudaStream_t stream) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      int, int, int, const void *, int, void *, int, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSetMatrixAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(rows, cols, elemSize, A, lda, B, ldb, stream);
-}
-
-cublasStatus_t CUBLASWINAPI cublasGetMatrixAsync(int rows, int cols,
-                                                 int elemSize, const void *A,
-                                                 int lda, void *B, int ldb,
-                                                 cudaStream_t stream) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      int, int, int, const void *, int, void *, int, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasGetMatrixAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(rows, cols, elemSize, A, lda, B, ldb, stream);
-}
-
-void CUBLASWINAPI cublasXerbla(const char *srName, int info) {
-  using FuncPtr = void(CUBLASWINAPI *)(const char *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasXerbla");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasXerbla");
-  return func_ptr(srName, info);
-}
-
-cublasStatus_t CUBLASWINAPI cublasNrm2Ex(cublasHandle_t handle, int n,
-                                         const void *x, cudaDataType xType,
-                                         int incx, void *result,
-                                         cudaDataType resultType,
-                                         cudaDataType executionType) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, const void *, cudaDataType, int, void *,
-      cudaDataType, cudaDataType);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasNrm2Ex");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, x, xType, incx, result, resultType, executionType);
-}
-
-cublasStatus_t CUBLASWINAPI cublasSnrm2_v2(cublasHandle_t handle, int n,
-                                           const float *x, int incx,
-                                           float *result) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, int,
-                                                 const float *, int, float *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSnrm2_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, x, incx, result);
-}
-
-cublasStatus_t CUBLASWINAPI cublasDnrm2_v2(cublasHandle_t handle, int n,
-                                           const double *x, int incx,
-                                           double *result) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, int,
-                                                 const double *, int, double *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDnrm2_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, x, incx, result);
-}
-
-cublasStatus_t CUBLASWINAPI cublasScnrm2_v2(cublasHandle_t handle, int n,
-                                            const cuComplex *x, int incx,
-                                            float *result) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, const cuComplex *, int, float *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasScnrm2_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, x, incx, result);
-}
-
-cublasStatus_t CUBLASWINAPI cublasDznrm2_v2(cublasHandle_t handle, int n,
-                                            const cuDoubleComplex *x, int incx,
-                                            double *result) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, const cuDoubleComplex *, int, double *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDznrm2_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, x, incx, result);
-}
-
-cublasStatus_t CUBLASWINAPI cublasDotEx(cublasHandle_t handle, int n,
-                                        const void *x, cudaDataType xType,
-                                        int incx, const void *y,
-                                        cudaDataType yType, int incy,
-                                        void *result, cudaDataType resultType,
-                                        cudaDataType executionType) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, const void *, cudaDataType, int, const void *,
-      cudaDataType, int, void *, cudaDataType, cudaDataType);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDotEx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, x, xType, incx, y, yType, incy, result, resultType,
-                  executionType);
-}
-
-cublasStatus_t CUBLASWINAPI cublasDotcEx(cublasHandle_t handle, int n,
-                                         const void *x, cudaDataType xType,
-                                         int incx, const void *y,
-                                         cudaDataType yType, int incy,
-                                         void *result, cudaDataType resultType,
-                                         cudaDataType executionType) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, const void *, cudaDataType, int, const void *,
-      cudaDataType, int, void *, cudaDataType, cudaDataType);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDotcEx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, x, xType, incx, y, yType, incy, result, resultType,
-                  executionType);
-}
-
-cublasStatus_t CUBLASWINAPI cublasSdot_v2(cublasHandle_t handle, int n,
-                                          const float *x, int incx,
-                                          const float *y, int incy,
-                                          float *result) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, const float *, int, const float *, int, float *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSdot_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, x, incx, y, incy, result);
-}
-
-cublasStatus_t CUBLASWINAPI cublasDdot_v2(cublasHandle_t handle, int n,
-                                          const double *x, int incx,
-                                          const double *y, int incy,
-                                          double *result) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, const double *, int, const double *, int, double *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDdot_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, x, incx, y, incy, result);
-}
-
-cublasStatus_t CUBLASWINAPI cublasCdotu_v2(cublasHandle_t handle, int n,
-                                           const cuComplex *x, int incx,
-                                           const cuComplex *y, int incy,
-                                           cuComplex *result) {
-  using FuncPtr =
-      cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, int, const cuComplex *,
-                                     int, const cuComplex *, int, cuComplex *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCdotu_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, x, incx, y, incy, result);
-}
-
-cublasStatus_t CUBLASWINAPI cublasCdotc_v2(cublasHandle_t handle, int n,
-                                           const cuComplex *x, int incx,
-                                           const cuComplex *y, int incy,
-                                           cuComplex *result) {
-  using FuncPtr =
-      cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, int, const cuComplex *,
-                                     int, const cuComplex *, int, cuComplex *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCdotc_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, x, incx, y, incy, result);
-}
-
-cublasStatus_t CUBLASWINAPI cublasZdotu_v2(cublasHandle_t handle, int n,
-                                           const cuDoubleComplex *x, int incx,
-                                           const cuDoubleComplex *y, int incy,
-                                           cuDoubleComplex *result) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, const cuDoubleComplex *, int,
-      const cuDoubleComplex *, int, cuDoubleComplex *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZdotu_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, x, incx, y, incy, result);
-}
-
-cublasStatus_t CUBLASWINAPI cublasZdotc_v2(cublasHandle_t handle, int n,
-                                           const cuDoubleComplex *x, int incx,
-                                           const cuDoubleComplex *y, int incy,
-                                           cuDoubleComplex *result) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, const cuDoubleComplex *, int,
-      const cuDoubleComplex *, int, cuDoubleComplex *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZdotc_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, x, incx, y, incy, result);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasScalEx(cublasHandle_t handle, int n,
-             const void *alpha, /* host or device pointer */
-             cudaDataType alphaType, void *x, cudaDataType xType, int incx,
-             cudaDataType executionType) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, const void *, cudaDataType, void *, cudaDataType,
-      int, cudaDataType);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasScalEx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, alpha, alphaType, x, xType, incx, executionType);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasSscal_v2(cublasHandle_t handle, int n,
-               const float *alpha, /* host or device pointer */
-               float *x, int incx) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, int,
-                                                 const float *, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSscal_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, alpha, x, incx);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasDscal_v2(cublasHandle_t handle, int n,
-               const double *alpha, /* host or device pointer */
-               double *x, int incx) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, int,
-                                                 const double *, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDscal_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, alpha, x, incx);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasCscal_v2(cublasHandle_t handle, int n,
-               const cuComplex *alpha, /* host or device pointer */
-               cuComplex *x, int incx) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, const cuComplex *, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCscal_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, alpha, x, incx);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasCsscal_v2(cublasHandle_t handle, int n,
-                const float *alpha, /* host or device pointer */
-                cuComplex *x, int incx) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, const float *, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCsscal_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, alpha, x, incx);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasZscal_v2(cublasHandle_t handle, int n,
-               const cuDoubleComplex *alpha, /* host or device pointer */
-               cuDoubleComplex *x, int incx) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, const cuDoubleComplex *, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZscal_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, alpha, x, incx);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasZdscal_v2(cublasHandle_t handle, int n,
-                const double *alpha, /* host or device pointer */
-                cuDoubleComplex *x, int incx) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, const double *, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZdscal_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, alpha, x, incx);
-}
-
-cublasStatus_t CUBLASWINAPI cublasAxpyEx(
-    cublasHandle_t handle, int n,
-    const void *alpha, /* host or device pointer */
-    cudaDataType alphaType, const void *x, cudaDataType xType, int incx,
-    void *y, cudaDataType yType, int incy, cudaDataType executiontype) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, const void *, cudaDataType, const void *,
-      cudaDataType, int, void *, cudaDataType, int, cudaDataType);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasAxpyEx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, alpha, alphaType, x, xType, incx, y, yType, incy,
-                  executiontype);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasSaxpy_v2(cublasHandle_t handle, int n,
-               const float *alpha, /* host or device pointer */
-               const float *x, int incx, float *y, int incy) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, const float *, const float *, int, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSaxpy_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, alpha, x, incx, y, incy);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasDaxpy_v2(cublasHandle_t handle, int n,
-               const double *alpha, /* host or device pointer */
-               const double *x, int incx, double *y, int incy) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, const double *, const double *, int, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDaxpy_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, alpha, x, incx, y, incy);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasCaxpy_v2(cublasHandle_t handle, int n,
-               const cuComplex *alpha, /* host or device pointer */
-               const cuComplex *x, int incx, cuComplex *y, int incy) {
-  using FuncPtr =
-      cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, int, const cuComplex *,
-                                     const cuComplex *, int, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCaxpy_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, alpha, x, incx, y, incy);
-}
-
-cublasStatus_t CUBLASWINAPI cublasZaxpy_v2(
-    cublasHandle_t handle, int n,
-    const cuDoubleComplex *alpha, /* host or device pointer */
-    const cuDoubleComplex *x, int incx, cuDoubleComplex *y, int incy) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, const cuDoubleComplex *, const cuDoubleComplex *,
-      int, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZaxpy_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, alpha, x, incx, y, incy);
-}
-
-cublasStatus_t CUBLASWINAPI cublasScopy_v2(cublasHandle_t handle, int n,
-                                           const float *x, int incx, float *y,
-                                           int incy) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, const float *, int, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasScopy_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, x, incx, y, incy);
-}
-
-cublasStatus_t CUBLASWINAPI cublasDcopy_v2(cublasHandle_t handle, int n,
-                                           const double *x, int incx, double *y,
-                                           int incy) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, const double *, int, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDcopy_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, x, incx, y, incy);
-}
-
-cublasStatus_t CUBLASWINAPI cublasCcopy_v2(cublasHandle_t handle, int n,
-                                           const cuComplex *x, int incx,
-                                           cuComplex *y, int incy) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, const cuComplex *, int, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCcopy_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, x, incx, y, incy);
-}
-
-cublasStatus_t CUBLASWINAPI cublasZcopy_v2(cublasHandle_t handle, int n,
-                                           const cuDoubleComplex *x, int incx,
-                                           cuDoubleComplex *y, int incy) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, int,
-                                                 const cuDoubleComplex *, int,
-                                                 cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZcopy_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, x, incx, y, incy);
-}
-
-cublasStatus_t CUBLASWINAPI cublasSswap_v2(cublasHandle_t handle, int n,
-                                           float *x, int incx, float *y,
-                                           int incy) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, int, float *,
-                                                 int, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSswap_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, x, incx, y, incy);
-}
-
-cublasStatus_t CUBLASWINAPI cublasDswap_v2(cublasHandle_t handle, int n,
-                                           double *x, int incx, double *y,
-                                           int incy) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, int, double *,
-                                                 int, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDswap_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, x, incx, y, incy);
-}
-
-cublasStatus_t CUBLASWINAPI cublasCswap_v2(cublasHandle_t handle, int n,
-                                           cuComplex *x, int incx, cuComplex *y,
-                                           int incy) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, cuComplex *, int, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCswap_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, x, incx, y, incy);
-}
-
-cublasStatus_t CUBLASWINAPI cublasZswap_v2(cublasHandle_t handle, int n,
-                                           cuDoubleComplex *x, int incx,
-                                           cuDoubleComplex *y, int incy) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, cuDoubleComplex *, int, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZswap_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, x, incx, y, incy);
-}
-
-cublasStatus_t CUBLASWINAPI cublasIsamax_v2(cublasHandle_t handle, int n,
-                                            const float *x, int incx,
-                                            int *result) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, int,
-                                                 const float *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasIsamax_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, x, incx, result);
-}
-
-cublasStatus_t CUBLASWINAPI cublasIdamax_v2(cublasHandle_t handle, int n,
-                                            const double *x, int incx,
-                                            int *result) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, int,
-                                                 const double *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasIdamax_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, x, incx, result);
-}
-
-cublasStatus_t CUBLASWINAPI cublasIcamax_v2(cublasHandle_t handle, int n,
-                                            const cuComplex *x, int incx,
-                                            int *result) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, int,
-                                                 const cuComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasIcamax_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, x, incx, result);
-}
-
-cublasStatus_t CUBLASWINAPI cublasIzamax_v2(cublasHandle_t handle, int n,
-                                            const cuDoubleComplex *x, int incx,
-                                            int *result) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, const cuDoubleComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasIzamax_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, x, incx, result);
-}
-
-cublasStatus_t CUBLASWINAPI cublasIsamin_v2(cublasHandle_t handle, int n,
-                                            const float *x, int incx,
-                                            int *result) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, int,
-                                                 const float *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasIsamin_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, x, incx, result);
-}
-
-cublasStatus_t CUBLASWINAPI cublasIdamin_v2(cublasHandle_t handle, int n,
-                                            const double *x, int incx,
-                                            int *result) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, int,
-                                                 const double *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasIdamin_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, x, incx, result);
-}
-
-cublasStatus_t CUBLASWINAPI cublasIcamin_v2(cublasHandle_t handle, int n,
-                                            const cuComplex *x, int incx,
-                                            int *result) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, int,
-                                                 const cuComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasIcamin_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, x, incx, result);
-}
-
-cublasStatus_t CUBLASWINAPI cublasIzamin_v2(cublasHandle_t handle, int n,
-                                            const cuDoubleComplex *x, int incx,
-                                            int *result) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, const cuDoubleComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasIzamin_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, x, incx, result);
-}
-
-cublasStatus_t CUBLASWINAPI cublasSasum_v2(cublasHandle_t handle, int n,
-                                           const float *x, int incx,
-                                           float *result) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, int,
-                                                 const float *, int, float *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSasum_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, x, incx, result);
-}
-
-cublasStatus_t CUBLASWINAPI cublasDasum_v2(cublasHandle_t handle, int n,
-                                           const double *x, int incx,
-                                           double *result) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, int,
-                                                 const double *, int, double *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDasum_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, x, incx, result);
-}
-
-cublasStatus_t CUBLASWINAPI cublasScasum_v2(cublasHandle_t handle, int n,
-                                            const cuComplex *x, int incx,
-                                            float *result) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, const cuComplex *, int, float *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasScasum_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, x, incx, result);
-}
-
-cublasStatus_t CUBLASWINAPI cublasDzasum_v2(cublasHandle_t handle, int n,
-                                            const cuDoubleComplex *x, int incx,
-                                            double *result) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, const cuDoubleComplex *, int, double *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDzasum_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, x, incx, result);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasSrot_v2(cublasHandle_t handle, int n, float *x, int incx, float *y,
-              int incy, const float *c, /* host or device pointer */
-              const float *s) {
-  using FuncPtr =
-      cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, int, float *, int, float *,
-                                     int, const float *, const float *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSrot_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, x, incx, y, incy, c, s);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasDrot_v2(cublasHandle_t handle, int n, double *x, int incx, double *y,
-              int incy, const double *c, /* host or device pointer */
-              const double *s) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, double *, int, double *, int, const double *,
-      const double *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDrot_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, x, incx, y, incy, c, s);
-}
-
-cublasStatus_t CUBLASWINAPI cublasCrot_v2(
-    cublasHandle_t handle, int n, cuComplex *x, int incx, cuComplex *y,
-    int incy, const float *c, /* host or device pointer */
-    const cuComplex *s) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, cuComplex *, int, cuComplex *, int, const float *,
-      const cuComplex *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCrot_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, x, incx, y, incy, c, s);
-}
-
-cublasStatus_t CUBLASWINAPI cublasCsrot_v2(
-    cublasHandle_t handle, int n, cuComplex *x, int incx, cuComplex *y,
-    int incy, const float *c, /* host or device pointer */
-    const float *s) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, cuComplex *, int, cuComplex *, int, const float *,
-      const float *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCsrot_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, x, incx, y, incy, c, s);
-}
-
-cublasStatus_t CUBLASWINAPI cublasZrot_v2(
-    cublasHandle_t handle, int n, cuDoubleComplex *x, int incx,
-    cuDoubleComplex *y, int incy, const double *c, /* host or device pointer */
-    const cuDoubleComplex *s) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, cuDoubleComplex *, int, cuDoubleComplex *, int,
-      const double *, const cuDoubleComplex *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZrot_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, x, incx, y, incy, c, s);
-}
-
-cublasStatus_t CUBLASWINAPI cublasZdrot_v2(
-    cublasHandle_t handle, int n, cuDoubleComplex *x, int incx,
-    cuDoubleComplex *y, int incy, const double *c, /* host or device pointer */
-    const double *s) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, cuDoubleComplex *, int, cuDoubleComplex *, int,
-      const double *, const double *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZdrot_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, x, incx, y, incy, c, s);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasSrotg_v2(cublasHandle_t handle, float *a, /* host or device pointer */
-               float *b,                        /* host or device pointer */
-               float *c,                        /* host or device pointer */
-               float *s) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, float *,
-                                                 float *, float *, float *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSrotg_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, a, b, c, s);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasDrotg_v2(cublasHandle_t handle, double *a, /* host or device pointer */
-               double *b,                        /* host or device pointer */
-               double *c,                        /* host or device pointer */
-               double *s) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, double *,
-                                                 double *, double *, double *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDrotg_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, a, b, c, s);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasCrotg_v2(cublasHandle_t handle, cuComplex *a, /* host or device pointer */
-               cuComplex *b,                        /* host or device pointer */
-               float *c,                            /* host or device pointer */
-               cuComplex *s) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cuComplex *, cuComplex *, float *, cuComplex *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCrotg_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, a, b, c, s);
-}
-
-cublasStatus_t CUBLASWINAPI cublasZrotg_v2(
-    cublasHandle_t handle, cuDoubleComplex *a, /* host or device pointer */
-    cuDoubleComplex *b,                        /* host or device pointer */
-    double *c,                                 /* host or device pointer */
-    cuDoubleComplex *s) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cuDoubleComplex *, cuDoubleComplex *, double *,
-      cuDoubleComplex *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZrotg_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, a, b, c, s);
-}
-
-cublasStatus_t CUBLASWINAPI cublasSrotm_v2(cublasHandle_t handle, int n,
-                                           float *x, int incx, float *y,
-                                           int incy, const float *param) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, float *, int, float *, int, const float *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSrotm_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, x, incx, y, incy, param);
-}
-
-cublasStatus_t CUBLASWINAPI cublasDrotm_v2(cublasHandle_t handle, int n,
-                                           double *x, int incx, double *y,
-                                           int incy, const double *param) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, double *, int, double *, int, const double *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDrotm_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, x, incx, y, incy, param);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasSrotmg_v2(cublasHandle_t handle, float *d1, /* host or device pointer */
-                float *d2,                        /* host or device pointer */
-                float *x1,                        /* host or device pointer */
-                const float *y1,                  /* host or device pointer */
-                float *param) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, float *, float *, float *, const float *, float *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSrotmg_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, d1, d2, x1, y1, param);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasDrotmg_v2(cublasHandle_t handle, double *d1, /* host or device pointer */
-                double *d2,                        /* host or device pointer */
-                double *x1,                        /* host or device pointer */
-                const double *y1,                  /* host or device pointer */
-                double *param) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, double *, double *, double *, const double *, double *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDrotmg_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, d1, d2, x1, y1, param);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasSgemv_v2(cublasHandle_t handle, cublasOperation_t trans, int m, int n,
-               const float *alpha, /* host or device pointer */
-               const float *A, int lda, const float *x, int incx,
-               const float *beta, /* host or device pointer */
-               float *y, int incy) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasOperation_t, int, int, const float *, const float *,
-      int, const float *, int, const float *, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSgemv_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, trans, m, n, alpha, A, lda, x, incx, beta, y, incy);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasDgemv_v2(cublasHandle_t handle, cublasOperation_t trans, int m, int n,
-               const double *alpha, /* host or device pointer */
-               const double *A, int lda, const double *x, int incx,
-               const double *beta, /* host or device pointer */
-               double *y, int incy) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasOperation_t, int, int, const double *,
-      const double *, int, const double *, int, const double *, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDgemv_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, trans, m, n, alpha, A, lda, x, incx, beta, y, incy);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasCgemv_v2(cublasHandle_t handle, cublasOperation_t trans, int m, int n,
-               const cuComplex *alpha, /* host or device pointer */
-               const cuComplex *A, int lda, const cuComplex *x, int incx,
-               const cuComplex *beta, /* host or device pointer */
-               cuComplex *y, int incy) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasOperation_t, int, int, const cuComplex *,
-      const cuComplex *, int, const cuComplex *, int, const cuComplex *,
-      cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCgemv_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, trans, m, n, alpha, A, lda, x, incx, beta, y, incy);
-}
-
-cublasStatus_t CUBLASWINAPI cublasZgemv_v2(
-    cublasHandle_t handle, cublasOperation_t trans, int m, int n,
-    const cuDoubleComplex *alpha, /* host or device pointer */
-    const cuDoubleComplex *A, int lda, const cuDoubleComplex *x, int incx,
-    const cuDoubleComplex *beta, /* host or device pointer */
-    cuDoubleComplex *y, int incy) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasOperation_t, int, int, const cuDoubleComplex *,
-      const cuDoubleComplex *, int, const cuDoubleComplex *, int,
-      const cuDoubleComplex *, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZgemv_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, trans, m, n, alpha, A, lda, x, incx, beta, y, incy);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasSgbmv_v2(cublasHandle_t handle, cublasOperation_t trans, int m, int n,
-               int kl, int ku, const float *alpha, /* host or device pointer */
-               const float *A, int lda, const float *x, int incx,
-               const float *beta, /* host or device pointer */
-               float *y, int incy) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasOperation_t, int, int, int, int, const float *,
-      const float *, int, const float *, int, const float *, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSgbmv_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, trans, m, n, kl, ku, alpha, A, lda, x, incx, beta, y,
-                  incy);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasDgbmv_v2(cublasHandle_t handle, cublasOperation_t trans, int m, int n,
-               int kl, int ku, const double *alpha, /* host or device pointer */
-               const double *A, int lda, const double *x, int incx,
-               const double *beta, /* host or device pointer */
-               double *y, int incy) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasOperation_t, int, int, int, int, const double *,
-      const double *, int, const double *, int, const double *, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDgbmv_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, trans, m, n, kl, ku, alpha, A, lda, x, incx, beta, y,
-                  incy);
-}
-
-cublasStatus_t CUBLASWINAPI cublasCgbmv_v2(
-    cublasHandle_t handle, cublasOperation_t trans, int m, int n, int kl,
-    int ku, const cuComplex *alpha, /* host or device pointer */
-    const cuComplex *A, int lda, const cuComplex *x, int incx,
-    const cuComplex *beta, /* host or device pointer */
-    cuComplex *y, int incy) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasOperation_t, int, int, int, int, const cuComplex *,
-      const cuComplex *, int, const cuComplex *, int, const cuComplex *,
-      cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCgbmv_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, trans, m, n, kl, ku, alpha, A, lda, x, incx, beta, y,
-                  incy);
-}
-
-cublasStatus_t CUBLASWINAPI cublasZgbmv_v2(
-    cublasHandle_t handle, cublasOperation_t trans, int m, int n, int kl,
-    int ku, const cuDoubleComplex *alpha, /* host or device pointer */
-    const cuDoubleComplex *A, int lda, const cuDoubleComplex *x, int incx,
-    const cuDoubleComplex *beta, /* host or device pointer */
-    cuDoubleComplex *y, int incy) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasOperation_t, int, int, int, int,
-      const cuDoubleComplex *, const cuDoubleComplex *, int,
-      const cuDoubleComplex *, int, const cuDoubleComplex *, cuDoubleComplex *,
-      int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZgbmv_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, trans, m, n, kl, ku, alpha, A, lda, x, incx, beta, y,
-                  incy);
-}
-
-cublasStatus_t CUBLASWINAPI cublasStrmv_v2(
-    cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans,
-    cublasDiagType_t diag, int n, const float *A, int lda, float *x, int incx) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t,
-      int, const float *, int, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasStrmv_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, trans, diag, n, A, lda, x, incx);
-}
-
-cublasStatus_t CUBLASWINAPI cublasDtrmv_v2(cublasHandle_t handle,
-                                           cublasFillMode_t uplo,
-                                           cublasOperation_t trans,
-                                           cublasDiagType_t diag, int n,
-                                           const double *A, int lda, double *x,
-                                           int incx) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t,
-      int, const double *, int, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDtrmv_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, trans, diag, n, A, lda, x, incx);
-}
-
-cublasStatus_t CUBLASWINAPI cublasCtrmv_v2(cublasHandle_t handle,
-                                           cublasFillMode_t uplo,
-                                           cublasOperation_t trans,
-                                           cublasDiagType_t diag, int n,
-                                           const cuComplex *A, int lda,
-                                           cuComplex *x, int incx) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t,
-      int, const cuComplex *, int, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCtrmv_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, trans, diag, n, A, lda, x, incx);
-}
-
-cublasStatus_t CUBLASWINAPI cublasZtrmv_v2(cublasHandle_t handle,
-                                           cublasFillMode_t uplo,
-                                           cublasOperation_t trans,
-                                           cublasDiagType_t diag, int n,
-                                           const cuDoubleComplex *A, int lda,
-                                           cuDoubleComplex *x, int incx) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t,
-      int, const cuDoubleComplex *, int, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZtrmv_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, trans, diag, n, A, lda, x, incx);
-}
-
-cublasStatus_t CUBLASWINAPI cublasStbmv_v2(cublasHandle_t handle,
-                                           cublasFillMode_t uplo,
-                                           cublasOperation_t trans,
-                                           cublasDiagType_t diag, int n, int k,
-                                           const float *A, int lda, float *x,
-                                           int incx) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t,
-      int, int, const float *, int, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasStbmv_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, trans, diag, n, k, A, lda, x, incx);
-}
-
-cublasStatus_t CUBLASWINAPI cublasDtbmv_v2(cublasHandle_t handle,
-                                           cublasFillMode_t uplo,
-                                           cublasOperation_t trans,
-                                           cublasDiagType_t diag, int n, int k,
-                                           const double *A, int lda, double *x,
-                                           int incx) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t,
-      int, int, const double *, int, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDtbmv_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, trans, diag, n, k, A, lda, x, incx);
-}
-
-cublasStatus_t CUBLASWINAPI cublasCtbmv_v2(cublasHandle_t handle,
-                                           cublasFillMode_t uplo,
-                                           cublasOperation_t trans,
-                                           cublasDiagType_t diag, int n, int k,
-                                           const cuComplex *A, int lda,
-                                           cuComplex *x, int incx) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t,
-      int, int, const cuComplex *, int, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCtbmv_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, trans, diag, n, k, A, lda, x, incx);
-}
-
-cublasStatus_t CUBLASWINAPI cublasZtbmv_v2(cublasHandle_t handle,
-                                           cublasFillMode_t uplo,
-                                           cublasOperation_t trans,
-                                           cublasDiagType_t diag, int n, int k,
-                                           const cuDoubleComplex *A, int lda,
-                                           cuDoubleComplex *x, int incx) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t,
-      int, int, const cuDoubleComplex *, int, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZtbmv_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, trans, diag, n, k, A, lda, x, incx);
-}
-
-cublasStatus_t CUBLASWINAPI cublasStpmv_v2(
-    cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans,
-    cublasDiagType_t diag, int n, const float *AP, float *x, int incx) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t,
-      int, const float *, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasStpmv_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, trans, diag, n, AP, x, incx);
-}
-
-cublasStatus_t CUBLASWINAPI cublasDtpmv_v2(
-    cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans,
-    cublasDiagType_t diag, int n, const double *AP, double *x, int incx) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t,
-      int, const double *, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDtpmv_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, trans, diag, n, AP, x, incx);
-}
-
-cublasStatus_t CUBLASWINAPI cublasCtpmv_v2(
-    cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans,
-    cublasDiagType_t diag, int n, const cuComplex *AP, cuComplex *x, int incx) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t,
-      int, const cuComplex *, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCtpmv_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, trans, diag, n, AP, x, incx);
-}
-
-cublasStatus_t CUBLASWINAPI cublasZtpmv_v2(cublasHandle_t handle,
-                                           cublasFillMode_t uplo,
-                                           cublasOperation_t trans,
-                                           cublasDiagType_t diag, int n,
-                                           const cuDoubleComplex *AP,
-                                           cuDoubleComplex *x, int incx) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t,
-      int, const cuDoubleComplex *, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZtpmv_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, trans, diag, n, AP, x, incx);
-}
-
-cublasStatus_t CUBLASWINAPI cublasStrsv_v2(
-    cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans,
-    cublasDiagType_t diag, int n, const float *A, int lda, float *x, int incx) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t,
-      int, const float *, int, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasStrsv_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, trans, diag, n, A, lda, x, incx);
-}
-
-cublasStatus_t CUBLASWINAPI cublasDtrsv_v2(cublasHandle_t handle,
-                                           cublasFillMode_t uplo,
-                                           cublasOperation_t trans,
-                                           cublasDiagType_t diag, int n,
-                                           const double *A, int lda, double *x,
-                                           int incx) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t,
-      int, const double *, int, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDtrsv_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, trans, diag, n, A, lda, x, incx);
-}
-
-cublasStatus_t CUBLASWINAPI cublasCtrsv_v2(cublasHandle_t handle,
-                                           cublasFillMode_t uplo,
-                                           cublasOperation_t trans,
-                                           cublasDiagType_t diag, int n,
-                                           const cuComplex *A, int lda,
-                                           cuComplex *x, int incx) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t,
-      int, const cuComplex *, int, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCtrsv_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, trans, diag, n, A, lda, x, incx);
-}
-
-cublasStatus_t CUBLASWINAPI cublasZtrsv_v2(cublasHandle_t handle,
-                                           cublasFillMode_t uplo,
-                                           cublasOperation_t trans,
-                                           cublasDiagType_t diag, int n,
-                                           const cuDoubleComplex *A, int lda,
-                                           cuDoubleComplex *x, int incx) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t,
-      int, const cuDoubleComplex *, int, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZtrsv_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, trans, diag, n, A, lda, x, incx);
-}
-
-cublasStatus_t CUBLASWINAPI cublasStpsv_v2(
-    cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans,
-    cublasDiagType_t diag, int n, const float *AP, float *x, int incx) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t,
-      int, const float *, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasStpsv_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, trans, diag, n, AP, x, incx);
-}
-
-cublasStatus_t CUBLASWINAPI cublasDtpsv_v2(
-    cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans,
-    cublasDiagType_t diag, int n, const double *AP, double *x, int incx) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t,
-      int, const double *, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDtpsv_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, trans, diag, n, AP, x, incx);
-}
-
-cublasStatus_t CUBLASWINAPI cublasCtpsv_v2(
-    cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans,
-    cublasDiagType_t diag, int n, const cuComplex *AP, cuComplex *x, int incx) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t,
-      int, const cuComplex *, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCtpsv_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, trans, diag, n, AP, x, incx);
-}
-
-cublasStatus_t CUBLASWINAPI cublasZtpsv_v2(cublasHandle_t handle,
-                                           cublasFillMode_t uplo,
-                                           cublasOperation_t trans,
-                                           cublasDiagType_t diag, int n,
-                                           const cuDoubleComplex *AP,
-                                           cuDoubleComplex *x, int incx) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t,
-      int, const cuDoubleComplex *, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZtpsv_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, trans, diag, n, AP, x, incx);
-}
-
-cublasStatus_t CUBLASWINAPI cublasStbsv_v2(cublasHandle_t handle,
-                                           cublasFillMode_t uplo,
-                                           cublasOperation_t trans,
-                                           cublasDiagType_t diag, int n, int k,
-                                           const float *A, int lda, float *x,
-                                           int incx) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t,
-      int, int, const float *, int, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasStbsv_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, trans, diag, n, k, A, lda, x, incx);
-}
-
-cublasStatus_t CUBLASWINAPI cublasDtbsv_v2(cublasHandle_t handle,
-                                           cublasFillMode_t uplo,
-                                           cublasOperation_t trans,
-                                           cublasDiagType_t diag, int n, int k,
-                                           const double *A, int lda, double *x,
-                                           int incx) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t,
-      int, int, const double *, int, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDtbsv_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, trans, diag, n, k, A, lda, x, incx);
-}
-
-cublasStatus_t CUBLASWINAPI cublasCtbsv_v2(cublasHandle_t handle,
-                                           cublasFillMode_t uplo,
-                                           cublasOperation_t trans,
-                                           cublasDiagType_t diag, int n, int k,
-                                           const cuComplex *A, int lda,
-                                           cuComplex *x, int incx) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t,
-      int, int, const cuComplex *, int, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCtbsv_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, trans, diag, n, k, A, lda, x, incx);
-}
-
-cublasStatus_t CUBLASWINAPI cublasZtbsv_v2(cublasHandle_t handle,
-                                           cublasFillMode_t uplo,
-                                           cublasOperation_t trans,
-                                           cublasDiagType_t diag, int n, int k,
-                                           const cuDoubleComplex *A, int lda,
-                                           cuDoubleComplex *x, int incx) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t,
-      int, int, const cuDoubleComplex *, int, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZtbsv_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, trans, diag, n, k, A, lda, x, incx);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasSsymv_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n,
-               const float *alpha, /* host or device pointer */
-               const float *A, int lda, const float *x, int incx,
-               const float *beta, /* host or device pointer */
-               float *y, int incy) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, int, const float *, const float *, int,
-      const float *, int, const float *, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSsymv_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, alpha, A, lda, x, incx, beta, y, incy);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasDsymv_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n,
-               const double *alpha, /* host or device pointer */
-               const double *A, int lda, const double *x, int incx,
-               const double *beta, /* host or device pointer */
-               double *y, int incy) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, int, const double *, const double *,
-      int, const double *, int, const double *, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDsymv_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, alpha, A, lda, x, incx, beta, y, incy);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasCsymv_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n,
-               const cuComplex *alpha, /* host or device pointer */
-               const cuComplex *A, int lda, const cuComplex *x, int incx,
-               const cuComplex *beta, /* host or device pointer */
-               cuComplex *y, int incy) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, int, const cuComplex *,
-      const cuComplex *, int, const cuComplex *, int, const cuComplex *,
-      cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCsymv_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, alpha, A, lda, x, incx, beta, y, incy);
-}
-
-cublasStatus_t CUBLASWINAPI cublasZsymv_v2(
-    cublasHandle_t handle, cublasFillMode_t uplo, int n,
-    const cuDoubleComplex *alpha, /* host or device pointer */
-    const cuDoubleComplex *A, int lda, const cuDoubleComplex *x, int incx,
-    const cuDoubleComplex *beta, /* host or device pointer */
-    cuDoubleComplex *y, int incy) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, int, const cuDoubleComplex *,
-      const cuDoubleComplex *, int, const cuDoubleComplex *, int,
-      const cuDoubleComplex *, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZsymv_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, alpha, A, lda, x, incx, beta, y, incy);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasChemv_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n,
-               const cuComplex *alpha, /* host or device pointer */
-               const cuComplex *A, int lda, const cuComplex *x, int incx,
-               const cuComplex *beta, /* host or device pointer */
-               cuComplex *y, int incy) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, int, const cuComplex *,
-      const cuComplex *, int, const cuComplex *, int, const cuComplex *,
-      cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasChemv_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, alpha, A, lda, x, incx, beta, y, incy);
-}
-
-cublasStatus_t CUBLASWINAPI cublasZhemv_v2(
-    cublasHandle_t handle, cublasFillMode_t uplo, int n,
-    const cuDoubleComplex *alpha, /* host or device pointer */
-    const cuDoubleComplex *A, int lda, const cuDoubleComplex *x, int incx,
-    const cuDoubleComplex *beta, /* host or device pointer */
-    cuDoubleComplex *y, int incy) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, int, const cuDoubleComplex *,
-      const cuDoubleComplex *, int, const cuDoubleComplex *, int,
-      const cuDoubleComplex *, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZhemv_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, alpha, A, lda, x, incx, beta, y, incy);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasSsbmv_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n, int k,
-               const float *alpha, /* host or device pointer */
-               const float *A, int lda, const float *x, int incx,
-               const float *beta, /* host or device pointer */
-               float *y, int incy) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, int, int, const float *, const float *,
-      int, const float *, int, const float *, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSsbmv_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, k, alpha, A, lda, x, incx, beta, y, incy);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasDsbmv_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n, int k,
-               const double *alpha, /* host or device pointer */
-               const double *A, int lda, const double *x, int incx,
-               const double *beta, /* host or device pointer */
-               double *y, int incy) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, int, int, const double *,
-      const double *, int, const double *, int, const double *, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDsbmv_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, k, alpha, A, lda, x, incx, beta, y, incy);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasChbmv_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n, int k,
-               const cuComplex *alpha, /* host or device pointer */
-               const cuComplex *A, int lda, const cuComplex *x, int incx,
-               const cuComplex *beta, /* host or device pointer */
-               cuComplex *y, int incy) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, int, int, const cuComplex *,
-      const cuComplex *, int, const cuComplex *, int, const cuComplex *,
-      cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasChbmv_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, k, alpha, A, lda, x, incx, beta, y, incy);
-}
-
-cublasStatus_t CUBLASWINAPI cublasZhbmv_v2(
-    cublasHandle_t handle, cublasFillMode_t uplo, int n, int k,
-    const cuDoubleComplex *alpha, /* host or device pointer */
-    const cuDoubleComplex *A, int lda, const cuDoubleComplex *x, int incx,
-    const cuDoubleComplex *beta, /* host or device pointer */
-    cuDoubleComplex *y, int incy) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, int, int, const cuDoubleComplex *,
-      const cuDoubleComplex *, int, const cuDoubleComplex *, int,
-      const cuDoubleComplex *, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZhbmv_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, k, alpha, A, lda, x, incx, beta, y, incy);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasSspmv_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n,
-               const float *alpha, /* host or device pointer */
-               const float *AP, const float *x, int incx,
-               const float *beta, /* host or device pointer */
-               float *y, int incy) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, int, const float *, const float *,
-      const float *, int, const float *, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSspmv_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, alpha, AP, x, incx, beta, y, incy);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasDspmv_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n,
-               const double *alpha, /* host or device pointer */
-               const double *AP, const double *x, int incx,
-               const double *beta, /* host or device pointer */
-               double *y, int incy) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, int, const double *, const double *,
-      const double *, int, const double *, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDspmv_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, alpha, AP, x, incx, beta, y, incy);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasChpmv_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n,
-               const cuComplex *alpha, /* host or device pointer */
-               const cuComplex *AP, const cuComplex *x, int incx,
-               const cuComplex *beta, /* host or device pointer */
-               cuComplex *y, int incy) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, int, const cuComplex *,
-      const cuComplex *, const cuComplex *, int, const cuComplex *, cuComplex *,
-      int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasChpmv_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, alpha, AP, x, incx, beta, y, incy);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasZhpmv_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n,
-               const cuDoubleComplex *alpha, /* host or device pointer */
-               const cuDoubleComplex *AP, const cuDoubleComplex *x, int incx,
-               const cuDoubleComplex *beta, /* host or device pointer */
-               cuDoubleComplex *y, int incy) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, int, const cuDoubleComplex *,
-      const cuDoubleComplex *, const cuDoubleComplex *, int,
-      const cuDoubleComplex *, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZhpmv_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, alpha, AP, x, incx, beta, y, incy);
-}
-
-cublasStatus_t CUBLASWINAPI cublasSger_v2(
-    cublasHandle_t handle, int m, int n,
-    const float *alpha, /* host or device pointer */
-    const float *x, int incx, const float *y, int incy, float *A, int lda) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, int, const float *, const float *, int,
-      const float *, int, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSger_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, alpha, x, incx, y, incy, A, lda);
-}
-
-cublasStatus_t CUBLASWINAPI cublasDger_v2(
-    cublasHandle_t handle, int m, int n,
-    const double *alpha, /* host or device pointer */
-    const double *x, int incx, const double *y, int incy, double *A, int lda) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, int, const double *, const double *, int,
-      const double *, int, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDger_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, alpha, x, incx, y, incy, A, lda);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasCgeru_v2(cublasHandle_t handle, int m, int n,
-               const cuComplex *alpha, /* host or device pointer */
-               const cuComplex *x, int incx, const cuComplex *y, int incy,
-               cuComplex *A, int lda) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, int, const cuComplex *, const cuComplex *, int,
-      const cuComplex *, int, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCgeru_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, alpha, x, incx, y, incy, A, lda);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasCgerc_v2(cublasHandle_t handle, int m, int n,
-               const cuComplex *alpha, /* host or device pointer */
-               const cuComplex *x, int incx, const cuComplex *y, int incy,
-               cuComplex *A, int lda) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, int, const cuComplex *, const cuComplex *, int,
-      const cuComplex *, int, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCgerc_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, alpha, x, incx, y, incy, A, lda);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasZgeru_v2(cublasHandle_t handle, int m, int n,
-               const cuDoubleComplex *alpha, /* host or device pointer */
-               const cuDoubleComplex *x, int incx, const cuDoubleComplex *y,
-               int incy, cuDoubleComplex *A, int lda) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, int, const cuDoubleComplex *,
-      const cuDoubleComplex *, int, const cuDoubleComplex *, int,
-      cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZgeru_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, alpha, x, incx, y, incy, A, lda);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasZgerc_v2(cublasHandle_t handle, int m, int n,
-               const cuDoubleComplex *alpha, /* host or device pointer */
-               const cuDoubleComplex *x, int incx, const cuDoubleComplex *y,
-               int incy, cuDoubleComplex *A, int lda) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, int, const cuDoubleComplex *,
-      const cuDoubleComplex *, int, const cuDoubleComplex *, int,
-      cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZgerc_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, alpha, x, incx, y, incy, A, lda);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasSsyr_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n,
-              const float *alpha, /* host or device pointer */
-              const float *x, int incx, float *A, int lda) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, int, const float *, const float *, int,
-      float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSsyr_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, alpha, x, incx, A, lda);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasDsyr_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n,
-              const double *alpha, /* host or device pointer */
-              const double *x, int incx, double *A, int lda) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, int, const double *, const double *,
-      int, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDsyr_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, alpha, x, incx, A, lda);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasCsyr_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n,
-              const cuComplex *alpha, /* host or device pointer */
-              const cuComplex *x, int incx, cuComplex *A, int lda) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, int, const cuComplex *,
-      const cuComplex *, int, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCsyr_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, alpha, x, incx, A, lda);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasZsyr_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n,
-              const cuDoubleComplex *alpha, /* host or device pointer */
-              const cuDoubleComplex *x, int incx, cuDoubleComplex *A, int lda) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, int, const cuDoubleComplex *,
-      const cuDoubleComplex *, int, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZsyr_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, alpha, x, incx, A, lda);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasCher_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n,
-              const float *alpha, /* host or device pointer */
-              const cuComplex *x, int incx, cuComplex *A, int lda) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, int, const float *, const cuComplex *,
-      int, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCher_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, alpha, x, incx, A, lda);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasZher_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n,
-              const double *alpha, /* host or device pointer */
-              const cuDoubleComplex *x, int incx, cuDoubleComplex *A, int lda) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, int, const double *,
-      const cuDoubleComplex *, int, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZher_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, alpha, x, incx, A, lda);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasSspr_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n,
-              const float *alpha, /* host or device pointer */
-              const float *x, int incx, float *AP) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, int, const float *, const float *, int,
-      float *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSspr_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, alpha, x, incx, AP);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasDspr_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n,
-              const double *alpha, /* host or device pointer */
-              const double *x, int incx, double *AP) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, int, const double *, const double *,
-      int, double *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDspr_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, alpha, x, incx, AP);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasChpr_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n,
-              const float *alpha, /* host or device pointer */
-              const cuComplex *x, int incx, cuComplex *AP) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, int, const float *, const cuComplex *,
-      int, cuComplex *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasChpr_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, alpha, x, incx, AP);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasZhpr_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n,
-              const double *alpha, /* host or device pointer */
-              const cuDoubleComplex *x, int incx, cuDoubleComplex *AP) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, int, const double *,
-      const cuDoubleComplex *, int, cuDoubleComplex *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZhpr_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, alpha, x, incx, AP);
-}
-
-cublasStatus_t CUBLASWINAPI cublasSsyr2_v2(
-    cublasHandle_t handle, cublasFillMode_t uplo, int n,
-    const float *alpha, /* host or device pointer */
-    const float *x, int incx, const float *y, int incy, float *A, int lda) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, int, const float *, const float *, int,
-      const float *, int, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSsyr2_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, alpha, x, incx, y, incy, A, lda);
-}
-
-cublasStatus_t CUBLASWINAPI cublasDsyr2_v2(
-    cublasHandle_t handle, cublasFillMode_t uplo, int n,
-    const double *alpha, /* host or device pointer */
-    const double *x, int incx, const double *y, int incy, double *A, int lda) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, int, const double *, const double *,
-      int, const double *, int, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDsyr2_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, alpha, x, incx, y, incy, A, lda);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasCsyr2_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n,
-               const cuComplex *alpha, /* host or device pointer */
-               const cuComplex *x, int incx, const cuComplex *y, int incy,
-               cuComplex *A, int lda) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, int, const cuComplex *,
-      const cuComplex *, int, const cuComplex *, int, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCsyr2_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, alpha, x, incx, y, incy, A, lda);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasZsyr2_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n,
-               const cuDoubleComplex *alpha, /* host or device pointer */
-               const cuDoubleComplex *x, int incx, const cuDoubleComplex *y,
-               int incy, cuDoubleComplex *A, int lda) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, int, const cuDoubleComplex *,
-      const cuDoubleComplex *, int, const cuDoubleComplex *, int,
-      cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZsyr2_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, alpha, x, incx, y, incy, A, lda);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasCher2_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n,
-               const cuComplex *alpha, /* host or device pointer */
-               const cuComplex *x, int incx, const cuComplex *y, int incy,
-               cuComplex *A, int lda) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, int, const cuComplex *,
-      const cuComplex *, int, const cuComplex *, int, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCher2_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, alpha, x, incx, y, incy, A, lda);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasZher2_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n,
-               const cuDoubleComplex *alpha, /* host or device pointer */
-               const cuDoubleComplex *x, int incx, const cuDoubleComplex *y,
-               int incy, cuDoubleComplex *A, int lda) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, int, const cuDoubleComplex *,
-      const cuDoubleComplex *, int, const cuDoubleComplex *, int,
-      cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZher2_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, alpha, x, incx, y, incy, A, lda);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasSspr2_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n,
-               const float *alpha, /* host or device pointer */
-               const float *x, int incx, const float *y, int incy, float *AP) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, int, const float *, const float *, int,
-      const float *, int, float *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSspr2_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, alpha, x, incx, y, incy, AP);
-}
-
-cublasStatus_t CUBLASWINAPI cublasDspr2_v2(
-    cublasHandle_t handle, cublasFillMode_t uplo, int n,
-    const double *alpha, /* host or device pointer */
-    const double *x, int incx, const double *y, int incy, double *AP) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, int, const double *, const double *,
-      int, const double *, int, double *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDspr2_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, alpha, x, incx, y, incy, AP);
-}
-
-cublasStatus_t CUBLASWINAPI cublasChpr2_v2(
-    cublasHandle_t handle, cublasFillMode_t uplo, int n,
-    const cuComplex *alpha, /* host or device pointer */
-    const cuComplex *x, int incx, const cuComplex *y, int incy, cuComplex *AP) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, int, const cuComplex *,
-      const cuComplex *, int, const cuComplex *, int, cuComplex *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasChpr2_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, alpha, x, incx, y, incy, AP);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasZhpr2_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n,
-               const cuDoubleComplex *alpha, /* host or device pointer */
-               const cuDoubleComplex *x, int incx, const cuDoubleComplex *y,
-               int incy, cuDoubleComplex *AP) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, int, const cuDoubleComplex *,
-      const cuDoubleComplex *, int, const cuDoubleComplex *, int,
-      cuDoubleComplex *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZhpr2_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, alpha, x, incx, y, incy, AP);
-}
-
-cublasStatus_t CUBLASWINAPI cublasSgemm_v2(
-    cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
-    int m, int n, int k, const float *alpha, /* host or device pointer */
-    const float *A, int lda, const float *B, int ldb,
-    const float *beta, /* host or device pointer */
-    float *C, int ldc) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int,
-      const float *, const float *, int, const float *, int, const float *,
-      float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSgemm_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta,
-                  C, ldc);
-}
-
-cublasStatus_t CUBLASWINAPI cublasDgemm_v2(
-    cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
-    int m, int n, int k, const double *alpha, /* host or device pointer */
-    const double *A, int lda, const double *B, int ldb,
-    const double *beta, /* host or device pointer */
-    double *C, int ldc) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int,
-      const double *, const double *, int, const double *, int, const double *,
-      double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDgemm_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta,
-                  C, ldc);
-}
-
-cublasStatus_t CUBLASWINAPI cublasCgemm_v2(
-    cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
-    int m, int n, int k, const cuComplex *alpha, /* host or device pointer */
-    const cuComplex *A, int lda, const cuComplex *B, int ldb,
-    const cuComplex *beta, /* host or device pointer */
-    cuComplex *C, int ldc) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int,
-      const cuComplex *, const cuComplex *, int, const cuComplex *, int,
-      const cuComplex *, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCgemm_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta,
-                  C, ldc);
-}
-
-cublasStatus_t CUBLASWINAPI cublasCgemm3m(
-    cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
-    int m, int n, int k, const cuComplex *alpha, /* host or device pointer */
-    const cuComplex *A, int lda, const cuComplex *B, int ldb,
-    const cuComplex *beta, /* host or device pointer */
-    cuComplex *C, int ldc) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int,
-      const cuComplex *, const cuComplex *, int, const cuComplex *, int,
-      const cuComplex *, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCgemm3m");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta,
-                  C, ldc);
-}
-
-cublasStatus_t CUBLASWINAPI cublasCgemm3mEx(
-    cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
-    int m, int n, int k, const cuComplex *alpha, const void *A,
-    cudaDataType Atype, int lda, const void *B, cudaDataType Btype, int ldb,
-    const cuComplex *beta, void *C, cudaDataType Ctype, int ldc) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int,
-      const cuComplex *, const void *, cudaDataType, int, const void *,
-      cudaDataType, int, const cuComplex *, void *, cudaDataType, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCgemm3mEx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transa, transb, m, n, k, alpha, A, Atype, lda, B,
-                  Btype, ldb, beta, C, Ctype, ldc);
-}
-
-cublasStatus_t CUBLASWINAPI cublasZgemm_v2(
-    cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
-    int m, int n, int k,
-    const cuDoubleComplex *alpha, /* host or device pointer */
-    const cuDoubleComplex *A, int lda, const cuDoubleComplex *B, int ldb,
-    const cuDoubleComplex *beta, /* host or device pointer */
-    cuDoubleComplex *C, int ldc) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int,
-      const cuDoubleComplex *, const cuDoubleComplex *, int,
-      const cuDoubleComplex *, int, const cuDoubleComplex *, cuDoubleComplex *,
-      int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZgemm_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta,
-                  C, ldc);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasZgemm3m(cublasHandle_t handle, cublasOperation_t transa,
-              cublasOperation_t transb, int m, int n, int k,
-              const cuDoubleComplex *alpha, /* host or device pointer */
-              const cuDoubleComplex *A, int lda, const cuDoubleComplex *B,
-              int ldb, const cuDoubleComplex *beta, /* host or device pointer */
-              cuDoubleComplex *C, int ldc) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int,
-      const cuDoubleComplex *, const cuDoubleComplex *, int,
-      const cuDoubleComplex *, int, const cuDoubleComplex *, cuDoubleComplex *,
-      int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZgemm3m");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta,
-                  C, ldc);
-}
-
-cublasStatus_t CUBLASWINAPI cublasSgemmEx(
-    cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
-    int m, int n, int k, const float *alpha, /* host or device pointer */
-    const void *A, cudaDataType Atype, int lda, const void *B,
-    cudaDataType Btype, int ldb, const float *beta, /* host or device pointer */
-    void *C, cudaDataType Ctype, int ldc) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int,
-      const float *, const void *, cudaDataType, int, const void *,
-      cudaDataType, int, const float *, void *, cudaDataType, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSgemmEx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transa, transb, m, n, k, alpha, A, Atype, lda, B,
-                  Btype, ldb, beta, C, Ctype, ldc);
-}
-
-cublasStatus_t CUBLASWINAPI cublasGemmEx(
-    cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
-    int m, int n, int k, const void *alpha, /* host or device pointer */
-    const void *A, cudaDataType Atype, int lda, const void *B,
-    cudaDataType Btype, int ldb, const void *beta, /* host or device pointer */
-    void *C, cudaDataType Ctype, int ldc, cudaDataType computeType,
-    cublasGemmAlgo_t algo) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int,
-      const void *, const void *, cudaDataType, int, const void *, cudaDataType,
-      int, const void *, void *, cudaDataType, int, cudaDataType,
-      cublasGemmAlgo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasGemmEx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transa, transb, m, n, k, alpha, A, Atype, lda, B,
-                  Btype, ldb, beta, C, Ctype, ldc, computeType, algo);
-}
-
-cublasStatus_t CUBLASWINAPI cublasCgemmEx(
-    cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
-    int m, int n, int k, const cuComplex *alpha, const void *A,
-    cudaDataType Atype, int lda, const void *B, cudaDataType Btype, int ldb,
-    const cuComplex *beta, void *C, cudaDataType Ctype, int ldc) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int,
-      const cuComplex *, const void *, cudaDataType, int, const void *,
-      cudaDataType, int, const cuComplex *, void *, cudaDataType, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCgemmEx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transa, transb, m, n, k, alpha, A, Atype, lda, B,
-                  Btype, ldb, beta, C, Ctype, ldc);
-}
-
-cublasStatus_t CUBLASWINAPI cublasUint8gemmBias(
-    cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
-    cublasOperation_t transc, int m, int n, int k, const unsigned char *A,
-    int A_bias, int lda, const unsigned char *B, int B_bias, int ldb,
-    unsigned char *C, int C_bias, int ldc, int C_mult, int C_shift) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasOperation_t, cublasOperation_t, cublasOperation_t,
-      int, int, int, const unsigned char *, int, int, const unsigned char *,
-      int, int, unsigned char *, int, int, int, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasUint8gemmBias");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transa, transb, transc, m, n, k, A, A_bias, lda, B,
-                  B_bias, ldb, C, C_bias, ldc, C_mult, C_shift);
-}
-
-cublasStatus_t CUBLASWINAPI cublasSsyrk_v2(
-    cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans,
-    int n, int k, const float *alpha,           /* host or device pointer */
-    const float *A, int lda, const float *beta, /* host or device pointer */
-    float *C, int ldc) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int,
-      const float *, const float *, int, const float *, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSsyrk_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, trans, n, k, alpha, A, lda, beta, C, ldc);
-}
-
-cublasStatus_t CUBLASWINAPI cublasDsyrk_v2(
-    cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans,
-    int n, int k, const double *alpha,            /* host or device pointer */
-    const double *A, int lda, const double *beta, /* host or device pointer */
-    double *C, int ldc) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int,
-      const double *, const double *, int, const double *, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDsyrk_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, trans, n, k, alpha, A, lda, beta, C, ldc);
-}
-
-cublasStatus_t CUBLASWINAPI cublasCsyrk_v2(
-    cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans,
-    int n, int k, const cuComplex *alpha, /* host or device pointer */
-    const cuComplex *A, int lda,
-    const cuComplex *beta, /* host or device pointer */
-    cuComplex *C, int ldc) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int,
-      const cuComplex *, const cuComplex *, int, const cuComplex *, cuComplex *,
-      int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCsyrk_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, trans, n, k, alpha, A, lda, beta, C, ldc);
-}
-
-cublasStatus_t CUBLASWINAPI cublasZsyrk_v2(
-    cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans,
-    int n, int k, const cuDoubleComplex *alpha, /* host or device pointer */
-    const cuDoubleComplex *A, int lda,
-    const cuDoubleComplex *beta, /* host or device pointer */
-    cuDoubleComplex *C, int ldc) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int,
-      const cuDoubleComplex *, const cuDoubleComplex *, int,
-      const cuDoubleComplex *, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZsyrk_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, trans, n, k, alpha, A, lda, beta, C, ldc);
-}
-
-cublasStatus_t CUBLASWINAPI cublasCsyrkEx(
-    cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans,
-    int n, int k, const cuComplex *alpha, /* host or device pointer */
-    const void *A, cudaDataType Atype, int lda,
-    const cuComplex *beta, /* host or device pointer */
-    void *C, cudaDataType Ctype, int ldc) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int,
-      const cuComplex *, const void *, cudaDataType, int, const cuComplex *,
-      void *, cudaDataType, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCsyrkEx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, trans, n, k, alpha, A, Atype, lda, beta, C,
-                  Ctype, ldc);
-}
-
-cublasStatus_t CUBLASWINAPI cublasCsyrk3mEx(
-    cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans,
-    int n, int k, const cuComplex *alpha, const void *A, cudaDataType Atype,
-    int lda, const cuComplex *beta, void *C, cudaDataType Ctype, int ldc) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int,
-      const cuComplex *, const void *, cudaDataType, int, const cuComplex *,
-      void *, cudaDataType, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCsyrk3mEx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, trans, n, k, alpha, A, Atype, lda, beta, C,
-                  Ctype, ldc);
-}
-
-cublasStatus_t CUBLASWINAPI cublasCherk_v2(
-    cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans,
-    int n, int k, const float *alpha,               /* host or device pointer */
-    const cuComplex *A, int lda, const float *beta, /* host or device pointer */
-    cuComplex *C, int ldc) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int,
-      const float *, const cuComplex *, int, const float *, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCherk_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, trans, n, k, alpha, A, lda, beta, C, ldc);
-}
-
-cublasStatus_t CUBLASWINAPI cublasZherk_v2(
-    cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans,
-    int n, int k, const double *alpha, /* host or device pointer */
-    const cuDoubleComplex *A, int lda,
-    const double *beta, /* host or device pointer */
-    cuDoubleComplex *C, int ldc) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int,
-      const double *, const cuDoubleComplex *, int, const double *,
-      cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZherk_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, trans, n, k, alpha, A, lda, beta, C, ldc);
-}
-
-cublasStatus_t CUBLASWINAPI cublasCherkEx(
-    cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans,
-    int n, int k, const float *alpha, /* host or device pointer */
-    const void *A, cudaDataType Atype, int lda,
-    const float *beta, /* host or device pointer */
-    void *C, cudaDataType Ctype, int ldc) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int,
-      const float *, const void *, cudaDataType, int, const float *, void *,
-      cudaDataType, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCherkEx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, trans, n, k, alpha, A, Atype, lda, beta, C,
-                  Ctype, ldc);
-}
-
-cublasStatus_t CUBLASWINAPI cublasCherk3mEx(
-    cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans,
-    int n, int k, const float *alpha, const void *A, cudaDataType Atype,
-    int lda, const float *beta, void *C, cudaDataType Ctype, int ldc) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int,
-      const float *, const void *, cudaDataType, int, const float *, void *,
-      cudaDataType, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCherk3mEx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, trans, n, k, alpha, A, Atype, lda, beta, C,
-                  Ctype, ldc);
-}
-
-cublasStatus_t CUBLASWINAPI cublasSsyr2k_v2(
-    cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans,
-    int n, int k, const float *alpha, /* host or device pointer */
-    const float *A, int lda, const float *B, int ldb,
-    const float *beta, /* host or device pointer */
-    float *C, int ldc) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int,
-      const float *, const float *, int, const float *, int, const float *,
-      float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSsyr2k_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C,
-                  ldc);
-}
-
-cublasStatus_t CUBLASWINAPI cublasDsyr2k_v2(
-    cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans,
-    int n, int k, const double *alpha, /* host or device pointer */
-    const double *A, int lda, const double *B, int ldb,
-    const double *beta, /* host or device pointer */
-    double *C, int ldc) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int,
-      const double *, const double *, int, const double *, int, const double *,
-      double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDsyr2k_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C,
-                  ldc);
-}
-
-cublasStatus_t CUBLASWINAPI cublasCsyr2k_v2(
-    cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans,
-    int n, int k, const cuComplex *alpha, /* host or device pointer */
-    const cuComplex *A, int lda, const cuComplex *B, int ldb,
-    const cuComplex *beta, /* host or device pointer */
-    cuComplex *C, int ldc) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int,
-      const cuComplex *, const cuComplex *, int, const cuComplex *, int,
-      const cuComplex *, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCsyr2k_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C,
-                  ldc);
-}
-
-cublasStatus_t CUBLASWINAPI cublasZsyr2k_v2(
-    cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans,
-    int n, int k, const cuDoubleComplex *alpha, /* host or device pointer */
-    const cuDoubleComplex *A, int lda, const cuDoubleComplex *B, int ldb,
-    const cuDoubleComplex *beta, /* host or device pointer */
-    cuDoubleComplex *C, int ldc) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int,
-      const cuDoubleComplex *, const cuDoubleComplex *, int,
-      const cuDoubleComplex *, int, const cuDoubleComplex *, cuDoubleComplex *,
-      int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZsyr2k_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C,
-                  ldc);
-}
-
-cublasStatus_t CUBLASWINAPI cublasCher2k_v2(
-    cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans,
-    int n, int k, const cuComplex *alpha, /* host or device pointer */
-    const cuComplex *A, int lda, const cuComplex *B, int ldb,
-    const float *beta, /* host or device pointer */
-    cuComplex *C, int ldc) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int,
-      const cuComplex *, const cuComplex *, int, const cuComplex *, int,
-      const float *, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCher2k_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C,
-                  ldc);
-}
-
-cublasStatus_t CUBLASWINAPI cublasZher2k_v2(
-    cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans,
-    int n, int k, const cuDoubleComplex *alpha, /* host or device pointer */
-    const cuDoubleComplex *A, int lda, const cuDoubleComplex *B, int ldb,
-    const double *beta, /* host or device pointer */
-    cuDoubleComplex *C, int ldc) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int,
-      const cuDoubleComplex *, const cuDoubleComplex *, int,
-      const cuDoubleComplex *, int, const double *, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZher2k_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C,
-                  ldc);
-}
-
-cublasStatus_t CUBLASWINAPI cublasSsyrkx(
-    cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans,
-    int n, int k, const float *alpha, /* host or device pointer */
-    const float *A, int lda, const float *B, int ldb,
-    const float *beta, /* host or device pointer */
-    float *C, int ldc) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int,
-      const float *, const float *, int, const float *, int, const float *,
-      float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSsyrkx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C,
-                  ldc);
-}
-
-cublasStatus_t CUBLASWINAPI cublasDsyrkx(
-    cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans,
-    int n, int k, const double *alpha, /* host or device pointer */
-    const double *A, int lda, const double *B, int ldb,
-    const double *beta, /* host or device pointer */
-    double *C, int ldc) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int,
-      const double *, const double *, int, const double *, int, const double *,
-      double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDsyrkx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C,
-                  ldc);
-}
-
-cublasStatus_t CUBLASWINAPI cublasCsyrkx(
-    cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans,
-    int n, int k, const cuComplex *alpha, /* host or device pointer */
-    const cuComplex *A, int lda, const cuComplex *B, int ldb,
-    const cuComplex *beta, /* host or device pointer */
-    cuComplex *C, int ldc) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int,
-      const cuComplex *, const cuComplex *, int, const cuComplex *, int,
-      const cuComplex *, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCsyrkx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C,
-                  ldc);
-}
-
-cublasStatus_t CUBLASWINAPI cublasZsyrkx(
-    cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans,
-    int n, int k, const cuDoubleComplex *alpha, /* host or device pointer */
-    const cuDoubleComplex *A, int lda, const cuDoubleComplex *B, int ldb,
-    const cuDoubleComplex *beta, /* host or device pointer */
-    cuDoubleComplex *C, int ldc) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int,
-      const cuDoubleComplex *, const cuDoubleComplex *, int,
-      const cuDoubleComplex *, int, const cuDoubleComplex *, cuDoubleComplex *,
-      int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZsyrkx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C,
-                  ldc);
-}
-
-cublasStatus_t CUBLASWINAPI cublasCherkx(
-    cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans,
-    int n, int k, const cuComplex *alpha, /* host or device pointer */
-    const cuComplex *A, int lda, const cuComplex *B, int ldb,
-    const float *beta, /* host or device pointer */
-    cuComplex *C, int ldc) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int,
-      const cuComplex *, const cuComplex *, int, const cuComplex *, int,
-      const float *, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCherkx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C,
-                  ldc);
-}
-
-cublasStatus_t CUBLASWINAPI cublasZherkx(
-    cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans,
-    int n, int k, const cuDoubleComplex *alpha, /* host or device pointer */
-    const cuDoubleComplex *A, int lda, const cuDoubleComplex *B, int ldb,
-    const double *beta, /* host or device pointer */
-    cuDoubleComplex *C, int ldc) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int,
-      const cuDoubleComplex *, const cuDoubleComplex *, int,
-      const cuDoubleComplex *, int, const double *, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZherkx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C,
-                  ldc);
-}
-
-cublasStatus_t CUBLASWINAPI cublasSsymm_v2(
-    cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, int m,
-    int n, const float *alpha, /* host or device pointer */
-    const float *A, int lda, const float *B, int ldb,
-    const float *beta, /* host or device pointer */
-    float *C, int ldc) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasSideMode_t, cublasFillMode_t, int, int,
-      const float *, const float *, int, const float *, int, const float *,
-      float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSsymm_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, side, uplo, m, n, alpha, A, lda, B, ldb, beta, C,
-                  ldc);
-}
-
-cublasStatus_t CUBLASWINAPI cublasDsymm_v2(
-    cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, int m,
-    int n, const double *alpha, /* host or device pointer */
-    const double *A, int lda, const double *B, int ldb,
-    const double *beta, /* host or device pointer */
-    double *C, int ldc) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasSideMode_t, cublasFillMode_t, int, int,
-      const double *, const double *, int, const double *, int, const double *,
-      double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDsymm_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, side, uplo, m, n, alpha, A, lda, B, ldb, beta, C,
-                  ldc);
-}
-
-cublasStatus_t CUBLASWINAPI cublasCsymm_v2(
-    cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, int m,
-    int n, const cuComplex *alpha, /* host or device pointer */
-    const cuComplex *A, int lda, const cuComplex *B, int ldb,
-    const cuComplex *beta, /* host or device pointer */
-    cuComplex *C, int ldc) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasSideMode_t, cublasFillMode_t, int, int,
-      const cuComplex *, const cuComplex *, int, const cuComplex *, int,
-      const cuComplex *, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCsymm_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, side, uplo, m, n, alpha, A, lda, B, ldb, beta, C,
-                  ldc);
-}
-
-cublasStatus_t CUBLASWINAPI cublasZsymm_v2(
-    cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, int m,
-    int n, const cuDoubleComplex *alpha, /* host or device pointer */
-    const cuDoubleComplex *A, int lda, const cuDoubleComplex *B, int ldb,
-    const cuDoubleComplex *beta, /* host or device pointer */
-    cuDoubleComplex *C, int ldc) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasSideMode_t, cublasFillMode_t, int, int,
-      const cuDoubleComplex *, const cuDoubleComplex *, int,
-      const cuDoubleComplex *, int, const cuDoubleComplex *, cuDoubleComplex *,
-      int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZsymm_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, side, uplo, m, n, alpha, A, lda, B, ldb, beta, C,
-                  ldc);
-}
-
-cublasStatus_t CUBLASWINAPI cublasChemm_v2(
-    cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, int m,
-    int n, const cuComplex *alpha, /* host or device pointer */
-    const cuComplex *A, int lda, const cuComplex *B, int ldb,
-    const cuComplex *beta, /* host or device pointer */
-    cuComplex *C, int ldc) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasSideMode_t, cublasFillMode_t, int, int,
-      const cuComplex *, const cuComplex *, int, const cuComplex *, int,
-      const cuComplex *, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasChemm_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, side, uplo, m, n, alpha, A, lda, B, ldb, beta, C,
-                  ldc);
-}
-
-cublasStatus_t CUBLASWINAPI cublasZhemm_v2(
-    cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, int m,
-    int n, const cuDoubleComplex *alpha, /* host or device pointer */
-    const cuDoubleComplex *A, int lda, const cuDoubleComplex *B, int ldb,
-    const cuDoubleComplex *beta, /* host or device pointer */
-    cuDoubleComplex *C, int ldc) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasSideMode_t, cublasFillMode_t, int, int,
-      const cuDoubleComplex *, const cuDoubleComplex *, int,
-      const cuDoubleComplex *, int, const cuDoubleComplex *, cuDoubleComplex *,
-      int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZhemm_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, side, uplo, m, n, alpha, A, lda, B, ldb, beta, C,
-                  ldc);
-}
-
-cublasStatus_t CUBLASWINAPI cublasStrsm_v2(
-    cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo,
-    cublasOperation_t trans, cublasDiagType_t diag, int m, int n,
-    const float *alpha, /* host or device pointer */
-    const float *A, int lda, float *B, int ldb) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasSideMode_t, cublasFillMode_t, cublasOperation_t,
-      cublasDiagType_t, int, int, const float *, const float *, int, float *,
-      int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasStrsm_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb);
-}
-
-cublasStatus_t CUBLASWINAPI cublasDtrsm_v2(
-    cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo,
-    cublasOperation_t trans, cublasDiagType_t diag, int m, int n,
-    const double *alpha, /* host or device pointer */
-    const double *A, int lda, double *B, int ldb) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasSideMode_t, cublasFillMode_t, cublasOperation_t,
-      cublasDiagType_t, int, int, const double *, const double *, int, double *,
-      int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDtrsm_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb);
-}
-
-cublasStatus_t CUBLASWINAPI cublasCtrsm_v2(
-    cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo,
-    cublasOperation_t trans, cublasDiagType_t diag, int m, int n,
-    const cuComplex *alpha, /* host or device pointer */
-    const cuComplex *A, int lda, cuComplex *B, int ldb) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasSideMode_t, cublasFillMode_t, cublasOperation_t,
-      cublasDiagType_t, int, int, const cuComplex *, const cuComplex *, int,
-      cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCtrsm_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb);
-}
-
-cublasStatus_t CUBLASWINAPI cublasZtrsm_v2(
-    cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo,
-    cublasOperation_t trans, cublasDiagType_t diag, int m, int n,
-    const cuDoubleComplex *alpha, /* host or device pointer */
-    const cuDoubleComplex *A, int lda, cuDoubleComplex *B, int ldb) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasSideMode_t, cublasFillMode_t, cublasOperation_t,
-      cublasDiagType_t, int, int, const cuDoubleComplex *,
-      const cuDoubleComplex *, int, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZtrsm_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb);
-}
-
-cublasStatus_t CUBLASWINAPI cublasStrmm_v2(
-    cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo,
-    cublasOperation_t trans, cublasDiagType_t diag, int m, int n,
-    const float *alpha, /* host or device pointer */
-    const float *A, int lda, const float *B, int ldb, float *C, int ldc) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasSideMode_t, cublasFillMode_t, cublasOperation_t,
-      cublasDiagType_t, int, int, const float *, const float *, int,
-      const float *, int, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasStrmm_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb,
-                  C, ldc);
-}
-
-cublasStatus_t CUBLASWINAPI cublasDtrmm_v2(
-    cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo,
-    cublasOperation_t trans, cublasDiagType_t diag, int m, int n,
-    const double *alpha, /* host or device pointer */
-    const double *A, int lda, const double *B, int ldb, double *C, int ldc) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasSideMode_t, cublasFillMode_t, cublasOperation_t,
-      cublasDiagType_t, int, int, const double *, const double *, int,
-      const double *, int, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDtrmm_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb,
-                  C, ldc);
-}
-
-cublasStatus_t CUBLASWINAPI cublasCtrmm_v2(
-    cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo,
-    cublasOperation_t trans, cublasDiagType_t diag, int m, int n,
-    const cuComplex *alpha, /* host or device pointer */
-    const cuComplex *A, int lda, const cuComplex *B, int ldb, cuComplex *C,
-    int ldc) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasSideMode_t, cublasFillMode_t, cublasOperation_t,
-      cublasDiagType_t, int, int, const cuComplex *, const cuComplex *, int,
-      const cuComplex *, int, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCtrmm_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb,
-                  C, ldc);
-}
-
-cublasStatus_t CUBLASWINAPI cublasZtrmm_v2(
-    cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo,
-    cublasOperation_t trans, cublasDiagType_t diag, int m, int n,
-    const cuDoubleComplex *alpha, /* host or device pointer */
-    const cuDoubleComplex *A, int lda, const cuDoubleComplex *B, int ldb,
-    cuDoubleComplex *C, int ldc) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasSideMode_t, cublasFillMode_t, cublasOperation_t,
-      cublasDiagType_t, int, int, const cuDoubleComplex *,
-      const cuDoubleComplex *, int, const cuDoubleComplex *, int,
-      cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZtrmm_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb,
-                  C, ldc);
-}
-
-cublasStatus_t CUBLASWINAPI cublasSgemmBatched(
-    cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
-    int m, int n, int k, const float *alpha, /* host or device pointer */
-    const float *const Aarray[], int lda, const float *const Barray[], int ldb,
-    const float *beta, /* host or device pointer */
-    float *const Carray[], int ldc, int batchCount) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int,
-      const float *, const float *const[], int, const float *const[], int,
-      const float *, float *const[], int, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSgemmBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transa, transb, m, n, k, alpha, Aarray, lda, Barray,
-                  ldb, beta, Carray, ldc, batchCount);
-}
-
-cublasStatus_t CUBLASWINAPI cublasDgemmBatched(
-    cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
-    int m, int n, int k, const double *alpha, /* host or device pointer */
-    const double *const Aarray[], int lda, const double *const Barray[],
-    int ldb, const double *beta, /* host or device pointer */
-    double *const Carray[], int ldc, int batchCount) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int,
-      const double *, const double *const[], int, const double *const[], int,
-      const double *, double *const[], int, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDgemmBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transa, transb, m, n, k, alpha, Aarray, lda, Barray,
-                  ldb, beta, Carray, ldc, batchCount);
-}
-
-cublasStatus_t CUBLASWINAPI cublasCgemmBatched(
-    cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
-    int m, int n, int k, const cuComplex *alpha, /* host or device pointer */
-    const cuComplex *const Aarray[], int lda, const cuComplex *const Barray[],
-    int ldb, const cuComplex *beta, /* host or device pointer */
-    cuComplex *const Carray[], int ldc, int batchCount) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int,
-      const cuComplex *, const cuComplex *const[], int,
-      const cuComplex *const[], int, const cuComplex *, cuComplex *const[], int,
-      int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCgemmBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transa, transb, m, n, k, alpha, Aarray, lda, Barray,
-                  ldb, beta, Carray, ldc, batchCount);
-}
-
-cublasStatus_t CUBLASWINAPI cublasCgemm3mBatched(
-    cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
-    int m, int n, int k, const cuComplex *alpha, /* host or device pointer */
-    const cuComplex *const Aarray[], int lda, const cuComplex *const Barray[],
-    int ldb, const cuComplex *beta, /* host or device pointer */
-    cuComplex *const Carray[], int ldc, int batchCount) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int,
-      const cuComplex *, const cuComplex *const[], int,
-      const cuComplex *const[], int, const cuComplex *, cuComplex *const[], int,
-      int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCgemm3mBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transa, transb, m, n, k, alpha, Aarray, lda, Barray,
-                  ldb, beta, Carray, ldc, batchCount);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasZgemmBatched(cublasHandle_t handle, cublasOperation_t transa,
-                   cublasOperation_t transb, int m, int n, int k,
-                   const cuDoubleComplex *alpha, /* host or device pointer */
-                   const cuDoubleComplex *const Aarray[], int lda,
-                   const cuDoubleComplex *const Barray[], int ldb,
-                   const cuDoubleComplex *beta, /* host or device pointer */
-                   cuDoubleComplex *const Carray[], int ldc, int batchCount) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int,
-      const cuDoubleComplex *, const cuDoubleComplex *const[], int,
-      const cuDoubleComplex *const[], int, const cuDoubleComplex *,
-      cuDoubleComplex *const[], int, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZgemmBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transa, transb, m, n, k, alpha, Aarray, lda, Barray,
-                  ldb, beta, Carray, ldc, batchCount);
-}
-
-cublasStatus_t CUBLASWINAPI cublasGemmBatchedEx(
-    cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
-    int m, int n, int k, const void *alpha, /* host or device pointer */
-    const void *const Aarray[], cudaDataType Atype, int lda,
-    const void *const Barray[], cudaDataType Btype, int ldb,
-    const void *beta, /* host or device pointer */
-    void *const Carray[], cudaDataType Ctype, int ldc, int batchCount,
-    cudaDataType computeType, cublasGemmAlgo_t algo) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int,
-      const void *, const void *const[], cudaDataType, int, const void *const[],
-      cudaDataType, int, const void *, void *const[], cudaDataType, int, int,
-      cudaDataType, cublasGemmAlgo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasGemmBatchedEx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transa, transb, m, n, k, alpha, Aarray, Atype, lda,
-                  Barray, Btype, ldb, beta, Carray, Ctype, ldc, batchCount,
-                  computeType, algo);
-}
-
-cublasStatus_t CUBLASWINAPI cublasGemmStridedBatchedEx(
-    cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
-    int m, int n, int k, const void *alpha, /* host or device pointer */
-    const void *A, cudaDataType Atype, int lda,
-    long long int strideA, /* purposely signed */
-    const void *B, cudaDataType Btype, int ldb, long long int strideB,
-    const void *beta, /* host or device pointer */
-    void *C, cudaDataType Ctype, int ldc, long long int strideC, int batchCount,
-    cudaDataType computeType, cublasGemmAlgo_t algo) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int,
-      const void *, const void *, cudaDataType, int, long long, const void *,
-      cudaDataType, int, long long, const void *, void *, cudaDataType, int,
-      long long, int, cudaDataType, cublasGemmAlgo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasGemmStridedBatchedEx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transa, transb, m, n, k, alpha, A, Atype, lda,
-                  strideA, B, Btype, ldb, strideB, beta, C, Ctype, ldc, strideC,
-                  batchCount, computeType, algo);
-}
-
-cublasStatus_t CUBLASWINAPI cublasSgemmStridedBatched(
-    cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
-    int m, int n, int k, const float *alpha,        /* host or device pointer */
-    const float *A, int lda, long long int strideA, /* purposely signed */
-    const float *B, int ldb, long long int strideB,
-    const float *beta, /* host or device pointer */
-    float *C, int ldc, long long int strideC, int batchCount) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int,
-      const float *, const float *, int, long long, const float *, int,
-      long long, const float *, float *, int, long long, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSgemmStridedBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transa, transb, m, n, k, alpha, A, lda, strideA, B,
-                  ldb, strideB, beta, C, ldc, strideC, batchCount);
-}
-
-cublasStatus_t CUBLASWINAPI cublasDgemmStridedBatched(
-    cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
-    int m, int n, int k, const double *alpha, /* host or device pointer */
-    const double *A, int lda, long long int strideA, /* purposely signed */
-    const double *B, int ldb, long long int strideB,
-    const double *beta, /* host or device pointer */
-    double *C, int ldc, long long int strideC, int batchCount) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int,
-      const double *, const double *, int, long long, const double *, int,
-      long long, const double *, double *, int, long long, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDgemmStridedBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transa, transb, m, n, k, alpha, A, lda, strideA, B,
-                  ldb, strideB, beta, C, ldc, strideC, batchCount);
-}
-
-cublasStatus_t CUBLASWINAPI cublasCgemmStridedBatched(
-    cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
-    int m, int n, int k, const cuComplex *alpha, /* host or device pointer */
-    const cuComplex *A, int lda, long long int strideA, /* purposely signed */
-    const cuComplex *B, int ldb, long long int strideB,
-    const cuComplex *beta, /* host or device pointer */
-    cuComplex *C, int ldc, long long int strideC, int batchCount) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int,
-      const cuComplex *, const cuComplex *, int, long long, const cuComplex *,
-      int, long long, const cuComplex *, cuComplex *, int, long long, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCgemmStridedBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transa, transb, m, n, k, alpha, A, lda, strideA, B,
-                  ldb, strideB, beta, C, ldc, strideC, batchCount);
-}
-
-cublasStatus_t CUBLASWINAPI cublasCgemm3mStridedBatched(
-    cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
-    int m, int n, int k, const cuComplex *alpha, /* host or device pointer */
-    const cuComplex *A, int lda, long long int strideA, /* purposely signed */
-    const cuComplex *B, int ldb, long long int strideB,
-    const cuComplex *beta, /* host or device pointer */
-    cuComplex *C, int ldc, long long int strideC, int batchCount) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int,
-      const cuComplex *, const cuComplex *, int, long long, const cuComplex *,
-      int, long long, const cuComplex *, cuComplex *, int, long long, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCgemm3mStridedBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transa, transb, m, n, k, alpha, A, lda, strideA, B,
-                  ldb, strideB, beta, C, ldc, strideC, batchCount);
-}
-
-cublasStatus_t CUBLASWINAPI cublasZgemmStridedBatched(
-    cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
-    int m, int n, int k,
-    const cuDoubleComplex *alpha, /* host or device pointer */
-    const cuDoubleComplex *A, int lda,
-    long long int strideA, /* purposely signed */
-    const cuDoubleComplex *B, int ldb, long long int strideB,
-    const cuDoubleComplex *beta, /* host or device poi */
-    cuDoubleComplex *C, int ldc, long long int strideC, int batchCount) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int,
-      const cuDoubleComplex *, const cuDoubleComplex *, int, long long,
-      const cuDoubleComplex *, int, long long, const cuDoubleComplex *,
-      cuDoubleComplex *, int, long long, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZgemmStridedBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transa, transb, m, n, k, alpha, A, lda, strideA, B,
-                  ldb, strideB, beta, C, ldc, strideC, batchCount);
-}
-
-cublasStatus_t CUBLASWINAPI cublasSgeam(
-    cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
-    int m, int n, const float *alpha,           /* host or device pointer */
-    const float *A, int lda, const float *beta, /* host or device pointer */
-    const float *B, int ldb, float *C, int ldc) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int,
-      const float *, const float *, int, const float *, const float *, int,
-      float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSgeam");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transa, transb, m, n, alpha, A, lda, beta, B, ldb, C,
-                  ldc);
-}
-
-cublasStatus_t CUBLASWINAPI cublasDgeam(
-    cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
-    int m, int n, const double *alpha,            /* host or device pointer */
-    const double *A, int lda, const double *beta, /* host or device pointer */
-    const double *B, int ldb, double *C, int ldc) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int,
-      const double *, const double *, int, const double *, const double *, int,
-      double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDgeam");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transa, transb, m, n, alpha, A, lda, beta, B, ldb, C,
-                  ldc);
-}
-
-cublasStatus_t CUBLASWINAPI cublasCgeam(
-    cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
-    int m, int n, const cuComplex *alpha, /* host or device pointer */
-    const cuComplex *A, int lda,
-    const cuComplex *beta, /* host or device pointer */
-    const cuComplex *B, int ldb, cuComplex *C, int ldc) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int,
-      const cuComplex *, const cuComplex *, int, const cuComplex *,
-      const cuComplex *, int, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCgeam");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transa, transb, m, n, alpha, A, lda, beta, B, ldb, C,
-                  ldc);
-}
-
-cublasStatus_t CUBLASWINAPI cublasZgeam(
-    cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
-    int m, int n, const cuDoubleComplex *alpha, /* host or device pointer */
-    const cuDoubleComplex *A, int lda,
-    const cuDoubleComplex *beta, /* host or device pointer */
-    const cuDoubleComplex *B, int ldb, cuDoubleComplex *C, int ldc) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int,
-      const cuDoubleComplex *, const cuDoubleComplex *, int,
-      const cuDoubleComplex *, const cuDoubleComplex *, int, cuDoubleComplex *,
-      int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZgeam");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transa, transb, m, n, alpha, A, lda, beta, B, ldb, C,
-                  ldc);
-}
-
-cublasStatus_t CUBLASWINAPI cublasSgetrfBatched(
-    cublasHandle_t handle, int n, float *const A[], /*Device pointer*/
-    int lda, int *P,                                /*Device Pointer*/
-    int *info,                                      /*Device Pointer*/
-    int batchSize) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, float *const[], int, int *, int *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSgetrfBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, A, lda, P, info, batchSize);
-}
-
-cublasStatus_t CUBLASWINAPI cublasDgetrfBatched(
-    cublasHandle_t handle, int n, double *const A[], /*Device pointer*/
-    int lda, int *P,                                 /*Device Pointer*/
-    int *info,                                       /*Device Pointer*/
-    int batchSize) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, double *const[], int, int *, int *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDgetrfBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, A, lda, P, info, batchSize);
-}
-
-cublasStatus_t CUBLASWINAPI cublasCgetrfBatched(
-    cublasHandle_t handle, int n, cuComplex *const A[], /*Device pointer*/
-    int lda, int *P,                                    /*Device Pointer*/
-    int *info,                                          /*Device Pointer*/
-    int batchSize) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, cuComplex *const[], int, int *, int *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCgetrfBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, A, lda, P, info, batchSize);
-}
-
-cublasStatus_t CUBLASWINAPI cublasZgetrfBatched(
-    cublasHandle_t handle, int n, cuDoubleComplex *const A[], /*Device pointer*/
-    int lda, int *P,                                          /*Device Pointer*/
-    int *info,                                                /*Device Pointer*/
-    int batchSize) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, cuDoubleComplex *const[], int, int *, int *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZgetrfBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, A, lda, P, info, batchSize);
-}
-
-cublasStatus_t CUBLASWINAPI cublasSgetriBatched(
-    cublasHandle_t handle, int n, const float *const A[], /*Device pointer*/
-    int lda, const int *P,                                /*Device pointer*/
-    float *const C[],                                     /*Device pointer*/
-    int ldc, int *info, int batchSize) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, const float *const[], int, const int *,
-      float *const[], int, int *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSgetriBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, A, lda, P, C, ldc, info, batchSize);
-}
-
-cublasStatus_t CUBLASWINAPI cublasDgetriBatched(
-    cublasHandle_t handle, int n, const double *const A[], /*Device pointer*/
-    int lda, const int *P,                                 /*Device pointer*/
-    double *const C[],                                     /*Device pointer*/
-    int ldc, int *info, int batchSize) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, const double *const[], int, const int *,
-      double *const[], int, int *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDgetriBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, A, lda, P, C, ldc, info, batchSize);
-}
-
-cublasStatus_t CUBLASWINAPI cublasCgetriBatched(
-    cublasHandle_t handle, int n, const cuComplex *const A[], /*Device pointer*/
-    int lda, const int *P,                                    /*Device pointer*/
-    cuComplex *const C[],                                     /*Device pointer*/
-    int ldc, int *info, int batchSize) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, const cuComplex *const[], int, const int *,
-      cuComplex *const[], int, int *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCgetriBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, A, lda, P, C, ldc, info, batchSize);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasZgetriBatched(cublasHandle_t handle, int n,
-                    const cuDoubleComplex *const A[], /*Device pointer*/
-                    int lda, const int *P,            /*Device pointer*/
-                    cuDoubleComplex *const C[],       /*Device pointer*/
-                    int ldc, int *info, int batchSize) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, const cuDoubleComplex *const[], int, const int *,
-      cuDoubleComplex *const[], int, int *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZgetriBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, A, lda, P, C, ldc, info, batchSize);
-}
-
-cublasStatus_t CUBLASWINAPI cublasSgetrsBatched(
-    cublasHandle_t handle, cublasOperation_t trans, int n, int nrhs,
-    const float *const Aarray[], int lda, const int *devIpiv,
-    float *const Barray[], int ldb, int *info, int batchSize) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasOperation_t, int, int, const float *const[], int,
-      const int *, float *const[], int, int *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSgetrsBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, trans, n, nrhs, Aarray, lda, devIpiv, Barray, ldb,
-                  info, batchSize);
-}
-
-cublasStatus_t CUBLASWINAPI cublasDgetrsBatched(
-    cublasHandle_t handle, cublasOperation_t trans, int n, int nrhs,
-    const double *const Aarray[], int lda, const int *devIpiv,
-    double *const Barray[], int ldb, int *info, int batchSize) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasOperation_t, int, int, const double *const[], int,
-      const int *, double *const[], int, int *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDgetrsBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, trans, n, nrhs, Aarray, lda, devIpiv, Barray, ldb,
-                  info, batchSize);
-}
-
-cublasStatus_t CUBLASWINAPI cublasCgetrsBatched(
-    cublasHandle_t handle, cublasOperation_t trans, int n, int nrhs,
-    const cuComplex *const Aarray[], int lda, const int *devIpiv,
-    cuComplex *const Barray[], int ldb, int *info, int batchSize) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasOperation_t, int, int, const cuComplex *const[],
-      int, const int *, cuComplex *const[], int, int *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCgetrsBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, trans, n, nrhs, Aarray, lda, devIpiv, Barray, ldb,
-                  info, batchSize);
-}
-
-cublasStatus_t CUBLASWINAPI cublasZgetrsBatched(
-    cublasHandle_t handle, cublasOperation_t trans, int n, int nrhs,
-    const cuDoubleComplex *const Aarray[], int lda, const int *devIpiv,
-    cuDoubleComplex *const Barray[], int ldb, int *info, int batchSize) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasOperation_t, int, int,
-      const cuDoubleComplex *const[], int, const int *,
-      cuDoubleComplex *const[], int, int *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZgetrsBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, trans, n, nrhs, Aarray, lda, devIpiv, Barray, ldb,
-                  info, batchSize);
-}
-
-cublasStatus_t CUBLASWINAPI cublasStrsmBatched(
-    cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo,
-    cublasOperation_t trans, cublasDiagType_t diag, int m, int n,
-    const float *alpha, /*Host or Device Pointer*/
-    const float *const A[], int lda, float *const B[], int ldb,
-    int batchCount) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasSideMode_t, cublasFillMode_t, cublasOperation_t,
-      cublasDiagType_t, int, int, const float *, const float *const[], int,
-      float *const[], int, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasStrsmBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb,
-                  batchCount);
-}
-
-cublasStatus_t CUBLASWINAPI cublasDtrsmBatched(
-    cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo,
-    cublasOperation_t trans, cublasDiagType_t diag, int m, int n,
-    const double *alpha, /*Host or Device Pointer*/
-    const double *const A[], int lda, double *const B[], int ldb,
-    int batchCount) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasSideMode_t, cublasFillMode_t, cublasOperation_t,
-      cublasDiagType_t, int, int, const double *, const double *const[], int,
-      double *const[], int, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDtrsmBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb,
-                  batchCount);
-}
-
-cublasStatus_t CUBLASWINAPI cublasCtrsmBatched(
-    cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo,
-    cublasOperation_t trans, cublasDiagType_t diag, int m, int n,
-    const cuComplex *alpha, /*Host or Device Pointer*/
-    const cuComplex *const A[], int lda, cuComplex *const B[], int ldb,
-    int batchCount) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasSideMode_t, cublasFillMode_t, cublasOperation_t,
-      cublasDiagType_t, int, int, const cuComplex *, const cuComplex *const[],
-      int, cuComplex *const[], int, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCtrsmBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb,
-                  batchCount);
-}
-
-cublasStatus_t CUBLASWINAPI cublasZtrsmBatched(
-    cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo,
-    cublasOperation_t trans, cublasDiagType_t diag, int m, int n,
-    const cuDoubleComplex *alpha, /*Host or Device Pointer*/
-    const cuDoubleComplex *const A[], int lda, cuDoubleComplex *const B[],
-    int ldb, int batchCount) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasSideMode_t, cublasFillMode_t, cublasOperation_t,
-      cublasDiagType_t, int, int, const cuDoubleComplex *,
-      const cuDoubleComplex *const[], int, cuDoubleComplex *const[], int, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZtrsmBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb,
-                  batchCount);
-}
-
-cublasStatus_t CUBLASWINAPI cublasSmatinvBatched(
-    cublasHandle_t handle, int n, const float *const A[], /*Device pointer*/
-    int lda, float *const Ainv[],                         /*Device pointer*/
-    int lda_inv, int *info,                               /*Device Pointer*/
-    int batchSize) {
-  using FuncPtr =
-      cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, int, const float *const[],
-                                     int, float *const[], int, int *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSmatinvBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, A, lda, Ainv, lda_inv, info, batchSize);
-}
-
-cublasStatus_t CUBLASWINAPI cublasDmatinvBatched(
-    cublasHandle_t handle, int n, const double *const A[], /*Device pointer*/
-    int lda, double *const Ainv[],                         /*Device pointer*/
-    int lda_inv, int *info,                                /*Device Pointer*/
-    int batchSize) {
-  using FuncPtr =
-      cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, int, const double *const[],
-                                     int, double *const[], int, int *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDmatinvBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, A, lda, Ainv, lda_inv, info, batchSize);
-}
-
-cublasStatus_t CUBLASWINAPI cublasCmatinvBatched(
-    cublasHandle_t handle, int n, const cuComplex *const A[], /*Device pointer*/
-    int lda, cuComplex *const Ainv[],                         /*Device pointer*/
-    int lda_inv, int *info,                                   /*Device Pointer*/
-    int batchSize) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, const cuComplex *const[], int, cuComplex *const[],
-      int, int *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCmatinvBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, A, lda, Ainv, lda_inv, info, batchSize);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasZmatinvBatched(cublasHandle_t handle, int n,
-                     const cuDoubleComplex *const A[],       /*Device pointer*/
-                     int lda, cuDoubleComplex *const Ainv[], /*Device pointer*/
-                     int lda_inv, int *info,                 /*Device Pointer*/
-                     int batchSize) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, const cuDoubleComplex *const[], int,
-      cuDoubleComplex *const[], int, int *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZmatinvBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, A, lda, Ainv, lda_inv, info, batchSize);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasSgeqrfBatched(cublasHandle_t handle, int m, int n,
-                    float *const Aarray[],            /*Device pointer*/
-                    int lda, float *const TauArray[], /*Device pointer*/
-                    int *info, int batchSize) {
-  using FuncPtr =
-      cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, int, int, float *const[],
-                                     int, float *const[], int *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSgeqrfBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, Aarray, lda, TauArray, info, batchSize);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasDgeqrfBatched(cublasHandle_t handle, int m, int n,
-                    double *const Aarray[],            /*Device pointer*/
-                    int lda, double *const TauArray[], /*Device pointer*/
-                    int *info, int batchSize) {
-  using FuncPtr =
-      cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, int, int, double *const[],
-                                     int, double *const[], int *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDgeqrfBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, Aarray, lda, TauArray, info, batchSize);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasCgeqrfBatched(cublasHandle_t handle, int m, int n,
-                    cuComplex *const Aarray[],            /*Device pointer*/
-                    int lda, cuComplex *const TauArray[], /*Device pointer*/
-                    int *info, int batchSize) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, int, cuComplex *const[], int, cuComplex *const[],
-      int *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCgeqrfBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, Aarray, lda, TauArray, info, batchSize);
-}
-
-cublasStatus_t CUBLASWINAPI cublasZgeqrfBatched(
-    cublasHandle_t handle, int m, int n,
-    cuDoubleComplex *const Aarray[],            /*Device pointer*/
-    int lda, cuDoubleComplex *const TauArray[], /*Device pointer*/
-    int *info, int batchSize) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, int, cuDoubleComplex *const[], int,
-      cuDoubleComplex *const[], int *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZgeqrfBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, Aarray, lda, TauArray, info, batchSize);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasSgelsBatched(cublasHandle_t handle, cublasOperation_t trans, int m, int n,
-                   int nrhs, float *const Aarray[],       /*Device pointer*/
-                   int lda, float *const Carray[],        /*Device pointer*/
-                   int ldc, int *info, int *devInfoArray, /*Device pointer*/
-                   int batchSize) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasOperation_t, int, int, int, float *const[], int,
-      float *const[], int, int *, int *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSgelsBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, trans, m, n, nrhs, Aarray, lda, Carray, ldc, info,
-                  devInfoArray, batchSize);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasDgelsBatched(cublasHandle_t handle, cublasOperation_t trans, int m, int n,
-                   int nrhs, double *const Aarray[],      /*Device pointer*/
-                   int lda, double *const Carray[],       /*Device pointer*/
-                   int ldc, int *info, int *devInfoArray, /*Device pointer*/
-                   int batchSize) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasOperation_t, int, int, int, double *const[], int,
-      double *const[], int, int *, int *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDgelsBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, trans, m, n, nrhs, Aarray, lda, Carray, ldc, info,
-                  devInfoArray, batchSize);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasCgelsBatched(cublasHandle_t handle, cublasOperation_t trans, int m, int n,
-                   int nrhs, cuComplex *const Aarray[], /*Device pointer*/
-                   int lda, cuComplex *const Carray[],  /*Device pointer*/
-                   int ldc, int *info, int *devInfoArray, int batchSize) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasOperation_t, int, int, int, cuComplex *const[], int,
-      cuComplex *const[], int, int *, int *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCgelsBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, trans, m, n, nrhs, Aarray, lda, Carray, ldc, info,
-                  devInfoArray, batchSize);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasZgelsBatched(cublasHandle_t handle, cublasOperation_t trans, int m, int n,
-                   int nrhs, cuDoubleComplex *const Aarray[], /*Device pointer*/
-                   int lda, cuDoubleComplex *const Carray[],  /*Device pointer*/
-                   int ldc, int *info, int *devInfoArray, int batchSize) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasOperation_t, int, int, int,
-      cuDoubleComplex *const[], int, cuDoubleComplex *const[], int, int *,
-      int *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZgelsBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, trans, m, n, nrhs, Aarray, lda, Carray, ldc, info,
-                  devInfoArray, batchSize);
-}
-
-cublasStatus_t CUBLASWINAPI cublasSdgmm(cublasHandle_t handle,
-                                        cublasSideMode_t mode, int m, int n,
-                                        const float *A, int lda, const float *x,
-                                        int incx, float *C, int ldc) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasSideMode_t, int, int, const float *, int,
-      const float *, int, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSdgmm");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, mode, m, n, A, lda, x, incx, C, ldc);
-}
-
-cublasStatus_t CUBLASWINAPI cublasDdgmm(cublasHandle_t handle,
-                                        cublasSideMode_t mode, int m, int n,
-                                        const double *A, int lda,
-                                        const double *x, int incx, double *C,
-                                        int ldc) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasSideMode_t, int, int, const double *, int,
-      const double *, int, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDdgmm");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, mode, m, n, A, lda, x, incx, C, ldc);
-}
-
-cublasStatus_t CUBLASWINAPI cublasCdgmm(cublasHandle_t handle,
-                                        cublasSideMode_t mode, int m, int n,
-                                        const cuComplex *A, int lda,
-                                        const cuComplex *x, int incx,
-                                        cuComplex *C, int ldc) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasSideMode_t, int, int, const cuComplex *, int,
-      const cuComplex *, int, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCdgmm");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, mode, m, n, A, lda, x, incx, C, ldc);
-}
-
-cublasStatus_t CUBLASWINAPI cublasZdgmm(cublasHandle_t handle,
-                                        cublasSideMode_t mode, int m, int n,
-                                        const cuDoubleComplex *A, int lda,
-                                        const cuDoubleComplex *x, int incx,
-                                        cuDoubleComplex *C, int ldc) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasSideMode_t, int, int, const cuDoubleComplex *, int,
-      const cuDoubleComplex *, int, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZdgmm");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, mode, m, n, A, lda, x, incx, C, ldc);
-}
-
-cublasStatus_t CUBLASWINAPI cublasStpttr(cublasHandle_t handle,
-                                         cublasFillMode_t uplo, int n,
-                                         const float *AP, float *A, int lda) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, int, const float *, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasStpttr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, AP, A, lda);
-}
-
-cublasStatus_t CUBLASWINAPI cublasDtpttr(cublasHandle_t handle,
-                                         cublasFillMode_t uplo, int n,
-                                         const double *AP, double *A, int lda) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, int, const double *, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDtpttr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, AP, A, lda);
-}
-
-cublasStatus_t CUBLASWINAPI cublasCtpttr(cublasHandle_t handle,
-                                         cublasFillMode_t uplo, int n,
-                                         const cuComplex *AP, cuComplex *A,
-                                         int lda) {
-  using FuncPtr =
-      cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int,
-                                     const cuComplex *, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCtpttr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, AP, A, lda);
-}
-
-cublasStatus_t CUBLASWINAPI cublasZtpttr(cublasHandle_t handle,
-                                         cublasFillMode_t uplo, int n,
-                                         const cuDoubleComplex *AP,
-                                         cuDoubleComplex *A, int lda) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, int, const cuDoubleComplex *,
-      cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZtpttr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, AP, A, lda);
-}
-
-cublasStatus_t CUBLASWINAPI cublasStrttp(cublasHandle_t handle,
-                                         cublasFillMode_t uplo, int n,
-                                         const float *A, int lda, float *AP) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, int, const float *, int, float *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasStrttp");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, A, lda, AP);
-}
-
-cublasStatus_t CUBLASWINAPI cublasDtrttp(cublasHandle_t handle,
-                                         cublasFillMode_t uplo, int n,
-                                         const double *A, int lda, double *AP) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, int, const double *, int, double *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDtrttp");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, A, lda, AP);
-}
-
-cublasStatus_t CUBLASWINAPI cublasCtrttp(cublasHandle_t handle,
-                                         cublasFillMode_t uplo, int n,
-                                         const cuComplex *A, int lda,
-                                         cuComplex *AP) {
-  using FuncPtr =
-      cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int,
-                                     const cuComplex *, int, cuComplex *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCtrttp");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, A, lda, AP);
-}
-
-cublasStatus_t CUBLASWINAPI cublasZtrttp(cublasHandle_t handle,
-                                         cublasFillMode_t uplo, int n,
-                                         const cuDoubleComplex *A, int lda,
-                                         cuDoubleComplex *AP) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, int, const cuDoubleComplex *, int,
-      cuDoubleComplex *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZtrttp");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, A, lda, AP);
-}
-
-cublasStatus CUBLASWINAPI cublasInit(void) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)();
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasInit");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr();
-}
-
-cublasStatus CUBLASWINAPI cublasShutdown(void) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)();
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasShutdown");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr();
-}
-
-cublasStatus CUBLASWINAPI cublasGetError(void) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)();
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasGetError");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr();
-}
-
-cublasStatus CUBLASWINAPI cublasGetVersion(int *version) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasGetVersion");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(version);
-}
-
-cublasStatus CUBLASWINAPI cublasAlloc(int n, int elemSize, void **devicePtr) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(int, int, void **);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasAlloc");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(n, elemSize, devicePtr);
-}
-
-cublasStatus CUBLASWINAPI cublasFree(void *devicePtr) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasFree");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(devicePtr);
-}
-
-cublasStatus CUBLASWINAPI cublasSetKernelStream(cudaStream_t stream) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSetKernelStream");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(stream);
-}
-
-float CUBLASWINAPI cublasSnrm2(int n, const float *x, int incx) {
-  using FuncPtr = float(CUBLASWINAPI *)(int, const float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSnrm2");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasSnrm2");
-  return func_ptr(n, x, incx);
-}
-
-double CUBLASWINAPI cublasDnrm2(int n, const double *x, int incx) {
-  using FuncPtr = double(CUBLASWINAPI *)(int, const double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDnrm2");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasDnrm2");
-  return func_ptr(n, x, incx);
-}
-
-float CUBLASWINAPI cublasScnrm2(int n, const cuComplex *x, int incx) {
-  using FuncPtr = float(CUBLASWINAPI *)(int, const cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasScnrm2");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasScnrm2");
-  return func_ptr(n, x, incx);
-}
-
-double CUBLASWINAPI cublasDznrm2(int n, const cuDoubleComplex *x, int incx) {
-  using FuncPtr = double(CUBLASWINAPI *)(int, const cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDznrm2");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasDznrm2");
-  return func_ptr(n, x, incx);
-}
-
-float CUBLASWINAPI cublasSdot(int n, const float *x, int incx, const float *y,
-                              int incy) {
-  using FuncPtr =
-      float(CUBLASWINAPI *)(int, const float *, int, const float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSdot");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasSdot");
-  return func_ptr(n, x, incx, y, incy);
-}
-
-double CUBLASWINAPI cublasDdot(int n, const double *x, int incx,
-                               const double *y, int incy) {
-  using FuncPtr =
-      double(CUBLASWINAPI *)(int, const double *, int, const double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDdot");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasDdot");
-  return func_ptr(n, x, incx, y, incy);
-}
-
-cuComplex CUBLASWINAPI cublasCdotu(int n, const cuComplex *x, int incx,
-                                   const cuComplex *y, int incy) {
-  using FuncPtr = cuComplex(CUBLASWINAPI *)(int, const cuComplex *, int,
-                                            const cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCdotu");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasCdotu");
-  return func_ptr(n, x, incx, y, incy);
-}
-
-cuComplex CUBLASWINAPI cublasCdotc(int n, const cuComplex *x, int incx,
-                                   const cuComplex *y, int incy) {
-  using FuncPtr = cuComplex(CUBLASWINAPI *)(int, const cuComplex *, int,
-                                            const cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCdotc");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasCdotc");
-  return func_ptr(n, x, incx, y, incy);
-}
-
-cuDoubleComplex CUBLASWINAPI cublasZdotu(int n, const cuDoubleComplex *x,
-                                         int incx, const cuDoubleComplex *y,
-                                         int incy) {
-  using FuncPtr = cuDoubleComplex(CUBLASWINAPI *)(
-      int, const cuDoubleComplex *, int, const cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZdotu");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasZdotu");
-  return func_ptr(n, x, incx, y, incy);
-}
-
-cuDoubleComplex CUBLASWINAPI cublasZdotc(int n, const cuDoubleComplex *x,
-                                         int incx, const cuDoubleComplex *y,
-                                         int incy) {
-  using FuncPtr = cuDoubleComplex(CUBLASWINAPI *)(
-      int, const cuDoubleComplex *, int, const cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZdotc");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasZdotc");
-  return func_ptr(n, x, incx, y, incy);
-}
-
-void CUBLASWINAPI cublasSscal(int n, float alpha, float *x, int incx) {
-  using FuncPtr = void(CUBLASWINAPI *)(int, float, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSscal");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasSscal");
-  return func_ptr(n, alpha, x, incx);
-}
-
-void CUBLASWINAPI cublasDscal(int n, double alpha, double *x, int incx) {
-  using FuncPtr = void(CUBLASWINAPI *)(int, double, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDscal");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasDscal");
-  return func_ptr(n, alpha, x, incx);
-}
-
-void CUBLASWINAPI cublasCscal(int n, cuComplex alpha, cuComplex *x, int incx) {
-  using FuncPtr = void(CUBLASWINAPI *)(int, cuComplex, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCscal");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasCscal");
-  return func_ptr(n, alpha, x, incx);
-}
-
-void CUBLASWINAPI cublasZscal(int n, cuDoubleComplex alpha, cuDoubleComplex *x,
-                              int incx) {
-  using FuncPtr =
-      void(CUBLASWINAPI *)(int, cuDoubleComplex, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZscal");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasZscal");
-  return func_ptr(n, alpha, x, incx);
-}
-
-void CUBLASWINAPI cublasCsscal(int n, float alpha, cuComplex *x, int incx) {
-  using FuncPtr = void(CUBLASWINAPI *)(int, float, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCsscal");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasCsscal");
-  return func_ptr(n, alpha, x, incx);
-}
-
-void CUBLASWINAPI cublasZdscal(int n, double alpha, cuDoubleComplex *x,
-                               int incx) {
-  using FuncPtr = void(CUBLASWINAPI *)(int, double, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZdscal");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasZdscal");
-  return func_ptr(n, alpha, x, incx);
-}
-
-void CUBLASWINAPI cublasSaxpy(int n, float alpha, const float *x, int incx,
-                              float *y, int incy) {
-  using FuncPtr =
-      void(CUBLASWINAPI *)(int, float, const float *, int, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSaxpy");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasSaxpy");
-  return func_ptr(n, alpha, x, incx, y, incy);
-}
-
-void CUBLASWINAPI cublasDaxpy(int n, double alpha, const double *x, int incx,
-                              double *y, int incy) {
-  using FuncPtr =
-      void(CUBLASWINAPI *)(int, double, const double *, int, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDaxpy");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasDaxpy");
-  return func_ptr(n, alpha, x, incx, y, incy);
-}
-
-void CUBLASWINAPI cublasCaxpy(int n, cuComplex alpha, const cuComplex *x,
-                              int incx, cuComplex *y, int incy) {
-  using FuncPtr = void(CUBLASWINAPI *)(int, cuComplex, const cuComplex *, int,
-                                       cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCaxpy");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasCaxpy");
-  return func_ptr(n, alpha, x, incx, y, incy);
-}
-
-void CUBLASWINAPI cublasZaxpy(int n, cuDoubleComplex alpha,
-                              const cuDoubleComplex *x, int incx,
-                              cuDoubleComplex *y, int incy) {
-  using FuncPtr =
-      void(CUBLASWINAPI *)(int, cuDoubleComplex, const cuDoubleComplex *, int,
-                           cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZaxpy");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasZaxpy");
-  return func_ptr(n, alpha, x, incx, y, incy);
-}
-
-void CUBLASWINAPI cublasScopy(int n, const float *x, int incx, float *y,
-                              int incy) {
-  using FuncPtr = void(CUBLASWINAPI *)(int, const float *, int, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasScopy");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasScopy");
-  return func_ptr(n, x, incx, y, incy);
-}
-
-void CUBLASWINAPI cublasDcopy(int n, const double *x, int incx, double *y,
-                              int incy) {
-  using FuncPtr = void(CUBLASWINAPI *)(int, const double *, int, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDcopy");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasDcopy");
-  return func_ptr(n, x, incx, y, incy);
-}
-
-void CUBLASWINAPI cublasCcopy(int n, const cuComplex *x, int incx, cuComplex *y,
-                              int incy) {
-  using FuncPtr =
-      void(CUBLASWINAPI *)(int, const cuComplex *, int, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCcopy");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasCcopy");
-  return func_ptr(n, x, incx, y, incy);
-}
-
-void CUBLASWINAPI cublasZcopy(int n, const cuDoubleComplex *x, int incx,
-                              cuDoubleComplex *y, int incy) {
-  using FuncPtr = void(CUBLASWINAPI *)(int, const cuDoubleComplex *, int,
-                                       cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZcopy");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasZcopy");
-  return func_ptr(n, x, incx, y, incy);
-}
-
-void CUBLASWINAPI cublasSswap(int n, float *x, int incx, float *y, int incy) {
-  using FuncPtr = void(CUBLASWINAPI *)(int, float *, int, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSswap");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasSswap");
-  return func_ptr(n, x, incx, y, incy);
-}
-
-void CUBLASWINAPI cublasDswap(int n, double *x, int incx, double *y, int incy) {
-  using FuncPtr = void(CUBLASWINAPI *)(int, double *, int, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDswap");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasDswap");
-  return func_ptr(n, x, incx, y, incy);
-}
-
-void CUBLASWINAPI cublasCswap(int n, cuComplex *x, int incx, cuComplex *y,
-                              int incy) {
-  using FuncPtr = void(CUBLASWINAPI *)(int, cuComplex *, int, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCswap");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasCswap");
-  return func_ptr(n, x, incx, y, incy);
-}
-
-void CUBLASWINAPI cublasZswap(int n, cuDoubleComplex *x, int incx,
-                              cuDoubleComplex *y, int incy) {
-  using FuncPtr =
-      void(CUBLASWINAPI *)(int, cuDoubleComplex *, int, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZswap");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasZswap");
-  return func_ptr(n, x, incx, y, incy);
-}
-
-int CUBLASWINAPI cublasIsamax(int n, const float *x, int incx) {
-  using FuncPtr = int(CUBLASWINAPI *)(int, const float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasIsamax");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasIsamax");
-  return func_ptr(n, x, incx);
-}
-
-int CUBLASWINAPI cublasIdamax(int n, const double *x, int incx) {
-  using FuncPtr = int(CUBLASWINAPI *)(int, const double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasIdamax");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasIdamax");
-  return func_ptr(n, x, incx);
-}
-
-int CUBLASWINAPI cublasIcamax(int n, const cuComplex *x, int incx) {
-  using FuncPtr = int(CUBLASWINAPI *)(int, const cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasIcamax");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasIcamax");
-  return func_ptr(n, x, incx);
-}
-
-int CUBLASWINAPI cublasIzamax(int n, const cuDoubleComplex *x, int incx) {
-  using FuncPtr = int(CUBLASWINAPI *)(int, const cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasIzamax");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasIzamax");
-  return func_ptr(n, x, incx);
-}
-
-int CUBLASWINAPI cublasIsamin(int n, const float *x, int incx) {
-  using FuncPtr = int(CUBLASWINAPI *)(int, const float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasIsamin");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasIsamin");
-  return func_ptr(n, x, incx);
-}
-
-int CUBLASWINAPI cublasIdamin(int n, const double *x, int incx) {
-  using FuncPtr = int(CUBLASWINAPI *)(int, const double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasIdamin");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasIdamin");
-  return func_ptr(n, x, incx);
-}
-
-int CUBLASWINAPI cublasIcamin(int n, const cuComplex *x, int incx) {
-  using FuncPtr = int(CUBLASWINAPI *)(int, const cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasIcamin");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasIcamin");
-  return func_ptr(n, x, incx);
-}
-
-int CUBLASWINAPI cublasIzamin(int n, const cuDoubleComplex *x, int incx) {
-  using FuncPtr = int(CUBLASWINAPI *)(int, const cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasIzamin");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasIzamin");
-  return func_ptr(n, x, incx);
-}
-
-float CUBLASWINAPI cublasSasum(int n, const float *x, int incx) {
-  using FuncPtr = float(CUBLASWINAPI *)(int, const float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSasum");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasSasum");
-  return func_ptr(n, x, incx);
-}
-
-double CUBLASWINAPI cublasDasum(int n, const double *x, int incx) {
-  using FuncPtr = double(CUBLASWINAPI *)(int, const double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDasum");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasDasum");
-  return func_ptr(n, x, incx);
-}
-
-float CUBLASWINAPI cublasScasum(int n, const cuComplex *x, int incx) {
-  using FuncPtr = float(CUBLASWINAPI *)(int, const cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasScasum");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasScasum");
-  return func_ptr(n, x, incx);
-}
-
-double CUBLASWINAPI cublasDzasum(int n, const cuDoubleComplex *x, int incx) {
-  using FuncPtr = double(CUBLASWINAPI *)(int, const cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDzasum");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasDzasum");
-  return func_ptr(n, x, incx);
-}
-
-void CUBLASWINAPI cublasSrot(int n, float *x, int incx, float *y, int incy,
-                             float sc, float ss) {
-  using FuncPtr =
-      void(CUBLASWINAPI *)(int, float *, int, float *, int, float, float);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSrot");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasSrot");
-  return func_ptr(n, x, incx, y, incy, sc, ss);
-}
-
-void CUBLASWINAPI cublasDrot(int n, double *x, int incx, double *y, int incy,
-                             double sc, double ss) {
-  using FuncPtr =
-      void(CUBLASWINAPI *)(int, double *, int, double *, int, double, double);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDrot");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasDrot");
-  return func_ptr(n, x, incx, y, incy, sc, ss);
-}
-
-void CUBLASWINAPI cublasCrot(int n, cuComplex *x, int incx, cuComplex *y,
-                             int incy, float c, cuComplex s) {
-  using FuncPtr = void(CUBLASWINAPI *)(int, cuComplex *, int, cuComplex *, int,
-                                       float, cuComplex);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCrot");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasCrot");
-  return func_ptr(n, x, incx, y, incy, c, s);
-}
-
-void CUBLASWINAPI cublasZrot(int n, cuDoubleComplex *x, int incx,
-                             cuDoubleComplex *y, int incy, double sc,
-                             cuDoubleComplex cs) {
-  using FuncPtr =
-      void(CUBLASWINAPI *)(int, cuDoubleComplex *, int, cuDoubleComplex *, int,
-                           double, cuDoubleComplex);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZrot");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasZrot");
-  return func_ptr(n, x, incx, y, incy, sc, cs);
-}
-
-void CUBLASWINAPI cublasCsrot(int n, cuComplex *x, int incx, cuComplex *y,
-                              int incy, float c, float s) {
-  using FuncPtr = void(CUBLASWINAPI *)(int, cuComplex *, int, cuComplex *, int,
-                                       float, float);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCsrot");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasCsrot");
-  return func_ptr(n, x, incx, y, incy, c, s);
-}
-
-void CUBLASWINAPI cublasZdrot(int n, cuDoubleComplex *x, int incx,
-                              cuDoubleComplex *y, int incy, double c,
-                              double s) {
-  using FuncPtr = void(CUBLASWINAPI *)(int, cuDoubleComplex *, int,
-                                       cuDoubleComplex *, int, double, double);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZdrot");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasZdrot");
-  return func_ptr(n, x, incx, y, incy, c, s);
-}
-
-void CUBLASWINAPI cublasSrotg(float *sa, float *sb, float *sc, float *ss) {
-  using FuncPtr = void(CUBLASWINAPI *)(float *, float *, float *, float *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSrotg");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasSrotg");
-  return func_ptr(sa, sb, sc, ss);
-}
-
-void CUBLASWINAPI cublasDrotg(double *sa, double *sb, double *sc, double *ss) {
-  using FuncPtr = void(CUBLASWINAPI *)(double *, double *, double *, double *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDrotg");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasDrotg");
-  return func_ptr(sa, sb, sc, ss);
-}
-
-void CUBLASWINAPI cublasCrotg(cuComplex *ca, cuComplex cb, float *sc,
-                              cuComplex *cs) {
-  using FuncPtr =
-      void(CUBLASWINAPI *)(cuComplex *, cuComplex, float *, cuComplex *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCrotg");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasCrotg");
-  return func_ptr(ca, cb, sc, cs);
-}
-
-void CUBLASWINAPI cublasZrotg(cuDoubleComplex *ca, cuDoubleComplex cb,
-                              double *sc, cuDoubleComplex *cs) {
-  using FuncPtr = void(CUBLASWINAPI *)(cuDoubleComplex *, cuDoubleComplex,
-                                       double *, cuDoubleComplex *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZrotg");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasZrotg");
-  return func_ptr(ca, cb, sc, cs);
-}
-
-void CUBLASWINAPI cublasSrotm(int n, float *x, int incx, float *y, int incy,
-                              const float *sparam) {
-  using FuncPtr =
-      void(CUBLASWINAPI *)(int, float *, int, float *, int, const float *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSrotm");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasSrotm");
-  return func_ptr(n, x, incx, y, incy, sparam);
-}
-
-void CUBLASWINAPI cublasDrotm(int n, double *x, int incx, double *y, int incy,
-                              const double *sparam) {
-  using FuncPtr =
-      void(CUBLASWINAPI *)(int, double *, int, double *, int, const double *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDrotm");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasDrotm");
-  return func_ptr(n, x, incx, y, incy, sparam);
-}
-
-void CUBLASWINAPI cublasSrotmg(float *sd1, float *sd2, float *sx1,
-                               const float *sy1, float *sparam) {
-  using FuncPtr =
-      void(CUBLASWINAPI *)(float *, float *, float *, const float *, float *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSrotmg");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasSrotmg");
-  return func_ptr(sd1, sd2, sx1, sy1, sparam);
-}
-
-void CUBLASWINAPI cublasDrotmg(double *sd1, double *sd2, double *sx1,
-                               const double *sy1, double *sparam) {
-  using FuncPtr = void(CUBLASWINAPI *)(double *, double *, double *,
-                                       const double *, double *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDrotmg");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasDrotmg");
-  return func_ptr(sd1, sd2, sx1, sy1, sparam);
-}
-
-void CUBLASWINAPI cublasSgemv(char trans, int m, int n, float alpha,
-                              const float *A, int lda, const float *x, int incx,
-                              float beta, float *y, int incy) {
-  using FuncPtr =
-      void(CUBLASWINAPI *)(char, int, int, float, const float *, int,
-                           const float *, int, float, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSgemv");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasSgemv");
-  return func_ptr(trans, m, n, alpha, A, lda, x, incx, beta, y, incy);
-}
-
-void CUBLASWINAPI cublasDgemv(char trans, int m, int n, double alpha,
-                              const double *A, int lda, const double *x,
-                              int incx, double beta, double *y, int incy) {
-  using FuncPtr =
-      void(CUBLASWINAPI *)(char, int, int, double, const double *, int,
-                           const double *, int, double, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDgemv");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasDgemv");
-  return func_ptr(trans, m, n, alpha, A, lda, x, incx, beta, y, incy);
-}
-
-void CUBLASWINAPI cublasCgemv(char trans, int m, int n, cuComplex alpha,
-                              const cuComplex *A, int lda, const cuComplex *x,
-                              int incx, cuComplex beta, cuComplex *y,
-                              int incy) {
-  using FuncPtr =
-      void(CUBLASWINAPI *)(char, int, int, cuComplex, const cuComplex *, int,
-                           const cuComplex *, int, cuComplex, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCgemv");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasCgemv");
-  return func_ptr(trans, m, n, alpha, A, lda, x, incx, beta, y, incy);
-}
-
-void CUBLASWINAPI cublasZgemv(char trans, int m, int n, cuDoubleComplex alpha,
-                              const cuDoubleComplex *A, int lda,
-                              const cuDoubleComplex *x, int incx,
-                              cuDoubleComplex beta, cuDoubleComplex *y,
-                              int incy) {
-  using FuncPtr = void(CUBLASWINAPI *)(
-      char, int, int, cuDoubleComplex, const cuDoubleComplex *, int,
-      const cuDoubleComplex *, int, cuDoubleComplex, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZgemv");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasZgemv");
-  return func_ptr(trans, m, n, alpha, A, lda, x, incx, beta, y, incy);
-}
-
-void CUBLASWINAPI cublasSgbmv(char trans, int m, int n, int kl, int ku,
-                              float alpha, const float *A, int lda,
-                              const float *x, int incx, float beta, float *y,
-                              int incy) {
-  using FuncPtr =
-      void(CUBLASWINAPI *)(char, int, int, int, int, float, const float *, int,
-                           const float *, int, float, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSgbmv");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasSgbmv");
-  return func_ptr(trans, m, n, kl, ku, alpha, A, lda, x, incx, beta, y, incy);
-}
-
-void CUBLASWINAPI cublasDgbmv(char trans, int m, int n, int kl, int ku,
-                              double alpha, const double *A, int lda,
-                              const double *x, int incx, double beta, double *y,
-                              int incy) {
-  using FuncPtr =
-      void(CUBLASWINAPI *)(char, int, int, int, int, double, const double *,
-                           int, const double *, int, double, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDgbmv");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasDgbmv");
-  return func_ptr(trans, m, n, kl, ku, alpha, A, lda, x, incx, beta, y, incy);
-}
-
-void CUBLASWINAPI cublasCgbmv(char trans, int m, int n, int kl, int ku,
-                              cuComplex alpha, const cuComplex *A, int lda,
-                              const cuComplex *x, int incx, cuComplex beta,
-                              cuComplex *y, int incy) {
-  using FuncPtr = void(CUBLASWINAPI *)(
-      char, int, int, int, int, cuComplex, const cuComplex *, int,
-      const cuComplex *, int, cuComplex, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCgbmv");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasCgbmv");
-  return func_ptr(trans, m, n, kl, ku, alpha, A, lda, x, incx, beta, y, incy);
-}
-
-void CUBLASWINAPI cublasZgbmv(char trans, int m, int n, int kl, int ku,
-                              cuDoubleComplex alpha, const cuDoubleComplex *A,
-                              int lda, const cuDoubleComplex *x, int incx,
-                              cuDoubleComplex beta, cuDoubleComplex *y,
-                              int incy) {
-  using FuncPtr = void(CUBLASWINAPI *)(
-      char, int, int, int, int, cuDoubleComplex, const cuDoubleComplex *, int,
-      const cuDoubleComplex *, int, cuDoubleComplex, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZgbmv");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasZgbmv");
-  return func_ptr(trans, m, n, kl, ku, alpha, A, lda, x, incx, beta, y, incy);
-}
-
-void CUBLASWINAPI cublasStrmv(char uplo, char trans, char diag, int n,
-                              const float *A, int lda, float *x, int incx) {
-  using FuncPtr = void(CUBLASWINAPI *)(char, char, char, int, const float *,
-                                       int, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasStrmv");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasStrmv");
-  return func_ptr(uplo, trans, diag, n, A, lda, x, incx);
-}
-
-void CUBLASWINAPI cublasDtrmv(char uplo, char trans, char diag, int n,
-                              const double *A, int lda, double *x, int incx) {
-  using FuncPtr = void(CUBLASWINAPI *)(char, char, char, int, const double *,
-                                       int, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDtrmv");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasDtrmv");
-  return func_ptr(uplo, trans, diag, n, A, lda, x, incx);
-}
-
-void CUBLASWINAPI cublasCtrmv(char uplo, char trans, char diag, int n,
-                              const cuComplex *A, int lda, cuComplex *x,
-                              int incx) {
-  using FuncPtr = void(CUBLASWINAPI *)(char, char, char, int, const cuComplex *,
-                                       int, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCtrmv");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasCtrmv");
-  return func_ptr(uplo, trans, diag, n, A, lda, x, incx);
-}
-
-void CUBLASWINAPI cublasZtrmv(char uplo, char trans, char diag, int n,
-                              const cuDoubleComplex *A, int lda,
-                              cuDoubleComplex *x, int incx) {
-  using FuncPtr =
-      void(CUBLASWINAPI *)(char, char, char, int, const cuDoubleComplex *, int,
-                           cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZtrmv");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasZtrmv");
-  return func_ptr(uplo, trans, diag, n, A, lda, x, incx);
-}
-
-void CUBLASWINAPI cublasStbmv(char uplo, char trans, char diag, int n, int k,
-                              const float *A, int lda, float *x, int incx) {
-  using FuncPtr = void(CUBLASWINAPI *)(char, char, char, int, int,
-                                       const float *, int, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasStbmv");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasStbmv");
-  return func_ptr(uplo, trans, diag, n, k, A, lda, x, incx);
-}
-
-void CUBLASWINAPI cublasDtbmv(char uplo, char trans, char diag, int n, int k,
-                              const double *A, int lda, double *x, int incx) {
-  using FuncPtr = void(CUBLASWINAPI *)(char, char, char, int, int,
-                                       const double *, int, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDtbmv");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasDtbmv");
-  return func_ptr(uplo, trans, diag, n, k, A, lda, x, incx);
-}
-
-void CUBLASWINAPI cublasCtbmv(char uplo, char trans, char diag, int n, int k,
-                              const cuComplex *A, int lda, cuComplex *x,
-                              int incx) {
-  using FuncPtr = void(CUBLASWINAPI *)(
-      char, char, char, int, int, const cuComplex *, int, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCtbmv");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasCtbmv");
-  return func_ptr(uplo, trans, diag, n, k, A, lda, x, incx);
-}
-
-void CUBLASWINAPI cublasZtbmv(char uplo, char trans, char diag, int n, int k,
-                              const cuDoubleComplex *A, int lda,
-                              cuDoubleComplex *x, int incx) {
-  using FuncPtr =
-      void(CUBLASWINAPI *)(char, char, char, int, int, const cuDoubleComplex *,
-                           int, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZtbmv");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasZtbmv");
-  return func_ptr(uplo, trans, diag, n, k, A, lda, x, incx);
-}
-
-void CUBLASWINAPI cublasStpmv(char uplo, char trans, char diag, int n,
-                              const float *AP, float *x, int incx) {
-  using FuncPtr =
-      void(CUBLASWINAPI *)(char, char, char, int, const float *, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasStpmv");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasStpmv");
-  return func_ptr(uplo, trans, diag, n, AP, x, incx);
-}
-
-void CUBLASWINAPI cublasDtpmv(char uplo, char trans, char diag, int n,
-                              const double *AP, double *x, int incx) {
-  using FuncPtr = void(CUBLASWINAPI *)(char, char, char, int, const double *,
-                                       double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDtpmv");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasDtpmv");
-  return func_ptr(uplo, trans, diag, n, AP, x, incx);
-}
-
-void CUBLASWINAPI cublasCtpmv(char uplo, char trans, char diag, int n,
-                              const cuComplex *AP, cuComplex *x, int incx) {
-  using FuncPtr = void(CUBLASWINAPI *)(char, char, char, int, const cuComplex *,
-                                       cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCtpmv");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasCtpmv");
-  return func_ptr(uplo, trans, diag, n, AP, x, incx);
-}
-
-void CUBLASWINAPI cublasZtpmv(char uplo, char trans, char diag, int n,
-                              const cuDoubleComplex *AP, cuDoubleComplex *x,
-                              int incx) {
-  using FuncPtr = void(CUBLASWINAPI *)(
-      char, char, char, int, const cuDoubleComplex *, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZtpmv");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasZtpmv");
-  return func_ptr(uplo, trans, diag, n, AP, x, incx);
-}
-
-void CUBLASWINAPI cublasStrsv(char uplo, char trans, char diag, int n,
-                              const float *A, int lda, float *x, int incx) {
-  using FuncPtr = void(CUBLASWINAPI *)(char, char, char, int, const float *,
-                                       int, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasStrsv");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasStrsv");
-  return func_ptr(uplo, trans, diag, n, A, lda, x, incx);
-}
-
-void CUBLASWINAPI cublasDtrsv(char uplo, char trans, char diag, int n,
-                              const double *A, int lda, double *x, int incx) {
-  using FuncPtr = void(CUBLASWINAPI *)(char, char, char, int, const double *,
-                                       int, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDtrsv");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasDtrsv");
-  return func_ptr(uplo, trans, diag, n, A, lda, x, incx);
-}
-
-void CUBLASWINAPI cublasCtrsv(char uplo, char trans, char diag, int n,
-                              const cuComplex *A, int lda, cuComplex *x,
-                              int incx) {
-  using FuncPtr = void(CUBLASWINAPI *)(char, char, char, int, const cuComplex *,
-                                       int, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCtrsv");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasCtrsv");
-  return func_ptr(uplo, trans, diag, n, A, lda, x, incx);
-}
-
-void CUBLASWINAPI cublasZtrsv(char uplo, char trans, char diag, int n,
-                              const cuDoubleComplex *A, int lda,
-                              cuDoubleComplex *x, int incx) {
-  using FuncPtr =
-      void(CUBLASWINAPI *)(char, char, char, int, const cuDoubleComplex *, int,
-                           cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZtrsv");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasZtrsv");
-  return func_ptr(uplo, trans, diag, n, A, lda, x, incx);
-}
-
-void CUBLASWINAPI cublasStpsv(char uplo, char trans, char diag, int n,
-                              const float *AP, float *x, int incx) {
-  using FuncPtr =
-      void(CUBLASWINAPI *)(char, char, char, int, const float *, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasStpsv");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasStpsv");
-  return func_ptr(uplo, trans, diag, n, AP, x, incx);
-}
-
-void CUBLASWINAPI cublasDtpsv(char uplo, char trans, char diag, int n,
-                              const double *AP, double *x, int incx) {
-  using FuncPtr = void(CUBLASWINAPI *)(char, char, char, int, const double *,
-                                       double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDtpsv");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasDtpsv");
-  return func_ptr(uplo, trans, diag, n, AP, x, incx);
-}
-
-void CUBLASWINAPI cublasCtpsv(char uplo, char trans, char diag, int n,
-                              const cuComplex *AP, cuComplex *x, int incx) {
-  using FuncPtr = void(CUBLASWINAPI *)(char, char, char, int, const cuComplex *,
-                                       cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCtpsv");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasCtpsv");
-  return func_ptr(uplo, trans, diag, n, AP, x, incx);
-}
-
-void CUBLASWINAPI cublasZtpsv(char uplo, char trans, char diag, int n,
-                              const cuDoubleComplex *AP, cuDoubleComplex *x,
-                              int incx) {
-  using FuncPtr = void(CUBLASWINAPI *)(
-      char, char, char, int, const cuDoubleComplex *, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZtpsv");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasZtpsv");
-  return func_ptr(uplo, trans, diag, n, AP, x, incx);
-}
-
-void CUBLASWINAPI cublasStbsv(char uplo, char trans, char diag, int n, int k,
-                              const float *A, int lda, float *x, int incx) {
-  using FuncPtr = void(CUBLASWINAPI *)(char, char, char, int, int,
-                                       const float *, int, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasStbsv");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasStbsv");
-  return func_ptr(uplo, trans, diag, n, k, A, lda, x, incx);
-}
-
-void CUBLASWINAPI cublasDtbsv(char uplo, char trans, char diag, int n, int k,
-                              const double *A, int lda, double *x, int incx) {
-  using FuncPtr = void(CUBLASWINAPI *)(char, char, char, int, int,
-                                       const double *, int, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDtbsv");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasDtbsv");
-  return func_ptr(uplo, trans, diag, n, k, A, lda, x, incx);
-}
-
-void CUBLASWINAPI cublasCtbsv(char uplo, char trans, char diag, int n, int k,
-                              const cuComplex *A, int lda, cuComplex *x,
-                              int incx) {
-  using FuncPtr = void(CUBLASWINAPI *)(
-      char, char, char, int, int, const cuComplex *, int, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCtbsv");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasCtbsv");
-  return func_ptr(uplo, trans, diag, n, k, A, lda, x, incx);
-}
-
-void CUBLASWINAPI cublasZtbsv(char uplo, char trans, char diag, int n, int k,
-                              const cuDoubleComplex *A, int lda,
-                              cuDoubleComplex *x, int incx) {
-  using FuncPtr =
-      void(CUBLASWINAPI *)(char, char, char, int, int, const cuDoubleComplex *,
-                           int, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZtbsv");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasZtbsv");
-  return func_ptr(uplo, trans, diag, n, k, A, lda, x, incx);
-}
-
-void CUBLASWINAPI cublasSsymv(char uplo, int n, float alpha, const float *A,
-                              int lda, const float *x, int incx, float beta,
-                              float *y, int incy) {
-  using FuncPtr = void(CUBLASWINAPI *)(char, int, float, const float *, int,
-                                       const float *, int, float, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSsymv");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasSsymv");
-  return func_ptr(uplo, n, alpha, A, lda, x, incx, beta, y, incy);
-}
-
-void CUBLASWINAPI cublasDsymv(char uplo, int n, double alpha, const double *A,
-                              int lda, const double *x, int incx, double beta,
-                              double *y, int incy) {
-  using FuncPtr =
-      void(CUBLASWINAPI *)(char, int, double, const double *, int,
-                           const double *, int, double, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDsymv");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasDsymv");
-  return func_ptr(uplo, n, alpha, A, lda, x, incx, beta, y, incy);
-}
-
-void CUBLASWINAPI cublasChemv(char uplo, int n, cuComplex alpha,
-                              const cuComplex *A, int lda, const cuComplex *x,
-                              int incx, cuComplex beta, cuComplex *y,
-                              int incy) {
-  using FuncPtr =
-      void(CUBLASWINAPI *)(char, int, cuComplex, const cuComplex *, int,
-                           const cuComplex *, int, cuComplex, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasChemv");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasChemv");
-  return func_ptr(uplo, n, alpha, A, lda, x, incx, beta, y, incy);
-}
-
-void CUBLASWINAPI cublasZhemv(char uplo, int n, cuDoubleComplex alpha,
-                              const cuDoubleComplex *A, int lda,
-                              const cuDoubleComplex *x, int incx,
-                              cuDoubleComplex beta, cuDoubleComplex *y,
-                              int incy) {
-  using FuncPtr = void(CUBLASWINAPI *)(
-      char, int, cuDoubleComplex, const cuDoubleComplex *, int,
-      const cuDoubleComplex *, int, cuDoubleComplex, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZhemv");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasZhemv");
-  return func_ptr(uplo, n, alpha, A, lda, x, incx, beta, y, incy);
-}
-
-void CUBLASWINAPI cublasSsbmv(char uplo, int n, int k, float alpha,
-                              const float *A, int lda, const float *x, int incx,
-                              float beta, float *y, int incy) {
-  using FuncPtr =
-      void(CUBLASWINAPI *)(char, int, int, float, const float *, int,
-                           const float *, int, float, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSsbmv");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasSsbmv");
-  return func_ptr(uplo, n, k, alpha, A, lda, x, incx, beta, y, incy);
-}
-
-void CUBLASWINAPI cublasDsbmv(char uplo, int n, int k, double alpha,
-                              const double *A, int lda, const double *x,
-                              int incx, double beta, double *y, int incy) {
-  using FuncPtr =
-      void(CUBLASWINAPI *)(char, int, int, double, const double *, int,
-                           const double *, int, double, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDsbmv");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasDsbmv");
-  return func_ptr(uplo, n, k, alpha, A, lda, x, incx, beta, y, incy);
-}
-
-void CUBLASWINAPI cublasChbmv(char uplo, int n, int k, cuComplex alpha,
-                              const cuComplex *A, int lda, const cuComplex *x,
-                              int incx, cuComplex beta, cuComplex *y,
-                              int incy) {
-  using FuncPtr =
-      void(CUBLASWINAPI *)(char, int, int, cuComplex, const cuComplex *, int,
-                           const cuComplex *, int, cuComplex, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasChbmv");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasChbmv");
-  return func_ptr(uplo, n, k, alpha, A, lda, x, incx, beta, y, incy);
-}
-
-void CUBLASWINAPI cublasZhbmv(char uplo, int n, int k, cuDoubleComplex alpha,
-                              const cuDoubleComplex *A, int lda,
-                              const cuDoubleComplex *x, int incx,
-                              cuDoubleComplex beta, cuDoubleComplex *y,
-                              int incy) {
-  using FuncPtr = void(CUBLASWINAPI *)(
-      char, int, int, cuDoubleComplex, const cuDoubleComplex *, int,
-      const cuDoubleComplex *, int, cuDoubleComplex, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZhbmv");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasZhbmv");
-  return func_ptr(uplo, n, k, alpha, A, lda, x, incx, beta, y, incy);
-}
-
-void CUBLASWINAPI cublasSspmv(char uplo, int n, float alpha, const float *AP,
-                              const float *x, int incx, float beta, float *y,
-                              int incy) {
-  using FuncPtr = void(CUBLASWINAPI *)(char, int, float, const float *,
-                                       const float *, int, float, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSspmv");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasSspmv");
-  return func_ptr(uplo, n, alpha, AP, x, incx, beta, y, incy);
-}
-
-void CUBLASWINAPI cublasDspmv(char uplo, int n, double alpha, const double *AP,
-                              const double *x, int incx, double beta, double *y,
-                              int incy) {
-  using FuncPtr =
-      void(CUBLASWINAPI *)(char, int, double, const double *, const double *,
-                           int, double, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDspmv");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasDspmv");
-  return func_ptr(uplo, n, alpha, AP, x, incx, beta, y, incy);
-}
-
-void CUBLASWINAPI cublasChpmv(char uplo, int n, cuComplex alpha,
-                              const cuComplex *AP, const cuComplex *x, int incx,
-                              cuComplex beta, cuComplex *y, int incy) {
-  using FuncPtr =
-      void(CUBLASWINAPI *)(char, int, cuComplex, const cuComplex *,
-                           const cuComplex *, int, cuComplex, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasChpmv");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasChpmv");
-  return func_ptr(uplo, n, alpha, AP, x, incx, beta, y, incy);
-}
-
-void CUBLASWINAPI cublasZhpmv(char uplo, int n, cuDoubleComplex alpha,
-                              const cuDoubleComplex *AP,
-                              const cuDoubleComplex *x, int incx,
-                              cuDoubleComplex beta, cuDoubleComplex *y,
-                              int incy) {
-  using FuncPtr = void(CUBLASWINAPI *)(
-      char, int, cuDoubleComplex, const cuDoubleComplex *,
-      const cuDoubleComplex *, int, cuDoubleComplex, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZhpmv");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasZhpmv");
-  return func_ptr(uplo, n, alpha, AP, x, incx, beta, y, incy);
-}
-
-void CUBLASWINAPI cublasSger(int m, int n, float alpha, const float *x,
-                             int incx, const float *y, int incy, float *A,
-                             int lda) {
-  using FuncPtr = void(CUBLASWINAPI *)(int, int, float, const float *, int,
-                                       const float *, int, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSger");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasSger");
-  return func_ptr(m, n, alpha, x, incx, y, incy, A, lda);
-}
-
-void CUBLASWINAPI cublasDger(int m, int n, double alpha, const double *x,
-                             int incx, const double *y, int incy, double *A,
-                             int lda) {
-  using FuncPtr = void(CUBLASWINAPI *)(int, int, double, const double *, int,
-                                       const double *, int, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDger");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasDger");
-  return func_ptr(m, n, alpha, x, incx, y, incy, A, lda);
-}
-
-void CUBLASWINAPI cublasCgeru(int m, int n, cuComplex alpha, const cuComplex *x,
-                              int incx, const cuComplex *y, int incy,
-                              cuComplex *A, int lda) {
-  using FuncPtr =
-      void(CUBLASWINAPI *)(int, int, cuComplex, const cuComplex *, int,
-                           const cuComplex *, int, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCgeru");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasCgeru");
-  return func_ptr(m, n, alpha, x, incx, y, incy, A, lda);
-}
-
-void CUBLASWINAPI cublasCgerc(int m, int n, cuComplex alpha, const cuComplex *x,
-                              int incx, const cuComplex *y, int incy,
-                              cuComplex *A, int lda) {
-  using FuncPtr =
-      void(CUBLASWINAPI *)(int, int, cuComplex, const cuComplex *, int,
-                           const cuComplex *, int, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCgerc");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasCgerc");
-  return func_ptr(m, n, alpha, x, incx, y, incy, A, lda);
-}
-
-void CUBLASWINAPI cublasZgeru(int m, int n, cuDoubleComplex alpha,
-                              const cuDoubleComplex *x, int incx,
-                              const cuDoubleComplex *y, int incy,
-                              cuDoubleComplex *A, int lda) {
-  using FuncPtr = void(CUBLASWINAPI *)(
-      int, int, cuDoubleComplex, const cuDoubleComplex *, int,
-      const cuDoubleComplex *, int, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZgeru");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasZgeru");
-  return func_ptr(m, n, alpha, x, incx, y, incy, A, lda);
-}
-
-void CUBLASWINAPI cublasZgerc(int m, int n, cuDoubleComplex alpha,
-                              const cuDoubleComplex *x, int incx,
-                              const cuDoubleComplex *y, int incy,
-                              cuDoubleComplex *A, int lda) {
-  using FuncPtr = void(CUBLASWINAPI *)(
-      int, int, cuDoubleComplex, const cuDoubleComplex *, int,
-      const cuDoubleComplex *, int, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZgerc");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasZgerc");
-  return func_ptr(m, n, alpha, x, incx, y, incy, A, lda);
-}
-
-void CUBLASWINAPI cublasSsyr(char uplo, int n, float alpha, const float *x,
-                             int incx, float *A, int lda) {
-  using FuncPtr =
-      void(CUBLASWINAPI *)(char, int, float, const float *, int, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSsyr");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasSsyr");
-  return func_ptr(uplo, n, alpha, x, incx, A, lda);
-}
-
-void CUBLASWINAPI cublasDsyr(char uplo, int n, double alpha, const double *x,
-                             int incx, double *A, int lda) {
-  using FuncPtr = void(CUBLASWINAPI *)(char, int, double, const double *, int,
-                                       double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDsyr");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasDsyr");
-  return func_ptr(uplo, n, alpha, x, incx, A, lda);
-}
-
-void CUBLASWINAPI cublasCher(char uplo, int n, float alpha, const cuComplex *x,
-                             int incx, cuComplex *A, int lda) {
-  using FuncPtr = void(CUBLASWINAPI *)(char, int, float, const cuComplex *, int,
-                                       cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCher");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasCher");
-  return func_ptr(uplo, n, alpha, x, incx, A, lda);
-}
-
-void CUBLASWINAPI cublasZher(char uplo, int n, double alpha,
-                             const cuDoubleComplex *x, int incx,
-                             cuDoubleComplex *A, int lda) {
-  using FuncPtr = void(CUBLASWINAPI *)(
-      char, int, double, const cuDoubleComplex *, int, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZher");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasZher");
-  return func_ptr(uplo, n, alpha, x, incx, A, lda);
-}
-
-void CUBLASWINAPI cublasSspr(char uplo, int n, float alpha, const float *x,
-                             int incx, float *AP) {
-  using FuncPtr =
-      void(CUBLASWINAPI *)(char, int, float, const float *, int, float *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSspr");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasSspr");
-  return func_ptr(uplo, n, alpha, x, incx, AP);
-}
-
-void CUBLASWINAPI cublasDspr(char uplo, int n, double alpha, const double *x,
-                             int incx, double *AP) {
-  using FuncPtr =
-      void(CUBLASWINAPI *)(char, int, double, const double *, int, double *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDspr");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasDspr");
-  return func_ptr(uplo, n, alpha, x, incx, AP);
-}
-
-void CUBLASWINAPI cublasChpr(char uplo, int n, float alpha, const cuComplex *x,
-                             int incx, cuComplex *AP) {
-  using FuncPtr = void(CUBLASWINAPI *)(char, int, float, const cuComplex *, int,
-                                       cuComplex *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasChpr");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasChpr");
-  return func_ptr(uplo, n, alpha, x, incx, AP);
-}
-
-void CUBLASWINAPI cublasZhpr(char uplo, int n, double alpha,
-                             const cuDoubleComplex *x, int incx,
-                             cuDoubleComplex *AP) {
-  using FuncPtr = void(CUBLASWINAPI *)(
-      char, int, double, const cuDoubleComplex *, int, cuDoubleComplex *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZhpr");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasZhpr");
-  return func_ptr(uplo, n, alpha, x, incx, AP);
-}
-
-void CUBLASWINAPI cublasSsyr2(char uplo, int n, float alpha, const float *x,
-                              int incx, const float *y, int incy, float *A,
-                              int lda) {
-  using FuncPtr = void(CUBLASWINAPI *)(char, int, float, const float *, int,
-                                       const float *, int, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSsyr2");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasSsyr2");
-  return func_ptr(uplo, n, alpha, x, incx, y, incy, A, lda);
-}
-
-void CUBLASWINAPI cublasDsyr2(char uplo, int n, double alpha, const double *x,
-                              int incx, const double *y, int incy, double *A,
-                              int lda) {
-  using FuncPtr = void(CUBLASWINAPI *)(char, int, double, const double *, int,
-                                       const double *, int, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDsyr2");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasDsyr2");
-  return func_ptr(uplo, n, alpha, x, incx, y, incy, A, lda);
-}
-
-void CUBLASWINAPI cublasCher2(char uplo, int n, cuComplex alpha,
-                              const cuComplex *x, int incx, const cuComplex *y,
-                              int incy, cuComplex *A, int lda) {
-  using FuncPtr =
-      void(CUBLASWINAPI *)(char, int, cuComplex, const cuComplex *, int,
-                           const cuComplex *, int, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCher2");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasCher2");
-  return func_ptr(uplo, n, alpha, x, incx, y, incy, A, lda);
-}
-
-void CUBLASWINAPI cublasZher2(char uplo, int n, cuDoubleComplex alpha,
-                              const cuDoubleComplex *x, int incx,
-                              const cuDoubleComplex *y, int incy,
-                              cuDoubleComplex *A, int lda) {
-  using FuncPtr = void(CUBLASWINAPI *)(
-      char, int, cuDoubleComplex, const cuDoubleComplex *, int,
-      const cuDoubleComplex *, int, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZher2");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasZher2");
-  return func_ptr(uplo, n, alpha, x, incx, y, incy, A, lda);
-}
-
-void CUBLASWINAPI cublasSspr2(char uplo, int n, float alpha, const float *x,
-                              int incx, const float *y, int incy, float *AP) {
-  using FuncPtr = void(CUBLASWINAPI *)(char, int, float, const float *, int,
-                                       const float *, int, float *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSspr2");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasSspr2");
-  return func_ptr(uplo, n, alpha, x, incx, y, incy, AP);
-}
-
-void CUBLASWINAPI cublasDspr2(char uplo, int n, double alpha, const double *x,
-                              int incx, const double *y, int incy, double *AP) {
-  using FuncPtr = void(CUBLASWINAPI *)(char, int, double, const double *, int,
-                                       const double *, int, double *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDspr2");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasDspr2");
-  return func_ptr(uplo, n, alpha, x, incx, y, incy, AP);
-}
-
-void CUBLASWINAPI cublasChpr2(char uplo, int n, cuComplex alpha,
-                              const cuComplex *x, int incx, const cuComplex *y,
-                              int incy, cuComplex *AP) {
-  using FuncPtr =
-      void(CUBLASWINAPI *)(char, int, cuComplex, const cuComplex *, int,
-                           const cuComplex *, int, cuComplex *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasChpr2");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasChpr2");
-  return func_ptr(uplo, n, alpha, x, incx, y, incy, AP);
-}
-
-void CUBLASWINAPI cublasZhpr2(char uplo, int n, cuDoubleComplex alpha,
-                              const cuDoubleComplex *x, int incx,
-                              const cuDoubleComplex *y, int incy,
-                              cuDoubleComplex *AP) {
-  using FuncPtr = void(CUBLASWINAPI *)(
-      char, int, cuDoubleComplex, const cuDoubleComplex *, int,
-      const cuDoubleComplex *, int, cuDoubleComplex *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZhpr2");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasZhpr2");
-  return func_ptr(uplo, n, alpha, x, incx, y, incy, AP);
-}
-
-void CUBLASWINAPI cublasSgemm(char transa, char transb, int m, int n, int k,
-                              float alpha, const float *A, int lda,
-                              const float *B, int ldb, float beta, float *C,
-                              int ldc) {
-  using FuncPtr =
-      void(CUBLASWINAPI *)(char, char, int, int, int, float, const float *, int,
-                           const float *, int, float, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSgemm");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasSgemm");
-  return func_ptr(transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
-}
-
-void CUBLASWINAPI cublasDgemm(char transa, char transb, int m, int n, int k,
-                              double alpha, const double *A, int lda,
-                              const double *B, int ldb, double beta, double *C,
-                              int ldc) {
-  using FuncPtr =
-      void(CUBLASWINAPI *)(char, char, int, int, int, double, const double *,
-                           int, const double *, int, double, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDgemm");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasDgemm");
-  return func_ptr(transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
-}
-
-void CUBLASWINAPI cublasCgemm(char transa, char transb, int m, int n, int k,
-                              cuComplex alpha, const cuComplex *A, int lda,
-                              const cuComplex *B, int ldb, cuComplex beta,
-                              cuComplex *C, int ldc) {
-  using FuncPtr = void(CUBLASWINAPI *)(
-      char, char, int, int, int, cuComplex, const cuComplex *, int,
-      const cuComplex *, int, cuComplex, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCgemm");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasCgemm");
-  return func_ptr(transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
-}
-
-void CUBLASWINAPI cublasZgemm(char transa, char transb, int m, int n, int k,
-                              cuDoubleComplex alpha, const cuDoubleComplex *A,
-                              int lda, const cuDoubleComplex *B, int ldb,
-                              cuDoubleComplex beta, cuDoubleComplex *C,
-                              int ldc) {
-  using FuncPtr = void(CUBLASWINAPI *)(
-      char, char, int, int, int, cuDoubleComplex, const cuDoubleComplex *, int,
-      const cuDoubleComplex *, int, cuDoubleComplex, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZgemm");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasZgemm");
-  return func_ptr(transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
-}
-
-void CUBLASWINAPI cublasSsyrk(char uplo, char trans, int n, int k, float alpha,
-                              const float *A, int lda, float beta, float *C,
-                              int ldc) {
-  using FuncPtr = void(CUBLASWINAPI *)(char, char, int, int, float,
-                                       const float *, int, float, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSsyrk");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasSsyrk");
-  return func_ptr(uplo, trans, n, k, alpha, A, lda, beta, C, ldc);
-}
-
-void CUBLASWINAPI cublasDsyrk(char uplo, char trans, int n, int k, double alpha,
-                              const double *A, int lda, double beta, double *C,
-                              int ldc) {
-  using FuncPtr = void(CUBLASWINAPI *)(
-      char, char, int, int, double, const double *, int, double, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDsyrk");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasDsyrk");
-  return func_ptr(uplo, trans, n, k, alpha, A, lda, beta, C, ldc);
-}
-
-void CUBLASWINAPI cublasCsyrk(char uplo, char trans, int n, int k,
-                              cuComplex alpha, const cuComplex *A, int lda,
-                              cuComplex beta, cuComplex *C, int ldc) {
-  using FuncPtr =
-      void(CUBLASWINAPI *)(char, char, int, int, cuComplex, const cuComplex *,
-                           int, cuComplex, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCsyrk");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasCsyrk");
-  return func_ptr(uplo, trans, n, k, alpha, A, lda, beta, C, ldc);
-}
-
-void CUBLASWINAPI cublasZsyrk(char uplo, char trans, int n, int k,
-                              cuDoubleComplex alpha, const cuDoubleComplex *A,
-                              int lda, cuDoubleComplex beta, cuDoubleComplex *C,
-                              int ldc) {
-  using FuncPtr = void(CUBLASWINAPI *)(char, char, int, int, cuDoubleComplex,
-                                       const cuDoubleComplex *, int,
-                                       cuDoubleComplex, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZsyrk");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasZsyrk");
-  return func_ptr(uplo, trans, n, k, alpha, A, lda, beta, C, ldc);
-}
-
-void CUBLASWINAPI cublasCherk(char uplo, char trans, int n, int k, float alpha,
-                              const cuComplex *A, int lda, float beta,
-                              cuComplex *C, int ldc) {
-  using FuncPtr =
-      void(CUBLASWINAPI *)(char, char, int, int, float, const cuComplex *, int,
-                           float, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCherk");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasCherk");
-  return func_ptr(uplo, trans, n, k, alpha, A, lda, beta, C, ldc);
-}
-
-void CUBLASWINAPI cublasZherk(char uplo, char trans, int n, int k, double alpha,
-                              const cuDoubleComplex *A, int lda, double beta,
-                              cuDoubleComplex *C, int ldc) {
-  using FuncPtr = void(CUBLASWINAPI *)(char, char, int, int, double,
-                                       const cuDoubleComplex *, int, double,
-                                       cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZherk");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasZherk");
-  return func_ptr(uplo, trans, n, k, alpha, A, lda, beta, C, ldc);
-}
-
-void CUBLASWINAPI cublasSsyr2k(char uplo, char trans, int n, int k, float alpha,
-                               const float *A, int lda, const float *B, int ldb,
-                               float beta, float *C, int ldc) {
-  using FuncPtr =
-      void(CUBLASWINAPI *)(char, char, int, int, float, const float *, int,
-                           const float *, int, float, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSsyr2k");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasSsyr2k");
-  return func_ptr(uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
-}
-
-void CUBLASWINAPI cublasDsyr2k(char uplo, char trans, int n, int k,
-                               double alpha, const double *A, int lda,
-                               const double *B, int ldb, double beta, double *C,
-                               int ldc) {
-  using FuncPtr =
-      void(CUBLASWINAPI *)(char, char, int, int, double, const double *, int,
-                           const double *, int, double, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDsyr2k");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasDsyr2k");
-  return func_ptr(uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
-}
-
-void CUBLASWINAPI cublasCsyr2k(char uplo, char trans, int n, int k,
-                               cuComplex alpha, const cuComplex *A, int lda,
-                               const cuComplex *B, int ldb, cuComplex beta,
-                               cuComplex *C, int ldc) {
-  using FuncPtr = void(CUBLASWINAPI *)(
-      char, char, int, int, cuComplex, const cuComplex *, int,
-      const cuComplex *, int, cuComplex, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCsyr2k");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasCsyr2k");
-  return func_ptr(uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
-}
-
-void CUBLASWINAPI cublasZsyr2k(char uplo, char trans, int n, int k,
-                               cuDoubleComplex alpha, const cuDoubleComplex *A,
-                               int lda, const cuDoubleComplex *B, int ldb,
-                               cuDoubleComplex beta, cuDoubleComplex *C,
-                               int ldc) {
-  using FuncPtr = void(CUBLASWINAPI *)(
-      char, char, int, int, cuDoubleComplex, const cuDoubleComplex *, int,
-      const cuDoubleComplex *, int, cuDoubleComplex, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZsyr2k");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasZsyr2k");
-  return func_ptr(uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
-}
-
-void CUBLASWINAPI cublasCher2k(char uplo, char trans, int n, int k,
-                               cuComplex alpha, const cuComplex *A, int lda,
-                               const cuComplex *B, int ldb, float beta,
-                               cuComplex *C, int ldc) {
-  using FuncPtr = void(CUBLASWINAPI *)(
-      char, char, int, int, cuComplex, const cuComplex *, int,
-      const cuComplex *, int, float, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCher2k");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasCher2k");
-  return func_ptr(uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
-}
-
-void CUBLASWINAPI cublasZher2k(char uplo, char trans, int n, int k,
-                               cuDoubleComplex alpha, const cuDoubleComplex *A,
-                               int lda, const cuDoubleComplex *B, int ldb,
-                               double beta, cuDoubleComplex *C, int ldc) {
-  using FuncPtr = void(CUBLASWINAPI *)(
-      char, char, int, int, cuDoubleComplex, const cuDoubleComplex *, int,
-      const cuDoubleComplex *, int, double, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZher2k");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasZher2k");
-  return func_ptr(uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
-}
-
-void CUBLASWINAPI cublasSsymm(char side, char uplo, int m, int n, float alpha,
-                              const float *A, int lda, const float *B, int ldb,
-                              float beta, float *C, int ldc) {
-  using FuncPtr =
-      void(CUBLASWINAPI *)(char, char, int, int, float, const float *, int,
-                           const float *, int, float, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSsymm");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasSsymm");
-  return func_ptr(side, uplo, m, n, alpha, A, lda, B, ldb, beta, C, ldc);
-}
-
-void CUBLASWINAPI cublasDsymm(char side, char uplo, int m, int n, double alpha,
-                              const double *A, int lda, const double *B,
-                              int ldb, double beta, double *C, int ldc) {
-  using FuncPtr =
-      void(CUBLASWINAPI *)(char, char, int, int, double, const double *, int,
-                           const double *, int, double, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDsymm");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasDsymm");
-  return func_ptr(side, uplo, m, n, alpha, A, lda, B, ldb, beta, C, ldc);
-}
-
-void CUBLASWINAPI cublasCsymm(char side, char uplo, int m, int n,
-                              cuComplex alpha, const cuComplex *A, int lda,
-                              const cuComplex *B, int ldb, cuComplex beta,
-                              cuComplex *C, int ldc) {
-  using FuncPtr = void(CUBLASWINAPI *)(
-      char, char, int, int, cuComplex, const cuComplex *, int,
-      const cuComplex *, int, cuComplex, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCsymm");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasCsymm");
-  return func_ptr(side, uplo, m, n, alpha, A, lda, B, ldb, beta, C, ldc);
-}
-
-void CUBLASWINAPI cublasZsymm(char side, char uplo, int m, int n,
-                              cuDoubleComplex alpha, const cuDoubleComplex *A,
-                              int lda, const cuDoubleComplex *B, int ldb,
-                              cuDoubleComplex beta, cuDoubleComplex *C,
-                              int ldc) {
-  using FuncPtr = void(CUBLASWINAPI *)(
-      char, char, int, int, cuDoubleComplex, const cuDoubleComplex *, int,
-      const cuDoubleComplex *, int, cuDoubleComplex, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZsymm");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasZsymm");
-  return func_ptr(side, uplo, m, n, alpha, A, lda, B, ldb, beta, C, ldc);
-}
-
-void CUBLASWINAPI cublasChemm(char side, char uplo, int m, int n,
-                              cuComplex alpha, const cuComplex *A, int lda,
-                              const cuComplex *B, int ldb, cuComplex beta,
-                              cuComplex *C, int ldc) {
-  using FuncPtr = void(CUBLASWINAPI *)(
-      char, char, int, int, cuComplex, const cuComplex *, int,
-      const cuComplex *, int, cuComplex, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasChemm");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasChemm");
-  return func_ptr(side, uplo, m, n, alpha, A, lda, B, ldb, beta, C, ldc);
-}
-
-void CUBLASWINAPI cublasZhemm(char side, char uplo, int m, int n,
-                              cuDoubleComplex alpha, const cuDoubleComplex *A,
-                              int lda, const cuDoubleComplex *B, int ldb,
-                              cuDoubleComplex beta, cuDoubleComplex *C,
-                              int ldc) {
-  using FuncPtr = void(CUBLASWINAPI *)(
-      char, char, int, int, cuDoubleComplex, const cuDoubleComplex *, int,
-      const cuDoubleComplex *, int, cuDoubleComplex, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZhemm");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasZhemm");
-  return func_ptr(side, uplo, m, n, alpha, A, lda, B, ldb, beta, C, ldc);
-}
-
-void CUBLASWINAPI cublasStrsm(char side, char uplo, char transa, char diag,
-                              int m, int n, float alpha, const float *A,
-                              int lda, float *B, int ldb) {
-  using FuncPtr = void(CUBLASWINAPI *)(char, char, char, char, int, int, float,
-                                       const float *, int, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasStrsm");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasStrsm");
-  return func_ptr(side, uplo, transa, diag, m, n, alpha, A, lda, B, ldb);
-}
-
-void CUBLASWINAPI cublasDtrsm(char side, char uplo, char transa, char diag,
-                              int m, int n, double alpha, const double *A,
-                              int lda, double *B, int ldb) {
-  using FuncPtr = void(CUBLASWINAPI *)(char, char, char, char, int, int, double,
-                                       const double *, int, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDtrsm");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasDtrsm");
-  return func_ptr(side, uplo, transa, diag, m, n, alpha, A, lda, B, ldb);
-}
-
-void CUBLASWINAPI cublasCtrsm(char side, char uplo, char transa, char diag,
-                              int m, int n, cuComplex alpha, const cuComplex *A,
-                              int lda, cuComplex *B, int ldb) {
-  using FuncPtr =
-      void(CUBLASWINAPI *)(char, char, char, char, int, int, cuComplex,
-                           const cuComplex *, int, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCtrsm");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasCtrsm");
-  return func_ptr(side, uplo, transa, diag, m, n, alpha, A, lda, B, ldb);
-}
-
-void CUBLASWINAPI cublasZtrsm(char side, char uplo, char transa, char diag,
-                              int m, int n, cuDoubleComplex alpha,
-                              const cuDoubleComplex *A, int lda,
-                              cuDoubleComplex *B, int ldb) {
-  using FuncPtr = void(CUBLASWINAPI *)(char, char, char, char, int, int,
-                                       cuDoubleComplex, const cuDoubleComplex *,
-                                       int, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZtrsm");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasZtrsm");
-  return func_ptr(side, uplo, transa, diag, m, n, alpha, A, lda, B, ldb);
-}
-
-void CUBLASWINAPI cublasStrmm(char side, char uplo, char transa, char diag,
-                              int m, int n, float alpha, const float *A,
-                              int lda, float *B, int ldb) {
-  using FuncPtr = void(CUBLASWINAPI *)(char, char, char, char, int, int, float,
-                                       const float *, int, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasStrmm");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasStrmm");
-  return func_ptr(side, uplo, transa, diag, m, n, alpha, A, lda, B, ldb);
-}
-
-void CUBLASWINAPI cublasDtrmm(char side, char uplo, char transa, char diag,
-                              int m, int n, double alpha, const double *A,
-                              int lda, double *B, int ldb) {
-  using FuncPtr = void(CUBLASWINAPI *)(char, char, char, char, int, int, double,
-                                       const double *, int, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDtrmm");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasDtrmm");
-  return func_ptr(side, uplo, transa, diag, m, n, alpha, A, lda, B, ldb);
-}
-
-void CUBLASWINAPI cublasCtrmm(char side, char uplo, char transa, char diag,
-                              int m, int n, cuComplex alpha, const cuComplex *A,
-                              int lda, cuComplex *B, int ldb) {
-  using FuncPtr =
-      void(CUBLASWINAPI *)(char, char, char, char, int, int, cuComplex,
-                           const cuComplex *, int, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCtrmm");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasCtrmm");
-  return func_ptr(side, uplo, transa, diag, m, n, alpha, A, lda, B, ldb);
-}
-
-void CUBLASWINAPI cublasZtrmm(char side, char uplo, char transa, char diag,
-                              int m, int n, cuDoubleComplex alpha,
-                              const cuDoubleComplex *A, int lda,
-                              cuDoubleComplex *B, int ldb) {
-  using FuncPtr = void(CUBLASWINAPI *)(char, char, char, char, int, int,
-                                       cuDoubleComplex, const cuDoubleComplex *,
-                                       int, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZtrmm");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasZtrmm");
-  return func_ptr(side, uplo, transa, diag, m, n, alpha, A, lda, B, ldb);
-}
-
-}  // extern "C"
diff --git a/third_party/xla/third_party/tsl/tsl/cuda/cublas_10_1.inc b/third_party/xla/third_party/tsl/tsl/cuda/cublas_10_1.inc
deleted file mode 100644
index 067ba675288524..00000000000000
--- a/third_party/xla/third_party/tsl/tsl/cuda/cublas_10_1.inc
+++ /dev/null
@@ -1,5023 +0,0 @@
-// Auto-generated, do not edit.
-
-extern "C" {
-
-cublasStatus_t CUBLASWINAPI cublasCreate_v2(cublasHandle_t *handle) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCreate_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle);
-}
-
-cublasStatus_t CUBLASWINAPI cublasDestroy_v2(cublasHandle_t handle) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDestroy_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle);
-}
-
-cublasStatus_t CUBLASWINAPI cublasGetVersion_v2(cublasHandle_t handle,
-                                                int *version) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasGetVersion_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, version);
-}
-
-cublasStatus_t CUBLASWINAPI cublasGetProperty(libraryPropertyType type,
-                                              int *value) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(libraryPropertyType, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasGetProperty");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(type, value);
-}
-
-size_t CUBLASWINAPI cublasGetCudartVersion(void) {
-  using FuncPtr = size_t(CUBLASWINAPI *)();
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasGetCudartVersion");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasGetCudartVersion");
-  return func_ptr();
-}
-
-cublasStatus_t CUBLASWINAPI cublasSetStream_v2(cublasHandle_t handle,
-                                               cudaStream_t streamId) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSetStream_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, streamId);
-}
-
-cublasStatus_t CUBLASWINAPI cublasGetStream_v2(cublasHandle_t handle,
-                                               cudaStream_t *streamId) {
-  using FuncPtr =
-      cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, cudaStream_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasGetStream_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, streamId);
-}
-
-cublasStatus_t CUBLASWINAPI cublasGetPointerMode_v2(cublasHandle_t handle,
-                                                    cublasPointerMode_t *mode) {
-  using FuncPtr =
-      cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, cublasPointerMode_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasGetPointerMode_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, mode);
-}
-
-cublasStatus_t CUBLASWINAPI cublasSetPointerMode_v2(cublasHandle_t handle,
-                                                    cublasPointerMode_t mode) {
-  using FuncPtr =
-      cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, cublasPointerMode_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSetPointerMode_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, mode);
-}
-
-cublasStatus_t CUBLASWINAPI cublasGetAtomicsMode(cublasHandle_t handle,
-                                                 cublasAtomicsMode_t *mode) {
-  using FuncPtr =
-      cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, cublasAtomicsMode_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasGetAtomicsMode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, mode);
-}
-
-cublasStatus_t CUBLASWINAPI cublasSetAtomicsMode(cublasHandle_t handle,
-                                                 cublasAtomicsMode_t mode) {
-  using FuncPtr =
-      cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, cublasAtomicsMode_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSetAtomicsMode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, mode);
-}
-
-cublasStatus_t CUBLASWINAPI cublasGetMathMode(cublasHandle_t handle,
-                                              cublasMath_t *mode) {
-  using FuncPtr =
-      cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, cublasMath_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasGetMathMode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, mode);
-}
-
-cublasStatus_t CUBLASWINAPI cublasSetMathMode(cublasHandle_t handle,
-                                              cublasMath_t mode) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, cublasMath_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSetMathMode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, mode);
-}
-
-cublasStatus_t CUBLASWINAPI cublasLoggerConfigure(int logIsOn, int logToStdOut,
-                                                  int logToStdErr,
-                                                  const char *logFileName) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(int, int, int, const char *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasLoggerConfigure");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(logIsOn, logToStdOut, logToStdErr, logFileName);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasSetLoggerCallback(cublasLogCallback userCallback) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(cublasLogCallback);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSetLoggerCallback");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(userCallback);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasGetLoggerCallback(cublasLogCallback *userCallback) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(cublasLogCallback *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasGetLoggerCallback");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(userCallback);
-}
-
-cublasStatus_t CUBLASWINAPI cublasSetVector(int n, int elemSize, const void *x,
-                                            int incx, void *devicePtr,
-                                            int incy) {
-  using FuncPtr =
-      cublasStatus_t(CUBLASWINAPI *)(int, int, const void *, int, void *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSetVector");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(n, elemSize, x, incx, devicePtr, incy);
-}
-
-cublasStatus_t CUBLASWINAPI cublasGetVector(int n, int elemSize, const void *x,
-                                            int incx, void *y, int incy) {
-  using FuncPtr =
-      cublasStatus_t(CUBLASWINAPI *)(int, int, const void *, int, void *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasGetVector");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(n, elemSize, x, incx, y, incy);
-}
-
-cublasStatus_t CUBLASWINAPI cublasSetMatrix(int rows, int cols, int elemSize,
-                                            const void *A, int lda, void *B,
-                                            int ldb) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(int, int, int, const void *,
-                                                 int, void *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSetMatrix");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(rows, cols, elemSize, A, lda, B, ldb);
-}
-
-cublasStatus_t CUBLASWINAPI cublasGetMatrix(int rows, int cols, int elemSize,
-                                            const void *A, int lda, void *B,
-                                            int ldb) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(int, int, int, const void *,
-                                                 int, void *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasGetMatrix");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(rows, cols, elemSize, A, lda, B, ldb);
-}
-
-cublasStatus_t CUBLASWINAPI cublasSetVectorAsync(int n, int elemSize,
-                                                 const void *hostPtr, int incx,
-                                                 void *devicePtr, int incy,
-                                                 cudaStream_t stream) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(int, int, const void *, int,
-                                                 void *, int, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSetVectorAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(n, elemSize, hostPtr, incx, devicePtr, incy, stream);
-}
-
-cublasStatus_t CUBLASWINAPI cublasGetVectorAsync(int n, int elemSize,
-                                                 const void *devicePtr,
-                                                 int incx, void *hostPtr,
-                                                 int incy,
-                                                 cudaStream_t stream) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(int, int, const void *, int,
-                                                 void *, int, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasGetVectorAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(n, elemSize, devicePtr, incx, hostPtr, incy, stream);
-}
-
-cublasStatus_t CUBLASWINAPI cublasSetMatrixAsync(int rows, int cols,
-                                                 int elemSize, const void *A,
-                                                 int lda, void *B, int ldb,
-                                                 cudaStream_t stream) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      int, int, int, const void *, int, void *, int, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSetMatrixAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(rows, cols, elemSize, A, lda, B, ldb, stream);
-}
-
-cublasStatus_t CUBLASWINAPI cublasGetMatrixAsync(int rows, int cols,
-                                                 int elemSize, const void *A,
-                                                 int lda, void *B, int ldb,
-                                                 cudaStream_t stream) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      int, int, int, const void *, int, void *, int, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasGetMatrixAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(rows, cols, elemSize, A, lda, B, ldb, stream);
-}
-
-void CUBLASWINAPI cublasXerbla(const char *srName, int info) {
-  using FuncPtr = void(CUBLASWINAPI *)(const char *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasXerbla");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasXerbla");
-  return func_ptr(srName, info);
-}
-
-cublasStatus_t CUBLASWINAPI cublasNrm2Ex(cublasHandle_t handle, int n,
-                                         const void *x, cudaDataType xType,
-                                         int incx, void *result,
-                                         cudaDataType resultType,
-                                         cudaDataType executionType) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, const void *, cudaDataType, int, void *,
-      cudaDataType, cudaDataType);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasNrm2Ex");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, x, xType, incx, result, resultType, executionType);
-}
-
-cublasStatus_t CUBLASWINAPI cublasSnrm2_v2(cublasHandle_t handle, int n,
-                                           const float *x, int incx,
-                                           float *result) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, int,
-                                                 const float *, int, float *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSnrm2_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, x, incx, result);
-}
-
-cublasStatus_t CUBLASWINAPI cublasDnrm2_v2(cublasHandle_t handle, int n,
-                                           const double *x, int incx,
-                                           double *result) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, int,
-                                                 const double *, int, double *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDnrm2_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, x, incx, result);
-}
-
-cublasStatus_t CUBLASWINAPI cublasScnrm2_v2(cublasHandle_t handle, int n,
-                                            const cuComplex *x, int incx,
-                                            float *result) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, const cuComplex *, int, float *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasScnrm2_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, x, incx, result);
-}
-
-cublasStatus_t CUBLASWINAPI cublasDznrm2_v2(cublasHandle_t handle, int n,
-                                            const cuDoubleComplex *x, int incx,
-                                            double *result) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, const cuDoubleComplex *, int, double *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDznrm2_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, x, incx, result);
-}
-
-cublasStatus_t CUBLASWINAPI cublasDotEx(cublasHandle_t handle, int n,
-                                        const void *x, cudaDataType xType,
-                                        int incx, const void *y,
-                                        cudaDataType yType, int incy,
-                                        void *result, cudaDataType resultType,
-                                        cudaDataType executionType) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, const void *, cudaDataType, int, const void *,
-      cudaDataType, int, void *, cudaDataType, cudaDataType);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDotEx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, x, xType, incx, y, yType, incy, result, resultType,
-                  executionType);
-}
-
-cublasStatus_t CUBLASWINAPI cublasDotcEx(cublasHandle_t handle, int n,
-                                         const void *x, cudaDataType xType,
-                                         int incx, const void *y,
-                                         cudaDataType yType, int incy,
-                                         void *result, cudaDataType resultType,
-                                         cudaDataType executionType) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, const void *, cudaDataType, int, const void *,
-      cudaDataType, int, void *, cudaDataType, cudaDataType);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDotcEx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, x, xType, incx, y, yType, incy, result, resultType,
-                  executionType);
-}
-
-cublasStatus_t CUBLASWINAPI cublasSdot_v2(cublasHandle_t handle, int n,
-                                          const float *x, int incx,
-                                          const float *y, int incy,
-                                          float *result) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, const float *, int, const float *, int, float *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSdot_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, x, incx, y, incy, result);
-}
-
-cublasStatus_t CUBLASWINAPI cublasDdot_v2(cublasHandle_t handle, int n,
-                                          const double *x, int incx,
-                                          const double *y, int incy,
-                                          double *result) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, const double *, int, const double *, int, double *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDdot_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, x, incx, y, incy, result);
-}
-
-cublasStatus_t CUBLASWINAPI cublasCdotu_v2(cublasHandle_t handle, int n,
-                                           const cuComplex *x, int incx,
-                                           const cuComplex *y, int incy,
-                                           cuComplex *result) {
-  using FuncPtr =
-      cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, int, const cuComplex *,
-                                     int, const cuComplex *, int, cuComplex *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCdotu_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, x, incx, y, incy, result);
-}
-
-cublasStatus_t CUBLASWINAPI cublasCdotc_v2(cublasHandle_t handle, int n,
-                                           const cuComplex *x, int incx,
-                                           const cuComplex *y, int incy,
-                                           cuComplex *result) {
-  using FuncPtr =
-      cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, int, const cuComplex *,
-                                     int, const cuComplex *, int, cuComplex *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCdotc_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, x, incx, y, incy, result);
-}
-
-cublasStatus_t CUBLASWINAPI cublasZdotu_v2(cublasHandle_t handle, int n,
-                                           const cuDoubleComplex *x, int incx,
-                                           const cuDoubleComplex *y, int incy,
-                                           cuDoubleComplex *result) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, const cuDoubleComplex *, int,
-      const cuDoubleComplex *, int, cuDoubleComplex *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZdotu_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, x, incx, y, incy, result);
-}
-
-cublasStatus_t CUBLASWINAPI cublasZdotc_v2(cublasHandle_t handle, int n,
-                                           const cuDoubleComplex *x, int incx,
-                                           const cuDoubleComplex *y, int incy,
-                                           cuDoubleComplex *result) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, const cuDoubleComplex *, int,
-      const cuDoubleComplex *, int, cuDoubleComplex *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZdotc_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, x, incx, y, incy, result);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasScalEx(cublasHandle_t handle, int n,
-             const void *alpha, /* host or device pointer */
-             cudaDataType alphaType, void *x, cudaDataType xType, int incx,
-             cudaDataType executionType) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, const void *, cudaDataType, void *, cudaDataType,
-      int, cudaDataType);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasScalEx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, alpha, alphaType, x, xType, incx, executionType);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasSscal_v2(cublasHandle_t handle, int n,
-               const float *alpha, /* host or device pointer */
-               float *x, int incx) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, int,
-                                                 const float *, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSscal_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, alpha, x, incx);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasDscal_v2(cublasHandle_t handle, int n,
-               const double *alpha, /* host or device pointer */
-               double *x, int incx) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, int,
-                                                 const double *, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDscal_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, alpha, x, incx);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasCscal_v2(cublasHandle_t handle, int n,
-               const cuComplex *alpha, /* host or device pointer */
-               cuComplex *x, int incx) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, const cuComplex *, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCscal_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, alpha, x, incx);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasCsscal_v2(cublasHandle_t handle, int n,
-                const float *alpha, /* host or device pointer */
-                cuComplex *x, int incx) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, const float *, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCsscal_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, alpha, x, incx);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasZscal_v2(cublasHandle_t handle, int n,
-               const cuDoubleComplex *alpha, /* host or device pointer */
-               cuDoubleComplex *x, int incx) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, const cuDoubleComplex *, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZscal_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, alpha, x, incx);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasZdscal_v2(cublasHandle_t handle, int n,
-                const double *alpha, /* host or device pointer */
-                cuDoubleComplex *x, int incx) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, const double *, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZdscal_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, alpha, x, incx);
-}
-
-cublasStatus_t CUBLASWINAPI cublasAxpyEx(
-    cublasHandle_t handle, int n,
-    const void *alpha, /* host or device pointer */
-    cudaDataType alphaType, const void *x, cudaDataType xType, int incx,
-    void *y, cudaDataType yType, int incy, cudaDataType executiontype) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, const void *, cudaDataType, const void *,
-      cudaDataType, int, void *, cudaDataType, int, cudaDataType);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasAxpyEx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, alpha, alphaType, x, xType, incx, y, yType, incy,
-                  executiontype);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasSaxpy_v2(cublasHandle_t handle, int n,
-               const float *alpha, /* host or device pointer */
-               const float *x, int incx, float *y, int incy) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, const float *, const float *, int, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSaxpy_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, alpha, x, incx, y, incy);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasDaxpy_v2(cublasHandle_t handle, int n,
-               const double *alpha, /* host or device pointer */
-               const double *x, int incx, double *y, int incy) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, const double *, const double *, int, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDaxpy_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, alpha, x, incx, y, incy);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasCaxpy_v2(cublasHandle_t handle, int n,
-               const cuComplex *alpha, /* host or device pointer */
-               const cuComplex *x, int incx, cuComplex *y, int incy) {
-  using FuncPtr =
-      cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, int, const cuComplex *,
-                                     const cuComplex *, int, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCaxpy_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, alpha, x, incx, y, incy);
-}
-
-cublasStatus_t CUBLASWINAPI cublasZaxpy_v2(
-    cublasHandle_t handle, int n,
-    const cuDoubleComplex *alpha, /* host or device pointer */
-    const cuDoubleComplex *x, int incx, cuDoubleComplex *y, int incy) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, const cuDoubleComplex *, const cuDoubleComplex *,
-      int, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZaxpy_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, alpha, x, incx, y, incy);
-}
-
-cublasStatus_t CUBLASWINAPI cublasCopyEx(cublasHandle_t handle, int n,
-                                         const void *x, cudaDataType xType,
-                                         int incx, void *y, cudaDataType yType,
-                                         int incy) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, const void *, cudaDataType, int, void *,
-      cudaDataType, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCopyEx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, x, xType, incx, y, yType, incy);
-}
-
-cublasStatus_t CUBLASWINAPI cublasScopy_v2(cublasHandle_t handle, int n,
-                                           const float *x, int incx, float *y,
-                                           int incy) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, const float *, int, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasScopy_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, x, incx, y, incy);
-}
-
-cublasStatus_t CUBLASWINAPI cublasDcopy_v2(cublasHandle_t handle, int n,
-                                           const double *x, int incx, double *y,
-                                           int incy) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, const double *, int, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDcopy_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, x, incx, y, incy);
-}
-
-cublasStatus_t CUBLASWINAPI cublasCcopy_v2(cublasHandle_t handle, int n,
-                                           const cuComplex *x, int incx,
-                                           cuComplex *y, int incy) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, const cuComplex *, int, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCcopy_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, x, incx, y, incy);
-}
-
-cublasStatus_t CUBLASWINAPI cublasZcopy_v2(cublasHandle_t handle, int n,
-                                           const cuDoubleComplex *x, int incx,
-                                           cuDoubleComplex *y, int incy) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, int,
-                                                 const cuDoubleComplex *, int,
-                                                 cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZcopy_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, x, incx, y, incy);
-}
-
-cublasStatus_t CUBLASWINAPI cublasSswap_v2(cublasHandle_t handle, int n,
-                                           float *x, int incx, float *y,
-                                           int incy) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, int, float *,
-                                                 int, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSswap_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, x, incx, y, incy);
-}
-
-cublasStatus_t CUBLASWINAPI cublasDswap_v2(cublasHandle_t handle, int n,
-                                           double *x, int incx, double *y,
-                                           int incy) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, int, double *,
-                                                 int, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDswap_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, x, incx, y, incy);
-}
-
-cublasStatus_t CUBLASWINAPI cublasCswap_v2(cublasHandle_t handle, int n,
-                                           cuComplex *x, int incx, cuComplex *y,
-                                           int incy) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, cuComplex *, int, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCswap_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, x, incx, y, incy);
-}
-
-cublasStatus_t CUBLASWINAPI cublasZswap_v2(cublasHandle_t handle, int n,
-                                           cuDoubleComplex *x, int incx,
-                                           cuDoubleComplex *y, int incy) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, cuDoubleComplex *, int, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZswap_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, x, incx, y, incy);
-}
-
-cublasStatus_t CUBLASWINAPI cublasSwapEx(cublasHandle_t handle, int n, void *x,
-                                         cudaDataType xType, int incx, void *y,
-                                         cudaDataType yType, int incy) {
-  using FuncPtr =
-      cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, int, void *, cudaDataType,
-                                     int, void *, cudaDataType, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSwapEx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, x, xType, incx, y, yType, incy);
-}
-
-cublasStatus_t CUBLASWINAPI cublasIsamax_v2(cublasHandle_t handle, int n,
-                                            const float *x, int incx,
-                                            int *result) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, int,
-                                                 const float *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasIsamax_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, x, incx, result);
-}
-
-cublasStatus_t CUBLASWINAPI cublasIdamax_v2(cublasHandle_t handle, int n,
-                                            const double *x, int incx,
-                                            int *result) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, int,
-                                                 const double *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasIdamax_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, x, incx, result);
-}
-
-cublasStatus_t CUBLASWINAPI cublasIcamax_v2(cublasHandle_t handle, int n,
-                                            const cuComplex *x, int incx,
-                                            int *result) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, int,
-                                                 const cuComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasIcamax_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, x, incx, result);
-}
-
-cublasStatus_t CUBLASWINAPI cublasIzamax_v2(cublasHandle_t handle, int n,
-                                            const cuDoubleComplex *x, int incx,
-                                            int *result) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, const cuDoubleComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasIzamax_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, x, incx, result);
-}
-
-cublasStatus_t CUBLASWINAPI cublasIamaxEx(
-    cublasHandle_t handle, int n, const void *x, cudaDataType xType, int incx,
-    int *result /* host or device pointer */
-) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, const void *, cudaDataType, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasIamaxEx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, x, xType, incx, result);
-}
-
-cublasStatus_t CUBLASWINAPI cublasIsamin_v2(cublasHandle_t handle, int n,
-                                            const float *x, int incx,
-                                            int *result) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, int,
-                                                 const float *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasIsamin_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, x, incx, result);
-}
-
-cublasStatus_t CUBLASWINAPI cublasIdamin_v2(cublasHandle_t handle, int n,
-                                            const double *x, int incx,
-                                            int *result) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, int,
-                                                 const double *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasIdamin_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, x, incx, result);
-}
-
-cublasStatus_t CUBLASWINAPI cublasIcamin_v2(cublasHandle_t handle, int n,
-                                            const cuComplex *x, int incx,
-                                            int *result) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, int,
-                                                 const cuComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasIcamin_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, x, incx, result);
-}
-
-cublasStatus_t CUBLASWINAPI cublasIzamin_v2(cublasHandle_t handle, int n,
-                                            const cuDoubleComplex *x, int incx,
-                                            int *result) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, const cuDoubleComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasIzamin_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, x, incx, result);
-}
-
-cublasStatus_t CUBLASWINAPI cublasIaminEx(
-    cublasHandle_t handle, int n, const void *x, cudaDataType xType, int incx,
-    int *result /* host or device pointer */
-) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, const void *, cudaDataType, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasIaminEx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, x, xType, incx, result);
-}
-
-cublasStatus_t CUBLASWINAPI cublasAsumEx(
-    cublasHandle_t handle, int n, const void *x, cudaDataType xType, int incx,
-    void *result, cudaDataType resultType, /* host or device pointer */
-    cudaDataType executiontype) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, const void *, cudaDataType, int, void *,
-      cudaDataType, cudaDataType);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasAsumEx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, x, xType, incx, result, resultType, executiontype);
-}
-
-cublasStatus_t CUBLASWINAPI cublasSasum_v2(cublasHandle_t handle, int n,
-                                           const float *x, int incx,
-                                           float *result) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, int,
-                                                 const float *, int, float *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSasum_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, x, incx, result);
-}
-
-cublasStatus_t CUBLASWINAPI cublasDasum_v2(cublasHandle_t handle, int n,
-                                           const double *x, int incx,
-                                           double *result) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, int,
-                                                 const double *, int, double *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDasum_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, x, incx, result);
-}
-
-cublasStatus_t CUBLASWINAPI cublasScasum_v2(cublasHandle_t handle, int n,
-                                            const cuComplex *x, int incx,
-                                            float *result) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, const cuComplex *, int, float *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasScasum_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, x, incx, result);
-}
-
-cublasStatus_t CUBLASWINAPI cublasDzasum_v2(cublasHandle_t handle, int n,
-                                            const cuDoubleComplex *x, int incx,
-                                            double *result) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, const cuDoubleComplex *, int, double *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDzasum_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, x, incx, result);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasSrot_v2(cublasHandle_t handle, int n, float *x, int incx, float *y,
-              int incy, const float *c, /* host or device pointer */
-              const float *s) {
-  using FuncPtr =
-      cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, int, float *, int, float *,
-                                     int, const float *, const float *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSrot_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, x, incx, y, incy, c, s);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasDrot_v2(cublasHandle_t handle, int n, double *x, int incx, double *y,
-              int incy, const double *c, /* host or device pointer */
-              const double *s) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, double *, int, double *, int, const double *,
-      const double *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDrot_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, x, incx, y, incy, c, s);
-}
-
-cublasStatus_t CUBLASWINAPI cublasCrot_v2(
-    cublasHandle_t handle, int n, cuComplex *x, int incx, cuComplex *y,
-    int incy, const float *c, /* host or device pointer */
-    const cuComplex *s) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, cuComplex *, int, cuComplex *, int, const float *,
-      const cuComplex *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCrot_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, x, incx, y, incy, c, s);
-}
-
-cublasStatus_t CUBLASWINAPI cublasCsrot_v2(
-    cublasHandle_t handle, int n, cuComplex *x, int incx, cuComplex *y,
-    int incy, const float *c, /* host or device pointer */
-    const float *s) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, cuComplex *, int, cuComplex *, int, const float *,
-      const float *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCsrot_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, x, incx, y, incy, c, s);
-}
-
-cublasStatus_t CUBLASWINAPI cublasZrot_v2(
-    cublasHandle_t handle, int n, cuDoubleComplex *x, int incx,
-    cuDoubleComplex *y, int incy, const double *c, /* host or device pointer */
-    const cuDoubleComplex *s) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, cuDoubleComplex *, int, cuDoubleComplex *, int,
-      const double *, const cuDoubleComplex *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZrot_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, x, incx, y, incy, c, s);
-}
-
-cublasStatus_t CUBLASWINAPI cublasZdrot_v2(
-    cublasHandle_t handle, int n, cuDoubleComplex *x, int incx,
-    cuDoubleComplex *y, int incy, const double *c, /* host or device pointer */
-    const double *s) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, cuDoubleComplex *, int, cuDoubleComplex *, int,
-      const double *, const double *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZdrot_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, x, incx, y, incy, c, s);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasRotEx(cublasHandle_t handle, int n, void *x, cudaDataType xType, int incx,
-            void *y, cudaDataType yType, int incy,
-            const void *c, /* host or device pointer */
-            const void *s, cudaDataType csType, cudaDataType executiontype) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, void *, cudaDataType, int, void *, cudaDataType, int,
-      const void *, const void *, cudaDataType, cudaDataType);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasRotEx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, x, xType, incx, y, yType, incy, c, s, csType,
-                  executiontype);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasSrotg_v2(cublasHandle_t handle, float *a, /* host or device pointer */
-               float *b,                        /* host or device pointer */
-               float *c,                        /* host or device pointer */
-               float *s) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, float *,
-                                                 float *, float *, float *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSrotg_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, a, b, c, s);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasDrotg_v2(cublasHandle_t handle, double *a, /* host or device pointer */
-               double *b,                        /* host or device pointer */
-               double *c,                        /* host or device pointer */
-               double *s) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, double *,
-                                                 double *, double *, double *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDrotg_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, a, b, c, s);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasCrotg_v2(cublasHandle_t handle, cuComplex *a, /* host or device pointer */
-               cuComplex *b,                        /* host or device pointer */
-               float *c,                            /* host or device pointer */
-               cuComplex *s) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cuComplex *, cuComplex *, float *, cuComplex *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCrotg_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, a, b, c, s);
-}
-
-cublasStatus_t CUBLASWINAPI cublasZrotg_v2(
-    cublasHandle_t handle, cuDoubleComplex *a, /* host or device pointer */
-    cuDoubleComplex *b,                        /* host or device pointer */
-    double *c,                                 /* host or device pointer */
-    cuDoubleComplex *s) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cuDoubleComplex *, cuDoubleComplex *, double *,
-      cuDoubleComplex *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZrotg_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, a, b, c, s);
-}
-
-cublasStatus_t CUBLASWINAPI cublasRotgEx(cublasHandle_t handle,
-                                         void *a, /* host or device pointer */
-                                         void *b, /* host or device pointer */
-                                         cudaDataType abType,
-                                         void *c, /* host or device pointer */
-                                         void *s, /* host or device pointer */
-                                         cudaDataType csType,
-                                         cudaDataType executiontype) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, void *, void *,
-                                                 cudaDataType, void *, void *,
-                                                 cudaDataType, cudaDataType);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasRotgEx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, a, b, abType, c, s, csType, executiontype);
-}
-
-cublasStatus_t CUBLASWINAPI cublasSrotm_v2(cublasHandle_t handle, int n,
-                                           float *x, int incx, float *y,
-                                           int incy, const float *param) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, float *, int, float *, int, const float *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSrotm_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, x, incx, y, incy, param);
-}
-
-cublasStatus_t CUBLASWINAPI cublasDrotm_v2(cublasHandle_t handle, int n,
-                                           double *x, int incx, double *y,
-                                           int incy, const double *param) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, double *, int, double *, int, const double *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDrotm_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, x, incx, y, incy, param);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasRotmEx(cublasHandle_t handle, int n, void *x, cudaDataType xType,
-             int incx, void *y, cudaDataType yType, int incy,
-             const void *param, /* host or device pointer */
-             cudaDataType paramType, cudaDataType executiontype) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, void *, cudaDataType, int, void *, cudaDataType, int,
-      const void *, cudaDataType, cudaDataType);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasRotmEx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, x, xType, incx, y, yType, incy, param, paramType,
-                  executiontype);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasSrotmg_v2(cublasHandle_t handle, float *d1, /* host or device pointer */
-                float *d2,                        /* host or device pointer */
-                float *x1,                        /* host or device pointer */
-                const float *y1,                  /* host or device pointer */
-                float *param) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, float *, float *, float *, const float *, float *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSrotmg_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, d1, d2, x1, y1, param);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasDrotmg_v2(cublasHandle_t handle, double *d1, /* host or device pointer */
-                double *d2,                        /* host or device pointer */
-                double *x1,                        /* host or device pointer */
-                const double *y1,                  /* host or device pointer */
-                double *param) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, double *, double *, double *, const double *, double *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDrotmg_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, d1, d2, x1, y1, param);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasRotmgEx(cublasHandle_t handle, void *d1,     /* host or device pointer */
-              cudaDataType d1Type, void *d2,       /* host or device pointer */
-              cudaDataType d2Type, void *x1,       /* host or device pointer */
-              cudaDataType x1Type, const void *y1, /* host or device pointer */
-              cudaDataType y1Type, void *param,    /* host or device pointer */
-              cudaDataType paramType, cudaDataType executiontype) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, void *, cudaDataType, void *, cudaDataType, void *,
-      cudaDataType, const void *, cudaDataType, void *, cudaDataType,
-      cudaDataType);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasRotmgEx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, d1, d1Type, d2, d2Type, x1, x1Type, y1, y1Type, param,
-                  paramType, executiontype);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasSgemv_v2(cublasHandle_t handle, cublasOperation_t trans, int m, int n,
-               const float *alpha, /* host or device pointer */
-               const float *A, int lda, const float *x, int incx,
-               const float *beta, /* host or device pointer */
-               float *y, int incy) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasOperation_t, int, int, const float *, const float *,
-      int, const float *, int, const float *, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSgemv_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, trans, m, n, alpha, A, lda, x, incx, beta, y, incy);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasDgemv_v2(cublasHandle_t handle, cublasOperation_t trans, int m, int n,
-               const double *alpha, /* host or device pointer */
-               const double *A, int lda, const double *x, int incx,
-               const double *beta, /* host or device pointer */
-               double *y, int incy) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasOperation_t, int, int, const double *,
-      const double *, int, const double *, int, const double *, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDgemv_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, trans, m, n, alpha, A, lda, x, incx, beta, y, incy);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasCgemv_v2(cublasHandle_t handle, cublasOperation_t trans, int m, int n,
-               const cuComplex *alpha, /* host or device pointer */
-               const cuComplex *A, int lda, const cuComplex *x, int incx,
-               const cuComplex *beta, /* host or device pointer */
-               cuComplex *y, int incy) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasOperation_t, int, int, const cuComplex *,
-      const cuComplex *, int, const cuComplex *, int, const cuComplex *,
-      cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCgemv_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, trans, m, n, alpha, A, lda, x, incx, beta, y, incy);
-}
-
-cublasStatus_t CUBLASWINAPI cublasZgemv_v2(
-    cublasHandle_t handle, cublasOperation_t trans, int m, int n,
-    const cuDoubleComplex *alpha, /* host or device pointer */
-    const cuDoubleComplex *A, int lda, const cuDoubleComplex *x, int incx,
-    const cuDoubleComplex *beta, /* host or device pointer */
-    cuDoubleComplex *y, int incy) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasOperation_t, int, int, const cuDoubleComplex *,
-      const cuDoubleComplex *, int, const cuDoubleComplex *, int,
-      const cuDoubleComplex *, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZgemv_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, trans, m, n, alpha, A, lda, x, incx, beta, y, incy);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasSgbmv_v2(cublasHandle_t handle, cublasOperation_t trans, int m, int n,
-               int kl, int ku, const float *alpha, /* host or device pointer */
-               const float *A, int lda, const float *x, int incx,
-               const float *beta, /* host or device pointer */
-               float *y, int incy) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasOperation_t, int, int, int, int, const float *,
-      const float *, int, const float *, int, const float *, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSgbmv_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, trans, m, n, kl, ku, alpha, A, lda, x, incx, beta, y,
-                  incy);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasDgbmv_v2(cublasHandle_t handle, cublasOperation_t trans, int m, int n,
-               int kl, int ku, const double *alpha, /* host or device pointer */
-               const double *A, int lda, const double *x, int incx,
-               const double *beta, /* host or device pointer */
-               double *y, int incy) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasOperation_t, int, int, int, int, const double *,
-      const double *, int, const double *, int, const double *, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDgbmv_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, trans, m, n, kl, ku, alpha, A, lda, x, incx, beta, y,
-                  incy);
-}
-
-cublasStatus_t CUBLASWINAPI cublasCgbmv_v2(
-    cublasHandle_t handle, cublasOperation_t trans, int m, int n, int kl,
-    int ku, const cuComplex *alpha, /* host or device pointer */
-    const cuComplex *A, int lda, const cuComplex *x, int incx,
-    const cuComplex *beta, /* host or device pointer */
-    cuComplex *y, int incy) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasOperation_t, int, int, int, int, const cuComplex *,
-      const cuComplex *, int, const cuComplex *, int, const cuComplex *,
-      cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCgbmv_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, trans, m, n, kl, ku, alpha, A, lda, x, incx, beta, y,
-                  incy);
-}
-
-cublasStatus_t CUBLASWINAPI cublasZgbmv_v2(
-    cublasHandle_t handle, cublasOperation_t trans, int m, int n, int kl,
-    int ku, const cuDoubleComplex *alpha, /* host or device pointer */
-    const cuDoubleComplex *A, int lda, const cuDoubleComplex *x, int incx,
-    const cuDoubleComplex *beta, /* host or device pointer */
-    cuDoubleComplex *y, int incy) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasOperation_t, int, int, int, int,
-      const cuDoubleComplex *, const cuDoubleComplex *, int,
-      const cuDoubleComplex *, int, const cuDoubleComplex *, cuDoubleComplex *,
-      int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZgbmv_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, trans, m, n, kl, ku, alpha, A, lda, x, incx, beta, y,
-                  incy);
-}
-
-cublasStatus_t CUBLASWINAPI cublasStrmv_v2(
-    cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans,
-    cublasDiagType_t diag, int n, const float *A, int lda, float *x, int incx) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t,
-      int, const float *, int, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasStrmv_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, trans, diag, n, A, lda, x, incx);
-}
-
-cublasStatus_t CUBLASWINAPI cublasDtrmv_v2(cublasHandle_t handle,
-                                           cublasFillMode_t uplo,
-                                           cublasOperation_t trans,
-                                           cublasDiagType_t diag, int n,
-                                           const double *A, int lda, double *x,
-                                           int incx) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t,
-      int, const double *, int, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDtrmv_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, trans, diag, n, A, lda, x, incx);
-}
-
-cublasStatus_t CUBLASWINAPI cublasCtrmv_v2(cublasHandle_t handle,
-                                           cublasFillMode_t uplo,
-                                           cublasOperation_t trans,
-                                           cublasDiagType_t diag, int n,
-                                           const cuComplex *A, int lda,
-                                           cuComplex *x, int incx) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t,
-      int, const cuComplex *, int, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCtrmv_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, trans, diag, n, A, lda, x, incx);
-}
-
-cublasStatus_t CUBLASWINAPI cublasZtrmv_v2(cublasHandle_t handle,
-                                           cublasFillMode_t uplo,
-                                           cublasOperation_t trans,
-                                           cublasDiagType_t diag, int n,
-                                           const cuDoubleComplex *A, int lda,
-                                           cuDoubleComplex *x, int incx) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t,
-      int, const cuDoubleComplex *, int, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZtrmv_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, trans, diag, n, A, lda, x, incx);
-}
-
-cublasStatus_t CUBLASWINAPI cublasStbmv_v2(cublasHandle_t handle,
-                                           cublasFillMode_t uplo,
-                                           cublasOperation_t trans,
-                                           cublasDiagType_t diag, int n, int k,
-                                           const float *A, int lda, float *x,
-                                           int incx) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t,
-      int, int, const float *, int, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasStbmv_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, trans, diag, n, k, A, lda, x, incx);
-}
-
-cublasStatus_t CUBLASWINAPI cublasDtbmv_v2(cublasHandle_t handle,
-                                           cublasFillMode_t uplo,
-                                           cublasOperation_t trans,
-                                           cublasDiagType_t diag, int n, int k,
-                                           const double *A, int lda, double *x,
-                                           int incx) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t,
-      int, int, const double *, int, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDtbmv_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, trans, diag, n, k, A, lda, x, incx);
-}
-
-cublasStatus_t CUBLASWINAPI cublasCtbmv_v2(cublasHandle_t handle,
-                                           cublasFillMode_t uplo,
-                                           cublasOperation_t trans,
-                                           cublasDiagType_t diag, int n, int k,
-                                           const cuComplex *A, int lda,
-                                           cuComplex *x, int incx) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t,
-      int, int, const cuComplex *, int, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCtbmv_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, trans, diag, n, k, A, lda, x, incx);
-}
-
-cublasStatus_t CUBLASWINAPI cublasZtbmv_v2(cublasHandle_t handle,
-                                           cublasFillMode_t uplo,
-                                           cublasOperation_t trans,
-                                           cublasDiagType_t diag, int n, int k,
-                                           const cuDoubleComplex *A, int lda,
-                                           cuDoubleComplex *x, int incx) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t,
-      int, int, const cuDoubleComplex *, int, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZtbmv_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, trans, diag, n, k, A, lda, x, incx);
-}
-
-cublasStatus_t CUBLASWINAPI cublasStpmv_v2(
-    cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans,
-    cublasDiagType_t diag, int n, const float *AP, float *x, int incx) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t,
-      int, const float *, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasStpmv_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, trans, diag, n, AP, x, incx);
-}
-
-cublasStatus_t CUBLASWINAPI cublasDtpmv_v2(
-    cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans,
-    cublasDiagType_t diag, int n, const double *AP, double *x, int incx) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t,
-      int, const double *, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDtpmv_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, trans, diag, n, AP, x, incx);
-}
-
-cublasStatus_t CUBLASWINAPI cublasCtpmv_v2(
-    cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans,
-    cublasDiagType_t diag, int n, const cuComplex *AP, cuComplex *x, int incx) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t,
-      int, const cuComplex *, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCtpmv_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, trans, diag, n, AP, x, incx);
-}
-
-cublasStatus_t CUBLASWINAPI cublasZtpmv_v2(cublasHandle_t handle,
-                                           cublasFillMode_t uplo,
-                                           cublasOperation_t trans,
-                                           cublasDiagType_t diag, int n,
-                                           const cuDoubleComplex *AP,
-                                           cuDoubleComplex *x, int incx) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t,
-      int, const cuDoubleComplex *, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZtpmv_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, trans, diag, n, AP, x, incx);
-}
-
-cublasStatus_t CUBLASWINAPI cublasStrsv_v2(
-    cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans,
-    cublasDiagType_t diag, int n, const float *A, int lda, float *x, int incx) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t,
-      int, const float *, int, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasStrsv_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, trans, diag, n, A, lda, x, incx);
-}
-
-cublasStatus_t CUBLASWINAPI cublasDtrsv_v2(cublasHandle_t handle,
-                                           cublasFillMode_t uplo,
-                                           cublasOperation_t trans,
-                                           cublasDiagType_t diag, int n,
-                                           const double *A, int lda, double *x,
-                                           int incx) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t,
-      int, const double *, int, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDtrsv_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, trans, diag, n, A, lda, x, incx);
-}
-
-cublasStatus_t CUBLASWINAPI cublasCtrsv_v2(cublasHandle_t handle,
-                                           cublasFillMode_t uplo,
-                                           cublasOperation_t trans,
-                                           cublasDiagType_t diag, int n,
-                                           const cuComplex *A, int lda,
-                                           cuComplex *x, int incx) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t,
-      int, const cuComplex *, int, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCtrsv_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, trans, diag, n, A, lda, x, incx);
-}
-
-cublasStatus_t CUBLASWINAPI cublasZtrsv_v2(cublasHandle_t handle,
-                                           cublasFillMode_t uplo,
-                                           cublasOperation_t trans,
-                                           cublasDiagType_t diag, int n,
-                                           const cuDoubleComplex *A, int lda,
-                                           cuDoubleComplex *x, int incx) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t,
-      int, const cuDoubleComplex *, int, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZtrsv_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, trans, diag, n, A, lda, x, incx);
-}
-
-cublasStatus_t CUBLASWINAPI cublasStpsv_v2(
-    cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans,
-    cublasDiagType_t diag, int n, const float *AP, float *x, int incx) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t,
-      int, const float *, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasStpsv_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, trans, diag, n, AP, x, incx);
-}
-
-cublasStatus_t CUBLASWINAPI cublasDtpsv_v2(
-    cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans,
-    cublasDiagType_t diag, int n, const double *AP, double *x, int incx) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t,
-      int, const double *, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDtpsv_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, trans, diag, n, AP, x, incx);
-}
-
-cublasStatus_t CUBLASWINAPI cublasCtpsv_v2(
-    cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans,
-    cublasDiagType_t diag, int n, const cuComplex *AP, cuComplex *x, int incx) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t,
-      int, const cuComplex *, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCtpsv_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, trans, diag, n, AP, x, incx);
-}
-
-cublasStatus_t CUBLASWINAPI cublasZtpsv_v2(cublasHandle_t handle,
-                                           cublasFillMode_t uplo,
-                                           cublasOperation_t trans,
-                                           cublasDiagType_t diag, int n,
-                                           const cuDoubleComplex *AP,
-                                           cuDoubleComplex *x, int incx) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t,
-      int, const cuDoubleComplex *, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZtpsv_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, trans, diag, n, AP, x, incx);
-}
-
-cublasStatus_t CUBLASWINAPI cublasStbsv_v2(cublasHandle_t handle,
-                                           cublasFillMode_t uplo,
-                                           cublasOperation_t trans,
-                                           cublasDiagType_t diag, int n, int k,
-                                           const float *A, int lda, float *x,
-                                           int incx) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t,
-      int, int, const float *, int, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasStbsv_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, trans, diag, n, k, A, lda, x, incx);
-}
-
-cublasStatus_t CUBLASWINAPI cublasDtbsv_v2(cublasHandle_t handle,
-                                           cublasFillMode_t uplo,
-                                           cublasOperation_t trans,
-                                           cublasDiagType_t diag, int n, int k,
-                                           const double *A, int lda, double *x,
-                                           int incx) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t,
-      int, int, const double *, int, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDtbsv_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, trans, diag, n, k, A, lda, x, incx);
-}
-
-cublasStatus_t CUBLASWINAPI cublasCtbsv_v2(cublasHandle_t handle,
-                                           cublasFillMode_t uplo,
-                                           cublasOperation_t trans,
-                                           cublasDiagType_t diag, int n, int k,
-                                           const cuComplex *A, int lda,
-                                           cuComplex *x, int incx) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t,
-      int, int, const cuComplex *, int, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCtbsv_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, trans, diag, n, k, A, lda, x, incx);
-}
-
-cublasStatus_t CUBLASWINAPI cublasZtbsv_v2(cublasHandle_t handle,
-                                           cublasFillMode_t uplo,
-                                           cublasOperation_t trans,
-                                           cublasDiagType_t diag, int n, int k,
-                                           const cuDoubleComplex *A, int lda,
-                                           cuDoubleComplex *x, int incx) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t,
-      int, int, const cuDoubleComplex *, int, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZtbsv_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, trans, diag, n, k, A, lda, x, incx);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasSsymv_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n,
-               const float *alpha, /* host or device pointer */
-               const float *A, int lda, const float *x, int incx,
-               const float *beta, /* host or device pointer */
-               float *y, int incy) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, int, const float *, const float *, int,
-      const float *, int, const float *, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSsymv_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, alpha, A, lda, x, incx, beta, y, incy);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasDsymv_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n,
-               const double *alpha, /* host or device pointer */
-               const double *A, int lda, const double *x, int incx,
-               const double *beta, /* host or device pointer */
-               double *y, int incy) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, int, const double *, const double *,
-      int, const double *, int, const double *, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDsymv_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, alpha, A, lda, x, incx, beta, y, incy);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasCsymv_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n,
-               const cuComplex *alpha, /* host or device pointer */
-               const cuComplex *A, int lda, const cuComplex *x, int incx,
-               const cuComplex *beta, /* host or device pointer */
-               cuComplex *y, int incy) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, int, const cuComplex *,
-      const cuComplex *, int, const cuComplex *, int, const cuComplex *,
-      cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCsymv_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, alpha, A, lda, x, incx, beta, y, incy);
-}
-
-cublasStatus_t CUBLASWINAPI cublasZsymv_v2(
-    cublasHandle_t handle, cublasFillMode_t uplo, int n,
-    const cuDoubleComplex *alpha, /* host or device pointer */
-    const cuDoubleComplex *A, int lda, const cuDoubleComplex *x, int incx,
-    const cuDoubleComplex *beta, /* host or device pointer */
-    cuDoubleComplex *y, int incy) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, int, const cuDoubleComplex *,
-      const cuDoubleComplex *, int, const cuDoubleComplex *, int,
-      const cuDoubleComplex *, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZsymv_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, alpha, A, lda, x, incx, beta, y, incy);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasChemv_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n,
-               const cuComplex *alpha, /* host or device pointer */
-               const cuComplex *A, int lda, const cuComplex *x, int incx,
-               const cuComplex *beta, /* host or device pointer */
-               cuComplex *y, int incy) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, int, const cuComplex *,
-      const cuComplex *, int, const cuComplex *, int, const cuComplex *,
-      cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasChemv_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, alpha, A, lda, x, incx, beta, y, incy);
-}
-
-cublasStatus_t CUBLASWINAPI cublasZhemv_v2(
-    cublasHandle_t handle, cublasFillMode_t uplo, int n,
-    const cuDoubleComplex *alpha, /* host or device pointer */
-    const cuDoubleComplex *A, int lda, const cuDoubleComplex *x, int incx,
-    const cuDoubleComplex *beta, /* host or device pointer */
-    cuDoubleComplex *y, int incy) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, int, const cuDoubleComplex *,
-      const cuDoubleComplex *, int, const cuDoubleComplex *, int,
-      const cuDoubleComplex *, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZhemv_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, alpha, A, lda, x, incx, beta, y, incy);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasSsbmv_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n, int k,
-               const float *alpha, /* host or device pointer */
-               const float *A, int lda, const float *x, int incx,
-               const float *beta, /* host or device pointer */
-               float *y, int incy) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, int, int, const float *, const float *,
-      int, const float *, int, const float *, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSsbmv_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, k, alpha, A, lda, x, incx, beta, y, incy);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasDsbmv_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n, int k,
-               const double *alpha, /* host or device pointer */
-               const double *A, int lda, const double *x, int incx,
-               const double *beta, /* host or device pointer */
-               double *y, int incy) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, int, int, const double *,
-      const double *, int, const double *, int, const double *, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDsbmv_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, k, alpha, A, lda, x, incx, beta, y, incy);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasChbmv_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n, int k,
-               const cuComplex *alpha, /* host or device pointer */
-               const cuComplex *A, int lda, const cuComplex *x, int incx,
-               const cuComplex *beta, /* host or device pointer */
-               cuComplex *y, int incy) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, int, int, const cuComplex *,
-      const cuComplex *, int, const cuComplex *, int, const cuComplex *,
-      cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasChbmv_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, k, alpha, A, lda, x, incx, beta, y, incy);
-}
-
-cublasStatus_t CUBLASWINAPI cublasZhbmv_v2(
-    cublasHandle_t handle, cublasFillMode_t uplo, int n, int k,
-    const cuDoubleComplex *alpha, /* host or device pointer */
-    const cuDoubleComplex *A, int lda, const cuDoubleComplex *x, int incx,
-    const cuDoubleComplex *beta, /* host or device pointer */
-    cuDoubleComplex *y, int incy) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, int, int, const cuDoubleComplex *,
-      const cuDoubleComplex *, int, const cuDoubleComplex *, int,
-      const cuDoubleComplex *, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZhbmv_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, k, alpha, A, lda, x, incx, beta, y, incy);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasSspmv_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n,
-               const float *alpha, /* host or device pointer */
-               const float *AP, const float *x, int incx,
-               const float *beta, /* host or device pointer */
-               float *y, int incy) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, int, const float *, const float *,
-      const float *, int, const float *, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSspmv_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, alpha, AP, x, incx, beta, y, incy);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasDspmv_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n,
-               const double *alpha, /* host or device pointer */
-               const double *AP, const double *x, int incx,
-               const double *beta, /* host or device pointer */
-               double *y, int incy) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, int, const double *, const double *,
-      const double *, int, const double *, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDspmv_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, alpha, AP, x, incx, beta, y, incy);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasChpmv_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n,
-               const cuComplex *alpha, /* host or device pointer */
-               const cuComplex *AP, const cuComplex *x, int incx,
-               const cuComplex *beta, /* host or device pointer */
-               cuComplex *y, int incy) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, int, const cuComplex *,
-      const cuComplex *, const cuComplex *, int, const cuComplex *, cuComplex *,
-      int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasChpmv_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, alpha, AP, x, incx, beta, y, incy);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasZhpmv_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n,
-               const cuDoubleComplex *alpha, /* host or device pointer */
-               const cuDoubleComplex *AP, const cuDoubleComplex *x, int incx,
-               const cuDoubleComplex *beta, /* host or device pointer */
-               cuDoubleComplex *y, int incy) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, int, const cuDoubleComplex *,
-      const cuDoubleComplex *, const cuDoubleComplex *, int,
-      const cuDoubleComplex *, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZhpmv_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, alpha, AP, x, incx, beta, y, incy);
-}
-
-cublasStatus_t CUBLASWINAPI cublasSger_v2(
-    cublasHandle_t handle, int m, int n,
-    const float *alpha, /* host or device pointer */
-    const float *x, int incx, const float *y, int incy, float *A, int lda) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, int, const float *, const float *, int,
-      const float *, int, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSger_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, alpha, x, incx, y, incy, A, lda);
-}
-
-cublasStatus_t CUBLASWINAPI cublasDger_v2(
-    cublasHandle_t handle, int m, int n,
-    const double *alpha, /* host or device pointer */
-    const double *x, int incx, const double *y, int incy, double *A, int lda) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, int, const double *, const double *, int,
-      const double *, int, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDger_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, alpha, x, incx, y, incy, A, lda);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasCgeru_v2(cublasHandle_t handle, int m, int n,
-               const cuComplex *alpha, /* host or device pointer */
-               const cuComplex *x, int incx, const cuComplex *y, int incy,
-               cuComplex *A, int lda) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, int, const cuComplex *, const cuComplex *, int,
-      const cuComplex *, int, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCgeru_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, alpha, x, incx, y, incy, A, lda);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasCgerc_v2(cublasHandle_t handle, int m, int n,
-               const cuComplex *alpha, /* host or device pointer */
-               const cuComplex *x, int incx, const cuComplex *y, int incy,
-               cuComplex *A, int lda) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, int, const cuComplex *, const cuComplex *, int,
-      const cuComplex *, int, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCgerc_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, alpha, x, incx, y, incy, A, lda);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasZgeru_v2(cublasHandle_t handle, int m, int n,
-               const cuDoubleComplex *alpha, /* host or device pointer */
-               const cuDoubleComplex *x, int incx, const cuDoubleComplex *y,
-               int incy, cuDoubleComplex *A, int lda) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, int, const cuDoubleComplex *,
-      const cuDoubleComplex *, int, const cuDoubleComplex *, int,
-      cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZgeru_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, alpha, x, incx, y, incy, A, lda);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasZgerc_v2(cublasHandle_t handle, int m, int n,
-               const cuDoubleComplex *alpha, /* host or device pointer */
-               const cuDoubleComplex *x, int incx, const cuDoubleComplex *y,
-               int incy, cuDoubleComplex *A, int lda) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, int, const cuDoubleComplex *,
-      const cuDoubleComplex *, int, const cuDoubleComplex *, int,
-      cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZgerc_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, alpha, x, incx, y, incy, A, lda);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasSsyr_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n,
-              const float *alpha, /* host or device pointer */
-              const float *x, int incx, float *A, int lda) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, int, const float *, const float *, int,
-      float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSsyr_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, alpha, x, incx, A, lda);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasDsyr_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n,
-              const double *alpha, /* host or device pointer */
-              const double *x, int incx, double *A, int lda) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, int, const double *, const double *,
-      int, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDsyr_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, alpha, x, incx, A, lda);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasCsyr_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n,
-              const cuComplex *alpha, /* host or device pointer */
-              const cuComplex *x, int incx, cuComplex *A, int lda) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, int, const cuComplex *,
-      const cuComplex *, int, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCsyr_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, alpha, x, incx, A, lda);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasZsyr_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n,
-              const cuDoubleComplex *alpha, /* host or device pointer */
-              const cuDoubleComplex *x, int incx, cuDoubleComplex *A, int lda) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, int, const cuDoubleComplex *,
-      const cuDoubleComplex *, int, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZsyr_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, alpha, x, incx, A, lda);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasCher_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n,
-              const float *alpha, /* host or device pointer */
-              const cuComplex *x, int incx, cuComplex *A, int lda) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, int, const float *, const cuComplex *,
-      int, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCher_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, alpha, x, incx, A, lda);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasZher_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n,
-              const double *alpha, /* host or device pointer */
-              const cuDoubleComplex *x, int incx, cuDoubleComplex *A, int lda) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, int, const double *,
-      const cuDoubleComplex *, int, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZher_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, alpha, x, incx, A, lda);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasSspr_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n,
-              const float *alpha, /* host or device pointer */
-              const float *x, int incx, float *AP) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, int, const float *, const float *, int,
-      float *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSspr_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, alpha, x, incx, AP);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasDspr_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n,
-              const double *alpha, /* host or device pointer */
-              const double *x, int incx, double *AP) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, int, const double *, const double *,
-      int, double *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDspr_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, alpha, x, incx, AP);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasChpr_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n,
-              const float *alpha, /* host or device pointer */
-              const cuComplex *x, int incx, cuComplex *AP) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, int, const float *, const cuComplex *,
-      int, cuComplex *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasChpr_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, alpha, x, incx, AP);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasZhpr_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n,
-              const double *alpha, /* host or device pointer */
-              const cuDoubleComplex *x, int incx, cuDoubleComplex *AP) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, int, const double *,
-      const cuDoubleComplex *, int, cuDoubleComplex *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZhpr_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, alpha, x, incx, AP);
-}
-
-cublasStatus_t CUBLASWINAPI cublasSsyr2_v2(
-    cublasHandle_t handle, cublasFillMode_t uplo, int n,
-    const float *alpha, /* host or device pointer */
-    const float *x, int incx, const float *y, int incy, float *A, int lda) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, int, const float *, const float *, int,
-      const float *, int, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSsyr2_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, alpha, x, incx, y, incy, A, lda);
-}
-
-cublasStatus_t CUBLASWINAPI cublasDsyr2_v2(
-    cublasHandle_t handle, cublasFillMode_t uplo, int n,
-    const double *alpha, /* host or device pointer */
-    const double *x, int incx, const double *y, int incy, double *A, int lda) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, int, const double *, const double *,
-      int, const double *, int, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDsyr2_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, alpha, x, incx, y, incy, A, lda);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasCsyr2_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n,
-               const cuComplex *alpha, /* host or device pointer */
-               const cuComplex *x, int incx, const cuComplex *y, int incy,
-               cuComplex *A, int lda) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, int, const cuComplex *,
-      const cuComplex *, int, const cuComplex *, int, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCsyr2_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, alpha, x, incx, y, incy, A, lda);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasZsyr2_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n,
-               const cuDoubleComplex *alpha, /* host or device pointer */
-               const cuDoubleComplex *x, int incx, const cuDoubleComplex *y,
-               int incy, cuDoubleComplex *A, int lda) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, int, const cuDoubleComplex *,
-      const cuDoubleComplex *, int, const cuDoubleComplex *, int,
-      cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZsyr2_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, alpha, x, incx, y, incy, A, lda);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasCher2_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n,
-               const cuComplex *alpha, /* host or device pointer */
-               const cuComplex *x, int incx, const cuComplex *y, int incy,
-               cuComplex *A, int lda) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, int, const cuComplex *,
-      const cuComplex *, int, const cuComplex *, int, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCher2_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, alpha, x, incx, y, incy, A, lda);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasZher2_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n,
-               const cuDoubleComplex *alpha, /* host or device pointer */
-               const cuDoubleComplex *x, int incx, const cuDoubleComplex *y,
-               int incy, cuDoubleComplex *A, int lda) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, int, const cuDoubleComplex *,
-      const cuDoubleComplex *, int, const cuDoubleComplex *, int,
-      cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZher2_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, alpha, x, incx, y, incy, A, lda);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasSspr2_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n,
-               const float *alpha, /* host or device pointer */
-               const float *x, int incx, const float *y, int incy, float *AP) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, int, const float *, const float *, int,
-      const float *, int, float *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSspr2_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, alpha, x, incx, y, incy, AP);
-}
-
-cublasStatus_t CUBLASWINAPI cublasDspr2_v2(
-    cublasHandle_t handle, cublasFillMode_t uplo, int n,
-    const double *alpha, /* host or device pointer */
-    const double *x, int incx, const double *y, int incy, double *AP) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, int, const double *, const double *,
-      int, const double *, int, double *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDspr2_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, alpha, x, incx, y, incy, AP);
-}
-
-cublasStatus_t CUBLASWINAPI cublasChpr2_v2(
-    cublasHandle_t handle, cublasFillMode_t uplo, int n,
-    const cuComplex *alpha, /* host or device pointer */
-    const cuComplex *x, int incx, const cuComplex *y, int incy, cuComplex *AP) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, int, const cuComplex *,
-      const cuComplex *, int, const cuComplex *, int, cuComplex *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasChpr2_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, alpha, x, incx, y, incy, AP);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasZhpr2_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n,
-               const cuDoubleComplex *alpha, /* host or device pointer */
-               const cuDoubleComplex *x, int incx, const cuDoubleComplex *y,
-               int incy, cuDoubleComplex *AP) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, int, const cuDoubleComplex *,
-      const cuDoubleComplex *, int, const cuDoubleComplex *, int,
-      cuDoubleComplex *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZhpr2_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, alpha, x, incx, y, incy, AP);
-}
-
-cublasStatus_t CUBLASWINAPI cublasSgemm_v2(
-    cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
-    int m, int n, int k, const float *alpha, /* host or device pointer */
-    const float *A, int lda, const float *B, int ldb,
-    const float *beta, /* host or device pointer */
-    float *C, int ldc) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int,
-      const float *, const float *, int, const float *, int, const float *,
-      float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSgemm_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta,
-                  C, ldc);
-}
-
-cublasStatus_t CUBLASWINAPI cublasDgemm_v2(
-    cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
-    int m, int n, int k, const double *alpha, /* host or device pointer */
-    const double *A, int lda, const double *B, int ldb,
-    const double *beta, /* host or device pointer */
-    double *C, int ldc) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int,
-      const double *, const double *, int, const double *, int, const double *,
-      double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDgemm_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta,
-                  C, ldc);
-}
-
-cublasStatus_t CUBLASWINAPI cublasCgemm_v2(
-    cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
-    int m, int n, int k, const cuComplex *alpha, /* host or device pointer */
-    const cuComplex *A, int lda, const cuComplex *B, int ldb,
-    const cuComplex *beta, /* host or device pointer */
-    cuComplex *C, int ldc) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int,
-      const cuComplex *, const cuComplex *, int, const cuComplex *, int,
-      const cuComplex *, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCgemm_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta,
-                  C, ldc);
-}
-
-cublasStatus_t CUBLASWINAPI cublasCgemm3m(
-    cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
-    int m, int n, int k, const cuComplex *alpha, /* host or device pointer */
-    const cuComplex *A, int lda, const cuComplex *B, int ldb,
-    const cuComplex *beta, /* host or device pointer */
-    cuComplex *C, int ldc) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int,
-      const cuComplex *, const cuComplex *, int, const cuComplex *, int,
-      const cuComplex *, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCgemm3m");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta,
-                  C, ldc);
-}
-
-cublasStatus_t CUBLASWINAPI cublasCgemm3mEx(
-    cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
-    int m, int n, int k, const cuComplex *alpha, const void *A,
-    cudaDataType Atype, int lda, const void *B, cudaDataType Btype, int ldb,
-    const cuComplex *beta, void *C, cudaDataType Ctype, int ldc) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int,
-      const cuComplex *, const void *, cudaDataType, int, const void *,
-      cudaDataType, int, const cuComplex *, void *, cudaDataType, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCgemm3mEx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transa, transb, m, n, k, alpha, A, Atype, lda, B,
-                  Btype, ldb, beta, C, Ctype, ldc);
-}
-
-cublasStatus_t CUBLASWINAPI cublasZgemm_v2(
-    cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
-    int m, int n, int k,
-    const cuDoubleComplex *alpha, /* host or device pointer */
-    const cuDoubleComplex *A, int lda, const cuDoubleComplex *B, int ldb,
-    const cuDoubleComplex *beta, /* host or device pointer */
-    cuDoubleComplex *C, int ldc) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int,
-      const cuDoubleComplex *, const cuDoubleComplex *, int,
-      const cuDoubleComplex *, int, const cuDoubleComplex *, cuDoubleComplex *,
-      int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZgemm_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta,
-                  C, ldc);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasZgemm3m(cublasHandle_t handle, cublasOperation_t transa,
-              cublasOperation_t transb, int m, int n, int k,
-              const cuDoubleComplex *alpha, /* host or device pointer */
-              const cuDoubleComplex *A, int lda, const cuDoubleComplex *B,
-              int ldb, const cuDoubleComplex *beta, /* host or device pointer */
-              cuDoubleComplex *C, int ldc) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int,
-      const cuDoubleComplex *, const cuDoubleComplex *, int,
-      const cuDoubleComplex *, int, const cuDoubleComplex *, cuDoubleComplex *,
-      int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZgemm3m");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta,
-                  C, ldc);
-}
-
-cublasStatus_t CUBLASWINAPI cublasSgemmEx(
-    cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
-    int m, int n, int k, const float *alpha, /* host or device pointer */
-    const void *A, cudaDataType Atype, int lda, const void *B,
-    cudaDataType Btype, int ldb, const float *beta, /* host or device pointer */
-    void *C, cudaDataType Ctype, int ldc) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int,
-      const float *, const void *, cudaDataType, int, const void *,
-      cudaDataType, int, const float *, void *, cudaDataType, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSgemmEx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transa, transb, m, n, k, alpha, A, Atype, lda, B,
-                  Btype, ldb, beta, C, Ctype, ldc);
-}
-
-cublasStatus_t CUBLASWINAPI cublasGemmEx(
-    cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
-    int m, int n, int k, const void *alpha, /* host or device pointer */
-    const void *A, cudaDataType Atype, int lda, const void *B,
-    cudaDataType Btype, int ldb, const void *beta, /* host or device pointer */
-    void *C, cudaDataType Ctype, int ldc, cudaDataType computeType,
-    cublasGemmAlgo_t algo) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int,
-      const void *, const void *, cudaDataType, int, const void *, cudaDataType,
-      int, const void *, void *, cudaDataType, int, cudaDataType,
-      cublasGemmAlgo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasGemmEx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transa, transb, m, n, k, alpha, A, Atype, lda, B,
-                  Btype, ldb, beta, C, Ctype, ldc, computeType, algo);
-}
-
-cublasStatus_t CUBLASWINAPI cublasCgemmEx(
-    cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
-    int m, int n, int k, const cuComplex *alpha, const void *A,
-    cudaDataType Atype, int lda, const void *B, cudaDataType Btype, int ldb,
-    const cuComplex *beta, void *C, cudaDataType Ctype, int ldc) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int,
-      const cuComplex *, const void *, cudaDataType, int, const void *,
-      cudaDataType, int, const cuComplex *, void *, cudaDataType, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCgemmEx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transa, transb, m, n, k, alpha, A, Atype, lda, B,
-                  Btype, ldb, beta, C, Ctype, ldc);
-}
-
-cublasStatus_t CUBLASWINAPI cublasUint8gemmBias(
-    cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
-    cublasOperation_t transc, int m, int n, int k, const unsigned char *A,
-    int A_bias, int lda, const unsigned char *B, int B_bias, int ldb,
-    unsigned char *C, int C_bias, int ldc, int C_mult, int C_shift) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasOperation_t, cublasOperation_t, cublasOperation_t,
-      int, int, int, const unsigned char *, int, int, const unsigned char *,
-      int, int, unsigned char *, int, int, int, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasUint8gemmBias");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transa, transb, transc, m, n, k, A, A_bias, lda, B,
-                  B_bias, ldb, C, C_bias, ldc, C_mult, C_shift);
-}
-
-cublasStatus_t CUBLASWINAPI cublasSsyrk_v2(
-    cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans,
-    int n, int k, const float *alpha,           /* host or device pointer */
-    const float *A, int lda, const float *beta, /* host or device pointer */
-    float *C, int ldc) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int,
-      const float *, const float *, int, const float *, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSsyrk_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, trans, n, k, alpha, A, lda, beta, C, ldc);
-}
-
-cublasStatus_t CUBLASWINAPI cublasDsyrk_v2(
-    cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans,
-    int n, int k, const double *alpha,            /* host or device pointer */
-    const double *A, int lda, const double *beta, /* host or device pointer */
-    double *C, int ldc) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int,
-      const double *, const double *, int, const double *, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDsyrk_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, trans, n, k, alpha, A, lda, beta, C, ldc);
-}
-
-cublasStatus_t CUBLASWINAPI cublasCsyrk_v2(
-    cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans,
-    int n, int k, const cuComplex *alpha, /* host or device pointer */
-    const cuComplex *A, int lda,
-    const cuComplex *beta, /* host or device pointer */
-    cuComplex *C, int ldc) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int,
-      const cuComplex *, const cuComplex *, int, const cuComplex *, cuComplex *,
-      int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCsyrk_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, trans, n, k, alpha, A, lda, beta, C, ldc);
-}
-
-cublasStatus_t CUBLASWINAPI cublasZsyrk_v2(
-    cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans,
-    int n, int k, const cuDoubleComplex *alpha, /* host or device pointer */
-    const cuDoubleComplex *A, int lda,
-    const cuDoubleComplex *beta, /* host or device pointer */
-    cuDoubleComplex *C, int ldc) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int,
-      const cuDoubleComplex *, const cuDoubleComplex *, int,
-      const cuDoubleComplex *, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZsyrk_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, trans, n, k, alpha, A, lda, beta, C, ldc);
-}
-
-cublasStatus_t CUBLASWINAPI cublasCsyrkEx(
-    cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans,
-    int n, int k, const cuComplex *alpha, /* host or device pointer */
-    const void *A, cudaDataType Atype, int lda,
-    const cuComplex *beta, /* host or device pointer */
-    void *C, cudaDataType Ctype, int ldc) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int,
-      const cuComplex *, const void *, cudaDataType, int, const cuComplex *,
-      void *, cudaDataType, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCsyrkEx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, trans, n, k, alpha, A, Atype, lda, beta, C,
-                  Ctype, ldc);
-}
-
-cublasStatus_t CUBLASWINAPI cublasCsyrk3mEx(
-    cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans,
-    int n, int k, const cuComplex *alpha, const void *A, cudaDataType Atype,
-    int lda, const cuComplex *beta, void *C, cudaDataType Ctype, int ldc) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int,
-      const cuComplex *, const void *, cudaDataType, int, const cuComplex *,
-      void *, cudaDataType, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCsyrk3mEx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, trans, n, k, alpha, A, Atype, lda, beta, C,
-                  Ctype, ldc);
-}
-
-cublasStatus_t CUBLASWINAPI cublasCherk_v2(
-    cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans,
-    int n, int k, const float *alpha,               /* host or device pointer */
-    const cuComplex *A, int lda, const float *beta, /* host or device pointer */
-    cuComplex *C, int ldc) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int,
-      const float *, const cuComplex *, int, const float *, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCherk_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, trans, n, k, alpha, A, lda, beta, C, ldc);
-}
-
-cublasStatus_t CUBLASWINAPI cublasZherk_v2(
-    cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans,
-    int n, int k, const double *alpha, /* host or device pointer */
-    const cuDoubleComplex *A, int lda,
-    const double *beta, /* host or device pointer */
-    cuDoubleComplex *C, int ldc) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int,
-      const double *, const cuDoubleComplex *, int, const double *,
-      cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZherk_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, trans, n, k, alpha, A, lda, beta, C, ldc);
-}
-
-cublasStatus_t CUBLASWINAPI cublasCherkEx(
-    cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans,
-    int n, int k, const float *alpha, /* host or device pointer */
-    const void *A, cudaDataType Atype, int lda,
-    const float *beta, /* host or device pointer */
-    void *C, cudaDataType Ctype, int ldc) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int,
-      const float *, const void *, cudaDataType, int, const float *, void *,
-      cudaDataType, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCherkEx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, trans, n, k, alpha, A, Atype, lda, beta, C,
-                  Ctype, ldc);
-}
-
-cublasStatus_t CUBLASWINAPI cublasCherk3mEx(
-    cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans,
-    int n, int k, const float *alpha, const void *A, cudaDataType Atype,
-    int lda, const float *beta, void *C, cudaDataType Ctype, int ldc) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int,
-      const float *, const void *, cudaDataType, int, const float *, void *,
-      cudaDataType, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCherk3mEx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, trans, n, k, alpha, A, Atype, lda, beta, C,
-                  Ctype, ldc);
-}
-
-cublasStatus_t CUBLASWINAPI cublasSsyr2k_v2(
-    cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans,
-    int n, int k, const float *alpha, /* host or device pointer */
-    const float *A, int lda, const float *B, int ldb,
-    const float *beta, /* host or device pointer */
-    float *C, int ldc) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int,
-      const float *, const float *, int, const float *, int, const float *,
-      float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSsyr2k_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C,
-                  ldc);
-}
-
-cublasStatus_t CUBLASWINAPI cublasDsyr2k_v2(
-    cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans,
-    int n, int k, const double *alpha, /* host or device pointer */
-    const double *A, int lda, const double *B, int ldb,
-    const double *beta, /* host or device pointer */
-    double *C, int ldc) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int,
-      const double *, const double *, int, const double *, int, const double *,
-      double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDsyr2k_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C,
-                  ldc);
-}
-
-cublasStatus_t CUBLASWINAPI cublasCsyr2k_v2(
-    cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans,
-    int n, int k, const cuComplex *alpha, /* host or device pointer */
-    const cuComplex *A, int lda, const cuComplex *B, int ldb,
-    const cuComplex *beta, /* host or device pointer */
-    cuComplex *C, int ldc) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int,
-      const cuComplex *, const cuComplex *, int, const cuComplex *, int,
-      const cuComplex *, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCsyr2k_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C,
-                  ldc);
-}
-
-cublasStatus_t CUBLASWINAPI cublasZsyr2k_v2(
-    cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans,
-    int n, int k, const cuDoubleComplex *alpha, /* host or device pointer */
-    const cuDoubleComplex *A, int lda, const cuDoubleComplex *B, int ldb,
-    const cuDoubleComplex *beta, /* host or device pointer */
-    cuDoubleComplex *C, int ldc) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int,
-      const cuDoubleComplex *, const cuDoubleComplex *, int,
-      const cuDoubleComplex *, int, const cuDoubleComplex *, cuDoubleComplex *,
-      int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZsyr2k_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C,
-                  ldc);
-}
-
-cublasStatus_t CUBLASWINAPI cublasCher2k_v2(
-    cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans,
-    int n, int k, const cuComplex *alpha, /* host or device pointer */
-    const cuComplex *A, int lda, const cuComplex *B, int ldb,
-    const float *beta, /* host or device pointer */
-    cuComplex *C, int ldc) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int,
-      const cuComplex *, const cuComplex *, int, const cuComplex *, int,
-      const float *, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCher2k_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C,
-                  ldc);
-}
-
-cublasStatus_t CUBLASWINAPI cublasZher2k_v2(
-    cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans,
-    int n, int k, const cuDoubleComplex *alpha, /* host or device pointer */
-    const cuDoubleComplex *A, int lda, const cuDoubleComplex *B, int ldb,
-    const double *beta, /* host or device pointer */
-    cuDoubleComplex *C, int ldc) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int,
-      const cuDoubleComplex *, const cuDoubleComplex *, int,
-      const cuDoubleComplex *, int, const double *, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZher2k_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C,
-                  ldc);
-}
-
-cublasStatus_t CUBLASWINAPI cublasSsyrkx(
-    cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans,
-    int n, int k, const float *alpha, /* host or device pointer */
-    const float *A, int lda, const float *B, int ldb,
-    const float *beta, /* host or device pointer */
-    float *C, int ldc) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int,
-      const float *, const float *, int, const float *, int, const float *,
-      float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSsyrkx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C,
-                  ldc);
-}
-
-cublasStatus_t CUBLASWINAPI cublasDsyrkx(
-    cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans,
-    int n, int k, const double *alpha, /* host or device pointer */
-    const double *A, int lda, const double *B, int ldb,
-    const double *beta, /* host or device pointer */
-    double *C, int ldc) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int,
-      const double *, const double *, int, const double *, int, const double *,
-      double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDsyrkx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C,
-                  ldc);
-}
-
-cublasStatus_t CUBLASWINAPI cublasCsyrkx(
-    cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans,
-    int n, int k, const cuComplex *alpha, /* host or device pointer */
-    const cuComplex *A, int lda, const cuComplex *B, int ldb,
-    const cuComplex *beta, /* host or device pointer */
-    cuComplex *C, int ldc) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int,
-      const cuComplex *, const cuComplex *, int, const cuComplex *, int,
-      const cuComplex *, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCsyrkx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C,
-                  ldc);
-}
-
-cublasStatus_t CUBLASWINAPI cublasZsyrkx(
-    cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans,
-    int n, int k, const cuDoubleComplex *alpha, /* host or device pointer */
-    const cuDoubleComplex *A, int lda, const cuDoubleComplex *B, int ldb,
-    const cuDoubleComplex *beta, /* host or device pointer */
-    cuDoubleComplex *C, int ldc) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int,
-      const cuDoubleComplex *, const cuDoubleComplex *, int,
-      const cuDoubleComplex *, int, const cuDoubleComplex *, cuDoubleComplex *,
-      int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZsyrkx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C,
-                  ldc);
-}
-
-cublasStatus_t CUBLASWINAPI cublasCherkx(
-    cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans,
-    int n, int k, const cuComplex *alpha, /* host or device pointer */
-    const cuComplex *A, int lda, const cuComplex *B, int ldb,
-    const float *beta, /* host or device pointer */
-    cuComplex *C, int ldc) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int,
-      const cuComplex *, const cuComplex *, int, const cuComplex *, int,
-      const float *, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCherkx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C,
-                  ldc);
-}
-
-cublasStatus_t CUBLASWINAPI cublasZherkx(
-    cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans,
-    int n, int k, const cuDoubleComplex *alpha, /* host or device pointer */
-    const cuDoubleComplex *A, int lda, const cuDoubleComplex *B, int ldb,
-    const double *beta, /* host or device pointer */
-    cuDoubleComplex *C, int ldc) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int,
-      const cuDoubleComplex *, const cuDoubleComplex *, int,
-      const cuDoubleComplex *, int, const double *, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZherkx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C,
-                  ldc);
-}
-
-cublasStatus_t CUBLASWINAPI cublasSsymm_v2(
-    cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, int m,
-    int n, const float *alpha, /* host or device pointer */
-    const float *A, int lda, const float *B, int ldb,
-    const float *beta, /* host or device pointer */
-    float *C, int ldc) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasSideMode_t, cublasFillMode_t, int, int,
-      const float *, const float *, int, const float *, int, const float *,
-      float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSsymm_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, side, uplo, m, n, alpha, A, lda, B, ldb, beta, C,
-                  ldc);
-}
-
-cublasStatus_t CUBLASWINAPI cublasDsymm_v2(
-    cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, int m,
-    int n, const double *alpha, /* host or device pointer */
-    const double *A, int lda, const double *B, int ldb,
-    const double *beta, /* host or device pointer */
-    double *C, int ldc) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasSideMode_t, cublasFillMode_t, int, int,
-      const double *, const double *, int, const double *, int, const double *,
-      double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDsymm_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, side, uplo, m, n, alpha, A, lda, B, ldb, beta, C,
-                  ldc);
-}
-
-cublasStatus_t CUBLASWINAPI cublasCsymm_v2(
-    cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, int m,
-    int n, const cuComplex *alpha, /* host or device pointer */
-    const cuComplex *A, int lda, const cuComplex *B, int ldb,
-    const cuComplex *beta, /* host or device pointer */
-    cuComplex *C, int ldc) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasSideMode_t, cublasFillMode_t, int, int,
-      const cuComplex *, const cuComplex *, int, const cuComplex *, int,
-      const cuComplex *, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCsymm_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, side, uplo, m, n, alpha, A, lda, B, ldb, beta, C,
-                  ldc);
-}
-
-cublasStatus_t CUBLASWINAPI cublasZsymm_v2(
-    cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, int m,
-    int n, const cuDoubleComplex *alpha, /* host or device pointer */
-    const cuDoubleComplex *A, int lda, const cuDoubleComplex *B, int ldb,
-    const cuDoubleComplex *beta, /* host or device pointer */
-    cuDoubleComplex *C, int ldc) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasSideMode_t, cublasFillMode_t, int, int,
-      const cuDoubleComplex *, const cuDoubleComplex *, int,
-      const cuDoubleComplex *, int, const cuDoubleComplex *, cuDoubleComplex *,
-      int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZsymm_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, side, uplo, m, n, alpha, A, lda, B, ldb, beta, C,
-                  ldc);
-}
-
-cublasStatus_t CUBLASWINAPI cublasChemm_v2(
-    cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, int m,
-    int n, const cuComplex *alpha, /* host or device pointer */
-    const cuComplex *A, int lda, const cuComplex *B, int ldb,
-    const cuComplex *beta, /* host or device pointer */
-    cuComplex *C, int ldc) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasSideMode_t, cublasFillMode_t, int, int,
-      const cuComplex *, const cuComplex *, int, const cuComplex *, int,
-      const cuComplex *, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasChemm_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, side, uplo, m, n, alpha, A, lda, B, ldb, beta, C,
-                  ldc);
-}
-
-cublasStatus_t CUBLASWINAPI cublasZhemm_v2(
-    cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, int m,
-    int n, const cuDoubleComplex *alpha, /* host or device pointer */
-    const cuDoubleComplex *A, int lda, const cuDoubleComplex *B, int ldb,
-    const cuDoubleComplex *beta, /* host or device pointer */
-    cuDoubleComplex *C, int ldc) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasSideMode_t, cublasFillMode_t, int, int,
-      const cuDoubleComplex *, const cuDoubleComplex *, int,
-      const cuDoubleComplex *, int, const cuDoubleComplex *, cuDoubleComplex *,
-      int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZhemm_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, side, uplo, m, n, alpha, A, lda, B, ldb, beta, C,
-                  ldc);
-}
-
-cublasStatus_t CUBLASWINAPI cublasStrsm_v2(
-    cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo,
-    cublasOperation_t trans, cublasDiagType_t diag, int m, int n,
-    const float *alpha, /* host or device pointer */
-    const float *A, int lda, float *B, int ldb) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasSideMode_t, cublasFillMode_t, cublasOperation_t,
-      cublasDiagType_t, int, int, const float *, const float *, int, float *,
-      int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasStrsm_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb);
-}
-
-cublasStatus_t CUBLASWINAPI cublasDtrsm_v2(
-    cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo,
-    cublasOperation_t trans, cublasDiagType_t diag, int m, int n,
-    const double *alpha, /* host or device pointer */
-    const double *A, int lda, double *B, int ldb) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasSideMode_t, cublasFillMode_t, cublasOperation_t,
-      cublasDiagType_t, int, int, const double *, const double *, int, double *,
-      int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDtrsm_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb);
-}
-
-cublasStatus_t CUBLASWINAPI cublasCtrsm_v2(
-    cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo,
-    cublasOperation_t trans, cublasDiagType_t diag, int m, int n,
-    const cuComplex *alpha, /* host or device pointer */
-    const cuComplex *A, int lda, cuComplex *B, int ldb) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasSideMode_t, cublasFillMode_t, cublasOperation_t,
-      cublasDiagType_t, int, int, const cuComplex *, const cuComplex *, int,
-      cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCtrsm_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb);
-}
-
-cublasStatus_t CUBLASWINAPI cublasZtrsm_v2(
-    cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo,
-    cublasOperation_t trans, cublasDiagType_t diag, int m, int n,
-    const cuDoubleComplex *alpha, /* host or device pointer */
-    const cuDoubleComplex *A, int lda, cuDoubleComplex *B, int ldb) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasSideMode_t, cublasFillMode_t, cublasOperation_t,
-      cublasDiagType_t, int, int, const cuDoubleComplex *,
-      const cuDoubleComplex *, int, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZtrsm_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb);
-}
-
-cublasStatus_t CUBLASWINAPI cublasStrmm_v2(
-    cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo,
-    cublasOperation_t trans, cublasDiagType_t diag, int m, int n,
-    const float *alpha, /* host or device pointer */
-    const float *A, int lda, const float *B, int ldb, float *C, int ldc) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasSideMode_t, cublasFillMode_t, cublasOperation_t,
-      cublasDiagType_t, int, int, const float *, const float *, int,
-      const float *, int, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasStrmm_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb,
-                  C, ldc);
-}
-
-cublasStatus_t CUBLASWINAPI cublasDtrmm_v2(
-    cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo,
-    cublasOperation_t trans, cublasDiagType_t diag, int m, int n,
-    const double *alpha, /* host or device pointer */
-    const double *A, int lda, const double *B, int ldb, double *C, int ldc) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasSideMode_t, cublasFillMode_t, cublasOperation_t,
-      cublasDiagType_t, int, int, const double *, const double *, int,
-      const double *, int, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDtrmm_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb,
-                  C, ldc);
-}
-
-cublasStatus_t CUBLASWINAPI cublasCtrmm_v2(
-    cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo,
-    cublasOperation_t trans, cublasDiagType_t diag, int m, int n,
-    const cuComplex *alpha, /* host or device pointer */
-    const cuComplex *A, int lda, const cuComplex *B, int ldb, cuComplex *C,
-    int ldc) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasSideMode_t, cublasFillMode_t, cublasOperation_t,
-      cublasDiagType_t, int, int, const cuComplex *, const cuComplex *, int,
-      const cuComplex *, int, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCtrmm_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb,
-                  C, ldc);
-}
-
-cublasStatus_t CUBLASWINAPI cublasZtrmm_v2(
-    cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo,
-    cublasOperation_t trans, cublasDiagType_t diag, int m, int n,
-    const cuDoubleComplex *alpha, /* host or device pointer */
-    const cuDoubleComplex *A, int lda, const cuDoubleComplex *B, int ldb,
-    cuDoubleComplex *C, int ldc) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasSideMode_t, cublasFillMode_t, cublasOperation_t,
-      cublasDiagType_t, int, int, const cuDoubleComplex *,
-      const cuDoubleComplex *, int, const cuDoubleComplex *, int,
-      cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZtrmm_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb,
-                  C, ldc);
-}
-
-cublasStatus_t CUBLASWINAPI cublasSgemmBatched(
-    cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
-    int m, int n, int k, const float *alpha, /* host or device pointer */
-    const float *const Aarray[], int lda, const float *const Barray[], int ldb,
-    const float *beta, /* host or device pointer */
-    float *const Carray[], int ldc, int batchCount) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int,
-      const float *, const float *const[], int, const float *const[], int,
-      const float *, float *const[], int, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSgemmBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transa, transb, m, n, k, alpha, Aarray, lda, Barray,
-                  ldb, beta, Carray, ldc, batchCount);
-}
-
-cublasStatus_t CUBLASWINAPI cublasDgemmBatched(
-    cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
-    int m, int n, int k, const double *alpha, /* host or device pointer */
-    const double *const Aarray[], int lda, const double *const Barray[],
-    int ldb, const double *beta, /* host or device pointer */
-    double *const Carray[], int ldc, int batchCount) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int,
-      const double *, const double *const[], int, const double *const[], int,
-      const double *, double *const[], int, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDgemmBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transa, transb, m, n, k, alpha, Aarray, lda, Barray,
-                  ldb, beta, Carray, ldc, batchCount);
-}
-
-cublasStatus_t CUBLASWINAPI cublasCgemmBatched(
-    cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
-    int m, int n, int k, const cuComplex *alpha, /* host or device pointer */
-    const cuComplex *const Aarray[], int lda, const cuComplex *const Barray[],
-    int ldb, const cuComplex *beta, /* host or device pointer */
-    cuComplex *const Carray[], int ldc, int batchCount) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int,
-      const cuComplex *, const cuComplex *const[], int,
-      const cuComplex *const[], int, const cuComplex *, cuComplex *const[], int,
-      int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCgemmBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transa, transb, m, n, k, alpha, Aarray, lda, Barray,
-                  ldb, beta, Carray, ldc, batchCount);
-}
-
-cublasStatus_t CUBLASWINAPI cublasCgemm3mBatched(
-    cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
-    int m, int n, int k, const cuComplex *alpha, /* host or device pointer */
-    const cuComplex *const Aarray[], int lda, const cuComplex *const Barray[],
-    int ldb, const cuComplex *beta, /* host or device pointer */
-    cuComplex *const Carray[], int ldc, int batchCount) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int,
-      const cuComplex *, const cuComplex *const[], int,
-      const cuComplex *const[], int, const cuComplex *, cuComplex *const[], int,
-      int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCgemm3mBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transa, transb, m, n, k, alpha, Aarray, lda, Barray,
-                  ldb, beta, Carray, ldc, batchCount);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasZgemmBatched(cublasHandle_t handle, cublasOperation_t transa,
-                   cublasOperation_t transb, int m, int n, int k,
-                   const cuDoubleComplex *alpha, /* host or device pointer */
-                   const cuDoubleComplex *const Aarray[], int lda,
-                   const cuDoubleComplex *const Barray[], int ldb,
-                   const cuDoubleComplex *beta, /* host or device pointer */
-                   cuDoubleComplex *const Carray[], int ldc, int batchCount) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int,
-      const cuDoubleComplex *, const cuDoubleComplex *const[], int,
-      const cuDoubleComplex *const[], int, const cuDoubleComplex *,
-      cuDoubleComplex *const[], int, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZgemmBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transa, transb, m, n, k, alpha, Aarray, lda, Barray,
-                  ldb, beta, Carray, ldc, batchCount);
-}
-
-cublasStatus_t CUBLASWINAPI cublasGemmBatchedEx(
-    cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
-    int m, int n, int k, const void *alpha, /* host or device pointer */
-    const void *const Aarray[], cudaDataType Atype, int lda,
-    const void *const Barray[], cudaDataType Btype, int ldb,
-    const void *beta, /* host or device pointer */
-    void *const Carray[], cudaDataType Ctype, int ldc, int batchCount,
-    cudaDataType computeType, cublasGemmAlgo_t algo) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int,
-      const void *, const void *const[], cudaDataType, int, const void *const[],
-      cudaDataType, int, const void *, void *const[], cudaDataType, int, int,
-      cudaDataType, cublasGemmAlgo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasGemmBatchedEx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transa, transb, m, n, k, alpha, Aarray, Atype, lda,
-                  Barray, Btype, ldb, beta, Carray, Ctype, ldc, batchCount,
-                  computeType, algo);
-}
-
-cublasStatus_t CUBLASWINAPI cublasGemmStridedBatchedEx(
-    cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
-    int m, int n, int k, const void *alpha, /* host or device pointer */
-    const void *A, cudaDataType Atype, int lda,
-    long long int strideA, /* purposely signed */
-    const void *B, cudaDataType Btype, int ldb, long long int strideB,
-    const void *beta, /* host or device pointer */
-    void *C, cudaDataType Ctype, int ldc, long long int strideC, int batchCount,
-    cudaDataType computeType, cublasGemmAlgo_t algo) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int,
-      const void *, const void *, cudaDataType, int, long long, const void *,
-      cudaDataType, int, long long, const void *, void *, cudaDataType, int,
-      long long, int, cudaDataType, cublasGemmAlgo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasGemmStridedBatchedEx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transa, transb, m, n, k, alpha, A, Atype, lda,
-                  strideA, B, Btype, ldb, strideB, beta, C, Ctype, ldc, strideC,
-                  batchCount, computeType, algo);
-}
-
-cublasStatus_t CUBLASWINAPI cublasSgemmStridedBatched(
-    cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
-    int m, int n, int k, const float *alpha,        /* host or device pointer */
-    const float *A, int lda, long long int strideA, /* purposely signed */
-    const float *B, int ldb, long long int strideB,
-    const float *beta, /* host or device pointer */
-    float *C, int ldc, long long int strideC, int batchCount) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int,
-      const float *, const float *, int, long long, const float *, int,
-      long long, const float *, float *, int, long long, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSgemmStridedBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transa, transb, m, n, k, alpha, A, lda, strideA, B,
-                  ldb, strideB, beta, C, ldc, strideC, batchCount);
-}
-
-cublasStatus_t CUBLASWINAPI cublasDgemmStridedBatched(
-    cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
-    int m, int n, int k, const double *alpha, /* host or device pointer */
-    const double *A, int lda, long long int strideA, /* purposely signed */
-    const double *B, int ldb, long long int strideB,
-    const double *beta, /* host or device pointer */
-    double *C, int ldc, long long int strideC, int batchCount) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int,
-      const double *, const double *, int, long long, const double *, int,
-      long long, const double *, double *, int, long long, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDgemmStridedBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transa, transb, m, n, k, alpha, A, lda, strideA, B,
-                  ldb, strideB, beta, C, ldc, strideC, batchCount);
-}
-
-cublasStatus_t CUBLASWINAPI cublasCgemmStridedBatched(
-    cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
-    int m, int n, int k, const cuComplex *alpha, /* host or device pointer */
-    const cuComplex *A, int lda, long long int strideA, /* purposely signed */
-    const cuComplex *B, int ldb, long long int strideB,
-    const cuComplex *beta, /* host or device pointer */
-    cuComplex *C, int ldc, long long int strideC, int batchCount) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int,
-      const cuComplex *, const cuComplex *, int, long long, const cuComplex *,
-      int, long long, const cuComplex *, cuComplex *, int, long long, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCgemmStridedBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transa, transb, m, n, k, alpha, A, lda, strideA, B,
-                  ldb, strideB, beta, C, ldc, strideC, batchCount);
-}
-
-cublasStatus_t CUBLASWINAPI cublasCgemm3mStridedBatched(
-    cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
-    int m, int n, int k, const cuComplex *alpha, /* host or device pointer */
-    const cuComplex *A, int lda, long long int strideA, /* purposely signed */
-    const cuComplex *B, int ldb, long long int strideB,
-    const cuComplex *beta, /* host or device pointer */
-    cuComplex *C, int ldc, long long int strideC, int batchCount) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int,
-      const cuComplex *, const cuComplex *, int, long long, const cuComplex *,
-      int, long long, const cuComplex *, cuComplex *, int, long long, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCgemm3mStridedBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transa, transb, m, n, k, alpha, A, lda, strideA, B,
-                  ldb, strideB, beta, C, ldc, strideC, batchCount);
-}
-
-cublasStatus_t CUBLASWINAPI cublasZgemmStridedBatched(
-    cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
-    int m, int n, int k,
-    const cuDoubleComplex *alpha, /* host or device pointer */
-    const cuDoubleComplex *A, int lda,
-    long long int strideA, /* purposely signed */
-    const cuDoubleComplex *B, int ldb, long long int strideB,
-    const cuDoubleComplex *beta, /* host or device poi */
-    cuDoubleComplex *C, int ldc, long long int strideC, int batchCount) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int,
-      const cuDoubleComplex *, const cuDoubleComplex *, int, long long,
-      const cuDoubleComplex *, int, long long, const cuDoubleComplex *,
-      cuDoubleComplex *, int, long long, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZgemmStridedBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transa, transb, m, n, k, alpha, A, lda, strideA, B,
-                  ldb, strideB, beta, C, ldc, strideC, batchCount);
-}
-
-cublasStatus_t CUBLASWINAPI cublasSgeam(
-    cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
-    int m, int n, const float *alpha,           /* host or device pointer */
-    const float *A, int lda, const float *beta, /* host or device pointer */
-    const float *B, int ldb, float *C, int ldc) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int,
-      const float *, const float *, int, const float *, const float *, int,
-      float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSgeam");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transa, transb, m, n, alpha, A, lda, beta, B, ldb, C,
-                  ldc);
-}
-
-cublasStatus_t CUBLASWINAPI cublasDgeam(
-    cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
-    int m, int n, const double *alpha,            /* host or device pointer */
-    const double *A, int lda, const double *beta, /* host or device pointer */
-    const double *B, int ldb, double *C, int ldc) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int,
-      const double *, const double *, int, const double *, const double *, int,
-      double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDgeam");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transa, transb, m, n, alpha, A, lda, beta, B, ldb, C,
-                  ldc);
-}
-
-cublasStatus_t CUBLASWINAPI cublasCgeam(
-    cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
-    int m, int n, const cuComplex *alpha, /* host or device pointer */
-    const cuComplex *A, int lda,
-    const cuComplex *beta, /* host or device pointer */
-    const cuComplex *B, int ldb, cuComplex *C, int ldc) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int,
-      const cuComplex *, const cuComplex *, int, const cuComplex *,
-      const cuComplex *, int, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCgeam");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transa, transb, m, n, alpha, A, lda, beta, B, ldb, C,
-                  ldc);
-}
-
-cublasStatus_t CUBLASWINAPI cublasZgeam(
-    cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
-    int m, int n, const cuDoubleComplex *alpha, /* host or device pointer */
-    const cuDoubleComplex *A, int lda,
-    const cuDoubleComplex *beta, /* host or device pointer */
-    const cuDoubleComplex *B, int ldb, cuDoubleComplex *C, int ldc) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int,
-      const cuDoubleComplex *, const cuDoubleComplex *, int,
-      const cuDoubleComplex *, const cuDoubleComplex *, int, cuDoubleComplex *,
-      int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZgeam");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transa, transb, m, n, alpha, A, lda, beta, B, ldb, C,
-                  ldc);
-}
-
-cublasStatus_t CUBLASWINAPI cublasSgetrfBatched(
-    cublasHandle_t handle, int n, float *const A[], /*Device pointer*/
-    int lda, int *P,                                /*Device Pointer*/
-    int *info,                                      /*Device Pointer*/
-    int batchSize) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, float *const[], int, int *, int *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSgetrfBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, A, lda, P, info, batchSize);
-}
-
-cublasStatus_t CUBLASWINAPI cublasDgetrfBatched(
-    cublasHandle_t handle, int n, double *const A[], /*Device pointer*/
-    int lda, int *P,                                 /*Device Pointer*/
-    int *info,                                       /*Device Pointer*/
-    int batchSize) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, double *const[], int, int *, int *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDgetrfBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, A, lda, P, info, batchSize);
-}
-
-cublasStatus_t CUBLASWINAPI cublasCgetrfBatched(
-    cublasHandle_t handle, int n, cuComplex *const A[], /*Device pointer*/
-    int lda, int *P,                                    /*Device Pointer*/
-    int *info,                                          /*Device Pointer*/
-    int batchSize) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, cuComplex *const[], int, int *, int *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCgetrfBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, A, lda, P, info, batchSize);
-}
-
-cublasStatus_t CUBLASWINAPI cublasZgetrfBatched(
-    cublasHandle_t handle, int n, cuDoubleComplex *const A[], /*Device pointer*/
-    int lda, int *P,                                          /*Device Pointer*/
-    int *info,                                                /*Device Pointer*/
-    int batchSize) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, cuDoubleComplex *const[], int, int *, int *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZgetrfBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, A, lda, P, info, batchSize);
-}
-
-cublasStatus_t CUBLASWINAPI cublasSgetriBatched(
-    cublasHandle_t handle, int n, const float *const A[], /*Device pointer*/
-    int lda, const int *P,                                /*Device pointer*/
-    float *const C[],                                     /*Device pointer*/
-    int ldc, int *info, int batchSize) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, const float *const[], int, const int *,
-      float *const[], int, int *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSgetriBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, A, lda, P, C, ldc, info, batchSize);
-}
-
-cublasStatus_t CUBLASWINAPI cublasDgetriBatched(
-    cublasHandle_t handle, int n, const double *const A[], /*Device pointer*/
-    int lda, const int *P,                                 /*Device pointer*/
-    double *const C[],                                     /*Device pointer*/
-    int ldc, int *info, int batchSize) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, const double *const[], int, const int *,
-      double *const[], int, int *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDgetriBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, A, lda, P, C, ldc, info, batchSize);
-}
-
-cublasStatus_t CUBLASWINAPI cublasCgetriBatched(
-    cublasHandle_t handle, int n, const cuComplex *const A[], /*Device pointer*/
-    int lda, const int *P,                                    /*Device pointer*/
-    cuComplex *const C[],                                     /*Device pointer*/
-    int ldc, int *info, int batchSize) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, const cuComplex *const[], int, const int *,
-      cuComplex *const[], int, int *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCgetriBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, A, lda, P, C, ldc, info, batchSize);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasZgetriBatched(cublasHandle_t handle, int n,
-                    const cuDoubleComplex *const A[], /*Device pointer*/
-                    int lda, const int *P,            /*Device pointer*/
-                    cuDoubleComplex *const C[],       /*Device pointer*/
-                    int ldc, int *info, int batchSize) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, const cuDoubleComplex *const[], int, const int *,
-      cuDoubleComplex *const[], int, int *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZgetriBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, A, lda, P, C, ldc, info, batchSize);
-}
-
-cublasStatus_t CUBLASWINAPI cublasSgetrsBatched(
-    cublasHandle_t handle, cublasOperation_t trans, int n, int nrhs,
-    const float *const Aarray[], int lda, const int *devIpiv,
-    float *const Barray[], int ldb, int *info, int batchSize) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasOperation_t, int, int, const float *const[], int,
-      const int *, float *const[], int, int *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSgetrsBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, trans, n, nrhs, Aarray, lda, devIpiv, Barray, ldb,
-                  info, batchSize);
-}
-
-cublasStatus_t CUBLASWINAPI cublasDgetrsBatched(
-    cublasHandle_t handle, cublasOperation_t trans, int n, int nrhs,
-    const double *const Aarray[], int lda, const int *devIpiv,
-    double *const Barray[], int ldb, int *info, int batchSize) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasOperation_t, int, int, const double *const[], int,
-      const int *, double *const[], int, int *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDgetrsBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, trans, n, nrhs, Aarray, lda, devIpiv, Barray, ldb,
-                  info, batchSize);
-}
-
-cublasStatus_t CUBLASWINAPI cublasCgetrsBatched(
-    cublasHandle_t handle, cublasOperation_t trans, int n, int nrhs,
-    const cuComplex *const Aarray[], int lda, const int *devIpiv,
-    cuComplex *const Barray[], int ldb, int *info, int batchSize) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasOperation_t, int, int, const cuComplex *const[],
-      int, const int *, cuComplex *const[], int, int *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCgetrsBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, trans, n, nrhs, Aarray, lda, devIpiv, Barray, ldb,
-                  info, batchSize);
-}
-
-cublasStatus_t CUBLASWINAPI cublasZgetrsBatched(
-    cublasHandle_t handle, cublasOperation_t trans, int n, int nrhs,
-    const cuDoubleComplex *const Aarray[], int lda, const int *devIpiv,
-    cuDoubleComplex *const Barray[], int ldb, int *info, int batchSize) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasOperation_t, int, int,
-      const cuDoubleComplex *const[], int, const int *,
-      cuDoubleComplex *const[], int, int *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZgetrsBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, trans, n, nrhs, Aarray, lda, devIpiv, Barray, ldb,
-                  info, batchSize);
-}
-
-cublasStatus_t CUBLASWINAPI cublasStrsmBatched(
-    cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo,
-    cublasOperation_t trans, cublasDiagType_t diag, int m, int n,
-    const float *alpha, /*Host or Device Pointer*/
-    const float *const A[], int lda, float *const B[], int ldb,
-    int batchCount) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasSideMode_t, cublasFillMode_t, cublasOperation_t,
-      cublasDiagType_t, int, int, const float *, const float *const[], int,
-      float *const[], int, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasStrsmBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb,
-                  batchCount);
-}
-
-cublasStatus_t CUBLASWINAPI cublasDtrsmBatched(
-    cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo,
-    cublasOperation_t trans, cublasDiagType_t diag, int m, int n,
-    const double *alpha, /*Host or Device Pointer*/
-    const double *const A[], int lda, double *const B[], int ldb,
-    int batchCount) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasSideMode_t, cublasFillMode_t, cublasOperation_t,
-      cublasDiagType_t, int, int, const double *, const double *const[], int,
-      double *const[], int, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDtrsmBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb,
-                  batchCount);
-}
-
-cublasStatus_t CUBLASWINAPI cublasCtrsmBatched(
-    cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo,
-    cublasOperation_t trans, cublasDiagType_t diag, int m, int n,
-    const cuComplex *alpha, /*Host or Device Pointer*/
-    const cuComplex *const A[], int lda, cuComplex *const B[], int ldb,
-    int batchCount) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasSideMode_t, cublasFillMode_t, cublasOperation_t,
-      cublasDiagType_t, int, int, const cuComplex *, const cuComplex *const[],
-      int, cuComplex *const[], int, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCtrsmBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb,
-                  batchCount);
-}
-
-cublasStatus_t CUBLASWINAPI cublasZtrsmBatched(
-    cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo,
-    cublasOperation_t trans, cublasDiagType_t diag, int m, int n,
-    const cuDoubleComplex *alpha, /*Host or Device Pointer*/
-    const cuDoubleComplex *const A[], int lda, cuDoubleComplex *const B[],
-    int ldb, int batchCount) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasSideMode_t, cublasFillMode_t, cublasOperation_t,
-      cublasDiagType_t, int, int, const cuDoubleComplex *,
-      const cuDoubleComplex *const[], int, cuDoubleComplex *const[], int, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZtrsmBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb,
-                  batchCount);
-}
-
-cublasStatus_t CUBLASWINAPI cublasSmatinvBatched(
-    cublasHandle_t handle, int n, const float *const A[], /*Device pointer*/
-    int lda, float *const Ainv[],                         /*Device pointer*/
-    int lda_inv, int *info,                               /*Device Pointer*/
-    int batchSize) {
-  using FuncPtr =
-      cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, int, const float *const[],
-                                     int, float *const[], int, int *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSmatinvBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, A, lda, Ainv, lda_inv, info, batchSize);
-}
-
-cublasStatus_t CUBLASWINAPI cublasDmatinvBatched(
-    cublasHandle_t handle, int n, const double *const A[], /*Device pointer*/
-    int lda, double *const Ainv[],                         /*Device pointer*/
-    int lda_inv, int *info,                                /*Device Pointer*/
-    int batchSize) {
-  using FuncPtr =
-      cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, int, const double *const[],
-                                     int, double *const[], int, int *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDmatinvBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, A, lda, Ainv, lda_inv, info, batchSize);
-}
-
-cublasStatus_t CUBLASWINAPI cublasCmatinvBatched(
-    cublasHandle_t handle, int n, const cuComplex *const A[], /*Device pointer*/
-    int lda, cuComplex *const Ainv[],                         /*Device pointer*/
-    int lda_inv, int *info,                                   /*Device Pointer*/
-    int batchSize) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, const cuComplex *const[], int, cuComplex *const[],
-      int, int *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCmatinvBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, A, lda, Ainv, lda_inv, info, batchSize);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasZmatinvBatched(cublasHandle_t handle, int n,
-                     const cuDoubleComplex *const A[],       /*Device pointer*/
-                     int lda, cuDoubleComplex *const Ainv[], /*Device pointer*/
-                     int lda_inv, int *info,                 /*Device Pointer*/
-                     int batchSize) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, const cuDoubleComplex *const[], int,
-      cuDoubleComplex *const[], int, int *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZmatinvBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, A, lda, Ainv, lda_inv, info, batchSize);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasSgeqrfBatched(cublasHandle_t handle, int m, int n,
-                    float *const Aarray[],            /*Device pointer*/
-                    int lda, float *const TauArray[], /*Device pointer*/
-                    int *info, int batchSize) {
-  using FuncPtr =
-      cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, int, int, float *const[],
-                                     int, float *const[], int *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSgeqrfBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, Aarray, lda, TauArray, info, batchSize);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasDgeqrfBatched(cublasHandle_t handle, int m, int n,
-                    double *const Aarray[],            /*Device pointer*/
-                    int lda, double *const TauArray[], /*Device pointer*/
-                    int *info, int batchSize) {
-  using FuncPtr =
-      cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, int, int, double *const[],
-                                     int, double *const[], int *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDgeqrfBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, Aarray, lda, TauArray, info, batchSize);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasCgeqrfBatched(cublasHandle_t handle, int m, int n,
-                    cuComplex *const Aarray[],            /*Device pointer*/
-                    int lda, cuComplex *const TauArray[], /*Device pointer*/
-                    int *info, int batchSize) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, int, cuComplex *const[], int, cuComplex *const[],
-      int *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCgeqrfBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, Aarray, lda, TauArray, info, batchSize);
-}
-
-cublasStatus_t CUBLASWINAPI cublasZgeqrfBatched(
-    cublasHandle_t handle, int m, int n,
-    cuDoubleComplex *const Aarray[],            /*Device pointer*/
-    int lda, cuDoubleComplex *const TauArray[], /*Device pointer*/
-    int *info, int batchSize) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, int, cuDoubleComplex *const[], int,
-      cuDoubleComplex *const[], int *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZgeqrfBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, Aarray, lda, TauArray, info, batchSize);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasSgelsBatched(cublasHandle_t handle, cublasOperation_t trans, int m, int n,
-                   int nrhs, float *const Aarray[],       /*Device pointer*/
-                   int lda, float *const Carray[],        /*Device pointer*/
-                   int ldc, int *info, int *devInfoArray, /*Device pointer*/
-                   int batchSize) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasOperation_t, int, int, int, float *const[], int,
-      float *const[], int, int *, int *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSgelsBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, trans, m, n, nrhs, Aarray, lda, Carray, ldc, info,
-                  devInfoArray, batchSize);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasDgelsBatched(cublasHandle_t handle, cublasOperation_t trans, int m, int n,
-                   int nrhs, double *const Aarray[],      /*Device pointer*/
-                   int lda, double *const Carray[],       /*Device pointer*/
-                   int ldc, int *info, int *devInfoArray, /*Device pointer*/
-                   int batchSize) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasOperation_t, int, int, int, double *const[], int,
-      double *const[], int, int *, int *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDgelsBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, trans, m, n, nrhs, Aarray, lda, Carray, ldc, info,
-                  devInfoArray, batchSize);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasCgelsBatched(cublasHandle_t handle, cublasOperation_t trans, int m, int n,
-                   int nrhs, cuComplex *const Aarray[], /*Device pointer*/
-                   int lda, cuComplex *const Carray[],  /*Device pointer*/
-                   int ldc, int *info, int *devInfoArray, int batchSize) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasOperation_t, int, int, int, cuComplex *const[], int,
-      cuComplex *const[], int, int *, int *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCgelsBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, trans, m, n, nrhs, Aarray, lda, Carray, ldc, info,
-                  devInfoArray, batchSize);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasZgelsBatched(cublasHandle_t handle, cublasOperation_t trans, int m, int n,
-                   int nrhs, cuDoubleComplex *const Aarray[], /*Device pointer*/
-                   int lda, cuDoubleComplex *const Carray[],  /*Device pointer*/
-                   int ldc, int *info, int *devInfoArray, int batchSize) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasOperation_t, int, int, int,
-      cuDoubleComplex *const[], int, cuDoubleComplex *const[], int, int *,
-      int *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZgelsBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, trans, m, n, nrhs, Aarray, lda, Carray, ldc, info,
-                  devInfoArray, batchSize);
-}
-
-cublasStatus_t CUBLASWINAPI cublasSdgmm(cublasHandle_t handle,
-                                        cublasSideMode_t mode, int m, int n,
-                                        const float *A, int lda, const float *x,
-                                        int incx, float *C, int ldc) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasSideMode_t, int, int, const float *, int,
-      const float *, int, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSdgmm");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, mode, m, n, A, lda, x, incx, C, ldc);
-}
-
-cublasStatus_t CUBLASWINAPI cublasDdgmm(cublasHandle_t handle,
-                                        cublasSideMode_t mode, int m, int n,
-                                        const double *A, int lda,
-                                        const double *x, int incx, double *C,
-                                        int ldc) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasSideMode_t, int, int, const double *, int,
-      const double *, int, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDdgmm");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, mode, m, n, A, lda, x, incx, C, ldc);
-}
-
-cublasStatus_t CUBLASWINAPI cublasCdgmm(cublasHandle_t handle,
-                                        cublasSideMode_t mode, int m, int n,
-                                        const cuComplex *A, int lda,
-                                        const cuComplex *x, int incx,
-                                        cuComplex *C, int ldc) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasSideMode_t, int, int, const cuComplex *, int,
-      const cuComplex *, int, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCdgmm");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, mode, m, n, A, lda, x, incx, C, ldc);
-}
-
-cublasStatus_t CUBLASWINAPI cublasZdgmm(cublasHandle_t handle,
-                                        cublasSideMode_t mode, int m, int n,
-                                        const cuDoubleComplex *A, int lda,
-                                        const cuDoubleComplex *x, int incx,
-                                        cuDoubleComplex *C, int ldc) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasSideMode_t, int, int, const cuDoubleComplex *, int,
-      const cuDoubleComplex *, int, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZdgmm");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, mode, m, n, A, lda, x, incx, C, ldc);
-}
-
-cublasStatus_t CUBLASWINAPI cublasStpttr(cublasHandle_t handle,
-                                         cublasFillMode_t uplo, int n,
-                                         const float *AP, float *A, int lda) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, int, const float *, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasStpttr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, AP, A, lda);
-}
-
-cublasStatus_t CUBLASWINAPI cublasDtpttr(cublasHandle_t handle,
-                                         cublasFillMode_t uplo, int n,
-                                         const double *AP, double *A, int lda) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, int, const double *, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDtpttr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, AP, A, lda);
-}
-
-cublasStatus_t CUBLASWINAPI cublasCtpttr(cublasHandle_t handle,
-                                         cublasFillMode_t uplo, int n,
-                                         const cuComplex *AP, cuComplex *A,
-                                         int lda) {
-  using FuncPtr =
-      cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int,
-                                     const cuComplex *, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCtpttr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, AP, A, lda);
-}
-
-cublasStatus_t CUBLASWINAPI cublasZtpttr(cublasHandle_t handle,
-                                         cublasFillMode_t uplo, int n,
-                                         const cuDoubleComplex *AP,
-                                         cuDoubleComplex *A, int lda) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, int, const cuDoubleComplex *,
-      cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZtpttr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, AP, A, lda);
-}
-
-cublasStatus_t CUBLASWINAPI cublasStrttp(cublasHandle_t handle,
-                                         cublasFillMode_t uplo, int n,
-                                         const float *A, int lda, float *AP) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, int, const float *, int, float *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasStrttp");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, A, lda, AP);
-}
-
-cublasStatus_t CUBLASWINAPI cublasDtrttp(cublasHandle_t handle,
-                                         cublasFillMode_t uplo, int n,
-                                         const double *A, int lda, double *AP) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, int, const double *, int, double *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDtrttp");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, A, lda, AP);
-}
-
-cublasStatus_t CUBLASWINAPI cublasCtrttp(cublasHandle_t handle,
-                                         cublasFillMode_t uplo, int n,
-                                         const cuComplex *A, int lda,
-                                         cuComplex *AP) {
-  using FuncPtr =
-      cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int,
-                                     const cuComplex *, int, cuComplex *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCtrttp");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, A, lda, AP);
-}
-
-cublasStatus_t CUBLASWINAPI cublasZtrttp(cublasHandle_t handle,
-                                         cublasFillMode_t uplo, int n,
-                                         const cuDoubleComplex *A, int lda,
-                                         cuDoubleComplex *AP) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, int, const cuDoubleComplex *, int,
-      cuDoubleComplex *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZtrttp");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, A, lda, AP);
-}
-
-cublasStatus CUBLASWINAPI cublasInit(void) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)();
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasInit");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr();
-}
-
-cublasStatus CUBLASWINAPI cublasShutdown(void) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)();
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasShutdown");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr();
-}
-
-cublasStatus CUBLASWINAPI cublasGetError(void) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)();
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasGetError");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr();
-}
-
-cublasStatus CUBLASWINAPI cublasGetVersion(int *version) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasGetVersion");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(version);
-}
-
-cublasStatus CUBLASWINAPI cublasAlloc(int n, int elemSize, void **devicePtr) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(int, int, void **);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasAlloc");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(n, elemSize, devicePtr);
-}
-
-cublasStatus CUBLASWINAPI cublasFree(void *devicePtr) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasFree");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(devicePtr);
-}
-
-cublasStatus CUBLASWINAPI cublasSetKernelStream(cudaStream_t stream) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSetKernelStream");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(stream);
-}
-
-float CUBLASWINAPI cublasSnrm2(int n, const float *x, int incx) {
-  using FuncPtr = float(CUBLASWINAPI *)(int, const float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSnrm2");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasSnrm2");
-  return func_ptr(n, x, incx);
-}
-
-double CUBLASWINAPI cublasDnrm2(int n, const double *x, int incx) {
-  using FuncPtr = double(CUBLASWINAPI *)(int, const double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDnrm2");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasDnrm2");
-  return func_ptr(n, x, incx);
-}
-
-float CUBLASWINAPI cublasScnrm2(int n, const cuComplex *x, int incx) {
-  using FuncPtr = float(CUBLASWINAPI *)(int, const cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasScnrm2");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasScnrm2");
-  return func_ptr(n, x, incx);
-}
-
-double CUBLASWINAPI cublasDznrm2(int n, const cuDoubleComplex *x, int incx) {
-  using FuncPtr = double(CUBLASWINAPI *)(int, const cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDznrm2");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasDznrm2");
-  return func_ptr(n, x, incx);
-}
-
-float CUBLASWINAPI cublasSdot(int n, const float *x, int incx, const float *y,
-                              int incy) {
-  using FuncPtr =
-      float(CUBLASWINAPI *)(int, const float *, int, const float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSdot");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasSdot");
-  return func_ptr(n, x, incx, y, incy);
-}
-
-double CUBLASWINAPI cublasDdot(int n, const double *x, int incx,
-                               const double *y, int incy) {
-  using FuncPtr =
-      double(CUBLASWINAPI *)(int, const double *, int, const double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDdot");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasDdot");
-  return func_ptr(n, x, incx, y, incy);
-}
-
-cuComplex CUBLASWINAPI cublasCdotu(int n, const cuComplex *x, int incx,
-                                   const cuComplex *y, int incy) {
-  using FuncPtr = cuComplex(CUBLASWINAPI *)(int, const cuComplex *, int,
-                                            const cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCdotu");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasCdotu");
-  return func_ptr(n, x, incx, y, incy);
-}
-
-cuComplex CUBLASWINAPI cublasCdotc(int n, const cuComplex *x, int incx,
-                                   const cuComplex *y, int incy) {
-  using FuncPtr = cuComplex(CUBLASWINAPI *)(int, const cuComplex *, int,
-                                            const cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCdotc");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasCdotc");
-  return func_ptr(n, x, incx, y, incy);
-}
-
-cuDoubleComplex CUBLASWINAPI cublasZdotu(int n, const cuDoubleComplex *x,
-                                         int incx, const cuDoubleComplex *y,
-                                         int incy) {
-  using FuncPtr = cuDoubleComplex(CUBLASWINAPI *)(
-      int, const cuDoubleComplex *, int, const cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZdotu");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasZdotu");
-  return func_ptr(n, x, incx, y, incy);
-}
-
-cuDoubleComplex CUBLASWINAPI cublasZdotc(int n, const cuDoubleComplex *x,
-                                         int incx, const cuDoubleComplex *y,
-                                         int incy) {
-  using FuncPtr = cuDoubleComplex(CUBLASWINAPI *)(
-      int, const cuDoubleComplex *, int, const cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZdotc");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasZdotc");
-  return func_ptr(n, x, incx, y, incy);
-}
-
-void CUBLASWINAPI cublasSscal(int n, float alpha, float *x, int incx) {
-  using FuncPtr = void(CUBLASWINAPI *)(int, float, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSscal");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasSscal");
-  return func_ptr(n, alpha, x, incx);
-}
-
-void CUBLASWINAPI cublasDscal(int n, double alpha, double *x, int incx) {
-  using FuncPtr = void(CUBLASWINAPI *)(int, double, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDscal");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasDscal");
-  return func_ptr(n, alpha, x, incx);
-}
-
-void CUBLASWINAPI cublasCscal(int n, cuComplex alpha, cuComplex *x, int incx) {
-  using FuncPtr = void(CUBLASWINAPI *)(int, cuComplex, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCscal");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasCscal");
-  return func_ptr(n, alpha, x, incx);
-}
-
-void CUBLASWINAPI cublasZscal(int n, cuDoubleComplex alpha, cuDoubleComplex *x,
-                              int incx) {
-  using FuncPtr =
-      void(CUBLASWINAPI *)(int, cuDoubleComplex, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZscal");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasZscal");
-  return func_ptr(n, alpha, x, incx);
-}
-
-void CUBLASWINAPI cublasCsscal(int n, float alpha, cuComplex *x, int incx) {
-  using FuncPtr = void(CUBLASWINAPI *)(int, float, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCsscal");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasCsscal");
-  return func_ptr(n, alpha, x, incx);
-}
-
-void CUBLASWINAPI cublasZdscal(int n, double alpha, cuDoubleComplex *x,
-                               int incx) {
-  using FuncPtr = void(CUBLASWINAPI *)(int, double, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZdscal");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasZdscal");
-  return func_ptr(n, alpha, x, incx);
-}
-
-void CUBLASWINAPI cublasSaxpy(int n, float alpha, const float *x, int incx,
-                              float *y, int incy) {
-  using FuncPtr =
-      void(CUBLASWINAPI *)(int, float, const float *, int, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSaxpy");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasSaxpy");
-  return func_ptr(n, alpha, x, incx, y, incy);
-}
-
-void CUBLASWINAPI cublasDaxpy(int n, double alpha, const double *x, int incx,
-                              double *y, int incy) {
-  using FuncPtr =
-      void(CUBLASWINAPI *)(int, double, const double *, int, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDaxpy");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasDaxpy");
-  return func_ptr(n, alpha, x, incx, y, incy);
-}
-
-void CUBLASWINAPI cublasCaxpy(int n, cuComplex alpha, const cuComplex *x,
-                              int incx, cuComplex *y, int incy) {
-  using FuncPtr = void(CUBLASWINAPI *)(int, cuComplex, const cuComplex *, int,
-                                       cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCaxpy");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasCaxpy");
-  return func_ptr(n, alpha, x, incx, y, incy);
-}
-
-void CUBLASWINAPI cublasZaxpy(int n, cuDoubleComplex alpha,
-                              const cuDoubleComplex *x, int incx,
-                              cuDoubleComplex *y, int incy) {
-  using FuncPtr =
-      void(CUBLASWINAPI *)(int, cuDoubleComplex, const cuDoubleComplex *, int,
-                           cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZaxpy");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasZaxpy");
-  return func_ptr(n, alpha, x, incx, y, incy);
-}
-
-void CUBLASWINAPI cublasScopy(int n, const float *x, int incx, float *y,
-                              int incy) {
-  using FuncPtr = void(CUBLASWINAPI *)(int, const float *, int, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasScopy");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasScopy");
-  return func_ptr(n, x, incx, y, incy);
-}
-
-void CUBLASWINAPI cublasDcopy(int n, const double *x, int incx, double *y,
-                              int incy) {
-  using FuncPtr = void(CUBLASWINAPI *)(int, const double *, int, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDcopy");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasDcopy");
-  return func_ptr(n, x, incx, y, incy);
-}
-
-void CUBLASWINAPI cublasCcopy(int n, const cuComplex *x, int incx, cuComplex *y,
-                              int incy) {
-  using FuncPtr =
-      void(CUBLASWINAPI *)(int, const cuComplex *, int, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCcopy");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasCcopy");
-  return func_ptr(n, x, incx, y, incy);
-}
-
-void CUBLASWINAPI cublasZcopy(int n, const cuDoubleComplex *x, int incx,
-                              cuDoubleComplex *y, int incy) {
-  using FuncPtr = void(CUBLASWINAPI *)(int, const cuDoubleComplex *, int,
-                                       cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZcopy");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasZcopy");
-  return func_ptr(n, x, incx, y, incy);
-}
-
-void CUBLASWINAPI cublasSswap(int n, float *x, int incx, float *y, int incy) {
-  using FuncPtr = void(CUBLASWINAPI *)(int, float *, int, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSswap");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasSswap");
-  return func_ptr(n, x, incx, y, incy);
-}
-
-void CUBLASWINAPI cublasDswap(int n, double *x, int incx, double *y, int incy) {
-  using FuncPtr = void(CUBLASWINAPI *)(int, double *, int, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDswap");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasDswap");
-  return func_ptr(n, x, incx, y, incy);
-}
-
-void CUBLASWINAPI cublasCswap(int n, cuComplex *x, int incx, cuComplex *y,
-                              int incy) {
-  using FuncPtr = void(CUBLASWINAPI *)(int, cuComplex *, int, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCswap");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasCswap");
-  return func_ptr(n, x, incx, y, incy);
-}
-
-void CUBLASWINAPI cublasZswap(int n, cuDoubleComplex *x, int incx,
-                              cuDoubleComplex *y, int incy) {
-  using FuncPtr =
-      void(CUBLASWINAPI *)(int, cuDoubleComplex *, int, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZswap");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasZswap");
-  return func_ptr(n, x, incx, y, incy);
-}
-
-int CUBLASWINAPI cublasIsamax(int n, const float *x, int incx) {
-  using FuncPtr = int(CUBLASWINAPI *)(int, const float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasIsamax");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasIsamax");
-  return func_ptr(n, x, incx);
-}
-
-int CUBLASWINAPI cublasIdamax(int n, const double *x, int incx) {
-  using FuncPtr = int(CUBLASWINAPI *)(int, const double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasIdamax");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasIdamax");
-  return func_ptr(n, x, incx);
-}
-
-int CUBLASWINAPI cublasIcamax(int n, const cuComplex *x, int incx) {
-  using FuncPtr = int(CUBLASWINAPI *)(int, const cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasIcamax");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasIcamax");
-  return func_ptr(n, x, incx);
-}
-
-int CUBLASWINAPI cublasIzamax(int n, const cuDoubleComplex *x, int incx) {
-  using FuncPtr = int(CUBLASWINAPI *)(int, const cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasIzamax");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasIzamax");
-  return func_ptr(n, x, incx);
-}
-
-int CUBLASWINAPI cublasIsamin(int n, const float *x, int incx) {
-  using FuncPtr = int(CUBLASWINAPI *)(int, const float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasIsamin");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasIsamin");
-  return func_ptr(n, x, incx);
-}
-
-int CUBLASWINAPI cublasIdamin(int n, const double *x, int incx) {
-  using FuncPtr = int(CUBLASWINAPI *)(int, const double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasIdamin");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasIdamin");
-  return func_ptr(n, x, incx);
-}
-
-int CUBLASWINAPI cublasIcamin(int n, const cuComplex *x, int incx) {
-  using FuncPtr = int(CUBLASWINAPI *)(int, const cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasIcamin");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasIcamin");
-  return func_ptr(n, x, incx);
-}
-
-int CUBLASWINAPI cublasIzamin(int n, const cuDoubleComplex *x, int incx) {
-  using FuncPtr = int(CUBLASWINAPI *)(int, const cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasIzamin");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasIzamin");
-  return func_ptr(n, x, incx);
-}
-
-float CUBLASWINAPI cublasSasum(int n, const float *x, int incx) {
-  using FuncPtr = float(CUBLASWINAPI *)(int, const float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSasum");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasSasum");
-  return func_ptr(n, x, incx);
-}
-
-double CUBLASWINAPI cublasDasum(int n, const double *x, int incx) {
-  using FuncPtr = double(CUBLASWINAPI *)(int, const double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDasum");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasDasum");
-  return func_ptr(n, x, incx);
-}
-
-float CUBLASWINAPI cublasScasum(int n, const cuComplex *x, int incx) {
-  using FuncPtr = float(CUBLASWINAPI *)(int, const cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasScasum");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasScasum");
-  return func_ptr(n, x, incx);
-}
-
-double CUBLASWINAPI cublasDzasum(int n, const cuDoubleComplex *x, int incx) {
-  using FuncPtr = double(CUBLASWINAPI *)(int, const cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDzasum");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasDzasum");
-  return func_ptr(n, x, incx);
-}
-
-void CUBLASWINAPI cublasSrot(int n, float *x, int incx, float *y, int incy,
-                             float sc, float ss) {
-  using FuncPtr =
-      void(CUBLASWINAPI *)(int, float *, int, float *, int, float, float);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSrot");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasSrot");
-  return func_ptr(n, x, incx, y, incy, sc, ss);
-}
-
-void CUBLASWINAPI cublasDrot(int n, double *x, int incx, double *y, int incy,
-                             double sc, double ss) {
-  using FuncPtr =
-      void(CUBLASWINAPI *)(int, double *, int, double *, int, double, double);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDrot");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasDrot");
-  return func_ptr(n, x, incx, y, incy, sc, ss);
-}
-
-void CUBLASWINAPI cublasCrot(int n, cuComplex *x, int incx, cuComplex *y,
-                             int incy, float c, cuComplex s) {
-  using FuncPtr = void(CUBLASWINAPI *)(int, cuComplex *, int, cuComplex *, int,
-                                       float, cuComplex);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCrot");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasCrot");
-  return func_ptr(n, x, incx, y, incy, c, s);
-}
-
-void CUBLASWINAPI cublasZrot(int n, cuDoubleComplex *x, int incx,
-                             cuDoubleComplex *y, int incy, double sc,
-                             cuDoubleComplex cs) {
-  using FuncPtr =
-      void(CUBLASWINAPI *)(int, cuDoubleComplex *, int, cuDoubleComplex *, int,
-                           double, cuDoubleComplex);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZrot");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasZrot");
-  return func_ptr(n, x, incx, y, incy, sc, cs);
-}
-
-void CUBLASWINAPI cublasCsrot(int n, cuComplex *x, int incx, cuComplex *y,
-                              int incy, float c, float s) {
-  using FuncPtr = void(CUBLASWINAPI *)(int, cuComplex *, int, cuComplex *, int,
-                                       float, float);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCsrot");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasCsrot");
-  return func_ptr(n, x, incx, y, incy, c, s);
-}
-
-void CUBLASWINAPI cublasZdrot(int n, cuDoubleComplex *x, int incx,
-                              cuDoubleComplex *y, int incy, double c,
-                              double s) {
-  using FuncPtr = void(CUBLASWINAPI *)(int, cuDoubleComplex *, int,
-                                       cuDoubleComplex *, int, double, double);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZdrot");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasZdrot");
-  return func_ptr(n, x, incx, y, incy, c, s);
-}
-
-void CUBLASWINAPI cublasSrotg(float *sa, float *sb, float *sc, float *ss) {
-  using FuncPtr = void(CUBLASWINAPI *)(float *, float *, float *, float *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSrotg");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasSrotg");
-  return func_ptr(sa, sb, sc, ss);
-}
-
-void CUBLASWINAPI cublasDrotg(double *sa, double *sb, double *sc, double *ss) {
-  using FuncPtr = void(CUBLASWINAPI *)(double *, double *, double *, double *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDrotg");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasDrotg");
-  return func_ptr(sa, sb, sc, ss);
-}
-
-void CUBLASWINAPI cublasCrotg(cuComplex *ca, cuComplex cb, float *sc,
-                              cuComplex *cs) {
-  using FuncPtr =
-      void(CUBLASWINAPI *)(cuComplex *, cuComplex, float *, cuComplex *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCrotg");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasCrotg");
-  return func_ptr(ca, cb, sc, cs);
-}
-
-void CUBLASWINAPI cublasZrotg(cuDoubleComplex *ca, cuDoubleComplex cb,
-                              double *sc, cuDoubleComplex *cs) {
-  using FuncPtr = void(CUBLASWINAPI *)(cuDoubleComplex *, cuDoubleComplex,
-                                       double *, cuDoubleComplex *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZrotg");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasZrotg");
-  return func_ptr(ca, cb, sc, cs);
-}
-
-void CUBLASWINAPI cublasSrotm(int n, float *x, int incx, float *y, int incy,
-                              const float *sparam) {
-  using FuncPtr =
-      void(CUBLASWINAPI *)(int, float *, int, float *, int, const float *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSrotm");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasSrotm");
-  return func_ptr(n, x, incx, y, incy, sparam);
-}
-
-void CUBLASWINAPI cublasDrotm(int n, double *x, int incx, double *y, int incy,
-                              const double *sparam) {
-  using FuncPtr =
-      void(CUBLASWINAPI *)(int, double *, int, double *, int, const double *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDrotm");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasDrotm");
-  return func_ptr(n, x, incx, y, incy, sparam);
-}
-
-void CUBLASWINAPI cublasSrotmg(float *sd1, float *sd2, float *sx1,
-                               const float *sy1, float *sparam) {
-  using FuncPtr =
-      void(CUBLASWINAPI *)(float *, float *, float *, const float *, float *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSrotmg");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasSrotmg");
-  return func_ptr(sd1, sd2, sx1, sy1, sparam);
-}
-
-void CUBLASWINAPI cublasDrotmg(double *sd1, double *sd2, double *sx1,
-                               const double *sy1, double *sparam) {
-  using FuncPtr = void(CUBLASWINAPI *)(double *, double *, double *,
-                                       const double *, double *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDrotmg");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasDrotmg");
-  return func_ptr(sd1, sd2, sx1, sy1, sparam);
-}
-
-void CUBLASWINAPI cublasSgemv(char trans, int m, int n, float alpha,
-                              const float *A, int lda, const float *x, int incx,
-                              float beta, float *y, int incy) {
-  using FuncPtr =
-      void(CUBLASWINAPI *)(char, int, int, float, const float *, int,
-                           const float *, int, float, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSgemv");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasSgemv");
-  return func_ptr(trans, m, n, alpha, A, lda, x, incx, beta, y, incy);
-}
-
-void CUBLASWINAPI cublasDgemv(char trans, int m, int n, double alpha,
-                              const double *A, int lda, const double *x,
-                              int incx, double beta, double *y, int incy) {
-  using FuncPtr =
-      void(CUBLASWINAPI *)(char, int, int, double, const double *, int,
-                           const double *, int, double, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDgemv");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasDgemv");
-  return func_ptr(trans, m, n, alpha, A, lda, x, incx, beta, y, incy);
-}
-
-void CUBLASWINAPI cublasCgemv(char trans, int m, int n, cuComplex alpha,
-                              const cuComplex *A, int lda, const cuComplex *x,
-                              int incx, cuComplex beta, cuComplex *y,
-                              int incy) {
-  using FuncPtr =
-      void(CUBLASWINAPI *)(char, int, int, cuComplex, const cuComplex *, int,
-                           const cuComplex *, int, cuComplex, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCgemv");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasCgemv");
-  return func_ptr(trans, m, n, alpha, A, lda, x, incx, beta, y, incy);
-}
-
-void CUBLASWINAPI cublasZgemv(char trans, int m, int n, cuDoubleComplex alpha,
-                              const cuDoubleComplex *A, int lda,
-                              const cuDoubleComplex *x, int incx,
-                              cuDoubleComplex beta, cuDoubleComplex *y,
-                              int incy) {
-  using FuncPtr = void(CUBLASWINAPI *)(
-      char, int, int, cuDoubleComplex, const cuDoubleComplex *, int,
-      const cuDoubleComplex *, int, cuDoubleComplex, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZgemv");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasZgemv");
-  return func_ptr(trans, m, n, alpha, A, lda, x, incx, beta, y, incy);
-}
-
-void CUBLASWINAPI cublasSgbmv(char trans, int m, int n, int kl, int ku,
-                              float alpha, const float *A, int lda,
-                              const float *x, int incx, float beta, float *y,
-                              int incy) {
-  using FuncPtr =
-      void(CUBLASWINAPI *)(char, int, int, int, int, float, const float *, int,
-                           const float *, int, float, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSgbmv");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasSgbmv");
-  return func_ptr(trans, m, n, kl, ku, alpha, A, lda, x, incx, beta, y, incy);
-}
-
-void CUBLASWINAPI cublasDgbmv(char trans, int m, int n, int kl, int ku,
-                              double alpha, const double *A, int lda,
-                              const double *x, int incx, double beta, double *y,
-                              int incy) {
-  using FuncPtr =
-      void(CUBLASWINAPI *)(char, int, int, int, int, double, const double *,
-                           int, const double *, int, double, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDgbmv");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasDgbmv");
-  return func_ptr(trans, m, n, kl, ku, alpha, A, lda, x, incx, beta, y, incy);
-}
-
-void CUBLASWINAPI cublasCgbmv(char trans, int m, int n, int kl, int ku,
-                              cuComplex alpha, const cuComplex *A, int lda,
-                              const cuComplex *x, int incx, cuComplex beta,
-                              cuComplex *y, int incy) {
-  using FuncPtr = void(CUBLASWINAPI *)(
-      char, int, int, int, int, cuComplex, const cuComplex *, int,
-      const cuComplex *, int, cuComplex, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCgbmv");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasCgbmv");
-  return func_ptr(trans, m, n, kl, ku, alpha, A, lda, x, incx, beta, y, incy);
-}
-
-void CUBLASWINAPI cublasZgbmv(char trans, int m, int n, int kl, int ku,
-                              cuDoubleComplex alpha, const cuDoubleComplex *A,
-                              int lda, const cuDoubleComplex *x, int incx,
-                              cuDoubleComplex beta, cuDoubleComplex *y,
-                              int incy) {
-  using FuncPtr = void(CUBLASWINAPI *)(
-      char, int, int, int, int, cuDoubleComplex, const cuDoubleComplex *, int,
-      const cuDoubleComplex *, int, cuDoubleComplex, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZgbmv");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasZgbmv");
-  return func_ptr(trans, m, n, kl, ku, alpha, A, lda, x, incx, beta, y, incy);
-}
-
-void CUBLASWINAPI cublasStrmv(char uplo, char trans, char diag, int n,
-                              const float *A, int lda, float *x, int incx) {
-  using FuncPtr = void(CUBLASWINAPI *)(char, char, char, int, const float *,
-                                       int, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasStrmv");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasStrmv");
-  return func_ptr(uplo, trans, diag, n, A, lda, x, incx);
-}
-
-void CUBLASWINAPI cublasDtrmv(char uplo, char trans, char diag, int n,
-                              const double *A, int lda, double *x, int incx) {
-  using FuncPtr = void(CUBLASWINAPI *)(char, char, char, int, const double *,
-                                       int, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDtrmv");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasDtrmv");
-  return func_ptr(uplo, trans, diag, n, A, lda, x, incx);
-}
-
-void CUBLASWINAPI cublasCtrmv(char uplo, char trans, char diag, int n,
-                              const cuComplex *A, int lda, cuComplex *x,
-                              int incx) {
-  using FuncPtr = void(CUBLASWINAPI *)(char, char, char, int, const cuComplex *,
-                                       int, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCtrmv");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasCtrmv");
-  return func_ptr(uplo, trans, diag, n, A, lda, x, incx);
-}
-
-void CUBLASWINAPI cublasZtrmv(char uplo, char trans, char diag, int n,
-                              const cuDoubleComplex *A, int lda,
-                              cuDoubleComplex *x, int incx) {
-  using FuncPtr =
-      void(CUBLASWINAPI *)(char, char, char, int, const cuDoubleComplex *, int,
-                           cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZtrmv");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasZtrmv");
-  return func_ptr(uplo, trans, diag, n, A, lda, x, incx);
-}
-
-void CUBLASWINAPI cublasStbmv(char uplo, char trans, char diag, int n, int k,
-                              const float *A, int lda, float *x, int incx) {
-  using FuncPtr = void(CUBLASWINAPI *)(char, char, char, int, int,
-                                       const float *, int, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasStbmv");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasStbmv");
-  return func_ptr(uplo, trans, diag, n, k, A, lda, x, incx);
-}
-
-void CUBLASWINAPI cublasDtbmv(char uplo, char trans, char diag, int n, int k,
-                              const double *A, int lda, double *x, int incx) {
-  using FuncPtr = void(CUBLASWINAPI *)(char, char, char, int, int,
-                                       const double *, int, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDtbmv");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasDtbmv");
-  return func_ptr(uplo, trans, diag, n, k, A, lda, x, incx);
-}
-
-void CUBLASWINAPI cublasCtbmv(char uplo, char trans, char diag, int n, int k,
-                              const cuComplex *A, int lda, cuComplex *x,
-                              int incx) {
-  using FuncPtr = void(CUBLASWINAPI *)(
-      char, char, char, int, int, const cuComplex *, int, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCtbmv");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasCtbmv");
-  return func_ptr(uplo, trans, diag, n, k, A, lda, x, incx);
-}
-
-void CUBLASWINAPI cublasZtbmv(char uplo, char trans, char diag, int n, int k,
-                              const cuDoubleComplex *A, int lda,
-                              cuDoubleComplex *x, int incx) {
-  using FuncPtr =
-      void(CUBLASWINAPI *)(char, char, char, int, int, const cuDoubleComplex *,
-                           int, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZtbmv");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasZtbmv");
-  return func_ptr(uplo, trans, diag, n, k, A, lda, x, incx);
-}
-
-void CUBLASWINAPI cublasStpmv(char uplo, char trans, char diag, int n,
-                              const float *AP, float *x, int incx) {
-  using FuncPtr =
-      void(CUBLASWINAPI *)(char, char, char, int, const float *, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasStpmv");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasStpmv");
-  return func_ptr(uplo, trans, diag, n, AP, x, incx);
-}
-
-void CUBLASWINAPI cublasDtpmv(char uplo, char trans, char diag, int n,
-                              const double *AP, double *x, int incx) {
-  using FuncPtr = void(CUBLASWINAPI *)(char, char, char, int, const double *,
-                                       double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDtpmv");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasDtpmv");
-  return func_ptr(uplo, trans, diag, n, AP, x, incx);
-}
-
-void CUBLASWINAPI cublasCtpmv(char uplo, char trans, char diag, int n,
-                              const cuComplex *AP, cuComplex *x, int incx) {
-  using FuncPtr = void(CUBLASWINAPI *)(char, char, char, int, const cuComplex *,
-                                       cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCtpmv");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasCtpmv");
-  return func_ptr(uplo, trans, diag, n, AP, x, incx);
-}
-
-void CUBLASWINAPI cublasZtpmv(char uplo, char trans, char diag, int n,
-                              const cuDoubleComplex *AP, cuDoubleComplex *x,
-                              int incx) {
-  using FuncPtr = void(CUBLASWINAPI *)(
-      char, char, char, int, const cuDoubleComplex *, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZtpmv");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasZtpmv");
-  return func_ptr(uplo, trans, diag, n, AP, x, incx);
-}
-
-void CUBLASWINAPI cublasStrsv(char uplo, char trans, char diag, int n,
-                              const float *A, int lda, float *x, int incx) {
-  using FuncPtr = void(CUBLASWINAPI *)(char, char, char, int, const float *,
-                                       int, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasStrsv");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasStrsv");
-  return func_ptr(uplo, trans, diag, n, A, lda, x, incx);
-}
-
-void CUBLASWINAPI cublasDtrsv(char uplo, char trans, char diag, int n,
-                              const double *A, int lda, double *x, int incx) {
-  using FuncPtr = void(CUBLASWINAPI *)(char, char, char, int, const double *,
-                                       int, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDtrsv");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasDtrsv");
-  return func_ptr(uplo, trans, diag, n, A, lda, x, incx);
-}
-
-void CUBLASWINAPI cublasCtrsv(char uplo, char trans, char diag, int n,
-                              const cuComplex *A, int lda, cuComplex *x,
-                              int incx) {
-  using FuncPtr = void(CUBLASWINAPI *)(char, char, char, int, const cuComplex *,
-                                       int, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCtrsv");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasCtrsv");
-  return func_ptr(uplo, trans, diag, n, A, lda, x, incx);
-}
-
-void CUBLASWINAPI cublasZtrsv(char uplo, char trans, char diag, int n,
-                              const cuDoubleComplex *A, int lda,
-                              cuDoubleComplex *x, int incx) {
-  using FuncPtr =
-      void(CUBLASWINAPI *)(char, char, char, int, const cuDoubleComplex *, int,
-                           cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZtrsv");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasZtrsv");
-  return func_ptr(uplo, trans, diag, n, A, lda, x, incx);
-}
-
-void CUBLASWINAPI cublasStpsv(char uplo, char trans, char diag, int n,
-                              const float *AP, float *x, int incx) {
-  using FuncPtr =
-      void(CUBLASWINAPI *)(char, char, char, int, const float *, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasStpsv");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasStpsv");
-  return func_ptr(uplo, trans, diag, n, AP, x, incx);
-}
-
-void CUBLASWINAPI cublasDtpsv(char uplo, char trans, char diag, int n,
-                              const double *AP, double *x, int incx) {
-  using FuncPtr = void(CUBLASWINAPI *)(char, char, char, int, const double *,
-                                       double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDtpsv");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasDtpsv");
-  return func_ptr(uplo, trans, diag, n, AP, x, incx);
-}
-
-void CUBLASWINAPI cublasCtpsv(char uplo, char trans, char diag, int n,
-                              const cuComplex *AP, cuComplex *x, int incx) {
-  using FuncPtr = void(CUBLASWINAPI *)(char, char, char, int, const cuComplex *,
-                                       cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCtpsv");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasCtpsv");
-  return func_ptr(uplo, trans, diag, n, AP, x, incx);
-}
-
-void CUBLASWINAPI cublasZtpsv(char uplo, char trans, char diag, int n,
-                              const cuDoubleComplex *AP, cuDoubleComplex *x,
-                              int incx) {
-  using FuncPtr = void(CUBLASWINAPI *)(
-      char, char, char, int, const cuDoubleComplex *, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZtpsv");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasZtpsv");
-  return func_ptr(uplo, trans, diag, n, AP, x, incx);
-}
-
-void CUBLASWINAPI cublasStbsv(char uplo, char trans, char diag, int n, int k,
-                              const float *A, int lda, float *x, int incx) {
-  using FuncPtr = void(CUBLASWINAPI *)(char, char, char, int, int,
-                                       const float *, int, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasStbsv");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasStbsv");
-  return func_ptr(uplo, trans, diag, n, k, A, lda, x, incx);
-}
-
-void CUBLASWINAPI cublasDtbsv(char uplo, char trans, char diag, int n, int k,
-                              const double *A, int lda, double *x, int incx) {
-  using FuncPtr = void(CUBLASWINAPI *)(char, char, char, int, int,
-                                       const double *, int, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDtbsv");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasDtbsv");
-  return func_ptr(uplo, trans, diag, n, k, A, lda, x, incx);
-}
-
-void CUBLASWINAPI cublasCtbsv(char uplo, char trans, char diag, int n, int k,
-                              const cuComplex *A, int lda, cuComplex *x,
-                              int incx) {
-  using FuncPtr = void(CUBLASWINAPI *)(
-      char, char, char, int, int, const cuComplex *, int, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCtbsv");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasCtbsv");
-  return func_ptr(uplo, trans, diag, n, k, A, lda, x, incx);
-}
-
-void CUBLASWINAPI cublasZtbsv(char uplo, char trans, char diag, int n, int k,
-                              const cuDoubleComplex *A, int lda,
-                              cuDoubleComplex *x, int incx) {
-  using FuncPtr =
-      void(CUBLASWINAPI *)(char, char, char, int, int, const cuDoubleComplex *,
-                           int, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZtbsv");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasZtbsv");
-  return func_ptr(uplo, trans, diag, n, k, A, lda, x, incx);
-}
-
-void CUBLASWINAPI cublasSsymv(char uplo, int n, float alpha, const float *A,
-                              int lda, const float *x, int incx, float beta,
-                              float *y, int incy) {
-  using FuncPtr = void(CUBLASWINAPI *)(char, int, float, const float *, int,
-                                       const float *, int, float, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSsymv");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasSsymv");
-  return func_ptr(uplo, n, alpha, A, lda, x, incx, beta, y, incy);
-}
-
-void CUBLASWINAPI cublasDsymv(char uplo, int n, double alpha, const double *A,
-                              int lda, const double *x, int incx, double beta,
-                              double *y, int incy) {
-  using FuncPtr =
-      void(CUBLASWINAPI *)(char, int, double, const double *, int,
-                           const double *, int, double, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDsymv");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasDsymv");
-  return func_ptr(uplo, n, alpha, A, lda, x, incx, beta, y, incy);
-}
-
-void CUBLASWINAPI cublasChemv(char uplo, int n, cuComplex alpha,
-                              const cuComplex *A, int lda, const cuComplex *x,
-                              int incx, cuComplex beta, cuComplex *y,
-                              int incy) {
-  using FuncPtr =
-      void(CUBLASWINAPI *)(char, int, cuComplex, const cuComplex *, int,
-                           const cuComplex *, int, cuComplex, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasChemv");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasChemv");
-  return func_ptr(uplo, n, alpha, A, lda, x, incx, beta, y, incy);
-}
-
-void CUBLASWINAPI cublasZhemv(char uplo, int n, cuDoubleComplex alpha,
-                              const cuDoubleComplex *A, int lda,
-                              const cuDoubleComplex *x, int incx,
-                              cuDoubleComplex beta, cuDoubleComplex *y,
-                              int incy) {
-  using FuncPtr = void(CUBLASWINAPI *)(
-      char, int, cuDoubleComplex, const cuDoubleComplex *, int,
-      const cuDoubleComplex *, int, cuDoubleComplex, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZhemv");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasZhemv");
-  return func_ptr(uplo, n, alpha, A, lda, x, incx, beta, y, incy);
-}
-
-void CUBLASWINAPI cublasSsbmv(char uplo, int n, int k, float alpha,
-                              const float *A, int lda, const float *x, int incx,
-                              float beta, float *y, int incy) {
-  using FuncPtr =
-      void(CUBLASWINAPI *)(char, int, int, float, const float *, int,
-                           const float *, int, float, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSsbmv");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasSsbmv");
-  return func_ptr(uplo, n, k, alpha, A, lda, x, incx, beta, y, incy);
-}
-
-void CUBLASWINAPI cublasDsbmv(char uplo, int n, int k, double alpha,
-                              const double *A, int lda, const double *x,
-                              int incx, double beta, double *y, int incy) {
-  using FuncPtr =
-      void(CUBLASWINAPI *)(char, int, int, double, const double *, int,
-                           const double *, int, double, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDsbmv");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasDsbmv");
-  return func_ptr(uplo, n, k, alpha, A, lda, x, incx, beta, y, incy);
-}
-
-void CUBLASWINAPI cublasChbmv(char uplo, int n, int k, cuComplex alpha,
-                              const cuComplex *A, int lda, const cuComplex *x,
-                              int incx, cuComplex beta, cuComplex *y,
-                              int incy) {
-  using FuncPtr =
-      void(CUBLASWINAPI *)(char, int, int, cuComplex, const cuComplex *, int,
-                           const cuComplex *, int, cuComplex, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasChbmv");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasChbmv");
-  return func_ptr(uplo, n, k, alpha, A, lda, x, incx, beta, y, incy);
-}
-
-void CUBLASWINAPI cublasZhbmv(char uplo, int n, int k, cuDoubleComplex alpha,
-                              const cuDoubleComplex *A, int lda,
-                              const cuDoubleComplex *x, int incx,
-                              cuDoubleComplex beta, cuDoubleComplex *y,
-                              int incy) {
-  using FuncPtr = void(CUBLASWINAPI *)(
-      char, int, int, cuDoubleComplex, const cuDoubleComplex *, int,
-      const cuDoubleComplex *, int, cuDoubleComplex, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZhbmv");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasZhbmv");
-  return func_ptr(uplo, n, k, alpha, A, lda, x, incx, beta, y, incy);
-}
-
-void CUBLASWINAPI cublasSspmv(char uplo, int n, float alpha, const float *AP,
-                              const float *x, int incx, float beta, float *y,
-                              int incy) {
-  using FuncPtr = void(CUBLASWINAPI *)(char, int, float, const float *,
-                                       const float *, int, float, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSspmv");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasSspmv");
-  return func_ptr(uplo, n, alpha, AP, x, incx, beta, y, incy);
-}
-
-void CUBLASWINAPI cublasDspmv(char uplo, int n, double alpha, const double *AP,
-                              const double *x, int incx, double beta, double *y,
-                              int incy) {
-  using FuncPtr =
-      void(CUBLASWINAPI *)(char, int, double, const double *, const double *,
-                           int, double, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDspmv");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasDspmv");
-  return func_ptr(uplo, n, alpha, AP, x, incx, beta, y, incy);
-}
-
-void CUBLASWINAPI cublasChpmv(char uplo, int n, cuComplex alpha,
-                              const cuComplex *AP, const cuComplex *x, int incx,
-                              cuComplex beta, cuComplex *y, int incy) {
-  using FuncPtr =
-      void(CUBLASWINAPI *)(char, int, cuComplex, const cuComplex *,
-                           const cuComplex *, int, cuComplex, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasChpmv");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasChpmv");
-  return func_ptr(uplo, n, alpha, AP, x, incx, beta, y, incy);
-}
-
-void CUBLASWINAPI cublasZhpmv(char uplo, int n, cuDoubleComplex alpha,
-                              const cuDoubleComplex *AP,
-                              const cuDoubleComplex *x, int incx,
-                              cuDoubleComplex beta, cuDoubleComplex *y,
-                              int incy) {
-  using FuncPtr = void(CUBLASWINAPI *)(
-      char, int, cuDoubleComplex, const cuDoubleComplex *,
-      const cuDoubleComplex *, int, cuDoubleComplex, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZhpmv");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasZhpmv");
-  return func_ptr(uplo, n, alpha, AP, x, incx, beta, y, incy);
-}
-
-void CUBLASWINAPI cublasSger(int m, int n, float alpha, const float *x,
-                             int incx, const float *y, int incy, float *A,
-                             int lda) {
-  using FuncPtr = void(CUBLASWINAPI *)(int, int, float, const float *, int,
-                                       const float *, int, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSger");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasSger");
-  return func_ptr(m, n, alpha, x, incx, y, incy, A, lda);
-}
-
-void CUBLASWINAPI cublasDger(int m, int n, double alpha, const double *x,
-                             int incx, const double *y, int incy, double *A,
-                             int lda) {
-  using FuncPtr = void(CUBLASWINAPI *)(int, int, double, const double *, int,
-                                       const double *, int, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDger");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasDger");
-  return func_ptr(m, n, alpha, x, incx, y, incy, A, lda);
-}
-
-void CUBLASWINAPI cublasCgeru(int m, int n, cuComplex alpha, const cuComplex *x,
-                              int incx, const cuComplex *y, int incy,
-                              cuComplex *A, int lda) {
-  using FuncPtr =
-      void(CUBLASWINAPI *)(int, int, cuComplex, const cuComplex *, int,
-                           const cuComplex *, int, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCgeru");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasCgeru");
-  return func_ptr(m, n, alpha, x, incx, y, incy, A, lda);
-}
-
-void CUBLASWINAPI cublasCgerc(int m, int n, cuComplex alpha, const cuComplex *x,
-                              int incx, const cuComplex *y, int incy,
-                              cuComplex *A, int lda) {
-  using FuncPtr =
-      void(CUBLASWINAPI *)(int, int, cuComplex, const cuComplex *, int,
-                           const cuComplex *, int, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCgerc");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasCgerc");
-  return func_ptr(m, n, alpha, x, incx, y, incy, A, lda);
-}
-
-void CUBLASWINAPI cublasZgeru(int m, int n, cuDoubleComplex alpha,
-                              const cuDoubleComplex *x, int incx,
-                              const cuDoubleComplex *y, int incy,
-                              cuDoubleComplex *A, int lda) {
-  using FuncPtr = void(CUBLASWINAPI *)(
-      int, int, cuDoubleComplex, const cuDoubleComplex *, int,
-      const cuDoubleComplex *, int, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZgeru");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasZgeru");
-  return func_ptr(m, n, alpha, x, incx, y, incy, A, lda);
-}
-
-void CUBLASWINAPI cublasZgerc(int m, int n, cuDoubleComplex alpha,
-                              const cuDoubleComplex *x, int incx,
-                              const cuDoubleComplex *y, int incy,
-                              cuDoubleComplex *A, int lda) {
-  using FuncPtr = void(CUBLASWINAPI *)(
-      int, int, cuDoubleComplex, const cuDoubleComplex *, int,
-      const cuDoubleComplex *, int, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZgerc");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasZgerc");
-  return func_ptr(m, n, alpha, x, incx, y, incy, A, lda);
-}
-
-void CUBLASWINAPI cublasSsyr(char uplo, int n, float alpha, const float *x,
-                             int incx, float *A, int lda) {
-  using FuncPtr =
-      void(CUBLASWINAPI *)(char, int, float, const float *, int, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSsyr");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasSsyr");
-  return func_ptr(uplo, n, alpha, x, incx, A, lda);
-}
-
-void CUBLASWINAPI cublasDsyr(char uplo, int n, double alpha, const double *x,
-                             int incx, double *A, int lda) {
-  using FuncPtr = void(CUBLASWINAPI *)(char, int, double, const double *, int,
-                                       double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDsyr");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasDsyr");
-  return func_ptr(uplo, n, alpha, x, incx, A, lda);
-}
-
-void CUBLASWINAPI cublasCher(char uplo, int n, float alpha, const cuComplex *x,
-                             int incx, cuComplex *A, int lda) {
-  using FuncPtr = void(CUBLASWINAPI *)(char, int, float, const cuComplex *, int,
-                                       cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCher");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasCher");
-  return func_ptr(uplo, n, alpha, x, incx, A, lda);
-}
-
-void CUBLASWINAPI cublasZher(char uplo, int n, double alpha,
-                             const cuDoubleComplex *x, int incx,
-                             cuDoubleComplex *A, int lda) {
-  using FuncPtr = void(CUBLASWINAPI *)(
-      char, int, double, const cuDoubleComplex *, int, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZher");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasZher");
-  return func_ptr(uplo, n, alpha, x, incx, A, lda);
-}
-
-void CUBLASWINAPI cublasSspr(char uplo, int n, float alpha, const float *x,
-                             int incx, float *AP) {
-  using FuncPtr =
-      void(CUBLASWINAPI *)(char, int, float, const float *, int, float *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSspr");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasSspr");
-  return func_ptr(uplo, n, alpha, x, incx, AP);
-}
-
-void CUBLASWINAPI cublasDspr(char uplo, int n, double alpha, const double *x,
-                             int incx, double *AP) {
-  using FuncPtr =
-      void(CUBLASWINAPI *)(char, int, double, const double *, int, double *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDspr");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasDspr");
-  return func_ptr(uplo, n, alpha, x, incx, AP);
-}
-
-void CUBLASWINAPI cublasChpr(char uplo, int n, float alpha, const cuComplex *x,
-                             int incx, cuComplex *AP) {
-  using FuncPtr = void(CUBLASWINAPI *)(char, int, float, const cuComplex *, int,
-                                       cuComplex *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasChpr");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasChpr");
-  return func_ptr(uplo, n, alpha, x, incx, AP);
-}
-
-void CUBLASWINAPI cublasZhpr(char uplo, int n, double alpha,
-                             const cuDoubleComplex *x, int incx,
-                             cuDoubleComplex *AP) {
-  using FuncPtr = void(CUBLASWINAPI *)(
-      char, int, double, const cuDoubleComplex *, int, cuDoubleComplex *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZhpr");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasZhpr");
-  return func_ptr(uplo, n, alpha, x, incx, AP);
-}
-
-void CUBLASWINAPI cublasSsyr2(char uplo, int n, float alpha, const float *x,
-                              int incx, const float *y, int incy, float *A,
-                              int lda) {
-  using FuncPtr = void(CUBLASWINAPI *)(char, int, float, const float *, int,
-                                       const float *, int, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSsyr2");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasSsyr2");
-  return func_ptr(uplo, n, alpha, x, incx, y, incy, A, lda);
-}
-
-void CUBLASWINAPI cublasDsyr2(char uplo, int n, double alpha, const double *x,
-                              int incx, const double *y, int incy, double *A,
-                              int lda) {
-  using FuncPtr = void(CUBLASWINAPI *)(char, int, double, const double *, int,
-                                       const double *, int, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDsyr2");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasDsyr2");
-  return func_ptr(uplo, n, alpha, x, incx, y, incy, A, lda);
-}
-
-void CUBLASWINAPI cublasCher2(char uplo, int n, cuComplex alpha,
-                              const cuComplex *x, int incx, const cuComplex *y,
-                              int incy, cuComplex *A, int lda) {
-  using FuncPtr =
-      void(CUBLASWINAPI *)(char, int, cuComplex, const cuComplex *, int,
-                           const cuComplex *, int, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCher2");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasCher2");
-  return func_ptr(uplo, n, alpha, x, incx, y, incy, A, lda);
-}
-
-void CUBLASWINAPI cublasZher2(char uplo, int n, cuDoubleComplex alpha,
-                              const cuDoubleComplex *x, int incx,
-                              const cuDoubleComplex *y, int incy,
-                              cuDoubleComplex *A, int lda) {
-  using FuncPtr = void(CUBLASWINAPI *)(
-      char, int, cuDoubleComplex, const cuDoubleComplex *, int,
-      const cuDoubleComplex *, int, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZher2");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasZher2");
-  return func_ptr(uplo, n, alpha, x, incx, y, incy, A, lda);
-}
-
-void CUBLASWINAPI cublasSspr2(char uplo, int n, float alpha, const float *x,
-                              int incx, const float *y, int incy, float *AP) {
-  using FuncPtr = void(CUBLASWINAPI *)(char, int, float, const float *, int,
-                                       const float *, int, float *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSspr2");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasSspr2");
-  return func_ptr(uplo, n, alpha, x, incx, y, incy, AP);
-}
-
-void CUBLASWINAPI cublasDspr2(char uplo, int n, double alpha, const double *x,
-                              int incx, const double *y, int incy, double *AP) {
-  using FuncPtr = void(CUBLASWINAPI *)(char, int, double, const double *, int,
-                                       const double *, int, double *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDspr2");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasDspr2");
-  return func_ptr(uplo, n, alpha, x, incx, y, incy, AP);
-}
-
-void CUBLASWINAPI cublasChpr2(char uplo, int n, cuComplex alpha,
-                              const cuComplex *x, int incx, const cuComplex *y,
-                              int incy, cuComplex *AP) {
-  using FuncPtr =
-      void(CUBLASWINAPI *)(char, int, cuComplex, const cuComplex *, int,
-                           const cuComplex *, int, cuComplex *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasChpr2");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasChpr2");
-  return func_ptr(uplo, n, alpha, x, incx, y, incy, AP);
-}
-
-void CUBLASWINAPI cublasZhpr2(char uplo, int n, cuDoubleComplex alpha,
-                              const cuDoubleComplex *x, int incx,
-                              const cuDoubleComplex *y, int incy,
-                              cuDoubleComplex *AP) {
-  using FuncPtr = void(CUBLASWINAPI *)(
-      char, int, cuDoubleComplex, const cuDoubleComplex *, int,
-      const cuDoubleComplex *, int, cuDoubleComplex *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZhpr2");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasZhpr2");
-  return func_ptr(uplo, n, alpha, x, incx, y, incy, AP);
-}
-
-void CUBLASWINAPI cublasSgemm(char transa, char transb, int m, int n, int k,
-                              float alpha, const float *A, int lda,
-                              const float *B, int ldb, float beta, float *C,
-                              int ldc) {
-  using FuncPtr =
-      void(CUBLASWINAPI *)(char, char, int, int, int, float, const float *, int,
-                           const float *, int, float, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSgemm");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasSgemm");
-  return func_ptr(transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
-}
-
-void CUBLASWINAPI cublasDgemm(char transa, char transb, int m, int n, int k,
-                              double alpha, const double *A, int lda,
-                              const double *B, int ldb, double beta, double *C,
-                              int ldc) {
-  using FuncPtr =
-      void(CUBLASWINAPI *)(char, char, int, int, int, double, const double *,
-                           int, const double *, int, double, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDgemm");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasDgemm");
-  return func_ptr(transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
-}
-
-void CUBLASWINAPI cublasCgemm(char transa, char transb, int m, int n, int k,
-                              cuComplex alpha, const cuComplex *A, int lda,
-                              const cuComplex *B, int ldb, cuComplex beta,
-                              cuComplex *C, int ldc) {
-  using FuncPtr = void(CUBLASWINAPI *)(
-      char, char, int, int, int, cuComplex, const cuComplex *, int,
-      const cuComplex *, int, cuComplex, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCgemm");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasCgemm");
-  return func_ptr(transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
-}
-
-void CUBLASWINAPI cublasZgemm(char transa, char transb, int m, int n, int k,
-                              cuDoubleComplex alpha, const cuDoubleComplex *A,
-                              int lda, const cuDoubleComplex *B, int ldb,
-                              cuDoubleComplex beta, cuDoubleComplex *C,
-                              int ldc) {
-  using FuncPtr = void(CUBLASWINAPI *)(
-      char, char, int, int, int, cuDoubleComplex, const cuDoubleComplex *, int,
-      const cuDoubleComplex *, int, cuDoubleComplex, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZgemm");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasZgemm");
-  return func_ptr(transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
-}
-
-void CUBLASWINAPI cublasSsyrk(char uplo, char trans, int n, int k, float alpha,
-                              const float *A, int lda, float beta, float *C,
-                              int ldc) {
-  using FuncPtr = void(CUBLASWINAPI *)(char, char, int, int, float,
-                                       const float *, int, float, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSsyrk");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasSsyrk");
-  return func_ptr(uplo, trans, n, k, alpha, A, lda, beta, C, ldc);
-}
-
-void CUBLASWINAPI cublasDsyrk(char uplo, char trans, int n, int k, double alpha,
-                              const double *A, int lda, double beta, double *C,
-                              int ldc) {
-  using FuncPtr = void(CUBLASWINAPI *)(
-      char, char, int, int, double, const double *, int, double, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDsyrk");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasDsyrk");
-  return func_ptr(uplo, trans, n, k, alpha, A, lda, beta, C, ldc);
-}
-
-void CUBLASWINAPI cublasCsyrk(char uplo, char trans, int n, int k,
-                              cuComplex alpha, const cuComplex *A, int lda,
-                              cuComplex beta, cuComplex *C, int ldc) {
-  using FuncPtr =
-      void(CUBLASWINAPI *)(char, char, int, int, cuComplex, const cuComplex *,
-                           int, cuComplex, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCsyrk");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasCsyrk");
-  return func_ptr(uplo, trans, n, k, alpha, A, lda, beta, C, ldc);
-}
-
-void CUBLASWINAPI cublasZsyrk(char uplo, char trans, int n, int k,
-                              cuDoubleComplex alpha, const cuDoubleComplex *A,
-                              int lda, cuDoubleComplex beta, cuDoubleComplex *C,
-                              int ldc) {
-  using FuncPtr = void(CUBLASWINAPI *)(char, char, int, int, cuDoubleComplex,
-                                       const cuDoubleComplex *, int,
-                                       cuDoubleComplex, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZsyrk");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasZsyrk");
-  return func_ptr(uplo, trans, n, k, alpha, A, lda, beta, C, ldc);
-}
-
-void CUBLASWINAPI cublasCherk(char uplo, char trans, int n, int k, float alpha,
-                              const cuComplex *A, int lda, float beta,
-                              cuComplex *C, int ldc) {
-  using FuncPtr =
-      void(CUBLASWINAPI *)(char, char, int, int, float, const cuComplex *, int,
-                           float, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCherk");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasCherk");
-  return func_ptr(uplo, trans, n, k, alpha, A, lda, beta, C, ldc);
-}
-
-void CUBLASWINAPI cublasZherk(char uplo, char trans, int n, int k, double alpha,
-                              const cuDoubleComplex *A, int lda, double beta,
-                              cuDoubleComplex *C, int ldc) {
-  using FuncPtr = void(CUBLASWINAPI *)(char, char, int, int, double,
-                                       const cuDoubleComplex *, int, double,
-                                       cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZherk");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasZherk");
-  return func_ptr(uplo, trans, n, k, alpha, A, lda, beta, C, ldc);
-}
-
-void CUBLASWINAPI cublasSsyr2k(char uplo, char trans, int n, int k, float alpha,
-                               const float *A, int lda, const float *B, int ldb,
-                               float beta, float *C, int ldc) {
-  using FuncPtr =
-      void(CUBLASWINAPI *)(char, char, int, int, float, const float *, int,
-                           const float *, int, float, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSsyr2k");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasSsyr2k");
-  return func_ptr(uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
-}
-
-void CUBLASWINAPI cublasDsyr2k(char uplo, char trans, int n, int k,
-                               double alpha, const double *A, int lda,
-                               const double *B, int ldb, double beta, double *C,
-                               int ldc) {
-  using FuncPtr =
-      void(CUBLASWINAPI *)(char, char, int, int, double, const double *, int,
-                           const double *, int, double, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDsyr2k");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasDsyr2k");
-  return func_ptr(uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
-}
-
-void CUBLASWINAPI cublasCsyr2k(char uplo, char trans, int n, int k,
-                               cuComplex alpha, const cuComplex *A, int lda,
-                               const cuComplex *B, int ldb, cuComplex beta,
-                               cuComplex *C, int ldc) {
-  using FuncPtr = void(CUBLASWINAPI *)(
-      char, char, int, int, cuComplex, const cuComplex *, int,
-      const cuComplex *, int, cuComplex, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCsyr2k");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasCsyr2k");
-  return func_ptr(uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
-}
-
-void CUBLASWINAPI cublasZsyr2k(char uplo, char trans, int n, int k,
-                               cuDoubleComplex alpha, const cuDoubleComplex *A,
-                               int lda, const cuDoubleComplex *B, int ldb,
-                               cuDoubleComplex beta, cuDoubleComplex *C,
-                               int ldc) {
-  using FuncPtr = void(CUBLASWINAPI *)(
-      char, char, int, int, cuDoubleComplex, const cuDoubleComplex *, int,
-      const cuDoubleComplex *, int, cuDoubleComplex, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZsyr2k");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasZsyr2k");
-  return func_ptr(uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
-}
-
-void CUBLASWINAPI cublasCher2k(char uplo, char trans, int n, int k,
-                               cuComplex alpha, const cuComplex *A, int lda,
-                               const cuComplex *B, int ldb, float beta,
-                               cuComplex *C, int ldc) {
-  using FuncPtr = void(CUBLASWINAPI *)(
-      char, char, int, int, cuComplex, const cuComplex *, int,
-      const cuComplex *, int, float, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCher2k");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasCher2k");
-  return func_ptr(uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
-}
-
-void CUBLASWINAPI cublasZher2k(char uplo, char trans, int n, int k,
-                               cuDoubleComplex alpha, const cuDoubleComplex *A,
-                               int lda, const cuDoubleComplex *B, int ldb,
-                               double beta, cuDoubleComplex *C, int ldc) {
-  using FuncPtr = void(CUBLASWINAPI *)(
-      char, char, int, int, cuDoubleComplex, const cuDoubleComplex *, int,
-      const cuDoubleComplex *, int, double, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZher2k");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasZher2k");
-  return func_ptr(uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
-}
-
-void CUBLASWINAPI cublasSsymm(char side, char uplo, int m, int n, float alpha,
-                              const float *A, int lda, const float *B, int ldb,
-                              float beta, float *C, int ldc) {
-  using FuncPtr =
-      void(CUBLASWINAPI *)(char, char, int, int, float, const float *, int,
-                           const float *, int, float, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSsymm");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasSsymm");
-  return func_ptr(side, uplo, m, n, alpha, A, lda, B, ldb, beta, C, ldc);
-}
-
-void CUBLASWINAPI cublasDsymm(char side, char uplo, int m, int n, double alpha,
-                              const double *A, int lda, const double *B,
-                              int ldb, double beta, double *C, int ldc) {
-  using FuncPtr =
-      void(CUBLASWINAPI *)(char, char, int, int, double, const double *, int,
-                           const double *, int, double, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDsymm");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasDsymm");
-  return func_ptr(side, uplo, m, n, alpha, A, lda, B, ldb, beta, C, ldc);
-}
-
-void CUBLASWINAPI cublasCsymm(char side, char uplo, int m, int n,
-                              cuComplex alpha, const cuComplex *A, int lda,
-                              const cuComplex *B, int ldb, cuComplex beta,
-                              cuComplex *C, int ldc) {
-  using FuncPtr = void(CUBLASWINAPI *)(
-      char, char, int, int, cuComplex, const cuComplex *, int,
-      const cuComplex *, int, cuComplex, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCsymm");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasCsymm");
-  return func_ptr(side, uplo, m, n, alpha, A, lda, B, ldb, beta, C, ldc);
-}
-
-void CUBLASWINAPI cublasZsymm(char side, char uplo, int m, int n,
-                              cuDoubleComplex alpha, const cuDoubleComplex *A,
-                              int lda, const cuDoubleComplex *B, int ldb,
-                              cuDoubleComplex beta, cuDoubleComplex *C,
-                              int ldc) {
-  using FuncPtr = void(CUBLASWINAPI *)(
-      char, char, int, int, cuDoubleComplex, const cuDoubleComplex *, int,
-      const cuDoubleComplex *, int, cuDoubleComplex, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZsymm");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasZsymm");
-  return func_ptr(side, uplo, m, n, alpha, A, lda, B, ldb, beta, C, ldc);
-}
-
-void CUBLASWINAPI cublasChemm(char side, char uplo, int m, int n,
-                              cuComplex alpha, const cuComplex *A, int lda,
-                              const cuComplex *B, int ldb, cuComplex beta,
-                              cuComplex *C, int ldc) {
-  using FuncPtr = void(CUBLASWINAPI *)(
-      char, char, int, int, cuComplex, const cuComplex *, int,
-      const cuComplex *, int, cuComplex, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasChemm");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasChemm");
-  return func_ptr(side, uplo, m, n, alpha, A, lda, B, ldb, beta, C, ldc);
-}
-
-void CUBLASWINAPI cublasZhemm(char side, char uplo, int m, int n,
-                              cuDoubleComplex alpha, const cuDoubleComplex *A,
-                              int lda, const cuDoubleComplex *B, int ldb,
-                              cuDoubleComplex beta, cuDoubleComplex *C,
-                              int ldc) {
-  using FuncPtr = void(CUBLASWINAPI *)(
-      char, char, int, int, cuDoubleComplex, const cuDoubleComplex *, int,
-      const cuDoubleComplex *, int, cuDoubleComplex, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZhemm");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasZhemm");
-  return func_ptr(side, uplo, m, n, alpha, A, lda, B, ldb, beta, C, ldc);
-}
-
-void CUBLASWINAPI cublasStrsm(char side, char uplo, char transa, char diag,
-                              int m, int n, float alpha, const float *A,
-                              int lda, float *B, int ldb) {
-  using FuncPtr = void(CUBLASWINAPI *)(char, char, char, char, int, int, float,
-                                       const float *, int, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasStrsm");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasStrsm");
-  return func_ptr(side, uplo, transa, diag, m, n, alpha, A, lda, B, ldb);
-}
-
-void CUBLASWINAPI cublasDtrsm(char side, char uplo, char transa, char diag,
-                              int m, int n, double alpha, const double *A,
-                              int lda, double *B, int ldb) {
-  using FuncPtr = void(CUBLASWINAPI *)(char, char, char, char, int, int, double,
-                                       const double *, int, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDtrsm");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasDtrsm");
-  return func_ptr(side, uplo, transa, diag, m, n, alpha, A, lda, B, ldb);
-}
-
-void CUBLASWINAPI cublasCtrsm(char side, char uplo, char transa, char diag,
-                              int m, int n, cuComplex alpha, const cuComplex *A,
-                              int lda, cuComplex *B, int ldb) {
-  using FuncPtr =
-      void(CUBLASWINAPI *)(char, char, char, char, int, int, cuComplex,
-                           const cuComplex *, int, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCtrsm");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasCtrsm");
-  return func_ptr(side, uplo, transa, diag, m, n, alpha, A, lda, B, ldb);
-}
-
-void CUBLASWINAPI cublasZtrsm(char side, char uplo, char transa, char diag,
-                              int m, int n, cuDoubleComplex alpha,
-                              const cuDoubleComplex *A, int lda,
-                              cuDoubleComplex *B, int ldb) {
-  using FuncPtr = void(CUBLASWINAPI *)(char, char, char, char, int, int,
-                                       cuDoubleComplex, const cuDoubleComplex *,
-                                       int, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZtrsm");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasZtrsm");
-  return func_ptr(side, uplo, transa, diag, m, n, alpha, A, lda, B, ldb);
-}
-
-void CUBLASWINAPI cublasStrmm(char side, char uplo, char transa, char diag,
-                              int m, int n, float alpha, const float *A,
-                              int lda, float *B, int ldb) {
-  using FuncPtr = void(CUBLASWINAPI *)(char, char, char, char, int, int, float,
-                                       const float *, int, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasStrmm");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasStrmm");
-  return func_ptr(side, uplo, transa, diag, m, n, alpha, A, lda, B, ldb);
-}
-
-void CUBLASWINAPI cublasDtrmm(char side, char uplo, char transa, char diag,
-                              int m, int n, double alpha, const double *A,
-                              int lda, double *B, int ldb) {
-  using FuncPtr = void(CUBLASWINAPI *)(char, char, char, char, int, int, double,
-                                       const double *, int, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDtrmm");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasDtrmm");
-  return func_ptr(side, uplo, transa, diag, m, n, alpha, A, lda, B, ldb);
-}
-
-void CUBLASWINAPI cublasCtrmm(char side, char uplo, char transa, char diag,
-                              int m, int n, cuComplex alpha, const cuComplex *A,
-                              int lda, cuComplex *B, int ldb) {
-  using FuncPtr =
-      void(CUBLASWINAPI *)(char, char, char, char, int, int, cuComplex,
-                           const cuComplex *, int, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCtrmm");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasCtrmm");
-  return func_ptr(side, uplo, transa, diag, m, n, alpha, A, lda, B, ldb);
-}
-
-void CUBLASWINAPI cublasZtrmm(char side, char uplo, char transa, char diag,
-                              int m, int n, cuDoubleComplex alpha,
-                              const cuDoubleComplex *A, int lda,
-                              cuDoubleComplex *B, int ldb) {
-  using FuncPtr = void(CUBLASWINAPI *)(char, char, char, char, int, int,
-                                       cuDoubleComplex, const cuDoubleComplex *,
-                                       int, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZtrmm");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasZtrmm");
-  return func_ptr(side, uplo, transa, diag, m, n, alpha, A, lda, B, ldb);
-}
-
-}  // extern "C"
diff --git a/third_party/xla/third_party/tsl/tsl/cuda/cublas_10_2.inc b/third_party/xla/third_party/tsl/tsl/cuda/cublas_10_2.inc
deleted file mode 100644
index 067ba675288524..00000000000000
--- a/third_party/xla/third_party/tsl/tsl/cuda/cublas_10_2.inc
+++ /dev/null
@@ -1,5023 +0,0 @@
-// Auto-generated, do not edit.
-
-extern "C" {
-
-cublasStatus_t CUBLASWINAPI cublasCreate_v2(cublasHandle_t *handle) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCreate_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle);
-}
-
-cublasStatus_t CUBLASWINAPI cublasDestroy_v2(cublasHandle_t handle) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDestroy_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle);
-}
-
-cublasStatus_t CUBLASWINAPI cublasGetVersion_v2(cublasHandle_t handle,
-                                                int *version) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasGetVersion_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, version);
-}
-
-cublasStatus_t CUBLASWINAPI cublasGetProperty(libraryPropertyType type,
-                                              int *value) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(libraryPropertyType, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasGetProperty");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(type, value);
-}
-
-size_t CUBLASWINAPI cublasGetCudartVersion(void) {
-  using FuncPtr = size_t(CUBLASWINAPI *)();
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasGetCudartVersion");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasGetCudartVersion");
-  return func_ptr();
-}
-
-cublasStatus_t CUBLASWINAPI cublasSetStream_v2(cublasHandle_t handle,
-                                               cudaStream_t streamId) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSetStream_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, streamId);
-}
-
-cublasStatus_t CUBLASWINAPI cublasGetStream_v2(cublasHandle_t handle,
-                                               cudaStream_t *streamId) {
-  using FuncPtr =
-      cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, cudaStream_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasGetStream_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, streamId);
-}
-
-cublasStatus_t CUBLASWINAPI cublasGetPointerMode_v2(cublasHandle_t handle,
-                                                    cublasPointerMode_t *mode) {
-  using FuncPtr =
-      cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, cublasPointerMode_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasGetPointerMode_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, mode);
-}
-
-cublasStatus_t CUBLASWINAPI cublasSetPointerMode_v2(cublasHandle_t handle,
-                                                    cublasPointerMode_t mode) {
-  using FuncPtr =
-      cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, cublasPointerMode_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSetPointerMode_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, mode);
-}
-
-cublasStatus_t CUBLASWINAPI cublasGetAtomicsMode(cublasHandle_t handle,
-                                                 cublasAtomicsMode_t *mode) {
-  using FuncPtr =
-      cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, cublasAtomicsMode_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasGetAtomicsMode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, mode);
-}
-
-cublasStatus_t CUBLASWINAPI cublasSetAtomicsMode(cublasHandle_t handle,
-                                                 cublasAtomicsMode_t mode) {
-  using FuncPtr =
-      cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, cublasAtomicsMode_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSetAtomicsMode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, mode);
-}
-
-cublasStatus_t CUBLASWINAPI cublasGetMathMode(cublasHandle_t handle,
-                                              cublasMath_t *mode) {
-  using FuncPtr =
-      cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, cublasMath_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasGetMathMode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, mode);
-}
-
-cublasStatus_t CUBLASWINAPI cublasSetMathMode(cublasHandle_t handle,
-                                              cublasMath_t mode) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, cublasMath_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSetMathMode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, mode);
-}
-
-cublasStatus_t CUBLASWINAPI cublasLoggerConfigure(int logIsOn, int logToStdOut,
-                                                  int logToStdErr,
-                                                  const char *logFileName) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(int, int, int, const char *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasLoggerConfigure");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(logIsOn, logToStdOut, logToStdErr, logFileName);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasSetLoggerCallback(cublasLogCallback userCallback) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(cublasLogCallback);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSetLoggerCallback");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(userCallback);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasGetLoggerCallback(cublasLogCallback *userCallback) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(cublasLogCallback *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasGetLoggerCallback");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(userCallback);
-}
-
-cublasStatus_t CUBLASWINAPI cublasSetVector(int n, int elemSize, const void *x,
-                                            int incx, void *devicePtr,
-                                            int incy) {
-  using FuncPtr =
-      cublasStatus_t(CUBLASWINAPI *)(int, int, const void *, int, void *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSetVector");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(n, elemSize, x, incx, devicePtr, incy);
-}
-
-cublasStatus_t CUBLASWINAPI cublasGetVector(int n, int elemSize, const void *x,
-                                            int incx, void *y, int incy) {
-  using FuncPtr =
-      cublasStatus_t(CUBLASWINAPI *)(int, int, const void *, int, void *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasGetVector");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(n, elemSize, x, incx, y, incy);
-}
-
-cublasStatus_t CUBLASWINAPI cublasSetMatrix(int rows, int cols, int elemSize,
-                                            const void *A, int lda, void *B,
-                                            int ldb) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(int, int, int, const void *,
-                                                 int, void *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSetMatrix");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(rows, cols, elemSize, A, lda, B, ldb);
-}
-
-cublasStatus_t CUBLASWINAPI cublasGetMatrix(int rows, int cols, int elemSize,
-                                            const void *A, int lda, void *B,
-                                            int ldb) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(int, int, int, const void *,
-                                                 int, void *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasGetMatrix");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(rows, cols, elemSize, A, lda, B, ldb);
-}
-
-cublasStatus_t CUBLASWINAPI cublasSetVectorAsync(int n, int elemSize,
-                                                 const void *hostPtr, int incx,
-                                                 void *devicePtr, int incy,
-                                                 cudaStream_t stream) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(int, int, const void *, int,
-                                                 void *, int, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSetVectorAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(n, elemSize, hostPtr, incx, devicePtr, incy, stream);
-}
-
-cublasStatus_t CUBLASWINAPI cublasGetVectorAsync(int n, int elemSize,
-                                                 const void *devicePtr,
-                                                 int incx, void *hostPtr,
-                                                 int incy,
-                                                 cudaStream_t stream) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(int, int, const void *, int,
-                                                 void *, int, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasGetVectorAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(n, elemSize, devicePtr, incx, hostPtr, incy, stream);
-}
-
-cublasStatus_t CUBLASWINAPI cublasSetMatrixAsync(int rows, int cols,
-                                                 int elemSize, const void *A,
-                                                 int lda, void *B, int ldb,
-                                                 cudaStream_t stream) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      int, int, int, const void *, int, void *, int, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSetMatrixAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(rows, cols, elemSize, A, lda, B, ldb, stream);
-}
-
-cublasStatus_t CUBLASWINAPI cublasGetMatrixAsync(int rows, int cols,
-                                                 int elemSize, const void *A,
-                                                 int lda, void *B, int ldb,
-                                                 cudaStream_t stream) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      int, int, int, const void *, int, void *, int, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasGetMatrixAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(rows, cols, elemSize, A, lda, B, ldb, stream);
-}
-
-void CUBLASWINAPI cublasXerbla(const char *srName, int info) {
-  using FuncPtr = void(CUBLASWINAPI *)(const char *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasXerbla");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasXerbla");
-  return func_ptr(srName, info);
-}
-
-cublasStatus_t CUBLASWINAPI cublasNrm2Ex(cublasHandle_t handle, int n,
-                                         const void *x, cudaDataType xType,
-                                         int incx, void *result,
-                                         cudaDataType resultType,
-                                         cudaDataType executionType) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, const void *, cudaDataType, int, void *,
-      cudaDataType, cudaDataType);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasNrm2Ex");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, x, xType, incx, result, resultType, executionType);
-}
-
-cublasStatus_t CUBLASWINAPI cublasSnrm2_v2(cublasHandle_t handle, int n,
-                                           const float *x, int incx,
-                                           float *result) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, int,
-                                                 const float *, int, float *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSnrm2_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, x, incx, result);
-}
-
-cublasStatus_t CUBLASWINAPI cublasDnrm2_v2(cublasHandle_t handle, int n,
-                                           const double *x, int incx,
-                                           double *result) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, int,
-                                                 const double *, int, double *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDnrm2_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, x, incx, result);
-}
-
-cublasStatus_t CUBLASWINAPI cublasScnrm2_v2(cublasHandle_t handle, int n,
-                                            const cuComplex *x, int incx,
-                                            float *result) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, const cuComplex *, int, float *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasScnrm2_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, x, incx, result);
-}
-
-cublasStatus_t CUBLASWINAPI cublasDznrm2_v2(cublasHandle_t handle, int n,
-                                            const cuDoubleComplex *x, int incx,
-                                            double *result) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, const cuDoubleComplex *, int, double *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDznrm2_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, x, incx, result);
-}
-
-cublasStatus_t CUBLASWINAPI cublasDotEx(cublasHandle_t handle, int n,
-                                        const void *x, cudaDataType xType,
-                                        int incx, const void *y,
-                                        cudaDataType yType, int incy,
-                                        void *result, cudaDataType resultType,
-                                        cudaDataType executionType) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, const void *, cudaDataType, int, const void *,
-      cudaDataType, int, void *, cudaDataType, cudaDataType);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDotEx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, x, xType, incx, y, yType, incy, result, resultType,
-                  executionType);
-}
-
-cublasStatus_t CUBLASWINAPI cublasDotcEx(cublasHandle_t handle, int n,
-                                         const void *x, cudaDataType xType,
-                                         int incx, const void *y,
-                                         cudaDataType yType, int incy,
-                                         void *result, cudaDataType resultType,
-                                         cudaDataType executionType) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, const void *, cudaDataType, int, const void *,
-      cudaDataType, int, void *, cudaDataType, cudaDataType);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDotcEx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, x, xType, incx, y, yType, incy, result, resultType,
-                  executionType);
-}
-
-cublasStatus_t CUBLASWINAPI cublasSdot_v2(cublasHandle_t handle, int n,
-                                          const float *x, int incx,
-                                          const float *y, int incy,
-                                          float *result) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, const float *, int, const float *, int, float *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSdot_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, x, incx, y, incy, result);
-}
-
-cublasStatus_t CUBLASWINAPI cublasDdot_v2(cublasHandle_t handle, int n,
-                                          const double *x, int incx,
-                                          const double *y, int incy,
-                                          double *result) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, const double *, int, const double *, int, double *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDdot_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, x, incx, y, incy, result);
-}
-
-cublasStatus_t CUBLASWINAPI cublasCdotu_v2(cublasHandle_t handle, int n,
-                                           const cuComplex *x, int incx,
-                                           const cuComplex *y, int incy,
-                                           cuComplex *result) {
-  using FuncPtr =
-      cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, int, const cuComplex *,
-                                     int, const cuComplex *, int, cuComplex *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCdotu_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, x, incx, y, incy, result);
-}
-
-cublasStatus_t CUBLASWINAPI cublasCdotc_v2(cublasHandle_t handle, int n,
-                                           const cuComplex *x, int incx,
-                                           const cuComplex *y, int incy,
-                                           cuComplex *result) {
-  using FuncPtr =
-      cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, int, const cuComplex *,
-                                     int, const cuComplex *, int, cuComplex *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCdotc_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, x, incx, y, incy, result);
-}
-
-cublasStatus_t CUBLASWINAPI cublasZdotu_v2(cublasHandle_t handle, int n,
-                                           const cuDoubleComplex *x, int incx,
-                                           const cuDoubleComplex *y, int incy,
-                                           cuDoubleComplex *result) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, const cuDoubleComplex *, int,
-      const cuDoubleComplex *, int, cuDoubleComplex *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZdotu_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, x, incx, y, incy, result);
-}
-
-cublasStatus_t CUBLASWINAPI cublasZdotc_v2(cublasHandle_t handle, int n,
-                                           const cuDoubleComplex *x, int incx,
-                                           const cuDoubleComplex *y, int incy,
-                                           cuDoubleComplex *result) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, const cuDoubleComplex *, int,
-      const cuDoubleComplex *, int, cuDoubleComplex *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZdotc_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, x, incx, y, incy, result);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasScalEx(cublasHandle_t handle, int n,
-             const void *alpha, /* host or device pointer */
-             cudaDataType alphaType, void *x, cudaDataType xType, int incx,
-             cudaDataType executionType) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, const void *, cudaDataType, void *, cudaDataType,
-      int, cudaDataType);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasScalEx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, alpha, alphaType, x, xType, incx, executionType);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasSscal_v2(cublasHandle_t handle, int n,
-               const float *alpha, /* host or device pointer */
-               float *x, int incx) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, int,
-                                                 const float *, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSscal_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, alpha, x, incx);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasDscal_v2(cublasHandle_t handle, int n,
-               const double *alpha, /* host or device pointer */
-               double *x, int incx) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, int,
-                                                 const double *, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDscal_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, alpha, x, incx);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasCscal_v2(cublasHandle_t handle, int n,
-               const cuComplex *alpha, /* host or device pointer */
-               cuComplex *x, int incx) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, const cuComplex *, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCscal_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, alpha, x, incx);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasCsscal_v2(cublasHandle_t handle, int n,
-                const float *alpha, /* host or device pointer */
-                cuComplex *x, int incx) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, const float *, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCsscal_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, alpha, x, incx);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasZscal_v2(cublasHandle_t handle, int n,
-               const cuDoubleComplex *alpha, /* host or device pointer */
-               cuDoubleComplex *x, int incx) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, const cuDoubleComplex *, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZscal_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, alpha, x, incx);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasZdscal_v2(cublasHandle_t handle, int n,
-                const double *alpha, /* host or device pointer */
-                cuDoubleComplex *x, int incx) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, const double *, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZdscal_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, alpha, x, incx);
-}
-
-cublasStatus_t CUBLASWINAPI cublasAxpyEx(
-    cublasHandle_t handle, int n,
-    const void *alpha, /* host or device pointer */
-    cudaDataType alphaType, const void *x, cudaDataType xType, int incx,
-    void *y, cudaDataType yType, int incy, cudaDataType executiontype) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, const void *, cudaDataType, const void *,
-      cudaDataType, int, void *, cudaDataType, int, cudaDataType);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasAxpyEx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, alpha, alphaType, x, xType, incx, y, yType, incy,
-                  executiontype);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasSaxpy_v2(cublasHandle_t handle, int n,
-               const float *alpha, /* host or device pointer */
-               const float *x, int incx, float *y, int incy) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, const float *, const float *, int, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSaxpy_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, alpha, x, incx, y, incy);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasDaxpy_v2(cublasHandle_t handle, int n,
-               const double *alpha, /* host or device pointer */
-               const double *x, int incx, double *y, int incy) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, const double *, const double *, int, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDaxpy_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, alpha, x, incx, y, incy);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasCaxpy_v2(cublasHandle_t handle, int n,
-               const cuComplex *alpha, /* host or device pointer */
-               const cuComplex *x, int incx, cuComplex *y, int incy) {
-  using FuncPtr =
-      cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, int, const cuComplex *,
-                                     const cuComplex *, int, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCaxpy_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, alpha, x, incx, y, incy);
-}
-
-cublasStatus_t CUBLASWINAPI cublasZaxpy_v2(
-    cublasHandle_t handle, int n,
-    const cuDoubleComplex *alpha, /* host or device pointer */
-    const cuDoubleComplex *x, int incx, cuDoubleComplex *y, int incy) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, const cuDoubleComplex *, const cuDoubleComplex *,
-      int, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZaxpy_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, alpha, x, incx, y, incy);
-}
-
-cublasStatus_t CUBLASWINAPI cublasCopyEx(cublasHandle_t handle, int n,
-                                         const void *x, cudaDataType xType,
-                                         int incx, void *y, cudaDataType yType,
-                                         int incy) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, const void *, cudaDataType, int, void *,
-      cudaDataType, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCopyEx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, x, xType, incx, y, yType, incy);
-}
-
-cublasStatus_t CUBLASWINAPI cublasScopy_v2(cublasHandle_t handle, int n,
-                                           const float *x, int incx, float *y,
-                                           int incy) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, const float *, int, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasScopy_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, x, incx, y, incy);
-}
-
-cublasStatus_t CUBLASWINAPI cublasDcopy_v2(cublasHandle_t handle, int n,
-                                           const double *x, int incx, double *y,
-                                           int incy) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, const double *, int, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDcopy_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, x, incx, y, incy);
-}
-
-cublasStatus_t CUBLASWINAPI cublasCcopy_v2(cublasHandle_t handle, int n,
-                                           const cuComplex *x, int incx,
-                                           cuComplex *y, int incy) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, const cuComplex *, int, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCcopy_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, x, incx, y, incy);
-}
-
-cublasStatus_t CUBLASWINAPI cublasZcopy_v2(cublasHandle_t handle, int n,
-                                           const cuDoubleComplex *x, int incx,
-                                           cuDoubleComplex *y, int incy) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, int,
-                                                 const cuDoubleComplex *, int,
-                                                 cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZcopy_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, x, incx, y, incy);
-}
-
-cublasStatus_t CUBLASWINAPI cublasSswap_v2(cublasHandle_t handle, int n,
-                                           float *x, int incx, float *y,
-                                           int incy) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, int, float *,
-                                                 int, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSswap_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, x, incx, y, incy);
-}
-
-cublasStatus_t CUBLASWINAPI cublasDswap_v2(cublasHandle_t handle, int n,
-                                           double *x, int incx, double *y,
-                                           int incy) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, int, double *,
-                                                 int, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDswap_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, x, incx, y, incy);
-}
-
-cublasStatus_t CUBLASWINAPI cublasCswap_v2(cublasHandle_t handle, int n,
-                                           cuComplex *x, int incx, cuComplex *y,
-                                           int incy) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, cuComplex *, int, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCswap_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, x, incx, y, incy);
-}
-
-cublasStatus_t CUBLASWINAPI cublasZswap_v2(cublasHandle_t handle, int n,
-                                           cuDoubleComplex *x, int incx,
-                                           cuDoubleComplex *y, int incy) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, cuDoubleComplex *, int, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZswap_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, x, incx, y, incy);
-}
-
-cublasStatus_t CUBLASWINAPI cublasSwapEx(cublasHandle_t handle, int n, void *x,
-                                         cudaDataType xType, int incx, void *y,
-                                         cudaDataType yType, int incy) {
-  using FuncPtr =
-      cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, int, void *, cudaDataType,
-                                     int, void *, cudaDataType, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSwapEx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, x, xType, incx, y, yType, incy);
-}
-
-cublasStatus_t CUBLASWINAPI cublasIsamax_v2(cublasHandle_t handle, int n,
-                                            const float *x, int incx,
-                                            int *result) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, int,
-                                                 const float *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasIsamax_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, x, incx, result);
-}
-
-cublasStatus_t CUBLASWINAPI cublasIdamax_v2(cublasHandle_t handle, int n,
-                                            const double *x, int incx,
-                                            int *result) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, int,
-                                                 const double *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasIdamax_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, x, incx, result);
-}
-
-cublasStatus_t CUBLASWINAPI cublasIcamax_v2(cublasHandle_t handle, int n,
-                                            const cuComplex *x, int incx,
-                                            int *result) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, int,
-                                                 const cuComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasIcamax_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, x, incx, result);
-}
-
-cublasStatus_t CUBLASWINAPI cublasIzamax_v2(cublasHandle_t handle, int n,
-                                            const cuDoubleComplex *x, int incx,
-                                            int *result) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, const cuDoubleComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasIzamax_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, x, incx, result);
-}
-
-cublasStatus_t CUBLASWINAPI cublasIamaxEx(
-    cublasHandle_t handle, int n, const void *x, cudaDataType xType, int incx,
-    int *result /* host or device pointer */
-) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, const void *, cudaDataType, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasIamaxEx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, x, xType, incx, result);
-}
-
-cublasStatus_t CUBLASWINAPI cublasIsamin_v2(cublasHandle_t handle, int n,
-                                            const float *x, int incx,
-                                            int *result) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, int,
-                                                 const float *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasIsamin_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, x, incx, result);
-}
-
-cublasStatus_t CUBLASWINAPI cublasIdamin_v2(cublasHandle_t handle, int n,
-                                            const double *x, int incx,
-                                            int *result) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, int,
-                                                 const double *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasIdamin_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, x, incx, result);
-}
-
-cublasStatus_t CUBLASWINAPI cublasIcamin_v2(cublasHandle_t handle, int n,
-                                            const cuComplex *x, int incx,
-                                            int *result) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, int,
-                                                 const cuComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasIcamin_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, x, incx, result);
-}
-
-cublasStatus_t CUBLASWINAPI cublasIzamin_v2(cublasHandle_t handle, int n,
-                                            const cuDoubleComplex *x, int incx,
-                                            int *result) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, const cuDoubleComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasIzamin_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, x, incx, result);
-}
-
-cublasStatus_t CUBLASWINAPI cublasIaminEx(
-    cublasHandle_t handle, int n, const void *x, cudaDataType xType, int incx,
-    int *result /* host or device pointer */
-) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, const void *, cudaDataType, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasIaminEx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, x, xType, incx, result);
-}
-
-cublasStatus_t CUBLASWINAPI cublasAsumEx(
-    cublasHandle_t handle, int n, const void *x, cudaDataType xType, int incx,
-    void *result, cudaDataType resultType, /* host or device pointer */
-    cudaDataType executiontype) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, const void *, cudaDataType, int, void *,
-      cudaDataType, cudaDataType);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasAsumEx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, x, xType, incx, result, resultType, executiontype);
-}
-
-cublasStatus_t CUBLASWINAPI cublasSasum_v2(cublasHandle_t handle, int n,
-                                           const float *x, int incx,
-                                           float *result) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, int,
-                                                 const float *, int, float *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSasum_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, x, incx, result);
-}
-
-cublasStatus_t CUBLASWINAPI cublasDasum_v2(cublasHandle_t handle, int n,
-                                           const double *x, int incx,
-                                           double *result) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, int,
-                                                 const double *, int, double *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDasum_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, x, incx, result);
-}
-
-cublasStatus_t CUBLASWINAPI cublasScasum_v2(cublasHandle_t handle, int n,
-                                            const cuComplex *x, int incx,
-                                            float *result) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, const cuComplex *, int, float *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasScasum_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, x, incx, result);
-}
-
-cublasStatus_t CUBLASWINAPI cublasDzasum_v2(cublasHandle_t handle, int n,
-                                            const cuDoubleComplex *x, int incx,
-                                            double *result) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, const cuDoubleComplex *, int, double *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDzasum_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, x, incx, result);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasSrot_v2(cublasHandle_t handle, int n, float *x, int incx, float *y,
-              int incy, const float *c, /* host or device pointer */
-              const float *s) {
-  using FuncPtr =
-      cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, int, float *, int, float *,
-                                     int, const float *, const float *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSrot_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, x, incx, y, incy, c, s);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasDrot_v2(cublasHandle_t handle, int n, double *x, int incx, double *y,
-              int incy, const double *c, /* host or device pointer */
-              const double *s) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, double *, int, double *, int, const double *,
-      const double *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDrot_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, x, incx, y, incy, c, s);
-}
-
-cublasStatus_t CUBLASWINAPI cublasCrot_v2(
-    cublasHandle_t handle, int n, cuComplex *x, int incx, cuComplex *y,
-    int incy, const float *c, /* host or device pointer */
-    const cuComplex *s) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, cuComplex *, int, cuComplex *, int, const float *,
-      const cuComplex *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCrot_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, x, incx, y, incy, c, s);
-}
-
-cublasStatus_t CUBLASWINAPI cublasCsrot_v2(
-    cublasHandle_t handle, int n, cuComplex *x, int incx, cuComplex *y,
-    int incy, const float *c, /* host or device pointer */
-    const float *s) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, cuComplex *, int, cuComplex *, int, const float *,
-      const float *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCsrot_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, x, incx, y, incy, c, s);
-}
-
-cublasStatus_t CUBLASWINAPI cublasZrot_v2(
-    cublasHandle_t handle, int n, cuDoubleComplex *x, int incx,
-    cuDoubleComplex *y, int incy, const double *c, /* host or device pointer */
-    const cuDoubleComplex *s) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, cuDoubleComplex *, int, cuDoubleComplex *, int,
-      const double *, const cuDoubleComplex *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZrot_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, x, incx, y, incy, c, s);
-}
-
-cublasStatus_t CUBLASWINAPI cublasZdrot_v2(
-    cublasHandle_t handle, int n, cuDoubleComplex *x, int incx,
-    cuDoubleComplex *y, int incy, const double *c, /* host or device pointer */
-    const double *s) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, cuDoubleComplex *, int, cuDoubleComplex *, int,
-      const double *, const double *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZdrot_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, x, incx, y, incy, c, s);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasRotEx(cublasHandle_t handle, int n, void *x, cudaDataType xType, int incx,
-            void *y, cudaDataType yType, int incy,
-            const void *c, /* host or device pointer */
-            const void *s, cudaDataType csType, cudaDataType executiontype) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, void *, cudaDataType, int, void *, cudaDataType, int,
-      const void *, const void *, cudaDataType, cudaDataType);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasRotEx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, x, xType, incx, y, yType, incy, c, s, csType,
-                  executiontype);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasSrotg_v2(cublasHandle_t handle, float *a, /* host or device pointer */
-               float *b,                        /* host or device pointer */
-               float *c,                        /* host or device pointer */
-               float *s) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, float *,
-                                                 float *, float *, float *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSrotg_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, a, b, c, s);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasDrotg_v2(cublasHandle_t handle, double *a, /* host or device pointer */
-               double *b,                        /* host or device pointer */
-               double *c,                        /* host or device pointer */
-               double *s) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, double *,
-                                                 double *, double *, double *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDrotg_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, a, b, c, s);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasCrotg_v2(cublasHandle_t handle, cuComplex *a, /* host or device pointer */
-               cuComplex *b,                        /* host or device pointer */
-               float *c,                            /* host or device pointer */
-               cuComplex *s) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cuComplex *, cuComplex *, float *, cuComplex *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCrotg_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, a, b, c, s);
-}
-
-cublasStatus_t CUBLASWINAPI cublasZrotg_v2(
-    cublasHandle_t handle, cuDoubleComplex *a, /* host or device pointer */
-    cuDoubleComplex *b,                        /* host or device pointer */
-    double *c,                                 /* host or device pointer */
-    cuDoubleComplex *s) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cuDoubleComplex *, cuDoubleComplex *, double *,
-      cuDoubleComplex *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZrotg_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, a, b, c, s);
-}
-
-cublasStatus_t CUBLASWINAPI cublasRotgEx(cublasHandle_t handle,
-                                         void *a, /* host or device pointer */
-                                         void *b, /* host or device pointer */
-                                         cudaDataType abType,
-                                         void *c, /* host or device pointer */
-                                         void *s, /* host or device pointer */
-                                         cudaDataType csType,
-                                         cudaDataType executiontype) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, void *, void *,
-                                                 cudaDataType, void *, void *,
-                                                 cudaDataType, cudaDataType);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasRotgEx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, a, b, abType, c, s, csType, executiontype);
-}
-
-cublasStatus_t CUBLASWINAPI cublasSrotm_v2(cublasHandle_t handle, int n,
-                                           float *x, int incx, float *y,
-                                           int incy, const float *param) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, float *, int, float *, int, const float *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSrotm_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, x, incx, y, incy, param);
-}
-
-cublasStatus_t CUBLASWINAPI cublasDrotm_v2(cublasHandle_t handle, int n,
-                                           double *x, int incx, double *y,
-                                           int incy, const double *param) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, double *, int, double *, int, const double *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDrotm_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, x, incx, y, incy, param);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasRotmEx(cublasHandle_t handle, int n, void *x, cudaDataType xType,
-             int incx, void *y, cudaDataType yType, int incy,
-             const void *param, /* host or device pointer */
-             cudaDataType paramType, cudaDataType executiontype) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, void *, cudaDataType, int, void *, cudaDataType, int,
-      const void *, cudaDataType, cudaDataType);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasRotmEx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, x, xType, incx, y, yType, incy, param, paramType,
-                  executiontype);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasSrotmg_v2(cublasHandle_t handle, float *d1, /* host or device pointer */
-                float *d2,                        /* host or device pointer */
-                float *x1,                        /* host or device pointer */
-                const float *y1,                  /* host or device pointer */
-                float *param) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, float *, float *, float *, const float *, float *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSrotmg_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, d1, d2, x1, y1, param);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasDrotmg_v2(cublasHandle_t handle, double *d1, /* host or device pointer */
-                double *d2,                        /* host or device pointer */
-                double *x1,                        /* host or device pointer */
-                const double *y1,                  /* host or device pointer */
-                double *param) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, double *, double *, double *, const double *, double *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDrotmg_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, d1, d2, x1, y1, param);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasRotmgEx(cublasHandle_t handle, void *d1,     /* host or device pointer */
-              cudaDataType d1Type, void *d2,       /* host or device pointer */
-              cudaDataType d2Type, void *x1,       /* host or device pointer */
-              cudaDataType x1Type, const void *y1, /* host or device pointer */
-              cudaDataType y1Type, void *param,    /* host or device pointer */
-              cudaDataType paramType, cudaDataType executiontype) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, void *, cudaDataType, void *, cudaDataType, void *,
-      cudaDataType, const void *, cudaDataType, void *, cudaDataType,
-      cudaDataType);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasRotmgEx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, d1, d1Type, d2, d2Type, x1, x1Type, y1, y1Type, param,
-                  paramType, executiontype);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasSgemv_v2(cublasHandle_t handle, cublasOperation_t trans, int m, int n,
-               const float *alpha, /* host or device pointer */
-               const float *A, int lda, const float *x, int incx,
-               const float *beta, /* host or device pointer */
-               float *y, int incy) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasOperation_t, int, int, const float *, const float *,
-      int, const float *, int, const float *, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSgemv_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, trans, m, n, alpha, A, lda, x, incx, beta, y, incy);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasDgemv_v2(cublasHandle_t handle, cublasOperation_t trans, int m, int n,
-               const double *alpha, /* host or device pointer */
-               const double *A, int lda, const double *x, int incx,
-               const double *beta, /* host or device pointer */
-               double *y, int incy) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasOperation_t, int, int, const double *,
-      const double *, int, const double *, int, const double *, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDgemv_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, trans, m, n, alpha, A, lda, x, incx, beta, y, incy);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasCgemv_v2(cublasHandle_t handle, cublasOperation_t trans, int m, int n,
-               const cuComplex *alpha, /* host or device pointer */
-               const cuComplex *A, int lda, const cuComplex *x, int incx,
-               const cuComplex *beta, /* host or device pointer */
-               cuComplex *y, int incy) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasOperation_t, int, int, const cuComplex *,
-      const cuComplex *, int, const cuComplex *, int, const cuComplex *,
-      cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCgemv_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, trans, m, n, alpha, A, lda, x, incx, beta, y, incy);
-}
-
-cublasStatus_t CUBLASWINAPI cublasZgemv_v2(
-    cublasHandle_t handle, cublasOperation_t trans, int m, int n,
-    const cuDoubleComplex *alpha, /* host or device pointer */
-    const cuDoubleComplex *A, int lda, const cuDoubleComplex *x, int incx,
-    const cuDoubleComplex *beta, /* host or device pointer */
-    cuDoubleComplex *y, int incy) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasOperation_t, int, int, const cuDoubleComplex *,
-      const cuDoubleComplex *, int, const cuDoubleComplex *, int,
-      const cuDoubleComplex *, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZgemv_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, trans, m, n, alpha, A, lda, x, incx, beta, y, incy);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasSgbmv_v2(cublasHandle_t handle, cublasOperation_t trans, int m, int n,
-               int kl, int ku, const float *alpha, /* host or device pointer */
-               const float *A, int lda, const float *x, int incx,
-               const float *beta, /* host or device pointer */
-               float *y, int incy) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasOperation_t, int, int, int, int, const float *,
-      const float *, int, const float *, int, const float *, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSgbmv_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, trans, m, n, kl, ku, alpha, A, lda, x, incx, beta, y,
-                  incy);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasDgbmv_v2(cublasHandle_t handle, cublasOperation_t trans, int m, int n,
-               int kl, int ku, const double *alpha, /* host or device pointer */
-               const double *A, int lda, const double *x, int incx,
-               const double *beta, /* host or device pointer */
-               double *y, int incy) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasOperation_t, int, int, int, int, const double *,
-      const double *, int, const double *, int, const double *, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDgbmv_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, trans, m, n, kl, ku, alpha, A, lda, x, incx, beta, y,
-                  incy);
-}
-
-cublasStatus_t CUBLASWINAPI cublasCgbmv_v2(
-    cublasHandle_t handle, cublasOperation_t trans, int m, int n, int kl,
-    int ku, const cuComplex *alpha, /* host or device pointer */
-    const cuComplex *A, int lda, const cuComplex *x, int incx,
-    const cuComplex *beta, /* host or device pointer */
-    cuComplex *y, int incy) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasOperation_t, int, int, int, int, const cuComplex *,
-      const cuComplex *, int, const cuComplex *, int, const cuComplex *,
-      cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCgbmv_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, trans, m, n, kl, ku, alpha, A, lda, x, incx, beta, y,
-                  incy);
-}
-
-cublasStatus_t CUBLASWINAPI cublasZgbmv_v2(
-    cublasHandle_t handle, cublasOperation_t trans, int m, int n, int kl,
-    int ku, const cuDoubleComplex *alpha, /* host or device pointer */
-    const cuDoubleComplex *A, int lda, const cuDoubleComplex *x, int incx,
-    const cuDoubleComplex *beta, /* host or device pointer */
-    cuDoubleComplex *y, int incy) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasOperation_t, int, int, int, int,
-      const cuDoubleComplex *, const cuDoubleComplex *, int,
-      const cuDoubleComplex *, int, const cuDoubleComplex *, cuDoubleComplex *,
-      int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZgbmv_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, trans, m, n, kl, ku, alpha, A, lda, x, incx, beta, y,
-                  incy);
-}
-
-cublasStatus_t CUBLASWINAPI cublasStrmv_v2(
-    cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans,
-    cublasDiagType_t diag, int n, const float *A, int lda, float *x, int incx) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t,
-      int, const float *, int, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasStrmv_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, trans, diag, n, A, lda, x, incx);
-}
-
-cublasStatus_t CUBLASWINAPI cublasDtrmv_v2(cublasHandle_t handle,
-                                           cublasFillMode_t uplo,
-                                           cublasOperation_t trans,
-                                           cublasDiagType_t diag, int n,
-                                           const double *A, int lda, double *x,
-                                           int incx) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t,
-      int, const double *, int, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDtrmv_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, trans, diag, n, A, lda, x, incx);
-}
-
-cublasStatus_t CUBLASWINAPI cublasCtrmv_v2(cublasHandle_t handle,
-                                           cublasFillMode_t uplo,
-                                           cublasOperation_t trans,
-                                           cublasDiagType_t diag, int n,
-                                           const cuComplex *A, int lda,
-                                           cuComplex *x, int incx) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t,
-      int, const cuComplex *, int, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCtrmv_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, trans, diag, n, A, lda, x, incx);
-}
-
-cublasStatus_t CUBLASWINAPI cublasZtrmv_v2(cublasHandle_t handle,
-                                           cublasFillMode_t uplo,
-                                           cublasOperation_t trans,
-                                           cublasDiagType_t diag, int n,
-                                           const cuDoubleComplex *A, int lda,
-                                           cuDoubleComplex *x, int incx) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t,
-      int, const cuDoubleComplex *, int, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZtrmv_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, trans, diag, n, A, lda, x, incx);
-}
-
-cublasStatus_t CUBLASWINAPI cublasStbmv_v2(cublasHandle_t handle,
-                                           cublasFillMode_t uplo,
-                                           cublasOperation_t trans,
-                                           cublasDiagType_t diag, int n, int k,
-                                           const float *A, int lda, float *x,
-                                           int incx) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t,
-      int, int, const float *, int, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasStbmv_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, trans, diag, n, k, A, lda, x, incx);
-}
-
-cublasStatus_t CUBLASWINAPI cublasDtbmv_v2(cublasHandle_t handle,
-                                           cublasFillMode_t uplo,
-                                           cublasOperation_t trans,
-                                           cublasDiagType_t diag, int n, int k,
-                                           const double *A, int lda, double *x,
-                                           int incx) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t,
-      int, int, const double *, int, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDtbmv_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, trans, diag, n, k, A, lda, x, incx);
-}
-
-cublasStatus_t CUBLASWINAPI cublasCtbmv_v2(cublasHandle_t handle,
-                                           cublasFillMode_t uplo,
-                                           cublasOperation_t trans,
-                                           cublasDiagType_t diag, int n, int k,
-                                           const cuComplex *A, int lda,
-                                           cuComplex *x, int incx) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t,
-      int, int, const cuComplex *, int, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCtbmv_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, trans, diag, n, k, A, lda, x, incx);
-}
-
-cublasStatus_t CUBLASWINAPI cublasZtbmv_v2(cublasHandle_t handle,
-                                           cublasFillMode_t uplo,
-                                           cublasOperation_t trans,
-                                           cublasDiagType_t diag, int n, int k,
-                                           const cuDoubleComplex *A, int lda,
-                                           cuDoubleComplex *x, int incx) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t,
-      int, int, const cuDoubleComplex *, int, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZtbmv_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, trans, diag, n, k, A, lda, x, incx);
-}
-
-cublasStatus_t CUBLASWINAPI cublasStpmv_v2(
-    cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans,
-    cublasDiagType_t diag, int n, const float *AP, float *x, int incx) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t,
-      int, const float *, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasStpmv_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, trans, diag, n, AP, x, incx);
-}
-
-cublasStatus_t CUBLASWINAPI cublasDtpmv_v2(
-    cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans,
-    cublasDiagType_t diag, int n, const double *AP, double *x, int incx) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t,
-      int, const double *, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDtpmv_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, trans, diag, n, AP, x, incx);
-}
-
-cublasStatus_t CUBLASWINAPI cublasCtpmv_v2(
-    cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans,
-    cublasDiagType_t diag, int n, const cuComplex *AP, cuComplex *x, int incx) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t,
-      int, const cuComplex *, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCtpmv_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, trans, diag, n, AP, x, incx);
-}
-
-cublasStatus_t CUBLASWINAPI cublasZtpmv_v2(cublasHandle_t handle,
-                                           cublasFillMode_t uplo,
-                                           cublasOperation_t trans,
-                                           cublasDiagType_t diag, int n,
-                                           const cuDoubleComplex *AP,
-                                           cuDoubleComplex *x, int incx) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t,
-      int, const cuDoubleComplex *, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZtpmv_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, trans, diag, n, AP, x, incx);
-}
-
-cublasStatus_t CUBLASWINAPI cublasStrsv_v2(
-    cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans,
-    cublasDiagType_t diag, int n, const float *A, int lda, float *x, int incx) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t,
-      int, const float *, int, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasStrsv_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, trans, diag, n, A, lda, x, incx);
-}
-
-cublasStatus_t CUBLASWINAPI cublasDtrsv_v2(cublasHandle_t handle,
-                                           cublasFillMode_t uplo,
-                                           cublasOperation_t trans,
-                                           cublasDiagType_t diag, int n,
-                                           const double *A, int lda, double *x,
-                                           int incx) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t,
-      int, const double *, int, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDtrsv_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, trans, diag, n, A, lda, x, incx);
-}
-
-cublasStatus_t CUBLASWINAPI cublasCtrsv_v2(cublasHandle_t handle,
-                                           cublasFillMode_t uplo,
-                                           cublasOperation_t trans,
-                                           cublasDiagType_t diag, int n,
-                                           const cuComplex *A, int lda,
-                                           cuComplex *x, int incx) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t,
-      int, const cuComplex *, int, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCtrsv_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, trans, diag, n, A, lda, x, incx);
-}
-
-cublasStatus_t CUBLASWINAPI cublasZtrsv_v2(cublasHandle_t handle,
-                                           cublasFillMode_t uplo,
-                                           cublasOperation_t trans,
-                                           cublasDiagType_t diag, int n,
-                                           const cuDoubleComplex *A, int lda,
-                                           cuDoubleComplex *x, int incx) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t,
-      int, const cuDoubleComplex *, int, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZtrsv_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, trans, diag, n, A, lda, x, incx);
-}
-
-cublasStatus_t CUBLASWINAPI cublasStpsv_v2(
-    cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans,
-    cublasDiagType_t diag, int n, const float *AP, float *x, int incx) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t,
-      int, const float *, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasStpsv_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, trans, diag, n, AP, x, incx);
-}
-
-cublasStatus_t CUBLASWINAPI cublasDtpsv_v2(
-    cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans,
-    cublasDiagType_t diag, int n, const double *AP, double *x, int incx) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t,
-      int, const double *, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDtpsv_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, trans, diag, n, AP, x, incx);
-}
-
-cublasStatus_t CUBLASWINAPI cublasCtpsv_v2(
-    cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans,
-    cublasDiagType_t diag, int n, const cuComplex *AP, cuComplex *x, int incx) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t,
-      int, const cuComplex *, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCtpsv_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, trans, diag, n, AP, x, incx);
-}
-
-cublasStatus_t CUBLASWINAPI cublasZtpsv_v2(cublasHandle_t handle,
-                                           cublasFillMode_t uplo,
-                                           cublasOperation_t trans,
-                                           cublasDiagType_t diag, int n,
-                                           const cuDoubleComplex *AP,
-                                           cuDoubleComplex *x, int incx) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t,
-      int, const cuDoubleComplex *, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZtpsv_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, trans, diag, n, AP, x, incx);
-}
-
-cublasStatus_t CUBLASWINAPI cublasStbsv_v2(cublasHandle_t handle,
-                                           cublasFillMode_t uplo,
-                                           cublasOperation_t trans,
-                                           cublasDiagType_t diag, int n, int k,
-                                           const float *A, int lda, float *x,
-                                           int incx) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t,
-      int, int, const float *, int, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasStbsv_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, trans, diag, n, k, A, lda, x, incx);
-}
-
-cublasStatus_t CUBLASWINAPI cublasDtbsv_v2(cublasHandle_t handle,
-                                           cublasFillMode_t uplo,
-                                           cublasOperation_t trans,
-                                           cublasDiagType_t diag, int n, int k,
-                                           const double *A, int lda, double *x,
-                                           int incx) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t,
-      int, int, const double *, int, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDtbsv_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, trans, diag, n, k, A, lda, x, incx);
-}
-
-cublasStatus_t CUBLASWINAPI cublasCtbsv_v2(cublasHandle_t handle,
-                                           cublasFillMode_t uplo,
-                                           cublasOperation_t trans,
-                                           cublasDiagType_t diag, int n, int k,
-                                           const cuComplex *A, int lda,
-                                           cuComplex *x, int incx) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t,
-      int, int, const cuComplex *, int, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCtbsv_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, trans, diag, n, k, A, lda, x, incx);
-}
-
-cublasStatus_t CUBLASWINAPI cublasZtbsv_v2(cublasHandle_t handle,
-                                           cublasFillMode_t uplo,
-                                           cublasOperation_t trans,
-                                           cublasDiagType_t diag, int n, int k,
-                                           const cuDoubleComplex *A, int lda,
-                                           cuDoubleComplex *x, int incx) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t,
-      int, int, const cuDoubleComplex *, int, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZtbsv_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, trans, diag, n, k, A, lda, x, incx);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasSsymv_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n,
-               const float *alpha, /* host or device pointer */
-               const float *A, int lda, const float *x, int incx,
-               const float *beta, /* host or device pointer */
-               float *y, int incy) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, int, const float *, const float *, int,
-      const float *, int, const float *, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSsymv_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, alpha, A, lda, x, incx, beta, y, incy);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasDsymv_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n,
-               const double *alpha, /* host or device pointer */
-               const double *A, int lda, const double *x, int incx,
-               const double *beta, /* host or device pointer */
-               double *y, int incy) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, int, const double *, const double *,
-      int, const double *, int, const double *, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDsymv_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, alpha, A, lda, x, incx, beta, y, incy);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasCsymv_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n,
-               const cuComplex *alpha, /* host or device pointer */
-               const cuComplex *A, int lda, const cuComplex *x, int incx,
-               const cuComplex *beta, /* host or device pointer */
-               cuComplex *y, int incy) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, int, const cuComplex *,
-      const cuComplex *, int, const cuComplex *, int, const cuComplex *,
-      cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCsymv_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, alpha, A, lda, x, incx, beta, y, incy);
-}
-
-cublasStatus_t CUBLASWINAPI cublasZsymv_v2(
-    cublasHandle_t handle, cublasFillMode_t uplo, int n,
-    const cuDoubleComplex *alpha, /* host or device pointer */
-    const cuDoubleComplex *A, int lda, const cuDoubleComplex *x, int incx,
-    const cuDoubleComplex *beta, /* host or device pointer */
-    cuDoubleComplex *y, int incy) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, int, const cuDoubleComplex *,
-      const cuDoubleComplex *, int, const cuDoubleComplex *, int,
-      const cuDoubleComplex *, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZsymv_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, alpha, A, lda, x, incx, beta, y, incy);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasChemv_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n,
-               const cuComplex *alpha, /* host or device pointer */
-               const cuComplex *A, int lda, const cuComplex *x, int incx,
-               const cuComplex *beta, /* host or device pointer */
-               cuComplex *y, int incy) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, int, const cuComplex *,
-      const cuComplex *, int, const cuComplex *, int, const cuComplex *,
-      cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasChemv_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, alpha, A, lda, x, incx, beta, y, incy);
-}
-
-cublasStatus_t CUBLASWINAPI cublasZhemv_v2(
-    cublasHandle_t handle, cublasFillMode_t uplo, int n,
-    const cuDoubleComplex *alpha, /* host or device pointer */
-    const cuDoubleComplex *A, int lda, const cuDoubleComplex *x, int incx,
-    const cuDoubleComplex *beta, /* host or device pointer */
-    cuDoubleComplex *y, int incy) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, int, const cuDoubleComplex *,
-      const cuDoubleComplex *, int, const cuDoubleComplex *, int,
-      const cuDoubleComplex *, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZhemv_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, alpha, A, lda, x, incx, beta, y, incy);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasSsbmv_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n, int k,
-               const float *alpha, /* host or device pointer */
-               const float *A, int lda, const float *x, int incx,
-               const float *beta, /* host or device pointer */
-               float *y, int incy) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, int, int, const float *, const float *,
-      int, const float *, int, const float *, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSsbmv_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, k, alpha, A, lda, x, incx, beta, y, incy);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasDsbmv_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n, int k,
-               const double *alpha, /* host or device pointer */
-               const double *A, int lda, const double *x, int incx,
-               const double *beta, /* host or device pointer */
-               double *y, int incy) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, int, int, const double *,
-      const double *, int, const double *, int, const double *, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDsbmv_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, k, alpha, A, lda, x, incx, beta, y, incy);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasChbmv_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n, int k,
-               const cuComplex *alpha, /* host or device pointer */
-               const cuComplex *A, int lda, const cuComplex *x, int incx,
-               const cuComplex *beta, /* host or device pointer */
-               cuComplex *y, int incy) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, int, int, const cuComplex *,
-      const cuComplex *, int, const cuComplex *, int, const cuComplex *,
-      cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasChbmv_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, k, alpha, A, lda, x, incx, beta, y, incy);
-}
-
-cublasStatus_t CUBLASWINAPI cublasZhbmv_v2(
-    cublasHandle_t handle, cublasFillMode_t uplo, int n, int k,
-    const cuDoubleComplex *alpha, /* host or device pointer */
-    const cuDoubleComplex *A, int lda, const cuDoubleComplex *x, int incx,
-    const cuDoubleComplex *beta, /* host or device pointer */
-    cuDoubleComplex *y, int incy) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, int, int, const cuDoubleComplex *,
-      const cuDoubleComplex *, int, const cuDoubleComplex *, int,
-      const cuDoubleComplex *, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZhbmv_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, k, alpha, A, lda, x, incx, beta, y, incy);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasSspmv_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n,
-               const float *alpha, /* host or device pointer */
-               const float *AP, const float *x, int incx,
-               const float *beta, /* host or device pointer */
-               float *y, int incy) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, int, const float *, const float *,
-      const float *, int, const float *, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSspmv_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, alpha, AP, x, incx, beta, y, incy);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasDspmv_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n,
-               const double *alpha, /* host or device pointer */
-               const double *AP, const double *x, int incx,
-               const double *beta, /* host or device pointer */
-               double *y, int incy) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, int, const double *, const double *,
-      const double *, int, const double *, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDspmv_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, alpha, AP, x, incx, beta, y, incy);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasChpmv_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n,
-               const cuComplex *alpha, /* host or device pointer */
-               const cuComplex *AP, const cuComplex *x, int incx,
-               const cuComplex *beta, /* host or device pointer */
-               cuComplex *y, int incy) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, int, const cuComplex *,
-      const cuComplex *, const cuComplex *, int, const cuComplex *, cuComplex *,
-      int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasChpmv_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, alpha, AP, x, incx, beta, y, incy);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasZhpmv_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n,
-               const cuDoubleComplex *alpha, /* host or device pointer */
-               const cuDoubleComplex *AP, const cuDoubleComplex *x, int incx,
-               const cuDoubleComplex *beta, /* host or device pointer */
-               cuDoubleComplex *y, int incy) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, int, const cuDoubleComplex *,
-      const cuDoubleComplex *, const cuDoubleComplex *, int,
-      const cuDoubleComplex *, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZhpmv_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, alpha, AP, x, incx, beta, y, incy);
-}
-
-cublasStatus_t CUBLASWINAPI cublasSger_v2(
-    cublasHandle_t handle, int m, int n,
-    const float *alpha, /* host or device pointer */
-    const float *x, int incx, const float *y, int incy, float *A, int lda) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, int, const float *, const float *, int,
-      const float *, int, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSger_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, alpha, x, incx, y, incy, A, lda);
-}
-
-cublasStatus_t CUBLASWINAPI cublasDger_v2(
-    cublasHandle_t handle, int m, int n,
-    const double *alpha, /* host or device pointer */
-    const double *x, int incx, const double *y, int incy, double *A, int lda) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, int, const double *, const double *, int,
-      const double *, int, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDger_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, alpha, x, incx, y, incy, A, lda);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasCgeru_v2(cublasHandle_t handle, int m, int n,
-               const cuComplex *alpha, /* host or device pointer */
-               const cuComplex *x, int incx, const cuComplex *y, int incy,
-               cuComplex *A, int lda) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, int, const cuComplex *, const cuComplex *, int,
-      const cuComplex *, int, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCgeru_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, alpha, x, incx, y, incy, A, lda);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasCgerc_v2(cublasHandle_t handle, int m, int n,
-               const cuComplex *alpha, /* host or device pointer */
-               const cuComplex *x, int incx, const cuComplex *y, int incy,
-               cuComplex *A, int lda) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, int, const cuComplex *, const cuComplex *, int,
-      const cuComplex *, int, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCgerc_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, alpha, x, incx, y, incy, A, lda);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasZgeru_v2(cublasHandle_t handle, int m, int n,
-               const cuDoubleComplex *alpha, /* host or device pointer */
-               const cuDoubleComplex *x, int incx, const cuDoubleComplex *y,
-               int incy, cuDoubleComplex *A, int lda) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, int, const cuDoubleComplex *,
-      const cuDoubleComplex *, int, const cuDoubleComplex *, int,
-      cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZgeru_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, alpha, x, incx, y, incy, A, lda);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasZgerc_v2(cublasHandle_t handle, int m, int n,
-               const cuDoubleComplex *alpha, /* host or device pointer */
-               const cuDoubleComplex *x, int incx, const cuDoubleComplex *y,
-               int incy, cuDoubleComplex *A, int lda) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, int, const cuDoubleComplex *,
-      const cuDoubleComplex *, int, const cuDoubleComplex *, int,
-      cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZgerc_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, alpha, x, incx, y, incy, A, lda);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasSsyr_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n,
-              const float *alpha, /* host or device pointer */
-              const float *x, int incx, float *A, int lda) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, int, const float *, const float *, int,
-      float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSsyr_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, alpha, x, incx, A, lda);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasDsyr_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n,
-              const double *alpha, /* host or device pointer */
-              const double *x, int incx, double *A, int lda) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, int, const double *, const double *,
-      int, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDsyr_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, alpha, x, incx, A, lda);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasCsyr_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n,
-              const cuComplex *alpha, /* host or device pointer */
-              const cuComplex *x, int incx, cuComplex *A, int lda) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, int, const cuComplex *,
-      const cuComplex *, int, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCsyr_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, alpha, x, incx, A, lda);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasZsyr_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n,
-              const cuDoubleComplex *alpha, /* host or device pointer */
-              const cuDoubleComplex *x, int incx, cuDoubleComplex *A, int lda) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, int, const cuDoubleComplex *,
-      const cuDoubleComplex *, int, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZsyr_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, alpha, x, incx, A, lda);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasCher_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n,
-              const float *alpha, /* host or device pointer */
-              const cuComplex *x, int incx, cuComplex *A, int lda) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, int, const float *, const cuComplex *,
-      int, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCher_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, alpha, x, incx, A, lda);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasZher_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n,
-              const double *alpha, /* host or device pointer */
-              const cuDoubleComplex *x, int incx, cuDoubleComplex *A, int lda) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, int, const double *,
-      const cuDoubleComplex *, int, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZher_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, alpha, x, incx, A, lda);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasSspr_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n,
-              const float *alpha, /* host or device pointer */
-              const float *x, int incx, float *AP) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, int, const float *, const float *, int,
-      float *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSspr_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, alpha, x, incx, AP);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasDspr_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n,
-              const double *alpha, /* host or device pointer */
-              const double *x, int incx, double *AP) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, int, const double *, const double *,
-      int, double *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDspr_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, alpha, x, incx, AP);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasChpr_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n,
-              const float *alpha, /* host or device pointer */
-              const cuComplex *x, int incx, cuComplex *AP) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, int, const float *, const cuComplex *,
-      int, cuComplex *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasChpr_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, alpha, x, incx, AP);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasZhpr_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n,
-              const double *alpha, /* host or device pointer */
-              const cuDoubleComplex *x, int incx, cuDoubleComplex *AP) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, int, const double *,
-      const cuDoubleComplex *, int, cuDoubleComplex *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZhpr_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, alpha, x, incx, AP);
-}
-
-cublasStatus_t CUBLASWINAPI cublasSsyr2_v2(
-    cublasHandle_t handle, cublasFillMode_t uplo, int n,
-    const float *alpha, /* host or device pointer */
-    const float *x, int incx, const float *y, int incy, float *A, int lda) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, int, const float *, const float *, int,
-      const float *, int, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSsyr2_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, alpha, x, incx, y, incy, A, lda);
-}
-
-cublasStatus_t CUBLASWINAPI cublasDsyr2_v2(
-    cublasHandle_t handle, cublasFillMode_t uplo, int n,
-    const double *alpha, /* host or device pointer */
-    const double *x, int incx, const double *y, int incy, double *A, int lda) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, int, const double *, const double *,
-      int, const double *, int, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDsyr2_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, alpha, x, incx, y, incy, A, lda);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasCsyr2_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n,
-               const cuComplex *alpha, /* host or device pointer */
-               const cuComplex *x, int incx, const cuComplex *y, int incy,
-               cuComplex *A, int lda) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, int, const cuComplex *,
-      const cuComplex *, int, const cuComplex *, int, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCsyr2_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, alpha, x, incx, y, incy, A, lda);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasZsyr2_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n,
-               const cuDoubleComplex *alpha, /* host or device pointer */
-               const cuDoubleComplex *x, int incx, const cuDoubleComplex *y,
-               int incy, cuDoubleComplex *A, int lda) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, int, const cuDoubleComplex *,
-      const cuDoubleComplex *, int, const cuDoubleComplex *, int,
-      cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZsyr2_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, alpha, x, incx, y, incy, A, lda);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasCher2_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n,
-               const cuComplex *alpha, /* host or device pointer */
-               const cuComplex *x, int incx, const cuComplex *y, int incy,
-               cuComplex *A, int lda) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, int, const cuComplex *,
-      const cuComplex *, int, const cuComplex *, int, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCher2_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, alpha, x, incx, y, incy, A, lda);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasZher2_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n,
-               const cuDoubleComplex *alpha, /* host or device pointer */
-               const cuDoubleComplex *x, int incx, const cuDoubleComplex *y,
-               int incy, cuDoubleComplex *A, int lda) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, int, const cuDoubleComplex *,
-      const cuDoubleComplex *, int, const cuDoubleComplex *, int,
-      cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZher2_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, alpha, x, incx, y, incy, A, lda);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasSspr2_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n,
-               const float *alpha, /* host or device pointer */
-               const float *x, int incx, const float *y, int incy, float *AP) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, int, const float *, const float *, int,
-      const float *, int, float *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSspr2_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, alpha, x, incx, y, incy, AP);
-}
-
-cublasStatus_t CUBLASWINAPI cublasDspr2_v2(
-    cublasHandle_t handle, cublasFillMode_t uplo, int n,
-    const double *alpha, /* host or device pointer */
-    const double *x, int incx, const double *y, int incy, double *AP) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, int, const double *, const double *,
-      int, const double *, int, double *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDspr2_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, alpha, x, incx, y, incy, AP);
-}
-
-cublasStatus_t CUBLASWINAPI cublasChpr2_v2(
-    cublasHandle_t handle, cublasFillMode_t uplo, int n,
-    const cuComplex *alpha, /* host or device pointer */
-    const cuComplex *x, int incx, const cuComplex *y, int incy, cuComplex *AP) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, int, const cuComplex *,
-      const cuComplex *, int, const cuComplex *, int, cuComplex *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasChpr2_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, alpha, x, incx, y, incy, AP);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasZhpr2_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n,
-               const cuDoubleComplex *alpha, /* host or device pointer */
-               const cuDoubleComplex *x, int incx, const cuDoubleComplex *y,
-               int incy, cuDoubleComplex *AP) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, int, const cuDoubleComplex *,
-      const cuDoubleComplex *, int, const cuDoubleComplex *, int,
-      cuDoubleComplex *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZhpr2_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, alpha, x, incx, y, incy, AP);
-}
-
-cublasStatus_t CUBLASWINAPI cublasSgemm_v2(
-    cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
-    int m, int n, int k, const float *alpha, /* host or device pointer */
-    const float *A, int lda, const float *B, int ldb,
-    const float *beta, /* host or device pointer */
-    float *C, int ldc) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int,
-      const float *, const float *, int, const float *, int, const float *,
-      float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSgemm_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta,
-                  C, ldc);
-}
-
-cublasStatus_t CUBLASWINAPI cublasDgemm_v2(
-    cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
-    int m, int n, int k, const double *alpha, /* host or device pointer */
-    const double *A, int lda, const double *B, int ldb,
-    const double *beta, /* host or device pointer */
-    double *C, int ldc) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int,
-      const double *, const double *, int, const double *, int, const double *,
-      double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDgemm_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta,
-                  C, ldc);
-}
-
-cublasStatus_t CUBLASWINAPI cublasCgemm_v2(
-    cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
-    int m, int n, int k, const cuComplex *alpha, /* host or device pointer */
-    const cuComplex *A, int lda, const cuComplex *B, int ldb,
-    const cuComplex *beta, /* host or device pointer */
-    cuComplex *C, int ldc) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int,
-      const cuComplex *, const cuComplex *, int, const cuComplex *, int,
-      const cuComplex *, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCgemm_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta,
-                  C, ldc);
-}
-
-cublasStatus_t CUBLASWINAPI cublasCgemm3m(
-    cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
-    int m, int n, int k, const cuComplex *alpha, /* host or device pointer */
-    const cuComplex *A, int lda, const cuComplex *B, int ldb,
-    const cuComplex *beta, /* host or device pointer */
-    cuComplex *C, int ldc) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int,
-      const cuComplex *, const cuComplex *, int, const cuComplex *, int,
-      const cuComplex *, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCgemm3m");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta,
-                  C, ldc);
-}
-
-cublasStatus_t CUBLASWINAPI cublasCgemm3mEx(
-    cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
-    int m, int n, int k, const cuComplex *alpha, const void *A,
-    cudaDataType Atype, int lda, const void *B, cudaDataType Btype, int ldb,
-    const cuComplex *beta, void *C, cudaDataType Ctype, int ldc) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int,
-      const cuComplex *, const void *, cudaDataType, int, const void *,
-      cudaDataType, int, const cuComplex *, void *, cudaDataType, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCgemm3mEx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transa, transb, m, n, k, alpha, A, Atype, lda, B,
-                  Btype, ldb, beta, C, Ctype, ldc);
-}
-
-cublasStatus_t CUBLASWINAPI cublasZgemm_v2(
-    cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
-    int m, int n, int k,
-    const cuDoubleComplex *alpha, /* host or device pointer */
-    const cuDoubleComplex *A, int lda, const cuDoubleComplex *B, int ldb,
-    const cuDoubleComplex *beta, /* host or device pointer */
-    cuDoubleComplex *C, int ldc) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int,
-      const cuDoubleComplex *, const cuDoubleComplex *, int,
-      const cuDoubleComplex *, int, const cuDoubleComplex *, cuDoubleComplex *,
-      int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZgemm_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta,
-                  C, ldc);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasZgemm3m(cublasHandle_t handle, cublasOperation_t transa,
-              cublasOperation_t transb, int m, int n, int k,
-              const cuDoubleComplex *alpha, /* host or device pointer */
-              const cuDoubleComplex *A, int lda, const cuDoubleComplex *B,
-              int ldb, const cuDoubleComplex *beta, /* host or device pointer */
-              cuDoubleComplex *C, int ldc) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int,
-      const cuDoubleComplex *, const cuDoubleComplex *, int,
-      const cuDoubleComplex *, int, const cuDoubleComplex *, cuDoubleComplex *,
-      int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZgemm3m");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta,
-                  C, ldc);
-}
-
-cublasStatus_t CUBLASWINAPI cublasSgemmEx(
-    cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
-    int m, int n, int k, const float *alpha, /* host or device pointer */
-    const void *A, cudaDataType Atype, int lda, const void *B,
-    cudaDataType Btype, int ldb, const float *beta, /* host or device pointer */
-    void *C, cudaDataType Ctype, int ldc) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int,
-      const float *, const void *, cudaDataType, int, const void *,
-      cudaDataType, int, const float *, void *, cudaDataType, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSgemmEx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transa, transb, m, n, k, alpha, A, Atype, lda, B,
-                  Btype, ldb, beta, C, Ctype, ldc);
-}
-
-cublasStatus_t CUBLASWINAPI cublasGemmEx(
-    cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
-    int m, int n, int k, const void *alpha, /* host or device pointer */
-    const void *A, cudaDataType Atype, int lda, const void *B,
-    cudaDataType Btype, int ldb, const void *beta, /* host or device pointer */
-    void *C, cudaDataType Ctype, int ldc, cudaDataType computeType,
-    cublasGemmAlgo_t algo) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int,
-      const void *, const void *, cudaDataType, int, const void *, cudaDataType,
-      int, const void *, void *, cudaDataType, int, cudaDataType,
-      cublasGemmAlgo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasGemmEx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transa, transb, m, n, k, alpha, A, Atype, lda, B,
-                  Btype, ldb, beta, C, Ctype, ldc, computeType, algo);
-}
-
-cublasStatus_t CUBLASWINAPI cublasCgemmEx(
-    cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
-    int m, int n, int k, const cuComplex *alpha, const void *A,
-    cudaDataType Atype, int lda, const void *B, cudaDataType Btype, int ldb,
-    const cuComplex *beta, void *C, cudaDataType Ctype, int ldc) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int,
-      const cuComplex *, const void *, cudaDataType, int, const void *,
-      cudaDataType, int, const cuComplex *, void *, cudaDataType, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCgemmEx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transa, transb, m, n, k, alpha, A, Atype, lda, B,
-                  Btype, ldb, beta, C, Ctype, ldc);
-}
-
-cublasStatus_t CUBLASWINAPI cublasUint8gemmBias(
-    cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
-    cublasOperation_t transc, int m, int n, int k, const unsigned char *A,
-    int A_bias, int lda, const unsigned char *B, int B_bias, int ldb,
-    unsigned char *C, int C_bias, int ldc, int C_mult, int C_shift) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasOperation_t, cublasOperation_t, cublasOperation_t,
-      int, int, int, const unsigned char *, int, int, const unsigned char *,
-      int, int, unsigned char *, int, int, int, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasUint8gemmBias");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transa, transb, transc, m, n, k, A, A_bias, lda, B,
-                  B_bias, ldb, C, C_bias, ldc, C_mult, C_shift);
-}
-
-cublasStatus_t CUBLASWINAPI cublasSsyrk_v2(
-    cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans,
-    int n, int k, const float *alpha,           /* host or device pointer */
-    const float *A, int lda, const float *beta, /* host or device pointer */
-    float *C, int ldc) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int,
-      const float *, const float *, int, const float *, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSsyrk_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, trans, n, k, alpha, A, lda, beta, C, ldc);
-}
-
-cublasStatus_t CUBLASWINAPI cublasDsyrk_v2(
-    cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans,
-    int n, int k, const double *alpha,            /* host or device pointer */
-    const double *A, int lda, const double *beta, /* host or device pointer */
-    double *C, int ldc) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int,
-      const double *, const double *, int, const double *, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDsyrk_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, trans, n, k, alpha, A, lda, beta, C, ldc);
-}
-
-cublasStatus_t CUBLASWINAPI cublasCsyrk_v2(
-    cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans,
-    int n, int k, const cuComplex *alpha, /* host or device pointer */
-    const cuComplex *A, int lda,
-    const cuComplex *beta, /* host or device pointer */
-    cuComplex *C, int ldc) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int,
-      const cuComplex *, const cuComplex *, int, const cuComplex *, cuComplex *,
-      int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCsyrk_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, trans, n, k, alpha, A, lda, beta, C, ldc);
-}
-
-cublasStatus_t CUBLASWINAPI cublasZsyrk_v2(
-    cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans,
-    int n, int k, const cuDoubleComplex *alpha, /* host or device pointer */
-    const cuDoubleComplex *A, int lda,
-    const cuDoubleComplex *beta, /* host or device pointer */
-    cuDoubleComplex *C, int ldc) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int,
-      const cuDoubleComplex *, const cuDoubleComplex *, int,
-      const cuDoubleComplex *, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZsyrk_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, trans, n, k, alpha, A, lda, beta, C, ldc);
-}
-
-cublasStatus_t CUBLASWINAPI cublasCsyrkEx(
-    cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans,
-    int n, int k, const cuComplex *alpha, /* host or device pointer */
-    const void *A, cudaDataType Atype, int lda,
-    const cuComplex *beta, /* host or device pointer */
-    void *C, cudaDataType Ctype, int ldc) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int,
-      const cuComplex *, const void *, cudaDataType, int, const cuComplex *,
-      void *, cudaDataType, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCsyrkEx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, trans, n, k, alpha, A, Atype, lda, beta, C,
-                  Ctype, ldc);
-}
-
-cublasStatus_t CUBLASWINAPI cublasCsyrk3mEx(
-    cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans,
-    int n, int k, const cuComplex *alpha, const void *A, cudaDataType Atype,
-    int lda, const cuComplex *beta, void *C, cudaDataType Ctype, int ldc) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int,
-      const cuComplex *, const void *, cudaDataType, int, const cuComplex *,
-      void *, cudaDataType, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCsyrk3mEx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, trans, n, k, alpha, A, Atype, lda, beta, C,
-                  Ctype, ldc);
-}
-
-cublasStatus_t CUBLASWINAPI cublasCherk_v2(
-    cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans,
-    int n, int k, const float *alpha,               /* host or device pointer */
-    const cuComplex *A, int lda, const float *beta, /* host or device pointer */
-    cuComplex *C, int ldc) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int,
-      const float *, const cuComplex *, int, const float *, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCherk_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, trans, n, k, alpha, A, lda, beta, C, ldc);
-}
-
-cublasStatus_t CUBLASWINAPI cublasZherk_v2(
-    cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans,
-    int n, int k, const double *alpha, /* host or device pointer */
-    const cuDoubleComplex *A, int lda,
-    const double *beta, /* host or device pointer */
-    cuDoubleComplex *C, int ldc) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int,
-      const double *, const cuDoubleComplex *, int, const double *,
-      cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZherk_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, trans, n, k, alpha, A, lda, beta, C, ldc);
-}
-
-cublasStatus_t CUBLASWINAPI cublasCherkEx(
-    cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans,
-    int n, int k, const float *alpha, /* host or device pointer */
-    const void *A, cudaDataType Atype, int lda,
-    const float *beta, /* host or device pointer */
-    void *C, cudaDataType Ctype, int ldc) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int,
-      const float *, const void *, cudaDataType, int, const float *, void *,
-      cudaDataType, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCherkEx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, trans, n, k, alpha, A, Atype, lda, beta, C,
-                  Ctype, ldc);
-}
-
-cublasStatus_t CUBLASWINAPI cublasCherk3mEx(
-    cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans,
-    int n, int k, const float *alpha, const void *A, cudaDataType Atype,
-    int lda, const float *beta, void *C, cudaDataType Ctype, int ldc) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int,
-      const float *, const void *, cudaDataType, int, const float *, void *,
-      cudaDataType, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCherk3mEx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, trans, n, k, alpha, A, Atype, lda, beta, C,
-                  Ctype, ldc);
-}
-
-cublasStatus_t CUBLASWINAPI cublasSsyr2k_v2(
-    cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans,
-    int n, int k, const float *alpha, /* host or device pointer */
-    const float *A, int lda, const float *B, int ldb,
-    const float *beta, /* host or device pointer */
-    float *C, int ldc) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int,
-      const float *, const float *, int, const float *, int, const float *,
-      float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSsyr2k_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C,
-                  ldc);
-}
-
-cublasStatus_t CUBLASWINAPI cublasDsyr2k_v2(
-    cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans,
-    int n, int k, const double *alpha, /* host or device pointer */
-    const double *A, int lda, const double *B, int ldb,
-    const double *beta, /* host or device pointer */
-    double *C, int ldc) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int,
-      const double *, const double *, int, const double *, int, const double *,
-      double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDsyr2k_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C,
-                  ldc);
-}
-
-cublasStatus_t CUBLASWINAPI cublasCsyr2k_v2(
-    cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans,
-    int n, int k, const cuComplex *alpha, /* host or device pointer */
-    const cuComplex *A, int lda, const cuComplex *B, int ldb,
-    const cuComplex *beta, /* host or device pointer */
-    cuComplex *C, int ldc) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int,
-      const cuComplex *, const cuComplex *, int, const cuComplex *, int,
-      const cuComplex *, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCsyr2k_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C,
-                  ldc);
-}
-
-cublasStatus_t CUBLASWINAPI cublasZsyr2k_v2(
-    cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans,
-    int n, int k, const cuDoubleComplex *alpha, /* host or device pointer */
-    const cuDoubleComplex *A, int lda, const cuDoubleComplex *B, int ldb,
-    const cuDoubleComplex *beta, /* host or device pointer */
-    cuDoubleComplex *C, int ldc) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int,
-      const cuDoubleComplex *, const cuDoubleComplex *, int,
-      const cuDoubleComplex *, int, const cuDoubleComplex *, cuDoubleComplex *,
-      int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZsyr2k_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C,
-                  ldc);
-}
-
-cublasStatus_t CUBLASWINAPI cublasCher2k_v2(
-    cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans,
-    int n, int k, const cuComplex *alpha, /* host or device pointer */
-    const cuComplex *A, int lda, const cuComplex *B, int ldb,
-    const float *beta, /* host or device pointer */
-    cuComplex *C, int ldc) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int,
-      const cuComplex *, const cuComplex *, int, const cuComplex *, int,
-      const float *, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCher2k_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C,
-                  ldc);
-}
-
-cublasStatus_t CUBLASWINAPI cublasZher2k_v2(
-    cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans,
-    int n, int k, const cuDoubleComplex *alpha, /* host or device pointer */
-    const cuDoubleComplex *A, int lda, const cuDoubleComplex *B, int ldb,
-    const double *beta, /* host or device pointer */
-    cuDoubleComplex *C, int ldc) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int,
-      const cuDoubleComplex *, const cuDoubleComplex *, int,
-      const cuDoubleComplex *, int, const double *, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZher2k_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C,
-                  ldc);
-}
-
-cublasStatus_t CUBLASWINAPI cublasSsyrkx(
-    cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans,
-    int n, int k, const float *alpha, /* host or device pointer */
-    const float *A, int lda, const float *B, int ldb,
-    const float *beta, /* host or device pointer */
-    float *C, int ldc) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int,
-      const float *, const float *, int, const float *, int, const float *,
-      float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSsyrkx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C,
-                  ldc);
-}
-
-cublasStatus_t CUBLASWINAPI cublasDsyrkx(
-    cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans,
-    int n, int k, const double *alpha, /* host or device pointer */
-    const double *A, int lda, const double *B, int ldb,
-    const double *beta, /* host or device pointer */
-    double *C, int ldc) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int,
-      const double *, const double *, int, const double *, int, const double *,
-      double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDsyrkx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C,
-                  ldc);
-}
-
-cublasStatus_t CUBLASWINAPI cublasCsyrkx(
-    cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans,
-    int n, int k, const cuComplex *alpha, /* host or device pointer */
-    const cuComplex *A, int lda, const cuComplex *B, int ldb,
-    const cuComplex *beta, /* host or device pointer */
-    cuComplex *C, int ldc) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int,
-      const cuComplex *, const cuComplex *, int, const cuComplex *, int,
-      const cuComplex *, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCsyrkx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C,
-                  ldc);
-}
-
-cublasStatus_t CUBLASWINAPI cublasZsyrkx(
-    cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans,
-    int n, int k, const cuDoubleComplex *alpha, /* host or device pointer */
-    const cuDoubleComplex *A, int lda, const cuDoubleComplex *B, int ldb,
-    const cuDoubleComplex *beta, /* host or device pointer */
-    cuDoubleComplex *C, int ldc) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int,
-      const cuDoubleComplex *, const cuDoubleComplex *, int,
-      const cuDoubleComplex *, int, const cuDoubleComplex *, cuDoubleComplex *,
-      int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZsyrkx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C,
-                  ldc);
-}
-
-cublasStatus_t CUBLASWINAPI cublasCherkx(
-    cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans,
-    int n, int k, const cuComplex *alpha, /* host or device pointer */
-    const cuComplex *A, int lda, const cuComplex *B, int ldb,
-    const float *beta, /* host or device pointer */
-    cuComplex *C, int ldc) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int,
-      const cuComplex *, const cuComplex *, int, const cuComplex *, int,
-      const float *, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCherkx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C,
-                  ldc);
-}
-
-cublasStatus_t CUBLASWINAPI cublasZherkx(
-    cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans,
-    int n, int k, const cuDoubleComplex *alpha, /* host or device pointer */
-    const cuDoubleComplex *A, int lda, const cuDoubleComplex *B, int ldb,
-    const double *beta, /* host or device pointer */
-    cuDoubleComplex *C, int ldc) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int,
-      const cuDoubleComplex *, const cuDoubleComplex *, int,
-      const cuDoubleComplex *, int, const double *, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZherkx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C,
-                  ldc);
-}
-
-cublasStatus_t CUBLASWINAPI cublasSsymm_v2(
-    cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, int m,
-    int n, const float *alpha, /* host or device pointer */
-    const float *A, int lda, const float *B, int ldb,
-    const float *beta, /* host or device pointer */
-    float *C, int ldc) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasSideMode_t, cublasFillMode_t, int, int,
-      const float *, const float *, int, const float *, int, const float *,
-      float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSsymm_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, side, uplo, m, n, alpha, A, lda, B, ldb, beta, C,
-                  ldc);
-}
-
-cublasStatus_t CUBLASWINAPI cublasDsymm_v2(
-    cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, int m,
-    int n, const double *alpha, /* host or device pointer */
-    const double *A, int lda, const double *B, int ldb,
-    const double *beta, /* host or device pointer */
-    double *C, int ldc) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasSideMode_t, cublasFillMode_t, int, int,
-      const double *, const double *, int, const double *, int, const double *,
-      double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDsymm_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, side, uplo, m, n, alpha, A, lda, B, ldb, beta, C,
-                  ldc);
-}
-
-cublasStatus_t CUBLASWINAPI cublasCsymm_v2(
-    cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, int m,
-    int n, const cuComplex *alpha, /* host or device pointer */
-    const cuComplex *A, int lda, const cuComplex *B, int ldb,
-    const cuComplex *beta, /* host or device pointer */
-    cuComplex *C, int ldc) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasSideMode_t, cublasFillMode_t, int, int,
-      const cuComplex *, const cuComplex *, int, const cuComplex *, int,
-      const cuComplex *, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCsymm_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, side, uplo, m, n, alpha, A, lda, B, ldb, beta, C,
-                  ldc);
-}
-
-cublasStatus_t CUBLASWINAPI cublasZsymm_v2(
-    cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, int m,
-    int n, const cuDoubleComplex *alpha, /* host or device pointer */
-    const cuDoubleComplex *A, int lda, const cuDoubleComplex *B, int ldb,
-    const cuDoubleComplex *beta, /* host or device pointer */
-    cuDoubleComplex *C, int ldc) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasSideMode_t, cublasFillMode_t, int, int,
-      const cuDoubleComplex *, const cuDoubleComplex *, int,
-      const cuDoubleComplex *, int, const cuDoubleComplex *, cuDoubleComplex *,
-      int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZsymm_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, side, uplo, m, n, alpha, A, lda, B, ldb, beta, C,
-                  ldc);
-}
-
-cublasStatus_t CUBLASWINAPI cublasChemm_v2(
-    cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, int m,
-    int n, const cuComplex *alpha, /* host or device pointer */
-    const cuComplex *A, int lda, const cuComplex *B, int ldb,
-    const cuComplex *beta, /* host or device pointer */
-    cuComplex *C, int ldc) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasSideMode_t, cublasFillMode_t, int, int,
-      const cuComplex *, const cuComplex *, int, const cuComplex *, int,
-      const cuComplex *, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasChemm_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, side, uplo, m, n, alpha, A, lda, B, ldb, beta, C,
-                  ldc);
-}
-
-cublasStatus_t CUBLASWINAPI cublasZhemm_v2(
-    cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, int m,
-    int n, const cuDoubleComplex *alpha, /* host or device pointer */
-    const cuDoubleComplex *A, int lda, const cuDoubleComplex *B, int ldb,
-    const cuDoubleComplex *beta, /* host or device pointer */
-    cuDoubleComplex *C, int ldc) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasSideMode_t, cublasFillMode_t, int, int,
-      const cuDoubleComplex *, const cuDoubleComplex *, int,
-      const cuDoubleComplex *, int, const cuDoubleComplex *, cuDoubleComplex *,
-      int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZhemm_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, side, uplo, m, n, alpha, A, lda, B, ldb, beta, C,
-                  ldc);
-}
-
-cublasStatus_t CUBLASWINAPI cublasStrsm_v2(
-    cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo,
-    cublasOperation_t trans, cublasDiagType_t diag, int m, int n,
-    const float *alpha, /* host or device pointer */
-    const float *A, int lda, float *B, int ldb) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasSideMode_t, cublasFillMode_t, cublasOperation_t,
-      cublasDiagType_t, int, int, const float *, const float *, int, float *,
-      int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasStrsm_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb);
-}
-
-cublasStatus_t CUBLASWINAPI cublasDtrsm_v2(
-    cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo,
-    cublasOperation_t trans, cublasDiagType_t diag, int m, int n,
-    const double *alpha, /* host or device pointer */
-    const double *A, int lda, double *B, int ldb) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasSideMode_t, cublasFillMode_t, cublasOperation_t,
-      cublasDiagType_t, int, int, const double *, const double *, int, double *,
-      int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDtrsm_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb);
-}
-
-cublasStatus_t CUBLASWINAPI cublasCtrsm_v2(
-    cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo,
-    cublasOperation_t trans, cublasDiagType_t diag, int m, int n,
-    const cuComplex *alpha, /* host or device pointer */
-    const cuComplex *A, int lda, cuComplex *B, int ldb) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasSideMode_t, cublasFillMode_t, cublasOperation_t,
-      cublasDiagType_t, int, int, const cuComplex *, const cuComplex *, int,
-      cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCtrsm_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb);
-}
-
-cublasStatus_t CUBLASWINAPI cublasZtrsm_v2(
-    cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo,
-    cublasOperation_t trans, cublasDiagType_t diag, int m, int n,
-    const cuDoubleComplex *alpha, /* host or device pointer */
-    const cuDoubleComplex *A, int lda, cuDoubleComplex *B, int ldb) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasSideMode_t, cublasFillMode_t, cublasOperation_t,
-      cublasDiagType_t, int, int, const cuDoubleComplex *,
-      const cuDoubleComplex *, int, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZtrsm_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb);
-}
-
-cublasStatus_t CUBLASWINAPI cublasStrmm_v2(
-    cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo,
-    cublasOperation_t trans, cublasDiagType_t diag, int m, int n,
-    const float *alpha, /* host or device pointer */
-    const float *A, int lda, const float *B, int ldb, float *C, int ldc) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasSideMode_t, cublasFillMode_t, cublasOperation_t,
-      cublasDiagType_t, int, int, const float *, const float *, int,
-      const float *, int, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasStrmm_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb,
-                  C, ldc);
-}
-
-cublasStatus_t CUBLASWINAPI cublasDtrmm_v2(
-    cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo,
-    cublasOperation_t trans, cublasDiagType_t diag, int m, int n,
-    const double *alpha, /* host or device pointer */
-    const double *A, int lda, const double *B, int ldb, double *C, int ldc) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasSideMode_t, cublasFillMode_t, cublasOperation_t,
-      cublasDiagType_t, int, int, const double *, const double *, int,
-      const double *, int, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDtrmm_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb,
-                  C, ldc);
-}
-
-cublasStatus_t CUBLASWINAPI cublasCtrmm_v2(
-    cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo,
-    cublasOperation_t trans, cublasDiagType_t diag, int m, int n,
-    const cuComplex *alpha, /* host or device pointer */
-    const cuComplex *A, int lda, const cuComplex *B, int ldb, cuComplex *C,
-    int ldc) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasSideMode_t, cublasFillMode_t, cublasOperation_t,
-      cublasDiagType_t, int, int, const cuComplex *, const cuComplex *, int,
-      const cuComplex *, int, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCtrmm_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb,
-                  C, ldc);
-}
-
-cublasStatus_t CUBLASWINAPI cublasZtrmm_v2(
-    cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo,
-    cublasOperation_t trans, cublasDiagType_t diag, int m, int n,
-    const cuDoubleComplex *alpha, /* host or device pointer */
-    const cuDoubleComplex *A, int lda, const cuDoubleComplex *B, int ldb,
-    cuDoubleComplex *C, int ldc) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasSideMode_t, cublasFillMode_t, cublasOperation_t,
-      cublasDiagType_t, int, int, const cuDoubleComplex *,
-      const cuDoubleComplex *, int, const cuDoubleComplex *, int,
-      cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZtrmm_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb,
-                  C, ldc);
-}
-
-cublasStatus_t CUBLASWINAPI cublasSgemmBatched(
-    cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
-    int m, int n, int k, const float *alpha, /* host or device pointer */
-    const float *const Aarray[], int lda, const float *const Barray[], int ldb,
-    const float *beta, /* host or device pointer */
-    float *const Carray[], int ldc, int batchCount) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int,
-      const float *, const float *const[], int, const float *const[], int,
-      const float *, float *const[], int, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSgemmBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transa, transb, m, n, k, alpha, Aarray, lda, Barray,
-                  ldb, beta, Carray, ldc, batchCount);
-}
-
-cublasStatus_t CUBLASWINAPI cublasDgemmBatched(
-    cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
-    int m, int n, int k, const double *alpha, /* host or device pointer */
-    const double *const Aarray[], int lda, const double *const Barray[],
-    int ldb, const double *beta, /* host or device pointer */
-    double *const Carray[], int ldc, int batchCount) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int,
-      const double *, const double *const[], int, const double *const[], int,
-      const double *, double *const[], int, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDgemmBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transa, transb, m, n, k, alpha, Aarray, lda, Barray,
-                  ldb, beta, Carray, ldc, batchCount);
-}
-
-cublasStatus_t CUBLASWINAPI cublasCgemmBatched(
-    cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
-    int m, int n, int k, const cuComplex *alpha, /* host or device pointer */
-    const cuComplex *const Aarray[], int lda, const cuComplex *const Barray[],
-    int ldb, const cuComplex *beta, /* host or device pointer */
-    cuComplex *const Carray[], int ldc, int batchCount) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int,
-      const cuComplex *, const cuComplex *const[], int,
-      const cuComplex *const[], int, const cuComplex *, cuComplex *const[], int,
-      int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCgemmBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transa, transb, m, n, k, alpha, Aarray, lda, Barray,
-                  ldb, beta, Carray, ldc, batchCount);
-}
-
-cublasStatus_t CUBLASWINAPI cublasCgemm3mBatched(
-    cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
-    int m, int n, int k, const cuComplex *alpha, /* host or device pointer */
-    const cuComplex *const Aarray[], int lda, const cuComplex *const Barray[],
-    int ldb, const cuComplex *beta, /* host or device pointer */
-    cuComplex *const Carray[], int ldc, int batchCount) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int,
-      const cuComplex *, const cuComplex *const[], int,
-      const cuComplex *const[], int, const cuComplex *, cuComplex *const[], int,
-      int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCgemm3mBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transa, transb, m, n, k, alpha, Aarray, lda, Barray,
-                  ldb, beta, Carray, ldc, batchCount);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasZgemmBatched(cublasHandle_t handle, cublasOperation_t transa,
-                   cublasOperation_t transb, int m, int n, int k,
-                   const cuDoubleComplex *alpha, /* host or device pointer */
-                   const cuDoubleComplex *const Aarray[], int lda,
-                   const cuDoubleComplex *const Barray[], int ldb,
-                   const cuDoubleComplex *beta, /* host or device pointer */
-                   cuDoubleComplex *const Carray[], int ldc, int batchCount) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int,
-      const cuDoubleComplex *, const cuDoubleComplex *const[], int,
-      const cuDoubleComplex *const[], int, const cuDoubleComplex *,
-      cuDoubleComplex *const[], int, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZgemmBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transa, transb, m, n, k, alpha, Aarray, lda, Barray,
-                  ldb, beta, Carray, ldc, batchCount);
-}
-
-cublasStatus_t CUBLASWINAPI cublasGemmBatchedEx(
-    cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
-    int m, int n, int k, const void *alpha, /* host or device pointer */
-    const void *const Aarray[], cudaDataType Atype, int lda,
-    const void *const Barray[], cudaDataType Btype, int ldb,
-    const void *beta, /* host or device pointer */
-    void *const Carray[], cudaDataType Ctype, int ldc, int batchCount,
-    cudaDataType computeType, cublasGemmAlgo_t algo) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int,
-      const void *, const void *const[], cudaDataType, int, const void *const[],
-      cudaDataType, int, const void *, void *const[], cudaDataType, int, int,
-      cudaDataType, cublasGemmAlgo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasGemmBatchedEx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transa, transb, m, n, k, alpha, Aarray, Atype, lda,
-                  Barray, Btype, ldb, beta, Carray, Ctype, ldc, batchCount,
-                  computeType, algo);
-}
-
-cublasStatus_t CUBLASWINAPI cublasGemmStridedBatchedEx(
-    cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
-    int m, int n, int k, const void *alpha, /* host or device pointer */
-    const void *A, cudaDataType Atype, int lda,
-    long long int strideA, /* purposely signed */
-    const void *B, cudaDataType Btype, int ldb, long long int strideB,
-    const void *beta, /* host or device pointer */
-    void *C, cudaDataType Ctype, int ldc, long long int strideC, int batchCount,
-    cudaDataType computeType, cublasGemmAlgo_t algo) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int,
-      const void *, const void *, cudaDataType, int, long long, const void *,
-      cudaDataType, int, long long, const void *, void *, cudaDataType, int,
-      long long, int, cudaDataType, cublasGemmAlgo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasGemmStridedBatchedEx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transa, transb, m, n, k, alpha, A, Atype, lda,
-                  strideA, B, Btype, ldb, strideB, beta, C, Ctype, ldc, strideC,
-                  batchCount, computeType, algo);
-}
-
-cublasStatus_t CUBLASWINAPI cublasSgemmStridedBatched(
-    cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
-    int m, int n, int k, const float *alpha,        /* host or device pointer */
-    const float *A, int lda, long long int strideA, /* purposely signed */
-    const float *B, int ldb, long long int strideB,
-    const float *beta, /* host or device pointer */
-    float *C, int ldc, long long int strideC, int batchCount) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int,
-      const float *, const float *, int, long long, const float *, int,
-      long long, const float *, float *, int, long long, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSgemmStridedBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transa, transb, m, n, k, alpha, A, lda, strideA, B,
-                  ldb, strideB, beta, C, ldc, strideC, batchCount);
-}
-
-cublasStatus_t CUBLASWINAPI cublasDgemmStridedBatched(
-    cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
-    int m, int n, int k, const double *alpha, /* host or device pointer */
-    const double *A, int lda, long long int strideA, /* purposely signed */
-    const double *B, int ldb, long long int strideB,
-    const double *beta, /* host or device pointer */
-    double *C, int ldc, long long int strideC, int batchCount) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int,
-      const double *, const double *, int, long long, const double *, int,
-      long long, const double *, double *, int, long long, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDgemmStridedBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transa, transb, m, n, k, alpha, A, lda, strideA, B,
-                  ldb, strideB, beta, C, ldc, strideC, batchCount);
-}
-
-cublasStatus_t CUBLASWINAPI cublasCgemmStridedBatched(
-    cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
-    int m, int n, int k, const cuComplex *alpha, /* host or device pointer */
-    const cuComplex *A, int lda, long long int strideA, /* purposely signed */
-    const cuComplex *B, int ldb, long long int strideB,
-    const cuComplex *beta, /* host or device pointer */
-    cuComplex *C, int ldc, long long int strideC, int batchCount) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int,
-      const cuComplex *, const cuComplex *, int, long long, const cuComplex *,
-      int, long long, const cuComplex *, cuComplex *, int, long long, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCgemmStridedBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transa, transb, m, n, k, alpha, A, lda, strideA, B,
-                  ldb, strideB, beta, C, ldc, strideC, batchCount);
-}
-
-cublasStatus_t CUBLASWINAPI cublasCgemm3mStridedBatched(
-    cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
-    int m, int n, int k, const cuComplex *alpha, /* host or device pointer */
-    const cuComplex *A, int lda, long long int strideA, /* purposely signed */
-    const cuComplex *B, int ldb, long long int strideB,
-    const cuComplex *beta, /* host or device pointer */
-    cuComplex *C, int ldc, long long int strideC, int batchCount) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int,
-      const cuComplex *, const cuComplex *, int, long long, const cuComplex *,
-      int, long long, const cuComplex *, cuComplex *, int, long long, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCgemm3mStridedBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transa, transb, m, n, k, alpha, A, lda, strideA, B,
-                  ldb, strideB, beta, C, ldc, strideC, batchCount);
-}
-
-cublasStatus_t CUBLASWINAPI cublasZgemmStridedBatched(
-    cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
-    int m, int n, int k,
-    const cuDoubleComplex *alpha, /* host or device pointer */
-    const cuDoubleComplex *A, int lda,
-    long long int strideA, /* purposely signed */
-    const cuDoubleComplex *B, int ldb, long long int strideB,
-    const cuDoubleComplex *beta, /* host or device poi */
-    cuDoubleComplex *C, int ldc, long long int strideC, int batchCount) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int,
-      const cuDoubleComplex *, const cuDoubleComplex *, int, long long,
-      const cuDoubleComplex *, int, long long, const cuDoubleComplex *,
-      cuDoubleComplex *, int, long long, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZgemmStridedBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transa, transb, m, n, k, alpha, A, lda, strideA, B,
-                  ldb, strideB, beta, C, ldc, strideC, batchCount);
-}
-
-cublasStatus_t CUBLASWINAPI cublasSgeam(
-    cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
-    int m, int n, const float *alpha,           /* host or device pointer */
-    const float *A, int lda, const float *beta, /* host or device pointer */
-    const float *B, int ldb, float *C, int ldc) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int,
-      const float *, const float *, int, const float *, const float *, int,
-      float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSgeam");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transa, transb, m, n, alpha, A, lda, beta, B, ldb, C,
-                  ldc);
-}
-
-cublasStatus_t CUBLASWINAPI cublasDgeam(
-    cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
-    int m, int n, const double *alpha,            /* host or device pointer */
-    const double *A, int lda, const double *beta, /* host or device pointer */
-    const double *B, int ldb, double *C, int ldc) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int,
-      const double *, const double *, int, const double *, const double *, int,
-      double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDgeam");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transa, transb, m, n, alpha, A, lda, beta, B, ldb, C,
-                  ldc);
-}
-
-cublasStatus_t CUBLASWINAPI cublasCgeam(
-    cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
-    int m, int n, const cuComplex *alpha, /* host or device pointer */
-    const cuComplex *A, int lda,
-    const cuComplex *beta, /* host or device pointer */
-    const cuComplex *B, int ldb, cuComplex *C, int ldc) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int,
-      const cuComplex *, const cuComplex *, int, const cuComplex *,
-      const cuComplex *, int, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCgeam");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transa, transb, m, n, alpha, A, lda, beta, B, ldb, C,
-                  ldc);
-}
-
-cublasStatus_t CUBLASWINAPI cublasZgeam(
-    cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
-    int m, int n, const cuDoubleComplex *alpha, /* host or device pointer */
-    const cuDoubleComplex *A, int lda,
-    const cuDoubleComplex *beta, /* host or device pointer */
-    const cuDoubleComplex *B, int ldb, cuDoubleComplex *C, int ldc) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int,
-      const cuDoubleComplex *, const cuDoubleComplex *, int,
-      const cuDoubleComplex *, const cuDoubleComplex *, int, cuDoubleComplex *,
-      int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZgeam");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transa, transb, m, n, alpha, A, lda, beta, B, ldb, C,
-                  ldc);
-}
-
-cublasStatus_t CUBLASWINAPI cublasSgetrfBatched(
-    cublasHandle_t handle, int n, float *const A[], /*Device pointer*/
-    int lda, int *P,                                /*Device Pointer*/
-    int *info,                                      /*Device Pointer*/
-    int batchSize) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, float *const[], int, int *, int *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSgetrfBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, A, lda, P, info, batchSize);
-}
-
-cublasStatus_t CUBLASWINAPI cublasDgetrfBatched(
-    cublasHandle_t handle, int n, double *const A[], /*Device pointer*/
-    int lda, int *P,                                 /*Device Pointer*/
-    int *info,                                       /*Device Pointer*/
-    int batchSize) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, double *const[], int, int *, int *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDgetrfBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, A, lda, P, info, batchSize);
-}
-
-cublasStatus_t CUBLASWINAPI cublasCgetrfBatched(
-    cublasHandle_t handle, int n, cuComplex *const A[], /*Device pointer*/
-    int lda, int *P,                                    /*Device Pointer*/
-    int *info,                                          /*Device Pointer*/
-    int batchSize) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, cuComplex *const[], int, int *, int *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCgetrfBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, A, lda, P, info, batchSize);
-}
-
-cublasStatus_t CUBLASWINAPI cublasZgetrfBatched(
-    cublasHandle_t handle, int n, cuDoubleComplex *const A[], /*Device pointer*/
-    int lda, int *P,                                          /*Device Pointer*/
-    int *info,                                                /*Device Pointer*/
-    int batchSize) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, cuDoubleComplex *const[], int, int *, int *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZgetrfBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, A, lda, P, info, batchSize);
-}
-
-cublasStatus_t CUBLASWINAPI cublasSgetriBatched(
-    cublasHandle_t handle, int n, const float *const A[], /*Device pointer*/
-    int lda, const int *P,                                /*Device pointer*/
-    float *const C[],                                     /*Device pointer*/
-    int ldc, int *info, int batchSize) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, const float *const[], int, const int *,
-      float *const[], int, int *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSgetriBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, A, lda, P, C, ldc, info, batchSize);
-}
-
-cublasStatus_t CUBLASWINAPI cublasDgetriBatched(
-    cublasHandle_t handle, int n, const double *const A[], /*Device pointer*/
-    int lda, const int *P,                                 /*Device pointer*/
-    double *const C[],                                     /*Device pointer*/
-    int ldc, int *info, int batchSize) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, const double *const[], int, const int *,
-      double *const[], int, int *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDgetriBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, A, lda, P, C, ldc, info, batchSize);
-}
-
-cublasStatus_t CUBLASWINAPI cublasCgetriBatched(
-    cublasHandle_t handle, int n, const cuComplex *const A[], /*Device pointer*/
-    int lda, const int *P,                                    /*Device pointer*/
-    cuComplex *const C[],                                     /*Device pointer*/
-    int ldc, int *info, int batchSize) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, const cuComplex *const[], int, const int *,
-      cuComplex *const[], int, int *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCgetriBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, A, lda, P, C, ldc, info, batchSize);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasZgetriBatched(cublasHandle_t handle, int n,
-                    const cuDoubleComplex *const A[], /*Device pointer*/
-                    int lda, const int *P,            /*Device pointer*/
-                    cuDoubleComplex *const C[],       /*Device pointer*/
-                    int ldc, int *info, int batchSize) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, const cuDoubleComplex *const[], int, const int *,
-      cuDoubleComplex *const[], int, int *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZgetriBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, A, lda, P, C, ldc, info, batchSize);
-}
-
-cublasStatus_t CUBLASWINAPI cublasSgetrsBatched(
-    cublasHandle_t handle, cublasOperation_t trans, int n, int nrhs,
-    const float *const Aarray[], int lda, const int *devIpiv,
-    float *const Barray[], int ldb, int *info, int batchSize) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasOperation_t, int, int, const float *const[], int,
-      const int *, float *const[], int, int *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSgetrsBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, trans, n, nrhs, Aarray, lda, devIpiv, Barray, ldb,
-                  info, batchSize);
-}
-
-cublasStatus_t CUBLASWINAPI cublasDgetrsBatched(
-    cublasHandle_t handle, cublasOperation_t trans, int n, int nrhs,
-    const double *const Aarray[], int lda, const int *devIpiv,
-    double *const Barray[], int ldb, int *info, int batchSize) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasOperation_t, int, int, const double *const[], int,
-      const int *, double *const[], int, int *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDgetrsBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, trans, n, nrhs, Aarray, lda, devIpiv, Barray, ldb,
-                  info, batchSize);
-}
-
-cublasStatus_t CUBLASWINAPI cublasCgetrsBatched(
-    cublasHandle_t handle, cublasOperation_t trans, int n, int nrhs,
-    const cuComplex *const Aarray[], int lda, const int *devIpiv,
-    cuComplex *const Barray[], int ldb, int *info, int batchSize) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasOperation_t, int, int, const cuComplex *const[],
-      int, const int *, cuComplex *const[], int, int *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCgetrsBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, trans, n, nrhs, Aarray, lda, devIpiv, Barray, ldb,
-                  info, batchSize);
-}
-
-cublasStatus_t CUBLASWINAPI cublasZgetrsBatched(
-    cublasHandle_t handle, cublasOperation_t trans, int n, int nrhs,
-    const cuDoubleComplex *const Aarray[], int lda, const int *devIpiv,
-    cuDoubleComplex *const Barray[], int ldb, int *info, int batchSize) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasOperation_t, int, int,
-      const cuDoubleComplex *const[], int, const int *,
-      cuDoubleComplex *const[], int, int *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZgetrsBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, trans, n, nrhs, Aarray, lda, devIpiv, Barray, ldb,
-                  info, batchSize);
-}
-
-cublasStatus_t CUBLASWINAPI cublasStrsmBatched(
-    cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo,
-    cublasOperation_t trans, cublasDiagType_t diag, int m, int n,
-    const float *alpha, /*Host or Device Pointer*/
-    const float *const A[], int lda, float *const B[], int ldb,
-    int batchCount) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasSideMode_t, cublasFillMode_t, cublasOperation_t,
-      cublasDiagType_t, int, int, const float *, const float *const[], int,
-      float *const[], int, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasStrsmBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb,
-                  batchCount);
-}
-
-cublasStatus_t CUBLASWINAPI cublasDtrsmBatched(
-    cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo,
-    cublasOperation_t trans, cublasDiagType_t diag, int m, int n,
-    const double *alpha, /*Host or Device Pointer*/
-    const double *const A[], int lda, double *const B[], int ldb,
-    int batchCount) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasSideMode_t, cublasFillMode_t, cublasOperation_t,
-      cublasDiagType_t, int, int, const double *, const double *const[], int,
-      double *const[], int, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDtrsmBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb,
-                  batchCount);
-}
-
-cublasStatus_t CUBLASWINAPI cublasCtrsmBatched(
-    cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo,
-    cublasOperation_t trans, cublasDiagType_t diag, int m, int n,
-    const cuComplex *alpha, /*Host or Device Pointer*/
-    const cuComplex *const A[], int lda, cuComplex *const B[], int ldb,
-    int batchCount) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasSideMode_t, cublasFillMode_t, cublasOperation_t,
-      cublasDiagType_t, int, int, const cuComplex *, const cuComplex *const[],
-      int, cuComplex *const[], int, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCtrsmBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb,
-                  batchCount);
-}
-
-cublasStatus_t CUBLASWINAPI cublasZtrsmBatched(
-    cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo,
-    cublasOperation_t trans, cublasDiagType_t diag, int m, int n,
-    const cuDoubleComplex *alpha, /*Host or Device Pointer*/
-    const cuDoubleComplex *const A[], int lda, cuDoubleComplex *const B[],
-    int ldb, int batchCount) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasSideMode_t, cublasFillMode_t, cublasOperation_t,
-      cublasDiagType_t, int, int, const cuDoubleComplex *,
-      const cuDoubleComplex *const[], int, cuDoubleComplex *const[], int, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZtrsmBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb,
-                  batchCount);
-}
-
-cublasStatus_t CUBLASWINAPI cublasSmatinvBatched(
-    cublasHandle_t handle, int n, const float *const A[], /*Device pointer*/
-    int lda, float *const Ainv[],                         /*Device pointer*/
-    int lda_inv, int *info,                               /*Device Pointer*/
-    int batchSize) {
-  using FuncPtr =
-      cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, int, const float *const[],
-                                     int, float *const[], int, int *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSmatinvBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, A, lda, Ainv, lda_inv, info, batchSize);
-}
-
-cublasStatus_t CUBLASWINAPI cublasDmatinvBatched(
-    cublasHandle_t handle, int n, const double *const A[], /*Device pointer*/
-    int lda, double *const Ainv[],                         /*Device pointer*/
-    int lda_inv, int *info,                                /*Device Pointer*/
-    int batchSize) {
-  using FuncPtr =
-      cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, int, const double *const[],
-                                     int, double *const[], int, int *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDmatinvBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, A, lda, Ainv, lda_inv, info, batchSize);
-}
-
-cublasStatus_t CUBLASWINAPI cublasCmatinvBatched(
-    cublasHandle_t handle, int n, const cuComplex *const A[], /*Device pointer*/
-    int lda, cuComplex *const Ainv[],                         /*Device pointer*/
-    int lda_inv, int *info,                                   /*Device Pointer*/
-    int batchSize) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, const cuComplex *const[], int, cuComplex *const[],
-      int, int *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCmatinvBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, A, lda, Ainv, lda_inv, info, batchSize);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasZmatinvBatched(cublasHandle_t handle, int n,
-                     const cuDoubleComplex *const A[],       /*Device pointer*/
-                     int lda, cuDoubleComplex *const Ainv[], /*Device pointer*/
-                     int lda_inv, int *info,                 /*Device Pointer*/
-                     int batchSize) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, const cuDoubleComplex *const[], int,
-      cuDoubleComplex *const[], int, int *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZmatinvBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, A, lda, Ainv, lda_inv, info, batchSize);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasSgeqrfBatched(cublasHandle_t handle, int m, int n,
-                    float *const Aarray[],            /*Device pointer*/
-                    int lda, float *const TauArray[], /*Device pointer*/
-                    int *info, int batchSize) {
-  using FuncPtr =
-      cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, int, int, float *const[],
-                                     int, float *const[], int *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSgeqrfBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, Aarray, lda, TauArray, info, batchSize);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasDgeqrfBatched(cublasHandle_t handle, int m, int n,
-                    double *const Aarray[],            /*Device pointer*/
-                    int lda, double *const TauArray[], /*Device pointer*/
-                    int *info, int batchSize) {
-  using FuncPtr =
-      cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, int, int, double *const[],
-                                     int, double *const[], int *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDgeqrfBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, Aarray, lda, TauArray, info, batchSize);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasCgeqrfBatched(cublasHandle_t handle, int m, int n,
-                    cuComplex *const Aarray[],            /*Device pointer*/
-                    int lda, cuComplex *const TauArray[], /*Device pointer*/
-                    int *info, int batchSize) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, int, cuComplex *const[], int, cuComplex *const[],
-      int *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCgeqrfBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, Aarray, lda, TauArray, info, batchSize);
-}
-
-cublasStatus_t CUBLASWINAPI cublasZgeqrfBatched(
-    cublasHandle_t handle, int m, int n,
-    cuDoubleComplex *const Aarray[],            /*Device pointer*/
-    int lda, cuDoubleComplex *const TauArray[], /*Device pointer*/
-    int *info, int batchSize) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, int, cuDoubleComplex *const[], int,
-      cuDoubleComplex *const[], int *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZgeqrfBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, Aarray, lda, TauArray, info, batchSize);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasSgelsBatched(cublasHandle_t handle, cublasOperation_t trans, int m, int n,
-                   int nrhs, float *const Aarray[],       /*Device pointer*/
-                   int lda, float *const Carray[],        /*Device pointer*/
-                   int ldc, int *info, int *devInfoArray, /*Device pointer*/
-                   int batchSize) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasOperation_t, int, int, int, float *const[], int,
-      float *const[], int, int *, int *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSgelsBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, trans, m, n, nrhs, Aarray, lda, Carray, ldc, info,
-                  devInfoArray, batchSize);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasDgelsBatched(cublasHandle_t handle, cublasOperation_t trans, int m, int n,
-                   int nrhs, double *const Aarray[],      /*Device pointer*/
-                   int lda, double *const Carray[],       /*Device pointer*/
-                   int ldc, int *info, int *devInfoArray, /*Device pointer*/
-                   int batchSize) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasOperation_t, int, int, int, double *const[], int,
-      double *const[], int, int *, int *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDgelsBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, trans, m, n, nrhs, Aarray, lda, Carray, ldc, info,
-                  devInfoArray, batchSize);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasCgelsBatched(cublasHandle_t handle, cublasOperation_t trans, int m, int n,
-                   int nrhs, cuComplex *const Aarray[], /*Device pointer*/
-                   int lda, cuComplex *const Carray[],  /*Device pointer*/
-                   int ldc, int *info, int *devInfoArray, int batchSize) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasOperation_t, int, int, int, cuComplex *const[], int,
-      cuComplex *const[], int, int *, int *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCgelsBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, trans, m, n, nrhs, Aarray, lda, Carray, ldc, info,
-                  devInfoArray, batchSize);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasZgelsBatched(cublasHandle_t handle, cublasOperation_t trans, int m, int n,
-                   int nrhs, cuDoubleComplex *const Aarray[], /*Device pointer*/
-                   int lda, cuDoubleComplex *const Carray[],  /*Device pointer*/
-                   int ldc, int *info, int *devInfoArray, int batchSize) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasOperation_t, int, int, int,
-      cuDoubleComplex *const[], int, cuDoubleComplex *const[], int, int *,
-      int *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZgelsBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, trans, m, n, nrhs, Aarray, lda, Carray, ldc, info,
-                  devInfoArray, batchSize);
-}
-
-cublasStatus_t CUBLASWINAPI cublasSdgmm(cublasHandle_t handle,
-                                        cublasSideMode_t mode, int m, int n,
-                                        const float *A, int lda, const float *x,
-                                        int incx, float *C, int ldc) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasSideMode_t, int, int, const float *, int,
-      const float *, int, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSdgmm");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, mode, m, n, A, lda, x, incx, C, ldc);
-}
-
-cublasStatus_t CUBLASWINAPI cublasDdgmm(cublasHandle_t handle,
-                                        cublasSideMode_t mode, int m, int n,
-                                        const double *A, int lda,
-                                        const double *x, int incx, double *C,
-                                        int ldc) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasSideMode_t, int, int, const double *, int,
-      const double *, int, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDdgmm");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, mode, m, n, A, lda, x, incx, C, ldc);
-}
-
-cublasStatus_t CUBLASWINAPI cublasCdgmm(cublasHandle_t handle,
-                                        cublasSideMode_t mode, int m, int n,
-                                        const cuComplex *A, int lda,
-                                        const cuComplex *x, int incx,
-                                        cuComplex *C, int ldc) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasSideMode_t, int, int, const cuComplex *, int,
-      const cuComplex *, int, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCdgmm");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, mode, m, n, A, lda, x, incx, C, ldc);
-}
-
-cublasStatus_t CUBLASWINAPI cublasZdgmm(cublasHandle_t handle,
-                                        cublasSideMode_t mode, int m, int n,
-                                        const cuDoubleComplex *A, int lda,
-                                        const cuDoubleComplex *x, int incx,
-                                        cuDoubleComplex *C, int ldc) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasSideMode_t, int, int, const cuDoubleComplex *, int,
-      const cuDoubleComplex *, int, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZdgmm");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, mode, m, n, A, lda, x, incx, C, ldc);
-}
-
-cublasStatus_t CUBLASWINAPI cublasStpttr(cublasHandle_t handle,
-                                         cublasFillMode_t uplo, int n,
-                                         const float *AP, float *A, int lda) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, int, const float *, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasStpttr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, AP, A, lda);
-}
-
-cublasStatus_t CUBLASWINAPI cublasDtpttr(cublasHandle_t handle,
-                                         cublasFillMode_t uplo, int n,
-                                         const double *AP, double *A, int lda) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, int, const double *, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDtpttr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, AP, A, lda);
-}
-
-cublasStatus_t CUBLASWINAPI cublasCtpttr(cublasHandle_t handle,
-                                         cublasFillMode_t uplo, int n,
-                                         const cuComplex *AP, cuComplex *A,
-                                         int lda) {
-  using FuncPtr =
-      cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int,
-                                     const cuComplex *, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCtpttr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, AP, A, lda);
-}
-
-cublasStatus_t CUBLASWINAPI cublasZtpttr(cublasHandle_t handle,
-                                         cublasFillMode_t uplo, int n,
-                                         const cuDoubleComplex *AP,
-                                         cuDoubleComplex *A, int lda) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, int, const cuDoubleComplex *,
-      cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZtpttr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, AP, A, lda);
-}
-
-cublasStatus_t CUBLASWINAPI cublasStrttp(cublasHandle_t handle,
-                                         cublasFillMode_t uplo, int n,
-                                         const float *A, int lda, float *AP) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, int, const float *, int, float *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasStrttp");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, A, lda, AP);
-}
-
-cublasStatus_t CUBLASWINAPI cublasDtrttp(cublasHandle_t handle,
-                                         cublasFillMode_t uplo, int n,
-                                         const double *A, int lda, double *AP) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, int, const double *, int, double *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDtrttp");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, A, lda, AP);
-}
-
-cublasStatus_t CUBLASWINAPI cublasCtrttp(cublasHandle_t handle,
-                                         cublasFillMode_t uplo, int n,
-                                         const cuComplex *A, int lda,
-                                         cuComplex *AP) {
-  using FuncPtr =
-      cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int,
-                                     const cuComplex *, int, cuComplex *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCtrttp");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, A, lda, AP);
-}
-
-cublasStatus_t CUBLASWINAPI cublasZtrttp(cublasHandle_t handle,
-                                         cublasFillMode_t uplo, int n,
-                                         const cuDoubleComplex *A, int lda,
-                                         cuDoubleComplex *AP) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, int, const cuDoubleComplex *, int,
-      cuDoubleComplex *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZtrttp");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, A, lda, AP);
-}
-
-cublasStatus CUBLASWINAPI cublasInit(void) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)();
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasInit");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr();
-}
-
-cublasStatus CUBLASWINAPI cublasShutdown(void) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)();
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasShutdown");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr();
-}
-
-cublasStatus CUBLASWINAPI cublasGetError(void) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)();
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasGetError");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr();
-}
-
-cublasStatus CUBLASWINAPI cublasGetVersion(int *version) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasGetVersion");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(version);
-}
-
-cublasStatus CUBLASWINAPI cublasAlloc(int n, int elemSize, void **devicePtr) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(int, int, void **);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasAlloc");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(n, elemSize, devicePtr);
-}
-
-cublasStatus CUBLASWINAPI cublasFree(void *devicePtr) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasFree");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(devicePtr);
-}
-
-cublasStatus CUBLASWINAPI cublasSetKernelStream(cudaStream_t stream) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSetKernelStream");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(stream);
-}
-
-float CUBLASWINAPI cublasSnrm2(int n, const float *x, int incx) {
-  using FuncPtr = float(CUBLASWINAPI *)(int, const float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSnrm2");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasSnrm2");
-  return func_ptr(n, x, incx);
-}
-
-double CUBLASWINAPI cublasDnrm2(int n, const double *x, int incx) {
-  using FuncPtr = double(CUBLASWINAPI *)(int, const double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDnrm2");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasDnrm2");
-  return func_ptr(n, x, incx);
-}
-
-float CUBLASWINAPI cublasScnrm2(int n, const cuComplex *x, int incx) {
-  using FuncPtr = float(CUBLASWINAPI *)(int, const cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasScnrm2");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasScnrm2");
-  return func_ptr(n, x, incx);
-}
-
-double CUBLASWINAPI cublasDznrm2(int n, const cuDoubleComplex *x, int incx) {
-  using FuncPtr = double(CUBLASWINAPI *)(int, const cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDznrm2");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasDznrm2");
-  return func_ptr(n, x, incx);
-}
-
-float CUBLASWINAPI cublasSdot(int n, const float *x, int incx, const float *y,
-                              int incy) {
-  using FuncPtr =
-      float(CUBLASWINAPI *)(int, const float *, int, const float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSdot");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasSdot");
-  return func_ptr(n, x, incx, y, incy);
-}
-
-double CUBLASWINAPI cublasDdot(int n, const double *x, int incx,
-                               const double *y, int incy) {
-  using FuncPtr =
-      double(CUBLASWINAPI *)(int, const double *, int, const double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDdot");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasDdot");
-  return func_ptr(n, x, incx, y, incy);
-}
-
-cuComplex CUBLASWINAPI cublasCdotu(int n, const cuComplex *x, int incx,
-                                   const cuComplex *y, int incy) {
-  using FuncPtr = cuComplex(CUBLASWINAPI *)(int, const cuComplex *, int,
-                                            const cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCdotu");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasCdotu");
-  return func_ptr(n, x, incx, y, incy);
-}
-
-cuComplex CUBLASWINAPI cublasCdotc(int n, const cuComplex *x, int incx,
-                                   const cuComplex *y, int incy) {
-  using FuncPtr = cuComplex(CUBLASWINAPI *)(int, const cuComplex *, int,
-                                            const cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCdotc");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasCdotc");
-  return func_ptr(n, x, incx, y, incy);
-}
-
-cuDoubleComplex CUBLASWINAPI cublasZdotu(int n, const cuDoubleComplex *x,
-                                         int incx, const cuDoubleComplex *y,
-                                         int incy) {
-  using FuncPtr = cuDoubleComplex(CUBLASWINAPI *)(
-      int, const cuDoubleComplex *, int, const cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZdotu");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasZdotu");
-  return func_ptr(n, x, incx, y, incy);
-}
-
-cuDoubleComplex CUBLASWINAPI cublasZdotc(int n, const cuDoubleComplex *x,
-                                         int incx, const cuDoubleComplex *y,
-                                         int incy) {
-  using FuncPtr = cuDoubleComplex(CUBLASWINAPI *)(
-      int, const cuDoubleComplex *, int, const cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZdotc");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasZdotc");
-  return func_ptr(n, x, incx, y, incy);
-}
-
-void CUBLASWINAPI cublasSscal(int n, float alpha, float *x, int incx) {
-  using FuncPtr = void(CUBLASWINAPI *)(int, float, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSscal");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasSscal");
-  return func_ptr(n, alpha, x, incx);
-}
-
-void CUBLASWINAPI cublasDscal(int n, double alpha, double *x, int incx) {
-  using FuncPtr = void(CUBLASWINAPI *)(int, double, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDscal");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasDscal");
-  return func_ptr(n, alpha, x, incx);
-}
-
-void CUBLASWINAPI cublasCscal(int n, cuComplex alpha, cuComplex *x, int incx) {
-  using FuncPtr = void(CUBLASWINAPI *)(int, cuComplex, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCscal");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasCscal");
-  return func_ptr(n, alpha, x, incx);
-}
-
-void CUBLASWINAPI cublasZscal(int n, cuDoubleComplex alpha, cuDoubleComplex *x,
-                              int incx) {
-  using FuncPtr =
-      void(CUBLASWINAPI *)(int, cuDoubleComplex, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZscal");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasZscal");
-  return func_ptr(n, alpha, x, incx);
-}
-
-void CUBLASWINAPI cublasCsscal(int n, float alpha, cuComplex *x, int incx) {
-  using FuncPtr = void(CUBLASWINAPI *)(int, float, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCsscal");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasCsscal");
-  return func_ptr(n, alpha, x, incx);
-}
-
-void CUBLASWINAPI cublasZdscal(int n, double alpha, cuDoubleComplex *x,
-                               int incx) {
-  using FuncPtr = void(CUBLASWINAPI *)(int, double, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZdscal");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasZdscal");
-  return func_ptr(n, alpha, x, incx);
-}
-
-void CUBLASWINAPI cublasSaxpy(int n, float alpha, const float *x, int incx,
-                              float *y, int incy) {
-  using FuncPtr =
-      void(CUBLASWINAPI *)(int, float, const float *, int, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSaxpy");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasSaxpy");
-  return func_ptr(n, alpha, x, incx, y, incy);
-}
-
-void CUBLASWINAPI cublasDaxpy(int n, double alpha, const double *x, int incx,
-                              double *y, int incy) {
-  using FuncPtr =
-      void(CUBLASWINAPI *)(int, double, const double *, int, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDaxpy");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasDaxpy");
-  return func_ptr(n, alpha, x, incx, y, incy);
-}
-
-void CUBLASWINAPI cublasCaxpy(int n, cuComplex alpha, const cuComplex *x,
-                              int incx, cuComplex *y, int incy) {
-  using FuncPtr = void(CUBLASWINAPI *)(int, cuComplex, const cuComplex *, int,
-                                       cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCaxpy");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasCaxpy");
-  return func_ptr(n, alpha, x, incx, y, incy);
-}
-
-void CUBLASWINAPI cublasZaxpy(int n, cuDoubleComplex alpha,
-                              const cuDoubleComplex *x, int incx,
-                              cuDoubleComplex *y, int incy) {
-  using FuncPtr =
-      void(CUBLASWINAPI *)(int, cuDoubleComplex, const cuDoubleComplex *, int,
-                           cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZaxpy");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasZaxpy");
-  return func_ptr(n, alpha, x, incx, y, incy);
-}
-
-void CUBLASWINAPI cublasScopy(int n, const float *x, int incx, float *y,
-                              int incy) {
-  using FuncPtr = void(CUBLASWINAPI *)(int, const float *, int, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasScopy");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasScopy");
-  return func_ptr(n, x, incx, y, incy);
-}
-
-void CUBLASWINAPI cublasDcopy(int n, const double *x, int incx, double *y,
-                              int incy) {
-  using FuncPtr = void(CUBLASWINAPI *)(int, const double *, int, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDcopy");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasDcopy");
-  return func_ptr(n, x, incx, y, incy);
-}
-
-void CUBLASWINAPI cublasCcopy(int n, const cuComplex *x, int incx, cuComplex *y,
-                              int incy) {
-  using FuncPtr =
-      void(CUBLASWINAPI *)(int, const cuComplex *, int, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCcopy");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasCcopy");
-  return func_ptr(n, x, incx, y, incy);
-}
-
-void CUBLASWINAPI cublasZcopy(int n, const cuDoubleComplex *x, int incx,
-                              cuDoubleComplex *y, int incy) {
-  using FuncPtr = void(CUBLASWINAPI *)(int, const cuDoubleComplex *, int,
-                                       cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZcopy");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasZcopy");
-  return func_ptr(n, x, incx, y, incy);
-}
-
-void CUBLASWINAPI cublasSswap(int n, float *x, int incx, float *y, int incy) {
-  using FuncPtr = void(CUBLASWINAPI *)(int, float *, int, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSswap");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasSswap");
-  return func_ptr(n, x, incx, y, incy);
-}
-
-void CUBLASWINAPI cublasDswap(int n, double *x, int incx, double *y, int incy) {
-  using FuncPtr = void(CUBLASWINAPI *)(int, double *, int, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDswap");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasDswap");
-  return func_ptr(n, x, incx, y, incy);
-}
-
-void CUBLASWINAPI cublasCswap(int n, cuComplex *x, int incx, cuComplex *y,
-                              int incy) {
-  using FuncPtr = void(CUBLASWINAPI *)(int, cuComplex *, int, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCswap");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasCswap");
-  return func_ptr(n, x, incx, y, incy);
-}
-
-void CUBLASWINAPI cublasZswap(int n, cuDoubleComplex *x, int incx,
-                              cuDoubleComplex *y, int incy) {
-  using FuncPtr =
-      void(CUBLASWINAPI *)(int, cuDoubleComplex *, int, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZswap");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasZswap");
-  return func_ptr(n, x, incx, y, incy);
-}
-
-int CUBLASWINAPI cublasIsamax(int n, const float *x, int incx) {
-  using FuncPtr = int(CUBLASWINAPI *)(int, const float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasIsamax");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasIsamax");
-  return func_ptr(n, x, incx);
-}
-
-int CUBLASWINAPI cublasIdamax(int n, const double *x, int incx) {
-  using FuncPtr = int(CUBLASWINAPI *)(int, const double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasIdamax");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasIdamax");
-  return func_ptr(n, x, incx);
-}
-
-int CUBLASWINAPI cublasIcamax(int n, const cuComplex *x, int incx) {
-  using FuncPtr = int(CUBLASWINAPI *)(int, const cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasIcamax");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasIcamax");
-  return func_ptr(n, x, incx);
-}
-
-int CUBLASWINAPI cublasIzamax(int n, const cuDoubleComplex *x, int incx) {
-  using FuncPtr = int(CUBLASWINAPI *)(int, const cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasIzamax");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasIzamax");
-  return func_ptr(n, x, incx);
-}
-
-int CUBLASWINAPI cublasIsamin(int n, const float *x, int incx) {
-  using FuncPtr = int(CUBLASWINAPI *)(int, const float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasIsamin");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasIsamin");
-  return func_ptr(n, x, incx);
-}
-
-int CUBLASWINAPI cublasIdamin(int n, const double *x, int incx) {
-  using FuncPtr = int(CUBLASWINAPI *)(int, const double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasIdamin");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasIdamin");
-  return func_ptr(n, x, incx);
-}
-
-int CUBLASWINAPI cublasIcamin(int n, const cuComplex *x, int incx) {
-  using FuncPtr = int(CUBLASWINAPI *)(int, const cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasIcamin");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasIcamin");
-  return func_ptr(n, x, incx);
-}
-
-int CUBLASWINAPI cublasIzamin(int n, const cuDoubleComplex *x, int incx) {
-  using FuncPtr = int(CUBLASWINAPI *)(int, const cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasIzamin");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasIzamin");
-  return func_ptr(n, x, incx);
-}
-
-float CUBLASWINAPI cublasSasum(int n, const float *x, int incx) {
-  using FuncPtr = float(CUBLASWINAPI *)(int, const float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSasum");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasSasum");
-  return func_ptr(n, x, incx);
-}
-
-double CUBLASWINAPI cublasDasum(int n, const double *x, int incx) {
-  using FuncPtr = double(CUBLASWINAPI *)(int, const double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDasum");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasDasum");
-  return func_ptr(n, x, incx);
-}
-
-float CUBLASWINAPI cublasScasum(int n, const cuComplex *x, int incx) {
-  using FuncPtr = float(CUBLASWINAPI *)(int, const cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasScasum");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasScasum");
-  return func_ptr(n, x, incx);
-}
-
-double CUBLASWINAPI cublasDzasum(int n, const cuDoubleComplex *x, int incx) {
-  using FuncPtr = double(CUBLASWINAPI *)(int, const cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDzasum");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasDzasum");
-  return func_ptr(n, x, incx);
-}
-
-void CUBLASWINAPI cublasSrot(int n, float *x, int incx, float *y, int incy,
-                             float sc, float ss) {
-  using FuncPtr =
-      void(CUBLASWINAPI *)(int, float *, int, float *, int, float, float);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSrot");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasSrot");
-  return func_ptr(n, x, incx, y, incy, sc, ss);
-}
-
-void CUBLASWINAPI cublasDrot(int n, double *x, int incx, double *y, int incy,
-                             double sc, double ss) {
-  using FuncPtr =
-      void(CUBLASWINAPI *)(int, double *, int, double *, int, double, double);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDrot");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasDrot");
-  return func_ptr(n, x, incx, y, incy, sc, ss);
-}
-
-void CUBLASWINAPI cublasCrot(int n, cuComplex *x, int incx, cuComplex *y,
-                             int incy, float c, cuComplex s) {
-  using FuncPtr = void(CUBLASWINAPI *)(int, cuComplex *, int, cuComplex *, int,
-                                       float, cuComplex);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCrot");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasCrot");
-  return func_ptr(n, x, incx, y, incy, c, s);
-}
-
-void CUBLASWINAPI cublasZrot(int n, cuDoubleComplex *x, int incx,
-                             cuDoubleComplex *y, int incy, double sc,
-                             cuDoubleComplex cs) {
-  using FuncPtr =
-      void(CUBLASWINAPI *)(int, cuDoubleComplex *, int, cuDoubleComplex *, int,
-                           double, cuDoubleComplex);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZrot");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasZrot");
-  return func_ptr(n, x, incx, y, incy, sc, cs);
-}
-
-void CUBLASWINAPI cublasCsrot(int n, cuComplex *x, int incx, cuComplex *y,
-                              int incy, float c, float s) {
-  using FuncPtr = void(CUBLASWINAPI *)(int, cuComplex *, int, cuComplex *, int,
-                                       float, float);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCsrot");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasCsrot");
-  return func_ptr(n, x, incx, y, incy, c, s);
-}
-
-void CUBLASWINAPI cublasZdrot(int n, cuDoubleComplex *x, int incx,
-                              cuDoubleComplex *y, int incy, double c,
-                              double s) {
-  using FuncPtr = void(CUBLASWINAPI *)(int, cuDoubleComplex *, int,
-                                       cuDoubleComplex *, int, double, double);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZdrot");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasZdrot");
-  return func_ptr(n, x, incx, y, incy, c, s);
-}
-
-void CUBLASWINAPI cublasSrotg(float *sa, float *sb, float *sc, float *ss) {
-  using FuncPtr = void(CUBLASWINAPI *)(float *, float *, float *, float *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSrotg");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasSrotg");
-  return func_ptr(sa, sb, sc, ss);
-}
-
-void CUBLASWINAPI cublasDrotg(double *sa, double *sb, double *sc, double *ss) {
-  using FuncPtr = void(CUBLASWINAPI *)(double *, double *, double *, double *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDrotg");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasDrotg");
-  return func_ptr(sa, sb, sc, ss);
-}
-
-void CUBLASWINAPI cublasCrotg(cuComplex *ca, cuComplex cb, float *sc,
-                              cuComplex *cs) {
-  using FuncPtr =
-      void(CUBLASWINAPI *)(cuComplex *, cuComplex, float *, cuComplex *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCrotg");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasCrotg");
-  return func_ptr(ca, cb, sc, cs);
-}
-
-void CUBLASWINAPI cublasZrotg(cuDoubleComplex *ca, cuDoubleComplex cb,
-                              double *sc, cuDoubleComplex *cs) {
-  using FuncPtr = void(CUBLASWINAPI *)(cuDoubleComplex *, cuDoubleComplex,
-                                       double *, cuDoubleComplex *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZrotg");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasZrotg");
-  return func_ptr(ca, cb, sc, cs);
-}
-
-void CUBLASWINAPI cublasSrotm(int n, float *x, int incx, float *y, int incy,
-                              const float *sparam) {
-  using FuncPtr =
-      void(CUBLASWINAPI *)(int, float *, int, float *, int, const float *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSrotm");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasSrotm");
-  return func_ptr(n, x, incx, y, incy, sparam);
-}
-
-void CUBLASWINAPI cublasDrotm(int n, double *x, int incx, double *y, int incy,
-                              const double *sparam) {
-  using FuncPtr =
-      void(CUBLASWINAPI *)(int, double *, int, double *, int, const double *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDrotm");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasDrotm");
-  return func_ptr(n, x, incx, y, incy, sparam);
-}
-
-void CUBLASWINAPI cublasSrotmg(float *sd1, float *sd2, float *sx1,
-                               const float *sy1, float *sparam) {
-  using FuncPtr =
-      void(CUBLASWINAPI *)(float *, float *, float *, const float *, float *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSrotmg");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasSrotmg");
-  return func_ptr(sd1, sd2, sx1, sy1, sparam);
-}
-
-void CUBLASWINAPI cublasDrotmg(double *sd1, double *sd2, double *sx1,
-                               const double *sy1, double *sparam) {
-  using FuncPtr = void(CUBLASWINAPI *)(double *, double *, double *,
-                                       const double *, double *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDrotmg");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasDrotmg");
-  return func_ptr(sd1, sd2, sx1, sy1, sparam);
-}
-
-void CUBLASWINAPI cublasSgemv(char trans, int m, int n, float alpha,
-                              const float *A, int lda, const float *x, int incx,
-                              float beta, float *y, int incy) {
-  using FuncPtr =
-      void(CUBLASWINAPI *)(char, int, int, float, const float *, int,
-                           const float *, int, float, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSgemv");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasSgemv");
-  return func_ptr(trans, m, n, alpha, A, lda, x, incx, beta, y, incy);
-}
-
-void CUBLASWINAPI cublasDgemv(char trans, int m, int n, double alpha,
-                              const double *A, int lda, const double *x,
-                              int incx, double beta, double *y, int incy) {
-  using FuncPtr =
-      void(CUBLASWINAPI *)(char, int, int, double, const double *, int,
-                           const double *, int, double, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDgemv");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasDgemv");
-  return func_ptr(trans, m, n, alpha, A, lda, x, incx, beta, y, incy);
-}
-
-void CUBLASWINAPI cublasCgemv(char trans, int m, int n, cuComplex alpha,
-                              const cuComplex *A, int lda, const cuComplex *x,
-                              int incx, cuComplex beta, cuComplex *y,
-                              int incy) {
-  using FuncPtr =
-      void(CUBLASWINAPI *)(char, int, int, cuComplex, const cuComplex *, int,
-                           const cuComplex *, int, cuComplex, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCgemv");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasCgemv");
-  return func_ptr(trans, m, n, alpha, A, lda, x, incx, beta, y, incy);
-}
-
-void CUBLASWINAPI cublasZgemv(char trans, int m, int n, cuDoubleComplex alpha,
-                              const cuDoubleComplex *A, int lda,
-                              const cuDoubleComplex *x, int incx,
-                              cuDoubleComplex beta, cuDoubleComplex *y,
-                              int incy) {
-  using FuncPtr = void(CUBLASWINAPI *)(
-      char, int, int, cuDoubleComplex, const cuDoubleComplex *, int,
-      const cuDoubleComplex *, int, cuDoubleComplex, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZgemv");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasZgemv");
-  return func_ptr(trans, m, n, alpha, A, lda, x, incx, beta, y, incy);
-}
-
-void CUBLASWINAPI cublasSgbmv(char trans, int m, int n, int kl, int ku,
-                              float alpha, const float *A, int lda,
-                              const float *x, int incx, float beta, float *y,
-                              int incy) {
-  using FuncPtr =
-      void(CUBLASWINAPI *)(char, int, int, int, int, float, const float *, int,
-                           const float *, int, float, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSgbmv");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasSgbmv");
-  return func_ptr(trans, m, n, kl, ku, alpha, A, lda, x, incx, beta, y, incy);
-}
-
-void CUBLASWINAPI cublasDgbmv(char trans, int m, int n, int kl, int ku,
-                              double alpha, const double *A, int lda,
-                              const double *x, int incx, double beta, double *y,
-                              int incy) {
-  using FuncPtr =
-      void(CUBLASWINAPI *)(char, int, int, int, int, double, const double *,
-                           int, const double *, int, double, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDgbmv");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasDgbmv");
-  return func_ptr(trans, m, n, kl, ku, alpha, A, lda, x, incx, beta, y, incy);
-}
-
-void CUBLASWINAPI cublasCgbmv(char trans, int m, int n, int kl, int ku,
-                              cuComplex alpha, const cuComplex *A, int lda,
-                              const cuComplex *x, int incx, cuComplex beta,
-                              cuComplex *y, int incy) {
-  using FuncPtr = void(CUBLASWINAPI *)(
-      char, int, int, int, int, cuComplex, const cuComplex *, int,
-      const cuComplex *, int, cuComplex, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCgbmv");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasCgbmv");
-  return func_ptr(trans, m, n, kl, ku, alpha, A, lda, x, incx, beta, y, incy);
-}
-
-void CUBLASWINAPI cublasZgbmv(char trans, int m, int n, int kl, int ku,
-                              cuDoubleComplex alpha, const cuDoubleComplex *A,
-                              int lda, const cuDoubleComplex *x, int incx,
-                              cuDoubleComplex beta, cuDoubleComplex *y,
-                              int incy) {
-  using FuncPtr = void(CUBLASWINAPI *)(
-      char, int, int, int, int, cuDoubleComplex, const cuDoubleComplex *, int,
-      const cuDoubleComplex *, int, cuDoubleComplex, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZgbmv");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasZgbmv");
-  return func_ptr(trans, m, n, kl, ku, alpha, A, lda, x, incx, beta, y, incy);
-}
-
-void CUBLASWINAPI cublasStrmv(char uplo, char trans, char diag, int n,
-                              const float *A, int lda, float *x, int incx) {
-  using FuncPtr = void(CUBLASWINAPI *)(char, char, char, int, const float *,
-                                       int, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasStrmv");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasStrmv");
-  return func_ptr(uplo, trans, diag, n, A, lda, x, incx);
-}
-
-void CUBLASWINAPI cublasDtrmv(char uplo, char trans, char diag, int n,
-                              const double *A, int lda, double *x, int incx) {
-  using FuncPtr = void(CUBLASWINAPI *)(char, char, char, int, const double *,
-                                       int, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDtrmv");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasDtrmv");
-  return func_ptr(uplo, trans, diag, n, A, lda, x, incx);
-}
-
-void CUBLASWINAPI cublasCtrmv(char uplo, char trans, char diag, int n,
-                              const cuComplex *A, int lda, cuComplex *x,
-                              int incx) {
-  using FuncPtr = void(CUBLASWINAPI *)(char, char, char, int, const cuComplex *,
-                                       int, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCtrmv");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasCtrmv");
-  return func_ptr(uplo, trans, diag, n, A, lda, x, incx);
-}
-
-void CUBLASWINAPI cublasZtrmv(char uplo, char trans, char diag, int n,
-                              const cuDoubleComplex *A, int lda,
-                              cuDoubleComplex *x, int incx) {
-  using FuncPtr =
-      void(CUBLASWINAPI *)(char, char, char, int, const cuDoubleComplex *, int,
-                           cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZtrmv");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasZtrmv");
-  return func_ptr(uplo, trans, diag, n, A, lda, x, incx);
-}
-
-void CUBLASWINAPI cublasStbmv(char uplo, char trans, char diag, int n, int k,
-                              const float *A, int lda, float *x, int incx) {
-  using FuncPtr = void(CUBLASWINAPI *)(char, char, char, int, int,
-                                       const float *, int, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasStbmv");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasStbmv");
-  return func_ptr(uplo, trans, diag, n, k, A, lda, x, incx);
-}
-
-void CUBLASWINAPI cublasDtbmv(char uplo, char trans, char diag, int n, int k,
-                              const double *A, int lda, double *x, int incx) {
-  using FuncPtr = void(CUBLASWINAPI *)(char, char, char, int, int,
-                                       const double *, int, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDtbmv");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasDtbmv");
-  return func_ptr(uplo, trans, diag, n, k, A, lda, x, incx);
-}
-
-void CUBLASWINAPI cublasCtbmv(char uplo, char trans, char diag, int n, int k,
-                              const cuComplex *A, int lda, cuComplex *x,
-                              int incx) {
-  using FuncPtr = void(CUBLASWINAPI *)(
-      char, char, char, int, int, const cuComplex *, int, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCtbmv");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasCtbmv");
-  return func_ptr(uplo, trans, diag, n, k, A, lda, x, incx);
-}
-
-void CUBLASWINAPI cublasZtbmv(char uplo, char trans, char diag, int n, int k,
-                              const cuDoubleComplex *A, int lda,
-                              cuDoubleComplex *x, int incx) {
-  using FuncPtr =
-      void(CUBLASWINAPI *)(char, char, char, int, int, const cuDoubleComplex *,
-                           int, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZtbmv");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasZtbmv");
-  return func_ptr(uplo, trans, diag, n, k, A, lda, x, incx);
-}
-
-void CUBLASWINAPI cublasStpmv(char uplo, char trans, char diag, int n,
-                              const float *AP, float *x, int incx) {
-  using FuncPtr =
-      void(CUBLASWINAPI *)(char, char, char, int, const float *, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasStpmv");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasStpmv");
-  return func_ptr(uplo, trans, diag, n, AP, x, incx);
-}
-
-void CUBLASWINAPI cublasDtpmv(char uplo, char trans, char diag, int n,
-                              const double *AP, double *x, int incx) {
-  using FuncPtr = void(CUBLASWINAPI *)(char, char, char, int, const double *,
-                                       double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDtpmv");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasDtpmv");
-  return func_ptr(uplo, trans, diag, n, AP, x, incx);
-}
-
-void CUBLASWINAPI cublasCtpmv(char uplo, char trans, char diag, int n,
-                              const cuComplex *AP, cuComplex *x, int incx) {
-  using FuncPtr = void(CUBLASWINAPI *)(char, char, char, int, const cuComplex *,
-                                       cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCtpmv");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasCtpmv");
-  return func_ptr(uplo, trans, diag, n, AP, x, incx);
-}
-
-void CUBLASWINAPI cublasZtpmv(char uplo, char trans, char diag, int n,
-                              const cuDoubleComplex *AP, cuDoubleComplex *x,
-                              int incx) {
-  using FuncPtr = void(CUBLASWINAPI *)(
-      char, char, char, int, const cuDoubleComplex *, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZtpmv");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasZtpmv");
-  return func_ptr(uplo, trans, diag, n, AP, x, incx);
-}
-
-void CUBLASWINAPI cublasStrsv(char uplo, char trans, char diag, int n,
-                              const float *A, int lda, float *x, int incx) {
-  using FuncPtr = void(CUBLASWINAPI *)(char, char, char, int, const float *,
-                                       int, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasStrsv");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasStrsv");
-  return func_ptr(uplo, trans, diag, n, A, lda, x, incx);
-}
-
-void CUBLASWINAPI cublasDtrsv(char uplo, char trans, char diag, int n,
-                              const double *A, int lda, double *x, int incx) {
-  using FuncPtr = void(CUBLASWINAPI *)(char, char, char, int, const double *,
-                                       int, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDtrsv");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasDtrsv");
-  return func_ptr(uplo, trans, diag, n, A, lda, x, incx);
-}
-
-void CUBLASWINAPI cublasCtrsv(char uplo, char trans, char diag, int n,
-                              const cuComplex *A, int lda, cuComplex *x,
-                              int incx) {
-  using FuncPtr = void(CUBLASWINAPI *)(char, char, char, int, const cuComplex *,
-                                       int, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCtrsv");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasCtrsv");
-  return func_ptr(uplo, trans, diag, n, A, lda, x, incx);
-}
-
-void CUBLASWINAPI cublasZtrsv(char uplo, char trans, char diag, int n,
-                              const cuDoubleComplex *A, int lda,
-                              cuDoubleComplex *x, int incx) {
-  using FuncPtr =
-      void(CUBLASWINAPI *)(char, char, char, int, const cuDoubleComplex *, int,
-                           cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZtrsv");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasZtrsv");
-  return func_ptr(uplo, trans, diag, n, A, lda, x, incx);
-}
-
-void CUBLASWINAPI cublasStpsv(char uplo, char trans, char diag, int n,
-                              const float *AP, float *x, int incx) {
-  using FuncPtr =
-      void(CUBLASWINAPI *)(char, char, char, int, const float *, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasStpsv");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasStpsv");
-  return func_ptr(uplo, trans, diag, n, AP, x, incx);
-}
-
-void CUBLASWINAPI cublasDtpsv(char uplo, char trans, char diag, int n,
-                              const double *AP, double *x, int incx) {
-  using FuncPtr = void(CUBLASWINAPI *)(char, char, char, int, const double *,
-                                       double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDtpsv");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasDtpsv");
-  return func_ptr(uplo, trans, diag, n, AP, x, incx);
-}
-
-void CUBLASWINAPI cublasCtpsv(char uplo, char trans, char diag, int n,
-                              const cuComplex *AP, cuComplex *x, int incx) {
-  using FuncPtr = void(CUBLASWINAPI *)(char, char, char, int, const cuComplex *,
-                                       cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCtpsv");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasCtpsv");
-  return func_ptr(uplo, trans, diag, n, AP, x, incx);
-}
-
-void CUBLASWINAPI cublasZtpsv(char uplo, char trans, char diag, int n,
-                              const cuDoubleComplex *AP, cuDoubleComplex *x,
-                              int incx) {
-  using FuncPtr = void(CUBLASWINAPI *)(
-      char, char, char, int, const cuDoubleComplex *, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZtpsv");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasZtpsv");
-  return func_ptr(uplo, trans, diag, n, AP, x, incx);
-}
-
-void CUBLASWINAPI cublasStbsv(char uplo, char trans, char diag, int n, int k,
-                              const float *A, int lda, float *x, int incx) {
-  using FuncPtr = void(CUBLASWINAPI *)(char, char, char, int, int,
-                                       const float *, int, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasStbsv");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasStbsv");
-  return func_ptr(uplo, trans, diag, n, k, A, lda, x, incx);
-}
-
-void CUBLASWINAPI cublasDtbsv(char uplo, char trans, char diag, int n, int k,
-                              const double *A, int lda, double *x, int incx) {
-  using FuncPtr = void(CUBLASWINAPI *)(char, char, char, int, int,
-                                       const double *, int, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDtbsv");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasDtbsv");
-  return func_ptr(uplo, trans, diag, n, k, A, lda, x, incx);
-}
-
-void CUBLASWINAPI cublasCtbsv(char uplo, char trans, char diag, int n, int k,
-                              const cuComplex *A, int lda, cuComplex *x,
-                              int incx) {
-  using FuncPtr = void(CUBLASWINAPI *)(
-      char, char, char, int, int, const cuComplex *, int, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCtbsv");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasCtbsv");
-  return func_ptr(uplo, trans, diag, n, k, A, lda, x, incx);
-}
-
-void CUBLASWINAPI cublasZtbsv(char uplo, char trans, char diag, int n, int k,
-                              const cuDoubleComplex *A, int lda,
-                              cuDoubleComplex *x, int incx) {
-  using FuncPtr =
-      void(CUBLASWINAPI *)(char, char, char, int, int, const cuDoubleComplex *,
-                           int, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZtbsv");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasZtbsv");
-  return func_ptr(uplo, trans, diag, n, k, A, lda, x, incx);
-}
-
-void CUBLASWINAPI cublasSsymv(char uplo, int n, float alpha, const float *A,
-                              int lda, const float *x, int incx, float beta,
-                              float *y, int incy) {
-  using FuncPtr = void(CUBLASWINAPI *)(char, int, float, const float *, int,
-                                       const float *, int, float, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSsymv");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasSsymv");
-  return func_ptr(uplo, n, alpha, A, lda, x, incx, beta, y, incy);
-}
-
-void CUBLASWINAPI cublasDsymv(char uplo, int n, double alpha, const double *A,
-                              int lda, const double *x, int incx, double beta,
-                              double *y, int incy) {
-  using FuncPtr =
-      void(CUBLASWINAPI *)(char, int, double, const double *, int,
-                           const double *, int, double, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDsymv");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasDsymv");
-  return func_ptr(uplo, n, alpha, A, lda, x, incx, beta, y, incy);
-}
-
-void CUBLASWINAPI cublasChemv(char uplo, int n, cuComplex alpha,
-                              const cuComplex *A, int lda, const cuComplex *x,
-                              int incx, cuComplex beta, cuComplex *y,
-                              int incy) {
-  using FuncPtr =
-      void(CUBLASWINAPI *)(char, int, cuComplex, const cuComplex *, int,
-                           const cuComplex *, int, cuComplex, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasChemv");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasChemv");
-  return func_ptr(uplo, n, alpha, A, lda, x, incx, beta, y, incy);
-}
-
-void CUBLASWINAPI cublasZhemv(char uplo, int n, cuDoubleComplex alpha,
-                              const cuDoubleComplex *A, int lda,
-                              const cuDoubleComplex *x, int incx,
-                              cuDoubleComplex beta, cuDoubleComplex *y,
-                              int incy) {
-  using FuncPtr = void(CUBLASWINAPI *)(
-      char, int, cuDoubleComplex, const cuDoubleComplex *, int,
-      const cuDoubleComplex *, int, cuDoubleComplex, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZhemv");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasZhemv");
-  return func_ptr(uplo, n, alpha, A, lda, x, incx, beta, y, incy);
-}
-
-void CUBLASWINAPI cublasSsbmv(char uplo, int n, int k, float alpha,
-                              const float *A, int lda, const float *x, int incx,
-                              float beta, float *y, int incy) {
-  using FuncPtr =
-      void(CUBLASWINAPI *)(char, int, int, float, const float *, int,
-                           const float *, int, float, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSsbmv");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasSsbmv");
-  return func_ptr(uplo, n, k, alpha, A, lda, x, incx, beta, y, incy);
-}
-
-void CUBLASWINAPI cublasDsbmv(char uplo, int n, int k, double alpha,
-                              const double *A, int lda, const double *x,
-                              int incx, double beta, double *y, int incy) {
-  using FuncPtr =
-      void(CUBLASWINAPI *)(char, int, int, double, const double *, int,
-                           const double *, int, double, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDsbmv");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasDsbmv");
-  return func_ptr(uplo, n, k, alpha, A, lda, x, incx, beta, y, incy);
-}
-
-void CUBLASWINAPI cublasChbmv(char uplo, int n, int k, cuComplex alpha,
-                              const cuComplex *A, int lda, const cuComplex *x,
-                              int incx, cuComplex beta, cuComplex *y,
-                              int incy) {
-  using FuncPtr =
-      void(CUBLASWINAPI *)(char, int, int, cuComplex, const cuComplex *, int,
-                           const cuComplex *, int, cuComplex, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasChbmv");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasChbmv");
-  return func_ptr(uplo, n, k, alpha, A, lda, x, incx, beta, y, incy);
-}
-
-void CUBLASWINAPI cublasZhbmv(char uplo, int n, int k, cuDoubleComplex alpha,
-                              const cuDoubleComplex *A, int lda,
-                              const cuDoubleComplex *x, int incx,
-                              cuDoubleComplex beta, cuDoubleComplex *y,
-                              int incy) {
-  using FuncPtr = void(CUBLASWINAPI *)(
-      char, int, int, cuDoubleComplex, const cuDoubleComplex *, int,
-      const cuDoubleComplex *, int, cuDoubleComplex, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZhbmv");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasZhbmv");
-  return func_ptr(uplo, n, k, alpha, A, lda, x, incx, beta, y, incy);
-}
-
-void CUBLASWINAPI cublasSspmv(char uplo, int n, float alpha, const float *AP,
-                              const float *x, int incx, float beta, float *y,
-                              int incy) {
-  using FuncPtr = void(CUBLASWINAPI *)(char, int, float, const float *,
-                                       const float *, int, float, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSspmv");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasSspmv");
-  return func_ptr(uplo, n, alpha, AP, x, incx, beta, y, incy);
-}
-
-void CUBLASWINAPI cublasDspmv(char uplo, int n, double alpha, const double *AP,
-                              const double *x, int incx, double beta, double *y,
-                              int incy) {
-  using FuncPtr =
-      void(CUBLASWINAPI *)(char, int, double, const double *, const double *,
-                           int, double, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDspmv");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasDspmv");
-  return func_ptr(uplo, n, alpha, AP, x, incx, beta, y, incy);
-}
-
-void CUBLASWINAPI cublasChpmv(char uplo, int n, cuComplex alpha,
-                              const cuComplex *AP, const cuComplex *x, int incx,
-                              cuComplex beta, cuComplex *y, int incy) {
-  using FuncPtr =
-      void(CUBLASWINAPI *)(char, int, cuComplex, const cuComplex *,
-                           const cuComplex *, int, cuComplex, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasChpmv");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasChpmv");
-  return func_ptr(uplo, n, alpha, AP, x, incx, beta, y, incy);
-}
-
-void CUBLASWINAPI cublasZhpmv(char uplo, int n, cuDoubleComplex alpha,
-                              const cuDoubleComplex *AP,
-                              const cuDoubleComplex *x, int incx,
-                              cuDoubleComplex beta, cuDoubleComplex *y,
-                              int incy) {
-  using FuncPtr = void(CUBLASWINAPI *)(
-      char, int, cuDoubleComplex, const cuDoubleComplex *,
-      const cuDoubleComplex *, int, cuDoubleComplex, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZhpmv");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasZhpmv");
-  return func_ptr(uplo, n, alpha, AP, x, incx, beta, y, incy);
-}
-
-void CUBLASWINAPI cublasSger(int m, int n, float alpha, const float *x,
-                             int incx, const float *y, int incy, float *A,
-                             int lda) {
-  using FuncPtr = void(CUBLASWINAPI *)(int, int, float, const float *, int,
-                                       const float *, int, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSger");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasSger");
-  return func_ptr(m, n, alpha, x, incx, y, incy, A, lda);
-}
-
-void CUBLASWINAPI cublasDger(int m, int n, double alpha, const double *x,
-                             int incx, const double *y, int incy, double *A,
-                             int lda) {
-  using FuncPtr = void(CUBLASWINAPI *)(int, int, double, const double *, int,
-                                       const double *, int, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDger");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasDger");
-  return func_ptr(m, n, alpha, x, incx, y, incy, A, lda);
-}
-
-void CUBLASWINAPI cublasCgeru(int m, int n, cuComplex alpha, const cuComplex *x,
-                              int incx, const cuComplex *y, int incy,
-                              cuComplex *A, int lda) {
-  using FuncPtr =
-      void(CUBLASWINAPI *)(int, int, cuComplex, const cuComplex *, int,
-                           const cuComplex *, int, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCgeru");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasCgeru");
-  return func_ptr(m, n, alpha, x, incx, y, incy, A, lda);
-}
-
-void CUBLASWINAPI cublasCgerc(int m, int n, cuComplex alpha, const cuComplex *x,
-                              int incx, const cuComplex *y, int incy,
-                              cuComplex *A, int lda) {
-  using FuncPtr =
-      void(CUBLASWINAPI *)(int, int, cuComplex, const cuComplex *, int,
-                           const cuComplex *, int, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCgerc");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasCgerc");
-  return func_ptr(m, n, alpha, x, incx, y, incy, A, lda);
-}
-
-void CUBLASWINAPI cublasZgeru(int m, int n, cuDoubleComplex alpha,
-                              const cuDoubleComplex *x, int incx,
-                              const cuDoubleComplex *y, int incy,
-                              cuDoubleComplex *A, int lda) {
-  using FuncPtr = void(CUBLASWINAPI *)(
-      int, int, cuDoubleComplex, const cuDoubleComplex *, int,
-      const cuDoubleComplex *, int, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZgeru");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasZgeru");
-  return func_ptr(m, n, alpha, x, incx, y, incy, A, lda);
-}
-
-void CUBLASWINAPI cublasZgerc(int m, int n, cuDoubleComplex alpha,
-                              const cuDoubleComplex *x, int incx,
-                              const cuDoubleComplex *y, int incy,
-                              cuDoubleComplex *A, int lda) {
-  using FuncPtr = void(CUBLASWINAPI *)(
-      int, int, cuDoubleComplex, const cuDoubleComplex *, int,
-      const cuDoubleComplex *, int, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZgerc");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasZgerc");
-  return func_ptr(m, n, alpha, x, incx, y, incy, A, lda);
-}
-
-void CUBLASWINAPI cublasSsyr(char uplo, int n, float alpha, const float *x,
-                             int incx, float *A, int lda) {
-  using FuncPtr =
-      void(CUBLASWINAPI *)(char, int, float, const float *, int, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSsyr");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasSsyr");
-  return func_ptr(uplo, n, alpha, x, incx, A, lda);
-}
-
-void CUBLASWINAPI cublasDsyr(char uplo, int n, double alpha, const double *x,
-                             int incx, double *A, int lda) {
-  using FuncPtr = void(CUBLASWINAPI *)(char, int, double, const double *, int,
-                                       double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDsyr");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasDsyr");
-  return func_ptr(uplo, n, alpha, x, incx, A, lda);
-}
-
-void CUBLASWINAPI cublasCher(char uplo, int n, float alpha, const cuComplex *x,
-                             int incx, cuComplex *A, int lda) {
-  using FuncPtr = void(CUBLASWINAPI *)(char, int, float, const cuComplex *, int,
-                                       cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCher");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasCher");
-  return func_ptr(uplo, n, alpha, x, incx, A, lda);
-}
-
-void CUBLASWINAPI cublasZher(char uplo, int n, double alpha,
-                             const cuDoubleComplex *x, int incx,
-                             cuDoubleComplex *A, int lda) {
-  using FuncPtr = void(CUBLASWINAPI *)(
-      char, int, double, const cuDoubleComplex *, int, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZher");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasZher");
-  return func_ptr(uplo, n, alpha, x, incx, A, lda);
-}
-
-void CUBLASWINAPI cublasSspr(char uplo, int n, float alpha, const float *x,
-                             int incx, float *AP) {
-  using FuncPtr =
-      void(CUBLASWINAPI *)(char, int, float, const float *, int, float *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSspr");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasSspr");
-  return func_ptr(uplo, n, alpha, x, incx, AP);
-}
-
-void CUBLASWINAPI cublasDspr(char uplo, int n, double alpha, const double *x,
-                             int incx, double *AP) {
-  using FuncPtr =
-      void(CUBLASWINAPI *)(char, int, double, const double *, int, double *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDspr");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasDspr");
-  return func_ptr(uplo, n, alpha, x, incx, AP);
-}
-
-void CUBLASWINAPI cublasChpr(char uplo, int n, float alpha, const cuComplex *x,
-                             int incx, cuComplex *AP) {
-  using FuncPtr = void(CUBLASWINAPI *)(char, int, float, const cuComplex *, int,
-                                       cuComplex *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasChpr");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasChpr");
-  return func_ptr(uplo, n, alpha, x, incx, AP);
-}
-
-void CUBLASWINAPI cublasZhpr(char uplo, int n, double alpha,
-                             const cuDoubleComplex *x, int incx,
-                             cuDoubleComplex *AP) {
-  using FuncPtr = void(CUBLASWINAPI *)(
-      char, int, double, const cuDoubleComplex *, int, cuDoubleComplex *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZhpr");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasZhpr");
-  return func_ptr(uplo, n, alpha, x, incx, AP);
-}
-
-void CUBLASWINAPI cublasSsyr2(char uplo, int n, float alpha, const float *x,
-                              int incx, const float *y, int incy, float *A,
-                              int lda) {
-  using FuncPtr = void(CUBLASWINAPI *)(char, int, float, const float *, int,
-                                       const float *, int, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSsyr2");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasSsyr2");
-  return func_ptr(uplo, n, alpha, x, incx, y, incy, A, lda);
-}
-
-void CUBLASWINAPI cublasDsyr2(char uplo, int n, double alpha, const double *x,
-                              int incx, const double *y, int incy, double *A,
-                              int lda) {
-  using FuncPtr = void(CUBLASWINAPI *)(char, int, double, const double *, int,
-                                       const double *, int, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDsyr2");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasDsyr2");
-  return func_ptr(uplo, n, alpha, x, incx, y, incy, A, lda);
-}
-
-void CUBLASWINAPI cublasCher2(char uplo, int n, cuComplex alpha,
-                              const cuComplex *x, int incx, const cuComplex *y,
-                              int incy, cuComplex *A, int lda) {
-  using FuncPtr =
-      void(CUBLASWINAPI *)(char, int, cuComplex, const cuComplex *, int,
-                           const cuComplex *, int, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCher2");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasCher2");
-  return func_ptr(uplo, n, alpha, x, incx, y, incy, A, lda);
-}
-
-void CUBLASWINAPI cublasZher2(char uplo, int n, cuDoubleComplex alpha,
-                              const cuDoubleComplex *x, int incx,
-                              const cuDoubleComplex *y, int incy,
-                              cuDoubleComplex *A, int lda) {
-  using FuncPtr = void(CUBLASWINAPI *)(
-      char, int, cuDoubleComplex, const cuDoubleComplex *, int,
-      const cuDoubleComplex *, int, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZher2");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasZher2");
-  return func_ptr(uplo, n, alpha, x, incx, y, incy, A, lda);
-}
-
-void CUBLASWINAPI cublasSspr2(char uplo, int n, float alpha, const float *x,
-                              int incx, const float *y, int incy, float *AP) {
-  using FuncPtr = void(CUBLASWINAPI *)(char, int, float, const float *, int,
-                                       const float *, int, float *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSspr2");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasSspr2");
-  return func_ptr(uplo, n, alpha, x, incx, y, incy, AP);
-}
-
-void CUBLASWINAPI cublasDspr2(char uplo, int n, double alpha, const double *x,
-                              int incx, const double *y, int incy, double *AP) {
-  using FuncPtr = void(CUBLASWINAPI *)(char, int, double, const double *, int,
-                                       const double *, int, double *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDspr2");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasDspr2");
-  return func_ptr(uplo, n, alpha, x, incx, y, incy, AP);
-}
-
-void CUBLASWINAPI cublasChpr2(char uplo, int n, cuComplex alpha,
-                              const cuComplex *x, int incx, const cuComplex *y,
-                              int incy, cuComplex *AP) {
-  using FuncPtr =
-      void(CUBLASWINAPI *)(char, int, cuComplex, const cuComplex *, int,
-                           const cuComplex *, int, cuComplex *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasChpr2");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasChpr2");
-  return func_ptr(uplo, n, alpha, x, incx, y, incy, AP);
-}
-
-void CUBLASWINAPI cublasZhpr2(char uplo, int n, cuDoubleComplex alpha,
-                              const cuDoubleComplex *x, int incx,
-                              const cuDoubleComplex *y, int incy,
-                              cuDoubleComplex *AP) {
-  using FuncPtr = void(CUBLASWINAPI *)(
-      char, int, cuDoubleComplex, const cuDoubleComplex *, int,
-      const cuDoubleComplex *, int, cuDoubleComplex *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZhpr2");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasZhpr2");
-  return func_ptr(uplo, n, alpha, x, incx, y, incy, AP);
-}
-
-void CUBLASWINAPI cublasSgemm(char transa, char transb, int m, int n, int k,
-                              float alpha, const float *A, int lda,
-                              const float *B, int ldb, float beta, float *C,
-                              int ldc) {
-  using FuncPtr =
-      void(CUBLASWINAPI *)(char, char, int, int, int, float, const float *, int,
-                           const float *, int, float, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSgemm");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasSgemm");
-  return func_ptr(transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
-}
-
-void CUBLASWINAPI cublasDgemm(char transa, char transb, int m, int n, int k,
-                              double alpha, const double *A, int lda,
-                              const double *B, int ldb, double beta, double *C,
-                              int ldc) {
-  using FuncPtr =
-      void(CUBLASWINAPI *)(char, char, int, int, int, double, const double *,
-                           int, const double *, int, double, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDgemm");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasDgemm");
-  return func_ptr(transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
-}
-
-void CUBLASWINAPI cublasCgemm(char transa, char transb, int m, int n, int k,
-                              cuComplex alpha, const cuComplex *A, int lda,
-                              const cuComplex *B, int ldb, cuComplex beta,
-                              cuComplex *C, int ldc) {
-  using FuncPtr = void(CUBLASWINAPI *)(
-      char, char, int, int, int, cuComplex, const cuComplex *, int,
-      const cuComplex *, int, cuComplex, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCgemm");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasCgemm");
-  return func_ptr(transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
-}
-
-void CUBLASWINAPI cublasZgemm(char transa, char transb, int m, int n, int k,
-                              cuDoubleComplex alpha, const cuDoubleComplex *A,
-                              int lda, const cuDoubleComplex *B, int ldb,
-                              cuDoubleComplex beta, cuDoubleComplex *C,
-                              int ldc) {
-  using FuncPtr = void(CUBLASWINAPI *)(
-      char, char, int, int, int, cuDoubleComplex, const cuDoubleComplex *, int,
-      const cuDoubleComplex *, int, cuDoubleComplex, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZgemm");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasZgemm");
-  return func_ptr(transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
-}
-
-void CUBLASWINAPI cublasSsyrk(char uplo, char trans, int n, int k, float alpha,
-                              const float *A, int lda, float beta, float *C,
-                              int ldc) {
-  using FuncPtr = void(CUBLASWINAPI *)(char, char, int, int, float,
-                                       const float *, int, float, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSsyrk");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasSsyrk");
-  return func_ptr(uplo, trans, n, k, alpha, A, lda, beta, C, ldc);
-}
-
-void CUBLASWINAPI cublasDsyrk(char uplo, char trans, int n, int k, double alpha,
-                              const double *A, int lda, double beta, double *C,
-                              int ldc) {
-  using FuncPtr = void(CUBLASWINAPI *)(
-      char, char, int, int, double, const double *, int, double, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDsyrk");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasDsyrk");
-  return func_ptr(uplo, trans, n, k, alpha, A, lda, beta, C, ldc);
-}
-
-void CUBLASWINAPI cublasCsyrk(char uplo, char trans, int n, int k,
-                              cuComplex alpha, const cuComplex *A, int lda,
-                              cuComplex beta, cuComplex *C, int ldc) {
-  using FuncPtr =
-      void(CUBLASWINAPI *)(char, char, int, int, cuComplex, const cuComplex *,
-                           int, cuComplex, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCsyrk");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasCsyrk");
-  return func_ptr(uplo, trans, n, k, alpha, A, lda, beta, C, ldc);
-}
-
-void CUBLASWINAPI cublasZsyrk(char uplo, char trans, int n, int k,
-                              cuDoubleComplex alpha, const cuDoubleComplex *A,
-                              int lda, cuDoubleComplex beta, cuDoubleComplex *C,
-                              int ldc) {
-  using FuncPtr = void(CUBLASWINAPI *)(char, char, int, int, cuDoubleComplex,
-                                       const cuDoubleComplex *, int,
-                                       cuDoubleComplex, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZsyrk");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasZsyrk");
-  return func_ptr(uplo, trans, n, k, alpha, A, lda, beta, C, ldc);
-}
-
-void CUBLASWINAPI cublasCherk(char uplo, char trans, int n, int k, float alpha,
-                              const cuComplex *A, int lda, float beta,
-                              cuComplex *C, int ldc) {
-  using FuncPtr =
-      void(CUBLASWINAPI *)(char, char, int, int, float, const cuComplex *, int,
-                           float, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCherk");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasCherk");
-  return func_ptr(uplo, trans, n, k, alpha, A, lda, beta, C, ldc);
-}
-
-void CUBLASWINAPI cublasZherk(char uplo, char trans, int n, int k, double alpha,
-                              const cuDoubleComplex *A, int lda, double beta,
-                              cuDoubleComplex *C, int ldc) {
-  using FuncPtr = void(CUBLASWINAPI *)(char, char, int, int, double,
-                                       const cuDoubleComplex *, int, double,
-                                       cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZherk");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasZherk");
-  return func_ptr(uplo, trans, n, k, alpha, A, lda, beta, C, ldc);
-}
-
-void CUBLASWINAPI cublasSsyr2k(char uplo, char trans, int n, int k, float alpha,
-                               const float *A, int lda, const float *B, int ldb,
-                               float beta, float *C, int ldc) {
-  using FuncPtr =
-      void(CUBLASWINAPI *)(char, char, int, int, float, const float *, int,
-                           const float *, int, float, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSsyr2k");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasSsyr2k");
-  return func_ptr(uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
-}
-
-void CUBLASWINAPI cublasDsyr2k(char uplo, char trans, int n, int k,
-                               double alpha, const double *A, int lda,
-                               const double *B, int ldb, double beta, double *C,
-                               int ldc) {
-  using FuncPtr =
-      void(CUBLASWINAPI *)(char, char, int, int, double, const double *, int,
-                           const double *, int, double, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDsyr2k");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasDsyr2k");
-  return func_ptr(uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
-}
-
-void CUBLASWINAPI cublasCsyr2k(char uplo, char trans, int n, int k,
-                               cuComplex alpha, const cuComplex *A, int lda,
-                               const cuComplex *B, int ldb, cuComplex beta,
-                               cuComplex *C, int ldc) {
-  using FuncPtr = void(CUBLASWINAPI *)(
-      char, char, int, int, cuComplex, const cuComplex *, int,
-      const cuComplex *, int, cuComplex, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCsyr2k");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasCsyr2k");
-  return func_ptr(uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
-}
-
-void CUBLASWINAPI cublasZsyr2k(char uplo, char trans, int n, int k,
-                               cuDoubleComplex alpha, const cuDoubleComplex *A,
-                               int lda, const cuDoubleComplex *B, int ldb,
-                               cuDoubleComplex beta, cuDoubleComplex *C,
-                               int ldc) {
-  using FuncPtr = void(CUBLASWINAPI *)(
-      char, char, int, int, cuDoubleComplex, const cuDoubleComplex *, int,
-      const cuDoubleComplex *, int, cuDoubleComplex, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZsyr2k");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasZsyr2k");
-  return func_ptr(uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
-}
-
-void CUBLASWINAPI cublasCher2k(char uplo, char trans, int n, int k,
-                               cuComplex alpha, const cuComplex *A, int lda,
-                               const cuComplex *B, int ldb, float beta,
-                               cuComplex *C, int ldc) {
-  using FuncPtr = void(CUBLASWINAPI *)(
-      char, char, int, int, cuComplex, const cuComplex *, int,
-      const cuComplex *, int, float, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCher2k");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasCher2k");
-  return func_ptr(uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
-}
-
-void CUBLASWINAPI cublasZher2k(char uplo, char trans, int n, int k,
-                               cuDoubleComplex alpha, const cuDoubleComplex *A,
-                               int lda, const cuDoubleComplex *B, int ldb,
-                               double beta, cuDoubleComplex *C, int ldc) {
-  using FuncPtr = void(CUBLASWINAPI *)(
-      char, char, int, int, cuDoubleComplex, const cuDoubleComplex *, int,
-      const cuDoubleComplex *, int, double, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZher2k");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasZher2k");
-  return func_ptr(uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
-}
-
-void CUBLASWINAPI cublasSsymm(char side, char uplo, int m, int n, float alpha,
-                              const float *A, int lda, const float *B, int ldb,
-                              float beta, float *C, int ldc) {
-  using FuncPtr =
-      void(CUBLASWINAPI *)(char, char, int, int, float, const float *, int,
-                           const float *, int, float, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSsymm");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasSsymm");
-  return func_ptr(side, uplo, m, n, alpha, A, lda, B, ldb, beta, C, ldc);
-}
-
-void CUBLASWINAPI cublasDsymm(char side, char uplo, int m, int n, double alpha,
-                              const double *A, int lda, const double *B,
-                              int ldb, double beta, double *C, int ldc) {
-  using FuncPtr =
-      void(CUBLASWINAPI *)(char, char, int, int, double, const double *, int,
-                           const double *, int, double, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDsymm");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasDsymm");
-  return func_ptr(side, uplo, m, n, alpha, A, lda, B, ldb, beta, C, ldc);
-}
-
-void CUBLASWINAPI cublasCsymm(char side, char uplo, int m, int n,
-                              cuComplex alpha, const cuComplex *A, int lda,
-                              const cuComplex *B, int ldb, cuComplex beta,
-                              cuComplex *C, int ldc) {
-  using FuncPtr = void(CUBLASWINAPI *)(
-      char, char, int, int, cuComplex, const cuComplex *, int,
-      const cuComplex *, int, cuComplex, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCsymm");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasCsymm");
-  return func_ptr(side, uplo, m, n, alpha, A, lda, B, ldb, beta, C, ldc);
-}
-
-void CUBLASWINAPI cublasZsymm(char side, char uplo, int m, int n,
-                              cuDoubleComplex alpha, const cuDoubleComplex *A,
-                              int lda, const cuDoubleComplex *B, int ldb,
-                              cuDoubleComplex beta, cuDoubleComplex *C,
-                              int ldc) {
-  using FuncPtr = void(CUBLASWINAPI *)(
-      char, char, int, int, cuDoubleComplex, const cuDoubleComplex *, int,
-      const cuDoubleComplex *, int, cuDoubleComplex, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZsymm");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasZsymm");
-  return func_ptr(side, uplo, m, n, alpha, A, lda, B, ldb, beta, C, ldc);
-}
-
-void CUBLASWINAPI cublasChemm(char side, char uplo, int m, int n,
-                              cuComplex alpha, const cuComplex *A, int lda,
-                              const cuComplex *B, int ldb, cuComplex beta,
-                              cuComplex *C, int ldc) {
-  using FuncPtr = void(CUBLASWINAPI *)(
-      char, char, int, int, cuComplex, const cuComplex *, int,
-      const cuComplex *, int, cuComplex, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasChemm");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasChemm");
-  return func_ptr(side, uplo, m, n, alpha, A, lda, B, ldb, beta, C, ldc);
-}
-
-void CUBLASWINAPI cublasZhemm(char side, char uplo, int m, int n,
-                              cuDoubleComplex alpha, const cuDoubleComplex *A,
-                              int lda, const cuDoubleComplex *B, int ldb,
-                              cuDoubleComplex beta, cuDoubleComplex *C,
-                              int ldc) {
-  using FuncPtr = void(CUBLASWINAPI *)(
-      char, char, int, int, cuDoubleComplex, const cuDoubleComplex *, int,
-      const cuDoubleComplex *, int, cuDoubleComplex, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZhemm");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasZhemm");
-  return func_ptr(side, uplo, m, n, alpha, A, lda, B, ldb, beta, C, ldc);
-}
-
-void CUBLASWINAPI cublasStrsm(char side, char uplo, char transa, char diag,
-                              int m, int n, float alpha, const float *A,
-                              int lda, float *B, int ldb) {
-  using FuncPtr = void(CUBLASWINAPI *)(char, char, char, char, int, int, float,
-                                       const float *, int, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasStrsm");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasStrsm");
-  return func_ptr(side, uplo, transa, diag, m, n, alpha, A, lda, B, ldb);
-}
-
-void CUBLASWINAPI cublasDtrsm(char side, char uplo, char transa, char diag,
-                              int m, int n, double alpha, const double *A,
-                              int lda, double *B, int ldb) {
-  using FuncPtr = void(CUBLASWINAPI *)(char, char, char, char, int, int, double,
-                                       const double *, int, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDtrsm");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasDtrsm");
-  return func_ptr(side, uplo, transa, diag, m, n, alpha, A, lda, B, ldb);
-}
-
-void CUBLASWINAPI cublasCtrsm(char side, char uplo, char transa, char diag,
-                              int m, int n, cuComplex alpha, const cuComplex *A,
-                              int lda, cuComplex *B, int ldb) {
-  using FuncPtr =
-      void(CUBLASWINAPI *)(char, char, char, char, int, int, cuComplex,
-                           const cuComplex *, int, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCtrsm");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasCtrsm");
-  return func_ptr(side, uplo, transa, diag, m, n, alpha, A, lda, B, ldb);
-}
-
-void CUBLASWINAPI cublasZtrsm(char side, char uplo, char transa, char diag,
-                              int m, int n, cuDoubleComplex alpha,
-                              const cuDoubleComplex *A, int lda,
-                              cuDoubleComplex *B, int ldb) {
-  using FuncPtr = void(CUBLASWINAPI *)(char, char, char, char, int, int,
-                                       cuDoubleComplex, const cuDoubleComplex *,
-                                       int, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZtrsm");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasZtrsm");
-  return func_ptr(side, uplo, transa, diag, m, n, alpha, A, lda, B, ldb);
-}
-
-void CUBLASWINAPI cublasStrmm(char side, char uplo, char transa, char diag,
-                              int m, int n, float alpha, const float *A,
-                              int lda, float *B, int ldb) {
-  using FuncPtr = void(CUBLASWINAPI *)(char, char, char, char, int, int, float,
-                                       const float *, int, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasStrmm");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasStrmm");
-  return func_ptr(side, uplo, transa, diag, m, n, alpha, A, lda, B, ldb);
-}
-
-void CUBLASWINAPI cublasDtrmm(char side, char uplo, char transa, char diag,
-                              int m, int n, double alpha, const double *A,
-                              int lda, double *B, int ldb) {
-  using FuncPtr = void(CUBLASWINAPI *)(char, char, char, char, int, int, double,
-                                       const double *, int, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDtrmm");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasDtrmm");
-  return func_ptr(side, uplo, transa, diag, m, n, alpha, A, lda, B, ldb);
-}
-
-void CUBLASWINAPI cublasCtrmm(char side, char uplo, char transa, char diag,
-                              int m, int n, cuComplex alpha, const cuComplex *A,
-                              int lda, cuComplex *B, int ldb) {
-  using FuncPtr =
-      void(CUBLASWINAPI *)(char, char, char, char, int, int, cuComplex,
-                           const cuComplex *, int, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCtrmm");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasCtrmm");
-  return func_ptr(side, uplo, transa, diag, m, n, alpha, A, lda, B, ldb);
-}
-
-void CUBLASWINAPI cublasZtrmm(char side, char uplo, char transa, char diag,
-                              int m, int n, cuDoubleComplex alpha,
-                              const cuDoubleComplex *A, int lda,
-                              cuDoubleComplex *B, int ldb) {
-  using FuncPtr = void(CUBLASWINAPI *)(char, char, char, char, int, int,
-                                       cuDoubleComplex, const cuDoubleComplex *,
-                                       int, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZtrmm");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasZtrmm");
-  return func_ptr(side, uplo, transa, diag, m, n, alpha, A, lda, B, ldb);
-}
-
-}  // extern "C"
diff --git a/third_party/xla/third_party/tsl/tsl/cuda/cublas_11_0.inc b/third_party/xla/third_party/tsl/tsl/cuda/cublas_11_0.inc
deleted file mode 100644
index d287a91c562217..00000000000000
--- a/third_party/xla/third_party/tsl/tsl/cuda/cublas_11_0.inc
+++ /dev/null
@@ -1,5197 +0,0 @@
-// Auto-generated, do not edit.
-
-extern "C" {
-cublasStatus_t CUBLASWINAPI cublasCreate_v2(cublasHandle_t *handle) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCreate_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle);
-}
-
-cublasStatus_t CUBLASWINAPI cublasDestroy_v2(cublasHandle_t handle) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDestroy_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle);
-}
-
-cublasStatus_t CUBLASWINAPI cublasGetVersion_v2(cublasHandle_t handle,
-                                                int *version) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasGetVersion_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, version);
-}
-
-cublasStatus_t CUBLASWINAPI cublasGetProperty(libraryPropertyType type,
-                                              int *value) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(libraryPropertyType, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasGetProperty");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(type, value);
-}
-
-size_t CUBLASWINAPI cublasGetCudartVersion(void) {
-  using FuncPtr = size_t(CUBLASWINAPI *)();
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasGetCudartVersion");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasGetCudartVersion");
-  return func_ptr();
-}
-
-cublasStatus_t CUBLASWINAPI cublasSetWorkspace_v2(cublasHandle_t handle,
-                                                  void *workspace,
-                                                  size_t workspaceSizeInBytes) {
-  using FuncPtr =
-      cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, void *, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSetWorkspace_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, workspace, workspaceSizeInBytes);
-}
-
-cublasStatus_t CUBLASWINAPI cublasSetStream_v2(cublasHandle_t handle,
-                                               cudaStream_t streamId) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSetStream_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, streamId);
-}
-
-cublasStatus_t CUBLASWINAPI cublasGetStream_v2(cublasHandle_t handle,
-                                               cudaStream_t *streamId) {
-  using FuncPtr =
-      cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, cudaStream_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasGetStream_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, streamId);
-}
-
-cublasStatus_t CUBLASWINAPI cublasGetPointerMode_v2(cublasHandle_t handle,
-                                                    cublasPointerMode_t *mode) {
-  using FuncPtr =
-      cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, cublasPointerMode_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasGetPointerMode_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, mode);
-}
-
-cublasStatus_t CUBLASWINAPI cublasSetPointerMode_v2(cublasHandle_t handle,
-                                                    cublasPointerMode_t mode) {
-  using FuncPtr =
-      cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, cublasPointerMode_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSetPointerMode_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, mode);
-}
-
-cublasStatus_t CUBLASWINAPI cublasGetAtomicsMode(cublasHandle_t handle,
-                                                 cublasAtomicsMode_t *mode) {
-  using FuncPtr =
-      cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, cublasAtomicsMode_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasGetAtomicsMode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, mode);
-}
-
-cublasStatus_t CUBLASWINAPI cublasSetAtomicsMode(cublasHandle_t handle,
-                                                 cublasAtomicsMode_t mode) {
-  using FuncPtr =
-      cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, cublasAtomicsMode_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSetAtomicsMode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, mode);
-}
-
-cublasStatus_t CUBLASWINAPI cublasGetMathMode(cublasHandle_t handle,
-                                              cublasMath_t *mode) {
-  using FuncPtr =
-      cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, cublasMath_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasGetMathMode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, mode);
-}
-
-cublasStatus_t CUBLASWINAPI cublasSetMathMode(cublasHandle_t handle,
-                                              cublasMath_t mode) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, cublasMath_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSetMathMode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, mode);
-}
-
-cublasStatus_t CUBLASWINAPI cublasGetSmCountTarget(cublasHandle_t handle,
-                                                   int *smCountTarget) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasGetSmCountTarget");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, smCountTarget);
-}
-
-cublasStatus_t CUBLASWINAPI cublasSetSmCountTarget(cublasHandle_t handle,
-                                                   int smCountTarget) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSetSmCountTarget");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, smCountTarget);
-}
-
-const char *CUBLASWINAPI cublasGetStatusName(cublasStatus_t status) {
-  using FuncPtr = const char *(CUBLASWINAPI *)(cublasStatus_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasGetStatusName");
-  if (!func_ptr) return "cublasGetStatusName symbol not found.";
-  return func_ptr(status);
-}
-
-const char *CUBLASWINAPI cublasGetStatusString(cublasStatus_t status) {
-  using FuncPtr = const char *(CUBLASWINAPI *)(cublasStatus_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasGetStatusString");
-  if (!func_ptr) return "cublasGetStatusString symbol not found.";
-  return func_ptr(status);
-}
-
-cublasStatus_t CUBLASWINAPI cublasLoggerConfigure(int logIsOn, int logToStdOut,
-                                                  int logToStdErr,
-                                                  const char *logFileName) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(int, int, int, const char *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasLoggerConfigure");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(logIsOn, logToStdOut, logToStdErr, logFileName);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasSetLoggerCallback(cublasLogCallback userCallback) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(cublasLogCallback);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSetLoggerCallback");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(userCallback);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasGetLoggerCallback(cublasLogCallback *userCallback) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(cublasLogCallback *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasGetLoggerCallback");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(userCallback);
-}
-
-cublasStatus_t CUBLASWINAPI cublasSetVector(int n, int elemSize, const void *x,
-                                            int incx, void *devicePtr,
-                                            int incy) {
-  using FuncPtr =
-      cublasStatus_t(CUBLASWINAPI *)(int, int, const void *, int, void *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSetVector");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(n, elemSize, x, incx, devicePtr, incy);
-}
-
-cublasStatus_t CUBLASWINAPI cublasGetVector(int n, int elemSize, const void *x,
-                                            int incx, void *y, int incy) {
-  using FuncPtr =
-      cublasStatus_t(CUBLASWINAPI *)(int, int, const void *, int, void *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasGetVector");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(n, elemSize, x, incx, y, incy);
-}
-
-cublasStatus_t CUBLASWINAPI cublasSetMatrix(int rows, int cols, int elemSize,
-                                            const void *A, int lda, void *B,
-                                            int ldb) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(int, int, int, const void *,
-                                                 int, void *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSetMatrix");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(rows, cols, elemSize, A, lda, B, ldb);
-}
-
-cublasStatus_t CUBLASWINAPI cublasGetMatrix(int rows, int cols, int elemSize,
-                                            const void *A, int lda, void *B,
-                                            int ldb) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(int, int, int, const void *,
-                                                 int, void *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasGetMatrix");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(rows, cols, elemSize, A, lda, B, ldb);
-}
-
-cublasStatus_t CUBLASWINAPI cublasSetVectorAsync(int n, int elemSize,
-                                                 const void *hostPtr, int incx,
-                                                 void *devicePtr, int incy,
-                                                 cudaStream_t stream) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(int, int, const void *, int,
-                                                 void *, int, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSetVectorAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(n, elemSize, hostPtr, incx, devicePtr, incy, stream);
-}
-
-cublasStatus_t CUBLASWINAPI cublasGetVectorAsync(int n, int elemSize,
-                                                 const void *devicePtr,
-                                                 int incx, void *hostPtr,
-                                                 int incy,
-                                                 cudaStream_t stream) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(int, int, const void *, int,
-                                                 void *, int, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasGetVectorAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(n, elemSize, devicePtr, incx, hostPtr, incy, stream);
-}
-
-cublasStatus_t CUBLASWINAPI cublasSetMatrixAsync(int rows, int cols,
-                                                 int elemSize, const void *A,
-                                                 int lda, void *B, int ldb,
-                                                 cudaStream_t stream) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      int, int, int, const void *, int, void *, int, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSetMatrixAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(rows, cols, elemSize, A, lda, B, ldb, stream);
-}
-
-cublasStatus_t CUBLASWINAPI cublasGetMatrixAsync(int rows, int cols,
-                                                 int elemSize, const void *A,
-                                                 int lda, void *B, int ldb,
-                                                 cudaStream_t stream) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      int, int, int, const void *, int, void *, int, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasGetMatrixAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(rows, cols, elemSize, A, lda, B, ldb, stream);
-}
-
-void CUBLASWINAPI cublasXerbla(const char *srName, int info) {
-  using FuncPtr = void(CUBLASWINAPI *)(const char *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasXerbla");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasXerbla");
-  return func_ptr(srName, info);
-}
-
-cublasStatus_t CUBLASWINAPI cublasNrm2Ex(cublasHandle_t handle, int n,
-                                         const void *x, cudaDataType xType,
-                                         int incx, void *result,
-                                         cudaDataType resultType,
-                                         cudaDataType executionType) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, const void *, cudaDataType, int, void *,
-      cudaDataType, cudaDataType);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasNrm2Ex");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, x, xType, incx, result, resultType, executionType);
-}
-
-cublasStatus_t CUBLASWINAPI cublasSnrm2_v2(cublasHandle_t handle, int n,
-                                           const float *x, int incx,
-                                           float *result) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, int,
-                                                 const float *, int, float *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSnrm2_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, x, incx, result);
-}
-
-cublasStatus_t CUBLASWINAPI cublasDnrm2_v2(cublasHandle_t handle, int n,
-                                           const double *x, int incx,
-                                           double *result) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, int,
-                                                 const double *, int, double *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDnrm2_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, x, incx, result);
-}
-
-cublasStatus_t CUBLASWINAPI cublasScnrm2_v2(cublasHandle_t handle, int n,
-                                            const cuComplex *x, int incx,
-                                            float *result) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, const cuComplex *, int, float *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasScnrm2_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, x, incx, result);
-}
-
-cublasStatus_t CUBLASWINAPI cublasDznrm2_v2(cublasHandle_t handle, int n,
-                                            const cuDoubleComplex *x, int incx,
-                                            double *result) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, const cuDoubleComplex *, int, double *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDznrm2_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, x, incx, result);
-}
-
-cublasStatus_t CUBLASWINAPI cublasDotEx(cublasHandle_t handle, int n,
-                                        const void *x, cudaDataType xType,
-                                        int incx, const void *y,
-                                        cudaDataType yType, int incy,
-                                        void *result, cudaDataType resultType,
-                                        cudaDataType executionType) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, const void *, cudaDataType, int, const void *,
-      cudaDataType, int, void *, cudaDataType, cudaDataType);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDotEx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, x, xType, incx, y, yType, incy, result, resultType,
-                  executionType);
-}
-
-cublasStatus_t CUBLASWINAPI cublasDotcEx(cublasHandle_t handle, int n,
-                                         const void *x, cudaDataType xType,
-                                         int incx, const void *y,
-                                         cudaDataType yType, int incy,
-                                         void *result, cudaDataType resultType,
-                                         cudaDataType executionType) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, const void *, cudaDataType, int, const void *,
-      cudaDataType, int, void *, cudaDataType, cudaDataType);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDotcEx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, x, xType, incx, y, yType, incy, result, resultType,
-                  executionType);
-}
-
-cublasStatus_t CUBLASWINAPI cublasSdot_v2(cublasHandle_t handle, int n,
-                                          const float *x, int incx,
-                                          const float *y, int incy,
-                                          float *result) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, const float *, int, const float *, int, float *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSdot_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, x, incx, y, incy, result);
-}
-
-cublasStatus_t CUBLASWINAPI cublasDdot_v2(cublasHandle_t handle, int n,
-                                          const double *x, int incx,
-                                          const double *y, int incy,
-                                          double *result) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, const double *, int, const double *, int, double *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDdot_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, x, incx, y, incy, result);
-}
-
-cublasStatus_t CUBLASWINAPI cublasCdotu_v2(cublasHandle_t handle, int n,
-                                           const cuComplex *x, int incx,
-                                           const cuComplex *y, int incy,
-                                           cuComplex *result) {
-  using FuncPtr =
-      cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, int, const cuComplex *,
-                                     int, const cuComplex *, int, cuComplex *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCdotu_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, x, incx, y, incy, result);
-}
-
-cublasStatus_t CUBLASWINAPI cublasCdotc_v2(cublasHandle_t handle, int n,
-                                           const cuComplex *x, int incx,
-                                           const cuComplex *y, int incy,
-                                           cuComplex *result) {
-  using FuncPtr =
-      cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, int, const cuComplex *,
-                                     int, const cuComplex *, int, cuComplex *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCdotc_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, x, incx, y, incy, result);
-}
-
-cublasStatus_t CUBLASWINAPI cublasZdotu_v2(cublasHandle_t handle, int n,
-                                           const cuDoubleComplex *x, int incx,
-                                           const cuDoubleComplex *y, int incy,
-                                           cuDoubleComplex *result) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, const cuDoubleComplex *, int,
-      const cuDoubleComplex *, int, cuDoubleComplex *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZdotu_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, x, incx, y, incy, result);
-}
-
-cublasStatus_t CUBLASWINAPI cublasZdotc_v2(cublasHandle_t handle, int n,
-                                           const cuDoubleComplex *x, int incx,
-                                           const cuDoubleComplex *y, int incy,
-                                           cuDoubleComplex *result) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, const cuDoubleComplex *, int,
-      const cuDoubleComplex *, int, cuDoubleComplex *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZdotc_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, x, incx, y, incy, result);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasScalEx(cublasHandle_t handle, int n,
-             const void *alpha, /* host or device pointer */
-             cudaDataType alphaType, void *x, cudaDataType xType, int incx,
-             cudaDataType executionType) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, const void *, cudaDataType, void *, cudaDataType,
-      int, cudaDataType);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasScalEx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, alpha, alphaType, x, xType, incx, executionType);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasSscal_v2(cublasHandle_t handle, int n,
-               const float *alpha, /* host or device pointer */
-               float *x, int incx) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, int,
-                                                 const float *, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSscal_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, alpha, x, incx);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasDscal_v2(cublasHandle_t handle, int n,
-               const double *alpha, /* host or device pointer */
-               double *x, int incx) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, int,
-                                                 const double *, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDscal_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, alpha, x, incx);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasCscal_v2(cublasHandle_t handle, int n,
-               const cuComplex *alpha, /* host or device pointer */
-               cuComplex *x, int incx) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, const cuComplex *, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCscal_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, alpha, x, incx);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasCsscal_v2(cublasHandle_t handle, int n,
-                const float *alpha, /* host or device pointer */
-                cuComplex *x, int incx) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, const float *, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCsscal_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, alpha, x, incx);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasZscal_v2(cublasHandle_t handle, int n,
-               const cuDoubleComplex *alpha, /* host or device pointer */
-               cuDoubleComplex *x, int incx) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, const cuDoubleComplex *, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZscal_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, alpha, x, incx);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasZdscal_v2(cublasHandle_t handle, int n,
-                const double *alpha, /* host or device pointer */
-                cuDoubleComplex *x, int incx) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, const double *, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZdscal_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, alpha, x, incx);
-}
-
-cublasStatus_t CUBLASWINAPI cublasAxpyEx(
-    cublasHandle_t handle, int n,
-    const void *alpha, /* host or device pointer */
-    cudaDataType alphaType, const void *x, cudaDataType xType, int incx,
-    void *y, cudaDataType yType, int incy, cudaDataType executiontype) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, const void *, cudaDataType, const void *,
-      cudaDataType, int, void *, cudaDataType, int, cudaDataType);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasAxpyEx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, alpha, alphaType, x, xType, incx, y, yType, incy,
-                  executiontype);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasSaxpy_v2(cublasHandle_t handle, int n,
-               const float *alpha, /* host or device pointer */
-               const float *x, int incx, float *y, int incy) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, const float *, const float *, int, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSaxpy_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, alpha, x, incx, y, incy);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasDaxpy_v2(cublasHandle_t handle, int n,
-               const double *alpha, /* host or device pointer */
-               const double *x, int incx, double *y, int incy) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, const double *, const double *, int, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDaxpy_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, alpha, x, incx, y, incy);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasCaxpy_v2(cublasHandle_t handle, int n,
-               const cuComplex *alpha, /* host or device pointer */
-               const cuComplex *x, int incx, cuComplex *y, int incy) {
-  using FuncPtr =
-      cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, int, const cuComplex *,
-                                     const cuComplex *, int, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCaxpy_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, alpha, x, incx, y, incy);
-}
-
-cublasStatus_t CUBLASWINAPI cublasZaxpy_v2(
-    cublasHandle_t handle, int n,
-    const cuDoubleComplex *alpha, /* host or device pointer */
-    const cuDoubleComplex *x, int incx, cuDoubleComplex *y, int incy) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, const cuDoubleComplex *, const cuDoubleComplex *,
-      int, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZaxpy_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, alpha, x, incx, y, incy);
-}
-
-cublasStatus_t CUBLASWINAPI cublasCopyEx(cublasHandle_t handle, int n,
-                                         const void *x, cudaDataType xType,
-                                         int incx, void *y, cudaDataType yType,
-                                         int incy) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, const void *, cudaDataType, int, void *,
-      cudaDataType, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCopyEx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, x, xType, incx, y, yType, incy);
-}
-
-cublasStatus_t CUBLASWINAPI cublasScopy_v2(cublasHandle_t handle, int n,
-                                           const float *x, int incx, float *y,
-                                           int incy) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, const float *, int, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasScopy_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, x, incx, y, incy);
-}
-
-cublasStatus_t CUBLASWINAPI cublasDcopy_v2(cublasHandle_t handle, int n,
-                                           const double *x, int incx, double *y,
-                                           int incy) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, const double *, int, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDcopy_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, x, incx, y, incy);
-}
-
-cublasStatus_t CUBLASWINAPI cublasCcopy_v2(cublasHandle_t handle, int n,
-                                           const cuComplex *x, int incx,
-                                           cuComplex *y, int incy) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, const cuComplex *, int, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCcopy_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, x, incx, y, incy);
-}
-
-cublasStatus_t CUBLASWINAPI cublasZcopy_v2(cublasHandle_t handle, int n,
-                                           const cuDoubleComplex *x, int incx,
-                                           cuDoubleComplex *y, int incy) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, int,
-                                                 const cuDoubleComplex *, int,
-                                                 cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZcopy_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, x, incx, y, incy);
-}
-
-cublasStatus_t CUBLASWINAPI cublasSswap_v2(cublasHandle_t handle, int n,
-                                           float *x, int incx, float *y,
-                                           int incy) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, int, float *,
-                                                 int, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSswap_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, x, incx, y, incy);
-}
-
-cublasStatus_t CUBLASWINAPI cublasDswap_v2(cublasHandle_t handle, int n,
-                                           double *x, int incx, double *y,
-                                           int incy) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, int, double *,
-                                                 int, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDswap_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, x, incx, y, incy);
-}
-
-cublasStatus_t CUBLASWINAPI cublasCswap_v2(cublasHandle_t handle, int n,
-                                           cuComplex *x, int incx, cuComplex *y,
-                                           int incy) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, cuComplex *, int, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCswap_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, x, incx, y, incy);
-}
-
-cublasStatus_t CUBLASWINAPI cublasZswap_v2(cublasHandle_t handle, int n,
-                                           cuDoubleComplex *x, int incx,
-                                           cuDoubleComplex *y, int incy) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, cuDoubleComplex *, int, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZswap_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, x, incx, y, incy);
-}
-
-cublasStatus_t CUBLASWINAPI cublasSwapEx(cublasHandle_t handle, int n, void *x,
-                                         cudaDataType xType, int incx, void *y,
-                                         cudaDataType yType, int incy) {
-  using FuncPtr =
-      cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, int, void *, cudaDataType,
-                                     int, void *, cudaDataType, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSwapEx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, x, xType, incx, y, yType, incy);
-}
-
-cublasStatus_t CUBLASWINAPI cublasIsamax_v2(cublasHandle_t handle, int n,
-                                            const float *x, int incx,
-                                            int *result) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, int,
-                                                 const float *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasIsamax_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, x, incx, result);
-}
-
-cublasStatus_t CUBLASWINAPI cublasIdamax_v2(cublasHandle_t handle, int n,
-                                            const double *x, int incx,
-                                            int *result) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, int,
-                                                 const double *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasIdamax_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, x, incx, result);
-}
-
-cublasStatus_t CUBLASWINAPI cublasIcamax_v2(cublasHandle_t handle, int n,
-                                            const cuComplex *x, int incx,
-                                            int *result) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, int,
-                                                 const cuComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasIcamax_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, x, incx, result);
-}
-
-cublasStatus_t CUBLASWINAPI cublasIzamax_v2(cublasHandle_t handle, int n,
-                                            const cuDoubleComplex *x, int incx,
-                                            int *result) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, const cuDoubleComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasIzamax_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, x, incx, result);
-}
-
-cublasStatus_t CUBLASWINAPI cublasIamaxEx(
-    cublasHandle_t handle, int n, const void *x, cudaDataType xType, int incx,
-    int *result /* host or device pointer */
-) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, const void *, cudaDataType, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasIamaxEx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, x, xType, incx, result);
-}
-
-cublasStatus_t CUBLASWINAPI cublasIsamin_v2(cublasHandle_t handle, int n,
-                                            const float *x, int incx,
-                                            int *result) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, int,
-                                                 const float *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasIsamin_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, x, incx, result);
-}
-
-cublasStatus_t CUBLASWINAPI cublasIdamin_v2(cublasHandle_t handle, int n,
-                                            const double *x, int incx,
-                                            int *result) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, int,
-                                                 const double *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasIdamin_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, x, incx, result);
-}
-
-cublasStatus_t CUBLASWINAPI cublasIcamin_v2(cublasHandle_t handle, int n,
-                                            const cuComplex *x, int incx,
-                                            int *result) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, int,
-                                                 const cuComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasIcamin_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, x, incx, result);
-}
-
-cublasStatus_t CUBLASWINAPI cublasIzamin_v2(cublasHandle_t handle, int n,
-                                            const cuDoubleComplex *x, int incx,
-                                            int *result) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, const cuDoubleComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasIzamin_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, x, incx, result);
-}
-
-cublasStatus_t CUBLASWINAPI cublasIaminEx(
-    cublasHandle_t handle, int n, const void *x, cudaDataType xType, int incx,
-    int *result /* host or device pointer */
-) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, const void *, cudaDataType, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasIaminEx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, x, xType, incx, result);
-}
-
-cublasStatus_t CUBLASWINAPI cublasAsumEx(
-    cublasHandle_t handle, int n, const void *x, cudaDataType xType, int incx,
-    void *result, cudaDataType resultType, /* host or device pointer */
-    cudaDataType executiontype) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, const void *, cudaDataType, int, void *,
-      cudaDataType, cudaDataType);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasAsumEx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, x, xType, incx, result, resultType, executiontype);
-}
-
-cublasStatus_t CUBLASWINAPI cublasSasum_v2(cublasHandle_t handle, int n,
-                                           const float *x, int incx,
-                                           float *result) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, int,
-                                                 const float *, int, float *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSasum_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, x, incx, result);
-}
-
-cublasStatus_t CUBLASWINAPI cublasDasum_v2(cublasHandle_t handle, int n,
-                                           const double *x, int incx,
-                                           double *result) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, int,
-                                                 const double *, int, double *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDasum_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, x, incx, result);
-}
-
-cublasStatus_t CUBLASWINAPI cublasScasum_v2(cublasHandle_t handle, int n,
-                                            const cuComplex *x, int incx,
-                                            float *result) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, const cuComplex *, int, float *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasScasum_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, x, incx, result);
-}
-
-cublasStatus_t CUBLASWINAPI cublasDzasum_v2(cublasHandle_t handle, int n,
-                                            const cuDoubleComplex *x, int incx,
-                                            double *result) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, const cuDoubleComplex *, int, double *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDzasum_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, x, incx, result);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasSrot_v2(cublasHandle_t handle, int n, float *x, int incx, float *y,
-              int incy, const float *c, /* host or device pointer */
-              const float *s) {
-  using FuncPtr =
-      cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, int, float *, int, float *,
-                                     int, const float *, const float *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSrot_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, x, incx, y, incy, c, s);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasDrot_v2(cublasHandle_t handle, int n, double *x, int incx, double *y,
-              int incy, const double *c, /* host or device pointer */
-              const double *s) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, double *, int, double *, int, const double *,
-      const double *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDrot_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, x, incx, y, incy, c, s);
-}
-
-cublasStatus_t CUBLASWINAPI cublasCrot_v2(
-    cublasHandle_t handle, int n, cuComplex *x, int incx, cuComplex *y,
-    int incy, const float *c, /* host or device pointer */
-    const cuComplex *s) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, cuComplex *, int, cuComplex *, int, const float *,
-      const cuComplex *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCrot_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, x, incx, y, incy, c, s);
-}
-
-cublasStatus_t CUBLASWINAPI cublasCsrot_v2(
-    cublasHandle_t handle, int n, cuComplex *x, int incx, cuComplex *y,
-    int incy, const float *c, /* host or device pointer */
-    const float *s) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, cuComplex *, int, cuComplex *, int, const float *,
-      const float *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCsrot_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, x, incx, y, incy, c, s);
-}
-
-cublasStatus_t CUBLASWINAPI cublasZrot_v2(
-    cublasHandle_t handle, int n, cuDoubleComplex *x, int incx,
-    cuDoubleComplex *y, int incy, const double *c, /* host or device pointer */
-    const cuDoubleComplex *s) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, cuDoubleComplex *, int, cuDoubleComplex *, int,
-      const double *, const cuDoubleComplex *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZrot_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, x, incx, y, incy, c, s);
-}
-
-cublasStatus_t CUBLASWINAPI cublasZdrot_v2(
-    cublasHandle_t handle, int n, cuDoubleComplex *x, int incx,
-    cuDoubleComplex *y, int incy, const double *c, /* host or device pointer */
-    const double *s) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, cuDoubleComplex *, int, cuDoubleComplex *, int,
-      const double *, const double *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZdrot_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, x, incx, y, incy, c, s);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasRotEx(cublasHandle_t handle, int n, void *x, cudaDataType xType, int incx,
-            void *y, cudaDataType yType, int incy,
-            const void *c, /* host or device pointer */
-            const void *s, cudaDataType csType, cudaDataType executiontype) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, void *, cudaDataType, int, void *, cudaDataType, int,
-      const void *, const void *, cudaDataType, cudaDataType);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasRotEx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, x, xType, incx, y, yType, incy, c, s, csType,
-                  executiontype);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasSrotg_v2(cublasHandle_t handle, float *a, /* host or device pointer */
-               float *b,                        /* host or device pointer */
-               float *c,                        /* host or device pointer */
-               float *s) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, float *,
-                                                 float *, float *, float *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSrotg_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, a, b, c, s);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasDrotg_v2(cublasHandle_t handle, double *a, /* host or device pointer */
-               double *b,                        /* host or device pointer */
-               double *c,                        /* host or device pointer */
-               double *s) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, double *,
-                                                 double *, double *, double *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDrotg_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, a, b, c, s);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasCrotg_v2(cublasHandle_t handle, cuComplex *a, /* host or device pointer */
-               cuComplex *b,                        /* host or device pointer */
-               float *c,                            /* host or device pointer */
-               cuComplex *s) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cuComplex *, cuComplex *, float *, cuComplex *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCrotg_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, a, b, c, s);
-}
-
-cublasStatus_t CUBLASWINAPI cublasZrotg_v2(
-    cublasHandle_t handle, cuDoubleComplex *a, /* host or device pointer */
-    cuDoubleComplex *b,                        /* host or device pointer */
-    double *c,                                 /* host or device pointer */
-    cuDoubleComplex *s) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cuDoubleComplex *, cuDoubleComplex *, double *,
-      cuDoubleComplex *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZrotg_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, a, b, c, s);
-}
-
-cublasStatus_t CUBLASWINAPI cublasRotgEx(cublasHandle_t handle,
-                                         void *a, /* host or device pointer */
-                                         void *b, /* host or device pointer */
-                                         cudaDataType abType,
-                                         void *c, /* host or device pointer */
-                                         void *s, /* host or device pointer */
-                                         cudaDataType csType,
-                                         cudaDataType executiontype) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, void *, void *,
-                                                 cudaDataType, void *, void *,
-                                                 cudaDataType, cudaDataType);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasRotgEx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, a, b, abType, c, s, csType, executiontype);
-}
-
-cublasStatus_t CUBLASWINAPI cublasSrotm_v2(cublasHandle_t handle, int n,
-                                           float *x, int incx, float *y,
-                                           int incy, const float *param) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, float *, int, float *, int, const float *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSrotm_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, x, incx, y, incy, param);
-}
-
-cublasStatus_t CUBLASWINAPI cublasDrotm_v2(cublasHandle_t handle, int n,
-                                           double *x, int incx, double *y,
-                                           int incy, const double *param) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, double *, int, double *, int, const double *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDrotm_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, x, incx, y, incy, param);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasRotmEx(cublasHandle_t handle, int n, void *x, cudaDataType xType,
-             int incx, void *y, cudaDataType yType, int incy,
-             const void *param, /* host or device pointer */
-             cudaDataType paramType, cudaDataType executiontype) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, void *, cudaDataType, int, void *, cudaDataType, int,
-      const void *, cudaDataType, cudaDataType);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasRotmEx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, x, xType, incx, y, yType, incy, param, paramType,
-                  executiontype);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasSrotmg_v2(cublasHandle_t handle, float *d1, /* host or device pointer */
-                float *d2,                        /* host or device pointer */
-                float *x1,                        /* host or device pointer */
-                const float *y1,                  /* host or device pointer */
-                float *param) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, float *, float *, float *, const float *, float *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSrotmg_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, d1, d2, x1, y1, param);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasDrotmg_v2(cublasHandle_t handle, double *d1, /* host or device pointer */
-                double *d2,                        /* host or device pointer */
-                double *x1,                        /* host or device pointer */
-                const double *y1,                  /* host or device pointer */
-                double *param) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, double *, double *, double *, const double *, double *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDrotmg_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, d1, d2, x1, y1, param);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasRotmgEx(cublasHandle_t handle, void *d1,     /* host or device pointer */
-              cudaDataType d1Type, void *d2,       /* host or device pointer */
-              cudaDataType d2Type, void *x1,       /* host or device pointer */
-              cudaDataType x1Type, const void *y1, /* host or device pointer */
-              cudaDataType y1Type, void *param,    /* host or device pointer */
-              cudaDataType paramType, cudaDataType executiontype) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, void *, cudaDataType, void *, cudaDataType, void *,
-      cudaDataType, const void *, cudaDataType, void *, cudaDataType,
-      cudaDataType);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasRotmgEx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, d1, d1Type, d2, d2Type, x1, x1Type, y1, y1Type, param,
-                  paramType, executiontype);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasSgemv_v2(cublasHandle_t handle, cublasOperation_t trans, int m, int n,
-               const float *alpha, /* host or device pointer */
-               const float *A, int lda, const float *x, int incx,
-               const float *beta, /* host or device pointer */
-               float *y, int incy) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasOperation_t, int, int, const float *, const float *,
-      int, const float *, int, const float *, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSgemv_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, trans, m, n, alpha, A, lda, x, incx, beta, y, incy);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasDgemv_v2(cublasHandle_t handle, cublasOperation_t trans, int m, int n,
-               const double *alpha, /* host or device pointer */
-               const double *A, int lda, const double *x, int incx,
-               const double *beta, /* host or device pointer */
-               double *y, int incy) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasOperation_t, int, int, const double *,
-      const double *, int, const double *, int, const double *, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDgemv_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, trans, m, n, alpha, A, lda, x, incx, beta, y, incy);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasCgemv_v2(cublasHandle_t handle, cublasOperation_t trans, int m, int n,
-               const cuComplex *alpha, /* host or device pointer */
-               const cuComplex *A, int lda, const cuComplex *x, int incx,
-               const cuComplex *beta, /* host or device pointer */
-               cuComplex *y, int incy) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasOperation_t, int, int, const cuComplex *,
-      const cuComplex *, int, const cuComplex *, int, const cuComplex *,
-      cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCgemv_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, trans, m, n, alpha, A, lda, x, incx, beta, y, incy);
-}
-
-cublasStatus_t CUBLASWINAPI cublasZgemv_v2(
-    cublasHandle_t handle, cublasOperation_t trans, int m, int n,
-    const cuDoubleComplex *alpha, /* host or device pointer */
-    const cuDoubleComplex *A, int lda, const cuDoubleComplex *x, int incx,
-    const cuDoubleComplex *beta, /* host or device pointer */
-    cuDoubleComplex *y, int incy) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasOperation_t, int, int, const cuDoubleComplex *,
-      const cuDoubleComplex *, int, const cuDoubleComplex *, int,
-      const cuDoubleComplex *, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZgemv_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, trans, m, n, alpha, A, lda, x, incx, beta, y, incy);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasSgbmv_v2(cublasHandle_t handle, cublasOperation_t trans, int m, int n,
-               int kl, int ku, const float *alpha, /* host or device pointer */
-               const float *A, int lda, const float *x, int incx,
-               const float *beta, /* host or device pointer */
-               float *y, int incy) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasOperation_t, int, int, int, int, const float *,
-      const float *, int, const float *, int, const float *, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSgbmv_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, trans, m, n, kl, ku, alpha, A, lda, x, incx, beta, y,
-                  incy);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasDgbmv_v2(cublasHandle_t handle, cublasOperation_t trans, int m, int n,
-               int kl, int ku, const double *alpha, /* host or device pointer */
-               const double *A, int lda, const double *x, int incx,
-               const double *beta, /* host or device pointer */
-               double *y, int incy) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasOperation_t, int, int, int, int, const double *,
-      const double *, int, const double *, int, const double *, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDgbmv_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, trans, m, n, kl, ku, alpha, A, lda, x, incx, beta, y,
-                  incy);
-}
-
-cublasStatus_t CUBLASWINAPI cublasCgbmv_v2(
-    cublasHandle_t handle, cublasOperation_t trans, int m, int n, int kl,
-    int ku, const cuComplex *alpha, /* host or device pointer */
-    const cuComplex *A, int lda, const cuComplex *x, int incx,
-    const cuComplex *beta, /* host or device pointer */
-    cuComplex *y, int incy) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasOperation_t, int, int, int, int, const cuComplex *,
-      const cuComplex *, int, const cuComplex *, int, const cuComplex *,
-      cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCgbmv_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, trans, m, n, kl, ku, alpha, A, lda, x, incx, beta, y,
-                  incy);
-}
-
-cublasStatus_t CUBLASWINAPI cublasZgbmv_v2(
-    cublasHandle_t handle, cublasOperation_t trans, int m, int n, int kl,
-    int ku, const cuDoubleComplex *alpha, /* host or device pointer */
-    const cuDoubleComplex *A, int lda, const cuDoubleComplex *x, int incx,
-    const cuDoubleComplex *beta, /* host or device pointer */
-    cuDoubleComplex *y, int incy) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasOperation_t, int, int, int, int,
-      const cuDoubleComplex *, const cuDoubleComplex *, int,
-      const cuDoubleComplex *, int, const cuDoubleComplex *, cuDoubleComplex *,
-      int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZgbmv_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, trans, m, n, kl, ku, alpha, A, lda, x, incx, beta, y,
-                  incy);
-}
-
-cublasStatus_t CUBLASWINAPI cublasStrmv_v2(
-    cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans,
-    cublasDiagType_t diag, int n, const float *A, int lda, float *x, int incx) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t,
-      int, const float *, int, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasStrmv_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, trans, diag, n, A, lda, x, incx);
-}
-
-cublasStatus_t CUBLASWINAPI cublasDtrmv_v2(cublasHandle_t handle,
-                                           cublasFillMode_t uplo,
-                                           cublasOperation_t trans,
-                                           cublasDiagType_t diag, int n,
-                                           const double *A, int lda, double *x,
-                                           int incx) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t,
-      int, const double *, int, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDtrmv_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, trans, diag, n, A, lda, x, incx);
-}
-
-cublasStatus_t CUBLASWINAPI cublasCtrmv_v2(cublasHandle_t handle,
-                                           cublasFillMode_t uplo,
-                                           cublasOperation_t trans,
-                                           cublasDiagType_t diag, int n,
-                                           const cuComplex *A, int lda,
-                                           cuComplex *x, int incx) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t,
-      int, const cuComplex *, int, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCtrmv_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, trans, diag, n, A, lda, x, incx);
-}
-
-cublasStatus_t CUBLASWINAPI cublasZtrmv_v2(cublasHandle_t handle,
-                                           cublasFillMode_t uplo,
-                                           cublasOperation_t trans,
-                                           cublasDiagType_t diag, int n,
-                                           const cuDoubleComplex *A, int lda,
-                                           cuDoubleComplex *x, int incx) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t,
-      int, const cuDoubleComplex *, int, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZtrmv_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, trans, diag, n, A, lda, x, incx);
-}
-
-cublasStatus_t CUBLASWINAPI cublasStbmv_v2(cublasHandle_t handle,
-                                           cublasFillMode_t uplo,
-                                           cublasOperation_t trans,
-                                           cublasDiagType_t diag, int n, int k,
-                                           const float *A, int lda, float *x,
-                                           int incx) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t,
-      int, int, const float *, int, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasStbmv_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, trans, diag, n, k, A, lda, x, incx);
-}
-
-cublasStatus_t CUBLASWINAPI cublasDtbmv_v2(cublasHandle_t handle,
-                                           cublasFillMode_t uplo,
-                                           cublasOperation_t trans,
-                                           cublasDiagType_t diag, int n, int k,
-                                           const double *A, int lda, double *x,
-                                           int incx) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t,
-      int, int, const double *, int, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDtbmv_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, trans, diag, n, k, A, lda, x, incx);
-}
-
-cublasStatus_t CUBLASWINAPI cublasCtbmv_v2(cublasHandle_t handle,
-                                           cublasFillMode_t uplo,
-                                           cublasOperation_t trans,
-                                           cublasDiagType_t diag, int n, int k,
-                                           const cuComplex *A, int lda,
-                                           cuComplex *x, int incx) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t,
-      int, int, const cuComplex *, int, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCtbmv_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, trans, diag, n, k, A, lda, x, incx);
-}
-
-cublasStatus_t CUBLASWINAPI cublasZtbmv_v2(cublasHandle_t handle,
-                                           cublasFillMode_t uplo,
-                                           cublasOperation_t trans,
-                                           cublasDiagType_t diag, int n, int k,
-                                           const cuDoubleComplex *A, int lda,
-                                           cuDoubleComplex *x, int incx) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t,
-      int, int, const cuDoubleComplex *, int, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZtbmv_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, trans, diag, n, k, A, lda, x, incx);
-}
-
-cublasStatus_t CUBLASWINAPI cublasStpmv_v2(
-    cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans,
-    cublasDiagType_t diag, int n, const float *AP, float *x, int incx) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t,
-      int, const float *, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasStpmv_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, trans, diag, n, AP, x, incx);
-}
-
-cublasStatus_t CUBLASWINAPI cublasDtpmv_v2(
-    cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans,
-    cublasDiagType_t diag, int n, const double *AP, double *x, int incx) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t,
-      int, const double *, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDtpmv_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, trans, diag, n, AP, x, incx);
-}
-
-cublasStatus_t CUBLASWINAPI cublasCtpmv_v2(
-    cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans,
-    cublasDiagType_t diag, int n, const cuComplex *AP, cuComplex *x, int incx) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t,
-      int, const cuComplex *, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCtpmv_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, trans, diag, n, AP, x, incx);
-}
-
-cublasStatus_t CUBLASWINAPI cublasZtpmv_v2(cublasHandle_t handle,
-                                           cublasFillMode_t uplo,
-                                           cublasOperation_t trans,
-                                           cublasDiagType_t diag, int n,
-                                           const cuDoubleComplex *AP,
-                                           cuDoubleComplex *x, int incx) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t,
-      int, const cuDoubleComplex *, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZtpmv_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, trans, diag, n, AP, x, incx);
-}
-
-cublasStatus_t CUBLASWINAPI cublasStrsv_v2(
-    cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans,
-    cublasDiagType_t diag, int n, const float *A, int lda, float *x, int incx) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t,
-      int, const float *, int, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasStrsv_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, trans, diag, n, A, lda, x, incx);
-}
-
-cublasStatus_t CUBLASWINAPI cublasDtrsv_v2(cublasHandle_t handle,
-                                           cublasFillMode_t uplo,
-                                           cublasOperation_t trans,
-                                           cublasDiagType_t diag, int n,
-                                           const double *A, int lda, double *x,
-                                           int incx) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t,
-      int, const double *, int, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDtrsv_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, trans, diag, n, A, lda, x, incx);
-}
-
-cublasStatus_t CUBLASWINAPI cublasCtrsv_v2(cublasHandle_t handle,
-                                           cublasFillMode_t uplo,
-                                           cublasOperation_t trans,
-                                           cublasDiagType_t diag, int n,
-                                           const cuComplex *A, int lda,
-                                           cuComplex *x, int incx) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t,
-      int, const cuComplex *, int, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCtrsv_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, trans, diag, n, A, lda, x, incx);
-}
-
-cublasStatus_t CUBLASWINAPI cublasZtrsv_v2(cublasHandle_t handle,
-                                           cublasFillMode_t uplo,
-                                           cublasOperation_t trans,
-                                           cublasDiagType_t diag, int n,
-                                           const cuDoubleComplex *A, int lda,
-                                           cuDoubleComplex *x, int incx) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t,
-      int, const cuDoubleComplex *, int, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZtrsv_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, trans, diag, n, A, lda, x, incx);
-}
-
-cublasStatus_t CUBLASWINAPI cublasStpsv_v2(
-    cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans,
-    cublasDiagType_t diag, int n, const float *AP, float *x, int incx) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t,
-      int, const float *, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasStpsv_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, trans, diag, n, AP, x, incx);
-}
-
-cublasStatus_t CUBLASWINAPI cublasDtpsv_v2(
-    cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans,
-    cublasDiagType_t diag, int n, const double *AP, double *x, int incx) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t,
-      int, const double *, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDtpsv_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, trans, diag, n, AP, x, incx);
-}
-
-cublasStatus_t CUBLASWINAPI cublasCtpsv_v2(
-    cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans,
-    cublasDiagType_t diag, int n, const cuComplex *AP, cuComplex *x, int incx) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t,
-      int, const cuComplex *, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCtpsv_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, trans, diag, n, AP, x, incx);
-}
-
-cublasStatus_t CUBLASWINAPI cublasZtpsv_v2(cublasHandle_t handle,
-                                           cublasFillMode_t uplo,
-                                           cublasOperation_t trans,
-                                           cublasDiagType_t diag, int n,
-                                           const cuDoubleComplex *AP,
-                                           cuDoubleComplex *x, int incx) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t,
-      int, const cuDoubleComplex *, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZtpsv_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, trans, diag, n, AP, x, incx);
-}
-
-cublasStatus_t CUBLASWINAPI cublasStbsv_v2(cublasHandle_t handle,
-                                           cublasFillMode_t uplo,
-                                           cublasOperation_t trans,
-                                           cublasDiagType_t diag, int n, int k,
-                                           const float *A, int lda, float *x,
-                                           int incx) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t,
-      int, int, const float *, int, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasStbsv_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, trans, diag, n, k, A, lda, x, incx);
-}
-
-cublasStatus_t CUBLASWINAPI cublasDtbsv_v2(cublasHandle_t handle,
-                                           cublasFillMode_t uplo,
-                                           cublasOperation_t trans,
-                                           cublasDiagType_t diag, int n, int k,
-                                           const double *A, int lda, double *x,
-                                           int incx) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t,
-      int, int, const double *, int, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDtbsv_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, trans, diag, n, k, A, lda, x, incx);
-}
-
-cublasStatus_t CUBLASWINAPI cublasCtbsv_v2(cublasHandle_t handle,
-                                           cublasFillMode_t uplo,
-                                           cublasOperation_t trans,
-                                           cublasDiagType_t diag, int n, int k,
-                                           const cuComplex *A, int lda,
-                                           cuComplex *x, int incx) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t,
-      int, int, const cuComplex *, int, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCtbsv_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, trans, diag, n, k, A, lda, x, incx);
-}
-
-cublasStatus_t CUBLASWINAPI cublasZtbsv_v2(cublasHandle_t handle,
-                                           cublasFillMode_t uplo,
-                                           cublasOperation_t trans,
-                                           cublasDiagType_t diag, int n, int k,
-                                           const cuDoubleComplex *A, int lda,
-                                           cuDoubleComplex *x, int incx) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t,
-      int, int, const cuDoubleComplex *, int, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZtbsv_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, trans, diag, n, k, A, lda, x, incx);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasSsymv_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n,
-               const float *alpha, /* host or device pointer */
-               const float *A, int lda, const float *x, int incx,
-               const float *beta, /* host or device pointer */
-               float *y, int incy) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, int, const float *, const float *, int,
-      const float *, int, const float *, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSsymv_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, alpha, A, lda, x, incx, beta, y, incy);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasDsymv_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n,
-               const double *alpha, /* host or device pointer */
-               const double *A, int lda, const double *x, int incx,
-               const double *beta, /* host or device pointer */
-               double *y, int incy) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, int, const double *, const double *,
-      int, const double *, int, const double *, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDsymv_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, alpha, A, lda, x, incx, beta, y, incy);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasCsymv_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n,
-               const cuComplex *alpha, /* host or device pointer */
-               const cuComplex *A, int lda, const cuComplex *x, int incx,
-               const cuComplex *beta, /* host or device pointer */
-               cuComplex *y, int incy) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, int, const cuComplex *,
-      const cuComplex *, int, const cuComplex *, int, const cuComplex *,
-      cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCsymv_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, alpha, A, lda, x, incx, beta, y, incy);
-}
-
-cublasStatus_t CUBLASWINAPI cublasZsymv_v2(
-    cublasHandle_t handle, cublasFillMode_t uplo, int n,
-    const cuDoubleComplex *alpha, /* host or device pointer */
-    const cuDoubleComplex *A, int lda, const cuDoubleComplex *x, int incx,
-    const cuDoubleComplex *beta, /* host or device pointer */
-    cuDoubleComplex *y, int incy) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, int, const cuDoubleComplex *,
-      const cuDoubleComplex *, int, const cuDoubleComplex *, int,
-      const cuDoubleComplex *, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZsymv_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, alpha, A, lda, x, incx, beta, y, incy);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasChemv_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n,
-               const cuComplex *alpha, /* host or device pointer */
-               const cuComplex *A, int lda, const cuComplex *x, int incx,
-               const cuComplex *beta, /* host or device pointer */
-               cuComplex *y, int incy) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, int, const cuComplex *,
-      const cuComplex *, int, const cuComplex *, int, const cuComplex *,
-      cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasChemv_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, alpha, A, lda, x, incx, beta, y, incy);
-}
-
-cublasStatus_t CUBLASWINAPI cublasZhemv_v2(
-    cublasHandle_t handle, cublasFillMode_t uplo, int n,
-    const cuDoubleComplex *alpha, /* host or device pointer */
-    const cuDoubleComplex *A, int lda, const cuDoubleComplex *x, int incx,
-    const cuDoubleComplex *beta, /* host or device pointer */
-    cuDoubleComplex *y, int incy) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, int, const cuDoubleComplex *,
-      const cuDoubleComplex *, int, const cuDoubleComplex *, int,
-      const cuDoubleComplex *, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZhemv_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, alpha, A, lda, x, incx, beta, y, incy);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasSsbmv_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n, int k,
-               const float *alpha, /* host or device pointer */
-               const float *A, int lda, const float *x, int incx,
-               const float *beta, /* host or device pointer */
-               float *y, int incy) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, int, int, const float *, const float *,
-      int, const float *, int, const float *, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSsbmv_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, k, alpha, A, lda, x, incx, beta, y, incy);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasDsbmv_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n, int k,
-               const double *alpha, /* host or device pointer */
-               const double *A, int lda, const double *x, int incx,
-               const double *beta, /* host or device pointer */
-               double *y, int incy) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, int, int, const double *,
-      const double *, int, const double *, int, const double *, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDsbmv_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, k, alpha, A, lda, x, incx, beta, y, incy);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasChbmv_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n, int k,
-               const cuComplex *alpha, /* host or device pointer */
-               const cuComplex *A, int lda, const cuComplex *x, int incx,
-               const cuComplex *beta, /* host or device pointer */
-               cuComplex *y, int incy) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, int, int, const cuComplex *,
-      const cuComplex *, int, const cuComplex *, int, const cuComplex *,
-      cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasChbmv_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, k, alpha, A, lda, x, incx, beta, y, incy);
-}
-
-cublasStatus_t CUBLASWINAPI cublasZhbmv_v2(
-    cublasHandle_t handle, cublasFillMode_t uplo, int n, int k,
-    const cuDoubleComplex *alpha, /* host or device pointer */
-    const cuDoubleComplex *A, int lda, const cuDoubleComplex *x, int incx,
-    const cuDoubleComplex *beta, /* host or device pointer */
-    cuDoubleComplex *y, int incy) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, int, int, const cuDoubleComplex *,
-      const cuDoubleComplex *, int, const cuDoubleComplex *, int,
-      const cuDoubleComplex *, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZhbmv_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, k, alpha, A, lda, x, incx, beta, y, incy);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasSspmv_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n,
-               const float *alpha, /* host or device pointer */
-               const float *AP, const float *x, int incx,
-               const float *beta, /* host or device pointer */
-               float *y, int incy) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, int, const float *, const float *,
-      const float *, int, const float *, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSspmv_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, alpha, AP, x, incx, beta, y, incy);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasDspmv_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n,
-               const double *alpha, /* host or device pointer */
-               const double *AP, const double *x, int incx,
-               const double *beta, /* host or device pointer */
-               double *y, int incy) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, int, const double *, const double *,
-      const double *, int, const double *, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDspmv_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, alpha, AP, x, incx, beta, y, incy);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasChpmv_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n,
-               const cuComplex *alpha, /* host or device pointer */
-               const cuComplex *AP, const cuComplex *x, int incx,
-               const cuComplex *beta, /* host or device pointer */
-               cuComplex *y, int incy) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, int, const cuComplex *,
-      const cuComplex *, const cuComplex *, int, const cuComplex *, cuComplex *,
-      int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasChpmv_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, alpha, AP, x, incx, beta, y, incy);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasZhpmv_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n,
-               const cuDoubleComplex *alpha, /* host or device pointer */
-               const cuDoubleComplex *AP, const cuDoubleComplex *x, int incx,
-               const cuDoubleComplex *beta, /* host or device pointer */
-               cuDoubleComplex *y, int incy) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, int, const cuDoubleComplex *,
-      const cuDoubleComplex *, const cuDoubleComplex *, int,
-      const cuDoubleComplex *, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZhpmv_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, alpha, AP, x, incx, beta, y, incy);
-}
-
-cublasStatus_t CUBLASWINAPI cublasSger_v2(
-    cublasHandle_t handle, int m, int n,
-    const float *alpha, /* host or device pointer */
-    const float *x, int incx, const float *y, int incy, float *A, int lda) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, int, const float *, const float *, int,
-      const float *, int, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSger_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, alpha, x, incx, y, incy, A, lda);
-}
-
-cublasStatus_t CUBLASWINAPI cublasDger_v2(
-    cublasHandle_t handle, int m, int n,
-    const double *alpha, /* host or device pointer */
-    const double *x, int incx, const double *y, int incy, double *A, int lda) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, int, const double *, const double *, int,
-      const double *, int, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDger_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, alpha, x, incx, y, incy, A, lda);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasCgeru_v2(cublasHandle_t handle, int m, int n,
-               const cuComplex *alpha, /* host or device pointer */
-               const cuComplex *x, int incx, const cuComplex *y, int incy,
-               cuComplex *A, int lda) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, int, const cuComplex *, const cuComplex *, int,
-      const cuComplex *, int, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCgeru_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, alpha, x, incx, y, incy, A, lda);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasCgerc_v2(cublasHandle_t handle, int m, int n,
-               const cuComplex *alpha, /* host or device pointer */
-               const cuComplex *x, int incx, const cuComplex *y, int incy,
-               cuComplex *A, int lda) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, int, const cuComplex *, const cuComplex *, int,
-      const cuComplex *, int, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCgerc_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, alpha, x, incx, y, incy, A, lda);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasZgeru_v2(cublasHandle_t handle, int m, int n,
-               const cuDoubleComplex *alpha, /* host or device pointer */
-               const cuDoubleComplex *x, int incx, const cuDoubleComplex *y,
-               int incy, cuDoubleComplex *A, int lda) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, int, const cuDoubleComplex *,
-      const cuDoubleComplex *, int, const cuDoubleComplex *, int,
-      cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZgeru_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, alpha, x, incx, y, incy, A, lda);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasZgerc_v2(cublasHandle_t handle, int m, int n,
-               const cuDoubleComplex *alpha, /* host or device pointer */
-               const cuDoubleComplex *x, int incx, const cuDoubleComplex *y,
-               int incy, cuDoubleComplex *A, int lda) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, int, const cuDoubleComplex *,
-      const cuDoubleComplex *, int, const cuDoubleComplex *, int,
-      cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZgerc_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, alpha, x, incx, y, incy, A, lda);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasSsyr_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n,
-              const float *alpha, /* host or device pointer */
-              const float *x, int incx, float *A, int lda) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, int, const float *, const float *, int,
-      float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSsyr_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, alpha, x, incx, A, lda);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasDsyr_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n,
-              const double *alpha, /* host or device pointer */
-              const double *x, int incx, double *A, int lda) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, int, const double *, const double *,
-      int, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDsyr_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, alpha, x, incx, A, lda);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasCsyr_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n,
-              const cuComplex *alpha, /* host or device pointer */
-              const cuComplex *x, int incx, cuComplex *A, int lda) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, int, const cuComplex *,
-      const cuComplex *, int, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCsyr_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, alpha, x, incx, A, lda);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasZsyr_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n,
-              const cuDoubleComplex *alpha, /* host or device pointer */
-              const cuDoubleComplex *x, int incx, cuDoubleComplex *A, int lda) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, int, const cuDoubleComplex *,
-      const cuDoubleComplex *, int, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZsyr_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, alpha, x, incx, A, lda);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasCher_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n,
-              const float *alpha, /* host or device pointer */
-              const cuComplex *x, int incx, cuComplex *A, int lda) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, int, const float *, const cuComplex *,
-      int, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCher_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, alpha, x, incx, A, lda);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasZher_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n,
-              const double *alpha, /* host or device pointer */
-              const cuDoubleComplex *x, int incx, cuDoubleComplex *A, int lda) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, int, const double *,
-      const cuDoubleComplex *, int, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZher_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, alpha, x, incx, A, lda);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasSspr_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n,
-              const float *alpha, /* host or device pointer */
-              const float *x, int incx, float *AP) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, int, const float *, const float *, int,
-      float *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSspr_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, alpha, x, incx, AP);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasDspr_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n,
-              const double *alpha, /* host or device pointer */
-              const double *x, int incx, double *AP) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, int, const double *, const double *,
-      int, double *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDspr_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, alpha, x, incx, AP);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasChpr_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n,
-              const float *alpha, /* host or device pointer */
-              const cuComplex *x, int incx, cuComplex *AP) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, int, const float *, const cuComplex *,
-      int, cuComplex *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasChpr_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, alpha, x, incx, AP);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasZhpr_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n,
-              const double *alpha, /* host or device pointer */
-              const cuDoubleComplex *x, int incx, cuDoubleComplex *AP) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, int, const double *,
-      const cuDoubleComplex *, int, cuDoubleComplex *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZhpr_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, alpha, x, incx, AP);
-}
-
-cublasStatus_t CUBLASWINAPI cublasSsyr2_v2(
-    cublasHandle_t handle, cublasFillMode_t uplo, int n,
-    const float *alpha, /* host or device pointer */
-    const float *x, int incx, const float *y, int incy, float *A, int lda) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, int, const float *, const float *, int,
-      const float *, int, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSsyr2_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, alpha, x, incx, y, incy, A, lda);
-}
-
-cublasStatus_t CUBLASWINAPI cublasDsyr2_v2(
-    cublasHandle_t handle, cublasFillMode_t uplo, int n,
-    const double *alpha, /* host or device pointer */
-    const double *x, int incx, const double *y, int incy, double *A, int lda) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, int, const double *, const double *,
-      int, const double *, int, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDsyr2_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, alpha, x, incx, y, incy, A, lda);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasCsyr2_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n,
-               const cuComplex *alpha, /* host or device pointer */
-               const cuComplex *x, int incx, const cuComplex *y, int incy,
-               cuComplex *A, int lda) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, int, const cuComplex *,
-      const cuComplex *, int, const cuComplex *, int, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCsyr2_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, alpha, x, incx, y, incy, A, lda);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasZsyr2_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n,
-               const cuDoubleComplex *alpha, /* host or device pointer */
-               const cuDoubleComplex *x, int incx, const cuDoubleComplex *y,
-               int incy, cuDoubleComplex *A, int lda) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, int, const cuDoubleComplex *,
-      const cuDoubleComplex *, int, const cuDoubleComplex *, int,
-      cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZsyr2_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, alpha, x, incx, y, incy, A, lda);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasCher2_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n,
-               const cuComplex *alpha, /* host or device pointer */
-               const cuComplex *x, int incx, const cuComplex *y, int incy,
-               cuComplex *A, int lda) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, int, const cuComplex *,
-      const cuComplex *, int, const cuComplex *, int, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCher2_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, alpha, x, incx, y, incy, A, lda);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasZher2_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n,
-               const cuDoubleComplex *alpha, /* host or device pointer */
-               const cuDoubleComplex *x, int incx, const cuDoubleComplex *y,
-               int incy, cuDoubleComplex *A, int lda) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, int, const cuDoubleComplex *,
-      const cuDoubleComplex *, int, const cuDoubleComplex *, int,
-      cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZher2_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, alpha, x, incx, y, incy, A, lda);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasSspr2_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n,
-               const float *alpha, /* host or device pointer */
-               const float *x, int incx, const float *y, int incy, float *AP) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, int, const float *, const float *, int,
-      const float *, int, float *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSspr2_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, alpha, x, incx, y, incy, AP);
-}
-
-cublasStatus_t CUBLASWINAPI cublasDspr2_v2(
-    cublasHandle_t handle, cublasFillMode_t uplo, int n,
-    const double *alpha, /* host or device pointer */
-    const double *x, int incx, const double *y, int incy, double *AP) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, int, const double *, const double *,
-      int, const double *, int, double *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDspr2_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, alpha, x, incx, y, incy, AP);
-}
-
-cublasStatus_t CUBLASWINAPI cublasChpr2_v2(
-    cublasHandle_t handle, cublasFillMode_t uplo, int n,
-    const cuComplex *alpha, /* host or device pointer */
-    const cuComplex *x, int incx, const cuComplex *y, int incy, cuComplex *AP) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, int, const cuComplex *,
-      const cuComplex *, int, const cuComplex *, int, cuComplex *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasChpr2_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, alpha, x, incx, y, incy, AP);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasZhpr2_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n,
-               const cuDoubleComplex *alpha, /* host or device pointer */
-               const cuDoubleComplex *x, int incx, const cuDoubleComplex *y,
-               int incy, cuDoubleComplex *AP) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, int, const cuDoubleComplex *,
-      const cuDoubleComplex *, int, const cuDoubleComplex *, int,
-      cuDoubleComplex *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZhpr2_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, alpha, x, incx, y, incy, AP);
-}
-
-cublasStatus_t CUBLASWINAPI cublasSgemvBatched(
-    cublasHandle_t handle, cublasOperation_t trans, int m, int n,
-    const float *alpha, /* host or device pointer */
-    const float *const Aarray[], int lda, const float *const xarray[], int incx,
-    const float *beta, /* host or device pointer */
-    float *const yarray[], int incy, int batchCount) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasOperation_t, int, int, const float *,
-      const float *const[], int, const float *const[], int, const float *,
-      float *const[], int, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSgemvBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, trans, m, n, alpha, Aarray, lda, xarray, incx, beta,
-                  yarray, incy, batchCount);
-}
-
-cublasStatus_t CUBLASWINAPI cublasDgemvBatched(
-    cublasHandle_t handle, cublasOperation_t trans, int m, int n,
-    const double *alpha, /* host or device pointer */
-    const double *const Aarray[], int lda, const double *const xarray[],
-    int incx, const double *beta, /* host or device pointer */
-    double *const yarray[], int incy, int batchCount) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasOperation_t, int, int, const double *,
-      const double *const[], int, const double *const[], int, const double *,
-      double *const[], int, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDgemvBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, trans, m, n, alpha, Aarray, lda, xarray, incx, beta,
-                  yarray, incy, batchCount);
-}
-
-cublasStatus_t CUBLASWINAPI cublasCgemvBatched(
-    cublasHandle_t handle, cublasOperation_t trans, int m, int n,
-    const cuComplex *alpha, /* host or device pointer */
-    const cuComplex *const Aarray[], int lda, const cuComplex *const xarray[],
-    int incx, const cuComplex *beta, /* host or device pointer */
-    cuComplex *const yarray[], int incy, int batchCount) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasOperation_t, int, int, const cuComplex *,
-      const cuComplex *const[], int, const cuComplex *const[], int,
-      const cuComplex *, cuComplex *const[], int, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCgemvBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, trans, m, n, alpha, Aarray, lda, xarray, incx, beta,
-                  yarray, incy, batchCount);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasZgemvBatched(cublasHandle_t handle, cublasOperation_t trans, int m, int n,
-                   const cuDoubleComplex *alpha, /* host or device pointer */
-                   const cuDoubleComplex *const Aarray[], int lda,
-                   const cuDoubleComplex *const xarray[], int incx,
-                   const cuDoubleComplex *beta, /* host or device pointer */
-                   cuDoubleComplex *const yarray[], int incy, int batchCount) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasOperation_t, int, int, const cuDoubleComplex *,
-      const cuDoubleComplex *const[], int, const cuDoubleComplex *const[], int,
-      const cuDoubleComplex *, cuDoubleComplex *const[], int, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZgemvBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, trans, m, n, alpha, Aarray, lda, xarray, incx, beta,
-                  yarray, incy, batchCount);
-}
-
-cublasStatus_t CUBLASWINAPI cublasSgemvStridedBatched(
-    cublasHandle_t handle, cublasOperation_t trans, int m, int n,
-    const float *alpha,                             /* host or device pointer */
-    const float *A, int lda, long long int strideA, /* purposely signed */
-    const float *x, int incx, long long int stridex,
-    const float *beta, /* host or device pointer */
-    float *y, int incy, long long int stridey, int batchCount) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasOperation_t, int, int, const float *, const float *,
-      int, long long, const float *, int, long long, const float *, float *,
-      int, long long, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSgemvStridedBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, trans, m, n, alpha, A, lda, strideA, x, incx, stridex,
-                  beta, y, incy, stridey, batchCount);
-}
-
-cublasStatus_t CUBLASWINAPI cublasDgemvStridedBatched(
-    cublasHandle_t handle, cublasOperation_t trans, int m, int n,
-    const double *alpha, /* host or device pointer */
-    const double *A, int lda, long long int strideA, /* purposely signed */
-    const double *x, int incx, long long int stridex,
-    const double *beta, /* host or device pointer */
-    double *y, int incy, long long int stridey, int batchCount) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasOperation_t, int, int, const double *,
-      const double *, int, long long, const double *, int, long long,
-      const double *, double *, int, long long, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDgemvStridedBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, trans, m, n, alpha, A, lda, strideA, x, incx, stridex,
-                  beta, y, incy, stridey, batchCount);
-}
-
-cublasStatus_t CUBLASWINAPI cublasCgemvStridedBatched(
-    cublasHandle_t handle, cublasOperation_t trans, int m, int n,
-    const cuComplex *alpha, /* host or device pointer */
-    const cuComplex *A, int lda, long long int strideA, /* purposely signed */
-    const cuComplex *x, int incx, long long int stridex,
-    const cuComplex *beta, /* host or device pointer */
-    cuComplex *y, int incy, long long int stridey, int batchCount) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasOperation_t, int, int, const cuComplex *,
-      const cuComplex *, int, long long, const cuComplex *, int, long long,
-      const cuComplex *, cuComplex *, int, long long, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCgemvStridedBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, trans, m, n, alpha, A, lda, strideA, x, incx, stridex,
-                  beta, y, incy, stridey, batchCount);
-}
-
-cublasStatus_t CUBLASWINAPI cublasZgemvStridedBatched(
-    cublasHandle_t handle, cublasOperation_t trans, int m, int n,
-    const cuDoubleComplex *alpha, /* host or device pointer */
-    const cuDoubleComplex *A, int lda,
-    long long int strideA, /* purposely signed */
-    const cuDoubleComplex *x, int incx, long long int stridex,
-    const cuDoubleComplex *beta, /* host or device pointer */
-    cuDoubleComplex *y, int incy, long long int stridey, int batchCount) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasOperation_t, int, int, const cuDoubleComplex *,
-      const cuDoubleComplex *, int, long long, const cuDoubleComplex *, int,
-      long long, const cuDoubleComplex *, cuDoubleComplex *, int, long long,
-      int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZgemvStridedBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, trans, m, n, alpha, A, lda, strideA, x, incx, stridex,
-                  beta, y, incy, stridey, batchCount);
-}
-
-cublasStatus_t CUBLASWINAPI cublasSgemm_v2(
-    cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
-    int m, int n, int k, const float *alpha, /* host or device pointer */
-    const float *A, int lda, const float *B, int ldb,
-    const float *beta, /* host or device pointer */
-    float *C, int ldc) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int,
-      const float *, const float *, int, const float *, int, const float *,
-      float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSgemm_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta,
-                  C, ldc);
-}
-
-cublasStatus_t CUBLASWINAPI cublasDgemm_v2(
-    cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
-    int m, int n, int k, const double *alpha, /* host or device pointer */
-    const double *A, int lda, const double *B, int ldb,
-    const double *beta, /* host or device pointer */
-    double *C, int ldc) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int,
-      const double *, const double *, int, const double *, int, const double *,
-      double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDgemm_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta,
-                  C, ldc);
-}
-
-cublasStatus_t CUBLASWINAPI cublasCgemm_v2(
-    cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
-    int m, int n, int k, const cuComplex *alpha, /* host or device pointer */
-    const cuComplex *A, int lda, const cuComplex *B, int ldb,
-    const cuComplex *beta, /* host or device pointer */
-    cuComplex *C, int ldc) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int,
-      const cuComplex *, const cuComplex *, int, const cuComplex *, int,
-      const cuComplex *, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCgemm_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta,
-                  C, ldc);
-}
-
-cublasStatus_t CUBLASWINAPI cublasCgemm3m(
-    cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
-    int m, int n, int k, const cuComplex *alpha, /* host or device pointer */
-    const cuComplex *A, int lda, const cuComplex *B, int ldb,
-    const cuComplex *beta, /* host or device pointer */
-    cuComplex *C, int ldc) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int,
-      const cuComplex *, const cuComplex *, int, const cuComplex *, int,
-      const cuComplex *, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCgemm3m");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta,
-                  C, ldc);
-}
-
-cublasStatus_t CUBLASWINAPI cublasCgemm3mEx(
-    cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
-    int m, int n, int k, const cuComplex *alpha, const void *A,
-    cudaDataType Atype, int lda, const void *B, cudaDataType Btype, int ldb,
-    const cuComplex *beta, void *C, cudaDataType Ctype, int ldc) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int,
-      const cuComplex *, const void *, cudaDataType, int, const void *,
-      cudaDataType, int, const cuComplex *, void *, cudaDataType, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCgemm3mEx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transa, transb, m, n, k, alpha, A, Atype, lda, B,
-                  Btype, ldb, beta, C, Ctype, ldc);
-}
-
-cublasStatus_t CUBLASWINAPI cublasZgemm_v2(
-    cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
-    int m, int n, int k,
-    const cuDoubleComplex *alpha, /* host or device pointer */
-    const cuDoubleComplex *A, int lda, const cuDoubleComplex *B, int ldb,
-    const cuDoubleComplex *beta, /* host or device pointer */
-    cuDoubleComplex *C, int ldc) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int,
-      const cuDoubleComplex *, const cuDoubleComplex *, int,
-      const cuDoubleComplex *, int, const cuDoubleComplex *, cuDoubleComplex *,
-      int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZgemm_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta,
-                  C, ldc);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasZgemm3m(cublasHandle_t handle, cublasOperation_t transa,
-              cublasOperation_t transb, int m, int n, int k,
-              const cuDoubleComplex *alpha, /* host or device pointer */
-              const cuDoubleComplex *A, int lda, const cuDoubleComplex *B,
-              int ldb, const cuDoubleComplex *beta, /* host or device pointer */
-              cuDoubleComplex *C, int ldc) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int,
-      const cuDoubleComplex *, const cuDoubleComplex *, int,
-      const cuDoubleComplex *, int, const cuDoubleComplex *, cuDoubleComplex *,
-      int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZgemm3m");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta,
-                  C, ldc);
-}
-
-cublasStatus_t CUBLASWINAPI cublasSgemmEx(
-    cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
-    int m, int n, int k, const float *alpha, /* host or device pointer */
-    const void *A, cudaDataType Atype, int lda, const void *B,
-    cudaDataType Btype, int ldb, const float *beta, /* host or device pointer */
-    void *C, cudaDataType Ctype, int ldc) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int,
-      const float *, const void *, cudaDataType, int, const void *,
-      cudaDataType, int, const float *, void *, cudaDataType, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSgemmEx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transa, transb, m, n, k, alpha, A, Atype, lda, B,
-                  Btype, ldb, beta, C, Ctype, ldc);
-}
-
-cublasStatus_t CUBLASWINAPI cublasGemmEx(
-    cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
-    int m, int n, int k, const void *alpha, /* host or device pointer */
-    const void *A, cudaDataType Atype, int lda, const void *B,
-    cudaDataType Btype, int ldb, const void *beta, /* host or device pointer */
-    void *C, cudaDataType Ctype, int ldc, cublasComputeType_t computeType,
-    cublasGemmAlgo_t algo) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int,
-      const void *, const void *, cudaDataType, int, const void *, cudaDataType,
-      int, const void *, void *, cudaDataType, int, cublasComputeType_t,
-      cublasGemmAlgo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasGemmEx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transa, transb, m, n, k, alpha, A, Atype, lda, B,
-                  Btype, ldb, beta, C, Ctype, ldc, computeType, algo);
-}
-
-cublasStatus_t CUBLASWINAPI cublasCgemmEx(
-    cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
-    int m, int n, int k, const cuComplex *alpha, const void *A,
-    cudaDataType Atype, int lda, const void *B, cudaDataType Btype, int ldb,
-    const cuComplex *beta, void *C, cudaDataType Ctype, int ldc) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int,
-      const cuComplex *, const void *, cudaDataType, int, const void *,
-      cudaDataType, int, const cuComplex *, void *, cudaDataType, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCgemmEx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transa, transb, m, n, k, alpha, A, Atype, lda, B,
-                  Btype, ldb, beta, C, Ctype, ldc);
-}
-
-cublasStatus_t CUBLASWINAPI cublasUint8gemmBias(
-    cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
-    cublasOperation_t transc, int m, int n, int k, const unsigned char *A,
-    int A_bias, int lda, const unsigned char *B, int B_bias, int ldb,
-    unsigned char *C, int C_bias, int ldc, int C_mult, int C_shift) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasOperation_t, cublasOperation_t, cublasOperation_t,
-      int, int, int, const unsigned char *, int, int, const unsigned char *,
-      int, int, unsigned char *, int, int, int, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasUint8gemmBias");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transa, transb, transc, m, n, k, A, A_bias, lda, B,
-                  B_bias, ldb, C, C_bias, ldc, C_mult, C_shift);
-}
-
-cublasStatus_t CUBLASWINAPI cublasSsyrk_v2(
-    cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans,
-    int n, int k, const float *alpha,           /* host or device pointer */
-    const float *A, int lda, const float *beta, /* host or device pointer */
-    float *C, int ldc) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int,
-      const float *, const float *, int, const float *, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSsyrk_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, trans, n, k, alpha, A, lda, beta, C, ldc);
-}
-
-cublasStatus_t CUBLASWINAPI cublasDsyrk_v2(
-    cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans,
-    int n, int k, const double *alpha,            /* host or device pointer */
-    const double *A, int lda, const double *beta, /* host or device pointer */
-    double *C, int ldc) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int,
-      const double *, const double *, int, const double *, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDsyrk_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, trans, n, k, alpha, A, lda, beta, C, ldc);
-}
-
-cublasStatus_t CUBLASWINAPI cublasCsyrk_v2(
-    cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans,
-    int n, int k, const cuComplex *alpha, /* host or device pointer */
-    const cuComplex *A, int lda,
-    const cuComplex *beta, /* host or device pointer */
-    cuComplex *C, int ldc) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int,
-      const cuComplex *, const cuComplex *, int, const cuComplex *, cuComplex *,
-      int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCsyrk_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, trans, n, k, alpha, A, lda, beta, C, ldc);
-}
-
-cublasStatus_t CUBLASWINAPI cublasZsyrk_v2(
-    cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans,
-    int n, int k, const cuDoubleComplex *alpha, /* host or device pointer */
-    const cuDoubleComplex *A, int lda,
-    const cuDoubleComplex *beta, /* host or device pointer */
-    cuDoubleComplex *C, int ldc) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int,
-      const cuDoubleComplex *, const cuDoubleComplex *, int,
-      const cuDoubleComplex *, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZsyrk_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, trans, n, k, alpha, A, lda, beta, C, ldc);
-}
-
-cublasStatus_t CUBLASWINAPI cublasCsyrkEx(
-    cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans,
-    int n, int k, const cuComplex *alpha, /* host or device pointer */
-    const void *A, cudaDataType Atype, int lda,
-    const cuComplex *beta, /* host or device pointer */
-    void *C, cudaDataType Ctype, int ldc) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int,
-      const cuComplex *, const void *, cudaDataType, int, const cuComplex *,
-      void *, cudaDataType, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCsyrkEx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, trans, n, k, alpha, A, Atype, lda, beta, C,
-                  Ctype, ldc);
-}
-
-cublasStatus_t CUBLASWINAPI cublasCsyrk3mEx(
-    cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans,
-    int n, int k, const cuComplex *alpha, const void *A, cudaDataType Atype,
-    int lda, const cuComplex *beta, void *C, cudaDataType Ctype, int ldc) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int,
-      const cuComplex *, const void *, cudaDataType, int, const cuComplex *,
-      void *, cudaDataType, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCsyrk3mEx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, trans, n, k, alpha, A, Atype, lda, beta, C,
-                  Ctype, ldc);
-}
-
-cublasStatus_t CUBLASWINAPI cublasCherk_v2(
-    cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans,
-    int n, int k, const float *alpha,               /* host or device pointer */
-    const cuComplex *A, int lda, const float *beta, /* host or device pointer */
-    cuComplex *C, int ldc) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int,
-      const float *, const cuComplex *, int, const float *, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCherk_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, trans, n, k, alpha, A, lda, beta, C, ldc);
-}
-
-cublasStatus_t CUBLASWINAPI cublasZherk_v2(
-    cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans,
-    int n, int k, const double *alpha, /* host or device pointer */
-    const cuDoubleComplex *A, int lda,
-    const double *beta, /* host or device pointer */
-    cuDoubleComplex *C, int ldc) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int,
-      const double *, const cuDoubleComplex *, int, const double *,
-      cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZherk_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, trans, n, k, alpha, A, lda, beta, C, ldc);
-}
-
-cublasStatus_t CUBLASWINAPI cublasCherkEx(
-    cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans,
-    int n, int k, const float *alpha, /* host or device pointer */
-    const void *A, cudaDataType Atype, int lda,
-    const float *beta, /* host or device pointer */
-    void *C, cudaDataType Ctype, int ldc) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int,
-      const float *, const void *, cudaDataType, int, const float *, void *,
-      cudaDataType, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCherkEx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, trans, n, k, alpha, A, Atype, lda, beta, C,
-                  Ctype, ldc);
-}
-
-cublasStatus_t CUBLASWINAPI cublasCherk3mEx(
-    cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans,
-    int n, int k, const float *alpha, const void *A, cudaDataType Atype,
-    int lda, const float *beta, void *C, cudaDataType Ctype, int ldc) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int,
-      const float *, const void *, cudaDataType, int, const float *, void *,
-      cudaDataType, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCherk3mEx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, trans, n, k, alpha, A, Atype, lda, beta, C,
-                  Ctype, ldc);
-}
-
-cublasStatus_t CUBLASWINAPI cublasSsyr2k_v2(
-    cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans,
-    int n, int k, const float *alpha, /* host or device pointer */
-    const float *A, int lda, const float *B, int ldb,
-    const float *beta, /* host or device pointer */
-    float *C, int ldc) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int,
-      const float *, const float *, int, const float *, int, const float *,
-      float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSsyr2k_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C,
-                  ldc);
-}
-
-cublasStatus_t CUBLASWINAPI cublasDsyr2k_v2(
-    cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans,
-    int n, int k, const double *alpha, /* host or device pointer */
-    const double *A, int lda, const double *B, int ldb,
-    const double *beta, /* host or device pointer */
-    double *C, int ldc) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int,
-      const double *, const double *, int, const double *, int, const double *,
-      double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDsyr2k_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C,
-                  ldc);
-}
-
-cublasStatus_t CUBLASWINAPI cublasCsyr2k_v2(
-    cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans,
-    int n, int k, const cuComplex *alpha, /* host or device pointer */
-    const cuComplex *A, int lda, const cuComplex *B, int ldb,
-    const cuComplex *beta, /* host or device pointer */
-    cuComplex *C, int ldc) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int,
-      const cuComplex *, const cuComplex *, int, const cuComplex *, int,
-      const cuComplex *, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCsyr2k_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C,
-                  ldc);
-}
-
-cublasStatus_t CUBLASWINAPI cublasZsyr2k_v2(
-    cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans,
-    int n, int k, const cuDoubleComplex *alpha, /* host or device pointer */
-    const cuDoubleComplex *A, int lda, const cuDoubleComplex *B, int ldb,
-    const cuDoubleComplex *beta, /* host or device pointer */
-    cuDoubleComplex *C, int ldc) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int,
-      const cuDoubleComplex *, const cuDoubleComplex *, int,
-      const cuDoubleComplex *, int, const cuDoubleComplex *, cuDoubleComplex *,
-      int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZsyr2k_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C,
-                  ldc);
-}
-
-cublasStatus_t CUBLASWINAPI cublasCher2k_v2(
-    cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans,
-    int n, int k, const cuComplex *alpha, /* host or device pointer */
-    const cuComplex *A, int lda, const cuComplex *B, int ldb,
-    const float *beta, /* host or device pointer */
-    cuComplex *C, int ldc) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int,
-      const cuComplex *, const cuComplex *, int, const cuComplex *, int,
-      const float *, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCher2k_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C,
-                  ldc);
-}
-
-cublasStatus_t CUBLASWINAPI cublasZher2k_v2(
-    cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans,
-    int n, int k, const cuDoubleComplex *alpha, /* host or device pointer */
-    const cuDoubleComplex *A, int lda, const cuDoubleComplex *B, int ldb,
-    const double *beta, /* host or device pointer */
-    cuDoubleComplex *C, int ldc) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int,
-      const cuDoubleComplex *, const cuDoubleComplex *, int,
-      const cuDoubleComplex *, int, const double *, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZher2k_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C,
-                  ldc);
-}
-
-cublasStatus_t CUBLASWINAPI cublasSsyrkx(
-    cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans,
-    int n, int k, const float *alpha, /* host or device pointer */
-    const float *A, int lda, const float *B, int ldb,
-    const float *beta, /* host or device pointer */
-    float *C, int ldc) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int,
-      const float *, const float *, int, const float *, int, const float *,
-      float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSsyrkx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C,
-                  ldc);
-}
-
-cublasStatus_t CUBLASWINAPI cublasDsyrkx(
-    cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans,
-    int n, int k, const double *alpha, /* host or device pointer */
-    const double *A, int lda, const double *B, int ldb,
-    const double *beta, /* host or device pointer */
-    double *C, int ldc) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int,
-      const double *, const double *, int, const double *, int, const double *,
-      double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDsyrkx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C,
-                  ldc);
-}
-
-cublasStatus_t CUBLASWINAPI cublasCsyrkx(
-    cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans,
-    int n, int k, const cuComplex *alpha, /* host or device pointer */
-    const cuComplex *A, int lda, const cuComplex *B, int ldb,
-    const cuComplex *beta, /* host or device pointer */
-    cuComplex *C, int ldc) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int,
-      const cuComplex *, const cuComplex *, int, const cuComplex *, int,
-      const cuComplex *, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCsyrkx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C,
-                  ldc);
-}
-
-cublasStatus_t CUBLASWINAPI cublasZsyrkx(
-    cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans,
-    int n, int k, const cuDoubleComplex *alpha, /* host or device pointer */
-    const cuDoubleComplex *A, int lda, const cuDoubleComplex *B, int ldb,
-    const cuDoubleComplex *beta, /* host or device pointer */
-    cuDoubleComplex *C, int ldc) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int,
-      const cuDoubleComplex *, const cuDoubleComplex *, int,
-      const cuDoubleComplex *, int, const cuDoubleComplex *, cuDoubleComplex *,
-      int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZsyrkx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C,
-                  ldc);
-}
-
-cublasStatus_t CUBLASWINAPI cublasCherkx(
-    cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans,
-    int n, int k, const cuComplex *alpha, /* host or device pointer */
-    const cuComplex *A, int lda, const cuComplex *B, int ldb,
-    const float *beta, /* host or device pointer */
-    cuComplex *C, int ldc) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int,
-      const cuComplex *, const cuComplex *, int, const cuComplex *, int,
-      const float *, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCherkx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C,
-                  ldc);
-}
-
-cublasStatus_t CUBLASWINAPI cublasZherkx(
-    cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans,
-    int n, int k, const cuDoubleComplex *alpha, /* host or device pointer */
-    const cuDoubleComplex *A, int lda, const cuDoubleComplex *B, int ldb,
-    const double *beta, /* host or device pointer */
-    cuDoubleComplex *C, int ldc) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int,
-      const cuDoubleComplex *, const cuDoubleComplex *, int,
-      const cuDoubleComplex *, int, const double *, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZherkx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C,
-                  ldc);
-}
-
-cublasStatus_t CUBLASWINAPI cublasSsymm_v2(
-    cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, int m,
-    int n, const float *alpha, /* host or device pointer */
-    const float *A, int lda, const float *B, int ldb,
-    const float *beta, /* host or device pointer */
-    float *C, int ldc) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasSideMode_t, cublasFillMode_t, int, int,
-      const float *, const float *, int, const float *, int, const float *,
-      float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSsymm_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, side, uplo, m, n, alpha, A, lda, B, ldb, beta, C,
-                  ldc);
-}
-
-cublasStatus_t CUBLASWINAPI cublasDsymm_v2(
-    cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, int m,
-    int n, const double *alpha, /* host or device pointer */
-    const double *A, int lda, const double *B, int ldb,
-    const double *beta, /* host or device pointer */
-    double *C, int ldc) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasSideMode_t, cublasFillMode_t, int, int,
-      const double *, const double *, int, const double *, int, const double *,
-      double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDsymm_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, side, uplo, m, n, alpha, A, lda, B, ldb, beta, C,
-                  ldc);
-}
-
-cublasStatus_t CUBLASWINAPI cublasCsymm_v2(
-    cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, int m,
-    int n, const cuComplex *alpha, /* host or device pointer */
-    const cuComplex *A, int lda, const cuComplex *B, int ldb,
-    const cuComplex *beta, /* host or device pointer */
-    cuComplex *C, int ldc) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasSideMode_t, cublasFillMode_t, int, int,
-      const cuComplex *, const cuComplex *, int, const cuComplex *, int,
-      const cuComplex *, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCsymm_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, side, uplo, m, n, alpha, A, lda, B, ldb, beta, C,
-                  ldc);
-}
-
-cublasStatus_t CUBLASWINAPI cublasZsymm_v2(
-    cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, int m,
-    int n, const cuDoubleComplex *alpha, /* host or device pointer */
-    const cuDoubleComplex *A, int lda, const cuDoubleComplex *B, int ldb,
-    const cuDoubleComplex *beta, /* host or device pointer */
-    cuDoubleComplex *C, int ldc) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasSideMode_t, cublasFillMode_t, int, int,
-      const cuDoubleComplex *, const cuDoubleComplex *, int,
-      const cuDoubleComplex *, int, const cuDoubleComplex *, cuDoubleComplex *,
-      int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZsymm_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, side, uplo, m, n, alpha, A, lda, B, ldb, beta, C,
-                  ldc);
-}
-
-cublasStatus_t CUBLASWINAPI cublasChemm_v2(
-    cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, int m,
-    int n, const cuComplex *alpha, /* host or device pointer */
-    const cuComplex *A, int lda, const cuComplex *B, int ldb,
-    const cuComplex *beta, /* host or device pointer */
-    cuComplex *C, int ldc) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasSideMode_t, cublasFillMode_t, int, int,
-      const cuComplex *, const cuComplex *, int, const cuComplex *, int,
-      const cuComplex *, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasChemm_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, side, uplo, m, n, alpha, A, lda, B, ldb, beta, C,
-                  ldc);
-}
-
-cublasStatus_t CUBLASWINAPI cublasZhemm_v2(
-    cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, int m,
-    int n, const cuDoubleComplex *alpha, /* host or device pointer */
-    const cuDoubleComplex *A, int lda, const cuDoubleComplex *B, int ldb,
-    const cuDoubleComplex *beta, /* host or device pointer */
-    cuDoubleComplex *C, int ldc) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasSideMode_t, cublasFillMode_t, int, int,
-      const cuDoubleComplex *, const cuDoubleComplex *, int,
-      const cuDoubleComplex *, int, const cuDoubleComplex *, cuDoubleComplex *,
-      int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZhemm_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, side, uplo, m, n, alpha, A, lda, B, ldb, beta, C,
-                  ldc);
-}
-
-cublasStatus_t CUBLASWINAPI cublasStrsm_v2(
-    cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo,
-    cublasOperation_t trans, cublasDiagType_t diag, int m, int n,
-    const float *alpha, /* host or device pointer */
-    const float *A, int lda, float *B, int ldb) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasSideMode_t, cublasFillMode_t, cublasOperation_t,
-      cublasDiagType_t, int, int, const float *, const float *, int, float *,
-      int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasStrsm_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb);
-}
-
-cublasStatus_t CUBLASWINAPI cublasDtrsm_v2(
-    cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo,
-    cublasOperation_t trans, cublasDiagType_t diag, int m, int n,
-    const double *alpha, /* host or device pointer */
-    const double *A, int lda, double *B, int ldb) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasSideMode_t, cublasFillMode_t, cublasOperation_t,
-      cublasDiagType_t, int, int, const double *, const double *, int, double *,
-      int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDtrsm_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb);
-}
-
-cublasStatus_t CUBLASWINAPI cublasCtrsm_v2(
-    cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo,
-    cublasOperation_t trans, cublasDiagType_t diag, int m, int n,
-    const cuComplex *alpha, /* host or device pointer */
-    const cuComplex *A, int lda, cuComplex *B, int ldb) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasSideMode_t, cublasFillMode_t, cublasOperation_t,
-      cublasDiagType_t, int, int, const cuComplex *, const cuComplex *, int,
-      cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCtrsm_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb);
-}
-
-cublasStatus_t CUBLASWINAPI cublasZtrsm_v2(
-    cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo,
-    cublasOperation_t trans, cublasDiagType_t diag, int m, int n,
-    const cuDoubleComplex *alpha, /* host or device pointer */
-    const cuDoubleComplex *A, int lda, cuDoubleComplex *B, int ldb) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasSideMode_t, cublasFillMode_t, cublasOperation_t,
-      cublasDiagType_t, int, int, const cuDoubleComplex *,
-      const cuDoubleComplex *, int, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZtrsm_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb);
-}
-
-cublasStatus_t CUBLASWINAPI cublasStrmm_v2(
-    cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo,
-    cublasOperation_t trans, cublasDiagType_t diag, int m, int n,
-    const float *alpha, /* host or device pointer */
-    const float *A, int lda, const float *B, int ldb, float *C, int ldc) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasSideMode_t, cublasFillMode_t, cublasOperation_t,
-      cublasDiagType_t, int, int, const float *, const float *, int,
-      const float *, int, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasStrmm_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb,
-                  C, ldc);
-}
-
-cublasStatus_t CUBLASWINAPI cublasDtrmm_v2(
-    cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo,
-    cublasOperation_t trans, cublasDiagType_t diag, int m, int n,
-    const double *alpha, /* host or device pointer */
-    const double *A, int lda, const double *B, int ldb, double *C, int ldc) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasSideMode_t, cublasFillMode_t, cublasOperation_t,
-      cublasDiagType_t, int, int, const double *, const double *, int,
-      const double *, int, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDtrmm_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb,
-                  C, ldc);
-}
-
-cublasStatus_t CUBLASWINAPI cublasCtrmm_v2(
-    cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo,
-    cublasOperation_t trans, cublasDiagType_t diag, int m, int n,
-    const cuComplex *alpha, /* host or device pointer */
-    const cuComplex *A, int lda, const cuComplex *B, int ldb, cuComplex *C,
-    int ldc) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasSideMode_t, cublasFillMode_t, cublasOperation_t,
-      cublasDiagType_t, int, int, const cuComplex *, const cuComplex *, int,
-      const cuComplex *, int, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCtrmm_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb,
-                  C, ldc);
-}
-
-cublasStatus_t CUBLASWINAPI cublasZtrmm_v2(
-    cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo,
-    cublasOperation_t trans, cublasDiagType_t diag, int m, int n,
-    const cuDoubleComplex *alpha, /* host or device pointer */
-    const cuDoubleComplex *A, int lda, const cuDoubleComplex *B, int ldb,
-    cuDoubleComplex *C, int ldc) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasSideMode_t, cublasFillMode_t, cublasOperation_t,
-      cublasDiagType_t, int, int, const cuDoubleComplex *,
-      const cuDoubleComplex *, int, const cuDoubleComplex *, int,
-      cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZtrmm_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb,
-                  C, ldc);
-}
-
-cublasStatus_t CUBLASWINAPI cublasSgemmBatched(
-    cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
-    int m, int n, int k, const float *alpha, /* host or device pointer */
-    const float *const Aarray[], int lda, const float *const Barray[], int ldb,
-    const float *beta, /* host or device pointer */
-    float *const Carray[], int ldc, int batchCount) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int,
-      const float *, const float *const[], int, const float *const[], int,
-      const float *, float *const[], int, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSgemmBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transa, transb, m, n, k, alpha, Aarray, lda, Barray,
-                  ldb, beta, Carray, ldc, batchCount);
-}
-
-cublasStatus_t CUBLASWINAPI cublasDgemmBatched(
-    cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
-    int m, int n, int k, const double *alpha, /* host or device pointer */
-    const double *const Aarray[], int lda, const double *const Barray[],
-    int ldb, const double *beta, /* host or device pointer */
-    double *const Carray[], int ldc, int batchCount) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int,
-      const double *, const double *const[], int, const double *const[], int,
-      const double *, double *const[], int, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDgemmBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transa, transb, m, n, k, alpha, Aarray, lda, Barray,
-                  ldb, beta, Carray, ldc, batchCount);
-}
-
-cublasStatus_t CUBLASWINAPI cublasCgemmBatched(
-    cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
-    int m, int n, int k, const cuComplex *alpha, /* host or device pointer */
-    const cuComplex *const Aarray[], int lda, const cuComplex *const Barray[],
-    int ldb, const cuComplex *beta, /* host or device pointer */
-    cuComplex *const Carray[], int ldc, int batchCount) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int,
-      const cuComplex *, const cuComplex *const[], int,
-      const cuComplex *const[], int, const cuComplex *, cuComplex *const[], int,
-      int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCgemmBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transa, transb, m, n, k, alpha, Aarray, lda, Barray,
-                  ldb, beta, Carray, ldc, batchCount);
-}
-
-cublasStatus_t CUBLASWINAPI cublasCgemm3mBatched(
-    cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
-    int m, int n, int k, const cuComplex *alpha, /* host or device pointer */
-    const cuComplex *const Aarray[], int lda, const cuComplex *const Barray[],
-    int ldb, const cuComplex *beta, /* host or device pointer */
-    cuComplex *const Carray[], int ldc, int batchCount) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int,
-      const cuComplex *, const cuComplex *const[], int,
-      const cuComplex *const[], int, const cuComplex *, cuComplex *const[], int,
-      int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCgemm3mBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transa, transb, m, n, k, alpha, Aarray, lda, Barray,
-                  ldb, beta, Carray, ldc, batchCount);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasZgemmBatched(cublasHandle_t handle, cublasOperation_t transa,
-                   cublasOperation_t transb, int m, int n, int k,
-                   const cuDoubleComplex *alpha, /* host or device pointer */
-                   const cuDoubleComplex *const Aarray[], int lda,
-                   const cuDoubleComplex *const Barray[], int ldb,
-                   const cuDoubleComplex *beta, /* host or device pointer */
-                   cuDoubleComplex *const Carray[], int ldc, int batchCount) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int,
-      const cuDoubleComplex *, const cuDoubleComplex *const[], int,
-      const cuDoubleComplex *const[], int, const cuDoubleComplex *,
-      cuDoubleComplex *const[], int, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZgemmBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transa, transb, m, n, k, alpha, Aarray, lda, Barray,
-                  ldb, beta, Carray, ldc, batchCount);
-}
-
-cublasStatus_t CUBLASWINAPI cublasGemmBatchedEx(
-    cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
-    int m, int n, int k, const void *alpha, /* host or device pointer */
-    const void *const Aarray[], cudaDataType Atype, int lda,
-    const void *const Barray[], cudaDataType Btype, int ldb,
-    const void *beta, /* host or device pointer */
-    void *const Carray[], cudaDataType Ctype, int ldc, int batchCount,
-    cublasComputeType_t computeType, cublasGemmAlgo_t algo) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int,
-      const void *, const void *const[], cudaDataType, int, const void *const[],
-      cudaDataType, int, const void *, void *const[], cudaDataType, int, int,
-      cublasComputeType_t, cublasGemmAlgo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasGemmBatchedEx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transa, transb, m, n, k, alpha, Aarray, Atype, lda,
-                  Barray, Btype, ldb, beta, Carray, Ctype, ldc, batchCount,
-                  computeType, algo);
-}
-
-cublasStatus_t CUBLASWINAPI cublasGemmStridedBatchedEx(
-    cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
-    int m, int n, int k, const void *alpha, /* host or device pointer */
-    const void *A, cudaDataType Atype, int lda,
-    long long int strideA, /* purposely signed */
-    const void *B, cudaDataType Btype, int ldb, long long int strideB,
-    const void *beta, /* host or device pointer */
-    void *C, cudaDataType Ctype, int ldc, long long int strideC, int batchCount,
-    cublasComputeType_t computeType, cublasGemmAlgo_t algo) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int,
-      const void *, const void *, cudaDataType, int, long long, const void *,
-      cudaDataType, int, long long, const void *, void *, cudaDataType, int,
-      long long, int, cublasComputeType_t, cublasGemmAlgo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasGemmStridedBatchedEx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transa, transb, m, n, k, alpha, A, Atype, lda,
-                  strideA, B, Btype, ldb, strideB, beta, C, Ctype, ldc, strideC,
-                  batchCount, computeType, algo);
-}
-
-cublasStatus_t CUBLASWINAPI cublasSgemmStridedBatched(
-    cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
-    int m, int n, int k, const float *alpha,        /* host or device pointer */
-    const float *A, int lda, long long int strideA, /* purposely signed */
-    const float *B, int ldb, long long int strideB,
-    const float *beta, /* host or device pointer */
-    float *C, int ldc, long long int strideC, int batchCount) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int,
-      const float *, const float *, int, long long, const float *, int,
-      long long, const float *, float *, int, long long, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSgemmStridedBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transa, transb, m, n, k, alpha, A, lda, strideA, B,
-                  ldb, strideB, beta, C, ldc, strideC, batchCount);
-}
-
-cublasStatus_t CUBLASWINAPI cublasDgemmStridedBatched(
-    cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
-    int m, int n, int k, const double *alpha, /* host or device pointer */
-    const double *A, int lda, long long int strideA, /* purposely signed */
-    const double *B, int ldb, long long int strideB,
-    const double *beta, /* host or device pointer */
-    double *C, int ldc, long long int strideC, int batchCount) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int,
-      const double *, const double *, int, long long, const double *, int,
-      long long, const double *, double *, int, long long, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDgemmStridedBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transa, transb, m, n, k, alpha, A, lda, strideA, B,
-                  ldb, strideB, beta, C, ldc, strideC, batchCount);
-}
-
-cublasStatus_t CUBLASWINAPI cublasCgemmStridedBatched(
-    cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
-    int m, int n, int k, const cuComplex *alpha, /* host or device pointer */
-    const cuComplex *A, int lda, long long int strideA, /* purposely signed */
-    const cuComplex *B, int ldb, long long int strideB,
-    const cuComplex *beta, /* host or device pointer */
-    cuComplex *C, int ldc, long long int strideC, int batchCount) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int,
-      const cuComplex *, const cuComplex *, int, long long, const cuComplex *,
-      int, long long, const cuComplex *, cuComplex *, int, long long, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCgemmStridedBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transa, transb, m, n, k, alpha, A, lda, strideA, B,
-                  ldb, strideB, beta, C, ldc, strideC, batchCount);
-}
-
-cublasStatus_t CUBLASWINAPI cublasCgemm3mStridedBatched(
-    cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
-    int m, int n, int k, const cuComplex *alpha, /* host or device pointer */
-    const cuComplex *A, int lda, long long int strideA, /* purposely signed */
-    const cuComplex *B, int ldb, long long int strideB,
-    const cuComplex *beta, /* host or device pointer */
-    cuComplex *C, int ldc, long long int strideC, int batchCount) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int,
-      const cuComplex *, const cuComplex *, int, long long, const cuComplex *,
-      int, long long, const cuComplex *, cuComplex *, int, long long, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCgemm3mStridedBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transa, transb, m, n, k, alpha, A, lda, strideA, B,
-                  ldb, strideB, beta, C, ldc, strideC, batchCount);
-}
-
-cublasStatus_t CUBLASWINAPI cublasZgemmStridedBatched(
-    cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
-    int m, int n, int k,
-    const cuDoubleComplex *alpha, /* host or device pointer */
-    const cuDoubleComplex *A, int lda,
-    long long int strideA, /* purposely signed */
-    const cuDoubleComplex *B, int ldb, long long int strideB,
-    const cuDoubleComplex *beta, /* host or device poi */
-    cuDoubleComplex *C, int ldc, long long int strideC, int batchCount) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int,
-      const cuDoubleComplex *, const cuDoubleComplex *, int, long long,
-      const cuDoubleComplex *, int, long long, const cuDoubleComplex *,
-      cuDoubleComplex *, int, long long, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZgemmStridedBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transa, transb, m, n, k, alpha, A, lda, strideA, B,
-                  ldb, strideB, beta, C, ldc, strideC, batchCount);
-}
-
-cublasStatus_t CUBLASWINAPI cublasSgeam(
-    cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
-    int m, int n, const float *alpha,           /* host or device pointer */
-    const float *A, int lda, const float *beta, /* host or device pointer */
-    const float *B, int ldb, float *C, int ldc) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int,
-      const float *, const float *, int, const float *, const float *, int,
-      float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSgeam");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transa, transb, m, n, alpha, A, lda, beta, B, ldb, C,
-                  ldc);
-}
-
-cublasStatus_t CUBLASWINAPI cublasDgeam(
-    cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
-    int m, int n, const double *alpha,            /* host or device pointer */
-    const double *A, int lda, const double *beta, /* host or device pointer */
-    const double *B, int ldb, double *C, int ldc) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int,
-      const double *, const double *, int, const double *, const double *, int,
-      double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDgeam");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transa, transb, m, n, alpha, A, lda, beta, B, ldb, C,
-                  ldc);
-}
-
-cublasStatus_t CUBLASWINAPI cublasCgeam(
-    cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
-    int m, int n, const cuComplex *alpha, /* host or device pointer */
-    const cuComplex *A, int lda,
-    const cuComplex *beta, /* host or device pointer */
-    const cuComplex *B, int ldb, cuComplex *C, int ldc) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int,
-      const cuComplex *, const cuComplex *, int, const cuComplex *,
-      const cuComplex *, int, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCgeam");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transa, transb, m, n, alpha, A, lda, beta, B, ldb, C,
-                  ldc);
-}
-
-cublasStatus_t CUBLASWINAPI cublasZgeam(
-    cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
-    int m, int n, const cuDoubleComplex *alpha, /* host or device pointer */
-    const cuDoubleComplex *A, int lda,
-    const cuDoubleComplex *beta, /* host or device pointer */
-    const cuDoubleComplex *B, int ldb, cuDoubleComplex *C, int ldc) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int,
-      const cuDoubleComplex *, const cuDoubleComplex *, int,
-      const cuDoubleComplex *, const cuDoubleComplex *, int, cuDoubleComplex *,
-      int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZgeam");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transa, transb, m, n, alpha, A, lda, beta, B, ldb, C,
-                  ldc);
-}
-
-cublasStatus_t CUBLASWINAPI cublasSgetrfBatched(
-    cublasHandle_t handle, int n, float *const A[], /*Device pointer*/
-    int lda, int *P,                                /*Device Pointer*/
-    int *info,                                      /*Device Pointer*/
-    int batchSize) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, float *const[], int, int *, int *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSgetrfBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, A, lda, P, info, batchSize);
-}
-
-cublasStatus_t CUBLASWINAPI cublasDgetrfBatched(
-    cublasHandle_t handle, int n, double *const A[], /*Device pointer*/
-    int lda, int *P,                                 /*Device Pointer*/
-    int *info,                                       /*Device Pointer*/
-    int batchSize) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, double *const[], int, int *, int *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDgetrfBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, A, lda, P, info, batchSize);
-}
-
-cublasStatus_t CUBLASWINAPI cublasCgetrfBatched(
-    cublasHandle_t handle, int n, cuComplex *const A[], /*Device pointer*/
-    int lda, int *P,                                    /*Device Pointer*/
-    int *info,                                          /*Device Pointer*/
-    int batchSize) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, cuComplex *const[], int, int *, int *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCgetrfBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, A, lda, P, info, batchSize);
-}
-
-cublasStatus_t CUBLASWINAPI cublasZgetrfBatched(
-    cublasHandle_t handle, int n, cuDoubleComplex *const A[], /*Device pointer*/
-    int lda, int *P,                                          /*Device Pointer*/
-    int *info,                                                /*Device Pointer*/
-    int batchSize) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, cuDoubleComplex *const[], int, int *, int *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZgetrfBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, A, lda, P, info, batchSize);
-}
-
-cublasStatus_t CUBLASWINAPI cublasSgetriBatched(
-    cublasHandle_t handle, int n, const float *const A[], /*Device pointer*/
-    int lda, const int *P,                                /*Device pointer*/
-    float *const C[],                                     /*Device pointer*/
-    int ldc, int *info, int batchSize) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, const float *const[], int, const int *,
-      float *const[], int, int *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSgetriBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, A, lda, P, C, ldc, info, batchSize);
-}
-
-cublasStatus_t CUBLASWINAPI cublasDgetriBatched(
-    cublasHandle_t handle, int n, const double *const A[], /*Device pointer*/
-    int lda, const int *P,                                 /*Device pointer*/
-    double *const C[],                                     /*Device pointer*/
-    int ldc, int *info, int batchSize) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, const double *const[], int, const int *,
-      double *const[], int, int *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDgetriBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, A, lda, P, C, ldc, info, batchSize);
-}
-
-cublasStatus_t CUBLASWINAPI cublasCgetriBatched(
-    cublasHandle_t handle, int n, const cuComplex *const A[], /*Device pointer*/
-    int lda, const int *P,                                    /*Device pointer*/
-    cuComplex *const C[],                                     /*Device pointer*/
-    int ldc, int *info, int batchSize) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, const cuComplex *const[], int, const int *,
-      cuComplex *const[], int, int *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCgetriBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, A, lda, P, C, ldc, info, batchSize);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasZgetriBatched(cublasHandle_t handle, int n,
-                    const cuDoubleComplex *const A[], /*Device pointer*/
-                    int lda, const int *P,            /*Device pointer*/
-                    cuDoubleComplex *const C[],       /*Device pointer*/
-                    int ldc, int *info, int batchSize) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, const cuDoubleComplex *const[], int, const int *,
-      cuDoubleComplex *const[], int, int *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZgetriBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, A, lda, P, C, ldc, info, batchSize);
-}
-
-cublasStatus_t CUBLASWINAPI cublasSgetrsBatched(
-    cublasHandle_t handle, cublasOperation_t trans, int n, int nrhs,
-    const float *const Aarray[], int lda, const int *devIpiv,
-    float *const Barray[], int ldb, int *info, int batchSize) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasOperation_t, int, int, const float *const[], int,
-      const int *, float *const[], int, int *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSgetrsBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, trans, n, nrhs, Aarray, lda, devIpiv, Barray, ldb,
-                  info, batchSize);
-}
-
-cublasStatus_t CUBLASWINAPI cublasDgetrsBatched(
-    cublasHandle_t handle, cublasOperation_t trans, int n, int nrhs,
-    const double *const Aarray[], int lda, const int *devIpiv,
-    double *const Barray[], int ldb, int *info, int batchSize) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasOperation_t, int, int, const double *const[], int,
-      const int *, double *const[], int, int *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDgetrsBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, trans, n, nrhs, Aarray, lda, devIpiv, Barray, ldb,
-                  info, batchSize);
-}
-
-cublasStatus_t CUBLASWINAPI cublasCgetrsBatched(
-    cublasHandle_t handle, cublasOperation_t trans, int n, int nrhs,
-    const cuComplex *const Aarray[], int lda, const int *devIpiv,
-    cuComplex *const Barray[], int ldb, int *info, int batchSize) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasOperation_t, int, int, const cuComplex *const[],
-      int, const int *, cuComplex *const[], int, int *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCgetrsBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, trans, n, nrhs, Aarray, lda, devIpiv, Barray, ldb,
-                  info, batchSize);
-}
-
-cublasStatus_t CUBLASWINAPI cublasZgetrsBatched(
-    cublasHandle_t handle, cublasOperation_t trans, int n, int nrhs,
-    const cuDoubleComplex *const Aarray[], int lda, const int *devIpiv,
-    cuDoubleComplex *const Barray[], int ldb, int *info, int batchSize) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasOperation_t, int, int,
-      const cuDoubleComplex *const[], int, const int *,
-      cuDoubleComplex *const[], int, int *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZgetrsBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, trans, n, nrhs, Aarray, lda, devIpiv, Barray, ldb,
-                  info, batchSize);
-}
-
-cublasStatus_t CUBLASWINAPI cublasStrsmBatched(
-    cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo,
-    cublasOperation_t trans, cublasDiagType_t diag, int m, int n,
-    const float *alpha, /*Host or Device Pointer*/
-    const float *const A[], int lda, float *const B[], int ldb,
-    int batchCount) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasSideMode_t, cublasFillMode_t, cublasOperation_t,
-      cublasDiagType_t, int, int, const float *, const float *const[], int,
-      float *const[], int, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasStrsmBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb,
-                  batchCount);
-}
-
-cublasStatus_t CUBLASWINAPI cublasDtrsmBatched(
-    cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo,
-    cublasOperation_t trans, cublasDiagType_t diag, int m, int n,
-    const double *alpha, /*Host or Device Pointer*/
-    const double *const A[], int lda, double *const B[], int ldb,
-    int batchCount) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasSideMode_t, cublasFillMode_t, cublasOperation_t,
-      cublasDiagType_t, int, int, const double *, const double *const[], int,
-      double *const[], int, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDtrsmBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb,
-                  batchCount);
-}
-
-cublasStatus_t CUBLASWINAPI cublasCtrsmBatched(
-    cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo,
-    cublasOperation_t trans, cublasDiagType_t diag, int m, int n,
-    const cuComplex *alpha, /*Host or Device Pointer*/
-    const cuComplex *const A[], int lda, cuComplex *const B[], int ldb,
-    int batchCount) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasSideMode_t, cublasFillMode_t, cublasOperation_t,
-      cublasDiagType_t, int, int, const cuComplex *, const cuComplex *const[],
-      int, cuComplex *const[], int, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCtrsmBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb,
-                  batchCount);
-}
-
-cublasStatus_t CUBLASWINAPI cublasZtrsmBatched(
-    cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo,
-    cublasOperation_t trans, cublasDiagType_t diag, int m, int n,
-    const cuDoubleComplex *alpha, /*Host or Device Pointer*/
-    const cuDoubleComplex *const A[], int lda, cuDoubleComplex *const B[],
-    int ldb, int batchCount) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasSideMode_t, cublasFillMode_t, cublasOperation_t,
-      cublasDiagType_t, int, int, const cuDoubleComplex *,
-      const cuDoubleComplex *const[], int, cuDoubleComplex *const[], int, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZtrsmBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb,
-                  batchCount);
-}
-
-cublasStatus_t CUBLASWINAPI cublasSmatinvBatched(
-    cublasHandle_t handle, int n, const float *const A[], /*Device pointer*/
-    int lda, float *const Ainv[],                         /*Device pointer*/
-    int lda_inv, int *info,                               /*Device Pointer*/
-    int batchSize) {
-  using FuncPtr =
-      cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, int, const float *const[],
-                                     int, float *const[], int, int *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSmatinvBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, A, lda, Ainv, lda_inv, info, batchSize);
-}
-
-cublasStatus_t CUBLASWINAPI cublasDmatinvBatched(
-    cublasHandle_t handle, int n, const double *const A[], /*Device pointer*/
-    int lda, double *const Ainv[],                         /*Device pointer*/
-    int lda_inv, int *info,                                /*Device Pointer*/
-    int batchSize) {
-  using FuncPtr =
-      cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, int, const double *const[],
-                                     int, double *const[], int, int *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDmatinvBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, A, lda, Ainv, lda_inv, info, batchSize);
-}
-
-cublasStatus_t CUBLASWINAPI cublasCmatinvBatched(
-    cublasHandle_t handle, int n, const cuComplex *const A[], /*Device pointer*/
-    int lda, cuComplex *const Ainv[],                         /*Device pointer*/
-    int lda_inv, int *info,                                   /*Device Pointer*/
-    int batchSize) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, const cuComplex *const[], int, cuComplex *const[],
-      int, int *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCmatinvBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, A, lda, Ainv, lda_inv, info, batchSize);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasZmatinvBatched(cublasHandle_t handle, int n,
-                     const cuDoubleComplex *const A[],       /*Device pointer*/
-                     int lda, cuDoubleComplex *const Ainv[], /*Device pointer*/
-                     int lda_inv, int *info,                 /*Device Pointer*/
-                     int batchSize) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, const cuDoubleComplex *const[], int,
-      cuDoubleComplex *const[], int, int *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZmatinvBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, A, lda, Ainv, lda_inv, info, batchSize);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasSgeqrfBatched(cublasHandle_t handle, int m, int n,
-                    float *const Aarray[],            /*Device pointer*/
-                    int lda, float *const TauArray[], /*Device pointer*/
-                    int *info, int batchSize) {
-  using FuncPtr =
-      cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, int, int, float *const[],
-                                     int, float *const[], int *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSgeqrfBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, Aarray, lda, TauArray, info, batchSize);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasDgeqrfBatched(cublasHandle_t handle, int m, int n,
-                    double *const Aarray[],            /*Device pointer*/
-                    int lda, double *const TauArray[], /*Device pointer*/
-                    int *info, int batchSize) {
-  using FuncPtr =
-      cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, int, int, double *const[],
-                                     int, double *const[], int *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDgeqrfBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, Aarray, lda, TauArray, info, batchSize);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasCgeqrfBatched(cublasHandle_t handle, int m, int n,
-                    cuComplex *const Aarray[],            /*Device pointer*/
-                    int lda, cuComplex *const TauArray[], /*Device pointer*/
-                    int *info, int batchSize) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, int, cuComplex *const[], int, cuComplex *const[],
-      int *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCgeqrfBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, Aarray, lda, TauArray, info, batchSize);
-}
-
-cublasStatus_t CUBLASWINAPI cublasZgeqrfBatched(
-    cublasHandle_t handle, int m, int n,
-    cuDoubleComplex *const Aarray[],            /*Device pointer*/
-    int lda, cuDoubleComplex *const TauArray[], /*Device pointer*/
-    int *info, int batchSize) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, int, int, cuDoubleComplex *const[], int,
-      cuDoubleComplex *const[], int *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZgeqrfBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, Aarray, lda, TauArray, info, batchSize);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasSgelsBatched(cublasHandle_t handle, cublasOperation_t trans, int m, int n,
-                   int nrhs, float *const Aarray[],       /*Device pointer*/
-                   int lda, float *const Carray[],        /*Device pointer*/
-                   int ldc, int *info, int *devInfoArray, /*Device pointer*/
-                   int batchSize) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasOperation_t, int, int, int, float *const[], int,
-      float *const[], int, int *, int *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSgelsBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, trans, m, n, nrhs, Aarray, lda, Carray, ldc, info,
-                  devInfoArray, batchSize);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasDgelsBatched(cublasHandle_t handle, cublasOperation_t trans, int m, int n,
-                   int nrhs, double *const Aarray[],      /*Device pointer*/
-                   int lda, double *const Carray[],       /*Device pointer*/
-                   int ldc, int *info, int *devInfoArray, /*Device pointer*/
-                   int batchSize) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasOperation_t, int, int, int, double *const[], int,
-      double *const[], int, int *, int *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDgelsBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, trans, m, n, nrhs, Aarray, lda, Carray, ldc, info,
-                  devInfoArray, batchSize);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasCgelsBatched(cublasHandle_t handle, cublasOperation_t trans, int m, int n,
-                   int nrhs, cuComplex *const Aarray[], /*Device pointer*/
-                   int lda, cuComplex *const Carray[],  /*Device pointer*/
-                   int ldc, int *info, int *devInfoArray, int batchSize) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasOperation_t, int, int, int, cuComplex *const[], int,
-      cuComplex *const[], int, int *, int *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCgelsBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, trans, m, n, nrhs, Aarray, lda, Carray, ldc, info,
-                  devInfoArray, batchSize);
-}
-
-cublasStatus_t CUBLASWINAPI
-cublasZgelsBatched(cublasHandle_t handle, cublasOperation_t trans, int m, int n,
-                   int nrhs, cuDoubleComplex *const Aarray[], /*Device pointer*/
-                   int lda, cuDoubleComplex *const Carray[],  /*Device pointer*/
-                   int ldc, int *info, int *devInfoArray, int batchSize) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasOperation_t, int, int, int,
-      cuDoubleComplex *const[], int, cuDoubleComplex *const[], int, int *,
-      int *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZgelsBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, trans, m, n, nrhs, Aarray, lda, Carray, ldc, info,
-                  devInfoArray, batchSize);
-}
-
-cublasStatus_t CUBLASWINAPI cublasSdgmm(cublasHandle_t handle,
-                                        cublasSideMode_t mode, int m, int n,
-                                        const float *A, int lda, const float *x,
-                                        int incx, float *C, int ldc) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasSideMode_t, int, int, const float *, int,
-      const float *, int, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSdgmm");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, mode, m, n, A, lda, x, incx, C, ldc);
-}
-
-cublasStatus_t CUBLASWINAPI cublasDdgmm(cublasHandle_t handle,
-                                        cublasSideMode_t mode, int m, int n,
-                                        const double *A, int lda,
-                                        const double *x, int incx, double *C,
-                                        int ldc) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasSideMode_t, int, int, const double *, int,
-      const double *, int, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDdgmm");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, mode, m, n, A, lda, x, incx, C, ldc);
-}
-
-cublasStatus_t CUBLASWINAPI cublasCdgmm(cublasHandle_t handle,
-                                        cublasSideMode_t mode, int m, int n,
-                                        const cuComplex *A, int lda,
-                                        const cuComplex *x, int incx,
-                                        cuComplex *C, int ldc) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasSideMode_t, int, int, const cuComplex *, int,
-      const cuComplex *, int, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCdgmm");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, mode, m, n, A, lda, x, incx, C, ldc);
-}
-
-cublasStatus_t CUBLASWINAPI cublasZdgmm(cublasHandle_t handle,
-                                        cublasSideMode_t mode, int m, int n,
-                                        const cuDoubleComplex *A, int lda,
-                                        const cuDoubleComplex *x, int incx,
-                                        cuDoubleComplex *C, int ldc) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasSideMode_t, int, int, const cuDoubleComplex *, int,
-      const cuDoubleComplex *, int, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZdgmm");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, mode, m, n, A, lda, x, incx, C, ldc);
-}
-
-cublasStatus_t CUBLASWINAPI cublasStpttr(cublasHandle_t handle,
-                                         cublasFillMode_t uplo, int n,
-                                         const float *AP, float *A, int lda) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, int, const float *, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasStpttr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, AP, A, lda);
-}
-
-cublasStatus_t CUBLASWINAPI cublasDtpttr(cublasHandle_t handle,
-                                         cublasFillMode_t uplo, int n,
-                                         const double *AP, double *A, int lda) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, int, const double *, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDtpttr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, AP, A, lda);
-}
-
-cublasStatus_t CUBLASWINAPI cublasCtpttr(cublasHandle_t handle,
-                                         cublasFillMode_t uplo, int n,
-                                         const cuComplex *AP, cuComplex *A,
-                                         int lda) {
-  using FuncPtr =
-      cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int,
-                                     const cuComplex *, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCtpttr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, AP, A, lda);
-}
-
-cublasStatus_t CUBLASWINAPI cublasZtpttr(cublasHandle_t handle,
-                                         cublasFillMode_t uplo, int n,
-                                         const cuDoubleComplex *AP,
-                                         cuDoubleComplex *A, int lda) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, int, const cuDoubleComplex *,
-      cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZtpttr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, AP, A, lda);
-}
-
-cublasStatus_t CUBLASWINAPI cublasStrttp(cublasHandle_t handle,
-                                         cublasFillMode_t uplo, int n,
-                                         const float *A, int lda, float *AP) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, int, const float *, int, float *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasStrttp");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, A, lda, AP);
-}
-
-cublasStatus_t CUBLASWINAPI cublasDtrttp(cublasHandle_t handle,
-                                         cublasFillMode_t uplo, int n,
-                                         const double *A, int lda, double *AP) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, int, const double *, int, double *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDtrttp");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, A, lda, AP);
-}
-
-cublasStatus_t CUBLASWINAPI cublasCtrttp(cublasHandle_t handle,
-                                         cublasFillMode_t uplo, int n,
-                                         const cuComplex *A, int lda,
-                                         cuComplex *AP) {
-  using FuncPtr =
-      cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int,
-                                     const cuComplex *, int, cuComplex *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCtrttp");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, A, lda, AP);
-}
-
-cublasStatus_t CUBLASWINAPI cublasZtrttp(cublasHandle_t handle,
-                                         cublasFillMode_t uplo, int n,
-                                         const cuDoubleComplex *A, int lda,
-                                         cuDoubleComplex *AP) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
-      cublasHandle_t, cublasFillMode_t, int, const cuDoubleComplex *, int,
-      cuDoubleComplex *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZtrttp");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, A, lda, AP);
-}
-
-cublasStatus CUBLASWINAPI cublasInit(void) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)();
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasInit");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr();
-}
-
-cublasStatus CUBLASWINAPI cublasShutdown(void) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)();
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasShutdown");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr();
-}
-
-cublasStatus CUBLASWINAPI cublasGetError(void) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)();
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasGetError");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr();
-}
-
-cublasStatus CUBLASWINAPI cublasGetVersion(int *version) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasGetVersion");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(version);
-}
-
-cublasStatus CUBLASWINAPI cublasAlloc(int n, int elemSize, void **devicePtr) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(int, int, void **);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasAlloc");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(n, elemSize, devicePtr);
-}
-
-cublasStatus CUBLASWINAPI cublasFree(void *devicePtr) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasFree");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(devicePtr);
-}
-
-cublasStatus CUBLASWINAPI cublasSetKernelStream(cudaStream_t stream) {
-  using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSetKernelStream");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(stream);
-}
-
-float CUBLASWINAPI cublasSnrm2(int n, const float *x, int incx) {
-  using FuncPtr = float(CUBLASWINAPI *)(int, const float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSnrm2");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasSnrm2");
-  return func_ptr(n, x, incx);
-}
-
-double CUBLASWINAPI cublasDnrm2(int n, const double *x, int incx) {
-  using FuncPtr = double(CUBLASWINAPI *)(int, const double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDnrm2");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasDnrm2");
-  return func_ptr(n, x, incx);
-}
-
-float CUBLASWINAPI cublasScnrm2(int n, const cuComplex *x, int incx) {
-  using FuncPtr = float(CUBLASWINAPI *)(int, const cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasScnrm2");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasScnrm2");
-  return func_ptr(n, x, incx);
-}
-
-double CUBLASWINAPI cublasDznrm2(int n, const cuDoubleComplex *x, int incx) {
-  using FuncPtr = double(CUBLASWINAPI *)(int, const cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDznrm2");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasDznrm2");
-  return func_ptr(n, x, incx);
-}
-
-float CUBLASWINAPI cublasSdot(int n, const float *x, int incx, const float *y,
-                              int incy) {
-  using FuncPtr =
-      float(CUBLASWINAPI *)(int, const float *, int, const float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSdot");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasSdot");
-  return func_ptr(n, x, incx, y, incy);
-}
-
-double CUBLASWINAPI cublasDdot(int n, const double *x, int incx,
-                               const double *y, int incy) {
-  using FuncPtr =
-      double(CUBLASWINAPI *)(int, const double *, int, const double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDdot");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasDdot");
-  return func_ptr(n, x, incx, y, incy);
-}
-
-cuComplex CUBLASWINAPI cublasCdotu(int n, const cuComplex *x, int incx,
-                                   const cuComplex *y, int incy) {
-  using FuncPtr = cuComplex(CUBLASWINAPI *)(int, const cuComplex *, int,
-                                            const cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCdotu");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasCdotu");
-  return func_ptr(n, x, incx, y, incy);
-}
-
-cuComplex CUBLASWINAPI cublasCdotc(int n, const cuComplex *x, int incx,
-                                   const cuComplex *y, int incy) {
-  using FuncPtr = cuComplex(CUBLASWINAPI *)(int, const cuComplex *, int,
-                                            const cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCdotc");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasCdotc");
-  return func_ptr(n, x, incx, y, incy);
-}
-
-cuDoubleComplex CUBLASWINAPI cublasZdotu(int n, const cuDoubleComplex *x,
-                                         int incx, const cuDoubleComplex *y,
-                                         int incy) {
-  using FuncPtr = cuDoubleComplex(CUBLASWINAPI *)(
-      int, const cuDoubleComplex *, int, const cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZdotu");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasZdotu");
-  return func_ptr(n, x, incx, y, incy);
-}
-
-cuDoubleComplex CUBLASWINAPI cublasZdotc(int n, const cuDoubleComplex *x,
-                                         int incx, const cuDoubleComplex *y,
-                                         int incy) {
-  using FuncPtr = cuDoubleComplex(CUBLASWINAPI *)(
-      int, const cuDoubleComplex *, int, const cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZdotc");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasZdotc");
-  return func_ptr(n, x, incx, y, incy);
-}
-
-void CUBLASWINAPI cublasSscal(int n, float alpha, float *x, int incx) {
-  using FuncPtr = void(CUBLASWINAPI *)(int, float, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSscal");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasSscal");
-  return func_ptr(n, alpha, x, incx);
-}
-
-void CUBLASWINAPI cublasDscal(int n, double alpha, double *x, int incx) {
-  using FuncPtr = void(CUBLASWINAPI *)(int, double, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDscal");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasDscal");
-  return func_ptr(n, alpha, x, incx);
-}
-
-void CUBLASWINAPI cublasCscal(int n, cuComplex alpha, cuComplex *x, int incx) {
-  using FuncPtr = void(CUBLASWINAPI *)(int, cuComplex, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCscal");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasCscal");
-  return func_ptr(n, alpha, x, incx);
-}
-
-void CUBLASWINAPI cublasZscal(int n, cuDoubleComplex alpha, cuDoubleComplex *x,
-                              int incx) {
-  using FuncPtr =
-      void(CUBLASWINAPI *)(int, cuDoubleComplex, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZscal");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasZscal");
-  return func_ptr(n, alpha, x, incx);
-}
-
-void CUBLASWINAPI cublasCsscal(int n, float alpha, cuComplex *x, int incx) {
-  using FuncPtr = void(CUBLASWINAPI *)(int, float, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCsscal");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasCsscal");
-  return func_ptr(n, alpha, x, incx);
-}
-
-void CUBLASWINAPI cublasZdscal(int n, double alpha, cuDoubleComplex *x,
-                               int incx) {
-  using FuncPtr = void(CUBLASWINAPI *)(int, double, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZdscal");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasZdscal");
-  return func_ptr(n, alpha, x, incx);
-}
-
-void CUBLASWINAPI cublasSaxpy(int n, float alpha, const float *x, int incx,
-                              float *y, int incy) {
-  using FuncPtr =
-      void(CUBLASWINAPI *)(int, float, const float *, int, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSaxpy");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasSaxpy");
-  return func_ptr(n, alpha, x, incx, y, incy);
-}
-
-void CUBLASWINAPI cublasDaxpy(int n, double alpha, const double *x, int incx,
-                              double *y, int incy) {
-  using FuncPtr =
-      void(CUBLASWINAPI *)(int, double, const double *, int, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDaxpy");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasDaxpy");
-  return func_ptr(n, alpha, x, incx, y, incy);
-}
-
-void CUBLASWINAPI cublasCaxpy(int n, cuComplex alpha, const cuComplex *x,
-                              int incx, cuComplex *y, int incy) {
-  using FuncPtr = void(CUBLASWINAPI *)(int, cuComplex, const cuComplex *, int,
-                                       cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCaxpy");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasCaxpy");
-  return func_ptr(n, alpha, x, incx, y, incy);
-}
-
-void CUBLASWINAPI cublasZaxpy(int n, cuDoubleComplex alpha,
-                              const cuDoubleComplex *x, int incx,
-                              cuDoubleComplex *y, int incy) {
-  using FuncPtr =
-      void(CUBLASWINAPI *)(int, cuDoubleComplex, const cuDoubleComplex *, int,
-                           cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZaxpy");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasZaxpy");
-  return func_ptr(n, alpha, x, incx, y, incy);
-}
-
-void CUBLASWINAPI cublasScopy(int n, const float *x, int incx, float *y,
-                              int incy) {
-  using FuncPtr = void(CUBLASWINAPI *)(int, const float *, int, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasScopy");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasScopy");
-  return func_ptr(n, x, incx, y, incy);
-}
-
-void CUBLASWINAPI cublasDcopy(int n, const double *x, int incx, double *y,
-                              int incy) {
-  using FuncPtr = void(CUBLASWINAPI *)(int, const double *, int, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDcopy");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasDcopy");
-  return func_ptr(n, x, incx, y, incy);
-}
-
-void CUBLASWINAPI cublasCcopy(int n, const cuComplex *x, int incx, cuComplex *y,
-                              int incy) {
-  using FuncPtr =
-      void(CUBLASWINAPI *)(int, const cuComplex *, int, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCcopy");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasCcopy");
-  return func_ptr(n, x, incx, y, incy);
-}
-
-void CUBLASWINAPI cublasZcopy(int n, const cuDoubleComplex *x, int incx,
-                              cuDoubleComplex *y, int incy) {
-  using FuncPtr = void(CUBLASWINAPI *)(int, const cuDoubleComplex *, int,
-                                       cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZcopy");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasZcopy");
-  return func_ptr(n, x, incx, y, incy);
-}
-
-void CUBLASWINAPI cublasSswap(int n, float *x, int incx, float *y, int incy) {
-  using FuncPtr = void(CUBLASWINAPI *)(int, float *, int, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSswap");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasSswap");
-  return func_ptr(n, x, incx, y, incy);
-}
-
-void CUBLASWINAPI cublasDswap(int n, double *x, int incx, double *y, int incy) {
-  using FuncPtr = void(CUBLASWINAPI *)(int, double *, int, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDswap");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasDswap");
-  return func_ptr(n, x, incx, y, incy);
-}
-
-void CUBLASWINAPI cublasCswap(int n, cuComplex *x, int incx, cuComplex *y,
-                              int incy) {
-  using FuncPtr = void(CUBLASWINAPI *)(int, cuComplex *, int, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCswap");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasCswap");
-  return func_ptr(n, x, incx, y, incy);
-}
-
-void CUBLASWINAPI cublasZswap(int n, cuDoubleComplex *x, int incx,
-                              cuDoubleComplex *y, int incy) {
-  using FuncPtr =
-      void(CUBLASWINAPI *)(int, cuDoubleComplex *, int, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZswap");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasZswap");
-  return func_ptr(n, x, incx, y, incy);
-}
-
-int CUBLASWINAPI cublasIsamax(int n, const float *x, int incx) {
-  using FuncPtr = int(CUBLASWINAPI *)(int, const float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasIsamax");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasIsamax");
-  return func_ptr(n, x, incx);
-}
-
-int CUBLASWINAPI cublasIdamax(int n, const double *x, int incx) {
-  using FuncPtr = int(CUBLASWINAPI *)(int, const double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasIdamax");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasIdamax");
-  return func_ptr(n, x, incx);
-}
-
-int CUBLASWINAPI cublasIcamax(int n, const cuComplex *x, int incx) {
-  using FuncPtr = int(CUBLASWINAPI *)(int, const cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasIcamax");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasIcamax");
-  return func_ptr(n, x, incx);
-}
-
-int CUBLASWINAPI cublasIzamax(int n, const cuDoubleComplex *x, int incx) {
-  using FuncPtr = int(CUBLASWINAPI *)(int, const cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasIzamax");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasIzamax");
-  return func_ptr(n, x, incx);
-}
-
-int CUBLASWINAPI cublasIsamin(int n, const float *x, int incx) {
-  using FuncPtr = int(CUBLASWINAPI *)(int, const float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasIsamin");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasIsamin");
-  return func_ptr(n, x, incx);
-}
-
-int CUBLASWINAPI cublasIdamin(int n, const double *x, int incx) {
-  using FuncPtr = int(CUBLASWINAPI *)(int, const double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasIdamin");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasIdamin");
-  return func_ptr(n, x, incx);
-}
-
-int CUBLASWINAPI cublasIcamin(int n, const cuComplex *x, int incx) {
-  using FuncPtr = int(CUBLASWINAPI *)(int, const cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasIcamin");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasIcamin");
-  return func_ptr(n, x, incx);
-}
-
-int CUBLASWINAPI cublasIzamin(int n, const cuDoubleComplex *x, int incx) {
-  using FuncPtr = int(CUBLASWINAPI *)(int, const cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasIzamin");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasIzamin");
-  return func_ptr(n, x, incx);
-}
-
-float CUBLASWINAPI cublasSasum(int n, const float *x, int incx) {
-  using FuncPtr = float(CUBLASWINAPI *)(int, const float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSasum");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasSasum");
-  return func_ptr(n, x, incx);
-}
-
-double CUBLASWINAPI cublasDasum(int n, const double *x, int incx) {
-  using FuncPtr = double(CUBLASWINAPI *)(int, const double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDasum");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasDasum");
-  return func_ptr(n, x, incx);
-}
-
-float CUBLASWINAPI cublasScasum(int n, const cuComplex *x, int incx) {
-  using FuncPtr = float(CUBLASWINAPI *)(int, const cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasScasum");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasScasum");
-  return func_ptr(n, x, incx);
-}
-
-double CUBLASWINAPI cublasDzasum(int n, const cuDoubleComplex *x, int incx) {
-  using FuncPtr = double(CUBLASWINAPI *)(int, const cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDzasum");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasDzasum");
-  return func_ptr(n, x, incx);
-}
-
-void CUBLASWINAPI cublasSrot(int n, float *x, int incx, float *y, int incy,
-                             float sc, float ss) {
-  using FuncPtr =
-      void(CUBLASWINAPI *)(int, float *, int, float *, int, float, float);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSrot");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasSrot");
-  return func_ptr(n, x, incx, y, incy, sc, ss);
-}
-
-void CUBLASWINAPI cublasDrot(int n, double *x, int incx, double *y, int incy,
-                             double sc, double ss) {
-  using FuncPtr =
-      void(CUBLASWINAPI *)(int, double *, int, double *, int, double, double);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDrot");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasDrot");
-  return func_ptr(n, x, incx, y, incy, sc, ss);
-}
-
-void CUBLASWINAPI cublasCrot(int n, cuComplex *x, int incx, cuComplex *y,
-                             int incy, float c, cuComplex s) {
-  using FuncPtr = void(CUBLASWINAPI *)(int, cuComplex *, int, cuComplex *, int,
-                                       float, cuComplex);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCrot");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasCrot");
-  return func_ptr(n, x, incx, y, incy, c, s);
-}
-
-void CUBLASWINAPI cublasZrot(int n, cuDoubleComplex *x, int incx,
-                             cuDoubleComplex *y, int incy, double sc,
-                             cuDoubleComplex cs) {
-  using FuncPtr =
-      void(CUBLASWINAPI *)(int, cuDoubleComplex *, int, cuDoubleComplex *, int,
-                           double, cuDoubleComplex);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZrot");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasZrot");
-  return func_ptr(n, x, incx, y, incy, sc, cs);
-}
-
-void CUBLASWINAPI cublasCsrot(int n, cuComplex *x, int incx, cuComplex *y,
-                              int incy, float c, float s) {
-  using FuncPtr = void(CUBLASWINAPI *)(int, cuComplex *, int, cuComplex *, int,
-                                       float, float);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCsrot");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasCsrot");
-  return func_ptr(n, x, incx, y, incy, c, s);
-}
-
-void CUBLASWINAPI cublasZdrot(int n, cuDoubleComplex *x, int incx,
-                              cuDoubleComplex *y, int incy, double c,
-                              double s) {
-  using FuncPtr = void(CUBLASWINAPI *)(int, cuDoubleComplex *, int,
-                                       cuDoubleComplex *, int, double, double);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZdrot");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasZdrot");
-  return func_ptr(n, x, incx, y, incy, c, s);
-}
-
-void CUBLASWINAPI cublasSrotg(float *sa, float *sb, float *sc, float *ss) {
-  using FuncPtr = void(CUBLASWINAPI *)(float *, float *, float *, float *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSrotg");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasSrotg");
-  return func_ptr(sa, sb, sc, ss);
-}
-
-void CUBLASWINAPI cublasDrotg(double *sa, double *sb, double *sc, double *ss) {
-  using FuncPtr = void(CUBLASWINAPI *)(double *, double *, double *, double *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDrotg");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasDrotg");
-  return func_ptr(sa, sb, sc, ss);
-}
-
-void CUBLASWINAPI cublasCrotg(cuComplex *ca, cuComplex cb, float *sc,
-                              cuComplex *cs) {
-  using FuncPtr =
-      void(CUBLASWINAPI *)(cuComplex *, cuComplex, float *, cuComplex *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCrotg");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasCrotg");
-  return func_ptr(ca, cb, sc, cs);
-}
-
-void CUBLASWINAPI cublasZrotg(cuDoubleComplex *ca, cuDoubleComplex cb,
-                              double *sc, cuDoubleComplex *cs) {
-  using FuncPtr = void(CUBLASWINAPI *)(cuDoubleComplex *, cuDoubleComplex,
-                                       double *, cuDoubleComplex *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZrotg");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasZrotg");
-  return func_ptr(ca, cb, sc, cs);
-}
-
-void CUBLASWINAPI cublasSrotm(int n, float *x, int incx, float *y, int incy,
-                              const float *sparam) {
-  using FuncPtr =
-      void(CUBLASWINAPI *)(int, float *, int, float *, int, const float *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSrotm");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasSrotm");
-  return func_ptr(n, x, incx, y, incy, sparam);
-}
-
-void CUBLASWINAPI cublasDrotm(int n, double *x, int incx, double *y, int incy,
-                              const double *sparam) {
-  using FuncPtr =
-      void(CUBLASWINAPI *)(int, double *, int, double *, int, const double *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDrotm");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasDrotm");
-  return func_ptr(n, x, incx, y, incy, sparam);
-}
-
-void CUBLASWINAPI cublasSrotmg(float *sd1, float *sd2, float *sx1,
-                               const float *sy1, float *sparam) {
-  using FuncPtr =
-      void(CUBLASWINAPI *)(float *, float *, float *, const float *, float *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSrotmg");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasSrotmg");
-  return func_ptr(sd1, sd2, sx1, sy1, sparam);
-}
-
-void CUBLASWINAPI cublasDrotmg(double *sd1, double *sd2, double *sx1,
-                               const double *sy1, double *sparam) {
-  using FuncPtr = void(CUBLASWINAPI *)(double *, double *, double *,
-                                       const double *, double *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDrotmg");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasDrotmg");
-  return func_ptr(sd1, sd2, sx1, sy1, sparam);
-}
-
-void CUBLASWINAPI cublasSgemv(char trans, int m, int n, float alpha,
-                              const float *A, int lda, const float *x, int incx,
-                              float beta, float *y, int incy) {
-  using FuncPtr =
-      void(CUBLASWINAPI *)(char, int, int, float, const float *, int,
-                           const float *, int, float, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSgemv");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasSgemv");
-  return func_ptr(trans, m, n, alpha, A, lda, x, incx, beta, y, incy);
-}
-
-void CUBLASWINAPI cublasDgemv(char trans, int m, int n, double alpha,
-                              const double *A, int lda, const double *x,
-                              int incx, double beta, double *y, int incy) {
-  using FuncPtr =
-      void(CUBLASWINAPI *)(char, int, int, double, const double *, int,
-                           const double *, int, double, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDgemv");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasDgemv");
-  return func_ptr(trans, m, n, alpha, A, lda, x, incx, beta, y, incy);
-}
-
-void CUBLASWINAPI cublasCgemv(char trans, int m, int n, cuComplex alpha,
-                              const cuComplex *A, int lda, const cuComplex *x,
-                              int incx, cuComplex beta, cuComplex *y,
-                              int incy) {
-  using FuncPtr =
-      void(CUBLASWINAPI *)(char, int, int, cuComplex, const cuComplex *, int,
-                           const cuComplex *, int, cuComplex, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCgemv");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasCgemv");
-  return func_ptr(trans, m, n, alpha, A, lda, x, incx, beta, y, incy);
-}
-
-void CUBLASWINAPI cublasZgemv(char trans, int m, int n, cuDoubleComplex alpha,
-                              const cuDoubleComplex *A, int lda,
-                              const cuDoubleComplex *x, int incx,
-                              cuDoubleComplex beta, cuDoubleComplex *y,
-                              int incy) {
-  using FuncPtr = void(CUBLASWINAPI *)(
-      char, int, int, cuDoubleComplex, const cuDoubleComplex *, int,
-      const cuDoubleComplex *, int, cuDoubleComplex, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZgemv");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasZgemv");
-  return func_ptr(trans, m, n, alpha, A, lda, x, incx, beta, y, incy);
-}
-
-void CUBLASWINAPI cublasSgbmv(char trans, int m, int n, int kl, int ku,
-                              float alpha, const float *A, int lda,
-                              const float *x, int incx, float beta, float *y,
-                              int incy) {
-  using FuncPtr =
-      void(CUBLASWINAPI *)(char, int, int, int, int, float, const float *, int,
-                           const float *, int, float, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSgbmv");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasSgbmv");
-  return func_ptr(trans, m, n, kl, ku, alpha, A, lda, x, incx, beta, y, incy);
-}
-
-void CUBLASWINAPI cublasDgbmv(char trans, int m, int n, int kl, int ku,
-                              double alpha, const double *A, int lda,
-                              const double *x, int incx, double beta, double *y,
-                              int incy) {
-  using FuncPtr =
-      void(CUBLASWINAPI *)(char, int, int, int, int, double, const double *,
-                           int, const double *, int, double, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDgbmv");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasDgbmv");
-  return func_ptr(trans, m, n, kl, ku, alpha, A, lda, x, incx, beta, y, incy);
-}
-
-void CUBLASWINAPI cublasCgbmv(char trans, int m, int n, int kl, int ku,
-                              cuComplex alpha, const cuComplex *A, int lda,
-                              const cuComplex *x, int incx, cuComplex beta,
-                              cuComplex *y, int incy) {
-  using FuncPtr = void(CUBLASWINAPI *)(
-      char, int, int, int, int, cuComplex, const cuComplex *, int,
-      const cuComplex *, int, cuComplex, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCgbmv");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasCgbmv");
-  return func_ptr(trans, m, n, kl, ku, alpha, A, lda, x, incx, beta, y, incy);
-}
-
-void CUBLASWINAPI cublasZgbmv(char trans, int m, int n, int kl, int ku,
-                              cuDoubleComplex alpha, const cuDoubleComplex *A,
-                              int lda, const cuDoubleComplex *x, int incx,
-                              cuDoubleComplex beta, cuDoubleComplex *y,
-                              int incy) {
-  using FuncPtr = void(CUBLASWINAPI *)(
-      char, int, int, int, int, cuDoubleComplex, const cuDoubleComplex *, int,
-      const cuDoubleComplex *, int, cuDoubleComplex, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZgbmv");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasZgbmv");
-  return func_ptr(trans, m, n, kl, ku, alpha, A, lda, x, incx, beta, y, incy);
-}
-
-void CUBLASWINAPI cublasStrmv(char uplo, char trans, char diag, int n,
-                              const float *A, int lda, float *x, int incx) {
-  using FuncPtr = void(CUBLASWINAPI *)(char, char, char, int, const float *,
-                                       int, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasStrmv");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasStrmv");
-  return func_ptr(uplo, trans, diag, n, A, lda, x, incx);
-}
-
-void CUBLASWINAPI cublasDtrmv(char uplo, char trans, char diag, int n,
-                              const double *A, int lda, double *x, int incx) {
-  using FuncPtr = void(CUBLASWINAPI *)(char, char, char, int, const double *,
-                                       int, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDtrmv");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasDtrmv");
-  return func_ptr(uplo, trans, diag, n, A, lda, x, incx);
-}
-
-void CUBLASWINAPI cublasCtrmv(char uplo, char trans, char diag, int n,
-                              const cuComplex *A, int lda, cuComplex *x,
-                              int incx) {
-  using FuncPtr = void(CUBLASWINAPI *)(char, char, char, int, const cuComplex *,
-                                       int, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCtrmv");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasCtrmv");
-  return func_ptr(uplo, trans, diag, n, A, lda, x, incx);
-}
-
-void CUBLASWINAPI cublasZtrmv(char uplo, char trans, char diag, int n,
-                              const cuDoubleComplex *A, int lda,
-                              cuDoubleComplex *x, int incx) {
-  using FuncPtr =
-      void(CUBLASWINAPI *)(char, char, char, int, const cuDoubleComplex *, int,
-                           cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZtrmv");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasZtrmv");
-  return func_ptr(uplo, trans, diag, n, A, lda, x, incx);
-}
-
-void CUBLASWINAPI cublasStbmv(char uplo, char trans, char diag, int n, int k,
-                              const float *A, int lda, float *x, int incx) {
-  using FuncPtr = void(CUBLASWINAPI *)(char, char, char, int, int,
-                                       const float *, int, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasStbmv");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasStbmv");
-  return func_ptr(uplo, trans, diag, n, k, A, lda, x, incx);
-}
-
-void CUBLASWINAPI cublasDtbmv(char uplo, char trans, char diag, int n, int k,
-                              const double *A, int lda, double *x, int incx) {
-  using FuncPtr = void(CUBLASWINAPI *)(char, char, char, int, int,
-                                       const double *, int, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDtbmv");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasDtbmv");
-  return func_ptr(uplo, trans, diag, n, k, A, lda, x, incx);
-}
-
-void CUBLASWINAPI cublasCtbmv(char uplo, char trans, char diag, int n, int k,
-                              const cuComplex *A, int lda, cuComplex *x,
-                              int incx) {
-  using FuncPtr = void(CUBLASWINAPI *)(
-      char, char, char, int, int, const cuComplex *, int, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCtbmv");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasCtbmv");
-  return func_ptr(uplo, trans, diag, n, k, A, lda, x, incx);
-}
-
-void CUBLASWINAPI cublasZtbmv(char uplo, char trans, char diag, int n, int k,
-                              const cuDoubleComplex *A, int lda,
-                              cuDoubleComplex *x, int incx) {
-  using FuncPtr =
-      void(CUBLASWINAPI *)(char, char, char, int, int, const cuDoubleComplex *,
-                           int, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZtbmv");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasZtbmv");
-  return func_ptr(uplo, trans, diag, n, k, A, lda, x, incx);
-}
-
-void CUBLASWINAPI cublasStpmv(char uplo, char trans, char diag, int n,
-                              const float *AP, float *x, int incx) {
-  using FuncPtr =
-      void(CUBLASWINAPI *)(char, char, char, int, const float *, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasStpmv");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasStpmv");
-  return func_ptr(uplo, trans, diag, n, AP, x, incx);
-}
-
-void CUBLASWINAPI cublasDtpmv(char uplo, char trans, char diag, int n,
-                              const double *AP, double *x, int incx) {
-  using FuncPtr = void(CUBLASWINAPI *)(char, char, char, int, const double *,
-                                       double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDtpmv");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasDtpmv");
-  return func_ptr(uplo, trans, diag, n, AP, x, incx);
-}
-
-void CUBLASWINAPI cublasCtpmv(char uplo, char trans, char diag, int n,
-                              const cuComplex *AP, cuComplex *x, int incx) {
-  using FuncPtr = void(CUBLASWINAPI *)(char, char, char, int, const cuComplex *,
-                                       cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCtpmv");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasCtpmv");
-  return func_ptr(uplo, trans, diag, n, AP, x, incx);
-}
-
-void CUBLASWINAPI cublasZtpmv(char uplo, char trans, char diag, int n,
-                              const cuDoubleComplex *AP, cuDoubleComplex *x,
-                              int incx) {
-  using FuncPtr = void(CUBLASWINAPI *)(
-      char, char, char, int, const cuDoubleComplex *, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZtpmv");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasZtpmv");
-  return func_ptr(uplo, trans, diag, n, AP, x, incx);
-}
-
-void CUBLASWINAPI cublasStrsv(char uplo, char trans, char diag, int n,
-                              const float *A, int lda, float *x, int incx) {
-  using FuncPtr = void(CUBLASWINAPI *)(char, char, char, int, const float *,
-                                       int, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasStrsv");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasStrsv");
-  return func_ptr(uplo, trans, diag, n, A, lda, x, incx);
-}
-
-void CUBLASWINAPI cublasDtrsv(char uplo, char trans, char diag, int n,
-                              const double *A, int lda, double *x, int incx) {
-  using FuncPtr = void(CUBLASWINAPI *)(char, char, char, int, const double *,
-                                       int, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDtrsv");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasDtrsv");
-  return func_ptr(uplo, trans, diag, n, A, lda, x, incx);
-}
-
-void CUBLASWINAPI cublasCtrsv(char uplo, char trans, char diag, int n,
-                              const cuComplex *A, int lda, cuComplex *x,
-                              int incx) {
-  using FuncPtr = void(CUBLASWINAPI *)(char, char, char, int, const cuComplex *,
-                                       int, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCtrsv");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasCtrsv");
-  return func_ptr(uplo, trans, diag, n, A, lda, x, incx);
-}
-
-void CUBLASWINAPI cublasZtrsv(char uplo, char trans, char diag, int n,
-                              const cuDoubleComplex *A, int lda,
-                              cuDoubleComplex *x, int incx) {
-  using FuncPtr =
-      void(CUBLASWINAPI *)(char, char, char, int, const cuDoubleComplex *, int,
-                           cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZtrsv");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasZtrsv");
-  return func_ptr(uplo, trans, diag, n, A, lda, x, incx);
-}
-
-void CUBLASWINAPI cublasStpsv(char uplo, char trans, char diag, int n,
-                              const float *AP, float *x, int incx) {
-  using FuncPtr =
-      void(CUBLASWINAPI *)(char, char, char, int, const float *, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasStpsv");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasStpsv");
-  return func_ptr(uplo, trans, diag, n, AP, x, incx);
-}
-
-void CUBLASWINAPI cublasDtpsv(char uplo, char trans, char diag, int n,
-                              const double *AP, double *x, int incx) {
-  using FuncPtr = void(CUBLASWINAPI *)(char, char, char, int, const double *,
-                                       double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDtpsv");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasDtpsv");
-  return func_ptr(uplo, trans, diag, n, AP, x, incx);
-}
-
-void CUBLASWINAPI cublasCtpsv(char uplo, char trans, char diag, int n,
-                              const cuComplex *AP, cuComplex *x, int incx) {
-  using FuncPtr = void(CUBLASWINAPI *)(char, char, char, int, const cuComplex *,
-                                       cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCtpsv");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasCtpsv");
-  return func_ptr(uplo, trans, diag, n, AP, x, incx);
-}
-
-void CUBLASWINAPI cublasZtpsv(char uplo, char trans, char diag, int n,
-                              const cuDoubleComplex *AP, cuDoubleComplex *x,
-                              int incx) {
-  using FuncPtr = void(CUBLASWINAPI *)(
-      char, char, char, int, const cuDoubleComplex *, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZtpsv");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasZtpsv");
-  return func_ptr(uplo, trans, diag, n, AP, x, incx);
-}
-
-void CUBLASWINAPI cublasStbsv(char uplo, char trans, char diag, int n, int k,
-                              const float *A, int lda, float *x, int incx) {
-  using FuncPtr = void(CUBLASWINAPI *)(char, char, char, int, int,
-                                       const float *, int, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasStbsv");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasStbsv");
-  return func_ptr(uplo, trans, diag, n, k, A, lda, x, incx);
-}
-
-void CUBLASWINAPI cublasDtbsv(char uplo, char trans, char diag, int n, int k,
-                              const double *A, int lda, double *x, int incx) {
-  using FuncPtr = void(CUBLASWINAPI *)(char, char, char, int, int,
-                                       const double *, int, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDtbsv");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasDtbsv");
-  return func_ptr(uplo, trans, diag, n, k, A, lda, x, incx);
-}
-
-void CUBLASWINAPI cublasCtbsv(char uplo, char trans, char diag, int n, int k,
-                              const cuComplex *A, int lda, cuComplex *x,
-                              int incx) {
-  using FuncPtr = void(CUBLASWINAPI *)(
-      char, char, char, int, int, const cuComplex *, int, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCtbsv");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasCtbsv");
-  return func_ptr(uplo, trans, diag, n, k, A, lda, x, incx);
-}
-
-void CUBLASWINAPI cublasZtbsv(char uplo, char trans, char diag, int n, int k,
-                              const cuDoubleComplex *A, int lda,
-                              cuDoubleComplex *x, int incx) {
-  using FuncPtr =
-      void(CUBLASWINAPI *)(char, char, char, int, int, const cuDoubleComplex *,
-                           int, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZtbsv");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasZtbsv");
-  return func_ptr(uplo, trans, diag, n, k, A, lda, x, incx);
-}
-
-void CUBLASWINAPI cublasSsymv(char uplo, int n, float alpha, const float *A,
-                              int lda, const float *x, int incx, float beta,
-                              float *y, int incy) {
-  using FuncPtr = void(CUBLASWINAPI *)(char, int, float, const float *, int,
-                                       const float *, int, float, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSsymv");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasSsymv");
-  return func_ptr(uplo, n, alpha, A, lda, x, incx, beta, y, incy);
-}
-
-void CUBLASWINAPI cublasDsymv(char uplo, int n, double alpha, const double *A,
-                              int lda, const double *x, int incx, double beta,
-                              double *y, int incy) {
-  using FuncPtr =
-      void(CUBLASWINAPI *)(char, int, double, const double *, int,
-                           const double *, int, double, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDsymv");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasDsymv");
-  return func_ptr(uplo, n, alpha, A, lda, x, incx, beta, y, incy);
-}
-
-void CUBLASWINAPI cublasChemv(char uplo, int n, cuComplex alpha,
-                              const cuComplex *A, int lda, const cuComplex *x,
-                              int incx, cuComplex beta, cuComplex *y,
-                              int incy) {
-  using FuncPtr =
-      void(CUBLASWINAPI *)(char, int, cuComplex, const cuComplex *, int,
-                           const cuComplex *, int, cuComplex, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasChemv");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasChemv");
-  return func_ptr(uplo, n, alpha, A, lda, x, incx, beta, y, incy);
-}
-
-void CUBLASWINAPI cublasZhemv(char uplo, int n, cuDoubleComplex alpha,
-                              const cuDoubleComplex *A, int lda,
-                              const cuDoubleComplex *x, int incx,
-                              cuDoubleComplex beta, cuDoubleComplex *y,
-                              int incy) {
-  using FuncPtr = void(CUBLASWINAPI *)(
-      char, int, cuDoubleComplex, const cuDoubleComplex *, int,
-      const cuDoubleComplex *, int, cuDoubleComplex, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZhemv");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasZhemv");
-  return func_ptr(uplo, n, alpha, A, lda, x, incx, beta, y, incy);
-}
-
-void CUBLASWINAPI cublasSsbmv(char uplo, int n, int k, float alpha,
-                              const float *A, int lda, const float *x, int incx,
-                              float beta, float *y, int incy) {
-  using FuncPtr =
-      void(CUBLASWINAPI *)(char, int, int, float, const float *, int,
-                           const float *, int, float, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSsbmv");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasSsbmv");
-  return func_ptr(uplo, n, k, alpha, A, lda, x, incx, beta, y, incy);
-}
-
-void CUBLASWINAPI cublasDsbmv(char uplo, int n, int k, double alpha,
-                              const double *A, int lda, const double *x,
-                              int incx, double beta, double *y, int incy) {
-  using FuncPtr =
-      void(CUBLASWINAPI *)(char, int, int, double, const double *, int,
-                           const double *, int, double, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDsbmv");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasDsbmv");
-  return func_ptr(uplo, n, k, alpha, A, lda, x, incx, beta, y, incy);
-}
-
-void CUBLASWINAPI cublasChbmv(char uplo, int n, int k, cuComplex alpha,
-                              const cuComplex *A, int lda, const cuComplex *x,
-                              int incx, cuComplex beta, cuComplex *y,
-                              int incy) {
-  using FuncPtr =
-      void(CUBLASWINAPI *)(char, int, int, cuComplex, const cuComplex *, int,
-                           const cuComplex *, int, cuComplex, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasChbmv");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasChbmv");
-  return func_ptr(uplo, n, k, alpha, A, lda, x, incx, beta, y, incy);
-}
-
-void CUBLASWINAPI cublasZhbmv(char uplo, int n, int k, cuDoubleComplex alpha,
-                              const cuDoubleComplex *A, int lda,
-                              const cuDoubleComplex *x, int incx,
-                              cuDoubleComplex beta, cuDoubleComplex *y,
-                              int incy) {
-  using FuncPtr = void(CUBLASWINAPI *)(
-      char, int, int, cuDoubleComplex, const cuDoubleComplex *, int,
-      const cuDoubleComplex *, int, cuDoubleComplex, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZhbmv");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasZhbmv");
-  return func_ptr(uplo, n, k, alpha, A, lda, x, incx, beta, y, incy);
-}
-
-void CUBLASWINAPI cublasSspmv(char uplo, int n, float alpha, const float *AP,
-                              const float *x, int incx, float beta, float *y,
-                              int incy) {
-  using FuncPtr = void(CUBLASWINAPI *)(char, int, float, const float *,
-                                       const float *, int, float, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSspmv");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasSspmv");
-  return func_ptr(uplo, n, alpha, AP, x, incx, beta, y, incy);
-}
-
-void CUBLASWINAPI cublasDspmv(char uplo, int n, double alpha, const double *AP,
-                              const double *x, int incx, double beta, double *y,
-                              int incy) {
-  using FuncPtr =
-      void(CUBLASWINAPI *)(char, int, double, const double *, const double *,
-                           int, double, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDspmv");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasDspmv");
-  return func_ptr(uplo, n, alpha, AP, x, incx, beta, y, incy);
-}
-
-void CUBLASWINAPI cublasChpmv(char uplo, int n, cuComplex alpha,
-                              const cuComplex *AP, const cuComplex *x, int incx,
-                              cuComplex beta, cuComplex *y, int incy) {
-  using FuncPtr =
-      void(CUBLASWINAPI *)(char, int, cuComplex, const cuComplex *,
-                           const cuComplex *, int, cuComplex, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasChpmv");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasChpmv");
-  return func_ptr(uplo, n, alpha, AP, x, incx, beta, y, incy);
-}
-
-void CUBLASWINAPI cublasZhpmv(char uplo, int n, cuDoubleComplex alpha,
-                              const cuDoubleComplex *AP,
-                              const cuDoubleComplex *x, int incx,
-                              cuDoubleComplex beta, cuDoubleComplex *y,
-                              int incy) {
-  using FuncPtr = void(CUBLASWINAPI *)(
-      char, int, cuDoubleComplex, const cuDoubleComplex *,
-      const cuDoubleComplex *, int, cuDoubleComplex, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZhpmv");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasZhpmv");
-  return func_ptr(uplo, n, alpha, AP, x, incx, beta, y, incy);
-}
-
-void CUBLASWINAPI cublasSger(int m, int n, float alpha, const float *x,
-                             int incx, const float *y, int incy, float *A,
-                             int lda) {
-  using FuncPtr = void(CUBLASWINAPI *)(int, int, float, const float *, int,
-                                       const float *, int, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSger");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasSger");
-  return func_ptr(m, n, alpha, x, incx, y, incy, A, lda);
-}
-
-void CUBLASWINAPI cublasDger(int m, int n, double alpha, const double *x,
-                             int incx, const double *y, int incy, double *A,
-                             int lda) {
-  using FuncPtr = void(CUBLASWINAPI *)(int, int, double, const double *, int,
-                                       const double *, int, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDger");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasDger");
-  return func_ptr(m, n, alpha, x, incx, y, incy, A, lda);
-}
-
-void CUBLASWINAPI cublasCgeru(int m, int n, cuComplex alpha, const cuComplex *x,
-                              int incx, const cuComplex *y, int incy,
-                              cuComplex *A, int lda) {
-  using FuncPtr =
-      void(CUBLASWINAPI *)(int, int, cuComplex, const cuComplex *, int,
-                           const cuComplex *, int, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCgeru");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasCgeru");
-  return func_ptr(m, n, alpha, x, incx, y, incy, A, lda);
-}
-
-void CUBLASWINAPI cublasCgerc(int m, int n, cuComplex alpha, const cuComplex *x,
-                              int incx, const cuComplex *y, int incy,
-                              cuComplex *A, int lda) {
-  using FuncPtr =
-      void(CUBLASWINAPI *)(int, int, cuComplex, const cuComplex *, int,
-                           const cuComplex *, int, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCgerc");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasCgerc");
-  return func_ptr(m, n, alpha, x, incx, y, incy, A, lda);
-}
-
-void CUBLASWINAPI cublasZgeru(int m, int n, cuDoubleComplex alpha,
-                              const cuDoubleComplex *x, int incx,
-                              const cuDoubleComplex *y, int incy,
-                              cuDoubleComplex *A, int lda) {
-  using FuncPtr = void(CUBLASWINAPI *)(
-      int, int, cuDoubleComplex, const cuDoubleComplex *, int,
-      const cuDoubleComplex *, int, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZgeru");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasZgeru");
-  return func_ptr(m, n, alpha, x, incx, y, incy, A, lda);
-}
-
-void CUBLASWINAPI cublasZgerc(int m, int n, cuDoubleComplex alpha,
-                              const cuDoubleComplex *x, int incx,
-                              const cuDoubleComplex *y, int incy,
-                              cuDoubleComplex *A, int lda) {
-  using FuncPtr = void(CUBLASWINAPI *)(
-      int, int, cuDoubleComplex, const cuDoubleComplex *, int,
-      const cuDoubleComplex *, int, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZgerc");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasZgerc");
-  return func_ptr(m, n, alpha, x, incx, y, incy, A, lda);
-}
-
-void CUBLASWINAPI cublasSsyr(char uplo, int n, float alpha, const float *x,
-                             int incx, float *A, int lda) {
-  using FuncPtr =
-      void(CUBLASWINAPI *)(char, int, float, const float *, int, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSsyr");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasSsyr");
-  return func_ptr(uplo, n, alpha, x, incx, A, lda);
-}
-
-void CUBLASWINAPI cublasDsyr(char uplo, int n, double alpha, const double *x,
-                             int incx, double *A, int lda) {
-  using FuncPtr = void(CUBLASWINAPI *)(char, int, double, const double *, int,
-                                       double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDsyr");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasDsyr");
-  return func_ptr(uplo, n, alpha, x, incx, A, lda);
-}
-
-void CUBLASWINAPI cublasCher(char uplo, int n, float alpha, const cuComplex *x,
-                             int incx, cuComplex *A, int lda) {
-  using FuncPtr = void(CUBLASWINAPI *)(char, int, float, const cuComplex *, int,
-                                       cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCher");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasCher");
-  return func_ptr(uplo, n, alpha, x, incx, A, lda);
-}
-
-void CUBLASWINAPI cublasZher(char uplo, int n, double alpha,
-                             const cuDoubleComplex *x, int incx,
-                             cuDoubleComplex *A, int lda) {
-  using FuncPtr = void(CUBLASWINAPI *)(
-      char, int, double, const cuDoubleComplex *, int, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZher");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasZher");
-  return func_ptr(uplo, n, alpha, x, incx, A, lda);
-}
-
-void CUBLASWINAPI cublasSspr(char uplo, int n, float alpha, const float *x,
-                             int incx, float *AP) {
-  using FuncPtr =
-      void(CUBLASWINAPI *)(char, int, float, const float *, int, float *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSspr");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasSspr");
-  return func_ptr(uplo, n, alpha, x, incx, AP);
-}
-
-void CUBLASWINAPI cublasDspr(char uplo, int n, double alpha, const double *x,
-                             int incx, double *AP) {
-  using FuncPtr =
-      void(CUBLASWINAPI *)(char, int, double, const double *, int, double *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDspr");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasDspr");
-  return func_ptr(uplo, n, alpha, x, incx, AP);
-}
-
-void CUBLASWINAPI cublasChpr(char uplo, int n, float alpha, const cuComplex *x,
-                             int incx, cuComplex *AP) {
-  using FuncPtr = void(CUBLASWINAPI *)(char, int, float, const cuComplex *, int,
-                                       cuComplex *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasChpr");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasChpr");
-  return func_ptr(uplo, n, alpha, x, incx, AP);
-}
-
-void CUBLASWINAPI cublasZhpr(char uplo, int n, double alpha,
-                             const cuDoubleComplex *x, int incx,
-                             cuDoubleComplex *AP) {
-  using FuncPtr = void(CUBLASWINAPI *)(
-      char, int, double, const cuDoubleComplex *, int, cuDoubleComplex *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZhpr");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasZhpr");
-  return func_ptr(uplo, n, alpha, x, incx, AP);
-}
-
-void CUBLASWINAPI cublasSsyr2(char uplo, int n, float alpha, const float *x,
-                              int incx, const float *y, int incy, float *A,
-                              int lda) {
-  using FuncPtr = void(CUBLASWINAPI *)(char, int, float, const float *, int,
-                                       const float *, int, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSsyr2");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasSsyr2");
-  return func_ptr(uplo, n, alpha, x, incx, y, incy, A, lda);
-}
-
-void CUBLASWINAPI cublasDsyr2(char uplo, int n, double alpha, const double *x,
-                              int incx, const double *y, int incy, double *A,
-                              int lda) {
-  using FuncPtr = void(CUBLASWINAPI *)(char, int, double, const double *, int,
-                                       const double *, int, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDsyr2");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasDsyr2");
-  return func_ptr(uplo, n, alpha, x, incx, y, incy, A, lda);
-}
-
-void CUBLASWINAPI cublasCher2(char uplo, int n, cuComplex alpha,
-                              const cuComplex *x, int incx, const cuComplex *y,
-                              int incy, cuComplex *A, int lda) {
-  using FuncPtr =
-      void(CUBLASWINAPI *)(char, int, cuComplex, const cuComplex *, int,
-                           const cuComplex *, int, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCher2");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasCher2");
-  return func_ptr(uplo, n, alpha, x, incx, y, incy, A, lda);
-}
-
-void CUBLASWINAPI cublasZher2(char uplo, int n, cuDoubleComplex alpha,
-                              const cuDoubleComplex *x, int incx,
-                              const cuDoubleComplex *y, int incy,
-                              cuDoubleComplex *A, int lda) {
-  using FuncPtr = void(CUBLASWINAPI *)(
-      char, int, cuDoubleComplex, const cuDoubleComplex *, int,
-      const cuDoubleComplex *, int, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZher2");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasZher2");
-  return func_ptr(uplo, n, alpha, x, incx, y, incy, A, lda);
-}
-
-void CUBLASWINAPI cublasSspr2(char uplo, int n, float alpha, const float *x,
-                              int incx, const float *y, int incy, float *AP) {
-  using FuncPtr = void(CUBLASWINAPI *)(char, int, float, const float *, int,
-                                       const float *, int, float *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSspr2");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasSspr2");
-  return func_ptr(uplo, n, alpha, x, incx, y, incy, AP);
-}
-
-void CUBLASWINAPI cublasDspr2(char uplo, int n, double alpha, const double *x,
-                              int incx, const double *y, int incy, double *AP) {
-  using FuncPtr = void(CUBLASWINAPI *)(char, int, double, const double *, int,
-                                       const double *, int, double *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDspr2");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasDspr2");
-  return func_ptr(uplo, n, alpha, x, incx, y, incy, AP);
-}
-
-void CUBLASWINAPI cublasChpr2(char uplo, int n, cuComplex alpha,
-                              const cuComplex *x, int incx, const cuComplex *y,
-                              int incy, cuComplex *AP) {
-  using FuncPtr =
-      void(CUBLASWINAPI *)(char, int, cuComplex, const cuComplex *, int,
-                           const cuComplex *, int, cuComplex *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasChpr2");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasChpr2");
-  return func_ptr(uplo, n, alpha, x, incx, y, incy, AP);
-}
-
-void CUBLASWINAPI cublasZhpr2(char uplo, int n, cuDoubleComplex alpha,
-                              const cuDoubleComplex *x, int incx,
-                              const cuDoubleComplex *y, int incy,
-                              cuDoubleComplex *AP) {
-  using FuncPtr = void(CUBLASWINAPI *)(
-      char, int, cuDoubleComplex, const cuDoubleComplex *, int,
-      const cuDoubleComplex *, int, cuDoubleComplex *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZhpr2");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasZhpr2");
-  return func_ptr(uplo, n, alpha, x, incx, y, incy, AP);
-}
-
-void CUBLASWINAPI cublasSgemm(char transa, char transb, int m, int n, int k,
-                              float alpha, const float *A, int lda,
-                              const float *B, int ldb, float beta, float *C,
-                              int ldc) {
-  using FuncPtr =
-      void(CUBLASWINAPI *)(char, char, int, int, int, float, const float *, int,
-                           const float *, int, float, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSgemm");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasSgemm");
-  return func_ptr(transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
-}
-
-void CUBLASWINAPI cublasDgemm(char transa, char transb, int m, int n, int k,
-                              double alpha, const double *A, int lda,
-                              const double *B, int ldb, double beta, double *C,
-                              int ldc) {
-  using FuncPtr =
-      void(CUBLASWINAPI *)(char, char, int, int, int, double, const double *,
-                           int, const double *, int, double, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDgemm");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasDgemm");
-  return func_ptr(transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
-}
-
-void CUBLASWINAPI cublasCgemm(char transa, char transb, int m, int n, int k,
-                              cuComplex alpha, const cuComplex *A, int lda,
-                              const cuComplex *B, int ldb, cuComplex beta,
-                              cuComplex *C, int ldc) {
-  using FuncPtr = void(CUBLASWINAPI *)(
-      char, char, int, int, int, cuComplex, const cuComplex *, int,
-      const cuComplex *, int, cuComplex, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCgemm");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasCgemm");
-  return func_ptr(transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
-}
-
-void CUBLASWINAPI cublasZgemm(char transa, char transb, int m, int n, int k,
-                              cuDoubleComplex alpha, const cuDoubleComplex *A,
-                              int lda, const cuDoubleComplex *B, int ldb,
-                              cuDoubleComplex beta, cuDoubleComplex *C,
-                              int ldc) {
-  using FuncPtr = void(CUBLASWINAPI *)(
-      char, char, int, int, int, cuDoubleComplex, const cuDoubleComplex *, int,
-      const cuDoubleComplex *, int, cuDoubleComplex, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZgemm");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasZgemm");
-  return func_ptr(transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
-}
-
-void CUBLASWINAPI cublasSsyrk(char uplo, char trans, int n, int k, float alpha,
-                              const float *A, int lda, float beta, float *C,
-                              int ldc) {
-  using FuncPtr = void(CUBLASWINAPI *)(char, char, int, int, float,
-                                       const float *, int, float, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSsyrk");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasSsyrk");
-  return func_ptr(uplo, trans, n, k, alpha, A, lda, beta, C, ldc);
-}
-
-void CUBLASWINAPI cublasDsyrk(char uplo, char trans, int n, int k, double alpha,
-                              const double *A, int lda, double beta, double *C,
-                              int ldc) {
-  using FuncPtr = void(CUBLASWINAPI *)(
-      char, char, int, int, double, const double *, int, double, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDsyrk");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasDsyrk");
-  return func_ptr(uplo, trans, n, k, alpha, A, lda, beta, C, ldc);
-}
-
-void CUBLASWINAPI cublasCsyrk(char uplo, char trans, int n, int k,
-                              cuComplex alpha, const cuComplex *A, int lda,
-                              cuComplex beta, cuComplex *C, int ldc) {
-  using FuncPtr =
-      void(CUBLASWINAPI *)(char, char, int, int, cuComplex, const cuComplex *,
-                           int, cuComplex, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCsyrk");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasCsyrk");
-  return func_ptr(uplo, trans, n, k, alpha, A, lda, beta, C, ldc);
-}
-
-void CUBLASWINAPI cublasZsyrk(char uplo, char trans, int n, int k,
-                              cuDoubleComplex alpha, const cuDoubleComplex *A,
-                              int lda, cuDoubleComplex beta, cuDoubleComplex *C,
-                              int ldc) {
-  using FuncPtr = void(CUBLASWINAPI *)(char, char, int, int, cuDoubleComplex,
-                                       const cuDoubleComplex *, int,
-                                       cuDoubleComplex, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZsyrk");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasZsyrk");
-  return func_ptr(uplo, trans, n, k, alpha, A, lda, beta, C, ldc);
-}
-
-void CUBLASWINAPI cublasCherk(char uplo, char trans, int n, int k, float alpha,
-                              const cuComplex *A, int lda, float beta,
-                              cuComplex *C, int ldc) {
-  using FuncPtr =
-      void(CUBLASWINAPI *)(char, char, int, int, float, const cuComplex *, int,
-                           float, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCherk");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasCherk");
-  return func_ptr(uplo, trans, n, k, alpha, A, lda, beta, C, ldc);
-}
-
-void CUBLASWINAPI cublasZherk(char uplo, char trans, int n, int k, double alpha,
-                              const cuDoubleComplex *A, int lda, double beta,
-                              cuDoubleComplex *C, int ldc) {
-  using FuncPtr = void(CUBLASWINAPI *)(char, char, int, int, double,
-                                       const cuDoubleComplex *, int, double,
-                                       cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZherk");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasZherk");
-  return func_ptr(uplo, trans, n, k, alpha, A, lda, beta, C, ldc);
-}
-
-void CUBLASWINAPI cublasSsyr2k(char uplo, char trans, int n, int k, float alpha,
-                               const float *A, int lda, const float *B, int ldb,
-                               float beta, float *C, int ldc) {
-  using FuncPtr =
-      void(CUBLASWINAPI *)(char, char, int, int, float, const float *, int,
-                           const float *, int, float, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSsyr2k");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasSsyr2k");
-  return func_ptr(uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
-}
-
-void CUBLASWINAPI cublasDsyr2k(char uplo, char trans, int n, int k,
-                               double alpha, const double *A, int lda,
-                               const double *B, int ldb, double beta, double *C,
-                               int ldc) {
-  using FuncPtr =
-      void(CUBLASWINAPI *)(char, char, int, int, double, const double *, int,
-                           const double *, int, double, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDsyr2k");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasDsyr2k");
-  return func_ptr(uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
-}
-
-void CUBLASWINAPI cublasCsyr2k(char uplo, char trans, int n, int k,
-                               cuComplex alpha, const cuComplex *A, int lda,
-                               const cuComplex *B, int ldb, cuComplex beta,
-                               cuComplex *C, int ldc) {
-  using FuncPtr = void(CUBLASWINAPI *)(
-      char, char, int, int, cuComplex, const cuComplex *, int,
-      const cuComplex *, int, cuComplex, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCsyr2k");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasCsyr2k");
-  return func_ptr(uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
-}
-
-void CUBLASWINAPI cublasZsyr2k(char uplo, char trans, int n, int k,
-                               cuDoubleComplex alpha, const cuDoubleComplex *A,
-                               int lda, const cuDoubleComplex *B, int ldb,
-                               cuDoubleComplex beta, cuDoubleComplex *C,
-                               int ldc) {
-  using FuncPtr = void(CUBLASWINAPI *)(
-      char, char, int, int, cuDoubleComplex, const cuDoubleComplex *, int,
-      const cuDoubleComplex *, int, cuDoubleComplex, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZsyr2k");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasZsyr2k");
-  return func_ptr(uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
-}
-
-void CUBLASWINAPI cublasCher2k(char uplo, char trans, int n, int k,
-                               cuComplex alpha, const cuComplex *A, int lda,
-                               const cuComplex *B, int ldb, float beta,
-                               cuComplex *C, int ldc) {
-  using FuncPtr = void(CUBLASWINAPI *)(
-      char, char, int, int, cuComplex, const cuComplex *, int,
-      const cuComplex *, int, float, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCher2k");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasCher2k");
-  return func_ptr(uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
-}
-
-void CUBLASWINAPI cublasZher2k(char uplo, char trans, int n, int k,
-                               cuDoubleComplex alpha, const cuDoubleComplex *A,
-                               int lda, const cuDoubleComplex *B, int ldb,
-                               double beta, cuDoubleComplex *C, int ldc) {
-  using FuncPtr = void(CUBLASWINAPI *)(
-      char, char, int, int, cuDoubleComplex, const cuDoubleComplex *, int,
-      const cuDoubleComplex *, int, double, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZher2k");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasZher2k");
-  return func_ptr(uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
-}
-
-void CUBLASWINAPI cublasSsymm(char side, char uplo, int m, int n, float alpha,
-                              const float *A, int lda, const float *B, int ldb,
-                              float beta, float *C, int ldc) {
-  using FuncPtr =
-      void(CUBLASWINAPI *)(char, char, int, int, float, const float *, int,
-                           const float *, int, float, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSsymm");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasSsymm");
-  return func_ptr(side, uplo, m, n, alpha, A, lda, B, ldb, beta, C, ldc);
-}
-
-void CUBLASWINAPI cublasDsymm(char side, char uplo, int m, int n, double alpha,
-                              const double *A, int lda, const double *B,
-                              int ldb, double beta, double *C, int ldc) {
-  using FuncPtr =
-      void(CUBLASWINAPI *)(char, char, int, int, double, const double *, int,
-                           const double *, int, double, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDsymm");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasDsymm");
-  return func_ptr(side, uplo, m, n, alpha, A, lda, B, ldb, beta, C, ldc);
-}
-
-void CUBLASWINAPI cublasCsymm(char side, char uplo, int m, int n,
-                              cuComplex alpha, const cuComplex *A, int lda,
-                              const cuComplex *B, int ldb, cuComplex beta,
-                              cuComplex *C, int ldc) {
-  using FuncPtr = void(CUBLASWINAPI *)(
-      char, char, int, int, cuComplex, const cuComplex *, int,
-      const cuComplex *, int, cuComplex, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCsymm");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasCsymm");
-  return func_ptr(side, uplo, m, n, alpha, A, lda, B, ldb, beta, C, ldc);
-}
-
-void CUBLASWINAPI cublasZsymm(char side, char uplo, int m, int n,
-                              cuDoubleComplex alpha, const cuDoubleComplex *A,
-                              int lda, const cuDoubleComplex *B, int ldb,
-                              cuDoubleComplex beta, cuDoubleComplex *C,
-                              int ldc) {
-  using FuncPtr = void(CUBLASWINAPI *)(
-      char, char, int, int, cuDoubleComplex, const cuDoubleComplex *, int,
-      const cuDoubleComplex *, int, cuDoubleComplex, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZsymm");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasZsymm");
-  return func_ptr(side, uplo, m, n, alpha, A, lda, B, ldb, beta, C, ldc);
-}
-
-void CUBLASWINAPI cublasChemm(char side, char uplo, int m, int n,
-                              cuComplex alpha, const cuComplex *A, int lda,
-                              const cuComplex *B, int ldb, cuComplex beta,
-                              cuComplex *C, int ldc) {
-  using FuncPtr = void(CUBLASWINAPI *)(
-      char, char, int, int, cuComplex, const cuComplex *, int,
-      const cuComplex *, int, cuComplex, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasChemm");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasChemm");
-  return func_ptr(side, uplo, m, n, alpha, A, lda, B, ldb, beta, C, ldc);
-}
-
-void CUBLASWINAPI cublasZhemm(char side, char uplo, int m, int n,
-                              cuDoubleComplex alpha, const cuDoubleComplex *A,
-                              int lda, const cuDoubleComplex *B, int ldb,
-                              cuDoubleComplex beta, cuDoubleComplex *C,
-                              int ldc) {
-  using FuncPtr = void(CUBLASWINAPI *)(
-      char, char, int, int, cuDoubleComplex, const cuDoubleComplex *, int,
-      const cuDoubleComplex *, int, cuDoubleComplex, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZhemm");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasZhemm");
-  return func_ptr(side, uplo, m, n, alpha, A, lda, B, ldb, beta, C, ldc);
-}
-
-void CUBLASWINAPI cublasStrsm(char side, char uplo, char transa, char diag,
-                              int m, int n, float alpha, const float *A,
-                              int lda, float *B, int ldb) {
-  using FuncPtr = void(CUBLASWINAPI *)(char, char, char, char, int, int, float,
-                                       const float *, int, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasStrsm");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasStrsm");
-  return func_ptr(side, uplo, transa, diag, m, n, alpha, A, lda, B, ldb);
-}
-
-void CUBLASWINAPI cublasDtrsm(char side, char uplo, char transa, char diag,
-                              int m, int n, double alpha, const double *A,
-                              int lda, double *B, int ldb) {
-  using FuncPtr = void(CUBLASWINAPI *)(char, char, char, char, int, int, double,
-                                       const double *, int, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDtrsm");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasDtrsm");
-  return func_ptr(side, uplo, transa, diag, m, n, alpha, A, lda, B, ldb);
-}
-
-void CUBLASWINAPI cublasCtrsm(char side, char uplo, char transa, char diag,
-                              int m, int n, cuComplex alpha, const cuComplex *A,
-                              int lda, cuComplex *B, int ldb) {
-  using FuncPtr =
-      void(CUBLASWINAPI *)(char, char, char, char, int, int, cuComplex,
-                           const cuComplex *, int, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCtrsm");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasCtrsm");
-  return func_ptr(side, uplo, transa, diag, m, n, alpha, A, lda, B, ldb);
-}
-
-void CUBLASWINAPI cublasZtrsm(char side, char uplo, char transa, char diag,
-                              int m, int n, cuDoubleComplex alpha,
-                              const cuDoubleComplex *A, int lda,
-                              cuDoubleComplex *B, int ldb) {
-  using FuncPtr = void(CUBLASWINAPI *)(char, char, char, char, int, int,
-                                       cuDoubleComplex, const cuDoubleComplex *,
-                                       int, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZtrsm");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasZtrsm");
-  return func_ptr(side, uplo, transa, diag, m, n, alpha, A, lda, B, ldb);
-}
-
-void CUBLASWINAPI cublasStrmm(char side, char uplo, char transa, char diag,
-                              int m, int n, float alpha, const float *A,
-                              int lda, float *B, int ldb) {
-  using FuncPtr = void(CUBLASWINAPI *)(char, char, char, char, int, int, float,
-                                       const float *, int, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasStrmm");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasStrmm");
-  return func_ptr(side, uplo, transa, diag, m, n, alpha, A, lda, B, ldb);
-}
-
-void CUBLASWINAPI cublasDtrmm(char side, char uplo, char transa, char diag,
-                              int m, int n, double alpha, const double *A,
-                              int lda, double *B, int ldb) {
-  using FuncPtr = void(CUBLASWINAPI *)(char, char, char, char, int, int, double,
-                                       const double *, int, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDtrmm");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasDtrmm");
-  return func_ptr(side, uplo, transa, diag, m, n, alpha, A, lda, B, ldb);
-}
-
-void CUBLASWINAPI cublasCtrmm(char side, char uplo, char transa, char diag,
-                              int m, int n, cuComplex alpha, const cuComplex *A,
-                              int lda, cuComplex *B, int ldb) {
-  using FuncPtr =
-      void(CUBLASWINAPI *)(char, char, char, char, int, int, cuComplex,
-                           const cuComplex *, int, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCtrmm");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasCtrmm");
-  return func_ptr(side, uplo, transa, diag, m, n, alpha, A, lda, B, ldb);
-}
-
-void CUBLASWINAPI cublasZtrmm(char side, char uplo, char transa, char diag,
-                              int m, int n, cuDoubleComplex alpha,
-                              const cuDoubleComplex *A, int lda,
-                              cuDoubleComplex *B, int ldb) {
-  using FuncPtr = void(CUBLASWINAPI *)(char, char, char, char, int, int,
-                                       cuDoubleComplex, const cuDoubleComplex *,
-                                       int, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZtrmm");
-  if (!func_ptr) LogFatalSymbolNotFound("cublasZtrmm");
-  return func_ptr(side, uplo, transa, diag, m, n, alpha, A, lda, B, ldb);
-}
-
-}  // extern "C"
diff --git a/third_party/xla/third_party/tsl/tsl/cuda/cublas_stub.cc b/third_party/xla/third_party/tsl/tsl/cuda/cublas_stub.cc
index 2392e6cdb9ae1f..814d64d75d8d61 100644
--- a/third_party/xla/third_party/tsl/tsl/cuda/cublas_stub.cc
+++ b/third_party/xla/third_party/tsl/tsl/cuda/cublas_stub.cc
@@ -12,11 +12,16 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
+
+#include <string_view>
+
 #if CUBLAS_VER_MAJOR >= 11
 #include "third_party/gpus/cuda/include/cublas_v2.h"
 #else
 #include "third_party/gpus/cuda/include/cublas.h"
 #endif
+
+#include "absl/container/flat_hash_set.h"
 #include "third_party/gpus/cuda/include/cuda.h"
 #include "tsl/platform/dso_loader.h"
 #include "tsl/platform/env.h"
@@ -26,43 +31,216 @@ limitations under the License.
 
 namespace {
 // Returns DSO handle or null if loading the DSO fails.
-void* GetDsoHandle() {
-#ifdef PLATFORM_GOOGLE
-  return nullptr;
-#else
-  static auto handle = []() -> void* {
+void *GetDsoHandle() {
+  static auto handle = []() -> void * {  // function-local static: the lambda runs once, later calls reuse the result
     auto handle_or = tsl::internal::DsoLoader::GetCublasDsoHandle();
     if (!handle_or.ok()) return nullptr;
     return handle_or.value();
   }();
   return handle;
-#endif
 }
 
-template <typename T>
-T LoadSymbol(const char* symbol_name) {
-  void* symbol = nullptr;
+void *LoadSymbol(const char *symbol_name) {  // looks up symbol_name in the cuBLAS DSO; the result is an untyped pointer
+  void *symbol = nullptr;
   if (auto handle = GetDsoHandle()) {
     tsl::Env::Default()
         ->GetSymbolFromLibrary(handle, symbol_name, &symbol)
         .IgnoreError();
   }
-  return reinterpret_cast<T>(symbol);
+  return symbol;  // still nullptr when GetDsoHandle() produced no handle
 }
 
-void LogFatalSymbolNotFound(const char* symbol_name) {
-  LOG(FATAL) << symbol_name << " symbol not found.";
+const char *kSymbols[] = {  // table of symbol names; the entries come from the included .inc file
+#include "tsl/cuda/cublas.inc"
+};
+
+constexpr size_t kNumSymbols = sizeof(kSymbols) / sizeof(const char *);  // number of entries in kSymbols
+
+absl::flat_hash_set<std::string_view> const &FatalErrorSymbols() {  // names for which _cublas_tramp_resolve installs the LOG(FATAL) fallback
+  static auto *syms = new absl::flat_hash_set<std::string_view>{  // built on first call; this function never deletes it
+      "cublasGetCudartVersion",
+      "cublasXerbla",
+      "cublasSnrm2",
+      "cublasDnrm2",
+      "cublasScnrm2",
+      "cublasDznrm2",
+      "cublasSdot",
+      "cublasDdot",
+      "cublasCdotu",
+      "cublasCdotc",
+      "cublasZdotu",
+      "cublasZdotc",
+      "cublasSscal",
+      "cublasDscal",
+      "cublasCscal",
+      "cublasZscal",
+      "cublasCsscal",
+      "cublasZdscal",
+      "cublasSaxpy",
+      "cublasDaxpy",
+      "cublasCaxpy",
+      "cublasZaxpy",
+      "cublasScopy",
+      "cublasDcopy",
+      "cublasCcopy",
+      "cublasZcopy",
+      "cublasSswap",
+      "cublasDswap",
+      "cublasCswap",
+      "cublasZswap",
+      "cublasIsamax",
+      "cublasIdamax",
+      "cublasIcamax",
+      "cublasIzamax",
+      "cublasIsamin",
+      "cublasIdamin",
+      "cublasIcamin",
+      "cublasIzamin",
+      "cublasSasum",
+      "cublasDasum",
+      "cublasScasum",
+      "cublasDzasum",
+      "cublasSrot",
+      "cublasDrot",
+      "cublasCrot",
+      "cublasZrot",
+      "cublasCsrot",
+      "cublasZdrot",
+      "cublasSrotg",
+      "cublasDrotg",
+      "cublasCrotg",
+      "cublasZrotg",
+      "cublasSrotm",
+      "cublasDrotm",
+      "cublasSrotmg",
+      "cublasDrotmg",
+      "cublasSgemv",
+      "cublasDgemv",
+      "cublasCgemv",
+      "cublasZgemv",
+      "cublasSgbmv",
+      "cublasDgbmv",
+      "cublasCgbmv",
+      "cublasZgbmv",
+      "cublasStrmv",
+      "cublasDtrmv",
+      "cublasCtrmv",
+      "cublasZtrmv",
+      "cublasStbmv",
+      "cublasDtbmv",
+      "cublasCtbmv",
+      "cublasZtbmv",
+      "cublasStpmv",
+      "cublasDtpmv",
+      "cublasCtpmv",
+      "cublasZtpmv",
+      "cublasStrsv",
+      "cublasDtrsv",
+      "cublasCtrsv",
+      "cublasZtrsv",
+      "cublasStpsv",
+      "cublasDtpsv",
+      "cublasCtpsv",
+      "cublasZtpsv",
+      "cublasStbsv",
+      "cublasDtbsv",
+      "cublasCtbsv",
+      "cublasZtbsv",
+      "cublasSsymv",
+      "cublasDsymv",
+      "cublasChemv",
+      "cublasZhemv",
+      "cublasSsbmv",
+      "cublasDsbmv",
+      "cublasChbmv",
+      "cublasZhbmv",
+      "cublasSspmv",
+      "cublasDspmv",
+      "cublasChpmv",
+      "cublasZhpmv",
+      "cublasSger",
+      "cublasDger",
+      "cublasCgeru",
+      "cublasCgerc",
+      "cublasZgeru",
+      "cublasZgerc",
+      "cublasSsyr",
+      "cublasDsyr",
+      "cublasCher",
+      "cublasZher",
+      "cublasSspr",
+      "cublasDspr",
+      "cublasChpr",
+      "cublasZhpr",
+      "cublasSsyr2",
+      "cublasDsyr2",
+      "cublasCher2",
+      "cublasZher2",
+      "cublasSspr2",
+      "cublasDspr2",
+      "cublasChpr2",
+      "cublasZhpr2",
+      "cublasSgemm",
+      "cublasDgemm",
+      "cublasCgemm",
+      "cublasZgemm",
+      "cublasSsyrk",
+      "cublasDsyrk",
+      "cublasCsyrk",
+      "cublasZsyrk",
+      "cublasCherk",
+      "cublasZherk",
+      "cublasSsyr2k",
+      "cublasDsyr2k",
+      "cublasCsyr2k",
+      "cublasZsyr2k",
+      "cublasCher2k",
+      "cublasZher2k",
+      "cublasSsymm",
+      "cublasDsymm",
+      "cublasCsymm",
+      "cublasZsymm",
+      "cublasChemm",
+      "cublasZhemm",
+      "cublasStrsm",
+      "cublasDtrsm",
+      "cublasCtrsm",
+      "cublasZtrsm",
+      "cublasStrmm",
+      "cublasDtrmm",
+      "cublasCtrmm",
+      "cublasZtrmm",
+  };
+  return *syms;
 }
 
-cublasStatus_t GetSymbolNotFoundError() { return CUBLAS_STATUS_INTERNAL_ERROR; }
 }  // namespace
 
-#if CUDA_VERSION < 10010
-#include "tsl/cuda/cublas_10_0.inc"
-#elif CUDA_VERSION < 10020
-#include "tsl/cuda/cublas_10_1.inc"
-#elif CUDA_VERSION < 11000
-#include "tsl/cuda/cublas_10_2.inc"
-#else
-#include "tsl/cuda/cublas_11_0.inc"
-#endif
+extern "C" {
+
+static void CublasLogFatalSymbolNotFound(const char *symbol_name) {  // fallback stored in _cublas_tramp_table for names in FatalErrorSymbols()
+  LOG(FATAL) << symbol_name << " symbol not found.";  // NOTE(review): installed as an untyped pointer; confirm the caller really passes a symbol name here
+}
+
+static cublasStatus_t CublasGetSymbolNotFoundError() {  // fallback stored in _cublas_tramp_table for names not in FatalErrorSymbols()
+  return CUBLAS_STATUS_INTERNAL_ERROR;
+}
+
+extern void *_cublas_tramp_table[];
+
+void _cublas_tramp_resolve(int i) {  // fills _cublas_tramp_table[i] for the i-th name in kSymbols
+  CHECK_LE(0, i);
+  CHECK_LT(i, kNumSymbols);  // with the check above: i must be a valid index into kSymbols
+  void *p = LoadSymbol(kSymbols[i]);
+  if (!p) {  // lookup failed: install a fallback function rather than a null entry
+    const auto &fatal_error_symbols = FatalErrorSymbols();
+    if (fatal_error_symbols.find(kSymbols[i]) != fatal_error_symbols.end()) {
+      p = reinterpret_cast<void *>(&CublasLogFatalSymbolNotFound);  // listed names get the LOG(FATAL) fallback
+    } else {
+      p = reinterpret_cast<void *>(&CublasGetSymbolNotFoundError);  // all other names get the fallback returning CUBLAS_STATUS_INTERNAL_ERROR
+    }
+  }
+  _cublas_tramp_table[i] = p;
+}
+
+}  // extern "C"
diff --git a/third_party/xla/third_party/tsl/tsl/cuda/cuda.symbols b/third_party/xla/third_party/tsl/tsl/cuda/cuda.symbols
new file mode 100644
index 00000000000000..558d11cafdbc99
--- /dev/null
+++ b/third_party/xla/third_party/tsl/tsl/cuda/cuda.symbols
@@ -0,0 +1,583 @@
+cuArray3DCreate
+cuArray3DCreate_v2
+cuArray3DGetDescriptor
+cuArray3DGetDescriptor_v2
+cuArrayCreate
+cuArrayCreate_v2
+cuArrayDestroy
+cuArrayGetDescriptor
+cuArrayGetDescriptor_v2
+cuArrayGetMemoryRequirements
+cuArrayGetPlane
+cuArrayGetSparseProperties
+cuCtxAttach
+cuCtxCreate
+cuCtxCreate_v2
+cuCtxCreate_v3
+cuCtxDestroy
+cuCtxDestroy_v2
+cuCtxDetach
+cuCtxDisablePeerAccess
+cuCtxEnablePeerAccess
+cuCtxGetApiVersion
+cuCtxGetCacheConfig
+cuCtxGetCurrent
+cuCtxGetDevice
+cuCtxGetExecAffinity
+cuCtxGetFlags
+cuCtxGetId
+cuCtxGetLimit
+cuCtxGetSharedMemConfig
+cuCtxGetStreamPriorityRange
+cuCtxPopCurrent
+cuCtxPopCurrent_v2
+cuCtxPushCurrent
+cuCtxPushCurrent_v2
+cuCtxResetPersistingL2Cache
+cuCtxSetCacheConfig
+cuCtxSetCurrent
+cuCtxSetLimit
+cuCtxSetSharedMemConfig
+cuCtxSynchronize
+cuDestroyExternalMemory
+cuDestroyExternalSemaphore
+cuDeviceCanAccessPeer
+cuDeviceComputeCapability
+cuDeviceGet
+cuDeviceGetAttribute
+cuDeviceGetByPCIBusId
+cuDeviceGetCount
+cuDeviceGetDefaultMemPool
+cuDeviceGetExecAffinitySupport
+cuDeviceGetGraphMemAttribute
+cuDeviceGetLuid
+cuDeviceGetMemPool
+cuDeviceGetName
+cuDeviceGetNvSciSyncAttributes
+cuDeviceGetP2PAttribute
+cuDeviceGetPCIBusId
+cuDeviceGetProperties
+cuDeviceGetTexture1DLinearMaxWidth
+cuDeviceGetUuid
+cuDeviceGetUuid_v2
+cuDeviceGraphMemTrim
+cuDevicePrimaryCtxGetState
+cuDevicePrimaryCtxRelease
+cuDevicePrimaryCtxRelease_v2
+cuDevicePrimaryCtxReset
+cuDevicePrimaryCtxReset_v2
+cuDevicePrimaryCtxRetain
+cuDevicePrimaryCtxSetFlags
+cuDevicePrimaryCtxSetFlags_v2
+cuDeviceSetGraphMemAttribute
+cuDeviceSetMemPool
+cuDeviceTotalMem
+cuDeviceTotalMem_v2
+cuDriverGetVersion
+cuEGLApiInit
+cuEGLStreamConsumerAcquireFrame
+cuEGLStreamConsumerConnect
+cuEGLStreamConsumerConnectWithFlags
+cuEGLStreamConsumerDisconnect
+cuEGLStreamConsumerReleaseFrame
+cuEGLStreamProducerConnect
+cuEGLStreamProducerDisconnect
+cuEGLStreamProducerPresentFrame
+cuEGLStreamProducerReturnFrame
+cuEventCreate
+cuEventDestroy
+cuEventDestroy_v2
+cuEventElapsedTime
+cuEventQuery
+cuEventRecord
+cuEventRecordWithFlags
+cuEventRecordWithFlags_ptsz
+cuEventRecord_ptsz
+cuEventSynchronize
+cuExternalMemoryGetMappedBuffer
+cuExternalMemoryGetMappedMipmappedArray
+cuFlushGPUDirectRDMAWrites
+cuFuncGetAttribute
+cuFuncGetModule
+cuFuncSetAttribute
+cuFuncSetBlockShape
+cuFuncSetCacheConfig
+cuFuncSetSharedMemConfig
+cuFuncSetSharedSize
+cuGLCtxCreate
+cuGLCtxCreate_v2
+cuGLGetDevices
+cuGLGetDevices_v2
+cuGLInit
+cuGLMapBufferObject
+cuGLMapBufferObjectAsync
+cuGLMapBufferObjectAsync_v2
+cuGLMapBufferObjectAsync_v2_ptsz
+cuGLMapBufferObject_v2
+cuGLMapBufferObject_v2_ptds
+cuGLRegisterBufferObject
+cuGLSetBufferObjectMapFlags
+cuGLUnmapBufferObject
+cuGLUnmapBufferObjectAsync
+cuGLUnregisterBufferObject
+cuGetErrorName
+cuGetErrorString
+cuGetExportTable
+cuGetProcAddress
+cuGetProcAddress_v2
+cuGraphAddBatchMemOpNode
+cuGraphAddChildGraphNode
+cuGraphAddDependencies
+cuGraphAddEmptyNode
+cuGraphAddEventRecordNode
+cuGraphAddEventWaitNode
+cuGraphAddExternalSemaphoresSignalNode
+cuGraphAddExternalSemaphoresWaitNode
+cuGraphAddHostNode
+cuGraphAddKernelNode
+cuGraphAddKernelNode_v2
+cuGraphAddMemAllocNode
+cuGraphAddMemFreeNode
+cuGraphAddMemcpyNode
+cuGraphAddMemsetNode
+cuGraphBatchMemOpNodeGetParams
+cuGraphBatchMemOpNodeSetParams
+cuGraphChildGraphNodeGetGraph
+cuGraphClone
+cuGraphCreate
+cuGraphDebugDotPrint
+cuGraphDestroy
+cuGraphDestroyNode
+cuGraphEventRecordNodeGetEvent
+cuGraphEventRecordNodeSetEvent
+cuGraphEventWaitNodeGetEvent
+cuGraphEventWaitNodeSetEvent
+cuGraphExecBatchMemOpNodeSetParams
+cuGraphExecChildGraphNodeSetParams
+cuGraphExecDestroy
+cuGraphExecEventRecordNodeSetEvent
+cuGraphExecEventWaitNodeSetEvent
+cuGraphExecExternalSemaphoresSignalNodeSetParams
+cuGraphExecExternalSemaphoresWaitNodeSetParams
+cuGraphExecGetFlags
+cuGraphExecHostNodeSetParams
+cuGraphExecKernelNodeSetParams
+cuGraphExecKernelNodeSetParams_v2
+cuGraphExecMemcpyNodeSetParams
+cuGraphExecMemsetNodeSetParams
+cuGraphExecUpdate
+cuGraphExecUpdate_v2
+cuGraphExternalSemaphoresSignalNodeGetParams
+cuGraphExternalSemaphoresSignalNodeSetParams
+cuGraphExternalSemaphoresWaitNodeGetParams
+cuGraphExternalSemaphoresWaitNodeSetParams
+cuGraphGetEdges
+cuGraphGetNodes
+cuGraphGetRootNodes
+cuGraphHostNodeGetParams
+cuGraphHostNodeSetParams
+cuGraphInstantiate
+cuGraphInstantiateWithFlags
+cuGraphInstantiateWithParams
+cuGraphInstantiateWithParams_ptsz
+cuGraphInstantiate_v2
+cuGraphKernelNodeCopyAttributes
+cuGraphKernelNodeGetAttribute
+cuGraphKernelNodeGetParams
+cuGraphKernelNodeGetParams_v2
+cuGraphKernelNodeSetAttribute
+cuGraphKernelNodeSetParams
+cuGraphKernelNodeSetParams_v2
+cuGraphLaunch
+cuGraphLaunch_ptsz
+cuGraphMemAllocNodeGetParams
+cuGraphMemFreeNodeGetParams
+cuGraphMemcpyNodeGetParams
+cuGraphMemcpyNodeSetParams
+cuGraphMemsetNodeGetParams
+cuGraphMemsetNodeSetParams
+cuGraphNodeFindInClone
+cuGraphNodeGetDependencies
+cuGraphNodeGetDependentNodes
+cuGraphNodeGetEnabled
+cuGraphNodeGetType
+cuGraphNodeSetEnabled
+cuGraphReleaseUserObject
+cuGraphRemoveDependencies
+cuGraphRetainUserObject
+cuGraphUpload
+cuGraphUpload_ptsz
+cuGraphicsEGLRegisterImage
+cuGraphicsGLRegisterBuffer
+cuGraphicsGLRegisterImage
+cuGraphicsMapResources
+cuGraphicsMapResources_ptsz
+cuGraphicsResourceGetMappedEglFrame
+cuGraphicsResourceGetMappedMipmappedArray
+cuGraphicsResourceGetMappedPointer
+cuGraphicsResourceGetMappedPointer_v2
+cuGraphicsResourceSetMapFlags
+cuGraphicsResourceSetMapFlags_v2
+cuGraphicsSubResourceGetMappedArray
+cuGraphicsUnmapResources
+cuGraphicsUnmapResources_ptsz
+cuGraphicsUnregisterResource
+cuGraphicsVDPAURegisterOutputSurface
+cuGraphicsVDPAURegisterVideoSurface
+cuImportExternalMemory
+cuImportExternalSemaphore
+cuInit
+cuIpcCloseMemHandle
+cuIpcGetEventHandle
+cuIpcGetMemHandle
+cuIpcOpenEventHandle
+cuIpcOpenMemHandle
+cuIpcOpenMemHandle_v2
+cuKernelGetAttribute
+cuKernelGetFunction
+cuKernelSetAttribute
+cuKernelSetCacheConfig
+cuLaunch
+cuLaunchCooperativeKernel
+cuLaunchCooperativeKernelMultiDevice
+cuLaunchCooperativeKernel_ptsz
+cuLaunchGrid
+cuLaunchGridAsync
+cuLaunchHostFunc
+cuLaunchHostFunc_ptsz
+cuLaunchKernel
+cuLaunchKernelEx
+cuLaunchKernelEx_ptsz
+cuLaunchKernel_ptsz
+cuLibraryGetGlobal
+cuLibraryGetKernel
+cuLibraryGetManaged
+cuLibraryGetModule
+cuLibraryGetUnifiedFunction
+cuLibraryLoadData
+cuLibraryLoadFromFile
+cuLibraryUnload
+cuLinkAddData
+cuLinkAddData_v2
+cuLinkAddFile
+cuLinkAddFile_v2
+cuLinkComplete
+cuLinkCreate
+cuLinkCreate_v2
+cuLinkDestroy
+cuMemAddressFree
+cuMemAddressReserve
+cuMemAdvise
+cuMemAlloc
+cuMemAllocAsync
+cuMemAllocAsync_ptsz
+cuMemAllocFromPoolAsync
+cuMemAllocFromPoolAsync_ptsz
+cuMemAllocHost
+cuMemAllocHost_v2
+cuMemAllocManaged
+cuMemAllocPitch
+cuMemAllocPitch_v2
+cuMemAlloc_v2
+cuMemCreate
+cuMemExportToShareableHandle
+cuMemFree
+cuMemFreeAsync
+cuMemFreeAsync_ptsz
+cuMemFreeHost
+cuMemFree_v2
+cuMemGetAccess
+cuMemGetAddressRange
+cuMemGetAddressRange_v2
+cuMemGetAllocationGranularity
+cuMemGetAllocationPropertiesFromHandle
+cuMemGetAttribute
+cuMemGetAttribute_v2
+cuMemGetHandleForAddressRange
+cuMemGetInfo
+cuMemGetInfo_v2
+cuMemHostAlloc
+cuMemHostGetDevicePointer
+cuMemHostGetDevicePointer_v2
+cuMemHostGetFlags
+cuMemHostRegister
+cuMemHostRegister_v2
+cuMemHostUnregister
+cuMemImportFromShareableHandle
+cuMemMap
+cuMemMapArrayAsync
+cuMemMapArrayAsync_ptsz
+cuMemPoolCreate
+cuMemPoolDestroy
+cuMemPoolExportPointer
+cuMemPoolExportToShareableHandle
+cuMemPoolGetAccess
+cuMemPoolGetAttribute
+cuMemPoolImportFromShareableHandle
+cuMemPoolImportPointer
+cuMemPoolSetAccess
+cuMemPoolSetAttribute
+cuMemPoolTrimTo
+cuMemPrefetchAsync
+cuMemPrefetchAsync_ptsz
+cuMemRangeGetAttribute
+cuMemRangeGetAttributes
+cuMemRelease
+cuMemRetainAllocationHandle
+cuMemSetAccess
+cuMemUnmap
+cuMemcpy
+cuMemcpy2D
+cuMemcpy2DAsync
+cuMemcpy2DAsync_v2
+cuMemcpy2DAsync_v2_ptsz
+cuMemcpy2DUnaligned
+cuMemcpy2DUnaligned_v2
+cuMemcpy2DUnaligned_v2_ptds
+cuMemcpy2D_v2
+cuMemcpy2D_v2_ptds
+cuMemcpy3D
+cuMemcpy3DAsync
+cuMemcpy3DAsync_v2
+cuMemcpy3DAsync_v2_ptsz
+cuMemcpy3DPeer
+cuMemcpy3DPeerAsync
+cuMemcpy3DPeerAsync_ptsz
+cuMemcpy3DPeer_ptds
+cuMemcpy3D_v2
+cuMemcpy3D_v2_ptds
+cuMemcpyAsync
+cuMemcpyAsync_ptsz
+cuMemcpyAtoA
+cuMemcpyAtoA_v2
+cuMemcpyAtoA_v2_ptds
+cuMemcpyAtoD
+cuMemcpyAtoD_v2
+cuMemcpyAtoD_v2_ptds
+cuMemcpyAtoH
+cuMemcpyAtoHAsync
+cuMemcpyAtoHAsync_v2
+cuMemcpyAtoHAsync_v2_ptsz
+cuMemcpyAtoH_v2
+cuMemcpyAtoH_v2_ptds
+cuMemcpyDtoA
+cuMemcpyDtoA_v2
+cuMemcpyDtoA_v2_ptds
+cuMemcpyDtoD
+cuMemcpyDtoDAsync
+cuMemcpyDtoDAsync_v2
+cuMemcpyDtoDAsync_v2_ptsz
+cuMemcpyDtoD_v2
+cuMemcpyDtoD_v2_ptds
+cuMemcpyDtoH
+cuMemcpyDtoHAsync
+cuMemcpyDtoHAsync_v2
+cuMemcpyDtoHAsync_v2_ptsz
+cuMemcpyDtoH_v2
+cuMemcpyDtoH_v2_ptds
+cuMemcpyHtoA
+cuMemcpyHtoAAsync
+cuMemcpyHtoAAsync_v2
+cuMemcpyHtoAAsync_v2_ptsz
+cuMemcpyHtoA_v2
+cuMemcpyHtoA_v2_ptds
+cuMemcpyHtoD
+cuMemcpyHtoDAsync
+cuMemcpyHtoDAsync_v2
+cuMemcpyHtoDAsync_v2_ptsz
+cuMemcpyHtoD_v2
+cuMemcpyHtoD_v2_ptds
+cuMemcpyPeer
+cuMemcpyPeerAsync
+cuMemcpyPeerAsync_ptsz
+cuMemcpyPeer_ptds
+cuMemcpy_ptds
+cuMemsetD16
+cuMemsetD16Async
+cuMemsetD16Async_ptsz
+cuMemsetD16_v2
+cuMemsetD16_v2_ptds
+cuMemsetD2D16
+cuMemsetD2D16Async
+cuMemsetD2D16Async_ptsz
+cuMemsetD2D16_v2
+cuMemsetD2D16_v2_ptds
+cuMemsetD2D32
+cuMemsetD2D32Async
+cuMemsetD2D32Async_ptsz
+cuMemsetD2D32_v2
+cuMemsetD2D32_v2_ptds
+cuMemsetD2D8
+cuMemsetD2D8Async
+cuMemsetD2D8Async_ptsz
+cuMemsetD2D8_v2
+cuMemsetD2D8_v2_ptds
+cuMemsetD32
+cuMemsetD32Async
+cuMemsetD32Async_ptsz
+cuMemsetD32_v2
+cuMemsetD32_v2_ptds
+cuMemsetD8
+cuMemsetD8Async
+cuMemsetD8Async_ptsz
+cuMemsetD8_v2
+cuMemsetD8_v2_ptds
+cuMipmappedArrayCreate
+cuMipmappedArrayDestroy
+cuMipmappedArrayGetLevel
+cuMipmappedArrayGetMemoryRequirements
+cuMipmappedArrayGetSparseProperties
+cuModuleGetFunction
+cuModuleGetGlobal
+cuModuleGetGlobal_v2
+cuModuleGetLoadingMode
+cuModuleGetSurfRef
+cuModuleGetTexRef
+cuModuleLoad
+cuModuleLoadData
+cuModuleLoadDataEx
+cuModuleLoadFatBinary
+cuModuleUnload
+cuOccupancyAvailableDynamicSMemPerBlock
+cuOccupancyMaxActiveBlocksPerMultiprocessor
+cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags
+cuOccupancyMaxActiveClusters
+cuOccupancyMaxPotentialBlockSize
+cuOccupancyMaxPotentialBlockSizeWithFlags
+cuOccupancyMaxPotentialClusterSize
+cuParamSetSize
+cuParamSetTexRef
+cuParamSetf
+cuParamSeti
+cuParamSetv
+cuPointerGetAttribute
+cuPointerGetAttributes
+cuPointerSetAttribute
+cuProfilerInitialize
+cuProfilerStart
+cuProfilerStop
+cuSignalExternalSemaphoresAsync
+cuSignalExternalSemaphoresAsync_ptsz
+cuStreamAddCallback
+cuStreamAddCallback_ptsz
+cuStreamAttachMemAsync
+cuStreamAttachMemAsync_ptsz
+cuStreamBatchMemOp
+cuStreamBatchMemOp_ptsz
+cuStreamBatchMemOp_v2
+cuStreamBatchMemOp_v2_ptsz
+cuStreamBeginCapture
+cuStreamBeginCapture_ptsz
+cuStreamBeginCapture_v2
+cuStreamBeginCapture_v2_ptsz
+cuStreamCopyAttributes
+cuStreamCopyAttributes_ptsz
+cuStreamCreate
+cuStreamCreateWithPriority
+cuStreamDestroy
+cuStreamDestroy_v2
+cuStreamEndCapture
+cuStreamEndCapture_ptsz
+cuStreamGetAttribute
+cuStreamGetAttribute_ptsz
+cuStreamGetCaptureInfo
+cuStreamGetCaptureInfo_ptsz
+cuStreamGetCaptureInfo_v2
+cuStreamGetCaptureInfo_v2_ptsz
+cuStreamGetCtx
+cuStreamGetCtx_ptsz
+cuStreamGetFlags
+cuStreamGetFlags_ptsz
+cuStreamGetId
+cuStreamGetId_ptsz
+cuStreamGetPriority
+cuStreamGetPriority_ptsz
+cuStreamIsCapturing
+cuStreamIsCapturing_ptsz
+cuStreamQuery
+cuStreamQuery_ptsz
+cuStreamSetAttribute
+cuStreamSetAttribute_ptsz
+cuStreamSynchronize
+cuStreamSynchronize_ptsz
+cuStreamUpdateCaptureDependencies
+cuStreamUpdateCaptureDependencies_ptsz
+cuStreamWaitEvent
+cuStreamWaitEvent_ptsz
+cuStreamWaitValue32
+cuStreamWaitValue32_ptsz
+cuStreamWaitValue32_v2
+cuStreamWaitValue32_v2_ptsz
+cuStreamWaitValue64
+cuStreamWaitValue64_ptsz
+cuStreamWaitValue64_v2
+cuStreamWaitValue64_v2_ptsz
+cuStreamWriteValue32
+cuStreamWriteValue32_ptsz
+cuStreamWriteValue32_v2
+cuStreamWriteValue32_v2_ptsz
+cuStreamWriteValue64
+cuStreamWriteValue64_ptsz
+cuStreamWriteValue64_v2
+cuStreamWriteValue64_v2_ptsz
+cuSurfObjectCreate
+cuSurfObjectDestroy
+cuSurfObjectGetResourceDesc
+cuSurfRefGetArray
+cuSurfRefSetArray
+cuTensorMapEncodeIm2col
+cuTensorMapEncodeTiled
+cuTensorMapReplaceAddress
+cuTexObjectCreate
+cuTexObjectDestroy
+cuTexObjectGetResourceDesc
+cuTexObjectGetResourceViewDesc
+cuTexObjectGetTextureDesc
+cuTexRefCreate
+cuTexRefDestroy
+cuTexRefGetAddress
+cuTexRefGetAddressMode
+cuTexRefGetAddress_v2
+cuTexRefGetArray
+cuTexRefGetBorderColor
+cuTexRefGetFilterMode
+cuTexRefGetFlags
+cuTexRefGetFormat
+cuTexRefGetMaxAnisotropy
+cuTexRefGetMipmapFilterMode
+cuTexRefGetMipmapLevelBias
+cuTexRefGetMipmapLevelClamp
+cuTexRefGetMipmappedArray
+cuTexRefSetAddress
+cuTexRefSetAddress2D
+cuTexRefSetAddress2D_v2
+cuTexRefSetAddress2D_v3
+cuTexRefSetAddressMode
+cuTexRefSetAddress_v2
+cuTexRefSetArray
+cuTexRefSetBorderColor
+cuTexRefSetFilterMode
+cuTexRefSetFlags
+cuTexRefSetFormat
+cuTexRefSetMaxAnisotropy
+cuTexRefSetMipmapFilterMode
+cuTexRefSetMipmapLevelBias
+cuTexRefSetMipmapLevelClamp
+cuTexRefSetMipmappedArray
+cuThreadExchangeStreamCaptureMode
+cuUserObjectCreate
+cuUserObjectRelease
+cuUserObjectRetain
+cuVDPAUCtxCreate
+cuVDPAUCtxCreate_v2
+cuVDPAUGetDevice
+cuWaitExternalSemaphoresAsync
+cuWaitExternalSemaphoresAsync_ptsz
+cudbgApiAttach
+cudbgApiDetach
+cudbgApiInit
+cudbgGetAPI
+cudbgGetAPIVersion
+cudbgMain
+cudbgReportDriverApiError
+cudbgReportDriverInternalError
diff --git a/third_party/xla/third_party/tsl/tsl/cuda/cuda_10_0.inc b/third_party/xla/third_party/tsl/tsl/cuda/cuda_10_0.inc
deleted file mode 100644
index 6f26cfb92d1d85..00000000000000
--- a/third_party/xla/third_party/tsl/tsl/cuda/cuda_10_0.inc
+++ /dev/null
@@ -1,2133 +0,0 @@
-// Auto-generated, do not edit.
-
-extern "C" {
-
-CUresult CUDAAPI cuGetErrorString(CUresult error, const char **pStr) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUresult, const char **);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGetErrorString");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(error, pStr);
-}
-
-CUresult CUDAAPI cuGetErrorName(CUresult error, const char **pStr) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUresult, const char **);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGetErrorName");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(error, pStr);
-}
-
-CUresult CUDAAPI cuInit(unsigned int Flags) {
-  using FuncPtr = CUresult(CUDAAPI *)(unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuInit");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(Flags);
-}
-
-CUresult CUDAAPI cuDriverGetVersion(int *driverVersion) {
-  using FuncPtr = CUresult(CUDAAPI *)(int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuDriverGetVersion");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(driverVersion);
-}
-
-CUresult CUDAAPI cuDeviceGet(CUdevice *device, int ordinal) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdevice *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuDeviceGet");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(device, ordinal);
-}
-
-CUresult CUDAAPI cuDeviceGetCount(int *count) {
-  using FuncPtr = CUresult(CUDAAPI *)(int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuDeviceGetCount");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(count);
-}
-
-CUresult CUDAAPI cuDeviceGetName(char *name, int len, CUdevice dev) {
-  using FuncPtr = CUresult(CUDAAPI *)(char *, int, CUdevice);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuDeviceGetName");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(name, len, dev);
-}
-
-CUresult CUDAAPI cuDeviceGetUuid(CUuuid *uuid, CUdevice dev) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUuuid *, CUdevice);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuDeviceGetUuid");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(uuid, dev);
-}
-
-CUresult CUDAAPI cuDeviceTotalMem(size_t *bytes, CUdevice dev) {
-  using FuncPtr = CUresult(CUDAAPI *)(size_t *, CUdevice);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuDeviceTotalMem_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(bytes, dev);
-}
-
-CUresult CUDAAPI cuDeviceGetAttribute(int *pi, CUdevice_attribute attrib,
-                                      CUdevice dev) {
-  using FuncPtr = CUresult(CUDAAPI *)(int *, CUdevice_attribute, CUdevice);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuDeviceGetAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pi, attrib, dev);
-}
-
-__CUDA_DEPRECATED CUresult CUDAAPI cuDeviceGetProperties(CUdevprop *prop,
-                                                         CUdevice dev) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdevprop *, CUdevice);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuDeviceGetProperties");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(prop, dev);
-}
-
-__CUDA_DEPRECATED CUresult CUDAAPI cuDeviceComputeCapability(int *major,
-                                                             int *minor,
-                                                             CUdevice dev) {
-  using FuncPtr = CUresult(CUDAAPI *)(int *, int *, CUdevice);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuDeviceComputeCapability");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(major, minor, dev);
-}
-
-CUresult CUDAAPI cuDevicePrimaryCtxRetain(CUcontext *pctx, CUdevice dev) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUcontext *, CUdevice);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuDevicePrimaryCtxRetain");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pctx, dev);
-}
-
-CUresult CUDAAPI cuDevicePrimaryCtxRelease(CUdevice dev) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdevice);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuDevicePrimaryCtxRelease");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dev);
-}
-
-CUresult CUDAAPI cuDevicePrimaryCtxSetFlags(CUdevice dev, unsigned int flags) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdevice, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuDevicePrimaryCtxSetFlags");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dev, flags);
-}
-
-CUresult CUDAAPI cuDevicePrimaryCtxGetState(CUdevice dev, unsigned int *flags,
-                                            int *active) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdevice, unsigned int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuDevicePrimaryCtxGetState");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dev, flags, active);
-}
-
-CUresult CUDAAPI cuDevicePrimaryCtxReset(CUdevice dev) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdevice);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuDevicePrimaryCtxReset");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dev);
-}
-
-CUresult CUDAAPI cuCtxCreate(CUcontext *pctx, unsigned int flags,
-                             CUdevice dev) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUcontext *, unsigned int, CUdevice);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuCtxCreate_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pctx, flags, dev);
-}
-
-CUresult CUDAAPI cuCtxDestroy(CUcontext ctx) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUcontext);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuCtxDestroy_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(ctx);
-}
-
-CUresult CUDAAPI cuCtxPushCurrent(CUcontext ctx) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUcontext);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuCtxPushCurrent_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(ctx);
-}
-
-CUresult CUDAAPI cuCtxPopCurrent(CUcontext *pctx) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUcontext *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuCtxPopCurrent_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pctx);
-}
-
-CUresult CUDAAPI cuCtxSetCurrent(CUcontext ctx) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUcontext);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuCtxSetCurrent");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(ctx);
-}
-
-CUresult CUDAAPI cuCtxGetCurrent(CUcontext *pctx) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUcontext *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuCtxGetCurrent");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pctx);
-}
-
-CUresult CUDAAPI cuCtxGetDevice(CUdevice *device) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdevice *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuCtxGetDevice");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(device);
-}
-
-CUresult CUDAAPI cuCtxGetFlags(unsigned int *flags) {
-  using FuncPtr = CUresult(CUDAAPI *)(unsigned int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuCtxGetFlags");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(flags);
-}
-
-CUresult CUDAAPI cuCtxSynchronize(void) {
-  using FuncPtr = CUresult(CUDAAPI *)();
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuCtxSynchronize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr();
-}
-
-CUresult CUDAAPI cuCtxSetLimit(CUlimit limit, size_t value) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUlimit, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuCtxSetLimit");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(limit, value);
-}
-
-CUresult CUDAAPI cuCtxGetLimit(size_t *pvalue, CUlimit limit) {
-  using FuncPtr = CUresult(CUDAAPI *)(size_t *, CUlimit);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuCtxGetLimit");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pvalue, limit);
-}
-
-CUresult CUDAAPI cuCtxGetCacheConfig(CUfunc_cache *pconfig) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUfunc_cache *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuCtxGetCacheConfig");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pconfig);
-}
-
-CUresult CUDAAPI cuCtxSetCacheConfig(CUfunc_cache config) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUfunc_cache);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuCtxSetCacheConfig");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(config);
-}
-
-CUresult CUDAAPI cuCtxGetSharedMemConfig(CUsharedconfig *pConfig) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUsharedconfig *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuCtxGetSharedMemConfig");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pConfig);
-}
-
-CUresult CUDAAPI cuCtxSetSharedMemConfig(CUsharedconfig config) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUsharedconfig);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuCtxSetSharedMemConfig");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(config);
-}
-
-CUresult CUDAAPI cuCtxGetApiVersion(CUcontext ctx, unsigned int *version) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUcontext, unsigned int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuCtxGetApiVersion");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(ctx, version);
-}
-
-CUresult CUDAAPI cuCtxGetStreamPriorityRange(int *leastPriority,
-                                             int *greatestPriority) {
-  using FuncPtr = CUresult(CUDAAPI *)(int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuCtxGetStreamPriorityRange");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(leastPriority, greatestPriority);
-}
-
-__CUDA_DEPRECATED CUresult CUDAAPI cuCtxAttach(CUcontext *pctx,
-                                               unsigned int flags) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUcontext *, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuCtxAttach");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pctx, flags);
-}
-
-__CUDA_DEPRECATED CUresult CUDAAPI cuCtxDetach(CUcontext ctx) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUcontext);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuCtxDetach");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(ctx);
-}
-
-CUresult CUDAAPI cuModuleLoad(CUmodule *module, const char *fname) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUmodule *, const char *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuModuleLoad");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(module, fname);
-}
-
-CUresult CUDAAPI cuModuleLoadData(CUmodule *module, const void *image) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUmodule *, const void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuModuleLoadData");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(module, image);
-}
-
-CUresult CUDAAPI cuModuleLoadDataEx(CUmodule *module, const void *image,
-                                    unsigned int numOptions,
-                                    CUjit_option *options,
-                                    void **optionValues) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUmodule *, const void *, unsigned int,
-                                      CUjit_option *, void **);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuModuleLoadDataEx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(module, image, numOptions, options, optionValues);
-}
-
-CUresult CUDAAPI cuModuleLoadFatBinary(CUmodule *module, const void *fatCubin) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUmodule *, const void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuModuleLoadFatBinary");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(module, fatCubin);
-}
-
-CUresult CUDAAPI cuModuleUnload(CUmodule hmod) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUmodule);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuModuleUnload");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hmod);
-}
-
-CUresult CUDAAPI cuModuleGetFunction(CUfunction *hfunc, CUmodule hmod,
-                                     const char *name) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUfunction *, CUmodule, const char *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuModuleGetFunction");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hfunc, hmod, name);
-}
-
-CUresult CUDAAPI cuModuleGetGlobal(CUdeviceptr *dptr, size_t *bytes,
-                                   CUmodule hmod, const char *name) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUdeviceptr *, size_t *, CUmodule, const char *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuModuleGetGlobal_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dptr, bytes, hmod, name);
-}
-
-CUresult CUDAAPI cuModuleGetTexRef(CUtexref *pTexRef, CUmodule hmod,
-                                   const char *name) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUtexref *, CUmodule, const char *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuModuleGetTexRef");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pTexRef, hmod, name);
-}
-
-CUresult CUDAAPI cuModuleGetSurfRef(CUsurfref *pSurfRef, CUmodule hmod,
-                                    const char *name) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUsurfref *, CUmodule, const char *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuModuleGetSurfRef");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pSurfRef, hmod, name);
-}
-
-CUresult CUDAAPI cuLinkCreate(unsigned int numOptions, CUjit_option *options,
-                              void **optionValues, CUlinkState *stateOut) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(unsigned int, CUjit_option *, void **, CUlinkState *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuLinkCreate_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(numOptions, options, optionValues, stateOut);
-}
-
-CUresult CUDAAPI cuLinkAddData(CUlinkState state, CUjitInputType type,
-                               void *data, size_t size, const char *name,
-                               unsigned int numOptions, CUjit_option *options,
-                               void **optionValues) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUlinkState, CUjitInputType, void *, size_t,
-                          const char *, unsigned int, CUjit_option *, void **);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuLinkAddData_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(state, type, data, size, name, numOptions, options,
-                  optionValues);
-}
-
-CUresult CUDAAPI cuLinkAddFile(CUlinkState state, CUjitInputType type,
-                               const char *path, unsigned int numOptions,
-                               CUjit_option *options, void **optionValues) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUlinkState, CUjitInputType, const char *,
-                                      unsigned int, CUjit_option *, void **);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuLinkAddFile_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(state, type, path, numOptions, options, optionValues);
-}
-
-CUresult CUDAAPI cuLinkComplete(CUlinkState state, void **cubinOut,
-                                size_t *sizeOut) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUlinkState, void **, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuLinkComplete");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(state, cubinOut, sizeOut);
-}
-
-CUresult CUDAAPI cuLinkDestroy(CUlinkState state) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUlinkState);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuLinkDestroy");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(state);
-}
-
-CUresult CUDAAPI cuMemGetInfo(size_t *free, size_t *total) {
-  using FuncPtr = CUresult(CUDAAPI *)(size_t *, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemGetInfo_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(free, total);
-}
-
-CUresult CUDAAPI cuMemAlloc(CUdeviceptr *dptr, size_t bytesize) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdeviceptr *, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemAlloc_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dptr, bytesize);
-}
-
-CUresult CUDAAPI cuMemAllocPitch(CUdeviceptr *dptr, size_t *pPitch,
-                                 size_t WidthInBytes, size_t Height,
-                                 unsigned int ElementSizeBytes) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdeviceptr *, size_t *, size_t, size_t,
-                                      unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemAllocPitch_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dptr, pPitch, WidthInBytes, Height, ElementSizeBytes);
-}
-
-CUresult CUDAAPI cuMemFree(CUdeviceptr dptr) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdeviceptr);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemFree_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dptr);
-}
-
-CUresult CUDAAPI cuMemGetAddressRange(CUdeviceptr *pbase, size_t *psize,
-                                      CUdeviceptr dptr) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdeviceptr *, size_t *, CUdeviceptr);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemGetAddressRange_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pbase, psize, dptr);
-}
-
-CUresult CUDAAPI cuMemAllocHost(void **pp, size_t bytesize) {
-  using FuncPtr = CUresult(CUDAAPI *)(void **, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemAllocHost_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pp, bytesize);
-}
-
-CUresult CUDAAPI cuMemFreeHost(void *p) {
-  using FuncPtr = CUresult(CUDAAPI *)(void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemFreeHost");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(p);
-}
-
-CUresult CUDAAPI cuMemHostAlloc(void **pp, size_t bytesize,
-                                unsigned int Flags) {
-  using FuncPtr = CUresult(CUDAAPI *)(void **, size_t, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemHostAlloc");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pp, bytesize, Flags);
-}
-
-CUresult CUDAAPI cuMemHostGetDevicePointer(CUdeviceptr *pdptr, void *p,
-                                           unsigned int Flags) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdeviceptr *, void *, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemHostGetDevicePointer_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pdptr, p, Flags);
-}
-
-CUresult CUDAAPI cuMemHostGetFlags(unsigned int *pFlags, void *p) {
-  using FuncPtr = CUresult(CUDAAPI *)(unsigned int *, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemHostGetFlags");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pFlags, p);
-}
-
-CUresult CUDAAPI cuMemAllocManaged(CUdeviceptr *dptr, size_t bytesize,
-                                   unsigned int flags) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdeviceptr *, size_t, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemAllocManaged");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dptr, bytesize, flags);
-}
-
-CUresult CUDAAPI cuDeviceGetByPCIBusId(CUdevice *dev, const char *pciBusId) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdevice *, const char *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuDeviceGetByPCIBusId");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dev, pciBusId);
-}
-
-CUresult CUDAAPI cuDeviceGetPCIBusId(char *pciBusId, int len, CUdevice dev) {
-  using FuncPtr = CUresult(CUDAAPI *)(char *, int, CUdevice);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuDeviceGetPCIBusId");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pciBusId, len, dev);
-}
-
-CUresult CUDAAPI cuIpcGetEventHandle(CUipcEventHandle *pHandle, CUevent event) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUipcEventHandle *, CUevent);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuIpcGetEventHandle");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pHandle, event);
-}
-
-CUresult CUDAAPI cuIpcOpenEventHandle(CUevent *phEvent,
-                                      CUipcEventHandle handle) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUevent *, CUipcEventHandle);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuIpcOpenEventHandle");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(phEvent, handle);
-}
-
-CUresult CUDAAPI cuIpcGetMemHandle(CUipcMemHandle *pHandle, CUdeviceptr dptr) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUipcMemHandle *, CUdeviceptr);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuIpcGetMemHandle");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pHandle, dptr);
-}
-
-CUresult CUDAAPI cuIpcOpenMemHandle(CUdeviceptr *pdptr, CUipcMemHandle handle,
-                                    unsigned int Flags) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUdeviceptr *, CUipcMemHandle, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuIpcOpenMemHandle");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pdptr, handle, Flags);
-}
-
-CUresult CUDAAPI cuIpcCloseMemHandle(CUdeviceptr dptr) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdeviceptr);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuIpcCloseMemHandle");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dptr);
-}
-
-CUresult CUDAAPI cuMemHostRegister(void *p, size_t bytesize,
-                                   unsigned int Flags) {
-  using FuncPtr = CUresult(CUDAAPI *)(void *, size_t, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemHostRegister_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(p, bytesize, Flags);
-}
-
-CUresult CUDAAPI cuMemHostUnregister(void *p) {
-  using FuncPtr = CUresult(CUDAAPI *)(void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemHostUnregister");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(p);
-}
-
-CUresult CUDAAPI cuMemcpy(CUdeviceptr dst, CUdeviceptr src, size_t ByteCount) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdeviceptr, CUdeviceptr, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemcpy");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dst, src, ByteCount);
-}
-
-CUresult CUDAAPI cuMemcpyPeer(CUdeviceptr dstDevice, CUcontext dstContext,
-                              CUdeviceptr srcDevice, CUcontext srcContext,
-                              size_t ByteCount) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdeviceptr, CUcontext, CUdeviceptr,
-                                      CUcontext, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemcpyPeer");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dstDevice, dstContext, srcDevice, srcContext, ByteCount);
-}
-
-CUresult CUDAAPI cuMemcpyHtoD(CUdeviceptr dstDevice, const void *srcHost,
-                              size_t ByteCount) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdeviceptr, const void *, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemcpyHtoD_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dstDevice, srcHost, ByteCount);
-}
-
-CUresult CUDAAPI cuMemcpyDtoH(void *dstHost, CUdeviceptr srcDevice,
-                              size_t ByteCount) {
-  using FuncPtr = CUresult(CUDAAPI *)(void *, CUdeviceptr, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemcpyDtoH_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dstHost, srcDevice, ByteCount);
-}
-
-CUresult CUDAAPI cuMemcpyDtoD(CUdeviceptr dstDevice, CUdeviceptr srcDevice,
-                              size_t ByteCount) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdeviceptr, CUdeviceptr, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemcpyDtoD_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dstDevice, srcDevice, ByteCount);
-}
-
-CUresult CUDAAPI cuMemcpyDtoA(CUarray dstArray, size_t dstOffset,
-                              CUdeviceptr srcDevice, size_t ByteCount) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUarray, size_t, CUdeviceptr, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemcpyDtoA_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dstArray, dstOffset, srcDevice, ByteCount);
-}
-
-CUresult CUDAAPI cuMemcpyAtoD(CUdeviceptr dstDevice, CUarray srcArray,
-                              size_t srcOffset, size_t ByteCount) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdeviceptr, CUarray, size_t, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemcpyAtoD_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dstDevice, srcArray, srcOffset, ByteCount);
-}
-
-CUresult CUDAAPI cuMemcpyHtoA(CUarray dstArray, size_t dstOffset,
-                              const void *srcHost, size_t ByteCount) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUarray, size_t, const void *, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemcpyHtoA_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dstArray, dstOffset, srcHost, ByteCount);
-}
-
-CUresult CUDAAPI cuMemcpyAtoH(void *dstHost, CUarray srcArray, size_t srcOffset,
-                              size_t ByteCount) {
-  using FuncPtr = CUresult(CUDAAPI *)(void *, CUarray, size_t, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemcpyAtoH_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dstHost, srcArray, srcOffset, ByteCount);
-}
-
-CUresult CUDAAPI cuMemcpyAtoA(CUarray dstArray, size_t dstOffset,
-                              CUarray srcArray, size_t srcOffset,
-                              size_t ByteCount) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUarray, size_t, CUarray, size_t, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemcpyAtoA_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dstArray, dstOffset, srcArray, srcOffset, ByteCount);
-}
-
-CUresult CUDAAPI cuMemcpy2D(const CUDA_MEMCPY2D *pCopy) {
-  using FuncPtr = CUresult(CUDAAPI *)(const CUDA_MEMCPY2D *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemcpy2D_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pCopy);
-}
-
-CUresult CUDAAPI cuMemcpy2DUnaligned(const CUDA_MEMCPY2D *pCopy) {
-  using FuncPtr = CUresult(CUDAAPI *)(const CUDA_MEMCPY2D *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemcpy2DUnaligned_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pCopy);
-}
-
-CUresult CUDAAPI cuMemcpy3D(const CUDA_MEMCPY3D *pCopy) {
-  using FuncPtr = CUresult(CUDAAPI *)(const CUDA_MEMCPY3D *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemcpy3D_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pCopy);
-}
-
-CUresult CUDAAPI cuMemcpy3DPeer(const CUDA_MEMCPY3D_PEER *pCopy) {
-  using FuncPtr = CUresult(CUDAAPI *)(const CUDA_MEMCPY3D_PEER *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemcpy3DPeer");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pCopy);
-}
-
-CUresult CUDAAPI cuMemcpyAsync(CUdeviceptr dst, CUdeviceptr src,
-                               size_t ByteCount, CUstream hStream) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUdeviceptr, CUdeviceptr, size_t, CUstream);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemcpyAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dst, src, ByteCount, hStream);
-}
-
-CUresult CUDAAPI cuMemcpyPeerAsync(CUdeviceptr dstDevice, CUcontext dstContext,
-                                   CUdeviceptr srcDevice, CUcontext srcContext,
-                                   size_t ByteCount, CUstream hStream) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdeviceptr, CUcontext, CUdeviceptr,
-                                      CUcontext, size_t, CUstream);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemcpyPeerAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dstDevice, dstContext, srcDevice, srcContext, ByteCount,
-                  hStream);
-}
-
-CUresult CUDAAPI cuMemcpyHtoDAsync(CUdeviceptr dstDevice, const void *srcHost,
-                                   size_t ByteCount, CUstream hStream) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUdeviceptr, const void *, size_t, CUstream);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemcpyHtoDAsync_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dstDevice, srcHost, ByteCount, hStream);
-}
-
-CUresult CUDAAPI cuMemcpyDtoHAsync(void *dstHost, CUdeviceptr srcDevice,
-                                   size_t ByteCount, CUstream hStream) {
-  using FuncPtr = CUresult(CUDAAPI *)(void *, CUdeviceptr, size_t, CUstream);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemcpyDtoHAsync_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dstHost, srcDevice, ByteCount, hStream);
-}
-
-CUresult CUDAAPI cuMemcpyDtoDAsync(CUdeviceptr dstDevice, CUdeviceptr srcDevice,
-                                   size_t ByteCount, CUstream hStream) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUdeviceptr, CUdeviceptr, size_t, CUstream);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemcpyDtoDAsync_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dstDevice, srcDevice, ByteCount, hStream);
-}
-
-CUresult CUDAAPI cuMemcpyHtoAAsync(CUarray dstArray, size_t dstOffset,
-                                   const void *srcHost, size_t ByteCount,
-                                   CUstream hStream) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUarray, size_t, const void *, size_t, CUstream);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemcpyHtoAAsync_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dstArray, dstOffset, srcHost, ByteCount, hStream);
-}
-
-CUresult CUDAAPI cuMemcpyAtoHAsync(void *dstHost, CUarray srcArray,
-                                   size_t srcOffset, size_t ByteCount,
-                                   CUstream hStream) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(void *, CUarray, size_t, size_t, CUstream);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemcpyAtoHAsync_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dstHost, srcArray, srcOffset, ByteCount, hStream);
-}
-
-CUresult CUDAAPI cuMemcpy2DAsync(const CUDA_MEMCPY2D *pCopy, CUstream hStream) {
-  using FuncPtr = CUresult(CUDAAPI *)(const CUDA_MEMCPY2D *, CUstream);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemcpy2DAsync_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pCopy, hStream);
-}
-
-CUresult CUDAAPI cuMemcpy3DAsync(const CUDA_MEMCPY3D *pCopy, CUstream hStream) {
-  using FuncPtr = CUresult(CUDAAPI *)(const CUDA_MEMCPY3D *, CUstream);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemcpy3DAsync_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pCopy, hStream);
-}
-
-CUresult CUDAAPI cuMemcpy3DPeerAsync(const CUDA_MEMCPY3D_PEER *pCopy,
-                                     CUstream hStream) {
-  using FuncPtr = CUresult(CUDAAPI *)(const CUDA_MEMCPY3D_PEER *, CUstream);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemcpy3DPeerAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pCopy, hStream);
-}
-
-CUresult CUDAAPI cuMemsetD8(CUdeviceptr dstDevice, unsigned char uc, size_t N) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdeviceptr, unsigned char, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemsetD8_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dstDevice, uc, N);
-}
-
-CUresult CUDAAPI cuMemsetD16(CUdeviceptr dstDevice, unsigned short us,
-                             size_t N) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdeviceptr, unsigned short, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemsetD16_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dstDevice, us, N);
-}
-
-CUresult CUDAAPI cuMemsetD32(CUdeviceptr dstDevice, unsigned int ui, size_t N) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdeviceptr, unsigned int, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemsetD32_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dstDevice, ui, N);
-}
-
-CUresult CUDAAPI cuMemsetD2D8(CUdeviceptr dstDevice, size_t dstPitch,
-                              unsigned char uc, size_t Width, size_t Height) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUdeviceptr, size_t, unsigned char, size_t, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemsetD2D8_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dstDevice, dstPitch, uc, Width, Height);
-}
-
-CUresult CUDAAPI cuMemsetD2D16(CUdeviceptr dstDevice, size_t dstPitch,
-                               unsigned short us, size_t Width, size_t Height) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUdeviceptr, size_t, unsigned short, size_t, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemsetD2D16_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dstDevice, dstPitch, us, Width, Height);
-}
-
-CUresult CUDAAPI cuMemsetD2D32(CUdeviceptr dstDevice, size_t dstPitch,
-                               unsigned int ui, size_t Width, size_t Height) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUdeviceptr, size_t, unsigned int, size_t, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemsetD2D32_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dstDevice, dstPitch, ui, Width, Height);
-}
-
-CUresult CUDAAPI cuMemsetD8Async(CUdeviceptr dstDevice, unsigned char uc,
-                                 size_t N, CUstream hStream) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUdeviceptr, unsigned char, size_t, CUstream);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemsetD8Async");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dstDevice, uc, N, hStream);
-}
-
-CUresult CUDAAPI cuMemsetD16Async(CUdeviceptr dstDevice, unsigned short us,
-                                  size_t N, CUstream hStream) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUdeviceptr, unsigned short, size_t, CUstream);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemsetD16Async");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dstDevice, us, N, hStream);
-}
-
-CUresult CUDAAPI cuMemsetD32Async(CUdeviceptr dstDevice, unsigned int ui,
-                                  size_t N, CUstream hStream) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUdeviceptr, unsigned int, size_t, CUstream);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemsetD32Async");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dstDevice, ui, N, hStream);
-}
-
-CUresult CUDAAPI cuMemsetD2D8Async(CUdeviceptr dstDevice, size_t dstPitch,
-                                   unsigned char uc, size_t Width,
-                                   size_t Height, CUstream hStream) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdeviceptr, size_t, unsigned char,
-                                      size_t, size_t, CUstream);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemsetD2D8Async");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dstDevice, dstPitch, uc, Width, Height, hStream);
-}
-
-CUresult CUDAAPI cuMemsetD2D16Async(CUdeviceptr dstDevice, size_t dstPitch,
-                                    unsigned short us, size_t Width,
-                                    size_t Height, CUstream hStream) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdeviceptr, size_t, unsigned short,
-                                      size_t, size_t, CUstream);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemsetD2D16Async");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dstDevice, dstPitch, us, Width, Height, hStream);
-}
-
-CUresult CUDAAPI cuMemsetD2D32Async(CUdeviceptr dstDevice, size_t dstPitch,
-                                    unsigned int ui, size_t Width,
-                                    size_t Height, CUstream hStream) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdeviceptr, size_t, unsigned int, size_t,
-                                      size_t, CUstream);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemsetD2D32Async");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dstDevice, dstPitch, ui, Width, Height, hStream);
-}
-
-CUresult CUDAAPI cuArrayCreate(CUarray *pHandle,
-                               const CUDA_ARRAY_DESCRIPTOR *pAllocateArray) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUarray *, const CUDA_ARRAY_DESCRIPTOR *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuArrayCreate_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pHandle, pAllocateArray);
-}
-
-CUresult CUDAAPI cuArrayGetDescriptor(CUDA_ARRAY_DESCRIPTOR *pArrayDescriptor,
-                                      CUarray hArray) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUDA_ARRAY_DESCRIPTOR *, CUarray);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuArrayGetDescriptor_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pArrayDescriptor, hArray);
-}
-
-CUresult CUDAAPI cuArrayDestroy(CUarray hArray) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUarray);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuArrayDestroy");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hArray);
-}
-
-CUresult CUDAAPI cuArray3DCreate(
-    CUarray *pHandle, const CUDA_ARRAY3D_DESCRIPTOR *pAllocateArray) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUarray *, const CUDA_ARRAY3D_DESCRIPTOR *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuArray3DCreate_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pHandle, pAllocateArray);
-}
-
-CUresult CUDAAPI cuArray3DGetDescriptor(
-    CUDA_ARRAY3D_DESCRIPTOR *pArrayDescriptor, CUarray hArray) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUDA_ARRAY3D_DESCRIPTOR *, CUarray);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuArray3DGetDescriptor_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pArrayDescriptor, hArray);
-}
-
-CUresult CUDAAPI
-cuMipmappedArrayCreate(CUmipmappedArray *pHandle,
-                       const CUDA_ARRAY3D_DESCRIPTOR *pMipmappedArrayDesc,
-                       unsigned int numMipmapLevels) {
-  using FuncPtr = CUresult(CUDAAPI *)(
-      CUmipmappedArray *, const CUDA_ARRAY3D_DESCRIPTOR *, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMipmappedArrayCreate");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pHandle, pMipmappedArrayDesc, numMipmapLevels);
-}
-
-CUresult CUDAAPI cuMipmappedArrayGetLevel(CUarray *pLevelArray,
-                                          CUmipmappedArray hMipmappedArray,
-                                          unsigned int level) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUarray *, CUmipmappedArray, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMipmappedArrayGetLevel");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pLevelArray, hMipmappedArray, level);
-}
-
-CUresult CUDAAPI cuMipmappedArrayDestroy(CUmipmappedArray hMipmappedArray) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUmipmappedArray);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMipmappedArrayDestroy");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hMipmappedArray);
-}
-
-CUresult CUDAAPI cuPointerGetAttribute(void *data,
-                                       CUpointer_attribute attribute,
-                                       CUdeviceptr ptr) {
-  using FuncPtr = CUresult(CUDAAPI *)(void *, CUpointer_attribute, CUdeviceptr);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuPointerGetAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(data, attribute, ptr);
-}
-
-CUresult CUDAAPI cuMemPrefetchAsync(CUdeviceptr devPtr, size_t count,
-                                    CUdevice dstDevice, CUstream hStream) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdeviceptr, size_t, CUdevice, CUstream);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemPrefetchAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(devPtr, count, dstDevice, hStream);
-}
-
-CUresult CUDAAPI cuMemAdvise(CUdeviceptr devPtr, size_t count,
-                             CUmem_advise advice, CUdevice device) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUdeviceptr, size_t, CUmem_advise, CUdevice);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemAdvise");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(devPtr, count, advice, device);
-}
-
-CUresult CUDAAPI cuMemRangeGetAttribute(void *data, size_t dataSize,
-                                        CUmem_range_attribute attribute,
-                                        CUdeviceptr devPtr, size_t count) {
-  using FuncPtr = CUresult(CUDAAPI *)(void *, size_t, CUmem_range_attribute,
-                                      CUdeviceptr, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemRangeGetAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(data, dataSize, attribute, devPtr, count);
-}
-
-CUresult CUDAAPI cuMemRangeGetAttributes(void **data, size_t *dataSizes,
-                                         CUmem_range_attribute *attributes,
-                                         size_t numAttributes,
-                                         CUdeviceptr devPtr, size_t count) {
-  using FuncPtr = CUresult(CUDAAPI *)(
-      void **, size_t *, CUmem_range_attribute *, size_t, CUdeviceptr, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemRangeGetAttributes");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(data, dataSizes, attributes, numAttributes, devPtr, count);
-}
-
-CUresult CUDAAPI cuPointerSetAttribute(const void *value,
-                                       CUpointer_attribute attribute,
-                                       CUdeviceptr ptr) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(const void *, CUpointer_attribute, CUdeviceptr);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuPointerSetAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(value, attribute, ptr);
-}
-
-CUresult CUDAAPI cuPointerGetAttributes(unsigned int numAttributes,
-                                        CUpointer_attribute *attributes,
-                                        void **data, CUdeviceptr ptr) {
-  using FuncPtr = CUresult(CUDAAPI *)(unsigned int, CUpointer_attribute *,
-                                      void **, CUdeviceptr);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuPointerGetAttributes");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(numAttributes, attributes, data, ptr);
-}
-
-CUresult CUDAAPI cuStreamCreate(CUstream *phStream, unsigned int Flags) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUstream *, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuStreamCreate");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(phStream, Flags);
-}
-
-CUresult CUDAAPI cuStreamCreateWithPriority(CUstream *phStream,
-                                            unsigned int flags, int priority) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUstream *, unsigned int, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuStreamCreateWithPriority");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(phStream, flags, priority);
-}
-
-CUresult CUDAAPI cuStreamGetPriority(CUstream hStream, int *priority) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUstream, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuStreamGetPriority");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hStream, priority);
-}
-
-CUresult CUDAAPI cuStreamGetFlags(CUstream hStream, unsigned int *flags) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUstream, unsigned int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuStreamGetFlags");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hStream, flags);
-}
-
-CUresult CUDAAPI cuStreamGetCtx(CUstream hStream, CUcontext *pctx) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUstream, CUcontext *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuStreamGetCtx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hStream, pctx);
-}
-
-CUresult CUDAAPI cuStreamWaitEvent(CUstream hStream, CUevent hEvent,
-                                   unsigned int Flags) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUstream, CUevent, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuStreamWaitEvent");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hStream, hEvent, Flags);
-}
-
-CUresult CUDAAPI cuStreamAddCallback(CUstream hStream,
-                                     CUstreamCallback callback, void *userData,
-                                     unsigned int flags) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUstream, CUstreamCallback, void *, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuStreamAddCallback");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hStream, callback, userData, flags);
-}
-
-CUresult CUDAAPI cuStreamBeginCapture(CUstream hStream) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUstream);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuStreamBeginCapture");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hStream);
-}
-
-CUresult CUDAAPI cuStreamEndCapture(CUstream hStream, CUgraph *phGraph) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUstream, CUgraph *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuStreamEndCapture");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hStream, phGraph);
-}
-
-CUresult CUDAAPI cuStreamIsCapturing(CUstream hStream,
-                                     CUstreamCaptureStatus *captureStatus) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUstream, CUstreamCaptureStatus *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuStreamIsCapturing");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hStream, captureStatus);
-}
-
-CUresult CUDAAPI cuStreamAttachMemAsync(CUstream hStream, CUdeviceptr dptr,
-                                        size_t length, unsigned int flags) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUstream, CUdeviceptr, size_t, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuStreamAttachMemAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hStream, dptr, length, flags);
-}
-
-CUresult CUDAAPI cuStreamQuery(CUstream hStream) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUstream);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuStreamQuery");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hStream);
-}
-
-CUresult CUDAAPI cuStreamSynchronize(CUstream hStream) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUstream);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuStreamSynchronize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hStream);
-}
-
-CUresult CUDAAPI cuStreamDestroy(CUstream hStream) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUstream);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuStreamDestroy_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hStream);
-}
-
-CUresult CUDAAPI cuEventCreate(CUevent *phEvent, unsigned int Flags) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUevent *, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuEventCreate");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(phEvent, Flags);
-}
-
-CUresult CUDAAPI cuEventRecord(CUevent hEvent, CUstream hStream) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUevent, CUstream);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuEventRecord");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hEvent, hStream);
-}
-
-CUresult CUDAAPI cuEventQuery(CUevent hEvent) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUevent);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuEventQuery");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hEvent);
-}
-
-CUresult CUDAAPI cuEventSynchronize(CUevent hEvent) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUevent);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuEventSynchronize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hEvent);
-}
-
-CUresult CUDAAPI cuEventDestroy(CUevent hEvent) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUevent);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuEventDestroy_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hEvent);
-}
-
-CUresult CUDAAPI cuEventElapsedTime(float *pMilliseconds, CUevent hStart,
-                                    CUevent hEnd) {
-  using FuncPtr = CUresult(CUDAAPI *)(float *, CUevent, CUevent);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuEventElapsedTime");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pMilliseconds, hStart, hEnd);
-}
-
-CUresult CUDAAPI
-cuImportExternalMemory(CUexternalMemory *extMem_out,
-                       const CUDA_EXTERNAL_MEMORY_HANDLE_DESC *memHandleDesc) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUexternalMemory *,
-                                      const CUDA_EXTERNAL_MEMORY_HANDLE_DESC *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuImportExternalMemory");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(extMem_out, memHandleDesc);
-}
-
-CUresult CUDAAPI cuExternalMemoryGetMappedBuffer(
-    CUdeviceptr *devPtr, CUexternalMemory extMem,
-    const CUDA_EXTERNAL_MEMORY_BUFFER_DESC *bufferDesc) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdeviceptr *, CUexternalMemory,
-                                      const CUDA_EXTERNAL_MEMORY_BUFFER_DESC *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuExternalMemoryGetMappedBuffer");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(devPtr, extMem, bufferDesc);
-}
-
-CUresult CUDAAPI cuExternalMemoryGetMappedMipmappedArray(
-    CUmipmappedArray *mipmap, CUexternalMemory extMem,
-    const CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC *mipmapDesc) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUmipmappedArray *, CUexternalMemory,
-                          const CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cuExternalMemoryGetMappedMipmappedArray");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(mipmap, extMem, mipmapDesc);
-}
-
-CUresult CUDAAPI cuDestroyExternalMemory(CUexternalMemory extMem) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUexternalMemory);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuDestroyExternalMemory");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(extMem);
-}
-
-CUresult CUDAAPI cuImportExternalSemaphore(
-    CUexternalSemaphore *extSem_out,
-    const CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC *semHandleDesc) {
-  using FuncPtr = CUresult(CUDAAPI *)(
-      CUexternalSemaphore *, const CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuImportExternalSemaphore");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(extSem_out, semHandleDesc);
-}
-
-CUresult CUDAAPI cuSignalExternalSemaphoresAsync(
-    const CUexternalSemaphore *extSemArray,
-    const CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS *paramsArray,
-    unsigned int numExtSems, CUstream stream) {
-  using FuncPtr = CUresult(CUDAAPI *)(
-      const CUexternalSemaphore *,
-      const CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS *, unsigned int, CUstream);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuSignalExternalSemaphoresAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(extSemArray, paramsArray, numExtSems, stream);
-}
-
-CUresult CUDAAPI cuWaitExternalSemaphoresAsync(
-    const CUexternalSemaphore *extSemArray,
-    const CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS *paramsArray,
-    unsigned int numExtSems, CUstream stream) {
-  using FuncPtr = CUresult(CUDAAPI *)(
-      const CUexternalSemaphore *, const CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS *,
-      unsigned int, CUstream);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuWaitExternalSemaphoresAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(extSemArray, paramsArray, numExtSems, stream);
-}
-
-CUresult CUDAAPI cuDestroyExternalSemaphore(CUexternalSemaphore extSem) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUexternalSemaphore);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuDestroyExternalSemaphore");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(extSem);
-}
-
-CUresult CUDAAPI cuStreamWaitValue32(CUstream stream, CUdeviceptr addr,
-                                     cuuint32_t value, unsigned int flags) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUstream, CUdeviceptr, cuuint32_t, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuStreamWaitValue32");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(stream, addr, value, flags);
-}
-
-CUresult CUDAAPI cuStreamWaitValue64(CUstream stream, CUdeviceptr addr,
-                                     cuuint64_t value, unsigned int flags) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUstream, CUdeviceptr, cuuint64_t, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuStreamWaitValue64");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(stream, addr, value, flags);
-}
-
-CUresult CUDAAPI cuStreamWriteValue32(CUstream stream, CUdeviceptr addr,
-                                      cuuint32_t value, unsigned int flags) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUstream, CUdeviceptr, cuuint32_t, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuStreamWriteValue32");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(stream, addr, value, flags);
-}
-
-CUresult CUDAAPI cuStreamWriteValue64(CUstream stream, CUdeviceptr addr,
-                                      cuuint64_t value, unsigned int flags) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUstream, CUdeviceptr, cuuint64_t, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuStreamWriteValue64");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(stream, addr, value, flags);
-}
-
-CUresult CUDAAPI cuStreamBatchMemOp(CUstream stream, unsigned int count,
-                                    CUstreamBatchMemOpParams *paramArray,
-                                    unsigned int flags) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUstream, unsigned int,
-                                      CUstreamBatchMemOpParams *, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuStreamBatchMemOp");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(stream, count, paramArray, flags);
-}
-
-CUresult CUDAAPI cuFuncGetAttribute(int *pi, CUfunction_attribute attrib,
-                                    CUfunction hfunc) {
-  using FuncPtr = CUresult(CUDAAPI *)(int *, CUfunction_attribute, CUfunction);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuFuncGetAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pi, attrib, hfunc);
-}
-
-CUresult CUDAAPI cuFuncSetAttribute(CUfunction hfunc,
-                                    CUfunction_attribute attrib, int value) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUfunction, CUfunction_attribute, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuFuncSetAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hfunc, attrib, value);
-}
-
-CUresult CUDAAPI cuFuncSetCacheConfig(CUfunction hfunc, CUfunc_cache config) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUfunction, CUfunc_cache);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuFuncSetCacheConfig");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hfunc, config);
-}
-
-CUresult CUDAAPI cuFuncSetSharedMemConfig(CUfunction hfunc,
-                                          CUsharedconfig config) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUfunction, CUsharedconfig);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuFuncSetSharedMemConfig");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hfunc, config);
-}
-
-CUresult CUDAAPI cuLaunchKernel(CUfunction f, unsigned int gridDimX,
-                                unsigned int gridDimY, unsigned int gridDimZ,
-                                unsigned int blockDimX, unsigned int blockDimY,
-                                unsigned int blockDimZ,
-                                unsigned int sharedMemBytes, CUstream hStream,
-                                void **kernelParams, void **extra) {
-  using FuncPtr = CUresult(CUDAAPI *)(
-      CUfunction, unsigned int, unsigned int, unsigned int, unsigned int,
-      unsigned int, unsigned int, unsigned int, CUstream, void **, void **);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuLaunchKernel");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(f, gridDimX, gridDimY, gridDimZ, blockDimX, blockDimY,
-                  blockDimZ, sharedMemBytes, hStream, kernelParams, extra);
-}
-
-CUresult CUDAAPI cuLaunchCooperativeKernel(
-    CUfunction f, unsigned int gridDimX, unsigned int gridDimY,
-    unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY,
-    unsigned int blockDimZ, unsigned int sharedMemBytes, CUstream hStream,
-    void **kernelParams) {
-  using FuncPtr = CUresult(CUDAAPI *)(
-      CUfunction, unsigned int, unsigned int, unsigned int, unsigned int,
-      unsigned int, unsigned int, unsigned int, CUstream, void **);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuLaunchCooperativeKernel");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(f, gridDimX, gridDimY, gridDimZ, blockDimX, blockDimY,
-                  blockDimZ, sharedMemBytes, hStream, kernelParams);
-}
-
-CUresult CUDAAPI cuLaunchCooperativeKernelMultiDevice(
-    CUDA_LAUNCH_PARAMS *launchParamsList, unsigned int numDevices,
-    unsigned int flags) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUDA_LAUNCH_PARAMS *, unsigned int, unsigned int);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cuLaunchCooperativeKernelMultiDevice");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(launchParamsList, numDevices, flags);
-}
-
-CUresult CUDAAPI cuLaunchHostFunc(CUstream hStream, CUhostFn fn,
-                                  void *userData) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUstream, CUhostFn, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuLaunchHostFunc");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hStream, fn, userData);
-}
-
-__CUDA_DEPRECATED CUresult CUDAAPI cuFuncSetBlockShape(CUfunction hfunc, int x,
-                                                       int y, int z) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUfunction, int, int, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuFuncSetBlockShape");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hfunc, x, y, z);
-}
-
-__CUDA_DEPRECATED CUresult CUDAAPI cuFuncSetSharedSize(CUfunction hfunc,
-                                                       unsigned int bytes) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUfunction, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuFuncSetSharedSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hfunc, bytes);
-}
-
-__CUDA_DEPRECATED CUresult CUDAAPI cuParamSetSize(CUfunction hfunc,
-                                                  unsigned int numbytes) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUfunction, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuParamSetSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hfunc, numbytes);
-}
-
-__CUDA_DEPRECATED CUresult CUDAAPI cuParamSeti(CUfunction hfunc, int offset,
-                                               unsigned int value) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUfunction, int, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuParamSeti");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hfunc, offset, value);
-}
-
-__CUDA_DEPRECATED CUresult CUDAAPI cuParamSetf(CUfunction hfunc, int offset,
-                                               float value) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUfunction, int, float);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuParamSetf");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hfunc, offset, value);
-}
-
-__CUDA_DEPRECATED CUresult CUDAAPI cuParamSetv(CUfunction hfunc, int offset,
-                                               void *ptr,
-                                               unsigned int numbytes) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUfunction, int, void *, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuParamSetv");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hfunc, offset, ptr, numbytes);
-}
-
-__CUDA_DEPRECATED CUresult CUDAAPI cuLaunch(CUfunction f) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUfunction);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuLaunch");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(f);
-}
-
-__CUDA_DEPRECATED CUresult CUDAAPI cuLaunchGrid(CUfunction f, int grid_width,
-                                                int grid_height) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUfunction, int, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuLaunchGrid");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(f, grid_width, grid_height);
-}
-
-__CUDA_DEPRECATED CUresult CUDAAPI cuLaunchGridAsync(CUfunction f,
-                                                     int grid_width,
-                                                     int grid_height,
-                                                     CUstream hStream) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUfunction, int, int, CUstream);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuLaunchGridAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(f, grid_width, grid_height, hStream);
-}
-
-__CUDA_DEPRECATED CUresult CUDAAPI cuParamSetTexRef(CUfunction hfunc,
-                                                    int texunit,
-                                                    CUtexref hTexRef) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUfunction, int, CUtexref);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuParamSetTexRef");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hfunc, texunit, hTexRef);
-}
-
-CUresult CUDAAPI cuGraphCreate(CUgraph *phGraph, unsigned int flags) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraph *, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphCreate");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(phGraph, flags);
-}
-
-CUresult CUDAAPI cuGraphAddKernelNode(
-    CUgraphNode *phGraphNode, CUgraph hGraph, CUgraphNode *dependencies,
-    size_t numDependencies, const CUDA_KERNEL_NODE_PARAMS *nodeParams) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraphNode *, CUgraph, CUgraphNode *,
-                                      size_t, const CUDA_KERNEL_NODE_PARAMS *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphAddKernelNode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(phGraphNode, hGraph, dependencies, numDependencies,
-                  nodeParams);
-}
-
-CUresult CUDAAPI cuGraphKernelNodeGetParams(
-    CUgraphNode hNode, CUDA_KERNEL_NODE_PARAMS *nodeParams) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraphNode, CUDA_KERNEL_NODE_PARAMS *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphKernelNodeGetParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hNode, nodeParams);
-}
-
-CUresult CUDAAPI cuGraphKernelNodeSetParams(
-    CUgraphNode hNode, const CUDA_KERNEL_NODE_PARAMS *nodeParams) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUgraphNode, const CUDA_KERNEL_NODE_PARAMS *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphKernelNodeSetParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hNode, nodeParams);
-}
-
-CUresult CUDAAPI cuGraphAddMemcpyNode(CUgraphNode *phGraphNode, CUgraph hGraph,
-                                      CUgraphNode *dependencies,
-                                      size_t numDependencies,
-                                      const CUDA_MEMCPY3D *copyParams,
-                                      CUcontext ctx) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraphNode *, CUgraph, CUgraphNode *,
-                                      size_t, const CUDA_MEMCPY3D *, CUcontext);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphAddMemcpyNode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(phGraphNode, hGraph, dependencies, numDependencies,
-                  copyParams, ctx);
-}
-
-CUresult CUDAAPI cuGraphMemcpyNodeGetParams(CUgraphNode hNode,
-                                            CUDA_MEMCPY3D *nodeParams) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraphNode, CUDA_MEMCPY3D *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphMemcpyNodeGetParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hNode, nodeParams);
-}
-
-CUresult CUDAAPI cuGraphMemcpyNodeSetParams(CUgraphNode hNode,
-                                            const CUDA_MEMCPY3D *nodeParams) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraphNode, const CUDA_MEMCPY3D *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphMemcpyNodeSetParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hNode, nodeParams);
-}
-
-CUresult CUDAAPI cuGraphAddMemsetNode(
-    CUgraphNode *phGraphNode, CUgraph hGraph, CUgraphNode *dependencies,
-    size_t numDependencies, const CUDA_MEMSET_NODE_PARAMS *memsetParams,
-    CUcontext ctx) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUgraphNode *, CUgraph, CUgraphNode *, size_t,
-                          const CUDA_MEMSET_NODE_PARAMS *, CUcontext);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphAddMemsetNode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(phGraphNode, hGraph, dependencies, numDependencies,
-                  memsetParams, ctx);
-}
-
-CUresult CUDAAPI cuGraphMemsetNodeGetParams(
-    CUgraphNode hNode, CUDA_MEMSET_NODE_PARAMS *nodeParams) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraphNode, CUDA_MEMSET_NODE_PARAMS *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphMemsetNodeGetParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hNode, nodeParams);
-}
-
-CUresult CUDAAPI cuGraphMemsetNodeSetParams(
-    CUgraphNode hNode, const CUDA_MEMSET_NODE_PARAMS *nodeParams) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUgraphNode, const CUDA_MEMSET_NODE_PARAMS *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphMemsetNodeSetParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hNode, nodeParams);
-}
-
-CUresult CUDAAPI cuGraphAddHostNode(CUgraphNode *phGraphNode, CUgraph hGraph,
-                                    CUgraphNode *dependencies,
-                                    size_t numDependencies,
-                                    const CUDA_HOST_NODE_PARAMS *nodeParams) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraphNode *, CUgraph, CUgraphNode *,
-                                      size_t, const CUDA_HOST_NODE_PARAMS *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphAddHostNode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(phGraphNode, hGraph, dependencies, numDependencies,
-                  nodeParams);
-}
-
-CUresult CUDAAPI cuGraphHostNodeGetParams(CUgraphNode hNode,
-                                          CUDA_HOST_NODE_PARAMS *nodeParams) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraphNode, CUDA_HOST_NODE_PARAMS *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphHostNodeGetParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hNode, nodeParams);
-}
-
-CUresult CUDAAPI cuGraphHostNodeSetParams(
-    CUgraphNode hNode, const CUDA_HOST_NODE_PARAMS *nodeParams) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUgraphNode, const CUDA_HOST_NODE_PARAMS *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphHostNodeSetParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hNode, nodeParams);
-}
-
-CUresult CUDAAPI cuGraphAddChildGraphNode(CUgraphNode *phGraphNode,
-                                          CUgraph hGraph,
-                                          CUgraphNode *dependencies,
-                                          size_t numDependencies,
-                                          CUgraph childGraph) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraphNode *, CUgraph, CUgraphNode *,
-                                      size_t, CUgraph);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphAddChildGraphNode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(phGraphNode, hGraph, dependencies, numDependencies,
-                  childGraph);
-}
-
-CUresult CUDAAPI cuGraphChildGraphNodeGetGraph(CUgraphNode hNode,
-                                               CUgraph *phGraph) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraphNode, CUgraph *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphChildGraphNodeGetGraph");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hNode, phGraph);
-}
-
-CUresult CUDAAPI cuGraphAddEmptyNode(CUgraphNode *phGraphNode, CUgraph hGraph,
-                                     CUgraphNode *dependencies,
-                                     size_t numDependencies) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUgraphNode *, CUgraph, CUgraphNode *, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphAddEmptyNode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(phGraphNode, hGraph, dependencies, numDependencies);
-}
-
-CUresult CUDAAPI cuGraphClone(CUgraph *phGraphClone, CUgraph originalGraph) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraph *, CUgraph);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphClone");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(phGraphClone, originalGraph);
-}
-
-CUresult CUDAAPI cuGraphNodeFindInClone(CUgraphNode *phNode,
-                                        CUgraphNode hOriginalNode,
-                                        CUgraph hClonedGraph) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraphNode *, CUgraphNode, CUgraph);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphNodeFindInClone");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(phNode, hOriginalNode, hClonedGraph);
-}
-
-CUresult CUDAAPI cuGraphNodeGetType(CUgraphNode hNode, CUgraphNodeType *type) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraphNode, CUgraphNodeType *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphNodeGetType");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hNode, type);
-}
-
-CUresult CUDAAPI cuGraphGetNodes(CUgraph hGraph, CUgraphNode *nodes,
-                                 size_t *numNodes) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraph, CUgraphNode *, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphGetNodes");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hGraph, nodes, numNodes);
-}
-
-CUresult CUDAAPI cuGraphGetRootNodes(CUgraph hGraph, CUgraphNode *rootNodes,
-                                     size_t *numRootNodes) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraph, CUgraphNode *, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphGetRootNodes");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hGraph, rootNodes, numRootNodes);
-}
-
-CUresult CUDAAPI cuGraphGetEdges(CUgraph hGraph, CUgraphNode *from,
-                                 CUgraphNode *to, size_t *numEdges) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUgraph, CUgraphNode *, CUgraphNode *, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphGetEdges");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hGraph, from, to, numEdges);
-}
-
-CUresult CUDAAPI cuGraphNodeGetDependencies(CUgraphNode hNode,
-                                            CUgraphNode *dependencies,
-                                            size_t *numDependencies) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraphNode, CUgraphNode *, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphNodeGetDependencies");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hNode, dependencies, numDependencies);
-}
-
-CUresult CUDAAPI cuGraphNodeGetDependentNodes(CUgraphNode hNode,
-                                              CUgraphNode *dependentNodes,
-                                              size_t *numDependentNodes) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraphNode, CUgraphNode *, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphNodeGetDependentNodes");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hNode, dependentNodes, numDependentNodes);
-}
-
-CUresult CUDAAPI cuGraphAddDependencies(CUgraph hGraph, CUgraphNode *from,
-                                        CUgraphNode *to,
-                                        size_t numDependencies) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUgraph, CUgraphNode *, CUgraphNode *, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphAddDependencies");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hGraph, from, to, numDependencies);
-}
-
-CUresult CUDAAPI cuGraphRemoveDependencies(CUgraph hGraph, CUgraphNode *from,
-                                           CUgraphNode *to,
-                                           size_t numDependencies) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUgraph, CUgraphNode *, CUgraphNode *, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphRemoveDependencies");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hGraph, from, to, numDependencies);
-}
-
-CUresult CUDAAPI cuGraphDestroyNode(CUgraphNode hNode) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraphNode);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphDestroyNode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hNode);
-}
-
-CUresult CUDAAPI cuGraphInstantiate(CUgraphExec *phGraphExec, CUgraph hGraph,
-                                    CUgraphNode *phErrorNode, char *logBuffer,
-                                    size_t bufferSize) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraphExec *, CUgraph, CUgraphNode *,
-                                      char *, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphInstantiate");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(phGraphExec, hGraph, phErrorNode, logBuffer, bufferSize);
-}
-
-CUresult CUDAAPI cuGraphLaunch(CUgraphExec hGraphExec, CUstream hStream) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraphExec, CUstream);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphLaunch");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hGraphExec, hStream);
-}
-
-CUresult CUDAAPI cuGraphExecDestroy(CUgraphExec hGraphExec) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraphExec);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphExecDestroy");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hGraphExec);
-}
-
-CUresult CUDAAPI cuGraphDestroy(CUgraph hGraph) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraph);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphDestroy");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hGraph);
-}
-
-CUresult CUDAAPI cuOccupancyMaxActiveBlocksPerMultiprocessor(
-    int *numBlocks, CUfunction func, int blockSize, size_t dynamicSMemSize) {
-  using FuncPtr = CUresult(CUDAAPI *)(int *, CUfunction, int, size_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cuOccupancyMaxActiveBlocksPerMultiprocessor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(numBlocks, func, blockSize, dynamicSMemSize);
-}
-
-CUresult CUDAAPI cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(
-    int *numBlocks, CUfunction func, int blockSize, size_t dynamicSMemSize,
-    unsigned int flags) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(int *, CUfunction, int, size_t, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>(
-      "cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(numBlocks, func, blockSize, dynamicSMemSize, flags);
-}
-
-CUresult CUDAAPI cuOccupancyMaxPotentialBlockSize(
-    int *minGridSize, int *blockSize, CUfunction func,
-    CUoccupancyB2DSize blockSizeToDynamicSMemSize, size_t dynamicSMemSize,
-    int blockSizeLimit) {
-  using FuncPtr = CUresult(CUDAAPI *)(int *, int *, CUfunction,
-                                      CUoccupancyB2DSize, size_t, int);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cuOccupancyMaxPotentialBlockSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(minGridSize, blockSize, func, blockSizeToDynamicSMemSize,
-                  dynamicSMemSize, blockSizeLimit);
-}
-
-CUresult CUDAAPI cuOccupancyMaxPotentialBlockSizeWithFlags(
-    int *minGridSize, int *blockSize, CUfunction func,
-    CUoccupancyB2DSize blockSizeToDynamicSMemSize, size_t dynamicSMemSize,
-    int blockSizeLimit, unsigned int flags) {
-  using FuncPtr = CUresult(CUDAAPI *)(
-      int *, int *, CUfunction, CUoccupancyB2DSize, size_t, int, unsigned int);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cuOccupancyMaxPotentialBlockSizeWithFlags");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(minGridSize, blockSize, func, blockSizeToDynamicSMemSize,
-                  dynamicSMemSize, blockSizeLimit, flags);
-}
-
-CUresult CUDAAPI cuTexRefSetArray(CUtexref hTexRef, CUarray hArray,
-                                  unsigned int Flags) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUtexref, CUarray, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexRefSetArray");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hTexRef, hArray, Flags);
-}
-
-CUresult CUDAAPI cuTexRefSetMipmappedArray(CUtexref hTexRef,
-                                           CUmipmappedArray hMipmappedArray,
-                                           unsigned int Flags) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUtexref, CUmipmappedArray, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexRefSetMipmappedArray");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hTexRef, hMipmappedArray, Flags);
-}
-
-CUresult CUDAAPI cuTexRefSetAddress(size_t *ByteOffset, CUtexref hTexRef,
-                                    CUdeviceptr dptr, size_t bytes) {
-  using FuncPtr = CUresult(CUDAAPI *)(size_t *, CUtexref, CUdeviceptr, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexRefSetAddress_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(ByteOffset, hTexRef, dptr, bytes);
-}
-
-CUresult CUDAAPI cuTexRefSetAddress2D(CUtexref hTexRef,
-                                      const CUDA_ARRAY_DESCRIPTOR *desc,
-                                      CUdeviceptr dptr, size_t Pitch) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUtexref, const CUDA_ARRAY_DESCRIPTOR *,
-                                      CUdeviceptr, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexRefSetAddress2D_v3");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hTexRef, desc, dptr, Pitch);
-}
-
-CUresult CUDAAPI cuTexRefSetFormat(CUtexref hTexRef, CUarray_format fmt,
-                                   int NumPackedComponents) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUtexref, CUarray_format, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexRefSetFormat");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hTexRef, fmt, NumPackedComponents);
-}
-
-CUresult CUDAAPI cuTexRefSetAddressMode(CUtexref hTexRef, int dim,
-                                        CUaddress_mode am) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUtexref, int, CUaddress_mode);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexRefSetAddressMode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hTexRef, dim, am);
-}
-
-CUresult CUDAAPI cuTexRefSetFilterMode(CUtexref hTexRef, CUfilter_mode fm) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUtexref, CUfilter_mode);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexRefSetFilterMode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hTexRef, fm);
-}
-
-CUresult CUDAAPI cuTexRefSetMipmapFilterMode(CUtexref hTexRef,
-                                             CUfilter_mode fm) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUtexref, CUfilter_mode);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexRefSetMipmapFilterMode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hTexRef, fm);
-}
-
-CUresult CUDAAPI cuTexRefSetMipmapLevelBias(CUtexref hTexRef, float bias) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUtexref, float);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexRefSetMipmapLevelBias");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hTexRef, bias);
-}
-
-CUresult CUDAAPI cuTexRefSetMipmapLevelClamp(CUtexref hTexRef,
-                                             float minMipmapLevelClamp,
-                                             float maxMipmapLevelClamp) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUtexref, float, float);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexRefSetMipmapLevelClamp");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hTexRef, minMipmapLevelClamp, maxMipmapLevelClamp);
-}
-
-CUresult CUDAAPI cuTexRefSetMaxAnisotropy(CUtexref hTexRef,
-                                          unsigned int maxAniso) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUtexref, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexRefSetMaxAnisotropy");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hTexRef, maxAniso);
-}
-
-CUresult CUDAAPI cuTexRefSetBorderColor(CUtexref hTexRef, float *pBorderColor) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUtexref, float *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexRefSetBorderColor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hTexRef, pBorderColor);
-}
-
-CUresult CUDAAPI cuTexRefSetFlags(CUtexref hTexRef, unsigned int Flags) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUtexref, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexRefSetFlags");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hTexRef, Flags);
-}
-
-CUresult CUDAAPI cuTexRefGetAddress(CUdeviceptr *pdptr, CUtexref hTexRef) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdeviceptr *, CUtexref);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexRefGetAddress_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pdptr, hTexRef);
-}
-
-CUresult CUDAAPI cuTexRefGetArray(CUarray *phArray, CUtexref hTexRef) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUarray *, CUtexref);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexRefGetArray");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(phArray, hTexRef);
-}
-
-CUresult CUDAAPI cuTexRefGetMipmappedArray(CUmipmappedArray *phMipmappedArray,
-                                           CUtexref hTexRef) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUmipmappedArray *, CUtexref);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexRefGetMipmappedArray");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(phMipmappedArray, hTexRef);
-}
-
-CUresult CUDAAPI cuTexRefGetAddressMode(CUaddress_mode *pam, CUtexref hTexRef,
-                                        int dim) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUaddress_mode *, CUtexref, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexRefGetAddressMode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pam, hTexRef, dim);
-}
-
-CUresult CUDAAPI cuTexRefGetFilterMode(CUfilter_mode *pfm, CUtexref hTexRef) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUfilter_mode *, CUtexref);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexRefGetFilterMode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pfm, hTexRef);
-}
-
-CUresult CUDAAPI cuTexRefGetFormat(CUarray_format *pFormat, int *pNumChannels,
-                                   CUtexref hTexRef) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUarray_format *, int *, CUtexref);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexRefGetFormat");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pFormat, pNumChannels, hTexRef);
-}
-
-CUresult CUDAAPI cuTexRefGetMipmapFilterMode(CUfilter_mode *pfm,
-                                             CUtexref hTexRef) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUfilter_mode *, CUtexref);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexRefGetMipmapFilterMode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pfm, hTexRef);
-}
-
-CUresult CUDAAPI cuTexRefGetMipmapLevelBias(float *pbias, CUtexref hTexRef) {
-  using FuncPtr = CUresult(CUDAAPI *)(float *, CUtexref);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexRefGetMipmapLevelBias");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pbias, hTexRef);
-}
-
-CUresult CUDAAPI cuTexRefGetMipmapLevelClamp(float *pminMipmapLevelClamp,
-                                             float *pmaxMipmapLevelClamp,
-                                             CUtexref hTexRef) {
-  using FuncPtr = CUresult(CUDAAPI *)(float *, float *, CUtexref);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexRefGetMipmapLevelClamp");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pminMipmapLevelClamp, pmaxMipmapLevelClamp, hTexRef);
-}
-
-CUresult CUDAAPI cuTexRefGetMaxAnisotropy(int *pmaxAniso, CUtexref hTexRef) {
-  using FuncPtr = CUresult(CUDAAPI *)(int *, CUtexref);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexRefGetMaxAnisotropy");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pmaxAniso, hTexRef);
-}
-
-CUresult CUDAAPI cuTexRefGetBorderColor(float *pBorderColor, CUtexref hTexRef) {
-  using FuncPtr = CUresult(CUDAAPI *)(float *, CUtexref);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexRefGetBorderColor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pBorderColor, hTexRef);
-}
-
-CUresult CUDAAPI cuTexRefGetFlags(unsigned int *pFlags, CUtexref hTexRef) {
-  using FuncPtr = CUresult(CUDAAPI *)(unsigned int *, CUtexref);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexRefGetFlags");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pFlags, hTexRef);
-}
-
-__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefCreate(CUtexref *pTexRef) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUtexref *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexRefCreate");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pTexRef);
-}
-
-__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefDestroy(CUtexref hTexRef) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUtexref);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexRefDestroy");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hTexRef);
-}
-
-CUresult CUDAAPI cuSurfRefSetArray(CUsurfref hSurfRef, CUarray hArray,
-                                   unsigned int Flags) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUsurfref, CUarray, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuSurfRefSetArray");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hSurfRef, hArray, Flags);
-}
-
-CUresult CUDAAPI cuSurfRefGetArray(CUarray *phArray, CUsurfref hSurfRef) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUarray *, CUsurfref);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuSurfRefGetArray");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(phArray, hSurfRef);
-}
-
-CUresult CUDAAPI
-cuTexObjectCreate(CUtexObject *pTexObject, const CUDA_RESOURCE_DESC *pResDesc,
-                  const CUDA_TEXTURE_DESC *pTexDesc,
-                  const CUDA_RESOURCE_VIEW_DESC *pResViewDesc) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUtexObject *, const CUDA_RESOURCE_DESC *,
-                                      const CUDA_TEXTURE_DESC *,
-                                      const CUDA_RESOURCE_VIEW_DESC *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexObjectCreate");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pTexObject, pResDesc, pTexDesc, pResViewDesc);
-}
-
-CUresult CUDAAPI cuTexObjectDestroy(CUtexObject texObject) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUtexObject);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexObjectDestroy");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(texObject);
-}
-
-CUresult CUDAAPI cuTexObjectGetResourceDesc(CUDA_RESOURCE_DESC *pResDesc,
-                                            CUtexObject texObject) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUDA_RESOURCE_DESC *, CUtexObject);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexObjectGetResourceDesc");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pResDesc, texObject);
-}
-
-CUresult CUDAAPI cuTexObjectGetTextureDesc(CUDA_TEXTURE_DESC *pTexDesc,
-                                           CUtexObject texObject) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUDA_TEXTURE_DESC *, CUtexObject);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexObjectGetTextureDesc");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pTexDesc, texObject);
-}
-
-CUresult CUDAAPI cuTexObjectGetResourceViewDesc(
-    CUDA_RESOURCE_VIEW_DESC *pResViewDesc, CUtexObject texObject) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUDA_RESOURCE_VIEW_DESC *, CUtexObject);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexObjectGetResourceViewDesc");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pResViewDesc, texObject);
-}
-
-CUresult CUDAAPI cuSurfObjectCreate(CUsurfObject *pSurfObject,
-                                    const CUDA_RESOURCE_DESC *pResDesc) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUsurfObject *, const CUDA_RESOURCE_DESC *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuSurfObjectCreate");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pSurfObject, pResDesc);
-}
-
-CUresult CUDAAPI cuSurfObjectDestroy(CUsurfObject surfObject) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUsurfObject);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuSurfObjectDestroy");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(surfObject);
-}
-
-CUresult CUDAAPI cuSurfObjectGetResourceDesc(CUDA_RESOURCE_DESC *pResDesc,
-                                             CUsurfObject surfObject) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUDA_RESOURCE_DESC *, CUsurfObject);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuSurfObjectGetResourceDesc");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pResDesc, surfObject);
-}
-
-CUresult CUDAAPI cuDeviceCanAccessPeer(int *canAccessPeer, CUdevice dev,
-                                       CUdevice peerDev) {
-  using FuncPtr = CUresult(CUDAAPI *)(int *, CUdevice, CUdevice);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuDeviceCanAccessPeer");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(canAccessPeer, dev, peerDev);
-}
-
-CUresult CUDAAPI cuCtxEnablePeerAccess(CUcontext peerContext,
-                                       unsigned int Flags) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUcontext, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuCtxEnablePeerAccess");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(peerContext, Flags);
-}
-
-CUresult CUDAAPI cuCtxDisablePeerAccess(CUcontext peerContext) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUcontext);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuCtxDisablePeerAccess");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(peerContext);
-}
-
-CUresult CUDAAPI cuDeviceGetP2PAttribute(int *value,
-                                         CUdevice_P2PAttribute attrib,
-                                         CUdevice srcDevice,
-                                         CUdevice dstDevice) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(int *, CUdevice_P2PAttribute, CUdevice, CUdevice);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuDeviceGetP2PAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(value, attrib, srcDevice, dstDevice);
-}
-
-CUresult CUDAAPI cuGraphicsUnregisterResource(CUgraphicsResource resource) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraphicsResource);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphicsUnregisterResource");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(resource);
-}
-
-CUresult CUDAAPI cuGraphicsSubResourceGetMappedArray(
-    CUarray *pArray, CUgraphicsResource resource, unsigned int arrayIndex,
-    unsigned int mipLevel) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUarray *, CUgraphicsResource,
-                                      unsigned int, unsigned int);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cuGraphicsSubResourceGetMappedArray");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pArray, resource, arrayIndex, mipLevel);
-}
-
-CUresult CUDAAPI cuGraphicsResourceGetMappedMipmappedArray(
-    CUmipmappedArray *pMipmappedArray, CUgraphicsResource resource) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUmipmappedArray *, CUgraphicsResource);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cuGraphicsResourceGetMappedMipmappedArray");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pMipmappedArray, resource);
-}
-
-CUresult CUDAAPI cuGraphicsResourceGetMappedPointer(
-    CUdeviceptr *pDevPtr, size_t *pSize, CUgraphicsResource resource) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUdeviceptr *, size_t *, CUgraphicsResource);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cuGraphicsResourceGetMappedPointer_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pDevPtr, pSize, resource);
-}
-
-CUresult CUDAAPI cuGraphicsResourceSetMapFlags(CUgraphicsResource resource,
-                                               unsigned int flags) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraphicsResource, unsigned int);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cuGraphicsResourceSetMapFlags_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(resource, flags);
-}
-
-CUresult CUDAAPI cuGraphicsMapResources(unsigned int count,
-                                        CUgraphicsResource *resources,
-                                        CUstream hStream) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(unsigned int, CUgraphicsResource *, CUstream);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphicsMapResources");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(count, resources, hStream);
-}
-
-CUresult CUDAAPI cuGraphicsUnmapResources(unsigned int count,
-                                          CUgraphicsResource *resources,
-                                          CUstream hStream) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(unsigned int, CUgraphicsResource *, CUstream);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphicsUnmapResources");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(count, resources, hStream);
-}
-
-CUresult CUDAAPI cuGetExportTable(const void **ppExportTable,
-                                  const CUuuid *pExportTableId) {
-  using FuncPtr = CUresult(CUDAAPI *)(const void **, const CUuuid *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGetExportTable");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(ppExportTable, pExportTableId);
-}
-
-}  // extern "C"
diff --git a/third_party/xla/third_party/tsl/tsl/cuda/cuda_10_1.inc b/third_party/xla/third_party/tsl/tsl/cuda/cuda_10_1.inc
deleted file mode 100644
index d35035799a77da..00000000000000
--- a/third_party/xla/third_party/tsl/tsl/cuda/cuda_10_1.inc
+++ /dev/null
@@ -1,2166 +0,0 @@
-// Auto-generated, do not edit.
-
-extern "C" {
-
-CUresult CUDAAPI cuGetErrorString(CUresult error, const char **pStr) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUresult, const char **);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGetErrorString");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(error, pStr);
-}
-
-CUresult CUDAAPI cuGetErrorName(CUresult error, const char **pStr) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUresult, const char **);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGetErrorName");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(error, pStr);
-}
-
-CUresult CUDAAPI cuInit(unsigned int Flags) {
-  using FuncPtr = CUresult(CUDAAPI *)(unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuInit");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(Flags);
-}
-
-CUresult CUDAAPI cuDriverGetVersion(int *driverVersion) {
-  using FuncPtr = CUresult(CUDAAPI *)(int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuDriverGetVersion");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(driverVersion);
-}
-
-CUresult CUDAAPI cuDeviceGet(CUdevice *device, int ordinal) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdevice *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuDeviceGet");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(device, ordinal);
-}
-
-CUresult CUDAAPI cuDeviceGetCount(int *count) {
-  using FuncPtr = CUresult(CUDAAPI *)(int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuDeviceGetCount");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(count);
-}
-
-CUresult CUDAAPI cuDeviceGetName(char *name, int len, CUdevice dev) {
-  using FuncPtr = CUresult(CUDAAPI *)(char *, int, CUdevice);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuDeviceGetName");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(name, len, dev);
-}
-
-CUresult CUDAAPI cuDeviceGetUuid(CUuuid *uuid, CUdevice dev) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUuuid *, CUdevice);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuDeviceGetUuid");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(uuid, dev);
-}
-
-CUresult CUDAAPI cuDeviceTotalMem(size_t *bytes, CUdevice dev) {
-  using FuncPtr = CUresult(CUDAAPI *)(size_t *, CUdevice);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuDeviceTotalMem_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(bytes, dev);
-}
-
-CUresult CUDAAPI cuDeviceGetAttribute(int *pi, CUdevice_attribute attrib,
-                                      CUdevice dev) {
-  using FuncPtr = CUresult(CUDAAPI *)(int *, CUdevice_attribute, CUdevice);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuDeviceGetAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pi, attrib, dev);
-}
-
-__CUDA_DEPRECATED CUresult CUDAAPI cuDeviceGetProperties(CUdevprop *prop,
-                                                         CUdevice dev) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdevprop *, CUdevice);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuDeviceGetProperties");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(prop, dev);
-}
-
-__CUDA_DEPRECATED CUresult CUDAAPI cuDeviceComputeCapability(int *major,
-                                                             int *minor,
-                                                             CUdevice dev) {
-  using FuncPtr = CUresult(CUDAAPI *)(int *, int *, CUdevice);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuDeviceComputeCapability");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(major, minor, dev);
-}
-
-CUresult CUDAAPI cuDevicePrimaryCtxRetain(CUcontext *pctx, CUdevice dev) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUcontext *, CUdevice);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuDevicePrimaryCtxRetain");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pctx, dev);
-}
-
-CUresult CUDAAPI cuDevicePrimaryCtxRelease(CUdevice dev) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdevice);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuDevicePrimaryCtxRelease");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dev);
-}
-
-CUresult CUDAAPI cuDevicePrimaryCtxSetFlags(CUdevice dev, unsigned int flags) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdevice, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuDevicePrimaryCtxSetFlags");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dev, flags);
-}
-
-CUresult CUDAAPI cuDevicePrimaryCtxGetState(CUdevice dev, unsigned int *flags,
-                                            int *active) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdevice, unsigned int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuDevicePrimaryCtxGetState");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dev, flags, active);
-}
-
-CUresult CUDAAPI cuDevicePrimaryCtxReset(CUdevice dev) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdevice);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuDevicePrimaryCtxReset");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dev);
-}
-
-CUresult CUDAAPI cuCtxCreate(CUcontext *pctx, unsigned int flags,
-                             CUdevice dev) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUcontext *, unsigned int, CUdevice);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuCtxCreate_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pctx, flags, dev);
-}
-
-CUresult CUDAAPI cuCtxDestroy(CUcontext ctx) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUcontext);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuCtxDestroy_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(ctx);
-}
-
-CUresult CUDAAPI cuCtxPushCurrent(CUcontext ctx) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUcontext);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuCtxPushCurrent_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(ctx);
-}
-
-CUresult CUDAAPI cuCtxPopCurrent(CUcontext *pctx) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUcontext *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuCtxPopCurrent_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pctx);
-}
-
-CUresult CUDAAPI cuCtxSetCurrent(CUcontext ctx) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUcontext);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuCtxSetCurrent");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(ctx);
-}
-
-CUresult CUDAAPI cuCtxGetCurrent(CUcontext *pctx) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUcontext *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuCtxGetCurrent");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pctx);
-}
-
-CUresult CUDAAPI cuCtxGetDevice(CUdevice *device) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdevice *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuCtxGetDevice");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(device);
-}
-
-CUresult CUDAAPI cuCtxGetFlags(unsigned int *flags) {
-  using FuncPtr = CUresult(CUDAAPI *)(unsigned int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuCtxGetFlags");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(flags);
-}
-
-CUresult CUDAAPI cuCtxSynchronize(void) {
-  using FuncPtr = CUresult(CUDAAPI *)();
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuCtxSynchronize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr();
-}
-
-CUresult CUDAAPI cuCtxSetLimit(CUlimit limit, size_t value) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUlimit, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuCtxSetLimit");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(limit, value);
-}
-
-CUresult CUDAAPI cuCtxGetLimit(size_t *pvalue, CUlimit limit) {
-  using FuncPtr = CUresult(CUDAAPI *)(size_t *, CUlimit);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuCtxGetLimit");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pvalue, limit);
-}
-
-CUresult CUDAAPI cuCtxGetCacheConfig(CUfunc_cache *pconfig) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUfunc_cache *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuCtxGetCacheConfig");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pconfig);
-}
-
-CUresult CUDAAPI cuCtxSetCacheConfig(CUfunc_cache config) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUfunc_cache);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuCtxSetCacheConfig");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(config);
-}
-
-CUresult CUDAAPI cuCtxGetSharedMemConfig(CUsharedconfig *pConfig) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUsharedconfig *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuCtxGetSharedMemConfig");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pConfig);
-}
-
-CUresult CUDAAPI cuCtxSetSharedMemConfig(CUsharedconfig config) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUsharedconfig);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuCtxSetSharedMemConfig");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(config);
-}
-
-CUresult CUDAAPI cuCtxGetApiVersion(CUcontext ctx, unsigned int *version) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUcontext, unsigned int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuCtxGetApiVersion");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(ctx, version);
-}
-
-CUresult CUDAAPI cuCtxGetStreamPriorityRange(int *leastPriority,
-                                             int *greatestPriority) {
-  using FuncPtr = CUresult(CUDAAPI *)(int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuCtxGetStreamPriorityRange");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(leastPriority, greatestPriority);
-}
-
-__CUDA_DEPRECATED CUresult CUDAAPI cuCtxAttach(CUcontext *pctx,
-                                               unsigned int flags) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUcontext *, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuCtxAttach");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pctx, flags);
-}
-
-__CUDA_DEPRECATED CUresult CUDAAPI cuCtxDetach(CUcontext ctx) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUcontext);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuCtxDetach");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(ctx);
-}
-
-CUresult CUDAAPI cuModuleLoad(CUmodule *module, const char *fname) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUmodule *, const char *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuModuleLoad");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(module, fname);
-}
-
-CUresult CUDAAPI cuModuleLoadData(CUmodule *module, const void *image) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUmodule *, const void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuModuleLoadData");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(module, image);
-}
-
-CUresult CUDAAPI cuModuleLoadDataEx(CUmodule *module, const void *image,
-                                    unsigned int numOptions,
-                                    CUjit_option *options,
-                                    void **optionValues) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUmodule *, const void *, unsigned int,
-                                      CUjit_option *, void **);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuModuleLoadDataEx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(module, image, numOptions, options, optionValues);
-}
-
-CUresult CUDAAPI cuModuleLoadFatBinary(CUmodule *module, const void *fatCubin) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUmodule *, const void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuModuleLoadFatBinary");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(module, fatCubin);
-}
-
-CUresult CUDAAPI cuModuleUnload(CUmodule hmod) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUmodule);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuModuleUnload");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hmod);
-}
-
-CUresult CUDAAPI cuModuleGetFunction(CUfunction *hfunc, CUmodule hmod,
-                                     const char *name) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUfunction *, CUmodule, const char *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuModuleGetFunction");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hfunc, hmod, name);
-}
-
-CUresult CUDAAPI cuModuleGetGlobal(CUdeviceptr *dptr, size_t *bytes,
-                                   CUmodule hmod, const char *name) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUdeviceptr *, size_t *, CUmodule, const char *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuModuleGetGlobal_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dptr, bytes, hmod, name);
-}
-
-CUresult CUDAAPI cuModuleGetTexRef(CUtexref *pTexRef, CUmodule hmod,
-                                   const char *name) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUtexref *, CUmodule, const char *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuModuleGetTexRef");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pTexRef, hmod, name);
-}
-
-CUresult CUDAAPI cuModuleGetSurfRef(CUsurfref *pSurfRef, CUmodule hmod,
-                                    const char *name) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUsurfref *, CUmodule, const char *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuModuleGetSurfRef");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pSurfRef, hmod, name);
-}
-
-CUresult CUDAAPI cuLinkCreate(unsigned int numOptions, CUjit_option *options,
-                              void **optionValues, CUlinkState *stateOut) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(unsigned int, CUjit_option *, void **, CUlinkState *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuLinkCreate_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(numOptions, options, optionValues, stateOut);
-}
-
-CUresult CUDAAPI cuLinkAddData(CUlinkState state, CUjitInputType type,
-                               void *data, size_t size, const char *name,
-                               unsigned int numOptions, CUjit_option *options,
-                               void **optionValues) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUlinkState, CUjitInputType, void *, size_t,
-                          const char *, unsigned int, CUjit_option *, void **);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuLinkAddData_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(state, type, data, size, name, numOptions, options,
-                  optionValues);
-}
-
-CUresult CUDAAPI cuLinkAddFile(CUlinkState state, CUjitInputType type,
-                               const char *path, unsigned int numOptions,
-                               CUjit_option *options, void **optionValues) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUlinkState, CUjitInputType, const char *,
-                                      unsigned int, CUjit_option *, void **);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuLinkAddFile_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(state, type, path, numOptions, options, optionValues);
-}
-
-CUresult CUDAAPI cuLinkComplete(CUlinkState state, void **cubinOut,
-                                size_t *sizeOut) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUlinkState, void **, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuLinkComplete");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(state, cubinOut, sizeOut);
-}
-
-CUresult CUDAAPI cuLinkDestroy(CUlinkState state) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUlinkState);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuLinkDestroy");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(state);
-}
-
-CUresult CUDAAPI cuMemGetInfo(size_t *free, size_t *total) {
-  using FuncPtr = CUresult(CUDAAPI *)(size_t *, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemGetInfo_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(free, total);
-}
-
-CUresult CUDAAPI cuMemAlloc(CUdeviceptr *dptr, size_t bytesize) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdeviceptr *, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemAlloc_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dptr, bytesize);
-}
-
-CUresult CUDAAPI cuMemAllocPitch(CUdeviceptr *dptr, size_t *pPitch,
-                                 size_t WidthInBytes, size_t Height,
-                                 unsigned int ElementSizeBytes) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdeviceptr *, size_t *, size_t, size_t,
-                                      unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemAllocPitch_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dptr, pPitch, WidthInBytes, Height, ElementSizeBytes);
-}
-
-CUresult CUDAAPI cuMemFree(CUdeviceptr dptr) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdeviceptr);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemFree_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dptr);
-}
-
-CUresult CUDAAPI cuMemGetAddressRange(CUdeviceptr *pbase, size_t *psize,
-                                      CUdeviceptr dptr) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdeviceptr *, size_t *, CUdeviceptr);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemGetAddressRange_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pbase, psize, dptr);
-}
-
-CUresult CUDAAPI cuMemAllocHost(void **pp, size_t bytesize) {
-  using FuncPtr = CUresult(CUDAAPI *)(void **, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemAllocHost_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pp, bytesize);
-}
-
-CUresult CUDAAPI cuMemFreeHost(void *p) {
-  using FuncPtr = CUresult(CUDAAPI *)(void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemFreeHost");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(p);
-}
-
-CUresult CUDAAPI cuMemHostAlloc(void **pp, size_t bytesize,
-                                unsigned int Flags) {
-  using FuncPtr = CUresult(CUDAAPI *)(void **, size_t, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemHostAlloc");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pp, bytesize, Flags);
-}
-
-CUresult CUDAAPI cuMemHostGetDevicePointer(CUdeviceptr *pdptr, void *p,
-                                           unsigned int Flags) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdeviceptr *, void *, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemHostGetDevicePointer_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pdptr, p, Flags);
-}
-
-CUresult CUDAAPI cuMemHostGetFlags(unsigned int *pFlags, void *p) {
-  using FuncPtr = CUresult(CUDAAPI *)(unsigned int *, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemHostGetFlags");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pFlags, p);
-}
-
-CUresult CUDAAPI cuMemAllocManaged(CUdeviceptr *dptr, size_t bytesize,
-                                   unsigned int flags) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdeviceptr *, size_t, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemAllocManaged");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dptr, bytesize, flags);
-}
-
-CUresult CUDAAPI cuDeviceGetByPCIBusId(CUdevice *dev, const char *pciBusId) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdevice *, const char *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuDeviceGetByPCIBusId");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dev, pciBusId);
-}
-
-CUresult CUDAAPI cuDeviceGetPCIBusId(char *pciBusId, int len, CUdevice dev) {
-  using FuncPtr = CUresult(CUDAAPI *)(char *, int, CUdevice);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuDeviceGetPCIBusId");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pciBusId, len, dev);
-}
-
-CUresult CUDAAPI cuIpcGetEventHandle(CUipcEventHandle *pHandle, CUevent event) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUipcEventHandle *, CUevent);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuIpcGetEventHandle");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pHandle, event);
-}
-
-CUresult CUDAAPI cuIpcOpenEventHandle(CUevent *phEvent,
-                                      CUipcEventHandle handle) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUevent *, CUipcEventHandle);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuIpcOpenEventHandle");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(phEvent, handle);
-}
-
-CUresult CUDAAPI cuIpcGetMemHandle(CUipcMemHandle *pHandle, CUdeviceptr dptr) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUipcMemHandle *, CUdeviceptr);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuIpcGetMemHandle");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pHandle, dptr);
-}
-
-CUresult CUDAAPI cuIpcOpenMemHandle(CUdeviceptr *pdptr, CUipcMemHandle handle,
-                                    unsigned int Flags) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUdeviceptr *, CUipcMemHandle, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuIpcOpenMemHandle");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pdptr, handle, Flags);
-}
-
-CUresult CUDAAPI cuIpcCloseMemHandle(CUdeviceptr dptr) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdeviceptr);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuIpcCloseMemHandle");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dptr);
-}
-
-CUresult CUDAAPI cuMemHostRegister(void *p, size_t bytesize,
-                                   unsigned int Flags) {
-  using FuncPtr = CUresult(CUDAAPI *)(void *, size_t, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemHostRegister_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(p, bytesize, Flags);
-}
-
-CUresult CUDAAPI cuMemHostUnregister(void *p) {
-  using FuncPtr = CUresult(CUDAAPI *)(void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemHostUnregister");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(p);
-}
-
-CUresult CUDAAPI cuMemcpy(CUdeviceptr dst, CUdeviceptr src, size_t ByteCount) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdeviceptr, CUdeviceptr, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemcpy");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dst, src, ByteCount);
-}
-
-CUresult CUDAAPI cuMemcpyPeer(CUdeviceptr dstDevice, CUcontext dstContext,
-                              CUdeviceptr srcDevice, CUcontext srcContext,
-                              size_t ByteCount) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdeviceptr, CUcontext, CUdeviceptr,
-                                      CUcontext, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemcpyPeer");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dstDevice, dstContext, srcDevice, srcContext, ByteCount);
-}
-
-CUresult CUDAAPI cuMemcpyHtoD(CUdeviceptr dstDevice, const void *srcHost,
-                              size_t ByteCount) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdeviceptr, const void *, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemcpyHtoD_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dstDevice, srcHost, ByteCount);
-}
-
-CUresult CUDAAPI cuMemcpyDtoH(void *dstHost, CUdeviceptr srcDevice,
-                              size_t ByteCount) {
-  using FuncPtr = CUresult(CUDAAPI *)(void *, CUdeviceptr, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemcpyDtoH_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dstHost, srcDevice, ByteCount);
-}
-
-CUresult CUDAAPI cuMemcpyDtoD(CUdeviceptr dstDevice, CUdeviceptr srcDevice,
-                              size_t ByteCount) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdeviceptr, CUdeviceptr, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemcpyDtoD_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dstDevice, srcDevice, ByteCount);
-}
-
-CUresult CUDAAPI cuMemcpyDtoA(CUarray dstArray, size_t dstOffset,
-                              CUdeviceptr srcDevice, size_t ByteCount) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUarray, size_t, CUdeviceptr, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemcpyDtoA_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dstArray, dstOffset, srcDevice, ByteCount);
-}
-
-CUresult CUDAAPI cuMemcpyAtoD(CUdeviceptr dstDevice, CUarray srcArray,
-                              size_t srcOffset, size_t ByteCount) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdeviceptr, CUarray, size_t, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemcpyAtoD_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dstDevice, srcArray, srcOffset, ByteCount);
-}
-
-CUresult CUDAAPI cuMemcpyHtoA(CUarray dstArray, size_t dstOffset,
-                              const void *srcHost, size_t ByteCount) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUarray, size_t, const void *, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemcpyHtoA_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dstArray, dstOffset, srcHost, ByteCount);
-}
-
-CUresult CUDAAPI cuMemcpyAtoH(void *dstHost, CUarray srcArray, size_t srcOffset,
-                              size_t ByteCount) {
-  using FuncPtr = CUresult(CUDAAPI *)(void *, CUarray, size_t, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemcpyAtoH_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dstHost, srcArray, srcOffset, ByteCount);
-}
-
-CUresult CUDAAPI cuMemcpyAtoA(CUarray dstArray, size_t dstOffset,
-                              CUarray srcArray, size_t srcOffset,
-                              size_t ByteCount) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUarray, size_t, CUarray, size_t, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemcpyAtoA_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dstArray, dstOffset, srcArray, srcOffset, ByteCount);
-}
-
-CUresult CUDAAPI cuMemcpy2D(const CUDA_MEMCPY2D *pCopy) {
-  using FuncPtr = CUresult(CUDAAPI *)(const CUDA_MEMCPY2D *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemcpy2D_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pCopy);
-}
-
-CUresult CUDAAPI cuMemcpy2DUnaligned(const CUDA_MEMCPY2D *pCopy) {
-  using FuncPtr = CUresult(CUDAAPI *)(const CUDA_MEMCPY2D *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemcpy2DUnaligned_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pCopy);
-}
-
-CUresult CUDAAPI cuMemcpy3D(const CUDA_MEMCPY3D *pCopy) {
-  using FuncPtr = CUresult(CUDAAPI *)(const CUDA_MEMCPY3D *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemcpy3D_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pCopy);
-}
-
-CUresult CUDAAPI cuMemcpy3DPeer(const CUDA_MEMCPY3D_PEER *pCopy) {
-  using FuncPtr = CUresult(CUDAAPI *)(const CUDA_MEMCPY3D_PEER *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemcpy3DPeer");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pCopy);
-}
-
-CUresult CUDAAPI cuMemcpyAsync(CUdeviceptr dst, CUdeviceptr src,
-                               size_t ByteCount, CUstream hStream) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUdeviceptr, CUdeviceptr, size_t, CUstream);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemcpyAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dst, src, ByteCount, hStream);
-}
-
-CUresult CUDAAPI cuMemcpyPeerAsync(CUdeviceptr dstDevice, CUcontext dstContext,
-                                   CUdeviceptr srcDevice, CUcontext srcContext,
-                                   size_t ByteCount, CUstream hStream) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdeviceptr, CUcontext, CUdeviceptr,
-                                      CUcontext, size_t, CUstream);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemcpyPeerAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dstDevice, dstContext, srcDevice, srcContext, ByteCount,
-                  hStream);
-}
-
-CUresult CUDAAPI cuMemcpyHtoDAsync(CUdeviceptr dstDevice, const void *srcHost,
-                                   size_t ByteCount, CUstream hStream) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUdeviceptr, const void *, size_t, CUstream);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemcpyHtoDAsync_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dstDevice, srcHost, ByteCount, hStream);
-}
-
-CUresult CUDAAPI cuMemcpyDtoHAsync(void *dstHost, CUdeviceptr srcDevice,
-                                   size_t ByteCount, CUstream hStream) {
-  using FuncPtr = CUresult(CUDAAPI *)(void *, CUdeviceptr, size_t, CUstream);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemcpyDtoHAsync_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dstHost, srcDevice, ByteCount, hStream);
-}
-
-CUresult CUDAAPI cuMemcpyDtoDAsync(CUdeviceptr dstDevice, CUdeviceptr srcDevice,
-                                   size_t ByteCount, CUstream hStream) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUdeviceptr, CUdeviceptr, size_t, CUstream);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemcpyDtoDAsync_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dstDevice, srcDevice, ByteCount, hStream);
-}
-
-CUresult CUDAAPI cuMemcpyHtoAAsync(CUarray dstArray, size_t dstOffset,
-                                   const void *srcHost, size_t ByteCount,
-                                   CUstream hStream) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUarray, size_t, const void *, size_t, CUstream);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemcpyHtoAAsync_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dstArray, dstOffset, srcHost, ByteCount, hStream);
-}
-
-CUresult CUDAAPI cuMemcpyAtoHAsync(void *dstHost, CUarray srcArray,
-                                   size_t srcOffset, size_t ByteCount,
-                                   CUstream hStream) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(void *, CUarray, size_t, size_t, CUstream);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemcpyAtoHAsync_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dstHost, srcArray, srcOffset, ByteCount, hStream);
-}
-
-CUresult CUDAAPI cuMemcpy2DAsync(const CUDA_MEMCPY2D *pCopy, CUstream hStream) {
-  using FuncPtr = CUresult(CUDAAPI *)(const CUDA_MEMCPY2D *, CUstream);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemcpy2DAsync_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pCopy, hStream);
-}
-
-CUresult CUDAAPI cuMemcpy3DAsync(const CUDA_MEMCPY3D *pCopy, CUstream hStream) {
-  using FuncPtr = CUresult(CUDAAPI *)(const CUDA_MEMCPY3D *, CUstream);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemcpy3DAsync_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pCopy, hStream);
-}
-
-CUresult CUDAAPI cuMemcpy3DPeerAsync(const CUDA_MEMCPY3D_PEER *pCopy,
-                                     CUstream hStream) {
-  using FuncPtr = CUresult(CUDAAPI *)(const CUDA_MEMCPY3D_PEER *, CUstream);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemcpy3DPeerAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pCopy, hStream);
-}
-
-CUresult CUDAAPI cuMemsetD8(CUdeviceptr dstDevice, unsigned char uc, size_t N) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdeviceptr, unsigned char, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemsetD8_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dstDevice, uc, N);
-}
-
-CUresult CUDAAPI cuMemsetD16(CUdeviceptr dstDevice, unsigned short us,
-                             size_t N) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdeviceptr, unsigned short, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemsetD16_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dstDevice, us, N);
-}
-
-CUresult CUDAAPI cuMemsetD32(CUdeviceptr dstDevice, unsigned int ui, size_t N) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdeviceptr, unsigned int, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemsetD32_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dstDevice, ui, N);
-}
-
-CUresult CUDAAPI cuMemsetD2D8(CUdeviceptr dstDevice, size_t dstPitch,
-                              unsigned char uc, size_t Width, size_t Height) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUdeviceptr, size_t, unsigned char, size_t, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemsetD2D8_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dstDevice, dstPitch, uc, Width, Height);
-}
-
-CUresult CUDAAPI cuMemsetD2D16(CUdeviceptr dstDevice, size_t dstPitch,
-                               unsigned short us, size_t Width, size_t Height) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUdeviceptr, size_t, unsigned short, size_t, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemsetD2D16_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dstDevice, dstPitch, us, Width, Height);
-}
-
-CUresult CUDAAPI cuMemsetD2D32(CUdeviceptr dstDevice, size_t dstPitch,
-                               unsigned int ui, size_t Width, size_t Height) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUdeviceptr, size_t, unsigned int, size_t, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemsetD2D32_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dstDevice, dstPitch, ui, Width, Height);
-}
-
-CUresult CUDAAPI cuMemsetD8Async(CUdeviceptr dstDevice, unsigned char uc,
-                                 size_t N, CUstream hStream) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUdeviceptr, unsigned char, size_t, CUstream);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemsetD8Async");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dstDevice, uc, N, hStream);
-}
-
-CUresult CUDAAPI cuMemsetD16Async(CUdeviceptr dstDevice, unsigned short us,
-                                  size_t N, CUstream hStream) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUdeviceptr, unsigned short, size_t, CUstream);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemsetD16Async");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dstDevice, us, N, hStream);
-}
-
-CUresult CUDAAPI cuMemsetD32Async(CUdeviceptr dstDevice, unsigned int ui,
-                                  size_t N, CUstream hStream) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUdeviceptr, unsigned int, size_t, CUstream);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemsetD32Async");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dstDevice, ui, N, hStream);
-}
-
-CUresult CUDAAPI cuMemsetD2D8Async(CUdeviceptr dstDevice, size_t dstPitch,
-                                   unsigned char uc, size_t Width,
-                                   size_t Height, CUstream hStream) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdeviceptr, size_t, unsigned char,
-                                      size_t, size_t, CUstream);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemsetD2D8Async");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dstDevice, dstPitch, uc, Width, Height, hStream);
-}
-
-CUresult CUDAAPI cuMemsetD2D16Async(CUdeviceptr dstDevice, size_t dstPitch,
-                                    unsigned short us, size_t Width,
-                                    size_t Height, CUstream hStream) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdeviceptr, size_t, unsigned short,
-                                      size_t, size_t, CUstream);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemsetD2D16Async");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dstDevice, dstPitch, us, Width, Height, hStream);
-}
-
-CUresult CUDAAPI cuMemsetD2D32Async(CUdeviceptr dstDevice, size_t dstPitch,
-                                    unsigned int ui, size_t Width,
-                                    size_t Height, CUstream hStream) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdeviceptr, size_t, unsigned int, size_t,
-                                      size_t, CUstream);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemsetD2D32Async");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dstDevice, dstPitch, ui, Width, Height, hStream);
-}
-
-CUresult CUDAAPI cuArrayCreate(CUarray *pHandle,
-                               const CUDA_ARRAY_DESCRIPTOR *pAllocateArray) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUarray *, const CUDA_ARRAY_DESCRIPTOR *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuArrayCreate_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pHandle, pAllocateArray);
-}
-
-CUresult CUDAAPI cuArrayGetDescriptor(CUDA_ARRAY_DESCRIPTOR *pArrayDescriptor,
-                                      CUarray hArray) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUDA_ARRAY_DESCRIPTOR *, CUarray);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuArrayGetDescriptor_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pArrayDescriptor, hArray);
-}
-
-CUresult CUDAAPI cuArrayDestroy(CUarray hArray) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUarray);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuArrayDestroy");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hArray);
-}
-
-CUresult CUDAAPI cuArray3DCreate(
-    CUarray *pHandle, const CUDA_ARRAY3D_DESCRIPTOR *pAllocateArray) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUarray *, const CUDA_ARRAY3D_DESCRIPTOR *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuArray3DCreate_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pHandle, pAllocateArray);
-}
-
-CUresult CUDAAPI cuArray3DGetDescriptor(
-    CUDA_ARRAY3D_DESCRIPTOR *pArrayDescriptor, CUarray hArray) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUDA_ARRAY3D_DESCRIPTOR *, CUarray);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuArray3DGetDescriptor_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pArrayDescriptor, hArray);
-}
-
-CUresult CUDAAPI
-cuMipmappedArrayCreate(CUmipmappedArray *pHandle,
-                       const CUDA_ARRAY3D_DESCRIPTOR *pMipmappedArrayDesc,
-                       unsigned int numMipmapLevels) {
-  using FuncPtr = CUresult(CUDAAPI *)(
-      CUmipmappedArray *, const CUDA_ARRAY3D_DESCRIPTOR *, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMipmappedArrayCreate");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pHandle, pMipmappedArrayDesc, numMipmapLevels);
-}
-
-CUresult CUDAAPI cuMipmappedArrayGetLevel(CUarray *pLevelArray,
-                                          CUmipmappedArray hMipmappedArray,
-                                          unsigned int level) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUarray *, CUmipmappedArray, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMipmappedArrayGetLevel");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pLevelArray, hMipmappedArray, level);
-}
-
-CUresult CUDAAPI cuMipmappedArrayDestroy(CUmipmappedArray hMipmappedArray) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUmipmappedArray);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMipmappedArrayDestroy");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hMipmappedArray);
-}
-
-CUresult CUDAAPI cuPointerGetAttribute(void *data,
-                                       CUpointer_attribute attribute,
-                                       CUdeviceptr ptr) {
-  using FuncPtr = CUresult(CUDAAPI *)(void *, CUpointer_attribute, CUdeviceptr);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuPointerGetAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(data, attribute, ptr);
-}
-
-CUresult CUDAAPI cuMemPrefetchAsync(CUdeviceptr devPtr, size_t count,
-                                    CUdevice dstDevice, CUstream hStream) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdeviceptr, size_t, CUdevice, CUstream);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemPrefetchAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(devPtr, count, dstDevice, hStream);
-}
-
-CUresult CUDAAPI cuMemAdvise(CUdeviceptr devPtr, size_t count,
-                             CUmem_advise advice, CUdevice device) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUdeviceptr, size_t, CUmem_advise, CUdevice);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemAdvise");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(devPtr, count, advice, device);
-}
-
-CUresult CUDAAPI cuMemRangeGetAttribute(void *data, size_t dataSize,
-                                        CUmem_range_attribute attribute,
-                                        CUdeviceptr devPtr, size_t count) {
-  using FuncPtr = CUresult(CUDAAPI *)(void *, size_t, CUmem_range_attribute,
-                                      CUdeviceptr, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemRangeGetAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(data, dataSize, attribute, devPtr, count);
-}
-
-CUresult CUDAAPI cuMemRangeGetAttributes(void **data, size_t *dataSizes,
-                                         CUmem_range_attribute *attributes,
-                                         size_t numAttributes,
-                                         CUdeviceptr devPtr, size_t count) {
-  using FuncPtr = CUresult(CUDAAPI *)(
-      void **, size_t *, CUmem_range_attribute *, size_t, CUdeviceptr, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemRangeGetAttributes");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(data, dataSizes, attributes, numAttributes, devPtr, count);
-}
-
-CUresult CUDAAPI cuPointerSetAttribute(const void *value,
-                                       CUpointer_attribute attribute,
-                                       CUdeviceptr ptr) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(const void *, CUpointer_attribute, CUdeviceptr);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuPointerSetAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(value, attribute, ptr);
-}
-
-CUresult CUDAAPI cuPointerGetAttributes(unsigned int numAttributes,
-                                        CUpointer_attribute *attributes,
-                                        void **data, CUdeviceptr ptr) {
-  using FuncPtr = CUresult(CUDAAPI *)(unsigned int, CUpointer_attribute *,
-                                      void **, CUdeviceptr);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuPointerGetAttributes");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(numAttributes, attributes, data, ptr);
-}
-
-CUresult CUDAAPI cuStreamCreate(CUstream *phStream, unsigned int Flags) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUstream *, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuStreamCreate");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(phStream, Flags);
-}
-
-CUresult CUDAAPI cuStreamCreateWithPriority(CUstream *phStream,
-                                            unsigned int flags, int priority) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUstream *, unsigned int, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuStreamCreateWithPriority");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(phStream, flags, priority);
-}
-
-CUresult CUDAAPI cuStreamGetPriority(CUstream hStream, int *priority) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUstream, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuStreamGetPriority");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hStream, priority);
-}
-
-CUresult CUDAAPI cuStreamGetFlags(CUstream hStream, unsigned int *flags) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUstream, unsigned int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuStreamGetFlags");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hStream, flags);
-}
-
-CUresult CUDAAPI cuStreamGetCtx(CUstream hStream, CUcontext *pctx) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUstream, CUcontext *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuStreamGetCtx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hStream, pctx);
-}
-
-CUresult CUDAAPI cuStreamWaitEvent(CUstream hStream, CUevent hEvent,
-                                   unsigned int Flags) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUstream, CUevent, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuStreamWaitEvent");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hStream, hEvent, Flags);
-}
-
-CUresult CUDAAPI cuStreamAddCallback(CUstream hStream,
-                                     CUstreamCallback callback, void *userData,
-                                     unsigned int flags) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUstream, CUstreamCallback, void *, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuStreamAddCallback");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hStream, callback, userData, flags);
-}
-
-CUresult CUDAAPI cuStreamBeginCapture(CUstream hStream,
-                                      CUstreamCaptureMode mode) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUstream, CUstreamCaptureMode);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuStreamBeginCapture_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hStream, mode);
-}
-
-CUresult CUDAAPI cuThreadExchangeStreamCaptureMode(CUstreamCaptureMode *mode) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUstreamCaptureMode *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cuThreadExchangeStreamCaptureMode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(mode);
-}
-
-CUresult CUDAAPI cuStreamEndCapture(CUstream hStream, CUgraph *phGraph) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUstream, CUgraph *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuStreamEndCapture");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hStream, phGraph);
-}
-
-CUresult CUDAAPI cuStreamIsCapturing(CUstream hStream,
-                                     CUstreamCaptureStatus *captureStatus) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUstream, CUstreamCaptureStatus *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuStreamIsCapturing");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hStream, captureStatus);
-}
-
-CUresult CUDAAPI cuStreamGetCaptureInfo(CUstream hStream,
-                                        CUstreamCaptureStatus *captureStatus,
-                                        cuuint64_t *id) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUstream, CUstreamCaptureStatus *, cuuint64_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuStreamGetCaptureInfo");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hStream, captureStatus, id);
-}
-
-CUresult CUDAAPI cuStreamAttachMemAsync(CUstream hStream, CUdeviceptr dptr,
-                                        size_t length, unsigned int flags) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUstream, CUdeviceptr, size_t, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuStreamAttachMemAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hStream, dptr, length, flags);
-}
-
-CUresult CUDAAPI cuStreamQuery(CUstream hStream) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUstream);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuStreamQuery");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hStream);
-}
-
-CUresult CUDAAPI cuStreamSynchronize(CUstream hStream) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUstream);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuStreamSynchronize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hStream);
-}
-
-CUresult CUDAAPI cuStreamDestroy(CUstream hStream) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUstream);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuStreamDestroy_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hStream);
-}
-
-CUresult CUDAAPI cuEventCreate(CUevent *phEvent, unsigned int Flags) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUevent *, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuEventCreate");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(phEvent, Flags);
-}
-
-CUresult CUDAAPI cuEventRecord(CUevent hEvent, CUstream hStream) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUevent, CUstream);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuEventRecord");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hEvent, hStream);
-}
-
-CUresult CUDAAPI cuEventQuery(CUevent hEvent) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUevent);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuEventQuery");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hEvent);
-}
-
-CUresult CUDAAPI cuEventSynchronize(CUevent hEvent) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUevent);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuEventSynchronize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hEvent);
-}
-
-CUresult CUDAAPI cuEventDestroy(CUevent hEvent) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUevent);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuEventDestroy_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hEvent);
-}
-
-CUresult CUDAAPI cuEventElapsedTime(float *pMilliseconds, CUevent hStart,
-                                    CUevent hEnd) {
-  using FuncPtr = CUresult(CUDAAPI *)(float *, CUevent, CUevent);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuEventElapsedTime");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pMilliseconds, hStart, hEnd);
-}
-
-CUresult CUDAAPI
-cuImportExternalMemory(CUexternalMemory *extMem_out,
-                       const CUDA_EXTERNAL_MEMORY_HANDLE_DESC *memHandleDesc) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUexternalMemory *,
-                                      const CUDA_EXTERNAL_MEMORY_HANDLE_DESC *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuImportExternalMemory");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(extMem_out, memHandleDesc);
-}
-
-CUresult CUDAAPI cuExternalMemoryGetMappedBuffer(
-    CUdeviceptr *devPtr, CUexternalMemory extMem,
-    const CUDA_EXTERNAL_MEMORY_BUFFER_DESC *bufferDesc) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdeviceptr *, CUexternalMemory,
-                                      const CUDA_EXTERNAL_MEMORY_BUFFER_DESC *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuExternalMemoryGetMappedBuffer");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(devPtr, extMem, bufferDesc);
-}
-
-CUresult CUDAAPI cuExternalMemoryGetMappedMipmappedArray(
-    CUmipmappedArray *mipmap, CUexternalMemory extMem,
-    const CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC *mipmapDesc) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUmipmappedArray *, CUexternalMemory,
-                          const CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cuExternalMemoryGetMappedMipmappedArray");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(mipmap, extMem, mipmapDesc);
-}
-
-CUresult CUDAAPI cuDestroyExternalMemory(CUexternalMemory extMem) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUexternalMemory);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuDestroyExternalMemory");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(extMem);
-}
-
-CUresult CUDAAPI cuImportExternalSemaphore(
-    CUexternalSemaphore *extSem_out,
-    const CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC *semHandleDesc) {
-  using FuncPtr = CUresult(CUDAAPI *)(
-      CUexternalSemaphore *, const CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuImportExternalSemaphore");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(extSem_out, semHandleDesc);
-}
-
-CUresult CUDAAPI cuSignalExternalSemaphoresAsync(
-    const CUexternalSemaphore *extSemArray,
-    const CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS *paramsArray,
-    unsigned int numExtSems, CUstream stream) {
-  using FuncPtr = CUresult(CUDAAPI *)(
-      const CUexternalSemaphore *,
-      const CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS *, unsigned int, CUstream);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuSignalExternalSemaphoresAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(extSemArray, paramsArray, numExtSems, stream);
-}
-
-CUresult CUDAAPI cuWaitExternalSemaphoresAsync(
-    const CUexternalSemaphore *extSemArray,
-    const CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS *paramsArray,
-    unsigned int numExtSems, CUstream stream) {
-  using FuncPtr = CUresult(CUDAAPI *)(
-      const CUexternalSemaphore *, const CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS *,
-      unsigned int, CUstream);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuWaitExternalSemaphoresAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(extSemArray, paramsArray, numExtSems, stream);
-}
-
-CUresult CUDAAPI cuDestroyExternalSemaphore(CUexternalSemaphore extSem) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUexternalSemaphore);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuDestroyExternalSemaphore");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(extSem);
-}
-
-CUresult CUDAAPI cuStreamWaitValue32(CUstream stream, CUdeviceptr addr,
-                                     cuuint32_t value, unsigned int flags) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUstream, CUdeviceptr, cuuint32_t, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuStreamWaitValue32");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(stream, addr, value, flags);
-}
-
-CUresult CUDAAPI cuStreamWaitValue64(CUstream stream, CUdeviceptr addr,
-                                     cuuint64_t value, unsigned int flags) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUstream, CUdeviceptr, cuuint64_t, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuStreamWaitValue64");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(stream, addr, value, flags);
-}
-
-CUresult CUDAAPI cuStreamWriteValue32(CUstream stream, CUdeviceptr addr,
-                                      cuuint32_t value, unsigned int flags) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUstream, CUdeviceptr, cuuint32_t, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuStreamWriteValue32");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(stream, addr, value, flags);
-}
-
-CUresult CUDAAPI cuStreamWriteValue64(CUstream stream, CUdeviceptr addr,
-                                      cuuint64_t value, unsigned int flags) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUstream, CUdeviceptr, cuuint64_t, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuStreamWriteValue64");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(stream, addr, value, flags);
-}
-
-CUresult CUDAAPI cuStreamBatchMemOp(CUstream stream, unsigned int count,
-                                    CUstreamBatchMemOpParams *paramArray,
-                                    unsigned int flags) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUstream, unsigned int,
-                                      CUstreamBatchMemOpParams *, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuStreamBatchMemOp");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(stream, count, paramArray, flags);
-}
-
-CUresult CUDAAPI cuFuncGetAttribute(int *pi, CUfunction_attribute attrib,
-                                    CUfunction hfunc) {
-  using FuncPtr = CUresult(CUDAAPI *)(int *, CUfunction_attribute, CUfunction);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuFuncGetAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pi, attrib, hfunc);
-}
-
-CUresult CUDAAPI cuFuncSetAttribute(CUfunction hfunc,
-                                    CUfunction_attribute attrib, int value) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUfunction, CUfunction_attribute, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuFuncSetAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hfunc, attrib, value);
-}
-
-CUresult CUDAAPI cuFuncSetCacheConfig(CUfunction hfunc, CUfunc_cache config) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUfunction, CUfunc_cache);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuFuncSetCacheConfig");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hfunc, config);
-}
-
-CUresult CUDAAPI cuFuncSetSharedMemConfig(CUfunction hfunc,
-                                          CUsharedconfig config) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUfunction, CUsharedconfig);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuFuncSetSharedMemConfig");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hfunc, config);
-}
-
-CUresult CUDAAPI cuLaunchKernel(CUfunction f, unsigned int gridDimX,
-                                unsigned int gridDimY, unsigned int gridDimZ,
-                                unsigned int blockDimX, unsigned int blockDimY,
-                                unsigned int blockDimZ,
-                                unsigned int sharedMemBytes, CUstream hStream,
-                                void **kernelParams, void **extra) {
-  using FuncPtr = CUresult(CUDAAPI *)(
-      CUfunction, unsigned int, unsigned int, unsigned int, unsigned int,
-      unsigned int, unsigned int, unsigned int, CUstream, void **, void **);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuLaunchKernel");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(f, gridDimX, gridDimY, gridDimZ, blockDimX, blockDimY,
-                  blockDimZ, sharedMemBytes, hStream, kernelParams, extra);
-}
-
-CUresult CUDAAPI cuLaunchCooperativeKernel(
-    CUfunction f, unsigned int gridDimX, unsigned int gridDimY,
-    unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY,
-    unsigned int blockDimZ, unsigned int sharedMemBytes, CUstream hStream,
-    void **kernelParams) {
-  using FuncPtr = CUresult(CUDAAPI *)(
-      CUfunction, unsigned int, unsigned int, unsigned int, unsigned int,
-      unsigned int, unsigned int, unsigned int, CUstream, void **);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuLaunchCooperativeKernel");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(f, gridDimX, gridDimY, gridDimZ, blockDimX, blockDimY,
-                  blockDimZ, sharedMemBytes, hStream, kernelParams);
-}
-
-CUresult CUDAAPI cuLaunchCooperativeKernelMultiDevice(
-    CUDA_LAUNCH_PARAMS *launchParamsList, unsigned int numDevices,
-    unsigned int flags) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUDA_LAUNCH_PARAMS *, unsigned int, unsigned int);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cuLaunchCooperativeKernelMultiDevice");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(launchParamsList, numDevices, flags);
-}
-
-CUresult CUDAAPI cuLaunchHostFunc(CUstream hStream, CUhostFn fn,
-                                  void *userData) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUstream, CUhostFn, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuLaunchHostFunc");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hStream, fn, userData);
-}
-
-__CUDA_DEPRECATED CUresult CUDAAPI cuFuncSetBlockShape(CUfunction hfunc, int x,
-                                                       int y, int z) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUfunction, int, int, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuFuncSetBlockShape");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hfunc, x, y, z);
-}
-
-__CUDA_DEPRECATED CUresult CUDAAPI cuFuncSetSharedSize(CUfunction hfunc,
-                                                       unsigned int bytes) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUfunction, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuFuncSetSharedSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hfunc, bytes);
-}
-
-__CUDA_DEPRECATED CUresult CUDAAPI cuParamSetSize(CUfunction hfunc,
-                                                  unsigned int numbytes) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUfunction, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuParamSetSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hfunc, numbytes);
-}
-
-__CUDA_DEPRECATED CUresult CUDAAPI cuParamSeti(CUfunction hfunc, int offset,
-                                               unsigned int value) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUfunction, int, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuParamSeti");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hfunc, offset, value);
-}
-
-__CUDA_DEPRECATED CUresult CUDAAPI cuParamSetf(CUfunction hfunc, int offset,
-                                               float value) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUfunction, int, float);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuParamSetf");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hfunc, offset, value);
-}
-
-__CUDA_DEPRECATED CUresult CUDAAPI cuParamSetv(CUfunction hfunc, int offset,
-                                               void *ptr,
-                                               unsigned int numbytes) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUfunction, int, void *, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuParamSetv");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hfunc, offset, ptr, numbytes);
-}
-
-__CUDA_DEPRECATED CUresult CUDAAPI cuLaunch(CUfunction f) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUfunction);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuLaunch");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(f);
-}
-
-__CUDA_DEPRECATED CUresult CUDAAPI cuLaunchGrid(CUfunction f, int grid_width,
-                                                int grid_height) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUfunction, int, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuLaunchGrid");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(f, grid_width, grid_height);
-}
-
-__CUDA_DEPRECATED CUresult CUDAAPI cuLaunchGridAsync(CUfunction f,
-                                                     int grid_width,
-                                                     int grid_height,
-                                                     CUstream hStream) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUfunction, int, int, CUstream);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuLaunchGridAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(f, grid_width, grid_height, hStream);
-}
-
-__CUDA_DEPRECATED CUresult CUDAAPI cuParamSetTexRef(CUfunction hfunc,
-                                                    int texunit,
-                                                    CUtexref hTexRef) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUfunction, int, CUtexref);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuParamSetTexRef");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hfunc, texunit, hTexRef);
-}
-
-CUresult CUDAAPI cuGraphCreate(CUgraph *phGraph, unsigned int flags) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraph *, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphCreate");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(phGraph, flags);
-}
-
-CUresult CUDAAPI cuGraphAddKernelNode(
-    CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies,
-    size_t numDependencies, const CUDA_KERNEL_NODE_PARAMS *nodeParams) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUgraphNode *, CUgraph, const CUgraphNode *, size_t,
-                          const CUDA_KERNEL_NODE_PARAMS *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphAddKernelNode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(phGraphNode, hGraph, dependencies, numDependencies,
-                  nodeParams);
-}
-
-CUresult CUDAAPI cuGraphKernelNodeGetParams(
-    CUgraphNode hNode, CUDA_KERNEL_NODE_PARAMS *nodeParams) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraphNode, CUDA_KERNEL_NODE_PARAMS *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphKernelNodeGetParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hNode, nodeParams);
-}
-
-CUresult CUDAAPI cuGraphKernelNodeSetParams(
-    CUgraphNode hNode, const CUDA_KERNEL_NODE_PARAMS *nodeParams) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUgraphNode, const CUDA_KERNEL_NODE_PARAMS *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphKernelNodeSetParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hNode, nodeParams);
-}
-
-CUresult CUDAAPI cuGraphAddMemcpyNode(CUgraphNode *phGraphNode, CUgraph hGraph,
-                                      const CUgraphNode *dependencies,
-                                      size_t numDependencies,
-                                      const CUDA_MEMCPY3D *copyParams,
-                                      CUcontext ctx) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUgraphNode *, CUgraph, const CUgraphNode *, size_t,
-                          const CUDA_MEMCPY3D *, CUcontext);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphAddMemcpyNode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(phGraphNode, hGraph, dependencies, numDependencies,
-                  copyParams, ctx);
-}
-
-CUresult CUDAAPI cuGraphMemcpyNodeGetParams(CUgraphNode hNode,
-                                            CUDA_MEMCPY3D *nodeParams) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraphNode, CUDA_MEMCPY3D *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphMemcpyNodeGetParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hNode, nodeParams);
-}
-
-CUresult CUDAAPI cuGraphMemcpyNodeSetParams(CUgraphNode hNode,
-                                            const CUDA_MEMCPY3D *nodeParams) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraphNode, const CUDA_MEMCPY3D *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphMemcpyNodeSetParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hNode, nodeParams);
-}
-
-CUresult CUDAAPI cuGraphAddMemsetNode(
-    CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies,
-    size_t numDependencies, const CUDA_MEMSET_NODE_PARAMS *memsetParams,
-    CUcontext ctx) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUgraphNode *, CUgraph, const CUgraphNode *, size_t,
-                          const CUDA_MEMSET_NODE_PARAMS *, CUcontext);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphAddMemsetNode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(phGraphNode, hGraph, dependencies, numDependencies,
-                  memsetParams, ctx);
-}
-
-CUresult CUDAAPI cuGraphMemsetNodeGetParams(
-    CUgraphNode hNode, CUDA_MEMSET_NODE_PARAMS *nodeParams) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraphNode, CUDA_MEMSET_NODE_PARAMS *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphMemsetNodeGetParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hNode, nodeParams);
-}
-
-CUresult CUDAAPI cuGraphMemsetNodeSetParams(
-    CUgraphNode hNode, const CUDA_MEMSET_NODE_PARAMS *nodeParams) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUgraphNode, const CUDA_MEMSET_NODE_PARAMS *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphMemsetNodeSetParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hNode, nodeParams);
-}
-
-CUresult CUDAAPI cuGraphAddHostNode(CUgraphNode *phGraphNode, CUgraph hGraph,
-                                    const CUgraphNode *dependencies,
-                                    size_t numDependencies,
-                                    const CUDA_HOST_NODE_PARAMS *nodeParams) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUgraphNode *, CUgraph, const CUgraphNode *, size_t,
-                          const CUDA_HOST_NODE_PARAMS *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphAddHostNode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(phGraphNode, hGraph, dependencies, numDependencies,
-                  nodeParams);
-}
-
-CUresult CUDAAPI cuGraphHostNodeGetParams(CUgraphNode hNode,
-                                          CUDA_HOST_NODE_PARAMS *nodeParams) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraphNode, CUDA_HOST_NODE_PARAMS *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphHostNodeGetParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hNode, nodeParams);
-}
-
-CUresult CUDAAPI cuGraphHostNodeSetParams(
-    CUgraphNode hNode, const CUDA_HOST_NODE_PARAMS *nodeParams) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUgraphNode, const CUDA_HOST_NODE_PARAMS *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphHostNodeSetParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hNode, nodeParams);
-}
-
-CUresult CUDAAPI cuGraphAddChildGraphNode(CUgraphNode *phGraphNode,
-                                          CUgraph hGraph,
-                                          const CUgraphNode *dependencies,
-                                          size_t numDependencies,
-                                          CUgraph childGraph) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraphNode *, CUgraph,
-                                      const CUgraphNode *, size_t, CUgraph);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphAddChildGraphNode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(phGraphNode, hGraph, dependencies, numDependencies,
-                  childGraph);
-}
-
-CUresult CUDAAPI cuGraphChildGraphNodeGetGraph(CUgraphNode hNode,
-                                               CUgraph *phGraph) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraphNode, CUgraph *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphChildGraphNodeGetGraph");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hNode, phGraph);
-}
-
-CUresult CUDAAPI cuGraphAddEmptyNode(CUgraphNode *phGraphNode, CUgraph hGraph,
-                                     const CUgraphNode *dependencies,
-                                     size_t numDependencies) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUgraphNode *, CUgraph, const CUgraphNode *, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphAddEmptyNode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(phGraphNode, hGraph, dependencies, numDependencies);
-}
-
-CUresult CUDAAPI cuGraphClone(CUgraph *phGraphClone, CUgraph originalGraph) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraph *, CUgraph);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphClone");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(phGraphClone, originalGraph);
-}
-
-CUresult CUDAAPI cuGraphNodeFindInClone(CUgraphNode *phNode,
-                                        CUgraphNode hOriginalNode,
-                                        CUgraph hClonedGraph) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraphNode *, CUgraphNode, CUgraph);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphNodeFindInClone");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(phNode, hOriginalNode, hClonedGraph);
-}
-
-CUresult CUDAAPI cuGraphNodeGetType(CUgraphNode hNode, CUgraphNodeType *type) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraphNode, CUgraphNodeType *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphNodeGetType");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hNode, type);
-}
-
-CUresult CUDAAPI cuGraphGetNodes(CUgraph hGraph, CUgraphNode *nodes,
-                                 size_t *numNodes) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraph, CUgraphNode *, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphGetNodes");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hGraph, nodes, numNodes);
-}
-
-CUresult CUDAAPI cuGraphGetRootNodes(CUgraph hGraph, CUgraphNode *rootNodes,
-                                     size_t *numRootNodes) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraph, CUgraphNode *, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphGetRootNodes");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hGraph, rootNodes, numRootNodes);
-}
-
-CUresult CUDAAPI cuGraphGetEdges(CUgraph hGraph, CUgraphNode *from,
-                                 CUgraphNode *to, size_t *numEdges) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUgraph, CUgraphNode *, CUgraphNode *, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphGetEdges");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hGraph, from, to, numEdges);
-}
-
-CUresult CUDAAPI cuGraphNodeGetDependencies(CUgraphNode hNode,
-                                            CUgraphNode *dependencies,
-                                            size_t *numDependencies) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraphNode, CUgraphNode *, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphNodeGetDependencies");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hNode, dependencies, numDependencies);
-}
-
-CUresult CUDAAPI cuGraphNodeGetDependentNodes(CUgraphNode hNode,
-                                              CUgraphNode *dependentNodes,
-                                              size_t *numDependentNodes) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraphNode, CUgraphNode *, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphNodeGetDependentNodes");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hNode, dependentNodes, numDependentNodes);
-}
-
-CUresult CUDAAPI cuGraphAddDependencies(CUgraph hGraph, const CUgraphNode *from,
-                                        const CUgraphNode *to,
-                                        size_t numDependencies) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraph, const CUgraphNode *,
-                                      const CUgraphNode *, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphAddDependencies");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hGraph, from, to, numDependencies);
-}
-
-CUresult CUDAAPI cuGraphRemoveDependencies(CUgraph hGraph,
-                                           const CUgraphNode *from,
-                                           const CUgraphNode *to,
-                                           size_t numDependencies) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraph, const CUgraphNode *,
-                                      const CUgraphNode *, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphRemoveDependencies");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hGraph, from, to, numDependencies);
-}
-
-CUresult CUDAAPI cuGraphDestroyNode(CUgraphNode hNode) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraphNode);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphDestroyNode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hNode);
-}
-
-CUresult CUDAAPI cuGraphInstantiate(CUgraphExec *phGraphExec, CUgraph hGraph,
-                                    CUgraphNode *phErrorNode, char *logBuffer,
-                                    size_t bufferSize) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraphExec *, CUgraph, CUgraphNode *,
-                                      char *, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphInstantiate");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(phGraphExec, hGraph, phErrorNode, logBuffer, bufferSize);
-}
-
-CUresult CUDAAPI
-cuGraphExecKernelNodeSetParams(CUgraphExec hGraphExec, CUgraphNode hNode,
-                               const CUDA_KERNEL_NODE_PARAMS *nodeParams) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraphExec, CUgraphNode,
-                                      const CUDA_KERNEL_NODE_PARAMS *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphExecKernelNodeSetParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hGraphExec, hNode, nodeParams);
-}
-
-CUresult CUDAAPI cuGraphLaunch(CUgraphExec hGraphExec, CUstream hStream) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraphExec, CUstream);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphLaunch");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hGraphExec, hStream);
-}
-
-CUresult CUDAAPI cuGraphExecDestroy(CUgraphExec hGraphExec) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraphExec);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphExecDestroy");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hGraphExec);
-}
-
-CUresult CUDAAPI cuGraphDestroy(CUgraph hGraph) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraph);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphDestroy");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hGraph);
-}
-
-CUresult CUDAAPI cuOccupancyMaxActiveBlocksPerMultiprocessor(
-    int *numBlocks, CUfunction func, int blockSize, size_t dynamicSMemSize) {
-  using FuncPtr = CUresult(CUDAAPI *)(int *, CUfunction, int, size_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cuOccupancyMaxActiveBlocksPerMultiprocessor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(numBlocks, func, blockSize, dynamicSMemSize);
-}
-
-CUresult CUDAAPI cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(
-    int *numBlocks, CUfunction func, int blockSize, size_t dynamicSMemSize,
-    unsigned int flags) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(int *, CUfunction, int, size_t, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>(
-      "cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(numBlocks, func, blockSize, dynamicSMemSize, flags);
-}
-
-CUresult CUDAAPI cuOccupancyMaxPotentialBlockSize(
-    int *minGridSize, int *blockSize, CUfunction func,
-    CUoccupancyB2DSize blockSizeToDynamicSMemSize, size_t dynamicSMemSize,
-    int blockSizeLimit) {
-  using FuncPtr = CUresult(CUDAAPI *)(int *, int *, CUfunction,
-                                      CUoccupancyB2DSize, size_t, int);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cuOccupancyMaxPotentialBlockSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(minGridSize, blockSize, func, blockSizeToDynamicSMemSize,
-                  dynamicSMemSize, blockSizeLimit);
-}
-
-CUresult CUDAAPI cuOccupancyMaxPotentialBlockSizeWithFlags(
-    int *minGridSize, int *blockSize, CUfunction func,
-    CUoccupancyB2DSize blockSizeToDynamicSMemSize, size_t dynamicSMemSize,
-    int blockSizeLimit, unsigned int flags) {
-  using FuncPtr = CUresult(CUDAAPI *)(
-      int *, int *, CUfunction, CUoccupancyB2DSize, size_t, int, unsigned int);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cuOccupancyMaxPotentialBlockSizeWithFlags");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(minGridSize, blockSize, func, blockSizeToDynamicSMemSize,
-                  dynamicSMemSize, blockSizeLimit, flags);
-}
-
-CUresult CUDAAPI cuTexRefSetArray(CUtexref hTexRef, CUarray hArray,
-                                  unsigned int Flags) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUtexref, CUarray, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexRefSetArray");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hTexRef, hArray, Flags);
-}
-
-CUresult CUDAAPI cuTexRefSetMipmappedArray(CUtexref hTexRef,
-                                           CUmipmappedArray hMipmappedArray,
-                                           unsigned int Flags) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUtexref, CUmipmappedArray, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexRefSetMipmappedArray");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hTexRef, hMipmappedArray, Flags);
-}
-
-CUresult CUDAAPI cuTexRefSetAddress(size_t *ByteOffset, CUtexref hTexRef,
-                                    CUdeviceptr dptr, size_t bytes) {
-  using FuncPtr = CUresult(CUDAAPI *)(size_t *, CUtexref, CUdeviceptr, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexRefSetAddress_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(ByteOffset, hTexRef, dptr, bytes);
-}
-
-CUresult CUDAAPI cuTexRefSetAddress2D(CUtexref hTexRef,
-                                      const CUDA_ARRAY_DESCRIPTOR *desc,
-                                      CUdeviceptr dptr, size_t Pitch) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUtexref, const CUDA_ARRAY_DESCRIPTOR *,
-                                      CUdeviceptr, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexRefSetAddress2D_v3");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hTexRef, desc, dptr, Pitch);
-}
-
-CUresult CUDAAPI cuTexRefSetFormat(CUtexref hTexRef, CUarray_format fmt,
-                                   int NumPackedComponents) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUtexref, CUarray_format, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexRefSetFormat");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hTexRef, fmt, NumPackedComponents);
-}
-
-CUresult CUDAAPI cuTexRefSetAddressMode(CUtexref hTexRef, int dim,
-                                        CUaddress_mode am) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUtexref, int, CUaddress_mode);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexRefSetAddressMode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hTexRef, dim, am);
-}
-
-CUresult CUDAAPI cuTexRefSetFilterMode(CUtexref hTexRef, CUfilter_mode fm) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUtexref, CUfilter_mode);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexRefSetFilterMode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hTexRef, fm);
-}
-
-CUresult CUDAAPI cuTexRefSetMipmapFilterMode(CUtexref hTexRef,
-                                             CUfilter_mode fm) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUtexref, CUfilter_mode);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexRefSetMipmapFilterMode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hTexRef, fm);
-}
-
-CUresult CUDAAPI cuTexRefSetMipmapLevelBias(CUtexref hTexRef, float bias) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUtexref, float);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexRefSetMipmapLevelBias");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hTexRef, bias);
-}
-
-CUresult CUDAAPI cuTexRefSetMipmapLevelClamp(CUtexref hTexRef,
-                                             float minMipmapLevelClamp,
-                                             float maxMipmapLevelClamp) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUtexref, float, float);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexRefSetMipmapLevelClamp");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hTexRef, minMipmapLevelClamp, maxMipmapLevelClamp);
-}
-
-CUresult CUDAAPI cuTexRefSetMaxAnisotropy(CUtexref hTexRef,
-                                          unsigned int maxAniso) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUtexref, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexRefSetMaxAnisotropy");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hTexRef, maxAniso);
-}
-
-CUresult CUDAAPI cuTexRefSetBorderColor(CUtexref hTexRef, float *pBorderColor) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUtexref, float *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexRefSetBorderColor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hTexRef, pBorderColor);
-}
-
-CUresult CUDAAPI cuTexRefSetFlags(CUtexref hTexRef, unsigned int Flags) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUtexref, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexRefSetFlags");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hTexRef, Flags);
-}
-
-CUresult CUDAAPI cuTexRefGetAddress(CUdeviceptr *pdptr, CUtexref hTexRef) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdeviceptr *, CUtexref);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexRefGetAddress_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pdptr, hTexRef);
-}
-
-CUresult CUDAAPI cuTexRefGetArray(CUarray *phArray, CUtexref hTexRef) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUarray *, CUtexref);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexRefGetArray");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(phArray, hTexRef);
-}
-
-CUresult CUDAAPI cuTexRefGetMipmappedArray(CUmipmappedArray *phMipmappedArray,
-                                           CUtexref hTexRef) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUmipmappedArray *, CUtexref);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexRefGetMipmappedArray");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(phMipmappedArray, hTexRef);
-}
-
-CUresult CUDAAPI cuTexRefGetAddressMode(CUaddress_mode *pam, CUtexref hTexRef,
-                                        int dim) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUaddress_mode *, CUtexref, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexRefGetAddressMode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pam, hTexRef, dim);
-}
-
-CUresult CUDAAPI cuTexRefGetFilterMode(CUfilter_mode *pfm, CUtexref hTexRef) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUfilter_mode *, CUtexref);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexRefGetFilterMode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pfm, hTexRef);
-}
-
-CUresult CUDAAPI cuTexRefGetFormat(CUarray_format *pFormat, int *pNumChannels,
-                                   CUtexref hTexRef) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUarray_format *, int *, CUtexref);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexRefGetFormat");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pFormat, pNumChannels, hTexRef);
-}
-
-CUresult CUDAAPI cuTexRefGetMipmapFilterMode(CUfilter_mode *pfm,
-                                             CUtexref hTexRef) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUfilter_mode *, CUtexref);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexRefGetMipmapFilterMode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pfm, hTexRef);
-}
-
-CUresult CUDAAPI cuTexRefGetMipmapLevelBias(float *pbias, CUtexref hTexRef) {
-  using FuncPtr = CUresult(CUDAAPI *)(float *, CUtexref);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexRefGetMipmapLevelBias");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pbias, hTexRef);
-}
-
-CUresult CUDAAPI cuTexRefGetMipmapLevelClamp(float *pminMipmapLevelClamp,
-                                             float *pmaxMipmapLevelClamp,
-                                             CUtexref hTexRef) {
-  using FuncPtr = CUresult(CUDAAPI *)(float *, float *, CUtexref);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexRefGetMipmapLevelClamp");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pminMipmapLevelClamp, pmaxMipmapLevelClamp, hTexRef);
-}
-
-CUresult CUDAAPI cuTexRefGetMaxAnisotropy(int *pmaxAniso, CUtexref hTexRef) {
-  using FuncPtr = CUresult(CUDAAPI *)(int *, CUtexref);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexRefGetMaxAnisotropy");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pmaxAniso, hTexRef);
-}
-
-CUresult CUDAAPI cuTexRefGetBorderColor(float *pBorderColor, CUtexref hTexRef) {
-  using FuncPtr = CUresult(CUDAAPI *)(float *, CUtexref);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexRefGetBorderColor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pBorderColor, hTexRef);
-}
-
-CUresult CUDAAPI cuTexRefGetFlags(unsigned int *pFlags, CUtexref hTexRef) {
-  using FuncPtr = CUresult(CUDAAPI *)(unsigned int *, CUtexref);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexRefGetFlags");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pFlags, hTexRef);
-}
-
-CUresult CUDAAPI cuTexRefCreate(CUtexref *pTexRef) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUtexref *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexRefCreate");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pTexRef);
-}
-
-CUresult CUDAAPI cuTexRefDestroy(CUtexref hTexRef) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUtexref);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexRefDestroy");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hTexRef);
-}
-
-CUresult CUDAAPI cuSurfRefSetArray(CUsurfref hSurfRef, CUarray hArray,
-                                   unsigned int Flags) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUsurfref, CUarray, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuSurfRefSetArray");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hSurfRef, hArray, Flags);
-}
-
-CUresult CUDAAPI cuSurfRefGetArray(CUarray *phArray, CUsurfref hSurfRef) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUarray *, CUsurfref);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuSurfRefGetArray");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(phArray, hSurfRef);
-}
-
-CUresult CUDAAPI
-cuTexObjectCreate(CUtexObject *pTexObject, const CUDA_RESOURCE_DESC *pResDesc,
-                  const CUDA_TEXTURE_DESC *pTexDesc,
-                  const CUDA_RESOURCE_VIEW_DESC *pResViewDesc) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUtexObject *, const CUDA_RESOURCE_DESC *,
-                                      const CUDA_TEXTURE_DESC *,
-                                      const CUDA_RESOURCE_VIEW_DESC *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexObjectCreate");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pTexObject, pResDesc, pTexDesc, pResViewDesc);
-}
-
-CUresult CUDAAPI cuTexObjectDestroy(CUtexObject texObject) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUtexObject);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexObjectDestroy");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(texObject);
-}
-
-CUresult CUDAAPI cuTexObjectGetResourceDesc(CUDA_RESOURCE_DESC *pResDesc,
-                                            CUtexObject texObject) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUDA_RESOURCE_DESC *, CUtexObject);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexObjectGetResourceDesc");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pResDesc, texObject);
-}
-
-CUresult CUDAAPI cuTexObjectGetTextureDesc(CUDA_TEXTURE_DESC *pTexDesc,
-                                           CUtexObject texObject) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUDA_TEXTURE_DESC *, CUtexObject);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexObjectGetTextureDesc");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pTexDesc, texObject);
-}
-
-CUresult CUDAAPI cuTexObjectGetResourceViewDesc(
-    CUDA_RESOURCE_VIEW_DESC *pResViewDesc, CUtexObject texObject) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUDA_RESOURCE_VIEW_DESC *, CUtexObject);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexObjectGetResourceViewDesc");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pResViewDesc, texObject);
-}
-
-CUresult CUDAAPI cuSurfObjectCreate(CUsurfObject *pSurfObject,
-                                    const CUDA_RESOURCE_DESC *pResDesc) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUsurfObject *, const CUDA_RESOURCE_DESC *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuSurfObjectCreate");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pSurfObject, pResDesc);
-}
-
-CUresult CUDAAPI cuSurfObjectDestroy(CUsurfObject surfObject) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUsurfObject);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuSurfObjectDestroy");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(surfObject);
-}
-
-CUresult CUDAAPI cuSurfObjectGetResourceDesc(CUDA_RESOURCE_DESC *pResDesc,
-                                             CUsurfObject surfObject) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUDA_RESOURCE_DESC *, CUsurfObject);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuSurfObjectGetResourceDesc");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pResDesc, surfObject);
-}
-
-CUresult CUDAAPI cuDeviceCanAccessPeer(int *canAccessPeer, CUdevice dev,
-                                       CUdevice peerDev) {
-  using FuncPtr = CUresult(CUDAAPI *)(int *, CUdevice, CUdevice);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuDeviceCanAccessPeer");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(canAccessPeer, dev, peerDev);
-}
-
-CUresult CUDAAPI cuCtxEnablePeerAccess(CUcontext peerContext,
-                                       unsigned int Flags) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUcontext, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuCtxEnablePeerAccess");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(peerContext, Flags);
-}
-
-CUresult CUDAAPI cuCtxDisablePeerAccess(CUcontext peerContext) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUcontext);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuCtxDisablePeerAccess");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(peerContext);
-}
-
-CUresult CUDAAPI cuDeviceGetP2PAttribute(int *value,
-                                         CUdevice_P2PAttribute attrib,
-                                         CUdevice srcDevice,
-                                         CUdevice dstDevice) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(int *, CUdevice_P2PAttribute, CUdevice, CUdevice);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuDeviceGetP2PAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(value, attrib, srcDevice, dstDevice);
-}
-
-CUresult CUDAAPI cuGraphicsUnregisterResource(CUgraphicsResource resource) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraphicsResource);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphicsUnregisterResource");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(resource);
-}
-
-CUresult CUDAAPI cuGraphicsSubResourceGetMappedArray(
-    CUarray *pArray, CUgraphicsResource resource, unsigned int arrayIndex,
-    unsigned int mipLevel) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUarray *, CUgraphicsResource,
-                                      unsigned int, unsigned int);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cuGraphicsSubResourceGetMappedArray");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pArray, resource, arrayIndex, mipLevel);
-}
-
-CUresult CUDAAPI cuGraphicsResourceGetMappedMipmappedArray(
-    CUmipmappedArray *pMipmappedArray, CUgraphicsResource resource) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUmipmappedArray *, CUgraphicsResource);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cuGraphicsResourceGetMappedMipmappedArray");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pMipmappedArray, resource);
-}
-
-CUresult CUDAAPI cuGraphicsResourceGetMappedPointer(
-    CUdeviceptr *pDevPtr, size_t *pSize, CUgraphicsResource resource) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUdeviceptr *, size_t *, CUgraphicsResource);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cuGraphicsResourceGetMappedPointer_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pDevPtr, pSize, resource);
-}
-
-CUresult CUDAAPI cuGraphicsResourceSetMapFlags(CUgraphicsResource resource,
-                                               unsigned int flags) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraphicsResource, unsigned int);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cuGraphicsResourceSetMapFlags_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(resource, flags);
-}
-
-CUresult CUDAAPI cuGraphicsMapResources(unsigned int count,
-                                        CUgraphicsResource *resources,
-                                        CUstream hStream) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(unsigned int, CUgraphicsResource *, CUstream);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphicsMapResources");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(count, resources, hStream);
-}
-
-CUresult CUDAAPI cuGraphicsUnmapResources(unsigned int count,
-                                          CUgraphicsResource *resources,
-                                          CUstream hStream) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(unsigned int, CUgraphicsResource *, CUstream);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphicsUnmapResources");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(count, resources, hStream);
-}
-
-CUresult CUDAAPI cuGetExportTable(const void **ppExportTable,
-                                  const CUuuid *pExportTableId) {
-  using FuncPtr = CUresult(CUDAAPI *)(const void **, const CUuuid *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGetExportTable");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(ppExportTable, pExportTableId);
-}
-
-}  // extern "C"
diff --git a/third_party/xla/third_party/tsl/tsl/cuda/cuda_10_2.inc b/third_party/xla/third_party/tsl/tsl/cuda/cuda_10_2.inc
deleted file mode 100644
index f37fc9d888de44..00000000000000
--- a/third_party/xla/third_party/tsl/tsl/cuda/cuda_10_2.inc
+++ /dev/null
@@ -1,2328 +0,0 @@
-// Auto-generated, do not edit.
-
-extern "C" {
-
-CUresult CUDAAPI cuGetErrorString(CUresult error, const char **pStr) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUresult, const char **);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGetErrorString");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(error, pStr);
-}
-
-CUresult CUDAAPI cuGetErrorName(CUresult error, const char **pStr) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUresult, const char **);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGetErrorName");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(error, pStr);
-}
-
-CUresult CUDAAPI cuInit(unsigned int Flags) {
-  using FuncPtr = CUresult(CUDAAPI *)(unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuInit");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(Flags);
-}
-
-CUresult CUDAAPI cuDriverGetVersion(int *driverVersion) {
-  using FuncPtr = CUresult(CUDAAPI *)(int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuDriverGetVersion");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(driverVersion);
-}
-
-CUresult CUDAAPI cuDeviceGet(CUdevice *device, int ordinal) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdevice *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuDeviceGet");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(device, ordinal);
-}
-
-CUresult CUDAAPI cuDeviceGetCount(int *count) {
-  using FuncPtr = CUresult(CUDAAPI *)(int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuDeviceGetCount");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(count);
-}
-
-CUresult CUDAAPI cuDeviceGetName(char *name, int len, CUdevice dev) {
-  using FuncPtr = CUresult(CUDAAPI *)(char *, int, CUdevice);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuDeviceGetName");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(name, len, dev);
-}
-
-CUresult CUDAAPI cuDeviceGetUuid(CUuuid *uuid, CUdevice dev) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUuuid *, CUdevice);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuDeviceGetUuid");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(uuid, dev);
-}
-
-CUresult CUDAAPI cuDeviceTotalMem(size_t *bytes, CUdevice dev) {
-  using FuncPtr = CUresult(CUDAAPI *)(size_t *, CUdevice);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuDeviceTotalMem_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(bytes, dev);
-}
-
-CUresult CUDAAPI cuDeviceGetAttribute(int *pi, CUdevice_attribute attrib,
-                                      CUdevice dev) {
-  using FuncPtr = CUresult(CUDAAPI *)(int *, CUdevice_attribute, CUdevice);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuDeviceGetAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pi, attrib, dev);
-}
-
-CUresult CUDAAPI cuDeviceGetNvSciSyncAttributes(void *nvSciSyncAttrList,
-                                                CUdevice dev, int flags) {
-  using FuncPtr = CUresult(CUDAAPI *)(void *, CUdevice, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuDeviceGetNvSciSyncAttributes");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(nvSciSyncAttrList, dev, flags);
-}
-
-__CUDA_DEPRECATED CUresult CUDAAPI cuDeviceGetProperties(CUdevprop *prop,
-                                                         CUdevice dev) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdevprop *, CUdevice);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuDeviceGetProperties");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(prop, dev);
-}
-
-__CUDA_DEPRECATED CUresult CUDAAPI cuDeviceComputeCapability(int *major,
-                                                             int *minor,
-                                                             CUdevice dev) {
-  using FuncPtr = CUresult(CUDAAPI *)(int *, int *, CUdevice);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuDeviceComputeCapability");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(major, minor, dev);
-}
-
-CUresult CUDAAPI cuDevicePrimaryCtxRetain(CUcontext *pctx, CUdevice dev) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUcontext *, CUdevice);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuDevicePrimaryCtxRetain");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pctx, dev);
-}
-
-CUresult CUDAAPI cuDevicePrimaryCtxRelease(CUdevice dev) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdevice);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuDevicePrimaryCtxRelease");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dev);
-}
-
-CUresult CUDAAPI cuDevicePrimaryCtxSetFlags(CUdevice dev, unsigned int flags) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdevice, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuDevicePrimaryCtxSetFlags");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dev, flags);
-}
-
-CUresult CUDAAPI cuDevicePrimaryCtxGetState(CUdevice dev, unsigned int *flags,
-                                            int *active) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdevice, unsigned int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuDevicePrimaryCtxGetState");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dev, flags, active);
-}
-
-CUresult CUDAAPI cuDevicePrimaryCtxReset(CUdevice dev) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdevice);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuDevicePrimaryCtxReset");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dev);
-}
-
-CUresult CUDAAPI cuCtxCreate(CUcontext *pctx, unsigned int flags,
-                             CUdevice dev) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUcontext *, unsigned int, CUdevice);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuCtxCreate_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pctx, flags, dev);
-}
-
-CUresult CUDAAPI cuCtxDestroy(CUcontext ctx) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUcontext);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuCtxDestroy_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(ctx);
-}
-
-CUresult CUDAAPI cuCtxPushCurrent(CUcontext ctx) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUcontext);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuCtxPushCurrent_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(ctx);
-}
-
-CUresult CUDAAPI cuCtxPopCurrent(CUcontext *pctx) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUcontext *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuCtxPopCurrent_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pctx);
-}
-
-CUresult CUDAAPI cuCtxSetCurrent(CUcontext ctx) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUcontext);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuCtxSetCurrent");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(ctx);
-}
-
-CUresult CUDAAPI cuCtxGetCurrent(CUcontext *pctx) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUcontext *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuCtxGetCurrent");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pctx);
-}
-
-CUresult CUDAAPI cuCtxGetDevice(CUdevice *device) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdevice *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuCtxGetDevice");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(device);
-}
-
-CUresult CUDAAPI cuCtxGetFlags(unsigned int *flags) {
-  using FuncPtr = CUresult(CUDAAPI *)(unsigned int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuCtxGetFlags");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(flags);
-}
-
-CUresult CUDAAPI cuCtxSynchronize(void) {
-  using FuncPtr = CUresult(CUDAAPI *)();
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuCtxSynchronize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr();
-}
-
-CUresult CUDAAPI cuCtxSetLimit(CUlimit limit, size_t value) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUlimit, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuCtxSetLimit");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(limit, value);
-}
-
-CUresult CUDAAPI cuCtxGetLimit(size_t *pvalue, CUlimit limit) {
-  using FuncPtr = CUresult(CUDAAPI *)(size_t *, CUlimit);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuCtxGetLimit");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pvalue, limit);
-}
-
-CUresult CUDAAPI cuCtxGetCacheConfig(CUfunc_cache *pconfig) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUfunc_cache *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuCtxGetCacheConfig");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pconfig);
-}
-
-CUresult CUDAAPI cuCtxSetCacheConfig(CUfunc_cache config) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUfunc_cache);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuCtxSetCacheConfig");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(config);
-}
-
-CUresult CUDAAPI cuCtxGetSharedMemConfig(CUsharedconfig *pConfig) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUsharedconfig *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuCtxGetSharedMemConfig");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pConfig);
-}
-
-CUresult CUDAAPI cuCtxSetSharedMemConfig(CUsharedconfig config) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUsharedconfig);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuCtxSetSharedMemConfig");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(config);
-}
-
-CUresult CUDAAPI cuCtxGetApiVersion(CUcontext ctx, unsigned int *version) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUcontext, unsigned int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuCtxGetApiVersion");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(ctx, version);
-}
-
-CUresult CUDAAPI cuCtxGetStreamPriorityRange(int *leastPriority,
-                                             int *greatestPriority) {
-  using FuncPtr = CUresult(CUDAAPI *)(int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuCtxGetStreamPriorityRange");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(leastPriority, greatestPriority);
-}
-
-__CUDA_DEPRECATED CUresult CUDAAPI cuCtxAttach(CUcontext *pctx,
-                                               unsigned int flags) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUcontext *, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuCtxAttach");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pctx, flags);
-}
-
-__CUDA_DEPRECATED CUresult CUDAAPI cuCtxDetach(CUcontext ctx) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUcontext);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuCtxDetach");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(ctx);
-}
-
-CUresult CUDAAPI cuModuleLoad(CUmodule *module, const char *fname) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUmodule *, const char *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuModuleLoad");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(module, fname);
-}
-
-CUresult CUDAAPI cuModuleLoadData(CUmodule *module, const void *image) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUmodule *, const void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuModuleLoadData");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(module, image);
-}
-
-CUresult CUDAAPI cuModuleLoadDataEx(CUmodule *module, const void *image,
-                                    unsigned int numOptions,
-                                    CUjit_option *options,
-                                    void **optionValues) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUmodule *, const void *, unsigned int,
-                                      CUjit_option *, void **);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuModuleLoadDataEx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(module, image, numOptions, options, optionValues);
-}
-
-CUresult CUDAAPI cuModuleLoadFatBinary(CUmodule *module, const void *fatCubin) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUmodule *, const void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuModuleLoadFatBinary");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(module, fatCubin);
-}
-
-CUresult CUDAAPI cuModuleUnload(CUmodule hmod) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUmodule);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuModuleUnload");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hmod);
-}
-
-CUresult CUDAAPI cuModuleGetFunction(CUfunction *hfunc, CUmodule hmod,
-                                     const char *name) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUfunction *, CUmodule, const char *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuModuleGetFunction");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hfunc, hmod, name);
-}
-
-CUresult CUDAAPI cuModuleGetGlobal(CUdeviceptr *dptr, size_t *bytes,
-                                   CUmodule hmod, const char *name) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUdeviceptr *, size_t *, CUmodule, const char *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuModuleGetGlobal_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dptr, bytes, hmod, name);
-}
-
-CUresult CUDAAPI cuModuleGetTexRef(CUtexref *pTexRef, CUmodule hmod,
-                                   const char *name) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUtexref *, CUmodule, const char *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuModuleGetTexRef");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pTexRef, hmod, name);
-}
-
-CUresult CUDAAPI cuModuleGetSurfRef(CUsurfref *pSurfRef, CUmodule hmod,
-                                    const char *name) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUsurfref *, CUmodule, const char *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuModuleGetSurfRef");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pSurfRef, hmod, name);
-}
-
-CUresult CUDAAPI cuLinkCreate(unsigned int numOptions, CUjit_option *options,
-                              void **optionValues, CUlinkState *stateOut) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(unsigned int, CUjit_option *, void **, CUlinkState *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuLinkCreate_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(numOptions, options, optionValues, stateOut);
-}
-
-CUresult CUDAAPI cuLinkAddData(CUlinkState state, CUjitInputType type,
-                               void *data, size_t size, const char *name,
-                               unsigned int numOptions, CUjit_option *options,
-                               void **optionValues) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUlinkState, CUjitInputType, void *, size_t,
-                          const char *, unsigned int, CUjit_option *, void **);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuLinkAddData_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(state, type, data, size, name, numOptions, options,
-                  optionValues);
-}
-
-CUresult CUDAAPI cuLinkAddFile(CUlinkState state, CUjitInputType type,
-                               const char *path, unsigned int numOptions,
-                               CUjit_option *options, void **optionValues) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUlinkState, CUjitInputType, const char *,
-                                      unsigned int, CUjit_option *, void **);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuLinkAddFile_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(state, type, path, numOptions, options, optionValues);
-}
-
-CUresult CUDAAPI cuLinkComplete(CUlinkState state, void **cubinOut,
-                                size_t *sizeOut) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUlinkState, void **, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuLinkComplete");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(state, cubinOut, sizeOut);
-}
-
-CUresult CUDAAPI cuLinkDestroy(CUlinkState state) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUlinkState);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuLinkDestroy");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(state);
-}
-
-CUresult CUDAAPI cuMemGetInfo(size_t *free, size_t *total) {
-  using FuncPtr = CUresult(CUDAAPI *)(size_t *, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemGetInfo_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(free, total);
-}
-
-CUresult CUDAAPI cuMemAlloc(CUdeviceptr *dptr, size_t bytesize) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdeviceptr *, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemAlloc_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dptr, bytesize);
-}
-
-CUresult CUDAAPI cuMemAllocPitch(CUdeviceptr *dptr, size_t *pPitch,
-                                 size_t WidthInBytes, size_t Height,
-                                 unsigned int ElementSizeBytes) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdeviceptr *, size_t *, size_t, size_t,
-                                      unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemAllocPitch_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dptr, pPitch, WidthInBytes, Height, ElementSizeBytes);
-}
-
-CUresult CUDAAPI cuMemFree(CUdeviceptr dptr) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdeviceptr);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemFree_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dptr);
-}
-
-CUresult CUDAAPI cuMemGetAddressRange(CUdeviceptr *pbase, size_t *psize,
-                                      CUdeviceptr dptr) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdeviceptr *, size_t *, CUdeviceptr);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemGetAddressRange_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pbase, psize, dptr);
-}
-
-CUresult CUDAAPI cuMemAllocHost(void **pp, size_t bytesize) {
-  using FuncPtr = CUresult(CUDAAPI *)(void **, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemAllocHost_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pp, bytesize);
-}
-
-CUresult CUDAAPI cuMemFreeHost(void *p) {
-  using FuncPtr = CUresult(CUDAAPI *)(void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemFreeHost");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(p);
-}
-
-CUresult CUDAAPI cuMemHostAlloc(void **pp, size_t bytesize,
-                                unsigned int Flags) {
-  using FuncPtr = CUresult(CUDAAPI *)(void **, size_t, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemHostAlloc");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pp, bytesize, Flags);
-}
-
-CUresult CUDAAPI cuMemHostGetDevicePointer(CUdeviceptr *pdptr, void *p,
-                                           unsigned int Flags) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdeviceptr *, void *, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemHostGetDevicePointer_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pdptr, p, Flags);
-}
-
-CUresult CUDAAPI cuMemHostGetFlags(unsigned int *pFlags, void *p) {
-  using FuncPtr = CUresult(CUDAAPI *)(unsigned int *, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemHostGetFlags");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pFlags, p);
-}
-
-CUresult CUDAAPI cuMemAllocManaged(CUdeviceptr *dptr, size_t bytesize,
-                                   unsigned int flags) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdeviceptr *, size_t, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemAllocManaged");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dptr, bytesize, flags);
-}
-
-CUresult CUDAAPI cuDeviceGetByPCIBusId(CUdevice *dev, const char *pciBusId) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdevice *, const char *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuDeviceGetByPCIBusId");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dev, pciBusId);
-}
-
-CUresult CUDAAPI cuDeviceGetPCIBusId(char *pciBusId, int len, CUdevice dev) {
-  using FuncPtr = CUresult(CUDAAPI *)(char *, int, CUdevice);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuDeviceGetPCIBusId");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pciBusId, len, dev);
-}
-
-CUresult CUDAAPI cuIpcGetEventHandle(CUipcEventHandle *pHandle, CUevent event) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUipcEventHandle *, CUevent);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuIpcGetEventHandle");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pHandle, event);
-}
-
-CUresult CUDAAPI cuIpcOpenEventHandle(CUevent *phEvent,
-                                      CUipcEventHandle handle) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUevent *, CUipcEventHandle);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuIpcOpenEventHandle");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(phEvent, handle);
-}
-
-CUresult CUDAAPI cuIpcGetMemHandle(CUipcMemHandle *pHandle, CUdeviceptr dptr) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUipcMemHandle *, CUdeviceptr);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuIpcGetMemHandle");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pHandle, dptr);
-}
-
-CUresult CUDAAPI cuIpcOpenMemHandle(CUdeviceptr *pdptr, CUipcMemHandle handle,
-                                    unsigned int Flags) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUdeviceptr *, CUipcMemHandle, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuIpcOpenMemHandle");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pdptr, handle, Flags);
-}
-
-CUresult CUDAAPI cuIpcCloseMemHandle(CUdeviceptr dptr) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdeviceptr);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuIpcCloseMemHandle");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dptr);
-}
-
-CUresult CUDAAPI cuMemHostRegister(void *p, size_t bytesize,
-                                   unsigned int Flags) {
-  using FuncPtr = CUresult(CUDAAPI *)(void *, size_t, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemHostRegister_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(p, bytesize, Flags);
-}
-
-CUresult CUDAAPI cuMemHostUnregister(void *p) {
-  using FuncPtr = CUresult(CUDAAPI *)(void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemHostUnregister");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(p);
-}
-
-CUresult CUDAAPI cuMemcpy(CUdeviceptr dst, CUdeviceptr src, size_t ByteCount) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdeviceptr, CUdeviceptr, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemcpy");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dst, src, ByteCount);
-}
-
-CUresult CUDAAPI cuMemcpyPeer(CUdeviceptr dstDevice, CUcontext dstContext,
-                              CUdeviceptr srcDevice, CUcontext srcContext,
-                              size_t ByteCount) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdeviceptr, CUcontext, CUdeviceptr,
-                                      CUcontext, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemcpyPeer");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dstDevice, dstContext, srcDevice, srcContext, ByteCount);
-}
-
-CUresult CUDAAPI cuMemcpyHtoD(CUdeviceptr dstDevice, const void *srcHost,
-                              size_t ByteCount) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdeviceptr, const void *, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemcpyHtoD_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dstDevice, srcHost, ByteCount);
-}
-
-CUresult CUDAAPI cuMemcpyDtoH(void *dstHost, CUdeviceptr srcDevice,
-                              size_t ByteCount) {
-  using FuncPtr = CUresult(CUDAAPI *)(void *, CUdeviceptr, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemcpyDtoH_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dstHost, srcDevice, ByteCount);
-}
-
-CUresult CUDAAPI cuMemcpyDtoD(CUdeviceptr dstDevice, CUdeviceptr srcDevice,
-                              size_t ByteCount) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdeviceptr, CUdeviceptr, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemcpyDtoD_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dstDevice, srcDevice, ByteCount);
-}
-
-CUresult CUDAAPI cuMemcpyDtoA(CUarray dstArray, size_t dstOffset,
-                              CUdeviceptr srcDevice, size_t ByteCount) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUarray, size_t, CUdeviceptr, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemcpyDtoA_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dstArray, dstOffset, srcDevice, ByteCount);
-}
-
-CUresult CUDAAPI cuMemcpyAtoD(CUdeviceptr dstDevice, CUarray srcArray,
-                              size_t srcOffset, size_t ByteCount) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdeviceptr, CUarray, size_t, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemcpyAtoD_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dstDevice, srcArray, srcOffset, ByteCount);
-}
-
-CUresult CUDAAPI cuMemcpyHtoA(CUarray dstArray, size_t dstOffset,
-                              const void *srcHost, size_t ByteCount) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUarray, size_t, const void *, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemcpyHtoA_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dstArray, dstOffset, srcHost, ByteCount);
-}
-
-CUresult CUDAAPI cuMemcpyAtoH(void *dstHost, CUarray srcArray, size_t srcOffset,
-                              size_t ByteCount) {
-  using FuncPtr = CUresult(CUDAAPI *)(void *, CUarray, size_t, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemcpyAtoH_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dstHost, srcArray, srcOffset, ByteCount);
-}
-
-CUresult CUDAAPI cuMemcpyAtoA(CUarray dstArray, size_t dstOffset,
-                              CUarray srcArray, size_t srcOffset,
-                              size_t ByteCount) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUarray, size_t, CUarray, size_t, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemcpyAtoA_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dstArray, dstOffset, srcArray, srcOffset, ByteCount);
-}
-
-CUresult CUDAAPI cuMemcpy2D(const CUDA_MEMCPY2D *pCopy) {
-  using FuncPtr = CUresult(CUDAAPI *)(const CUDA_MEMCPY2D *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemcpy2D_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pCopy);
-}
-
-CUresult CUDAAPI cuMemcpy2DUnaligned(const CUDA_MEMCPY2D *pCopy) {
-  using FuncPtr = CUresult(CUDAAPI *)(const CUDA_MEMCPY2D *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemcpy2DUnaligned_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pCopy);
-}
-
-CUresult CUDAAPI cuMemcpy3D(const CUDA_MEMCPY3D *pCopy) {
-  using FuncPtr = CUresult(CUDAAPI *)(const CUDA_MEMCPY3D *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemcpy3D_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pCopy);
-}
-
-CUresult CUDAAPI cuMemcpy3DPeer(const CUDA_MEMCPY3D_PEER *pCopy) {
-  using FuncPtr = CUresult(CUDAAPI *)(const CUDA_MEMCPY3D_PEER *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemcpy3DPeer");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pCopy);
-}
-
-CUresult CUDAAPI cuMemcpyAsync(CUdeviceptr dst, CUdeviceptr src,
-                               size_t ByteCount, CUstream hStream) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUdeviceptr, CUdeviceptr, size_t, CUstream);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemcpyAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dst, src, ByteCount, hStream);
-}
-
-CUresult CUDAAPI cuMemcpyPeerAsync(CUdeviceptr dstDevice, CUcontext dstContext,
-                                   CUdeviceptr srcDevice, CUcontext srcContext,
-                                   size_t ByteCount, CUstream hStream) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdeviceptr, CUcontext, CUdeviceptr,
-                                      CUcontext, size_t, CUstream);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemcpyPeerAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dstDevice, dstContext, srcDevice, srcContext, ByteCount,
-                  hStream);
-}
-
-CUresult CUDAAPI cuMemcpyHtoDAsync(CUdeviceptr dstDevice, const void *srcHost,
-                                   size_t ByteCount, CUstream hStream) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUdeviceptr, const void *, size_t, CUstream);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemcpyHtoDAsync_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dstDevice, srcHost, ByteCount, hStream);
-}
-
-CUresult CUDAAPI cuMemcpyDtoHAsync(void *dstHost, CUdeviceptr srcDevice,
-                                   size_t ByteCount, CUstream hStream) {
-  using FuncPtr = CUresult(CUDAAPI *)(void *, CUdeviceptr, size_t, CUstream);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemcpyDtoHAsync_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dstHost, srcDevice, ByteCount, hStream);
-}
-
-CUresult CUDAAPI cuMemcpyDtoDAsync(CUdeviceptr dstDevice, CUdeviceptr srcDevice,
-                                   size_t ByteCount, CUstream hStream) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUdeviceptr, CUdeviceptr, size_t, CUstream);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemcpyDtoDAsync_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dstDevice, srcDevice, ByteCount, hStream);
-}
-
-CUresult CUDAAPI cuMemcpyHtoAAsync(CUarray dstArray, size_t dstOffset,
-                                   const void *srcHost, size_t ByteCount,
-                                   CUstream hStream) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUarray, size_t, const void *, size_t, CUstream);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemcpyHtoAAsync_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dstArray, dstOffset, srcHost, ByteCount, hStream);
-}
-
-CUresult CUDAAPI cuMemcpyAtoHAsync(void *dstHost, CUarray srcArray,
-                                   size_t srcOffset, size_t ByteCount,
-                                   CUstream hStream) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(void *, CUarray, size_t, size_t, CUstream);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemcpyAtoHAsync_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dstHost, srcArray, srcOffset, ByteCount, hStream);
-}
-
-CUresult CUDAAPI cuMemcpy2DAsync(const CUDA_MEMCPY2D *pCopy, CUstream hStream) {
-  using FuncPtr = CUresult(CUDAAPI *)(const CUDA_MEMCPY2D *, CUstream);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemcpy2DAsync_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pCopy, hStream);
-}
-
-CUresult CUDAAPI cuMemcpy3DAsync(const CUDA_MEMCPY3D *pCopy, CUstream hStream) {
-  using FuncPtr = CUresult(CUDAAPI *)(const CUDA_MEMCPY3D *, CUstream);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemcpy3DAsync_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pCopy, hStream);
-}
-
-CUresult CUDAAPI cuMemcpy3DPeerAsync(const CUDA_MEMCPY3D_PEER *pCopy,
-                                     CUstream hStream) {
-  using FuncPtr = CUresult(CUDAAPI *)(const CUDA_MEMCPY3D_PEER *, CUstream);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemcpy3DPeerAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pCopy, hStream);
-}
-
-CUresult CUDAAPI cuMemsetD8(CUdeviceptr dstDevice, unsigned char uc, size_t N) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdeviceptr, unsigned char, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemsetD8_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dstDevice, uc, N);
-}
-
-CUresult CUDAAPI cuMemsetD16(CUdeviceptr dstDevice, unsigned short us,
-                             size_t N) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdeviceptr, unsigned short, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemsetD16_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dstDevice, us, N);
-}
-
-CUresult CUDAAPI cuMemsetD32(CUdeviceptr dstDevice, unsigned int ui, size_t N) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdeviceptr, unsigned int, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemsetD32_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dstDevice, ui, N);
-}
-
-CUresult CUDAAPI cuMemsetD2D8(CUdeviceptr dstDevice, size_t dstPitch,
-                              unsigned char uc, size_t Width, size_t Height) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUdeviceptr, size_t, unsigned char, size_t, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemsetD2D8_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dstDevice, dstPitch, uc, Width, Height);
-}
-
-CUresult CUDAAPI cuMemsetD2D16(CUdeviceptr dstDevice, size_t dstPitch,
-                               unsigned short us, size_t Width, size_t Height) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUdeviceptr, size_t, unsigned short, size_t, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemsetD2D16_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dstDevice, dstPitch, us, Width, Height);
-}
-
-CUresult CUDAAPI cuMemsetD2D32(CUdeviceptr dstDevice, size_t dstPitch,
-                               unsigned int ui, size_t Width, size_t Height) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUdeviceptr, size_t, unsigned int, size_t, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemsetD2D32_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dstDevice, dstPitch, ui, Width, Height);
-}
-
-CUresult CUDAAPI cuMemsetD8Async(CUdeviceptr dstDevice, unsigned char uc,
-                                 size_t N, CUstream hStream) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUdeviceptr, unsigned char, size_t, CUstream);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemsetD8Async");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dstDevice, uc, N, hStream);
-}
-
-CUresult CUDAAPI cuMemsetD16Async(CUdeviceptr dstDevice, unsigned short us,
-                                  size_t N, CUstream hStream) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUdeviceptr, unsigned short, size_t, CUstream);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemsetD16Async");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dstDevice, us, N, hStream);
-}
-
-CUresult CUDAAPI cuMemsetD32Async(CUdeviceptr dstDevice, unsigned int ui,
-                                  size_t N, CUstream hStream) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUdeviceptr, unsigned int, size_t, CUstream);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemsetD32Async");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dstDevice, ui, N, hStream);
-}
-
-CUresult CUDAAPI cuMemsetD2D8Async(CUdeviceptr dstDevice, size_t dstPitch,
-                                   unsigned char uc, size_t Width,
-                                   size_t Height, CUstream hStream) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdeviceptr, size_t, unsigned char,
-                                      size_t, size_t, CUstream);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemsetD2D8Async");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dstDevice, dstPitch, uc, Width, Height, hStream);
-}
-
-CUresult CUDAAPI cuMemsetD2D16Async(CUdeviceptr dstDevice, size_t dstPitch,
-                                    unsigned short us, size_t Width,
-                                    size_t Height, CUstream hStream) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdeviceptr, size_t, unsigned short,
-                                      size_t, size_t, CUstream);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemsetD2D16Async");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dstDevice, dstPitch, us, Width, Height, hStream);
-}
-
-CUresult CUDAAPI cuMemsetD2D32Async(CUdeviceptr dstDevice, size_t dstPitch,
-                                    unsigned int ui, size_t Width,
-                                    size_t Height, CUstream hStream) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdeviceptr, size_t, unsigned int, size_t,
-                                      size_t, CUstream);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemsetD2D32Async");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dstDevice, dstPitch, ui, Width, Height, hStream);
-}
-
-CUresult CUDAAPI cuArrayCreate(CUarray *pHandle,
-                               const CUDA_ARRAY_DESCRIPTOR *pAllocateArray) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUarray *, const CUDA_ARRAY_DESCRIPTOR *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuArrayCreate_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pHandle, pAllocateArray);
-}
-
-CUresult CUDAAPI cuArrayGetDescriptor(CUDA_ARRAY_DESCRIPTOR *pArrayDescriptor,
-                                      CUarray hArray) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUDA_ARRAY_DESCRIPTOR *, CUarray);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuArrayGetDescriptor_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pArrayDescriptor, hArray);
-}
-
-CUresult CUDAAPI cuArrayDestroy(CUarray hArray) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUarray);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuArrayDestroy");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hArray);
-}
-
-CUresult CUDAAPI cuArray3DCreate(
-    CUarray *pHandle, const CUDA_ARRAY3D_DESCRIPTOR *pAllocateArray) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUarray *, const CUDA_ARRAY3D_DESCRIPTOR *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuArray3DCreate_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pHandle, pAllocateArray);
-}
-
-CUresult CUDAAPI cuArray3DGetDescriptor(
-    CUDA_ARRAY3D_DESCRIPTOR *pArrayDescriptor, CUarray hArray) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUDA_ARRAY3D_DESCRIPTOR *, CUarray);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuArray3DGetDescriptor_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pArrayDescriptor, hArray);
-}
-
-CUresult CUDAAPI
-cuMipmappedArrayCreate(CUmipmappedArray *pHandle,
-                       const CUDA_ARRAY3D_DESCRIPTOR *pMipmappedArrayDesc,
-                       unsigned int numMipmapLevels) {
-  using FuncPtr = CUresult(CUDAAPI *)(
-      CUmipmappedArray *, const CUDA_ARRAY3D_DESCRIPTOR *, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMipmappedArrayCreate");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pHandle, pMipmappedArrayDesc, numMipmapLevels);
-}
-
-CUresult CUDAAPI cuMipmappedArrayGetLevel(CUarray *pLevelArray,
-                                          CUmipmappedArray hMipmappedArray,
-                                          unsigned int level) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUarray *, CUmipmappedArray, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMipmappedArrayGetLevel");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pLevelArray, hMipmappedArray, level);
-}
-
-CUresult CUDAAPI cuMipmappedArrayDestroy(CUmipmappedArray hMipmappedArray) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUmipmappedArray);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMipmappedArrayDestroy");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hMipmappedArray);
-}
-
-CUresult CUDAAPI cuMemAddressReserve(CUdeviceptr *ptr, size_t size,
-                                     size_t alignment, CUdeviceptr addr,
-                                     unsigned long long flags) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdeviceptr *, size_t, size_t,
-                                      CUdeviceptr, unsigned long long);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemAddressReserve");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(ptr, size, alignment, addr, flags);
-}
-
-CUresult CUDAAPI cuMemAddressFree(CUdeviceptr ptr, size_t size) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdeviceptr, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemAddressFree");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(ptr, size);
-}
-
-CUresult CUDAAPI cuMemCreate(CUmemGenericAllocationHandle *handle, size_t size,
-                             const CUmemAllocationProp *prop,
-                             unsigned long long flags) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUmemGenericAllocationHandle *, size_t,
-                          const CUmemAllocationProp *, unsigned long long);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemCreate");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, size, prop, flags);
-}
-
-CUresult CUDAAPI cuMemRelease(CUmemGenericAllocationHandle handle) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUmemGenericAllocationHandle);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemRelease");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle);
-}
-
-CUresult CUDAAPI cuMemMap(CUdeviceptr ptr, size_t size, size_t offset,
-                          CUmemGenericAllocationHandle handle,
-                          unsigned long long flags) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUdeviceptr, size_t, size_t,
-                          CUmemGenericAllocationHandle, unsigned long long);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemMap");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(ptr, size, offset, handle, flags);
-}
-
-CUresult CUDAAPI cuMemUnmap(CUdeviceptr ptr, size_t size) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdeviceptr, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemUnmap");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(ptr, size);
-}
-
-CUresult CUDAAPI cuMemSetAccess(CUdeviceptr ptr, size_t size,
-                                const CUmemAccessDesc *desc, size_t count) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUdeviceptr, size_t, const CUmemAccessDesc *, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemSetAccess");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(ptr, size, desc, count);
-}
-
-CUresult CUDAAPI cuMemGetAccess(unsigned long long *flags,
-                                const CUmemLocation *location,
-                                CUdeviceptr ptr) {
-  using FuncPtr = CUresult(CUDAAPI *)(unsigned long long *,
-                                      const CUmemLocation *, CUdeviceptr);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemGetAccess");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(flags, location, ptr);
-}
-
-CUresult CUDAAPI cuMemExportToShareableHandle(
-    void *shareableHandle, CUmemGenericAllocationHandle handle,
-    CUmemAllocationHandleType handleType, unsigned long long flags) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(void *, CUmemGenericAllocationHandle,
-                          CUmemAllocationHandleType, unsigned long long);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemExportToShareableHandle");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(shareableHandle, handle, handleType, flags);
-}
-
-CUresult CUDAAPI cuMemImportFromShareableHandle(
-    CUmemGenericAllocationHandle *handle, void *osHandle,
-    CUmemAllocationHandleType shHandleType) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUmemGenericAllocationHandle *, void *,
-                                      CUmemAllocationHandleType);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemImportFromShareableHandle");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, osHandle, shHandleType);
-}
-
-CUresult CUDAAPI cuMemGetAllocationGranularity(
-    size_t *granularity, const CUmemAllocationProp *prop,
-    CUmemAllocationGranularity_flags option) {
-  using FuncPtr = CUresult(CUDAAPI *)(size_t *, const CUmemAllocationProp *,
-                                      CUmemAllocationGranularity_flags);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemGetAllocationGranularity");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(granularity, prop, option);
-}
-
-CUresult CUDAAPI cuMemGetAllocationPropertiesFromHandle(
-    CUmemAllocationProp *prop, CUmemGenericAllocationHandle handle) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUmemAllocationProp *, CUmemGenericAllocationHandle);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cuMemGetAllocationPropertiesFromHandle");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(prop, handle);
-}
-
-CUresult CUDAAPI cuPointerGetAttribute(void *data,
-                                       CUpointer_attribute attribute,
-                                       CUdeviceptr ptr) {
-  using FuncPtr = CUresult(CUDAAPI *)(void *, CUpointer_attribute, CUdeviceptr);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuPointerGetAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(data, attribute, ptr);
-}
-
-CUresult CUDAAPI cuMemPrefetchAsync(CUdeviceptr devPtr, size_t count,
-                                    CUdevice dstDevice, CUstream hStream) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdeviceptr, size_t, CUdevice, CUstream);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemPrefetchAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(devPtr, count, dstDevice, hStream);
-}
-
-CUresult CUDAAPI cuMemAdvise(CUdeviceptr devPtr, size_t count,
-                             CUmem_advise advice, CUdevice device) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUdeviceptr, size_t, CUmem_advise, CUdevice);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemAdvise");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(devPtr, count, advice, device);
-}
-
-CUresult CUDAAPI cuMemRangeGetAttribute(void *data, size_t dataSize,
-                                        CUmem_range_attribute attribute,
-                                        CUdeviceptr devPtr, size_t count) {
-  using FuncPtr = CUresult(CUDAAPI *)(void *, size_t, CUmem_range_attribute,
-                                      CUdeviceptr, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemRangeGetAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(data, dataSize, attribute, devPtr, count);
-}
-
-CUresult CUDAAPI cuMemRangeGetAttributes(void **data, size_t *dataSizes,
-                                         CUmem_range_attribute *attributes,
-                                         size_t numAttributes,
-                                         CUdeviceptr devPtr, size_t count) {
-  using FuncPtr = CUresult(CUDAAPI *)(
-      void **, size_t *, CUmem_range_attribute *, size_t, CUdeviceptr, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemRangeGetAttributes");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(data, dataSizes, attributes, numAttributes, devPtr, count);
-}
-
-CUresult CUDAAPI cuPointerSetAttribute(const void *value,
-                                       CUpointer_attribute attribute,
-                                       CUdeviceptr ptr) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(const void *, CUpointer_attribute, CUdeviceptr);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuPointerSetAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(value, attribute, ptr);
-}
-
-CUresult CUDAAPI cuPointerGetAttributes(unsigned int numAttributes,
-                                        CUpointer_attribute *attributes,
-                                        void **data, CUdeviceptr ptr) {
-  using FuncPtr = CUresult(CUDAAPI *)(unsigned int, CUpointer_attribute *,
-                                      void **, CUdeviceptr);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuPointerGetAttributes");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(numAttributes, attributes, data, ptr);
-}
-
-CUresult CUDAAPI cuStreamCreate(CUstream *phStream, unsigned int Flags) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUstream *, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuStreamCreate");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(phStream, Flags);
-}
-
-CUresult CUDAAPI cuStreamCreateWithPriority(CUstream *phStream,
-                                            unsigned int flags, int priority) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUstream *, unsigned int, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuStreamCreateWithPriority");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(phStream, flags, priority);
-}
-
-CUresult CUDAAPI cuStreamGetPriority(CUstream hStream, int *priority) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUstream, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuStreamGetPriority");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hStream, priority);
-}
-
-CUresult CUDAAPI cuStreamGetFlags(CUstream hStream, unsigned int *flags) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUstream, unsigned int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuStreamGetFlags");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hStream, flags);
-}
-
-CUresult CUDAAPI cuStreamGetCtx(CUstream hStream, CUcontext *pctx) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUstream, CUcontext *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuStreamGetCtx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hStream, pctx);
-}
-
-CUresult CUDAAPI cuStreamWaitEvent(CUstream hStream, CUevent hEvent,
-                                   unsigned int Flags) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUstream, CUevent, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuStreamWaitEvent");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hStream, hEvent, Flags);
-}
-
-CUresult CUDAAPI cuStreamAddCallback(CUstream hStream,
-                                     CUstreamCallback callback, void *userData,
-                                     unsigned int flags) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUstream, CUstreamCallback, void *, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuStreamAddCallback");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hStream, callback, userData, flags);
-}
-
-CUresult CUDAAPI cuStreamBeginCapture(CUstream hStream,
-                                      CUstreamCaptureMode mode) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUstream, CUstreamCaptureMode);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuStreamBeginCapture_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hStream, mode);
-}
-
-CUresult CUDAAPI cuThreadExchangeStreamCaptureMode(CUstreamCaptureMode *mode) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUstreamCaptureMode *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cuThreadExchangeStreamCaptureMode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(mode);
-}
-
-CUresult CUDAAPI cuStreamEndCapture(CUstream hStream, CUgraph *phGraph) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUstream, CUgraph *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuStreamEndCapture");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hStream, phGraph);
-}
-
-CUresult CUDAAPI cuStreamIsCapturing(CUstream hStream,
-                                     CUstreamCaptureStatus *captureStatus) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUstream, CUstreamCaptureStatus *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuStreamIsCapturing");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hStream, captureStatus);
-}
-
-CUresult CUDAAPI cuStreamGetCaptureInfo(CUstream hStream,
-                                        CUstreamCaptureStatus *captureStatus,
-                                        cuuint64_t *id) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUstream, CUstreamCaptureStatus *, cuuint64_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuStreamGetCaptureInfo");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hStream, captureStatus, id);
-}
-
-CUresult CUDAAPI cuStreamAttachMemAsync(CUstream hStream, CUdeviceptr dptr,
-                                        size_t length, unsigned int flags) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUstream, CUdeviceptr, size_t, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuStreamAttachMemAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hStream, dptr, length, flags);
-}
-
-CUresult CUDAAPI cuStreamQuery(CUstream hStream) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUstream);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuStreamQuery");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hStream);
-}
-
-CUresult CUDAAPI cuStreamSynchronize(CUstream hStream) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUstream);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuStreamSynchronize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hStream);
-}
-
-CUresult CUDAAPI cuStreamDestroy(CUstream hStream) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUstream);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuStreamDestroy_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hStream);
-}
-
-CUresult CUDAAPI cuEventCreate(CUevent *phEvent, unsigned int Flags) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUevent *, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuEventCreate");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(phEvent, Flags);
-}
-
-CUresult CUDAAPI cuEventRecord(CUevent hEvent, CUstream hStream) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUevent, CUstream);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuEventRecord");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hEvent, hStream);
-}
-
-CUresult CUDAAPI cuEventQuery(CUevent hEvent) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUevent);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuEventQuery");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hEvent);
-}
-
-CUresult CUDAAPI cuEventSynchronize(CUevent hEvent) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUevent);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuEventSynchronize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hEvent);
-}
-
-CUresult CUDAAPI cuEventDestroy(CUevent hEvent) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUevent);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuEventDestroy_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hEvent);
-}
-
-CUresult CUDAAPI cuEventElapsedTime(float *pMilliseconds, CUevent hStart,
-                                    CUevent hEnd) {
-  using FuncPtr = CUresult(CUDAAPI *)(float *, CUevent, CUevent);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuEventElapsedTime");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pMilliseconds, hStart, hEnd);
-}
-
-CUresult CUDAAPI
-cuImportExternalMemory(CUexternalMemory *extMem_out,
-                       const CUDA_EXTERNAL_MEMORY_HANDLE_DESC *memHandleDesc) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUexternalMemory *,
-                                      const CUDA_EXTERNAL_MEMORY_HANDLE_DESC *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuImportExternalMemory");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(extMem_out, memHandleDesc);
-}
-
-CUresult CUDAAPI cuExternalMemoryGetMappedBuffer(
-    CUdeviceptr *devPtr, CUexternalMemory extMem,
-    const CUDA_EXTERNAL_MEMORY_BUFFER_DESC *bufferDesc) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdeviceptr *, CUexternalMemory,
-                                      const CUDA_EXTERNAL_MEMORY_BUFFER_DESC *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuExternalMemoryGetMappedBuffer");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(devPtr, extMem, bufferDesc);
-}
-
-CUresult CUDAAPI cuExternalMemoryGetMappedMipmappedArray(
-    CUmipmappedArray *mipmap, CUexternalMemory extMem,
-    const CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC *mipmapDesc) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUmipmappedArray *, CUexternalMemory,
-                          const CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cuExternalMemoryGetMappedMipmappedArray");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(mipmap, extMem, mipmapDesc);
-}
-
-CUresult CUDAAPI cuDestroyExternalMemory(CUexternalMemory extMem) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUexternalMemory);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuDestroyExternalMemory");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(extMem);
-}
-
-CUresult CUDAAPI cuImportExternalSemaphore(
-    CUexternalSemaphore *extSem_out,
-    const CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC *semHandleDesc) {
-  using FuncPtr = CUresult(CUDAAPI *)(
-      CUexternalSemaphore *, const CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuImportExternalSemaphore");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(extSem_out, semHandleDesc);
-}
-
-CUresult CUDAAPI cuSignalExternalSemaphoresAsync(
-    const CUexternalSemaphore *extSemArray,
-    const CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS *paramsArray,
-    unsigned int numExtSems, CUstream stream) {
-  using FuncPtr = CUresult(CUDAAPI *)(
-      const CUexternalSemaphore *,
-      const CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS *, unsigned int, CUstream);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuSignalExternalSemaphoresAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(extSemArray, paramsArray, numExtSems, stream);
-}
-
-CUresult CUDAAPI cuWaitExternalSemaphoresAsync(
-    const CUexternalSemaphore *extSemArray,
-    const CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS *paramsArray,
-    unsigned int numExtSems, CUstream stream) {
-  using FuncPtr = CUresult(CUDAAPI *)(
-      const CUexternalSemaphore *, const CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS *,
-      unsigned int, CUstream);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuWaitExternalSemaphoresAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(extSemArray, paramsArray, numExtSems, stream);
-}
-
-CUresult CUDAAPI cuDestroyExternalSemaphore(CUexternalSemaphore extSem) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUexternalSemaphore);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuDestroyExternalSemaphore");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(extSem);
-}
-
-CUresult CUDAAPI cuStreamWaitValue32(CUstream stream, CUdeviceptr addr,
-                                     cuuint32_t value, unsigned int flags) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUstream, CUdeviceptr, cuuint32_t, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuStreamWaitValue32");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(stream, addr, value, flags);
-}
-
-CUresult CUDAAPI cuStreamWaitValue64(CUstream stream, CUdeviceptr addr,
-                                     cuuint64_t value, unsigned int flags) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUstream, CUdeviceptr, cuuint64_t, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuStreamWaitValue64");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(stream, addr, value, flags);
-}
-
-CUresult CUDAAPI cuStreamWriteValue32(CUstream stream, CUdeviceptr addr,
-                                      cuuint32_t value, unsigned int flags) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUstream, CUdeviceptr, cuuint32_t, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuStreamWriteValue32");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(stream, addr, value, flags);
-}
-
-CUresult CUDAAPI cuStreamWriteValue64(CUstream stream, CUdeviceptr addr,
-                                      cuuint64_t value, unsigned int flags) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUstream, CUdeviceptr, cuuint64_t, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuStreamWriteValue64");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(stream, addr, value, flags);
-}
-
-CUresult CUDAAPI cuStreamBatchMemOp(CUstream stream, unsigned int count,
-                                    CUstreamBatchMemOpParams *paramArray,
-                                    unsigned int flags) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUstream, unsigned int,
-                                      CUstreamBatchMemOpParams *, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuStreamBatchMemOp");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(stream, count, paramArray, flags);
-}
-
-CUresult CUDAAPI cuFuncGetAttribute(int *pi, CUfunction_attribute attrib,
-                                    CUfunction hfunc) {
-  using FuncPtr = CUresult(CUDAAPI *)(int *, CUfunction_attribute, CUfunction);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuFuncGetAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pi, attrib, hfunc);
-}
-
-CUresult CUDAAPI cuFuncSetAttribute(CUfunction hfunc,
-                                    CUfunction_attribute attrib, int value) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUfunction, CUfunction_attribute, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuFuncSetAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hfunc, attrib, value);
-}
-
-CUresult CUDAAPI cuFuncSetCacheConfig(CUfunction hfunc, CUfunc_cache config) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUfunction, CUfunc_cache);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuFuncSetCacheConfig");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hfunc, config);
-}
-
-CUresult CUDAAPI cuFuncSetSharedMemConfig(CUfunction hfunc,
-                                          CUsharedconfig config) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUfunction, CUsharedconfig);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuFuncSetSharedMemConfig");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hfunc, config);
-}
-
-CUresult CUDAAPI cuLaunchKernel(CUfunction f, unsigned int gridDimX,
-                                unsigned int gridDimY, unsigned int gridDimZ,
-                                unsigned int blockDimX, unsigned int blockDimY,
-                                unsigned int blockDimZ,
-                                unsigned int sharedMemBytes, CUstream hStream,
-                                void **kernelParams, void **extra) {
-  using FuncPtr = CUresult(CUDAAPI *)(
-      CUfunction, unsigned int, unsigned int, unsigned int, unsigned int,
-      unsigned int, unsigned int, unsigned int, CUstream, void **, void **);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuLaunchKernel");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(f, gridDimX, gridDimY, gridDimZ, blockDimX, blockDimY,
-                  blockDimZ, sharedMemBytes, hStream, kernelParams, extra);
-}
-
-CUresult CUDAAPI cuLaunchCooperativeKernel(
-    CUfunction f, unsigned int gridDimX, unsigned int gridDimY,
-    unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY,
-    unsigned int blockDimZ, unsigned int sharedMemBytes, CUstream hStream,
-    void **kernelParams) {
-  using FuncPtr = CUresult(CUDAAPI *)(
-      CUfunction, unsigned int, unsigned int, unsigned int, unsigned int,
-      unsigned int, unsigned int, unsigned int, CUstream, void **);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuLaunchCooperativeKernel");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(f, gridDimX, gridDimY, gridDimZ, blockDimX, blockDimY,
-                  blockDimZ, sharedMemBytes, hStream, kernelParams);
-}
-
-CUresult CUDAAPI cuLaunchCooperativeKernelMultiDevice(
-    CUDA_LAUNCH_PARAMS *launchParamsList, unsigned int numDevices,
-    unsigned int flags) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUDA_LAUNCH_PARAMS *, unsigned int, unsigned int);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cuLaunchCooperativeKernelMultiDevice");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(launchParamsList, numDevices, flags);
-}
-
-CUresult CUDAAPI cuLaunchHostFunc(CUstream hStream, CUhostFn fn,
-                                  void *userData) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUstream, CUhostFn, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuLaunchHostFunc");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hStream, fn, userData);
-}
-
-__CUDA_DEPRECATED CUresult CUDAAPI cuFuncSetBlockShape(CUfunction hfunc, int x,
-                                                       int y, int z) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUfunction, int, int, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuFuncSetBlockShape");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hfunc, x, y, z);
-}
-
-__CUDA_DEPRECATED CUresult CUDAAPI cuFuncSetSharedSize(CUfunction hfunc,
-                                                       unsigned int bytes) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUfunction, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuFuncSetSharedSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hfunc, bytes);
-}
-
-__CUDA_DEPRECATED CUresult CUDAAPI cuParamSetSize(CUfunction hfunc,
-                                                  unsigned int numbytes) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUfunction, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuParamSetSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hfunc, numbytes);
-}
-
-__CUDA_DEPRECATED CUresult CUDAAPI cuParamSeti(CUfunction hfunc, int offset,
-                                               unsigned int value) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUfunction, int, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuParamSeti");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hfunc, offset, value);
-}
-
-__CUDA_DEPRECATED CUresult CUDAAPI cuParamSetf(CUfunction hfunc, int offset,
-                                               float value) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUfunction, int, float);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuParamSetf");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hfunc, offset, value);
-}
-
-__CUDA_DEPRECATED CUresult CUDAAPI cuParamSetv(CUfunction hfunc, int offset,
-                                               void *ptr,
-                                               unsigned int numbytes) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUfunction, int, void *, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuParamSetv");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hfunc, offset, ptr, numbytes);
-}
-
-__CUDA_DEPRECATED CUresult CUDAAPI cuLaunch(CUfunction f) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUfunction);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuLaunch");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(f);
-}
-
-__CUDA_DEPRECATED CUresult CUDAAPI cuLaunchGrid(CUfunction f, int grid_width,
-                                                int grid_height) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUfunction, int, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuLaunchGrid");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(f, grid_width, grid_height);
-}
-
-__CUDA_DEPRECATED CUresult CUDAAPI cuLaunchGridAsync(CUfunction f,
-                                                     int grid_width,
-                                                     int grid_height,
-                                                     CUstream hStream) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUfunction, int, int, CUstream);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuLaunchGridAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(f, grid_width, grid_height, hStream);
-}
-
-__CUDA_DEPRECATED CUresult CUDAAPI cuParamSetTexRef(CUfunction hfunc,
-                                                    int texunit,
-                                                    CUtexref hTexRef) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUfunction, int, CUtexref);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuParamSetTexRef");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hfunc, texunit, hTexRef);
-}
-
-CUresult CUDAAPI cuGraphCreate(CUgraph *phGraph, unsigned int flags) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraph *, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphCreate");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(phGraph, flags);
-}
-
-CUresult CUDAAPI cuGraphAddKernelNode(
-    CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies,
-    size_t numDependencies, const CUDA_KERNEL_NODE_PARAMS *nodeParams) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUgraphNode *, CUgraph, const CUgraphNode *, size_t,
-                          const CUDA_KERNEL_NODE_PARAMS *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphAddKernelNode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(phGraphNode, hGraph, dependencies, numDependencies,
-                  nodeParams);
-}
-
-CUresult CUDAAPI cuGraphKernelNodeGetParams(
-    CUgraphNode hNode, CUDA_KERNEL_NODE_PARAMS *nodeParams) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraphNode, CUDA_KERNEL_NODE_PARAMS *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphKernelNodeGetParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hNode, nodeParams);
-}
-
-CUresult CUDAAPI cuGraphKernelNodeSetParams(
-    CUgraphNode hNode, const CUDA_KERNEL_NODE_PARAMS *nodeParams) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUgraphNode, const CUDA_KERNEL_NODE_PARAMS *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphKernelNodeSetParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hNode, nodeParams);
-}
-
-CUresult CUDAAPI cuGraphAddMemcpyNode(CUgraphNode *phGraphNode, CUgraph hGraph,
-                                      const CUgraphNode *dependencies,
-                                      size_t numDependencies,
-                                      const CUDA_MEMCPY3D *copyParams,
-                                      CUcontext ctx) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUgraphNode *, CUgraph, const CUgraphNode *, size_t,
-                          const CUDA_MEMCPY3D *, CUcontext);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphAddMemcpyNode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(phGraphNode, hGraph, dependencies, numDependencies,
-                  copyParams, ctx);
-}
-
-CUresult CUDAAPI cuGraphMemcpyNodeGetParams(CUgraphNode hNode,
-                                            CUDA_MEMCPY3D *nodeParams) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraphNode, CUDA_MEMCPY3D *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphMemcpyNodeGetParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hNode, nodeParams);
-}
-
-CUresult CUDAAPI cuGraphMemcpyNodeSetParams(CUgraphNode hNode,
-                                            const CUDA_MEMCPY3D *nodeParams) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraphNode, const CUDA_MEMCPY3D *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphMemcpyNodeSetParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hNode, nodeParams);
-}
-
-CUresult CUDAAPI cuGraphAddMemsetNode(
-    CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies,
-    size_t numDependencies, const CUDA_MEMSET_NODE_PARAMS *memsetParams,
-    CUcontext ctx) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUgraphNode *, CUgraph, const CUgraphNode *, size_t,
-                          const CUDA_MEMSET_NODE_PARAMS *, CUcontext);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphAddMemsetNode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(phGraphNode, hGraph, dependencies, numDependencies,
-                  memsetParams, ctx);
-}
-
-CUresult CUDAAPI cuGraphMemsetNodeGetParams(
-    CUgraphNode hNode, CUDA_MEMSET_NODE_PARAMS *nodeParams) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraphNode, CUDA_MEMSET_NODE_PARAMS *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphMemsetNodeGetParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hNode, nodeParams);
-}
-
-CUresult CUDAAPI cuGraphMemsetNodeSetParams(
-    CUgraphNode hNode, const CUDA_MEMSET_NODE_PARAMS *nodeParams) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUgraphNode, const CUDA_MEMSET_NODE_PARAMS *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphMemsetNodeSetParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hNode, nodeParams);
-}
-
-CUresult CUDAAPI cuGraphAddHostNode(CUgraphNode *phGraphNode, CUgraph hGraph,
-                                    const CUgraphNode *dependencies,
-                                    size_t numDependencies,
-                                    const CUDA_HOST_NODE_PARAMS *nodeParams) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUgraphNode *, CUgraph, const CUgraphNode *, size_t,
-                          const CUDA_HOST_NODE_PARAMS *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphAddHostNode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(phGraphNode, hGraph, dependencies, numDependencies,
-                  nodeParams);
-}
-
-CUresult CUDAAPI cuGraphHostNodeGetParams(CUgraphNode hNode,
-                                          CUDA_HOST_NODE_PARAMS *nodeParams) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraphNode, CUDA_HOST_NODE_PARAMS *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphHostNodeGetParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hNode, nodeParams);
-}
-
-CUresult CUDAAPI cuGraphHostNodeSetParams(
-    CUgraphNode hNode, const CUDA_HOST_NODE_PARAMS *nodeParams) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUgraphNode, const CUDA_HOST_NODE_PARAMS *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphHostNodeSetParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hNode, nodeParams);
-}
-
-CUresult CUDAAPI cuGraphAddChildGraphNode(CUgraphNode *phGraphNode,
-                                          CUgraph hGraph,
-                                          const CUgraphNode *dependencies,
-                                          size_t numDependencies,
-                                          CUgraph childGraph) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraphNode *, CUgraph,
-                                      const CUgraphNode *, size_t, CUgraph);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphAddChildGraphNode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(phGraphNode, hGraph, dependencies, numDependencies,
-                  childGraph);
-}
-
-CUresult CUDAAPI cuGraphChildGraphNodeGetGraph(CUgraphNode hNode,
-                                               CUgraph *phGraph) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraphNode, CUgraph *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphChildGraphNodeGetGraph");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hNode, phGraph);
-}
-
-CUresult CUDAAPI cuGraphAddEmptyNode(CUgraphNode *phGraphNode, CUgraph hGraph,
-                                     const CUgraphNode *dependencies,
-                                     size_t numDependencies) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUgraphNode *, CUgraph, const CUgraphNode *, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphAddEmptyNode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(phGraphNode, hGraph, dependencies, numDependencies);
-}
-
-CUresult CUDAAPI cuGraphClone(CUgraph *phGraphClone, CUgraph originalGraph) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraph *, CUgraph);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphClone");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(phGraphClone, originalGraph);
-}
-
-CUresult CUDAAPI cuGraphNodeFindInClone(CUgraphNode *phNode,
-                                        CUgraphNode hOriginalNode,
-                                        CUgraph hClonedGraph) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraphNode *, CUgraphNode, CUgraph);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphNodeFindInClone");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(phNode, hOriginalNode, hClonedGraph);
-}
-
-CUresult CUDAAPI cuGraphNodeGetType(CUgraphNode hNode, CUgraphNodeType *type) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraphNode, CUgraphNodeType *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphNodeGetType");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hNode, type);
-}
-
-CUresult CUDAAPI cuGraphGetNodes(CUgraph hGraph, CUgraphNode *nodes,
-                                 size_t *numNodes) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraph, CUgraphNode *, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphGetNodes");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hGraph, nodes, numNodes);
-}
-
-CUresult CUDAAPI cuGraphGetRootNodes(CUgraph hGraph, CUgraphNode *rootNodes,
-                                     size_t *numRootNodes) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraph, CUgraphNode *, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphGetRootNodes");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hGraph, rootNodes, numRootNodes);
-}
-
-CUresult CUDAAPI cuGraphGetEdges(CUgraph hGraph, CUgraphNode *from,
-                                 CUgraphNode *to, size_t *numEdges) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUgraph, CUgraphNode *, CUgraphNode *, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphGetEdges");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hGraph, from, to, numEdges);
-}
-
-CUresult CUDAAPI cuGraphNodeGetDependencies(CUgraphNode hNode,
-                                            CUgraphNode *dependencies,
-                                            size_t *numDependencies) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraphNode, CUgraphNode *, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphNodeGetDependencies");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hNode, dependencies, numDependencies);
-}
-
-CUresult CUDAAPI cuGraphNodeGetDependentNodes(CUgraphNode hNode,
-                                              CUgraphNode *dependentNodes,
-                                              size_t *numDependentNodes) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraphNode, CUgraphNode *, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphNodeGetDependentNodes");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hNode, dependentNodes, numDependentNodes);
-}
-
-CUresult CUDAAPI cuGraphAddDependencies(CUgraph hGraph, const CUgraphNode *from,
-                                        const CUgraphNode *to,
-                                        size_t numDependencies) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraph, const CUgraphNode *,
-                                      const CUgraphNode *, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphAddDependencies");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hGraph, from, to, numDependencies);
-}
-
-CUresult CUDAAPI cuGraphRemoveDependencies(CUgraph hGraph,
-                                           const CUgraphNode *from,
-                                           const CUgraphNode *to,
-                                           size_t numDependencies) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraph, const CUgraphNode *,
-                                      const CUgraphNode *, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphRemoveDependencies");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hGraph, from, to, numDependencies);
-}
-
-CUresult CUDAAPI cuGraphDestroyNode(CUgraphNode hNode) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraphNode);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphDestroyNode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hNode);
-}
-
-CUresult CUDAAPI cuGraphInstantiate(CUgraphExec *phGraphExec, CUgraph hGraph,
-                                    CUgraphNode *phErrorNode, char *logBuffer,
-                                    size_t bufferSize) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraphExec *, CUgraph, CUgraphNode *,
-                                      char *, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphInstantiate");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(phGraphExec, hGraph, phErrorNode, logBuffer, bufferSize);
-}
-
-CUresult CUDAAPI
-cuGraphExecKernelNodeSetParams(CUgraphExec hGraphExec, CUgraphNode hNode,
-                               const CUDA_KERNEL_NODE_PARAMS *nodeParams) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraphExec, CUgraphNode,
-                                      const CUDA_KERNEL_NODE_PARAMS *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphExecKernelNodeSetParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hGraphExec, hNode, nodeParams);
-}
-
-CUresult CUDAAPI cuGraphExecMemcpyNodeSetParams(CUgraphExec hGraphExec,
-                                                CUgraphNode hNode,
-                                                const CUDA_MEMCPY3D *copyParams,
-                                                CUcontext ctx) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraphExec, CUgraphNode,
-                                      const CUDA_MEMCPY3D *, CUcontext);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphExecMemcpyNodeSetParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hGraphExec, hNode, copyParams, ctx);
-}
-
-CUresult CUDAAPI cuGraphExecMemsetNodeSetParams(
-    CUgraphExec hGraphExec, CUgraphNode hNode,
-    const CUDA_MEMSET_NODE_PARAMS *memsetParams, CUcontext ctx) {
-  using FuncPtr = CUresult(CUDAAPI *)(
-      CUgraphExec, CUgraphNode, const CUDA_MEMSET_NODE_PARAMS *, CUcontext);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphExecMemsetNodeSetParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hGraphExec, hNode, memsetParams, ctx);
-}
-
-CUresult CUDAAPI
-cuGraphExecHostNodeSetParams(CUgraphExec hGraphExec, CUgraphNode hNode,
-                             const CUDA_HOST_NODE_PARAMS *nodeParams) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraphExec, CUgraphNode,
-                                      const CUDA_HOST_NODE_PARAMS *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphExecHostNodeSetParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hGraphExec, hNode, nodeParams);
-}
-
-CUresult CUDAAPI cuGraphLaunch(CUgraphExec hGraphExec, CUstream hStream) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraphExec, CUstream);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphLaunch");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hGraphExec, hStream);
-}
-
-CUresult CUDAAPI cuGraphExecDestroy(CUgraphExec hGraphExec) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraphExec);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphExecDestroy");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hGraphExec);
-}
-
-CUresult CUDAAPI cuGraphDestroy(CUgraph hGraph) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraph);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphDestroy");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hGraph);
-}
-
-CUresult CUDAAPI cuGraphExecUpdate(CUgraphExec hGraphExec, CUgraph hGraph,
-                                   CUgraphNode *hErrorNode_out,
-                                   CUgraphExecUpdateResult *updateResult_out) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraphExec, CUgraph, CUgraphNode *,
-                                      CUgraphExecUpdateResult *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphExecUpdate");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hGraphExec, hGraph, hErrorNode_out, updateResult_out);
-}
-
-CUresult CUDAAPI cuOccupancyMaxActiveBlocksPerMultiprocessor(
-    int *numBlocks, CUfunction func, int blockSize, size_t dynamicSMemSize) {
-  using FuncPtr = CUresult(CUDAAPI *)(int *, CUfunction, int, size_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cuOccupancyMaxActiveBlocksPerMultiprocessor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(numBlocks, func, blockSize, dynamicSMemSize);
-}
-
-CUresult CUDAAPI cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(
-    int *numBlocks, CUfunction func, int blockSize, size_t dynamicSMemSize,
-    unsigned int flags) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(int *, CUfunction, int, size_t, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>(
-      "cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(numBlocks, func, blockSize, dynamicSMemSize, flags);
-}
-
-CUresult CUDAAPI cuOccupancyMaxPotentialBlockSize(
-    int *minGridSize, int *blockSize, CUfunction func,
-    CUoccupancyB2DSize blockSizeToDynamicSMemSize, size_t dynamicSMemSize,
-    int blockSizeLimit) {
-  using FuncPtr = CUresult(CUDAAPI *)(int *, int *, CUfunction,
-                                      CUoccupancyB2DSize, size_t, int);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cuOccupancyMaxPotentialBlockSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(minGridSize, blockSize, func, blockSizeToDynamicSMemSize,
-                  dynamicSMemSize, blockSizeLimit);
-}
-
-CUresult CUDAAPI cuOccupancyMaxPotentialBlockSizeWithFlags(
-    int *minGridSize, int *blockSize, CUfunction func,
-    CUoccupancyB2DSize blockSizeToDynamicSMemSize, size_t dynamicSMemSize,
-    int blockSizeLimit, unsigned int flags) {
-  using FuncPtr = CUresult(CUDAAPI *)(
-      int *, int *, CUfunction, CUoccupancyB2DSize, size_t, int, unsigned int);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cuOccupancyMaxPotentialBlockSizeWithFlags");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(minGridSize, blockSize, func, blockSizeToDynamicSMemSize,
-                  dynamicSMemSize, blockSizeLimit, flags);
-}
-
-CUresult CUDAAPI cuTexRefSetArray(CUtexref hTexRef, CUarray hArray,
-                                  unsigned int Flags) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUtexref, CUarray, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexRefSetArray");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hTexRef, hArray, Flags);
-}
-
-CUresult CUDAAPI cuTexRefSetMipmappedArray(CUtexref hTexRef,
-                                           CUmipmappedArray hMipmappedArray,
-                                           unsigned int Flags) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUtexref, CUmipmappedArray, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexRefSetMipmappedArray");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hTexRef, hMipmappedArray, Flags);
-}
-
-CUresult CUDAAPI cuTexRefSetAddress(size_t *ByteOffset, CUtexref hTexRef,
-                                    CUdeviceptr dptr, size_t bytes) {
-  using FuncPtr = CUresult(CUDAAPI *)(size_t *, CUtexref, CUdeviceptr, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexRefSetAddress_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(ByteOffset, hTexRef, dptr, bytes);
-}
-
-CUresult CUDAAPI cuTexRefSetAddress2D(CUtexref hTexRef,
-                                      const CUDA_ARRAY_DESCRIPTOR *desc,
-                                      CUdeviceptr dptr, size_t Pitch) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUtexref, const CUDA_ARRAY_DESCRIPTOR *,
-                                      CUdeviceptr, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexRefSetAddress2D_v3");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hTexRef, desc, dptr, Pitch);
-}
-
-CUresult CUDAAPI cuTexRefSetFormat(CUtexref hTexRef, CUarray_format fmt,
-                                   int NumPackedComponents) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUtexref, CUarray_format, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexRefSetFormat");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hTexRef, fmt, NumPackedComponents);
-}
-
-CUresult CUDAAPI cuTexRefSetAddressMode(CUtexref hTexRef, int dim,
-                                        CUaddress_mode am) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUtexref, int, CUaddress_mode);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexRefSetAddressMode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hTexRef, dim, am);
-}
-
-CUresult CUDAAPI cuTexRefSetFilterMode(CUtexref hTexRef, CUfilter_mode fm) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUtexref, CUfilter_mode);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexRefSetFilterMode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hTexRef, fm);
-}
-
-CUresult CUDAAPI cuTexRefSetMipmapFilterMode(CUtexref hTexRef,
-                                             CUfilter_mode fm) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUtexref, CUfilter_mode);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexRefSetMipmapFilterMode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hTexRef, fm);
-}
-
-CUresult CUDAAPI cuTexRefSetMipmapLevelBias(CUtexref hTexRef, float bias) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUtexref, float);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexRefSetMipmapLevelBias");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hTexRef, bias);
-}
-
-CUresult CUDAAPI cuTexRefSetMipmapLevelClamp(CUtexref hTexRef,
-                                             float minMipmapLevelClamp,
-                                             float maxMipmapLevelClamp) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUtexref, float, float);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexRefSetMipmapLevelClamp");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hTexRef, minMipmapLevelClamp, maxMipmapLevelClamp);
-}
-
-CUresult CUDAAPI cuTexRefSetMaxAnisotropy(CUtexref hTexRef,
-                                          unsigned int maxAniso) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUtexref, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexRefSetMaxAnisotropy");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hTexRef, maxAniso);
-}
-
-CUresult CUDAAPI cuTexRefSetBorderColor(CUtexref hTexRef, float *pBorderColor) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUtexref, float *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexRefSetBorderColor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hTexRef, pBorderColor);
-}
-
-CUresult CUDAAPI cuTexRefSetFlags(CUtexref hTexRef, unsigned int Flags) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUtexref, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexRefSetFlags");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hTexRef, Flags);
-}
-
-CUresult CUDAAPI cuTexRefGetAddress(CUdeviceptr *pdptr, CUtexref hTexRef) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdeviceptr *, CUtexref);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexRefGetAddress_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pdptr, hTexRef);
-}
-
-CUresult CUDAAPI cuTexRefGetArray(CUarray *phArray, CUtexref hTexRef) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUarray *, CUtexref);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexRefGetArray");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(phArray, hTexRef);
-}
-
-CUresult CUDAAPI cuTexRefGetMipmappedArray(CUmipmappedArray *phMipmappedArray,
-                                           CUtexref hTexRef) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUmipmappedArray *, CUtexref);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexRefGetMipmappedArray");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(phMipmappedArray, hTexRef);
-}
-
-CUresult CUDAAPI cuTexRefGetAddressMode(CUaddress_mode *pam, CUtexref hTexRef,
-                                        int dim) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUaddress_mode *, CUtexref, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexRefGetAddressMode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pam, hTexRef, dim);
-}
-
-CUresult CUDAAPI cuTexRefGetFilterMode(CUfilter_mode *pfm, CUtexref hTexRef) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUfilter_mode *, CUtexref);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexRefGetFilterMode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pfm, hTexRef);
-}
-
-CUresult CUDAAPI cuTexRefGetFormat(CUarray_format *pFormat, int *pNumChannels,
-                                   CUtexref hTexRef) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUarray_format *, int *, CUtexref);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexRefGetFormat");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pFormat, pNumChannels, hTexRef);
-}
-
-CUresult CUDAAPI cuTexRefGetMipmapFilterMode(CUfilter_mode *pfm,
-                                             CUtexref hTexRef) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUfilter_mode *, CUtexref);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexRefGetMipmapFilterMode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pfm, hTexRef);
-}
-
-CUresult CUDAAPI cuTexRefGetMipmapLevelBias(float *pbias, CUtexref hTexRef) {
-  using FuncPtr = CUresult(CUDAAPI *)(float *, CUtexref);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexRefGetMipmapLevelBias");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pbias, hTexRef);
-}
-
-CUresult CUDAAPI cuTexRefGetMipmapLevelClamp(float *pminMipmapLevelClamp,
-                                             float *pmaxMipmapLevelClamp,
-                                             CUtexref hTexRef) {
-  using FuncPtr = CUresult(CUDAAPI *)(float *, float *, CUtexref);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexRefGetMipmapLevelClamp");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pminMipmapLevelClamp, pmaxMipmapLevelClamp, hTexRef);
-}
-
-CUresult CUDAAPI cuTexRefGetMaxAnisotropy(int *pmaxAniso, CUtexref hTexRef) {
-  using FuncPtr = CUresult(CUDAAPI *)(int *, CUtexref);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexRefGetMaxAnisotropy");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pmaxAniso, hTexRef);
-}
-
-CUresult CUDAAPI cuTexRefGetBorderColor(float *pBorderColor, CUtexref hTexRef) {
-  using FuncPtr = CUresult(CUDAAPI *)(float *, CUtexref);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexRefGetBorderColor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pBorderColor, hTexRef);
-}
-
-CUresult CUDAAPI cuTexRefGetFlags(unsigned int *pFlags, CUtexref hTexRef) {
-  using FuncPtr = CUresult(CUDAAPI *)(unsigned int *, CUtexref);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexRefGetFlags");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pFlags, hTexRef);
-}
-
-CUresult CUDAAPI cuTexRefCreate(CUtexref *pTexRef) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUtexref *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexRefCreate");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pTexRef);
-}
-
-CUresult CUDAAPI cuTexRefDestroy(CUtexref hTexRef) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUtexref);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexRefDestroy");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hTexRef);
-}
-
-CUresult CUDAAPI cuSurfRefSetArray(CUsurfref hSurfRef, CUarray hArray,
-                                   unsigned int Flags) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUsurfref, CUarray, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuSurfRefSetArray");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hSurfRef, hArray, Flags);
-}
-
-CUresult CUDAAPI cuSurfRefGetArray(CUarray *phArray, CUsurfref hSurfRef) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUarray *, CUsurfref);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuSurfRefGetArray");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(phArray, hSurfRef);
-}
-
-CUresult CUDAAPI
-cuTexObjectCreate(CUtexObject *pTexObject, const CUDA_RESOURCE_DESC *pResDesc,
-                  const CUDA_TEXTURE_DESC *pTexDesc,
-                  const CUDA_RESOURCE_VIEW_DESC *pResViewDesc) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUtexObject *, const CUDA_RESOURCE_DESC *,
-                                      const CUDA_TEXTURE_DESC *,
-                                      const CUDA_RESOURCE_VIEW_DESC *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexObjectCreate");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pTexObject, pResDesc, pTexDesc, pResViewDesc);
-}
-
-CUresult CUDAAPI cuTexObjectDestroy(CUtexObject texObject) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUtexObject);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexObjectDestroy");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(texObject);
-}
-
-CUresult CUDAAPI cuTexObjectGetResourceDesc(CUDA_RESOURCE_DESC *pResDesc,
-                                            CUtexObject texObject) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUDA_RESOURCE_DESC *, CUtexObject);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexObjectGetResourceDesc");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pResDesc, texObject);
-}
-
-CUresult CUDAAPI cuTexObjectGetTextureDesc(CUDA_TEXTURE_DESC *pTexDesc,
-                                           CUtexObject texObject) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUDA_TEXTURE_DESC *, CUtexObject);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexObjectGetTextureDesc");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pTexDesc, texObject);
-}
-
-CUresult CUDAAPI cuTexObjectGetResourceViewDesc(
-    CUDA_RESOURCE_VIEW_DESC *pResViewDesc, CUtexObject texObject) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUDA_RESOURCE_VIEW_DESC *, CUtexObject);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexObjectGetResourceViewDesc");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pResViewDesc, texObject);
-}
-
-CUresult CUDAAPI cuSurfObjectCreate(CUsurfObject *pSurfObject,
-                                    const CUDA_RESOURCE_DESC *pResDesc) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUsurfObject *, const CUDA_RESOURCE_DESC *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuSurfObjectCreate");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pSurfObject, pResDesc);
-}
-
-CUresult CUDAAPI cuSurfObjectDestroy(CUsurfObject surfObject) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUsurfObject);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuSurfObjectDestroy");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(surfObject);
-}
-
-CUresult CUDAAPI cuSurfObjectGetResourceDesc(CUDA_RESOURCE_DESC *pResDesc,
-                                             CUsurfObject surfObject) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUDA_RESOURCE_DESC *, CUsurfObject);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuSurfObjectGetResourceDesc");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pResDesc, surfObject);
-}
-
-CUresult CUDAAPI cuDeviceCanAccessPeer(int *canAccessPeer, CUdevice dev,
-                                       CUdevice peerDev) {
-  using FuncPtr = CUresult(CUDAAPI *)(int *, CUdevice, CUdevice);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuDeviceCanAccessPeer");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(canAccessPeer, dev, peerDev);
-}
-
-CUresult CUDAAPI cuCtxEnablePeerAccess(CUcontext peerContext,
-                                       unsigned int Flags) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUcontext, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuCtxEnablePeerAccess");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(peerContext, Flags);
-}
-
-CUresult CUDAAPI cuCtxDisablePeerAccess(CUcontext peerContext) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUcontext);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuCtxDisablePeerAccess");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(peerContext);
-}
-
-CUresult CUDAAPI cuDeviceGetP2PAttribute(int *value,
-                                         CUdevice_P2PAttribute attrib,
-                                         CUdevice srcDevice,
-                                         CUdevice dstDevice) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(int *, CUdevice_P2PAttribute, CUdevice, CUdevice);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuDeviceGetP2PAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(value, attrib, srcDevice, dstDevice);
-}
-
-CUresult CUDAAPI cuGraphicsUnregisterResource(CUgraphicsResource resource) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraphicsResource);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphicsUnregisterResource");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(resource);
-}
-
-CUresult CUDAAPI cuGraphicsSubResourceGetMappedArray(
-    CUarray *pArray, CUgraphicsResource resource, unsigned int arrayIndex,
-    unsigned int mipLevel) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUarray *, CUgraphicsResource,
-                                      unsigned int, unsigned int);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cuGraphicsSubResourceGetMappedArray");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pArray, resource, arrayIndex, mipLevel);
-}
-
-CUresult CUDAAPI cuGraphicsResourceGetMappedMipmappedArray(
-    CUmipmappedArray *pMipmappedArray, CUgraphicsResource resource) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUmipmappedArray *, CUgraphicsResource);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cuGraphicsResourceGetMappedMipmappedArray");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pMipmappedArray, resource);
-}
-
-CUresult CUDAAPI cuGraphicsResourceGetMappedPointer(
-    CUdeviceptr *pDevPtr, size_t *pSize, CUgraphicsResource resource) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUdeviceptr *, size_t *, CUgraphicsResource);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cuGraphicsResourceGetMappedPointer_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pDevPtr, pSize, resource);
-}
-
-CUresult CUDAAPI cuGraphicsResourceSetMapFlags(CUgraphicsResource resource,
-                                               unsigned int flags) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraphicsResource, unsigned int);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cuGraphicsResourceSetMapFlags_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(resource, flags);
-}
-
-CUresult CUDAAPI cuGraphicsMapResources(unsigned int count,
-                                        CUgraphicsResource *resources,
-                                        CUstream hStream) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(unsigned int, CUgraphicsResource *, CUstream);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphicsMapResources");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(count, resources, hStream);
-}
-
-CUresult CUDAAPI cuGraphicsUnmapResources(unsigned int count,
-                                          CUgraphicsResource *resources,
-                                          CUstream hStream) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(unsigned int, CUgraphicsResource *, CUstream);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphicsUnmapResources");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(count, resources, hStream);
-}
-
-CUresult CUDAAPI cuGetExportTable(const void **ppExportTable,
-                                  const CUuuid *pExportTableId) {
-  using FuncPtr = CUresult(CUDAAPI *)(const void **, const CUuuid *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGetExportTable");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(ppExportTable, pExportTableId);
-}
-
-}  // extern "C"
diff --git a/third_party/xla/third_party/tsl/tsl/cuda/cuda_11_0.inc b/third_party/xla/third_party/tsl/tsl/cuda/cuda_11_0.inc
deleted file mode 100644
index 8fb62f8bf5d8b6..00000000000000
--- a/third_party/xla/third_party/tsl/tsl/cuda/cuda_11_0.inc
+++ /dev/null
@@ -1,2943 +0,0 @@
-// Auto-generated, do not edit.
-
-extern "C" {
-
-CUresult CUDAAPI cuGetErrorString(CUresult error, const char **pStr) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUresult, const char **);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGetErrorString");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(error, pStr);
-}
-
-CUresult CUDAAPI cuGetErrorName(CUresult error, const char **pStr) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUresult, const char **);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGetErrorName");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(error, pStr);
-}
-
-CUresult CUDAAPI cuInit(unsigned int Flags) {
-  using FuncPtr = CUresult(CUDAAPI *)(unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuInit");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(Flags);
-}
-
-CUresult CUDAAPI cuDriverGetVersion(int *driverVersion) {
-  using FuncPtr = CUresult(CUDAAPI *)(int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuDriverGetVersion");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(driverVersion);
-}
-
-CUresult CUDAAPI cuDeviceGet(CUdevice *device, int ordinal) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdevice *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuDeviceGet");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(device, ordinal);
-}
-
-CUresult CUDAAPI cuDeviceGetCount(int *count) {
-  using FuncPtr = CUresult(CUDAAPI *)(int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuDeviceGetCount");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(count);
-}
-
-CUresult CUDAAPI cuDeviceGetName(char *name, int len, CUdevice dev) {
-  using FuncPtr = CUresult(CUDAAPI *)(char *, int, CUdevice);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuDeviceGetName");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(name, len, dev);
-}
-
-CUresult CUDAAPI cuDeviceGetUuid(CUuuid *uuid, CUdevice dev) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUuuid *, CUdevice);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuDeviceGetUuid");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(uuid, dev);
-}
-
-CUresult CUDAAPI cuDeviceGetLuid(char *luid, unsigned int *deviceNodeMask,
-                                 CUdevice dev) {
-  using FuncPtr = CUresult(CUDAAPI *)(char *, unsigned int *, CUdevice);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuDeviceGetLuid");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(luid, deviceNodeMask, dev);
-}
-
-CUresult CUDAAPI cuDeviceTotalMem(size_t *bytes, CUdevice dev) {
-  using FuncPtr = CUresult(CUDAAPI *)(size_t *, CUdevice);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuDeviceTotalMem_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(bytes, dev);
-}
-
-CUresult CUDAAPI cuDeviceGetTexture1DLinearMaxWidth(size_t *maxWidthInElements,
-                                                    CUarray_format format,
-                                                    unsigned numChannels,
-                                                    CUdevice dev) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(size_t *, CUarray_format, unsigned int, CUdevice);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cuDeviceGetTexture1DLinearMaxWidth");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(maxWidthInElements, format, numChannels, dev);
-}
-
-CUresult CUDAAPI cuDeviceGetAttribute(int *pi, CUdevice_attribute attrib,
-                                      CUdevice dev) {
-  using FuncPtr = CUresult(CUDAAPI *)(int *, CUdevice_attribute, CUdevice);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuDeviceGetAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pi, attrib, dev);
-}
-
-CUresult CUDAAPI cuDeviceGetNvSciSyncAttributes(void *nvSciSyncAttrList,
-                                                CUdevice dev, int flags) {
-  using FuncPtr = CUresult(CUDAAPI *)(void *, CUdevice, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuDeviceGetNvSciSyncAttributes");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(nvSciSyncAttrList, dev, flags);
-}
-
-#if CUDA_VERSION >= 11020
-
-CUresult CUDAAPI cuDeviceSetMemPool(CUdevice dev, CUmemoryPool pool) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdevice, CUmemoryPool);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuDeviceSetMemPool");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dev, pool);
-}
-
-CUresult CUDAAPI cuDeviceGetMemPool(CUmemoryPool *pool, CUdevice dev) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUmemoryPool *, CUdevice);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuDeviceGetMemPool");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pool, dev);
-}
-
-CUresult CUDAAPI cuDeviceGetDefaultMemPool(CUmemoryPool *pool_out,
-                                           CUdevice dev) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUmemoryPool *, CUdevice);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuDeviceGetDefaultMemPool");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pool_out, dev);
-}
-
-#endif  // CUDA_VERSION >= 11020
-
-__CUDA_DEPRECATED CUresult CUDAAPI cuDeviceGetProperties(CUdevprop *prop,
-                                                         CUdevice dev) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdevprop *, CUdevice);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuDeviceGetProperties");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(prop, dev);
-}
-
-__CUDA_DEPRECATED CUresult CUDAAPI cuDeviceComputeCapability(int *major,
-                                                             int *minor,
-                                                             CUdevice dev) {
-  using FuncPtr = CUresult(CUDAAPI *)(int *, int *, CUdevice);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuDeviceComputeCapability");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(major, minor, dev);
-}
-
-CUresult CUDAAPI cuDevicePrimaryCtxRetain(CUcontext *pctx, CUdevice dev) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUcontext *, CUdevice);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuDevicePrimaryCtxRetain");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pctx, dev);
-}
-
-CUresult CUDAAPI cuDevicePrimaryCtxRelease(CUdevice dev) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdevice);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuDevicePrimaryCtxRelease_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dev);
-}
-
-CUresult CUDAAPI cuDevicePrimaryCtxSetFlags(CUdevice dev, unsigned int flags) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdevice, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuDevicePrimaryCtxSetFlags_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dev, flags);
-}
-
-CUresult CUDAAPI cuDevicePrimaryCtxGetState(CUdevice dev, unsigned int *flags,
-                                            int *active) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdevice, unsigned int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuDevicePrimaryCtxGetState");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dev, flags, active);
-}
-
-CUresult CUDAAPI cuDevicePrimaryCtxReset(CUdevice dev) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdevice);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuDevicePrimaryCtxReset_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dev);
-}
-
-CUresult CUDAAPI cuCtxCreate(CUcontext *pctx, unsigned int flags,
-                             CUdevice dev) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUcontext *, unsigned int, CUdevice);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuCtxCreate_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pctx, flags, dev);
-}
-
-CUresult CUDAAPI cuCtxDestroy(CUcontext ctx) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUcontext);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuCtxDestroy_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(ctx);
-}
-
-CUresult CUDAAPI cuCtxPushCurrent(CUcontext ctx) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUcontext);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuCtxPushCurrent_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(ctx);
-}
-
-CUresult CUDAAPI cuCtxPopCurrent(CUcontext *pctx) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUcontext *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuCtxPopCurrent_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pctx);
-}
-
-CUresult CUDAAPI cuCtxSetCurrent(CUcontext ctx) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUcontext);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuCtxSetCurrent");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(ctx);
-}
-
-CUresult CUDAAPI cuCtxGetCurrent(CUcontext *pctx) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUcontext *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuCtxGetCurrent");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pctx);
-}
-
-CUresult CUDAAPI cuCtxGetDevice(CUdevice *device) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdevice *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuCtxGetDevice");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(device);
-}
-
-CUresult CUDAAPI cuCtxGetFlags(unsigned int *flags) {
-  using FuncPtr = CUresult(CUDAAPI *)(unsigned int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuCtxGetFlags");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(flags);
-}
-
-CUresult CUDAAPI cuCtxSynchronize(void) {
-  using FuncPtr = CUresult(CUDAAPI *)();
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuCtxSynchronize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr();
-}
-
-CUresult CUDAAPI cuCtxSetLimit(CUlimit limit, size_t value) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUlimit, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuCtxSetLimit");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(limit, value);
-}
-
-CUresult CUDAAPI cuCtxGetLimit(size_t *pvalue, CUlimit limit) {
-  using FuncPtr = CUresult(CUDAAPI *)(size_t *, CUlimit);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuCtxGetLimit");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pvalue, limit);
-}
-
-CUresult CUDAAPI cuCtxGetCacheConfig(CUfunc_cache *pconfig) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUfunc_cache *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuCtxGetCacheConfig");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pconfig);
-}
-
-CUresult CUDAAPI cuCtxSetCacheConfig(CUfunc_cache config) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUfunc_cache);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuCtxSetCacheConfig");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(config);
-}
-
-CUresult CUDAAPI cuCtxGetSharedMemConfig(CUsharedconfig *pConfig) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUsharedconfig *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuCtxGetSharedMemConfig");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pConfig);
-}
-
-CUresult CUDAAPI cuCtxSetSharedMemConfig(CUsharedconfig config) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUsharedconfig);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuCtxSetSharedMemConfig");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(config);
-}
-
-CUresult CUDAAPI cuCtxGetApiVersion(CUcontext ctx, unsigned int *version) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUcontext, unsigned int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuCtxGetApiVersion");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(ctx, version);
-}
-
-CUresult CUDAAPI cuCtxGetStreamPriorityRange(int *leastPriority,
-                                             int *greatestPriority) {
-  using FuncPtr = CUresult(CUDAAPI *)(int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuCtxGetStreamPriorityRange");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(leastPriority, greatestPriority);
-}
-
-CUresult CUDAAPI cuCtxResetPersistingL2Cache(void) {
-  using FuncPtr = CUresult(CUDAAPI *)();
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuCtxResetPersistingL2Cache");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr();
-}
-
-__CUDA_DEPRECATED CUresult CUDAAPI cuCtxAttach(CUcontext *pctx,
-                                               unsigned int flags) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUcontext *, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuCtxAttach");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pctx, flags);
-}
-
-__CUDA_DEPRECATED CUresult CUDAAPI cuCtxDetach(CUcontext ctx) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUcontext);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuCtxDetach");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(ctx);
-}
-
-CUresult CUDAAPI cuModuleLoad(CUmodule *module, const char *fname) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUmodule *, const char *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuModuleLoad");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(module, fname);
-}
-
-CUresult CUDAAPI cuModuleLoadData(CUmodule *module, const void *image) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUmodule *, const void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuModuleLoadData");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(module, image);
-}
-
-CUresult CUDAAPI cuModuleLoadDataEx(CUmodule *module, const void *image,
-                                    unsigned int numOptions,
-                                    CUjit_option *options,
-                                    void **optionValues) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUmodule *, const void *, unsigned int,
-                                      CUjit_option *, void **);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuModuleLoadDataEx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(module, image, numOptions, options, optionValues);
-}
-
-CUresult CUDAAPI cuModuleLoadFatBinary(CUmodule *module, const void *fatCubin) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUmodule *, const void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuModuleLoadFatBinary");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(module, fatCubin);
-}
-
-CUresult CUDAAPI cuModuleUnload(CUmodule hmod) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUmodule);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuModuleUnload");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hmod);
-}
-
-CUresult CUDAAPI cuModuleGetFunction(CUfunction *hfunc, CUmodule hmod,
-                                     const char *name) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUfunction *, CUmodule, const char *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuModuleGetFunction");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hfunc, hmod, name);
-}
-
-CUresult CUDAAPI cuModuleGetGlobal(CUdeviceptr *dptr, size_t *bytes,
-                                   CUmodule hmod, const char *name) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUdeviceptr *, size_t *, CUmodule, const char *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuModuleGetGlobal_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dptr, bytes, hmod, name);
-}
-
-CUresult CUDAAPI cuModuleGetTexRef(CUtexref *pTexRef, CUmodule hmod,
-                                   const char *name) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUtexref *, CUmodule, const char *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuModuleGetTexRef");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pTexRef, hmod, name);
-}
-
-CUresult CUDAAPI cuModuleGetSurfRef(CUsurfref *pSurfRef, CUmodule hmod,
-                                    const char *name) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUsurfref *, CUmodule, const char *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuModuleGetSurfRef");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pSurfRef, hmod, name);
-}
-
-CUresult CUDAAPI cuLinkCreate(unsigned int numOptions, CUjit_option *options,
-                              void **optionValues, CUlinkState *stateOut) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(unsigned int, CUjit_option *, void **, CUlinkState *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuLinkCreate_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(numOptions, options, optionValues, stateOut);
-}
-
-CUresult CUDAAPI cuLinkAddData(CUlinkState state, CUjitInputType type,
-                               void *data, size_t size, const char *name,
-                               unsigned int numOptions, CUjit_option *options,
-                               void **optionValues) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUlinkState, CUjitInputType, void *, size_t,
-                          const char *, unsigned int, CUjit_option *, void **);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuLinkAddData_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(state, type, data, size, name, numOptions, options,
-                  optionValues);
-}
-
-CUresult CUDAAPI cuLinkAddFile(CUlinkState state, CUjitInputType type,
-                               const char *path, unsigned int numOptions,
-                               CUjit_option *options, void **optionValues) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUlinkState, CUjitInputType, const char *,
-                                      unsigned int, CUjit_option *, void **);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuLinkAddFile_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(state, type, path, numOptions, options, optionValues);
-}
-
-CUresult CUDAAPI cuLinkComplete(CUlinkState state, void **cubinOut,
-                                size_t *sizeOut) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUlinkState, void **, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuLinkComplete");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(state, cubinOut, sizeOut);
-}
-
-CUresult CUDAAPI cuLinkDestroy(CUlinkState state) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUlinkState);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuLinkDestroy");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(state);
-}
-
-CUresult CUDAAPI cuMemGetInfo(size_t *free, size_t *total) {
-  using FuncPtr = CUresult(CUDAAPI *)(size_t *, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemGetInfo_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(free, total);
-}
-
-CUresult CUDAAPI cuMemAlloc(CUdeviceptr *dptr, size_t bytesize) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdeviceptr *, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemAlloc_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dptr, bytesize);
-}
-
-CUresult CUDAAPI cuMemAllocPitch(CUdeviceptr *dptr, size_t *pPitch,
-                                 size_t WidthInBytes, size_t Height,
-                                 unsigned int ElementSizeBytes) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdeviceptr *, size_t *, size_t, size_t,
-                                      unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemAllocPitch_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dptr, pPitch, WidthInBytes, Height, ElementSizeBytes);
-}
-
-CUresult CUDAAPI cuMemFree(CUdeviceptr dptr) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdeviceptr);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemFree_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dptr);
-}
-
-CUresult CUDAAPI cuMemGetAddressRange(CUdeviceptr *pbase, size_t *psize,
-                                      CUdeviceptr dptr) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdeviceptr *, size_t *, CUdeviceptr);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemGetAddressRange_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pbase, psize, dptr);
-}
-
-CUresult CUDAAPI cuMemAllocHost(void **pp, size_t bytesize) {
-  using FuncPtr = CUresult(CUDAAPI *)(void **, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemAllocHost_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pp, bytesize);
-}
-
-CUresult CUDAAPI cuMemFreeHost(void *p) {
-  using FuncPtr = CUresult(CUDAAPI *)(void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemFreeHost");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(p);
-}
-
-CUresult CUDAAPI cuMemHostAlloc(void **pp, size_t bytesize,
-                                unsigned int Flags) {
-  using FuncPtr = CUresult(CUDAAPI *)(void **, size_t, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemHostAlloc");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pp, bytesize, Flags);
-}
-
-CUresult CUDAAPI cuMemHostGetDevicePointer(CUdeviceptr *pdptr, void *p,
-                                           unsigned int Flags) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdeviceptr *, void *, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemHostGetDevicePointer_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pdptr, p, Flags);
-}
-
-CUresult CUDAAPI cuMemHostGetFlags(unsigned int *pFlags, void *p) {
-  using FuncPtr = CUresult(CUDAAPI *)(unsigned int *, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemHostGetFlags");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pFlags, p);
-}
-
-CUresult CUDAAPI cuMemAllocManaged(CUdeviceptr *dptr, size_t bytesize,
-                                   unsigned int flags) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdeviceptr *, size_t, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemAllocManaged");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dptr, bytesize, flags);
-}
-
-CUresult CUDAAPI cuDeviceGetByPCIBusId(CUdevice *dev, const char *pciBusId) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdevice *, const char *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuDeviceGetByPCIBusId");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dev, pciBusId);
-}
-
-CUresult CUDAAPI cuDeviceGetPCIBusId(char *pciBusId, int len, CUdevice dev) {
-  using FuncPtr = CUresult(CUDAAPI *)(char *, int, CUdevice);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuDeviceGetPCIBusId");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pciBusId, len, dev);
-}
-
-CUresult CUDAAPI cuIpcGetEventHandle(CUipcEventHandle *pHandle, CUevent event) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUipcEventHandle *, CUevent);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuIpcGetEventHandle");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pHandle, event);
-}
-
-CUresult CUDAAPI cuIpcOpenEventHandle(CUevent *phEvent,
-                                      CUipcEventHandle handle) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUevent *, CUipcEventHandle);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuIpcOpenEventHandle");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(phEvent, handle);
-}
-
-CUresult CUDAAPI cuIpcGetMemHandle(CUipcMemHandle *pHandle, CUdeviceptr dptr) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUipcMemHandle *, CUdeviceptr);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuIpcGetMemHandle");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pHandle, dptr);
-}
-
-CUresult CUDAAPI cuIpcOpenMemHandle(CUdeviceptr *pdptr, CUipcMemHandle handle,
-                                    unsigned int Flags) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUdeviceptr *, CUipcMemHandle, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuIpcOpenMemHandle_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pdptr, handle, Flags);
-}
-
-CUresult CUDAAPI cuIpcCloseMemHandle(CUdeviceptr dptr) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdeviceptr);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuIpcCloseMemHandle");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dptr);
-}
-
-CUresult CUDAAPI cuMemHostRegister(void *p, size_t bytesize,
-                                   unsigned int Flags) {
-  using FuncPtr = CUresult(CUDAAPI *)(void *, size_t, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemHostRegister_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(p, bytesize, Flags);
-}
-
-CUresult CUDAAPI cuMemHostUnregister(void *p) {
-  using FuncPtr = CUresult(CUDAAPI *)(void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemHostUnregister");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(p);
-}
-
-CUresult CUDAAPI cuMemcpy(CUdeviceptr dst, CUdeviceptr src, size_t ByteCount) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdeviceptr, CUdeviceptr, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemcpy");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dst, src, ByteCount);
-}
-
-CUresult CUDAAPI cuMemcpyPeer(CUdeviceptr dstDevice, CUcontext dstContext,
-                              CUdeviceptr srcDevice, CUcontext srcContext,
-                              size_t ByteCount) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdeviceptr, CUcontext, CUdeviceptr,
-                                      CUcontext, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemcpyPeer");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dstDevice, dstContext, srcDevice, srcContext, ByteCount);
-}
-
-CUresult CUDAAPI cuMemcpyHtoD(CUdeviceptr dstDevice, const void *srcHost,
-                              size_t ByteCount) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdeviceptr, const void *, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemcpyHtoD_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dstDevice, srcHost, ByteCount);
-}
-
-CUresult CUDAAPI cuMemcpyDtoH(void *dstHost, CUdeviceptr srcDevice,
-                              size_t ByteCount) {
-  using FuncPtr = CUresult(CUDAAPI *)(void *, CUdeviceptr, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemcpyDtoH_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dstHost, srcDevice, ByteCount);
-}
-
-CUresult CUDAAPI cuMemcpyDtoD(CUdeviceptr dstDevice, CUdeviceptr srcDevice,
-                              size_t ByteCount) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdeviceptr, CUdeviceptr, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemcpyDtoD_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dstDevice, srcDevice, ByteCount);
-}
-
-CUresult CUDAAPI cuMemcpyDtoA(CUarray dstArray, size_t dstOffset,
-                              CUdeviceptr srcDevice, size_t ByteCount) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUarray, size_t, CUdeviceptr, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemcpyDtoA_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dstArray, dstOffset, srcDevice, ByteCount);
-}
-
-CUresult CUDAAPI cuMemcpyAtoD(CUdeviceptr dstDevice, CUarray srcArray,
-                              size_t srcOffset, size_t ByteCount) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdeviceptr, CUarray, size_t, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemcpyAtoD_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dstDevice, srcArray, srcOffset, ByteCount);
-}
-
-CUresult CUDAAPI cuMemcpyHtoA(CUarray dstArray, size_t dstOffset,
-                              const void *srcHost, size_t ByteCount) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUarray, size_t, const void *, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemcpyHtoA_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dstArray, dstOffset, srcHost, ByteCount);
-}
-
-CUresult CUDAAPI cuMemcpyAtoH(void *dstHost, CUarray srcArray, size_t srcOffset,
-                              size_t ByteCount) {
-  using FuncPtr = CUresult(CUDAAPI *)(void *, CUarray, size_t, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemcpyAtoH_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dstHost, srcArray, srcOffset, ByteCount);
-}
-
-CUresult CUDAAPI cuMemcpyAtoA(CUarray dstArray, size_t dstOffset,
-                              CUarray srcArray, size_t srcOffset,
-                              size_t ByteCount) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUarray, size_t, CUarray, size_t, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemcpyAtoA_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dstArray, dstOffset, srcArray, srcOffset, ByteCount);
-}
-
-CUresult CUDAAPI cuMemcpy2D(const CUDA_MEMCPY2D *pCopy) {
-  using FuncPtr = CUresult(CUDAAPI *)(const CUDA_MEMCPY2D *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemcpy2D_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pCopy);
-}
-
-CUresult CUDAAPI cuMemcpy2DUnaligned(const CUDA_MEMCPY2D *pCopy) {
-  using FuncPtr = CUresult(CUDAAPI *)(const CUDA_MEMCPY2D *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemcpy2DUnaligned_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pCopy);
-}
-
-CUresult CUDAAPI cuMemcpy3D(const CUDA_MEMCPY3D *pCopy) {
-  using FuncPtr = CUresult(CUDAAPI *)(const CUDA_MEMCPY3D *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemcpy3D_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pCopy);
-}
-
-CUresult CUDAAPI cuMemcpy3DPeer(const CUDA_MEMCPY3D_PEER *pCopy) {
-  using FuncPtr = CUresult(CUDAAPI *)(const CUDA_MEMCPY3D_PEER *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemcpy3DPeer");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pCopy);
-}
-
-CUresult CUDAAPI cuMemcpyAsync(CUdeviceptr dst, CUdeviceptr src,
-                               size_t ByteCount, CUstream hStream) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUdeviceptr, CUdeviceptr, size_t, CUstream);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemcpyAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dst, src, ByteCount, hStream);
-}
-
-CUresult CUDAAPI cuMemcpyPeerAsync(CUdeviceptr dstDevice, CUcontext dstContext,
-                                   CUdeviceptr srcDevice, CUcontext srcContext,
-                                   size_t ByteCount, CUstream hStream) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdeviceptr, CUcontext, CUdeviceptr,
-                                      CUcontext, size_t, CUstream);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemcpyPeerAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dstDevice, dstContext, srcDevice, srcContext, ByteCount,
-                  hStream);
-}
-
-CUresult CUDAAPI cuMemcpyHtoDAsync(CUdeviceptr dstDevice, const void *srcHost,
-                                   size_t ByteCount, CUstream hStream) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUdeviceptr, const void *, size_t, CUstream);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemcpyHtoDAsync_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dstDevice, srcHost, ByteCount, hStream);
-}
-
-CUresult CUDAAPI cuMemcpyDtoHAsync(void *dstHost, CUdeviceptr srcDevice,
-                                   size_t ByteCount, CUstream hStream) {
-  using FuncPtr = CUresult(CUDAAPI *)(void *, CUdeviceptr, size_t, CUstream);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemcpyDtoHAsync_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dstHost, srcDevice, ByteCount, hStream);
-}
-
-CUresult CUDAAPI cuMemcpyDtoDAsync(CUdeviceptr dstDevice, CUdeviceptr srcDevice,
-                                   size_t ByteCount, CUstream hStream) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUdeviceptr, CUdeviceptr, size_t, CUstream);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemcpyDtoDAsync_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dstDevice, srcDevice, ByteCount, hStream);
-}
-
-CUresult CUDAAPI cuMemcpyHtoAAsync(CUarray dstArray, size_t dstOffset,
-                                   const void *srcHost, size_t ByteCount,
-                                   CUstream hStream) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUarray, size_t, const void *, size_t, CUstream);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemcpyHtoAAsync_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dstArray, dstOffset, srcHost, ByteCount, hStream);
-}
-
-CUresult CUDAAPI cuMemcpyAtoHAsync(void *dstHost, CUarray srcArray,
-                                   size_t srcOffset, size_t ByteCount,
-                                   CUstream hStream) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(void *, CUarray, size_t, size_t, CUstream);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemcpyAtoHAsync_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dstHost, srcArray, srcOffset, ByteCount, hStream);
-}
-
-CUresult CUDAAPI cuMemcpy2DAsync(const CUDA_MEMCPY2D *pCopy, CUstream hStream) {
-  using FuncPtr = CUresult(CUDAAPI *)(const CUDA_MEMCPY2D *, CUstream);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemcpy2DAsync_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pCopy, hStream);
-}
-
-CUresult CUDAAPI cuMemcpy3DAsync(const CUDA_MEMCPY3D *pCopy, CUstream hStream) {
-  using FuncPtr = CUresult(CUDAAPI *)(const CUDA_MEMCPY3D *, CUstream);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemcpy3DAsync_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pCopy, hStream);
-}
-
-CUresult CUDAAPI cuMemcpy3DPeerAsync(const CUDA_MEMCPY3D_PEER *pCopy,
-                                     CUstream hStream) {
-  using FuncPtr = CUresult(CUDAAPI *)(const CUDA_MEMCPY3D_PEER *, CUstream);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemcpy3DPeerAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pCopy, hStream);
-}
-
-CUresult CUDAAPI cuMemsetD8(CUdeviceptr dstDevice, unsigned char uc, size_t N) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdeviceptr, unsigned char, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemsetD8_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dstDevice, uc, N);
-}
-
-CUresult CUDAAPI cuMemsetD16(CUdeviceptr dstDevice, unsigned short us,
-                             size_t N) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdeviceptr, unsigned short, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemsetD16_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dstDevice, us, N);
-}
-
-CUresult CUDAAPI cuMemsetD32(CUdeviceptr dstDevice, unsigned int ui, size_t N) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdeviceptr, unsigned int, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemsetD32_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dstDevice, ui, N);
-}
-
-CUresult CUDAAPI cuMemsetD2D8(CUdeviceptr dstDevice, size_t dstPitch,
-                              unsigned char uc, size_t Width, size_t Height) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUdeviceptr, size_t, unsigned char, size_t, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemsetD2D8_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dstDevice, dstPitch, uc, Width, Height);
-}
-
-CUresult CUDAAPI cuMemsetD2D16(CUdeviceptr dstDevice, size_t dstPitch,
-                               unsigned short us, size_t Width, size_t Height) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUdeviceptr, size_t, unsigned short, size_t, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemsetD2D16_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dstDevice, dstPitch, us, Width, Height);
-}
-
-CUresult CUDAAPI cuMemsetD2D32(CUdeviceptr dstDevice, size_t dstPitch,
-                               unsigned int ui, size_t Width, size_t Height) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUdeviceptr, size_t, unsigned int, size_t, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemsetD2D32_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dstDevice, dstPitch, ui, Width, Height);
-}
-
-CUresult CUDAAPI cuMemsetD8Async(CUdeviceptr dstDevice, unsigned char uc,
-                                 size_t N, CUstream hStream) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUdeviceptr, unsigned char, size_t, CUstream);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemsetD8Async");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dstDevice, uc, N, hStream);
-}
-
-CUresult CUDAAPI cuMemsetD16Async(CUdeviceptr dstDevice, unsigned short us,
-                                  size_t N, CUstream hStream) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUdeviceptr, unsigned short, size_t, CUstream);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemsetD16Async");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dstDevice, us, N, hStream);
-}
-
-CUresult CUDAAPI cuMemsetD32Async(CUdeviceptr dstDevice, unsigned int ui,
-                                  size_t N, CUstream hStream) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUdeviceptr, unsigned int, size_t, CUstream);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemsetD32Async");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dstDevice, ui, N, hStream);
-}
-
-CUresult CUDAAPI cuMemsetD2D8Async(CUdeviceptr dstDevice, size_t dstPitch,
-                                   unsigned char uc, size_t Width,
-                                   size_t Height, CUstream hStream) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdeviceptr, size_t, unsigned char,
-                                      size_t, size_t, CUstream);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemsetD2D8Async");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dstDevice, dstPitch, uc, Width, Height, hStream);
-}
-
-CUresult CUDAAPI cuMemsetD2D16Async(CUdeviceptr dstDevice, size_t dstPitch,
-                                    unsigned short us, size_t Width,
-                                    size_t Height, CUstream hStream) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdeviceptr, size_t, unsigned short,
-                                      size_t, size_t, CUstream);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemsetD2D16Async");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dstDevice, dstPitch, us, Width, Height, hStream);
-}
-
-CUresult CUDAAPI cuMemsetD2D32Async(CUdeviceptr dstDevice, size_t dstPitch,
-                                    unsigned int ui, size_t Width,
-                                    size_t Height, CUstream hStream) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdeviceptr, size_t, unsigned int, size_t,
-                                      size_t, CUstream);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemsetD2D32Async");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dstDevice, dstPitch, ui, Width, Height, hStream);
-}
-
-CUresult CUDAAPI cuArrayCreate(CUarray *pHandle,
-                               const CUDA_ARRAY_DESCRIPTOR *pAllocateArray) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUarray *, const CUDA_ARRAY_DESCRIPTOR *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuArrayCreate_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pHandle, pAllocateArray);
-}
-
-CUresult CUDAAPI cuArrayGetDescriptor(CUDA_ARRAY_DESCRIPTOR *pArrayDescriptor,
-                                      CUarray hArray) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUDA_ARRAY_DESCRIPTOR *, CUarray);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuArrayGetDescriptor_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pArrayDescriptor, hArray);
-}
-
-#if CUDA_VERSION >= 11100
-
-CUresult CUDAAPI cuArrayGetSparseProperties(
-    CUDA_ARRAY_SPARSE_PROPERTIES *sparseProperties, CUarray array) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUDA_ARRAY_SPARSE_PROPERTIES *, CUarray);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuArrayGetSparseProperties");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(sparseProperties, array);
-}
-
-CUresult CUDAAPI cuMipmappedArrayGetSparseProperties(
-    CUDA_ARRAY_SPARSE_PROPERTIES *sparseProperties, CUmipmappedArray mipmap) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUDA_ARRAY_SPARSE_PROPERTIES *, CUmipmappedArray);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cuMipmappedArrayGetSparseProperties");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(sparseProperties, mipmap);
-}
-
-#endif  // CUDA_VERSION >= 11100
-
-CUresult CUDAAPI cuArrayGetPlane(CUarray *pPlaneArray, CUarray hArray,
-                                 unsigned int planeIdx) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUarray *, CUarray, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuArrayGetPlane");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pPlaneArray, hArray, planeIdx);
-}
-
-CUresult CUDAAPI cuArrayDestroy(CUarray hArray) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUarray);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuArrayDestroy");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hArray);
-}
-
-CUresult CUDAAPI cuArray3DCreate(
-    CUarray *pHandle, const CUDA_ARRAY3D_DESCRIPTOR *pAllocateArray) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUarray *, const CUDA_ARRAY3D_DESCRIPTOR *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuArray3DCreate_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pHandle, pAllocateArray);
-}
-
-CUresult CUDAAPI cuArray3DGetDescriptor(
-    CUDA_ARRAY3D_DESCRIPTOR *pArrayDescriptor, CUarray hArray) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUDA_ARRAY3D_DESCRIPTOR *, CUarray);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuArray3DGetDescriptor_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pArrayDescriptor, hArray);
-}
-
-CUresult CUDAAPI
-cuMipmappedArrayCreate(CUmipmappedArray *pHandle,
-                       const CUDA_ARRAY3D_DESCRIPTOR *pMipmappedArrayDesc,
-                       unsigned int numMipmapLevels) {
-  using FuncPtr = CUresult(CUDAAPI *)(
-      CUmipmappedArray *, const CUDA_ARRAY3D_DESCRIPTOR *, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMipmappedArrayCreate");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pHandle, pMipmappedArrayDesc, numMipmapLevels);
-}
-
-CUresult CUDAAPI cuMipmappedArrayGetLevel(CUarray *pLevelArray,
-                                          CUmipmappedArray hMipmappedArray,
-                                          unsigned int level) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUarray *, CUmipmappedArray, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMipmappedArrayGetLevel");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pLevelArray, hMipmappedArray, level);
-}
-
-CUresult CUDAAPI cuMipmappedArrayDestroy(CUmipmappedArray hMipmappedArray) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUmipmappedArray);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMipmappedArrayDestroy");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hMipmappedArray);
-}
-
-CUresult CUDAAPI cuMemAddressReserve(CUdeviceptr *ptr, size_t size,
-                                     size_t alignment, CUdeviceptr addr,
-                                     unsigned long long flags) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdeviceptr *, size_t, size_t,
-                                      CUdeviceptr, unsigned long long);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemAddressReserve");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(ptr, size, alignment, addr, flags);
-}
-
-CUresult CUDAAPI cuMemAddressFree(CUdeviceptr ptr, size_t size) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdeviceptr, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemAddressFree");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(ptr, size);
-}
-
-CUresult CUDAAPI cuMemCreate(CUmemGenericAllocationHandle *handle, size_t size,
-                             const CUmemAllocationProp *prop,
-                             unsigned long long flags) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUmemGenericAllocationHandle *, size_t,
-                          const CUmemAllocationProp *, unsigned long long);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemCreate");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, size, prop, flags);
-}
-
-CUresult CUDAAPI cuMemRelease(CUmemGenericAllocationHandle handle) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUmemGenericAllocationHandle);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemRelease");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle);
-}
-
-CUresult CUDAAPI cuMemMap(CUdeviceptr ptr, size_t size, size_t offset,
-                          CUmemGenericAllocationHandle handle,
-                          unsigned long long flags) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUdeviceptr, size_t, size_t,
-                          CUmemGenericAllocationHandle, unsigned long long);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemMap");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(ptr, size, offset, handle, flags);
-}
-
-#if CUDA_VERSION >= 11100
-
-CUresult CUDAAPI cuMemMapArrayAsync(CUarrayMapInfo *mapInfoList,
-                                    unsigned int count, CUstream hStream) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUarrayMapInfo *, unsigned int, CUstream);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemMapArrayAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(mapInfoList, count, hStream);
-}
-
-#endif  // CUDA_VERSION >= 11100
-
-CUresult CUDAAPI cuMemUnmap(CUdeviceptr ptr, size_t size) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdeviceptr, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemUnmap");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(ptr, size);
-}
-
-CUresult CUDAAPI cuMemSetAccess(CUdeviceptr ptr, size_t size,
-                                const CUmemAccessDesc *desc, size_t count) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUdeviceptr, size_t, const CUmemAccessDesc *, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemSetAccess");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(ptr, size, desc, count);
-}
-
-CUresult CUDAAPI cuMemGetAccess(unsigned long long *flags,
-                                const CUmemLocation *location,
-                                CUdeviceptr ptr) {
-  using FuncPtr = CUresult(CUDAAPI *)(unsigned long long *,
-                                      const CUmemLocation *, CUdeviceptr);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemGetAccess");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(flags, location, ptr);
-}
-
-CUresult CUDAAPI cuMemExportToShareableHandle(
-    void *shareableHandle, CUmemGenericAllocationHandle handle,
-    CUmemAllocationHandleType handleType, unsigned long long flags) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(void *, CUmemGenericAllocationHandle,
-                          CUmemAllocationHandleType, unsigned long long);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemExportToShareableHandle");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(shareableHandle, handle, handleType, flags);
-}
-
-CUresult CUDAAPI cuMemImportFromShareableHandle(
-    CUmemGenericAllocationHandle *handle, void *osHandle,
-    CUmemAllocationHandleType shHandleType) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUmemGenericAllocationHandle *, void *,
-                                      CUmemAllocationHandleType);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemImportFromShareableHandle");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, osHandle, shHandleType);
-}
-
-CUresult CUDAAPI cuMemGetAllocationGranularity(
-    size_t *granularity, const CUmemAllocationProp *prop,
-    CUmemAllocationGranularity_flags option) {
-  using FuncPtr = CUresult(CUDAAPI *)(size_t *, const CUmemAllocationProp *,
-                                      CUmemAllocationGranularity_flags);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemGetAllocationGranularity");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(granularity, prop, option);
-}
-
-CUresult CUDAAPI cuMemGetAllocationPropertiesFromHandle(
-    CUmemAllocationProp *prop, CUmemGenericAllocationHandle handle) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUmemAllocationProp *, CUmemGenericAllocationHandle);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cuMemGetAllocationPropertiesFromHandle");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(prop, handle);
-}
-
-CUresult CUDAAPI
-cuMemRetainAllocationHandle(CUmemGenericAllocationHandle *handle, void *addr) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUmemGenericAllocationHandle *, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemRetainAllocationHandle");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, addr);
-}
-
-CUresult CUDAAPI cuMemFreeAsync(CUdeviceptr dptr, CUstream hStream) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdeviceptr, CUstream);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemFreeAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dptr, hStream);
-}
-
-CUresult CUDAAPI cuMemAllocAsync(CUdeviceptr *dptr, size_t bytesize,
-                                 CUstream hStream) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdeviceptr *, size_t, CUstream);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemAllocAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dptr, bytesize, hStream);
-}
-
-#if CUDA_VERSION >= 11020
-
-CUresult CUDAAPI cuMemPoolTrimTo(CUmemoryPool pool, size_t minBytesToKeep) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUmemoryPool, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemPoolTrimTo");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pool, minBytesToKeep);
-}
-
-CUresult CUDAAPI cuMemPoolSetAttribute(CUmemoryPool pool,
-                                       CUmemPool_attribute attr, void *value) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUmemoryPool, CUmemPool_attribute, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemPoolSetAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pool, attr, value);
-}
-
-CUresult CUDAAPI cuMemPoolGetAttribute(CUmemoryPool pool,
-                                       CUmemPool_attribute attr, void *value) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUmemoryPool, CUmemPool_attribute, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemPoolGetAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pool, attr, value);
-}
-
-CUresult CUDAAPI cuMemPoolSetAccess(CUmemoryPool pool,
-                                    const CUmemAccessDesc *map, size_t count) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUmemoryPool, const CUmemAccessDesc *, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemPoolSetAccess");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pool, map, count);
-}
-
-CUresult CUDAAPI cuMemPoolGetAccess(CUmemAccess_flags *flags,
-                                    CUmemoryPool memPool,
-                                    CUmemLocation *location) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUmemAccess_flags *, CUmemoryPool, CUmemLocation *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemPoolGetAccess");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(flags, memPool, location);
-}
-
-CUresult CUDAAPI cuMemPoolCreate(CUmemoryPool *pool,
-                                 const CUmemPoolProps *poolProps) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUmemoryPool *, const CUmemPoolProps *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemPoolCreate");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pool, poolProps);
-}
-
-CUresult CUDAAPI cuMemPoolDestroy(CUmemoryPool pool) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUmemoryPool);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemPoolDestroy");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pool);
-}
-
-CUresult CUDAAPI cuMemAllocFromPoolAsync(CUdeviceptr *dptr, size_t bytesize,
-                                         CUmemoryPool pool, CUstream hStream) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUdeviceptr *, size_t, CUmemoryPool, CUstream);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemAllocFromPoolAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dptr, bytesize, pool, hStream);
-}
-
-CUresult CUDAAPI cuMemPoolExportToShareableHandle(
-    void *handle_out, CUmemoryPool pool, CUmemAllocationHandleType handleType,
-    unsigned long long flags) {
-  using FuncPtr = CUresult(CUDAAPI *)(
-      void *, CUmemoryPool, CUmemAllocationHandleType, unsigned long long);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cuMemPoolExportToShareableHandle");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle_out, pool, handleType, flags);
-}
-
-CUresult CUDAAPI cuMemPoolImportFromShareableHandle(
-    CUmemoryPool *pool_out, void *handle, CUmemAllocationHandleType handleType,
-    unsigned long long flags) {
-  using FuncPtr = CUresult(CUDAAPI *)(
-      CUmemoryPool *, void *, CUmemAllocationHandleType, unsigned long long);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cuMemPoolImportFromShareableHandle");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pool_out, handle, handleType, flags);
-}
-
-CUresult CUDAAPI cuMemPoolExportPointer(CUmemPoolPtrExportData *shareData_out,
-                                        CUdeviceptr ptr) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUmemPoolPtrExportData *, CUdeviceptr);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemPoolExportPointer");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(shareData_out, ptr);
-}
-
-CUresult CUDAAPI cuMemPoolImportPointer(CUdeviceptr *ptr_out, CUmemoryPool pool,
-                                        CUmemPoolPtrExportData *shareData) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdeviceptr *, CUmemoryPool,
-                                      CUmemPoolPtrExportData *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemPoolImportPointer");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(ptr_out, pool, shareData);
-}
-
-#endif  // CUDA_VERSION >= 11020
-
-CUresult CUDAAPI cuPointerGetAttribute(void *data,
-                                       CUpointer_attribute attribute,
-                                       CUdeviceptr ptr) {
-  using FuncPtr = CUresult(CUDAAPI *)(void *, CUpointer_attribute, CUdeviceptr);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuPointerGetAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(data, attribute, ptr);
-}
-
-CUresult CUDAAPI cuMemPrefetchAsync(CUdeviceptr devPtr, size_t count,
-                                    CUdevice dstDevice, CUstream hStream) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdeviceptr, size_t, CUdevice, CUstream);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemPrefetchAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(devPtr, count, dstDevice, hStream);
-}
-
-CUresult CUDAAPI cuMemAdvise(CUdeviceptr devPtr, size_t count,
-                             CUmem_advise advice, CUdevice device) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUdeviceptr, size_t, CUmem_advise, CUdevice);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemAdvise");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(devPtr, count, advice, device);
-}
-
-CUresult CUDAAPI cuMemRangeGetAttribute(void *data, size_t dataSize,
-                                        CUmem_range_attribute attribute,
-                                        CUdeviceptr devPtr, size_t count) {
-  using FuncPtr = CUresult(CUDAAPI *)(void *, size_t, CUmem_range_attribute,
-                                      CUdeviceptr, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemRangeGetAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(data, dataSize, attribute, devPtr, count);
-}
-
-CUresult CUDAAPI cuMemRangeGetAttributes(void **data, size_t *dataSizes,
-                                         CUmem_range_attribute *attributes,
-                                         size_t numAttributes,
-                                         CUdeviceptr devPtr, size_t count) {
-  using FuncPtr = CUresult(CUDAAPI *)(
-      void **, size_t *, CUmem_range_attribute *, size_t, CUdeviceptr, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemRangeGetAttributes");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(data, dataSizes, attributes, numAttributes, devPtr, count);
-}
-
-CUresult CUDAAPI cuPointerSetAttribute(const void *value,
-                                       CUpointer_attribute attribute,
-                                       CUdeviceptr ptr) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(const void *, CUpointer_attribute, CUdeviceptr);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuPointerSetAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(value, attribute, ptr);
-}
-
-CUresult CUDAAPI cuPointerGetAttributes(unsigned int numAttributes,
-                                        CUpointer_attribute *attributes,
-                                        void **data, CUdeviceptr ptr) {
-  using FuncPtr = CUresult(CUDAAPI *)(unsigned int, CUpointer_attribute *,
-                                      void **, CUdeviceptr);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuPointerGetAttributes");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(numAttributes, attributes, data, ptr);
-}
-
-CUresult CUDAAPI cuStreamCreate(CUstream *phStream, unsigned int Flags) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUstream *, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuStreamCreate");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(phStream, Flags);
-}
-
-CUresult CUDAAPI cuStreamCreateWithPriority(CUstream *phStream,
-                                            unsigned int flags, int priority) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUstream *, unsigned int, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuStreamCreateWithPriority");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(phStream, flags, priority);
-}
-
-CUresult CUDAAPI cuStreamGetPriority(CUstream hStream, int *priority) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUstream, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuStreamGetPriority");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hStream, priority);
-}
-
-CUresult CUDAAPI cuStreamGetFlags(CUstream hStream, unsigned int *flags) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUstream, unsigned int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuStreamGetFlags");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hStream, flags);
-}
-
-CUresult CUDAAPI cuStreamGetCtx(CUstream hStream, CUcontext *pctx) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUstream, CUcontext *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuStreamGetCtx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hStream, pctx);
-}
-
-CUresult CUDAAPI cuStreamWaitEvent(CUstream hStream, CUevent hEvent,
-                                   unsigned int Flags) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUstream, CUevent, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuStreamWaitEvent");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hStream, hEvent, Flags);
-}
-
-CUresult CUDAAPI cuStreamAddCallback(CUstream hStream,
-                                     CUstreamCallback callback, void *userData,
-                                     unsigned int flags) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUstream, CUstreamCallback, void *, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuStreamAddCallback");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hStream, callback, userData, flags);
-}
-
-CUresult CUDAAPI cuStreamBeginCapture(CUstream hStream,
-                                      CUstreamCaptureMode mode) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUstream, CUstreamCaptureMode);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuStreamBeginCapture_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hStream, mode);
-}
-
-CUresult CUDAAPI cuThreadExchangeStreamCaptureMode(CUstreamCaptureMode *mode) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUstreamCaptureMode *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cuThreadExchangeStreamCaptureMode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(mode);
-}
-
-CUresult CUDAAPI cuStreamEndCapture(CUstream hStream, CUgraph *phGraph) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUstream, CUgraph *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuStreamEndCapture");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hStream, phGraph);
-}
-
-CUresult CUDAAPI cuStreamIsCapturing(CUstream hStream,
-                                     CUstreamCaptureStatus *captureStatus) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUstream, CUstreamCaptureStatus *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuStreamIsCapturing");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hStream, captureStatus);
-}
-
-CUresult CUDAAPI cuStreamGetCaptureInfo(
-    CUstream hStream, CUstreamCaptureStatus *captureStatus_out,
-    cuuint64_t *id_out) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUstream, CUstreamCaptureStatus *, cuuint64_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuStreamGetCaptureInfo");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hStream, captureStatus_out, id_out);
-}
-
-CUresult CUDAAPI cuStreamGetCaptureInfo_v2(
-    CUstream hStream, CUstreamCaptureStatus *captureStatus_out,
-    cuuint64_t *id_out, CUgraph *graph_out,
-    const CUgraphNode **dependencies_out, size_t *numDependencies_out) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUstream, CUstreamCaptureStatus *, cuuint64_t *,
-                          CUgraph *, const CUgraphNode **, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuStreamGetCaptureInfo_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hStream, captureStatus_out, id_out, graph_out,
-                  dependencies_out, numDependencies_out);
-}
-
-CUresult CUDAAPI cuStreamUpdateCaptureDependencies(CUstream hStream,
-                                                   CUgraphNode *dependencies,
-                                                   size_t numDependencies,
-                                                   unsigned int flags) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUstream, CUgraphNode *, size_t, unsigned int);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cuStreamUpdateCaptureDependencies");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hStream, dependencies, numDependencies, flags);
-}
-
-CUresult CUDAAPI cuStreamAttachMemAsync(CUstream hStream, CUdeviceptr dptr,
-                                        size_t length, unsigned int flags) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUstream, CUdeviceptr, size_t, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuStreamAttachMemAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hStream, dptr, length, flags);
-}
-
-CUresult CUDAAPI cuStreamQuery(CUstream hStream) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUstream);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuStreamQuery");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hStream);
-}
-
-CUresult CUDAAPI cuStreamSynchronize(CUstream hStream) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUstream);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuStreamSynchronize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hStream);
-}
-
-CUresult CUDAAPI cuStreamDestroy(CUstream hStream) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUstream);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuStreamDestroy_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hStream);
-}
-
-CUresult CUDAAPI cuStreamCopyAttributes(CUstream dst, CUstream src) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUstream, CUstream);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuStreamCopyAttributes");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dst, src);
-}
-
-CUresult CUDAAPI cuStreamGetAttribute(CUstream hStream, CUstreamAttrID attr,
-                                      CUstreamAttrValue *value_out) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUstream, CUstreamAttrID, CUstreamAttrValue *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuStreamGetAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hStream, attr, value_out);
-}
-
-CUresult CUDAAPI cuStreamSetAttribute(CUstream hStream, CUstreamAttrID attr,
-                                      const CUstreamAttrValue *value) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUstream, CUstreamAttrID, const CUstreamAttrValue *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuStreamSetAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hStream, attr, value);
-}
-
-CUresult CUDAAPI cuEventCreate(CUevent *phEvent, unsigned int Flags) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUevent *, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuEventCreate");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(phEvent, Flags);
-}
-
-CUresult CUDAAPI cuEventRecord(CUevent hEvent, CUstream hStream) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUevent, CUstream);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuEventRecord");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hEvent, hStream);
-}
-
-CUresult CUDAAPI cuEventRecordWithFlags(CUevent hEvent, CUstream hStream,
-                                        unsigned int flags) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUevent, CUstream, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuEventRecordWithFlags");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hEvent, hStream, flags);
-}
-
-CUresult CUDAAPI cuEventQuery(CUevent hEvent) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUevent);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuEventQuery");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hEvent);
-}
-
-CUresult CUDAAPI cuEventSynchronize(CUevent hEvent) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUevent);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuEventSynchronize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hEvent);
-}
-
-CUresult CUDAAPI cuEventDestroy(CUevent hEvent) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUevent);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuEventDestroy_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hEvent);
-}
-
-CUresult CUDAAPI cuEventElapsedTime(float *pMilliseconds, CUevent hStart,
-                                    CUevent hEnd) {
-  using FuncPtr = CUresult(CUDAAPI *)(float *, CUevent, CUevent);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuEventElapsedTime");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pMilliseconds, hStart, hEnd);
-}
-
-CUresult CUDAAPI
-cuImportExternalMemory(CUexternalMemory *extMem_out,
-                       const CUDA_EXTERNAL_MEMORY_HANDLE_DESC *memHandleDesc) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUexternalMemory *,
-                                      const CUDA_EXTERNAL_MEMORY_HANDLE_DESC *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuImportExternalMemory");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(extMem_out, memHandleDesc);
-}
-
-CUresult CUDAAPI cuExternalMemoryGetMappedBuffer(
-    CUdeviceptr *devPtr, CUexternalMemory extMem,
-    const CUDA_EXTERNAL_MEMORY_BUFFER_DESC *bufferDesc) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdeviceptr *, CUexternalMemory,
-                                      const CUDA_EXTERNAL_MEMORY_BUFFER_DESC *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuExternalMemoryGetMappedBuffer");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(devPtr, extMem, bufferDesc);
-}
-
-CUresult CUDAAPI cuExternalMemoryGetMappedMipmappedArray(
-    CUmipmappedArray *mipmap, CUexternalMemory extMem,
-    const CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC *mipmapDesc) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUmipmappedArray *, CUexternalMemory,
-                          const CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cuExternalMemoryGetMappedMipmappedArray");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(mipmap, extMem, mipmapDesc);
-}
-
-CUresult CUDAAPI cuDestroyExternalMemory(CUexternalMemory extMem) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUexternalMemory);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuDestroyExternalMemory");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(extMem);
-}
-
-CUresult CUDAAPI cuImportExternalSemaphore(
-    CUexternalSemaphore *extSem_out,
-    const CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC *semHandleDesc) {
-  using FuncPtr = CUresult(CUDAAPI *)(
-      CUexternalSemaphore *, const CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuImportExternalSemaphore");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(extSem_out, semHandleDesc);
-}
-
-CUresult CUDAAPI cuSignalExternalSemaphoresAsync(
-    const CUexternalSemaphore *extSemArray,
-    const CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS *paramsArray,
-    unsigned int numExtSems, CUstream stream) {
-  using FuncPtr = CUresult(CUDAAPI *)(
-      const CUexternalSemaphore *,
-      const CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS *, unsigned int, CUstream);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuSignalExternalSemaphoresAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(extSemArray, paramsArray, numExtSems, stream);
-}
-
-CUresult CUDAAPI cuWaitExternalSemaphoresAsync(
-    const CUexternalSemaphore *extSemArray,
-    const CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS *paramsArray,
-    unsigned int numExtSems, CUstream stream) {
-  using FuncPtr = CUresult(CUDAAPI *)(
-      const CUexternalSemaphore *, const CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS *,
-      unsigned int, CUstream);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuWaitExternalSemaphoresAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(extSemArray, paramsArray, numExtSems, stream);
-}
-
-CUresult CUDAAPI cuDestroyExternalSemaphore(CUexternalSemaphore extSem) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUexternalSemaphore);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuDestroyExternalSemaphore");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(extSem);
-}
-
-CUresult CUDAAPI cuStreamWaitValue32(CUstream stream, CUdeviceptr addr,
-                                     cuuint32_t value, unsigned int flags) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUstream, CUdeviceptr, cuuint32_t, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuStreamWaitValue32");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(stream, addr, value, flags);
-}
-
-CUresult CUDAAPI cuStreamWaitValue64(CUstream stream, CUdeviceptr addr,
-                                     cuuint64_t value, unsigned int flags) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUstream, CUdeviceptr, cuuint64_t, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuStreamWaitValue64");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(stream, addr, value, flags);
-}
-
-CUresult CUDAAPI cuStreamWriteValue32(CUstream stream, CUdeviceptr addr,
-                                      cuuint32_t value, unsigned int flags) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUstream, CUdeviceptr, cuuint32_t, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuStreamWriteValue32");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(stream, addr, value, flags);
-}
-
-CUresult CUDAAPI cuStreamWriteValue64(CUstream stream, CUdeviceptr addr,
-                                      cuuint64_t value, unsigned int flags) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUstream, CUdeviceptr, cuuint64_t, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuStreamWriteValue64");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(stream, addr, value, flags);
-}
-
-CUresult CUDAAPI cuStreamBatchMemOp(CUstream stream, unsigned int count,
-                                    CUstreamBatchMemOpParams *paramArray,
-                                    unsigned int flags) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUstream, unsigned int,
-                                      CUstreamBatchMemOpParams *, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuStreamBatchMemOp");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(stream, count, paramArray, flags);
-}
-
-CUresult CUDAAPI cuFuncGetAttribute(int *pi, CUfunction_attribute attrib,
-                                    CUfunction hfunc) {
-  using FuncPtr = CUresult(CUDAAPI *)(int *, CUfunction_attribute, CUfunction);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuFuncGetAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pi, attrib, hfunc);
-}
-
-CUresult CUDAAPI cuFuncSetAttribute(CUfunction hfunc,
-                                    CUfunction_attribute attrib, int value) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUfunction, CUfunction_attribute, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuFuncSetAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hfunc, attrib, value);
-}
-
-CUresult CUDAAPI cuFuncSetCacheConfig(CUfunction hfunc, CUfunc_cache config) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUfunction, CUfunc_cache);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuFuncSetCacheConfig");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hfunc, config);
-}
-
-CUresult CUDAAPI cuFuncSetSharedMemConfig(CUfunction hfunc,
-                                          CUsharedconfig config) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUfunction, CUsharedconfig);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuFuncSetSharedMemConfig");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hfunc, config);
-}
-
-CUresult CUDAAPI cuFuncGetModule(CUmodule *hmod, CUfunction hfunc) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUmodule *, CUfunction);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuFuncGetModule");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hmod, hfunc);
-}
-
-CUresult CUDAAPI cuLaunchKernel(CUfunction f, unsigned int gridDimX,
-                                unsigned int gridDimY, unsigned int gridDimZ,
-                                unsigned int blockDimX, unsigned int blockDimY,
-                                unsigned int blockDimZ,
-                                unsigned int sharedMemBytes, CUstream hStream,
-                                void **kernelParams, void **extra) {
-  using FuncPtr = CUresult(CUDAAPI *)(
-      CUfunction, unsigned int, unsigned int, unsigned int, unsigned int,
-      unsigned int, unsigned int, unsigned int, CUstream, void **, void **);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuLaunchKernel");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(f, gridDimX, gridDimY, gridDimZ, blockDimX, blockDimY,
-                  blockDimZ, sharedMemBytes, hStream, kernelParams, extra);
-}
-
-CUresult CUDAAPI cuLaunchCooperativeKernel(
-    CUfunction f, unsigned int gridDimX, unsigned int gridDimY,
-    unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY,
-    unsigned int blockDimZ, unsigned int sharedMemBytes, CUstream hStream,
-    void **kernelParams) {
-  using FuncPtr = CUresult(CUDAAPI *)(
-      CUfunction, unsigned int, unsigned int, unsigned int, unsigned int,
-      unsigned int, unsigned int, unsigned int, CUstream, void **);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuLaunchCooperativeKernel");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(f, gridDimX, gridDimY, gridDimZ, blockDimX, blockDimY,
-                  blockDimZ, sharedMemBytes, hStream, kernelParams);
-}
-
-__CUDA_DEPRECATED CUresult CUDAAPI cuLaunchCooperativeKernelMultiDevice(
-    CUDA_LAUNCH_PARAMS *launchParamsList, unsigned int numDevices,
-    unsigned int flags) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUDA_LAUNCH_PARAMS *, unsigned int, unsigned int);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cuLaunchCooperativeKernelMultiDevice");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(launchParamsList, numDevices, flags);
-}
-
-CUresult CUDAAPI cuLaunchHostFunc(CUstream hStream, CUhostFn fn,
-                                  void *userData) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUstream, CUhostFn, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuLaunchHostFunc");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hStream, fn, userData);
-}
-
-__CUDA_DEPRECATED CUresult CUDAAPI cuFuncSetBlockShape(CUfunction hfunc, int x,
-                                                       int y, int z) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUfunction, int, int, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuFuncSetBlockShape");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hfunc, x, y, z);
-}
-
-__CUDA_DEPRECATED CUresult CUDAAPI cuFuncSetSharedSize(CUfunction hfunc,
-                                                       unsigned int bytes) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUfunction, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuFuncSetSharedSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hfunc, bytes);
-}
-
-__CUDA_DEPRECATED CUresult CUDAAPI cuParamSetSize(CUfunction hfunc,
-                                                  unsigned int numbytes) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUfunction, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuParamSetSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hfunc, numbytes);
-}
-
-__CUDA_DEPRECATED CUresult CUDAAPI cuParamSeti(CUfunction hfunc, int offset,
-                                               unsigned int value) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUfunction, int, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuParamSeti");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hfunc, offset, value);
-}
-
-__CUDA_DEPRECATED CUresult CUDAAPI cuParamSetf(CUfunction hfunc, int offset,
-                                               float value) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUfunction, int, float);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuParamSetf");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hfunc, offset, value);
-}
-
-__CUDA_DEPRECATED CUresult CUDAAPI cuParamSetv(CUfunction hfunc, int offset,
-                                               void *ptr,
-                                               unsigned int numbytes) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUfunction, int, void *, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuParamSetv");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hfunc, offset, ptr, numbytes);
-}
-
-__CUDA_DEPRECATED CUresult CUDAAPI cuLaunch(CUfunction f) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUfunction);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuLaunch");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(f);
-}
-
-__CUDA_DEPRECATED CUresult CUDAAPI cuLaunchGrid(CUfunction f, int grid_width,
-                                                int grid_height) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUfunction, int, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuLaunchGrid");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(f, grid_width, grid_height);
-}
-
-__CUDA_DEPRECATED CUresult CUDAAPI cuLaunchGridAsync(CUfunction f,
-                                                     int grid_width,
-                                                     int grid_height,
-                                                     CUstream hStream) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUfunction, int, int, CUstream);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuLaunchGridAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(f, grid_width, grid_height, hStream);
-}
-
-__CUDA_DEPRECATED CUresult CUDAAPI cuParamSetTexRef(CUfunction hfunc,
-                                                    int texunit,
-                                                    CUtexref hTexRef) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUfunction, int, CUtexref);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuParamSetTexRef");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hfunc, texunit, hTexRef);
-}
-
-CUresult CUDAAPI cuGraphCreate(CUgraph *phGraph, unsigned int flags) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraph *, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphCreate");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(phGraph, flags);
-}
-
-CUresult CUDAAPI cuGraphAddKernelNode(
-    CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies,
-    size_t numDependencies, const CUDA_KERNEL_NODE_PARAMS *nodeParams) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUgraphNode *, CUgraph, const CUgraphNode *, size_t,
-                          const CUDA_KERNEL_NODE_PARAMS *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphAddKernelNode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(phGraphNode, hGraph, dependencies, numDependencies,
-                  nodeParams);
-}
-
-CUresult CUDAAPI cuGraphKernelNodeGetParams(
-    CUgraphNode hNode, CUDA_KERNEL_NODE_PARAMS *nodeParams) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraphNode, CUDA_KERNEL_NODE_PARAMS *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphKernelNodeGetParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hNode, nodeParams);
-}
-
-CUresult CUDAAPI cuGraphKernelNodeSetParams(
-    CUgraphNode hNode, const CUDA_KERNEL_NODE_PARAMS *nodeParams) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUgraphNode, const CUDA_KERNEL_NODE_PARAMS *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphKernelNodeSetParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hNode, nodeParams);
-}
-
-CUresult CUDAAPI cuGraphAddMemcpyNode(CUgraphNode *phGraphNode, CUgraph hGraph,
-                                      const CUgraphNode *dependencies,
-                                      size_t numDependencies,
-                                      const CUDA_MEMCPY3D *copyParams,
-                                      CUcontext ctx) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUgraphNode *, CUgraph, const CUgraphNode *, size_t,
-                          const CUDA_MEMCPY3D *, CUcontext);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphAddMemcpyNode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(phGraphNode, hGraph, dependencies, numDependencies,
-                  copyParams, ctx);
-}
-
-CUresult CUDAAPI cuGraphMemcpyNodeGetParams(CUgraphNode hNode,
-                                            CUDA_MEMCPY3D *nodeParams) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraphNode, CUDA_MEMCPY3D *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphMemcpyNodeGetParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hNode, nodeParams);
-}
-
-CUresult CUDAAPI cuGraphMemcpyNodeSetParams(CUgraphNode hNode,
-                                            const CUDA_MEMCPY3D *nodeParams) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraphNode, const CUDA_MEMCPY3D *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphMemcpyNodeSetParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hNode, nodeParams);
-}
-
-CUresult CUDAAPI cuGraphAddMemsetNode(
-    CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies,
-    size_t numDependencies, const CUDA_MEMSET_NODE_PARAMS *memsetParams,
-    CUcontext ctx) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUgraphNode *, CUgraph, const CUgraphNode *, size_t,
-                          const CUDA_MEMSET_NODE_PARAMS *, CUcontext);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphAddMemsetNode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(phGraphNode, hGraph, dependencies, numDependencies,
-                  memsetParams, ctx);
-}
-
-CUresult CUDAAPI cuGraphMemsetNodeGetParams(
-    CUgraphNode hNode, CUDA_MEMSET_NODE_PARAMS *nodeParams) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraphNode, CUDA_MEMSET_NODE_PARAMS *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphMemsetNodeGetParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hNode, nodeParams);
-}
-
-CUresult CUDAAPI cuGraphMemsetNodeSetParams(
-    CUgraphNode hNode, const CUDA_MEMSET_NODE_PARAMS *nodeParams) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUgraphNode, const CUDA_MEMSET_NODE_PARAMS *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphMemsetNodeSetParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hNode, nodeParams);
-}
-
-CUresult CUDAAPI cuGraphAddHostNode(CUgraphNode *phGraphNode, CUgraph hGraph,
-                                    const CUgraphNode *dependencies,
-                                    size_t numDependencies,
-                                    const CUDA_HOST_NODE_PARAMS *nodeParams) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUgraphNode *, CUgraph, const CUgraphNode *, size_t,
-                          const CUDA_HOST_NODE_PARAMS *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphAddHostNode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(phGraphNode, hGraph, dependencies, numDependencies,
-                  nodeParams);
-}
-
-CUresult CUDAAPI cuGraphHostNodeGetParams(CUgraphNode hNode,
-                                          CUDA_HOST_NODE_PARAMS *nodeParams) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraphNode, CUDA_HOST_NODE_PARAMS *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphHostNodeGetParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hNode, nodeParams);
-}
-
-CUresult CUDAAPI cuGraphHostNodeSetParams(
-    CUgraphNode hNode, const CUDA_HOST_NODE_PARAMS *nodeParams) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUgraphNode, const CUDA_HOST_NODE_PARAMS *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphHostNodeSetParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hNode, nodeParams);
-}
-
-CUresult CUDAAPI cuGraphAddChildGraphNode(CUgraphNode *phGraphNode,
-                                          CUgraph hGraph,
-                                          const CUgraphNode *dependencies,
-                                          size_t numDependencies,
-                                          CUgraph childGraph) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraphNode *, CUgraph,
-                                      const CUgraphNode *, size_t, CUgraph);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphAddChildGraphNode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(phGraphNode, hGraph, dependencies, numDependencies,
-                  childGraph);
-}
-
-CUresult CUDAAPI cuGraphChildGraphNodeGetGraph(CUgraphNode hNode,
-                                               CUgraph *phGraph) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraphNode, CUgraph *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphChildGraphNodeGetGraph");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hNode, phGraph);
-}
-
-CUresult CUDAAPI cuGraphAddEmptyNode(CUgraphNode *phGraphNode, CUgraph hGraph,
-                                     const CUgraphNode *dependencies,
-                                     size_t numDependencies) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUgraphNode *, CUgraph, const CUgraphNode *, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphAddEmptyNode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(phGraphNode, hGraph, dependencies, numDependencies);
-}
-
-CUresult CUDAAPI cuGraphAddEventRecordNode(CUgraphNode *phGraphNode,
-                                           CUgraph hGraph,
-                                           const CUgraphNode *dependencies,
-                                           size_t numDependencies,
-                                           CUevent event) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraphNode *, CUgraph,
-                                      const CUgraphNode *, size_t, CUevent);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphAddEventRecordNode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(phGraphNode, hGraph, dependencies, numDependencies, event);
-}
-
-CUresult CUDAAPI cuGraphEventRecordNodeGetEvent(CUgraphNode hNode,
-                                                CUevent *event_out) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraphNode, CUevent *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphEventRecordNodeGetEvent");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hNode, event_out);
-}
-
-CUresult CUDAAPI cuGraphEventRecordNodeSetEvent(CUgraphNode hNode,
-                                                CUevent event) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraphNode, CUevent);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphEventRecordNodeSetEvent");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hNode, event);
-}
-
-CUresult CUDAAPI cuGraphAddEventWaitNode(CUgraphNode *phGraphNode,
-                                         CUgraph hGraph,
-                                         const CUgraphNode *dependencies,
-                                         size_t numDependencies,
-                                         CUevent event) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraphNode *, CUgraph,
-                                      const CUgraphNode *, size_t, CUevent);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphAddEventWaitNode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(phGraphNode, hGraph, dependencies, numDependencies, event);
-}
-
-CUresult CUDAAPI cuGraphEventWaitNodeGetEvent(CUgraphNode hNode,
-                                              CUevent *event_out) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraphNode, CUevent *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphEventWaitNodeGetEvent");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hNode, event_out);
-}
-
-CUresult CUDAAPI cuGraphEventWaitNodeSetEvent(CUgraphNode hNode,
-                                              CUevent event) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraphNode, CUevent);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphEventWaitNodeSetEvent");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hNode, event);
-}
-
-#if CUDA_VERSION >= 11020
-
-CUresult CUDAAPI cuGraphAddExternalSemaphoresSignalNode(
-    CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies,
-    size_t numDependencies, const CUDA_EXT_SEM_SIGNAL_NODE_PARAMS *nodeParams) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUgraphNode *, CUgraph, const CUgraphNode *, size_t,
-                          const CUDA_EXT_SEM_SIGNAL_NODE_PARAMS *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cuGraphAddExternalSemaphoresSignalNode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(phGraphNode, hGraph, dependencies, numDependencies,
-                  nodeParams);
-}
-
-CUresult CUDAAPI cuGraphExternalSemaphoresSignalNodeGetParams(
-    CUgraphNode hNode, CUDA_EXT_SEM_SIGNAL_NODE_PARAMS *params_out) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUgraphNode, CUDA_EXT_SEM_SIGNAL_NODE_PARAMS *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cuGraphExternalSemaphoresSignalNodeGetParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hNode, params_out);
-}
-
-CUresult CUDAAPI cuGraphExternalSemaphoresSignalNodeSetParams(
-    CUgraphNode hNode, const CUDA_EXT_SEM_SIGNAL_NODE_PARAMS *nodeParams) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUgraphNode, const CUDA_EXT_SEM_SIGNAL_NODE_PARAMS *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cuGraphExternalSemaphoresSignalNodeSetParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hNode, nodeParams);
-}
-
-CUresult CUDAAPI cuGraphAddExternalSemaphoresWaitNode(
-    CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies,
-    size_t numDependencies, const CUDA_EXT_SEM_WAIT_NODE_PARAMS *nodeParams) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUgraphNode *, CUgraph, const CUgraphNode *, size_t,
-                          const CUDA_EXT_SEM_WAIT_NODE_PARAMS *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cuGraphAddExternalSemaphoresWaitNode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(phGraphNode, hGraph, dependencies, numDependencies,
-                  nodeParams);
-}
-
-CUresult CUDAAPI cuGraphExternalSemaphoresWaitNodeGetParams(
-    CUgraphNode hNode, CUDA_EXT_SEM_WAIT_NODE_PARAMS *params_out) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUgraphNode, CUDA_EXT_SEM_WAIT_NODE_PARAMS *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cuGraphExternalSemaphoresWaitNodeGetParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hNode, params_out);
-}
-
-CUresult CUDAAPI cuGraphExternalSemaphoresWaitNodeSetParams(
-    CUgraphNode hNode, const CUDA_EXT_SEM_WAIT_NODE_PARAMS *nodeParams) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUgraphNode, const CUDA_EXT_SEM_WAIT_NODE_PARAMS *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cuGraphExternalSemaphoresWaitNodeSetParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hNode, nodeParams);
-}
-
-#endif  // CUDA_VERSION >= 11020
-
-CUresult CUDAAPI cuGraphClone(CUgraph *phGraphClone, CUgraph originalGraph) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraph *, CUgraph);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphClone");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(phGraphClone, originalGraph);
-}
-
-CUresult CUDAAPI cuGraphNodeFindInClone(CUgraphNode *phNode,
-                                        CUgraphNode hOriginalNode,
-                                        CUgraph hClonedGraph) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraphNode *, CUgraphNode, CUgraph);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphNodeFindInClone");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(phNode, hOriginalNode, hClonedGraph);
-}
-
-CUresult CUDAAPI cuGraphNodeGetType(CUgraphNode hNode, CUgraphNodeType *type) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraphNode, CUgraphNodeType *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphNodeGetType");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hNode, type);
-}
-
-CUresult CUDAAPI cuGraphGetNodes(CUgraph hGraph, CUgraphNode *nodes,
-                                 size_t *numNodes) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraph, CUgraphNode *, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphGetNodes");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hGraph, nodes, numNodes);
-}
-
-CUresult CUDAAPI cuGraphGetRootNodes(CUgraph hGraph, CUgraphNode *rootNodes,
-                                     size_t *numRootNodes) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraph, CUgraphNode *, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphGetRootNodes");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hGraph, rootNodes, numRootNodes);
-}
-
-CUresult CUDAAPI cuGraphGetEdges(CUgraph hGraph, CUgraphNode *from,
-                                 CUgraphNode *to, size_t *numEdges) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUgraph, CUgraphNode *, CUgraphNode *, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphGetEdges");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hGraph, from, to, numEdges);
-}
-
-CUresult CUDAAPI cuGraphNodeGetDependencies(CUgraphNode hNode,
-                                            CUgraphNode *dependencies,
-                                            size_t *numDependencies) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraphNode, CUgraphNode *, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphNodeGetDependencies");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hNode, dependencies, numDependencies);
-}
-
-CUresult CUDAAPI cuGraphNodeGetDependentNodes(CUgraphNode hNode,
-                                              CUgraphNode *dependentNodes,
-                                              size_t *numDependentNodes) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraphNode, CUgraphNode *, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphNodeGetDependentNodes");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hNode, dependentNodes, numDependentNodes);
-}
-
-CUresult CUDAAPI cuGraphAddDependencies(CUgraph hGraph, const CUgraphNode *from,
-                                        const CUgraphNode *to,
-                                        size_t numDependencies) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraph, const CUgraphNode *,
-                                      const CUgraphNode *, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphAddDependencies");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hGraph, from, to, numDependencies);
-}
-
-CUresult CUDAAPI cuGraphRemoveDependencies(CUgraph hGraph,
-                                           const CUgraphNode *from,
-                                           const CUgraphNode *to,
-                                           size_t numDependencies) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraph, const CUgraphNode *,
-                                      const CUgraphNode *, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphRemoveDependencies");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hGraph, from, to, numDependencies);
-}
-
-CUresult CUDAAPI cuGraphDestroyNode(CUgraphNode hNode) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraphNode);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphDestroyNode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hNode);
-}
-
-CUresult CUDAAPI cuGraphInstantiate(CUgraphExec *phGraphExec, CUgraph hGraph,
-                                    CUgraphNode *phErrorNode, char *logBuffer,
-                                    size_t bufferSize) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraphExec *, CUgraph, CUgraphNode *,
-                                      char *, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphInstantiate_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(phGraphExec, hGraph, phErrorNode, logBuffer, bufferSize);
-}
-
-CUresult CUDAAPI
-cuGraphExecKernelNodeSetParams(CUgraphExec hGraphExec, CUgraphNode hNode,
-                               const CUDA_KERNEL_NODE_PARAMS *nodeParams) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraphExec, CUgraphNode,
-                                      const CUDA_KERNEL_NODE_PARAMS *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphExecKernelNodeSetParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hGraphExec, hNode, nodeParams);
-}
-
-CUresult CUDAAPI cuGraphExecMemcpyNodeSetParams(CUgraphExec hGraphExec,
-                                                CUgraphNode hNode,
-                                                const CUDA_MEMCPY3D *copyParams,
-                                                CUcontext ctx) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraphExec, CUgraphNode,
-                                      const CUDA_MEMCPY3D *, CUcontext);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphExecMemcpyNodeSetParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hGraphExec, hNode, copyParams, ctx);
-}
-
-CUresult CUDAAPI cuGraphExecMemsetNodeSetParams(
-    CUgraphExec hGraphExec, CUgraphNode hNode,
-    const CUDA_MEMSET_NODE_PARAMS *memsetParams, CUcontext ctx) {
-  using FuncPtr = CUresult(CUDAAPI *)(
-      CUgraphExec, CUgraphNode, const CUDA_MEMSET_NODE_PARAMS *, CUcontext);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphExecMemsetNodeSetParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hGraphExec, hNode, memsetParams, ctx);
-}
-
-CUresult CUDAAPI
-cuGraphExecHostNodeSetParams(CUgraphExec hGraphExec, CUgraphNode hNode,
-                             const CUDA_HOST_NODE_PARAMS *nodeParams) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraphExec, CUgraphNode,
-                                      const CUDA_HOST_NODE_PARAMS *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphExecHostNodeSetParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hGraphExec, hNode, nodeParams);
-}
-
-CUresult CUDAAPI cuGraphExecChildGraphNodeSetParams(CUgraphExec hGraphExec,
-                                                    CUgraphNode hNode,
-                                                    CUgraph childGraph) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraphExec, CUgraphNode, CUgraph);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cuGraphExecChildGraphNodeSetParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hGraphExec, hNode, childGraph);
-}
-
-CUresult CUDAAPI cuGraphExecEventRecordNodeSetEvent(CUgraphExec hGraphExec,
-                                                    CUgraphNode hNode,
-                                                    CUevent event) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraphExec, CUgraphNode, CUevent);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cuGraphExecEventRecordNodeSetEvent");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hGraphExec, hNode, event);
-}
-
-CUresult CUDAAPI cuGraphExecEventWaitNodeSetEvent(CUgraphExec hGraphExec,
-                                                  CUgraphNode hNode,
-                                                  CUevent event) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraphExec, CUgraphNode, CUevent);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cuGraphExecEventWaitNodeSetEvent");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hGraphExec, hNode, event);
-}
-
-#if CUDA_VERSION >= 11020
-
-CUresult CUDAAPI cuGraphExecExternalSemaphoresSignalNodeSetParams(
-    CUgraphExec hGraphExec, CUgraphNode hNode,
-    const CUDA_EXT_SEM_SIGNAL_NODE_PARAMS *nodeParams) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraphExec, CUgraphNode,
-                                      const CUDA_EXT_SEM_SIGNAL_NODE_PARAMS *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cuGraphExecExternalSemaphoresSignalNodeSetParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hGraphExec, hNode, nodeParams);
-}
-
-CUresult CUDAAPI cuGraphExecExternalSemaphoresWaitNodeSetParams(
-    CUgraphExec hGraphExec, CUgraphNode hNode,
-    const CUDA_EXT_SEM_WAIT_NODE_PARAMS *nodeParams) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraphExec, CUgraphNode,
-                                      const CUDA_EXT_SEM_WAIT_NODE_PARAMS *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cuGraphExecExternalSemaphoresWaitNodeSetParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hGraphExec, hNode, nodeParams);
-}
-
-#endif  // CUDA_VERSION >= 11020
-
-CUresult CUDAAPI cuGraphUpload(CUgraphExec hGraphExec, CUstream hStream) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraphExec, CUstream);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphUpload");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hGraphExec, hStream);
-}
-
-CUresult CUDAAPI cuGraphLaunch(CUgraphExec hGraphExec, CUstream hStream) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraphExec, CUstream);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphLaunch");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hGraphExec, hStream);
-}
-
-CUresult CUDAAPI cuGraphExecDestroy(CUgraphExec hGraphExec) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraphExec);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphExecDestroy");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hGraphExec);
-}
-
-CUresult CUDAAPI cuGraphDestroy(CUgraph hGraph) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraph);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphDestroy");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hGraph);
-}
-
-CUresult CUDAAPI cuGraphExecUpdate(CUgraphExec hGraphExec, CUgraph hGraph,
-                                   CUgraphNode *hErrorNode_out,
-                                   CUgraphExecUpdateResult *updateResult_out) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraphExec, CUgraph, CUgraphNode *,
-                                      CUgraphExecUpdateResult *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphExecUpdate");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hGraphExec, hGraph, hErrorNode_out, updateResult_out);
-}
-
-CUresult CUDAAPI cuGraphKernelNodeCopyAttributes(CUgraphNode dst,
-                                                 CUgraphNode src) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraphNode, CUgraphNode);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphKernelNodeCopyAttributes");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dst, src);
-}
-
-CUresult CUDAAPI
-cuGraphKernelNodeGetAttribute(CUgraphNode hNode, CUkernelNodeAttrID attr,
-                              CUkernelNodeAttrValue *value_out) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraphNode, CUkernelNodeAttrID,
-                                      CUkernelNodeAttrValue *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphKernelNodeGetAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hNode, attr, value_out);
-}
-
-CUresult CUDAAPI
-cuGraphKernelNodeSetAttribute(CUgraphNode hNode, CUkernelNodeAttrID attr,
-                              const CUkernelNodeAttrValue *value) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraphNode, CUkernelNodeAttrID,
-                                      const CUkernelNodeAttrValue *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphKernelNodeSetAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hNode, attr, value);
-}
-
-CUresult CUDAAPI cuGraphDebugDotPrint(CUgraph hGraph, const char *path,
-                                      unsigned int flags) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraph, const char *, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphDebugDotPrint");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hGraph, path, flags);
-}
-
-#if CUDA_VERSION >= 11030
-
-CUresult CUDAAPI cuUserObjectCreate(CUuserObject *object_out, void *ptr,
-                                    CUhostFn destroy,
-                                    unsigned int initialRefcount,
-                                    unsigned int flags) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUuserObject *, void *, CUhostFn,
-                                      unsigned int, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuUserObjectCreate");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(object_out, ptr, destroy, initialRefcount, flags);
-}
-
-CUresult CUDAAPI cuUserObjectRetain(CUuserObject object, unsigned int count) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUuserObject, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuUserObjectRetain");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(object, count);
-}
-
-CUresult CUDAAPI cuUserObjectRelease(CUuserObject object, unsigned int count) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUuserObject, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuUserObjectRelease");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(object, count);
-}
-
-CUresult CUDAAPI cuGraphRetainUserObject(CUgraph graph, CUuserObject object,
-                                         unsigned int count,
-                                         unsigned int flags) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUgraph, CUuserObject, unsigned int, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphRetainUserObject");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(graph, object, count, flags);
-}
-
-CUresult CUDAAPI cuGraphReleaseUserObject(CUgraph graph, CUuserObject object,
-                                          unsigned int count) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraph, CUuserObject, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphReleaseUserObject");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(graph, object, count);
-}
-
-#endif  // CUDA_VERSION >= 11030
-
-CUresult CUDAAPI cuOccupancyMaxActiveBlocksPerMultiprocessor(
-    int *numBlocks, CUfunction func, int blockSize, size_t dynamicSMemSize) {
-  using FuncPtr = CUresult(CUDAAPI *)(int *, CUfunction, int, size_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cuOccupancyMaxActiveBlocksPerMultiprocessor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(numBlocks, func, blockSize, dynamicSMemSize);
-}
-
-CUresult CUDAAPI cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(
-    int *numBlocks, CUfunction func, int blockSize, size_t dynamicSMemSize,
-    unsigned int flags) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(int *, CUfunction, int, size_t, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>(
-      "cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(numBlocks, func, blockSize, dynamicSMemSize, flags);
-}
-
-CUresult CUDAAPI cuOccupancyMaxPotentialBlockSize(
-    int *minGridSize, int *blockSize, CUfunction func,
-    CUoccupancyB2DSize blockSizeToDynamicSMemSize, size_t dynamicSMemSize,
-    int blockSizeLimit) {
-  using FuncPtr = CUresult(CUDAAPI *)(int *, int *, CUfunction,
-                                      CUoccupancyB2DSize, size_t, int);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cuOccupancyMaxPotentialBlockSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(minGridSize, blockSize, func, blockSizeToDynamicSMemSize,
-                  dynamicSMemSize, blockSizeLimit);
-}
-
-CUresult CUDAAPI cuOccupancyMaxPotentialBlockSizeWithFlags(
-    int *minGridSize, int *blockSize, CUfunction func,
-    CUoccupancyB2DSize blockSizeToDynamicSMemSize, size_t dynamicSMemSize,
-    int blockSizeLimit, unsigned int flags) {
-  using FuncPtr = CUresult(CUDAAPI *)(
-      int *, int *, CUfunction, CUoccupancyB2DSize, size_t, int, unsigned int);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cuOccupancyMaxPotentialBlockSizeWithFlags");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(minGridSize, blockSize, func, blockSizeToDynamicSMemSize,
-                  dynamicSMemSize, blockSizeLimit, flags);
-}
-
-CUresult CUDAAPI cuOccupancyAvailableDynamicSMemPerBlock(
-    size_t *dynamicSmemSize, CUfunction func, int numBlocks, int blockSize) {
-  using FuncPtr = CUresult(CUDAAPI *)(size_t *, CUfunction, int, int);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cuOccupancyAvailableDynamicSMemPerBlock");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dynamicSmemSize, func, numBlocks, blockSize);
-}
-
-__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetArray(CUtexref hTexRef,
-                                                    CUarray hArray,
-                                                    unsigned int Flags) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUtexref, CUarray, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexRefSetArray");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hTexRef, hArray, Flags);
-}
-
-__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetMipmappedArray(
-    CUtexref hTexRef, CUmipmappedArray hMipmappedArray, unsigned int Flags) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUtexref, CUmipmappedArray, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexRefSetMipmappedArray");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hTexRef, hMipmappedArray, Flags);
-}
-
-__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetAddress(size_t *ByteOffset,
-                                                      CUtexref hTexRef,
-                                                      CUdeviceptr dptr,
-                                                      size_t bytes) {
-  using FuncPtr = CUresult(CUDAAPI *)(size_t *, CUtexref, CUdeviceptr, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexRefSetAddress_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(ByteOffset, hTexRef, dptr, bytes);
-}
-
-__CUDA_DEPRECATED CUresult CUDAAPI
-cuTexRefSetAddress2D(CUtexref hTexRef, const CUDA_ARRAY_DESCRIPTOR *desc,
-                     CUdeviceptr dptr, size_t Pitch) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUtexref, const CUDA_ARRAY_DESCRIPTOR *,
-                                      CUdeviceptr, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexRefSetAddress2D_v3");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hTexRef, desc, dptr, Pitch);
-}
-
-__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetFormat(CUtexref hTexRef,
-                                                     CUarray_format fmt,
-                                                     int NumPackedComponents) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUtexref, CUarray_format, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexRefSetFormat");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hTexRef, fmt, NumPackedComponents);
-}
-
-__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetAddressMode(CUtexref hTexRef,
-                                                          int dim,
-                                                          CUaddress_mode am) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUtexref, int, CUaddress_mode);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexRefSetAddressMode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hTexRef, dim, am);
-}
-
-__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetFilterMode(CUtexref hTexRef,
-                                                         CUfilter_mode fm) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUtexref, CUfilter_mode);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexRefSetFilterMode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hTexRef, fm);
-}
-
-__CUDA_DEPRECATED CUresult CUDAAPI
-cuTexRefSetMipmapFilterMode(CUtexref hTexRef, CUfilter_mode fm) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUtexref, CUfilter_mode);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexRefSetMipmapFilterMode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hTexRef, fm);
-}
-
-__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetMipmapLevelBias(CUtexref hTexRef,
-                                                              float bias) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUtexref, float);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexRefSetMipmapLevelBias");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hTexRef, bias);
-}
-
-__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetMipmapLevelClamp(
-    CUtexref hTexRef, float minMipmapLevelClamp, float maxMipmapLevelClamp) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUtexref, float, float);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexRefSetMipmapLevelClamp");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hTexRef, minMipmapLevelClamp, maxMipmapLevelClamp);
-}
-
-__CUDA_DEPRECATED CUresult CUDAAPI
-cuTexRefSetMaxAnisotropy(CUtexref hTexRef, unsigned int maxAniso) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUtexref, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexRefSetMaxAnisotropy");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hTexRef, maxAniso);
-}
-
-__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetBorderColor(CUtexref hTexRef,
-                                                          float *pBorderColor) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUtexref, float *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexRefSetBorderColor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hTexRef, pBorderColor);
-}
-
-__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetFlags(CUtexref hTexRef,
-                                                    unsigned int Flags) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUtexref, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexRefSetFlags");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hTexRef, Flags);
-}
-
-__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetAddress(CUdeviceptr *pdptr,
-                                                      CUtexref hTexRef) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdeviceptr *, CUtexref);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexRefGetAddress_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pdptr, hTexRef);
-}
-
-__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetArray(CUarray *phArray,
-                                                    CUtexref hTexRef) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUarray *, CUtexref);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexRefGetArray");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(phArray, hTexRef);
-}
-
-__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetMipmappedArray(
-    CUmipmappedArray *phMipmappedArray, CUtexref hTexRef) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUmipmappedArray *, CUtexref);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexRefGetMipmappedArray");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(phMipmappedArray, hTexRef);
-}
-
-__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetAddressMode(CUaddress_mode *pam,
-                                                          CUtexref hTexRef,
-                                                          int dim) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUaddress_mode *, CUtexref, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexRefGetAddressMode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pam, hTexRef, dim);
-}
-
-__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetFilterMode(CUfilter_mode *pfm,
-                                                         CUtexref hTexRef) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUfilter_mode *, CUtexref);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexRefGetFilterMode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pfm, hTexRef);
-}
-
-__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetFormat(CUarray_format *pFormat,
-                                                     int *pNumChannels,
-                                                     CUtexref hTexRef) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUarray_format *, int *, CUtexref);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexRefGetFormat");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pFormat, pNumChannels, hTexRef);
-}
-
-__CUDA_DEPRECATED CUresult CUDAAPI
-cuTexRefGetMipmapFilterMode(CUfilter_mode *pfm, CUtexref hTexRef) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUfilter_mode *, CUtexref);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexRefGetMipmapFilterMode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pfm, hTexRef);
-}
-
-__CUDA_DEPRECATED CUresult CUDAAPI
-cuTexRefGetMipmapLevelBias(float *pbias, CUtexref hTexRef) {
-  using FuncPtr = CUresult(CUDAAPI *)(float *, CUtexref);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexRefGetMipmapLevelBias");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pbias, hTexRef);
-}
-
-__CUDA_DEPRECATED CUresult CUDAAPI
-cuTexRefGetMipmapLevelClamp(float *pminMipmapLevelClamp,
-                            float *pmaxMipmapLevelClamp, CUtexref hTexRef) {
-  using FuncPtr = CUresult(CUDAAPI *)(float *, float *, CUtexref);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexRefGetMipmapLevelClamp");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pminMipmapLevelClamp, pmaxMipmapLevelClamp, hTexRef);
-}
-
-__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetMaxAnisotropy(int *pmaxAniso,
-                                                            CUtexref hTexRef) {
-  using FuncPtr = CUresult(CUDAAPI *)(int *, CUtexref);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexRefGetMaxAnisotropy");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pmaxAniso, hTexRef);
-}
-
-__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetBorderColor(float *pBorderColor,
-                                                          CUtexref hTexRef) {
-  using FuncPtr = CUresult(CUDAAPI *)(float *, CUtexref);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexRefGetBorderColor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pBorderColor, hTexRef);
-}
-
-__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetFlags(unsigned int *pFlags,
-                                                    CUtexref hTexRef) {
-  using FuncPtr = CUresult(CUDAAPI *)(unsigned int *, CUtexref);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexRefGetFlags");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pFlags, hTexRef);
-}
-
-__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefCreate(CUtexref *pTexRef) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUtexref *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexRefCreate");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pTexRef);
-}
-
-__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefDestroy(CUtexref hTexRef) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUtexref);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexRefDestroy");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hTexRef);
-}
-
-__CUDA_DEPRECATED CUresult CUDAAPI cuSurfRefSetArray(CUsurfref hSurfRef,
-                                                     CUarray hArray,
-                                                     unsigned int Flags) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUsurfref, CUarray, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuSurfRefSetArray");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hSurfRef, hArray, Flags);
-}
-
-__CUDA_DEPRECATED CUresult CUDAAPI cuSurfRefGetArray(CUarray *phArray,
-                                                     CUsurfref hSurfRef) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUarray *, CUsurfref);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuSurfRefGetArray");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(phArray, hSurfRef);
-}
-
-CUresult CUDAAPI
-cuTexObjectCreate(CUtexObject *pTexObject, const CUDA_RESOURCE_DESC *pResDesc,
-                  const CUDA_TEXTURE_DESC *pTexDesc,
-                  const CUDA_RESOURCE_VIEW_DESC *pResViewDesc) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUtexObject *, const CUDA_RESOURCE_DESC *,
-                                      const CUDA_TEXTURE_DESC *,
-                                      const CUDA_RESOURCE_VIEW_DESC *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexObjectCreate");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pTexObject, pResDesc, pTexDesc, pResViewDesc);
-}
-
-CUresult CUDAAPI cuTexObjectDestroy(CUtexObject texObject) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUtexObject);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexObjectDestroy");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(texObject);
-}
-
-CUresult CUDAAPI cuTexObjectGetResourceDesc(CUDA_RESOURCE_DESC *pResDesc,
-                                            CUtexObject texObject) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUDA_RESOURCE_DESC *, CUtexObject);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexObjectGetResourceDesc");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pResDesc, texObject);
-}
-
-CUresult CUDAAPI cuTexObjectGetTextureDesc(CUDA_TEXTURE_DESC *pTexDesc,
-                                           CUtexObject texObject) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUDA_TEXTURE_DESC *, CUtexObject);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexObjectGetTextureDesc");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pTexDesc, texObject);
-}
-
-CUresult CUDAAPI cuTexObjectGetResourceViewDesc(
-    CUDA_RESOURCE_VIEW_DESC *pResViewDesc, CUtexObject texObject) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUDA_RESOURCE_VIEW_DESC *, CUtexObject);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexObjectGetResourceViewDesc");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pResViewDesc, texObject);
-}
-
-CUresult CUDAAPI cuSurfObjectCreate(CUsurfObject *pSurfObject,
-                                    const CUDA_RESOURCE_DESC *pResDesc) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUsurfObject *, const CUDA_RESOURCE_DESC *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuSurfObjectCreate");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pSurfObject, pResDesc);
-}
-
-CUresult CUDAAPI cuSurfObjectDestroy(CUsurfObject surfObject) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUsurfObject);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuSurfObjectDestroy");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(surfObject);
-}
-
-CUresult CUDAAPI cuSurfObjectGetResourceDesc(CUDA_RESOURCE_DESC *pResDesc,
-                                             CUsurfObject surfObject) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUDA_RESOURCE_DESC *, CUsurfObject);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuSurfObjectGetResourceDesc");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pResDesc, surfObject);
-}
-
-CUresult CUDAAPI cuDeviceCanAccessPeer(int *canAccessPeer, CUdevice dev,
-                                       CUdevice peerDev) {
-  using FuncPtr = CUresult(CUDAAPI *)(int *, CUdevice, CUdevice);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuDeviceCanAccessPeer");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(canAccessPeer, dev, peerDev);
-}
-
-CUresult CUDAAPI cuCtxEnablePeerAccess(CUcontext peerContext,
-                                       unsigned int Flags) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUcontext, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuCtxEnablePeerAccess");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(peerContext, Flags);
-}
-
-CUresult CUDAAPI cuCtxDisablePeerAccess(CUcontext peerContext) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUcontext);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuCtxDisablePeerAccess");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(peerContext);
-}
-
-CUresult CUDAAPI cuDeviceGetP2PAttribute(int *value,
-                                         CUdevice_P2PAttribute attrib,
-                                         CUdevice srcDevice,
-                                         CUdevice dstDevice) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(int *, CUdevice_P2PAttribute, CUdevice, CUdevice);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuDeviceGetP2PAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(value, attrib, srcDevice, dstDevice);
-}
-
-CUresult CUDAAPI cuGraphicsUnregisterResource(CUgraphicsResource resource) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraphicsResource);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphicsUnregisterResource");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(resource);
-}
-
-CUresult CUDAAPI cuGraphicsSubResourceGetMappedArray(
-    CUarray *pArray, CUgraphicsResource resource, unsigned int arrayIndex,
-    unsigned int mipLevel) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUarray *, CUgraphicsResource,
-                                      unsigned int, unsigned int);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cuGraphicsSubResourceGetMappedArray");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pArray, resource, arrayIndex, mipLevel);
-}
-
-CUresult CUDAAPI cuGraphicsResourceGetMappedMipmappedArray(
-    CUmipmappedArray *pMipmappedArray, CUgraphicsResource resource) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUmipmappedArray *, CUgraphicsResource);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cuGraphicsResourceGetMappedMipmappedArray");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pMipmappedArray, resource);
-}
-
-CUresult CUDAAPI cuGraphicsResourceGetMappedPointer(
-    CUdeviceptr *pDevPtr, size_t *pSize, CUgraphicsResource resource) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUdeviceptr *, size_t *, CUgraphicsResource);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cuGraphicsResourceGetMappedPointer_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pDevPtr, pSize, resource);
-}
-
-CUresult CUDAAPI cuGraphicsResourceSetMapFlags(CUgraphicsResource resource,
-                                               unsigned int flags) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraphicsResource, unsigned int);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cuGraphicsResourceSetMapFlags_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(resource, flags);
-}
-
-CUresult CUDAAPI cuGraphicsMapResources(unsigned int count,
-                                        CUgraphicsResource *resources,
-                                        CUstream hStream) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(unsigned int, CUgraphicsResource *, CUstream);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphicsMapResources");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(count, resources, hStream);
-}
-
-CUresult CUDAAPI cuGraphicsUnmapResources(unsigned int count,
-                                          CUgraphicsResource *resources,
-                                          CUstream hStream) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(unsigned int, CUgraphicsResource *, CUstream);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphicsUnmapResources");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(count, resources, hStream);
-}
-
-CUresult CUDAAPI cuGetProcAddress(const char *symbol, void **pfn,
-                                  int cudaVersion, cuuint64_t flags) {
-  using FuncPtr = CUresult(CUDAAPI *)(const char *, void **, int, cuuint64_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGetProcAddress");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(symbol, pfn, cudaVersion, flags);
-}
-
-CUresult CUDAAPI cuGetExportTable(const void **ppExportTable,
-                                  const CUuuid *pExportTableId) {
-  using FuncPtr = CUresult(CUDAAPI *)(const void **, const CUuuid *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGetExportTable");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(ppExportTable, pExportTableId);
-}
-
-#if CUDA_VERSION >= 11030
-
-CUresult CUDAAPI
-cuFlushGPUDirectRDMAWrites(CUflushGPUDirectRDMAWritesTarget target,
-                           CUflushGPUDirectRDMAWritesScope scope) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUflushGPUDirectRDMAWritesTarget,
-                                      CUflushGPUDirectRDMAWritesScope);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuFlushGPUDirectRDMAWrites");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(target, scope);
-}
-
-#endif  // CUDA_VERSION >= 11030
-
-}  // extern "C"
diff --git a/third_party/xla/third_party/tsl/tsl/cuda/cuda_11_2.inc b/third_party/xla/third_party/tsl/tsl/cuda/cuda_11_2.inc
deleted file mode 100644
index 7153901f8e065b..00000000000000
--- a/third_party/xla/third_party/tsl/tsl/cuda/cuda_11_2.inc
+++ /dev/null
@@ -1,2816 +0,0 @@
-// Auto-generated, do not edit.
-
-extern "C" {
-CUresult CUDAAPI cuGetErrorString(CUresult error, const char **pStr) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUresult, const char **);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGetErrorString");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(error, pStr);
-}
-
-CUresult CUDAAPI cuGetErrorName(CUresult error, const char **pStr) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUresult, const char **);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGetErrorName");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(error, pStr);
-}
-
-CUresult CUDAAPI cuInit(unsigned int Flags) {
-  using FuncPtr = CUresult(CUDAAPI *)(unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuInit");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(Flags);
-}
-
-CUresult CUDAAPI cuDriverGetVersion(int *driverVersion) {
-  using FuncPtr = CUresult(CUDAAPI *)(int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuDriverGetVersion");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(driverVersion);
-}
-
-CUresult CUDAAPI cuDeviceGet(CUdevice *device, int ordinal) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdevice *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuDeviceGet");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(device, ordinal);
-}
-
-CUresult CUDAAPI cuDeviceGetCount(int *count) {
-  using FuncPtr = CUresult(CUDAAPI *)(int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuDeviceGetCount");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(count);
-}
-
-CUresult CUDAAPI cuDeviceGetName(char *name, int len, CUdevice dev) {
-  using FuncPtr = CUresult(CUDAAPI *)(char *, int, CUdevice);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuDeviceGetName");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(name, len, dev);
-}
-
-CUresult CUDAAPI cuDeviceGetUuid(CUuuid *uuid, CUdevice dev) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUuuid *, CUdevice);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuDeviceGetUuid");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(uuid, dev);
-}
-
-CUresult CUDAAPI cuDeviceGetLuid(char *luid, unsigned int *deviceNodeMask,
-                                 CUdevice dev) {
-  using FuncPtr = CUresult(CUDAAPI *)(char *, unsigned int *, CUdevice);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuDeviceGetLuid");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(luid, deviceNodeMask, dev);
-}
-
-CUresult CUDAAPI cuDeviceTotalMem(size_t *bytes, CUdevice dev) {
-  using FuncPtr = CUresult(CUDAAPI *)(size_t *, CUdevice);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuDeviceTotalMem_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(bytes, dev);
-}
-
-CUresult CUDAAPI cuDeviceGetTexture1DLinearMaxWidth(size_t *maxWidthInElements,
-                                                    CUarray_format format,
-                                                    unsigned numChannels,
-                                                    CUdevice dev) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(size_t *, CUarray_format, unsigned int, CUdevice);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cuDeviceGetTexture1DLinearMaxWidth");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(maxWidthInElements, format, numChannels, dev);
-}
-
-CUresult CUDAAPI cuDeviceGetAttribute(int *pi, CUdevice_attribute attrib,
-                                      CUdevice dev) {
-  using FuncPtr = CUresult(CUDAAPI *)(int *, CUdevice_attribute, CUdevice);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuDeviceGetAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pi, attrib, dev);
-}
-
-CUresult CUDAAPI cuDeviceGetNvSciSyncAttributes(void *nvSciSyncAttrList,
-                                                CUdevice dev, int flags) {
-  using FuncPtr = CUresult(CUDAAPI *)(void *, CUdevice, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuDeviceGetNvSciSyncAttributes");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(nvSciSyncAttrList, dev, flags);
-}
-
-CUresult CUDAAPI cuDeviceSetMemPool(CUdevice dev, CUmemoryPool pool) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdevice, CUmemoryPool);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuDeviceSetMemPool");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dev, pool);
-}
-
-CUresult CUDAAPI cuDeviceGetMemPool(CUmemoryPool *pool, CUdevice dev) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUmemoryPool *, CUdevice);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuDeviceGetMemPool");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pool, dev);
-}
-
-CUresult CUDAAPI cuDeviceGetDefaultMemPool(CUmemoryPool *pool_out,
-                                           CUdevice dev) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUmemoryPool *, CUdevice);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuDeviceGetDefaultMemPool");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pool_out, dev);
-}
-
-__CUDA_DEPRECATED CUresult CUDAAPI cuDeviceGetProperties(CUdevprop *prop,
-                                                         CUdevice dev) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdevprop *, CUdevice);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuDeviceGetProperties");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(prop, dev);
-}
-
-__CUDA_DEPRECATED CUresult CUDAAPI cuDeviceComputeCapability(int *major,
-                                                             int *minor,
-                                                             CUdevice dev) {
-  using FuncPtr = CUresult(CUDAAPI *)(int *, int *, CUdevice);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuDeviceComputeCapability");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(major, minor, dev);
-}
-
-CUresult CUDAAPI cuDevicePrimaryCtxRetain(CUcontext *pctx, CUdevice dev) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUcontext *, CUdevice);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuDevicePrimaryCtxRetain");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pctx, dev);
-}
-
-CUresult CUDAAPI cuDevicePrimaryCtxRelease(CUdevice dev) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdevice);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuDevicePrimaryCtxRelease_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dev);
-}
-
-CUresult CUDAAPI cuDevicePrimaryCtxSetFlags(CUdevice dev, unsigned int flags) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdevice, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuDevicePrimaryCtxSetFlags_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dev, flags);
-}
-
-CUresult CUDAAPI cuDevicePrimaryCtxGetState(CUdevice dev, unsigned int *flags,
-                                            int *active) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdevice, unsigned int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuDevicePrimaryCtxGetState");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dev, flags, active);
-}
-
-CUresult CUDAAPI cuDevicePrimaryCtxReset(CUdevice dev) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdevice);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuDevicePrimaryCtxReset_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dev);
-}
-
-CUresult CUDAAPI cuCtxCreate(CUcontext *pctx, unsigned int flags,
-                             CUdevice dev) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUcontext *, unsigned int, CUdevice);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuCtxCreate_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pctx, flags, dev);
-}
-
-CUresult CUDAAPI cuCtxDestroy(CUcontext ctx) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUcontext);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuCtxDestroy_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(ctx);
-}
-
-CUresult CUDAAPI cuCtxPushCurrent(CUcontext ctx) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUcontext);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuCtxPushCurrent_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(ctx);
-}
-
-CUresult CUDAAPI cuCtxPopCurrent(CUcontext *pctx) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUcontext *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuCtxPopCurrent_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pctx);
-}
-
-CUresult CUDAAPI cuCtxSetCurrent(CUcontext ctx) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUcontext);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuCtxSetCurrent");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(ctx);
-}
-
-CUresult CUDAAPI cuCtxGetCurrent(CUcontext *pctx) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUcontext *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuCtxGetCurrent");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pctx);
-}
-
-CUresult CUDAAPI cuCtxGetDevice(CUdevice *device) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdevice *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuCtxGetDevice");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(device);
-}
-
-CUresult CUDAAPI cuCtxGetFlags(unsigned int *flags) {
-  using FuncPtr = CUresult(CUDAAPI *)(unsigned int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuCtxGetFlags");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(flags);
-}
-
-CUresult CUDAAPI cuCtxSynchronize(void) {
-  using FuncPtr = CUresult(CUDAAPI *)();
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuCtxSynchronize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr();
-}
-
-CUresult CUDAAPI cuCtxSetLimit(CUlimit limit, size_t value) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUlimit, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuCtxSetLimit");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(limit, value);
-}
-
-CUresult CUDAAPI cuCtxGetLimit(size_t *pvalue, CUlimit limit) {
-  using FuncPtr = CUresult(CUDAAPI *)(size_t *, CUlimit);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuCtxGetLimit");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pvalue, limit);
-}
-
-CUresult CUDAAPI cuCtxGetCacheConfig(CUfunc_cache *pconfig) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUfunc_cache *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuCtxGetCacheConfig");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pconfig);
-}
-
-CUresult CUDAAPI cuCtxSetCacheConfig(CUfunc_cache config) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUfunc_cache);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuCtxSetCacheConfig");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(config);
-}
-
-CUresult CUDAAPI cuCtxGetSharedMemConfig(CUsharedconfig *pConfig) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUsharedconfig *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuCtxGetSharedMemConfig");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pConfig);
-}
-
-CUresult CUDAAPI cuCtxSetSharedMemConfig(CUsharedconfig config) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUsharedconfig);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuCtxSetSharedMemConfig");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(config);
-}
-
-CUresult CUDAAPI cuCtxGetApiVersion(CUcontext ctx, unsigned int *version) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUcontext, unsigned int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuCtxGetApiVersion");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(ctx, version);
-}
-
-CUresult CUDAAPI cuCtxGetStreamPriorityRange(int *leastPriority,
-                                             int *greatestPriority) {
-  using FuncPtr = CUresult(CUDAAPI *)(int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuCtxGetStreamPriorityRange");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(leastPriority, greatestPriority);
-}
-
-CUresult CUDAAPI cuCtxResetPersistingL2Cache(void) {
-  using FuncPtr = CUresult(CUDAAPI *)();
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuCtxResetPersistingL2Cache");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr();
-}
-
-__CUDA_DEPRECATED CUresult CUDAAPI cuCtxAttach(CUcontext *pctx,
-                                               unsigned int flags) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUcontext *, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuCtxAttach");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pctx, flags);
-}
-
-__CUDA_DEPRECATED CUresult CUDAAPI cuCtxDetach(CUcontext ctx) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUcontext);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuCtxDetach");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(ctx);
-}
-
-CUresult CUDAAPI cuModuleLoad(CUmodule *module, const char *fname) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUmodule *, const char *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuModuleLoad");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(module, fname);
-}
-
-CUresult CUDAAPI cuModuleLoadData(CUmodule *module, const void *image) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUmodule *, const void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuModuleLoadData");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(module, image);
-}
-
-CUresult CUDAAPI cuModuleLoadDataEx(CUmodule *module, const void *image,
-                                    unsigned int numOptions,
-                                    CUjit_option *options,
-                                    void **optionValues) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUmodule *, const void *, unsigned int,
-                                      CUjit_option *, void **);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuModuleLoadDataEx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(module, image, numOptions, options, optionValues);
-}
-
-CUresult CUDAAPI cuModuleLoadFatBinary(CUmodule *module, const void *fatCubin) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUmodule *, const void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuModuleLoadFatBinary");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(module, fatCubin);
-}
-
-CUresult CUDAAPI cuModuleUnload(CUmodule hmod) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUmodule);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuModuleUnload");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hmod);
-}
-
-CUresult CUDAAPI cuModuleGetFunction(CUfunction *hfunc, CUmodule hmod,
-                                     const char *name) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUfunction *, CUmodule, const char *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuModuleGetFunction");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hfunc, hmod, name);
-}
-
-CUresult CUDAAPI cuModuleGetGlobal(CUdeviceptr *dptr, size_t *bytes,
-                                   CUmodule hmod, const char *name) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUdeviceptr *, size_t *, CUmodule, const char *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuModuleGetGlobal_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dptr, bytes, hmod, name);
-}
-
-CUresult CUDAAPI cuModuleGetTexRef(CUtexref *pTexRef, CUmodule hmod,
-                                   const char *name) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUtexref *, CUmodule, const char *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuModuleGetTexRef");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pTexRef, hmod, name);
-}
-
-CUresult CUDAAPI cuModuleGetSurfRef(CUsurfref *pSurfRef, CUmodule hmod,
-                                    const char *name) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUsurfref *, CUmodule, const char *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuModuleGetSurfRef");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pSurfRef, hmod, name);
-}
-
-CUresult CUDAAPI cuLinkCreate(unsigned int numOptions, CUjit_option *options,
-                              void **optionValues, CUlinkState *stateOut) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(unsigned int, CUjit_option *, void **, CUlinkState *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuLinkCreate_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(numOptions, options, optionValues, stateOut);
-}
-
-CUresult CUDAAPI cuLinkAddData(CUlinkState state, CUjitInputType type,
-                               void *data, size_t size, const char *name,
-                               unsigned int numOptions, CUjit_option *options,
-                               void **optionValues) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUlinkState, CUjitInputType, void *, size_t,
-                          const char *, unsigned int, CUjit_option *, void **);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuLinkAddData_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(state, type, data, size, name, numOptions, options,
-                  optionValues);
-}
-
-CUresult CUDAAPI cuLinkAddFile(CUlinkState state, CUjitInputType type,
-                               const char *path, unsigned int numOptions,
-                               CUjit_option *options, void **optionValues) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUlinkState, CUjitInputType, const char *,
-                                      unsigned int, CUjit_option *, void **);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuLinkAddFile_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(state, type, path, numOptions, options, optionValues);
-}
-
-CUresult CUDAAPI cuLinkComplete(CUlinkState state, void **cubinOut,
-                                size_t *sizeOut) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUlinkState, void **, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuLinkComplete");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(state, cubinOut, sizeOut);
-}
-
-CUresult CUDAAPI cuLinkDestroy(CUlinkState state) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUlinkState);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuLinkDestroy");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(state);
-}
-
-CUresult CUDAAPI cuMemGetInfo(size_t *free, size_t *total) {
-  using FuncPtr = CUresult(CUDAAPI *)(size_t *, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemGetInfo_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(free, total);
-}
-
-CUresult CUDAAPI cuMemAlloc(CUdeviceptr *dptr, size_t bytesize) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdeviceptr *, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemAlloc_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dptr, bytesize);
-}
-
-CUresult CUDAAPI cuMemAllocPitch(CUdeviceptr *dptr, size_t *pPitch,
-                                 size_t WidthInBytes, size_t Height,
-                                 unsigned int ElementSizeBytes) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdeviceptr *, size_t *, size_t, size_t,
-                                      unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemAllocPitch_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dptr, pPitch, WidthInBytes, Height, ElementSizeBytes);
-}
-
-CUresult CUDAAPI cuMemFree(CUdeviceptr dptr) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdeviceptr);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemFree_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dptr);
-}
-
-CUresult CUDAAPI cuMemGetAddressRange(CUdeviceptr *pbase, size_t *psize,
-                                      CUdeviceptr dptr) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdeviceptr *, size_t *, CUdeviceptr);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemGetAddressRange_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pbase, psize, dptr);
-}
-
-CUresult CUDAAPI cuMemAllocHost(void **pp, size_t bytesize) {
-  using FuncPtr = CUresult(CUDAAPI *)(void **, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemAllocHost_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pp, bytesize);
-}
-
-CUresult CUDAAPI cuMemFreeHost(void *p) {
-  using FuncPtr = CUresult(CUDAAPI *)(void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemFreeHost");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(p);
-}
-
-CUresult CUDAAPI cuMemHostAlloc(void **pp, size_t bytesize,
-                                unsigned int Flags) {
-  using FuncPtr = CUresult(CUDAAPI *)(void **, size_t, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemHostAlloc");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pp, bytesize, Flags);
-}
-
-CUresult CUDAAPI cuMemHostGetDevicePointer(CUdeviceptr *pdptr, void *p,
-                                           unsigned int Flags) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdeviceptr *, void *, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemHostGetDevicePointer_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pdptr, p, Flags);
-}
-
-CUresult CUDAAPI cuMemHostGetFlags(unsigned int *pFlags, void *p) {
-  using FuncPtr = CUresult(CUDAAPI *)(unsigned int *, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemHostGetFlags");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pFlags, p);
-}
-
-CUresult CUDAAPI cuMemAllocManaged(CUdeviceptr *dptr, size_t bytesize,
-                                   unsigned int flags) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdeviceptr *, size_t, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemAllocManaged");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dptr, bytesize, flags);
-}
-
-CUresult CUDAAPI cuDeviceGetByPCIBusId(CUdevice *dev, const char *pciBusId) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdevice *, const char *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuDeviceGetByPCIBusId");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dev, pciBusId);
-}
-
-CUresult CUDAAPI cuDeviceGetPCIBusId(char *pciBusId, int len, CUdevice dev) {
-  using FuncPtr = CUresult(CUDAAPI *)(char *, int, CUdevice);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuDeviceGetPCIBusId");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pciBusId, len, dev);
-}
-
-CUresult CUDAAPI cuIpcGetEventHandle(CUipcEventHandle *pHandle, CUevent event) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUipcEventHandle *, CUevent);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuIpcGetEventHandle");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pHandle, event);
-}
-
-CUresult CUDAAPI cuIpcOpenEventHandle(CUevent *phEvent,
-                                      CUipcEventHandle handle) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUevent *, CUipcEventHandle);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuIpcOpenEventHandle");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(phEvent, handle);
-}
-
-CUresult CUDAAPI cuIpcGetMemHandle(CUipcMemHandle *pHandle, CUdeviceptr dptr) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUipcMemHandle *, CUdeviceptr);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuIpcGetMemHandle");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pHandle, dptr);
-}
-
-CUresult CUDAAPI cuIpcOpenMemHandle(CUdeviceptr *pdptr, CUipcMemHandle handle,
-                                    unsigned int Flags) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUdeviceptr *, CUipcMemHandle, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuIpcOpenMemHandle_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pdptr, handle, Flags);
-}
-
-CUresult CUDAAPI cuIpcCloseMemHandle(CUdeviceptr dptr) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdeviceptr);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuIpcCloseMemHandle");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dptr);
-}
-
-CUresult CUDAAPI cuMemHostRegister(void *p, size_t bytesize,
-                                   unsigned int Flags) {
-  using FuncPtr = CUresult(CUDAAPI *)(void *, size_t, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemHostRegister_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(p, bytesize, Flags);
-}
-
-CUresult CUDAAPI cuMemHostUnregister(void *p) {
-  using FuncPtr = CUresult(CUDAAPI *)(void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemHostUnregister");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(p);
-}
-
-CUresult CUDAAPI cuMemcpy(CUdeviceptr dst, CUdeviceptr src, size_t ByteCount) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdeviceptr, CUdeviceptr, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemcpy");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dst, src, ByteCount);
-}
-
-CUresult CUDAAPI cuMemcpyPeer(CUdeviceptr dstDevice, CUcontext dstContext,
-                              CUdeviceptr srcDevice, CUcontext srcContext,
-                              size_t ByteCount) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdeviceptr, CUcontext, CUdeviceptr,
-                                      CUcontext, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemcpyPeer");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dstDevice, dstContext, srcDevice, srcContext, ByteCount);
-}
-
-CUresult CUDAAPI cuMemcpyHtoD(CUdeviceptr dstDevice, const void *srcHost,
-                              size_t ByteCount) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdeviceptr, const void *, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemcpyHtoD_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dstDevice, srcHost, ByteCount);
-}
-
-CUresult CUDAAPI cuMemcpyDtoH(void *dstHost, CUdeviceptr srcDevice,
-                              size_t ByteCount) {
-  using FuncPtr = CUresult(CUDAAPI *)(void *, CUdeviceptr, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemcpyDtoH_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dstHost, srcDevice, ByteCount);
-}
-
-CUresult CUDAAPI cuMemcpyDtoD(CUdeviceptr dstDevice, CUdeviceptr srcDevice,
-                              size_t ByteCount) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdeviceptr, CUdeviceptr, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemcpyDtoD_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dstDevice, srcDevice, ByteCount);
-}
-
-CUresult CUDAAPI cuMemcpyDtoA(CUarray dstArray, size_t dstOffset,
-                              CUdeviceptr srcDevice, size_t ByteCount) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUarray, size_t, CUdeviceptr, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemcpyDtoA_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dstArray, dstOffset, srcDevice, ByteCount);
-}
-
-CUresult CUDAAPI cuMemcpyAtoD(CUdeviceptr dstDevice, CUarray srcArray,
-                              size_t srcOffset, size_t ByteCount) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdeviceptr, CUarray, size_t, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemcpyAtoD_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dstDevice, srcArray, srcOffset, ByteCount);
-}
-
-CUresult CUDAAPI cuMemcpyHtoA(CUarray dstArray, size_t dstOffset,
-                              const void *srcHost, size_t ByteCount) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUarray, size_t, const void *, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemcpyHtoA_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dstArray, dstOffset, srcHost, ByteCount);
-}
-
-CUresult CUDAAPI cuMemcpyAtoH(void *dstHost, CUarray srcArray, size_t srcOffset,
-                              size_t ByteCount) {
-  using FuncPtr = CUresult(CUDAAPI *)(void *, CUarray, size_t, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemcpyAtoH_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dstHost, srcArray, srcOffset, ByteCount);
-}
-
-CUresult CUDAAPI cuMemcpyAtoA(CUarray dstArray, size_t dstOffset,
-                              CUarray srcArray, size_t srcOffset,
-                              size_t ByteCount) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUarray, size_t, CUarray, size_t, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemcpyAtoA_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dstArray, dstOffset, srcArray, srcOffset, ByteCount);
-}
-
-CUresult CUDAAPI cuMemcpy2D(const CUDA_MEMCPY2D *pCopy) {
-  using FuncPtr = CUresult(CUDAAPI *)(const CUDA_MEMCPY2D *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemcpy2D_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pCopy);
-}
-
-CUresult CUDAAPI cuMemcpy2DUnaligned(const CUDA_MEMCPY2D *pCopy) {
-  using FuncPtr = CUresult(CUDAAPI *)(const CUDA_MEMCPY2D *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemcpy2DUnaligned_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pCopy);
-}
-
-CUresult CUDAAPI cuMemcpy3D(const CUDA_MEMCPY3D *pCopy) {
-  using FuncPtr = CUresult(CUDAAPI *)(const CUDA_MEMCPY3D *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemcpy3D_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pCopy);
-}
-
-CUresult CUDAAPI cuMemcpy3DPeer(const CUDA_MEMCPY3D_PEER *pCopy) {
-  using FuncPtr = CUresult(CUDAAPI *)(const CUDA_MEMCPY3D_PEER *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemcpy3DPeer");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pCopy);
-}
-
-CUresult CUDAAPI cuMemcpyAsync(CUdeviceptr dst, CUdeviceptr src,
-                               size_t ByteCount, CUstream hStream) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUdeviceptr, CUdeviceptr, size_t, CUstream);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemcpyAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dst, src, ByteCount, hStream);
-}
-
-CUresult CUDAAPI cuMemcpyPeerAsync(CUdeviceptr dstDevice, CUcontext dstContext,
-                                   CUdeviceptr srcDevice, CUcontext srcContext,
-                                   size_t ByteCount, CUstream hStream) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdeviceptr, CUcontext, CUdeviceptr,
-                                      CUcontext, size_t, CUstream);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemcpyPeerAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dstDevice, dstContext, srcDevice, srcContext, ByteCount,
-                  hStream);
-}
-
-CUresult CUDAAPI cuMemcpyHtoDAsync(CUdeviceptr dstDevice, const void *srcHost,
-                                   size_t ByteCount, CUstream hStream) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUdeviceptr, const void *, size_t, CUstream);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemcpyHtoDAsync_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dstDevice, srcHost, ByteCount, hStream);
-}
-
-CUresult CUDAAPI cuMemcpyDtoHAsync(void *dstHost, CUdeviceptr srcDevice,
-                                   size_t ByteCount, CUstream hStream) {
-  using FuncPtr = CUresult(CUDAAPI *)(void *, CUdeviceptr, size_t, CUstream);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemcpyDtoHAsync_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dstHost, srcDevice, ByteCount, hStream);
-}
-
-CUresult CUDAAPI cuMemcpyDtoDAsync(CUdeviceptr dstDevice, CUdeviceptr srcDevice,
-                                   size_t ByteCount, CUstream hStream) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUdeviceptr, CUdeviceptr, size_t, CUstream);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemcpyDtoDAsync_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dstDevice, srcDevice, ByteCount, hStream);
-}
-
-CUresult CUDAAPI cuMemcpyHtoAAsync(CUarray dstArray, size_t dstOffset,
-                                   const void *srcHost, size_t ByteCount,
-                                   CUstream hStream) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUarray, size_t, const void *, size_t, CUstream);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemcpyHtoAAsync_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dstArray, dstOffset, srcHost, ByteCount, hStream);
-}
-
-CUresult CUDAAPI cuMemcpyAtoHAsync(void *dstHost, CUarray srcArray,
-                                   size_t srcOffset, size_t ByteCount,
-                                   CUstream hStream) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(void *, CUarray, size_t, size_t, CUstream);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemcpyAtoHAsync_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dstHost, srcArray, srcOffset, ByteCount, hStream);
-}
-
-CUresult CUDAAPI cuMemcpy2DAsync(const CUDA_MEMCPY2D *pCopy, CUstream hStream) {
-  using FuncPtr = CUresult(CUDAAPI *)(const CUDA_MEMCPY2D *, CUstream);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemcpy2DAsync_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pCopy, hStream);
-}
-
-CUresult CUDAAPI cuMemcpy3DAsync(const CUDA_MEMCPY3D *pCopy, CUstream hStream) {
-  using FuncPtr = CUresult(CUDAAPI *)(const CUDA_MEMCPY3D *, CUstream);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemcpy3DAsync_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pCopy, hStream);
-}
-
-CUresult CUDAAPI cuMemcpy3DPeerAsync(const CUDA_MEMCPY3D_PEER *pCopy,
-                                     CUstream hStream) {
-  using FuncPtr = CUresult(CUDAAPI *)(const CUDA_MEMCPY3D_PEER *, CUstream);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemcpy3DPeerAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pCopy, hStream);
-}
-
-CUresult CUDAAPI cuMemsetD8(CUdeviceptr dstDevice, unsigned char uc, size_t N) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdeviceptr, unsigned char, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemsetD8_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dstDevice, uc, N);
-}
-
-CUresult CUDAAPI cuMemsetD16(CUdeviceptr dstDevice, unsigned short us,
-                             size_t N) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdeviceptr, unsigned short, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemsetD16_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dstDevice, us, N);
-}
-
-CUresult CUDAAPI cuMemsetD32(CUdeviceptr dstDevice, unsigned int ui, size_t N) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdeviceptr, unsigned int, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemsetD32_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dstDevice, ui, N);
-}
-
-CUresult CUDAAPI cuMemsetD2D8(CUdeviceptr dstDevice, size_t dstPitch,
-                              unsigned char uc, size_t Width, size_t Height) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUdeviceptr, size_t, unsigned char, size_t, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemsetD2D8_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dstDevice, dstPitch, uc, Width, Height);
-}
-
-CUresult CUDAAPI cuMemsetD2D16(CUdeviceptr dstDevice, size_t dstPitch,
-                               unsigned short us, size_t Width, size_t Height) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUdeviceptr, size_t, unsigned short, size_t, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemsetD2D16_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dstDevice, dstPitch, us, Width, Height);
-}
-
-CUresult CUDAAPI cuMemsetD2D32(CUdeviceptr dstDevice, size_t dstPitch,
-                               unsigned int ui, size_t Width, size_t Height) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUdeviceptr, size_t, unsigned int, size_t, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemsetD2D32_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dstDevice, dstPitch, ui, Width, Height);
-}
-
-CUresult CUDAAPI cuMemsetD8Async(CUdeviceptr dstDevice, unsigned char uc,
-                                 size_t N, CUstream hStream) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUdeviceptr, unsigned char, size_t, CUstream);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemsetD8Async");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dstDevice, uc, N, hStream);
-}
-
-CUresult CUDAAPI cuMemsetD16Async(CUdeviceptr dstDevice, unsigned short us,
-                                  size_t N, CUstream hStream) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUdeviceptr, unsigned short, size_t, CUstream);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemsetD16Async");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dstDevice, us, N, hStream);
-}
-
-CUresult CUDAAPI cuMemsetD32Async(CUdeviceptr dstDevice, unsigned int ui,
-                                  size_t N, CUstream hStream) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUdeviceptr, unsigned int, size_t, CUstream);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemsetD32Async");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dstDevice, ui, N, hStream);
-}
-
-CUresult CUDAAPI cuMemsetD2D8Async(CUdeviceptr dstDevice, size_t dstPitch,
-                                   unsigned char uc, size_t Width,
-                                   size_t Height, CUstream hStream) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdeviceptr, size_t, unsigned char,
-                                      size_t, size_t, CUstream);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemsetD2D8Async");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dstDevice, dstPitch, uc, Width, Height, hStream);
-}
-
-CUresult CUDAAPI cuMemsetD2D16Async(CUdeviceptr dstDevice, size_t dstPitch,
-                                    unsigned short us, size_t Width,
-                                    size_t Height, CUstream hStream) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdeviceptr, size_t, unsigned short,
-                                      size_t, size_t, CUstream);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemsetD2D16Async");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dstDevice, dstPitch, us, Width, Height, hStream);
-}
-
-CUresult CUDAAPI cuMemsetD2D32Async(CUdeviceptr dstDevice, size_t dstPitch,
-                                    unsigned int ui, size_t Width,
-                                    size_t Height, CUstream hStream) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdeviceptr, size_t, unsigned int, size_t,
-                                      size_t, CUstream);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemsetD2D32Async");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dstDevice, dstPitch, ui, Width, Height, hStream);
-}
-
-CUresult CUDAAPI cuArrayCreate(CUarray *pHandle,
-                               const CUDA_ARRAY_DESCRIPTOR *pAllocateArray) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUarray *, const CUDA_ARRAY_DESCRIPTOR *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuArrayCreate_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pHandle, pAllocateArray);
-}
-
-CUresult CUDAAPI cuArrayGetDescriptor(CUDA_ARRAY_DESCRIPTOR *pArrayDescriptor,
-                                      CUarray hArray) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUDA_ARRAY_DESCRIPTOR *, CUarray);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuArrayGetDescriptor_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pArrayDescriptor, hArray);
-}
-
-CUresult CUDAAPI cuArrayGetSparseProperties(
-    CUDA_ARRAY_SPARSE_PROPERTIES *sparseProperties, CUarray array) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUDA_ARRAY_SPARSE_PROPERTIES *, CUarray);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuArrayGetSparseProperties");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(sparseProperties, array);
-}
-
-CUresult CUDAAPI cuMipmappedArrayGetSparseProperties(
-    CUDA_ARRAY_SPARSE_PROPERTIES *sparseProperties, CUmipmappedArray mipmap) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUDA_ARRAY_SPARSE_PROPERTIES *, CUmipmappedArray);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cuMipmappedArrayGetSparseProperties");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(sparseProperties, mipmap);
-}
-
-CUresult CUDAAPI cuArrayGetPlane(CUarray *pPlaneArray, CUarray hArray,
-                                 unsigned int planeIdx) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUarray *, CUarray, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuArrayGetPlane");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pPlaneArray, hArray, planeIdx);
-}
-
-CUresult CUDAAPI cuArrayDestroy(CUarray hArray) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUarray);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuArrayDestroy");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hArray);
-}
-
-CUresult CUDAAPI cuArray3DCreate(
-    CUarray *pHandle, const CUDA_ARRAY3D_DESCRIPTOR *pAllocateArray) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUarray *, const CUDA_ARRAY3D_DESCRIPTOR *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuArray3DCreate_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pHandle, pAllocateArray);
-}
-
-CUresult CUDAAPI cuArray3DGetDescriptor(
-    CUDA_ARRAY3D_DESCRIPTOR *pArrayDescriptor, CUarray hArray) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUDA_ARRAY3D_DESCRIPTOR *, CUarray);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuArray3DGetDescriptor_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pArrayDescriptor, hArray);
-}
-
-CUresult CUDAAPI
-cuMipmappedArrayCreate(CUmipmappedArray *pHandle,
-                       const CUDA_ARRAY3D_DESCRIPTOR *pMipmappedArrayDesc,
-                       unsigned int numMipmapLevels) {
-  using FuncPtr = CUresult(CUDAAPI *)(
-      CUmipmappedArray *, const CUDA_ARRAY3D_DESCRIPTOR *, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMipmappedArrayCreate");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pHandle, pMipmappedArrayDesc, numMipmapLevels);
-}
-
-CUresult CUDAAPI cuMipmappedArrayGetLevel(CUarray *pLevelArray,
-                                          CUmipmappedArray hMipmappedArray,
-                                          unsigned int level) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUarray *, CUmipmappedArray, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMipmappedArrayGetLevel");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pLevelArray, hMipmappedArray, level);
-}
-
-CUresult CUDAAPI cuMipmappedArrayDestroy(CUmipmappedArray hMipmappedArray) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUmipmappedArray);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMipmappedArrayDestroy");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hMipmappedArray);
-}
-
-CUresult CUDAAPI cuMemAddressReserve(CUdeviceptr *ptr, size_t size,
-                                     size_t alignment, CUdeviceptr addr,
-                                     unsigned long long flags) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdeviceptr *, size_t, size_t,
-                                      CUdeviceptr, unsigned long long);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemAddressReserve");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(ptr, size, alignment, addr, flags);
-}
-
-CUresult CUDAAPI cuMemAddressFree(CUdeviceptr ptr, size_t size) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdeviceptr, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemAddressFree");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(ptr, size);
-}
-
-CUresult CUDAAPI cuMemCreate(CUmemGenericAllocationHandle *handle, size_t size,
-                             const CUmemAllocationProp *prop,
-                             unsigned long long flags) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUmemGenericAllocationHandle *, size_t,
-                          const CUmemAllocationProp *, unsigned long long);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemCreate");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, size, prop, flags);
-}
-
-CUresult CUDAAPI cuMemRelease(CUmemGenericAllocationHandle handle) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUmemGenericAllocationHandle);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemRelease");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle);
-}
-
-CUresult CUDAAPI cuMemMap(CUdeviceptr ptr, size_t size, size_t offset,
-                          CUmemGenericAllocationHandle handle,
-                          unsigned long long flags) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUdeviceptr, size_t, size_t,
-                          CUmemGenericAllocationHandle, unsigned long long);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemMap");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(ptr, size, offset, handle, flags);
-}
-
-CUresult CUDAAPI cuMemMapArrayAsync(CUarrayMapInfo *mapInfoList,
-                                    unsigned int count, CUstream hStream) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUarrayMapInfo *, unsigned int, CUstream);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemMapArrayAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(mapInfoList, count, hStream);
-}
-
-CUresult CUDAAPI cuMemUnmap(CUdeviceptr ptr, size_t size) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdeviceptr, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemUnmap");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(ptr, size);
-}
-
-CUresult CUDAAPI cuMemSetAccess(CUdeviceptr ptr, size_t size,
-                                const CUmemAccessDesc *desc, size_t count) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUdeviceptr, size_t, const CUmemAccessDesc *, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemSetAccess");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(ptr, size, desc, count);
-}
-
-CUresult CUDAAPI cuMemGetAccess(unsigned long long *flags,
-                                const CUmemLocation *location,
-                                CUdeviceptr ptr) {
-  using FuncPtr = CUresult(CUDAAPI *)(unsigned long long *,
-                                      const CUmemLocation *, CUdeviceptr);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemGetAccess");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(flags, location, ptr);
-}
-
-CUresult CUDAAPI cuMemExportToShareableHandle(
-    void *shareableHandle, CUmemGenericAllocationHandle handle,
-    CUmemAllocationHandleType handleType, unsigned long long flags) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(void *, CUmemGenericAllocationHandle,
-                          CUmemAllocationHandleType, unsigned long long);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemExportToShareableHandle");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(shareableHandle, handle, handleType, flags);
-}
-
-CUresult CUDAAPI cuMemImportFromShareableHandle(
-    CUmemGenericAllocationHandle *handle, void *osHandle,
-    CUmemAllocationHandleType shHandleType) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUmemGenericAllocationHandle *, void *,
-                                      CUmemAllocationHandleType);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemImportFromShareableHandle");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, osHandle, shHandleType);
-}
-
-CUresult CUDAAPI cuMemGetAllocationGranularity(
-    size_t *granularity, const CUmemAllocationProp *prop,
-    CUmemAllocationGranularity_flags option) {
-  using FuncPtr = CUresult(CUDAAPI *)(size_t *, const CUmemAllocationProp *,
-                                      CUmemAllocationGranularity_flags);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemGetAllocationGranularity");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(granularity, prop, option);
-}
-
-CUresult CUDAAPI cuMemGetAllocationPropertiesFromHandle(
-    CUmemAllocationProp *prop, CUmemGenericAllocationHandle handle) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUmemAllocationProp *, CUmemGenericAllocationHandle);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cuMemGetAllocationPropertiesFromHandle");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(prop, handle);
-}
-
-CUresult CUDAAPI
-cuMemRetainAllocationHandle(CUmemGenericAllocationHandle *handle, void *addr) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUmemGenericAllocationHandle *, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemRetainAllocationHandle");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, addr);
-}
-
-CUresult CUDAAPI cuMemFreeAsync(CUdeviceptr dptr, CUstream hStream) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdeviceptr, CUstream);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemFreeAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dptr, hStream);
-}
-
-CUresult CUDAAPI cuMemAllocAsync(CUdeviceptr *dptr, size_t bytesize,
-                                 CUstream hStream) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdeviceptr *, size_t, CUstream);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemAllocAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dptr, bytesize, hStream);
-}
-
-CUresult CUDAAPI cuMemPoolTrimTo(CUmemoryPool pool, size_t minBytesToKeep) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUmemoryPool, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemPoolTrimTo");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pool, minBytesToKeep);
-}
-
-CUresult CUDAAPI cuMemPoolSetAttribute(CUmemoryPool pool,
-                                       CUmemPool_attribute attr, void *value) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUmemoryPool, CUmemPool_attribute, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemPoolSetAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pool, attr, value);
-}
-
-CUresult CUDAAPI cuMemPoolGetAttribute(CUmemoryPool pool,
-                                       CUmemPool_attribute attr, void *value) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUmemoryPool, CUmemPool_attribute, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemPoolGetAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pool, attr, value);
-}
-
-CUresult CUDAAPI cuMemPoolSetAccess(CUmemoryPool pool,
-                                    const CUmemAccessDesc *map, size_t count) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUmemoryPool, const CUmemAccessDesc *, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemPoolSetAccess");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pool, map, count);
-}
-
-CUresult CUDAAPI cuMemPoolGetAccess(CUmemAccess_flags *flags,
-                                    CUmemoryPool memPool,
-                                    CUmemLocation *location) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUmemAccess_flags *, CUmemoryPool, CUmemLocation *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemPoolGetAccess");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(flags, memPool, location);
-}
-
-CUresult CUDAAPI cuMemPoolCreate(CUmemoryPool *pool,
-                                 const CUmemPoolProps *poolProps) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUmemoryPool *, const CUmemPoolProps *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemPoolCreate");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pool, poolProps);
-}
-
-CUresult CUDAAPI cuMemPoolDestroy(CUmemoryPool pool) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUmemoryPool);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemPoolDestroy");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pool);
-}
-
-CUresult CUDAAPI cuMemAllocFromPoolAsync(CUdeviceptr *dptr, size_t bytesize,
-                                         CUmemoryPool pool, CUstream hStream) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUdeviceptr *, size_t, CUmemoryPool, CUstream);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemAllocFromPoolAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dptr, bytesize, pool, hStream);
-}
-
-CUresult CUDAAPI cuMemPoolExportToShareableHandle(
-    void *handle_out, CUmemoryPool pool, CUmemAllocationHandleType handleType,
-    unsigned long long flags) {
-  using FuncPtr = CUresult(CUDAAPI *)(
-      void *, CUmemoryPool, CUmemAllocationHandleType, unsigned long long);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cuMemPoolExportToShareableHandle");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle_out, pool, handleType, flags);
-}
-
-CUresult CUDAAPI cuMemPoolImportFromShareableHandle(
-    CUmemoryPool *pool_out, void *handle, CUmemAllocationHandleType handleType,
-    unsigned long long flags) {
-  using FuncPtr = CUresult(CUDAAPI *)(
-      CUmemoryPool *, void *, CUmemAllocationHandleType, unsigned long long);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cuMemPoolImportFromShareableHandle");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pool_out, handle, handleType, flags);
-}
-
-CUresult CUDAAPI cuMemPoolExportPointer(CUmemPoolPtrExportData *shareData_out,
-                                        CUdeviceptr ptr) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUmemPoolPtrExportData *, CUdeviceptr);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemPoolExportPointer");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(shareData_out, ptr);
-}
-
-CUresult CUDAAPI cuMemPoolImportPointer(CUdeviceptr *ptr_out, CUmemoryPool pool,
-                                        CUmemPoolPtrExportData *shareData) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdeviceptr *, CUmemoryPool,
-                                      CUmemPoolPtrExportData *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemPoolImportPointer");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(ptr_out, pool, shareData);
-}
-
-CUresult CUDAAPI cuPointerGetAttribute(void *data,
-                                       CUpointer_attribute attribute,
-                                       CUdeviceptr ptr) {
-  using FuncPtr = CUresult(CUDAAPI *)(void *, CUpointer_attribute, CUdeviceptr);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuPointerGetAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(data, attribute, ptr);
-}
-
-CUresult CUDAAPI cuMemPrefetchAsync(CUdeviceptr devPtr, size_t count,
-                                    CUdevice dstDevice, CUstream hStream) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdeviceptr, size_t, CUdevice, CUstream);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemPrefetchAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(devPtr, count, dstDevice, hStream);
-}
-
-CUresult CUDAAPI cuMemAdvise(CUdeviceptr devPtr, size_t count,
-                             CUmem_advise advice, CUdevice device) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUdeviceptr, size_t, CUmem_advise, CUdevice);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemAdvise");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(devPtr, count, advice, device);
-}
-
-CUresult CUDAAPI cuMemRangeGetAttribute(void *data, size_t dataSize,
-                                        CUmem_range_attribute attribute,
-                                        CUdeviceptr devPtr, size_t count) {
-  using FuncPtr = CUresult(CUDAAPI *)(void *, size_t, CUmem_range_attribute,
-                                      CUdeviceptr, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemRangeGetAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(data, dataSize, attribute, devPtr, count);
-}
-
-CUresult CUDAAPI cuMemRangeGetAttributes(void **data, size_t *dataSizes,
-                                         CUmem_range_attribute *attributes,
-                                         size_t numAttributes,
-                                         CUdeviceptr devPtr, size_t count) {
-  using FuncPtr = CUresult(CUDAAPI *)(
-      void **, size_t *, CUmem_range_attribute *, size_t, CUdeviceptr, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemRangeGetAttributes");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(data, dataSizes, attributes, numAttributes, devPtr, count);
-}
-
-CUresult CUDAAPI cuPointerSetAttribute(const void *value,
-                                       CUpointer_attribute attribute,
-                                       CUdeviceptr ptr) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(const void *, CUpointer_attribute, CUdeviceptr);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuPointerSetAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(value, attribute, ptr);
-}
-
-CUresult CUDAAPI cuPointerGetAttributes(unsigned int numAttributes,
-                                        CUpointer_attribute *attributes,
-                                        void **data, CUdeviceptr ptr) {
-  using FuncPtr = CUresult(CUDAAPI *)(unsigned int, CUpointer_attribute *,
-                                      void **, CUdeviceptr);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuPointerGetAttributes");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(numAttributes, attributes, data, ptr);
-}
-
-CUresult CUDAAPI cuStreamCreate(CUstream *phStream, unsigned int Flags) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUstream *, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuStreamCreate");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(phStream, Flags);
-}
-
-CUresult CUDAAPI cuStreamCreateWithPriority(CUstream *phStream,
-                                            unsigned int flags, int priority) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUstream *, unsigned int, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuStreamCreateWithPriority");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(phStream, flags, priority);
-}
-
-CUresult CUDAAPI cuStreamGetPriority(CUstream hStream, int *priority) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUstream, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuStreamGetPriority");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hStream, priority);
-}
-
-CUresult CUDAAPI cuStreamGetFlags(CUstream hStream, unsigned int *flags) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUstream, unsigned int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuStreamGetFlags");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hStream, flags);
-}
-
-CUresult CUDAAPI cuStreamGetCtx(CUstream hStream, CUcontext *pctx) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUstream, CUcontext *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuStreamGetCtx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hStream, pctx);
-}
-
-CUresult CUDAAPI cuStreamWaitEvent(CUstream hStream, CUevent hEvent,
-                                   unsigned int Flags) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUstream, CUevent, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuStreamWaitEvent");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hStream, hEvent, Flags);
-}
-
-CUresult CUDAAPI cuStreamAddCallback(CUstream hStream,
-                                     CUstreamCallback callback, void *userData,
-                                     unsigned int flags) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUstream, CUstreamCallback, void *, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuStreamAddCallback");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hStream, callback, userData, flags);
-}
-
-CUresult CUDAAPI cuStreamBeginCapture(CUstream hStream,
-                                      CUstreamCaptureMode mode) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUstream, CUstreamCaptureMode);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuStreamBeginCapture_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hStream, mode);
-}
-
-CUresult CUDAAPI cuThreadExchangeStreamCaptureMode(CUstreamCaptureMode *mode) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUstreamCaptureMode *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cuThreadExchangeStreamCaptureMode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(mode);
-}
-
-CUresult CUDAAPI cuStreamEndCapture(CUstream hStream, CUgraph *phGraph) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUstream, CUgraph *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuStreamEndCapture");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hStream, phGraph);
-}
-
-CUresult CUDAAPI cuStreamIsCapturing(CUstream hStream,
-                                     CUstreamCaptureStatus *captureStatus) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUstream, CUstreamCaptureStatus *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuStreamIsCapturing");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hStream, captureStatus);
-}
-
-CUresult CUDAAPI cuStreamGetCaptureInfo(CUstream hStream,
-                                        CUstreamCaptureStatus *captureStatus,
-                                        cuuint64_t *id) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUstream, CUstreamCaptureStatus *, cuuint64_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuStreamGetCaptureInfo");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hStream, captureStatus, id);
-}
-
-CUresult CUDAAPI cuStreamAttachMemAsync(CUstream hStream, CUdeviceptr dptr,
-                                        size_t length, unsigned int flags) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUstream, CUdeviceptr, size_t, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuStreamAttachMemAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hStream, dptr, length, flags);
-}
-
-CUresult CUDAAPI cuStreamQuery(CUstream hStream) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUstream);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuStreamQuery");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hStream);
-}
-
-CUresult CUDAAPI cuStreamSynchronize(CUstream hStream) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUstream);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuStreamSynchronize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hStream);
-}
-
-CUresult CUDAAPI cuStreamDestroy(CUstream hStream) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUstream);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuStreamDestroy_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hStream);
-}
-
-CUresult CUDAAPI cuStreamCopyAttributes(CUstream dst, CUstream src) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUstream, CUstream);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuStreamCopyAttributes");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dst, src);
-}
-
-CUresult CUDAAPI cuStreamGetAttribute(CUstream hStream, CUstreamAttrID attr,
-                                      CUstreamAttrValue *value_out) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUstream, CUstreamAttrID, CUstreamAttrValue *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuStreamGetAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hStream, attr, value_out);
-}
-
-CUresult CUDAAPI cuStreamSetAttribute(CUstream hStream, CUstreamAttrID attr,
-                                      const CUstreamAttrValue *value) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUstream, CUstreamAttrID, const CUstreamAttrValue *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuStreamSetAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hStream, attr, value);
-}
-
-CUresult CUDAAPI cuEventCreate(CUevent *phEvent, unsigned int Flags) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUevent *, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuEventCreate");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(phEvent, Flags);
-}
-
-CUresult CUDAAPI cuEventRecord(CUevent hEvent, CUstream hStream) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUevent, CUstream);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuEventRecord");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hEvent, hStream);
-}
-
-CUresult CUDAAPI cuEventRecordWithFlags(CUevent hEvent, CUstream hStream,
-                                        unsigned int flags) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUevent, CUstream, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuEventRecordWithFlags");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hEvent, hStream, flags);
-}
-
-CUresult CUDAAPI cuEventQuery(CUevent hEvent) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUevent);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuEventQuery");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hEvent);
-}
-
-CUresult CUDAAPI cuEventSynchronize(CUevent hEvent) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUevent);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuEventSynchronize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hEvent);
-}
-
-CUresult CUDAAPI cuEventDestroy(CUevent hEvent) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUevent);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuEventDestroy_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hEvent);
-}
-
-CUresult CUDAAPI cuEventElapsedTime(float *pMilliseconds, CUevent hStart,
-                                    CUevent hEnd) {
-  using FuncPtr = CUresult(CUDAAPI *)(float *, CUevent, CUevent);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuEventElapsedTime");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pMilliseconds, hStart, hEnd);
-}
-
-CUresult CUDAAPI
-cuImportExternalMemory(CUexternalMemory *extMem_out,
-                       const CUDA_EXTERNAL_MEMORY_HANDLE_DESC *memHandleDesc) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUexternalMemory *,
-                                      const CUDA_EXTERNAL_MEMORY_HANDLE_DESC *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuImportExternalMemory");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(extMem_out, memHandleDesc);
-}
-
-CUresult CUDAAPI cuExternalMemoryGetMappedBuffer(
-    CUdeviceptr *devPtr, CUexternalMemory extMem,
-    const CUDA_EXTERNAL_MEMORY_BUFFER_DESC *bufferDesc) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdeviceptr *, CUexternalMemory,
-                                      const CUDA_EXTERNAL_MEMORY_BUFFER_DESC *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuExternalMemoryGetMappedBuffer");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(devPtr, extMem, bufferDesc);
-}
-
-CUresult CUDAAPI cuExternalMemoryGetMappedMipmappedArray(
-    CUmipmappedArray *mipmap, CUexternalMemory extMem,
-    const CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC *mipmapDesc) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUmipmappedArray *, CUexternalMemory,
-                          const CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cuExternalMemoryGetMappedMipmappedArray");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(mipmap, extMem, mipmapDesc);
-}
-
-CUresult CUDAAPI cuDestroyExternalMemory(CUexternalMemory extMem) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUexternalMemory);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuDestroyExternalMemory");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(extMem);
-}
-
-CUresult CUDAAPI cuImportExternalSemaphore(
-    CUexternalSemaphore *extSem_out,
-    const CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC *semHandleDesc) {
-  using FuncPtr = CUresult(CUDAAPI *)(
-      CUexternalSemaphore *, const CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuImportExternalSemaphore");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(extSem_out, semHandleDesc);
-}
-
-CUresult CUDAAPI cuSignalExternalSemaphoresAsync(
-    const CUexternalSemaphore *extSemArray,
-    const CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS *paramsArray,
-    unsigned int numExtSems, CUstream stream) {
-  using FuncPtr = CUresult(CUDAAPI *)(
-      const CUexternalSemaphore *,
-      const CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS *, unsigned int, CUstream);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuSignalExternalSemaphoresAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(extSemArray, paramsArray, numExtSems, stream);
-}
-
-CUresult CUDAAPI cuWaitExternalSemaphoresAsync(
-    const CUexternalSemaphore *extSemArray,
-    const CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS *paramsArray,
-    unsigned int numExtSems, CUstream stream) {
-  using FuncPtr = CUresult(CUDAAPI *)(
-      const CUexternalSemaphore *, const CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS *,
-      unsigned int, CUstream);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuWaitExternalSemaphoresAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(extSemArray, paramsArray, numExtSems, stream);
-}
-
-CUresult CUDAAPI cuDestroyExternalSemaphore(CUexternalSemaphore extSem) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUexternalSemaphore);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuDestroyExternalSemaphore");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(extSem);
-}
-
-CUresult CUDAAPI cuStreamWaitValue32(CUstream stream, CUdeviceptr addr,
-                                     cuuint32_t value, unsigned int flags) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUstream, CUdeviceptr, cuuint32_t, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuStreamWaitValue32");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(stream, addr, value, flags);
-}
-
-CUresult CUDAAPI cuStreamWaitValue64(CUstream stream, CUdeviceptr addr,
-                                     cuuint64_t value, unsigned int flags) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUstream, CUdeviceptr, cuuint64_t, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuStreamWaitValue64");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(stream, addr, value, flags);
-}
-
-CUresult CUDAAPI cuStreamWriteValue32(CUstream stream, CUdeviceptr addr,
-                                      cuuint32_t value, unsigned int flags) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUstream, CUdeviceptr, cuuint32_t, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuStreamWriteValue32");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(stream, addr, value, flags);
-}
-
-CUresult CUDAAPI cuStreamWriteValue64(CUstream stream, CUdeviceptr addr,
-                                      cuuint64_t value, unsigned int flags) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUstream, CUdeviceptr, cuuint64_t, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuStreamWriteValue64");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(stream, addr, value, flags);
-}
-
-CUresult CUDAAPI cuStreamBatchMemOp(CUstream stream, unsigned int count,
-                                    CUstreamBatchMemOpParams *paramArray,
-                                    unsigned int flags) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUstream, unsigned int,
-                                      CUstreamBatchMemOpParams *, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuStreamBatchMemOp");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(stream, count, paramArray, flags);
-}
-
-CUresult CUDAAPI cuFuncGetAttribute(int *pi, CUfunction_attribute attrib,
-                                    CUfunction hfunc) {
-  using FuncPtr = CUresult(CUDAAPI *)(int *, CUfunction_attribute, CUfunction);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuFuncGetAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pi, attrib, hfunc);
-}
-
-CUresult CUDAAPI cuFuncSetAttribute(CUfunction hfunc,
-                                    CUfunction_attribute attrib, int value) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUfunction, CUfunction_attribute, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuFuncSetAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hfunc, attrib, value);
-}
-
-CUresult CUDAAPI cuFuncSetCacheConfig(CUfunction hfunc, CUfunc_cache config) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUfunction, CUfunc_cache);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuFuncSetCacheConfig");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hfunc, config);
-}
-
-CUresult CUDAAPI cuFuncSetSharedMemConfig(CUfunction hfunc,
-                                          CUsharedconfig config) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUfunction, CUsharedconfig);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuFuncSetSharedMemConfig");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hfunc, config);
-}
-
-CUresult CUDAAPI cuLaunchKernel(CUfunction f, unsigned int gridDimX,
-                                unsigned int gridDimY, unsigned int gridDimZ,
-                                unsigned int blockDimX, unsigned int blockDimY,
-                                unsigned int blockDimZ,
-                                unsigned int sharedMemBytes, CUstream hStream,
-                                void **kernelParams, void **extra) {
-  using FuncPtr = CUresult(CUDAAPI *)(
-      CUfunction, unsigned int, unsigned int, unsigned int, unsigned int,
-      unsigned int, unsigned int, unsigned int, CUstream, void **, void **);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuLaunchKernel");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(f, gridDimX, gridDimY, gridDimZ, blockDimX, blockDimY,
-                  blockDimZ, sharedMemBytes, hStream, kernelParams, extra);
-}
-
-CUresult CUDAAPI cuLaunchCooperativeKernel(
-    CUfunction f, unsigned int gridDimX, unsigned int gridDimY,
-    unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY,
-    unsigned int blockDimZ, unsigned int sharedMemBytes, CUstream hStream,
-    void **kernelParams) {
-  using FuncPtr = CUresult(CUDAAPI *)(
-      CUfunction, unsigned int, unsigned int, unsigned int, unsigned int,
-      unsigned int, unsigned int, unsigned int, CUstream, void **);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuLaunchCooperativeKernel");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(f, gridDimX, gridDimY, gridDimZ, blockDimX, blockDimY,
-                  blockDimZ, sharedMemBytes, hStream, kernelParams);
-}
-
-CUresult CUDAAPI cuLaunchCooperativeKernelMultiDevice(
-    CUDA_LAUNCH_PARAMS *launchParamsList, unsigned int numDevices,
-    unsigned int flags) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUDA_LAUNCH_PARAMS *, unsigned int, unsigned int);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cuLaunchCooperativeKernelMultiDevice");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(launchParamsList, numDevices, flags);
-}
-
-CUresult CUDAAPI cuLaunchHostFunc(CUstream hStream, CUhostFn fn,
-                                  void *userData) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUstream, CUhostFn, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuLaunchHostFunc");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hStream, fn, userData);
-}
-
-__CUDA_DEPRECATED CUresult CUDAAPI cuFuncSetBlockShape(CUfunction hfunc, int x,
-                                                       int y, int z) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUfunction, int, int, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuFuncSetBlockShape");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hfunc, x, y, z);
-}
-
-__CUDA_DEPRECATED CUresult CUDAAPI cuFuncSetSharedSize(CUfunction hfunc,
-                                                       unsigned int bytes) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUfunction, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuFuncSetSharedSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hfunc, bytes);
-}
-
-__CUDA_DEPRECATED CUresult CUDAAPI cuParamSetSize(CUfunction hfunc,
-                                                  unsigned int numbytes) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUfunction, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuParamSetSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hfunc, numbytes);
-}
-
-__CUDA_DEPRECATED CUresult CUDAAPI cuParamSeti(CUfunction hfunc, int offset,
-                                               unsigned int value) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUfunction, int, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuParamSeti");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hfunc, offset, value);
-}
-
-__CUDA_DEPRECATED CUresult CUDAAPI cuParamSetf(CUfunction hfunc, int offset,
-                                               float value) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUfunction, int, float);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuParamSetf");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hfunc, offset, value);
-}
-
-__CUDA_DEPRECATED CUresult CUDAAPI cuParamSetv(CUfunction hfunc, int offset,
-                                               void *ptr,
-                                               unsigned int numbytes) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUfunction, int, void *, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuParamSetv");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hfunc, offset, ptr, numbytes);
-}
-
-__CUDA_DEPRECATED CUresult CUDAAPI cuLaunch(CUfunction f) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUfunction);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuLaunch");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(f);
-}
-
-__CUDA_DEPRECATED CUresult CUDAAPI cuLaunchGrid(CUfunction f, int grid_width,
-                                                int grid_height) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUfunction, int, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuLaunchGrid");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(f, grid_width, grid_height);
-}
-
-__CUDA_DEPRECATED CUresult CUDAAPI cuLaunchGridAsync(CUfunction f,
-                                                     int grid_width,
-                                                     int grid_height,
-                                                     CUstream hStream) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUfunction, int, int, CUstream);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuLaunchGridAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(f, grid_width, grid_height, hStream);
-}
-
-__CUDA_DEPRECATED CUresult CUDAAPI cuParamSetTexRef(CUfunction hfunc,
-                                                    int texunit,
-                                                    CUtexref hTexRef) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUfunction, int, CUtexref);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuParamSetTexRef");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hfunc, texunit, hTexRef);
-}
-
-CUresult CUDAAPI cuGraphCreate(CUgraph *phGraph, unsigned int flags) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraph *, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphCreate");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(phGraph, flags);
-}
-
-CUresult CUDAAPI cuGraphAddKernelNode(
-    CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies,
-    size_t numDependencies, const CUDA_KERNEL_NODE_PARAMS *nodeParams) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUgraphNode *, CUgraph, const CUgraphNode *, size_t,
-                          const CUDA_KERNEL_NODE_PARAMS *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphAddKernelNode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(phGraphNode, hGraph, dependencies, numDependencies,
-                  nodeParams);
-}
-
-CUresult CUDAAPI cuGraphKernelNodeGetParams(
-    CUgraphNode hNode, CUDA_KERNEL_NODE_PARAMS *nodeParams) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraphNode, CUDA_KERNEL_NODE_PARAMS *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphKernelNodeGetParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hNode, nodeParams);
-}
-
-CUresult CUDAAPI cuGraphKernelNodeSetParams(
-    CUgraphNode hNode, const CUDA_KERNEL_NODE_PARAMS *nodeParams) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUgraphNode, const CUDA_KERNEL_NODE_PARAMS *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphKernelNodeSetParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hNode, nodeParams);
-}
-
-CUresult CUDAAPI cuGraphAddMemcpyNode(CUgraphNode *phGraphNode, CUgraph hGraph,
-                                      const CUgraphNode *dependencies,
-                                      size_t numDependencies,
-                                      const CUDA_MEMCPY3D *copyParams,
-                                      CUcontext ctx) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUgraphNode *, CUgraph, const CUgraphNode *, size_t,
-                          const CUDA_MEMCPY3D *, CUcontext);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphAddMemcpyNode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(phGraphNode, hGraph, dependencies, numDependencies,
-                  copyParams, ctx);
-}
-
-CUresult CUDAAPI cuGraphMemcpyNodeGetParams(CUgraphNode hNode,
-                                            CUDA_MEMCPY3D *nodeParams) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraphNode, CUDA_MEMCPY3D *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphMemcpyNodeGetParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hNode, nodeParams);
-}
-
-CUresult CUDAAPI cuGraphMemcpyNodeSetParams(CUgraphNode hNode,
-                                            const CUDA_MEMCPY3D *nodeParams) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraphNode, const CUDA_MEMCPY3D *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphMemcpyNodeSetParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hNode, nodeParams);
-}
-
-CUresult CUDAAPI cuGraphAddMemsetNode(
-    CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies,
-    size_t numDependencies, const CUDA_MEMSET_NODE_PARAMS *memsetParams,
-    CUcontext ctx) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUgraphNode *, CUgraph, const CUgraphNode *, size_t,
-                          const CUDA_MEMSET_NODE_PARAMS *, CUcontext);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphAddMemsetNode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(phGraphNode, hGraph, dependencies, numDependencies,
-                  memsetParams, ctx);
-}
-
-CUresult CUDAAPI cuGraphMemsetNodeGetParams(
-    CUgraphNode hNode, CUDA_MEMSET_NODE_PARAMS *nodeParams) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraphNode, CUDA_MEMSET_NODE_PARAMS *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphMemsetNodeGetParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hNode, nodeParams);
-}
-
-CUresult CUDAAPI cuGraphMemsetNodeSetParams(
-    CUgraphNode hNode, const CUDA_MEMSET_NODE_PARAMS *nodeParams) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUgraphNode, const CUDA_MEMSET_NODE_PARAMS *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphMemsetNodeSetParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hNode, nodeParams);
-}
-
-CUresult CUDAAPI cuGraphAddHostNode(CUgraphNode *phGraphNode, CUgraph hGraph,
-                                    const CUgraphNode *dependencies,
-                                    size_t numDependencies,
-                                    const CUDA_HOST_NODE_PARAMS *nodeParams) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUgraphNode *, CUgraph, const CUgraphNode *, size_t,
-                          const CUDA_HOST_NODE_PARAMS *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphAddHostNode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(phGraphNode, hGraph, dependencies, numDependencies,
-                  nodeParams);
-}
-
-CUresult CUDAAPI cuGraphHostNodeGetParams(CUgraphNode hNode,
-                                          CUDA_HOST_NODE_PARAMS *nodeParams) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraphNode, CUDA_HOST_NODE_PARAMS *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphHostNodeGetParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hNode, nodeParams);
-}
-
-CUresult CUDAAPI cuGraphHostNodeSetParams(
-    CUgraphNode hNode, const CUDA_HOST_NODE_PARAMS *nodeParams) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUgraphNode, const CUDA_HOST_NODE_PARAMS *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphHostNodeSetParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hNode, nodeParams);
-}
-
-CUresult CUDAAPI cuGraphAddChildGraphNode(CUgraphNode *phGraphNode,
-                                          CUgraph hGraph,
-                                          const CUgraphNode *dependencies,
-                                          size_t numDependencies,
-                                          CUgraph childGraph) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraphNode *, CUgraph,
-                                      const CUgraphNode *, size_t, CUgraph);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphAddChildGraphNode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(phGraphNode, hGraph, dependencies, numDependencies,
-                  childGraph);
-}
-
-CUresult CUDAAPI cuGraphChildGraphNodeGetGraph(CUgraphNode hNode,
-                                               CUgraph *phGraph) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraphNode, CUgraph *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphChildGraphNodeGetGraph");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hNode, phGraph);
-}
-
-CUresult CUDAAPI cuGraphAddEmptyNode(CUgraphNode *phGraphNode, CUgraph hGraph,
-                                     const CUgraphNode *dependencies,
-                                     size_t numDependencies) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUgraphNode *, CUgraph, const CUgraphNode *, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphAddEmptyNode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(phGraphNode, hGraph, dependencies, numDependencies);
-}
-
-CUresult CUDAAPI cuGraphAddEventRecordNode(CUgraphNode *phGraphNode,
-                                           CUgraph hGraph,
-                                           const CUgraphNode *dependencies,
-                                           size_t numDependencies,
-                                           CUevent event) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraphNode *, CUgraph,
-                                      const CUgraphNode *, size_t, CUevent);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphAddEventRecordNode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(phGraphNode, hGraph, dependencies, numDependencies, event);
-}
-
-CUresult CUDAAPI cuGraphEventRecordNodeGetEvent(CUgraphNode hNode,
-                                                CUevent *event_out) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraphNode, CUevent *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphEventRecordNodeGetEvent");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hNode, event_out);
-}
-
-CUresult CUDAAPI cuGraphEventRecordNodeSetEvent(CUgraphNode hNode,
-                                                CUevent event) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraphNode, CUevent);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphEventRecordNodeSetEvent");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hNode, event);
-}
-
-CUresult CUDAAPI cuGraphAddEventWaitNode(CUgraphNode *phGraphNode,
-                                         CUgraph hGraph,
-                                         const CUgraphNode *dependencies,
-                                         size_t numDependencies,
-                                         CUevent event) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraphNode *, CUgraph,
-                                      const CUgraphNode *, size_t, CUevent);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphAddEventWaitNode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(phGraphNode, hGraph, dependencies, numDependencies, event);
-}
-
-CUresult CUDAAPI cuGraphEventWaitNodeGetEvent(CUgraphNode hNode,
-                                              CUevent *event_out) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraphNode, CUevent *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphEventWaitNodeGetEvent");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hNode, event_out);
-}
-
-CUresult CUDAAPI cuGraphEventWaitNodeSetEvent(CUgraphNode hNode,
-                                              CUevent event) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraphNode, CUevent);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphEventWaitNodeSetEvent");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hNode, event);
-}
-
-CUresult CUDAAPI cuGraphAddExternalSemaphoresSignalNode(
-    CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies,
-    size_t numDependencies, const CUDA_EXT_SEM_SIGNAL_NODE_PARAMS *nodeParams) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUgraphNode *, CUgraph, const CUgraphNode *, size_t,
-                          const CUDA_EXT_SEM_SIGNAL_NODE_PARAMS *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cuGraphAddExternalSemaphoresSignalNode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(phGraphNode, hGraph, dependencies, numDependencies,
-                  nodeParams);
-}
-
-CUresult CUDAAPI cuGraphExternalSemaphoresSignalNodeGetParams(
-    CUgraphNode hNode, CUDA_EXT_SEM_SIGNAL_NODE_PARAMS *params_out) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUgraphNode, CUDA_EXT_SEM_SIGNAL_NODE_PARAMS *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cuGraphExternalSemaphoresSignalNodeGetParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hNode, params_out);
-}
-
-CUresult CUDAAPI cuGraphExternalSemaphoresSignalNodeSetParams(
-    CUgraphNode hNode, const CUDA_EXT_SEM_SIGNAL_NODE_PARAMS *nodeParams) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUgraphNode, const CUDA_EXT_SEM_SIGNAL_NODE_PARAMS *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cuGraphExternalSemaphoresSignalNodeSetParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hNode, nodeParams);
-}
-
-CUresult CUDAAPI cuGraphAddExternalSemaphoresWaitNode(
-    CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies,
-    size_t numDependencies, const CUDA_EXT_SEM_WAIT_NODE_PARAMS *nodeParams) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUgraphNode *, CUgraph, const CUgraphNode *, size_t,
-                          const CUDA_EXT_SEM_WAIT_NODE_PARAMS *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cuGraphAddExternalSemaphoresWaitNode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(phGraphNode, hGraph, dependencies, numDependencies,
-                  nodeParams);
-}
-
-CUresult CUDAAPI cuGraphExternalSemaphoresWaitNodeGetParams(
-    CUgraphNode hNode, CUDA_EXT_SEM_WAIT_NODE_PARAMS *params_out) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUgraphNode, CUDA_EXT_SEM_WAIT_NODE_PARAMS *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cuGraphExternalSemaphoresWaitNodeGetParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hNode, params_out);
-}
-
-CUresult CUDAAPI cuGraphExternalSemaphoresWaitNodeSetParams(
-    CUgraphNode hNode, const CUDA_EXT_SEM_WAIT_NODE_PARAMS *nodeParams) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUgraphNode, const CUDA_EXT_SEM_WAIT_NODE_PARAMS *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cuGraphExternalSemaphoresWaitNodeSetParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hNode, nodeParams);
-}
-
-CUresult CUDAAPI cuGraphClone(CUgraph *phGraphClone, CUgraph originalGraph) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraph *, CUgraph);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphClone");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(phGraphClone, originalGraph);
-}
-
-CUresult CUDAAPI cuGraphNodeFindInClone(CUgraphNode *phNode,
-                                        CUgraphNode hOriginalNode,
-                                        CUgraph hClonedGraph) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraphNode *, CUgraphNode, CUgraph);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphNodeFindInClone");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(phNode, hOriginalNode, hClonedGraph);
-}
-
-CUresult CUDAAPI cuGraphNodeGetType(CUgraphNode hNode, CUgraphNodeType *type) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraphNode, CUgraphNodeType *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphNodeGetType");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hNode, type);
-}
-
-CUresult CUDAAPI cuGraphGetNodes(CUgraph hGraph, CUgraphNode *nodes,
-                                 size_t *numNodes) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraph, CUgraphNode *, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphGetNodes");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hGraph, nodes, numNodes);
-}
-
-CUresult CUDAAPI cuGraphGetRootNodes(CUgraph hGraph, CUgraphNode *rootNodes,
-                                     size_t *numRootNodes) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraph, CUgraphNode *, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphGetRootNodes");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hGraph, rootNodes, numRootNodes);
-}
-
-CUresult CUDAAPI cuGraphGetEdges(CUgraph hGraph, CUgraphNode *from,
-                                 CUgraphNode *to, size_t *numEdges) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUgraph, CUgraphNode *, CUgraphNode *, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphGetEdges");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hGraph, from, to, numEdges);
-}
-
-CUresult CUDAAPI cuGraphNodeGetDependencies(CUgraphNode hNode,
-                                            CUgraphNode *dependencies,
-                                            size_t *numDependencies) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraphNode, CUgraphNode *, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphNodeGetDependencies");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hNode, dependencies, numDependencies);
-}
-
-CUresult CUDAAPI cuGraphNodeGetDependentNodes(CUgraphNode hNode,
-                                              CUgraphNode *dependentNodes,
-                                              size_t *numDependentNodes) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraphNode, CUgraphNode *, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphNodeGetDependentNodes");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hNode, dependentNodes, numDependentNodes);
-}
-
-CUresult CUDAAPI cuGraphAddDependencies(CUgraph hGraph, const CUgraphNode *from,
-                                        const CUgraphNode *to,
-                                        size_t numDependencies) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraph, const CUgraphNode *,
-                                      const CUgraphNode *, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphAddDependencies");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hGraph, from, to, numDependencies);
-}
-
-CUresult CUDAAPI cuGraphRemoveDependencies(CUgraph hGraph,
-                                           const CUgraphNode *from,
-                                           const CUgraphNode *to,
-                                           size_t numDependencies) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraph, const CUgraphNode *,
-                                      const CUgraphNode *, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphRemoveDependencies");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hGraph, from, to, numDependencies);
-}
-
-CUresult CUDAAPI cuGraphDestroyNode(CUgraphNode hNode) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraphNode);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphDestroyNode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hNode);
-}
-
-CUresult CUDAAPI cuGraphInstantiate(CUgraphExec *phGraphExec, CUgraph hGraph,
-                                    CUgraphNode *phErrorNode, char *logBuffer,
-                                    size_t bufferSize) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraphExec *, CUgraph, CUgraphNode *,
-                                      char *, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphInstantiate_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(phGraphExec, hGraph, phErrorNode, logBuffer, bufferSize);
-}
-
-CUresult CUDAAPI
-cuGraphExecKernelNodeSetParams(CUgraphExec hGraphExec, CUgraphNode hNode,
-                               const CUDA_KERNEL_NODE_PARAMS *nodeParams) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraphExec, CUgraphNode,
-                                      const CUDA_KERNEL_NODE_PARAMS *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphExecKernelNodeSetParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hGraphExec, hNode, nodeParams);
-}
-
-CUresult CUDAAPI cuGraphExecMemcpyNodeSetParams(CUgraphExec hGraphExec,
-                                                CUgraphNode hNode,
-                                                const CUDA_MEMCPY3D *copyParams,
-                                                CUcontext ctx) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraphExec, CUgraphNode,
-                                      const CUDA_MEMCPY3D *, CUcontext);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphExecMemcpyNodeSetParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hGraphExec, hNode, copyParams, ctx);
-}
-
-CUresult CUDAAPI cuGraphExecMemsetNodeSetParams(
-    CUgraphExec hGraphExec, CUgraphNode hNode,
-    const CUDA_MEMSET_NODE_PARAMS *memsetParams, CUcontext ctx) {
-  using FuncPtr = CUresult(CUDAAPI *)(
-      CUgraphExec, CUgraphNode, const CUDA_MEMSET_NODE_PARAMS *, CUcontext);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphExecMemsetNodeSetParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hGraphExec, hNode, memsetParams, ctx);
-}
-
-CUresult CUDAAPI
-cuGraphExecHostNodeSetParams(CUgraphExec hGraphExec, CUgraphNode hNode,
-                             const CUDA_HOST_NODE_PARAMS *nodeParams) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraphExec, CUgraphNode,
-                                      const CUDA_HOST_NODE_PARAMS *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphExecHostNodeSetParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hGraphExec, hNode, nodeParams);
-}
-
-CUresult CUDAAPI cuGraphExecChildGraphNodeSetParams(CUgraphExec hGraphExec,
-                                                    CUgraphNode hNode,
-                                                    CUgraph childGraph) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraphExec, CUgraphNode, CUgraph);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cuGraphExecChildGraphNodeSetParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hGraphExec, hNode, childGraph);
-}
-
-CUresult CUDAAPI cuGraphExecEventRecordNodeSetEvent(CUgraphExec hGraphExec,
-                                                    CUgraphNode hNode,
-                                                    CUevent event) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraphExec, CUgraphNode, CUevent);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cuGraphExecEventRecordNodeSetEvent");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hGraphExec, hNode, event);
-}
-
-CUresult CUDAAPI cuGraphExecEventWaitNodeSetEvent(CUgraphExec hGraphExec,
-                                                  CUgraphNode hNode,
-                                                  CUevent event) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraphExec, CUgraphNode, CUevent);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cuGraphExecEventWaitNodeSetEvent");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hGraphExec, hNode, event);
-}
-
-CUresult CUDAAPI cuGraphExecExternalSemaphoresSignalNodeSetParams(
-    CUgraphExec hGraphExec, CUgraphNode hNode,
-    const CUDA_EXT_SEM_SIGNAL_NODE_PARAMS *nodeParams) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraphExec, CUgraphNode,
-                                      const CUDA_EXT_SEM_SIGNAL_NODE_PARAMS *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cuGraphExecExternalSemaphoresSignalNodeSetParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hGraphExec, hNode, nodeParams);
-}
-
-CUresult CUDAAPI cuGraphExecExternalSemaphoresWaitNodeSetParams(
-    CUgraphExec hGraphExec, CUgraphNode hNode,
-    const CUDA_EXT_SEM_WAIT_NODE_PARAMS *nodeParams) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraphExec, CUgraphNode,
-                                      const CUDA_EXT_SEM_WAIT_NODE_PARAMS *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cuGraphExecExternalSemaphoresWaitNodeSetParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hGraphExec, hNode, nodeParams);
-}
-
-CUresult CUDAAPI cuGraphUpload(CUgraphExec hGraphExec, CUstream hStream) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraphExec, CUstream);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphUpload");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hGraphExec, hStream);
-}
-
-CUresult CUDAAPI cuGraphLaunch(CUgraphExec hGraphExec, CUstream hStream) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraphExec, CUstream);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphLaunch");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hGraphExec, hStream);
-}
-
-CUresult CUDAAPI cuGraphExecDestroy(CUgraphExec hGraphExec) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraphExec);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphExecDestroy");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hGraphExec);
-}
-
-CUresult CUDAAPI cuGraphDestroy(CUgraph hGraph) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraph);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphDestroy");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hGraph);
-}
-
-CUresult CUDAAPI cuGraphExecUpdate(CUgraphExec hGraphExec, CUgraph hGraph,
-                                   CUgraphNode *hErrorNode_out,
-                                   CUgraphExecUpdateResult *updateResult_out) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraphExec, CUgraph, CUgraphNode *,
-                                      CUgraphExecUpdateResult *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphExecUpdate");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hGraphExec, hGraph, hErrorNode_out, updateResult_out);
-}
-
-CUresult CUDAAPI cuGraphKernelNodeCopyAttributes(CUgraphNode dst,
-                                                 CUgraphNode src) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraphNode, CUgraphNode);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphKernelNodeCopyAttributes");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dst, src);
-}
-
-CUresult CUDAAPI
-cuGraphKernelNodeGetAttribute(CUgraphNode hNode, CUkernelNodeAttrID attr,
-                              CUkernelNodeAttrValue *value_out) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraphNode, CUkernelNodeAttrID,
-                                      CUkernelNodeAttrValue *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphKernelNodeGetAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hNode, attr, value_out);
-}
-
-CUresult CUDAAPI
-cuGraphKernelNodeSetAttribute(CUgraphNode hNode, CUkernelNodeAttrID attr,
-                              const CUkernelNodeAttrValue *value) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraphNode, CUkernelNodeAttrID,
-                                      const CUkernelNodeAttrValue *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphKernelNodeSetAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hNode, attr, value);
-}
-
-CUresult CUDAAPI cuOccupancyMaxActiveBlocksPerMultiprocessor(
-    int *numBlocks, CUfunction func, int blockSize, size_t dynamicSMemSize) {
-  using FuncPtr = CUresult(CUDAAPI *)(int *, CUfunction, int, size_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cuOccupancyMaxActiveBlocksPerMultiprocessor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(numBlocks, func, blockSize, dynamicSMemSize);
-}
-
-CUresult CUDAAPI cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(
-    int *numBlocks, CUfunction func, int blockSize, size_t dynamicSMemSize,
-    unsigned int flags) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(int *, CUfunction, int, size_t, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>(
-      "cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(numBlocks, func, blockSize, dynamicSMemSize, flags);
-}
-
-CUresult CUDAAPI cuOccupancyMaxPotentialBlockSize(
-    int *minGridSize, int *blockSize, CUfunction func,
-    CUoccupancyB2DSize blockSizeToDynamicSMemSize, size_t dynamicSMemSize,
-    int blockSizeLimit) {
-  using FuncPtr = CUresult(CUDAAPI *)(int *, int *, CUfunction,
-                                      CUoccupancyB2DSize, size_t, int);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cuOccupancyMaxPotentialBlockSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(minGridSize, blockSize, func, blockSizeToDynamicSMemSize,
-                  dynamicSMemSize, blockSizeLimit);
-}
-
-CUresult CUDAAPI cuOccupancyMaxPotentialBlockSizeWithFlags(
-    int *minGridSize, int *blockSize, CUfunction func,
-    CUoccupancyB2DSize blockSizeToDynamicSMemSize, size_t dynamicSMemSize,
-    int blockSizeLimit, unsigned int flags) {
-  using FuncPtr = CUresult(CUDAAPI *)(
-      int *, int *, CUfunction, CUoccupancyB2DSize, size_t, int, unsigned int);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cuOccupancyMaxPotentialBlockSizeWithFlags");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(minGridSize, blockSize, func, blockSizeToDynamicSMemSize,
-                  dynamicSMemSize, blockSizeLimit, flags);
-}
-
-CUresult CUDAAPI cuOccupancyAvailableDynamicSMemPerBlock(
-    size_t *dynamicSmemSize, CUfunction func, int numBlocks, int blockSize) {
-  using FuncPtr = CUresult(CUDAAPI *)(size_t *, CUfunction, int, int);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cuOccupancyAvailableDynamicSMemPerBlock");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dynamicSmemSize, func, numBlocks, blockSize);
-}
-
-__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetArray(CUtexref hTexRef,
-                                                    CUarray hArray,
-                                                    unsigned int Flags) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUtexref, CUarray, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexRefSetArray");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hTexRef, hArray, Flags);
-}
-
-__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetMipmappedArray(
-    CUtexref hTexRef, CUmipmappedArray hMipmappedArray, unsigned int Flags) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUtexref, CUmipmappedArray, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexRefSetMipmappedArray");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hTexRef, hMipmappedArray, Flags);
-}
-
-__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetAddress(size_t *ByteOffset,
-                                                      CUtexref hTexRef,
-                                                      CUdeviceptr dptr,
-                                                      size_t bytes) {
-  using FuncPtr = CUresult(CUDAAPI *)(size_t *, CUtexref, CUdeviceptr, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexRefSetAddress_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(ByteOffset, hTexRef, dptr, bytes);
-}
-
-__CUDA_DEPRECATED CUresult CUDAAPI
-cuTexRefSetAddress2D(CUtexref hTexRef, const CUDA_ARRAY_DESCRIPTOR *desc,
-                     CUdeviceptr dptr, size_t Pitch) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUtexref, const CUDA_ARRAY_DESCRIPTOR *,
-                                      CUdeviceptr, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexRefSetAddress2D_v3");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hTexRef, desc, dptr, Pitch);
-}
-
-__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetFormat(CUtexref hTexRef,
-                                                     CUarray_format fmt,
-                                                     int NumPackedComponents) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUtexref, CUarray_format, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexRefSetFormat");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hTexRef, fmt, NumPackedComponents);
-}
-
-__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetAddressMode(CUtexref hTexRef,
-                                                          int dim,
-                                                          CUaddress_mode am) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUtexref, int, CUaddress_mode);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexRefSetAddressMode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hTexRef, dim, am);
-}
-
-__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetFilterMode(CUtexref hTexRef,
-                                                         CUfilter_mode fm) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUtexref, CUfilter_mode);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexRefSetFilterMode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hTexRef, fm);
-}
-
-__CUDA_DEPRECATED CUresult CUDAAPI
-cuTexRefSetMipmapFilterMode(CUtexref hTexRef, CUfilter_mode fm) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUtexref, CUfilter_mode);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexRefSetMipmapFilterMode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hTexRef, fm);
-}
-
-__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetMipmapLevelBias(CUtexref hTexRef,
-                                                              float bias) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUtexref, float);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexRefSetMipmapLevelBias");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hTexRef, bias);
-}
-
-__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetMipmapLevelClamp(
-    CUtexref hTexRef, float minMipmapLevelClamp, float maxMipmapLevelClamp) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUtexref, float, float);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexRefSetMipmapLevelClamp");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hTexRef, minMipmapLevelClamp, maxMipmapLevelClamp);
-}
-
-__CUDA_DEPRECATED CUresult CUDAAPI
-cuTexRefSetMaxAnisotropy(CUtexref hTexRef, unsigned int maxAniso) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUtexref, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexRefSetMaxAnisotropy");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hTexRef, maxAniso);
-}
-
-__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetBorderColor(CUtexref hTexRef,
-                                                          float *pBorderColor) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUtexref, float *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexRefSetBorderColor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hTexRef, pBorderColor);
-}
-
-__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetFlags(CUtexref hTexRef,
-                                                    unsigned int Flags) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUtexref, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexRefSetFlags");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hTexRef, Flags);
-}
-
-__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetAddress(CUdeviceptr *pdptr,
-                                                      CUtexref hTexRef) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdeviceptr *, CUtexref);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexRefGetAddress_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pdptr, hTexRef);
-}
-
-__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetArray(CUarray *phArray,
-                                                    CUtexref hTexRef) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUarray *, CUtexref);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexRefGetArray");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(phArray, hTexRef);
-}
-
-__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetMipmappedArray(
-    CUmipmappedArray *phMipmappedArray, CUtexref hTexRef) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUmipmappedArray *, CUtexref);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexRefGetMipmappedArray");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(phMipmappedArray, hTexRef);
-}
-
-__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetAddressMode(CUaddress_mode *pam,
-                                                          CUtexref hTexRef,
-                                                          int dim) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUaddress_mode *, CUtexref, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexRefGetAddressMode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pam, hTexRef, dim);
-}
-
-__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetFilterMode(CUfilter_mode *pfm,
-                                                         CUtexref hTexRef) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUfilter_mode *, CUtexref);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexRefGetFilterMode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pfm, hTexRef);
-}
-
-__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetFormat(CUarray_format *pFormat,
-                                                     int *pNumChannels,
-                                                     CUtexref hTexRef) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUarray_format *, int *, CUtexref);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexRefGetFormat");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pFormat, pNumChannels, hTexRef);
-}
-
-__CUDA_DEPRECATED CUresult CUDAAPI
-cuTexRefGetMipmapFilterMode(CUfilter_mode *pfm, CUtexref hTexRef) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUfilter_mode *, CUtexref);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexRefGetMipmapFilterMode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pfm, hTexRef);
-}
-
-__CUDA_DEPRECATED CUresult CUDAAPI
-cuTexRefGetMipmapLevelBias(float *pbias, CUtexref hTexRef) {
-  using FuncPtr = CUresult(CUDAAPI *)(float *, CUtexref);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexRefGetMipmapLevelBias");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pbias, hTexRef);
-}
-
-__CUDA_DEPRECATED CUresult CUDAAPI
-cuTexRefGetMipmapLevelClamp(float *pminMipmapLevelClamp,
-                            float *pmaxMipmapLevelClamp, CUtexref hTexRef) {
-  using FuncPtr = CUresult(CUDAAPI *)(float *, float *, CUtexref);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexRefGetMipmapLevelClamp");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pminMipmapLevelClamp, pmaxMipmapLevelClamp, hTexRef);
-}
-
-__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetMaxAnisotropy(int *pmaxAniso,
-                                                            CUtexref hTexRef) {
-  using FuncPtr = CUresult(CUDAAPI *)(int *, CUtexref);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexRefGetMaxAnisotropy");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pmaxAniso, hTexRef);
-}
-
-__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetBorderColor(float *pBorderColor,
-                                                          CUtexref hTexRef) {
-  using FuncPtr = CUresult(CUDAAPI *)(float *, CUtexref);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexRefGetBorderColor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pBorderColor, hTexRef);
-}
-
-__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetFlags(unsigned int *pFlags,
-                                                    CUtexref hTexRef) {
-  using FuncPtr = CUresult(CUDAAPI *)(unsigned int *, CUtexref);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexRefGetFlags");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pFlags, hTexRef);
-}
-
-__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefCreate(CUtexref *pTexRef) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUtexref *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexRefCreate");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pTexRef);
-}
-
-__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefDestroy(CUtexref hTexRef) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUtexref);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexRefDestroy");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hTexRef);
-}
-
-__CUDA_DEPRECATED CUresult CUDAAPI cuSurfRefSetArray(CUsurfref hSurfRef,
-                                                     CUarray hArray,
-                                                     unsigned int Flags) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUsurfref, CUarray, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuSurfRefSetArray");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hSurfRef, hArray, Flags);
-}
-
-__CUDA_DEPRECATED CUresult CUDAAPI cuSurfRefGetArray(CUarray *phArray,
-                                                     CUsurfref hSurfRef) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUarray *, CUsurfref);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuSurfRefGetArray");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(phArray, hSurfRef);
-}
-
-CUresult CUDAAPI
-cuTexObjectCreate(CUtexObject *pTexObject, const CUDA_RESOURCE_DESC *pResDesc,
-                  const CUDA_TEXTURE_DESC *pTexDesc,
-                  const CUDA_RESOURCE_VIEW_DESC *pResViewDesc) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUtexObject *, const CUDA_RESOURCE_DESC *,
-                                      const CUDA_TEXTURE_DESC *,
-                                      const CUDA_RESOURCE_VIEW_DESC *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexObjectCreate");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pTexObject, pResDesc, pTexDesc, pResViewDesc);
-}
-
-CUresult CUDAAPI cuTexObjectDestroy(CUtexObject texObject) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUtexObject);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexObjectDestroy");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(texObject);
-}
-
-CUresult CUDAAPI cuTexObjectGetResourceDesc(CUDA_RESOURCE_DESC *pResDesc,
-                                            CUtexObject texObject) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUDA_RESOURCE_DESC *, CUtexObject);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexObjectGetResourceDesc");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pResDesc, texObject);
-}
-
-CUresult CUDAAPI cuTexObjectGetTextureDesc(CUDA_TEXTURE_DESC *pTexDesc,
-                                           CUtexObject texObject) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUDA_TEXTURE_DESC *, CUtexObject);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexObjectGetTextureDesc");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pTexDesc, texObject);
-}
-
-CUresult CUDAAPI cuTexObjectGetResourceViewDesc(
-    CUDA_RESOURCE_VIEW_DESC *pResViewDesc, CUtexObject texObject) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUDA_RESOURCE_VIEW_DESC *, CUtexObject);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexObjectGetResourceViewDesc");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pResViewDesc, texObject);
-}
-
-CUresult CUDAAPI cuSurfObjectCreate(CUsurfObject *pSurfObject,
-                                    const CUDA_RESOURCE_DESC *pResDesc) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUsurfObject *, const CUDA_RESOURCE_DESC *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuSurfObjectCreate");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pSurfObject, pResDesc);
-}
-
-CUresult CUDAAPI cuSurfObjectDestroy(CUsurfObject surfObject) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUsurfObject);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuSurfObjectDestroy");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(surfObject);
-}
-
-CUresult CUDAAPI cuSurfObjectGetResourceDesc(CUDA_RESOURCE_DESC *pResDesc,
-                                             CUsurfObject surfObject) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUDA_RESOURCE_DESC *, CUsurfObject);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuSurfObjectGetResourceDesc");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pResDesc, surfObject);
-}
-
-CUresult CUDAAPI cuDeviceCanAccessPeer(int *canAccessPeer, CUdevice dev,
-                                       CUdevice peerDev) {
-  using FuncPtr = CUresult(CUDAAPI *)(int *, CUdevice, CUdevice);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuDeviceCanAccessPeer");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(canAccessPeer, dev, peerDev);
-}
-
-CUresult CUDAAPI cuCtxEnablePeerAccess(CUcontext peerContext,
-                                       unsigned int Flags) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUcontext, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuCtxEnablePeerAccess");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(peerContext, Flags);
-}
-
-CUresult CUDAAPI cuCtxDisablePeerAccess(CUcontext peerContext) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUcontext);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuCtxDisablePeerAccess");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(peerContext);
-}
-
-CUresult CUDAAPI cuDeviceGetP2PAttribute(int *value,
-                                         CUdevice_P2PAttribute attrib,
-                                         CUdevice srcDevice,
-                                         CUdevice dstDevice) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(int *, CUdevice_P2PAttribute, CUdevice, CUdevice);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuDeviceGetP2PAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(value, attrib, srcDevice, dstDevice);
-}
-
-CUresult CUDAAPI cuGraphicsUnregisterResource(CUgraphicsResource resource) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraphicsResource);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphicsUnregisterResource");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(resource);
-}
-
-CUresult CUDAAPI cuGraphicsSubResourceGetMappedArray(
-    CUarray *pArray, CUgraphicsResource resource, unsigned int arrayIndex,
-    unsigned int mipLevel) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUarray *, CUgraphicsResource,
-                                      unsigned int, unsigned int);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cuGraphicsSubResourceGetMappedArray");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pArray, resource, arrayIndex, mipLevel);
-}
-
-CUresult CUDAAPI cuGraphicsResourceGetMappedMipmappedArray(
-    CUmipmappedArray *pMipmappedArray, CUgraphicsResource resource) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUmipmappedArray *, CUgraphicsResource);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cuGraphicsResourceGetMappedMipmappedArray");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pMipmappedArray, resource);
-}
-
-CUresult CUDAAPI cuGraphicsResourceGetMappedPointer(
-    CUdeviceptr *pDevPtr, size_t *pSize, CUgraphicsResource resource) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUdeviceptr *, size_t *, CUgraphicsResource);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cuGraphicsResourceGetMappedPointer_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pDevPtr, pSize, resource);
-}
-
-CUresult CUDAAPI cuGraphicsResourceSetMapFlags(CUgraphicsResource resource,
-                                               unsigned int flags) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraphicsResource, unsigned int);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cuGraphicsResourceSetMapFlags_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(resource, flags);
-}
-
-CUresult CUDAAPI cuGraphicsMapResources(unsigned int count,
-                                        CUgraphicsResource *resources,
-                                        CUstream hStream) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(unsigned int, CUgraphicsResource *, CUstream);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphicsMapResources");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(count, resources, hStream);
-}
-
-CUresult CUDAAPI cuGraphicsUnmapResources(unsigned int count,
-                                          CUgraphicsResource *resources,
-                                          CUstream hStream) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(unsigned int, CUgraphicsResource *, CUstream);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphicsUnmapResources");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(count, resources, hStream);
-}
-
-CUresult CUDAAPI cuGetExportTable(const void **ppExportTable,
-                                  const CUuuid *pExportTableId) {
-  using FuncPtr = CUresult(CUDAAPI *)(const void **, const CUuuid *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGetExportTable");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(ppExportTable, pExportTableId);
-}
-
-CUresult CUDAAPI cuFuncGetModule(CUmodule *hmod, CUfunction hfunc) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUmodule *, CUfunction);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuFuncGetModule");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hmod, hfunc);
-}
-
-}  // extern "C"
diff --git a/third_party/xla/third_party/tsl/tsl/cuda/cuda_12_0.inc b/third_party/xla/third_party/tsl/tsl/cuda/cuda_12_0.inc
deleted file mode 100644
index 7ec62ecb7c19b0..00000000000000
--- a/third_party/xla/third_party/tsl/tsl/cuda/cuda_12_0.inc
+++ /dev/null
@@ -1,3323 +0,0 @@
-// Auto-generated, do not edit.
-
-extern "C" {
-CUresult CUDAAPI cuGetErrorString(CUresult error, const char **pStr) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUresult, const char **);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGetErrorString");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(error, pStr);
-}
-
-CUresult CUDAAPI cuGetErrorName(CUresult error, const char **pStr) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUresult, const char **);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGetErrorName");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(error, pStr);
-}
-
-CUresult CUDAAPI cuInit(unsigned int Flags) {
-  using FuncPtr = CUresult(CUDAAPI *)(unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuInit");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(Flags);
-}
-
-CUresult CUDAAPI cuDriverGetVersion(int *driverVersion) {
-  using FuncPtr = CUresult(CUDAAPI *)(int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuDriverGetVersion");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(driverVersion);
-}
-
-CUresult CUDAAPI cuDeviceGet(CUdevice *device, int ordinal) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdevice *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuDeviceGet");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(device, ordinal);
-}
-
-CUresult CUDAAPI cuDeviceGetCount(int *count) {
-  using FuncPtr = CUresult(CUDAAPI *)(int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuDeviceGetCount");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(count);
-}
-
-CUresult CUDAAPI cuDeviceGetName(char *name, int len, CUdevice dev) {
-  using FuncPtr = CUresult(CUDAAPI *)(char *, int, CUdevice);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuDeviceGetName");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(name, len, dev);
-}
-
-CUresult CUDAAPI cuDeviceGetUuid(CUuuid *uuid, CUdevice dev) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUuuid *, CUdevice);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuDeviceGetUuid");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(uuid, dev);
-}
-
-CUresult CUDAAPI cuDeviceGetUuid_v2(CUuuid *uuid, CUdevice dev) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUuuid *, CUdevice);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuDeviceGetUuid_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(uuid, dev);
-}
-
-CUresult CUDAAPI cuDeviceGetLuid(char *luid, unsigned int *deviceNodeMask,
-                                 CUdevice dev) {
-  using FuncPtr = CUresult(CUDAAPI *)(char *, unsigned int *, CUdevice);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuDeviceGetLuid");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(luid, deviceNodeMask, dev);
-}
-
-CUresult CUDAAPI cuDeviceTotalMem(size_t *bytes, CUdevice dev) {
-  using FuncPtr = CUresult(CUDAAPI *)(size_t *, CUdevice);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuDeviceTotalMem_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(bytes, dev);
-}
-
-CUresult CUDAAPI cuDeviceGetTexture1DLinearMaxWidth(size_t *maxWidthInElements,
-                                                    CUarray_format format,
-                                                    unsigned numChannels,
-                                                    CUdevice dev) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(size_t *, CUarray_format, unsigned int, CUdevice);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cuDeviceGetTexture1DLinearMaxWidth");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(maxWidthInElements, format, numChannels, dev);
-}
-
-CUresult CUDAAPI cuDeviceGetAttribute(int *pi, CUdevice_attribute attrib,
-                                      CUdevice dev) {
-  using FuncPtr = CUresult(CUDAAPI *)(int *, CUdevice_attribute, CUdevice);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuDeviceGetAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pi, attrib, dev);
-}
-
-CUresult CUDAAPI cuDeviceGetNvSciSyncAttributes(void *nvSciSyncAttrList,
-                                                CUdevice dev, int flags) {
-  using FuncPtr = CUresult(CUDAAPI *)(void *, CUdevice, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuDeviceGetNvSciSyncAttributes");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(nvSciSyncAttrList, dev, flags);
-}
-
-CUresult CUDAAPI cuDeviceSetMemPool(CUdevice dev, CUmemoryPool pool) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdevice, CUmemoryPool);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuDeviceSetMemPool");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dev, pool);
-}
-
-CUresult CUDAAPI cuDeviceGetMemPool(CUmemoryPool *pool, CUdevice dev) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUmemoryPool *, CUdevice);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuDeviceGetMemPool");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pool, dev);
-}
-
-CUresult CUDAAPI cuDeviceGetDefaultMemPool(CUmemoryPool *pool_out,
-                                           CUdevice dev) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUmemoryPool *, CUdevice);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuDeviceGetDefaultMemPool");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pool_out, dev);
-}
-
-CUresult CUDAAPI cuDeviceGetExecAffinitySupport(int *pi,
-                                                CUexecAffinityType type,
-                                                CUdevice dev) {
-  using FuncPtr = CUresult(CUDAAPI *)(int *, CUexecAffinityType, CUdevice);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuDeviceGetExecAffinitySupport");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pi, type, dev);
-}
-
-CUresult CUDAAPI
-cuFlushGPUDirectRDMAWrites(CUflushGPUDirectRDMAWritesTarget target,
-                           CUflushGPUDirectRDMAWritesScope scope) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUflushGPUDirectRDMAWritesTarget,
-                                      CUflushGPUDirectRDMAWritesScope);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuFlushGPUDirectRDMAWrites");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(target, scope);
-}
-
-__CUDA_DEPRECATED CUresult CUDAAPI cuDeviceGetProperties(CUdevprop *prop,
-                                                         CUdevice dev) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdevprop *, CUdevice);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuDeviceGetProperties");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(prop, dev);
-}
-
-__CUDA_DEPRECATED CUresult CUDAAPI cuDeviceComputeCapability(int *major,
-                                                             int *minor,
-                                                             CUdevice dev) {
-  using FuncPtr = CUresult(CUDAAPI *)(int *, int *, CUdevice);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuDeviceComputeCapability");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(major, minor, dev);
-}
-
-CUresult CUDAAPI cuDevicePrimaryCtxRetain(CUcontext *pctx, CUdevice dev) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUcontext *, CUdevice);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuDevicePrimaryCtxRetain");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pctx, dev);
-}
-
-CUresult CUDAAPI cuDevicePrimaryCtxRelease(CUdevice dev) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdevice);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuDevicePrimaryCtxRelease_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dev);
-}
-
-CUresult CUDAAPI cuDevicePrimaryCtxSetFlags(CUdevice dev, unsigned int flags) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdevice, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuDevicePrimaryCtxSetFlags_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dev, flags);
-}
-
-CUresult CUDAAPI cuDevicePrimaryCtxGetState(CUdevice dev, unsigned int *flags,
-                                            int *active) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdevice, unsigned int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuDevicePrimaryCtxGetState");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dev, flags, active);
-}
-
-CUresult CUDAAPI cuDevicePrimaryCtxReset(CUdevice dev) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdevice);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuDevicePrimaryCtxReset_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dev);
-}
-
-CUresult CUDAAPI cuCtxCreate(CUcontext *pctx, unsigned int flags,
-                             CUdevice dev) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUcontext *, unsigned int, CUdevice);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuCtxCreate_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pctx, flags, dev);
-}
-
-CUresult CUDAAPI cuCtxCreate_v3(CUcontext *pctx,
-                                CUexecAffinityParam *paramsArray, int numParams,
-                                unsigned int flags, CUdevice dev) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUcontext *, CUexecAffinityParam *, int,
-                                      unsigned int, CUdevice);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuCtxCreate_v3");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pctx, paramsArray, numParams, flags, dev);
-}
-
-CUresult CUDAAPI cuCtxDestroy(CUcontext ctx) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUcontext);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuCtxDestroy_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(ctx);
-}
-
-CUresult CUDAAPI cuCtxPushCurrent(CUcontext ctx) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUcontext);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuCtxPushCurrent_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(ctx);
-}
-
-CUresult CUDAAPI cuCtxPopCurrent(CUcontext *pctx) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUcontext *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuCtxPopCurrent_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pctx);
-}
-
-CUresult CUDAAPI cuCtxSetCurrent(CUcontext ctx) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUcontext);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuCtxSetCurrent");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(ctx);
-}
-
-CUresult CUDAAPI cuCtxGetCurrent(CUcontext *pctx) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUcontext *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuCtxGetCurrent");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pctx);
-}
-
-CUresult CUDAAPI cuCtxGetDevice(CUdevice *device) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdevice *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuCtxGetDevice");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(device);
-}
-
-CUresult CUDAAPI cuCtxGetFlags(unsigned int *flags) {
-  using FuncPtr = CUresult(CUDAAPI *)(unsigned int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuCtxGetFlags");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(flags);
-}
-
-CUresult CUDAAPI cuCtxGetId(CUcontext ctx, unsigned long long *ctxId) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUcontext, unsigned long long *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuCtxGetId");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(ctx, ctxId);
-}
-
-CUresult CUDAAPI cuCtxSynchronize(void) {
-  using FuncPtr = CUresult(CUDAAPI *)();
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuCtxSynchronize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr();
-}
-
-CUresult CUDAAPI cuCtxSetLimit(CUlimit limit, size_t value) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUlimit, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuCtxSetLimit");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(limit, value);
-}
-
-CUresult CUDAAPI cuCtxGetLimit(size_t *pvalue, CUlimit limit) {
-  using FuncPtr = CUresult(CUDAAPI *)(size_t *, CUlimit);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuCtxGetLimit");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pvalue, limit);
-}
-
-CUresult CUDAAPI cuCtxGetCacheConfig(CUfunc_cache *pconfig) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUfunc_cache *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuCtxGetCacheConfig");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pconfig);
-}
-
-CUresult CUDAAPI cuCtxSetCacheConfig(CUfunc_cache config) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUfunc_cache);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuCtxSetCacheConfig");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(config);
-}
-
-CUresult CUDAAPI cuCtxGetSharedMemConfig(CUsharedconfig *pConfig) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUsharedconfig *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuCtxGetSharedMemConfig");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pConfig);
-}
-
-CUresult CUDAAPI cuCtxSetSharedMemConfig(CUsharedconfig config) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUsharedconfig);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuCtxSetSharedMemConfig");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(config);
-}
-
-CUresult CUDAAPI cuCtxGetApiVersion(CUcontext ctx, unsigned int *version) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUcontext, unsigned int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuCtxGetApiVersion");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(ctx, version);
-}
-
-CUresult CUDAAPI cuCtxGetStreamPriorityRange(int *leastPriority,
-                                             int *greatestPriority) {
-  using FuncPtr = CUresult(CUDAAPI *)(int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuCtxGetStreamPriorityRange");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(leastPriority, greatestPriority);
-}
-
-CUresult CUDAAPI cuCtxResetPersistingL2Cache(void) {
-  using FuncPtr = CUresult(CUDAAPI *)();
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuCtxResetPersistingL2Cache");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr();
-}
-
-CUresult CUDAAPI cuCtxGetExecAffinity(CUexecAffinityParam *pExecAffinity,
-                                      CUexecAffinityType type) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUexecAffinityParam *, CUexecAffinityType);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuCtxGetExecAffinity");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pExecAffinity, type);
-}
-
-__CUDA_DEPRECATED CUresult CUDAAPI cuCtxAttach(CUcontext *pctx,
-                                               unsigned int flags) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUcontext *, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuCtxAttach");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pctx, flags);
-}
-
-__CUDA_DEPRECATED CUresult CUDAAPI cuCtxDetach(CUcontext ctx) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUcontext);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuCtxDetach");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(ctx);
-}
-
-CUresult CUDAAPI cuModuleLoad(CUmodule *module, const char *fname) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUmodule *, const char *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuModuleLoad");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(module, fname);
-}
-
-CUresult CUDAAPI cuModuleLoadData(CUmodule *module, const void *image) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUmodule *, const void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuModuleLoadData");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(module, image);
-}
-
-CUresult CUDAAPI cuModuleLoadDataEx(CUmodule *module, const void *image,
-                                    unsigned int numOptions,
-                                    CUjit_option *options,
-                                    void **optionValues) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUmodule *, const void *, unsigned int,
-                                      CUjit_option *, void **);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuModuleLoadDataEx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(module, image, numOptions, options, optionValues);
-}
-
-CUresult CUDAAPI cuModuleLoadFatBinary(CUmodule *module, const void *fatCubin) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUmodule *, const void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuModuleLoadFatBinary");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(module, fatCubin);
-}
-
-CUresult CUDAAPI cuModuleUnload(CUmodule hmod) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUmodule);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuModuleUnload");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hmod);
-}
-
-CUresult CUDAAPI cuModuleGetLoadingMode(CUmoduleLoadingMode *mode) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUmoduleLoadingMode *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuModuleGetLoadingMode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(mode);
-}
-
-CUresult CUDAAPI cuModuleGetFunction(CUfunction *hfunc, CUmodule hmod,
-                                     const char *name) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUfunction *, CUmodule, const char *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuModuleGetFunction");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hfunc, hmod, name);
-}
-
-CUresult CUDAAPI cuModuleGetGlobal(CUdeviceptr *dptr, size_t *bytes,
-                                   CUmodule hmod, const char *name) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUdeviceptr *, size_t *, CUmodule, const char *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuModuleGetGlobal_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dptr, bytes, hmod, name);
-}
-
-CUresult CUDAAPI cuLinkCreate(unsigned int numOptions, CUjit_option *options,
-                              void **optionValues, CUlinkState *stateOut) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(unsigned int, CUjit_option *, void **, CUlinkState *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuLinkCreate_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(numOptions, options, optionValues, stateOut);
-}
-
-CUresult CUDAAPI cuLinkAddData(CUlinkState state, CUjitInputType type,
-                               void *data, size_t size, const char *name,
-                               unsigned int numOptions, CUjit_option *options,
-                               void **optionValues) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUlinkState, CUjitInputType, void *, size_t,
-                          const char *, unsigned int, CUjit_option *, void **);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuLinkAddData_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(state, type, data, size, name, numOptions, options,
-                  optionValues);
-}
-
-CUresult CUDAAPI cuLinkAddFile(CUlinkState state, CUjitInputType type,
-                               const char *path, unsigned int numOptions,
-                               CUjit_option *options, void **optionValues) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUlinkState, CUjitInputType, const char *,
-                                      unsigned int, CUjit_option *, void **);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuLinkAddFile_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(state, type, path, numOptions, options, optionValues);
-}
-
-CUresult CUDAAPI cuLinkComplete(CUlinkState state, void **cubinOut,
-                                size_t *sizeOut) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUlinkState, void **, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuLinkComplete");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(state, cubinOut, sizeOut);
-}
-
-CUresult CUDAAPI cuLinkDestroy(CUlinkState state) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUlinkState);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuLinkDestroy");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(state);
-}
-
-__CUDA_DEPRECATED CUresult CUDAAPI cuModuleGetTexRef(CUtexref *pTexRef,
-                                                     CUmodule hmod,
-                                                     const char *name) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUtexref *, CUmodule, const char *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuModuleGetTexRef");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pTexRef, hmod, name);
-}
-
-__CUDA_DEPRECATED CUresult CUDAAPI cuModuleGetSurfRef(CUsurfref *pSurfRef,
-                                                      CUmodule hmod,
-                                                      const char *name) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUsurfref *, CUmodule, const char *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuModuleGetSurfRef");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pSurfRef, hmod, name);
-}
-
-CUresult CUDAAPI cuLibraryLoadData(CUlibrary *library, const void *code,
-                                   CUjit_option *jitOptions,
-                                   void **jitOptionsValues,
-                                   unsigned int numJitOptions,
-                                   CUlibraryOption *libraryOptions,
-                                   void **libraryOptionValues,
-                                   unsigned int numLibraryOptions) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUlibrary *, const void *, CUjit_option *,
-                                      void **, unsigned int, CUlibraryOption *,
-                                      void **, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuLibraryLoadData");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(library, code, jitOptions, jitOptionsValues, numJitOptions,
-                  libraryOptions, libraryOptionValues, numLibraryOptions);
-}
-
-CUresult CUDAAPI cuLibraryLoadFromFile(CUlibrary *library, const char *fileName,
-                                       CUjit_option *jitOptions,
-                                       void **jitOptionsValues,
-                                       unsigned int numJitOptions,
-                                       CUlibraryOption *libraryOptions,
-                                       void **libraryOptionValues,
-                                       unsigned int numLibraryOptions) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUlibrary *, const char *, CUjit_option *,
-                                      void **, unsigned int, CUlibraryOption *,
-                                      void **, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuLibraryLoadFromFile");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(library, fileName, jitOptions, jitOptionsValues,
-                  numJitOptions, libraryOptions, libraryOptionValues,
-                  numLibraryOptions);
-}
-
-CUresult CUDAAPI cuLibraryUnload(CUlibrary library) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUlibrary);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuLibraryUnload");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(library);
-}
-
-CUresult CUDAAPI cuLibraryGetKernel(CUkernel *pKernel, CUlibrary library,
-                                    const char *name) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUkernel *, CUlibrary, const char *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuLibraryGetKernel");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pKernel, library, name);
-}
-
-CUresult CUDAAPI cuLibraryGetModule(CUmodule *pMod, CUlibrary library) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUmodule *, CUlibrary);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuLibraryGetModule");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pMod, library);
-}
-
-CUresult CUDAAPI cuKernelGetFunction(CUfunction *pFunc, CUkernel kernel) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUfunction *, CUkernel);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuKernelGetFunction");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pFunc, kernel);
-}
-
-CUresult CUDAAPI cuLibraryGetGlobal(CUdeviceptr *dptr, size_t *bytes,
-                                    CUlibrary library, const char *name) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUdeviceptr *, size_t *, CUlibrary, const char *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuLibraryGetGlobal");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dptr, bytes, library, name);
-}
-
-CUresult CUDAAPI cuLibraryGetManaged(CUdeviceptr *dptr, size_t *bytes,
-                                     CUlibrary library, const char *name) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUdeviceptr *, size_t *, CUlibrary, const char *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuLibraryGetManaged");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dptr, bytes, library, name);
-}
-
-CUresult CUDAAPI cuLibraryGetUnifiedFunction(void **fptr, CUlibrary library,
-                                             const char *symbol) {
-  using FuncPtr = CUresult(CUDAAPI *)(void **, CUlibrary, const char *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuLibraryGetUnifiedFunction");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(fptr, library, symbol);
-}
-
-CUresult CUDAAPI cuKernelGetAttribute(int *pi, CUfunction_attribute attrib,
-                                      CUkernel kernel, CUdevice dev) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(int *, CUfunction_attribute, CUkernel, CUdevice);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuKernelGetAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pi, attrib, kernel, dev);
-}
-
-CUresult CUDAAPI cuKernelSetAttribute(CUfunction_attribute attrib, int val,
-                                      CUkernel kernel, CUdevice dev) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUfunction_attribute, int, CUkernel, CUdevice);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuKernelSetAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(attrib, val, kernel, dev);
-}
-
-CUresult CUDAAPI cuKernelSetCacheConfig(CUkernel kernel, CUfunc_cache config,
-                                        CUdevice dev) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUkernel, CUfunc_cache, CUdevice);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuKernelSetCacheConfig");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(kernel, config, dev);
-}
-
-CUresult CUDAAPI cuMemGetInfo(size_t *free, size_t *total) {
-  using FuncPtr = CUresult(CUDAAPI *)(size_t *, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemGetInfo_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(free, total);
-}
-
-CUresult CUDAAPI cuMemAlloc(CUdeviceptr *dptr, size_t bytesize) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdeviceptr *, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemAlloc_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dptr, bytesize);
-}
-
-CUresult CUDAAPI cuMemAllocPitch(CUdeviceptr *dptr, size_t *pPitch,
-                                 size_t WidthInBytes, size_t Height,
-                                 unsigned int ElementSizeBytes) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdeviceptr *, size_t *, size_t, size_t,
-                                      unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemAllocPitch_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dptr, pPitch, WidthInBytes, Height, ElementSizeBytes);
-}
-
-CUresult CUDAAPI cuMemFree(CUdeviceptr dptr) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdeviceptr);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemFree_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dptr);
-}
-
-CUresult CUDAAPI cuMemGetAddressRange(CUdeviceptr *pbase, size_t *psize,
-                                      CUdeviceptr dptr) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdeviceptr *, size_t *, CUdeviceptr);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemGetAddressRange_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pbase, psize, dptr);
-}
-
-CUresult CUDAAPI cuMemAllocHost(void **pp, size_t bytesize) {
-  using FuncPtr = CUresult(CUDAAPI *)(void **, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemAllocHost_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pp, bytesize);
-}
-
-CUresult CUDAAPI cuMemFreeHost(void *p) {
-  using FuncPtr = CUresult(CUDAAPI *)(void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemFreeHost");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(p);
-}
-
-CUresult CUDAAPI cuMemHostAlloc(void **pp, size_t bytesize,
-                                unsigned int Flags) {
-  using FuncPtr = CUresult(CUDAAPI *)(void **, size_t, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemHostAlloc");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pp, bytesize, Flags);
-}
-
-CUresult CUDAAPI cuMemHostGetDevicePointer(CUdeviceptr *pdptr, void *p,
-                                           unsigned int Flags) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdeviceptr *, void *, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemHostGetDevicePointer_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pdptr, p, Flags);
-}
-
-CUresult CUDAAPI cuMemHostGetFlags(unsigned int *pFlags, void *p) {
-  using FuncPtr = CUresult(CUDAAPI *)(unsigned int *, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemHostGetFlags");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pFlags, p);
-}
-
-CUresult CUDAAPI cuMemAllocManaged(CUdeviceptr *dptr, size_t bytesize,
-                                   unsigned int flags) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdeviceptr *, size_t, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemAllocManaged");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dptr, bytesize, flags);
-}
-
-CUresult CUDAAPI cuDeviceGetByPCIBusId(CUdevice *dev, const char *pciBusId) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdevice *, const char *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuDeviceGetByPCIBusId");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dev, pciBusId);
-}
-
-CUresult CUDAAPI cuDeviceGetPCIBusId(char *pciBusId, int len, CUdevice dev) {
-  using FuncPtr = CUresult(CUDAAPI *)(char *, int, CUdevice);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuDeviceGetPCIBusId");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pciBusId, len, dev);
-}
-
-CUresult CUDAAPI cuIpcGetEventHandle(CUipcEventHandle *pHandle, CUevent event) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUipcEventHandle *, CUevent);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuIpcGetEventHandle");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pHandle, event);
-}
-
-CUresult CUDAAPI cuIpcOpenEventHandle(CUevent *phEvent,
-                                      CUipcEventHandle handle) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUevent *, CUipcEventHandle);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuIpcOpenEventHandle");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(phEvent, handle);
-}
-
-CUresult CUDAAPI cuIpcGetMemHandle(CUipcMemHandle *pHandle, CUdeviceptr dptr) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUipcMemHandle *, CUdeviceptr);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuIpcGetMemHandle");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pHandle, dptr);
-}
-
-CUresult CUDAAPI cuIpcOpenMemHandle(CUdeviceptr *pdptr, CUipcMemHandle handle,
-                                    unsigned int Flags) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUdeviceptr *, CUipcMemHandle, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuIpcOpenMemHandle_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pdptr, handle, Flags);
-}
-
-CUresult CUDAAPI cuIpcCloseMemHandle(CUdeviceptr dptr) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdeviceptr);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuIpcCloseMemHandle");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dptr);
-}
-
-CUresult CUDAAPI cuMemHostRegister(void *p, size_t bytesize,
-                                   unsigned int Flags) {
-  using FuncPtr = CUresult(CUDAAPI *)(void *, size_t, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemHostRegister_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(p, bytesize, Flags);
-}
-
-CUresult CUDAAPI cuMemHostUnregister(void *p) {
-  using FuncPtr = CUresult(CUDAAPI *)(void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemHostUnregister");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(p);
-}
-
-CUresult CUDAAPI cuMemcpy(CUdeviceptr dst, CUdeviceptr src, size_t ByteCount) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdeviceptr, CUdeviceptr, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemcpy");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dst, src, ByteCount);
-}
-
-CUresult CUDAAPI cuMemcpyPeer(CUdeviceptr dstDevice, CUcontext dstContext,
-                              CUdeviceptr srcDevice, CUcontext srcContext,
-                              size_t ByteCount) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdeviceptr, CUcontext, CUdeviceptr,
-                                      CUcontext, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemcpyPeer");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dstDevice, dstContext, srcDevice, srcContext, ByteCount);
-}
-
-CUresult CUDAAPI cuMemcpyHtoD(CUdeviceptr dstDevice, const void *srcHost,
-                              size_t ByteCount) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdeviceptr, const void *, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemcpyHtoD_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dstDevice, srcHost, ByteCount);
-}
-
-CUresult CUDAAPI cuMemcpyDtoH(void *dstHost, CUdeviceptr srcDevice,
-                              size_t ByteCount) {
-  using FuncPtr = CUresult(CUDAAPI *)(void *, CUdeviceptr, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemcpyDtoH_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dstHost, srcDevice, ByteCount);
-}
-
-CUresult CUDAAPI cuMemcpyDtoD(CUdeviceptr dstDevice, CUdeviceptr srcDevice,
-                              size_t ByteCount) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdeviceptr, CUdeviceptr, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemcpyDtoD_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dstDevice, srcDevice, ByteCount);
-}
-
-CUresult CUDAAPI cuMemcpyDtoA(CUarray dstArray, size_t dstOffset,
-                              CUdeviceptr srcDevice, size_t ByteCount) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUarray, size_t, CUdeviceptr, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemcpyDtoA_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dstArray, dstOffset, srcDevice, ByteCount);
-}
-
-CUresult CUDAAPI cuMemcpyAtoD(CUdeviceptr dstDevice, CUarray srcArray,
-                              size_t srcOffset, size_t ByteCount) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdeviceptr, CUarray, size_t, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemcpyAtoD_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dstDevice, srcArray, srcOffset, ByteCount);
-}
-
-CUresult CUDAAPI cuMemcpyHtoA(CUarray dstArray, size_t dstOffset,
-                              const void *srcHost, size_t ByteCount) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUarray, size_t, const void *, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemcpyHtoA_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dstArray, dstOffset, srcHost, ByteCount);
-}
-
-CUresult CUDAAPI cuMemcpyAtoH(void *dstHost, CUarray srcArray, size_t srcOffset,
-                              size_t ByteCount) {
-  using FuncPtr = CUresult(CUDAAPI *)(void *, CUarray, size_t, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemcpyAtoH_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dstHost, srcArray, srcOffset, ByteCount);
-}
-
-CUresult CUDAAPI cuMemcpyAtoA(CUarray dstArray, size_t dstOffset,
-                              CUarray srcArray, size_t srcOffset,
-                              size_t ByteCount) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUarray, size_t, CUarray, size_t, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemcpyAtoA_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dstArray, dstOffset, srcArray, srcOffset, ByteCount);
-}
-
-CUresult CUDAAPI cuMemcpy2D(const CUDA_MEMCPY2D *pCopy) {
-  using FuncPtr = CUresult(CUDAAPI *)(const CUDA_MEMCPY2D *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemcpy2D_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pCopy);
-}
-
-CUresult CUDAAPI cuMemcpy2DUnaligned(const CUDA_MEMCPY2D *pCopy) {
-  using FuncPtr = CUresult(CUDAAPI *)(const CUDA_MEMCPY2D *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemcpy2DUnaligned_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pCopy);
-}
-
-CUresult CUDAAPI cuMemcpy3D(const CUDA_MEMCPY3D *pCopy) {
-  using FuncPtr = CUresult(CUDAAPI *)(const CUDA_MEMCPY3D *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemcpy3D_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pCopy);
-}
-
-CUresult CUDAAPI cuMemcpy3DPeer(const CUDA_MEMCPY3D_PEER *pCopy) {
-  using FuncPtr = CUresult(CUDAAPI *)(const CUDA_MEMCPY3D_PEER *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemcpy3DPeer");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pCopy);
-}
-
-CUresult CUDAAPI cuMemcpyAsync(CUdeviceptr dst, CUdeviceptr src,
-                               size_t ByteCount, CUstream hStream) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUdeviceptr, CUdeviceptr, size_t, CUstream);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemcpyAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dst, src, ByteCount, hStream);
-}
-
-CUresult CUDAAPI cuMemcpyPeerAsync(CUdeviceptr dstDevice, CUcontext dstContext,
-                                   CUdeviceptr srcDevice, CUcontext srcContext,
-                                   size_t ByteCount, CUstream hStream) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdeviceptr, CUcontext, CUdeviceptr,
-                                      CUcontext, size_t, CUstream);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemcpyPeerAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dstDevice, dstContext, srcDevice, srcContext, ByteCount,
-                  hStream);
-}
-
-CUresult CUDAAPI cuMemcpyHtoDAsync(CUdeviceptr dstDevice, const void *srcHost,
-                                   size_t ByteCount, CUstream hStream) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUdeviceptr, const void *, size_t, CUstream);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemcpyHtoDAsync_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dstDevice, srcHost, ByteCount, hStream);
-}
-
-CUresult CUDAAPI cuMemcpyDtoHAsync(void *dstHost, CUdeviceptr srcDevice,
-                                   size_t ByteCount, CUstream hStream) {
-  using FuncPtr = CUresult(CUDAAPI *)(void *, CUdeviceptr, size_t, CUstream);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemcpyDtoHAsync_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dstHost, srcDevice, ByteCount, hStream);
-}
-
-CUresult CUDAAPI cuMemcpyDtoDAsync(CUdeviceptr dstDevice, CUdeviceptr srcDevice,
-                                   size_t ByteCount, CUstream hStream) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUdeviceptr, CUdeviceptr, size_t, CUstream);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemcpyDtoDAsync_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dstDevice, srcDevice, ByteCount, hStream);
-}
-
-CUresult CUDAAPI cuMemcpyHtoAAsync(CUarray dstArray, size_t dstOffset,
-                                   const void *srcHost, size_t ByteCount,
-                                   CUstream hStream) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUarray, size_t, const void *, size_t, CUstream);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemcpyHtoAAsync_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dstArray, dstOffset, srcHost, ByteCount, hStream);
-}
-
-CUresult CUDAAPI cuMemcpyAtoHAsync(void *dstHost, CUarray srcArray,
-                                   size_t srcOffset, size_t ByteCount,
-                                   CUstream hStream) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(void *, CUarray, size_t, size_t, CUstream);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemcpyAtoHAsync_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dstHost, srcArray, srcOffset, ByteCount, hStream);
-}
-
-CUresult CUDAAPI cuMemcpy2DAsync(const CUDA_MEMCPY2D *pCopy, CUstream hStream) {
-  using FuncPtr = CUresult(CUDAAPI *)(const CUDA_MEMCPY2D *, CUstream);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemcpy2DAsync_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pCopy, hStream);
-}
-
-CUresult CUDAAPI cuMemcpy3DAsync(const CUDA_MEMCPY3D *pCopy, CUstream hStream) {
-  using FuncPtr = CUresult(CUDAAPI *)(const CUDA_MEMCPY3D *, CUstream);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemcpy3DAsync_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pCopy, hStream);
-}
-
-CUresult CUDAAPI cuMemcpy3DPeerAsync(const CUDA_MEMCPY3D_PEER *pCopy,
-                                     CUstream hStream) {
-  using FuncPtr = CUresult(CUDAAPI *)(const CUDA_MEMCPY3D_PEER *, CUstream);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemcpy3DPeerAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pCopy, hStream);
-}
-
-CUresult CUDAAPI cuMemsetD8(CUdeviceptr dstDevice, unsigned char uc, size_t N) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdeviceptr, unsigned char, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemsetD8_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dstDevice, uc, N);
-}
-
-CUresult CUDAAPI cuMemsetD16(CUdeviceptr dstDevice, unsigned short us,
-                             size_t N) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdeviceptr, unsigned short, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemsetD16_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dstDevice, us, N);
-}
-
-CUresult CUDAAPI cuMemsetD32(CUdeviceptr dstDevice, unsigned int ui, size_t N) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdeviceptr, unsigned int, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemsetD32_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dstDevice, ui, N);
-}
-
-CUresult CUDAAPI cuMemsetD2D8(CUdeviceptr dstDevice, size_t dstPitch,
-                              unsigned char uc, size_t Width, size_t Height) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUdeviceptr, size_t, unsigned char, size_t, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemsetD2D8_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dstDevice, dstPitch, uc, Width, Height);
-}
-
-CUresult CUDAAPI cuMemsetD2D16(CUdeviceptr dstDevice, size_t dstPitch,
-                               unsigned short us, size_t Width, size_t Height) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUdeviceptr, size_t, unsigned short, size_t, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemsetD2D16_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dstDevice, dstPitch, us, Width, Height);
-}
-
-CUresult CUDAAPI cuMemsetD2D32(CUdeviceptr dstDevice, size_t dstPitch,
-                               unsigned int ui, size_t Width, size_t Height) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUdeviceptr, size_t, unsigned int, size_t, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemsetD2D32_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dstDevice, dstPitch, ui, Width, Height);
-}
-
-CUresult CUDAAPI cuMemsetD8Async(CUdeviceptr dstDevice, unsigned char uc,
-                                 size_t N, CUstream hStream) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUdeviceptr, unsigned char, size_t, CUstream);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemsetD8Async");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dstDevice, uc, N, hStream);
-}
-
-CUresult CUDAAPI cuMemsetD16Async(CUdeviceptr dstDevice, unsigned short us,
-                                  size_t N, CUstream hStream) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUdeviceptr, unsigned short, size_t, CUstream);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemsetD16Async");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dstDevice, us, N, hStream);
-}
-
-CUresult CUDAAPI cuMemsetD32Async(CUdeviceptr dstDevice, unsigned int ui,
-                                  size_t N, CUstream hStream) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUdeviceptr, unsigned int, size_t, CUstream);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemsetD32Async");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dstDevice, ui, N, hStream);
-}
-
-CUresult CUDAAPI cuMemsetD2D8Async(CUdeviceptr dstDevice, size_t dstPitch,
-                                   unsigned char uc, size_t Width,
-                                   size_t Height, CUstream hStream) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdeviceptr, size_t, unsigned char,
-                                      size_t, size_t, CUstream);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemsetD2D8Async");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dstDevice, dstPitch, uc, Width, Height, hStream);
-}
-
-CUresult CUDAAPI cuMemsetD2D16Async(CUdeviceptr dstDevice, size_t dstPitch,
-                                    unsigned short us, size_t Width,
-                                    size_t Height, CUstream hStream) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdeviceptr, size_t, unsigned short,
-                                      size_t, size_t, CUstream);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemsetD2D16Async");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dstDevice, dstPitch, us, Width, Height, hStream);
-}
-
-CUresult CUDAAPI cuMemsetD2D32Async(CUdeviceptr dstDevice, size_t dstPitch,
-                                    unsigned int ui, size_t Width,
-                                    size_t Height, CUstream hStream) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdeviceptr, size_t, unsigned int, size_t,
-                                      size_t, CUstream);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemsetD2D32Async");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dstDevice, dstPitch, ui, Width, Height, hStream);
-}
-
-CUresult CUDAAPI cuArrayCreate(CUarray *pHandle,
-                               const CUDA_ARRAY_DESCRIPTOR *pAllocateArray) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUarray *, const CUDA_ARRAY_DESCRIPTOR *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuArrayCreate_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pHandle, pAllocateArray);
-}
-
-CUresult CUDAAPI cuArrayGetDescriptor(CUDA_ARRAY_DESCRIPTOR *pArrayDescriptor,
-                                      CUarray hArray) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUDA_ARRAY_DESCRIPTOR *, CUarray);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuArrayGetDescriptor_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pArrayDescriptor, hArray);
-}
-
-CUresult CUDAAPI cuArrayGetSparseProperties(
-    CUDA_ARRAY_SPARSE_PROPERTIES *sparseProperties, CUarray array) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUDA_ARRAY_SPARSE_PROPERTIES *, CUarray);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuArrayGetSparseProperties");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(sparseProperties, array);
-}
-
-CUresult CUDAAPI cuMipmappedArrayGetSparseProperties(
-    CUDA_ARRAY_SPARSE_PROPERTIES *sparseProperties, CUmipmappedArray mipmap) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUDA_ARRAY_SPARSE_PROPERTIES *, CUmipmappedArray);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cuMipmappedArrayGetSparseProperties");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(sparseProperties, mipmap);
-}
-
-CUresult CUDAAPI
-cuArrayGetMemoryRequirements(CUDA_ARRAY_MEMORY_REQUIREMENTS *memoryRequirements,
-                             CUarray array, CUdevice device) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUDA_ARRAY_MEMORY_REQUIREMENTS *, CUarray, CUdevice);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuArrayGetMemoryRequirements");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(memoryRequirements, array, device);
-}
-
-CUresult CUDAAPI cuMipmappedArrayGetMemoryRequirements(
-    CUDA_ARRAY_MEMORY_REQUIREMENTS *memoryRequirements, CUmipmappedArray mipmap,
-    CUdevice device) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUDA_ARRAY_MEMORY_REQUIREMENTS *,
-                                      CUmipmappedArray, CUdevice);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cuMipmappedArrayGetMemoryRequirements");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(memoryRequirements, mipmap, device);
-}
-
-CUresult CUDAAPI cuArrayGetPlane(CUarray *pPlaneArray, CUarray hArray,
-                                 unsigned int planeIdx) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUarray *, CUarray, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuArrayGetPlane");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pPlaneArray, hArray, planeIdx);
-}
-
-CUresult CUDAAPI cuArrayDestroy(CUarray hArray) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUarray);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuArrayDestroy");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hArray);
-}
-
-CUresult CUDAAPI cuArray3DCreate(
-    CUarray *pHandle, const CUDA_ARRAY3D_DESCRIPTOR *pAllocateArray) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUarray *, const CUDA_ARRAY3D_DESCRIPTOR *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuArray3DCreate_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pHandle, pAllocateArray);
-}
-
-CUresult CUDAAPI cuArray3DGetDescriptor(
-    CUDA_ARRAY3D_DESCRIPTOR *pArrayDescriptor, CUarray hArray) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUDA_ARRAY3D_DESCRIPTOR *, CUarray);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuArray3DGetDescriptor_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pArrayDescriptor, hArray);
-}
-
-CUresult CUDAAPI
-cuMipmappedArrayCreate(CUmipmappedArray *pHandle,
-                       const CUDA_ARRAY3D_DESCRIPTOR *pMipmappedArrayDesc,
-                       unsigned int numMipmapLevels) {
-  using FuncPtr = CUresult(CUDAAPI *)(
-      CUmipmappedArray *, const CUDA_ARRAY3D_DESCRIPTOR *, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMipmappedArrayCreate");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pHandle, pMipmappedArrayDesc, numMipmapLevels);
-}
-
-CUresult CUDAAPI cuMipmappedArrayGetLevel(CUarray *pLevelArray,
-                                          CUmipmappedArray hMipmappedArray,
-                                          unsigned int level) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUarray *, CUmipmappedArray, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMipmappedArrayGetLevel");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pLevelArray, hMipmappedArray, level);
-}
-
-CUresult CUDAAPI cuMipmappedArrayDestroy(CUmipmappedArray hMipmappedArray) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUmipmappedArray);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMipmappedArrayDestroy");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hMipmappedArray);
-}
-
-CUresult CUDAAPI cuMemGetHandleForAddressRange(void *handle, CUdeviceptr dptr,
-                                               size_t size,
-                                               CUmemRangeHandleType handleType,
-                                               unsigned long long flags) {
-  using FuncPtr = CUresult(CUDAAPI *)(void *, CUdeviceptr, size_t,
-                                      CUmemRangeHandleType, unsigned long long);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemGetHandleForAddressRange");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dptr, size, handleType, flags);
-}
-
-CUresult CUDAAPI cuMemAddressReserve(CUdeviceptr *ptr, size_t size,
-                                     size_t alignment, CUdeviceptr addr,
-                                     unsigned long long flags) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdeviceptr *, size_t, size_t,
-                                      CUdeviceptr, unsigned long long);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemAddressReserve");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(ptr, size, alignment, addr, flags);
-}
-
-CUresult CUDAAPI cuMemAddressFree(CUdeviceptr ptr, size_t size) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdeviceptr, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemAddressFree");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(ptr, size);
-}
-
-CUresult CUDAAPI cuMemCreate(CUmemGenericAllocationHandle *handle, size_t size,
-                             const CUmemAllocationProp *prop,
-                             unsigned long long flags) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUmemGenericAllocationHandle *, size_t,
-                          const CUmemAllocationProp *, unsigned long long);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemCreate");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, size, prop, flags);
-}
-
-CUresult CUDAAPI cuMemRelease(CUmemGenericAllocationHandle handle) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUmemGenericAllocationHandle);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemRelease");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle);
-}
-
-CUresult CUDAAPI cuMemMap(CUdeviceptr ptr, size_t size, size_t offset,
-                          CUmemGenericAllocationHandle handle,
-                          unsigned long long flags) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUdeviceptr, size_t, size_t,
-                          CUmemGenericAllocationHandle, unsigned long long);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemMap");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(ptr, size, offset, handle, flags);
-}
-
-CUresult CUDAAPI cuMemMapArrayAsync(CUarrayMapInfo *mapInfoList,
-                                    unsigned int count, CUstream hStream) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUarrayMapInfo *, unsigned int, CUstream);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemMapArrayAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(mapInfoList, count, hStream);
-}
-
-CUresult CUDAAPI cuMemUnmap(CUdeviceptr ptr, size_t size) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdeviceptr, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemUnmap");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(ptr, size);
-}
-
-CUresult CUDAAPI cuMemSetAccess(CUdeviceptr ptr, size_t size,
-                                const CUmemAccessDesc *desc, size_t count) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUdeviceptr, size_t, const CUmemAccessDesc *, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemSetAccess");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(ptr, size, desc, count);
-}
-
-CUresult CUDAAPI cuMemGetAccess(unsigned long long *flags,
-                                const CUmemLocation *location,
-                                CUdeviceptr ptr) {
-  using FuncPtr = CUresult(CUDAAPI *)(unsigned long long *,
-                                      const CUmemLocation *, CUdeviceptr);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemGetAccess");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(flags, location, ptr);
-}
-
-CUresult CUDAAPI cuMemExportToShareableHandle(
-    void *shareableHandle, CUmemGenericAllocationHandle handle,
-    CUmemAllocationHandleType handleType, unsigned long long flags) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(void *, CUmemGenericAllocationHandle,
-                          CUmemAllocationHandleType, unsigned long long);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemExportToShareableHandle");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(shareableHandle, handle, handleType, flags);
-}
-
-CUresult CUDAAPI cuMemImportFromShareableHandle(
-    CUmemGenericAllocationHandle *handle, void *osHandle,
-    CUmemAllocationHandleType shHandleType) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUmemGenericAllocationHandle *, void *,
-                                      CUmemAllocationHandleType);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemImportFromShareableHandle");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, osHandle, shHandleType);
-}
-
-CUresult CUDAAPI cuMemGetAllocationGranularity(
-    size_t *granularity, const CUmemAllocationProp *prop,
-    CUmemAllocationGranularity_flags option) {
-  using FuncPtr = CUresult(CUDAAPI *)(size_t *, const CUmemAllocationProp *,
-                                      CUmemAllocationGranularity_flags);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemGetAllocationGranularity");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(granularity, prop, option);
-}
-
-CUresult CUDAAPI cuMemGetAllocationPropertiesFromHandle(
-    CUmemAllocationProp *prop, CUmemGenericAllocationHandle handle) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUmemAllocationProp *, CUmemGenericAllocationHandle);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cuMemGetAllocationPropertiesFromHandle");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(prop, handle);
-}
-
-CUresult CUDAAPI
-cuMemRetainAllocationHandle(CUmemGenericAllocationHandle *handle, void *addr) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUmemGenericAllocationHandle *, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemRetainAllocationHandle");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, addr);
-}
-
-CUresult CUDAAPI cuMemFreeAsync(CUdeviceptr dptr, CUstream hStream) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdeviceptr, CUstream);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemFreeAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dptr, hStream);
-}
-
-CUresult CUDAAPI cuMemAllocAsync(CUdeviceptr *dptr, size_t bytesize,
-                                 CUstream hStream) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdeviceptr *, size_t, CUstream);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemAllocAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dptr, bytesize, hStream);
-}
-
-CUresult CUDAAPI cuMemPoolTrimTo(CUmemoryPool pool, size_t minBytesToKeep) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUmemoryPool, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemPoolTrimTo");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pool, minBytesToKeep);
-}
-
-CUresult CUDAAPI cuMemPoolSetAttribute(CUmemoryPool pool,
-                                       CUmemPool_attribute attr, void *value) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUmemoryPool, CUmemPool_attribute, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemPoolSetAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pool, attr, value);
-}
-
-CUresult CUDAAPI cuMemPoolGetAttribute(CUmemoryPool pool,
-                                       CUmemPool_attribute attr, void *value) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUmemoryPool, CUmemPool_attribute, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemPoolGetAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pool, attr, value);
-}
-
-CUresult CUDAAPI cuMemPoolSetAccess(CUmemoryPool pool,
-                                    const CUmemAccessDesc *map, size_t count) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUmemoryPool, const CUmemAccessDesc *, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemPoolSetAccess");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pool, map, count);
-}
-
-CUresult CUDAAPI cuMemPoolGetAccess(CUmemAccess_flags *flags,
-                                    CUmemoryPool memPool,
-                                    CUmemLocation *location) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUmemAccess_flags *, CUmemoryPool, CUmemLocation *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemPoolGetAccess");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(flags, memPool, location);
-}
-
-CUresult CUDAAPI cuMemPoolCreate(CUmemoryPool *pool,
-                                 const CUmemPoolProps *poolProps) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUmemoryPool *, const CUmemPoolProps *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemPoolCreate");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pool, poolProps);
-}
-
-CUresult CUDAAPI cuMemPoolDestroy(CUmemoryPool pool) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUmemoryPool);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemPoolDestroy");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pool);
-}
-
-CUresult CUDAAPI cuMemAllocFromPoolAsync(CUdeviceptr *dptr, size_t bytesize,
-                                         CUmemoryPool pool, CUstream hStream) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUdeviceptr *, size_t, CUmemoryPool, CUstream);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemAllocFromPoolAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dptr, bytesize, pool, hStream);
-}
-
-CUresult CUDAAPI cuMemPoolExportToShareableHandle(
-    void *handle_out, CUmemoryPool pool, CUmemAllocationHandleType handleType,
-    unsigned long long flags) {
-  using FuncPtr = CUresult(CUDAAPI *)(
-      void *, CUmemoryPool, CUmemAllocationHandleType, unsigned long long);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cuMemPoolExportToShareableHandle");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle_out, pool, handleType, flags);
-}
-
-CUresult CUDAAPI cuMemPoolImportFromShareableHandle(
-    CUmemoryPool *pool_out, void *handle, CUmemAllocationHandleType handleType,
-    unsigned long long flags) {
-  using FuncPtr = CUresult(CUDAAPI *)(
-      CUmemoryPool *, void *, CUmemAllocationHandleType, unsigned long long);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cuMemPoolImportFromShareableHandle");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pool_out, handle, handleType, flags);
-}
-
-CUresult CUDAAPI cuMemPoolExportPointer(CUmemPoolPtrExportData *shareData_out,
-                                        CUdeviceptr ptr) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUmemPoolPtrExportData *, CUdeviceptr);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemPoolExportPointer");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(shareData_out, ptr);
-}
-
-CUresult CUDAAPI cuMemPoolImportPointer(CUdeviceptr *ptr_out, CUmemoryPool pool,
-                                        CUmemPoolPtrExportData *shareData) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdeviceptr *, CUmemoryPool,
-                                      CUmemPoolPtrExportData *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemPoolImportPointer");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(ptr_out, pool, shareData);
-}
-
-CUresult CUDAAPI cuPointerGetAttribute(void *data,
-                                       CUpointer_attribute attribute,
-                                       CUdeviceptr ptr) {
-  using FuncPtr = CUresult(CUDAAPI *)(void *, CUpointer_attribute, CUdeviceptr);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuPointerGetAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(data, attribute, ptr);
-}
-
-CUresult CUDAAPI cuMemPrefetchAsync(CUdeviceptr devPtr, size_t count,
-                                    CUdevice dstDevice, CUstream hStream) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdeviceptr, size_t, CUdevice, CUstream);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemPrefetchAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(devPtr, count, dstDevice, hStream);
-}
-
-CUresult CUDAAPI cuMemAdvise(CUdeviceptr devPtr, size_t count,
-                             CUmem_advise advice, CUdevice device) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUdeviceptr, size_t, CUmem_advise, CUdevice);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemAdvise");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(devPtr, count, advice, device);
-}
-
-CUresult CUDAAPI cuMemRangeGetAttribute(void *data, size_t dataSize,
-                                        CUmem_range_attribute attribute,
-                                        CUdeviceptr devPtr, size_t count) {
-  using FuncPtr = CUresult(CUDAAPI *)(void *, size_t, CUmem_range_attribute,
-                                      CUdeviceptr, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemRangeGetAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(data, dataSize, attribute, devPtr, count);
-}
-
-CUresult CUDAAPI cuMemRangeGetAttributes(void **data, size_t *dataSizes,
-                                         CUmem_range_attribute *attributes,
-                                         size_t numAttributes,
-                                         CUdeviceptr devPtr, size_t count) {
-  using FuncPtr = CUresult(CUDAAPI *)(
-      void **, size_t *, CUmem_range_attribute *, size_t, CUdeviceptr, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemRangeGetAttributes");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(data, dataSizes, attributes, numAttributes, devPtr, count);
-}
-
-CUresult CUDAAPI cuPointerSetAttribute(const void *value,
-                                       CUpointer_attribute attribute,
-                                       CUdeviceptr ptr) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(const void *, CUpointer_attribute, CUdeviceptr);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuPointerSetAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(value, attribute, ptr);
-}
-
-CUresult CUDAAPI cuPointerGetAttributes(unsigned int numAttributes,
-                                        CUpointer_attribute *attributes,
-                                        void **data, CUdeviceptr ptr) {
-  using FuncPtr = CUresult(CUDAAPI *)(unsigned int, CUpointer_attribute *,
-                                      void **, CUdeviceptr);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuPointerGetAttributes");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(numAttributes, attributes, data, ptr);
-}
-
-CUresult CUDAAPI cuStreamCreate(CUstream *phStream, unsigned int Flags) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUstream *, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuStreamCreate");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(phStream, Flags);
-}
-
-CUresult CUDAAPI cuStreamCreateWithPriority(CUstream *phStream,
-                                            unsigned int flags, int priority) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUstream *, unsigned int, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuStreamCreateWithPriority");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(phStream, flags, priority);
-}
-
-CUresult CUDAAPI cuStreamGetPriority(CUstream hStream, int *priority) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUstream, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuStreamGetPriority");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hStream, priority);
-}
-
-CUresult CUDAAPI cuStreamGetFlags(CUstream hStream, unsigned int *flags) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUstream, unsigned int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuStreamGetFlags");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hStream, flags);
-}
-
-CUresult CUDAAPI cuStreamGetId(CUstream hStream, unsigned long long *streamId) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUstream, unsigned long long *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuStreamGetId");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hStream, streamId);
-}
-
-CUresult CUDAAPI cuStreamGetCtx(CUstream hStream, CUcontext *pctx) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUstream, CUcontext *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuStreamGetCtx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hStream, pctx);
-}
-
-CUresult CUDAAPI cuStreamWaitEvent(CUstream hStream, CUevent hEvent,
-                                   unsigned int Flags) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUstream, CUevent, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuStreamWaitEvent");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hStream, hEvent, Flags);
-}
-
-CUresult CUDAAPI cuStreamAddCallback(CUstream hStream,
-                                     CUstreamCallback callback, void *userData,
-                                     unsigned int flags) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUstream, CUstreamCallback, void *, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuStreamAddCallback");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hStream, callback, userData, flags);
-}
-
-CUresult CUDAAPI cuStreamBeginCapture(CUstream hStream,
-                                      CUstreamCaptureMode mode) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUstream, CUstreamCaptureMode);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuStreamBeginCapture_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hStream, mode);
-}
-
-CUresult CUDAAPI cuThreadExchangeStreamCaptureMode(CUstreamCaptureMode *mode) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUstreamCaptureMode *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cuThreadExchangeStreamCaptureMode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(mode);
-}
-
-CUresult CUDAAPI cuStreamEndCapture(CUstream hStream, CUgraph *phGraph) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUstream, CUgraph *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuStreamEndCapture");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hStream, phGraph);
-}
-
-CUresult CUDAAPI cuStreamIsCapturing(CUstream hStream,
-                                     CUstreamCaptureStatus *captureStatus) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUstream, CUstreamCaptureStatus *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuStreamIsCapturing");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hStream, captureStatus);
-}
-
-CUresult CUDAAPI cuStreamGetCaptureInfo(
-    CUstream hStream, CUstreamCaptureStatus *captureStatus_out,
-    cuuint64_t *id_out, CUgraph *graph_out,
-    const CUgraphNode **dependencies_out, size_t *numDependencies_out) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUstream, CUstreamCaptureStatus *, cuuint64_t *,
-                          CUgraph *, const CUgraphNode **, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuStreamGetCaptureInfo_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hStream, captureStatus_out, id_out, graph_out,
-                  dependencies_out, numDependencies_out);
-}
-
-CUresult CUDAAPI cuStreamUpdateCaptureDependencies(CUstream hStream,
-                                                   CUgraphNode *dependencies,
-                                                   size_t numDependencies,
-                                                   unsigned int flags) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUstream, CUgraphNode *, size_t, unsigned int);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cuStreamUpdateCaptureDependencies");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hStream, dependencies, numDependencies, flags);
-}
-
-CUresult CUDAAPI cuStreamAttachMemAsync(CUstream hStream, CUdeviceptr dptr,
-                                        size_t length, unsigned int flags) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUstream, CUdeviceptr, size_t, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuStreamAttachMemAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hStream, dptr, length, flags);
-}
-
-CUresult CUDAAPI cuStreamQuery(CUstream hStream) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUstream);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuStreamQuery");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hStream);
-}
-
-CUresult CUDAAPI cuStreamSynchronize(CUstream hStream) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUstream);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuStreamSynchronize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hStream);
-}
-
-CUresult CUDAAPI cuStreamDestroy(CUstream hStream) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUstream);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuStreamDestroy_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hStream);
-}
-
-CUresult CUDAAPI cuStreamCopyAttributes(CUstream dst, CUstream src) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUstream, CUstream);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuStreamCopyAttributes");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dst, src);
-}
-
-CUresult CUDAAPI cuStreamGetAttribute(CUstream hStream, CUstreamAttrID attr,
-                                      CUstreamAttrValue *value_out) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUstream, CUstreamAttrID, CUstreamAttrValue *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuStreamGetAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hStream, attr, value_out);
-}
-
-CUresult CUDAAPI cuStreamSetAttribute(CUstream hStream, CUstreamAttrID attr,
-                                      const CUstreamAttrValue *value) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUstream, CUstreamAttrID, const CUstreamAttrValue *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuStreamSetAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hStream, attr, value);
-}
-
-CUresult CUDAAPI cuEventCreate(CUevent *phEvent, unsigned int Flags) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUevent *, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuEventCreate");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(phEvent, Flags);
-}
-
-CUresult CUDAAPI cuEventRecord(CUevent hEvent, CUstream hStream) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUevent, CUstream);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuEventRecord");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hEvent, hStream);
-}
-
-CUresult CUDAAPI cuEventRecordWithFlags(CUevent hEvent, CUstream hStream,
-                                        unsigned int flags) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUevent, CUstream, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuEventRecordWithFlags");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hEvent, hStream, flags);
-}
-
-CUresult CUDAAPI cuEventQuery(CUevent hEvent) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUevent);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuEventQuery");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hEvent);
-}
-
-CUresult CUDAAPI cuEventSynchronize(CUevent hEvent) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUevent);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuEventSynchronize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hEvent);
-}
-
-CUresult CUDAAPI cuEventDestroy(CUevent hEvent) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUevent);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuEventDestroy_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hEvent);
-}
-
-CUresult CUDAAPI cuEventElapsedTime(float *pMilliseconds, CUevent hStart,
-                                    CUevent hEnd) {
-  using FuncPtr = CUresult(CUDAAPI *)(float *, CUevent, CUevent);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuEventElapsedTime");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pMilliseconds, hStart, hEnd);
-}
-
-CUresult CUDAAPI
-cuImportExternalMemory(CUexternalMemory *extMem_out,
-                       const CUDA_EXTERNAL_MEMORY_HANDLE_DESC *memHandleDesc) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUexternalMemory *,
-                                      const CUDA_EXTERNAL_MEMORY_HANDLE_DESC *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuImportExternalMemory");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(extMem_out, memHandleDesc);
-}
-
-CUresult CUDAAPI cuExternalMemoryGetMappedBuffer(
-    CUdeviceptr *devPtr, CUexternalMemory extMem,
-    const CUDA_EXTERNAL_MEMORY_BUFFER_DESC *bufferDesc) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdeviceptr *, CUexternalMemory,
-                                      const CUDA_EXTERNAL_MEMORY_BUFFER_DESC *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuExternalMemoryGetMappedBuffer");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(devPtr, extMem, bufferDesc);
-}
-
-CUresult CUDAAPI cuExternalMemoryGetMappedMipmappedArray(
-    CUmipmappedArray *mipmap, CUexternalMemory extMem,
-    const CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC *mipmapDesc) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUmipmappedArray *, CUexternalMemory,
-                          const CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cuExternalMemoryGetMappedMipmappedArray");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(mipmap, extMem, mipmapDesc);
-}
-
-CUresult CUDAAPI cuDestroyExternalMemory(CUexternalMemory extMem) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUexternalMemory);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuDestroyExternalMemory");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(extMem);
-}
-
-CUresult CUDAAPI cuImportExternalSemaphore(
-    CUexternalSemaphore *extSem_out,
-    const CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC *semHandleDesc) {
-  using FuncPtr = CUresult(CUDAAPI *)(
-      CUexternalSemaphore *, const CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuImportExternalSemaphore");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(extSem_out, semHandleDesc);
-}
-
-CUresult CUDAAPI cuSignalExternalSemaphoresAsync(
-    const CUexternalSemaphore *extSemArray,
-    const CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS *paramsArray,
-    unsigned int numExtSems, CUstream stream) {
-  using FuncPtr = CUresult(CUDAAPI *)(
-      const CUexternalSemaphore *,
-      const CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS *, unsigned int, CUstream);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuSignalExternalSemaphoresAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(extSemArray, paramsArray, numExtSems, stream);
-}
-
-CUresult CUDAAPI cuWaitExternalSemaphoresAsync(
-    const CUexternalSemaphore *extSemArray,
-    const CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS *paramsArray,
-    unsigned int numExtSems, CUstream stream) {
-  using FuncPtr = CUresult(CUDAAPI *)(
-      const CUexternalSemaphore *, const CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS *,
-      unsigned int, CUstream);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuWaitExternalSemaphoresAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(extSemArray, paramsArray, numExtSems, stream);
-}
-
-CUresult CUDAAPI cuDestroyExternalSemaphore(CUexternalSemaphore extSem) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUexternalSemaphore);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuDestroyExternalSemaphore");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(extSem);
-}
-
-CUresult CUDAAPI cuStreamWaitValue32(CUstream stream, CUdeviceptr addr,
-                                     cuuint32_t value, unsigned int flags) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUstream, CUdeviceptr, cuuint32_t, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuStreamWaitValue32_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(stream, addr, value, flags);
-}
-
-CUresult CUDAAPI cuStreamWaitValue64(CUstream stream, CUdeviceptr addr,
-                                     cuuint64_t value, unsigned int flags) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUstream, CUdeviceptr, cuuint64_t, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuStreamWaitValue64_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(stream, addr, value, flags);
-}
-
-CUresult CUDAAPI cuStreamWriteValue32(CUstream stream, CUdeviceptr addr,
-                                      cuuint32_t value, unsigned int flags) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUstream, CUdeviceptr, cuuint32_t, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuStreamWriteValue32_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(stream, addr, value, flags);
-}
-
-CUresult CUDAAPI cuStreamWriteValue64(CUstream stream, CUdeviceptr addr,
-                                      cuuint64_t value, unsigned int flags) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUstream, CUdeviceptr, cuuint64_t, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuStreamWriteValue64_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(stream, addr, value, flags);
-}
-
-CUresult CUDAAPI cuStreamBatchMemOp(CUstream stream, unsigned int count,
-                                    CUstreamBatchMemOpParams *paramArray,
-                                    unsigned int flags) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUstream, unsigned int,
-                                      CUstreamBatchMemOpParams *, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuStreamBatchMemOp_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(stream, count, paramArray, flags);
-}
-
-CUresult CUDAAPI cuFuncGetAttribute(int *pi, CUfunction_attribute attrib,
-                                    CUfunction hfunc) {
-  using FuncPtr = CUresult(CUDAAPI *)(int *, CUfunction_attribute, CUfunction);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuFuncGetAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pi, attrib, hfunc);
-}
-
-CUresult CUDAAPI cuFuncSetAttribute(CUfunction hfunc,
-                                    CUfunction_attribute attrib, int value) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUfunction, CUfunction_attribute, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuFuncSetAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hfunc, attrib, value);
-}
-
-CUresult CUDAAPI cuFuncSetCacheConfig(CUfunction hfunc, CUfunc_cache config) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUfunction, CUfunc_cache);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuFuncSetCacheConfig");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hfunc, config);
-}
-
-CUresult CUDAAPI cuFuncSetSharedMemConfig(CUfunction hfunc,
-                                          CUsharedconfig config) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUfunction, CUsharedconfig);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuFuncSetSharedMemConfig");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hfunc, config);
-}
-
-CUresult CUDAAPI cuFuncGetModule(CUmodule *hmod, CUfunction hfunc) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUmodule *, CUfunction);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuFuncGetModule");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hmod, hfunc);
-}
-
-CUresult CUDAAPI cuLaunchKernel(CUfunction f, unsigned int gridDimX,
-                                unsigned int gridDimY, unsigned int gridDimZ,
-                                unsigned int blockDimX, unsigned int blockDimY,
-                                unsigned int blockDimZ,
-                                unsigned int sharedMemBytes, CUstream hStream,
-                                void **kernelParams, void **extra) {
-  using FuncPtr = CUresult(CUDAAPI *)(
-      CUfunction, unsigned int, unsigned int, unsigned int, unsigned int,
-      unsigned int, unsigned int, unsigned int, CUstream, void **, void **);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuLaunchKernel");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(f, gridDimX, gridDimY, gridDimZ, blockDimX, blockDimY,
-                  blockDimZ, sharedMemBytes, hStream, kernelParams, extra);
-}
-
-CUresult CUDAAPI cuLaunchKernelEx(const CUlaunchConfig *config, CUfunction f,
-                                  void **kernelParams, void **extra) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(const CUlaunchConfig *, CUfunction, void **, void **);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuLaunchKernelEx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(config, f, kernelParams, extra);
-}
-
-CUresult CUDAAPI cuLaunchCooperativeKernel(
-    CUfunction f, unsigned int gridDimX, unsigned int gridDimY,
-    unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY,
-    unsigned int blockDimZ, unsigned int sharedMemBytes, CUstream hStream,
-    void **kernelParams) {
-  using FuncPtr = CUresult(CUDAAPI *)(
-      CUfunction, unsigned int, unsigned int, unsigned int, unsigned int,
-      unsigned int, unsigned int, unsigned int, CUstream, void **);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuLaunchCooperativeKernel");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(f, gridDimX, gridDimY, gridDimZ, blockDimX, blockDimY,
-                  blockDimZ, sharedMemBytes, hStream, kernelParams);
-}
-
-__CUDA_DEPRECATED CUresult CUDAAPI cuLaunchCooperativeKernelMultiDevice(
-    CUDA_LAUNCH_PARAMS *launchParamsList, unsigned int numDevices,
-    unsigned int flags) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUDA_LAUNCH_PARAMS *, unsigned int, unsigned int);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cuLaunchCooperativeKernelMultiDevice");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(launchParamsList, numDevices, flags);
-}
-
-CUresult CUDAAPI cuLaunchHostFunc(CUstream hStream, CUhostFn fn,
-                                  void *userData) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUstream, CUhostFn, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuLaunchHostFunc");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hStream, fn, userData);
-}
-
-__CUDA_DEPRECATED CUresult CUDAAPI cuFuncSetBlockShape(CUfunction hfunc, int x,
-                                                       int y, int z) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUfunction, int, int, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuFuncSetBlockShape");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hfunc, x, y, z);
-}
-
-__CUDA_DEPRECATED CUresult CUDAAPI cuFuncSetSharedSize(CUfunction hfunc,
-                                                       unsigned int bytes) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUfunction, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuFuncSetSharedSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hfunc, bytes);
-}
-
-__CUDA_DEPRECATED CUresult CUDAAPI cuParamSetSize(CUfunction hfunc,
-                                                  unsigned int numbytes) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUfunction, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuParamSetSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hfunc, numbytes);
-}
-
-__CUDA_DEPRECATED CUresult CUDAAPI cuParamSeti(CUfunction hfunc, int offset,
-                                               unsigned int value) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUfunction, int, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuParamSeti");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hfunc, offset, value);
-}
-
-__CUDA_DEPRECATED CUresult CUDAAPI cuParamSetf(CUfunction hfunc, int offset,
-                                               float value) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUfunction, int, float);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuParamSetf");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hfunc, offset, value);
-}
-
-__CUDA_DEPRECATED CUresult CUDAAPI cuParamSetv(CUfunction hfunc, int offset,
-                                               void *ptr,
-                                               unsigned int numbytes) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUfunction, int, void *, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuParamSetv");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hfunc, offset, ptr, numbytes);
-}
-
-__CUDA_DEPRECATED CUresult CUDAAPI cuLaunch(CUfunction f) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUfunction);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuLaunch");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(f);
-}
-
-__CUDA_DEPRECATED CUresult CUDAAPI cuLaunchGrid(CUfunction f, int grid_width,
-                                                int grid_height) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUfunction, int, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuLaunchGrid");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(f, grid_width, grid_height);
-}
-
-__CUDA_DEPRECATED CUresult CUDAAPI cuLaunchGridAsync(CUfunction f,
-                                                     int grid_width,
-                                                     int grid_height,
-                                                     CUstream hStream) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUfunction, int, int, CUstream);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuLaunchGridAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(f, grid_width, grid_height, hStream);
-}
-
-__CUDA_DEPRECATED CUresult CUDAAPI cuParamSetTexRef(CUfunction hfunc,
-                                                    int texunit,
-                                                    CUtexref hTexRef) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUfunction, int, CUtexref);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuParamSetTexRef");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hfunc, texunit, hTexRef);
-}
-
-CUresult CUDAAPI cuGraphCreate(CUgraph *phGraph, unsigned int flags) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraph *, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphCreate");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(phGraph, flags);
-}
-
-CUresult CUDAAPI cuGraphAddKernelNode(
-    CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies,
-    size_t numDependencies, const CUDA_KERNEL_NODE_PARAMS *nodeParams) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUgraphNode *, CUgraph, const CUgraphNode *, size_t,
-                          const CUDA_KERNEL_NODE_PARAMS *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphAddKernelNode_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(phGraphNode, hGraph, dependencies, numDependencies,
-                  nodeParams);
-}
-
-CUresult CUDAAPI cuGraphKernelNodeGetParams(
-    CUgraphNode hNode, CUDA_KERNEL_NODE_PARAMS *nodeParams) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraphNode, CUDA_KERNEL_NODE_PARAMS *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphKernelNodeGetParams_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hNode, nodeParams);
-}
-
-CUresult CUDAAPI cuGraphKernelNodeSetParams(
-    CUgraphNode hNode, const CUDA_KERNEL_NODE_PARAMS *nodeParams) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUgraphNode, const CUDA_KERNEL_NODE_PARAMS *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphKernelNodeSetParams_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hNode, nodeParams);
-}
-
-CUresult CUDAAPI cuGraphAddMemcpyNode(CUgraphNode *phGraphNode, CUgraph hGraph,
-                                      const CUgraphNode *dependencies,
-                                      size_t numDependencies,
-                                      const CUDA_MEMCPY3D *copyParams,
-                                      CUcontext ctx) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUgraphNode *, CUgraph, const CUgraphNode *, size_t,
-                          const CUDA_MEMCPY3D *, CUcontext);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphAddMemcpyNode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(phGraphNode, hGraph, dependencies, numDependencies,
-                  copyParams, ctx);
-}
-
-CUresult CUDAAPI cuGraphMemcpyNodeGetParams(CUgraphNode hNode,
-                                            CUDA_MEMCPY3D *nodeParams) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraphNode, CUDA_MEMCPY3D *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphMemcpyNodeGetParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hNode, nodeParams);
-}
-
-CUresult CUDAAPI cuGraphMemcpyNodeSetParams(CUgraphNode hNode,
-                                            const CUDA_MEMCPY3D *nodeParams) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraphNode, const CUDA_MEMCPY3D *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphMemcpyNodeSetParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hNode, nodeParams);
-}
-
-CUresult CUDAAPI cuGraphAddMemsetNode(
-    CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies,
-    size_t numDependencies, const CUDA_MEMSET_NODE_PARAMS *memsetParams,
-    CUcontext ctx) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUgraphNode *, CUgraph, const CUgraphNode *, size_t,
-                          const CUDA_MEMSET_NODE_PARAMS *, CUcontext);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphAddMemsetNode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(phGraphNode, hGraph, dependencies, numDependencies,
-                  memsetParams, ctx);
-}
-
-CUresult CUDAAPI cuGraphMemsetNodeGetParams(
-    CUgraphNode hNode, CUDA_MEMSET_NODE_PARAMS *nodeParams) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraphNode, CUDA_MEMSET_NODE_PARAMS *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphMemsetNodeGetParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hNode, nodeParams);
-}
-
-CUresult CUDAAPI cuGraphMemsetNodeSetParams(
-    CUgraphNode hNode, const CUDA_MEMSET_NODE_PARAMS *nodeParams) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUgraphNode, const CUDA_MEMSET_NODE_PARAMS *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphMemsetNodeSetParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hNode, nodeParams);
-}
-
-CUresult CUDAAPI cuGraphAddHostNode(CUgraphNode *phGraphNode, CUgraph hGraph,
-                                    const CUgraphNode *dependencies,
-                                    size_t numDependencies,
-                                    const CUDA_HOST_NODE_PARAMS *nodeParams) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUgraphNode *, CUgraph, const CUgraphNode *, size_t,
-                          const CUDA_HOST_NODE_PARAMS *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphAddHostNode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(phGraphNode, hGraph, dependencies, numDependencies,
-                  nodeParams);
-}
-
-CUresult CUDAAPI cuGraphHostNodeGetParams(CUgraphNode hNode,
-                                          CUDA_HOST_NODE_PARAMS *nodeParams) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraphNode, CUDA_HOST_NODE_PARAMS *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphHostNodeGetParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hNode, nodeParams);
-}
-
-CUresult CUDAAPI cuGraphHostNodeSetParams(
-    CUgraphNode hNode, const CUDA_HOST_NODE_PARAMS *nodeParams) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUgraphNode, const CUDA_HOST_NODE_PARAMS *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphHostNodeSetParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hNode, nodeParams);
-}
-
-CUresult CUDAAPI cuGraphAddChildGraphNode(CUgraphNode *phGraphNode,
-                                          CUgraph hGraph,
-                                          const CUgraphNode *dependencies,
-                                          size_t numDependencies,
-                                          CUgraph childGraph) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraphNode *, CUgraph,
-                                      const CUgraphNode *, size_t, CUgraph);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphAddChildGraphNode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(phGraphNode, hGraph, dependencies, numDependencies,
-                  childGraph);
-}
-
-CUresult CUDAAPI cuGraphChildGraphNodeGetGraph(CUgraphNode hNode,
-                                               CUgraph *phGraph) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraphNode, CUgraph *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphChildGraphNodeGetGraph");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hNode, phGraph);
-}
-
-CUresult CUDAAPI cuGraphAddEmptyNode(CUgraphNode *phGraphNode, CUgraph hGraph,
-                                     const CUgraphNode *dependencies,
-                                     size_t numDependencies) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUgraphNode *, CUgraph, const CUgraphNode *, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphAddEmptyNode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(phGraphNode, hGraph, dependencies, numDependencies);
-}
-
-CUresult CUDAAPI cuGraphAddEventRecordNode(CUgraphNode *phGraphNode,
-                                           CUgraph hGraph,
-                                           const CUgraphNode *dependencies,
-                                           size_t numDependencies,
-                                           CUevent event) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraphNode *, CUgraph,
-                                      const CUgraphNode *, size_t, CUevent);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphAddEventRecordNode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(phGraphNode, hGraph, dependencies, numDependencies, event);
-}
-
-CUresult CUDAAPI cuGraphEventRecordNodeGetEvent(CUgraphNode hNode,
-                                                CUevent *event_out) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraphNode, CUevent *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphEventRecordNodeGetEvent");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hNode, event_out);
-}
-
-CUresult CUDAAPI cuGraphEventRecordNodeSetEvent(CUgraphNode hNode,
-                                                CUevent event) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraphNode, CUevent);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphEventRecordNodeSetEvent");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hNode, event);
-}
-
-CUresult CUDAAPI cuGraphAddEventWaitNode(CUgraphNode *phGraphNode,
-                                         CUgraph hGraph,
-                                         const CUgraphNode *dependencies,
-                                         size_t numDependencies,
-                                         CUevent event) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraphNode *, CUgraph,
-                                      const CUgraphNode *, size_t, CUevent);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphAddEventWaitNode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(phGraphNode, hGraph, dependencies, numDependencies, event);
-}
-
-CUresult CUDAAPI cuGraphEventWaitNodeGetEvent(CUgraphNode hNode,
-                                              CUevent *event_out) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraphNode, CUevent *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphEventWaitNodeGetEvent");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hNode, event_out);
-}
-
-CUresult CUDAAPI cuGraphEventWaitNodeSetEvent(CUgraphNode hNode,
-                                              CUevent event) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraphNode, CUevent);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphEventWaitNodeSetEvent");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hNode, event);
-}
-
-CUresult CUDAAPI cuGraphAddExternalSemaphoresSignalNode(
-    CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies,
-    size_t numDependencies, const CUDA_EXT_SEM_SIGNAL_NODE_PARAMS *nodeParams) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUgraphNode *, CUgraph, const CUgraphNode *, size_t,
-                          const CUDA_EXT_SEM_SIGNAL_NODE_PARAMS *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cuGraphAddExternalSemaphoresSignalNode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(phGraphNode, hGraph, dependencies, numDependencies,
-                  nodeParams);
-}
-
-CUresult CUDAAPI cuGraphExternalSemaphoresSignalNodeGetParams(
-    CUgraphNode hNode, CUDA_EXT_SEM_SIGNAL_NODE_PARAMS *params_out) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUgraphNode, CUDA_EXT_SEM_SIGNAL_NODE_PARAMS *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cuGraphExternalSemaphoresSignalNodeGetParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hNode, params_out);
-}
-
-CUresult CUDAAPI cuGraphExternalSemaphoresSignalNodeSetParams(
-    CUgraphNode hNode, const CUDA_EXT_SEM_SIGNAL_NODE_PARAMS *nodeParams) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUgraphNode, const CUDA_EXT_SEM_SIGNAL_NODE_PARAMS *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cuGraphExternalSemaphoresSignalNodeSetParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hNode, nodeParams);
-}
-
-CUresult CUDAAPI cuGraphAddExternalSemaphoresWaitNode(
-    CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies,
-    size_t numDependencies, const CUDA_EXT_SEM_WAIT_NODE_PARAMS *nodeParams) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUgraphNode *, CUgraph, const CUgraphNode *, size_t,
-                          const CUDA_EXT_SEM_WAIT_NODE_PARAMS *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cuGraphAddExternalSemaphoresWaitNode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(phGraphNode, hGraph, dependencies, numDependencies,
-                  nodeParams);
-}
-
-CUresult CUDAAPI cuGraphExternalSemaphoresWaitNodeGetParams(
-    CUgraphNode hNode, CUDA_EXT_SEM_WAIT_NODE_PARAMS *params_out) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUgraphNode, CUDA_EXT_SEM_WAIT_NODE_PARAMS *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cuGraphExternalSemaphoresWaitNodeGetParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hNode, params_out);
-}
-
-CUresult CUDAAPI cuGraphExternalSemaphoresWaitNodeSetParams(
-    CUgraphNode hNode, const CUDA_EXT_SEM_WAIT_NODE_PARAMS *nodeParams) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUgraphNode, const CUDA_EXT_SEM_WAIT_NODE_PARAMS *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cuGraphExternalSemaphoresWaitNodeSetParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hNode, nodeParams);
-}
-
-CUresult CUDAAPI cuGraphAddBatchMemOpNode(
-    CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies,
-    size_t numDependencies, const CUDA_BATCH_MEM_OP_NODE_PARAMS *nodeParams) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUgraphNode *, CUgraph, const CUgraphNode *, size_t,
-                          const CUDA_BATCH_MEM_OP_NODE_PARAMS *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphAddBatchMemOpNode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(phGraphNode, hGraph, dependencies, numDependencies,
-                  nodeParams);
-}
-
-CUresult CUDAAPI cuGraphBatchMemOpNodeGetParams(
-    CUgraphNode hNode, CUDA_BATCH_MEM_OP_NODE_PARAMS *nodeParams_out) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUgraphNode, CUDA_BATCH_MEM_OP_NODE_PARAMS *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphBatchMemOpNodeGetParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hNode, nodeParams_out);
-}
-
-CUresult CUDAAPI cuGraphBatchMemOpNodeSetParams(
-    CUgraphNode hNode, const CUDA_BATCH_MEM_OP_NODE_PARAMS *nodeParams) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUgraphNode, const CUDA_BATCH_MEM_OP_NODE_PARAMS *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphBatchMemOpNodeSetParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hNode, nodeParams);
-}
-
-CUresult CUDAAPI cuGraphExecBatchMemOpNodeSetParams(
-    CUgraphExec hGraphExec, CUgraphNode hNode,
-    const CUDA_BATCH_MEM_OP_NODE_PARAMS *nodeParams) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraphExec, CUgraphNode,
-                                      const CUDA_BATCH_MEM_OP_NODE_PARAMS *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cuGraphExecBatchMemOpNodeSetParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hGraphExec, hNode, nodeParams);
-}
-
-CUresult CUDAAPI cuGraphAddMemAllocNode(
-    CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies,
-    size_t numDependencies, CUDA_MEM_ALLOC_NODE_PARAMS *nodeParams) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUgraphNode *, CUgraph, const CUgraphNode *, size_t,
-                          CUDA_MEM_ALLOC_NODE_PARAMS *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphAddMemAllocNode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(phGraphNode, hGraph, dependencies, numDependencies,
-                  nodeParams);
-}
-
-CUresult CUDAAPI cuGraphMemAllocNodeGetParams(
-    CUgraphNode hNode, CUDA_MEM_ALLOC_NODE_PARAMS *params_out) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUgraphNode, CUDA_MEM_ALLOC_NODE_PARAMS *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphMemAllocNodeGetParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hNode, params_out);
-}
-
-CUresult CUDAAPI cuGraphAddMemFreeNode(CUgraphNode *phGraphNode, CUgraph hGraph,
-                                       const CUgraphNode *dependencies,
-                                       size_t numDependencies,
-                                       CUdeviceptr dptr) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraphNode *, CUgraph,
-                                      const CUgraphNode *, size_t, CUdeviceptr);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphAddMemFreeNode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(phGraphNode, hGraph, dependencies, numDependencies, dptr);
-}
-
-CUresult CUDAAPI cuGraphMemFreeNodeGetParams(CUgraphNode hNode,
-                                             CUdeviceptr *dptr_out) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraphNode, CUdeviceptr *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphMemFreeNodeGetParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hNode, dptr_out);
-}
-
-CUresult CUDAAPI cuDeviceGraphMemTrim(CUdevice device) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdevice);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuDeviceGraphMemTrim");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(device);
-}
-
-CUresult CUDAAPI cuDeviceGetGraphMemAttribute(CUdevice device,
-                                              CUgraphMem_attribute attr,
-                                              void *value) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdevice, CUgraphMem_attribute, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuDeviceGetGraphMemAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(device, attr, value);
-}
-
-CUresult CUDAAPI cuDeviceSetGraphMemAttribute(CUdevice device,
-                                              CUgraphMem_attribute attr,
-                                              void *value) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdevice, CUgraphMem_attribute, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuDeviceSetGraphMemAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(device, attr, value);
-}
-
-CUresult CUDAAPI cuGraphClone(CUgraph *phGraphClone, CUgraph originalGraph) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraph *, CUgraph);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphClone");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(phGraphClone, originalGraph);
-}
-
-CUresult CUDAAPI cuGraphNodeFindInClone(CUgraphNode *phNode,
-                                        CUgraphNode hOriginalNode,
-                                        CUgraph hClonedGraph) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraphNode *, CUgraphNode, CUgraph);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphNodeFindInClone");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(phNode, hOriginalNode, hClonedGraph);
-}
-
-CUresult CUDAAPI cuGraphNodeGetType(CUgraphNode hNode, CUgraphNodeType *type) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraphNode, CUgraphNodeType *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphNodeGetType");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hNode, type);
-}
-
-CUresult CUDAAPI cuGraphGetNodes(CUgraph hGraph, CUgraphNode *nodes,
-                                 size_t *numNodes) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraph, CUgraphNode *, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphGetNodes");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hGraph, nodes, numNodes);
-}
-
-CUresult CUDAAPI cuGraphGetRootNodes(CUgraph hGraph, CUgraphNode *rootNodes,
-                                     size_t *numRootNodes) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraph, CUgraphNode *, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphGetRootNodes");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hGraph, rootNodes, numRootNodes);
-}
-
-CUresult CUDAAPI cuGraphGetEdges(CUgraph hGraph, CUgraphNode *from,
-                                 CUgraphNode *to, size_t *numEdges) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUgraph, CUgraphNode *, CUgraphNode *, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphGetEdges");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hGraph, from, to, numEdges);
-}
-
-CUresult CUDAAPI cuGraphNodeGetDependencies(CUgraphNode hNode,
-                                            CUgraphNode *dependencies,
-                                            size_t *numDependencies) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraphNode, CUgraphNode *, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphNodeGetDependencies");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hNode, dependencies, numDependencies);
-}
-
-CUresult CUDAAPI cuGraphNodeGetDependentNodes(CUgraphNode hNode,
-                                              CUgraphNode *dependentNodes,
-                                              size_t *numDependentNodes) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraphNode, CUgraphNode *, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphNodeGetDependentNodes");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hNode, dependentNodes, numDependentNodes);
-}
-
-CUresult CUDAAPI cuGraphAddDependencies(CUgraph hGraph, const CUgraphNode *from,
-                                        const CUgraphNode *to,
-                                        size_t numDependencies) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraph, const CUgraphNode *,
-                                      const CUgraphNode *, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphAddDependencies");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hGraph, from, to, numDependencies);
-}
-
-CUresult CUDAAPI cuGraphRemoveDependencies(CUgraph hGraph,
-                                           const CUgraphNode *from,
-                                           const CUgraphNode *to,
-                                           size_t numDependencies) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraph, const CUgraphNode *,
-                                      const CUgraphNode *, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphRemoveDependencies");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hGraph, from, to, numDependencies);
-}
-
-CUresult CUDAAPI cuGraphDestroyNode(CUgraphNode hNode) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraphNode);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphDestroyNode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hNode);
-}
-
-CUresult CUDAAPI cuGraphInstantiate(CUgraphExec *phGraphExec, CUgraph hGraph,
-                                    unsigned long long flags) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUgraphExec *, CUgraph, unsigned long long);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphInstantiateWithFlags");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(phGraphExec, hGraph, flags);
-}
-
-CUresult CUDAAPI
-cuGraphInstantiateWithParams(CUgraphExec *phGraphExec, CUgraph hGraph,
-                             CUDA_GRAPH_INSTANTIATE_PARAMS *instantiateParams) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraphExec *, CUgraph,
-                                      CUDA_GRAPH_INSTANTIATE_PARAMS *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphInstantiateWithParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(phGraphExec, hGraph, instantiateParams);
-}
-
-CUresult CUDAAPI cuGraphExecGetFlags(CUgraphExec hGraphExec,
-                                     cuuint64_t *flags) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraphExec, cuuint64_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphExecGetFlags");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hGraphExec, flags);
-}
-
-CUresult CUDAAPI
-cuGraphExecKernelNodeSetParams(CUgraphExec hGraphExec, CUgraphNode hNode,
-                               const CUDA_KERNEL_NODE_PARAMS *nodeParams) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraphExec, CUgraphNode,
-                                      const CUDA_KERNEL_NODE_PARAMS *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cuGraphExecKernelNodeSetParams_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hGraphExec, hNode, nodeParams);
-}
-
-CUresult CUDAAPI cuGraphExecMemcpyNodeSetParams(CUgraphExec hGraphExec,
-                                                CUgraphNode hNode,
-                                                const CUDA_MEMCPY3D *copyParams,
-                                                CUcontext ctx) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraphExec, CUgraphNode,
-                                      const CUDA_MEMCPY3D *, CUcontext);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphExecMemcpyNodeSetParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hGraphExec, hNode, copyParams, ctx);
-}
-
-CUresult CUDAAPI cuGraphExecMemsetNodeSetParams(
-    CUgraphExec hGraphExec, CUgraphNode hNode,
-    const CUDA_MEMSET_NODE_PARAMS *memsetParams, CUcontext ctx) {
-  using FuncPtr = CUresult(CUDAAPI *)(
-      CUgraphExec, CUgraphNode, const CUDA_MEMSET_NODE_PARAMS *, CUcontext);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphExecMemsetNodeSetParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hGraphExec, hNode, memsetParams, ctx);
-}
-
-CUresult CUDAAPI
-cuGraphExecHostNodeSetParams(CUgraphExec hGraphExec, CUgraphNode hNode,
-                             const CUDA_HOST_NODE_PARAMS *nodeParams) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraphExec, CUgraphNode,
-                                      const CUDA_HOST_NODE_PARAMS *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphExecHostNodeSetParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hGraphExec, hNode, nodeParams);
-}
-
-CUresult CUDAAPI cuGraphExecChildGraphNodeSetParams(CUgraphExec hGraphExec,
-                                                    CUgraphNode hNode,
-                                                    CUgraph childGraph) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraphExec, CUgraphNode, CUgraph);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cuGraphExecChildGraphNodeSetParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hGraphExec, hNode, childGraph);
-}
-
-CUresult CUDAAPI cuGraphExecEventRecordNodeSetEvent(CUgraphExec hGraphExec,
-                                                    CUgraphNode hNode,
-                                                    CUevent event) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraphExec, CUgraphNode, CUevent);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cuGraphExecEventRecordNodeSetEvent");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hGraphExec, hNode, event);
-}
-
-CUresult CUDAAPI cuGraphExecEventWaitNodeSetEvent(CUgraphExec hGraphExec,
-                                                  CUgraphNode hNode,
-                                                  CUevent event) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraphExec, CUgraphNode, CUevent);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cuGraphExecEventWaitNodeSetEvent");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hGraphExec, hNode, event);
-}
-
-CUresult CUDAAPI cuGraphExecExternalSemaphoresSignalNodeSetParams(
-    CUgraphExec hGraphExec, CUgraphNode hNode,
-    const CUDA_EXT_SEM_SIGNAL_NODE_PARAMS *nodeParams) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraphExec, CUgraphNode,
-                                      const CUDA_EXT_SEM_SIGNAL_NODE_PARAMS *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cuGraphExecExternalSemaphoresSignalNodeSetParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hGraphExec, hNode, nodeParams);
-}
-
-CUresult CUDAAPI cuGraphExecExternalSemaphoresWaitNodeSetParams(
-    CUgraphExec hGraphExec, CUgraphNode hNode,
-    const CUDA_EXT_SEM_WAIT_NODE_PARAMS *nodeParams) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraphExec, CUgraphNode,
-                                      const CUDA_EXT_SEM_WAIT_NODE_PARAMS *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cuGraphExecExternalSemaphoresWaitNodeSetParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hGraphExec, hNode, nodeParams);
-}
-
-CUresult CUDAAPI cuGraphNodeSetEnabled(CUgraphExec hGraphExec,
-                                       CUgraphNode hNode,
-                                       unsigned int isEnabled) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraphExec, CUgraphNode, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphNodeSetEnabled");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hGraphExec, hNode, isEnabled);
-}
-
-CUresult CUDAAPI cuGraphNodeGetEnabled(CUgraphExec hGraphExec,
-                                       CUgraphNode hNode,
-                                       unsigned int *isEnabled) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraphExec, CUgraphNode, unsigned int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphNodeGetEnabled");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hGraphExec, hNode, isEnabled);
-}
-
-CUresult CUDAAPI cuGraphUpload(CUgraphExec hGraphExec, CUstream hStream) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraphExec, CUstream);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphUpload");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hGraphExec, hStream);
-}
-
-CUresult CUDAAPI cuGraphLaunch(CUgraphExec hGraphExec, CUstream hStream) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraphExec, CUstream);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphLaunch");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hGraphExec, hStream);
-}
-
-CUresult CUDAAPI cuGraphExecDestroy(CUgraphExec hGraphExec) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraphExec);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphExecDestroy");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hGraphExec);
-}
-
-CUresult CUDAAPI cuGraphDestroy(CUgraph hGraph) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraph);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphDestroy");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hGraph);
-}
-
-CUresult CUDAAPI cuGraphExecUpdate(CUgraphExec hGraphExec, CUgraph hGraph,
-                                   CUgraphExecUpdateResultInfo *resultInfo) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUgraphExec, CUgraph, CUgraphExecUpdateResultInfo *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphExecUpdate_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hGraphExec, hGraph, resultInfo);
-}
-
-CUresult CUDAAPI cuGraphKernelNodeCopyAttributes(CUgraphNode dst,
-                                                 CUgraphNode src) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraphNode, CUgraphNode);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphKernelNodeCopyAttributes");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dst, src);
-}
-
-CUresult CUDAAPI
-cuGraphKernelNodeGetAttribute(CUgraphNode hNode, CUkernelNodeAttrID attr,
-                              CUkernelNodeAttrValue *value_out) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraphNode, CUkernelNodeAttrID,
-                                      CUkernelNodeAttrValue *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphKernelNodeGetAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hNode, attr, value_out);
-}
-
-CUresult CUDAAPI
-cuGraphKernelNodeSetAttribute(CUgraphNode hNode, CUkernelNodeAttrID attr,
-                              const CUkernelNodeAttrValue *value) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraphNode, CUkernelNodeAttrID,
-                                      const CUkernelNodeAttrValue *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphKernelNodeSetAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hNode, attr, value);
-}
-
-CUresult CUDAAPI cuGraphDebugDotPrint(CUgraph hGraph, const char *path,
-                                      unsigned int flags) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraph, const char *, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphDebugDotPrint");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hGraph, path, flags);
-}
-
-CUresult CUDAAPI cuUserObjectCreate(CUuserObject *object_out, void *ptr,
-                                    CUhostFn destroy,
-                                    unsigned int initialRefcount,
-                                    unsigned int flags) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUuserObject *, void *, CUhostFn,
-                                      unsigned int, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuUserObjectCreate");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(object_out, ptr, destroy, initialRefcount, flags);
-}
-
-CUresult CUDAAPI cuUserObjectRetain(CUuserObject object, unsigned int count) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUuserObject, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuUserObjectRetain");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(object, count);
-}
-
-CUresult CUDAAPI cuUserObjectRelease(CUuserObject object, unsigned int count) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUuserObject, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuUserObjectRelease");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(object, count);
-}
-
-CUresult CUDAAPI cuGraphRetainUserObject(CUgraph graph, CUuserObject object,
-                                         unsigned int count,
-                                         unsigned int flags) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUgraph, CUuserObject, unsigned int, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphRetainUserObject");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(graph, object, count, flags);
-}
-
-CUresult CUDAAPI cuGraphReleaseUserObject(CUgraph graph, CUuserObject object,
-                                          unsigned int count) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraph, CUuserObject, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphReleaseUserObject");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(graph, object, count);
-}
-
-CUresult CUDAAPI cuOccupancyMaxActiveBlocksPerMultiprocessor(
-    int *numBlocks, CUfunction func, int blockSize, size_t dynamicSMemSize) {
-  using FuncPtr = CUresult(CUDAAPI *)(int *, CUfunction, int, size_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cuOccupancyMaxActiveBlocksPerMultiprocessor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(numBlocks, func, blockSize, dynamicSMemSize);
-}
-
-CUresult CUDAAPI cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(
-    int *numBlocks, CUfunction func, int blockSize, size_t dynamicSMemSize,
-    unsigned int flags) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(int *, CUfunction, int, size_t, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>(
-      "cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(numBlocks, func, blockSize, dynamicSMemSize, flags);
-}
-
-CUresult CUDAAPI cuOccupancyMaxPotentialBlockSize(
-    int *minGridSize, int *blockSize, CUfunction func,
-    CUoccupancyB2DSize blockSizeToDynamicSMemSize, size_t dynamicSMemSize,
-    int blockSizeLimit) {
-  using FuncPtr = CUresult(CUDAAPI *)(int *, int *, CUfunction,
-                                      CUoccupancyB2DSize, size_t, int);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cuOccupancyMaxPotentialBlockSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(minGridSize, blockSize, func, blockSizeToDynamicSMemSize,
-                  dynamicSMemSize, blockSizeLimit);
-}
-
-CUresult CUDAAPI cuOccupancyMaxPotentialBlockSizeWithFlags(
-    int *minGridSize, int *blockSize, CUfunction func,
-    CUoccupancyB2DSize blockSizeToDynamicSMemSize, size_t dynamicSMemSize,
-    int blockSizeLimit, unsigned int flags) {
-  using FuncPtr = CUresult(CUDAAPI *)(
-      int *, int *, CUfunction, CUoccupancyB2DSize, size_t, int, unsigned int);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cuOccupancyMaxPotentialBlockSizeWithFlags");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(minGridSize, blockSize, func, blockSizeToDynamicSMemSize,
-                  dynamicSMemSize, blockSizeLimit, flags);
-}
-
-CUresult CUDAAPI cuOccupancyAvailableDynamicSMemPerBlock(
-    size_t *dynamicSmemSize, CUfunction func, int numBlocks, int blockSize) {
-  using FuncPtr = CUresult(CUDAAPI *)(size_t *, CUfunction, int, int);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cuOccupancyAvailableDynamicSMemPerBlock");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dynamicSmemSize, func, numBlocks, blockSize);
-}
-
-CUresult CUDAAPI cuOccupancyMaxPotentialClusterSize(
-    int *clusterSize, CUfunction func, const CUlaunchConfig *config) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(int *, CUfunction, const CUlaunchConfig *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cuOccupancyMaxPotentialClusterSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(clusterSize, func, config);
-}
-
-CUresult CUDAAPI cuOccupancyMaxActiveClusters(int *numClusters, CUfunction func,
-                                              const CUlaunchConfig *config) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(int *, CUfunction, const CUlaunchConfig *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuOccupancyMaxActiveClusters");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(numClusters, func, config);
-}
-
-__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetArray(CUtexref hTexRef,
-                                                    CUarray hArray,
-                                                    unsigned int Flags) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUtexref, CUarray, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexRefSetArray");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hTexRef, hArray, Flags);
-}
-
-__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetMipmappedArray(
-    CUtexref hTexRef, CUmipmappedArray hMipmappedArray, unsigned int Flags) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUtexref, CUmipmappedArray, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexRefSetMipmappedArray");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hTexRef, hMipmappedArray, Flags);
-}
-
-__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetAddress(size_t *ByteOffset,
-                                                      CUtexref hTexRef,
-                                                      CUdeviceptr dptr,
-                                                      size_t bytes) {
-  using FuncPtr = CUresult(CUDAAPI *)(size_t *, CUtexref, CUdeviceptr, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexRefSetAddress_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(ByteOffset, hTexRef, dptr, bytes);
-}
-
-__CUDA_DEPRECATED CUresult CUDAAPI
-cuTexRefSetAddress2D(CUtexref hTexRef, const CUDA_ARRAY_DESCRIPTOR *desc,
-                     CUdeviceptr dptr, size_t Pitch) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUtexref, const CUDA_ARRAY_DESCRIPTOR *,
-                                      CUdeviceptr, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexRefSetAddress2D_v3");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hTexRef, desc, dptr, Pitch);
-}
-
-__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetFormat(CUtexref hTexRef,
-                                                     CUarray_format fmt,
-                                                     int NumPackedComponents) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUtexref, CUarray_format, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexRefSetFormat");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hTexRef, fmt, NumPackedComponents);
-}
-
-__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetAddressMode(CUtexref hTexRef,
-                                                          int dim,
-                                                          CUaddress_mode am) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUtexref, int, CUaddress_mode);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexRefSetAddressMode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hTexRef, dim, am);
-}
-
-__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetFilterMode(CUtexref hTexRef,
-                                                         CUfilter_mode fm) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUtexref, CUfilter_mode);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexRefSetFilterMode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hTexRef, fm);
-}
-
-__CUDA_DEPRECATED CUresult CUDAAPI
-cuTexRefSetMipmapFilterMode(CUtexref hTexRef, CUfilter_mode fm) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUtexref, CUfilter_mode);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexRefSetMipmapFilterMode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hTexRef, fm);
-}
-
-__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetMipmapLevelBias(CUtexref hTexRef,
-                                                              float bias) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUtexref, float);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexRefSetMipmapLevelBias");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hTexRef, bias);
-}
-
-__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetMipmapLevelClamp(
-    CUtexref hTexRef, float minMipmapLevelClamp, float maxMipmapLevelClamp) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUtexref, float, float);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexRefSetMipmapLevelClamp");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hTexRef, minMipmapLevelClamp, maxMipmapLevelClamp);
-}
-
-__CUDA_DEPRECATED CUresult CUDAAPI
-cuTexRefSetMaxAnisotropy(CUtexref hTexRef, unsigned int maxAniso) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUtexref, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexRefSetMaxAnisotropy");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hTexRef, maxAniso);
-}
-
-__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetBorderColor(CUtexref hTexRef,
-                                                          float *pBorderColor) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUtexref, float *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexRefSetBorderColor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hTexRef, pBorderColor);
-}
-
-__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetFlags(CUtexref hTexRef,
-                                                    unsigned int Flags) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUtexref, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexRefSetFlags");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hTexRef, Flags);
-}
-
-__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetAddress(CUdeviceptr *pdptr,
-                                                      CUtexref hTexRef) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUdeviceptr *, CUtexref);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexRefGetAddress_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pdptr, hTexRef);
-}
-
-__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetArray(CUarray *phArray,
-                                                    CUtexref hTexRef) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUarray *, CUtexref);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexRefGetArray");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(phArray, hTexRef);
-}
-
-__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetMipmappedArray(
-    CUmipmappedArray *phMipmappedArray, CUtexref hTexRef) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUmipmappedArray *, CUtexref);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexRefGetMipmappedArray");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(phMipmappedArray, hTexRef);
-}
-
-__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetAddressMode(CUaddress_mode *pam,
-                                                          CUtexref hTexRef,
-                                                          int dim) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUaddress_mode *, CUtexref, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexRefGetAddressMode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pam, hTexRef, dim);
-}
-
-__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetFilterMode(CUfilter_mode *pfm,
-                                                         CUtexref hTexRef) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUfilter_mode *, CUtexref);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexRefGetFilterMode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pfm, hTexRef);
-}
-
-__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetFormat(CUarray_format *pFormat,
-                                                     int *pNumChannels,
-                                                     CUtexref hTexRef) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUarray_format *, int *, CUtexref);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexRefGetFormat");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pFormat, pNumChannels, hTexRef);
-}
-
-__CUDA_DEPRECATED CUresult CUDAAPI
-cuTexRefGetMipmapFilterMode(CUfilter_mode *pfm, CUtexref hTexRef) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUfilter_mode *, CUtexref);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexRefGetMipmapFilterMode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pfm, hTexRef);
-}
-
-__CUDA_DEPRECATED CUresult CUDAAPI
-cuTexRefGetMipmapLevelBias(float *pbias, CUtexref hTexRef) {
-  using FuncPtr = CUresult(CUDAAPI *)(float *, CUtexref);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexRefGetMipmapLevelBias");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pbias, hTexRef);
-}
-
-__CUDA_DEPRECATED CUresult CUDAAPI
-cuTexRefGetMipmapLevelClamp(float *pminMipmapLevelClamp,
-                            float *pmaxMipmapLevelClamp, CUtexref hTexRef) {
-  using FuncPtr = CUresult(CUDAAPI *)(float *, float *, CUtexref);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexRefGetMipmapLevelClamp");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pminMipmapLevelClamp, pmaxMipmapLevelClamp, hTexRef);
-}
-
-__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetMaxAnisotropy(int *pmaxAniso,
-                                                            CUtexref hTexRef) {
-  using FuncPtr = CUresult(CUDAAPI *)(int *, CUtexref);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexRefGetMaxAnisotropy");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pmaxAniso, hTexRef);
-}
-
-__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetBorderColor(float *pBorderColor,
-                                                          CUtexref hTexRef) {
-  using FuncPtr = CUresult(CUDAAPI *)(float *, CUtexref);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexRefGetBorderColor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pBorderColor, hTexRef);
-}
-
-__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetFlags(unsigned int *pFlags,
-                                                    CUtexref hTexRef) {
-  using FuncPtr = CUresult(CUDAAPI *)(unsigned int *, CUtexref);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexRefGetFlags");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pFlags, hTexRef);
-}
-
-__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefCreate(CUtexref *pTexRef) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUtexref *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexRefCreate");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pTexRef);
-}
-
-__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefDestroy(CUtexref hTexRef) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUtexref);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexRefDestroy");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hTexRef);
-}
-
-__CUDA_DEPRECATED CUresult CUDAAPI cuSurfRefSetArray(CUsurfref hSurfRef,
-                                                     CUarray hArray,
-                                                     unsigned int Flags) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUsurfref, CUarray, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuSurfRefSetArray");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hSurfRef, hArray, Flags);
-}
-
-__CUDA_DEPRECATED CUresult CUDAAPI cuSurfRefGetArray(CUarray *phArray,
-                                                     CUsurfref hSurfRef) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUarray *, CUsurfref);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuSurfRefGetArray");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(phArray, hSurfRef);
-}
-
-CUresult CUDAAPI
-cuTexObjectCreate(CUtexObject *pTexObject, const CUDA_RESOURCE_DESC *pResDesc,
-                  const CUDA_TEXTURE_DESC *pTexDesc,
-                  const CUDA_RESOURCE_VIEW_DESC *pResViewDesc) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUtexObject *, const CUDA_RESOURCE_DESC *,
-                                      const CUDA_TEXTURE_DESC *,
-                                      const CUDA_RESOURCE_VIEW_DESC *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexObjectCreate");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pTexObject, pResDesc, pTexDesc, pResViewDesc);
-}
-
-CUresult CUDAAPI cuTexObjectDestroy(CUtexObject texObject) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUtexObject);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexObjectDestroy");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(texObject);
-}
-
-CUresult CUDAAPI cuTexObjectGetResourceDesc(CUDA_RESOURCE_DESC *pResDesc,
-                                            CUtexObject texObject) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUDA_RESOURCE_DESC *, CUtexObject);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexObjectGetResourceDesc");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pResDesc, texObject);
-}
-
-CUresult CUDAAPI cuTexObjectGetTextureDesc(CUDA_TEXTURE_DESC *pTexDesc,
-                                           CUtexObject texObject) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUDA_TEXTURE_DESC *, CUtexObject);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexObjectGetTextureDesc");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pTexDesc, texObject);
-}
-
-CUresult CUDAAPI cuTexObjectGetResourceViewDesc(
-    CUDA_RESOURCE_VIEW_DESC *pResViewDesc, CUtexObject texObject) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUDA_RESOURCE_VIEW_DESC *, CUtexObject);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexObjectGetResourceViewDesc");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pResViewDesc, texObject);
-}
-
-CUresult CUDAAPI cuSurfObjectCreate(CUsurfObject *pSurfObject,
-                                    const CUDA_RESOURCE_DESC *pResDesc) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUsurfObject *, const CUDA_RESOURCE_DESC *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuSurfObjectCreate");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pSurfObject, pResDesc);
-}
-
-CUresult CUDAAPI cuSurfObjectDestroy(CUsurfObject surfObject) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUsurfObject);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuSurfObjectDestroy");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(surfObject);
-}
-
-CUresult CUDAAPI cuSurfObjectGetResourceDesc(CUDA_RESOURCE_DESC *pResDesc,
-                                             CUsurfObject surfObject) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUDA_RESOURCE_DESC *, CUsurfObject);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuSurfObjectGetResourceDesc");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pResDesc, surfObject);
-}
-
-CUresult CUDAAPI cuTensorMapEncodeTiled(
-    CUtensorMap *tensorMap, CUtensorMapDataType tensorDataType,
-    cuuint32_t tensorRank, void *globalAddress, const cuuint64_t *globalDim,
-    const cuuint64_t *globalStrides, const cuuint32_t *boxDim,
-    const cuuint32_t *elementStrides, CUtensorMapInterleave interleave,
-    CUtensorMapSwizzle swizzle, CUtensorMapL2promotion l2Promotion,
-    CUtensorMapFloatOOBfill oobFill) {
-  using FuncPtr = CUresult(CUDAAPI *)(
-      CUtensorMap *, CUtensorMapDataType, cuuint32_t, void *,
-      const cuuint64_t *, const cuuint64_t *, const cuuint32_t *,
-      const cuuint32_t *, CUtensorMapInterleave, CUtensorMapSwizzle,
-      CUtensorMapL2promotion, CUtensorMapFloatOOBfill);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuTensorMapEncodeTiled");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(tensorMap, tensorDataType, tensorRank, globalAddress,
-                  globalDim, globalStrides, boxDim, elementStrides, interleave,
-                  swizzle, l2Promotion, oobFill);
-}
-
-CUresult CUDAAPI cuTensorMapEncodeIm2col(
-    CUtensorMap *tensorMap, CUtensorMapDataType tensorDataType,
-    cuuint32_t tensorRank, void *globalAddress, const cuuint64_t *globalDim,
-    const cuuint64_t *globalStrides, const int *pixelBoxLowerCorner,
-    const int *pixelBoxUpperCorner, cuuint32_t channelsPerPixel,
-    cuuint32_t pixelsPerColumn, const cuuint32_t *elementStrides,
-    CUtensorMapInterleave interleave, CUtensorMapSwizzle swizzle,
-    CUtensorMapL2promotion l2Promotion, CUtensorMapFloatOOBfill oobFill) {
-  using FuncPtr = CUresult(CUDAAPI *)(
-      CUtensorMap *, CUtensorMapDataType, cuuint32_t, void *,
-      const cuuint64_t *, const cuuint64_t *, const int *, const int *,
-      cuuint32_t, cuuint32_t, const cuuint32_t *, CUtensorMapInterleave,
-      CUtensorMapSwizzle, CUtensorMapL2promotion, CUtensorMapFloatOOBfill);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuTensorMapEncodeIm2col");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(tensorMap, tensorDataType, tensorRank, globalAddress,
-                  globalDim, globalStrides, pixelBoxLowerCorner,
-                  pixelBoxUpperCorner, channelsPerPixel, pixelsPerColumn,
-                  elementStrides, interleave, swizzle, l2Promotion, oobFill);
-}
-
-CUresult CUDAAPI cuTensorMapReplaceAddress(CUtensorMap *tensorMap,
-                                           void *globalAddress) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUtensorMap *, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuTensorMapReplaceAddress");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(tensorMap, globalAddress);
-}
-
-CUresult CUDAAPI cuDeviceCanAccessPeer(int *canAccessPeer, CUdevice dev,
-                                       CUdevice peerDev) {
-  using FuncPtr = CUresult(CUDAAPI *)(int *, CUdevice, CUdevice);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuDeviceCanAccessPeer");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(canAccessPeer, dev, peerDev);
-}
-
-CUresult CUDAAPI cuCtxEnablePeerAccess(CUcontext peerContext,
-                                       unsigned int Flags) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUcontext, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuCtxEnablePeerAccess");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(peerContext, Flags);
-}
-
-CUresult CUDAAPI cuCtxDisablePeerAccess(CUcontext peerContext) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUcontext);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuCtxDisablePeerAccess");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(peerContext);
-}
-
-CUresult CUDAAPI cuDeviceGetP2PAttribute(int *value,
-                                         CUdevice_P2PAttribute attrib,
-                                         CUdevice srcDevice,
-                                         CUdevice dstDevice) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(int *, CUdevice_P2PAttribute, CUdevice, CUdevice);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuDeviceGetP2PAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(value, attrib, srcDevice, dstDevice);
-}
-
-CUresult CUDAAPI cuGraphicsUnregisterResource(CUgraphicsResource resource) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraphicsResource);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphicsUnregisterResource");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(resource);
-}
-
-CUresult CUDAAPI cuGraphicsSubResourceGetMappedArray(
-    CUarray *pArray, CUgraphicsResource resource, unsigned int arrayIndex,
-    unsigned int mipLevel) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUarray *, CUgraphicsResource,
-                                      unsigned int, unsigned int);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cuGraphicsSubResourceGetMappedArray");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pArray, resource, arrayIndex, mipLevel);
-}
-
-CUresult CUDAAPI cuGraphicsResourceGetMappedMipmappedArray(
-    CUmipmappedArray *pMipmappedArray, CUgraphicsResource resource) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUmipmappedArray *, CUgraphicsResource);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cuGraphicsResourceGetMappedMipmappedArray");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pMipmappedArray, resource);
-}
-
-CUresult CUDAAPI cuGraphicsResourceGetMappedPointer(
-    CUdeviceptr *pDevPtr, size_t *pSize, CUgraphicsResource resource) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(CUdeviceptr *, size_t *, CUgraphicsResource);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cuGraphicsResourceGetMappedPointer_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pDevPtr, pSize, resource);
-}
-
-CUresult CUDAAPI cuGraphicsResourceSetMapFlags(CUgraphicsResource resource,
-                                               unsigned int flags) {
-  using FuncPtr = CUresult(CUDAAPI *)(CUgraphicsResource, unsigned int);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cuGraphicsResourceSetMapFlags_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(resource, flags);
-}
-
-CUresult CUDAAPI cuGraphicsMapResources(unsigned int count,
-                                        CUgraphicsResource *resources,
-                                        CUstream hStream) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(unsigned int, CUgraphicsResource *, CUstream);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphicsMapResources");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(count, resources, hStream);
-}
-
-CUresult CUDAAPI cuGraphicsUnmapResources(unsigned int count,
-                                          CUgraphicsResource *resources,
-                                          CUstream hStream) {
-  using FuncPtr =
-      CUresult(CUDAAPI *)(unsigned int, CUgraphicsResource *, CUstream);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphicsUnmapResources");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(count, resources, hStream);
-}
-
-CUresult CUDAAPI cuGetProcAddress(
-    const char *symbol, void **pfn, int cudaVersion, cuuint64_t flags,
-    CUdriverProcAddressQueryResult *symbolStatus) {
-  using FuncPtr = CUresult(CUDAAPI *)(const char *, void **, int, cuuint64_t,
-                                      CUdriverProcAddressQueryResult *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGetProcAddress_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(symbol, pfn, cudaVersion, flags, symbolStatus);
-}
-
-CUresult CUDAAPI cuGetExportTable(const void **ppExportTable,
-                                  const CUuuid *pExportTableId) {
-  using FuncPtr = CUresult(CUDAAPI *)(const void **, const CUuuid *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuGetExportTable");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(ppExportTable, pExportTableId);
-}
-
-}  // extern "C"
diff --git a/third_party/xla/third_party/tsl/tsl/cuda/cuda_runtime_10_0.inc b/third_party/xla/third_party/tsl/tsl/cuda/cuda_runtime_10_0.inc
deleted file mode 100644
index 6810c05d679a2f..00000000000000
--- a/third_party/xla/third_party/tsl/tsl/cuda/cuda_runtime_10_0.inc
+++ /dev/null
@@ -1,1846 +0,0 @@
-// Auto-generated, do not edit.
-
-extern "C" {
-
-extern __host__ cudaError_t CUDARTAPI cudaDeviceReset(void) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)();
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaDeviceReset");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr();
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaDeviceSynchronize(void) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)();
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaDeviceSynchronize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr();
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaDeviceSetLimit(enum cudaLimit limit,
-                                                         size_t value) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(enum cudaLimit, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaDeviceSetLimit");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(limit, value);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaDeviceGetLimit(size_t *pValue, enum cudaLimit limit) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(size_t *, enum cudaLimit);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaDeviceGetLimit");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pValue, limit);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaDeviceGetCacheConfig(enum cudaFuncCache *pCacheConfig) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(enum cudaFuncCache *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaDeviceGetCacheConfig");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pCacheConfig);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaDeviceGetStreamPriorityRange(int *leastPriority, int *greatestPriority) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(int *, int *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaDeviceGetStreamPriorityRange");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(leastPriority, greatestPriority);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaDeviceSetCacheConfig(enum cudaFuncCache cacheConfig) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(enum cudaFuncCache);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaDeviceSetCacheConfig");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(cacheConfig);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaDeviceGetSharedMemConfig(enum cudaSharedMemConfig *pConfig) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(enum cudaSharedMemConfig *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaDeviceGetSharedMemConfig");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pConfig);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaDeviceSetSharedMemConfig(enum cudaSharedMemConfig config) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(enum cudaSharedMemConfig);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaDeviceSetSharedMemConfig");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(config);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaDeviceGetByPCIBusId(int *device, const char *pciBusId) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(int *, const char *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaDeviceGetByPCIBusId");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(device, pciBusId);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaDeviceGetPCIBusId(char *pciBusId,
-                                                            int len,
-                                                            int device) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(char *, int, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaDeviceGetPCIBusId");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pciBusId, len, device);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaIpcGetEventHandle(cudaIpcEventHandle_t *handle, cudaEvent_t event) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaIpcEventHandle_t *, cudaEvent_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaIpcGetEventHandle");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, event);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaIpcOpenEventHandle(cudaEvent_t *event, cudaIpcEventHandle_t handle) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaEvent_t *, cudaIpcEventHandle_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaIpcOpenEventHandle");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(event, handle);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaIpcGetMemHandle(cudaIpcMemHandle_t *handle, void *devPtr) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaIpcMemHandle_t *, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaIpcGetMemHandle");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, devPtr);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaIpcOpenMemHandle(
-    void **devPtr, cudaIpcMemHandle_t handle, unsigned int flags) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(void **, cudaIpcMemHandle_t, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaIpcOpenMemHandle");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(devPtr, handle, flags);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaIpcCloseMemHandle(void *devPtr) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaIpcCloseMemHandle");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(devPtr);
-}
-
-extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaThreadExit(void) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)();
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaThreadExit");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr();
-}
-
-extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI
-cudaThreadSynchronize(void) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)();
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaThreadSynchronize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr();
-}
-
-extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI
-cudaThreadSetLimit(enum cudaLimit limit, size_t value) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(enum cudaLimit, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaThreadSetLimit");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(limit, value);
-}
-
-extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI
-cudaThreadGetLimit(size_t *pValue, enum cudaLimit limit) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(size_t *, enum cudaLimit);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaThreadGetLimit");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pValue, limit);
-}
-
-extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI
-cudaThreadGetCacheConfig(enum cudaFuncCache *pCacheConfig) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(enum cudaFuncCache *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaThreadGetCacheConfig");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pCacheConfig);
-}
-
-extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI
-cudaThreadSetCacheConfig(enum cudaFuncCache cacheConfig) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(enum cudaFuncCache);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaThreadSetCacheConfig");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(cacheConfig);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaGetLastError(void) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)();
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGetLastError");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr();
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaPeekAtLastError(void) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)();
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaPeekAtLastError");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr();
-}
-
-extern __host__ __cudart_builtin__ const char *CUDARTAPI
-cudaGetErrorName(cudaError_t error) {
-  using FuncPtr = const char *(CUDARTAPI *)(cudaError_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGetErrorName");
-  if (!func_ptr) return "cudaGetErrorName symbol not found.";
-  return func_ptr(error);
-}
-
-extern __host__ __cudart_builtin__ const char *CUDARTAPI
-cudaGetErrorString(cudaError_t error) {
-  using FuncPtr = const char *(CUDARTAPI *)(cudaError_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGetErrorString");
-  if (!func_ptr) return "cudaGetErrorString symbol not found.";
-  return func_ptr(error);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaGetDeviceCount(int *count) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGetDeviceCount");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(count);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaGetDeviceProperties(struct cudaDeviceProp *prop, int device) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(struct cudaDeviceProp *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGetDeviceProperties");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(prop, device);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaDeviceGetAttribute(int *value, enum cudaDeviceAttr attr, int device) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(int *, enum cudaDeviceAttr, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaDeviceGetAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(value, attr, device);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaDeviceGetP2PAttribute(int *value, enum cudaDeviceP2PAttr attr,
-                          int srcDevice, int dstDevice) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(int *, enum cudaDeviceP2PAttr, int, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaDeviceGetP2PAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(value, attr, srcDevice, dstDevice);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaChooseDevice(int *device, const struct cudaDeviceProp *prop) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(int *, const struct cudaDeviceProp *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaChooseDevice");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(device, prop);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaSetDevice(int device) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaSetDevice");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(device);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaGetDevice(int *device) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGetDevice");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(device);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaSetValidDevices(int *device_arr,
-                                                          int len) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(int *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaSetValidDevices");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(device_arr, len);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaSetDeviceFlags(unsigned int flags) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaSetDeviceFlags");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(flags);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGetDeviceFlags(unsigned int *flags) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(unsigned int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGetDeviceFlags");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(flags);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaStreamCreate(cudaStream_t *pStream) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaStream_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaStreamCreate");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pStream);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaStreamCreateWithFlags(cudaStream_t *pStream, unsigned int flags) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaStream_t *, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaStreamCreateWithFlags");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pStream, flags);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaStreamCreateWithPriority(cudaStream_t *pStream, unsigned int flags,
-                             int priority) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaStream_t *, unsigned int, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaStreamCreateWithPriority");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pStream, flags, priority);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaStreamGetPriority(cudaStream_t hStream, int *priority) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaStream_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaStreamGetPriority");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hStream, priority);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaStreamGetFlags(cudaStream_t hStream, unsigned int *flags) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaStream_t, unsigned int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaStreamGetFlags");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hStream, flags);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaStreamDestroy(cudaStream_t stream) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaStreamDestroy");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(stream);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamWaitEvent(
-    cudaStream_t stream, cudaEvent_t event, unsigned int flags) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaStream_t, cudaEvent_t, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaStreamWaitEvent");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(stream, event, flags);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaStreamAddCallback(cudaStream_t stream, cudaStreamCallback_t callback,
-                      void *userData, unsigned int flags) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaStream_t, cudaStreamCallback_t,
-                                           void *, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaStreamAddCallback");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(stream, callback, userData, flags);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaStreamSynchronize(cudaStream_t stream) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaStreamSynchronize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(stream);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaStreamQuery(cudaStream_t stream) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaStreamQuery");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(stream);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaStreamAttachMemAsync(cudaStream_t stream, void *devPtr,
-                         size_t length __dv(0),
-                         unsigned int flags __dv(cudaMemAttachSingle)) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaStream_t, void *, size_t, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaStreamAttachMemAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(stream, devPtr, length, flags);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaStreamBeginCapture(cudaStream_t stream) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaStreamBeginCapture");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(stream);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaStreamEndCapture(cudaStream_t stream, cudaGraph_t *pGraph) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaStream_t, cudaGraph_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaStreamEndCapture");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(stream, pGraph);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaStreamIsCapturing(
-    cudaStream_t stream, enum cudaStreamCaptureStatus *pCaptureStatus) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaStream_t, enum cudaStreamCaptureStatus *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaStreamIsCapturing");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(stream, pCaptureStatus);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaEventCreate(cudaEvent_t *event) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaEvent_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaEventCreate");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(event);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaEventCreateWithFlags(cudaEvent_t *event, unsigned int flags) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaEvent_t *, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaEventCreateWithFlags");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(event, flags);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaEventRecord(cudaEvent_t event, cudaStream_t stream __dv(0)) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaEvent_t, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaEventRecord");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(event, stream);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaEventQuery(cudaEvent_t event) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaEvent_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaEventQuery");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(event);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaEventSynchronize(cudaEvent_t event) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaEvent_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaEventSynchronize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(event);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaEventDestroy(cudaEvent_t event) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaEvent_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaEventDestroy");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(event);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaEventElapsedTime(float *ms,
-                                                           cudaEvent_t start,
-                                                           cudaEvent_t end) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(float *, cudaEvent_t, cudaEvent_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaEventElapsedTime");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(ms, start, end);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaImportExternalMemory(
-    cudaExternalMemory_t *extMem_out,
-    const struct cudaExternalMemoryHandleDesc *memHandleDesc) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(
-      cudaExternalMemory_t *, const struct cudaExternalMemoryHandleDesc *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaImportExternalMemory");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(extMem_out, memHandleDesc);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaExternalMemoryGetMappedBuffer(
-    void **devPtr, cudaExternalMemory_t extMem,
-    const struct cudaExternalMemoryBufferDesc *bufferDesc) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(void **, cudaExternalMemory_t,
-                               const struct cudaExternalMemoryBufferDesc *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaExternalMemoryGetMappedBuffer");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(devPtr, extMem, bufferDesc);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaExternalMemoryGetMappedMipmappedArray(
-    cudaMipmappedArray_t *mipmap, cudaExternalMemory_t extMem,
-    const struct cudaExternalMemoryMipmappedArrayDesc *mipmapDesc) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(
-      cudaMipmappedArray_t *, cudaExternalMemory_t,
-      const struct cudaExternalMemoryMipmappedArrayDesc *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaExternalMemoryGetMappedMipmappedArray");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(mipmap, extMem, mipmapDesc);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaDestroyExternalMemory(cudaExternalMemory_t extMem) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaExternalMemory_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaDestroyExternalMemory");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(extMem);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaImportExternalSemaphore(
-    cudaExternalSemaphore_t *extSem_out,
-    const struct cudaExternalSemaphoreHandleDesc *semHandleDesc) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaExternalSemaphore_t *,
-                               const struct cudaExternalSemaphoreHandleDesc *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaImportExternalSemaphore");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(extSem_out, semHandleDesc);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaSignalExternalSemaphoresAsync(
-    const cudaExternalSemaphore_t *extSemArray,
-    const struct cudaExternalSemaphoreSignalParams *paramsArray,
-    unsigned int numExtSems, cudaStream_t stream __dv(0)) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(const cudaExternalSemaphore_t *,
-                               const struct cudaExternalSemaphoreSignalParams *,
-                               unsigned int, cudaStream_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaSignalExternalSemaphoresAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(extSemArray, paramsArray, numExtSems, stream);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaWaitExternalSemaphoresAsync(
-    const cudaExternalSemaphore_t *extSemArray,
-    const struct cudaExternalSemaphoreWaitParams *paramsArray,
-    unsigned int numExtSems, cudaStream_t stream __dv(0)) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(const cudaExternalSemaphore_t *,
-                               const struct cudaExternalSemaphoreWaitParams *,
-                               unsigned int, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaWaitExternalSemaphoresAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(extSemArray, paramsArray, numExtSems, stream);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaDestroyExternalSemaphore(cudaExternalSemaphore_t extSem) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaExternalSemaphore_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaDestroyExternalSemaphore");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(extSem);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim, void **args,
-                 size_t sharedMem, cudaStream_t stream) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(const void *, dim3, dim3, void **,
-                                           size_t, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaLaunchKernel");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(func, gridDim, blockDim, args, sharedMem, stream);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaLaunchCooperativeKernel(
-    const void *func, dim3 gridDim, dim3 blockDim, void **args,
-    size_t sharedMem, cudaStream_t stream) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(const void *, dim3, dim3, void **,
-                                           size_t, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaLaunchCooperativeKernel");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(func, gridDim, blockDim, args, sharedMem, stream);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaLaunchCooperativeKernelMultiDevice(
-    struct cudaLaunchParams *launchParamsList, unsigned int numDevices,
-    unsigned int flags __dv(0)) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(struct cudaLaunchParams *,
-                                           unsigned int, unsigned int);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaLaunchCooperativeKernelMultiDevice");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(launchParamsList, numDevices, flags);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaFuncSetCacheConfig(const void *func, enum cudaFuncCache cacheConfig) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(const void *, enum cudaFuncCache);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaFuncSetCacheConfig");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(func, cacheConfig);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaFuncSetSharedMemConfig(const void *func, enum cudaSharedMemConfig config) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(const void *, enum cudaSharedMemConfig);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaFuncSetSharedMemConfig");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(func, config);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaFuncGetAttributes(struct cudaFuncAttributes *attr, const void *func) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(struct cudaFuncAttributes *, const void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaFuncGetAttributes");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(attr, func);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaFuncSetAttribute(const void *func, enum cudaFuncAttribute attr, int value) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(const void *, enum cudaFuncAttribute, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaFuncSetAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(func, attr, value);
-}
-
-extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI
-cudaSetDoubleForDevice(double *d) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(double *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaSetDoubleForDevice");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(d);
-}
-
-extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI
-cudaSetDoubleForHost(double *d) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(double *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaSetDoubleForHost");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(d);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaLaunchHostFunc(cudaStream_t stream,
-                                                         cudaHostFn_t fn,
-                                                         void *userData) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaStream_t, cudaHostFn_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaLaunchHostFunc");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(stream, fn, userData);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaOccupancyMaxActiveBlocksPerMultiprocessor(int *numBlocks, const void *func,
-                                              int blockSize,
-                                              size_t dynamicSMemSize) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(int *, const void *, int, size_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaOccupancyMaxActiveBlocksPerMultiprocessor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(numBlocks, func, blockSize, dynamicSMemSize);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(int *numBlocks,
-                                                       const void *func,
-                                                       int blockSize,
-                                                       size_t dynamicSMemSize,
-                                                       unsigned int flags) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(int *, const void *, int, size_t, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>(
-      "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(numBlocks, func, blockSize, dynamicSMemSize, flags);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaConfigureCall(dim3 gridDim, dim3 blockDim, size_t sharedMem __dv(0),
-                  cudaStream_t stream __dv(0)) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(dim3, dim3, size_t, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaConfigureCall");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(gridDim, blockDim, sharedMem, stream);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaSetupArgument(const void *arg,
-                                                        size_t size,
-                                                        size_t offset) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(const void *, size_t, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaSetupArgument");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(arg, size, offset);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaLaunch(const void *func) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(const void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaLaunch");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(func);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMallocManaged(
-    void **devPtr, size_t size, unsigned int flags __dv(cudaMemAttachGlobal)) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void **, size_t, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMallocManaged");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(devPtr, size, flags);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaMalloc(void **devPtr, size_t size) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void **, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMalloc");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(devPtr, size);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMallocHost(void **ptr, size_t size) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void **, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMallocHost");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(ptr, size);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMallocPitch(void **devPtr,
-                                                      size_t *pitch,
-                                                      size_t width,
-                                                      size_t height) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void **, size_t *, size_t, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMallocPitch");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(devPtr, pitch, width, height);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMallocArray(
-    cudaArray_t *array, const struct cudaChannelFormatDesc *desc, size_t width,
-    size_t height __dv(0), unsigned int flags __dv(0)) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaArray_t *,
-                                           const struct cudaChannelFormatDesc *,
-                                           size_t, size_t, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMallocArray");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(array, desc, width, height, flags);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaFree(void *devPtr) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaFree");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(devPtr);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaFreeHost(void *ptr) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaFreeHost");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(ptr);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaFreeArray(cudaArray_t array) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaArray_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaFreeArray");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(array);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaFreeMipmappedArray(cudaMipmappedArray_t mipmappedArray) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaMipmappedArray_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaFreeMipmappedArray");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(mipmappedArray);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaHostAlloc(void **pHost, size_t size,
-                                                    unsigned int flags) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void **, size_t, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaHostAlloc");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pHost, size, flags);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaHostRegister(void *ptr, size_t size,
-                                                       unsigned int flags) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void *, size_t, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaHostRegister");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(ptr, size, flags);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaHostUnregister(void *ptr) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaHostUnregister");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(ptr);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaHostGetDevicePointer(void **pDevice, void *pHost, unsigned int flags) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void **, void *, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaHostGetDevicePointer");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pDevice, pHost, flags);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaHostGetFlags(unsigned int *pFlags,
-                                                       void *pHost) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(unsigned int *, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaHostGetFlags");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pFlags, pHost);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaMalloc3D(struct cudaPitchedPtr *pitchedDevPtr, struct cudaExtent extent) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(struct cudaPitchedPtr *, struct cudaExtent);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMalloc3D");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pitchedDevPtr, extent);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaMalloc3DArray(cudaArray_t *array, const struct cudaChannelFormatDesc *desc,
-                  struct cudaExtent extent, unsigned int flags __dv(0)) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaArray_t *,
-                                           const struct cudaChannelFormatDesc *,
-                                           struct cudaExtent, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMalloc3DArray");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(array, desc, extent, flags);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMallocMipmappedArray(
-    cudaMipmappedArray_t *mipmappedArray,
-    const struct cudaChannelFormatDesc *desc, struct cudaExtent extent,
-    unsigned int numLevels, unsigned int flags __dv(0)) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(
-      cudaMipmappedArray_t *, const struct cudaChannelFormatDesc *,
-      struct cudaExtent, unsigned int, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMallocMipmappedArray");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(mipmappedArray, desc, extent, numLevels, flags);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGetMipmappedArrayLevel(
-    cudaArray_t *levelArray, cudaMipmappedArray_const_t mipmappedArray,
-    unsigned int level) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(
-      cudaArray_t *, cudaMipmappedArray_const_t, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGetMipmappedArrayLevel");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(levelArray, mipmappedArray, level);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaMemcpy3D(const struct cudaMemcpy3DParms *p) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(const struct cudaMemcpy3DParms *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpy3D");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(p);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaMemcpy3DPeer(const struct cudaMemcpy3DPeerParms *p) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(const struct cudaMemcpy3DPeerParms *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpy3DPeer");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(p);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpy3DAsync(
-    const struct cudaMemcpy3DParms *p, cudaStream_t stream __dv(0)) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(const struct cudaMemcpy3DParms *, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpy3DAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(p, stream);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMemcpy3DPeerAsync(
-    const struct cudaMemcpy3DPeerParms *p, cudaStream_t stream __dv(0)) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(const struct cudaMemcpy3DPeerParms *,
-                                           cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpy3DPeerAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(p, stream);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMemGetInfo(size_t *free,
-                                                     size_t *total) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(size_t *, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemGetInfo");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(free, total);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaArrayGetInfo(struct cudaChannelFormatDesc *desc, struct cudaExtent *extent,
-                 unsigned int *flags, cudaArray_t array) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(struct cudaChannelFormatDesc *,
-                                           struct cudaExtent *, unsigned int *,
-                                           cudaArray_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaArrayGetInfo");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(desc, extent, flags, array);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMemcpy(void *dst, const void *src,
-                                                 size_t count,
-                                                 enum cudaMemcpyKind kind) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void *, const void *, size_t,
-                                           enum cudaMemcpyKind);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpy");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dst, src, count, kind);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMemcpyPeer(void *dst, int dstDevice,
-                                                     const void *src,
-                                                     int srcDevice,
-                                                     size_t count) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(void *, int, const void *, int, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpyPeer");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dst, dstDevice, src, srcDevice, count);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaMemcpyToArray(cudaArray_t dst, size_t wOffset, size_t hOffset,
-                  const void *src, size_t count, enum cudaMemcpyKind kind) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(
-      cudaArray_t, size_t, size_t, const void *, size_t, enum cudaMemcpyKind);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpyToArray");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dst, wOffset, hOffset, src, count, kind);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaMemcpyFromArray(void *dst, cudaArray_const_t src, size_t wOffset,
-                    size_t hOffset, size_t count, enum cudaMemcpyKind kind) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void *, cudaArray_const_t, size_t,
-                                           size_t, size_t, enum cudaMemcpyKind);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpyFromArray");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dst, src, wOffset, hOffset, count, kind);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMemcpyArrayToArray(
-    cudaArray_t dst, size_t wOffsetDst, size_t hOffsetDst,
-    cudaArray_const_t src, size_t wOffsetSrc, size_t hOffsetSrc, size_t count,
-    enum cudaMemcpyKind kind __dv(cudaMemcpyDeviceToDevice)) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaArray_t, size_t, size_t, cudaArray_const_t,
-                               size_t, size_t, size_t, enum cudaMemcpyKind);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpyArrayToArray");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dst, wOffsetDst, hOffsetDst, src, wOffsetSrc, hOffsetSrc,
-                  count, kind);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMemcpy2D(void *dst, size_t dpitch,
-                                                   const void *src,
-                                                   size_t spitch, size_t width,
-                                                   size_t height,
-                                                   enum cudaMemcpyKind kind) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void *, size_t, const void *, size_t,
-                                           size_t, size_t, enum cudaMemcpyKind);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpy2D");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dst, dpitch, src, spitch, width, height, kind);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMemcpy2DToArray(
-    cudaArray_t dst, size_t wOffset, size_t hOffset, const void *src,
-    size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaArray_t, size_t, size_t, const void *,
-                               size_t, size_t, size_t, enum cudaMemcpyKind);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpy2DToArray");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dst, wOffset, hOffset, src, spitch, width, height, kind);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMemcpy2DFromArray(
-    void *dst, size_t dpitch, cudaArray_const_t src, size_t wOffset,
-    size_t hOffset, size_t width, size_t height, enum cudaMemcpyKind kind) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(void *, size_t, cudaArray_const_t, size_t,
-                               size_t, size_t, size_t, enum cudaMemcpyKind);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpy2DFromArray");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dst, dpitch, src, wOffset, hOffset, width, height, kind);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMemcpy2DArrayToArray(
-    cudaArray_t dst, size_t wOffsetDst, size_t hOffsetDst,
-    cudaArray_const_t src, size_t wOffsetSrc, size_t hOffsetSrc, size_t width,
-    size_t height, enum cudaMemcpyKind kind __dv(cudaMemcpyDeviceToDevice)) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaArray_t, size_t, size_t,
-                                           cudaArray_const_t, size_t, size_t,
-                                           size_t, size_t, enum cudaMemcpyKind);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpy2DArrayToArray");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dst, wOffsetDst, hOffsetDst, src, wOffsetSrc, hOffsetSrc,
-                  width, height, kind);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMemcpyToSymbol(
-    const void *symbol, const void *src, size_t count, size_t offset __dv(0),
-    enum cudaMemcpyKind kind __dv(cudaMemcpyHostToDevice)) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(const void *, const void *, size_t,
-                                           size_t, enum cudaMemcpyKind);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpyToSymbol");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(symbol, src, count, offset, kind);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMemcpyFromSymbol(
-    void *dst, const void *symbol, size_t count, size_t offset __dv(0),
-    enum cudaMemcpyKind kind __dv(cudaMemcpyDeviceToHost)) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void *, const void *, size_t, size_t,
-                                           enum cudaMemcpyKind);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpyFromSymbol");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dst, symbol, count, offset, kind);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaMemcpyAsync(void *dst, const void *src, size_t count,
-                enum cudaMemcpyKind kind, cudaStream_t stream __dv(0)) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void *, const void *, size_t,
-                                           enum cudaMemcpyKind, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpyAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dst, src, count, kind, stream);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaMemcpyPeerAsync(void *dst, int dstDevice, const void *src, int srcDevice,
-                    size_t count, cudaStream_t stream __dv(0)) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void *, int, const void *, int,
-                                           size_t, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpyPeerAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dst, dstDevice, src, srcDevice, count, stream);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMemcpyToArrayAsync(
-    cudaArray_t dst, size_t wOffset, size_t hOffset, const void *src,
-    size_t count, enum cudaMemcpyKind kind, cudaStream_t stream __dv(0)) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaArray_t, size_t, size_t, const void *,
-                               size_t, enum cudaMemcpyKind, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpyToArrayAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dst, wOffset, hOffset, src, count, kind, stream);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMemcpyFromArrayAsync(
-    void *dst, cudaArray_const_t src, size_t wOffset, size_t hOffset,
-    size_t count, enum cudaMemcpyKind kind, cudaStream_t stream __dv(0)) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(void *, cudaArray_const_t, size_t, size_t,
-                               size_t, enum cudaMemcpyKind, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpyFromArrayAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dst, src, wOffset, hOffset, count, kind, stream);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpy2DAsync(
-    void *dst, size_t dpitch, const void *src, size_t spitch, size_t width,
-    size_t height, enum cudaMemcpyKind kind, cudaStream_t stream __dv(0)) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(void *, size_t, const void *, size_t, size_t,
-                               size_t, enum cudaMemcpyKind, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpy2DAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dst, dpitch, src, spitch, width, height, kind, stream);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMemcpy2DToArrayAsync(
-    cudaArray_t dst, size_t wOffset, size_t hOffset, const void *src,
-    size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind,
-    cudaStream_t stream __dv(0)) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaArray_t, size_t, size_t,
-                                           const void *, size_t, size_t, size_t,
-                                           enum cudaMemcpyKind, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpy2DToArrayAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dst, wOffset, hOffset, src, spitch, width, height, kind,
-                  stream);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMemcpy2DFromArrayAsync(
-    void *dst, size_t dpitch, cudaArray_const_t src, size_t wOffset,
-    size_t hOffset, size_t width, size_t height, enum cudaMemcpyKind kind,
-    cudaStream_t stream __dv(0)) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void *, size_t, cudaArray_const_t,
-                                           size_t, size_t, size_t, size_t,
-                                           enum cudaMemcpyKind, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpy2DFromArrayAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dst, dpitch, src, wOffset, hOffset, width, height, kind,
-                  stream);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMemcpyToSymbolAsync(
-    const void *symbol, const void *src, size_t count, size_t offset,
-    enum cudaMemcpyKind kind, cudaStream_t stream __dv(0)) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(const void *, const void *, size_t, size_t,
-                               enum cudaMemcpyKind, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpyToSymbolAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(symbol, src, count, offset, kind, stream);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMemcpyFromSymbolAsync(
-    void *dst, const void *symbol, size_t count, size_t offset,
-    enum cudaMemcpyKind kind, cudaStream_t stream __dv(0)) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void *, const void *, size_t, size_t,
-                                           enum cudaMemcpyKind, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpyFromSymbolAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dst, symbol, count, offset, kind, stream);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMemset(void *devPtr, int value,
-                                                 size_t count) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void *, int, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemset");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(devPtr, value, count);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMemset2D(void *devPtr, size_t pitch,
-                                                   int value, size_t width,
-                                                   size_t height) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void *, size_t, int, size_t, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemset2D");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(devPtr, pitch, value, width, height);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMemset3D(
-    struct cudaPitchedPtr pitchedDevPtr, int value, struct cudaExtent extent) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(struct cudaPitchedPtr, int, struct cudaExtent);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemset3D");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pitchedDevPtr, value, extent);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemsetAsync(
-    void *devPtr, int value, size_t count, cudaStream_t stream __dv(0)) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void *, int, size_t, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemsetAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(devPtr, value, count, stream);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaMemset2DAsync(void *devPtr, size_t pitch, int value, size_t width,
-                  size_t height, cudaStream_t stream __dv(0)) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void *, size_t, int, size_t, size_t,
-                                           cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemset2DAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(devPtr, pitch, value, width, height, stream);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaMemset3DAsync(struct cudaPitchedPtr pitchedDevPtr, int value,
-                  struct cudaExtent extent, cudaStream_t stream __dv(0)) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(struct cudaPitchedPtr, int,
-                                           struct cudaExtent, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemset3DAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pitchedDevPtr, value, extent, stream);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGetSymbolAddress(void **devPtr,
-                                                           const void *symbol) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void **, const void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGetSymbolAddress");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(devPtr, symbol);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGetSymbolSize(size_t *size,
-                                                        const void *symbol) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(size_t *, const void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGetSymbolSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(size, symbol);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaMemPrefetchAsync(const void *devPtr, size_t count, int dstDevice,
-                     cudaStream_t stream __dv(0)) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(const void *, size_t, int, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemPrefetchAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(devPtr, count, dstDevice, stream);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaMemAdvise(const void *devPtr, size_t count, enum cudaMemoryAdvise advice,
-              int device) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(const void *, size_t,
-                                           enum cudaMemoryAdvise, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemAdvise");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(devPtr, count, advice, device);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMemRangeGetAttribute(
-    void *data, size_t dataSize, enum cudaMemRangeAttribute attribute,
-    const void *devPtr, size_t count) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(
-      void *, size_t, enum cudaMemRangeAttribute, const void *, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemRangeGetAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(data, dataSize, attribute, devPtr, count);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMemRangeGetAttributes(
-    void **data, size_t *dataSizes, enum cudaMemRangeAttribute *attributes,
-    size_t numAttributes, const void *devPtr, size_t count) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(void **, size_t *, enum cudaMemRangeAttribute *,
-                               size_t, const void *, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemRangeGetAttributes");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(data, dataSizes, attributes, numAttributes, devPtr, count);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaPointerGetAttributes(
-    struct cudaPointerAttributes *attributes, const void *ptr) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(struct cudaPointerAttributes *, const void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaPointerGetAttributes");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(attributes, ptr);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaDeviceCanAccessPeer(int *canAccessPeer, int device, int peerDevice) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(int *, int, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaDeviceCanAccessPeer");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(canAccessPeer, device, peerDevice);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaDeviceEnablePeerAccess(int peerDevice, unsigned int flags) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(int, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaDeviceEnablePeerAccess");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(peerDevice, flags);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaDeviceDisablePeerAccess(int peerDevice) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaDeviceDisablePeerAccess");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(peerDevice);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaGraphicsUnregisterResource(cudaGraphicsResource_t resource) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraphicsResource_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphicsUnregisterResource");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(resource);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphicsResourceSetMapFlags(
-    cudaGraphicsResource_t resource, unsigned int flags) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaGraphicsResource_t, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphicsResourceSetMapFlags");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(resource, flags);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphicsMapResources(
-    int count, cudaGraphicsResource_t *resources, cudaStream_t stream __dv(0)) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(int, cudaGraphicsResource_t *, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphicsMapResources");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(count, resources, stream);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphicsUnmapResources(
-    int count, cudaGraphicsResource_t *resources, cudaStream_t stream __dv(0)) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(int, cudaGraphicsResource_t *, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphicsUnmapResources");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(count, resources, stream);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphicsResourceGetMappedPointer(
-    void **devPtr, size_t *size, cudaGraphicsResource_t resource) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(void **, size_t *, cudaGraphicsResource_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaGraphicsResourceGetMappedPointer");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(devPtr, size, resource);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphicsSubResourceGetMappedArray(
-    cudaArray_t *array, cudaGraphicsResource_t resource,
-    unsigned int arrayIndex, unsigned int mipLevel) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(
-      cudaArray_t *, cudaGraphicsResource_t, unsigned int, unsigned int);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaGraphicsSubResourceGetMappedArray");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(array, resource, arrayIndex, mipLevel);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaGraphicsResourceGetMappedMipmappedArray(
-    cudaMipmappedArray_t *mipmappedArray, cudaGraphicsResource_t resource) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaMipmappedArray_t *, cudaGraphicsResource_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaGraphicsResourceGetMappedMipmappedArray");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(mipmappedArray, resource);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGetChannelDesc(
-    struct cudaChannelFormatDesc *desc, cudaArray_const_t array) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(struct cudaChannelFormatDesc *,
-                                           cudaArray_const_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGetChannelDesc");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(desc, array);
-}
-
-extern __host__ struct cudaChannelFormatDesc CUDARTAPI cudaCreateChannelDesc(
-    int x, int y, int z, int w, enum cudaChannelFormatKind f) {
-  using FuncPtr = struct cudaChannelFormatDesc(CUDARTAPI *)(
-      int, int, int, int, enum cudaChannelFormatKind);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaCreateChannelDesc");
-  if (!func_ptr) {
-    return cudaChannelFormatDesc{cudaChannelFormatKind(-1), 0, 0, 0};
-  }
-  return func_ptr(x, y, z, w, f);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaBindTexture(
-    size_t *offset, const struct textureReference *texref, const void *devPtr,
-    const struct cudaChannelFormatDesc *desc, size_t size __dv(UINT_MAX)) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(
-      size_t *, const struct textureReference *, const void *,
-      const struct cudaChannelFormatDesc *, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaBindTexture");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(offset, texref, devPtr, desc, size);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaBindTexture2D(size_t *offset, const struct textureReference *texref,
-                  const void *devPtr, const struct cudaChannelFormatDesc *desc,
-                  size_t width, size_t height, size_t pitch) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(
-      size_t *, const struct textureReference *, const void *,
-      const struct cudaChannelFormatDesc *, size_t, size_t, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaBindTexture2D");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(offset, texref, devPtr, desc, width, height, pitch);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaBindTextureToArray(
-    const struct textureReference *texref, cudaArray_const_t array,
-    const struct cudaChannelFormatDesc *desc) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(
-      const struct textureReference *, cudaArray_const_t,
-      const struct cudaChannelFormatDesc *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaBindTextureToArray");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(texref, array, desc);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaBindTextureToMipmappedArray(const struct textureReference *texref,
-                                cudaMipmappedArray_const_t mipmappedArray,
-                                const struct cudaChannelFormatDesc *desc) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(
-      const struct textureReference *, cudaMipmappedArray_const_t,
-      const struct cudaChannelFormatDesc *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaBindTextureToMipmappedArray");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(texref, mipmappedArray, desc);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaUnbindTexture(const struct textureReference *texref) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(const struct textureReference *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaUnbindTexture");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(texref);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGetTextureAlignmentOffset(
-    size_t *offset, const struct textureReference *texref) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(size_t *, const struct textureReference *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGetTextureAlignmentOffset");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(offset, texref);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGetTextureReference(
-    const struct textureReference **texref, const void *symbol) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(const struct textureReference **, const void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGetTextureReference");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(texref, symbol);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaBindSurfaceToArray(
-    const struct surfaceReference *surfref, cudaArray_const_t array,
-    const struct cudaChannelFormatDesc *desc) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(
-      const struct surfaceReference *, cudaArray_const_t,
-      const struct cudaChannelFormatDesc *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaBindSurfaceToArray");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(surfref, array, desc);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGetSurfaceReference(
-    const struct surfaceReference **surfref, const void *symbol) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(const struct surfaceReference **, const void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGetSurfaceReference");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(surfref, symbol);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaCreateTextureObject(
-    cudaTextureObject_t *pTexObject, const struct cudaResourceDesc *pResDesc,
-    const struct cudaTextureDesc *pTexDesc,
-    const struct cudaResourceViewDesc *pResViewDesc) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(
-      cudaTextureObject_t *, const struct cudaResourceDesc *,
-      const struct cudaTextureDesc *, const struct cudaResourceViewDesc *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaCreateTextureObject");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pTexObject, pResDesc, pTexDesc, pResViewDesc);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaDestroyTextureObject(cudaTextureObject_t texObject) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaTextureObject_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaDestroyTextureObject");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(texObject);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGetTextureObjectResourceDesc(
-    struct cudaResourceDesc *pResDesc, cudaTextureObject_t texObject) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(struct cudaResourceDesc *, cudaTextureObject_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaGetTextureObjectResourceDesc");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pResDesc, texObject);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGetTextureObjectTextureDesc(
-    struct cudaTextureDesc *pTexDesc, cudaTextureObject_t texObject) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(struct cudaTextureDesc *, cudaTextureObject_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGetTextureObjectTextureDesc");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pTexDesc, texObject);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGetTextureObjectResourceViewDesc(
-    struct cudaResourceViewDesc *pResViewDesc, cudaTextureObject_t texObject) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(struct cudaResourceViewDesc *,
-                                           cudaTextureObject_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaGetTextureObjectResourceViewDesc");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pResViewDesc, texObject);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaCreateSurfaceObject(
-    cudaSurfaceObject_t *pSurfObject, const struct cudaResourceDesc *pResDesc) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaSurfaceObject_t *,
-                                           const struct cudaResourceDesc *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaCreateSurfaceObject");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pSurfObject, pResDesc);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaDestroySurfaceObject(cudaSurfaceObject_t surfObject) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaSurfaceObject_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaDestroySurfaceObject");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(surfObject);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGetSurfaceObjectResourceDesc(
-    struct cudaResourceDesc *pResDesc, cudaSurfaceObject_t surfObject) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(struct cudaResourceDesc *, cudaSurfaceObject_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaGetSurfaceObjectResourceDesc");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pResDesc, surfObject);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaDriverGetVersion(int *driverVersion) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaDriverGetVersion");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(driverVersion);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaRuntimeGetVersion(int *runtimeVersion) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaRuntimeGetVersion");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(runtimeVersion);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphCreate(cudaGraph_t *pGraph,
-                                                      unsigned int flags) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraph_t *, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphCreate");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pGraph, flags);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaGraphAddKernelNode(cudaGraphNode_t *pGraphNode, cudaGraph_t graph,
-                       cudaGraphNode_t *pDependencies, size_t numDependencies,
-                       const struct cudaKernelNodeParams *pNodeParams) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraphNode_t *, cudaGraph_t,
-                                           cudaGraphNode_t *, size_t,
-                                           const struct cudaKernelNodeParams *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphAddKernelNode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pGraphNode, graph, pDependencies, numDependencies,
-                  pNodeParams);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphKernelNodeGetParams(
-    cudaGraphNode_t node, struct cudaKernelNodeParams *pNodeParams) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaGraphNode_t, struct cudaKernelNodeParams *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphKernelNodeGetParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(node, pNodeParams);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphKernelNodeSetParams(
-    cudaGraphNode_t node, const struct cudaKernelNodeParams *pNodeParams) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraphNode_t,
-                                           const struct cudaKernelNodeParams *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphKernelNodeSetParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(node, pNodeParams);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaGraphAddMemcpyNode(cudaGraphNode_t *pGraphNode, cudaGraph_t graph,
-                       cudaGraphNode_t *pDependencies, size_t numDependencies,
-                       const struct cudaMemcpy3DParms *pCopyParams) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraphNode_t *, cudaGraph_t,
-                                           cudaGraphNode_t *, size_t,
-                                           const struct cudaMemcpy3DParms *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphAddMemcpyNode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pGraphNode, graph, pDependencies, numDependencies,
-                  pCopyParams);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphMemcpyNodeGetParams(
-    cudaGraphNode_t node, struct cudaMemcpy3DParms *pNodeParams) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaGraphNode_t, struct cudaMemcpy3DParms *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphMemcpyNodeGetParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(node, pNodeParams);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphMemcpyNodeSetParams(
-    cudaGraphNode_t node, const struct cudaMemcpy3DParms *pNodeParams) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraphNode_t,
-                                           const struct cudaMemcpy3DParms *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphMemcpyNodeSetParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(node, pNodeParams);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaGraphAddMemsetNode(cudaGraphNode_t *pGraphNode, cudaGraph_t graph,
-                       cudaGraphNode_t *pDependencies, size_t numDependencies,
-                       const struct cudaMemsetParams *pMemsetParams) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraphNode_t *, cudaGraph_t,
-                                           cudaGraphNode_t *, size_t,
-                                           const struct cudaMemsetParams *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphAddMemsetNode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pGraphNode, graph, pDependencies, numDependencies,
-                  pMemsetParams);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphMemsetNodeGetParams(
-    cudaGraphNode_t node, struct cudaMemsetParams *pNodeParams) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaGraphNode_t, struct cudaMemsetParams *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphMemsetNodeGetParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(node, pNodeParams);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphMemsetNodeSetParams(
-    cudaGraphNode_t node, const struct cudaMemsetParams *pNodeParams) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraphNode_t,
-                                           const struct cudaMemsetParams *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphMemsetNodeSetParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(node, pNodeParams);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaGraphAddHostNode(cudaGraphNode_t *pGraphNode, cudaGraph_t graph,
-                     cudaGraphNode_t *pDependencies, size_t numDependencies,
-                     const struct cudaHostNodeParams *pNodeParams) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraphNode_t *, cudaGraph_t,
-                                           cudaGraphNode_t *, size_t,
-                                           const struct cudaHostNodeParams *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphAddHostNode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pGraphNode, graph, pDependencies, numDependencies,
-                  pNodeParams);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphHostNodeGetParams(
-    cudaGraphNode_t node, struct cudaHostNodeParams *pNodeParams) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaGraphNode_t, struct cudaHostNodeParams *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphHostNodeGetParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(node, pNodeParams);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphHostNodeSetParams(
-    cudaGraphNode_t node, const struct cudaHostNodeParams *pNodeParams) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraphNode_t,
-                                           const struct cudaHostNodeParams *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphHostNodeSetParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(node, pNodeParams);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaGraphAddChildGraphNode(cudaGraphNode_t *pGraphNode, cudaGraph_t graph,
-                           cudaGraphNode_t *pDependencies,
-                           size_t numDependencies, cudaGraph_t childGraph) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(
-      cudaGraphNode_t *, cudaGraph_t, cudaGraphNode_t *, size_t, cudaGraph_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphAddChildGraphNode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pGraphNode, graph, pDependencies, numDependencies,
-                  childGraph);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaGraphChildGraphNodeGetGraph(cudaGraphNode_t node, cudaGraph_t *pGraph) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraphNode_t, cudaGraph_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphChildGraphNodeGetGraph");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(node, pGraph);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaGraphAddEmptyNode(cudaGraphNode_t *pGraphNode, cudaGraph_t graph,
-                      cudaGraphNode_t *pDependencies, size_t numDependencies) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraphNode_t *, cudaGraph_t,
-                                           cudaGraphNode_t *, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphAddEmptyNode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pGraphNode, graph, pDependencies, numDependencies);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaGraphClone(cudaGraph_t *pGraphClone, cudaGraph_t originalGraph) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraph_t *, cudaGraph_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphClone");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pGraphClone, originalGraph);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaGraphNodeFindInClone(cudaGraphNode_t *pNode, cudaGraphNode_t originalNode,
-                         cudaGraph_t clonedGraph) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaGraphNode_t *, cudaGraphNode_t, cudaGraph_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphNodeFindInClone");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pNode, originalNode, clonedGraph);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaGraphNodeGetType(cudaGraphNode_t node, enum cudaGraphNodeType *pType) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaGraphNode_t, enum cudaGraphNodeType *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphNodeGetType");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(node, pType);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphGetNodes(cudaGraph_t graph,
-                                                        cudaGraphNode_t *nodes,
-                                                        size_t *numNodes) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaGraph_t, cudaGraphNode_t *, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphGetNodes");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(graph, nodes, numNodes);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphGetRootNodes(
-    cudaGraph_t graph, cudaGraphNode_t *pRootNodes, size_t *pNumRootNodes) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaGraph_t, cudaGraphNode_t *, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphGetRootNodes");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(graph, pRootNodes, pNumRootNodes);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphGetEdges(cudaGraph_t graph,
-                                                        cudaGraphNode_t *from,
-                                                        cudaGraphNode_t *to,
-                                                        size_t *numEdges) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraph_t, cudaGraphNode_t *,
-                                           cudaGraphNode_t *, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphGetEdges");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(graph, from, to, numEdges);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphNodeGetDependencies(
-    cudaGraphNode_t node, cudaGraphNode_t *pDependencies,
-    size_t *pNumDependencies) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaGraphNode_t, cudaGraphNode_t *, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphNodeGetDependencies");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(node, pDependencies, pNumDependencies);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphNodeGetDependentNodes(
-    cudaGraphNode_t node, cudaGraphNode_t *pDependentNodes,
-    size_t *pNumDependentNodes) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaGraphNode_t, cudaGraphNode_t *, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphNodeGetDependentNodes");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(node, pDependentNodes, pNumDependentNodes);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaGraphAddDependencies(cudaGraph_t graph, cudaGraphNode_t *from,
-                         cudaGraphNode_t *to, size_t numDependencies) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraph_t, cudaGraphNode_t *,
-                                           cudaGraphNode_t *, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphAddDependencies");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(graph, from, to, numDependencies);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaGraphRemoveDependencies(cudaGraph_t graph, cudaGraphNode_t *from,
-                            cudaGraphNode_t *to, size_t numDependencies) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraph_t, cudaGraphNode_t *,
-                                           cudaGraphNode_t *, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphRemoveDependencies");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(graph, from, to, numDependencies);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaGraphDestroyNode(cudaGraphNode_t node) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraphNode_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphDestroyNode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(node);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphInstantiate(
-    cudaGraphExec_t *pGraphExec, cudaGraph_t graph, cudaGraphNode_t *pErrorNode,
-    char *pLogBuffer, size_t bufferSize) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraphExec_t *, cudaGraph_t,
-                                           cudaGraphNode_t *, char *, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphInstantiate");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pGraphExec, graph, pErrorNode, pLogBuffer, bufferSize);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphLaunch(cudaGraphExec_t graphExec,
-                                                      cudaStream_t stream) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraphExec_t, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphLaunch");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(graphExec, stream);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaGraphExecDestroy(cudaGraphExec_t graphExec) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraphExec_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphExecDestroy");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(graphExec);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphDestroy(cudaGraph_t graph) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraph_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphDestroy");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(graph);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGetExportTable(
-    const void **ppExportTable, const cudaUUID_t *pExportTableId) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(const void **, const cudaUUID_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGetExportTable");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(ppExportTable, pExportTableId);
-}
-
-}  // extern "C"
diff --git a/third_party/xla/third_party/tsl/tsl/cuda/cuda_runtime_10_1.inc b/third_party/xla/third_party/tsl/tsl/cuda/cuda_runtime_10_1.inc
deleted file mode 100644
index d076cc4ac3a506..00000000000000
--- a/third_party/xla/third_party/tsl/tsl/cuda/cuda_runtime_10_1.inc
+++ /dev/null
@@ -1,1854 +0,0 @@
-// Auto-generated, do not edit.
-
-extern "C" {
-
-extern __host__ cudaError_t CUDARTAPI cudaDeviceReset(void) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)();
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaDeviceReset");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr();
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaDeviceSynchronize(void) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)();
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaDeviceSynchronize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr();
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaDeviceSetLimit(enum cudaLimit limit,
-                                                         size_t value) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(enum cudaLimit, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaDeviceSetLimit");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(limit, value);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaDeviceGetLimit(size_t *pValue, enum cudaLimit limit) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(size_t *, enum cudaLimit);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaDeviceGetLimit");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pValue, limit);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaDeviceGetCacheConfig(enum cudaFuncCache *pCacheConfig) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(enum cudaFuncCache *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaDeviceGetCacheConfig");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pCacheConfig);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaDeviceGetStreamPriorityRange(int *leastPriority, int *greatestPriority) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(int *, int *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaDeviceGetStreamPriorityRange");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(leastPriority, greatestPriority);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaDeviceSetCacheConfig(enum cudaFuncCache cacheConfig) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(enum cudaFuncCache);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaDeviceSetCacheConfig");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(cacheConfig);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaDeviceGetSharedMemConfig(enum cudaSharedMemConfig *pConfig) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(enum cudaSharedMemConfig *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaDeviceGetSharedMemConfig");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pConfig);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaDeviceSetSharedMemConfig(enum cudaSharedMemConfig config) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(enum cudaSharedMemConfig);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaDeviceSetSharedMemConfig");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(config);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaDeviceGetByPCIBusId(int *device, const char *pciBusId) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(int *, const char *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaDeviceGetByPCIBusId");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(device, pciBusId);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaDeviceGetPCIBusId(char *pciBusId,
-                                                            int len,
-                                                            int device) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(char *, int, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaDeviceGetPCIBusId");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pciBusId, len, device);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaIpcGetEventHandle(cudaIpcEventHandle_t *handle, cudaEvent_t event) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaIpcEventHandle_t *, cudaEvent_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaIpcGetEventHandle");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, event);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaIpcOpenEventHandle(cudaEvent_t *event, cudaIpcEventHandle_t handle) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaEvent_t *, cudaIpcEventHandle_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaIpcOpenEventHandle");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(event, handle);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaIpcGetMemHandle(cudaIpcMemHandle_t *handle, void *devPtr) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaIpcMemHandle_t *, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaIpcGetMemHandle");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, devPtr);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaIpcOpenMemHandle(
-    void **devPtr, cudaIpcMemHandle_t handle, unsigned int flags) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(void **, cudaIpcMemHandle_t, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaIpcOpenMemHandle");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(devPtr, handle, flags);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaIpcCloseMemHandle(void *devPtr) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaIpcCloseMemHandle");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(devPtr);
-}
-
-extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaThreadExit(void) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)();
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaThreadExit");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr();
-}
-
-extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI
-cudaThreadSynchronize(void) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)();
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaThreadSynchronize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr();
-}
-
-extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI
-cudaThreadSetLimit(enum cudaLimit limit, size_t value) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(enum cudaLimit, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaThreadSetLimit");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(limit, value);
-}
-
-extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI
-cudaThreadGetLimit(size_t *pValue, enum cudaLimit limit) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(size_t *, enum cudaLimit);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaThreadGetLimit");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pValue, limit);
-}
-
-extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI
-cudaThreadGetCacheConfig(enum cudaFuncCache *pCacheConfig) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(enum cudaFuncCache *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaThreadGetCacheConfig");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pCacheConfig);
-}
-
-extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI
-cudaThreadSetCacheConfig(enum cudaFuncCache cacheConfig) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(enum cudaFuncCache);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaThreadSetCacheConfig");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(cacheConfig);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaGetLastError(void) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)();
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGetLastError");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr();
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaPeekAtLastError(void) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)();
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaPeekAtLastError");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr();
-}
-
-extern __host__ __cudart_builtin__ const char *CUDARTAPI
-cudaGetErrorName(cudaError_t error) {
-  using FuncPtr = const char *(CUDARTAPI *)(cudaError_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGetErrorName");
-  if (!func_ptr) return "cudaGetErrorName symbol not found.";
-  return func_ptr(error);
-}
-
-extern __host__ __cudart_builtin__ const char *CUDARTAPI
-cudaGetErrorString(cudaError_t error) {
-  using FuncPtr = const char *(CUDARTAPI *)(cudaError_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGetErrorString");
-  if (!func_ptr) return "cudaGetErrorString symbol not found.";
-  return func_ptr(error);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaGetDeviceCount(int *count) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGetDeviceCount");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(count);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaGetDeviceProperties(struct cudaDeviceProp *prop, int device) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(struct cudaDeviceProp *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGetDeviceProperties");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(prop, device);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaDeviceGetAttribute(int *value, enum cudaDeviceAttr attr, int device) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(int *, enum cudaDeviceAttr, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaDeviceGetAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(value, attr, device);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaDeviceGetP2PAttribute(int *value, enum cudaDeviceP2PAttr attr,
-                          int srcDevice, int dstDevice) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(int *, enum cudaDeviceP2PAttr, int, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaDeviceGetP2PAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(value, attr, srcDevice, dstDevice);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaChooseDevice(int *device, const struct cudaDeviceProp *prop) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(int *, const struct cudaDeviceProp *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaChooseDevice");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(device, prop);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaSetDevice(int device) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaSetDevice");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(device);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaGetDevice(int *device) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGetDevice");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(device);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaSetValidDevices(int *device_arr,
-                                                          int len) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(int *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaSetValidDevices");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(device_arr, len);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaSetDeviceFlags(unsigned int flags) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaSetDeviceFlags");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(flags);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGetDeviceFlags(unsigned int *flags) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(unsigned int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGetDeviceFlags");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(flags);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaStreamCreate(cudaStream_t *pStream) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaStream_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaStreamCreate");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pStream);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaStreamCreateWithFlags(cudaStream_t *pStream, unsigned int flags) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaStream_t *, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaStreamCreateWithFlags");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pStream, flags);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaStreamCreateWithPriority(cudaStream_t *pStream, unsigned int flags,
-                             int priority) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaStream_t *, unsigned int, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaStreamCreateWithPriority");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pStream, flags, priority);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaStreamGetPriority(cudaStream_t hStream, int *priority) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaStream_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaStreamGetPriority");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hStream, priority);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaStreamGetFlags(cudaStream_t hStream, unsigned int *flags) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaStream_t, unsigned int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaStreamGetFlags");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hStream, flags);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaStreamDestroy(cudaStream_t stream) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaStreamDestroy");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(stream);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamWaitEvent(
-    cudaStream_t stream, cudaEvent_t event, unsigned int flags) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaStream_t, cudaEvent_t, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaStreamWaitEvent");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(stream, event, flags);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaStreamAddCallback(cudaStream_t stream, cudaStreamCallback_t callback,
-                      void *userData, unsigned int flags) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaStream_t, cudaStreamCallback_t,
-                                           void *, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaStreamAddCallback");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(stream, callback, userData, flags);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaStreamSynchronize(cudaStream_t stream) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaStreamSynchronize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(stream);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaStreamQuery(cudaStream_t stream) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaStreamQuery");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(stream);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaStreamAttachMemAsync(cudaStream_t stream, void *devPtr,
-                         size_t length __dv(0),
-                         unsigned int flags __dv(cudaMemAttachSingle)) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaStream_t, void *, size_t, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaStreamAttachMemAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(stream, devPtr, length, flags);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaStreamBeginCapture(cudaStream_t stream, enum cudaStreamCaptureMode mode) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaStream_t, enum cudaStreamCaptureMode);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaStreamBeginCapture");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(stream, mode);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaThreadExchangeStreamCaptureMode(enum cudaStreamCaptureMode *mode) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(enum cudaStreamCaptureMode *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaThreadExchangeStreamCaptureMode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(mode);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaStreamEndCapture(cudaStream_t stream, cudaGraph_t *pGraph) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaStream_t, cudaGraph_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaStreamEndCapture");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(stream, pGraph);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaStreamIsCapturing(
-    cudaStream_t stream, enum cudaStreamCaptureStatus *pCaptureStatus) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaStream_t, enum cudaStreamCaptureStatus *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaStreamIsCapturing");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(stream, pCaptureStatus);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaStreamGetCaptureInfo(
-    cudaStream_t stream, enum cudaStreamCaptureStatus *pCaptureStatus,
-    unsigned long long *pId) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(
-      cudaStream_t, enum cudaStreamCaptureStatus *, unsigned long long *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaStreamGetCaptureInfo");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(stream, pCaptureStatus, pId);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaEventCreate(cudaEvent_t *event) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaEvent_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaEventCreate");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(event);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaEventCreateWithFlags(cudaEvent_t *event, unsigned int flags) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaEvent_t *, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaEventCreateWithFlags");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(event, flags);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaEventRecord(cudaEvent_t event, cudaStream_t stream __dv(0)) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaEvent_t, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaEventRecord");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(event, stream);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaEventQuery(cudaEvent_t event) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaEvent_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaEventQuery");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(event);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaEventSynchronize(cudaEvent_t event) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaEvent_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaEventSynchronize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(event);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaEventDestroy(cudaEvent_t event) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaEvent_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaEventDestroy");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(event);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaEventElapsedTime(float *ms,
-                                                           cudaEvent_t start,
-                                                           cudaEvent_t end) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(float *, cudaEvent_t, cudaEvent_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaEventElapsedTime");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(ms, start, end);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaImportExternalMemory(
-    cudaExternalMemory_t *extMem_out,
-    const struct cudaExternalMemoryHandleDesc *memHandleDesc) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(
-      cudaExternalMemory_t *, const struct cudaExternalMemoryHandleDesc *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaImportExternalMemory");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(extMem_out, memHandleDesc);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaExternalMemoryGetMappedBuffer(
-    void **devPtr, cudaExternalMemory_t extMem,
-    const struct cudaExternalMemoryBufferDesc *bufferDesc) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(void **, cudaExternalMemory_t,
-                               const struct cudaExternalMemoryBufferDesc *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaExternalMemoryGetMappedBuffer");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(devPtr, extMem, bufferDesc);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaExternalMemoryGetMappedMipmappedArray(
-    cudaMipmappedArray_t *mipmap, cudaExternalMemory_t extMem,
-    const struct cudaExternalMemoryMipmappedArrayDesc *mipmapDesc) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(
-      cudaMipmappedArray_t *, cudaExternalMemory_t,
-      const struct cudaExternalMemoryMipmappedArrayDesc *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaExternalMemoryGetMappedMipmappedArray");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(mipmap, extMem, mipmapDesc);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaDestroyExternalMemory(cudaExternalMemory_t extMem) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaExternalMemory_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaDestroyExternalMemory");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(extMem);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaImportExternalSemaphore(
-    cudaExternalSemaphore_t *extSem_out,
-    const struct cudaExternalSemaphoreHandleDesc *semHandleDesc) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaExternalSemaphore_t *,
-                               const struct cudaExternalSemaphoreHandleDesc *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaImportExternalSemaphore");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(extSem_out, semHandleDesc);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaSignalExternalSemaphoresAsync(
-    const cudaExternalSemaphore_t *extSemArray,
-    const struct cudaExternalSemaphoreSignalParams *paramsArray,
-    unsigned int numExtSems, cudaStream_t stream __dv(0)) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(const cudaExternalSemaphore_t *,
-                               const struct cudaExternalSemaphoreSignalParams *,
-                               unsigned int, cudaStream_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaSignalExternalSemaphoresAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(extSemArray, paramsArray, numExtSems, stream);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaWaitExternalSemaphoresAsync(
-    const cudaExternalSemaphore_t *extSemArray,
-    const struct cudaExternalSemaphoreWaitParams *paramsArray,
-    unsigned int numExtSems, cudaStream_t stream __dv(0)) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(const cudaExternalSemaphore_t *,
-                               const struct cudaExternalSemaphoreWaitParams *,
-                               unsigned int, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaWaitExternalSemaphoresAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(extSemArray, paramsArray, numExtSems, stream);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaDestroyExternalSemaphore(cudaExternalSemaphore_t extSem) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaExternalSemaphore_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaDestroyExternalSemaphore");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(extSem);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim, void **args,
-                 size_t sharedMem, cudaStream_t stream) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(const void *, dim3, dim3, void **,
-                                           size_t, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaLaunchKernel");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(func, gridDim, blockDim, args, sharedMem, stream);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaLaunchCooperativeKernel(
-    const void *func, dim3 gridDim, dim3 blockDim, void **args,
-    size_t sharedMem, cudaStream_t stream) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(const void *, dim3, dim3, void **,
-                                           size_t, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaLaunchCooperativeKernel");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(func, gridDim, blockDim, args, sharedMem, stream);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaLaunchCooperativeKernelMultiDevice(
-    struct cudaLaunchParams *launchParamsList, unsigned int numDevices,
-    unsigned int flags __dv(0)) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(struct cudaLaunchParams *,
-                                           unsigned int, unsigned int);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaLaunchCooperativeKernelMultiDevice");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(launchParamsList, numDevices, flags);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaFuncSetCacheConfig(const void *func, enum cudaFuncCache cacheConfig) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(const void *, enum cudaFuncCache);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaFuncSetCacheConfig");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(func, cacheConfig);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaFuncSetSharedMemConfig(const void *func, enum cudaSharedMemConfig config) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(const void *, enum cudaSharedMemConfig);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaFuncSetSharedMemConfig");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(func, config);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaFuncGetAttributes(struct cudaFuncAttributes *attr, const void *func) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(struct cudaFuncAttributes *, const void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaFuncGetAttributes");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(attr, func);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaFuncSetAttribute(const void *func, enum cudaFuncAttribute attr, int value) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(const void *, enum cudaFuncAttribute, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaFuncSetAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(func, attr, value);
-}
-
-extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI
-cudaSetDoubleForDevice(double *d) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(double *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaSetDoubleForDevice");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(d);
-}
-
-extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI
-cudaSetDoubleForHost(double *d) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(double *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaSetDoubleForHost");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(d);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaLaunchHostFunc(cudaStream_t stream,
-                                                         cudaHostFn_t fn,
-                                                         void *userData) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaStream_t, cudaHostFn_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaLaunchHostFunc");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(stream, fn, userData);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaOccupancyMaxActiveBlocksPerMultiprocessor(int *numBlocks, const void *func,
-                                              int blockSize,
-                                              size_t dynamicSMemSize) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(int *, const void *, int, size_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaOccupancyMaxActiveBlocksPerMultiprocessor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(numBlocks, func, blockSize, dynamicSMemSize);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(int *numBlocks,
-                                                       const void *func,
-                                                       int blockSize,
-                                                       size_t dynamicSMemSize,
-                                                       unsigned int flags) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(int *, const void *, int, size_t, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>(
-      "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(numBlocks, func, blockSize, dynamicSMemSize, flags);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMallocManaged(
-    void **devPtr, size_t size, unsigned int flags __dv(cudaMemAttachGlobal)) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void **, size_t, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMallocManaged");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(devPtr, size, flags);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaMalloc(void **devPtr, size_t size) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void **, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMalloc");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(devPtr, size);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMallocHost(void **ptr, size_t size) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void **, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMallocHost");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(ptr, size);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMallocPitch(void **devPtr,
-                                                      size_t *pitch,
-                                                      size_t width,
-                                                      size_t height) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void **, size_t *, size_t, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMallocPitch");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(devPtr, pitch, width, height);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMallocArray(
-    cudaArray_t *array, const struct cudaChannelFormatDesc *desc, size_t width,
-    size_t height __dv(0), unsigned int flags __dv(0)) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaArray_t *,
-                                           const struct cudaChannelFormatDesc *,
-                                           size_t, size_t, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMallocArray");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(array, desc, width, height, flags);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaFree(void *devPtr) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaFree");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(devPtr);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaFreeHost(void *ptr) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaFreeHost");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(ptr);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaFreeArray(cudaArray_t array) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaArray_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaFreeArray");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(array);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaFreeMipmappedArray(cudaMipmappedArray_t mipmappedArray) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaMipmappedArray_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaFreeMipmappedArray");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(mipmappedArray);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaHostAlloc(void **pHost, size_t size,
-                                                    unsigned int flags) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void **, size_t, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaHostAlloc");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pHost, size, flags);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaHostRegister(void *ptr, size_t size,
-                                                       unsigned int flags) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void *, size_t, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaHostRegister");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(ptr, size, flags);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaHostUnregister(void *ptr) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaHostUnregister");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(ptr);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaHostGetDevicePointer(void **pDevice, void *pHost, unsigned int flags) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void **, void *, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaHostGetDevicePointer");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pDevice, pHost, flags);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaHostGetFlags(unsigned int *pFlags,
-                                                       void *pHost) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(unsigned int *, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaHostGetFlags");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pFlags, pHost);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaMalloc3D(struct cudaPitchedPtr *pitchedDevPtr, struct cudaExtent extent) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(struct cudaPitchedPtr *, struct cudaExtent);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMalloc3D");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pitchedDevPtr, extent);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaMalloc3DArray(cudaArray_t *array, const struct cudaChannelFormatDesc *desc,
-                  struct cudaExtent extent, unsigned int flags __dv(0)) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaArray_t *,
-                                           const struct cudaChannelFormatDesc *,
-                                           struct cudaExtent, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMalloc3DArray");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(array, desc, extent, flags);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMallocMipmappedArray(
-    cudaMipmappedArray_t *mipmappedArray,
-    const struct cudaChannelFormatDesc *desc, struct cudaExtent extent,
-    unsigned int numLevels, unsigned int flags __dv(0)) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(
-      cudaMipmappedArray_t *, const struct cudaChannelFormatDesc *,
-      struct cudaExtent, unsigned int, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMallocMipmappedArray");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(mipmappedArray, desc, extent, numLevels, flags);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGetMipmappedArrayLevel(
-    cudaArray_t *levelArray, cudaMipmappedArray_const_t mipmappedArray,
-    unsigned int level) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(
-      cudaArray_t *, cudaMipmappedArray_const_t, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGetMipmappedArrayLevel");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(levelArray, mipmappedArray, level);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaMemcpy3D(const struct cudaMemcpy3DParms *p) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(const struct cudaMemcpy3DParms *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpy3D");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(p);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaMemcpy3DPeer(const struct cudaMemcpy3DPeerParms *p) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(const struct cudaMemcpy3DPeerParms *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpy3DPeer");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(p);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpy3DAsync(
-    const struct cudaMemcpy3DParms *p, cudaStream_t stream __dv(0)) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(const struct cudaMemcpy3DParms *, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpy3DAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(p, stream);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMemcpy3DPeerAsync(
-    const struct cudaMemcpy3DPeerParms *p, cudaStream_t stream __dv(0)) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(const struct cudaMemcpy3DPeerParms *,
-                                           cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpy3DPeerAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(p, stream);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMemGetInfo(size_t *free,
-                                                     size_t *total) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(size_t *, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemGetInfo");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(free, total);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaArrayGetInfo(struct cudaChannelFormatDesc *desc, struct cudaExtent *extent,
-                 unsigned int *flags, cudaArray_t array) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(struct cudaChannelFormatDesc *,
-                                           struct cudaExtent *, unsigned int *,
-                                           cudaArray_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaArrayGetInfo");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(desc, extent, flags, array);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMemcpy(void *dst, const void *src,
-                                                 size_t count,
-                                                 enum cudaMemcpyKind kind) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void *, const void *, size_t,
-                                           enum cudaMemcpyKind);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpy");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dst, src, count, kind);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMemcpyPeer(void *dst, int dstDevice,
-                                                     const void *src,
-                                                     int srcDevice,
-                                                     size_t count) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(void *, int, const void *, int, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpyPeer");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dst, dstDevice, src, srcDevice, count);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMemcpy2D(void *dst, size_t dpitch,
-                                                   const void *src,
-                                                   size_t spitch, size_t width,
-                                                   size_t height,
-                                                   enum cudaMemcpyKind kind) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void *, size_t, const void *, size_t,
-                                           size_t, size_t, enum cudaMemcpyKind);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpy2D");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dst, dpitch, src, spitch, width, height, kind);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMemcpy2DToArray(
-    cudaArray_t dst, size_t wOffset, size_t hOffset, const void *src,
-    size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaArray_t, size_t, size_t, const void *,
-                               size_t, size_t, size_t, enum cudaMemcpyKind);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpy2DToArray");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dst, wOffset, hOffset, src, spitch, width, height, kind);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMemcpy2DFromArray(
-    void *dst, size_t dpitch, cudaArray_const_t src, size_t wOffset,
-    size_t hOffset, size_t width, size_t height, enum cudaMemcpyKind kind) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(void *, size_t, cudaArray_const_t, size_t,
-                               size_t, size_t, size_t, enum cudaMemcpyKind);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpy2DFromArray");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dst, dpitch, src, wOffset, hOffset, width, height, kind);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMemcpy2DArrayToArray(
-    cudaArray_t dst, size_t wOffsetDst, size_t hOffsetDst,
-    cudaArray_const_t src, size_t wOffsetSrc, size_t hOffsetSrc, size_t width,
-    size_t height, enum cudaMemcpyKind kind __dv(cudaMemcpyDeviceToDevice)) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaArray_t, size_t, size_t,
-                                           cudaArray_const_t, size_t, size_t,
-                                           size_t, size_t, enum cudaMemcpyKind);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpy2DArrayToArray");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dst, wOffsetDst, hOffsetDst, src, wOffsetSrc, hOffsetSrc,
-                  width, height, kind);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMemcpyToSymbol(
-    const void *symbol, const void *src, size_t count, size_t offset __dv(0),
-    enum cudaMemcpyKind kind __dv(cudaMemcpyHostToDevice)) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(const void *, const void *, size_t,
-                                           size_t, enum cudaMemcpyKind);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpyToSymbol");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(symbol, src, count, offset, kind);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMemcpyFromSymbol(
-    void *dst, const void *symbol, size_t count, size_t offset __dv(0),
-    enum cudaMemcpyKind kind __dv(cudaMemcpyDeviceToHost)) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void *, const void *, size_t, size_t,
-                                           enum cudaMemcpyKind);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpyFromSymbol");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dst, symbol, count, offset, kind);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaMemcpyAsync(void *dst, const void *src, size_t count,
-                enum cudaMemcpyKind kind, cudaStream_t stream __dv(0)) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void *, const void *, size_t,
-                                           enum cudaMemcpyKind, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpyAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dst, src, count, kind, stream);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaMemcpyPeerAsync(void *dst, int dstDevice, const void *src, int srcDevice,
-                    size_t count, cudaStream_t stream __dv(0)) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void *, int, const void *, int,
-                                           size_t, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpyPeerAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dst, dstDevice, src, srcDevice, count, stream);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpy2DAsync(
-    void *dst, size_t dpitch, const void *src, size_t spitch, size_t width,
-    size_t height, enum cudaMemcpyKind kind, cudaStream_t stream __dv(0)) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(void *, size_t, const void *, size_t, size_t,
-                               size_t, enum cudaMemcpyKind, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpy2DAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dst, dpitch, src, spitch, width, height, kind, stream);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMemcpy2DToArrayAsync(
-    cudaArray_t dst, size_t wOffset, size_t hOffset, const void *src,
-    size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind,
-    cudaStream_t stream __dv(0)) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaArray_t, size_t, size_t,
-                                           const void *, size_t, size_t, size_t,
-                                           enum cudaMemcpyKind, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpy2DToArrayAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dst, wOffset, hOffset, src, spitch, width, height, kind,
-                  stream);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMemcpy2DFromArrayAsync(
-    void *dst, size_t dpitch, cudaArray_const_t src, size_t wOffset,
-    size_t hOffset, size_t width, size_t height, enum cudaMemcpyKind kind,
-    cudaStream_t stream __dv(0)) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void *, size_t, cudaArray_const_t,
-                                           size_t, size_t, size_t, size_t,
-                                           enum cudaMemcpyKind, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpy2DFromArrayAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dst, dpitch, src, wOffset, hOffset, width, height, kind,
-                  stream);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMemcpyToSymbolAsync(
-    const void *symbol, const void *src, size_t count, size_t offset,
-    enum cudaMemcpyKind kind, cudaStream_t stream __dv(0)) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(const void *, const void *, size_t, size_t,
-                               enum cudaMemcpyKind, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpyToSymbolAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(symbol, src, count, offset, kind, stream);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMemcpyFromSymbolAsync(
-    void *dst, const void *symbol, size_t count, size_t offset,
-    enum cudaMemcpyKind kind, cudaStream_t stream __dv(0)) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void *, const void *, size_t, size_t,
-                                           enum cudaMemcpyKind, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpyFromSymbolAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dst, symbol, count, offset, kind, stream);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMemset(void *devPtr, int value,
-                                                 size_t count) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void *, int, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemset");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(devPtr, value, count);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMemset2D(void *devPtr, size_t pitch,
-                                                   int value, size_t width,
-                                                   size_t height) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void *, size_t, int, size_t, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemset2D");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(devPtr, pitch, value, width, height);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMemset3D(
-    struct cudaPitchedPtr pitchedDevPtr, int value, struct cudaExtent extent) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(struct cudaPitchedPtr, int, struct cudaExtent);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemset3D");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pitchedDevPtr, value, extent);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemsetAsync(
-    void *devPtr, int value, size_t count, cudaStream_t stream __dv(0)) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void *, int, size_t, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemsetAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(devPtr, value, count, stream);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaMemset2DAsync(void *devPtr, size_t pitch, int value, size_t width,
-                  size_t height, cudaStream_t stream __dv(0)) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void *, size_t, int, size_t, size_t,
-                                           cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemset2DAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(devPtr, pitch, value, width, height, stream);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaMemset3DAsync(struct cudaPitchedPtr pitchedDevPtr, int value,
-                  struct cudaExtent extent, cudaStream_t stream __dv(0)) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(struct cudaPitchedPtr, int,
-                                           struct cudaExtent, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemset3DAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pitchedDevPtr, value, extent, stream);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGetSymbolAddress(void **devPtr,
-                                                           const void *symbol) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void **, const void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGetSymbolAddress");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(devPtr, symbol);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGetSymbolSize(size_t *size,
-                                                        const void *symbol) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(size_t *, const void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGetSymbolSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(size, symbol);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaMemPrefetchAsync(const void *devPtr, size_t count, int dstDevice,
-                     cudaStream_t stream __dv(0)) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(const void *, size_t, int, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemPrefetchAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(devPtr, count, dstDevice, stream);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaMemAdvise(const void *devPtr, size_t count, enum cudaMemoryAdvise advice,
-              int device) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(const void *, size_t,
-                                           enum cudaMemoryAdvise, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemAdvise");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(devPtr, count, advice, device);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMemRangeGetAttribute(
-    void *data, size_t dataSize, enum cudaMemRangeAttribute attribute,
-    const void *devPtr, size_t count) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(
-      void *, size_t, enum cudaMemRangeAttribute, const void *, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemRangeGetAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(data, dataSize, attribute, devPtr, count);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMemRangeGetAttributes(
-    void **data, size_t *dataSizes, enum cudaMemRangeAttribute *attributes,
-    size_t numAttributes, const void *devPtr, size_t count) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(void **, size_t *, enum cudaMemRangeAttribute *,
-                               size_t, const void *, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemRangeGetAttributes");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(data, dataSizes, attributes, numAttributes, devPtr, count);
-}
-
-extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI
-cudaMemcpyToArray(cudaArray_t dst, size_t wOffset, size_t hOffset,
-                  const void *src, size_t count, enum cudaMemcpyKind kind) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(
-      cudaArray_t, size_t, size_t, const void *, size_t, enum cudaMemcpyKind);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpyToArray");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dst, wOffset, hOffset, src, count, kind);
-}
-
-extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI
-cudaMemcpyFromArray(void *dst, cudaArray_const_t src, size_t wOffset,
-                    size_t hOffset, size_t count, enum cudaMemcpyKind kind) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void *, cudaArray_const_t, size_t,
-                                           size_t, size_t, enum cudaMemcpyKind);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpyFromArray");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dst, src, wOffset, hOffset, count, kind);
-}
-
-extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaMemcpyArrayToArray(
-    cudaArray_t dst, size_t wOffsetDst, size_t hOffsetDst,
-    cudaArray_const_t src, size_t wOffsetSrc, size_t hOffsetSrc, size_t count,
-    enum cudaMemcpyKind kind __dv(cudaMemcpyDeviceToDevice)) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaArray_t, size_t, size_t, cudaArray_const_t,
-                               size_t, size_t, size_t, enum cudaMemcpyKind);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpyArrayToArray");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dst, wOffsetDst, hOffsetDst, src, wOffsetSrc, hOffsetSrc,
-                  count, kind);
-}
-
-extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaMemcpyToArrayAsync(
-    cudaArray_t dst, size_t wOffset, size_t hOffset, const void *src,
-    size_t count, enum cudaMemcpyKind kind, cudaStream_t stream __dv(0)) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaArray_t, size_t, size_t, const void *,
-                               size_t, enum cudaMemcpyKind, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpyToArrayAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dst, wOffset, hOffset, src, count, kind, stream);
-}
-
-extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI
-cudaMemcpyFromArrayAsync(void *dst, cudaArray_const_t src, size_t wOffset,
-                         size_t hOffset, size_t count, enum cudaMemcpyKind kind,
-                         cudaStream_t stream __dv(0)) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(void *, cudaArray_const_t, size_t, size_t,
-                               size_t, enum cudaMemcpyKind, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpyFromArrayAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dst, src, wOffset, hOffset, count, kind, stream);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaPointerGetAttributes(
-    struct cudaPointerAttributes *attributes, const void *ptr) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(struct cudaPointerAttributes *, const void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaPointerGetAttributes");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(attributes, ptr);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaDeviceCanAccessPeer(int *canAccessPeer, int device, int peerDevice) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(int *, int, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaDeviceCanAccessPeer");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(canAccessPeer, device, peerDevice);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaDeviceEnablePeerAccess(int peerDevice, unsigned int flags) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(int, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaDeviceEnablePeerAccess");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(peerDevice, flags);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaDeviceDisablePeerAccess(int peerDevice) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaDeviceDisablePeerAccess");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(peerDevice);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaGraphicsUnregisterResource(cudaGraphicsResource_t resource) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraphicsResource_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphicsUnregisterResource");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(resource);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphicsResourceSetMapFlags(
-    cudaGraphicsResource_t resource, unsigned int flags) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaGraphicsResource_t, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphicsResourceSetMapFlags");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(resource, flags);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphicsMapResources(
-    int count, cudaGraphicsResource_t *resources, cudaStream_t stream __dv(0)) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(int, cudaGraphicsResource_t *, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphicsMapResources");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(count, resources, stream);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphicsUnmapResources(
-    int count, cudaGraphicsResource_t *resources, cudaStream_t stream __dv(0)) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(int, cudaGraphicsResource_t *, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphicsUnmapResources");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(count, resources, stream);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphicsResourceGetMappedPointer(
-    void **devPtr, size_t *size, cudaGraphicsResource_t resource) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(void **, size_t *, cudaGraphicsResource_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaGraphicsResourceGetMappedPointer");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(devPtr, size, resource);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphicsSubResourceGetMappedArray(
-    cudaArray_t *array, cudaGraphicsResource_t resource,
-    unsigned int arrayIndex, unsigned int mipLevel) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(
-      cudaArray_t *, cudaGraphicsResource_t, unsigned int, unsigned int);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaGraphicsSubResourceGetMappedArray");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(array, resource, arrayIndex, mipLevel);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaGraphicsResourceGetMappedMipmappedArray(
-    cudaMipmappedArray_t *mipmappedArray, cudaGraphicsResource_t resource) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaMipmappedArray_t *, cudaGraphicsResource_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaGraphicsResourceGetMappedMipmappedArray");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(mipmappedArray, resource);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaBindTexture(
-    size_t *offset, const struct textureReference *texref, const void *devPtr,
-    const struct cudaChannelFormatDesc *desc, size_t size __dv(UINT_MAX)) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(
-      size_t *, const struct textureReference *, const void *,
-      const struct cudaChannelFormatDesc *, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaBindTexture");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(offset, texref, devPtr, desc, size);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaBindTexture2D(size_t *offset, const struct textureReference *texref,
-                  const void *devPtr, const struct cudaChannelFormatDesc *desc,
-                  size_t width, size_t height, size_t pitch) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(
-      size_t *, const struct textureReference *, const void *,
-      const struct cudaChannelFormatDesc *, size_t, size_t, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaBindTexture2D");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(offset, texref, devPtr, desc, width, height, pitch);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaBindTextureToArray(
-    const struct textureReference *texref, cudaArray_const_t array,
-    const struct cudaChannelFormatDesc *desc) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(
-      const struct textureReference *, cudaArray_const_t,
-      const struct cudaChannelFormatDesc *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaBindTextureToArray");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(texref, array, desc);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaBindTextureToMipmappedArray(const struct textureReference *texref,
-                                cudaMipmappedArray_const_t mipmappedArray,
-                                const struct cudaChannelFormatDesc *desc) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(
-      const struct textureReference *, cudaMipmappedArray_const_t,
-      const struct cudaChannelFormatDesc *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaBindTextureToMipmappedArray");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(texref, mipmappedArray, desc);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaUnbindTexture(const struct textureReference *texref) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(const struct textureReference *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaUnbindTexture");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(texref);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGetTextureAlignmentOffset(
-    size_t *offset, const struct textureReference *texref) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(size_t *, const struct textureReference *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGetTextureAlignmentOffset");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(offset, texref);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGetTextureReference(
-    const struct textureReference **texref, const void *symbol) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(const struct textureReference **, const void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGetTextureReference");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(texref, symbol);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaBindSurfaceToArray(
-    const struct surfaceReference *surfref, cudaArray_const_t array,
-    const struct cudaChannelFormatDesc *desc) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(
-      const struct surfaceReference *, cudaArray_const_t,
-      const struct cudaChannelFormatDesc *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaBindSurfaceToArray");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(surfref, array, desc);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGetSurfaceReference(
-    const struct surfaceReference **surfref, const void *symbol) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(const struct surfaceReference **, const void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGetSurfaceReference");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(surfref, symbol);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGetChannelDesc(
-    struct cudaChannelFormatDesc *desc, cudaArray_const_t array) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(struct cudaChannelFormatDesc *,
-                                           cudaArray_const_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGetChannelDesc");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(desc, array);
-}
-
-extern __host__ struct cudaChannelFormatDesc CUDARTAPI cudaCreateChannelDesc(
-    int x, int y, int z, int w, enum cudaChannelFormatKind f) {
-  using FuncPtr = struct cudaChannelFormatDesc(CUDARTAPI *)(
-      int, int, int, int, enum cudaChannelFormatKind);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaCreateChannelDesc");
-  if (!func_ptr) {
-    return cudaChannelFormatDesc{cudaChannelFormatKind(-1), 0, 0, 0};
-  }
-  return func_ptr(x, y, z, w, f);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaCreateTextureObject(
-    cudaTextureObject_t *pTexObject, const struct cudaResourceDesc *pResDesc,
-    const struct cudaTextureDesc *pTexDesc,
-    const struct cudaResourceViewDesc *pResViewDesc) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(
-      cudaTextureObject_t *, const struct cudaResourceDesc *,
-      const struct cudaTextureDesc *, const struct cudaResourceViewDesc *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaCreateTextureObject");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pTexObject, pResDesc, pTexDesc, pResViewDesc);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaDestroyTextureObject(cudaTextureObject_t texObject) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaTextureObject_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaDestroyTextureObject");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(texObject);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGetTextureObjectResourceDesc(
-    struct cudaResourceDesc *pResDesc, cudaTextureObject_t texObject) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(struct cudaResourceDesc *, cudaTextureObject_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaGetTextureObjectResourceDesc");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pResDesc, texObject);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGetTextureObjectTextureDesc(
-    struct cudaTextureDesc *pTexDesc, cudaTextureObject_t texObject) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(struct cudaTextureDesc *, cudaTextureObject_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGetTextureObjectTextureDesc");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pTexDesc, texObject);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGetTextureObjectResourceViewDesc(
-    struct cudaResourceViewDesc *pResViewDesc, cudaTextureObject_t texObject) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(struct cudaResourceViewDesc *,
-                                           cudaTextureObject_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaGetTextureObjectResourceViewDesc");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pResViewDesc, texObject);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaCreateSurfaceObject(
-    cudaSurfaceObject_t *pSurfObject, const struct cudaResourceDesc *pResDesc) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaSurfaceObject_t *,
-                                           const struct cudaResourceDesc *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaCreateSurfaceObject");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pSurfObject, pResDesc);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaDestroySurfaceObject(cudaSurfaceObject_t surfObject) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaSurfaceObject_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaDestroySurfaceObject");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(surfObject);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGetSurfaceObjectResourceDesc(
-    struct cudaResourceDesc *pResDesc, cudaSurfaceObject_t surfObject) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(struct cudaResourceDesc *, cudaSurfaceObject_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaGetSurfaceObjectResourceDesc");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pResDesc, surfObject);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaDriverGetVersion(int *driverVersion) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaDriverGetVersion");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(driverVersion);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaRuntimeGetVersion(int *runtimeVersion) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaRuntimeGetVersion");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(runtimeVersion);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphCreate(cudaGraph_t *pGraph,
-                                                      unsigned int flags) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraph_t *, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphCreate");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pGraph, flags);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphAddKernelNode(
-    cudaGraphNode_t *pGraphNode, cudaGraph_t graph,
-    const cudaGraphNode_t *pDependencies, size_t numDependencies,
-    const struct cudaKernelNodeParams *pNodeParams) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraphNode_t *, cudaGraph_t,
-                                           const cudaGraphNode_t *, size_t,
-                                           const struct cudaKernelNodeParams *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphAddKernelNode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pGraphNode, graph, pDependencies, numDependencies,
-                  pNodeParams);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphKernelNodeGetParams(
-    cudaGraphNode_t node, struct cudaKernelNodeParams *pNodeParams) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaGraphNode_t, struct cudaKernelNodeParams *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphKernelNodeGetParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(node, pNodeParams);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphKernelNodeSetParams(
-    cudaGraphNode_t node, const struct cudaKernelNodeParams *pNodeParams) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraphNode_t,
-                                           const struct cudaKernelNodeParams *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphKernelNodeSetParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(node, pNodeParams);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphAddMemcpyNode(
-    cudaGraphNode_t *pGraphNode, cudaGraph_t graph,
-    const cudaGraphNode_t *pDependencies, size_t numDependencies,
-    const struct cudaMemcpy3DParms *pCopyParams) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraphNode_t *, cudaGraph_t,
-                                           const cudaGraphNode_t *, size_t,
-                                           const struct cudaMemcpy3DParms *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphAddMemcpyNode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pGraphNode, graph, pDependencies, numDependencies,
-                  pCopyParams);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphMemcpyNodeGetParams(
-    cudaGraphNode_t node, struct cudaMemcpy3DParms *pNodeParams) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaGraphNode_t, struct cudaMemcpy3DParms *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphMemcpyNodeGetParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(node, pNodeParams);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphMemcpyNodeSetParams(
-    cudaGraphNode_t node, const struct cudaMemcpy3DParms *pNodeParams) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraphNode_t,
-                                           const struct cudaMemcpy3DParms *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphMemcpyNodeSetParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(node, pNodeParams);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphAddMemsetNode(
-    cudaGraphNode_t *pGraphNode, cudaGraph_t graph,
-    const cudaGraphNode_t *pDependencies, size_t numDependencies,
-    const struct cudaMemsetParams *pMemsetParams) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraphNode_t *, cudaGraph_t,
-                                           const cudaGraphNode_t *, size_t,
-                                           const struct cudaMemsetParams *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphAddMemsetNode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pGraphNode, graph, pDependencies, numDependencies,
-                  pMemsetParams);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphMemsetNodeGetParams(
-    cudaGraphNode_t node, struct cudaMemsetParams *pNodeParams) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaGraphNode_t, struct cudaMemsetParams *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphMemsetNodeGetParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(node, pNodeParams);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphMemsetNodeSetParams(
-    cudaGraphNode_t node, const struct cudaMemsetParams *pNodeParams) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraphNode_t,
-                                           const struct cudaMemsetParams *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphMemsetNodeSetParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(node, pNodeParams);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphAddHostNode(
-    cudaGraphNode_t *pGraphNode, cudaGraph_t graph,
-    const cudaGraphNode_t *pDependencies, size_t numDependencies,
-    const struct cudaHostNodeParams *pNodeParams) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraphNode_t *, cudaGraph_t,
-                                           const cudaGraphNode_t *, size_t,
-                                           const struct cudaHostNodeParams *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphAddHostNode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pGraphNode, graph, pDependencies, numDependencies,
-                  pNodeParams);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphHostNodeGetParams(
-    cudaGraphNode_t node, struct cudaHostNodeParams *pNodeParams) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaGraphNode_t, struct cudaHostNodeParams *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphHostNodeGetParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(node, pNodeParams);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphHostNodeSetParams(
-    cudaGraphNode_t node, const struct cudaHostNodeParams *pNodeParams) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraphNode_t,
-                                           const struct cudaHostNodeParams *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphHostNodeSetParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(node, pNodeParams);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaGraphAddChildGraphNode(cudaGraphNode_t *pGraphNode, cudaGraph_t graph,
-                           const cudaGraphNode_t *pDependencies,
-                           size_t numDependencies, cudaGraph_t childGraph) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaGraphNode_t *, cudaGraph_t,
-                               const cudaGraphNode_t *, size_t, cudaGraph_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphAddChildGraphNode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pGraphNode, graph, pDependencies, numDependencies,
-                  childGraph);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaGraphChildGraphNodeGetGraph(cudaGraphNode_t node, cudaGraph_t *pGraph) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraphNode_t, cudaGraph_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphChildGraphNodeGetGraph");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(node, pGraph);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphAddEmptyNode(
-    cudaGraphNode_t *pGraphNode, cudaGraph_t graph,
-    const cudaGraphNode_t *pDependencies, size_t numDependencies) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraphNode_t *, cudaGraph_t,
-                                           const cudaGraphNode_t *, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphAddEmptyNode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pGraphNode, graph, pDependencies, numDependencies);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaGraphClone(cudaGraph_t *pGraphClone, cudaGraph_t originalGraph) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraph_t *, cudaGraph_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphClone");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pGraphClone, originalGraph);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaGraphNodeFindInClone(cudaGraphNode_t *pNode, cudaGraphNode_t originalNode,
-                         cudaGraph_t clonedGraph) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaGraphNode_t *, cudaGraphNode_t, cudaGraph_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphNodeFindInClone");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pNode, originalNode, clonedGraph);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaGraphNodeGetType(cudaGraphNode_t node, enum cudaGraphNodeType *pType) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaGraphNode_t, enum cudaGraphNodeType *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphNodeGetType");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(node, pType);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphGetNodes(cudaGraph_t graph,
-                                                        cudaGraphNode_t *nodes,
-                                                        size_t *numNodes) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaGraph_t, cudaGraphNode_t *, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphGetNodes");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(graph, nodes, numNodes);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphGetRootNodes(
-    cudaGraph_t graph, cudaGraphNode_t *pRootNodes, size_t *pNumRootNodes) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaGraph_t, cudaGraphNode_t *, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphGetRootNodes");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(graph, pRootNodes, pNumRootNodes);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphGetEdges(cudaGraph_t graph,
-                                                        cudaGraphNode_t *from,
-                                                        cudaGraphNode_t *to,
-                                                        size_t *numEdges) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraph_t, cudaGraphNode_t *,
-                                           cudaGraphNode_t *, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphGetEdges");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(graph, from, to, numEdges);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphNodeGetDependencies(
-    cudaGraphNode_t node, cudaGraphNode_t *pDependencies,
-    size_t *pNumDependencies) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaGraphNode_t, cudaGraphNode_t *, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphNodeGetDependencies");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(node, pDependencies, pNumDependencies);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphNodeGetDependentNodes(
-    cudaGraphNode_t node, cudaGraphNode_t *pDependentNodes,
-    size_t *pNumDependentNodes) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaGraphNode_t, cudaGraphNode_t *, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphNodeGetDependentNodes");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(node, pDependentNodes, pNumDependentNodes);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaGraphAddDependencies(cudaGraph_t graph, const cudaGraphNode_t *from,
-                         const cudaGraphNode_t *to, size_t numDependencies) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraph_t, const cudaGraphNode_t *,
-                                           const cudaGraphNode_t *, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphAddDependencies");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(graph, from, to, numDependencies);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaGraphRemoveDependencies(cudaGraph_t graph, const cudaGraphNode_t *from,
-                            const cudaGraphNode_t *to, size_t numDependencies) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraph_t, const cudaGraphNode_t *,
-                                           const cudaGraphNode_t *, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphRemoveDependencies");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(graph, from, to, numDependencies);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaGraphDestroyNode(cudaGraphNode_t node) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraphNode_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphDestroyNode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(node);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphInstantiate(
-    cudaGraphExec_t *pGraphExec, cudaGraph_t graph, cudaGraphNode_t *pErrorNode,
-    char *pLogBuffer, size_t bufferSize) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraphExec_t *, cudaGraph_t,
-                                           cudaGraphNode_t *, char *, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphInstantiate");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pGraphExec, graph, pErrorNode, pLogBuffer, bufferSize);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphExecKernelNodeSetParams(
-    cudaGraphExec_t hGraphExec, cudaGraphNode_t node,
-    const struct cudaKernelNodeParams *pNodeParams) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraphExec_t, cudaGraphNode_t,
-                                           const struct cudaKernelNodeParams *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaGraphExecKernelNodeSetParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hGraphExec, node, pNodeParams);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphLaunch(cudaGraphExec_t graphExec,
-                                                      cudaStream_t stream) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraphExec_t, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphLaunch");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(graphExec, stream);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaGraphExecDestroy(cudaGraphExec_t graphExec) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraphExec_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphExecDestroy");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(graphExec);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphDestroy(cudaGraph_t graph) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraph_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphDestroy");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(graph);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGetExportTable(
-    const void **ppExportTable, const cudaUUID_t *pExportTableId) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(const void **, const cudaUUID_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGetExportTable");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(ppExportTable, pExportTableId);
-}
-
-}  // extern "C"
diff --git a/third_party/xla/third_party/tsl/tsl/cuda/cuda_runtime_10_2.inc b/third_party/xla/third_party/tsl/tsl/cuda/cuda_runtime_10_2.inc
deleted file mode 100644
index a5a5438b0e5472..00000000000000
--- a/third_party/xla/third_party/tsl/tsl/cuda/cuda_runtime_10_2.inc
+++ /dev/null
@@ -1,1907 +0,0 @@
-// Auto-generated, do not edit.
-
-extern "C" {
-
-extern __host__ cudaError_t CUDARTAPI cudaDeviceReset(void) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)();
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaDeviceReset");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr();
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaDeviceSynchronize(void) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)();
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaDeviceSynchronize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr();
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaDeviceSetLimit(enum cudaLimit limit,
-                                                         size_t value) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(enum cudaLimit, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaDeviceSetLimit");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(limit, value);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaDeviceGetLimit(size_t *pValue, enum cudaLimit limit) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(size_t *, enum cudaLimit);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaDeviceGetLimit");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pValue, limit);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaDeviceGetCacheConfig(enum cudaFuncCache *pCacheConfig) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(enum cudaFuncCache *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaDeviceGetCacheConfig");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pCacheConfig);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaDeviceGetStreamPriorityRange(int *leastPriority, int *greatestPriority) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(int *, int *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaDeviceGetStreamPriorityRange");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(leastPriority, greatestPriority);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaDeviceSetCacheConfig(enum cudaFuncCache cacheConfig) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(enum cudaFuncCache);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaDeviceSetCacheConfig");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(cacheConfig);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaDeviceGetSharedMemConfig(enum cudaSharedMemConfig *pConfig) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(enum cudaSharedMemConfig *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaDeviceGetSharedMemConfig");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pConfig);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaDeviceSetSharedMemConfig(enum cudaSharedMemConfig config) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(enum cudaSharedMemConfig);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaDeviceSetSharedMemConfig");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(config);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaDeviceGetByPCIBusId(int *device, const char *pciBusId) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(int *, const char *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaDeviceGetByPCIBusId");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(device, pciBusId);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaDeviceGetPCIBusId(char *pciBusId,
-                                                            int len,
-                                                            int device) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(char *, int, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaDeviceGetPCIBusId");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pciBusId, len, device);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaIpcGetEventHandle(cudaIpcEventHandle_t *handle, cudaEvent_t event) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaIpcEventHandle_t *, cudaEvent_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaIpcGetEventHandle");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, event);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaIpcOpenEventHandle(cudaEvent_t *event, cudaIpcEventHandle_t handle) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaEvent_t *, cudaIpcEventHandle_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaIpcOpenEventHandle");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(event, handle);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaIpcGetMemHandle(cudaIpcMemHandle_t *handle, void *devPtr) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaIpcMemHandle_t *, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaIpcGetMemHandle");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, devPtr);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaIpcOpenMemHandle(
-    void **devPtr, cudaIpcMemHandle_t handle, unsigned int flags) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(void **, cudaIpcMemHandle_t, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaIpcOpenMemHandle");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(devPtr, handle, flags);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaIpcCloseMemHandle(void *devPtr) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaIpcCloseMemHandle");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(devPtr);
-}
-
-extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaThreadExit(void) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)();
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaThreadExit");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr();
-}
-
-extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI
-cudaThreadSynchronize(void) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)();
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaThreadSynchronize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr();
-}
-
-extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI
-cudaThreadSetLimit(enum cudaLimit limit, size_t value) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(enum cudaLimit, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaThreadSetLimit");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(limit, value);
-}
-
-extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI
-cudaThreadGetLimit(size_t *pValue, enum cudaLimit limit) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(size_t *, enum cudaLimit);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaThreadGetLimit");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pValue, limit);
-}
-
-extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI
-cudaThreadGetCacheConfig(enum cudaFuncCache *pCacheConfig) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(enum cudaFuncCache *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaThreadGetCacheConfig");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pCacheConfig);
-}
-
-extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI
-cudaThreadSetCacheConfig(enum cudaFuncCache cacheConfig) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(enum cudaFuncCache);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaThreadSetCacheConfig");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(cacheConfig);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaGetLastError(void) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)();
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGetLastError");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr();
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaPeekAtLastError(void) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)();
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaPeekAtLastError");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr();
-}
-
-extern __host__ __cudart_builtin__ const char *CUDARTAPI
-cudaGetErrorName(cudaError_t error) {
-  using FuncPtr = const char *(CUDARTAPI *)(cudaError_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGetErrorName");
-  if (!func_ptr) return "cudaGetErrorName symbol not found.";
-  return func_ptr(error);
-}
-
-extern __host__ __cudart_builtin__ const char *CUDARTAPI
-cudaGetErrorString(cudaError_t error) {
-  using FuncPtr = const char *(CUDARTAPI *)(cudaError_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGetErrorString");
-  if (!func_ptr) return "cudaGetErrorString symbol not found.";
-  return func_ptr(error);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaGetDeviceCount(int *count) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGetDeviceCount");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(count);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaGetDeviceProperties(struct cudaDeviceProp *prop, int device) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(struct cudaDeviceProp *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGetDeviceProperties");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(prop, device);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaDeviceGetAttribute(int *value, enum cudaDeviceAttr attr, int device) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(int *, enum cudaDeviceAttr, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaDeviceGetAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(value, attr, device);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaDeviceGetNvSciSyncAttributes(
-    void *nvSciSyncAttrList, int device, int flags) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void *, int, int);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaDeviceGetNvSciSyncAttributes");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(nvSciSyncAttrList, device, flags);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaDeviceGetP2PAttribute(int *value, enum cudaDeviceP2PAttr attr,
-                          int srcDevice, int dstDevice) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(int *, enum cudaDeviceP2PAttr, int, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaDeviceGetP2PAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(value, attr, srcDevice, dstDevice);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaChooseDevice(int *device, const struct cudaDeviceProp *prop) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(int *, const struct cudaDeviceProp *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaChooseDevice");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(device, prop);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaSetDevice(int device) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaSetDevice");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(device);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaGetDevice(int *device) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGetDevice");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(device);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaSetValidDevices(int *device_arr,
-                                                          int len) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(int *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaSetValidDevices");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(device_arr, len);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaSetDeviceFlags(unsigned int flags) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaSetDeviceFlags");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(flags);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGetDeviceFlags(unsigned int *flags) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(unsigned int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGetDeviceFlags");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(flags);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaStreamCreate(cudaStream_t *pStream) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaStream_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaStreamCreate");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pStream);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaStreamCreateWithFlags(cudaStream_t *pStream, unsigned int flags) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaStream_t *, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaStreamCreateWithFlags");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pStream, flags);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaStreamCreateWithPriority(cudaStream_t *pStream, unsigned int flags,
-                             int priority) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaStream_t *, unsigned int, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaStreamCreateWithPriority");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pStream, flags, priority);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaStreamGetPriority(cudaStream_t hStream, int *priority) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaStream_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaStreamGetPriority");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hStream, priority);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaStreamGetFlags(cudaStream_t hStream, unsigned int *flags) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaStream_t, unsigned int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaStreamGetFlags");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hStream, flags);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaStreamDestroy(cudaStream_t stream) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaStreamDestroy");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(stream);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamWaitEvent(
-    cudaStream_t stream, cudaEvent_t event, unsigned int flags) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaStream_t, cudaEvent_t, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaStreamWaitEvent");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(stream, event, flags);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaStreamAddCallback(cudaStream_t stream, cudaStreamCallback_t callback,
-                      void *userData, unsigned int flags) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaStream_t, cudaStreamCallback_t,
-                                           void *, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaStreamAddCallback");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(stream, callback, userData, flags);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaStreamSynchronize(cudaStream_t stream) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaStreamSynchronize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(stream);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaStreamQuery(cudaStream_t stream) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaStreamQuery");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(stream);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaStreamAttachMemAsync(cudaStream_t stream, void *devPtr,
-                         size_t length __dv(0),
-                         unsigned int flags __dv(cudaMemAttachSingle)) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaStream_t, void *, size_t, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaStreamAttachMemAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(stream, devPtr, length, flags);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaStreamBeginCapture(cudaStream_t stream, enum cudaStreamCaptureMode mode) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaStream_t, enum cudaStreamCaptureMode);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaStreamBeginCapture");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(stream, mode);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaThreadExchangeStreamCaptureMode(enum cudaStreamCaptureMode *mode) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(enum cudaStreamCaptureMode *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaThreadExchangeStreamCaptureMode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(mode);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaStreamEndCapture(cudaStream_t stream, cudaGraph_t *pGraph) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaStream_t, cudaGraph_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaStreamEndCapture");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(stream, pGraph);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaStreamIsCapturing(
-    cudaStream_t stream, enum cudaStreamCaptureStatus *pCaptureStatus) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaStream_t, enum cudaStreamCaptureStatus *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaStreamIsCapturing");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(stream, pCaptureStatus);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaStreamGetCaptureInfo(
-    cudaStream_t stream, enum cudaStreamCaptureStatus *pCaptureStatus,
-    unsigned long long *pId) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(
-      cudaStream_t, enum cudaStreamCaptureStatus *, unsigned long long *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaStreamGetCaptureInfo");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(stream, pCaptureStatus, pId);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaEventCreate(cudaEvent_t *event) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaEvent_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaEventCreate");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(event);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaEventCreateWithFlags(cudaEvent_t *event, unsigned int flags) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaEvent_t *, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaEventCreateWithFlags");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(event, flags);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaEventRecord(cudaEvent_t event, cudaStream_t stream __dv(0)) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaEvent_t, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaEventRecord");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(event, stream);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaEventQuery(cudaEvent_t event) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaEvent_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaEventQuery");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(event);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaEventSynchronize(cudaEvent_t event) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaEvent_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaEventSynchronize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(event);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaEventDestroy(cudaEvent_t event) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaEvent_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaEventDestroy");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(event);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaEventElapsedTime(float *ms,
-                                                           cudaEvent_t start,
-                                                           cudaEvent_t end) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(float *, cudaEvent_t, cudaEvent_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaEventElapsedTime");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(ms, start, end);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaImportExternalMemory(
-    cudaExternalMemory_t *extMem_out,
-    const struct cudaExternalMemoryHandleDesc *memHandleDesc) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(
-      cudaExternalMemory_t *, const struct cudaExternalMemoryHandleDesc *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaImportExternalMemory");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(extMem_out, memHandleDesc);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaExternalMemoryGetMappedBuffer(
-    void **devPtr, cudaExternalMemory_t extMem,
-    const struct cudaExternalMemoryBufferDesc *bufferDesc) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(void **, cudaExternalMemory_t,
-                               const struct cudaExternalMemoryBufferDesc *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaExternalMemoryGetMappedBuffer");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(devPtr, extMem, bufferDesc);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaExternalMemoryGetMappedMipmappedArray(
-    cudaMipmappedArray_t *mipmap, cudaExternalMemory_t extMem,
-    const struct cudaExternalMemoryMipmappedArrayDesc *mipmapDesc) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(
-      cudaMipmappedArray_t *, cudaExternalMemory_t,
-      const struct cudaExternalMemoryMipmappedArrayDesc *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaExternalMemoryGetMappedMipmappedArray");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(mipmap, extMem, mipmapDesc);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaDestroyExternalMemory(cudaExternalMemory_t extMem) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaExternalMemory_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaDestroyExternalMemory");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(extMem);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaImportExternalSemaphore(
-    cudaExternalSemaphore_t *extSem_out,
-    const struct cudaExternalSemaphoreHandleDesc *semHandleDesc) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaExternalSemaphore_t *,
-                               const struct cudaExternalSemaphoreHandleDesc *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaImportExternalSemaphore");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(extSem_out, semHandleDesc);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaSignalExternalSemaphoresAsync(
-    const cudaExternalSemaphore_t *extSemArray,
-    const struct cudaExternalSemaphoreSignalParams *paramsArray,
-    unsigned int numExtSems, cudaStream_t stream __dv(0)) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(const cudaExternalSemaphore_t *,
-                               const struct cudaExternalSemaphoreSignalParams *,
-                               unsigned int, cudaStream_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaSignalExternalSemaphoresAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(extSemArray, paramsArray, numExtSems, stream);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaWaitExternalSemaphoresAsync(
-    const cudaExternalSemaphore_t *extSemArray,
-    const struct cudaExternalSemaphoreWaitParams *paramsArray,
-    unsigned int numExtSems, cudaStream_t stream __dv(0)) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(const cudaExternalSemaphore_t *,
-                               const struct cudaExternalSemaphoreWaitParams *,
-                               unsigned int, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaWaitExternalSemaphoresAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(extSemArray, paramsArray, numExtSems, stream);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaDestroyExternalSemaphore(cudaExternalSemaphore_t extSem) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaExternalSemaphore_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaDestroyExternalSemaphore");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(extSem);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim, void **args,
-                 size_t sharedMem, cudaStream_t stream) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(const void *, dim3, dim3, void **,
-                                           size_t, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaLaunchKernel");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(func, gridDim, blockDim, args, sharedMem, stream);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaLaunchCooperativeKernel(
-    const void *func, dim3 gridDim, dim3 blockDim, void **args,
-    size_t sharedMem, cudaStream_t stream) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(const void *, dim3, dim3, void **,
-                                           size_t, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaLaunchCooperativeKernel");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(func, gridDim, blockDim, args, sharedMem, stream);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaLaunchCooperativeKernelMultiDevice(
-    struct cudaLaunchParams *launchParamsList, unsigned int numDevices,
-    unsigned int flags __dv(0)) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(struct cudaLaunchParams *,
-                                           unsigned int, unsigned int);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaLaunchCooperativeKernelMultiDevice");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(launchParamsList, numDevices, flags);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaFuncSetCacheConfig(const void *func, enum cudaFuncCache cacheConfig) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(const void *, enum cudaFuncCache);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaFuncSetCacheConfig");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(func, cacheConfig);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaFuncSetSharedMemConfig(const void *func, enum cudaSharedMemConfig config) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(const void *, enum cudaSharedMemConfig);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaFuncSetSharedMemConfig");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(func, config);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaFuncGetAttributes(struct cudaFuncAttributes *attr, const void *func) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(struct cudaFuncAttributes *, const void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaFuncGetAttributes");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(attr, func);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaFuncSetAttribute(const void *func, enum cudaFuncAttribute attr, int value) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(const void *, enum cudaFuncAttribute, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaFuncSetAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(func, attr, value);
-}
-
-extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI
-cudaSetDoubleForDevice(double *d) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(double *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaSetDoubleForDevice");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(d);
-}
-
-extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI
-cudaSetDoubleForHost(double *d) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(double *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaSetDoubleForHost");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(d);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaLaunchHostFunc(cudaStream_t stream,
-                                                         cudaHostFn_t fn,
-                                                         void *userData) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaStream_t, cudaHostFn_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaLaunchHostFunc");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(stream, fn, userData);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaOccupancyMaxActiveBlocksPerMultiprocessor(int *numBlocks, const void *func,
-                                              int blockSize,
-                                              size_t dynamicSMemSize) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(int *, const void *, int, size_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaOccupancyMaxActiveBlocksPerMultiprocessor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(numBlocks, func, blockSize, dynamicSMemSize);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(int *numBlocks,
-                                                       const void *func,
-                                                       int blockSize,
-                                                       size_t dynamicSMemSize,
-                                                       unsigned int flags) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(int *, const void *, int, size_t, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>(
-      "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(numBlocks, func, blockSize, dynamicSMemSize, flags);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMallocManaged(
-    void **devPtr, size_t size, unsigned int flags __dv(cudaMemAttachGlobal)) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void **, size_t, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMallocManaged");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(devPtr, size, flags);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaMalloc(void **devPtr, size_t size) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void **, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMalloc");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(devPtr, size);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMallocHost(void **ptr, size_t size) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void **, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMallocHost");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(ptr, size);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMallocPitch(void **devPtr,
-                                                      size_t *pitch,
-                                                      size_t width,
-                                                      size_t height) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void **, size_t *, size_t, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMallocPitch");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(devPtr, pitch, width, height);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMallocArray(
-    cudaArray_t *array, const struct cudaChannelFormatDesc *desc, size_t width,
-    size_t height __dv(0), unsigned int flags __dv(0)) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaArray_t *,
-                                           const struct cudaChannelFormatDesc *,
-                                           size_t, size_t, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMallocArray");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(array, desc, width, height, flags);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaFree(void *devPtr) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaFree");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(devPtr);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaFreeHost(void *ptr) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaFreeHost");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(ptr);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaFreeArray(cudaArray_t array) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaArray_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaFreeArray");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(array);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaFreeMipmappedArray(cudaMipmappedArray_t mipmappedArray) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaMipmappedArray_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaFreeMipmappedArray");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(mipmappedArray);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaHostAlloc(void **pHost, size_t size,
-                                                    unsigned int flags) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void **, size_t, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaHostAlloc");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pHost, size, flags);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaHostRegister(void *ptr, size_t size,
-                                                       unsigned int flags) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void *, size_t, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaHostRegister");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(ptr, size, flags);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaHostUnregister(void *ptr) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaHostUnregister");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(ptr);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaHostGetDevicePointer(void **pDevice, void *pHost, unsigned int flags) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void **, void *, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaHostGetDevicePointer");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pDevice, pHost, flags);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaHostGetFlags(unsigned int *pFlags,
-                                                       void *pHost) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(unsigned int *, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaHostGetFlags");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pFlags, pHost);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaMalloc3D(struct cudaPitchedPtr *pitchedDevPtr, struct cudaExtent extent) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(struct cudaPitchedPtr *, struct cudaExtent);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMalloc3D");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pitchedDevPtr, extent);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaMalloc3DArray(cudaArray_t *array, const struct cudaChannelFormatDesc *desc,
-                  struct cudaExtent extent, unsigned int flags __dv(0)) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaArray_t *,
-                                           const struct cudaChannelFormatDesc *,
-                                           struct cudaExtent, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMalloc3DArray");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(array, desc, extent, flags);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMallocMipmappedArray(
-    cudaMipmappedArray_t *mipmappedArray,
-    const struct cudaChannelFormatDesc *desc, struct cudaExtent extent,
-    unsigned int numLevels, unsigned int flags __dv(0)) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(
-      cudaMipmappedArray_t *, const struct cudaChannelFormatDesc *,
-      struct cudaExtent, unsigned int, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMallocMipmappedArray");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(mipmappedArray, desc, extent, numLevels, flags);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGetMipmappedArrayLevel(
-    cudaArray_t *levelArray, cudaMipmappedArray_const_t mipmappedArray,
-    unsigned int level) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(
-      cudaArray_t *, cudaMipmappedArray_const_t, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGetMipmappedArrayLevel");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(levelArray, mipmappedArray, level);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaMemcpy3D(const struct cudaMemcpy3DParms *p) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(const struct cudaMemcpy3DParms *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpy3D");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(p);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaMemcpy3DPeer(const struct cudaMemcpy3DPeerParms *p) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(const struct cudaMemcpy3DPeerParms *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpy3DPeer");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(p);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpy3DAsync(
-    const struct cudaMemcpy3DParms *p, cudaStream_t stream __dv(0)) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(const struct cudaMemcpy3DParms *, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpy3DAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(p, stream);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMemcpy3DPeerAsync(
-    const struct cudaMemcpy3DPeerParms *p, cudaStream_t stream __dv(0)) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(const struct cudaMemcpy3DPeerParms *,
-                                           cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpy3DPeerAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(p, stream);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMemGetInfo(size_t *free,
-                                                     size_t *total) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(size_t *, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemGetInfo");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(free, total);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaArrayGetInfo(struct cudaChannelFormatDesc *desc, struct cudaExtent *extent,
-                 unsigned int *flags, cudaArray_t array) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(struct cudaChannelFormatDesc *,
-                                           struct cudaExtent *, unsigned int *,
-                                           cudaArray_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaArrayGetInfo");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(desc, extent, flags, array);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMemcpy(void *dst, const void *src,
-                                                 size_t count,
-                                                 enum cudaMemcpyKind kind) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void *, const void *, size_t,
-                                           enum cudaMemcpyKind);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpy");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dst, src, count, kind);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMemcpyPeer(void *dst, int dstDevice,
-                                                     const void *src,
-                                                     int srcDevice,
-                                                     size_t count) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(void *, int, const void *, int, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpyPeer");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dst, dstDevice, src, srcDevice, count);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMemcpy2D(void *dst, size_t dpitch,
-                                                   const void *src,
-                                                   size_t spitch, size_t width,
-                                                   size_t height,
-                                                   enum cudaMemcpyKind kind) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void *, size_t, const void *, size_t,
-                                           size_t, size_t, enum cudaMemcpyKind);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpy2D");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dst, dpitch, src, spitch, width, height, kind);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMemcpy2DToArray(
-    cudaArray_t dst, size_t wOffset, size_t hOffset, const void *src,
-    size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaArray_t, size_t, size_t, const void *,
-                               size_t, size_t, size_t, enum cudaMemcpyKind);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpy2DToArray");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dst, wOffset, hOffset, src, spitch, width, height, kind);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMemcpy2DFromArray(
-    void *dst, size_t dpitch, cudaArray_const_t src, size_t wOffset,
-    size_t hOffset, size_t width, size_t height, enum cudaMemcpyKind kind) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(void *, size_t, cudaArray_const_t, size_t,
-                               size_t, size_t, size_t, enum cudaMemcpyKind);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpy2DFromArray");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dst, dpitch, src, wOffset, hOffset, width, height, kind);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMemcpy2DArrayToArray(
-    cudaArray_t dst, size_t wOffsetDst, size_t hOffsetDst,
-    cudaArray_const_t src, size_t wOffsetSrc, size_t hOffsetSrc, size_t width,
-    size_t height, enum cudaMemcpyKind kind __dv(cudaMemcpyDeviceToDevice)) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaArray_t, size_t, size_t,
-                                           cudaArray_const_t, size_t, size_t,
-                                           size_t, size_t, enum cudaMemcpyKind);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpy2DArrayToArray");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dst, wOffsetDst, hOffsetDst, src, wOffsetSrc, hOffsetSrc,
-                  width, height, kind);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMemcpyToSymbol(
-    const void *symbol, const void *src, size_t count, size_t offset __dv(0),
-    enum cudaMemcpyKind kind __dv(cudaMemcpyHostToDevice)) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(const void *, const void *, size_t,
-                                           size_t, enum cudaMemcpyKind);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpyToSymbol");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(symbol, src, count, offset, kind);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMemcpyFromSymbol(
-    void *dst, const void *symbol, size_t count, size_t offset __dv(0),
-    enum cudaMemcpyKind kind __dv(cudaMemcpyDeviceToHost)) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void *, const void *, size_t, size_t,
-                                           enum cudaMemcpyKind);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpyFromSymbol");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dst, symbol, count, offset, kind);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaMemcpyAsync(void *dst, const void *src, size_t count,
-                enum cudaMemcpyKind kind, cudaStream_t stream __dv(0)) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void *, const void *, size_t,
-                                           enum cudaMemcpyKind, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpyAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dst, src, count, kind, stream);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaMemcpyPeerAsync(void *dst, int dstDevice, const void *src, int srcDevice,
-                    size_t count, cudaStream_t stream __dv(0)) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void *, int, const void *, int,
-                                           size_t, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpyPeerAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dst, dstDevice, src, srcDevice, count, stream);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpy2DAsync(
-    void *dst, size_t dpitch, const void *src, size_t spitch, size_t width,
-    size_t height, enum cudaMemcpyKind kind, cudaStream_t stream __dv(0)) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(void *, size_t, const void *, size_t, size_t,
-                               size_t, enum cudaMemcpyKind, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpy2DAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dst, dpitch, src, spitch, width, height, kind, stream);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMemcpy2DToArrayAsync(
-    cudaArray_t dst, size_t wOffset, size_t hOffset, const void *src,
-    size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind,
-    cudaStream_t stream __dv(0)) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaArray_t, size_t, size_t,
-                                           const void *, size_t, size_t, size_t,
-                                           enum cudaMemcpyKind, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpy2DToArrayAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dst, wOffset, hOffset, src, spitch, width, height, kind,
-                  stream);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMemcpy2DFromArrayAsync(
-    void *dst, size_t dpitch, cudaArray_const_t src, size_t wOffset,
-    size_t hOffset, size_t width, size_t height, enum cudaMemcpyKind kind,
-    cudaStream_t stream __dv(0)) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void *, size_t, cudaArray_const_t,
-                                           size_t, size_t, size_t, size_t,
-                                           enum cudaMemcpyKind, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpy2DFromArrayAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dst, dpitch, src, wOffset, hOffset, width, height, kind,
-                  stream);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMemcpyToSymbolAsync(
-    const void *symbol, const void *src, size_t count, size_t offset,
-    enum cudaMemcpyKind kind, cudaStream_t stream __dv(0)) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(const void *, const void *, size_t, size_t,
-                               enum cudaMemcpyKind, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpyToSymbolAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(symbol, src, count, offset, kind, stream);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMemcpyFromSymbolAsync(
-    void *dst, const void *symbol, size_t count, size_t offset,
-    enum cudaMemcpyKind kind, cudaStream_t stream __dv(0)) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void *, const void *, size_t, size_t,
-                                           enum cudaMemcpyKind, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpyFromSymbolAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dst, symbol, count, offset, kind, stream);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMemset(void *devPtr, int value,
-                                                 size_t count) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void *, int, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemset");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(devPtr, value, count);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMemset2D(void *devPtr, size_t pitch,
-                                                   int value, size_t width,
-                                                   size_t height) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void *, size_t, int, size_t, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemset2D");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(devPtr, pitch, value, width, height);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMemset3D(
-    struct cudaPitchedPtr pitchedDevPtr, int value, struct cudaExtent extent) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(struct cudaPitchedPtr, int, struct cudaExtent);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemset3D");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pitchedDevPtr, value, extent);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemsetAsync(
-    void *devPtr, int value, size_t count, cudaStream_t stream __dv(0)) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void *, int, size_t, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemsetAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(devPtr, value, count, stream);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaMemset2DAsync(void *devPtr, size_t pitch, int value, size_t width,
-                  size_t height, cudaStream_t stream __dv(0)) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void *, size_t, int, size_t, size_t,
-                                           cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemset2DAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(devPtr, pitch, value, width, height, stream);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaMemset3DAsync(struct cudaPitchedPtr pitchedDevPtr, int value,
-                  struct cudaExtent extent, cudaStream_t stream __dv(0)) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(struct cudaPitchedPtr, int,
-                                           struct cudaExtent, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemset3DAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pitchedDevPtr, value, extent, stream);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGetSymbolAddress(void **devPtr,
-                                                           const void *symbol) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void **, const void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGetSymbolAddress");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(devPtr, symbol);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGetSymbolSize(size_t *size,
-                                                        const void *symbol) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(size_t *, const void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGetSymbolSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(size, symbol);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaMemPrefetchAsync(const void *devPtr, size_t count, int dstDevice,
-                     cudaStream_t stream __dv(0)) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(const void *, size_t, int, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemPrefetchAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(devPtr, count, dstDevice, stream);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaMemAdvise(const void *devPtr, size_t count, enum cudaMemoryAdvise advice,
-              int device) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(const void *, size_t,
-                                           enum cudaMemoryAdvise, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemAdvise");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(devPtr, count, advice, device);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMemRangeGetAttribute(
-    void *data, size_t dataSize, enum cudaMemRangeAttribute attribute,
-    const void *devPtr, size_t count) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(
-      void *, size_t, enum cudaMemRangeAttribute, const void *, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemRangeGetAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(data, dataSize, attribute, devPtr, count);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMemRangeGetAttributes(
-    void **data, size_t *dataSizes, enum cudaMemRangeAttribute *attributes,
-    size_t numAttributes, const void *devPtr, size_t count) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(void **, size_t *, enum cudaMemRangeAttribute *,
-                               size_t, const void *, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemRangeGetAttributes");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(data, dataSizes, attributes, numAttributes, devPtr, count);
-}
-
-extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI
-cudaMemcpyToArray(cudaArray_t dst, size_t wOffset, size_t hOffset,
-                  const void *src, size_t count, enum cudaMemcpyKind kind) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(
-      cudaArray_t, size_t, size_t, const void *, size_t, enum cudaMemcpyKind);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpyToArray");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dst, wOffset, hOffset, src, count, kind);
-}
-
-extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI
-cudaMemcpyFromArray(void *dst, cudaArray_const_t src, size_t wOffset,
-                    size_t hOffset, size_t count, enum cudaMemcpyKind kind) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void *, cudaArray_const_t, size_t,
-                                           size_t, size_t, enum cudaMemcpyKind);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpyFromArray");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dst, src, wOffset, hOffset, count, kind);
-}
-
-extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaMemcpyArrayToArray(
-    cudaArray_t dst, size_t wOffsetDst, size_t hOffsetDst,
-    cudaArray_const_t src, size_t wOffsetSrc, size_t hOffsetSrc, size_t count,
-    enum cudaMemcpyKind kind __dv(cudaMemcpyDeviceToDevice)) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaArray_t, size_t, size_t, cudaArray_const_t,
-                               size_t, size_t, size_t, enum cudaMemcpyKind);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpyArrayToArray");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dst, wOffsetDst, hOffsetDst, src, wOffsetSrc, hOffsetSrc,
-                  count, kind);
-}
-
-extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaMemcpyToArrayAsync(
-    cudaArray_t dst, size_t wOffset, size_t hOffset, const void *src,
-    size_t count, enum cudaMemcpyKind kind, cudaStream_t stream __dv(0)) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaArray_t, size_t, size_t, const void *,
-                               size_t, enum cudaMemcpyKind, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpyToArrayAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dst, wOffset, hOffset, src, count, kind, stream);
-}
-
-extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI
-cudaMemcpyFromArrayAsync(void *dst, cudaArray_const_t src, size_t wOffset,
-                         size_t hOffset, size_t count, enum cudaMemcpyKind kind,
-                         cudaStream_t stream __dv(0)) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(void *, cudaArray_const_t, size_t, size_t,
-                               size_t, enum cudaMemcpyKind, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpyFromArrayAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dst, src, wOffset, hOffset, count, kind, stream);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaPointerGetAttributes(
-    struct cudaPointerAttributes *attributes, const void *ptr) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(struct cudaPointerAttributes *, const void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaPointerGetAttributes");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(attributes, ptr);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaDeviceCanAccessPeer(int *canAccessPeer, int device, int peerDevice) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(int *, int, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaDeviceCanAccessPeer");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(canAccessPeer, device, peerDevice);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaDeviceEnablePeerAccess(int peerDevice, unsigned int flags) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(int, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaDeviceEnablePeerAccess");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(peerDevice, flags);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaDeviceDisablePeerAccess(int peerDevice) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaDeviceDisablePeerAccess");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(peerDevice);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaGraphicsUnregisterResource(cudaGraphicsResource_t resource) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraphicsResource_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphicsUnregisterResource");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(resource);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphicsResourceSetMapFlags(
-    cudaGraphicsResource_t resource, unsigned int flags) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaGraphicsResource_t, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphicsResourceSetMapFlags");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(resource, flags);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphicsMapResources(
-    int count, cudaGraphicsResource_t *resources, cudaStream_t stream __dv(0)) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(int, cudaGraphicsResource_t *, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphicsMapResources");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(count, resources, stream);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphicsUnmapResources(
-    int count, cudaGraphicsResource_t *resources, cudaStream_t stream __dv(0)) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(int, cudaGraphicsResource_t *, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphicsUnmapResources");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(count, resources, stream);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphicsResourceGetMappedPointer(
-    void **devPtr, size_t *size, cudaGraphicsResource_t resource) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(void **, size_t *, cudaGraphicsResource_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaGraphicsResourceGetMappedPointer");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(devPtr, size, resource);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphicsSubResourceGetMappedArray(
-    cudaArray_t *array, cudaGraphicsResource_t resource,
-    unsigned int arrayIndex, unsigned int mipLevel) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(
-      cudaArray_t *, cudaGraphicsResource_t, unsigned int, unsigned int);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaGraphicsSubResourceGetMappedArray");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(array, resource, arrayIndex, mipLevel);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaGraphicsResourceGetMappedMipmappedArray(
-    cudaMipmappedArray_t *mipmappedArray, cudaGraphicsResource_t resource) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaMipmappedArray_t *, cudaGraphicsResource_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaGraphicsResourceGetMappedMipmappedArray");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(mipmappedArray, resource);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaBindTexture(
-    size_t *offset, const struct textureReference *texref, const void *devPtr,
-    const struct cudaChannelFormatDesc *desc, size_t size __dv(UINT_MAX)) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(
-      size_t *, const struct textureReference *, const void *,
-      const struct cudaChannelFormatDesc *, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaBindTexture");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(offset, texref, devPtr, desc, size);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaBindTexture2D(size_t *offset, const struct textureReference *texref,
-                  const void *devPtr, const struct cudaChannelFormatDesc *desc,
-                  size_t width, size_t height, size_t pitch) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(
-      size_t *, const struct textureReference *, const void *,
-      const struct cudaChannelFormatDesc *, size_t, size_t, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaBindTexture2D");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(offset, texref, devPtr, desc, width, height, pitch);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaBindTextureToArray(
-    const struct textureReference *texref, cudaArray_const_t array,
-    const struct cudaChannelFormatDesc *desc) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(
-      const struct textureReference *, cudaArray_const_t,
-      const struct cudaChannelFormatDesc *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaBindTextureToArray");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(texref, array, desc);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaBindTextureToMipmappedArray(const struct textureReference *texref,
-                                cudaMipmappedArray_const_t mipmappedArray,
-                                const struct cudaChannelFormatDesc *desc) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(
-      const struct textureReference *, cudaMipmappedArray_const_t,
-      const struct cudaChannelFormatDesc *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaBindTextureToMipmappedArray");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(texref, mipmappedArray, desc);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaUnbindTexture(const struct textureReference *texref) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(const struct textureReference *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaUnbindTexture");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(texref);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGetTextureAlignmentOffset(
-    size_t *offset, const struct textureReference *texref) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(size_t *, const struct textureReference *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGetTextureAlignmentOffset");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(offset, texref);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGetTextureReference(
-    const struct textureReference **texref, const void *symbol) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(const struct textureReference **, const void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGetTextureReference");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(texref, symbol);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaBindSurfaceToArray(
-    const struct surfaceReference *surfref, cudaArray_const_t array,
-    const struct cudaChannelFormatDesc *desc) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(
-      const struct surfaceReference *, cudaArray_const_t,
-      const struct cudaChannelFormatDesc *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaBindSurfaceToArray");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(surfref, array, desc);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGetSurfaceReference(
-    const struct surfaceReference **surfref, const void *symbol) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(const struct surfaceReference **, const void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGetSurfaceReference");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(surfref, symbol);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGetChannelDesc(
-    struct cudaChannelFormatDesc *desc, cudaArray_const_t array) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(struct cudaChannelFormatDesc *,
-                                           cudaArray_const_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGetChannelDesc");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(desc, array);
-}
-
-extern __host__ struct cudaChannelFormatDesc CUDARTAPI cudaCreateChannelDesc(
-    int x, int y, int z, int w, enum cudaChannelFormatKind f) {
-  using FuncPtr = struct cudaChannelFormatDesc(CUDARTAPI *)(
-      int, int, int, int, enum cudaChannelFormatKind);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaCreateChannelDesc");
-  if (!func_ptr) {
-    return cudaChannelFormatDesc{cudaChannelFormatKind(-1), 0, 0, 0};
-  }
-  return func_ptr(x, y, z, w, f);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaCreateTextureObject(
-    cudaTextureObject_t *pTexObject, const struct cudaResourceDesc *pResDesc,
-    const struct cudaTextureDesc *pTexDesc,
-    const struct cudaResourceViewDesc *pResViewDesc) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(
-      cudaTextureObject_t *, const struct cudaResourceDesc *,
-      const struct cudaTextureDesc *, const struct cudaResourceViewDesc *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaCreateTextureObject");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pTexObject, pResDesc, pTexDesc, pResViewDesc);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaDestroyTextureObject(cudaTextureObject_t texObject) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaTextureObject_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaDestroyTextureObject");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(texObject);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGetTextureObjectResourceDesc(
-    struct cudaResourceDesc *pResDesc, cudaTextureObject_t texObject) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(struct cudaResourceDesc *, cudaTextureObject_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaGetTextureObjectResourceDesc");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pResDesc, texObject);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGetTextureObjectTextureDesc(
-    struct cudaTextureDesc *pTexDesc, cudaTextureObject_t texObject) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(struct cudaTextureDesc *, cudaTextureObject_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGetTextureObjectTextureDesc");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pTexDesc, texObject);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGetTextureObjectResourceViewDesc(
-    struct cudaResourceViewDesc *pResViewDesc, cudaTextureObject_t texObject) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(struct cudaResourceViewDesc *,
-                                           cudaTextureObject_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaGetTextureObjectResourceViewDesc");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pResViewDesc, texObject);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaCreateSurfaceObject(
-    cudaSurfaceObject_t *pSurfObject, const struct cudaResourceDesc *pResDesc) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaSurfaceObject_t *,
-                                           const struct cudaResourceDesc *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaCreateSurfaceObject");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pSurfObject, pResDesc);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaDestroySurfaceObject(cudaSurfaceObject_t surfObject) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaSurfaceObject_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaDestroySurfaceObject");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(surfObject);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGetSurfaceObjectResourceDesc(
-    struct cudaResourceDesc *pResDesc, cudaSurfaceObject_t surfObject) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(struct cudaResourceDesc *, cudaSurfaceObject_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaGetSurfaceObjectResourceDesc");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pResDesc, surfObject);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaDriverGetVersion(int *driverVersion) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaDriverGetVersion");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(driverVersion);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaRuntimeGetVersion(int *runtimeVersion) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaRuntimeGetVersion");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(runtimeVersion);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphCreate(cudaGraph_t *pGraph,
-                                                      unsigned int flags) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraph_t *, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphCreate");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pGraph, flags);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphAddKernelNode(
-    cudaGraphNode_t *pGraphNode, cudaGraph_t graph,
-    const cudaGraphNode_t *pDependencies, size_t numDependencies,
-    const struct cudaKernelNodeParams *pNodeParams) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraphNode_t *, cudaGraph_t,
-                                           const cudaGraphNode_t *, size_t,
-                                           const struct cudaKernelNodeParams *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphAddKernelNode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pGraphNode, graph, pDependencies, numDependencies,
-                  pNodeParams);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphKernelNodeGetParams(
-    cudaGraphNode_t node, struct cudaKernelNodeParams *pNodeParams) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaGraphNode_t, struct cudaKernelNodeParams *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphKernelNodeGetParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(node, pNodeParams);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphKernelNodeSetParams(
-    cudaGraphNode_t node, const struct cudaKernelNodeParams *pNodeParams) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraphNode_t,
-                                           const struct cudaKernelNodeParams *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphKernelNodeSetParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(node, pNodeParams);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphAddMemcpyNode(
-    cudaGraphNode_t *pGraphNode, cudaGraph_t graph,
-    const cudaGraphNode_t *pDependencies, size_t numDependencies,
-    const struct cudaMemcpy3DParms *pCopyParams) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraphNode_t *, cudaGraph_t,
-                                           const cudaGraphNode_t *, size_t,
-                                           const struct cudaMemcpy3DParms *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphAddMemcpyNode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pGraphNode, graph, pDependencies, numDependencies,
-                  pCopyParams);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphMemcpyNodeGetParams(
-    cudaGraphNode_t node, struct cudaMemcpy3DParms *pNodeParams) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaGraphNode_t, struct cudaMemcpy3DParms *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphMemcpyNodeGetParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(node, pNodeParams);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphMemcpyNodeSetParams(
-    cudaGraphNode_t node, const struct cudaMemcpy3DParms *pNodeParams) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraphNode_t,
-                                           const struct cudaMemcpy3DParms *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphMemcpyNodeSetParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(node, pNodeParams);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphAddMemsetNode(
-    cudaGraphNode_t *pGraphNode, cudaGraph_t graph,
-    const cudaGraphNode_t *pDependencies, size_t numDependencies,
-    const struct cudaMemsetParams *pMemsetParams) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraphNode_t *, cudaGraph_t,
-                                           const cudaGraphNode_t *, size_t,
-                                           const struct cudaMemsetParams *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphAddMemsetNode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pGraphNode, graph, pDependencies, numDependencies,
-                  pMemsetParams);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphMemsetNodeGetParams(
-    cudaGraphNode_t node, struct cudaMemsetParams *pNodeParams) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaGraphNode_t, struct cudaMemsetParams *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphMemsetNodeGetParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(node, pNodeParams);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphMemsetNodeSetParams(
-    cudaGraphNode_t node, const struct cudaMemsetParams *pNodeParams) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraphNode_t,
-                                           const struct cudaMemsetParams *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphMemsetNodeSetParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(node, pNodeParams);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphAddHostNode(
-    cudaGraphNode_t *pGraphNode, cudaGraph_t graph,
-    const cudaGraphNode_t *pDependencies, size_t numDependencies,
-    const struct cudaHostNodeParams *pNodeParams) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraphNode_t *, cudaGraph_t,
-                                           const cudaGraphNode_t *, size_t,
-                                           const struct cudaHostNodeParams *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphAddHostNode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pGraphNode, graph, pDependencies, numDependencies,
-                  pNodeParams);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphHostNodeGetParams(
-    cudaGraphNode_t node, struct cudaHostNodeParams *pNodeParams) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaGraphNode_t, struct cudaHostNodeParams *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphHostNodeGetParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(node, pNodeParams);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphHostNodeSetParams(
-    cudaGraphNode_t node, const struct cudaHostNodeParams *pNodeParams) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraphNode_t,
-                                           const struct cudaHostNodeParams *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphHostNodeSetParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(node, pNodeParams);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaGraphAddChildGraphNode(cudaGraphNode_t *pGraphNode, cudaGraph_t graph,
-                           const cudaGraphNode_t *pDependencies,
-                           size_t numDependencies, cudaGraph_t childGraph) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaGraphNode_t *, cudaGraph_t,
-                               const cudaGraphNode_t *, size_t, cudaGraph_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphAddChildGraphNode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pGraphNode, graph, pDependencies, numDependencies,
-                  childGraph);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaGraphChildGraphNodeGetGraph(cudaGraphNode_t node, cudaGraph_t *pGraph) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraphNode_t, cudaGraph_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphChildGraphNodeGetGraph");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(node, pGraph);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphAddEmptyNode(
-    cudaGraphNode_t *pGraphNode, cudaGraph_t graph,
-    const cudaGraphNode_t *pDependencies, size_t numDependencies) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraphNode_t *, cudaGraph_t,
-                                           const cudaGraphNode_t *, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphAddEmptyNode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pGraphNode, graph, pDependencies, numDependencies);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaGraphClone(cudaGraph_t *pGraphClone, cudaGraph_t originalGraph) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraph_t *, cudaGraph_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphClone");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pGraphClone, originalGraph);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaGraphNodeFindInClone(cudaGraphNode_t *pNode, cudaGraphNode_t originalNode,
-                         cudaGraph_t clonedGraph) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaGraphNode_t *, cudaGraphNode_t, cudaGraph_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphNodeFindInClone");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pNode, originalNode, clonedGraph);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaGraphNodeGetType(cudaGraphNode_t node, enum cudaGraphNodeType *pType) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaGraphNode_t, enum cudaGraphNodeType *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphNodeGetType");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(node, pType);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphGetNodes(cudaGraph_t graph,
-                                                        cudaGraphNode_t *nodes,
-                                                        size_t *numNodes) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaGraph_t, cudaGraphNode_t *, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphGetNodes");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(graph, nodes, numNodes);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphGetRootNodes(
-    cudaGraph_t graph, cudaGraphNode_t *pRootNodes, size_t *pNumRootNodes) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaGraph_t, cudaGraphNode_t *, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphGetRootNodes");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(graph, pRootNodes, pNumRootNodes);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphGetEdges(cudaGraph_t graph,
-                                                        cudaGraphNode_t *from,
-                                                        cudaGraphNode_t *to,
-                                                        size_t *numEdges) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraph_t, cudaGraphNode_t *,
-                                           cudaGraphNode_t *, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphGetEdges");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(graph, from, to, numEdges);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphNodeGetDependencies(
-    cudaGraphNode_t node, cudaGraphNode_t *pDependencies,
-    size_t *pNumDependencies) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaGraphNode_t, cudaGraphNode_t *, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphNodeGetDependencies");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(node, pDependencies, pNumDependencies);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphNodeGetDependentNodes(
-    cudaGraphNode_t node, cudaGraphNode_t *pDependentNodes,
-    size_t *pNumDependentNodes) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaGraphNode_t, cudaGraphNode_t *, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphNodeGetDependentNodes");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(node, pDependentNodes, pNumDependentNodes);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaGraphAddDependencies(cudaGraph_t graph, const cudaGraphNode_t *from,
-                         const cudaGraphNode_t *to, size_t numDependencies) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraph_t, const cudaGraphNode_t *,
-                                           const cudaGraphNode_t *, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphAddDependencies");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(graph, from, to, numDependencies);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaGraphRemoveDependencies(cudaGraph_t graph, const cudaGraphNode_t *from,
-                            const cudaGraphNode_t *to, size_t numDependencies) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraph_t, const cudaGraphNode_t *,
-                                           const cudaGraphNode_t *, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphRemoveDependencies");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(graph, from, to, numDependencies);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaGraphDestroyNode(cudaGraphNode_t node) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraphNode_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphDestroyNode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(node);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphInstantiate(
-    cudaGraphExec_t *pGraphExec, cudaGraph_t graph, cudaGraphNode_t *pErrorNode,
-    char *pLogBuffer, size_t bufferSize) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraphExec_t *, cudaGraph_t,
-                                           cudaGraphNode_t *, char *, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphInstantiate");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pGraphExec, graph, pErrorNode, pLogBuffer, bufferSize);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphExecKernelNodeSetParams(
-    cudaGraphExec_t hGraphExec, cudaGraphNode_t node,
-    const struct cudaKernelNodeParams *pNodeParams) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraphExec_t, cudaGraphNode_t,
-                                           const struct cudaKernelNodeParams *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaGraphExecKernelNodeSetParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hGraphExec, node, pNodeParams);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphExecMemcpyNodeSetParams(
-    cudaGraphExec_t hGraphExec, cudaGraphNode_t node,
-    const struct cudaMemcpy3DParms *pNodeParams) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraphExec_t, cudaGraphNode_t,
-                                           const struct cudaMemcpy3DParms *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaGraphExecMemcpyNodeSetParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hGraphExec, node, pNodeParams);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphExecMemsetNodeSetParams(
-    cudaGraphExec_t hGraphExec, cudaGraphNode_t node,
-    const struct cudaMemsetParams *pNodeParams) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraphExec_t, cudaGraphNode_t,
-                                           const struct cudaMemsetParams *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaGraphExecMemsetNodeSetParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hGraphExec, node, pNodeParams);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaGraphExecHostNodeSetParams(cudaGraphExec_t hGraphExec, cudaGraphNode_t node,
-                               const struct cudaHostNodeParams *pNodeParams) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraphExec_t, cudaGraphNode_t,
-                                           const struct cudaHostNodeParams *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphExecHostNodeSetParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hGraphExec, node, pNodeParams);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaGraphExecUpdate(cudaGraphExec_t hGraphExec, cudaGraph_t hGraph,
-                    cudaGraphNode_t *hErrorNode_out,
-                    enum cudaGraphExecUpdateResult *updateResult_out) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaGraphExec_t, cudaGraph_t, cudaGraphNode_t *,
-                               enum cudaGraphExecUpdateResult *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphExecUpdate");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hGraphExec, hGraph, hErrorNode_out, updateResult_out);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphLaunch(cudaGraphExec_t graphExec,
-                                                      cudaStream_t stream) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraphExec_t, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphLaunch");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(graphExec, stream);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaGraphExecDestroy(cudaGraphExec_t graphExec) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraphExec_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphExecDestroy");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(graphExec);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphDestroy(cudaGraph_t graph) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraph_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphDestroy");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(graph);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGetExportTable(
-    const void **ppExportTable, const cudaUUID_t *pExportTableId) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(const void **, const cudaUUID_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGetExportTable");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(ppExportTable, pExportTableId);
-}
-
-}  // extern "C"
diff --git a/third_party/xla/third_party/tsl/tsl/cuda/cuda_runtime_11_0.inc b/third_party/xla/third_party/tsl/tsl/cuda/cuda_runtime_11_0.inc
deleted file mode 100644
index c9cd0a37697a4e..00000000000000
--- a/third_party/xla/third_party/tsl/tsl/cuda/cuda_runtime_11_0.inc
+++ /dev/null
@@ -1,2639 +0,0 @@
-// Auto-generated, do not edit.
-
-extern "C" {
-
-extern __host__ cudaError_t CUDARTAPI cudaDeviceReset(void) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)();
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaDeviceReset");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr();
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaDeviceSynchronize(void) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)();
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaDeviceSynchronize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr();
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaDeviceSetLimit(enum cudaLimit limit,
-                                                         size_t value) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(enum cudaLimit, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaDeviceSetLimit");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(limit, value);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaDeviceGetLimit(size_t *pValue, enum cudaLimit limit) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(size_t *, enum cudaLimit);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaDeviceGetLimit");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pValue, limit);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaDeviceGetTexture1DLinearMaxWidth(
-    size_t *maxWidthInElements, const struct cudaChannelFormatDesc *fmtDesc,
-    int device) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(
-      size_t *, const struct cudaChannelFormatDesc *, int);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaDeviceGetTexture1DLinearMaxWidth");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(maxWidthInElements, fmtDesc, device);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaDeviceGetCacheConfig(enum cudaFuncCache *pCacheConfig) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(enum cudaFuncCache *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaDeviceGetCacheConfig");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pCacheConfig);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaDeviceGetStreamPriorityRange(int *leastPriority, int *greatestPriority) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(int *, int *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaDeviceGetStreamPriorityRange");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(leastPriority, greatestPriority);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaDeviceSetCacheConfig(enum cudaFuncCache cacheConfig) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(enum cudaFuncCache);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaDeviceSetCacheConfig");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(cacheConfig);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaDeviceGetSharedMemConfig(enum cudaSharedMemConfig *pConfig) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(enum cudaSharedMemConfig *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaDeviceGetSharedMemConfig");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pConfig);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaDeviceSetSharedMemConfig(enum cudaSharedMemConfig config) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(enum cudaSharedMemConfig);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaDeviceSetSharedMemConfig");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(config);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaDeviceGetByPCIBusId(int *device, const char *pciBusId) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(int *, const char *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaDeviceGetByPCIBusId");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(device, pciBusId);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaDeviceGetPCIBusId(char *pciBusId,
-                                                            int len,
-                                                            int device) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(char *, int, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaDeviceGetPCIBusId");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pciBusId, len, device);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaIpcGetEventHandle(cudaIpcEventHandle_t *handle, cudaEvent_t event) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaIpcEventHandle_t *, cudaEvent_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaIpcGetEventHandle");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, event);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaIpcOpenEventHandle(cudaEvent_t *event, cudaIpcEventHandle_t handle) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaEvent_t *, cudaIpcEventHandle_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaIpcOpenEventHandle");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(event, handle);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaIpcGetMemHandle(cudaIpcMemHandle_t *handle, void *devPtr) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaIpcMemHandle_t *, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaIpcGetMemHandle");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, devPtr);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaIpcOpenMemHandle(
-    void **devPtr, cudaIpcMemHandle_t handle, unsigned int flags) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(void **, cudaIpcMemHandle_t, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaIpcOpenMemHandle");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(devPtr, handle, flags);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaIpcCloseMemHandle(void *devPtr) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaIpcCloseMemHandle");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(devPtr);
-}
-
-#if CUDA_VERSION >= 11030
-
-extern __host__ cudaError_t CUDARTAPI cudaDeviceFlushGPUDirectRDMAWrites(
-    enum cudaFlushGPUDirectRDMAWritesTarget target,
-    enum cudaFlushGPUDirectRDMAWritesScope scope) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(enum cudaFlushGPUDirectRDMAWritesTarget,
-                               enum cudaFlushGPUDirectRDMAWritesScope);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaDeviceFlushGPUDirectRDMAWrites");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(target, scope);
-}
-
-#endif  // CUDA_VERSION >= 11030
-
-extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaThreadExit(void) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)();
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaThreadExit");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr();
-}
-
-extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI
-cudaThreadSynchronize(void) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)();
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaThreadSynchronize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr();
-}
-
-extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI
-cudaThreadSetLimit(enum cudaLimit limit, size_t value) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(enum cudaLimit, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaThreadSetLimit");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(limit, value);
-}
-
-extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI
-cudaThreadGetLimit(size_t *pValue, enum cudaLimit limit) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(size_t *, enum cudaLimit);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaThreadGetLimit");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pValue, limit);
-}
-
-extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI
-cudaThreadGetCacheConfig(enum cudaFuncCache *pCacheConfig) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(enum cudaFuncCache *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaThreadGetCacheConfig");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pCacheConfig);
-}
-
-extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI
-cudaThreadSetCacheConfig(enum cudaFuncCache cacheConfig) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(enum cudaFuncCache);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaThreadSetCacheConfig");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(cacheConfig);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaGetLastError(void) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)();
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGetLastError");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr();
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaPeekAtLastError(void) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)();
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaPeekAtLastError");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr();
-}
-
-extern __host__ __cudart_builtin__ const char *CUDARTAPI
-cudaGetErrorName(cudaError_t error) {
-  using FuncPtr = const char *(CUDARTAPI *)(cudaError_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGetErrorName");
-  if (!func_ptr) return "cudaGetErrorName symbol not found.";
-  return func_ptr(error);
-}
-
-extern __host__ __cudart_builtin__ const char *CUDARTAPI
-cudaGetErrorString(cudaError_t error) {
-  using FuncPtr = const char *(CUDARTAPI *)(cudaError_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGetErrorString");
-  if (!func_ptr) return "cudaGetErrorString symbol not found.";
-  return func_ptr(error);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaGetDeviceCount(int *count) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGetDeviceCount");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(count);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaGetDeviceProperties(struct cudaDeviceProp *prop, int device) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(struct cudaDeviceProp *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGetDeviceProperties");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(prop, device);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaDeviceGetAttribute(int *value, enum cudaDeviceAttr attr, int device) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(int *, enum cudaDeviceAttr, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaDeviceGetAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(value, attr, device);
-}
-
-#if CUDA_VERSION >= 11020
-
-extern __host__ cudaError_t CUDARTAPI
-cudaDeviceGetDefaultMemPool(cudaMemPool_t *memPool, int device) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaMemPool_t *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaDeviceGetDefaultMemPool");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(memPool, device);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaDeviceSetMemPool(int device, cudaMemPool_t memPool) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(int, cudaMemPool_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaDeviceSetMemPool");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(device, memPool);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaDeviceGetMemPool(cudaMemPool_t *memPool, int device) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaMemPool_t *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaDeviceGetMemPool");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(memPool, device);
-}
-
-#endif  // CUDA_VERSION  >= 11020
-
-extern __host__ cudaError_t CUDARTAPI cudaDeviceGetNvSciSyncAttributes(
-    void *nvSciSyncAttrList, int device, int flags) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void *, int, int);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaDeviceGetNvSciSyncAttributes");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(nvSciSyncAttrList, device, flags);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaDeviceGetP2PAttribute(int *value, enum cudaDeviceP2PAttr attr,
-                          int srcDevice, int dstDevice) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(int *, enum cudaDeviceP2PAttr, int, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaDeviceGetP2PAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(value, attr, srcDevice, dstDevice);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaChooseDevice(int *device, const struct cudaDeviceProp *prop) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(int *, const struct cudaDeviceProp *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaChooseDevice");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(device, prop);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaSetDevice(int device) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaSetDevice");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(device);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaGetDevice(int *device) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGetDevice");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(device);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaSetValidDevices(int *device_arr,
-                                                          int len) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(int *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaSetValidDevices");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(device_arr, len);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaSetDeviceFlags(unsigned int flags) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaSetDeviceFlags");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(flags);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGetDeviceFlags(unsigned int *flags) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(unsigned int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGetDeviceFlags");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(flags);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaStreamCreate(cudaStream_t *pStream) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaStream_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaStreamCreate");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pStream);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaStreamCreateWithFlags(cudaStream_t *pStream, unsigned int flags) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaStream_t *, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaStreamCreateWithFlags");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pStream, flags);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaStreamCreateWithPriority(cudaStream_t *pStream, unsigned int flags,
-                             int priority) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaStream_t *, unsigned int, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaStreamCreateWithPriority");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pStream, flags, priority);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaStreamGetPriority(cudaStream_t hStream, int *priority) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaStream_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaStreamGetPriority");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hStream, priority);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaStreamGetFlags(cudaStream_t hStream, unsigned int *flags) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaStream_t, unsigned int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaStreamGetFlags");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hStream, flags);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaCtxResetPersistingL2Cache(void) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)();
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaCtxResetPersistingL2Cache");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr();
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaStreamCopyAttributes(cudaStream_t dst, cudaStream_t src) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaStream_t, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaStreamCopyAttributes");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dst, src);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaStreamGetAttribute(cudaStream_t hStream, enum cudaStreamAttrID attr,
-                       union cudaStreamAttrValue *value_out) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaStream_t, enum cudaStreamAttrID,
-                                           union cudaStreamAttrValue *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaStreamGetAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hStream, attr, value_out);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaStreamSetAttribute(cudaStream_t hStream, enum cudaStreamAttrID attr,
-                       const union cudaStreamAttrValue *value) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaStream_t, enum cudaStreamAttrID,
-                                           const union cudaStreamAttrValue *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaStreamSetAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hStream, attr, value);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaStreamDestroy(cudaStream_t stream) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaStreamDestroy");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(stream);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamWaitEvent(
-    cudaStream_t stream, cudaEvent_t event, unsigned int flags __dv(0)) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaStream_t, cudaEvent_t, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaStreamWaitEvent");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(stream, event, flags);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaStreamAddCallback(cudaStream_t stream, cudaStreamCallback_t callback,
-                      void *userData, unsigned int flags) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaStream_t, cudaStreamCallback_t,
-                                           void *, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaStreamAddCallback");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(stream, callback, userData, flags);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaStreamSynchronize(cudaStream_t stream) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaStreamSynchronize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(stream);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaStreamQuery(cudaStream_t stream) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaStreamQuery");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(stream);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaStreamAttachMemAsync(cudaStream_t stream, void *devPtr,
-                         size_t length __dv(0), unsigned int flags) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaStream_t, void *, size_t, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaStreamAttachMemAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(stream, devPtr, length, flags);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaStreamBeginCapture(cudaStream_t stream, enum cudaStreamCaptureMode mode) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaStream_t, enum cudaStreamCaptureMode);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaStreamBeginCapture");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(stream, mode);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaThreadExchangeStreamCaptureMode(enum cudaStreamCaptureMode *mode) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(enum cudaStreamCaptureMode *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaThreadExchangeStreamCaptureMode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(mode);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaStreamEndCapture(cudaStream_t stream, cudaGraph_t *pGraph) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaStream_t, cudaGraph_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaStreamEndCapture");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(stream, pGraph);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaStreamIsCapturing(
-    cudaStream_t stream, enum cudaStreamCaptureStatus *pCaptureStatus) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaStream_t, enum cudaStreamCaptureStatus *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaStreamIsCapturing");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(stream, pCaptureStatus);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaStreamGetCaptureInfo(
-    cudaStream_t stream, enum cudaStreamCaptureStatus *pCaptureStatus,
-    unsigned long long *pId) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(
-      cudaStream_t, enum cudaStreamCaptureStatus *, unsigned long long *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaStreamGetCaptureInfo");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(stream, pCaptureStatus, pId);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaStreamGetCaptureInfo_v2(
-    cudaStream_t stream, enum cudaStreamCaptureStatus *captureStatus_out,
-    unsigned long long *id_out __dv(0), cudaGraph_t *graph_out __dv(0),
-    const cudaGraphNode_t **dependencies_out __dv(0),
-    size_t *numDependencies_out __dv(0)) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(
-      cudaStream_t, enum cudaStreamCaptureStatus *, unsigned long long *,
-      cudaGraph_t *, const cudaGraphNode_t **, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaStreamGetCaptureInfo_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(stream, captureStatus_out, id_out, graph_out,
-                  dependencies_out, numDependencies_out);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaStreamUpdateCaptureDependencies(
-    cudaStream_t stream, cudaGraphNode_t *dependencies, size_t numDependencies,
-    unsigned int flags __dv(0)) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaStream_t, cudaGraphNode_t *,
-                                           size_t, unsigned int);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaStreamUpdateCaptureDependencies");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(stream, dependencies, numDependencies, flags);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaEventCreate(cudaEvent_t *event) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaEvent_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaEventCreate");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(event);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaEventCreateWithFlags(cudaEvent_t *event, unsigned int flags) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaEvent_t *, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaEventCreateWithFlags");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(event, flags);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaEventRecord(cudaEvent_t event, cudaStream_t stream __dv(0)) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaEvent_t, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaEventRecord");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(event, stream);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaEventRecordWithFlags(cudaEvent_t event, cudaStream_t stream __dv(0),
-                         unsigned int flags __dv(0)) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaEvent_t, cudaStream_t, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaEventRecordWithFlags");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(event, stream, flags);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaEventQuery(cudaEvent_t event) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaEvent_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaEventQuery");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(event);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaEventSynchronize(cudaEvent_t event) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaEvent_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaEventSynchronize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(event);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaEventDestroy(cudaEvent_t event) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaEvent_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaEventDestroy");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(event);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaEventElapsedTime(float *ms,
-                                                           cudaEvent_t start,
-                                                           cudaEvent_t end) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(float *, cudaEvent_t, cudaEvent_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaEventElapsedTime");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(ms, start, end);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaImportExternalMemory(
-    cudaExternalMemory_t *extMem_out,
-    const struct cudaExternalMemoryHandleDesc *memHandleDesc) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(
-      cudaExternalMemory_t *, const struct cudaExternalMemoryHandleDesc *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaImportExternalMemory");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(extMem_out, memHandleDesc);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaExternalMemoryGetMappedBuffer(
-    void **devPtr, cudaExternalMemory_t extMem,
-    const struct cudaExternalMemoryBufferDesc *bufferDesc) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(void **, cudaExternalMemory_t,
-                               const struct cudaExternalMemoryBufferDesc *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaExternalMemoryGetMappedBuffer");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(devPtr, extMem, bufferDesc);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaExternalMemoryGetMappedMipmappedArray(
-    cudaMipmappedArray_t *mipmap, cudaExternalMemory_t extMem,
-    const struct cudaExternalMemoryMipmappedArrayDesc *mipmapDesc) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(
-      cudaMipmappedArray_t *, cudaExternalMemory_t,
-      const struct cudaExternalMemoryMipmappedArrayDesc *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaExternalMemoryGetMappedMipmappedArray");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(mipmap, extMem, mipmapDesc);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaDestroyExternalMemory(cudaExternalMemory_t extMem) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaExternalMemory_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaDestroyExternalMemory");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(extMem);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaImportExternalSemaphore(
-    cudaExternalSemaphore_t *extSem_out,
-    const struct cudaExternalSemaphoreHandleDesc *semHandleDesc) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaExternalSemaphore_t *,
-                               const struct cudaExternalSemaphoreHandleDesc *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaImportExternalSemaphore");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(extSem_out, semHandleDesc);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaSignalExternalSemaphoresAsync(
-    const cudaExternalSemaphore_t *extSemArray,
-    const struct cudaExternalSemaphoreSignalParams *paramsArray,
-    unsigned int numExtSems, cudaStream_t stream __dv(0)) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(const cudaExternalSemaphore_t *,
-                               const struct cudaExternalSemaphoreSignalParams *,
-                               unsigned int, cudaStream_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaSignalExternalSemaphoresAsync_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(extSemArray, paramsArray, numExtSems, stream);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaWaitExternalSemaphoresAsync(
-    const cudaExternalSemaphore_t *extSemArray,
-    const struct cudaExternalSemaphoreWaitParams *paramsArray,
-    unsigned int numExtSems, cudaStream_t stream __dv(0)) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(const cudaExternalSemaphore_t *,
-                               const struct cudaExternalSemaphoreWaitParams *,
-                               unsigned int, cudaStream_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaWaitExternalSemaphoresAsync_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(extSemArray, paramsArray, numExtSems, stream);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaDestroyExternalSemaphore(cudaExternalSemaphore_t extSem) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaExternalSemaphore_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaDestroyExternalSemaphore");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(extSem);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim, void **args,
-                 size_t sharedMem, cudaStream_t stream) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(const void *, dim3, dim3, void **,
-                                           size_t, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaLaunchKernel");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(func, gridDim, blockDim, args, sharedMem, stream);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaLaunchCooperativeKernel(
-    const void *func, dim3 gridDim, dim3 blockDim, void **args,
-    size_t sharedMem, cudaStream_t stream) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(const void *, dim3, dim3, void **,
-                                           size_t, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaLaunchCooperativeKernel");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(func, gridDim, blockDim, args, sharedMem, stream);
-}
-
-extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI
-cudaLaunchCooperativeKernelMultiDevice(
-    struct cudaLaunchParams *launchParamsList, unsigned int numDevices,
-    unsigned int flags __dv(0)) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(struct cudaLaunchParams *,
-                                           unsigned int, unsigned int);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaLaunchCooperativeKernelMultiDevice");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(launchParamsList, numDevices, flags);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaFuncSetCacheConfig(const void *func, enum cudaFuncCache cacheConfig) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(const void *, enum cudaFuncCache);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaFuncSetCacheConfig");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(func, cacheConfig);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaFuncSetSharedMemConfig(const void *func, enum cudaSharedMemConfig config) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(const void *, enum cudaSharedMemConfig);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaFuncSetSharedMemConfig");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(func, config);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaFuncGetAttributes(struct cudaFuncAttributes *attr, const void *func) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(struct cudaFuncAttributes *, const void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaFuncGetAttributes");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(attr, func);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaFuncSetAttribute(const void *func, enum cudaFuncAttribute attr, int value) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(const void *, enum cudaFuncAttribute, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaFuncSetAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(func, attr, value);
-}
-
-extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI
-cudaSetDoubleForDevice(double *d) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(double *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaSetDoubleForDevice");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(d);
-}
-
-extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI
-cudaSetDoubleForHost(double *d) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(double *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaSetDoubleForHost");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(d);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaLaunchHostFunc(cudaStream_t stream,
-                                                         cudaHostFn_t fn,
-                                                         void *userData) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaStream_t, cudaHostFn_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaLaunchHostFunc");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(stream, fn, userData);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaOccupancyMaxActiveBlocksPerMultiprocessor(int *numBlocks, const void *func,
-                                              int blockSize,
-                                              size_t dynamicSMemSize) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(int *, const void *, int, size_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaOccupancyMaxActiveBlocksPerMultiprocessor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(numBlocks, func, blockSize, dynamicSMemSize);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaOccupancyAvailableDynamicSMemPerBlock(size_t *dynamicSmemSize,
-                                          const void *func, int numBlocks,
-                                          int blockSize) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(size_t *, const void *, int, int);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaOccupancyAvailableDynamicSMemPerBlock");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dynamicSmemSize, func, numBlocks, blockSize);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(int *numBlocks,
-                                                       const void *func,
-                                                       int blockSize,
-                                                       size_t dynamicSMemSize,
-                                                       unsigned int flags) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(int *, const void *, int, size_t, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>(
-      "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(numBlocks, func, blockSize, dynamicSMemSize, flags);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaMallocManaged(void **devPtr, size_t size, unsigned int flags) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void **, size_t, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMallocManaged");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(devPtr, size, flags);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaMalloc(void **devPtr, size_t size) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void **, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMalloc");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(devPtr, size);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMallocHost(void **ptr, size_t size) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void **, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMallocHost");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(ptr, size);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMallocPitch(void **devPtr,
-                                                      size_t *pitch,
-                                                      size_t width,
-                                                      size_t height) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void **, size_t *, size_t, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMallocPitch");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(devPtr, pitch, width, height);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMallocArray(
-    cudaArray_t *array, const struct cudaChannelFormatDesc *desc, size_t width,
-    size_t height __dv(0), unsigned int flags __dv(0)) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaArray_t *,
-                                           const struct cudaChannelFormatDesc *,
-                                           size_t, size_t, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMallocArray");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(array, desc, width, height, flags);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaFree(void *devPtr) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaFree");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(devPtr);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaFreeHost(void *ptr) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaFreeHost");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(ptr);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaFreeArray(cudaArray_t array) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaArray_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaFreeArray");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(array);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaFreeMipmappedArray(cudaMipmappedArray_t mipmappedArray) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaMipmappedArray_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaFreeMipmappedArray");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(mipmappedArray);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaHostAlloc(void **pHost, size_t size,
-                                                    unsigned int flags) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void **, size_t, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaHostAlloc");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pHost, size, flags);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaHostRegister(void *ptr, size_t size,
-                                                       unsigned int flags) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void *, size_t, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaHostRegister");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(ptr, size, flags);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaHostUnregister(void *ptr) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaHostUnregister");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(ptr);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaHostGetDevicePointer(void **pDevice, void *pHost, unsigned int flags) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void **, void *, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaHostGetDevicePointer");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pDevice, pHost, flags);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaHostGetFlags(unsigned int *pFlags,
-                                                       void *pHost) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(unsigned int *, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaHostGetFlags");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pFlags, pHost);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaMalloc3D(struct cudaPitchedPtr *pitchedDevPtr, struct cudaExtent extent) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(struct cudaPitchedPtr *, struct cudaExtent);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMalloc3D");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pitchedDevPtr, extent);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaMalloc3DArray(cudaArray_t *array, const struct cudaChannelFormatDesc *desc,
-                  struct cudaExtent extent, unsigned int flags __dv(0)) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaArray_t *,
-                                           const struct cudaChannelFormatDesc *,
-                                           struct cudaExtent, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMalloc3DArray");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(array, desc, extent, flags);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMallocMipmappedArray(
-    cudaMipmappedArray_t *mipmappedArray,
-    const struct cudaChannelFormatDesc *desc, struct cudaExtent extent,
-    unsigned int numLevels, unsigned int flags __dv(0)) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(
-      cudaMipmappedArray_t *, const struct cudaChannelFormatDesc *,
-      struct cudaExtent, unsigned int, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMallocMipmappedArray");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(mipmappedArray, desc, extent, numLevels, flags);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGetMipmappedArrayLevel(
-    cudaArray_t *levelArray, cudaMipmappedArray_const_t mipmappedArray,
-    unsigned int level) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(
-      cudaArray_t *, cudaMipmappedArray_const_t, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGetMipmappedArrayLevel");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(levelArray, mipmappedArray, level);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaMemcpy3D(const struct cudaMemcpy3DParms *p) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(const struct cudaMemcpy3DParms *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpy3D");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(p);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaMemcpy3DPeer(const struct cudaMemcpy3DPeerParms *p) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(const struct cudaMemcpy3DPeerParms *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpy3DPeer");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(p);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpy3DAsync(
-    const struct cudaMemcpy3DParms *p, cudaStream_t stream __dv(0)) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(const struct cudaMemcpy3DParms *, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpy3DAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(p, stream);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMemcpy3DPeerAsync(
-    const struct cudaMemcpy3DPeerParms *p, cudaStream_t stream __dv(0)) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(const struct cudaMemcpy3DPeerParms *,
-                                           cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpy3DPeerAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(p, stream);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMemGetInfo(size_t *free,
-                                                     size_t *total) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(size_t *, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemGetInfo");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(free, total);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaArrayGetInfo(struct cudaChannelFormatDesc *desc, struct cudaExtent *extent,
-                 unsigned int *flags, cudaArray_t array) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(struct cudaChannelFormatDesc *,
-                                           struct cudaExtent *, unsigned int *,
-                                           cudaArray_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaArrayGetInfo");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(desc, extent, flags, array);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaArrayGetPlane(
-    cudaArray_t *pPlaneArray, cudaArray_t hArray, unsigned int planeIdx) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaArray_t *, cudaArray_t, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaArrayGetPlane");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pPlaneArray, hArray, planeIdx);
-}
-
-#if CUDA_VERSION >= 11010
-
-extern __host__ cudaError_t CUDARTAPI cudaArrayGetSparseProperties(
-    struct cudaArraySparseProperties *sparseProperties, cudaArray_t array) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(struct cudaArraySparseProperties *, cudaArray_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaArrayGetSparseProperties");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(sparseProperties, array);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMipmappedArrayGetSparseProperties(
-    struct cudaArraySparseProperties *sparseProperties,
-    cudaMipmappedArray_t mipmap) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(struct cudaArraySparseProperties *,
-                                           cudaMipmappedArray_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaMipmappedArrayGetSparseProperties");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(sparseProperties, mipmap);
-}
-
-#endif  // CUDA_VERSION >= 11010
-
-extern __host__ cudaError_t CUDARTAPI cudaMemcpy(void *dst, const void *src,
-                                                 size_t count,
-                                                 enum cudaMemcpyKind kind) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void *, const void *, size_t,
-                                           enum cudaMemcpyKind);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpy");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dst, src, count, kind);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMemcpyPeer(void *dst, int dstDevice,
-                                                     const void *src,
-                                                     int srcDevice,
-                                                     size_t count) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(void *, int, const void *, int, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpyPeer");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dst, dstDevice, src, srcDevice, count);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMemcpy2D(void *dst, size_t dpitch,
-                                                   const void *src,
-                                                   size_t spitch, size_t width,
-                                                   size_t height,
-                                                   enum cudaMemcpyKind kind) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void *, size_t, const void *, size_t,
-                                           size_t, size_t, enum cudaMemcpyKind);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpy2D");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dst, dpitch, src, spitch, width, height, kind);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMemcpy2DToArray(
-    cudaArray_t dst, size_t wOffset, size_t hOffset, const void *src,
-    size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaArray_t, size_t, size_t, const void *,
-                               size_t, size_t, size_t, enum cudaMemcpyKind);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpy2DToArray");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dst, wOffset, hOffset, src, spitch, width, height, kind);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMemcpy2DFromArray(
-    void *dst, size_t dpitch, cudaArray_const_t src, size_t wOffset,
-    size_t hOffset, size_t width, size_t height, enum cudaMemcpyKind kind) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(void *, size_t, cudaArray_const_t, size_t,
-                               size_t, size_t, size_t, enum cudaMemcpyKind);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpy2DFromArray");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dst, dpitch, src, wOffset, hOffset, width, height, kind);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMemcpy2DArrayToArray(
-    cudaArray_t dst, size_t wOffsetDst, size_t hOffsetDst,
-    cudaArray_const_t src, size_t wOffsetSrc, size_t hOffsetSrc, size_t width,
-    size_t height, enum cudaMemcpyKind kind __dv(cudaMemcpyDeviceToDevice)) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaArray_t, size_t, size_t,
-                                           cudaArray_const_t, size_t, size_t,
-                                           size_t, size_t, enum cudaMemcpyKind);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpy2DArrayToArray");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dst, wOffsetDst, hOffsetDst, src, wOffsetSrc, hOffsetSrc,
-                  width, height, kind);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMemcpyToSymbol(
-    const void *symbol, const void *src, size_t count, size_t offset __dv(0),
-    enum cudaMemcpyKind kind __dv(cudaMemcpyHostToDevice)) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(const void *, const void *, size_t,
-                                           size_t, enum cudaMemcpyKind);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpyToSymbol");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(symbol, src, count, offset, kind);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMemcpyFromSymbol(
-    void *dst, const void *symbol, size_t count, size_t offset __dv(0),
-    enum cudaMemcpyKind kind __dv(cudaMemcpyDeviceToHost)) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void *, const void *, size_t, size_t,
-                                           enum cudaMemcpyKind);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpyFromSymbol");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dst, symbol, count, offset, kind);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaMemcpyAsync(void *dst, const void *src, size_t count,
-                enum cudaMemcpyKind kind, cudaStream_t stream __dv(0)) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void *, const void *, size_t,
-                                           enum cudaMemcpyKind, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpyAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dst, src, count, kind, stream);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaMemcpyPeerAsync(void *dst, int dstDevice, const void *src, int srcDevice,
-                    size_t count, cudaStream_t stream __dv(0)) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void *, int, const void *, int,
-                                           size_t, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpyPeerAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dst, dstDevice, src, srcDevice, count, stream);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpy2DAsync(
-    void *dst, size_t dpitch, const void *src, size_t spitch, size_t width,
-    size_t height, enum cudaMemcpyKind kind, cudaStream_t stream __dv(0)) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(void *, size_t, const void *, size_t, size_t,
-                               size_t, enum cudaMemcpyKind, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpy2DAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dst, dpitch, src, spitch, width, height, kind, stream);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMemcpy2DToArrayAsync(
-    cudaArray_t dst, size_t wOffset, size_t hOffset, const void *src,
-    size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind,
-    cudaStream_t stream __dv(0)) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaArray_t, size_t, size_t,
-                                           const void *, size_t, size_t, size_t,
-                                           enum cudaMemcpyKind, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpy2DToArrayAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dst, wOffset, hOffset, src, spitch, width, height, kind,
-                  stream);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMemcpy2DFromArrayAsync(
-    void *dst, size_t dpitch, cudaArray_const_t src, size_t wOffset,
-    size_t hOffset, size_t width, size_t height, enum cudaMemcpyKind kind,
-    cudaStream_t stream __dv(0)) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void *, size_t, cudaArray_const_t,
-                                           size_t, size_t, size_t, size_t,
-                                           enum cudaMemcpyKind, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpy2DFromArrayAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dst, dpitch, src, wOffset, hOffset, width, height, kind,
-                  stream);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMemcpyToSymbolAsync(
-    const void *symbol, const void *src, size_t count, size_t offset,
-    enum cudaMemcpyKind kind, cudaStream_t stream __dv(0)) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(const void *, const void *, size_t, size_t,
-                               enum cudaMemcpyKind, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpyToSymbolAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(symbol, src, count, offset, kind, stream);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMemcpyFromSymbolAsync(
-    void *dst, const void *symbol, size_t count, size_t offset,
-    enum cudaMemcpyKind kind, cudaStream_t stream __dv(0)) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void *, const void *, size_t, size_t,
-                                           enum cudaMemcpyKind, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpyFromSymbolAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dst, symbol, count, offset, kind, stream);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMemset(void *devPtr, int value,
-                                                 size_t count) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void *, int, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemset");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(devPtr, value, count);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMemset2D(void *devPtr, size_t pitch,
-                                                   int value, size_t width,
-                                                   size_t height) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void *, size_t, int, size_t, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemset2D");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(devPtr, pitch, value, width, height);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMemset3D(
-    struct cudaPitchedPtr pitchedDevPtr, int value, struct cudaExtent extent) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(struct cudaPitchedPtr, int, struct cudaExtent);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemset3D");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pitchedDevPtr, value, extent);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemsetAsync(
-    void *devPtr, int value, size_t count, cudaStream_t stream __dv(0)) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void *, int, size_t, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemsetAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(devPtr, value, count, stream);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaMemset2DAsync(void *devPtr, size_t pitch, int value, size_t width,
-                  size_t height, cudaStream_t stream __dv(0)) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void *, size_t, int, size_t, size_t,
-                                           cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemset2DAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(devPtr, pitch, value, width, height, stream);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaMemset3DAsync(struct cudaPitchedPtr pitchedDevPtr, int value,
-                  struct cudaExtent extent, cudaStream_t stream __dv(0)) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(struct cudaPitchedPtr, int,
-                                           struct cudaExtent, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemset3DAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pitchedDevPtr, value, extent, stream);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGetSymbolAddress(void **devPtr,
-                                                           const void *symbol) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void **, const void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGetSymbolAddress");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(devPtr, symbol);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGetSymbolSize(size_t *size,
-                                                        const void *symbol) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(size_t *, const void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGetSymbolSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(size, symbol);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaMemPrefetchAsync(const void *devPtr, size_t count, int dstDevice,
-                     cudaStream_t stream __dv(0)) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(const void *, size_t, int, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemPrefetchAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(devPtr, count, dstDevice, stream);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaMemAdvise(const void *devPtr, size_t count, enum cudaMemoryAdvise advice,
-              int device) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(const void *, size_t,
-                                           enum cudaMemoryAdvise, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemAdvise");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(devPtr, count, advice, device);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMemRangeGetAttribute(
-    void *data, size_t dataSize, enum cudaMemRangeAttribute attribute,
-    const void *devPtr, size_t count) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(
-      void *, size_t, enum cudaMemRangeAttribute, const void *, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemRangeGetAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(data, dataSize, attribute, devPtr, count);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMemRangeGetAttributes(
-    void **data, size_t *dataSizes, enum cudaMemRangeAttribute *attributes,
-    size_t numAttributes, const void *devPtr, size_t count) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(void **, size_t *, enum cudaMemRangeAttribute *,
-                               size_t, const void *, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemRangeGetAttributes");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(data, dataSizes, attributes, numAttributes, devPtr, count);
-}
-
-extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI
-cudaMemcpyToArray(cudaArray_t dst, size_t wOffset, size_t hOffset,
-                  const void *src, size_t count, enum cudaMemcpyKind kind) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(
-      cudaArray_t, size_t, size_t, const void *, size_t, enum cudaMemcpyKind);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpyToArray");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dst, wOffset, hOffset, src, count, kind);
-}
-
-extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI
-cudaMemcpyFromArray(void *dst, cudaArray_const_t src, size_t wOffset,
-                    size_t hOffset, size_t count, enum cudaMemcpyKind kind) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void *, cudaArray_const_t, size_t,
-                                           size_t, size_t, enum cudaMemcpyKind);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpyFromArray");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dst, src, wOffset, hOffset, count, kind);
-}
-
-extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaMemcpyArrayToArray(
-    cudaArray_t dst, size_t wOffsetDst, size_t hOffsetDst,
-    cudaArray_const_t src, size_t wOffsetSrc, size_t hOffsetSrc, size_t count,
-    enum cudaMemcpyKind kind __dv(cudaMemcpyDeviceToDevice)) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaArray_t, size_t, size_t, cudaArray_const_t,
-                               size_t, size_t, size_t, enum cudaMemcpyKind);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpyArrayToArray");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dst, wOffsetDst, hOffsetDst, src, wOffsetSrc, hOffsetSrc,
-                  count, kind);
-}
-
-extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaMemcpyToArrayAsync(
-    cudaArray_t dst, size_t wOffset, size_t hOffset, const void *src,
-    size_t count, enum cudaMemcpyKind kind, cudaStream_t stream __dv(0)) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaArray_t, size_t, size_t, const void *,
-                               size_t, enum cudaMemcpyKind, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpyToArrayAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dst, wOffset, hOffset, src, count, kind, stream);
-}
-
-extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI
-cudaMemcpyFromArrayAsync(void *dst, cudaArray_const_t src, size_t wOffset,
-                         size_t hOffset, size_t count, enum cudaMemcpyKind kind,
-                         cudaStream_t stream __dv(0)) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(void *, cudaArray_const_t, size_t, size_t,
-                               size_t, enum cudaMemcpyKind, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpyFromArrayAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dst, src, wOffset, hOffset, count, kind, stream);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMallocAsync(void **devPtr,
-                                                      size_t size,
-                                                      cudaStream_t hStream) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void **, size_t, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMallocAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(devPtr, size, hStream);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaFreeAsync(void *devPtr,
-                                                    cudaStream_t hStream) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void *, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaFreeAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(devPtr, hStream);
-}
-
-#if CUDA_VERSION >= 11020
-
-extern __host__ cudaError_t CUDARTAPI cudaMemPoolTrimTo(cudaMemPool_t memPool,
-                                                        size_t minBytesToKeep) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaMemPool_t, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemPoolTrimTo");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(memPool, minBytesToKeep);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMemPoolSetAttribute(
-    cudaMemPool_t memPool, enum cudaMemPoolAttr attr, void *value) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaMemPool_t, enum cudaMemPoolAttr, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemPoolSetAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(memPool, attr, value);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMemPoolGetAttribute(
-    cudaMemPool_t memPool, enum cudaMemPoolAttr attr, void *value) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaMemPool_t, enum cudaMemPoolAttr, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemPoolGetAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(memPool, attr, value);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaMemPoolSetAccess(cudaMemPool_t memPool,
-                     const struct cudaMemAccessDesc *descList, size_t count) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(
-      cudaMemPool_t, const struct cudaMemAccessDesc *, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemPoolSetAccess");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(memPool, descList, count);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaMemPoolGetAccess(enum cudaMemAccessFlags *flags, cudaMemPool_t memPool,
-                     struct cudaMemLocation *location) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(
-      enum cudaMemAccessFlags *, cudaMemPool_t, struct cudaMemLocation *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemPoolGetAccess");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(flags, memPool, location);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMemPoolCreate(
-    cudaMemPool_t *memPool, const struct cudaMemPoolProps *poolProps) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaMemPool_t *,
-                                           const struct cudaMemPoolProps *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemPoolCreate");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(memPool, poolProps);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaMemPoolDestroy(cudaMemPool_t memPool) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaMemPool_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemPoolDestroy");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(memPool);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMallocFromPoolAsync(
-    void **ptr, size_t size, cudaMemPool_t memPool, cudaStream_t stream) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(void **, size_t, cudaMemPool_t, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMallocFromPoolAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(ptr, size, memPool, stream);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMemPoolExportToShareableHandle(
-    void *shareableHandle, cudaMemPool_t memPool,
-    enum cudaMemAllocationHandleType handleType, unsigned int flags) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(
-      void *, cudaMemPool_t, enum cudaMemAllocationHandleType, unsigned int);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaMemPoolExportToShareableHandle");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(shareableHandle, memPool, handleType, flags);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMemPoolImportFromShareableHandle(
-    cudaMemPool_t *memPool, void *shareableHandle,
-    enum cudaMemAllocationHandleType handleType, unsigned int flags) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(
-      cudaMemPool_t *, void *, enum cudaMemAllocationHandleType, unsigned int);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaMemPoolImportFromShareableHandle");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(memPool, shareableHandle, handleType, flags);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMemPoolExportPointer(
-    struct cudaMemPoolPtrExportData *exportData, void *ptr) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(struct cudaMemPoolPtrExportData *, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemPoolExportPointer");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(exportData, ptr);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaMemPoolImportPointer(void **ptr, cudaMemPool_t memPool,
-                         struct cudaMemPoolPtrExportData *exportData) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void **, cudaMemPool_t,
-                                           struct cudaMemPoolPtrExportData *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemPoolImportPointer");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(ptr, memPool, exportData);
-}
-
-#endif  // CUDA_VERSION >= 11020
-
-extern __host__ cudaError_t CUDARTAPI cudaPointerGetAttributes(
-    struct cudaPointerAttributes *attributes, const void *ptr) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(struct cudaPointerAttributes *, const void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaPointerGetAttributes");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(attributes, ptr);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaDeviceCanAccessPeer(int *canAccessPeer, int device, int peerDevice) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(int *, int, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaDeviceCanAccessPeer");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(canAccessPeer, device, peerDevice);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaDeviceEnablePeerAccess(int peerDevice, unsigned int flags) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(int, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaDeviceEnablePeerAccess");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(peerDevice, flags);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaDeviceDisablePeerAccess(int peerDevice) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaDeviceDisablePeerAccess");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(peerDevice);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaGraphicsUnregisterResource(cudaGraphicsResource_t resource) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraphicsResource_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphicsUnregisterResource");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(resource);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphicsResourceSetMapFlags(
-    cudaGraphicsResource_t resource, unsigned int flags) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaGraphicsResource_t, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphicsResourceSetMapFlags");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(resource, flags);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphicsMapResources(
-    int count, cudaGraphicsResource_t *resources, cudaStream_t stream __dv(0)) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(int, cudaGraphicsResource_t *, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphicsMapResources");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(count, resources, stream);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphicsUnmapResources(
-    int count, cudaGraphicsResource_t *resources, cudaStream_t stream __dv(0)) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(int, cudaGraphicsResource_t *, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphicsUnmapResources");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(count, resources, stream);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphicsResourceGetMappedPointer(
-    void **devPtr, size_t *size, cudaGraphicsResource_t resource) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(void **, size_t *, cudaGraphicsResource_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaGraphicsResourceGetMappedPointer");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(devPtr, size, resource);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphicsSubResourceGetMappedArray(
-    cudaArray_t *array, cudaGraphicsResource_t resource,
-    unsigned int arrayIndex, unsigned int mipLevel) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(
-      cudaArray_t *, cudaGraphicsResource_t, unsigned int, unsigned int);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaGraphicsSubResourceGetMappedArray");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(array, resource, arrayIndex, mipLevel);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaGraphicsResourceGetMappedMipmappedArray(
-    cudaMipmappedArray_t *mipmappedArray, cudaGraphicsResource_t resource) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaMipmappedArray_t *, cudaGraphicsResource_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaGraphicsResourceGetMappedMipmappedArray");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(mipmappedArray, resource);
-}
-
-extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaBindTexture(
-    size_t *offset, const struct textureReference *texref, const void *devPtr,
-    const struct cudaChannelFormatDesc *desc, size_t size __dv(UINT_MAX)) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(
-      size_t *, const struct textureReference *, const void *,
-      const struct cudaChannelFormatDesc *, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaBindTexture");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(offset, texref, devPtr, desc, size);
-}
-
-extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI
-cudaBindTexture2D(size_t *offset, const struct textureReference *texref,
-                  const void *devPtr, const struct cudaChannelFormatDesc *desc,
-                  size_t width, size_t height, size_t pitch) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(
-      size_t *, const struct textureReference *, const void *,
-      const struct cudaChannelFormatDesc *, size_t, size_t, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaBindTexture2D");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(offset, texref, devPtr, desc, width, height, pitch);
-}
-
-extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaBindTextureToArray(
-    const struct textureReference *texref, cudaArray_const_t array,
-    const struct cudaChannelFormatDesc *desc) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(
-      const struct textureReference *, cudaArray_const_t,
-      const struct cudaChannelFormatDesc *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaBindTextureToArray");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(texref, array, desc);
-}
-
-extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI
-cudaBindTextureToMipmappedArray(const struct textureReference *texref,
-                                cudaMipmappedArray_const_t mipmappedArray,
-                                const struct cudaChannelFormatDesc *desc) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(
-      const struct textureReference *, cudaMipmappedArray_const_t,
-      const struct cudaChannelFormatDesc *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaBindTextureToMipmappedArray");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(texref, mipmappedArray, desc);
-}
-
-extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI
-cudaUnbindTexture(const struct textureReference *texref) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(const struct textureReference *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaUnbindTexture");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(texref);
-}
-
-extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI
-cudaGetTextureAlignmentOffset(size_t *offset,
-                              const struct textureReference *texref) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(size_t *, const struct textureReference *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGetTextureAlignmentOffset");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(offset, texref);
-}
-
-extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaGetTextureReference(
-    const struct textureReference **texref, const void *symbol) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(const struct textureReference **, const void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGetTextureReference");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(texref, symbol);
-}
-
-extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaBindSurfaceToArray(
-    const struct surfaceReference *surfref, cudaArray_const_t array,
-    const struct cudaChannelFormatDesc *desc) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(
-      const struct surfaceReference *, cudaArray_const_t,
-      const struct cudaChannelFormatDesc *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaBindSurfaceToArray");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(surfref, array, desc);
-}
-
-extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaGetSurfaceReference(
-    const struct surfaceReference **surfref, const void *symbol) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(const struct surfaceReference **, const void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGetSurfaceReference");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(surfref, symbol);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGetChannelDesc(
-    struct cudaChannelFormatDesc *desc, cudaArray_const_t array) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(struct cudaChannelFormatDesc *,
-                                           cudaArray_const_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGetChannelDesc");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(desc, array);
-}
-
-extern __host__ struct cudaChannelFormatDesc CUDARTAPI cudaCreateChannelDesc(
-    int x, int y, int z, int w, enum cudaChannelFormatKind f) {
-  using FuncPtr = struct cudaChannelFormatDesc(CUDARTAPI *)(
-      int, int, int, int, enum cudaChannelFormatKind);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaCreateChannelDesc");
-  if (!func_ptr) {
-    return cudaChannelFormatDesc{cudaChannelFormatKind(-1), 0, 0, 0};
-  }
-  return func_ptr(x, y, z, w, f);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaCreateTextureObject(
-    cudaTextureObject_t *pTexObject, const struct cudaResourceDesc *pResDesc,
-    const struct cudaTextureDesc *pTexDesc,
-    const struct cudaResourceViewDesc *pResViewDesc) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(
-      cudaTextureObject_t *, const struct cudaResourceDesc *,
-      const struct cudaTextureDesc *, const struct cudaResourceViewDesc *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaCreateTextureObject");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pTexObject, pResDesc, pTexDesc, pResViewDesc);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaDestroyTextureObject(cudaTextureObject_t texObject) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaTextureObject_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaDestroyTextureObject");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(texObject);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGetTextureObjectResourceDesc(
-    struct cudaResourceDesc *pResDesc, cudaTextureObject_t texObject) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(struct cudaResourceDesc *, cudaTextureObject_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaGetTextureObjectResourceDesc");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pResDesc, texObject);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGetTextureObjectTextureDesc(
-    struct cudaTextureDesc *pTexDesc, cudaTextureObject_t texObject) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(struct cudaTextureDesc *, cudaTextureObject_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGetTextureObjectTextureDesc");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pTexDesc, texObject);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGetTextureObjectResourceViewDesc(
-    struct cudaResourceViewDesc *pResViewDesc, cudaTextureObject_t texObject) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(struct cudaResourceViewDesc *,
-                                           cudaTextureObject_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaGetTextureObjectResourceViewDesc");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pResViewDesc, texObject);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaCreateSurfaceObject(
-    cudaSurfaceObject_t *pSurfObject, const struct cudaResourceDesc *pResDesc) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaSurfaceObject_t *,
-                                           const struct cudaResourceDesc *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaCreateSurfaceObject");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pSurfObject, pResDesc);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaDestroySurfaceObject(cudaSurfaceObject_t surfObject) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaSurfaceObject_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaDestroySurfaceObject");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(surfObject);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGetSurfaceObjectResourceDesc(
-    struct cudaResourceDesc *pResDesc, cudaSurfaceObject_t surfObject) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(struct cudaResourceDesc *, cudaSurfaceObject_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaGetSurfaceObjectResourceDesc");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pResDesc, surfObject);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaDriverGetVersion(int *driverVersion) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaDriverGetVersion");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(driverVersion);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaRuntimeGetVersion(int *runtimeVersion) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaRuntimeGetVersion");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(runtimeVersion);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphCreate(cudaGraph_t *pGraph,
-                                                      unsigned int flags) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraph_t *, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphCreate");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pGraph, flags);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphAddKernelNode(
-    cudaGraphNode_t *pGraphNode, cudaGraph_t graph,
-    const cudaGraphNode_t *pDependencies, size_t numDependencies,
-    const struct cudaKernelNodeParams *pNodeParams) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraphNode_t *, cudaGraph_t,
-                                           const cudaGraphNode_t *, size_t,
-                                           const struct cudaKernelNodeParams *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphAddKernelNode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pGraphNode, graph, pDependencies, numDependencies,
-                  pNodeParams);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphKernelNodeGetParams(
-    cudaGraphNode_t node, struct cudaKernelNodeParams *pNodeParams) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaGraphNode_t, struct cudaKernelNodeParams *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphKernelNodeGetParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(node, pNodeParams);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphKernelNodeSetParams(
-    cudaGraphNode_t node, const struct cudaKernelNodeParams *pNodeParams) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraphNode_t,
-                                           const struct cudaKernelNodeParams *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphKernelNodeSetParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(node, pNodeParams);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaGraphKernelNodeCopyAttributes(cudaGraphNode_t hSrc, cudaGraphNode_t hDst) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraphNode_t, cudaGraphNode_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaGraphKernelNodeCopyAttributes");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hSrc, hDst);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphKernelNodeGetAttribute(
-    cudaGraphNode_t hNode, enum cudaKernelNodeAttrID attr,
-    union cudaKernelNodeAttrValue *value_out) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaGraphNode_t, enum cudaKernelNodeAttrID,
-                               union cudaKernelNodeAttrValue *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphKernelNodeGetAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hNode, attr, value_out);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphKernelNodeSetAttribute(
-    cudaGraphNode_t hNode, enum cudaKernelNodeAttrID attr,
-    const union cudaKernelNodeAttrValue *value) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaGraphNode_t, enum cudaKernelNodeAttrID,
-                               const union cudaKernelNodeAttrValue *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphKernelNodeSetAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hNode, attr, value);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphAddMemcpyNode(
-    cudaGraphNode_t *pGraphNode, cudaGraph_t graph,
-    const cudaGraphNode_t *pDependencies, size_t numDependencies,
-    const struct cudaMemcpy3DParms *pCopyParams) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraphNode_t *, cudaGraph_t,
-                                           const cudaGraphNode_t *, size_t,
-                                           const struct cudaMemcpy3DParms *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphAddMemcpyNode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pGraphNode, graph, pDependencies, numDependencies,
-                  pCopyParams);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphAddMemcpyNodeToSymbol(
-    cudaGraphNode_t *pGraphNode, cudaGraph_t graph,
-    const cudaGraphNode_t *pDependencies, size_t numDependencies,
-    const void *symbol, const void *src, size_t count, size_t offset,
-    enum cudaMemcpyKind kind) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(
-      cudaGraphNode_t *, cudaGraph_t, const cudaGraphNode_t *, size_t,
-      const void *, const void *, size_t, size_t, enum cudaMemcpyKind);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphAddMemcpyNodeToSymbol");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pGraphNode, graph, pDependencies, numDependencies, symbol,
-                  src, count, offset, kind);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphAddMemcpyNodeFromSymbol(
-    cudaGraphNode_t *pGraphNode, cudaGraph_t graph,
-    const cudaGraphNode_t *pDependencies, size_t numDependencies, void *dst,
-    const void *symbol, size_t count, size_t offset, enum cudaMemcpyKind kind) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(
-      cudaGraphNode_t *, cudaGraph_t, const cudaGraphNode_t *, size_t, void *,
-      const void *, size_t, size_t, enum cudaMemcpyKind);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaGraphAddMemcpyNodeFromSymbol");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pGraphNode, graph, pDependencies, numDependencies, dst,
-                  symbol, count, offset, kind);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphAddMemcpyNode1D(
-    cudaGraphNode_t *pGraphNode, cudaGraph_t graph,
-    const cudaGraphNode_t *pDependencies, size_t numDependencies, void *dst,
-    const void *src, size_t count, enum cudaMemcpyKind kind) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(
-      cudaGraphNode_t *, cudaGraph_t, const cudaGraphNode_t *, size_t, void *,
-      const void *, size_t, enum cudaMemcpyKind);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphAddMemcpyNode1D");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pGraphNode, graph, pDependencies, numDependencies, dst, src,
-                  count, kind);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphMemcpyNodeGetParams(
-    cudaGraphNode_t node, struct cudaMemcpy3DParms *pNodeParams) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaGraphNode_t, struct cudaMemcpy3DParms *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphMemcpyNodeGetParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(node, pNodeParams);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphMemcpyNodeSetParams(
-    cudaGraphNode_t node, const struct cudaMemcpy3DParms *pNodeParams) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraphNode_t,
-                                           const struct cudaMemcpy3DParms *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphMemcpyNodeSetParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(node, pNodeParams);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphMemcpyNodeSetParamsToSymbol(
-    cudaGraphNode_t node, const void *symbol, const void *src, size_t count,
-    size_t offset, enum cudaMemcpyKind kind) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaGraphNode_t, const void *, const void *,
-                               size_t, size_t, enum cudaMemcpyKind);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaGraphMemcpyNodeSetParamsToSymbol");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(node, symbol, src, count, offset, kind);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphMemcpyNodeSetParamsFromSymbol(
-    cudaGraphNode_t node, void *dst, const void *symbol, size_t count,
-    size_t offset, enum cudaMemcpyKind kind) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaGraphNode_t, void *, const void *, size_t,
-                               size_t, enum cudaMemcpyKind);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaGraphMemcpyNodeSetParamsFromSymbol");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(node, dst, symbol, count, offset, kind);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaGraphMemcpyNodeSetParams1D(cudaGraphNode_t node, void *dst, const void *src,
-                               size_t count, enum cudaMemcpyKind kind) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(
-      cudaGraphNode_t, void *, const void *, size_t, enum cudaMemcpyKind);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphMemcpyNodeSetParams1D");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(node, dst, src, count, kind);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphAddMemsetNode(
-    cudaGraphNode_t *pGraphNode, cudaGraph_t graph,
-    const cudaGraphNode_t *pDependencies, size_t numDependencies,
-    const struct cudaMemsetParams *pMemsetParams) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraphNode_t *, cudaGraph_t,
-                                           const cudaGraphNode_t *, size_t,
-                                           const struct cudaMemsetParams *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphAddMemsetNode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pGraphNode, graph, pDependencies, numDependencies,
-                  pMemsetParams);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphMemsetNodeGetParams(
-    cudaGraphNode_t node, struct cudaMemsetParams *pNodeParams) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaGraphNode_t, struct cudaMemsetParams *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphMemsetNodeGetParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(node, pNodeParams);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphMemsetNodeSetParams(
-    cudaGraphNode_t node, const struct cudaMemsetParams *pNodeParams) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraphNode_t,
-                                           const struct cudaMemsetParams *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphMemsetNodeSetParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(node, pNodeParams);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphAddHostNode(
-    cudaGraphNode_t *pGraphNode, cudaGraph_t graph,
-    const cudaGraphNode_t *pDependencies, size_t numDependencies,
-    const struct cudaHostNodeParams *pNodeParams) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraphNode_t *, cudaGraph_t,
-                                           const cudaGraphNode_t *, size_t,
-                                           const struct cudaHostNodeParams *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphAddHostNode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pGraphNode, graph, pDependencies, numDependencies,
-                  pNodeParams);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphHostNodeGetParams(
-    cudaGraphNode_t node, struct cudaHostNodeParams *pNodeParams) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaGraphNode_t, struct cudaHostNodeParams *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphHostNodeGetParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(node, pNodeParams);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphHostNodeSetParams(
-    cudaGraphNode_t node, const struct cudaHostNodeParams *pNodeParams) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraphNode_t,
-                                           const struct cudaHostNodeParams *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphHostNodeSetParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(node, pNodeParams);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaGraphAddChildGraphNode(cudaGraphNode_t *pGraphNode, cudaGraph_t graph,
-                           const cudaGraphNode_t *pDependencies,
-                           size_t numDependencies, cudaGraph_t childGraph) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaGraphNode_t *, cudaGraph_t,
-                               const cudaGraphNode_t *, size_t, cudaGraph_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphAddChildGraphNode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pGraphNode, graph, pDependencies, numDependencies,
-                  childGraph);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaGraphChildGraphNodeGetGraph(cudaGraphNode_t node, cudaGraph_t *pGraph) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraphNode_t, cudaGraph_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphChildGraphNodeGetGraph");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(node, pGraph);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphAddEmptyNode(
-    cudaGraphNode_t *pGraphNode, cudaGraph_t graph,
-    const cudaGraphNode_t *pDependencies, size_t numDependencies) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraphNode_t *, cudaGraph_t,
-                                           const cudaGraphNode_t *, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphAddEmptyNode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pGraphNode, graph, pDependencies, numDependencies);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaGraphAddEventRecordNode(cudaGraphNode_t *pGraphNode, cudaGraph_t graph,
-                            const cudaGraphNode_t *pDependencies,
-                            size_t numDependencies, cudaEvent_t event) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaGraphNode_t *, cudaGraph_t,
-                               const cudaGraphNode_t *, size_t, cudaEvent_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphAddEventRecordNode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pGraphNode, graph, pDependencies, numDependencies, event);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaGraphEventRecordNodeGetEvent(cudaGraphNode_t node, cudaEvent_t *event_out) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraphNode_t, cudaEvent_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaGraphEventRecordNodeGetEvent");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(node, event_out);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaGraphEventRecordNodeSetEvent(cudaGraphNode_t node, cudaEvent_t event) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraphNode_t, cudaEvent_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaGraphEventRecordNodeSetEvent");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(node, event);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaGraphAddEventWaitNode(cudaGraphNode_t *pGraphNode, cudaGraph_t graph,
-                          const cudaGraphNode_t *pDependencies,
-                          size_t numDependencies, cudaEvent_t event) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaGraphNode_t *, cudaGraph_t,
-                               const cudaGraphNode_t *, size_t, cudaEvent_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphAddEventWaitNode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pGraphNode, graph, pDependencies, numDependencies, event);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaGraphEventWaitNodeGetEvent(cudaGraphNode_t node, cudaEvent_t *event_out) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraphNode_t, cudaEvent_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphEventWaitNodeGetEvent");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(node, event_out);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaGraphEventWaitNodeSetEvent(cudaGraphNode_t node, cudaEvent_t event) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraphNode_t, cudaEvent_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphEventWaitNodeSetEvent");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(node, event);
-}
-
-#if CUDA_VERSION >= 11020
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphAddExternalSemaphoresSignalNode(
-    cudaGraphNode_t *pGraphNode, cudaGraph_t graph,
-    const cudaGraphNode_t *pDependencies, size_t numDependencies,
-    const struct cudaExternalSemaphoreSignalNodeParams *nodeParams) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(
-      cudaGraphNode_t *, cudaGraph_t, const cudaGraphNode_t *, size_t,
-      const struct cudaExternalSemaphoreSignalNodeParams *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaGraphAddExternalSemaphoresSignalNode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pGraphNode, graph, pDependencies, numDependencies,
-                  nodeParams);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaGraphExternalSemaphoresSignalNodeGetParams(
-    cudaGraphNode_t hNode,
-    struct cudaExternalSemaphoreSignalNodeParams *params_out) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(
-      cudaGraphNode_t, struct cudaExternalSemaphoreSignalNodeParams *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaGraphExternalSemaphoresSignalNodeGetParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hNode, params_out);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaGraphExternalSemaphoresSignalNodeSetParams(
-    cudaGraphNode_t hNode,
-    const struct cudaExternalSemaphoreSignalNodeParams *nodeParams) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(
-      cudaGraphNode_t, const struct cudaExternalSemaphoreSignalNodeParams *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaGraphExternalSemaphoresSignalNodeSetParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hNode, nodeParams);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphAddExternalSemaphoresWaitNode(
-    cudaGraphNode_t *pGraphNode, cudaGraph_t graph,
-    const cudaGraphNode_t *pDependencies, size_t numDependencies,
-    const struct cudaExternalSemaphoreWaitNodeParams *nodeParams) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(
-      cudaGraphNode_t *, cudaGraph_t, const cudaGraphNode_t *, size_t,
-      const struct cudaExternalSemaphoreWaitNodeParams *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaGraphAddExternalSemaphoresWaitNode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pGraphNode, graph, pDependencies, numDependencies,
-                  nodeParams);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaGraphExternalSemaphoresWaitNodeGetParams(
-    cudaGraphNode_t hNode,
-    struct cudaExternalSemaphoreWaitNodeParams *params_out) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(
-      cudaGraphNode_t, struct cudaExternalSemaphoreWaitNodeParams *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaGraphExternalSemaphoresWaitNodeGetParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hNode, params_out);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaGraphExternalSemaphoresWaitNodeSetParams(
-    cudaGraphNode_t hNode,
-    const struct cudaExternalSemaphoreWaitNodeParams *nodeParams) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(
-      cudaGraphNode_t, const struct cudaExternalSemaphoreWaitNodeParams *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaGraphExternalSemaphoresWaitNodeSetParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hNode, nodeParams);
-}
-
-#endif  // CUDA_VERSION >= 11020
-
-extern __host__ cudaError_t CUDARTAPI
-cudaGraphClone(cudaGraph_t *pGraphClone, cudaGraph_t originalGraph) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraph_t *, cudaGraph_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphClone");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pGraphClone, originalGraph);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaGraphNodeFindInClone(cudaGraphNode_t *pNode, cudaGraphNode_t originalNode,
-                         cudaGraph_t clonedGraph) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaGraphNode_t *, cudaGraphNode_t, cudaGraph_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphNodeFindInClone");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pNode, originalNode, clonedGraph);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaGraphNodeGetType(cudaGraphNode_t node, enum cudaGraphNodeType *pType) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaGraphNode_t, enum cudaGraphNodeType *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphNodeGetType");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(node, pType);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphGetNodes(cudaGraph_t graph,
-                                                        cudaGraphNode_t *nodes,
-                                                        size_t *numNodes) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaGraph_t, cudaGraphNode_t *, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphGetNodes");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(graph, nodes, numNodes);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphGetRootNodes(
-    cudaGraph_t graph, cudaGraphNode_t *pRootNodes, size_t *pNumRootNodes) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaGraph_t, cudaGraphNode_t *, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphGetRootNodes");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(graph, pRootNodes, pNumRootNodes);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphGetEdges(cudaGraph_t graph,
-                                                        cudaGraphNode_t *from,
-                                                        cudaGraphNode_t *to,
-                                                        size_t *numEdges) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraph_t, cudaGraphNode_t *,
-                                           cudaGraphNode_t *, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphGetEdges");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(graph, from, to, numEdges);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphNodeGetDependencies(
-    cudaGraphNode_t node, cudaGraphNode_t *pDependencies,
-    size_t *pNumDependencies) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaGraphNode_t, cudaGraphNode_t *, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphNodeGetDependencies");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(node, pDependencies, pNumDependencies);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphNodeGetDependentNodes(
-    cudaGraphNode_t node, cudaGraphNode_t *pDependentNodes,
-    size_t *pNumDependentNodes) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaGraphNode_t, cudaGraphNode_t *, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphNodeGetDependentNodes");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(node, pDependentNodes, pNumDependentNodes);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaGraphAddDependencies(cudaGraph_t graph, const cudaGraphNode_t *from,
-                         const cudaGraphNode_t *to, size_t numDependencies) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraph_t, const cudaGraphNode_t *,
-                                           const cudaGraphNode_t *, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphAddDependencies");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(graph, from, to, numDependencies);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaGraphRemoveDependencies(cudaGraph_t graph, const cudaGraphNode_t *from,
-                            const cudaGraphNode_t *to, size_t numDependencies) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraph_t, const cudaGraphNode_t *,
-                                           const cudaGraphNode_t *, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphRemoveDependencies");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(graph, from, to, numDependencies);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaGraphDestroyNode(cudaGraphNode_t node) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraphNode_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphDestroyNode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(node);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphInstantiate(
-    cudaGraphExec_t *pGraphExec, cudaGraph_t graph, cudaGraphNode_t *pErrorNode,
-    char *pLogBuffer, size_t bufferSize) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraphExec_t *, cudaGraph_t,
-                                           cudaGraphNode_t *, char *, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphInstantiate");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pGraphExec, graph, pErrorNode, pLogBuffer, bufferSize);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphExecKernelNodeSetParams(
-    cudaGraphExec_t hGraphExec, cudaGraphNode_t node,
-    const struct cudaKernelNodeParams *pNodeParams) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraphExec_t, cudaGraphNode_t,
-                                           const struct cudaKernelNodeParams *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaGraphExecKernelNodeSetParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hGraphExec, node, pNodeParams);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphExecMemcpyNodeSetParams(
-    cudaGraphExec_t hGraphExec, cudaGraphNode_t node,
-    const struct cudaMemcpy3DParms *pNodeParams) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraphExec_t, cudaGraphNode_t,
-                                           const struct cudaMemcpy3DParms *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaGraphExecMemcpyNodeSetParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hGraphExec, node, pNodeParams);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphExecMemcpyNodeSetParamsToSymbol(
-    cudaGraphExec_t hGraphExec, cudaGraphNode_t node, const void *symbol,
-    const void *src, size_t count, size_t offset, enum cudaMemcpyKind kind) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraphExec_t, cudaGraphNode_t,
-                                           const void *, const void *, size_t,
-                                           size_t, enum cudaMemcpyKind);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaGraphExecMemcpyNodeSetParamsToSymbol");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hGraphExec, node, symbol, src, count, offset, kind);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaGraphExecMemcpyNodeSetParamsFromSymbol(cudaGraphExec_t hGraphExec,
-                                           cudaGraphNode_t node, void *dst,
-                                           const void *symbol, size_t count,
-                                           size_t offset,
-                                           enum cudaMemcpyKind kind) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraphExec_t, cudaGraphNode_t,
-                                           void *, const void *, size_t, size_t,
-                                           enum cudaMemcpyKind);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaGraphExecMemcpyNodeSetParamsFromSymbol");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hGraphExec, node, dst, symbol, count, offset, kind);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphExecMemcpyNodeSetParams1D(
-    cudaGraphExec_t hGraphExec, cudaGraphNode_t node, void *dst,
-    const void *src, size_t count, enum cudaMemcpyKind kind) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaGraphExec_t, cudaGraphNode_t, void *,
-                               const void *, size_t, enum cudaMemcpyKind);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaGraphExecMemcpyNodeSetParams1D");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hGraphExec, node, dst, src, count, kind);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphExecMemsetNodeSetParams(
-    cudaGraphExec_t hGraphExec, cudaGraphNode_t node,
-    const struct cudaMemsetParams *pNodeParams) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraphExec_t, cudaGraphNode_t,
-                                           const struct cudaMemsetParams *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaGraphExecMemsetNodeSetParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hGraphExec, node, pNodeParams);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaGraphExecHostNodeSetParams(cudaGraphExec_t hGraphExec, cudaGraphNode_t node,
-                               const struct cudaHostNodeParams *pNodeParams) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraphExec_t, cudaGraphNode_t,
-                                           const struct cudaHostNodeParams *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphExecHostNodeSetParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hGraphExec, node, pNodeParams);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphExecChildGraphNodeSetParams(
-    cudaGraphExec_t hGraphExec, cudaGraphNode_t node, cudaGraph_t childGraph) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaGraphExec_t, cudaGraphNode_t, cudaGraph_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaGraphExecChildGraphNodeSetParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hGraphExec, node, childGraph);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphExecEventRecordNodeSetEvent(
-    cudaGraphExec_t hGraphExec, cudaGraphNode_t hNode, cudaEvent_t event) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaGraphExec_t, cudaGraphNode_t, cudaEvent_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaGraphExecEventRecordNodeSetEvent");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hGraphExec, hNode, event);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphExecEventWaitNodeSetEvent(
-    cudaGraphExec_t hGraphExec, cudaGraphNode_t hNode, cudaEvent_t event) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaGraphExec_t, cudaGraphNode_t, cudaEvent_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaGraphExecEventWaitNodeSetEvent");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hGraphExec, hNode, event);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaGraphExecExternalSemaphoresSignalNodeSetParams(
-    cudaGraphExec_t hGraphExec, cudaGraphNode_t hNode,
-    const struct cudaExternalSemaphoreSignalNodeParams *nodeParams) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(
-      cudaGraphExec_t, cudaGraphNode_t,
-      const struct cudaExternalSemaphoreSignalNodeParams *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaGraphExecExternalSemaphoresSignalNodeSetParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hGraphExec, hNode, nodeParams);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaGraphExecExternalSemaphoresWaitNodeSetParams(
-    cudaGraphExec_t hGraphExec, cudaGraphNode_t hNode,
-    const struct cudaExternalSemaphoreWaitNodeParams *nodeParams) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(
-      cudaGraphExec_t, cudaGraphNode_t,
-      const struct cudaExternalSemaphoreWaitNodeParams *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaGraphExecExternalSemaphoresWaitNodeSetParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hGraphExec, hNode, nodeParams);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaGraphExecUpdate(cudaGraphExec_t hGraphExec, cudaGraph_t hGraph,
-                    cudaGraphNode_t *hErrorNode_out,
-                    enum cudaGraphExecUpdateResult *updateResult_out) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaGraphExec_t, cudaGraph_t, cudaGraphNode_t *,
-                               enum cudaGraphExecUpdateResult *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphExecUpdate");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hGraphExec, hGraph, hErrorNode_out, updateResult_out);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphUpload(cudaGraphExec_t graphExec,
-                                                      cudaStream_t stream) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraphExec_t, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphUpload");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(graphExec, stream);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphLaunch(cudaGraphExec_t graphExec,
-                                                      cudaStream_t stream) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraphExec_t, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphLaunch");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(graphExec, stream);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaGraphExecDestroy(cudaGraphExec_t graphExec) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraphExec_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphExecDestroy");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(graphExec);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphDestroy(cudaGraph_t graph) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraph_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphDestroy");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(graph);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphDebugDotPrint(
-    cudaGraph_t graph, const char *path, unsigned int flags) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaGraph_t, const char *, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphDebugDotPrint");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(graph, path, flags);
-}
-
-#if CUDA_VERSION >= 11030
-
-extern __host__ cudaError_t CUDARTAPI cudaUserObjectCreate(
-    cudaUserObject_t *object_out, void *ptr, cudaHostFn_t destroy,
-    unsigned int initialRefcount, unsigned int flags) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(
-      cudaUserObject_t *, void *, cudaHostFn_t, unsigned int, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaUserObjectCreate");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(object_out, ptr, destroy, initialRefcount, flags);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaUserObjectRetain(cudaUserObject_t object, unsigned int count __dv(1)) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaUserObject_t, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaUserObjectRetain");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(object, count);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaUserObjectRelease(cudaUserObject_t object, unsigned int count __dv(1)) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaUserObject_t, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaUserObjectRelease");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(object, count);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphRetainUserObject(
-    cudaGraph_t graph, cudaUserObject_t object, unsigned int count __dv(1),
-    unsigned int flags __dv(0)) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraph_t, cudaUserObject_t,
-                                           unsigned int, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphRetainUserObject");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(graph, object, count, flags);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphReleaseUserObject(
-    cudaGraph_t graph, cudaUserObject_t object, unsigned int count __dv(1)) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaGraph_t, cudaUserObject_t, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphReleaseUserObject");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(graph, object, count);
-}
-
-#endif  // CUDA_VERSION >= 11030
-
-extern __host__ cudaError_t CUDARTAPI cudaGetDriverEntryPoint(
-    const char *symbol, void **funcPtr, unsigned long long flags) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(const char *, void **, unsigned long long);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGetDriverEntryPoint");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(symbol, funcPtr, flags);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGetExportTable(
-    const void **ppExportTable, const cudaUUID_t *pExportTableId) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(const void **, const cudaUUID_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGetExportTable");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(ppExportTable, pExportTableId);
-}
-
-#if CUDA_VERSION >= 11020
-
-extern __host__ cudaError_t CUDARTAPI_CDECL
-cudaGetFuncBySymbol(cudaFunction_t *functionPtr, const void *symbolPtr) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaFunction_t *, const void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGetFuncBySymbol");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(functionPtr, symbolPtr);
-}
-
-#endif  // CUDA_VERSION >= 11020
-
-}  // extern "C"
diff --git a/third_party/xla/third_party/tsl/tsl/cuda/cuda_runtime_11_2.inc b/third_party/xla/third_party/tsl/tsl/cuda/cuda_runtime_11_2.inc
deleted file mode 100644
index 5c0ba7fe6a4e39..00000000000000
--- a/third_party/xla/third_party/tsl/tsl/cuda/cuda_runtime_11_2.inc
+++ /dev/null
@@ -1,2259 +0,0 @@
-// Auto-generated, do not edit.
-
-extern "C" {
-extern __host__ cudaError_t CUDARTAPI cudaDeviceReset(void) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)();
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaDeviceReset");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr();
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaDeviceSynchronize(void) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)();
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaDeviceSynchronize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr();
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaDeviceSetLimit(enum cudaLimit limit,
-                                                         size_t value) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(enum cudaLimit, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaDeviceSetLimit");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(limit, value);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaDeviceGetLimit(size_t *pValue, enum cudaLimit limit) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(size_t *, enum cudaLimit);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaDeviceGetLimit");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pValue, limit);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaDeviceGetCacheConfig(enum cudaFuncCache *pCacheConfig) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(enum cudaFuncCache *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaDeviceGetCacheConfig");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pCacheConfig);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaDeviceGetStreamPriorityRange(int *leastPriority, int *greatestPriority) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(int *, int *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaDeviceGetStreamPriorityRange");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(leastPriority, greatestPriority);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaDeviceSetCacheConfig(enum cudaFuncCache cacheConfig) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(enum cudaFuncCache);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaDeviceSetCacheConfig");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(cacheConfig);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaDeviceGetSharedMemConfig(enum cudaSharedMemConfig *pConfig) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(enum cudaSharedMemConfig *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaDeviceGetSharedMemConfig");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pConfig);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaDeviceSetSharedMemConfig(enum cudaSharedMemConfig config) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(enum cudaSharedMemConfig);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaDeviceSetSharedMemConfig");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(config);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaDeviceGetByPCIBusId(int *device, const char *pciBusId) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(int *, const char *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaDeviceGetByPCIBusId");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(device, pciBusId);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaDeviceGetPCIBusId(char *pciBusId,
-                                                            int len,
-                                                            int device) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(char *, int, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaDeviceGetPCIBusId");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pciBusId, len, device);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaIpcGetEventHandle(cudaIpcEventHandle_t *handle, cudaEvent_t event) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaIpcEventHandle_t *, cudaEvent_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaIpcGetEventHandle");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, event);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaIpcOpenEventHandle(cudaEvent_t *event, cudaIpcEventHandle_t handle) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaEvent_t *, cudaIpcEventHandle_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaIpcOpenEventHandle");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(event, handle);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaIpcGetMemHandle(cudaIpcMemHandle_t *handle, void *devPtr) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaIpcMemHandle_t *, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaIpcGetMemHandle");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, devPtr);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaIpcOpenMemHandle(
-    void **devPtr, cudaIpcMemHandle_t handle, unsigned int flags) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(void **, cudaIpcMemHandle_t, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaIpcOpenMemHandle");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(devPtr, handle, flags);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaIpcCloseMemHandle(void *devPtr) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaIpcCloseMemHandle");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(devPtr);
-}
-
-extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaThreadExit(void) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)();
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaThreadExit");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr();
-}
-
-extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI
-cudaThreadSynchronize(void) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)();
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaThreadSynchronize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr();
-}
-
-extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI
-cudaThreadSetLimit(enum cudaLimit limit, size_t value) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(enum cudaLimit, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaThreadSetLimit");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(limit, value);
-}
-
-extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI
-cudaThreadGetLimit(size_t *pValue, enum cudaLimit limit) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(size_t *, enum cudaLimit);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaThreadGetLimit");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pValue, limit);
-}
-
-extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI
-cudaThreadGetCacheConfig(enum cudaFuncCache *pCacheConfig) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(enum cudaFuncCache *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaThreadGetCacheConfig");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pCacheConfig);
-}
-
-extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI
-cudaThreadSetCacheConfig(enum cudaFuncCache cacheConfig) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(enum cudaFuncCache);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaThreadSetCacheConfig");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(cacheConfig);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaGetLastError(void) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)();
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGetLastError");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr();
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaPeekAtLastError(void) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)();
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaPeekAtLastError");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr();
-}
-
-extern __host__ __cudart_builtin__ const char *CUDARTAPI
-cudaGetErrorName(cudaError_t error) {
-  using FuncPtr = const char *(CUDARTAPI *)(cudaError_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGetErrorName");
-  if (!func_ptr) return "cudaGetErrorName symbol not found.";
-  return func_ptr(error);
-}
-
-extern __host__ __cudart_builtin__ const char *CUDARTAPI
-cudaGetErrorString(cudaError_t error) {
-  using FuncPtr = const char *(CUDARTAPI *)(cudaError_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGetErrorString");
-  if (!func_ptr) return "cudaGetErrorString symbol not found.";
-  return func_ptr(error);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaGetDeviceCount(int *count) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGetDeviceCount");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(count);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaGetDeviceProperties(struct cudaDeviceProp *prop, int device) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(struct cudaDeviceProp *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGetDeviceProperties");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(prop, device);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaDeviceGetAttribute(int *value, enum cudaDeviceAttr attr, int device) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(int *, enum cudaDeviceAttr, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaDeviceGetAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(value, attr, device);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaDeviceGetDefaultMemPool(cudaMemPool_t *memPool, int device) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaMemPool_t *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaDeviceGetDefaultMemPool");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(memPool, device);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaDeviceSetMemPool(int device, cudaMemPool_t memPool) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(int, cudaMemPool_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaDeviceSetMemPool");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(device, memPool);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaDeviceGetMemPool(cudaMemPool_t *memPool, int device) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaMemPool_t *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaDeviceGetMemPool");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(memPool, device);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaDeviceGetNvSciSyncAttributes(
-    void *nvSciSyncAttrList, int device, int flags) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void *, int, int);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaDeviceGetNvSciSyncAttributes");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(nvSciSyncAttrList, device, flags);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaDeviceGetP2PAttribute(int *value, enum cudaDeviceP2PAttr attr,
-                          int srcDevice, int dstDevice) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(int *, enum cudaDeviceP2PAttr, int, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaDeviceGetP2PAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(value, attr, srcDevice, dstDevice);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaChooseDevice(int *device, const struct cudaDeviceProp *prop) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(int *, const struct cudaDeviceProp *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaChooseDevice");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(device, prop);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaSetDevice(int device) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaSetDevice");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(device);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaGetDevice(int *device) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGetDevice");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(device);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaSetValidDevices(int *device_arr,
-                                                          int len) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(int *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaSetValidDevices");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(device_arr, len);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaSetDeviceFlags(unsigned int flags) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaSetDeviceFlags");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(flags);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGetDeviceFlags(unsigned int *flags) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(unsigned int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGetDeviceFlags");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(flags);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaStreamCreate(cudaStream_t *pStream) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaStream_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaStreamCreate");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pStream);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaStreamCreateWithFlags(cudaStream_t *pStream, unsigned int flags) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaStream_t *, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaStreamCreateWithFlags");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pStream, flags);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaStreamCreateWithPriority(cudaStream_t *pStream, unsigned int flags,
-                             int priority) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaStream_t *, unsigned int, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaStreamCreateWithPriority");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pStream, flags, priority);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaStreamGetPriority(cudaStream_t hStream, int *priority) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaStream_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaStreamGetPriority");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hStream, priority);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaStreamGetFlags(cudaStream_t hStream, unsigned int *flags) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaStream_t, unsigned int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaStreamGetFlags");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hStream, flags);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaCtxResetPersistingL2Cache(void) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)();
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaCtxResetPersistingL2Cache");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr();
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaStreamCopyAttributes(cudaStream_t dst, cudaStream_t src) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaStream_t, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaStreamCopyAttributes");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dst, src);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaStreamGetAttribute(cudaStream_t hStream, enum cudaStreamAttrID attr,
-                       union cudaStreamAttrValue *value_out) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaStream_t, enum cudaStreamAttrID,
-                                           union cudaStreamAttrValue *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaStreamGetAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hStream, attr, value_out);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaStreamSetAttribute(cudaStream_t hStream, enum cudaStreamAttrID attr,
-                       const union cudaStreamAttrValue *value) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaStream_t, enum cudaStreamAttrID,
-                                           const union cudaStreamAttrValue *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaStreamSetAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hStream, attr, value);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaStreamDestroy(cudaStream_t stream) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaStreamDestroy");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(stream);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamWaitEvent(
-    cudaStream_t stream, cudaEvent_t event, unsigned int flags __dv(0)) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaStream_t, cudaEvent_t, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaStreamWaitEvent");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(stream, event, flags);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaStreamAddCallback(cudaStream_t stream, cudaStreamCallback_t callback,
-                      void *userData, unsigned int flags) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaStream_t, cudaStreamCallback_t,
-                                           void *, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaStreamAddCallback");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(stream, callback, userData, flags);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaStreamSynchronize(cudaStream_t stream) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaStreamSynchronize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(stream);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaStreamQuery(cudaStream_t stream) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaStreamQuery");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(stream);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaStreamAttachMemAsync(cudaStream_t stream, void *devPtr,
-                         size_t length __dv(0), unsigned int flags) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaStream_t, void *, size_t, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaStreamAttachMemAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(stream, devPtr, length, flags);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaStreamBeginCapture(cudaStream_t stream, enum cudaStreamCaptureMode mode) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaStream_t, enum cudaStreamCaptureMode);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaStreamBeginCapture");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(stream, mode);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaThreadExchangeStreamCaptureMode(enum cudaStreamCaptureMode *mode) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(enum cudaStreamCaptureMode *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaThreadExchangeStreamCaptureMode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(mode);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaStreamEndCapture(cudaStream_t stream, cudaGraph_t *pGraph) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaStream_t, cudaGraph_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaStreamEndCapture");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(stream, pGraph);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaStreamIsCapturing(
-    cudaStream_t stream, enum cudaStreamCaptureStatus *pCaptureStatus) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaStream_t, enum cudaStreamCaptureStatus *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaStreamIsCapturing");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(stream, pCaptureStatus);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaStreamGetCaptureInfo(
-    cudaStream_t stream, enum cudaStreamCaptureStatus *pCaptureStatus,
-    unsigned long long *pId) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(
-      cudaStream_t, enum cudaStreamCaptureStatus *, unsigned long long *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaStreamGetCaptureInfo");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(stream, pCaptureStatus, pId);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaStreamGetCaptureInfo_v2(
-    cudaStream_t stream, enum cudaStreamCaptureStatus *captureStatus_out,
-    unsigned long long *id_out __dv(0), cudaGraph_t *graph_out __dv(0),
-    const cudaGraphNode_t **dependencies_out __dv(0),
-    size_t *numDependencies_out __dv(0)) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(
-      cudaStream_t, enum cudaStreamCaptureStatus *, unsigned long long *,
-      cudaGraph_t *, const cudaGraphNode_t **, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaStreamGetCaptureInfo_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(stream, captureStatus_out, id_out, graph_out,
-                  dependencies_out, numDependencies_out);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaStreamUpdateCaptureDependencies(
-    cudaStream_t stream, cudaGraphNode_t *dependencies, size_t numDependencies,
-    unsigned int flags __dv(0)) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaStream_t, cudaGraphNode_t *,
-                                           size_t, unsigned int);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaStreamUpdateCaptureDependencies");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(stream, dependencies, numDependencies, flags);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaEventCreate(cudaEvent_t *event) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaEvent_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaEventCreate");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(event);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaEventCreateWithFlags(cudaEvent_t *event, unsigned int flags) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaEvent_t *, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaEventCreateWithFlags");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(event, flags);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaEventRecord(cudaEvent_t event, cudaStream_t stream __dv(0)) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaEvent_t, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaEventRecord");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(event, stream);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaEventQuery(cudaEvent_t event) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaEvent_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaEventQuery");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(event);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaEventSynchronize(cudaEvent_t event) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaEvent_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaEventSynchronize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(event);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaEventDestroy(cudaEvent_t event) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaEvent_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaEventDestroy");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(event);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaEventElapsedTime(float *ms,
-                                                           cudaEvent_t start,
-                                                           cudaEvent_t end) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(float *, cudaEvent_t, cudaEvent_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaEventElapsedTime");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(ms, start, end);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaImportExternalMemory(
-    cudaExternalMemory_t *extMem_out,
-    const struct cudaExternalMemoryHandleDesc *memHandleDesc) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(
-      cudaExternalMemory_t *, const struct cudaExternalMemoryHandleDesc *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaImportExternalMemory");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(extMem_out, memHandleDesc);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaExternalMemoryGetMappedBuffer(
-    void **devPtr, cudaExternalMemory_t extMem,
-    const struct cudaExternalMemoryBufferDesc *bufferDesc) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(void **, cudaExternalMemory_t,
-                               const struct cudaExternalMemoryBufferDesc *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaExternalMemoryGetMappedBuffer");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(devPtr, extMem, bufferDesc);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaExternalMemoryGetMappedMipmappedArray(
-    cudaMipmappedArray_t *mipmap, cudaExternalMemory_t extMem,
-    const struct cudaExternalMemoryMipmappedArrayDesc *mipmapDesc) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(
-      cudaMipmappedArray_t *, cudaExternalMemory_t,
-      const struct cudaExternalMemoryMipmappedArrayDesc *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaExternalMemoryGetMappedMipmappedArray");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(mipmap, extMem, mipmapDesc);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaDestroyExternalMemory(cudaExternalMemory_t extMem) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaExternalMemory_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaDestroyExternalMemory");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(extMem);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaImportExternalSemaphore(
-    cudaExternalSemaphore_t *extSem_out,
-    const struct cudaExternalSemaphoreHandleDesc *semHandleDesc) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaExternalSemaphore_t *,
-                               const struct cudaExternalSemaphoreHandleDesc *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaImportExternalSemaphore");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(extSem_out, semHandleDesc);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaSignalExternalSemaphoresAsync(
-    const cudaExternalSemaphore_t *extSemArray,
-    const struct cudaExternalSemaphoreSignalParams *paramsArray,
-    unsigned int numExtSems, cudaStream_t stream __dv(0)) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(const cudaExternalSemaphore_t *,
-                               const struct cudaExternalSemaphoreSignalParams *,
-                               unsigned int, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>(
-      "__CUDART_API_PTSZ(cudaSignalExternalSemaphoresAsync_v2)");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(extSemArray, paramsArray, numExtSems, stream);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaWaitExternalSemaphoresAsync(
-    const cudaExternalSemaphore_t *extSemArray,
-    const struct cudaExternalSemaphoreWaitParams *paramsArray,
-    unsigned int numExtSems, cudaStream_t stream __dv(0)) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(const cudaExternalSemaphore_t *,
-                               const struct cudaExternalSemaphoreWaitParams *,
-                               unsigned int, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>(
-      "__CUDART_API_PTSZ(cudaWaitExternalSemaphoresAsync_v2)");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(extSemArray, paramsArray, numExtSems, stream);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaDestroyExternalSemaphore(cudaExternalSemaphore_t extSem) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaExternalSemaphore_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaDestroyExternalSemaphore");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(extSem);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim, void **args,
-                 size_t sharedMem, cudaStream_t stream) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(const void *, dim3, dim3, void **,
-                                           size_t, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaLaunchKernel");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(func, gridDim, blockDim, args, sharedMem, stream);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaLaunchCooperativeKernel(
-    const void *func, dim3 gridDim, dim3 blockDim, void **args,
-    size_t sharedMem, cudaStream_t stream) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(const void *, dim3, dim3, void **,
-                                           size_t, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaLaunchCooperativeKernel");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(func, gridDim, blockDim, args, sharedMem, stream);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaLaunchCooperativeKernelMultiDevice(
-    struct cudaLaunchParams *launchParamsList, unsigned int numDevices,
-    unsigned int flags __dv(0)) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(struct cudaLaunchParams *,
-                                           unsigned int, unsigned int);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaLaunchCooperativeKernelMultiDevice");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(launchParamsList, numDevices, flags);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaFuncSetCacheConfig(const void *func, enum cudaFuncCache cacheConfig) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(const void *, enum cudaFuncCache);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaFuncSetCacheConfig");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(func, cacheConfig);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaFuncSetSharedMemConfig(const void *func, enum cudaSharedMemConfig config) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(const void *, enum cudaSharedMemConfig);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaFuncSetSharedMemConfig");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(func, config);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaFuncGetAttributes(struct cudaFuncAttributes *attr, const void *func) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(struct cudaFuncAttributes *, const void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaFuncGetAttributes");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(attr, func);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaFuncSetAttribute(const void *func, enum cudaFuncAttribute attr, int value) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(const void *, enum cudaFuncAttribute, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaFuncSetAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(func, attr, value);
-}
-
-extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI
-cudaSetDoubleForDevice(double *d) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(double *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaSetDoubleForDevice");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(d);
-}
-
-extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI
-cudaSetDoubleForHost(double *d) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(double *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaSetDoubleForHost");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(d);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaLaunchHostFunc(cudaStream_t stream,
-                                                         cudaHostFn_t fn,
-                                                         void *userData) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaStream_t, cudaHostFn_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaLaunchHostFunc");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(stream, fn, userData);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaOccupancyMaxActiveBlocksPerMultiprocessor(int *numBlocks, const void *func,
-                                              int blockSize,
-                                              size_t dynamicSMemSize) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(int *, const void *, int, size_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaOccupancyMaxActiveBlocksPerMultiprocessor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(numBlocks, func, blockSize, dynamicSMemSize);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaOccupancyAvailableDynamicSMemPerBlock(size_t *dynamicSmemSize,
-                                          const void *func, int numBlocks,
-                                          int blockSize) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(size_t *, const void *, int, int);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaOccupancyAvailableDynamicSMemPerBlock");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dynamicSmemSize, func, numBlocks, blockSize);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(int *numBlocks,
-                                                       const void *func,
-                                                       int blockSize,
-                                                       size_t dynamicSMemSize,
-                                                       unsigned int flags) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(int *, const void *, int, size_t, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>(
-      "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(numBlocks, func, blockSize, dynamicSMemSize, flags);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaMallocManaged(void **devPtr, size_t size, unsigned int flags) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void **, size_t, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMallocManaged");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(devPtr, size, flags);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaMalloc(void **devPtr, size_t size) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void **, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMalloc");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(devPtr, size);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMallocHost(void **ptr, size_t size) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void **, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMallocHost");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(ptr, size);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMallocPitch(void **devPtr,
-                                                      size_t *pitch,
-                                                      size_t width,
-                                                      size_t height) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void **, size_t *, size_t, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMallocPitch");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(devPtr, pitch, width, height);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMallocArray(
-    cudaArray_t *array, const struct cudaChannelFormatDesc *desc, size_t width,
-    size_t height __dv(0), unsigned int flags __dv(0)) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaArray_t *,
-                                           const struct cudaChannelFormatDesc *,
-                                           size_t, size_t, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMallocArray");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(array, desc, width, height, flags);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaFree(void *devPtr) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaFree");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(devPtr);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaFreeHost(void *ptr) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaFreeHost");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(ptr);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaFreeArray(cudaArray_t array) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaArray_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaFreeArray");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(array);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaFreeMipmappedArray(cudaMipmappedArray_t mipmappedArray) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaMipmappedArray_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaFreeMipmappedArray");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(mipmappedArray);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaHostAlloc(void **pHost, size_t size,
-                                                    unsigned int flags) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void **, size_t, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaHostAlloc");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pHost, size, flags);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaHostRegister(void *ptr, size_t size,
-                                                       unsigned int flags) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void *, size_t, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaHostRegister");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(ptr, size, flags);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaHostUnregister(void *ptr) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaHostUnregister");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(ptr);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaHostGetDevicePointer(void **pDevice, void *pHost, unsigned int flags) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void **, void *, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaHostGetDevicePointer");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pDevice, pHost, flags);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaHostGetFlags(unsigned int *pFlags,
-                                                       void *pHost) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(unsigned int *, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaHostGetFlags");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pFlags, pHost);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaMalloc3D(struct cudaPitchedPtr *pitchedDevPtr, struct cudaExtent extent) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(struct cudaPitchedPtr *, struct cudaExtent);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMalloc3D");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pitchedDevPtr, extent);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaMalloc3DArray(cudaArray_t *array, const struct cudaChannelFormatDesc *desc,
-                  struct cudaExtent extent, unsigned int flags __dv(0)) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaArray_t *,
-                                           const struct cudaChannelFormatDesc *,
-                                           struct cudaExtent, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMalloc3DArray");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(array, desc, extent, flags);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMallocMipmappedArray(
-    cudaMipmappedArray_t *mipmappedArray,
-    const struct cudaChannelFormatDesc *desc, struct cudaExtent extent,
-    unsigned int numLevels, unsigned int flags __dv(0)) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(
-      cudaMipmappedArray_t *, const struct cudaChannelFormatDesc *,
-      struct cudaExtent, unsigned int, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMallocMipmappedArray");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(mipmappedArray, desc, extent, numLevels, flags);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGetMipmappedArrayLevel(
-    cudaArray_t *levelArray, cudaMipmappedArray_const_t mipmappedArray,
-    unsigned int level) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(
-      cudaArray_t *, cudaMipmappedArray_const_t, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGetMipmappedArrayLevel");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(levelArray, mipmappedArray, level);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaMemcpy3D(const struct cudaMemcpy3DParms *p) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(const struct cudaMemcpy3DParms *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpy3D");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(p);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaMemcpy3DPeer(const struct cudaMemcpy3DPeerParms *p) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(const struct cudaMemcpy3DPeerParms *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpy3DPeer");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(p);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpy3DAsync(
-    const struct cudaMemcpy3DParms *p, cudaStream_t stream __dv(0)) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(const struct cudaMemcpy3DParms *, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpy3DAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(p, stream);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMemcpy3DPeerAsync(
-    const struct cudaMemcpy3DPeerParms *p, cudaStream_t stream __dv(0)) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(const struct cudaMemcpy3DPeerParms *,
-                                           cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpy3DPeerAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(p, stream);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMemGetInfo(size_t *free,
-                                                     size_t *total) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(size_t *, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemGetInfo");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(free, total);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaArrayGetInfo(struct cudaChannelFormatDesc *desc, struct cudaExtent *extent,
-                 unsigned int *flags, cudaArray_t array) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(struct cudaChannelFormatDesc *,
-                                           struct cudaExtent *, unsigned int *,
-                                           cudaArray_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaArrayGetInfo");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(desc, extent, flags, array);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaArrayGetPlane(
-    cudaArray_t *pPlaneArray, cudaArray_t hArray, unsigned int planeIdx) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaArray_t *, cudaArray_t, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaArrayGetPlane");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pPlaneArray, hArray, planeIdx);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMemcpy(void *dst, const void *src,
-                                                 size_t count,
-                                                 enum cudaMemcpyKind kind) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void *, const void *, size_t,
-                                           enum cudaMemcpyKind);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpy");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dst, src, count, kind);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMemcpyPeer(void *dst, int dstDevice,
-                                                     const void *src,
-                                                     int srcDevice,
-                                                     size_t count) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(void *, int, const void *, int, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpyPeer");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dst, dstDevice, src, srcDevice, count);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMemcpy2D(void *dst, size_t dpitch,
-                                                   const void *src,
-                                                   size_t spitch, size_t width,
-                                                   size_t height,
-                                                   enum cudaMemcpyKind kind) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void *, size_t, const void *, size_t,
-                                           size_t, size_t, enum cudaMemcpyKind);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpy2D");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dst, dpitch, src, spitch, width, height, kind);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMemcpy2DToArray(
-    cudaArray_t dst, size_t wOffset, size_t hOffset, const void *src,
-    size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaArray_t, size_t, size_t, const void *,
-                               size_t, size_t, size_t, enum cudaMemcpyKind);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpy2DToArray");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dst, wOffset, hOffset, src, spitch, width, height, kind);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMemcpy2DFromArray(
-    void *dst, size_t dpitch, cudaArray_const_t src, size_t wOffset,
-    size_t hOffset, size_t width, size_t height, enum cudaMemcpyKind kind) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(void *, size_t, cudaArray_const_t, size_t,
-                               size_t, size_t, size_t, enum cudaMemcpyKind);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpy2DFromArray");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dst, dpitch, src, wOffset, hOffset, width, height, kind);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMemcpy2DArrayToArray(
-    cudaArray_t dst, size_t wOffsetDst, size_t hOffsetDst,
-    cudaArray_const_t src, size_t wOffsetSrc, size_t hOffsetSrc, size_t width,
-    size_t height, enum cudaMemcpyKind kind __dv(cudaMemcpyDeviceToDevice)) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaArray_t, size_t, size_t,
-                                           cudaArray_const_t, size_t, size_t,
-                                           size_t, size_t, enum cudaMemcpyKind);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpy2DArrayToArray");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dst, wOffsetDst, hOffsetDst, src, wOffsetSrc, hOffsetSrc,
-                  width, height, kind);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMemcpyToSymbol(
-    const void *symbol, const void *src, size_t count, size_t offset __dv(0),
-    enum cudaMemcpyKind kind __dv(cudaMemcpyHostToDevice)) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(const void *, const void *, size_t,
-                                           size_t, enum cudaMemcpyKind);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpyToSymbol");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(symbol, src, count, offset, kind);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMemcpyFromSymbol(
-    void *dst, const void *symbol, size_t count, size_t offset __dv(0),
-    enum cudaMemcpyKind kind __dv(cudaMemcpyDeviceToHost)) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void *, const void *, size_t, size_t,
-                                           enum cudaMemcpyKind);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpyFromSymbol");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dst, symbol, count, offset, kind);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaMemcpyAsync(void *dst, const void *src, size_t count,
-                enum cudaMemcpyKind kind, cudaStream_t stream __dv(0)) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void *, const void *, size_t,
-                                           enum cudaMemcpyKind, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpyAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dst, src, count, kind, stream);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaMemcpyPeerAsync(void *dst, int dstDevice, const void *src, int srcDevice,
-                    size_t count, cudaStream_t stream __dv(0)) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void *, int, const void *, int,
-                                           size_t, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpyPeerAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dst, dstDevice, src, srcDevice, count, stream);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpy2DAsync(
-    void *dst, size_t dpitch, const void *src, size_t spitch, size_t width,
-    size_t height, enum cudaMemcpyKind kind, cudaStream_t stream __dv(0)) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(void *, size_t, const void *, size_t, size_t,
-                               size_t, enum cudaMemcpyKind, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpy2DAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dst, dpitch, src, spitch, width, height, kind, stream);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMemcpy2DToArrayAsync(
-    cudaArray_t dst, size_t wOffset, size_t hOffset, const void *src,
-    size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind,
-    cudaStream_t stream __dv(0)) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaArray_t, size_t, size_t,
-                                           const void *, size_t, size_t, size_t,
-                                           enum cudaMemcpyKind, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpy2DToArrayAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dst, wOffset, hOffset, src, spitch, width, height, kind,
-                  stream);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMemcpy2DFromArrayAsync(
-    void *dst, size_t dpitch, cudaArray_const_t src, size_t wOffset,
-    size_t hOffset, size_t width, size_t height, enum cudaMemcpyKind kind,
-    cudaStream_t stream __dv(0)) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void *, size_t, cudaArray_const_t,
-                                           size_t, size_t, size_t, size_t,
-                                           enum cudaMemcpyKind, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpy2DFromArrayAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dst, dpitch, src, wOffset, hOffset, width, height, kind,
-                  stream);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMemcpyToSymbolAsync(
-    const void *symbol, const void *src, size_t count, size_t offset,
-    enum cudaMemcpyKind kind, cudaStream_t stream __dv(0)) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(const void *, const void *, size_t, size_t,
-                               enum cudaMemcpyKind, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpyToSymbolAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(symbol, src, count, offset, kind, stream);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMemcpyFromSymbolAsync(
-    void *dst, const void *symbol, size_t count, size_t offset,
-    enum cudaMemcpyKind kind, cudaStream_t stream __dv(0)) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void *, const void *, size_t, size_t,
-                                           enum cudaMemcpyKind, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpyFromSymbolAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dst, symbol, count, offset, kind, stream);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMemset(void *devPtr, int value,
-                                                 size_t count) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void *, int, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemset");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(devPtr, value, count);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMemset2D(void *devPtr, size_t pitch,
-                                                   int value, size_t width,
-                                                   size_t height) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void *, size_t, int, size_t, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemset2D");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(devPtr, pitch, value, width, height);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMemset3D(
-    struct cudaPitchedPtr pitchedDevPtr, int value, struct cudaExtent extent) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(struct cudaPitchedPtr, int, struct cudaExtent);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemset3D");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pitchedDevPtr, value, extent);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemsetAsync(
-    void *devPtr, int value, size_t count, cudaStream_t stream __dv(0)) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void *, int, size_t, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemsetAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(devPtr, value, count, stream);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaMemset2DAsync(void *devPtr, size_t pitch, int value, size_t width,
-                  size_t height, cudaStream_t stream __dv(0)) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void *, size_t, int, size_t, size_t,
-                                           cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemset2DAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(devPtr, pitch, value, width, height, stream);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaMemset3DAsync(struct cudaPitchedPtr pitchedDevPtr, int value,
-                  struct cudaExtent extent, cudaStream_t stream __dv(0)) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(struct cudaPitchedPtr, int,
-                                           struct cudaExtent, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemset3DAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pitchedDevPtr, value, extent, stream);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGetSymbolAddress(void **devPtr,
-                                                           const void *symbol) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void **, const void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGetSymbolAddress");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(devPtr, symbol);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGetSymbolSize(size_t *size,
-                                                        const void *symbol) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(size_t *, const void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGetSymbolSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(size, symbol);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaMemPrefetchAsync(const void *devPtr, size_t count, int dstDevice,
-                     cudaStream_t stream __dv(0)) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(const void *, size_t, int, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemPrefetchAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(devPtr, count, dstDevice, stream);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaMemAdvise(const void *devPtr, size_t count, enum cudaMemoryAdvise advice,
-              int device) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(const void *, size_t,
-                                           enum cudaMemoryAdvise, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemAdvise");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(devPtr, count, advice, device);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMemRangeGetAttribute(
-    void *data, size_t dataSize, enum cudaMemRangeAttribute attribute,
-    const void *devPtr, size_t count) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(
-      void *, size_t, enum cudaMemRangeAttribute, const void *, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemRangeGetAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(data, dataSize, attribute, devPtr, count);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMemRangeGetAttributes(
-    void **data, size_t *dataSizes, enum cudaMemRangeAttribute *attributes,
-    size_t numAttributes, const void *devPtr, size_t count) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(void **, size_t *, enum cudaMemRangeAttribute *,
-                               size_t, const void *, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemRangeGetAttributes");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(data, dataSizes, attributes, numAttributes, devPtr, count);
-}
-
-extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI
-cudaMemcpyToArray(cudaArray_t dst, size_t wOffset, size_t hOffset,
-                  const void *src, size_t count, enum cudaMemcpyKind kind) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(
-      cudaArray_t, size_t, size_t, const void *, size_t, enum cudaMemcpyKind);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpyToArray");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dst, wOffset, hOffset, src, count, kind);
-}
-
-extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI
-cudaMemcpyFromArray(void *dst, cudaArray_const_t src, size_t wOffset,
-                    size_t hOffset, size_t count, enum cudaMemcpyKind kind) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void *, cudaArray_const_t, size_t,
-                                           size_t, size_t, enum cudaMemcpyKind);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpyFromArray");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dst, src, wOffset, hOffset, count, kind);
-}
-
-extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaMemcpyArrayToArray(
-    cudaArray_t dst, size_t wOffsetDst, size_t hOffsetDst,
-    cudaArray_const_t src, size_t wOffsetSrc, size_t hOffsetSrc, size_t count,
-    enum cudaMemcpyKind kind __dv(cudaMemcpyDeviceToDevice)) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaArray_t, size_t, size_t, cudaArray_const_t,
-                               size_t, size_t, size_t, enum cudaMemcpyKind);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpyArrayToArray");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dst, wOffsetDst, hOffsetDst, src, wOffsetSrc, hOffsetSrc,
-                  count, kind);
-}
-
-extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaMemcpyToArrayAsync(
-    cudaArray_t dst, size_t wOffset, size_t hOffset, const void *src,
-    size_t count, enum cudaMemcpyKind kind, cudaStream_t stream __dv(0)) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaArray_t, size_t, size_t, const void *,
-                               size_t, enum cudaMemcpyKind, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpyToArrayAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dst, wOffset, hOffset, src, count, kind, stream);
-}
-
-extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI
-cudaMemcpyFromArrayAsync(void *dst, cudaArray_const_t src, size_t wOffset,
-                         size_t hOffset, size_t count, enum cudaMemcpyKind kind,
-                         cudaStream_t stream __dv(0)) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(void *, cudaArray_const_t, size_t, size_t,
-                               size_t, enum cudaMemcpyKind, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpyFromArrayAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dst, src, wOffset, hOffset, count, kind, stream);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMallocAsync(void **devPtr,
-                                                      size_t size,
-                                                      cudaStream_t hStream) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void **, size_t, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMallocAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(devPtr, size, hStream);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaFreeAsync(void *devPtr,
-                                                    cudaStream_t hStream) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void *, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaFreeAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(devPtr, hStream);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMemPoolTrimTo(cudaMemPool_t memPool,
-                                                        size_t minBytesToKeep) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaMemPool_t, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemPoolTrimTo");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(memPool, minBytesToKeep);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMemPoolSetAttribute(
-    cudaMemPool_t memPool, enum cudaMemPoolAttr attr, void *value) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaMemPool_t, enum cudaMemPoolAttr, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemPoolSetAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(memPool, attr, value);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMemPoolGetAttribute(
-    cudaMemPool_t memPool, enum cudaMemPoolAttr attr, void *value) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaMemPool_t, enum cudaMemPoolAttr, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemPoolGetAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(memPool, attr, value);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaMemPoolSetAccess(cudaMemPool_t memPool,
-                     const struct cudaMemAccessDesc *descList, size_t count) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(
-      cudaMemPool_t, const struct cudaMemAccessDesc *, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemPoolSetAccess");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(memPool, descList, count);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaMemPoolGetAccess(enum cudaMemAccessFlags *flags, cudaMemPool_t memPool,
-                     struct cudaMemLocation *location) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(
-      enum cudaMemAccessFlags *, cudaMemPool_t, struct cudaMemLocation *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemPoolGetAccess");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(flags, memPool, location);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMemPoolCreate(
-    cudaMemPool_t *memPool, const struct cudaMemPoolProps *poolProps) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaMemPool_t *,
-                                           const struct cudaMemPoolProps *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemPoolCreate");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(memPool, poolProps);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaMemPoolDestroy(cudaMemPool_t memPool) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaMemPool_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemPoolDestroy");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(memPool);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMallocFromPoolAsync(
-    void **ptr, size_t size, cudaMemPool_t memPool, cudaStream_t stream) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(void **, size_t, cudaMemPool_t, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMallocFromPoolAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(ptr, size, memPool, stream);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMemPoolExportToShareableHandle(
-    void *shareableHandle, cudaMemPool_t memPool,
-    enum cudaMemAllocationHandleType handleType, unsigned int flags) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(
-      void *, cudaMemPool_t, enum cudaMemAllocationHandleType, unsigned int);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaMemPoolExportToShareableHandle");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(shareableHandle, memPool, handleType, flags);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMemPoolImportFromShareableHandle(
-    cudaMemPool_t *memPool, void *shareableHandle,
-    enum cudaMemAllocationHandleType handleType, unsigned int flags) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(
-      cudaMemPool_t *, void *, enum cudaMemAllocationHandleType, unsigned int);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaMemPoolImportFromShareableHandle");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(memPool, shareableHandle, handleType, flags);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMemPoolExportPointer(
-    struct cudaMemPoolPtrExportData *exportData, void *ptr) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(struct cudaMemPoolPtrExportData *, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemPoolExportPointer");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(exportData, ptr);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaMemPoolImportPointer(void **ptr, cudaMemPool_t memPool,
-                         struct cudaMemPoolPtrExportData *exportData) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void **, cudaMemPool_t,
-                                           struct cudaMemPoolPtrExportData *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemPoolImportPointer");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(ptr, memPool, exportData);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaPointerGetAttributes(
-    struct cudaPointerAttributes *attributes, const void *ptr) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(struct cudaPointerAttributes *, const void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaPointerGetAttributes");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(attributes, ptr);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaDeviceCanAccessPeer(int *canAccessPeer, int device, int peerDevice) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(int *, int, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaDeviceCanAccessPeer");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(canAccessPeer, device, peerDevice);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaDeviceEnablePeerAccess(int peerDevice, unsigned int flags) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(int, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaDeviceEnablePeerAccess");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(peerDevice, flags);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaDeviceDisablePeerAccess(int peerDevice) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaDeviceDisablePeerAccess");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(peerDevice);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaGraphicsUnregisterResource(cudaGraphicsResource_t resource) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraphicsResource_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphicsUnregisterResource");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(resource);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphicsResourceSetMapFlags(
-    cudaGraphicsResource_t resource, unsigned int flags) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaGraphicsResource_t, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphicsResourceSetMapFlags");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(resource, flags);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphicsMapResources(
-    int count, cudaGraphicsResource_t *resources, cudaStream_t stream __dv(0)) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(int, cudaGraphicsResource_t *, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphicsMapResources");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(count, resources, stream);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphicsUnmapResources(
-    int count, cudaGraphicsResource_t *resources, cudaStream_t stream __dv(0)) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(int, cudaGraphicsResource_t *, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphicsUnmapResources");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(count, resources, stream);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphicsResourceGetMappedPointer(
-    void **devPtr, size_t *size, cudaGraphicsResource_t resource) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(void **, size_t *, cudaGraphicsResource_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaGraphicsResourceGetMappedPointer");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(devPtr, size, resource);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphicsSubResourceGetMappedArray(
-    cudaArray_t *array, cudaGraphicsResource_t resource,
-    unsigned int arrayIndex, unsigned int mipLevel) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(
-      cudaArray_t *, cudaGraphicsResource_t, unsigned int, unsigned int);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaGraphicsSubResourceGetMappedArray");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(array, resource, arrayIndex, mipLevel);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaGraphicsResourceGetMappedMipmappedArray(
-    cudaMipmappedArray_t *mipmappedArray, cudaGraphicsResource_t resource) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaMipmappedArray_t *, cudaGraphicsResource_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaGraphicsResourceGetMappedMipmappedArray");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(mipmappedArray, resource);
-}
-
-extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaBindTexture(
-    size_t *offset, const struct textureReference *texref, const void *devPtr,
-    const struct cudaChannelFormatDesc *desc, size_t size __dv(UINT_MAX)) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(
-      size_t *, const struct textureReference *, const void *,
-      const struct cudaChannelFormatDesc *, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaBindTexture");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(offset, texref, devPtr, desc, size);
-}
-
-extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI
-cudaBindTexture2D(size_t *offset, const struct textureReference *texref,
-                  const void *devPtr, const struct cudaChannelFormatDesc *desc,
-                  size_t width, size_t height, size_t pitch) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(
-      size_t *, const struct textureReference *, const void *,
-      const struct cudaChannelFormatDesc *, size_t, size_t, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaBindTexture2D");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(offset, texref, devPtr, desc, width, height, pitch);
-}
-
-extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaBindTextureToArray(
-    const struct textureReference *texref, cudaArray_const_t array,
-    const struct cudaChannelFormatDesc *desc) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(
-      const struct textureReference *, cudaArray_const_t,
-      const struct cudaChannelFormatDesc *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaBindTextureToArray");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(texref, array, desc);
-}
-
-extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI
-cudaBindTextureToMipmappedArray(const struct textureReference *texref,
-                                cudaMipmappedArray_const_t mipmappedArray,
-                                const struct cudaChannelFormatDesc *desc) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(
-      const struct textureReference *, cudaMipmappedArray_const_t,
-      const struct cudaChannelFormatDesc *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaBindTextureToMipmappedArray");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(texref, mipmappedArray, desc);
-}
-
-extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI
-cudaUnbindTexture(const struct textureReference *texref) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(const struct textureReference *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaUnbindTexture");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(texref);
-}
-
-extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI
-cudaGetTextureAlignmentOffset(size_t *offset,
-                              const struct textureReference *texref) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(size_t *, const struct textureReference *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGetTextureAlignmentOffset");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(offset, texref);
-}
-
-extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaGetTextureReference(
-    const struct textureReference **texref, const void *symbol) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(const struct textureReference **, const void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGetTextureReference");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(texref, symbol);
-}
-
-extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaBindSurfaceToArray(
-    const struct surfaceReference *surfref, cudaArray_const_t array,
-    const struct cudaChannelFormatDesc *desc) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(
-      const struct surfaceReference *, cudaArray_const_t,
-      const struct cudaChannelFormatDesc *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaBindSurfaceToArray");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(surfref, array, desc);
-}
-
-extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaGetSurfaceReference(
-    const struct surfaceReference **surfref, const void *symbol) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(const struct surfaceReference **, const void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGetSurfaceReference");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(surfref, symbol);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGetChannelDesc(
-    struct cudaChannelFormatDesc *desc, cudaArray_const_t array) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(struct cudaChannelFormatDesc *,
-                                           cudaArray_const_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGetChannelDesc");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(desc, array);
-}
-
-extern __host__ struct cudaChannelFormatDesc CUDARTAPI cudaCreateChannelDesc(
-    int x, int y, int z, int w, enum cudaChannelFormatKind f) {
-  using FuncPtr = struct cudaChannelFormatDesc(CUDARTAPI *)(
-      int, int, int, int, enum cudaChannelFormatKind);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaCreateChannelDesc");
-  return func_ptr(x, y, z, w, f);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaCreateTextureObject(
-    cudaTextureObject_t *pTexObject, const struct cudaResourceDesc *pResDesc,
-    const struct cudaTextureDesc *pTexDesc,
-    const struct cudaResourceViewDesc *pResViewDesc) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(
-      cudaTextureObject_t *, const struct cudaResourceDesc *,
-      const struct cudaTextureDesc *, const struct cudaResourceViewDesc *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaCreateTextureObject");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pTexObject, pResDesc, pTexDesc, pResViewDesc);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaDestroyTextureObject(cudaTextureObject_t texObject) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaTextureObject_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaDestroyTextureObject");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(texObject);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGetTextureObjectResourceDesc(
-    struct cudaResourceDesc *pResDesc, cudaTextureObject_t texObject) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(struct cudaResourceDesc *, cudaTextureObject_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaGetTextureObjectResourceDesc");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pResDesc, texObject);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGetTextureObjectTextureDesc(
-    struct cudaTextureDesc *pTexDesc, cudaTextureObject_t texObject) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(struct cudaTextureDesc *, cudaTextureObject_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGetTextureObjectTextureDesc");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pTexDesc, texObject);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGetTextureObjectResourceViewDesc(
-    struct cudaResourceViewDesc *pResViewDesc, cudaTextureObject_t texObject) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(struct cudaResourceViewDesc *,
-                                           cudaTextureObject_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaGetTextureObjectResourceViewDesc");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pResViewDesc, texObject);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaCreateSurfaceObject(
-    cudaSurfaceObject_t *pSurfObject, const struct cudaResourceDesc *pResDesc) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaSurfaceObject_t *,
-                                           const struct cudaResourceDesc *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaCreateSurfaceObject");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pSurfObject, pResDesc);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaDestroySurfaceObject(cudaSurfaceObject_t surfObject) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaSurfaceObject_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaDestroySurfaceObject");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(surfObject);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGetSurfaceObjectResourceDesc(
-    struct cudaResourceDesc *pResDesc, cudaSurfaceObject_t surfObject) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(struct cudaResourceDesc *, cudaSurfaceObject_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaGetSurfaceObjectResourceDesc");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pResDesc, surfObject);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaDriverGetVersion(int *driverVersion) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaDriverGetVersion");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(driverVersion);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaRuntimeGetVersion(int *runtimeVersion) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaRuntimeGetVersion");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(runtimeVersion);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphCreate(cudaGraph_t *pGraph,
-                                                      unsigned int flags) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraph_t *, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphCreate");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pGraph, flags);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphAddKernelNode(
-    cudaGraphNode_t *pGraphNode, cudaGraph_t graph,
-    const cudaGraphNode_t *pDependencies, size_t numDependencies,
-    const struct cudaKernelNodeParams *pNodeParams) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraphNode_t *, cudaGraph_t,
-                                           const cudaGraphNode_t *, size_t,
-                                           const struct cudaKernelNodeParams *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphAddKernelNode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pGraphNode, graph, pDependencies, numDependencies,
-                  pNodeParams);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphKernelNodeGetParams(
-    cudaGraphNode_t node, struct cudaKernelNodeParams *pNodeParams) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaGraphNode_t, struct cudaKernelNodeParams *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphKernelNodeGetParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(node, pNodeParams);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphKernelNodeSetParams(
-    cudaGraphNode_t node, const struct cudaKernelNodeParams *pNodeParams) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraphNode_t,
-                                           const struct cudaKernelNodeParams *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphKernelNodeSetParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(node, pNodeParams);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaGraphKernelNodeCopyAttributes(cudaGraphNode_t hSrc, cudaGraphNode_t hDst) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraphNode_t, cudaGraphNode_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaGraphKernelNodeCopyAttributes");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hSrc, hDst);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphKernelNodeGetAttribute(
-    cudaGraphNode_t hNode, enum cudaKernelNodeAttrID attr,
-    union cudaKernelNodeAttrValue *value_out) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaGraphNode_t, enum cudaKernelNodeAttrID,
-                               union cudaKernelNodeAttrValue *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphKernelNodeGetAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hNode, attr, value_out);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphKernelNodeSetAttribute(
-    cudaGraphNode_t hNode, enum cudaKernelNodeAttrID attr,
-    const union cudaKernelNodeAttrValue *value) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaGraphNode_t, enum cudaKernelNodeAttrID,
-                               const union cudaKernelNodeAttrValue *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphKernelNodeSetAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hNode, attr, value);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphAddMemcpyNode(
-    cudaGraphNode_t *pGraphNode, cudaGraph_t graph,
-    const cudaGraphNode_t *pDependencies, size_t numDependencies,
-    const struct cudaMemcpy3DParms *pCopyParams) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraphNode_t *, cudaGraph_t,
-                                           const cudaGraphNode_t *, size_t,
-                                           const struct cudaMemcpy3DParms *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphAddMemcpyNode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pGraphNode, graph, pDependencies, numDependencies,
-                  pCopyParams);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphMemcpyNodeGetParams(
-    cudaGraphNode_t node, struct cudaMemcpy3DParms *pNodeParams) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaGraphNode_t, struct cudaMemcpy3DParms *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphMemcpyNodeGetParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(node, pNodeParams);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphMemcpyNodeSetParams(
-    cudaGraphNode_t node, const struct cudaMemcpy3DParms *pNodeParams) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraphNode_t,
-                                           const struct cudaMemcpy3DParms *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphMemcpyNodeSetParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(node, pNodeParams);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphAddMemsetNode(
-    cudaGraphNode_t *pGraphNode, cudaGraph_t graph,
-    const cudaGraphNode_t *pDependencies, size_t numDependencies,
-    const struct cudaMemsetParams *pMemsetParams) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraphNode_t *, cudaGraph_t,
-                                           const cudaGraphNode_t *, size_t,
-                                           const struct cudaMemsetParams *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphAddMemsetNode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pGraphNode, graph, pDependencies, numDependencies,
-                  pMemsetParams);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphMemsetNodeGetParams(
-    cudaGraphNode_t node, struct cudaMemsetParams *pNodeParams) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaGraphNode_t, struct cudaMemsetParams *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphMemsetNodeGetParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(node, pNodeParams);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphMemsetNodeSetParams(
-    cudaGraphNode_t node, const struct cudaMemsetParams *pNodeParams) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraphNode_t,
-                                           const struct cudaMemsetParams *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphMemsetNodeSetParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(node, pNodeParams);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphAddHostNode(
-    cudaGraphNode_t *pGraphNode, cudaGraph_t graph,
-    const cudaGraphNode_t *pDependencies, size_t numDependencies,
-    const struct cudaHostNodeParams *pNodeParams) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraphNode_t *, cudaGraph_t,
-                                           const cudaGraphNode_t *, size_t,
-                                           const struct cudaHostNodeParams *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphAddHostNode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pGraphNode, graph, pDependencies, numDependencies,
-                  pNodeParams);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphHostNodeGetParams(
-    cudaGraphNode_t node, struct cudaHostNodeParams *pNodeParams) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaGraphNode_t, struct cudaHostNodeParams *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphHostNodeGetParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(node, pNodeParams);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphHostNodeSetParams(
-    cudaGraphNode_t node, const struct cudaHostNodeParams *pNodeParams) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraphNode_t,
-                                           const struct cudaHostNodeParams *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphHostNodeSetParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(node, pNodeParams);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaGraphAddChildGraphNode(cudaGraphNode_t *pGraphNode, cudaGraph_t graph,
-                           const cudaGraphNode_t *pDependencies,
-                           size_t numDependencies, cudaGraph_t childGraph) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaGraphNode_t *, cudaGraph_t,
-                               const cudaGraphNode_t *, size_t, cudaGraph_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphAddChildGraphNode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pGraphNode, graph, pDependencies, numDependencies,
-                  childGraph);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaGraphChildGraphNodeGetGraph(cudaGraphNode_t node, cudaGraph_t *pGraph) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraphNode_t, cudaGraph_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphChildGraphNodeGetGraph");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(node, pGraph);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphAddEmptyNode(
-    cudaGraphNode_t *pGraphNode, cudaGraph_t graph,
-    const cudaGraphNode_t *pDependencies, size_t numDependencies) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraphNode_t *, cudaGraph_t,
-                                           const cudaGraphNode_t *, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphAddEmptyNode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pGraphNode, graph, pDependencies, numDependencies);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaGraphClone(cudaGraph_t *pGraphClone, cudaGraph_t originalGraph) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraph_t *, cudaGraph_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphClone");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pGraphClone, originalGraph);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaGraphNodeFindInClone(cudaGraphNode_t *pNode, cudaGraphNode_t originalNode,
-                         cudaGraph_t clonedGraph) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaGraphNode_t *, cudaGraphNode_t, cudaGraph_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphNodeFindInClone");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pNode, originalNode, clonedGraph);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaGraphNodeGetType(cudaGraphNode_t node, enum cudaGraphNodeType *pType) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaGraphNode_t, enum cudaGraphNodeType *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphNodeGetType");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(node, pType);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphGetNodes(cudaGraph_t graph,
-                                                        cudaGraphNode_t *nodes,
-                                                        size_t *numNodes) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaGraph_t, cudaGraphNode_t *, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphGetNodes");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(graph, nodes, numNodes);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphGetRootNodes(
-    cudaGraph_t graph, cudaGraphNode_t *pRootNodes, size_t *pNumRootNodes) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaGraph_t, cudaGraphNode_t *, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphGetRootNodes");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(graph, pRootNodes, pNumRootNodes);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphGetEdges(cudaGraph_t graph,
-                                                        cudaGraphNode_t *from,
-                                                        cudaGraphNode_t *to,
-                                                        size_t *numEdges) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraph_t, cudaGraphNode_t *,
-                                           cudaGraphNode_t *, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphGetEdges");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(graph, from, to, numEdges);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphNodeGetDependencies(
-    cudaGraphNode_t node, cudaGraphNode_t *pDependencies,
-    size_t *pNumDependencies) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaGraphNode_t, cudaGraphNode_t *, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphNodeGetDependencies");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(node, pDependencies, pNumDependencies);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphNodeGetDependentNodes(
-    cudaGraphNode_t node, cudaGraphNode_t *pDependentNodes,
-    size_t *pNumDependentNodes) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaGraphNode_t, cudaGraphNode_t *, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphNodeGetDependentNodes");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(node, pDependentNodes, pNumDependentNodes);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaGraphAddDependencies(cudaGraph_t graph, const cudaGraphNode_t *from,
-                         const cudaGraphNode_t *to, size_t numDependencies) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraph_t, const cudaGraphNode_t *,
-                                           const cudaGraphNode_t *, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphAddDependencies");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(graph, from, to, numDependencies);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaGraphRemoveDependencies(cudaGraph_t graph, const cudaGraphNode_t *from,
-                            const cudaGraphNode_t *to, size_t numDependencies) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraph_t, const cudaGraphNode_t *,
-                                           const cudaGraphNode_t *, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphRemoveDependencies");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(graph, from, to, numDependencies);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaGraphDestroyNode(cudaGraphNode_t node) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraphNode_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphDestroyNode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(node);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphInstantiate(
-    cudaGraphExec_t *pGraphExec, cudaGraph_t graph, cudaGraphNode_t *pErrorNode,
-    char *pLogBuffer, size_t bufferSize) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraphExec_t *, cudaGraph_t,
-                                           cudaGraphNode_t *, char *, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphInstantiate");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pGraphExec, graph, pErrorNode, pLogBuffer, bufferSize);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphExecKernelNodeSetParams(
-    cudaGraphExec_t hGraphExec, cudaGraphNode_t node,
-    const struct cudaKernelNodeParams *pNodeParams) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraphExec_t, cudaGraphNode_t,
-                                           const struct cudaKernelNodeParams *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaGraphExecKernelNodeSetParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hGraphExec, node, pNodeParams);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphExecMemcpyNodeSetParams(
-    cudaGraphExec_t hGraphExec, cudaGraphNode_t node,
-    const struct cudaMemcpy3DParms *pNodeParams) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraphExec_t, cudaGraphNode_t,
-                                           const struct cudaMemcpy3DParms *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaGraphExecMemcpyNodeSetParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hGraphExec, node, pNodeParams);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphExecMemsetNodeSetParams(
-    cudaGraphExec_t hGraphExec, cudaGraphNode_t node,
-    const struct cudaMemsetParams *pNodeParams) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraphExec_t, cudaGraphNode_t,
-                                           const struct cudaMemsetParams *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaGraphExecMemsetNodeSetParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hGraphExec, node, pNodeParams);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaGraphExecHostNodeSetParams(cudaGraphExec_t hGraphExec, cudaGraphNode_t node,
-                               const struct cudaHostNodeParams *pNodeParams) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraphExec_t, cudaGraphNode_t,
-                                           const struct cudaHostNodeParams *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphExecHostNodeSetParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hGraphExec, node, pNodeParams);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaGraphExecUpdate(cudaGraphExec_t hGraphExec, cudaGraph_t hGraph,
-                    cudaGraphNode_t *hErrorNode_out,
-                    enum cudaGraphExecUpdateResult *updateResult_out) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaGraphExec_t, cudaGraph_t, cudaGraphNode_t *,
-                               enum cudaGraphExecUpdateResult *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphExecUpdate");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hGraphExec, hGraph, hErrorNode_out, updateResult_out);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphLaunch(cudaGraphExec_t graphExec,
-                                                      cudaStream_t stream) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraphExec_t, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphLaunch");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(graphExec, stream);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaGraphExecDestroy(cudaGraphExec_t graphExec) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraphExec_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphExecDestroy");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(graphExec);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphDestroy(cudaGraph_t graph) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraph_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphDestroy");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(graph);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGetExportTable(
-    const void **ppExportTable, const cudaUUID_t *pExportTableId) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(const void **, const cudaUUID_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGetExportTable");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(ppExportTable, pExportTableId);
-}
-
-extern __host__ cudaError_t CUDARTAPI_CDECL
-cudaGetFuncBySymbol(cudaFunction_t *functionPtr, const void *symbolPtr) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaFunction_t *, const void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("_CDECL cudaGetFuncBySymbol");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(functionPtr, symbolPtr);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaGraphAddEventRecordNode(cudaGraphNode_t *pGraphNode, cudaGraph_t graph,
-                            const cudaGraphNode_t *pDependencies,
-                            size_t numDependencies, cudaEvent_t event) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaGraphNode_t *, cudaGraph_t,
-                               const cudaGraphNode_t *, size_t, cudaEvent_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphAddEventRecordNode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pGraphNode, graph, pDependencies, numDependencies, event);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaGraphAddEventWaitNode(cudaGraphNode_t *pGraphNode, cudaGraph_t graph,
-                          const cudaGraphNode_t *pDependencies,
-                          size_t numDependencies, cudaEvent_t event) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaGraphNode_t *, cudaGraph_t,
-                               const cudaGraphNode_t *, size_t, cudaEvent_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphAddEventWaitNode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pGraphNode, graph, pDependencies, numDependencies, event);
-}
-
-#if CUDA_VERSION >= 11030
-
-extern __host__ cudaError_t CUDARTAPI cudaUserObjectCreate(
-    cudaUserObject_t *object_out, void *ptr, cudaHostFn_t destroy,
-    unsigned int initialRefcount, unsigned int flags) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(
-      cudaUserObject_t *, void *, cudaHostFn_t, unsigned int, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaUserObjectCreate");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(object_out, ptr, destroy, initialRefcount, flags);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaUserObjectRetain(cudaUserObject_t object, unsigned int count __dv(1)) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaUserObject_t, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaUserObjectRetain");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(object, count);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaUserObjectRelease(cudaUserObject_t object, unsigned int count __dv(1)) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaUserObject_t, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaUserObjectRelease");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(object, count);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphRetainUserObject(
-    cudaGraph_t graph, cudaUserObject_t object, unsigned int count __dv(1),
-    unsigned int flags __dv(0)) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraph_t, cudaUserObject_t,
-                                           unsigned int, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphRetainUserObject");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(graph, object, count, flags);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphReleaseUserObject(
-    cudaGraph_t graph, cudaUserObject_t object, unsigned int count __dv(1)) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaGraph_t, cudaUserObject_t, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphReleaseUserObject");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(graph, object, count);
-}
-
-#endif  // CUDA_VERSION >= 11030
-
-extern __host__ cudaError_t CUDARTAPI cudaGetDriverEntryPoint(
-    const char *symbol, void **funcPtr, unsigned long long flags) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(const char *, void **, unsigned long long);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGetDriverEntryPoint");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(symbol, funcPtr, flags);
-}
-
-}  // extern "C"
diff --git a/third_party/xla/third_party/tsl/tsl/cuda/cuda_runtime_11_8.inc b/third_party/xla/third_party/tsl/tsl/cuda/cuda_runtime_11_8.inc
deleted file mode 100644
index 8000ce1f926a12..00000000000000
--- a/third_party/xla/third_party/tsl/tsl/cuda/cuda_runtime_11_8.inc
+++ /dev/null
@@ -1,2771 +0,0 @@
-// Auto-generated, do not edit.
-
-extern "C" {
-extern __host__ cudaError_t CUDARTAPI cudaDeviceReset(void) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)();
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaDeviceReset");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr();
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaDeviceSynchronize(void) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)();
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaDeviceSynchronize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr();
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaDeviceSetLimit(enum cudaLimit limit,
-                                                         size_t value) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(enum cudaLimit, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaDeviceSetLimit");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(limit, value);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaDeviceGetLimit(size_t *pValue, enum cudaLimit limit) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(size_t *, enum cudaLimit);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaDeviceGetLimit");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pValue, limit);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaDeviceGetTexture1DLinearMaxWidth(
-    size_t *maxWidthInElements, const struct cudaChannelFormatDesc *fmtDesc,
-    int device) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(
-      size_t *, const struct cudaChannelFormatDesc *, int);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaDeviceGetTexture1DLinearMaxWidth");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(maxWidthInElements, fmtDesc, device);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaDeviceGetCacheConfig(enum cudaFuncCache *pCacheConfig) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(enum cudaFuncCache *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaDeviceGetCacheConfig");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pCacheConfig);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaDeviceGetStreamPriorityRange(int *leastPriority, int *greatestPriority) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(int *, int *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaDeviceGetStreamPriorityRange");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(leastPriority, greatestPriority);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaDeviceSetCacheConfig(enum cudaFuncCache cacheConfig) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(enum cudaFuncCache);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaDeviceSetCacheConfig");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(cacheConfig);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaDeviceGetSharedMemConfig(enum cudaSharedMemConfig *pConfig) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(enum cudaSharedMemConfig *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaDeviceGetSharedMemConfig");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pConfig);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaDeviceSetSharedMemConfig(enum cudaSharedMemConfig config) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(enum cudaSharedMemConfig);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaDeviceSetSharedMemConfig");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(config);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaDeviceGetByPCIBusId(int *device, const char *pciBusId) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(int *, const char *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaDeviceGetByPCIBusId");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(device, pciBusId);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaDeviceGetPCIBusId(char *pciBusId,
-                                                            int len,
-                                                            int device) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(char *, int, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaDeviceGetPCIBusId");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pciBusId, len, device);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaIpcGetEventHandle(cudaIpcEventHandle_t *handle, cudaEvent_t event) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaIpcEventHandle_t *, cudaEvent_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaIpcGetEventHandle");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, event);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaIpcOpenEventHandle(cudaEvent_t *event, cudaIpcEventHandle_t handle) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaEvent_t *, cudaIpcEventHandle_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaIpcOpenEventHandle");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(event, handle);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaIpcGetMemHandle(cudaIpcMemHandle_t *handle, void *devPtr) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaIpcMemHandle_t *, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaIpcGetMemHandle");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, devPtr);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaIpcOpenMemHandle(
-    void **devPtr, cudaIpcMemHandle_t handle, unsigned int flags) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(void **, cudaIpcMemHandle_t, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaIpcOpenMemHandle");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(devPtr, handle, flags);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaIpcCloseMemHandle(void *devPtr) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaIpcCloseMemHandle");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(devPtr);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaDeviceFlushGPUDirectRDMAWrites(
-    enum cudaFlushGPUDirectRDMAWritesTarget target,
-    enum cudaFlushGPUDirectRDMAWritesScope scope) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(enum cudaFlushGPUDirectRDMAWritesTarget,
-                               enum cudaFlushGPUDirectRDMAWritesScope);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaDeviceFlushGPUDirectRDMAWrites");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(target, scope);
-}
-
-extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaThreadExit(void) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)();
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaThreadExit");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr();
-}
-
-extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI
-cudaThreadSynchronize(void) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)();
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaThreadSynchronize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr();
-}
-
-extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI
-cudaThreadSetLimit(enum cudaLimit limit, size_t value) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(enum cudaLimit, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaThreadSetLimit");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(limit, value);
-}
-
-extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI
-cudaThreadGetLimit(size_t *pValue, enum cudaLimit limit) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(size_t *, enum cudaLimit);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaThreadGetLimit");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pValue, limit);
-}
-
-extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI
-cudaThreadGetCacheConfig(enum cudaFuncCache *pCacheConfig) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(enum cudaFuncCache *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaThreadGetCacheConfig");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pCacheConfig);
-}
-
-extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI
-cudaThreadSetCacheConfig(enum cudaFuncCache cacheConfig) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(enum cudaFuncCache);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaThreadSetCacheConfig");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(cacheConfig);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaGetLastError(void) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)();
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGetLastError");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr();
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaPeekAtLastError(void) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)();
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaPeekAtLastError");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr();
-}
-
-extern __host__ __cudart_builtin__ const char *CUDARTAPI
-cudaGetErrorName(cudaError_t error) {
-  using FuncPtr = const char *(CUDARTAPI *)(cudaError_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGetErrorName");
-  if (!func_ptr) return "cudaGetErrorName symbol not found.";
-  return func_ptr(error);
-}
-
-extern __host__ __cudart_builtin__ const char *CUDARTAPI
-cudaGetErrorString(cudaError_t error) {
-  using FuncPtr = const char *(CUDARTAPI *)(cudaError_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGetErrorString");
-  if (!func_ptr) return "cudaGetErrorString symbol not found.";
-  return func_ptr(error);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaGetDeviceCount(int *count) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGetDeviceCount");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(count);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaGetDeviceProperties(struct cudaDeviceProp *prop, int device) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(struct cudaDeviceProp *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGetDeviceProperties");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(prop, device);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaDeviceGetAttribute(int *value, enum cudaDeviceAttr attr, int device) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(int *, enum cudaDeviceAttr, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaDeviceGetAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(value, attr, device);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaDeviceGetDefaultMemPool(cudaMemPool_t *memPool, int device) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaMemPool_t *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaDeviceGetDefaultMemPool");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(memPool, device);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaDeviceSetMemPool(int device, cudaMemPool_t memPool) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(int, cudaMemPool_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaDeviceSetMemPool");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(device, memPool);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaDeviceGetMemPool(cudaMemPool_t *memPool, int device) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaMemPool_t *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaDeviceGetMemPool");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(memPool, device);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaDeviceGetNvSciSyncAttributes(
-    void *nvSciSyncAttrList, int device, int flags) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void *, int, int);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaDeviceGetNvSciSyncAttributes");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(nvSciSyncAttrList, device, flags);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaDeviceGetP2PAttribute(int *value, enum cudaDeviceP2PAttr attr,
-                          int srcDevice, int dstDevice) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(int *, enum cudaDeviceP2PAttr, int, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaDeviceGetP2PAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(value, attr, srcDevice, dstDevice);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaChooseDevice(int *device, const struct cudaDeviceProp *prop) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(int *, const struct cudaDeviceProp *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaChooseDevice");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(device, prop);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaSetDevice(int device) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaSetDevice");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(device);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaGetDevice(int *device) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGetDevice");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(device);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaSetValidDevices(int *device_arr,
-                                                          int len) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(int *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaSetValidDevices");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(device_arr, len);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaSetDeviceFlags(unsigned int flags) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaSetDeviceFlags");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(flags);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGetDeviceFlags(unsigned int *flags) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(unsigned int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGetDeviceFlags");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(flags);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaStreamCreate(cudaStream_t *pStream) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaStream_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaStreamCreate");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pStream);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaStreamCreateWithFlags(cudaStream_t *pStream, unsigned int flags) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaStream_t *, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaStreamCreateWithFlags");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pStream, flags);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaStreamCreateWithPriority(cudaStream_t *pStream, unsigned int flags,
-                             int priority) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaStream_t *, unsigned int, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaStreamCreateWithPriority");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pStream, flags, priority);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaStreamGetPriority(cudaStream_t hStream, int *priority) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaStream_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaStreamGetPriority");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hStream, priority);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaStreamGetFlags(cudaStream_t hStream, unsigned int *flags) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaStream_t, unsigned int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaStreamGetFlags");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hStream, flags);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaCtxResetPersistingL2Cache(void) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)();
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaCtxResetPersistingL2Cache");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr();
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaStreamCopyAttributes(cudaStream_t dst, cudaStream_t src) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaStream_t, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaStreamCopyAttributes");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dst, src);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaStreamGetAttribute(cudaStream_t hStream, cudaStreamAttrID attr,
-                       cudaStreamAttrValue *value_out) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaStream_t, cudaStreamAttrID,
-                                           cudaStreamAttrValue *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaStreamGetAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hStream, attr, value_out);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaStreamSetAttribute(cudaStream_t hStream, cudaStreamAttrID attr,
-                       const cudaStreamAttrValue *value) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaStream_t, cudaStreamAttrID,
-                                           const cudaStreamAttrValue *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaStreamSetAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hStream, attr, value);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaStreamDestroy(cudaStream_t stream) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaStreamDestroy");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(stream);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamWaitEvent(
-    cudaStream_t stream, cudaEvent_t event, unsigned int flags __dv(0)) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaStream_t, cudaEvent_t, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaStreamWaitEvent");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(stream, event, flags);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaStreamAddCallback(cudaStream_t stream, cudaStreamCallback_t callback,
-                      void *userData, unsigned int flags) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaStream_t, cudaStreamCallback_t,
-                                           void *, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaStreamAddCallback");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(stream, callback, userData, flags);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaStreamSynchronize(cudaStream_t stream) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaStreamSynchronize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(stream);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaStreamQuery(cudaStream_t stream) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaStreamQuery");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(stream);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaStreamAttachMemAsync(cudaStream_t stream, void *devPtr,
-                         size_t length __dv(0), unsigned int flags) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaStream_t, void *, size_t, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaStreamAttachMemAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(stream, devPtr, length, flags);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaStreamBeginCapture(cudaStream_t stream, enum cudaStreamCaptureMode mode) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaStream_t, enum cudaStreamCaptureMode);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaStreamBeginCapture");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(stream, mode);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaThreadExchangeStreamCaptureMode(enum cudaStreamCaptureMode *mode) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(enum cudaStreamCaptureMode *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaThreadExchangeStreamCaptureMode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(mode);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaStreamEndCapture(cudaStream_t stream, cudaGraph_t *pGraph) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaStream_t, cudaGraph_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaStreamEndCapture");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(stream, pGraph);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaStreamIsCapturing(
-    cudaStream_t stream, enum cudaStreamCaptureStatus *pCaptureStatus) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaStream_t, enum cudaStreamCaptureStatus *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaStreamIsCapturing");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(stream, pCaptureStatus);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaStreamGetCaptureInfo(
-    cudaStream_t stream, enum cudaStreamCaptureStatus *pCaptureStatus,
-    unsigned long long *pId) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(
-      cudaStream_t, enum cudaStreamCaptureStatus *, unsigned long long *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaStreamGetCaptureInfo");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(stream, pCaptureStatus, pId);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaStreamGetCaptureInfo_v2(
-    cudaStream_t stream, enum cudaStreamCaptureStatus *captureStatus_out,
-    unsigned long long *id_out __dv(0), cudaGraph_t *graph_out __dv(0),
-    const cudaGraphNode_t **dependencies_out __dv(0),
-    size_t *numDependencies_out __dv(0)) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(
-      cudaStream_t, enum cudaStreamCaptureStatus *, unsigned long long *,
-      cudaGraph_t *, const cudaGraphNode_t **, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaStreamGetCaptureInfo_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(stream, captureStatus_out, id_out, graph_out,
-                  dependencies_out, numDependencies_out);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaStreamUpdateCaptureDependencies(
-    cudaStream_t stream, cudaGraphNode_t *dependencies, size_t numDependencies,
-    unsigned int flags __dv(0)) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaStream_t, cudaGraphNode_t *,
-                                           size_t, unsigned int);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaStreamUpdateCaptureDependencies");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(stream, dependencies, numDependencies, flags);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaEventCreate(cudaEvent_t *event) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaEvent_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaEventCreate");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(event);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaEventCreateWithFlags(cudaEvent_t *event, unsigned int flags) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaEvent_t *, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaEventCreateWithFlags");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(event, flags);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaEventRecord(cudaEvent_t event, cudaStream_t stream __dv(0)) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaEvent_t, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaEventRecord");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(event, stream);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaEventRecordWithFlags(cudaEvent_t event, cudaStream_t stream __dv(0),
-                         unsigned int flags __dv(0)) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaEvent_t, cudaStream_t, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaEventRecordWithFlags");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(event, stream, flags);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaEventQuery(cudaEvent_t event) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaEvent_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaEventQuery");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(event);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaEventSynchronize(cudaEvent_t event) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaEvent_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaEventSynchronize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(event);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaEventDestroy(cudaEvent_t event) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaEvent_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaEventDestroy");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(event);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaEventElapsedTime(float *ms,
-                                                           cudaEvent_t start,
-                                                           cudaEvent_t end) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(float *, cudaEvent_t, cudaEvent_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaEventElapsedTime");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(ms, start, end);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaImportExternalMemory(
-    cudaExternalMemory_t *extMem_out,
-    const struct cudaExternalMemoryHandleDesc *memHandleDesc) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(
-      cudaExternalMemory_t *, const struct cudaExternalMemoryHandleDesc *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaImportExternalMemory");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(extMem_out, memHandleDesc);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaExternalMemoryGetMappedBuffer(
-    void **devPtr, cudaExternalMemory_t extMem,
-    const struct cudaExternalMemoryBufferDesc *bufferDesc) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(void **, cudaExternalMemory_t,
-                               const struct cudaExternalMemoryBufferDesc *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaExternalMemoryGetMappedBuffer");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(devPtr, extMem, bufferDesc);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaExternalMemoryGetMappedMipmappedArray(
-    cudaMipmappedArray_t *mipmap, cudaExternalMemory_t extMem,
-    const struct cudaExternalMemoryMipmappedArrayDesc *mipmapDesc) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(
-      cudaMipmappedArray_t *, cudaExternalMemory_t,
-      const struct cudaExternalMemoryMipmappedArrayDesc *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaExternalMemoryGetMappedMipmappedArray");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(mipmap, extMem, mipmapDesc);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaDestroyExternalMemory(cudaExternalMemory_t extMem) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaExternalMemory_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaDestroyExternalMemory");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(extMem);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaImportExternalSemaphore(
-    cudaExternalSemaphore_t *extSem_out,
-    const struct cudaExternalSemaphoreHandleDesc *semHandleDesc) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaExternalSemaphore_t *,
-                               const struct cudaExternalSemaphoreHandleDesc *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaImportExternalSemaphore");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(extSem_out, semHandleDesc);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaSignalExternalSemaphoresAsync(
-    const cudaExternalSemaphore_t *extSemArray,
-    const struct cudaExternalSemaphoreSignalParams *paramsArray,
-    unsigned int numExtSems, cudaStream_t stream __dv(0)) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(const cudaExternalSemaphore_t *,
-                               const struct cudaExternalSemaphoreSignalParams *,
-                               unsigned int, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>(
-      "__CUDART_API_PTSZ(cudaSignalExternalSemaphoresAsync_v2)");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(extSemArray, paramsArray, numExtSems, stream);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaWaitExternalSemaphoresAsync(
-    const cudaExternalSemaphore_t *extSemArray,
-    const struct cudaExternalSemaphoreWaitParams *paramsArray,
-    unsigned int numExtSems, cudaStream_t stream __dv(0)) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(const cudaExternalSemaphore_t *,
-                               const struct cudaExternalSemaphoreWaitParams *,
-                               unsigned int, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>(
-      "__CUDART_API_PTSZ(cudaWaitExternalSemaphoresAsync_v2)");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(extSemArray, paramsArray, numExtSems, stream);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaDestroyExternalSemaphore(cudaExternalSemaphore_t extSem) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaExternalSemaphore_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaDestroyExternalSemaphore");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(extSem);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim, void **args,
-                 size_t sharedMem, cudaStream_t stream) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(const void *, dim3, dim3, void **,
-                                           size_t, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaLaunchKernel");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(func, gridDim, blockDim, args, sharedMem, stream);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaLaunchKernelExC(
-    const cudaLaunchConfig_t *config, const void *func, void **args) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(const cudaLaunchConfig_t *,
-                                           const void *, void **);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaLaunchKernelExC");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(config, func, args);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaLaunchCooperativeKernel(
-    const void *func, dim3 gridDim, dim3 blockDim, void **args,
-    size_t sharedMem, cudaStream_t stream) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(const void *, dim3, dim3, void **,
-                                           size_t, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaLaunchCooperativeKernel");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(func, gridDim, blockDim, args, sharedMem, stream);
-}
-
-extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI
-cudaLaunchCooperativeKernelMultiDevice(
-    struct cudaLaunchParams *launchParamsList, unsigned int numDevices,
-    unsigned int flags __dv(0)) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(struct cudaLaunchParams *,
-                                           unsigned int, unsigned int);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaLaunchCooperativeKernelMultiDevice");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(launchParamsList, numDevices, flags);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaFuncSetCacheConfig(const void *func, enum cudaFuncCache cacheConfig) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(const void *, enum cudaFuncCache);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaFuncSetCacheConfig");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(func, cacheConfig);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaFuncSetSharedMemConfig(const void *func, enum cudaSharedMemConfig config) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(const void *, enum cudaSharedMemConfig);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaFuncSetSharedMemConfig");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(func, config);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaFuncGetAttributes(struct cudaFuncAttributes *attr, const void *func) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(struct cudaFuncAttributes *, const void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaFuncGetAttributes");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(attr, func);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaFuncSetAttribute(const void *func, enum cudaFuncAttribute attr, int value) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(const void *, enum cudaFuncAttribute, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaFuncSetAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(func, attr, value);
-}
-
-extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI
-cudaSetDoubleForDevice(double *d) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(double *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaSetDoubleForDevice");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(d);
-}
-
-extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI
-cudaSetDoubleForHost(double *d) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(double *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaSetDoubleForHost");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(d);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaLaunchHostFunc(cudaStream_t stream,
-                                                         cudaHostFn_t fn,
-                                                         void *userData) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaStream_t, cudaHostFn_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaLaunchHostFunc");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(stream, fn, userData);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaOccupancyMaxActiveBlocksPerMultiprocessor(int *numBlocks, const void *func,
-                                              int blockSize,
-                                              size_t dynamicSMemSize) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(int *, const void *, int, size_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaOccupancyMaxActiveBlocksPerMultiprocessor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(numBlocks, func, blockSize, dynamicSMemSize);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaOccupancyAvailableDynamicSMemPerBlock(size_t *dynamicSmemSize,
-                                          const void *func, int numBlocks,
-                                          int blockSize) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(size_t *, const void *, int, int);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaOccupancyAvailableDynamicSMemPerBlock");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dynamicSmemSize, func, numBlocks, blockSize);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(int *numBlocks,
-                                                       const void *func,
-                                                       int blockSize,
-                                                       size_t dynamicSMemSize,
-                                                       unsigned int flags) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(int *, const void *, int, size_t, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>(
-      "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(numBlocks, func, blockSize, dynamicSMemSize, flags);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaOccupancyMaxPotentialClusterSize(int *clusterSize, const void *func,
-                                     const cudaLaunchConfig_t *launchConfig) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(int *, const void *, const cudaLaunchConfig_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaOccupancyMaxPotentialClusterSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(clusterSize, func, launchConfig);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaOccupancyMaxActiveClusters(int *numClusters, const void *func,
-                               const cudaLaunchConfig_t *launchConfig) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(int *, const void *, const cudaLaunchConfig_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaOccupancyMaxActiveClusters");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(numClusters, func, launchConfig);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaMallocManaged(void **devPtr, size_t size, unsigned int flags) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void **, size_t, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMallocManaged");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(devPtr, size, flags);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaMalloc(void **devPtr, size_t size) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void **, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMalloc");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(devPtr, size);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMallocHost(void **ptr, size_t size) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void **, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMallocHost");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(ptr, size);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMallocPitch(void **devPtr,
-                                                      size_t *pitch,
-                                                      size_t width,
-                                                      size_t height) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void **, size_t *, size_t, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMallocPitch");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(devPtr, pitch, width, height);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMallocArray(
-    cudaArray_t *array, const struct cudaChannelFormatDesc *desc, size_t width,
-    size_t height __dv(0), unsigned int flags __dv(0)) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaArray_t *,
-                                           const struct cudaChannelFormatDesc *,
-                                           size_t, size_t, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMallocArray");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(array, desc, width, height, flags);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaFree(void *devPtr) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaFree");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(devPtr);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaFreeHost(void *ptr) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaFreeHost");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(ptr);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaFreeArray(cudaArray_t array) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaArray_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaFreeArray");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(array);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaFreeMipmappedArray(cudaMipmappedArray_t mipmappedArray) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaMipmappedArray_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaFreeMipmappedArray");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(mipmappedArray);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaHostAlloc(void **pHost, size_t size,
-                                                    unsigned int flags) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void **, size_t, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaHostAlloc");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pHost, size, flags);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaHostRegister(void *ptr, size_t size,
-                                                       unsigned int flags) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void *, size_t, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaHostRegister");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(ptr, size, flags);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaHostUnregister(void *ptr) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaHostUnregister");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(ptr);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaHostGetDevicePointer(void **pDevice, void *pHost, unsigned int flags) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void **, void *, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaHostGetDevicePointer");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pDevice, pHost, flags);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaHostGetFlags(unsigned int *pFlags,
-                                                       void *pHost) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(unsigned int *, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaHostGetFlags");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pFlags, pHost);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaMalloc3D(struct cudaPitchedPtr *pitchedDevPtr, struct cudaExtent extent) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(struct cudaPitchedPtr *, struct cudaExtent);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMalloc3D");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pitchedDevPtr, extent);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaMalloc3DArray(cudaArray_t *array, const struct cudaChannelFormatDesc *desc,
-                  struct cudaExtent extent, unsigned int flags __dv(0)) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaArray_t *,
-                                           const struct cudaChannelFormatDesc *,
-                                           struct cudaExtent, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMalloc3DArray");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(array, desc, extent, flags);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMallocMipmappedArray(
-    cudaMipmappedArray_t *mipmappedArray,
-    const struct cudaChannelFormatDesc *desc, struct cudaExtent extent,
-    unsigned int numLevels, unsigned int flags __dv(0)) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(
-      cudaMipmappedArray_t *, const struct cudaChannelFormatDesc *,
-      struct cudaExtent, unsigned int, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMallocMipmappedArray");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(mipmappedArray, desc, extent, numLevels, flags);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGetMipmappedArrayLevel(
-    cudaArray_t *levelArray, cudaMipmappedArray_const_t mipmappedArray,
-    unsigned int level) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(
-      cudaArray_t *, cudaMipmappedArray_const_t, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGetMipmappedArrayLevel");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(levelArray, mipmappedArray, level);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaMemcpy3D(const struct cudaMemcpy3DParms *p) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(const struct cudaMemcpy3DParms *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpy3D");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(p);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaMemcpy3DPeer(const struct cudaMemcpy3DPeerParms *p) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(const struct cudaMemcpy3DPeerParms *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpy3DPeer");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(p);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpy3DAsync(
-    const struct cudaMemcpy3DParms *p, cudaStream_t stream __dv(0)) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(const struct cudaMemcpy3DParms *, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpy3DAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(p, stream);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMemcpy3DPeerAsync(
-    const struct cudaMemcpy3DPeerParms *p, cudaStream_t stream __dv(0)) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(const struct cudaMemcpy3DPeerParms *,
-                                           cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpy3DPeerAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(p, stream);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMemGetInfo(size_t *free,
-                                                     size_t *total) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(size_t *, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemGetInfo");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(free, total);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaArrayGetInfo(struct cudaChannelFormatDesc *desc, struct cudaExtent *extent,
-                 unsigned int *flags, cudaArray_t array) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(struct cudaChannelFormatDesc *,
-                                           struct cudaExtent *, unsigned int *,
-                                           cudaArray_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaArrayGetInfo");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(desc, extent, flags, array);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaArrayGetPlane(
-    cudaArray_t *pPlaneArray, cudaArray_t hArray, unsigned int planeIdx) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaArray_t *, cudaArray_t, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaArrayGetPlane");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pPlaneArray, hArray, planeIdx);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaArrayGetMemoryRequirements(
-    struct cudaArrayMemoryRequirements *memoryRequirements, cudaArray_t array,
-    int device) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(struct cudaArrayMemoryRequirements *,
-                                           cudaArray_t, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaArrayGetMemoryRequirements");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(memoryRequirements, array, device);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMipmappedArrayGetMemoryRequirements(
-    struct cudaArrayMemoryRequirements *memoryRequirements,
-    cudaMipmappedArray_t mipmap, int device) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(struct cudaArrayMemoryRequirements *,
-                                           cudaMipmappedArray_t, int);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaMipmappedArrayGetMemoryRequirements");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(memoryRequirements, mipmap, device);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaArrayGetSparseProperties(
-    struct cudaArraySparseProperties *sparseProperties, cudaArray_t array) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(struct cudaArraySparseProperties *, cudaArray_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaArrayGetSparseProperties");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(sparseProperties, array);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMipmappedArrayGetSparseProperties(
-    struct cudaArraySparseProperties *sparseProperties,
-    cudaMipmappedArray_t mipmap) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(struct cudaArraySparseProperties *,
-                                           cudaMipmappedArray_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaMipmappedArrayGetSparseProperties");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(sparseProperties, mipmap);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMemcpy(void *dst, const void *src,
-                                                 size_t count,
-                                                 enum cudaMemcpyKind kind) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void *, const void *, size_t,
-                                           enum cudaMemcpyKind);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpy");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dst, src, count, kind);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMemcpyPeer(void *dst, int dstDevice,
-                                                     const void *src,
-                                                     int srcDevice,
-                                                     size_t count) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(void *, int, const void *, int, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpyPeer");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dst, dstDevice, src, srcDevice, count);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMemcpy2D(void *dst, size_t dpitch,
-                                                   const void *src,
-                                                   size_t spitch, size_t width,
-                                                   size_t height,
-                                                   enum cudaMemcpyKind kind) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void *, size_t, const void *, size_t,
-                                           size_t, size_t, enum cudaMemcpyKind);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpy2D");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dst, dpitch, src, spitch, width, height, kind);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMemcpy2DToArray(
-    cudaArray_t dst, size_t wOffset, size_t hOffset, const void *src,
-    size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaArray_t, size_t, size_t, const void *,
-                               size_t, size_t, size_t, enum cudaMemcpyKind);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpy2DToArray");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dst, wOffset, hOffset, src, spitch, width, height, kind);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMemcpy2DFromArray(
-    void *dst, size_t dpitch, cudaArray_const_t src, size_t wOffset,
-    size_t hOffset, size_t width, size_t height, enum cudaMemcpyKind kind) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(void *, size_t, cudaArray_const_t, size_t,
-                               size_t, size_t, size_t, enum cudaMemcpyKind);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpy2DFromArray");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dst, dpitch, src, wOffset, hOffset, width, height, kind);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMemcpy2DArrayToArray(
-    cudaArray_t dst, size_t wOffsetDst, size_t hOffsetDst,
-    cudaArray_const_t src, size_t wOffsetSrc, size_t hOffsetSrc, size_t width,
-    size_t height, enum cudaMemcpyKind kind __dv(cudaMemcpyDeviceToDevice)) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaArray_t, size_t, size_t,
-                                           cudaArray_const_t, size_t, size_t,
-                                           size_t, size_t, enum cudaMemcpyKind);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpy2DArrayToArray");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dst, wOffsetDst, hOffsetDst, src, wOffsetSrc, hOffsetSrc,
-                  width, height, kind);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMemcpyToSymbol(
-    const void *symbol, const void *src, size_t count, size_t offset __dv(0),
-    enum cudaMemcpyKind kind __dv(cudaMemcpyHostToDevice)) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(const void *, const void *, size_t,
-                                           size_t, enum cudaMemcpyKind);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpyToSymbol");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(symbol, src, count, offset, kind);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMemcpyFromSymbol(
-    void *dst, const void *symbol, size_t count, size_t offset __dv(0),
-    enum cudaMemcpyKind kind __dv(cudaMemcpyDeviceToHost)) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void *, const void *, size_t, size_t,
-                                           enum cudaMemcpyKind);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpyFromSymbol");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dst, symbol, count, offset, kind);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaMemcpyAsync(void *dst, const void *src, size_t count,
-                enum cudaMemcpyKind kind, cudaStream_t stream __dv(0)) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void *, const void *, size_t,
-                                           enum cudaMemcpyKind, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpyAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dst, src, count, kind, stream);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaMemcpyPeerAsync(void *dst, int dstDevice, const void *src, int srcDevice,
-                    size_t count, cudaStream_t stream __dv(0)) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void *, int, const void *, int,
-                                           size_t, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpyPeerAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dst, dstDevice, src, srcDevice, count, stream);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpy2DAsync(
-    void *dst, size_t dpitch, const void *src, size_t spitch, size_t width,
-    size_t height, enum cudaMemcpyKind kind, cudaStream_t stream __dv(0)) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(void *, size_t, const void *, size_t, size_t,
-                               size_t, enum cudaMemcpyKind, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpy2DAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dst, dpitch, src, spitch, width, height, kind, stream);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMemcpy2DToArrayAsync(
-    cudaArray_t dst, size_t wOffset, size_t hOffset, const void *src,
-    size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind,
-    cudaStream_t stream __dv(0)) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaArray_t, size_t, size_t,
-                                           const void *, size_t, size_t, size_t,
-                                           enum cudaMemcpyKind, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpy2DToArrayAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dst, wOffset, hOffset, src, spitch, width, height, kind,
-                  stream);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMemcpy2DFromArrayAsync(
-    void *dst, size_t dpitch, cudaArray_const_t src, size_t wOffset,
-    size_t hOffset, size_t width, size_t height, enum cudaMemcpyKind kind,
-    cudaStream_t stream __dv(0)) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void *, size_t, cudaArray_const_t,
-                                           size_t, size_t, size_t, size_t,
-                                           enum cudaMemcpyKind, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpy2DFromArrayAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dst, dpitch, src, wOffset, hOffset, width, height, kind,
-                  stream);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMemcpyToSymbolAsync(
-    const void *symbol, const void *src, size_t count, size_t offset,
-    enum cudaMemcpyKind kind, cudaStream_t stream __dv(0)) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(const void *, const void *, size_t, size_t,
-                               enum cudaMemcpyKind, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpyToSymbolAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(symbol, src, count, offset, kind, stream);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMemcpyFromSymbolAsync(
-    void *dst, const void *symbol, size_t count, size_t offset,
-    enum cudaMemcpyKind kind, cudaStream_t stream __dv(0)) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void *, const void *, size_t, size_t,
-                                           enum cudaMemcpyKind, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpyFromSymbolAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dst, symbol, count, offset, kind, stream);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMemset(void *devPtr, int value,
-                                                 size_t count) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void *, int, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemset");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(devPtr, value, count);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMemset2D(void *devPtr, size_t pitch,
-                                                   int value, size_t width,
-                                                   size_t height) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void *, size_t, int, size_t, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemset2D");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(devPtr, pitch, value, width, height);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMemset3D(
-    struct cudaPitchedPtr pitchedDevPtr, int value, struct cudaExtent extent) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(struct cudaPitchedPtr, int, struct cudaExtent);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemset3D");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pitchedDevPtr, value, extent);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemsetAsync(
-    void *devPtr, int value, size_t count, cudaStream_t stream __dv(0)) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void *, int, size_t, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemsetAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(devPtr, value, count, stream);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaMemset2DAsync(void *devPtr, size_t pitch, int value, size_t width,
-                  size_t height, cudaStream_t stream __dv(0)) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void *, size_t, int, size_t, size_t,
-                                           cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemset2DAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(devPtr, pitch, value, width, height, stream);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaMemset3DAsync(struct cudaPitchedPtr pitchedDevPtr, int value,
-                  struct cudaExtent extent, cudaStream_t stream __dv(0)) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(struct cudaPitchedPtr, int,
-                                           struct cudaExtent, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemset3DAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pitchedDevPtr, value, extent, stream);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGetSymbolAddress(void **devPtr,
-                                                           const void *symbol) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void **, const void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGetSymbolAddress");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(devPtr, symbol);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGetSymbolSize(size_t *size,
-                                                        const void *symbol) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(size_t *, const void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGetSymbolSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(size, symbol);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaMemPrefetchAsync(const void *devPtr, size_t count, int dstDevice,
-                     cudaStream_t stream __dv(0)) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(const void *, size_t, int, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemPrefetchAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(devPtr, count, dstDevice, stream);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaMemAdvise(const void *devPtr, size_t count, enum cudaMemoryAdvise advice,
-              int device) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(const void *, size_t,
-                                           enum cudaMemoryAdvise, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemAdvise");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(devPtr, count, advice, device);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMemRangeGetAttribute(
-    void *data, size_t dataSize, enum cudaMemRangeAttribute attribute,
-    const void *devPtr, size_t count) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(
-      void *, size_t, enum cudaMemRangeAttribute, const void *, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemRangeGetAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(data, dataSize, attribute, devPtr, count);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMemRangeGetAttributes(
-    void **data, size_t *dataSizes, enum cudaMemRangeAttribute *attributes,
-    size_t numAttributes, const void *devPtr, size_t count) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(void **, size_t *, enum cudaMemRangeAttribute *,
-                               size_t, const void *, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemRangeGetAttributes");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(data, dataSizes, attributes, numAttributes, devPtr, count);
-}
-
-extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI
-cudaMemcpyToArray(cudaArray_t dst, size_t wOffset, size_t hOffset,
-                  const void *src, size_t count, enum cudaMemcpyKind kind) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(
-      cudaArray_t, size_t, size_t, const void *, size_t, enum cudaMemcpyKind);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpyToArray");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dst, wOffset, hOffset, src, count, kind);
-}
-
-extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI
-cudaMemcpyFromArray(void *dst, cudaArray_const_t src, size_t wOffset,
-                    size_t hOffset, size_t count, enum cudaMemcpyKind kind) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void *, cudaArray_const_t, size_t,
-                                           size_t, size_t, enum cudaMemcpyKind);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpyFromArray");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dst, src, wOffset, hOffset, count, kind);
-}
-
-extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaMemcpyArrayToArray(
-    cudaArray_t dst, size_t wOffsetDst, size_t hOffsetDst,
-    cudaArray_const_t src, size_t wOffsetSrc, size_t hOffsetSrc, size_t count,
-    enum cudaMemcpyKind kind __dv(cudaMemcpyDeviceToDevice)) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaArray_t, size_t, size_t, cudaArray_const_t,
-                               size_t, size_t, size_t, enum cudaMemcpyKind);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpyArrayToArray");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dst, wOffsetDst, hOffsetDst, src, wOffsetSrc, hOffsetSrc,
-                  count, kind);
-}
-
-extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaMemcpyToArrayAsync(
-    cudaArray_t dst, size_t wOffset, size_t hOffset, const void *src,
-    size_t count, enum cudaMemcpyKind kind, cudaStream_t stream __dv(0)) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaArray_t, size_t, size_t, const void *,
-                               size_t, enum cudaMemcpyKind, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpyToArrayAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dst, wOffset, hOffset, src, count, kind, stream);
-}
-
-extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI
-cudaMemcpyFromArrayAsync(void *dst, cudaArray_const_t src, size_t wOffset,
-                         size_t hOffset, size_t count, enum cudaMemcpyKind kind,
-                         cudaStream_t stream __dv(0)) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(void *, cudaArray_const_t, size_t, size_t,
-                               size_t, enum cudaMemcpyKind, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpyFromArrayAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dst, src, wOffset, hOffset, count, kind, stream);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMallocAsync(void **devPtr,
-                                                      size_t size,
-                                                      cudaStream_t hStream) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void **, size_t, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMallocAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(devPtr, size, hStream);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaFreeAsync(void *devPtr,
-                                                    cudaStream_t hStream) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void *, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaFreeAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(devPtr, hStream);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMemPoolTrimTo(cudaMemPool_t memPool,
-                                                        size_t minBytesToKeep) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaMemPool_t, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemPoolTrimTo");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(memPool, minBytesToKeep);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMemPoolSetAttribute(
-    cudaMemPool_t memPool, enum cudaMemPoolAttr attr, void *value) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaMemPool_t, enum cudaMemPoolAttr, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemPoolSetAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(memPool, attr, value);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMemPoolGetAttribute(
-    cudaMemPool_t memPool, enum cudaMemPoolAttr attr, void *value) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaMemPool_t, enum cudaMemPoolAttr, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemPoolGetAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(memPool, attr, value);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaMemPoolSetAccess(cudaMemPool_t memPool,
-                     const struct cudaMemAccessDesc *descList, size_t count) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(
-      cudaMemPool_t, const struct cudaMemAccessDesc *, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemPoolSetAccess");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(memPool, descList, count);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaMemPoolGetAccess(enum cudaMemAccessFlags *flags, cudaMemPool_t memPool,
-                     struct cudaMemLocation *location) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(
-      enum cudaMemAccessFlags *, cudaMemPool_t, struct cudaMemLocation *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemPoolGetAccess");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(flags, memPool, location);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMemPoolCreate(
-    cudaMemPool_t *memPool, const struct cudaMemPoolProps *poolProps) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaMemPool_t *,
-                                           const struct cudaMemPoolProps *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemPoolCreate");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(memPool, poolProps);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaMemPoolDestroy(cudaMemPool_t memPool) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaMemPool_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemPoolDestroy");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(memPool);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMallocFromPoolAsync(
-    void **ptr, size_t size, cudaMemPool_t memPool, cudaStream_t stream) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(void **, size_t, cudaMemPool_t, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMallocFromPoolAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(ptr, size, memPool, stream);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMemPoolExportToShareableHandle(
-    void *shareableHandle, cudaMemPool_t memPool,
-    enum cudaMemAllocationHandleType handleType, unsigned int flags) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(
-      void *, cudaMemPool_t, enum cudaMemAllocationHandleType, unsigned int);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaMemPoolExportToShareableHandle");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(shareableHandle, memPool, handleType, flags);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMemPoolImportFromShareableHandle(
-    cudaMemPool_t *memPool, void *shareableHandle,
-    enum cudaMemAllocationHandleType handleType, unsigned int flags) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(
-      cudaMemPool_t *, void *, enum cudaMemAllocationHandleType, unsigned int);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaMemPoolImportFromShareableHandle");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(memPool, shareableHandle, handleType, flags);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMemPoolExportPointer(
-    struct cudaMemPoolPtrExportData *exportData, void *ptr) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(struct cudaMemPoolPtrExportData *, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemPoolExportPointer");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(exportData, ptr);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaMemPoolImportPointer(void **ptr, cudaMemPool_t memPool,
-                         struct cudaMemPoolPtrExportData *exportData) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void **, cudaMemPool_t,
-                                           struct cudaMemPoolPtrExportData *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemPoolImportPointer");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(ptr, memPool, exportData);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaPointerGetAttributes(
-    struct cudaPointerAttributes *attributes, const void *ptr) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(struct cudaPointerAttributes *, const void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaPointerGetAttributes");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(attributes, ptr);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaDeviceCanAccessPeer(int *canAccessPeer, int device, int peerDevice) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(int *, int, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaDeviceCanAccessPeer");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(canAccessPeer, device, peerDevice);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaDeviceEnablePeerAccess(int peerDevice, unsigned int flags) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(int, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaDeviceEnablePeerAccess");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(peerDevice, flags);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaDeviceDisablePeerAccess(int peerDevice) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaDeviceDisablePeerAccess");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(peerDevice);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaGraphicsUnregisterResource(cudaGraphicsResource_t resource) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraphicsResource_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphicsUnregisterResource");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(resource);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphicsResourceSetMapFlags(
-    cudaGraphicsResource_t resource, unsigned int flags) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaGraphicsResource_t, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphicsResourceSetMapFlags");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(resource, flags);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphicsMapResources(
-    int count, cudaGraphicsResource_t *resources, cudaStream_t stream __dv(0)) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(int, cudaGraphicsResource_t *, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphicsMapResources");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(count, resources, stream);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphicsUnmapResources(
-    int count, cudaGraphicsResource_t *resources, cudaStream_t stream __dv(0)) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(int, cudaGraphicsResource_t *, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphicsUnmapResources");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(count, resources, stream);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphicsResourceGetMappedPointer(
-    void **devPtr, size_t *size, cudaGraphicsResource_t resource) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(void **, size_t *, cudaGraphicsResource_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaGraphicsResourceGetMappedPointer");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(devPtr, size, resource);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphicsSubResourceGetMappedArray(
-    cudaArray_t *array, cudaGraphicsResource_t resource,
-    unsigned int arrayIndex, unsigned int mipLevel) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(
-      cudaArray_t *, cudaGraphicsResource_t, unsigned int, unsigned int);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaGraphicsSubResourceGetMappedArray");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(array, resource, arrayIndex, mipLevel);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaGraphicsResourceGetMappedMipmappedArray(
-    cudaMipmappedArray_t *mipmappedArray, cudaGraphicsResource_t resource) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaMipmappedArray_t *, cudaGraphicsResource_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaGraphicsResourceGetMappedMipmappedArray");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(mipmappedArray, resource);
-}
-
-extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaBindTexture(
-    size_t *offset, const struct textureReference *texref, const void *devPtr,
-    const struct cudaChannelFormatDesc *desc, size_t size __dv(UINT_MAX)) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(
-      size_t *, const struct textureReference *, const void *,
-      const struct cudaChannelFormatDesc *, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaBindTexture");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(offset, texref, devPtr, desc, size);
-}
-
-extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI
-cudaBindTexture2D(size_t *offset, const struct textureReference *texref,
-                  const void *devPtr, const struct cudaChannelFormatDesc *desc,
-                  size_t width, size_t height, size_t pitch) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(
-      size_t *, const struct textureReference *, const void *,
-      const struct cudaChannelFormatDesc *, size_t, size_t, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaBindTexture2D");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(offset, texref, devPtr, desc, width, height, pitch);
-}
-
-extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaBindTextureToArray(
-    const struct textureReference *texref, cudaArray_const_t array,
-    const struct cudaChannelFormatDesc *desc) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(
-      const struct textureReference *, cudaArray_const_t,
-      const struct cudaChannelFormatDesc *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaBindTextureToArray");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(texref, array, desc);
-}
-
-extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI
-cudaBindTextureToMipmappedArray(const struct textureReference *texref,
-                                cudaMipmappedArray_const_t mipmappedArray,
-                                const struct cudaChannelFormatDesc *desc) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(
-      const struct textureReference *, cudaMipmappedArray_const_t,
-      const struct cudaChannelFormatDesc *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaBindTextureToMipmappedArray");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(texref, mipmappedArray, desc);
-}
-
-extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI
-cudaUnbindTexture(const struct textureReference *texref) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(const struct textureReference *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaUnbindTexture");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(texref);
-}
-
-extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI
-cudaGetTextureAlignmentOffset(size_t *offset,
-                              const struct textureReference *texref) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(size_t *, const struct textureReference *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGetTextureAlignmentOffset");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(offset, texref);
-}
-
-extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaGetTextureReference(
-    const struct textureReference **texref, const void *symbol) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(const struct textureReference **, const void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGetTextureReference");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(texref, symbol);
-}
-
-extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaBindSurfaceToArray(
-    const struct surfaceReference *surfref, cudaArray_const_t array,
-    const struct cudaChannelFormatDesc *desc) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(
-      const struct surfaceReference *, cudaArray_const_t,
-      const struct cudaChannelFormatDesc *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaBindSurfaceToArray");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(surfref, array, desc);
-}
-
-extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaGetSurfaceReference(
-    const struct surfaceReference **surfref, const void *symbol) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(const struct surfaceReference **, const void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGetSurfaceReference");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(surfref, symbol);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGetChannelDesc(
-    struct cudaChannelFormatDesc *desc, cudaArray_const_t array) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(struct cudaChannelFormatDesc *,
-                                           cudaArray_const_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGetChannelDesc");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(desc, array);
-}
-
-extern __host__ struct cudaChannelFormatDesc CUDARTAPI cudaCreateChannelDesc(
-    int x, int y, int z, int w, enum cudaChannelFormatKind f) {
-  using FuncPtr = struct cudaChannelFormatDesc(CUDARTAPI *)(
-      int, int, int, int, enum cudaChannelFormatKind);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaCreateChannelDesc");
-  return func_ptr(x, y, z, w, f);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaCreateTextureObject(
-    cudaTextureObject_t *pTexObject, const struct cudaResourceDesc *pResDesc,
-    const struct cudaTextureDesc *pTexDesc,
-    const struct cudaResourceViewDesc *pResViewDesc) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(
-      cudaTextureObject_t *, const struct cudaResourceDesc *,
-      const struct cudaTextureDesc *, const struct cudaResourceViewDesc *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaCreateTextureObject");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pTexObject, pResDesc, pTexDesc, pResViewDesc);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaCreateTextureObject_v2(
-    cudaTextureObject_t *pTexObject, const struct cudaResourceDesc *pResDesc,
-    const struct cudaTextureDesc_v2 *pTexDesc,
-    const struct cudaResourceViewDesc *pResViewDesc) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(
-      cudaTextureObject_t *, const struct cudaResourceDesc *,
-      const struct cudaTextureDesc_v2 *, const struct cudaResourceViewDesc *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaCreateTextureObject_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pTexObject, pResDesc, pTexDesc, pResViewDesc);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaDestroyTextureObject(cudaTextureObject_t texObject) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaTextureObject_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaDestroyTextureObject");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(texObject);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGetTextureObjectResourceDesc(
-    struct cudaResourceDesc *pResDesc, cudaTextureObject_t texObject) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(struct cudaResourceDesc *, cudaTextureObject_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaGetTextureObjectResourceDesc");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pResDesc, texObject);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGetTextureObjectTextureDesc(
-    struct cudaTextureDesc *pTexDesc, cudaTextureObject_t texObject) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(struct cudaTextureDesc *, cudaTextureObject_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGetTextureObjectTextureDesc");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pTexDesc, texObject);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGetTextureObjectTextureDesc_v2(
-    struct cudaTextureDesc_v2 *pTexDesc, cudaTextureObject_t texObject) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(struct cudaTextureDesc_v2 *,
-                                           cudaTextureObject_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaGetTextureObjectTextureDesc_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pTexDesc, texObject);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGetTextureObjectResourceViewDesc(
-    struct cudaResourceViewDesc *pResViewDesc, cudaTextureObject_t texObject) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(struct cudaResourceViewDesc *,
-                                           cudaTextureObject_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaGetTextureObjectResourceViewDesc");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pResViewDesc, texObject);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaCreateSurfaceObject(
-    cudaSurfaceObject_t *pSurfObject, const struct cudaResourceDesc *pResDesc) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaSurfaceObject_t *,
-                                           const struct cudaResourceDesc *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaCreateSurfaceObject");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pSurfObject, pResDesc);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaDestroySurfaceObject(cudaSurfaceObject_t surfObject) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaSurfaceObject_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaDestroySurfaceObject");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(surfObject);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGetSurfaceObjectResourceDesc(
-    struct cudaResourceDesc *pResDesc, cudaSurfaceObject_t surfObject) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(struct cudaResourceDesc *, cudaSurfaceObject_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaGetSurfaceObjectResourceDesc");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pResDesc, surfObject);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaDriverGetVersion(int *driverVersion) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaDriverGetVersion");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(driverVersion);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaRuntimeGetVersion(int *runtimeVersion) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaRuntimeGetVersion");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(runtimeVersion);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphCreate(cudaGraph_t *pGraph,
-                                                      unsigned int flags) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraph_t *, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphCreate");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pGraph, flags);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphAddKernelNode(
-    cudaGraphNode_t *pGraphNode, cudaGraph_t graph,
-    const cudaGraphNode_t *pDependencies, size_t numDependencies,
-    const struct cudaKernelNodeParams *pNodeParams) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraphNode_t *, cudaGraph_t,
-                                           const cudaGraphNode_t *, size_t,
-                                           const struct cudaKernelNodeParams *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphAddKernelNode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pGraphNode, graph, pDependencies, numDependencies,
-                  pNodeParams);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphKernelNodeGetParams(
-    cudaGraphNode_t node, struct cudaKernelNodeParams *pNodeParams) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaGraphNode_t, struct cudaKernelNodeParams *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphKernelNodeGetParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(node, pNodeParams);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphKernelNodeSetParams(
-    cudaGraphNode_t node, const struct cudaKernelNodeParams *pNodeParams) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraphNode_t,
-                                           const struct cudaKernelNodeParams *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphKernelNodeSetParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(node, pNodeParams);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaGraphKernelNodeCopyAttributes(cudaGraphNode_t hSrc, cudaGraphNode_t hDst) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraphNode_t, cudaGraphNode_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaGraphKernelNodeCopyAttributes");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hSrc, hDst);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphKernelNodeGetAttribute(
-    cudaGraphNode_t hNode, cudaKernelNodeAttrID attr,
-    cudaKernelNodeAttrValue *value_out) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(
-      cudaGraphNode_t, cudaKernelNodeAttrID, cudaKernelNodeAttrValue *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphKernelNodeGetAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hNode, attr, value_out);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphKernelNodeSetAttribute(
-    cudaGraphNode_t hNode, cudaKernelNodeAttrID attr,
-    const cudaKernelNodeAttrValue *value) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(
-      cudaGraphNode_t, cudaKernelNodeAttrID, const cudaKernelNodeAttrValue *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphKernelNodeSetAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hNode, attr, value);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphAddMemcpyNode(
-    cudaGraphNode_t *pGraphNode, cudaGraph_t graph,
-    const cudaGraphNode_t *pDependencies, size_t numDependencies,
-    const struct cudaMemcpy3DParms *pCopyParams) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraphNode_t *, cudaGraph_t,
-                                           const cudaGraphNode_t *, size_t,
-                                           const struct cudaMemcpy3DParms *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphAddMemcpyNode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pGraphNode, graph, pDependencies, numDependencies,
-                  pCopyParams);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphAddMemcpyNodeToSymbol(
-    cudaGraphNode_t *pGraphNode, cudaGraph_t graph,
-    const cudaGraphNode_t *pDependencies, size_t numDependencies,
-    const void *symbol, const void *src, size_t count, size_t offset,
-    enum cudaMemcpyKind kind) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(
-      cudaGraphNode_t *, cudaGraph_t, const cudaGraphNode_t *, size_t,
-      const void *, const void *, size_t, size_t, enum cudaMemcpyKind);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphAddMemcpyNodeToSymbol");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pGraphNode, graph, pDependencies, numDependencies, symbol,
-                  src, count, offset, kind);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphAddMemcpyNodeFromSymbol(
-    cudaGraphNode_t *pGraphNode, cudaGraph_t graph,
-    const cudaGraphNode_t *pDependencies, size_t numDependencies, void *dst,
-    const void *symbol, size_t count, size_t offset, enum cudaMemcpyKind kind) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(
-      cudaGraphNode_t *, cudaGraph_t, const cudaGraphNode_t *, size_t, void *,
-      const void *, size_t, size_t, enum cudaMemcpyKind);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaGraphAddMemcpyNodeFromSymbol");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pGraphNode, graph, pDependencies, numDependencies, dst,
-                  symbol, count, offset, kind);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphAddMemcpyNode1D(
-    cudaGraphNode_t *pGraphNode, cudaGraph_t graph,
-    const cudaGraphNode_t *pDependencies, size_t numDependencies, void *dst,
-    const void *src, size_t count, enum cudaMemcpyKind kind) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(
-      cudaGraphNode_t *, cudaGraph_t, const cudaGraphNode_t *, size_t, void *,
-      const void *, size_t, enum cudaMemcpyKind);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphAddMemcpyNode1D");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pGraphNode, graph, pDependencies, numDependencies, dst, src,
-                  count, kind);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphMemcpyNodeGetParams(
-    cudaGraphNode_t node, struct cudaMemcpy3DParms *pNodeParams) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaGraphNode_t, struct cudaMemcpy3DParms *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphMemcpyNodeGetParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(node, pNodeParams);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphMemcpyNodeSetParams(
-    cudaGraphNode_t node, const struct cudaMemcpy3DParms *pNodeParams) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraphNode_t,
-                                           const struct cudaMemcpy3DParms *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphMemcpyNodeSetParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(node, pNodeParams);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphMemcpyNodeSetParamsToSymbol(
-    cudaGraphNode_t node, const void *symbol, const void *src, size_t count,
-    size_t offset, enum cudaMemcpyKind kind) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaGraphNode_t, const void *, const void *,
-                               size_t, size_t, enum cudaMemcpyKind);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaGraphMemcpyNodeSetParamsToSymbol");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(node, symbol, src, count, offset, kind);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphMemcpyNodeSetParamsFromSymbol(
-    cudaGraphNode_t node, void *dst, const void *symbol, size_t count,
-    size_t offset, enum cudaMemcpyKind kind) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaGraphNode_t, void *, const void *, size_t,
-                               size_t, enum cudaMemcpyKind);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaGraphMemcpyNodeSetParamsFromSymbol");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(node, dst, symbol, count, offset, kind);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaGraphMemcpyNodeSetParams1D(cudaGraphNode_t node, void *dst, const void *src,
-                               size_t count, enum cudaMemcpyKind kind) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(
-      cudaGraphNode_t, void *, const void *, size_t, enum cudaMemcpyKind);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphMemcpyNodeSetParams1D");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(node, dst, src, count, kind);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphAddMemsetNode(
-    cudaGraphNode_t *pGraphNode, cudaGraph_t graph,
-    const cudaGraphNode_t *pDependencies, size_t numDependencies,
-    const struct cudaMemsetParams *pMemsetParams) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraphNode_t *, cudaGraph_t,
-                                           const cudaGraphNode_t *, size_t,
-                                           const struct cudaMemsetParams *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphAddMemsetNode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pGraphNode, graph, pDependencies, numDependencies,
-                  pMemsetParams);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphMemsetNodeGetParams(
-    cudaGraphNode_t node, struct cudaMemsetParams *pNodeParams) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaGraphNode_t, struct cudaMemsetParams *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphMemsetNodeGetParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(node, pNodeParams);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphMemsetNodeSetParams(
-    cudaGraphNode_t node, const struct cudaMemsetParams *pNodeParams) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraphNode_t,
-                                           const struct cudaMemsetParams *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphMemsetNodeSetParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(node, pNodeParams);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphAddHostNode(
-    cudaGraphNode_t *pGraphNode, cudaGraph_t graph,
-    const cudaGraphNode_t *pDependencies, size_t numDependencies,
-    const struct cudaHostNodeParams *pNodeParams) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraphNode_t *, cudaGraph_t,
-                                           const cudaGraphNode_t *, size_t,
-                                           const struct cudaHostNodeParams *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphAddHostNode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pGraphNode, graph, pDependencies, numDependencies,
-                  pNodeParams);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphHostNodeGetParams(
-    cudaGraphNode_t node, struct cudaHostNodeParams *pNodeParams) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaGraphNode_t, struct cudaHostNodeParams *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphHostNodeGetParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(node, pNodeParams);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphHostNodeSetParams(
-    cudaGraphNode_t node, const struct cudaHostNodeParams *pNodeParams) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraphNode_t,
-                                           const struct cudaHostNodeParams *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphHostNodeSetParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(node, pNodeParams);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaGraphAddChildGraphNode(cudaGraphNode_t *pGraphNode, cudaGraph_t graph,
-                           const cudaGraphNode_t *pDependencies,
-                           size_t numDependencies, cudaGraph_t childGraph) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaGraphNode_t *, cudaGraph_t,
-                               const cudaGraphNode_t *, size_t, cudaGraph_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphAddChildGraphNode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pGraphNode, graph, pDependencies, numDependencies,
-                  childGraph);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaGraphChildGraphNodeGetGraph(cudaGraphNode_t node, cudaGraph_t *pGraph) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraphNode_t, cudaGraph_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphChildGraphNodeGetGraph");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(node, pGraph);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphAddEmptyNode(
-    cudaGraphNode_t *pGraphNode, cudaGraph_t graph,
-    const cudaGraphNode_t *pDependencies, size_t numDependencies) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraphNode_t *, cudaGraph_t,
-                                           const cudaGraphNode_t *, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphAddEmptyNode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pGraphNode, graph, pDependencies, numDependencies);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaGraphAddEventRecordNode(cudaGraphNode_t *pGraphNode, cudaGraph_t graph,
-                            const cudaGraphNode_t *pDependencies,
-                            size_t numDependencies, cudaEvent_t event) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaGraphNode_t *, cudaGraph_t,
-                               const cudaGraphNode_t *, size_t, cudaEvent_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphAddEventRecordNode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pGraphNode, graph, pDependencies, numDependencies, event);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaGraphEventRecordNodeGetEvent(cudaGraphNode_t node, cudaEvent_t *event_out) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraphNode_t, cudaEvent_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaGraphEventRecordNodeGetEvent");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(node, event_out);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaGraphEventRecordNodeSetEvent(cudaGraphNode_t node, cudaEvent_t event) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraphNode_t, cudaEvent_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaGraphEventRecordNodeSetEvent");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(node, event);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaGraphAddEventWaitNode(cudaGraphNode_t *pGraphNode, cudaGraph_t graph,
-                          const cudaGraphNode_t *pDependencies,
-                          size_t numDependencies, cudaEvent_t event) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaGraphNode_t *, cudaGraph_t,
-                               const cudaGraphNode_t *, size_t, cudaEvent_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphAddEventWaitNode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pGraphNode, graph, pDependencies, numDependencies, event);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaGraphEventWaitNodeGetEvent(cudaGraphNode_t node, cudaEvent_t *event_out) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraphNode_t, cudaEvent_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphEventWaitNodeGetEvent");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(node, event_out);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaGraphEventWaitNodeSetEvent(cudaGraphNode_t node, cudaEvent_t event) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraphNode_t, cudaEvent_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphEventWaitNodeSetEvent");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(node, event);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphAddExternalSemaphoresSignalNode(
-    cudaGraphNode_t *pGraphNode, cudaGraph_t graph,
-    const cudaGraphNode_t *pDependencies, size_t numDependencies,
-    const struct cudaExternalSemaphoreSignalNodeParams *nodeParams) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(
-      cudaGraphNode_t *, cudaGraph_t, const cudaGraphNode_t *, size_t,
-      const struct cudaExternalSemaphoreSignalNodeParams *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaGraphAddExternalSemaphoresSignalNode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pGraphNode, graph, pDependencies, numDependencies,
-                  nodeParams);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaGraphExternalSemaphoresSignalNodeGetParams(
-    cudaGraphNode_t hNode,
-    struct cudaExternalSemaphoreSignalNodeParams *params_out) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(
-      cudaGraphNode_t, struct cudaExternalSemaphoreSignalNodeParams *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaGraphExternalSemaphoresSignalNodeGetParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hNode, params_out);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaGraphExternalSemaphoresSignalNodeSetParams(
-    cudaGraphNode_t hNode,
-    const struct cudaExternalSemaphoreSignalNodeParams *nodeParams) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(
-      cudaGraphNode_t, const struct cudaExternalSemaphoreSignalNodeParams *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaGraphExternalSemaphoresSignalNodeSetParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hNode, nodeParams);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphAddExternalSemaphoresWaitNode(
-    cudaGraphNode_t *pGraphNode, cudaGraph_t graph,
-    const cudaGraphNode_t *pDependencies, size_t numDependencies,
-    const struct cudaExternalSemaphoreWaitNodeParams *nodeParams) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(
-      cudaGraphNode_t *, cudaGraph_t, const cudaGraphNode_t *, size_t,
-      const struct cudaExternalSemaphoreWaitNodeParams *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaGraphAddExternalSemaphoresWaitNode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pGraphNode, graph, pDependencies, numDependencies,
-                  nodeParams);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaGraphExternalSemaphoresWaitNodeGetParams(
-    cudaGraphNode_t hNode,
-    struct cudaExternalSemaphoreWaitNodeParams *params_out) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(
-      cudaGraphNode_t, struct cudaExternalSemaphoreWaitNodeParams *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaGraphExternalSemaphoresWaitNodeGetParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hNode, params_out);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaGraphExternalSemaphoresWaitNodeSetParams(
-    cudaGraphNode_t hNode,
-    const struct cudaExternalSemaphoreWaitNodeParams *nodeParams) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(
-      cudaGraphNode_t, const struct cudaExternalSemaphoreWaitNodeParams *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaGraphExternalSemaphoresWaitNodeSetParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hNode, nodeParams);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphAddMemAllocNode(
-    cudaGraphNode_t *pGraphNode, cudaGraph_t graph,
-    const cudaGraphNode_t *pDependencies, size_t numDependencies,
-    struct cudaMemAllocNodeParams *nodeParams) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraphNode_t *, cudaGraph_t,
-                                           const cudaGraphNode_t *, size_t,
-                                           struct cudaMemAllocNodeParams *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphAddMemAllocNode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pGraphNode, graph, pDependencies, numDependencies,
-                  nodeParams);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphMemAllocNodeGetParams(
-    cudaGraphNode_t node, struct cudaMemAllocNodeParams *params_out) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraphNode_t,
-                                           struct cudaMemAllocNodeParams *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphMemAllocNodeGetParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(node, params_out);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphAddMemFreeNode(
-    cudaGraphNode_t *pGraphNode, cudaGraph_t graph,
-    const cudaGraphNode_t *pDependencies, size_t numDependencies, void *dptr) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(
-      cudaGraphNode_t *, cudaGraph_t, const cudaGraphNode_t *, size_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphAddMemFreeNode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pGraphNode, graph, pDependencies, numDependencies, dptr);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaGraphMemFreeNodeGetParams(cudaGraphNode_t node, void *dptr_out) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraphNode_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphMemFreeNodeGetParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(node, dptr_out);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaDeviceGraphMemTrim(int device) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaDeviceGraphMemTrim");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(device);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaDeviceGetGraphMemAttribute(
-    int device, enum cudaGraphMemAttributeType attr, void *value) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(int, enum cudaGraphMemAttributeType, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaDeviceGetGraphMemAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(device, attr, value);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaDeviceSetGraphMemAttribute(
-    int device, enum cudaGraphMemAttributeType attr, void *value) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(int, enum cudaGraphMemAttributeType, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaDeviceSetGraphMemAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(device, attr, value);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaGraphClone(cudaGraph_t *pGraphClone, cudaGraph_t originalGraph) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraph_t *, cudaGraph_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphClone");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pGraphClone, originalGraph);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaGraphNodeFindInClone(cudaGraphNode_t *pNode, cudaGraphNode_t originalNode,
-                         cudaGraph_t clonedGraph) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaGraphNode_t *, cudaGraphNode_t, cudaGraph_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphNodeFindInClone");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pNode, originalNode, clonedGraph);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaGraphNodeGetType(cudaGraphNode_t node, enum cudaGraphNodeType *pType) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaGraphNode_t, enum cudaGraphNodeType *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphNodeGetType");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(node, pType);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphGetNodes(cudaGraph_t graph,
-                                                        cudaGraphNode_t *nodes,
-                                                        size_t *numNodes) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaGraph_t, cudaGraphNode_t *, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphGetNodes");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(graph, nodes, numNodes);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphGetRootNodes(
-    cudaGraph_t graph, cudaGraphNode_t *pRootNodes, size_t *pNumRootNodes) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaGraph_t, cudaGraphNode_t *, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphGetRootNodes");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(graph, pRootNodes, pNumRootNodes);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphGetEdges(cudaGraph_t graph,
-                                                        cudaGraphNode_t *from,
-                                                        cudaGraphNode_t *to,
-                                                        size_t *numEdges) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraph_t, cudaGraphNode_t *,
-                                           cudaGraphNode_t *, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphGetEdges");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(graph, from, to, numEdges);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphNodeGetDependencies(
-    cudaGraphNode_t node, cudaGraphNode_t *pDependencies,
-    size_t *pNumDependencies) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaGraphNode_t, cudaGraphNode_t *, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphNodeGetDependencies");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(node, pDependencies, pNumDependencies);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphNodeGetDependentNodes(
-    cudaGraphNode_t node, cudaGraphNode_t *pDependentNodes,
-    size_t *pNumDependentNodes) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaGraphNode_t, cudaGraphNode_t *, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphNodeGetDependentNodes");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(node, pDependentNodes, pNumDependentNodes);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaGraphAddDependencies(cudaGraph_t graph, const cudaGraphNode_t *from,
-                         const cudaGraphNode_t *to, size_t numDependencies) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraph_t, const cudaGraphNode_t *,
-                                           const cudaGraphNode_t *, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphAddDependencies");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(graph, from, to, numDependencies);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaGraphRemoveDependencies(cudaGraph_t graph, const cudaGraphNode_t *from,
-                            const cudaGraphNode_t *to, size_t numDependencies) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraph_t, const cudaGraphNode_t *,
-                                           const cudaGraphNode_t *, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphRemoveDependencies");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(graph, from, to, numDependencies);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaGraphDestroyNode(cudaGraphNode_t node) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraphNode_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphDestroyNode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(node);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphInstantiate(
-    cudaGraphExec_t *pGraphExec, cudaGraph_t graph, cudaGraphNode_t *pErrorNode,
-    char *pLogBuffer, size_t bufferSize) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraphExec_t *, cudaGraph_t,
-                                           cudaGraphNode_t *, char *, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphInstantiate");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pGraphExec, graph, pErrorNode, pLogBuffer, bufferSize);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphInstantiateWithFlags(
-    cudaGraphExec_t *pGraphExec, cudaGraph_t graph, unsigned long long flags) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraphExec_t *, cudaGraph_t,
-                                           unsigned long long);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphInstantiateWithFlags");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pGraphExec, graph, flags);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphExecKernelNodeSetParams(
-    cudaGraphExec_t hGraphExec, cudaGraphNode_t node,
-    const struct cudaKernelNodeParams *pNodeParams) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraphExec_t, cudaGraphNode_t,
-                                           const struct cudaKernelNodeParams *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaGraphExecKernelNodeSetParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hGraphExec, node, pNodeParams);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphExecMemcpyNodeSetParams(
-    cudaGraphExec_t hGraphExec, cudaGraphNode_t node,
-    const struct cudaMemcpy3DParms *pNodeParams) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraphExec_t, cudaGraphNode_t,
-                                           const struct cudaMemcpy3DParms *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaGraphExecMemcpyNodeSetParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hGraphExec, node, pNodeParams);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphExecMemcpyNodeSetParamsToSymbol(
-    cudaGraphExec_t hGraphExec, cudaGraphNode_t node, const void *symbol,
-    const void *src, size_t count, size_t offset, enum cudaMemcpyKind kind) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraphExec_t, cudaGraphNode_t,
-                                           const void *, const void *, size_t,
-                                           size_t, enum cudaMemcpyKind);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaGraphExecMemcpyNodeSetParamsToSymbol");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hGraphExec, node, symbol, src, count, offset, kind);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaGraphExecMemcpyNodeSetParamsFromSymbol(cudaGraphExec_t hGraphExec,
-                                           cudaGraphNode_t node, void *dst,
-                                           const void *symbol, size_t count,
-                                           size_t offset,
-                                           enum cudaMemcpyKind kind) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraphExec_t, cudaGraphNode_t,
-                                           void *, const void *, size_t, size_t,
-                                           enum cudaMemcpyKind);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaGraphExecMemcpyNodeSetParamsFromSymbol");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hGraphExec, node, dst, symbol, count, offset, kind);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphExecMemcpyNodeSetParams1D(
-    cudaGraphExec_t hGraphExec, cudaGraphNode_t node, void *dst,
-    const void *src, size_t count, enum cudaMemcpyKind kind) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaGraphExec_t, cudaGraphNode_t, void *,
-                               const void *, size_t, enum cudaMemcpyKind);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaGraphExecMemcpyNodeSetParams1D");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hGraphExec, node, dst, src, count, kind);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphExecMemsetNodeSetParams(
-    cudaGraphExec_t hGraphExec, cudaGraphNode_t node,
-    const struct cudaMemsetParams *pNodeParams) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraphExec_t, cudaGraphNode_t,
-                                           const struct cudaMemsetParams *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaGraphExecMemsetNodeSetParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hGraphExec, node, pNodeParams);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaGraphExecHostNodeSetParams(cudaGraphExec_t hGraphExec, cudaGraphNode_t node,
-                               const struct cudaHostNodeParams *pNodeParams) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraphExec_t, cudaGraphNode_t,
-                                           const struct cudaHostNodeParams *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphExecHostNodeSetParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hGraphExec, node, pNodeParams);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphExecChildGraphNodeSetParams(
-    cudaGraphExec_t hGraphExec, cudaGraphNode_t node, cudaGraph_t childGraph) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaGraphExec_t, cudaGraphNode_t, cudaGraph_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaGraphExecChildGraphNodeSetParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hGraphExec, node, childGraph);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphExecEventRecordNodeSetEvent(
-    cudaGraphExec_t hGraphExec, cudaGraphNode_t hNode, cudaEvent_t event) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaGraphExec_t, cudaGraphNode_t, cudaEvent_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaGraphExecEventRecordNodeSetEvent");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hGraphExec, hNode, event);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphExecEventWaitNodeSetEvent(
-    cudaGraphExec_t hGraphExec, cudaGraphNode_t hNode, cudaEvent_t event) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaGraphExec_t, cudaGraphNode_t, cudaEvent_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaGraphExecEventWaitNodeSetEvent");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hGraphExec, hNode, event);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaGraphExecExternalSemaphoresSignalNodeSetParams(
-    cudaGraphExec_t hGraphExec, cudaGraphNode_t hNode,
-    const struct cudaExternalSemaphoreSignalNodeParams *nodeParams) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(
-      cudaGraphExec_t, cudaGraphNode_t,
-      const struct cudaExternalSemaphoreSignalNodeParams *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaGraphExecExternalSemaphoresSignalNodeSetParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hGraphExec, hNode, nodeParams);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaGraphExecExternalSemaphoresWaitNodeSetParams(
-    cudaGraphExec_t hGraphExec, cudaGraphNode_t hNode,
-    const struct cudaExternalSemaphoreWaitNodeParams *nodeParams) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(
-      cudaGraphExec_t, cudaGraphNode_t,
-      const struct cudaExternalSemaphoreWaitNodeParams *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaGraphExecExternalSemaphoresWaitNodeSetParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hGraphExec, hNode, nodeParams);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphNodeSetEnabled(
-    cudaGraphExec_t hGraphExec, cudaGraphNode_t hNode, unsigned int isEnabled) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaGraphExec_t, cudaGraphNode_t, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphNodeSetEnabled");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hGraphExec, hNode, isEnabled);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaGraphNodeGetEnabled(cudaGraphExec_t hGraphExec, cudaGraphNode_t hNode,
-                        unsigned int *isEnabled) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraphExec_t, cudaGraphNode_t,
-                                           unsigned int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphNodeGetEnabled");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hGraphExec, hNode, isEnabled);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaGraphExecUpdate(cudaGraphExec_t hGraphExec, cudaGraph_t hGraph,
-                    cudaGraphNode_t *hErrorNode_out,
-                    enum cudaGraphExecUpdateResult *updateResult_out) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaGraphExec_t, cudaGraph_t, cudaGraphNode_t *,
-                               enum cudaGraphExecUpdateResult *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphExecUpdate");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hGraphExec, hGraph, hErrorNode_out, updateResult_out);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphUpload(cudaGraphExec_t graphExec,
-                                                      cudaStream_t stream) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraphExec_t, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphUpload");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(graphExec, stream);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphLaunch(cudaGraphExec_t graphExec,
-                                                      cudaStream_t stream) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraphExec_t, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphLaunch");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(graphExec, stream);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaGraphExecDestroy(cudaGraphExec_t graphExec) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraphExec_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphExecDestroy");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(graphExec);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphDestroy(cudaGraph_t graph) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraph_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphDestroy");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(graph);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphDebugDotPrint(
-    cudaGraph_t graph, const char *path, unsigned int flags) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaGraph_t, const char *, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphDebugDotPrint");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(graph, path, flags);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaUserObjectCreate(
-    cudaUserObject_t *object_out, void *ptr, cudaHostFn_t destroy,
-    unsigned int initialRefcount, unsigned int flags) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(
-      cudaUserObject_t *, void *, cudaHostFn_t, unsigned int, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaUserObjectCreate");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(object_out, ptr, destroy, initialRefcount, flags);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaUserObjectRetain(cudaUserObject_t object, unsigned int count __dv(1)) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaUserObject_t, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaUserObjectRetain");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(object, count);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaUserObjectRelease(cudaUserObject_t object, unsigned int count __dv(1)) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaUserObject_t, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaUserObjectRelease");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(object, count);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphRetainUserObject(
-    cudaGraph_t graph, cudaUserObject_t object, unsigned int count __dv(1),
-    unsigned int flags __dv(0)) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraph_t, cudaUserObject_t,
-                                           unsigned int, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphRetainUserObject");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(graph, object, count, flags);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphReleaseUserObject(
-    cudaGraph_t graph, cudaUserObject_t object, unsigned int count __dv(1)) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaGraph_t, cudaUserObject_t, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphReleaseUserObject");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(graph, object, count);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGetDriverEntryPoint(
-    const char *symbol, void **funcPtr, unsigned long long flags) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(const char *, void **, unsigned long long);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGetDriverEntryPoint");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(symbol, funcPtr, flags);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGetExportTable(
-    const void **ppExportTable, const cudaUUID_t *pExportTableId) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(const void **, const cudaUUID_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGetExportTable");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(ppExportTable, pExportTableId);
-}
-
-extern __host__ cudaError_t CUDARTAPI_CDECL
-cudaGetFuncBySymbol(cudaFunction_t *functionPtr, const void *symbolPtr) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaFunction_t *, const void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("_CDECL cudaGetFuncBySymbol");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(functionPtr, symbolPtr);
-}
-
-}  // extern "C"
diff --git a/third_party/xla/third_party/tsl/tsl/cuda/cuda_runtime_12_0.inc b/third_party/xla/third_party/tsl/tsl/cuda/cuda_runtime_12_0.inc
deleted file mode 100644
index b488634d7d2c21..00000000000000
--- a/third_party/xla/third_party/tsl/tsl/cuda/cuda_runtime_12_0.inc
+++ /dev/null
@@ -1,2676 +0,0 @@
-// Auto-generated, do not edit.
-
-extern "C" {
-extern __host__ cudaError_t CUDARTAPI cudaDeviceReset(void) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)();
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaDeviceReset");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr();
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaDeviceSynchronize(void) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)();
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaDeviceSynchronize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr();
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaDeviceSetLimit(enum cudaLimit limit,
-                                                         size_t value) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(enum cudaLimit, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaDeviceSetLimit");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(limit, value);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaDeviceGetLimit(size_t *pValue, enum cudaLimit limit) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(size_t *, enum cudaLimit);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaDeviceGetLimit");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pValue, limit);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaDeviceGetTexture1DLinearMaxWidth(
-    size_t *maxWidthInElements, const struct cudaChannelFormatDesc *fmtDesc,
-    int device) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(
-      size_t *, const struct cudaChannelFormatDesc *, int);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaDeviceGetTexture1DLinearMaxWidth");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(maxWidthInElements, fmtDesc, device);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaDeviceGetCacheConfig(enum cudaFuncCache *pCacheConfig) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(enum cudaFuncCache *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaDeviceGetCacheConfig");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pCacheConfig);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaDeviceGetStreamPriorityRange(int *leastPriority, int *greatestPriority) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(int *, int *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaDeviceGetStreamPriorityRange");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(leastPriority, greatestPriority);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaDeviceSetCacheConfig(enum cudaFuncCache cacheConfig) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(enum cudaFuncCache);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaDeviceSetCacheConfig");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(cacheConfig);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaDeviceGetSharedMemConfig(enum cudaSharedMemConfig *pConfig) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(enum cudaSharedMemConfig *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaDeviceGetSharedMemConfig");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pConfig);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaDeviceSetSharedMemConfig(enum cudaSharedMemConfig config) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(enum cudaSharedMemConfig);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaDeviceSetSharedMemConfig");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(config);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaDeviceGetByPCIBusId(int *device, const char *pciBusId) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(int *, const char *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaDeviceGetByPCIBusId");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(device, pciBusId);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaDeviceGetPCIBusId(char *pciBusId,
-                                                            int len,
-                                                            int device) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(char *, int, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaDeviceGetPCIBusId");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pciBusId, len, device);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaIpcGetEventHandle(cudaIpcEventHandle_t *handle, cudaEvent_t event) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaIpcEventHandle_t *, cudaEvent_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaIpcGetEventHandle");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, event);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaIpcOpenEventHandle(cudaEvent_t *event, cudaIpcEventHandle_t handle) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaEvent_t *, cudaIpcEventHandle_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaIpcOpenEventHandle");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(event, handle);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaIpcGetMemHandle(cudaIpcMemHandle_t *handle, void *devPtr) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaIpcMemHandle_t *, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaIpcGetMemHandle");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, devPtr);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaIpcOpenMemHandle(
-    void **devPtr, cudaIpcMemHandle_t handle, unsigned int flags) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(void **, cudaIpcMemHandle_t, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaIpcOpenMemHandle");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(devPtr, handle, flags);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaIpcCloseMemHandle(void *devPtr) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaIpcCloseMemHandle");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(devPtr);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaDeviceFlushGPUDirectRDMAWrites(
-    enum cudaFlushGPUDirectRDMAWritesTarget target,
-    enum cudaFlushGPUDirectRDMAWritesScope scope) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(enum cudaFlushGPUDirectRDMAWritesTarget,
-                               enum cudaFlushGPUDirectRDMAWritesScope);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaDeviceFlushGPUDirectRDMAWrites");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(target, scope);
-}
-
-extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaThreadExit(void) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)();
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaThreadExit");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr();
-}
-
-extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI
-cudaThreadSynchronize(void) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)();
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaThreadSynchronize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr();
-}
-
-extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI
-cudaThreadSetLimit(enum cudaLimit limit, size_t value) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(enum cudaLimit, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaThreadSetLimit");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(limit, value);
-}
-
-extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI
-cudaThreadGetLimit(size_t *pValue, enum cudaLimit limit) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(size_t *, enum cudaLimit);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaThreadGetLimit");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pValue, limit);
-}
-
-extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI
-cudaThreadGetCacheConfig(enum cudaFuncCache *pCacheConfig) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(enum cudaFuncCache *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaThreadGetCacheConfig");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pCacheConfig);
-}
-
-extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI
-cudaThreadSetCacheConfig(enum cudaFuncCache cacheConfig) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(enum cudaFuncCache);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaThreadSetCacheConfig");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(cacheConfig);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaGetLastError(void) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)();
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGetLastError");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr();
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaPeekAtLastError(void) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)();
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaPeekAtLastError");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr();
-}
-
-extern __host__ __cudart_builtin__ const char *CUDARTAPI
-cudaGetErrorName(cudaError_t error) {
-  using FuncPtr = const char *(CUDARTAPI *)(cudaError_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGetErrorName");
-  if (!func_ptr) return "cudaGetErrorName symbol not found.";
-  return func_ptr(error);
-}
-
-extern __host__ __cudart_builtin__ const char *CUDARTAPI
-cudaGetErrorString(cudaError_t error) {
-  using FuncPtr = const char *(CUDARTAPI *)(cudaError_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGetErrorString");
-  if (!func_ptr) return "cudaGetErrorString symbol not found.";
-  return func_ptr(error);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaGetDeviceCount(int *count) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGetDeviceCount");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(count);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaGetDeviceProperties(struct cudaDeviceProp *prop, int device) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(struct cudaDeviceProp *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGetDeviceProperties_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(prop, device);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaDeviceGetAttribute(int *value, enum cudaDeviceAttr attr, int device) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(int *, enum cudaDeviceAttr, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaDeviceGetAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(value, attr, device);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaDeviceGetDefaultMemPool(cudaMemPool_t *memPool, int device) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaMemPool_t *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaDeviceGetDefaultMemPool");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(memPool, device);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaDeviceSetMemPool(int device, cudaMemPool_t memPool) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(int, cudaMemPool_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaDeviceSetMemPool");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(device, memPool);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaDeviceGetMemPool(cudaMemPool_t *memPool, int device) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaMemPool_t *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaDeviceGetMemPool");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(memPool, device);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaDeviceGetNvSciSyncAttributes(
-    void *nvSciSyncAttrList, int device, int flags) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void *, int, int);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaDeviceGetNvSciSyncAttributes");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(nvSciSyncAttrList, device, flags);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaDeviceGetP2PAttribute(int *value, enum cudaDeviceP2PAttr attr,
-                          int srcDevice, int dstDevice) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(int *, enum cudaDeviceP2PAttr, int, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaDeviceGetP2PAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(value, attr, srcDevice, dstDevice);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaChooseDevice(int *device, const struct cudaDeviceProp *prop) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(int *, const struct cudaDeviceProp *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaChooseDevice");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(device, prop);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaInitDevice(int device,
-                                                     unsigned int deviceFlags,
-                                                     unsigned int flags) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(int, unsigned int, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaInitDevice");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(device, deviceFlags, flags);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaSetDevice(int device) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaSetDevice");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(device);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaGetDevice(int *device) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGetDevice");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(device);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaSetValidDevices(int *device_arr,
-                                                          int len) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(int *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaSetValidDevices");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(device_arr, len);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaSetDeviceFlags(unsigned int flags) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaSetDeviceFlags");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(flags);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGetDeviceFlags(unsigned int *flags) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(unsigned int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGetDeviceFlags");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(flags);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaStreamCreate(cudaStream_t *pStream) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaStream_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaStreamCreate");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pStream);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaStreamCreateWithFlags(cudaStream_t *pStream, unsigned int flags) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaStream_t *, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaStreamCreateWithFlags");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pStream, flags);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaStreamCreateWithPriority(cudaStream_t *pStream, unsigned int flags,
-                             int priority) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaStream_t *, unsigned int, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaStreamCreateWithPriority");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pStream, flags, priority);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaStreamGetPriority(cudaStream_t hStream, int *priority) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaStream_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaStreamGetPriority");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hStream, priority);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaStreamGetFlags(cudaStream_t hStream, unsigned int *flags) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaStream_t, unsigned int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaStreamGetFlags");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hStream, flags);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaStreamGetId(cudaStream_t hStream, unsigned long long *streamId) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaStream_t, unsigned long long *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaStreamGetId");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hStream, streamId);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaCtxResetPersistingL2Cache(void) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)();
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaCtxResetPersistingL2Cache");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr();
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaStreamCopyAttributes(cudaStream_t dst, cudaStream_t src) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaStream_t, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaStreamCopyAttributes");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dst, src);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaStreamGetAttribute(cudaStream_t hStream, cudaStreamAttrID attr,
-                       cudaStreamAttrValue *value_out) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaStream_t, cudaStreamAttrID,
-                                           cudaStreamAttrValue *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaStreamGetAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hStream, attr, value_out);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaStreamSetAttribute(cudaStream_t hStream, cudaStreamAttrID attr,
-                       const cudaStreamAttrValue *value) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaStream_t, cudaStreamAttrID,
-                                           const cudaStreamAttrValue *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaStreamSetAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hStream, attr, value);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaStreamDestroy(cudaStream_t stream) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaStreamDestroy");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(stream);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamWaitEvent(
-    cudaStream_t stream, cudaEvent_t event, unsigned int flags __dv(0)) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaStream_t, cudaEvent_t, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaStreamWaitEvent");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(stream, event, flags);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaStreamAddCallback(cudaStream_t stream, cudaStreamCallback_t callback,
-                      void *userData, unsigned int flags) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaStream_t, cudaStreamCallback_t,
-                                           void *, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaStreamAddCallback");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(stream, callback, userData, flags);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaStreamSynchronize(cudaStream_t stream) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaStreamSynchronize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(stream);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaStreamQuery(cudaStream_t stream) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaStreamQuery");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(stream);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaStreamAttachMemAsync(cudaStream_t stream, void *devPtr,
-                         size_t length __dv(0), unsigned int flags) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaStream_t, void *, size_t, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaStreamAttachMemAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(stream, devPtr, length, flags);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaStreamBeginCapture(cudaStream_t stream, enum cudaStreamCaptureMode mode) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaStream_t, enum cudaStreamCaptureMode);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaStreamBeginCapture");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(stream, mode);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaThreadExchangeStreamCaptureMode(enum cudaStreamCaptureMode *mode) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(enum cudaStreamCaptureMode *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaThreadExchangeStreamCaptureMode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(mode);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaStreamEndCapture(cudaStream_t stream, cudaGraph_t *pGraph) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaStream_t, cudaGraph_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaStreamEndCapture");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(stream, pGraph);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaStreamIsCapturing(
-    cudaStream_t stream, enum cudaStreamCaptureStatus *pCaptureStatus) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaStream_t, enum cudaStreamCaptureStatus *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaStreamIsCapturing");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(stream, pCaptureStatus);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaStreamGetCaptureInfo(
-    cudaStream_t stream, enum cudaStreamCaptureStatus *captureStatus_out,
-    unsigned long long *id_out __dv(0), cudaGraph_t *graph_out __dv(0),
-    const cudaGraphNode_t **dependencies_out __dv(0),
-    size_t *numDependencies_out __dv(0)) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(
-      cudaStream_t, enum cudaStreamCaptureStatus *, unsigned long long *,
-      cudaGraph_t *, const cudaGraphNode_t **, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaStreamGetCaptureInfo_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(stream, captureStatus_out, id_out, graph_out,
-                  dependencies_out, numDependencies_out);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaStreamUpdateCaptureDependencies(
-    cudaStream_t stream, cudaGraphNode_t *dependencies, size_t numDependencies,
-    unsigned int flags __dv(0)) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaStream_t, cudaGraphNode_t *,
-                                           size_t, unsigned int);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaStreamUpdateCaptureDependencies");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(stream, dependencies, numDependencies, flags);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaEventCreate(cudaEvent_t *event) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaEvent_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaEventCreate");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(event);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaEventCreateWithFlags(cudaEvent_t *event, unsigned int flags) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaEvent_t *, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaEventCreateWithFlags");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(event, flags);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaEventRecord(cudaEvent_t event, cudaStream_t stream __dv(0)) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaEvent_t, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaEventRecord");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(event, stream);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaEventRecordWithFlags(cudaEvent_t event, cudaStream_t stream __dv(0),
-                         unsigned int flags __dv(0)) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaEvent_t, cudaStream_t, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaEventRecordWithFlags");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(event, stream, flags);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaEventQuery(cudaEvent_t event) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaEvent_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaEventQuery");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(event);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaEventSynchronize(cudaEvent_t event) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaEvent_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaEventSynchronize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(event);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaEventDestroy(cudaEvent_t event) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaEvent_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaEventDestroy");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(event);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaEventElapsedTime(float *ms,
-                                                           cudaEvent_t start,
-                                                           cudaEvent_t end) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(float *, cudaEvent_t, cudaEvent_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaEventElapsedTime");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(ms, start, end);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaImportExternalMemory(
-    cudaExternalMemory_t *extMem_out,
-    const struct cudaExternalMemoryHandleDesc *memHandleDesc) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(
-      cudaExternalMemory_t *, const struct cudaExternalMemoryHandleDesc *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaImportExternalMemory");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(extMem_out, memHandleDesc);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaExternalMemoryGetMappedBuffer(
-    void **devPtr, cudaExternalMemory_t extMem,
-    const struct cudaExternalMemoryBufferDesc *bufferDesc) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(void **, cudaExternalMemory_t,
-                               const struct cudaExternalMemoryBufferDesc *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaExternalMemoryGetMappedBuffer");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(devPtr, extMem, bufferDesc);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaExternalMemoryGetMappedMipmappedArray(
-    cudaMipmappedArray_t *mipmap, cudaExternalMemory_t extMem,
-    const struct cudaExternalMemoryMipmappedArrayDesc *mipmapDesc) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(
-      cudaMipmappedArray_t *, cudaExternalMemory_t,
-      const struct cudaExternalMemoryMipmappedArrayDesc *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaExternalMemoryGetMappedMipmappedArray");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(mipmap, extMem, mipmapDesc);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaDestroyExternalMemory(cudaExternalMemory_t extMem) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaExternalMemory_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaDestroyExternalMemory");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(extMem);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaImportExternalSemaphore(
-    cudaExternalSemaphore_t *extSem_out,
-    const struct cudaExternalSemaphoreHandleDesc *semHandleDesc) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaExternalSemaphore_t *,
-                               const struct cudaExternalSemaphoreHandleDesc *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaImportExternalSemaphore");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(extSem_out, semHandleDesc);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaSignalExternalSemaphoresAsync(
-    const cudaExternalSemaphore_t *extSemArray,
-    const struct cudaExternalSemaphoreSignalParams *paramsArray,
-    unsigned int numExtSems, cudaStream_t stream __dv(0)) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(const cudaExternalSemaphore_t *,
-                               const struct cudaExternalSemaphoreSignalParams *,
-                               unsigned int, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>(
-      "cudaSignalExternalSemaphoresAsync_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(extSemArray, paramsArray, numExtSems, stream);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaWaitExternalSemaphoresAsync(
-    const cudaExternalSemaphore_t *extSemArray,
-    const struct cudaExternalSemaphoreWaitParams *paramsArray,
-    unsigned int numExtSems, cudaStream_t stream __dv(0)) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(const cudaExternalSemaphore_t *,
-                               const struct cudaExternalSemaphoreWaitParams *,
-                               unsigned int, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>(
-      "cudaWaitExternalSemaphoresAsync_v2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(extSemArray, paramsArray, numExtSems, stream);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaDestroyExternalSemaphore(cudaExternalSemaphore_t extSem) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaExternalSemaphore_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaDestroyExternalSemaphore");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(extSem);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim, void **args,
-                 size_t sharedMem, cudaStream_t stream) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(const void *, dim3, dim3, void **,
-                                           size_t, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaLaunchKernel");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(func, gridDim, blockDim, args, sharedMem, stream);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaLaunchKernelExC(
-    const cudaLaunchConfig_t *config, const void *func, void **args) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(const cudaLaunchConfig_t *,
-                                           const void *, void **);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaLaunchKernelExC");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(config, func, args);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaLaunchCooperativeKernel(
-    const void *func, dim3 gridDim, dim3 blockDim, void **args,
-    size_t sharedMem, cudaStream_t stream) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(const void *, dim3, dim3, void **,
-                                           size_t, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaLaunchCooperativeKernel");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(func, gridDim, blockDim, args, sharedMem, stream);
-}
-
-extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI
-cudaLaunchCooperativeKernelMultiDevice(
-    struct cudaLaunchParams *launchParamsList, unsigned int numDevices,
-    unsigned int flags __dv(0)) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(struct cudaLaunchParams *,
-                                           unsigned int, unsigned int);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaLaunchCooperativeKernelMultiDevice");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(launchParamsList, numDevices, flags);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaFuncSetCacheConfig(const void *func, enum cudaFuncCache cacheConfig) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(const void *, enum cudaFuncCache);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaFuncSetCacheConfig");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(func, cacheConfig);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaFuncSetSharedMemConfig(const void *func, enum cudaSharedMemConfig config) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(const void *, enum cudaSharedMemConfig);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaFuncSetSharedMemConfig");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(func, config);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaFuncGetAttributes(struct cudaFuncAttributes *attr, const void *func) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(struct cudaFuncAttributes *, const void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaFuncGetAttributes");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(attr, func);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaFuncSetAttribute(const void *func, enum cudaFuncAttribute attr, int value) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(const void *, enum cudaFuncAttribute, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaFuncSetAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(func, attr, value);
-}
-
-extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI
-cudaSetDoubleForDevice(double *d) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(double *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaSetDoubleForDevice");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(d);
-}
-
-extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI
-cudaSetDoubleForHost(double *d) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(double *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaSetDoubleForHost");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(d);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaLaunchHostFunc(cudaStream_t stream,
-                                                         cudaHostFn_t fn,
-                                                         void *userData) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaStream_t, cudaHostFn_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaLaunchHostFunc");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(stream, fn, userData);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaOccupancyMaxActiveBlocksPerMultiprocessor(int *numBlocks, const void *func,
-                                              int blockSize,
-                                              size_t dynamicSMemSize) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(int *, const void *, int, size_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaOccupancyMaxActiveBlocksPerMultiprocessor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(numBlocks, func, blockSize, dynamicSMemSize);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaOccupancyAvailableDynamicSMemPerBlock(size_t *dynamicSmemSize,
-                                          const void *func, int numBlocks,
-                                          int blockSize) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(size_t *, const void *, int, int);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaOccupancyAvailableDynamicSMemPerBlock");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dynamicSmemSize, func, numBlocks, blockSize);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(int *numBlocks,
-                                                       const void *func,
-                                                       int blockSize,
-                                                       size_t dynamicSMemSize,
-                                                       unsigned int flags) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(int *, const void *, int, size_t, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>(
-      "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(numBlocks, func, blockSize, dynamicSMemSize, flags);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaOccupancyMaxPotentialClusterSize(int *clusterSize, const void *func,
-                                     const cudaLaunchConfig_t *launchConfig) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(int *, const void *, const cudaLaunchConfig_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaOccupancyMaxPotentialClusterSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(clusterSize, func, launchConfig);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaOccupancyMaxActiveClusters(int *numClusters, const void *func,
-                               const cudaLaunchConfig_t *launchConfig) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(int *, const void *, const cudaLaunchConfig_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaOccupancyMaxActiveClusters");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(numClusters, func, launchConfig);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaMallocManaged(void **devPtr, size_t size, unsigned int flags) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void **, size_t, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMallocManaged");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(devPtr, size, flags);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaMalloc(void **devPtr, size_t size) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void **, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMalloc");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(devPtr, size);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMallocHost(void **ptr, size_t size) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void **, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMallocHost");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(ptr, size);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMallocPitch(void **devPtr,
-                                                      size_t *pitch,
-                                                      size_t width,
-                                                      size_t height) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void **, size_t *, size_t, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMallocPitch");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(devPtr, pitch, width, height);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMallocArray(
-    cudaArray_t *array, const struct cudaChannelFormatDesc *desc, size_t width,
-    size_t height __dv(0), unsigned int flags __dv(0)) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaArray_t *,
-                                           const struct cudaChannelFormatDesc *,
-                                           size_t, size_t, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMallocArray");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(array, desc, width, height, flags);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaFree(void *devPtr) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaFree");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(devPtr);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaFreeHost(void *ptr) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaFreeHost");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(ptr);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaFreeArray(cudaArray_t array) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaArray_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaFreeArray");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(array);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaFreeMipmappedArray(cudaMipmappedArray_t mipmappedArray) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaMipmappedArray_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaFreeMipmappedArray");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(mipmappedArray);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaHostAlloc(void **pHost, size_t size,
-                                                    unsigned int flags) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void **, size_t, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaHostAlloc");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pHost, size, flags);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaHostRegister(void *ptr, size_t size,
-                                                       unsigned int flags) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void *, size_t, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaHostRegister");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(ptr, size, flags);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaHostUnregister(void *ptr) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaHostUnregister");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(ptr);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaHostGetDevicePointer(void **pDevice, void *pHost, unsigned int flags) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void **, void *, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaHostGetDevicePointer");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pDevice, pHost, flags);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaHostGetFlags(unsigned int *pFlags,
-                                                       void *pHost) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(unsigned int *, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaHostGetFlags");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pFlags, pHost);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaMalloc3D(struct cudaPitchedPtr *pitchedDevPtr, struct cudaExtent extent) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(struct cudaPitchedPtr *, struct cudaExtent);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMalloc3D");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pitchedDevPtr, extent);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaMalloc3DArray(cudaArray_t *array, const struct cudaChannelFormatDesc *desc,
-                  struct cudaExtent extent, unsigned int flags __dv(0)) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaArray_t *,
-                                           const struct cudaChannelFormatDesc *,
-                                           struct cudaExtent, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMalloc3DArray");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(array, desc, extent, flags);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMallocMipmappedArray(
-    cudaMipmappedArray_t *mipmappedArray,
-    const struct cudaChannelFormatDesc *desc, struct cudaExtent extent,
-    unsigned int numLevels, unsigned int flags __dv(0)) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(
-      cudaMipmappedArray_t *, const struct cudaChannelFormatDesc *,
-      struct cudaExtent, unsigned int, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMallocMipmappedArray");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(mipmappedArray, desc, extent, numLevels, flags);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGetMipmappedArrayLevel(
-    cudaArray_t *levelArray, cudaMipmappedArray_const_t mipmappedArray,
-    unsigned int level) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(
-      cudaArray_t *, cudaMipmappedArray_const_t, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGetMipmappedArrayLevel");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(levelArray, mipmappedArray, level);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaMemcpy3D(const struct cudaMemcpy3DParms *p) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(const struct cudaMemcpy3DParms *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpy3D");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(p);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaMemcpy3DPeer(const struct cudaMemcpy3DPeerParms *p) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(const struct cudaMemcpy3DPeerParms *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpy3DPeer");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(p);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpy3DAsync(
-    const struct cudaMemcpy3DParms *p, cudaStream_t stream __dv(0)) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(const struct cudaMemcpy3DParms *, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpy3DAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(p, stream);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMemcpy3DPeerAsync(
-    const struct cudaMemcpy3DPeerParms *p, cudaStream_t stream __dv(0)) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(const struct cudaMemcpy3DPeerParms *,
-                                           cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpy3DPeerAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(p, stream);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMemGetInfo(size_t *free,
-                                                     size_t *total) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(size_t *, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemGetInfo");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(free, total);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaArrayGetInfo(struct cudaChannelFormatDesc *desc, struct cudaExtent *extent,
-                 unsigned int *flags, cudaArray_t array) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(struct cudaChannelFormatDesc *,
-                                           struct cudaExtent *, unsigned int *,
-                                           cudaArray_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaArrayGetInfo");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(desc, extent, flags, array);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaArrayGetPlane(
-    cudaArray_t *pPlaneArray, cudaArray_t hArray, unsigned int planeIdx) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaArray_t *, cudaArray_t, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaArrayGetPlane");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pPlaneArray, hArray, planeIdx);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaArrayGetMemoryRequirements(
-    struct cudaArrayMemoryRequirements *memoryRequirements, cudaArray_t array,
-    int device) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(struct cudaArrayMemoryRequirements *,
-                                           cudaArray_t, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaArrayGetMemoryRequirements");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(memoryRequirements, array, device);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMipmappedArrayGetMemoryRequirements(
-    struct cudaArrayMemoryRequirements *memoryRequirements,
-    cudaMipmappedArray_t mipmap, int device) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(struct cudaArrayMemoryRequirements *,
-                                           cudaMipmappedArray_t, int);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaMipmappedArrayGetMemoryRequirements");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(memoryRequirements, mipmap, device);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaArrayGetSparseProperties(
-    struct cudaArraySparseProperties *sparseProperties, cudaArray_t array) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(struct cudaArraySparseProperties *, cudaArray_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaArrayGetSparseProperties");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(sparseProperties, array);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMipmappedArrayGetSparseProperties(
-    struct cudaArraySparseProperties *sparseProperties,
-    cudaMipmappedArray_t mipmap) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(struct cudaArraySparseProperties *,
-                                           cudaMipmappedArray_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaMipmappedArrayGetSparseProperties");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(sparseProperties, mipmap);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMemcpy(void *dst, const void *src,
-                                                 size_t count,
-                                                 enum cudaMemcpyKind kind) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void *, const void *, size_t,
-                                           enum cudaMemcpyKind);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpy");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dst, src, count, kind);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMemcpyPeer(void *dst, int dstDevice,
-                                                     const void *src,
-                                                     int srcDevice,
-                                                     size_t count) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(void *, int, const void *, int, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpyPeer");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dst, dstDevice, src, srcDevice, count);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMemcpy2D(void *dst, size_t dpitch,
-                                                   const void *src,
-                                                   size_t spitch, size_t width,
-                                                   size_t height,
-                                                   enum cudaMemcpyKind kind) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void *, size_t, const void *, size_t,
-                                           size_t, size_t, enum cudaMemcpyKind);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpy2D");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dst, dpitch, src, spitch, width, height, kind);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMemcpy2DToArray(
-    cudaArray_t dst, size_t wOffset, size_t hOffset, const void *src,
-    size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaArray_t, size_t, size_t, const void *,
-                               size_t, size_t, size_t, enum cudaMemcpyKind);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpy2DToArray");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dst, wOffset, hOffset, src, spitch, width, height, kind);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMemcpy2DFromArray(
-    void *dst, size_t dpitch, cudaArray_const_t src, size_t wOffset,
-    size_t hOffset, size_t width, size_t height, enum cudaMemcpyKind kind) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(void *, size_t, cudaArray_const_t, size_t,
-                               size_t, size_t, size_t, enum cudaMemcpyKind);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpy2DFromArray");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dst, dpitch, src, wOffset, hOffset, width, height, kind);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMemcpy2DArrayToArray(
-    cudaArray_t dst, size_t wOffsetDst, size_t hOffsetDst,
-    cudaArray_const_t src, size_t wOffsetSrc, size_t hOffsetSrc, size_t width,
-    size_t height, enum cudaMemcpyKind kind __dv(cudaMemcpyDeviceToDevice)) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaArray_t, size_t, size_t,
-                                           cudaArray_const_t, size_t, size_t,
-                                           size_t, size_t, enum cudaMemcpyKind);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpy2DArrayToArray");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dst, wOffsetDst, hOffsetDst, src, wOffsetSrc, hOffsetSrc,
-                  width, height, kind);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMemcpyToSymbol(
-    const void *symbol, const void *src, size_t count, size_t offset __dv(0),
-    enum cudaMemcpyKind kind __dv(cudaMemcpyHostToDevice)) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(const void *, const void *, size_t,
-                                           size_t, enum cudaMemcpyKind);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpyToSymbol");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(symbol, src, count, offset, kind);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMemcpyFromSymbol(
-    void *dst, const void *symbol, size_t count, size_t offset __dv(0),
-    enum cudaMemcpyKind kind __dv(cudaMemcpyDeviceToHost)) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void *, const void *, size_t, size_t,
-                                           enum cudaMemcpyKind);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpyFromSymbol");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dst, symbol, count, offset, kind);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaMemcpyAsync(void *dst, const void *src, size_t count,
-                enum cudaMemcpyKind kind, cudaStream_t stream __dv(0)) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void *, const void *, size_t,
-                                           enum cudaMemcpyKind, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpyAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dst, src, count, kind, stream);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaMemcpyPeerAsync(void *dst, int dstDevice, const void *src, int srcDevice,
-                    size_t count, cudaStream_t stream __dv(0)) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void *, int, const void *, int,
-                                           size_t, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpyPeerAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dst, dstDevice, src, srcDevice, count, stream);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpy2DAsync(
-    void *dst, size_t dpitch, const void *src, size_t spitch, size_t width,
-    size_t height, enum cudaMemcpyKind kind, cudaStream_t stream __dv(0)) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(void *, size_t, const void *, size_t, size_t,
-                               size_t, enum cudaMemcpyKind, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpy2DAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dst, dpitch, src, spitch, width, height, kind, stream);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMemcpy2DToArrayAsync(
-    cudaArray_t dst, size_t wOffset, size_t hOffset, const void *src,
-    size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind,
-    cudaStream_t stream __dv(0)) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaArray_t, size_t, size_t,
-                                           const void *, size_t, size_t, size_t,
-                                           enum cudaMemcpyKind, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpy2DToArrayAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dst, wOffset, hOffset, src, spitch, width, height, kind,
-                  stream);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMemcpy2DFromArrayAsync(
-    void *dst, size_t dpitch, cudaArray_const_t src, size_t wOffset,
-    size_t hOffset, size_t width, size_t height, enum cudaMemcpyKind kind,
-    cudaStream_t stream __dv(0)) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void *, size_t, cudaArray_const_t,
-                                           size_t, size_t, size_t, size_t,
-                                           enum cudaMemcpyKind, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpy2DFromArrayAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dst, dpitch, src, wOffset, hOffset, width, height, kind,
-                  stream);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMemcpyToSymbolAsync(
-    const void *symbol, const void *src, size_t count, size_t offset,
-    enum cudaMemcpyKind kind, cudaStream_t stream __dv(0)) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(const void *, const void *, size_t, size_t,
-                               enum cudaMemcpyKind, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpyToSymbolAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(symbol, src, count, offset, kind, stream);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMemcpyFromSymbolAsync(
-    void *dst, const void *symbol, size_t count, size_t offset,
-    enum cudaMemcpyKind kind, cudaStream_t stream __dv(0)) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void *, const void *, size_t, size_t,
-                                           enum cudaMemcpyKind, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpyFromSymbolAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dst, symbol, count, offset, kind, stream);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMemset(void *devPtr, int value,
-                                                 size_t count) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void *, int, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemset");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(devPtr, value, count);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMemset2D(void *devPtr, size_t pitch,
-                                                   int value, size_t width,
-                                                   size_t height) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void *, size_t, int, size_t, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemset2D");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(devPtr, pitch, value, width, height);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMemset3D(
-    struct cudaPitchedPtr pitchedDevPtr, int value, struct cudaExtent extent) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(struct cudaPitchedPtr, int, struct cudaExtent);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemset3D");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pitchedDevPtr, value, extent);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemsetAsync(
-    void *devPtr, int value, size_t count, cudaStream_t stream __dv(0)) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void *, int, size_t, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemsetAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(devPtr, value, count, stream);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaMemset2DAsync(void *devPtr, size_t pitch, int value, size_t width,
-                  size_t height, cudaStream_t stream __dv(0)) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void *, size_t, int, size_t, size_t,
-                                           cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemset2DAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(devPtr, pitch, value, width, height, stream);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaMemset3DAsync(struct cudaPitchedPtr pitchedDevPtr, int value,
-                  struct cudaExtent extent, cudaStream_t stream __dv(0)) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(struct cudaPitchedPtr, int,
-                                           struct cudaExtent, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemset3DAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pitchedDevPtr, value, extent, stream);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGetSymbolAddress(void **devPtr,
-                                                           const void *symbol) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void **, const void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGetSymbolAddress");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(devPtr, symbol);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGetSymbolSize(size_t *size,
-                                                        const void *symbol) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(size_t *, const void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGetSymbolSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(size, symbol);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaMemPrefetchAsync(const void *devPtr, size_t count, int dstDevice,
-                     cudaStream_t stream __dv(0)) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(const void *, size_t, int, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemPrefetchAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(devPtr, count, dstDevice, stream);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaMemAdvise(const void *devPtr, size_t count, enum cudaMemoryAdvise advice,
-              int device) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(const void *, size_t,
-                                           enum cudaMemoryAdvise, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemAdvise");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(devPtr, count, advice, device);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMemRangeGetAttribute(
-    void *data, size_t dataSize, enum cudaMemRangeAttribute attribute,
-    const void *devPtr, size_t count) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(
-      void *, size_t, enum cudaMemRangeAttribute, const void *, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemRangeGetAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(data, dataSize, attribute, devPtr, count);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMemRangeGetAttributes(
-    void **data, size_t *dataSizes, enum cudaMemRangeAttribute *attributes,
-    size_t numAttributes, const void *devPtr, size_t count) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(void **, size_t *, enum cudaMemRangeAttribute *,
-                               size_t, const void *, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemRangeGetAttributes");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(data, dataSizes, attributes, numAttributes, devPtr, count);
-}
-
-extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI
-cudaMemcpyToArray(cudaArray_t dst, size_t wOffset, size_t hOffset,
-                  const void *src, size_t count, enum cudaMemcpyKind kind) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(
-      cudaArray_t, size_t, size_t, const void *, size_t, enum cudaMemcpyKind);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpyToArray");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dst, wOffset, hOffset, src, count, kind);
-}
-
-extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI
-cudaMemcpyFromArray(void *dst, cudaArray_const_t src, size_t wOffset,
-                    size_t hOffset, size_t count, enum cudaMemcpyKind kind) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void *, cudaArray_const_t, size_t,
-                                           size_t, size_t, enum cudaMemcpyKind);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpyFromArray");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dst, src, wOffset, hOffset, count, kind);
-}
-
-extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaMemcpyArrayToArray(
-    cudaArray_t dst, size_t wOffsetDst, size_t hOffsetDst,
-    cudaArray_const_t src, size_t wOffsetSrc, size_t hOffsetSrc, size_t count,
-    enum cudaMemcpyKind kind __dv(cudaMemcpyDeviceToDevice)) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaArray_t, size_t, size_t, cudaArray_const_t,
-                               size_t, size_t, size_t, enum cudaMemcpyKind);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpyArrayToArray");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dst, wOffsetDst, hOffsetDst, src, wOffsetSrc, hOffsetSrc,
-                  count, kind);
-}
-
-extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaMemcpyToArrayAsync(
-    cudaArray_t dst, size_t wOffset, size_t hOffset, const void *src,
-    size_t count, enum cudaMemcpyKind kind, cudaStream_t stream __dv(0)) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaArray_t, size_t, size_t, const void *,
-                               size_t, enum cudaMemcpyKind, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpyToArrayAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dst, wOffset, hOffset, src, count, kind, stream);
-}
-
-extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI
-cudaMemcpyFromArrayAsync(void *dst, cudaArray_const_t src, size_t wOffset,
-                         size_t hOffset, size_t count, enum cudaMemcpyKind kind,
-                         cudaStream_t stream __dv(0)) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(void *, cudaArray_const_t, size_t, size_t,
-                               size_t, enum cudaMemcpyKind, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpyFromArrayAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dst, src, wOffset, hOffset, count, kind, stream);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMallocAsync(void **devPtr,
-                                                      size_t size,
-                                                      cudaStream_t hStream) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void **, size_t, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMallocAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(devPtr, size, hStream);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaFreeAsync(void *devPtr,
-                                                    cudaStream_t hStream) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void *, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaFreeAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(devPtr, hStream);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMemPoolTrimTo(cudaMemPool_t memPool,
-                                                        size_t minBytesToKeep) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaMemPool_t, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemPoolTrimTo");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(memPool, minBytesToKeep);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMemPoolSetAttribute(
-    cudaMemPool_t memPool, enum cudaMemPoolAttr attr, void *value) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaMemPool_t, enum cudaMemPoolAttr, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemPoolSetAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(memPool, attr, value);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMemPoolGetAttribute(
-    cudaMemPool_t memPool, enum cudaMemPoolAttr attr, void *value) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaMemPool_t, enum cudaMemPoolAttr, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemPoolGetAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(memPool, attr, value);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaMemPoolSetAccess(cudaMemPool_t memPool,
-                     const struct cudaMemAccessDesc *descList, size_t count) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(
-      cudaMemPool_t, const struct cudaMemAccessDesc *, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemPoolSetAccess");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(memPool, descList, count);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaMemPoolGetAccess(enum cudaMemAccessFlags *flags, cudaMemPool_t memPool,
-                     struct cudaMemLocation *location) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(
-      enum cudaMemAccessFlags *, cudaMemPool_t, struct cudaMemLocation *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemPoolGetAccess");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(flags, memPool, location);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMemPoolCreate(
-    cudaMemPool_t *memPool, const struct cudaMemPoolProps *poolProps) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaMemPool_t *,
-                                           const struct cudaMemPoolProps *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemPoolCreate");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(memPool, poolProps);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaMemPoolDestroy(cudaMemPool_t memPool) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaMemPool_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemPoolDestroy");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(memPool);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMallocFromPoolAsync(
-    void **ptr, size_t size, cudaMemPool_t memPool, cudaStream_t stream) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(void **, size_t, cudaMemPool_t, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMallocFromPoolAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(ptr, size, memPool, stream);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMemPoolExportToShareableHandle(
-    void *shareableHandle, cudaMemPool_t memPool,
-    enum cudaMemAllocationHandleType handleType, unsigned int flags) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(
-      void *, cudaMemPool_t, enum cudaMemAllocationHandleType, unsigned int);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaMemPoolExportToShareableHandle");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(shareableHandle, memPool, handleType, flags);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMemPoolImportFromShareableHandle(
-    cudaMemPool_t *memPool, void *shareableHandle,
-    enum cudaMemAllocationHandleType handleType, unsigned int flags) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(
-      cudaMemPool_t *, void *, enum cudaMemAllocationHandleType, unsigned int);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaMemPoolImportFromShareableHandle");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(memPool, shareableHandle, handleType, flags);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMemPoolExportPointer(
-    struct cudaMemPoolPtrExportData *exportData, void *ptr) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(struct cudaMemPoolPtrExportData *, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemPoolExportPointer");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(exportData, ptr);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaMemPoolImportPointer(void **ptr, cudaMemPool_t memPool,
-                         struct cudaMemPoolPtrExportData *exportData) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void **, cudaMemPool_t,
-                                           struct cudaMemPoolPtrExportData *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemPoolImportPointer");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(ptr, memPool, exportData);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaPointerGetAttributes(
-    struct cudaPointerAttributes *attributes, const void *ptr) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(struct cudaPointerAttributes *, const void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaPointerGetAttributes");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(attributes, ptr);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaDeviceCanAccessPeer(int *canAccessPeer, int device, int peerDevice) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(int *, int, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaDeviceCanAccessPeer");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(canAccessPeer, device, peerDevice);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaDeviceEnablePeerAccess(int peerDevice, unsigned int flags) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(int, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaDeviceEnablePeerAccess");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(peerDevice, flags);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaDeviceDisablePeerAccess(int peerDevice) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaDeviceDisablePeerAccess");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(peerDevice);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaGraphicsUnregisterResource(cudaGraphicsResource_t resource) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraphicsResource_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphicsUnregisterResource");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(resource);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphicsResourceSetMapFlags(
-    cudaGraphicsResource_t resource, unsigned int flags) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaGraphicsResource_t, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphicsResourceSetMapFlags");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(resource, flags);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphicsMapResources(
-    int count, cudaGraphicsResource_t *resources, cudaStream_t stream __dv(0)) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(int, cudaGraphicsResource_t *, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphicsMapResources");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(count, resources, stream);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphicsUnmapResources(
-    int count, cudaGraphicsResource_t *resources, cudaStream_t stream __dv(0)) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(int, cudaGraphicsResource_t *, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphicsUnmapResources");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(count, resources, stream);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphicsResourceGetMappedPointer(
-    void **devPtr, size_t *size, cudaGraphicsResource_t resource) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(void **, size_t *, cudaGraphicsResource_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaGraphicsResourceGetMappedPointer");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(devPtr, size, resource);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphicsSubResourceGetMappedArray(
-    cudaArray_t *array, cudaGraphicsResource_t resource,
-    unsigned int arrayIndex, unsigned int mipLevel) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(
-      cudaArray_t *, cudaGraphicsResource_t, unsigned int, unsigned int);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaGraphicsSubResourceGetMappedArray");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(array, resource, arrayIndex, mipLevel);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaGraphicsResourceGetMappedMipmappedArray(
-    cudaMipmappedArray_t *mipmappedArray, cudaGraphicsResource_t resource) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaMipmappedArray_t *, cudaGraphicsResource_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaGraphicsResourceGetMappedMipmappedArray");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(mipmappedArray, resource);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGetChannelDesc(
-    struct cudaChannelFormatDesc *desc, cudaArray_const_t array) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(struct cudaChannelFormatDesc *,
-                                           cudaArray_const_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGetChannelDesc");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(desc, array);
-}
-
-extern __host__ struct cudaChannelFormatDesc CUDARTAPI cudaCreateChannelDesc(
-    int x, int y, int z, int w, enum cudaChannelFormatKind f) {
-  using FuncPtr = struct cudaChannelFormatDesc(CUDARTAPI *)(
-      int, int, int, int, enum cudaChannelFormatKind);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaCreateChannelDesc");
-  return func_ptr(x, y, z, w, f);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaCreateTextureObject(
-    cudaTextureObject_t *pTexObject, const struct cudaResourceDesc *pResDesc,
-    const struct cudaTextureDesc *pTexDesc,
-    const struct cudaResourceViewDesc *pResViewDesc) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(
-      cudaTextureObject_t *, const struct cudaResourceDesc *,
-      const struct cudaTextureDesc *, const struct cudaResourceViewDesc *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaCreateTextureObject");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pTexObject, pResDesc, pTexDesc, pResViewDesc);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaDestroyTextureObject(cudaTextureObject_t texObject) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaTextureObject_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaDestroyTextureObject");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(texObject);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGetTextureObjectResourceDesc(
-    struct cudaResourceDesc *pResDesc, cudaTextureObject_t texObject) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(struct cudaResourceDesc *, cudaTextureObject_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaGetTextureObjectResourceDesc");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pResDesc, texObject);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGetTextureObjectTextureDesc(
-    struct cudaTextureDesc *pTexDesc, cudaTextureObject_t texObject) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(struct cudaTextureDesc *, cudaTextureObject_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGetTextureObjectTextureDesc");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pTexDesc, texObject);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGetTextureObjectResourceViewDesc(
-    struct cudaResourceViewDesc *pResViewDesc, cudaTextureObject_t texObject) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(struct cudaResourceViewDesc *,
-                                           cudaTextureObject_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaGetTextureObjectResourceViewDesc");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pResViewDesc, texObject);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaCreateSurfaceObject(
-    cudaSurfaceObject_t *pSurfObject, const struct cudaResourceDesc *pResDesc) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaSurfaceObject_t *,
-                                           const struct cudaResourceDesc *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaCreateSurfaceObject");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pSurfObject, pResDesc);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaDestroySurfaceObject(cudaSurfaceObject_t surfObject) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaSurfaceObject_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaDestroySurfaceObject");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(surfObject);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGetSurfaceObjectResourceDesc(
-    struct cudaResourceDesc *pResDesc, cudaSurfaceObject_t surfObject) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(struct cudaResourceDesc *, cudaSurfaceObject_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaGetSurfaceObjectResourceDesc");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pResDesc, surfObject);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaDriverGetVersion(int *driverVersion) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaDriverGetVersion");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(driverVersion);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaRuntimeGetVersion(int *runtimeVersion) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaRuntimeGetVersion");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(runtimeVersion);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphCreate(cudaGraph_t *pGraph,
-                                                      unsigned int flags) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraph_t *, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphCreate");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pGraph, flags);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphAddKernelNode(
-    cudaGraphNode_t *pGraphNode, cudaGraph_t graph,
-    const cudaGraphNode_t *pDependencies, size_t numDependencies,
-    const struct cudaKernelNodeParams *pNodeParams) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraphNode_t *, cudaGraph_t,
-                                           const cudaGraphNode_t *, size_t,
-                                           const struct cudaKernelNodeParams *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphAddKernelNode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pGraphNode, graph, pDependencies, numDependencies,
-                  pNodeParams);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphKernelNodeGetParams(
-    cudaGraphNode_t node, struct cudaKernelNodeParams *pNodeParams) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaGraphNode_t, struct cudaKernelNodeParams *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphKernelNodeGetParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(node, pNodeParams);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphKernelNodeSetParams(
-    cudaGraphNode_t node, const struct cudaKernelNodeParams *pNodeParams) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraphNode_t,
-                                           const struct cudaKernelNodeParams *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphKernelNodeSetParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(node, pNodeParams);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaGraphKernelNodeCopyAttributes(cudaGraphNode_t hSrc, cudaGraphNode_t hDst) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraphNode_t, cudaGraphNode_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaGraphKernelNodeCopyAttributes");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hSrc, hDst);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphKernelNodeGetAttribute(
-    cudaGraphNode_t hNode, cudaKernelNodeAttrID attr,
-    cudaKernelNodeAttrValue *value_out) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(
-      cudaGraphNode_t, cudaKernelNodeAttrID, cudaKernelNodeAttrValue *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphKernelNodeGetAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hNode, attr, value_out);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphKernelNodeSetAttribute(
-    cudaGraphNode_t hNode, cudaKernelNodeAttrID attr,
-    const cudaKernelNodeAttrValue *value) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(
-      cudaGraphNode_t, cudaKernelNodeAttrID, const cudaKernelNodeAttrValue *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphKernelNodeSetAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hNode, attr, value);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphAddMemcpyNode(
-    cudaGraphNode_t *pGraphNode, cudaGraph_t graph,
-    const cudaGraphNode_t *pDependencies, size_t numDependencies,
-    const struct cudaMemcpy3DParms *pCopyParams) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraphNode_t *, cudaGraph_t,
-                                           const cudaGraphNode_t *, size_t,
-                                           const struct cudaMemcpy3DParms *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphAddMemcpyNode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pGraphNode, graph, pDependencies, numDependencies,
-                  pCopyParams);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphAddMemcpyNodeToSymbol(
-    cudaGraphNode_t *pGraphNode, cudaGraph_t graph,
-    const cudaGraphNode_t *pDependencies, size_t numDependencies,
-    const void *symbol, const void *src, size_t count, size_t offset,
-    enum cudaMemcpyKind kind) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(
-      cudaGraphNode_t *, cudaGraph_t, const cudaGraphNode_t *, size_t,
-      const void *, const void *, size_t, size_t, enum cudaMemcpyKind);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphAddMemcpyNodeToSymbol");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pGraphNode, graph, pDependencies, numDependencies, symbol,
-                  src, count, offset, kind);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphAddMemcpyNodeFromSymbol(
-    cudaGraphNode_t *pGraphNode, cudaGraph_t graph,
-    const cudaGraphNode_t *pDependencies, size_t numDependencies, void *dst,
-    const void *symbol, size_t count, size_t offset, enum cudaMemcpyKind kind) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(
-      cudaGraphNode_t *, cudaGraph_t, const cudaGraphNode_t *, size_t, void *,
-      const void *, size_t, size_t, enum cudaMemcpyKind);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaGraphAddMemcpyNodeFromSymbol");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pGraphNode, graph, pDependencies, numDependencies, dst,
-                  symbol, count, offset, kind);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphAddMemcpyNode1D(
-    cudaGraphNode_t *pGraphNode, cudaGraph_t graph,
-    const cudaGraphNode_t *pDependencies, size_t numDependencies, void *dst,
-    const void *src, size_t count, enum cudaMemcpyKind kind) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(
-      cudaGraphNode_t *, cudaGraph_t, const cudaGraphNode_t *, size_t, void *,
-      const void *, size_t, enum cudaMemcpyKind);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphAddMemcpyNode1D");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pGraphNode, graph, pDependencies, numDependencies, dst, src,
-                  count, kind);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphMemcpyNodeGetParams(
-    cudaGraphNode_t node, struct cudaMemcpy3DParms *pNodeParams) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaGraphNode_t, struct cudaMemcpy3DParms *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphMemcpyNodeGetParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(node, pNodeParams);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphMemcpyNodeSetParams(
-    cudaGraphNode_t node, const struct cudaMemcpy3DParms *pNodeParams) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraphNode_t,
-                                           const struct cudaMemcpy3DParms *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphMemcpyNodeSetParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(node, pNodeParams);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphMemcpyNodeSetParamsToSymbol(
-    cudaGraphNode_t node, const void *symbol, const void *src, size_t count,
-    size_t offset, enum cudaMemcpyKind kind) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaGraphNode_t, const void *, const void *,
-                               size_t, size_t, enum cudaMemcpyKind);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaGraphMemcpyNodeSetParamsToSymbol");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(node, symbol, src, count, offset, kind);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphMemcpyNodeSetParamsFromSymbol(
-    cudaGraphNode_t node, void *dst, const void *symbol, size_t count,
-    size_t offset, enum cudaMemcpyKind kind) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaGraphNode_t, void *, const void *, size_t,
-                               size_t, enum cudaMemcpyKind);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaGraphMemcpyNodeSetParamsFromSymbol");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(node, dst, symbol, count, offset, kind);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaGraphMemcpyNodeSetParams1D(cudaGraphNode_t node, void *dst, const void *src,
-                               size_t count, enum cudaMemcpyKind kind) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(
-      cudaGraphNode_t, void *, const void *, size_t, enum cudaMemcpyKind);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphMemcpyNodeSetParams1D");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(node, dst, src, count, kind);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphAddMemsetNode(
-    cudaGraphNode_t *pGraphNode, cudaGraph_t graph,
-    const cudaGraphNode_t *pDependencies, size_t numDependencies,
-    const struct cudaMemsetParams *pMemsetParams) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraphNode_t *, cudaGraph_t,
-                                           const cudaGraphNode_t *, size_t,
-                                           const struct cudaMemsetParams *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphAddMemsetNode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pGraphNode, graph, pDependencies, numDependencies,
-                  pMemsetParams);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphMemsetNodeGetParams(
-    cudaGraphNode_t node, struct cudaMemsetParams *pNodeParams) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaGraphNode_t, struct cudaMemsetParams *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphMemsetNodeGetParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(node, pNodeParams);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphMemsetNodeSetParams(
-    cudaGraphNode_t node, const struct cudaMemsetParams *pNodeParams) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraphNode_t,
-                                           const struct cudaMemsetParams *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphMemsetNodeSetParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(node, pNodeParams);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphAddHostNode(
-    cudaGraphNode_t *pGraphNode, cudaGraph_t graph,
-    const cudaGraphNode_t *pDependencies, size_t numDependencies,
-    const struct cudaHostNodeParams *pNodeParams) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraphNode_t *, cudaGraph_t,
-                                           const cudaGraphNode_t *, size_t,
-                                           const struct cudaHostNodeParams *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphAddHostNode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pGraphNode, graph, pDependencies, numDependencies,
-                  pNodeParams);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphHostNodeGetParams(
-    cudaGraphNode_t node, struct cudaHostNodeParams *pNodeParams) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaGraphNode_t, struct cudaHostNodeParams *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphHostNodeGetParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(node, pNodeParams);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphHostNodeSetParams(
-    cudaGraphNode_t node, const struct cudaHostNodeParams *pNodeParams) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraphNode_t,
-                                           const struct cudaHostNodeParams *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphHostNodeSetParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(node, pNodeParams);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaGraphAddChildGraphNode(cudaGraphNode_t *pGraphNode, cudaGraph_t graph,
-                           const cudaGraphNode_t *pDependencies,
-                           size_t numDependencies, cudaGraph_t childGraph) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaGraphNode_t *, cudaGraph_t,
-                               const cudaGraphNode_t *, size_t, cudaGraph_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphAddChildGraphNode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pGraphNode, graph, pDependencies, numDependencies,
-                  childGraph);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaGraphChildGraphNodeGetGraph(cudaGraphNode_t node, cudaGraph_t *pGraph) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraphNode_t, cudaGraph_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphChildGraphNodeGetGraph");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(node, pGraph);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphAddEmptyNode(
-    cudaGraphNode_t *pGraphNode, cudaGraph_t graph,
-    const cudaGraphNode_t *pDependencies, size_t numDependencies) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraphNode_t *, cudaGraph_t,
-                                           const cudaGraphNode_t *, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphAddEmptyNode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pGraphNode, graph, pDependencies, numDependencies);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaGraphAddEventRecordNode(cudaGraphNode_t *pGraphNode, cudaGraph_t graph,
-                            const cudaGraphNode_t *pDependencies,
-                            size_t numDependencies, cudaEvent_t event) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaGraphNode_t *, cudaGraph_t,
-                               const cudaGraphNode_t *, size_t, cudaEvent_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphAddEventRecordNode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pGraphNode, graph, pDependencies, numDependencies, event);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaGraphEventRecordNodeGetEvent(cudaGraphNode_t node, cudaEvent_t *event_out) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraphNode_t, cudaEvent_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaGraphEventRecordNodeGetEvent");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(node, event_out);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaGraphEventRecordNodeSetEvent(cudaGraphNode_t node, cudaEvent_t event) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraphNode_t, cudaEvent_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaGraphEventRecordNodeSetEvent");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(node, event);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaGraphAddEventWaitNode(cudaGraphNode_t *pGraphNode, cudaGraph_t graph,
-                          const cudaGraphNode_t *pDependencies,
-                          size_t numDependencies, cudaEvent_t event) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaGraphNode_t *, cudaGraph_t,
-                               const cudaGraphNode_t *, size_t, cudaEvent_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphAddEventWaitNode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pGraphNode, graph, pDependencies, numDependencies, event);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaGraphEventWaitNodeGetEvent(cudaGraphNode_t node, cudaEvent_t *event_out) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraphNode_t, cudaEvent_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphEventWaitNodeGetEvent");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(node, event_out);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaGraphEventWaitNodeSetEvent(cudaGraphNode_t node, cudaEvent_t event) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraphNode_t, cudaEvent_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphEventWaitNodeSetEvent");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(node, event);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphAddExternalSemaphoresSignalNode(
-    cudaGraphNode_t *pGraphNode, cudaGraph_t graph,
-    const cudaGraphNode_t *pDependencies, size_t numDependencies,
-    const struct cudaExternalSemaphoreSignalNodeParams *nodeParams) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(
-      cudaGraphNode_t *, cudaGraph_t, const cudaGraphNode_t *, size_t,
-      const struct cudaExternalSemaphoreSignalNodeParams *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaGraphAddExternalSemaphoresSignalNode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pGraphNode, graph, pDependencies, numDependencies,
-                  nodeParams);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaGraphExternalSemaphoresSignalNodeGetParams(
-    cudaGraphNode_t hNode,
-    struct cudaExternalSemaphoreSignalNodeParams *params_out) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(
-      cudaGraphNode_t, struct cudaExternalSemaphoreSignalNodeParams *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaGraphExternalSemaphoresSignalNodeGetParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hNode, params_out);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaGraphExternalSemaphoresSignalNodeSetParams(
-    cudaGraphNode_t hNode,
-    const struct cudaExternalSemaphoreSignalNodeParams *nodeParams) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(
-      cudaGraphNode_t, const struct cudaExternalSemaphoreSignalNodeParams *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaGraphExternalSemaphoresSignalNodeSetParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hNode, nodeParams);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphAddExternalSemaphoresWaitNode(
-    cudaGraphNode_t *pGraphNode, cudaGraph_t graph,
-    const cudaGraphNode_t *pDependencies, size_t numDependencies,
-    const struct cudaExternalSemaphoreWaitNodeParams *nodeParams) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(
-      cudaGraphNode_t *, cudaGraph_t, const cudaGraphNode_t *, size_t,
-      const struct cudaExternalSemaphoreWaitNodeParams *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaGraphAddExternalSemaphoresWaitNode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pGraphNode, graph, pDependencies, numDependencies,
-                  nodeParams);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaGraphExternalSemaphoresWaitNodeGetParams(
-    cudaGraphNode_t hNode,
-    struct cudaExternalSemaphoreWaitNodeParams *params_out) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(
-      cudaGraphNode_t, struct cudaExternalSemaphoreWaitNodeParams *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaGraphExternalSemaphoresWaitNodeGetParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hNode, params_out);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaGraphExternalSemaphoresWaitNodeSetParams(
-    cudaGraphNode_t hNode,
-    const struct cudaExternalSemaphoreWaitNodeParams *nodeParams) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(
-      cudaGraphNode_t, const struct cudaExternalSemaphoreWaitNodeParams *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaGraphExternalSemaphoresWaitNodeSetParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hNode, nodeParams);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphAddMemAllocNode(
-    cudaGraphNode_t *pGraphNode, cudaGraph_t graph,
-    const cudaGraphNode_t *pDependencies, size_t numDependencies,
-    struct cudaMemAllocNodeParams *nodeParams) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraphNode_t *, cudaGraph_t,
-                                           const cudaGraphNode_t *, size_t,
-                                           struct cudaMemAllocNodeParams *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphAddMemAllocNode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pGraphNode, graph, pDependencies, numDependencies,
-                  nodeParams);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphMemAllocNodeGetParams(
-    cudaGraphNode_t node, struct cudaMemAllocNodeParams *params_out) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraphNode_t,
-                                           struct cudaMemAllocNodeParams *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphMemAllocNodeGetParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(node, params_out);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphAddMemFreeNode(
-    cudaGraphNode_t *pGraphNode, cudaGraph_t graph,
-    const cudaGraphNode_t *pDependencies, size_t numDependencies, void *dptr) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(
-      cudaGraphNode_t *, cudaGraph_t, const cudaGraphNode_t *, size_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphAddMemFreeNode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pGraphNode, graph, pDependencies, numDependencies, dptr);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaGraphMemFreeNodeGetParams(cudaGraphNode_t node, void *dptr_out) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraphNode_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphMemFreeNodeGetParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(node, dptr_out);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaDeviceGraphMemTrim(int device) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaDeviceGraphMemTrim");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(device);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaDeviceGetGraphMemAttribute(
-    int device, enum cudaGraphMemAttributeType attr, void *value) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(int, enum cudaGraphMemAttributeType, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaDeviceGetGraphMemAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(device, attr, value);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaDeviceSetGraphMemAttribute(
-    int device, enum cudaGraphMemAttributeType attr, void *value) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(int, enum cudaGraphMemAttributeType, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaDeviceSetGraphMemAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(device, attr, value);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaGraphClone(cudaGraph_t *pGraphClone, cudaGraph_t originalGraph) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraph_t *, cudaGraph_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphClone");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pGraphClone, originalGraph);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaGraphNodeFindInClone(cudaGraphNode_t *pNode, cudaGraphNode_t originalNode,
-                         cudaGraph_t clonedGraph) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaGraphNode_t *, cudaGraphNode_t, cudaGraph_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphNodeFindInClone");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pNode, originalNode, clonedGraph);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaGraphNodeGetType(cudaGraphNode_t node, enum cudaGraphNodeType *pType) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaGraphNode_t, enum cudaGraphNodeType *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphNodeGetType");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(node, pType);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphGetNodes(cudaGraph_t graph,
-                                                        cudaGraphNode_t *nodes,
-                                                        size_t *numNodes) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaGraph_t, cudaGraphNode_t *, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphGetNodes");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(graph, nodes, numNodes);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphGetRootNodes(
-    cudaGraph_t graph, cudaGraphNode_t *pRootNodes, size_t *pNumRootNodes) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaGraph_t, cudaGraphNode_t *, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphGetRootNodes");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(graph, pRootNodes, pNumRootNodes);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphGetEdges(cudaGraph_t graph,
-                                                        cudaGraphNode_t *from,
-                                                        cudaGraphNode_t *to,
-                                                        size_t *numEdges) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraph_t, cudaGraphNode_t *,
-                                           cudaGraphNode_t *, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphGetEdges");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(graph, from, to, numEdges);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphNodeGetDependencies(
-    cudaGraphNode_t node, cudaGraphNode_t *pDependencies,
-    size_t *pNumDependencies) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaGraphNode_t, cudaGraphNode_t *, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphNodeGetDependencies");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(node, pDependencies, pNumDependencies);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphNodeGetDependentNodes(
-    cudaGraphNode_t node, cudaGraphNode_t *pDependentNodes,
-    size_t *pNumDependentNodes) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaGraphNode_t, cudaGraphNode_t *, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphNodeGetDependentNodes");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(node, pDependentNodes, pNumDependentNodes);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaGraphAddDependencies(cudaGraph_t graph, const cudaGraphNode_t *from,
-                         const cudaGraphNode_t *to, size_t numDependencies) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraph_t, const cudaGraphNode_t *,
-                                           const cudaGraphNode_t *, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphAddDependencies");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(graph, from, to, numDependencies);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaGraphRemoveDependencies(cudaGraph_t graph, const cudaGraphNode_t *from,
-                            const cudaGraphNode_t *to, size_t numDependencies) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraph_t, const cudaGraphNode_t *,
-                                           const cudaGraphNode_t *, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphRemoveDependencies");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(graph, from, to, numDependencies);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaGraphDestroyNode(cudaGraphNode_t node) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraphNode_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphDestroyNode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(node);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaGraphInstantiate(cudaGraphExec_t *pGraphExec, cudaGraph_t graph,
-                     unsigned long long flags __dv(0)) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraphExec_t *, cudaGraph_t,
-                                           unsigned long long);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphInstantiate");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pGraphExec, graph, flags);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaGraphInstantiateWithFlags(cudaGraphExec_t *pGraphExec, cudaGraph_t graph,
-                              unsigned long long flags __dv(0)) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraphExec_t *, cudaGraph_t,
-                                           unsigned long long);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphInstantiateWithFlags");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pGraphExec, graph, flags);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaGraphInstantiateWithParams(cudaGraphExec_t *pGraphExec, cudaGraph_t graph,
-                               cudaGraphInstantiateParams *instantiateParams) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraphExec_t *, cudaGraph_t,
-                                           cudaGraphInstantiateParams *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphInstantiateWithParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pGraphExec, graph, instantiateParams);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaGraphExecGetFlags(cudaGraphExec_t graphExec, unsigned long long *flags) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaGraphExec_t, unsigned long long *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphExecGetFlags");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(graphExec, flags);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphExecKernelNodeSetParams(
-    cudaGraphExec_t hGraphExec, cudaGraphNode_t node,
-    const struct cudaKernelNodeParams *pNodeParams) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraphExec_t, cudaGraphNode_t,
-                                           const struct cudaKernelNodeParams *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaGraphExecKernelNodeSetParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hGraphExec, node, pNodeParams);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphExecMemcpyNodeSetParams(
-    cudaGraphExec_t hGraphExec, cudaGraphNode_t node,
-    const struct cudaMemcpy3DParms *pNodeParams) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraphExec_t, cudaGraphNode_t,
-                                           const struct cudaMemcpy3DParms *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaGraphExecMemcpyNodeSetParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hGraphExec, node, pNodeParams);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphExecMemcpyNodeSetParamsToSymbol(
-    cudaGraphExec_t hGraphExec, cudaGraphNode_t node, const void *symbol,
-    const void *src, size_t count, size_t offset, enum cudaMemcpyKind kind) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraphExec_t, cudaGraphNode_t,
-                                           const void *, const void *, size_t,
-                                           size_t, enum cudaMemcpyKind);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaGraphExecMemcpyNodeSetParamsToSymbol");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hGraphExec, node, symbol, src, count, offset, kind);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaGraphExecMemcpyNodeSetParamsFromSymbol(cudaGraphExec_t hGraphExec,
-                                           cudaGraphNode_t node, void *dst,
-                                           const void *symbol, size_t count,
-                                           size_t offset,
-                                           enum cudaMemcpyKind kind) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraphExec_t, cudaGraphNode_t,
-                                           void *, const void *, size_t, size_t,
-                                           enum cudaMemcpyKind);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaGraphExecMemcpyNodeSetParamsFromSymbol");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hGraphExec, node, dst, symbol, count, offset, kind);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphExecMemcpyNodeSetParams1D(
-    cudaGraphExec_t hGraphExec, cudaGraphNode_t node, void *dst,
-    const void *src, size_t count, enum cudaMemcpyKind kind) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaGraphExec_t, cudaGraphNode_t, void *,
-                               const void *, size_t, enum cudaMemcpyKind);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaGraphExecMemcpyNodeSetParams1D");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hGraphExec, node, dst, src, count, kind);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphExecMemsetNodeSetParams(
-    cudaGraphExec_t hGraphExec, cudaGraphNode_t node,
-    const struct cudaMemsetParams *pNodeParams) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraphExec_t, cudaGraphNode_t,
-                                           const struct cudaMemsetParams *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaGraphExecMemsetNodeSetParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hGraphExec, node, pNodeParams);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaGraphExecHostNodeSetParams(cudaGraphExec_t hGraphExec, cudaGraphNode_t node,
-                               const struct cudaHostNodeParams *pNodeParams) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraphExec_t, cudaGraphNode_t,
-                                           const struct cudaHostNodeParams *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphExecHostNodeSetParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hGraphExec, node, pNodeParams);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphExecChildGraphNodeSetParams(
-    cudaGraphExec_t hGraphExec, cudaGraphNode_t node, cudaGraph_t childGraph) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaGraphExec_t, cudaGraphNode_t, cudaGraph_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaGraphExecChildGraphNodeSetParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hGraphExec, node, childGraph);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphExecEventRecordNodeSetEvent(
-    cudaGraphExec_t hGraphExec, cudaGraphNode_t hNode, cudaEvent_t event) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaGraphExec_t, cudaGraphNode_t, cudaEvent_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaGraphExecEventRecordNodeSetEvent");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hGraphExec, hNode, event);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphExecEventWaitNodeSetEvent(
-    cudaGraphExec_t hGraphExec, cudaGraphNode_t hNode, cudaEvent_t event) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaGraphExec_t, cudaGraphNode_t, cudaEvent_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaGraphExecEventWaitNodeSetEvent");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hGraphExec, hNode, event);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaGraphExecExternalSemaphoresSignalNodeSetParams(
-    cudaGraphExec_t hGraphExec, cudaGraphNode_t hNode,
-    const struct cudaExternalSemaphoreSignalNodeParams *nodeParams) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(
-      cudaGraphExec_t, cudaGraphNode_t,
-      const struct cudaExternalSemaphoreSignalNodeParams *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaGraphExecExternalSemaphoresSignalNodeSetParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hGraphExec, hNode, nodeParams);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaGraphExecExternalSemaphoresWaitNodeSetParams(
-    cudaGraphExec_t hGraphExec, cudaGraphNode_t hNode,
-    const struct cudaExternalSemaphoreWaitNodeParams *nodeParams) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(
-      cudaGraphExec_t, cudaGraphNode_t,
-      const struct cudaExternalSemaphoreWaitNodeParams *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaGraphExecExternalSemaphoresWaitNodeSetParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hGraphExec, hNode, nodeParams);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphNodeSetEnabled(
-    cudaGraphExec_t hGraphExec, cudaGraphNode_t hNode, unsigned int isEnabled) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaGraphExec_t, cudaGraphNode_t, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphNodeSetEnabled");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hGraphExec, hNode, isEnabled);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaGraphNodeGetEnabled(cudaGraphExec_t hGraphExec, cudaGraphNode_t hNode,
-                        unsigned int *isEnabled) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraphExec_t, cudaGraphNode_t,
-                                           unsigned int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphNodeGetEnabled");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hGraphExec, hNode, isEnabled);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaGraphExecUpdate(cudaGraphExec_t hGraphExec, cudaGraph_t hGraph,
-                    cudaGraphExecUpdateResultInfo *resultInfo) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraphExec_t, cudaGraph_t,
-                                           cudaGraphExecUpdateResultInfo *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphExecUpdate");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hGraphExec, hGraph, resultInfo);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphUpload(cudaGraphExec_t graphExec,
-                                                      cudaStream_t stream) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraphExec_t, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphUpload");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(graphExec, stream);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphLaunch(cudaGraphExec_t graphExec,
-                                                      cudaStream_t stream) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraphExec_t, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphLaunch");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(graphExec, stream);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaGraphExecDestroy(cudaGraphExec_t graphExec) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraphExec_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphExecDestroy");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(graphExec);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphDestroy(cudaGraph_t graph) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraph_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphDestroy");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(graph);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphDebugDotPrint(
-    cudaGraph_t graph, const char *path, unsigned int flags) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaGraph_t, const char *, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphDebugDotPrint");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(graph, path, flags);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaUserObjectCreate(
-    cudaUserObject_t *object_out, void *ptr, cudaHostFn_t destroy,
-    unsigned int initialRefcount, unsigned int flags) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(
-      cudaUserObject_t *, void *, cudaHostFn_t, unsigned int, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaUserObjectCreate");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(object_out, ptr, destroy, initialRefcount, flags);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaUserObjectRetain(cudaUserObject_t object, unsigned int count __dv(1)) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaUserObject_t, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaUserObjectRetain");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(object, count);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaUserObjectRelease(cudaUserObject_t object, unsigned int count __dv(1)) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaUserObject_t, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaUserObjectRelease");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(object, count);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphRetainUserObject(
-    cudaGraph_t graph, cudaUserObject_t object, unsigned int count __dv(1),
-    unsigned int flags __dv(0)) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraph_t, cudaUserObject_t,
-                                           unsigned int, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphRetainUserObject");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(graph, object, count, flags);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphReleaseUserObject(
-    cudaGraph_t graph, cudaUserObject_t object, unsigned int count __dv(1)) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaGraph_t, cudaUserObject_t, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphReleaseUserObject");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(graph, object, count);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGetDriverEntryPoint(
-    const char *symbol, void **funcPtr, unsigned long long flags,
-    enum cudaDriverEntryPointQueryResult *driverStatus) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(const char *, void **, unsigned long long,
-                               enum cudaDriverEntryPointQueryResult *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGetDriverEntryPoint");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(symbol, funcPtr, flags, driverStatus);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGetExportTable(
-    const void **ppExportTable, const cudaUUID_t *pExportTableId) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(const void **, const cudaUUID_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGetExportTable");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(ppExportTable, pExportTableId);
-}
-
-}  // extern "C"
diff --git a/third_party/xla/third_party/tsl/tsl/cuda/cuda_runtime_9_0.inc b/third_party/xla/third_party/tsl/tsl/cuda/cuda_runtime_9_0.inc
deleted file mode 100644
index 6753ddcf7829db..00000000000000
--- a/third_party/xla/third_party/tsl/tsl/cuda/cuda_runtime_9_0.inc
+++ /dev/null
@@ -1,1421 +0,0 @@
-// Auto-generated, do not edit.
-
-extern "C" {
-
-extern __host__ cudaError_t CUDARTAPI cudaDeviceReset(void) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)();
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaDeviceReset");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr();
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaDeviceSynchronize(void) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)();
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaDeviceSynchronize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr();
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaDeviceSetLimit(enum cudaLimit limit,
-                                                         size_t value) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(enum cudaLimit, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaDeviceSetLimit");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(limit, value);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaDeviceGetLimit(size_t *pValue, enum cudaLimit limit) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(size_t *, enum cudaLimit);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaDeviceGetLimit");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pValue, limit);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaDeviceGetCacheConfig(enum cudaFuncCache *pCacheConfig) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(enum cudaFuncCache *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaDeviceGetCacheConfig");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pCacheConfig);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaDeviceGetStreamPriorityRange(int *leastPriority, int *greatestPriority) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(int *, int *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaDeviceGetStreamPriorityRange");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(leastPriority, greatestPriority);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaDeviceSetCacheConfig(enum cudaFuncCache cacheConfig) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(enum cudaFuncCache);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaDeviceSetCacheConfig");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(cacheConfig);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaDeviceGetSharedMemConfig(enum cudaSharedMemConfig *pConfig) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(enum cudaSharedMemConfig *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaDeviceGetSharedMemConfig");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pConfig);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaDeviceSetSharedMemConfig(enum cudaSharedMemConfig config) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(enum cudaSharedMemConfig);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaDeviceSetSharedMemConfig");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(config);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaDeviceGetByPCIBusId(int *device, const char *pciBusId) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(int *, const char *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaDeviceGetByPCIBusId");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(device, pciBusId);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaDeviceGetPCIBusId(char *pciBusId,
-                                                            int len,
-                                                            int device) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(char *, int, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaDeviceGetPCIBusId");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pciBusId, len, device);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaIpcGetEventHandle(cudaIpcEventHandle_t *handle, cudaEvent_t event) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaIpcEventHandle_t *, cudaEvent_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaIpcGetEventHandle");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, event);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaIpcOpenEventHandle(cudaEvent_t *event, cudaIpcEventHandle_t handle) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaEvent_t *, cudaIpcEventHandle_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaIpcOpenEventHandle");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(event, handle);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaIpcGetMemHandle(cudaIpcMemHandle_t *handle, void *devPtr) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaIpcMemHandle_t *, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaIpcGetMemHandle");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, devPtr);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaIpcOpenMemHandle(
-    void **devPtr, cudaIpcMemHandle_t handle, unsigned int flags) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(void **, cudaIpcMemHandle_t, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaIpcOpenMemHandle");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(devPtr, handle, flags);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaIpcCloseMemHandle(void *devPtr) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaIpcCloseMemHandle");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(devPtr);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaThreadExit(void) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)();
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaThreadExit");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr();
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaThreadSynchronize(void) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)();
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaThreadSynchronize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr();
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaThreadSetLimit(enum cudaLimit limit,
-                                                         size_t value) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(enum cudaLimit, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaThreadSetLimit");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(limit, value);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaThreadGetLimit(size_t *pValue,
-                                                         enum cudaLimit limit) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(size_t *, enum cudaLimit);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaThreadGetLimit");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pValue, limit);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaThreadGetCacheConfig(enum cudaFuncCache *pCacheConfig) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(enum cudaFuncCache *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaThreadGetCacheConfig");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pCacheConfig);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaThreadSetCacheConfig(enum cudaFuncCache cacheConfig) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(enum cudaFuncCache);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaThreadSetCacheConfig");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(cacheConfig);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaGetLastError(void) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)();
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGetLastError");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr();
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaPeekAtLastError(void) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)();
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaPeekAtLastError");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr();
-}
-
-extern __host__ __cudart_builtin__ const char *CUDARTAPI
-cudaGetErrorName(cudaError_t error) {
-  using FuncPtr = const char *(CUDARTAPI *)(cudaError_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGetErrorName");
-  if (!func_ptr) return "cudaGetErrorName symbol not found.";
-  return func_ptr(error);
-}
-
-extern __host__ __cudart_builtin__ const char *CUDARTAPI
-cudaGetErrorString(cudaError_t error) {
-  using FuncPtr = const char *(CUDARTAPI *)(cudaError_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGetErrorString");
-  if (!func_ptr) return "cudaGetErrorString symbol not found.";
-  return func_ptr(error);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaGetDeviceCount(int *count) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGetDeviceCount");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(count);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaGetDeviceProperties(struct cudaDeviceProp *prop, int device) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(struct cudaDeviceProp *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGetDeviceProperties");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(prop, device);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaDeviceGetAttribute(int *value, enum cudaDeviceAttr attr, int device) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(int *, enum cudaDeviceAttr, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaDeviceGetAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(value, attr, device);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaDeviceGetP2PAttribute(int *value, enum cudaDeviceP2PAttr attr,
-                          int srcDevice, int dstDevice) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(int *, enum cudaDeviceP2PAttr, int, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaDeviceGetP2PAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(value, attr, srcDevice, dstDevice);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaChooseDevice(int *device, const struct cudaDeviceProp *prop) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(int *, const struct cudaDeviceProp *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaChooseDevice");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(device, prop);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaSetDevice(int device) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaSetDevice");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(device);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaGetDevice(int *device) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGetDevice");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(device);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaSetValidDevices(int *device_arr,
-                                                          int len) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(int *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaSetValidDevices");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(device_arr, len);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaSetDeviceFlags(unsigned int flags) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaSetDeviceFlags");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(flags);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGetDeviceFlags(unsigned int *flags) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(unsigned int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGetDeviceFlags");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(flags);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaStreamCreate(cudaStream_t *pStream) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaStream_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaStreamCreate");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pStream);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaStreamCreateWithFlags(cudaStream_t *pStream, unsigned int flags) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaStream_t *, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaStreamCreateWithFlags");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pStream, flags);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaStreamCreateWithPriority(cudaStream_t *pStream, unsigned int flags,
-                             int priority) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaStream_t *, unsigned int, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaStreamCreateWithPriority");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pStream, flags, priority);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaStreamGetPriority(cudaStream_t hStream, int *priority) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaStream_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaStreamGetPriority");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hStream, priority);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaStreamGetFlags(cudaStream_t hStream, unsigned int *flags) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaStream_t, unsigned int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaStreamGetFlags");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hStream, flags);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaStreamDestroy(cudaStream_t stream) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaStreamDestroy");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(stream);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamWaitEvent(
-    cudaStream_t stream, cudaEvent_t event, unsigned int flags) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaStream_t, cudaEvent_t, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaStreamWaitEvent");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(stream, event, flags);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaStreamAddCallback(cudaStream_t stream, cudaStreamCallback_t callback,
-                      void *userData, unsigned int flags) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaStream_t, cudaStreamCallback_t,
-                                           void *, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaStreamAddCallback");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(stream, callback, userData, flags);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaStreamSynchronize(cudaStream_t stream) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaStreamSynchronize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(stream);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaStreamQuery(cudaStream_t stream) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaStreamQuery");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(stream);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaStreamAttachMemAsync(cudaStream_t stream, void *devPtr,
-                         size_t length __dv(0),
-                         unsigned int flags __dv(cudaMemAttachSingle)) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaStream_t, void *, size_t, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaStreamAttachMemAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(stream, devPtr, length, flags);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaEventCreate(cudaEvent_t *event) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaEvent_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaEventCreate");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(event);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaEventCreateWithFlags(cudaEvent_t *event, unsigned int flags) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaEvent_t *, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaEventCreateWithFlags");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(event, flags);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaEventRecord(cudaEvent_t event, cudaStream_t stream __dv(0)) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaEvent_t, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaEventRecord");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(event, stream);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaEventQuery(cudaEvent_t event) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaEvent_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaEventQuery");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(event);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaEventSynchronize(cudaEvent_t event) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaEvent_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaEventSynchronize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(event);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaEventDestroy(cudaEvent_t event) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaEvent_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaEventDestroy");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(event);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaEventElapsedTime(float *ms,
-                                                           cudaEvent_t start,
-                                                           cudaEvent_t end) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(float *, cudaEvent_t, cudaEvent_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaEventElapsedTime");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(ms, start, end);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim, void **args,
-                 size_t sharedMem, cudaStream_t stream) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(const void *, dim3, dim3, void **,
-                                           size_t, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaLaunchKernel");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(func, gridDim, blockDim, args, sharedMem, stream);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaLaunchCooperativeKernel(
-    const void *func, dim3 gridDim, dim3 blockDim, void **args,
-    size_t sharedMem, cudaStream_t stream) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(const void *, dim3, dim3, void **,
-                                           size_t, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaLaunchCooperativeKernel");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(func, gridDim, blockDim, args, sharedMem, stream);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaLaunchCooperativeKernelMultiDevice(
-    struct cudaLaunchParams *launchParamsList, unsigned int numDevices,
-    unsigned int flags __dv(0)) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(struct cudaLaunchParams *,
-                                           unsigned int, unsigned int);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaLaunchCooperativeKernelMultiDevice");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(launchParamsList, numDevices, flags);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaFuncSetCacheConfig(const void *func, enum cudaFuncCache cacheConfig) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(const void *, enum cudaFuncCache);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaFuncSetCacheConfig");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(func, cacheConfig);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaFuncSetSharedMemConfig(const void *func, enum cudaSharedMemConfig config) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(const void *, enum cudaSharedMemConfig);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaFuncSetSharedMemConfig");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(func, config);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaFuncGetAttributes(struct cudaFuncAttributes *attr, const void *func) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(struct cudaFuncAttributes *, const void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaFuncGetAttributes");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(attr, func);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaFuncSetAttribute(const void *func, enum cudaFuncAttribute attr, int value) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(const void *, enum cudaFuncAttribute, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaFuncSetAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(func, attr, value);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaSetDoubleForDevice(double *d) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(double *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaSetDoubleForDevice");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(d);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaSetDoubleForHost(double *d) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(double *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaSetDoubleForHost");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(d);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaOccupancyMaxActiveBlocksPerMultiprocessor(int *numBlocks, const void *func,
-                                              int blockSize,
-                                              size_t dynamicSMemSize) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(int *, const void *, int, size_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaOccupancyMaxActiveBlocksPerMultiprocessor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(numBlocks, func, blockSize, dynamicSMemSize);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(int *numBlocks,
-                                                       const void *func,
-                                                       int blockSize,
-                                                       size_t dynamicSMemSize,
-                                                       unsigned int flags) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(int *, const void *, int, size_t, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>(
-      "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(numBlocks, func, blockSize, dynamicSMemSize, flags);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaConfigureCall(dim3 gridDim, dim3 blockDim, size_t sharedMem __dv(0),
-                  cudaStream_t stream __dv(0)) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(dim3, dim3, size_t, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaConfigureCall");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(gridDim, blockDim, sharedMem, stream);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaSetupArgument(const void *arg,
-                                                        size_t size,
-                                                        size_t offset) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(const void *, size_t, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaSetupArgument");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(arg, size, offset);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaLaunch(const void *func) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(const void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaLaunch");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(func);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMallocManaged(
-    void **devPtr, size_t size, unsigned int flags __dv(cudaMemAttachGlobal)) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void **, size_t, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMallocManaged");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(devPtr, size, flags);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaMalloc(void **devPtr, size_t size) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void **, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMalloc");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(devPtr, size);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMallocHost(void **ptr, size_t size) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void **, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMallocHost");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(ptr, size);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMallocPitch(void **devPtr,
-                                                      size_t *pitch,
-                                                      size_t width,
-                                                      size_t height) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void **, size_t *, size_t, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMallocPitch");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(devPtr, pitch, width, height);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMallocArray(
-    cudaArray_t *array, const struct cudaChannelFormatDesc *desc, size_t width,
-    size_t height __dv(0), unsigned int flags __dv(0)) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaArray_t *,
-                                           const struct cudaChannelFormatDesc *,
-                                           size_t, size_t, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMallocArray");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(array, desc, width, height, flags);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaFree(void *devPtr) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaFree");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(devPtr);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaFreeHost(void *ptr) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaFreeHost");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(ptr);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaFreeArray(cudaArray_t array) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaArray_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaFreeArray");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(array);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaFreeMipmappedArray(cudaMipmappedArray_t mipmappedArray) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaMipmappedArray_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaFreeMipmappedArray");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(mipmappedArray);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaHostAlloc(void **pHost, size_t size,
-                                                    unsigned int flags) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void **, size_t, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaHostAlloc");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pHost, size, flags);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaHostRegister(void *ptr, size_t size,
-                                                       unsigned int flags) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void *, size_t, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaHostRegister");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(ptr, size, flags);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaHostUnregister(void *ptr) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaHostUnregister");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(ptr);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaHostGetDevicePointer(void **pDevice, void *pHost, unsigned int flags) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void **, void *, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaHostGetDevicePointer");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pDevice, pHost, flags);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaHostGetFlags(unsigned int *pFlags,
-                                                       void *pHost) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(unsigned int *, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaHostGetFlags");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pFlags, pHost);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaMalloc3D(struct cudaPitchedPtr *pitchedDevPtr, struct cudaExtent extent) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(struct cudaPitchedPtr *, struct cudaExtent);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMalloc3D");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pitchedDevPtr, extent);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaMalloc3DArray(cudaArray_t *array, const struct cudaChannelFormatDesc *desc,
-                  struct cudaExtent extent, unsigned int flags __dv(0)) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaArray_t *,
-                                           const struct cudaChannelFormatDesc *,
-                                           struct cudaExtent, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMalloc3DArray");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(array, desc, extent, flags);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMallocMipmappedArray(
-    cudaMipmappedArray_t *mipmappedArray,
-    const struct cudaChannelFormatDesc *desc, struct cudaExtent extent,
-    unsigned int numLevels, unsigned int flags __dv(0)) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(
-      cudaMipmappedArray_t *, const struct cudaChannelFormatDesc *,
-      struct cudaExtent, unsigned int, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMallocMipmappedArray");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(mipmappedArray, desc, extent, numLevels, flags);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGetMipmappedArrayLevel(
-    cudaArray_t *levelArray, cudaMipmappedArray_const_t mipmappedArray,
-    unsigned int level) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(
-      cudaArray_t *, cudaMipmappedArray_const_t, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGetMipmappedArrayLevel");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(levelArray, mipmappedArray, level);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaMemcpy3D(const struct cudaMemcpy3DParms *p) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(const struct cudaMemcpy3DParms *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpy3D");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(p);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaMemcpy3DPeer(const struct cudaMemcpy3DPeerParms *p) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(const struct cudaMemcpy3DPeerParms *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpy3DPeer");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(p);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpy3DAsync(
-    const struct cudaMemcpy3DParms *p, cudaStream_t stream __dv(0)) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(const struct cudaMemcpy3DParms *, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpy3DAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(p, stream);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMemcpy3DPeerAsync(
-    const struct cudaMemcpy3DPeerParms *p, cudaStream_t stream __dv(0)) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(const struct cudaMemcpy3DPeerParms *,
-                                           cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpy3DPeerAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(p, stream);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMemGetInfo(size_t *free,
-                                                     size_t *total) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(size_t *, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemGetInfo");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(free, total);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaArrayGetInfo(struct cudaChannelFormatDesc *desc, struct cudaExtent *extent,
-                 unsigned int *flags, cudaArray_t array) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(struct cudaChannelFormatDesc *,
-                                           struct cudaExtent *, unsigned int *,
-                                           cudaArray_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaArrayGetInfo");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(desc, extent, flags, array);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMemcpy(void *dst, const void *src,
-                                                 size_t count,
-                                                 enum cudaMemcpyKind kind) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void *, const void *, size_t,
-                                           enum cudaMemcpyKind);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpy");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dst, src, count, kind);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMemcpyPeer(void *dst, int dstDevice,
-                                                     const void *src,
-                                                     int srcDevice,
-                                                     size_t count) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(void *, int, const void *, int, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpyPeer");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dst, dstDevice, src, srcDevice, count);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaMemcpyToArray(cudaArray_t dst, size_t wOffset, size_t hOffset,
-                  const void *src, size_t count, enum cudaMemcpyKind kind) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(
-      cudaArray_t, size_t, size_t, const void *, size_t, enum cudaMemcpyKind);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpyToArray");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dst, wOffset, hOffset, src, count, kind);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaMemcpyFromArray(void *dst, cudaArray_const_t src, size_t wOffset,
-                    size_t hOffset, size_t count, enum cudaMemcpyKind kind) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void *, cudaArray_const_t, size_t,
-                                           size_t, size_t, enum cudaMemcpyKind);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpyFromArray");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dst, src, wOffset, hOffset, count, kind);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMemcpyArrayToArray(
-    cudaArray_t dst, size_t wOffsetDst, size_t hOffsetDst,
-    cudaArray_const_t src, size_t wOffsetSrc, size_t hOffsetSrc, size_t count,
-    enum cudaMemcpyKind kind __dv(cudaMemcpyDeviceToDevice)) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaArray_t, size_t, size_t, cudaArray_const_t,
-                               size_t, size_t, size_t, enum cudaMemcpyKind);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpyArrayToArray");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dst, wOffsetDst, hOffsetDst, src, wOffsetSrc, hOffsetSrc,
-                  count, kind);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMemcpy2D(void *dst, size_t dpitch,
-                                                   const void *src,
-                                                   size_t spitch, size_t width,
-                                                   size_t height,
-                                                   enum cudaMemcpyKind kind) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void *, size_t, const void *, size_t,
-                                           size_t, size_t, enum cudaMemcpyKind);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpy2D");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dst, dpitch, src, spitch, width, height, kind);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMemcpy2DToArray(
-    cudaArray_t dst, size_t wOffset, size_t hOffset, const void *src,
-    size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaArray_t, size_t, size_t, const void *,
-                               size_t, size_t, size_t, enum cudaMemcpyKind);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpy2DToArray");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dst, wOffset, hOffset, src, spitch, width, height, kind);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMemcpy2DFromArray(
-    void *dst, size_t dpitch, cudaArray_const_t src, size_t wOffset,
-    size_t hOffset, size_t width, size_t height, enum cudaMemcpyKind kind) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(void *, size_t, cudaArray_const_t, size_t,
-                               size_t, size_t, size_t, enum cudaMemcpyKind);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpy2DFromArray");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dst, dpitch, src, wOffset, hOffset, width, height, kind);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMemcpy2DArrayToArray(
-    cudaArray_t dst, size_t wOffsetDst, size_t hOffsetDst,
-    cudaArray_const_t src, size_t wOffsetSrc, size_t hOffsetSrc, size_t width,
-    size_t height, enum cudaMemcpyKind kind __dv(cudaMemcpyDeviceToDevice)) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaArray_t, size_t, size_t,
-                                           cudaArray_const_t, size_t, size_t,
-                                           size_t, size_t, enum cudaMemcpyKind);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpy2DArrayToArray");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dst, wOffsetDst, hOffsetDst, src, wOffsetSrc, hOffsetSrc,
-                  width, height, kind);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMemcpyToSymbol(
-    const void *symbol, const void *src, size_t count, size_t offset __dv(0),
-    enum cudaMemcpyKind kind __dv(cudaMemcpyHostToDevice)) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(const void *, const void *, size_t,
-                                           size_t, enum cudaMemcpyKind);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpyToSymbol");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(symbol, src, count, offset, kind);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMemcpyFromSymbol(
-    void *dst, const void *symbol, size_t count, size_t offset __dv(0),
-    enum cudaMemcpyKind kind __dv(cudaMemcpyDeviceToHost)) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void *, const void *, size_t, size_t,
-                                           enum cudaMemcpyKind);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpyFromSymbol");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dst, symbol, count, offset, kind);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaMemcpyAsync(void *dst, const void *src, size_t count,
-                enum cudaMemcpyKind kind, cudaStream_t stream __dv(0)) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void *, const void *, size_t,
-                                           enum cudaMemcpyKind, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpyAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dst, src, count, kind, stream);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaMemcpyPeerAsync(void *dst, int dstDevice, const void *src, int srcDevice,
-                    size_t count, cudaStream_t stream __dv(0)) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void *, int, const void *, int,
-                                           size_t, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpyPeerAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dst, dstDevice, src, srcDevice, count, stream);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMemcpyToArrayAsync(
-    cudaArray_t dst, size_t wOffset, size_t hOffset, const void *src,
-    size_t count, enum cudaMemcpyKind kind, cudaStream_t stream __dv(0)) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaArray_t, size_t, size_t, const void *,
-                               size_t, enum cudaMemcpyKind, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpyToArrayAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dst, wOffset, hOffset, src, count, kind, stream);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMemcpyFromArrayAsync(
-    void *dst, cudaArray_const_t src, size_t wOffset, size_t hOffset,
-    size_t count, enum cudaMemcpyKind kind, cudaStream_t stream __dv(0)) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(void *, cudaArray_const_t, size_t, size_t,
-                               size_t, enum cudaMemcpyKind, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpyFromArrayAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dst, src, wOffset, hOffset, count, kind, stream);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpy2DAsync(
-    void *dst, size_t dpitch, const void *src, size_t spitch, size_t width,
-    size_t height, enum cudaMemcpyKind kind, cudaStream_t stream __dv(0)) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(void *, size_t, const void *, size_t, size_t,
-                               size_t, enum cudaMemcpyKind, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpy2DAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dst, dpitch, src, spitch, width, height, kind, stream);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMemcpy2DToArrayAsync(
-    cudaArray_t dst, size_t wOffset, size_t hOffset, const void *src,
-    size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind,
-    cudaStream_t stream __dv(0)) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaArray_t, size_t, size_t,
-                                           const void *, size_t, size_t, size_t,
-                                           enum cudaMemcpyKind, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpy2DToArrayAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dst, wOffset, hOffset, src, spitch, width, height, kind,
-                  stream);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMemcpy2DFromArrayAsync(
-    void *dst, size_t dpitch, cudaArray_const_t src, size_t wOffset,
-    size_t hOffset, size_t width, size_t height, enum cudaMemcpyKind kind,
-    cudaStream_t stream __dv(0)) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void *, size_t, cudaArray_const_t,
-                                           size_t, size_t, size_t, size_t,
-                                           enum cudaMemcpyKind, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpy2DFromArrayAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dst, dpitch, src, wOffset, hOffset, width, height, kind,
-                  stream);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMemcpyToSymbolAsync(
-    const void *symbol, const void *src, size_t count, size_t offset,
-    enum cudaMemcpyKind kind, cudaStream_t stream __dv(0)) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(const void *, const void *, size_t, size_t,
-                               enum cudaMemcpyKind, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpyToSymbolAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(symbol, src, count, offset, kind, stream);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMemcpyFromSymbolAsync(
-    void *dst, const void *symbol, size_t count, size_t offset,
-    enum cudaMemcpyKind kind, cudaStream_t stream __dv(0)) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void *, const void *, size_t, size_t,
-                                           enum cudaMemcpyKind, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemcpyFromSymbolAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dst, symbol, count, offset, kind, stream);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMemset(void *devPtr, int value,
-                                                 size_t count) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void *, int, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemset");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(devPtr, value, count);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMemset2D(void *devPtr, size_t pitch,
-                                                   int value, size_t width,
-                                                   size_t height) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void *, size_t, int, size_t, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemset2D");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(devPtr, pitch, value, width, height);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMemset3D(
-    struct cudaPitchedPtr pitchedDevPtr, int value, struct cudaExtent extent) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(struct cudaPitchedPtr, int, struct cudaExtent);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemset3D");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pitchedDevPtr, value, extent);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemsetAsync(
-    void *devPtr, int value, size_t count, cudaStream_t stream __dv(0)) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void *, int, size_t, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemsetAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(devPtr, value, count, stream);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaMemset2DAsync(void *devPtr, size_t pitch, int value, size_t width,
-                  size_t height, cudaStream_t stream __dv(0)) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void *, size_t, int, size_t, size_t,
-                                           cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemset2DAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(devPtr, pitch, value, width, height, stream);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaMemset3DAsync(struct cudaPitchedPtr pitchedDevPtr, int value,
-                  struct cudaExtent extent, cudaStream_t stream __dv(0)) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(struct cudaPitchedPtr, int,
-                                           struct cudaExtent, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemset3DAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pitchedDevPtr, value, extent, stream);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGetSymbolAddress(void **devPtr,
-                                                           const void *symbol) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(void **, const void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGetSymbolAddress");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(devPtr, symbol);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGetSymbolSize(size_t *size,
-                                                        const void *symbol) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(size_t *, const void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGetSymbolSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(size, symbol);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaMemPrefetchAsync(const void *devPtr, size_t count, int dstDevice,
-                     cudaStream_t stream __dv(0)) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(const void *, size_t, int, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemPrefetchAsync");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(devPtr, count, dstDevice, stream);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaMemAdvise(const void *devPtr, size_t count, enum cudaMemoryAdvise advice,
-              int device) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(const void *, size_t,
-                                           enum cudaMemoryAdvise, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemAdvise");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(devPtr, count, advice, device);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMemRangeGetAttribute(
-    void *data, size_t dataSize, enum cudaMemRangeAttribute attribute,
-    const void *devPtr, size_t count) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(
-      void *, size_t, enum cudaMemRangeAttribute, const void *, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemRangeGetAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(data, dataSize, attribute, devPtr, count);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaMemRangeGetAttributes(
-    void **data, size_t *dataSizes, enum cudaMemRangeAttribute *attributes,
-    size_t numAttributes, const void *devPtr, size_t count) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(void **, size_t *, enum cudaMemRangeAttribute *,
-                               size_t, const void *, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaMemRangeGetAttributes");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(data, dataSizes, attributes, numAttributes, devPtr, count);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaPointerGetAttributes(
-    struct cudaPointerAttributes *attributes, const void *ptr) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(struct cudaPointerAttributes *, const void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaPointerGetAttributes");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(attributes, ptr);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaDeviceCanAccessPeer(int *canAccessPeer, int device, int peerDevice) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(int *, int, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaDeviceCanAccessPeer");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(canAccessPeer, device, peerDevice);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaDeviceEnablePeerAccess(int peerDevice, unsigned int flags) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(int, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaDeviceEnablePeerAccess");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(peerDevice, flags);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaDeviceDisablePeerAccess(int peerDevice) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaDeviceDisablePeerAccess");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(peerDevice);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaGraphicsUnregisterResource(cudaGraphicsResource_t resource) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraphicsResource_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphicsUnregisterResource");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(resource);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphicsResourceSetMapFlags(
-    cudaGraphicsResource_t resource, unsigned int flags) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaGraphicsResource_t, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphicsResourceSetMapFlags");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(resource, flags);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphicsMapResources(
-    int count, cudaGraphicsResource_t *resources, cudaStream_t stream __dv(0)) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(int, cudaGraphicsResource_t *, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphicsMapResources");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(count, resources, stream);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphicsUnmapResources(
-    int count, cudaGraphicsResource_t *resources, cudaStream_t stream __dv(0)) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(int, cudaGraphicsResource_t *, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphicsUnmapResources");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(count, resources, stream);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphicsResourceGetMappedPointer(
-    void **devPtr, size_t *size, cudaGraphicsResource_t resource) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(void **, size_t *, cudaGraphicsResource_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaGraphicsResourceGetMappedPointer");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(devPtr, size, resource);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGraphicsSubResourceGetMappedArray(
-    cudaArray_t *array, cudaGraphicsResource_t resource,
-    unsigned int arrayIndex, unsigned int mipLevel) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(
-      cudaArray_t *, cudaGraphicsResource_t, unsigned int, unsigned int);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaGraphicsSubResourceGetMappedArray");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(array, resource, arrayIndex, mipLevel);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaGraphicsResourceGetMappedMipmappedArray(
-    cudaMipmappedArray_t *mipmappedArray, cudaGraphicsResource_t resource) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(cudaMipmappedArray_t *, cudaGraphicsResource_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaGraphicsResourceGetMappedMipmappedArray");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(mipmappedArray, resource);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGetChannelDesc(
-    struct cudaChannelFormatDesc *desc, cudaArray_const_t array) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(struct cudaChannelFormatDesc *,
-                                           cudaArray_const_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGetChannelDesc");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(desc, array);
-}
-
-extern __host__ struct cudaChannelFormatDesc CUDARTAPI cudaCreateChannelDesc(
-    int x, int y, int z, int w, enum cudaChannelFormatKind f) {
-  using FuncPtr = struct cudaChannelFormatDesc(CUDARTAPI *)(
-      int, int, int, int, enum cudaChannelFormatKind);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaCreateChannelDesc");
-  if (!func_ptr) {
-    return cudaChannelFormatDesc{cudaChannelFormatKind(-1), 0, 0, 0};
-  }
-  return func_ptr(x, y, z, w, f);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaBindTexture(
-    size_t *offset, const struct textureReference *texref, const void *devPtr,
-    const struct cudaChannelFormatDesc *desc, size_t size __dv(UINT_MAX)) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(
-      size_t *, const struct textureReference *, const void *,
-      const struct cudaChannelFormatDesc *, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaBindTexture");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(offset, texref, devPtr, desc, size);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaBindTexture2D(size_t *offset, const struct textureReference *texref,
-                  const void *devPtr, const struct cudaChannelFormatDesc *desc,
-                  size_t width, size_t height, size_t pitch) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(
-      size_t *, const struct textureReference *, const void *,
-      const struct cudaChannelFormatDesc *, size_t, size_t, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaBindTexture2D");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(offset, texref, devPtr, desc, width, height, pitch);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaBindTextureToArray(
-    const struct textureReference *texref, cudaArray_const_t array,
-    const struct cudaChannelFormatDesc *desc) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(
-      const struct textureReference *, cudaArray_const_t,
-      const struct cudaChannelFormatDesc *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaBindTextureToArray");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(texref, array, desc);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaBindTextureToMipmappedArray(const struct textureReference *texref,
-                                cudaMipmappedArray_const_t mipmappedArray,
-                                const struct cudaChannelFormatDesc *desc) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(
-      const struct textureReference *, cudaMipmappedArray_const_t,
-      const struct cudaChannelFormatDesc *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaBindTextureToMipmappedArray");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(texref, mipmappedArray, desc);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaUnbindTexture(const struct textureReference *texref) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(const struct textureReference *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaUnbindTexture");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(texref);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGetTextureAlignmentOffset(
-    size_t *offset, const struct textureReference *texref) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(size_t *, const struct textureReference *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGetTextureAlignmentOffset");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(offset, texref);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGetTextureReference(
-    const struct textureReference **texref, const void *symbol) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(const struct textureReference **, const void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGetTextureReference");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(texref, symbol);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaBindSurfaceToArray(
-    const struct surfaceReference *surfref, cudaArray_const_t array,
-    const struct cudaChannelFormatDesc *desc) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(
-      const struct surfaceReference *, cudaArray_const_t,
-      const struct cudaChannelFormatDesc *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaBindSurfaceToArray");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(surfref, array, desc);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGetSurfaceReference(
-    const struct surfaceReference **surfref, const void *symbol) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(const struct surfaceReference **, const void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGetSurfaceReference");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(surfref, symbol);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaCreateTextureObject(
-    cudaTextureObject_t *pTexObject, const struct cudaResourceDesc *pResDesc,
-    const struct cudaTextureDesc *pTexDesc,
-    const struct cudaResourceViewDesc *pResViewDesc) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(
-      cudaTextureObject_t *, const struct cudaResourceDesc *,
-      const struct cudaTextureDesc *, const struct cudaResourceViewDesc *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaCreateTextureObject");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pTexObject, pResDesc, pTexDesc, pResViewDesc);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaDestroyTextureObject(cudaTextureObject_t texObject) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaTextureObject_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaDestroyTextureObject");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(texObject);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGetTextureObjectResourceDesc(
-    struct cudaResourceDesc *pResDesc, cudaTextureObject_t texObject) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(struct cudaResourceDesc *, cudaTextureObject_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaGetTextureObjectResourceDesc");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pResDesc, texObject);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGetTextureObjectTextureDesc(
-    struct cudaTextureDesc *pTexDesc, cudaTextureObject_t texObject) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(struct cudaTextureDesc *, cudaTextureObject_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGetTextureObjectTextureDesc");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pTexDesc, texObject);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGetTextureObjectResourceViewDesc(
-    struct cudaResourceViewDesc *pResViewDesc, cudaTextureObject_t texObject) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(struct cudaResourceViewDesc *,
-                                           cudaTextureObject_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaGetTextureObjectResourceViewDesc");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pResViewDesc, texObject);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaCreateSurfaceObject(
-    cudaSurfaceObject_t *pSurfObject, const struct cudaResourceDesc *pResDesc) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaSurfaceObject_t *,
-                                           const struct cudaResourceDesc *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaCreateSurfaceObject");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pSurfObject, pResDesc);
-}
-
-extern __host__ cudaError_t CUDARTAPI
-cudaDestroySurfaceObject(cudaSurfaceObject_t surfObject) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(cudaSurfaceObject_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaDestroySurfaceObject");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(surfObject);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGetSurfaceObjectResourceDesc(
-    struct cudaResourceDesc *pResDesc, cudaSurfaceObject_t surfObject) {
-  using FuncPtr =
-      cudaError_t(CUDARTAPI *)(struct cudaResourceDesc *, cudaSurfaceObject_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudaGetSurfaceObjectResourceDesc");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(pResDesc, surfObject);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaDriverGetVersion(int *driverVersion) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaDriverGetVersion");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(driverVersion);
-}
-
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
-cudaRuntimeGetVersion(int *runtimeVersion) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaRuntimeGetVersion");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(runtimeVersion);
-}
-
-extern __host__ cudaError_t CUDARTAPI cudaGetExportTable(
-    const void **ppExportTable, const cudaUUID_t *pExportTableId) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(const void **, const cudaUUID_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGetExportTable");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(ppExportTable, pExportTableId);
-}
-
-}  // extern "C"
diff --git a/third_party/xla/third_party/tsl/tsl/cuda/cuda_stub.cc b/third_party/xla/third_party/tsl/tsl/cuda/cuda_stub.cc
index aa00b3bbeb5875..a199d4cc700442 100644
--- a/third_party/xla/third_party/tsl/tsl/cuda/cuda_stub.cc
+++ b/third_party/xla/third_party/tsl/tsl/cuda/cuda_stub.cc
@@ -33,40 +33,40 @@ void* GetDsoHandle() {
 #endif
 }
 
-template <typename T>
-T LoadSymbol(const char* symbol_name) {
+void* LoadSymbol(const char* symbol_name) {
   void* symbol = nullptr;
   if (auto handle = GetDsoHandle()) {
     tsl::Env::Default()
         ->GetSymbolFromLibrary(handle, symbol_name, &symbol)
         .IgnoreError();
   }
-  return reinterpret_cast<T>(symbol);
+  return symbol;
 }
 
-CUresult GetSymbolNotFoundError() {
+const char* kSymbols[] = {
+#include "tsl/cuda/cuda.inc"
+};
+
+constexpr size_t kNumSymbols = sizeof(kSymbols) / sizeof(const char*);
+
+}  // namespace
+
+extern "C" {
+
+static CUresult GetSymbolNotFoundError() {
   return CUDA_ERROR_SHARED_OBJECT_INIT_FAILED;
 }
-}  // namespace
 
-#if CUDA_VERSION < 10000
-#error CUDA version earlier than 10 is not supported.
-#endif
+extern void* _cuda_tramp_table[];
 
-#ifndef __CUDA_DEPRECATED
-#define __CUDA_DEPRECATED
-#endif
+void _cuda_tramp_resolve(int i) {
+  CHECK_LE(0, i);
+  CHECK_LT(i, kNumSymbols);
+  void* p = LoadSymbol(kSymbols[i]);
+  if (!p) {
+    p = reinterpret_cast<void*>(&GetSymbolNotFoundError);
+  }
+  _cuda_tramp_table[i] = p;
+}
 
-#if CUDA_VERSION < 10010
-#include "tsl/cuda/cuda_10_0.inc"
-#elif CUDA_VERSION < 10020
-#include "tsl/cuda/cuda_10_1.inc"
-#elif CUDA_VERSION < 11000
-#include "tsl/cuda/cuda_10_2.inc"
-#elif CUDA_VERSION < 11020
-#include "tsl/cuda/cuda_11_0.inc"
-#elif CUDA_VERSION < 12000
-#include "tsl/cuda/cuda_11_2.inc"
-#else
-#include "tsl/cuda/cuda_12_0.inc"
-#endif
+}  // extern "C"
diff --git a/third_party/xla/third_party/tsl/tsl/cuda/cudart.symbols b/third_party/xla/third_party/tsl/tsl/cuda/cudart.symbols
new file mode 100644
index 00000000000000..69b990cb3879b5
--- /dev/null
+++ b/third_party/xla/third_party/tsl/tsl/cuda/cudart.symbols
@@ -0,0 +1,399 @@
+__cudaGetKernel
+__cudaInitModule
+__cudaLaunchKernel
+__cudaLaunchKernel_ptsz
+__cudaPopCallConfiguration
+__cudaPushCallConfiguration
+__cudaRegisterFatBinary
+__cudaRegisterFatBinaryEnd
+__cudaRegisterFunction
+__cudaRegisterHostVar
+__cudaRegisterManagedVar
+__cudaRegisterUnifiedTable
+__cudaRegisterVar
+__cudaUnregisterFatBinary
+cudaArrayGetInfo
+cudaArrayGetMemoryRequirements
+cudaArrayGetPlane
+cudaArrayGetSparseProperties
+cudaChooseDevice
+cudaCreateChannelDesc
+cudaCreateSurfaceObject
+cudaCreateTextureObject
+cudaCtxResetPersistingL2Cache
+cudaDestroyExternalMemory
+cudaDestroyExternalSemaphore
+cudaDestroySurfaceObject
+cudaDestroyTextureObject
+cudaDeviceCanAccessPeer
+cudaDeviceDisablePeerAccess
+cudaDeviceEnablePeerAccess
+cudaDeviceFlushGPUDirectRDMAWrites
+cudaDeviceGetAttribute
+cudaDeviceGetByPCIBusId
+cudaDeviceGetCacheConfig
+cudaDeviceGetDefaultMemPool
+cudaDeviceGetGraphMemAttribute
+cudaDeviceGetLimit
+cudaDeviceGetMemPool
+cudaDeviceGetNvSciSyncAttributes
+cudaDeviceGetP2PAttribute
+cudaDeviceGetPCIBusId
+cudaDeviceGetSharedMemConfig
+cudaDeviceGetStreamPriorityRange
+cudaDeviceGetTexture1DLinearMaxWidth
+cudaDeviceGraphMemTrim
+cudaDeviceReset
+cudaDeviceSetCacheConfig
+cudaDeviceSetGraphMemAttribute
+cudaDeviceSetLimit
+cudaDeviceSetMemPool
+cudaDeviceSetSharedMemConfig
+cudaDeviceSynchronize
+cudaDriverGetVersion
+cudaEGLStreamConsumerAcquireFrame
+cudaEGLStreamConsumerConnect
+cudaEGLStreamConsumerConnectWithFlags
+cudaEGLStreamConsumerDisconnect
+cudaEGLStreamConsumerReleaseFrame
+cudaEGLStreamProducerConnect
+cudaEGLStreamProducerDisconnect
+cudaEGLStreamProducerPresentFrame
+cudaEGLStreamProducerReturnFrame
+cudaEventCreate
+cudaEventCreateFromEGLSync
+cudaEventCreateWithFlags
+cudaEventDestroy
+cudaEventElapsedTime
+cudaEventQuery
+cudaEventRecord
+cudaEventRecordWithFlags
+cudaEventRecordWithFlags_ptsz
+cudaEventRecord_ptsz
+cudaEventSynchronize
+cudaExternalMemoryGetMappedBuffer
+cudaExternalMemoryGetMappedMipmappedArray
+cudaFree
+cudaFreeArray
+cudaFreeAsync
+cudaFreeAsync_ptsz
+cudaFreeHost
+cudaFreeMipmappedArray
+cudaFuncGetAttributes
+cudaFuncSetAttribute
+cudaFuncSetCacheConfig
+cudaFuncSetSharedMemConfig
+cudaGLGetDevices
+cudaGLMapBufferObject
+cudaGLMapBufferObjectAsync
+cudaGLRegisterBufferObject
+cudaGLSetBufferObjectMapFlags
+cudaGLSetGLDevice
+cudaGLUnmapBufferObject
+cudaGLUnmapBufferObjectAsync
+cudaGLUnregisterBufferObject
+cudaGetChannelDesc
+cudaGetDevice
+cudaGetDeviceCount
+cudaGetDeviceFlags
+cudaGetDeviceProperties
+cudaGetDeviceProperties_v2
+cudaGetDriverEntryPoint
+cudaGetDriverEntryPoint_ptsz
+cudaGetErrorName
+cudaGetErrorString
+cudaGetExportTable
+cudaGetFuncBySymbol
+cudaGetKernel
+cudaGetLastError
+cudaGetMipmappedArrayLevel
+cudaGetSurfaceObjectResourceDesc
+cudaGetSymbolAddress
+cudaGetSymbolSize
+cudaGetTextureObjectResourceDesc
+cudaGetTextureObjectResourceViewDesc
+cudaGetTextureObjectTextureDesc
+cudaGraphAddChildGraphNode
+cudaGraphAddDependencies
+cudaGraphAddEmptyNode
+cudaGraphAddEventRecordNode
+cudaGraphAddEventWaitNode
+cudaGraphAddExternalSemaphoresSignalNode
+cudaGraphAddExternalSemaphoresWaitNode
+cudaGraphAddHostNode
+cudaGraphAddKernelNode
+cudaGraphAddMemAllocNode
+cudaGraphAddMemFreeNode
+cudaGraphAddMemcpyNode
+cudaGraphAddMemcpyNode1D
+cudaGraphAddMemcpyNodeFromSymbol
+cudaGraphAddMemcpyNodeToSymbol
+cudaGraphAddMemsetNode
+cudaGraphAddNode
+cudaGraphChildGraphNodeGetGraph
+cudaGraphClone
+cudaGraphCreate
+cudaGraphDebugDotPrint
+cudaGraphDestroy
+cudaGraphDestroyNode
+cudaGraphEventRecordNodeGetEvent
+cudaGraphEventRecordNodeSetEvent
+cudaGraphEventWaitNodeGetEvent
+cudaGraphEventWaitNodeSetEvent
+cudaGraphExecChildGraphNodeSetParams
+cudaGraphExecDestroy
+cudaGraphExecEventRecordNodeSetEvent
+cudaGraphExecEventWaitNodeSetEvent
+cudaGraphExecExternalSemaphoresSignalNodeSetParams
+cudaGraphExecExternalSemaphoresWaitNodeSetParams
+cudaGraphExecGetFlags
+cudaGraphExecHostNodeSetParams
+cudaGraphExecKernelNodeSetParams
+cudaGraphExecMemcpyNodeSetParams
+cudaGraphExecMemcpyNodeSetParams1D
+cudaGraphExecMemcpyNodeSetParamsFromSymbol
+cudaGraphExecMemcpyNodeSetParamsToSymbol
+cudaGraphExecMemsetNodeSetParams
+cudaGraphExecNodeSetParams
+cudaGraphExecUpdate
+cudaGraphExternalSemaphoresSignalNodeGetParams
+cudaGraphExternalSemaphoresSignalNodeSetParams
+cudaGraphExternalSemaphoresWaitNodeGetParams
+cudaGraphExternalSemaphoresWaitNodeSetParams
+cudaGraphGetEdges
+cudaGraphGetNodes
+cudaGraphGetRootNodes
+cudaGraphHostNodeGetParams
+cudaGraphHostNodeSetParams
+cudaGraphInstantiate
+cudaGraphInstantiateWithFlags
+cudaGraphInstantiateWithParams
+cudaGraphInstantiateWithParams_ptsz
+cudaGraphKernelNodeCopyAttributes
+cudaGraphKernelNodeGetAttribute
+cudaGraphKernelNodeGetParams
+cudaGraphKernelNodeSetAttribute
+cudaGraphKernelNodeSetParams
+cudaGraphLaunch
+cudaGraphLaunch_ptsz
+cudaGraphMemAllocNodeGetParams
+cudaGraphMemFreeNodeGetParams
+cudaGraphMemcpyNodeGetParams
+cudaGraphMemcpyNodeSetParams
+cudaGraphMemcpyNodeSetParams1D
+cudaGraphMemcpyNodeSetParamsFromSymbol
+cudaGraphMemcpyNodeSetParamsToSymbol
+cudaGraphMemsetNodeGetParams
+cudaGraphMemsetNodeSetParams
+cudaGraphNodeFindInClone
+cudaGraphNodeGetDependencies
+cudaGraphNodeGetDependentNodes
+cudaGraphNodeGetEnabled
+cudaGraphNodeGetType
+cudaGraphNodeSetEnabled
+cudaGraphNodeSetParams
+cudaGraphReleaseUserObject
+cudaGraphRemoveDependencies
+cudaGraphRetainUserObject
+cudaGraphUpload
+cudaGraphUpload_ptsz
+cudaGraphicsEGLRegisterImage
+cudaGraphicsGLRegisterBuffer
+cudaGraphicsGLRegisterImage
+cudaGraphicsMapResources
+cudaGraphicsResourceGetMappedEglFrame
+cudaGraphicsResourceGetMappedMipmappedArray
+cudaGraphicsResourceGetMappedPointer
+cudaGraphicsResourceSetMapFlags
+cudaGraphicsSubResourceGetMappedArray
+cudaGraphicsUnmapResources
+cudaGraphicsUnregisterResource
+cudaGraphicsVDPAURegisterOutputSurface
+cudaGraphicsVDPAURegisterVideoSurface
+cudaHostAlloc
+cudaHostGetDevicePointer
+cudaHostGetFlags
+cudaHostRegister
+cudaHostUnregister
+cudaImportExternalMemory
+cudaImportExternalSemaphore
+cudaInitDevice
+cudaIpcCloseMemHandle
+cudaIpcGetEventHandle
+cudaIpcGetMemHandle
+cudaIpcOpenEventHandle
+cudaIpcOpenMemHandle
+cudaLaunchCooperativeKernel
+cudaLaunchCooperativeKernelMultiDevice
+cudaLaunchCooperativeKernel_ptsz
+cudaLaunchHostFunc
+cudaLaunchHostFunc_ptsz
+cudaLaunchKernel
+cudaLaunchKernelExC
+cudaLaunchKernelExC_ptsz
+cudaLaunchKernel_ptsz
+cudaMalloc
+cudaMalloc3D
+cudaMalloc3DArray
+cudaMallocArray
+cudaMallocAsync
+cudaMallocAsync_ptsz
+cudaMallocFromPoolAsync
+cudaMallocFromPoolAsync_ptsz
+cudaMallocHost
+cudaMallocManaged
+cudaMallocMipmappedArray
+cudaMallocPitch
+cudaMemAdvise
+cudaMemAdvise_v2
+cudaMemGetInfo
+cudaMemPoolCreate
+cudaMemPoolDestroy
+cudaMemPoolExportPointer
+cudaMemPoolExportToShareableHandle
+cudaMemPoolGetAccess
+cudaMemPoolGetAttribute
+cudaMemPoolImportFromShareableHandle
+cudaMemPoolImportPointer
+cudaMemPoolSetAccess
+cudaMemPoolSetAttribute
+cudaMemPoolTrimTo
+cudaMemPrefetchAsync
+cudaMemPrefetchAsync_ptsz
+cudaMemPrefetchAsync_v2
+cudaMemPrefetchAsync_v2_ptsz
+cudaMemRangeGetAttribute
+cudaMemRangeGetAttributes
+cudaMemcpy
+cudaMemcpy2D
+cudaMemcpy2DArrayToArray
+cudaMemcpy2DArrayToArray_ptds
+cudaMemcpy2DAsync
+cudaMemcpy2DAsync_ptsz
+cudaMemcpy2DFromArray
+cudaMemcpy2DFromArrayAsync
+cudaMemcpy2DFromArrayAsync_ptsz
+cudaMemcpy2DFromArray_ptds
+cudaMemcpy2DToArray
+cudaMemcpy2DToArrayAsync
+cudaMemcpy2DToArrayAsync_ptsz
+cudaMemcpy2DToArray_ptds
+cudaMemcpy2D_ptds
+cudaMemcpy3D
+cudaMemcpy3DAsync
+cudaMemcpy3DAsync_ptsz
+cudaMemcpy3DPeer
+cudaMemcpy3DPeerAsync
+cudaMemcpy3DPeerAsync_ptsz
+cudaMemcpy3DPeer_ptds
+cudaMemcpy3D_ptds
+cudaMemcpyArrayToArray
+cudaMemcpyArrayToArray_ptds
+cudaMemcpyAsync
+cudaMemcpyAsync_ptsz
+cudaMemcpyFromArray
+cudaMemcpyFromArrayAsync
+cudaMemcpyFromArrayAsync_ptsz
+cudaMemcpyFromArray_ptds
+cudaMemcpyFromSymbol
+cudaMemcpyFromSymbolAsync
+cudaMemcpyFromSymbolAsync_ptsz
+cudaMemcpyFromSymbol_ptds
+cudaMemcpyPeer
+cudaMemcpyPeerAsync
+cudaMemcpyToArray
+cudaMemcpyToArrayAsync
+cudaMemcpyToArrayAsync_ptsz
+cudaMemcpyToArray_ptds
+cudaMemcpyToSymbol
+cudaMemcpyToSymbolAsync
+cudaMemcpyToSymbolAsync_ptsz
+cudaMemcpyToSymbol_ptds
+cudaMemcpy_ptds
+cudaMemset
+cudaMemset2D
+cudaMemset2DAsync
+cudaMemset2DAsync_ptsz
+cudaMemset2D_ptds
+cudaMemset3D
+cudaMemset3DAsync
+cudaMemset3DAsync_ptsz
+cudaMemset3D_ptds
+cudaMemsetAsync
+cudaMemsetAsync_ptsz
+cudaMemset_ptds
+cudaMipmappedArrayGetMemoryRequirements
+cudaMipmappedArrayGetSparseProperties
+cudaOccupancyAvailableDynamicSMemPerBlock
+cudaOccupancyMaxActiveBlocksPerMultiprocessor
+cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags
+cudaOccupancyMaxActiveClusters
+cudaOccupancyMaxPotentialClusterSize
+cudaPeekAtLastError
+cudaPointerGetAttributes
+cudaProfilerStart
+cudaProfilerStop
+cudaRuntimeGetVersion
+cudaSetDevice
+cudaSetDeviceFlags
+cudaSetDoubleForDevice
+cudaSetDoubleForHost
+cudaSetValidDevices
+cudaSignalExternalSemaphoresAsync
+cudaSignalExternalSemaphoresAsync_ptsz
+cudaSignalExternalSemaphoresAsync_v2
+cudaSignalExternalSemaphoresAsync_v2_ptsz
+cudaStreamAddCallback
+cudaStreamAddCallback_ptsz
+cudaStreamAttachMemAsync
+cudaStreamAttachMemAsync_ptsz
+cudaStreamBeginCapture
+cudaStreamBeginCapture_ptsz
+cudaStreamCopyAttributes
+cudaStreamCopyAttributes_ptsz
+cudaStreamCreate
+cudaStreamCreateWithFlags
+cudaStreamCreateWithPriority
+cudaStreamDestroy
+cudaStreamEndCapture
+cudaStreamEndCapture_ptsz
+cudaStreamGetAttribute
+cudaStreamGetAttribute_ptsz
+cudaStreamGetCaptureInfo
+cudaStreamGetCaptureInfo_ptsz
+cudaStreamGetCaptureInfo_v2
+cudaStreamGetCaptureInfo_v2_ptsz
+cudaStreamGetFlags
+cudaStreamGetFlags_ptsz
+cudaStreamGetId
+cudaStreamGetId_ptsz
+cudaStreamGetPriority
+cudaStreamGetPriority_ptsz
+cudaStreamIsCapturing
+cudaStreamIsCapturing_ptsz
+cudaStreamQuery
+cudaStreamQuery_ptsz
+cudaStreamSetAttribute
+cudaStreamSetAttribute_ptsz
+cudaStreamSynchronize
+cudaStreamSynchronize_ptsz
+cudaStreamUpdateCaptureDependencies
+cudaStreamUpdateCaptureDependencies_ptsz
+cudaStreamWaitEvent
+cudaStreamWaitEvent_ptsz
+cudaThreadExchangeStreamCaptureMode
+cudaThreadExit
+cudaThreadGetCacheConfig
+cudaThreadGetLimit
+cudaThreadSetCacheConfig
+cudaThreadSetLimit
+cudaThreadSynchronize
+cudaUserObjectCreate
+cudaUserObjectRelease
+cudaUserObjectRetain
+cudaVDPAUGetDevice
+cudaVDPAUSetVDPAUDevice
+cudaWaitExternalSemaphoresAsync
+cudaWaitExternalSemaphoresAsync_ptsz
+cudaWaitExternalSemaphoresAsync_v2
+cudaWaitExternalSemaphoresAsync_v2_ptsz
diff --git a/third_party/xla/third_party/tsl/tsl/cuda/cudart_stub.cc b/third_party/xla/third_party/tsl/tsl/cuda/cudart_stub.cc
index aa11f03d5a3e12..cfae868dc667fc 100644
--- a/third_party/xla/third_party/tsl/tsl/cuda/cudart_stub.cc
+++ b/third_party/xla/third_party/tsl/tsl/cuda/cudart_stub.cc
@@ -34,119 +34,37 @@ void *GetDsoHandle() {
   return handle;
 }
 
-template <typename T>
-T LoadSymbol(const char *symbol_name) {
+void *LoadSymbol(const char *symbol_name) {
   void *symbol = nullptr;
   auto env = tsl::Env::Default();
   env->GetSymbolFromLibrary(GetDsoHandle(), symbol_name, &symbol).IgnoreError();
-  return reinterpret_cast<T>(symbol);
+  return symbol;
 }
-cudaError_t GetSymbolNotFoundError() {
-  return cudaErrorSharedObjectSymbolNotFound;
-}
-}  // namespace
-
-#define __dv(v)
-#define __CUDA_DEPRECATED
 
-// A bunch of new symbols were introduced in version 10
-#if CUDART_VERSION < 10000
-#include "tsl/cuda/cuda_runtime_9_0.inc"
-#elif CUDART_VERSION < 10010
-#include "tsl/cuda/cuda_runtime_10_0.inc"
-#elif CUDART_VERSION < 10020
-#include "tsl/cuda/cuda_runtime_10_1.inc"
-#elif CUDART_VERSION < 11000
-#include "tsl/cuda/cuda_runtime_10_2.inc"
-#elif CUDART_VERSION < 11020
-#include "tsl/cuda/cuda_runtime_11_0.inc"
-#elif CUDART_VERSION < 11080
-#include "tsl/cuda/cuda_runtime_11_2.inc"
-#elif CUDART_VERSION < 12000
-#include "tsl/cuda/cuda_runtime_11_8.inc"
-#else
-#include "tsl/cuda/cuda_runtime_12_0.inc"
-#endif
-#undef __dv
-#undef __CUDA_DEPRECATED
-
-extern "C" {
-
-// Following are private symbols in libcudart that got inserted by nvcc.
-extern void CUDARTAPI __cudaRegisterFunction(
-    void **fatCubinHandle, const char *hostFun, char *deviceFun,
-    const char *deviceName, int thread_limit, uint3 *tid, uint3 *bid,
-    dim3 *bDim, dim3 *gDim, int *wSize) {
-  using FuncPtr = void(CUDARTAPI *)(void **fatCubinHandle, const char *hostFun,
-                                    char *deviceFun, const char *deviceName,
-                                    int thread_limit, uint3 *tid, uint3 *bid,
-                                    dim3 *bDim, dim3 *gDim, int *wSize);
-  static auto func_ptr = LoadSymbol<FuncPtr>("__cudaRegisterFunction");
-  if (!func_ptr) return;
-  func_ptr(fatCubinHandle, hostFun, deviceFun, deviceName, thread_limit, tid,
-           bid, bDim, gDim, wSize);
-}
+const char *kSymbols[] = {
+#include "tsl/cuda/cudart.inc"
+};
 
-extern void CUDARTAPI __cudaUnregisterFatBinary(void **fatCubinHandle) {
-  using FuncPtr = void(CUDARTAPI *)(void **fatCubinHandle);
-  static auto func_ptr = LoadSymbol<FuncPtr>("__cudaUnregisterFatBinary");
-  if (!func_ptr) return;
-  func_ptr(fatCubinHandle);
-}
+constexpr size_t kNumSymbols = sizeof(kSymbols) / sizeof(const char *);
 
-extern void CUDARTAPI __cudaRegisterVar(void **fatCubinHandle, char *hostVar,
-                                        char *deviceAddress,
-                                        const char *deviceName, int ext,
-                                        size_t size, int constant, int global) {
-  using FuncPtr = void(CUDARTAPI *)(
-      void **fatCubinHandle, char *hostVar, char *deviceAddress,
-      const char *deviceName, int ext, size_t size, int constant, int global);
-  static auto func_ptr = LoadSymbol<FuncPtr>("__cudaRegisterVar");
-  if (!func_ptr) return;
-  func_ptr(fatCubinHandle, hostVar, deviceAddress, deviceName, ext, size,
-           constant, global);
-}
+}  // namespace
 
-extern void **CUDARTAPI __cudaRegisterFatBinary(void *fatCubin) {
-  using FuncPtr = void **(CUDARTAPI *)(void *fatCubin);
-  static auto func_ptr = LoadSymbol<FuncPtr>("__cudaRegisterFatBinary");
-  if (!func_ptr) return nullptr;
-  return (void **)func_ptr(fatCubin);
-}
+extern "C" {
 
-extern cudaError_t CUDARTAPI __cudaPopCallConfiguration(dim3 *gridDim,
-                                                        dim3 *blockDim,
-                                                        size_t *sharedMem,
-                                                        void *stream) {
-  using FuncPtr = cudaError_t(CUDARTAPI *)(dim3 * gridDim, dim3 * blockDim,
-                                           size_t * sharedMem, void *stream);
-  static auto func_ptr = LoadSymbol<FuncPtr>("__cudaPopCallConfiguration");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(gridDim, blockDim, sharedMem, stream);
+static cudaError_t CudartGetSymbolNotFoundError() {
+  return cudaErrorSharedObjectSymbolNotFound;
 }
 
-extern __host__ __device__ unsigned CUDARTAPI __cudaPushCallConfiguration(
-    dim3 gridDim, dim3 blockDim, size_t sharedMem = 0, void *stream = 0) {
-  using FuncPtr = unsigned(CUDARTAPI *)(dim3 gridDim, dim3 blockDim,
-                                        size_t sharedMem, void *stream);
-  static auto func_ptr = LoadSymbol<FuncPtr>("__cudaPushCallConfiguration");
-  if (!func_ptr) return 0;
-  return func_ptr(gridDim, blockDim, sharedMem, stream);
-}
+extern void *_cudart_tramp_table[];
 
-extern char CUDARTAPI __cudaInitModule(void **fatCubinHandle) {
-  using FuncPtr = char(CUDARTAPI *)(void **fatCubinHandle);
-  static auto func_ptr = LoadSymbol<FuncPtr>("__cudaInitModule");
-  if (!func_ptr) return 0;
-  return func_ptr(fatCubinHandle);
+void _cudart_tramp_resolve(int i) {
+  CHECK_LE(0, i);
+  CHECK_LT(i, kNumSymbols);
+  void *p = LoadSymbol(kSymbols[i]);
+  if (!p) {
+    p = reinterpret_cast<void *>(&CudartGetSymbolNotFoundError);
+  }
+  _cudart_tramp_table[i] = p;
 }
 
-#if CUDART_VERSION >= 10010
-extern void CUDARTAPI __cudaRegisterFatBinaryEnd(void **fatCubinHandle) {
-  using FuncPtr = void(CUDARTAPI *)(void **fatCubinHandle);
-  static auto func_ptr = LoadSymbol<FuncPtr>("__cudaRegisterFatBinaryEnd");
-  if (!func_ptr) return;
-  func_ptr(fatCubinHandle);
-}
-#endif
 }  // extern "C"
diff --git a/third_party/xla/third_party/tsl/tsl/cuda/cudnn.symbols b/third_party/xla/third_party/tsl/tsl/cuda/cudnn.symbols
new file mode 100644
index 00000000000000..2c4dbd71030b38
--- /dev/null
+++ b/third_party/xla/third_party/tsl/tsl/cuda/cudnn.symbols
@@ -0,0 +1,268 @@
+cudnnActivationBackward
+cudnnActivationForward
+cudnnAddTensor
+cudnnAdvInferVersionCheck
+cudnnAdvTrainVersionCheck
+cudnnBackendCreateDescriptor
+cudnnBackendDestroyDescriptor
+cudnnBackendExecute
+cudnnBackendFinalize
+cudnnBackendGetAttribute
+cudnnBackendInitialize
+cudnnBackendSetAttribute
+cudnnBatchNormalizationBackward
+cudnnBatchNormalizationBackwardEx
+cudnnBatchNormalizationForwardInference
+cudnnBatchNormalizationForwardTraining
+cudnnBatchNormalizationForwardTrainingEx
+cudnnBuildRNNDynamic
+cudnnCTCLoss
+cudnnCTCLoss_v8
+cudnnCnnInferVersionCheck
+cudnnCnnTrainVersionCheck
+cudnnConvolutionBackwardBias
+cudnnConvolutionBackwardData
+cudnnConvolutionBackwardFilter
+cudnnConvolutionBiasActivationForward
+cudnnConvolutionForward
+cudnnCopyAlgorithmDescriptor
+cudnnCreate
+cudnnCreateActivationDescriptor
+cudnnCreateAlgorithmDescriptor
+cudnnCreateAlgorithmPerformance
+cudnnCreateAttnDescriptor
+cudnnCreateCTCLossDescriptor
+cudnnCreateConvolutionDescriptor
+cudnnCreateDropoutDescriptor
+cudnnCreateFilterDescriptor
+cudnnCreateFusedOpsConstParamPack
+cudnnCreateFusedOpsPlan
+cudnnCreateFusedOpsVariantParamPack
+cudnnCreateLRNDescriptor
+cudnnCreateOpTensorDescriptor
+cudnnCreatePersistentRNNPlan
+cudnnCreatePoolingDescriptor
+cudnnCreateRNNDataDescriptor
+cudnnCreateRNNDescriptor
+cudnnCreateReduceTensorDescriptor
+cudnnCreateSeqDataDescriptor
+cudnnCreateSpatialTransformerDescriptor
+cudnnCreateTensorDescriptor
+cudnnCreateTensorTransformDescriptor
+cudnnDeriveBNTensorDescriptor
+cudnnDeriveNormTensorDescriptor
+cudnnDestroy
+cudnnDestroyActivationDescriptor
+cudnnDestroyAlgorithmDescriptor
+cudnnDestroyAlgorithmPerformance
+cudnnDestroyAttnDescriptor
+cudnnDestroyCTCLossDescriptor
+cudnnDestroyConvolutionDescriptor
+cudnnDestroyDropoutDescriptor
+cudnnDestroyFilterDescriptor
+cudnnDestroyFusedOpsConstParamPack
+cudnnDestroyFusedOpsPlan
+cudnnDestroyFusedOpsVariantParamPack
+cudnnDestroyLRNDescriptor
+cudnnDestroyOpTensorDescriptor
+cudnnDestroyPersistentRNNPlan
+cudnnDestroyPoolingDescriptor
+cudnnDestroyRNNDataDescriptor
+cudnnDestroyRNNDescriptor
+cudnnDestroyReduceTensorDescriptor
+cudnnDestroySeqDataDescriptor
+cudnnDestroySpatialTransformerDescriptor
+cudnnDestroyTensorDescriptor
+cudnnDestroyTensorTransformDescriptor
+cudnnDivisiveNormalizationBackward
+cudnnDivisiveNormalizationForward
+cudnnDropoutBackward
+cudnnDropoutForward
+cudnnDropoutGetReserveSpaceSize
+cudnnDropoutGetStatesSize
+cudnnFindConvolutionBackwardDataAlgorithm
+cudnnFindConvolutionBackwardDataAlgorithmEx
+cudnnFindConvolutionBackwardFilterAlgorithm
+cudnnFindConvolutionBackwardFilterAlgorithmEx
+cudnnFindConvolutionForwardAlgorithm
+cudnnFindConvolutionForwardAlgorithmEx
+cudnnFindRNNBackwardDataAlgorithmEx
+cudnnFindRNNBackwardWeightsAlgorithmEx
+cudnnFindRNNForwardInferenceAlgorithmEx
+cudnnFindRNNForwardTrainingAlgorithmEx
+cudnnFusedOpsExecute
+cudnnGetActivationDescriptor
+cudnnGetActivationDescriptorSwishBeta
+cudnnGetAlgorithmDescriptor
+cudnnGetAlgorithmPerformance
+cudnnGetAlgorithmSpaceSize
+cudnnGetAttnDescriptor
+cudnnGetBackdoor
+cudnnGetBatchNormalizationBackwardExWorkspaceSize
+cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize
+cudnnGetBatchNormalizationTrainingExReserveSpaceSize
+cudnnGetCTCLossDescriptor
+cudnnGetCTCLossDescriptorEx
+cudnnGetCTCLossDescriptor_v8
+cudnnGetCTCLossWorkspaceSize
+cudnnGetCTCLossWorkspaceSize_v8
+cudnnGetCallback
+cudnnGetConvolution2dDescriptor
+cudnnGetConvolution2dForwardOutputDim
+cudnnGetConvolutionBackwardDataAlgorithmMaxCount
+cudnnGetConvolutionBackwardDataAlgorithm_v7
+cudnnGetConvolutionBackwardDataWorkspaceSize
+cudnnGetConvolutionBackwardFilterAlgorithmMaxCount
+cudnnGetConvolutionBackwardFilterAlgorithm_v7
+cudnnGetConvolutionBackwardFilterWorkspaceSize
+cudnnGetConvolutionForwardAlgorithmMaxCount
+cudnnGetConvolutionForwardAlgorithm_v7
+cudnnGetConvolutionForwardWorkspaceSize
+cudnnGetConvolutionGroupCount
+cudnnGetConvolutionMathType
+cudnnGetConvolutionNdDescriptor
+cudnnGetConvolutionNdForwardOutputDim
+cudnnGetConvolutionReorderType
+cudnnGetCudartVersion
+cudnnGetDropoutDescriptor
+cudnnGetErrorString
+cudnnGetFilter4dDescriptor
+cudnnGetFilterNdDescriptor
+cudnnGetFilterSizeInBytes
+cudnnGetFoldedConvBackwardDataDescriptors
+cudnnGetFusedOpsConstParamPackAttribute
+cudnnGetFusedOpsVariantParamPackAttribute
+cudnnGetLRNDescriptor
+cudnnGetMaxDeviceVersion
+cudnnGetMultiHeadAttnBuffers
+cudnnGetMultiHeadAttnWeights
+cudnnGetNormalizationBackwardWorkspaceSize
+cudnnGetNormalizationForwardTrainingWorkspaceSize
+cudnnGetNormalizationTrainingReserveSpaceSize
+cudnnGetOpTensorDescriptor
+cudnnGetPooling2dDescriptor
+cudnnGetPooling2dForwardOutputDim
+cudnnGetPoolingNdDescriptor
+cudnnGetPoolingNdForwardOutputDim
+cudnnGetProperty
+cudnnGetRNNBackwardDataAlgorithmMaxCount
+cudnnGetRNNBackwardWeightsAlgorithmMaxCount
+cudnnGetRNNBiasMode
+cudnnGetRNNDataDescriptor
+cudnnGetRNNDescriptor_v6
+cudnnGetRNNDescriptor_v8
+cudnnGetRNNDropoutLocationsInternal
+cudnnGetRNNForwardInferenceAlgorithmMaxCount
+cudnnGetRNNForwardTrainingAlgorithmMaxCount
+cudnnGetRNNLinLayerBiasParams
+cudnnGetRNNLinLayerMatrixParams
+cudnnGetRNNMatrixMathType
+cudnnGetRNNPaddingMode
+cudnnGetRNNParamsSize
+cudnnGetRNNProjectionLayers
+cudnnGetRNNTempSpaceSizes
+cudnnGetRNNTrainingReserveSize
+cudnnGetRNNWeightParams
+cudnnGetRNNWeightSpaceSize
+cudnnGetRNNWorkspaceSize
+cudnnGetReduceTensorDescriptor
+cudnnGetReductionIndicesSize
+cudnnGetReductionWorkspaceSize
+cudnnGetSeqDataDescriptor
+cudnnGetStream
+cudnnGetTensor4dDescriptor
+cudnnGetTensorNdDescriptor
+cudnnGetTensorSizeInBytes
+cudnnGetTensorTransformDescriptor
+cudnnGetVersion
+cudnnIm2Col
+cudnnInitTransformDest
+cudnnLRNCrossChannelBackward
+cudnnLRNCrossChannelForward
+cudnnMakeFusedOpsPlan
+cudnnMultiHeadAttnBackwardData
+cudnnMultiHeadAttnBackwardWeights
+cudnnMultiHeadAttnForward
+cudnnNormalizationBackward
+cudnnNormalizationForwardInference
+cudnnNormalizationForwardTraining
+cudnnOpTensor
+cudnnOpsInferVersionCheck
+cudnnOpsTrainVersionCheck
+cudnnPoolingBackward
+cudnnPoolingForward
+cudnnQueryRuntimeError
+cudnnRNNBackwardData
+cudnnRNNBackwardDataEx
+cudnnRNNBackwardData_v8
+cudnnRNNBackwardWeights
+cudnnRNNBackwardWeightsEx
+cudnnRNNBackwardWeights_v8
+cudnnRNNForward
+cudnnRNNForwardInference
+cudnnRNNForwardInferenceEx
+cudnnRNNForwardTraining
+cudnnRNNForwardTrainingEx
+cudnnRNNGetClip
+cudnnRNNGetClip_v8
+cudnnRNNSetClip
+cudnnRNNSetClip_v8
+cudnnReduceTensor
+cudnnReorderFilterAndBias
+cudnnRestoreAlgorithm
+cudnnRestoreDropoutDescriptor
+cudnnSaveAlgorithm
+cudnnScaleTensor
+cudnnSetActivationDescriptor
+cudnnSetActivationDescriptorSwishBeta
+cudnnSetAlgorithmDescriptor
+cudnnSetAlgorithmPerformance
+cudnnSetAttnDescriptor
+cudnnSetBackdoor
+cudnnSetBackdoorEx
+cudnnSetCTCLossDescriptor
+cudnnSetCTCLossDescriptorEx
+cudnnSetCTCLossDescriptor_v8
+cudnnSetCallback
+cudnnSetConvolution2dDescriptor
+cudnnSetConvolutionGroupCount
+cudnnSetConvolutionMathType
+cudnnSetConvolutionNdDescriptor
+cudnnSetConvolutionReorderType
+cudnnSetDropoutDescriptor
+cudnnSetFilter4dDescriptor
+cudnnSetFilterNdDescriptor
+cudnnSetFusedOpsConstParamPackAttribute
+cudnnSetFusedOpsVariantParamPackAttribute
+cudnnSetLRNDescriptor
+cudnnSetOpTensorDescriptor
+cudnnSetPersistentRNNPlan
+cudnnSetPooling2dDescriptor
+cudnnSetPoolingNdDescriptor
+cudnnSetRNNAlgorithmDescriptor
+cudnnSetRNNBiasMode
+cudnnSetRNNDataDescriptor
+cudnnSetRNNDescriptor_v6
+cudnnSetRNNDescriptor_v8
+cudnnSetRNNMatrixMathType
+cudnnSetRNNPaddingMode
+cudnnSetRNNProjectionLayers
+cudnnSetReduceTensorDescriptor
+cudnnSetSeqDataDescriptor
+cudnnSetSpatialTransformerNdDescriptor
+cudnnSetStream
+cudnnSetTensor
+cudnnSetTensor4dDescriptor
+cudnnSetTensor4dDescriptorEx
+cudnnSetTensorNdDescriptor
+cudnnSetTensorNdDescriptorEx
+cudnnSetTensorTransformDescriptor
+cudnnSoftmaxBackward
+cudnnSoftmaxForward
+cudnnSpatialTfGridGeneratorBackward
+cudnnSpatialTfGridGeneratorForward
+cudnnSpatialTfSamplerBackward
+cudnnSpatialTfSamplerForward
+cudnnTransformFilter
+cudnnTransformTensor
+cudnnTransformTensorEx
diff --git a/third_party/xla/third_party/tsl/tsl/cuda/cudnn_6_0.inc b/third_party/xla/third_party/tsl/tsl/cuda/cudnn_6_0.inc
deleted file mode 100644
index 11288983a4a1a4..00000000000000
--- a/third_party/xla/third_party/tsl/tsl/cuda/cudnn_6_0.inc
+++ /dev/null
@@ -1,1825 +0,0 @@
-// Auto-generated, do not edit.
-
-extern "C" {
-
-size_t CUDNNWINAPI cudnnGetVersion(void) {
-  using FuncPtr = size_t(CUDNNWINAPI *)();
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetVersion");
-  if (!func_ptr) return 0;
-  return func_ptr();
-}
-
-size_t CUDNNWINAPI cudnnGetCudartVersion(void) {
-  using FuncPtr = size_t(CUDNNWINAPI *)();
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetCudartVersion");
-  if (!func_ptr) return 0;
-  return func_ptr();
-}
-
-const char *CUDNNWINAPI cudnnGetErrorString(cudnnStatus_t status) {
-  using FuncPtr = const char *(CUDNNWINAPI *)(cudnnStatus_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetErrorString");
-  if (!func_ptr) return "cudnnGetErrorString symbol not found.";
-  return func_ptr(status);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetProperty(libraryPropertyType type,
-                                           int *value) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(libraryPropertyType, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetProperty");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(type, value);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnCreate(cudnnHandle_t *handle) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnHandle_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnCreate");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnDestroy(cudnnHandle_t handle) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnHandle_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDestroy");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetStream(cudnnHandle_t handle,
-                                         cudaStream_t streamId) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnHandle_t, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetStream");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, streamId);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetStream(cudnnHandle_t handle,
-                                         cudaStream_t *streamId) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnHandle_t, cudaStream_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetStream");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, streamId);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnCreateTensorDescriptor(cudnnTensorDescriptor_t *tensorDesc) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnTensorDescriptor_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnCreateTensorDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(tensorDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetTensor4dDescriptor(
-    cudnnTensorDescriptor_t tensorDesc, cudnnTensorFormat_t format,
-    cudnnDataType_t dataType,  // image data type
-    int n,                     // number of inputs (batch size)
-    int c,                     // number of input feature maps
-    int h,                     // height of input section
-    int w) {
-  using FuncPtr =
-      cudnnStatus_t(CUDNNWINAPI *)(cudnnTensorDescriptor_t, cudnnTensorFormat_t,
-                                   cudnnDataType_t, int, int, int, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetTensor4dDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(tensorDesc, format, dataType, n, c, h, w);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetTensor4dDescriptorEx(
-    cudnnTensorDescriptor_t tensorDesc,
-    cudnnDataType_t dataType,  // image data type
-    int n,                     // number of inputs (batch size)
-    int c,                     // number of input feature maps
-    int h,                     // height of input section
-    int w,                     // width of input section
-    int nStride, int cStride, int hStride, int wStride) {
-  using FuncPtr =
-      cudnnStatus_t(CUDNNWINAPI *)(cudnnTensorDescriptor_t, cudnnDataType_t,
-                                   int, int, int, int, int, int, int, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetTensor4dDescriptorEx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(tensorDesc, dataType, n, c, h, w, nStride, cStride, hStride,
-                  wStride);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetTensor4dDescriptor(
-    const cudnnTensorDescriptor_t tensorDesc,
-    cudnnDataType_t *dataType,  // image data type
-    int *n,                     // number of inputs (batch size)
-    int *c,                     // number of input feature maps
-    int *h,                     // height of input section
-    int *w,                     // width of input section
-    int *nStride, int *cStride, int *hStride, int *wStride) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      const cudnnTensorDescriptor_t, cudnnDataType_t *, int *, int *, int *,
-      int *, int *, int *, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetTensor4dDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(tensorDesc, dataType, n, c, h, w, nStride, cStride, hStride,
-                  wStride);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetTensorNdDescriptor(
-    cudnnTensorDescriptor_t tensorDesc, cudnnDataType_t dataType, int nbDims,
-    const int dimA[], const int strideA[]) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnTensorDescriptor_t, cudnnDataType_t, int, const int[], const int[]);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetTensorNdDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(tensorDesc, dataType, nbDims, dimA, strideA);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetTensorNdDescriptorEx(
-    cudnnTensorDescriptor_t tensorDesc, cudnnTensorFormat_t format,
-    cudnnDataType_t dataType, int nbDims, const int dimA[]) {
-  using FuncPtr =
-      cudnnStatus_t(CUDNNWINAPI *)(cudnnTensorDescriptor_t, cudnnTensorFormat_t,
-                                   cudnnDataType_t, int, const int[]);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetTensorNdDescriptorEx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(tensorDesc, format, dataType, nbDims, dimA);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetTensorNdDescriptor(
-    const cudnnTensorDescriptor_t tensorDesc, int nbDimsRequested,
-    cudnnDataType_t *dataType, int *nbDims, int dimA[], int strideA[]) {
-  using FuncPtr =
-      cudnnStatus_t(CUDNNWINAPI *)(const cudnnTensorDescriptor_t, int,
-                                   cudnnDataType_t *, int *, int[], int[]);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetTensorNdDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(tensorDesc, nbDimsRequested, dataType, nbDims, dimA, strideA);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetTensorSizeInBytes(
-    const cudnnTensorDescriptor_t tensorDesc, size_t *size) {
-  using FuncPtr =
-      cudnnStatus_t(CUDNNWINAPI *)(const cudnnTensorDescriptor_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetTensorSizeInBytes");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(tensorDesc, size);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnDestroyTensorDescriptor(cudnnTensorDescriptor_t tensorDesc) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnTensorDescriptor_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDestroyTensorDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(tensorDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnTransformTensor(
-    cudnnHandle_t handle, const void *alpha,
-    const cudnnTensorDescriptor_t xDesc, const void *x, const void *beta,
-    const cudnnTensorDescriptor_t yDesc, void *y) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const void *, const cudnnTensorDescriptor_t, const void *,
-      const void *, const cudnnTensorDescriptor_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnTransformTensor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, alpha, xDesc, x, beta, yDesc, y);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnAddTensor(cudnnHandle_t handle,
-                                         const void *alpha,
-                                         const cudnnTensorDescriptor_t aDesc,
-                                         const void *A, const void *beta,
-                                         const cudnnTensorDescriptor_t cDesc,
-                                         void *C) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const void *, const cudnnTensorDescriptor_t, const void *,
-      const void *, const cudnnTensorDescriptor_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnAddTensor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, alpha, aDesc, A, beta, cDesc, C);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnCreateOpTensorDescriptor(cudnnOpTensorDescriptor_t *opTensorDesc) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnOpTensorDescriptor_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnCreateOpTensorDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(opTensorDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetOpTensorDescriptor(
-    cudnnOpTensorDescriptor_t opTensorDesc, cudnnOpTensorOp_t opTensorOp,
-    cudnnDataType_t opTensorCompType, cudnnNanPropagation_t opTensorNanOpt) {
-  using FuncPtr =
-      cudnnStatus_t(CUDNNWINAPI *)(cudnnOpTensorDescriptor_t, cudnnOpTensorOp_t,
-                                   cudnnDataType_t, cudnnNanPropagation_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetOpTensorDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(opTensorDesc, opTensorOp, opTensorCompType, opTensorNanOpt);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetOpTensorDescriptor(
-    const cudnnOpTensorDescriptor_t opTensorDesc, cudnnOpTensorOp_t *opTensorOp,
-    cudnnDataType_t *opTensorCompType, cudnnNanPropagation_t *opTensorNanOpt) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      const cudnnOpTensorDescriptor_t, cudnnOpTensorOp_t *, cudnnDataType_t *,
-      cudnnNanPropagation_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetOpTensorDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(opTensorDesc, opTensorOp, opTensorCompType, opTensorNanOpt);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnDestroyOpTensorDescriptor(cudnnOpTensorDescriptor_t opTensorDesc) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnOpTensorDescriptor_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDestroyOpTensorDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(opTensorDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnOpTensor(
-    cudnnHandle_t handle, const cudnnOpTensorDescriptor_t opTensorDesc,
-    const void *alpha1, const cudnnTensorDescriptor_t aDesc, const void *A,
-    const void *alpha2, const cudnnTensorDescriptor_t bDesc, const void *B,
-    const void *beta, const cudnnTensorDescriptor_t cDesc, void *C) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnOpTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *, const void *,
-      const cudnnTensorDescriptor_t, const void *, const void *,
-      const cudnnTensorDescriptor_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnOpTensor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, opTensorDesc, alpha1, aDesc, A, alpha2, bDesc, B,
-                  beta, cDesc, C);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnCreateReduceTensorDescriptor(
-    cudnnReduceTensorDescriptor_t *reduceTensorDesc) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnReduceTensorDescriptor_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnCreateReduceTensorDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(reduceTensorDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetReduceTensorDescriptor(
-    cudnnReduceTensorDescriptor_t reduceTensorDesc,
-    cudnnReduceTensorOp_t reduceTensorOp, cudnnDataType_t reduceTensorCompType,
-    cudnnNanPropagation_t reduceTensorNanOpt,
-    cudnnReduceTensorIndices_t reduceTensorIndices,
-    cudnnIndicesType_t reduceTensorIndicesType) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnReduceTensorDescriptor_t, cudnnReduceTensorOp_t, cudnnDataType_t,
-      cudnnNanPropagation_t, cudnnReduceTensorIndices_t, cudnnIndicesType_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetReduceTensorDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(reduceTensorDesc, reduceTensorOp, reduceTensorCompType,
-                  reduceTensorNanOpt, reduceTensorIndices,
-                  reduceTensorIndicesType);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetReduceTensorDescriptor(
-    const cudnnReduceTensorDescriptor_t reduceTensorDesc,
-    cudnnReduceTensorOp_t *reduceTensorOp,
-    cudnnDataType_t *reduceTensorCompType,
-    cudnnNanPropagation_t *reduceTensorNanOpt,
-    cudnnReduceTensorIndices_t *reduceTensorIndices,
-    cudnnIndicesType_t *reduceTensorIndicesType) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      const cudnnReduceTensorDescriptor_t, cudnnReduceTensorOp_t *,
-      cudnnDataType_t *, cudnnNanPropagation_t *, cudnnReduceTensorIndices_t *,
-      cudnnIndicesType_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetReduceTensorDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(reduceTensorDesc, reduceTensorOp, reduceTensorCompType,
-                  reduceTensorNanOpt, reduceTensorIndices,
-                  reduceTensorIndicesType);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnDestroyReduceTensorDescriptor(
-    cudnnReduceTensorDescriptor_t reduceTensorDesc) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnReduceTensorDescriptor_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnDestroyReduceTensorDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(reduceTensorDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetReductionIndicesSize(
-    cudnnHandle_t handle, const cudnnReduceTensorDescriptor_t reduceTensorDesc,
-    const cudnnTensorDescriptor_t aDesc, const cudnnTensorDescriptor_t cDesc,
-    size_t *sizeInBytes) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnReduceTensorDescriptor_t,
-      const cudnnTensorDescriptor_t, const cudnnTensorDescriptor_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetReductionIndicesSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, reduceTensorDesc, aDesc, cDesc, sizeInBytes);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetReductionWorkspaceSize(
-    cudnnHandle_t handle, const cudnnReduceTensorDescriptor_t reduceTensorDesc,
-    const cudnnTensorDescriptor_t aDesc, const cudnnTensorDescriptor_t cDesc,
-    size_t *sizeInBytes) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnReduceTensorDescriptor_t,
-      const cudnnTensorDescriptor_t, const cudnnTensorDescriptor_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetReductionWorkspaceSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, reduceTensorDesc, aDesc, cDesc, sizeInBytes);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnReduceTensor(
-    cudnnHandle_t handle, const cudnnReduceTensorDescriptor_t reduceTensorDesc,
-    void *indices, size_t indicesSizeInBytes, void *workspace,
-    size_t workspaceSizeInBytes, const void *alpha,
-    const cudnnTensorDescriptor_t aDesc, const void *A, const void *beta,
-    const cudnnTensorDescriptor_t cDesc, void *C) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnReduceTensorDescriptor_t, void *, size_t,
-      void *, size_t, const void *, const cudnnTensorDescriptor_t, const void *,
-      const void *, const cudnnTensorDescriptor_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnReduceTensor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, reduceTensorDesc, indices, indicesSizeInBytes,
-                  workspace, workspaceSizeInBytes, alpha, aDesc, A, beta, cDesc,
-                  C);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetTensor(cudnnHandle_t handle,
-                                         const cudnnTensorDescriptor_t yDesc,
-                                         void *y, const void *valuePtr) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnTensorDescriptor_t, void *, const void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetTensor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, yDesc, y, valuePtr);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnScaleTensor(cudnnHandle_t handle,
-                                           const cudnnTensorDescriptor_t yDesc,
-                                           void *y, const void *alpha) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnTensorDescriptor_t, void *, const void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnScaleTensor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, yDesc, y, alpha);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnCreateFilterDescriptor(cudnnFilterDescriptor_t *filterDesc) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnFilterDescriptor_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnCreateFilterDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(filterDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnSetFilter4dDescriptor(cudnnFilterDescriptor_t filterDesc,
-                           cudnnDataType_t dataType,  // image data type
-                           cudnnTensorFormat_t format,
-                           int k,  // number of output feature maps
-                           int c,  // number of input feature maps
-                           int h,  // height of each input filter
-                           int w) {
-  using FuncPtr =
-      cudnnStatus_t(CUDNNWINAPI *)(cudnnFilterDescriptor_t, cudnnDataType_t,
-                                   cudnnTensorFormat_t, int, int, int, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetFilter4dDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(filterDesc, dataType, format, k, c, h, w);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnGetFilter4dDescriptor(const cudnnFilterDescriptor_t filterDesc,
-                           cudnnDataType_t *dataType,  // image data type
-                           cudnnTensorFormat_t *format,
-                           int *k,  // number of output feature maps
-                           int *c,  // number of input feature maps
-                           int *h,  // height of each input filter
-                           int *w) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      const cudnnFilterDescriptor_t, cudnnDataType_t *, cudnnTensorFormat_t *,
-      int *, int *, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetFilter4dDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(filterDesc, dataType, format, k, c, h, w);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetFilterNdDescriptor(
-    cudnnFilterDescriptor_t filterDesc,
-    cudnnDataType_t dataType,  // image data type
-    cudnnTensorFormat_t format, int nbDims, const int filterDimA[]) {
-  using FuncPtr =
-      cudnnStatus_t(CUDNNWINAPI *)(cudnnFilterDescriptor_t, cudnnDataType_t,
-                                   cudnnTensorFormat_t, int, const int[]);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetFilterNdDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(filterDesc, dataType, format, nbDims, filterDimA);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetFilterNdDescriptor(
-    const cudnnFilterDescriptor_t filterDesc, int nbDimsRequested,
-    cudnnDataType_t *dataType,  // image data type
-    cudnnTensorFormat_t *format, int *nbDims, int filterDimA[]) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      const cudnnFilterDescriptor_t, int, cudnnDataType_t *,
-      cudnnTensorFormat_t *, int *, int[]);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetFilterNdDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(filterDesc, nbDimsRequested, dataType, format, nbDims,
-                  filterDimA);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnDestroyFilterDescriptor(cudnnFilterDescriptor_t filterDesc) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnFilterDescriptor_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDestroyFilterDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(filterDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnCreateConvolutionDescriptor(cudnnConvolutionDescriptor_t *convDesc) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnConvolutionDescriptor_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnCreateConvolutionDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(convDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetConvolution2dDescriptor(
-    cudnnConvolutionDescriptor_t convDesc,
-    int pad_h,       // zero-padding height
-    int pad_w,       // zero-padding width
-    int u,           // vertical filter stride
-    int v,           // horizontal filter stride
-    int dilation_h,  // filter dilation in the vertical dimension
-    int dilation_w,  // filter dilation in the horizontal dimension
-    cudnnConvolutionMode_t mode, cudnnDataType_t computeType) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnConvolutionDescriptor_t, int, int, int, int, int, int,
-      cudnnConvolutionMode_t, cudnnDataType_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetConvolution2dDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(convDesc, pad_h, pad_w, u, v, dilation_h, dilation_w, mode,
-                  computeType);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetConvolution2dDescriptor(
-    const cudnnConvolutionDescriptor_t convDesc,
-    int *pad_h,       // zero-padding height
-    int *pad_w,       // zero-padding width
-    int *u,           // vertical filter stride
-    int *v,           // horizontal filter stride
-    int *dilation_h,  // filter dilation in the vertical dimension
-    int *dilation_w,  // filter dilation in the horizontal dimension
-    cudnnConvolutionMode_t *mode, cudnnDataType_t *computeType) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      const cudnnConvolutionDescriptor_t, int *, int *, int *, int *, int *,
-      int *, cudnnConvolutionMode_t *, cudnnDataType_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetConvolution2dDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(convDesc, pad_h, pad_w, u, v, dilation_h, dilation_w, mode,
-                  computeType);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetConvolution2dForwardOutputDim(
-    const cudnnConvolutionDescriptor_t convDesc,
-    const cudnnTensorDescriptor_t inputTensorDesc,
-    const cudnnFilterDescriptor_t filterDesc, int *n, int *c, int *h, int *w) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t,
-      const cudnnFilterDescriptor_t, int *, int *, int *, int *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnGetConvolution2dForwardOutputDim");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(convDesc, inputTensorDesc, filterDesc, n, c, h, w);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetConvolutionNdDescriptor(
-    cudnnConvolutionDescriptor_t convDesc, int arrayLength, /* nbDims-2 size */
-    const int padA[], const int filterStrideA[], const int dilationA[],
-    cudnnConvolutionMode_t mode, cudnnDataType_t computeType) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnConvolutionDescriptor_t, int, const int[], const int[], const int[],
-      cudnnConvolutionMode_t, cudnnDataType_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetConvolutionNdDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(convDesc, arrayLength, padA, filterStrideA, dilationA, mode,
-                  computeType);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionNdDescriptor(
-    const cudnnConvolutionDescriptor_t convDesc, int arrayLengthRequested,
-    int *arrayLength, int padA[], int strideA[], int dilationA[],
-    cudnnConvolutionMode_t *mode, cudnnDataType_t *computeType) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      const cudnnConvolutionDescriptor_t, int, int *, int[], int[], int[],
-      cudnnConvolutionMode_t *, cudnnDataType_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetConvolutionNdDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(convDesc, arrayLengthRequested, arrayLength, padA, strideA,
-                  dilationA, mode, computeType);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionNdForwardOutputDim(
-    const cudnnConvolutionDescriptor_t convDesc,
-    const cudnnTensorDescriptor_t inputTensorDesc,
-    const cudnnFilterDescriptor_t filterDesc, int nbDims,
-    int tensorOutputDimA[]) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t,
-      const cudnnFilterDescriptor_t, int, int[]);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnGetConvolutionNdForwardOutputDim");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(convDesc, inputTensorDesc, filterDesc, nbDims,
-                  tensorOutputDimA);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnDestroyConvolutionDescriptor(cudnnConvolutionDescriptor_t convDesc) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnConvolutionDescriptor_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnDestroyConvolutionDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(convDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnFindConvolutionForwardAlgorithm(
-    cudnnHandle_t handle, const cudnnTensorDescriptor_t xDesc,
-    const cudnnFilterDescriptor_t wDesc,
-    const cudnnConvolutionDescriptor_t convDesc,
-    const cudnnTensorDescriptor_t yDesc, const int requestedAlgoCount,
-    int *returnedAlgoCount, cudnnConvolutionFwdAlgoPerf_t *perfResults) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnTensorDescriptor_t,
-      const cudnnFilterDescriptor_t, const cudnnConvolutionDescriptor_t,
-      const cudnnTensorDescriptor_t, const int, int *,
-      cudnnConvolutionFwdAlgoPerf_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnFindConvolutionForwardAlgorithm");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, xDesc, wDesc, convDesc, yDesc, requestedAlgoCount,
-                  returnedAlgoCount, perfResults);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnFindConvolutionForwardAlgorithmEx(
-    cudnnHandle_t handle, const cudnnTensorDescriptor_t xDesc, const void *x,
-    const cudnnFilterDescriptor_t wDesc, const void *w,
-    const cudnnConvolutionDescriptor_t convDesc,
-    const cudnnTensorDescriptor_t yDesc, void *y, const int requestedAlgoCount,
-    int *returnedAlgoCount, cudnnConvolutionFwdAlgoPerf_t *perfResults,
-    void *workSpace, size_t workSpaceSizeInBytes) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnTensorDescriptor_t, const void *,
-      const cudnnFilterDescriptor_t, const void *,
-      const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t, void *,
-      const int, int *, cudnnConvolutionFwdAlgoPerf_t *, void *, size_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnFindConvolutionForwardAlgorithmEx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, xDesc, x, wDesc, w, convDesc, yDesc, y,
-                  requestedAlgoCount, returnedAlgoCount, perfResults, workSpace,
-                  workSpaceSizeInBytes);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionForwardAlgorithm(
-    cudnnHandle_t handle, const cudnnTensorDescriptor_t xDesc,
-    const cudnnFilterDescriptor_t wDesc,
-    const cudnnConvolutionDescriptor_t convDesc,
-    const cudnnTensorDescriptor_t yDesc,
-    cudnnConvolutionFwdPreference_t preference, size_t memoryLimitInBytes,
-    cudnnConvolutionFwdAlgo_t *algo) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnTensorDescriptor_t,
-      const cudnnFilterDescriptor_t, const cudnnConvolutionDescriptor_t,
-      const cudnnTensorDescriptor_t, cudnnConvolutionFwdPreference_t, size_t,
-      cudnnConvolutionFwdAlgo_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnGetConvolutionForwardAlgorithm");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, xDesc, wDesc, convDesc, yDesc, preference,
-                  memoryLimitInBytes, algo);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionForwardWorkspaceSize(
-    cudnnHandle_t handle, const cudnnTensorDescriptor_t xDesc,
-    const cudnnFilterDescriptor_t wDesc,
-    const cudnnConvolutionDescriptor_t convDesc,
-    const cudnnTensorDescriptor_t yDesc, cudnnConvolutionFwdAlgo_t algo,
-    size_t *sizeInBytes) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnTensorDescriptor_t,
-      const cudnnFilterDescriptor_t, const cudnnConvolutionDescriptor_t,
-      const cudnnTensorDescriptor_t, cudnnConvolutionFwdAlgo_t, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnGetConvolutionForwardWorkspaceSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, xDesc, wDesc, convDesc, yDesc, algo, sizeInBytes);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnConvolutionForward(
-    cudnnHandle_t handle, const void *alpha,
-    const cudnnTensorDescriptor_t xDesc, const void *x,
-    const cudnnFilterDescriptor_t wDesc, const void *w,
-    const cudnnConvolutionDescriptor_t convDesc, cudnnConvolutionFwdAlgo_t algo,
-    void *workSpace, size_t workSpaceSizeInBytes, const void *beta,
-    const cudnnTensorDescriptor_t yDesc, void *y) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const void *, const cudnnTensorDescriptor_t, const void *,
-      const cudnnFilterDescriptor_t, const void *,
-      const cudnnConvolutionDescriptor_t, cudnnConvolutionFwdAlgo_t, void *,
-      size_t, const void *, const cudnnTensorDescriptor_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnConvolutionForward");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, alpha, xDesc, x, wDesc, w, convDesc, algo, workSpace,
-                  workSpaceSizeInBytes, beta, yDesc, y);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnConvolutionBiasActivationForward(
-    cudnnHandle_t handle, const void *alpha1,
-    const cudnnTensorDescriptor_t xDesc, const void *x,
-    const cudnnFilterDescriptor_t wDesc, const void *w,
-    const cudnnConvolutionDescriptor_t convDesc, cudnnConvolutionFwdAlgo_t algo,
-    void *workSpace, size_t workSpaceSizeInBytes, const void *alpha2,
-    const cudnnTensorDescriptor_t zDesc, const void *z,
-    const cudnnTensorDescriptor_t biasDesc, const void *bias,
-    const cudnnActivationDescriptor_t activationDesc,
-    const cudnnTensorDescriptor_t yDesc, void *y) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const void *, const cudnnTensorDescriptor_t, const void *,
-      const cudnnFilterDescriptor_t, const void *,
-      const cudnnConvolutionDescriptor_t, cudnnConvolutionFwdAlgo_t, void *,
-      size_t, const void *, const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnActivationDescriptor_t, const cudnnTensorDescriptor_t, void *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnConvolutionBiasActivationForward");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, alpha1, xDesc, x, wDesc, w, convDesc, algo, workSpace,
-                  workSpaceSizeInBytes, alpha2, zDesc, z, biasDesc, bias,
-                  activationDesc, yDesc, y);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnConvolutionBackwardBias(
-    cudnnHandle_t handle, const void *alpha,
-    const cudnnTensorDescriptor_t dyDesc, const void *dy, const void *beta,
-    const cudnnTensorDescriptor_t dbDesc, void *db) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const void *, const cudnnTensorDescriptor_t, const void *,
-      const void *, const cudnnTensorDescriptor_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnConvolutionBackwardBias");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, alpha, dyDesc, dy, beta, dbDesc, db);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnFindConvolutionBackwardFilterAlgorithm(
-    cudnnHandle_t handle, const cudnnTensorDescriptor_t xDesc,
-    const cudnnTensorDescriptor_t dyDesc,
-    const cudnnConvolutionDescriptor_t convDesc,
-    const cudnnFilterDescriptor_t dwDesc, const int requestedAlgoCount,
-    int *returnedAlgoCount, cudnnConvolutionBwdFilterAlgoPerf_t *perfResults) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnTensorDescriptor_t,
-      const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t,
-      const cudnnFilterDescriptor_t, const int, int *,
-      cudnnConvolutionBwdFilterAlgoPerf_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnFindConvolutionBackwardFilterAlgorithm");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, xDesc, dyDesc, convDesc, dwDesc, requestedAlgoCount,
-                  returnedAlgoCount, perfResults);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnFindConvolutionBackwardFilterAlgorithmEx(
-    cudnnHandle_t handle, const cudnnTensorDescriptor_t xDesc, const void *x,
-    const cudnnTensorDescriptor_t dyDesc, const void *y,
-    const cudnnConvolutionDescriptor_t convDesc,
-    const cudnnFilterDescriptor_t dwDesc, void *dw,
-    const int requestedAlgoCount, int *returnedAlgoCount,
-    cudnnConvolutionBwdFilterAlgoPerf_t *perfResults, void *workSpace,
-    size_t workSpaceSizeInBytes) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnConvolutionDescriptor_t, const cudnnFilterDescriptor_t, void *,
-      const int, int *, cudnnConvolutionBwdFilterAlgoPerf_t *, void *, size_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnFindConvolutionBackwardFilterAlgorithmEx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, xDesc, x, dyDesc, y, convDesc, dwDesc, dw,
-                  requestedAlgoCount, returnedAlgoCount, perfResults, workSpace,
-                  workSpaceSizeInBytes);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionBackwardFilterAlgorithm(
-    cudnnHandle_t handle, const cudnnTensorDescriptor_t xDesc,
-    const cudnnTensorDescriptor_t dyDesc,
-    const cudnnConvolutionDescriptor_t convDesc,
-    const cudnnFilterDescriptor_t dwDesc,
-    cudnnConvolutionBwdFilterPreference_t preference, size_t memoryLimitInBytes,
-    cudnnConvolutionBwdFilterAlgo_t *algo) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnTensorDescriptor_t,
-      const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t,
-      const cudnnFilterDescriptor_t, cudnnConvolutionBwdFilterPreference_t,
-      size_t, cudnnConvolutionBwdFilterAlgo_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnGetConvolutionBackwardFilterAlgorithm");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, xDesc, dyDesc, convDesc, dwDesc, preference,
-                  memoryLimitInBytes, algo);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionBackwardFilterWorkspaceSize(
-    cudnnHandle_t handle, const cudnnTensorDescriptor_t xDesc,
-    const cudnnTensorDescriptor_t dyDesc,
-    const cudnnConvolutionDescriptor_t convDesc,
-    const cudnnFilterDescriptor_t gradDesc,
-    cudnnConvolutionBwdFilterAlgo_t algo, size_t *sizeInBytes) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnTensorDescriptor_t,
-      const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t,
-      const cudnnFilterDescriptor_t, cudnnConvolutionBwdFilterAlgo_t, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnGetConvolutionBackwardFilterWorkspaceSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, xDesc, dyDesc, convDesc, gradDesc, algo, sizeInBytes);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnConvolutionBackwardFilter(
-    cudnnHandle_t handle, const void *alpha,
-    const cudnnTensorDescriptor_t xDesc, const void *x,
-    const cudnnTensorDescriptor_t dyDesc, const void *dy,
-    const cudnnConvolutionDescriptor_t convDesc,
-    cudnnConvolutionBwdFilterAlgo_t algo, void *workSpace,
-    size_t workSpaceSizeInBytes, const void *beta,
-    const cudnnFilterDescriptor_t dwDesc, void *dw) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const void *, const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnConvolutionDescriptor_t, cudnnConvolutionBwdFilterAlgo_t,
-      void *, size_t, const void *, const cudnnFilterDescriptor_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnConvolutionBackwardFilter");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, alpha, xDesc, x, dyDesc, dy, convDesc, algo,
-                  workSpace, workSpaceSizeInBytes, beta, dwDesc, dw);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnFindConvolutionBackwardDataAlgorithm(
-    cudnnHandle_t handle, const cudnnFilterDescriptor_t wDesc,
-    const cudnnTensorDescriptor_t dyDesc,
-    const cudnnConvolutionDescriptor_t convDesc,
-    const cudnnTensorDescriptor_t dxDesc, const int requestedAlgoCount,
-    int *returnedAlgoCount, cudnnConvolutionBwdDataAlgoPerf_t *perfResults) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnFilterDescriptor_t,
-      const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t,
-      const cudnnTensorDescriptor_t, const int, int *,
-      cudnnConvolutionBwdDataAlgoPerf_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnFindConvolutionBackwardDataAlgorithm");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, wDesc, dyDesc, convDesc, dxDesc, requestedAlgoCount,
-                  returnedAlgoCount, perfResults);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnFindConvolutionBackwardDataAlgorithmEx(
-    cudnnHandle_t handle, const cudnnFilterDescriptor_t wDesc, const void *w,
-    const cudnnTensorDescriptor_t dyDesc, const void *dy,
-    const cudnnConvolutionDescriptor_t convDesc,
-    const cudnnTensorDescriptor_t dxDesc, void *dx,
-    const int requestedAlgoCount, int *returnedAlgoCount,
-    cudnnConvolutionBwdDataAlgoPerf_t *perfResults, void *workSpace,
-    size_t workSpaceSizeInBytes) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnFilterDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t, void *,
-      const int, int *, cudnnConvolutionBwdDataAlgoPerf_t *, void *, size_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnFindConvolutionBackwardDataAlgorithmEx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, wDesc, w, dyDesc, dy, convDesc, dxDesc, dx,
-                  requestedAlgoCount, returnedAlgoCount, perfResults, workSpace,
-                  workSpaceSizeInBytes);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionBackwardDataAlgorithm(
-    cudnnHandle_t handle, const cudnnFilterDescriptor_t wDesc,
-    const cudnnTensorDescriptor_t dyDesc,
-    const cudnnConvolutionDescriptor_t convDesc,
-    const cudnnTensorDescriptor_t dxDesc,
-    cudnnConvolutionBwdDataPreference_t preference, size_t memoryLimitInBytes,
-    cudnnConvolutionBwdDataAlgo_t *algo) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnFilterDescriptor_t,
-      const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t,
-      const cudnnTensorDescriptor_t, cudnnConvolutionBwdDataPreference_t,
-      size_t, cudnnConvolutionBwdDataAlgo_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnGetConvolutionBackwardDataAlgorithm");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, wDesc, dyDesc, convDesc, dxDesc, preference,
-                  memoryLimitInBytes, algo);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionBackwardDataWorkspaceSize(
-    cudnnHandle_t handle, const cudnnFilterDescriptor_t wDesc,
-    const cudnnTensorDescriptor_t dyDesc,
-    const cudnnConvolutionDescriptor_t convDesc,
-    const cudnnTensorDescriptor_t dxDesc, cudnnConvolutionBwdDataAlgo_t algo,
-    size_t *sizeInBytes) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnFilterDescriptor_t,
-      const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t,
-      const cudnnTensorDescriptor_t, cudnnConvolutionBwdDataAlgo_t, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnGetConvolutionBackwardDataWorkspaceSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, wDesc, dyDesc, convDesc, dxDesc, algo, sizeInBytes);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnConvolutionBackwardData(
-    cudnnHandle_t handle, const void *alpha,
-    const cudnnFilterDescriptor_t wDesc, const void *w,
-    const cudnnTensorDescriptor_t dyDesc, const void *dy,
-    const cudnnConvolutionDescriptor_t convDesc,
-    cudnnConvolutionBwdDataAlgo_t algo, void *workSpace,
-    size_t workSpaceSizeInBytes, const void *beta,
-    const cudnnTensorDescriptor_t dxDesc, void *dx) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const void *, const cudnnFilterDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnConvolutionDescriptor_t, cudnnConvolutionBwdDataAlgo_t, void *,
-      size_t, const void *, const cudnnTensorDescriptor_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnConvolutionBackwardData");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, alpha, wDesc, w, dyDesc, dy, convDesc, algo,
-                  workSpace, workSpaceSizeInBytes, beta, dxDesc, dx);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnIm2Col(cudnnHandle_t handle, const cudnnTensorDescriptor_t xDesc,
-            const void *x, const cudnnFilterDescriptor_t wDesc,
-            const cudnnConvolutionDescriptor_t convDesc, void *colBuffer) {
-  using FuncPtr =
-      cudnnStatus_t(CUDNNWINAPI *)(cudnnHandle_t, const cudnnTensorDescriptor_t,
-                                   const void *, const cudnnFilterDescriptor_t,
-                                   const cudnnConvolutionDescriptor_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnIm2Col");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, xDesc, x, wDesc, convDesc, colBuffer);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSoftmaxForward(
-    cudnnHandle_t handle, cudnnSoftmaxAlgorithm_t algo, cudnnSoftmaxMode_t mode,
-    const void *alpha, const cudnnTensorDescriptor_t xDesc, const void *x,
-    const void *beta, const cudnnTensorDescriptor_t yDesc, void *y) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, cudnnSoftmaxAlgorithm_t, cudnnSoftmaxMode_t, const void *,
-      const cudnnTensorDescriptor_t, const void *, const void *,
-      const cudnnTensorDescriptor_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSoftmaxForward");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, algo, mode, alpha, xDesc, x, beta, yDesc, y);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSoftmaxBackward(
-    cudnnHandle_t handle, cudnnSoftmaxAlgorithm_t algo, cudnnSoftmaxMode_t mode,
-    const void *alpha, const cudnnTensorDescriptor_t yDesc, const void *y,
-    const cudnnTensorDescriptor_t dyDesc, const void *dy, const void *beta,
-    const cudnnTensorDescriptor_t dxDesc, void *dx) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, cudnnSoftmaxAlgorithm_t, cudnnSoftmaxMode_t, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *, const void *,
-      const cudnnTensorDescriptor_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSoftmaxBackward");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, algo, mode, alpha, yDesc, y, dyDesc, dy, beta, dxDesc,
-                  dx);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnCreatePoolingDescriptor(cudnnPoolingDescriptor_t *poolingDesc) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnPoolingDescriptor_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnCreatePoolingDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(poolingDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetPooling2dDescriptor(
-    cudnnPoolingDescriptor_t poolingDesc, cudnnPoolingMode_t mode,
-    cudnnNanPropagation_t maxpoolingNanOpt, int windowHeight, int windowWidth,
-    int verticalPadding, int horizontalPadding, int verticalStride,
-    int horizontalStride) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnPoolingDescriptor_t, cudnnPoolingMode_t, cudnnNanPropagation_t, int,
-      int, int, int, int, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetPooling2dDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(poolingDesc, mode, maxpoolingNanOpt, windowHeight,
-                  windowWidth, verticalPadding, horizontalPadding,
-                  verticalStride, horizontalStride);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetPooling2dDescriptor(
-    const cudnnPoolingDescriptor_t poolingDesc, cudnnPoolingMode_t *mode,
-    cudnnNanPropagation_t *maxpoolingNanOpt, int *windowHeight,
-    int *windowWidth, int *verticalPadding, int *horizontalPadding,
-    int *verticalStride, int *horizontalStride) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      const cudnnPoolingDescriptor_t, cudnnPoolingMode_t *,
-      cudnnNanPropagation_t *, int *, int *, int *, int *, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetPooling2dDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(poolingDesc, mode, maxpoolingNanOpt, windowHeight,
-                  windowWidth, verticalPadding, horizontalPadding,
-                  verticalStride, horizontalStride);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetPoolingNdDescriptor(
-    cudnnPoolingDescriptor_t poolingDesc, const cudnnPoolingMode_t mode,
-    const cudnnNanPropagation_t maxpoolingNanOpt, int nbDims,
-    const int windowDimA[], const int paddingA[], const int strideA[]) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnPoolingDescriptor_t, const cudnnPoolingMode_t,
-      const cudnnNanPropagation_t, int, const int[], const int[], const int[]);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetPoolingNdDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(poolingDesc, mode, maxpoolingNanOpt, nbDims, windowDimA,
-                  paddingA, strideA);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetPoolingNdDescriptor(
-    const cudnnPoolingDescriptor_t poolingDesc, int nbDimsRequested,
-    cudnnPoolingMode_t *mode, cudnnNanPropagation_t *maxpoolingNanOpt,
-    int *nbDims, int windowDimA[], int paddingA[], int strideA[]) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      const cudnnPoolingDescriptor_t, int, cudnnPoolingMode_t *,
-      cudnnNanPropagation_t *, int *, int[], int[], int[]);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetPoolingNdDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(poolingDesc, nbDimsRequested, mode, maxpoolingNanOpt, nbDims,
-                  windowDimA, paddingA, strideA);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnGetPoolingNdForwardOutputDim(const cudnnPoolingDescriptor_t poolingDesc,
-                                  const cudnnTensorDescriptor_t inputTensorDesc,
-                                  int nbDims, int outputTensorDimA[]) {
-  using FuncPtr =
-      cudnnStatus_t(CUDNNWINAPI *)(const cudnnPoolingDescriptor_t,
-                                   const cudnnTensorDescriptor_t, int, int[]);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnGetPoolingNdForwardOutputDim");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(poolingDesc, inputTensorDesc, nbDims, outputTensorDimA);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnGetPooling2dForwardOutputDim(const cudnnPoolingDescriptor_t poolingDesc,
-                                  const cudnnTensorDescriptor_t inputTensorDesc,
-                                  int *n, int *c, int *h, int *w) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(const cudnnPoolingDescriptor_t,
-                                               const cudnnTensorDescriptor_t,
-                                               int *, int *, int *, int *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnGetPooling2dForwardOutputDim");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(poolingDesc, inputTensorDesc, n, c, h, w);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnDestroyPoolingDescriptor(cudnnPoolingDescriptor_t poolingDesc) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnPoolingDescriptor_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDestroyPoolingDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(poolingDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnPoolingForward(
-    cudnnHandle_t handle, const cudnnPoolingDescriptor_t poolingDesc,
-    const void *alpha, const cudnnTensorDescriptor_t xDesc, const void *x,
-    const void *beta, const cudnnTensorDescriptor_t yDesc, void *y) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnPoolingDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *, const void *,
-      const cudnnTensorDescriptor_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnPoolingForward");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, poolingDesc, alpha, xDesc, x, beta, yDesc, y);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnPoolingBackward(
-    cudnnHandle_t handle, const cudnnPoolingDescriptor_t poolingDesc,
-    const void *alpha, const cudnnTensorDescriptor_t yDesc, const void *y,
-    const cudnnTensorDescriptor_t dyDesc, const void *dy,
-    const cudnnTensorDescriptor_t xDesc, const void *x, const void *beta,
-    const cudnnTensorDescriptor_t dxDesc, void *dx) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnPoolingDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *, const void *,
-      const cudnnTensorDescriptor_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnPoolingBackward");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, poolingDesc, alpha, yDesc, y, dyDesc, dy, xDesc, x,
-                  beta, dxDesc, dx);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnCreateActivationDescriptor(cudnnActivationDescriptor_t *activationDesc) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnActivationDescriptor_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnCreateActivationDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(activationDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetActivationDescriptor(
-    cudnnActivationDescriptor_t activationDesc, cudnnActivationMode_t mode,
-    cudnnNanPropagation_t reluNanOpt, double coef) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnActivationDescriptor_t,
-                                               cudnnActivationMode_t,
-                                               cudnnNanPropagation_t, double);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetActivationDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(activationDesc, mode, reluNanOpt, coef);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnGetActivationDescriptor(const cudnnActivationDescriptor_t activationDesc,
-                             cudnnActivationMode_t *mode,
-                             cudnnNanPropagation_t *reluNanOpt, double *coef) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      const cudnnActivationDescriptor_t, cudnnActivationMode_t *,
-      cudnnNanPropagation_t *, double *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetActivationDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(activationDesc, mode, reluNanOpt, coef);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnDestroyActivationDescriptor(cudnnActivationDescriptor_t activationDesc) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnActivationDescriptor_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnDestroyActivationDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(activationDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnActivationForward(
-    cudnnHandle_t handle, cudnnActivationDescriptor_t activationDesc,
-    const void *alpha, const cudnnTensorDescriptor_t xDesc, const void *x,
-    const void *beta, const cudnnTensorDescriptor_t yDesc, void *y) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, cudnnActivationDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *, const void *,
-      const cudnnTensorDescriptor_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnActivationForward");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, activationDesc, alpha, xDesc, x, beta, yDesc, y);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnActivationBackward(
-    cudnnHandle_t handle, cudnnActivationDescriptor_t activationDesc,
-    const void *alpha, const cudnnTensorDescriptor_t yDesc, const void *y,
-    const cudnnTensorDescriptor_t dyDesc, const void *dy,
-    const cudnnTensorDescriptor_t xDesc, const void *x, const void *beta,
-    const cudnnTensorDescriptor_t dxDesc, void *dx) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, cudnnActivationDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *, const void *,
-      const cudnnTensorDescriptor_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnActivationBackward");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, activationDesc, alpha, yDesc, y, dyDesc, dy, xDesc, x,
-                  beta, dxDesc, dx);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnCreateLRNDescriptor(cudnnLRNDescriptor_t *normDesc) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnLRNDescriptor_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnCreateLRNDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(normDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetLRNDescriptor(cudnnLRNDescriptor_t normDesc,
-                                                unsigned lrnN, double lrnAlpha,
-                                                double lrnBeta, double lrnK) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnLRNDescriptor_t, unsigned int, double, double, double);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetLRNDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(normDesc, lrnN, lrnAlpha, lrnBeta, lrnK);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetLRNDescriptor(cudnnLRNDescriptor_t normDesc,
-                                                unsigned *lrnN,
-                                                double *lrnAlpha,
-                                                double *lrnBeta, double *lrnK) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnLRNDescriptor_t, unsigned int *, double *, double *, double *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetLRNDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(normDesc, lrnN, lrnAlpha, lrnBeta, lrnK);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnDestroyLRNDescriptor(cudnnLRNDescriptor_t lrnDesc) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnLRNDescriptor_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDestroyLRNDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(lrnDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnLRNCrossChannelForward(
-    cudnnHandle_t handle, cudnnLRNDescriptor_t normDesc, cudnnLRNMode_t lrnMode,
-    const void *alpha, const cudnnTensorDescriptor_t xDesc, const void *x,
-    const void *beta, const cudnnTensorDescriptor_t yDesc, void *y) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, cudnnLRNDescriptor_t, cudnnLRNMode_t, const void *,
-      const cudnnTensorDescriptor_t, const void *, const void *,
-      const cudnnTensorDescriptor_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnLRNCrossChannelForward");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, normDesc, lrnMode, alpha, xDesc, x, beta, yDesc, y);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnLRNCrossChannelBackward(
-    cudnnHandle_t handle, cudnnLRNDescriptor_t normDesc, cudnnLRNMode_t lrnMode,
-    const void *alpha, const cudnnTensorDescriptor_t yDesc, const void *y,
-    const cudnnTensorDescriptor_t dyDesc, const void *dy,
-    const cudnnTensorDescriptor_t xDesc, const void *x, const void *beta,
-    const cudnnTensorDescriptor_t dxDesc, void *dx) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, cudnnLRNDescriptor_t, cudnnLRNMode_t, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *, const void *,
-      const cudnnTensorDescriptor_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnLRNCrossChannelBackward");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, normDesc, lrnMode, alpha, yDesc, y, dyDesc, dy, xDesc,
-                  x, beta, dxDesc, dx);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnDivisiveNormalizationForward(
-    cudnnHandle_t handle, cudnnLRNDescriptor_t normDesc,
-    cudnnDivNormMode_t mode, const void *alpha,
-    const cudnnTensorDescriptor_t xDesc,  // same desc for means, temp, temp2
-    const void *x,
-    const void *means,  // if NULL, means are assumed to be zero
-    void *temp, void *temp2, const void *beta,
-    const cudnnTensorDescriptor_t yDesc, void *y) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, cudnnLRNDescriptor_t, cudnnDivNormMode_t, const void *,
-      const cudnnTensorDescriptor_t, const void *, const void *, void *, void *,
-      const void *, const cudnnTensorDescriptor_t, void *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnDivisiveNormalizationForward");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, normDesc, mode, alpha, xDesc, x, means, temp, temp2,
-                  beta, yDesc, y);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnDivisiveNormalizationBackward(
-    cudnnHandle_t handle, cudnnLRNDescriptor_t normDesc,
-    cudnnDivNormMode_t mode, const void *alpha,
-    const cudnnTensorDescriptor_t
-        xDesc,  // same desc for x, means, dy, temp, temp2
-    const void *x,
-    const void *means,  // if NULL, means are assumed to be zero
-    const void *dy, void *temp, void *temp2, const void *beta,
-    const cudnnTensorDescriptor_t dXdMeansDesc,  // same desc for dx, dMeans
-    void *dx,                                    // output x differential
-    void *dMeans) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, cudnnLRNDescriptor_t, cudnnDivNormMode_t, const void *,
-      const cudnnTensorDescriptor_t, const void *, const void *, const void *,
-      void *, void *, const void *, const cudnnTensorDescriptor_t, void *,
-      void *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnDivisiveNormalizationBackward");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, normDesc, mode, alpha, xDesc, x, means, dy, temp,
-                  temp2, beta, dXdMeansDesc, dx, dMeans);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnDeriveBNTensorDescriptor(
-    cudnnTensorDescriptor_t derivedBnDesc, const cudnnTensorDescriptor_t xDesc,
-    cudnnBatchNormMode_t mode) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnTensorDescriptor_t,
-                                               const cudnnTensorDescriptor_t,
-                                               cudnnBatchNormMode_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDeriveBNTensorDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(derivedBnDesc, xDesc, mode);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnBatchNormalizationForwardTraining(
-    cudnnHandle_t handle, cudnnBatchNormMode_t mode,
-
-    const void *alpha,  // alpha[0] = result blend factor
-    const void *beta,   // beta[0] = dest layer blend factor
-
-    const cudnnTensorDescriptor_t xDesc,
-    const void *x,  // NxCxHxW
-    const cudnnTensorDescriptor_t yDesc,
-    void *y,  // NxCxHxW
-
-    /* Shared desc for the next 6 tensors in the argument list.
-       Data type to be set as follows:
-       type = (typeOf(x) == double) ? double : float
-       Dimensions for this descriptor depend on normalization mode
-       - Spatial Normalization : tensors are expected to have dims 1xCx1x1
-        (normalization is performed across NxHxW)
-       - Per-Activation Normalization : tensors are expected to have dims of
-       1xCxHxW (normalization is performed across N) */
-    const cudnnTensorDescriptor_t bnScaleBiasMeanVarDesc,
-
-    // 'Gamma' and 'Beta' respectively in Ioffe and Szegedy's paper's notation
-    const void *bnScale, const void *bnBias,
-
-    /* MUST use factor=1 in the very first call of a complete training cycle.
-       Use a factor=1/(1+n) at N-th call to the function to get
-       Cumulative Moving Average (CMA) behavior
-       CMA[n] = (x[1]+...+x[n])/n
-       Since CMA[n+1] = (n*CMA[n]+x[n+1])/(n+1) =
-       ((n+1)*CMA[n]-CMA[n])/(n+1) + x[n+1]/(n+1) =
-       CMA[n]*(1-1/(n+1)) + x[n+1]*1/(n+1) */
-    double exponentialAverageFactor,
-
-    /* Used in Training phase only.
-       runningMean = newMean*factor + runningMean*(1-factor) */
-    void *resultRunningMean,
-    /* Output in training mode, input in inference. Is the moving average
-       of  variance[x] (factor is applied in the same way as for runningMean) */
-    void *resultRunningVariance,
-
-    /* Has to be >= CUDNN_BN_MIN_EPSILON. Should be the same in forward and
-       backward functions. */
-    double epsilon,
-
-    /* Optionally save intermediate results from the forward pass here
-       - can be reused to speed up backward pass. NULL if unused */
-    void *resultSaveMean, void *resultSaveInvVariance) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, cudnnBatchNormMode_t, const void *, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, void *, const cudnnTensorDescriptor_t,
-      const void *, const void *, double, void *, void *, double, void *,
-      void *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnBatchNormalizationForwardTraining");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(
-      handle, mode, alpha, beta, xDesc, x, yDesc, y, bnScaleBiasMeanVarDesc,
-      bnScale, bnBias, exponentialAverageFactor, resultRunningMean,
-      resultRunningVariance, epsilon, resultSaveMean, resultSaveInvVariance);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnBatchNormalizationForwardInference(
-    cudnnHandle_t handle, cudnnBatchNormMode_t mode,
-    const void *alpha,  // alpha[0] = result blend factor
-    const void *beta,   // beta[0] = dest layer blend factor
-    const cudnnTensorDescriptor_t xDesc,
-    const void *x,  // NxCxHxW
-    const cudnnTensorDescriptor_t yDesc,
-    void *y,  // NxCxHxW
-    const cudnnTensorDescriptor_t bnScaleBiasMeanVarDesc, const void *bnScale,
-    const void *bnBias, const void *estimatedMean,
-    const void *estimatedVariance, double epsilon) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, cudnnBatchNormMode_t, const void *, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, void *, const cudnnTensorDescriptor_t,
-      const void *, const void *, const void *, const void *, double);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnBatchNormalizationForwardInference");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, mode, alpha, beta, xDesc, x, yDesc, y,
-                  bnScaleBiasMeanVarDesc, bnScale, bnBias, estimatedMean,
-                  estimatedVariance, epsilon);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnBatchNormalizationBackward(
-    cudnnHandle_t handle, cudnnBatchNormMode_t mode, const void *alphaDataDiff,
-    const void *betaDataDiff, const void *alphaParamDiff,
-    const void *betaParamDiff,
-    const cudnnTensorDescriptor_t xDesc,  // same desc for x, dx, dy
-    const void *x, const cudnnTensorDescriptor_t dyDesc, const void *dy,
-    const cudnnTensorDescriptor_t dxDesc, void *dx,
-    /* Shared tensor desc for the 4 tensors below */
-    const cudnnTensorDescriptor_t dBnScaleBiasDesc,
-    const void *bnScale,  // bnBias doesn't affect backpropagation
-    /* scale and bias diff are not backpropagated below this layer */
-    void *dBnScaleResult, void *dBnBiasResult,
-    /* Same epsilon as forward pass */
-    double epsilon,
-
-    /* Optionally cached intermediate results from
-       forward pass */
-    const void *savedMean, const void *savedInvVariance) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, cudnnBatchNormMode_t, const void *, const void *,
-      const void *, const void *, const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, void *, const cudnnTensorDescriptor_t,
-      const void *, void *, void *, double, const void *, const void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnBatchNormalizationBackward");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, mode, alphaDataDiff, betaDataDiff, alphaParamDiff,
-                  betaParamDiff, xDesc, x, dyDesc, dy, dxDesc, dx,
-                  dBnScaleBiasDesc, bnScale, dBnScaleResult, dBnBiasResult,
-                  epsilon, savedMean, savedInvVariance);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnCreateSpatialTransformerDescriptor(
-
-    cudnnSpatialTransformerDescriptor_t *stDesc) {
-  using FuncPtr =
-      cudnnStatus_t(CUDNNWINAPI *)(cudnnSpatialTransformerDescriptor_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnCreateSpatialTransformerDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(stDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetSpatialTransformerNdDescriptor(
-    cudnnSpatialTransformerDescriptor_t stDesc, cudnnSamplerType_t samplerType,
-    cudnnDataType_t dataType, const int nbDims, const int dimA[]) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnSpatialTransformerDescriptor_t, cudnnSamplerType_t, cudnnDataType_t,
-      const int, const int[]);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnSetSpatialTransformerNdDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(stDesc, samplerType, dataType, nbDims, dimA);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnDestroySpatialTransformerDescriptor(
-    cudnnSpatialTransformerDescriptor_t stDesc) {
-  using FuncPtr =
-      cudnnStatus_t(CUDNNWINAPI *)(cudnnSpatialTransformerDescriptor_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnDestroySpatialTransformerDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(stDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSpatialTfGridGeneratorForward(
-    cudnnHandle_t handle, const cudnnSpatialTransformerDescriptor_t stDesc,
-    const void *theta, void *grid) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnSpatialTransformerDescriptor_t, const void *,
-      void *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnSpatialTfGridGeneratorForward");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, stDesc, theta, grid);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSpatialTfGridGeneratorBackward(
-    cudnnHandle_t handle, const cudnnSpatialTransformerDescriptor_t stDesc,
-    const void *dgrid, void *dtheta) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnSpatialTransformerDescriptor_t, const void *,
-      void *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnSpatialTfGridGeneratorBackward");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, stDesc, dgrid, dtheta);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSpatialTfSamplerForward(
-    cudnnHandle_t handle, cudnnSpatialTransformerDescriptor_t stDesc,
-    const void *alpha, const cudnnTensorDescriptor_t xDesc, const void *x,
-    const void *grid, const void *beta, cudnnTensorDescriptor_t yDesc,
-    void *y) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, cudnnSpatialTransformerDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *, const void *, const void *,
-      cudnnTensorDescriptor_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSpatialTfSamplerForward");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, stDesc, alpha, xDesc, x, grid, beta, yDesc, y);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSpatialTfSamplerBackward(
-    cudnnHandle_t handle, cudnnSpatialTransformerDescriptor_t stDesc,
-    const void *alpha, const cudnnTensorDescriptor_t xDesc, const void *x,
-    const void *beta, const cudnnTensorDescriptor_t dxDesc, void *dx,
-    const void *alphaDgrid, const cudnnTensorDescriptor_t dyDesc,
-    const void *dy, const void *grid, const void *betaDgrid, void *dgrid) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, cudnnSpatialTransformerDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *, const void *,
-      const cudnnTensorDescriptor_t, void *, const void *,
-      const cudnnTensorDescriptor_t, const void *, const void *, const void *,
-      void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSpatialTfSamplerBackward");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, stDesc, alpha, xDesc, x, beta, dxDesc, dx, alphaDgrid,
-                  dyDesc, dy, grid, betaDgrid, dgrid);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnCreateDropoutDescriptor(cudnnDropoutDescriptor_t *dropoutDesc) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnDropoutDescriptor_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnCreateDropoutDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dropoutDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnDestroyDropoutDescriptor(cudnnDropoutDescriptor_t dropoutDesc) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnDropoutDescriptor_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDestroyDropoutDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dropoutDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnDropoutGetStatesSize(cudnnHandle_t handle,
-                                                    size_t *sizeInBytes) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnHandle_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDropoutGetStatesSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, sizeInBytes);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnDropoutGetReserveSpaceSize(
-    cudnnTensorDescriptor_t xdesc, size_t *sizeInBytes) {
-  using FuncPtr =
-      cudnnStatus_t(CUDNNWINAPI *)(cudnnTensorDescriptor_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDropoutGetReserveSpaceSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(xdesc, sizeInBytes);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetDropoutDescriptor(
-    cudnnDropoutDescriptor_t dropoutDesc, cudnnHandle_t handle, float dropout,
-    void *states, size_t stateSizeInBytes, unsigned long long seed) {
-  using FuncPtr =
-      cudnnStatus_t(CUDNNWINAPI *)(cudnnDropoutDescriptor_t, cudnnHandle_t,
-                                   float, void *, size_t, unsigned long long);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetDropoutDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dropoutDesc, handle, dropout, states, stateSizeInBytes, seed);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnDropoutForward(
-    cudnnHandle_t handle, const cudnnDropoutDescriptor_t dropoutDesc,
-    const cudnnTensorDescriptor_t xdesc, const void *x,
-    const cudnnTensorDescriptor_t ydesc, void *y, void *reserveSpace,
-    size_t reserveSpaceSizeInBytes) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnDropoutDescriptor_t,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, void *, void *, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDropoutForward");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dropoutDesc, xdesc, x, ydesc, y, reserveSpace,
-                  reserveSpaceSizeInBytes);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnDropoutBackward(
-    cudnnHandle_t handle, const cudnnDropoutDescriptor_t dropoutDesc,
-    const cudnnTensorDescriptor_t dydesc, const void *dy,
-    const cudnnTensorDescriptor_t dxdesc, void *dx, void *reserveSpace,
-    size_t reserveSpaceSizeInBytes) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnDropoutDescriptor_t,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, void *, void *, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDropoutBackward");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dropoutDesc, dydesc, dy, dxdesc, dx, reserveSpace,
-                  reserveSpaceSizeInBytes);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnCreateRNNDescriptor(cudnnRNNDescriptor_t *rnnDesc) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnRNNDescriptor_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnCreateRNNDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(rnnDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnDestroyRNNDescriptor(cudnnRNNDescriptor_t rnnDesc) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnRNNDescriptor_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDestroyRNNDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(rnnDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnCreatePersistentRNNPlan(
-    cudnnRNNDescriptor_t rnnDesc, const int minibatch,
-    const cudnnDataType_t dataType, cudnnPersistentRNNPlan_t *plan) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnRNNDescriptor_t, const int,
-                                               const cudnnDataType_t,
-                                               cudnnPersistentRNNPlan_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnCreatePersistentRNNPlan");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(rnnDesc, minibatch, dataType, plan);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetPersistentRNNPlan(
-    cudnnRNNDescriptor_t rnnDesc, cudnnPersistentRNNPlan_t plan) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnRNNDescriptor_t,
-                                               cudnnPersistentRNNPlan_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetPersistentRNNPlan");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(rnnDesc, plan);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnDestroyPersistentRNNPlan(cudnnPersistentRNNPlan_t plan) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnPersistentRNNPlan_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDestroyPersistentRNNPlan");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(plan);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetRNNDescriptor_v6(
-    cudnnHandle_t handle, cudnnRNNDescriptor_t rnnDesc, const int hiddenSize,
-    const int numLayers,
-    cudnnDropoutDescriptor_t
-        dropoutDesc,  // Between layers, not between recurrent steps.
-    cudnnRNNInputMode_t inputMode, cudnnDirectionMode_t direction,
-    cudnnRNNMode_t mode, cudnnRNNAlgo_t algo, cudnnDataType_t dataType) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, cudnnRNNDescriptor_t, const int, const int,
-      cudnnDropoutDescriptor_t, cudnnRNNInputMode_t, cudnnDirectionMode_t,
-      cudnnRNNMode_t, cudnnRNNAlgo_t, cudnnDataType_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetRNNDescriptor_v6");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, rnnDesc, hiddenSize, numLayers, dropoutDesc,
-                  inputMode, direction, mode, algo, dataType);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetRNNDescriptor(
-    cudnnRNNDescriptor_t rnnDesc, int hiddenSize, int numLayers,
-    cudnnDropoutDescriptor_t
-        dropoutDesc,  // Between layers, not between recurrent steps.
-    cudnnRNNInputMode_t inputMode, cudnnDirectionMode_t direction,
-    cudnnRNNMode_t mode, cudnnDataType_t dataType) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnRNNDescriptor_t, int, int, cudnnDropoutDescriptor_t,
-      cudnnRNNInputMode_t, cudnnDirectionMode_t, cudnnRNNMode_t,
-      cudnnDataType_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetRNNDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(rnnDesc, hiddenSize, numLayers, dropoutDesc, inputMode,
-                  direction, mode, dataType);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetRNNWorkspaceSize(
-    cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc,
-    const int seqLength, const cudnnTensorDescriptor_t *xDesc,
-    size_t *sizeInBytes) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnRNNDescriptor_t, const int,
-      const cudnnTensorDescriptor_t *, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetRNNWorkspaceSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, rnnDesc, seqLength, xDesc, sizeInBytes);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetRNNTrainingReserveSize(
-    cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc,
-    const int seqLength, const cudnnTensorDescriptor_t *xDesc,
-    size_t *sizeInBytes) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnRNNDescriptor_t, const int,
-      const cudnnTensorDescriptor_t *, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetRNNTrainingReserveSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, rnnDesc, seqLength, xDesc, sizeInBytes);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnGetRNNParamsSize(cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc,
-                      const cudnnTensorDescriptor_t xDesc, size_t *sizeInBytes,
-                      cudnnDataType_t dataType) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnRNNDescriptor_t, const cudnnTensorDescriptor_t,
-      size_t *, cudnnDataType_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetRNNParamsSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, rnnDesc, xDesc, sizeInBytes, dataType);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetRNNLinLayerMatrixParams(
-    cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc, const int layer,
-    const cudnnTensorDescriptor_t xDesc, const cudnnFilterDescriptor_t wDesc,
-    const void *w, const int linLayerID,
-    cudnnFilterDescriptor_t linLayerMatDesc, void **linLayerMat) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnRNNDescriptor_t, const int,
-      const cudnnTensorDescriptor_t, const cudnnFilterDescriptor_t,
-      const void *, const int, cudnnFilterDescriptor_t, void **);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetRNNLinLayerMatrixParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, rnnDesc, layer, xDesc, wDesc, w, linLayerID,
-                  linLayerMatDesc, linLayerMat);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetRNNLinLayerBiasParams(
-    cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc, const int layer,
-    const cudnnTensorDescriptor_t xDesc, const cudnnFilterDescriptor_t wDesc,
-    const void *w, const int linLayerID,
-    cudnnFilterDescriptor_t linLayerBiasDesc, void **linLayerBias) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnRNNDescriptor_t, const int,
-      const cudnnTensorDescriptor_t, const cudnnFilterDescriptor_t,
-      const void *, const int, cudnnFilterDescriptor_t, void **);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetRNNLinLayerBiasParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, rnnDesc, layer, xDesc, wDesc, w, linLayerID,
-                  linLayerBiasDesc, linLayerBias);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnRNNForwardInference(
-    cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc,
-    const int seqLength, const cudnnTensorDescriptor_t *xDesc, const void *x,
-    const cudnnTensorDescriptor_t hxDesc, const void *hx,
-    const cudnnTensorDescriptor_t cxDesc, const void *cx,
-    const cudnnFilterDescriptor_t wDesc, const void *w,
-    const cudnnTensorDescriptor_t *yDesc, void *y,
-    const cudnnTensorDescriptor_t hyDesc, void *hy,
-    const cudnnTensorDescriptor_t cyDesc, void *cy, void *workspace,
-    size_t workSpaceSizeInBytes) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnRNNDescriptor_t, const int,
-      const cudnnTensorDescriptor_t *, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnFilterDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t *, void *, const cudnnTensorDescriptor_t,
-      void *, const cudnnTensorDescriptor_t, void *, void *, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnRNNForwardInference");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, rnnDesc, seqLength, xDesc, x, hxDesc, hx, cxDesc, cx,
-                  wDesc, w, yDesc, y, hyDesc, hy, cyDesc, cy, workspace,
-                  workSpaceSizeInBytes);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnRNNForwardTraining(
-    cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc,
-    const int seqLength, const cudnnTensorDescriptor_t *xDesc, const void *x,
-    const cudnnTensorDescriptor_t hxDesc, const void *hx,
-    const cudnnTensorDescriptor_t cxDesc, const void *cx,
-    const cudnnFilterDescriptor_t wDesc, const void *w,
-    const cudnnTensorDescriptor_t *yDesc, void *y,
-    const cudnnTensorDescriptor_t hyDesc, void *hy,
-    const cudnnTensorDescriptor_t cyDesc, void *cy, void *workspace,
-    size_t workSpaceSizeInBytes, void *reserveSpace,
-    size_t reserveSpaceSizeInBytes) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnRNNDescriptor_t, const int,
-      const cudnnTensorDescriptor_t *, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnFilterDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t *, void *, const cudnnTensorDescriptor_t,
-      void *, const cudnnTensorDescriptor_t, void *, void *, size_t, void *,
-      size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnRNNForwardTraining");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, rnnDesc, seqLength, xDesc, x, hxDesc, hx, cxDesc, cx,
-                  wDesc, w, yDesc, y, hyDesc, hy, cyDesc, cy, workspace,
-                  workSpaceSizeInBytes, reserveSpace, reserveSpaceSizeInBytes);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnRNNBackwardData(cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc,
-                     const int seqLength, const cudnnTensorDescriptor_t *yDesc,
-                     const void *y, const cudnnTensorDescriptor_t *dyDesc,
-                     const void *dy, const cudnnTensorDescriptor_t dhyDesc,
-                     const void *dhy, const cudnnTensorDescriptor_t dcyDesc,
-                     const void *dcy, const cudnnFilterDescriptor_t wDesc,
-                     const void *w, const cudnnTensorDescriptor_t hxDesc,
-                     const void *hx, const cudnnTensorDescriptor_t cxDesc,
-                     const void *cx, const cudnnTensorDescriptor_t *dxDesc,
-                     void *dx, const cudnnTensorDescriptor_t dhxDesc, void *dhx,
-                     const cudnnTensorDescriptor_t dcxDesc, void *dcx,
-                     void *workspace, size_t workSpaceSizeInBytes,
-                     void *reserveSpace, size_t reserveSpaceSizeInBytes) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnRNNDescriptor_t, const int,
-      const cudnnTensorDescriptor_t *, const void *,
-      const cudnnTensorDescriptor_t *, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnFilterDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t *, void *, const cudnnTensorDescriptor_t,
-      void *, const cudnnTensorDescriptor_t, void *, void *, size_t, void *,
-      size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnRNNBackwardData");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, rnnDesc, seqLength, yDesc, y, dyDesc, dy, dhyDesc,
-                  dhy, dcyDesc, dcy, wDesc, w, hxDesc, hx, cxDesc, cx, dxDesc,
-                  dx, dhxDesc, dhx, dcxDesc, dcx, workspace,
-                  workSpaceSizeInBytes, reserveSpace, reserveSpaceSizeInBytes);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnRNNBackwardWeights(
-    cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc,
-    const int seqLength, const cudnnTensorDescriptor_t *xDesc, const void *x,
-    const cudnnTensorDescriptor_t hxDesc, const void *hx,
-    const cudnnTensorDescriptor_t *yDesc, const void *y, const void *workspace,
-    size_t workSpaceSizeInBytes, const cudnnFilterDescriptor_t dwDesc, void *dw,
-    const void *reserveSpace, size_t reserveSpaceSizeInBytes) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnRNNDescriptor_t, const int,
-      const cudnnTensorDescriptor_t *, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t *, const void *, const void *, size_t,
-      const cudnnFilterDescriptor_t, void *, const void *, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnRNNBackwardWeights");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, rnnDesc, seqLength, xDesc, x, hxDesc, hx, yDesc, y,
-                  workspace, workSpaceSizeInBytes, dwDesc, dw, reserveSpace,
-                  reserveSpaceSizeInBytes);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetConvolution2dDescriptor_v4(
-    cudnnConvolutionDescriptor_t convDesc,
-    int pad_h,       // zero-padding height
-    int pad_w,       // zero-padding width
-    int u,           // vertical filter stride
-    int v,           // horizontal filter stride
-    int dilation_h,  // filter dilation in the vertical dimension
-    int dilation_w,  // filter dilation in the horizontal dimension
-    cudnnConvolutionMode_t mode) {
-  using FuncPtr =
-      cudnnStatus_t(CUDNNWINAPI *)(cudnnConvolutionDescriptor_t, int, int, int,
-                                   int, int, int, cudnnConvolutionMode_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnSetConvolution2dDescriptor_v4");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(convDesc, pad_h, pad_w, u, v, dilation_h, dilation_w, mode);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetConvolution2dDescriptor_v5(
-    cudnnConvolutionDescriptor_t convDesc,
-    int pad_h,       // zero-padding height
-    int pad_w,       // zero-padding width
-    int u,           // vertical filter stride
-    int v,           // horizontal filter stride
-    int dilation_h,  // filter dilation in the vertical dimension
-    int dilation_w,  // filter dilation in the horizontal dimension
-    cudnnConvolutionMode_t mode, cudnnDataType_t computeType) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnConvolutionDescriptor_t, int, int, int, int, int, int,
-      cudnnConvolutionMode_t, cudnnDataType_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnSetConvolution2dDescriptor_v5");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(convDesc, pad_h, pad_w, u, v, dilation_h, dilation_w, mode,
-                  computeType);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetConvolution2dDescriptor_v4(
-    const cudnnConvolutionDescriptor_t convDesc,
-    int *pad_h,       // zero-padding height
-    int *pad_w,       // zero-padding width
-    int *u,           // vertical filter stride
-    int *v,           // horizontal filter stride
-    int *dilation_h,  // filter dilation in the vertical dimension
-    int *dilation_w,  // filter dilation in the horizontal dimension
-    cudnnConvolutionMode_t *mode) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      const cudnnConvolutionDescriptor_t, int *, int *, int *, int *, int *,
-      int *, cudnnConvolutionMode_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnGetConvolution2dDescriptor_v4");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(convDesc, pad_h, pad_w, u, v, dilation_h, dilation_w, mode);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetConvolution2dDescriptor_v5(
-    const cudnnConvolutionDescriptor_t convDesc,
-    int *pad_h,       // zero-padding height
-    int *pad_w,       // zero-padding width
-    int *u,           // vertical filter stride
-    int *v,           // horizontal filter stride
-    int *dilation_h,  // filter dilation in the vertical dimension
-    int *dilation_w,  // filter dilation in the horizontal dimension
-    cudnnConvolutionMode_t *mode, cudnnDataType_t *computeType) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      const cudnnConvolutionDescriptor_t, int *, int *, int *, int *, int *,
-      int *, cudnnConvolutionMode_t *, cudnnDataType_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnGetConvolution2dDescriptor_v5");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(convDesc, pad_h, pad_w, u, v, dilation_h, dilation_w, mode,
-                  computeType);
-}
-
-}  // extern "C"
diff --git a/third_party/xla/third_party/tsl/tsl/cuda/cudnn_7_0.inc b/third_party/xla/third_party/tsl/tsl/cuda/cudnn_7_0.inc
deleted file mode 100644
index 008ae9099c0b09..00000000000000
--- a/third_party/xla/third_party/tsl/tsl/cuda/cudnn_7_0.inc
+++ /dev/null
@@ -1,2027 +0,0 @@
-// Auto-generated, do not edit.
-
-extern "C" {
-
-size_t CUDNNWINAPI cudnnGetVersion(void) {
-  using FuncPtr = size_t(CUDNNWINAPI *)();
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetVersion");
-  if (!func_ptr) return 0;
-  return func_ptr();
-}
-
-size_t CUDNNWINAPI cudnnGetCudartVersion(void) {
-  using FuncPtr = size_t(CUDNNWINAPI *)();
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetCudartVersion");
-  if (!func_ptr) return 0;
-  return func_ptr();
-}
-
-const char *CUDNNWINAPI cudnnGetErrorString(cudnnStatus_t status) {
-  using FuncPtr = const char *(CUDNNWINAPI *)(cudnnStatus_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetErrorString");
-  if (!func_ptr) return "cudnnGetErrorString symbol not found.";
-  return func_ptr(status);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnQueryRuntimeError(cudnnHandle_t handle,
-                                                 cudnnStatus_t *rstatus,
-                                                 cudnnErrQueryMode_t mode,
-                                                 cudnnRuntimeTag_t *tag) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, cudnnStatus_t *, cudnnErrQueryMode_t, cudnnRuntimeTag_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnQueryRuntimeError");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, rstatus, mode, tag);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetProperty(libraryPropertyType type,
-                                           int *value) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(libraryPropertyType, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetProperty");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(type, value);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnCreate(cudnnHandle_t *handle) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnHandle_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnCreate");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnDestroy(cudnnHandle_t handle) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnHandle_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDestroy");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetStream(cudnnHandle_t handle,
-                                         cudaStream_t streamId) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnHandle_t, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetStream");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, streamId);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetStream(cudnnHandle_t handle,
-                                         cudaStream_t *streamId) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnHandle_t, cudaStream_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetStream");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, streamId);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnCreateTensorDescriptor(cudnnTensorDescriptor_t *tensorDesc) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnTensorDescriptor_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnCreateTensorDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(tensorDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetTensor4dDescriptor(
-    cudnnTensorDescriptor_t tensorDesc, cudnnTensorFormat_t format,
-    cudnnDataType_t dataType, /* image data type */
-    int n,                    /* number of inputs (batch size) */
-    int c,                    /* number of input feature maps */
-    int h,                    /* height of input section */
-    int w) {
-  using FuncPtr =
-      cudnnStatus_t(CUDNNWINAPI *)(cudnnTensorDescriptor_t, cudnnTensorFormat_t,
-                                   cudnnDataType_t, int, int, int, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetTensor4dDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(tensorDesc, format, dataType, n, c, h, w);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetTensor4dDescriptorEx(
-    cudnnTensorDescriptor_t tensorDesc,
-    cudnnDataType_t dataType, /* image data type */
-    int n,                    /* number of inputs (batch size) */
-    int c,                    /* number of input feature maps */
-    int h,                    /* height of input section */
-    int w,                    /* width of input section */
-    int nStride, int cStride, int hStride, int wStride) {
-  using FuncPtr =
-      cudnnStatus_t(CUDNNWINAPI *)(cudnnTensorDescriptor_t, cudnnDataType_t,
-                                   int, int, int, int, int, int, int, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetTensor4dDescriptorEx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(tensorDesc, dataType, n, c, h, w, nStride, cStride, hStride,
-                  wStride);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetTensor4dDescriptor(
-    const cudnnTensorDescriptor_t tensorDesc,
-    cudnnDataType_t *dataType, /* image data type */
-    int *n,                    /* number of inputs (batch size) */
-    int *c,                    /* number of input feature maps  */
-    int *h,                    /* height of input section */
-    int *w,                    /* width of input section */
-    int *nStride, int *cStride, int *hStride, int *wStride) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      const cudnnTensorDescriptor_t, cudnnDataType_t *, int *, int *, int *,
-      int *, int *, int *, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetTensor4dDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(tensorDesc, dataType, n, c, h, w, nStride, cStride, hStride,
-                  wStride);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetTensorNdDescriptor(
-    cudnnTensorDescriptor_t tensorDesc, cudnnDataType_t dataType, int nbDims,
-    const int dimA[], const int strideA[]) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnTensorDescriptor_t, cudnnDataType_t, int, const int[], const int[]);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetTensorNdDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(tensorDesc, dataType, nbDims, dimA, strideA);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetTensorNdDescriptorEx(
-    cudnnTensorDescriptor_t tensorDesc, cudnnTensorFormat_t format,
-    cudnnDataType_t dataType, int nbDims, const int dimA[]) {
-  using FuncPtr =
-      cudnnStatus_t(CUDNNWINAPI *)(cudnnTensorDescriptor_t, cudnnTensorFormat_t,
-                                   cudnnDataType_t, int, const int[]);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetTensorNdDescriptorEx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(tensorDesc, format, dataType, nbDims, dimA);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetTensorNdDescriptor(
-    const cudnnTensorDescriptor_t tensorDesc, int nbDimsRequested,
-    cudnnDataType_t *dataType, int *nbDims, int dimA[], int strideA[]) {
-  using FuncPtr =
-      cudnnStatus_t(CUDNNWINAPI *)(const cudnnTensorDescriptor_t, int,
-                                   cudnnDataType_t *, int *, int[], int[]);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetTensorNdDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(tensorDesc, nbDimsRequested, dataType, nbDims, dimA, strideA);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetTensorSizeInBytes(
-    const cudnnTensorDescriptor_t tensorDesc, size_t *size) {
-  using FuncPtr =
-      cudnnStatus_t(CUDNNWINAPI *)(const cudnnTensorDescriptor_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetTensorSizeInBytes");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(tensorDesc, size);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnDestroyTensorDescriptor(cudnnTensorDescriptor_t tensorDesc) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnTensorDescriptor_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDestroyTensorDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(tensorDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnTransformTensor(
-    cudnnHandle_t handle, const void *alpha,
-    const cudnnTensorDescriptor_t xDesc, const void *x, const void *beta,
-    const cudnnTensorDescriptor_t yDesc, void *y) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const void *, const cudnnTensorDescriptor_t, const void *,
-      const void *, const cudnnTensorDescriptor_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnTransformTensor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, alpha, xDesc, x, beta, yDesc, y);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnAddTensor(cudnnHandle_t handle,
-                                         const void *alpha,
-                                         const cudnnTensorDescriptor_t aDesc,
-                                         const void *A, const void *beta,
-                                         const cudnnTensorDescriptor_t cDesc,
-                                         void *C) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const void *, const cudnnTensorDescriptor_t, const void *,
-      const void *, const cudnnTensorDescriptor_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnAddTensor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, alpha, aDesc, A, beta, cDesc, C);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnCreateOpTensorDescriptor(cudnnOpTensorDescriptor_t *opTensorDesc) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnOpTensorDescriptor_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnCreateOpTensorDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(opTensorDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetOpTensorDescriptor(
-    cudnnOpTensorDescriptor_t opTensorDesc, cudnnOpTensorOp_t opTensorOp,
-    cudnnDataType_t opTensorCompType, cudnnNanPropagation_t opTensorNanOpt) {
-  using FuncPtr =
-      cudnnStatus_t(CUDNNWINAPI *)(cudnnOpTensorDescriptor_t, cudnnOpTensorOp_t,
-                                   cudnnDataType_t, cudnnNanPropagation_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetOpTensorDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(opTensorDesc, opTensorOp, opTensorCompType, opTensorNanOpt);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetOpTensorDescriptor(
-    const cudnnOpTensorDescriptor_t opTensorDesc, cudnnOpTensorOp_t *opTensorOp,
-    cudnnDataType_t *opTensorCompType, cudnnNanPropagation_t *opTensorNanOpt) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      const cudnnOpTensorDescriptor_t, cudnnOpTensorOp_t *, cudnnDataType_t *,
-      cudnnNanPropagation_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetOpTensorDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(opTensorDesc, opTensorOp, opTensorCompType, opTensorNanOpt);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnDestroyOpTensorDescriptor(cudnnOpTensorDescriptor_t opTensorDesc) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnOpTensorDescriptor_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDestroyOpTensorDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(opTensorDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnOpTensor(
-    cudnnHandle_t handle, const cudnnOpTensorDescriptor_t opTensorDesc,
-    const void *alpha1, const cudnnTensorDescriptor_t aDesc, const void *A,
-    const void *alpha2, const cudnnTensorDescriptor_t bDesc, const void *B,
-    const void *beta, const cudnnTensorDescriptor_t cDesc, void *C) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnOpTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *, const void *,
-      const cudnnTensorDescriptor_t, const void *, const void *,
-      const cudnnTensorDescriptor_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnOpTensor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, opTensorDesc, alpha1, aDesc, A, alpha2, bDesc, B,
-                  beta, cDesc, C);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnCreateReduceTensorDescriptor(
-    cudnnReduceTensorDescriptor_t *reduceTensorDesc) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnReduceTensorDescriptor_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnCreateReduceTensorDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(reduceTensorDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetReduceTensorDescriptor(
-    cudnnReduceTensorDescriptor_t reduceTensorDesc,
-    cudnnReduceTensorOp_t reduceTensorOp, cudnnDataType_t reduceTensorCompType,
-    cudnnNanPropagation_t reduceTensorNanOpt,
-    cudnnReduceTensorIndices_t reduceTensorIndices,
-    cudnnIndicesType_t reduceTensorIndicesType) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnReduceTensorDescriptor_t, cudnnReduceTensorOp_t, cudnnDataType_t,
-      cudnnNanPropagation_t, cudnnReduceTensorIndices_t, cudnnIndicesType_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetReduceTensorDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(reduceTensorDesc, reduceTensorOp, reduceTensorCompType,
-                  reduceTensorNanOpt, reduceTensorIndices,
-                  reduceTensorIndicesType);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetReduceTensorDescriptor(
-    const cudnnReduceTensorDescriptor_t reduceTensorDesc,
-    cudnnReduceTensorOp_t *reduceTensorOp,
-    cudnnDataType_t *reduceTensorCompType,
-    cudnnNanPropagation_t *reduceTensorNanOpt,
-    cudnnReduceTensorIndices_t *reduceTensorIndices,
-    cudnnIndicesType_t *reduceTensorIndicesType) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      const cudnnReduceTensorDescriptor_t, cudnnReduceTensorOp_t *,
-      cudnnDataType_t *, cudnnNanPropagation_t *, cudnnReduceTensorIndices_t *,
-      cudnnIndicesType_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetReduceTensorDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(reduceTensorDesc, reduceTensorOp, reduceTensorCompType,
-                  reduceTensorNanOpt, reduceTensorIndices,
-                  reduceTensorIndicesType);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnDestroyReduceTensorDescriptor(
-    cudnnReduceTensorDescriptor_t reduceTensorDesc) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnReduceTensorDescriptor_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnDestroyReduceTensorDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(reduceTensorDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetReductionIndicesSize(
-    cudnnHandle_t handle, const cudnnReduceTensorDescriptor_t reduceTensorDesc,
-    const cudnnTensorDescriptor_t aDesc, const cudnnTensorDescriptor_t cDesc,
-    size_t *sizeInBytes) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnReduceTensorDescriptor_t,
-      const cudnnTensorDescriptor_t, const cudnnTensorDescriptor_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetReductionIndicesSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, reduceTensorDesc, aDesc, cDesc, sizeInBytes);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetReductionWorkspaceSize(
-    cudnnHandle_t handle, const cudnnReduceTensorDescriptor_t reduceTensorDesc,
-    const cudnnTensorDescriptor_t aDesc, const cudnnTensorDescriptor_t cDesc,
-    size_t *sizeInBytes) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnReduceTensorDescriptor_t,
-      const cudnnTensorDescriptor_t, const cudnnTensorDescriptor_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetReductionWorkspaceSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, reduceTensorDesc, aDesc, cDesc, sizeInBytes);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnReduceTensor(
-    cudnnHandle_t handle, const cudnnReduceTensorDescriptor_t reduceTensorDesc,
-    void *indices, size_t indicesSizeInBytes, void *workspace,
-    size_t workspaceSizeInBytes, const void *alpha,
-    const cudnnTensorDescriptor_t aDesc, const void *A, const void *beta,
-    const cudnnTensorDescriptor_t cDesc, void *C) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnReduceTensorDescriptor_t, void *, size_t,
-      void *, size_t, const void *, const cudnnTensorDescriptor_t, const void *,
-      const void *, const cudnnTensorDescriptor_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnReduceTensor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, reduceTensorDesc, indices, indicesSizeInBytes,
-                  workspace, workspaceSizeInBytes, alpha, aDesc, A, beta, cDesc,
-                  C);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetTensor(cudnnHandle_t handle,
-                                         const cudnnTensorDescriptor_t yDesc,
-                                         void *y, const void *valuePtr) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnTensorDescriptor_t, void *, const void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetTensor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, yDesc, y, valuePtr);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnScaleTensor(cudnnHandle_t handle,
-                                           const cudnnTensorDescriptor_t yDesc,
-                                           void *y, const void *alpha) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnTensorDescriptor_t, void *, const void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnScaleTensor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, yDesc, y, alpha);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnCreateFilterDescriptor(cudnnFilterDescriptor_t *filterDesc) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnFilterDescriptor_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnCreateFilterDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(filterDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetFilter4dDescriptor(
-    cudnnFilterDescriptor_t filterDesc,
-    cudnnDataType_t dataType,          /* image data type */
-    cudnnTensorFormat_t format, int k, /* number of output feature maps */
-    int c,                             /* number of input feature maps */
-    int h,                             /* height of each input filter */
-    int w) {
-  using FuncPtr =
-      cudnnStatus_t(CUDNNWINAPI *)(cudnnFilterDescriptor_t, cudnnDataType_t,
-                                   cudnnTensorFormat_t, int, int, int, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetFilter4dDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(filterDesc, dataType, format, k, c, h, w);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetFilter4dDescriptor(
-    const cudnnFilterDescriptor_t filterDesc,
-    cudnnDataType_t *dataType,           /* image data type */
-    cudnnTensorFormat_t *format, int *k, /* number of output feature maps */
-    int *c,                              /* number of input feature maps */
-    int *h,                              /* height of each input filter */
-    int *w) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      const cudnnFilterDescriptor_t, cudnnDataType_t *, cudnnTensorFormat_t *,
-      int *, int *, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetFilter4dDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(filterDesc, dataType, format, k, c, h, w);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetFilterNdDescriptor(
-    cudnnFilterDescriptor_t filterDesc,
-    cudnnDataType_t dataType, /* image data type */
-    cudnnTensorFormat_t format, int nbDims, const int filterDimA[]) {
-  using FuncPtr =
-      cudnnStatus_t(CUDNNWINAPI *)(cudnnFilterDescriptor_t, cudnnDataType_t,
-                                   cudnnTensorFormat_t, int, const int[]);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetFilterNdDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(filterDesc, dataType, format, nbDims, filterDimA);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetFilterNdDescriptor(
-    const cudnnFilterDescriptor_t filterDesc, int nbDimsRequested,
-    cudnnDataType_t *dataType, /* image data type */
-    cudnnTensorFormat_t *format, int *nbDims, int filterDimA[]) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      const cudnnFilterDescriptor_t, int, cudnnDataType_t *,
-      cudnnTensorFormat_t *, int *, int[]);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetFilterNdDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(filterDesc, nbDimsRequested, dataType, format, nbDims,
-                  filterDimA);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnDestroyFilterDescriptor(cudnnFilterDescriptor_t filterDesc) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnFilterDescriptor_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDestroyFilterDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(filterDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnCreateConvolutionDescriptor(cudnnConvolutionDescriptor_t *convDesc) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnConvolutionDescriptor_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnCreateConvolutionDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(convDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetConvolutionMathType(
-    cudnnConvolutionDescriptor_t convDesc, cudnnMathType_t mathType) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnConvolutionDescriptor_t,
-                                               cudnnMathType_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetConvolutionMathType");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(convDesc, mathType);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionMathType(
-    cudnnConvolutionDescriptor_t convDesc, cudnnMathType_t *mathType) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnConvolutionDescriptor_t,
-                                               cudnnMathType_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetConvolutionMathType");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(convDesc, mathType);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetConvolutionGroupCount(
-    cudnnConvolutionDescriptor_t convDesc, int groupCount) {
-  using FuncPtr =
-      cudnnStatus_t(CUDNNWINAPI *)(cudnnConvolutionDescriptor_t, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetConvolutionGroupCount");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(convDesc, groupCount);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionGroupCount(
-    cudnnConvolutionDescriptor_t convDesc, int *groupCount) {
-  using FuncPtr =
-      cudnnStatus_t(CUDNNWINAPI *)(cudnnConvolutionDescriptor_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetConvolutionGroupCount");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(convDesc, groupCount);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetConvolution2dDescriptor(
-    cudnnConvolutionDescriptor_t convDesc, int pad_h, /* zero-padding height */
-    int pad_w,                                        /* zero-padding width */
-    int u,          /* vertical filter stride */
-    int v,          /* horizontal filter stride */
-    int dilation_h, /* filter dilation in the vertical dimension */
-    int dilation_w, /* filter dilation in the horizontal dimension */
-    cudnnConvolutionMode_t mode, cudnnDataType_t computeType) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnConvolutionDescriptor_t, int, int, int, int, int, int,
-      cudnnConvolutionMode_t, cudnnDataType_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetConvolution2dDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(convDesc, pad_h, pad_w, u, v, dilation_h, dilation_w, mode,
-                  computeType);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetConvolution2dDescriptor(
-    const cudnnConvolutionDescriptor_t convDesc,
-    int *pad_h,      /* zero-padding height */
-    int *pad_w,      /* zero-padding width */
-    int *u,          /* vertical filter stride */
-    int *v,          /* horizontal filter stride */
-    int *dilation_h, /* filter dilation in the vertical dimension */
-    int *dilation_w, /* filter dilation in the horizontal dimension */
-    cudnnConvolutionMode_t *mode, cudnnDataType_t *computeType) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      const cudnnConvolutionDescriptor_t, int *, int *, int *, int *, int *,
-      int *, cudnnConvolutionMode_t *, cudnnDataType_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetConvolution2dDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(convDesc, pad_h, pad_w, u, v, dilation_h, dilation_w, mode,
-                  computeType);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetConvolution2dForwardOutputDim(
-    const cudnnConvolutionDescriptor_t convDesc,
-    const cudnnTensorDescriptor_t inputTensorDesc,
-    const cudnnFilterDescriptor_t filterDesc, int *n, int *c, int *h, int *w) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t,
-      const cudnnFilterDescriptor_t, int *, int *, int *, int *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnGetConvolution2dForwardOutputDim");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(convDesc, inputTensorDesc, filterDesc, n, c, h, w);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetConvolutionNdDescriptor(
-    cudnnConvolutionDescriptor_t convDesc, int arrayLength, /* nbDims-2 size */
-    const int padA[], const int filterStrideA[], const int dilationA[],
-    cudnnConvolutionMode_t mode, cudnnDataType_t computeType) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnConvolutionDescriptor_t, int, const int[], const int[], const int[],
-      cudnnConvolutionMode_t, cudnnDataType_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetConvolutionNdDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(convDesc, arrayLength, padA, filterStrideA, dilationA, mode,
-                  computeType);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionNdDescriptor(
-    const cudnnConvolutionDescriptor_t convDesc, int arrayLengthRequested,
-    int *arrayLength, int padA[], int strideA[], int dilationA[],
-    cudnnConvolutionMode_t *mode, cudnnDataType_t *computeType) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      const cudnnConvolutionDescriptor_t, int, int *, int[], int[], int[],
-      cudnnConvolutionMode_t *, cudnnDataType_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetConvolutionNdDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(convDesc, arrayLengthRequested, arrayLength, padA, strideA,
-                  dilationA, mode, computeType);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionNdForwardOutputDim(
-    const cudnnConvolutionDescriptor_t convDesc,
-    const cudnnTensorDescriptor_t inputTensorDesc,
-    const cudnnFilterDescriptor_t filterDesc, int nbDims,
-    int tensorOutputDimA[]) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t,
-      const cudnnFilterDescriptor_t, int, int[]);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnGetConvolutionNdForwardOutputDim");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(convDesc, inputTensorDesc, filterDesc, nbDims,
-                  tensorOutputDimA);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnDestroyConvolutionDescriptor(cudnnConvolutionDescriptor_t convDesc) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnConvolutionDescriptor_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnDestroyConvolutionDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(convDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnGetConvolutionForwardAlgorithmMaxCount(cudnnHandle_t handle, int *count) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnHandle_t, int *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnGetConvolutionForwardAlgorithmMaxCount");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, count);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnFindConvolutionForwardAlgorithm(
-    cudnnHandle_t handle, const cudnnTensorDescriptor_t xDesc,
-    const cudnnFilterDescriptor_t wDesc,
-    const cudnnConvolutionDescriptor_t convDesc,
-    const cudnnTensorDescriptor_t yDesc, const int requestedAlgoCount,
-    int *returnedAlgoCount, cudnnConvolutionFwdAlgoPerf_t *perfResults) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnTensorDescriptor_t,
-      const cudnnFilterDescriptor_t, const cudnnConvolutionDescriptor_t,
-      const cudnnTensorDescriptor_t, const int, int *,
-      cudnnConvolutionFwdAlgoPerf_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnFindConvolutionForwardAlgorithm");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, xDesc, wDesc, convDesc, yDesc, requestedAlgoCount,
-                  returnedAlgoCount, perfResults);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnFindConvolutionForwardAlgorithmEx(
-    cudnnHandle_t handle, const cudnnTensorDescriptor_t xDesc, const void *x,
-    const cudnnFilterDescriptor_t wDesc, const void *w,
-    const cudnnConvolutionDescriptor_t convDesc,
-    const cudnnTensorDescriptor_t yDesc, void *y, const int requestedAlgoCount,
-    int *returnedAlgoCount, cudnnConvolutionFwdAlgoPerf_t *perfResults,
-    void *workSpace, size_t workSpaceSizeInBytes) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnTensorDescriptor_t, const void *,
-      const cudnnFilterDescriptor_t, const void *,
-      const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t, void *,
-      const int, int *, cudnnConvolutionFwdAlgoPerf_t *, void *, size_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnFindConvolutionForwardAlgorithmEx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, xDesc, x, wDesc, w, convDesc, yDesc, y,
-                  requestedAlgoCount, returnedAlgoCount, perfResults, workSpace,
-                  workSpaceSizeInBytes);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionForwardAlgorithm(
-    cudnnHandle_t handle, const cudnnTensorDescriptor_t xDesc,
-    const cudnnFilterDescriptor_t wDesc,
-    const cudnnConvolutionDescriptor_t convDesc,
-    const cudnnTensorDescriptor_t yDesc,
-    cudnnConvolutionFwdPreference_t preference, size_t memoryLimitInBytes,
-    cudnnConvolutionFwdAlgo_t *algo) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnTensorDescriptor_t,
-      const cudnnFilterDescriptor_t, const cudnnConvolutionDescriptor_t,
-      const cudnnTensorDescriptor_t, cudnnConvolutionFwdPreference_t, size_t,
-      cudnnConvolutionFwdAlgo_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnGetConvolutionForwardAlgorithm");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, xDesc, wDesc, convDesc, yDesc, preference,
-                  memoryLimitInBytes, algo);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionForwardAlgorithm_v7(
-    cudnnHandle_t handle, const cudnnTensorDescriptor_t srcDesc,
-    const cudnnFilterDescriptor_t filterDesc,
-    const cudnnConvolutionDescriptor_t convDesc,
-    const cudnnTensorDescriptor_t destDesc, const int requestedAlgoCount,
-    int *returnedAlgoCount, cudnnConvolutionFwdAlgoPerf_t *perfResults) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnTensorDescriptor_t,
-      const cudnnFilterDescriptor_t, const cudnnConvolutionDescriptor_t,
-      const cudnnTensorDescriptor_t, const int, int *,
-      cudnnConvolutionFwdAlgoPerf_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnGetConvolutionForwardAlgorithm_v7");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, srcDesc, filterDesc, convDesc, destDesc,
-                  requestedAlgoCount, returnedAlgoCount, perfResults);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionForwardWorkspaceSize(
-    cudnnHandle_t handle, const cudnnTensorDescriptor_t xDesc,
-    const cudnnFilterDescriptor_t wDesc,
-    const cudnnConvolutionDescriptor_t convDesc,
-    const cudnnTensorDescriptor_t yDesc, cudnnConvolutionFwdAlgo_t algo,
-    size_t *sizeInBytes) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnTensorDescriptor_t,
-      const cudnnFilterDescriptor_t, const cudnnConvolutionDescriptor_t,
-      const cudnnTensorDescriptor_t, cudnnConvolutionFwdAlgo_t, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnGetConvolutionForwardWorkspaceSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, xDesc, wDesc, convDesc, yDesc, algo, sizeInBytes);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnConvolutionForward(
-    cudnnHandle_t handle, const void *alpha,
-    const cudnnTensorDescriptor_t xDesc, const void *x,
-    const cudnnFilterDescriptor_t wDesc, const void *w,
-    const cudnnConvolutionDescriptor_t convDesc, cudnnConvolutionFwdAlgo_t algo,
-    void *workSpace, size_t workSpaceSizeInBytes, const void *beta,
-    const cudnnTensorDescriptor_t yDesc, void *y) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const void *, const cudnnTensorDescriptor_t, const void *,
-      const cudnnFilterDescriptor_t, const void *,
-      const cudnnConvolutionDescriptor_t, cudnnConvolutionFwdAlgo_t, void *,
-      size_t, const void *, const cudnnTensorDescriptor_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnConvolutionForward");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, alpha, xDesc, x, wDesc, w, convDesc, algo, workSpace,
-                  workSpaceSizeInBytes, beta, yDesc, y);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnConvolutionBiasActivationForward(
-    cudnnHandle_t handle, const void *alpha1,
-    const cudnnTensorDescriptor_t xDesc, const void *x,
-    const cudnnFilterDescriptor_t wDesc, const void *w,
-    const cudnnConvolutionDescriptor_t convDesc, cudnnConvolutionFwdAlgo_t algo,
-    void *workSpace, size_t workSpaceSizeInBytes, const void *alpha2,
-    const cudnnTensorDescriptor_t zDesc, const void *z,
-    const cudnnTensorDescriptor_t biasDesc, const void *bias,
-    const cudnnActivationDescriptor_t activationDesc,
-    const cudnnTensorDescriptor_t yDesc, void *y) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const void *, const cudnnTensorDescriptor_t, const void *,
-      const cudnnFilterDescriptor_t, const void *,
-      const cudnnConvolutionDescriptor_t, cudnnConvolutionFwdAlgo_t, void *,
-      size_t, const void *, const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnActivationDescriptor_t, const cudnnTensorDescriptor_t, void *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnConvolutionBiasActivationForward");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, alpha1, xDesc, x, wDesc, w, convDesc, algo, workSpace,
-                  workSpaceSizeInBytes, alpha2, zDesc, z, biasDesc, bias,
-                  activationDesc, yDesc, y);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnConvolutionBackwardBias(
-    cudnnHandle_t handle, const void *alpha,
-    const cudnnTensorDescriptor_t dyDesc, const void *dy, const void *beta,
-    const cudnnTensorDescriptor_t dbDesc, void *db) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const void *, const cudnnTensorDescriptor_t, const void *,
-      const void *, const cudnnTensorDescriptor_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnConvolutionBackwardBias");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, alpha, dyDesc, dy, beta, dbDesc, db);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionBackwardFilterAlgorithmMaxCount(
-    cudnnHandle_t handle, int *count) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnHandle_t, int *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnGetConvolutionBackwardFilterAlgorithmMaxCount");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, count);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnFindConvolutionBackwardFilterAlgorithm(
-    cudnnHandle_t handle, const cudnnTensorDescriptor_t xDesc,
-    const cudnnTensorDescriptor_t dyDesc,
-    const cudnnConvolutionDescriptor_t convDesc,
-    const cudnnFilterDescriptor_t dwDesc, const int requestedAlgoCount,
-    int *returnedAlgoCount, cudnnConvolutionBwdFilterAlgoPerf_t *perfResults) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnTensorDescriptor_t,
-      const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t,
-      const cudnnFilterDescriptor_t, const int, int *,
-      cudnnConvolutionBwdFilterAlgoPerf_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnFindConvolutionBackwardFilterAlgorithm");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, xDesc, dyDesc, convDesc, dwDesc, requestedAlgoCount,
-                  returnedAlgoCount, perfResults);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnFindConvolutionBackwardFilterAlgorithmEx(
-    cudnnHandle_t handle, const cudnnTensorDescriptor_t xDesc, const void *x,
-    const cudnnTensorDescriptor_t dyDesc, const void *y,
-    const cudnnConvolutionDescriptor_t convDesc,
-    const cudnnFilterDescriptor_t dwDesc, void *dw,
-    const int requestedAlgoCount, int *returnedAlgoCount,
-    cudnnConvolutionBwdFilterAlgoPerf_t *perfResults, void *workSpace,
-    size_t workSpaceSizeInBytes) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnConvolutionDescriptor_t, const cudnnFilterDescriptor_t, void *,
-      const int, int *, cudnnConvolutionBwdFilterAlgoPerf_t *, void *, size_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnFindConvolutionBackwardFilterAlgorithmEx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, xDesc, x, dyDesc, y, convDesc, dwDesc, dw,
-                  requestedAlgoCount, returnedAlgoCount, perfResults, workSpace,
-                  workSpaceSizeInBytes);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionBackwardFilterAlgorithm(
-    cudnnHandle_t handle, const cudnnTensorDescriptor_t xDesc,
-    const cudnnTensorDescriptor_t dyDesc,
-    const cudnnConvolutionDescriptor_t convDesc,
-    const cudnnFilterDescriptor_t dwDesc,
-    cudnnConvolutionBwdFilterPreference_t preference, size_t memoryLimitInBytes,
-    cudnnConvolutionBwdFilterAlgo_t *algo) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnTensorDescriptor_t,
-      const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t,
-      const cudnnFilterDescriptor_t, cudnnConvolutionBwdFilterPreference_t,
-      size_t, cudnnConvolutionBwdFilterAlgo_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnGetConvolutionBackwardFilterAlgorithm");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, xDesc, dyDesc, convDesc, dwDesc, preference,
-                  memoryLimitInBytes, algo);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionBackwardFilterAlgorithm_v7(
-    cudnnHandle_t handle, const cudnnTensorDescriptor_t srcDesc,
-    const cudnnTensorDescriptor_t diffDesc,
-    const cudnnConvolutionDescriptor_t convDesc,
-    const cudnnFilterDescriptor_t gradDesc, const int requestedAlgoCount,
-    int *returnedAlgoCount, cudnnConvolutionBwdFilterAlgoPerf_t *perfResults) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnTensorDescriptor_t,
-      const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t,
-      const cudnnFilterDescriptor_t, const int, int *,
-      cudnnConvolutionBwdFilterAlgoPerf_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnGetConvolutionBackwardFilterAlgorithm_v7");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, srcDesc, diffDesc, convDesc, gradDesc,
-                  requestedAlgoCount, returnedAlgoCount, perfResults);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionBackwardFilterWorkspaceSize(
-    cudnnHandle_t handle, const cudnnTensorDescriptor_t xDesc,
-    const cudnnTensorDescriptor_t dyDesc,
-    const cudnnConvolutionDescriptor_t convDesc,
-    const cudnnFilterDescriptor_t gradDesc,
-    cudnnConvolutionBwdFilterAlgo_t algo, size_t *sizeInBytes) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnTensorDescriptor_t,
-      const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t,
-      const cudnnFilterDescriptor_t, cudnnConvolutionBwdFilterAlgo_t, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnGetConvolutionBackwardFilterWorkspaceSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, xDesc, dyDesc, convDesc, gradDesc, algo, sizeInBytes);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnConvolutionBackwardFilter(
-    cudnnHandle_t handle, const void *alpha,
-    const cudnnTensorDescriptor_t xDesc, const void *x,
-    const cudnnTensorDescriptor_t dyDesc, const void *dy,
-    const cudnnConvolutionDescriptor_t convDesc,
-    cudnnConvolutionBwdFilterAlgo_t algo, void *workSpace,
-    size_t workSpaceSizeInBytes, const void *beta,
-    const cudnnFilterDescriptor_t dwDesc, void *dw) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const void *, const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnConvolutionDescriptor_t, cudnnConvolutionBwdFilterAlgo_t,
-      void *, size_t, const void *, const cudnnFilterDescriptor_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnConvolutionBackwardFilter");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, alpha, xDesc, x, dyDesc, dy, convDesc, algo,
-                  workSpace, workSpaceSizeInBytes, beta, dwDesc, dw);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionBackwardDataAlgorithmMaxCount(
-    cudnnHandle_t handle, int *count) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnHandle_t, int *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnGetConvolutionBackwardDataAlgorithmMaxCount");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, count);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnFindConvolutionBackwardDataAlgorithm(
-    cudnnHandle_t handle, const cudnnFilterDescriptor_t wDesc,
-    const cudnnTensorDescriptor_t dyDesc,
-    const cudnnConvolutionDescriptor_t convDesc,
-    const cudnnTensorDescriptor_t dxDesc, const int requestedAlgoCount,
-    int *returnedAlgoCount, cudnnConvolutionBwdDataAlgoPerf_t *perfResults) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnFilterDescriptor_t,
-      const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t,
-      const cudnnTensorDescriptor_t, const int, int *,
-      cudnnConvolutionBwdDataAlgoPerf_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnFindConvolutionBackwardDataAlgorithm");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, wDesc, dyDesc, convDesc, dxDesc, requestedAlgoCount,
-                  returnedAlgoCount, perfResults);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnFindConvolutionBackwardDataAlgorithmEx(
-    cudnnHandle_t handle, const cudnnFilterDescriptor_t wDesc, const void *w,
-    const cudnnTensorDescriptor_t dyDesc, const void *dy,
-    const cudnnConvolutionDescriptor_t convDesc,
-    const cudnnTensorDescriptor_t dxDesc, void *dx,
-    const int requestedAlgoCount, int *returnedAlgoCount,
-    cudnnConvolutionBwdDataAlgoPerf_t *perfResults, void *workSpace,
-    size_t workSpaceSizeInBytes) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnFilterDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t, void *,
-      const int, int *, cudnnConvolutionBwdDataAlgoPerf_t *, void *, size_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnFindConvolutionBackwardDataAlgorithmEx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, wDesc, w, dyDesc, dy, convDesc, dxDesc, dx,
-                  requestedAlgoCount, returnedAlgoCount, perfResults, workSpace,
-                  workSpaceSizeInBytes);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionBackwardDataAlgorithm(
-    cudnnHandle_t handle, const cudnnFilterDescriptor_t wDesc,
-    const cudnnTensorDescriptor_t dyDesc,
-    const cudnnConvolutionDescriptor_t convDesc,
-    const cudnnTensorDescriptor_t dxDesc,
-    cudnnConvolutionBwdDataPreference_t preference, size_t memoryLimitInBytes,
-    cudnnConvolutionBwdDataAlgo_t *algo) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnFilterDescriptor_t,
-      const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t,
-      const cudnnTensorDescriptor_t, cudnnConvolutionBwdDataPreference_t,
-      size_t, cudnnConvolutionBwdDataAlgo_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnGetConvolutionBackwardDataAlgorithm");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, wDesc, dyDesc, convDesc, dxDesc, preference,
-                  memoryLimitInBytes, algo);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionBackwardDataAlgorithm_v7(
-    cudnnHandle_t handle, const cudnnFilterDescriptor_t filterDesc,
-    const cudnnTensorDescriptor_t diffDesc,
-    const cudnnConvolutionDescriptor_t convDesc,
-    const cudnnTensorDescriptor_t gradDesc, const int requestedAlgoCount,
-    int *returnedAlgoCount, cudnnConvolutionBwdDataAlgoPerf_t *perfResults) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnFilterDescriptor_t,
-      const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t,
-      const cudnnTensorDescriptor_t, const int, int *,
-      cudnnConvolutionBwdDataAlgoPerf_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnGetConvolutionBackwardDataAlgorithm_v7");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, filterDesc, diffDesc, convDesc, gradDesc,
-                  requestedAlgoCount, returnedAlgoCount, perfResults);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionBackwardDataWorkspaceSize(
-    cudnnHandle_t handle, const cudnnFilterDescriptor_t wDesc,
-    const cudnnTensorDescriptor_t dyDesc,
-    const cudnnConvolutionDescriptor_t convDesc,
-    const cudnnTensorDescriptor_t dxDesc, cudnnConvolutionBwdDataAlgo_t algo,
-    size_t *sizeInBytes) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnFilterDescriptor_t,
-      const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t,
-      const cudnnTensorDescriptor_t, cudnnConvolutionBwdDataAlgo_t, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnGetConvolutionBackwardDataWorkspaceSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, wDesc, dyDesc, convDesc, dxDesc, algo, sizeInBytes);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnConvolutionBackwardData(
-    cudnnHandle_t handle, const void *alpha,
-    const cudnnFilterDescriptor_t wDesc, const void *w,
-    const cudnnTensorDescriptor_t dyDesc, const void *dy,
-    const cudnnConvolutionDescriptor_t convDesc,
-    cudnnConvolutionBwdDataAlgo_t algo, void *workSpace,
-    size_t workSpaceSizeInBytes, const void *beta,
-    const cudnnTensorDescriptor_t dxDesc, void *dx) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const void *, const cudnnFilterDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnConvolutionDescriptor_t, cudnnConvolutionBwdDataAlgo_t, void *,
-      size_t, const void *, const cudnnTensorDescriptor_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnConvolutionBackwardData");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, alpha, wDesc, w, dyDesc, dy, convDesc, algo,
-                  workSpace, workSpaceSizeInBytes, beta, dxDesc, dx);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnIm2Col(cudnnHandle_t handle, const cudnnTensorDescriptor_t xDesc,
-            const void *x, const cudnnFilterDescriptor_t wDesc,
-            const cudnnConvolutionDescriptor_t convDesc, void *colBuffer) {
-  using FuncPtr =
-      cudnnStatus_t(CUDNNWINAPI *)(cudnnHandle_t, const cudnnTensorDescriptor_t,
-                                   const void *, const cudnnFilterDescriptor_t,
-                                   const cudnnConvolutionDescriptor_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnIm2Col");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, xDesc, x, wDesc, convDesc, colBuffer);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSoftmaxForward(
-    cudnnHandle_t handle, cudnnSoftmaxAlgorithm_t algo, cudnnSoftmaxMode_t mode,
-    const void *alpha, const cudnnTensorDescriptor_t xDesc, const void *x,
-    const void *beta, const cudnnTensorDescriptor_t yDesc, void *y) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, cudnnSoftmaxAlgorithm_t, cudnnSoftmaxMode_t, const void *,
-      const cudnnTensorDescriptor_t, const void *, const void *,
-      const cudnnTensorDescriptor_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSoftmaxForward");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, algo, mode, alpha, xDesc, x, beta, yDesc, y);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSoftmaxBackward(
-    cudnnHandle_t handle, cudnnSoftmaxAlgorithm_t algo, cudnnSoftmaxMode_t mode,
-    const void *alpha, const cudnnTensorDescriptor_t yDesc, const void *y,
-    const cudnnTensorDescriptor_t dyDesc, const void *dy, const void *beta,
-    const cudnnTensorDescriptor_t dxDesc, void *dx) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, cudnnSoftmaxAlgorithm_t, cudnnSoftmaxMode_t, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *, const void *,
-      const cudnnTensorDescriptor_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSoftmaxBackward");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, algo, mode, alpha, yDesc, y, dyDesc, dy, beta, dxDesc,
-                  dx);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnCreatePoolingDescriptor(cudnnPoolingDescriptor_t *poolingDesc) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnPoolingDescriptor_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnCreatePoolingDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(poolingDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetPooling2dDescriptor(
-    cudnnPoolingDescriptor_t poolingDesc, cudnnPoolingMode_t mode,
-    cudnnNanPropagation_t maxpoolingNanOpt, int windowHeight, int windowWidth,
-    int verticalPadding, int horizontalPadding, int verticalStride,
-    int horizontalStride) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnPoolingDescriptor_t, cudnnPoolingMode_t, cudnnNanPropagation_t, int,
-      int, int, int, int, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetPooling2dDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(poolingDesc, mode, maxpoolingNanOpt, windowHeight,
-                  windowWidth, verticalPadding, horizontalPadding,
-                  verticalStride, horizontalStride);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetPooling2dDescriptor(
-    const cudnnPoolingDescriptor_t poolingDesc, cudnnPoolingMode_t *mode,
-    cudnnNanPropagation_t *maxpoolingNanOpt, int *windowHeight,
-    int *windowWidth, int *verticalPadding, int *horizontalPadding,
-    int *verticalStride, int *horizontalStride) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      const cudnnPoolingDescriptor_t, cudnnPoolingMode_t *,
-      cudnnNanPropagation_t *, int *, int *, int *, int *, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetPooling2dDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(poolingDesc, mode, maxpoolingNanOpt, windowHeight,
-                  windowWidth, verticalPadding, horizontalPadding,
-                  verticalStride, horizontalStride);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetPoolingNdDescriptor(
-    cudnnPoolingDescriptor_t poolingDesc, const cudnnPoolingMode_t mode,
-    const cudnnNanPropagation_t maxpoolingNanOpt, int nbDims,
-    const int windowDimA[], const int paddingA[], const int strideA[]) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnPoolingDescriptor_t, const cudnnPoolingMode_t,
-      const cudnnNanPropagation_t, int, const int[], const int[], const int[]);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetPoolingNdDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(poolingDesc, mode, maxpoolingNanOpt, nbDims, windowDimA,
-                  paddingA, strideA);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetPoolingNdDescriptor(
-    const cudnnPoolingDescriptor_t poolingDesc, int nbDimsRequested,
-    cudnnPoolingMode_t *mode, cudnnNanPropagation_t *maxpoolingNanOpt,
-    int *nbDims, int windowDimA[], int paddingA[], int strideA[]) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      const cudnnPoolingDescriptor_t, int, cudnnPoolingMode_t *,
-      cudnnNanPropagation_t *, int *, int[], int[], int[]);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetPoolingNdDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(poolingDesc, nbDimsRequested, mode, maxpoolingNanOpt, nbDims,
-                  windowDimA, paddingA, strideA);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnGetPoolingNdForwardOutputDim(const cudnnPoolingDescriptor_t poolingDesc,
-                                  const cudnnTensorDescriptor_t inputTensorDesc,
-                                  int nbDims, int outputTensorDimA[]) {
-  using FuncPtr =
-      cudnnStatus_t(CUDNNWINAPI *)(const cudnnPoolingDescriptor_t,
-                                   const cudnnTensorDescriptor_t, int, int[]);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnGetPoolingNdForwardOutputDim");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(poolingDesc, inputTensorDesc, nbDims, outputTensorDimA);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnGetPooling2dForwardOutputDim(const cudnnPoolingDescriptor_t poolingDesc,
-                                  const cudnnTensorDescriptor_t inputTensorDesc,
-                                  int *n, int *c, int *h, int *w) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(const cudnnPoolingDescriptor_t,
-                                               const cudnnTensorDescriptor_t,
-                                               int *, int *, int *, int *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnGetPooling2dForwardOutputDim");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(poolingDesc, inputTensorDesc, n, c, h, w);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnDestroyPoolingDescriptor(cudnnPoolingDescriptor_t poolingDesc) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnPoolingDescriptor_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDestroyPoolingDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(poolingDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnPoolingForward(
-    cudnnHandle_t handle, const cudnnPoolingDescriptor_t poolingDesc,
-    const void *alpha, const cudnnTensorDescriptor_t xDesc, const void *x,
-    const void *beta, const cudnnTensorDescriptor_t yDesc, void *y) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnPoolingDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *, const void *,
-      const cudnnTensorDescriptor_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnPoolingForward");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, poolingDesc, alpha, xDesc, x, beta, yDesc, y);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnPoolingBackward(
-    cudnnHandle_t handle, const cudnnPoolingDescriptor_t poolingDesc,
-    const void *alpha, const cudnnTensorDescriptor_t yDesc, const void *y,
-    const cudnnTensorDescriptor_t dyDesc, const void *dy,
-    const cudnnTensorDescriptor_t xDesc, const void *x, const void *beta,
-    const cudnnTensorDescriptor_t dxDesc, void *dx) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnPoolingDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *, const void *,
-      const cudnnTensorDescriptor_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnPoolingBackward");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, poolingDesc, alpha, yDesc, y, dyDesc, dy, xDesc, x,
-                  beta, dxDesc, dx);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnCreateActivationDescriptor(cudnnActivationDescriptor_t *activationDesc) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnActivationDescriptor_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnCreateActivationDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(activationDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetActivationDescriptor(
-    cudnnActivationDescriptor_t activationDesc, cudnnActivationMode_t mode,
-    cudnnNanPropagation_t reluNanOpt, double coef) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnActivationDescriptor_t,
-                                               cudnnActivationMode_t,
-                                               cudnnNanPropagation_t, double);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetActivationDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(activationDesc, mode, reluNanOpt, coef);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnGetActivationDescriptor(const cudnnActivationDescriptor_t activationDesc,
-                             cudnnActivationMode_t *mode,
-                             cudnnNanPropagation_t *reluNanOpt, double *coef) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      const cudnnActivationDescriptor_t, cudnnActivationMode_t *,
-      cudnnNanPropagation_t *, double *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetActivationDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(activationDesc, mode, reluNanOpt, coef);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnDestroyActivationDescriptor(cudnnActivationDescriptor_t activationDesc) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnActivationDescriptor_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnDestroyActivationDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(activationDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnActivationForward(
-    cudnnHandle_t handle, cudnnActivationDescriptor_t activationDesc,
-    const void *alpha, const cudnnTensorDescriptor_t xDesc, const void *x,
-    const void *beta, const cudnnTensorDescriptor_t yDesc, void *y) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, cudnnActivationDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *, const void *,
-      const cudnnTensorDescriptor_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnActivationForward");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, activationDesc, alpha, xDesc, x, beta, yDesc, y);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnActivationBackward(
-    cudnnHandle_t handle, cudnnActivationDescriptor_t activationDesc,
-    const void *alpha, const cudnnTensorDescriptor_t yDesc, const void *y,
-    const cudnnTensorDescriptor_t dyDesc, const void *dy,
-    const cudnnTensorDescriptor_t xDesc, const void *x, const void *beta,
-    const cudnnTensorDescriptor_t dxDesc, void *dx) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, cudnnActivationDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *, const void *,
-      const cudnnTensorDescriptor_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnActivationBackward");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, activationDesc, alpha, yDesc, y, dyDesc, dy, xDesc, x,
-                  beta, dxDesc, dx);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnCreateLRNDescriptor(cudnnLRNDescriptor_t *normDesc) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnLRNDescriptor_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnCreateLRNDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(normDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetLRNDescriptor(cudnnLRNDescriptor_t normDesc,
-                                                unsigned lrnN, double lrnAlpha,
-                                                double lrnBeta, double lrnK) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnLRNDescriptor_t, unsigned int, double, double, double);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetLRNDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(normDesc, lrnN, lrnAlpha, lrnBeta, lrnK);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetLRNDescriptor(cudnnLRNDescriptor_t normDesc,
-                                                unsigned *lrnN,
-                                                double *lrnAlpha,
-                                                double *lrnBeta, double *lrnK) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnLRNDescriptor_t, unsigned int *, double *, double *, double *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetLRNDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(normDesc, lrnN, lrnAlpha, lrnBeta, lrnK);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnDestroyLRNDescriptor(cudnnLRNDescriptor_t lrnDesc) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnLRNDescriptor_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDestroyLRNDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(lrnDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnLRNCrossChannelForward(
-    cudnnHandle_t handle, cudnnLRNDescriptor_t normDesc, cudnnLRNMode_t lrnMode,
-    const void *alpha, const cudnnTensorDescriptor_t xDesc, const void *x,
-    const void *beta, const cudnnTensorDescriptor_t yDesc, void *y) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, cudnnLRNDescriptor_t, cudnnLRNMode_t, const void *,
-      const cudnnTensorDescriptor_t, const void *, const void *,
-      const cudnnTensorDescriptor_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnLRNCrossChannelForward");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, normDesc, lrnMode, alpha, xDesc, x, beta, yDesc, y);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnLRNCrossChannelBackward(
-    cudnnHandle_t handle, cudnnLRNDescriptor_t normDesc, cudnnLRNMode_t lrnMode,
-    const void *alpha, const cudnnTensorDescriptor_t yDesc, const void *y,
-    const cudnnTensorDescriptor_t dyDesc, const void *dy,
-    const cudnnTensorDescriptor_t xDesc, const void *x, const void *beta,
-    const cudnnTensorDescriptor_t dxDesc, void *dx) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, cudnnLRNDescriptor_t, cudnnLRNMode_t, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *, const void *,
-      const cudnnTensorDescriptor_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnLRNCrossChannelBackward");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, normDesc, lrnMode, alpha, yDesc, y, dyDesc, dy, xDesc,
-                  x, beta, dxDesc, dx);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnDivisiveNormalizationForward(
-    cudnnHandle_t handle, cudnnLRNDescriptor_t normDesc,
-    cudnnDivNormMode_t mode, const void *alpha,
-    const cudnnTensorDescriptor_t xDesc, /* same desc for means, temp, temp2 */
-    const void *x,
-    const void *means, /* if NULL, means are assumed to be zero */
-    void *temp, void *temp2, const void *beta,
-    const cudnnTensorDescriptor_t yDesc, void *y) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, cudnnLRNDescriptor_t, cudnnDivNormMode_t, const void *,
-      const cudnnTensorDescriptor_t, const void *, const void *, void *, void *,
-      const void *, const cudnnTensorDescriptor_t, void *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnDivisiveNormalizationForward");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, normDesc, mode, alpha, xDesc, x, means, temp, temp2,
-                  beta, yDesc, y);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnDivisiveNormalizationBackward(
-    cudnnHandle_t handle, cudnnLRNDescriptor_t normDesc,
-    cudnnDivNormMode_t mode, const void *alpha,
-    const cudnnTensorDescriptor_t
-        xDesc, /* same desc for x, means, dy, temp, temp2 */
-    const void *x,
-    const void *means, /* if NULL, means are assumed to be zero */
-    const void *dy, void *temp, void *temp2, const void *beta,
-    const cudnnTensorDescriptor_t dXdMeansDesc, /* same desc for dx, dMeans */
-    void *dx,                                   /* output x differential */
-    void *dMeans) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, cudnnLRNDescriptor_t, cudnnDivNormMode_t, const void *,
-      const cudnnTensorDescriptor_t, const void *, const void *, const void *,
-      void *, void *, const void *, const cudnnTensorDescriptor_t, void *,
-      void *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnDivisiveNormalizationBackward");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, normDesc, mode, alpha, xDesc, x, means, dy, temp,
-                  temp2, beta, dXdMeansDesc, dx, dMeans);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnDeriveBNTensorDescriptor(
-    cudnnTensorDescriptor_t derivedBnDesc, const cudnnTensorDescriptor_t xDesc,
-    cudnnBatchNormMode_t mode) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnTensorDescriptor_t,
-                                               const cudnnTensorDescriptor_t,
-                                               cudnnBatchNormMode_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDeriveBNTensorDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(derivedBnDesc, xDesc, mode);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnBatchNormalizationForwardTraining(
-    cudnnHandle_t handle, cudnnBatchNormMode_t mode,
-
-    const void *alpha, /* alpha[0] = result blend factor */
-    const void *beta,  /* beta[0] = dest layer blend factor */
-
-    const cudnnTensorDescriptor_t xDesc, const void *x, /* NxCxHxW */
-    const cudnnTensorDescriptor_t yDesc, void *y,       /* NxCxHxW */
-
-    /* Shared desc for the next 6 tensors in the argument list.
-       Data type to be set as follows:
-       type = (typeOf(x) == double) ? double : float
-       Dimensions for this descriptor depend on normalization mode
-       - Spatial Normalization : tensors are expected to have dims 1xCx1x1
-        (normalization is performed across NxHxW)
-       - Per-Activation Normalization : tensors are expected to have dims of
-       1xCxHxW (normalization is performed across N) */
-    const cudnnTensorDescriptor_t bnScaleBiasMeanVarDesc,
-
-    /* 'Gamma' and 'Beta' respectively in Ioffe and Szegedy's paper's notation
-     */
-    const void *bnScale, const void *bnBias,
-
-    /* MUST use factor=1 in the very first call of a complete training cycle.
-       Use a factor=1/(1+n) at N-th call to the function to get
-       Cumulative Moving Average (CMA) behavior
-       CMA[n] = (x[1]+...+x[n])/n
-       Since CMA[n+1] = (n*CMA[n]+x[n+1])/(n+1) =
-       ((n+1)*CMA[n]-CMA[n])/(n+1) + x[n+1]/(n+1) =
-       CMA[n]*(1-1/(n+1)) + x[n+1]*1/(n+1) */
-    double exponentialAverageFactor,
-
-    /* Used in Training phase only.
-       runningMean = newMean*factor + runningMean*(1-factor) */
-    void *resultRunningMean,
-    /* Output in training mode, input in inference. Is the moving average
-       of  variance[x] (factor is applied in the same way as for runningMean) */
-    void *resultRunningVariance,
-
-    /* Has to be >= CUDNN_BN_MIN_EPSILON. Should be the same in forward and
-       backward functions. */
-    double epsilon,
-
-    /* Optionally save intermediate results from the forward pass here
-       - can be reused to speed up backward pass. NULL if unused */
-    void *resultSaveMean, void *resultSaveInvVariance) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, cudnnBatchNormMode_t, const void *, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, void *, const cudnnTensorDescriptor_t,
-      const void *, const void *, double, void *, void *, double, void *,
-      void *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnBatchNormalizationForwardTraining");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(
-      handle, mode, alpha, beta, xDesc, x, yDesc, y, bnScaleBiasMeanVarDesc,
-      bnScale, bnBias, exponentialAverageFactor, resultRunningMean,
-      resultRunningVariance, epsilon, resultSaveMean, resultSaveInvVariance);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnBatchNormalizationForwardInference(
-    cudnnHandle_t handle, cudnnBatchNormMode_t mode,
-    const void *alpha, /* alpha[0] = result blend factor */
-    const void *beta,  /* beta[0] = dest layer blend factor */
-    const cudnnTensorDescriptor_t xDesc, const void *x, /* NxCxHxW */
-    const cudnnTensorDescriptor_t yDesc, void *y,       /* NxCxHxW */
-    const cudnnTensorDescriptor_t bnScaleBiasMeanVarDesc, const void *bnScale,
-    const void *bnBias, const void *estimatedMean,
-    const void *estimatedVariance, double epsilon) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, cudnnBatchNormMode_t, const void *, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, void *, const cudnnTensorDescriptor_t,
-      const void *, const void *, const void *, const void *, double);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnBatchNormalizationForwardInference");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, mode, alpha, beta, xDesc, x, yDesc, y,
-                  bnScaleBiasMeanVarDesc, bnScale, bnBias, estimatedMean,
-                  estimatedVariance, epsilon);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnBatchNormalizationBackward(
-    cudnnHandle_t handle, cudnnBatchNormMode_t mode, const void *alphaDataDiff,
-    const void *betaDataDiff, const void *alphaParamDiff,
-    const void *betaParamDiff,
-    const cudnnTensorDescriptor_t xDesc, /* same desc for x, dx, dy */
-    const void *x, const cudnnTensorDescriptor_t dyDesc, const void *dy,
-    const cudnnTensorDescriptor_t dxDesc, void *dx,
-    /* Shared tensor desc for the 4 tensors below */
-    const cudnnTensorDescriptor_t dBnScaleBiasDesc,
-    const void *bnScale, /* bnBias doesn't affect backpropagation */
-    /* scale and bias diff are not backpropagated below this layer */
-    void *dBnScaleResult, void *dBnBiasResult,
-    /* Same epsilon as forward pass */
-    double epsilon,
-
-    /* Optionally cached intermediate results from
-       forward pass */
-    const void *savedMean, const void *savedInvVariance) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, cudnnBatchNormMode_t, const void *, const void *,
-      const void *, const void *, const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, void *, const cudnnTensorDescriptor_t,
-      const void *, void *, void *, double, const void *, const void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnBatchNormalizationBackward");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, mode, alphaDataDiff, betaDataDiff, alphaParamDiff,
-                  betaParamDiff, xDesc, x, dyDesc, dy, dxDesc, dx,
-                  dBnScaleBiasDesc, bnScale, dBnScaleResult, dBnBiasResult,
-                  epsilon, savedMean, savedInvVariance);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnCreateSpatialTransformerDescriptor(
-    cudnnSpatialTransformerDescriptor_t *stDesc) {
-  using FuncPtr =
-      cudnnStatus_t(CUDNNWINAPI *)(cudnnSpatialTransformerDescriptor_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnCreateSpatialTransformerDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(stDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetSpatialTransformerNdDescriptor(
-    cudnnSpatialTransformerDescriptor_t stDesc, cudnnSamplerType_t samplerType,
-    cudnnDataType_t dataType, const int nbDims, const int dimA[]) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnSpatialTransformerDescriptor_t, cudnnSamplerType_t, cudnnDataType_t,
-      const int, const int[]);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnSetSpatialTransformerNdDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(stDesc, samplerType, dataType, nbDims, dimA);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnDestroySpatialTransformerDescriptor(
-    cudnnSpatialTransformerDescriptor_t stDesc) {
-  using FuncPtr =
-      cudnnStatus_t(CUDNNWINAPI *)(cudnnSpatialTransformerDescriptor_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnDestroySpatialTransformerDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(stDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSpatialTfGridGeneratorForward(
-    cudnnHandle_t handle, const cudnnSpatialTransformerDescriptor_t stDesc,
-    const void *theta, void *grid) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnSpatialTransformerDescriptor_t, const void *,
-      void *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnSpatialTfGridGeneratorForward");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, stDesc, theta, grid);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSpatialTfGridGeneratorBackward(
-    cudnnHandle_t handle, const cudnnSpatialTransformerDescriptor_t stDesc,
-    const void *dgrid, void *dtheta) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnSpatialTransformerDescriptor_t, const void *,
-      void *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnSpatialTfGridGeneratorBackward");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, stDesc, dgrid, dtheta);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSpatialTfSamplerForward(
-    cudnnHandle_t handle, cudnnSpatialTransformerDescriptor_t stDesc,
-    const void *alpha, const cudnnTensorDescriptor_t xDesc, const void *x,
-    const void *grid, const void *beta, cudnnTensorDescriptor_t yDesc,
-    void *y) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, cudnnSpatialTransformerDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *, const void *, const void *,
-      cudnnTensorDescriptor_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSpatialTfSamplerForward");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, stDesc, alpha, xDesc, x, grid, beta, yDesc, y);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSpatialTfSamplerBackward(
-    cudnnHandle_t handle, cudnnSpatialTransformerDescriptor_t stDesc,
-    const void *alpha, const cudnnTensorDescriptor_t xDesc, const void *x,
-    const void *beta, const cudnnTensorDescriptor_t dxDesc, void *dx,
-    const void *alphaDgrid, const cudnnTensorDescriptor_t dyDesc,
-    const void *dy, const void *grid, const void *betaDgrid, void *dgrid) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, cudnnSpatialTransformerDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *, const void *,
-      const cudnnTensorDescriptor_t, void *, const void *,
-      const cudnnTensorDescriptor_t, const void *, const void *, const void *,
-      void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSpatialTfSamplerBackward");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, stDesc, alpha, xDesc, x, beta, dxDesc, dx, alphaDgrid,
-                  dyDesc, dy, grid, betaDgrid, dgrid);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnCreateDropoutDescriptor(cudnnDropoutDescriptor_t *dropoutDesc) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnDropoutDescriptor_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnCreateDropoutDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dropoutDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnDestroyDropoutDescriptor(cudnnDropoutDescriptor_t dropoutDesc) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnDropoutDescriptor_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDestroyDropoutDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dropoutDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnDropoutGetStatesSize(cudnnHandle_t handle,
-                                                    size_t *sizeInBytes) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnHandle_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDropoutGetStatesSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, sizeInBytes);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnDropoutGetReserveSpaceSize(
-    cudnnTensorDescriptor_t xdesc, size_t *sizeInBytes) {
-  using FuncPtr =
-      cudnnStatus_t(CUDNNWINAPI *)(cudnnTensorDescriptor_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDropoutGetReserveSpaceSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(xdesc, sizeInBytes);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetDropoutDescriptor(
-    cudnnDropoutDescriptor_t dropoutDesc, cudnnHandle_t handle, float dropout,
-    void *states, size_t stateSizeInBytes, unsigned long long seed) {
-  using FuncPtr =
-      cudnnStatus_t(CUDNNWINAPI *)(cudnnDropoutDescriptor_t, cudnnHandle_t,
-                                   float, void *, size_t, unsigned long long);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetDropoutDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dropoutDesc, handle, dropout, states, stateSizeInBytes, seed);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnRestoreDropoutDescriptor(
-    cudnnDropoutDescriptor_t dropoutDesc, cudnnHandle_t handle, float dropout,
-    void *states, size_t stateSizeInBytes, unsigned long long seed) {
-  using FuncPtr =
-      cudnnStatus_t(CUDNNWINAPI *)(cudnnDropoutDescriptor_t, cudnnHandle_t,
-                                   float, void *, size_t, unsigned long long);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnRestoreDropoutDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dropoutDesc, handle, dropout, states, stateSizeInBytes, seed);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetDropoutDescriptor(
-    cudnnDropoutDescriptor_t dropoutDesc, cudnnHandle_t handle, float *dropout,
-    void **states, unsigned long long *seed) {
-  using FuncPtr =
-      cudnnStatus_t(CUDNNWINAPI *)(cudnnDropoutDescriptor_t, cudnnHandle_t,
-                                   float *, void **, unsigned long long *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetDropoutDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dropoutDesc, handle, dropout, states, seed);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnDropoutForward(
-    cudnnHandle_t handle, const cudnnDropoutDescriptor_t dropoutDesc,
-    const cudnnTensorDescriptor_t xdesc, const void *x,
-    const cudnnTensorDescriptor_t ydesc, void *y, void *reserveSpace,
-    size_t reserveSpaceSizeInBytes) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnDropoutDescriptor_t,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, void *, void *, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDropoutForward");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dropoutDesc, xdesc, x, ydesc, y, reserveSpace,
-                  reserveSpaceSizeInBytes);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnDropoutBackward(
-    cudnnHandle_t handle, const cudnnDropoutDescriptor_t dropoutDesc,
-    const cudnnTensorDescriptor_t dydesc, const void *dy,
-    const cudnnTensorDescriptor_t dxdesc, void *dx, void *reserveSpace,
-    size_t reserveSpaceSizeInBytes) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnDropoutDescriptor_t,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, void *, void *, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDropoutBackward");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dropoutDesc, dydesc, dy, dxdesc, dx, reserveSpace,
-                  reserveSpaceSizeInBytes);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnCreateRNNDescriptor(cudnnRNNDescriptor_t *rnnDesc) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnRNNDescriptor_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnCreateRNNDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(rnnDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnDestroyRNNDescriptor(cudnnRNNDescriptor_t rnnDesc) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnRNNDescriptor_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDestroyRNNDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(rnnDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnCreatePersistentRNNPlan(
-    cudnnRNNDescriptor_t rnnDesc, const int minibatch,
-    const cudnnDataType_t dataType, cudnnPersistentRNNPlan_t *plan) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnRNNDescriptor_t, const int,
-                                               const cudnnDataType_t,
-                                               cudnnPersistentRNNPlan_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnCreatePersistentRNNPlan");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(rnnDesc, minibatch, dataType, plan);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetPersistentRNNPlan(
-    cudnnRNNDescriptor_t rnnDesc, cudnnPersistentRNNPlan_t plan) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnRNNDescriptor_t,
-                                               cudnnPersistentRNNPlan_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetPersistentRNNPlan");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(rnnDesc, plan);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnDestroyPersistentRNNPlan(cudnnPersistentRNNPlan_t plan) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnPersistentRNNPlan_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDestroyPersistentRNNPlan");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(plan);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetRNNDescriptor(
-    cudnnHandle_t handle, cudnnRNNDescriptor_t rnnDesc, const int hiddenSize,
-    const int numLayers,
-    cudnnDropoutDescriptor_t
-        dropoutDesc, /* Between layers, not between recurrent steps. */
-    cudnnRNNInputMode_t inputMode, cudnnDirectionMode_t direction,
-    cudnnRNNMode_t mode, cudnnRNNAlgo_t algo, cudnnDataType_t dataType) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, cudnnRNNDescriptor_t, const int, const int,
-      cudnnDropoutDescriptor_t, cudnnRNNInputMode_t, cudnnDirectionMode_t,
-      cudnnRNNMode_t, cudnnRNNAlgo_t, cudnnDataType_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetRNNDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, rnnDesc, hiddenSize, numLayers, dropoutDesc,
-                  inputMode, direction, mode, algo, dataType);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetRNNDescriptor(
-    cudnnHandle_t cudnnHandle, cudnnRNNDescriptor_t rnnDesc, int *hiddenSize,
-    int *numLayers, cudnnDropoutDescriptor_t *dropoutDesc,
-    cudnnRNNInputMode_t *inputMode, cudnnDirectionMode_t *direction,
-    cudnnRNNMode_t *mode, cudnnRNNAlgo_t *algo, cudnnDataType_t *dataType) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, cudnnRNNDescriptor_t, int *, int *,
-      cudnnDropoutDescriptor_t *, cudnnRNNInputMode_t *, cudnnDirectionMode_t *,
-      cudnnRNNMode_t *, cudnnRNNAlgo_t *, cudnnDataType_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetRNNDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(cudnnHandle, rnnDesc, hiddenSize, numLayers, dropoutDesc,
-                  inputMode, direction, mode, algo, dataType);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetRNNMatrixMathType(cudnnRNNDescriptor_t desc,
-                                                    cudnnMathType_t math) {
-  using FuncPtr =
-      cudnnStatus_t(CUDNNWINAPI *)(cudnnRNNDescriptor_t, cudnnMathType_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetRNNMatrixMathType");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(desc, math);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetRNNWorkspaceSize(
-    cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc,
-    const int seqLength, const cudnnTensorDescriptor_t *xDesc,
-    size_t *sizeInBytes) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnRNNDescriptor_t, const int,
-      const cudnnTensorDescriptor_t *, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetRNNWorkspaceSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, rnnDesc, seqLength, xDesc, sizeInBytes);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetRNNTrainingReserveSize(
-    cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc,
-    const int seqLength, const cudnnTensorDescriptor_t *xDesc,
-    size_t *sizeInBytes) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnRNNDescriptor_t, const int,
-      const cudnnTensorDescriptor_t *, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetRNNTrainingReserveSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, rnnDesc, seqLength, xDesc, sizeInBytes);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnGetRNNParamsSize(cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc,
-                      const cudnnTensorDescriptor_t xDesc, size_t *sizeInBytes,
-                      cudnnDataType_t dataType) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnRNNDescriptor_t, const cudnnTensorDescriptor_t,
-      size_t *, cudnnDataType_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetRNNParamsSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, rnnDesc, xDesc, sizeInBytes, dataType);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetRNNLinLayerMatrixParams(
-    cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc, const int layer,
-    const cudnnTensorDescriptor_t xDesc, const cudnnFilterDescriptor_t wDesc,
-    const void *w, const int linLayerID,
-    cudnnFilterDescriptor_t linLayerMatDesc, void **linLayerMat) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnRNNDescriptor_t, const int,
-      const cudnnTensorDescriptor_t, const cudnnFilterDescriptor_t,
-      const void *, const int, cudnnFilterDescriptor_t, void **);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetRNNLinLayerMatrixParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, rnnDesc, layer, xDesc, wDesc, w, linLayerID,
-                  linLayerMatDesc, linLayerMat);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetRNNLinLayerBiasParams(
-    cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc, const int layer,
-    const cudnnTensorDescriptor_t xDesc, const cudnnFilterDescriptor_t wDesc,
-    const void *w, const int linLayerID,
-    cudnnFilterDescriptor_t linLayerBiasDesc, void **linLayerBias) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnRNNDescriptor_t, const int,
-      const cudnnTensorDescriptor_t, const cudnnFilterDescriptor_t,
-      const void *, const int, cudnnFilterDescriptor_t, void **);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetRNNLinLayerBiasParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, rnnDesc, layer, xDesc, wDesc, w, linLayerID,
-                  linLayerBiasDesc, linLayerBias);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnRNNForwardInference(
-    cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc,
-    const int seqLength, const cudnnTensorDescriptor_t *xDesc, const void *x,
-    const cudnnTensorDescriptor_t hxDesc, const void *hx,
-    const cudnnTensorDescriptor_t cxDesc, const void *cx,
-    const cudnnFilterDescriptor_t wDesc, const void *w,
-    const cudnnTensorDescriptor_t *yDesc, void *y,
-    const cudnnTensorDescriptor_t hyDesc, void *hy,
-    const cudnnTensorDescriptor_t cyDesc, void *cy, void *workspace,
-    size_t workSpaceSizeInBytes) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnRNNDescriptor_t, const int,
-      const cudnnTensorDescriptor_t *, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnFilterDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t *, void *, const cudnnTensorDescriptor_t,
-      void *, const cudnnTensorDescriptor_t, void *, void *, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnRNNForwardInference");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, rnnDesc, seqLength, xDesc, x, hxDesc, hx, cxDesc, cx,
-                  wDesc, w, yDesc, y, hyDesc, hy, cyDesc, cy, workspace,
-                  workSpaceSizeInBytes);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnRNNForwardTraining(
-    cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc,
-    const int seqLength, const cudnnTensorDescriptor_t *xDesc, const void *x,
-    const cudnnTensorDescriptor_t hxDesc, const void *hx,
-    const cudnnTensorDescriptor_t cxDesc, const void *cx,
-    const cudnnFilterDescriptor_t wDesc, const void *w,
-    const cudnnTensorDescriptor_t *yDesc, void *y,
-    const cudnnTensorDescriptor_t hyDesc, void *hy,
-    const cudnnTensorDescriptor_t cyDesc, void *cy, void *workspace,
-    size_t workSpaceSizeInBytes, void *reserveSpace,
-    size_t reserveSpaceSizeInBytes) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnRNNDescriptor_t, const int,
-      const cudnnTensorDescriptor_t *, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnFilterDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t *, void *, const cudnnTensorDescriptor_t,
-      void *, const cudnnTensorDescriptor_t, void *, void *, size_t, void *,
-      size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnRNNForwardTraining");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, rnnDesc, seqLength, xDesc, x, hxDesc, hx, cxDesc, cx,
-                  wDesc, w, yDesc, y, hyDesc, hy, cyDesc, cy, workspace,
-                  workSpaceSizeInBytes, reserveSpace, reserveSpaceSizeInBytes);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnRNNBackwardData(cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc,
-                     const int seqLength, const cudnnTensorDescriptor_t *yDesc,
-                     const void *y, const cudnnTensorDescriptor_t *dyDesc,
-                     const void *dy, const cudnnTensorDescriptor_t dhyDesc,
-                     const void *dhy, const cudnnTensorDescriptor_t dcyDesc,
-                     const void *dcy, const cudnnFilterDescriptor_t wDesc,
-                     const void *w, const cudnnTensorDescriptor_t hxDesc,
-                     const void *hx, const cudnnTensorDescriptor_t cxDesc,
-                     const void *cx, const cudnnTensorDescriptor_t *dxDesc,
-                     void *dx, const cudnnTensorDescriptor_t dhxDesc, void *dhx,
-                     const cudnnTensorDescriptor_t dcxDesc, void *dcx,
-                     void *workspace, size_t workSpaceSizeInBytes,
-                     void *reserveSpace, size_t reserveSpaceSizeInBytes) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnRNNDescriptor_t, const int,
-      const cudnnTensorDescriptor_t *, const void *,
-      const cudnnTensorDescriptor_t *, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnFilterDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t *, void *, const cudnnTensorDescriptor_t,
-      void *, const cudnnTensorDescriptor_t, void *, void *, size_t, void *,
-      size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnRNNBackwardData");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, rnnDesc, seqLength, yDesc, y, dyDesc, dy, dhyDesc,
-                  dhy, dcyDesc, dcy, wDesc, w, hxDesc, hx, cxDesc, cx, dxDesc,
-                  dx, dhxDesc, dhx, dcxDesc, dcx, workspace,
-                  workSpaceSizeInBytes, reserveSpace, reserveSpaceSizeInBytes);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnRNNBackwardWeights(
-    cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc,
-    const int seqLength, const cudnnTensorDescriptor_t *xDesc, const void *x,
-    const cudnnTensorDescriptor_t hxDesc, const void *hx,
-    const cudnnTensorDescriptor_t *yDesc, const void *y, const void *workspace,
-    size_t workSpaceSizeInBytes, const cudnnFilterDescriptor_t dwDesc, void *dw,
-    const void *reserveSpace, size_t reserveSpaceSizeInBytes) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnRNNDescriptor_t, const int,
-      const cudnnTensorDescriptor_t *, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t *, const void *, const void *, size_t,
-      const cudnnFilterDescriptor_t, void *, const void *, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnRNNBackwardWeights");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, rnnDesc, seqLength, xDesc, x, hxDesc, hx, yDesc, y,
-                  workspace, workSpaceSizeInBytes, dwDesc, dw, reserveSpace,
-                  reserveSpaceSizeInBytes);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnCreateCTCLossDescriptor(cudnnCTCLossDescriptor_t *ctcLossDesc) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnCTCLossDescriptor_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnCreateCTCLossDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(ctcLossDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetCTCLossDescriptor(
-    cudnnCTCLossDescriptor_t ctcLossDesc, cudnnDataType_t compType) {
-  using FuncPtr =
-      cudnnStatus_t(CUDNNWINAPI *)(cudnnCTCLossDescriptor_t, cudnnDataType_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetCTCLossDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(ctcLossDesc, compType);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetCTCLossDescriptor(
-    cudnnCTCLossDescriptor_t ctcLossDesc, cudnnDataType_t *compType) {
-  using FuncPtr =
-      cudnnStatus_t(CUDNNWINAPI *)(cudnnCTCLossDescriptor_t, cudnnDataType_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetCTCLossDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(ctcLossDesc, compType);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnDestroyCTCLossDescriptor(cudnnCTCLossDescriptor_t ctcLossDesc) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnCTCLossDescriptor_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDestroyCTCLossDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(ctcLossDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnCTCLoss(
-    cudnnHandle_t handle,
-    const cudnnTensorDescriptor_t
-        probsDesc, /* Tensor descriptor for probabilities, the dimensions are
-                      T,N,A (T is the timing steps, N is the mini batch size, A
-                      is the alphabet size)  */
-    const void *probs,       /* probabilities after softmax, in GPU memory */
-    const int *labels,       /* labels, in CPU memory */
-    const int *labelLengths, /* the length of each label, in CPU memory */
-    const int *inputLengths, /* the lengths of timing steps in each batch, in
-                                CPU memory */
-    void *costs,             /* the returned costs of CTC, in GPU memory */
-    const cudnnTensorDescriptor_t
-        gradientsDesc, /* Tensor descriptor for gradients, the dimensions are
-                          T,N,A */
-    const void *gradients,   /* the returned CTC gradients, in GPU memory, to
-                                compute costs only, set it to NULL */
-    cudnnCTCLossAlgo_t algo, /* algorithm selected, supported now 0 and 1 */
-    cudnnCTCLossDescriptor_t ctcLossDesc,
-    void *workspace, /* pointer to the workspace, in GPU memory */
-    size_t workSpaceSizeInBytes) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnTensorDescriptor_t, const void *, const int *,
-      const int *, const int *, void *, const cudnnTensorDescriptor_t,
-      const void *, cudnnCTCLossAlgo_t, cudnnCTCLossDescriptor_t, void *,
-      size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnCTCLoss");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, probsDesc, probs, labels, labelLengths, inputLengths,
-                  costs, gradientsDesc, gradients, algo, ctcLossDesc, workspace,
-                  workSpaceSizeInBytes);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetCTCLossWorkspaceSize(
-    cudnnHandle_t handle,
-    const cudnnTensorDescriptor_t
-        probsDesc, /* Tensor descriptor for probabilities, the dimensions are
-                      T,N,A (T is the timing steps, N is the mini batch size, A
-                      is the alphabet size) */
-    const cudnnTensorDescriptor_t
-        gradientsDesc, /* Tensor descriptor for gradients, the dimensions are
-                          T,N,A. To compute costs only, set it to NULL */
-    const int *labels, /* labels, in CPU memory */
-    const int *labelLengths, /* the length of each label, in CPU memory */
-    const int *inputLengths, /* the lengths of timing steps in each batch, in
-                                CPU memory */
-    cudnnCTCLossAlgo_t algo, /* algorithm selected, supported now 0 and 1 */
-    cudnnCTCLossDescriptor_t ctcLossDesc, size_t *sizeInBytes) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnTensorDescriptor_t,
-      const cudnnTensorDescriptor_t, const int *, const int *, const int *,
-      cudnnCTCLossAlgo_t, cudnnCTCLossDescriptor_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetCTCLossWorkspaceSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, probsDesc, gradientsDesc, labels, labelLengths,
-                  inputLengths, algo, ctcLossDesc, sizeInBytes);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetRNNDescriptor_v6(
-    cudnnHandle_t handle, cudnnRNNDescriptor_t rnnDesc, const int hiddenSize,
-    const int numLayers,
-    cudnnDropoutDescriptor_t
-        dropoutDesc, /* Between layers, not between recurrent steps. */
-    cudnnRNNInputMode_t inputMode, cudnnDirectionMode_t direction,
-    cudnnRNNMode_t mode, cudnnRNNAlgo_t algo, cudnnDataType_t dataType) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, cudnnRNNDescriptor_t, const int, const int,
-      cudnnDropoutDescriptor_t, cudnnRNNInputMode_t, cudnnDirectionMode_t,
-      cudnnRNNMode_t, cudnnRNNAlgo_t, cudnnDataType_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetRNNDescriptor_v6");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, rnnDesc, hiddenSize, numLayers, dropoutDesc,
-                  inputMode, direction, mode, algo, dataType);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetRNNDescriptor_v5(
-    cudnnRNNDescriptor_t rnnDesc, int hiddenSize, int numLayers,
-    cudnnDropoutDescriptor_t
-        dropoutDesc, /* Between layers, not between recurrent steps. */
-    cudnnRNNInputMode_t inputMode, cudnnDirectionMode_t direction,
-    cudnnRNNMode_t mode, cudnnDataType_t dataType) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnRNNDescriptor_t, int, int, cudnnDropoutDescriptor_t,
-      cudnnRNNInputMode_t, cudnnDirectionMode_t, cudnnRNNMode_t,
-      cudnnDataType_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetRNNDescriptor_v5");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(rnnDesc, hiddenSize, numLayers, dropoutDesc, inputMode,
-                  direction, mode, dataType);
-}
-
-}  // extern "C"
diff --git a/third_party/xla/third_party/tsl/tsl/cuda/cudnn_7_1.inc b/third_party/xla/third_party/tsl/tsl/cuda/cudnn_7_1.inc
deleted file mode 100644
index 5330e6d0584c2f..00000000000000
--- a/third_party/xla/third_party/tsl/tsl/cuda/cudnn_7_1.inc
+++ /dev/null
@@ -1,2361 +0,0 @@
-// Auto-generated, do not edit.
-
-extern "C" {
-
-size_t CUDNNWINAPI cudnnGetVersion(void) {
-  using FuncPtr = size_t(CUDNNWINAPI *)();
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetVersion");
-  if (!func_ptr) return 0;
-  return func_ptr();
-}
-
-size_t CUDNNWINAPI cudnnGetCudartVersion(void) {
-  using FuncPtr = size_t(CUDNNWINAPI *)();
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetCudartVersion");
-  if (!func_ptr) return 0;
-  return func_ptr();
-}
-
-const char *CUDNNWINAPI cudnnGetErrorString(cudnnStatus_t status) {
-  using FuncPtr = const char *(CUDNNWINAPI *)(cudnnStatus_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetErrorString");
-  if (!func_ptr) return "cudnnGetErrorString symbol not found.";
-  return func_ptr(status);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnQueryRuntimeError(cudnnHandle_t handle,
-                                                 cudnnStatus_t *rstatus,
-                                                 cudnnErrQueryMode_t mode,
-                                                 cudnnRuntimeTag_t *tag) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, cudnnStatus_t *, cudnnErrQueryMode_t, cudnnRuntimeTag_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnQueryRuntimeError");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, rstatus, mode, tag);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetProperty(libraryPropertyType type,
-                                           int *value) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(libraryPropertyType, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetProperty");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(type, value);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnCreate(cudnnHandle_t *handle) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnHandle_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnCreate");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnDestroy(cudnnHandle_t handle) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnHandle_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDestroy");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetStream(cudnnHandle_t handle,
-                                         cudaStream_t streamId) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnHandle_t, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetStream");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, streamId);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetStream(cudnnHandle_t handle,
-                                         cudaStream_t *streamId) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnHandle_t, cudaStream_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetStream");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, streamId);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnCreateTensorDescriptor(cudnnTensorDescriptor_t *tensorDesc) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnTensorDescriptor_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnCreateTensorDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(tensorDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetTensor4dDescriptor(
-    cudnnTensorDescriptor_t tensorDesc, cudnnTensorFormat_t format,
-    cudnnDataType_t dataType, /* image data type */
-    int n,                    /* number of inputs (batch size) */
-    int c,                    /* number of input feature maps */
-    int h,                    /* height of input section */
-    int w) {
-  using FuncPtr =
-      cudnnStatus_t(CUDNNWINAPI *)(cudnnTensorDescriptor_t, cudnnTensorFormat_t,
-                                   cudnnDataType_t, int, int, int, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetTensor4dDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(tensorDesc, format, dataType, n, c, h, w);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetTensor4dDescriptorEx(
-    cudnnTensorDescriptor_t tensorDesc,
-    cudnnDataType_t dataType, /* image data type */
-    int n,                    /* number of inputs (batch size) */
-    int c,                    /* number of input feature maps */
-    int h,                    /* height of input section */
-    int w,                    /* width of input section */
-    int nStride, int cStride, int hStride, int wStride) {
-  using FuncPtr =
-      cudnnStatus_t(CUDNNWINAPI *)(cudnnTensorDescriptor_t, cudnnDataType_t,
-                                   int, int, int, int, int, int, int, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetTensor4dDescriptorEx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(tensorDesc, dataType, n, c, h, w, nStride, cStride, hStride,
-                  wStride);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetTensor4dDescriptor(
-    const cudnnTensorDescriptor_t tensorDesc,
-    cudnnDataType_t *dataType, /* image data type */
-    int *n,                    /* number of inputs (batch size) */
-    int *c,                    /* number of input feature maps  */
-    int *h,                    /* height of input section */
-    int *w,                    /* width of input section */
-    int *nStride, int *cStride, int *hStride, int *wStride) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      const cudnnTensorDescriptor_t, cudnnDataType_t *, int *, int *, int *,
-      int *, int *, int *, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetTensor4dDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(tensorDesc, dataType, n, c, h, w, nStride, cStride, hStride,
-                  wStride);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetTensorNdDescriptor(
-    cudnnTensorDescriptor_t tensorDesc, cudnnDataType_t dataType, int nbDims,
-    const int dimA[], const int strideA[]) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnTensorDescriptor_t, cudnnDataType_t, int, const int[], const int[]);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetTensorNdDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(tensorDesc, dataType, nbDims, dimA, strideA);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetTensorNdDescriptorEx(
-    cudnnTensorDescriptor_t tensorDesc, cudnnTensorFormat_t format,
-    cudnnDataType_t dataType, int nbDims, const int dimA[]) {
-  using FuncPtr =
-      cudnnStatus_t(CUDNNWINAPI *)(cudnnTensorDescriptor_t, cudnnTensorFormat_t,
-                                   cudnnDataType_t, int, const int[]);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetTensorNdDescriptorEx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(tensorDesc, format, dataType, nbDims, dimA);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetTensorNdDescriptor(
-    const cudnnTensorDescriptor_t tensorDesc, int nbDimsRequested,
-    cudnnDataType_t *dataType, int *nbDims, int dimA[], int strideA[]) {
-  using FuncPtr =
-      cudnnStatus_t(CUDNNWINAPI *)(const cudnnTensorDescriptor_t, int,
-                                   cudnnDataType_t *, int *, int[], int[]);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetTensorNdDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(tensorDesc, nbDimsRequested, dataType, nbDims, dimA, strideA);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetTensorSizeInBytes(
-    const cudnnTensorDescriptor_t tensorDesc, size_t *size) {
-  using FuncPtr =
-      cudnnStatus_t(CUDNNWINAPI *)(const cudnnTensorDescriptor_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetTensorSizeInBytes");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(tensorDesc, size);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnDestroyTensorDescriptor(cudnnTensorDescriptor_t tensorDesc) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnTensorDescriptor_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDestroyTensorDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(tensorDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnTransformTensor(
-    cudnnHandle_t handle, const void *alpha,
-    const cudnnTensorDescriptor_t xDesc, const void *x, const void *beta,
-    const cudnnTensorDescriptor_t yDesc, void *y) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const void *, const cudnnTensorDescriptor_t, const void *,
-      const void *, const cudnnTensorDescriptor_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnTransformTensor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, alpha, xDesc, x, beta, yDesc, y);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnAddTensor(cudnnHandle_t handle,
-                                         const void *alpha,
-                                         const cudnnTensorDescriptor_t aDesc,
-                                         const void *A, const void *beta,
-                                         const cudnnTensorDescriptor_t cDesc,
-                                         void *C) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const void *, const cudnnTensorDescriptor_t, const void *,
-      const void *, const cudnnTensorDescriptor_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnAddTensor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, alpha, aDesc, A, beta, cDesc, C);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnCreateOpTensorDescriptor(cudnnOpTensorDescriptor_t *opTensorDesc) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnOpTensorDescriptor_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnCreateOpTensorDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(opTensorDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetOpTensorDescriptor(
-    cudnnOpTensorDescriptor_t opTensorDesc, cudnnOpTensorOp_t opTensorOp,
-    cudnnDataType_t opTensorCompType, cudnnNanPropagation_t opTensorNanOpt) {
-  using FuncPtr =
-      cudnnStatus_t(CUDNNWINAPI *)(cudnnOpTensorDescriptor_t, cudnnOpTensorOp_t,
-                                   cudnnDataType_t, cudnnNanPropagation_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetOpTensorDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(opTensorDesc, opTensorOp, opTensorCompType, opTensorNanOpt);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetOpTensorDescriptor(
-    const cudnnOpTensorDescriptor_t opTensorDesc, cudnnOpTensorOp_t *opTensorOp,
-    cudnnDataType_t *opTensorCompType, cudnnNanPropagation_t *opTensorNanOpt) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      const cudnnOpTensorDescriptor_t, cudnnOpTensorOp_t *, cudnnDataType_t *,
-      cudnnNanPropagation_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetOpTensorDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(opTensorDesc, opTensorOp, opTensorCompType, opTensorNanOpt);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnDestroyOpTensorDescriptor(cudnnOpTensorDescriptor_t opTensorDesc) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnOpTensorDescriptor_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDestroyOpTensorDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(opTensorDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnOpTensor(
-    cudnnHandle_t handle, const cudnnOpTensorDescriptor_t opTensorDesc,
-    const void *alpha1, const cudnnTensorDescriptor_t aDesc, const void *A,
-    const void *alpha2, const cudnnTensorDescriptor_t bDesc, const void *B,
-    const void *beta, const cudnnTensorDescriptor_t cDesc, void *C) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnOpTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *, const void *,
-      const cudnnTensorDescriptor_t, const void *, const void *,
-      const cudnnTensorDescriptor_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnOpTensor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, opTensorDesc, alpha1, aDesc, A, alpha2, bDesc, B,
-                  beta, cDesc, C);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnCreateReduceTensorDescriptor(
-    cudnnReduceTensorDescriptor_t *reduceTensorDesc) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnReduceTensorDescriptor_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnCreateReduceTensorDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(reduceTensorDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetReduceTensorDescriptor(
-    cudnnReduceTensorDescriptor_t reduceTensorDesc,
-    cudnnReduceTensorOp_t reduceTensorOp, cudnnDataType_t reduceTensorCompType,
-    cudnnNanPropagation_t reduceTensorNanOpt,
-    cudnnReduceTensorIndices_t reduceTensorIndices,
-    cudnnIndicesType_t reduceTensorIndicesType) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnReduceTensorDescriptor_t, cudnnReduceTensorOp_t, cudnnDataType_t,
-      cudnnNanPropagation_t, cudnnReduceTensorIndices_t, cudnnIndicesType_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetReduceTensorDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(reduceTensorDesc, reduceTensorOp, reduceTensorCompType,
-                  reduceTensorNanOpt, reduceTensorIndices,
-                  reduceTensorIndicesType);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetReduceTensorDescriptor(
-    const cudnnReduceTensorDescriptor_t reduceTensorDesc,
-    cudnnReduceTensorOp_t *reduceTensorOp,
-    cudnnDataType_t *reduceTensorCompType,
-    cudnnNanPropagation_t *reduceTensorNanOpt,
-    cudnnReduceTensorIndices_t *reduceTensorIndices,
-    cudnnIndicesType_t *reduceTensorIndicesType) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      const cudnnReduceTensorDescriptor_t, cudnnReduceTensorOp_t *,
-      cudnnDataType_t *, cudnnNanPropagation_t *, cudnnReduceTensorIndices_t *,
-      cudnnIndicesType_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetReduceTensorDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(reduceTensorDesc, reduceTensorOp, reduceTensorCompType,
-                  reduceTensorNanOpt, reduceTensorIndices,
-                  reduceTensorIndicesType);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnDestroyReduceTensorDescriptor(
-    cudnnReduceTensorDescriptor_t reduceTensorDesc) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnReduceTensorDescriptor_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnDestroyReduceTensorDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(reduceTensorDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetReductionIndicesSize(
-    cudnnHandle_t handle, const cudnnReduceTensorDescriptor_t reduceTensorDesc,
-    const cudnnTensorDescriptor_t aDesc, const cudnnTensorDescriptor_t cDesc,
-    size_t *sizeInBytes) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnReduceTensorDescriptor_t,
-      const cudnnTensorDescriptor_t, const cudnnTensorDescriptor_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetReductionIndicesSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, reduceTensorDesc, aDesc, cDesc, sizeInBytes);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetReductionWorkspaceSize(
-    cudnnHandle_t handle, const cudnnReduceTensorDescriptor_t reduceTensorDesc,
-    const cudnnTensorDescriptor_t aDesc, const cudnnTensorDescriptor_t cDesc,
-    size_t *sizeInBytes) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnReduceTensorDescriptor_t,
-      const cudnnTensorDescriptor_t, const cudnnTensorDescriptor_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetReductionWorkspaceSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, reduceTensorDesc, aDesc, cDesc, sizeInBytes);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnReduceTensor(
-    cudnnHandle_t handle, const cudnnReduceTensorDescriptor_t reduceTensorDesc,
-    void *indices, size_t indicesSizeInBytes, void *workspace,
-    size_t workspaceSizeInBytes, const void *alpha,
-    const cudnnTensorDescriptor_t aDesc, const void *A, const void *beta,
-    const cudnnTensorDescriptor_t cDesc, void *C) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnReduceTensorDescriptor_t, void *, size_t,
-      void *, size_t, const void *, const cudnnTensorDescriptor_t, const void *,
-      const void *, const cudnnTensorDescriptor_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnReduceTensor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, reduceTensorDesc, indices, indicesSizeInBytes,
-                  workspace, workspaceSizeInBytes, alpha, aDesc, A, beta, cDesc,
-                  C);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetTensor(cudnnHandle_t handle,
-                                         const cudnnTensorDescriptor_t yDesc,
-                                         void *y, const void *valuePtr) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnTensorDescriptor_t, void *, const void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetTensor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, yDesc, y, valuePtr);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnScaleTensor(cudnnHandle_t handle,
-                                           const cudnnTensorDescriptor_t yDesc,
-                                           void *y, const void *alpha) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnTensorDescriptor_t, void *, const void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnScaleTensor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, yDesc, y, alpha);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnCreateFilterDescriptor(cudnnFilterDescriptor_t *filterDesc) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnFilterDescriptor_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnCreateFilterDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(filterDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetFilter4dDescriptor(
-    cudnnFilterDescriptor_t filterDesc,
-    cudnnDataType_t dataType,          /* image data type */
-    cudnnTensorFormat_t format, int k, /* number of output feature maps */
-    int c,                             /* number of input feature maps */
-    int h,                             /* height of each input filter */
-    int w) {
-  using FuncPtr =
-      cudnnStatus_t(CUDNNWINAPI *)(cudnnFilterDescriptor_t, cudnnDataType_t,
-                                   cudnnTensorFormat_t, int, int, int, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetFilter4dDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(filterDesc, dataType, format, k, c, h, w);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetFilter4dDescriptor(
-    const cudnnFilterDescriptor_t filterDesc,
-    cudnnDataType_t *dataType,           /* image data type */
-    cudnnTensorFormat_t *format, int *k, /* number of output feature maps */
-    int *c,                              /* number of input feature maps */
-    int *h,                              /* height of each input filter */
-    int *w) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      const cudnnFilterDescriptor_t, cudnnDataType_t *, cudnnTensorFormat_t *,
-      int *, int *, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetFilter4dDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(filterDesc, dataType, format, k, c, h, w);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetFilterNdDescriptor(
-    cudnnFilterDescriptor_t filterDesc,
-    cudnnDataType_t dataType, /* image data type */
-    cudnnTensorFormat_t format, int nbDims, const int filterDimA[]) {
-  using FuncPtr =
-      cudnnStatus_t(CUDNNWINAPI *)(cudnnFilterDescriptor_t, cudnnDataType_t,
-                                   cudnnTensorFormat_t, int, const int[]);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetFilterNdDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(filterDesc, dataType, format, nbDims, filterDimA);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetFilterNdDescriptor(
-    const cudnnFilterDescriptor_t filterDesc, int nbDimsRequested,
-    cudnnDataType_t *dataType, /* image data type */
-    cudnnTensorFormat_t *format, int *nbDims, int filterDimA[]) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      const cudnnFilterDescriptor_t, int, cudnnDataType_t *,
-      cudnnTensorFormat_t *, int *, int[]);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetFilterNdDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(filterDesc, nbDimsRequested, dataType, format, nbDims,
-                  filterDimA);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnDestroyFilterDescriptor(cudnnFilterDescriptor_t filterDesc) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnFilterDescriptor_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDestroyFilterDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(filterDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnCreateConvolutionDescriptor(cudnnConvolutionDescriptor_t *convDesc) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnConvolutionDescriptor_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnCreateConvolutionDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(convDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetConvolutionMathType(
-    cudnnConvolutionDescriptor_t convDesc, cudnnMathType_t mathType) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnConvolutionDescriptor_t,
-                                               cudnnMathType_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetConvolutionMathType");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(convDesc, mathType);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionMathType(
-    cudnnConvolutionDescriptor_t convDesc, cudnnMathType_t *mathType) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnConvolutionDescriptor_t,
-                                               cudnnMathType_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetConvolutionMathType");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(convDesc, mathType);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetConvolutionGroupCount(
-    cudnnConvolutionDescriptor_t convDesc, int groupCount) {
-  using FuncPtr =
-      cudnnStatus_t(CUDNNWINAPI *)(cudnnConvolutionDescriptor_t, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetConvolutionGroupCount");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(convDesc, groupCount);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionGroupCount(
-    cudnnConvolutionDescriptor_t convDesc, int *groupCount) {
-  using FuncPtr =
-      cudnnStatus_t(CUDNNWINAPI *)(cudnnConvolutionDescriptor_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetConvolutionGroupCount");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(convDesc, groupCount);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetConvolution2dDescriptor(
-    cudnnConvolutionDescriptor_t convDesc, int pad_h, /* zero-padding height */
-    int pad_w,                                        /* zero-padding width */
-    int u,          /* vertical filter stride */
-    int v,          /* horizontal filter stride */
-    int dilation_h, /* filter dilation in the vertical dimension */
-    int dilation_w, /* filter dilation in the horizontal dimension */
-    cudnnConvolutionMode_t mode, cudnnDataType_t computeType) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnConvolutionDescriptor_t, int, int, int, int, int, int,
-      cudnnConvolutionMode_t, cudnnDataType_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetConvolution2dDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(convDesc, pad_h, pad_w, u, v, dilation_h, dilation_w, mode,
-                  computeType);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetConvolution2dDescriptor(
-    const cudnnConvolutionDescriptor_t convDesc,
-    int *pad_h,      /* zero-padding height */
-    int *pad_w,      /* zero-padding width */
-    int *u,          /* vertical filter stride */
-    int *v,          /* horizontal filter stride */
-    int *dilation_h, /* filter dilation in the vertical dimension */
-    int *dilation_w, /* filter dilation in the horizontal dimension */
-    cudnnConvolutionMode_t *mode, cudnnDataType_t *computeType) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      const cudnnConvolutionDescriptor_t, int *, int *, int *, int *, int *,
-      int *, cudnnConvolutionMode_t *, cudnnDataType_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetConvolution2dDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(convDesc, pad_h, pad_w, u, v, dilation_h, dilation_w, mode,
-                  computeType);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetConvolution2dForwardOutputDim(
-    const cudnnConvolutionDescriptor_t convDesc,
-    const cudnnTensorDescriptor_t inputTensorDesc,
-    const cudnnFilterDescriptor_t filterDesc, int *n, int *c, int *h, int *w) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t,
-      const cudnnFilterDescriptor_t, int *, int *, int *, int *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnGetConvolution2dForwardOutputDim");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(convDesc, inputTensorDesc, filterDesc, n, c, h, w);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetConvolutionNdDescriptor(
-    cudnnConvolutionDescriptor_t convDesc, int arrayLength, /* nbDims-2 size */
-    const int padA[], const int filterStrideA[], const int dilationA[],
-    cudnnConvolutionMode_t mode, cudnnDataType_t computeType) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnConvolutionDescriptor_t, int, const int[], const int[], const int[],
-      cudnnConvolutionMode_t, cudnnDataType_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetConvolutionNdDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(convDesc, arrayLength, padA, filterStrideA, dilationA, mode,
-                  computeType);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionNdDescriptor(
-    const cudnnConvolutionDescriptor_t convDesc, int arrayLengthRequested,
-    int *arrayLength, int padA[], int strideA[], int dilationA[],
-    cudnnConvolutionMode_t *mode, cudnnDataType_t *computeType) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      const cudnnConvolutionDescriptor_t, int, int *, int[], int[], int[],
-      cudnnConvolutionMode_t *, cudnnDataType_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetConvolutionNdDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(convDesc, arrayLengthRequested, arrayLength, padA, strideA,
-                  dilationA, mode, computeType);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionNdForwardOutputDim(
-    const cudnnConvolutionDescriptor_t convDesc,
-    const cudnnTensorDescriptor_t inputTensorDesc,
-    const cudnnFilterDescriptor_t filterDesc, int nbDims,
-    int tensorOutputDimA[]) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t,
-      const cudnnFilterDescriptor_t, int, int[]);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnGetConvolutionNdForwardOutputDim");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(convDesc, inputTensorDesc, filterDesc, nbDims,
-                  tensorOutputDimA);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnDestroyConvolutionDescriptor(cudnnConvolutionDescriptor_t convDesc) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnConvolutionDescriptor_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnDestroyConvolutionDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(convDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnGetConvolutionForwardAlgorithmMaxCount(cudnnHandle_t handle, int *count) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnHandle_t, int *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnGetConvolutionForwardAlgorithmMaxCount");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, count);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnFindConvolutionForwardAlgorithm(
-    cudnnHandle_t handle, const cudnnTensorDescriptor_t xDesc,
-    const cudnnFilterDescriptor_t wDesc,
-    const cudnnConvolutionDescriptor_t convDesc,
-    const cudnnTensorDescriptor_t yDesc, const int requestedAlgoCount,
-    int *returnedAlgoCount, cudnnConvolutionFwdAlgoPerf_t *perfResults) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnTensorDescriptor_t,
-      const cudnnFilterDescriptor_t, const cudnnConvolutionDescriptor_t,
-      const cudnnTensorDescriptor_t, const int, int *,
-      cudnnConvolutionFwdAlgoPerf_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnFindConvolutionForwardAlgorithm");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, xDesc, wDesc, convDesc, yDesc, requestedAlgoCount,
-                  returnedAlgoCount, perfResults);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnFindConvolutionForwardAlgorithmEx(
-    cudnnHandle_t handle, const cudnnTensorDescriptor_t xDesc, const void *x,
-    const cudnnFilterDescriptor_t wDesc, const void *w,
-    const cudnnConvolutionDescriptor_t convDesc,
-    const cudnnTensorDescriptor_t yDesc, void *y, const int requestedAlgoCount,
-    int *returnedAlgoCount, cudnnConvolutionFwdAlgoPerf_t *perfResults,
-    void *workSpace, size_t workSpaceSizeInBytes) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnTensorDescriptor_t, const void *,
-      const cudnnFilterDescriptor_t, const void *,
-      const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t, void *,
-      const int, int *, cudnnConvolutionFwdAlgoPerf_t *, void *, size_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnFindConvolutionForwardAlgorithmEx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, xDesc, x, wDesc, w, convDesc, yDesc, y,
-                  requestedAlgoCount, returnedAlgoCount, perfResults, workSpace,
-                  workSpaceSizeInBytes);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionForwardAlgorithm(
-    cudnnHandle_t handle, const cudnnTensorDescriptor_t xDesc,
-    const cudnnFilterDescriptor_t wDesc,
-    const cudnnConvolutionDescriptor_t convDesc,
-    const cudnnTensorDescriptor_t yDesc,
-    cudnnConvolutionFwdPreference_t preference, size_t memoryLimitInBytes,
-    cudnnConvolutionFwdAlgo_t *algo) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnTensorDescriptor_t,
-      const cudnnFilterDescriptor_t, const cudnnConvolutionDescriptor_t,
-      const cudnnTensorDescriptor_t, cudnnConvolutionFwdPreference_t, size_t,
-      cudnnConvolutionFwdAlgo_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnGetConvolutionForwardAlgorithm");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, xDesc, wDesc, convDesc, yDesc, preference,
-                  memoryLimitInBytes, algo);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionForwardAlgorithm_v7(
-    cudnnHandle_t handle, const cudnnTensorDescriptor_t srcDesc,
-    const cudnnFilterDescriptor_t filterDesc,
-    const cudnnConvolutionDescriptor_t convDesc,
-    const cudnnTensorDescriptor_t destDesc, const int requestedAlgoCount,
-    int *returnedAlgoCount, cudnnConvolutionFwdAlgoPerf_t *perfResults) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnTensorDescriptor_t,
-      const cudnnFilterDescriptor_t, const cudnnConvolutionDescriptor_t,
-      const cudnnTensorDescriptor_t, const int, int *,
-      cudnnConvolutionFwdAlgoPerf_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnGetConvolutionForwardAlgorithm_v7");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, srcDesc, filterDesc, convDesc, destDesc,
-                  requestedAlgoCount, returnedAlgoCount, perfResults);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionForwardWorkspaceSize(
-    cudnnHandle_t handle, const cudnnTensorDescriptor_t xDesc,
-    const cudnnFilterDescriptor_t wDesc,
-    const cudnnConvolutionDescriptor_t convDesc,
-    const cudnnTensorDescriptor_t yDesc, cudnnConvolutionFwdAlgo_t algo,
-    size_t *sizeInBytes) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnTensorDescriptor_t,
-      const cudnnFilterDescriptor_t, const cudnnConvolutionDescriptor_t,
-      const cudnnTensorDescriptor_t, cudnnConvolutionFwdAlgo_t, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnGetConvolutionForwardWorkspaceSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, xDesc, wDesc, convDesc, yDesc, algo, sizeInBytes);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnConvolutionForward(
-    cudnnHandle_t handle, const void *alpha,
-    const cudnnTensorDescriptor_t xDesc, const void *x,
-    const cudnnFilterDescriptor_t wDesc, const void *w,
-    const cudnnConvolutionDescriptor_t convDesc, cudnnConvolutionFwdAlgo_t algo,
-    void *workSpace, size_t workSpaceSizeInBytes, const void *beta,
-    const cudnnTensorDescriptor_t yDesc, void *y) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const void *, const cudnnTensorDescriptor_t, const void *,
-      const cudnnFilterDescriptor_t, const void *,
-      const cudnnConvolutionDescriptor_t, cudnnConvolutionFwdAlgo_t, void *,
-      size_t, const void *, const cudnnTensorDescriptor_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnConvolutionForward");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, alpha, xDesc, x, wDesc, w, convDesc, algo, workSpace,
-                  workSpaceSizeInBytes, beta, yDesc, y);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnConvolutionBiasActivationForward(
-    cudnnHandle_t handle, const void *alpha1,
-    const cudnnTensorDescriptor_t xDesc, const void *x,
-    const cudnnFilterDescriptor_t wDesc, const void *w,
-    const cudnnConvolutionDescriptor_t convDesc, cudnnConvolutionFwdAlgo_t algo,
-    void *workSpace, size_t workSpaceSizeInBytes, const void *alpha2,
-    const cudnnTensorDescriptor_t zDesc, const void *z,
-    const cudnnTensorDescriptor_t biasDesc, const void *bias,
-    const cudnnActivationDescriptor_t activationDesc,
-    const cudnnTensorDescriptor_t yDesc, void *y) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const void *, const cudnnTensorDescriptor_t, const void *,
-      const cudnnFilterDescriptor_t, const void *,
-      const cudnnConvolutionDescriptor_t, cudnnConvolutionFwdAlgo_t, void *,
-      size_t, const void *, const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnActivationDescriptor_t, const cudnnTensorDescriptor_t, void *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnConvolutionBiasActivationForward");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, alpha1, xDesc, x, wDesc, w, convDesc, algo, workSpace,
-                  workSpaceSizeInBytes, alpha2, zDesc, z, biasDesc, bias,
-                  activationDesc, yDesc, y);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnConvolutionBackwardBias(
-    cudnnHandle_t handle, const void *alpha,
-    const cudnnTensorDescriptor_t dyDesc, const void *dy, const void *beta,
-    const cudnnTensorDescriptor_t dbDesc, void *db) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const void *, const cudnnTensorDescriptor_t, const void *,
-      const void *, const cudnnTensorDescriptor_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnConvolutionBackwardBias");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, alpha, dyDesc, dy, beta, dbDesc, db);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionBackwardFilterAlgorithmMaxCount(
-    cudnnHandle_t handle, int *count) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnHandle_t, int *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnGetConvolutionBackwardFilterAlgorithmMaxCount");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, count);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnFindConvolutionBackwardFilterAlgorithm(
-    cudnnHandle_t handle, const cudnnTensorDescriptor_t xDesc,
-    const cudnnTensorDescriptor_t dyDesc,
-    const cudnnConvolutionDescriptor_t convDesc,
-    const cudnnFilterDescriptor_t dwDesc, const int requestedAlgoCount,
-    int *returnedAlgoCount, cudnnConvolutionBwdFilterAlgoPerf_t *perfResults) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnTensorDescriptor_t,
-      const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t,
-      const cudnnFilterDescriptor_t, const int, int *,
-      cudnnConvolutionBwdFilterAlgoPerf_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnFindConvolutionBackwardFilterAlgorithm");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, xDesc, dyDesc, convDesc, dwDesc, requestedAlgoCount,
-                  returnedAlgoCount, perfResults);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnFindConvolutionBackwardFilterAlgorithmEx(
-    cudnnHandle_t handle, const cudnnTensorDescriptor_t xDesc, const void *x,
-    const cudnnTensorDescriptor_t dyDesc, const void *y,
-    const cudnnConvolutionDescriptor_t convDesc,
-    const cudnnFilterDescriptor_t dwDesc, void *dw,
-    const int requestedAlgoCount, int *returnedAlgoCount,
-    cudnnConvolutionBwdFilterAlgoPerf_t *perfResults, void *workSpace,
-    size_t workSpaceSizeInBytes) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnConvolutionDescriptor_t, const cudnnFilterDescriptor_t, void *,
-      const int, int *, cudnnConvolutionBwdFilterAlgoPerf_t *, void *, size_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnFindConvolutionBackwardFilterAlgorithmEx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, xDesc, x, dyDesc, y, convDesc, dwDesc, dw,
-                  requestedAlgoCount, returnedAlgoCount, perfResults, workSpace,
-                  workSpaceSizeInBytes);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionBackwardFilterAlgorithm(
-    cudnnHandle_t handle, const cudnnTensorDescriptor_t xDesc,
-    const cudnnTensorDescriptor_t dyDesc,
-    const cudnnConvolutionDescriptor_t convDesc,
-    const cudnnFilterDescriptor_t dwDesc,
-    cudnnConvolutionBwdFilterPreference_t preference, size_t memoryLimitInBytes,
-    cudnnConvolutionBwdFilterAlgo_t *algo) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnTensorDescriptor_t,
-      const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t,
-      const cudnnFilterDescriptor_t, cudnnConvolutionBwdFilterPreference_t,
-      size_t, cudnnConvolutionBwdFilterAlgo_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnGetConvolutionBackwardFilterAlgorithm");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, xDesc, dyDesc, convDesc, dwDesc, preference,
-                  memoryLimitInBytes, algo);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionBackwardFilterAlgorithm_v7(
-    cudnnHandle_t handle, const cudnnTensorDescriptor_t srcDesc,
-    const cudnnTensorDescriptor_t diffDesc,
-    const cudnnConvolutionDescriptor_t convDesc,
-    const cudnnFilterDescriptor_t gradDesc, const int requestedAlgoCount,
-    int *returnedAlgoCount, cudnnConvolutionBwdFilterAlgoPerf_t *perfResults) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnTensorDescriptor_t,
-      const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t,
-      const cudnnFilterDescriptor_t, const int, int *,
-      cudnnConvolutionBwdFilterAlgoPerf_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnGetConvolutionBackwardFilterAlgorithm_v7");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, srcDesc, diffDesc, convDesc, gradDesc,
-                  requestedAlgoCount, returnedAlgoCount, perfResults);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionBackwardFilterWorkspaceSize(
-    cudnnHandle_t handle, const cudnnTensorDescriptor_t xDesc,
-    const cudnnTensorDescriptor_t dyDesc,
-    const cudnnConvolutionDescriptor_t convDesc,
-    const cudnnFilterDescriptor_t gradDesc,
-    cudnnConvolutionBwdFilterAlgo_t algo, size_t *sizeInBytes) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnTensorDescriptor_t,
-      const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t,
-      const cudnnFilterDescriptor_t, cudnnConvolutionBwdFilterAlgo_t, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnGetConvolutionBackwardFilterWorkspaceSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, xDesc, dyDesc, convDesc, gradDesc, algo, sizeInBytes);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnConvolutionBackwardFilter(
-    cudnnHandle_t handle, const void *alpha,
-    const cudnnTensorDescriptor_t xDesc, const void *x,
-    const cudnnTensorDescriptor_t dyDesc, const void *dy,
-    const cudnnConvolutionDescriptor_t convDesc,
-    cudnnConvolutionBwdFilterAlgo_t algo, void *workSpace,
-    size_t workSpaceSizeInBytes, const void *beta,
-    const cudnnFilterDescriptor_t dwDesc, void *dw) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const void *, const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnConvolutionDescriptor_t, cudnnConvolutionBwdFilterAlgo_t,
-      void *, size_t, const void *, const cudnnFilterDescriptor_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnConvolutionBackwardFilter");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, alpha, xDesc, x, dyDesc, dy, convDesc, algo,
-                  workSpace, workSpaceSizeInBytes, beta, dwDesc, dw);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionBackwardDataAlgorithmMaxCount(
-    cudnnHandle_t handle, int *count) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnHandle_t, int *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnGetConvolutionBackwardDataAlgorithmMaxCount");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, count);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnFindConvolutionBackwardDataAlgorithm(
-    cudnnHandle_t handle, const cudnnFilterDescriptor_t wDesc,
-    const cudnnTensorDescriptor_t dyDesc,
-    const cudnnConvolutionDescriptor_t convDesc,
-    const cudnnTensorDescriptor_t dxDesc, const int requestedAlgoCount,
-    int *returnedAlgoCount, cudnnConvolutionBwdDataAlgoPerf_t *perfResults) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnFilterDescriptor_t,
-      const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t,
-      const cudnnTensorDescriptor_t, const int, int *,
-      cudnnConvolutionBwdDataAlgoPerf_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnFindConvolutionBackwardDataAlgorithm");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, wDesc, dyDesc, convDesc, dxDesc, requestedAlgoCount,
-                  returnedAlgoCount, perfResults);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnFindConvolutionBackwardDataAlgorithmEx(
-    cudnnHandle_t handle, const cudnnFilterDescriptor_t wDesc, const void *w,
-    const cudnnTensorDescriptor_t dyDesc, const void *dy,
-    const cudnnConvolutionDescriptor_t convDesc,
-    const cudnnTensorDescriptor_t dxDesc, void *dx,
-    const int requestedAlgoCount, int *returnedAlgoCount,
-    cudnnConvolutionBwdDataAlgoPerf_t *perfResults, void *workSpace,
-    size_t workSpaceSizeInBytes) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnFilterDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t, void *,
-      const int, int *, cudnnConvolutionBwdDataAlgoPerf_t *, void *, size_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnFindConvolutionBackwardDataAlgorithmEx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, wDesc, w, dyDesc, dy, convDesc, dxDesc, dx,
-                  requestedAlgoCount, returnedAlgoCount, perfResults, workSpace,
-                  workSpaceSizeInBytes);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionBackwardDataAlgorithm(
-    cudnnHandle_t handle, const cudnnFilterDescriptor_t wDesc,
-    const cudnnTensorDescriptor_t dyDesc,
-    const cudnnConvolutionDescriptor_t convDesc,
-    const cudnnTensorDescriptor_t dxDesc,
-    cudnnConvolutionBwdDataPreference_t preference, size_t memoryLimitInBytes,
-    cudnnConvolutionBwdDataAlgo_t *algo) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnFilterDescriptor_t,
-      const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t,
-      const cudnnTensorDescriptor_t, cudnnConvolutionBwdDataPreference_t,
-      size_t, cudnnConvolutionBwdDataAlgo_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnGetConvolutionBackwardDataAlgorithm");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, wDesc, dyDesc, convDesc, dxDesc, preference,
-                  memoryLimitInBytes, algo);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionBackwardDataAlgorithm_v7(
-    cudnnHandle_t handle, const cudnnFilterDescriptor_t filterDesc,
-    const cudnnTensorDescriptor_t diffDesc,
-    const cudnnConvolutionDescriptor_t convDesc,
-    const cudnnTensorDescriptor_t gradDesc, const int requestedAlgoCount,
-    int *returnedAlgoCount, cudnnConvolutionBwdDataAlgoPerf_t *perfResults) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnFilterDescriptor_t,
-      const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t,
-      const cudnnTensorDescriptor_t, const int, int *,
-      cudnnConvolutionBwdDataAlgoPerf_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnGetConvolutionBackwardDataAlgorithm_v7");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, filterDesc, diffDesc, convDesc, gradDesc,
-                  requestedAlgoCount, returnedAlgoCount, perfResults);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionBackwardDataWorkspaceSize(
-    cudnnHandle_t handle, const cudnnFilterDescriptor_t wDesc,
-    const cudnnTensorDescriptor_t dyDesc,
-    const cudnnConvolutionDescriptor_t convDesc,
-    const cudnnTensorDescriptor_t dxDesc, cudnnConvolutionBwdDataAlgo_t algo,
-    size_t *sizeInBytes) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnFilterDescriptor_t,
-      const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t,
-      const cudnnTensorDescriptor_t, cudnnConvolutionBwdDataAlgo_t, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnGetConvolutionBackwardDataWorkspaceSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, wDesc, dyDesc, convDesc, dxDesc, algo, sizeInBytes);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnConvolutionBackwardData(
-    cudnnHandle_t handle, const void *alpha,
-    const cudnnFilterDescriptor_t wDesc, const void *w,
-    const cudnnTensorDescriptor_t dyDesc, const void *dy,
-    const cudnnConvolutionDescriptor_t convDesc,
-    cudnnConvolutionBwdDataAlgo_t algo, void *workSpace,
-    size_t workSpaceSizeInBytes, const void *beta,
-    const cudnnTensorDescriptor_t dxDesc, void *dx) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const void *, const cudnnFilterDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnConvolutionDescriptor_t, cudnnConvolutionBwdDataAlgo_t, void *,
-      size_t, const void *, const cudnnTensorDescriptor_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnConvolutionBackwardData");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, alpha, wDesc, w, dyDesc, dy, convDesc, algo,
-                  workSpace, workSpaceSizeInBytes, beta, dxDesc, dx);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnIm2Col(cudnnHandle_t handle, const cudnnTensorDescriptor_t xDesc,
-            const void *x, const cudnnFilterDescriptor_t wDesc,
-            const cudnnConvolutionDescriptor_t convDesc, void *colBuffer) {
-  using FuncPtr =
-      cudnnStatus_t(CUDNNWINAPI *)(cudnnHandle_t, const cudnnTensorDescriptor_t,
-                                   const void *, const cudnnFilterDescriptor_t,
-                                   const cudnnConvolutionDescriptor_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnIm2Col");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, xDesc, x, wDesc, convDesc, colBuffer);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSoftmaxForward(
-    cudnnHandle_t handle, cudnnSoftmaxAlgorithm_t algo, cudnnSoftmaxMode_t mode,
-    const void *alpha, const cudnnTensorDescriptor_t xDesc, const void *x,
-    const void *beta, const cudnnTensorDescriptor_t yDesc, void *y) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, cudnnSoftmaxAlgorithm_t, cudnnSoftmaxMode_t, const void *,
-      const cudnnTensorDescriptor_t, const void *, const void *,
-      const cudnnTensorDescriptor_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSoftmaxForward");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, algo, mode, alpha, xDesc, x, beta, yDesc, y);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSoftmaxBackward(
-    cudnnHandle_t handle, cudnnSoftmaxAlgorithm_t algo, cudnnSoftmaxMode_t mode,
-    const void *alpha, const cudnnTensorDescriptor_t yDesc, const void *y,
-    const cudnnTensorDescriptor_t dyDesc, const void *dy, const void *beta,
-    const cudnnTensorDescriptor_t dxDesc, void *dx) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, cudnnSoftmaxAlgorithm_t, cudnnSoftmaxMode_t, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *, const void *,
-      const cudnnTensorDescriptor_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSoftmaxBackward");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, algo, mode, alpha, yDesc, y, dyDesc, dy, beta, dxDesc,
-                  dx);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnCreatePoolingDescriptor(cudnnPoolingDescriptor_t *poolingDesc) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnPoolingDescriptor_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnCreatePoolingDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(poolingDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetPooling2dDescriptor(
-    cudnnPoolingDescriptor_t poolingDesc, cudnnPoolingMode_t mode,
-    cudnnNanPropagation_t maxpoolingNanOpt, int windowHeight, int windowWidth,
-    int verticalPadding, int horizontalPadding, int verticalStride,
-    int horizontalStride) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnPoolingDescriptor_t, cudnnPoolingMode_t, cudnnNanPropagation_t, int,
-      int, int, int, int, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetPooling2dDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(poolingDesc, mode, maxpoolingNanOpt, windowHeight,
-                  windowWidth, verticalPadding, horizontalPadding,
-                  verticalStride, horizontalStride);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetPooling2dDescriptor(
-    const cudnnPoolingDescriptor_t poolingDesc, cudnnPoolingMode_t *mode,
-    cudnnNanPropagation_t *maxpoolingNanOpt, int *windowHeight,
-    int *windowWidth, int *verticalPadding, int *horizontalPadding,
-    int *verticalStride, int *horizontalStride) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      const cudnnPoolingDescriptor_t, cudnnPoolingMode_t *,
-      cudnnNanPropagation_t *, int *, int *, int *, int *, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetPooling2dDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(poolingDesc, mode, maxpoolingNanOpt, windowHeight,
-                  windowWidth, verticalPadding, horizontalPadding,
-                  verticalStride, horizontalStride);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetPoolingNdDescriptor(
-    cudnnPoolingDescriptor_t poolingDesc, const cudnnPoolingMode_t mode,
-    const cudnnNanPropagation_t maxpoolingNanOpt, int nbDims,
-    const int windowDimA[], const int paddingA[], const int strideA[]) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnPoolingDescriptor_t, const cudnnPoolingMode_t,
-      const cudnnNanPropagation_t, int, const int[], const int[], const int[]);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetPoolingNdDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(poolingDesc, mode, maxpoolingNanOpt, nbDims, windowDimA,
-                  paddingA, strideA);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetPoolingNdDescriptor(
-    const cudnnPoolingDescriptor_t poolingDesc, int nbDimsRequested,
-    cudnnPoolingMode_t *mode, cudnnNanPropagation_t *maxpoolingNanOpt,
-    int *nbDims, int windowDimA[], int paddingA[], int strideA[]) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      const cudnnPoolingDescriptor_t, int, cudnnPoolingMode_t *,
-      cudnnNanPropagation_t *, int *, int[], int[], int[]);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetPoolingNdDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(poolingDesc, nbDimsRequested, mode, maxpoolingNanOpt, nbDims,
-                  windowDimA, paddingA, strideA);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnGetPoolingNdForwardOutputDim(const cudnnPoolingDescriptor_t poolingDesc,
-                                  const cudnnTensorDescriptor_t inputTensorDesc,
-                                  int nbDims, int outputTensorDimA[]) {
-  using FuncPtr =
-      cudnnStatus_t(CUDNNWINAPI *)(const cudnnPoolingDescriptor_t,
-                                   const cudnnTensorDescriptor_t, int, int[]);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnGetPoolingNdForwardOutputDim");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(poolingDesc, inputTensorDesc, nbDims, outputTensorDimA);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnGetPooling2dForwardOutputDim(const cudnnPoolingDescriptor_t poolingDesc,
-                                  const cudnnTensorDescriptor_t inputTensorDesc,
-                                  int *n, int *c, int *h, int *w) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(const cudnnPoolingDescriptor_t,
-                                               const cudnnTensorDescriptor_t,
-                                               int *, int *, int *, int *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnGetPooling2dForwardOutputDim");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(poolingDesc, inputTensorDesc, n, c, h, w);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnDestroyPoolingDescriptor(cudnnPoolingDescriptor_t poolingDesc) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnPoolingDescriptor_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDestroyPoolingDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(poolingDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnPoolingForward(
-    cudnnHandle_t handle, const cudnnPoolingDescriptor_t poolingDesc,
-    const void *alpha, const cudnnTensorDescriptor_t xDesc, const void *x,
-    const void *beta, const cudnnTensorDescriptor_t yDesc, void *y) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnPoolingDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *, const void *,
-      const cudnnTensorDescriptor_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnPoolingForward");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, poolingDesc, alpha, xDesc, x, beta, yDesc, y);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnPoolingBackward(
-    cudnnHandle_t handle, const cudnnPoolingDescriptor_t poolingDesc,
-    const void *alpha, const cudnnTensorDescriptor_t yDesc, const void *y,
-    const cudnnTensorDescriptor_t dyDesc, const void *dy,
-    const cudnnTensorDescriptor_t xDesc, const void *x, const void *beta,
-    const cudnnTensorDescriptor_t dxDesc, void *dx) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnPoolingDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *, const void *,
-      const cudnnTensorDescriptor_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnPoolingBackward");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, poolingDesc, alpha, yDesc, y, dyDesc, dy, xDesc, x,
-                  beta, dxDesc, dx);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnCreateActivationDescriptor(cudnnActivationDescriptor_t *activationDesc) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnActivationDescriptor_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnCreateActivationDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(activationDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetActivationDescriptor(
-    cudnnActivationDescriptor_t activationDesc, cudnnActivationMode_t mode,
-    cudnnNanPropagation_t reluNanOpt, double coef) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnActivationDescriptor_t,
-                                               cudnnActivationMode_t,
-                                               cudnnNanPropagation_t, double);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetActivationDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(activationDesc, mode, reluNanOpt, coef);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnGetActivationDescriptor(const cudnnActivationDescriptor_t activationDesc,
-                             cudnnActivationMode_t *mode,
-                             cudnnNanPropagation_t *reluNanOpt, double *coef) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      const cudnnActivationDescriptor_t, cudnnActivationMode_t *,
-      cudnnNanPropagation_t *, double *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetActivationDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(activationDesc, mode, reluNanOpt, coef);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnDestroyActivationDescriptor(cudnnActivationDescriptor_t activationDesc) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnActivationDescriptor_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnDestroyActivationDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(activationDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnActivationForward(
-    cudnnHandle_t handle, cudnnActivationDescriptor_t activationDesc,
-    const void *alpha, const cudnnTensorDescriptor_t xDesc, const void *x,
-    const void *beta, const cudnnTensorDescriptor_t yDesc, void *y) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, cudnnActivationDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *, const void *,
-      const cudnnTensorDescriptor_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnActivationForward");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, activationDesc, alpha, xDesc, x, beta, yDesc, y);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnActivationBackward(
-    cudnnHandle_t handle, cudnnActivationDescriptor_t activationDesc,
-    const void *alpha, const cudnnTensorDescriptor_t yDesc, const void *y,
-    const cudnnTensorDescriptor_t dyDesc, const void *dy,
-    const cudnnTensorDescriptor_t xDesc, const void *x, const void *beta,
-    const cudnnTensorDescriptor_t dxDesc, void *dx) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, cudnnActivationDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *, const void *,
-      const cudnnTensorDescriptor_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnActivationBackward");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, activationDesc, alpha, yDesc, y, dyDesc, dy, xDesc, x,
-                  beta, dxDesc, dx);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnCreateLRNDescriptor(cudnnLRNDescriptor_t *normDesc) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnLRNDescriptor_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnCreateLRNDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(normDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetLRNDescriptor(cudnnLRNDescriptor_t normDesc,
-                                                unsigned lrnN, double lrnAlpha,
-                                                double lrnBeta, double lrnK) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnLRNDescriptor_t, unsigned int, double, double, double);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetLRNDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(normDesc, lrnN, lrnAlpha, lrnBeta, lrnK);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetLRNDescriptor(cudnnLRNDescriptor_t normDesc,
-                                                unsigned *lrnN,
-                                                double *lrnAlpha,
-                                                double *lrnBeta, double *lrnK) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnLRNDescriptor_t, unsigned int *, double *, double *, double *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetLRNDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(normDesc, lrnN, lrnAlpha, lrnBeta, lrnK);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnDestroyLRNDescriptor(cudnnLRNDescriptor_t lrnDesc) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnLRNDescriptor_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDestroyLRNDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(lrnDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnLRNCrossChannelForward(
-    cudnnHandle_t handle, cudnnLRNDescriptor_t normDesc, cudnnLRNMode_t lrnMode,
-    const void *alpha, const cudnnTensorDescriptor_t xDesc, const void *x,
-    const void *beta, const cudnnTensorDescriptor_t yDesc, void *y) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, cudnnLRNDescriptor_t, cudnnLRNMode_t, const void *,
-      const cudnnTensorDescriptor_t, const void *, const void *,
-      const cudnnTensorDescriptor_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnLRNCrossChannelForward");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, normDesc, lrnMode, alpha, xDesc, x, beta, yDesc, y);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnLRNCrossChannelBackward(
-    cudnnHandle_t handle, cudnnLRNDescriptor_t normDesc, cudnnLRNMode_t lrnMode,
-    const void *alpha, const cudnnTensorDescriptor_t yDesc, const void *y,
-    const cudnnTensorDescriptor_t dyDesc, const void *dy,
-    const cudnnTensorDescriptor_t xDesc, const void *x, const void *beta,
-    const cudnnTensorDescriptor_t dxDesc, void *dx) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, cudnnLRNDescriptor_t, cudnnLRNMode_t, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *, const void *,
-      const cudnnTensorDescriptor_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnLRNCrossChannelBackward");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, normDesc, lrnMode, alpha, yDesc, y, dyDesc, dy, xDesc,
-                  x, beta, dxDesc, dx);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnDivisiveNormalizationForward(
-    cudnnHandle_t handle, cudnnLRNDescriptor_t normDesc,
-    cudnnDivNormMode_t mode, const void *alpha,
-    const cudnnTensorDescriptor_t xDesc, /* same desc for means, temp, temp2 */
-    const void *x,
-    const void *means, /* if NULL, means are assumed to be zero */
-    void *temp, void *temp2, const void *beta,
-    const cudnnTensorDescriptor_t yDesc, void *y) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, cudnnLRNDescriptor_t, cudnnDivNormMode_t, const void *,
-      const cudnnTensorDescriptor_t, const void *, const void *, void *, void *,
-      const void *, const cudnnTensorDescriptor_t, void *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnDivisiveNormalizationForward");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, normDesc, mode, alpha, xDesc, x, means, temp, temp2,
-                  beta, yDesc, y);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnDivisiveNormalizationBackward(
-    cudnnHandle_t handle, cudnnLRNDescriptor_t normDesc,
-    cudnnDivNormMode_t mode, const void *alpha,
-    const cudnnTensorDescriptor_t
-        xDesc, /* same desc for x, means, dy, temp, temp2 */
-    const void *x,
-    const void *means, /* if NULL, means are assumed to be zero */
-    const void *dy, void *temp, void *temp2, const void *beta,
-    const cudnnTensorDescriptor_t dXdMeansDesc, /* same desc for dx, dMeans */
-    void *dx,                                   /* output x differential */
-    void *dMeans) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, cudnnLRNDescriptor_t, cudnnDivNormMode_t, const void *,
-      const cudnnTensorDescriptor_t, const void *, const void *, const void *,
-      void *, void *, const void *, const cudnnTensorDescriptor_t, void *,
-      void *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnDivisiveNormalizationBackward");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, normDesc, mode, alpha, xDesc, x, means, dy, temp,
-                  temp2, beta, dXdMeansDesc, dx, dMeans);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnDeriveBNTensorDescriptor(
-    cudnnTensorDescriptor_t derivedBnDesc, const cudnnTensorDescriptor_t xDesc,
-    cudnnBatchNormMode_t mode) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnTensorDescriptor_t,
-                                               const cudnnTensorDescriptor_t,
-                                               cudnnBatchNormMode_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDeriveBNTensorDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(derivedBnDesc, xDesc, mode);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnBatchNormalizationForwardTraining(
-    cudnnHandle_t handle, cudnnBatchNormMode_t mode,
-
-    const void *alpha, /* alpha[0] = result blend factor */
-    const void *beta,  /* beta[0] = dest layer blend factor */
-
-    const cudnnTensorDescriptor_t xDesc, const void *x, /* NxCxHxW */
-    const cudnnTensorDescriptor_t yDesc, void *y,       /* NxCxHxW */
-
-    /* Shared desc for the next 6 tensors in the argument list.
-       Data type to be set as follows:
-       type = (typeOf(x) == double) ? double : float
-       Dimensions for this descriptor depend on normalization mode
-       - Spatial Normalization : tensors are expected to have dims 1xCx1x1
-        (normalization is performed across NxHxW)
-       - Per-Activation Normalization : tensors are expected to have dims of
-       1xCxHxW (normalization is performed across N) */
-    const cudnnTensorDescriptor_t bnScaleBiasMeanVarDesc,
-
-    /* 'Gamma' and 'Beta' respectively in Ioffe and Szegedy's paper's notation
-     */
-    const void *bnScale, const void *bnBias,
-
-    /* MUST use factor=1 in the very first call of a complete training cycle.
-       Use a factor=1/(1+n) at N-th call to the function to get
-       Cumulative Moving Average (CMA) behavior
-       CMA[n] = (x[1]+...+x[n])/n
-       Since CMA[n+1] = (n*CMA[n]+x[n+1])/(n+1) =
-       ((n+1)*CMA[n]-CMA[n])/(n+1) + x[n+1]/(n+1) =
-       CMA[n]*(1-1/(n+1)) + x[n+1]*1/(n+1) */
-    double exponentialAverageFactor,
-
-    /* Used in Training phase only.
-       runningMean = newMean*factor + runningMean*(1-factor) */
-    void *resultRunningMean,
-    /* Output in training mode, input in inference. Is the moving average
-       of  variance[x] (factor is applied in the same way as for runningMean) */
-    void *resultRunningVariance,
-
-    /* Has to be >= CUDNN_BN_MIN_EPSILON. Should be the same in forward and
-       backward functions. */
-    double epsilon,
-
-    /* Optionally save intermediate results from the forward pass here
-       - can be reused to speed up backward pass. NULL if unused */
-    void *resultSaveMean, void *resultSaveInvVariance) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, cudnnBatchNormMode_t, const void *, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, void *, const cudnnTensorDescriptor_t,
-      const void *, const void *, double, void *, void *, double, void *,
-      void *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnBatchNormalizationForwardTraining");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(
-      handle, mode, alpha, beta, xDesc, x, yDesc, y, bnScaleBiasMeanVarDesc,
-      bnScale, bnBias, exponentialAverageFactor, resultRunningMean,
-      resultRunningVariance, epsilon, resultSaveMean, resultSaveInvVariance);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnBatchNormalizationForwardInference(
-    cudnnHandle_t handle, cudnnBatchNormMode_t mode,
-    const void *alpha, /* alpha[0] = result blend factor */
-    const void *beta,  /* beta[0] = dest layer blend factor */
-    const cudnnTensorDescriptor_t xDesc, const void *x, /* NxCxHxW */
-    const cudnnTensorDescriptor_t yDesc, void *y,       /* NxCxHxW */
-    const cudnnTensorDescriptor_t bnScaleBiasMeanVarDesc, const void *bnScale,
-    const void *bnBias, const void *estimatedMean,
-    const void *estimatedVariance, double epsilon) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, cudnnBatchNormMode_t, const void *, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, void *, const cudnnTensorDescriptor_t,
-      const void *, const void *, const void *, const void *, double);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnBatchNormalizationForwardInference");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, mode, alpha, beta, xDesc, x, yDesc, y,
-                  bnScaleBiasMeanVarDesc, bnScale, bnBias, estimatedMean,
-                  estimatedVariance, epsilon);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnBatchNormalizationBackward(
-    cudnnHandle_t handle, cudnnBatchNormMode_t mode, const void *alphaDataDiff,
-    const void *betaDataDiff, const void *alphaParamDiff,
-    const void *betaParamDiff,
-    const cudnnTensorDescriptor_t xDesc, /* same desc for x, dx, dy */
-    const void *x, const cudnnTensorDescriptor_t dyDesc, const void *dy,
-    const cudnnTensorDescriptor_t dxDesc, void *dx,
-    /* Shared tensor desc for the 4 tensors below */
-    const cudnnTensorDescriptor_t dBnScaleBiasDesc,
-    const void *bnScale, /* bnBias doesn't affect backpropagation */
-    /* scale and bias diff are not backpropagated below this layer */
-    void *dBnScaleResult, void *dBnBiasResult,
-    /* Same epsilon as forward pass */
-    double epsilon,
-
-    /* Optionally cached intermediate results from
-       forward pass */
-    const void *savedMean, const void *savedInvVariance) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, cudnnBatchNormMode_t, const void *, const void *,
-      const void *, const void *, const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, void *, const cudnnTensorDescriptor_t,
-      const void *, void *, void *, double, const void *, const void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnBatchNormalizationBackward");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, mode, alphaDataDiff, betaDataDiff, alphaParamDiff,
-                  betaParamDiff, xDesc, x, dyDesc, dy, dxDesc, dx,
-                  dBnScaleBiasDesc, bnScale, dBnScaleResult, dBnBiasResult,
-                  epsilon, savedMean, savedInvVariance);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnCreateSpatialTransformerDescriptor(
-    cudnnSpatialTransformerDescriptor_t *stDesc) {
-  using FuncPtr =
-      cudnnStatus_t(CUDNNWINAPI *)(cudnnSpatialTransformerDescriptor_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnCreateSpatialTransformerDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(stDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetSpatialTransformerNdDescriptor(
-    cudnnSpatialTransformerDescriptor_t stDesc, cudnnSamplerType_t samplerType,
-    cudnnDataType_t dataType, const int nbDims, const int dimA[]) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnSpatialTransformerDescriptor_t, cudnnSamplerType_t, cudnnDataType_t,
-      const int, const int[]);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnSetSpatialTransformerNdDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(stDesc, samplerType, dataType, nbDims, dimA);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnDestroySpatialTransformerDescriptor(
-    cudnnSpatialTransformerDescriptor_t stDesc) {
-  using FuncPtr =
-      cudnnStatus_t(CUDNNWINAPI *)(cudnnSpatialTransformerDescriptor_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnDestroySpatialTransformerDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(stDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSpatialTfGridGeneratorForward(
-    cudnnHandle_t handle, const cudnnSpatialTransformerDescriptor_t stDesc,
-    const void *theta, void *grid) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnSpatialTransformerDescriptor_t, const void *,
-      void *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnSpatialTfGridGeneratorForward");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, stDesc, theta, grid);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSpatialTfGridGeneratorBackward(
-    cudnnHandle_t handle, const cudnnSpatialTransformerDescriptor_t stDesc,
-    const void *dgrid, void *dtheta) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnSpatialTransformerDescriptor_t, const void *,
-      void *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnSpatialTfGridGeneratorBackward");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, stDesc, dgrid, dtheta);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSpatialTfSamplerForward(
-    cudnnHandle_t handle, cudnnSpatialTransformerDescriptor_t stDesc,
-    const void *alpha, const cudnnTensorDescriptor_t xDesc, const void *x,
-    const void *grid, const void *beta, cudnnTensorDescriptor_t yDesc,
-    void *y) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, cudnnSpatialTransformerDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *, const void *, const void *,
-      cudnnTensorDescriptor_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSpatialTfSamplerForward");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, stDesc, alpha, xDesc, x, grid, beta, yDesc, y);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSpatialTfSamplerBackward(
-    cudnnHandle_t handle, cudnnSpatialTransformerDescriptor_t stDesc,
-    const void *alpha, const cudnnTensorDescriptor_t xDesc, const void *x,
-    const void *beta, const cudnnTensorDescriptor_t dxDesc, void *dx,
-    const void *alphaDgrid, const cudnnTensorDescriptor_t dyDesc,
-    const void *dy, const void *grid, const void *betaDgrid, void *dgrid) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, cudnnSpatialTransformerDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *, const void *,
-      const cudnnTensorDescriptor_t, void *, const void *,
-      const cudnnTensorDescriptor_t, const void *, const void *, const void *,
-      void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSpatialTfSamplerBackward");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, stDesc, alpha, xDesc, x, beta, dxDesc, dx, alphaDgrid,
-                  dyDesc, dy, grid, betaDgrid, dgrid);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnCreateDropoutDescriptor(cudnnDropoutDescriptor_t *dropoutDesc) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnDropoutDescriptor_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnCreateDropoutDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dropoutDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnDestroyDropoutDescriptor(cudnnDropoutDescriptor_t dropoutDesc) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnDropoutDescriptor_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDestroyDropoutDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dropoutDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnDropoutGetStatesSize(cudnnHandle_t handle,
-                                                    size_t *sizeInBytes) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnHandle_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDropoutGetStatesSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, sizeInBytes);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnDropoutGetReserveSpaceSize(
-    cudnnTensorDescriptor_t xdesc, size_t *sizeInBytes) {
-  using FuncPtr =
-      cudnnStatus_t(CUDNNWINAPI *)(cudnnTensorDescriptor_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDropoutGetReserveSpaceSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(xdesc, sizeInBytes);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetDropoutDescriptor(
-    cudnnDropoutDescriptor_t dropoutDesc, cudnnHandle_t handle, float dropout,
-    void *states, size_t stateSizeInBytes, unsigned long long seed) {
-  using FuncPtr =
-      cudnnStatus_t(CUDNNWINAPI *)(cudnnDropoutDescriptor_t, cudnnHandle_t,
-                                   float, void *, size_t, unsigned long long);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetDropoutDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dropoutDesc, handle, dropout, states, stateSizeInBytes, seed);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnRestoreDropoutDescriptor(
-    cudnnDropoutDescriptor_t dropoutDesc, cudnnHandle_t handle, float dropout,
-    void *states, size_t stateSizeInBytes, unsigned long long seed) {
-  using FuncPtr =
-      cudnnStatus_t(CUDNNWINAPI *)(cudnnDropoutDescriptor_t, cudnnHandle_t,
-                                   float, void *, size_t, unsigned long long);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnRestoreDropoutDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dropoutDesc, handle, dropout, states, stateSizeInBytes, seed);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetDropoutDescriptor(
-    cudnnDropoutDescriptor_t dropoutDesc, cudnnHandle_t handle, float *dropout,
-    void **states, unsigned long long *seed) {
-  using FuncPtr =
-      cudnnStatus_t(CUDNNWINAPI *)(cudnnDropoutDescriptor_t, cudnnHandle_t,
-                                   float *, void **, unsigned long long *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetDropoutDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dropoutDesc, handle, dropout, states, seed);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnDropoutForward(
-    cudnnHandle_t handle, const cudnnDropoutDescriptor_t dropoutDesc,
-    const cudnnTensorDescriptor_t xdesc, const void *x,
-    const cudnnTensorDescriptor_t ydesc, void *y, void *reserveSpace,
-    size_t reserveSpaceSizeInBytes) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnDropoutDescriptor_t,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, void *, void *, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDropoutForward");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dropoutDesc, xdesc, x, ydesc, y, reserveSpace,
-                  reserveSpaceSizeInBytes);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnDropoutBackward(
-    cudnnHandle_t handle, const cudnnDropoutDescriptor_t dropoutDesc,
-    const cudnnTensorDescriptor_t dydesc, const void *dy,
-    const cudnnTensorDescriptor_t dxdesc, void *dx, void *reserveSpace,
-    size_t reserveSpaceSizeInBytes) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnDropoutDescriptor_t,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, void *, void *, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDropoutBackward");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dropoutDesc, dydesc, dy, dxdesc, dx, reserveSpace,
-                  reserveSpaceSizeInBytes);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnCreateRNNDescriptor(cudnnRNNDescriptor_t *rnnDesc) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnRNNDescriptor_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnCreateRNNDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(rnnDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnDestroyRNNDescriptor(cudnnRNNDescriptor_t rnnDesc) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnRNNDescriptor_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDestroyRNNDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(rnnDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetRNNForwardInferenceAlgorithmMaxCount(
-    cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc, int *count) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnRNNDescriptor_t, int *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnGetRNNForwardInferenceAlgorithmMaxCount");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, rnnDesc, count);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnFindRNNForwardInferenceAlgorithmEx(
-    cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc,
-    const int seqLength, const cudnnTensorDescriptor_t *xDesc, const void *x,
-    const cudnnTensorDescriptor_t hxDesc, const void *hx,
-    const cudnnTensorDescriptor_t cxDesc, const void *cx,
-    const cudnnFilterDescriptor_t wDesc, const void *w,
-    const cudnnTensorDescriptor_t *yDesc, void *y,
-    const cudnnTensorDescriptor_t hyDesc, void *hy,
-    const cudnnTensorDescriptor_t cyDesc, void *cy, const float findIntensity,
-    const int requestedAlgoCount, int *returnedAlgoCount,
-    cudnnAlgorithmPerformance_t *perfResults, void *workspace,
-    size_t workSpaceSizeInBytes) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnRNNDescriptor_t, const int,
-      const cudnnTensorDescriptor_t *, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnFilterDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t *, void *, const cudnnTensorDescriptor_t,
-      void *, const cudnnTensorDescriptor_t, void *, const float, const int,
-      int *, cudnnAlgorithmPerformance_t *, void *, size_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnFindRNNForwardInferenceAlgorithmEx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, rnnDesc, seqLength, xDesc, x, hxDesc, hx, cxDesc, cx,
-                  wDesc, w, yDesc, y, hyDesc, hy, cyDesc, cy, findIntensity,
-                  requestedAlgoCount, returnedAlgoCount, perfResults, workspace,
-                  workSpaceSizeInBytes);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetRNNForwardTrainingAlgorithmMaxCount(
-    cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc, int *count) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnRNNDescriptor_t, int *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnGetRNNForwardTrainingAlgorithmMaxCount");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, rnnDesc, count);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnFindRNNForwardTrainingAlgorithmEx(
-    cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc,
-    const int seqLength, const cudnnTensorDescriptor_t *xDesc, const void *x,
-    const cudnnTensorDescriptor_t hxDesc, const void *hx,
-    const cudnnTensorDescriptor_t cxDesc, const void *cx,
-    const cudnnFilterDescriptor_t wDesc, const void *w,
-    const cudnnTensorDescriptor_t *yDesc, void *y,
-    const cudnnTensorDescriptor_t hyDesc, void *hy,
-    const cudnnTensorDescriptor_t cyDesc, void *cy, const float findIntensity,
-    const int requestedAlgoCount, int *returnedAlgoCount,
-    cudnnAlgorithmPerformance_t *perfResults, void *workspace,
-    size_t workSpaceSizeInBytes, void *reserveSpace,
-    size_t reserveSpaceSizeInBytes) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnRNNDescriptor_t, const int,
-      const cudnnTensorDescriptor_t *, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnFilterDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t *, void *, const cudnnTensorDescriptor_t,
-      void *, const cudnnTensorDescriptor_t, void *, const float, const int,
-      int *, cudnnAlgorithmPerformance_t *, void *, size_t, void *, size_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnFindRNNForwardTrainingAlgorithmEx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, rnnDesc, seqLength, xDesc, x, hxDesc, hx, cxDesc, cx,
-                  wDesc, w, yDesc, y, hyDesc, hy, cyDesc, cy, findIntensity,
-                  requestedAlgoCount, returnedAlgoCount, perfResults, workspace,
-                  workSpaceSizeInBytes, reserveSpace, reserveSpaceSizeInBytes);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetRNNBackwardDataAlgorithmMaxCount(
-    cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc, int *count) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnRNNDescriptor_t, int *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnGetRNNBackwardDataAlgorithmMaxCount");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, rnnDesc, count);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnFindRNNBackwardDataAlgorithmEx(
-    cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc,
-    const int seqLength, const cudnnTensorDescriptor_t *yDesc, const void *y,
-    const cudnnTensorDescriptor_t *dyDesc, const void *dy,
-    const cudnnTensorDescriptor_t dhyDesc, const void *dhy,
-    const cudnnTensorDescriptor_t dcyDesc, const void *dcy,
-    const cudnnFilterDescriptor_t wDesc, const void *w,
-    const cudnnTensorDescriptor_t hxDesc, const void *hx,
-    const cudnnTensorDescriptor_t cxDesc, const void *cx,
-    const cudnnTensorDescriptor_t *dxDesc, void *dx,
-    const cudnnTensorDescriptor_t dhxDesc, void *dhx,
-    const cudnnTensorDescriptor_t dcxDesc, void *dcx, const float findIntensity,
-    const int requestedAlgoCount, int *returnedAlgoCount,
-    cudnnAlgorithmPerformance_t *perfResults, void *workspace,
-    size_t workSpaceSizeInBytes, void *reserveSpace,
-    size_t reserveSpaceSizeInBytes) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnRNNDescriptor_t, const int,
-      const cudnnTensorDescriptor_t *, const void *,
-      const cudnnTensorDescriptor_t *, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnFilterDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t *, void *, const cudnnTensorDescriptor_t,
-      void *, const cudnnTensorDescriptor_t, void *, const float, const int,
-      int *, cudnnAlgorithmPerformance_t *, void *, size_t, void *, size_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnFindRNNBackwardDataAlgorithmEx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, rnnDesc, seqLength, yDesc, y, dyDesc, dy, dhyDesc,
-                  dhy, dcyDesc, dcy, wDesc, w, hxDesc, hx, cxDesc, cx, dxDesc,
-                  dx, dhxDesc, dhx, dcxDesc, dcx, findIntensity,
-                  requestedAlgoCount, returnedAlgoCount, perfResults, workspace,
-                  workSpaceSizeInBytes, reserveSpace, reserveSpaceSizeInBytes);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetRNNBackwardWeightsAlgorithmMaxCount(
-    cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc, int *count) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnRNNDescriptor_t, int *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnGetRNNBackwardWeightsAlgorithmMaxCount");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, rnnDesc, count);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnFindRNNBackwardWeightsAlgorithmEx(
-    cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc,
-    const int seqLength, const cudnnTensorDescriptor_t *xDesc, const void *x,
-    const cudnnTensorDescriptor_t hxDesc, const void *hx,
-    const cudnnTensorDescriptor_t *yDesc, const void *y,
-    const float findIntensity, const int requestedAlgoCount,
-    int *returnedAlgoCount, cudnnAlgorithmPerformance_t *perfResults,
-    const void *workspace, size_t workSpaceSizeInBytes,
-    const cudnnFilterDescriptor_t dwDesc, void *dw, const void *reserveSpace,
-    size_t reserveSpaceSizeInBytes) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnRNNDescriptor_t, const int,
-      const cudnnTensorDescriptor_t *, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t *, const void *, const float, const int,
-      int *, cudnnAlgorithmPerformance_t *, const void *, size_t,
-      const cudnnFilterDescriptor_t, void *, const void *, size_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnFindRNNBackwardWeightsAlgorithmEx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, rnnDesc, seqLength, xDesc, x, hxDesc, hx, yDesc, y,
-                  findIntensity, requestedAlgoCount, returnedAlgoCount,
-                  perfResults, workspace, workSpaceSizeInBytes, dwDesc, dw,
-                  reserveSpace, reserveSpaceSizeInBytes);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnCreatePersistentRNNPlan(
-    cudnnRNNDescriptor_t rnnDesc, const int minibatch,
-    const cudnnDataType_t dataType, cudnnPersistentRNNPlan_t *plan) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnRNNDescriptor_t, const int,
-                                               const cudnnDataType_t,
-                                               cudnnPersistentRNNPlan_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnCreatePersistentRNNPlan");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(rnnDesc, minibatch, dataType, plan);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetPersistentRNNPlan(
-    cudnnRNNDescriptor_t rnnDesc, cudnnPersistentRNNPlan_t plan) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnRNNDescriptor_t,
-                                               cudnnPersistentRNNPlan_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetPersistentRNNPlan");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(rnnDesc, plan);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnDestroyPersistentRNNPlan(cudnnPersistentRNNPlan_t plan) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnPersistentRNNPlan_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDestroyPersistentRNNPlan");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(plan);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetRNNDescriptor(
-    cudnnHandle_t handle, cudnnRNNDescriptor_t rnnDesc, const int hiddenSize,
-    const int numLayers,
-    cudnnDropoutDescriptor_t
-        dropoutDesc, /* Between layers, not between recurrent steps. */
-    cudnnRNNInputMode_t inputMode, cudnnDirectionMode_t direction,
-    cudnnRNNMode_t mode, cudnnRNNAlgo_t algo, cudnnDataType_t dataType) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, cudnnRNNDescriptor_t, const int, const int,
-      cudnnDropoutDescriptor_t, cudnnRNNInputMode_t, cudnnDirectionMode_t,
-      cudnnRNNMode_t, cudnnRNNAlgo_t, cudnnDataType_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetRNNDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, rnnDesc, hiddenSize, numLayers, dropoutDesc,
-                  inputMode, direction, mode, algo, dataType);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnSetRNNProjectionLayers(cudnnHandle_t handle, cudnnRNNDescriptor_t rnnDesc,
-                            const int recProjSize, const int outProjSize) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, cudnnRNNDescriptor_t, const int, const int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetRNNProjectionLayers");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, rnnDesc, recProjSize, outProjSize);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetRNNProjectionLayers(
-    cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc, int *recProjSize,
-    int *outProjSize) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnRNNDescriptor_t, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetRNNProjectionLayers");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, rnnDesc, recProjSize, outProjSize);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetRNNAlgorithmDescriptor(
-    cudnnHandle_t handle, cudnnRNNDescriptor_t rnnDesc,
-    cudnnAlgorithmDescriptor_t algoDesc) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, cudnnRNNDescriptor_t, cudnnAlgorithmDescriptor_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetRNNAlgorithmDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, rnnDesc, algoDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetRNNDescriptor(
-    cudnnHandle_t handle, cudnnRNNDescriptor_t rnnDesc, int *hiddenSize,
-    int *numLayers, cudnnDropoutDescriptor_t *dropoutDesc,
-    cudnnRNNInputMode_t *inputMode, cudnnDirectionMode_t *direction,
-    cudnnRNNMode_t *mode, cudnnRNNAlgo_t *algo, cudnnDataType_t *dataType) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, cudnnRNNDescriptor_t, int *, int *,
-      cudnnDropoutDescriptor_t *, cudnnRNNInputMode_t *, cudnnDirectionMode_t *,
-      cudnnRNNMode_t *, cudnnRNNAlgo_t *, cudnnDataType_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetRNNDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, rnnDesc, hiddenSize, numLayers, dropoutDesc,
-                  inputMode, direction, mode, algo, dataType);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnSetRNNMatrixMathType(cudnnRNNDescriptor_t rnnDesc, cudnnMathType_t mType) {
-  using FuncPtr =
-      cudnnStatus_t(CUDNNWINAPI *)(cudnnRNNDescriptor_t, cudnnMathType_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetRNNMatrixMathType");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(rnnDesc, mType);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetRNNMatrixMathType(
-    cudnnRNNDescriptor_t rnnDesc, cudnnMathType_t *mType) {
-  using FuncPtr =
-      cudnnStatus_t(CUDNNWINAPI *)(cudnnRNNDescriptor_t, cudnnMathType_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetRNNMatrixMathType");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(rnnDesc, mType);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetRNNWorkspaceSize(
-    cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc,
-    const int seqLength, const cudnnTensorDescriptor_t *xDesc,
-    size_t *sizeInBytes) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnRNNDescriptor_t, const int,
-      const cudnnTensorDescriptor_t *, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetRNNWorkspaceSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, rnnDesc, seqLength, xDesc, sizeInBytes);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetRNNTrainingReserveSize(
-    cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc,
-    const int seqLength, const cudnnTensorDescriptor_t *xDesc,
-    size_t *sizeInBytes) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnRNNDescriptor_t, const int,
-      const cudnnTensorDescriptor_t *, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetRNNTrainingReserveSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, rnnDesc, seqLength, xDesc, sizeInBytes);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnGetRNNParamsSize(cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc,
-                      const cudnnTensorDescriptor_t xDesc, size_t *sizeInBytes,
-                      cudnnDataType_t dataType) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnRNNDescriptor_t, const cudnnTensorDescriptor_t,
-      size_t *, cudnnDataType_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetRNNParamsSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, rnnDesc, xDesc, sizeInBytes, dataType);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetRNNLinLayerMatrixParams(
-    cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc,
-    const int pseudoLayer, const cudnnTensorDescriptor_t xDesc,
-    const cudnnFilterDescriptor_t wDesc, const void *w, const int linLayerID,
-    cudnnFilterDescriptor_t linLayerMatDesc, void **linLayerMat) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnRNNDescriptor_t, const int,
-      const cudnnTensorDescriptor_t, const cudnnFilterDescriptor_t,
-      const void *, const int, cudnnFilterDescriptor_t, void **);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetRNNLinLayerMatrixParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, rnnDesc, pseudoLayer, xDesc, wDesc, w, linLayerID,
-                  linLayerMatDesc, linLayerMat);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetRNNLinLayerBiasParams(
-    cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc,
-    const int pseudoLayer, const cudnnTensorDescriptor_t xDesc,
-    const cudnnFilterDescriptor_t wDesc, const void *w, const int linLayerID,
-    cudnnFilterDescriptor_t linLayerBiasDesc, void **linLayerBias) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnRNNDescriptor_t, const int,
-      const cudnnTensorDescriptor_t, const cudnnFilterDescriptor_t,
-      const void *, const int, cudnnFilterDescriptor_t, void **);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetRNNLinLayerBiasParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, rnnDesc, pseudoLayer, xDesc, wDesc, w, linLayerID,
-                  linLayerBiasDesc, linLayerBias);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnRNNForwardInference(
-    cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc,
-    const int seqLength, const cudnnTensorDescriptor_t *xDesc, const void *x,
-    const cudnnTensorDescriptor_t hxDesc, const void *hx,
-    const cudnnTensorDescriptor_t cxDesc, const void *cx,
-    const cudnnFilterDescriptor_t wDesc, const void *w,
-    const cudnnTensorDescriptor_t *yDesc, void *y,
-    const cudnnTensorDescriptor_t hyDesc, void *hy,
-    const cudnnTensorDescriptor_t cyDesc, void *cy, void *workspace,
-    size_t workSpaceSizeInBytes) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnRNNDescriptor_t, const int,
-      const cudnnTensorDescriptor_t *, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnFilterDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t *, void *, const cudnnTensorDescriptor_t,
-      void *, const cudnnTensorDescriptor_t, void *, void *, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnRNNForwardInference");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, rnnDesc, seqLength, xDesc, x, hxDesc, hx, cxDesc, cx,
-                  wDesc, w, yDesc, y, hyDesc, hy, cyDesc, cy, workspace,
-                  workSpaceSizeInBytes);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnRNNForwardTraining(
-    cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc,
-    const int seqLength, const cudnnTensorDescriptor_t *xDesc, const void *x,
-    const cudnnTensorDescriptor_t hxDesc, const void *hx,
-    const cudnnTensorDescriptor_t cxDesc, const void *cx,
-    const cudnnFilterDescriptor_t wDesc, const void *w,
-    const cudnnTensorDescriptor_t *yDesc, void *y,
-    const cudnnTensorDescriptor_t hyDesc, void *hy,
-    const cudnnTensorDescriptor_t cyDesc, void *cy, void *workspace,
-    size_t workSpaceSizeInBytes, void *reserveSpace,
-    size_t reserveSpaceSizeInBytes) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnRNNDescriptor_t, const int,
-      const cudnnTensorDescriptor_t *, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnFilterDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t *, void *, const cudnnTensorDescriptor_t,
-      void *, const cudnnTensorDescriptor_t, void *, void *, size_t, void *,
-      size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnRNNForwardTraining");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, rnnDesc, seqLength, xDesc, x, hxDesc, hx, cxDesc, cx,
-                  wDesc, w, yDesc, y, hyDesc, hy, cyDesc, cy, workspace,
-                  workSpaceSizeInBytes, reserveSpace, reserveSpaceSizeInBytes);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnRNNBackwardData(cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc,
-                     const int seqLength, const cudnnTensorDescriptor_t *yDesc,
-                     const void *y, const cudnnTensorDescriptor_t *dyDesc,
-                     const void *dy, const cudnnTensorDescriptor_t dhyDesc,
-                     const void *dhy, const cudnnTensorDescriptor_t dcyDesc,
-                     const void *dcy, const cudnnFilterDescriptor_t wDesc,
-                     const void *w, const cudnnTensorDescriptor_t hxDesc,
-                     const void *hx, const cudnnTensorDescriptor_t cxDesc,
-                     const void *cx, const cudnnTensorDescriptor_t *dxDesc,
-                     void *dx, const cudnnTensorDescriptor_t dhxDesc, void *dhx,
-                     const cudnnTensorDescriptor_t dcxDesc, void *dcx,
-                     void *workspace, size_t workSpaceSizeInBytes,
-                     void *reserveSpace, size_t reserveSpaceSizeInBytes) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnRNNDescriptor_t, const int,
-      const cudnnTensorDescriptor_t *, const void *,
-      const cudnnTensorDescriptor_t *, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnFilterDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t *, void *, const cudnnTensorDescriptor_t,
-      void *, const cudnnTensorDescriptor_t, void *, void *, size_t, void *,
-      size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnRNNBackwardData");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, rnnDesc, seqLength, yDesc, y, dyDesc, dy, dhyDesc,
-                  dhy, dcyDesc, dcy, wDesc, w, hxDesc, hx, cxDesc, cx, dxDesc,
-                  dx, dhxDesc, dhx, dcxDesc, dcx, workspace,
-                  workSpaceSizeInBytes, reserveSpace, reserveSpaceSizeInBytes);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnRNNBackwardWeights(
-    cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc,
-    const int seqLength, const cudnnTensorDescriptor_t *xDesc, const void *x,
-    const cudnnTensorDescriptor_t hxDesc, const void *hx,
-    const cudnnTensorDescriptor_t *yDesc, const void *y, const void *workspace,
-    size_t workSpaceSizeInBytes, const cudnnFilterDescriptor_t dwDesc, void *dw,
-    const void *reserveSpace, size_t reserveSpaceSizeInBytes) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnRNNDescriptor_t, const int,
-      const cudnnTensorDescriptor_t *, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t *, const void *, const void *, size_t,
-      const cudnnFilterDescriptor_t, void *, const void *, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnRNNBackwardWeights");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, rnnDesc, seqLength, xDesc, x, hxDesc, hx, yDesc, y,
-                  workspace, workSpaceSizeInBytes, dwDesc, dw, reserveSpace,
-                  reserveSpaceSizeInBytes);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnCreateCTCLossDescriptor(cudnnCTCLossDescriptor_t *ctcLossDesc) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnCTCLossDescriptor_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnCreateCTCLossDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(ctcLossDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetCTCLossDescriptor(
-    cudnnCTCLossDescriptor_t ctcLossDesc, cudnnDataType_t compType) {
-  using FuncPtr =
-      cudnnStatus_t(CUDNNWINAPI *)(cudnnCTCLossDescriptor_t, cudnnDataType_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetCTCLossDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(ctcLossDesc, compType);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetCTCLossDescriptor(
-    cudnnCTCLossDescriptor_t ctcLossDesc, cudnnDataType_t *compType) {
-  using FuncPtr =
-      cudnnStatus_t(CUDNNWINAPI *)(cudnnCTCLossDescriptor_t, cudnnDataType_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetCTCLossDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(ctcLossDesc, compType);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnDestroyCTCLossDescriptor(cudnnCTCLossDescriptor_t ctcLossDesc) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnCTCLossDescriptor_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDestroyCTCLossDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(ctcLossDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnCTCLoss(
-    cudnnHandle_t handle,
-    const cudnnTensorDescriptor_t
-        probsDesc, /* Tensor descriptor for probabilities, the dimensions are
-                      T,N,A (T is the timing steps, N is the mini batch size, A
-                      is the alphabet size)  */
-    const void *probs,       /* probabilities after softmax, in GPU memory */
-    const int *labels,       /* labels, in CPU memory */
-    const int *labelLengths, /* the length of each label, in CPU memory */
-    const int *inputLengths, /* the lengths of timing steps in each batch, in
-                                CPU memory */
-    void *costs,             /* the returned costs of CTC, in GPU memory */
-    const cudnnTensorDescriptor_t
-        gradientsDesc, /* Tensor descriptor for gradients, the dimensions are
-                          T,N,A */
-    const void *gradients,   /* the returned CTC gradients, in GPU memory, to
-                                compute costs only, set it to NULL */
-    cudnnCTCLossAlgo_t algo, /* algorithm selected, supported now 0 and 1 */
-    cudnnCTCLossDescriptor_t ctcLossDesc,
-    void *workspace, /* pointer to the workspace, in GPU memory */
-    size_t workSpaceSizeInBytes) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnTensorDescriptor_t, const void *, const int *,
-      const int *, const int *, void *, const cudnnTensorDescriptor_t,
-      const void *, cudnnCTCLossAlgo_t, cudnnCTCLossDescriptor_t, void *,
-      size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnCTCLoss");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, probsDesc, probs, labels, labelLengths, inputLengths,
-                  costs, gradientsDesc, gradients, algo, ctcLossDesc, workspace,
-                  workSpaceSizeInBytes);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetCTCLossWorkspaceSize(
-    cudnnHandle_t handle,
-    const cudnnTensorDescriptor_t
-        probsDesc, /* Tensor descriptor for probabilities, the dimensions are
-                      T,N,A (T is the timing steps, N is the mini batch size, A
-                      is the alphabet size) */
-    const cudnnTensorDescriptor_t
-        gradientsDesc, /* Tensor descriptor for gradients, the dimensions are
-                          T,N,A. To compute costs only, set it to NULL */
-    const int *labels, /* labels, in CPU memory */
-    const int *labelLengths, /* the length of each label, in CPU memory */
-    const int *inputLengths, /* the lengths of timing steps in each batch, in
-                                CPU memory */
-    cudnnCTCLossAlgo_t algo, /* algorithm selected, supported now 0 and 1 */
-    cudnnCTCLossDescriptor_t ctcLossDesc, size_t *sizeInBytes) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnTensorDescriptor_t,
-      const cudnnTensorDescriptor_t, const int *, const int *, const int *,
-      cudnnCTCLossAlgo_t, cudnnCTCLossDescriptor_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetCTCLossWorkspaceSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, probsDesc, gradientsDesc, labels, labelLengths,
-                  inputLengths, algo, ctcLossDesc, sizeInBytes);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnCreateAlgorithmDescriptor(cudnnAlgorithmDescriptor_t *algoDesc) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnAlgorithmDescriptor_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnCreateAlgorithmDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(algoDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetAlgorithmDescriptor(
-    cudnnAlgorithmDescriptor_t algoDesc, cudnnAlgorithm_t algorithm) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnAlgorithmDescriptor_t,
-                                               cudnnAlgorithm_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetAlgorithmDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(algoDesc, algorithm);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetAlgorithmDescriptor(
-    const cudnnAlgorithmDescriptor_t algoDesc, cudnnAlgorithm_t *algorithm) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(const cudnnAlgorithmDescriptor_t,
-                                               cudnnAlgorithm_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetAlgorithmDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(algoDesc, algorithm);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnCopyAlgorithmDescriptor(
-    const cudnnAlgorithmDescriptor_t src, cudnnAlgorithmDescriptor_t dest) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(const cudnnAlgorithmDescriptor_t,
-                                               cudnnAlgorithmDescriptor_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnCopyAlgorithmDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(src, dest);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnDestroyAlgorithmDescriptor(cudnnAlgorithmDescriptor_t algoDesc) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnAlgorithmDescriptor_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDestroyAlgorithmDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(algoDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnCreateAlgorithmPerformance(
-    cudnnAlgorithmPerformance_t *algoPerf, int numberToCreate) {
-  using FuncPtr =
-      cudnnStatus_t(CUDNNWINAPI *)(cudnnAlgorithmPerformance_t *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnCreateAlgorithmPerformance");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(algoPerf, numberToCreate);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetAlgorithmPerformance(
-    cudnnAlgorithmPerformance_t algoPerf, cudnnAlgorithmDescriptor_t algoDesc,
-    cudnnStatus_t status, float time, size_t memory) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnAlgorithmPerformance_t,
-                                               cudnnAlgorithmDescriptor_t,
-                                               cudnnStatus_t, float, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetAlgorithmPerformance");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(algoPerf, algoDesc, status, time, memory);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetAlgorithmPerformance(
-    const cudnnAlgorithmPerformance_t algoPerf,
-    cudnnAlgorithmDescriptor_t *algoDesc, cudnnStatus_t *status, float *time,
-    size_t *memory) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      const cudnnAlgorithmPerformance_t, cudnnAlgorithmDescriptor_t *,
-      cudnnStatus_t *, float *, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetAlgorithmPerformance");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(algoPerf, algoDesc, status, time, memory);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnDestroyAlgorithmPerformance(
-    cudnnAlgorithmPerformance_t *algoPerf, int numberToDestroy) {
-  using FuncPtr =
-      cudnnStatus_t(CUDNNWINAPI *)(cudnnAlgorithmPerformance_t *, int);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnDestroyAlgorithmPerformance");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(algoPerf, numberToDestroy);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetAlgorithmSpaceSize(
-    cudnnHandle_t handle, cudnnAlgorithmDescriptor_t algoDesc,
-    size_t *algoSpaceSizeInBytes) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, cudnnAlgorithmDescriptor_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetAlgorithmSpaceSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, algoDesc, algoSpaceSizeInBytes);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnSaveAlgorithm(cudnnHandle_t handle, cudnnAlgorithmDescriptor_t algoDesc,
-                   void *algoSpace, size_t algoSpaceSizeInBytes) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, cudnnAlgorithmDescriptor_t, void *, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSaveAlgorithm");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, algoDesc, algoSpace, algoSpaceSizeInBytes);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnRestoreAlgorithm(
-    cudnnHandle_t handle, void *algoSpace, size_t algoSpaceSizeInBytes,
-    cudnnAlgorithmDescriptor_t algoDesc) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnHandle_t, void *, size_t,
-                                               cudnnAlgorithmDescriptor_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnRestoreAlgorithm");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, algoSpace, algoSpaceSizeInBytes, algoDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetCallback(unsigned mask, void *udata,
-                                           cudnnCallback_t fptr) {
-  using FuncPtr =
-      cudnnStatus_t(CUDNNWINAPI *)(unsigned int, void *, cudnnCallback_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetCallback");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(mask, udata, fptr);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetCallback(unsigned *mask, void **udata,
-                                           cudnnCallback_t *fptr) {
-  using FuncPtr =
-      cudnnStatus_t(CUDNNWINAPI *)(unsigned int *, void **, cudnnCallback_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetCallback");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(mask, udata, fptr);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetRNNDescriptor_v6(
-    cudnnHandle_t handle, cudnnRNNDescriptor_t rnnDesc, const int hiddenSize,
-    const int numLayers, cudnnDropoutDescriptor_t dropoutDesc,
-    cudnnRNNInputMode_t inputMode, cudnnDirectionMode_t direction,
-    cudnnRNNMode_t mode, cudnnRNNAlgo_t algo, cudnnDataType_t dataType) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, cudnnRNNDescriptor_t, const int, const int,
-      cudnnDropoutDescriptor_t, cudnnRNNInputMode_t, cudnnDirectionMode_t,
-      cudnnRNNMode_t, cudnnRNNAlgo_t, cudnnDataType_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetRNNDescriptor_v6");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, rnnDesc, hiddenSize, numLayers, dropoutDesc,
-                  inputMode, direction, mode, algo, dataType);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetRNNDescriptor_v5(
-    cudnnRNNDescriptor_t rnnDesc, int hiddenSize, int numLayers,
-    cudnnDropoutDescriptor_t dropoutDesc, cudnnRNNInputMode_t inputMode,
-    cudnnDirectionMode_t direction, cudnnRNNMode_t mode,
-    cudnnDataType_t dataType) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnRNNDescriptor_t, int, int, cudnnDropoutDescriptor_t,
-      cudnnRNNInputMode_t, cudnnDirectionMode_t, cudnnRNNMode_t,
-      cudnnDataType_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetRNNDescriptor_v5");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(rnnDesc, hiddenSize, numLayers, dropoutDesc, inputMode,
-                  direction, mode, dataType);
-}
-
-}  // extern "C"
diff --git a/third_party/xla/third_party/tsl/tsl/cuda/cudnn_7_3.inc b/third_party/xla/third_party/tsl/tsl/cuda/cudnn_7_3.inc
deleted file mode 100644
index f1c25c74d0c481..00000000000000
--- a/third_party/xla/third_party/tsl/tsl/cuda/cudnn_7_3.inc
+++ /dev/null
@@ -1,2585 +0,0 @@
-// Auto-generated, do not edit.
-
-extern "C" {
-
-size_t CUDNNWINAPI cudnnGetVersion(void) {
-  using FuncPtr = size_t(CUDNNWINAPI *)();
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetVersion");
-  if (!func_ptr) return 0;
-  return func_ptr();
-}
-
-size_t CUDNNWINAPI cudnnGetCudartVersion(void) {
-  using FuncPtr = size_t(CUDNNWINAPI *)();
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetCudartVersion");
-  if (!func_ptr) return 0;
-  return func_ptr();
-}
-
-const char *CUDNNWINAPI cudnnGetErrorString(cudnnStatus_t status) {
-  using FuncPtr = const char *(CUDNNWINAPI *)(cudnnStatus_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetErrorString");
-  if (!func_ptr) return "cudnnGetErrorString symbol not found.";
-  return func_ptr(status);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnQueryRuntimeError(cudnnHandle_t handle,
-                                                 cudnnStatus_t *rstatus,
-                                                 cudnnErrQueryMode_t mode,
-                                                 cudnnRuntimeTag_t *tag) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, cudnnStatus_t *, cudnnErrQueryMode_t, cudnnRuntimeTag_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnQueryRuntimeError");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, rstatus, mode, tag);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetProperty(libraryPropertyType type,
-                                           int *value) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(libraryPropertyType, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetProperty");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(type, value);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnCreate(cudnnHandle_t *handle) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnHandle_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnCreate");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnDestroy(cudnnHandle_t handle) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnHandle_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDestroy");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetStream(cudnnHandle_t handle,
-                                         cudaStream_t streamId) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnHandle_t, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetStream");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, streamId);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetStream(cudnnHandle_t handle,
-                                         cudaStream_t *streamId) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnHandle_t, cudaStream_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetStream");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, streamId);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnCreateTensorDescriptor(cudnnTensorDescriptor_t *tensorDesc) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnTensorDescriptor_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnCreateTensorDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(tensorDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetTensor4dDescriptor(
-    cudnnTensorDescriptor_t tensorDesc, cudnnTensorFormat_t format,
-    cudnnDataType_t dataType, /* image data type */
-    int n,                    /* number of inputs (batch size) */
-    int c,                    /* number of input feature maps */
-    int h,                    /* height of input section */
-    int w) {
-  using FuncPtr =
-      cudnnStatus_t(CUDNNWINAPI *)(cudnnTensorDescriptor_t, cudnnTensorFormat_t,
-                                   cudnnDataType_t, int, int, int, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetTensor4dDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(tensorDesc, format, dataType, n, c, h, w);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetTensor4dDescriptorEx(
-    cudnnTensorDescriptor_t tensorDesc,
-    cudnnDataType_t dataType, /* image data type */
-    int n,                    /* number of inputs (batch size) */
-    int c,                    /* number of input feature maps */
-    int h,                    /* height of input section */
-    int w,                    /* width of input section */
-    int nStride, int cStride, int hStride, int wStride) {
-  using FuncPtr =
-      cudnnStatus_t(CUDNNWINAPI *)(cudnnTensorDescriptor_t, cudnnDataType_t,
-                                   int, int, int, int, int, int, int, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetTensor4dDescriptorEx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(tensorDesc, dataType, n, c, h, w, nStride, cStride, hStride,
-                  wStride);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetTensor4dDescriptor(
-    const cudnnTensorDescriptor_t tensorDesc,
-    cudnnDataType_t *dataType, /* image data type */
-    int *n,                    /* number of inputs (batch size) */
-    int *c,                    /* number of input feature maps  */
-    int *h,                    /* height of input section */
-    int *w,                    /* width of input section */
-    int *nStride, int *cStride, int *hStride, int *wStride) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      const cudnnTensorDescriptor_t, cudnnDataType_t *, int *, int *, int *,
-      int *, int *, int *, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetTensor4dDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(tensorDesc, dataType, n, c, h, w, nStride, cStride, hStride,
-                  wStride);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetTensorNdDescriptor(
-    cudnnTensorDescriptor_t tensorDesc, cudnnDataType_t dataType, int nbDims,
-    const int dimA[], const int strideA[]) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnTensorDescriptor_t, cudnnDataType_t, int, const int[], const int[]);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetTensorNdDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(tensorDesc, dataType, nbDims, dimA, strideA);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetTensorNdDescriptorEx(
-    cudnnTensorDescriptor_t tensorDesc, cudnnTensorFormat_t format,
-    cudnnDataType_t dataType, int nbDims, const int dimA[]) {
-  using FuncPtr =
-      cudnnStatus_t(CUDNNWINAPI *)(cudnnTensorDescriptor_t, cudnnTensorFormat_t,
-                                   cudnnDataType_t, int, const int[]);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetTensorNdDescriptorEx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(tensorDesc, format, dataType, nbDims, dimA);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetTensorNdDescriptor(
-    const cudnnTensorDescriptor_t tensorDesc, int nbDimsRequested,
-    cudnnDataType_t *dataType, int *nbDims, int dimA[], int strideA[]) {
-  using FuncPtr =
-      cudnnStatus_t(CUDNNWINAPI *)(const cudnnTensorDescriptor_t, int,
-                                   cudnnDataType_t *, int *, int[], int[]);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetTensorNdDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(tensorDesc, nbDimsRequested, dataType, nbDims, dimA, strideA);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetTensorSizeInBytes(
-    const cudnnTensorDescriptor_t tensorDesc, size_t *size) {
-  using FuncPtr =
-      cudnnStatus_t(CUDNNWINAPI *)(const cudnnTensorDescriptor_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetTensorSizeInBytes");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(tensorDesc, size);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnDestroyTensorDescriptor(cudnnTensorDescriptor_t tensorDesc) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnTensorDescriptor_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDestroyTensorDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(tensorDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnTransformTensor(
-    cudnnHandle_t handle, const void *alpha,
-    const cudnnTensorDescriptor_t xDesc, const void *x, const void *beta,
-    const cudnnTensorDescriptor_t yDesc, void *y) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const void *, const cudnnTensorDescriptor_t, const void *,
-      const void *, const cudnnTensorDescriptor_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnTransformTensor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, alpha, xDesc, x, beta, yDesc, y);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnAddTensor(cudnnHandle_t handle,
-                                         const void *alpha,
-                                         const cudnnTensorDescriptor_t aDesc,
-                                         const void *A, const void *beta,
-                                         const cudnnTensorDescriptor_t cDesc,
-                                         void *C) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const void *, const cudnnTensorDescriptor_t, const void *,
-      const void *, const cudnnTensorDescriptor_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnAddTensor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, alpha, aDesc, A, beta, cDesc, C);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnCreateOpTensorDescriptor(cudnnOpTensorDescriptor_t *opTensorDesc) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnOpTensorDescriptor_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnCreateOpTensorDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(opTensorDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetOpTensorDescriptor(
-    cudnnOpTensorDescriptor_t opTensorDesc, cudnnOpTensorOp_t opTensorOp,
-    cudnnDataType_t opTensorCompType, cudnnNanPropagation_t opTensorNanOpt) {
-  using FuncPtr =
-      cudnnStatus_t(CUDNNWINAPI *)(cudnnOpTensorDescriptor_t, cudnnOpTensorOp_t,
-                                   cudnnDataType_t, cudnnNanPropagation_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetOpTensorDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(opTensorDesc, opTensorOp, opTensorCompType, opTensorNanOpt);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetOpTensorDescriptor(
-    const cudnnOpTensorDescriptor_t opTensorDesc, cudnnOpTensorOp_t *opTensorOp,
-    cudnnDataType_t *opTensorCompType, cudnnNanPropagation_t *opTensorNanOpt) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      const cudnnOpTensorDescriptor_t, cudnnOpTensorOp_t *, cudnnDataType_t *,
-      cudnnNanPropagation_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetOpTensorDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(opTensorDesc, opTensorOp, opTensorCompType, opTensorNanOpt);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnDestroyOpTensorDescriptor(cudnnOpTensorDescriptor_t opTensorDesc) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnOpTensorDescriptor_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDestroyOpTensorDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(opTensorDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnOpTensor(
-    cudnnHandle_t handle, const cudnnOpTensorDescriptor_t opTensorDesc,
-    const void *alpha1, const cudnnTensorDescriptor_t aDesc, const void *A,
-    const void *alpha2, const cudnnTensorDescriptor_t bDesc, const void *B,
-    const void *beta, const cudnnTensorDescriptor_t cDesc, void *C) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnOpTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *, const void *,
-      const cudnnTensorDescriptor_t, const void *, const void *,
-      const cudnnTensorDescriptor_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnOpTensor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, opTensorDesc, alpha1, aDesc, A, alpha2, bDesc, B,
-                  beta, cDesc, C);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnCreateReduceTensorDescriptor(
-    cudnnReduceTensorDescriptor_t *reduceTensorDesc) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnReduceTensorDescriptor_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnCreateReduceTensorDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(reduceTensorDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetReduceTensorDescriptor(
-    cudnnReduceTensorDescriptor_t reduceTensorDesc,
-    cudnnReduceTensorOp_t reduceTensorOp, cudnnDataType_t reduceTensorCompType,
-    cudnnNanPropagation_t reduceTensorNanOpt,
-    cudnnReduceTensorIndices_t reduceTensorIndices,
-    cudnnIndicesType_t reduceTensorIndicesType) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnReduceTensorDescriptor_t, cudnnReduceTensorOp_t, cudnnDataType_t,
-      cudnnNanPropagation_t, cudnnReduceTensorIndices_t, cudnnIndicesType_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetReduceTensorDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(reduceTensorDesc, reduceTensorOp, reduceTensorCompType,
-                  reduceTensorNanOpt, reduceTensorIndices,
-                  reduceTensorIndicesType);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetReduceTensorDescriptor(
-    const cudnnReduceTensorDescriptor_t reduceTensorDesc,
-    cudnnReduceTensorOp_t *reduceTensorOp,
-    cudnnDataType_t *reduceTensorCompType,
-    cudnnNanPropagation_t *reduceTensorNanOpt,
-    cudnnReduceTensorIndices_t *reduceTensorIndices,
-    cudnnIndicesType_t *reduceTensorIndicesType) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      const cudnnReduceTensorDescriptor_t, cudnnReduceTensorOp_t *,
-      cudnnDataType_t *, cudnnNanPropagation_t *, cudnnReduceTensorIndices_t *,
-      cudnnIndicesType_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetReduceTensorDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(reduceTensorDesc, reduceTensorOp, reduceTensorCompType,
-                  reduceTensorNanOpt, reduceTensorIndices,
-                  reduceTensorIndicesType);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnDestroyReduceTensorDescriptor(
-    cudnnReduceTensorDescriptor_t reduceTensorDesc) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnReduceTensorDescriptor_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnDestroyReduceTensorDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(reduceTensorDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetReductionIndicesSize(
-    cudnnHandle_t handle, const cudnnReduceTensorDescriptor_t reduceTensorDesc,
-    const cudnnTensorDescriptor_t aDesc, const cudnnTensorDescriptor_t cDesc,
-    size_t *sizeInBytes) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnReduceTensorDescriptor_t,
-      const cudnnTensorDescriptor_t, const cudnnTensorDescriptor_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetReductionIndicesSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, reduceTensorDesc, aDesc, cDesc, sizeInBytes);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetReductionWorkspaceSize(
-    cudnnHandle_t handle, const cudnnReduceTensorDescriptor_t reduceTensorDesc,
-    const cudnnTensorDescriptor_t aDesc, const cudnnTensorDescriptor_t cDesc,
-    size_t *sizeInBytes) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnReduceTensorDescriptor_t,
-      const cudnnTensorDescriptor_t, const cudnnTensorDescriptor_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetReductionWorkspaceSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, reduceTensorDesc, aDesc, cDesc, sizeInBytes);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnReduceTensor(
-    cudnnHandle_t handle, const cudnnReduceTensorDescriptor_t reduceTensorDesc,
-    void *indices, size_t indicesSizeInBytes, void *workspace,
-    size_t workspaceSizeInBytes, const void *alpha,
-    const cudnnTensorDescriptor_t aDesc, const void *A, const void *beta,
-    const cudnnTensorDescriptor_t cDesc, void *C) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnReduceTensorDescriptor_t, void *, size_t,
-      void *, size_t, const void *, const cudnnTensorDescriptor_t, const void *,
-      const void *, const cudnnTensorDescriptor_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnReduceTensor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, reduceTensorDesc, indices, indicesSizeInBytes,
-                  workspace, workspaceSizeInBytes, alpha, aDesc, A, beta, cDesc,
-                  C);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetTensor(cudnnHandle_t handle,
-                                         const cudnnTensorDescriptor_t yDesc,
-                                         void *y, const void *valuePtr) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnTensorDescriptor_t, void *, const void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetTensor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, yDesc, y, valuePtr);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnScaleTensor(cudnnHandle_t handle,
-                                           const cudnnTensorDescriptor_t yDesc,
-                                           void *y, const void *alpha) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnTensorDescriptor_t, void *, const void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnScaleTensor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, yDesc, y, alpha);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnCreateFilterDescriptor(cudnnFilterDescriptor_t *filterDesc) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnFilterDescriptor_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnCreateFilterDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(filterDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetFilter4dDescriptor(
-    cudnnFilterDescriptor_t filterDesc,
-    cudnnDataType_t dataType,          /* image data type */
-    cudnnTensorFormat_t format, int k, /* number of output feature maps */
-    int c,                             /* number of input feature maps */
-    int h,                             /* height of each input filter */
-    int w) {
-  using FuncPtr =
-      cudnnStatus_t(CUDNNWINAPI *)(cudnnFilterDescriptor_t, cudnnDataType_t,
-                                   cudnnTensorFormat_t, int, int, int, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetFilter4dDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(filterDesc, dataType, format, k, c, h, w);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetFilter4dDescriptor(
-    const cudnnFilterDescriptor_t filterDesc,
-    cudnnDataType_t *dataType,           /* image data type */
-    cudnnTensorFormat_t *format, int *k, /* number of output feature maps */
-    int *c,                              /* number of input feature maps */
-    int *h,                              /* height of each input filter */
-    int *w) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      const cudnnFilterDescriptor_t, cudnnDataType_t *, cudnnTensorFormat_t *,
-      int *, int *, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetFilter4dDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(filterDesc, dataType, format, k, c, h, w);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetFilterNdDescriptor(
-    cudnnFilterDescriptor_t filterDesc,
-    cudnnDataType_t dataType, /* image data type */
-    cudnnTensorFormat_t format, int nbDims, const int filterDimA[]) {
-  using FuncPtr =
-      cudnnStatus_t(CUDNNWINAPI *)(cudnnFilterDescriptor_t, cudnnDataType_t,
-                                   cudnnTensorFormat_t, int, const int[]);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetFilterNdDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(filterDesc, dataType, format, nbDims, filterDimA);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetFilterNdDescriptor(
-    const cudnnFilterDescriptor_t filterDesc, int nbDimsRequested,
-    cudnnDataType_t *dataType, /* image data type */
-    cudnnTensorFormat_t *format, int *nbDims, int filterDimA[]) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      const cudnnFilterDescriptor_t, int, cudnnDataType_t *,
-      cudnnTensorFormat_t *, int *, int[]);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetFilterNdDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(filterDesc, nbDimsRequested, dataType, format, nbDims,
-                  filterDimA);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnDestroyFilterDescriptor(cudnnFilterDescriptor_t filterDesc) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnFilterDescriptor_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDestroyFilterDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(filterDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnCreateConvolutionDescriptor(cudnnConvolutionDescriptor_t *convDesc) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnConvolutionDescriptor_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnCreateConvolutionDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(convDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetConvolutionMathType(
-    cudnnConvolutionDescriptor_t convDesc, cudnnMathType_t mathType) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnConvolutionDescriptor_t,
-                                               cudnnMathType_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetConvolutionMathType");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(convDesc, mathType);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionMathType(
-    cudnnConvolutionDescriptor_t convDesc, cudnnMathType_t *mathType) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnConvolutionDescriptor_t,
-                                               cudnnMathType_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetConvolutionMathType");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(convDesc, mathType);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetConvolutionGroupCount(
-    cudnnConvolutionDescriptor_t convDesc, int groupCount) {
-  using FuncPtr =
-      cudnnStatus_t(CUDNNWINAPI *)(cudnnConvolutionDescriptor_t, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetConvolutionGroupCount");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(convDesc, groupCount);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionGroupCount(
-    cudnnConvolutionDescriptor_t convDesc, int *groupCount) {
-  using FuncPtr =
-      cudnnStatus_t(CUDNNWINAPI *)(cudnnConvolutionDescriptor_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetConvolutionGroupCount");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(convDesc, groupCount);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetConvolution2dDescriptor(
-    cudnnConvolutionDescriptor_t convDesc, int pad_h, /* zero-padding height */
-    int pad_w,                                        /* zero-padding width */
-    int u,          /* vertical filter stride */
-    int v,          /* horizontal filter stride */
-    int dilation_h, /* filter dilation in the vertical dimension */
-    int dilation_w, /* filter dilation in the horizontal dimension */
-    cudnnConvolutionMode_t mode, cudnnDataType_t computeType) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnConvolutionDescriptor_t, int, int, int, int, int, int,
-      cudnnConvolutionMode_t, cudnnDataType_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetConvolution2dDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(convDesc, pad_h, pad_w, u, v, dilation_h, dilation_w, mode,
-                  computeType);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetConvolution2dDescriptor(
-    const cudnnConvolutionDescriptor_t convDesc,
-    int *pad_h,      /* zero-padding height */
-    int *pad_w,      /* zero-padding width */
-    int *u,          /* vertical filter stride */
-    int *v,          /* horizontal filter stride */
-    int *dilation_h, /* filter dilation in the vertical dimension */
-    int *dilation_w, /* filter dilation in the horizontal dimension */
-    cudnnConvolutionMode_t *mode, cudnnDataType_t *computeType) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      const cudnnConvolutionDescriptor_t, int *, int *, int *, int *, int *,
-      int *, cudnnConvolutionMode_t *, cudnnDataType_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetConvolution2dDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(convDesc, pad_h, pad_w, u, v, dilation_h, dilation_w, mode,
-                  computeType);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetConvolution2dForwardOutputDim(
-    const cudnnConvolutionDescriptor_t convDesc,
-    const cudnnTensorDescriptor_t inputTensorDesc,
-    const cudnnFilterDescriptor_t filterDesc, int *n, int *c, int *h, int *w) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t,
-      const cudnnFilterDescriptor_t, int *, int *, int *, int *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnGetConvolution2dForwardOutputDim");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(convDesc, inputTensorDesc, filterDesc, n, c, h, w);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetConvolutionNdDescriptor(
-    cudnnConvolutionDescriptor_t convDesc, int arrayLength, /* nbDims-2 size */
-    const int padA[], const int filterStrideA[], const int dilationA[],
-    cudnnConvolutionMode_t mode, cudnnDataType_t computeType) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnConvolutionDescriptor_t, int, const int[], const int[], const int[],
-      cudnnConvolutionMode_t, cudnnDataType_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetConvolutionNdDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(convDesc, arrayLength, padA, filterStrideA, dilationA, mode,
-                  computeType);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionNdDescriptor(
-    const cudnnConvolutionDescriptor_t convDesc, int arrayLengthRequested,
-    int *arrayLength, int padA[], int strideA[], int dilationA[],
-    cudnnConvolutionMode_t *mode, cudnnDataType_t *computeType) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      const cudnnConvolutionDescriptor_t, int, int *, int[], int[], int[],
-      cudnnConvolutionMode_t *, cudnnDataType_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetConvolutionNdDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(convDesc, arrayLengthRequested, arrayLength, padA, strideA,
-                  dilationA, mode, computeType);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionNdForwardOutputDim(
-    const cudnnConvolutionDescriptor_t convDesc,
-    const cudnnTensorDescriptor_t inputTensorDesc,
-    const cudnnFilterDescriptor_t filterDesc, int nbDims,
-    int tensorOutputDimA[]) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t,
-      const cudnnFilterDescriptor_t, int, int[]);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnGetConvolutionNdForwardOutputDim");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(convDesc, inputTensorDesc, filterDesc, nbDims,
-                  tensorOutputDimA);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnDestroyConvolutionDescriptor(cudnnConvolutionDescriptor_t convDesc) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnConvolutionDescriptor_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnDestroyConvolutionDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(convDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnGetConvolutionForwardAlgorithmMaxCount(cudnnHandle_t handle, int *count) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnHandle_t, int *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnGetConvolutionForwardAlgorithmMaxCount");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, count);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnFindConvolutionForwardAlgorithm(
-    cudnnHandle_t handle, const cudnnTensorDescriptor_t xDesc,
-    const cudnnFilterDescriptor_t wDesc,
-    const cudnnConvolutionDescriptor_t convDesc,
-    const cudnnTensorDescriptor_t yDesc, const int requestedAlgoCount,
-    int *returnedAlgoCount, cudnnConvolutionFwdAlgoPerf_t *perfResults) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnTensorDescriptor_t,
-      const cudnnFilterDescriptor_t, const cudnnConvolutionDescriptor_t,
-      const cudnnTensorDescriptor_t, const int, int *,
-      cudnnConvolutionFwdAlgoPerf_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnFindConvolutionForwardAlgorithm");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, xDesc, wDesc, convDesc, yDesc, requestedAlgoCount,
-                  returnedAlgoCount, perfResults);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnFindConvolutionForwardAlgorithmEx(
-    cudnnHandle_t handle, const cudnnTensorDescriptor_t xDesc, const void *x,
-    const cudnnFilterDescriptor_t wDesc, const void *w,
-    const cudnnConvolutionDescriptor_t convDesc,
-    const cudnnTensorDescriptor_t yDesc, void *y, const int requestedAlgoCount,
-    int *returnedAlgoCount, cudnnConvolutionFwdAlgoPerf_t *perfResults,
-    void *workSpace, size_t workSpaceSizeInBytes) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnTensorDescriptor_t, const void *,
-      const cudnnFilterDescriptor_t, const void *,
-      const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t, void *,
-      const int, int *, cudnnConvolutionFwdAlgoPerf_t *, void *, size_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnFindConvolutionForwardAlgorithmEx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, xDesc, x, wDesc, w, convDesc, yDesc, y,
-                  requestedAlgoCount, returnedAlgoCount, perfResults, workSpace,
-                  workSpaceSizeInBytes);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionForwardAlgorithm(
-    cudnnHandle_t handle, const cudnnTensorDescriptor_t xDesc,
-    const cudnnFilterDescriptor_t wDesc,
-    const cudnnConvolutionDescriptor_t convDesc,
-    const cudnnTensorDescriptor_t yDesc,
-    cudnnConvolutionFwdPreference_t preference, size_t memoryLimitInBytes,
-    cudnnConvolutionFwdAlgo_t *algo) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnTensorDescriptor_t,
-      const cudnnFilterDescriptor_t, const cudnnConvolutionDescriptor_t,
-      const cudnnTensorDescriptor_t, cudnnConvolutionFwdPreference_t, size_t,
-      cudnnConvolutionFwdAlgo_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnGetConvolutionForwardAlgorithm");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, xDesc, wDesc, convDesc, yDesc, preference,
-                  memoryLimitInBytes, algo);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionForwardAlgorithm_v7(
-    cudnnHandle_t handle, const cudnnTensorDescriptor_t srcDesc,
-    const cudnnFilterDescriptor_t filterDesc,
-    const cudnnConvolutionDescriptor_t convDesc,
-    const cudnnTensorDescriptor_t destDesc, const int requestedAlgoCount,
-    int *returnedAlgoCount, cudnnConvolutionFwdAlgoPerf_t *perfResults) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnTensorDescriptor_t,
-      const cudnnFilterDescriptor_t, const cudnnConvolutionDescriptor_t,
-      const cudnnTensorDescriptor_t, const int, int *,
-      cudnnConvolutionFwdAlgoPerf_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnGetConvolutionForwardAlgorithm_v7");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, srcDesc, filterDesc, convDesc, destDesc,
-                  requestedAlgoCount, returnedAlgoCount, perfResults);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionForwardWorkspaceSize(
-    cudnnHandle_t handle, const cudnnTensorDescriptor_t xDesc,
-    const cudnnFilterDescriptor_t wDesc,
-    const cudnnConvolutionDescriptor_t convDesc,
-    const cudnnTensorDescriptor_t yDesc, cudnnConvolutionFwdAlgo_t algo,
-    size_t *sizeInBytes) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnTensorDescriptor_t,
-      const cudnnFilterDescriptor_t, const cudnnConvolutionDescriptor_t,
-      const cudnnTensorDescriptor_t, cudnnConvolutionFwdAlgo_t, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnGetConvolutionForwardWorkspaceSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, xDesc, wDesc, convDesc, yDesc, algo, sizeInBytes);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnConvolutionForward(
-    cudnnHandle_t handle, const void *alpha,
-    const cudnnTensorDescriptor_t xDesc, const void *x,
-    const cudnnFilterDescriptor_t wDesc, const void *w,
-    const cudnnConvolutionDescriptor_t convDesc, cudnnConvolutionFwdAlgo_t algo,
-    void *workSpace, size_t workSpaceSizeInBytes, const void *beta,
-    const cudnnTensorDescriptor_t yDesc, void *y) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const void *, const cudnnTensorDescriptor_t, const void *,
-      const cudnnFilterDescriptor_t, const void *,
-      const cudnnConvolutionDescriptor_t, cudnnConvolutionFwdAlgo_t, void *,
-      size_t, const void *, const cudnnTensorDescriptor_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnConvolutionForward");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, alpha, xDesc, x, wDesc, w, convDesc, algo, workSpace,
-                  workSpaceSizeInBytes, beta, yDesc, y);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnConvolutionBiasActivationForward(
-    cudnnHandle_t handle, const void *alpha1,
-    const cudnnTensorDescriptor_t xDesc, const void *x,
-    const cudnnFilterDescriptor_t wDesc, const void *w,
-    const cudnnConvolutionDescriptor_t convDesc, cudnnConvolutionFwdAlgo_t algo,
-    void *workSpace, size_t workSpaceSizeInBytes, const void *alpha2,
-    const cudnnTensorDescriptor_t zDesc, const void *z,
-    const cudnnTensorDescriptor_t biasDesc, const void *bias,
-    const cudnnActivationDescriptor_t activationDesc,
-    const cudnnTensorDescriptor_t yDesc, void *y) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const void *, const cudnnTensorDescriptor_t, const void *,
-      const cudnnFilterDescriptor_t, const void *,
-      const cudnnConvolutionDescriptor_t, cudnnConvolutionFwdAlgo_t, void *,
-      size_t, const void *, const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnActivationDescriptor_t, const cudnnTensorDescriptor_t, void *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnConvolutionBiasActivationForward");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, alpha1, xDesc, x, wDesc, w, convDesc, algo, workSpace,
-                  workSpaceSizeInBytes, alpha2, zDesc, z, biasDesc, bias,
-                  activationDesc, yDesc, y);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnConvolutionBackwardBias(
-    cudnnHandle_t handle, const void *alpha,
-    const cudnnTensorDescriptor_t dyDesc, const void *dy, const void *beta,
-    const cudnnTensorDescriptor_t dbDesc, void *db) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const void *, const cudnnTensorDescriptor_t, const void *,
-      const void *, const cudnnTensorDescriptor_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnConvolutionBackwardBias");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, alpha, dyDesc, dy, beta, dbDesc, db);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionBackwardFilterAlgorithmMaxCount(
-    cudnnHandle_t handle, int *count) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnHandle_t, int *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnGetConvolutionBackwardFilterAlgorithmMaxCount");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, count);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnFindConvolutionBackwardFilterAlgorithm(
-    cudnnHandle_t handle, const cudnnTensorDescriptor_t xDesc,
-    const cudnnTensorDescriptor_t dyDesc,
-    const cudnnConvolutionDescriptor_t convDesc,
-    const cudnnFilterDescriptor_t dwDesc, const int requestedAlgoCount,
-    int *returnedAlgoCount, cudnnConvolutionBwdFilterAlgoPerf_t *perfResults) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnTensorDescriptor_t,
-      const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t,
-      const cudnnFilterDescriptor_t, const int, int *,
-      cudnnConvolutionBwdFilterAlgoPerf_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnFindConvolutionBackwardFilterAlgorithm");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, xDesc, dyDesc, convDesc, dwDesc, requestedAlgoCount,
-                  returnedAlgoCount, perfResults);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnFindConvolutionBackwardFilterAlgorithmEx(
-    cudnnHandle_t handle, const cudnnTensorDescriptor_t xDesc, const void *x,
-    const cudnnTensorDescriptor_t dyDesc, const void *y,
-    const cudnnConvolutionDescriptor_t convDesc,
-    const cudnnFilterDescriptor_t dwDesc, void *dw,
-    const int requestedAlgoCount, int *returnedAlgoCount,
-    cudnnConvolutionBwdFilterAlgoPerf_t *perfResults, void *workSpace,
-    size_t workSpaceSizeInBytes) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnConvolutionDescriptor_t, const cudnnFilterDescriptor_t, void *,
-      const int, int *, cudnnConvolutionBwdFilterAlgoPerf_t *, void *, size_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnFindConvolutionBackwardFilterAlgorithmEx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, xDesc, x, dyDesc, y, convDesc, dwDesc, dw,
-                  requestedAlgoCount, returnedAlgoCount, perfResults, workSpace,
-                  workSpaceSizeInBytes);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionBackwardFilterAlgorithm(
-    cudnnHandle_t handle, const cudnnTensorDescriptor_t xDesc,
-    const cudnnTensorDescriptor_t dyDesc,
-    const cudnnConvolutionDescriptor_t convDesc,
-    const cudnnFilterDescriptor_t dwDesc,
-    cudnnConvolutionBwdFilterPreference_t preference, size_t memoryLimitInBytes,
-    cudnnConvolutionBwdFilterAlgo_t *algo) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnTensorDescriptor_t,
-      const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t,
-      const cudnnFilterDescriptor_t, cudnnConvolutionBwdFilterPreference_t,
-      size_t, cudnnConvolutionBwdFilterAlgo_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnGetConvolutionBackwardFilterAlgorithm");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, xDesc, dyDesc, convDesc, dwDesc, preference,
-                  memoryLimitInBytes, algo);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionBackwardFilterAlgorithm_v7(
-    cudnnHandle_t handle, const cudnnTensorDescriptor_t srcDesc,
-    const cudnnTensorDescriptor_t diffDesc,
-    const cudnnConvolutionDescriptor_t convDesc,
-    const cudnnFilterDescriptor_t gradDesc, const int requestedAlgoCount,
-    int *returnedAlgoCount, cudnnConvolutionBwdFilterAlgoPerf_t *perfResults) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnTensorDescriptor_t,
-      const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t,
-      const cudnnFilterDescriptor_t, const int, int *,
-      cudnnConvolutionBwdFilterAlgoPerf_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnGetConvolutionBackwardFilterAlgorithm_v7");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, srcDesc, diffDesc, convDesc, gradDesc,
-                  requestedAlgoCount, returnedAlgoCount, perfResults);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionBackwardFilterWorkspaceSize(
-    cudnnHandle_t handle, const cudnnTensorDescriptor_t xDesc,
-    const cudnnTensorDescriptor_t dyDesc,
-    const cudnnConvolutionDescriptor_t convDesc,
-    const cudnnFilterDescriptor_t gradDesc,
-    cudnnConvolutionBwdFilterAlgo_t algo, size_t *sizeInBytes) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnTensorDescriptor_t,
-      const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t,
-      const cudnnFilterDescriptor_t, cudnnConvolutionBwdFilterAlgo_t, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnGetConvolutionBackwardFilterWorkspaceSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, xDesc, dyDesc, convDesc, gradDesc, algo, sizeInBytes);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnConvolutionBackwardFilter(
-    cudnnHandle_t handle, const void *alpha,
-    const cudnnTensorDescriptor_t xDesc, const void *x,
-    const cudnnTensorDescriptor_t dyDesc, const void *dy,
-    const cudnnConvolutionDescriptor_t convDesc,
-    cudnnConvolutionBwdFilterAlgo_t algo, void *workSpace,
-    size_t workSpaceSizeInBytes, const void *beta,
-    const cudnnFilterDescriptor_t dwDesc, void *dw) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const void *, const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnConvolutionDescriptor_t, cudnnConvolutionBwdFilterAlgo_t,
-      void *, size_t, const void *, const cudnnFilterDescriptor_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnConvolutionBackwardFilter");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, alpha, xDesc, x, dyDesc, dy, convDesc, algo,
-                  workSpace, workSpaceSizeInBytes, beta, dwDesc, dw);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionBackwardDataAlgorithmMaxCount(
-    cudnnHandle_t handle, int *count) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnHandle_t, int *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnGetConvolutionBackwardDataAlgorithmMaxCount");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, count);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnFindConvolutionBackwardDataAlgorithm(
-    cudnnHandle_t handle, const cudnnFilterDescriptor_t wDesc,
-    const cudnnTensorDescriptor_t dyDesc,
-    const cudnnConvolutionDescriptor_t convDesc,
-    const cudnnTensorDescriptor_t dxDesc, const int requestedAlgoCount,
-    int *returnedAlgoCount, cudnnConvolutionBwdDataAlgoPerf_t *perfResults) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnFilterDescriptor_t,
-      const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t,
-      const cudnnTensorDescriptor_t, const int, int *,
-      cudnnConvolutionBwdDataAlgoPerf_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnFindConvolutionBackwardDataAlgorithm");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, wDesc, dyDesc, convDesc, dxDesc, requestedAlgoCount,
-                  returnedAlgoCount, perfResults);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnFindConvolutionBackwardDataAlgorithmEx(
-    cudnnHandle_t handle, const cudnnFilterDescriptor_t wDesc, const void *w,
-    const cudnnTensorDescriptor_t dyDesc, const void *dy,
-    const cudnnConvolutionDescriptor_t convDesc,
-    const cudnnTensorDescriptor_t dxDesc, void *dx,
-    const int requestedAlgoCount, int *returnedAlgoCount,
-    cudnnConvolutionBwdDataAlgoPerf_t *perfResults, void *workSpace,
-    size_t workSpaceSizeInBytes) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnFilterDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t, void *,
-      const int, int *, cudnnConvolutionBwdDataAlgoPerf_t *, void *, size_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnFindConvolutionBackwardDataAlgorithmEx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, wDesc, w, dyDesc, dy, convDesc, dxDesc, dx,
-                  requestedAlgoCount, returnedAlgoCount, perfResults, workSpace,
-                  workSpaceSizeInBytes);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionBackwardDataAlgorithm(
-    cudnnHandle_t handle, const cudnnFilterDescriptor_t wDesc,
-    const cudnnTensorDescriptor_t dyDesc,
-    const cudnnConvolutionDescriptor_t convDesc,
-    const cudnnTensorDescriptor_t dxDesc,
-    cudnnConvolutionBwdDataPreference_t preference, size_t memoryLimitInBytes,
-    cudnnConvolutionBwdDataAlgo_t *algo) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnFilterDescriptor_t,
-      const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t,
-      const cudnnTensorDescriptor_t, cudnnConvolutionBwdDataPreference_t,
-      size_t, cudnnConvolutionBwdDataAlgo_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnGetConvolutionBackwardDataAlgorithm");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, wDesc, dyDesc, convDesc, dxDesc, preference,
-                  memoryLimitInBytes, algo);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionBackwardDataAlgorithm_v7(
-    cudnnHandle_t handle, const cudnnFilterDescriptor_t filterDesc,
-    const cudnnTensorDescriptor_t diffDesc,
-    const cudnnConvolutionDescriptor_t convDesc,
-    const cudnnTensorDescriptor_t gradDesc, const int requestedAlgoCount,
-    int *returnedAlgoCount, cudnnConvolutionBwdDataAlgoPerf_t *perfResults) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnFilterDescriptor_t,
-      const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t,
-      const cudnnTensorDescriptor_t, const int, int *,
-      cudnnConvolutionBwdDataAlgoPerf_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnGetConvolutionBackwardDataAlgorithm_v7");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, filterDesc, diffDesc, convDesc, gradDesc,
-                  requestedAlgoCount, returnedAlgoCount, perfResults);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionBackwardDataWorkspaceSize(
-    cudnnHandle_t handle, const cudnnFilterDescriptor_t wDesc,
-    const cudnnTensorDescriptor_t dyDesc,
-    const cudnnConvolutionDescriptor_t convDesc,
-    const cudnnTensorDescriptor_t dxDesc, cudnnConvolutionBwdDataAlgo_t algo,
-    size_t *sizeInBytes) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnFilterDescriptor_t,
-      const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t,
-      const cudnnTensorDescriptor_t, cudnnConvolutionBwdDataAlgo_t, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnGetConvolutionBackwardDataWorkspaceSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, wDesc, dyDesc, convDesc, dxDesc, algo, sizeInBytes);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnConvolutionBackwardData(
-    cudnnHandle_t handle, const void *alpha,
-    const cudnnFilterDescriptor_t wDesc, const void *w,
-    const cudnnTensorDescriptor_t dyDesc, const void *dy,
-    const cudnnConvolutionDescriptor_t convDesc,
-    cudnnConvolutionBwdDataAlgo_t algo, void *workSpace,
-    size_t workSpaceSizeInBytes, const void *beta,
-    const cudnnTensorDescriptor_t dxDesc, void *dx) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const void *, const cudnnFilterDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnConvolutionDescriptor_t, cudnnConvolutionBwdDataAlgo_t, void *,
-      size_t, const void *, const cudnnTensorDescriptor_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnConvolutionBackwardData");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, alpha, wDesc, w, dyDesc, dy, convDesc, algo,
-                  workSpace, workSpaceSizeInBytes, beta, dxDesc, dx);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnIm2Col(cudnnHandle_t handle, const cudnnTensorDescriptor_t xDesc,
-            const void *x, const cudnnFilterDescriptor_t wDesc,
-            const cudnnConvolutionDescriptor_t convDesc, void *colBuffer) {
-  using FuncPtr =
-      cudnnStatus_t(CUDNNWINAPI *)(cudnnHandle_t, const cudnnTensorDescriptor_t,
-                                   const void *, const cudnnFilterDescriptor_t,
-                                   const cudnnConvolutionDescriptor_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnIm2Col");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, xDesc, x, wDesc, convDesc, colBuffer);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSoftmaxForward(
-    cudnnHandle_t handle, cudnnSoftmaxAlgorithm_t algo, cudnnSoftmaxMode_t mode,
-    const void *alpha, const cudnnTensorDescriptor_t xDesc, const void *x,
-    const void *beta, const cudnnTensorDescriptor_t yDesc, void *y) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, cudnnSoftmaxAlgorithm_t, cudnnSoftmaxMode_t, const void *,
-      const cudnnTensorDescriptor_t, const void *, const void *,
-      const cudnnTensorDescriptor_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSoftmaxForward");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, algo, mode, alpha, xDesc, x, beta, yDesc, y);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSoftmaxBackward(
-    cudnnHandle_t handle, cudnnSoftmaxAlgorithm_t algo, cudnnSoftmaxMode_t mode,
-    const void *alpha, const cudnnTensorDescriptor_t yDesc, const void *y,
-    const cudnnTensorDescriptor_t dyDesc, const void *dy, const void *beta,
-    const cudnnTensorDescriptor_t dxDesc, void *dx) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, cudnnSoftmaxAlgorithm_t, cudnnSoftmaxMode_t, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *, const void *,
-      const cudnnTensorDescriptor_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSoftmaxBackward");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, algo, mode, alpha, yDesc, y, dyDesc, dy, beta, dxDesc,
-                  dx);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnCreatePoolingDescriptor(cudnnPoolingDescriptor_t *poolingDesc) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnPoolingDescriptor_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnCreatePoolingDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(poolingDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetPooling2dDescriptor(
-    cudnnPoolingDescriptor_t poolingDesc, cudnnPoolingMode_t mode,
-    cudnnNanPropagation_t maxpoolingNanOpt, int windowHeight, int windowWidth,
-    int verticalPadding, int horizontalPadding, int verticalStride,
-    int horizontalStride) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnPoolingDescriptor_t, cudnnPoolingMode_t, cudnnNanPropagation_t, int,
-      int, int, int, int, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetPooling2dDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(poolingDesc, mode, maxpoolingNanOpt, windowHeight,
-                  windowWidth, verticalPadding, horizontalPadding,
-                  verticalStride, horizontalStride);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetPooling2dDescriptor(
-    const cudnnPoolingDescriptor_t poolingDesc, cudnnPoolingMode_t *mode,
-    cudnnNanPropagation_t *maxpoolingNanOpt, int *windowHeight,
-    int *windowWidth, int *verticalPadding, int *horizontalPadding,
-    int *verticalStride, int *horizontalStride) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      const cudnnPoolingDescriptor_t, cudnnPoolingMode_t *,
-      cudnnNanPropagation_t *, int *, int *, int *, int *, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetPooling2dDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(poolingDesc, mode, maxpoolingNanOpt, windowHeight,
-                  windowWidth, verticalPadding, horizontalPadding,
-                  verticalStride, horizontalStride);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetPoolingNdDescriptor(
-    cudnnPoolingDescriptor_t poolingDesc, const cudnnPoolingMode_t mode,
-    const cudnnNanPropagation_t maxpoolingNanOpt, int nbDims,
-    const int windowDimA[], const int paddingA[], const int strideA[]) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnPoolingDescriptor_t, const cudnnPoolingMode_t,
-      const cudnnNanPropagation_t, int, const int[], const int[], const int[]);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetPoolingNdDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(poolingDesc, mode, maxpoolingNanOpt, nbDims, windowDimA,
-                  paddingA, strideA);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetPoolingNdDescriptor(
-    const cudnnPoolingDescriptor_t poolingDesc, int nbDimsRequested,
-    cudnnPoolingMode_t *mode, cudnnNanPropagation_t *maxpoolingNanOpt,
-    int *nbDims, int windowDimA[], int paddingA[], int strideA[]) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      const cudnnPoolingDescriptor_t, int, cudnnPoolingMode_t *,
-      cudnnNanPropagation_t *, int *, int[], int[], int[]);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetPoolingNdDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(poolingDesc, nbDimsRequested, mode, maxpoolingNanOpt, nbDims,
-                  windowDimA, paddingA, strideA);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnGetPoolingNdForwardOutputDim(const cudnnPoolingDescriptor_t poolingDesc,
-                                  const cudnnTensorDescriptor_t inputTensorDesc,
-                                  int nbDims, int outputTensorDimA[]) {
-  using FuncPtr =
-      cudnnStatus_t(CUDNNWINAPI *)(const cudnnPoolingDescriptor_t,
-                                   const cudnnTensorDescriptor_t, int, int[]);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnGetPoolingNdForwardOutputDim");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(poolingDesc, inputTensorDesc, nbDims, outputTensorDimA);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnGetPooling2dForwardOutputDim(const cudnnPoolingDescriptor_t poolingDesc,
-                                  const cudnnTensorDescriptor_t inputTensorDesc,
-                                  int *n, int *c, int *h, int *w) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(const cudnnPoolingDescriptor_t,
-                                               const cudnnTensorDescriptor_t,
-                                               int *, int *, int *, int *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnGetPooling2dForwardOutputDim");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(poolingDesc, inputTensorDesc, n, c, h, w);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnDestroyPoolingDescriptor(cudnnPoolingDescriptor_t poolingDesc) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnPoolingDescriptor_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDestroyPoolingDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(poolingDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnPoolingForward(
-    cudnnHandle_t handle, const cudnnPoolingDescriptor_t poolingDesc,
-    const void *alpha, const cudnnTensorDescriptor_t xDesc, const void *x,
-    const void *beta, const cudnnTensorDescriptor_t yDesc, void *y) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnPoolingDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *, const void *,
-      const cudnnTensorDescriptor_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnPoolingForward");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, poolingDesc, alpha, xDesc, x, beta, yDesc, y);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnPoolingBackward(
-    cudnnHandle_t handle, const cudnnPoolingDescriptor_t poolingDesc,
-    const void *alpha, const cudnnTensorDescriptor_t yDesc, const void *y,
-    const cudnnTensorDescriptor_t dyDesc, const void *dy,
-    const cudnnTensorDescriptor_t xDesc, const void *x, const void *beta,
-    const cudnnTensorDescriptor_t dxDesc, void *dx) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnPoolingDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *, const void *,
-      const cudnnTensorDescriptor_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnPoolingBackward");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, poolingDesc, alpha, yDesc, y, dyDesc, dy, xDesc, x,
-                  beta, dxDesc, dx);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnCreateActivationDescriptor(cudnnActivationDescriptor_t *activationDesc) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnActivationDescriptor_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnCreateActivationDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(activationDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetActivationDescriptor(
-    cudnnActivationDescriptor_t activationDesc, cudnnActivationMode_t mode,
-    cudnnNanPropagation_t reluNanOpt, double coef) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnActivationDescriptor_t,
-                                               cudnnActivationMode_t,
-                                               cudnnNanPropagation_t, double);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetActivationDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(activationDesc, mode, reluNanOpt, coef);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnGetActivationDescriptor(const cudnnActivationDescriptor_t activationDesc,
-                             cudnnActivationMode_t *mode,
-                             cudnnNanPropagation_t *reluNanOpt, double *coef) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      const cudnnActivationDescriptor_t, cudnnActivationMode_t *,
-      cudnnNanPropagation_t *, double *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetActivationDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(activationDesc, mode, reluNanOpt, coef);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnDestroyActivationDescriptor(cudnnActivationDescriptor_t activationDesc) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnActivationDescriptor_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnDestroyActivationDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(activationDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnActivationForward(
-    cudnnHandle_t handle, cudnnActivationDescriptor_t activationDesc,
-    const void *alpha, const cudnnTensorDescriptor_t xDesc, const void *x,
-    const void *beta, const cudnnTensorDescriptor_t yDesc, void *y) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, cudnnActivationDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *, const void *,
-      const cudnnTensorDescriptor_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnActivationForward");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, activationDesc, alpha, xDesc, x, beta, yDesc, y);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnActivationBackward(
-    cudnnHandle_t handle, cudnnActivationDescriptor_t activationDesc,
-    const void *alpha, const cudnnTensorDescriptor_t yDesc, const void *y,
-    const cudnnTensorDescriptor_t dyDesc, const void *dy,
-    const cudnnTensorDescriptor_t xDesc, const void *x, const void *beta,
-    const cudnnTensorDescriptor_t dxDesc, void *dx) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, cudnnActivationDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *, const void *,
-      const cudnnTensorDescriptor_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnActivationBackward");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, activationDesc, alpha, yDesc, y, dyDesc, dy, xDesc, x,
-                  beta, dxDesc, dx);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnCreateLRNDescriptor(cudnnLRNDescriptor_t *normDesc) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnLRNDescriptor_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnCreateLRNDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(normDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetLRNDescriptor(cudnnLRNDescriptor_t normDesc,
-                                                unsigned lrnN, double lrnAlpha,
-                                                double lrnBeta, double lrnK) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnLRNDescriptor_t, unsigned int, double, double, double);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetLRNDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(normDesc, lrnN, lrnAlpha, lrnBeta, lrnK);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetLRNDescriptor(cudnnLRNDescriptor_t normDesc,
-                                                unsigned *lrnN,
-                                                double *lrnAlpha,
-                                                double *lrnBeta, double *lrnK) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnLRNDescriptor_t, unsigned int *, double *, double *, double *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetLRNDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(normDesc, lrnN, lrnAlpha, lrnBeta, lrnK);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnDestroyLRNDescriptor(cudnnLRNDescriptor_t lrnDesc) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnLRNDescriptor_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDestroyLRNDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(lrnDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnLRNCrossChannelForward(
-    cudnnHandle_t handle, cudnnLRNDescriptor_t normDesc, cudnnLRNMode_t lrnMode,
-    const void *alpha, const cudnnTensorDescriptor_t xDesc, const void *x,
-    const void *beta, const cudnnTensorDescriptor_t yDesc, void *y) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, cudnnLRNDescriptor_t, cudnnLRNMode_t, const void *,
-      const cudnnTensorDescriptor_t, const void *, const void *,
-      const cudnnTensorDescriptor_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnLRNCrossChannelForward");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, normDesc, lrnMode, alpha, xDesc, x, beta, yDesc, y);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnLRNCrossChannelBackward(
-    cudnnHandle_t handle, cudnnLRNDescriptor_t normDesc, cudnnLRNMode_t lrnMode,
-    const void *alpha, const cudnnTensorDescriptor_t yDesc, const void *y,
-    const cudnnTensorDescriptor_t dyDesc, const void *dy,
-    const cudnnTensorDescriptor_t xDesc, const void *x, const void *beta,
-    const cudnnTensorDescriptor_t dxDesc, void *dx) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, cudnnLRNDescriptor_t, cudnnLRNMode_t, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *, const void *,
-      const cudnnTensorDescriptor_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnLRNCrossChannelBackward");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, normDesc, lrnMode, alpha, yDesc, y, dyDesc, dy, xDesc,
-                  x, beta, dxDesc, dx);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnDivisiveNormalizationForward(
-    cudnnHandle_t handle, cudnnLRNDescriptor_t normDesc,
-    cudnnDivNormMode_t mode, const void *alpha,
-    const cudnnTensorDescriptor_t xDesc, /* same desc for means, temp, temp2 */
-    const void *x,
-    const void *means, /* if NULL, means are assumed to be zero */
-    void *temp, void *temp2, const void *beta,
-    const cudnnTensorDescriptor_t yDesc, void *y) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, cudnnLRNDescriptor_t, cudnnDivNormMode_t, const void *,
-      const cudnnTensorDescriptor_t, const void *, const void *, void *, void *,
-      const void *, const cudnnTensorDescriptor_t, void *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnDivisiveNormalizationForward");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, normDesc, mode, alpha, xDesc, x, means, temp, temp2,
-                  beta, yDesc, y);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnDivisiveNormalizationBackward(
-    cudnnHandle_t handle, cudnnLRNDescriptor_t normDesc,
-    cudnnDivNormMode_t mode, const void *alpha,
-    const cudnnTensorDescriptor_t
-        xDesc, /* same desc for x, means, dy, temp, temp2 */
-    const void *x,
-    const void *means, /* if NULL, means are assumed to be zero */
-    const void *dy, void *temp, void *temp2, const void *beta,
-    const cudnnTensorDescriptor_t dXdMeansDesc, /* same desc for dx, dMeans */
-    void *dx,                                   /* output x differential */
-    void *dMeans) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, cudnnLRNDescriptor_t, cudnnDivNormMode_t, const void *,
-      const cudnnTensorDescriptor_t, const void *, const void *, const void *,
-      void *, void *, const void *, const cudnnTensorDescriptor_t, void *,
-      void *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnDivisiveNormalizationBackward");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, normDesc, mode, alpha, xDesc, x, means, dy, temp,
-                  temp2, beta, dXdMeansDesc, dx, dMeans);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnDeriveBNTensorDescriptor(
-    cudnnTensorDescriptor_t derivedBnDesc, const cudnnTensorDescriptor_t xDesc,
-    cudnnBatchNormMode_t mode) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnTensorDescriptor_t,
-                                               const cudnnTensorDescriptor_t,
-                                               cudnnBatchNormMode_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDeriveBNTensorDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(derivedBnDesc, xDesc, mode);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnBatchNormalizationForwardTraining(
-    cudnnHandle_t handle, cudnnBatchNormMode_t mode,
-
-    const void *alpha, /* alpha[0] = result blend factor */
-    const void *beta,  /* beta[0] = dest layer blend factor */
-
-    const cudnnTensorDescriptor_t xDesc, const void *x, /* NxCxHxW */
-    const cudnnTensorDescriptor_t yDesc, void *y,       /* NxCxHxW */
-
-    /* Shared desc for the next 6 tensors in the argument list.
-       Data type to be set as follows:
-       type = (typeOf(x) == double) ? double : float
-       Dimensions for this descriptor depend on normalization mode
-       - Spatial Normalization : tensors are expected to have dims 1xCx1x1
-        (normalization is performed across NxHxW)
-       - Per-Activation Normalization : tensors are expected to have dims of
-       1xCxHxW (normalization is performed across N) */
-    const cudnnTensorDescriptor_t bnScaleBiasMeanVarDesc,
-
-    /* 'Gamma' and 'Beta' respectively in Ioffe and Szegedy's paper's notation
-     */
-    const void *bnScale, const void *bnBias,
-
-    /* MUST use factor=1 in the very first call of a complete training cycle.
-       Use a factor=1/(1+n) at N-th call to the function to get
-       Cumulative Moving Average (CMA) behavior
-       CMA[n] = (x[1]+...+x[n])/n
-       Since CMA[n+1] = (n*CMA[n]+x[n+1])/(n+1) =
-       ((n+1)*CMA[n]-CMA[n])/(n+1) + x[n+1]/(n+1) =
-       CMA[n]*(1-1/(n+1)) + x[n+1]*1/(n+1) */
-    double exponentialAverageFactor,
-
-    /* Used in Training phase only.
-       runningMean = newMean*factor + runningMean*(1-factor) */
-    void *resultRunningMean,
-    /* Output in training mode, input in inference. Is the moving average
-       of  variance[x] (factor is applied in the same way as for runningMean) */
-    void *resultRunningVariance,
-
-    /* Has to be >= CUDNN_BN_MIN_EPSILON. Should be the same in forward and
-       backward functions. */
-    double epsilon,
-
-    /* Optionally save intermediate results from the forward pass here
-       - can be reused to speed up backward pass. NULL if unused */
-    void *resultSaveMean, void *resultSaveInvVariance) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, cudnnBatchNormMode_t, const void *, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, void *, const cudnnTensorDescriptor_t,
-      const void *, const void *, double, void *, void *, double, void *,
-      void *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnBatchNormalizationForwardTraining");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(
-      handle, mode, alpha, beta, xDesc, x, yDesc, y, bnScaleBiasMeanVarDesc,
-      bnScale, bnBias, exponentialAverageFactor, resultRunningMean,
-      resultRunningVariance, epsilon, resultSaveMean, resultSaveInvVariance);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnBatchNormalizationForwardInference(
-    cudnnHandle_t handle, cudnnBatchNormMode_t mode,
-    const void *alpha, /* alpha[0] = result blend factor */
-    const void *beta,  /* beta[0] = dest layer blend factor */
-    const cudnnTensorDescriptor_t xDesc, const void *x, /* NxCxHxW */
-    const cudnnTensorDescriptor_t yDesc, void *y,       /* NxCxHxW */
-    const cudnnTensorDescriptor_t bnScaleBiasMeanVarDesc, const void *bnScale,
-    const void *bnBias, const void *estimatedMean,
-    const void *estimatedVariance, double epsilon) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, cudnnBatchNormMode_t, const void *, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, void *, const cudnnTensorDescriptor_t,
-      const void *, const void *, const void *, const void *, double);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnBatchNormalizationForwardInference");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, mode, alpha, beta, xDesc, x, yDesc, y,
-                  bnScaleBiasMeanVarDesc, bnScale, bnBias, estimatedMean,
-                  estimatedVariance, epsilon);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnBatchNormalizationBackward(
-    cudnnHandle_t handle, cudnnBatchNormMode_t mode, const void *alphaDataDiff,
-    const void *betaDataDiff, const void *alphaParamDiff,
-    const void *betaParamDiff,
-    const cudnnTensorDescriptor_t xDesc, /* same desc for x, dx, dy */
-    const void *x, const cudnnTensorDescriptor_t dyDesc, const void *dy,
-    const cudnnTensorDescriptor_t dxDesc, void *dx,
-    /* Shared tensor desc for the 4 tensors below */
-    const cudnnTensorDescriptor_t dBnScaleBiasDesc,
-    const void *bnScale, /* bnBias doesn't affect backpropagation */
-    /* scale and bias diff are not backpropagated below this layer */
-    void *dBnScaleResult, void *dBnBiasResult,
-    /* Same epsilon as forward pass */
-    double epsilon,
-
-    /* Optionally cached intermediate results from
-       forward pass */
-    const void *savedMean, const void *savedInvVariance) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, cudnnBatchNormMode_t, const void *, const void *,
-      const void *, const void *, const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, void *, const cudnnTensorDescriptor_t,
-      const void *, void *, void *, double, const void *, const void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnBatchNormalizationBackward");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, mode, alphaDataDiff, betaDataDiff, alphaParamDiff,
-                  betaParamDiff, xDesc, x, dyDesc, dy, dxDesc, dx,
-                  dBnScaleBiasDesc, bnScale, dBnScaleResult, dBnBiasResult,
-                  epsilon, savedMean, savedInvVariance);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnCreateSpatialTransformerDescriptor(
-    cudnnSpatialTransformerDescriptor_t *stDesc) {
-  using FuncPtr =
-      cudnnStatus_t(CUDNNWINAPI *)(cudnnSpatialTransformerDescriptor_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnCreateSpatialTransformerDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(stDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetSpatialTransformerNdDescriptor(
-    cudnnSpatialTransformerDescriptor_t stDesc, cudnnSamplerType_t samplerType,
-    cudnnDataType_t dataType, const int nbDims, const int dimA[]) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnSpatialTransformerDescriptor_t, cudnnSamplerType_t, cudnnDataType_t,
-      const int, const int[]);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnSetSpatialTransformerNdDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(stDesc, samplerType, dataType, nbDims, dimA);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnDestroySpatialTransformerDescriptor(
-    cudnnSpatialTransformerDescriptor_t stDesc) {
-  using FuncPtr =
-      cudnnStatus_t(CUDNNWINAPI *)(cudnnSpatialTransformerDescriptor_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnDestroySpatialTransformerDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(stDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSpatialTfGridGeneratorForward(
-    cudnnHandle_t handle, const cudnnSpatialTransformerDescriptor_t stDesc,
-    const void *theta, void *grid) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnSpatialTransformerDescriptor_t, const void *,
-      void *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnSpatialTfGridGeneratorForward");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, stDesc, theta, grid);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSpatialTfGridGeneratorBackward(
-    cudnnHandle_t handle, const cudnnSpatialTransformerDescriptor_t stDesc,
-    const void *dgrid, void *dtheta) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnSpatialTransformerDescriptor_t, const void *,
-      void *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnSpatialTfGridGeneratorBackward");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, stDesc, dgrid, dtheta);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSpatialTfSamplerForward(
-    cudnnHandle_t handle, cudnnSpatialTransformerDescriptor_t stDesc,
-    const void *alpha, const cudnnTensorDescriptor_t xDesc, const void *x,
-    const void *grid, const void *beta, cudnnTensorDescriptor_t yDesc,
-    void *y) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, cudnnSpatialTransformerDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *, const void *, const void *,
-      cudnnTensorDescriptor_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSpatialTfSamplerForward");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, stDesc, alpha, xDesc, x, grid, beta, yDesc, y);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSpatialTfSamplerBackward(
-    cudnnHandle_t handle, cudnnSpatialTransformerDescriptor_t stDesc,
-    const void *alpha, const cudnnTensorDescriptor_t xDesc, const void *x,
-    const void *beta, const cudnnTensorDescriptor_t dxDesc, void *dx,
-    const void *alphaDgrid, const cudnnTensorDescriptor_t dyDesc,
-    const void *dy, const void *grid, const void *betaDgrid, void *dgrid) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, cudnnSpatialTransformerDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *, const void *,
-      const cudnnTensorDescriptor_t, void *, const void *,
-      const cudnnTensorDescriptor_t, const void *, const void *, const void *,
-      void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSpatialTfSamplerBackward");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, stDesc, alpha, xDesc, x, beta, dxDesc, dx, alphaDgrid,
-                  dyDesc, dy, grid, betaDgrid, dgrid);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnCreateDropoutDescriptor(cudnnDropoutDescriptor_t *dropoutDesc) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnDropoutDescriptor_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnCreateDropoutDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dropoutDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnDestroyDropoutDescriptor(cudnnDropoutDescriptor_t dropoutDesc) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnDropoutDescriptor_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDestroyDropoutDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dropoutDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnDropoutGetStatesSize(cudnnHandle_t handle,
-                                                    size_t *sizeInBytes) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnHandle_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDropoutGetStatesSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, sizeInBytes);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnDropoutGetReserveSpaceSize(
-    cudnnTensorDescriptor_t xdesc, size_t *sizeInBytes) {
-  using FuncPtr =
-      cudnnStatus_t(CUDNNWINAPI *)(cudnnTensorDescriptor_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDropoutGetReserveSpaceSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(xdesc, sizeInBytes);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetDropoutDescriptor(
-    cudnnDropoutDescriptor_t dropoutDesc, cudnnHandle_t handle, float dropout,
-    void *states, size_t stateSizeInBytes, unsigned long long seed) {
-  using FuncPtr =
-      cudnnStatus_t(CUDNNWINAPI *)(cudnnDropoutDescriptor_t, cudnnHandle_t,
-                                   float, void *, size_t, unsigned long long);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetDropoutDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dropoutDesc, handle, dropout, states, stateSizeInBytes, seed);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnRestoreDropoutDescriptor(
-    cudnnDropoutDescriptor_t dropoutDesc, cudnnHandle_t handle, float dropout,
-    void *states, size_t stateSizeInBytes, unsigned long long seed) {
-  using FuncPtr =
-      cudnnStatus_t(CUDNNWINAPI *)(cudnnDropoutDescriptor_t, cudnnHandle_t,
-                                   float, void *, size_t, unsigned long long);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnRestoreDropoutDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dropoutDesc, handle, dropout, states, stateSizeInBytes, seed);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetDropoutDescriptor(
-    cudnnDropoutDescriptor_t dropoutDesc, cudnnHandle_t handle, float *dropout,
-    void **states, unsigned long long *seed) {
-  using FuncPtr =
-      cudnnStatus_t(CUDNNWINAPI *)(cudnnDropoutDescriptor_t, cudnnHandle_t,
-                                   float *, void **, unsigned long long *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetDropoutDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dropoutDesc, handle, dropout, states, seed);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnDropoutForward(
-    cudnnHandle_t handle, const cudnnDropoutDescriptor_t dropoutDesc,
-    const cudnnTensorDescriptor_t xdesc, const void *x,
-    const cudnnTensorDescriptor_t ydesc, void *y, void *reserveSpace,
-    size_t reserveSpaceSizeInBytes) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnDropoutDescriptor_t,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, void *, void *, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDropoutForward");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dropoutDesc, xdesc, x, ydesc, y, reserveSpace,
-                  reserveSpaceSizeInBytes);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnDropoutBackward(
-    cudnnHandle_t handle, const cudnnDropoutDescriptor_t dropoutDesc,
-    const cudnnTensorDescriptor_t dydesc, const void *dy,
-    const cudnnTensorDescriptor_t dxdesc, void *dx, void *reserveSpace,
-    size_t reserveSpaceSizeInBytes) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnDropoutDescriptor_t,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, void *, void *, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDropoutBackward");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dropoutDesc, dydesc, dy, dxdesc, dx, reserveSpace,
-                  reserveSpaceSizeInBytes);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnCreateRNNDescriptor(cudnnRNNDescriptor_t *rnnDesc) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnRNNDescriptor_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnCreateRNNDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(rnnDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnDestroyRNNDescriptor(cudnnRNNDescriptor_t rnnDesc) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnRNNDescriptor_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDestroyRNNDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(rnnDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetRNNForwardInferenceAlgorithmMaxCount(
-    cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc, int *count) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnRNNDescriptor_t, int *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnGetRNNForwardInferenceAlgorithmMaxCount");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, rnnDesc, count);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnFindRNNForwardInferenceAlgorithmEx(
-    cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc,
-    const int seqLength, const cudnnTensorDescriptor_t *xDesc, const void *x,
-    const cudnnTensorDescriptor_t hxDesc, const void *hx,
-    const cudnnTensorDescriptor_t cxDesc, const void *cx,
-    const cudnnFilterDescriptor_t wDesc, const void *w,
-    const cudnnTensorDescriptor_t *yDesc, void *y,
-    const cudnnTensorDescriptor_t hyDesc, void *hy,
-    const cudnnTensorDescriptor_t cyDesc, void *cy, const float findIntensity,
-    const int requestedAlgoCount, int *returnedAlgoCount,
-    cudnnAlgorithmPerformance_t *perfResults, void *workspace,
-    size_t workSpaceSizeInBytes) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnRNNDescriptor_t, const int,
-      const cudnnTensorDescriptor_t *, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnFilterDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t *, void *, const cudnnTensorDescriptor_t,
-      void *, const cudnnTensorDescriptor_t, void *, const float, const int,
-      int *, cudnnAlgorithmPerformance_t *, void *, size_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnFindRNNForwardInferenceAlgorithmEx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, rnnDesc, seqLength, xDesc, x, hxDesc, hx, cxDesc, cx,
-                  wDesc, w, yDesc, y, hyDesc, hy, cyDesc, cy, findIntensity,
-                  requestedAlgoCount, returnedAlgoCount, perfResults, workspace,
-                  workSpaceSizeInBytes);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetRNNForwardTrainingAlgorithmMaxCount(
-    cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc, int *count) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnRNNDescriptor_t, int *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnGetRNNForwardTrainingAlgorithmMaxCount");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, rnnDesc, count);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnFindRNNForwardTrainingAlgorithmEx(
-    cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc,
-    const int seqLength, const cudnnTensorDescriptor_t *xDesc, const void *x,
-    const cudnnTensorDescriptor_t hxDesc, const void *hx,
-    const cudnnTensorDescriptor_t cxDesc, const void *cx,
-    const cudnnFilterDescriptor_t wDesc, const void *w,
-    const cudnnTensorDescriptor_t *yDesc, void *y,
-    const cudnnTensorDescriptor_t hyDesc, void *hy,
-    const cudnnTensorDescriptor_t cyDesc, void *cy, const float findIntensity,
-    const int requestedAlgoCount, int *returnedAlgoCount,
-    cudnnAlgorithmPerformance_t *perfResults, void *workspace,
-    size_t workSpaceSizeInBytes, void *reserveSpace,
-    size_t reserveSpaceSizeInBytes) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnRNNDescriptor_t, const int,
-      const cudnnTensorDescriptor_t *, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnFilterDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t *, void *, const cudnnTensorDescriptor_t,
-      void *, const cudnnTensorDescriptor_t, void *, const float, const int,
-      int *, cudnnAlgorithmPerformance_t *, void *, size_t, void *, size_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnFindRNNForwardTrainingAlgorithmEx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, rnnDesc, seqLength, xDesc, x, hxDesc, hx, cxDesc, cx,
-                  wDesc, w, yDesc, y, hyDesc, hy, cyDesc, cy, findIntensity,
-                  requestedAlgoCount, returnedAlgoCount, perfResults, workspace,
-                  workSpaceSizeInBytes, reserveSpace, reserveSpaceSizeInBytes);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetRNNBackwardDataAlgorithmMaxCount(
-    cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc, int *count) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnRNNDescriptor_t, int *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnGetRNNBackwardDataAlgorithmMaxCount");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, rnnDesc, count);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnFindRNNBackwardDataAlgorithmEx(
-    cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc,
-    const int seqLength, const cudnnTensorDescriptor_t *yDesc, const void *y,
-    const cudnnTensorDescriptor_t *dyDesc, const void *dy,
-    const cudnnTensorDescriptor_t dhyDesc, const void *dhy,
-    const cudnnTensorDescriptor_t dcyDesc, const void *dcy,
-    const cudnnFilterDescriptor_t wDesc, const void *w,
-    const cudnnTensorDescriptor_t hxDesc, const void *hx,
-    const cudnnTensorDescriptor_t cxDesc, const void *cx,
-    const cudnnTensorDescriptor_t *dxDesc, void *dx,
-    const cudnnTensorDescriptor_t dhxDesc, void *dhx,
-    const cudnnTensorDescriptor_t dcxDesc, void *dcx, const float findIntensity,
-    const int requestedAlgoCount, int *returnedAlgoCount,
-    cudnnAlgorithmPerformance_t *perfResults, void *workspace,
-    size_t workSpaceSizeInBytes, void *reserveSpace,
-    size_t reserveSpaceSizeInBytes) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnRNNDescriptor_t, const int,
-      const cudnnTensorDescriptor_t *, const void *,
-      const cudnnTensorDescriptor_t *, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnFilterDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t *, void *, const cudnnTensorDescriptor_t,
-      void *, const cudnnTensorDescriptor_t, void *, const float, const int,
-      int *, cudnnAlgorithmPerformance_t *, void *, size_t, void *, size_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnFindRNNBackwardDataAlgorithmEx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, rnnDesc, seqLength, yDesc, y, dyDesc, dy, dhyDesc,
-                  dhy, dcyDesc, dcy, wDesc, w, hxDesc, hx, cxDesc, cx, dxDesc,
-                  dx, dhxDesc, dhx, dcxDesc, dcx, findIntensity,
-                  requestedAlgoCount, returnedAlgoCount, perfResults, workspace,
-                  workSpaceSizeInBytes, reserveSpace, reserveSpaceSizeInBytes);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetRNNBackwardWeightsAlgorithmMaxCount(
-    cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc, int *count) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnRNNDescriptor_t, int *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnGetRNNBackwardWeightsAlgorithmMaxCount");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, rnnDesc, count);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnFindRNNBackwardWeightsAlgorithmEx(
-    cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc,
-    const int seqLength, const cudnnTensorDescriptor_t *xDesc, const void *x,
-    const cudnnTensorDescriptor_t hxDesc, const void *hx,
-    const cudnnTensorDescriptor_t *yDesc, const void *y,
-    const float findIntensity, const int requestedAlgoCount,
-    int *returnedAlgoCount, cudnnAlgorithmPerformance_t *perfResults,
-    const void *workspace, size_t workSpaceSizeInBytes,
-    const cudnnFilterDescriptor_t dwDesc, void *dw, const void *reserveSpace,
-    size_t reserveSpaceSizeInBytes) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnRNNDescriptor_t, const int,
-      const cudnnTensorDescriptor_t *, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t *, const void *, const float, const int,
-      int *, cudnnAlgorithmPerformance_t *, const void *, size_t,
-      const cudnnFilterDescriptor_t, void *, const void *, size_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnFindRNNBackwardWeightsAlgorithmEx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, rnnDesc, seqLength, xDesc, x, hxDesc, hx, yDesc, y,
-                  findIntensity, requestedAlgoCount, returnedAlgoCount,
-                  perfResults, workspace, workSpaceSizeInBytes, dwDesc, dw,
-                  reserveSpace, reserveSpaceSizeInBytes);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnCreatePersistentRNNPlan(
-    cudnnRNNDescriptor_t rnnDesc, const int minibatch,
-    const cudnnDataType_t dataType, cudnnPersistentRNNPlan_t *plan) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnRNNDescriptor_t, const int,
-                                               const cudnnDataType_t,
-                                               cudnnPersistentRNNPlan_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnCreatePersistentRNNPlan");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(rnnDesc, minibatch, dataType, plan);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetPersistentRNNPlan(
-    cudnnRNNDescriptor_t rnnDesc, cudnnPersistentRNNPlan_t plan) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnRNNDescriptor_t,
-                                               cudnnPersistentRNNPlan_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetPersistentRNNPlan");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(rnnDesc, plan);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnDestroyPersistentRNNPlan(cudnnPersistentRNNPlan_t plan) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnPersistentRNNPlan_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDestroyPersistentRNNPlan");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(plan);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetRNNDescriptor(
-    cudnnHandle_t handle, cudnnRNNDescriptor_t rnnDesc, const int hiddenSize,
-    const int numLayers,
-    cudnnDropoutDescriptor_t
-        dropoutDesc, /* Between layers, not between recurrent steps. */
-    cudnnRNNInputMode_t inputMode, cudnnDirectionMode_t direction,
-    cudnnRNNMode_t mode, cudnnRNNAlgo_t algo, cudnnDataType_t dataType) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, cudnnRNNDescriptor_t, const int, const int,
-      cudnnDropoutDescriptor_t, cudnnRNNInputMode_t, cudnnDirectionMode_t,
-      cudnnRNNMode_t, cudnnRNNAlgo_t, cudnnDataType_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetRNNDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, rnnDesc, hiddenSize, numLayers, dropoutDesc,
-                  inputMode, direction, mode, algo, dataType);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnSetRNNProjectionLayers(cudnnHandle_t handle, cudnnRNNDescriptor_t rnnDesc,
-                            const int recProjSize, const int outProjSize) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, cudnnRNNDescriptor_t, const int, const int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetRNNProjectionLayers");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, rnnDesc, recProjSize, outProjSize);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetRNNProjectionLayers(
-    cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc, int *recProjSize,
-    int *outProjSize) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnRNNDescriptor_t, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetRNNProjectionLayers");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, rnnDesc, recProjSize, outProjSize);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetRNNAlgorithmDescriptor(
-    cudnnHandle_t handle, cudnnRNNDescriptor_t rnnDesc,
-    cudnnAlgorithmDescriptor_t algoDesc) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, cudnnRNNDescriptor_t, cudnnAlgorithmDescriptor_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetRNNAlgorithmDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, rnnDesc, algoDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetRNNDescriptor(
-    cudnnHandle_t handle, cudnnRNNDescriptor_t rnnDesc, int *hiddenSize,
-    int *numLayers, cudnnDropoutDescriptor_t *dropoutDesc,
-    cudnnRNNInputMode_t *inputMode, cudnnDirectionMode_t *direction,
-    cudnnRNNMode_t *mode, cudnnRNNAlgo_t *algo, cudnnDataType_t *dataType) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, cudnnRNNDescriptor_t, int *, int *,
-      cudnnDropoutDescriptor_t *, cudnnRNNInputMode_t *, cudnnDirectionMode_t *,
-      cudnnRNNMode_t *, cudnnRNNAlgo_t *, cudnnDataType_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetRNNDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, rnnDesc, hiddenSize, numLayers, dropoutDesc,
-                  inputMode, direction, mode, algo, dataType);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnSetRNNMatrixMathType(cudnnRNNDescriptor_t rnnDesc, cudnnMathType_t mType) {
-  using FuncPtr =
-      cudnnStatus_t(CUDNNWINAPI *)(cudnnRNNDescriptor_t, cudnnMathType_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetRNNMatrixMathType");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(rnnDesc, mType);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetRNNMatrixMathType(
-    cudnnRNNDescriptor_t rnnDesc, cudnnMathType_t *mType) {
-  using FuncPtr =
-      cudnnStatus_t(CUDNNWINAPI *)(cudnnRNNDescriptor_t, cudnnMathType_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetRNNMatrixMathType");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(rnnDesc, mType);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetRNNWorkspaceSize(
-    cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc,
-    const int seqLength, const cudnnTensorDescriptor_t *xDesc,
-    size_t *sizeInBytes) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnRNNDescriptor_t, const int,
-      const cudnnTensorDescriptor_t *, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetRNNWorkspaceSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, rnnDesc, seqLength, xDesc, sizeInBytes);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetRNNTrainingReserveSize(
-    cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc,
-    const int seqLength, const cudnnTensorDescriptor_t *xDesc,
-    size_t *sizeInBytes) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnRNNDescriptor_t, const int,
-      const cudnnTensorDescriptor_t *, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetRNNTrainingReserveSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, rnnDesc, seqLength, xDesc, sizeInBytes);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnGetRNNParamsSize(cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc,
-                      const cudnnTensorDescriptor_t xDesc, size_t *sizeInBytes,
-                      cudnnDataType_t dataType) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnRNNDescriptor_t, const cudnnTensorDescriptor_t,
-      size_t *, cudnnDataType_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetRNNParamsSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, rnnDesc, xDesc, sizeInBytes, dataType);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetRNNLinLayerMatrixParams(
-    cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc,
-    const int pseudoLayer, const cudnnTensorDescriptor_t xDesc,
-    const cudnnFilterDescriptor_t wDesc, const void *w, const int linLayerID,
-    cudnnFilterDescriptor_t linLayerMatDesc, void **linLayerMat) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnRNNDescriptor_t, const int,
-      const cudnnTensorDescriptor_t, const cudnnFilterDescriptor_t,
-      const void *, const int, cudnnFilterDescriptor_t, void **);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetRNNLinLayerMatrixParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, rnnDesc, pseudoLayer, xDesc, wDesc, w, linLayerID,
-                  linLayerMatDesc, linLayerMat);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetRNNLinLayerBiasParams(
-    cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc,
-    const int pseudoLayer, const cudnnTensorDescriptor_t xDesc,
-    const cudnnFilterDescriptor_t wDesc, const void *w, const int linLayerID,
-    cudnnFilterDescriptor_t linLayerBiasDesc, void **linLayerBias) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnRNNDescriptor_t, const int,
-      const cudnnTensorDescriptor_t, const cudnnFilterDescriptor_t,
-      const void *, const int, cudnnFilterDescriptor_t, void **);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetRNNLinLayerBiasParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, rnnDesc, pseudoLayer, xDesc, wDesc, w, linLayerID,
-                  linLayerBiasDesc, linLayerBias);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnRNNForwardInference(
-    cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc,
-    const int seqLength, const cudnnTensorDescriptor_t *xDesc, const void *x,
-    const cudnnTensorDescriptor_t hxDesc, const void *hx,
-    const cudnnTensorDescriptor_t cxDesc, const void *cx,
-    const cudnnFilterDescriptor_t wDesc, const void *w,
-    const cudnnTensorDescriptor_t *yDesc, void *y,
-    const cudnnTensorDescriptor_t hyDesc, void *hy,
-    const cudnnTensorDescriptor_t cyDesc, void *cy, void *workspace,
-    size_t workSpaceSizeInBytes) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnRNNDescriptor_t, const int,
-      const cudnnTensorDescriptor_t *, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnFilterDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t *, void *, const cudnnTensorDescriptor_t,
-      void *, const cudnnTensorDescriptor_t, void *, void *, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnRNNForwardInference");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, rnnDesc, seqLength, xDesc, x, hxDesc, hx, cxDesc, cx,
-                  wDesc, w, yDesc, y, hyDesc, hy, cyDesc, cy, workspace,
-                  workSpaceSizeInBytes);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnRNNForwardTraining(
-    cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc,
-    const int seqLength, const cudnnTensorDescriptor_t *xDesc, const void *x,
-    const cudnnTensorDescriptor_t hxDesc, const void *hx,
-    const cudnnTensorDescriptor_t cxDesc, const void *cx,
-    const cudnnFilterDescriptor_t wDesc, const void *w,
-    const cudnnTensorDescriptor_t *yDesc, void *y,
-    const cudnnTensorDescriptor_t hyDesc, void *hy,
-    const cudnnTensorDescriptor_t cyDesc, void *cy, void *workspace,
-    size_t workSpaceSizeInBytes, void *reserveSpace,
-    size_t reserveSpaceSizeInBytes) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnRNNDescriptor_t, const int,
-      const cudnnTensorDescriptor_t *, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnFilterDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t *, void *, const cudnnTensorDescriptor_t,
-      void *, const cudnnTensorDescriptor_t, void *, void *, size_t, void *,
-      size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnRNNForwardTraining");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, rnnDesc, seqLength, xDesc, x, hxDesc, hx, cxDesc, cx,
-                  wDesc, w, yDesc, y, hyDesc, hy, cyDesc, cy, workspace,
-                  workSpaceSizeInBytes, reserveSpace, reserveSpaceSizeInBytes);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnRNNBackwardData(cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc,
-                     const int seqLength, const cudnnTensorDescriptor_t *yDesc,
-                     const void *y, const cudnnTensorDescriptor_t *dyDesc,
-                     const void *dy, const cudnnTensorDescriptor_t dhyDesc,
-                     const void *dhy, const cudnnTensorDescriptor_t dcyDesc,
-                     const void *dcy, const cudnnFilterDescriptor_t wDesc,
-                     const void *w, const cudnnTensorDescriptor_t hxDesc,
-                     const void *hx, const cudnnTensorDescriptor_t cxDesc,
-                     const void *cx, const cudnnTensorDescriptor_t *dxDesc,
-                     void *dx, const cudnnTensorDescriptor_t dhxDesc, void *dhx,
-                     const cudnnTensorDescriptor_t dcxDesc, void *dcx,
-                     void *workspace, size_t workSpaceSizeInBytes,
-                     void *reserveSpace, size_t reserveSpaceSizeInBytes) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnRNNDescriptor_t, const int,
-      const cudnnTensorDescriptor_t *, const void *,
-      const cudnnTensorDescriptor_t *, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnFilterDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t *, void *, const cudnnTensorDescriptor_t,
-      void *, const cudnnTensorDescriptor_t, void *, void *, size_t, void *,
-      size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnRNNBackwardData");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, rnnDesc, seqLength, yDesc, y, dyDesc, dy, dhyDesc,
-                  dhy, dcyDesc, dcy, wDesc, w, hxDesc, hx, cxDesc, cx, dxDesc,
-                  dx, dhxDesc, dhx, dcxDesc, dcx, workspace,
-                  workSpaceSizeInBytes, reserveSpace, reserveSpaceSizeInBytes);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnRNNBackwardWeights(
-    cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc,
-    const int seqLength, const cudnnTensorDescriptor_t *xDesc, const void *x,
-    const cudnnTensorDescriptor_t hxDesc, const void *hx,
-    const cudnnTensorDescriptor_t *yDesc, const void *y, const void *workspace,
-    size_t workSpaceSizeInBytes, const cudnnFilterDescriptor_t dwDesc, void *dw,
-    const void *reserveSpace, size_t reserveSpaceSizeInBytes) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnRNNDescriptor_t, const int,
-      const cudnnTensorDescriptor_t *, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t *, const void *, const void *, size_t,
-      const cudnnFilterDescriptor_t, void *, const void *, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnRNNBackwardWeights");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, rnnDesc, seqLength, xDesc, x, hxDesc, hx, yDesc, y,
-                  workspace, workSpaceSizeInBytes, dwDesc, dw, reserveSpace,
-                  reserveSpaceSizeInBytes);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnCreateCTCLossDescriptor(cudnnCTCLossDescriptor_t *ctcLossDesc) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnCTCLossDescriptor_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnCreateCTCLossDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(ctcLossDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetCTCLossDescriptor(
-    cudnnCTCLossDescriptor_t ctcLossDesc, cudnnDataType_t compType) {
-  using FuncPtr =
-      cudnnStatus_t(CUDNNWINAPI *)(cudnnCTCLossDescriptor_t, cudnnDataType_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetCTCLossDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(ctcLossDesc, compType);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetCTCLossDescriptor(
-    cudnnCTCLossDescriptor_t ctcLossDesc, cudnnDataType_t *compType) {
-  using FuncPtr =
-      cudnnStatus_t(CUDNNWINAPI *)(cudnnCTCLossDescriptor_t, cudnnDataType_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetCTCLossDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(ctcLossDesc, compType);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnDestroyCTCLossDescriptor(cudnnCTCLossDescriptor_t ctcLossDesc) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnCTCLossDescriptor_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDestroyCTCLossDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(ctcLossDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnCTCLoss(
-    cudnnHandle_t handle,
-    const cudnnTensorDescriptor_t
-        probsDesc, /* Tensor descriptor for probabilities, the dimensions are
-                      T,N,A (T is the timing steps, N is the
-                      mini batch size, A is the alphabet size)  */
-    const void *probs,       /* probabilities after softmax, in GPU memory */
-    const int *labels,       /* labels, in CPU memory */
-    const int *labelLengths, /* the length of each label, in CPU memory */
-    const int *inputLengths, /* the lengths of timing steps in each batch, in
-                                CPU memory */
-    void *costs,             /* the returned costs of CTC, in GPU memory */
-    const cudnnTensorDescriptor_t
-        gradientsDesc, /* Tensor descriptor for gradients, the dimensions are
-                          T,N,A */
-    const void *gradients,   /* the returned CTC gradients, in GPU memory, to
-                                compute costs only, set it to NULL */
-    cudnnCTCLossAlgo_t algo, /* algorithm selected, supported now 0 and 1 */
-    cudnnCTCLossDescriptor_t ctcLossDesc,
-    void *workspace, /* pointer to the workspace, in GPU memory */
-    size_t workSpaceSizeInBytes) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnTensorDescriptor_t, const void *, const int *,
-      const int *, const int *, void *, const cudnnTensorDescriptor_t,
-      const void *, cudnnCTCLossAlgo_t, cudnnCTCLossDescriptor_t, void *,
-      size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnCTCLoss");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, probsDesc, probs, labels, labelLengths, inputLengths,
-                  costs, gradientsDesc, gradients, algo, ctcLossDesc, workspace,
-                  workSpaceSizeInBytes);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetCTCLossWorkspaceSize(
-    cudnnHandle_t handle,
-    const cudnnTensorDescriptor_t
-        probsDesc, /* Tensor descriptor for probabilities, the dimensions are
-                      T,N,A (T is the
-                      timing steps, N is the mini batch size, A is the alphabet
-                      size) */
-    const cudnnTensorDescriptor_t
-        gradientsDesc,       /* Tensor descriptor for gradients, the
-                                dimensions are T,N,A. To compute costs
-                                only, set it to NULL */
-    const int *labels,       /* labels, in CPU memory */
-    const int *labelLengths, /* the length of each label, in CPU memory */
-    const int *inputLengths, /* the lengths of timing steps in each batch, in
-                                CPU memory */
-    cudnnCTCLossAlgo_t algo, /* algorithm selected, supported now 0 and 1 */
-    cudnnCTCLossDescriptor_t ctcLossDesc, size_t *sizeInBytes) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnTensorDescriptor_t,
-      const cudnnTensorDescriptor_t, const int *, const int *, const int *,
-      cudnnCTCLossAlgo_t, cudnnCTCLossDescriptor_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetCTCLossWorkspaceSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, probsDesc, gradientsDesc, labels, labelLengths,
-                  inputLengths, algo, ctcLossDesc, sizeInBytes);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnCreateAlgorithmDescriptor(cudnnAlgorithmDescriptor_t *algoDesc) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnAlgorithmDescriptor_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnCreateAlgorithmDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(algoDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetAlgorithmDescriptor(
-    cudnnAlgorithmDescriptor_t algoDesc, cudnnAlgorithm_t algorithm) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnAlgorithmDescriptor_t,
-                                               cudnnAlgorithm_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetAlgorithmDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(algoDesc, algorithm);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetAlgorithmDescriptor(
-    const cudnnAlgorithmDescriptor_t algoDesc, cudnnAlgorithm_t *algorithm) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(const cudnnAlgorithmDescriptor_t,
-                                               cudnnAlgorithm_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetAlgorithmDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(algoDesc, algorithm);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnCopyAlgorithmDescriptor(
-    const cudnnAlgorithmDescriptor_t src, cudnnAlgorithmDescriptor_t dest) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(const cudnnAlgorithmDescriptor_t,
-                                               cudnnAlgorithmDescriptor_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnCopyAlgorithmDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(src, dest);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnDestroyAlgorithmDescriptor(cudnnAlgorithmDescriptor_t algoDesc) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnAlgorithmDescriptor_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDestroyAlgorithmDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(algoDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnCreateAlgorithmPerformance(
-    cudnnAlgorithmPerformance_t *algoPerf, int numberToCreate) {
-  using FuncPtr =
-      cudnnStatus_t(CUDNNWINAPI *)(cudnnAlgorithmPerformance_t *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnCreateAlgorithmPerformance");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(algoPerf, numberToCreate);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetAlgorithmPerformance(
-    cudnnAlgorithmPerformance_t algoPerf, cudnnAlgorithmDescriptor_t algoDesc,
-    cudnnStatus_t status, float time, size_t memory) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnAlgorithmPerformance_t,
-                                               cudnnAlgorithmDescriptor_t,
-                                               cudnnStatus_t, float, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetAlgorithmPerformance");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(algoPerf, algoDesc, status, time, memory);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetAlgorithmPerformance(
-    const cudnnAlgorithmPerformance_t algoPerf,
-    cudnnAlgorithmDescriptor_t *algoDesc, cudnnStatus_t *status, float *time,
-    size_t *memory) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      const cudnnAlgorithmPerformance_t, cudnnAlgorithmDescriptor_t *,
-      cudnnStatus_t *, float *, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetAlgorithmPerformance");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(algoPerf, algoDesc, status, time, memory);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnDestroyAlgorithmPerformance(
-    cudnnAlgorithmPerformance_t *algoPerf, int numberToDestroy) {
-  using FuncPtr =
-      cudnnStatus_t(CUDNNWINAPI *)(cudnnAlgorithmPerformance_t *, int);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnDestroyAlgorithmPerformance");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(algoPerf, numberToDestroy);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetAlgorithmSpaceSize(
-    cudnnHandle_t handle, cudnnAlgorithmDescriptor_t algoDesc,
-    size_t *algoSpaceSizeInBytes) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, cudnnAlgorithmDescriptor_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetAlgorithmSpaceSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, algoDesc, algoSpaceSizeInBytes);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnSaveAlgorithm(cudnnHandle_t handle, cudnnAlgorithmDescriptor_t algoDesc,
-                   void *algoSpace, size_t algoSpaceSizeInBytes) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, cudnnAlgorithmDescriptor_t, void *, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSaveAlgorithm");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, algoDesc, algoSpace, algoSpaceSizeInBytes);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnRestoreAlgorithm(
-    cudnnHandle_t handle, void *algoSpace, size_t algoSpaceSizeInBytes,
-    cudnnAlgorithmDescriptor_t algoDesc) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnHandle_t, void *, size_t,
-                                               cudnnAlgorithmDescriptor_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnRestoreAlgorithm");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, algoSpace, algoSpaceSizeInBytes, algoDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnRNNSetClip(cudnnHandle_t handle,
-                                          cudnnRNNDescriptor_t rnnDesc,
-                                          cudnnRNNClipMode_t clipMode,
-                                          cudnnNanPropagation_t clipNanOpt,
-                                          double lclip, double rclip) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, cudnnRNNDescriptor_t, cudnnRNNClipMode_t,
-      cudnnNanPropagation_t, double, double);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnRNNSetClip");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, rnnDesc, clipMode, clipNanOpt, lclip, rclip);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnRNNGetClip(cudnnHandle_t handle,
-                                          cudnnRNNDescriptor_t rnnDesc,
-                                          cudnnRNNClipMode_t *clipMode,
-                                          cudnnNanPropagation_t *clipNanOpt,
-                                          double *lclip, double *rclip) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, cudnnRNNDescriptor_t, cudnnRNNClipMode_t *,
-      cudnnNanPropagation_t *, double *, double *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnRNNGetClip");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, rnnDesc, clipMode, clipNanOpt, lclip, rclip);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetCallback(unsigned mask, void *udata,
-                                           cudnnCallback_t fptr) {
-  using FuncPtr =
-      cudnnStatus_t(CUDNNWINAPI *)(unsigned int, void *, cudnnCallback_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetCallback");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(mask, udata, fptr);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetCallback(unsigned *mask, void **udata,
-                                           cudnnCallback_t *fptr) {
-  using FuncPtr =
-      cudnnStatus_t(CUDNNWINAPI *)(unsigned int *, void **, cudnnCallback_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetCallback");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(mask, udata, fptr);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetRNNPaddingMode(
-    cudnnRNNDescriptor_t rnnDesc, cudnnRNNPaddingMode_t paddingMode) {
-  using FuncPtr =
-      cudnnStatus_t(CUDNNWINAPI *)(cudnnRNNDescriptor_t, cudnnRNNPaddingMode_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetRNNPaddingMode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(rnnDesc, paddingMode);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetRNNPaddingMode(
-    cudnnRNNDescriptor_t rnnDesc, cudnnRNNPaddingMode_t *paddingMode) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnRNNDescriptor_t,
-                                               cudnnRNNPaddingMode_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetRNNPaddingMode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(rnnDesc, paddingMode);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnCreateRNNDataDescriptor(cudnnRNNDataDescriptor_t *RNNDataDesc) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnRNNDataDescriptor_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnCreateRNNDataDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(RNNDataDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnDestroyRNNDataDescriptor(cudnnRNNDataDescriptor_t RNNDataDesc) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnRNNDataDescriptor_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDestroyRNNDataDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(RNNDataDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetRNNDataDescriptor(
-    cudnnRNNDataDescriptor_t RNNDataDesc, cudnnDataType_t dataType,
-    cudnnRNNDataLayout_t layout, int maxSeqLength, int batchSize,
-    int vectorSize,
-    const int seqLengthArray[], /* length of each sequence in the batch */
-    void *paddingFill) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnRNNDataDescriptor_t, cudnnDataType_t, cudnnRNNDataLayout_t, int, int,
-      int, const int[], void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetRNNDataDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(RNNDataDesc, dataType, layout, maxSeqLength, batchSize,
-                  vectorSize, seqLengthArray, paddingFill);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetRNNDataDescriptor(
-    cudnnRNNDataDescriptor_t RNNDataDesc, cudnnDataType_t *dataType,
-    cudnnRNNDataLayout_t *layout, int *maxSeqLength, int *batchSize,
-    int *vectorSize, int arrayLengthRequested, int seqLengthArray[],
-    void *paddingFill) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnRNNDataDescriptor_t, cudnnDataType_t *, cudnnRNNDataLayout_t *,
-      int *, int *, int *, int, int[], void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetRNNDataDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(RNNDataDesc, dataType, layout, maxSeqLength, batchSize,
-                  vectorSize, arrayLengthRequested, seqLengthArray,
-                  paddingFill);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnRNNForwardTrainingEx(
-    cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc,
-    const cudnnRNNDataDescriptor_t xDesc, const void *x,
-    const cudnnTensorDescriptor_t hxDesc, const void *hx,
-    const cudnnTensorDescriptor_t cxDesc, const void *cx,
-    const cudnnFilterDescriptor_t wDesc, const void *w,
-    const cudnnRNNDataDescriptor_t yDesc, void *y,
-    const cudnnTensorDescriptor_t hyDesc, void *hy,
-    const cudnnTensorDescriptor_t cyDesc, void *cy,
-    const cudnnRNNDataDescriptor_t kDesc, /* reserved, should pass NULL */
-    const void *keys,                     /* reserved, should pass NULL */
-    const cudnnRNNDataDescriptor_t cDesc, /* reserved, should pass NULL */
-    void *cAttn,                          /* reserved, should pass NULL */
-    const cudnnRNNDataDescriptor_t iDesc, /* reserved, should pass NULL */
-    void *iAttn,                          /* reserved, should pass NULL */
-    const cudnnRNNDataDescriptor_t qDesc, /* reserved, should pass NULL */
-    void *queries,                        /* reserved, should pass NULL */
-    void *workSpace, size_t workSpaceSizeInBytes, void *reserveSpace,
-    size_t reserveSpaceSizeInBytes) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnRNNDescriptor_t, const cudnnRNNDataDescriptor_t,
-      const void *, const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnFilterDescriptor_t, const void *,
-      const cudnnRNNDataDescriptor_t, void *, const cudnnTensorDescriptor_t,
-      void *, const cudnnTensorDescriptor_t, void *,
-      const cudnnRNNDataDescriptor_t, const void *,
-      const cudnnRNNDataDescriptor_t, void *, const cudnnRNNDataDescriptor_t,
-      void *, const cudnnRNNDataDescriptor_t, void *, void *, size_t, void *,
-      size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnRNNForwardTrainingEx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, rnnDesc, xDesc, x, hxDesc, hx, cxDesc, cx, wDesc, w,
-                  yDesc, y, hyDesc, hy, cyDesc, cy, kDesc, keys, cDesc, cAttn,
-                  iDesc, iAttn, qDesc, queries, workSpace, workSpaceSizeInBytes,
-                  reserveSpace, reserveSpaceSizeInBytes);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnRNNForwardInferenceEx(
-    cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc,
-    const cudnnRNNDataDescriptor_t xDesc, const void *x,
-    const cudnnTensorDescriptor_t hxDesc, const void *hx,
-    const cudnnTensorDescriptor_t cxDesc, const void *cx,
-    const cudnnFilterDescriptor_t wDesc, const void *w,
-    const cudnnRNNDataDescriptor_t yDesc, void *y,
-    const cudnnTensorDescriptor_t hyDesc, void *hy,
-    const cudnnTensorDescriptor_t cyDesc, void *cy,
-    const cudnnRNNDataDescriptor_t kDesc, /* reserved, should pass NULL */
-    const void *keys,                     /* reserved, should pass NULL */
-    const cudnnRNNDataDescriptor_t cDesc, /* reserved, should pass NULL */
-    void *cAttn,                          /* reserved, should pass NULL */
-    const cudnnRNNDataDescriptor_t iDesc, /* reserved, should pass NULL */
-    void *iAttn,                          /* reserved, should pass NULL */
-    const cudnnRNNDataDescriptor_t qDesc, /* reserved, should pass NULL */
-    void *queries,                        /* reserved, should pass NULL */
-    void *workSpace, size_t workSpaceSizeInBytes) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnRNNDescriptor_t, const cudnnRNNDataDescriptor_t,
-      const void *, const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnFilterDescriptor_t, const void *,
-      const cudnnRNNDataDescriptor_t, void *, const cudnnTensorDescriptor_t,
-      void *, const cudnnTensorDescriptor_t, void *,
-      const cudnnRNNDataDescriptor_t, const void *,
-      const cudnnRNNDataDescriptor_t, void *, const cudnnRNNDataDescriptor_t,
-      void *, const cudnnRNNDataDescriptor_t, void *, void *, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnRNNForwardInferenceEx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, rnnDesc, xDesc, x, hxDesc, hx, cxDesc, cx, wDesc, w,
-                  yDesc, y, hyDesc, hy, cyDesc, cy, kDesc, keys, cDesc, cAttn,
-                  iDesc, iAttn, qDesc, queries, workSpace,
-                  workSpaceSizeInBytes);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnRNNBackwardDataEx(
-    cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc,
-    const cudnnRNNDataDescriptor_t yDesc, const void *y,
-    const cudnnRNNDataDescriptor_t dyDesc, const void *dy,
-    const cudnnRNNDataDescriptor_t dcDesc, /* reserved, should pass NULL */
-    const void *dcAttn,                    /* reserved, should pass NULL */
-    const cudnnTensorDescriptor_t dhyDesc, const void *dhy,
-    const cudnnTensorDescriptor_t dcyDesc, const void *dcy,
-    const cudnnFilterDescriptor_t wDesc, const void *w,
-    const cudnnTensorDescriptor_t hxDesc, const void *hx,
-    const cudnnTensorDescriptor_t cxDesc, const void *cx,
-    const cudnnRNNDataDescriptor_t dxDesc, void *dx,
-    const cudnnTensorDescriptor_t dhxDesc, void *dhx,
-    const cudnnTensorDescriptor_t dcxDesc, void *dcx,
-    const cudnnRNNDataDescriptor_t dkDesc, /* reserved, should pass NULL */
-    void *dkeys,                           /* reserved, should pass NULL */
-    void *workSpace, size_t workSpaceSizeInBytes, void *reserveSpace,
-    size_t reserveSpaceSizeInBytes) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnRNNDescriptor_t, const cudnnRNNDataDescriptor_t,
-      const void *, const cudnnRNNDataDescriptor_t, const void *,
-      const cudnnRNNDataDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnFilterDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnRNNDataDescriptor_t, void *, const cudnnTensorDescriptor_t,
-      void *, const cudnnTensorDescriptor_t, void *,
-      const cudnnRNNDataDescriptor_t, void *, void *, size_t, void *, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnRNNBackwardDataEx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, rnnDesc, yDesc, y, dyDesc, dy, dcDesc, dcAttn,
-                  dhyDesc, dhy, dcyDesc, dcy, wDesc, w, hxDesc, hx, cxDesc, cx,
-                  dxDesc, dx, dhxDesc, dhx, dcxDesc, dcx, dkDesc, dkeys,
-                  workSpace, workSpaceSizeInBytes, reserveSpace,
-                  reserveSpaceSizeInBytes);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnRNNBackwardWeightsEx(
-    cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc,
-    const cudnnRNNDataDescriptor_t xDesc, const void *x,
-    const cudnnTensorDescriptor_t hxDesc, const void *hx,
-    const cudnnRNNDataDescriptor_t yDesc, const void *y, void *workSpace,
-    size_t workSpaceSizeInBytes, const cudnnFilterDescriptor_t dwDesc, void *dw,
-    void *reserveSpace, size_t reserveSpaceSizeInBytes) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnRNNDescriptor_t, const cudnnRNNDataDescriptor_t,
-      const void *, const cudnnTensorDescriptor_t, const void *,
-      const cudnnRNNDataDescriptor_t, const void *, void *, size_t,
-      const cudnnFilterDescriptor_t, void *, void *, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnRNNBackwardWeightsEx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, rnnDesc, xDesc, x, hxDesc, hx, yDesc, y, workSpace,
-                  workSpaceSizeInBytes, dwDesc, dw, reserveSpace,
-                  reserveSpaceSizeInBytes);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetRNNDescriptor_v6(
-    cudnnHandle_t handle, cudnnRNNDescriptor_t rnnDesc, const int hiddenSize,
-    const int numLayers, cudnnDropoutDescriptor_t dropoutDesc,
-    cudnnRNNInputMode_t inputMode, cudnnDirectionMode_t direction,
-    cudnnRNNMode_t mode, cudnnRNNAlgo_t algo, cudnnDataType_t dataType) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, cudnnRNNDescriptor_t, const int, const int,
-      cudnnDropoutDescriptor_t, cudnnRNNInputMode_t, cudnnDirectionMode_t,
-      cudnnRNNMode_t, cudnnRNNAlgo_t, cudnnDataType_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetRNNDescriptor_v6");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, rnnDesc, hiddenSize, numLayers, dropoutDesc,
-                  inputMode, direction, mode, algo, dataType);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetRNNDescriptor_v5(
-    cudnnRNNDescriptor_t rnnDesc, int hiddenSize, int numLayers,
-    cudnnDropoutDescriptor_t dropoutDesc, cudnnRNNInputMode_t inputMode,
-    cudnnDirectionMode_t direction, cudnnRNNMode_t mode,
-    cudnnDataType_t dataType) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnRNNDescriptor_t, int, int, cudnnDropoutDescriptor_t,
-      cudnnRNNInputMode_t, cudnnDirectionMode_t, cudnnRNNMode_t,
-      cudnnDataType_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetRNNDescriptor_v5");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(rnnDesc, hiddenSize, numLayers, dropoutDesc, inputMode,
-                  direction, mode, dataType);
-}
-
-}  // extern "C"
diff --git a/third_party/xla/third_party/tsl/tsl/cuda/cudnn_7_4.inc b/third_party/xla/third_party/tsl/tsl/cuda/cudnn_7_4.inc
deleted file mode 100644
index 883c8ba8812ceb..00000000000000
--- a/third_party/xla/third_party/tsl/tsl/cuda/cudnn_7_4.inc
+++ /dev/null
@@ -1,2726 +0,0 @@
-// Auto-generated, do not edit.
-
-extern "C" {
-
-size_t CUDNNWINAPI cudnnGetVersion(void) {
-  using FuncPtr = size_t(CUDNNWINAPI *)();
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetVersion");
-  if (!func_ptr) return 0;
-  return func_ptr();
-}
-
-size_t CUDNNWINAPI cudnnGetCudartVersion(void) {
-  using FuncPtr = size_t(CUDNNWINAPI *)();
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetCudartVersion");
-  if (!func_ptr) return 0;
-  return func_ptr();
-}
-
-const char *CUDNNWINAPI cudnnGetErrorString(cudnnStatus_t status) {
-  using FuncPtr = const char *(CUDNNWINAPI *)(cudnnStatus_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetErrorString");
-  if (!func_ptr) return "cudnnGetErrorString symbol not found.";
-  return func_ptr(status);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnQueryRuntimeError(cudnnHandle_t handle,
-                                                 cudnnStatus_t *rstatus,
-                                                 cudnnErrQueryMode_t mode,
-                                                 cudnnRuntimeTag_t *tag) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, cudnnStatus_t *, cudnnErrQueryMode_t, cudnnRuntimeTag_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnQueryRuntimeError");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, rstatus, mode, tag);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetProperty(libraryPropertyType type,
-                                           int *value) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(libraryPropertyType, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetProperty");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(type, value);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnCreate(cudnnHandle_t *handle) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnHandle_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnCreate");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnDestroy(cudnnHandle_t handle) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnHandle_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDestroy");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetStream(cudnnHandle_t handle,
-                                         cudaStream_t streamId) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnHandle_t, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetStream");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, streamId);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetStream(cudnnHandle_t handle,
-                                         cudaStream_t *streamId) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnHandle_t, cudaStream_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetStream");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, streamId);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnCreateTensorDescriptor(cudnnTensorDescriptor_t *tensorDesc) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnTensorDescriptor_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnCreateTensorDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(tensorDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetTensor4dDescriptor(
-    cudnnTensorDescriptor_t tensorDesc, cudnnTensorFormat_t format,
-    cudnnDataType_t dataType, /* image data type */
-    int n,                    /* number of inputs (batch size) */
-    int c,                    /* number of input feature maps */
-    int h,                    /* height of input section */
-    int w) {
-  using FuncPtr =
-      cudnnStatus_t(CUDNNWINAPI *)(cudnnTensorDescriptor_t, cudnnTensorFormat_t,
-                                   cudnnDataType_t, int, int, int, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetTensor4dDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(tensorDesc, format, dataType, n, c, h, w);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetTensor4dDescriptorEx(
-    cudnnTensorDescriptor_t tensorDesc,
-    cudnnDataType_t dataType, /* image data type */
-    int n,                    /* number of inputs (batch size) */
-    int c,                    /* number of input feature maps */
-    int h,                    /* height of input section */
-    int w,                    /* width of input section */
-    int nStride, int cStride, int hStride, int wStride) {
-  using FuncPtr =
-      cudnnStatus_t(CUDNNWINAPI *)(cudnnTensorDescriptor_t, cudnnDataType_t,
-                                   int, int, int, int, int, int, int, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetTensor4dDescriptorEx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(tensorDesc, dataType, n, c, h, w, nStride, cStride, hStride,
-                  wStride);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetTensor4dDescriptor(
-    const cudnnTensorDescriptor_t tensorDesc,
-    cudnnDataType_t *dataType, /* image data type */
-    int *n,                    /* number of inputs (batch size) */
-    int *c,                    /* number of input feature maps  */
-    int *h,                    /* height of input section */
-    int *w,                    /* width of input section */
-    int *nStride, int *cStride, int *hStride, int *wStride) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      const cudnnTensorDescriptor_t, cudnnDataType_t *, int *, int *, int *,
-      int *, int *, int *, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetTensor4dDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(tensorDesc, dataType, n, c, h, w, nStride, cStride, hStride,
-                  wStride);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetTensorNdDescriptor(
-    cudnnTensorDescriptor_t tensorDesc, cudnnDataType_t dataType, int nbDims,
-    const int dimA[], const int strideA[]) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnTensorDescriptor_t, cudnnDataType_t, int, const int[], const int[]);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetTensorNdDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(tensorDesc, dataType, nbDims, dimA, strideA);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetTensorNdDescriptorEx(
-    cudnnTensorDescriptor_t tensorDesc, cudnnTensorFormat_t format,
-    cudnnDataType_t dataType, int nbDims, const int dimA[]) {
-  using FuncPtr =
-      cudnnStatus_t(CUDNNWINAPI *)(cudnnTensorDescriptor_t, cudnnTensorFormat_t,
-                                   cudnnDataType_t, int, const int[]);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetTensorNdDescriptorEx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(tensorDesc, format, dataType, nbDims, dimA);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetTensorNdDescriptor(
-    const cudnnTensorDescriptor_t tensorDesc, int nbDimsRequested,
-    cudnnDataType_t *dataType, int *nbDims, int dimA[], int strideA[]) {
-  using FuncPtr =
-      cudnnStatus_t(CUDNNWINAPI *)(const cudnnTensorDescriptor_t, int,
-                                   cudnnDataType_t *, int *, int[], int[]);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetTensorNdDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(tensorDesc, nbDimsRequested, dataType, nbDims, dimA, strideA);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetTensorSizeInBytes(
-    const cudnnTensorDescriptor_t tensorDesc, size_t *size) {
-  using FuncPtr =
-      cudnnStatus_t(CUDNNWINAPI *)(const cudnnTensorDescriptor_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetTensorSizeInBytes");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(tensorDesc, size);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnDestroyTensorDescriptor(cudnnTensorDescriptor_t tensorDesc) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnTensorDescriptor_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDestroyTensorDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(tensorDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnTransformTensor(
-    cudnnHandle_t handle, const void *alpha,
-    const cudnnTensorDescriptor_t xDesc, const void *x, const void *beta,
-    const cudnnTensorDescriptor_t yDesc, void *y) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const void *, const cudnnTensorDescriptor_t, const void *,
-      const void *, const cudnnTensorDescriptor_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnTransformTensor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, alpha, xDesc, x, beta, yDesc, y);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnAddTensor(cudnnHandle_t handle,
-                                         const void *alpha,
-                                         const cudnnTensorDescriptor_t aDesc,
-                                         const void *A, const void *beta,
-                                         const cudnnTensorDescriptor_t cDesc,
-                                         void *C) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const void *, const cudnnTensorDescriptor_t, const void *,
-      const void *, const cudnnTensorDescriptor_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnAddTensor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, alpha, aDesc, A, beta, cDesc, C);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnCreateOpTensorDescriptor(cudnnOpTensorDescriptor_t *opTensorDesc) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnOpTensorDescriptor_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnCreateOpTensorDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(opTensorDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetOpTensorDescriptor(
-    cudnnOpTensorDescriptor_t opTensorDesc, cudnnOpTensorOp_t opTensorOp,
-    cudnnDataType_t opTensorCompType, cudnnNanPropagation_t opTensorNanOpt) {
-  using FuncPtr =
-      cudnnStatus_t(CUDNNWINAPI *)(cudnnOpTensorDescriptor_t, cudnnOpTensorOp_t,
-                                   cudnnDataType_t, cudnnNanPropagation_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetOpTensorDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(opTensorDesc, opTensorOp, opTensorCompType, opTensorNanOpt);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetOpTensorDescriptor(
-    const cudnnOpTensorDescriptor_t opTensorDesc, cudnnOpTensorOp_t *opTensorOp,
-    cudnnDataType_t *opTensorCompType, cudnnNanPropagation_t *opTensorNanOpt) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      const cudnnOpTensorDescriptor_t, cudnnOpTensorOp_t *, cudnnDataType_t *,
-      cudnnNanPropagation_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetOpTensorDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(opTensorDesc, opTensorOp, opTensorCompType, opTensorNanOpt);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnDestroyOpTensorDescriptor(cudnnOpTensorDescriptor_t opTensorDesc) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnOpTensorDescriptor_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDestroyOpTensorDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(opTensorDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnOpTensor(
-    cudnnHandle_t handle, const cudnnOpTensorDescriptor_t opTensorDesc,
-    const void *alpha1, const cudnnTensorDescriptor_t aDesc, const void *A,
-    const void *alpha2, const cudnnTensorDescriptor_t bDesc, const void *B,
-    const void *beta, const cudnnTensorDescriptor_t cDesc, void *C) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnOpTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *, const void *,
-      const cudnnTensorDescriptor_t, const void *, const void *,
-      const cudnnTensorDescriptor_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnOpTensor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, opTensorDesc, alpha1, aDesc, A, alpha2, bDesc, B,
-                  beta, cDesc, C);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnCreateReduceTensorDescriptor(
-    cudnnReduceTensorDescriptor_t *reduceTensorDesc) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnReduceTensorDescriptor_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnCreateReduceTensorDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(reduceTensorDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetReduceTensorDescriptor(
-    cudnnReduceTensorDescriptor_t reduceTensorDesc,
-    cudnnReduceTensorOp_t reduceTensorOp, cudnnDataType_t reduceTensorCompType,
-    cudnnNanPropagation_t reduceTensorNanOpt,
-    cudnnReduceTensorIndices_t reduceTensorIndices,
-    cudnnIndicesType_t reduceTensorIndicesType) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnReduceTensorDescriptor_t, cudnnReduceTensorOp_t, cudnnDataType_t,
-      cudnnNanPropagation_t, cudnnReduceTensorIndices_t, cudnnIndicesType_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetReduceTensorDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(reduceTensorDesc, reduceTensorOp, reduceTensorCompType,
-                  reduceTensorNanOpt, reduceTensorIndices,
-                  reduceTensorIndicesType);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetReduceTensorDescriptor(
-    const cudnnReduceTensorDescriptor_t reduceTensorDesc,
-    cudnnReduceTensorOp_t *reduceTensorOp,
-    cudnnDataType_t *reduceTensorCompType,
-    cudnnNanPropagation_t *reduceTensorNanOpt,
-    cudnnReduceTensorIndices_t *reduceTensorIndices,
-    cudnnIndicesType_t *reduceTensorIndicesType) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      const cudnnReduceTensorDescriptor_t, cudnnReduceTensorOp_t *,
-      cudnnDataType_t *, cudnnNanPropagation_t *, cudnnReduceTensorIndices_t *,
-      cudnnIndicesType_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetReduceTensorDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(reduceTensorDesc, reduceTensorOp, reduceTensorCompType,
-                  reduceTensorNanOpt, reduceTensorIndices,
-                  reduceTensorIndicesType);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnDestroyReduceTensorDescriptor(
-    cudnnReduceTensorDescriptor_t reduceTensorDesc) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnReduceTensorDescriptor_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnDestroyReduceTensorDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(reduceTensorDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetReductionIndicesSize(
-    cudnnHandle_t handle, const cudnnReduceTensorDescriptor_t reduceTensorDesc,
-    const cudnnTensorDescriptor_t aDesc, const cudnnTensorDescriptor_t cDesc,
-    size_t *sizeInBytes) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnReduceTensorDescriptor_t,
-      const cudnnTensorDescriptor_t, const cudnnTensorDescriptor_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetReductionIndicesSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, reduceTensorDesc, aDesc, cDesc, sizeInBytes);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetReductionWorkspaceSize(
-    cudnnHandle_t handle, const cudnnReduceTensorDescriptor_t reduceTensorDesc,
-    const cudnnTensorDescriptor_t aDesc, const cudnnTensorDescriptor_t cDesc,
-    size_t *sizeInBytes) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnReduceTensorDescriptor_t,
-      const cudnnTensorDescriptor_t, const cudnnTensorDescriptor_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetReductionWorkspaceSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, reduceTensorDesc, aDesc, cDesc, sizeInBytes);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnReduceTensor(
-    cudnnHandle_t handle, const cudnnReduceTensorDescriptor_t reduceTensorDesc,
-    void *indices, size_t indicesSizeInBytes, void *workspace,
-    size_t workspaceSizeInBytes, const void *alpha,
-    const cudnnTensorDescriptor_t aDesc, const void *A, const void *beta,
-    const cudnnTensorDescriptor_t cDesc, void *C) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnReduceTensorDescriptor_t, void *, size_t,
-      void *, size_t, const void *, const cudnnTensorDescriptor_t, const void *,
-      const void *, const cudnnTensorDescriptor_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnReduceTensor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, reduceTensorDesc, indices, indicesSizeInBytes,
-                  workspace, workspaceSizeInBytes, alpha, aDesc, A, beta, cDesc,
-                  C);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetTensor(cudnnHandle_t handle,
-                                         const cudnnTensorDescriptor_t yDesc,
-                                         void *y, const void *valuePtr) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnTensorDescriptor_t, void *, const void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetTensor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, yDesc, y, valuePtr);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnScaleTensor(cudnnHandle_t handle,
-                                           const cudnnTensorDescriptor_t yDesc,
-                                           void *y, const void *alpha) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnTensorDescriptor_t, void *, const void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnScaleTensor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, yDesc, y, alpha);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnCreateFilterDescriptor(cudnnFilterDescriptor_t *filterDesc) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnFilterDescriptor_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnCreateFilterDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(filterDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetFilter4dDescriptor(
-    cudnnFilterDescriptor_t filterDesc,
-    cudnnDataType_t dataType,          /* image data type */
-    cudnnTensorFormat_t format, int k, /* number of output feature maps */
-    int c,                             /* number of input feature maps */
-    int h,                             /* height of each input filter */
-    int w) {
-  using FuncPtr =
-      cudnnStatus_t(CUDNNWINAPI *)(cudnnFilterDescriptor_t, cudnnDataType_t,
-                                   cudnnTensorFormat_t, int, int, int, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetFilter4dDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(filterDesc, dataType, format, k, c, h, w);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetFilter4dDescriptor(
-    const cudnnFilterDescriptor_t filterDesc,
-    cudnnDataType_t *dataType,           /* image data type */
-    cudnnTensorFormat_t *format, int *k, /* number of output feature maps */
-    int *c,                              /* number of input feature maps */
-    int *h,                              /* height of each input filter */
-    int *w) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      const cudnnFilterDescriptor_t, cudnnDataType_t *, cudnnTensorFormat_t *,
-      int *, int *, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetFilter4dDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(filterDesc, dataType, format, k, c, h, w);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetFilterNdDescriptor(
-    cudnnFilterDescriptor_t filterDesc,
-    cudnnDataType_t dataType, /* image data type */
-    cudnnTensorFormat_t format, int nbDims, const int filterDimA[]) {
-  using FuncPtr =
-      cudnnStatus_t(CUDNNWINAPI *)(cudnnFilterDescriptor_t, cudnnDataType_t,
-                                   cudnnTensorFormat_t, int, const int[]);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetFilterNdDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(filterDesc, dataType, format, nbDims, filterDimA);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetFilterNdDescriptor(
-    const cudnnFilterDescriptor_t filterDesc, int nbDimsRequested,
-    cudnnDataType_t *dataType, /* image data type */
-    cudnnTensorFormat_t *format, int *nbDims, int filterDimA[]) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      const cudnnFilterDescriptor_t, int, cudnnDataType_t *,
-      cudnnTensorFormat_t *, int *, int[]);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetFilterNdDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(filterDesc, nbDimsRequested, dataType, format, nbDims,
-                  filterDimA);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnDestroyFilterDescriptor(cudnnFilterDescriptor_t filterDesc) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnFilterDescriptor_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDestroyFilterDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(filterDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnCreateConvolutionDescriptor(cudnnConvolutionDescriptor_t *convDesc) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnConvolutionDescriptor_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnCreateConvolutionDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(convDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetConvolutionMathType(
-    cudnnConvolutionDescriptor_t convDesc, cudnnMathType_t mathType) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnConvolutionDescriptor_t,
-                                               cudnnMathType_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetConvolutionMathType");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(convDesc, mathType);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionMathType(
-    cudnnConvolutionDescriptor_t convDesc, cudnnMathType_t *mathType) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnConvolutionDescriptor_t,
-                                               cudnnMathType_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetConvolutionMathType");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(convDesc, mathType);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetConvolutionGroupCount(
-    cudnnConvolutionDescriptor_t convDesc, int groupCount) {
-  using FuncPtr =
-      cudnnStatus_t(CUDNNWINAPI *)(cudnnConvolutionDescriptor_t, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetConvolutionGroupCount");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(convDesc, groupCount);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionGroupCount(
-    cudnnConvolutionDescriptor_t convDesc, int *groupCount) {
-  using FuncPtr =
-      cudnnStatus_t(CUDNNWINAPI *)(cudnnConvolutionDescriptor_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetConvolutionGroupCount");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(convDesc, groupCount);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetConvolution2dDescriptor(
-    cudnnConvolutionDescriptor_t convDesc, int pad_h, /* zero-padding height */
-    int pad_w,                                        /* zero-padding width */
-    int u,          /* vertical filter stride */
-    int v,          /* horizontal filter stride */
-    int dilation_h, /* filter dilation in the vertical dimension */
-    int dilation_w, /* filter dilation in the horizontal dimension */
-    cudnnConvolutionMode_t mode, cudnnDataType_t computeType) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnConvolutionDescriptor_t, int, int, int, int, int, int,
-      cudnnConvolutionMode_t, cudnnDataType_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetConvolution2dDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(convDesc, pad_h, pad_w, u, v, dilation_h, dilation_w, mode,
-                  computeType);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetConvolution2dDescriptor(
-    const cudnnConvolutionDescriptor_t convDesc,
-    int *pad_h,      /* zero-padding height */
-    int *pad_w,      /* zero-padding width */
-    int *u,          /* vertical filter stride */
-    int *v,          /* horizontal filter stride */
-    int *dilation_h, /* filter dilation in the vertical dimension */
-    int *dilation_w, /* filter dilation in the horizontal dimension */
-    cudnnConvolutionMode_t *mode, cudnnDataType_t *computeType) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      const cudnnConvolutionDescriptor_t, int *, int *, int *, int *, int *,
-      int *, cudnnConvolutionMode_t *, cudnnDataType_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetConvolution2dDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(convDesc, pad_h, pad_w, u, v, dilation_h, dilation_w, mode,
-                  computeType);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetConvolution2dForwardOutputDim(
-    const cudnnConvolutionDescriptor_t convDesc,
-    const cudnnTensorDescriptor_t inputTensorDesc,
-    const cudnnFilterDescriptor_t filterDesc, int *n, int *c, int *h, int *w) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t,
-      const cudnnFilterDescriptor_t, int *, int *, int *, int *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnGetConvolution2dForwardOutputDim");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(convDesc, inputTensorDesc, filterDesc, n, c, h, w);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetConvolutionNdDescriptor(
-    cudnnConvolutionDescriptor_t convDesc, int arrayLength, /* nbDims-2 size */
-    const int padA[], const int filterStrideA[], const int dilationA[],
-    cudnnConvolutionMode_t mode, cudnnDataType_t computeType) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnConvolutionDescriptor_t, int, const int[], const int[], const int[],
-      cudnnConvolutionMode_t, cudnnDataType_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetConvolutionNdDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(convDesc, arrayLength, padA, filterStrideA, dilationA, mode,
-                  computeType);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionNdDescriptor(
-    const cudnnConvolutionDescriptor_t convDesc, int arrayLengthRequested,
-    int *arrayLength, int padA[], int strideA[], int dilationA[],
-    cudnnConvolutionMode_t *mode, cudnnDataType_t *computeType) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      const cudnnConvolutionDescriptor_t, int, int *, int[], int[], int[],
-      cudnnConvolutionMode_t *, cudnnDataType_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetConvolutionNdDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(convDesc, arrayLengthRequested, arrayLength, padA, strideA,
-                  dilationA, mode, computeType);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionNdForwardOutputDim(
-    const cudnnConvolutionDescriptor_t convDesc,
-    const cudnnTensorDescriptor_t inputTensorDesc,
-    const cudnnFilterDescriptor_t filterDesc, int nbDims,
-    int tensorOutputDimA[]) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t,
-      const cudnnFilterDescriptor_t, int, int[]);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnGetConvolutionNdForwardOutputDim");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(convDesc, inputTensorDesc, filterDesc, nbDims,
-                  tensorOutputDimA);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnDestroyConvolutionDescriptor(cudnnConvolutionDescriptor_t convDesc) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnConvolutionDescriptor_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnDestroyConvolutionDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(convDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnGetConvolutionForwardAlgorithmMaxCount(cudnnHandle_t handle, int *count) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnHandle_t, int *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnGetConvolutionForwardAlgorithmMaxCount");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, count);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnFindConvolutionForwardAlgorithm(
-    cudnnHandle_t handle, const cudnnTensorDescriptor_t xDesc,
-    const cudnnFilterDescriptor_t wDesc,
-    const cudnnConvolutionDescriptor_t convDesc,
-    const cudnnTensorDescriptor_t yDesc, const int requestedAlgoCount,
-    int *returnedAlgoCount, cudnnConvolutionFwdAlgoPerf_t *perfResults) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnTensorDescriptor_t,
-      const cudnnFilterDescriptor_t, const cudnnConvolutionDescriptor_t,
-      const cudnnTensorDescriptor_t, const int, int *,
-      cudnnConvolutionFwdAlgoPerf_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnFindConvolutionForwardAlgorithm");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, xDesc, wDesc, convDesc, yDesc, requestedAlgoCount,
-                  returnedAlgoCount, perfResults);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnFindConvolutionForwardAlgorithmEx(
-    cudnnHandle_t handle, const cudnnTensorDescriptor_t xDesc, const void *x,
-    const cudnnFilterDescriptor_t wDesc, const void *w,
-    const cudnnConvolutionDescriptor_t convDesc,
-    const cudnnTensorDescriptor_t yDesc, void *y, const int requestedAlgoCount,
-    int *returnedAlgoCount, cudnnConvolutionFwdAlgoPerf_t *perfResults,
-    void *workSpace, size_t workSpaceSizeInBytes) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnTensorDescriptor_t, const void *,
-      const cudnnFilterDescriptor_t, const void *,
-      const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t, void *,
-      const int, int *, cudnnConvolutionFwdAlgoPerf_t *, void *, size_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnFindConvolutionForwardAlgorithmEx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, xDesc, x, wDesc, w, convDesc, yDesc, y,
-                  requestedAlgoCount, returnedAlgoCount, perfResults, workSpace,
-                  workSpaceSizeInBytes);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionForwardAlgorithm(
-    cudnnHandle_t handle, const cudnnTensorDescriptor_t xDesc,
-    const cudnnFilterDescriptor_t wDesc,
-    const cudnnConvolutionDescriptor_t convDesc,
-    const cudnnTensorDescriptor_t yDesc,
-    cudnnConvolutionFwdPreference_t preference, size_t memoryLimitInBytes,
-    cudnnConvolutionFwdAlgo_t *algo) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnTensorDescriptor_t,
-      const cudnnFilterDescriptor_t, const cudnnConvolutionDescriptor_t,
-      const cudnnTensorDescriptor_t, cudnnConvolutionFwdPreference_t, size_t,
-      cudnnConvolutionFwdAlgo_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnGetConvolutionForwardAlgorithm");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, xDesc, wDesc, convDesc, yDesc, preference,
-                  memoryLimitInBytes, algo);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionForwardAlgorithm_v7(
-    cudnnHandle_t handle, const cudnnTensorDescriptor_t srcDesc,
-    const cudnnFilterDescriptor_t filterDesc,
-    const cudnnConvolutionDescriptor_t convDesc,
-    const cudnnTensorDescriptor_t destDesc, const int requestedAlgoCount,
-    int *returnedAlgoCount, cudnnConvolutionFwdAlgoPerf_t *perfResults) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnTensorDescriptor_t,
-      const cudnnFilterDescriptor_t, const cudnnConvolutionDescriptor_t,
-      const cudnnTensorDescriptor_t, const int, int *,
-      cudnnConvolutionFwdAlgoPerf_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnGetConvolutionForwardAlgorithm_v7");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, srcDesc, filterDesc, convDesc, destDesc,
-                  requestedAlgoCount, returnedAlgoCount, perfResults);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionForwardWorkspaceSize(
-    cudnnHandle_t handle, const cudnnTensorDescriptor_t xDesc,
-    const cudnnFilterDescriptor_t wDesc,
-    const cudnnConvolutionDescriptor_t convDesc,
-    const cudnnTensorDescriptor_t yDesc, cudnnConvolutionFwdAlgo_t algo,
-    size_t *sizeInBytes) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnTensorDescriptor_t,
-      const cudnnFilterDescriptor_t, const cudnnConvolutionDescriptor_t,
-      const cudnnTensorDescriptor_t, cudnnConvolutionFwdAlgo_t, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnGetConvolutionForwardWorkspaceSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, xDesc, wDesc, convDesc, yDesc, algo, sizeInBytes);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnConvolutionForward(
-    cudnnHandle_t handle, const void *alpha,
-    const cudnnTensorDescriptor_t xDesc, const void *x,
-    const cudnnFilterDescriptor_t wDesc, const void *w,
-    const cudnnConvolutionDescriptor_t convDesc, cudnnConvolutionFwdAlgo_t algo,
-    void *workSpace, size_t workSpaceSizeInBytes, const void *beta,
-    const cudnnTensorDescriptor_t yDesc, void *y) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const void *, const cudnnTensorDescriptor_t, const void *,
-      const cudnnFilterDescriptor_t, const void *,
-      const cudnnConvolutionDescriptor_t, cudnnConvolutionFwdAlgo_t, void *,
-      size_t, const void *, const cudnnTensorDescriptor_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnConvolutionForward");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, alpha, xDesc, x, wDesc, w, convDesc, algo, workSpace,
-                  workSpaceSizeInBytes, beta, yDesc, y);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnConvolutionBiasActivationForward(
-    cudnnHandle_t handle, const void *alpha1,
-    const cudnnTensorDescriptor_t xDesc, const void *x,
-    const cudnnFilterDescriptor_t wDesc, const void *w,
-    const cudnnConvolutionDescriptor_t convDesc, cudnnConvolutionFwdAlgo_t algo,
-    void *workSpace, size_t workSpaceSizeInBytes, const void *alpha2,
-    const cudnnTensorDescriptor_t zDesc, const void *z,
-    const cudnnTensorDescriptor_t biasDesc, const void *bias,
-    const cudnnActivationDescriptor_t activationDesc,
-    const cudnnTensorDescriptor_t yDesc, void *y) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const void *, const cudnnTensorDescriptor_t, const void *,
-      const cudnnFilterDescriptor_t, const void *,
-      const cudnnConvolutionDescriptor_t, cudnnConvolutionFwdAlgo_t, void *,
-      size_t, const void *, const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnActivationDescriptor_t, const cudnnTensorDescriptor_t, void *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnConvolutionBiasActivationForward");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, alpha1, xDesc, x, wDesc, w, convDesc, algo, workSpace,
-                  workSpaceSizeInBytes, alpha2, zDesc, z, biasDesc, bias,
-                  activationDesc, yDesc, y);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnConvolutionBackwardBias(
-    cudnnHandle_t handle, const void *alpha,
-    const cudnnTensorDescriptor_t dyDesc, const void *dy, const void *beta,
-    const cudnnTensorDescriptor_t dbDesc, void *db) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const void *, const cudnnTensorDescriptor_t, const void *,
-      const void *, const cudnnTensorDescriptor_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnConvolutionBackwardBias");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, alpha, dyDesc, dy, beta, dbDesc, db);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionBackwardFilterAlgorithmMaxCount(
-    cudnnHandle_t handle, int *count) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnHandle_t, int *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnGetConvolutionBackwardFilterAlgorithmMaxCount");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, count);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnFindConvolutionBackwardFilterAlgorithm(
-    cudnnHandle_t handle, const cudnnTensorDescriptor_t xDesc,
-    const cudnnTensorDescriptor_t dyDesc,
-    const cudnnConvolutionDescriptor_t convDesc,
-    const cudnnFilterDescriptor_t dwDesc, const int requestedAlgoCount,
-    int *returnedAlgoCount, cudnnConvolutionBwdFilterAlgoPerf_t *perfResults) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnTensorDescriptor_t,
-      const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t,
-      const cudnnFilterDescriptor_t, const int, int *,
-      cudnnConvolutionBwdFilterAlgoPerf_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnFindConvolutionBackwardFilterAlgorithm");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, xDesc, dyDesc, convDesc, dwDesc, requestedAlgoCount,
-                  returnedAlgoCount, perfResults);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnFindConvolutionBackwardFilterAlgorithmEx(
-    cudnnHandle_t handle, const cudnnTensorDescriptor_t xDesc, const void *x,
-    const cudnnTensorDescriptor_t dyDesc, const void *y,
-    const cudnnConvolutionDescriptor_t convDesc,
-    const cudnnFilterDescriptor_t dwDesc, void *dw,
-    const int requestedAlgoCount, int *returnedAlgoCount,
-    cudnnConvolutionBwdFilterAlgoPerf_t *perfResults, void *workSpace,
-    size_t workSpaceSizeInBytes) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnConvolutionDescriptor_t, const cudnnFilterDescriptor_t, void *,
-      const int, int *, cudnnConvolutionBwdFilterAlgoPerf_t *, void *, size_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnFindConvolutionBackwardFilterAlgorithmEx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, xDesc, x, dyDesc, y, convDesc, dwDesc, dw,
-                  requestedAlgoCount, returnedAlgoCount, perfResults, workSpace,
-                  workSpaceSizeInBytes);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionBackwardFilterAlgorithm(
-    cudnnHandle_t handle, const cudnnTensorDescriptor_t xDesc,
-    const cudnnTensorDescriptor_t dyDesc,
-    const cudnnConvolutionDescriptor_t convDesc,
-    const cudnnFilterDescriptor_t dwDesc,
-    cudnnConvolutionBwdFilterPreference_t preference, size_t memoryLimitInBytes,
-    cudnnConvolutionBwdFilterAlgo_t *algo) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnTensorDescriptor_t,
-      const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t,
-      const cudnnFilterDescriptor_t, cudnnConvolutionBwdFilterPreference_t,
-      size_t, cudnnConvolutionBwdFilterAlgo_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnGetConvolutionBackwardFilterAlgorithm");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, xDesc, dyDesc, convDesc, dwDesc, preference,
-                  memoryLimitInBytes, algo);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionBackwardFilterAlgorithm_v7(
-    cudnnHandle_t handle, const cudnnTensorDescriptor_t srcDesc,
-    const cudnnTensorDescriptor_t diffDesc,
-    const cudnnConvolutionDescriptor_t convDesc,
-    const cudnnFilterDescriptor_t gradDesc, const int requestedAlgoCount,
-    int *returnedAlgoCount, cudnnConvolutionBwdFilterAlgoPerf_t *perfResults) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnTensorDescriptor_t,
-      const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t,
-      const cudnnFilterDescriptor_t, const int, int *,
-      cudnnConvolutionBwdFilterAlgoPerf_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnGetConvolutionBackwardFilterAlgorithm_v7");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, srcDesc, diffDesc, convDesc, gradDesc,
-                  requestedAlgoCount, returnedAlgoCount, perfResults);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionBackwardFilterWorkspaceSize(
-    cudnnHandle_t handle, const cudnnTensorDescriptor_t xDesc,
-    const cudnnTensorDescriptor_t dyDesc,
-    const cudnnConvolutionDescriptor_t convDesc,
-    const cudnnFilterDescriptor_t gradDesc,
-    cudnnConvolutionBwdFilterAlgo_t algo, size_t *sizeInBytes) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnTensorDescriptor_t,
-      const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t,
-      const cudnnFilterDescriptor_t, cudnnConvolutionBwdFilterAlgo_t, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnGetConvolutionBackwardFilterWorkspaceSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, xDesc, dyDesc, convDesc, gradDesc, algo, sizeInBytes);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnConvolutionBackwardFilter(
-    cudnnHandle_t handle, const void *alpha,
-    const cudnnTensorDescriptor_t xDesc, const void *x,
-    const cudnnTensorDescriptor_t dyDesc, const void *dy,
-    const cudnnConvolutionDescriptor_t convDesc,
-    cudnnConvolutionBwdFilterAlgo_t algo, void *workSpace,
-    size_t workSpaceSizeInBytes, const void *beta,
-    const cudnnFilterDescriptor_t dwDesc, void *dw) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const void *, const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnConvolutionDescriptor_t, cudnnConvolutionBwdFilterAlgo_t,
-      void *, size_t, const void *, const cudnnFilterDescriptor_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnConvolutionBackwardFilter");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, alpha, xDesc, x, dyDesc, dy, convDesc, algo,
-                  workSpace, workSpaceSizeInBytes, beta, dwDesc, dw);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionBackwardDataAlgorithmMaxCount(
-    cudnnHandle_t handle, int *count) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnHandle_t, int *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnGetConvolutionBackwardDataAlgorithmMaxCount");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, count);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnFindConvolutionBackwardDataAlgorithm(
-    cudnnHandle_t handle, const cudnnFilterDescriptor_t wDesc,
-    const cudnnTensorDescriptor_t dyDesc,
-    const cudnnConvolutionDescriptor_t convDesc,
-    const cudnnTensorDescriptor_t dxDesc, const int requestedAlgoCount,
-    int *returnedAlgoCount, cudnnConvolutionBwdDataAlgoPerf_t *perfResults) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnFilterDescriptor_t,
-      const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t,
-      const cudnnTensorDescriptor_t, const int, int *,
-      cudnnConvolutionBwdDataAlgoPerf_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnFindConvolutionBackwardDataAlgorithm");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, wDesc, dyDesc, convDesc, dxDesc, requestedAlgoCount,
-                  returnedAlgoCount, perfResults);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnFindConvolutionBackwardDataAlgorithmEx(
-    cudnnHandle_t handle, const cudnnFilterDescriptor_t wDesc, const void *w,
-    const cudnnTensorDescriptor_t dyDesc, const void *dy,
-    const cudnnConvolutionDescriptor_t convDesc,
-    const cudnnTensorDescriptor_t dxDesc, void *dx,
-    const int requestedAlgoCount, int *returnedAlgoCount,
-    cudnnConvolutionBwdDataAlgoPerf_t *perfResults, void *workSpace,
-    size_t workSpaceSizeInBytes) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnFilterDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t, void *,
-      const int, int *, cudnnConvolutionBwdDataAlgoPerf_t *, void *, size_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnFindConvolutionBackwardDataAlgorithmEx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, wDesc, w, dyDesc, dy, convDesc, dxDesc, dx,
-                  requestedAlgoCount, returnedAlgoCount, perfResults, workSpace,
-                  workSpaceSizeInBytes);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionBackwardDataAlgorithm(
-    cudnnHandle_t handle, const cudnnFilterDescriptor_t wDesc,
-    const cudnnTensorDescriptor_t dyDesc,
-    const cudnnConvolutionDescriptor_t convDesc,
-    const cudnnTensorDescriptor_t dxDesc,
-    cudnnConvolutionBwdDataPreference_t preference, size_t memoryLimitInBytes,
-    cudnnConvolutionBwdDataAlgo_t *algo) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnFilterDescriptor_t,
-      const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t,
-      const cudnnTensorDescriptor_t, cudnnConvolutionBwdDataPreference_t,
-      size_t, cudnnConvolutionBwdDataAlgo_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnGetConvolutionBackwardDataAlgorithm");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, wDesc, dyDesc, convDesc, dxDesc, preference,
-                  memoryLimitInBytes, algo);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionBackwardDataAlgorithm_v7(
-    cudnnHandle_t handle, const cudnnFilterDescriptor_t filterDesc,
-    const cudnnTensorDescriptor_t diffDesc,
-    const cudnnConvolutionDescriptor_t convDesc,
-    const cudnnTensorDescriptor_t gradDesc, const int requestedAlgoCount,
-    int *returnedAlgoCount, cudnnConvolutionBwdDataAlgoPerf_t *perfResults) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnFilterDescriptor_t,
-      const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t,
-      const cudnnTensorDescriptor_t, const int, int *,
-      cudnnConvolutionBwdDataAlgoPerf_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnGetConvolutionBackwardDataAlgorithm_v7");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, filterDesc, diffDesc, convDesc, gradDesc,
-                  requestedAlgoCount, returnedAlgoCount, perfResults);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionBackwardDataWorkspaceSize(
-    cudnnHandle_t handle, const cudnnFilterDescriptor_t wDesc,
-    const cudnnTensorDescriptor_t dyDesc,
-    const cudnnConvolutionDescriptor_t convDesc,
-    const cudnnTensorDescriptor_t dxDesc, cudnnConvolutionBwdDataAlgo_t algo,
-    size_t *sizeInBytes) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnFilterDescriptor_t,
-      const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t,
-      const cudnnTensorDescriptor_t, cudnnConvolutionBwdDataAlgo_t, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnGetConvolutionBackwardDataWorkspaceSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, wDesc, dyDesc, convDesc, dxDesc, algo, sizeInBytes);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnConvolutionBackwardData(
-    cudnnHandle_t handle, const void *alpha,
-    const cudnnFilterDescriptor_t wDesc, const void *w,
-    const cudnnTensorDescriptor_t dyDesc, const void *dy,
-    const cudnnConvolutionDescriptor_t convDesc,
-    cudnnConvolutionBwdDataAlgo_t algo, void *workSpace,
-    size_t workSpaceSizeInBytes, const void *beta,
-    const cudnnTensorDescriptor_t dxDesc, void *dx) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const void *, const cudnnFilterDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnConvolutionDescriptor_t, cudnnConvolutionBwdDataAlgo_t, void *,
-      size_t, const void *, const cudnnTensorDescriptor_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnConvolutionBackwardData");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, alpha, wDesc, w, dyDesc, dy, convDesc, algo,
-                  workSpace, workSpaceSizeInBytes, beta, dxDesc, dx);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnIm2Col(cudnnHandle_t handle, const cudnnTensorDescriptor_t xDesc,
-            const void *x, const cudnnFilterDescriptor_t wDesc,
-            const cudnnConvolutionDescriptor_t convDesc, void *colBuffer) {
-  using FuncPtr =
-      cudnnStatus_t(CUDNNWINAPI *)(cudnnHandle_t, const cudnnTensorDescriptor_t,
-                                   const void *, const cudnnFilterDescriptor_t,
-                                   const cudnnConvolutionDescriptor_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnIm2Col");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, xDesc, x, wDesc, convDesc, colBuffer);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSoftmaxForward(
-    cudnnHandle_t handle, cudnnSoftmaxAlgorithm_t algo, cudnnSoftmaxMode_t mode,
-    const void *alpha, const cudnnTensorDescriptor_t xDesc, const void *x,
-    const void *beta, const cudnnTensorDescriptor_t yDesc, void *y) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, cudnnSoftmaxAlgorithm_t, cudnnSoftmaxMode_t, const void *,
-      const cudnnTensorDescriptor_t, const void *, const void *,
-      const cudnnTensorDescriptor_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSoftmaxForward");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, algo, mode, alpha, xDesc, x, beta, yDesc, y);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSoftmaxBackward(
-    cudnnHandle_t handle, cudnnSoftmaxAlgorithm_t algo, cudnnSoftmaxMode_t mode,
-    const void *alpha, const cudnnTensorDescriptor_t yDesc, const void *y,
-    const cudnnTensorDescriptor_t dyDesc, const void *dy, const void *beta,
-    const cudnnTensorDescriptor_t dxDesc, void *dx) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, cudnnSoftmaxAlgorithm_t, cudnnSoftmaxMode_t, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *, const void *,
-      const cudnnTensorDescriptor_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSoftmaxBackward");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, algo, mode, alpha, yDesc, y, dyDesc, dy, beta, dxDesc,
-                  dx);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnCreatePoolingDescriptor(cudnnPoolingDescriptor_t *poolingDesc) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnPoolingDescriptor_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnCreatePoolingDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(poolingDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetPooling2dDescriptor(
-    cudnnPoolingDescriptor_t poolingDesc, cudnnPoolingMode_t mode,
-    cudnnNanPropagation_t maxpoolingNanOpt, int windowHeight, int windowWidth,
-    int verticalPadding, int horizontalPadding, int verticalStride,
-    int horizontalStride) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnPoolingDescriptor_t, cudnnPoolingMode_t, cudnnNanPropagation_t, int,
-      int, int, int, int, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetPooling2dDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(poolingDesc, mode, maxpoolingNanOpt, windowHeight,
-                  windowWidth, verticalPadding, horizontalPadding,
-                  verticalStride, horizontalStride);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetPooling2dDescriptor(
-    const cudnnPoolingDescriptor_t poolingDesc, cudnnPoolingMode_t *mode,
-    cudnnNanPropagation_t *maxpoolingNanOpt, int *windowHeight,
-    int *windowWidth, int *verticalPadding, int *horizontalPadding,
-    int *verticalStride, int *horizontalStride) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      const cudnnPoolingDescriptor_t, cudnnPoolingMode_t *,
-      cudnnNanPropagation_t *, int *, int *, int *, int *, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetPooling2dDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(poolingDesc, mode, maxpoolingNanOpt, windowHeight,
-                  windowWidth, verticalPadding, horizontalPadding,
-                  verticalStride, horizontalStride);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetPoolingNdDescriptor(
-    cudnnPoolingDescriptor_t poolingDesc, const cudnnPoolingMode_t mode,
-    const cudnnNanPropagation_t maxpoolingNanOpt, int nbDims,
-    const int windowDimA[], const int paddingA[], const int strideA[]) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnPoolingDescriptor_t, const cudnnPoolingMode_t,
-      const cudnnNanPropagation_t, int, const int[], const int[], const int[]);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetPoolingNdDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(poolingDesc, mode, maxpoolingNanOpt, nbDims, windowDimA,
-                  paddingA, strideA);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetPoolingNdDescriptor(
-    const cudnnPoolingDescriptor_t poolingDesc, int nbDimsRequested,
-    cudnnPoolingMode_t *mode, cudnnNanPropagation_t *maxpoolingNanOpt,
-    int *nbDims, int windowDimA[], int paddingA[], int strideA[]) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      const cudnnPoolingDescriptor_t, int, cudnnPoolingMode_t *,
-      cudnnNanPropagation_t *, int *, int[], int[], int[]);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetPoolingNdDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(poolingDesc, nbDimsRequested, mode, maxpoolingNanOpt, nbDims,
-                  windowDimA, paddingA, strideA);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnGetPoolingNdForwardOutputDim(const cudnnPoolingDescriptor_t poolingDesc,
-                                  const cudnnTensorDescriptor_t inputTensorDesc,
-                                  int nbDims, int outputTensorDimA[]) {
-  using FuncPtr =
-      cudnnStatus_t(CUDNNWINAPI *)(const cudnnPoolingDescriptor_t,
-                                   const cudnnTensorDescriptor_t, int, int[]);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnGetPoolingNdForwardOutputDim");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(poolingDesc, inputTensorDesc, nbDims, outputTensorDimA);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnGetPooling2dForwardOutputDim(const cudnnPoolingDescriptor_t poolingDesc,
-                                  const cudnnTensorDescriptor_t inputTensorDesc,
-                                  int *n, int *c, int *h, int *w) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(const cudnnPoolingDescriptor_t,
-                                               const cudnnTensorDescriptor_t,
-                                               int *, int *, int *, int *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnGetPooling2dForwardOutputDim");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(poolingDesc, inputTensorDesc, n, c, h, w);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnDestroyPoolingDescriptor(cudnnPoolingDescriptor_t poolingDesc) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnPoolingDescriptor_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDestroyPoolingDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(poolingDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnPoolingForward(
-    cudnnHandle_t handle, const cudnnPoolingDescriptor_t poolingDesc,
-    const void *alpha, const cudnnTensorDescriptor_t xDesc, const void *x,
-    const void *beta, const cudnnTensorDescriptor_t yDesc, void *y) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnPoolingDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *, const void *,
-      const cudnnTensorDescriptor_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnPoolingForward");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, poolingDesc, alpha, xDesc, x, beta, yDesc, y);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnPoolingBackward(
-    cudnnHandle_t handle, const cudnnPoolingDescriptor_t poolingDesc,
-    const void *alpha, const cudnnTensorDescriptor_t yDesc, const void *y,
-    const cudnnTensorDescriptor_t dyDesc, const void *dy,
-    const cudnnTensorDescriptor_t xDesc, const void *x, const void *beta,
-    const cudnnTensorDescriptor_t dxDesc, void *dx) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnPoolingDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *, const void *,
-      const cudnnTensorDescriptor_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnPoolingBackward");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, poolingDesc, alpha, yDesc, y, dyDesc, dy, xDesc, x,
-                  beta, dxDesc, dx);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnCreateActivationDescriptor(cudnnActivationDescriptor_t *activationDesc) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnActivationDescriptor_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnCreateActivationDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(activationDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetActivationDescriptor(
-    cudnnActivationDescriptor_t activationDesc, cudnnActivationMode_t mode,
-    cudnnNanPropagation_t reluNanOpt, double coef) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnActivationDescriptor_t,
-                                               cudnnActivationMode_t,
-                                               cudnnNanPropagation_t, double);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetActivationDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(activationDesc, mode, reluNanOpt, coef);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnGetActivationDescriptor(const cudnnActivationDescriptor_t activationDesc,
-                             cudnnActivationMode_t *mode,
-                             cudnnNanPropagation_t *reluNanOpt, double *coef) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      const cudnnActivationDescriptor_t, cudnnActivationMode_t *,
-      cudnnNanPropagation_t *, double *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetActivationDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(activationDesc, mode, reluNanOpt, coef);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnDestroyActivationDescriptor(cudnnActivationDescriptor_t activationDesc) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnActivationDescriptor_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnDestroyActivationDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(activationDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnActivationForward(
-    cudnnHandle_t handle, cudnnActivationDescriptor_t activationDesc,
-    const void *alpha, const cudnnTensorDescriptor_t xDesc, const void *x,
-    const void *beta, const cudnnTensorDescriptor_t yDesc, void *y) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, cudnnActivationDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *, const void *,
-      const cudnnTensorDescriptor_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnActivationForward");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, activationDesc, alpha, xDesc, x, beta, yDesc, y);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnActivationBackward(
-    cudnnHandle_t handle, cudnnActivationDescriptor_t activationDesc,
-    const void *alpha, const cudnnTensorDescriptor_t yDesc, const void *y,
-    const cudnnTensorDescriptor_t dyDesc, const void *dy,
-    const cudnnTensorDescriptor_t xDesc, const void *x, const void *beta,
-    const cudnnTensorDescriptor_t dxDesc, void *dx) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, cudnnActivationDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *, const void *,
-      const cudnnTensorDescriptor_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnActivationBackward");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, activationDesc, alpha, yDesc, y, dyDesc, dy, xDesc, x,
-                  beta, dxDesc, dx);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnCreateLRNDescriptor(cudnnLRNDescriptor_t *normDesc) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnLRNDescriptor_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnCreateLRNDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(normDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetLRNDescriptor(cudnnLRNDescriptor_t normDesc,
-                                                unsigned lrnN, double lrnAlpha,
-                                                double lrnBeta, double lrnK) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnLRNDescriptor_t, unsigned int, double, double, double);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetLRNDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(normDesc, lrnN, lrnAlpha, lrnBeta, lrnK);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetLRNDescriptor(cudnnLRNDescriptor_t normDesc,
-                                                unsigned *lrnN,
-                                                double *lrnAlpha,
-                                                double *lrnBeta, double *lrnK) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnLRNDescriptor_t, unsigned int *, double *, double *, double *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetLRNDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(normDesc, lrnN, lrnAlpha, lrnBeta, lrnK);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnDestroyLRNDescriptor(cudnnLRNDescriptor_t lrnDesc) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnLRNDescriptor_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDestroyLRNDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(lrnDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnLRNCrossChannelForward(
-    cudnnHandle_t handle, cudnnLRNDescriptor_t normDesc, cudnnLRNMode_t lrnMode,
-    const void *alpha, const cudnnTensorDescriptor_t xDesc, const void *x,
-    const void *beta, const cudnnTensorDescriptor_t yDesc, void *y) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, cudnnLRNDescriptor_t, cudnnLRNMode_t, const void *,
-      const cudnnTensorDescriptor_t, const void *, const void *,
-      const cudnnTensorDescriptor_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnLRNCrossChannelForward");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, normDesc, lrnMode, alpha, xDesc, x, beta, yDesc, y);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnLRNCrossChannelBackward(
-    cudnnHandle_t handle, cudnnLRNDescriptor_t normDesc, cudnnLRNMode_t lrnMode,
-    const void *alpha, const cudnnTensorDescriptor_t yDesc, const void *y,
-    const cudnnTensorDescriptor_t dyDesc, const void *dy,
-    const cudnnTensorDescriptor_t xDesc, const void *x, const void *beta,
-    const cudnnTensorDescriptor_t dxDesc, void *dx) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, cudnnLRNDescriptor_t, cudnnLRNMode_t, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *, const void *,
-      const cudnnTensorDescriptor_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnLRNCrossChannelBackward");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, normDesc, lrnMode, alpha, yDesc, y, dyDesc, dy, xDesc,
-                  x, beta, dxDesc, dx);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnDivisiveNormalizationForward(
-    cudnnHandle_t handle, cudnnLRNDescriptor_t normDesc,
-    cudnnDivNormMode_t mode, const void *alpha,
-    const cudnnTensorDescriptor_t xDesc, /* same desc for means, temp, temp2 */
-    const void *x,
-    const void *means, /* if NULL, means are assumed to be zero */
-    void *temp, void *temp2, const void *beta,
-    const cudnnTensorDescriptor_t yDesc, void *y) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, cudnnLRNDescriptor_t, cudnnDivNormMode_t, const void *,
-      const cudnnTensorDescriptor_t, const void *, const void *, void *, void *,
-      const void *, const cudnnTensorDescriptor_t, void *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnDivisiveNormalizationForward");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, normDesc, mode, alpha, xDesc, x, means, temp, temp2,
-                  beta, yDesc, y);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnDivisiveNormalizationBackward(
-    cudnnHandle_t handle, cudnnLRNDescriptor_t normDesc,
-    cudnnDivNormMode_t mode, const void *alpha,
-    const cudnnTensorDescriptor_t
-        xDesc, /* same desc for x, means, dy, temp, temp2 */
-    const void *x,
-    const void *means, /* if NULL, means are assumed to be zero */
-    const void *dy, void *temp, void *temp2, const void *beta,
-    const cudnnTensorDescriptor_t dXdMeansDesc, /* same desc for dx, dMeans */
-    void *dx,                                   /* output x differential */
-    void *dMeans) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, cudnnLRNDescriptor_t, cudnnDivNormMode_t, const void *,
-      const cudnnTensorDescriptor_t, const void *, const void *, const void *,
-      void *, void *, const void *, const cudnnTensorDescriptor_t, void *,
-      void *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnDivisiveNormalizationBackward");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, normDesc, mode, alpha, xDesc, x, means, dy, temp,
-                  temp2, beta, dXdMeansDesc, dx, dMeans);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnDeriveBNTensorDescriptor(
-    cudnnTensorDescriptor_t derivedBnDesc, const cudnnTensorDescriptor_t xDesc,
-    cudnnBatchNormMode_t mode) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnTensorDescriptor_t,
-                                               const cudnnTensorDescriptor_t,
-                                               cudnnBatchNormMode_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDeriveBNTensorDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(derivedBnDesc, xDesc, mode);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize(
-    cudnnHandle_t handle, cudnnBatchNormMode_t mode, cudnnBatchNormOps_t bnOps,
-    const cudnnTensorDescriptor_t xDesc, const cudnnTensorDescriptor_t zDesc,
-    const cudnnTensorDescriptor_t yDesc,
-    const cudnnTensorDescriptor_t bnScaleBiasMeanVarDesc,
-    const cudnnActivationDescriptor_t activationDesc, size_t *sizeInBytes) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, cudnnBatchNormMode_t, cudnnBatchNormOps_t,
-      const cudnnTensorDescriptor_t, const cudnnTensorDescriptor_t,
-      const cudnnTensorDescriptor_t, const cudnnTensorDescriptor_t,
-      const cudnnActivationDescriptor_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>(
-      "cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, mode, bnOps, xDesc, zDesc, yDesc,
-                  bnScaleBiasMeanVarDesc, activationDesc, sizeInBytes);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetBatchNormalizationBackwardExWorkspaceSize(
-    cudnnHandle_t handle, cudnnBatchNormMode_t mode, cudnnBatchNormOps_t bnOps,
-    const cudnnTensorDescriptor_t xDesc, const cudnnTensorDescriptor_t yDesc,
-    const cudnnTensorDescriptor_t dyDesc, const cudnnTensorDescriptor_t dzDesc,
-    const cudnnTensorDescriptor_t dxDesc,
-    const cudnnTensorDescriptor_t dBnScaleBiasDesc,
-    const cudnnActivationDescriptor_t activationDesc, size_t *sizeInBytes) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, cudnnBatchNormMode_t, cudnnBatchNormOps_t,
-      const cudnnTensorDescriptor_t, const cudnnTensorDescriptor_t,
-      const cudnnTensorDescriptor_t, const cudnnTensorDescriptor_t,
-      const cudnnTensorDescriptor_t, const cudnnTensorDescriptor_t,
-      const cudnnActivationDescriptor_t, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnGetBatchNormalizationBackwardExWorkspaceSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, mode, bnOps, xDesc, yDesc, dyDesc, dzDesc, dxDesc,
-                  dBnScaleBiasDesc, activationDesc, sizeInBytes);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetBatchNormalizationTrainingExReserveSpaceSize(
-    cudnnHandle_t handle, cudnnBatchNormMode_t mode, cudnnBatchNormOps_t bnOps,
-    const cudnnActivationDescriptor_t activationDesc,
-    const cudnnTensorDescriptor_t xDesc, size_t *sizeInBytes) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, cudnnBatchNormMode_t, cudnnBatchNormOps_t,
-      const cudnnActivationDescriptor_t, const cudnnTensorDescriptor_t,
-      size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>(
-      "cudnnGetBatchNormalizationTrainingExReserveSpaceSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, mode, bnOps, activationDesc, xDesc, sizeInBytes);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnBatchNormalizationForwardTraining(
-    cudnnHandle_t handle, cudnnBatchNormMode_t mode,
-
-    const void *alpha, /* alpha[0] = result blend factor */
-    const void *beta,  /* beta[0] = dest layer blend factor */
-
-    const cudnnTensorDescriptor_t xDesc, const void *x, /* NxCxHxW */
-    const cudnnTensorDescriptor_t yDesc, void *y,       /* NxCxHxW */
-
-    /* Shared desc for the next 6 tensors in the argument list.
-       Data type to be set as follows:
-       type = (typeOf(x) == double) ? double : float
-       Dimensions for this descriptor depend on normalization mode
-       - Spatial Normalization : tensors are expected to have dims 1xCx1x1
-        (normalization is performed across NxHxW)
-       - Per-Activation Normalization : tensors are expected to have dims of
-       1xCxHxW (normalization is performed across N) */
-    const cudnnTensorDescriptor_t bnScaleBiasMeanVarDesc,
-
-    /* 'Gamma' and 'Beta' respectively in Ioffe and Szegedy's paper's notation
-     */
-    const void *bnScale, const void *bnBias,
-
-    /* MUST use factor=1 in the very first call of a complete training cycle.
-       Use a factor=1/(1+n) at N-th call to the function to get
-       Cumulative Moving Average (CMA) behavior
-       CMA[n] = (x[1]+...+x[n])/n
-       Since CMA[n+1] = (n*CMA[n]+x[n+1])/(n+1) =
-       ((n+1)*CMA[n]-CMA[n])/(n+1) + x[n+1]/(n+1) =
-       CMA[n]*(1-1/(n+1)) + x[n+1]*1/(n+1) */
-    double exponentialAverageFactor,
-
-    /* Used in Training phase only.
-       runningMean = newMean*factor + runningMean*(1-factor) */
-    void *resultRunningMean,
-    /* Output in training mode, input in inference. Is the moving average
-       of  variance[x] (factor is applied in the same way as for runningMean) */
-    void *resultRunningVariance,
-
-    /* Has to be >= CUDNN_BN_MIN_EPSILON. Should be the same in forward and
-       backward functions. */
-    double epsilon,
-
-    /* Optionally save intermediate results from the forward pass here
-       - can be reused to speed up backward pass. NULL if unused */
-    void *resultSaveMean, void *resultSaveInvVariance) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, cudnnBatchNormMode_t, const void *, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, void *, const cudnnTensorDescriptor_t,
-      const void *, const void *, double, void *, void *, double, void *,
-      void *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnBatchNormalizationForwardTraining");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(
-      handle, mode, alpha, beta, xDesc, x, yDesc, y, bnScaleBiasMeanVarDesc,
-      bnScale, bnBias, exponentialAverageFactor, resultRunningMean,
-      resultRunningVariance, epsilon, resultSaveMean, resultSaveInvVariance);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnBatchNormalizationForwardTrainingEx(
-    cudnnHandle_t handle, cudnnBatchNormMode_t mode, cudnnBatchNormOps_t bnOps,
-
-    const void *alpha, /* alpha[0] = result blend factor */
-    const void *beta,  /* beta[0] = dest layer blend factor */
-
-    const cudnnTensorDescriptor_t xDesc, const void *xData,
-    const cudnnTensorDescriptor_t zDesc, const void *zData,
-    const cudnnTensorDescriptor_t yDesc, void *yData,
-
-    const cudnnTensorDescriptor_t bnScaleBiasMeanVarDesc, const void *bnScale,
-    const void *bnBias,
-
-    double exponentialAverageFactor, void *resultRunningMean,
-    void *resultRunningVariance,
-
-    /* Has to be >= CUDNN_BN_MIN_EPSILON. Should be the same in forward and
-       backward functions. */
-    double epsilon,
-
-    /* Optionally save intermediate results from the forward pass here
-       - can be reused to speed up backward pass. NULL if unused */
-    void *resultSaveMean, void *resultSaveInvVariance,
-
-    cudnnActivationDescriptor_t activationDesc, void *workspace,
-    size_t workSpaceSizeInBytes, void *reserveSpace,
-    size_t reserveSpaceSizeInBytes) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, cudnnBatchNormMode_t, cudnnBatchNormOps_t, const void *,
-      const void *, const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, void *, const cudnnTensorDescriptor_t,
-      const void *, const void *, double, void *, void *, double, void *,
-      void *, cudnnActivationDescriptor_t, void *, size_t, void *, size_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnBatchNormalizationForwardTrainingEx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, mode, bnOps, alpha, beta, xDesc, xData, zDesc, zData,
-                  yDesc, yData, bnScaleBiasMeanVarDesc, bnScale, bnBias,
-                  exponentialAverageFactor, resultRunningMean,
-                  resultRunningVariance, epsilon, resultSaveMean,
-                  resultSaveInvVariance, activationDesc, workspace,
-                  workSpaceSizeInBytes, reserveSpace, reserveSpaceSizeInBytes);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnBatchNormalizationForwardInference(
-    cudnnHandle_t handle, cudnnBatchNormMode_t mode,
-    const void *alpha, /* alpha[0] = result blend factor */
-    const void *beta,  /* beta[0] = dest layer blend factor */
-    const cudnnTensorDescriptor_t xDesc, const void *x, /* NxCxHxW */
-    const cudnnTensorDescriptor_t yDesc, void *y,       /* NxCxHxW */
-    const cudnnTensorDescriptor_t bnScaleBiasMeanVarDesc, const void *bnScale,
-    const void *bnBias, const void *estimatedMean,
-    const void *estimatedVariance, double epsilon) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, cudnnBatchNormMode_t, const void *, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, void *, const cudnnTensorDescriptor_t,
-      const void *, const void *, const void *, const void *, double);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnBatchNormalizationForwardInference");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, mode, alpha, beta, xDesc, x, yDesc, y,
-                  bnScaleBiasMeanVarDesc, bnScale, bnBias, estimatedMean,
-                  estimatedVariance, epsilon);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnBatchNormalizationBackward(
-    cudnnHandle_t handle, cudnnBatchNormMode_t mode, const void *alphaDataDiff,
-    const void *betaDataDiff, const void *alphaParamDiff,
-    const void *betaParamDiff,
-    const cudnnTensorDescriptor_t xDesc, /* same desc for x, dx, dy */
-    const void *x, const cudnnTensorDescriptor_t dyDesc, const void *dy,
-    const cudnnTensorDescriptor_t dxDesc, void *dx,
-    /* Shared tensor desc for the 4 tensors below */
-    const cudnnTensorDescriptor_t dBnScaleBiasDesc,
-    const void *bnScale, /* bnBias doesn't affect backpropagation */
-    /* scale and bias diff are not backpropagated below this layer */
-    void *dBnScaleResult, void *dBnBiasResult,
-    /* Same epsilon as forward pass */
-    double epsilon,
-
-    /* Optionally cached intermediate results from
-       forward pass */
-    const void *savedMean, const void *savedInvVariance) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, cudnnBatchNormMode_t, const void *, const void *,
-      const void *, const void *, const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, void *, const cudnnTensorDescriptor_t,
-      const void *, void *, void *, double, const void *, const void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnBatchNormalizationBackward");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, mode, alphaDataDiff, betaDataDiff, alphaParamDiff,
-                  betaParamDiff, xDesc, x, dyDesc, dy, dxDesc, dx,
-                  dBnScaleBiasDesc, bnScale, dBnScaleResult, dBnBiasResult,
-                  epsilon, savedMean, savedInvVariance);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnBatchNormalizationBackwardEx(
-    cudnnHandle_t handle, cudnnBatchNormMode_t mode, cudnnBatchNormOps_t bnOps,
-
-    const void *alphaDataDiff, const void *betaDataDiff,
-    const void *alphaParamDiff, const void *betaParamDiff,
-    const cudnnTensorDescriptor_t xDesc, const void *xData,
-    const cudnnTensorDescriptor_t yDesc, const void *yData,
-    const cudnnTensorDescriptor_t dyDesc, const void *dyData,
-    const cudnnTensorDescriptor_t dzDesc, void *dzData,
-    const cudnnTensorDescriptor_t dxDesc, void *dxData,
-
-    /* Shared tensor desc for the 4 tensors below */
-    const cudnnTensorDescriptor_t dBnScaleBiasDesc, const void *bnScaleData,
-    const void *bnBiasData, /* needed if there is activation */
-    void *dBnScaleData, void *dBnBiasData,
-    double epsilon, /* Same epsilon as forward pass */
-
-    /* Optionally cached intermediate results from
-       forward pass */
-    const void *savedMean, const void *savedInvVariance,
-    cudnnActivationDescriptor_t activationDesc, void *workSpace,
-    size_t workSpaceSizeInBytes, void *reserveSpace,
-    size_t reserveSpaceSizeInBytes) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, cudnnBatchNormMode_t, cudnnBatchNormOps_t, const void *,
-      const void *, const void *, const void *, const cudnnTensorDescriptor_t,
-      const void *, const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, void *, const cudnnTensorDescriptor_t,
-      void *, const cudnnTensorDescriptor_t, const void *, const void *, void *,
-      void *, double, const void *, const void *, cudnnActivationDescriptor_t,
-      void *, size_t, void *, size_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnBatchNormalizationBackwardEx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(
-      handle, mode, bnOps, alphaDataDiff, betaDataDiff, alphaParamDiff,
-      betaParamDiff, xDesc, xData, yDesc, yData, dyDesc, dyData, dzDesc, dzData,
-      dxDesc, dxData, dBnScaleBiasDesc, bnScaleData, bnBiasData, dBnScaleData,
-      dBnBiasData, epsilon, savedMean, savedInvVariance, activationDesc,
-      workSpace, workSpaceSizeInBytes, reserveSpace, reserveSpaceSizeInBytes);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnCreateSpatialTransformerDescriptor(
-    cudnnSpatialTransformerDescriptor_t *stDesc) {
-  using FuncPtr =
-      cudnnStatus_t(CUDNNWINAPI *)(cudnnSpatialTransformerDescriptor_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnCreateSpatialTransformerDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(stDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetSpatialTransformerNdDescriptor(
-    cudnnSpatialTransformerDescriptor_t stDesc, cudnnSamplerType_t samplerType,
-    cudnnDataType_t dataType, const int nbDims, const int dimA[]) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnSpatialTransformerDescriptor_t, cudnnSamplerType_t, cudnnDataType_t,
-      const int, const int[]);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnSetSpatialTransformerNdDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(stDesc, samplerType, dataType, nbDims, dimA);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnDestroySpatialTransformerDescriptor(
-    cudnnSpatialTransformerDescriptor_t stDesc) {
-  using FuncPtr =
-      cudnnStatus_t(CUDNNWINAPI *)(cudnnSpatialTransformerDescriptor_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnDestroySpatialTransformerDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(stDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSpatialTfGridGeneratorForward(
-    cudnnHandle_t handle, const cudnnSpatialTransformerDescriptor_t stDesc,
-    const void *theta, void *grid) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnSpatialTransformerDescriptor_t, const void *,
-      void *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnSpatialTfGridGeneratorForward");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, stDesc, theta, grid);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSpatialTfGridGeneratorBackward(
-    cudnnHandle_t handle, const cudnnSpatialTransformerDescriptor_t stDesc,
-    const void *dgrid, void *dtheta) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnSpatialTransformerDescriptor_t, const void *,
-      void *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnSpatialTfGridGeneratorBackward");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, stDesc, dgrid, dtheta);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSpatialTfSamplerForward(
-    cudnnHandle_t handle, cudnnSpatialTransformerDescriptor_t stDesc,
-    const void *alpha, const cudnnTensorDescriptor_t xDesc, const void *x,
-    const void *grid, const void *beta, cudnnTensorDescriptor_t yDesc,
-    void *y) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, cudnnSpatialTransformerDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *, const void *, const void *,
-      cudnnTensorDescriptor_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSpatialTfSamplerForward");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, stDesc, alpha, xDesc, x, grid, beta, yDesc, y);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSpatialTfSamplerBackward(
-    cudnnHandle_t handle, cudnnSpatialTransformerDescriptor_t stDesc,
-    const void *alpha, const cudnnTensorDescriptor_t xDesc, const void *x,
-    const void *beta, const cudnnTensorDescriptor_t dxDesc, void *dx,
-    const void *alphaDgrid, const cudnnTensorDescriptor_t dyDesc,
-    const void *dy, const void *grid, const void *betaDgrid, void *dgrid) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, cudnnSpatialTransformerDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *, const void *,
-      const cudnnTensorDescriptor_t, void *, const void *,
-      const cudnnTensorDescriptor_t, const void *, const void *, const void *,
-      void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSpatialTfSamplerBackward");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, stDesc, alpha, xDesc, x, beta, dxDesc, dx, alphaDgrid,
-                  dyDesc, dy, grid, betaDgrid, dgrid);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnCreateDropoutDescriptor(cudnnDropoutDescriptor_t *dropoutDesc) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnDropoutDescriptor_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnCreateDropoutDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dropoutDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnDestroyDropoutDescriptor(cudnnDropoutDescriptor_t dropoutDesc) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnDropoutDescriptor_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDestroyDropoutDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dropoutDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnDropoutGetStatesSize(cudnnHandle_t handle,
-                                                    size_t *sizeInBytes) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnHandle_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDropoutGetStatesSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, sizeInBytes);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnDropoutGetReserveSpaceSize(
-    cudnnTensorDescriptor_t xdesc, size_t *sizeInBytes) {
-  using FuncPtr =
-      cudnnStatus_t(CUDNNWINAPI *)(cudnnTensorDescriptor_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDropoutGetReserveSpaceSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(xdesc, sizeInBytes);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetDropoutDescriptor(
-    cudnnDropoutDescriptor_t dropoutDesc, cudnnHandle_t handle, float dropout,
-    void *states, size_t stateSizeInBytes, unsigned long long seed) {
-  using FuncPtr =
-      cudnnStatus_t(CUDNNWINAPI *)(cudnnDropoutDescriptor_t, cudnnHandle_t,
-                                   float, void *, size_t, unsigned long long);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetDropoutDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dropoutDesc, handle, dropout, states, stateSizeInBytes, seed);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnRestoreDropoutDescriptor(
-    cudnnDropoutDescriptor_t dropoutDesc, cudnnHandle_t handle, float dropout,
-    void *states, size_t stateSizeInBytes, unsigned long long seed) {
-  using FuncPtr =
-      cudnnStatus_t(CUDNNWINAPI *)(cudnnDropoutDescriptor_t, cudnnHandle_t,
-                                   float, void *, size_t, unsigned long long);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnRestoreDropoutDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dropoutDesc, handle, dropout, states, stateSizeInBytes, seed);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetDropoutDescriptor(
-    cudnnDropoutDescriptor_t dropoutDesc, cudnnHandle_t handle, float *dropout,
-    void **states, unsigned long long *seed) {
-  using FuncPtr =
-      cudnnStatus_t(CUDNNWINAPI *)(cudnnDropoutDescriptor_t, cudnnHandle_t,
-                                   float *, void **, unsigned long long *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetDropoutDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dropoutDesc, handle, dropout, states, seed);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnDropoutForward(
-    cudnnHandle_t handle, const cudnnDropoutDescriptor_t dropoutDesc,
-    const cudnnTensorDescriptor_t xdesc, const void *x,
-    const cudnnTensorDescriptor_t ydesc, void *y, void *reserveSpace,
-    size_t reserveSpaceSizeInBytes) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnDropoutDescriptor_t,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, void *, void *, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDropoutForward");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dropoutDesc, xdesc, x, ydesc, y, reserveSpace,
-                  reserveSpaceSizeInBytes);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnDropoutBackward(
-    cudnnHandle_t handle, const cudnnDropoutDescriptor_t dropoutDesc,
-    const cudnnTensorDescriptor_t dydesc, const void *dy,
-    const cudnnTensorDescriptor_t dxdesc, void *dx, void *reserveSpace,
-    size_t reserveSpaceSizeInBytes) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnDropoutDescriptor_t,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, void *, void *, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDropoutBackward");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dropoutDesc, dydesc, dy, dxdesc, dx, reserveSpace,
-                  reserveSpaceSizeInBytes);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnCreateRNNDescriptor(cudnnRNNDescriptor_t *rnnDesc) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnRNNDescriptor_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnCreateRNNDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(rnnDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnDestroyRNNDescriptor(cudnnRNNDescriptor_t rnnDesc) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnRNNDescriptor_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDestroyRNNDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(rnnDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetRNNForwardInferenceAlgorithmMaxCount(
-    cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc, int *count) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnRNNDescriptor_t, int *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnGetRNNForwardInferenceAlgorithmMaxCount");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, rnnDesc, count);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnFindRNNForwardInferenceAlgorithmEx(
-    cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc,
-    const int seqLength, const cudnnTensorDescriptor_t *xDesc, const void *x,
-    const cudnnTensorDescriptor_t hxDesc, const void *hx,
-    const cudnnTensorDescriptor_t cxDesc, const void *cx,
-    const cudnnFilterDescriptor_t wDesc, const void *w,
-    const cudnnTensorDescriptor_t *yDesc, void *y,
-    const cudnnTensorDescriptor_t hyDesc, void *hy,
-    const cudnnTensorDescriptor_t cyDesc, void *cy, const float findIntensity,
-    const int requestedAlgoCount, int *returnedAlgoCount,
-    cudnnAlgorithmPerformance_t *perfResults, void *workspace,
-    size_t workSpaceSizeInBytes) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnRNNDescriptor_t, const int,
-      const cudnnTensorDescriptor_t *, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnFilterDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t *, void *, const cudnnTensorDescriptor_t,
-      void *, const cudnnTensorDescriptor_t, void *, const float, const int,
-      int *, cudnnAlgorithmPerformance_t *, void *, size_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnFindRNNForwardInferenceAlgorithmEx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, rnnDesc, seqLength, xDesc, x, hxDesc, hx, cxDesc, cx,
-                  wDesc, w, yDesc, y, hyDesc, hy, cyDesc, cy, findIntensity,
-                  requestedAlgoCount, returnedAlgoCount, perfResults, workspace,
-                  workSpaceSizeInBytes);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetRNNForwardTrainingAlgorithmMaxCount(
-    cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc, int *count) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnRNNDescriptor_t, int *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnGetRNNForwardTrainingAlgorithmMaxCount");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, rnnDesc, count);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnFindRNNForwardTrainingAlgorithmEx(
-    cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc,
-    const int seqLength, const cudnnTensorDescriptor_t *xDesc, const void *x,
-    const cudnnTensorDescriptor_t hxDesc, const void *hx,
-    const cudnnTensorDescriptor_t cxDesc, const void *cx,
-    const cudnnFilterDescriptor_t wDesc, const void *w,
-    const cudnnTensorDescriptor_t *yDesc, void *y,
-    const cudnnTensorDescriptor_t hyDesc, void *hy,
-    const cudnnTensorDescriptor_t cyDesc, void *cy, const float findIntensity,
-    const int requestedAlgoCount, int *returnedAlgoCount,
-    cudnnAlgorithmPerformance_t *perfResults, void *workspace,
-    size_t workSpaceSizeInBytes, void *reserveSpace,
-    size_t reserveSpaceSizeInBytes) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnRNNDescriptor_t, const int,
-      const cudnnTensorDescriptor_t *, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnFilterDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t *, void *, const cudnnTensorDescriptor_t,
-      void *, const cudnnTensorDescriptor_t, void *, const float, const int,
-      int *, cudnnAlgorithmPerformance_t *, void *, size_t, void *, size_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnFindRNNForwardTrainingAlgorithmEx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, rnnDesc, seqLength, xDesc, x, hxDesc, hx, cxDesc, cx,
-                  wDesc, w, yDesc, y, hyDesc, hy, cyDesc, cy, findIntensity,
-                  requestedAlgoCount, returnedAlgoCount, perfResults, workspace,
-                  workSpaceSizeInBytes, reserveSpace, reserveSpaceSizeInBytes);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetRNNBackwardDataAlgorithmMaxCount(
-    cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc, int *count) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnRNNDescriptor_t, int *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnGetRNNBackwardDataAlgorithmMaxCount");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, rnnDesc, count);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnFindRNNBackwardDataAlgorithmEx(
-    cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc,
-    const int seqLength, const cudnnTensorDescriptor_t *yDesc, const void *y,
-    const cudnnTensorDescriptor_t *dyDesc, const void *dy,
-    const cudnnTensorDescriptor_t dhyDesc, const void *dhy,
-    const cudnnTensorDescriptor_t dcyDesc, const void *dcy,
-    const cudnnFilterDescriptor_t wDesc, const void *w,
-    const cudnnTensorDescriptor_t hxDesc, const void *hx,
-    const cudnnTensorDescriptor_t cxDesc, const void *cx,
-    const cudnnTensorDescriptor_t *dxDesc, void *dx,
-    const cudnnTensorDescriptor_t dhxDesc, void *dhx,
-    const cudnnTensorDescriptor_t dcxDesc, void *dcx, const float findIntensity,
-    const int requestedAlgoCount, int *returnedAlgoCount,
-    cudnnAlgorithmPerformance_t *perfResults, void *workspace,
-    size_t workSpaceSizeInBytes, void *reserveSpace,
-    size_t reserveSpaceSizeInBytes) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnRNNDescriptor_t, const int,
-      const cudnnTensorDescriptor_t *, const void *,
-      const cudnnTensorDescriptor_t *, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnFilterDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t *, void *, const cudnnTensorDescriptor_t,
-      void *, const cudnnTensorDescriptor_t, void *, const float, const int,
-      int *, cudnnAlgorithmPerformance_t *, void *, size_t, void *, size_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnFindRNNBackwardDataAlgorithmEx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, rnnDesc, seqLength, yDesc, y, dyDesc, dy, dhyDesc,
-                  dhy, dcyDesc, dcy, wDesc, w, hxDesc, hx, cxDesc, cx, dxDesc,
-                  dx, dhxDesc, dhx, dcxDesc, dcx, findIntensity,
-                  requestedAlgoCount, returnedAlgoCount, perfResults, workspace,
-                  workSpaceSizeInBytes, reserveSpace, reserveSpaceSizeInBytes);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetRNNBackwardWeightsAlgorithmMaxCount(
-    cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc, int *count) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnRNNDescriptor_t, int *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnGetRNNBackwardWeightsAlgorithmMaxCount");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, rnnDesc, count);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnFindRNNBackwardWeightsAlgorithmEx(
-    cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc,
-    const int seqLength, const cudnnTensorDescriptor_t *xDesc, const void *x,
-    const cudnnTensorDescriptor_t hxDesc, const void *hx,
-    const cudnnTensorDescriptor_t *yDesc, const void *y,
-    const float findIntensity, const int requestedAlgoCount,
-    int *returnedAlgoCount, cudnnAlgorithmPerformance_t *perfResults,
-    const void *workspace, size_t workSpaceSizeInBytes,
-    const cudnnFilterDescriptor_t dwDesc, void *dw, const void *reserveSpace,
-    size_t reserveSpaceSizeInBytes) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnRNNDescriptor_t, const int,
-      const cudnnTensorDescriptor_t *, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t *, const void *, const float, const int,
-      int *, cudnnAlgorithmPerformance_t *, const void *, size_t,
-      const cudnnFilterDescriptor_t, void *, const void *, size_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnFindRNNBackwardWeightsAlgorithmEx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, rnnDesc, seqLength, xDesc, x, hxDesc, hx, yDesc, y,
-                  findIntensity, requestedAlgoCount, returnedAlgoCount,
-                  perfResults, workspace, workSpaceSizeInBytes, dwDesc, dw,
-                  reserveSpace, reserveSpaceSizeInBytes);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnCreatePersistentRNNPlan(
-    cudnnRNNDescriptor_t rnnDesc, const int minibatch,
-    const cudnnDataType_t dataType, cudnnPersistentRNNPlan_t *plan) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnRNNDescriptor_t, const int,
-                                               const cudnnDataType_t,
-                                               cudnnPersistentRNNPlan_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnCreatePersistentRNNPlan");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(rnnDesc, minibatch, dataType, plan);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetPersistentRNNPlan(
-    cudnnRNNDescriptor_t rnnDesc, cudnnPersistentRNNPlan_t plan) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnRNNDescriptor_t,
-                                               cudnnPersistentRNNPlan_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetPersistentRNNPlan");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(rnnDesc, plan);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnDestroyPersistentRNNPlan(cudnnPersistentRNNPlan_t plan) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnPersistentRNNPlan_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDestroyPersistentRNNPlan");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(plan);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetRNNDescriptor(
-    cudnnHandle_t handle, cudnnRNNDescriptor_t rnnDesc, const int hiddenSize,
-    const int numLayers,
-    cudnnDropoutDescriptor_t
-        dropoutDesc, /* Between layers, not between recurrent steps. */
-    cudnnRNNInputMode_t inputMode, cudnnDirectionMode_t direction,
-    cudnnRNNMode_t mode, cudnnRNNAlgo_t algo, cudnnDataType_t dataType) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, cudnnRNNDescriptor_t, const int, const int,
-      cudnnDropoutDescriptor_t, cudnnRNNInputMode_t, cudnnDirectionMode_t,
-      cudnnRNNMode_t, cudnnRNNAlgo_t, cudnnDataType_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetRNNDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, rnnDesc, hiddenSize, numLayers, dropoutDesc,
-                  inputMode, direction, mode, algo, dataType);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnSetRNNProjectionLayers(cudnnHandle_t handle, cudnnRNNDescriptor_t rnnDesc,
-                            const int recProjSize, const int outProjSize) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, cudnnRNNDescriptor_t, const int, const int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetRNNProjectionLayers");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, rnnDesc, recProjSize, outProjSize);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetRNNProjectionLayers(
-    cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc, int *recProjSize,
-    int *outProjSize) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnRNNDescriptor_t, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetRNNProjectionLayers");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, rnnDesc, recProjSize, outProjSize);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetRNNAlgorithmDescriptor(
-    cudnnHandle_t handle, cudnnRNNDescriptor_t rnnDesc,
-    cudnnAlgorithmDescriptor_t algoDesc) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, cudnnRNNDescriptor_t, cudnnAlgorithmDescriptor_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetRNNAlgorithmDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, rnnDesc, algoDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetRNNDescriptor(
-    cudnnHandle_t handle, cudnnRNNDescriptor_t rnnDesc, int *hiddenSize,
-    int *numLayers, cudnnDropoutDescriptor_t *dropoutDesc,
-    cudnnRNNInputMode_t *inputMode, cudnnDirectionMode_t *direction,
-    cudnnRNNMode_t *mode, cudnnRNNAlgo_t *algo, cudnnDataType_t *dataType) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, cudnnRNNDescriptor_t, int *, int *,
-      cudnnDropoutDescriptor_t *, cudnnRNNInputMode_t *, cudnnDirectionMode_t *,
-      cudnnRNNMode_t *, cudnnRNNAlgo_t *, cudnnDataType_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetRNNDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, rnnDesc, hiddenSize, numLayers, dropoutDesc,
-                  inputMode, direction, mode, algo, dataType);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnSetRNNMatrixMathType(cudnnRNNDescriptor_t rnnDesc, cudnnMathType_t mType) {
-  using FuncPtr =
-      cudnnStatus_t(CUDNNWINAPI *)(cudnnRNNDescriptor_t, cudnnMathType_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetRNNMatrixMathType");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(rnnDesc, mType);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetRNNMatrixMathType(
-    cudnnRNNDescriptor_t rnnDesc, cudnnMathType_t *mType) {
-  using FuncPtr =
-      cudnnStatus_t(CUDNNWINAPI *)(cudnnRNNDescriptor_t, cudnnMathType_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetRNNMatrixMathType");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(rnnDesc, mType);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetRNNWorkspaceSize(
-    cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc,
-    const int seqLength, const cudnnTensorDescriptor_t *xDesc,
-    size_t *sizeInBytes) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnRNNDescriptor_t, const int,
-      const cudnnTensorDescriptor_t *, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetRNNWorkspaceSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, rnnDesc, seqLength, xDesc, sizeInBytes);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetRNNTrainingReserveSize(
-    cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc,
-    const int seqLength, const cudnnTensorDescriptor_t *xDesc,
-    size_t *sizeInBytes) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnRNNDescriptor_t, const int,
-      const cudnnTensorDescriptor_t *, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetRNNTrainingReserveSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, rnnDesc, seqLength, xDesc, sizeInBytes);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnGetRNNParamsSize(cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc,
-                      const cudnnTensorDescriptor_t xDesc, size_t *sizeInBytes,
-                      cudnnDataType_t dataType) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnRNNDescriptor_t, const cudnnTensorDescriptor_t,
-      size_t *, cudnnDataType_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetRNNParamsSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, rnnDesc, xDesc, sizeInBytes, dataType);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetRNNLinLayerMatrixParams(
-    cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc,
-    const int pseudoLayer, const cudnnTensorDescriptor_t xDesc,
-    const cudnnFilterDescriptor_t wDesc, const void *w, const int linLayerID,
-    cudnnFilterDescriptor_t linLayerMatDesc, void **linLayerMat) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnRNNDescriptor_t, const int,
-      const cudnnTensorDescriptor_t, const cudnnFilterDescriptor_t,
-      const void *, const int, cudnnFilterDescriptor_t, void **);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetRNNLinLayerMatrixParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, rnnDesc, pseudoLayer, xDesc, wDesc, w, linLayerID,
-                  linLayerMatDesc, linLayerMat);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetRNNLinLayerBiasParams(
-    cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc,
-    const int pseudoLayer, const cudnnTensorDescriptor_t xDesc,
-    const cudnnFilterDescriptor_t wDesc, const void *w, const int linLayerID,
-    cudnnFilterDescriptor_t linLayerBiasDesc, void **linLayerBias) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnRNNDescriptor_t, const int,
-      const cudnnTensorDescriptor_t, const cudnnFilterDescriptor_t,
-      const void *, const int, cudnnFilterDescriptor_t, void **);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetRNNLinLayerBiasParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, rnnDesc, pseudoLayer, xDesc, wDesc, w, linLayerID,
-                  linLayerBiasDesc, linLayerBias);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnRNNForwardInference(
-    cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc,
-    const int seqLength, const cudnnTensorDescriptor_t *xDesc, const void *x,
-    const cudnnTensorDescriptor_t hxDesc, const void *hx,
-    const cudnnTensorDescriptor_t cxDesc, const void *cx,
-    const cudnnFilterDescriptor_t wDesc, const void *w,
-    const cudnnTensorDescriptor_t *yDesc, void *y,
-    const cudnnTensorDescriptor_t hyDesc, void *hy,
-    const cudnnTensorDescriptor_t cyDesc, void *cy, void *workspace,
-    size_t workSpaceSizeInBytes) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnRNNDescriptor_t, const int,
-      const cudnnTensorDescriptor_t *, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnFilterDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t *, void *, const cudnnTensorDescriptor_t,
-      void *, const cudnnTensorDescriptor_t, void *, void *, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnRNNForwardInference");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, rnnDesc, seqLength, xDesc, x, hxDesc, hx, cxDesc, cx,
-                  wDesc, w, yDesc, y, hyDesc, hy, cyDesc, cy, workspace,
-                  workSpaceSizeInBytes);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnRNNForwardTraining(
-    cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc,
-    const int seqLength, const cudnnTensorDescriptor_t *xDesc, const void *x,
-    const cudnnTensorDescriptor_t hxDesc, const void *hx,
-    const cudnnTensorDescriptor_t cxDesc, const void *cx,
-    const cudnnFilterDescriptor_t wDesc, const void *w,
-    const cudnnTensorDescriptor_t *yDesc, void *y,
-    const cudnnTensorDescriptor_t hyDesc, void *hy,
-    const cudnnTensorDescriptor_t cyDesc, void *cy, void *workspace,
-    size_t workSpaceSizeInBytes, void *reserveSpace,
-    size_t reserveSpaceSizeInBytes) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnRNNDescriptor_t, const int,
-      const cudnnTensorDescriptor_t *, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnFilterDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t *, void *, const cudnnTensorDescriptor_t,
-      void *, const cudnnTensorDescriptor_t, void *, void *, size_t, void *,
-      size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnRNNForwardTraining");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, rnnDesc, seqLength, xDesc, x, hxDesc, hx, cxDesc, cx,
-                  wDesc, w, yDesc, y, hyDesc, hy, cyDesc, cy, workspace,
-                  workSpaceSizeInBytes, reserveSpace, reserveSpaceSizeInBytes);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnRNNBackwardData(cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc,
-                     const int seqLength, const cudnnTensorDescriptor_t *yDesc,
-                     const void *y, const cudnnTensorDescriptor_t *dyDesc,
-                     const void *dy, const cudnnTensorDescriptor_t dhyDesc,
-                     const void *dhy, const cudnnTensorDescriptor_t dcyDesc,
-                     const void *dcy, const cudnnFilterDescriptor_t wDesc,
-                     const void *w, const cudnnTensorDescriptor_t hxDesc,
-                     const void *hx, const cudnnTensorDescriptor_t cxDesc,
-                     const void *cx, const cudnnTensorDescriptor_t *dxDesc,
-                     void *dx, const cudnnTensorDescriptor_t dhxDesc, void *dhx,
-                     const cudnnTensorDescriptor_t dcxDesc, void *dcx,
-                     void *workspace, size_t workSpaceSizeInBytes,
-                     void *reserveSpace, size_t reserveSpaceSizeInBytes) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnRNNDescriptor_t, const int,
-      const cudnnTensorDescriptor_t *, const void *,
-      const cudnnTensorDescriptor_t *, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnFilterDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t *, void *, const cudnnTensorDescriptor_t,
-      void *, const cudnnTensorDescriptor_t, void *, void *, size_t, void *,
-      size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnRNNBackwardData");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, rnnDesc, seqLength, yDesc, y, dyDesc, dy, dhyDesc,
-                  dhy, dcyDesc, dcy, wDesc, w, hxDesc, hx, cxDesc, cx, dxDesc,
-                  dx, dhxDesc, dhx, dcxDesc, dcx, workspace,
-                  workSpaceSizeInBytes, reserveSpace, reserveSpaceSizeInBytes);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnRNNBackwardWeights(
-    cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc,
-    const int seqLength, const cudnnTensorDescriptor_t *xDesc, const void *x,
-    const cudnnTensorDescriptor_t hxDesc, const void *hx,
-    const cudnnTensorDescriptor_t *yDesc, const void *y, const void *workspace,
-    size_t workSpaceSizeInBytes, const cudnnFilterDescriptor_t dwDesc, void *dw,
-    const void *reserveSpace, size_t reserveSpaceSizeInBytes) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnRNNDescriptor_t, const int,
-      const cudnnTensorDescriptor_t *, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t *, const void *, const void *, size_t,
-      const cudnnFilterDescriptor_t, void *, const void *, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnRNNBackwardWeights");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, rnnDesc, seqLength, xDesc, x, hxDesc, hx, yDesc, y,
-                  workspace, workSpaceSizeInBytes, dwDesc, dw, reserveSpace,
-                  reserveSpaceSizeInBytes);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnCreateCTCLossDescriptor(cudnnCTCLossDescriptor_t *ctcLossDesc) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnCTCLossDescriptor_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnCreateCTCLossDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(ctcLossDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetCTCLossDescriptor(
-    cudnnCTCLossDescriptor_t ctcLossDesc, cudnnDataType_t compType) {
-  using FuncPtr =
-      cudnnStatus_t(CUDNNWINAPI *)(cudnnCTCLossDescriptor_t, cudnnDataType_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetCTCLossDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(ctcLossDesc, compType);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetCTCLossDescriptor(
-    cudnnCTCLossDescriptor_t ctcLossDesc, cudnnDataType_t *compType) {
-  using FuncPtr =
-      cudnnStatus_t(CUDNNWINAPI *)(cudnnCTCLossDescriptor_t, cudnnDataType_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetCTCLossDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(ctcLossDesc, compType);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnDestroyCTCLossDescriptor(cudnnCTCLossDescriptor_t ctcLossDesc) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnCTCLossDescriptor_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDestroyCTCLossDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(ctcLossDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnCTCLoss(
-    cudnnHandle_t handle,
-    const cudnnTensorDescriptor_t
-        probsDesc, /* Tensor descriptor for probabilities, the dimensions are
-                      T,N,A (T is the timing steps, N is the
-                      mini batch size, A is the alphabet size)  */
-    const void *probs,       /* probabilities after softmax, in GPU memory */
-    const int *labels,       /* labels, in CPU memory */
-    const int *labelLengths, /* the length of each label, in CPU memory */
-    const int *inputLengths, /* the lengths of timing steps in each batch, in
-                                CPU memory */
-    void *costs,             /* the returned costs of CTC, in GPU memory */
-    const cudnnTensorDescriptor_t
-        gradientsDesc, /* Tensor descriptor for gradients, the dimensions are
-                          T,N,A */
-    const void *gradients,   /* the returned CTC gradients, in GPU memory, to
-                                compute costs only, set it to NULL */
-    cudnnCTCLossAlgo_t algo, /* algorithm selected, supported now 0 and 1 */
-    cudnnCTCLossDescriptor_t ctcLossDesc,
-    void *workspace, /* pointer to the workspace, in GPU memory */
-    size_t workSpaceSizeInBytes) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnTensorDescriptor_t, const void *, const int *,
-      const int *, const int *, void *, const cudnnTensorDescriptor_t,
-      const void *, cudnnCTCLossAlgo_t, cudnnCTCLossDescriptor_t, void *,
-      size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnCTCLoss");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, probsDesc, probs, labels, labelLengths, inputLengths,
-                  costs, gradientsDesc, gradients, algo, ctcLossDesc, workspace,
-                  workSpaceSizeInBytes);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetCTCLossWorkspaceSize(
-    cudnnHandle_t handle,
-    const cudnnTensorDescriptor_t
-        probsDesc, /* Tensor descriptor for probabilities, the dimensions are
-                      T,N,A (T is the
-                      timing steps, N is the mini batch size, A is the alphabet
-                      size) */
-    const cudnnTensorDescriptor_t
-        gradientsDesc,       /* Tensor descriptor for gradients, the
-                                dimensions are T,N,A. To compute costs
-                                only, set it to NULL */
-    const int *labels,       /* labels, in CPU memory */
-    const int *labelLengths, /* the length of each label, in CPU memory */
-    const int *inputLengths, /* the lengths of timing steps in each batch, in
-                                CPU memory */
-    cudnnCTCLossAlgo_t algo, /* algorithm selected, supported now 0 and 1 */
-    cudnnCTCLossDescriptor_t ctcLossDesc, size_t *sizeInBytes) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnTensorDescriptor_t,
-      const cudnnTensorDescriptor_t, const int *, const int *, const int *,
-      cudnnCTCLossAlgo_t, cudnnCTCLossDescriptor_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetCTCLossWorkspaceSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, probsDesc, gradientsDesc, labels, labelLengths,
-                  inputLengths, algo, ctcLossDesc, sizeInBytes);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnCreateAlgorithmDescriptor(cudnnAlgorithmDescriptor_t *algoDesc) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnAlgorithmDescriptor_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnCreateAlgorithmDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(algoDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetAlgorithmDescriptor(
-    cudnnAlgorithmDescriptor_t algoDesc, cudnnAlgorithm_t algorithm) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnAlgorithmDescriptor_t,
-                                               cudnnAlgorithm_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetAlgorithmDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(algoDesc, algorithm);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetAlgorithmDescriptor(
-    const cudnnAlgorithmDescriptor_t algoDesc, cudnnAlgorithm_t *algorithm) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(const cudnnAlgorithmDescriptor_t,
-                                               cudnnAlgorithm_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetAlgorithmDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(algoDesc, algorithm);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnCopyAlgorithmDescriptor(
-    const cudnnAlgorithmDescriptor_t src, cudnnAlgorithmDescriptor_t dest) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(const cudnnAlgorithmDescriptor_t,
-                                               cudnnAlgorithmDescriptor_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnCopyAlgorithmDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(src, dest);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnDestroyAlgorithmDescriptor(cudnnAlgorithmDescriptor_t algoDesc) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnAlgorithmDescriptor_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDestroyAlgorithmDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(algoDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnCreateAlgorithmPerformance(
-    cudnnAlgorithmPerformance_t *algoPerf, int numberToCreate) {
-  using FuncPtr =
-      cudnnStatus_t(CUDNNWINAPI *)(cudnnAlgorithmPerformance_t *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnCreateAlgorithmPerformance");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(algoPerf, numberToCreate);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetAlgorithmPerformance(
-    cudnnAlgorithmPerformance_t algoPerf, cudnnAlgorithmDescriptor_t algoDesc,
-    cudnnStatus_t status, float time, size_t memory) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnAlgorithmPerformance_t,
-                                               cudnnAlgorithmDescriptor_t,
-                                               cudnnStatus_t, float, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetAlgorithmPerformance");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(algoPerf, algoDesc, status, time, memory);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetAlgorithmPerformance(
-    const cudnnAlgorithmPerformance_t algoPerf,
-    cudnnAlgorithmDescriptor_t *algoDesc, cudnnStatus_t *status, float *time,
-    size_t *memory) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      const cudnnAlgorithmPerformance_t, cudnnAlgorithmDescriptor_t *,
-      cudnnStatus_t *, float *, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetAlgorithmPerformance");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(algoPerf, algoDesc, status, time, memory);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnDestroyAlgorithmPerformance(
-    cudnnAlgorithmPerformance_t *algoPerf, int numberToDestroy) {
-  using FuncPtr =
-      cudnnStatus_t(CUDNNWINAPI *)(cudnnAlgorithmPerformance_t *, int);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnDestroyAlgorithmPerformance");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(algoPerf, numberToDestroy);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetAlgorithmSpaceSize(
-    cudnnHandle_t handle, cudnnAlgorithmDescriptor_t algoDesc,
-    size_t *algoSpaceSizeInBytes) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, cudnnAlgorithmDescriptor_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetAlgorithmSpaceSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, algoDesc, algoSpaceSizeInBytes);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnSaveAlgorithm(cudnnHandle_t handle, cudnnAlgorithmDescriptor_t algoDesc,
-                   void *algoSpace, size_t algoSpaceSizeInBytes) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, cudnnAlgorithmDescriptor_t, void *, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSaveAlgorithm");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, algoDesc, algoSpace, algoSpaceSizeInBytes);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnRestoreAlgorithm(
-    cudnnHandle_t handle, void *algoSpace, size_t algoSpaceSizeInBytes,
-    cudnnAlgorithmDescriptor_t algoDesc) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnHandle_t, void *, size_t,
-                                               cudnnAlgorithmDescriptor_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnRestoreAlgorithm");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, algoSpace, algoSpaceSizeInBytes, algoDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnRNNSetClip(cudnnHandle_t handle,
-                                          cudnnRNNDescriptor_t rnnDesc,
-                                          cudnnRNNClipMode_t clipMode,
-                                          cudnnNanPropagation_t clipNanOpt,
-                                          double lclip, double rclip) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, cudnnRNNDescriptor_t, cudnnRNNClipMode_t,
-      cudnnNanPropagation_t, double, double);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnRNNSetClip");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, rnnDesc, clipMode, clipNanOpt, lclip, rclip);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnRNNGetClip(cudnnHandle_t handle,
-                                          cudnnRNNDescriptor_t rnnDesc,
-                                          cudnnRNNClipMode_t *clipMode,
-                                          cudnnNanPropagation_t *clipNanOpt,
-                                          double *lclip, double *rclip) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, cudnnRNNDescriptor_t, cudnnRNNClipMode_t *,
-      cudnnNanPropagation_t *, double *, double *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnRNNGetClip");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, rnnDesc, clipMode, clipNanOpt, lclip, rclip);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetCallback(unsigned mask, void *udata,
-                                           cudnnCallback_t fptr) {
-  using FuncPtr =
-      cudnnStatus_t(CUDNNWINAPI *)(unsigned int, void *, cudnnCallback_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetCallback");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(mask, udata, fptr);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetCallback(unsigned *mask, void **udata,
-                                           cudnnCallback_t *fptr) {
-  using FuncPtr =
-      cudnnStatus_t(CUDNNWINAPI *)(unsigned int *, void **, cudnnCallback_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetCallback");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(mask, udata, fptr);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetRNNPaddingMode(
-    cudnnRNNDescriptor_t rnnDesc, cudnnRNNPaddingMode_t paddingMode) {
-  using FuncPtr =
-      cudnnStatus_t(CUDNNWINAPI *)(cudnnRNNDescriptor_t, cudnnRNNPaddingMode_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetRNNPaddingMode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(rnnDesc, paddingMode);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetRNNPaddingMode(
-    cudnnRNNDescriptor_t rnnDesc, cudnnRNNPaddingMode_t *paddingMode) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnRNNDescriptor_t,
-                                               cudnnRNNPaddingMode_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetRNNPaddingMode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(rnnDesc, paddingMode);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnCreateRNNDataDescriptor(cudnnRNNDataDescriptor_t *RNNDataDesc) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnRNNDataDescriptor_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnCreateRNNDataDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(RNNDataDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnDestroyRNNDataDescriptor(cudnnRNNDataDescriptor_t RNNDataDesc) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnRNNDataDescriptor_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDestroyRNNDataDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(RNNDataDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetRNNDataDescriptor(
-    cudnnRNNDataDescriptor_t RNNDataDesc, cudnnDataType_t dataType,
-    cudnnRNNDataLayout_t layout, int maxSeqLength, int batchSize,
-    int vectorSize,
-    const int seqLengthArray[], /* length of each sequence in the batch */
-    void *paddingFill) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnRNNDataDescriptor_t, cudnnDataType_t, cudnnRNNDataLayout_t, int, int,
-      int, const int[], void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetRNNDataDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(RNNDataDesc, dataType, layout, maxSeqLength, batchSize,
-                  vectorSize, seqLengthArray, paddingFill);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetRNNDataDescriptor(
-    cudnnRNNDataDescriptor_t RNNDataDesc, cudnnDataType_t *dataType,
-    cudnnRNNDataLayout_t *layout, int *maxSeqLength, int *batchSize,
-    int *vectorSize, int arrayLengthRequested, int seqLengthArray[],
-    void *paddingFill) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnRNNDataDescriptor_t, cudnnDataType_t *, cudnnRNNDataLayout_t *,
-      int *, int *, int *, int, int[], void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetRNNDataDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(RNNDataDesc, dataType, layout, maxSeqLength, batchSize,
-                  vectorSize, arrayLengthRequested, seqLengthArray,
-                  paddingFill);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnRNNForwardTrainingEx(
-    cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc,
-    const cudnnRNNDataDescriptor_t xDesc, const void *x,
-    const cudnnTensorDescriptor_t hxDesc, const void *hx,
-    const cudnnTensorDescriptor_t cxDesc, const void *cx,
-    const cudnnFilterDescriptor_t wDesc, const void *w,
-    const cudnnRNNDataDescriptor_t yDesc, void *y,
-    const cudnnTensorDescriptor_t hyDesc, void *hy,
-    const cudnnTensorDescriptor_t cyDesc, void *cy,
-    const cudnnRNNDataDescriptor_t kDesc, /* reserved, should pass NULL */
-    const void *keys,                     /* reserved, should pass NULL */
-    const cudnnRNNDataDescriptor_t cDesc, /* reserved, should pass NULL */
-    void *cAttn,                          /* reserved, should pass NULL */
-    const cudnnRNNDataDescriptor_t iDesc, /* reserved, should pass NULL */
-    void *iAttn,                          /* reserved, should pass NULL */
-    const cudnnRNNDataDescriptor_t qDesc, /* reserved, should pass NULL */
-    void *queries,                        /* reserved, should pass NULL */
-    void *workSpace, size_t workSpaceSizeInBytes, void *reserveSpace,
-    size_t reserveSpaceSizeInBytes) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnRNNDescriptor_t, const cudnnRNNDataDescriptor_t,
-      const void *, const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnFilterDescriptor_t, const void *,
-      const cudnnRNNDataDescriptor_t, void *, const cudnnTensorDescriptor_t,
-      void *, const cudnnTensorDescriptor_t, void *,
-      const cudnnRNNDataDescriptor_t, const void *,
-      const cudnnRNNDataDescriptor_t, void *, const cudnnRNNDataDescriptor_t,
-      void *, const cudnnRNNDataDescriptor_t, void *, void *, size_t, void *,
-      size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnRNNForwardTrainingEx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, rnnDesc, xDesc, x, hxDesc, hx, cxDesc, cx, wDesc, w,
-                  yDesc, y, hyDesc, hy, cyDesc, cy, kDesc, keys, cDesc, cAttn,
-                  iDesc, iAttn, qDesc, queries, workSpace, workSpaceSizeInBytes,
-                  reserveSpace, reserveSpaceSizeInBytes);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnRNNForwardInferenceEx(
-    cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc,
-    const cudnnRNNDataDescriptor_t xDesc, const void *x,
-    const cudnnTensorDescriptor_t hxDesc, const void *hx,
-    const cudnnTensorDescriptor_t cxDesc, const void *cx,
-    const cudnnFilterDescriptor_t wDesc, const void *w,
-    const cudnnRNNDataDescriptor_t yDesc, void *y,
-    const cudnnTensorDescriptor_t hyDesc, void *hy,
-    const cudnnTensorDescriptor_t cyDesc, void *cy,
-    const cudnnRNNDataDescriptor_t kDesc, /* reserved, should pass NULL */
-    const void *keys,                     /* reserved, should pass NULL */
-    const cudnnRNNDataDescriptor_t cDesc, /* reserved, should pass NULL */
-    void *cAttn,                          /* reserved, should pass NULL */
-    const cudnnRNNDataDescriptor_t iDesc, /* reserved, should pass NULL */
-    void *iAttn,                          /* reserved, should pass NULL */
-    const cudnnRNNDataDescriptor_t qDesc, /* reserved, should pass NULL */
-    void *queries,                        /* reserved, should pass NULL */
-    void *workSpace, size_t workSpaceSizeInBytes) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnRNNDescriptor_t, const cudnnRNNDataDescriptor_t,
-      const void *, const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnFilterDescriptor_t, const void *,
-      const cudnnRNNDataDescriptor_t, void *, const cudnnTensorDescriptor_t,
-      void *, const cudnnTensorDescriptor_t, void *,
-      const cudnnRNNDataDescriptor_t, const void *,
-      const cudnnRNNDataDescriptor_t, void *, const cudnnRNNDataDescriptor_t,
-      void *, const cudnnRNNDataDescriptor_t, void *, void *, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnRNNForwardInferenceEx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, rnnDesc, xDesc, x, hxDesc, hx, cxDesc, cx, wDesc, w,
-                  yDesc, y, hyDesc, hy, cyDesc, cy, kDesc, keys, cDesc, cAttn,
-                  iDesc, iAttn, qDesc, queries, workSpace,
-                  workSpaceSizeInBytes);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnRNNBackwardDataEx(
-    cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc,
-    const cudnnRNNDataDescriptor_t yDesc, const void *y,
-    const cudnnRNNDataDescriptor_t dyDesc, const void *dy,
-    const cudnnRNNDataDescriptor_t dcDesc, /* reserved, should pass NULL */
-    const void *dcAttn,                    /* reserved, should pass NULL */
-    const cudnnTensorDescriptor_t dhyDesc, const void *dhy,
-    const cudnnTensorDescriptor_t dcyDesc, const void *dcy,
-    const cudnnFilterDescriptor_t wDesc, const void *w,
-    const cudnnTensorDescriptor_t hxDesc, const void *hx,
-    const cudnnTensorDescriptor_t cxDesc, const void *cx,
-    const cudnnRNNDataDescriptor_t dxDesc, void *dx,
-    const cudnnTensorDescriptor_t dhxDesc, void *dhx,
-    const cudnnTensorDescriptor_t dcxDesc, void *dcx,
-    const cudnnRNNDataDescriptor_t dkDesc, /* reserved, should pass NULL */
-    void *dkeys,                           /* reserved, should pass NULL */
-    void *workSpace, size_t workSpaceSizeInBytes, void *reserveSpace,
-    size_t reserveSpaceSizeInBytes) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnRNNDescriptor_t, const cudnnRNNDataDescriptor_t,
-      const void *, const cudnnRNNDataDescriptor_t, const void *,
-      const cudnnRNNDataDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnFilterDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnRNNDataDescriptor_t, void *, const cudnnTensorDescriptor_t,
-      void *, const cudnnTensorDescriptor_t, void *,
-      const cudnnRNNDataDescriptor_t, void *, void *, size_t, void *, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnRNNBackwardDataEx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, rnnDesc, yDesc, y, dyDesc, dy, dcDesc, dcAttn,
-                  dhyDesc, dhy, dcyDesc, dcy, wDesc, w, hxDesc, hx, cxDesc, cx,
-                  dxDesc, dx, dhxDesc, dhx, dcxDesc, dcx, dkDesc, dkeys,
-                  workSpace, workSpaceSizeInBytes, reserveSpace,
-                  reserveSpaceSizeInBytes);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnRNNBackwardWeightsEx(
-    cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc,
-    const cudnnRNNDataDescriptor_t xDesc, const void *x,
-    const cudnnTensorDescriptor_t hxDesc, const void *hx,
-    const cudnnRNNDataDescriptor_t yDesc, const void *y, void *workSpace,
-    size_t workSpaceSizeInBytes, const cudnnFilterDescriptor_t dwDesc, void *dw,
-    void *reserveSpace, size_t reserveSpaceSizeInBytes) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnRNNDescriptor_t, const cudnnRNNDataDescriptor_t,
-      const void *, const cudnnTensorDescriptor_t, const void *,
-      const cudnnRNNDataDescriptor_t, const void *, void *, size_t,
-      const cudnnFilterDescriptor_t, void *, void *, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnRNNBackwardWeightsEx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, rnnDesc, xDesc, x, hxDesc, hx, yDesc, y, workSpace,
-                  workSpaceSizeInBytes, dwDesc, dw, reserveSpace,
-                  reserveSpaceSizeInBytes);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetRNNDescriptor_v6(
-    cudnnHandle_t handle, cudnnRNNDescriptor_t rnnDesc, const int hiddenSize,
-    const int numLayers, cudnnDropoutDescriptor_t dropoutDesc,
-    cudnnRNNInputMode_t inputMode, cudnnDirectionMode_t direction,
-    cudnnRNNMode_t mode, cudnnRNNAlgo_t algo, cudnnDataType_t dataType) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, cudnnRNNDescriptor_t, const int, const int,
-      cudnnDropoutDescriptor_t, cudnnRNNInputMode_t, cudnnDirectionMode_t,
-      cudnnRNNMode_t, cudnnRNNAlgo_t, cudnnDataType_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetRNNDescriptor_v6");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, rnnDesc, hiddenSize, numLayers, dropoutDesc,
-                  inputMode, direction, mode, algo, dataType);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetRNNDescriptor_v5(
-    cudnnRNNDescriptor_t rnnDesc, int hiddenSize, int numLayers,
-    cudnnDropoutDescriptor_t dropoutDesc, cudnnRNNInputMode_t inputMode,
-    cudnnDirectionMode_t direction, cudnnRNNMode_t mode,
-    cudnnDataType_t dataType) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnRNNDescriptor_t, int, int, cudnnDropoutDescriptor_t,
-      cudnnRNNInputMode_t, cudnnDirectionMode_t, cudnnRNNMode_t,
-      cudnnDataType_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetRNNDescriptor_v5");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(rnnDesc, hiddenSize, numLayers, dropoutDesc, inputMode,
-                  direction, mode, dataType);
-}
-
-}  // extern "C"
diff --git a/third_party/xla/third_party/tsl/tsl/cuda/cudnn_7_6.inc b/third_party/xla/third_party/tsl/tsl/cuda/cudnn_7_6.inc
deleted file mode 100644
index 9dd420a9022d57..00000000000000
--- a/third_party/xla/third_party/tsl/tsl/cuda/cudnn_7_6.inc
+++ /dev/null
@@ -1,3257 +0,0 @@
-// Auto-generated, do not edit.
-
-extern "C" {
-
-size_t CUDNNWINAPI cudnnGetVersion(void) {
-  using FuncPtr = size_t(CUDNNWINAPI *)();
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetVersion");
-  if (!func_ptr) return 0;
-  return func_ptr();
-}
-
-size_t CUDNNWINAPI cudnnGetCudartVersion(void) {
-  using FuncPtr = size_t(CUDNNWINAPI *)();
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetCudartVersion");
-  if (!func_ptr) return 0;
-  return func_ptr();
-}
-
-const char *CUDNNWINAPI cudnnGetErrorString(cudnnStatus_t status) {
-  using FuncPtr = const char *(CUDNNWINAPI *)(cudnnStatus_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetErrorString");
-  if (!func_ptr) return "cudnnGetErrorString symbol not found.";
-  return func_ptr(status);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnQueryRuntimeError(cudnnHandle_t handle,
-                                                 cudnnStatus_t *rstatus,
-                                                 cudnnErrQueryMode_t mode,
-                                                 cudnnRuntimeTag_t *tag) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, cudnnStatus_t *, cudnnErrQueryMode_t, cudnnRuntimeTag_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnQueryRuntimeError");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, rstatus, mode, tag);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetProperty(libraryPropertyType type,
-                                           int *value) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(libraryPropertyType, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetProperty");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(type, value);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnCreate(cudnnHandle_t *handle) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnHandle_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnCreate");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnDestroy(cudnnHandle_t handle) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnHandle_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDestroy");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetStream(cudnnHandle_t handle,
-                                         cudaStream_t streamId) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnHandle_t, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetStream");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, streamId);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetStream(cudnnHandle_t handle,
-                                         cudaStream_t *streamId) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnHandle_t, cudaStream_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetStream");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, streamId);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnCreateTensorDescriptor(cudnnTensorDescriptor_t *tensorDesc) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnTensorDescriptor_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnCreateTensorDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(tensorDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetTensor4dDescriptor(
-    cudnnTensorDescriptor_t tensorDesc, cudnnTensorFormat_t format,
-    cudnnDataType_t dataType, /* image data type */
-    int n,                    /* number of inputs (batch size) */
-    int c,                    /* number of input feature maps */
-    int h,                    /* height of input section */
-    int w) {
-  using FuncPtr =
-      cudnnStatus_t(CUDNNWINAPI *)(cudnnTensorDescriptor_t, cudnnTensorFormat_t,
-                                   cudnnDataType_t, int, int, int, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetTensor4dDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(tensorDesc, format, dataType, n, c, h, w);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetTensor4dDescriptorEx(
-    cudnnTensorDescriptor_t tensorDesc,
-    cudnnDataType_t dataType, /* image data type */
-    int n,                    /* number of inputs (batch size) */
-    int c,                    /* number of input feature maps */
-    int h,                    /* height of input section */
-    int w,                    /* width of input section */
-    int nStride, int cStride, int hStride, int wStride) {
-  using FuncPtr =
-      cudnnStatus_t(CUDNNWINAPI *)(cudnnTensorDescriptor_t, cudnnDataType_t,
-                                   int, int, int, int, int, int, int, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetTensor4dDescriptorEx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(tensorDesc, dataType, n, c, h, w, nStride, cStride, hStride,
-                  wStride);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetTensor4dDescriptor(
-    const cudnnTensorDescriptor_t tensorDesc,
-    cudnnDataType_t *dataType, /* image data type */
-    int *n,                    /* number of inputs (batch size) */
-    int *c,                    /* number of input feature maps  */
-    int *h,                    /* height of input section */
-    int *w,                    /* width of input section */
-    int *nStride, int *cStride, int *hStride, int *wStride) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      const cudnnTensorDescriptor_t, cudnnDataType_t *, int *, int *, int *,
-      int *, int *, int *, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetTensor4dDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(tensorDesc, dataType, n, c, h, w, nStride, cStride, hStride,
-                  wStride);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetTensorNdDescriptor(
-    cudnnTensorDescriptor_t tensorDesc, cudnnDataType_t dataType, int nbDims,
-    const int dimA[], const int strideA[]) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnTensorDescriptor_t, cudnnDataType_t, int, const int[], const int[]);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetTensorNdDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(tensorDesc, dataType, nbDims, dimA, strideA);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetTensorNdDescriptorEx(
-    cudnnTensorDescriptor_t tensorDesc, cudnnTensorFormat_t format,
-    cudnnDataType_t dataType, int nbDims, const int dimA[]) {
-  using FuncPtr =
-      cudnnStatus_t(CUDNNWINAPI *)(cudnnTensorDescriptor_t, cudnnTensorFormat_t,
-                                   cudnnDataType_t, int, const int[]);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetTensorNdDescriptorEx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(tensorDesc, format, dataType, nbDims, dimA);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetTensorNdDescriptor(
-    const cudnnTensorDescriptor_t tensorDesc, int nbDimsRequested,
-    cudnnDataType_t *dataType, int *nbDims, int dimA[], int strideA[]) {
-  using FuncPtr =
-      cudnnStatus_t(CUDNNWINAPI *)(const cudnnTensorDescriptor_t, int,
-                                   cudnnDataType_t *, int *, int[], int[]);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetTensorNdDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(tensorDesc, nbDimsRequested, dataType, nbDims, dimA, strideA);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetTensorSizeInBytes(
-    const cudnnTensorDescriptor_t tensorDesc, size_t *size) {
-  using FuncPtr =
-      cudnnStatus_t(CUDNNWINAPI *)(const cudnnTensorDescriptor_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetTensorSizeInBytes");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(tensorDesc, size);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnDestroyTensorDescriptor(cudnnTensorDescriptor_t tensorDesc) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnTensorDescriptor_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDestroyTensorDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(tensorDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnInitTransformDest(
-    const cudnnTensorTransformDescriptor_t transformDesc,
-    const cudnnTensorDescriptor_t srcDesc, cudnnTensorDescriptor_t destDesc,
-    size_t *destSizeInBytes) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      const cudnnTensorTransformDescriptor_t, const cudnnTensorDescriptor_t,
-      cudnnTensorDescriptor_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnInitTransformDest");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(transformDesc, srcDesc, destDesc, destSizeInBytes);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnCreateTensorTransformDescriptor(
-    cudnnTensorTransformDescriptor_t *transformDesc) {
-  using FuncPtr =
-      cudnnStatus_t(CUDNNWINAPI *)(cudnnTensorTransformDescriptor_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnCreateTensorTransformDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(transformDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetTensorTransformDescriptor(
-    cudnnTensorTransformDescriptor_t transformDesc, const uint32_t nbDims,
-    const cudnnTensorFormat_t destFormat, const int32_t padBeforeA[],
-    const int32_t padAfterA[], const uint32_t foldA[],
-    const cudnnFoldingDirection_t direction) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnTensorTransformDescriptor_t, const uint32_t,
-      const cudnnTensorFormat_t, const int32_t[], const int32_t[],
-      const uint32_t[], const cudnnFoldingDirection_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnSetTensorTransformDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(transformDesc, nbDims, destFormat, padBeforeA, padAfterA,
-                  foldA, direction);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetTensorTransformDescriptor(
-    cudnnTensorTransformDescriptor_t transformDesc, uint32_t nbDimsRequested,
-    cudnnTensorFormat_t *destFormat, int32_t padBeforeA[], int32_t padAfterA[],
-    uint32_t foldA[], cudnnFoldingDirection_t *direction) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnTensorTransformDescriptor_t, uint32_t, cudnnTensorFormat_t *,
-      int32_t[], int32_t[], uint32_t[], cudnnFoldingDirection_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnGetTensorTransformDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(transformDesc, nbDimsRequested, destFormat, padBeforeA,
-                  padAfterA, foldA, direction);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnDestroyTensorTransformDescriptor(
-    cudnnTensorTransformDescriptor_t transformDesc) {
-  using FuncPtr =
-      cudnnStatus_t(CUDNNWINAPI *)(cudnnTensorTransformDescriptor_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnDestroyTensorTransformDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(transformDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnTransformTensor(
-    cudnnHandle_t handle, const void *alpha,
-    const cudnnTensorDescriptor_t xDesc, const void *x, const void *beta,
-    const cudnnTensorDescriptor_t yDesc, void *y) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const void *, const cudnnTensorDescriptor_t, const void *,
-      const void *, const cudnnTensorDescriptor_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnTransformTensor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, alpha, xDesc, x, beta, yDesc, y);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnTransformTensorEx(
-    cudnnHandle_t handle, const cudnnTensorTransformDescriptor_t transDesc,
-    const void *alpha, const cudnnTensorDescriptor_t srcDesc,
-    const void *srcData, const void *beta,
-    const cudnnTensorDescriptor_t destDesc, void *destData) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnTensorTransformDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *, const void *,
-      const cudnnTensorDescriptor_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnTransformTensorEx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transDesc, alpha, srcDesc, srcData, beta, destDesc,
-                  destData);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetFoldedConvBackwardDataDescriptors(
-    const cudnnHandle_t handle, const cudnnFilterDescriptor_t filterDesc,
-    const cudnnTensorDescriptor_t diffDesc,
-    const cudnnConvolutionDescriptor_t convDesc,
-    const cudnnTensorDescriptor_t gradDesc,
-    const cudnnTensorFormat_t transformFormat,
-    cudnnFilterDescriptor_t foldedFilterDesc,
-    cudnnTensorDescriptor_t paddedDiffDesc,
-    cudnnConvolutionDescriptor_t foldedConvDesc,
-    cudnnTensorDescriptor_t foldedGradDesc,
-    cudnnTensorTransformDescriptor_t filterFoldTransDesc,
-    cudnnTensorTransformDescriptor_t diffPadTransDesc,
-    cudnnTensorTransformDescriptor_t gradFoldTransDesc,
-    cudnnTensorTransformDescriptor_t gradUnfoldTransDesc) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      const cudnnHandle_t, const cudnnFilterDescriptor_t,
-      const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t,
-      const cudnnTensorDescriptor_t, const cudnnTensorFormat_t,
-      cudnnFilterDescriptor_t, cudnnTensorDescriptor_t,
-      cudnnConvolutionDescriptor_t, cudnnTensorDescriptor_t,
-      cudnnTensorTransformDescriptor_t, cudnnTensorTransformDescriptor_t,
-      cudnnTensorTransformDescriptor_t, cudnnTensorTransformDescriptor_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnGetFoldedConvBackwardDataDescriptors");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, filterDesc, diffDesc, convDesc, gradDesc,
-                  transformFormat, foldedFilterDesc, paddedDiffDesc,
-                  foldedConvDesc, foldedGradDesc, filterFoldTransDesc,
-                  diffPadTransDesc, gradFoldTransDesc, gradUnfoldTransDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnAddTensor(cudnnHandle_t handle,
-                                         const void *alpha,
-                                         const cudnnTensorDescriptor_t aDesc,
-                                         const void *A, const void *beta,
-                                         const cudnnTensorDescriptor_t cDesc,
-                                         void *C) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const void *, const cudnnTensorDescriptor_t, const void *,
-      const void *, const cudnnTensorDescriptor_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnAddTensor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, alpha, aDesc, A, beta, cDesc, C);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnCreateOpTensorDescriptor(cudnnOpTensorDescriptor_t *opTensorDesc) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnOpTensorDescriptor_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnCreateOpTensorDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(opTensorDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetOpTensorDescriptor(
-    cudnnOpTensorDescriptor_t opTensorDesc, cudnnOpTensorOp_t opTensorOp,
-    cudnnDataType_t opTensorCompType, cudnnNanPropagation_t opTensorNanOpt) {
-  using FuncPtr =
-      cudnnStatus_t(CUDNNWINAPI *)(cudnnOpTensorDescriptor_t, cudnnOpTensorOp_t,
-                                   cudnnDataType_t, cudnnNanPropagation_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetOpTensorDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(opTensorDesc, opTensorOp, opTensorCompType, opTensorNanOpt);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetOpTensorDescriptor(
-    const cudnnOpTensorDescriptor_t opTensorDesc, cudnnOpTensorOp_t *opTensorOp,
-    cudnnDataType_t *opTensorCompType, cudnnNanPropagation_t *opTensorNanOpt) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      const cudnnOpTensorDescriptor_t, cudnnOpTensorOp_t *, cudnnDataType_t *,
-      cudnnNanPropagation_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetOpTensorDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(opTensorDesc, opTensorOp, opTensorCompType, opTensorNanOpt);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnDestroyOpTensorDescriptor(cudnnOpTensorDescriptor_t opTensorDesc) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnOpTensorDescriptor_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDestroyOpTensorDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(opTensorDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnOpTensor(
-    cudnnHandle_t handle, const cudnnOpTensorDescriptor_t opTensorDesc,
-    const void *alpha1, const cudnnTensorDescriptor_t aDesc, const void *A,
-    const void *alpha2, const cudnnTensorDescriptor_t bDesc, const void *B,
-    const void *beta, const cudnnTensorDescriptor_t cDesc, void *C) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnOpTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *, const void *,
-      const cudnnTensorDescriptor_t, const void *, const void *,
-      const cudnnTensorDescriptor_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnOpTensor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, opTensorDesc, alpha1, aDesc, A, alpha2, bDesc, B,
-                  beta, cDesc, C);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnCreateReduceTensorDescriptor(
-    cudnnReduceTensorDescriptor_t *reduceTensorDesc) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnReduceTensorDescriptor_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnCreateReduceTensorDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(reduceTensorDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetReduceTensorDescriptor(
-    cudnnReduceTensorDescriptor_t reduceTensorDesc,
-    cudnnReduceTensorOp_t reduceTensorOp, cudnnDataType_t reduceTensorCompType,
-    cudnnNanPropagation_t reduceTensorNanOpt,
-    cudnnReduceTensorIndices_t reduceTensorIndices,
-    cudnnIndicesType_t reduceTensorIndicesType) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnReduceTensorDescriptor_t, cudnnReduceTensorOp_t, cudnnDataType_t,
-      cudnnNanPropagation_t, cudnnReduceTensorIndices_t, cudnnIndicesType_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetReduceTensorDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(reduceTensorDesc, reduceTensorOp, reduceTensorCompType,
-                  reduceTensorNanOpt, reduceTensorIndices,
-                  reduceTensorIndicesType);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetReduceTensorDescriptor(
-    const cudnnReduceTensorDescriptor_t reduceTensorDesc,
-    cudnnReduceTensorOp_t *reduceTensorOp,
-    cudnnDataType_t *reduceTensorCompType,
-    cudnnNanPropagation_t *reduceTensorNanOpt,
-    cudnnReduceTensorIndices_t *reduceTensorIndices,
-    cudnnIndicesType_t *reduceTensorIndicesType) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      const cudnnReduceTensorDescriptor_t, cudnnReduceTensorOp_t *,
-      cudnnDataType_t *, cudnnNanPropagation_t *, cudnnReduceTensorIndices_t *,
-      cudnnIndicesType_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetReduceTensorDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(reduceTensorDesc, reduceTensorOp, reduceTensorCompType,
-                  reduceTensorNanOpt, reduceTensorIndices,
-                  reduceTensorIndicesType);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnDestroyReduceTensorDescriptor(
-    cudnnReduceTensorDescriptor_t reduceTensorDesc) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnReduceTensorDescriptor_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnDestroyReduceTensorDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(reduceTensorDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetReductionIndicesSize(
-    cudnnHandle_t handle, const cudnnReduceTensorDescriptor_t reduceTensorDesc,
-    const cudnnTensorDescriptor_t aDesc, const cudnnTensorDescriptor_t cDesc,
-    size_t *sizeInBytes) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnReduceTensorDescriptor_t,
-      const cudnnTensorDescriptor_t, const cudnnTensorDescriptor_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetReductionIndicesSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, reduceTensorDesc, aDesc, cDesc, sizeInBytes);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetReductionWorkspaceSize(
-    cudnnHandle_t handle, const cudnnReduceTensorDescriptor_t reduceTensorDesc,
-    const cudnnTensorDescriptor_t aDesc, const cudnnTensorDescriptor_t cDesc,
-    size_t *sizeInBytes) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnReduceTensorDescriptor_t,
-      const cudnnTensorDescriptor_t, const cudnnTensorDescriptor_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetReductionWorkspaceSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, reduceTensorDesc, aDesc, cDesc, sizeInBytes);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnReduceTensor(
-    cudnnHandle_t handle, const cudnnReduceTensorDescriptor_t reduceTensorDesc,
-    void *indices, size_t indicesSizeInBytes, void *workspace,
-    size_t workspaceSizeInBytes, const void *alpha,
-    const cudnnTensorDescriptor_t aDesc, const void *A, const void *beta,
-    const cudnnTensorDescriptor_t cDesc, void *C) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnReduceTensorDescriptor_t, void *, size_t,
-      void *, size_t, const void *, const cudnnTensorDescriptor_t, const void *,
-      const void *, const cudnnTensorDescriptor_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnReduceTensor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, reduceTensorDesc, indices, indicesSizeInBytes,
-                  workspace, workspaceSizeInBytes, alpha, aDesc, A, beta, cDesc,
-                  C);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetTensor(cudnnHandle_t handle,
-                                         const cudnnTensorDescriptor_t yDesc,
-                                         void *y, const void *valuePtr) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnTensorDescriptor_t, void *, const void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetTensor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, yDesc, y, valuePtr);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnScaleTensor(cudnnHandle_t handle,
-                                           const cudnnTensorDescriptor_t yDesc,
-                                           void *y, const void *alpha) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnTensorDescriptor_t, void *, const void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnScaleTensor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, yDesc, y, alpha);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnCreateFilterDescriptor(cudnnFilterDescriptor_t *filterDesc) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnFilterDescriptor_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnCreateFilterDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(filterDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetFilter4dDescriptor(
-    cudnnFilterDescriptor_t filterDesc,
-    cudnnDataType_t dataType,          /* image data type */
-    cudnnTensorFormat_t format, int k, /* number of output feature maps */
-    int c,                             /* number of input feature maps */
-    int h,                             /* height of each input filter */
-    int w) {
-  using FuncPtr =
-      cudnnStatus_t(CUDNNWINAPI *)(cudnnFilterDescriptor_t, cudnnDataType_t,
-                                   cudnnTensorFormat_t, int, int, int, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetFilter4dDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(filterDesc, dataType, format, k, c, h, w);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetFilter4dDescriptor(
-    const cudnnFilterDescriptor_t filterDesc,
-    cudnnDataType_t *dataType,           /* image data type */
-    cudnnTensorFormat_t *format, int *k, /* number of output feature maps */
-    int *c,                              /* number of input feature maps */
-    int *h,                              /* height of each input filter */
-    int *w) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      const cudnnFilterDescriptor_t, cudnnDataType_t *, cudnnTensorFormat_t *,
-      int *, int *, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetFilter4dDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(filterDesc, dataType, format, k, c, h, w);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetFilterNdDescriptor(
-    cudnnFilterDescriptor_t filterDesc,
-    cudnnDataType_t dataType, /* image data type */
-    cudnnTensorFormat_t format, int nbDims, const int filterDimA[]) {
-  using FuncPtr =
-      cudnnStatus_t(CUDNNWINAPI *)(cudnnFilterDescriptor_t, cudnnDataType_t,
-                                   cudnnTensorFormat_t, int, const int[]);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetFilterNdDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(filterDesc, dataType, format, nbDims, filterDimA);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetFilterNdDescriptor(
-    const cudnnFilterDescriptor_t filterDesc, int nbDimsRequested,
-    cudnnDataType_t *dataType, /* image data type */
-    cudnnTensorFormat_t *format, int *nbDims, int filterDimA[]) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      const cudnnFilterDescriptor_t, int, cudnnDataType_t *,
-      cudnnTensorFormat_t *, int *, int[]);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetFilterNdDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(filterDesc, nbDimsRequested, dataType, format, nbDims,
-                  filterDimA);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetFilterSizeInBytes(
-    const cudnnFilterDescriptor_t filterDesc, size_t *size) {
-  using FuncPtr =
-      cudnnStatus_t(CUDNNWINAPI *)(const cudnnFilterDescriptor_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetFilterSizeInBytes");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(filterDesc, size);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnTransformFilter(
-    cudnnHandle_t handle, const cudnnTensorTransformDescriptor_t transDesc,
-    const void *alpha, const cudnnFilterDescriptor_t srcDesc,
-    const void *srcData, const void *beta,
-    const cudnnFilterDescriptor_t destDesc, void *destData) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnTensorTransformDescriptor_t, const void *,
-      const cudnnFilterDescriptor_t, const void *, const void *,
-      const cudnnFilterDescriptor_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnTransformFilter");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transDesc, alpha, srcDesc, srcData, beta, destDesc,
-                  destData);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnDestroyFilterDescriptor(cudnnFilterDescriptor_t filterDesc) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnFilterDescriptor_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDestroyFilterDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(filterDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnReorderFilterAndBias(
-    cudnnHandle_t handle, const cudnnFilterDescriptor_t filterDesc,
-    cudnnReorderType_t reorderType, const void *filterData,
-    void *reorderedFilterData, int reorderBias, const void *biasData,
-    void *reorderedBiasData) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnFilterDescriptor_t, cudnnReorderType_t,
-      const void *, void *, int, const void *, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnReorderFilterAndBias");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, filterDesc, reorderType, filterData,
-                  reorderedFilterData, reorderBias, biasData,
-                  reorderedBiasData);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnCreateConvolutionDescriptor(cudnnConvolutionDescriptor_t *convDesc) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnConvolutionDescriptor_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnCreateConvolutionDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(convDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetConvolutionMathType(
-    cudnnConvolutionDescriptor_t convDesc, cudnnMathType_t mathType) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnConvolutionDescriptor_t,
-                                               cudnnMathType_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetConvolutionMathType");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(convDesc, mathType);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionMathType(
-    cudnnConvolutionDescriptor_t convDesc, cudnnMathType_t *mathType) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnConvolutionDescriptor_t,
-                                               cudnnMathType_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetConvolutionMathType");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(convDesc, mathType);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetConvolutionGroupCount(
-    cudnnConvolutionDescriptor_t convDesc, int groupCount) {
-  using FuncPtr =
-      cudnnStatus_t(CUDNNWINAPI *)(cudnnConvolutionDescriptor_t, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetConvolutionGroupCount");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(convDesc, groupCount);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionGroupCount(
-    cudnnConvolutionDescriptor_t convDesc, int *groupCount) {
-  using FuncPtr =
-      cudnnStatus_t(CUDNNWINAPI *)(cudnnConvolutionDescriptor_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetConvolutionGroupCount");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(convDesc, groupCount);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetConvolutionReorderType(
-    cudnnConvolutionDescriptor_t convDesc, cudnnReorderType_t reorderType) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnConvolutionDescriptor_t,
-                                               cudnnReorderType_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetConvolutionReorderType");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(convDesc, reorderType);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionReorderType(
-    cudnnConvolutionDescriptor_t convDesc, cudnnReorderType_t *reorderType) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnConvolutionDescriptor_t,
-                                               cudnnReorderType_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetConvolutionReorderType");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(convDesc, reorderType);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetConvolution2dDescriptor(
-    cudnnConvolutionDescriptor_t convDesc, int pad_h, /* zero-padding height */
-    int pad_w,                                        /* zero-padding width */
-    int u,          /* vertical filter stride */
-    int v,          /* horizontal filter stride */
-    int dilation_h, /* filter dilation in the vertical dimension */
-    int dilation_w, /* filter dilation in the horizontal dimension */
-    cudnnConvolutionMode_t mode, cudnnDataType_t computeType) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnConvolutionDescriptor_t, int, int, int, int, int, int,
-      cudnnConvolutionMode_t, cudnnDataType_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetConvolution2dDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(convDesc, pad_h, pad_w, u, v, dilation_h, dilation_w, mode,
-                  computeType);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetConvolution2dDescriptor(
-    const cudnnConvolutionDescriptor_t convDesc,
-    int *pad_h,      /* zero-padding height */
-    int *pad_w,      /* zero-padding width */
-    int *u,          /* vertical filter stride */
-    int *v,          /* horizontal filter stride */
-    int *dilation_h, /* filter dilation in the vertical dimension */
-    int *dilation_w, /* filter dilation in the horizontal dimension */
-    cudnnConvolutionMode_t *mode, cudnnDataType_t *computeType) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      const cudnnConvolutionDescriptor_t, int *, int *, int *, int *, int *,
-      int *, cudnnConvolutionMode_t *, cudnnDataType_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetConvolution2dDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(convDesc, pad_h, pad_w, u, v, dilation_h, dilation_w, mode,
-                  computeType);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetConvolution2dForwardOutputDim(
-    const cudnnConvolutionDescriptor_t convDesc,
-    const cudnnTensorDescriptor_t inputTensorDesc,
-    const cudnnFilterDescriptor_t filterDesc, int *n, int *c, int *h, int *w) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t,
-      const cudnnFilterDescriptor_t, int *, int *, int *, int *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnGetConvolution2dForwardOutputDim");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(convDesc, inputTensorDesc, filterDesc, n, c, h, w);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetConvolutionNdDescriptor(
-    cudnnConvolutionDescriptor_t convDesc, int arrayLength, /* nbDims-2 size */
-    const int padA[], const int filterStrideA[], const int dilationA[],
-    cudnnConvolutionMode_t mode, cudnnDataType_t computeType) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnConvolutionDescriptor_t, int, const int[], const int[], const int[],
-      cudnnConvolutionMode_t, cudnnDataType_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetConvolutionNdDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(convDesc, arrayLength, padA, filterStrideA, dilationA, mode,
-                  computeType);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionNdDescriptor(
-    const cudnnConvolutionDescriptor_t convDesc, int arrayLengthRequested,
-    int *arrayLength, int padA[], int strideA[], int dilationA[],
-    cudnnConvolutionMode_t *mode, cudnnDataType_t *computeType) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      const cudnnConvolutionDescriptor_t, int, int *, int[], int[], int[],
-      cudnnConvolutionMode_t *, cudnnDataType_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetConvolutionNdDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(convDesc, arrayLengthRequested, arrayLength, padA, strideA,
-                  dilationA, mode, computeType);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionNdForwardOutputDim(
-    const cudnnConvolutionDescriptor_t convDesc,
-    const cudnnTensorDescriptor_t inputTensorDesc,
-    const cudnnFilterDescriptor_t filterDesc, int nbDims,
-    int tensorOutputDimA[]) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t,
-      const cudnnFilterDescriptor_t, int, int[]);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnGetConvolutionNdForwardOutputDim");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(convDesc, inputTensorDesc, filterDesc, nbDims,
-                  tensorOutputDimA);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnDestroyConvolutionDescriptor(cudnnConvolutionDescriptor_t convDesc) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnConvolutionDescriptor_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnDestroyConvolutionDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(convDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnGetConvolutionForwardAlgorithmMaxCount(cudnnHandle_t handle, int *count) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnHandle_t, int *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnGetConvolutionForwardAlgorithmMaxCount");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, count);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnFindConvolutionForwardAlgorithm(
-    cudnnHandle_t handle, const cudnnTensorDescriptor_t xDesc,
-    const cudnnFilterDescriptor_t wDesc,
-    const cudnnConvolutionDescriptor_t convDesc,
-    const cudnnTensorDescriptor_t yDesc, const int requestedAlgoCount,
-    int *returnedAlgoCount, cudnnConvolutionFwdAlgoPerf_t *perfResults) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnTensorDescriptor_t,
-      const cudnnFilterDescriptor_t, const cudnnConvolutionDescriptor_t,
-      const cudnnTensorDescriptor_t, const int, int *,
-      cudnnConvolutionFwdAlgoPerf_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnFindConvolutionForwardAlgorithm");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, xDesc, wDesc, convDesc, yDesc, requestedAlgoCount,
-                  returnedAlgoCount, perfResults);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnFindConvolutionForwardAlgorithmEx(
-    cudnnHandle_t handle, const cudnnTensorDescriptor_t xDesc, const void *x,
-    const cudnnFilterDescriptor_t wDesc, const void *w,
-    const cudnnConvolutionDescriptor_t convDesc,
-    const cudnnTensorDescriptor_t yDesc, void *y, const int requestedAlgoCount,
-    int *returnedAlgoCount, cudnnConvolutionFwdAlgoPerf_t *perfResults,
-    void *workSpace, size_t workSpaceSizeInBytes) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnTensorDescriptor_t, const void *,
-      const cudnnFilterDescriptor_t, const void *,
-      const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t, void *,
-      const int, int *, cudnnConvolutionFwdAlgoPerf_t *, void *, size_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnFindConvolutionForwardAlgorithmEx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, xDesc, x, wDesc, w, convDesc, yDesc, y,
-                  requestedAlgoCount, returnedAlgoCount, perfResults, workSpace,
-                  workSpaceSizeInBytes);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionForwardAlgorithm(
-    cudnnHandle_t handle, const cudnnTensorDescriptor_t xDesc,
-    const cudnnFilterDescriptor_t wDesc,
-    const cudnnConvolutionDescriptor_t convDesc,
-    const cudnnTensorDescriptor_t yDesc,
-    cudnnConvolutionFwdPreference_t preference, size_t memoryLimitInBytes,
-    cudnnConvolutionFwdAlgo_t *algo) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnTensorDescriptor_t,
-      const cudnnFilterDescriptor_t, const cudnnConvolutionDescriptor_t,
-      const cudnnTensorDescriptor_t, cudnnConvolutionFwdPreference_t, size_t,
-      cudnnConvolutionFwdAlgo_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnGetConvolutionForwardAlgorithm");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, xDesc, wDesc, convDesc, yDesc, preference,
-                  memoryLimitInBytes, algo);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionForwardAlgorithm_v7(
-    cudnnHandle_t handle, const cudnnTensorDescriptor_t srcDesc,
-    const cudnnFilterDescriptor_t filterDesc,
-    const cudnnConvolutionDescriptor_t convDesc,
-    const cudnnTensorDescriptor_t destDesc, const int requestedAlgoCount,
-    int *returnedAlgoCount, cudnnConvolutionFwdAlgoPerf_t *perfResults) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnTensorDescriptor_t,
-      const cudnnFilterDescriptor_t, const cudnnConvolutionDescriptor_t,
-      const cudnnTensorDescriptor_t, const int, int *,
-      cudnnConvolutionFwdAlgoPerf_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnGetConvolutionForwardAlgorithm_v7");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, srcDesc, filterDesc, convDesc, destDesc,
-                  requestedAlgoCount, returnedAlgoCount, perfResults);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionForwardWorkspaceSize(
-    cudnnHandle_t handle, const cudnnTensorDescriptor_t xDesc,
-    const cudnnFilterDescriptor_t wDesc,
-    const cudnnConvolutionDescriptor_t convDesc,
-    const cudnnTensorDescriptor_t yDesc, cudnnConvolutionFwdAlgo_t algo,
-    size_t *sizeInBytes) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnTensorDescriptor_t,
-      const cudnnFilterDescriptor_t, const cudnnConvolutionDescriptor_t,
-      const cudnnTensorDescriptor_t, cudnnConvolutionFwdAlgo_t, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnGetConvolutionForwardWorkspaceSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, xDesc, wDesc, convDesc, yDesc, algo, sizeInBytes);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnConvolutionForward(
-    cudnnHandle_t handle, const void *alpha,
-    const cudnnTensorDescriptor_t xDesc, const void *x,
-    const cudnnFilterDescriptor_t wDesc, const void *w,
-    const cudnnConvolutionDescriptor_t convDesc, cudnnConvolutionFwdAlgo_t algo,
-    void *workSpace, size_t workSpaceSizeInBytes, const void *beta,
-    const cudnnTensorDescriptor_t yDesc, void *y) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const void *, const cudnnTensorDescriptor_t, const void *,
-      const cudnnFilterDescriptor_t, const void *,
-      const cudnnConvolutionDescriptor_t, cudnnConvolutionFwdAlgo_t, void *,
-      size_t, const void *, const cudnnTensorDescriptor_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnConvolutionForward");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, alpha, xDesc, x, wDesc, w, convDesc, algo, workSpace,
-                  workSpaceSizeInBytes, beta, yDesc, y);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnConvolutionBiasActivationForward(
-    cudnnHandle_t handle, const void *alpha1,
-    const cudnnTensorDescriptor_t xDesc, const void *x,
-    const cudnnFilterDescriptor_t wDesc, const void *w,
-    const cudnnConvolutionDescriptor_t convDesc, cudnnConvolutionFwdAlgo_t algo,
-    void *workSpace, size_t workSpaceSizeInBytes, const void *alpha2,
-    const cudnnTensorDescriptor_t zDesc, const void *z,
-    const cudnnTensorDescriptor_t biasDesc, const void *bias,
-    const cudnnActivationDescriptor_t activationDesc,
-    const cudnnTensorDescriptor_t yDesc, void *y) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const void *, const cudnnTensorDescriptor_t, const void *,
-      const cudnnFilterDescriptor_t, const void *,
-      const cudnnConvolutionDescriptor_t, cudnnConvolutionFwdAlgo_t, void *,
-      size_t, const void *, const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnActivationDescriptor_t, const cudnnTensorDescriptor_t, void *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnConvolutionBiasActivationForward");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, alpha1, xDesc, x, wDesc, w, convDesc, algo, workSpace,
-                  workSpaceSizeInBytes, alpha2, zDesc, z, biasDesc, bias,
-                  activationDesc, yDesc, y);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnConvolutionBackwardBias(
-    cudnnHandle_t handle, const void *alpha,
-    const cudnnTensorDescriptor_t dyDesc, const void *dy, const void *beta,
-    const cudnnTensorDescriptor_t dbDesc, void *db) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const void *, const cudnnTensorDescriptor_t, const void *,
-      const void *, const cudnnTensorDescriptor_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnConvolutionBackwardBias");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, alpha, dyDesc, dy, beta, dbDesc, db);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionBackwardFilterAlgorithmMaxCount(
-    cudnnHandle_t handle, int *count) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnHandle_t, int *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnGetConvolutionBackwardFilterAlgorithmMaxCount");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, count);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnFindConvolutionBackwardFilterAlgorithm(
-    cudnnHandle_t handle, const cudnnTensorDescriptor_t xDesc,
-    const cudnnTensorDescriptor_t dyDesc,
-    const cudnnConvolutionDescriptor_t convDesc,
-    const cudnnFilterDescriptor_t dwDesc, const int requestedAlgoCount,
-    int *returnedAlgoCount, cudnnConvolutionBwdFilterAlgoPerf_t *perfResults) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnTensorDescriptor_t,
-      const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t,
-      const cudnnFilterDescriptor_t, const int, int *,
-      cudnnConvolutionBwdFilterAlgoPerf_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnFindConvolutionBackwardFilterAlgorithm");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, xDesc, dyDesc, convDesc, dwDesc, requestedAlgoCount,
-                  returnedAlgoCount, perfResults);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnFindConvolutionBackwardFilterAlgorithmEx(
-    cudnnHandle_t handle, const cudnnTensorDescriptor_t xDesc, const void *x,
-    const cudnnTensorDescriptor_t dyDesc, const void *y,
-    const cudnnConvolutionDescriptor_t convDesc,
-    const cudnnFilterDescriptor_t dwDesc, void *dw,
-    const int requestedAlgoCount, int *returnedAlgoCount,
-    cudnnConvolutionBwdFilterAlgoPerf_t *perfResults, void *workSpace,
-    size_t workSpaceSizeInBytes) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnConvolutionDescriptor_t, const cudnnFilterDescriptor_t, void *,
-      const int, int *, cudnnConvolutionBwdFilterAlgoPerf_t *, void *, size_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnFindConvolutionBackwardFilterAlgorithmEx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, xDesc, x, dyDesc, y, convDesc, dwDesc, dw,
-                  requestedAlgoCount, returnedAlgoCount, perfResults, workSpace,
-                  workSpaceSizeInBytes);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionBackwardFilterAlgorithm(
-    cudnnHandle_t handle, const cudnnTensorDescriptor_t xDesc,
-    const cudnnTensorDescriptor_t dyDesc,
-    const cudnnConvolutionDescriptor_t convDesc,
-    const cudnnFilterDescriptor_t dwDesc,
-    cudnnConvolutionBwdFilterPreference_t preference, size_t memoryLimitInBytes,
-    cudnnConvolutionBwdFilterAlgo_t *algo) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnTensorDescriptor_t,
-      const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t,
-      const cudnnFilterDescriptor_t, cudnnConvolutionBwdFilterPreference_t,
-      size_t, cudnnConvolutionBwdFilterAlgo_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnGetConvolutionBackwardFilterAlgorithm");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, xDesc, dyDesc, convDesc, dwDesc, preference,
-                  memoryLimitInBytes, algo);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionBackwardFilterAlgorithm_v7(
-    cudnnHandle_t handle, const cudnnTensorDescriptor_t srcDesc,
-    const cudnnTensorDescriptor_t diffDesc,
-    const cudnnConvolutionDescriptor_t convDesc,
-    const cudnnFilterDescriptor_t gradDesc, const int requestedAlgoCount,
-    int *returnedAlgoCount, cudnnConvolutionBwdFilterAlgoPerf_t *perfResults) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnTensorDescriptor_t,
-      const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t,
-      const cudnnFilterDescriptor_t, const int, int *,
-      cudnnConvolutionBwdFilterAlgoPerf_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnGetConvolutionBackwardFilterAlgorithm_v7");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, srcDesc, diffDesc, convDesc, gradDesc,
-                  requestedAlgoCount, returnedAlgoCount, perfResults);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionBackwardFilterWorkspaceSize(
-    cudnnHandle_t handle, const cudnnTensorDescriptor_t xDesc,
-    const cudnnTensorDescriptor_t dyDesc,
-    const cudnnConvolutionDescriptor_t convDesc,
-    const cudnnFilterDescriptor_t gradDesc,
-    cudnnConvolutionBwdFilterAlgo_t algo, size_t *sizeInBytes) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnTensorDescriptor_t,
-      const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t,
-      const cudnnFilterDescriptor_t, cudnnConvolutionBwdFilterAlgo_t, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnGetConvolutionBackwardFilterWorkspaceSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, xDesc, dyDesc, convDesc, gradDesc, algo, sizeInBytes);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnConvolutionBackwardFilter(
-    cudnnHandle_t handle, const void *alpha,
-    const cudnnTensorDescriptor_t xDesc, const void *x,
-    const cudnnTensorDescriptor_t dyDesc, const void *dy,
-    const cudnnConvolutionDescriptor_t convDesc,
-    cudnnConvolutionBwdFilterAlgo_t algo, void *workSpace,
-    size_t workSpaceSizeInBytes, const void *beta,
-    const cudnnFilterDescriptor_t dwDesc, void *dw) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const void *, const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnConvolutionDescriptor_t, cudnnConvolutionBwdFilterAlgo_t,
-      void *, size_t, const void *, const cudnnFilterDescriptor_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnConvolutionBackwardFilter");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, alpha, xDesc, x, dyDesc, dy, convDesc, algo,
-                  workSpace, workSpaceSizeInBytes, beta, dwDesc, dw);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionBackwardDataAlgorithmMaxCount(
-    cudnnHandle_t handle, int *count) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnHandle_t, int *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnGetConvolutionBackwardDataAlgorithmMaxCount");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, count);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnFindConvolutionBackwardDataAlgorithm(
-    cudnnHandle_t handle, const cudnnFilterDescriptor_t wDesc,
-    const cudnnTensorDescriptor_t dyDesc,
-    const cudnnConvolutionDescriptor_t convDesc,
-    const cudnnTensorDescriptor_t dxDesc, const int requestedAlgoCount,
-    int *returnedAlgoCount, cudnnConvolutionBwdDataAlgoPerf_t *perfResults) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnFilterDescriptor_t,
-      const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t,
-      const cudnnTensorDescriptor_t, const int, int *,
-      cudnnConvolutionBwdDataAlgoPerf_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnFindConvolutionBackwardDataAlgorithm");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, wDesc, dyDesc, convDesc, dxDesc, requestedAlgoCount,
-                  returnedAlgoCount, perfResults);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnFindConvolutionBackwardDataAlgorithmEx(
-    cudnnHandle_t handle, const cudnnFilterDescriptor_t wDesc, const void *w,
-    const cudnnTensorDescriptor_t dyDesc, const void *dy,
-    const cudnnConvolutionDescriptor_t convDesc,
-    const cudnnTensorDescriptor_t dxDesc, void *dx,
-    const int requestedAlgoCount, int *returnedAlgoCount,
-    cudnnConvolutionBwdDataAlgoPerf_t *perfResults, void *workSpace,
-    size_t workSpaceSizeInBytes) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnFilterDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t, void *,
-      const int, int *, cudnnConvolutionBwdDataAlgoPerf_t *, void *, size_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnFindConvolutionBackwardDataAlgorithmEx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, wDesc, w, dyDesc, dy, convDesc, dxDesc, dx,
-                  requestedAlgoCount, returnedAlgoCount, perfResults, workSpace,
-                  workSpaceSizeInBytes);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionBackwardDataAlgorithm(
-    cudnnHandle_t handle, const cudnnFilterDescriptor_t wDesc,
-    const cudnnTensorDescriptor_t dyDesc,
-    const cudnnConvolutionDescriptor_t convDesc,
-    const cudnnTensorDescriptor_t dxDesc,
-    cudnnConvolutionBwdDataPreference_t preference, size_t memoryLimitInBytes,
-    cudnnConvolutionBwdDataAlgo_t *algo) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnFilterDescriptor_t,
-      const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t,
-      const cudnnTensorDescriptor_t, cudnnConvolutionBwdDataPreference_t,
-      size_t, cudnnConvolutionBwdDataAlgo_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnGetConvolutionBackwardDataAlgorithm");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, wDesc, dyDesc, convDesc, dxDesc, preference,
-                  memoryLimitInBytes, algo);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionBackwardDataAlgorithm_v7(
-    cudnnHandle_t handle, const cudnnFilterDescriptor_t filterDesc,
-    const cudnnTensorDescriptor_t diffDesc,
-    const cudnnConvolutionDescriptor_t convDesc,
-    const cudnnTensorDescriptor_t gradDesc, const int requestedAlgoCount,
-    int *returnedAlgoCount, cudnnConvolutionBwdDataAlgoPerf_t *perfResults) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnFilterDescriptor_t,
-      const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t,
-      const cudnnTensorDescriptor_t, const int, int *,
-      cudnnConvolutionBwdDataAlgoPerf_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnGetConvolutionBackwardDataAlgorithm_v7");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, filterDesc, diffDesc, convDesc, gradDesc,
-                  requestedAlgoCount, returnedAlgoCount, perfResults);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionBackwardDataWorkspaceSize(
-    cudnnHandle_t handle, const cudnnFilterDescriptor_t wDesc,
-    const cudnnTensorDescriptor_t dyDesc,
-    const cudnnConvolutionDescriptor_t convDesc,
-    const cudnnTensorDescriptor_t dxDesc, cudnnConvolutionBwdDataAlgo_t algo,
-    size_t *sizeInBytes) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnFilterDescriptor_t,
-      const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t,
-      const cudnnTensorDescriptor_t, cudnnConvolutionBwdDataAlgo_t, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnGetConvolutionBackwardDataWorkspaceSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, wDesc, dyDesc, convDesc, dxDesc, algo, sizeInBytes);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnConvolutionBackwardData(
-    cudnnHandle_t handle, const void *alpha,
-    const cudnnFilterDescriptor_t wDesc, const void *w,
-    const cudnnTensorDescriptor_t dyDesc, const void *dy,
-    const cudnnConvolutionDescriptor_t convDesc,
-    cudnnConvolutionBwdDataAlgo_t algo, void *workSpace,
-    size_t workSpaceSizeInBytes, const void *beta,
-    const cudnnTensorDescriptor_t dxDesc, void *dx) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const void *, const cudnnFilterDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnConvolutionDescriptor_t, cudnnConvolutionBwdDataAlgo_t, void *,
-      size_t, const void *, const cudnnTensorDescriptor_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnConvolutionBackwardData");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, alpha, wDesc, w, dyDesc, dy, convDesc, algo,
-                  workSpace, workSpaceSizeInBytes, beta, dxDesc, dx);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnIm2Col(cudnnHandle_t handle, const cudnnTensorDescriptor_t xDesc,
-            const void *x, const cudnnFilterDescriptor_t wDesc,
-            const cudnnConvolutionDescriptor_t convDesc, void *colBuffer) {
-  using FuncPtr =
-      cudnnStatus_t(CUDNNWINAPI *)(cudnnHandle_t, const cudnnTensorDescriptor_t,
-                                   const void *, const cudnnFilterDescriptor_t,
-                                   const cudnnConvolutionDescriptor_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnIm2Col");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, xDesc, x, wDesc, convDesc, colBuffer);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSoftmaxForward(
-    cudnnHandle_t handle, cudnnSoftmaxAlgorithm_t algo, cudnnSoftmaxMode_t mode,
-    const void *alpha, const cudnnTensorDescriptor_t xDesc, const void *x,
-    const void *beta, const cudnnTensorDescriptor_t yDesc, void *y) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, cudnnSoftmaxAlgorithm_t, cudnnSoftmaxMode_t, const void *,
-      const cudnnTensorDescriptor_t, const void *, const void *,
-      const cudnnTensorDescriptor_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSoftmaxForward");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, algo, mode, alpha, xDesc, x, beta, yDesc, y);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSoftmaxBackward(
-    cudnnHandle_t handle, cudnnSoftmaxAlgorithm_t algo, cudnnSoftmaxMode_t mode,
-    const void *alpha, const cudnnTensorDescriptor_t yDesc, const void *y,
-    const cudnnTensorDescriptor_t dyDesc, const void *dy, const void *beta,
-    const cudnnTensorDescriptor_t dxDesc, void *dx) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, cudnnSoftmaxAlgorithm_t, cudnnSoftmaxMode_t, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *, const void *,
-      const cudnnTensorDescriptor_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSoftmaxBackward");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, algo, mode, alpha, yDesc, y, dyDesc, dy, beta, dxDesc,
-                  dx);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnCreatePoolingDescriptor(cudnnPoolingDescriptor_t *poolingDesc) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnPoolingDescriptor_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnCreatePoolingDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(poolingDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetPooling2dDescriptor(
-    cudnnPoolingDescriptor_t poolingDesc, cudnnPoolingMode_t mode,
-    cudnnNanPropagation_t maxpoolingNanOpt, int windowHeight, int windowWidth,
-    int verticalPadding, int horizontalPadding, int verticalStride,
-    int horizontalStride) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnPoolingDescriptor_t, cudnnPoolingMode_t, cudnnNanPropagation_t, int,
-      int, int, int, int, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetPooling2dDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(poolingDesc, mode, maxpoolingNanOpt, windowHeight,
-                  windowWidth, verticalPadding, horizontalPadding,
-                  verticalStride, horizontalStride);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetPooling2dDescriptor(
-    const cudnnPoolingDescriptor_t poolingDesc, cudnnPoolingMode_t *mode,
-    cudnnNanPropagation_t *maxpoolingNanOpt, int *windowHeight,
-    int *windowWidth, int *verticalPadding, int *horizontalPadding,
-    int *verticalStride, int *horizontalStride) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      const cudnnPoolingDescriptor_t, cudnnPoolingMode_t *,
-      cudnnNanPropagation_t *, int *, int *, int *, int *, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetPooling2dDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(poolingDesc, mode, maxpoolingNanOpt, windowHeight,
-                  windowWidth, verticalPadding, horizontalPadding,
-                  verticalStride, horizontalStride);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetPoolingNdDescriptor(
-    cudnnPoolingDescriptor_t poolingDesc, const cudnnPoolingMode_t mode,
-    const cudnnNanPropagation_t maxpoolingNanOpt, int nbDims,
-    const int windowDimA[], const int paddingA[], const int strideA[]) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnPoolingDescriptor_t, const cudnnPoolingMode_t,
-      const cudnnNanPropagation_t, int, const int[], const int[], const int[]);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetPoolingNdDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(poolingDesc, mode, maxpoolingNanOpt, nbDims, windowDimA,
-                  paddingA, strideA);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetPoolingNdDescriptor(
-    const cudnnPoolingDescriptor_t poolingDesc, int nbDimsRequested,
-    cudnnPoolingMode_t *mode, cudnnNanPropagation_t *maxpoolingNanOpt,
-    int *nbDims, int windowDimA[], int paddingA[], int strideA[]) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      const cudnnPoolingDescriptor_t, int, cudnnPoolingMode_t *,
-      cudnnNanPropagation_t *, int *, int[], int[], int[]);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetPoolingNdDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(poolingDesc, nbDimsRequested, mode, maxpoolingNanOpt, nbDims,
-                  windowDimA, paddingA, strideA);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnGetPoolingNdForwardOutputDim(const cudnnPoolingDescriptor_t poolingDesc,
-                                  const cudnnTensorDescriptor_t inputTensorDesc,
-                                  int nbDims, int outputTensorDimA[]) {
-  using FuncPtr =
-      cudnnStatus_t(CUDNNWINAPI *)(const cudnnPoolingDescriptor_t,
-                                   const cudnnTensorDescriptor_t, int, int[]);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnGetPoolingNdForwardOutputDim");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(poolingDesc, inputTensorDesc, nbDims, outputTensorDimA);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnGetPooling2dForwardOutputDim(const cudnnPoolingDescriptor_t poolingDesc,
-                                  const cudnnTensorDescriptor_t inputTensorDesc,
-                                  int *n, int *c, int *h, int *w) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(const cudnnPoolingDescriptor_t,
-                                               const cudnnTensorDescriptor_t,
-                                               int *, int *, int *, int *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnGetPooling2dForwardOutputDim");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(poolingDesc, inputTensorDesc, n, c, h, w);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnDestroyPoolingDescriptor(cudnnPoolingDescriptor_t poolingDesc) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnPoolingDescriptor_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDestroyPoolingDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(poolingDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnPoolingForward(
-    cudnnHandle_t handle, const cudnnPoolingDescriptor_t poolingDesc,
-    const void *alpha, const cudnnTensorDescriptor_t xDesc, const void *x,
-    const void *beta, const cudnnTensorDescriptor_t yDesc, void *y) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnPoolingDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *, const void *,
-      const cudnnTensorDescriptor_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnPoolingForward");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, poolingDesc, alpha, xDesc, x, beta, yDesc, y);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnPoolingBackward(
-    cudnnHandle_t handle, const cudnnPoolingDescriptor_t poolingDesc,
-    const void *alpha, const cudnnTensorDescriptor_t yDesc, const void *y,
-    const cudnnTensorDescriptor_t dyDesc, const void *dy,
-    const cudnnTensorDescriptor_t xDesc, const void *x, const void *beta,
-    const cudnnTensorDescriptor_t dxDesc, void *dx) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnPoolingDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *, const void *,
-      const cudnnTensorDescriptor_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnPoolingBackward");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, poolingDesc, alpha, yDesc, y, dyDesc, dy, xDesc, x,
-                  beta, dxDesc, dx);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnCreateActivationDescriptor(cudnnActivationDescriptor_t *activationDesc) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnActivationDescriptor_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnCreateActivationDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(activationDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetActivationDescriptor(
-    cudnnActivationDescriptor_t activationDesc, cudnnActivationMode_t mode,
-    cudnnNanPropagation_t reluNanOpt, double coef) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnActivationDescriptor_t,
-                                               cudnnActivationMode_t,
-                                               cudnnNanPropagation_t, double);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetActivationDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(activationDesc, mode, reluNanOpt, coef);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnGetActivationDescriptor(const cudnnActivationDescriptor_t activationDesc,
-                             cudnnActivationMode_t *mode,
-                             cudnnNanPropagation_t *reluNanOpt, double *coef) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      const cudnnActivationDescriptor_t, cudnnActivationMode_t *,
-      cudnnNanPropagation_t *, double *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetActivationDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(activationDesc, mode, reluNanOpt, coef);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnDestroyActivationDescriptor(cudnnActivationDescriptor_t activationDesc) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnActivationDescriptor_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnDestroyActivationDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(activationDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnActivationForward(
-    cudnnHandle_t handle, cudnnActivationDescriptor_t activationDesc,
-    const void *alpha, const cudnnTensorDescriptor_t xDesc, const void *x,
-    const void *beta, const cudnnTensorDescriptor_t yDesc, void *y) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, cudnnActivationDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *, const void *,
-      const cudnnTensorDescriptor_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnActivationForward");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, activationDesc, alpha, xDesc, x, beta, yDesc, y);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnActivationBackward(
-    cudnnHandle_t handle, cudnnActivationDescriptor_t activationDesc,
-    const void *alpha, const cudnnTensorDescriptor_t yDesc, const void *y,
-    const cudnnTensorDescriptor_t dyDesc, const void *dy,
-    const cudnnTensorDescriptor_t xDesc, const void *x, const void *beta,
-    const cudnnTensorDescriptor_t dxDesc, void *dx) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, cudnnActivationDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *, const void *,
-      const cudnnTensorDescriptor_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnActivationBackward");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, activationDesc, alpha, yDesc, y, dyDesc, dy, xDesc, x,
-                  beta, dxDesc, dx);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnCreateLRNDescriptor(cudnnLRNDescriptor_t *normDesc) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnLRNDescriptor_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnCreateLRNDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(normDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetLRNDescriptor(cudnnLRNDescriptor_t normDesc,
-                                                unsigned lrnN, double lrnAlpha,
-                                                double lrnBeta, double lrnK) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnLRNDescriptor_t, unsigned int, double, double, double);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetLRNDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(normDesc, lrnN, lrnAlpha, lrnBeta, lrnK);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetLRNDescriptor(cudnnLRNDescriptor_t normDesc,
-                                                unsigned *lrnN,
-                                                double *lrnAlpha,
-                                                double *lrnBeta, double *lrnK) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnLRNDescriptor_t, unsigned int *, double *, double *, double *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetLRNDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(normDesc, lrnN, lrnAlpha, lrnBeta, lrnK);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnDestroyLRNDescriptor(cudnnLRNDescriptor_t lrnDesc) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnLRNDescriptor_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDestroyLRNDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(lrnDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnLRNCrossChannelForward(
-    cudnnHandle_t handle, cudnnLRNDescriptor_t normDesc, cudnnLRNMode_t lrnMode,
-    const void *alpha, const cudnnTensorDescriptor_t xDesc, const void *x,
-    const void *beta, const cudnnTensorDescriptor_t yDesc, void *y) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, cudnnLRNDescriptor_t, cudnnLRNMode_t, const void *,
-      const cudnnTensorDescriptor_t, const void *, const void *,
-      const cudnnTensorDescriptor_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnLRNCrossChannelForward");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, normDesc, lrnMode, alpha, xDesc, x, beta, yDesc, y);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnLRNCrossChannelBackward(
-    cudnnHandle_t handle, cudnnLRNDescriptor_t normDesc, cudnnLRNMode_t lrnMode,
-    const void *alpha, const cudnnTensorDescriptor_t yDesc, const void *y,
-    const cudnnTensorDescriptor_t dyDesc, const void *dy,
-    const cudnnTensorDescriptor_t xDesc, const void *x, const void *beta,
-    const cudnnTensorDescriptor_t dxDesc, void *dx) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, cudnnLRNDescriptor_t, cudnnLRNMode_t, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *, const void *,
-      const cudnnTensorDescriptor_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnLRNCrossChannelBackward");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, normDesc, lrnMode, alpha, yDesc, y, dyDesc, dy, xDesc,
-                  x, beta, dxDesc, dx);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnDivisiveNormalizationForward(
-    cudnnHandle_t handle, cudnnLRNDescriptor_t normDesc,
-    cudnnDivNormMode_t mode, const void *alpha,
-    const cudnnTensorDescriptor_t xDesc, /* same desc for means, temp, temp2 */
-    const void *x,
-    const void *means, /* if NULL, means are assumed to be zero */
-    void *temp, void *temp2, const void *beta,
-    const cudnnTensorDescriptor_t yDesc, void *y) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, cudnnLRNDescriptor_t, cudnnDivNormMode_t, const void *,
-      const cudnnTensorDescriptor_t, const void *, const void *, void *, void *,
-      const void *, const cudnnTensorDescriptor_t, void *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnDivisiveNormalizationForward");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, normDesc, mode, alpha, xDesc, x, means, temp, temp2,
-                  beta, yDesc, y);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnDivisiveNormalizationBackward(
-    cudnnHandle_t handle, cudnnLRNDescriptor_t normDesc,
-    cudnnDivNormMode_t mode, const void *alpha,
-    const cudnnTensorDescriptor_t
-        xDesc, /* same desc for x, means, dy, temp, temp2 */
-    const void *x,
-    const void *means, /* if NULL, means are assumed to be zero */
-    const void *dy, void *temp, void *temp2, const void *beta,
-    const cudnnTensorDescriptor_t dXdMeansDesc, /* same desc for dx, dMeans */
-    void *dx,                                   /* output x differential */
-    void *dMeans) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, cudnnLRNDescriptor_t, cudnnDivNormMode_t, const void *,
-      const cudnnTensorDescriptor_t, const void *, const void *, const void *,
-      void *, void *, const void *, const cudnnTensorDescriptor_t, void *,
-      void *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnDivisiveNormalizationBackward");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, normDesc, mode, alpha, xDesc, x, means, dy, temp,
-                  temp2, beta, dXdMeansDesc, dx, dMeans);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnDeriveBNTensorDescriptor(
-    cudnnTensorDescriptor_t derivedBnDesc, const cudnnTensorDescriptor_t xDesc,
-    cudnnBatchNormMode_t mode) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnTensorDescriptor_t,
-                                               const cudnnTensorDescriptor_t,
-                                               cudnnBatchNormMode_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDeriveBNTensorDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(derivedBnDesc, xDesc, mode);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize(
-    cudnnHandle_t handle, cudnnBatchNormMode_t mode, cudnnBatchNormOps_t bnOps,
-    const cudnnTensorDescriptor_t xDesc, const cudnnTensorDescriptor_t zDesc,
-    const cudnnTensorDescriptor_t yDesc,
-    const cudnnTensorDescriptor_t bnScaleBiasMeanVarDesc,
-    const cudnnActivationDescriptor_t activationDesc, size_t *sizeInBytes) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, cudnnBatchNormMode_t, cudnnBatchNormOps_t,
-      const cudnnTensorDescriptor_t, const cudnnTensorDescriptor_t,
-      const cudnnTensorDescriptor_t, const cudnnTensorDescriptor_t,
-      const cudnnActivationDescriptor_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>(
-      "cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, mode, bnOps, xDesc, zDesc, yDesc,
-                  bnScaleBiasMeanVarDesc, activationDesc, sizeInBytes);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetBatchNormalizationBackwardExWorkspaceSize(
-    cudnnHandle_t handle, cudnnBatchNormMode_t mode, cudnnBatchNormOps_t bnOps,
-    const cudnnTensorDescriptor_t xDesc, const cudnnTensorDescriptor_t yDesc,
-    const cudnnTensorDescriptor_t dyDesc, const cudnnTensorDescriptor_t dzDesc,
-    const cudnnTensorDescriptor_t dxDesc,
-    const cudnnTensorDescriptor_t dBnScaleBiasDesc,
-    const cudnnActivationDescriptor_t activationDesc, size_t *sizeInBytes) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, cudnnBatchNormMode_t, cudnnBatchNormOps_t,
-      const cudnnTensorDescriptor_t, const cudnnTensorDescriptor_t,
-      const cudnnTensorDescriptor_t, const cudnnTensorDescriptor_t,
-      const cudnnTensorDescriptor_t, const cudnnTensorDescriptor_t,
-      const cudnnActivationDescriptor_t, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnGetBatchNormalizationBackwardExWorkspaceSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, mode, bnOps, xDesc, yDesc, dyDesc, dzDesc, dxDesc,
-                  dBnScaleBiasDesc, activationDesc, sizeInBytes);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetBatchNormalizationTrainingExReserveSpaceSize(
-    cudnnHandle_t handle, cudnnBatchNormMode_t mode, cudnnBatchNormOps_t bnOps,
-    const cudnnActivationDescriptor_t activationDesc,
-    const cudnnTensorDescriptor_t xDesc, size_t *sizeInBytes) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, cudnnBatchNormMode_t, cudnnBatchNormOps_t,
-      const cudnnActivationDescriptor_t, const cudnnTensorDescriptor_t,
-      size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>(
-      "cudnnGetBatchNormalizationTrainingExReserveSpaceSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, mode, bnOps, activationDesc, xDesc, sizeInBytes);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnBatchNormalizationForwardTraining(
-    cudnnHandle_t handle, cudnnBatchNormMode_t mode,
-
-    const void *alpha, /* alpha[0] = result blend factor */
-    const void *beta,  /* beta[0] = dest layer blend factor */
-
-    const cudnnTensorDescriptor_t xDesc, const void *x, /* NxCxHxW */
-    const cudnnTensorDescriptor_t yDesc, void *y,       /* NxCxHxW */
-
-    /* Shared desc for the next 6 tensors in the argument list.
-       Data type to be set as follows:
-       type = (typeOf(x) == double) ? double : float
-       Dimensions for this descriptor depend on normalization mode
-       - Spatial Normalization : tensors are expected to have dims 1xCx1x1
-        (normalization is performed across NxHxW)
-       - Per-Activation Normalization : tensors are expected to have dims of
-       1xCxHxW (normalization is performed across N) */
-    const cudnnTensorDescriptor_t bnScaleBiasMeanVarDesc,
-
-    /* 'Gamma' and 'Beta' respectively in Ioffe and Szegedy's paper's notation
-     */
-    const void *bnScale, const void *bnBias,
-
-    /* MUST use factor=1 in the very first call of a complete training cycle.
-       Use a factor=1/(1+n) at N-th call to the function to get
-       Cumulative Moving Average (CMA) behavior
-       CMA[n] = (x[1]+...+x[n])/n
-       Since CMA[n+1] = (n*CMA[n]+x[n+1])/(n+1) =
-       ((n+1)*CMA[n]-CMA[n])/(n+1) + x[n+1]/(n+1) =
-       CMA[n]*(1-1/(n+1)) + x[n+1]*1/(n+1) */
-    double exponentialAverageFactor,
-
-    /* Used in Training phase only.
-       runningMean = newMean*factor + runningMean*(1-factor) */
-    void *resultRunningMean,
-    /* Output in training mode, input in inference. Is the moving average
-       of  variance[x] (factor is applied in the same way as for runningMean) */
-    void *resultRunningVariance,
-
-    /* Has to be >= CUDNN_BN_MIN_EPSILON. Should be the same in forward and
-       backward functions. */
-    double epsilon,
-
-    /* Optionally save intermediate results from the forward pass here
-       - can be reused to speed up backward pass. NULL if unused */
-    void *resultSaveMean, void *resultSaveInvVariance) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, cudnnBatchNormMode_t, const void *, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, void *, const cudnnTensorDescriptor_t,
-      const void *, const void *, double, void *, void *, double, void *,
-      void *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnBatchNormalizationForwardTraining");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(
-      handle, mode, alpha, beta, xDesc, x, yDesc, y, bnScaleBiasMeanVarDesc,
-      bnScale, bnBias, exponentialAverageFactor, resultRunningMean,
-      resultRunningVariance, epsilon, resultSaveMean, resultSaveInvVariance);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnBatchNormalizationForwardTrainingEx(
-    cudnnHandle_t handle, cudnnBatchNormMode_t mode, cudnnBatchNormOps_t bnOps,
-
-    const void *alpha, /* alpha[0] = result blend factor */
-    const void *beta,  /* beta[0] = dest layer blend factor */
-
-    const cudnnTensorDescriptor_t xDesc, const void *xData,
-    const cudnnTensorDescriptor_t zDesc, const void *zData,
-    const cudnnTensorDescriptor_t yDesc, void *yData,
-
-    const cudnnTensorDescriptor_t bnScaleBiasMeanVarDesc, const void *bnScale,
-    const void *bnBias,
-
-    double exponentialAverageFactor, void *resultRunningMean,
-    void *resultRunningVariance,
-
-    /* Has to be >= CUDNN_BN_MIN_EPSILON. Should be the same in forward and
-       backward functions. */
-    double epsilon,
-
-    /* Optionally save intermediate results from the forward pass here
-       - can be reused to speed up backward pass. NULL if unused */
-    void *resultSaveMean, void *resultSaveInvVariance,
-
-    cudnnActivationDescriptor_t activationDesc, void *workspace,
-    size_t workSpaceSizeInBytes, void *reserveSpace,
-    size_t reserveSpaceSizeInBytes) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, cudnnBatchNormMode_t, cudnnBatchNormOps_t, const void *,
-      const void *, const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, void *, const cudnnTensorDescriptor_t,
-      const void *, const void *, double, void *, void *, double, void *,
-      void *, cudnnActivationDescriptor_t, void *, size_t, void *, size_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnBatchNormalizationForwardTrainingEx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, mode, bnOps, alpha, beta, xDesc, xData, zDesc, zData,
-                  yDesc, yData, bnScaleBiasMeanVarDesc, bnScale, bnBias,
-                  exponentialAverageFactor, resultRunningMean,
-                  resultRunningVariance, epsilon, resultSaveMean,
-                  resultSaveInvVariance, activationDesc, workspace,
-                  workSpaceSizeInBytes, reserveSpace, reserveSpaceSizeInBytes);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnBatchNormalizationForwardInference(
-    cudnnHandle_t handle, cudnnBatchNormMode_t mode,
-    const void *alpha, /* alpha[0] = result blend factor */
-    const void *beta,  /* beta[0] = dest layer blend factor */
-    const cudnnTensorDescriptor_t xDesc, const void *x, /* NxCxHxW */
-    const cudnnTensorDescriptor_t yDesc, void *y,       /* NxCxHxW */
-    const cudnnTensorDescriptor_t bnScaleBiasMeanVarDesc, const void *bnScale,
-    const void *bnBias, const void *estimatedMean,
-    const void *estimatedVariance, double epsilon) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, cudnnBatchNormMode_t, const void *, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, void *, const cudnnTensorDescriptor_t,
-      const void *, const void *, const void *, const void *, double);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnBatchNormalizationForwardInference");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, mode, alpha, beta, xDesc, x, yDesc, y,
-                  bnScaleBiasMeanVarDesc, bnScale, bnBias, estimatedMean,
-                  estimatedVariance, epsilon);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnBatchNormalizationBackward(
-    cudnnHandle_t handle, cudnnBatchNormMode_t mode, const void *alphaDataDiff,
-    const void *betaDataDiff, const void *alphaParamDiff,
-    const void *betaParamDiff,
-    const cudnnTensorDescriptor_t xDesc, /* same desc for x, dx, dy */
-    const void *x, const cudnnTensorDescriptor_t dyDesc, const void *dy,
-    const cudnnTensorDescriptor_t dxDesc, void *dx,
-    /* Shared tensor desc for the 4 tensors below */
-    const cudnnTensorDescriptor_t dBnScaleBiasDesc,
-    const void *bnScale, /* bnBias doesn't affect backpropagation */
-    /* scale and bias diff are not backpropagated below this layer */
-    void *dBnScaleResult, void *dBnBiasResult,
-    /* Same epsilon as forward pass */
-    double epsilon,
-
-    /* Optionally cached intermediate results from
-       forward pass */
-    const void *savedMean, const void *savedInvVariance) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, cudnnBatchNormMode_t, const void *, const void *,
-      const void *, const void *, const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, void *, const cudnnTensorDescriptor_t,
-      const void *, void *, void *, double, const void *, const void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnBatchNormalizationBackward");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, mode, alphaDataDiff, betaDataDiff, alphaParamDiff,
-                  betaParamDiff, xDesc, x, dyDesc, dy, dxDesc, dx,
-                  dBnScaleBiasDesc, bnScale, dBnScaleResult, dBnBiasResult,
-                  epsilon, savedMean, savedInvVariance);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnBatchNormalizationBackwardEx(
-    cudnnHandle_t handle, cudnnBatchNormMode_t mode, cudnnBatchNormOps_t bnOps,
-
-    const void *alphaDataDiff, const void *betaDataDiff,
-    const void *alphaParamDiff, const void *betaParamDiff,
-    const cudnnTensorDescriptor_t xDesc, const void *xData,
-    const cudnnTensorDescriptor_t yDesc, const void *yData,
-    const cudnnTensorDescriptor_t dyDesc, const void *dyData,
-    const cudnnTensorDescriptor_t dzDesc, void *dzData,
-    const cudnnTensorDescriptor_t dxDesc, void *dxData,
-
-    /* Shared tensor desc for the 4 tensors below */
-    const cudnnTensorDescriptor_t dBnScaleBiasDesc, const void *bnScaleData,
-    const void *bnBiasData, /* needed if there is activation */
-    void *dBnScaleData, void *dBnBiasData,
-    double epsilon, /* Same epsilon as forward pass */
-
-    /* Optionally cached intermediate results from
-       forward pass */
-    const void *savedMean, const void *savedInvVariance,
-    cudnnActivationDescriptor_t activationDesc, void *workSpace,
-    size_t workSpaceSizeInBytes, void *reserveSpace,
-    size_t reserveSpaceSizeInBytes) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, cudnnBatchNormMode_t, cudnnBatchNormOps_t, const void *,
-      const void *, const void *, const void *, const cudnnTensorDescriptor_t,
-      const void *, const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, void *, const cudnnTensorDescriptor_t,
-      void *, const cudnnTensorDescriptor_t, const void *, const void *, void *,
-      void *, double, const void *, const void *, cudnnActivationDescriptor_t,
-      void *, size_t, void *, size_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnBatchNormalizationBackwardEx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(
-      handle, mode, bnOps, alphaDataDiff, betaDataDiff, alphaParamDiff,
-      betaParamDiff, xDesc, xData, yDesc, yData, dyDesc, dyData, dzDesc, dzData,
-      dxDesc, dxData, dBnScaleBiasDesc, bnScaleData, bnBiasData, dBnScaleData,
-      dBnBiasData, epsilon, savedMean, savedInvVariance, activationDesc,
-      workSpace, workSpaceSizeInBytes, reserveSpace, reserveSpaceSizeInBytes);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnCreateSpatialTransformerDescriptor(
-    cudnnSpatialTransformerDescriptor_t *stDesc) {
-  using FuncPtr =
-      cudnnStatus_t(CUDNNWINAPI *)(cudnnSpatialTransformerDescriptor_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnCreateSpatialTransformerDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(stDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetSpatialTransformerNdDescriptor(
-    cudnnSpatialTransformerDescriptor_t stDesc, cudnnSamplerType_t samplerType,
-    cudnnDataType_t dataType, const int nbDims, const int dimA[]) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnSpatialTransformerDescriptor_t, cudnnSamplerType_t, cudnnDataType_t,
-      const int, const int[]);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnSetSpatialTransformerNdDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(stDesc, samplerType, dataType, nbDims, dimA);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnDestroySpatialTransformerDescriptor(
-    cudnnSpatialTransformerDescriptor_t stDesc) {
-  using FuncPtr =
-      cudnnStatus_t(CUDNNWINAPI *)(cudnnSpatialTransformerDescriptor_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnDestroySpatialTransformerDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(stDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSpatialTfGridGeneratorForward(
-    cudnnHandle_t handle, const cudnnSpatialTransformerDescriptor_t stDesc,
-    const void *theta, void *grid) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnSpatialTransformerDescriptor_t, const void *,
-      void *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnSpatialTfGridGeneratorForward");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, stDesc, theta, grid);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSpatialTfGridGeneratorBackward(
-    cudnnHandle_t handle, const cudnnSpatialTransformerDescriptor_t stDesc,
-    const void *dgrid, void *dtheta) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnSpatialTransformerDescriptor_t, const void *,
-      void *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnSpatialTfGridGeneratorBackward");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, stDesc, dgrid, dtheta);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSpatialTfSamplerForward(
-    cudnnHandle_t handle, cudnnSpatialTransformerDescriptor_t stDesc,
-    const void *alpha, const cudnnTensorDescriptor_t xDesc, const void *x,
-    const void *grid, const void *beta, cudnnTensorDescriptor_t yDesc,
-    void *y) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, cudnnSpatialTransformerDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *, const void *, const void *,
-      cudnnTensorDescriptor_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSpatialTfSamplerForward");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, stDesc, alpha, xDesc, x, grid, beta, yDesc, y);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSpatialTfSamplerBackward(
-    cudnnHandle_t handle, cudnnSpatialTransformerDescriptor_t stDesc,
-    const void *alpha, const cudnnTensorDescriptor_t xDesc, const void *x,
-    const void *beta, const cudnnTensorDescriptor_t dxDesc, void *dx,
-    const void *alphaDgrid, const cudnnTensorDescriptor_t dyDesc,
-    const void *dy, const void *grid, const void *betaDgrid, void *dgrid) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, cudnnSpatialTransformerDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *, const void *,
-      const cudnnTensorDescriptor_t, void *, const void *,
-      const cudnnTensorDescriptor_t, const void *, const void *, const void *,
-      void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSpatialTfSamplerBackward");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, stDesc, alpha, xDesc, x, beta, dxDesc, dx, alphaDgrid,
-                  dyDesc, dy, grid, betaDgrid, dgrid);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnCreateDropoutDescriptor(cudnnDropoutDescriptor_t *dropoutDesc) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnDropoutDescriptor_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnCreateDropoutDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dropoutDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnDestroyDropoutDescriptor(cudnnDropoutDescriptor_t dropoutDesc) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnDropoutDescriptor_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDestroyDropoutDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dropoutDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnDropoutGetStatesSize(cudnnHandle_t handle,
-                                                    size_t *sizeInBytes) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnHandle_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDropoutGetStatesSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, sizeInBytes);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnDropoutGetReserveSpaceSize(
-    cudnnTensorDescriptor_t xdesc, size_t *sizeInBytes) {
-  using FuncPtr =
-      cudnnStatus_t(CUDNNWINAPI *)(cudnnTensorDescriptor_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDropoutGetReserveSpaceSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(xdesc, sizeInBytes);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetDropoutDescriptor(
-    cudnnDropoutDescriptor_t dropoutDesc, cudnnHandle_t handle, float dropout,
-    void *states, size_t stateSizeInBytes, unsigned long long seed) {
-  using FuncPtr =
-      cudnnStatus_t(CUDNNWINAPI *)(cudnnDropoutDescriptor_t, cudnnHandle_t,
-                                   float, void *, size_t, unsigned long long);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetDropoutDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dropoutDesc, handle, dropout, states, stateSizeInBytes, seed);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnRestoreDropoutDescriptor(
-    cudnnDropoutDescriptor_t dropoutDesc, cudnnHandle_t handle, float dropout,
-    void *states, size_t stateSizeInBytes, unsigned long long seed) {
-  using FuncPtr =
-      cudnnStatus_t(CUDNNWINAPI *)(cudnnDropoutDescriptor_t, cudnnHandle_t,
-                                   float, void *, size_t, unsigned long long);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnRestoreDropoutDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dropoutDesc, handle, dropout, states, stateSizeInBytes, seed);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetDropoutDescriptor(
-    cudnnDropoutDescriptor_t dropoutDesc, cudnnHandle_t handle, float *dropout,
-    void **states, unsigned long long *seed) {
-  using FuncPtr =
-      cudnnStatus_t(CUDNNWINAPI *)(cudnnDropoutDescriptor_t, cudnnHandle_t,
-                                   float *, void **, unsigned long long *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetDropoutDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dropoutDesc, handle, dropout, states, seed);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnDropoutForward(
-    cudnnHandle_t handle, const cudnnDropoutDescriptor_t dropoutDesc,
-    const cudnnTensorDescriptor_t xdesc, const void *x,
-    const cudnnTensorDescriptor_t ydesc, void *y, void *reserveSpace,
-    size_t reserveSpaceSizeInBytes) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnDropoutDescriptor_t,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, void *, void *, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDropoutForward");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dropoutDesc, xdesc, x, ydesc, y, reserveSpace,
-                  reserveSpaceSizeInBytes);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnDropoutBackward(
-    cudnnHandle_t handle, const cudnnDropoutDescriptor_t dropoutDesc,
-    const cudnnTensorDescriptor_t dydesc, const void *dy,
-    const cudnnTensorDescriptor_t dxdesc, void *dx, void *reserveSpace,
-    size_t reserveSpaceSizeInBytes) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnDropoutDescriptor_t,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, void *, void *, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDropoutBackward");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dropoutDesc, dydesc, dy, dxdesc, dx, reserveSpace,
-                  reserveSpaceSizeInBytes);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnCreateRNNDescriptor(cudnnRNNDescriptor_t *rnnDesc) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnRNNDescriptor_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnCreateRNNDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(rnnDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnDestroyRNNDescriptor(cudnnRNNDescriptor_t rnnDesc) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnRNNDescriptor_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDestroyRNNDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(rnnDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetRNNDescriptor(
-    cudnnHandle_t handle, cudnnRNNDescriptor_t rnnDesc, const int hiddenSize,
-    const int numLayers, cudnnDropoutDescriptor_t dropoutDesc,
-    cudnnRNNInputMode_t inputMode, cudnnDirectionMode_t direction,
-    cudnnRNNMode_t mode, cudnnRNNAlgo_t algo, cudnnDataType_t mathPrec) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, cudnnRNNDescriptor_t, const int, const int,
-      cudnnDropoutDescriptor_t, cudnnRNNInputMode_t, cudnnDirectionMode_t,
-      cudnnRNNMode_t, cudnnRNNAlgo_t, cudnnDataType_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetRNNDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, rnnDesc, hiddenSize, numLayers, dropoutDesc,
-                  inputMode, direction, mode, algo, mathPrec);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetRNNDescriptor(
-    cudnnHandle_t handle, cudnnRNNDescriptor_t rnnDesc, int *hiddenSize,
-    int *numLayers, cudnnDropoutDescriptor_t *dropoutDesc,
-    cudnnRNNInputMode_t *inputMode, cudnnDirectionMode_t *direction,
-    cudnnRNNMode_t *mode, cudnnRNNAlgo_t *algo, cudnnDataType_t *mathPrec) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, cudnnRNNDescriptor_t, int *, int *,
-      cudnnDropoutDescriptor_t *, cudnnRNNInputMode_t *, cudnnDirectionMode_t *,
-      cudnnRNNMode_t *, cudnnRNNAlgo_t *, cudnnDataType_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetRNNDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, rnnDesc, hiddenSize, numLayers, dropoutDesc,
-                  inputMode, direction, mode, algo, mathPrec);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnSetRNNMatrixMathType(cudnnRNNDescriptor_t rnnDesc, cudnnMathType_t mType) {
-  using FuncPtr =
-      cudnnStatus_t(CUDNNWINAPI *)(cudnnRNNDescriptor_t, cudnnMathType_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetRNNMatrixMathType");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(rnnDesc, mType);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetRNNMatrixMathType(
-    cudnnRNNDescriptor_t rnnDesc, cudnnMathType_t *mType) {
-  using FuncPtr =
-      cudnnStatus_t(CUDNNWINAPI *)(cudnnRNNDescriptor_t, cudnnMathType_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetRNNMatrixMathType");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(rnnDesc, mType);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetRNNBiasMode(cudnnRNNDescriptor_t rnnDesc,
-                                              cudnnRNNBiasMode_t biasMode) {
-  using FuncPtr =
-      cudnnStatus_t(CUDNNWINAPI *)(cudnnRNNDescriptor_t, cudnnRNNBiasMode_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetRNNBiasMode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(rnnDesc, biasMode);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetRNNBiasMode(cudnnRNNDescriptor_t rnnDesc,
-                                              cudnnRNNBiasMode_t *biasMode) {
-  using FuncPtr =
-      cudnnStatus_t(CUDNNWINAPI *)(cudnnRNNDescriptor_t, cudnnRNNBiasMode_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetRNNBiasMode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(rnnDesc, biasMode);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnRNNSetClip(cudnnHandle_t handle,
-                                          cudnnRNNDescriptor_t rnnDesc,
-                                          cudnnRNNClipMode_t clipMode,
-                                          cudnnNanPropagation_t clipNanOpt,
-                                          double lclip, double rclip) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, cudnnRNNDescriptor_t, cudnnRNNClipMode_t,
-      cudnnNanPropagation_t, double, double);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnRNNSetClip");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, rnnDesc, clipMode, clipNanOpt, lclip, rclip);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnRNNGetClip(cudnnHandle_t handle,
-                                          cudnnRNNDescriptor_t rnnDesc,
-                                          cudnnRNNClipMode_t *clipMode,
-                                          cudnnNanPropagation_t *clipNanOpt,
-                                          double *lclip, double *rclip) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, cudnnRNNDescriptor_t, cudnnRNNClipMode_t *,
-      cudnnNanPropagation_t *, double *, double *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnRNNGetClip");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, rnnDesc, clipMode, clipNanOpt, lclip, rclip);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnSetRNNProjectionLayers(cudnnHandle_t handle, cudnnRNNDescriptor_t rnnDesc,
-                            const int recProjSize, const int outProjSize) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, cudnnRNNDescriptor_t, const int, const int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetRNNProjectionLayers");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, rnnDesc, recProjSize, outProjSize);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetRNNProjectionLayers(
-    cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc, int *recProjSize,
-    int *outProjSize) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnRNNDescriptor_t, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetRNNProjectionLayers");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, rnnDesc, recProjSize, outProjSize);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnCreatePersistentRNNPlan(
-    cudnnRNNDescriptor_t rnnDesc, const int minibatch,
-    const cudnnDataType_t dataType, cudnnPersistentRNNPlan_t *plan) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnRNNDescriptor_t, const int,
-                                               const cudnnDataType_t,
-                                               cudnnPersistentRNNPlan_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnCreatePersistentRNNPlan");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(rnnDesc, minibatch, dataType, plan);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnDestroyPersistentRNNPlan(cudnnPersistentRNNPlan_t plan) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnPersistentRNNPlan_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDestroyPersistentRNNPlan");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(plan);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetPersistentRNNPlan(
-    cudnnRNNDescriptor_t rnnDesc, cudnnPersistentRNNPlan_t plan) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnRNNDescriptor_t,
-                                               cudnnPersistentRNNPlan_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetPersistentRNNPlan");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(rnnDesc, plan);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetRNNWorkspaceSize(
-    cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc,
-    const int seqLength, const cudnnTensorDescriptor_t *xDesc,
-    size_t *sizeInBytes) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnRNNDescriptor_t, const int,
-      const cudnnTensorDescriptor_t *, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetRNNWorkspaceSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, rnnDesc, seqLength, xDesc, sizeInBytes);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetRNNTrainingReserveSize(
-    cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc,
-    const int seqLength, const cudnnTensorDescriptor_t *xDesc,
-    size_t *sizeInBytes) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnRNNDescriptor_t, const int,
-      const cudnnTensorDescriptor_t *, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetRNNTrainingReserveSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, rnnDesc, seqLength, xDesc, sizeInBytes);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnGetRNNParamsSize(cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc,
-                      const cudnnTensorDescriptor_t xDesc, size_t *sizeInBytes,
-                      cudnnDataType_t dataType) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnRNNDescriptor_t, const cudnnTensorDescriptor_t,
-      size_t *, cudnnDataType_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetRNNParamsSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, rnnDesc, xDesc, sizeInBytes, dataType);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetRNNLinLayerMatrixParams(
-    cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc,
-    const int pseudoLayer, const cudnnTensorDescriptor_t xDesc,
-    const cudnnFilterDescriptor_t wDesc, const void *w, const int linLayerID,
-    cudnnFilterDescriptor_t linLayerMatDesc, void **linLayerMat) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnRNNDescriptor_t, const int,
-      const cudnnTensorDescriptor_t, const cudnnFilterDescriptor_t,
-      const void *, const int, cudnnFilterDescriptor_t, void **);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetRNNLinLayerMatrixParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, rnnDesc, pseudoLayer, xDesc, wDesc, w, linLayerID,
-                  linLayerMatDesc, linLayerMat);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetRNNLinLayerBiasParams(
-    cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc,
-    const int pseudoLayer, const cudnnTensorDescriptor_t xDesc,
-    const cudnnFilterDescriptor_t wDesc, const void *w, const int linLayerID,
-    cudnnFilterDescriptor_t linLayerBiasDesc, void **linLayerBias) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnRNNDescriptor_t, const int,
-      const cudnnTensorDescriptor_t, const cudnnFilterDescriptor_t,
-      const void *, const int, cudnnFilterDescriptor_t, void **);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetRNNLinLayerBiasParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, rnnDesc, pseudoLayer, xDesc, wDesc, w, linLayerID,
-                  linLayerBiasDesc, linLayerBias);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnRNNForwardInference(
-    cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc,
-    const int seqLength, const cudnnTensorDescriptor_t *xDesc, const void *x,
-    const cudnnTensorDescriptor_t hxDesc, const void *hx,
-    const cudnnTensorDescriptor_t cxDesc, const void *cx,
-    const cudnnFilterDescriptor_t wDesc, const void *w,
-    const cudnnTensorDescriptor_t *yDesc, void *y,
-    const cudnnTensorDescriptor_t hyDesc, void *hy,
-    const cudnnTensorDescriptor_t cyDesc, void *cy, void *workspace,
-    size_t workSpaceSizeInBytes) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnRNNDescriptor_t, const int,
-      const cudnnTensorDescriptor_t *, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnFilterDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t *, void *, const cudnnTensorDescriptor_t,
-      void *, const cudnnTensorDescriptor_t, void *, void *, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnRNNForwardInference");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, rnnDesc, seqLength, xDesc, x, hxDesc, hx, cxDesc, cx,
-                  wDesc, w, yDesc, y, hyDesc, hy, cyDesc, cy, workspace,
-                  workSpaceSizeInBytes);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnRNNForwardTraining(
-    cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc,
-    const int seqLength, const cudnnTensorDescriptor_t *xDesc, const void *x,
-    const cudnnTensorDescriptor_t hxDesc, const void *hx,
-    const cudnnTensorDescriptor_t cxDesc, const void *cx,
-    const cudnnFilterDescriptor_t wDesc, const void *w,
-    const cudnnTensorDescriptor_t *yDesc, void *y,
-    const cudnnTensorDescriptor_t hyDesc, void *hy,
-    const cudnnTensorDescriptor_t cyDesc, void *cy, void *workspace,
-    size_t workSpaceSizeInBytes, void *reserveSpace,
-    size_t reserveSpaceSizeInBytes) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnRNNDescriptor_t, const int,
-      const cudnnTensorDescriptor_t *, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnFilterDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t *, void *, const cudnnTensorDescriptor_t,
-      void *, const cudnnTensorDescriptor_t, void *, void *, size_t, void *,
-      size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnRNNForwardTraining");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, rnnDesc, seqLength, xDesc, x, hxDesc, hx, cxDesc, cx,
-                  wDesc, w, yDesc, y, hyDesc, hy, cyDesc, cy, workspace,
-                  workSpaceSizeInBytes, reserveSpace, reserveSpaceSizeInBytes);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnRNNBackwardData(cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc,
-                     const int seqLength, const cudnnTensorDescriptor_t *yDesc,
-                     const void *y, const cudnnTensorDescriptor_t *dyDesc,
-                     const void *dy, const cudnnTensorDescriptor_t dhyDesc,
-                     const void *dhy, const cudnnTensorDescriptor_t dcyDesc,
-                     const void *dcy, const cudnnFilterDescriptor_t wDesc,
-                     const void *w, const cudnnTensorDescriptor_t hxDesc,
-                     const void *hx, const cudnnTensorDescriptor_t cxDesc,
-                     const void *cx, const cudnnTensorDescriptor_t *dxDesc,
-                     void *dx, const cudnnTensorDescriptor_t dhxDesc, void *dhx,
-                     const cudnnTensorDescriptor_t dcxDesc, void *dcx,
-                     void *workspace, size_t workSpaceSizeInBytes,
-                     void *reserveSpace, size_t reserveSpaceSizeInBytes) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnRNNDescriptor_t, const int,
-      const cudnnTensorDescriptor_t *, const void *,
-      const cudnnTensorDescriptor_t *, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnFilterDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t *, void *, const cudnnTensorDescriptor_t,
-      void *, const cudnnTensorDescriptor_t, void *, void *, size_t, void *,
-      size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnRNNBackwardData");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, rnnDesc, seqLength, yDesc, y, dyDesc, dy, dhyDesc,
-                  dhy, dcyDesc, dcy, wDesc, w, hxDesc, hx, cxDesc, cx, dxDesc,
-                  dx, dhxDesc, dhx, dcxDesc, dcx, workspace,
-                  workSpaceSizeInBytes, reserveSpace, reserveSpaceSizeInBytes);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnRNNBackwardWeights(
-    cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc,
-    const int seqLength, const cudnnTensorDescriptor_t *xDesc, const void *x,
-    const cudnnTensorDescriptor_t hxDesc, const void *hx,
-    const cudnnTensorDescriptor_t *yDesc, const void *y, const void *workspace,
-    size_t workSpaceSizeInBytes, const cudnnFilterDescriptor_t dwDesc, void *dw,
-    const void *reserveSpace, size_t reserveSpaceSizeInBytes) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnRNNDescriptor_t, const int,
-      const cudnnTensorDescriptor_t *, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t *, const void *, const void *, size_t,
-      const cudnnFilterDescriptor_t, void *, const void *, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnRNNBackwardWeights");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, rnnDesc, seqLength, xDesc, x, hxDesc, hx, yDesc, y,
-                  workspace, workSpaceSizeInBytes, dwDesc, dw, reserveSpace,
-                  reserveSpaceSizeInBytes);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetRNNPaddingMode(
-    cudnnRNNDescriptor_t rnnDesc, cudnnRNNPaddingMode_t paddingMode) {
-  using FuncPtr =
-      cudnnStatus_t(CUDNNWINAPI *)(cudnnRNNDescriptor_t, cudnnRNNPaddingMode_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetRNNPaddingMode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(rnnDesc, paddingMode);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetRNNPaddingMode(
-    cudnnRNNDescriptor_t rnnDesc, cudnnRNNPaddingMode_t *paddingMode) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnRNNDescriptor_t,
-                                               cudnnRNNPaddingMode_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetRNNPaddingMode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(rnnDesc, paddingMode);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnCreateRNNDataDescriptor(cudnnRNNDataDescriptor_t *rnnDataDesc) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnRNNDataDescriptor_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnCreateRNNDataDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(rnnDataDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnDestroyRNNDataDescriptor(cudnnRNNDataDescriptor_t rnnDataDesc) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnRNNDataDescriptor_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDestroyRNNDataDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(rnnDataDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetRNNDataDescriptor(
-    cudnnRNNDataDescriptor_t rnnDataDesc, cudnnDataType_t dataType,
-    cudnnRNNDataLayout_t layout, int maxSeqLength, int batchSize,
-    int vectorSize,
-    const int seqLengthArray[], /* length of each sequence in the batch */
-    void *paddingFill) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnRNNDataDescriptor_t, cudnnDataType_t, cudnnRNNDataLayout_t, int, int,
-      int, const int[], void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetRNNDataDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(rnnDataDesc, dataType, layout, maxSeqLength, batchSize,
-                  vectorSize, seqLengthArray, paddingFill);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetRNNDataDescriptor(
-    cudnnRNNDataDescriptor_t rnnDataDesc, cudnnDataType_t *dataType,
-    cudnnRNNDataLayout_t *layout, int *maxSeqLength, int *batchSize,
-    int *vectorSize, int arrayLengthRequested, int seqLengthArray[],
-    void *paddingFill) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnRNNDataDescriptor_t, cudnnDataType_t *, cudnnRNNDataLayout_t *,
-      int *, int *, int *, int, int[], void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetRNNDataDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(rnnDataDesc, dataType, layout, maxSeqLength, batchSize,
-                  vectorSize, arrayLengthRequested, seqLengthArray,
-                  paddingFill);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnRNNForwardTrainingEx(
-    cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc,
-    const cudnnRNNDataDescriptor_t xDesc, const void *x,
-    const cudnnTensorDescriptor_t hxDesc, const void *hx,
-    const cudnnTensorDescriptor_t cxDesc, const void *cx,
-    const cudnnFilterDescriptor_t wDesc, const void *w,
-    const cudnnRNNDataDescriptor_t yDesc, void *y,
-    const cudnnTensorDescriptor_t hyDesc, void *hy,
-    const cudnnTensorDescriptor_t cyDesc, void *cy,
-    const cudnnRNNDataDescriptor_t kDesc, /* reserved, should pass NULL */
-    const void *keys,                     /* reserved, should pass NULL */
-    const cudnnRNNDataDescriptor_t cDesc, /* reserved, should pass NULL */
-    void *cAttn,                          /* reserved, should pass NULL */
-    const cudnnRNNDataDescriptor_t iDesc, /* reserved, should pass NULL */
-    void *iAttn,                          /* reserved, should pass NULL */
-    const cudnnRNNDataDescriptor_t qDesc, /* reserved, should pass NULL */
-    void *queries,                        /* reserved, should pass NULL */
-    void *workSpace, size_t workSpaceSizeInBytes, void *reserveSpace,
-    size_t reserveSpaceSizeInBytes) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnRNNDescriptor_t, const cudnnRNNDataDescriptor_t,
-      const void *, const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnFilterDescriptor_t, const void *,
-      const cudnnRNNDataDescriptor_t, void *, const cudnnTensorDescriptor_t,
-      void *, const cudnnTensorDescriptor_t, void *,
-      const cudnnRNNDataDescriptor_t, const void *,
-      const cudnnRNNDataDescriptor_t, void *, const cudnnRNNDataDescriptor_t,
-      void *, const cudnnRNNDataDescriptor_t, void *, void *, size_t, void *,
-      size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnRNNForwardTrainingEx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, rnnDesc, xDesc, x, hxDesc, hx, cxDesc, cx, wDesc, w,
-                  yDesc, y, hyDesc, hy, cyDesc, cy, kDesc, keys, cDesc, cAttn,
-                  iDesc, iAttn, qDesc, queries, workSpace, workSpaceSizeInBytes,
-                  reserveSpace, reserveSpaceSizeInBytes);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnRNNForwardInferenceEx(
-    cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc,
-    const cudnnRNNDataDescriptor_t xDesc, const void *x,
-    const cudnnTensorDescriptor_t hxDesc, const void *hx,
-    const cudnnTensorDescriptor_t cxDesc, const void *cx,
-    const cudnnFilterDescriptor_t wDesc, const void *w,
-    const cudnnRNNDataDescriptor_t yDesc, void *y,
-    const cudnnTensorDescriptor_t hyDesc, void *hy,
-    const cudnnTensorDescriptor_t cyDesc, void *cy,
-    const cudnnRNNDataDescriptor_t kDesc, /* reserved, should pass NULL */
-    const void *keys,                     /* reserved, should pass NULL */
-    const cudnnRNNDataDescriptor_t cDesc, /* reserved, should pass NULL */
-    void *cAttn,                          /* reserved, should pass NULL */
-    const cudnnRNNDataDescriptor_t iDesc, /* reserved, should pass NULL */
-    void *iAttn,                          /* reserved, should pass NULL */
-    const cudnnRNNDataDescriptor_t qDesc, /* reserved, should pass NULL */
-    void *queries,                        /* reserved, should pass NULL */
-    void *workSpace, size_t workSpaceSizeInBytes) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnRNNDescriptor_t, const cudnnRNNDataDescriptor_t,
-      const void *, const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnFilterDescriptor_t, const void *,
-      const cudnnRNNDataDescriptor_t, void *, const cudnnTensorDescriptor_t,
-      void *, const cudnnTensorDescriptor_t, void *,
-      const cudnnRNNDataDescriptor_t, const void *,
-      const cudnnRNNDataDescriptor_t, void *, const cudnnRNNDataDescriptor_t,
-      void *, const cudnnRNNDataDescriptor_t, void *, void *, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnRNNForwardInferenceEx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, rnnDesc, xDesc, x, hxDesc, hx, cxDesc, cx, wDesc, w,
-                  yDesc, y, hyDesc, hy, cyDesc, cy, kDesc, keys, cDesc, cAttn,
-                  iDesc, iAttn, qDesc, queries, workSpace,
-                  workSpaceSizeInBytes);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnRNNBackwardDataEx(
-    cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc,
-    const cudnnRNNDataDescriptor_t yDesc, const void *y,
-    const cudnnRNNDataDescriptor_t dyDesc, const void *dy,
-    const cudnnRNNDataDescriptor_t dcDesc, /* reserved, should pass NULL */
-    const void *dcAttn,                    /* reserved, should pass NULL */
-    const cudnnTensorDescriptor_t dhyDesc, const void *dhy,
-    const cudnnTensorDescriptor_t dcyDesc, const void *dcy,
-    const cudnnFilterDescriptor_t wDesc, const void *w,
-    const cudnnTensorDescriptor_t hxDesc, const void *hx,
-    const cudnnTensorDescriptor_t cxDesc, const void *cx,
-    const cudnnRNNDataDescriptor_t dxDesc, void *dx,
-    const cudnnTensorDescriptor_t dhxDesc, void *dhx,
-    const cudnnTensorDescriptor_t dcxDesc, void *dcx,
-    const cudnnRNNDataDescriptor_t dkDesc, /* reserved, should pass NULL */
-    void *dkeys,                           /* reserved, should pass NULL */
-    void *workSpace, size_t workSpaceSizeInBytes, void *reserveSpace,
-    size_t reserveSpaceSizeInBytes) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnRNNDescriptor_t, const cudnnRNNDataDescriptor_t,
-      const void *, const cudnnRNNDataDescriptor_t, const void *,
-      const cudnnRNNDataDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnFilterDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnRNNDataDescriptor_t, void *, const cudnnTensorDescriptor_t,
-      void *, const cudnnTensorDescriptor_t, void *,
-      const cudnnRNNDataDescriptor_t, void *, void *, size_t, void *, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnRNNBackwardDataEx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, rnnDesc, yDesc, y, dyDesc, dy, dcDesc, dcAttn,
-                  dhyDesc, dhy, dcyDesc, dcy, wDesc, w, hxDesc, hx, cxDesc, cx,
-                  dxDesc, dx, dhxDesc, dhx, dcxDesc, dcx, dkDesc, dkeys,
-                  workSpace, workSpaceSizeInBytes, reserveSpace,
-                  reserveSpaceSizeInBytes);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnRNNBackwardWeightsEx(
-    cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc,
-    const cudnnRNNDataDescriptor_t xDesc, const void *x,
-    const cudnnTensorDescriptor_t hxDesc, const void *hx,
-    const cudnnRNNDataDescriptor_t yDesc, const void *y, void *workSpace,
-    size_t workSpaceSizeInBytes, const cudnnFilterDescriptor_t dwDesc, void *dw,
-    void *reserveSpace, size_t reserveSpaceSizeInBytes) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnRNNDescriptor_t, const cudnnRNNDataDescriptor_t,
-      const void *, const cudnnTensorDescriptor_t, const void *,
-      const cudnnRNNDataDescriptor_t, const void *, void *, size_t,
-      const cudnnFilterDescriptor_t, void *, void *, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnRNNBackwardWeightsEx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, rnnDesc, xDesc, x, hxDesc, hx, yDesc, y, workSpace,
-                  workSpaceSizeInBytes, dwDesc, dw, reserveSpace,
-                  reserveSpaceSizeInBytes);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetRNNAlgorithmDescriptor(
-    cudnnHandle_t handle, cudnnRNNDescriptor_t rnnDesc,
-    cudnnAlgorithmDescriptor_t algoDesc) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, cudnnRNNDescriptor_t, cudnnAlgorithmDescriptor_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetRNNAlgorithmDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, rnnDesc, algoDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetRNNForwardInferenceAlgorithmMaxCount(
-    cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc, int *count) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnRNNDescriptor_t, int *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnGetRNNForwardInferenceAlgorithmMaxCount");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, rnnDesc, count);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnFindRNNForwardInferenceAlgorithmEx(
-    cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc,
-    const int seqLength, const cudnnTensorDescriptor_t *xDesc, const void *x,
-    const cudnnTensorDescriptor_t hxDesc, const void *hx,
-    const cudnnTensorDescriptor_t cxDesc, const void *cx,
-    const cudnnFilterDescriptor_t wDesc, const void *w,
-    const cudnnTensorDescriptor_t *yDesc, void *y,
-    const cudnnTensorDescriptor_t hyDesc, void *hy,
-    const cudnnTensorDescriptor_t cyDesc, void *cy, const float findIntensity,
-    const int requestedAlgoCount, int *returnedAlgoCount,
-    cudnnAlgorithmPerformance_t *perfResults, void *workspace,
-    size_t workSpaceSizeInBytes) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnRNNDescriptor_t, const int,
-      const cudnnTensorDescriptor_t *, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnFilterDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t *, void *, const cudnnTensorDescriptor_t,
-      void *, const cudnnTensorDescriptor_t, void *, const float, const int,
-      int *, cudnnAlgorithmPerformance_t *, void *, size_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnFindRNNForwardInferenceAlgorithmEx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, rnnDesc, seqLength, xDesc, x, hxDesc, hx, cxDesc, cx,
-                  wDesc, w, yDesc, y, hyDesc, hy, cyDesc, cy, findIntensity,
-                  requestedAlgoCount, returnedAlgoCount, perfResults, workspace,
-                  workSpaceSizeInBytes);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetRNNForwardTrainingAlgorithmMaxCount(
-    cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc, int *count) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnRNNDescriptor_t, int *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnGetRNNForwardTrainingAlgorithmMaxCount");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, rnnDesc, count);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnFindRNNForwardTrainingAlgorithmEx(
-    cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc,
-    const int seqLength, const cudnnTensorDescriptor_t *xDesc, const void *x,
-    const cudnnTensorDescriptor_t hxDesc, const void *hx,
-    const cudnnTensorDescriptor_t cxDesc, const void *cx,
-    const cudnnFilterDescriptor_t wDesc, const void *w,
-    const cudnnTensorDescriptor_t *yDesc, void *y,
-    const cudnnTensorDescriptor_t hyDesc, void *hy,
-    const cudnnTensorDescriptor_t cyDesc, void *cy, const float findIntensity,
-    const int requestedAlgoCount, int *returnedAlgoCount,
-    cudnnAlgorithmPerformance_t *perfResults, void *workspace,
-    size_t workSpaceSizeInBytes, void *reserveSpace,
-    size_t reserveSpaceSizeInBytes) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnRNNDescriptor_t, const int,
-      const cudnnTensorDescriptor_t *, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnFilterDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t *, void *, const cudnnTensorDescriptor_t,
-      void *, const cudnnTensorDescriptor_t, void *, const float, const int,
-      int *, cudnnAlgorithmPerformance_t *, void *, size_t, void *, size_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnFindRNNForwardTrainingAlgorithmEx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, rnnDesc, seqLength, xDesc, x, hxDesc, hx, cxDesc, cx,
-                  wDesc, w, yDesc, y, hyDesc, hy, cyDesc, cy, findIntensity,
-                  requestedAlgoCount, returnedAlgoCount, perfResults, workspace,
-                  workSpaceSizeInBytes, reserveSpace, reserveSpaceSizeInBytes);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetRNNBackwardDataAlgorithmMaxCount(
-    cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc, int *count) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnRNNDescriptor_t, int *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnGetRNNBackwardDataAlgorithmMaxCount");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, rnnDesc, count);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnFindRNNBackwardDataAlgorithmEx(
-    cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc,
-    const int seqLength, const cudnnTensorDescriptor_t *yDesc, const void *y,
-    const cudnnTensorDescriptor_t *dyDesc, const void *dy,
-    const cudnnTensorDescriptor_t dhyDesc, const void *dhy,
-    const cudnnTensorDescriptor_t dcyDesc, const void *dcy,
-    const cudnnFilterDescriptor_t wDesc, const void *w,
-    const cudnnTensorDescriptor_t hxDesc, const void *hx,
-    const cudnnTensorDescriptor_t cxDesc, const void *cx,
-    const cudnnTensorDescriptor_t *dxDesc, void *dx,
-    const cudnnTensorDescriptor_t dhxDesc, void *dhx,
-    const cudnnTensorDescriptor_t dcxDesc, void *dcx, const float findIntensity,
-    const int requestedAlgoCount, int *returnedAlgoCount,
-    cudnnAlgorithmPerformance_t *perfResults, void *workspace,
-    size_t workSpaceSizeInBytes, void *reserveSpace,
-    size_t reserveSpaceSizeInBytes) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnRNNDescriptor_t, const int,
-      const cudnnTensorDescriptor_t *, const void *,
-      const cudnnTensorDescriptor_t *, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnFilterDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t *, void *, const cudnnTensorDescriptor_t,
-      void *, const cudnnTensorDescriptor_t, void *, const float, const int,
-      int *, cudnnAlgorithmPerformance_t *, void *, size_t, void *, size_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnFindRNNBackwardDataAlgorithmEx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, rnnDesc, seqLength, yDesc, y, dyDesc, dy, dhyDesc,
-                  dhy, dcyDesc, dcy, wDesc, w, hxDesc, hx, cxDesc, cx, dxDesc,
-                  dx, dhxDesc, dhx, dcxDesc, dcx, findIntensity,
-                  requestedAlgoCount, returnedAlgoCount, perfResults, workspace,
-                  workSpaceSizeInBytes, reserveSpace, reserveSpaceSizeInBytes);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetRNNBackwardWeightsAlgorithmMaxCount(
-    cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc, int *count) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnRNNDescriptor_t, int *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnGetRNNBackwardWeightsAlgorithmMaxCount");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, rnnDesc, count);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnFindRNNBackwardWeightsAlgorithmEx(
-    cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc,
-    const int seqLength, const cudnnTensorDescriptor_t *xDesc, const void *x,
-    const cudnnTensorDescriptor_t hxDesc, const void *hx,
-    const cudnnTensorDescriptor_t *yDesc, const void *y,
-    const float findIntensity, const int requestedAlgoCount,
-    int *returnedAlgoCount, cudnnAlgorithmPerformance_t *perfResults,
-    const void *workspace, size_t workSpaceSizeInBytes,
-    const cudnnFilterDescriptor_t dwDesc, void *dw, const void *reserveSpace,
-    size_t reserveSpaceSizeInBytes) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnRNNDescriptor_t, const int,
-      const cudnnTensorDescriptor_t *, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t *, const void *, const float, const int,
-      int *, cudnnAlgorithmPerformance_t *, const void *, size_t,
-      const cudnnFilterDescriptor_t, void *, const void *, size_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnFindRNNBackwardWeightsAlgorithmEx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, rnnDesc, seqLength, xDesc, x, hxDesc, hx, yDesc, y,
-                  findIntensity, requestedAlgoCount, returnedAlgoCount,
-                  perfResults, workspace, workSpaceSizeInBytes, dwDesc, dw,
-                  reserveSpace, reserveSpaceSizeInBytes);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnCreateSeqDataDescriptor(cudnnSeqDataDescriptor_t *seqDataDesc) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnSeqDataDescriptor_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnCreateSeqDataDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(seqDataDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnDestroySeqDataDescriptor(cudnnSeqDataDescriptor_t seqDataDesc) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnSeqDataDescriptor_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDestroySeqDataDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(seqDataDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetSeqDataDescriptor(
-    cudnnSeqDataDescriptor_t seqDataDesc, cudnnDataType_t dataType, int nbDims,
-    const int dimA[], const cudnnSeqDataAxis_t axes[],
-    size_t seqLengthArraySize, const int seqLengthArray[], void *paddingFill) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnSeqDataDescriptor_t, cudnnDataType_t, int, const int[],
-      const cudnnSeqDataAxis_t[], size_t, const int[], void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetSeqDataDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(seqDataDesc, dataType, nbDims, dimA, axes, seqLengthArraySize,
-                  seqLengthArray, paddingFill);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetSeqDataDescriptor(
-    const cudnnSeqDataDescriptor_t seqDataDesc, cudnnDataType_t *dataType,
-    int *nbDims, int nbDimsRequested, int dimA[], cudnnSeqDataAxis_t axes[],
-    size_t *seqLengthArraySize, size_t seqLengthSizeRequested,
-    int seqLengthArray[], void *paddingFill) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      const cudnnSeqDataDescriptor_t, cudnnDataType_t *, int *, int, int[],
-      cudnnSeqDataAxis_t[], size_t *, size_t, int[], void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetSeqDataDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(seqDataDesc, dataType, nbDims, nbDimsRequested, dimA, axes,
-                  seqLengthArraySize, seqLengthSizeRequested, seqLengthArray,
-                  paddingFill);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnCreateAttnDescriptor(cudnnAttnDescriptor_t *attnDesc) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnAttnDescriptor_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnCreateAttnDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(attnDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnDestroyAttnDescriptor(cudnnAttnDescriptor_t attnDesc) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnAttnDescriptor_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDestroyAttnDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(attnDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetAttnDescriptor(
-    cudnnAttnDescriptor_t attnDesc, cudnnAttnQueryMap_t queryMap, int nHeads,
-    double smScaler, cudnnDataType_t dataType, cudnnDataType_t computePrec,
-    cudnnMathType_t mathType, cudnnDropoutDescriptor_t attnDropoutDesc,
-    cudnnDropoutDescriptor_t postDropoutDesc, int qSize, int kSize, int vSize,
-    int qProjSize, int kProjSize, int vProjSize, int oProjSize,
-    int qoMaxSeqLength, int kvMaxSeqLength, int maxBatchSize, int maxBeamSize) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnAttnDescriptor_t, cudnnAttnQueryMap_t, int, double, cudnnDataType_t,
-      cudnnDataType_t, cudnnMathType_t, cudnnDropoutDescriptor_t,
-      cudnnDropoutDescriptor_t, int, int, int, int, int, int, int, int, int,
-      int, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetAttnDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(attnDesc, queryMap, nHeads, smScaler, dataType, computePrec,
-                  mathType, attnDropoutDesc, postDropoutDesc, qSize, kSize,
-                  vSize, qProjSize, kProjSize, vProjSize, oProjSize,
-                  qoMaxSeqLength, kvMaxSeqLength, maxBatchSize, maxBeamSize);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetAttnDescriptor(
-    cudnnAttnDescriptor_t attnDesc, cudnnAttnQueryMap_t *queryMap, int *nHeads,
-    double *smScaler, cudnnDataType_t *dataType, cudnnDataType_t *computePrec,
-    cudnnMathType_t *mathType, cudnnDropoutDescriptor_t *attnDropoutDesc,
-    cudnnDropoutDescriptor_t *postDropoutDesc, int *qSize, int *kSize,
-    int *vSize, int *qProjSize, int *kProjSize, int *vProjSize, int *oProjSize,
-    int *qoMaxSeqLength, int *kvMaxSeqLength, int *maxBatchSize,
-    int *maxBeamSize) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnAttnDescriptor_t, cudnnAttnQueryMap_t *, int *, double *,
-      cudnnDataType_t *, cudnnDataType_t *, cudnnMathType_t *,
-      cudnnDropoutDescriptor_t *, cudnnDropoutDescriptor_t *, int *, int *,
-      int *, int *, int *, int *, int *, int *, int *, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetAttnDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(attnDesc, queryMap, nHeads, smScaler, dataType, computePrec,
-                  mathType, attnDropoutDesc, postDropoutDesc, qSize, kSize,
-                  vSize, qProjSize, kProjSize, vProjSize, oProjSize,
-                  qoMaxSeqLength, kvMaxSeqLength, maxBatchSize, maxBeamSize);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetMultiHeadAttnBuffers(
-    cudnnHandle_t handle, const cudnnAttnDescriptor_t attnDesc,
-    size_t *weightSizeInBytes, size_t *workSpaceSizeInBytes,
-    size_t *reserveSpaceSizeInBytes) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnAttnDescriptor_t, size_t *, size_t *, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetMultiHeadAttnBuffers");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, attnDesc, weightSizeInBytes, workSpaceSizeInBytes,
-                  reserveSpaceSizeInBytes);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetMultiHeadAttnWeights(
-    cudnnHandle_t handle, const cudnnAttnDescriptor_t attnDesc,
-    cudnnMultiHeadAttnWeightKind_t wKind, size_t weightSizeInBytes,
-    const void *w, cudnnTensorDescriptor_t wDesc, void **wAddr) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnAttnDescriptor_t,
-      cudnnMultiHeadAttnWeightKind_t, size_t, const void *,
-      cudnnTensorDescriptor_t, void **);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetMultiHeadAttnWeights");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, attnDesc, wKind, weightSizeInBytes, w, wDesc, wAddr);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnMultiHeadAttnForward(
-    cudnnHandle_t handle, const cudnnAttnDescriptor_t attnDesc, int currIdx,
-    const int *loWinIdx, const int *hiWinIdx, const int *seqLengthArrayQRO,
-    const int *seqLengthArrayKV, const cudnnSeqDataDescriptor_t qDesc,
-    const void *queries, const void *residuals,
-    const cudnnSeqDataDescriptor_t kDesc, const void *keys,
-    const cudnnSeqDataDescriptor_t vDesc, const void *values,
-    const cudnnSeqDataDescriptor_t oDesc, void *out, size_t weightSizeInBytes,
-    const void *w, size_t workSpaceSizeInBytes, void *workSpace,
-    size_t reserveSpaceSizeInBytes, void *reserveSpace) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnAttnDescriptor_t, int, const int *, const int *,
-      const int *, const int *, const cudnnSeqDataDescriptor_t, const void *,
-      const void *, const cudnnSeqDataDescriptor_t, const void *,
-      const cudnnSeqDataDescriptor_t, const void *,
-      const cudnnSeqDataDescriptor_t, void *, size_t, const void *, size_t,
-      void *, size_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnMultiHeadAttnForward");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, attnDesc, currIdx, loWinIdx, hiWinIdx,
-                  seqLengthArrayQRO, seqLengthArrayKV, qDesc, queries,
-                  residuals, kDesc, keys, vDesc, values, oDesc, out,
-                  weightSizeInBytes, w, workSpaceSizeInBytes, workSpace,
-                  reserveSpaceSizeInBytes, reserveSpace);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnMultiHeadAttnBackwardData(
-    cudnnHandle_t handle, const cudnnAttnDescriptor_t attnDesc,
-    const int *loWinIdx, const int *hiWinIdx, const int *seqLengthArrayDQDO,
-    const int *seqLengthArrayDKDV, const cudnnSeqDataDescriptor_t doDesc,
-    const void *dout, const cudnnSeqDataDescriptor_t dqDesc, void *dqueries,
-    const void *queries, const cudnnSeqDataDescriptor_t dkDesc, void *dkeys,
-    const void *keys, const cudnnSeqDataDescriptor_t dvDesc, void *dvalues,
-    const void *values, size_t weightSizeInBytes, const void *w,
-    size_t workSpaceSizeInBytes, void *workSpace,
-    size_t reserveSpaceSizeInBytes, void *reserveSpace) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnAttnDescriptor_t, const int *, const int *,
-      const int *, const int *, const cudnnSeqDataDescriptor_t, const void *,
-      const cudnnSeqDataDescriptor_t, void *, const void *,
-      const cudnnSeqDataDescriptor_t, void *, const void *,
-      const cudnnSeqDataDescriptor_t, void *, const void *, size_t,
-      const void *, size_t, void *, size_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnMultiHeadAttnBackwardData");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, attnDesc, loWinIdx, hiWinIdx, seqLengthArrayDQDO,
-                  seqLengthArrayDKDV, doDesc, dout, dqDesc, dqueries, queries,
-                  dkDesc, dkeys, keys, dvDesc, dvalues, values,
-                  weightSizeInBytes, w, workSpaceSizeInBytes, workSpace,
-                  reserveSpaceSizeInBytes, reserveSpace);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnMultiHeadAttnBackwardWeights(
-    cudnnHandle_t handle, const cudnnAttnDescriptor_t attnDesc,
-    cudnnWgradMode_t addGrad, const cudnnSeqDataDescriptor_t qDesc,
-    const void *queries, const cudnnSeqDataDescriptor_t kDesc, const void *keys,
-    const cudnnSeqDataDescriptor_t vDesc, const void *values,
-    const cudnnSeqDataDescriptor_t doDesc, const void *dout,
-    size_t weightSizeInBytes, const void *w, void *dw,
-    size_t workSpaceSizeInBytes, void *workSpace,
-    size_t reserveSpaceSizeInBytes, void *reserveSpace) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnAttnDescriptor_t, cudnnWgradMode_t,
-      const cudnnSeqDataDescriptor_t, const void *,
-      const cudnnSeqDataDescriptor_t, const void *,
-      const cudnnSeqDataDescriptor_t, const void *,
-      const cudnnSeqDataDescriptor_t, const void *, size_t, const void *,
-      void *, size_t, void *, size_t, void *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnMultiHeadAttnBackwardWeights");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, attnDesc, addGrad, qDesc, queries, kDesc, keys, vDesc,
-                  values, doDesc, dout, weightSizeInBytes, w, dw,
-                  workSpaceSizeInBytes, workSpace, reserveSpaceSizeInBytes,
-                  reserveSpace);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnCreateCTCLossDescriptor(cudnnCTCLossDescriptor_t *ctcLossDesc) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnCTCLossDescriptor_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnCreateCTCLossDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(ctcLossDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetCTCLossDescriptor(
-    cudnnCTCLossDescriptor_t ctcLossDesc, cudnnDataType_t compType) {
-  using FuncPtr =
-      cudnnStatus_t(CUDNNWINAPI *)(cudnnCTCLossDescriptor_t, cudnnDataType_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetCTCLossDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(ctcLossDesc, compType);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetCTCLossDescriptorEx(
-    cudnnCTCLossDescriptor_t ctcLossDesc, cudnnDataType_t compType,
-    cudnnLossNormalizationMode_t normMode, cudnnNanPropagation_t gradMode) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnCTCLossDescriptor_t, cudnnDataType_t, cudnnLossNormalizationMode_t,
-      cudnnNanPropagation_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetCTCLossDescriptorEx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(ctcLossDesc, compType, normMode, gradMode);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetCTCLossDescriptor(
-    cudnnCTCLossDescriptor_t ctcLossDesc, cudnnDataType_t *compType) {
-  using FuncPtr =
-      cudnnStatus_t(CUDNNWINAPI *)(cudnnCTCLossDescriptor_t, cudnnDataType_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetCTCLossDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(ctcLossDesc, compType);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetCTCLossDescriptorEx(
-    cudnnCTCLossDescriptor_t ctcLossDesc, cudnnDataType_t *compType,
-    cudnnLossNormalizationMode_t *normMode, cudnnNanPropagation_t *gradMode) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnCTCLossDescriptor_t, cudnnDataType_t *,
-      cudnnLossNormalizationMode_t *, cudnnNanPropagation_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetCTCLossDescriptorEx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(ctcLossDesc, compType, normMode, gradMode);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnDestroyCTCLossDescriptor(cudnnCTCLossDescriptor_t ctcLossDesc) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnCTCLossDescriptor_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDestroyCTCLossDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(ctcLossDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnCTCLoss(
-    cudnnHandle_t handle,
-    const cudnnTensorDescriptor_t
-        probsDesc, /* Tensor descriptor for probabilities, the dimensions are
-                      T,N,A (T is the timing steps, N is the
-                      mini batch size, A is the alphabet size)  */
-    const void *probs,       /* probabilities after softmax, in GPU memory */
-    const int *labels,       /* labels, in CPU memory */
-    const int *labelLengths, /* the length of each label, in CPU memory */
-    const int *inputLengths, /* the lengths of timing steps in each batch, in
-                                CPU memory */
-    void *costs,             /* the returned costs of CTC, in GPU memory */
-    const cudnnTensorDescriptor_t
-        gradientsDesc, /* Tensor descriptor for gradients, the dimensions are
-                          T,N,A */
-    const void *gradients,   /* the returned CTC gradients, in GPU memory, to
-                                compute costs only, set it to NULL */
-    cudnnCTCLossAlgo_t algo, /* algorithm selected, supported now 0 and 1 */
-    cudnnCTCLossDescriptor_t ctcLossDesc,
-    void *workspace, /* pointer to the workspace, in GPU memory */
-    size_t workSpaceSizeInBytes) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnTensorDescriptor_t, const void *, const int *,
-      const int *, const int *, void *, const cudnnTensorDescriptor_t,
-      const void *, cudnnCTCLossAlgo_t, cudnnCTCLossDescriptor_t, void *,
-      size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnCTCLoss");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, probsDesc, probs, labels, labelLengths, inputLengths,
-                  costs, gradientsDesc, gradients, algo, ctcLossDesc, workspace,
-                  workSpaceSizeInBytes);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetCTCLossWorkspaceSize(
-    cudnnHandle_t handle,
-    const cudnnTensorDescriptor_t
-        probsDesc, /* Tensor descriptor for probabilities, the dimensions are
-                      T,N,A (T is the
-                      timing steps, N is the mini batch size, A is the alphabet
-                      size) */
-    const cudnnTensorDescriptor_t
-        gradientsDesc,       /* Tensor descriptor for gradients, the
-                                dimensions are T,N,A. To compute costs
-                                only, set it to NULL */
-    const int *labels,       /* labels, in CPU memory */
-    const int *labelLengths, /* the length of each label, in CPU memory */
-    const int *inputLengths, /* the lengths of timing steps in each batch, in
-                                CPU memory */
-    cudnnCTCLossAlgo_t algo, /* algorithm selected, supported now 0 and 1 */
-    cudnnCTCLossDescriptor_t ctcLossDesc, size_t *sizeInBytes) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnTensorDescriptor_t,
-      const cudnnTensorDescriptor_t, const int *, const int *, const int *,
-      cudnnCTCLossAlgo_t, cudnnCTCLossDescriptor_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetCTCLossWorkspaceSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, probsDesc, gradientsDesc, labels, labelLengths,
-                  inputLengths, algo, ctcLossDesc, sizeInBytes);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnCreateAlgorithmDescriptor(cudnnAlgorithmDescriptor_t *algoDesc) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnAlgorithmDescriptor_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnCreateAlgorithmDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(algoDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetAlgorithmDescriptor(
-    cudnnAlgorithmDescriptor_t algoDesc, cudnnAlgorithm_t algorithm) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnAlgorithmDescriptor_t,
-                                               cudnnAlgorithm_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetAlgorithmDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(algoDesc, algorithm);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetAlgorithmDescriptor(
-    const cudnnAlgorithmDescriptor_t algoDesc, cudnnAlgorithm_t *algorithm) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(const cudnnAlgorithmDescriptor_t,
-                                               cudnnAlgorithm_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetAlgorithmDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(algoDesc, algorithm);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnCopyAlgorithmDescriptor(
-    const cudnnAlgorithmDescriptor_t src, cudnnAlgorithmDescriptor_t dest) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(const cudnnAlgorithmDescriptor_t,
-                                               cudnnAlgorithmDescriptor_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnCopyAlgorithmDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(src, dest);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnDestroyAlgorithmDescriptor(cudnnAlgorithmDescriptor_t algoDesc) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnAlgorithmDescriptor_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDestroyAlgorithmDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(algoDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnCreateAlgorithmPerformance(
-    cudnnAlgorithmPerformance_t *algoPerf, int numberToCreate) {
-  using FuncPtr =
-      cudnnStatus_t(CUDNNWINAPI *)(cudnnAlgorithmPerformance_t *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnCreateAlgorithmPerformance");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(algoPerf, numberToCreate);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetAlgorithmPerformance(
-    cudnnAlgorithmPerformance_t algoPerf, cudnnAlgorithmDescriptor_t algoDesc,
-    cudnnStatus_t status, float time, size_t memory) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnAlgorithmPerformance_t,
-                                               cudnnAlgorithmDescriptor_t,
-                                               cudnnStatus_t, float, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetAlgorithmPerformance");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(algoPerf, algoDesc, status, time, memory);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetAlgorithmPerformance(
-    const cudnnAlgorithmPerformance_t algoPerf,
-    cudnnAlgorithmDescriptor_t *algoDesc, cudnnStatus_t *status, float *time,
-    size_t *memory) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      const cudnnAlgorithmPerformance_t, cudnnAlgorithmDescriptor_t *,
-      cudnnStatus_t *, float *, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetAlgorithmPerformance");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(algoPerf, algoDesc, status, time, memory);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnDestroyAlgorithmPerformance(
-    cudnnAlgorithmPerformance_t *algoPerf, int numberToDestroy) {
-  using FuncPtr =
-      cudnnStatus_t(CUDNNWINAPI *)(cudnnAlgorithmPerformance_t *, int);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnDestroyAlgorithmPerformance");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(algoPerf, numberToDestroy);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetAlgorithmSpaceSize(
-    cudnnHandle_t handle, cudnnAlgorithmDescriptor_t algoDesc,
-    size_t *algoSpaceSizeInBytes) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, cudnnAlgorithmDescriptor_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetAlgorithmSpaceSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, algoDesc, algoSpaceSizeInBytes);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnSaveAlgorithm(cudnnHandle_t handle, cudnnAlgorithmDescriptor_t algoDesc,
-                   void *algoSpace, size_t algoSpaceSizeInBytes) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, cudnnAlgorithmDescriptor_t, void *, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSaveAlgorithm");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, algoDesc, algoSpace, algoSpaceSizeInBytes);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnRestoreAlgorithm(
-    cudnnHandle_t handle, void *algoSpace, size_t algoSpaceSizeInBytes,
-    cudnnAlgorithmDescriptor_t algoDesc) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnHandle_t, void *, size_t,
-                                               cudnnAlgorithmDescriptor_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnRestoreAlgorithm");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, algoSpace, algoSpaceSizeInBytes, algoDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetCallback(unsigned mask, void *udata,
-                                           cudnnCallback_t fptr) {
-  using FuncPtr =
-      cudnnStatus_t(CUDNNWINAPI *)(unsigned int, void *, cudnnCallback_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetCallback");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(mask, udata, fptr);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetCallback(unsigned *mask, void **udata,
-                                           cudnnCallback_t *fptr) {
-  using FuncPtr =
-      cudnnStatus_t(CUDNNWINAPI *)(unsigned int *, void **, cudnnCallback_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetCallback");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(mask, udata, fptr);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnCreateFusedOpsConstParamPack(
-    cudnnFusedOpsConstParamPack_t *constPack, cudnnFusedOps_t ops) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnFusedOpsConstParamPack_t *,
-                                               cudnnFusedOps_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnCreateFusedOpsConstParamPack");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(constPack, ops);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnDestroyFusedOpsConstParamPack(cudnnFusedOpsConstParamPack_t constPack) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnFusedOpsConstParamPack_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnDestroyFusedOpsConstParamPack");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(constPack);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetFusedOpsConstParamPackAttribute(
-    cudnnFusedOpsConstParamPack_t constPack,
-    cudnnFusedOpsConstParamLabel_t paramLabel, const void *param) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnFusedOpsConstParamPack_t,
-                                               cudnnFusedOpsConstParamLabel_t,
-                                               const void *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnSetFusedOpsConstParamPackAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(constPack, paramLabel, param);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetFusedOpsConstParamPackAttribute(
-    const cudnnFusedOpsConstParamPack_t constPack,
-    cudnnFusedOpsConstParamLabel_t paramLabel, void *param, int *isNULL) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      const cudnnFusedOpsConstParamPack_t, cudnnFusedOpsConstParamLabel_t,
-      void *, int *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnGetFusedOpsConstParamPackAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(constPack, paramLabel, param, isNULL);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnCreateFusedOpsVariantParamPack(
-    cudnnFusedOpsVariantParamPack_t *varPack, cudnnFusedOps_t ops) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnFusedOpsVariantParamPack_t *, cudnnFusedOps_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnCreateFusedOpsVariantParamPack");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(varPack, ops);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnDestroyFusedOpsVariantParamPack(cudnnFusedOpsVariantParamPack_t varPack) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnFusedOpsVariantParamPack_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnDestroyFusedOpsVariantParamPack");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(varPack);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetFusedOpsVariantParamPackAttribute(
-    cudnnFusedOpsVariantParamPack_t varPack,
-    cudnnFusedOpsVariantParamLabel_t paramLabel, void *ptr) {
-  using FuncPtr =
-      cudnnStatus_t(CUDNNWINAPI *)(cudnnFusedOpsVariantParamPack_t,
-                                   cudnnFusedOpsVariantParamLabel_t, void *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnSetFusedOpsVariantParamPackAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(varPack, paramLabel, ptr);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetFusedOpsVariantParamPackAttribute(
-    const cudnnFusedOpsVariantParamPack_t varPack,
-    cudnnFusedOpsVariantParamLabel_t paramLabel, void *ptr) {
-  using FuncPtr =
-      cudnnStatus_t(CUDNNWINAPI *)(const cudnnFusedOpsVariantParamPack_t,
-                                   cudnnFusedOpsVariantParamLabel_t, void *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnGetFusedOpsVariantParamPackAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(varPack, paramLabel, ptr);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnCreateFusedOpsPlan(cudnnFusedOpsPlan_t *plan,
-                                                  cudnnFusedOps_t ops) {
-  using FuncPtr =
-      cudnnStatus_t(CUDNNWINAPI *)(cudnnFusedOpsPlan_t *, cudnnFusedOps_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnCreateFusedOpsPlan");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(plan, ops);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnDestroyFusedOpsPlan(cudnnFusedOpsPlan_t plan) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnFusedOpsPlan_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDestroyFusedOpsPlan");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(plan);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnMakeFusedOpsPlan(cudnnHandle_t handle, cudnnFusedOpsPlan_t plan,
-                      const cudnnFusedOpsConstParamPack_t constPack,
-                      size_t *workspaceSizeInBytes) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, cudnnFusedOpsPlan_t, const cudnnFusedOpsConstParamPack_t,
-      size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnMakeFusedOpsPlan");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, plan, constPack, workspaceSizeInBytes);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnFusedOpsExecute(cudnnHandle_t handle, const cudnnFusedOpsPlan_t plan,
-                     cudnnFusedOpsVariantParamPack_t varPack) {
-  using FuncPtr =
-      cudnnStatus_t(CUDNNWINAPI *)(cudnnHandle_t, const cudnnFusedOpsPlan_t,
-                                   cudnnFusedOpsVariantParamPack_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnFusedOpsExecute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, plan, varPack);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetRNNDescriptor_v6(
-    cudnnHandle_t handle, cudnnRNNDescriptor_t rnnDesc, const int hiddenSize,
-    const int numLayers, cudnnDropoutDescriptor_t dropoutDesc,
-    cudnnRNNInputMode_t inputMode, cudnnDirectionMode_t direction,
-    cudnnRNNMode_t mode, cudnnRNNAlgo_t algo, cudnnDataType_t mathPrec) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, cudnnRNNDescriptor_t, const int, const int,
-      cudnnDropoutDescriptor_t, cudnnRNNInputMode_t, cudnnDirectionMode_t,
-      cudnnRNNMode_t, cudnnRNNAlgo_t, cudnnDataType_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetRNNDescriptor_v6");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, rnnDesc, hiddenSize, numLayers, dropoutDesc,
-                  inputMode, direction, mode, algo, mathPrec);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetRNNDescriptor_v5(
-    cudnnRNNDescriptor_t rnnDesc, int hiddenSize, int numLayers,
-    cudnnDropoutDescriptor_t dropoutDesc, cudnnRNNInputMode_t inputMode,
-    cudnnDirectionMode_t direction, cudnnRNNMode_t mode,
-    cudnnDataType_t mathPrec) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnRNNDescriptor_t, int, int, cudnnDropoutDescriptor_t,
-      cudnnRNNInputMode_t, cudnnDirectionMode_t, cudnnRNNMode_t,
-      cudnnDataType_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetRNNDescriptor_v5");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(rnnDesc, hiddenSize, numLayers, dropoutDesc, inputMode,
-                  direction, mode, mathPrec);
-}
-
-}  // extern "C"
diff --git a/third_party/xla/third_party/tsl/tsl/cuda/cudnn_8_0.inc b/third_party/xla/third_party/tsl/tsl/cuda/cudnn_8_0.inc
deleted file mode 100644
index d9bf35184e4e41..00000000000000
--- a/third_party/xla/third_party/tsl/tsl/cuda/cudnn_8_0.inc
+++ /dev/null
@@ -1,3213 +0,0 @@
-// Auto-generated, do not edit.
-
-extern "C" {
-size_t CUDNNWINAPI cudnnGetVersion(void) {
-  using FuncPtr = size_t(CUDNNWINAPI *)();
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetVersion");
-  if (!func_ptr) return 0;
-  return func_ptr();
-}
-
-size_t CUDNNWINAPI cudnnGetCudartVersion(void) {
-  using FuncPtr = size_t(CUDNNWINAPI *)();
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetCudartVersion");
-  if (!func_ptr) return 0;
-  return func_ptr();
-}
-
-const char *CUDNNWINAPI cudnnGetErrorString(cudnnStatus_t status) {
-  using FuncPtr = const char *(CUDNNWINAPI *)(cudnnStatus_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetErrorString");
-  if (!func_ptr) return "cudnnGetErrorString symbol not found.";
-  return func_ptr(status);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnQueryRuntimeError(cudnnHandle_t handle,
-                                                 cudnnStatus_t *rstatus,
-                                                 cudnnErrQueryMode_t mode,
-                                                 cudnnRuntimeTag_t *tag) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, cudnnStatus_t *, cudnnErrQueryMode_t, cudnnRuntimeTag_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnQueryRuntimeError");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, rstatus, mode, tag);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetProperty(libraryPropertyType type,
-                                           int *value) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(libraryPropertyType, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetProperty");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(type, value);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnCreate(cudnnHandle_t *handle) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnHandle_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnCreate");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnDestroy(cudnnHandle_t handle) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnHandle_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDestroy");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle);
-}
-
-
-#if CUDNN_MAJOR>=8 && (CUDNN_MINOR > 0 || CUDNN_PATCHLEVEL >= 4)
-cudnnStatus_t CUDNNWINAPI cudnnCnnInferVersionCheck(void) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)();
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnCnnInferVersionCheck");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr();
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnCnnTrainVersionCheck(void) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)();
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnCnnTrainVersionCheck");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr();
-}
-#endif
-
-cudnnStatus_t CUDNNWINAPI cudnnSetStream(cudnnHandle_t handle,
-                                         cudaStream_t streamId) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnHandle_t, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetStream");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, streamId);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetStream(cudnnHandle_t handle,
-                                         cudaStream_t *streamId) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnHandle_t, cudaStream_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetStream");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, streamId);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnCreateTensorDescriptor(cudnnTensorDescriptor_t *tensorDesc) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnTensorDescriptor_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnCreateTensorDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(tensorDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetTensor4dDescriptor(
-    cudnnTensorDescriptor_t tensorDesc, cudnnTensorFormat_t format,
-    cudnnDataType_t dataType, /* image data type */
-    int n,                    /* number of inputs (batch size) */
-    int c,                    /* number of input feature maps */
-    int h,                    /* height of input section */
-    int w) {
-  using FuncPtr =
-      cudnnStatus_t(CUDNNWINAPI *)(cudnnTensorDescriptor_t, cudnnTensorFormat_t,
-                                   cudnnDataType_t, int, int, int, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetTensor4dDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(tensorDesc, format, dataType, n, c, h, w);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetTensor4dDescriptorEx(
-    cudnnTensorDescriptor_t tensorDesc,
-    cudnnDataType_t dataType, /* image data type */
-    int n,                    /* number of inputs (batch size) */
-    int c,                    /* number of input feature maps */
-    int h,                    /* height of input section */
-    int w,                    /* width of input section */
-    int nStride, int cStride, int hStride, int wStride) {
-  using FuncPtr =
-      cudnnStatus_t(CUDNNWINAPI *)(cudnnTensorDescriptor_t, cudnnDataType_t,
-                                   int, int, int, int, int, int, int, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetTensor4dDescriptorEx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(tensorDesc, dataType, n, c, h, w, nStride, cStride, hStride,
-                  wStride);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetTensor4dDescriptor(
-    const cudnnTensorDescriptor_t tensorDesc,
-    cudnnDataType_t *dataType, /* image data type */
-    int *n,                    /* number of inputs (batch size) */
-    int *c,                    /* number of input feature maps  */
-    int *h,                    /* height of input section */
-    int *w,                    /* width of input section */
-    int *nStride, int *cStride, int *hStride, int *wStride) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      const cudnnTensorDescriptor_t, cudnnDataType_t *, int *, int *, int *,
-      int *, int *, int *, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetTensor4dDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(tensorDesc, dataType, n, c, h, w, nStride, cStride, hStride,
-                  wStride);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetTensorNdDescriptor(
-    cudnnTensorDescriptor_t tensorDesc, cudnnDataType_t dataType, int nbDims,
-    const int dimA[], const int strideA[]) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnTensorDescriptor_t, cudnnDataType_t, int, const int[], const int[]);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetTensorNdDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(tensorDesc, dataType, nbDims, dimA, strideA);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetTensorNdDescriptorEx(
-    cudnnTensorDescriptor_t tensorDesc, cudnnTensorFormat_t format,
-    cudnnDataType_t dataType, int nbDims, const int dimA[]) {
-  using FuncPtr =
-      cudnnStatus_t(CUDNNWINAPI *)(cudnnTensorDescriptor_t, cudnnTensorFormat_t,
-                                   cudnnDataType_t, int, const int[]);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetTensorNdDescriptorEx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(tensorDesc, format, dataType, nbDims, dimA);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetTensorNdDescriptor(
-    const cudnnTensorDescriptor_t tensorDesc, int nbDimsRequested,
-    cudnnDataType_t *dataType, int *nbDims, int dimA[], int strideA[]) {
-  using FuncPtr =
-      cudnnStatus_t(CUDNNWINAPI *)(const cudnnTensorDescriptor_t, int,
-                                   cudnnDataType_t *, int *, int[], int[]);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetTensorNdDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(tensorDesc, nbDimsRequested, dataType, nbDims, dimA, strideA);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetTensorSizeInBytes(
-    const cudnnTensorDescriptor_t tensorDesc, size_t *size) {
-  using FuncPtr =
-      cudnnStatus_t(CUDNNWINAPI *)(const cudnnTensorDescriptor_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetTensorSizeInBytes");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(tensorDesc, size);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnDestroyTensorDescriptor(cudnnTensorDescriptor_t tensorDesc) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnTensorDescriptor_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDestroyTensorDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(tensorDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnInitTransformDest(
-    const cudnnTensorTransformDescriptor_t transformDesc,
-    const cudnnTensorDescriptor_t srcDesc, cudnnTensorDescriptor_t destDesc,
-    size_t *destSizeInBytes) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      const cudnnTensorTransformDescriptor_t, const cudnnTensorDescriptor_t,
-      cudnnTensorDescriptor_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnInitTransformDest");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(transformDesc, srcDesc, destDesc, destSizeInBytes);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnCreateTensorTransformDescriptor(
-    cudnnTensorTransformDescriptor_t *transformDesc) {
-  using FuncPtr =
-      cudnnStatus_t(CUDNNWINAPI *)(cudnnTensorTransformDescriptor_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnCreateTensorTransformDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(transformDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetTensorTransformDescriptor(
-    cudnnTensorTransformDescriptor_t transformDesc, const uint32_t nbDims,
-    const cudnnTensorFormat_t destFormat, const int32_t padBeforeA[],
-    const int32_t padAfterA[], const uint32_t foldA[],
-    const cudnnFoldingDirection_t direction) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnTensorTransformDescriptor_t, const uint32_t,
-      const cudnnTensorFormat_t, const int32_t[], const int32_t[],
-      const uint32_t[], const cudnnFoldingDirection_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnSetTensorTransformDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(transformDesc, nbDims, destFormat, padBeforeA, padAfterA,
-                  foldA, direction);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetTensorTransformDescriptor(
-    cudnnTensorTransformDescriptor_t transformDesc, uint32_t nbDimsRequested,
-    cudnnTensorFormat_t *destFormat, int32_t padBeforeA[], int32_t padAfterA[],
-    uint32_t foldA[], cudnnFoldingDirection_t *direction) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnTensorTransformDescriptor_t, uint32_t, cudnnTensorFormat_t *,
-      int32_t[], int32_t[], uint32_t[], cudnnFoldingDirection_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnGetTensorTransformDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(transformDesc, nbDimsRequested, destFormat, padBeforeA,
-                  padAfterA, foldA, direction);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnDestroyTensorTransformDescriptor(
-    cudnnTensorTransformDescriptor_t transformDesc) {
-  using FuncPtr =
-      cudnnStatus_t(CUDNNWINAPI *)(cudnnTensorTransformDescriptor_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnDestroyTensorTransformDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(transformDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnTransformTensor(
-    cudnnHandle_t handle, const void *alpha,
-    const cudnnTensorDescriptor_t xDesc, const void *x, const void *beta,
-    const cudnnTensorDescriptor_t yDesc, void *y) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const void *, const cudnnTensorDescriptor_t, const void *,
-      const void *, const cudnnTensorDescriptor_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnTransformTensor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, alpha, xDesc, x, beta, yDesc, y);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnTransformTensorEx(
-    cudnnHandle_t handle, const cudnnTensorTransformDescriptor_t transDesc,
-    const void *alpha, const cudnnTensorDescriptor_t srcDesc,
-    const void *srcData, const void *beta,
-    const cudnnTensorDescriptor_t destDesc, void *destData) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnTensorTransformDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *, const void *,
-      const cudnnTensorDescriptor_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnTransformTensorEx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transDesc, alpha, srcDesc, srcData, beta, destDesc,
-                  destData);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnAddTensor(cudnnHandle_t handle,
-                                         const void *alpha,
-                                         const cudnnTensorDescriptor_t aDesc,
-                                         const void *A, const void *beta,
-                                         const cudnnTensorDescriptor_t cDesc,
-                                         void *C) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const void *, const cudnnTensorDescriptor_t, const void *,
-      const void *, const cudnnTensorDescriptor_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnAddTensor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, alpha, aDesc, A, beta, cDesc, C);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnCreateOpTensorDescriptor(cudnnOpTensorDescriptor_t *opTensorDesc) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnOpTensorDescriptor_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnCreateOpTensorDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(opTensorDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetOpTensorDescriptor(
-    cudnnOpTensorDescriptor_t opTensorDesc, cudnnOpTensorOp_t opTensorOp,
-    cudnnDataType_t opTensorCompType, cudnnNanPropagation_t opTensorNanOpt) {
-  using FuncPtr =
-      cudnnStatus_t(CUDNNWINAPI *)(cudnnOpTensorDescriptor_t, cudnnOpTensorOp_t,
-                                   cudnnDataType_t, cudnnNanPropagation_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetOpTensorDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(opTensorDesc, opTensorOp, opTensorCompType, opTensorNanOpt);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetOpTensorDescriptor(
-    const cudnnOpTensorDescriptor_t opTensorDesc, cudnnOpTensorOp_t *opTensorOp,
-    cudnnDataType_t *opTensorCompType, cudnnNanPropagation_t *opTensorNanOpt) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      const cudnnOpTensorDescriptor_t, cudnnOpTensorOp_t *, cudnnDataType_t *,
-      cudnnNanPropagation_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetOpTensorDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(opTensorDesc, opTensorOp, opTensorCompType, opTensorNanOpt);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnDestroyOpTensorDescriptor(cudnnOpTensorDescriptor_t opTensorDesc) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnOpTensorDescriptor_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDestroyOpTensorDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(opTensorDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnOpTensor(
-    cudnnHandle_t handle, const cudnnOpTensorDescriptor_t opTensorDesc,
-    const void *alpha1, const cudnnTensorDescriptor_t aDesc, const void *A,
-    const void *alpha2, const cudnnTensorDescriptor_t bDesc, const void *B,
-    const void *beta, const cudnnTensorDescriptor_t cDesc, void *C) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnOpTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *, const void *,
-      const cudnnTensorDescriptor_t, const void *, const void *,
-      const cudnnTensorDescriptor_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnOpTensor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, opTensorDesc, alpha1, aDesc, A, alpha2, bDesc, B,
-                  beta, cDesc, C);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnCreateReduceTensorDescriptor(
-    cudnnReduceTensorDescriptor_t *reduceTensorDesc) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnReduceTensorDescriptor_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnCreateReduceTensorDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(reduceTensorDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetReduceTensorDescriptor(
-    cudnnReduceTensorDescriptor_t reduceTensorDesc,
-    cudnnReduceTensorOp_t reduceTensorOp, cudnnDataType_t reduceTensorCompType,
-    cudnnNanPropagation_t reduceTensorNanOpt,
-    cudnnReduceTensorIndices_t reduceTensorIndices,
-    cudnnIndicesType_t reduceTensorIndicesType) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnReduceTensorDescriptor_t, cudnnReduceTensorOp_t, cudnnDataType_t,
-      cudnnNanPropagation_t, cudnnReduceTensorIndices_t, cudnnIndicesType_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetReduceTensorDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(reduceTensorDesc, reduceTensorOp, reduceTensorCompType,
-                  reduceTensorNanOpt, reduceTensorIndices,
-                  reduceTensorIndicesType);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetReduceTensorDescriptor(
-    const cudnnReduceTensorDescriptor_t reduceTensorDesc,
-    cudnnReduceTensorOp_t *reduceTensorOp,
-    cudnnDataType_t *reduceTensorCompType,
-    cudnnNanPropagation_t *reduceTensorNanOpt,
-    cudnnReduceTensorIndices_t *reduceTensorIndices,
-    cudnnIndicesType_t *reduceTensorIndicesType) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      const cudnnReduceTensorDescriptor_t, cudnnReduceTensorOp_t *,
-      cudnnDataType_t *, cudnnNanPropagation_t *, cudnnReduceTensorIndices_t *,
-      cudnnIndicesType_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetReduceTensorDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(reduceTensorDesc, reduceTensorOp, reduceTensorCompType,
-                  reduceTensorNanOpt, reduceTensorIndices,
-                  reduceTensorIndicesType);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnDestroyReduceTensorDescriptor(
-    cudnnReduceTensorDescriptor_t reduceTensorDesc) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnReduceTensorDescriptor_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnDestroyReduceTensorDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(reduceTensorDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetReductionIndicesSize(
-    cudnnHandle_t handle, const cudnnReduceTensorDescriptor_t reduceTensorDesc,
-    const cudnnTensorDescriptor_t aDesc, const cudnnTensorDescriptor_t cDesc,
-    size_t *sizeInBytes) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnReduceTensorDescriptor_t,
-      const cudnnTensorDescriptor_t, const cudnnTensorDescriptor_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetReductionIndicesSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, reduceTensorDesc, aDesc, cDesc, sizeInBytes);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetReductionWorkspaceSize(
-    cudnnHandle_t handle, const cudnnReduceTensorDescriptor_t reduceTensorDesc,
-    const cudnnTensorDescriptor_t aDesc, const cudnnTensorDescriptor_t cDesc,
-    size_t *sizeInBytes) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnReduceTensorDescriptor_t,
-      const cudnnTensorDescriptor_t, const cudnnTensorDescriptor_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetReductionWorkspaceSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, reduceTensorDesc, aDesc, cDesc, sizeInBytes);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnReduceTensor(
-    cudnnHandle_t handle, const cudnnReduceTensorDescriptor_t reduceTensorDesc,
-    void *indices, size_t indicesSizeInBytes, void *workspace,
-    size_t workspaceSizeInBytes, const void *alpha,
-    const cudnnTensorDescriptor_t aDesc, const void *A, const void *beta,
-    const cudnnTensorDescriptor_t cDesc, void *C) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnReduceTensorDescriptor_t, void *, size_t,
-      void *, size_t, const void *, const cudnnTensorDescriptor_t, const void *,
-      const void *, const cudnnTensorDescriptor_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnReduceTensor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, reduceTensorDesc, indices, indicesSizeInBytes,
-                  workspace, workspaceSizeInBytes, alpha, aDesc, A, beta, cDesc,
-                  C);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetTensor(cudnnHandle_t handle,
-                                         const cudnnTensorDescriptor_t yDesc,
-                                         void *y, const void *valuePtr) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnTensorDescriptor_t, void *, const void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetTensor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, yDesc, y, valuePtr);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnScaleTensor(cudnnHandle_t handle,
-                                           const cudnnTensorDescriptor_t yDesc,
-                                           void *y, const void *alpha) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnTensorDescriptor_t, void *, const void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnScaleTensor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, yDesc, y, alpha);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnCreateFilterDescriptor(cudnnFilterDescriptor_t *filterDesc) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnFilterDescriptor_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnCreateFilterDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(filterDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetFilter4dDescriptor(
-    cudnnFilterDescriptor_t filterDesc,
-    cudnnDataType_t dataType,          /* image data type */
-    cudnnTensorFormat_t format, int k, /* number of output feature maps */
-    int c,                             /* number of input feature maps */
-    int h,                             /* height of each input filter */
-    int w) {
-  using FuncPtr =
-      cudnnStatus_t(CUDNNWINAPI *)(cudnnFilterDescriptor_t, cudnnDataType_t,
-                                   cudnnTensorFormat_t, int, int, int, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetFilter4dDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(filterDesc, dataType, format, k, c, h, w);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetFilter4dDescriptor(
-    const cudnnFilterDescriptor_t filterDesc,
-    cudnnDataType_t *dataType,           /* image data type */
-    cudnnTensorFormat_t *format, int *k, /* number of output feature maps */
-    int *c,                              /* number of input feature maps */
-    int *h,                              /* height of each input filter */
-    int *w) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      const cudnnFilterDescriptor_t, cudnnDataType_t *, cudnnTensorFormat_t *,
-      int *, int *, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetFilter4dDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(filterDesc, dataType, format, k, c, h, w);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetFilterNdDescriptor(
-    cudnnFilterDescriptor_t filterDesc,
-    cudnnDataType_t dataType, /* image data type */
-    cudnnTensorFormat_t format, int nbDims, const int filterDimA[]) {
-  using FuncPtr =
-      cudnnStatus_t(CUDNNWINAPI *)(cudnnFilterDescriptor_t, cudnnDataType_t,
-                                   cudnnTensorFormat_t, int, const int[]);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetFilterNdDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(filterDesc, dataType, format, nbDims, filterDimA);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetFilterNdDescriptor(
-    const cudnnFilterDescriptor_t filterDesc, int nbDimsRequested,
-    cudnnDataType_t *dataType, /* image data type */
-    cudnnTensorFormat_t *format, int *nbDims, int filterDimA[]) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      const cudnnFilterDescriptor_t, int, cudnnDataType_t *,
-      cudnnTensorFormat_t *, int *, int[]);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetFilterNdDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(filterDesc, nbDimsRequested, dataType, format, nbDims,
-                  filterDimA);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetFilterSizeInBytes(
-    const cudnnFilterDescriptor_t filterDesc, size_t *size) {
-  using FuncPtr =
-      cudnnStatus_t(CUDNNWINAPI *)(const cudnnFilterDescriptor_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetFilterSizeInBytes");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(filterDesc, size);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnTransformFilter(
-    cudnnHandle_t handle, const cudnnTensorTransformDescriptor_t transDesc,
-    const void *alpha, const cudnnFilterDescriptor_t srcDesc,
-    const void *srcData, const void *beta,
-    const cudnnFilterDescriptor_t destDesc, void *destData) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnTensorTransformDescriptor_t, const void *,
-      const cudnnFilterDescriptor_t, const void *, const void *,
-      const cudnnFilterDescriptor_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnTransformFilter");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transDesc, alpha, srcDesc, srcData, beta, destDesc,
-                  destData);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnDestroyFilterDescriptor(cudnnFilterDescriptor_t filterDesc) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnFilterDescriptor_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDestroyFilterDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(filterDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSoftmaxForward(
-    cudnnHandle_t handle, cudnnSoftmaxAlgorithm_t algo, cudnnSoftmaxMode_t mode,
-    const void *alpha, const cudnnTensorDescriptor_t xDesc, const void *x,
-    const void *beta, const cudnnTensorDescriptor_t yDesc, void *y) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, cudnnSoftmaxAlgorithm_t, cudnnSoftmaxMode_t, const void *,
-      const cudnnTensorDescriptor_t, const void *, const void *,
-      const cudnnTensorDescriptor_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSoftmaxForward");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, algo, mode, alpha, xDesc, x, beta, yDesc, y);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnCreatePoolingDescriptor(cudnnPoolingDescriptor_t *poolingDesc) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnPoolingDescriptor_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnCreatePoolingDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(poolingDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetPooling2dDescriptor(
-    cudnnPoolingDescriptor_t poolingDesc, cudnnPoolingMode_t mode,
-    cudnnNanPropagation_t maxpoolingNanOpt, int windowHeight, int windowWidth,
-    int verticalPadding, int horizontalPadding, int verticalStride,
-    int horizontalStride) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnPoolingDescriptor_t, cudnnPoolingMode_t, cudnnNanPropagation_t, int,
-      int, int, int, int, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetPooling2dDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(poolingDesc, mode, maxpoolingNanOpt, windowHeight,
-                  windowWidth, verticalPadding, horizontalPadding,
-                  verticalStride, horizontalStride);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetPooling2dDescriptor(
-    const cudnnPoolingDescriptor_t poolingDesc, cudnnPoolingMode_t *mode,
-    cudnnNanPropagation_t *maxpoolingNanOpt, int *windowHeight,
-    int *windowWidth, int *verticalPadding, int *horizontalPadding,
-    int *verticalStride, int *horizontalStride) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      const cudnnPoolingDescriptor_t, cudnnPoolingMode_t *,
-      cudnnNanPropagation_t *, int *, int *, int *, int *, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetPooling2dDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(poolingDesc, mode, maxpoolingNanOpt, windowHeight,
-                  windowWidth, verticalPadding, horizontalPadding,
-                  verticalStride, horizontalStride);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetPoolingNdDescriptor(
-    cudnnPoolingDescriptor_t poolingDesc, const cudnnPoolingMode_t mode,
-    const cudnnNanPropagation_t maxpoolingNanOpt, int nbDims,
-    const int windowDimA[], const int paddingA[], const int strideA[]) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnPoolingDescriptor_t, const cudnnPoolingMode_t,
-      const cudnnNanPropagation_t, int, const int[], const int[], const int[]);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetPoolingNdDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(poolingDesc, mode, maxpoolingNanOpt, nbDims, windowDimA,
-                  paddingA, strideA);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetPoolingNdDescriptor(
-    const cudnnPoolingDescriptor_t poolingDesc, int nbDimsRequested,
-    cudnnPoolingMode_t *mode, cudnnNanPropagation_t *maxpoolingNanOpt,
-    int *nbDims, int windowDimA[], int paddingA[], int strideA[]) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      const cudnnPoolingDescriptor_t, int, cudnnPoolingMode_t *,
-      cudnnNanPropagation_t *, int *, int[], int[], int[]);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetPoolingNdDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(poolingDesc, nbDimsRequested, mode, maxpoolingNanOpt, nbDims,
-                  windowDimA, paddingA, strideA);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnGetPoolingNdForwardOutputDim(const cudnnPoolingDescriptor_t poolingDesc,
-                                  const cudnnTensorDescriptor_t inputTensorDesc,
-                                  int nbDims, int outputTensorDimA[]) {
-  using FuncPtr =
-      cudnnStatus_t(CUDNNWINAPI *)(const cudnnPoolingDescriptor_t,
-                                   const cudnnTensorDescriptor_t, int, int[]);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnGetPoolingNdForwardOutputDim");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(poolingDesc, inputTensorDesc, nbDims, outputTensorDimA);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnGetPooling2dForwardOutputDim(const cudnnPoolingDescriptor_t poolingDesc,
-                                  const cudnnTensorDescriptor_t inputTensorDesc,
-                                  int *n, int *c, int *h, int *w) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(const cudnnPoolingDescriptor_t,
-                                               const cudnnTensorDescriptor_t,
-                                               int *, int *, int *, int *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnGetPooling2dForwardOutputDim");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(poolingDesc, inputTensorDesc, n, c, h, w);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnDestroyPoolingDescriptor(cudnnPoolingDescriptor_t poolingDesc) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnPoolingDescriptor_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDestroyPoolingDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(poolingDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnPoolingForward(
-    cudnnHandle_t handle, const cudnnPoolingDescriptor_t poolingDesc,
-    const void *alpha, const cudnnTensorDescriptor_t xDesc, const void *x,
-    const void *beta, const cudnnTensorDescriptor_t yDesc, void *y) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnPoolingDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *, const void *,
-      const cudnnTensorDescriptor_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnPoolingForward");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, poolingDesc, alpha, xDesc, x, beta, yDesc, y);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnCreateActivationDescriptor(cudnnActivationDescriptor_t *activationDesc) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnActivationDescriptor_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnCreateActivationDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(activationDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetActivationDescriptor(
-    cudnnActivationDescriptor_t activationDesc, cudnnActivationMode_t mode,
-    cudnnNanPropagation_t reluNanOpt, double coef) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnActivationDescriptor_t,
-                                               cudnnActivationMode_t,
-                                               cudnnNanPropagation_t, double);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetActivationDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(activationDesc, mode, reluNanOpt, coef);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnGetActivationDescriptor(const cudnnActivationDescriptor_t activationDesc,
-                             cudnnActivationMode_t *mode,
-                             cudnnNanPropagation_t *reluNanOpt, double *coef) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      const cudnnActivationDescriptor_t, cudnnActivationMode_t *,
-      cudnnNanPropagation_t *, double *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetActivationDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(activationDesc, mode, reluNanOpt, coef);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnDestroyActivationDescriptor(cudnnActivationDescriptor_t activationDesc) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnActivationDescriptor_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnDestroyActivationDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(activationDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnActivationForward(
-    cudnnHandle_t handle, cudnnActivationDescriptor_t activationDesc,
-    const void *alpha, const cudnnTensorDescriptor_t xDesc, const void *x,
-    const void *beta, const cudnnTensorDescriptor_t yDesc, void *y) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, cudnnActivationDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *, const void *,
-      const cudnnTensorDescriptor_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnActivationForward");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, activationDesc, alpha, xDesc, x, beta, yDesc, y);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnCreateLRNDescriptor(cudnnLRNDescriptor_t *normDesc) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnLRNDescriptor_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnCreateLRNDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(normDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetLRNDescriptor(cudnnLRNDescriptor_t normDesc,
-                                                unsigned lrnN, double lrnAlpha,
-                                                double lrnBeta, double lrnK) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnLRNDescriptor_t, unsigned int, double, double, double);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetLRNDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(normDesc, lrnN, lrnAlpha, lrnBeta, lrnK);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetLRNDescriptor(cudnnLRNDescriptor_t normDesc,
-                                                unsigned *lrnN,
-                                                double *lrnAlpha,
-                                                double *lrnBeta, double *lrnK) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnLRNDescriptor_t, unsigned int *, double *, double *, double *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetLRNDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(normDesc, lrnN, lrnAlpha, lrnBeta, lrnK);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnDestroyLRNDescriptor(cudnnLRNDescriptor_t lrnDesc) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnLRNDescriptor_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDestroyLRNDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(lrnDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnLRNCrossChannelForward(
-    cudnnHandle_t handle, cudnnLRNDescriptor_t normDesc, cudnnLRNMode_t lrnMode,
-    const void *alpha, const cudnnTensorDescriptor_t xDesc, const void *x,
-    const void *beta, const cudnnTensorDescriptor_t yDesc, void *y) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, cudnnLRNDescriptor_t, cudnnLRNMode_t, const void *,
-      const cudnnTensorDescriptor_t, const void *, const void *,
-      const cudnnTensorDescriptor_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnLRNCrossChannelForward");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, normDesc, lrnMode, alpha, xDesc, x, beta, yDesc, y);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnDivisiveNormalizationForward(
-    cudnnHandle_t handle, cudnnLRNDescriptor_t normDesc,
-    cudnnDivNormMode_t mode, const void *alpha,
-    const cudnnTensorDescriptor_t xDesc, /* same desc for means, temp, temp2 */
-    const void *x,
-    const void *means, /* if NULL, means are assumed to be zero */
-    void *temp, void *temp2, const void *beta,
-    const cudnnTensorDescriptor_t yDesc, void *y) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, cudnnLRNDescriptor_t, cudnnDivNormMode_t, const void *,
-      const cudnnTensorDescriptor_t, const void *, const void *, void *, void *,
-      const void *, const cudnnTensorDescriptor_t, void *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnDivisiveNormalizationForward");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, normDesc, mode, alpha, xDesc, x, means, temp, temp2,
-                  beta, yDesc, y);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnDeriveBNTensorDescriptor(
-    cudnnTensorDescriptor_t derivedBnDesc, const cudnnTensorDescriptor_t xDesc,
-    cudnnBatchNormMode_t mode) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnTensorDescriptor_t,
-                                               const cudnnTensorDescriptor_t,
-                                               cudnnBatchNormMode_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDeriveBNTensorDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(derivedBnDesc, xDesc, mode);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnBatchNormalizationForwardInference(
-    cudnnHandle_t handle, cudnnBatchNormMode_t mode,
-    const void *alpha, /* alpha[0] = result blend factor */
-    const void *beta,  /* beta[0] = dest layer blend factor */
-    const cudnnTensorDescriptor_t xDesc, const void *x, /* NxCxHxW */
-    const cudnnTensorDescriptor_t yDesc, void *y,       /* NxCxHxW */
-    const cudnnTensorDescriptor_t bnScaleBiasMeanVarDesc, const void *bnScale,
-    const void *bnBias, const void *estimatedMean,
-    const void *estimatedVariance, double epsilon) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, cudnnBatchNormMode_t, const void *, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, void *, const cudnnTensorDescriptor_t,
-      const void *, const void *, const void *, const void *, double);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnBatchNormalizationForwardInference");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, mode, alpha, beta, xDesc, x, yDesc, y,
-                  bnScaleBiasMeanVarDesc, bnScale, bnBias, estimatedMean,
-                  estimatedVariance, epsilon);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnCreateSpatialTransformerDescriptor(
-    cudnnSpatialTransformerDescriptor_t *stDesc) {
-  using FuncPtr =
-      cudnnStatus_t(CUDNNWINAPI *)(cudnnSpatialTransformerDescriptor_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnCreateSpatialTransformerDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(stDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetSpatialTransformerNdDescriptor(
-    cudnnSpatialTransformerDescriptor_t stDesc, cudnnSamplerType_t samplerType,
-    cudnnDataType_t dataType, const int nbDims, const int dimA[]) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnSpatialTransformerDescriptor_t, cudnnSamplerType_t, cudnnDataType_t,
-      const int, const int[]);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnSetSpatialTransformerNdDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(stDesc, samplerType, dataType, nbDims, dimA);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnDestroySpatialTransformerDescriptor(
-    cudnnSpatialTransformerDescriptor_t stDesc) {
-  using FuncPtr =
-      cudnnStatus_t(CUDNNWINAPI *)(cudnnSpatialTransformerDescriptor_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnDestroySpatialTransformerDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(stDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSpatialTfGridGeneratorForward(
-    cudnnHandle_t handle, const cudnnSpatialTransformerDescriptor_t stDesc,
-    const void *theta, void *grid) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnSpatialTransformerDescriptor_t, const void *,
-      void *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnSpatialTfGridGeneratorForward");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, stDesc, theta, grid);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSpatialTfSamplerForward(
-    cudnnHandle_t handle, cudnnSpatialTransformerDescriptor_t stDesc,
-    const void *alpha, const cudnnTensorDescriptor_t xDesc, const void *x,
-    const void *grid, const void *beta, cudnnTensorDescriptor_t yDesc,
-    void *y) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, cudnnSpatialTransformerDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *, const void *, const void *,
-      cudnnTensorDescriptor_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSpatialTfSamplerForward");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, stDesc, alpha, xDesc, x, grid, beta, yDesc, y);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnCreateDropoutDescriptor(cudnnDropoutDescriptor_t *dropoutDesc) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnDropoutDescriptor_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnCreateDropoutDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dropoutDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnDestroyDropoutDescriptor(cudnnDropoutDescriptor_t dropoutDesc) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnDropoutDescriptor_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDestroyDropoutDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dropoutDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnDropoutGetStatesSize(cudnnHandle_t handle,
-                                                    size_t *sizeInBytes) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnHandle_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDropoutGetStatesSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, sizeInBytes);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnDropoutGetReserveSpaceSize(
-    cudnnTensorDescriptor_t xdesc, size_t *sizeInBytes) {
-  using FuncPtr =
-      cudnnStatus_t(CUDNNWINAPI *)(cudnnTensorDescriptor_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDropoutGetReserveSpaceSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(xdesc, sizeInBytes);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetDropoutDescriptor(
-    cudnnDropoutDescriptor_t dropoutDesc, cudnnHandle_t handle, float dropout,
-    void *states, size_t stateSizeInBytes, unsigned long long seed) {
-  using FuncPtr =
-      cudnnStatus_t(CUDNNWINAPI *)(cudnnDropoutDescriptor_t, cudnnHandle_t,
-                                   float, void *, size_t, unsigned long long);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetDropoutDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dropoutDesc, handle, dropout, states, stateSizeInBytes, seed);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnRestoreDropoutDescriptor(
-    cudnnDropoutDescriptor_t dropoutDesc, cudnnHandle_t handle, float dropout,
-    void *states, size_t stateSizeInBytes, unsigned long long seed) {
-  using FuncPtr =
-      cudnnStatus_t(CUDNNWINAPI *)(cudnnDropoutDescriptor_t, cudnnHandle_t,
-                                   float, void *, size_t, unsigned long long);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnRestoreDropoutDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dropoutDesc, handle, dropout, states, stateSizeInBytes, seed);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetDropoutDescriptor(
-    cudnnDropoutDescriptor_t dropoutDesc, cudnnHandle_t handle, float *dropout,
-    void **states, unsigned long long *seed) {
-  using FuncPtr =
-      cudnnStatus_t(CUDNNWINAPI *)(cudnnDropoutDescriptor_t, cudnnHandle_t,
-                                   float *, void **, unsigned long long *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetDropoutDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dropoutDesc, handle, dropout, states, seed);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnDropoutForward(
-    cudnnHandle_t handle, const cudnnDropoutDescriptor_t dropoutDesc,
-    const cudnnTensorDescriptor_t xdesc, const void *x,
-    const cudnnTensorDescriptor_t ydesc, void *y, void *reserveSpace,
-    size_t reserveSpaceSizeInBytes) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnDropoutDescriptor_t,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, void *, void *, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDropoutForward");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dropoutDesc, xdesc, x, ydesc, y, reserveSpace,
-                  reserveSpaceSizeInBytes);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnCreateAlgorithmPerformance(
-    cudnnAlgorithmPerformance_t *algoPerf, int numberToCreate) {
-  using FuncPtr =
-      cudnnStatus_t(CUDNNWINAPI *)(cudnnAlgorithmPerformance_t *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnCreateAlgorithmPerformance");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(algoPerf, numberToCreate);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnDestroyAlgorithmPerformance(
-    cudnnAlgorithmPerformance_t *algoPerf, int numberToDestroy) {
-  using FuncPtr =
-      cudnnStatus_t(CUDNNWINAPI *)(cudnnAlgorithmPerformance_t *, int);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnDestroyAlgorithmPerformance");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(algoPerf, numberToDestroy);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetCallback(unsigned mask, void *udata,
-                                           cudnnCallback_t fptr) {
-  using FuncPtr =
-      cudnnStatus_t(CUDNNWINAPI *)(unsigned int, void *, cudnnCallback_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetCallback");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(mask, udata, fptr);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetCallback(unsigned *mask, void **udata,
-                                           cudnnCallback_t *fptr) {
-  using FuncPtr =
-      cudnnStatus_t(CUDNNWINAPI *)(unsigned int *, void **, cudnnCallback_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetCallback");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(mask, udata, fptr);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnOpsInferVersionCheck(void) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)();
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnOpsInferVersionCheck");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr();
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnCreateConvolutionDescriptor(cudnnConvolutionDescriptor_t *convDesc) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnConvolutionDescriptor_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnCreateConvolutionDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(convDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnDestroyConvolutionDescriptor(cudnnConvolutionDescriptor_t convDesc) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnConvolutionDescriptor_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnDestroyConvolutionDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(convDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetConvolutionMathType(
-    cudnnConvolutionDescriptor_t convDesc, cudnnMathType_t mathType) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnConvolutionDescriptor_t,
-                                               cudnnMathType_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetConvolutionMathType");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(convDesc, mathType);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionMathType(
-    cudnnConvolutionDescriptor_t convDesc, cudnnMathType_t *mathType) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnConvolutionDescriptor_t,
-                                               cudnnMathType_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetConvolutionMathType");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(convDesc, mathType);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetConvolutionGroupCount(
-    cudnnConvolutionDescriptor_t convDesc, int groupCount) {
-  using FuncPtr =
-      cudnnStatus_t(CUDNNWINAPI *)(cudnnConvolutionDescriptor_t, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetConvolutionGroupCount");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(convDesc, groupCount);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionGroupCount(
-    cudnnConvolutionDescriptor_t convDesc, int *groupCount) {
-  using FuncPtr =
-      cudnnStatus_t(CUDNNWINAPI *)(cudnnConvolutionDescriptor_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetConvolutionGroupCount");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(convDesc, groupCount);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetConvolutionReorderType(
-    cudnnConvolutionDescriptor_t convDesc, cudnnReorderType_t reorderType) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnConvolutionDescriptor_t,
-                                               cudnnReorderType_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetConvolutionReorderType");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(convDesc, reorderType);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionReorderType(
-    cudnnConvolutionDescriptor_t convDesc, cudnnReorderType_t *reorderType) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnConvolutionDescriptor_t,
-                                               cudnnReorderType_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetConvolutionReorderType");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(convDesc, reorderType);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetConvolution2dDescriptor(
-    cudnnConvolutionDescriptor_t convDesc, int pad_h, /* zero-padding height */
-    int pad_w,                                        /* zero-padding width */
-    int u,          /* vertical filter stride */
-    int v,          /* horizontal filter stride */
-    int dilation_h, /* filter dilation in the vertical dimension */
-    int dilation_w, /* filter dilation in the horizontal dimension */
-    cudnnConvolutionMode_t mode, cudnnDataType_t computeType) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnConvolutionDescriptor_t, int, int, int, int, int, int,
-      cudnnConvolutionMode_t, cudnnDataType_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetConvolution2dDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(convDesc, pad_h, pad_w, u, v, dilation_h, dilation_w, mode,
-                  computeType);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetConvolution2dDescriptor(
-    const cudnnConvolutionDescriptor_t convDesc,
-    int *pad_h,      /* zero-padding height */
-    int *pad_w,      /* zero-padding width */
-    int *u,          /* vertical filter stride */
-    int *v,          /* horizontal filter stride */
-    int *dilation_h, /* filter dilation in the vertical dimension */
-    int *dilation_w, /* filter dilation in the horizontal dimension */
-    cudnnConvolutionMode_t *mode, cudnnDataType_t *computeType) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      const cudnnConvolutionDescriptor_t, int *, int *, int *, int *, int *,
-      int *, cudnnConvolutionMode_t *, cudnnDataType_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetConvolution2dDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(convDesc, pad_h, pad_w, u, v, dilation_h, dilation_w, mode,
-                  computeType);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetConvolutionNdDescriptor(
-    cudnnConvolutionDescriptor_t convDesc, int arrayLength, /* nbDims-2 size */
-    const int padA[], const int filterStrideA[], const int dilationA[],
-    cudnnConvolutionMode_t mode, cudnnDataType_t computeType) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnConvolutionDescriptor_t, int, const int[], const int[], const int[],
-      cudnnConvolutionMode_t, cudnnDataType_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetConvolutionNdDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(convDesc, arrayLength, padA, filterStrideA, dilationA, mode,
-                  computeType);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionNdDescriptor(
-    const cudnnConvolutionDescriptor_t convDesc, int arrayLengthRequested,
-    int *arrayLength, int padA[], int strideA[], int dilationA[],
-    cudnnConvolutionMode_t *mode, cudnnDataType_t *computeType) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      const cudnnConvolutionDescriptor_t, int, int *, int[], int[], int[],
-      cudnnConvolutionMode_t *, cudnnDataType_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetConvolutionNdDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(convDesc, arrayLengthRequested, arrayLength, padA, strideA,
-                  dilationA, mode, computeType);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetConvolution2dForwardOutputDim(
-    const cudnnConvolutionDescriptor_t convDesc,
-    const cudnnTensorDescriptor_t inputTensorDesc,
-    const cudnnFilterDescriptor_t filterDesc, int *n, int *c, int *h, int *w) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t,
-      const cudnnFilterDescriptor_t, int *, int *, int *, int *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnGetConvolution2dForwardOutputDim");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(convDesc, inputTensorDesc, filterDesc, n, c, h, w);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionNdForwardOutputDim(
-    const cudnnConvolutionDescriptor_t convDesc,
-    const cudnnTensorDescriptor_t inputTensorDesc,
-    const cudnnFilterDescriptor_t filterDesc, int nbDims,
-    int tensorOuputDimA[]) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t,
-      const cudnnFilterDescriptor_t, int, int[]);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnGetConvolutionNdForwardOutputDim");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(convDesc, inputTensorDesc, filterDesc, nbDims,
-                  tensorOuputDimA);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnGetConvolutionForwardAlgorithmMaxCount(cudnnHandle_t handle, int *count) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnHandle_t, int *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnGetConvolutionForwardAlgorithmMaxCount");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, count);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionForwardAlgorithm_v7(
-    cudnnHandle_t handle, const cudnnTensorDescriptor_t srcDesc,
-    const cudnnFilterDescriptor_t filterDesc,
-    const cudnnConvolutionDescriptor_t convDesc,
-    const cudnnTensorDescriptor_t destDesc, const int requestedAlgoCount,
-    int *returnedAlgoCount, cudnnConvolutionFwdAlgoPerf_t *perfResults) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnTensorDescriptor_t,
-      const cudnnFilterDescriptor_t, const cudnnConvolutionDescriptor_t,
-      const cudnnTensorDescriptor_t, const int, int *,
-      cudnnConvolutionFwdAlgoPerf_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnGetConvolutionForwardAlgorithm_v7");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, srcDesc, filterDesc, convDesc, destDesc,
-                  requestedAlgoCount, returnedAlgoCount, perfResults);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnFindConvolutionForwardAlgorithm(
-    cudnnHandle_t handle, const cudnnTensorDescriptor_t xDesc,
-    const cudnnFilterDescriptor_t wDesc,
-    const cudnnConvolutionDescriptor_t convDesc,
-    const cudnnTensorDescriptor_t yDesc, const int requestedAlgoCount,
-    int *returnedAlgoCount, cudnnConvolutionFwdAlgoPerf_t *perfResults) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnTensorDescriptor_t,
-      const cudnnFilterDescriptor_t, const cudnnConvolutionDescriptor_t,
-      const cudnnTensorDescriptor_t, const int, int *,
-      cudnnConvolutionFwdAlgoPerf_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnFindConvolutionForwardAlgorithm");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, xDesc, wDesc, convDesc, yDesc, requestedAlgoCount,
-                  returnedAlgoCount, perfResults);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnFindConvolutionForwardAlgorithmEx(
-    cudnnHandle_t handle, const cudnnTensorDescriptor_t xDesc, const void *x,
-    const cudnnFilterDescriptor_t wDesc, const void *w,
-    const cudnnConvolutionDescriptor_t convDesc,
-    const cudnnTensorDescriptor_t yDesc, void *y, const int requestedAlgoCount,
-    int *returnedAlgoCount, cudnnConvolutionFwdAlgoPerf_t *perfResults,
-    void *workSpace, size_t workSpaceSizeInBytes) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnTensorDescriptor_t, const void *,
-      const cudnnFilterDescriptor_t, const void *,
-      const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t, void *,
-      const int, int *, cudnnConvolutionFwdAlgoPerf_t *, void *, size_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnFindConvolutionForwardAlgorithmEx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, xDesc, x, wDesc, w, convDesc, yDesc, y,
-                  requestedAlgoCount, returnedAlgoCount, perfResults, workSpace,
-                  workSpaceSizeInBytes);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnIm2Col(cudnnHandle_t handle, const cudnnTensorDescriptor_t xDesc,
-            const void *x, const cudnnFilterDescriptor_t wDesc,
-            const cudnnConvolutionDescriptor_t convDesc, void *colBuffer) {
-  using FuncPtr =
-      cudnnStatus_t(CUDNNWINAPI *)(cudnnHandle_t, const cudnnTensorDescriptor_t,
-                                   const void *, const cudnnFilterDescriptor_t,
-                                   const cudnnConvolutionDescriptor_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnIm2Col");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, xDesc, x, wDesc, convDesc, colBuffer);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnReorderFilterAndBias(
-    cudnnHandle_t handle, const cudnnFilterDescriptor_t filterDesc,
-    cudnnReorderType_t reorderType, const void *filterData,
-    void *reorderedFilterData, int reorderBias, const void *biasData,
-    void *reorderedBiasData) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnFilterDescriptor_t, cudnnReorderType_t,
-      const void *, void *, int, const void *, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnReorderFilterAndBias");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, filterDesc, reorderType, filterData,
-                  reorderedFilterData, reorderBias, biasData,
-                  reorderedBiasData);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionForwardWorkspaceSize(
-    cudnnHandle_t handle, const cudnnTensorDescriptor_t xDesc,
-    const cudnnFilterDescriptor_t wDesc,
-    const cudnnConvolutionDescriptor_t convDesc,
-    const cudnnTensorDescriptor_t yDesc, cudnnConvolutionFwdAlgo_t algo,
-    size_t *sizeInBytes) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnTensorDescriptor_t,
-      const cudnnFilterDescriptor_t, const cudnnConvolutionDescriptor_t,
-      const cudnnTensorDescriptor_t, cudnnConvolutionFwdAlgo_t, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnGetConvolutionForwardWorkspaceSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, xDesc, wDesc, convDesc, yDesc, algo, sizeInBytes);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnConvolutionForward(
-    cudnnHandle_t handle, const void *alpha,
-    const cudnnTensorDescriptor_t xDesc, const void *x,
-    const cudnnFilterDescriptor_t wDesc, const void *w,
-    const cudnnConvolutionDescriptor_t convDesc, cudnnConvolutionFwdAlgo_t algo,
-    void *workSpace, size_t workSpaceSizeInBytes, const void *beta,
-    const cudnnTensorDescriptor_t yDesc, void *y) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const void *, const cudnnTensorDescriptor_t, const void *,
-      const cudnnFilterDescriptor_t, const void *,
-      const cudnnConvolutionDescriptor_t, cudnnConvolutionFwdAlgo_t, void *,
-      size_t, const void *, const cudnnTensorDescriptor_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnConvolutionForward");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, alpha, xDesc, x, wDesc, w, convDesc, algo, workSpace,
-                  workSpaceSizeInBytes, beta, yDesc, y);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnConvolutionBiasActivationForward(
-    cudnnHandle_t handle, const void *alpha1,
-    const cudnnTensorDescriptor_t xDesc, const void *x,
-    const cudnnFilterDescriptor_t wDesc, const void *w,
-    const cudnnConvolutionDescriptor_t convDesc, cudnnConvolutionFwdAlgo_t algo,
-    void *workSpace, size_t workSpaceSizeInBytes, const void *alpha2,
-    const cudnnTensorDescriptor_t zDesc, const void *z,
-    const cudnnTensorDescriptor_t biasDesc, const void *bias,
-    const cudnnActivationDescriptor_t activationDesc,
-    const cudnnTensorDescriptor_t yDesc, void *y) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const void *, const cudnnTensorDescriptor_t, const void *,
-      const cudnnFilterDescriptor_t, const void *,
-      const cudnnConvolutionDescriptor_t, cudnnConvolutionFwdAlgo_t, void *,
-      size_t, const void *, const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnActivationDescriptor_t, const cudnnTensorDescriptor_t, void *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnConvolutionBiasActivationForward");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, alpha1, xDesc, x, wDesc, w, convDesc, algo, workSpace,
-                  workSpaceSizeInBytes, alpha2, zDesc, z, biasDesc, bias,
-                  activationDesc, yDesc, y);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnBackendCreateDescriptor(cudnnBackendDescriptorType_t descriptorType,
-                             cudnnBackendDescriptor_t *descriptor) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnBackendDescriptorType_t,
-                                               cudnnBackendDescriptor_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnBackendCreateDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(descriptorType, descriptor);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnBackendDestroyDescriptor(cudnnBackendDescriptor_t descriptor) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnBackendDescriptor_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnBackendDestroyDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(descriptor);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnBackendFinalize(cudnnBackendDescriptor_t descriptor) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnBackendDescriptor_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnBackendFinalize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(descriptor);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnBackendSetAttribute(cudnnBackendDescriptor_t descriptor,
-                         cudnnBackendAttributeName_t attributeName,
-                         cudnnBackendAttributeType_t attributeType,
-                         int64_t elementCount, const void *arrayOfElements) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnBackendDescriptor_t, cudnnBackendAttributeName_t,
-      cudnnBackendAttributeType_t, int64_t, const void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnBackendSetAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(descriptor, attributeName, attributeType, elementCount,
-                  arrayOfElements);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnBackendGetAttribute(
-    cudnnBackendDescriptor_t const descriptor,
-    cudnnBackendAttributeName_t attributeName,
-    cudnnBackendAttributeType_t attributeType, int64_t requestedElementCount,
-    int64_t *elementCount, void *arrayOfElements) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnBackendDescriptor_t const, cudnnBackendAttributeName_t,
-      cudnnBackendAttributeType_t, int64_t, int64_t *, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnBackendGetAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(descriptor, attributeName, attributeType,
-                  requestedElementCount, elementCount, arrayOfElements);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnBackendExecute(
-    cudnnHandle_t handle, cudnnBackendDescriptor_t executionPlan,
-    cudnnBackendDescriptor_t variantPack) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, cudnnBackendDescriptor_t, cudnnBackendDescriptor_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnBackendExecute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, executionPlan, variantPack);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionBackwardDataAlgorithmMaxCount(
-    cudnnHandle_t handle, int *count) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnHandle_t, int *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnGetConvolutionBackwardDataAlgorithmMaxCount");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, count);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnFindConvolutionBackwardDataAlgorithm(
-    cudnnHandle_t handle, const cudnnFilterDescriptor_t wDesc,
-    const cudnnTensorDescriptor_t dyDesc,
-    const cudnnConvolutionDescriptor_t convDesc,
-    const cudnnTensorDescriptor_t dxDesc, const int requestedAlgoCount,
-    int *returnedAlgoCount, cudnnConvolutionBwdDataAlgoPerf_t *perfResults) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnFilterDescriptor_t,
-      const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t,
-      const cudnnTensorDescriptor_t, const int, int *,
-      cudnnConvolutionBwdDataAlgoPerf_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnFindConvolutionBackwardDataAlgorithm");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, wDesc, dyDesc, convDesc, dxDesc, requestedAlgoCount,
-                  returnedAlgoCount, perfResults);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnFindConvolutionBackwardDataAlgorithmEx(
-    cudnnHandle_t handle, const cudnnFilterDescriptor_t wDesc, const void *w,
-    const cudnnTensorDescriptor_t dyDesc, const void *dy,
-    const cudnnConvolutionDescriptor_t convDesc,
-    const cudnnTensorDescriptor_t dxDesc, void *dx,
-    const int requestedAlgoCount, int *returnedAlgoCount,
-    cudnnConvolutionBwdDataAlgoPerf_t *perfResults, void *workSpace,
-    size_t workSpaceSizeInBytes) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnFilterDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t, void *,
-      const int, int *, cudnnConvolutionBwdDataAlgoPerf_t *, void *, size_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnFindConvolutionBackwardDataAlgorithmEx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, wDesc, w, dyDesc, dy, convDesc, dxDesc, dx,
-                  requestedAlgoCount, returnedAlgoCount, perfResults, workSpace,
-                  workSpaceSizeInBytes);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionBackwardDataAlgorithm_v7(
-    cudnnHandle_t handle, const cudnnFilterDescriptor_t filterDesc,
-    const cudnnTensorDescriptor_t diffDesc,
-    const cudnnConvolutionDescriptor_t convDesc,
-    const cudnnTensorDescriptor_t gradDesc, const int requestedAlgoCount,
-    int *returnedAlgoCount, cudnnConvolutionBwdDataAlgoPerf_t *perfResults) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnFilterDescriptor_t,
-      const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t,
-      const cudnnTensorDescriptor_t, const int, int *,
-      cudnnConvolutionBwdDataAlgoPerf_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnGetConvolutionBackwardDataAlgorithm_v7");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, filterDesc, diffDesc, convDesc, gradDesc,
-                  requestedAlgoCount, returnedAlgoCount, perfResults);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionBackwardDataWorkspaceSize(
-    cudnnHandle_t handle, const cudnnFilterDescriptor_t wDesc,
-    const cudnnTensorDescriptor_t dyDesc,
-    const cudnnConvolutionDescriptor_t convDesc,
-    const cudnnTensorDescriptor_t dxDesc, cudnnConvolutionBwdDataAlgo_t algo,
-    size_t *sizeInBytes) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnFilterDescriptor_t,
-      const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t,
-      const cudnnTensorDescriptor_t, cudnnConvolutionBwdDataAlgo_t, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnGetConvolutionBackwardDataWorkspaceSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, wDesc, dyDesc, convDesc, dxDesc, algo, sizeInBytes);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnConvolutionBackwardData(
-    cudnnHandle_t handle, const void *alpha,
-    const cudnnFilterDescriptor_t wDesc, const void *w,
-    const cudnnTensorDescriptor_t dyDesc, const void *dy,
-    const cudnnConvolutionDescriptor_t convDesc,
-    cudnnConvolutionBwdDataAlgo_t algo, void *workSpace,
-    size_t workSpaceSizeInBytes, const void *beta,
-    const cudnnTensorDescriptor_t dxDesc, void *dx) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const void *, const cudnnFilterDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnConvolutionDescriptor_t, cudnnConvolutionBwdDataAlgo_t, void *,
-      size_t, const void *, const cudnnTensorDescriptor_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnConvolutionBackwardData");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, alpha, wDesc, w, dyDesc, dy, convDesc, algo,
-                  workSpace, workSpaceSizeInBytes, beta, dxDesc, dx);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetFoldedConvBackwardDataDescriptors(
-    const cudnnHandle_t handle, const cudnnFilterDescriptor_t filterDesc,
-    const cudnnTensorDescriptor_t diffDesc,
-    const cudnnConvolutionDescriptor_t convDesc,
-    const cudnnTensorDescriptor_t gradDesc,
-    const cudnnTensorFormat_t transformFormat,
-    cudnnFilterDescriptor_t foldedFilterDesc,
-    cudnnTensorDescriptor_t paddedDiffDesc,
-    cudnnConvolutionDescriptor_t foldedConvDesc,
-    cudnnTensorDescriptor_t foldedGradDesc,
-    cudnnTensorTransformDescriptor_t filterFoldTransDesc,
-    cudnnTensorTransformDescriptor_t diffPadTransDesc,
-    cudnnTensorTransformDescriptor_t gradFoldTransDesc,
-    cudnnTensorTransformDescriptor_t gradUnfoldTransDesc) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      const cudnnHandle_t, const cudnnFilterDescriptor_t,
-      const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t,
-      const cudnnTensorDescriptor_t, const cudnnTensorFormat_t,
-      cudnnFilterDescriptor_t, cudnnTensorDescriptor_t,
-      cudnnConvolutionDescriptor_t, cudnnTensorDescriptor_t,
-      cudnnTensorTransformDescriptor_t, cudnnTensorTransformDescriptor_t,
-      cudnnTensorTransformDescriptor_t, cudnnTensorTransformDescriptor_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnGetFoldedConvBackwardDataDescriptors");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, filterDesc, diffDesc, convDesc, gradDesc,
-                  transformFormat, foldedFilterDesc, paddedDiffDesc,
-                  foldedConvDesc, foldedGradDesc, filterFoldTransDesc,
-                  diffPadTransDesc, gradFoldTransDesc, gradUnfoldTransDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnCreateFusedOpsConstParamPack(
-    cudnnFusedOpsConstParamPack_t *constPack, cudnnFusedOps_t ops) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnFusedOpsConstParamPack_t *,
-                                               cudnnFusedOps_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnCreateFusedOpsConstParamPack");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(constPack, ops);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnDestroyFusedOpsConstParamPack(cudnnFusedOpsConstParamPack_t constPack) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnFusedOpsConstParamPack_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnDestroyFusedOpsConstParamPack");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(constPack);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetFusedOpsConstParamPackAttribute(
-    cudnnFusedOpsConstParamPack_t constPack,
-    cudnnFusedOpsConstParamLabel_t paramLabel, const void *param) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnFusedOpsConstParamPack_t,
-                                               cudnnFusedOpsConstParamLabel_t,
-                                               const void *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnSetFusedOpsConstParamPackAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(constPack, paramLabel, param);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetFusedOpsConstParamPackAttribute(
-    const cudnnFusedOpsConstParamPack_t constPack,
-    cudnnFusedOpsConstParamLabel_t paramLabel, void *param, int *isNULL) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      const cudnnFusedOpsConstParamPack_t, cudnnFusedOpsConstParamLabel_t,
-      void *, int *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnGetFusedOpsConstParamPackAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(constPack, paramLabel, param, isNULL);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnCreateFusedOpsVariantParamPack(
-    cudnnFusedOpsVariantParamPack_t *varPack, cudnnFusedOps_t ops) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnFusedOpsVariantParamPack_t *, cudnnFusedOps_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnCreateFusedOpsVariantParamPack");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(varPack, ops);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnDestroyFusedOpsVariantParamPack(cudnnFusedOpsVariantParamPack_t varPack) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnFusedOpsVariantParamPack_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnDestroyFusedOpsVariantParamPack");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(varPack);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetFusedOpsVariantParamPackAttribute(
-    cudnnFusedOpsVariantParamPack_t varPack,
-    cudnnFusedOpsVariantParamLabel_t paramLabel, void *ptr) {
-  using FuncPtr =
-      cudnnStatus_t(CUDNNWINAPI *)(cudnnFusedOpsVariantParamPack_t,
-                                   cudnnFusedOpsVariantParamLabel_t, void *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnSetFusedOpsVariantParamPackAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(varPack, paramLabel, ptr);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetFusedOpsVariantParamPackAttribute(
-    const cudnnFusedOpsVariantParamPack_t varPack,
-    cudnnFusedOpsVariantParamLabel_t paramLabel, void *ptr) {
-  using FuncPtr =
-      cudnnStatus_t(CUDNNWINAPI *)(const cudnnFusedOpsVariantParamPack_t,
-                                   cudnnFusedOpsVariantParamLabel_t, void *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnGetFusedOpsVariantParamPackAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(varPack, paramLabel, ptr);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnCreateFusedOpsPlan(cudnnFusedOpsPlan_t *plan,
-                                                  cudnnFusedOps_t ops) {
-  using FuncPtr =
-      cudnnStatus_t(CUDNNWINAPI *)(cudnnFusedOpsPlan_t *, cudnnFusedOps_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnCreateFusedOpsPlan");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(plan, ops);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnDestroyFusedOpsPlan(cudnnFusedOpsPlan_t plan) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnFusedOpsPlan_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDestroyFusedOpsPlan");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(plan);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnMakeFusedOpsPlan(cudnnHandle_t handle, cudnnFusedOpsPlan_t plan,
-                      const cudnnFusedOpsConstParamPack_t constPack,
-                      size_t *workspaceSizeInBytes) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, cudnnFusedOpsPlan_t, const cudnnFusedOpsConstParamPack_t,
-      size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnMakeFusedOpsPlan");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, plan, constPack, workspaceSizeInBytes);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnFusedOpsExecute(cudnnHandle_t handle, const cudnnFusedOpsPlan_t plan,
-                     cudnnFusedOpsVariantParamPack_t varPack) {
-  using FuncPtr =
-      cudnnStatus_t(CUDNNWINAPI *)(cudnnHandle_t, const cudnnFusedOpsPlan_t,
-                                   cudnnFusedOpsVariantParamPack_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnFusedOpsExecute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, plan, varPack);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnCreateRNNDescriptor(cudnnRNNDescriptor_t *rnnDesc) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnRNNDescriptor_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnCreateRNNDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(rnnDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnDestroyRNNDescriptor(cudnnRNNDescriptor_t rnnDesc) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnRNNDescriptor_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDestroyRNNDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(rnnDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetRNNDescriptor_v8(
-    cudnnRNNDescriptor_t rnnDesc, cudnnRNNAlgo_t algo, cudnnRNNMode_t cellMode,
-    cudnnRNNBiasMode_t biasMode, cudnnDirectionMode_t dirMode,
-    cudnnRNNInputMode_t inputMode, cudnnDataType_t dataType,
-    cudnnDataType_t mathPrec, cudnnMathType_t mathType, int32_t inputSize,
-    int32_t hiddenSize, int32_t projSize, int32_t numLayers,
-    cudnnDropoutDescriptor_t dropoutDesc, uint32_t auxFlags) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnRNNDescriptor_t, cudnnRNNAlgo_t, cudnnRNNMode_t, cudnnRNNBiasMode_t,
-      cudnnDirectionMode_t, cudnnRNNInputMode_t, cudnnDataType_t,
-      cudnnDataType_t, cudnnMathType_t, int32_t, int32_t, int32_t, int32_t,
-      cudnnDropoutDescriptor_t, uint32_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetRNNDescriptor_v8");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(rnnDesc, algo, cellMode, biasMode, dirMode, inputMode,
-                  dataType, mathPrec, mathType, inputSize, hiddenSize, projSize,
-                  numLayers, dropoutDesc, auxFlags);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetRNNDescriptor_v8(
-    cudnnRNNDescriptor_t rnnDesc, cudnnRNNAlgo_t *algo,
-    cudnnRNNMode_t *cellMode, cudnnRNNBiasMode_t *biasMode,
-    cudnnDirectionMode_t *dirMode, cudnnRNNInputMode_t *inputMode,
-    cudnnDataType_t *dataType, cudnnDataType_t *mathPrec,
-    cudnnMathType_t *mathType, int32_t *inputSize, int32_t *hiddenSize,
-    int32_t *projSize, int32_t *numLayers,
-    cudnnDropoutDescriptor_t *dropoutDesc, uint32_t *auxFlags) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnRNNDescriptor_t, cudnnRNNAlgo_t *, cudnnRNNMode_t *,
-      cudnnRNNBiasMode_t *, cudnnDirectionMode_t *, cudnnRNNInputMode_t *,
-      cudnnDataType_t *, cudnnDataType_t *, cudnnMathType_t *, int32_t *,
-      int32_t *, int32_t *, int32_t *, cudnnDropoutDescriptor_t *, uint32_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetRNNDescriptor_v8");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(rnnDesc, algo, cellMode, biasMode, dirMode, inputMode,
-                  dataType, mathPrec, mathType, inputSize, hiddenSize, projSize,
-                  numLayers, dropoutDesc, auxFlags);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetRNNDescriptor_v6(
-    cudnnHandle_t handle, cudnnRNNDescriptor_t rnnDesc, const int hiddenSize,
-    const int numLayers, cudnnDropoutDescriptor_t dropoutDesc,
-    cudnnRNNInputMode_t inputMode, cudnnDirectionMode_t direction,
-    cudnnRNNMode_t cellMode, cudnnRNNAlgo_t algo, cudnnDataType_t mathPrec) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, cudnnRNNDescriptor_t, const int, const int,
-      cudnnDropoutDescriptor_t, cudnnRNNInputMode_t, cudnnDirectionMode_t,
-      cudnnRNNMode_t, cudnnRNNAlgo_t, cudnnDataType_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetRNNDescriptor_v6");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, rnnDesc, hiddenSize, numLayers, dropoutDesc,
-                  inputMode, direction, cellMode, algo, mathPrec);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetRNNDescriptor_v6(
-    cudnnHandle_t handle, cudnnRNNDescriptor_t rnnDesc, int *hiddenSize,
-    int *numLayers, cudnnDropoutDescriptor_t *dropoutDesc,
-    cudnnRNNInputMode_t *inputMode, cudnnDirectionMode_t *direction,
-    cudnnRNNMode_t *cellMode, cudnnRNNAlgo_t *algo, cudnnDataType_t *mathPrec) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, cudnnRNNDescriptor_t, int *, int *,
-      cudnnDropoutDescriptor_t *, cudnnRNNInputMode_t *, cudnnDirectionMode_t *,
-      cudnnRNNMode_t *, cudnnRNNAlgo_t *, cudnnDataType_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetRNNDescriptor_v6");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, rnnDesc, hiddenSize, numLayers, dropoutDesc,
-                  inputMode, direction, cellMode, algo, mathPrec);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnSetRNNMatrixMathType(cudnnRNNDescriptor_t rnnDesc, cudnnMathType_t mType) {
-  using FuncPtr =
-      cudnnStatus_t(CUDNNWINAPI *)(cudnnRNNDescriptor_t, cudnnMathType_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetRNNMatrixMathType");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(rnnDesc, mType);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetRNNMatrixMathType(
-    cudnnRNNDescriptor_t rnnDesc, cudnnMathType_t *mType) {
-  using FuncPtr =
-      cudnnStatus_t(CUDNNWINAPI *)(cudnnRNNDescriptor_t, cudnnMathType_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetRNNMatrixMathType");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(rnnDesc, mType);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetRNNBiasMode(cudnnRNNDescriptor_t rnnDesc,
-                                              cudnnRNNBiasMode_t biasMode) {
-  using FuncPtr =
-      cudnnStatus_t(CUDNNWINAPI *)(cudnnRNNDescriptor_t, cudnnRNNBiasMode_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetRNNBiasMode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(rnnDesc, biasMode);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetRNNBiasMode(cudnnRNNDescriptor_t rnnDesc,
-                                              cudnnRNNBiasMode_t *biasMode) {
-  using FuncPtr =
-      cudnnStatus_t(CUDNNWINAPI *)(cudnnRNNDescriptor_t, cudnnRNNBiasMode_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetRNNBiasMode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(rnnDesc, biasMode);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnRNNSetClip(cudnnHandle_t handle,
-                                          cudnnRNNDescriptor_t rnnDesc,
-                                          cudnnRNNClipMode_t clipMode,
-                                          cudnnNanPropagation_t clipNanOpt,
-                                          double lclip, double rclip) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, cudnnRNNDescriptor_t, cudnnRNNClipMode_t,
-      cudnnNanPropagation_t, double, double);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnRNNSetClip");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, rnnDesc, clipMode, clipNanOpt, lclip, rclip);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnRNNGetClip(cudnnHandle_t handle,
-                                          cudnnRNNDescriptor_t rnnDesc,
-                                          cudnnRNNClipMode_t *clipMode,
-                                          cudnnNanPropagation_t *clipNanOpt,
-                                          double *lclip, double *rclip) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, cudnnRNNDescriptor_t, cudnnRNNClipMode_t *,
-      cudnnNanPropagation_t *, double *, double *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnRNNGetClip");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, rnnDesc, clipMode, clipNanOpt, lclip, rclip);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnSetRNNProjectionLayers(cudnnHandle_t handle, cudnnRNNDescriptor_t rnnDesc,
-                            const int recProjSize, const int outProjSize) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, cudnnRNNDescriptor_t, const int, const int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetRNNProjectionLayers");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, rnnDesc, recProjSize, outProjSize);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetRNNProjectionLayers(
-    cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc, int *recProjSize,
-    int *outProjSize) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnRNNDescriptor_t, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetRNNProjectionLayers");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, rnnDesc, recProjSize, outProjSize);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnCreatePersistentRNNPlan(
-    cudnnRNNDescriptor_t rnnDesc, const int minibatch,
-    const cudnnDataType_t dataType, cudnnPersistentRNNPlan_t *plan) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnRNNDescriptor_t, const int,
-                                               const cudnnDataType_t,
-                                               cudnnPersistentRNNPlan_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnCreatePersistentRNNPlan");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(rnnDesc, minibatch, dataType, plan);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnDestroyPersistentRNNPlan(cudnnPersistentRNNPlan_t plan) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnPersistentRNNPlan_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDestroyPersistentRNNPlan");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(plan);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetPersistentRNNPlan(
-    cudnnRNNDescriptor_t rnnDesc, cudnnPersistentRNNPlan_t plan) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnRNNDescriptor_t,
-                                               cudnnPersistentRNNPlan_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetPersistentRNNPlan");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(rnnDesc, plan);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnGetRNNWeightSpaceSize(cudnnHandle_t handle, cudnnRNNDescriptor_t rnnDesc,
-                           size_t *weightSpaceSize) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnHandle_t,
-                                               cudnnRNNDescriptor_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetRNNWeightSpaceSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, rnnDesc, weightSpaceSize);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetRNNWorkspaceSize(
-    cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc,
-    const int seqLength, const cudnnTensorDescriptor_t *xDesc,
-    size_t *sizeInBytes) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnRNNDescriptor_t, const int,
-      const cudnnTensorDescriptor_t *, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetRNNWorkspaceSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, rnnDesc, seqLength, xDesc, sizeInBytes);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetRNNTempSpaceSizes(
-    cudnnHandle_t handle, cudnnRNNDescriptor_t rnnDesc,
-    cudnnForwardMode_t fMode, cudnnRNNDataDescriptor_t xDesc,
-    size_t *workSpaceSize, size_t *reserveSpaceSize) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, cudnnRNNDescriptor_t, cudnnForwardMode_t,
-      cudnnRNNDataDescriptor_t, size_t *, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetRNNTempSpaceSizes");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, rnnDesc, fMode, xDesc, workSpaceSize,
-                  reserveSpaceSize);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnGetRNNParamsSize(cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc,
-                      const cudnnTensorDescriptor_t xDesc, size_t *sizeInBytes,
-                      cudnnDataType_t dataType) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnRNNDescriptor_t, const cudnnTensorDescriptor_t,
-      size_t *, cudnnDataType_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetRNNParamsSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, rnnDesc, xDesc, sizeInBytes, dataType);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetRNNLinLayerMatrixParams(
-    cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc,
-    const int pseudoLayer, const cudnnTensorDescriptor_t xDesc,
-    const cudnnFilterDescriptor_t wDesc, const void *w, const int linLayerID,
-    cudnnFilterDescriptor_t linLayerMatDesc, void **linLayerMat) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnRNNDescriptor_t, const int,
-      const cudnnTensorDescriptor_t, const cudnnFilterDescriptor_t,
-      const void *, const int, cudnnFilterDescriptor_t, void **);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetRNNLinLayerMatrixParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, rnnDesc, pseudoLayer, xDesc, wDesc, w, linLayerID,
-                  linLayerMatDesc, linLayerMat);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetRNNLinLayerBiasParams(
-    cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc,
-    const int pseudoLayer, const cudnnTensorDescriptor_t xDesc,
-    const cudnnFilterDescriptor_t wDesc, const void *w, const int linLayerID,
-    cudnnFilterDescriptor_t linLayerBiasDesc, void **linLayerBias) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnRNNDescriptor_t, const int,
-      const cudnnTensorDescriptor_t, const cudnnFilterDescriptor_t,
-      const void *, const int, cudnnFilterDescriptor_t, void **);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetRNNLinLayerBiasParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, rnnDesc, pseudoLayer, xDesc, wDesc, w, linLayerID,
-                  linLayerBiasDesc, linLayerBias);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnRNNForwardInference(
-    cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc,
-    const int seqLength, const cudnnTensorDescriptor_t *xDesc, const void *x,
-    const cudnnTensorDescriptor_t hxDesc, const void *hx,
-    const cudnnTensorDescriptor_t cxDesc, const void *cx,
-    const cudnnFilterDescriptor_t wDesc, const void *w,
-    const cudnnTensorDescriptor_t *yDesc, void *y,
-    const cudnnTensorDescriptor_t hyDesc, void *hy,
-    const cudnnTensorDescriptor_t cyDesc, void *cy, void *workspace,
-    size_t workSpaceSizeInBytes) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnRNNDescriptor_t, const int,
-      const cudnnTensorDescriptor_t *, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnFilterDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t *, void *, const cudnnTensorDescriptor_t,
-      void *, const cudnnTensorDescriptor_t, void *, void *, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnRNNForwardInference");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, rnnDesc, seqLength, xDesc, x, hxDesc, hx, cxDesc, cx,
-                  wDesc, w, yDesc, y, hyDesc, hy, cyDesc, cy, workspace,
-                  workSpaceSizeInBytes);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetRNNPaddingMode(cudnnRNNDescriptor_t rnnDesc,
-                                                 unsigned paddingMode) {
-  using FuncPtr =
-      cudnnStatus_t(CUDNNWINAPI *)(cudnnRNNDescriptor_t, unsigned int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetRNNPaddingMode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(rnnDesc, paddingMode);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetRNNPaddingMode(cudnnRNNDescriptor_t rnnDesc,
-                                                 unsigned *paddingMode) {
-  using FuncPtr =
-      cudnnStatus_t(CUDNNWINAPI *)(cudnnRNNDescriptor_t, unsigned int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetRNNPaddingMode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(rnnDesc, paddingMode);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnCreateRNNDataDescriptor(cudnnRNNDataDescriptor_t *rnnDataDesc) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnRNNDataDescriptor_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnCreateRNNDataDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(rnnDataDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnDestroyRNNDataDescriptor(cudnnRNNDataDescriptor_t rnnDataDesc) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnRNNDataDescriptor_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDestroyRNNDataDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(rnnDataDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetRNNDataDescriptor(
-    cudnnRNNDataDescriptor_t rnnDataDesc, cudnnDataType_t dataType,
-    cudnnRNNDataLayout_t layout, int maxSeqLength, int batchSize,
-    int vectorSize,
-    const int seqLengthArray[], /* length of each sequence in the batch */
-    void *paddingFill) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnRNNDataDescriptor_t, cudnnDataType_t, cudnnRNNDataLayout_t, int, int,
-      int, const int[], void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetRNNDataDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(rnnDataDesc, dataType, layout, maxSeqLength, batchSize,
-                  vectorSize, seqLengthArray, paddingFill);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetRNNDataDescriptor(
-    cudnnRNNDataDescriptor_t rnnDataDesc, cudnnDataType_t *dataType,
-    cudnnRNNDataLayout_t *layout, int *maxSeqLength, int *batchSize,
-    int *vectorSize, int arrayLengthRequested, int seqLengthArray[],
-    void *paddingFill) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnRNNDataDescriptor_t, cudnnDataType_t *, cudnnRNNDataLayout_t *,
-      int *, int *, int *, int, int[], void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetRNNDataDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(rnnDataDesc, dataType, layout, maxSeqLength, batchSize,
-                  vectorSize, arrayLengthRequested, seqLengthArray,
-                  paddingFill);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnRNNForwardInferenceEx(
-    cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc,
-    const cudnnRNNDataDescriptor_t xDesc, const void *x,
-    const cudnnTensorDescriptor_t hxDesc, const void *hx,
-    const cudnnTensorDescriptor_t cxDesc, const void *cx,
-    const cudnnFilterDescriptor_t wDesc, const void *w,
-    const cudnnRNNDataDescriptor_t yDesc, void *y,
-    const cudnnTensorDescriptor_t hyDesc, void *hy,
-    const cudnnTensorDescriptor_t cyDesc, void *cy,
-    const cudnnRNNDataDescriptor_t kDesc, /* reserved, should pass NULL */
-    const void *keys,                     /* reserved, should pass NULL */
-    const cudnnRNNDataDescriptor_t cDesc, /* reserved, should pass NULL */
-    void *cAttn,                          /* reserved, should pass NULL */
-    const cudnnRNNDataDescriptor_t iDesc, /* reserved, should pass NULL */
-    void *iAttn,                          /* reserved, should pass NULL */
-    const cudnnRNNDataDescriptor_t qDesc, /* reserved, should pass NULL */
-    void *queries,                        /* reserved, should pass NULL */
-    void *workSpace, size_t workSpaceSizeInBytes) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnRNNDescriptor_t, const cudnnRNNDataDescriptor_t,
-      const void *, const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnFilterDescriptor_t, const void *,
-      const cudnnRNNDataDescriptor_t, void *, const cudnnTensorDescriptor_t,
-      void *, const cudnnTensorDescriptor_t, void *,
-      const cudnnRNNDataDescriptor_t, const void *,
-      const cudnnRNNDataDescriptor_t, void *, const cudnnRNNDataDescriptor_t,
-      void *, const cudnnRNNDataDescriptor_t, void *, void *, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnRNNForwardInferenceEx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, rnnDesc, xDesc, x, hxDesc, hx, cxDesc, cx, wDesc, w,
-                  yDesc, y, hyDesc, hy, cyDesc, cy, kDesc, keys, cDesc, cAttn,
-                  iDesc, iAttn, qDesc, queries, workSpace,
-                  workSpaceSizeInBytes);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnCreateSeqDataDescriptor(cudnnSeqDataDescriptor_t *seqDataDesc) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnSeqDataDescriptor_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnCreateSeqDataDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(seqDataDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnDestroySeqDataDescriptor(cudnnSeqDataDescriptor_t seqDataDesc) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnSeqDataDescriptor_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDestroySeqDataDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(seqDataDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetSeqDataDescriptor(
-    cudnnSeqDataDescriptor_t seqDataDesc, cudnnDataType_t dataType, int nbDims,
-    const int dimA[], const cudnnSeqDataAxis_t axes[],
-    size_t seqLengthArraySize, const int seqLengthArray[], void *paddingFill) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnSeqDataDescriptor_t, cudnnDataType_t, int, const int[],
-      const cudnnSeqDataAxis_t[], size_t, const int[], void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetSeqDataDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(seqDataDesc, dataType, nbDims, dimA, axes, seqLengthArraySize,
-                  seqLengthArray, paddingFill);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetSeqDataDescriptor(
-    const cudnnSeqDataDescriptor_t seqDataDesc, cudnnDataType_t *dataType,
-    int *nbDims, int nbDimsRequested, int dimA[], cudnnSeqDataAxis_t axes[],
-    size_t *seqLengthArraySize, size_t seqLengthSizeRequested,
-    int seqLengthArray[], void *paddingFill) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      const cudnnSeqDataDescriptor_t, cudnnDataType_t *, int *, int, int[],
-      cudnnSeqDataAxis_t[], size_t *, size_t, int[], void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetSeqDataDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(seqDataDesc, dataType, nbDims, nbDimsRequested, dimA, axes,
-                  seqLengthArraySize, seqLengthSizeRequested, seqLengthArray,
-                  paddingFill);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnCreateAttnDescriptor(cudnnAttnDescriptor_t *attnDesc) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnAttnDescriptor_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnCreateAttnDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(attnDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnDestroyAttnDescriptor(cudnnAttnDescriptor_t attnDesc) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnAttnDescriptor_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDestroyAttnDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(attnDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetAttnDescriptor(
-    cudnnAttnDescriptor_t attnDesc, unsigned attnMode, int nHeads,
-    double smScaler, cudnnDataType_t dataType, cudnnDataType_t computePrec,
-    cudnnMathType_t mathType, cudnnDropoutDescriptor_t attnDropoutDesc,
-    cudnnDropoutDescriptor_t postDropoutDesc, int qSize, int kSize, int vSize,
-    int qProjSize, int kProjSize, int vProjSize, int oProjSize,
-    int qoMaxSeqLength, int kvMaxSeqLength, int maxBatchSize, int maxBeamSize) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnAttnDescriptor_t, unsigned int, int, double, cudnnDataType_t,
-      cudnnDataType_t, cudnnMathType_t, cudnnDropoutDescriptor_t,
-      cudnnDropoutDescriptor_t, int, int, int, int, int, int, int, int, int,
-      int, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetAttnDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(attnDesc, attnMode, nHeads, smScaler, dataType, computePrec,
-                  mathType, attnDropoutDesc, postDropoutDesc, qSize, kSize,
-                  vSize, qProjSize, kProjSize, vProjSize, oProjSize,
-                  qoMaxSeqLength, kvMaxSeqLength, maxBatchSize, maxBeamSize);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetAttnDescriptor(
-    cudnnAttnDescriptor_t attnDesc, unsigned *attnMode, int *nHeads,
-    double *smScaler, cudnnDataType_t *dataType, cudnnDataType_t *computePrec,
-    cudnnMathType_t *mathType, cudnnDropoutDescriptor_t *attnDropoutDesc,
-    cudnnDropoutDescriptor_t *postDropoutDesc, int *qSize, int *kSize,
-    int *vSize, int *qProjSize, int *kProjSize, int *vProjSize, int *oProjSize,
-    int *qoMaxSeqLength, int *kvMaxSeqLength, int *maxBatchSize,
-    int *maxBeamSize) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnAttnDescriptor_t, unsigned int *, int *, double *, cudnnDataType_t *,
-      cudnnDataType_t *, cudnnMathType_t *, cudnnDropoutDescriptor_t *,
-      cudnnDropoutDescriptor_t *, int *, int *, int *, int *, int *, int *,
-      int *, int *, int *, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetAttnDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(attnDesc, attnMode, nHeads, smScaler, dataType, computePrec,
-                  mathType, attnDropoutDesc, postDropoutDesc, qSize, kSize,
-                  vSize, qProjSize, kProjSize, vProjSize, oProjSize,
-                  qoMaxSeqLength, kvMaxSeqLength, maxBatchSize, maxBeamSize);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetMultiHeadAttnBuffers(
-    cudnnHandle_t handle, const cudnnAttnDescriptor_t attnDesc,
-    size_t *weightSizeInBytes, size_t *workSpaceSizeInBytes,
-    size_t *reserveSpaceSizeInBytes) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnAttnDescriptor_t, size_t *, size_t *, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetMultiHeadAttnBuffers");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, attnDesc, weightSizeInBytes, workSpaceSizeInBytes,
-                  reserveSpaceSizeInBytes);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetMultiHeadAttnWeights(
-    cudnnHandle_t handle, const cudnnAttnDescriptor_t attnDesc,
-    cudnnMultiHeadAttnWeightKind_t wKind, size_t weightSizeInBytes,
-    const void *weights, cudnnTensorDescriptor_t wDesc, void **wAddr) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnAttnDescriptor_t,
-      cudnnMultiHeadAttnWeightKind_t, size_t, const void *,
-      cudnnTensorDescriptor_t, void **);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetMultiHeadAttnWeights");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, attnDesc, wKind, weightSizeInBytes, weights, wDesc,
-                  wAddr);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnMultiHeadAttnForward(
-    cudnnHandle_t handle, const cudnnAttnDescriptor_t attnDesc, int currIdx,
-    const int loWinIdx[], const int hiWinIdx[], const int devSeqLengthsQO[],
-    const int devSeqLengthsKV[], const cudnnSeqDataDescriptor_t qDesc,
-    const void *queries, const void *residuals,
-    const cudnnSeqDataDescriptor_t kDesc, const void *keys,
-    const cudnnSeqDataDescriptor_t vDesc, const void *values,
-    const cudnnSeqDataDescriptor_t oDesc, void *out, size_t weightSizeInBytes,
-    const void *weights, size_t workSpaceSizeInBytes, void *workSpace,
-    size_t reserveSpaceSizeInBytes, void *reserveSpace) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnAttnDescriptor_t, int, const int[], const int[],
-      const int[], const int[], const cudnnSeqDataDescriptor_t, const void *,
-      const void *, const cudnnSeqDataDescriptor_t, const void *,
-      const cudnnSeqDataDescriptor_t, const void *,
-      const cudnnSeqDataDescriptor_t, void *, size_t, const void *, size_t,
-      void *, size_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnMultiHeadAttnForward");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, attnDesc, currIdx, loWinIdx, hiWinIdx,
-                  devSeqLengthsQO, devSeqLengthsKV, qDesc, queries, residuals,
-                  kDesc, keys, vDesc, values, oDesc, out, weightSizeInBytes,
-                  weights, workSpaceSizeInBytes, workSpace,
-                  reserveSpaceSizeInBytes, reserveSpace);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnAdvInferVersionCheck(void) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)();
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnAdvInferVersionCheck");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr();
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSoftmaxBackward(
-    cudnnHandle_t handle, cudnnSoftmaxAlgorithm_t algo, cudnnSoftmaxMode_t mode,
-    const void *alpha, const cudnnTensorDescriptor_t yDesc, const void *y,
-    const cudnnTensorDescriptor_t dyDesc, const void *dy, const void *beta,
-    const cudnnTensorDescriptor_t dxDesc, void *dx) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, cudnnSoftmaxAlgorithm_t, cudnnSoftmaxMode_t, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *, const void *,
-      const cudnnTensorDescriptor_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSoftmaxBackward");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, algo, mode, alpha, yDesc, y, dyDesc, dy, beta, dxDesc,
-                  dx);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnPoolingBackward(
-    cudnnHandle_t handle, const cudnnPoolingDescriptor_t poolingDesc,
-    const void *alpha, const cudnnTensorDescriptor_t yDesc, const void *y,
-    const cudnnTensorDescriptor_t dyDesc, const void *dy,
-    const cudnnTensorDescriptor_t xDesc, const void *x, const void *beta,
-    const cudnnTensorDescriptor_t dxDesc, void *dx) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnPoolingDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *, const void *,
-      const cudnnTensorDescriptor_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnPoolingBackward");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, poolingDesc, alpha, yDesc, y, dyDesc, dy, xDesc, x,
-                  beta, dxDesc, dx);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnActivationBackward(
-    cudnnHandle_t handle, cudnnActivationDescriptor_t activationDesc,
-    const void *alpha, const cudnnTensorDescriptor_t yDesc, const void *y,
-    const cudnnTensorDescriptor_t dyDesc, const void *dy,
-    const cudnnTensorDescriptor_t xDesc, const void *x, const void *beta,
-    const cudnnTensorDescriptor_t dxDesc, void *dx) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, cudnnActivationDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *, const void *,
-      const cudnnTensorDescriptor_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnActivationBackward");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, activationDesc, alpha, yDesc, y, dyDesc, dy, xDesc, x,
-                  beta, dxDesc, dx);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnLRNCrossChannelBackward(
-    cudnnHandle_t handle, cudnnLRNDescriptor_t normDesc, cudnnLRNMode_t lrnMode,
-    const void *alpha, const cudnnTensorDescriptor_t yDesc, const void *y,
-    const cudnnTensorDescriptor_t dyDesc, const void *dy,
-    const cudnnTensorDescriptor_t xDesc, const void *x, const void *beta,
-    const cudnnTensorDescriptor_t dxDesc, void *dx) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, cudnnLRNDescriptor_t, cudnnLRNMode_t, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *, const void *,
-      const cudnnTensorDescriptor_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnLRNCrossChannelBackward");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, normDesc, lrnMode, alpha, yDesc, y, dyDesc, dy, xDesc,
-                  x, beta, dxDesc, dx);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnDivisiveNormalizationBackward(
-    cudnnHandle_t handle, cudnnLRNDescriptor_t normDesc,
-    cudnnDivNormMode_t mode, const void *alpha,
-    const cudnnTensorDescriptor_t
-        xDesc, /* same desc for x, means, dy, temp, temp2 */
-    const void *x,
-    const void *means, /* if NULL, means are assumed to be zero */
-    const void *dy, void *temp, void *temp2, const void *beta,
-    const cudnnTensorDescriptor_t dXdMeansDesc, /* same desc for dx, dMeans */
-    void *dx,                                   /* output x differential */
-    void *dMeans) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, cudnnLRNDescriptor_t, cudnnDivNormMode_t, const void *,
-      const cudnnTensorDescriptor_t, const void *, const void *, const void *,
-      void *, void *, const void *, const cudnnTensorDescriptor_t, void *,
-      void *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnDivisiveNormalizationBackward");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, normDesc, mode, alpha, xDesc, x, means, dy, temp,
-                  temp2, beta, dXdMeansDesc, dx, dMeans);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize(
-    cudnnHandle_t handle, cudnnBatchNormMode_t mode, cudnnBatchNormOps_t bnOps,
-    const cudnnTensorDescriptor_t xDesc, const cudnnTensorDescriptor_t zDesc,
-    const cudnnTensorDescriptor_t yDesc,
-    const cudnnTensorDescriptor_t bnScaleBiasMeanVarDesc,
-    const cudnnActivationDescriptor_t activationDesc, size_t *sizeInBytes) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, cudnnBatchNormMode_t, cudnnBatchNormOps_t,
-      const cudnnTensorDescriptor_t, const cudnnTensorDescriptor_t,
-      const cudnnTensorDescriptor_t, const cudnnTensorDescriptor_t,
-      const cudnnActivationDescriptor_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>(
-      "cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, mode, bnOps, xDesc, zDesc, yDesc,
-                  bnScaleBiasMeanVarDesc, activationDesc, sizeInBytes);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetBatchNormalizationBackwardExWorkspaceSize(
-    cudnnHandle_t handle, cudnnBatchNormMode_t mode, cudnnBatchNormOps_t bnOps,
-    const cudnnTensorDescriptor_t xDesc, const cudnnTensorDescriptor_t yDesc,
-    const cudnnTensorDescriptor_t dyDesc, const cudnnTensorDescriptor_t dzDesc,
-    const cudnnTensorDescriptor_t dxDesc,
-    const cudnnTensorDescriptor_t dBnScaleBiasDesc,
-    const cudnnActivationDescriptor_t activationDesc, size_t *sizeInBytes) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, cudnnBatchNormMode_t, cudnnBatchNormOps_t,
-      const cudnnTensorDescriptor_t, const cudnnTensorDescriptor_t,
-      const cudnnTensorDescriptor_t, const cudnnTensorDescriptor_t,
-      const cudnnTensorDescriptor_t, const cudnnTensorDescriptor_t,
-      const cudnnActivationDescriptor_t, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnGetBatchNormalizationBackwardExWorkspaceSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, mode, bnOps, xDesc, yDesc, dyDesc, dzDesc, dxDesc,
-                  dBnScaleBiasDesc, activationDesc, sizeInBytes);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetBatchNormalizationTrainingExReserveSpaceSize(
-    cudnnHandle_t handle, cudnnBatchNormMode_t mode, cudnnBatchNormOps_t bnOps,
-    const cudnnActivationDescriptor_t activationDesc,
-    const cudnnTensorDescriptor_t xDesc, size_t *sizeInBytes) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, cudnnBatchNormMode_t, cudnnBatchNormOps_t,
-      const cudnnActivationDescriptor_t, const cudnnTensorDescriptor_t,
-      size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>(
-      "cudnnGetBatchNormalizationTrainingExReserveSpaceSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, mode, bnOps, activationDesc, xDesc, sizeInBytes);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnBatchNormalizationForwardTraining(
-    cudnnHandle_t handle, cudnnBatchNormMode_t mode,
-
-    const void *alpha, /* alpha[0] = result blend factor */
-    const void *beta,  /* beta[0] = dest layer blend factor */
-
-    const cudnnTensorDescriptor_t xDesc, const void *x, /* NxCxHxW */
-    const cudnnTensorDescriptor_t yDesc, void *y,       /* NxCxHxW */
-
-    /* Shared desc for the next 6 tensors in the argument list.
-       Data type to be set as follows:
-       type = (typeOf(x) == double) ? double : float
-       Dimensions for this descriptor depend on normalization mode
-       - Spatial Normalization : tensors are expected to have dims 1xCx1x1
-        (normalization is performed across NxHxW)
-       - Per-Activation Normalization : tensors are expected to have dims of
-       1xCxHxW (normalization is performed across N) */
-    const cudnnTensorDescriptor_t bnScaleBiasMeanVarDesc,
-
-    /* 'Gamma' and 'Beta' respectively in Ioffe and Szegedy's paper's notation
-     */
-    const void *bnScale, const void *bnBias,
-
-    /* MUST use factor=1 in the very first call of a complete training cycle.
-       Use a factor=1/(1+n) at N-th call to the function to get
-       Cumulative Moving Average (CMA) behavior
-       CMA[n] = (x[1]+...+x[n])/n
-       Since CMA[n+1] = (n*CMA[n]+x[n+1])/(n+1) =
-       ((n+1)*CMA[n]-CMA[n])/(n+1) + x[n+1]/(n+1) =
-       CMA[n]*(1-1/(n+1)) + x[n+1]*1/(n+1) */
-    double exponentialAverageFactor,
-
-    /* Used in Training phase only.
-       runningMean = newMean*factor + runningMean*(1-factor) */
-    void *resultRunningMean,
-    /* Output in training mode, input in inference. Is the moving average
-       of  variance[x] (factor is applied in the same way as for runningMean) */
-    void *resultRunningVariance,
-
-    /* Has to be >= CUDNN_BN_MIN_EPSILON. Should be the same in forward and
-       backward functions. */
-    double epsilon,
-
-    /* Optionally save intermediate results from the forward pass here
-       - can be reused to speed up backward pass. NULL if unused */
-    void *resultSaveMean, void *resultSaveInvVariance) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, cudnnBatchNormMode_t, const void *, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, void *, const cudnnTensorDescriptor_t,
-      const void *, const void *, double, void *, void *, double, void *,
-      void *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnBatchNormalizationForwardTraining");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(
-      handle, mode, alpha, beta, xDesc, x, yDesc, y, bnScaleBiasMeanVarDesc,
-      bnScale, bnBias, exponentialAverageFactor, resultRunningMean,
-      resultRunningVariance, epsilon, resultSaveMean, resultSaveInvVariance);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnBatchNormalizationForwardTrainingEx(
-    cudnnHandle_t handle, cudnnBatchNormMode_t mode, cudnnBatchNormOps_t bnOps,
-
-    const void *alpha, /* alpha[0] = result blend factor */
-    const void *beta,  /* beta[0] = dest layer blend factor */
-
-    const cudnnTensorDescriptor_t xDesc, const void *xData,
-    const cudnnTensorDescriptor_t zDesc, const void *zData,
-    const cudnnTensorDescriptor_t yDesc, void *yData,
-
-    const cudnnTensorDescriptor_t bnScaleBiasMeanVarDesc, const void *bnScale,
-    const void *bnBias,
-
-    double exponentialAverageFactor, void *resultRunningMean,
-    void *resultRunningVariance,
-
-    /* Has to be >= CUDNN_BN_MIN_EPSILON. Should be the same in forward and
-       backward functions. */
-    double epsilon,
-
-    /* Optionally save intermediate results from the forward pass here
-       - can be reused to speed up backward pass. NULL if unused */
-    void *resultSaveMean, void *resultSaveInvVariance,
-
-    cudnnActivationDescriptor_t activationDesc, void *workspace,
-    size_t workSpaceSizeInBytes, void *reserveSpace,
-    size_t reserveSpaceSizeInBytes) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, cudnnBatchNormMode_t, cudnnBatchNormOps_t, const void *,
-      const void *, const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, void *, const cudnnTensorDescriptor_t,
-      const void *, const void *, double, void *, void *, double, void *,
-      void *, cudnnActivationDescriptor_t, void *, size_t, void *, size_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnBatchNormalizationForwardTrainingEx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, mode, bnOps, alpha, beta, xDesc, xData, zDesc, zData,
-                  yDesc, yData, bnScaleBiasMeanVarDesc, bnScale, bnBias,
-                  exponentialAverageFactor, resultRunningMean,
-                  resultRunningVariance, epsilon, resultSaveMean,
-                  resultSaveInvVariance, activationDesc, workspace,
-                  workSpaceSizeInBytes, reserveSpace, reserveSpaceSizeInBytes);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnBatchNormalizationBackward(
-    cudnnHandle_t handle, cudnnBatchNormMode_t mode, const void *alphaDataDiff,
-    const void *betaDataDiff, const void *alphaParamDiff,
-    const void *betaParamDiff,
-    const cudnnTensorDescriptor_t xDesc, /* same desc for x, dx, dy */
-    const void *x, const cudnnTensorDescriptor_t dyDesc, const void *dy,
-    const cudnnTensorDescriptor_t dxDesc, void *dx,
-    /* Shared tensor desc for the 4 tensors below */
-    const cudnnTensorDescriptor_t dBnScaleBiasDesc,
-    const void *bnScale, /* bnBias doesn't affect backpropagation */
-    /* scale and bias diff are not backpropagated below this layer */
-    void *dBnScaleResult, void *dBnBiasResult,
-    /* Same epsilon as forward pass */
-    double epsilon,
-
-    /* Optionally cached intermediate results from
-       forward pass */
-    const void *savedMean, const void *savedInvVariance) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, cudnnBatchNormMode_t, const void *, const void *,
-      const void *, const void *, const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, void *, const cudnnTensorDescriptor_t,
-      const void *, void *, void *, double, const void *, const void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnBatchNormalizationBackward");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, mode, alphaDataDiff, betaDataDiff, alphaParamDiff,
-                  betaParamDiff, xDesc, x, dyDesc, dy, dxDesc, dx,
-                  dBnScaleBiasDesc, bnScale, dBnScaleResult, dBnBiasResult,
-                  epsilon, savedMean, savedInvVariance);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnBatchNormalizationBackwardEx(
-    cudnnHandle_t handle, cudnnBatchNormMode_t mode, cudnnBatchNormOps_t bnOps,
-
-    const void *alphaDataDiff, const void *betaDataDiff,
-    const void *alphaParamDiff, const void *betaParamDiff,
-    const cudnnTensorDescriptor_t xDesc, const void *xData,
-    const cudnnTensorDescriptor_t yDesc, const void *yData,
-    const cudnnTensorDescriptor_t dyDesc, const void *dyData,
-    const cudnnTensorDescriptor_t dzDesc, void *dzData,
-    const cudnnTensorDescriptor_t dxDesc, void *dxData,
-
-    /* Shared tensor desc for the 4 tensors below */
-    const cudnnTensorDescriptor_t dBnScaleBiasDesc, const void *bnScaleData,
-    const void *bnBiasData, /* needed if there is activation */
-    void *dBnScaleData, void *dBnBiasData,
-    double epsilon, /* Same epsilon as forward pass */
-
-    /* Optionally cached intermediate results from
-       forward pass */
-    const void *savedMean, const void *savedInvVariance,
-    cudnnActivationDescriptor_t activationDesc, void *workSpace,
-    size_t workSpaceSizeInBytes, void *reserveSpace,
-    size_t reserveSpaceSizeInBytes) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, cudnnBatchNormMode_t, cudnnBatchNormOps_t, const void *,
-      const void *, const void *, const void *, const cudnnTensorDescriptor_t,
-      const void *, const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, void *, const cudnnTensorDescriptor_t,
-      void *, const cudnnTensorDescriptor_t, const void *, const void *, void *,
-      void *, double, const void *, const void *, cudnnActivationDescriptor_t,
-      void *, size_t, void *, size_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnBatchNormalizationBackwardEx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(
-      handle, mode, bnOps, alphaDataDiff, betaDataDiff, alphaParamDiff,
-      betaParamDiff, xDesc, xData, yDesc, yData, dyDesc, dyData, dzDesc, dzData,
-      dxDesc, dxData, dBnScaleBiasDesc, bnScaleData, bnBiasData, dBnScaleData,
-      dBnBiasData, epsilon, savedMean, savedInvVariance, activationDesc,
-      workSpace, workSpaceSizeInBytes, reserveSpace, reserveSpaceSizeInBytes);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSpatialTfGridGeneratorBackward(
-    cudnnHandle_t handle, const cudnnSpatialTransformerDescriptor_t stDesc,
-    const void *dgrid, void *dtheta) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnSpatialTransformerDescriptor_t, const void *,
-      void *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnSpatialTfGridGeneratorBackward");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, stDesc, dgrid, dtheta);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSpatialTfSamplerBackward(
-    cudnnHandle_t handle, cudnnSpatialTransformerDescriptor_t stDesc,
-    const void *alpha, const cudnnTensorDescriptor_t xDesc, const void *x,
-    const void *beta, const cudnnTensorDescriptor_t dxDesc, void *dx,
-    const void *alphaDgrid, const cudnnTensorDescriptor_t dyDesc,
-    const void *dy, const void *grid, const void *betaDgrid, void *dgrid) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, cudnnSpatialTransformerDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *, const void *,
-      const cudnnTensorDescriptor_t, void *, const void *,
-      const cudnnTensorDescriptor_t, const void *, const void *, const void *,
-      void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSpatialTfSamplerBackward");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, stDesc, alpha, xDesc, x, beta, dxDesc, dx, alphaDgrid,
-                  dyDesc, dy, grid, betaDgrid, dgrid);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnDropoutBackward(
-    cudnnHandle_t handle, const cudnnDropoutDescriptor_t dropoutDesc,
-    const cudnnTensorDescriptor_t dydesc, const void *dy,
-    const cudnnTensorDescriptor_t dxdesc, void *dx, void *reserveSpace,
-    size_t reserveSpaceSizeInBytes) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnDropoutDescriptor_t,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, void *, void *, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDropoutBackward");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dropoutDesc, dydesc, dy, dxdesc, dx, reserveSpace,
-                  reserveSpaceSizeInBytes);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnOpsTrainVersionCheck(void) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)();
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnOpsTrainVersionCheck");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr();
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionBackwardFilterAlgorithmMaxCount(
-    cudnnHandle_t handle, int *count) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnHandle_t, int *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnGetConvolutionBackwardFilterAlgorithmMaxCount");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, count);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnFindConvolutionBackwardFilterAlgorithm(
-    cudnnHandle_t handle, const cudnnTensorDescriptor_t xDesc,
-    const cudnnTensorDescriptor_t dyDesc,
-    const cudnnConvolutionDescriptor_t convDesc,
-    const cudnnFilterDescriptor_t dwDesc, const int requestedAlgoCount,
-    int *returnedAlgoCount, cudnnConvolutionBwdFilterAlgoPerf_t *perfResults) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnTensorDescriptor_t,
-      const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t,
-      const cudnnFilterDescriptor_t, const int, int *,
-      cudnnConvolutionBwdFilterAlgoPerf_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnFindConvolutionBackwardFilterAlgorithm");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, xDesc, dyDesc, convDesc, dwDesc, requestedAlgoCount,
-                  returnedAlgoCount, perfResults);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnFindConvolutionBackwardFilterAlgorithmEx(
-    cudnnHandle_t handle, const cudnnTensorDescriptor_t xDesc, const void *x,
-    const cudnnTensorDescriptor_t dyDesc, const void *y,
-    const cudnnConvolutionDescriptor_t convDesc,
-    const cudnnFilterDescriptor_t dwDesc, void *dw,
-    const int requestedAlgoCount, int *returnedAlgoCount,
-    cudnnConvolutionBwdFilterAlgoPerf_t *perfResults, void *workSpace,
-    size_t workSpaceSizeInBytes) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnConvolutionDescriptor_t, const cudnnFilterDescriptor_t, void *,
-      const int, int *, cudnnConvolutionBwdFilterAlgoPerf_t *, void *, size_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnFindConvolutionBackwardFilterAlgorithmEx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, xDesc, x, dyDesc, y, convDesc, dwDesc, dw,
-                  requestedAlgoCount, returnedAlgoCount, perfResults, workSpace,
-                  workSpaceSizeInBytes);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionBackwardFilterAlgorithm_v7(
-    cudnnHandle_t handle, const cudnnTensorDescriptor_t srcDesc,
-    const cudnnTensorDescriptor_t diffDesc,
-    const cudnnConvolutionDescriptor_t convDesc,
-    const cudnnFilterDescriptor_t gradDesc, const int requestedAlgoCount,
-    int *returnedAlgoCount, cudnnConvolutionBwdFilterAlgoPerf_t *perfResults) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnTensorDescriptor_t,
-      const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t,
-      const cudnnFilterDescriptor_t, const int, int *,
-      cudnnConvolutionBwdFilterAlgoPerf_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnGetConvolutionBackwardFilterAlgorithm_v7");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, srcDesc, diffDesc, convDesc, gradDesc,
-                  requestedAlgoCount, returnedAlgoCount, perfResults);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionBackwardFilterWorkspaceSize(
-    cudnnHandle_t handle, const cudnnTensorDescriptor_t xDesc,
-    const cudnnTensorDescriptor_t dyDesc,
-    const cudnnConvolutionDescriptor_t convDesc,
-    const cudnnFilterDescriptor_t gradDesc,
-    cudnnConvolutionBwdFilterAlgo_t algo, size_t *sizeInBytes) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnTensorDescriptor_t,
-      const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t,
-      const cudnnFilterDescriptor_t, cudnnConvolutionBwdFilterAlgo_t, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnGetConvolutionBackwardFilterWorkspaceSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, xDesc, dyDesc, convDesc, gradDesc, algo, sizeInBytes);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnConvolutionBackwardFilter(
-    cudnnHandle_t handle, const void *alpha,
-    const cudnnTensorDescriptor_t xDesc, const void *x,
-    const cudnnTensorDescriptor_t dyDesc, const void *dy,
-    const cudnnConvolutionDescriptor_t convDesc,
-    cudnnConvolutionBwdFilterAlgo_t algo, void *workSpace,
-    size_t workSpaceSizeInBytes, const void *beta,
-    const cudnnFilterDescriptor_t dwDesc, void *dw) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const void *, const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnConvolutionDescriptor_t, cudnnConvolutionBwdFilterAlgo_t,
-      void *, size_t, const void *, const cudnnFilterDescriptor_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnConvolutionBackwardFilter");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, alpha, xDesc, x, dyDesc, dy, convDesc, algo,
-                  workSpace, workSpaceSizeInBytes, beta, dwDesc, dw);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnConvolutionBackwardBias(
-    cudnnHandle_t handle, const void *alpha,
-    const cudnnTensorDescriptor_t dyDesc, const void *dy, const void *beta,
-    const cudnnTensorDescriptor_t dbDesc, void *db) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const void *, const cudnnTensorDescriptor_t, const void *,
-      const void *, const cudnnTensorDescriptor_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnConvolutionBackwardBias");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, alpha, dyDesc, dy, beta, dbDesc, db);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetRNNTrainingReserveSize(
-    cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc,
-    const int seqLength, const cudnnTensorDescriptor_t *xDesc,
-    size_t *sizeInBytes) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnRNNDescriptor_t, const int,
-      const cudnnTensorDescriptor_t *, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetRNNTrainingReserveSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, rnnDesc, seqLength, xDesc, sizeInBytes);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnRNNForwardTraining(
-    cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc,
-    const int seqLength, const cudnnTensorDescriptor_t *xDesc, const void *x,
-    const cudnnTensorDescriptor_t hxDesc, const void *hx,
-    const cudnnTensorDescriptor_t cxDesc, const void *cx,
-    const cudnnFilterDescriptor_t wDesc, const void *w,
-    const cudnnTensorDescriptor_t *yDesc, void *y,
-    const cudnnTensorDescriptor_t hyDesc, void *hy,
-    const cudnnTensorDescriptor_t cyDesc, void *cy, void *workspace,
-    size_t workSpaceSizeInBytes, void *reserveSpace,
-    size_t reserveSpaceSizeInBytes) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnRNNDescriptor_t, const int,
-      const cudnnTensorDescriptor_t *, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnFilterDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t *, void *, const cudnnTensorDescriptor_t,
-      void *, const cudnnTensorDescriptor_t, void *, void *, size_t, void *,
-      size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnRNNForwardTraining");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, rnnDesc, seqLength, xDesc, x, hxDesc, hx, cxDesc, cx,
-                  wDesc, w, yDesc, y, hyDesc, hy, cyDesc, cy, workspace,
-                  workSpaceSizeInBytes, reserveSpace, reserveSpaceSizeInBytes);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnRNNBackwardData(cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc,
-                     const int seqLength, const cudnnTensorDescriptor_t *yDesc,
-                     const void *y, const cudnnTensorDescriptor_t *dyDesc,
-                     const void *dy, const cudnnTensorDescriptor_t dhyDesc,
-                     const void *dhy, const cudnnTensorDescriptor_t dcyDesc,
-                     const void *dcy, const cudnnFilterDescriptor_t wDesc,
-                     const void *w, const cudnnTensorDescriptor_t hxDesc,
-                     const void *hx, const cudnnTensorDescriptor_t cxDesc,
-                     const void *cx, const cudnnTensorDescriptor_t *dxDesc,
-                     void *dx, const cudnnTensorDescriptor_t dhxDesc, void *dhx,
-                     const cudnnTensorDescriptor_t dcxDesc, void *dcx,
-                     void *workspace, size_t workSpaceSizeInBytes,
-                     void *reserveSpace, size_t reserveSpaceSizeInBytes) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnRNNDescriptor_t, const int,
-      const cudnnTensorDescriptor_t *, const void *,
-      const cudnnTensorDescriptor_t *, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnFilterDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t *, void *, const cudnnTensorDescriptor_t,
-      void *, const cudnnTensorDescriptor_t, void *, void *, size_t, void *,
-      size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnRNNBackwardData");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, rnnDesc, seqLength, yDesc, y, dyDesc, dy, dhyDesc,
-                  dhy, dcyDesc, dcy, wDesc, w, hxDesc, hx, cxDesc, cx, dxDesc,
-                  dx, dhxDesc, dhx, dcxDesc, dcx, workspace,
-                  workSpaceSizeInBytes, reserveSpace, reserveSpaceSizeInBytes);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnRNNBackwardWeights(
-    cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc,
-    const int seqLength, const cudnnTensorDescriptor_t *xDesc, const void *x,
-    const cudnnTensorDescriptor_t hxDesc, const void *hx,
-    const cudnnTensorDescriptor_t *yDesc, const void *y, const void *workspace,
-    size_t workSpaceSizeInBytes, const cudnnFilterDescriptor_t dwDesc, void *dw,
-    const void *reserveSpace, size_t reserveSpaceSizeInBytes) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnRNNDescriptor_t, const int,
-      const cudnnTensorDescriptor_t *, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t *, const void *, const void *, size_t,
-      const cudnnFilterDescriptor_t, void *, const void *, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnRNNBackwardWeights");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, rnnDesc, seqLength, xDesc, x, hxDesc, hx, yDesc, y,
-                  workspace, workSpaceSizeInBytes, dwDesc, dw, reserveSpace,
-                  reserveSpaceSizeInBytes);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnRNNForwardTrainingEx(
-    cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc,
-    const cudnnRNNDataDescriptor_t xDesc, const void *x,
-    const cudnnTensorDescriptor_t hxDesc, const void *hx,
-    const cudnnTensorDescriptor_t cxDesc, const void *cx,
-    const cudnnFilterDescriptor_t wDesc, const void *w,
-    const cudnnRNNDataDescriptor_t yDesc, void *y,
-    const cudnnTensorDescriptor_t hyDesc, void *hy,
-    const cudnnTensorDescriptor_t cyDesc, void *cy,
-    const cudnnRNNDataDescriptor_t kDesc, /* reserved, should pass NULL */
-    const void *keys,                     /* reserved, should pass NULL */
-    const cudnnRNNDataDescriptor_t cDesc, /* reserved, should pass NULL */
-    void *cAttn,                          /* reserved, should pass NULL */
-    const cudnnRNNDataDescriptor_t iDesc, /* reserved, should pass NULL */
-    void *iAttn,                          /* reserved, should pass NULL */
-    const cudnnRNNDataDescriptor_t qDesc, /* reserved, should pass NULL */
-    void *queries,                        /* reserved, should pass NULL */
-    void *workSpace, size_t workSpaceSizeInBytes, void *reserveSpace,
-    size_t reserveSpaceSizeInBytes) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnRNNDescriptor_t, const cudnnRNNDataDescriptor_t,
-      const void *, const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnFilterDescriptor_t, const void *,
-      const cudnnRNNDataDescriptor_t, void *, const cudnnTensorDescriptor_t,
-      void *, const cudnnTensorDescriptor_t, void *,
-      const cudnnRNNDataDescriptor_t, const void *,
-      const cudnnRNNDataDescriptor_t, void *, const cudnnRNNDataDescriptor_t,
-      void *, const cudnnRNNDataDescriptor_t, void *, void *, size_t, void *,
-      size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnRNNForwardTrainingEx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, rnnDesc, xDesc, x, hxDesc, hx, cxDesc, cx, wDesc, w,
-                  yDesc, y, hyDesc, hy, cyDesc, cy, kDesc, keys, cDesc, cAttn,
-                  iDesc, iAttn, qDesc, queries, workSpace, workSpaceSizeInBytes,
-                  reserveSpace, reserveSpaceSizeInBytes);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnRNNForward(
-    cudnnHandle_t handle, cudnnRNNDescriptor_t rnnDesc,
-    cudnnForwardMode_t fwdMode, const int32_t devSeqLengths[],
-    cudnnRNNDataDescriptor_t xDesc, const void *x,
-    cudnnRNNDataDescriptor_t yDesc, void *y, cudnnTensorDescriptor_t hDesc,
-    const void *hx, void *hy, cudnnTensorDescriptor_t cDesc, const void *cx,
-    void *cy, size_t weightSpaceSize, const void *weightSpace,
-    size_t workSpaceSize, void *workSpace, size_t reserveSpaceSize,
-    void *reserveSpace) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, cudnnRNNDescriptor_t, cudnnForwardMode_t, const int32_t[],
-      cudnnRNNDataDescriptor_t, const void *, cudnnRNNDataDescriptor_t, void *,
-      cudnnTensorDescriptor_t, const void *, void *, cudnnTensorDescriptor_t,
-      const void *, void *, size_t, const void *, size_t, void *, size_t,
-      void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnRNNForward");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, rnnDesc, fwdMode, devSeqLengths, xDesc, x, yDesc, y,
-                  hDesc, hx, hy, cDesc, cx, cy, weightSpaceSize, weightSpace,
-                  workSpaceSize, workSpace, reserveSpaceSize, reserveSpace);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnRNNBackwardDataEx(
-    cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc,
-    const cudnnRNNDataDescriptor_t yDesc, const void *y,
-    const cudnnRNNDataDescriptor_t dyDesc, const void *dy,
-    const cudnnRNNDataDescriptor_t dcDesc, /* reserved, should pass NULL */
-    const void *dcAttn,                    /* reserved, should pass NULL */
-    const cudnnTensorDescriptor_t dhyDesc, const void *dhy,
-    const cudnnTensorDescriptor_t dcyDesc, const void *dcy,
-    const cudnnFilterDescriptor_t wDesc, const void *w,
-    const cudnnTensorDescriptor_t hxDesc, const void *hx,
-    const cudnnTensorDescriptor_t cxDesc, const void *cx,
-    const cudnnRNNDataDescriptor_t dxDesc, void *dx,
-    const cudnnTensorDescriptor_t dhxDesc, void *dhx,
-    const cudnnTensorDescriptor_t dcxDesc, void *dcx,
-    const cudnnRNNDataDescriptor_t dkDesc, /* reserved, should pass NULL */
-    void *dkeys,                           /* reserved, should pass NULL */
-    void *workSpace, size_t workSpaceSizeInBytes, void *reserveSpace,
-    size_t reserveSpaceSizeInBytes) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnRNNDescriptor_t, const cudnnRNNDataDescriptor_t,
-      const void *, const cudnnRNNDataDescriptor_t, const void *,
-      const cudnnRNNDataDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnFilterDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnTensorDescriptor_t, const void *,
-      const cudnnRNNDataDescriptor_t, void *, const cudnnTensorDescriptor_t,
-      void *, const cudnnTensorDescriptor_t, void *,
-      const cudnnRNNDataDescriptor_t, void *, void *, size_t, void *, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnRNNBackwardDataEx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, rnnDesc, yDesc, y, dyDesc, dy, dcDesc, dcAttn,
-                  dhyDesc, dhy, dcyDesc, dcy, wDesc, w, hxDesc, hx, cxDesc, cx,
-                  dxDesc, dx, dhxDesc, dhx, dcxDesc, dcx, dkDesc, dkeys,
-                  workSpace, workSpaceSizeInBytes, reserveSpace,
-                  reserveSpaceSizeInBytes);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnRNNBackwardData_v8(
-    cudnnHandle_t handle, cudnnRNNDescriptor_t rnnDesc,
-    const int32_t devSeqLengths[], cudnnRNNDataDescriptor_t yDesc,
-    const void *y, const void *dy, cudnnRNNDataDescriptor_t xDesc, void *dx,
-    cudnnTensorDescriptor_t hDesc, const void *hx, const void *dhy, void *dhx,
-    cudnnTensorDescriptor_t cDesc, const void *cx, const void *dcy, void *dcx,
-    size_t weightSpaceSize, const void *weightSpace, size_t workSpaceSize,
-    void *workSpace, size_t reserveSpaceSize, void *reserveSpace) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, cudnnRNNDescriptor_t, const int32_t[],
-      cudnnRNNDataDescriptor_t, const void *, const void *,
-      cudnnRNNDataDescriptor_t, void *, cudnnTensorDescriptor_t, const void *,
-      const void *, void *, cudnnTensorDescriptor_t, const void *, const void *,
-      void *, size_t, const void *, size_t, void *, size_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnRNNBackwardData_v8");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, rnnDesc, devSeqLengths, yDesc, y, dy, xDesc, dx,
-                  hDesc, hx, dhy, dhx, cDesc, cx, dcy, dcx, weightSpaceSize,
-                  weightSpace, workSpaceSize, workSpace, reserveSpaceSize,
-                  reserveSpace);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnRNNBackwardWeightsEx(
-    cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc,
-    const cudnnRNNDataDescriptor_t xDesc, const void *x,
-    const cudnnTensorDescriptor_t hxDesc, const void *hx,
-    const cudnnRNNDataDescriptor_t yDesc, const void *y, void *workSpace,
-    size_t workSpaceSizeInBytes, const cudnnFilterDescriptor_t dwDesc, void *dw,
-    void *reserveSpace, size_t reserveSpaceSizeInBytes) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnRNNDescriptor_t, const cudnnRNNDataDescriptor_t,
-      const void *, const cudnnTensorDescriptor_t, const void *,
-      const cudnnRNNDataDescriptor_t, const void *, void *, size_t,
-      const cudnnFilterDescriptor_t, void *, void *, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnRNNBackwardWeightsEx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, rnnDesc, xDesc, x, hxDesc, hx, yDesc, y, workSpace,
-                  workSpaceSizeInBytes, dwDesc, dw, reserveSpace,
-                  reserveSpaceSizeInBytes);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnRNNBackwardWeights_v8(
-    cudnnHandle_t handle, cudnnRNNDescriptor_t rnnDesc,
-    cudnnWgradMode_t addGrad, const int32_t devSeqLengths[],
-    cudnnRNNDataDescriptor_t xDesc, const void *x,
-    cudnnTensorDescriptor_t hDesc, const void *hx,
-    cudnnRNNDataDescriptor_t yDesc, const void *y, size_t weightSpaceSize,
-    void *dweightSpace, size_t workSpaceSize, void *workSpace,
-    size_t reserveSpaceSize, void *reserveSpace) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, cudnnRNNDescriptor_t, cudnnWgradMode_t, const int32_t[],
-      cudnnRNNDataDescriptor_t, const void *, cudnnTensorDescriptor_t,
-      const void *, cudnnRNNDataDescriptor_t, const void *, size_t, void *,
-      size_t, void *, size_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnRNNBackwardWeights_v8");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, rnnDesc, addGrad, devSeqLengths, xDesc, x, hDesc, hx,
-                  yDesc, y, weightSpaceSize, dweightSpace, workSpaceSize,
-                  workSpace, reserveSpaceSize, reserveSpace);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnMultiHeadAttnBackwardData(
-    cudnnHandle_t handle, const cudnnAttnDescriptor_t attnDesc,
-    const int loWinIdx[], const int hiWinIdx[], const int devSeqLengthsDQDO[],
-    const int devSeqLengthsDKDV[], const cudnnSeqDataDescriptor_t doDesc,
-    const void *dout, const cudnnSeqDataDescriptor_t dqDesc, void *dqueries,
-    const void *queries, const cudnnSeqDataDescriptor_t dkDesc, void *dkeys,
-    const void *keys, const cudnnSeqDataDescriptor_t dvDesc, void *dvalues,
-    const void *values, size_t weightSizeInBytes, const void *weights,
-    size_t workSpaceSizeInBytes, void *workSpace,
-    size_t reserveSpaceSizeInBytes, void *reserveSpace) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnAttnDescriptor_t, const int[], const int[],
-      const int[], const int[], const cudnnSeqDataDescriptor_t, const void *,
-      const cudnnSeqDataDescriptor_t, void *, const void *,
-      const cudnnSeqDataDescriptor_t, void *, const void *,
-      const cudnnSeqDataDescriptor_t, void *, const void *, size_t,
-      const void *, size_t, void *, size_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnMultiHeadAttnBackwardData");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, attnDesc, loWinIdx, hiWinIdx, devSeqLengthsDQDO,
-                  devSeqLengthsDKDV, doDesc, dout, dqDesc, dqueries, queries,
-                  dkDesc, dkeys, keys, dvDesc, dvalues, values,
-                  weightSizeInBytes, weights, workSpaceSizeInBytes, workSpace,
-                  reserveSpaceSizeInBytes, reserveSpace);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnMultiHeadAttnBackwardWeights(
-    cudnnHandle_t handle, const cudnnAttnDescriptor_t attnDesc,
-    cudnnWgradMode_t addGrad, const cudnnSeqDataDescriptor_t qDesc,
-    const void *queries, const cudnnSeqDataDescriptor_t kDesc, const void *keys,
-    const cudnnSeqDataDescriptor_t vDesc, const void *values,
-    const cudnnSeqDataDescriptor_t doDesc, const void *dout,
-    size_t weightSizeInBytes, const void *weights, void *dweights,
-    size_t workSpaceSizeInBytes, void *workSpace,
-    size_t reserveSpaceSizeInBytes, void *reserveSpace) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnAttnDescriptor_t, cudnnWgradMode_t,
-      const cudnnSeqDataDescriptor_t, const void *,
-      const cudnnSeqDataDescriptor_t, const void *,
-      const cudnnSeqDataDescriptor_t, const void *,
-      const cudnnSeqDataDescriptor_t, const void *, size_t, const void *,
-      void *, size_t, void *, size_t, void *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cudnnMultiHeadAttnBackwardWeights");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, attnDesc, addGrad, qDesc, queries, kDesc, keys, vDesc,
-                  values, doDesc, dout, weightSizeInBytes, weights, dweights,
-                  workSpaceSizeInBytes, workSpace, reserveSpaceSizeInBytes,
-                  reserveSpace);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnCreateCTCLossDescriptor(cudnnCTCLossDescriptor_t *ctcLossDesc) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnCTCLossDescriptor_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnCreateCTCLossDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(ctcLossDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetCTCLossDescriptor(
-    cudnnCTCLossDescriptor_t ctcLossDesc, cudnnDataType_t compType) {
-  using FuncPtr =
-      cudnnStatus_t(CUDNNWINAPI *)(cudnnCTCLossDescriptor_t, cudnnDataType_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetCTCLossDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(ctcLossDesc, compType);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetCTCLossDescriptorEx(
-    cudnnCTCLossDescriptor_t ctcLossDesc, cudnnDataType_t compType,
-    cudnnLossNormalizationMode_t normMode, cudnnNanPropagation_t gradMode) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnCTCLossDescriptor_t, cudnnDataType_t, cudnnLossNormalizationMode_t,
-      cudnnNanPropagation_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetCTCLossDescriptorEx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(ctcLossDesc, compType, normMode, gradMode);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnSetCTCLossDescriptor_v8(
-    cudnnCTCLossDescriptor_t ctcLossDesc, cudnnDataType_t compType,
-    cudnnLossNormalizationMode_t normMode, cudnnNanPropagation_t gradMode,
-    int maxLabelLength) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnCTCLossDescriptor_t, cudnnDataType_t, cudnnLossNormalizationMode_t,
-      cudnnNanPropagation_t, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetCTCLossDescriptor_v8");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(ctcLossDesc, compType, normMode, gradMode, maxLabelLength);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetCTCLossDescriptor(
-    cudnnCTCLossDescriptor_t ctcLossDesc, cudnnDataType_t *compType) {
-  using FuncPtr =
-      cudnnStatus_t(CUDNNWINAPI *)(cudnnCTCLossDescriptor_t, cudnnDataType_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetCTCLossDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(ctcLossDesc, compType);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetCTCLossDescriptorEx(
-    cudnnCTCLossDescriptor_t ctcLossDesc, cudnnDataType_t *compType,
-    cudnnLossNormalizationMode_t *normMode, cudnnNanPropagation_t *gradMode) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnCTCLossDescriptor_t, cudnnDataType_t *,
-      cudnnLossNormalizationMode_t *, cudnnNanPropagation_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetCTCLossDescriptorEx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(ctcLossDesc, compType, normMode, gradMode);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetCTCLossDescriptor_v8(
-    cudnnCTCLossDescriptor_t ctcLossDesc, cudnnDataType_t *compType,
-    cudnnLossNormalizationMode_t *normMode, cudnnNanPropagation_t *gradMode,
-    int *maxLabelLength) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnCTCLossDescriptor_t, cudnnDataType_t *,
-      cudnnLossNormalizationMode_t *, cudnnNanPropagation_t *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetCTCLossDescriptor_v8");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(ctcLossDesc, compType, normMode, gradMode, maxLabelLength);
-}
-
-cudnnStatus_t CUDNNWINAPI
-cudnnDestroyCTCLossDescriptor(cudnnCTCLossDescriptor_t ctcLossDesc) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnCTCLossDescriptor_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDestroyCTCLossDescriptor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(ctcLossDesc);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnCTCLoss(
-    cudnnHandle_t handle,
-    const cudnnTensorDescriptor_t
-        probsDesc, /* Tensor descriptor for probabilities, the dimensions are
-                      T,N,A (T is the timing steps, N is the
-                      mini batch size, A is the alphabet size)  */
-    const void *probs,      /* probabilities after softmax, in GPU memory */
-    const int hostLabels[], /* labels, in CPU memory */
-    const int hostLabelLengths[], /* the length of each label, in CPU memory */
-    const int hostInputLengths[], /* the lengths of timing steps in each batch,
-                                     in CPU memory */
-    void *costs,                  /* the returned costs of CTC, in GPU memory */
-    const cudnnTensorDescriptor_t
-        gradientsDesc, /* Tensor descriptor for gradients, the dimensions are
-                          T,N,A */
-    void *gradients,   /* the returned CTC gradients, in GPU memory, to compute
-                          costs only, set it to NULL */
-    cudnnCTCLossAlgo_t algo, /* algorithm selected, supported now 0 and 1 */
-    cudnnCTCLossDescriptor_t ctcLossDesc,
-    void *workspace, /* pointer to the workspace, in GPU memory */
-    size_t workSpaceSizeInBytes) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnTensorDescriptor_t, const void *, const int[],
-      const int[], const int[], void *, const cudnnTensorDescriptor_t, void *,
-      cudnnCTCLossAlgo_t, cudnnCTCLossDescriptor_t, void *, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnCTCLoss");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, probsDesc, probs, hostLabels, hostLabelLengths,
-                  hostInputLengths, costs, gradientsDesc, gradients, algo,
-                  ctcLossDesc, workspace, workSpaceSizeInBytes);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnCTCLoss_v8(
-    cudnnHandle_t handle,
-    cudnnCTCLossAlgo_t algo, /* algorithm selected, supported now 0 and 1 */
-    cudnnCTCLossDescriptor_t ctcLossDesc,
-    const cudnnTensorDescriptor_t
-        probsDesc, /* Tensor descriptor for probabilities, the dimensions are
-                      T,N,A (T is the timing steps, N is the
-                      mini batch size, A is the alphabet size)  */
-    const void *probs,        /* probabilities after softmax, in GPU memory */
-    const int labels[],       /* labels, in GPU memory */
-    const int labelLengths[], /* the length of each label, in GPU memory */
-    const int inputLengths[], /* the lengths of timing steps in each batch, in
-                                 GPU memory */
-    void *costs,              /* the returned costs of CTC, in GPU memory */
-    const cudnnTensorDescriptor_t
-        gradientsDesc, /* Tensor descriptor for gradients, the dimensions are
-                          T,N,A */
-    void *gradients,   /* the returned CTC gradients, in GPU memory, to compute
-                          costs only, set it to NULL */
-    size_t workSpaceSizeInBytes, /* size of the workspace */
-    void *workspace) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, cudnnCTCLossAlgo_t, cudnnCTCLossDescriptor_t,
-      const cudnnTensorDescriptor_t, const void *, const int[], const int[],
-      const int[], void *, const cudnnTensorDescriptor_t, void *, size_t,
-      void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnCTCLoss_v8");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, algo, ctcLossDesc, probsDesc, probs, labels,
-                  labelLengths, inputLengths, costs, gradientsDesc, gradients,
-                  workSpaceSizeInBytes, workspace);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetCTCLossWorkspaceSize(
-    cudnnHandle_t handle,
-    const cudnnTensorDescriptor_t
-        probsDesc, /* Tensor descriptor for probabilities, the dimensions are
-                      T,N,A (T is the
-                      timing steps, N is the mini batch size, A is the alphabet
-                      size) */
-    const cudnnTensorDescriptor_t
-        gradientsDesc,       /* Tensor descriptor for gradients, the
-                                dimensions are T,N,A. To compute costs
-                                only, set it to NULL */
-    const int *labels,       /* labels, in CPU memory */
-    const int *labelLengths, /* the length of each label, in CPU memory */
-    const int *inputLengths, /* the lengths of timing steps in each batch, in
-                                CPU memory */
-    cudnnCTCLossAlgo_t algo, /* algorithm selected, supported now 0 and 1 */
-    cudnnCTCLossDescriptor_t ctcLossDesc, size_t *sizeInBytes) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, const cudnnTensorDescriptor_t,
-      const cudnnTensorDescriptor_t, const int *, const int *, const int *,
-      cudnnCTCLossAlgo_t, cudnnCTCLossDescriptor_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetCTCLossWorkspaceSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, probsDesc, gradientsDesc, labels, labelLengths,
-                  inputLengths, algo, ctcLossDesc, sizeInBytes);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnGetCTCLossWorkspaceSize_v8(
-    cudnnHandle_t handle,
-    cudnnCTCLossAlgo_t algo, /* algorithm selected, supported now 0 and 1 */
-    cudnnCTCLossDescriptor_t ctcLossDesc,
-    const cudnnTensorDescriptor_t
-        probsDesc, /* Tensor descriptor for probabilities, the dimensions are
-                      T,N,A (T is the
-                      timing steps, N is the mini batch size, A is the alphabet
-                      size) */
-    const cudnnTensorDescriptor_t
-        gradientsDesc, /* Tensor descriptor for gradients, the
-                          dimensions are T,N,A. To compute costs
-                          only, set it to NULL */
-    size_t *sizeInBytes) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(
-      cudnnHandle_t, cudnnCTCLossAlgo_t, cudnnCTCLossDescriptor_t,
-      const cudnnTensorDescriptor_t, const cudnnTensorDescriptor_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetCTCLossWorkspaceSize_v8");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, algo, ctcLossDesc, probsDesc, gradientsDesc,
-                  sizeInBytes);
-}
-
-cudnnStatus_t CUDNNWINAPI cudnnAdvTrainVersionCheck(void) {
-  using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)();
-  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnAdvTrainVersionCheck");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr();
-}
-
-}  // extern "C"
diff --git a/third_party/xla/third_party/tsl/tsl/cuda/cudnn_stub.cc b/third_party/xla/third_party/tsl/tsl/cuda/cudnn_stub.cc
index 1f86d27e100f22..705dae12f903f3 100644
--- a/third_party/xla/third_party/tsl/tsl/cuda/cudnn_stub.cc
+++ b/third_party/xla/third_party/tsl/tsl/cuda/cudnn_stub.cc
@@ -33,35 +33,40 @@ void* GetDsoHandle() {
 #endif
 }
 
-template <typename T>
-T LoadSymbol(const char* symbol_name) {
+void* LoadSymbol(const char* symbol_name) {
   void* symbol = nullptr;
   if (auto handle = GetDsoHandle()) {
     tsl::Env::Default()
         ->GetSymbolFromLibrary(handle, symbol_name, &symbol)
         .IgnoreError();
   }
-  return reinterpret_cast<T>(symbol);
+  return symbol;
 }
 
-cudnnStatus_t GetSymbolNotFoundError() { return CUDNN_STATUS_INTERNAL_ERROR; }
+const char* kSymbols[] = {
+#include "tsl/cuda/cudnn.inc"
+};
+
+constexpr size_t kNumSymbols = sizeof(kSymbols) / sizeof(const char*);
+
 }  // namespace
 
-#if CUDNN_MAJOR < 6
-#error cuDNN version earlier than 6 is not supported.
-#elif CUDNN_MAJOR < 7
-#include "tsl/cuda/cudnn_6_0.inc"
-#elif CUDNN_MAJOR == 7 && CUDNN_MINOR < 1
-#include "tsl/cuda/cudnn_7_0.inc"
-// 2 instead of 3: see https://github.com/tensorflow/tensorflow/issues/32350
-#elif CUDNN_MAJOR == 7 && CUDNN_MINOR < 2
-#include "tsl/cuda/cudnn_7_1.inc"
-#elif CUDNN_MAJOR == 7 && CUDNN_MINOR < 4
-#include "tsl/cuda/cudnn_7_3.inc"
-#elif CUDNN_MAJOR == 7 && CUDNN_MINOR < 6
-#include "tsl/cuda/cudnn_7_4.inc"
-#elif CUDNN_MAJOR == 7
-#include "tsl/cuda/cudnn_7_6.inc"
-#else
-#include "tsl/cuda/cudnn_8_0.inc"
-#endif
+extern "C" {
+
+static cudnnStatus_t GetSymbolNotFoundError() {
+  return CUDNN_STATUS_INTERNAL_ERROR;
+}
+
+extern void* _cudnn_tramp_table[];
+
+void _cudnn_tramp_resolve(int i) {
+  CHECK_LE(0, i);
+  CHECK_LT(i, kNumSymbols);
+  void* p = LoadSymbol(kSymbols[i]);
+  if (!p) {
+    p = reinterpret_cast<void*>(&GetSymbolNotFoundError);
+  }
+  _cudnn_tramp_table[i] = p;
+}
+
+}  // extern "C"
diff --git a/third_party/xla/third_party/tsl/tsl/cuda/cufft.symbols b/third_party/xla/third_party/tsl/tsl/cuda/cufft.symbols
new file mode 100644
index 00000000000000..605815200bd90e
--- /dev/null
+++ b/third_party/xla/third_party/tsl/tsl/cuda/cufft.symbols
@@ -0,0 +1,58 @@
+cufftCreate
+cufftDebug
+cufftDestroy
+cufftEnterCS
+cufftEstimate1d
+cufftEstimate2d
+cufftEstimate3d
+cufftEstimateMany
+cufftExecC2C
+cufftExecC2R
+cufftExecD2Z
+cufftExecR2C
+cufftExecZ2D
+cufftExecZ2Z
+cufftGetProperty
+cufftGetSize
+cufftGetSize1d
+cufftGetSize2d
+cufftGetSize3d
+cufftGetSizeMany
+cufftGetSizeMany64
+cufftGetVersion
+cufftLeaveCS
+cufftMakePlan1d
+cufftMakePlan2d
+cufftMakePlan3d
+cufftMakePlanGuru64
+cufftMakePlanMany
+cufftMakePlanMany64
+cufftPlan1d
+cufftPlan2d
+cufftPlan3d
+cufftPlanMany
+cufftSetAutoAllocation
+cufftSetStream
+cufftSetWorkArea
+cufftXtClearCallback
+cufftXtExec
+cufftXtExecDescriptor
+cufftXtExecDescriptorC2C
+cufftXtExecDescriptorC2R
+cufftXtExecDescriptorD2Z
+cufftXtExecDescriptorR2C
+cufftXtExecDescriptorZ2D
+cufftXtExecDescriptorZ2Z
+cufftXtFree
+cufftXtGetSizeMany
+cufftXtMakePlanGuru64
+cufftXtMakePlanMany
+cufftXtMalloc
+cufftXtMemcpy
+cufftXtQueryPlan
+cufftXtSetCallback
+cufftXtSetCallbackSharedSize
+cufftXtSetDistribution
+cufftXtSetGPUs
+cufftXtSetWorkArea
+cufftXtSetWorkAreaPolicy
diff --git a/third_party/xla/third_party/tsl/tsl/cuda/cufft_10_0.inc b/third_party/xla/third_party/tsl/tsl/cuda/cufft_10_0.inc
deleted file mode 100644
index 72068864f748dc..00000000000000
--- a/third_party/xla/third_party/tsl/tsl/cuda/cufft_10_0.inc
+++ /dev/null
@@ -1,361 +0,0 @@
-// Auto-generated, do not edit.
-
-extern "C" {
-
-cufftResult CUFFTAPI cufftPlan1d(cufftHandle *plan, int nx, cufftType type,
-                                 int batch) {
-  using FuncPtr = cufftResult(CUFFTAPI *)(cufftHandle *, int, cufftType, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cufftPlan1d");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(plan, nx, type, batch);
-}
-
-cufftResult CUFFTAPI cufftPlan2d(cufftHandle *plan, int nx, int ny,
-                                 cufftType type) {
-  using FuncPtr = cufftResult(CUFFTAPI *)(cufftHandle *, int, int, cufftType);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cufftPlan2d");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(plan, nx, ny, type);
-}
-
-cufftResult CUFFTAPI cufftPlan3d(cufftHandle *plan, int nx, int ny, int nz,
-                                 cufftType type) {
-  using FuncPtr =
-      cufftResult(CUFFTAPI *)(cufftHandle *, int, int, int, cufftType);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cufftPlan3d");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(plan, nx, ny, nz, type);
-}
-
-cufftResult CUFFTAPI cufftPlanMany(cufftHandle *plan, int rank, int *n,
-                                   int *inembed, int istride, int idist,
-                                   int *onembed, int ostride, int odist,
-                                   cufftType type, int batch) {
-  using FuncPtr = cufftResult(CUFFTAPI *)(cufftHandle *, int, int *, int *, int,
-                                          int, int *, int, int, cufftType, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cufftPlanMany");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(plan, rank, n, inembed, istride, idist, onembed, ostride,
-                  odist, type, batch);
-}
-
-cufftResult CUFFTAPI cufftMakePlan1d(cufftHandle plan, int nx, cufftType type,
-                                     int batch, size_t *workSize) {
-  using FuncPtr =
-      cufftResult(CUFFTAPI *)(cufftHandle, int, cufftType, int, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cufftMakePlan1d");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(plan, nx, type, batch, workSize);
-}
-
-cufftResult CUFFTAPI cufftMakePlan2d(cufftHandle plan, int nx, int ny,
-                                     cufftType type, size_t *workSize) {
-  using FuncPtr =
-      cufftResult(CUFFTAPI *)(cufftHandle, int, int, cufftType, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cufftMakePlan2d");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(plan, nx, ny, type, workSize);
-}
-
-cufftResult CUFFTAPI cufftMakePlan3d(cufftHandle plan, int nx, int ny, int nz,
-                                     cufftType type, size_t *workSize) {
-  using FuncPtr =
-      cufftResult(CUFFTAPI *)(cufftHandle, int, int, int, cufftType, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cufftMakePlan3d");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(plan, nx, ny, nz, type, workSize);
-}
-
-cufftResult CUFFTAPI cufftMakePlanMany(cufftHandle plan, int rank, int *n,
-                                       int *inembed, int istride, int idist,
-                                       int *onembed, int ostride, int odist,
-                                       cufftType type, int batch,
-                                       size_t *workSize) {
-  using FuncPtr =
-      cufftResult(CUFFTAPI *)(cufftHandle, int, int *, int *, int, int, int *,
-                              int, int, cufftType, int, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cufftMakePlanMany");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(plan, rank, n, inembed, istride, idist, onembed, ostride,
-                  odist, type, batch, workSize);
-}
-
-cufftResult CUFFTAPI cufftMakePlanMany64(
-    cufftHandle plan, int rank, long long int *n, long long int *inembed,
-    long long int istride, long long int idist, long long int *onembed,
-    long long int ostride, long long int odist, cufftType type,
-    long long int batch, size_t *workSize) {
-  using FuncPtr = cufftResult(CUFFTAPI *)(
-      cufftHandle, int, long long *, long long *, long long, long long,
-      long long *, long long, long long, cufftType, long long, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cufftMakePlanMany64");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(plan, rank, n, inembed, istride, idist, onembed, ostride,
-                  odist, type, batch, workSize);
-}
-
-cufftResult CUFFTAPI cufftGetSizeMany64(
-    cufftHandle plan, int rank, long long int *n, long long int *inembed,
-    long long int istride, long long int idist, long long int *onembed,
-    long long int ostride, long long int odist, cufftType type,
-    long long int batch, size_t *workSize) {
-  using FuncPtr = cufftResult(CUFFTAPI *)(
-      cufftHandle, int, long long *, long long *, long long, long long,
-      long long *, long long, long long, cufftType, long long, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cufftGetSizeMany64");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(plan, rank, n, inembed, istride, idist, onembed, ostride,
-                  odist, type, batch, workSize);
-}
-
-cufftResult CUFFTAPI cufftEstimate1d(int nx, cufftType type, int batch,
-                                     size_t *workSize) {
-  using FuncPtr = cufftResult(CUFFTAPI *)(int, cufftType, int, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cufftEstimate1d");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(nx, type, batch, workSize);
-}
-
-cufftResult CUFFTAPI cufftEstimate2d(int nx, int ny, cufftType type,
-                                     size_t *workSize) {
-  using FuncPtr = cufftResult(CUFFTAPI *)(int, int, cufftType, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cufftEstimate2d");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(nx, ny, type, workSize);
-}
-
-cufftResult CUFFTAPI cufftEstimate3d(int nx, int ny, int nz, cufftType type,
-                                     size_t *workSize) {
-  using FuncPtr = cufftResult(CUFFTAPI *)(int, int, int, cufftType, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cufftEstimate3d");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(nx, ny, nz, type, workSize);
-}
-
-cufftResult CUFFTAPI cufftEstimateMany(int rank, int *n, int *inembed,
-                                       int istride, int idist, int *onembed,
-                                       int ostride, int odist, cufftType type,
-                                       int batch, size_t *workSize) {
-  using FuncPtr = cufftResult(CUFFTAPI *)(int, int *, int *, int, int, int *,
-                                          int, int, cufftType, int, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cufftEstimateMany");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(rank, n, inembed, istride, idist, onembed, ostride, odist,
-                  type, batch, workSize);
-}
-
-cufftResult CUFFTAPI cufftCreate(cufftHandle *handle) {
-  using FuncPtr = cufftResult(CUFFTAPI *)(cufftHandle *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cufftCreate");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle);
-}
-
-cufftResult CUFFTAPI cufftGetSize1d(cufftHandle handle, int nx, cufftType type,
-                                    int batch, size_t *workSize) {
-  using FuncPtr =
-      cufftResult(CUFFTAPI *)(cufftHandle, int, cufftType, int, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cufftGetSize1d");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, nx, type, batch, workSize);
-}
-
-cufftResult CUFFTAPI cufftGetSize2d(cufftHandle handle, int nx, int ny,
-                                    cufftType type, size_t *workSize) {
-  using FuncPtr =
-      cufftResult(CUFFTAPI *)(cufftHandle, int, int, cufftType, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cufftGetSize2d");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, nx, ny, type, workSize);
-}
-
-cufftResult CUFFTAPI cufftGetSize3d(cufftHandle handle, int nx, int ny, int nz,
-                                    cufftType type, size_t *workSize) {
-  using FuncPtr =
-      cufftResult(CUFFTAPI *)(cufftHandle, int, int, int, cufftType, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cufftGetSize3d");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, nx, ny, nz, type, workSize);
-}
-
-cufftResult CUFFTAPI cufftGetSizeMany(cufftHandle handle, int rank, int *n,
-                                      int *inembed, int istride, int idist,
-                                      int *onembed, int ostride, int odist,
-                                      cufftType type, int batch,
-                                      size_t *workArea) {
-  using FuncPtr =
-      cufftResult(CUFFTAPI *)(cufftHandle, int, int *, int *, int, int, int *,
-                              int, int, cufftType, int, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cufftGetSizeMany");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, rank, n, inembed, istride, idist, onembed, ostride,
-                  odist, type, batch, workArea);
-}
-
-cufftResult CUFFTAPI cufftGetSize(cufftHandle handle, size_t *workSize) {
-  using FuncPtr = cufftResult(CUFFTAPI *)(cufftHandle, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cufftGetSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, workSize);
-}
-
-cufftResult CUFFTAPI cufftSetWorkArea(cufftHandle plan, void *workArea) {
-  using FuncPtr = cufftResult(CUFFTAPI *)(cufftHandle, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cufftSetWorkArea");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(plan, workArea);
-}
-
-cufftResult CUFFTAPI cufftSetAutoAllocation(cufftHandle plan,
-                                            int autoAllocate) {
-  using FuncPtr = cufftResult(CUFFTAPI *)(cufftHandle, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cufftSetAutoAllocation");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(plan, autoAllocate);
-}
-
-cufftResult CUFFTAPI cufftExecC2C(cufftHandle plan, cufftComplex *idata,
-                                  cufftComplex *odata, int direction) {
-  using FuncPtr =
-      cufftResult(CUFFTAPI *)(cufftHandle, cufftComplex *, cufftComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cufftExecC2C");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(plan, idata, odata, direction);
-}
-
-cufftResult CUFFTAPI cufftExecR2C(cufftHandle plan, cufftReal *idata,
-                                  cufftComplex *odata) {
-  using FuncPtr =
-      cufftResult(CUFFTAPI *)(cufftHandle, cufftReal *, cufftComplex *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cufftExecR2C");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(plan, idata, odata);
-}
-
-cufftResult CUFFTAPI cufftExecC2R(cufftHandle plan, cufftComplex *idata,
-                                  cufftReal *odata) {
-  using FuncPtr =
-      cufftResult(CUFFTAPI *)(cufftHandle, cufftComplex *, cufftReal *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cufftExecC2R");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(plan, idata, odata);
-}
-
-cufftResult CUFFTAPI cufftExecZ2Z(cufftHandle plan, cufftDoubleComplex *idata,
-                                  cufftDoubleComplex *odata, int direction) {
-  using FuncPtr = cufftResult(CUFFTAPI *)(cufftHandle, cufftDoubleComplex *,
-                                          cufftDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cufftExecZ2Z");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(plan, idata, odata, direction);
-}
-
-cufftResult CUFFTAPI cufftExecD2Z(cufftHandle plan, cufftDoubleReal *idata,
-                                  cufftDoubleComplex *odata) {
-  using FuncPtr = cufftResult(CUFFTAPI *)(cufftHandle, cufftDoubleReal *,
-                                          cufftDoubleComplex *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cufftExecD2Z");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(plan, idata, odata);
-}
-
-cufftResult CUFFTAPI cufftExecZ2D(cufftHandle plan, cufftDoubleComplex *idata,
-                                  cufftDoubleReal *odata) {
-  using FuncPtr = cufftResult(CUFFTAPI *)(cufftHandle, cufftDoubleComplex *,
-                                          cufftDoubleReal *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cufftExecZ2D");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(plan, idata, odata);
-}
-
-cufftResult CUFFTAPI cufftSetStream(cufftHandle plan, cudaStream_t stream) {
-  using FuncPtr = cufftResult(CUFFTAPI *)(cufftHandle, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cufftSetStream");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(plan, stream);
-}
-
-cufftResult CUFFTAPI cufftDestroy(cufftHandle plan) {
-  using FuncPtr = cufftResult(CUFFTAPI *)(cufftHandle);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cufftDestroy");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(plan);
-}
-
-cufftResult CUFFTAPI cufftGetVersion(int *version) {
-  using FuncPtr = cufftResult(CUFFTAPI *)(int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cufftGetVersion");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(version);
-}
-
-cufftResult CUFFTAPI cufftGetProperty(libraryPropertyType type, int *value) {
-  using FuncPtr = cufftResult(CUFFTAPI *)(libraryPropertyType, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cufftGetProperty");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(type, value);
-}
-
-// The cufftXt API functions below are not currently used, they are merely here
-// to allow replacing TFRT's cuda_stubs with cuda_runtime_for_xlir.
-
-cufftResult CUFFTAPI cufftXtMakePlanMany(
-    cufftHandle plan, int rank, long long int* n, long long int* inembed,
-    long long int istride, long long int idist, cudaDataType inputtype,
-    long long int* onembed, long long int ostride, long long int odist,
-    cudaDataType outputtype, long long int batch, size_t* workSize,
-    cudaDataType executiontype) {
-  using FuncPtr = cufftResult(CUFFTAPI*)(
-      cufftHandle, int, long long int*, long long int*, long long int,
-      long long int, cudaDataType, long long int*, long long int, long long int,
-      cudaDataType, long long int, size_t*, cudaDataType);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cufftXtMakePlanMany");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(plan, rank, n, inembed, istride, idist, inputtype, onembed,
-                  ostride, odist, outputtype, batch, workSize, executiontype);
-}
-
-cufftResult CUFFTAPI cufftXtGetSizeMany(
-    cufftHandle plan, int rank, long long int* n, long long int* inembed,
-    long long int istride, long long int idist, cudaDataType inputtype,
-    long long int* onembed, long long int ostride, long long int odist,
-    cudaDataType outputtype, long long int batch, size_t* workSize,
-    cudaDataType executiontype) {
-  using FuncPtr = cufftResult(CUFFTAPI*)(
-      cufftHandle, int, long long int*, long long int*, long long int,
-      long long int, cudaDataType, long long int*, long long int, long long int,
-      cudaDataType, long long int, size_t*, cudaDataType);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cufftXtGetSizeMany");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(plan, rank, n, inembed, istride, idist, inputtype, onembed,
-                  ostride, odist, outputtype, batch, workSize, executiontype);
-}
-
-cufftResult CUFFTAPI cufftXtSetWorkAreaPolicy(cufftHandle plan,
-                                              cufftXtWorkAreaPolicy policy,
-                                              size_t* workSize) {
-  using FuncPtr =
-      cufftResult(CUFFTAPI*)(cufftHandle, cufftXtWorkAreaPolicy, size_t*);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cufftXtGetSizeMany");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(plan, policy, workSize);
-}
-
-cufftResult CUFFTAPI cufftXtExec(cufftHandle plan, void* input, void* output,
-                                 int direction) {
-  using FuncPtr = cufftResult(CUFFTAPI*)(cufftHandle, void*, void*, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cufftXtGetSizeMany");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(plan, input, output, direction);
-}
-
-cufftResult CUFFTAPI cufftXtMemcpy(cufftHandle plan, void* dstPointer,
-                                   void* srcPointer, cufftXtCopyType type) {
-  using FuncPtr =
-      cufftResult(CUFFTAPI*)(cufftHandle, void*, void*, cufftXtCopyType);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cufftXtGetSizeMany");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(plan, dstPointer, srcPointer, type);
-}
-
-}  // extern "C"
diff --git a/third_party/xla/third_party/tsl/tsl/cuda/cufft_9_0.inc b/third_party/xla/third_party/tsl/tsl/cuda/cufft_9_0.inc
deleted file mode 100644
index e6244f0705d7ee..00000000000000
--- a/third_party/xla/third_party/tsl/tsl/cuda/cufft_9_0.inc
+++ /dev/null
@@ -1,307 +0,0 @@
-// Auto-generated, do not edit.
-
-extern "C" {
-
-cufftResult CUFFTAPI cufftPlan1d(cufftHandle *plan, int nx, cufftType type,
-                                 int batch) {
-  using FuncPtr = cufftResult(CUFFTAPI *)(cufftHandle *, int, cufftType, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cufftPlan1d");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(plan, nx, type, batch);
-}
-
-cufftResult CUFFTAPI cufftPlan2d(cufftHandle *plan, int nx, int ny,
-                                 cufftType type) {
-  using FuncPtr = cufftResult(CUFFTAPI *)(cufftHandle *, int, int, cufftType);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cufftPlan2d");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(plan, nx, ny, type);
-}
-
-cufftResult CUFFTAPI cufftPlan3d(cufftHandle *plan, int nx, int ny, int nz,
-                                 cufftType type) {
-  using FuncPtr =
-      cufftResult(CUFFTAPI *)(cufftHandle *, int, int, int, cufftType);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cufftPlan3d");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(plan, nx, ny, nz, type);
-}
-
-cufftResult CUFFTAPI cufftPlanMany(cufftHandle *plan, int rank, int *n,
-                                   int *inembed, int istride, int idist,
-                                   int *onembed, int ostride, int odist,
-                                   cufftType type, int batch) {
-  using FuncPtr = cufftResult(CUFFTAPI *)(cufftHandle *, int, int *, int *, int,
-                                          int, int *, int, int, cufftType, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cufftPlanMany");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(plan, rank, n, inembed, istride, idist, onembed, ostride,
-                  odist, type, batch);
-}
-
-cufftResult CUFFTAPI cufftMakePlan1d(cufftHandle plan, int nx, cufftType type,
-                                     int batch, size_t *workSize) {
-  using FuncPtr =
-      cufftResult(CUFFTAPI *)(cufftHandle, int, cufftType, int, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cufftMakePlan1d");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(plan, nx, type, batch, workSize);
-}
-
-cufftResult CUFFTAPI cufftMakePlan2d(cufftHandle plan, int nx, int ny,
-                                     cufftType type, size_t *workSize) {
-  using FuncPtr =
-      cufftResult(CUFFTAPI *)(cufftHandle, int, int, cufftType, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cufftMakePlan2d");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(plan, nx, ny, type, workSize);
-}
-
-cufftResult CUFFTAPI cufftMakePlan3d(cufftHandle plan, int nx, int ny, int nz,
-                                     cufftType type, size_t *workSize) {
-  using FuncPtr =
-      cufftResult(CUFFTAPI *)(cufftHandle, int, int, int, cufftType, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cufftMakePlan3d");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(plan, nx, ny, nz, type, workSize);
-}
-
-cufftResult CUFFTAPI cufftMakePlanMany(cufftHandle plan, int rank, int *n,
-                                       int *inembed, int istride, int idist,
-                                       int *onembed, int ostride, int odist,
-                                       cufftType type, int batch,
-                                       size_t *workSize) {
-  using FuncPtr =
-      cufftResult(CUFFTAPI *)(cufftHandle, int, int *, int *, int, int, int *,
-                              int, int, cufftType, int, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cufftMakePlanMany");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(plan, rank, n, inembed, istride, idist, onembed, ostride,
-                  odist, type, batch, workSize);
-}
-
-cufftResult CUFFTAPI cufftMakePlanMany64(
-    cufftHandle plan, int rank, long long int *n, long long int *inembed,
-    long long int istride, long long int idist, long long int *onembed,
-    long long int ostride, long long int odist, cufftType type,
-    long long int batch, size_t *workSize) {
-  using FuncPtr = cufftResult(CUFFTAPI *)(
-      cufftHandle, int, long long *, long long *, long long, long long,
-      long long *, long long, long long, cufftType, long long, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cufftMakePlanMany64");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(plan, rank, n, inembed, istride, idist, onembed, ostride,
-                  odist, type, batch, workSize);
-}
-
-cufftResult CUFFTAPI cufftGetSizeMany64(
-    cufftHandle plan, int rank, long long int *n, long long int *inembed,
-    long long int istride, long long int idist, long long int *onembed,
-    long long int ostride, long long int odist, cufftType type,
-    long long int batch, size_t *workSize) {
-  using FuncPtr = cufftResult(CUFFTAPI *)(
-      cufftHandle, int, long long *, long long *, long long, long long,
-      long long *, long long, long long, cufftType, long long, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cufftGetSizeMany64");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(plan, rank, n, inembed, istride, idist, onembed, ostride,
-                  odist, type, batch, workSize);
-}
-
-cufftResult CUFFTAPI cufftEstimate1d(int nx, cufftType type, int batch,
-                                     size_t *workSize) {
-  using FuncPtr = cufftResult(CUFFTAPI *)(int, cufftType, int, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cufftEstimate1d");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(nx, type, batch, workSize);
-}
-
-cufftResult CUFFTAPI cufftEstimate2d(int nx, int ny, cufftType type,
-                                     size_t *workSize) {
-  using FuncPtr = cufftResult(CUFFTAPI *)(int, int, cufftType, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cufftEstimate2d");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(nx, ny, type, workSize);
-}
-
-cufftResult CUFFTAPI cufftEstimate3d(int nx, int ny, int nz, cufftType type,
-                                     size_t *workSize) {
-  using FuncPtr = cufftResult(CUFFTAPI *)(int, int, int, cufftType, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cufftEstimate3d");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(nx, ny, nz, type, workSize);
-}
-
-cufftResult CUFFTAPI cufftEstimateMany(int rank, int *n, int *inembed,
-                                       int istride, int idist, int *onembed,
-                                       int ostride, int odist, cufftType type,
-                                       int batch, size_t *workSize) {
-  using FuncPtr = cufftResult(CUFFTAPI *)(int, int *, int *, int, int, int *,
-                                          int, int, cufftType, int, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cufftEstimateMany");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(rank, n, inembed, istride, idist, onembed, ostride, odist,
-                  type, batch, workSize);
-}
-
-cufftResult CUFFTAPI cufftCreate(cufftHandle *handle) {
-  using FuncPtr = cufftResult(CUFFTAPI *)(cufftHandle *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cufftCreate");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle);
-}
-
-cufftResult CUFFTAPI cufftGetSize1d(cufftHandle handle, int nx, cufftType type,
-                                    int batch, size_t *workSize) {
-  using FuncPtr =
-      cufftResult(CUFFTAPI *)(cufftHandle, int, cufftType, int, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cufftGetSize1d");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, nx, type, batch, workSize);
-}
-
-cufftResult CUFFTAPI cufftGetSize2d(cufftHandle handle, int nx, int ny,
-                                    cufftType type, size_t *workSize) {
-  using FuncPtr =
-      cufftResult(CUFFTAPI *)(cufftHandle, int, int, cufftType, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cufftGetSize2d");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, nx, ny, type, workSize);
-}
-
-cufftResult CUFFTAPI cufftGetSize3d(cufftHandle handle, int nx, int ny, int nz,
-                                    cufftType type, size_t *workSize) {
-  using FuncPtr =
-      cufftResult(CUFFTAPI *)(cufftHandle, int, int, int, cufftType, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cufftGetSize3d");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, nx, ny, nz, type, workSize);
-}
-
-cufftResult CUFFTAPI cufftGetSizeMany(cufftHandle handle, int rank, int *n,
-                                      int *inembed, int istride, int idist,
-                                      int *onembed, int ostride, int odist,
-                                      cufftType type, int batch,
-                                      size_t *workArea) {
-  using FuncPtr =
-      cufftResult(CUFFTAPI *)(cufftHandle, int, int *, int *, int, int, int *,
-                              int, int, cufftType, int, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cufftGetSizeMany");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, rank, n, inembed, istride, idist, onembed, ostride,
-                  odist, type, batch, workArea);
-}
-
-cufftResult CUFFTAPI cufftGetSize(cufftHandle handle, size_t *workSize) {
-  using FuncPtr = cufftResult(CUFFTAPI *)(cufftHandle, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cufftGetSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, workSize);
-}
-
-cufftResult CUFFTAPI cufftSetWorkArea(cufftHandle plan, void *workArea) {
-  using FuncPtr = cufftResult(CUFFTAPI *)(cufftHandle, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cufftSetWorkArea");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(plan, workArea);
-}
-
-cufftResult CUFFTAPI cufftSetAutoAllocation(cufftHandle plan,
-                                            int autoAllocate) {
-  using FuncPtr = cufftResult(CUFFTAPI *)(cufftHandle, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cufftSetAutoAllocation");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(plan, autoAllocate);
-}
-
-cufftResult CUFFTAPI cufftExecC2C(cufftHandle plan, cufftComplex *idata,
-                                  cufftComplex *odata, int direction) {
-  using FuncPtr =
-      cufftResult(CUFFTAPI *)(cufftHandle, cufftComplex *, cufftComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cufftExecC2C");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(plan, idata, odata, direction);
-}
-
-cufftResult CUFFTAPI cufftExecR2C(cufftHandle plan, cufftReal *idata,
-                                  cufftComplex *odata) {
-  using FuncPtr =
-      cufftResult(CUFFTAPI *)(cufftHandle, cufftReal *, cufftComplex *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cufftExecR2C");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(plan, idata, odata);
-}
-
-cufftResult CUFFTAPI cufftExecC2R(cufftHandle plan, cufftComplex *idata,
-                                  cufftReal *odata) {
-  using FuncPtr =
-      cufftResult(CUFFTAPI *)(cufftHandle, cufftComplex *, cufftReal *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cufftExecC2R");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(plan, idata, odata);
-}
-
-cufftResult CUFFTAPI cufftExecZ2Z(cufftHandle plan, cufftDoubleComplex *idata,
-                                  cufftDoubleComplex *odata, int direction) {
-  using FuncPtr = cufftResult(CUFFTAPI *)(cufftHandle, cufftDoubleComplex *,
-                                          cufftDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cufftExecZ2Z");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(plan, idata, odata, direction);
-}
-
-cufftResult CUFFTAPI cufftExecD2Z(cufftHandle plan, cufftDoubleReal *idata,
-                                  cufftDoubleComplex *odata) {
-  using FuncPtr = cufftResult(CUFFTAPI *)(cufftHandle, cufftDoubleReal *,
-                                          cufftDoubleComplex *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cufftExecD2Z");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(plan, idata, odata);
-}
-
-cufftResult CUFFTAPI cufftExecZ2D(cufftHandle plan, cufftDoubleComplex *idata,
-                                  cufftDoubleReal *odata) {
-  using FuncPtr = cufftResult(CUFFTAPI *)(cufftHandle, cufftDoubleComplex *,
-                                          cufftDoubleReal *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cufftExecZ2D");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(plan, idata, odata);
-}
-
-cufftResult CUFFTAPI cufftSetStream(cufftHandle plan, cudaStream_t stream) {
-  using FuncPtr = cufftResult(CUFFTAPI *)(cufftHandle, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cufftSetStream");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(plan, stream);
-}
-
-cufftResult CUFFTAPI cufftSetCompatibilityMode(cufftHandle plan,
-                                               cufftCompatibility mode) {
-  using FuncPtr = cufftResult(CUFFTAPI *)(cufftHandle, cufftCompatibility);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cufftSetCompatibilityMode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(plan, mode);
-}
-
-cufftResult CUFFTAPI cufftDestroy(cufftHandle plan) {
-  using FuncPtr = cufftResult(CUFFTAPI *)(cufftHandle);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cufftDestroy");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(plan);
-}
-
-cufftResult CUFFTAPI cufftGetVersion(int *version) {
-  using FuncPtr = cufftResult(CUFFTAPI *)(int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cufftGetVersion");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(version);
-}
-
-cufftResult CUFFTAPI cufftGetProperty(libraryPropertyType type, int *value) {
-  using FuncPtr = cufftResult(CUFFTAPI *)(libraryPropertyType, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cufftGetProperty");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(type, value);
-}
-
-}  // extern "C"
diff --git a/third_party/xla/third_party/tsl/tsl/cuda/cufft_stub.cc b/third_party/xla/third_party/tsl/tsl/cuda/cufft_stub.cc
index e7ea9d1eee78c8..8f5c1b0d687337 100644
--- a/third_party/xla/third_party/tsl/tsl/cuda/cufft_stub.cc
+++ b/third_party/xla/third_party/tsl/tsl/cuda/cufft_stub.cc
@@ -34,23 +34,38 @@ void* GetDsoHandle() {
 #endif
 }
 
-template <typename T>
-T LoadSymbol(const char* symbol_name) {
+void* LoadSymbol(const char* symbol_name) {  // returns the raw address
   void* symbol = nullptr;
   if (auto handle = GetDsoHandle()) {
     tsl::Env::Default()
         ->GetSymbolFromLibrary(handle, symbol_name, &symbol)
         .IgnoreError();
   }
-  return reinterpret_cast<T>(symbol);
+  return symbol;  // still nullptr when no DSO handle was obtained
 }
 
-cufftResult GetSymbolNotFoundError() { return CUFFT_INTERNAL_ERROR; }
+const char* kSymbols[] = {  // names passed to LoadSymbol, one per index
+#include "tsl/cuda/cufft.inc"
+};
+// Entry count of kSymbols; used to bounds-check the resolver's index.
+constexpr size_t kNumSymbols = sizeof(kSymbols) / sizeof(const char*);
+
 }  // namespace
 
-#if CUFFT_VERSION < 10000
-#include "tsl/cuda/cufft_9_0.inc"
-#else
-// All CUDA-10+ implementations use the same API.
-#include "tsl/cuda/cufft_10_0.inc"
-#endif
+extern "C" {
+// Fallback whose address is stored for any symbol that cannot be loaded.
+static cufftResult GetSymbolNotFoundError() { return CUFFT_INTERNAL_ERROR; }
+
+extern void* _cufft_tramp_table[];  // defined elsewhere; indexed like kSymbols
+// Looks up kSymbols[i] and records its address in _cufft_tramp_table[i].
+void _cufft_tramp_resolve(int i) {
+  CHECK_LE(0, i);  // together with the next check: i must index kSymbols
+  CHECK_LT(i, kNumSymbols);
+  void* p = LoadSymbol(kSymbols[i]);
+  if (!p) {  // lookup produced nothing: point the slot at the error stub
+    p = reinterpret_cast<void*>(&GetSymbolNotFoundError);
+  }
+  _cufft_tramp_table[i] = p;  // NOTE(review): presumably read by the trampolines
+}
+
+}  // extern "C"
diff --git a/third_party/xla/third_party/tsl/tsl/cuda/cupti.symbols b/third_party/xla/third_party/tsl/tsl/cuda/cupti.symbols
new file mode 100644
index 00000000000000..0e95eaba0d5a67
--- /dev/null
+++ b/third_party/xla/third_party/tsl/tsl/cuda/cupti.symbols
@@ -0,0 +1,148 @@
+InitializeInjectionNvtx
+InitializeInjectionNvtx2
+cuptiActivityConfigurePCSampling
+cuptiActivityConfigureUnifiedMemoryCounter
+cuptiActivityDisable
+cuptiActivityDisableContext
+cuptiActivityEnable
+cuptiActivityEnableAndDump
+cuptiActivityEnableBufferSummary
+cuptiActivityEnableContext
+cuptiActivityEnableLatencyTimestamps
+cuptiActivityEnableLaunchAttributes
+cuptiActivityEnableRawTimestamps
+cuptiActivityFlush
+cuptiActivityFlushAll
+cuptiActivityFlushPeriod
+cuptiActivityGetAttribute
+cuptiActivityGetNextRecord
+cuptiActivityGetNumDroppedRecords
+cuptiActivityPopExternalCorrelationId
+cuptiActivityPushExternalCorrelationId
+cuptiActivityRegisterCallbacks
+cuptiActivityRegisterTimestampCallback
+cuptiActivitySetAttribute
+cuptiComputeCapabilitySupported
+cuptiDeviceEnumEventDomains
+cuptiDeviceEnumMetrics
+cuptiDeviceGetAttribute
+cuptiDeviceGetChipName
+cuptiDeviceGetEventDomainAttribute
+cuptiDeviceGetNumEventDomains
+cuptiDeviceGetNumMetrics
+cuptiDeviceSupported
+cuptiDeviceVirtualizationMode
+cuptiDisableKernelReplayMode
+cuptiDisableLibcuda
+cuptiDisableNonOverlappingMode
+cuptiEnableAllDomains
+cuptiEnableCallback
+cuptiEnableDomain
+cuptiEnableKernelReplayMode
+cuptiEnableNonOverlappingMode
+cuptiEnumEventDomains
+cuptiEnumMetrics
+cuptiEventDomainEnumEvents
+cuptiEventDomainGetAttribute
+cuptiEventDomainGetNumEvents
+cuptiEventGetAttribute
+cuptiEventGetIdFromName
+cuptiEventGroupAddEvent
+cuptiEventGroupCreate
+cuptiEventGroupDestroy
+cuptiEventGroupDisable
+cuptiEventGroupEnable
+cuptiEventGroupGetAttribute
+cuptiEventGroupReadAllEvents
+cuptiEventGroupReadEvent
+cuptiEventGroupRemoveAllEvents
+cuptiEventGroupRemoveEvent
+cuptiEventGroupResetAllEvents
+cuptiEventGroupSetAttribute
+cuptiEventGroupSetDisable
+cuptiEventGroupSetEnable
+cuptiEventGroupSetsCreate
+cuptiEventGroupSetsDestroy
+cuptiFinalize
+cuptiGetAutoBoostState
+cuptiGetCallbackName
+cuptiGetCallbackState
+cuptiGetContextId
+cuptiGetCubinCrc
+cuptiGetDeviceId
+cuptiGetErrorMessage
+cuptiGetGlobalCallbackState
+cuptiGetGraphId
+cuptiGetGraphNodeId
+cuptiGetLastError
+cuptiGetNumContexts
+cuptiGetNumEventDomains
+cuptiGetNumMetrics
+cuptiGetRecommendedBufferSize
+cuptiGetResultString
+cuptiGetSassToSourceCorrelation
+cuptiGetStreamId
+cuptiGetStreamIdEx
+cuptiGetThreadIdType
+cuptiGetTimestamp
+cuptiGetVersion
+cuptiKernelReplaySubscribeUpdate
+cuptiMetricCreateEventGroupSets
+cuptiMetricEnumEvents
+cuptiMetricEnumProperties
+cuptiMetricGetAttribute
+cuptiMetricGetIdFromName
+cuptiMetricGetNumEvents
+cuptiMetricGetNumProperties
+cuptiMetricGetRequiredEventGroupSets
+cuptiMetricGetValue
+cuptiMetricGetValue2
+cuptiNvtxInitialize
+cuptiNvtxInitialize2
+cuptiOpenACCInitialize
+cuptiOpenMpInitialize
+cuptiOpenMpInitialize_v2
+cuptiPCSamplingDisable
+cuptiPCSamplingEnable
+cuptiPCSamplingGetConfigurationAttribute
+cuptiPCSamplingGetData
+cuptiPCSamplingGetNumStallReasons
+cuptiPCSamplingGetStallReasons
+cuptiPCSamplingSetConfigurationAttribute
+cuptiPCSamplingStart
+cuptiPCSamplingStop
+cuptiProfilerBeginPass
+cuptiProfilerBeginSession
+cuptiProfilerCounterDataImageCalculateScratchBufferSize
+cuptiProfilerCounterDataImageCalculateSize
+cuptiProfilerCounterDataImageInitialize
+cuptiProfilerCounterDataImageInitializeScratchBuffer
+cuptiProfilerDeInitialize
+cuptiProfilerDeviceSupported
+cuptiProfilerDisableProfiling
+cuptiProfilerEnableProfiling
+cuptiProfilerEndPass
+cuptiProfilerEndSession
+cuptiProfilerFlushCounterData
+cuptiProfilerGetCounterAvailability
+cuptiProfilerInitialize
+cuptiProfilerPopRange
+cuptiProfilerPushRange
+cuptiProfilerSetConfig
+cuptiProfilerUnsetConfig
+cuptiRegisterComputeCrcCallback
+cuptiSassMetricsDisable
+cuptiSassMetricsEnable
+cuptiSassMetricsFlushData
+cuptiSassMetricsGetDataProperties
+cuptiSassMetricsGetMetrics
+cuptiSassMetricsGetNumOfMetrics
+cuptiSassMetricsGetProperties
+cuptiSassMetricsSetConfig
+cuptiSassMetricsUnsetConfig
+cuptiSetEventCollectionMode
+cuptiSetThreadIdType
+cuptiStateQuery
+cuptiSubscribe
+cuptiSupportedDomains
+cuptiUnsubscribe
diff --git a/third_party/xla/third_party/tsl/tsl/cuda/cupti_10_0.inc b/third_party/xla/third_party/tsl/tsl/cuda/cupti_10_0.inc
deleted file mode 100644
index dac0c7919342bd..00000000000000
--- a/third_party/xla/third_party/tsl/tsl/cuda/cupti_10_0.inc
+++ /dev/null
@@ -1,763 +0,0 @@
-// Auto-generated, do not edit.
-
-extern "C" {
-
-CUptiResult CUPTIAPI cuptiGetResultString(CUptiResult result,
-                                          const char **str) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUptiResult, const char **);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiGetResultString");
-  if (!func_ptr) {
-    if (str) {
-      *str = "CUPTI could not be loaded or symbol could not be found.";
-    }
-    return GetSymbolNotFoundError();
-  }
-  return func_ptr(result, str);
-}
-
-CUptiResult CUPTIAPI cuptiGetVersion(uint32_t *version) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(uint32_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiGetVersion");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(version);
-}
-
-CUptiResult CUPTIAPI cuptiSupportedDomains(size_t *domainCount,
-                                           CUpti_DomainTable *domainTable) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(size_t *, CUpti_DomainTable *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiSupportedDomains");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(domainCount, domainTable);
-}
-
-CUptiResult CUPTIAPI cuptiSubscribe(CUpti_SubscriberHandle *subscriber,
-                                    CUpti_CallbackFunc callback,
-                                    void *userdata) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUpti_SubscriberHandle *,
-                                          CUpti_CallbackFunc, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiSubscribe");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(subscriber, callback, userdata);
-}
-
-CUptiResult CUPTIAPI cuptiUnsubscribe(CUpti_SubscriberHandle subscriber) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUpti_SubscriberHandle);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiUnsubscribe");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(subscriber);
-}
-
-CUptiResult CUPTIAPI cuptiGetCallbackState(uint32_t *enable,
-                                           CUpti_SubscriberHandle subscriber,
-                                           CUpti_CallbackDomain domain,
-                                           CUpti_CallbackId cbid) {
-  using FuncPtr =
-      CUptiResult(CUPTIAPI *)(uint32_t *, CUpti_SubscriberHandle,
-                              CUpti_CallbackDomain, CUpti_CallbackId);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiGetCallbackState");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(enable, subscriber, domain, cbid);
-}
-
-CUptiResult CUPTIAPI cuptiEnableCallback(uint32_t enable,
-                                         CUpti_SubscriberHandle subscriber,
-                                         CUpti_CallbackDomain domain,
-                                         CUpti_CallbackId cbid) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(
-      uint32_t, CUpti_SubscriberHandle, CUpti_CallbackDomain, CUpti_CallbackId);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiEnableCallback");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(enable, subscriber, domain, cbid);
-}
-
-CUptiResult CUPTIAPI cuptiEnableDomain(uint32_t enable,
-                                       CUpti_SubscriberHandle subscriber,
-                                       CUpti_CallbackDomain domain) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(uint32_t, CUpti_SubscriberHandle,
-                                          CUpti_CallbackDomain);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiEnableDomain");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(enable, subscriber, domain);
-}
-
-CUptiResult CUPTIAPI cuptiEnableAllDomains(uint32_t enable,
-                                           CUpti_SubscriberHandle subscriber) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(uint32_t, CUpti_SubscriberHandle);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiEnableAllDomains");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(enable, subscriber);
-}
-
-CUptiResult CUPTIAPI cuptiGetCallbackName(CUpti_CallbackDomain domain,
-                                          uint32_t cbid, const char **name) {
-  using FuncPtr =
-      CUptiResult(CUPTIAPI *)(CUpti_CallbackDomain, uint32_t, const char **);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiGetCallbackName");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(domain, cbid, name);
-}
-
-CUptiResult CUPTIAPI
-cuptiSetEventCollectionMode(CUcontext context, CUpti_EventCollectionMode mode) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUcontext, CUpti_EventCollectionMode);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiSetEventCollectionMode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(context, mode);
-}
-
-CUptiResult CUPTIAPI cuptiDeviceGetAttribute(CUdevice device,
-                                             CUpti_DeviceAttribute attrib,
-                                             size_t *valueSize, void *value) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUdevice, CUpti_DeviceAttribute,
-                                          size_t *, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiDeviceGetAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(device, attrib, valueSize, value);
-}
-
-CUptiResult CUPTIAPI cuptiDeviceGetTimestamp(CUcontext context,
-                                             uint64_t *timestamp) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUcontext, uint64_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiDeviceGetTimestamp");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(context, timestamp);
-}
-
-CUptiResult CUPTIAPI cuptiDeviceGetNumEventDomains(CUdevice device,
-                                                   uint32_t *numDomains) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUdevice, uint32_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiDeviceGetNumEventDomains");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(device, numDomains);
-}
-
-CUptiResult CUPTIAPI cuptiDeviceEnumEventDomains(
-    CUdevice device, size_t *arraySizeBytes, CUpti_EventDomainID *domainArray) {
-  using FuncPtr =
-      CUptiResult(CUPTIAPI *)(CUdevice, size_t *, CUpti_EventDomainID *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiDeviceEnumEventDomains");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(device, arraySizeBytes, domainArray);
-}
-
-CUptiResult CUPTIAPI cuptiDeviceGetEventDomainAttribute(
-    CUdevice device, CUpti_EventDomainID eventDomain,
-    CUpti_EventDomainAttribute attrib, size_t *valueSize, void *value) {
-  using FuncPtr =
-      CUptiResult(CUPTIAPI *)(CUdevice, CUpti_EventDomainID,
-                              CUpti_EventDomainAttribute, size_t *, void *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cuptiDeviceGetEventDomainAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(device, eventDomain, attrib, valueSize, value);
-}
-
-CUptiResult CUPTIAPI cuptiGetNumEventDomains(uint32_t *numDomains) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(uint32_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiGetNumEventDomains");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(numDomains);
-}
-
-CUptiResult CUPTIAPI cuptiEnumEventDomains(size_t *arraySizeBytes,
-                                           CUpti_EventDomainID *domainArray) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(size_t *, CUpti_EventDomainID *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiEnumEventDomains");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(arraySizeBytes, domainArray);
-}
-
-CUptiResult CUPTIAPI cuptiEventDomainGetAttribute(
-    CUpti_EventDomainID eventDomain, CUpti_EventDomainAttribute attrib,
-    size_t *valueSize, void *value) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(
-      CUpti_EventDomainID, CUpti_EventDomainAttribute, size_t *, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiEventDomainGetAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(eventDomain, attrib, valueSize, value);
-}
-
-CUptiResult CUPTIAPI cuptiEventDomainGetNumEvents(
-    CUpti_EventDomainID eventDomain, uint32_t *numEvents) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUpti_EventDomainID, uint32_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiEventDomainGetNumEvents");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(eventDomain, numEvents);
-}
-
-CUptiResult CUPTIAPI cuptiEventDomainEnumEvents(CUpti_EventDomainID eventDomain,
-                                                size_t *arraySizeBytes,
-                                                CUpti_EventID *eventArray) {
-  using FuncPtr =
-      CUptiResult(CUPTIAPI *)(CUpti_EventDomainID, size_t *, CUpti_EventID *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiEventDomainEnumEvents");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(eventDomain, arraySizeBytes, eventArray);
-}
-
-CUptiResult CUPTIAPI cuptiEventGetAttribute(CUpti_EventID event,
-                                            CUpti_EventAttribute attrib,
-                                            size_t *valueSize, void *value) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUpti_EventID, CUpti_EventAttribute,
-                                          size_t *, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiEventGetAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(event, attrib, valueSize, value);
-}
-
-CUptiResult CUPTIAPI cuptiEventGetIdFromName(CUdevice device,
-                                             const char *eventName,
-                                             CUpti_EventID *event) {
-  using FuncPtr =
-      CUptiResult(CUPTIAPI *)(CUdevice, const char *, CUpti_EventID *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiEventGetIdFromName");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(device, eventName, event);
-}
-
-CUptiResult CUPTIAPI cuptiEventGroupCreate(CUcontext context,
-                                           CUpti_EventGroup *eventGroup,
-                                           uint32_t flags) {
-  using FuncPtr =
-      CUptiResult(CUPTIAPI *)(CUcontext, CUpti_EventGroup *, uint32_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiEventGroupCreate");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(context, eventGroup, flags);
-}
-
-CUptiResult CUPTIAPI cuptiEventGroupDestroy(CUpti_EventGroup eventGroup) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUpti_EventGroup);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiEventGroupDestroy");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(eventGroup);
-}
-
-CUptiResult CUPTIAPI cuptiEventGroupGetAttribute(
-    CUpti_EventGroup eventGroup, CUpti_EventGroupAttribute attrib,
-    size_t *valueSize, void *value) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(
-      CUpti_EventGroup, CUpti_EventGroupAttribute, size_t *, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiEventGroupGetAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(eventGroup, attrib, valueSize, value);
-}
-
-CUptiResult CUPTIAPI cuptiEventGroupSetAttribute(
-    CUpti_EventGroup eventGroup, CUpti_EventGroupAttribute attrib,
-    size_t valueSize, void *value) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(
-      CUpti_EventGroup, CUpti_EventGroupAttribute, size_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiEventGroupSetAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(eventGroup, attrib, valueSize, value);
-}
-
-CUptiResult CUPTIAPI cuptiEventGroupAddEvent(CUpti_EventGroup eventGroup,
-                                             CUpti_EventID event) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUpti_EventGroup, CUpti_EventID);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiEventGroupAddEvent");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(eventGroup, event);
-}
-
-CUptiResult CUPTIAPI cuptiEventGroupRemoveEvent(CUpti_EventGroup eventGroup,
-                                                CUpti_EventID event) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUpti_EventGroup, CUpti_EventID);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiEventGroupRemoveEvent");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(eventGroup, event);
-}
-
-CUptiResult CUPTIAPI
-cuptiEventGroupRemoveAllEvents(CUpti_EventGroup eventGroup) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUpti_EventGroup);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiEventGroupRemoveAllEvents");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(eventGroup);
-}
-
-CUptiResult CUPTIAPI
-cuptiEventGroupResetAllEvents(CUpti_EventGroup eventGroup) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUpti_EventGroup);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiEventGroupResetAllEvents");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(eventGroup);
-}
-
-CUptiResult CUPTIAPI cuptiEventGroupEnable(CUpti_EventGroup eventGroup) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUpti_EventGroup);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiEventGroupEnable");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(eventGroup);
-}
-
-CUptiResult CUPTIAPI cuptiEventGroupDisable(CUpti_EventGroup eventGroup) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUpti_EventGroup);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiEventGroupDisable");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(eventGroup);
-}
-
-CUptiResult CUPTIAPI cuptiEventGroupReadEvent(CUpti_EventGroup eventGroup,
-                                              CUpti_ReadEventFlags flags,
-                                              CUpti_EventID event,
-                                              size_t *eventValueBufferSizeBytes,
-                                              uint64_t *eventValueBuffer) {
-  using FuncPtr =
-      CUptiResult(CUPTIAPI *)(CUpti_EventGroup, CUpti_ReadEventFlags,
-                              CUpti_EventID, size_t *, uint64_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiEventGroupReadEvent");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(eventGroup, flags, event, eventValueBufferSizeBytes,
-                  eventValueBuffer);
-}
-
-CUptiResult CUPTIAPI cuptiEventGroupReadAllEvents(
-    CUpti_EventGroup eventGroup, CUpti_ReadEventFlags flags,
-    size_t *eventValueBufferSizeBytes, uint64_t *eventValueBuffer,
-    size_t *eventIdArraySizeBytes, CUpti_EventID *eventIdArray,
-    size_t *numEventIdsRead) {
-  using FuncPtr =
-      CUptiResult(CUPTIAPI *)(CUpti_EventGroup, CUpti_ReadEventFlags, size_t *,
-                              uint64_t *, size_t *, CUpti_EventID *, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiEventGroupReadAllEvents");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(eventGroup, flags, eventValueBufferSizeBytes,
-                  eventValueBuffer, eventIdArraySizeBytes, eventIdArray,
-                  numEventIdsRead);
-}
-
-CUptiResult CUPTIAPI cuptiEventGroupSetsCreate(
-    CUcontext context, size_t eventIdArraySizeBytes,
-    CUpti_EventID *eventIdArray, CUpti_EventGroupSets **eventGroupPasses) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUcontext, size_t, CUpti_EventID *,
-                                          CUpti_EventGroupSets **);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiEventGroupSetsCreate");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(context, eventIdArraySizeBytes, eventIdArray,
-                  eventGroupPasses);
-}
-
-CUptiResult CUPTIAPI
-cuptiEventGroupSetsDestroy(CUpti_EventGroupSets *eventGroupSets) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUpti_EventGroupSets *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiEventGroupSetsDestroy");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(eventGroupSets);
-}
-
-CUptiResult CUPTIAPI
-cuptiEventGroupSetEnable(CUpti_EventGroupSet *eventGroupSet) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUpti_EventGroupSet *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiEventGroupSetEnable");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(eventGroupSet);
-}
-
-CUptiResult CUPTIAPI
-cuptiEventGroupSetDisable(CUpti_EventGroupSet *eventGroupSet) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUpti_EventGroupSet *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiEventGroupSetDisable");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(eventGroupSet);
-}
-
-CUptiResult CUPTIAPI cuptiEnableKernelReplayMode(CUcontext context) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUcontext);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiEnableKernelReplayMode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(context);
-}
-
-CUptiResult CUPTIAPI cuptiDisableKernelReplayMode(CUcontext context) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUcontext);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiDisableKernelReplayMode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(context);
-}
-
-CUptiResult CUPTIAPI cuptiKernelReplaySubscribeUpdate(
-    CUpti_KernelReplayUpdateFunc updateFunc, void *customData) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUpti_KernelReplayUpdateFunc, void *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cuptiKernelReplaySubscribeUpdate");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(updateFunc, customData);
-}
-
-CUptiResult CUPTIAPI cuptiGetNumMetrics(uint32_t *numMetrics) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(uint32_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiGetNumMetrics");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(numMetrics);
-}
-
-CUptiResult CUPTIAPI cuptiEnumMetrics(size_t *arraySizeBytes,
-                                      CUpti_MetricID *metricArray) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(size_t *, CUpti_MetricID *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiEnumMetrics");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(arraySizeBytes, metricArray);
-}
-
-CUptiResult CUPTIAPI cuptiDeviceGetNumMetrics(CUdevice device,
-                                              uint32_t *numMetrics) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUdevice, uint32_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiDeviceGetNumMetrics");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(device, numMetrics);
-}
-
-CUptiResult CUPTIAPI cuptiDeviceEnumMetrics(CUdevice device,
-                                            size_t *arraySizeBytes,
-                                            CUpti_MetricID *metricArray) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUdevice, size_t *, CUpti_MetricID *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiDeviceEnumMetrics");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(device, arraySizeBytes, metricArray);
-}
-
-CUptiResult CUPTIAPI cuptiMetricGetAttribute(CUpti_MetricID metric,
-                                             CUpti_MetricAttribute attrib,
-                                             size_t *valueSize, void *value) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUpti_MetricID, CUpti_MetricAttribute,
-                                          size_t *, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiMetricGetAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(metric, attrib, valueSize, value);
-}
-
-CUptiResult CUPTIAPI cuptiMetricGetIdFromName(CUdevice device,
-                                              const char *metricName,
-                                              CUpti_MetricID *metric) {
-  using FuncPtr =
-      CUptiResult(CUPTIAPI *)(CUdevice, const char *, CUpti_MetricID *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiMetricGetIdFromName");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(device, metricName, metric);
-}
-
-CUptiResult CUPTIAPI cuptiMetricGetNumEvents(CUpti_MetricID metric,
-                                             uint32_t *numEvents) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUpti_MetricID, uint32_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiMetricGetNumEvents");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(metric, numEvents);
-}
-
-CUptiResult CUPTIAPI cuptiMetricEnumEvents(CUpti_MetricID metric,
-                                           size_t *eventIdArraySizeBytes,
-                                           CUpti_EventID *eventIdArray) {
-  using FuncPtr =
-      CUptiResult(CUPTIAPI *)(CUpti_MetricID, size_t *, CUpti_EventID *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiMetricEnumEvents");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(metric, eventIdArraySizeBytes, eventIdArray);
-}
-
-CUptiResult CUPTIAPI cuptiMetricGetNumProperties(CUpti_MetricID metric,
-                                                 uint32_t *numProp) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUpti_MetricID, uint32_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiMetricGetNumProperties");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(metric, numProp);
-}
-
-CUptiResult CUPTIAPI
-cuptiMetricEnumProperties(CUpti_MetricID metric, size_t *propIdArraySizeBytes,
-                          CUpti_MetricPropertyID *propIdArray) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUpti_MetricID, size_t *,
-                                          CUpti_MetricPropertyID *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiMetricEnumProperties");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(metric, propIdArraySizeBytes, propIdArray);
-}
-
-CUptiResult CUPTIAPI
-cuptiMetricGetRequiredEventGroupSets(CUcontext context, CUpti_MetricID metric,
-                                     CUpti_EventGroupSets **eventGroupSets) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUcontext, CUpti_MetricID,
-                                          CUpti_EventGroupSets **);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cuptiMetricGetRequiredEventGroupSets");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(context, metric, eventGroupSets);
-}
-
-CUptiResult CUPTIAPI cuptiMetricCreateEventGroupSets(
-    CUcontext context, size_t metricIdArraySizeBytes,
-    CUpti_MetricID *metricIdArray, CUpti_EventGroupSets **eventGroupPasses) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUcontext, size_t, CUpti_MetricID *,
-                                          CUpti_EventGroupSets **);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiMetricCreateEventGroupSets");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(context, metricIdArraySizeBytes, metricIdArray,
-                  eventGroupPasses);
-}
-
-CUptiResult CUPTIAPI cuptiMetricGetValue(CUdevice device, CUpti_MetricID metric,
-                                         size_t eventIdArraySizeBytes,
-                                         CUpti_EventID *eventIdArray,
-                                         size_t eventValueArraySizeBytes,
-                                         uint64_t *eventValueArray,
-                                         uint64_t timeDuration,
-                                         CUpti_MetricValue *metricValue) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUdevice, CUpti_MetricID, size_t,
-                                          CUpti_EventID *, size_t, uint64_t *,
-                                          uint64_t, CUpti_MetricValue *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiMetricGetValue");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(device, metric, eventIdArraySizeBytes, eventIdArray,
-                  eventValueArraySizeBytes, eventValueArray, timeDuration,
-                  metricValue);
-}
-
-CUptiResult CUPTIAPI cuptiMetricGetValue2(
-    CUpti_MetricID metric, size_t eventIdArraySizeBytes,
-    CUpti_EventID *eventIdArray, size_t eventValueArraySizeBytes,
-    uint64_t *eventValueArray, size_t propIdArraySizeBytes,
-    CUpti_MetricPropertyID *propIdArray, size_t propValueArraySizeBytes,
-    uint64_t *propValueArray, CUpti_MetricValue *metricValue) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(
-      CUpti_MetricID, size_t, CUpti_EventID *, size_t, uint64_t *, size_t,
-      CUpti_MetricPropertyID *, size_t, uint64_t *, CUpti_MetricValue *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiMetricGetValue2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(metric, eventIdArraySizeBytes, eventIdArray,
-                  eventValueArraySizeBytes, eventValueArray,
-                  propIdArraySizeBytes, propIdArray, propValueArraySizeBytes,
-                  propValueArray, metricValue);
-}
-
-CUptiResult CUPTIAPI cuptiGetTimestamp(uint64_t *timestamp) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(uint64_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiGetTimestamp");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(timestamp);
-}
-
-CUptiResult CUPTIAPI cuptiGetContextId(CUcontext context, uint32_t *contextId) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUcontext, uint32_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiGetContextId");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(context, contextId);
-}
-
-CUptiResult CUPTIAPI cuptiGetStreamId(CUcontext context, CUstream stream,
-                                      uint32_t *streamId) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUcontext, CUstream, uint32_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiGetStreamId");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(context, stream, streamId);
-}
-
-CUptiResult CUPTIAPI cuptiGetStreamIdEx(CUcontext context, CUstream stream,
-                                        uint8_t perThreadStream,
-                                        uint32_t *streamId) {
-  using FuncPtr =
-      CUptiResult(CUPTIAPI *)(CUcontext, CUstream, uint8_t, uint32_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiGetStreamIdEx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(context, stream, perThreadStream, streamId);
-}
-
-CUptiResult CUPTIAPI cuptiGetDeviceId(CUcontext context, uint32_t *deviceId) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUcontext, uint32_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiGetDeviceId");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(context, deviceId);
-}
-
-CUptiResult CUPTIAPI cuptiActivityEnable(CUpti_ActivityKind kind) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUpti_ActivityKind);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiActivityEnable");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(kind);
-}
-
-CUptiResult CUPTIAPI cuptiActivityDisable(CUpti_ActivityKind kind) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUpti_ActivityKind);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiActivityDisable");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(kind);
-}
-
-CUptiResult CUPTIAPI cuptiActivityEnableContext(CUcontext context,
-                                                CUpti_ActivityKind kind) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUcontext, CUpti_ActivityKind);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiActivityEnableContext");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(context, kind);
-}
-
-CUptiResult CUPTIAPI cuptiActivityDisableContext(CUcontext context,
-                                                 CUpti_ActivityKind kind) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUcontext, CUpti_ActivityKind);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiActivityDisableContext");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(context, kind);
-}
-
-CUptiResult CUPTIAPI cuptiActivityGetNumDroppedRecords(CUcontext context,
-                                                       uint32_t streamId,
-                                                       size_t *dropped) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUcontext, uint32_t, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cuptiActivityGetNumDroppedRecords");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(context, streamId, dropped);
-}
-
-CUptiResult CUPTIAPI cuptiActivityGetNextRecord(uint8_t *buffer,
-                                                size_t validBufferSizeBytes,
-                                                CUpti_Activity **record) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(uint8_t *, size_t, CUpti_Activity **);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiActivityGetNextRecord");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(buffer, validBufferSizeBytes, record);
-}
-
-CUptiResult CUPTIAPI cuptiActivityRegisterCallbacks(
-    CUpti_BuffersCallbackRequestFunc funcBufferRequested,
-    CUpti_BuffersCallbackCompleteFunc funcBufferCompleted) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUpti_BuffersCallbackRequestFunc,
-                                          CUpti_BuffersCallbackCompleteFunc);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiActivityRegisterCallbacks");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(funcBufferRequested, funcBufferCompleted);
-}
-
-CUptiResult CUPTIAPI cuptiActivityFlush(CUcontext context, uint32_t streamId,
-                                        uint32_t flag) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUcontext, uint32_t, uint32_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiActivityFlush");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(context, streamId, flag);
-}
-
-CUptiResult CUPTIAPI cuptiActivityFlushAll(uint32_t flag) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(uint32_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiActivityFlushAll");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(flag);
-}
-
-CUptiResult CUPTIAPI cuptiActivityGetAttribute(CUpti_ActivityAttribute attr,
-                                               size_t *valueSize, void *value) {
-  using FuncPtr =
-      CUptiResult(CUPTIAPI *)(CUpti_ActivityAttribute, size_t *, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiActivityGetAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(attr, valueSize, value);
-}
-
-CUptiResult CUPTIAPI cuptiActivitySetAttribute(CUpti_ActivityAttribute attr,
-                                               size_t *valueSize, void *value) {
-  using FuncPtr =
-      CUptiResult(CUPTIAPI *)(CUpti_ActivityAttribute, size_t *, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiActivitySetAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(attr, valueSize, value);
-}
-
-CUptiResult CUPTIAPI cuptiActivityConfigureUnifiedMemoryCounter(
-    CUpti_ActivityUnifiedMemoryCounterConfig *config, uint32_t count) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(
-      CUpti_ActivityUnifiedMemoryCounterConfig *, uint32_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cuptiActivityConfigureUnifiedMemoryCounter");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(config, count);
-}
-
-CUptiResult CUPTIAPI
-cuptiGetAutoBoostState(CUcontext context, CUpti_ActivityAutoBoostState *state) {
-  using FuncPtr =
-      CUptiResult(CUPTIAPI *)(CUcontext, CUpti_ActivityAutoBoostState *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiGetAutoBoostState");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(context, state);
-}
-
-CUptiResult CUPTIAPI cuptiActivityConfigurePCSampling(
-    CUcontext ctx, CUpti_ActivityPCSamplingConfig *config) {
-  using FuncPtr =
-      CUptiResult(CUPTIAPI *)(CUcontext, CUpti_ActivityPCSamplingConfig *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cuptiActivityConfigurePCSampling");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(ctx, config);
-}
-
-CUptiResult CUPTIAPI cuptiGetLastError(void) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)();
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiGetLastError");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr();
-}
-
-CUptiResult CUPTIAPI cuptiSetThreadIdType(CUpti_ActivityThreadIdType type) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUpti_ActivityThreadIdType);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiSetThreadIdType");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(type);
-}
-
-CUptiResult CUPTIAPI cuptiGetThreadIdType(CUpti_ActivityThreadIdType *type) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUpti_ActivityThreadIdType *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiGetThreadIdType");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(type);
-}
-
-CUptiResult CUPTIAPI cuptiComputeCapabilitySupported(int major, int minor,
-                                                     int *support) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(int, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiComputeCapabilitySupported");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(major, minor, support);
-}
-
-CUptiResult CUPTIAPI cuptiDeviceSupported(CUdevice dev, int *support) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUdevice, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiDeviceSupported");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dev, support);
-}
-
-CUptiResult CUPTIAPI cuptiFinalize(void) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)();
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiFinalize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr();
-}
-
-CUptiResult CUPTIAPI cuptiActivityPushExternalCorrelationId(
-    CUpti_ExternalCorrelationKind kind, uint64_t id) {
-  using FuncPtr =
-      CUptiResult(CUPTIAPI *)(CUpti_ExternalCorrelationKind, uint64_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cuptiActivityPushExternalCorrelationId");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(kind, id);
-}
-
-CUptiResult CUPTIAPI cuptiActivityPopExternalCorrelationId(
-    CUpti_ExternalCorrelationKind kind, uint64_t *lastId) {
-  using FuncPtr =
-      CUptiResult(CUPTIAPI *)(CUpti_ExternalCorrelationKind, uint64_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cuptiActivityPopExternalCorrelationId");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(kind, lastId);
-}
-
-CUptiResult CUPTIAPI cuptiActivityEnableLatencyTimestamps(uint8_t enable) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(uint8_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cuptiActivityEnableLatencyTimestamps");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(enable);
-}
-
-}  // extern "C"
diff --git a/third_party/xla/third_party/tsl/tsl/cuda/cupti_10_1.inc b/third_party/xla/third_party/tsl/tsl/cuda/cupti_10_1.inc
deleted file mode 100644
index dac0c7919342bd..00000000000000
--- a/third_party/xla/third_party/tsl/tsl/cuda/cupti_10_1.inc
+++ /dev/null
@@ -1,763 +0,0 @@
-// Auto-generated, do not edit.
-
-extern "C" {
-
-CUptiResult CUPTIAPI cuptiGetResultString(CUptiResult result,
-                                          const char **str) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUptiResult, const char **);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiGetResultString");
-  if (!func_ptr) {
-    if (str) {
-      *str = "CUPTI could not be loaded or symbol could not be found.";
-    }
-    return GetSymbolNotFoundError();
-  }
-  return func_ptr(result, str);
-}
-
-CUptiResult CUPTIAPI cuptiGetVersion(uint32_t *version) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(uint32_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiGetVersion");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(version);
-}
-
-CUptiResult CUPTIAPI cuptiSupportedDomains(size_t *domainCount,
-                                           CUpti_DomainTable *domainTable) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(size_t *, CUpti_DomainTable *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiSupportedDomains");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(domainCount, domainTable);
-}
-
-CUptiResult CUPTIAPI cuptiSubscribe(CUpti_SubscriberHandle *subscriber,
-                                    CUpti_CallbackFunc callback,
-                                    void *userdata) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUpti_SubscriberHandle *,
-                                          CUpti_CallbackFunc, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiSubscribe");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(subscriber, callback, userdata);
-}
-
-CUptiResult CUPTIAPI cuptiUnsubscribe(CUpti_SubscriberHandle subscriber) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUpti_SubscriberHandle);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiUnsubscribe");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(subscriber);
-}
-
-CUptiResult CUPTIAPI cuptiGetCallbackState(uint32_t *enable,
-                                           CUpti_SubscriberHandle subscriber,
-                                           CUpti_CallbackDomain domain,
-                                           CUpti_CallbackId cbid) {
-  using FuncPtr =
-      CUptiResult(CUPTIAPI *)(uint32_t *, CUpti_SubscriberHandle,
-                              CUpti_CallbackDomain, CUpti_CallbackId);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiGetCallbackState");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(enable, subscriber, domain, cbid);
-}
-
-CUptiResult CUPTIAPI cuptiEnableCallback(uint32_t enable,
-                                         CUpti_SubscriberHandle subscriber,
-                                         CUpti_CallbackDomain domain,
-                                         CUpti_CallbackId cbid) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(
-      uint32_t, CUpti_SubscriberHandle, CUpti_CallbackDomain, CUpti_CallbackId);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiEnableCallback");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(enable, subscriber, domain, cbid);
-}
-
-CUptiResult CUPTIAPI cuptiEnableDomain(uint32_t enable,
-                                       CUpti_SubscriberHandle subscriber,
-                                       CUpti_CallbackDomain domain) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(uint32_t, CUpti_SubscriberHandle,
-                                          CUpti_CallbackDomain);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiEnableDomain");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(enable, subscriber, domain);
-}
-
-CUptiResult CUPTIAPI cuptiEnableAllDomains(uint32_t enable,
-                                           CUpti_SubscriberHandle subscriber) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(uint32_t, CUpti_SubscriberHandle);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiEnableAllDomains");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(enable, subscriber);
-}
-
-CUptiResult CUPTIAPI cuptiGetCallbackName(CUpti_CallbackDomain domain,
-                                          uint32_t cbid, const char **name) {
-  using FuncPtr =
-      CUptiResult(CUPTIAPI *)(CUpti_CallbackDomain, uint32_t, const char **);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiGetCallbackName");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(domain, cbid, name);
-}
-
-CUptiResult CUPTIAPI
-cuptiSetEventCollectionMode(CUcontext context, CUpti_EventCollectionMode mode) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUcontext, CUpti_EventCollectionMode);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiSetEventCollectionMode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(context, mode);
-}
-
-CUptiResult CUPTIAPI cuptiDeviceGetAttribute(CUdevice device,
-                                             CUpti_DeviceAttribute attrib,
-                                             size_t *valueSize, void *value) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUdevice, CUpti_DeviceAttribute,
-                                          size_t *, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiDeviceGetAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(device, attrib, valueSize, value);
-}
-
-CUptiResult CUPTIAPI cuptiDeviceGetTimestamp(CUcontext context,
-                                             uint64_t *timestamp) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUcontext, uint64_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiDeviceGetTimestamp");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(context, timestamp);
-}
-
-CUptiResult CUPTIAPI cuptiDeviceGetNumEventDomains(CUdevice device,
-                                                   uint32_t *numDomains) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUdevice, uint32_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiDeviceGetNumEventDomains");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(device, numDomains);
-}
-
-CUptiResult CUPTIAPI cuptiDeviceEnumEventDomains(
-    CUdevice device, size_t *arraySizeBytes, CUpti_EventDomainID *domainArray) {
-  using FuncPtr =
-      CUptiResult(CUPTIAPI *)(CUdevice, size_t *, CUpti_EventDomainID *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiDeviceEnumEventDomains");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(device, arraySizeBytes, domainArray);
-}
-
-CUptiResult CUPTIAPI cuptiDeviceGetEventDomainAttribute(
-    CUdevice device, CUpti_EventDomainID eventDomain,
-    CUpti_EventDomainAttribute attrib, size_t *valueSize, void *value) {
-  using FuncPtr =
-      CUptiResult(CUPTIAPI *)(CUdevice, CUpti_EventDomainID,
-                              CUpti_EventDomainAttribute, size_t *, void *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cuptiDeviceGetEventDomainAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(device, eventDomain, attrib, valueSize, value);
-}
-
-CUptiResult CUPTIAPI cuptiGetNumEventDomains(uint32_t *numDomains) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(uint32_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiGetNumEventDomains");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(numDomains);
-}
-
-CUptiResult CUPTIAPI cuptiEnumEventDomains(size_t *arraySizeBytes,
-                                           CUpti_EventDomainID *domainArray) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(size_t *, CUpti_EventDomainID *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiEnumEventDomains");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(arraySizeBytes, domainArray);
-}
-
-CUptiResult CUPTIAPI cuptiEventDomainGetAttribute(
-    CUpti_EventDomainID eventDomain, CUpti_EventDomainAttribute attrib,
-    size_t *valueSize, void *value) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(
-      CUpti_EventDomainID, CUpti_EventDomainAttribute, size_t *, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiEventDomainGetAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(eventDomain, attrib, valueSize, value);
-}
-
-CUptiResult CUPTIAPI cuptiEventDomainGetNumEvents(
-    CUpti_EventDomainID eventDomain, uint32_t *numEvents) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUpti_EventDomainID, uint32_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiEventDomainGetNumEvents");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(eventDomain, numEvents);
-}
-
-CUptiResult CUPTIAPI cuptiEventDomainEnumEvents(CUpti_EventDomainID eventDomain,
-                                                size_t *arraySizeBytes,
-                                                CUpti_EventID *eventArray) {
-  using FuncPtr =
-      CUptiResult(CUPTIAPI *)(CUpti_EventDomainID, size_t *, CUpti_EventID *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiEventDomainEnumEvents");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(eventDomain, arraySizeBytes, eventArray);
-}
-
-CUptiResult CUPTIAPI cuptiEventGetAttribute(CUpti_EventID event,
-                                            CUpti_EventAttribute attrib,
-                                            size_t *valueSize, void *value) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUpti_EventID, CUpti_EventAttribute,
-                                          size_t *, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiEventGetAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(event, attrib, valueSize, value);
-}
-
-CUptiResult CUPTIAPI cuptiEventGetIdFromName(CUdevice device,
-                                             const char *eventName,
-                                             CUpti_EventID *event) {
-  using FuncPtr =
-      CUptiResult(CUPTIAPI *)(CUdevice, const char *, CUpti_EventID *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiEventGetIdFromName");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(device, eventName, event);
-}
-
-CUptiResult CUPTIAPI cuptiEventGroupCreate(CUcontext context,
-                                           CUpti_EventGroup *eventGroup,
-                                           uint32_t flags) {
-  using FuncPtr =
-      CUptiResult(CUPTIAPI *)(CUcontext, CUpti_EventGroup *, uint32_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiEventGroupCreate");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(context, eventGroup, flags);
-}
-
-CUptiResult CUPTIAPI cuptiEventGroupDestroy(CUpti_EventGroup eventGroup) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUpti_EventGroup);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiEventGroupDestroy");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(eventGroup);
-}
-
-CUptiResult CUPTIAPI cuptiEventGroupGetAttribute(
-    CUpti_EventGroup eventGroup, CUpti_EventGroupAttribute attrib,
-    size_t *valueSize, void *value) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(
-      CUpti_EventGroup, CUpti_EventGroupAttribute, size_t *, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiEventGroupGetAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(eventGroup, attrib, valueSize, value);
-}
-
-CUptiResult CUPTIAPI cuptiEventGroupSetAttribute(
-    CUpti_EventGroup eventGroup, CUpti_EventGroupAttribute attrib,
-    size_t valueSize, void *value) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(
-      CUpti_EventGroup, CUpti_EventGroupAttribute, size_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiEventGroupSetAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(eventGroup, attrib, valueSize, value);
-}
-
-CUptiResult CUPTIAPI cuptiEventGroupAddEvent(CUpti_EventGroup eventGroup,
-                                             CUpti_EventID event) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUpti_EventGroup, CUpti_EventID);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiEventGroupAddEvent");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(eventGroup, event);
-}
-
-CUptiResult CUPTIAPI cuptiEventGroupRemoveEvent(CUpti_EventGroup eventGroup,
-                                                CUpti_EventID event) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUpti_EventGroup, CUpti_EventID);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiEventGroupRemoveEvent");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(eventGroup, event);
-}
-
-CUptiResult CUPTIAPI
-cuptiEventGroupRemoveAllEvents(CUpti_EventGroup eventGroup) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUpti_EventGroup);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiEventGroupRemoveAllEvents");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(eventGroup);
-}
-
-CUptiResult CUPTIAPI
-cuptiEventGroupResetAllEvents(CUpti_EventGroup eventGroup) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUpti_EventGroup);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiEventGroupResetAllEvents");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(eventGroup);
-}
-
-CUptiResult CUPTIAPI cuptiEventGroupEnable(CUpti_EventGroup eventGroup) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUpti_EventGroup);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiEventGroupEnable");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(eventGroup);
-}
-
-CUptiResult CUPTIAPI cuptiEventGroupDisable(CUpti_EventGroup eventGroup) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUpti_EventGroup);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiEventGroupDisable");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(eventGroup);
-}
-
-CUptiResult CUPTIAPI cuptiEventGroupReadEvent(CUpti_EventGroup eventGroup,
-                                              CUpti_ReadEventFlags flags,
-                                              CUpti_EventID event,
-                                              size_t *eventValueBufferSizeBytes,
-                                              uint64_t *eventValueBuffer) {
-  using FuncPtr =
-      CUptiResult(CUPTIAPI *)(CUpti_EventGroup, CUpti_ReadEventFlags,
-                              CUpti_EventID, size_t *, uint64_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiEventGroupReadEvent");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(eventGroup, flags, event, eventValueBufferSizeBytes,
-                  eventValueBuffer);
-}
-
-CUptiResult CUPTIAPI cuptiEventGroupReadAllEvents(
-    CUpti_EventGroup eventGroup, CUpti_ReadEventFlags flags,
-    size_t *eventValueBufferSizeBytes, uint64_t *eventValueBuffer,
-    size_t *eventIdArraySizeBytes, CUpti_EventID *eventIdArray,
-    size_t *numEventIdsRead) {
-  using FuncPtr =
-      CUptiResult(CUPTIAPI *)(CUpti_EventGroup, CUpti_ReadEventFlags, size_t *,
-                              uint64_t *, size_t *, CUpti_EventID *, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiEventGroupReadAllEvents");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(eventGroup, flags, eventValueBufferSizeBytes,
-                  eventValueBuffer, eventIdArraySizeBytes, eventIdArray,
-                  numEventIdsRead);
-}
-
-CUptiResult CUPTIAPI cuptiEventGroupSetsCreate(
-    CUcontext context, size_t eventIdArraySizeBytes,
-    CUpti_EventID *eventIdArray, CUpti_EventGroupSets **eventGroupPasses) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUcontext, size_t, CUpti_EventID *,
-                                          CUpti_EventGroupSets **);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiEventGroupSetsCreate");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(context, eventIdArraySizeBytes, eventIdArray,
-                  eventGroupPasses);
-}
-
-CUptiResult CUPTIAPI
-cuptiEventGroupSetsDestroy(CUpti_EventGroupSets *eventGroupSets) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUpti_EventGroupSets *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiEventGroupSetsDestroy");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(eventGroupSets);
-}
-
-CUptiResult CUPTIAPI
-cuptiEventGroupSetEnable(CUpti_EventGroupSet *eventGroupSet) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUpti_EventGroupSet *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiEventGroupSetEnable");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(eventGroupSet);
-}
-
-CUptiResult CUPTIAPI
-cuptiEventGroupSetDisable(CUpti_EventGroupSet *eventGroupSet) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUpti_EventGroupSet *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiEventGroupSetDisable");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(eventGroupSet);
-}
-
-CUptiResult CUPTIAPI cuptiEnableKernelReplayMode(CUcontext context) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUcontext);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiEnableKernelReplayMode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(context);
-}
-
-CUptiResult CUPTIAPI cuptiDisableKernelReplayMode(CUcontext context) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUcontext);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiDisableKernelReplayMode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(context);
-}
-
-CUptiResult CUPTIAPI cuptiKernelReplaySubscribeUpdate(
-    CUpti_KernelReplayUpdateFunc updateFunc, void *customData) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUpti_KernelReplayUpdateFunc, void *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cuptiKernelReplaySubscribeUpdate");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(updateFunc, customData);
-}
-
-CUptiResult CUPTIAPI cuptiGetNumMetrics(uint32_t *numMetrics) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(uint32_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiGetNumMetrics");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(numMetrics);
-}
-
-CUptiResult CUPTIAPI cuptiEnumMetrics(size_t *arraySizeBytes,
-                                      CUpti_MetricID *metricArray) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(size_t *, CUpti_MetricID *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiEnumMetrics");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(arraySizeBytes, metricArray);
-}
-
-CUptiResult CUPTIAPI cuptiDeviceGetNumMetrics(CUdevice device,
-                                              uint32_t *numMetrics) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUdevice, uint32_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiDeviceGetNumMetrics");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(device, numMetrics);
-}
-
-CUptiResult CUPTIAPI cuptiDeviceEnumMetrics(CUdevice device,
-                                            size_t *arraySizeBytes,
-                                            CUpti_MetricID *metricArray) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUdevice, size_t *, CUpti_MetricID *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiDeviceEnumMetrics");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(device, arraySizeBytes, metricArray);
-}
-
-CUptiResult CUPTIAPI cuptiMetricGetAttribute(CUpti_MetricID metric,
-                                             CUpti_MetricAttribute attrib,
-                                             size_t *valueSize, void *value) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUpti_MetricID, CUpti_MetricAttribute,
-                                          size_t *, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiMetricGetAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(metric, attrib, valueSize, value);
-}
-
-CUptiResult CUPTIAPI cuptiMetricGetIdFromName(CUdevice device,
-                                              const char *metricName,
-                                              CUpti_MetricID *metric) {
-  using FuncPtr =
-      CUptiResult(CUPTIAPI *)(CUdevice, const char *, CUpti_MetricID *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiMetricGetIdFromName");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(device, metricName, metric);
-}
-
-CUptiResult CUPTIAPI cuptiMetricGetNumEvents(CUpti_MetricID metric,
-                                             uint32_t *numEvents) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUpti_MetricID, uint32_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiMetricGetNumEvents");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(metric, numEvents);
-}
-
-CUptiResult CUPTIAPI cuptiMetricEnumEvents(CUpti_MetricID metric,
-                                           size_t *eventIdArraySizeBytes,
-                                           CUpti_EventID *eventIdArray) {
-  using FuncPtr =
-      CUptiResult(CUPTIAPI *)(CUpti_MetricID, size_t *, CUpti_EventID *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiMetricEnumEvents");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(metric, eventIdArraySizeBytes, eventIdArray);
-}
-
-CUptiResult CUPTIAPI cuptiMetricGetNumProperties(CUpti_MetricID metric,
-                                                 uint32_t *numProp) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUpti_MetricID, uint32_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiMetricGetNumProperties");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(metric, numProp);
-}
-
-CUptiResult CUPTIAPI
-cuptiMetricEnumProperties(CUpti_MetricID metric, size_t *propIdArraySizeBytes,
-                          CUpti_MetricPropertyID *propIdArray) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUpti_MetricID, size_t *,
-                                          CUpti_MetricPropertyID *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiMetricEnumProperties");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(metric, propIdArraySizeBytes, propIdArray);
-}
-
-CUptiResult CUPTIAPI
-cuptiMetricGetRequiredEventGroupSets(CUcontext context, CUpti_MetricID metric,
-                                     CUpti_EventGroupSets **eventGroupSets) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUcontext, CUpti_MetricID,
-                                          CUpti_EventGroupSets **);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cuptiMetricGetRequiredEventGroupSets");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(context, metric, eventGroupSets);
-}
-
-CUptiResult CUPTIAPI cuptiMetricCreateEventGroupSets(
-    CUcontext context, size_t metricIdArraySizeBytes,
-    CUpti_MetricID *metricIdArray, CUpti_EventGroupSets **eventGroupPasses) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUcontext, size_t, CUpti_MetricID *,
-                                          CUpti_EventGroupSets **);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiMetricCreateEventGroupSets");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(context, metricIdArraySizeBytes, metricIdArray,
-                  eventGroupPasses);
-}
-
-CUptiResult CUPTIAPI cuptiMetricGetValue(CUdevice device, CUpti_MetricID metric,
-                                         size_t eventIdArraySizeBytes,
-                                         CUpti_EventID *eventIdArray,
-                                         size_t eventValueArraySizeBytes,
-                                         uint64_t *eventValueArray,
-                                         uint64_t timeDuration,
-                                         CUpti_MetricValue *metricValue) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUdevice, CUpti_MetricID, size_t,
-                                          CUpti_EventID *, size_t, uint64_t *,
-                                          uint64_t, CUpti_MetricValue *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiMetricGetValue");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(device, metric, eventIdArraySizeBytes, eventIdArray,
-                  eventValueArraySizeBytes, eventValueArray, timeDuration,
-                  metricValue);
-}
-
-CUptiResult CUPTIAPI cuptiMetricGetValue2(
-    CUpti_MetricID metric, size_t eventIdArraySizeBytes,
-    CUpti_EventID *eventIdArray, size_t eventValueArraySizeBytes,
-    uint64_t *eventValueArray, size_t propIdArraySizeBytes,
-    CUpti_MetricPropertyID *propIdArray, size_t propValueArraySizeBytes,
-    uint64_t *propValueArray, CUpti_MetricValue *metricValue) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(
-      CUpti_MetricID, size_t, CUpti_EventID *, size_t, uint64_t *, size_t,
-      CUpti_MetricPropertyID *, size_t, uint64_t *, CUpti_MetricValue *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiMetricGetValue2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(metric, eventIdArraySizeBytes, eventIdArray,
-                  eventValueArraySizeBytes, eventValueArray,
-                  propIdArraySizeBytes, propIdArray, propValueArraySizeBytes,
-                  propValueArray, metricValue);
-}
-
-CUptiResult CUPTIAPI cuptiGetTimestamp(uint64_t *timestamp) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(uint64_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiGetTimestamp");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(timestamp);
-}
-
-CUptiResult CUPTIAPI cuptiGetContextId(CUcontext context, uint32_t *contextId) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUcontext, uint32_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiGetContextId");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(context, contextId);
-}
-
-CUptiResult CUPTIAPI cuptiGetStreamId(CUcontext context, CUstream stream,
-                                      uint32_t *streamId) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUcontext, CUstream, uint32_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiGetStreamId");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(context, stream, streamId);
-}
-
-CUptiResult CUPTIAPI cuptiGetStreamIdEx(CUcontext context, CUstream stream,
-                                        uint8_t perThreadStream,
-                                        uint32_t *streamId) {
-  using FuncPtr =
-      CUptiResult(CUPTIAPI *)(CUcontext, CUstream, uint8_t, uint32_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiGetStreamIdEx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(context, stream, perThreadStream, streamId);
-}
-
-CUptiResult CUPTIAPI cuptiGetDeviceId(CUcontext context, uint32_t *deviceId) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUcontext, uint32_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiGetDeviceId");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(context, deviceId);
-}
-
-CUptiResult CUPTIAPI cuptiActivityEnable(CUpti_ActivityKind kind) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUpti_ActivityKind);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiActivityEnable");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(kind);
-}
-
-CUptiResult CUPTIAPI cuptiActivityDisable(CUpti_ActivityKind kind) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUpti_ActivityKind);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiActivityDisable");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(kind);
-}
-
-CUptiResult CUPTIAPI cuptiActivityEnableContext(CUcontext context,
-                                                CUpti_ActivityKind kind) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUcontext, CUpti_ActivityKind);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiActivityEnableContext");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(context, kind);
-}
-
-CUptiResult CUPTIAPI cuptiActivityDisableContext(CUcontext context,
-                                                 CUpti_ActivityKind kind) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUcontext, CUpti_ActivityKind);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiActivityDisableContext");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(context, kind);
-}
-
-CUptiResult CUPTIAPI cuptiActivityGetNumDroppedRecords(CUcontext context,
-                                                       uint32_t streamId,
-                                                       size_t *dropped) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUcontext, uint32_t, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cuptiActivityGetNumDroppedRecords");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(context, streamId, dropped);
-}
-
-CUptiResult CUPTIAPI cuptiActivityGetNextRecord(uint8_t *buffer,
-                                                size_t validBufferSizeBytes,
-                                                CUpti_Activity **record) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(uint8_t *, size_t, CUpti_Activity **);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiActivityGetNextRecord");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(buffer, validBufferSizeBytes, record);
-}
-
-CUptiResult CUPTIAPI cuptiActivityRegisterCallbacks(
-    CUpti_BuffersCallbackRequestFunc funcBufferRequested,
-    CUpti_BuffersCallbackCompleteFunc funcBufferCompleted) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUpti_BuffersCallbackRequestFunc,
-                                          CUpti_BuffersCallbackCompleteFunc);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiActivityRegisterCallbacks");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(funcBufferRequested, funcBufferCompleted);
-}
-
-CUptiResult CUPTIAPI cuptiActivityFlush(CUcontext context, uint32_t streamId,
-                                        uint32_t flag) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUcontext, uint32_t, uint32_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiActivityFlush");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(context, streamId, flag);
-}
-
-CUptiResult CUPTIAPI cuptiActivityFlushAll(uint32_t flag) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(uint32_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiActivityFlushAll");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(flag);
-}
-
-CUptiResult CUPTIAPI cuptiActivityGetAttribute(CUpti_ActivityAttribute attr,
-                                               size_t *valueSize, void *value) {
-  using FuncPtr =
-      CUptiResult(CUPTIAPI *)(CUpti_ActivityAttribute, size_t *, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiActivityGetAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(attr, valueSize, value);
-}
-
-CUptiResult CUPTIAPI cuptiActivitySetAttribute(CUpti_ActivityAttribute attr,
-                                               size_t *valueSize, void *value) {
-  using FuncPtr =
-      CUptiResult(CUPTIAPI *)(CUpti_ActivityAttribute, size_t *, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiActivitySetAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(attr, valueSize, value);
-}
-
-CUptiResult CUPTIAPI cuptiActivityConfigureUnifiedMemoryCounter(
-    CUpti_ActivityUnifiedMemoryCounterConfig *config, uint32_t count) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(
-      CUpti_ActivityUnifiedMemoryCounterConfig *, uint32_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cuptiActivityConfigureUnifiedMemoryCounter");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(config, count);
-}
-
-CUptiResult CUPTIAPI
-cuptiGetAutoBoostState(CUcontext context, CUpti_ActivityAutoBoostState *state) {
-  using FuncPtr =
-      CUptiResult(CUPTIAPI *)(CUcontext, CUpti_ActivityAutoBoostState *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiGetAutoBoostState");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(context, state);
-}
-
-CUptiResult CUPTIAPI cuptiActivityConfigurePCSampling(
-    CUcontext ctx, CUpti_ActivityPCSamplingConfig *config) {
-  using FuncPtr =
-      CUptiResult(CUPTIAPI *)(CUcontext, CUpti_ActivityPCSamplingConfig *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cuptiActivityConfigurePCSampling");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(ctx, config);
-}
-
-CUptiResult CUPTIAPI cuptiGetLastError(void) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)();
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiGetLastError");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr();
-}
-
-CUptiResult CUPTIAPI cuptiSetThreadIdType(CUpti_ActivityThreadIdType type) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUpti_ActivityThreadIdType);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiSetThreadIdType");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(type);
-}
-
-CUptiResult CUPTIAPI cuptiGetThreadIdType(CUpti_ActivityThreadIdType *type) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUpti_ActivityThreadIdType *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiGetThreadIdType");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(type);
-}
-
-CUptiResult CUPTIAPI cuptiComputeCapabilitySupported(int major, int minor,
-                                                     int *support) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(int, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiComputeCapabilitySupported");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(major, minor, support);
-}
-
-CUptiResult CUPTIAPI cuptiDeviceSupported(CUdevice dev, int *support) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUdevice, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiDeviceSupported");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dev, support);
-}
-
-CUptiResult CUPTIAPI cuptiFinalize(void) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)();
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiFinalize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr();
-}
-
-CUptiResult CUPTIAPI cuptiActivityPushExternalCorrelationId(
-    CUpti_ExternalCorrelationKind kind, uint64_t id) {
-  using FuncPtr =
-      CUptiResult(CUPTIAPI *)(CUpti_ExternalCorrelationKind, uint64_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cuptiActivityPushExternalCorrelationId");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(kind, id);
-}
-
-CUptiResult CUPTIAPI cuptiActivityPopExternalCorrelationId(
-    CUpti_ExternalCorrelationKind kind, uint64_t *lastId) {
-  using FuncPtr =
-      CUptiResult(CUPTIAPI *)(CUpti_ExternalCorrelationKind, uint64_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cuptiActivityPopExternalCorrelationId");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(kind, lastId);
-}
-
-CUptiResult CUPTIAPI cuptiActivityEnableLatencyTimestamps(uint8_t enable) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(uint8_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cuptiActivityEnableLatencyTimestamps");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(enable);
-}
-
-}  // extern "C"
diff --git a/third_party/xla/third_party/tsl/tsl/cuda/cupti_10_2.inc b/third_party/xla/third_party/tsl/tsl/cuda/cupti_10_2.inc
deleted file mode 100644
index dac0c7919342bd..00000000000000
--- a/third_party/xla/third_party/tsl/tsl/cuda/cupti_10_2.inc
+++ /dev/null
@@ -1,763 +0,0 @@
-// Auto-generated, do not edit.
-
-extern "C" {
-
-CUptiResult CUPTIAPI cuptiGetResultString(CUptiResult result,
-                                          const char **str) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUptiResult, const char **);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiGetResultString");
-  if (!func_ptr) {
-    if (str) {
-      *str = "CUPTI could not be loaded or symbol could not be found.";
-    }
-    return GetSymbolNotFoundError();
-  }
-  return func_ptr(result, str);
-}
-
-CUptiResult CUPTIAPI cuptiGetVersion(uint32_t *version) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(uint32_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiGetVersion");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(version);
-}
-
-CUptiResult CUPTIAPI cuptiSupportedDomains(size_t *domainCount,
-                                           CUpti_DomainTable *domainTable) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(size_t *, CUpti_DomainTable *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiSupportedDomains");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(domainCount, domainTable);
-}
-
-CUptiResult CUPTIAPI cuptiSubscribe(CUpti_SubscriberHandle *subscriber,
-                                    CUpti_CallbackFunc callback,
-                                    void *userdata) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUpti_SubscriberHandle *,
-                                          CUpti_CallbackFunc, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiSubscribe");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(subscriber, callback, userdata);
-}
-
-CUptiResult CUPTIAPI cuptiUnsubscribe(CUpti_SubscriberHandle subscriber) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUpti_SubscriberHandle);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiUnsubscribe");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(subscriber);
-}
-
-CUptiResult CUPTIAPI cuptiGetCallbackState(uint32_t *enable,
-                                           CUpti_SubscriberHandle subscriber,
-                                           CUpti_CallbackDomain domain,
-                                           CUpti_CallbackId cbid) {
-  using FuncPtr =
-      CUptiResult(CUPTIAPI *)(uint32_t *, CUpti_SubscriberHandle,
-                              CUpti_CallbackDomain, CUpti_CallbackId);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiGetCallbackState");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(enable, subscriber, domain, cbid);
-}
-
-CUptiResult CUPTIAPI cuptiEnableCallback(uint32_t enable,
-                                         CUpti_SubscriberHandle subscriber,
-                                         CUpti_CallbackDomain domain,
-                                         CUpti_CallbackId cbid) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(
-      uint32_t, CUpti_SubscriberHandle, CUpti_CallbackDomain, CUpti_CallbackId);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiEnableCallback");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(enable, subscriber, domain, cbid);
-}
-
-CUptiResult CUPTIAPI cuptiEnableDomain(uint32_t enable,
-                                       CUpti_SubscriberHandle subscriber,
-                                       CUpti_CallbackDomain domain) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(uint32_t, CUpti_SubscriberHandle,
-                                          CUpti_CallbackDomain);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiEnableDomain");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(enable, subscriber, domain);
-}
-
-CUptiResult CUPTIAPI cuptiEnableAllDomains(uint32_t enable,
-                                           CUpti_SubscriberHandle subscriber) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(uint32_t, CUpti_SubscriberHandle);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiEnableAllDomains");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(enable, subscriber);
-}
-
-CUptiResult CUPTIAPI cuptiGetCallbackName(CUpti_CallbackDomain domain,
-                                          uint32_t cbid, const char **name) {
-  using FuncPtr =
-      CUptiResult(CUPTIAPI *)(CUpti_CallbackDomain, uint32_t, const char **);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiGetCallbackName");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(domain, cbid, name);
-}
-
-CUptiResult CUPTIAPI
-cuptiSetEventCollectionMode(CUcontext context, CUpti_EventCollectionMode mode) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUcontext, CUpti_EventCollectionMode);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiSetEventCollectionMode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(context, mode);
-}
-
-CUptiResult CUPTIAPI cuptiDeviceGetAttribute(CUdevice device,
-                                             CUpti_DeviceAttribute attrib,
-                                             size_t *valueSize, void *value) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUdevice, CUpti_DeviceAttribute,
-                                          size_t *, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiDeviceGetAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(device, attrib, valueSize, value);
-}
-
-CUptiResult CUPTIAPI cuptiDeviceGetTimestamp(CUcontext context,
-                                             uint64_t *timestamp) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUcontext, uint64_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiDeviceGetTimestamp");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(context, timestamp);
-}
-
-CUptiResult CUPTIAPI cuptiDeviceGetNumEventDomains(CUdevice device,
-                                                   uint32_t *numDomains) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUdevice, uint32_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiDeviceGetNumEventDomains");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(device, numDomains);
-}
-
-CUptiResult CUPTIAPI cuptiDeviceEnumEventDomains(
-    CUdevice device, size_t *arraySizeBytes, CUpti_EventDomainID *domainArray) {
-  using FuncPtr =
-      CUptiResult(CUPTIAPI *)(CUdevice, size_t *, CUpti_EventDomainID *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiDeviceEnumEventDomains");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(device, arraySizeBytes, domainArray);
-}
-
-CUptiResult CUPTIAPI cuptiDeviceGetEventDomainAttribute(
-    CUdevice device, CUpti_EventDomainID eventDomain,
-    CUpti_EventDomainAttribute attrib, size_t *valueSize, void *value) {
-  using FuncPtr =
-      CUptiResult(CUPTIAPI *)(CUdevice, CUpti_EventDomainID,
-                              CUpti_EventDomainAttribute, size_t *, void *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cuptiDeviceGetEventDomainAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(device, eventDomain, attrib, valueSize, value);
-}
-
-CUptiResult CUPTIAPI cuptiGetNumEventDomains(uint32_t *numDomains) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(uint32_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiGetNumEventDomains");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(numDomains);
-}
-
-CUptiResult CUPTIAPI cuptiEnumEventDomains(size_t *arraySizeBytes,
-                                           CUpti_EventDomainID *domainArray) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(size_t *, CUpti_EventDomainID *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiEnumEventDomains");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(arraySizeBytes, domainArray);
-}
-
-CUptiResult CUPTIAPI cuptiEventDomainGetAttribute(
-    CUpti_EventDomainID eventDomain, CUpti_EventDomainAttribute attrib,
-    size_t *valueSize, void *value) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(
-      CUpti_EventDomainID, CUpti_EventDomainAttribute, size_t *, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiEventDomainGetAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(eventDomain, attrib, valueSize, value);
-}
-
-CUptiResult CUPTIAPI cuptiEventDomainGetNumEvents(
-    CUpti_EventDomainID eventDomain, uint32_t *numEvents) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUpti_EventDomainID, uint32_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiEventDomainGetNumEvents");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(eventDomain, numEvents);
-}
-
-CUptiResult CUPTIAPI cuptiEventDomainEnumEvents(CUpti_EventDomainID eventDomain,
-                                                size_t *arraySizeBytes,
-                                                CUpti_EventID *eventArray) {
-  using FuncPtr =
-      CUptiResult(CUPTIAPI *)(CUpti_EventDomainID, size_t *, CUpti_EventID *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiEventDomainEnumEvents");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(eventDomain, arraySizeBytes, eventArray);
-}
-
-CUptiResult CUPTIAPI cuptiEventGetAttribute(CUpti_EventID event,
-                                            CUpti_EventAttribute attrib,
-                                            size_t *valueSize, void *value) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUpti_EventID, CUpti_EventAttribute,
-                                          size_t *, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiEventGetAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(event, attrib, valueSize, value);
-}
-
-CUptiResult CUPTIAPI cuptiEventGetIdFromName(CUdevice device,
-                                             const char *eventName,
-                                             CUpti_EventID *event) {
-  using FuncPtr =
-      CUptiResult(CUPTIAPI *)(CUdevice, const char *, CUpti_EventID *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiEventGetIdFromName");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(device, eventName, event);
-}
-
-CUptiResult CUPTIAPI cuptiEventGroupCreate(CUcontext context,
-                                           CUpti_EventGroup *eventGroup,
-                                           uint32_t flags) {
-  using FuncPtr =
-      CUptiResult(CUPTIAPI *)(CUcontext, CUpti_EventGroup *, uint32_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiEventGroupCreate");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(context, eventGroup, flags);
-}
-
-CUptiResult CUPTIAPI cuptiEventGroupDestroy(CUpti_EventGroup eventGroup) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUpti_EventGroup);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiEventGroupDestroy");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(eventGroup);
-}
-
-CUptiResult CUPTIAPI cuptiEventGroupGetAttribute(
-    CUpti_EventGroup eventGroup, CUpti_EventGroupAttribute attrib,
-    size_t *valueSize, void *value) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(
-      CUpti_EventGroup, CUpti_EventGroupAttribute, size_t *, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiEventGroupGetAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(eventGroup, attrib, valueSize, value);
-}
-
-CUptiResult CUPTIAPI cuptiEventGroupSetAttribute(
-    CUpti_EventGroup eventGroup, CUpti_EventGroupAttribute attrib,
-    size_t valueSize, void *value) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(
-      CUpti_EventGroup, CUpti_EventGroupAttribute, size_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiEventGroupSetAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(eventGroup, attrib, valueSize, value);
-}
-
-CUptiResult CUPTIAPI cuptiEventGroupAddEvent(CUpti_EventGroup eventGroup,
-                                             CUpti_EventID event) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUpti_EventGroup, CUpti_EventID);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiEventGroupAddEvent");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(eventGroup, event);
-}
-
-CUptiResult CUPTIAPI cuptiEventGroupRemoveEvent(CUpti_EventGroup eventGroup,
-                                                CUpti_EventID event) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUpti_EventGroup, CUpti_EventID);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiEventGroupRemoveEvent");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(eventGroup, event);
-}
-
-CUptiResult CUPTIAPI
-cuptiEventGroupRemoveAllEvents(CUpti_EventGroup eventGroup) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUpti_EventGroup);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiEventGroupRemoveAllEvents");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(eventGroup);
-}
-
-CUptiResult CUPTIAPI
-cuptiEventGroupResetAllEvents(CUpti_EventGroup eventGroup) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUpti_EventGroup);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiEventGroupResetAllEvents");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(eventGroup);
-}
-
-CUptiResult CUPTIAPI cuptiEventGroupEnable(CUpti_EventGroup eventGroup) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUpti_EventGroup);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiEventGroupEnable");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(eventGroup);
-}
-
-CUptiResult CUPTIAPI cuptiEventGroupDisable(CUpti_EventGroup eventGroup) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUpti_EventGroup);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiEventGroupDisable");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(eventGroup);
-}
-
-CUptiResult CUPTIAPI cuptiEventGroupReadEvent(CUpti_EventGroup eventGroup,
-                                              CUpti_ReadEventFlags flags,
-                                              CUpti_EventID event,
-                                              size_t *eventValueBufferSizeBytes,
-                                              uint64_t *eventValueBuffer) {
-  using FuncPtr =
-      CUptiResult(CUPTIAPI *)(CUpti_EventGroup, CUpti_ReadEventFlags,
-                              CUpti_EventID, size_t *, uint64_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiEventGroupReadEvent");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(eventGroup, flags, event, eventValueBufferSizeBytes,
-                  eventValueBuffer);
-}
-
-CUptiResult CUPTIAPI cuptiEventGroupReadAllEvents(
-    CUpti_EventGroup eventGroup, CUpti_ReadEventFlags flags,
-    size_t *eventValueBufferSizeBytes, uint64_t *eventValueBuffer,
-    size_t *eventIdArraySizeBytes, CUpti_EventID *eventIdArray,
-    size_t *numEventIdsRead) {
-  using FuncPtr =
-      CUptiResult(CUPTIAPI *)(CUpti_EventGroup, CUpti_ReadEventFlags, size_t *,
-                              uint64_t *, size_t *, CUpti_EventID *, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiEventGroupReadAllEvents");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(eventGroup, flags, eventValueBufferSizeBytes,
-                  eventValueBuffer, eventIdArraySizeBytes, eventIdArray,
-                  numEventIdsRead);
-}
-
-CUptiResult CUPTIAPI cuptiEventGroupSetsCreate(
-    CUcontext context, size_t eventIdArraySizeBytes,
-    CUpti_EventID *eventIdArray, CUpti_EventGroupSets **eventGroupPasses) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUcontext, size_t, CUpti_EventID *,
-                                          CUpti_EventGroupSets **);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiEventGroupSetsCreate");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(context, eventIdArraySizeBytes, eventIdArray,
-                  eventGroupPasses);
-}
-
-CUptiResult CUPTIAPI
-cuptiEventGroupSetsDestroy(CUpti_EventGroupSets *eventGroupSets) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUpti_EventGroupSets *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiEventGroupSetsDestroy");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(eventGroupSets);
-}
-
-CUptiResult CUPTIAPI
-cuptiEventGroupSetEnable(CUpti_EventGroupSet *eventGroupSet) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUpti_EventGroupSet *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiEventGroupSetEnable");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(eventGroupSet);
-}
-
-CUptiResult CUPTIAPI
-cuptiEventGroupSetDisable(CUpti_EventGroupSet *eventGroupSet) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUpti_EventGroupSet *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiEventGroupSetDisable");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(eventGroupSet);
-}
-
-CUptiResult CUPTIAPI cuptiEnableKernelReplayMode(CUcontext context) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUcontext);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiEnableKernelReplayMode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(context);
-}
-
-CUptiResult CUPTIAPI cuptiDisableKernelReplayMode(CUcontext context) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUcontext);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiDisableKernelReplayMode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(context);
-}
-
-CUptiResult CUPTIAPI cuptiKernelReplaySubscribeUpdate(
-    CUpti_KernelReplayUpdateFunc updateFunc, void *customData) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUpti_KernelReplayUpdateFunc, void *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cuptiKernelReplaySubscribeUpdate");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(updateFunc, customData);
-}
-
-CUptiResult CUPTIAPI cuptiGetNumMetrics(uint32_t *numMetrics) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(uint32_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiGetNumMetrics");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(numMetrics);
-}
-
-CUptiResult CUPTIAPI cuptiEnumMetrics(size_t *arraySizeBytes,
-                                      CUpti_MetricID *metricArray) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(size_t *, CUpti_MetricID *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiEnumMetrics");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(arraySizeBytes, metricArray);
-}
-
-CUptiResult CUPTIAPI cuptiDeviceGetNumMetrics(CUdevice device,
-                                              uint32_t *numMetrics) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUdevice, uint32_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiDeviceGetNumMetrics");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(device, numMetrics);
-}
-
-CUptiResult CUPTIAPI cuptiDeviceEnumMetrics(CUdevice device,
-                                            size_t *arraySizeBytes,
-                                            CUpti_MetricID *metricArray) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUdevice, size_t *, CUpti_MetricID *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiDeviceEnumMetrics");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(device, arraySizeBytes, metricArray);
-}
-
-CUptiResult CUPTIAPI cuptiMetricGetAttribute(CUpti_MetricID metric,
-                                             CUpti_MetricAttribute attrib,
-                                             size_t *valueSize, void *value) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUpti_MetricID, CUpti_MetricAttribute,
-                                          size_t *, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiMetricGetAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(metric, attrib, valueSize, value);
-}
-
-CUptiResult CUPTIAPI cuptiMetricGetIdFromName(CUdevice device,
-                                              const char *metricName,
-                                              CUpti_MetricID *metric) {
-  using FuncPtr =
-      CUptiResult(CUPTIAPI *)(CUdevice, const char *, CUpti_MetricID *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiMetricGetIdFromName");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(device, metricName, metric);
-}
-
-CUptiResult CUPTIAPI cuptiMetricGetNumEvents(CUpti_MetricID metric,
-                                             uint32_t *numEvents) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUpti_MetricID, uint32_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiMetricGetNumEvents");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(metric, numEvents);
-}
-
-CUptiResult CUPTIAPI cuptiMetricEnumEvents(CUpti_MetricID metric,
-                                           size_t *eventIdArraySizeBytes,
-                                           CUpti_EventID *eventIdArray) {
-  using FuncPtr =
-      CUptiResult(CUPTIAPI *)(CUpti_MetricID, size_t *, CUpti_EventID *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiMetricEnumEvents");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(metric, eventIdArraySizeBytes, eventIdArray);
-}
-
-CUptiResult CUPTIAPI cuptiMetricGetNumProperties(CUpti_MetricID metric,
-                                                 uint32_t *numProp) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUpti_MetricID, uint32_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiMetricGetNumProperties");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(metric, numProp);
-}
-
-CUptiResult CUPTIAPI
-cuptiMetricEnumProperties(CUpti_MetricID metric, size_t *propIdArraySizeBytes,
-                          CUpti_MetricPropertyID *propIdArray) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUpti_MetricID, size_t *,
-                                          CUpti_MetricPropertyID *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiMetricEnumProperties");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(metric, propIdArraySizeBytes, propIdArray);
-}
-
-CUptiResult CUPTIAPI
-cuptiMetricGetRequiredEventGroupSets(CUcontext context, CUpti_MetricID metric,
-                                     CUpti_EventGroupSets **eventGroupSets) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUcontext, CUpti_MetricID,
-                                          CUpti_EventGroupSets **);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cuptiMetricGetRequiredEventGroupSets");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(context, metric, eventGroupSets);
-}
-
-CUptiResult CUPTIAPI cuptiMetricCreateEventGroupSets(
-    CUcontext context, size_t metricIdArraySizeBytes,
-    CUpti_MetricID *metricIdArray, CUpti_EventGroupSets **eventGroupPasses) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUcontext, size_t, CUpti_MetricID *,
-                                          CUpti_EventGroupSets **);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiMetricCreateEventGroupSets");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(context, metricIdArraySizeBytes, metricIdArray,
-                  eventGroupPasses);
-}
-
-CUptiResult CUPTIAPI cuptiMetricGetValue(CUdevice device, CUpti_MetricID metric,
-                                         size_t eventIdArraySizeBytes,
-                                         CUpti_EventID *eventIdArray,
-                                         size_t eventValueArraySizeBytes,
-                                         uint64_t *eventValueArray,
-                                         uint64_t timeDuration,
-                                         CUpti_MetricValue *metricValue) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUdevice, CUpti_MetricID, size_t,
-                                          CUpti_EventID *, size_t, uint64_t *,
-                                          uint64_t, CUpti_MetricValue *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiMetricGetValue");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(device, metric, eventIdArraySizeBytes, eventIdArray,
-                  eventValueArraySizeBytes, eventValueArray, timeDuration,
-                  metricValue);
-}
-
-CUptiResult CUPTIAPI cuptiMetricGetValue2(
-    CUpti_MetricID metric, size_t eventIdArraySizeBytes,
-    CUpti_EventID *eventIdArray, size_t eventValueArraySizeBytes,
-    uint64_t *eventValueArray, size_t propIdArraySizeBytes,
-    CUpti_MetricPropertyID *propIdArray, size_t propValueArraySizeBytes,
-    uint64_t *propValueArray, CUpti_MetricValue *metricValue) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(
-      CUpti_MetricID, size_t, CUpti_EventID *, size_t, uint64_t *, size_t,
-      CUpti_MetricPropertyID *, size_t, uint64_t *, CUpti_MetricValue *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiMetricGetValue2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(metric, eventIdArraySizeBytes, eventIdArray,
-                  eventValueArraySizeBytes, eventValueArray,
-                  propIdArraySizeBytes, propIdArray, propValueArraySizeBytes,
-                  propValueArray, metricValue);
-}
-
-CUptiResult CUPTIAPI cuptiGetTimestamp(uint64_t *timestamp) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(uint64_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiGetTimestamp");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(timestamp);
-}
-
-CUptiResult CUPTIAPI cuptiGetContextId(CUcontext context, uint32_t *contextId) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUcontext, uint32_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiGetContextId");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(context, contextId);
-}
-
-CUptiResult CUPTIAPI cuptiGetStreamId(CUcontext context, CUstream stream,
-                                      uint32_t *streamId) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUcontext, CUstream, uint32_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiGetStreamId");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(context, stream, streamId);
-}
-
-CUptiResult CUPTIAPI cuptiGetStreamIdEx(CUcontext context, CUstream stream,
-                                        uint8_t perThreadStream,
-                                        uint32_t *streamId) {
-  using FuncPtr =
-      CUptiResult(CUPTIAPI *)(CUcontext, CUstream, uint8_t, uint32_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiGetStreamIdEx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(context, stream, perThreadStream, streamId);
-}
-
-CUptiResult CUPTIAPI cuptiGetDeviceId(CUcontext context, uint32_t *deviceId) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUcontext, uint32_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiGetDeviceId");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(context, deviceId);
-}
-
-CUptiResult CUPTIAPI cuptiActivityEnable(CUpti_ActivityKind kind) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUpti_ActivityKind);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiActivityEnable");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(kind);
-}
-
-CUptiResult CUPTIAPI cuptiActivityDisable(CUpti_ActivityKind kind) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUpti_ActivityKind);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiActivityDisable");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(kind);
-}
-
-CUptiResult CUPTIAPI cuptiActivityEnableContext(CUcontext context,
-                                                CUpti_ActivityKind kind) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUcontext, CUpti_ActivityKind);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiActivityEnableContext");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(context, kind);
-}
-
-CUptiResult CUPTIAPI cuptiActivityDisableContext(CUcontext context,
-                                                 CUpti_ActivityKind kind) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUcontext, CUpti_ActivityKind);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiActivityDisableContext");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(context, kind);
-}
-
-CUptiResult CUPTIAPI cuptiActivityGetNumDroppedRecords(CUcontext context,
-                                                       uint32_t streamId,
-                                                       size_t *dropped) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUcontext, uint32_t, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cuptiActivityGetNumDroppedRecords");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(context, streamId, dropped);
-}
-
-CUptiResult CUPTIAPI cuptiActivityGetNextRecord(uint8_t *buffer,
-                                                size_t validBufferSizeBytes,
-                                                CUpti_Activity **record) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(uint8_t *, size_t, CUpti_Activity **);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiActivityGetNextRecord");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(buffer, validBufferSizeBytes, record);
-}
-
-CUptiResult CUPTIAPI cuptiActivityRegisterCallbacks(
-    CUpti_BuffersCallbackRequestFunc funcBufferRequested,
-    CUpti_BuffersCallbackCompleteFunc funcBufferCompleted) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUpti_BuffersCallbackRequestFunc,
-                                          CUpti_BuffersCallbackCompleteFunc);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiActivityRegisterCallbacks");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(funcBufferRequested, funcBufferCompleted);
-}
-
-CUptiResult CUPTIAPI cuptiActivityFlush(CUcontext context, uint32_t streamId,
-                                        uint32_t flag) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUcontext, uint32_t, uint32_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiActivityFlush");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(context, streamId, flag);
-}
-
-CUptiResult CUPTIAPI cuptiActivityFlushAll(uint32_t flag) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(uint32_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiActivityFlushAll");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(flag);
-}
-
-CUptiResult CUPTIAPI cuptiActivityGetAttribute(CUpti_ActivityAttribute attr,
-                                               size_t *valueSize, void *value) {
-  using FuncPtr =
-      CUptiResult(CUPTIAPI *)(CUpti_ActivityAttribute, size_t *, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiActivityGetAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(attr, valueSize, value);
-}
-
-CUptiResult CUPTIAPI cuptiActivitySetAttribute(CUpti_ActivityAttribute attr,
-                                               size_t *valueSize, void *value) {
-  using FuncPtr =
-      CUptiResult(CUPTIAPI *)(CUpti_ActivityAttribute, size_t *, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiActivitySetAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(attr, valueSize, value);
-}
-
-CUptiResult CUPTIAPI cuptiActivityConfigureUnifiedMemoryCounter(
-    CUpti_ActivityUnifiedMemoryCounterConfig *config, uint32_t count) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(
-      CUpti_ActivityUnifiedMemoryCounterConfig *, uint32_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cuptiActivityConfigureUnifiedMemoryCounter");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(config, count);
-}
-
-CUptiResult CUPTIAPI
-cuptiGetAutoBoostState(CUcontext context, CUpti_ActivityAutoBoostState *state) {
-  using FuncPtr =
-      CUptiResult(CUPTIAPI *)(CUcontext, CUpti_ActivityAutoBoostState *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiGetAutoBoostState");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(context, state);
-}
-
-CUptiResult CUPTIAPI cuptiActivityConfigurePCSampling(
-    CUcontext ctx, CUpti_ActivityPCSamplingConfig *config) {
-  using FuncPtr =
-      CUptiResult(CUPTIAPI *)(CUcontext, CUpti_ActivityPCSamplingConfig *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cuptiActivityConfigurePCSampling");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(ctx, config);
-}
-
-CUptiResult CUPTIAPI cuptiGetLastError(void) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)();
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiGetLastError");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr();
-}
-
-CUptiResult CUPTIAPI cuptiSetThreadIdType(CUpti_ActivityThreadIdType type) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUpti_ActivityThreadIdType);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiSetThreadIdType");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(type);
-}
-
-CUptiResult CUPTIAPI cuptiGetThreadIdType(CUpti_ActivityThreadIdType *type) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUpti_ActivityThreadIdType *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiGetThreadIdType");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(type);
-}
-
-CUptiResult CUPTIAPI cuptiComputeCapabilitySupported(int major, int minor,
-                                                     int *support) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(int, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiComputeCapabilitySupported");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(major, minor, support);
-}
-
-CUptiResult CUPTIAPI cuptiDeviceSupported(CUdevice dev, int *support) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUdevice, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiDeviceSupported");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dev, support);
-}
-
-CUptiResult CUPTIAPI cuptiFinalize(void) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)();
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiFinalize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr();
-}
-
-CUptiResult CUPTIAPI cuptiActivityPushExternalCorrelationId(
-    CUpti_ExternalCorrelationKind kind, uint64_t id) {
-  using FuncPtr =
-      CUptiResult(CUPTIAPI *)(CUpti_ExternalCorrelationKind, uint64_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cuptiActivityPushExternalCorrelationId");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(kind, id);
-}
-
-CUptiResult CUPTIAPI cuptiActivityPopExternalCorrelationId(
-    CUpti_ExternalCorrelationKind kind, uint64_t *lastId) {
-  using FuncPtr =
-      CUptiResult(CUPTIAPI *)(CUpti_ExternalCorrelationKind, uint64_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cuptiActivityPopExternalCorrelationId");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(kind, lastId);
-}
-
-CUptiResult CUPTIAPI cuptiActivityEnableLatencyTimestamps(uint8_t enable) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(uint8_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cuptiActivityEnableLatencyTimestamps");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(enable);
-}
-
-}  // extern "C"
diff --git a/third_party/xla/third_party/tsl/tsl/cuda/cupti_11_0.inc b/third_party/xla/third_party/tsl/tsl/cuda/cupti_11_0.inc
deleted file mode 100644
index dac0c7919342bd..00000000000000
--- a/third_party/xla/third_party/tsl/tsl/cuda/cupti_11_0.inc
+++ /dev/null
@@ -1,763 +0,0 @@
-// Auto-generated, do not edit.
-
-extern "C" {
-
-CUptiResult CUPTIAPI cuptiGetResultString(CUptiResult result,
-                                          const char **str) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUptiResult, const char **);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiGetResultString");
-  if (!func_ptr) {
-    if (str) {
-      *str = "CUPTI could not be loaded or symbol could not be found.";
-    }
-    return GetSymbolNotFoundError();
-  }
-  return func_ptr(result, str);
-}
-
-CUptiResult CUPTIAPI cuptiGetVersion(uint32_t *version) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(uint32_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiGetVersion");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(version);
-}
-
-CUptiResult CUPTIAPI cuptiSupportedDomains(size_t *domainCount,
-                                           CUpti_DomainTable *domainTable) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(size_t *, CUpti_DomainTable *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiSupportedDomains");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(domainCount, domainTable);
-}
-
-CUptiResult CUPTIAPI cuptiSubscribe(CUpti_SubscriberHandle *subscriber,
-                                    CUpti_CallbackFunc callback,
-                                    void *userdata) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUpti_SubscriberHandle *,
-                                          CUpti_CallbackFunc, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiSubscribe");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(subscriber, callback, userdata);
-}
-
-CUptiResult CUPTIAPI cuptiUnsubscribe(CUpti_SubscriberHandle subscriber) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUpti_SubscriberHandle);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiUnsubscribe");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(subscriber);
-}
-
-CUptiResult CUPTIAPI cuptiGetCallbackState(uint32_t *enable,
-                                           CUpti_SubscriberHandle subscriber,
-                                           CUpti_CallbackDomain domain,
-                                           CUpti_CallbackId cbid) {
-  using FuncPtr =
-      CUptiResult(CUPTIAPI *)(uint32_t *, CUpti_SubscriberHandle,
-                              CUpti_CallbackDomain, CUpti_CallbackId);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiGetCallbackState");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(enable, subscriber, domain, cbid);
-}
-
-CUptiResult CUPTIAPI cuptiEnableCallback(uint32_t enable,
-                                         CUpti_SubscriberHandle subscriber,
-                                         CUpti_CallbackDomain domain,
-                                         CUpti_CallbackId cbid) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(
-      uint32_t, CUpti_SubscriberHandle, CUpti_CallbackDomain, CUpti_CallbackId);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiEnableCallback");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(enable, subscriber, domain, cbid);
-}
-
-CUptiResult CUPTIAPI cuptiEnableDomain(uint32_t enable,
-                                       CUpti_SubscriberHandle subscriber,
-                                       CUpti_CallbackDomain domain) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(uint32_t, CUpti_SubscriberHandle,
-                                          CUpti_CallbackDomain);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiEnableDomain");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(enable, subscriber, domain);
-}
-
-CUptiResult CUPTIAPI cuptiEnableAllDomains(uint32_t enable,
-                                           CUpti_SubscriberHandle subscriber) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(uint32_t, CUpti_SubscriberHandle);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiEnableAllDomains");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(enable, subscriber);
-}
-
-CUptiResult CUPTIAPI cuptiGetCallbackName(CUpti_CallbackDomain domain,
-                                          uint32_t cbid, const char **name) {
-  using FuncPtr =
-      CUptiResult(CUPTIAPI *)(CUpti_CallbackDomain, uint32_t, const char **);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiGetCallbackName");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(domain, cbid, name);
-}
-
-CUptiResult CUPTIAPI
-cuptiSetEventCollectionMode(CUcontext context, CUpti_EventCollectionMode mode) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUcontext, CUpti_EventCollectionMode);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiSetEventCollectionMode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(context, mode);
-}
-
-CUptiResult CUPTIAPI cuptiDeviceGetAttribute(CUdevice device,
-                                             CUpti_DeviceAttribute attrib,
-                                             size_t *valueSize, void *value) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUdevice, CUpti_DeviceAttribute,
-                                          size_t *, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiDeviceGetAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(device, attrib, valueSize, value);
-}
-
-CUptiResult CUPTIAPI cuptiDeviceGetTimestamp(CUcontext context,
-                                             uint64_t *timestamp) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUcontext, uint64_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiDeviceGetTimestamp");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(context, timestamp);
-}
-
-CUptiResult CUPTIAPI cuptiDeviceGetNumEventDomains(CUdevice device,
-                                                   uint32_t *numDomains) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUdevice, uint32_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiDeviceGetNumEventDomains");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(device, numDomains);
-}
-
-CUptiResult CUPTIAPI cuptiDeviceEnumEventDomains(
-    CUdevice device, size_t *arraySizeBytes, CUpti_EventDomainID *domainArray) {
-  using FuncPtr =
-      CUptiResult(CUPTIAPI *)(CUdevice, size_t *, CUpti_EventDomainID *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiDeviceEnumEventDomains");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(device, arraySizeBytes, domainArray);
-}
-
-CUptiResult CUPTIAPI cuptiDeviceGetEventDomainAttribute(
-    CUdevice device, CUpti_EventDomainID eventDomain,
-    CUpti_EventDomainAttribute attrib, size_t *valueSize, void *value) {
-  using FuncPtr =
-      CUptiResult(CUPTIAPI *)(CUdevice, CUpti_EventDomainID,
-                              CUpti_EventDomainAttribute, size_t *, void *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cuptiDeviceGetEventDomainAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(device, eventDomain, attrib, valueSize, value);
-}
-
-CUptiResult CUPTIAPI cuptiGetNumEventDomains(uint32_t *numDomains) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(uint32_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiGetNumEventDomains");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(numDomains);
-}
-
-CUptiResult CUPTIAPI cuptiEnumEventDomains(size_t *arraySizeBytes,
-                                           CUpti_EventDomainID *domainArray) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(size_t *, CUpti_EventDomainID *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiEnumEventDomains");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(arraySizeBytes, domainArray);
-}
-
-CUptiResult CUPTIAPI cuptiEventDomainGetAttribute(
-    CUpti_EventDomainID eventDomain, CUpti_EventDomainAttribute attrib,
-    size_t *valueSize, void *value) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(
-      CUpti_EventDomainID, CUpti_EventDomainAttribute, size_t *, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiEventDomainGetAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(eventDomain, attrib, valueSize, value);
-}
-
-CUptiResult CUPTIAPI cuptiEventDomainGetNumEvents(
-    CUpti_EventDomainID eventDomain, uint32_t *numEvents) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUpti_EventDomainID, uint32_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiEventDomainGetNumEvents");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(eventDomain, numEvents);
-}
-
-CUptiResult CUPTIAPI cuptiEventDomainEnumEvents(CUpti_EventDomainID eventDomain,
-                                                size_t *arraySizeBytes,
-                                                CUpti_EventID *eventArray) {
-  using FuncPtr =
-      CUptiResult(CUPTIAPI *)(CUpti_EventDomainID, size_t *, CUpti_EventID *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiEventDomainEnumEvents");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(eventDomain, arraySizeBytes, eventArray);
-}
-
-CUptiResult CUPTIAPI cuptiEventGetAttribute(CUpti_EventID event,
-                                            CUpti_EventAttribute attrib,
-                                            size_t *valueSize, void *value) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUpti_EventID, CUpti_EventAttribute,
-                                          size_t *, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiEventGetAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(event, attrib, valueSize, value);
-}
-
-CUptiResult CUPTIAPI cuptiEventGetIdFromName(CUdevice device,
-                                             const char *eventName,
-                                             CUpti_EventID *event) {
-  using FuncPtr =
-      CUptiResult(CUPTIAPI *)(CUdevice, const char *, CUpti_EventID *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiEventGetIdFromName");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(device, eventName, event);
-}
-
-CUptiResult CUPTIAPI cuptiEventGroupCreate(CUcontext context,
-                                           CUpti_EventGroup *eventGroup,
-                                           uint32_t flags) {
-  using FuncPtr =
-      CUptiResult(CUPTIAPI *)(CUcontext, CUpti_EventGroup *, uint32_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiEventGroupCreate");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(context, eventGroup, flags);
-}
-
-CUptiResult CUPTIAPI cuptiEventGroupDestroy(CUpti_EventGroup eventGroup) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUpti_EventGroup);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiEventGroupDestroy");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(eventGroup);
-}
-
-CUptiResult CUPTIAPI cuptiEventGroupGetAttribute(
-    CUpti_EventGroup eventGroup, CUpti_EventGroupAttribute attrib,
-    size_t *valueSize, void *value) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(
-      CUpti_EventGroup, CUpti_EventGroupAttribute, size_t *, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiEventGroupGetAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(eventGroup, attrib, valueSize, value);
-}
-
-CUptiResult CUPTIAPI cuptiEventGroupSetAttribute(
-    CUpti_EventGroup eventGroup, CUpti_EventGroupAttribute attrib,
-    size_t valueSize, void *value) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(
-      CUpti_EventGroup, CUpti_EventGroupAttribute, size_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiEventGroupSetAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(eventGroup, attrib, valueSize, value);
-}
-
-CUptiResult CUPTIAPI cuptiEventGroupAddEvent(CUpti_EventGroup eventGroup,
-                                             CUpti_EventID event) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUpti_EventGroup, CUpti_EventID);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiEventGroupAddEvent");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(eventGroup, event);
-}
-
-CUptiResult CUPTIAPI cuptiEventGroupRemoveEvent(CUpti_EventGroup eventGroup,
-                                                CUpti_EventID event) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUpti_EventGroup, CUpti_EventID);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiEventGroupRemoveEvent");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(eventGroup, event);
-}
-
-CUptiResult CUPTIAPI
-cuptiEventGroupRemoveAllEvents(CUpti_EventGroup eventGroup) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUpti_EventGroup);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiEventGroupRemoveAllEvents");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(eventGroup);
-}
-
-CUptiResult CUPTIAPI
-cuptiEventGroupResetAllEvents(CUpti_EventGroup eventGroup) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUpti_EventGroup);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiEventGroupResetAllEvents");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(eventGroup);
-}
-
-CUptiResult CUPTIAPI cuptiEventGroupEnable(CUpti_EventGroup eventGroup) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUpti_EventGroup);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiEventGroupEnable");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(eventGroup);
-}
-
-CUptiResult CUPTIAPI cuptiEventGroupDisable(CUpti_EventGroup eventGroup) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUpti_EventGroup);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiEventGroupDisable");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(eventGroup);
-}
-
-CUptiResult CUPTIAPI cuptiEventGroupReadEvent(CUpti_EventGroup eventGroup,
-                                              CUpti_ReadEventFlags flags,
-                                              CUpti_EventID event,
-                                              size_t *eventValueBufferSizeBytes,
-                                              uint64_t *eventValueBuffer) {
-  using FuncPtr =
-      CUptiResult(CUPTIAPI *)(CUpti_EventGroup, CUpti_ReadEventFlags,
-                              CUpti_EventID, size_t *, uint64_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiEventGroupReadEvent");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(eventGroup, flags, event, eventValueBufferSizeBytes,
-                  eventValueBuffer);
-}
-
-CUptiResult CUPTIAPI cuptiEventGroupReadAllEvents(
-    CUpti_EventGroup eventGroup, CUpti_ReadEventFlags flags,
-    size_t *eventValueBufferSizeBytes, uint64_t *eventValueBuffer,
-    size_t *eventIdArraySizeBytes, CUpti_EventID *eventIdArray,
-    size_t *numEventIdsRead) {
-  using FuncPtr =
-      CUptiResult(CUPTIAPI *)(CUpti_EventGroup, CUpti_ReadEventFlags, size_t *,
-                              uint64_t *, size_t *, CUpti_EventID *, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiEventGroupReadAllEvents");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(eventGroup, flags, eventValueBufferSizeBytes,
-                  eventValueBuffer, eventIdArraySizeBytes, eventIdArray,
-                  numEventIdsRead);
-}
-
-CUptiResult CUPTIAPI cuptiEventGroupSetsCreate(
-    CUcontext context, size_t eventIdArraySizeBytes,
-    CUpti_EventID *eventIdArray, CUpti_EventGroupSets **eventGroupPasses) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUcontext, size_t, CUpti_EventID *,
-                                          CUpti_EventGroupSets **);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiEventGroupSetsCreate");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(context, eventIdArraySizeBytes, eventIdArray,
-                  eventGroupPasses);
-}
-
-CUptiResult CUPTIAPI
-cuptiEventGroupSetsDestroy(CUpti_EventGroupSets *eventGroupSets) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUpti_EventGroupSets *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiEventGroupSetsDestroy");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(eventGroupSets);
-}
-
-CUptiResult CUPTIAPI
-cuptiEventGroupSetEnable(CUpti_EventGroupSet *eventGroupSet) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUpti_EventGroupSet *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiEventGroupSetEnable");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(eventGroupSet);
-}
-
-CUptiResult CUPTIAPI
-cuptiEventGroupSetDisable(CUpti_EventGroupSet *eventGroupSet) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUpti_EventGroupSet *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiEventGroupSetDisable");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(eventGroupSet);
-}
-
-CUptiResult CUPTIAPI cuptiEnableKernelReplayMode(CUcontext context) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUcontext);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiEnableKernelReplayMode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(context);
-}
-
-CUptiResult CUPTIAPI cuptiDisableKernelReplayMode(CUcontext context) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUcontext);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiDisableKernelReplayMode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(context);
-}
-
-CUptiResult CUPTIAPI cuptiKernelReplaySubscribeUpdate(
-    CUpti_KernelReplayUpdateFunc updateFunc, void *customData) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUpti_KernelReplayUpdateFunc, void *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cuptiKernelReplaySubscribeUpdate");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(updateFunc, customData);
-}
-
-CUptiResult CUPTIAPI cuptiGetNumMetrics(uint32_t *numMetrics) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(uint32_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiGetNumMetrics");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(numMetrics);
-}
-
-CUptiResult CUPTIAPI cuptiEnumMetrics(size_t *arraySizeBytes,
-                                      CUpti_MetricID *metricArray) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(size_t *, CUpti_MetricID *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiEnumMetrics");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(arraySizeBytes, metricArray);
-}
-
-CUptiResult CUPTIAPI cuptiDeviceGetNumMetrics(CUdevice device,
-                                              uint32_t *numMetrics) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUdevice, uint32_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiDeviceGetNumMetrics");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(device, numMetrics);
-}
-
-CUptiResult CUPTIAPI cuptiDeviceEnumMetrics(CUdevice device,
-                                            size_t *arraySizeBytes,
-                                            CUpti_MetricID *metricArray) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUdevice, size_t *, CUpti_MetricID *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiDeviceEnumMetrics");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(device, arraySizeBytes, metricArray);
-}
-
-CUptiResult CUPTIAPI cuptiMetricGetAttribute(CUpti_MetricID metric,
-                                             CUpti_MetricAttribute attrib,
-                                             size_t *valueSize, void *value) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUpti_MetricID, CUpti_MetricAttribute,
-                                          size_t *, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiMetricGetAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(metric, attrib, valueSize, value);
-}
-
-CUptiResult CUPTIAPI cuptiMetricGetIdFromName(CUdevice device,
-                                              const char *metricName,
-                                              CUpti_MetricID *metric) {
-  using FuncPtr =
-      CUptiResult(CUPTIAPI *)(CUdevice, const char *, CUpti_MetricID *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiMetricGetIdFromName");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(device, metricName, metric);
-}
-
-CUptiResult CUPTIAPI cuptiMetricGetNumEvents(CUpti_MetricID metric,
-                                             uint32_t *numEvents) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUpti_MetricID, uint32_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiMetricGetNumEvents");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(metric, numEvents);
-}
-
-CUptiResult CUPTIAPI cuptiMetricEnumEvents(CUpti_MetricID metric,
-                                           size_t *eventIdArraySizeBytes,
-                                           CUpti_EventID *eventIdArray) {
-  using FuncPtr =
-      CUptiResult(CUPTIAPI *)(CUpti_MetricID, size_t *, CUpti_EventID *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiMetricEnumEvents");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(metric, eventIdArraySizeBytes, eventIdArray);
-}
-
-CUptiResult CUPTIAPI cuptiMetricGetNumProperties(CUpti_MetricID metric,
-                                                 uint32_t *numProp) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUpti_MetricID, uint32_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiMetricGetNumProperties");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(metric, numProp);
-}
-
-CUptiResult CUPTIAPI
-cuptiMetricEnumProperties(CUpti_MetricID metric, size_t *propIdArraySizeBytes,
-                          CUpti_MetricPropertyID *propIdArray) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUpti_MetricID, size_t *,
-                                          CUpti_MetricPropertyID *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiMetricEnumProperties");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(metric, propIdArraySizeBytes, propIdArray);
-}
-
-CUptiResult CUPTIAPI
-cuptiMetricGetRequiredEventGroupSets(CUcontext context, CUpti_MetricID metric,
-                                     CUpti_EventGroupSets **eventGroupSets) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUcontext, CUpti_MetricID,
-                                          CUpti_EventGroupSets **);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cuptiMetricGetRequiredEventGroupSets");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(context, metric, eventGroupSets);
-}
-
-CUptiResult CUPTIAPI cuptiMetricCreateEventGroupSets(
-    CUcontext context, size_t metricIdArraySizeBytes,
-    CUpti_MetricID *metricIdArray, CUpti_EventGroupSets **eventGroupPasses) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUcontext, size_t, CUpti_MetricID *,
-                                          CUpti_EventGroupSets **);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiMetricCreateEventGroupSets");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(context, metricIdArraySizeBytes, metricIdArray,
-                  eventGroupPasses);
-}
-
-CUptiResult CUPTIAPI cuptiMetricGetValue(CUdevice device, CUpti_MetricID metric,
-                                         size_t eventIdArraySizeBytes,
-                                         CUpti_EventID *eventIdArray,
-                                         size_t eventValueArraySizeBytes,
-                                         uint64_t *eventValueArray,
-                                         uint64_t timeDuration,
-                                         CUpti_MetricValue *metricValue) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUdevice, CUpti_MetricID, size_t,
-                                          CUpti_EventID *, size_t, uint64_t *,
-                                          uint64_t, CUpti_MetricValue *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiMetricGetValue");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(device, metric, eventIdArraySizeBytes, eventIdArray,
-                  eventValueArraySizeBytes, eventValueArray, timeDuration,
-                  metricValue);
-}
-
-CUptiResult CUPTIAPI cuptiMetricGetValue2(
-    CUpti_MetricID metric, size_t eventIdArraySizeBytes,
-    CUpti_EventID *eventIdArray, size_t eventValueArraySizeBytes,
-    uint64_t *eventValueArray, size_t propIdArraySizeBytes,
-    CUpti_MetricPropertyID *propIdArray, size_t propValueArraySizeBytes,
-    uint64_t *propValueArray, CUpti_MetricValue *metricValue) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(
-      CUpti_MetricID, size_t, CUpti_EventID *, size_t, uint64_t *, size_t,
-      CUpti_MetricPropertyID *, size_t, uint64_t *, CUpti_MetricValue *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiMetricGetValue2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(metric, eventIdArraySizeBytes, eventIdArray,
-                  eventValueArraySizeBytes, eventValueArray,
-                  propIdArraySizeBytes, propIdArray, propValueArraySizeBytes,
-                  propValueArray, metricValue);
-}
-
-CUptiResult CUPTIAPI cuptiGetTimestamp(uint64_t *timestamp) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(uint64_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiGetTimestamp");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(timestamp);
-}
-
-CUptiResult CUPTIAPI cuptiGetContextId(CUcontext context, uint32_t *contextId) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUcontext, uint32_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiGetContextId");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(context, contextId);
-}
-
-CUptiResult CUPTIAPI cuptiGetStreamId(CUcontext context, CUstream stream,
-                                      uint32_t *streamId) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUcontext, CUstream, uint32_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiGetStreamId");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(context, stream, streamId);
-}
-
-CUptiResult CUPTIAPI cuptiGetStreamIdEx(CUcontext context, CUstream stream,
-                                        uint8_t perThreadStream,
-                                        uint32_t *streamId) {
-  using FuncPtr =
-      CUptiResult(CUPTIAPI *)(CUcontext, CUstream, uint8_t, uint32_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiGetStreamIdEx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(context, stream, perThreadStream, streamId);
-}
-
-CUptiResult CUPTIAPI cuptiGetDeviceId(CUcontext context, uint32_t *deviceId) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUcontext, uint32_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiGetDeviceId");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(context, deviceId);
-}
-
-CUptiResult CUPTIAPI cuptiActivityEnable(CUpti_ActivityKind kind) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUpti_ActivityKind);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiActivityEnable");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(kind);
-}
-
-CUptiResult CUPTIAPI cuptiActivityDisable(CUpti_ActivityKind kind) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUpti_ActivityKind);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiActivityDisable");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(kind);
-}
-
-CUptiResult CUPTIAPI cuptiActivityEnableContext(CUcontext context,
-                                                CUpti_ActivityKind kind) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUcontext, CUpti_ActivityKind);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiActivityEnableContext");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(context, kind);
-}
-
-CUptiResult CUPTIAPI cuptiActivityDisableContext(CUcontext context,
-                                                 CUpti_ActivityKind kind) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUcontext, CUpti_ActivityKind);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiActivityDisableContext");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(context, kind);
-}
-
-CUptiResult CUPTIAPI cuptiActivityGetNumDroppedRecords(CUcontext context,
-                                                       uint32_t streamId,
-                                                       size_t *dropped) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUcontext, uint32_t, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cuptiActivityGetNumDroppedRecords");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(context, streamId, dropped);
-}
-
-CUptiResult CUPTIAPI cuptiActivityGetNextRecord(uint8_t *buffer,
-                                                size_t validBufferSizeBytes,
-                                                CUpti_Activity **record) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(uint8_t *, size_t, CUpti_Activity **);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiActivityGetNextRecord");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(buffer, validBufferSizeBytes, record);
-}
-
-CUptiResult CUPTIAPI cuptiActivityRegisterCallbacks(
-    CUpti_BuffersCallbackRequestFunc funcBufferRequested,
-    CUpti_BuffersCallbackCompleteFunc funcBufferCompleted) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUpti_BuffersCallbackRequestFunc,
-                                          CUpti_BuffersCallbackCompleteFunc);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiActivityRegisterCallbacks");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(funcBufferRequested, funcBufferCompleted);
-}
-
-CUptiResult CUPTIAPI cuptiActivityFlush(CUcontext context, uint32_t streamId,
-                                        uint32_t flag) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUcontext, uint32_t, uint32_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiActivityFlush");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(context, streamId, flag);
-}
-
-CUptiResult CUPTIAPI cuptiActivityFlushAll(uint32_t flag) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(uint32_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiActivityFlushAll");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(flag);
-}
-
-CUptiResult CUPTIAPI cuptiActivityGetAttribute(CUpti_ActivityAttribute attr,
-                                               size_t *valueSize, void *value) {
-  using FuncPtr =
-      CUptiResult(CUPTIAPI *)(CUpti_ActivityAttribute, size_t *, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiActivityGetAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(attr, valueSize, value);
-}
-
-CUptiResult CUPTIAPI cuptiActivitySetAttribute(CUpti_ActivityAttribute attr,
-                                               size_t *valueSize, void *value) {
-  using FuncPtr =
-      CUptiResult(CUPTIAPI *)(CUpti_ActivityAttribute, size_t *, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiActivitySetAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(attr, valueSize, value);
-}
-
-CUptiResult CUPTIAPI cuptiActivityConfigureUnifiedMemoryCounter(
-    CUpti_ActivityUnifiedMemoryCounterConfig *config, uint32_t count) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(
-      CUpti_ActivityUnifiedMemoryCounterConfig *, uint32_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cuptiActivityConfigureUnifiedMemoryCounter");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(config, count);
-}
-
-CUptiResult CUPTIAPI
-cuptiGetAutoBoostState(CUcontext context, CUpti_ActivityAutoBoostState *state) {
-  using FuncPtr =
-      CUptiResult(CUPTIAPI *)(CUcontext, CUpti_ActivityAutoBoostState *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiGetAutoBoostState");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(context, state);
-}
-
-CUptiResult CUPTIAPI cuptiActivityConfigurePCSampling(
-    CUcontext ctx, CUpti_ActivityPCSamplingConfig *config) {
-  using FuncPtr =
-      CUptiResult(CUPTIAPI *)(CUcontext, CUpti_ActivityPCSamplingConfig *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cuptiActivityConfigurePCSampling");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(ctx, config);
-}
-
-CUptiResult CUPTIAPI cuptiGetLastError(void) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)();
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiGetLastError");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr();
-}
-
-CUptiResult CUPTIAPI cuptiSetThreadIdType(CUpti_ActivityThreadIdType type) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUpti_ActivityThreadIdType);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiSetThreadIdType");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(type);
-}
-
-CUptiResult CUPTIAPI cuptiGetThreadIdType(CUpti_ActivityThreadIdType *type) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUpti_ActivityThreadIdType *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiGetThreadIdType");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(type);
-}
-
-CUptiResult CUPTIAPI cuptiComputeCapabilitySupported(int major, int minor,
-                                                     int *support) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(int, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiComputeCapabilitySupported");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(major, minor, support);
-}
-
-CUptiResult CUPTIAPI cuptiDeviceSupported(CUdevice dev, int *support) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUdevice, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiDeviceSupported");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dev, support);
-}
-
-CUptiResult CUPTIAPI cuptiFinalize(void) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)();
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiFinalize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr();
-}
-
-CUptiResult CUPTIAPI cuptiActivityPushExternalCorrelationId(
-    CUpti_ExternalCorrelationKind kind, uint64_t id) {
-  using FuncPtr =
-      CUptiResult(CUPTIAPI *)(CUpti_ExternalCorrelationKind, uint64_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cuptiActivityPushExternalCorrelationId");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(kind, id);
-}
-
-CUptiResult CUPTIAPI cuptiActivityPopExternalCorrelationId(
-    CUpti_ExternalCorrelationKind kind, uint64_t *lastId) {
-  using FuncPtr =
-      CUptiResult(CUPTIAPI *)(CUpti_ExternalCorrelationKind, uint64_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cuptiActivityPopExternalCorrelationId");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(kind, lastId);
-}
-
-CUptiResult CUPTIAPI cuptiActivityEnableLatencyTimestamps(uint8_t enable) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(uint8_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cuptiActivityEnableLatencyTimestamps");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(enable);
-}
-
-}  // extern "C"
diff --git a/third_party/xla/third_party/tsl/tsl/cuda/cupti_12_0.inc b/third_party/xla/third_party/tsl/tsl/cuda/cupti_12_0.inc
deleted file mode 100644
index 7396fdde55eb1c..00000000000000
--- a/third_party/xla/third_party/tsl/tsl/cuda/cupti_12_0.inc
+++ /dev/null
@@ -1,744 +0,0 @@
-// Auto-generated, do not edit.
-
-extern "C" {
-
-CUptiResult CUPTIAPI cuptiGetResultString(CUptiResult result, const char **str) {
-  using FuncPtr = CUptiResult (CUPTIAPI *)(CUptiResult, const char **);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiGetResultString");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(result, str);
-}
-
-CUptiResult CUPTIAPI cuptiGetVersion(uint32_t *version) {
-  using FuncPtr = CUptiResult (CUPTIAPI *)(uint32_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiGetVersion");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(version);
-}
-
-CUptiResult CUPTIAPI cuptiSupportedDomains(size_t *domainCount,
-                                           CUpti_DomainTable *domainTable) {
-  using FuncPtr = CUptiResult (CUPTIAPI *)(size_t *, CUpti_DomainTable *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiSupportedDomains");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(domainCount, domainTable);
-}
-
-CUptiResult CUPTIAPI cuptiSubscribe(CUpti_SubscriberHandle *subscriber,
-                                    CUpti_CallbackFunc callback,
-                                    void *userdata) {
-  using FuncPtr = CUptiResult (CUPTIAPI *)(CUpti_SubscriberHandle *, CUpti_CallbackFunc, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiSubscribe");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(subscriber, callback, userdata);
-}
-
-CUptiResult CUPTIAPI cuptiUnsubscribe(CUpti_SubscriberHandle subscriber) {
-  using FuncPtr = CUptiResult (CUPTIAPI *)(CUpti_SubscriberHandle);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiUnsubscribe");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(subscriber);
-}
-
-CUptiResult CUPTIAPI cuptiGetCallbackState(uint32_t *enable,
-                                           CUpti_SubscriberHandle subscriber,
-                                           CUpti_CallbackDomain domain,
-                                           CUpti_CallbackId cbid) {
-  using FuncPtr = CUptiResult (CUPTIAPI *)(uint32_t *, CUpti_SubscriberHandle, CUpti_CallbackDomain, CUpti_CallbackId);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiGetCallbackState");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(enable, subscriber, domain, cbid);
-}
-
-CUptiResult CUPTIAPI cuptiEnableCallback(uint32_t enable,
-                                         CUpti_SubscriberHandle subscriber,
-                                         CUpti_CallbackDomain domain,
-                                         CUpti_CallbackId cbid) {
-  using FuncPtr = CUptiResult (CUPTIAPI *)(uint32_t, CUpti_SubscriberHandle, CUpti_CallbackDomain, CUpti_CallbackId);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiEnableCallback");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(enable, subscriber, domain, cbid);
-}
-
-CUptiResult CUPTIAPI cuptiEnableDomain(uint32_t enable,
-                                       CUpti_SubscriberHandle subscriber,
-                                       CUpti_CallbackDomain domain) {
-  using FuncPtr = CUptiResult (CUPTIAPI *)(uint32_t, CUpti_SubscriberHandle, CUpti_CallbackDomain);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiEnableDomain");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(enable, subscriber, domain);
-}
-
-CUptiResult CUPTIAPI cuptiEnableAllDomains(uint32_t enable,
-                                           CUpti_SubscriberHandle subscriber) {
-  using FuncPtr = CUptiResult (CUPTIAPI *)(uint32_t, CUpti_SubscriberHandle);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiEnableAllDomains");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(enable, subscriber);
-}
-
-CUptiResult CUPTIAPI cuptiGetCallbackName(CUpti_CallbackDomain domain,
-                                          uint32_t cbid,
-                                          const char **name) {
-  using FuncPtr = CUptiResult (CUPTIAPI *)(CUpti_CallbackDomain, uint32_t, const char **);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiGetCallbackName");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(domain, cbid, name);
-}
-
-CUptiResult CUPTIAPI cuptiSetEventCollectionMode(CUcontext context,
-                                                 CUpti_EventCollectionMode mode) {
-  using FuncPtr = CUptiResult (CUPTIAPI *)(CUcontext, CUpti_EventCollectionMode);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiSetEventCollectionMode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(context, mode);
-}
-
-CUptiResult CUPTIAPI cuptiDeviceGetAttribute(CUdevice device,
-                                             CUpti_DeviceAttribute attrib,
-                                             size_t *valueSize,
-                                             void *value) {
-  using FuncPtr = CUptiResult (CUPTIAPI *)(CUdevice, CUpti_DeviceAttribute, size_t *, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiDeviceGetAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(device, attrib, valueSize, value);
-}
-
-CUptiResult CUPTIAPI cuptiDeviceGetTimestamp(CUcontext context,
-                                             uint64_t *timestamp) {
-  using FuncPtr = CUptiResult (CUPTIAPI *)(CUcontext, uint64_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiDeviceGetTimestamp");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(context, timestamp);
-}
-
-CUptiResult CUPTIAPI cuptiDeviceGetNumEventDomains(CUdevice device,
-                                                   uint32_t *numDomains) {
-  using FuncPtr = CUptiResult (CUPTIAPI *)(CUdevice, uint32_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiDeviceGetNumEventDomains");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(device, numDomains);
-}
-
-CUptiResult CUPTIAPI cuptiDeviceEnumEventDomains(CUdevice device,
-                                                 size_t *arraySizeBytes,
-                                                 CUpti_EventDomainID *domainArray) {
-  using FuncPtr = CUptiResult (CUPTIAPI *)(CUdevice, size_t *, CUpti_EventDomainID *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiDeviceEnumEventDomains");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(device, arraySizeBytes, domainArray);
-}
-
-CUptiResult CUPTIAPI cuptiDeviceGetEventDomainAttribute(CUdevice device,
-                                                        CUpti_EventDomainID eventDomain,
-                                                        CUpti_EventDomainAttribute attrib,
-                                                        size_t *valueSize,
-                                                        void *value) {
-  using FuncPtr = CUptiResult (CUPTIAPI *)(CUdevice, CUpti_EventDomainID, CUpti_EventDomainAttribute, size_t *, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiDeviceGetEventDomainAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(device, eventDomain, attrib, valueSize, value);
-}
-
-CUptiResult CUPTIAPI cuptiGetNumEventDomains(uint32_t *numDomains) {
-  using FuncPtr = CUptiResult (CUPTIAPI *)(uint32_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiGetNumEventDomains");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(numDomains);
-}
-
-CUptiResult CUPTIAPI cuptiEnumEventDomains(size_t *arraySizeBytes,
-                                           CUpti_EventDomainID *domainArray) {
-  using FuncPtr = CUptiResult (CUPTIAPI *)(size_t *, CUpti_EventDomainID *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiEnumEventDomains");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(arraySizeBytes, domainArray);
-}
-
-CUptiResult CUPTIAPI cuptiEventDomainGetAttribute(CUpti_EventDomainID eventDomain,
-                                                  CUpti_EventDomainAttribute attrib,
-                                                  size_t *valueSize,
-                                                  void *value) {
-  using FuncPtr = CUptiResult (CUPTIAPI *)(CUpti_EventDomainID, CUpti_EventDomainAttribute, size_t *, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiEventDomainGetAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(eventDomain, attrib, valueSize, value);
-}
-
-CUptiResult CUPTIAPI cuptiEventDomainGetNumEvents(CUpti_EventDomainID eventDomain,
-                                                  uint32_t *numEvents) {
-  using FuncPtr = CUptiResult (CUPTIAPI *)(CUpti_EventDomainID, uint32_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiEventDomainGetNumEvents");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(eventDomain, numEvents);
-}
-
-CUptiResult CUPTIAPI cuptiEventDomainEnumEvents(CUpti_EventDomainID eventDomain,
-                                                size_t *arraySizeBytes,
-                                                CUpti_EventID *eventArray) {
-  using FuncPtr = CUptiResult (CUPTIAPI *)(CUpti_EventDomainID, size_t *, CUpti_EventID *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiEventDomainEnumEvents");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(eventDomain, arraySizeBytes, eventArray);
-}
-
-CUptiResult CUPTIAPI cuptiEventGetAttribute(CUpti_EventID event,
-                                            CUpti_EventAttribute attrib,
-                                            size_t *valueSize,
-                                            void *value) {
-  using FuncPtr = CUptiResult (CUPTIAPI *)(CUpti_EventID, CUpti_EventAttribute, size_t *, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiEventGetAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(event, attrib, valueSize, value);
-}
-
-CUptiResult CUPTIAPI cuptiEventGetIdFromName(CUdevice device,
-                                             const char *eventName,
-                                             CUpti_EventID *event) {
-  using FuncPtr = CUptiResult (CUPTIAPI *)(CUdevice, const char *, CUpti_EventID *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiEventGetIdFromName");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(device, eventName, event);
-}
-
-CUptiResult CUPTIAPI cuptiEventGroupCreate(CUcontext context,
-                                           CUpti_EventGroup *eventGroup,
-                                           uint32_t flags) {
-  using FuncPtr = CUptiResult (CUPTIAPI *)(CUcontext, CUpti_EventGroup *, uint32_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiEventGroupCreate");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(context, eventGroup, flags);
-}
-
-CUptiResult CUPTIAPI cuptiEventGroupDestroy(CUpti_EventGroup eventGroup) {
-  using FuncPtr = CUptiResult (CUPTIAPI *)(CUpti_EventGroup);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiEventGroupDestroy");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(eventGroup);
-}
-
-CUptiResult CUPTIAPI cuptiEventGroupGetAttribute(CUpti_EventGroup eventGroup,
-                                                 CUpti_EventGroupAttribute attrib,
-                                                 size_t *valueSize,
-                                                 void *value) {
-  using FuncPtr = CUptiResult (CUPTIAPI *)(CUpti_EventGroup, CUpti_EventGroupAttribute, size_t *, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiEventGroupGetAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(eventGroup, attrib, valueSize, value);
-}
-
-CUptiResult CUPTIAPI cuptiEventGroupSetAttribute(CUpti_EventGroup eventGroup,
-                                                 CUpti_EventGroupAttribute attrib,
-                                                 size_t valueSize,
-                                                 void *value) {
-  using FuncPtr = CUptiResult (CUPTIAPI *)(CUpti_EventGroup, CUpti_EventGroupAttribute, size_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiEventGroupSetAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(eventGroup, attrib, valueSize, value);
-}
-
-CUptiResult CUPTIAPI cuptiEventGroupAddEvent(CUpti_EventGroup eventGroup,
-                                             CUpti_EventID event) {
-  using FuncPtr = CUptiResult (CUPTIAPI *)(CUpti_EventGroup, CUpti_EventID);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiEventGroupAddEvent");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(eventGroup, event);
-}
-
-CUptiResult CUPTIAPI cuptiEventGroupRemoveEvent(CUpti_EventGroup eventGroup,
-                                                CUpti_EventID event) {
-  using FuncPtr = CUptiResult (CUPTIAPI *)(CUpti_EventGroup, CUpti_EventID);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiEventGroupRemoveEvent");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(eventGroup, event);
-}
-
-CUptiResult CUPTIAPI cuptiEventGroupRemoveAllEvents(CUpti_EventGroup eventGroup) {
-  using FuncPtr = CUptiResult (CUPTIAPI *)(CUpti_EventGroup);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiEventGroupRemoveAllEvents");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(eventGroup);
-}
-
-CUptiResult CUPTIAPI cuptiEventGroupResetAllEvents(CUpti_EventGroup eventGroup) {
-  using FuncPtr = CUptiResult (CUPTIAPI *)(CUpti_EventGroup);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiEventGroupResetAllEvents");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(eventGroup);
-}
-
-CUptiResult CUPTIAPI cuptiEventGroupEnable(CUpti_EventGroup eventGroup) {
-  using FuncPtr = CUptiResult (CUPTIAPI *)(CUpti_EventGroup);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiEventGroupEnable");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(eventGroup);
-}
-
-CUptiResult CUPTIAPI cuptiEventGroupDisable(CUpti_EventGroup eventGroup) {
-  using FuncPtr = CUptiResult (CUPTIAPI *)(CUpti_EventGroup);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiEventGroupDisable");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(eventGroup);
-}
-
-CUptiResult CUPTIAPI cuptiEventGroupReadEvent(CUpti_EventGroup eventGroup,
-                                              CUpti_ReadEventFlags flags,
-                                              CUpti_EventID event,
-                                              size_t *eventValueBufferSizeBytes,
-                                              uint64_t *eventValueBuffer) {
-  using FuncPtr = CUptiResult (CUPTIAPI *)(CUpti_EventGroup, CUpti_ReadEventFlags, CUpti_EventID, size_t *, uint64_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiEventGroupReadEvent");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(eventGroup, flags, event, eventValueBufferSizeBytes, eventValueBuffer);
-}
-
-CUptiResult CUPTIAPI cuptiEventGroupReadAllEvents(CUpti_EventGroup       eventGroup,
-                                                  CUpti_ReadEventFlags   flags,
-                                                  size_t                 *eventValueBufferSizeBytes,
-                                                  uint64_t               *eventValueBuffer,
-                                                  size_t                 *eventIdArraySizeBytes,
-                                                  CUpti_EventID          *eventIdArray,
-                                                  size_t                 *numEventIdsRead) {
-  using FuncPtr = CUptiResult (CUPTIAPI *)(CUpti_EventGroup, CUpti_ReadEventFlags, size_t *, uint64_t *, size_t *, CUpti_EventID *, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiEventGroupReadAllEvents");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(eventGroup, flags, eventValueBufferSizeBytes, eventValueBuffer, eventIdArraySizeBytes, eventIdArray, numEventIdsRead);
-}
-
-CUptiResult CUPTIAPI cuptiEventGroupSetsCreate(CUcontext context,
-                                               size_t eventIdArraySizeBytes,
-                                               CUpti_EventID *eventIdArray,
-                                               CUpti_EventGroupSets **eventGroupPasses) {
-  using FuncPtr = CUptiResult (CUPTIAPI *)(CUcontext, size_t, CUpti_EventID *, CUpti_EventGroupSets **);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiEventGroupSetsCreate");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(context, eventIdArraySizeBytes, eventIdArray, eventGroupPasses);
-}
-
-CUptiResult CUPTIAPI cuptiEventGroupSetsDestroy(CUpti_EventGroupSets *eventGroupSets) {
-  using FuncPtr = CUptiResult (CUPTIAPI *)(CUpti_EventGroupSets *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiEventGroupSetsDestroy");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(eventGroupSets);
-}
-
-CUptiResult CUPTIAPI cuptiEventGroupSetEnable(CUpti_EventGroupSet *eventGroupSet) {
-  using FuncPtr = CUptiResult (CUPTIAPI *)(CUpti_EventGroupSet *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiEventGroupSetEnable");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(eventGroupSet);
-}
-
-CUptiResult CUPTIAPI cuptiEventGroupSetDisable(CUpti_EventGroupSet *eventGroupSet) {
-  using FuncPtr = CUptiResult (CUPTIAPI *)(CUpti_EventGroupSet *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiEventGroupSetDisable");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(eventGroupSet);
-}
-
-CUptiResult CUPTIAPI cuptiEnableKernelReplayMode(CUcontext context) {
-  using FuncPtr = CUptiResult (CUPTIAPI *)(CUcontext);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiEnableKernelReplayMode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(context);
-}
-
-CUptiResult CUPTIAPI cuptiDisableKernelReplayMode(CUcontext context) {
-  using FuncPtr = CUptiResult (CUPTIAPI *)(CUcontext);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiDisableKernelReplayMode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(context);
-}
-
-CUptiResult CUPTIAPI cuptiKernelReplaySubscribeUpdate(CUpti_KernelReplayUpdateFunc updateFunc, void *customData) {
-  using FuncPtr = CUptiResult (CUPTIAPI *)(CUpti_KernelReplayUpdateFunc, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiKernelReplaySubscribeUpdate");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(updateFunc, customData);
-}
-
-CUptiResult CUPTIAPI cuptiGetNumMetrics(uint32_t *numMetrics) {
-  using FuncPtr = CUptiResult (CUPTIAPI *)(uint32_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiGetNumMetrics");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(numMetrics);
-}
-
-CUptiResult CUPTIAPI cuptiEnumMetrics(size_t *arraySizeBytes,
-                                      CUpti_MetricID *metricArray) {
-  using FuncPtr = CUptiResult (CUPTIAPI *)(size_t *, CUpti_MetricID *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiEnumMetrics");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(arraySizeBytes, metricArray);
-}
-
-CUptiResult CUPTIAPI cuptiDeviceGetNumMetrics(CUdevice device,
-                                              uint32_t *numMetrics) {
-  using FuncPtr = CUptiResult (CUPTIAPI *)(CUdevice, uint32_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiDeviceGetNumMetrics");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(device, numMetrics);
-}
-
-CUptiResult CUPTIAPI cuptiDeviceEnumMetrics(CUdevice device,
-                                            size_t *arraySizeBytes,
-                                            CUpti_MetricID *metricArray) {
-  using FuncPtr = CUptiResult (CUPTIAPI *)(CUdevice, size_t *, CUpti_MetricID *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiDeviceEnumMetrics");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(device, arraySizeBytes, metricArray);
-}
-
-CUptiResult CUPTIAPI cuptiMetricGetAttribute(CUpti_MetricID metric,
-                                             CUpti_MetricAttribute attrib,
-                                             size_t *valueSize,
-                                             void *value) {
-  using FuncPtr = CUptiResult (CUPTIAPI *)(CUpti_MetricID, CUpti_MetricAttribute, size_t *, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiMetricGetAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(metric, attrib, valueSize, value);
-}
-
-CUptiResult CUPTIAPI cuptiMetricGetIdFromName(CUdevice device,
-                                              const char *metricName,
-                                              CUpti_MetricID *metric) {
-  using FuncPtr = CUptiResult (CUPTIAPI *)(CUdevice, const char *, CUpti_MetricID *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiMetricGetIdFromName");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(device, metricName, metric);
-}
-
-CUptiResult CUPTIAPI cuptiMetricGetNumEvents(CUpti_MetricID metric,
-                                             uint32_t *numEvents) {
-  using FuncPtr = CUptiResult (CUPTIAPI *)(CUpti_MetricID, uint32_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiMetricGetNumEvents");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(metric, numEvents);
-}
-
-CUptiResult CUPTIAPI cuptiMetricEnumEvents(CUpti_MetricID metric,
-                                           size_t *eventIdArraySizeBytes,
-                                           CUpti_EventID *eventIdArray) {
-  using FuncPtr = CUptiResult (CUPTIAPI *)(CUpti_MetricID, size_t *, CUpti_EventID *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiMetricEnumEvents");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(metric, eventIdArraySizeBytes, eventIdArray);
-}
-
-CUptiResult CUPTIAPI cuptiMetricGetNumProperties(CUpti_MetricID metric,
-                                                 uint32_t *numProp) {
-  using FuncPtr = CUptiResult (CUPTIAPI *)(CUpti_MetricID, uint32_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiMetricGetNumProperties");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(metric, numProp);
-}
-
-CUptiResult CUPTIAPI cuptiMetricEnumProperties(CUpti_MetricID metric,
-                                               size_t *propIdArraySizeBytes,
-                                               CUpti_MetricPropertyID *propIdArray) {
-  using FuncPtr = CUptiResult (CUPTIAPI *)(CUpti_MetricID, size_t *, CUpti_MetricPropertyID *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiMetricEnumProperties");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(metric, propIdArraySizeBytes, propIdArray);
-}
-
-CUptiResult CUPTIAPI cuptiMetricGetRequiredEventGroupSets(CUcontext context,
-                                                          CUpti_MetricID metric,
-                                                          CUpti_EventGroupSets **eventGroupSets) {
-  using FuncPtr = CUptiResult (CUPTIAPI *)(CUcontext, CUpti_MetricID, CUpti_EventGroupSets **);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiMetricGetRequiredEventGroupSets");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(context, metric, eventGroupSets);
-}
-
-CUptiResult CUPTIAPI cuptiMetricCreateEventGroupSets(CUcontext context,
-                                                     size_t metricIdArraySizeBytes,
-                                                     CUpti_MetricID *metricIdArray,
-                                                     CUpti_EventGroupSets **eventGroupPasses) {
-  using FuncPtr = CUptiResult (CUPTIAPI *)(CUcontext, size_t, CUpti_MetricID *, CUpti_EventGroupSets **);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiMetricCreateEventGroupSets");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(context, metricIdArraySizeBytes, metricIdArray, eventGroupPasses);
-}
-
-CUptiResult CUPTIAPI cuptiMetricGetValue(CUdevice device,
-                                         CUpti_MetricID metric,
-                                         size_t eventIdArraySizeBytes,
-                                         CUpti_EventID *eventIdArray,
-                                         size_t eventValueArraySizeBytes,
-                                         uint64_t *eventValueArray,
-                                         uint64_t timeDuration,
-                                         CUpti_MetricValue *metricValue) {
-  using FuncPtr = CUptiResult (CUPTIAPI *)(CUdevice, CUpti_MetricID, size_t, CUpti_EventID *, size_t, uint64_t *, uint64_t, CUpti_MetricValue *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiMetricGetValue");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(device, metric, eventIdArraySizeBytes, eventIdArray, eventValueArraySizeBytes, eventValueArray, timeDuration, metricValue);
-}
-
-CUptiResult CUPTIAPI cuptiMetricGetValue2(CUpti_MetricID metric,
-                                          size_t eventIdArraySizeBytes,
-                                          CUpti_EventID *eventIdArray,
-                                          size_t eventValueArraySizeBytes,
-                                          uint64_t *eventValueArray,
-                                          size_t propIdArraySizeBytes,
-                                          CUpti_MetricPropertyID *propIdArray,
-                                          size_t propValueArraySizeBytes,
-                                          uint64_t *propValueArray,
-                                          CUpti_MetricValue *metricValue) {
-  using FuncPtr = CUptiResult (CUPTIAPI *)(CUpti_MetricID, size_t, CUpti_EventID *, size_t, uint64_t *, size_t, CUpti_MetricPropertyID *, size_t, uint64_t *, CUpti_MetricValue *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiMetricGetValue2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(metric, eventIdArraySizeBytes, eventIdArray, eventValueArraySizeBytes, eventValueArray, propIdArraySizeBytes, propIdArray, propValueArraySizeBytes, propValueArray, metricValue);
-}
-
-CUptiResult CUPTIAPI cuptiGetTimestamp(uint64_t *timestamp) {
-  using FuncPtr = CUptiResult (CUPTIAPI *)(uint64_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiGetTimestamp");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(timestamp);
-}
-
-CUptiResult CUPTIAPI cuptiGetContextId(CUcontext context, uint32_t *contextId) {
-  using FuncPtr = CUptiResult (CUPTIAPI *)(CUcontext, uint32_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiGetContextId");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(context, contextId);
-}
-
-CUptiResult CUPTIAPI cuptiGetStreamId(CUcontext context, CUstream stream, uint32_t *streamId) {
-  using FuncPtr = CUptiResult (CUPTIAPI *)(CUcontext, CUstream, uint32_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiGetStreamId");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(context, stream, streamId);
-}
-
-CUptiResult CUPTIAPI cuptiGetStreamIdEx(CUcontext context, CUstream stream, uint8_t perThreadStream, uint32_t *streamId) {
-  using FuncPtr = CUptiResult (CUPTIAPI *)(CUcontext, CUstream, uint8_t, uint32_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiGetStreamIdEx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(context, stream, perThreadStream, streamId);
-}
-
-CUptiResult CUPTIAPI cuptiGetDeviceId(CUcontext context, uint32_t *deviceId) {
-  using FuncPtr = CUptiResult (CUPTIAPI *)(CUcontext, uint32_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiGetDeviceId");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(context, deviceId);
-}
-
-CUptiResult CUPTIAPI cuptiGetGraphNodeId(CUgraphNode node, uint64_t *nodeId) {
-  using FuncPtr = CUptiResult (CUPTIAPI *)(CUgraphNode, uint64_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiGetGraphNodeId");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(node, nodeId);
-}
-
-CUptiResult CUPTIAPI cuptiGetGraphId(CUgraph graph, uint32_t *pId) {
-  using FuncPtr = CUptiResult (CUPTIAPI *)(CUgraph, uint32_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiGetGraphId");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(graph, pId);
-}
-
-CUptiResult CUPTIAPI cuptiActivityEnable(CUpti_ActivityKind kind) {
-  using FuncPtr = CUptiResult (CUPTIAPI *)(CUpti_ActivityKind);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiActivityEnable");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(kind);
-}
-
-CUptiResult CUPTIAPI cuptiActivityEnableAndDump(CUpti_ActivityKind kind) {
-  using FuncPtr = CUptiResult (CUPTIAPI *)(CUpti_ActivityKind);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiActivityEnableAndDump");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(kind);
-}
-
-CUptiResult CUPTIAPI cuptiActivityDisable(CUpti_ActivityKind kind) {
-  using FuncPtr = CUptiResult (CUPTIAPI *)(CUpti_ActivityKind);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiActivityDisable");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(kind);
-}
-
-CUptiResult CUPTIAPI cuptiActivityEnableContext(CUcontext context, CUpti_ActivityKind kind) {
-  using FuncPtr = CUptiResult (CUPTIAPI *)(CUcontext, CUpti_ActivityKind);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiActivityEnableContext");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(context, kind);
-}
-
-CUptiResult CUPTIAPI cuptiActivityDisableContext(CUcontext context, CUpti_ActivityKind kind) {
-  using FuncPtr = CUptiResult (CUPTIAPI *)(CUcontext, CUpti_ActivityKind);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiActivityDisableContext");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(context, kind);
-}
-
-CUptiResult CUPTIAPI cuptiActivityGetNumDroppedRecords(CUcontext context, uint32_t streamId,
-                                                       size_t *dropped) {
-  using FuncPtr = CUptiResult (CUPTIAPI *)(CUcontext, uint32_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiActivityGetNumDroppedRecords");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(context, streamId, dropped);
-}
-
-CUptiResult CUPTIAPI cuptiActivityGetNextRecord(uint8_t* buffer, size_t validBufferSizeBytes,
-                                                CUpti_Activity **record) {
-  using FuncPtr = CUptiResult (CUPTIAPI *)(uint8_t *, size_t, CUpti_Activity **);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiActivityGetNextRecord");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(buffer, validBufferSizeBytes, record);
-}
-
-CUptiResult CUPTIAPI cuptiActivityRegisterCallbacks(CUpti_BuffersCallbackRequestFunc funcBufferRequested,
-        CUpti_BuffersCallbackCompleteFunc funcBufferCompleted) {
-  using FuncPtr = CUptiResult (CUPTIAPI *)(CUpti_BuffersCallbackRequestFunc, CUpti_BuffersCallbackCompleteFunc);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiActivityRegisterCallbacks");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(funcBufferRequested, funcBufferCompleted);
-}
-
-CUptiResult CUPTIAPI cuptiActivityFlush(CUcontext context, uint32_t streamId, uint32_t flag) {
-  using FuncPtr = CUptiResult (CUPTIAPI *)(CUcontext, uint32_t, uint32_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiActivityFlush");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(context, streamId, flag);
-}
-
-CUptiResult CUPTIAPI cuptiActivityFlushAll(uint32_t flag) {
-  using FuncPtr = CUptiResult (CUPTIAPI *)(uint32_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiActivityFlushAll");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(flag);
-}
-
-CUptiResult CUPTIAPI cuptiActivityGetAttribute(CUpti_ActivityAttribute attr,
-        size_t *valueSize, void* value) {
-  using FuncPtr = CUptiResult (CUPTIAPI *)(CUpti_ActivityAttribute, size_t *, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiActivityGetAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(attr, valueSize, value);
-}
-
-CUptiResult CUPTIAPI cuptiActivitySetAttribute(CUpti_ActivityAttribute attr,
-        size_t *valueSize, void* value) {
-  using FuncPtr = CUptiResult (CUPTIAPI *)(CUpti_ActivityAttribute, size_t *, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiActivitySetAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(attr, valueSize, value);
-}
-
-CUptiResult CUPTIAPI cuptiActivityConfigureUnifiedMemoryCounter(CUpti_ActivityUnifiedMemoryCounterConfig *config, uint32_t count) {
-  using FuncPtr = CUptiResult (CUPTIAPI *)(CUpti_ActivityUnifiedMemoryCounterConfig *, uint32_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiActivityConfigureUnifiedMemoryCounter");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(config, count);
-}
-
-CUptiResult CUPTIAPI cuptiGetAutoBoostState(CUcontext context, CUpti_ActivityAutoBoostState *state) {
-  using FuncPtr = CUptiResult (CUPTIAPI *)(CUcontext, CUpti_ActivityAutoBoostState *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiGetAutoBoostState");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(context, state);
-}
-
-CUptiResult CUPTIAPI cuptiActivityConfigurePCSampling(CUcontext ctx, CUpti_ActivityPCSamplingConfig *config) {
-  using FuncPtr = CUptiResult (CUPTIAPI *)(CUcontext, CUpti_ActivityPCSamplingConfig *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiActivityConfigurePCSampling");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(ctx, config);
-}
-
-CUptiResult CUPTIAPI cuptiGetLastError(void) {
-  using FuncPtr = CUptiResult (CUPTIAPI *)();
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiGetLastError");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr();
-}
-
-CUptiResult CUPTIAPI cuptiSetThreadIdType(CUpti_ActivityThreadIdType type) {
-  using FuncPtr = CUptiResult (CUPTIAPI *)(CUpti_ActivityThreadIdType);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiSetThreadIdType");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(type);
-}
-
-CUptiResult CUPTIAPI cuptiGetThreadIdType(CUpti_ActivityThreadIdType *type) {
-  using FuncPtr = CUptiResult (CUPTIAPI *)(CUpti_ActivityThreadIdType *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiGetThreadIdType");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(type);
-}
-
-CUptiResult CUPTIAPI cuptiComputeCapabilitySupported(int major, int minor, int *support) {
-  using FuncPtr = CUptiResult (CUPTIAPI *)(int, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiComputeCapabilitySupported");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(major, minor, support);
-}
-
-CUptiResult CUPTIAPI cuptiDeviceSupported(CUdevice dev, int *support) {
-  using FuncPtr = CUptiResult (CUPTIAPI *)(CUdevice, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiDeviceSupported");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dev, support);
-}
-
-CUptiResult CUPTIAPI cuptiDeviceVirtualizationMode(CUdevice dev, CUpti_DeviceVirtualizationMode *mode) {
-  using FuncPtr = CUptiResult (CUPTIAPI *)(CUdevice, CUpti_DeviceVirtualizationMode *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiDeviceVirtualizationMode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dev, mode);
-}
-
-CUptiResult CUPTIAPI cuptiFinalize(void) {
-  using FuncPtr = CUptiResult (CUPTIAPI *)();
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiFinalize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr();
-}
-
-CUptiResult CUPTIAPI cuptiActivityPushExternalCorrelationId(CUpti_ExternalCorrelationKind kind, uint64_t id) {
-  using FuncPtr = CUptiResult (CUPTIAPI *)(CUpti_ExternalCorrelationKind, uint64_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiActivityPushExternalCorrelationId");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(kind, id);
-}
-
-CUptiResult CUPTIAPI cuptiActivityPopExternalCorrelationId(CUpti_ExternalCorrelationKind kind, uint64_t *lastId) {
-  using FuncPtr = CUptiResult (CUPTIAPI *)(CUpti_ExternalCorrelationKind, uint64_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiActivityPopExternalCorrelationId");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(kind, lastId);
-}
-
-CUptiResult CUPTIAPI cuptiActivityEnableLatencyTimestamps(uint8_t enable) {
-  using FuncPtr = CUptiResult (CUPTIAPI *)(uint8_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiActivityEnableLatencyTimestamps");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(enable);
-}
-
-CUptiResult CUPTIAPI cuptiActivityFlushPeriod(uint32_t time) {
-  using FuncPtr = CUptiResult (CUPTIAPI *)(uint32_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiActivityFlushPeriod");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(time);
-}
-
-CUptiResult CUPTIAPI cuptiActivityEnableLaunchAttributes(uint8_t enable) {
-  using FuncPtr = CUptiResult (CUPTIAPI *)(uint8_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiActivityEnableLaunchAttributes");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(enable);
-}
-
-CUptiResult CUPTIAPI cuptiActivityRegisterTimestampCallback(CUpti_TimestampCallbackFunc funcTimestamp) {
-  using FuncPtr = CUptiResult (CUPTIAPI *)(CUpti_TimestampCallbackFunc);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiActivityRegisterTimestampCallback");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(funcTimestamp);
-}
-
-}  // extern "C"
diff --git a/third_party/xla/third_party/tsl/tsl/cuda/cupti_9_0.inc b/third_party/xla/third_party/tsl/tsl/cuda/cupti_9_0.inc
deleted file mode 100644
index dac0c7919342bd..00000000000000
--- a/third_party/xla/third_party/tsl/tsl/cuda/cupti_9_0.inc
+++ /dev/null
@@ -1,763 +0,0 @@
-// Auto-generated, do not edit.
-
-extern "C" {
-
-CUptiResult CUPTIAPI cuptiGetResultString(CUptiResult result,
-                                          const char **str) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUptiResult, const char **);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiGetResultString");
-  if (!func_ptr) {
-    if (str) {
-      *str = "CUPTI could not be loaded or symbol could not be found.";
-    }
-    return GetSymbolNotFoundError();
-  }
-  return func_ptr(result, str);
-}
-
-CUptiResult CUPTIAPI cuptiGetVersion(uint32_t *version) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(uint32_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiGetVersion");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(version);
-}
-
-CUptiResult CUPTIAPI cuptiSupportedDomains(size_t *domainCount,
-                                           CUpti_DomainTable *domainTable) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(size_t *, CUpti_DomainTable *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiSupportedDomains");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(domainCount, domainTable);
-}
-
-CUptiResult CUPTIAPI cuptiSubscribe(CUpti_SubscriberHandle *subscriber,
-                                    CUpti_CallbackFunc callback,
-                                    void *userdata) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUpti_SubscriberHandle *,
-                                          CUpti_CallbackFunc, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiSubscribe");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(subscriber, callback, userdata);
-}
-
-CUptiResult CUPTIAPI cuptiUnsubscribe(CUpti_SubscriberHandle subscriber) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUpti_SubscriberHandle);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiUnsubscribe");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(subscriber);
-}
-
-CUptiResult CUPTIAPI cuptiGetCallbackState(uint32_t *enable,
-                                           CUpti_SubscriberHandle subscriber,
-                                           CUpti_CallbackDomain domain,
-                                           CUpti_CallbackId cbid) {
-  using FuncPtr =
-      CUptiResult(CUPTIAPI *)(uint32_t *, CUpti_SubscriberHandle,
-                              CUpti_CallbackDomain, CUpti_CallbackId);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiGetCallbackState");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(enable, subscriber, domain, cbid);
-}
-
-CUptiResult CUPTIAPI cuptiEnableCallback(uint32_t enable,
-                                         CUpti_SubscriberHandle subscriber,
-                                         CUpti_CallbackDomain domain,
-                                         CUpti_CallbackId cbid) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(
-      uint32_t, CUpti_SubscriberHandle, CUpti_CallbackDomain, CUpti_CallbackId);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiEnableCallback");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(enable, subscriber, domain, cbid);
-}
-
-CUptiResult CUPTIAPI cuptiEnableDomain(uint32_t enable,
-                                       CUpti_SubscriberHandle subscriber,
-                                       CUpti_CallbackDomain domain) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(uint32_t, CUpti_SubscriberHandle,
-                                          CUpti_CallbackDomain);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiEnableDomain");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(enable, subscriber, domain);
-}
-
-CUptiResult CUPTIAPI cuptiEnableAllDomains(uint32_t enable,
-                                           CUpti_SubscriberHandle subscriber) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(uint32_t, CUpti_SubscriberHandle);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiEnableAllDomains");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(enable, subscriber);
-}
-
-CUptiResult CUPTIAPI cuptiGetCallbackName(CUpti_CallbackDomain domain,
-                                          uint32_t cbid, const char **name) {
-  using FuncPtr =
-      CUptiResult(CUPTIAPI *)(CUpti_CallbackDomain, uint32_t, const char **);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiGetCallbackName");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(domain, cbid, name);
-}
-
-CUptiResult CUPTIAPI
-cuptiSetEventCollectionMode(CUcontext context, CUpti_EventCollectionMode mode) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUcontext, CUpti_EventCollectionMode);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiSetEventCollectionMode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(context, mode);
-}
-
-CUptiResult CUPTIAPI cuptiDeviceGetAttribute(CUdevice device,
-                                             CUpti_DeviceAttribute attrib,
-                                             size_t *valueSize, void *value) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUdevice, CUpti_DeviceAttribute,
-                                          size_t *, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiDeviceGetAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(device, attrib, valueSize, value);
-}
-
-CUptiResult CUPTIAPI cuptiDeviceGetTimestamp(CUcontext context,
-                                             uint64_t *timestamp) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUcontext, uint64_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiDeviceGetTimestamp");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(context, timestamp);
-}
-
-CUptiResult CUPTIAPI cuptiDeviceGetNumEventDomains(CUdevice device,
-                                                   uint32_t *numDomains) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUdevice, uint32_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiDeviceGetNumEventDomains");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(device, numDomains);
-}
-
-CUptiResult CUPTIAPI cuptiDeviceEnumEventDomains(
-    CUdevice device, size_t *arraySizeBytes, CUpti_EventDomainID *domainArray) {
-  using FuncPtr =
-      CUptiResult(CUPTIAPI *)(CUdevice, size_t *, CUpti_EventDomainID *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiDeviceEnumEventDomains");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(device, arraySizeBytes, domainArray);
-}
-
-CUptiResult CUPTIAPI cuptiDeviceGetEventDomainAttribute(
-    CUdevice device, CUpti_EventDomainID eventDomain,
-    CUpti_EventDomainAttribute attrib, size_t *valueSize, void *value) {
-  using FuncPtr =
-      CUptiResult(CUPTIAPI *)(CUdevice, CUpti_EventDomainID,
-                              CUpti_EventDomainAttribute, size_t *, void *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cuptiDeviceGetEventDomainAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(device, eventDomain, attrib, valueSize, value);
-}
-
-CUptiResult CUPTIAPI cuptiGetNumEventDomains(uint32_t *numDomains) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(uint32_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiGetNumEventDomains");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(numDomains);
-}
-
-CUptiResult CUPTIAPI cuptiEnumEventDomains(size_t *arraySizeBytes,
-                                           CUpti_EventDomainID *domainArray) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(size_t *, CUpti_EventDomainID *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiEnumEventDomains");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(arraySizeBytes, domainArray);
-}
-
-CUptiResult CUPTIAPI cuptiEventDomainGetAttribute(
-    CUpti_EventDomainID eventDomain, CUpti_EventDomainAttribute attrib,
-    size_t *valueSize, void *value) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(
-      CUpti_EventDomainID, CUpti_EventDomainAttribute, size_t *, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiEventDomainGetAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(eventDomain, attrib, valueSize, value);
-}
-
-CUptiResult CUPTIAPI cuptiEventDomainGetNumEvents(
-    CUpti_EventDomainID eventDomain, uint32_t *numEvents) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUpti_EventDomainID, uint32_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiEventDomainGetNumEvents");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(eventDomain, numEvents);
-}
-
-CUptiResult CUPTIAPI cuptiEventDomainEnumEvents(CUpti_EventDomainID eventDomain,
-                                                size_t *arraySizeBytes,
-                                                CUpti_EventID *eventArray) {
-  using FuncPtr =
-      CUptiResult(CUPTIAPI *)(CUpti_EventDomainID, size_t *, CUpti_EventID *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiEventDomainEnumEvents");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(eventDomain, arraySizeBytes, eventArray);
-}
-
-CUptiResult CUPTIAPI cuptiEventGetAttribute(CUpti_EventID event,
-                                            CUpti_EventAttribute attrib,
-                                            size_t *valueSize, void *value) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUpti_EventID, CUpti_EventAttribute,
-                                          size_t *, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiEventGetAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(event, attrib, valueSize, value);
-}
-
-CUptiResult CUPTIAPI cuptiEventGetIdFromName(CUdevice device,
-                                             const char *eventName,
-                                             CUpti_EventID *event) {
-  using FuncPtr =
-      CUptiResult(CUPTIAPI *)(CUdevice, const char *, CUpti_EventID *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiEventGetIdFromName");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(device, eventName, event);
-}
-
-CUptiResult CUPTIAPI cuptiEventGroupCreate(CUcontext context,
-                                           CUpti_EventGroup *eventGroup,
-                                           uint32_t flags) {
-  using FuncPtr =
-      CUptiResult(CUPTIAPI *)(CUcontext, CUpti_EventGroup *, uint32_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiEventGroupCreate");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(context, eventGroup, flags);
-}
-
-CUptiResult CUPTIAPI cuptiEventGroupDestroy(CUpti_EventGroup eventGroup) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUpti_EventGroup);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiEventGroupDestroy");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(eventGroup);
-}
-
-CUptiResult CUPTIAPI cuptiEventGroupGetAttribute(
-    CUpti_EventGroup eventGroup, CUpti_EventGroupAttribute attrib,
-    size_t *valueSize, void *value) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(
-      CUpti_EventGroup, CUpti_EventGroupAttribute, size_t *, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiEventGroupGetAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(eventGroup, attrib, valueSize, value);
-}
-
-CUptiResult CUPTIAPI cuptiEventGroupSetAttribute(
-    CUpti_EventGroup eventGroup, CUpti_EventGroupAttribute attrib,
-    size_t valueSize, void *value) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(
-      CUpti_EventGroup, CUpti_EventGroupAttribute, size_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiEventGroupSetAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(eventGroup, attrib, valueSize, value);
-}
-
-CUptiResult CUPTIAPI cuptiEventGroupAddEvent(CUpti_EventGroup eventGroup,
-                                             CUpti_EventID event) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUpti_EventGroup, CUpti_EventID);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiEventGroupAddEvent");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(eventGroup, event);
-}
-
-CUptiResult CUPTIAPI cuptiEventGroupRemoveEvent(CUpti_EventGroup eventGroup,
-                                                CUpti_EventID event) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUpti_EventGroup, CUpti_EventID);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiEventGroupRemoveEvent");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(eventGroup, event);
-}
-
-CUptiResult CUPTIAPI
-cuptiEventGroupRemoveAllEvents(CUpti_EventGroup eventGroup) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUpti_EventGroup);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiEventGroupRemoveAllEvents");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(eventGroup);
-}
-
-CUptiResult CUPTIAPI
-cuptiEventGroupResetAllEvents(CUpti_EventGroup eventGroup) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUpti_EventGroup);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiEventGroupResetAllEvents");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(eventGroup);
-}
-
-CUptiResult CUPTIAPI cuptiEventGroupEnable(CUpti_EventGroup eventGroup) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUpti_EventGroup);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiEventGroupEnable");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(eventGroup);
-}
-
-CUptiResult CUPTIAPI cuptiEventGroupDisable(CUpti_EventGroup eventGroup) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUpti_EventGroup);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiEventGroupDisable");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(eventGroup);
-}
-
-CUptiResult CUPTIAPI cuptiEventGroupReadEvent(CUpti_EventGroup eventGroup,
-                                              CUpti_ReadEventFlags flags,
-                                              CUpti_EventID event,
-                                              size_t *eventValueBufferSizeBytes,
-                                              uint64_t *eventValueBuffer) {
-  using FuncPtr =
-      CUptiResult(CUPTIAPI *)(CUpti_EventGroup, CUpti_ReadEventFlags,
-                              CUpti_EventID, size_t *, uint64_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiEventGroupReadEvent");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(eventGroup, flags, event, eventValueBufferSizeBytes,
-                  eventValueBuffer);
-}
-
-CUptiResult CUPTIAPI cuptiEventGroupReadAllEvents(
-    CUpti_EventGroup eventGroup, CUpti_ReadEventFlags flags,
-    size_t *eventValueBufferSizeBytes, uint64_t *eventValueBuffer,
-    size_t *eventIdArraySizeBytes, CUpti_EventID *eventIdArray,
-    size_t *numEventIdsRead) {
-  using FuncPtr =
-      CUptiResult(CUPTIAPI *)(CUpti_EventGroup, CUpti_ReadEventFlags, size_t *,
-                              uint64_t *, size_t *, CUpti_EventID *, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiEventGroupReadAllEvents");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(eventGroup, flags, eventValueBufferSizeBytes,
-                  eventValueBuffer, eventIdArraySizeBytes, eventIdArray,
-                  numEventIdsRead);
-}
-
-CUptiResult CUPTIAPI cuptiEventGroupSetsCreate(
-    CUcontext context, size_t eventIdArraySizeBytes,
-    CUpti_EventID *eventIdArray, CUpti_EventGroupSets **eventGroupPasses) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUcontext, size_t, CUpti_EventID *,
-                                          CUpti_EventGroupSets **);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiEventGroupSetsCreate");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(context, eventIdArraySizeBytes, eventIdArray,
-                  eventGroupPasses);
-}
-
-CUptiResult CUPTIAPI
-cuptiEventGroupSetsDestroy(CUpti_EventGroupSets *eventGroupSets) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUpti_EventGroupSets *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiEventGroupSetsDestroy");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(eventGroupSets);
-}
-
-CUptiResult CUPTIAPI
-cuptiEventGroupSetEnable(CUpti_EventGroupSet *eventGroupSet) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUpti_EventGroupSet *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiEventGroupSetEnable");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(eventGroupSet);
-}
-
-CUptiResult CUPTIAPI
-cuptiEventGroupSetDisable(CUpti_EventGroupSet *eventGroupSet) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUpti_EventGroupSet *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiEventGroupSetDisable");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(eventGroupSet);
-}
-
-CUptiResult CUPTIAPI cuptiEnableKernelReplayMode(CUcontext context) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUcontext);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiEnableKernelReplayMode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(context);
-}
-
-CUptiResult CUPTIAPI cuptiDisableKernelReplayMode(CUcontext context) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUcontext);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiDisableKernelReplayMode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(context);
-}
-
-CUptiResult CUPTIAPI cuptiKernelReplaySubscribeUpdate(
-    CUpti_KernelReplayUpdateFunc updateFunc, void *customData) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUpti_KernelReplayUpdateFunc, void *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cuptiKernelReplaySubscribeUpdate");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(updateFunc, customData);
-}
-
-CUptiResult CUPTIAPI cuptiGetNumMetrics(uint32_t *numMetrics) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(uint32_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiGetNumMetrics");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(numMetrics);
-}
-
-CUptiResult CUPTIAPI cuptiEnumMetrics(size_t *arraySizeBytes,
-                                      CUpti_MetricID *metricArray) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(size_t *, CUpti_MetricID *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiEnumMetrics");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(arraySizeBytes, metricArray);
-}
-
-CUptiResult CUPTIAPI cuptiDeviceGetNumMetrics(CUdevice device,
-                                              uint32_t *numMetrics) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUdevice, uint32_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiDeviceGetNumMetrics");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(device, numMetrics);
-}
-
-CUptiResult CUPTIAPI cuptiDeviceEnumMetrics(CUdevice device,
-                                            size_t *arraySizeBytes,
-                                            CUpti_MetricID *metricArray) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUdevice, size_t *, CUpti_MetricID *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiDeviceEnumMetrics");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(device, arraySizeBytes, metricArray);
-}
-
-CUptiResult CUPTIAPI cuptiMetricGetAttribute(CUpti_MetricID metric,
-                                             CUpti_MetricAttribute attrib,
-                                             size_t *valueSize, void *value) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUpti_MetricID, CUpti_MetricAttribute,
-                                          size_t *, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiMetricGetAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(metric, attrib, valueSize, value);
-}
-
-CUptiResult CUPTIAPI cuptiMetricGetIdFromName(CUdevice device,
-                                              const char *metricName,
-                                              CUpti_MetricID *metric) {
-  using FuncPtr =
-      CUptiResult(CUPTIAPI *)(CUdevice, const char *, CUpti_MetricID *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiMetricGetIdFromName");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(device, metricName, metric);
-}
-
-CUptiResult CUPTIAPI cuptiMetricGetNumEvents(CUpti_MetricID metric,
-                                             uint32_t *numEvents) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUpti_MetricID, uint32_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiMetricGetNumEvents");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(metric, numEvents);
-}
-
-CUptiResult CUPTIAPI cuptiMetricEnumEvents(CUpti_MetricID metric,
-                                           size_t *eventIdArraySizeBytes,
-                                           CUpti_EventID *eventIdArray) {
-  using FuncPtr =
-      CUptiResult(CUPTIAPI *)(CUpti_MetricID, size_t *, CUpti_EventID *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiMetricEnumEvents");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(metric, eventIdArraySizeBytes, eventIdArray);
-}
-
-CUptiResult CUPTIAPI cuptiMetricGetNumProperties(CUpti_MetricID metric,
-                                                 uint32_t *numProp) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUpti_MetricID, uint32_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiMetricGetNumProperties");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(metric, numProp);
-}
-
-CUptiResult CUPTIAPI
-cuptiMetricEnumProperties(CUpti_MetricID metric, size_t *propIdArraySizeBytes,
-                          CUpti_MetricPropertyID *propIdArray) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUpti_MetricID, size_t *,
-                                          CUpti_MetricPropertyID *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiMetricEnumProperties");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(metric, propIdArraySizeBytes, propIdArray);
-}
-
-CUptiResult CUPTIAPI
-cuptiMetricGetRequiredEventGroupSets(CUcontext context, CUpti_MetricID metric,
-                                     CUpti_EventGroupSets **eventGroupSets) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUcontext, CUpti_MetricID,
-                                          CUpti_EventGroupSets **);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cuptiMetricGetRequiredEventGroupSets");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(context, metric, eventGroupSets);
-}
-
-CUptiResult CUPTIAPI cuptiMetricCreateEventGroupSets(
-    CUcontext context, size_t metricIdArraySizeBytes,
-    CUpti_MetricID *metricIdArray, CUpti_EventGroupSets **eventGroupPasses) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUcontext, size_t, CUpti_MetricID *,
-                                          CUpti_EventGroupSets **);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiMetricCreateEventGroupSets");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(context, metricIdArraySizeBytes, metricIdArray,
-                  eventGroupPasses);
-}
-
-CUptiResult CUPTIAPI cuptiMetricGetValue(CUdevice device, CUpti_MetricID metric,
-                                         size_t eventIdArraySizeBytes,
-                                         CUpti_EventID *eventIdArray,
-                                         size_t eventValueArraySizeBytes,
-                                         uint64_t *eventValueArray,
-                                         uint64_t timeDuration,
-                                         CUpti_MetricValue *metricValue) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUdevice, CUpti_MetricID, size_t,
-                                          CUpti_EventID *, size_t, uint64_t *,
-                                          uint64_t, CUpti_MetricValue *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiMetricGetValue");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(device, metric, eventIdArraySizeBytes, eventIdArray,
-                  eventValueArraySizeBytes, eventValueArray, timeDuration,
-                  metricValue);
-}
-
-CUptiResult CUPTIAPI cuptiMetricGetValue2(
-    CUpti_MetricID metric, size_t eventIdArraySizeBytes,
-    CUpti_EventID *eventIdArray, size_t eventValueArraySizeBytes,
-    uint64_t *eventValueArray, size_t propIdArraySizeBytes,
-    CUpti_MetricPropertyID *propIdArray, size_t propValueArraySizeBytes,
-    uint64_t *propValueArray, CUpti_MetricValue *metricValue) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(
-      CUpti_MetricID, size_t, CUpti_EventID *, size_t, uint64_t *, size_t,
-      CUpti_MetricPropertyID *, size_t, uint64_t *, CUpti_MetricValue *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiMetricGetValue2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(metric, eventIdArraySizeBytes, eventIdArray,
-                  eventValueArraySizeBytes, eventValueArray,
-                  propIdArraySizeBytes, propIdArray, propValueArraySizeBytes,
-                  propValueArray, metricValue);
-}
-
-CUptiResult CUPTIAPI cuptiGetTimestamp(uint64_t *timestamp) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(uint64_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiGetTimestamp");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(timestamp);
-}
-
-CUptiResult CUPTIAPI cuptiGetContextId(CUcontext context, uint32_t *contextId) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUcontext, uint32_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiGetContextId");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(context, contextId);
-}
-
-CUptiResult CUPTIAPI cuptiGetStreamId(CUcontext context, CUstream stream,
-                                      uint32_t *streamId) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUcontext, CUstream, uint32_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiGetStreamId");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(context, stream, streamId);
-}
-
-CUptiResult CUPTIAPI cuptiGetStreamIdEx(CUcontext context, CUstream stream,
-                                        uint8_t perThreadStream,
-                                        uint32_t *streamId) {
-  using FuncPtr =
-      CUptiResult(CUPTIAPI *)(CUcontext, CUstream, uint8_t, uint32_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiGetStreamIdEx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(context, stream, perThreadStream, streamId);
-}
-
-CUptiResult CUPTIAPI cuptiGetDeviceId(CUcontext context, uint32_t *deviceId) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUcontext, uint32_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiGetDeviceId");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(context, deviceId);
-}
-
-CUptiResult CUPTIAPI cuptiActivityEnable(CUpti_ActivityKind kind) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUpti_ActivityKind);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiActivityEnable");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(kind);
-}
-
-CUptiResult CUPTIAPI cuptiActivityDisable(CUpti_ActivityKind kind) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUpti_ActivityKind);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiActivityDisable");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(kind);
-}
-
-CUptiResult CUPTIAPI cuptiActivityEnableContext(CUcontext context,
-                                                CUpti_ActivityKind kind) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUcontext, CUpti_ActivityKind);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiActivityEnableContext");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(context, kind);
-}
-
-CUptiResult CUPTIAPI cuptiActivityDisableContext(CUcontext context,
-                                                 CUpti_ActivityKind kind) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUcontext, CUpti_ActivityKind);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiActivityDisableContext");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(context, kind);
-}
-
-CUptiResult CUPTIAPI cuptiActivityGetNumDroppedRecords(CUcontext context,
-                                                       uint32_t streamId,
-                                                       size_t *dropped) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUcontext, uint32_t, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cuptiActivityGetNumDroppedRecords");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(context, streamId, dropped);
-}
-
-CUptiResult CUPTIAPI cuptiActivityGetNextRecord(uint8_t *buffer,
-                                                size_t validBufferSizeBytes,
-                                                CUpti_Activity **record) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(uint8_t *, size_t, CUpti_Activity **);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiActivityGetNextRecord");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(buffer, validBufferSizeBytes, record);
-}
-
-CUptiResult CUPTIAPI cuptiActivityRegisterCallbacks(
-    CUpti_BuffersCallbackRequestFunc funcBufferRequested,
-    CUpti_BuffersCallbackCompleteFunc funcBufferCompleted) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUpti_BuffersCallbackRequestFunc,
-                                          CUpti_BuffersCallbackCompleteFunc);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiActivityRegisterCallbacks");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(funcBufferRequested, funcBufferCompleted);
-}
-
-CUptiResult CUPTIAPI cuptiActivityFlush(CUcontext context, uint32_t streamId,
-                                        uint32_t flag) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUcontext, uint32_t, uint32_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiActivityFlush");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(context, streamId, flag);
-}
-
-CUptiResult CUPTIAPI cuptiActivityFlushAll(uint32_t flag) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(uint32_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiActivityFlushAll");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(flag);
-}
-
-CUptiResult CUPTIAPI cuptiActivityGetAttribute(CUpti_ActivityAttribute attr,
-                                               size_t *valueSize, void *value) {
-  using FuncPtr =
-      CUptiResult(CUPTIAPI *)(CUpti_ActivityAttribute, size_t *, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiActivityGetAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(attr, valueSize, value);
-}
-
-CUptiResult CUPTIAPI cuptiActivitySetAttribute(CUpti_ActivityAttribute attr,
-                                               size_t *valueSize, void *value) {
-  using FuncPtr =
-      CUptiResult(CUPTIAPI *)(CUpti_ActivityAttribute, size_t *, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiActivitySetAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(attr, valueSize, value);
-}
-
-CUptiResult CUPTIAPI cuptiActivityConfigureUnifiedMemoryCounter(
-    CUpti_ActivityUnifiedMemoryCounterConfig *config, uint32_t count) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(
-      CUpti_ActivityUnifiedMemoryCounterConfig *, uint32_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cuptiActivityConfigureUnifiedMemoryCounter");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(config, count);
-}
-
-CUptiResult CUPTIAPI
-cuptiGetAutoBoostState(CUcontext context, CUpti_ActivityAutoBoostState *state) {
-  using FuncPtr =
-      CUptiResult(CUPTIAPI *)(CUcontext, CUpti_ActivityAutoBoostState *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiGetAutoBoostState");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(context, state);
-}
-
-CUptiResult CUPTIAPI cuptiActivityConfigurePCSampling(
-    CUcontext ctx, CUpti_ActivityPCSamplingConfig *config) {
-  using FuncPtr =
-      CUptiResult(CUPTIAPI *)(CUcontext, CUpti_ActivityPCSamplingConfig *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cuptiActivityConfigurePCSampling");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(ctx, config);
-}
-
-CUptiResult CUPTIAPI cuptiGetLastError(void) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)();
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiGetLastError");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr();
-}
-
-CUptiResult CUPTIAPI cuptiSetThreadIdType(CUpti_ActivityThreadIdType type) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUpti_ActivityThreadIdType);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiSetThreadIdType");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(type);
-}
-
-CUptiResult CUPTIAPI cuptiGetThreadIdType(CUpti_ActivityThreadIdType *type) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUpti_ActivityThreadIdType *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiGetThreadIdType");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(type);
-}
-
-CUptiResult CUPTIAPI cuptiComputeCapabilitySupported(int major, int minor,
-                                                     int *support) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(int, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiComputeCapabilitySupported");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(major, minor, support);
-}
-
-CUptiResult CUPTIAPI cuptiDeviceSupported(CUdevice dev, int *support) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(CUdevice, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiDeviceSupported");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dev, support);
-}
-
-CUptiResult CUPTIAPI cuptiFinalize(void) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)();
-  static auto func_ptr = LoadSymbol<FuncPtr>("cuptiFinalize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr();
-}
-
-CUptiResult CUPTIAPI cuptiActivityPushExternalCorrelationId(
-    CUpti_ExternalCorrelationKind kind, uint64_t id) {
-  using FuncPtr =
-      CUptiResult(CUPTIAPI *)(CUpti_ExternalCorrelationKind, uint64_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cuptiActivityPushExternalCorrelationId");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(kind, id);
-}
-
-CUptiResult CUPTIAPI cuptiActivityPopExternalCorrelationId(
-    CUpti_ExternalCorrelationKind kind, uint64_t *lastId) {
-  using FuncPtr =
-      CUptiResult(CUPTIAPI *)(CUpti_ExternalCorrelationKind, uint64_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cuptiActivityPopExternalCorrelationId");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(kind, lastId);
-}
-
-CUptiResult CUPTIAPI cuptiActivityEnableLatencyTimestamps(uint8_t enable) {
-  using FuncPtr = CUptiResult(CUPTIAPI *)(uint8_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cuptiActivityEnableLatencyTimestamps");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(enable);
-}
-
-}  // extern "C"
diff --git a/third_party/xla/third_party/tsl/tsl/cuda/cupti_stub.cc b/third_party/xla/third_party/tsl/tsl/cuda/cupti_stub.cc
index 2a6e094383bfab..9e632010d83a7a 100644
--- a/third_party/xla/third_party/tsl/tsl/cuda/cupti_stub.cc
+++ b/third_party/xla/third_party/tsl/tsl/cuda/cupti_stub.cc
@@ -35,35 +35,38 @@ void* GetDsoHandle() {
 #endif
 }
 
-template <typename T>
-T LoadSymbol(const char* symbol_name) {
+void* LoadSymbol(const char* symbol_name) {
   void* symbol = nullptr;
   if (auto handle = GetDsoHandle()) {
     tsl::Env::Default()
         ->GetSymbolFromLibrary(handle, symbol_name, &symbol)
         .IgnoreError();
   }
-  return reinterpret_cast<T>(symbol);
+  return symbol;
 }
 
-CUptiResult GetSymbolNotFoundError() { return CUPTI_ERROR_UNKNOWN; }
+const char* kSymbols[] = {
+#include "tsl/cuda/cupti.inc"
+};
+
+constexpr size_t kNumSymbols = sizeof(kSymbols) / sizeof(const char*);
+
 }  // namespace
 
-// For now we only need one stub implementation. We will need to generate
-// a new file when CUPTI breaks backwards compatibility (has not been the case
-// for quite a while) or if we want to use functionality introduced in a new
-// version.
-//
-// Calling a function that is not yet available in the loaded CUPTI version will
-// return CUPTI_ERROR_UNKNOWN.
-#if CUDA_VERSION < 10010
-#include "tsl/cuda/cupti_10_0.inc"
-#elif CUDA_VERSION < 10020
-#include "tsl/cuda/cupti_10_1.inc"
-#elif CUDA_VERSION < 11000
-#include "tsl/cuda/cupti_10_2.inc"
-#elif CUDA_VERSION < 12000
-#include "tsl/cuda/cupti_11_0.inc"
-#else
-#include "tsl/cuda/cupti_12_0.inc"
-#endif
+extern "C" {
+
+static CUptiResult GetSymbolNotFoundError() { return CUPTI_ERROR_UNKNOWN; }
+
+extern void* _cupti_tramp_table[];
+
+void _cupti_tramp_resolve(int i) {
+  CHECK_LE(0, i);
+  CHECK_LT(i, kNumSymbols);
+  void* p = LoadSymbol(kSymbols[i]);
+  if (!p) {
+    p = reinterpret_cast<void*>(&GetSymbolNotFoundError);
+  }
+  _cupti_tramp_table[i] = p;
+}
+
+}  // extern "C"
diff --git a/third_party/xla/third_party/tsl/tsl/cuda/curand_10_0.inc b/third_party/xla/third_party/tsl/tsl/cuda/curand_10_0.inc
deleted file mode 100644
index 3c4c8fed67ac11..00000000000000
--- a/third_party/xla/third_party/tsl/tsl/cuda/curand_10_0.inc
+++ /dev/null
@@ -1,268 +0,0 @@
-// Auto-generated, do not edit.
-
-extern "C" {
-
-curandStatus_t CURANDAPI curandCreateGenerator(curandGenerator_t *generator,
-                                               curandRngType_t rng_type) {
-  using FuncPtr =
-      curandStatus_t(CURANDAPI *)(curandGenerator_t *, curandRngType_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("curandCreateGenerator");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(generator, rng_type);
-}
-
-curandStatus_t CURANDAPI curandCreateGeneratorHost(curandGenerator_t *generator,
-                                                   curandRngType_t rng_type) {
-  using FuncPtr =
-      curandStatus_t(CURANDAPI *)(curandGenerator_t *, curandRngType_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("curandCreateGeneratorHost");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(generator, rng_type);
-}
-
-curandStatus_t CURANDAPI curandDestroyGenerator(curandGenerator_t generator) {
-  using FuncPtr = curandStatus_t(CURANDAPI *)(curandGenerator_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("curandDestroyGenerator");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(generator);
-}
-
-curandStatus_t CURANDAPI curandGetVersion(int *version) {
-  using FuncPtr = curandStatus_t(CURANDAPI *)(int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("curandGetVersion");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(version);
-}
-
-curandStatus_t CURANDAPI curandGetProperty(libraryPropertyType type,
-                                           int *value) {
-  using FuncPtr = curandStatus_t(CURANDAPI *)(libraryPropertyType, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("curandGetProperty");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(type, value);
-}
-
-curandStatus_t CURANDAPI curandSetStream(curandGenerator_t generator,
-                                         cudaStream_t stream) {
-  using FuncPtr = curandStatus_t(CURANDAPI *)(curandGenerator_t, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("curandSetStream");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(generator, stream);
-}
-
-curandStatus_t CURANDAPI curandSetPseudoRandomGeneratorSeed(
-    curandGenerator_t generator, unsigned long long seed) {
-  using FuncPtr =
-      curandStatus_t(CURANDAPI *)(curandGenerator_t, unsigned long long);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("curandSetPseudoRandomGeneratorSeed");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(generator, seed);
-}
-
-curandStatus_t CURANDAPI curandSetGeneratorOffset(curandGenerator_t generator,
-                                                  unsigned long long offset) {
-  using FuncPtr =
-      curandStatus_t(CURANDAPI *)(curandGenerator_t, unsigned long long);
-  static auto func_ptr = LoadSymbol<FuncPtr>("curandSetGeneratorOffset");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(generator, offset);
-}
-
-curandStatus_t CURANDAPI curandSetGeneratorOrdering(curandGenerator_t generator,
-                                                    curandOrdering_t order) {
-  using FuncPtr =
-      curandStatus_t(CURANDAPI *)(curandGenerator_t, curandOrdering_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("curandSetGeneratorOrdering");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(generator, order);
-}
-
-curandStatus_t CURANDAPI curandSetQuasiRandomGeneratorDimensions(
-    curandGenerator_t generator, unsigned int num_dimensions) {
-  using FuncPtr = curandStatus_t(CURANDAPI *)(curandGenerator_t, unsigned int);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("curandSetQuasiRandomGeneratorDimensions");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(generator, num_dimensions);
-}
-
-curandStatus_t CURANDAPI curandGenerate(curandGenerator_t generator,
-                                        unsigned int *outputPtr, size_t num) {
-  using FuncPtr =
-      curandStatus_t(CURANDAPI *)(curandGenerator_t, unsigned int *, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("curandGenerate");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(generator, outputPtr, num);
-}
-
-curandStatus_t CURANDAPI curandGenerateLongLong(curandGenerator_t generator,
-                                                unsigned long long *outputPtr,
-                                                size_t num) {
-  using FuncPtr = curandStatus_t(CURANDAPI *)(curandGenerator_t,
-                                              unsigned long long *, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("curandGenerateLongLong");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(generator, outputPtr, num);
-}
-
-curandStatus_t CURANDAPI curandGenerateUniform(curandGenerator_t generator,
-                                               float *outputPtr, size_t num) {
-  using FuncPtr =
-      curandStatus_t(CURANDAPI *)(curandGenerator_t, float *, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("curandGenerateUniform");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(generator, outputPtr, num);
-}
-
-curandStatus_t CURANDAPI curandGenerateUniformDouble(
-    curandGenerator_t generator, double *outputPtr, size_t num) {
-  using FuncPtr =
-      curandStatus_t(CURANDAPI *)(curandGenerator_t, double *, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("curandGenerateUniformDouble");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(generator, outputPtr, num);
-}
-
-curandStatus_t CURANDAPI curandGenerateNormal(curandGenerator_t generator,
-                                              float *outputPtr, size_t n,
-                                              float mean, float stddev) {
-  using FuncPtr = curandStatus_t(CURANDAPI *)(curandGenerator_t, float *,
-                                              size_t, float, float);
-  static auto func_ptr = LoadSymbol<FuncPtr>("curandGenerateNormal");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(generator, outputPtr, n, mean, stddev);
-}
-
-curandStatus_t CURANDAPI curandGenerateNormalDouble(curandGenerator_t generator,
-                                                    double *outputPtr, size_t n,
-                                                    double mean,
-                                                    double stddev) {
-  using FuncPtr = curandStatus_t(CURANDAPI *)(curandGenerator_t, double *,
-                                              size_t, double, double);
-  static auto func_ptr = LoadSymbol<FuncPtr>("curandGenerateNormalDouble");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(generator, outputPtr, n, mean, stddev);
-}
-
-curandStatus_t CURANDAPI curandGenerateLogNormal(curandGenerator_t generator,
-                                                 float *outputPtr, size_t n,
-                                                 float mean, float stddev) {
-  using FuncPtr = curandStatus_t(CURANDAPI *)(curandGenerator_t, float *,
-                                              size_t, float, float);
-  static auto func_ptr = LoadSymbol<FuncPtr>("curandGenerateLogNormal");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(generator, outputPtr, n, mean, stddev);
-}
-
-curandStatus_t CURANDAPI
-curandGenerateLogNormalDouble(curandGenerator_t generator, double *outputPtr,
-                              size_t n, double mean, double stddev) {
-  using FuncPtr = curandStatus_t(CURANDAPI *)(curandGenerator_t, double *,
-                                              size_t, double, double);
-  static auto func_ptr = LoadSymbol<FuncPtr>("curandGenerateLogNormalDouble");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(generator, outputPtr, n, mean, stddev);
-}
-
-curandStatus_t CURANDAPI curandCreatePoissonDistribution(
-    double lambda, curandDiscreteDistribution_t *discrete_distribution) {
-  using FuncPtr =
-      curandStatus_t(CURANDAPI *)(double, curandDiscreteDistribution_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("curandCreatePoissonDistribution");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(lambda, discrete_distribution);
-}
-
-curandStatus_t CURANDAPI
-curandDestroyDistribution(curandDiscreteDistribution_t discrete_distribution) {
-  using FuncPtr = curandStatus_t(CURANDAPI *)(curandDiscreteDistribution_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("curandDestroyDistribution");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(discrete_distribution);
-}
-
-curandStatus_t CURANDAPI curandGeneratePoisson(curandGenerator_t generator,
-                                               unsigned int *outputPtr,
-                                               size_t n, double lambda) {
-  using FuncPtr = curandStatus_t(CURANDAPI *)(curandGenerator_t, unsigned int *,
-                                              size_t, double);
-  static auto func_ptr = LoadSymbol<FuncPtr>("curandGeneratePoisson");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(generator, outputPtr, n, lambda);
-}
-
-curandStatus_t CURANDAPI curandGeneratePoissonMethod(
-    curandGenerator_t generator, unsigned int *outputPtr, size_t n,
-    double lambda, curandMethod_t method) {
-  using FuncPtr = curandStatus_t(CURANDAPI *)(curandGenerator_t, unsigned int *,
-                                              size_t, double, curandMethod_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("curandGeneratePoissonMethod");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(generator, outputPtr, n, lambda, method);
-}
-
-curandStatus_t CURANDAPI curandGenerateBinomial(curandGenerator_t generator,
-                                                unsigned int *outputPtr,
-                                                size_t num, unsigned int n,
-                                                double p) {
-  using FuncPtr = curandStatus_t(CURANDAPI *)(curandGenerator_t, unsigned int *,
-                                              size_t, unsigned int, double);
-  static auto func_ptr = LoadSymbol<FuncPtr>("curandGenerateBinomial");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(generator, outputPtr, num, n, p);
-}
-
-curandStatus_t CURANDAPI curandGenerateBinomialMethod(
-    curandGenerator_t generator, unsigned int *outputPtr, size_t num,
-    unsigned int n, double p, curandMethod_t method) {
-  using FuncPtr =
-      curandStatus_t(CURANDAPI *)(curandGenerator_t, unsigned int *, size_t,
-                                  unsigned int, double, curandMethod_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("curandGenerateBinomialMethod");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(generator, outputPtr, num, n, p, method);
-}
-
-curandStatus_t CURANDAPI curandGenerateSeeds(curandGenerator_t generator) {
-  using FuncPtr = curandStatus_t(CURANDAPI *)(curandGenerator_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("curandGenerateSeeds");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(generator);
-}
-
-curandStatus_t CURANDAPI curandGetDirectionVectors32(
-    curandDirectionVectors32_t *vectors[], curandDirectionVectorSet_t set) {
-  using FuncPtr = curandStatus_t(CURANDAPI *)(curandDirectionVectors32_t *[],
-                                              curandDirectionVectorSet_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("curandGetDirectionVectors32");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(vectors, set);
-}
-
-curandStatus_t CURANDAPI
-curandGetScrambleConstants32(unsigned int **constants) {
-  using FuncPtr = curandStatus_t(CURANDAPI *)(unsigned int **);
-  static auto func_ptr = LoadSymbol<FuncPtr>("curandGetScrambleConstants32");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(constants);
-}
-
-curandStatus_t CURANDAPI curandGetDirectionVectors64(
-    curandDirectionVectors64_t *vectors[], curandDirectionVectorSet_t set) {
-  using FuncPtr = curandStatus_t(CURANDAPI *)(curandDirectionVectors64_t *[],
-                                              curandDirectionVectorSet_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("curandGetDirectionVectors64");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(vectors, set);
-}
-
-curandStatus_t CURANDAPI
-curandGetScrambleConstants64(unsigned long long **constants) {
-  using FuncPtr = curandStatus_t(CURANDAPI *)(unsigned long long **);
-  static auto func_ptr = LoadSymbol<FuncPtr>("curandGetScrambleConstants64");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(constants);
-}
-
-}  // extern "C"
diff --git a/third_party/xla/third_party/tsl/tsl/cuda/curand_10_1.inc b/third_party/xla/third_party/tsl/tsl/cuda/curand_10_1.inc
deleted file mode 100644
index 3c4c8fed67ac11..00000000000000
--- a/third_party/xla/third_party/tsl/tsl/cuda/curand_10_1.inc
+++ /dev/null
@@ -1,268 +0,0 @@
-// Auto-generated, do not edit.
-
-extern "C" {
-
-curandStatus_t CURANDAPI curandCreateGenerator(curandGenerator_t *generator,
-                                               curandRngType_t rng_type) {
-  using FuncPtr =
-      curandStatus_t(CURANDAPI *)(curandGenerator_t *, curandRngType_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("curandCreateGenerator");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(generator, rng_type);
-}
-
-curandStatus_t CURANDAPI curandCreateGeneratorHost(curandGenerator_t *generator,
-                                                   curandRngType_t rng_type) {
-  using FuncPtr =
-      curandStatus_t(CURANDAPI *)(curandGenerator_t *, curandRngType_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("curandCreateGeneratorHost");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(generator, rng_type);
-}
-
-curandStatus_t CURANDAPI curandDestroyGenerator(curandGenerator_t generator) {
-  using FuncPtr = curandStatus_t(CURANDAPI *)(curandGenerator_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("curandDestroyGenerator");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(generator);
-}
-
-curandStatus_t CURANDAPI curandGetVersion(int *version) {
-  using FuncPtr = curandStatus_t(CURANDAPI *)(int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("curandGetVersion");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(version);
-}
-
-curandStatus_t CURANDAPI curandGetProperty(libraryPropertyType type,
-                                           int *value) {
-  using FuncPtr = curandStatus_t(CURANDAPI *)(libraryPropertyType, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("curandGetProperty");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(type, value);
-}
-
-curandStatus_t CURANDAPI curandSetStream(curandGenerator_t generator,
-                                         cudaStream_t stream) {
-  using FuncPtr = curandStatus_t(CURANDAPI *)(curandGenerator_t, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("curandSetStream");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(generator, stream);
-}
-
-curandStatus_t CURANDAPI curandSetPseudoRandomGeneratorSeed(
-    curandGenerator_t generator, unsigned long long seed) {
-  using FuncPtr =
-      curandStatus_t(CURANDAPI *)(curandGenerator_t, unsigned long long);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("curandSetPseudoRandomGeneratorSeed");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(generator, seed);
-}
-
-curandStatus_t CURANDAPI curandSetGeneratorOffset(curandGenerator_t generator,
-                                                  unsigned long long offset) {
-  using FuncPtr =
-      curandStatus_t(CURANDAPI *)(curandGenerator_t, unsigned long long);
-  static auto func_ptr = LoadSymbol<FuncPtr>("curandSetGeneratorOffset");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(generator, offset);
-}
-
-curandStatus_t CURANDAPI curandSetGeneratorOrdering(curandGenerator_t generator,
-                                                    curandOrdering_t order) {
-  using FuncPtr =
-      curandStatus_t(CURANDAPI *)(curandGenerator_t, curandOrdering_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("curandSetGeneratorOrdering");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(generator, order);
-}
-
-curandStatus_t CURANDAPI curandSetQuasiRandomGeneratorDimensions(
-    curandGenerator_t generator, unsigned int num_dimensions) {
-  using FuncPtr = curandStatus_t(CURANDAPI *)(curandGenerator_t, unsigned int);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("curandSetQuasiRandomGeneratorDimensions");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(generator, num_dimensions);
-}
-
-curandStatus_t CURANDAPI curandGenerate(curandGenerator_t generator,
-                                        unsigned int *outputPtr, size_t num) {
-  using FuncPtr =
-      curandStatus_t(CURANDAPI *)(curandGenerator_t, unsigned int *, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("curandGenerate");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(generator, outputPtr, num);
-}
-
-curandStatus_t CURANDAPI curandGenerateLongLong(curandGenerator_t generator,
-                                                unsigned long long *outputPtr,
-                                                size_t num) {
-  using FuncPtr = curandStatus_t(CURANDAPI *)(curandGenerator_t,
-                                              unsigned long long *, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("curandGenerateLongLong");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(generator, outputPtr, num);
-}
-
-curandStatus_t CURANDAPI curandGenerateUniform(curandGenerator_t generator,
-                                               float *outputPtr, size_t num) {
-  using FuncPtr =
-      curandStatus_t(CURANDAPI *)(curandGenerator_t, float *, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("curandGenerateUniform");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(generator, outputPtr, num);
-}
-
-curandStatus_t CURANDAPI curandGenerateUniformDouble(
-    curandGenerator_t generator, double *outputPtr, size_t num) {
-  using FuncPtr =
-      curandStatus_t(CURANDAPI *)(curandGenerator_t, double *, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("curandGenerateUniformDouble");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(generator, outputPtr, num);
-}
-
-curandStatus_t CURANDAPI curandGenerateNormal(curandGenerator_t generator,
-                                              float *outputPtr, size_t n,
-                                              float mean, float stddev) {
-  using FuncPtr = curandStatus_t(CURANDAPI *)(curandGenerator_t, float *,
-                                              size_t, float, float);
-  static auto func_ptr = LoadSymbol<FuncPtr>("curandGenerateNormal");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(generator, outputPtr, n, mean, stddev);
-}
-
-curandStatus_t CURANDAPI curandGenerateNormalDouble(curandGenerator_t generator,
-                                                    double *outputPtr, size_t n,
-                                                    double mean,
-                                                    double stddev) {
-  using FuncPtr = curandStatus_t(CURANDAPI *)(curandGenerator_t, double *,
-                                              size_t, double, double);
-  static auto func_ptr = LoadSymbol<FuncPtr>("curandGenerateNormalDouble");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(generator, outputPtr, n, mean, stddev);
-}
-
-curandStatus_t CURANDAPI curandGenerateLogNormal(curandGenerator_t generator,
-                                                 float *outputPtr, size_t n,
-                                                 float mean, float stddev) {
-  using FuncPtr = curandStatus_t(CURANDAPI *)(curandGenerator_t, float *,
-                                              size_t, float, float);
-  static auto func_ptr = LoadSymbol<FuncPtr>("curandGenerateLogNormal");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(generator, outputPtr, n, mean, stddev);
-}
-
-curandStatus_t CURANDAPI
-curandGenerateLogNormalDouble(curandGenerator_t generator, double *outputPtr,
-                              size_t n, double mean, double stddev) {
-  using FuncPtr = curandStatus_t(CURANDAPI *)(curandGenerator_t, double *,
-                                              size_t, double, double);
-  static auto func_ptr = LoadSymbol<FuncPtr>("curandGenerateLogNormalDouble");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(generator, outputPtr, n, mean, stddev);
-}
-
-curandStatus_t CURANDAPI curandCreatePoissonDistribution(
-    double lambda, curandDiscreteDistribution_t *discrete_distribution) {
-  using FuncPtr =
-      curandStatus_t(CURANDAPI *)(double, curandDiscreteDistribution_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("curandCreatePoissonDistribution");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(lambda, discrete_distribution);
-}
-
-curandStatus_t CURANDAPI
-curandDestroyDistribution(curandDiscreteDistribution_t discrete_distribution) {
-  using FuncPtr = curandStatus_t(CURANDAPI *)(curandDiscreteDistribution_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("curandDestroyDistribution");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(discrete_distribution);
-}
-
-curandStatus_t CURANDAPI curandGeneratePoisson(curandGenerator_t generator,
-                                               unsigned int *outputPtr,
-                                               size_t n, double lambda) {
-  using FuncPtr = curandStatus_t(CURANDAPI *)(curandGenerator_t, unsigned int *,
-                                              size_t, double);
-  static auto func_ptr = LoadSymbol<FuncPtr>("curandGeneratePoisson");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(generator, outputPtr, n, lambda);
-}
-
-curandStatus_t CURANDAPI curandGeneratePoissonMethod(
-    curandGenerator_t generator, unsigned int *outputPtr, size_t n,
-    double lambda, curandMethod_t method) {
-  using FuncPtr = curandStatus_t(CURANDAPI *)(curandGenerator_t, unsigned int *,
-                                              size_t, double, curandMethod_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("curandGeneratePoissonMethod");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(generator, outputPtr, n, lambda, method);
-}
-
-curandStatus_t CURANDAPI curandGenerateBinomial(curandGenerator_t generator,
-                                                unsigned int *outputPtr,
-                                                size_t num, unsigned int n,
-                                                double p) {
-  using FuncPtr = curandStatus_t(CURANDAPI *)(curandGenerator_t, unsigned int *,
-                                              size_t, unsigned int, double);
-  static auto func_ptr = LoadSymbol<FuncPtr>("curandGenerateBinomial");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(generator, outputPtr, num, n, p);
-}
-
-curandStatus_t CURANDAPI curandGenerateBinomialMethod(
-    curandGenerator_t generator, unsigned int *outputPtr, size_t num,
-    unsigned int n, double p, curandMethod_t method) {
-  using FuncPtr =
-      curandStatus_t(CURANDAPI *)(curandGenerator_t, unsigned int *, size_t,
-                                  unsigned int, double, curandMethod_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("curandGenerateBinomialMethod");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(generator, outputPtr, num, n, p, method);
-}
-
-curandStatus_t CURANDAPI curandGenerateSeeds(curandGenerator_t generator) {
-  using FuncPtr = curandStatus_t(CURANDAPI *)(curandGenerator_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("curandGenerateSeeds");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(generator);
-}
-
-curandStatus_t CURANDAPI curandGetDirectionVectors32(
-    curandDirectionVectors32_t *vectors[], curandDirectionVectorSet_t set) {
-  using FuncPtr = curandStatus_t(CURANDAPI *)(curandDirectionVectors32_t *[],
-                                              curandDirectionVectorSet_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("curandGetDirectionVectors32");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(vectors, set);
-}
-
-curandStatus_t CURANDAPI
-curandGetScrambleConstants32(unsigned int **constants) {
-  using FuncPtr = curandStatus_t(CURANDAPI *)(unsigned int **);
-  static auto func_ptr = LoadSymbol<FuncPtr>("curandGetScrambleConstants32");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(constants);
-}
-
-curandStatus_t CURANDAPI curandGetDirectionVectors64(
-    curandDirectionVectors64_t *vectors[], curandDirectionVectorSet_t set) {
-  using FuncPtr = curandStatus_t(CURANDAPI *)(curandDirectionVectors64_t *[],
-                                              curandDirectionVectorSet_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("curandGetDirectionVectors64");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(vectors, set);
-}
-
-curandStatus_t CURANDAPI
-curandGetScrambleConstants64(unsigned long long **constants) {
-  using FuncPtr = curandStatus_t(CURANDAPI *)(unsigned long long **);
-  static auto func_ptr = LoadSymbol<FuncPtr>("curandGetScrambleConstants64");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(constants);
-}
-
-}  // extern "C"
diff --git a/third_party/xla/third_party/tsl/tsl/cuda/curand_10_2.inc b/third_party/xla/third_party/tsl/tsl/cuda/curand_10_2.inc
deleted file mode 100644
index 3c4c8fed67ac11..00000000000000
--- a/third_party/xla/third_party/tsl/tsl/cuda/curand_10_2.inc
+++ /dev/null
@@ -1,268 +0,0 @@
-// Auto-generated, do not edit.
-
-extern "C" {
-
-curandStatus_t CURANDAPI curandCreateGenerator(curandGenerator_t *generator,
-                                               curandRngType_t rng_type) {
-  using FuncPtr =
-      curandStatus_t(CURANDAPI *)(curandGenerator_t *, curandRngType_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("curandCreateGenerator");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(generator, rng_type);
-}
-
-curandStatus_t CURANDAPI curandCreateGeneratorHost(curandGenerator_t *generator,
-                                                   curandRngType_t rng_type) {
-  using FuncPtr =
-      curandStatus_t(CURANDAPI *)(curandGenerator_t *, curandRngType_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("curandCreateGeneratorHost");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(generator, rng_type);
-}
-
-curandStatus_t CURANDAPI curandDestroyGenerator(curandGenerator_t generator) {
-  using FuncPtr = curandStatus_t(CURANDAPI *)(curandGenerator_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("curandDestroyGenerator");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(generator);
-}
-
-curandStatus_t CURANDAPI curandGetVersion(int *version) {
-  using FuncPtr = curandStatus_t(CURANDAPI *)(int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("curandGetVersion");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(version);
-}
-
-curandStatus_t CURANDAPI curandGetProperty(libraryPropertyType type,
-                                           int *value) {
-  using FuncPtr = curandStatus_t(CURANDAPI *)(libraryPropertyType, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("curandGetProperty");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(type, value);
-}
-
-curandStatus_t CURANDAPI curandSetStream(curandGenerator_t generator,
-                                         cudaStream_t stream) {
-  using FuncPtr = curandStatus_t(CURANDAPI *)(curandGenerator_t, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("curandSetStream");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(generator, stream);
-}
-
-curandStatus_t CURANDAPI curandSetPseudoRandomGeneratorSeed(
-    curandGenerator_t generator, unsigned long long seed) {
-  using FuncPtr =
-      curandStatus_t(CURANDAPI *)(curandGenerator_t, unsigned long long);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("curandSetPseudoRandomGeneratorSeed");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(generator, seed);
-}
-
-curandStatus_t CURANDAPI curandSetGeneratorOffset(curandGenerator_t generator,
-                                                  unsigned long long offset) {
-  using FuncPtr =
-      curandStatus_t(CURANDAPI *)(curandGenerator_t, unsigned long long);
-  static auto func_ptr = LoadSymbol<FuncPtr>("curandSetGeneratorOffset");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(generator, offset);
-}
-
-curandStatus_t CURANDAPI curandSetGeneratorOrdering(curandGenerator_t generator,
-                                                    curandOrdering_t order) {
-  using FuncPtr =
-      curandStatus_t(CURANDAPI *)(curandGenerator_t, curandOrdering_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("curandSetGeneratorOrdering");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(generator, order);
-}
-
-curandStatus_t CURANDAPI curandSetQuasiRandomGeneratorDimensions(
-    curandGenerator_t generator, unsigned int num_dimensions) {
-  using FuncPtr = curandStatus_t(CURANDAPI *)(curandGenerator_t, unsigned int);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("curandSetQuasiRandomGeneratorDimensions");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(generator, num_dimensions);
-}
-
-curandStatus_t CURANDAPI curandGenerate(curandGenerator_t generator,
-                                        unsigned int *outputPtr, size_t num) {
-  using FuncPtr =
-      curandStatus_t(CURANDAPI *)(curandGenerator_t, unsigned int *, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("curandGenerate");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(generator, outputPtr, num);
-}
-
-curandStatus_t CURANDAPI curandGenerateLongLong(curandGenerator_t generator,
-                                                unsigned long long *outputPtr,
-                                                size_t num) {
-  using FuncPtr = curandStatus_t(CURANDAPI *)(curandGenerator_t,
-                                              unsigned long long *, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("curandGenerateLongLong");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(generator, outputPtr, num);
-}
-
-curandStatus_t CURANDAPI curandGenerateUniform(curandGenerator_t generator,
-                                               float *outputPtr, size_t num) {
-  using FuncPtr =
-      curandStatus_t(CURANDAPI *)(curandGenerator_t, float *, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("curandGenerateUniform");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(generator, outputPtr, num);
-}
-
-curandStatus_t CURANDAPI curandGenerateUniformDouble(
-    curandGenerator_t generator, double *outputPtr, size_t num) {
-  using FuncPtr =
-      curandStatus_t(CURANDAPI *)(curandGenerator_t, double *, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("curandGenerateUniformDouble");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(generator, outputPtr, num);
-}
-
-curandStatus_t CURANDAPI curandGenerateNormal(curandGenerator_t generator,
-                                              float *outputPtr, size_t n,
-                                              float mean, float stddev) {
-  using FuncPtr = curandStatus_t(CURANDAPI *)(curandGenerator_t, float *,
-                                              size_t, float, float);
-  static auto func_ptr = LoadSymbol<FuncPtr>("curandGenerateNormal");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(generator, outputPtr, n, mean, stddev);
-}
-
-curandStatus_t CURANDAPI curandGenerateNormalDouble(curandGenerator_t generator,
-                                                    double *outputPtr, size_t n,
-                                                    double mean,
-                                                    double stddev) {
-  using FuncPtr = curandStatus_t(CURANDAPI *)(curandGenerator_t, double *,
-                                              size_t, double, double);
-  static auto func_ptr = LoadSymbol<FuncPtr>("curandGenerateNormalDouble");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(generator, outputPtr, n, mean, stddev);
-}
-
-curandStatus_t CURANDAPI curandGenerateLogNormal(curandGenerator_t generator,
-                                                 float *outputPtr, size_t n,
-                                                 float mean, float stddev) {
-  using FuncPtr = curandStatus_t(CURANDAPI *)(curandGenerator_t, float *,
-                                              size_t, float, float);
-  static auto func_ptr = LoadSymbol<FuncPtr>("curandGenerateLogNormal");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(generator, outputPtr, n, mean, stddev);
-}
-
-curandStatus_t CURANDAPI
-curandGenerateLogNormalDouble(curandGenerator_t generator, double *outputPtr,
-                              size_t n, double mean, double stddev) {
-  using FuncPtr = curandStatus_t(CURANDAPI *)(curandGenerator_t, double *,
-                                              size_t, double, double);
-  static auto func_ptr = LoadSymbol<FuncPtr>("curandGenerateLogNormalDouble");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(generator, outputPtr, n, mean, stddev);
-}
-
-curandStatus_t CURANDAPI curandCreatePoissonDistribution(
-    double lambda, curandDiscreteDistribution_t *discrete_distribution) {
-  using FuncPtr =
-      curandStatus_t(CURANDAPI *)(double, curandDiscreteDistribution_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("curandCreatePoissonDistribution");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(lambda, discrete_distribution);
-}
-
-curandStatus_t CURANDAPI
-curandDestroyDistribution(curandDiscreteDistribution_t discrete_distribution) {
-  using FuncPtr = curandStatus_t(CURANDAPI *)(curandDiscreteDistribution_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("curandDestroyDistribution");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(discrete_distribution);
-}
-
-curandStatus_t CURANDAPI curandGeneratePoisson(curandGenerator_t generator,
-                                               unsigned int *outputPtr,
-                                               size_t n, double lambda) {
-  using FuncPtr = curandStatus_t(CURANDAPI *)(curandGenerator_t, unsigned int *,
-                                              size_t, double);
-  static auto func_ptr = LoadSymbol<FuncPtr>("curandGeneratePoisson");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(generator, outputPtr, n, lambda);
-}
-
-curandStatus_t CURANDAPI curandGeneratePoissonMethod(
-    curandGenerator_t generator, unsigned int *outputPtr, size_t n,
-    double lambda, curandMethod_t method) {
-  using FuncPtr = curandStatus_t(CURANDAPI *)(curandGenerator_t, unsigned int *,
-                                              size_t, double, curandMethod_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("curandGeneratePoissonMethod");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(generator, outputPtr, n, lambda, method);
-}
-
-curandStatus_t CURANDAPI curandGenerateBinomial(curandGenerator_t generator,
-                                                unsigned int *outputPtr,
-                                                size_t num, unsigned int n,
-                                                double p) {
-  using FuncPtr = curandStatus_t(CURANDAPI *)(curandGenerator_t, unsigned int *,
-                                              size_t, unsigned int, double);
-  static auto func_ptr = LoadSymbol<FuncPtr>("curandGenerateBinomial");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(generator, outputPtr, num, n, p);
-}
-
-curandStatus_t CURANDAPI curandGenerateBinomialMethod(
-    curandGenerator_t generator, unsigned int *outputPtr, size_t num,
-    unsigned int n, double p, curandMethod_t method) {
-  using FuncPtr =
-      curandStatus_t(CURANDAPI *)(curandGenerator_t, unsigned int *, size_t,
-                                  unsigned int, double, curandMethod_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("curandGenerateBinomialMethod");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(generator, outputPtr, num, n, p, method);
-}
-
-curandStatus_t CURANDAPI curandGenerateSeeds(curandGenerator_t generator) {
-  using FuncPtr = curandStatus_t(CURANDAPI *)(curandGenerator_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("curandGenerateSeeds");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(generator);
-}
-
-curandStatus_t CURANDAPI curandGetDirectionVectors32(
-    curandDirectionVectors32_t *vectors[], curandDirectionVectorSet_t set) {
-  using FuncPtr = curandStatus_t(CURANDAPI *)(curandDirectionVectors32_t *[],
-                                              curandDirectionVectorSet_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("curandGetDirectionVectors32");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(vectors, set);
-}
-
-curandStatus_t CURANDAPI
-curandGetScrambleConstants32(unsigned int **constants) {
-  using FuncPtr = curandStatus_t(CURANDAPI *)(unsigned int **);
-  static auto func_ptr = LoadSymbol<FuncPtr>("curandGetScrambleConstants32");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(constants);
-}
-
-curandStatus_t CURANDAPI curandGetDirectionVectors64(
-    curandDirectionVectors64_t *vectors[], curandDirectionVectorSet_t set) {
-  using FuncPtr = curandStatus_t(CURANDAPI *)(curandDirectionVectors64_t *[],
-                                              curandDirectionVectorSet_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("curandGetDirectionVectors64");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(vectors, set);
-}
-
-curandStatus_t CURANDAPI
-curandGetScrambleConstants64(unsigned long long **constants) {
-  using FuncPtr = curandStatus_t(CURANDAPI *)(unsigned long long **);
-  static auto func_ptr = LoadSymbol<FuncPtr>("curandGetScrambleConstants64");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(constants);
-}
-
-}  // extern "C"
diff --git a/third_party/xla/third_party/tsl/tsl/cuda/curand_11_0.inc b/third_party/xla/third_party/tsl/tsl/cuda/curand_11_0.inc
deleted file mode 100644
index 3c4c8fed67ac11..00000000000000
--- a/third_party/xla/third_party/tsl/tsl/cuda/curand_11_0.inc
+++ /dev/null
@@ -1,268 +0,0 @@
-// Auto-generated, do not edit.
-
-extern "C" {
-
-curandStatus_t CURANDAPI curandCreateGenerator(curandGenerator_t *generator,
-                                               curandRngType_t rng_type) {
-  using FuncPtr =
-      curandStatus_t(CURANDAPI *)(curandGenerator_t *, curandRngType_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("curandCreateGenerator");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(generator, rng_type);
-}
-
-curandStatus_t CURANDAPI curandCreateGeneratorHost(curandGenerator_t *generator,
-                                                   curandRngType_t rng_type) {
-  using FuncPtr =
-      curandStatus_t(CURANDAPI *)(curandGenerator_t *, curandRngType_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("curandCreateGeneratorHost");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(generator, rng_type);
-}
-
-curandStatus_t CURANDAPI curandDestroyGenerator(curandGenerator_t generator) {
-  using FuncPtr = curandStatus_t(CURANDAPI *)(curandGenerator_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("curandDestroyGenerator");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(generator);
-}
-
-curandStatus_t CURANDAPI curandGetVersion(int *version) {
-  using FuncPtr = curandStatus_t(CURANDAPI *)(int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("curandGetVersion");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(version);
-}
-
-curandStatus_t CURANDAPI curandGetProperty(libraryPropertyType type,
-                                           int *value) {
-  using FuncPtr = curandStatus_t(CURANDAPI *)(libraryPropertyType, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("curandGetProperty");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(type, value);
-}
-
-curandStatus_t CURANDAPI curandSetStream(curandGenerator_t generator,
-                                         cudaStream_t stream) {
-  using FuncPtr = curandStatus_t(CURANDAPI *)(curandGenerator_t, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("curandSetStream");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(generator, stream);
-}
-
-curandStatus_t CURANDAPI curandSetPseudoRandomGeneratorSeed(
-    curandGenerator_t generator, unsigned long long seed) {
-  using FuncPtr =
-      curandStatus_t(CURANDAPI *)(curandGenerator_t, unsigned long long);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("curandSetPseudoRandomGeneratorSeed");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(generator, seed);
-}
-
-curandStatus_t CURANDAPI curandSetGeneratorOffset(curandGenerator_t generator,
-                                                  unsigned long long offset) {
-  using FuncPtr =
-      curandStatus_t(CURANDAPI *)(curandGenerator_t, unsigned long long);
-  static auto func_ptr = LoadSymbol<FuncPtr>("curandSetGeneratorOffset");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(generator, offset);
-}
-
-curandStatus_t CURANDAPI curandSetGeneratorOrdering(curandGenerator_t generator,
-                                                    curandOrdering_t order) {
-  using FuncPtr =
-      curandStatus_t(CURANDAPI *)(curandGenerator_t, curandOrdering_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("curandSetGeneratorOrdering");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(generator, order);
-}
-
-curandStatus_t CURANDAPI curandSetQuasiRandomGeneratorDimensions(
-    curandGenerator_t generator, unsigned int num_dimensions) {
-  using FuncPtr = curandStatus_t(CURANDAPI *)(curandGenerator_t, unsigned int);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("curandSetQuasiRandomGeneratorDimensions");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(generator, num_dimensions);
-}
-
-curandStatus_t CURANDAPI curandGenerate(curandGenerator_t generator,
-                                        unsigned int *outputPtr, size_t num) {
-  using FuncPtr =
-      curandStatus_t(CURANDAPI *)(curandGenerator_t, unsigned int *, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("curandGenerate");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(generator, outputPtr, num);
-}
-
-curandStatus_t CURANDAPI curandGenerateLongLong(curandGenerator_t generator,
-                                                unsigned long long *outputPtr,
-                                                size_t num) {
-  using FuncPtr = curandStatus_t(CURANDAPI *)(curandGenerator_t,
-                                              unsigned long long *, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("curandGenerateLongLong");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(generator, outputPtr, num);
-}
-
-curandStatus_t CURANDAPI curandGenerateUniform(curandGenerator_t generator,
-                                               float *outputPtr, size_t num) {
-  using FuncPtr =
-      curandStatus_t(CURANDAPI *)(curandGenerator_t, float *, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("curandGenerateUniform");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(generator, outputPtr, num);
-}
-
-curandStatus_t CURANDAPI curandGenerateUniformDouble(
-    curandGenerator_t generator, double *outputPtr, size_t num) {
-  using FuncPtr =
-      curandStatus_t(CURANDAPI *)(curandGenerator_t, double *, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("curandGenerateUniformDouble");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(generator, outputPtr, num);
-}
-
-curandStatus_t CURANDAPI curandGenerateNormal(curandGenerator_t generator,
-                                              float *outputPtr, size_t n,
-                                              float mean, float stddev) {
-  using FuncPtr = curandStatus_t(CURANDAPI *)(curandGenerator_t, float *,
-                                              size_t, float, float);
-  static auto func_ptr = LoadSymbol<FuncPtr>("curandGenerateNormal");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(generator, outputPtr, n, mean, stddev);
-}
-
-curandStatus_t CURANDAPI curandGenerateNormalDouble(curandGenerator_t generator,
-                                                    double *outputPtr, size_t n,
-                                                    double mean,
-                                                    double stddev) {
-  using FuncPtr = curandStatus_t(CURANDAPI *)(curandGenerator_t, double *,
-                                              size_t, double, double);
-  static auto func_ptr = LoadSymbol<FuncPtr>("curandGenerateNormalDouble");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(generator, outputPtr, n, mean, stddev);
-}
-
-curandStatus_t CURANDAPI curandGenerateLogNormal(curandGenerator_t generator,
-                                                 float *outputPtr, size_t n,
-                                                 float mean, float stddev) {
-  using FuncPtr = curandStatus_t(CURANDAPI *)(curandGenerator_t, float *,
-                                              size_t, float, float);
-  static auto func_ptr = LoadSymbol<FuncPtr>("curandGenerateLogNormal");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(generator, outputPtr, n, mean, stddev);
-}
-
-curandStatus_t CURANDAPI
-curandGenerateLogNormalDouble(curandGenerator_t generator, double *outputPtr,
-                              size_t n, double mean, double stddev) {
-  using FuncPtr = curandStatus_t(CURANDAPI *)(curandGenerator_t, double *,
-                                              size_t, double, double);
-  static auto func_ptr = LoadSymbol<FuncPtr>("curandGenerateLogNormalDouble");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(generator, outputPtr, n, mean, stddev);
-}
-
-curandStatus_t CURANDAPI curandCreatePoissonDistribution(
-    double lambda, curandDiscreteDistribution_t *discrete_distribution) {
-  using FuncPtr =
-      curandStatus_t(CURANDAPI *)(double, curandDiscreteDistribution_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("curandCreatePoissonDistribution");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(lambda, discrete_distribution);
-}
-
-curandStatus_t CURANDAPI
-curandDestroyDistribution(curandDiscreteDistribution_t discrete_distribution) {
-  using FuncPtr = curandStatus_t(CURANDAPI *)(curandDiscreteDistribution_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("curandDestroyDistribution");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(discrete_distribution);
-}
-
-curandStatus_t CURANDAPI curandGeneratePoisson(curandGenerator_t generator,
-                                               unsigned int *outputPtr,
-                                               size_t n, double lambda) {
-  using FuncPtr = curandStatus_t(CURANDAPI *)(curandGenerator_t, unsigned int *,
-                                              size_t, double);
-  static auto func_ptr = LoadSymbol<FuncPtr>("curandGeneratePoisson");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(generator, outputPtr, n, lambda);
-}
-
-curandStatus_t CURANDAPI curandGeneratePoissonMethod(
-    curandGenerator_t generator, unsigned int *outputPtr, size_t n,
-    double lambda, curandMethod_t method) {
-  using FuncPtr = curandStatus_t(CURANDAPI *)(curandGenerator_t, unsigned int *,
-                                              size_t, double, curandMethod_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("curandGeneratePoissonMethod");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(generator, outputPtr, n, lambda, method);
-}
-
-curandStatus_t CURANDAPI curandGenerateBinomial(curandGenerator_t generator,
-                                                unsigned int *outputPtr,
-                                                size_t num, unsigned int n,
-                                                double p) {
-  using FuncPtr = curandStatus_t(CURANDAPI *)(curandGenerator_t, unsigned int *,
-                                              size_t, unsigned int, double);
-  static auto func_ptr = LoadSymbol<FuncPtr>("curandGenerateBinomial");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(generator, outputPtr, num, n, p);
-}
-
-curandStatus_t CURANDAPI curandGenerateBinomialMethod(
-    curandGenerator_t generator, unsigned int *outputPtr, size_t num,
-    unsigned int n, double p, curandMethod_t method) {
-  using FuncPtr =
-      curandStatus_t(CURANDAPI *)(curandGenerator_t, unsigned int *, size_t,
-                                  unsigned int, double, curandMethod_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("curandGenerateBinomialMethod");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(generator, outputPtr, num, n, p, method);
-}
-
-curandStatus_t CURANDAPI curandGenerateSeeds(curandGenerator_t generator) {
-  using FuncPtr = curandStatus_t(CURANDAPI *)(curandGenerator_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("curandGenerateSeeds");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(generator);
-}
-
-curandStatus_t CURANDAPI curandGetDirectionVectors32(
-    curandDirectionVectors32_t *vectors[], curandDirectionVectorSet_t set) {
-  using FuncPtr = curandStatus_t(CURANDAPI *)(curandDirectionVectors32_t *[],
-                                              curandDirectionVectorSet_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("curandGetDirectionVectors32");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(vectors, set);
-}
-
-curandStatus_t CURANDAPI
-curandGetScrambleConstants32(unsigned int **constants) {
-  using FuncPtr = curandStatus_t(CURANDAPI *)(unsigned int **);
-  static auto func_ptr = LoadSymbol<FuncPtr>("curandGetScrambleConstants32");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(constants);
-}
-
-curandStatus_t CURANDAPI curandGetDirectionVectors64(
-    curandDirectionVectors64_t *vectors[], curandDirectionVectorSet_t set) {
-  using FuncPtr = curandStatus_t(CURANDAPI *)(curandDirectionVectors64_t *[],
-                                              curandDirectionVectorSet_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("curandGetDirectionVectors64");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(vectors, set);
-}
-
-curandStatus_t CURANDAPI
-curandGetScrambleConstants64(unsigned long long **constants) {
-  using FuncPtr = curandStatus_t(CURANDAPI *)(unsigned long long **);
-  static auto func_ptr = LoadSymbol<FuncPtr>("curandGetScrambleConstants64");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(constants);
-}
-
-}  // extern "C"
diff --git a/third_party/xla/third_party/tsl/tsl/cuda/curand_9_0.inc b/third_party/xla/third_party/tsl/tsl/cuda/curand_9_0.inc
deleted file mode 100644
index 3c4c8fed67ac11..00000000000000
--- a/third_party/xla/third_party/tsl/tsl/cuda/curand_9_0.inc
+++ /dev/null
@@ -1,268 +0,0 @@
-// Auto-generated, do not edit.
-
-extern "C" {
-
-curandStatus_t CURANDAPI curandCreateGenerator(curandGenerator_t *generator,
-                                               curandRngType_t rng_type) {
-  using FuncPtr =
-      curandStatus_t(CURANDAPI *)(curandGenerator_t *, curandRngType_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("curandCreateGenerator");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(generator, rng_type);
-}
-
-curandStatus_t CURANDAPI curandCreateGeneratorHost(curandGenerator_t *generator,
-                                                   curandRngType_t rng_type) {
-  using FuncPtr =
-      curandStatus_t(CURANDAPI *)(curandGenerator_t *, curandRngType_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("curandCreateGeneratorHost");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(generator, rng_type);
-}
-
-curandStatus_t CURANDAPI curandDestroyGenerator(curandGenerator_t generator) {
-  using FuncPtr = curandStatus_t(CURANDAPI *)(curandGenerator_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("curandDestroyGenerator");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(generator);
-}
-
-curandStatus_t CURANDAPI curandGetVersion(int *version) {
-  using FuncPtr = curandStatus_t(CURANDAPI *)(int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("curandGetVersion");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(version);
-}
-
-curandStatus_t CURANDAPI curandGetProperty(libraryPropertyType type,
-                                           int *value) {
-  using FuncPtr = curandStatus_t(CURANDAPI *)(libraryPropertyType, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("curandGetProperty");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(type, value);
-}
-
-curandStatus_t CURANDAPI curandSetStream(curandGenerator_t generator,
-                                         cudaStream_t stream) {
-  using FuncPtr = curandStatus_t(CURANDAPI *)(curandGenerator_t, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("curandSetStream");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(generator, stream);
-}
-
-curandStatus_t CURANDAPI curandSetPseudoRandomGeneratorSeed(
-    curandGenerator_t generator, unsigned long long seed) {
-  using FuncPtr =
-      curandStatus_t(CURANDAPI *)(curandGenerator_t, unsigned long long);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("curandSetPseudoRandomGeneratorSeed");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(generator, seed);
-}
-
-curandStatus_t CURANDAPI curandSetGeneratorOffset(curandGenerator_t generator,
-                                                  unsigned long long offset) {
-  using FuncPtr =
-      curandStatus_t(CURANDAPI *)(curandGenerator_t, unsigned long long);
-  static auto func_ptr = LoadSymbol<FuncPtr>("curandSetGeneratorOffset");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(generator, offset);
-}
-
-curandStatus_t CURANDAPI curandSetGeneratorOrdering(curandGenerator_t generator,
-                                                    curandOrdering_t order) {
-  using FuncPtr =
-      curandStatus_t(CURANDAPI *)(curandGenerator_t, curandOrdering_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("curandSetGeneratorOrdering");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(generator, order);
-}
-
-curandStatus_t CURANDAPI curandSetQuasiRandomGeneratorDimensions(
-    curandGenerator_t generator, unsigned int num_dimensions) {
-  using FuncPtr = curandStatus_t(CURANDAPI *)(curandGenerator_t, unsigned int);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("curandSetQuasiRandomGeneratorDimensions");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(generator, num_dimensions);
-}
-
-curandStatus_t CURANDAPI curandGenerate(curandGenerator_t generator,
-                                        unsigned int *outputPtr, size_t num) {
-  using FuncPtr =
-      curandStatus_t(CURANDAPI *)(curandGenerator_t, unsigned int *, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("curandGenerate");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(generator, outputPtr, num);
-}
-
-curandStatus_t CURANDAPI curandGenerateLongLong(curandGenerator_t generator,
-                                                unsigned long long *outputPtr,
-                                                size_t num) {
-  using FuncPtr = curandStatus_t(CURANDAPI *)(curandGenerator_t,
-                                              unsigned long long *, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("curandGenerateLongLong");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(generator, outputPtr, num);
-}
-
-curandStatus_t CURANDAPI curandGenerateUniform(curandGenerator_t generator,
-                                               float *outputPtr, size_t num) {
-  using FuncPtr =
-      curandStatus_t(CURANDAPI *)(curandGenerator_t, float *, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("curandGenerateUniform");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(generator, outputPtr, num);
-}
-
-curandStatus_t CURANDAPI curandGenerateUniformDouble(
-    curandGenerator_t generator, double *outputPtr, size_t num) {
-  using FuncPtr =
-      curandStatus_t(CURANDAPI *)(curandGenerator_t, double *, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("curandGenerateUniformDouble");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(generator, outputPtr, num);
-}
-
-curandStatus_t CURANDAPI curandGenerateNormal(curandGenerator_t generator,
-                                              float *outputPtr, size_t n,
-                                              float mean, float stddev) {
-  using FuncPtr = curandStatus_t(CURANDAPI *)(curandGenerator_t, float *,
-                                              size_t, float, float);
-  static auto func_ptr = LoadSymbol<FuncPtr>("curandGenerateNormal");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(generator, outputPtr, n, mean, stddev);
-}
-
-curandStatus_t CURANDAPI curandGenerateNormalDouble(curandGenerator_t generator,
-                                                    double *outputPtr, size_t n,
-                                                    double mean,
-                                                    double stddev) {
-  using FuncPtr = curandStatus_t(CURANDAPI *)(curandGenerator_t, double *,
-                                              size_t, double, double);
-  static auto func_ptr = LoadSymbol<FuncPtr>("curandGenerateNormalDouble");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(generator, outputPtr, n, mean, stddev);
-}
-
-curandStatus_t CURANDAPI curandGenerateLogNormal(curandGenerator_t generator,
-                                                 float *outputPtr, size_t n,
-                                                 float mean, float stddev) {
-  using FuncPtr = curandStatus_t(CURANDAPI *)(curandGenerator_t, float *,
-                                              size_t, float, float);
-  static auto func_ptr = LoadSymbol<FuncPtr>("curandGenerateLogNormal");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(generator, outputPtr, n, mean, stddev);
-}
-
-curandStatus_t CURANDAPI
-curandGenerateLogNormalDouble(curandGenerator_t generator, double *outputPtr,
-                              size_t n, double mean, double stddev) {
-  using FuncPtr = curandStatus_t(CURANDAPI *)(curandGenerator_t, double *,
-                                              size_t, double, double);
-  static auto func_ptr = LoadSymbol<FuncPtr>("curandGenerateLogNormalDouble");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(generator, outputPtr, n, mean, stddev);
-}
-
-curandStatus_t CURANDAPI curandCreatePoissonDistribution(
-    double lambda, curandDiscreteDistribution_t *discrete_distribution) {
-  using FuncPtr =
-      curandStatus_t(CURANDAPI *)(double, curandDiscreteDistribution_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("curandCreatePoissonDistribution");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(lambda, discrete_distribution);
-}
-
-curandStatus_t CURANDAPI
-curandDestroyDistribution(curandDiscreteDistribution_t discrete_distribution) {
-  using FuncPtr = curandStatus_t(CURANDAPI *)(curandDiscreteDistribution_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("curandDestroyDistribution");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(discrete_distribution);
-}
-
-curandStatus_t CURANDAPI curandGeneratePoisson(curandGenerator_t generator,
-                                               unsigned int *outputPtr,
-                                               size_t n, double lambda) {
-  using FuncPtr = curandStatus_t(CURANDAPI *)(curandGenerator_t, unsigned int *,
-                                              size_t, double);
-  static auto func_ptr = LoadSymbol<FuncPtr>("curandGeneratePoisson");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(generator, outputPtr, n, lambda);
-}
-
-curandStatus_t CURANDAPI curandGeneratePoissonMethod(
-    curandGenerator_t generator, unsigned int *outputPtr, size_t n,
-    double lambda, curandMethod_t method) {
-  using FuncPtr = curandStatus_t(CURANDAPI *)(curandGenerator_t, unsigned int *,
-                                              size_t, double, curandMethod_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("curandGeneratePoissonMethod");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(generator, outputPtr, n, lambda, method);
-}
-
-curandStatus_t CURANDAPI curandGenerateBinomial(curandGenerator_t generator,
-                                                unsigned int *outputPtr,
-                                                size_t num, unsigned int n,
-                                                double p) {
-  using FuncPtr = curandStatus_t(CURANDAPI *)(curandGenerator_t, unsigned int *,
-                                              size_t, unsigned int, double);
-  static auto func_ptr = LoadSymbol<FuncPtr>("curandGenerateBinomial");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(generator, outputPtr, num, n, p);
-}
-
-curandStatus_t CURANDAPI curandGenerateBinomialMethod(
-    curandGenerator_t generator, unsigned int *outputPtr, size_t num,
-    unsigned int n, double p, curandMethod_t method) {
-  using FuncPtr =
-      curandStatus_t(CURANDAPI *)(curandGenerator_t, unsigned int *, size_t,
-                                  unsigned int, double, curandMethod_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("curandGenerateBinomialMethod");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(generator, outputPtr, num, n, p, method);
-}
-
-curandStatus_t CURANDAPI curandGenerateSeeds(curandGenerator_t generator) {
-  using FuncPtr = curandStatus_t(CURANDAPI *)(curandGenerator_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("curandGenerateSeeds");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(generator);
-}
-
-curandStatus_t CURANDAPI curandGetDirectionVectors32(
-    curandDirectionVectors32_t *vectors[], curandDirectionVectorSet_t set) {
-  using FuncPtr = curandStatus_t(CURANDAPI *)(curandDirectionVectors32_t *[],
-                                              curandDirectionVectorSet_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("curandGetDirectionVectors32");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(vectors, set);
-}
-
-curandStatus_t CURANDAPI
-curandGetScrambleConstants32(unsigned int **constants) {
-  using FuncPtr = curandStatus_t(CURANDAPI *)(unsigned int **);
-  static auto func_ptr = LoadSymbol<FuncPtr>("curandGetScrambleConstants32");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(constants);
-}
-
-curandStatus_t CURANDAPI curandGetDirectionVectors64(
-    curandDirectionVectors64_t *vectors[], curandDirectionVectorSet_t set) {
-  using FuncPtr = curandStatus_t(CURANDAPI *)(curandDirectionVectors64_t *[],
-                                              curandDirectionVectorSet_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("curandGetDirectionVectors64");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(vectors, set);
-}
-
-curandStatus_t CURANDAPI
-curandGetScrambleConstants64(unsigned long long **constants) {
-  using FuncPtr = curandStatus_t(CURANDAPI *)(unsigned long long **);
-  static auto func_ptr = LoadSymbol<FuncPtr>("curandGetScrambleConstants64");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(constants);
-}
-
-}  // extern "C"
diff --git a/third_party/xla/third_party/tsl/tsl/cuda/cusolver.symbols b/third_party/xla/third_party/tsl/tsl/cuda/cusolver.symbols
new file mode 100644
index 00000000000000..f10b69198d56bb
--- /dev/null
+++ b/third_party/xla/third_party/tsl/tsl/cuda/cusolver.symbols
@@ -0,0 +1,926 @@
+cublasCbatchAccGemm
+cublasCbatchAccGemm_BufferSize
+cublasCbatchGemm
+cublasCbatchTrsm
+cublasDbatchAccGemm
+cublasDbatchAccGemm_BufferSize
+cublasDbatchGemm
+cublasDbatchTrsm
+cublasSbatchAccGemm
+cublasSbatchAccGemm_BufferSize
+cublasSbatchGemm
+cublasSbatchTrsm
+cublasZbatchAccGemm
+cublasZbatchAccGemm_BufferSize
+cublasZbatchGemm
+cublasZbatchTrsm
+cusolverDnCCgels
+cusolverDnCCgels_bufferSize
+cusolverDnCCgesv
+cusolverDnCCgesv_bufferSize
+cusolverDnCEgels
+cusolverDnCEgels_bufferSize
+cusolverDnCEgesv
+cusolverDnCEgesv_bufferSize
+cusolverDnCKgels
+cusolverDnCKgels_bufferSize
+cusolverDnCKgesv
+cusolverDnCKgesv_bufferSize
+cusolverDnCYgels
+cusolverDnCYgels_bufferSize
+cusolverDnCYgesv
+cusolverDnCYgesv_bufferSize
+cusolverDnCaxpyHost
+cusolverDnCeyeBatch
+cusolverDnCgebrd
+cusolverDnCgebrd_bufferSize
+cusolverDnCgehd2
+cusolverDnCgehd2_bufferSize
+cusolverDnCgehrd
+cusolverDnCgehrd_bufferSize
+cusolverDnCgelqfHost
+cusolverDnCgemmHost
+cusolverDnCgeqlfHost
+cusolverDnCgeqrf
+cusolverDnCgeqrfHost
+cusolverDnCgeqrf_ApI
+cusolverDnCgeqrf_ApI_bufferSize
+cusolverDnCgeqrf_bufferSize
+cusolverDnCgercond
+cusolverDnCgercond_bufferSize
+cusolverDnCgesvd
+cusolverDnCgesvd_bufferSize
+cusolverDnCgesvdaStridedBatched
+cusolverDnCgesvdaStridedBatched_bufferSize
+cusolverDnCgesvdj
+cusolverDnCgesvdjBatched
+cusolverDnCgesvdjBatched_bufferSize
+cusolverDnCgesvdj_bufferSize
+cusolverDnCgetrf
+cusolverDnCgetrf_bufferSize
+cusolverDnCgetrs
+cusolverDnCheevd
+cusolverDnCheevd_bufferSize
+cusolverDnCheevdx
+cusolverDnCheevdx_bufferSize
+cusolverDnCheevj
+cusolverDnCheevjBatched
+cusolverDnCheevjBatched_bufferSize
+cusolverDnCheevj_bufferSize
+cusolverDnChegst
+cusolverDnChegst2
+cusolverDnChegst2_bufferSize
+cusolverDnChegst_bufferSize
+cusolverDnChegvd
+cusolverDnChegvd_bufferSize
+cusolverDnChegvdx
+cusolverDnChegvdx_bufferSize
+cusolverDnChegvj
+cusolverDnChegvj_bufferSize
+cusolverDnChemv
+cusolverDnChemv_bufferSize
+cusolverDnChetrd
+cusolverDnChetrd_bufferSize
+cusolverDnClacpyHost
+cusolverDnClahr2
+cusolverDnClange
+cusolverDnClangeHost
+cusolverDnClange_bufferSize
+cusolverDnClanhe
+cusolverDnClanheHost
+cusolverDnClanhe_bufferSize
+cusolverDnClantrHost
+cusolverDnClarfb
+cusolverDnClarfgHost
+cusolverDnClarft
+cusolverDnClarft_bufferSize
+cusolverDnClarnvHost
+cusolverDnClascl
+cusolverDnClasetHost
+cusolverDnClaswp
+cusolverDnClauum
+cusolverDnClauumHost
+cusolverDnClauum_bufferSize
+cusolverDnCnrm2Host
+cusolverDnCpolar
+cusolverDnCpolar_bufferSize
+cusolverDnCpotrf
+cusolverDnCpotrfBatched
+cusolverDnCpotrfHost
+cusolverDnCpotrf_bufferSize
+cusolverDnCpotri
+cusolverDnCpotriHost
+cusolverDnCpotri_bufferSize
+cusolverDnCpotrs
+cusolverDnCpotrsBatched
+cusolverDnCpotrsHost
+cusolverDnCreate
+cusolverDnCreateGesvdjInfo
+cusolverDnCreateParams
+cusolverDnCreateSyevjInfo
+cusolverDnCsscalHost
+cusolverDnCsytrf
+cusolverDnCsytrf_bufferSize
+cusolverDnCsytri
+cusolverDnCsytriHost
+cusolverDnCsytri_bufferSize
+cusolverDnCsytrsHost
+cusolverDnCtrtriHost
+cusolverDnCungbr
+cusolverDnCungbrHost
+cusolverDnCungbr_bufferSize
+cusolverDnCunghrHost
+cusolverDnCunglqHost
+cusolverDnCungqr
+cusolverDnCungqr_ApI
+cusolverDnCungqr_ApI_bufferSize
+cusolverDnCungqr_bufferSize
+cusolverDnCungtr
+cusolverDnCungtrHost
+cusolverDnCungtr_bufferSize
+cusolverDnCunmqr
+cusolverDnCunmqrHost
+cusolverDnCunmqr_bufferSize
+cusolverDnCunmtr
+cusolverDnCunmtrHost
+cusolverDnCunmtr_bufferSize
+cusolverDnDBgels
+cusolverDnDBgels_bufferSize
+cusolverDnDBgesv
+cusolverDnDBgesv_bufferSize
+cusolverDnDDgels
+cusolverDnDDgels_bufferSize
+cusolverDnDDgesv
+cusolverDnDDgesv_bufferSize
+cusolverDnDHgels
+cusolverDnDHgels_bufferSize
+cusolverDnDHgesv
+cusolverDnDHgesv_bufferSize
+cusolverDnDSgels
+cusolverDnDSgels_bufferSize
+cusolverDnDSgesv
+cusolverDnDSgesv_bufferSize
+cusolverDnDXgels
+cusolverDnDXgels_bufferSize
+cusolverDnDXgesv
+cusolverDnDXgesv_bufferSize
+cusolverDnDaxpyHost
+cusolverDnDestroy
+cusolverDnDestroyGesvdjInfo
+cusolverDnDestroyParams
+cusolverDnDestroySyevjInfo
+cusolverDnDeyeBatch
+cusolverDnDgebrd
+cusolverDnDgebrd_bufferSize
+cusolverDnDgehd2
+cusolverDnDgehd2_bufferSize
+cusolverDnDgehrd
+cusolverDnDgehrd_bufferSize
+cusolverDnDgelqfHost
+cusolverDnDgemmHost
+cusolverDnDgeqlfHost
+cusolverDnDgeqrf
+cusolverDnDgeqrfHost
+cusolverDnDgeqrf_ApI
+cusolverDnDgeqrf_ApI_bufferSize
+cusolverDnDgeqrf_bufferSize
+cusolverDnDgercond
+cusolverDnDgercond_bufferSize
+cusolverDnDgesvd
+cusolverDnDgesvd_bufferSize
+cusolverDnDgesvdaStridedBatched
+cusolverDnDgesvdaStridedBatched_bufferSize
+cusolverDnDgesvdj
+cusolverDnDgesvdjBatched
+cusolverDnDgesvdjBatched_bufferSize
+cusolverDnDgesvdj_bufferSize
+cusolverDnDgetrf
+cusolverDnDgetrf_bufferSize
+cusolverDnDgetrs
+cusolverDnDlacpyHost
+cusolverDnDlahr2
+cusolverDnDlamchHost
+cusolverDnDlange
+cusolverDnDlangeHost
+cusolverDnDlange_bufferSize
+cusolverDnDlansy
+cusolverDnDlansyHost
+cusolverDnDlansy_bufferSize
+cusolverDnDlantrHost
+cusolverDnDlarfb
+cusolverDnDlarfgHost
+cusolverDnDlarft
+cusolverDnDlarft_bufferSize
+cusolverDnDlarnvHost
+cusolverDnDlascl
+cusolverDnDlasetHost
+cusolverDnDlaswp
+cusolverDnDlauum
+cusolverDnDlauumHost
+cusolverDnDlauum_bufferSize
+cusolverDnDnrm2Host
+cusolverDnDorgbr
+cusolverDnDorgbrHost
+cusolverDnDorgbr_bufferSize
+cusolverDnDorghrHost
+cusolverDnDorglqHost
+cusolverDnDorgqr
+cusolverDnDorgqr_ApI
+cusolverDnDorgqr_ApI_bufferSize
+cusolverDnDorgqr_bufferSize
+cusolverDnDorgtr
+cusolverDnDorgtrHost
+cusolverDnDorgtr_bufferSize
+cusolverDnDormqr
+cusolverDnDormqrHost
+cusolverDnDormqr_bufferSize
+cusolverDnDormtr
+cusolverDnDormtrHost
+cusolverDnDormtr_bufferSize
+cusolverDnDpolar
+cusolverDnDpolar_bufferSize
+cusolverDnDpotrf
+cusolverDnDpotrfBatched
+cusolverDnDpotrfHost
+cusolverDnDpotrf_bufferSize
+cusolverDnDpotri
+cusolverDnDpotriHost
+cusolverDnDpotri_bufferSize
+cusolverDnDpotrs
+cusolverDnDpotrsBatched
+cusolverDnDpotrsHost
+cusolverDnDscalHost
+cusolverDnDsteqrHost
+cusolverDnDsterfHost
+cusolverDnDsyevd
+cusolverDnDsyevd_bufferSize
+cusolverDnDsyevdx
+cusolverDnDsyevdx_bufferSize
+cusolverDnDsyevj
+cusolverDnDsyevjBatched
+cusolverDnDsyevjBatched_bufferSize
+cusolverDnDsyevj_bufferSize
+cusolverDnDsygst
+cusolverDnDsygst2
+cusolverDnDsygst2_bufferSize
+cusolverDnDsygst_bufferSize
+cusolverDnDsygvd
+cusolverDnDsygvd_bufferSize
+cusolverDnDsygvdx
+cusolverDnDsygvdx_bufferSize
+cusolverDnDsygvj
+cusolverDnDsygvj_bufferSize
+cusolverDnDsymv
+cusolverDnDsymv_bufferSize
+cusolverDnDsytrd
+cusolverDnDsytrd_bufferSize
+cusolverDnDsytrf
+cusolverDnDsytrf_bufferSize
+cusolverDnDsytri
+cusolverDnDsytriHost
+cusolverDnDsytri_bufferSize
+cusolverDnDsytrsHost
+cusolverDnDtrtriHost
+cusolverDnGeqrf
+cusolverDnGeqrf_bufferSize
+cusolverDnGesvd
+cusolverDnGesvd_bufferSize
+cusolverDnGetDeterministicMode
+cusolverDnGetStream
+cusolverDnGetrf
+cusolverDnGetrf_bufferSize
+cusolverDnGetrs
+cusolverDnIRSInfosCreate
+cusolverDnIRSInfosDestroy
+cusolverDnIRSInfosGetMaxIters
+cusolverDnIRSInfosGetNiters
+cusolverDnIRSInfosGetOuterNiters
+cusolverDnIRSInfosGetResidualHistory
+cusolverDnIRSInfosRequestResidual
+cusolverDnIRSParamsCreate
+cusolverDnIRSParamsDestroy
+cusolverDnIRSParamsDisableFallback
+cusolverDnIRSParamsEnableFallback
+cusolverDnIRSParamsGetMaxIters
+cusolverDnIRSParamsSetMaxIters
+cusolverDnIRSParamsSetMaxItersInner
+cusolverDnIRSParamsSetRefinementSolver
+cusolverDnIRSParamsSetSolverLowestPrecision
+cusolverDnIRSParamsSetSolverMainPrecision
+cusolverDnIRSParamsSetSolverPrecisions
+cusolverDnIRSParamsSetTol
+cusolverDnIRSParamsSetTolInner
+cusolverDnIRSXgels
+cusolverDnIRSXgels_bufferSize
+cusolverDnIRSXgesv
+cusolverDnIRSXgesv_bufferSize
+cusolverDnPotrf
+cusolverDnPotrf_bufferSize
+cusolverDnPotrs
+cusolverDnSBgels
+cusolverDnSBgels_bufferSize
+cusolverDnSBgesv
+cusolverDnSBgesv_bufferSize
+cusolverDnSHgels
+cusolverDnSHgels_bufferSize
+cusolverDnSHgesv
+cusolverDnSHgesv_bufferSize
+cusolverDnSSgels
+cusolverDnSSgels_bufferSize
+cusolverDnSSgesv
+cusolverDnSSgesv_bufferSize
+cusolverDnSXgels
+cusolverDnSXgels_bufferSize
+cusolverDnSXgesv
+cusolverDnSXgesv_bufferSize
+cusolverDnSaxpyHost
+cusolverDnSetAdvOptions
+cusolverDnSetDeterministicMode
+cusolverDnSetStream
+cusolverDnSeyeBatch
+cusolverDnSgebrd
+cusolverDnSgebrd_bufferSize
+cusolverDnSgehd2
+cusolverDnSgehd2_bufferSize
+cusolverDnSgehrd
+cusolverDnSgehrd_bufferSize
+cusolverDnSgelqfHost
+cusolverDnSgemmHost
+cusolverDnSgeqlfHost
+cusolverDnSgeqrf
+cusolverDnSgeqrfHost
+cusolverDnSgeqrf_ApI
+cusolverDnSgeqrf_ApI_bufferSize
+cusolverDnSgeqrf_bufferSize
+cusolverDnSgercond
+cusolverDnSgercond_bufferSize
+cusolverDnSgesvd
+cusolverDnSgesvd_bufferSize
+cusolverDnSgesvdaStridedBatched
+cusolverDnSgesvdaStridedBatched_bufferSize
+cusolverDnSgesvdj
+cusolverDnSgesvdjBatched
+cusolverDnSgesvdjBatched_bufferSize
+cusolverDnSgesvdj_bufferSize
+cusolverDnSgetrf
+cusolverDnSgetrf_bufferSize
+cusolverDnSgetrs
+cusolverDnSlacpyHost
+cusolverDnSlahr2
+cusolverDnSlamchHost
+cusolverDnSlange
+cusolverDnSlangeHost
+cusolverDnSlange_bufferSize
+cusolverDnSlansy
+cusolverDnSlansyHost
+cusolverDnSlansy_bufferSize
+cusolverDnSlantrHost
+cusolverDnSlarfb
+cusolverDnSlarfgHost
+cusolverDnSlarft
+cusolverDnSlarft_bufferSize
+cusolverDnSlarnvHost
+cusolverDnSlascl
+cusolverDnSlasetHost
+cusolverDnSlaswp
+cusolverDnSlauum
+cusolverDnSlauumHost
+cusolverDnSlauum_bufferSize
+cusolverDnSnrm2Host
+cusolverDnSorgbr
+cusolverDnSorgbrHost
+cusolverDnSorgbr_bufferSize
+cusolverDnSorghrHost
+cusolverDnSorglqHost
+cusolverDnSorgqr
+cusolverDnSorgqr_ApI
+cusolverDnSorgqr_ApI_bufferSize
+cusolverDnSorgqr_bufferSize
+cusolverDnSorgtr
+cusolverDnSorgtrHost
+cusolverDnSorgtr_bufferSize
+cusolverDnSormqr
+cusolverDnSormqrHost
+cusolverDnSormqr_bufferSize
+cusolverDnSormtr
+cusolverDnSormtrHost
+cusolverDnSormtr_bufferSize
+cusolverDnSpolar
+cusolverDnSpolar_bufferSize
+cusolverDnSpotrf
+cusolverDnSpotrfBatched
+cusolverDnSpotrfHost
+cusolverDnSpotrf_bufferSize
+cusolverDnSpotri
+cusolverDnSpotriHost
+cusolverDnSpotri_bufferSize
+cusolverDnSpotrs
+cusolverDnSpotrsBatched
+cusolverDnSpotrsHost
+cusolverDnSscalHost
+cusolverDnSsteqrHost
+cusolverDnSsterfHost
+cusolverDnSsyevd
+cusolverDnSsyevd_bufferSize
+cusolverDnSsyevdx
+cusolverDnSsyevdx_bufferSize
+cusolverDnSsyevj
+cusolverDnSsyevjBatched
+cusolverDnSsyevjBatched_bufferSize
+cusolverDnSsyevj_bufferSize
+cusolverDnSsygst
+cusolverDnSsygst2
+cusolverDnSsygst2_bufferSize
+cusolverDnSsygst_bufferSize
+cusolverDnSsygvd
+cusolverDnSsygvd_bufferSize
+cusolverDnSsygvdx
+cusolverDnSsygvdx_bufferSize
+cusolverDnSsygvj
+cusolverDnSsygvj_bufferSize
+cusolverDnSsymv
+cusolverDnSsymv_bufferSize
+cusolverDnSsytrd
+cusolverDnSsytrd_bufferSize
+cusolverDnSsytrf
+cusolverDnSsytrf_bufferSize
+cusolverDnSsytri
+cusolverDnSsytriHost
+cusolverDnSsytri_bufferSize
+cusolverDnSsytrsHost
+cusolverDnStrtriHost
+cusolverDnSyevd
+cusolverDnSyevd_bufferSize
+cusolverDnSyevdx
+cusolverDnSyevdx_bufferSize
+cusolverDnXfillRandNormalOnDevice
+cusolverDnXfillRandNormalOnHost
+cusolverDnXgeqrf
+cusolverDnXgeqrf_bufferSize
+cusolverDnXgesvd
+cusolverDnXgesvd_bufferSize
+cusolverDnXgesvdjGetResidual
+cusolverDnXgesvdjGetSweeps
+cusolverDnXgesvdjSetMaxSweeps
+cusolverDnXgesvdjSetSortEig
+cusolverDnXgesvdjSetTolerance
+cusolverDnXgesvdp
+cusolverDnXgesvdp_bufferSize
+cusolverDnXgesvdr
+cusolverDnXgesvdr_bufferSize
+cusolverDnXgetrf
+cusolverDnXgetrf_bufferSize
+cusolverDnXgetrs
+cusolverDnXpotrf
+cusolverDnXpotrf_bufferSize
+cusolverDnXpotrs
+cusolverDnXsyevd
+cusolverDnXsyevd_bufferSize
+cusolverDnXsyevdx
+cusolverDnXsyevdx_bufferSize
+cusolverDnXsyevjGetResidual
+cusolverDnXsyevjGetSweeps
+cusolverDnXsyevjSetMaxSweeps
+cusolverDnXsyevjSetSortEig
+cusolverDnXsyevjSetTolerance
+cusolverDnXsytrs
+cusolverDnXsytrs_bufferSize
+cusolverDnXtrtri
+cusolverDnXtrtri_bufferSize
+cusolverDnZCgels
+cusolverDnZCgels_bufferSize
+cusolverDnZCgesv
+cusolverDnZCgesv_bufferSize
+cusolverDnZEgels
+cusolverDnZEgels_bufferSize
+cusolverDnZEgesv
+cusolverDnZEgesv_bufferSize
+cusolverDnZKgels
+cusolverDnZKgels_bufferSize
+cusolverDnZKgesv
+cusolverDnZKgesv_bufferSize
+cusolverDnZYgels
+cusolverDnZYgels_bufferSize
+cusolverDnZYgesv
+cusolverDnZYgesv_bufferSize
+cusolverDnZZgels
+cusolverDnZZgels_bufferSize
+cusolverDnZZgesv
+cusolverDnZZgesv_bufferSize
+cusolverDnZaxpyHost
+cusolverDnZdscalHost
+cusolverDnZeyeBatch
+cusolverDnZgebrd
+cusolverDnZgebrd_bufferSize
+cusolverDnZgehd2
+cusolverDnZgehd2_bufferSize
+cusolverDnZgehrd
+cusolverDnZgehrd_bufferSize
+cusolverDnZgelqfHost
+cusolverDnZgemmHost
+cusolverDnZgeqlfHost
+cusolverDnZgeqrf
+cusolverDnZgeqrfHost
+cusolverDnZgeqrf_ApI
+cusolverDnZgeqrf_ApI_bufferSize
+cusolverDnZgeqrf_bufferSize
+cusolverDnZgercond
+cusolverDnZgercond_bufferSize
+cusolverDnZgesvd
+cusolverDnZgesvd_bufferSize
+cusolverDnZgesvdaStridedBatched
+cusolverDnZgesvdaStridedBatched_bufferSize
+cusolverDnZgesvdj
+cusolverDnZgesvdjBatched
+cusolverDnZgesvdjBatched_bufferSize
+cusolverDnZgesvdj_bufferSize
+cusolverDnZgetrf
+cusolverDnZgetrf_bufferSize
+cusolverDnZgetrs
+cusolverDnZheevd
+cusolverDnZheevd_bufferSize
+cusolverDnZheevdx
+cusolverDnZheevdx_bufferSize
+cusolverDnZheevj
+cusolverDnZheevjBatched
+cusolverDnZheevjBatched_bufferSize
+cusolverDnZheevj_bufferSize
+cusolverDnZhegst
+cusolverDnZhegst2
+cusolverDnZhegst2_bufferSize
+cusolverDnZhegst_bufferSize
+cusolverDnZhegvd
+cusolverDnZhegvd_bufferSize
+cusolverDnZhegvdx
+cusolverDnZhegvdx_bufferSize
+cusolverDnZhegvj
+cusolverDnZhegvj_bufferSize
+cusolverDnZhemv
+cusolverDnZhemv_bufferSize
+cusolverDnZhetrd
+cusolverDnZhetrd_bufferSize
+cusolverDnZlacpyHost
+cusolverDnZlahr2
+cusolverDnZlange
+cusolverDnZlangeHost
+cusolverDnZlange_bufferSize
+cusolverDnZlanhe
+cusolverDnZlanheHost
+cusolverDnZlanhe_bufferSize
+cusolverDnZlantrHost
+cusolverDnZlarfb
+cusolverDnZlarfgHost
+cusolverDnZlarft
+cusolverDnZlarft_bufferSize
+cusolverDnZlarnvHost
+cusolverDnZlascl
+cusolverDnZlasetHost
+cusolverDnZlaswp
+cusolverDnZlauum
+cusolverDnZlauumHost
+cusolverDnZlauum_bufferSize
+cusolverDnZnrm2Host
+cusolverDnZpolar
+cusolverDnZpolar_bufferSize
+cusolverDnZpotrf
+cusolverDnZpotrfBatched
+cusolverDnZpotrfHost
+cusolverDnZpotrf_bufferSize
+cusolverDnZpotri
+cusolverDnZpotriHost
+cusolverDnZpotri_bufferSize
+cusolverDnZpotrs
+cusolverDnZpotrsBatched
+cusolverDnZpotrsHost
+cusolverDnZsytrf
+cusolverDnZsytrf_bufferSize
+cusolverDnZsytri
+cusolverDnZsytriHost
+cusolverDnZsytri_bufferSize
+cusolverDnZsytrsHost
+cusolverDnZtrtriHost
+cusolverDnZungbr
+cusolverDnZungbrHost
+cusolverDnZungbr_bufferSize
+cusolverDnZunghrHost
+cusolverDnZunglqHost
+cusolverDnZungqr
+cusolverDnZungqr_ApI
+cusolverDnZungqr_ApI_bufferSize
+cusolverDnZungqr_bufferSize
+cusolverDnZungtr
+cusolverDnZungtrHost
+cusolverDnZungtr_bufferSize
+cusolverDnZunmqr
+cusolverDnZunmqrHost
+cusolverDnZunmqr_bufferSize
+cusolverDnZunmtr
+cusolverDnZunmtrHost
+cusolverDnZunmtr_bufferSize
+cusolverGetProperty
+cusolverGetVersion
+cusolverIidentityHost
+cusolverRfAccessBundledFactorsDevice
+cusolverRfAnalyze
+cusolverRfBatchAnalyze
+cusolverRfBatchRefactor
+cusolverRfBatchResetValues
+cusolverRfBatchSetupHost
+cusolverRfBatchSolve
+cusolverRfBatchZeroPivot
+cusolverRfCreate
+cusolverRfDestroy
+cusolverRfExtractBundledFactorsHost
+cusolverRfExtractSplitFactorsHost
+cusolverRfGetAlgs
+cusolverRfGetMatrixFormat
+cusolverRfGetNumericBoostReport
+cusolverRfGetNumericProperties
+cusolverRfGetResetValuesFastMode
+cusolverRfRefactor
+cusolverRfResetValues
+cusolverRfSetAlgs
+cusolverRfSetMatrixFormat
+cusolverRfSetNumericProperties
+cusolverRfSetResetValuesFastMode
+cusolverRfSetupDevice
+cusolverRfSetupHost
+cusolverRfSetupM
+cusolverRfSolve
+cusolverSpCcsrbtfHost
+cusolverSpCcsrcholBufferInfo
+cusolverSpCcsrcholBufferInfoHost
+cusolverSpCcsrcholDiag
+cusolverSpCcsrcholFactor
+cusolverSpCcsrcholFactorHost
+cusolverSpCcsrcholSolve
+cusolverSpCcsrcholSolveHost
+cusolverSpCcsrcholZeroPivot
+cusolverSpCcsrcholZeroPivotHost
+cusolverSpCcsreigsHost
+cusolverSpCcsreigvsi
+cusolverSpCcsreigvsiHost
+cusolverSpCcsrlsqvqrHost
+cusolverSpCcsrlsvchol
+cusolverSpCcsrlsvcholHost
+cusolverSpCcsrlsvluHost
+cusolverSpCcsrlsvqr
+cusolverSpCcsrlsvqrHost
+cusolverSpCcsrluAnalysisHost
+cusolverSpCcsrluBufferInfoHost
+cusolverSpCcsrluExtractBTFHost
+cusolverSpCcsrluExtractHost
+cusolverSpCcsrluExtractMHost
+cusolverSpCcsrluFactorHost
+cusolverSpCcsrluSolveHost
+cusolverSpCcsrluZeroPivotHost
+cusolverSpCcsrlucondHost
+cusolverSpCcsrqrBufferInfo
+cusolverSpCcsrqrBufferInfoBatched
+cusolverSpCcsrqrBufferInfoHost
+cusolverSpCcsrqrFactor
+cusolverSpCcsrqrFactorHost
+cusolverSpCcsrqrSetup
+cusolverSpCcsrqrSetupHost
+cusolverSpCcsrqrSolve
+cusolverSpCcsrqrSolveHost
+cusolverSpCcsrqrZeroPivot
+cusolverSpCcsrqrZeroPivotHost
+cusolverSpCcsrqrrcond
+cusolverSpCcsrqrsvBatched
+cusolverSpCcsrsubm2denseHost
+cusolverSpCcsrzfdHost
+cusolverSpCreate
+cusolverSpCreateCsrcholInfo
+cusolverSpCreateCsrcholInfoHost
+cusolverSpCreateCsrluInfoHost
+cusolverSpCreateCsrqrInfo
+cusolverSpCreateCsrqrInfoHost
+cusolverSpCreateGluInfo
+cusolverSpCreateSnluInfo
+cusolverSpCsnluBuffersize
+cusolverSpCsnluFactor
+cusolverSpCsnluSolve
+cusolverSpCsymgthr
+cusolverSpCsymgthrHost
+cusolverSpDcsrbtfHost
+cusolverSpDcsrcholBufferInfo
+cusolverSpDcsrcholBufferInfoHost
+cusolverSpDcsrcholDiag
+cusolverSpDcsrcholFactor
+cusolverSpDcsrcholFactorHost
+cusolverSpDcsrcholSolve
+cusolverSpDcsrcholSolveHost
+cusolverSpDcsrcholZeroPivot
+cusolverSpDcsrcholZeroPivotHost
+cusolverSpDcsreigsHost
+cusolverSpDcsreigvsi
+cusolverSpDcsreigvsiHost
+cusolverSpDcsrlsqvqrHost
+cusolverSpDcsrlsvchol
+cusolverSpDcsrlsvcholHost
+cusolverSpDcsrlsvluHost
+cusolverSpDcsrlsvqr
+cusolverSpDcsrlsvqrHost
+cusolverSpDcsrluAnalysisHost
+cusolverSpDcsrluBufferInfoHost
+cusolverSpDcsrluExtractBTFHost
+cusolverSpDcsrluExtractHost
+cusolverSpDcsrluExtractMHost
+cusolverSpDcsrluFactorHost
+cusolverSpDcsrluSolveHost
+cusolverSpDcsrluZeroPivotHost
+cusolverSpDcsrlucondHost
+cusolverSpDcsrqrBufferInfo
+cusolverSpDcsrqrBufferInfoBatched
+cusolverSpDcsrqrBufferInfoHost
+cusolverSpDcsrqrFactor
+cusolverSpDcsrqrFactorHost
+cusolverSpDcsrqrSetup
+cusolverSpDcsrqrSetupHost
+cusolverSpDcsrqrSolve
+cusolverSpDcsrqrSolveHost
+cusolverSpDcsrqrZeroPivot
+cusolverSpDcsrqrZeroPivotHost
+cusolverSpDcsrqrrcond
+cusolverSpDcsrqrsvBatched
+cusolverSpDcsrsubm2denseHost
+cusolverSpDcsrzfdHost
+cusolverSpDestroy
+cusolverSpDestroyCsrcholInfo
+cusolverSpDestroyCsrcholInfoHost
+cusolverSpDestroyCsrluInfoHost
+cusolverSpDestroyCsrqrInfo
+cusolverSpDestroyCsrqrInfoHost
+cusolverSpDestroyGluInfo
+cusolverSpDestroySnluInfo
+cusolverSpDgluAnalysis
+cusolverSpDgluBufferSize
+cusolverSpDgluExtractMHost
+cusolverSpDgluFactor
+cusolverSpDgluNumericBoost
+cusolverSpDgluReset
+cusolverSpDgluSetup
+cusolverSpDgluSetupByTile
+cusolverSpDgluSolve
+cusolverSpDnrminf
+cusolverSpDsnluBuffersize
+cusolverSpDsnluFactor
+cusolverSpDsnluSolve
+cusolverSpDsymgthr
+cusolverSpDsymgthrHost
+cusolverSpGetStream
+cusolverSpLcsrqrBufferInfo
+cusolverSpLcsrqrFactor
+cusolverSpLcsrqrSetup
+cusolverSpLcsrqrSolve
+cusolverSpLcsrqrSolveRefine
+cusolverSpLcsrqrZeroPivot
+cusolverSpLcsrqrrcond
+cusolverSpQcsrqrBufferInfo
+cusolverSpQcsrqrFactor
+cusolverSpQcsrqrSetup
+cusolverSpQcsrqrSolve
+cusolverSpQcsrqrSolveRefine
+cusolverSpQcsrqrZeroPivot
+cusolverSpQcsrqrrcond
+cusolverSpScsrbtfHost
+cusolverSpScsrcholBufferInfo
+cusolverSpScsrcholBufferInfoHost
+cusolverSpScsrcholDiag
+cusolverSpScsrcholFactor
+cusolverSpScsrcholFactorHost
+cusolverSpScsrcholSolve
+cusolverSpScsrcholSolveHost
+cusolverSpScsrcholZeroPivot
+cusolverSpScsrcholZeroPivotHost
+cusolverSpScsreigsHost
+cusolverSpScsreigvsi
+cusolverSpScsreigvsiHost
+cusolverSpScsrlsqvqrHost
+cusolverSpScsrlsvchol
+cusolverSpScsrlsvcholHost
+cusolverSpScsrlsvluHost
+cusolverSpScsrlsvqr
+cusolverSpScsrlsvqrHost
+cusolverSpScsrluAnalysisHost
+cusolverSpScsrluBufferInfoHost
+cusolverSpScsrluExtractBTFHost
+cusolverSpScsrluExtractHost
+cusolverSpScsrluExtractMHost
+cusolverSpScsrluFactorHost
+cusolverSpScsrluSolveHost
+cusolverSpScsrluZeroPivotHost
+cusolverSpScsrlucondHost
+cusolverSpScsrqrBufferInfo
+cusolverSpScsrqrBufferInfoBatched
+cusolverSpScsrqrBufferInfoHost
+cusolverSpScsrqrFactor
+cusolverSpScsrqrFactorHost
+cusolverSpScsrqrSetup
+cusolverSpScsrqrSetupHost
+cusolverSpScsrqrSolve
+cusolverSpScsrqrSolveHost
+cusolverSpScsrqrZeroPivot
+cusolverSpScsrqrZeroPivotHost
+cusolverSpScsrqrrcond
+cusolverSpScsrqrsvBatched
+cusolverSpScsrsubm2denseHost
+cusolverSpScsrzfdHost
+cusolverSpSetStream
+cusolverSpSsnluBuffersize
+cusolverSpSsnluFactor
+cusolverSpSsnluSolve
+cusolverSpSsymgthr
+cusolverSpSsymgthrHost
+cusolverSpXcsr2cscHost
+cusolverSpXcsr2csc_bufferSizeHost
+cusolverSpXcsrcholAnalysis
+cusolverSpXcsrcholAnalysisHost
+cusolverSpXcsrissymHost
+cusolverSpXcsrluAnalysisHost
+cusolverSpXcsrluConfigHost
+cusolverSpXcsrluNnzHost
+cusolverSpXcsrluNnzMHost
+cusolverSpXcsrmetisndHost
+cusolverSpXcsrnsepHost
+cusolverSpXcsrpermHost
+cusolverSpXcsrperm_bufferSizeHost
+cusolverSpXcsrqrAnalysis
+cusolverSpXcsrqrAnalysisBatched
+cusolverSpXcsrqrAnalysisHost
+cusolverSpXcsrqrConfigInfo
+cusolverSpXcsrsubmHost
+cusolverSpXcsrsubmNnzHost
+cusolverSpXcsrsymamdHost
+cusolverSpXcsrsymmdqHost
+cusolverSpXcsrsympermHost
+cusolverSpXcsrsymperm_bufferSizeHost
+cusolverSpXcsrsymrcmHost
+cusolverSpXgluZeroPivot
+cusolverSpXsnluAnalysis
+cusolverSpXsnluFlops
+cusolverSpXsnluReport
+cusolverSpXsnluResizeSupernodeBy
+cusolverSpXsnluSetReorder
+cusolverSpZcsrbtfHost
+cusolverSpZcsrcholBufferInfo
+cusolverSpZcsrcholBufferInfoHost
+cusolverSpZcsrcholDiag
+cusolverSpZcsrcholFactor
+cusolverSpZcsrcholFactorHost
+cusolverSpZcsrcholSolve
+cusolverSpZcsrcholSolveHost
+cusolverSpZcsrcholZeroPivot
+cusolverSpZcsrcholZeroPivotHost
+cusolverSpZcsreigsHost
+cusolverSpZcsreigvsi
+cusolverSpZcsreigvsiHost
+cusolverSpZcsrlsqvqrHost
+cusolverSpZcsrlsvchol
+cusolverSpZcsrlsvcholHost
+cusolverSpZcsrlsvluHost
+cusolverSpZcsrlsvqr
+cusolverSpZcsrlsvqrHost
+cusolverSpZcsrluAnalysisHost
+cusolverSpZcsrluBufferInfoHost
+cusolverSpZcsrluExtractBTFHost
+cusolverSpZcsrluExtractHost
+cusolverSpZcsrluExtractMHost
+cusolverSpZcsrluFactorHost
+cusolverSpZcsrluSolveHost
+cusolverSpZcsrluZeroPivotHost
+cusolverSpZcsrlucondHost
+cusolverSpZcsrqrBufferInfo
+cusolverSpZcsrqrBufferInfoBatched
+cusolverSpZcsrqrBufferInfoHost
+cusolverSpZcsrqrFactor
+cusolverSpZcsrqrFactorHost
+cusolverSpZcsrqrSetup
+cusolverSpZcsrqrSetupHost
+cusolverSpZcsrqrSolve
+cusolverSpZcsrqrSolveHost
+cusolverSpZcsrqrZeroPivot
+cusolverSpZcsrqrZeroPivotHost
+cusolverSpZcsrqrrcond
+cusolverSpZcsrqrsvBatched
+cusolverSpZcsrsubm2denseHost
+cusolverSpZcsrzfdHost
+cusolverSpZsnluBuffersize
+cusolverSpZsnluFactor
+cusolverSpZsnluSolve
+cusolverSpZsymgthr
+cusolverSpZsymgthrHost
+cusolverXcsrqr_dump
+pegasusCcsr2dense
+pegasusCcsrmvHost
+pegasusDcsr2dense
+pegasusDcsrmvHost
+pegasusScsr2dense
+pegasusScsrmvHost
+pegasusXcoo2csr
+pegasusXcooStableSortByRow
+pegasusXcooStableSort_bufferSizeExt
+pegasusXcsr2coo
+pegasusZcsr2dense
+pegasusZcsrmvHost
diff --git a/third_party/xla/third_party/tsl/tsl/cuda/cusolver_dense_10_0.inc b/third_party/xla/third_party/tsl/tsl/cuda/cusolver_dense_10_0.inc
deleted file mode 100644
index d01117faeb2439..00000000000000
--- a/third_party/xla/third_party/tsl/tsl/cuda/cusolver_dense_10_0.inc
+++ /dev/null
@@ -1,2283 +0,0 @@
-// Auto-generated, do not edit.
-
-extern "C" {
-
-cusolverStatus_t CUSOLVERAPI cusolverGetProperty(libraryPropertyType type,
-                                                 int *value) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(libraryPropertyType, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverGetProperty");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(type, value);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnCreate(cusolverDnHandle_t *handle) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnCreate");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDestroy(cusolverDnHandle_t handle) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDestroy");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSetStream(cusolverDnHandle_t handle,
-                                                 cudaStream_t streamId) {
-  using FuncPtr =
-      cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSetStream");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, streamId);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnGetStream(cusolverDnHandle_t handle,
-                                                 cudaStream_t *streamId) {
-  using FuncPtr =
-      cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, cudaStream_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnGetStream");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, streamId);
-}
-
-cusolverStatus_t CUSOLVERAPI
-cusolverDnSpotrf_bufferSize(cusolverDnHandle_t handle, cublasFillMode_t uplo,
-                            int n, float *A, int lda, int *Lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, float *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSpotrf_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, A, lda, Lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI
-cusolverDnDpotrf_bufferSize(cusolverDnHandle_t handle, cublasFillMode_t uplo,
-                            int n, double *A, int lda, int *Lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, double *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDpotrf_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, A, lda, Lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI
-cusolverDnCpotrf_bufferSize(cusolverDnHandle_t handle, cublasFillMode_t uplo,
-                            int n, cuComplex *A, int lda, int *Lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, cuComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnCpotrf_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, A, lda, Lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI
-cusolverDnZpotrf_bufferSize(cusolverDnHandle_t handle, cublasFillMode_t uplo,
-                            int n, cuDoubleComplex *A, int lda, int *Lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, cuDoubleComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZpotrf_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, A, lda, Lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSpotrf(cusolverDnHandle_t handle,
-                                              cublasFillMode_t uplo, int n,
-                                              float *A, int lda,
-                                              float *Workspace, int Lwork,
-                                              int *devInfo) {
-  using FuncPtr =
-      cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, cublasFillMode_t, int,
-                                      float *, int, float *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSpotrf");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, A, lda, Workspace, Lwork, devInfo);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDpotrf(cusolverDnHandle_t handle,
-                                              cublasFillMode_t uplo, int n,
-                                              double *A, int lda,
-                                              double *Workspace, int Lwork,
-                                              int *devInfo) {
-  using FuncPtr =
-      cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, cublasFillMode_t, int,
-                                      double *, int, double *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDpotrf");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, A, lda, Workspace, Lwork, devInfo);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnCpotrf(cusolverDnHandle_t handle,
-                                              cublasFillMode_t uplo, int n,
-                                              cuComplex *A, int lda,
-                                              cuComplex *Workspace, int Lwork,
-                                              int *devInfo) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, cuComplex *, int, cuComplex *,
-      int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnCpotrf");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, A, lda, Workspace, Lwork, devInfo);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZpotrf(cusolverDnHandle_t handle,
-                                              cublasFillMode_t uplo, int n,
-                                              cuDoubleComplex *A, int lda,
-                                              cuDoubleComplex *Workspace,
-                                              int Lwork, int *devInfo) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, cuDoubleComplex *, int,
-      cuDoubleComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZpotrf");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, A, lda, Workspace, Lwork, devInfo);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSpotrs(cusolverDnHandle_t handle,
-                                              cublasFillMode_t uplo, int n,
-                                              int nrhs, const float *A, int lda,
-                                              float *B, int ldb, int *devInfo) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, int, const float *, int,
-      float *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSpotrs");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, nrhs, A, lda, B, ldb, devInfo);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDpotrs(cusolverDnHandle_t handle,
-                                              cublasFillMode_t uplo, int n,
-                                              int nrhs, const double *A,
-                                              int lda, double *B, int ldb,
-                                              int *devInfo) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, int, const double *, int,
-      double *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDpotrs");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, nrhs, A, lda, B, ldb, devInfo);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnCpotrs(cusolverDnHandle_t handle,
-                                              cublasFillMode_t uplo, int n,
-                                              int nrhs, const cuComplex *A,
-                                              int lda, cuComplex *B, int ldb,
-                                              int *devInfo) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, int, const cuComplex *, int,
-      cuComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnCpotrs");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, nrhs, A, lda, B, ldb, devInfo);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZpotrs(cusolverDnHandle_t handle,
-                                              cublasFillMode_t uplo, int n,
-                                              int nrhs,
-                                              const cuDoubleComplex *A, int lda,
-                                              cuDoubleComplex *B, int ldb,
-                                              int *devInfo) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, int, const cuDoubleComplex *,
-      int, cuDoubleComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZpotrs");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, nrhs, A, lda, B, ldb, devInfo);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSpotrfBatched(cusolverDnHandle_t handle,
-                                                     cublasFillMode_t uplo,
-                                                     int n, float *Aarray[],
-                                                     int lda, int *infoArray,
-                                                     int batchSize) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, float *[], int, int *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSpotrfBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, Aarray, lda, infoArray, batchSize);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDpotrfBatched(cusolverDnHandle_t handle,
-                                                     cublasFillMode_t uplo,
-                                                     int n, double *Aarray[],
-                                                     int lda, int *infoArray,
-                                                     int batchSize) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, double *[], int, int *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDpotrfBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, Aarray, lda, infoArray, batchSize);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnCpotrfBatched(cusolverDnHandle_t handle,
-                                                     cublasFillMode_t uplo,
-                                                     int n, cuComplex *Aarray[],
-                                                     int lda, int *infoArray,
-                                                     int batchSize) {
-  using FuncPtr =
-      cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, cublasFillMode_t, int,
-                                      cuComplex *[], int, int *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnCpotrfBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, Aarray, lda, infoArray, batchSize);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZpotrfBatched(
-    cusolverDnHandle_t handle, cublasFillMode_t uplo, int n,
-    cuDoubleComplex *Aarray[], int lda, int *infoArray, int batchSize) {
-  using FuncPtr =
-      cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, cublasFillMode_t, int,
-                                      cuDoubleComplex *[], int, int *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZpotrfBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, Aarray, lda, infoArray, batchSize);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSpotrsBatched(
-    cusolverDnHandle_t handle, cublasFillMode_t uplo, int n,
-    int nrhs, /* only support rhs = 1*/
-    float *A[], int lda, float *B[], int ldb, int *d_info, int batchSize) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, int, float *[], int, float *[],
-      int, int *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSpotrsBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, nrhs, A, lda, B, ldb, d_info, batchSize);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDpotrsBatched(
-    cusolverDnHandle_t handle, cublasFillMode_t uplo, int n,
-    int nrhs, /* only support rhs = 1*/
-    double *A[], int lda, double *B[], int ldb, int *d_info, int batchSize) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, int, double *[], int,
-      double *[], int, int *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDpotrsBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, nrhs, A, lda, B, ldb, d_info, batchSize);
-}
-
-cusolverStatus_t CUSOLVERAPI
-cusolverDnCpotrsBatched(cusolverDnHandle_t handle, cublasFillMode_t uplo, int n,
-                        int nrhs, /* only support rhs = 1*/
-                        cuComplex *A[], int lda, cuComplex *B[], int ldb,
-                        int *d_info, int batchSize) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, int, cuComplex *[], int,
-      cuComplex *[], int, int *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnCpotrsBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, nrhs, A, lda, B, ldb, d_info, batchSize);
-}
-
-cusolverStatus_t CUSOLVERAPI
-cusolverDnZpotrsBatched(cusolverDnHandle_t handle, cublasFillMode_t uplo, int n,
-                        int nrhs, /* only support rhs = 1*/
-                        cuDoubleComplex *A[], int lda, cuDoubleComplex *B[],
-                        int ldb, int *d_info, int batchSize) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, int, cuDoubleComplex *[], int,
-      cuDoubleComplex *[], int, int *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZpotrsBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, nrhs, A, lda, B, ldb, d_info, batchSize);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSgetrf_bufferSize(
-    cusolverDnHandle_t handle, int m, int n, float *A, int lda, int *Lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, int, int,
-                                                  float *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSgetrf_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, A, lda, Lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDgetrf_bufferSize(
-    cusolverDnHandle_t handle, int m, int n, double *A, int lda, int *Lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, int, int,
-                                                  double *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDgetrf_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, A, lda, Lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI
-cusolverDnCgetrf_bufferSize(cusolverDnHandle_t handle, int m, int n,
-                            cuComplex *A, int lda, int *Lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, int, int,
-                                                  cuComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnCgetrf_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, A, lda, Lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI
-cusolverDnZgetrf_bufferSize(cusolverDnHandle_t handle, int m, int n,
-                            cuDoubleComplex *A, int lda, int *Lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, int, int, cuDoubleComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZgetrf_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, A, lda, Lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSgetrf(cusolverDnHandle_t handle, int m,
-                                              int n, float *A, int lda,
-                                              float *Workspace, int *devIpiv,
-                                              int *devInfo) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, int, int, float *, int, float *, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSgetrf");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, A, lda, Workspace, devIpiv, devInfo);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDgetrf(cusolverDnHandle_t handle, int m,
-                                              int n, double *A, int lda,
-                                              double *Workspace, int *devIpiv,
-                                              int *devInfo) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, int, int, double *, int, double *, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDgetrf");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, A, lda, Workspace, devIpiv, devInfo);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnCgetrf(cusolverDnHandle_t handle, int m,
-                                              int n, cuComplex *A, int lda,
-                                              cuComplex *Workspace,
-                                              int *devIpiv, int *devInfo) {
-  using FuncPtr =
-      cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, int, int, cuComplex *,
-                                      int, cuComplex *, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnCgetrf");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, A, lda, Workspace, devIpiv, devInfo);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZgetrf(cusolverDnHandle_t handle, int m,
-                                              int n, cuDoubleComplex *A,
-                                              int lda,
-                                              cuDoubleComplex *Workspace,
-                                              int *devIpiv, int *devInfo) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, int, int, cuDoubleComplex *, int, cuDoubleComplex *,
-      int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZgetrf");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, A, lda, Workspace, devIpiv, devInfo);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSlaswp(cusolverDnHandle_t handle, int n,
-                                              float *A, int lda, int k1, int k2,
-                                              const int *devIpiv, int incx) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, int, float *, int, int, int, const int *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSlaswp");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, A, lda, k1, k2, devIpiv, incx);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDlaswp(cusolverDnHandle_t handle, int n,
-                                              double *A, int lda, int k1,
-                                              int k2, const int *devIpiv,
-                                              int incx) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, int, double *, int, int, int, const int *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDlaswp");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, A, lda, k1, k2, devIpiv, incx);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnClaswp(cusolverDnHandle_t handle, int n,
-                                              cuComplex *A, int lda, int k1,
-                                              int k2, const int *devIpiv,
-                                              int incx) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, int, cuComplex *, int, int, int, const int *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnClaswp");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, A, lda, k1, k2, devIpiv, incx);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZlaswp(cusolverDnHandle_t handle, int n,
-                                              cuDoubleComplex *A, int lda,
-                                              int k1, int k2,
-                                              const int *devIpiv, int incx) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, int,
-                                                  cuDoubleComplex *, int, int,
-                                                  int, const int *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZlaswp");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, A, lda, k1, k2, devIpiv, incx);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSgetrs(cusolverDnHandle_t handle,
-                                              cublasOperation_t trans, int n,
-                                              int nrhs, const float *A, int lda,
-                                              const int *devIpiv, float *B,
-                                              int ldb, int *devInfo) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasOperation_t, int, int, const float *, int,
-      const int *, float *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSgetrs");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, trans, n, nrhs, A, lda, devIpiv, B, ldb, devInfo);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDgetrs(cusolverDnHandle_t handle,
-                                              cublasOperation_t trans, int n,
-                                              int nrhs, const double *A,
-                                              int lda, const int *devIpiv,
-                                              double *B, int ldb,
-                                              int *devInfo) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasOperation_t, int, int, const double *, int,
-      const int *, double *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDgetrs");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, trans, n, nrhs, A, lda, devIpiv, B, ldb, devInfo);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnCgetrs(cusolverDnHandle_t handle,
-                                              cublasOperation_t trans, int n,
-                                              int nrhs, const cuComplex *A,
-                                              int lda, const int *devIpiv,
-                                              cuComplex *B, int ldb,
-                                              int *devInfo) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasOperation_t, int, int, const cuComplex *, int,
-      const int *, cuComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnCgetrs");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, trans, n, nrhs, A, lda, devIpiv, B, ldb, devInfo);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZgetrs(
-    cusolverDnHandle_t handle, cublasOperation_t trans, int n, int nrhs,
-    const cuDoubleComplex *A, int lda, const int *devIpiv, cuDoubleComplex *B,
-    int ldb, int *devInfo) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasOperation_t, int, int, const cuDoubleComplex *,
-      int, const int *, cuDoubleComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZgetrs");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, trans, n, nrhs, A, lda, devIpiv, B, ldb, devInfo);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSgeqrf_bufferSize(
-    cusolverDnHandle_t handle, int m, int n, float *A, int lda, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, int, int,
-                                                  float *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSgeqrf_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, A, lda, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDgeqrf_bufferSize(
-    cusolverDnHandle_t handle, int m, int n, double *A, int lda, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, int, int,
-                                                  double *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDgeqrf_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, A, lda, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI
-cusolverDnCgeqrf_bufferSize(cusolverDnHandle_t handle, int m, int n,
-                            cuComplex *A, int lda, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, int, int,
-                                                  cuComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnCgeqrf_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, A, lda, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI
-cusolverDnZgeqrf_bufferSize(cusolverDnHandle_t handle, int m, int n,
-                            cuDoubleComplex *A, int lda, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, int, int, cuDoubleComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZgeqrf_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, A, lda, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSgeqrf(cusolverDnHandle_t handle, int m,
-                                              int n, float *A, int lda,
-                                              float *TAU, float *Workspace,
-                                              int Lwork, int *devInfo) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, int, int, float *, int, float *, float *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSgeqrf");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, A, lda, TAU, Workspace, Lwork, devInfo);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDgeqrf(cusolverDnHandle_t handle, int m,
-                                              int n, double *A, int lda,
-                                              double *TAU, double *Workspace,
-                                              int Lwork, int *devInfo) {
-  using FuncPtr =
-      cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, int, int, double *,
-                                      int, double *, double *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDgeqrf");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, A, lda, TAU, Workspace, Lwork, devInfo);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnCgeqrf(cusolverDnHandle_t handle, int m,
-                                              int n, cuComplex *A, int lda,
-                                              cuComplex *TAU,
-                                              cuComplex *Workspace, int Lwork,
-                                              int *devInfo) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, int, int,
-                                                  cuComplex *, int, cuComplex *,
-                                                  cuComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnCgeqrf");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, A, lda, TAU, Workspace, Lwork, devInfo);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZgeqrf(cusolverDnHandle_t handle, int m,
-                                              int n, cuDoubleComplex *A,
-                                              int lda, cuDoubleComplex *TAU,
-                                              cuDoubleComplex *Workspace,
-                                              int Lwork, int *devInfo) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, int, int, cuDoubleComplex *, int, cuDoubleComplex *,
-      cuDoubleComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZgeqrf");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, A, lda, TAU, Workspace, Lwork, devInfo);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSorgqr_bufferSize(
-    cusolverDnHandle_t handle, int m, int n, int k, const float *A, int lda,
-    const float *tau, int *lwork) {
-  using FuncPtr =
-      cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, int, int, int,
-                                      const float *, int, const float *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSorgqr_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, k, A, lda, tau, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDorgqr_bufferSize(
-    cusolverDnHandle_t handle, int m, int n, int k, const double *A, int lda,
-    const double *tau, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, int, int,
-                                                  int, const double *, int,
-                                                  const double *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDorgqr_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, k, A, lda, tau, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnCungqr_bufferSize(
-    cusolverDnHandle_t handle, int m, int n, int k, const cuComplex *A, int lda,
-    const cuComplex *tau, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, int, int,
-                                                  int, const cuComplex *, int,
-                                                  const cuComplex *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnCungqr_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, k, A, lda, tau, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZungqr_bufferSize(
-    cusolverDnHandle_t handle, int m, int n, int k, const cuDoubleComplex *A,
-    int lda, const cuDoubleComplex *tau, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, int, int, int, const cuDoubleComplex *, int,
-      const cuDoubleComplex *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZungqr_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, k, A, lda, tau, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSorgqr(cusolverDnHandle_t handle, int m,
-                                              int n, int k, float *A, int lda,
-                                              const float *tau, float *work,
-                                              int lwork, int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, int, int, int, float *, int, const float *, float *,
-      int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSorgqr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, k, A, lda, tau, work, lwork, info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDorgqr(cusolverDnHandle_t handle, int m,
-                                              int n, int k, double *A, int lda,
-                                              const double *tau, double *work,
-                                              int lwork, int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, int, int, int, double *, int, const double *,
-      double *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDorgqr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, k, A, lda, tau, work, lwork, info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnCungqr(cusolverDnHandle_t handle, int m,
-                                              int n, int k, cuComplex *A,
-                                              int lda, const cuComplex *tau,
-                                              cuComplex *work, int lwork,
-                                              int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, int, int, int, cuComplex *, int, const cuComplex *,
-      cuComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnCungqr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, k, A, lda, tau, work, lwork, info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZungqr(
-    cusolverDnHandle_t handle, int m, int n, int k, cuDoubleComplex *A, int lda,
-    const cuDoubleComplex *tau, cuDoubleComplex *work, int lwork, int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, int, int, int, cuDoubleComplex *, int,
-      const cuDoubleComplex *, cuDoubleComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZungqr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, k, A, lda, tau, work, lwork, info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSormqr_bufferSize(
-    cusolverDnHandle_t handle, cublasSideMode_t side, cublasOperation_t trans,
-    int m, int n, int k, const float *A, int lda, const float *tau,
-    const float *C, int ldc, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasSideMode_t, cublasOperation_t, int, int, int,
-      const float *, int, const float *, const float *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSormqr_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, side, trans, m, n, k, A, lda, tau, C, ldc, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDormqr_bufferSize(
-    cusolverDnHandle_t handle, cublasSideMode_t side, cublasOperation_t trans,
-    int m, int n, int k, const double *A, int lda, const double *tau,
-    const double *C, int ldc, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasSideMode_t, cublasOperation_t, int, int, int,
-      const double *, int, const double *, const double *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDormqr_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, side, trans, m, n, k, A, lda, tau, C, ldc, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnCunmqr_bufferSize(
-    cusolverDnHandle_t handle, cublasSideMode_t side, cublasOperation_t trans,
-    int m, int n, int k, const cuComplex *A, int lda, const cuComplex *tau,
-    const cuComplex *C, int ldc, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasSideMode_t, cublasOperation_t, int, int, int,
-      const cuComplex *, int, const cuComplex *, const cuComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnCunmqr_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, side, trans, m, n, k, A, lda, tau, C, ldc, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZunmqr_bufferSize(
-    cusolverDnHandle_t handle, cublasSideMode_t side, cublasOperation_t trans,
-    int m, int n, int k, const cuDoubleComplex *A, int lda,
-    const cuDoubleComplex *tau, const cuDoubleComplex *C, int ldc, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasSideMode_t, cublasOperation_t, int, int, int,
-      const cuDoubleComplex *, int, const cuDoubleComplex *,
-      const cuDoubleComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZunmqr_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, side, trans, m, n, k, A, lda, tau, C, ldc, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSormqr(
-    cusolverDnHandle_t handle, cublasSideMode_t side, cublasOperation_t trans,
-    int m, int n, int k, const float *A, int lda, const float *tau, float *C,
-    int ldc, float *work, int lwork, int *devInfo) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasSideMode_t, cublasOperation_t, int, int, int,
-      const float *, int, const float *, float *, int, float *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSormqr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, side, trans, m, n, k, A, lda, tau, C, ldc, work,
-                  lwork, devInfo);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDormqr(
-    cusolverDnHandle_t handle, cublasSideMode_t side, cublasOperation_t trans,
-    int m, int n, int k, const double *A, int lda, const double *tau, double *C,
-    int ldc, double *work, int lwork, int *devInfo) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasSideMode_t, cublasOperation_t, int, int, int,
-      const double *, int, const double *, double *, int, double *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDormqr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, side, trans, m, n, k, A, lda, tau, C, ldc, work,
-                  lwork, devInfo);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnCunmqr(
-    cusolverDnHandle_t handle, cublasSideMode_t side, cublasOperation_t trans,
-    int m, int n, int k, const cuComplex *A, int lda, const cuComplex *tau,
-    cuComplex *C, int ldc, cuComplex *work, int lwork, int *devInfo) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasSideMode_t, cublasOperation_t, int, int, int,
-      const cuComplex *, int, const cuComplex *, cuComplex *, int, cuComplex *,
-      int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnCunmqr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, side, trans, m, n, k, A, lda, tau, C, ldc, work,
-                  lwork, devInfo);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZunmqr(
-    cusolverDnHandle_t handle, cublasSideMode_t side, cublasOperation_t trans,
-    int m, int n, int k, const cuDoubleComplex *A, int lda,
-    const cuDoubleComplex *tau, cuDoubleComplex *C, int ldc,
-    cuDoubleComplex *work, int lwork, int *devInfo) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasSideMode_t, cublasOperation_t, int, int, int,
-      const cuDoubleComplex *, int, const cuDoubleComplex *, cuDoubleComplex *,
-      int, cuDoubleComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZunmqr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, side, trans, m, n, k, A, lda, tau, C, ldc, work,
-                  lwork, devInfo);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSsytrf_bufferSize(
-    cusolverDnHandle_t handle, int n, float *A, int lda, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, int,
-                                                  float *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSsytrf_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, A, lda, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDsytrf_bufferSize(
-    cusolverDnHandle_t handle, int n, double *A, int lda, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, int,
-                                                  double *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDsytrf_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, A, lda, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnCsytrf_bufferSize(
-    cusolverDnHandle_t handle, int n, cuComplex *A, int lda, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, int,
-                                                  cuComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnCsytrf_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, A, lda, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZsytrf_bufferSize(
-    cusolverDnHandle_t handle, int n, cuDoubleComplex *A, int lda, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, int, cuDoubleComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZsytrf_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, A, lda, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSsytrf(cusolverDnHandle_t handle,
-                                              cublasFillMode_t uplo, int n,
-                                              float *A, int lda, int *ipiv,
-                                              float *work, int lwork,
-                                              int *info) {
-  using FuncPtr =
-      cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, cublasFillMode_t, int,
-                                      float *, int, int *, float *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSsytrf");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, A, lda, ipiv, work, lwork, info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDsytrf(cusolverDnHandle_t handle,
-                                              cublasFillMode_t uplo, int n,
-                                              double *A, int lda, int *ipiv,
-                                              double *work, int lwork,
-                                              int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, double *, int, int *, double *,
-      int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDsytrf");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, A, lda, ipiv, work, lwork, info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnCsytrf(cusolverDnHandle_t handle,
-                                              cublasFillMode_t uplo, int n,
-                                              cuComplex *A, int lda, int *ipiv,
-                                              cuComplex *work, int lwork,
-                                              int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, cuComplex *, int, int *,
-      cuComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnCsytrf");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, A, lda, ipiv, work, lwork, info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZsytrf(cusolverDnHandle_t handle,
-                                              cublasFillMode_t uplo, int n,
-                                              cuDoubleComplex *A, int lda,
-                                              int *ipiv, cuDoubleComplex *work,
-                                              int lwork, int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, cuDoubleComplex *, int, int *,
-      cuDoubleComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZsytrf");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, A, lda, ipiv, work, lwork, info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSgebrd_bufferSize(
-    cusolverDnHandle_t handle, int m, int n, int *Lwork) {
-  using FuncPtr =
-      cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, int, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSgebrd_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, Lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDgebrd_bufferSize(
-    cusolverDnHandle_t handle, int m, int n, int *Lwork) {
-  using FuncPtr =
-      cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, int, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDgebrd_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, Lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnCgebrd_bufferSize(
-    cusolverDnHandle_t handle, int m, int n, int *Lwork) {
-  using FuncPtr =
-      cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, int, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnCgebrd_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, Lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZgebrd_bufferSize(
-    cusolverDnHandle_t handle, int m, int n, int *Lwork) {
-  using FuncPtr =
-      cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, int, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZgebrd_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, Lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSgebrd(cusolverDnHandle_t handle, int m,
-                                              int n, float *A, int lda,
-                                              float *D, float *E, float *TAUQ,
-                                              float *TAUP, float *Work,
-                                              int Lwork, int *devInfo) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, int, int, float *, int, float *, float *, float *,
-      float *, float *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSgebrd");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, A, lda, D, E, TAUQ, TAUP, Work, Lwork, devInfo);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDgebrd(cusolverDnHandle_t handle, int m,
-                                              int n, double *A, int lda,
-                                              double *D, double *E,
-                                              double *TAUQ, double *TAUP,
-                                              double *Work, int Lwork,
-                                              int *devInfo) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, int, int, double *, int, double *, double *, double *,
-      double *, double *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDgebrd");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, A, lda, D, E, TAUQ, TAUP, Work, Lwork, devInfo);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnCgebrd(cusolverDnHandle_t handle, int m,
-                                              int n, cuComplex *A, int lda,
-                                              float *D, float *E,
-                                              cuComplex *TAUQ, cuComplex *TAUP,
-                                              cuComplex *Work, int Lwork,
-                                              int *devInfo) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, int, int, cuComplex *, int, float *, float *,
-      cuComplex *, cuComplex *, cuComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnCgebrd");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, A, lda, D, E, TAUQ, TAUP, Work, Lwork, devInfo);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZgebrd(
-    cusolverDnHandle_t handle, int m, int n, cuDoubleComplex *A, int lda,
-    double *D, double *E, cuDoubleComplex *TAUQ, cuDoubleComplex *TAUP,
-    cuDoubleComplex *Work, int Lwork, int *devInfo) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, int, int, cuDoubleComplex *, int, double *, double *,
-      cuDoubleComplex *, cuDoubleComplex *, cuDoubleComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZgebrd");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, A, lda, D, E, TAUQ, TAUP, Work, Lwork, devInfo);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSorgbr_bufferSize(
-    cusolverDnHandle_t handle, cublasSideMode_t side, int m, int n, int k,
-    const float *A, int lda, const float *tau, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasSideMode_t, int, int, int, const float *, int,
-      const float *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSorgbr_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, side, m, n, k, A, lda, tau, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDorgbr_bufferSize(
-    cusolverDnHandle_t handle, cublasSideMode_t side, int m, int n, int k,
-    const double *A, int lda, const double *tau, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasSideMode_t, int, int, int, const double *, int,
-      const double *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDorgbr_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, side, m, n, k, A, lda, tau, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnCungbr_bufferSize(
-    cusolverDnHandle_t handle, cublasSideMode_t side, int m, int n, int k,
-    const cuComplex *A, int lda, const cuComplex *tau, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasSideMode_t, int, int, int, const cuComplex *,
-      int, const cuComplex *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnCungbr_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, side, m, n, k, A, lda, tau, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZungbr_bufferSize(
-    cusolverDnHandle_t handle, cublasSideMode_t side, int m, int n, int k,
-    const cuDoubleComplex *A, int lda, const cuDoubleComplex *tau, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasSideMode_t, int, int, int,
-      const cuDoubleComplex *, int, const cuDoubleComplex *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZungbr_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, side, m, n, k, A, lda, tau, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSorgbr(cusolverDnHandle_t handle,
-                                              cublasSideMode_t side, int m,
-                                              int n, int k, float *A, int lda,
-                                              const float *tau, float *work,
-                                              int lwork, int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasSideMode_t, int, int, int, float *, int,
-      const float *, float *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSorgbr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, side, m, n, k, A, lda, tau, work, lwork, info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDorgbr(cusolverDnHandle_t handle,
-                                              cublasSideMode_t side, int m,
-                                              int n, int k, double *A, int lda,
-                                              const double *tau, double *work,
-                                              int lwork, int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasSideMode_t, int, int, int, double *, int,
-      const double *, double *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDorgbr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, side, m, n, k, A, lda, tau, work, lwork, info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnCungbr(cusolverDnHandle_t handle,
-                                              cublasSideMode_t side, int m,
-                                              int n, int k, cuComplex *A,
-                                              int lda, const cuComplex *tau,
-                                              cuComplex *work, int lwork,
-                                              int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasSideMode_t, int, int, int, cuComplex *, int,
-      const cuComplex *, cuComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnCungbr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, side, m, n, k, A, lda, tau, work, lwork, info);
-}
-
-cusolverStatus_t CUSOLVERAPI
-cusolverDnZungbr(cusolverDnHandle_t handle, cublasSideMode_t side, int m, int n,
-                 int k, cuDoubleComplex *A, int lda, const cuDoubleComplex *tau,
-                 cuDoubleComplex *work, int lwork, int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasSideMode_t, int, int, int, cuDoubleComplex *,
-      int, const cuDoubleComplex *, cuDoubleComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZungbr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, side, m, n, k, A, lda, tau, work, lwork, info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSsytrd_bufferSize(
-    cusolverDnHandle_t handle, cublasFillMode_t uplo, int n, const float *A,
-    int lda, const float *d, const float *e, const float *tau, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, const float *, int,
-      const float *, const float *, const float *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSsytrd_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, A, lda, d, e, tau, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDsytrd_bufferSize(
-    cusolverDnHandle_t handle, cublasFillMode_t uplo, int n, const double *A,
-    int lda, const double *d, const double *e, const double *tau, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, const double *, int,
-      const double *, const double *, const double *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDsytrd_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, A, lda, d, e, tau, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnChetrd_bufferSize(
-    cusolverDnHandle_t handle, cublasFillMode_t uplo, int n, const cuComplex *A,
-    int lda, const float *d, const float *e, const cuComplex *tau, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, const cuComplex *, int,
-      const float *, const float *, const cuComplex *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnChetrd_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, A, lda, d, e, tau, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZhetrd_bufferSize(
-    cusolverDnHandle_t handle, cublasFillMode_t uplo, int n,
-    const cuDoubleComplex *A, int lda, const double *d, const double *e,
-    const cuDoubleComplex *tau, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, const cuDoubleComplex *, int,
-      const double *, const double *, const cuDoubleComplex *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZhetrd_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, A, lda, d, e, tau, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSsytrd(cusolverDnHandle_t handle,
-                                              cublasFillMode_t uplo, int n,
-                                              float *A, int lda, float *d,
-                                              float *e, float *tau, float *work,
-                                              int lwork, int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, float *, int, float *, float *,
-      float *, float *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSsytrd");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, A, lda, d, e, tau, work, lwork, info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDsytrd(
-    cusolverDnHandle_t handle, cublasFillMode_t uplo, int n, double *A, int lda,
-    double *d, double *e, double *tau, double *work, int lwork, int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, double *, int, double *,
-      double *, double *, double *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDsytrd");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, A, lda, d, e, tau, work, lwork, info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnChetrd(cusolverDnHandle_t handle,
-                                              cublasFillMode_t uplo, int n,
-                                              cuComplex *A, int lda, float *d,
-                                              float *e, cuComplex *tau,
-                                              cuComplex *work, int lwork,
-                                              int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, cuComplex *, int, float *,
-      float *, cuComplex *, cuComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnChetrd");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, A, lda, d, e, tau, work, lwork, info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZhetrd(
-    cusolverDnHandle_t handle, cublasFillMode_t uplo, int n, cuDoubleComplex *A,
-    int lda, double *d, double *e, cuDoubleComplex *tau, cuDoubleComplex *work,
-    int lwork, int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, cuDoubleComplex *, int,
-      double *, double *, cuDoubleComplex *, cuDoubleComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZhetrd");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, A, lda, d, e, tau, work, lwork, info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSorgtr_bufferSize(
-    cusolverDnHandle_t handle, cublasFillMode_t uplo, int n, const float *A,
-    int lda, const float *tau, int *lwork) {
-  using FuncPtr =
-      cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, cublasFillMode_t, int,
-                                      const float *, int, const float *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSorgtr_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, A, lda, tau, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDorgtr_bufferSize(
-    cusolverDnHandle_t handle, cublasFillMode_t uplo, int n, const double *A,
-    int lda, const double *tau, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, const double *, int,
-      const double *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDorgtr_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, A, lda, tau, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnCungtr_bufferSize(
-    cusolverDnHandle_t handle, cublasFillMode_t uplo, int n, const cuComplex *A,
-    int lda, const cuComplex *tau, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, const cuComplex *, int,
-      const cuComplex *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnCungtr_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, A, lda, tau, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZungtr_bufferSize(
-    cusolverDnHandle_t handle, cublasFillMode_t uplo, int n,
-    const cuDoubleComplex *A, int lda, const cuDoubleComplex *tau, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, const cuDoubleComplex *, int,
-      const cuDoubleComplex *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZungtr_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, A, lda, tau, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSorgtr(cusolverDnHandle_t handle,
-                                              cublasFillMode_t uplo, int n,
-                                              float *A, int lda,
-                                              const float *tau, float *work,
-                                              int lwork, int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, float *, int, const float *,
-      float *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSorgtr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, A, lda, tau, work, lwork, info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDorgtr(cusolverDnHandle_t handle,
-                                              cublasFillMode_t uplo, int n,
-                                              double *A, int lda,
-                                              const double *tau, double *work,
-                                              int lwork, int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, double *, int, const double *,
-      double *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDorgtr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, A, lda, tau, work, lwork, info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnCungtr(
-    cusolverDnHandle_t handle, cublasFillMode_t uplo, int n, cuComplex *A,
-    int lda, const cuComplex *tau, cuComplex *work, int lwork, int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, cuComplex *, int,
-      const cuComplex *, cuComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnCungtr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, A, lda, tau, work, lwork, info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZungtr(cusolverDnHandle_t handle,
-                                              cublasFillMode_t uplo, int n,
-                                              cuDoubleComplex *A, int lda,
-                                              const cuDoubleComplex *tau,
-                                              cuDoubleComplex *work, int lwork,
-                                              int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, cuDoubleComplex *, int,
-      const cuDoubleComplex *, cuDoubleComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZungtr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, A, lda, tau, work, lwork, info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSormtr_bufferSize(
-    cusolverDnHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo,
-    cublasOperation_t trans, int m, int n, const float *A, int lda,
-    const float *tau, const float *C, int ldc, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasSideMode_t, cublasFillMode_t, cublasOperation_t,
-      int, int, const float *, int, const float *, const float *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSormtr_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, side, uplo, trans, m, n, A, lda, tau, C, ldc, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDormtr_bufferSize(
-    cusolverDnHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo,
-    cublasOperation_t trans, int m, int n, const double *A, int lda,
-    const double *tau, const double *C, int ldc, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasSideMode_t, cublasFillMode_t, cublasOperation_t,
-      int, int, const double *, int, const double *, const double *, int,
-      int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDormtr_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, side, uplo, trans, m, n, A, lda, tau, C, ldc, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnCunmtr_bufferSize(
-    cusolverDnHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo,
-    cublasOperation_t trans, int m, int n, const cuComplex *A, int lda,
-    const cuComplex *tau, const cuComplex *C, int ldc, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasSideMode_t, cublasFillMode_t, cublasOperation_t,
-      int, int, const cuComplex *, int, const cuComplex *, const cuComplex *,
-      int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnCunmtr_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, side, uplo, trans, m, n, A, lda, tau, C, ldc, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZunmtr_bufferSize(
-    cusolverDnHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo,
-    cublasOperation_t trans, int m, int n, const cuDoubleComplex *A, int lda,
-    const cuDoubleComplex *tau, const cuDoubleComplex *C, int ldc, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasSideMode_t, cublasFillMode_t, cublasOperation_t,
-      int, int, const cuDoubleComplex *, int, const cuDoubleComplex *,
-      const cuDoubleComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZunmtr_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, side, uplo, trans, m, n, A, lda, tau, C, ldc, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSormtr(
-    cusolverDnHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo,
-    cublasOperation_t trans, int m, int n, float *A, int lda, float *tau,
-    float *C, int ldc, float *work, int lwork, int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasSideMode_t, cublasFillMode_t, cublasOperation_t,
-      int, int, float *, int, float *, float *, int, float *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSormtr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, side, uplo, trans, m, n, A, lda, tau, C, ldc, work,
-                  lwork, info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDormtr(
-    cusolverDnHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo,
-    cublasOperation_t trans, int m, int n, double *A, int lda, double *tau,
-    double *C, int ldc, double *work, int lwork, int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasSideMode_t, cublasFillMode_t, cublasOperation_t,
-      int, int, double *, int, double *, double *, int, double *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDormtr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, side, uplo, trans, m, n, A, lda, tau, C, ldc, work,
-                  lwork, info);
-}
-
-cusolverStatus_t CUSOLVERAPI
-cusolverDnCunmtr(cusolverDnHandle_t handle, cublasSideMode_t side,
-                 cublasFillMode_t uplo, cublasOperation_t trans, int m, int n,
-                 cuComplex *A, int lda, cuComplex *tau, cuComplex *C, int ldc,
-                 cuComplex *work, int lwork, int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasSideMode_t, cublasFillMode_t, cublasOperation_t,
-      int, int, cuComplex *, int, cuComplex *, cuComplex *, int, cuComplex *,
-      int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnCunmtr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, side, uplo, trans, m, n, A, lda, tau, C, ldc, work,
-                  lwork, info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZunmtr(
-    cusolverDnHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo,
-    cublasOperation_t trans, int m, int n, cuDoubleComplex *A, int lda,
-    cuDoubleComplex *tau, cuDoubleComplex *C, int ldc, cuDoubleComplex *work,
-    int lwork, int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasSideMode_t, cublasFillMode_t, cublasOperation_t,
-      int, int, cuDoubleComplex *, int, cuDoubleComplex *, cuDoubleComplex *,
-      int, cuDoubleComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZunmtr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, side, uplo, trans, m, n, A, lda, tau, C, ldc, work,
-                  lwork, info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSgesvd_bufferSize(
-    cusolverDnHandle_t handle, int m, int n, int *lwork) {
-  using FuncPtr =
-      cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, int, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSgesvd_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDgesvd_bufferSize(
-    cusolverDnHandle_t handle, int m, int n, int *lwork) {
-  using FuncPtr =
-      cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, int, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDgesvd_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnCgesvd_bufferSize(
-    cusolverDnHandle_t handle, int m, int n, int *lwork) {
-  using FuncPtr =
-      cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, int, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnCgesvd_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZgesvd_bufferSize(
-    cusolverDnHandle_t handle, int m, int n, int *lwork) {
-  using FuncPtr =
-      cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, int, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZgesvd_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSgesvd(
-    cusolverDnHandle_t handle, signed char jobu, signed char jobvt, int m,
-    int n, float *A, int lda, float *S, float *U, int ldu, float *VT, int ldvt,
-    float *work, int lwork, float *rwork, int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, signed char, signed char, int, int, float *, int,
-      float *, float *, int, float *, int, float *, int, float *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSgesvd");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobu, jobvt, m, n, A, lda, S, U, ldu, VT, ldvt, work,
-                  lwork, rwork, info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDgesvd(
-    cusolverDnHandle_t handle, signed char jobu, signed char jobvt, int m,
-    int n, double *A, int lda, double *S, double *U, int ldu, double *VT,
-    int ldvt, double *work, int lwork, double *rwork, int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, signed char, signed char, int, int, double *, int,
-      double *, double *, int, double *, int, double *, int, double *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDgesvd");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobu, jobvt, m, n, A, lda, S, U, ldu, VT, ldvt, work,
-                  lwork, rwork, info);
-}
-
-cusolverStatus_t CUSOLVERAPI
-cusolverDnCgesvd(cusolverDnHandle_t handle, signed char jobu, signed char jobvt,
-                 int m, int n, cuComplex *A, int lda, float *S, cuComplex *U,
-                 int ldu, cuComplex *VT, int ldvt, cuComplex *work, int lwork,
-                 float *rwork, int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, signed char, signed char, int, int, cuComplex *, int,
-      float *, cuComplex *, int, cuComplex *, int, cuComplex *, int, float *,
-      int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnCgesvd");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobu, jobvt, m, n, A, lda, S, U, ldu, VT, ldvt, work,
-                  lwork, rwork, info);
-}
-
-cusolverStatus_t CUSOLVERAPI
-cusolverDnZgesvd(cusolverDnHandle_t handle, signed char jobu, signed char jobvt,
-                 int m, int n, cuDoubleComplex *A, int lda, double *S,
-                 cuDoubleComplex *U, int ldu, cuDoubleComplex *VT, int ldvt,
-                 cuDoubleComplex *work, int lwork, double *rwork, int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, signed char, signed char, int, int, cuDoubleComplex *,
-      int, double *, cuDoubleComplex *, int, cuDoubleComplex *, int,
-      cuDoubleComplex *, int, double *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZgesvd");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobu, jobvt, m, n, A, lda, S, U, ldu, VT, ldvt, work,
-                  lwork, rwork, info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSsyevd_bufferSize(
-    cusolverDnHandle_t handle, cusolverEigMode_t jobz, cublasFillMode_t uplo,
-    int n, const float *A, int lda, const float *W, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, cublasFillMode_t, int,
-      const float *, int, const float *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSsyevd_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, uplo, n, A, lda, W, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDsyevd_bufferSize(
-    cusolverDnHandle_t handle, cusolverEigMode_t jobz, cublasFillMode_t uplo,
-    int n, const double *A, int lda, const double *W, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, cublasFillMode_t, int,
-      const double *, int, const double *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDsyevd_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, uplo, n, A, lda, W, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnCheevd_bufferSize(
-    cusolverDnHandle_t handle, cusolverEigMode_t jobz, cublasFillMode_t uplo,
-    int n, const cuComplex *A, int lda, const float *W, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, cublasFillMode_t, int,
-      const cuComplex *, int, const float *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnCheevd_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, uplo, n, A, lda, W, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZheevd_bufferSize(
-    cusolverDnHandle_t handle, cusolverEigMode_t jobz, cublasFillMode_t uplo,
-    int n, const cuDoubleComplex *A, int lda, const double *W, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, cublasFillMode_t, int,
-      const cuDoubleComplex *, int, const double *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZheevd_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, uplo, n, A, lda, W, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSsyevd(
-    cusolverDnHandle_t handle, cusolverEigMode_t jobz, cublasFillMode_t uplo,
-    int n, float *A, int lda, float *W, float *work, int lwork, int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, cublasFillMode_t, int, float *,
-      int, float *, float *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSsyevd");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, uplo, n, A, lda, W, work, lwork, info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDsyevd(
-    cusolverDnHandle_t handle, cusolverEigMode_t jobz, cublasFillMode_t uplo,
-    int n, double *A, int lda, double *W, double *work, int lwork, int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, cublasFillMode_t, int, double *,
-      int, double *, double *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDsyevd");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, uplo, n, A, lda, W, work, lwork, info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnCheevd(cusolverDnHandle_t handle,
-                                              cusolverEigMode_t jobz,
-                                              cublasFillMode_t uplo, int n,
-                                              cuComplex *A, int lda, float *W,
-                                              cuComplex *work, int lwork,
-                                              int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, cublasFillMode_t, int, cuComplex *,
-      int, float *, cuComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnCheevd");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, uplo, n, A, lda, W, work, lwork, info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZheevd(cusolverDnHandle_t handle,
-                                              cusolverEigMode_t jobz,
-                                              cublasFillMode_t uplo, int n,
-                                              cuDoubleComplex *A, int lda,
-                                              double *W, cuDoubleComplex *work,
-                                              int lwork, int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, cublasFillMode_t, int,
-      cuDoubleComplex *, int, double *, cuDoubleComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZheevd");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, uplo, n, A, lda, W, work, lwork, info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSsygvd_bufferSize(
-    cusolverDnHandle_t handle, cusolverEigType_t itype, cusolverEigMode_t jobz,
-    cublasFillMode_t uplo, int n, const float *A, int lda, const float *B,
-    int ldb, const float *W, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigType_t, cusolverEigMode_t,
-      cublasFillMode_t, int, const float *, int, const float *, int,
-      const float *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSsygvd_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, itype, jobz, uplo, n, A, lda, B, ldb, W, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDsygvd_bufferSize(
-    cusolverDnHandle_t handle, cusolverEigType_t itype, cusolverEigMode_t jobz,
-    cublasFillMode_t uplo, int n, const double *A, int lda, const double *B,
-    int ldb, const double *W, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigType_t, cusolverEigMode_t,
-      cublasFillMode_t, int, const double *, int, const double *, int,
-      const double *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDsygvd_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, itype, jobz, uplo, n, A, lda, B, ldb, W, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnChegvd_bufferSize(
-    cusolverDnHandle_t handle, cusolverEigType_t itype, cusolverEigMode_t jobz,
-    cublasFillMode_t uplo, int n, const cuComplex *A, int lda,
-    const cuComplex *B, int ldb, const float *W, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigType_t, cusolverEigMode_t,
-      cublasFillMode_t, int, const cuComplex *, int, const cuComplex *, int,
-      const float *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnChegvd_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, itype, jobz, uplo, n, A, lda, B, ldb, W, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZhegvd_bufferSize(
-    cusolverDnHandle_t handle, cusolverEigType_t itype, cusolverEigMode_t jobz,
-    cublasFillMode_t uplo, int n, const cuDoubleComplex *A, int lda,
-    const cuDoubleComplex *B, int ldb, const double *W, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigType_t, cusolverEigMode_t,
-      cublasFillMode_t, int, const cuDoubleComplex *, int,
-      const cuDoubleComplex *, int, const double *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZhegvd_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, itype, jobz, uplo, n, A, lda, B, ldb, W, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSsygvd(
-    cusolverDnHandle_t handle, cusolverEigType_t itype, cusolverEigMode_t jobz,
-    cublasFillMode_t uplo, int n, float *A, int lda, float *B, int ldb,
-    float *W, float *work, int lwork, int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigType_t, cusolverEigMode_t,
-      cublasFillMode_t, int, float *, int, float *, int, float *, float *, int,
-      int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSsygvd");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, itype, jobz, uplo, n, A, lda, B, ldb, W, work, lwork,
-                  info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDsygvd(
-    cusolverDnHandle_t handle, cusolverEigType_t itype, cusolverEigMode_t jobz,
-    cublasFillMode_t uplo, int n, double *A, int lda, double *B, int ldb,
-    double *W, double *work, int lwork, int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigType_t, cusolverEigMode_t,
-      cublasFillMode_t, int, double *, int, double *, int, double *, double *,
-      int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDsygvd");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, itype, jobz, uplo, n, A, lda, B, ldb, W, work, lwork,
-                  info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnChegvd(
-    cusolverDnHandle_t handle, cusolverEigType_t itype, cusolverEigMode_t jobz,
-    cublasFillMode_t uplo, int n, cuComplex *A, int lda, cuComplex *B, int ldb,
-    float *W, cuComplex *work, int lwork, int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigType_t, cusolverEigMode_t,
-      cublasFillMode_t, int, cuComplex *, int, cuComplex *, int, float *,
-      cuComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnChegvd");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, itype, jobz, uplo, n, A, lda, B, ldb, W, work, lwork,
-                  info);
-}
-
-cusolverStatus_t CUSOLVERAPI
-cusolverDnZhegvd(cusolverDnHandle_t handle, cusolverEigType_t itype,
-                 cusolverEigMode_t jobz, cublasFillMode_t uplo, int n,
-                 cuDoubleComplex *A, int lda, cuDoubleComplex *B, int ldb,
-                 double *W, cuDoubleComplex *work, int lwork, int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigType_t, cusolverEigMode_t,
-      cublasFillMode_t, int, cuDoubleComplex *, int, cuDoubleComplex *, int,
-      double *, cuDoubleComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZhegvd");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, itype, jobz, uplo, n, A, lda, B, ldb, W, work, lwork,
-                  info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnCreateSyevjInfo(syevjInfo_t *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(syevjInfo_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnCreateSyevjInfo");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDestroySyevjInfo(syevjInfo_t info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(syevjInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDestroySyevjInfo");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnXsyevjSetTolerance(syevjInfo_t info,
-                                                          double tolerance) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(syevjInfo_t, double);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnXsyevjSetTolerance");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(info, tolerance);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnXsyevjSetMaxSweeps(syevjInfo_t info,
-                                                          int max_sweeps) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(syevjInfo_t, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnXsyevjSetMaxSweeps");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(info, max_sweeps);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnXsyevjSetSortEig(syevjInfo_t info,
-                                                        int sort_eig) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(syevjInfo_t, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnXsyevjSetSortEig");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(info, sort_eig);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnXsyevjGetResidual(
-    cusolverDnHandle_t handle, syevjInfo_t info, double *residual) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t,
-                                                  syevjInfo_t, double *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnXsyevjGetResidual");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, info, residual);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnXsyevjGetSweeps(
-    cusolverDnHandle_t handle, syevjInfo_t info, int *executed_sweeps) {
-  using FuncPtr =
-      cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, syevjInfo_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnXsyevjGetSweeps");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, info, executed_sweeps);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSsyevjBatched_bufferSize(
-    cusolverDnHandle_t handle, cusolverEigMode_t jobz, cublasFillMode_t uplo,
-    int n, const float *A, int lda, const float *W, int *lwork,
-    syevjInfo_t params, int batchSize) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, cublasFillMode_t, int,
-      const float *, int, const float *, int *, syevjInfo_t, int);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusolverDnSsyevjBatched_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, uplo, n, A, lda, W, lwork, params, batchSize);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDsyevjBatched_bufferSize(
-    cusolverDnHandle_t handle, cusolverEigMode_t jobz, cublasFillMode_t uplo,
-    int n, const double *A, int lda, const double *W, int *lwork,
-    syevjInfo_t params, int batchSize) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, cublasFillMode_t, int,
-      const double *, int, const double *, int *, syevjInfo_t, int);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusolverDnDsyevjBatched_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, uplo, n, A, lda, W, lwork, params, batchSize);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnCheevjBatched_bufferSize(
-    cusolverDnHandle_t handle, cusolverEigMode_t jobz, cublasFillMode_t uplo,
-    int n, const cuComplex *A, int lda, const float *W, int *lwork,
-    syevjInfo_t params, int batchSize) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, cublasFillMode_t, int,
-      const cuComplex *, int, const float *, int *, syevjInfo_t, int);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusolverDnCheevjBatched_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, uplo, n, A, lda, W, lwork, params, batchSize);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZheevjBatched_bufferSize(
-    cusolverDnHandle_t handle, cusolverEigMode_t jobz, cublasFillMode_t uplo,
-    int n, const cuDoubleComplex *A, int lda, const double *W, int *lwork,
-    syevjInfo_t params, int batchSize) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, cublasFillMode_t, int,
-      const cuDoubleComplex *, int, const double *, int *, syevjInfo_t, int);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusolverDnZheevjBatched_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, uplo, n, A, lda, W, lwork, params, batchSize);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSsyevjBatched(
-    cusolverDnHandle_t handle, cusolverEigMode_t jobz, cublasFillMode_t uplo,
-    int n, float *A, int lda, float *W, float *work, int lwork, int *info,
-    syevjInfo_t params, int batchSize) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, cublasFillMode_t, int, float *,
-      int, float *, float *, int, int *, syevjInfo_t, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSsyevjBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, uplo, n, A, lda, W, work, lwork, info, params,
-                  batchSize);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDsyevjBatched(
-    cusolverDnHandle_t handle, cusolverEigMode_t jobz, cublasFillMode_t uplo,
-    int n, double *A, int lda, double *W, double *work, int lwork, int *info,
-    syevjInfo_t params, int batchSize) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, cublasFillMode_t, int, double *,
-      int, double *, double *, int, int *, syevjInfo_t, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDsyevjBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, uplo, n, A, lda, W, work, lwork, info, params,
-                  batchSize);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnCheevjBatched(
-    cusolverDnHandle_t handle, cusolverEigMode_t jobz, cublasFillMode_t uplo,
-    int n, cuComplex *A, int lda, float *W, cuComplex *work, int lwork,
-    int *info, syevjInfo_t params, int batchSize) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, cublasFillMode_t, int, cuComplex *,
-      int, float *, cuComplex *, int, int *, syevjInfo_t, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnCheevjBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, uplo, n, A, lda, W, work, lwork, info, params,
-                  batchSize);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZheevjBatched(
-    cusolverDnHandle_t handle, cusolverEigMode_t jobz, cublasFillMode_t uplo,
-    int n, cuDoubleComplex *A, int lda, double *W, cuDoubleComplex *work,
-    int lwork, int *info, syevjInfo_t params, int batchSize) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, cublasFillMode_t, int,
-      cuDoubleComplex *, int, double *, cuDoubleComplex *, int, int *,
-      syevjInfo_t, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZheevjBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, uplo, n, A, lda, W, work, lwork, info, params,
-                  batchSize);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSsyevj_bufferSize(
-    cusolverDnHandle_t handle, cusolverEigMode_t jobz, cublasFillMode_t uplo,
-    int n, const float *A, int lda, const float *W, int *lwork,
-    syevjInfo_t params) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, cublasFillMode_t, int,
-      const float *, int, const float *, int *, syevjInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSsyevj_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, uplo, n, A, lda, W, lwork, params);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDsyevj_bufferSize(
-    cusolverDnHandle_t handle, cusolverEigMode_t jobz, cublasFillMode_t uplo,
-    int n, const double *A, int lda, const double *W, int *lwork,
-    syevjInfo_t params) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, cublasFillMode_t, int,
-      const double *, int, const double *, int *, syevjInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDsyevj_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, uplo, n, A, lda, W, lwork, params);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnCheevj_bufferSize(
-    cusolverDnHandle_t handle, cusolverEigMode_t jobz, cublasFillMode_t uplo,
-    int n, const cuComplex *A, int lda, const float *W, int *lwork,
-    syevjInfo_t params) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, cublasFillMode_t, int,
-      const cuComplex *, int, const float *, int *, syevjInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnCheevj_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, uplo, n, A, lda, W, lwork, params);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZheevj_bufferSize(
-    cusolverDnHandle_t handle, cusolverEigMode_t jobz, cublasFillMode_t uplo,
-    int n, const cuDoubleComplex *A, int lda, const double *W, int *lwork,
-    syevjInfo_t params) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, cublasFillMode_t, int,
-      const cuDoubleComplex *, int, const double *, int *, syevjInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZheevj_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, uplo, n, A, lda, W, lwork, params);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSsyevj(cusolverDnHandle_t handle,
-                                              cusolverEigMode_t jobz,
-                                              cublasFillMode_t uplo, int n,
-                                              float *A, int lda, float *W,
-                                              float *work, int lwork, int *info,
-                                              syevjInfo_t params) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, cublasFillMode_t, int, float *,
-      int, float *, float *, int, int *, syevjInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSsyevj");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, uplo, n, A, lda, W, work, lwork, info, params);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDsyevj(cusolverDnHandle_t handle,
-                                              cusolverEigMode_t jobz,
-                                              cublasFillMode_t uplo, int n,
-                                              double *A, int lda, double *W,
-                                              double *work, int lwork,
-                                              int *info, syevjInfo_t params) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, cublasFillMode_t, int, double *,
-      int, double *, double *, int, int *, syevjInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDsyevj");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, uplo, n, A, lda, W, work, lwork, info, params);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnCheevj(cusolverDnHandle_t handle,
-                                              cusolverEigMode_t jobz,
-                                              cublasFillMode_t uplo, int n,
-                                              cuComplex *A, int lda, float *W,
-                                              cuComplex *work, int lwork,
-                                              int *info, syevjInfo_t params) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, cublasFillMode_t, int, cuComplex *,
-      int, float *, cuComplex *, int, int *, syevjInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnCheevj");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, uplo, n, A, lda, W, work, lwork, info, params);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZheevj(
-    cusolverDnHandle_t handle, cusolverEigMode_t jobz, cublasFillMode_t uplo,
-    int n, cuDoubleComplex *A, int lda, double *W, cuDoubleComplex *work,
-    int lwork, int *info, syevjInfo_t params) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, cublasFillMode_t, int,
-      cuDoubleComplex *, int, double *, cuDoubleComplex *, int, int *,
-      syevjInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZheevj");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, uplo, n, A, lda, W, work, lwork, info, params);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSsygvj_bufferSize(
-    cusolverDnHandle_t handle, cusolverEigType_t itype, cusolverEigMode_t jobz,
-    cublasFillMode_t uplo, int n, const float *A, int lda, const float *B,
-    int ldb, const float *W, int *lwork, syevjInfo_t params) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigType_t, cusolverEigMode_t,
-      cublasFillMode_t, int, const float *, int, const float *, int,
-      const float *, int *, syevjInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSsygvj_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, itype, jobz, uplo, n, A, lda, B, ldb, W, lwork,
-                  params);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDsygvj_bufferSize(
-    cusolverDnHandle_t handle, cusolverEigType_t itype, cusolverEigMode_t jobz,
-    cublasFillMode_t uplo, int n, const double *A, int lda, const double *B,
-    int ldb, const double *W, int *lwork, syevjInfo_t params) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigType_t, cusolverEigMode_t,
-      cublasFillMode_t, int, const double *, int, const double *, int,
-      const double *, int *, syevjInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDsygvj_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, itype, jobz, uplo, n, A, lda, B, ldb, W, lwork,
-                  params);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnChegvj_bufferSize(
-    cusolverDnHandle_t handle, cusolverEigType_t itype, cusolverEigMode_t jobz,
-    cublasFillMode_t uplo, int n, const cuComplex *A, int lda,
-    const cuComplex *B, int ldb, const float *W, int *lwork,
-    syevjInfo_t params) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigType_t, cusolverEigMode_t,
-      cublasFillMode_t, int, const cuComplex *, int, const cuComplex *, int,
-      const float *, int *, syevjInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnChegvj_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, itype, jobz, uplo, n, A, lda, B, ldb, W, lwork,
-                  params);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZhegvj_bufferSize(
-    cusolverDnHandle_t handle, cusolverEigType_t itype, cusolverEigMode_t jobz,
-    cublasFillMode_t uplo, int n, const cuDoubleComplex *A, int lda,
-    const cuDoubleComplex *B, int ldb, const double *W, int *lwork,
-    syevjInfo_t params) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigType_t, cusolverEigMode_t,
-      cublasFillMode_t, int, const cuDoubleComplex *, int,
-      const cuDoubleComplex *, int, const double *, int *, syevjInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZhegvj_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, itype, jobz, uplo, n, A, lda, B, ldb, W, lwork,
-                  params);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSsygvj(
-    cusolverDnHandle_t handle, cusolverEigType_t itype, cusolverEigMode_t jobz,
-    cublasFillMode_t uplo, int n, float *A, int lda, float *B, int ldb,
-    float *W, float *work, int lwork, int *info, syevjInfo_t params) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigType_t, cusolverEigMode_t,
-      cublasFillMode_t, int, float *, int, float *, int, float *, float *, int,
-      int *, syevjInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSsygvj");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, itype, jobz, uplo, n, A, lda, B, ldb, W, work, lwork,
-                  info, params);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDsygvj(
-    cusolverDnHandle_t handle, cusolverEigType_t itype, cusolverEigMode_t jobz,
-    cublasFillMode_t uplo, int n, double *A, int lda, double *B, int ldb,
-    double *W, double *work, int lwork, int *info, syevjInfo_t params) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigType_t, cusolverEigMode_t,
-      cublasFillMode_t, int, double *, int, double *, int, double *, double *,
-      int, int *, syevjInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDsygvj");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, itype, jobz, uplo, n, A, lda, B, ldb, W, work, lwork,
-                  info, params);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnChegvj(
-    cusolverDnHandle_t handle, cusolverEigType_t itype, cusolverEigMode_t jobz,
-    cublasFillMode_t uplo, int n, cuComplex *A, int lda, cuComplex *B, int ldb,
-    float *W, cuComplex *work, int lwork, int *info, syevjInfo_t params) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigType_t, cusolverEigMode_t,
-      cublasFillMode_t, int, cuComplex *, int, cuComplex *, int, float *,
-      cuComplex *, int, int *, syevjInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnChegvj");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, itype, jobz, uplo, n, A, lda, B, ldb, W, work, lwork,
-                  info, params);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZhegvj(
-    cusolverDnHandle_t handle, cusolverEigType_t itype, cusolverEigMode_t jobz,
-    cublasFillMode_t uplo, int n, cuDoubleComplex *A, int lda,
-    cuDoubleComplex *B, int ldb, double *W, cuDoubleComplex *work, int lwork,
-    int *info, syevjInfo_t params) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigType_t, cusolverEigMode_t,
-      cublasFillMode_t, int, cuDoubleComplex *, int, cuDoubleComplex *, int,
-      double *, cuDoubleComplex *, int, int *, syevjInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZhegvj");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, itype, jobz, uplo, n, A, lda, B, ldb, W, work, lwork,
-                  info, params);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnCreateGesvdjInfo(gesvdjInfo_t *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(gesvdjInfo_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnCreateGesvdjInfo");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDestroyGesvdjInfo(gesvdjInfo_t info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(gesvdjInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDestroyGesvdjInfo");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnXgesvdjSetTolerance(gesvdjInfo_t info,
-                                                           double tolerance) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(gesvdjInfo_t, double);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnXgesvdjSetTolerance");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(info, tolerance);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnXgesvdjSetMaxSweeps(gesvdjInfo_t info,
-                                                           int max_sweeps) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(gesvdjInfo_t, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnXgesvdjSetMaxSweeps");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(info, max_sweeps);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnXgesvdjSetSortEig(gesvdjInfo_t info,
-                                                         int sort_svd) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(gesvdjInfo_t, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnXgesvdjSetSortEig");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(info, sort_svd);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnXgesvdjGetResidual(
-    cusolverDnHandle_t handle, gesvdjInfo_t info, double *residual) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t,
-                                                  gesvdjInfo_t, double *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnXgesvdjGetResidual");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, info, residual);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnXgesvdjGetSweeps(
-    cusolverDnHandle_t handle, gesvdjInfo_t info, int *executed_sweeps) {
-  using FuncPtr =
-      cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, gesvdjInfo_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnXgesvdjGetSweeps");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, info, executed_sweeps);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSgesvdjBatched_bufferSize(
-    cusolverDnHandle_t handle, cusolverEigMode_t jobz, int m, int n,
-    const float *A, int lda, const float *S, const float *U, int ldu,
-    const float *V, int ldv, int *lwork, gesvdjInfo_t params, int batchSize) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, int, int, const float *, int,
-      const float *, const float *, int, const float *, int, int *,
-      gesvdjInfo_t, int);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusolverDnSgesvdjBatched_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, m, n, A, lda, S, U, ldu, V, ldv, lwork, params,
-                  batchSize);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDgesvdjBatched_bufferSize(
-    cusolverDnHandle_t handle, cusolverEigMode_t jobz, int m, int n,
-    const double *A, int lda, const double *S, const double *U, int ldu,
-    const double *V, int ldv, int *lwork, gesvdjInfo_t params, int batchSize) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, int, int, const double *, int,
-      const double *, const double *, int, const double *, int, int *,
-      gesvdjInfo_t, int);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusolverDnDgesvdjBatched_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, m, n, A, lda, S, U, ldu, V, ldv, lwork, params,
-                  batchSize);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnCgesvdjBatched_bufferSize(
-    cusolverDnHandle_t handle, cusolverEigMode_t jobz, int m, int n,
-    const cuComplex *A, int lda, const float *S, const cuComplex *U, int ldu,
-    const cuComplex *V, int ldv, int *lwork, gesvdjInfo_t params,
-    int batchSize) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, int, int, const cuComplex *, int,
-      const float *, const cuComplex *, int, const cuComplex *, int, int *,
-      gesvdjInfo_t, int);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusolverDnCgesvdjBatched_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, m, n, A, lda, S, U, ldu, V, ldv, lwork, params,
-                  batchSize);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZgesvdjBatched_bufferSize(
-    cusolverDnHandle_t handle, cusolverEigMode_t jobz, int m, int n,
-    const cuDoubleComplex *A, int lda, const double *S,
-    const cuDoubleComplex *U, int ldu, const cuDoubleComplex *V, int ldv,
-    int *lwork, gesvdjInfo_t params, int batchSize) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, int, int, const cuDoubleComplex *,
-      int, const double *, const cuDoubleComplex *, int,
-      const cuDoubleComplex *, int, int *, gesvdjInfo_t, int);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusolverDnZgesvdjBatched_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, m, n, A, lda, S, U, ldu, V, ldv, lwork, params,
-                  batchSize);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSgesvdjBatched(
-    cusolverDnHandle_t handle, cusolverEigMode_t jobz, int m, int n, float *A,
-    int lda, float *S, float *U, int ldu, float *V, int ldv, float *work,
-    int lwork, int *info, gesvdjInfo_t params, int batchSize) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, int, int, float *, int, float *,
-      float *, int, float *, int, float *, int, int *, gesvdjInfo_t, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSgesvdjBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, m, n, A, lda, S, U, ldu, V, ldv, work, lwork,
-                  info, params, batchSize);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDgesvdjBatched(
-    cusolverDnHandle_t handle, cusolverEigMode_t jobz, int m, int n, double *A,
-    int lda, double *S, double *U, int ldu, double *V, int ldv, double *work,
-    int lwork, int *info, gesvdjInfo_t params, int batchSize) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, int, int, double *, int, double *,
-      double *, int, double *, int, double *, int, int *, gesvdjInfo_t, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDgesvdjBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, m, n, A, lda, S, U, ldu, V, ldv, work, lwork,
-                  info, params, batchSize);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnCgesvdjBatched(
-    cusolverDnHandle_t handle, cusolverEigMode_t jobz, int m, int n,
-    cuComplex *A, int lda, float *S, cuComplex *U, int ldu, cuComplex *V,
-    int ldv, cuComplex *work, int lwork, int *info, gesvdjInfo_t params,
-    int batchSize) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, int, int, cuComplex *, int,
-      float *, cuComplex *, int, cuComplex *, int, cuComplex *, int, int *,
-      gesvdjInfo_t, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnCgesvdjBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, m, n, A, lda, S, U, ldu, V, ldv, work, lwork,
-                  info, params, batchSize);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZgesvdjBatched(
-    cusolverDnHandle_t handle, cusolverEigMode_t jobz, int m, int n,
-    cuDoubleComplex *A, int lda, double *S, cuDoubleComplex *U, int ldu,
-    cuDoubleComplex *V, int ldv, cuDoubleComplex *work, int lwork, int *info,
-    gesvdjInfo_t params, int batchSize) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, int, int, cuDoubleComplex *, int,
-      double *, cuDoubleComplex *, int, cuDoubleComplex *, int,
-      cuDoubleComplex *, int, int *, gesvdjInfo_t, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZgesvdjBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, m, n, A, lda, S, U, ldu, V, ldv, work, lwork,
-                  info, params, batchSize);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSgesvdj_bufferSize(
-    cusolverDnHandle_t handle, cusolverEigMode_t jobz, int econ, int m, int n,
-    const float *A, int lda, const float *S, const float *U, int ldu,
-    const float *V, int ldv, int *lwork, gesvdjInfo_t params) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, int, int, int, const float *, int,
-      const float *, const float *, int, const float *, int, int *,
-      gesvdjInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSgesvdj_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, econ, m, n, A, lda, S, U, ldu, V, ldv, lwork,
-                  params);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDgesvdj_bufferSize(
-    cusolverDnHandle_t handle, cusolverEigMode_t jobz, int econ, int m, int n,
-    const double *A, int lda, const double *S, const double *U, int ldu,
-    const double *V, int ldv, int *lwork, gesvdjInfo_t params) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, int, int, int, const double *, int,
-      const double *, const double *, int, const double *, int, int *,
-      gesvdjInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDgesvdj_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, econ, m, n, A, lda, S, U, ldu, V, ldv, lwork,
-                  params);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnCgesvdj_bufferSize(
-    cusolverDnHandle_t handle, cusolverEigMode_t jobz, int econ, int m, int n,
-    const cuComplex *A, int lda, const float *S, const cuComplex *U, int ldu,
-    const cuComplex *V, int ldv, int *lwork, gesvdjInfo_t params) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, int, int, int, const cuComplex *,
-      int, const float *, const cuComplex *, int, const cuComplex *, int, int *,
-      gesvdjInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnCgesvdj_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, econ, m, n, A, lda, S, U, ldu, V, ldv, lwork,
-                  params);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZgesvdj_bufferSize(
-    cusolverDnHandle_t handle, cusolverEigMode_t jobz, int econ, int m, int n,
-    const cuDoubleComplex *A, int lda, const double *S,
-    const cuDoubleComplex *U, int ldu, const cuDoubleComplex *V, int ldv,
-    int *lwork, gesvdjInfo_t params) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, int, int, int,
-      const cuDoubleComplex *, int, const double *, const cuDoubleComplex *,
-      int, const cuDoubleComplex *, int, int *, gesvdjInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZgesvdj_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, econ, m, n, A, lda, S, U, ldu, V, ldv, lwork,
-                  params);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSgesvdj(
-    cusolverDnHandle_t handle, cusolverEigMode_t jobz, int econ, int m, int n,
-    float *A, int lda, float *S, float *U, int ldu, float *V, int ldv,
-    float *work, int lwork, int *info, gesvdjInfo_t params) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, int, int, int, float *, int,
-      float *, float *, int, float *, int, float *, int, int *, gesvdjInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSgesvdj");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, econ, m, n, A, lda, S, U, ldu, V, ldv, work,
-                  lwork, info, params);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDgesvdj(
-    cusolverDnHandle_t handle, cusolverEigMode_t jobz, int econ, int m, int n,
-    double *A, int lda, double *S, double *U, int ldu, double *V, int ldv,
-    double *work, int lwork, int *info, gesvdjInfo_t params) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, int, int, int, double *, int,
-      double *, double *, int, double *, int, double *, int, int *,
-      gesvdjInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDgesvdj");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, econ, m, n, A, lda, S, U, ldu, V, ldv, work,
-                  lwork, info, params);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnCgesvdj(
-    cusolverDnHandle_t handle, cusolverEigMode_t jobz, int econ, int m, int n,
-    cuComplex *A, int lda, float *S, cuComplex *U, int ldu, cuComplex *V,
-    int ldv, cuComplex *work, int lwork, int *info, gesvdjInfo_t params) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, int, int, int, cuComplex *, int,
-      float *, cuComplex *, int, cuComplex *, int, cuComplex *, int, int *,
-      gesvdjInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnCgesvdj");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, econ, m, n, A, lda, S, U, ldu, V, ldv, work,
-                  lwork, info, params);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZgesvdj(
-    cusolverDnHandle_t handle, cusolverEigMode_t jobz, int econ, int m, int n,
-    cuDoubleComplex *A, int lda, double *S, cuDoubleComplex *U, int ldu,
-    cuDoubleComplex *V, int ldv, cuDoubleComplex *work, int lwork, int *info,
-    gesvdjInfo_t params) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, int, int, int, cuDoubleComplex *,
-      int, double *, cuDoubleComplex *, int, cuDoubleComplex *, int,
-      cuDoubleComplex *, int, int *, gesvdjInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZgesvdj");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, econ, m, n, A, lda, S, U, ldu, V, ldv, work,
-                  lwork, info, params);
-}
-
-}  // extern "C"
diff --git a/third_party/xla/third_party/tsl/tsl/cuda/cusolver_dense_10_1.inc b/third_party/xla/third_party/tsl/tsl/cuda/cusolver_dense_10_1.inc
deleted file mode 100644
index d247958143a299..00000000000000
--- a/third_party/xla/third_party/tsl/tsl/cuda/cusolver_dense_10_1.inc
+++ /dev/null
@@ -1,3139 +0,0 @@
-// Auto-generated, do not edit.
-
-extern "C" {
-
-cusolverStatus_t CUSOLVERAPI cusolverGetProperty(libraryPropertyType type,
-                                                 int *value) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(libraryPropertyType, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverGetProperty");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(type, value);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverGetVersion(int *version) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverGetVersion");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(version);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnCreate(cusolverDnHandle_t *handle) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnCreate");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDestroy(cusolverDnHandle_t handle) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDestroy");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSetStream(cusolverDnHandle_t handle,
-                                                 cudaStream_t streamId) {
-  using FuncPtr =
-      cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSetStream");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, streamId);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnGetStream(cusolverDnHandle_t handle,
-                                                 cudaStream_t *streamId) {
-  using FuncPtr =
-      cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, cudaStream_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnGetStream");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, streamId);
-}
-
-cusolverStatus_t CUSOLVERAPI
-cusolverDnSpotrf_bufferSize(cusolverDnHandle_t handle, cublasFillMode_t uplo,
-                            int n, float *A, int lda, int *Lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, float *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSpotrf_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, A, lda, Lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI
-cusolverDnDpotrf_bufferSize(cusolverDnHandle_t handle, cublasFillMode_t uplo,
-                            int n, double *A, int lda, int *Lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, double *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDpotrf_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, A, lda, Lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI
-cusolverDnCpotrf_bufferSize(cusolverDnHandle_t handle, cublasFillMode_t uplo,
-                            int n, cuComplex *A, int lda, int *Lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, cuComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnCpotrf_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, A, lda, Lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI
-cusolverDnZpotrf_bufferSize(cusolverDnHandle_t handle, cublasFillMode_t uplo,
-                            int n, cuDoubleComplex *A, int lda, int *Lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, cuDoubleComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZpotrf_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, A, lda, Lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSpotrf(cusolverDnHandle_t handle,
-                                              cublasFillMode_t uplo, int n,
-                                              float *A, int lda,
-                                              float *Workspace, int Lwork,
-                                              int *devInfo) {
-  using FuncPtr =
-      cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, cublasFillMode_t, int,
-                                      float *, int, float *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSpotrf");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, A, lda, Workspace, Lwork, devInfo);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDpotrf(cusolverDnHandle_t handle,
-                                              cublasFillMode_t uplo, int n,
-                                              double *A, int lda,
-                                              double *Workspace, int Lwork,
-                                              int *devInfo) {
-  using FuncPtr =
-      cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, cublasFillMode_t, int,
-                                      double *, int, double *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDpotrf");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, A, lda, Workspace, Lwork, devInfo);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnCpotrf(cusolverDnHandle_t handle,
-                                              cublasFillMode_t uplo, int n,
-                                              cuComplex *A, int lda,
-                                              cuComplex *Workspace, int Lwork,
-                                              int *devInfo) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, cuComplex *, int, cuComplex *,
-      int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnCpotrf");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, A, lda, Workspace, Lwork, devInfo);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZpotrf(cusolverDnHandle_t handle,
-                                              cublasFillMode_t uplo, int n,
-                                              cuDoubleComplex *A, int lda,
-                                              cuDoubleComplex *Workspace,
-                                              int Lwork, int *devInfo) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, cuDoubleComplex *, int,
-      cuDoubleComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZpotrf");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, A, lda, Workspace, Lwork, devInfo);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSpotrs(cusolverDnHandle_t handle,
-                                              cublasFillMode_t uplo, int n,
-                                              int nrhs, const float *A, int lda,
-                                              float *B, int ldb, int *devInfo) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, int, const float *, int,
-      float *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSpotrs");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, nrhs, A, lda, B, ldb, devInfo);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDpotrs(cusolverDnHandle_t handle,
-                                              cublasFillMode_t uplo, int n,
-                                              int nrhs, const double *A,
-                                              int lda, double *B, int ldb,
-                                              int *devInfo) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, int, const double *, int,
-      double *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDpotrs");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, nrhs, A, lda, B, ldb, devInfo);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnCpotrs(cusolverDnHandle_t handle,
-                                              cublasFillMode_t uplo, int n,
-                                              int nrhs, const cuComplex *A,
-                                              int lda, cuComplex *B, int ldb,
-                                              int *devInfo) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, int, const cuComplex *, int,
-      cuComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnCpotrs");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, nrhs, A, lda, B, ldb, devInfo);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZpotrs(cusolverDnHandle_t handle,
-                                              cublasFillMode_t uplo, int n,
-                                              int nrhs,
-                                              const cuDoubleComplex *A, int lda,
-                                              cuDoubleComplex *B, int ldb,
-                                              int *devInfo) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, int, const cuDoubleComplex *,
-      int, cuDoubleComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZpotrs");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, nrhs, A, lda, B, ldb, devInfo);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSpotrfBatched(cusolverDnHandle_t handle,
-                                                     cublasFillMode_t uplo,
-                                                     int n, float *Aarray[],
-                                                     int lda, int *infoArray,
-                                                     int batchSize) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, float *[], int, int *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSpotrfBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, Aarray, lda, infoArray, batchSize);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDpotrfBatched(cusolverDnHandle_t handle,
-                                                     cublasFillMode_t uplo,
-                                                     int n, double *Aarray[],
-                                                     int lda, int *infoArray,
-                                                     int batchSize) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, double *[], int, int *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDpotrfBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, Aarray, lda, infoArray, batchSize);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnCpotrfBatched(cusolverDnHandle_t handle,
-                                                     cublasFillMode_t uplo,
-                                                     int n, cuComplex *Aarray[],
-                                                     int lda, int *infoArray,
-                                                     int batchSize) {
-  using FuncPtr =
-      cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, cublasFillMode_t, int,
-                                      cuComplex *[], int, int *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnCpotrfBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, Aarray, lda, infoArray, batchSize);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZpotrfBatched(
-    cusolverDnHandle_t handle, cublasFillMode_t uplo, int n,
-    cuDoubleComplex *Aarray[], int lda, int *infoArray, int batchSize) {
-  using FuncPtr =
-      cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, cublasFillMode_t, int,
-                                      cuDoubleComplex *[], int, int *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZpotrfBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, Aarray, lda, infoArray, batchSize);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSpotrsBatched(
-    cusolverDnHandle_t handle, cublasFillMode_t uplo, int n,
-    int nrhs, /* only support rhs = 1*/
-    float *A[], int lda, float *B[], int ldb, int *d_info, int batchSize) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, int, float *[], int, float *[],
-      int, int *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSpotrsBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, nrhs, A, lda, B, ldb, d_info, batchSize);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDpotrsBatched(
-    cusolverDnHandle_t handle, cublasFillMode_t uplo, int n,
-    int nrhs, /* only support rhs = 1*/
-    double *A[], int lda, double *B[], int ldb, int *d_info, int batchSize) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, int, double *[], int,
-      double *[], int, int *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDpotrsBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, nrhs, A, lda, B, ldb, d_info, batchSize);
-}
-
-cusolverStatus_t CUSOLVERAPI
-cusolverDnCpotrsBatched(cusolverDnHandle_t handle, cublasFillMode_t uplo, int n,
-                        int nrhs, /* only support rhs = 1*/
-                        cuComplex *A[], int lda, cuComplex *B[], int ldb,
-                        int *d_info, int batchSize) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, int, cuComplex *[], int,
-      cuComplex *[], int, int *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnCpotrsBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, nrhs, A, lda, B, ldb, d_info, batchSize);
-}
-
-cusolverStatus_t CUSOLVERAPI
-cusolverDnZpotrsBatched(cusolverDnHandle_t handle, cublasFillMode_t uplo, int n,
-                        int nrhs, /* only support rhs = 1*/
-                        cuDoubleComplex *A[], int lda, cuDoubleComplex *B[],
-                        int ldb, int *d_info, int batchSize) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, int, cuDoubleComplex *[], int,
-      cuDoubleComplex *[], int, int *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZpotrsBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, nrhs, A, lda, B, ldb, d_info, batchSize);
-}
-
-cusolverStatus_t CUSOLVERAPI
-cusolverDnSpotri_bufferSize(cusolverDnHandle_t handle, cublasFillMode_t uplo,
-                            int n, float *A, int lda, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, float *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSpotri_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, A, lda, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI
-cusolverDnDpotri_bufferSize(cusolverDnHandle_t handle, cublasFillMode_t uplo,
-                            int n, double *A, int lda, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, double *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDpotri_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, A, lda, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI
-cusolverDnCpotri_bufferSize(cusolverDnHandle_t handle, cublasFillMode_t uplo,
-                            int n, cuComplex *A, int lda, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, cuComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnCpotri_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, A, lda, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI
-cusolverDnZpotri_bufferSize(cusolverDnHandle_t handle, cublasFillMode_t uplo,
-                            int n, cuDoubleComplex *A, int lda, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, cuDoubleComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZpotri_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, A, lda, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSpotri(cusolverDnHandle_t handle,
-                                              cublasFillMode_t uplo, int n,
-                                              float *A, int lda, float *work,
-                                              int lwork, int *devInfo) {
-  using FuncPtr =
-      cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, cublasFillMode_t, int,
-                                      float *, int, float *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSpotri");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, A, lda, work, lwork, devInfo);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDpotri(cusolverDnHandle_t handle,
-                                              cublasFillMode_t uplo, int n,
-                                              double *A, int lda, double *work,
-                                              int lwork, int *devInfo) {
-  using FuncPtr =
-      cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, cublasFillMode_t, int,
-                                      double *, int, double *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDpotri");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, A, lda, work, lwork, devInfo);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnCpotri(cusolverDnHandle_t handle,
-                                              cublasFillMode_t uplo, int n,
-                                              cuComplex *A, int lda,
-                                              cuComplex *work, int lwork,
-                                              int *devInfo) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, cuComplex *, int, cuComplex *,
-      int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnCpotri");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, A, lda, work, lwork, devInfo);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZpotri(cusolverDnHandle_t handle,
-                                              cublasFillMode_t uplo, int n,
-                                              cuDoubleComplex *A, int lda,
-                                              cuDoubleComplex *work, int lwork,
-                                              int *devInfo) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, cuDoubleComplex *, int,
-      cuDoubleComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZpotri");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, A, lda, work, lwork, devInfo);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnStrtri_bufferSize(
-    cusolverDnHandle_t handle, cublasFillMode_t uplo, cublasDiagType_t diag,
-    int n, float *A, int lda, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, cublasDiagType_t, int, float *, int,
-      int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnStrtri_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, diag, n, A, lda, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDtrtri_bufferSize(
-    cusolverDnHandle_t handle, cublasFillMode_t uplo, cublasDiagType_t diag,
-    int n, double *A, int lda, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, cublasDiagType_t, int, double *,
-      int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDtrtri_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, diag, n, A, lda, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnCtrtri_bufferSize(
-    cusolverDnHandle_t handle, cublasFillMode_t uplo, cublasDiagType_t diag,
-    int n, cuComplex *A, int lda, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, cublasDiagType_t, int, cuComplex *,
-      int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnCtrtri_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, diag, n, A, lda, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZtrtri_bufferSize(
-    cusolverDnHandle_t handle, cublasFillMode_t uplo, cublasDiagType_t diag,
-    int n, cuDoubleComplex *A, int lda, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, cublasDiagType_t, int,
-      cuDoubleComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZtrtri_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, diag, n, A, lda, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnStrtri(cusolverDnHandle_t handle,
-                                              cublasFillMode_t uplo,
-                                              cublasDiagType_t diag, int n,
-                                              float *A, int lda, float *work,
-                                              int lwork, int *devInfo) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, cublasDiagType_t, int, float *, int,
-      float *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnStrtri");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, diag, n, A, lda, work, lwork, devInfo);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDtrtri(cusolverDnHandle_t handle,
-                                              cublasFillMode_t uplo,
-                                              cublasDiagType_t diag, int n,
-                                              double *A, int lda, double *work,
-                                              int lwork, int *devInfo) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, cublasDiagType_t, int, double *,
-      int, double *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDtrtri");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, diag, n, A, lda, work, lwork, devInfo);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnCtrtri(
-    cusolverDnHandle_t handle, cublasFillMode_t uplo, cublasDiagType_t diag,
-    int n, cuComplex *A, int lda, cuComplex *work, int lwork, int *devInfo) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, cublasDiagType_t, int, cuComplex *,
-      int, cuComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnCtrtri");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, diag, n, A, lda, work, lwork, devInfo);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZtrtri(cusolverDnHandle_t handle,
-                                              cublasFillMode_t uplo,
-                                              cublasDiagType_t diag, int n,
-                                              cuDoubleComplex *A, int lda,
-                                              cuDoubleComplex *work, int lwork,
-                                              int *devInfo) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, cublasDiagType_t, int,
-      cuDoubleComplex *, int, cuDoubleComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZtrtri");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, diag, n, A, lda, work, lwork, devInfo);
-}
-
-cusolverStatus_t CUSOLVERAPI
-cusolverDnSlauum_bufferSize(cusolverDnHandle_t handle, cublasFillMode_t uplo,
-                            int n, float *A, int lda, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, float *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSlauum_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, A, lda, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI
-cusolverDnDlauum_bufferSize(cusolverDnHandle_t handle, cublasFillMode_t uplo,
-                            int n, double *A, int lda, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, double *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDlauum_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, A, lda, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI
-cusolverDnClauum_bufferSize(cusolverDnHandle_t handle, cublasFillMode_t uplo,
-                            int n, cuComplex *A, int lda, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, cuComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnClauum_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, A, lda, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI
-cusolverDnZlauum_bufferSize(cusolverDnHandle_t handle, cublasFillMode_t uplo,
-                            int n, cuDoubleComplex *A, int lda, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, cuDoubleComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZlauum_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, A, lda, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSlauum(cusolverDnHandle_t handle,
-                                              cublasFillMode_t uplo, int n,
-                                              float *A, int lda, float *work,
-                                              int lwork, int *devInfo) {
-  using FuncPtr =
-      cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, cublasFillMode_t, int,
-                                      float *, int, float *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSlauum");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, A, lda, work, lwork, devInfo);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDlauum(cusolverDnHandle_t handle,
-                                              cublasFillMode_t uplo, int n,
-                                              double *A, int lda, double *work,
-                                              int lwork, int *devInfo) {
-  using FuncPtr =
-      cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, cublasFillMode_t, int,
-                                      double *, int, double *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDlauum");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, A, lda, work, lwork, devInfo);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnClauum(cusolverDnHandle_t handle,
-                                              cublasFillMode_t uplo, int n,
-                                              cuComplex *A, int lda,
-                                              cuComplex *work, int lwork,
-                                              int *devInfo) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, cuComplex *, int, cuComplex *,
-      int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnClauum");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, A, lda, work, lwork, devInfo);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZlauum(cusolverDnHandle_t handle,
-                                              cublasFillMode_t uplo, int n,
-                                              cuDoubleComplex *A, int lda,
-                                              cuDoubleComplex *work, int lwork,
-                                              int *devInfo) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, cuDoubleComplex *, int,
-      cuDoubleComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZlauum");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, A, lda, work, lwork, devInfo);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSgetrf_bufferSize(
-    cusolverDnHandle_t handle, int m, int n, float *A, int lda, int *Lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, int, int,
-                                                  float *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSgetrf_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, A, lda, Lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDgetrf_bufferSize(
-    cusolverDnHandle_t handle, int m, int n, double *A, int lda, int *Lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, int, int,
-                                                  double *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDgetrf_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, A, lda, Lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI
-cusolverDnCgetrf_bufferSize(cusolverDnHandle_t handle, int m, int n,
-                            cuComplex *A, int lda, int *Lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, int, int,
-                                                  cuComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnCgetrf_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, A, lda, Lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI
-cusolverDnZgetrf_bufferSize(cusolverDnHandle_t handle, int m, int n,
-                            cuDoubleComplex *A, int lda, int *Lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, int, int, cuDoubleComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZgetrf_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, A, lda, Lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSgetrf(cusolverDnHandle_t handle, int m,
-                                              int n, float *A, int lda,
-                                              float *Workspace, int *devIpiv,
-                                              int *devInfo) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, int, int, float *, int, float *, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSgetrf");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, A, lda, Workspace, devIpiv, devInfo);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDgetrf(cusolverDnHandle_t handle, int m,
-                                              int n, double *A, int lda,
-                                              double *Workspace, int *devIpiv,
-                                              int *devInfo) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, int, int, double *, int, double *, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDgetrf");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, A, lda, Workspace, devIpiv, devInfo);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnCgetrf(cusolverDnHandle_t handle, int m,
-                                              int n, cuComplex *A, int lda,
-                                              cuComplex *Workspace,
-                                              int *devIpiv, int *devInfo) {
-  using FuncPtr =
-      cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, int, int, cuComplex *,
-                                      int, cuComplex *, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnCgetrf");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, A, lda, Workspace, devIpiv, devInfo);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZgetrf(cusolverDnHandle_t handle, int m,
-                                              int n, cuDoubleComplex *A,
-                                              int lda,
-                                              cuDoubleComplex *Workspace,
-                                              int *devIpiv, int *devInfo) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, int, int, cuDoubleComplex *, int, cuDoubleComplex *,
-      int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZgetrf");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, A, lda, Workspace, devIpiv, devInfo);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSlaswp(cusolverDnHandle_t handle, int n,
-                                              float *A, int lda, int k1, int k2,
-                                              const int *devIpiv, int incx) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, int, float *, int, int, int, const int *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSlaswp");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, A, lda, k1, k2, devIpiv, incx);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDlaswp(cusolverDnHandle_t handle, int n,
-                                              double *A, int lda, int k1,
-                                              int k2, const int *devIpiv,
-                                              int incx) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, int, double *, int, int, int, const int *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDlaswp");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, A, lda, k1, k2, devIpiv, incx);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnClaswp(cusolverDnHandle_t handle, int n,
-                                              cuComplex *A, int lda, int k1,
-                                              int k2, const int *devIpiv,
-                                              int incx) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, int, cuComplex *, int, int, int, const int *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnClaswp");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, A, lda, k1, k2, devIpiv, incx);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZlaswp(cusolverDnHandle_t handle, int n,
-                                              cuDoubleComplex *A, int lda,
-                                              int k1, int k2,
-                                              const int *devIpiv, int incx) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, int,
-                                                  cuDoubleComplex *, int, int,
-                                                  int, const int *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZlaswp");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, A, lda, k1, k2, devIpiv, incx);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSgetrs(cusolverDnHandle_t handle,
-                                              cublasOperation_t trans, int n,
-                                              int nrhs, const float *A, int lda,
-                                              const int *devIpiv, float *B,
-                                              int ldb, int *devInfo) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasOperation_t, int, int, const float *, int,
-      const int *, float *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSgetrs");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, trans, n, nrhs, A, lda, devIpiv, B, ldb, devInfo);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDgetrs(cusolverDnHandle_t handle,
-                                              cublasOperation_t trans, int n,
-                                              int nrhs, const double *A,
-                                              int lda, const int *devIpiv,
-                                              double *B, int ldb,
-                                              int *devInfo) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasOperation_t, int, int, const double *, int,
-      const int *, double *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDgetrs");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, trans, n, nrhs, A, lda, devIpiv, B, ldb, devInfo);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnCgetrs(cusolverDnHandle_t handle,
-                                              cublasOperation_t trans, int n,
-                                              int nrhs, const cuComplex *A,
-                                              int lda, const int *devIpiv,
-                                              cuComplex *B, int ldb,
-                                              int *devInfo) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasOperation_t, int, int, const cuComplex *, int,
-      const int *, cuComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnCgetrs");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, trans, n, nrhs, A, lda, devIpiv, B, ldb, devInfo);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZgetrs(
-    cusolverDnHandle_t handle, cublasOperation_t trans, int n, int nrhs,
-    const cuDoubleComplex *A, int lda, const int *devIpiv, cuDoubleComplex *B,
-    int ldb, int *devInfo) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasOperation_t, int, int, const cuDoubleComplex *,
-      int, const int *, cuDoubleComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZgetrs");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, trans, n, nrhs, A, lda, devIpiv, B, ldb, devInfo);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSgeqrf_bufferSize(
-    cusolverDnHandle_t handle, int m, int n, float *A, int lda, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, int, int,
-                                                  float *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSgeqrf_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, A, lda, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDgeqrf_bufferSize(
-    cusolverDnHandle_t handle, int m, int n, double *A, int lda, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, int, int,
-                                                  double *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDgeqrf_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, A, lda, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI
-cusolverDnCgeqrf_bufferSize(cusolverDnHandle_t handle, int m, int n,
-                            cuComplex *A, int lda, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, int, int,
-                                                  cuComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnCgeqrf_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, A, lda, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI
-cusolverDnZgeqrf_bufferSize(cusolverDnHandle_t handle, int m, int n,
-                            cuDoubleComplex *A, int lda, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, int, int, cuDoubleComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZgeqrf_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, A, lda, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSgeqrf(cusolverDnHandle_t handle, int m,
-                                              int n, float *A, int lda,
-                                              float *TAU, float *Workspace,
-                                              int Lwork, int *devInfo) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, int, int, float *, int, float *, float *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSgeqrf");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, A, lda, TAU, Workspace, Lwork, devInfo);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDgeqrf(cusolverDnHandle_t handle, int m,
-                                              int n, double *A, int lda,
-                                              double *TAU, double *Workspace,
-                                              int Lwork, int *devInfo) {
-  using FuncPtr =
-      cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, int, int, double *,
-                                      int, double *, double *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDgeqrf");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, A, lda, TAU, Workspace, Lwork, devInfo);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnCgeqrf(cusolverDnHandle_t handle, int m,
-                                              int n, cuComplex *A, int lda,
-                                              cuComplex *TAU,
-                                              cuComplex *Workspace, int Lwork,
-                                              int *devInfo) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, int, int,
-                                                  cuComplex *, int, cuComplex *,
-                                                  cuComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnCgeqrf");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, A, lda, TAU, Workspace, Lwork, devInfo);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZgeqrf(cusolverDnHandle_t handle, int m,
-                                              int n, cuDoubleComplex *A,
-                                              int lda, cuDoubleComplex *TAU,
-                                              cuDoubleComplex *Workspace,
-                                              int Lwork, int *devInfo) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, int, int, cuDoubleComplex *, int, cuDoubleComplex *,
-      cuDoubleComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZgeqrf");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, A, lda, TAU, Workspace, Lwork, devInfo);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSorgqr_bufferSize(
-    cusolverDnHandle_t handle, int m, int n, int k, const float *A, int lda,
-    const float *tau, int *lwork) {
-  using FuncPtr =
-      cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, int, int, int,
-                                      const float *, int, const float *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSorgqr_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, k, A, lda, tau, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDorgqr_bufferSize(
-    cusolverDnHandle_t handle, int m, int n, int k, const double *A, int lda,
-    const double *tau, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, int, int,
-                                                  int, const double *, int,
-                                                  const double *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDorgqr_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, k, A, lda, tau, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnCungqr_bufferSize(
-    cusolverDnHandle_t handle, int m, int n, int k, const cuComplex *A, int lda,
-    const cuComplex *tau, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, int, int,
-                                                  int, const cuComplex *, int,
-                                                  const cuComplex *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnCungqr_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, k, A, lda, tau, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZungqr_bufferSize(
-    cusolverDnHandle_t handle, int m, int n, int k, const cuDoubleComplex *A,
-    int lda, const cuDoubleComplex *tau, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, int, int, int, const cuDoubleComplex *, int,
-      const cuDoubleComplex *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZungqr_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, k, A, lda, tau, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSorgqr(cusolverDnHandle_t handle, int m,
-                                              int n, int k, float *A, int lda,
-                                              const float *tau, float *work,
-                                              int lwork, int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, int, int, int, float *, int, const float *, float *,
-      int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSorgqr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, k, A, lda, tau, work, lwork, info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDorgqr(cusolverDnHandle_t handle, int m,
-                                              int n, int k, double *A, int lda,
-                                              const double *tau, double *work,
-                                              int lwork, int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, int, int, int, double *, int, const double *,
-      double *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDorgqr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, k, A, lda, tau, work, lwork, info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnCungqr(cusolverDnHandle_t handle, int m,
-                                              int n, int k, cuComplex *A,
-                                              int lda, const cuComplex *tau,
-                                              cuComplex *work, int lwork,
-                                              int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, int, int, int, cuComplex *, int, const cuComplex *,
-      cuComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnCungqr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, k, A, lda, tau, work, lwork, info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZungqr(
-    cusolverDnHandle_t handle, int m, int n, int k, cuDoubleComplex *A, int lda,
-    const cuDoubleComplex *tau, cuDoubleComplex *work, int lwork, int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, int, int, int, cuDoubleComplex *, int,
-      const cuDoubleComplex *, cuDoubleComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZungqr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, k, A, lda, tau, work, lwork, info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSormqr_bufferSize(
-    cusolverDnHandle_t handle, cublasSideMode_t side, cublasOperation_t trans,
-    int m, int n, int k, const float *A, int lda, const float *tau,
-    const float *C, int ldc, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasSideMode_t, cublasOperation_t, int, int, int,
-      const float *, int, const float *, const float *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSormqr_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, side, trans, m, n, k, A, lda, tau, C, ldc, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDormqr_bufferSize(
-    cusolverDnHandle_t handle, cublasSideMode_t side, cublasOperation_t trans,
-    int m, int n, int k, const double *A, int lda, const double *tau,
-    const double *C, int ldc, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasSideMode_t, cublasOperation_t, int, int, int,
-      const double *, int, const double *, const double *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDormqr_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, side, trans, m, n, k, A, lda, tau, C, ldc, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnCunmqr_bufferSize(
-    cusolverDnHandle_t handle, cublasSideMode_t side, cublasOperation_t trans,
-    int m, int n, int k, const cuComplex *A, int lda, const cuComplex *tau,
-    const cuComplex *C, int ldc, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasSideMode_t, cublasOperation_t, int, int, int,
-      const cuComplex *, int, const cuComplex *, const cuComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnCunmqr_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, side, trans, m, n, k, A, lda, tau, C, ldc, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZunmqr_bufferSize(
-    cusolverDnHandle_t handle, cublasSideMode_t side, cublasOperation_t trans,
-    int m, int n, int k, const cuDoubleComplex *A, int lda,
-    const cuDoubleComplex *tau, const cuDoubleComplex *C, int ldc, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasSideMode_t, cublasOperation_t, int, int, int,
-      const cuDoubleComplex *, int, const cuDoubleComplex *,
-      const cuDoubleComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZunmqr_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, side, trans, m, n, k, A, lda, tau, C, ldc, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSormqr(
-    cusolverDnHandle_t handle, cublasSideMode_t side, cublasOperation_t trans,
-    int m, int n, int k, const float *A, int lda, const float *tau, float *C,
-    int ldc, float *work, int lwork, int *devInfo) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasSideMode_t, cublasOperation_t, int, int, int,
-      const float *, int, const float *, float *, int, float *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSormqr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, side, trans, m, n, k, A, lda, tau, C, ldc, work,
-                  lwork, devInfo);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDormqr(
-    cusolverDnHandle_t handle, cublasSideMode_t side, cublasOperation_t trans,
-    int m, int n, int k, const double *A, int lda, const double *tau, double *C,
-    int ldc, double *work, int lwork, int *devInfo) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasSideMode_t, cublasOperation_t, int, int, int,
-      const double *, int, const double *, double *, int, double *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDormqr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, side, trans, m, n, k, A, lda, tau, C, ldc, work,
-                  lwork, devInfo);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnCunmqr(
-    cusolverDnHandle_t handle, cublasSideMode_t side, cublasOperation_t trans,
-    int m, int n, int k, const cuComplex *A, int lda, const cuComplex *tau,
-    cuComplex *C, int ldc, cuComplex *work, int lwork, int *devInfo) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasSideMode_t, cublasOperation_t, int, int, int,
-      const cuComplex *, int, const cuComplex *, cuComplex *, int, cuComplex *,
-      int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnCunmqr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, side, trans, m, n, k, A, lda, tau, C, ldc, work,
-                  lwork, devInfo);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZunmqr(
-    cusolverDnHandle_t handle, cublasSideMode_t side, cublasOperation_t trans,
-    int m, int n, int k, const cuDoubleComplex *A, int lda,
-    const cuDoubleComplex *tau, cuDoubleComplex *C, int ldc,
-    cuDoubleComplex *work, int lwork, int *devInfo) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasSideMode_t, cublasOperation_t, int, int, int,
-      const cuDoubleComplex *, int, const cuDoubleComplex *, cuDoubleComplex *,
-      int, cuDoubleComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZunmqr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, side, trans, m, n, k, A, lda, tau, C, ldc, work,
-                  lwork, devInfo);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSsytrf_bufferSize(
-    cusolverDnHandle_t handle, int n, float *A, int lda, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, int,
-                                                  float *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSsytrf_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, A, lda, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDsytrf_bufferSize(
-    cusolverDnHandle_t handle, int n, double *A, int lda, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, int,
-                                                  double *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDsytrf_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, A, lda, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnCsytrf_bufferSize(
-    cusolverDnHandle_t handle, int n, cuComplex *A, int lda, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, int,
-                                                  cuComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnCsytrf_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, A, lda, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZsytrf_bufferSize(
-    cusolverDnHandle_t handle, int n, cuDoubleComplex *A, int lda, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, int, cuDoubleComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZsytrf_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, A, lda, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSsytrf(cusolverDnHandle_t handle,
-                                              cublasFillMode_t uplo, int n,
-                                              float *A, int lda, int *ipiv,
-                                              float *work, int lwork,
-                                              int *info) {
-  using FuncPtr =
-      cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, cublasFillMode_t, int,
-                                      float *, int, int *, float *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSsytrf");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, A, lda, ipiv, work, lwork, info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDsytrf(cusolverDnHandle_t handle,
-                                              cublasFillMode_t uplo, int n,
-                                              double *A, int lda, int *ipiv,
-                                              double *work, int lwork,
-                                              int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, double *, int, int *, double *,
-      int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDsytrf");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, A, lda, ipiv, work, lwork, info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnCsytrf(cusolverDnHandle_t handle,
-                                              cublasFillMode_t uplo, int n,
-                                              cuComplex *A, int lda, int *ipiv,
-                                              cuComplex *work, int lwork,
-                                              int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, cuComplex *, int, int *,
-      cuComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnCsytrf");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, A, lda, ipiv, work, lwork, info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZsytrf(cusolverDnHandle_t handle,
-                                              cublasFillMode_t uplo, int n,
-                                              cuDoubleComplex *A, int lda,
-                                              int *ipiv, cuDoubleComplex *work,
-                                              int lwork, int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, cuDoubleComplex *, int, int *,
-      cuDoubleComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZsytrf");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, A, lda, ipiv, work, lwork, info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSsytrs_bufferSize(
-    cusolverDnHandle_t handle, cublasFillMode_t uplo, int n, int nrhs,
-    const float *A, int lda, const int *ipiv, float *B, int ldb, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, int, const float *, int,
-      const int *, float *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSsytrs_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, nrhs, A, lda, ipiv, B, ldb, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDsytrs_bufferSize(
-    cusolverDnHandle_t handle, cublasFillMode_t uplo, int n, int nrhs,
-    const double *A, int lda, const int *ipiv, double *B, int ldb, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, int, const double *, int,
-      const int *, double *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDsytrs_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, nrhs, A, lda, ipiv, B, ldb, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnCsytrs_bufferSize(
-    cusolverDnHandle_t handle, cublasFillMode_t uplo, int n, int nrhs,
-    const cuComplex *A, int lda, const int *ipiv, cuComplex *B, int ldb,
-    int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, int, const cuComplex *, int,
-      const int *, cuComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnCsytrs_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, nrhs, A, lda, ipiv, B, ldb, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZsytrs_bufferSize(
-    cusolverDnHandle_t handle, cublasFillMode_t uplo, int n, int nrhs,
-    const cuDoubleComplex *A, int lda, const int *ipiv, cuDoubleComplex *B,
-    int ldb, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, int, const cuDoubleComplex *,
-      int, const int *, cuDoubleComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZsytrs_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, nrhs, A, lda, ipiv, B, ldb, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSsytrs(cusolverDnHandle_t handle,
-                                              cublasFillMode_t uplo, int n,
-                                              int nrhs, const float *A, int lda,
-                                              const int *ipiv, float *B,
-                                              int ldb, float *work, int lwork,
-                                              int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, int, const float *, int,
-      const int *, float *, int, float *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSsytrs");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, nrhs, A, lda, ipiv, B, ldb, work, lwork,
-                  info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDsytrs(cusolverDnHandle_t handle,
-                                              cublasFillMode_t uplo, int n,
-                                              int nrhs, const double *A,
-                                              int lda, const int *ipiv,
-                                              double *B, int ldb, double *work,
-                                              int lwork, int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, int, const double *, int,
-      const int *, double *, int, double *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDsytrs");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, nrhs, A, lda, ipiv, B, ldb, work, lwork,
-                  info);
-}
-
-cusolverStatus_t CUSOLVERAPI
-cusolverDnCsytrs(cusolverDnHandle_t handle, cublasFillMode_t uplo, int n,
-                 int nrhs, const cuComplex *A, int lda, const int *ipiv,
-                 cuComplex *B, int ldb, cuComplex *work, int lwork, int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, int, const cuComplex *, int,
-      const int *, cuComplex *, int, cuComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnCsytrs");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, nrhs, A, lda, ipiv, B, ldb, work, lwork,
-                  info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZsytrs(
-    cusolverDnHandle_t handle, cublasFillMode_t uplo, int n, int nrhs,
-    const cuDoubleComplex *A, int lda, const int *ipiv, cuDoubleComplex *B,
-    int ldb, cuDoubleComplex *work, int lwork, int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, int, const cuDoubleComplex *,
-      int, const int *, cuDoubleComplex *, int, cuDoubleComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZsytrs");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, nrhs, A, lda, ipiv, B, ldb, work, lwork,
-                  info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSsytri_bufferSize(
-    cusolverDnHandle_t handle, cublasFillMode_t uplo, int n, float *A, int lda,
-    const int *ipiv, int *lwork) {
-  using FuncPtr =
-      cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, cublasFillMode_t, int,
-                                      float *, int, const int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSsytri_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, A, lda, ipiv, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDsytri_bufferSize(
-    cusolverDnHandle_t handle, cublasFillMode_t uplo, int n, double *A, int lda,
-    const int *ipiv, int *lwork) {
-  using FuncPtr =
-      cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, cublasFillMode_t, int,
-                                      double *, int, const int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDsytri_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, A, lda, ipiv, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnCsytri_bufferSize(
-    cusolverDnHandle_t handle, cublasFillMode_t uplo, int n, cuComplex *A,
-    int lda, const int *ipiv, int *lwork) {
-  using FuncPtr =
-      cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, cublasFillMode_t, int,
-                                      cuComplex *, int, const int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnCsytri_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, A, lda, ipiv, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZsytri_bufferSize(
-    cusolverDnHandle_t handle, cublasFillMode_t uplo, int n, cuDoubleComplex *A,
-    int lda, const int *ipiv, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, cuDoubleComplex *, int,
-      const int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZsytri_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, A, lda, ipiv, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSsytri(cusolverDnHandle_t handle,
-                                              cublasFillMode_t uplo, int n,
-                                              float *A, int lda,
-                                              const int *ipiv, float *work,
-                                              int lwork, int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, float *, int, const int *,
-      float *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSsytri");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, A, lda, ipiv, work, lwork, info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDsytri(cusolverDnHandle_t handle,
-                                              cublasFillMode_t uplo, int n,
-                                              double *A, int lda,
-                                              const int *ipiv, double *work,
-                                              int lwork, int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, double *, int, const int *,
-      double *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDsytri");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, A, lda, ipiv, work, lwork, info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnCsytri(cusolverDnHandle_t handle,
-                                              cublasFillMode_t uplo, int n,
-                                              cuComplex *A, int lda,
-                                              const int *ipiv, cuComplex *work,
-                                              int lwork, int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, cuComplex *, int, const int *,
-      cuComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnCsytri");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, A, lda, ipiv, work, lwork, info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZsytri(
-    cusolverDnHandle_t handle, cublasFillMode_t uplo, int n, cuDoubleComplex *A,
-    int lda, const int *ipiv, cuDoubleComplex *work, int lwork, int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, cuDoubleComplex *, int,
-      const int *, cuDoubleComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZsytri");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, A, lda, ipiv, work, lwork, info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSgebrd_bufferSize(
-    cusolverDnHandle_t handle, int m, int n, int *Lwork) {
-  using FuncPtr =
-      cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, int, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSgebrd_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, Lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDgebrd_bufferSize(
-    cusolverDnHandle_t handle, int m, int n, int *Lwork) {
-  using FuncPtr =
-      cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, int, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDgebrd_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, Lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnCgebrd_bufferSize(
-    cusolverDnHandle_t handle, int m, int n, int *Lwork) {
-  using FuncPtr =
-      cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, int, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnCgebrd_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, Lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZgebrd_bufferSize(
-    cusolverDnHandle_t handle, int m, int n, int *Lwork) {
-  using FuncPtr =
-      cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, int, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZgebrd_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, Lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSgebrd(cusolverDnHandle_t handle, int m,
-                                              int n, float *A, int lda,
-                                              float *D, float *E, float *TAUQ,
-                                              float *TAUP, float *Work,
-                                              int Lwork, int *devInfo) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, int, int, float *, int, float *, float *, float *,
-      float *, float *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSgebrd");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, A, lda, D, E, TAUQ, TAUP, Work, Lwork, devInfo);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDgebrd(cusolverDnHandle_t handle, int m,
-                                              int n, double *A, int lda,
-                                              double *D, double *E,
-                                              double *TAUQ, double *TAUP,
-                                              double *Work, int Lwork,
-                                              int *devInfo) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, int, int, double *, int, double *, double *, double *,
-      double *, double *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDgebrd");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, A, lda, D, E, TAUQ, TAUP, Work, Lwork, devInfo);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnCgebrd(cusolverDnHandle_t handle, int m,
-                                              int n, cuComplex *A, int lda,
-                                              float *D, float *E,
-                                              cuComplex *TAUQ, cuComplex *TAUP,
-                                              cuComplex *Work, int Lwork,
-                                              int *devInfo) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, int, int, cuComplex *, int, float *, float *,
-      cuComplex *, cuComplex *, cuComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnCgebrd");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, A, lda, D, E, TAUQ, TAUP, Work, Lwork, devInfo);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZgebrd(
-    cusolverDnHandle_t handle, int m, int n, cuDoubleComplex *A, int lda,
-    double *D, double *E, cuDoubleComplex *TAUQ, cuDoubleComplex *TAUP,
-    cuDoubleComplex *Work, int Lwork, int *devInfo) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, int, int, cuDoubleComplex *, int, double *, double *,
-      cuDoubleComplex *, cuDoubleComplex *, cuDoubleComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZgebrd");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, A, lda, D, E, TAUQ, TAUP, Work, Lwork, devInfo);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSorgbr_bufferSize(
-    cusolverDnHandle_t handle, cublasSideMode_t side, int m, int n, int k,
-    const float *A, int lda, const float *tau, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasSideMode_t, int, int, int, const float *, int,
-      const float *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSorgbr_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, side, m, n, k, A, lda, tau, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDorgbr_bufferSize(
-    cusolverDnHandle_t handle, cublasSideMode_t side, int m, int n, int k,
-    const double *A, int lda, const double *tau, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasSideMode_t, int, int, int, const double *, int,
-      const double *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDorgbr_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, side, m, n, k, A, lda, tau, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnCungbr_bufferSize(
-    cusolverDnHandle_t handle, cublasSideMode_t side, int m, int n, int k,
-    const cuComplex *A, int lda, const cuComplex *tau, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasSideMode_t, int, int, int, const cuComplex *,
-      int, const cuComplex *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnCungbr_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, side, m, n, k, A, lda, tau, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZungbr_bufferSize(
-    cusolverDnHandle_t handle, cublasSideMode_t side, int m, int n, int k,
-    const cuDoubleComplex *A, int lda, const cuDoubleComplex *tau, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasSideMode_t, int, int, int,
-      const cuDoubleComplex *, int, const cuDoubleComplex *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZungbr_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, side, m, n, k, A, lda, tau, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSorgbr(cusolverDnHandle_t handle,
-                                              cublasSideMode_t side, int m,
-                                              int n, int k, float *A, int lda,
-                                              const float *tau, float *work,
-                                              int lwork, int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasSideMode_t, int, int, int, float *, int,
-      const float *, float *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSorgbr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, side, m, n, k, A, lda, tau, work, lwork, info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDorgbr(cusolverDnHandle_t handle,
-                                              cublasSideMode_t side, int m,
-                                              int n, int k, double *A, int lda,
-                                              const double *tau, double *work,
-                                              int lwork, int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasSideMode_t, int, int, int, double *, int,
-      const double *, double *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDorgbr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, side, m, n, k, A, lda, tau, work, lwork, info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnCungbr(cusolverDnHandle_t handle,
-                                              cublasSideMode_t side, int m,
-                                              int n, int k, cuComplex *A,
-                                              int lda, const cuComplex *tau,
-                                              cuComplex *work, int lwork,
-                                              int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasSideMode_t, int, int, int, cuComplex *, int,
-      const cuComplex *, cuComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnCungbr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, side, m, n, k, A, lda, tau, work, lwork, info);
-}
-
-cusolverStatus_t CUSOLVERAPI
-cusolverDnZungbr(cusolverDnHandle_t handle, cublasSideMode_t side, int m, int n,
-                 int k, cuDoubleComplex *A, int lda, const cuDoubleComplex *tau,
-                 cuDoubleComplex *work, int lwork, int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasSideMode_t, int, int, int, cuDoubleComplex *,
-      int, const cuDoubleComplex *, cuDoubleComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZungbr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, side, m, n, k, A, lda, tau, work, lwork, info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSsytrd_bufferSize(
-    cusolverDnHandle_t handle, cublasFillMode_t uplo, int n, const float *A,
-    int lda, const float *d, const float *e, const float *tau, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, const float *, int,
-      const float *, const float *, const float *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSsytrd_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, A, lda, d, e, tau, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDsytrd_bufferSize(
-    cusolverDnHandle_t handle, cublasFillMode_t uplo, int n, const double *A,
-    int lda, const double *d, const double *e, const double *tau, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, const double *, int,
-      const double *, const double *, const double *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDsytrd_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, A, lda, d, e, tau, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnChetrd_bufferSize(
-    cusolverDnHandle_t handle, cublasFillMode_t uplo, int n, const cuComplex *A,
-    int lda, const float *d, const float *e, const cuComplex *tau, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, const cuComplex *, int,
-      const float *, const float *, const cuComplex *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnChetrd_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, A, lda, d, e, tau, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZhetrd_bufferSize(
-    cusolverDnHandle_t handle, cublasFillMode_t uplo, int n,
-    const cuDoubleComplex *A, int lda, const double *d, const double *e,
-    const cuDoubleComplex *tau, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, const cuDoubleComplex *, int,
-      const double *, const double *, const cuDoubleComplex *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZhetrd_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, A, lda, d, e, tau, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSsytrd(cusolverDnHandle_t handle,
-                                              cublasFillMode_t uplo, int n,
-                                              float *A, int lda, float *d,
-                                              float *e, float *tau, float *work,
-                                              int lwork, int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, float *, int, float *, float *,
-      float *, float *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSsytrd");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, A, lda, d, e, tau, work, lwork, info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDsytrd(
-    cusolverDnHandle_t handle, cublasFillMode_t uplo, int n, double *A, int lda,
-    double *d, double *e, double *tau, double *work, int lwork, int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, double *, int, double *,
-      double *, double *, double *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDsytrd");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, A, lda, d, e, tau, work, lwork, info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnChetrd(cusolverDnHandle_t handle,
-                                              cublasFillMode_t uplo, int n,
-                                              cuComplex *A, int lda, float *d,
-                                              float *e, cuComplex *tau,
-                                              cuComplex *work, int lwork,
-                                              int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, cuComplex *, int, float *,
-      float *, cuComplex *, cuComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnChetrd");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, A, lda, d, e, tau, work, lwork, info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZhetrd(
-    cusolverDnHandle_t handle, cublasFillMode_t uplo, int n, cuDoubleComplex *A,
-    int lda, double *d, double *e, cuDoubleComplex *tau, cuDoubleComplex *work,
-    int lwork, int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, cuDoubleComplex *, int,
-      double *, double *, cuDoubleComplex *, cuDoubleComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZhetrd");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, A, lda, d, e, tau, work, lwork, info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSorgtr_bufferSize(
-    cusolverDnHandle_t handle, cublasFillMode_t uplo, int n, const float *A,
-    int lda, const float *tau, int *lwork) {
-  using FuncPtr =
-      cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, cublasFillMode_t, int,
-                                      const float *, int, const float *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSorgtr_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, A, lda, tau, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDorgtr_bufferSize(
-    cusolverDnHandle_t handle, cublasFillMode_t uplo, int n, const double *A,
-    int lda, const double *tau, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, const double *, int,
-      const double *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDorgtr_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, A, lda, tau, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnCungtr_bufferSize(
-    cusolverDnHandle_t handle, cublasFillMode_t uplo, int n, const cuComplex *A,
-    int lda, const cuComplex *tau, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, const cuComplex *, int,
-      const cuComplex *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnCungtr_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, A, lda, tau, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZungtr_bufferSize(
-    cusolverDnHandle_t handle, cublasFillMode_t uplo, int n,
-    const cuDoubleComplex *A, int lda, const cuDoubleComplex *tau, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, const cuDoubleComplex *, int,
-      const cuDoubleComplex *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZungtr_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, A, lda, tau, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSorgtr(cusolverDnHandle_t handle,
-                                              cublasFillMode_t uplo, int n,
-                                              float *A, int lda,
-                                              const float *tau, float *work,
-                                              int lwork, int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, float *, int, const float *,
-      float *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSorgtr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, A, lda, tau, work, lwork, info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDorgtr(cusolverDnHandle_t handle,
-                                              cublasFillMode_t uplo, int n,
-                                              double *A, int lda,
-                                              const double *tau, double *work,
-                                              int lwork, int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, double *, int, const double *,
-      double *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDorgtr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, A, lda, tau, work, lwork, info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnCungtr(
-    cusolverDnHandle_t handle, cublasFillMode_t uplo, int n, cuComplex *A,
-    int lda, const cuComplex *tau, cuComplex *work, int lwork, int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, cuComplex *, int,
-      const cuComplex *, cuComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnCungtr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, A, lda, tau, work, lwork, info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZungtr(cusolverDnHandle_t handle,
-                                              cublasFillMode_t uplo, int n,
-                                              cuDoubleComplex *A, int lda,
-                                              const cuDoubleComplex *tau,
-                                              cuDoubleComplex *work, int lwork,
-                                              int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, cuDoubleComplex *, int,
-      const cuDoubleComplex *, cuDoubleComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZungtr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, A, lda, tau, work, lwork, info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSormtr_bufferSize(
-    cusolverDnHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo,
-    cublasOperation_t trans, int m, int n, const float *A, int lda,
-    const float *tau, const float *C, int ldc, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasSideMode_t, cublasFillMode_t, cublasOperation_t,
-      int, int, const float *, int, const float *, const float *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSormtr_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, side, uplo, trans, m, n, A, lda, tau, C, ldc, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDormtr_bufferSize(
-    cusolverDnHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo,
-    cublasOperation_t trans, int m, int n, const double *A, int lda,
-    const double *tau, const double *C, int ldc, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasSideMode_t, cublasFillMode_t, cublasOperation_t,
-      int, int, const double *, int, const double *, const double *, int,
-      int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDormtr_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, side, uplo, trans, m, n, A, lda, tau, C, ldc, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnCunmtr_bufferSize(
-    cusolverDnHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo,
-    cublasOperation_t trans, int m, int n, const cuComplex *A, int lda,
-    const cuComplex *tau, const cuComplex *C, int ldc, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasSideMode_t, cublasFillMode_t, cublasOperation_t,
-      int, int, const cuComplex *, int, const cuComplex *, const cuComplex *,
-      int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnCunmtr_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, side, uplo, trans, m, n, A, lda, tau, C, ldc, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZunmtr_bufferSize(
-    cusolverDnHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo,
-    cublasOperation_t trans, int m, int n, const cuDoubleComplex *A, int lda,
-    const cuDoubleComplex *tau, const cuDoubleComplex *C, int ldc, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasSideMode_t, cublasFillMode_t, cublasOperation_t,
-      int, int, const cuDoubleComplex *, int, const cuDoubleComplex *,
-      const cuDoubleComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZunmtr_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, side, uplo, trans, m, n, A, lda, tau, C, ldc, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSormtr(
-    cusolverDnHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo,
-    cublasOperation_t trans, int m, int n, float *A, int lda, float *tau,
-    float *C, int ldc, float *work, int lwork, int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasSideMode_t, cublasFillMode_t, cublasOperation_t,
-      int, int, float *, int, float *, float *, int, float *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSormtr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, side, uplo, trans, m, n, A, lda, tau, C, ldc, work,
-                  lwork, info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDormtr(
-    cusolverDnHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo,
-    cublasOperation_t trans, int m, int n, double *A, int lda, double *tau,
-    double *C, int ldc, double *work, int lwork, int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasSideMode_t, cublasFillMode_t, cublasOperation_t,
-      int, int, double *, int, double *, double *, int, double *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDormtr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, side, uplo, trans, m, n, A, lda, tau, C, ldc, work,
-                  lwork, info);
-}
-
-cusolverStatus_t CUSOLVERAPI
-cusolverDnCunmtr(cusolverDnHandle_t handle, cublasSideMode_t side,
-                 cublasFillMode_t uplo, cublasOperation_t trans, int m, int n,
-                 cuComplex *A, int lda, cuComplex *tau, cuComplex *C, int ldc,
-                 cuComplex *work, int lwork, int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasSideMode_t, cublasFillMode_t, cublasOperation_t,
-      int, int, cuComplex *, int, cuComplex *, cuComplex *, int, cuComplex *,
-      int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnCunmtr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, side, uplo, trans, m, n, A, lda, tau, C, ldc, work,
-                  lwork, info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZunmtr(
-    cusolverDnHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo,
-    cublasOperation_t trans, int m, int n, cuDoubleComplex *A, int lda,
-    cuDoubleComplex *tau, cuDoubleComplex *C, int ldc, cuDoubleComplex *work,
-    int lwork, int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasSideMode_t, cublasFillMode_t, cublasOperation_t,
-      int, int, cuDoubleComplex *, int, cuDoubleComplex *, cuDoubleComplex *,
-      int, cuDoubleComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZunmtr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, side, uplo, trans, m, n, A, lda, tau, C, ldc, work,
-                  lwork, info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSgesvd_bufferSize(
-    cusolverDnHandle_t handle, int m, int n, int *lwork) {
-  using FuncPtr =
-      cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, int, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSgesvd_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDgesvd_bufferSize(
-    cusolverDnHandle_t handle, int m, int n, int *lwork) {
-  using FuncPtr =
-      cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, int, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDgesvd_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnCgesvd_bufferSize(
-    cusolverDnHandle_t handle, int m, int n, int *lwork) {
-  using FuncPtr =
-      cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, int, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnCgesvd_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZgesvd_bufferSize(
-    cusolverDnHandle_t handle, int m, int n, int *lwork) {
-  using FuncPtr =
-      cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, int, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZgesvd_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSgesvd(
-    cusolverDnHandle_t handle, signed char jobu, signed char jobvt, int m,
-    int n, float *A, int lda, float *S, float *U, int ldu, float *VT, int ldvt,
-    float *work, int lwork, float *rwork, int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, signed char, signed char, int, int, float *, int,
-      float *, float *, int, float *, int, float *, int, float *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSgesvd");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobu, jobvt, m, n, A, lda, S, U, ldu, VT, ldvt, work,
-                  lwork, rwork, info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDgesvd(
-    cusolverDnHandle_t handle, signed char jobu, signed char jobvt, int m,
-    int n, double *A, int lda, double *S, double *U, int ldu, double *VT,
-    int ldvt, double *work, int lwork, double *rwork, int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, signed char, signed char, int, int, double *, int,
-      double *, double *, int, double *, int, double *, int, double *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDgesvd");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobu, jobvt, m, n, A, lda, S, U, ldu, VT, ldvt, work,
-                  lwork, rwork, info);
-}
-
-cusolverStatus_t CUSOLVERAPI
-cusolverDnCgesvd(cusolverDnHandle_t handle, signed char jobu, signed char jobvt,
-                 int m, int n, cuComplex *A, int lda, float *S, cuComplex *U,
-                 int ldu, cuComplex *VT, int ldvt, cuComplex *work, int lwork,
-                 float *rwork, int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, signed char, signed char, int, int, cuComplex *, int,
-      float *, cuComplex *, int, cuComplex *, int, cuComplex *, int, float *,
-      int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnCgesvd");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobu, jobvt, m, n, A, lda, S, U, ldu, VT, ldvt, work,
-                  lwork, rwork, info);
-}
-
-cusolverStatus_t CUSOLVERAPI
-cusolverDnZgesvd(cusolverDnHandle_t handle, signed char jobu, signed char jobvt,
-                 int m, int n, cuDoubleComplex *A, int lda, double *S,
-                 cuDoubleComplex *U, int ldu, cuDoubleComplex *VT, int ldvt,
-                 cuDoubleComplex *work, int lwork, double *rwork, int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, signed char, signed char, int, int, cuDoubleComplex *,
-      int, double *, cuDoubleComplex *, int, cuDoubleComplex *, int,
-      cuDoubleComplex *, int, double *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZgesvd");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobu, jobvt, m, n, A, lda, S, U, ldu, VT, ldvt, work,
-                  lwork, rwork, info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSsyevd_bufferSize(
-    cusolverDnHandle_t handle, cusolverEigMode_t jobz, cublasFillMode_t uplo,
-    int n, const float *A, int lda, const float *W, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, cublasFillMode_t, int,
-      const float *, int, const float *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSsyevd_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, uplo, n, A, lda, W, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDsyevd_bufferSize(
-    cusolverDnHandle_t handle, cusolverEigMode_t jobz, cublasFillMode_t uplo,
-    int n, const double *A, int lda, const double *W, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, cublasFillMode_t, int,
-      const double *, int, const double *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDsyevd_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, uplo, n, A, lda, W, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnCheevd_bufferSize(
-    cusolverDnHandle_t handle, cusolverEigMode_t jobz, cublasFillMode_t uplo,
-    int n, const cuComplex *A, int lda, const float *W, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, cublasFillMode_t, int,
-      const cuComplex *, int, const float *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnCheevd_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, uplo, n, A, lda, W, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZheevd_bufferSize(
-    cusolverDnHandle_t handle, cusolverEigMode_t jobz, cublasFillMode_t uplo,
-    int n, const cuDoubleComplex *A, int lda, const double *W, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, cublasFillMode_t, int,
-      const cuDoubleComplex *, int, const double *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZheevd_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, uplo, n, A, lda, W, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSsyevd(
-    cusolverDnHandle_t handle, cusolverEigMode_t jobz, cublasFillMode_t uplo,
-    int n, float *A, int lda, float *W, float *work, int lwork, int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, cublasFillMode_t, int, float *,
-      int, float *, float *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSsyevd");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, uplo, n, A, lda, W, work, lwork, info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDsyevd(
-    cusolverDnHandle_t handle, cusolverEigMode_t jobz, cublasFillMode_t uplo,
-    int n, double *A, int lda, double *W, double *work, int lwork, int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, cublasFillMode_t, int, double *,
-      int, double *, double *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDsyevd");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, uplo, n, A, lda, W, work, lwork, info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnCheevd(cusolverDnHandle_t handle,
-                                              cusolverEigMode_t jobz,
-                                              cublasFillMode_t uplo, int n,
-                                              cuComplex *A, int lda, float *W,
-                                              cuComplex *work, int lwork,
-                                              int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, cublasFillMode_t, int, cuComplex *,
-      int, float *, cuComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnCheevd");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, uplo, n, A, lda, W, work, lwork, info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZheevd(cusolverDnHandle_t handle,
-                                              cusolverEigMode_t jobz,
-                                              cublasFillMode_t uplo, int n,
-                                              cuDoubleComplex *A, int lda,
-                                              double *W, cuDoubleComplex *work,
-                                              int lwork, int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, cublasFillMode_t, int,
-      cuDoubleComplex *, int, double *, cuDoubleComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZheevd");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, uplo, n, A, lda, W, work, lwork, info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSsyevdx_bufferSize(
-    cusolverDnHandle_t handle, cusolverEigMode_t jobz, cusolverEigRange_t range,
-    cublasFillMode_t uplo, int n, const float *A, int lda, float vl, float vu,
-    int il, int iu, int *meig, const float *W, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, cusolverEigRange_t,
-      cublasFillMode_t, int, const float *, int, float, float, int, int, int *,
-      const float *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSsyevdx_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, range, uplo, n, A, lda, vl, vu, il, iu, meig, W,
-                  lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDsyevdx_bufferSize(
-    cusolverDnHandle_t handle, cusolverEigMode_t jobz, cusolverEigRange_t range,
-    cublasFillMode_t uplo, int n, const double *A, int lda, double vl,
-    double vu, int il, int iu, int *meig, const double *W, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, cusolverEigRange_t,
-      cublasFillMode_t, int, const double *, int, double, double, int, int,
-      int *, const double *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDsyevdx_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, range, uplo, n, A, lda, vl, vu, il, iu, meig, W,
-                  lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnCheevdx_bufferSize(
-    cusolverDnHandle_t handle, cusolverEigMode_t jobz, cusolverEigRange_t range,
-    cublasFillMode_t uplo, int n, const cuComplex *A, int lda, float vl,
-    float vu, int il, int iu, int *meig, const float *W, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, cusolverEigRange_t,
-      cublasFillMode_t, int, const cuComplex *, int, float, float, int, int,
-      int *, const float *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnCheevdx_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, range, uplo, n, A, lda, vl, vu, il, iu, meig, W,
-                  lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZheevdx_bufferSize(
-    cusolverDnHandle_t handle, cusolverEigMode_t jobz, cusolverEigRange_t range,
-    cublasFillMode_t uplo, int n, const cuDoubleComplex *A, int lda, double vl,
-    double vu, int il, int iu, int *meig, const double *W, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, cusolverEigRange_t,
-      cublasFillMode_t, int, const cuDoubleComplex *, int, double, double, int,
-      int, int *, const double *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZheevdx_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, range, uplo, n, A, lda, vl, vu, il, iu, meig, W,
-                  lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSsyevdx(
-    cusolverDnHandle_t handle, cusolverEigMode_t jobz, cusolverEigRange_t range,
-    cublasFillMode_t uplo, int n, float *A, int lda, float vl, float vu, int il,
-    int iu, int *meig, float *W, float *work, int lwork, int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, cusolverEigRange_t,
-      cublasFillMode_t, int, float *, int, float, float, int, int, int *,
-      float *, float *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSsyevdx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, range, uplo, n, A, lda, vl, vu, il, iu, meig, W,
-                  work, lwork, info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDsyevdx(
-    cusolverDnHandle_t handle, cusolverEigMode_t jobz, cusolverEigRange_t range,
-    cublasFillMode_t uplo, int n, double *A, int lda, double vl, double vu,
-    int il, int iu, int *meig, double *W, double *work, int lwork, int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, cusolverEigRange_t,
-      cublasFillMode_t, int, double *, int, double, double, int, int, int *,
-      double *, double *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDsyevdx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, range, uplo, n, A, lda, vl, vu, il, iu, meig, W,
-                  work, lwork, info);
-}
-
-cusolverStatus_t CUSOLVERAPI
-cusolverDnCheevdx(cusolverDnHandle_t handle, cusolverEigMode_t jobz,
-                  cusolverEigRange_t range, cublasFillMode_t uplo, int n,
-                  cuComplex *A, int lda, float vl, float vu, int il, int iu,
-                  int *meig, float *W, cuComplex *work, int lwork, int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, cusolverEigRange_t,
-      cublasFillMode_t, int, cuComplex *, int, float, float, int, int, int *,
-      float *, cuComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnCheevdx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, range, uplo, n, A, lda, vl, vu, il, iu, meig, W,
-                  work, lwork, info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZheevdx(
-    cusolverDnHandle_t handle, cusolverEigMode_t jobz, cusolverEigRange_t range,
-    cublasFillMode_t uplo, int n, cuDoubleComplex *A, int lda, double vl,
-    double vu, int il, int iu, int *meig, double *W, cuDoubleComplex *work,
-    int lwork, int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, cusolverEigRange_t,
-      cublasFillMode_t, int, cuDoubleComplex *, int, double, double, int, int,
-      int *, double *, cuDoubleComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZheevdx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, range, uplo, n, A, lda, vl, vu, il, iu, meig, W,
-                  work, lwork, info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSsygvdx_bufferSize(
-    cusolverDnHandle_t handle, cusolverEigType_t itype, cusolverEigMode_t jobz,
-    cusolverEigRange_t range, cublasFillMode_t uplo, int n, const float *A,
-    int lda, const float *B, int ldb, float vl, float vu, int il, int iu,
-    int *meig, const float *W, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigType_t, cusolverEigMode_t,
-      cusolverEigRange_t, cublasFillMode_t, int, const float *, int,
-      const float *, int, float, float, int, int, int *, const float *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSsygvdx_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, itype, jobz, range, uplo, n, A, lda, B, ldb, vl, vu,
-                  il, iu, meig, W, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDsygvdx_bufferSize(
-    cusolverDnHandle_t handle, cusolverEigType_t itype, cusolverEigMode_t jobz,
-    cusolverEigRange_t range, cublasFillMode_t uplo, int n, const double *A,
-    int lda, const double *B, int ldb, double vl, double vu, int il, int iu,
-    int *meig, const double *W, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigType_t, cusolverEigMode_t,
-      cusolverEigRange_t, cublasFillMode_t, int, const double *, int,
-      const double *, int, double, double, int, int, int *, const double *,
-      int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDsygvdx_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, itype, jobz, range, uplo, n, A, lda, B, ldb, vl, vu,
-                  il, iu, meig, W, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnChegvdx_bufferSize(
-    cusolverDnHandle_t handle, cusolverEigType_t itype, cusolverEigMode_t jobz,
-    cusolverEigRange_t range, cublasFillMode_t uplo, int n, const cuComplex *A,
-    int lda, const cuComplex *B, int ldb, float vl, float vu, int il, int iu,
-    int *meig, const float *W, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigType_t, cusolverEigMode_t,
-      cusolverEigRange_t, cublasFillMode_t, int, const cuComplex *, int,
-      const cuComplex *, int, float, float, int, int, int *, const float *,
-      int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnChegvdx_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, itype, jobz, range, uplo, n, A, lda, B, ldb, vl, vu,
-                  il, iu, meig, W, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZhegvdx_bufferSize(
-    cusolverDnHandle_t handle, cusolverEigType_t itype, cusolverEigMode_t jobz,
-    cusolverEigRange_t range, cublasFillMode_t uplo, int n,
-    const cuDoubleComplex *A, int lda, const cuDoubleComplex *B, int ldb,
-    double vl, double vu, int il, int iu, int *meig, const double *W,
-    int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigType_t, cusolverEigMode_t,
-      cusolverEigRange_t, cublasFillMode_t, int, const cuDoubleComplex *, int,
-      const cuDoubleComplex *, int, double, double, int, int, int *,
-      const double *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZhegvdx_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, itype, jobz, range, uplo, n, A, lda, B, ldb, vl, vu,
-                  il, iu, meig, W, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSsygvdx(
-    cusolverDnHandle_t handle, cusolverEigType_t itype, cusolverEigMode_t jobz,
-    cusolverEigRange_t range, cublasFillMode_t uplo, int n, float *A, int lda,
-    float *B, int ldb, float vl, float vu, int il, int iu, int *meig, float *W,
-    float *work, int lwork, int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigType_t, cusolverEigMode_t,
-      cusolverEigRange_t, cublasFillMode_t, int, float *, int, float *, int,
-      float, float, int, int, int *, float *, float *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSsygvdx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, itype, jobz, range, uplo, n, A, lda, B, ldb, vl, vu,
-                  il, iu, meig, W, work, lwork, info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDsygvdx(
-    cusolverDnHandle_t handle, cusolverEigType_t itype, cusolverEigMode_t jobz,
-    cusolverEigRange_t range, cublasFillMode_t uplo, int n, double *A, int lda,
-    double *B, int ldb, double vl, double vu, int il, int iu, int *meig,
-    double *W, double *work, int lwork, int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigType_t, cusolverEigMode_t,
-      cusolverEigRange_t, cublasFillMode_t, int, double *, int, double *, int,
-      double, double, int, int, int *, double *, double *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDsygvdx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, itype, jobz, range, uplo, n, A, lda, B, ldb, vl, vu,
-                  il, iu, meig, W, work, lwork, info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnChegvdx(
-    cusolverDnHandle_t handle, cusolverEigType_t itype, cusolverEigMode_t jobz,
-    cusolverEigRange_t range, cublasFillMode_t uplo, int n, cuComplex *A,
-    int lda, cuComplex *B, int ldb, float vl, float vu, int il, int iu,
-    int *meig, float *W, cuComplex *work, int lwork, int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigType_t, cusolverEigMode_t,
-      cusolverEigRange_t, cublasFillMode_t, int, cuComplex *, int, cuComplex *,
-      int, float, float, int, int, int *, float *, cuComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnChegvdx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, itype, jobz, range, uplo, n, A, lda, B, ldb, vl, vu,
-                  il, iu, meig, W, work, lwork, info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZhegvdx(
-    cusolverDnHandle_t handle, cusolverEigType_t itype, cusolverEigMode_t jobz,
-    cusolverEigRange_t range, cublasFillMode_t uplo, int n, cuDoubleComplex *A,
-    int lda, cuDoubleComplex *B, int ldb, double vl, double vu, int il, int iu,
-    int *meig, double *W, cuDoubleComplex *work, int lwork, int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigType_t, cusolverEigMode_t,
-      cusolverEigRange_t, cublasFillMode_t, int, cuDoubleComplex *, int,
-      cuDoubleComplex *, int, double, double, int, int, int *, double *,
-      cuDoubleComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZhegvdx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, itype, jobz, range, uplo, n, A, lda, B, ldb, vl, vu,
-                  il, iu, meig, W, work, lwork, info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSsygvd_bufferSize(
-    cusolverDnHandle_t handle, cusolverEigType_t itype, cusolverEigMode_t jobz,
-    cublasFillMode_t uplo, int n, const float *A, int lda, const float *B,
-    int ldb, const float *W, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigType_t, cusolverEigMode_t,
-      cublasFillMode_t, int, const float *, int, const float *, int,
-      const float *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSsygvd_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, itype, jobz, uplo, n, A, lda, B, ldb, W, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDsygvd_bufferSize(
-    cusolverDnHandle_t handle, cusolverEigType_t itype, cusolverEigMode_t jobz,
-    cublasFillMode_t uplo, int n, const double *A, int lda, const double *B,
-    int ldb, const double *W, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigType_t, cusolverEigMode_t,
-      cublasFillMode_t, int, const double *, int, const double *, int,
-      const double *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDsygvd_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, itype, jobz, uplo, n, A, lda, B, ldb, W, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnChegvd_bufferSize(
-    cusolverDnHandle_t handle, cusolverEigType_t itype, cusolverEigMode_t jobz,
-    cublasFillMode_t uplo, int n, const cuComplex *A, int lda,
-    const cuComplex *B, int ldb, const float *W, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigType_t, cusolverEigMode_t,
-      cublasFillMode_t, int, const cuComplex *, int, const cuComplex *, int,
-      const float *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnChegvd_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, itype, jobz, uplo, n, A, lda, B, ldb, W, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZhegvd_bufferSize(
-    cusolverDnHandle_t handle, cusolverEigType_t itype, cusolverEigMode_t jobz,
-    cublasFillMode_t uplo, int n, const cuDoubleComplex *A, int lda,
-    const cuDoubleComplex *B, int ldb, const double *W, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigType_t, cusolverEigMode_t,
-      cublasFillMode_t, int, const cuDoubleComplex *, int,
-      const cuDoubleComplex *, int, const double *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZhegvd_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, itype, jobz, uplo, n, A, lda, B, ldb, W, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSsygvd(
-    cusolverDnHandle_t handle, cusolverEigType_t itype, cusolverEigMode_t jobz,
-    cublasFillMode_t uplo, int n, float *A, int lda, float *B, int ldb,
-    float *W, float *work, int lwork, int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigType_t, cusolverEigMode_t,
-      cublasFillMode_t, int, float *, int, float *, int, float *, float *, int,
-      int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSsygvd");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, itype, jobz, uplo, n, A, lda, B, ldb, W, work, lwork,
-                  info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDsygvd(
-    cusolverDnHandle_t handle, cusolverEigType_t itype, cusolverEigMode_t jobz,
-    cublasFillMode_t uplo, int n, double *A, int lda, double *B, int ldb,
-    double *W, double *work, int lwork, int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigType_t, cusolverEigMode_t,
-      cublasFillMode_t, int, double *, int, double *, int, double *, double *,
-      int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDsygvd");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, itype, jobz, uplo, n, A, lda, B, ldb, W, work, lwork,
-                  info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnChegvd(
-    cusolverDnHandle_t handle, cusolverEigType_t itype, cusolverEigMode_t jobz,
-    cublasFillMode_t uplo, int n, cuComplex *A, int lda, cuComplex *B, int ldb,
-    float *W, cuComplex *work, int lwork, int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigType_t, cusolverEigMode_t,
-      cublasFillMode_t, int, cuComplex *, int, cuComplex *, int, float *,
-      cuComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnChegvd");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, itype, jobz, uplo, n, A, lda, B, ldb, W, work, lwork,
-                  info);
-}
-
-cusolverStatus_t CUSOLVERAPI
-cusolverDnZhegvd(cusolverDnHandle_t handle, cusolverEigType_t itype,
-                 cusolverEigMode_t jobz, cublasFillMode_t uplo, int n,
-                 cuDoubleComplex *A, int lda, cuDoubleComplex *B, int ldb,
-                 double *W, cuDoubleComplex *work, int lwork, int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigType_t, cusolverEigMode_t,
-      cublasFillMode_t, int, cuDoubleComplex *, int, cuDoubleComplex *, int,
-      double *, cuDoubleComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZhegvd");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, itype, jobz, uplo, n, A, lda, B, ldb, W, work, lwork,
-                  info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnCreateSyevjInfo(syevjInfo_t *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(syevjInfo_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnCreateSyevjInfo");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDestroySyevjInfo(syevjInfo_t info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(syevjInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDestroySyevjInfo");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnXsyevjSetTolerance(syevjInfo_t info,
-                                                          double tolerance) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(syevjInfo_t, double);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnXsyevjSetTolerance");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(info, tolerance);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnXsyevjSetMaxSweeps(syevjInfo_t info,
-                                                          int max_sweeps) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(syevjInfo_t, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnXsyevjSetMaxSweeps");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(info, max_sweeps);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnXsyevjSetSortEig(syevjInfo_t info,
-                                                        int sort_eig) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(syevjInfo_t, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnXsyevjSetSortEig");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(info, sort_eig);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnXsyevjGetResidual(
-    cusolverDnHandle_t handle, syevjInfo_t info, double *residual) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t,
-                                                  syevjInfo_t, double *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnXsyevjGetResidual");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, info, residual);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnXsyevjGetSweeps(
-    cusolverDnHandle_t handle, syevjInfo_t info, int *executed_sweeps) {
-  using FuncPtr =
-      cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, syevjInfo_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnXsyevjGetSweeps");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, info, executed_sweeps);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSsyevjBatched_bufferSize(
-    cusolverDnHandle_t handle, cusolverEigMode_t jobz, cublasFillMode_t uplo,
-    int n, const float *A, int lda, const float *W, int *lwork,
-    syevjInfo_t params, int batchSize) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, cublasFillMode_t, int,
-      const float *, int, const float *, int *, syevjInfo_t, int);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusolverDnSsyevjBatched_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, uplo, n, A, lda, W, lwork, params, batchSize);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDsyevjBatched_bufferSize(
-    cusolverDnHandle_t handle, cusolverEigMode_t jobz, cublasFillMode_t uplo,
-    int n, const double *A, int lda, const double *W, int *lwork,
-    syevjInfo_t params, int batchSize) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, cublasFillMode_t, int,
-      const double *, int, const double *, int *, syevjInfo_t, int);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusolverDnDsyevjBatched_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, uplo, n, A, lda, W, lwork, params, batchSize);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnCheevjBatched_bufferSize(
-    cusolverDnHandle_t handle, cusolverEigMode_t jobz, cublasFillMode_t uplo,
-    int n, const cuComplex *A, int lda, const float *W, int *lwork,
-    syevjInfo_t params, int batchSize) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, cublasFillMode_t, int,
-      const cuComplex *, int, const float *, int *, syevjInfo_t, int);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusolverDnCheevjBatched_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, uplo, n, A, lda, W, lwork, params, batchSize);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZheevjBatched_bufferSize(
-    cusolverDnHandle_t handle, cusolverEigMode_t jobz, cublasFillMode_t uplo,
-    int n, const cuDoubleComplex *A, int lda, const double *W, int *lwork,
-    syevjInfo_t params, int batchSize) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, cublasFillMode_t, int,
-      const cuDoubleComplex *, int, const double *, int *, syevjInfo_t, int);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusolverDnZheevjBatched_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, uplo, n, A, lda, W, lwork, params, batchSize);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSsyevjBatched(
-    cusolverDnHandle_t handle, cusolverEigMode_t jobz, cublasFillMode_t uplo,
-    int n, float *A, int lda, float *W, float *work, int lwork, int *info,
-    syevjInfo_t params, int batchSize) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, cublasFillMode_t, int, float *,
-      int, float *, float *, int, int *, syevjInfo_t, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSsyevjBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, uplo, n, A, lda, W, work, lwork, info, params,
-                  batchSize);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDsyevjBatched(
-    cusolverDnHandle_t handle, cusolverEigMode_t jobz, cublasFillMode_t uplo,
-    int n, double *A, int lda, double *W, double *work, int lwork, int *info,
-    syevjInfo_t params, int batchSize) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, cublasFillMode_t, int, double *,
-      int, double *, double *, int, int *, syevjInfo_t, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDsyevjBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, uplo, n, A, lda, W, work, lwork, info, params,
-                  batchSize);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnCheevjBatched(
-    cusolverDnHandle_t handle, cusolverEigMode_t jobz, cublasFillMode_t uplo,
-    int n, cuComplex *A, int lda, float *W, cuComplex *work, int lwork,
-    int *info, syevjInfo_t params, int batchSize) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, cublasFillMode_t, int, cuComplex *,
-      int, float *, cuComplex *, int, int *, syevjInfo_t, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnCheevjBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, uplo, n, A, lda, W, work, lwork, info, params,
-                  batchSize);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZheevjBatched(
-    cusolverDnHandle_t handle, cusolverEigMode_t jobz, cublasFillMode_t uplo,
-    int n, cuDoubleComplex *A, int lda, double *W, cuDoubleComplex *work,
-    int lwork, int *info, syevjInfo_t params, int batchSize) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, cublasFillMode_t, int,
-      cuDoubleComplex *, int, double *, cuDoubleComplex *, int, int *,
-      syevjInfo_t, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZheevjBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, uplo, n, A, lda, W, work, lwork, info, params,
-                  batchSize);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSsyevj_bufferSize(
-    cusolverDnHandle_t handle, cusolverEigMode_t jobz, cublasFillMode_t uplo,
-    int n, const float *A, int lda, const float *W, int *lwork,
-    syevjInfo_t params) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, cublasFillMode_t, int,
-      const float *, int, const float *, int *, syevjInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSsyevj_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, uplo, n, A, lda, W, lwork, params);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDsyevj_bufferSize(
-    cusolverDnHandle_t handle, cusolverEigMode_t jobz, cublasFillMode_t uplo,
-    int n, const double *A, int lda, const double *W, int *lwork,
-    syevjInfo_t params) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, cublasFillMode_t, int,
-      const double *, int, const double *, int *, syevjInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDsyevj_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, uplo, n, A, lda, W, lwork, params);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnCheevj_bufferSize(
-    cusolverDnHandle_t handle, cusolverEigMode_t jobz, cublasFillMode_t uplo,
-    int n, const cuComplex *A, int lda, const float *W, int *lwork,
-    syevjInfo_t params) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, cublasFillMode_t, int,
-      const cuComplex *, int, const float *, int *, syevjInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnCheevj_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, uplo, n, A, lda, W, lwork, params);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZheevj_bufferSize(
-    cusolverDnHandle_t handle, cusolverEigMode_t jobz, cublasFillMode_t uplo,
-    int n, const cuDoubleComplex *A, int lda, const double *W, int *lwork,
-    syevjInfo_t params) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, cublasFillMode_t, int,
-      const cuDoubleComplex *, int, const double *, int *, syevjInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZheevj_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, uplo, n, A, lda, W, lwork, params);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSsyevj(cusolverDnHandle_t handle,
-                                              cusolverEigMode_t jobz,
-                                              cublasFillMode_t uplo, int n,
-                                              float *A, int lda, float *W,
-                                              float *work, int lwork, int *info,
-                                              syevjInfo_t params) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, cublasFillMode_t, int, float *,
-      int, float *, float *, int, int *, syevjInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSsyevj");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, uplo, n, A, lda, W, work, lwork, info, params);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDsyevj(cusolverDnHandle_t handle,
-                                              cusolverEigMode_t jobz,
-                                              cublasFillMode_t uplo, int n,
-                                              double *A, int lda, double *W,
-                                              double *work, int lwork,
-                                              int *info, syevjInfo_t params) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, cublasFillMode_t, int, double *,
-      int, double *, double *, int, int *, syevjInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDsyevj");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, uplo, n, A, lda, W, work, lwork, info, params);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnCheevj(cusolverDnHandle_t handle,
-                                              cusolverEigMode_t jobz,
-                                              cublasFillMode_t uplo, int n,
-                                              cuComplex *A, int lda, float *W,
-                                              cuComplex *work, int lwork,
-                                              int *info, syevjInfo_t params) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, cublasFillMode_t, int, cuComplex *,
-      int, float *, cuComplex *, int, int *, syevjInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnCheevj");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, uplo, n, A, lda, W, work, lwork, info, params);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZheevj(
-    cusolverDnHandle_t handle, cusolverEigMode_t jobz, cublasFillMode_t uplo,
-    int n, cuDoubleComplex *A, int lda, double *W, cuDoubleComplex *work,
-    int lwork, int *info, syevjInfo_t params) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, cublasFillMode_t, int,
-      cuDoubleComplex *, int, double *, cuDoubleComplex *, int, int *,
-      syevjInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZheevj");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, uplo, n, A, lda, W, work, lwork, info, params);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSsygvj_bufferSize(
-    cusolverDnHandle_t handle, cusolverEigType_t itype, cusolverEigMode_t jobz,
-    cublasFillMode_t uplo, int n, const float *A, int lda, const float *B,
-    int ldb, const float *W, int *lwork, syevjInfo_t params) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigType_t, cusolverEigMode_t,
-      cublasFillMode_t, int, const float *, int, const float *, int,
-      const float *, int *, syevjInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSsygvj_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, itype, jobz, uplo, n, A, lda, B, ldb, W, lwork,
-                  params);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDsygvj_bufferSize(
-    cusolverDnHandle_t handle, cusolverEigType_t itype, cusolverEigMode_t jobz,
-    cublasFillMode_t uplo, int n, const double *A, int lda, const double *B,
-    int ldb, const double *W, int *lwork, syevjInfo_t params) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigType_t, cusolverEigMode_t,
-      cublasFillMode_t, int, const double *, int, const double *, int,
-      const double *, int *, syevjInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDsygvj_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, itype, jobz, uplo, n, A, lda, B, ldb, W, lwork,
-                  params);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnChegvj_bufferSize(
-    cusolverDnHandle_t handle, cusolverEigType_t itype, cusolverEigMode_t jobz,
-    cublasFillMode_t uplo, int n, const cuComplex *A, int lda,
-    const cuComplex *B, int ldb, const float *W, int *lwork,
-    syevjInfo_t params) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigType_t, cusolverEigMode_t,
-      cublasFillMode_t, int, const cuComplex *, int, const cuComplex *, int,
-      const float *, int *, syevjInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnChegvj_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, itype, jobz, uplo, n, A, lda, B, ldb, W, lwork,
-                  params);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZhegvj_bufferSize(
-    cusolverDnHandle_t handle, cusolverEigType_t itype, cusolverEigMode_t jobz,
-    cublasFillMode_t uplo, int n, const cuDoubleComplex *A, int lda,
-    const cuDoubleComplex *B, int ldb, const double *W, int *lwork,
-    syevjInfo_t params) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigType_t, cusolverEigMode_t,
-      cublasFillMode_t, int, const cuDoubleComplex *, int,
-      const cuDoubleComplex *, int, const double *, int *, syevjInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZhegvj_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, itype, jobz, uplo, n, A, lda, B, ldb, W, lwork,
-                  params);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSsygvj(
-    cusolverDnHandle_t handle, cusolverEigType_t itype, cusolverEigMode_t jobz,
-    cublasFillMode_t uplo, int n, float *A, int lda, float *B, int ldb,
-    float *W, float *work, int lwork, int *info, syevjInfo_t params) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigType_t, cusolverEigMode_t,
-      cublasFillMode_t, int, float *, int, float *, int, float *, float *, int,
-      int *, syevjInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSsygvj");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, itype, jobz, uplo, n, A, lda, B, ldb, W, work, lwork,
-                  info, params);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDsygvj(
-    cusolverDnHandle_t handle, cusolverEigType_t itype, cusolverEigMode_t jobz,
-    cublasFillMode_t uplo, int n, double *A, int lda, double *B, int ldb,
-    double *W, double *work, int lwork, int *info, syevjInfo_t params) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigType_t, cusolverEigMode_t,
-      cublasFillMode_t, int, double *, int, double *, int, double *, double *,
-      int, int *, syevjInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDsygvj");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, itype, jobz, uplo, n, A, lda, B, ldb, W, work, lwork,
-                  info, params);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnChegvj(
-    cusolverDnHandle_t handle, cusolverEigType_t itype, cusolverEigMode_t jobz,
-    cublasFillMode_t uplo, int n, cuComplex *A, int lda, cuComplex *B, int ldb,
-    float *W, cuComplex *work, int lwork, int *info, syevjInfo_t params) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigType_t, cusolverEigMode_t,
-      cublasFillMode_t, int, cuComplex *, int, cuComplex *, int, float *,
-      cuComplex *, int, int *, syevjInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnChegvj");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, itype, jobz, uplo, n, A, lda, B, ldb, W, work, lwork,
-                  info, params);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZhegvj(
-    cusolverDnHandle_t handle, cusolverEigType_t itype, cusolverEigMode_t jobz,
-    cublasFillMode_t uplo, int n, cuDoubleComplex *A, int lda,
-    cuDoubleComplex *B, int ldb, double *W, cuDoubleComplex *work, int lwork,
-    int *info, syevjInfo_t params) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigType_t, cusolverEigMode_t,
-      cublasFillMode_t, int, cuDoubleComplex *, int, cuDoubleComplex *, int,
-      double *, cuDoubleComplex *, int, int *, syevjInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZhegvj");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, itype, jobz, uplo, n, A, lda, B, ldb, W, work, lwork,
-                  info, params);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnCreateGesvdjInfo(gesvdjInfo_t *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(gesvdjInfo_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnCreateGesvdjInfo");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDestroyGesvdjInfo(gesvdjInfo_t info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(gesvdjInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDestroyGesvdjInfo");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnXgesvdjSetTolerance(gesvdjInfo_t info,
-                                                           double tolerance) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(gesvdjInfo_t, double);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnXgesvdjSetTolerance");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(info, tolerance);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnXgesvdjSetMaxSweeps(gesvdjInfo_t info,
-                                                           int max_sweeps) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(gesvdjInfo_t, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnXgesvdjSetMaxSweeps");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(info, max_sweeps);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnXgesvdjSetSortEig(gesvdjInfo_t info,
-                                                         int sort_svd) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(gesvdjInfo_t, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnXgesvdjSetSortEig");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(info, sort_svd);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnXgesvdjGetResidual(
-    cusolverDnHandle_t handle, gesvdjInfo_t info, double *residual) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t,
-                                                  gesvdjInfo_t, double *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnXgesvdjGetResidual");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, info, residual);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnXgesvdjGetSweeps(
-    cusolverDnHandle_t handle, gesvdjInfo_t info, int *executed_sweeps) {
-  using FuncPtr =
-      cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, gesvdjInfo_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnXgesvdjGetSweeps");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, info, executed_sweeps);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSgesvdjBatched_bufferSize(
-    cusolverDnHandle_t handle, cusolverEigMode_t jobz, int m, int n,
-    const float *A, int lda, const float *S, const float *U, int ldu,
-    const float *V, int ldv, int *lwork, gesvdjInfo_t params, int batchSize) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, int, int, const float *, int,
-      const float *, const float *, int, const float *, int, int *,
-      gesvdjInfo_t, int);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusolverDnSgesvdjBatched_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, m, n, A, lda, S, U, ldu, V, ldv, lwork, params,
-                  batchSize);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDgesvdjBatched_bufferSize(
-    cusolverDnHandle_t handle, cusolverEigMode_t jobz, int m, int n,
-    const double *A, int lda, const double *S, const double *U, int ldu,
-    const double *V, int ldv, int *lwork, gesvdjInfo_t params, int batchSize) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, int, int, const double *, int,
-      const double *, const double *, int, const double *, int, int *,
-      gesvdjInfo_t, int);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusolverDnDgesvdjBatched_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, m, n, A, lda, S, U, ldu, V, ldv, lwork, params,
-                  batchSize);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnCgesvdjBatched_bufferSize(
-    cusolverDnHandle_t handle, cusolverEigMode_t jobz, int m, int n,
-    const cuComplex *A, int lda, const float *S, const cuComplex *U, int ldu,
-    const cuComplex *V, int ldv, int *lwork, gesvdjInfo_t params,
-    int batchSize) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, int, int, const cuComplex *, int,
-      const float *, const cuComplex *, int, const cuComplex *, int, int *,
-      gesvdjInfo_t, int);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusolverDnCgesvdjBatched_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, m, n, A, lda, S, U, ldu, V, ldv, lwork, params,
-                  batchSize);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZgesvdjBatched_bufferSize(
-    cusolverDnHandle_t handle, cusolverEigMode_t jobz, int m, int n,
-    const cuDoubleComplex *A, int lda, const double *S,
-    const cuDoubleComplex *U, int ldu, const cuDoubleComplex *V, int ldv,
-    int *lwork, gesvdjInfo_t params, int batchSize) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, int, int, const cuDoubleComplex *,
-      int, const double *, const cuDoubleComplex *, int,
-      const cuDoubleComplex *, int, int *, gesvdjInfo_t, int);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusolverDnZgesvdjBatched_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, m, n, A, lda, S, U, ldu, V, ldv, lwork, params,
-                  batchSize);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSgesvdjBatched(
-    cusolverDnHandle_t handle, cusolverEigMode_t jobz, int m, int n, float *A,
-    int lda, float *S, float *U, int ldu, float *V, int ldv, float *work,
-    int lwork, int *info, gesvdjInfo_t params, int batchSize) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, int, int, float *, int, float *,
-      float *, int, float *, int, float *, int, int *, gesvdjInfo_t, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSgesvdjBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, m, n, A, lda, S, U, ldu, V, ldv, work, lwork,
-                  info, params, batchSize);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDgesvdjBatched(
-    cusolverDnHandle_t handle, cusolverEigMode_t jobz, int m, int n, double *A,
-    int lda, double *S, double *U, int ldu, double *V, int ldv, double *work,
-    int lwork, int *info, gesvdjInfo_t params, int batchSize) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, int, int, double *, int, double *,
-      double *, int, double *, int, double *, int, int *, gesvdjInfo_t, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDgesvdjBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, m, n, A, lda, S, U, ldu, V, ldv, work, lwork,
-                  info, params, batchSize);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnCgesvdjBatched(
-    cusolverDnHandle_t handle, cusolverEigMode_t jobz, int m, int n,
-    cuComplex *A, int lda, float *S, cuComplex *U, int ldu, cuComplex *V,
-    int ldv, cuComplex *work, int lwork, int *info, gesvdjInfo_t params,
-    int batchSize) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, int, int, cuComplex *, int,
-      float *, cuComplex *, int, cuComplex *, int, cuComplex *, int, int *,
-      gesvdjInfo_t, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnCgesvdjBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, m, n, A, lda, S, U, ldu, V, ldv, work, lwork,
-                  info, params, batchSize);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZgesvdjBatched(
-    cusolverDnHandle_t handle, cusolverEigMode_t jobz, int m, int n,
-    cuDoubleComplex *A, int lda, double *S, cuDoubleComplex *U, int ldu,
-    cuDoubleComplex *V, int ldv, cuDoubleComplex *work, int lwork, int *info,
-    gesvdjInfo_t params, int batchSize) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, int, int, cuDoubleComplex *, int,
-      double *, cuDoubleComplex *, int, cuDoubleComplex *, int,
-      cuDoubleComplex *, int, int *, gesvdjInfo_t, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZgesvdjBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, m, n, A, lda, S, U, ldu, V, ldv, work, lwork,
-                  info, params, batchSize);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSgesvdj_bufferSize(
-    cusolverDnHandle_t handle, cusolverEigMode_t jobz, int econ, int m, int n,
-    const float *A, int lda, const float *S, const float *U, int ldu,
-    const float *V, int ldv, int *lwork, gesvdjInfo_t params) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, int, int, int, const float *, int,
-      const float *, const float *, int, const float *, int, int *,
-      gesvdjInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSgesvdj_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, econ, m, n, A, lda, S, U, ldu, V, ldv, lwork,
-                  params);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDgesvdj_bufferSize(
-    cusolverDnHandle_t handle, cusolverEigMode_t jobz, int econ, int m, int n,
-    const double *A, int lda, const double *S, const double *U, int ldu,
-    const double *V, int ldv, int *lwork, gesvdjInfo_t params) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, int, int, int, const double *, int,
-      const double *, const double *, int, const double *, int, int *,
-      gesvdjInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDgesvdj_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, econ, m, n, A, lda, S, U, ldu, V, ldv, lwork,
-                  params);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnCgesvdj_bufferSize(
-    cusolverDnHandle_t handle, cusolverEigMode_t jobz, int econ, int m, int n,
-    const cuComplex *A, int lda, const float *S, const cuComplex *U, int ldu,
-    const cuComplex *V, int ldv, int *lwork, gesvdjInfo_t params) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, int, int, int, const cuComplex *,
-      int, const float *, const cuComplex *, int, const cuComplex *, int, int *,
-      gesvdjInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnCgesvdj_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, econ, m, n, A, lda, S, U, ldu, V, ldv, lwork,
-                  params);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZgesvdj_bufferSize(
-    cusolverDnHandle_t handle, cusolverEigMode_t jobz, int econ, int m, int n,
-    const cuDoubleComplex *A, int lda, const double *S,
-    const cuDoubleComplex *U, int ldu, const cuDoubleComplex *V, int ldv,
-    int *lwork, gesvdjInfo_t params) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, int, int, int,
-      const cuDoubleComplex *, int, const double *, const cuDoubleComplex *,
-      int, const cuDoubleComplex *, int, int *, gesvdjInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZgesvdj_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, econ, m, n, A, lda, S, U, ldu, V, ldv, lwork,
-                  params);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSgesvdj(
-    cusolverDnHandle_t handle, cusolverEigMode_t jobz, int econ, int m, int n,
-    float *A, int lda, float *S, float *U, int ldu, float *V, int ldv,
-    float *work, int lwork, int *info, gesvdjInfo_t params) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, int, int, int, float *, int,
-      float *, float *, int, float *, int, float *, int, int *, gesvdjInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSgesvdj");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, econ, m, n, A, lda, S, U, ldu, V, ldv, work,
-                  lwork, info, params);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDgesvdj(
-    cusolverDnHandle_t handle, cusolverEigMode_t jobz, int econ, int m, int n,
-    double *A, int lda, double *S, double *U, int ldu, double *V, int ldv,
-    double *work, int lwork, int *info, gesvdjInfo_t params) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, int, int, int, double *, int,
-      double *, double *, int, double *, int, double *, int, int *,
-      gesvdjInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDgesvdj");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, econ, m, n, A, lda, S, U, ldu, V, ldv, work,
-                  lwork, info, params);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnCgesvdj(
-    cusolverDnHandle_t handle, cusolverEigMode_t jobz, int econ, int m, int n,
-    cuComplex *A, int lda, float *S, cuComplex *U, int ldu, cuComplex *V,
-    int ldv, cuComplex *work, int lwork, int *info, gesvdjInfo_t params) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, int, int, int, cuComplex *, int,
-      float *, cuComplex *, int, cuComplex *, int, cuComplex *, int, int *,
-      gesvdjInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnCgesvdj");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, econ, m, n, A, lda, S, U, ldu, V, ldv, work,
-                  lwork, info, params);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZgesvdj(
-    cusolverDnHandle_t handle, cusolverEigMode_t jobz, int econ, int m, int n,
-    cuDoubleComplex *A, int lda, double *S, cuDoubleComplex *U, int ldu,
-    cuDoubleComplex *V, int ldv, cuDoubleComplex *work, int lwork, int *info,
-    gesvdjInfo_t params) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, int, int, int, cuDoubleComplex *,
-      int, double *, cuDoubleComplex *, int, cuDoubleComplex *, int,
-      cuDoubleComplex *, int, int *, gesvdjInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZgesvdj");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, econ, m, n, A, lda, S, U, ldu, V, ldv, work,
-                  lwork, info, params);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSgesvdaStridedBatched_bufferSize(
-    cusolverDnHandle_t handle, cusolverEigMode_t jobz, int rank, int m, int n,
-    const float *d_A, int lda, long long int strideA, const float *d_S,
-    long long int strideS, const float *d_U, int ldu, long long int strideU,
-    const float *d_V, int ldv, long long int strideV, int *lwork,
-    int batchSize) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, int, int, int, const float *, int,
-      long long, const float *, long long, const float *, int, long long,
-      const float *, int, long long, int *, int);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusolverDnSgesvdaStridedBatched_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, rank, m, n, d_A, lda, strideA, d_S, strideS,
-                  d_U, ldu, strideU, d_V, ldv, strideV, lwork, batchSize);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDgesvdaStridedBatched_bufferSize(
-    cusolverDnHandle_t handle, cusolverEigMode_t jobz, int rank, int m, int n,
-    const double *d_A, int lda, long long int strideA, const double *d_S,
-    long long int strideS, const double *d_U, int ldu, long long int strideU,
-    const double *d_V, int ldv, long long int strideV, int *lwork,
-    int batchSize) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, int, int, int, const double *, int,
-      long long, const double *, long long, const double *, int, long long,
-      const double *, int, long long, int *, int);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusolverDnDgesvdaStridedBatched_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, rank, m, n, d_A, lda, strideA, d_S, strideS,
-                  d_U, ldu, strideU, d_V, ldv, strideV, lwork, batchSize);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnCgesvdaStridedBatched_bufferSize(
-    cusolverDnHandle_t handle, cusolverEigMode_t jobz, int rank, int m, int n,
-    const cuComplex *d_A, int lda, long long int strideA, const float *d_S,
-    long long int strideS, const cuComplex *d_U, int ldu, long long int strideU,
-    const cuComplex *d_V, int ldv, long long int strideV, int *lwork,
-    int batchSize) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, int, int, int, const cuComplex *,
-      int, long long, const float *, long long, const cuComplex *, int,
-      long long, const cuComplex *, int, long long, int *, int);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusolverDnCgesvdaStridedBatched_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, rank, m, n, d_A, lda, strideA, d_S, strideS,
-                  d_U, ldu, strideU, d_V, ldv, strideV, lwork, batchSize);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZgesvdaStridedBatched_bufferSize(
-    cusolverDnHandle_t handle, cusolverEigMode_t jobz, int rank, int m, int n,
-    const cuDoubleComplex *d_A, int lda, long long int strideA,
-    const double *d_S, long long int strideS, const cuDoubleComplex *d_U,
-    int ldu, long long int strideU, const cuDoubleComplex *d_V, int ldv,
-    long long int strideV, int *lwork, int batchSize) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, int, int, int,
-      const cuDoubleComplex *, int, long long, const double *, long long,
-      const cuDoubleComplex *, int, long long, const cuDoubleComplex *, int,
-      long long, int *, int);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusolverDnZgesvdaStridedBatched_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, rank, m, n, d_A, lda, strideA, d_S, strideS,
-                  d_U, ldu, strideU, d_V, ldv, strideV, lwork, batchSize);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSgesvdaStridedBatched(
-    cusolverDnHandle_t handle, cusolverEigMode_t jobz, int rank, int m, int n,
-    const float *d_A, int lda, long long int strideA, float *d_S,
-    long long int strideS, float *d_U, int ldu, long long int strideU,
-    float *d_V, int ldv, long long int strideV, float *d_work, int lwork,
-    int *d_info, double *h_R_nrmF, int batchSize) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, int, int, int, const float *, int,
-      long long, float *, long long, float *, int, long long, float *, int,
-      long long, float *, int, int *, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSgesvdaStridedBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, rank, m, n, d_A, lda, strideA, d_S, strideS,
-                  d_U, ldu, strideU, d_V, ldv, strideV, d_work, lwork, d_info,
-                  h_R_nrmF, batchSize);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDgesvdaStridedBatched(
-    cusolverDnHandle_t handle, cusolverEigMode_t jobz, int rank, int m, int n,
-    const double *d_A, int lda, long long int strideA, double *d_S,
-    long long int strideS, double *d_U, int ldu, long long int strideU,
-    double *d_V, int ldv, long long int strideV, double *d_work, int lwork,
-    int *d_info, double *h_R_nrmF, int batchSize) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, int, int, int, const double *, int,
-      long long, double *, long long, double *, int, long long, double *, int,
-      long long, double *, int, int *, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDgesvdaStridedBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, rank, m, n, d_A, lda, strideA, d_S, strideS,
-                  d_U, ldu, strideU, d_V, ldv, strideV, d_work, lwork, d_info,
-                  h_R_nrmF, batchSize);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnCgesvdaStridedBatched(
-    cusolverDnHandle_t handle, cusolverEigMode_t jobz, int rank, int m, int n,
-    const cuComplex *d_A, int lda, long long int strideA, float *d_S,
-    long long int strideS, cuComplex *d_U, int ldu, long long int strideU,
-    cuComplex *d_V, int ldv, long long int strideV, cuComplex *d_work,
-    int lwork, int *d_info, double *h_R_nrmF, int batchSize) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, int, int, int, const cuComplex *,
-      int, long long, float *, long long, cuComplex *, int, long long,
-      cuComplex *, int, long long, cuComplex *, int, int *, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnCgesvdaStridedBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, rank, m, n, d_A, lda, strideA, d_S, strideS,
-                  d_U, ldu, strideU, d_V, ldv, strideV, d_work, lwork, d_info,
-                  h_R_nrmF, batchSize);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZgesvdaStridedBatched(
-    cusolverDnHandle_t handle, cusolverEigMode_t jobz, int rank, int m, int n,
-    const cuDoubleComplex *d_A, int lda, long long int strideA, double *d_S,
-    long long int strideS, cuDoubleComplex *d_U, int ldu, long long int strideU,
-    cuDoubleComplex *d_V, int ldv, long long int strideV,
-    cuDoubleComplex *d_work, int lwork, int *d_info, double *h_R_nrmF,
-    int batchSize) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, int, int, int,
-      const cuDoubleComplex *, int, long long, double *, long long,
-      cuDoubleComplex *, int, long long, cuDoubleComplex *, int, long long,
-      cuDoubleComplex *, int, int *, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZgesvdaStridedBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, rank, m, n, d_A, lda, strideA, d_S, strideS,
-                  d_U, ldu, strideU, d_V, ldv, strideV, d_work, lwork, d_info,
-                  h_R_nrmF, batchSize);
-}
-
-}  // extern "C"
diff --git a/third_party/xla/third_party/tsl/tsl/cuda/cusolver_dense_10_2.inc b/third_party/xla/third_party/tsl/tsl/cuda/cusolver_dense_10_2.inc
deleted file mode 100644
index 5160c6337b2030..00000000000000
--- a/third_party/xla/third_party/tsl/tsl/cuda/cusolver_dense_10_2.inc
+++ /dev/null
@@ -1,3667 +0,0 @@
-// Auto-generated, do not edit.
-
-extern "C" {
-
-cusolverStatus_t CUSOLVERAPI cusolverGetProperty(libraryPropertyType type,
-                                                 int *value) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(libraryPropertyType, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverGetProperty");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(type, value);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverGetVersion(int *version) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverGetVersion");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(version);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnCreate(cusolverDnHandle_t *handle) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnCreate");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDestroy(cusolverDnHandle_t handle) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDestroy");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSetStream(cusolverDnHandle_t handle,
-                                                 cudaStream_t streamId) {
-  using FuncPtr =
-      cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSetStream");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, streamId);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnGetStream(cusolverDnHandle_t handle,
-                                                 cudaStream_t *streamId) {
-  using FuncPtr =
-      cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, cudaStream_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnGetStream");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, streamId);
-}
-
-cusolverStatus_t CUSOLVERAPI
-cusolverDnIRSParamsCreate(cusolverDnIRSParams_t *params_ptr) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(cusolverDnIRSParams_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnIRSParamsCreate");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(params_ptr);
-}
-
-cusolverStatus_t CUSOLVERAPI
-cusolverDnIRSParamsDestroy(cusolverDnIRSParams_t params) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(cusolverDnIRSParams_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnIRSParamsDestroy");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(params);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnIRSParamsSetTol(
-    cusolverDnIRSParams_t params, cudaDataType data_type, double val) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(cusolverDnIRSParams_t,
-                                                  cudaDataType, double);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnIRSParamsSetTol");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(params, data_type, val);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnIRSParamsSetTolInner(
-    cusolverDnIRSParams_t params, cudaDataType data_type, double val) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(cusolverDnIRSParams_t,
-                                                  cudaDataType, double);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnIRSParamsSetTolInner");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(params, data_type, val);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnIRSParamsSetSolverPrecisions(
-    cusolverDnIRSParams_t params, cudaDataType solver_main_precision,
-    cudaDataType solver_lowest_precision) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(cusolverDnIRSParams_t,
-                                                  cudaDataType, cudaDataType);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusolverDnIRSParamsSetSolverPrecisions");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(params, solver_main_precision, solver_lowest_precision);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnIRSParamsSetRefinementSolver(
-    cusolverDnIRSParams_t params, cusolverIRSRefinement_t refinement_solver) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(cusolverDnIRSParams_t,
-                                                  cusolverIRSRefinement_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusolverDnIRSParamsSetRefinementSolver");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(params, refinement_solver);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnIRSParamsSetMaxIters(
-    cusolverDnIRSParams_t params, cusolver_int_t maxiters) {
-  using FuncPtr =
-      cusolverStatus_t(CUSOLVERAPI *)(cusolverDnIRSParams_t, cusolver_int_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnIRSParamsSetMaxIters");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(params, maxiters);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnIRSParamsSetMaxItersInner(
-    cusolverDnIRSParams_t params, cusolver_int_t maxiters_inner) {
-  using FuncPtr =
-      cusolverStatus_t(CUSOLVERAPI *)(cusolverDnIRSParams_t, cusolver_int_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusolverDnIRSParamsSetMaxItersInner");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(params, maxiters_inner);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnIRSParamsGetNiters(
-    cusolverDnIRSParams_t params, cusolver_int_t *niters) {
-  using FuncPtr =
-      cusolverStatus_t(CUSOLVERAPI *)(cusolverDnIRSParams_t, cusolver_int_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnIRSParamsGetNiters");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(params, niters);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnIRSParamsGetOuterNiters(
-    cusolverDnIRSParams_t params, cusolver_int_t *outer_niters) {
-  using FuncPtr =
-      cusolverStatus_t(CUSOLVERAPI *)(cusolverDnIRSParams_t, cusolver_int_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusolverDnIRSParamsGetOuterNiters");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(params, outer_niters);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnIRSParamsGetMaxIters(
-    cusolverDnIRSParams_t params, cusolver_int_t *maxiters) {
-  using FuncPtr =
-      cusolverStatus_t(CUSOLVERAPI *)(cusolverDnIRSParams_t, cusolver_int_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnIRSParamsGetMaxIters");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(params, maxiters);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnIRSParamsSetSolverMainPrecision(
-    cusolverDnIRSParams_t params, cudaDataType solver_main_precision) {
-  using FuncPtr =
-      cusolverStatus_t(CUSOLVERAPI *)(cusolverDnIRSParams_t, cudaDataType);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusolverDnIRSParamsSetSolverMainPrecision");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(params, solver_main_precision);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnIRSParamsSetSolverLowestPrecision(
-    cusolverDnIRSParams_t params, cudaDataType solver_lowest_precision) {
-  using FuncPtr =
-      cusolverStatus_t(CUSOLVERAPI *)(cusolverDnIRSParams_t, cudaDataType);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusolverDnIRSParamsSetSolverLowestPrecision");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(params, solver_lowest_precision);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnIRSInfosDestroy(
-    cusolverDnIRSParams_t params, cusolverDnIRSInfos_t infos) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(cusolverDnIRSParams_t,
-                                                  cusolverDnIRSInfos_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnIRSInfosDestroy");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(params, infos);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnIRSInfosCreate(
-    cusolverDnIRSParams_t params, cusolverDnIRSInfos_t *infos_ptr) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(cusolverDnIRSParams_t,
-                                                  cusolverDnIRSInfos_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnIRSInfosCreate");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(params, infos_ptr);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnIRSInfosGetNiters(
-    cusolverDnIRSParams_t params, cusolverDnIRSInfos_t infos,
-    cusolver_int_t *niters) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnIRSParams_t, cusolverDnIRSInfos_t, cusolver_int_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnIRSInfosGetNiters");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(params, infos, niters);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnIRSInfosGetOuterNiters(
-    cusolverDnIRSParams_t params, cusolverDnIRSInfos_t infos,
-    cusolver_int_t *outer_niters) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnIRSParams_t, cusolverDnIRSInfos_t, cusolver_int_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusolverDnIRSInfosGetOuterNiters");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(params, infos, outer_niters);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnIRSInfosGetMaxIters(
-    cusolverDnIRSParams_t params, cusolverDnIRSInfos_t infos,
-    cusolver_int_t *maxiters) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnIRSParams_t, cusolverDnIRSInfos_t, cusolver_int_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnIRSInfosGetMaxIters");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(params, infos, maxiters);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnIRSInfosRequestResidual(
-    cusolverDnIRSParams_t params, cusolverDnIRSInfos_t infos) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(cusolverDnIRSParams_t,
-                                                  cusolverDnIRSInfos_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusolverDnIRSInfosRequestResidual");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(params, infos);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnIRSInfosGetResidualHistory(
-    cusolverDnIRSParams_t params, cusolverDnIRSInfos_t infos,
-    void **residual_history) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnIRSParams_t, cusolverDnIRSInfos_t, void **);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusolverDnIRSInfosGetResidualHistory");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(params, infos, residual_history);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZZgesv(
-    cusolverDnHandle_t handle, cusolver_int_t n, cusolver_int_t nrhs,
-    cuDoubleComplex *dA, cusolver_int_t ldda, cusolver_int_t *dipiv,
-    cuDoubleComplex *dB, cusolver_int_t lddb, cuDoubleComplex *dX,
-    cusolver_int_t lddx, void *dWorkspace, size_t lwork_bytes,
-    cusolver_int_t *iter, cusolver_int_t *d_info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, cuDoubleComplex *,
-      cusolver_int_t, cusolver_int_t *, cuDoubleComplex *, cusolver_int_t,
-      cuDoubleComplex *, cusolver_int_t, void *, size_t, cusolver_int_t *,
-      cusolver_int_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZZgesv");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, nrhs, dA, ldda, dipiv, dB, lddb, dX, lddx,
-                  dWorkspace, lwork_bytes, iter, d_info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZCgesv(
-    cusolverDnHandle_t handle, cusolver_int_t n, cusolver_int_t nrhs,
-    cuDoubleComplex *dA, cusolver_int_t ldda, cusolver_int_t *dipiv,
-    cuDoubleComplex *dB, cusolver_int_t lddb, cuDoubleComplex *dX,
-    cusolver_int_t lddx, void *dWorkspace, size_t lwork_bytes,
-    cusolver_int_t *iter, cusolver_int_t *d_info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, cuDoubleComplex *,
-      cusolver_int_t, cusolver_int_t *, cuDoubleComplex *, cusolver_int_t,
-      cuDoubleComplex *, cusolver_int_t, void *, size_t, cusolver_int_t *,
-      cusolver_int_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZCgesv");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, nrhs, dA, ldda, dipiv, dB, lddb, dX, lddx,
-                  dWorkspace, lwork_bytes, iter, d_info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZKgesv(
-    cusolverDnHandle_t handle, cusolver_int_t n, cusolver_int_t nrhs,
-    cuDoubleComplex *dA, cusolver_int_t ldda, cusolver_int_t *dipiv,
-    cuDoubleComplex *dB, cusolver_int_t lddb, cuDoubleComplex *dX,
-    cusolver_int_t lddx, void *dWorkspace, size_t lwork_bytes,
-    cusolver_int_t *iter, cusolver_int_t *d_info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, cuDoubleComplex *,
-      cusolver_int_t, cusolver_int_t *, cuDoubleComplex *, cusolver_int_t,
-      cuDoubleComplex *, cusolver_int_t, void *, size_t, cusolver_int_t *,
-      cusolver_int_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZKgesv");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, nrhs, dA, ldda, dipiv, dB, lddb, dX, lddx,
-                  dWorkspace, lwork_bytes, iter, d_info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnCCgesv(
-    cusolverDnHandle_t handle, cusolver_int_t n, cusolver_int_t nrhs,
-    cuComplex *dA, cusolver_int_t ldda, cusolver_int_t *dipiv, cuComplex *dB,
-    cusolver_int_t lddb, cuComplex *dX, cusolver_int_t lddx, void *dWorkspace,
-    size_t lwork_bytes, cusolver_int_t *iter, cusolver_int_t *d_info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, cuComplex *,
-      cusolver_int_t, cusolver_int_t *, cuComplex *, cusolver_int_t,
-      cuComplex *, cusolver_int_t, void *, size_t, cusolver_int_t *,
-      cusolver_int_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnCCgesv");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, nrhs, dA, ldda, dipiv, dB, lddb, dX, lddx,
-                  dWorkspace, lwork_bytes, iter, d_info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnCKgesv(
-    cusolverDnHandle_t handle, cusolver_int_t n, cusolver_int_t nrhs,
-    cuComplex *dA, cusolver_int_t ldda, cusolver_int_t *dipiv, cuComplex *dB,
-    cusolver_int_t lddb, cuComplex *dX, cusolver_int_t lddx, void *dWorkspace,
-    size_t lwork_bytes, cusolver_int_t *iter, cusolver_int_t *d_info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, cuComplex *,
-      cusolver_int_t, cusolver_int_t *, cuComplex *, cusolver_int_t,
-      cuComplex *, cusolver_int_t, void *, size_t, cusolver_int_t *,
-      cusolver_int_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnCKgesv");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, nrhs, dA, ldda, dipiv, dB, lddb, dX, lddx,
-                  dWorkspace, lwork_bytes, iter, d_info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDDgesv(
-    cusolverDnHandle_t handle, cusolver_int_t n, cusolver_int_t nrhs,
-    double *dA, cusolver_int_t ldda, cusolver_int_t *dipiv, double *dB,
-    cusolver_int_t lddb, double *dX, cusolver_int_t lddx, void *dWorkspace,
-    size_t lwork_bytes, cusolver_int_t *iter, cusolver_int_t *d_info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, double *,
-      cusolver_int_t, cusolver_int_t *, double *, cusolver_int_t, double *,
-      cusolver_int_t, void *, size_t, cusolver_int_t *, cusolver_int_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDDgesv");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, nrhs, dA, ldda, dipiv, dB, lddb, dX, lddx,
-                  dWorkspace, lwork_bytes, iter, d_info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDSgesv(
-    cusolverDnHandle_t handle, cusolver_int_t n, cusolver_int_t nrhs,
-    double *dA, cusolver_int_t ldda, cusolver_int_t *dipiv, double *dB,
-    cusolver_int_t lddb, double *dX, cusolver_int_t lddx, void *dWorkspace,
-    size_t lwork_bytes, cusolver_int_t *iter, cusolver_int_t *d_info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, double *,
-      cusolver_int_t, cusolver_int_t *, double *, cusolver_int_t, double *,
-      cusolver_int_t, void *, size_t, cusolver_int_t *, cusolver_int_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDSgesv");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, nrhs, dA, ldda, dipiv, dB, lddb, dX, lddx,
-                  dWorkspace, lwork_bytes, iter, d_info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDHgesv(
-    cusolverDnHandle_t handle, cusolver_int_t n, cusolver_int_t nrhs,
-    double *dA, cusolver_int_t ldda, cusolver_int_t *dipiv, double *dB,
-    cusolver_int_t lddb, double *dX, cusolver_int_t lddx, void *dWorkspace,
-    size_t lwork_bytes, cusolver_int_t *iter, cusolver_int_t *d_info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, double *,
-      cusolver_int_t, cusolver_int_t *, double *, cusolver_int_t, double *,
-      cusolver_int_t, void *, size_t, cusolver_int_t *, cusolver_int_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDHgesv");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, nrhs, dA, ldda, dipiv, dB, lddb, dX, lddx,
-                  dWorkspace, lwork_bytes, iter, d_info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSSgesv(
-    cusolverDnHandle_t handle, cusolver_int_t n, cusolver_int_t nrhs, float *dA,
-    cusolver_int_t ldda, cusolver_int_t *dipiv, float *dB, cusolver_int_t lddb,
-    float *dX, cusolver_int_t lddx, void *dWorkspace, size_t lwork_bytes,
-    cusolver_int_t *iter, cusolver_int_t *d_info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, float *,
-      cusolver_int_t, cusolver_int_t *, float *, cusolver_int_t, float *,
-      cusolver_int_t, void *, size_t, cusolver_int_t *, cusolver_int_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSSgesv");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, nrhs, dA, ldda, dipiv, dB, lddb, dX, lddx,
-                  dWorkspace, lwork_bytes, iter, d_info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSHgesv(
-    cusolverDnHandle_t handle, cusolver_int_t n, cusolver_int_t nrhs, float *dA,
-    cusolver_int_t ldda, cusolver_int_t *dipiv, float *dB, cusolver_int_t lddb,
-    float *dX, cusolver_int_t lddx, void *dWorkspace, size_t lwork_bytes,
-    cusolver_int_t *iter, cusolver_int_t *d_info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, float *,
-      cusolver_int_t, cusolver_int_t *, float *, cusolver_int_t, float *,
-      cusolver_int_t, void *, size_t, cusolver_int_t *, cusolver_int_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSHgesv");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, nrhs, dA, ldda, dipiv, dB, lddb, dX, lddx,
-                  dWorkspace, lwork_bytes, iter, d_info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZZgesv_bufferSize(
-    cusolverDnHandle_t handle, cusolver_int_t n, cusolver_int_t nrhs,
-    cuDoubleComplex *dA, cusolver_int_t ldda, cusolver_int_t *dipiv,
-    cuDoubleComplex *dB, cusolver_int_t lddb, cuDoubleComplex *dX,
-    cusolver_int_t lddx, void *dWorkspace, size_t *lwork_bytes) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, cuDoubleComplex *,
-      cusolver_int_t, cusolver_int_t *, cuDoubleComplex *, cusolver_int_t,
-      cuDoubleComplex *, cusolver_int_t, void *, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZZgesv_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, nrhs, dA, ldda, dipiv, dB, lddb, dX, lddx,
-                  dWorkspace, lwork_bytes);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZCgesv_bufferSize(
-    cusolverDnHandle_t handle, cusolver_int_t n, cusolver_int_t nrhs,
-    cuDoubleComplex *dA, cusolver_int_t ldda, cusolver_int_t *dipiv,
-    cuDoubleComplex *dB, cusolver_int_t lddb, cuDoubleComplex *dX,
-    cusolver_int_t lddx, void *dWorkspace, size_t *lwork_bytes) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, cuDoubleComplex *,
-      cusolver_int_t, cusolver_int_t *, cuDoubleComplex *, cusolver_int_t,
-      cuDoubleComplex *, cusolver_int_t, void *, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZCgesv_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, nrhs, dA, ldda, dipiv, dB, lddb, dX, lddx,
-                  dWorkspace, lwork_bytes);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZKgesv_bufferSize(
-    cusolverDnHandle_t handle, cusolver_int_t n, cusolver_int_t nrhs,
-    cuDoubleComplex *dA, cusolver_int_t ldda, cusolver_int_t *dipiv,
-    cuDoubleComplex *dB, cusolver_int_t lddb, cuDoubleComplex *dX,
-    cusolver_int_t lddx, void *dWorkspace, size_t *lwork_bytes) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, cuDoubleComplex *,
-      cusolver_int_t, cusolver_int_t *, cuDoubleComplex *, cusolver_int_t,
-      cuDoubleComplex *, cusolver_int_t, void *, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZKgesv_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, nrhs, dA, ldda, dipiv, dB, lddb, dX, lddx,
-                  dWorkspace, lwork_bytes);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnCCgesv_bufferSize(
-    cusolverDnHandle_t handle, cusolver_int_t n, cusolver_int_t nrhs,
-    cuComplex *dA, cusolver_int_t ldda, cusolver_int_t *dipiv, cuComplex *dB,
-    cusolver_int_t lddb, cuComplex *dX, cusolver_int_t lddx, void *dWorkspace,
-    size_t *lwork_bytes) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, cuComplex *,
-      cusolver_int_t, cusolver_int_t *, cuComplex *, cusolver_int_t,
-      cuComplex *, cusolver_int_t, void *, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnCCgesv_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, nrhs, dA, ldda, dipiv, dB, lddb, dX, lddx,
-                  dWorkspace, lwork_bytes);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnCKgesv_bufferSize(
-    cusolverDnHandle_t handle, cusolver_int_t n, cusolver_int_t nrhs,
-    cuComplex *dA, cusolver_int_t ldda, cusolver_int_t *dipiv, cuComplex *dB,
-    cusolver_int_t lddb, cuComplex *dX, cusolver_int_t lddx, void *dWorkspace,
-    size_t *lwork_bytes) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, cuComplex *,
-      cusolver_int_t, cusolver_int_t *, cuComplex *, cusolver_int_t,
-      cuComplex *, cusolver_int_t, void *, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnCKgesv_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, nrhs, dA, ldda, dipiv, dB, lddb, dX, lddx,
-                  dWorkspace, lwork_bytes);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDDgesv_bufferSize(
-    cusolverDnHandle_t handle, cusolver_int_t n, cusolver_int_t nrhs,
-    double *dA, cusolver_int_t ldda, cusolver_int_t *dipiv, double *dB,
-    cusolver_int_t lddb, double *dX, cusolver_int_t lddx, void *dWorkspace,
-    size_t *lwork_bytes) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, double *,
-      cusolver_int_t, cusolver_int_t *, double *, cusolver_int_t, double *,
-      cusolver_int_t, void *, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDDgesv_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, nrhs, dA, ldda, dipiv, dB, lddb, dX, lddx,
-                  dWorkspace, lwork_bytes);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDSgesv_bufferSize(
-    cusolverDnHandle_t handle, cusolver_int_t n, cusolver_int_t nrhs,
-    double *dA, cusolver_int_t ldda, cusolver_int_t *dipiv, double *dB,
-    cusolver_int_t lddb, double *dX, cusolver_int_t lddx, void *dWorkspace,
-    size_t *lwork_bytes) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, double *,
-      cusolver_int_t, cusolver_int_t *, double *, cusolver_int_t, double *,
-      cusolver_int_t, void *, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDSgesv_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, nrhs, dA, ldda, dipiv, dB, lddb, dX, lddx,
-                  dWorkspace, lwork_bytes);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDHgesv_bufferSize(
-    cusolverDnHandle_t handle, cusolver_int_t n, cusolver_int_t nrhs,
-    double *dA, cusolver_int_t ldda, cusolver_int_t *dipiv, double *dB,
-    cusolver_int_t lddb, double *dX, cusolver_int_t lddx, void *dWorkspace,
-    size_t *lwork_bytes) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, double *,
-      cusolver_int_t, cusolver_int_t *, double *, cusolver_int_t, double *,
-      cusolver_int_t, void *, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDHgesv_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, nrhs, dA, ldda, dipiv, dB, lddb, dX, lddx,
-                  dWorkspace, lwork_bytes);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSSgesv_bufferSize(
-    cusolverDnHandle_t handle, cusolver_int_t n, cusolver_int_t nrhs, float *dA,
-    cusolver_int_t ldda, cusolver_int_t *dipiv, float *dB, cusolver_int_t lddb,
-    float *dX, cusolver_int_t lddx, void *dWorkspace, size_t *lwork_bytes) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, float *,
-      cusolver_int_t, cusolver_int_t *, float *, cusolver_int_t, float *,
-      cusolver_int_t, void *, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSSgesv_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, nrhs, dA, ldda, dipiv, dB, lddb, dX, lddx,
-                  dWorkspace, lwork_bytes);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSHgesv_bufferSize(
-    cusolverDnHandle_t handle, cusolver_int_t n, cusolver_int_t nrhs, float *dA,
-    cusolver_int_t ldda, cusolver_int_t *dipiv, float *dB, cusolver_int_t lddb,
-    float *dX, cusolver_int_t lddx, void *dWorkspace, size_t *lwork_bytes) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, float *,
-      cusolver_int_t, cusolver_int_t *, float *, cusolver_int_t, float *,
-      cusolver_int_t, void *, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSHgesv_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, nrhs, dA, ldda, dipiv, dB, lddb, dX, lddx,
-                  dWorkspace, lwork_bytes);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnIRSXgesv(
-    cusolverDnHandle_t handle, cusolverDnIRSParams_t gesv_irs_params,
-    cusolverDnIRSInfos_t gesv_irs_infos, cudaDataType inout_data_type,
-    cusolver_int_t n, cusolver_int_t nrhs, void *dA, cusolver_int_t ldda,
-    cusolver_int_t *dipiv, void *dB, cusolver_int_t lddb, void *dX,
-    cusolver_int_t lddx, void *dWorkspace, size_t lwork_bytes,
-    cusolver_int_t *niters, cusolver_int_t *d_info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverDnIRSParams_t, cusolverDnIRSInfos_t,
-      cudaDataType, cusolver_int_t, cusolver_int_t, void *, cusolver_int_t,
-      cusolver_int_t *, void *, cusolver_int_t, void *, cusolver_int_t, void *,
-      size_t, cusolver_int_t *, cusolver_int_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnIRSXgesv");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, gesv_irs_params, gesv_irs_infos, inout_data_type, n,
-                  nrhs, dA, ldda, dipiv, dB, lddb, dX, lddx, dWorkspace,
-                  lwork_bytes, niters, d_info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnIRSXgesv_bufferSize(
-    cusolverDnHandle_t handle, cusolverDnIRSParams_t params, cusolver_int_t n,
-    cusolver_int_t nrhs, size_t *lwork_bytes) {
-  using FuncPtr =
-      cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, cusolverDnIRSParams_t,
-                                      cusolver_int_t, cusolver_int_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnIRSXgesv_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, params, n, nrhs, lwork_bytes);
-}
-
-cusolverStatus_t CUSOLVERAPI
-cusolverDnSpotrf_bufferSize(cusolverDnHandle_t handle, cublasFillMode_t uplo,
-                            int n, float *A, int lda, int *Lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, float *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSpotrf_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, A, lda, Lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI
-cusolverDnDpotrf_bufferSize(cusolverDnHandle_t handle, cublasFillMode_t uplo,
-                            int n, double *A, int lda, int *Lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, double *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDpotrf_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, A, lda, Lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI
-cusolverDnCpotrf_bufferSize(cusolverDnHandle_t handle, cublasFillMode_t uplo,
-                            int n, cuComplex *A, int lda, int *Lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, cuComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnCpotrf_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, A, lda, Lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI
-cusolverDnZpotrf_bufferSize(cusolverDnHandle_t handle, cublasFillMode_t uplo,
-                            int n, cuDoubleComplex *A, int lda, int *Lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, cuDoubleComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZpotrf_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, A, lda, Lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSpotrf(cusolverDnHandle_t handle,
-                                              cublasFillMode_t uplo, int n,
-                                              float *A, int lda,
-                                              float *Workspace, int Lwork,
-                                              int *devInfo) {
-  using FuncPtr =
-      cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, cublasFillMode_t, int,
-                                      float *, int, float *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSpotrf");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, A, lda, Workspace, Lwork, devInfo);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDpotrf(cusolverDnHandle_t handle,
-                                              cublasFillMode_t uplo, int n,
-                                              double *A, int lda,
-                                              double *Workspace, int Lwork,
-                                              int *devInfo) {
-  using FuncPtr =
-      cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, cublasFillMode_t, int,
-                                      double *, int, double *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDpotrf");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, A, lda, Workspace, Lwork, devInfo);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnCpotrf(cusolverDnHandle_t handle,
-                                              cublasFillMode_t uplo, int n,
-                                              cuComplex *A, int lda,
-                                              cuComplex *Workspace, int Lwork,
-                                              int *devInfo) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, cuComplex *, int, cuComplex *,
-      int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnCpotrf");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, A, lda, Workspace, Lwork, devInfo);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZpotrf(cusolverDnHandle_t handle,
-                                              cublasFillMode_t uplo, int n,
-                                              cuDoubleComplex *A, int lda,
-                                              cuDoubleComplex *Workspace,
-                                              int Lwork, int *devInfo) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, cuDoubleComplex *, int,
-      cuDoubleComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZpotrf");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, A, lda, Workspace, Lwork, devInfo);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSpotrs(cusolverDnHandle_t handle,
-                                              cublasFillMode_t uplo, int n,
-                                              int nrhs, const float *A, int lda,
-                                              float *B, int ldb, int *devInfo) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, int, const float *, int,
-      float *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSpotrs");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, nrhs, A, lda, B, ldb, devInfo);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDpotrs(cusolverDnHandle_t handle,
-                                              cublasFillMode_t uplo, int n,
-                                              int nrhs, const double *A,
-                                              int lda, double *B, int ldb,
-                                              int *devInfo) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, int, const double *, int,
-      double *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDpotrs");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, nrhs, A, lda, B, ldb, devInfo);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnCpotrs(cusolverDnHandle_t handle,
-                                              cublasFillMode_t uplo, int n,
-                                              int nrhs, const cuComplex *A,
-                                              int lda, cuComplex *B, int ldb,
-                                              int *devInfo) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, int, const cuComplex *, int,
-      cuComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnCpotrs");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, nrhs, A, lda, B, ldb, devInfo);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZpotrs(cusolverDnHandle_t handle,
-                                              cublasFillMode_t uplo, int n,
-                                              int nrhs,
-                                              const cuDoubleComplex *A, int lda,
-                                              cuDoubleComplex *B, int ldb,
-                                              int *devInfo) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, int, const cuDoubleComplex *,
-      int, cuDoubleComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZpotrs");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, nrhs, A, lda, B, ldb, devInfo);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSpotrfBatched(cusolverDnHandle_t handle,
-                                                     cublasFillMode_t uplo,
-                                                     int n, float *Aarray[],
-                                                     int lda, int *infoArray,
-                                                     int batchSize) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, float *[], int, int *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSpotrfBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, Aarray, lda, infoArray, batchSize);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDpotrfBatched(cusolverDnHandle_t handle,
-                                                     cublasFillMode_t uplo,
-                                                     int n, double *Aarray[],
-                                                     int lda, int *infoArray,
-                                                     int batchSize) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, double *[], int, int *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDpotrfBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, Aarray, lda, infoArray, batchSize);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnCpotrfBatched(cusolverDnHandle_t handle,
-                                                     cublasFillMode_t uplo,
-                                                     int n, cuComplex *Aarray[],
-                                                     int lda, int *infoArray,
-                                                     int batchSize) {
-  using FuncPtr =
-      cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, cublasFillMode_t, int,
-                                      cuComplex *[], int, int *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnCpotrfBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, Aarray, lda, infoArray, batchSize);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZpotrfBatched(
-    cusolverDnHandle_t handle, cublasFillMode_t uplo, int n,
-    cuDoubleComplex *Aarray[], int lda, int *infoArray, int batchSize) {
-  using FuncPtr =
-      cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, cublasFillMode_t, int,
-                                      cuDoubleComplex *[], int, int *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZpotrfBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, Aarray, lda, infoArray, batchSize);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSpotrsBatched(
-    cusolverDnHandle_t handle, cublasFillMode_t uplo, int n,
-    int nrhs, /* only support rhs = 1*/
-    float *A[], int lda, float *B[], int ldb, int *d_info, int batchSize) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, int, float *[], int, float *[],
-      int, int *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSpotrsBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, nrhs, A, lda, B, ldb, d_info, batchSize);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDpotrsBatched(
-    cusolverDnHandle_t handle, cublasFillMode_t uplo, int n,
-    int nrhs, /* only support rhs = 1*/
-    double *A[], int lda, double *B[], int ldb, int *d_info, int batchSize) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, int, double *[], int,
-      double *[], int, int *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDpotrsBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, nrhs, A, lda, B, ldb, d_info, batchSize);
-}
-
-cusolverStatus_t CUSOLVERAPI
-cusolverDnCpotrsBatched(cusolverDnHandle_t handle, cublasFillMode_t uplo, int n,
-                        int nrhs, /* only support rhs = 1*/
-                        cuComplex *A[], int lda, cuComplex *B[], int ldb,
-                        int *d_info, int batchSize) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, int, cuComplex *[], int,
-      cuComplex *[], int, int *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnCpotrsBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, nrhs, A, lda, B, ldb, d_info, batchSize);
-}
-
-cusolverStatus_t CUSOLVERAPI
-cusolverDnZpotrsBatched(cusolverDnHandle_t handle, cublasFillMode_t uplo, int n,
-                        int nrhs, /* only support rhs = 1*/
-                        cuDoubleComplex *A[], int lda, cuDoubleComplex *B[],
-                        int ldb, int *d_info, int batchSize) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, int, cuDoubleComplex *[], int,
-      cuDoubleComplex *[], int, int *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZpotrsBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, nrhs, A, lda, B, ldb, d_info, batchSize);
-}
-
-cusolverStatus_t CUSOLVERAPI
-cusolverDnSpotri_bufferSize(cusolverDnHandle_t handle, cublasFillMode_t uplo,
-                            int n, float *A, int lda, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, float *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSpotri_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, A, lda, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI
-cusolverDnDpotri_bufferSize(cusolverDnHandle_t handle, cublasFillMode_t uplo,
-                            int n, double *A, int lda, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, double *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDpotri_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, A, lda, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI
-cusolverDnCpotri_bufferSize(cusolverDnHandle_t handle, cublasFillMode_t uplo,
-                            int n, cuComplex *A, int lda, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, cuComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnCpotri_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, A, lda, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI
-cusolverDnZpotri_bufferSize(cusolverDnHandle_t handle, cublasFillMode_t uplo,
-                            int n, cuDoubleComplex *A, int lda, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, cuDoubleComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZpotri_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, A, lda, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSpotri(cusolverDnHandle_t handle,
-                                              cublasFillMode_t uplo, int n,
-                                              float *A, int lda, float *work,
-                                              int lwork, int *devInfo) {
-  using FuncPtr =
-      cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, cublasFillMode_t, int,
-                                      float *, int, float *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSpotri");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, A, lda, work, lwork, devInfo);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDpotri(cusolverDnHandle_t handle,
-                                              cublasFillMode_t uplo, int n,
-                                              double *A, int lda, double *work,
-                                              int lwork, int *devInfo) {
-  using FuncPtr =
-      cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, cublasFillMode_t, int,
-                                      double *, int, double *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDpotri");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, A, lda, work, lwork, devInfo);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnCpotri(cusolverDnHandle_t handle,
-                                              cublasFillMode_t uplo, int n,
-                                              cuComplex *A, int lda,
-                                              cuComplex *work, int lwork,
-                                              int *devInfo) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, cuComplex *, int, cuComplex *,
-      int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnCpotri");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, A, lda, work, lwork, devInfo);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZpotri(cusolverDnHandle_t handle,
-                                              cublasFillMode_t uplo, int n,
-                                              cuDoubleComplex *A, int lda,
-                                              cuDoubleComplex *work, int lwork,
-                                              int *devInfo) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, cuDoubleComplex *, int,
-      cuDoubleComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZpotri");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, A, lda, work, lwork, devInfo);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnStrtri_bufferSize(
-    cusolverDnHandle_t handle, cublasFillMode_t uplo, cublasDiagType_t diag,
-    int n, float *A, int lda, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, cublasDiagType_t, int, float *, int,
-      int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnStrtri_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, diag, n, A, lda, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDtrtri_bufferSize(
-    cusolverDnHandle_t handle, cublasFillMode_t uplo, cublasDiagType_t diag,
-    int n, double *A, int lda, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, cublasDiagType_t, int, double *,
-      int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDtrtri_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, diag, n, A, lda, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnCtrtri_bufferSize(
-    cusolverDnHandle_t handle, cublasFillMode_t uplo, cublasDiagType_t diag,
-    int n, cuComplex *A, int lda, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, cublasDiagType_t, int, cuComplex *,
-      int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnCtrtri_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, diag, n, A, lda, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZtrtri_bufferSize(
-    cusolverDnHandle_t handle, cublasFillMode_t uplo, cublasDiagType_t diag,
-    int n, cuDoubleComplex *A, int lda, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, cublasDiagType_t, int,
-      cuDoubleComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZtrtri_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, diag, n, A, lda, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnStrtri(cusolverDnHandle_t handle,
-                                              cublasFillMode_t uplo,
-                                              cublasDiagType_t diag, int n,
-                                              float *A, int lda, float *work,
-                                              int lwork, int *devInfo) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, cublasDiagType_t, int, float *, int,
-      float *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnStrtri");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, diag, n, A, lda, work, lwork, devInfo);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDtrtri(cusolverDnHandle_t handle,
-                                              cublasFillMode_t uplo,
-                                              cublasDiagType_t diag, int n,
-                                              double *A, int lda, double *work,
-                                              int lwork, int *devInfo) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, cublasDiagType_t, int, double *,
-      int, double *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDtrtri");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, diag, n, A, lda, work, lwork, devInfo);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnCtrtri(
-    cusolverDnHandle_t handle, cublasFillMode_t uplo, cublasDiagType_t diag,
-    int n, cuComplex *A, int lda, cuComplex *work, int lwork, int *devInfo) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, cublasDiagType_t, int, cuComplex *,
-      int, cuComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnCtrtri");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, diag, n, A, lda, work, lwork, devInfo);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZtrtri(cusolverDnHandle_t handle,
-                                              cublasFillMode_t uplo,
-                                              cublasDiagType_t diag, int n,
-                                              cuDoubleComplex *A, int lda,
-                                              cuDoubleComplex *work, int lwork,
-                                              int *devInfo) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, cublasDiagType_t, int,
-      cuDoubleComplex *, int, cuDoubleComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZtrtri");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, diag, n, A, lda, work, lwork, devInfo);
-}
-
-cusolverStatus_t CUSOLVERAPI
-cusolverDnSlauum_bufferSize(cusolverDnHandle_t handle, cublasFillMode_t uplo,
-                            int n, float *A, int lda, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, float *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSlauum_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, A, lda, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI
-cusolverDnDlauum_bufferSize(cusolverDnHandle_t handle, cublasFillMode_t uplo,
-                            int n, double *A, int lda, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, double *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDlauum_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, A, lda, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI
-cusolverDnClauum_bufferSize(cusolverDnHandle_t handle, cublasFillMode_t uplo,
-                            int n, cuComplex *A, int lda, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, cuComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnClauum_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, A, lda, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI
-cusolverDnZlauum_bufferSize(cusolverDnHandle_t handle, cublasFillMode_t uplo,
-                            int n, cuDoubleComplex *A, int lda, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, cuDoubleComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZlauum_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, A, lda, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSlauum(cusolverDnHandle_t handle,
-                                              cublasFillMode_t uplo, int n,
-                                              float *A, int lda, float *work,
-                                              int lwork, int *devInfo) {
-  using FuncPtr =
-      cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, cublasFillMode_t, int,
-                                      float *, int, float *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSlauum");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, A, lda, work, lwork, devInfo);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDlauum(cusolverDnHandle_t handle,
-                                              cublasFillMode_t uplo, int n,
-                                              double *A, int lda, double *work,
-                                              int lwork, int *devInfo) {
-  using FuncPtr =
-      cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, cublasFillMode_t, int,
-                                      double *, int, double *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDlauum");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, A, lda, work, lwork, devInfo);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnClauum(cusolverDnHandle_t handle,
-                                              cublasFillMode_t uplo, int n,
-                                              cuComplex *A, int lda,
-                                              cuComplex *work, int lwork,
-                                              int *devInfo) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, cuComplex *, int, cuComplex *,
-      int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnClauum");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, A, lda, work, lwork, devInfo);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZlauum(cusolverDnHandle_t handle,
-                                              cublasFillMode_t uplo, int n,
-                                              cuDoubleComplex *A, int lda,
-                                              cuDoubleComplex *work, int lwork,
-                                              int *devInfo) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, cuDoubleComplex *, int,
-      cuDoubleComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZlauum");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, A, lda, work, lwork, devInfo);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSgetrf_bufferSize(
-    cusolverDnHandle_t handle, int m, int n, float *A, int lda, int *Lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, int, int,
-                                                  float *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSgetrf_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, A, lda, Lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDgetrf_bufferSize(
-    cusolverDnHandle_t handle, int m, int n, double *A, int lda, int *Lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, int, int,
-                                                  double *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDgetrf_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, A, lda, Lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI
-cusolverDnCgetrf_bufferSize(cusolverDnHandle_t handle, int m, int n,
-                            cuComplex *A, int lda, int *Lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, int, int,
-                                                  cuComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnCgetrf_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, A, lda, Lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI
-cusolverDnZgetrf_bufferSize(cusolverDnHandle_t handle, int m, int n,
-                            cuDoubleComplex *A, int lda, int *Lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, int, int, cuDoubleComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZgetrf_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, A, lda, Lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSgetrf(cusolverDnHandle_t handle, int m,
-                                              int n, float *A, int lda,
-                                              float *Workspace, int *devIpiv,
-                                              int *devInfo) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, int, int, float *, int, float *, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSgetrf");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, A, lda, Workspace, devIpiv, devInfo);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDgetrf(cusolverDnHandle_t handle, int m,
-                                              int n, double *A, int lda,
-                                              double *Workspace, int *devIpiv,
-                                              int *devInfo) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, int, int, double *, int, double *, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDgetrf");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, A, lda, Workspace, devIpiv, devInfo);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnCgetrf(cusolverDnHandle_t handle, int m,
-                                              int n, cuComplex *A, int lda,
-                                              cuComplex *Workspace,
-                                              int *devIpiv, int *devInfo) {
-  using FuncPtr =
-      cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, int, int, cuComplex *,
-                                      int, cuComplex *, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnCgetrf");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, A, lda, Workspace, devIpiv, devInfo);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZgetrf(cusolverDnHandle_t handle, int m,
-                                              int n, cuDoubleComplex *A,
-                                              int lda,
-                                              cuDoubleComplex *Workspace,
-                                              int *devIpiv, int *devInfo) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, int, int, cuDoubleComplex *, int, cuDoubleComplex *,
-      int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZgetrf");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, A, lda, Workspace, devIpiv, devInfo);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSlaswp(cusolverDnHandle_t handle, int n,
-                                              float *A, int lda, int k1, int k2,
-                                              const int *devIpiv, int incx) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, int, float *, int, int, int, const int *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSlaswp");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, A, lda, k1, k2, devIpiv, incx);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDlaswp(cusolverDnHandle_t handle, int n,
-                                              double *A, int lda, int k1,
-                                              int k2, const int *devIpiv,
-                                              int incx) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, int, double *, int, int, int, const int *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDlaswp");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, A, lda, k1, k2, devIpiv, incx);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnClaswp(cusolverDnHandle_t handle, int n,
-                                              cuComplex *A, int lda, int k1,
-                                              int k2, const int *devIpiv,
-                                              int incx) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, int, cuComplex *, int, int, int, const int *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnClaswp");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, A, lda, k1, k2, devIpiv, incx);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZlaswp(cusolverDnHandle_t handle, int n,
-                                              cuDoubleComplex *A, int lda,
-                                              int k1, int k2,
-                                              const int *devIpiv, int incx) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, int,
-                                                  cuDoubleComplex *, int, int,
-                                                  int, const int *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZlaswp");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, A, lda, k1, k2, devIpiv, incx);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSgetrs(cusolverDnHandle_t handle,
-                                              cublasOperation_t trans, int n,
-                                              int nrhs, const float *A, int lda,
-                                              const int *devIpiv, float *B,
-                                              int ldb, int *devInfo) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasOperation_t, int, int, const float *, int,
-      const int *, float *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSgetrs");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, trans, n, nrhs, A, lda, devIpiv, B, ldb, devInfo);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDgetrs(cusolverDnHandle_t handle,
-                                              cublasOperation_t trans, int n,
-                                              int nrhs, const double *A,
-                                              int lda, const int *devIpiv,
-                                              double *B, int ldb,
-                                              int *devInfo) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasOperation_t, int, int, const double *, int,
-      const int *, double *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDgetrs");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, trans, n, nrhs, A, lda, devIpiv, B, ldb, devInfo);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnCgetrs(cusolverDnHandle_t handle,
-                                              cublasOperation_t trans, int n,
-                                              int nrhs, const cuComplex *A,
-                                              int lda, const int *devIpiv,
-                                              cuComplex *B, int ldb,
-                                              int *devInfo) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasOperation_t, int, int, const cuComplex *, int,
-      const int *, cuComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnCgetrs");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, trans, n, nrhs, A, lda, devIpiv, B, ldb, devInfo);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZgetrs(
-    cusolverDnHandle_t handle, cublasOperation_t trans, int n, int nrhs,
-    const cuDoubleComplex *A, int lda, const int *devIpiv, cuDoubleComplex *B,
-    int ldb, int *devInfo) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasOperation_t, int, int, const cuDoubleComplex *,
-      int, const int *, cuDoubleComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZgetrs");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, trans, n, nrhs, A, lda, devIpiv, B, ldb, devInfo);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSgeqrf_bufferSize(
-    cusolverDnHandle_t handle, int m, int n, float *A, int lda, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, int, int,
-                                                  float *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSgeqrf_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, A, lda, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDgeqrf_bufferSize(
-    cusolverDnHandle_t handle, int m, int n, double *A, int lda, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, int, int,
-                                                  double *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDgeqrf_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, A, lda, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI
-cusolverDnCgeqrf_bufferSize(cusolverDnHandle_t handle, int m, int n,
-                            cuComplex *A, int lda, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, int, int,
-                                                  cuComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnCgeqrf_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, A, lda, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI
-cusolverDnZgeqrf_bufferSize(cusolverDnHandle_t handle, int m, int n,
-                            cuDoubleComplex *A, int lda, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, int, int, cuDoubleComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZgeqrf_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, A, lda, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSgeqrf(cusolverDnHandle_t handle, int m,
-                                              int n, float *A, int lda,
-                                              float *TAU, float *Workspace,
-                                              int Lwork, int *devInfo) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, int, int, float *, int, float *, float *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSgeqrf");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, A, lda, TAU, Workspace, Lwork, devInfo);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDgeqrf(cusolverDnHandle_t handle, int m,
-                                              int n, double *A, int lda,
-                                              double *TAU, double *Workspace,
-                                              int Lwork, int *devInfo) {
-  using FuncPtr =
-      cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, int, int, double *,
-                                      int, double *, double *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDgeqrf");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, A, lda, TAU, Workspace, Lwork, devInfo);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnCgeqrf(cusolverDnHandle_t handle, int m,
-                                              int n, cuComplex *A, int lda,
-                                              cuComplex *TAU,
-                                              cuComplex *Workspace, int Lwork,
-                                              int *devInfo) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, int, int,
-                                                  cuComplex *, int, cuComplex *,
-                                                  cuComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnCgeqrf");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, A, lda, TAU, Workspace, Lwork, devInfo);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZgeqrf(cusolverDnHandle_t handle, int m,
-                                              int n, cuDoubleComplex *A,
-                                              int lda, cuDoubleComplex *TAU,
-                                              cuDoubleComplex *Workspace,
-                                              int Lwork, int *devInfo) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, int, int, cuDoubleComplex *, int, cuDoubleComplex *,
-      cuDoubleComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZgeqrf");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, A, lda, TAU, Workspace, Lwork, devInfo);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSorgqr_bufferSize(
-    cusolverDnHandle_t handle, int m, int n, int k, const float *A, int lda,
-    const float *tau, int *lwork) {
-  using FuncPtr =
-      cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, int, int, int,
-                                      const float *, int, const float *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSorgqr_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, k, A, lda, tau, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDorgqr_bufferSize(
-    cusolverDnHandle_t handle, int m, int n, int k, const double *A, int lda,
-    const double *tau, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, int, int,
-                                                  int, const double *, int,
-                                                  const double *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDorgqr_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, k, A, lda, tau, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnCungqr_bufferSize(
-    cusolverDnHandle_t handle, int m, int n, int k, const cuComplex *A, int lda,
-    const cuComplex *tau, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, int, int,
-                                                  int, const cuComplex *, int,
-                                                  const cuComplex *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnCungqr_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, k, A, lda, tau, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZungqr_bufferSize(
-    cusolverDnHandle_t handle, int m, int n, int k, const cuDoubleComplex *A,
-    int lda, const cuDoubleComplex *tau, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, int, int, int, const cuDoubleComplex *, int,
-      const cuDoubleComplex *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZungqr_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, k, A, lda, tau, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSorgqr(cusolverDnHandle_t handle, int m,
-                                              int n, int k, float *A, int lda,
-                                              const float *tau, float *work,
-                                              int lwork, int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, int, int, int, float *, int, const float *, float *,
-      int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSorgqr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, k, A, lda, tau, work, lwork, info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDorgqr(cusolverDnHandle_t handle, int m,
-                                              int n, int k, double *A, int lda,
-                                              const double *tau, double *work,
-                                              int lwork, int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, int, int, int, double *, int, const double *,
-      double *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDorgqr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, k, A, lda, tau, work, lwork, info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnCungqr(cusolverDnHandle_t handle, int m,
-                                              int n, int k, cuComplex *A,
-                                              int lda, const cuComplex *tau,
-                                              cuComplex *work, int lwork,
-                                              int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, int, int, int, cuComplex *, int, const cuComplex *,
-      cuComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnCungqr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, k, A, lda, tau, work, lwork, info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZungqr(
-    cusolverDnHandle_t handle, int m, int n, int k, cuDoubleComplex *A, int lda,
-    const cuDoubleComplex *tau, cuDoubleComplex *work, int lwork, int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, int, int, int, cuDoubleComplex *, int,
-      const cuDoubleComplex *, cuDoubleComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZungqr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, k, A, lda, tau, work, lwork, info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSormqr_bufferSize(
-    cusolverDnHandle_t handle, cublasSideMode_t side, cublasOperation_t trans,
-    int m, int n, int k, const float *A, int lda, const float *tau,
-    const float *C, int ldc, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasSideMode_t, cublasOperation_t, int, int, int,
-      const float *, int, const float *, const float *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSormqr_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, side, trans, m, n, k, A, lda, tau, C, ldc, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDormqr_bufferSize(
-    cusolverDnHandle_t handle, cublasSideMode_t side, cublasOperation_t trans,
-    int m, int n, int k, const double *A, int lda, const double *tau,
-    const double *C, int ldc, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasSideMode_t, cublasOperation_t, int, int, int,
-      const double *, int, const double *, const double *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDormqr_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, side, trans, m, n, k, A, lda, tau, C, ldc, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnCunmqr_bufferSize(
-    cusolverDnHandle_t handle, cublasSideMode_t side, cublasOperation_t trans,
-    int m, int n, int k, const cuComplex *A, int lda, const cuComplex *tau,
-    const cuComplex *C, int ldc, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasSideMode_t, cublasOperation_t, int, int, int,
-      const cuComplex *, int, const cuComplex *, const cuComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnCunmqr_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, side, trans, m, n, k, A, lda, tau, C, ldc, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZunmqr_bufferSize(
-    cusolverDnHandle_t handle, cublasSideMode_t side, cublasOperation_t trans,
-    int m, int n, int k, const cuDoubleComplex *A, int lda,
-    const cuDoubleComplex *tau, const cuDoubleComplex *C, int ldc, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasSideMode_t, cublasOperation_t, int, int, int,
-      const cuDoubleComplex *, int, const cuDoubleComplex *,
-      const cuDoubleComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZunmqr_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, side, trans, m, n, k, A, lda, tau, C, ldc, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSormqr(
-    cusolverDnHandle_t handle, cublasSideMode_t side, cublasOperation_t trans,
-    int m, int n, int k, const float *A, int lda, const float *tau, float *C,
-    int ldc, float *work, int lwork, int *devInfo) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasSideMode_t, cublasOperation_t, int, int, int,
-      const float *, int, const float *, float *, int, float *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSormqr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, side, trans, m, n, k, A, lda, tau, C, ldc, work,
-                  lwork, devInfo);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDormqr(
-    cusolverDnHandle_t handle, cublasSideMode_t side, cublasOperation_t trans,
-    int m, int n, int k, const double *A, int lda, const double *tau, double *C,
-    int ldc, double *work, int lwork, int *devInfo) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasSideMode_t, cublasOperation_t, int, int, int,
-      const double *, int, const double *, double *, int, double *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDormqr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, side, trans, m, n, k, A, lda, tau, C, ldc, work,
-                  lwork, devInfo);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnCunmqr(
-    cusolverDnHandle_t handle, cublasSideMode_t side, cublasOperation_t trans,
-    int m, int n, int k, const cuComplex *A, int lda, const cuComplex *tau,
-    cuComplex *C, int ldc, cuComplex *work, int lwork, int *devInfo) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasSideMode_t, cublasOperation_t, int, int, int,
-      const cuComplex *, int, const cuComplex *, cuComplex *, int, cuComplex *,
-      int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnCunmqr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, side, trans, m, n, k, A, lda, tau, C, ldc, work,
-                  lwork, devInfo);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZunmqr(
-    cusolverDnHandle_t handle, cublasSideMode_t side, cublasOperation_t trans,
-    int m, int n, int k, const cuDoubleComplex *A, int lda,
-    const cuDoubleComplex *tau, cuDoubleComplex *C, int ldc,
-    cuDoubleComplex *work, int lwork, int *devInfo) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasSideMode_t, cublasOperation_t, int, int, int,
-      const cuDoubleComplex *, int, const cuDoubleComplex *, cuDoubleComplex *,
-      int, cuDoubleComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZunmqr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, side, trans, m, n, k, A, lda, tau, C, ldc, work,
-                  lwork, devInfo);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSsytrf_bufferSize(
-    cusolverDnHandle_t handle, int n, float *A, int lda, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, int,
-                                                  float *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSsytrf_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, A, lda, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDsytrf_bufferSize(
-    cusolverDnHandle_t handle, int n, double *A, int lda, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, int,
-                                                  double *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDsytrf_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, A, lda, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnCsytrf_bufferSize(
-    cusolverDnHandle_t handle, int n, cuComplex *A, int lda, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, int,
-                                                  cuComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnCsytrf_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, A, lda, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZsytrf_bufferSize(
-    cusolverDnHandle_t handle, int n, cuDoubleComplex *A, int lda, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, int, cuDoubleComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZsytrf_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, A, lda, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSsytrf(cusolverDnHandle_t handle,
-                                              cublasFillMode_t uplo, int n,
-                                              float *A, int lda, int *ipiv,
-                                              float *work, int lwork,
-                                              int *info) {
-  using FuncPtr =
-      cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, cublasFillMode_t, int,
-                                      float *, int, int *, float *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSsytrf");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, A, lda, ipiv, work, lwork, info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDsytrf(cusolverDnHandle_t handle,
-                                              cublasFillMode_t uplo, int n,
-                                              double *A, int lda, int *ipiv,
-                                              double *work, int lwork,
-                                              int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, double *, int, int *, double *,
-      int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDsytrf");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, A, lda, ipiv, work, lwork, info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnCsytrf(cusolverDnHandle_t handle,
-                                              cublasFillMode_t uplo, int n,
-                                              cuComplex *A, int lda, int *ipiv,
-                                              cuComplex *work, int lwork,
-                                              int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, cuComplex *, int, int *,
-      cuComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnCsytrf");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, A, lda, ipiv, work, lwork, info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZsytrf(cusolverDnHandle_t handle,
-                                              cublasFillMode_t uplo, int n,
-                                              cuDoubleComplex *A, int lda,
-                                              int *ipiv, cuDoubleComplex *work,
-                                              int lwork, int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, cuDoubleComplex *, int, int *,
-      cuDoubleComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZsytrf");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, A, lda, ipiv, work, lwork, info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSsytrs_bufferSize(
-    cusolverDnHandle_t handle, cublasFillMode_t uplo, int n, int nrhs,
-    const float *A, int lda, const int *ipiv, float *B, int ldb, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, int, const float *, int,
-      const int *, float *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSsytrs_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, nrhs, A, lda, ipiv, B, ldb, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDsytrs_bufferSize(
-    cusolverDnHandle_t handle, cublasFillMode_t uplo, int n, int nrhs,
-    const double *A, int lda, const int *ipiv, double *B, int ldb, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, int, const double *, int,
-      const int *, double *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDsytrs_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, nrhs, A, lda, ipiv, B, ldb, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnCsytrs_bufferSize(
-    cusolverDnHandle_t handle, cublasFillMode_t uplo, int n, int nrhs,
-    const cuComplex *A, int lda, const int *ipiv, cuComplex *B, int ldb,
-    int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, int, const cuComplex *, int,
-      const int *, cuComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnCsytrs_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, nrhs, A, lda, ipiv, B, ldb, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZsytrs_bufferSize(
-    cusolverDnHandle_t handle, cublasFillMode_t uplo, int n, int nrhs,
-    const cuDoubleComplex *A, int lda, const int *ipiv, cuDoubleComplex *B,
-    int ldb, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, int, const cuDoubleComplex *,
-      int, const int *, cuDoubleComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZsytrs_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, nrhs, A, lda, ipiv, B, ldb, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSsytrs(cusolverDnHandle_t handle,
-                                              cublasFillMode_t uplo, int n,
-                                              int nrhs, const float *A, int lda,
-                                              const int *ipiv, float *B,
-                                              int ldb, float *work, int lwork,
-                                              int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, int, const float *, int,
-      const int *, float *, int, float *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSsytrs");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, nrhs, A, lda, ipiv, B, ldb, work, lwork,
-                  info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDsytrs(cusolverDnHandle_t handle,
-                                              cublasFillMode_t uplo, int n,
-                                              int nrhs, const double *A,
-                                              int lda, const int *ipiv,
-                                              double *B, int ldb, double *work,
-                                              int lwork, int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, int, const double *, int,
-      const int *, double *, int, double *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDsytrs");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, nrhs, A, lda, ipiv, B, ldb, work, lwork,
-                  info);
-}
-
-cusolverStatus_t CUSOLVERAPI
-cusolverDnCsytrs(cusolverDnHandle_t handle, cublasFillMode_t uplo, int n,
-                 int nrhs, const cuComplex *A, int lda, const int *ipiv,
-                 cuComplex *B, int ldb, cuComplex *work, int lwork, int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, int, const cuComplex *, int,
-      const int *, cuComplex *, int, cuComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnCsytrs");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, nrhs, A, lda, ipiv, B, ldb, work, lwork,
-                  info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZsytrs(
-    cusolverDnHandle_t handle, cublasFillMode_t uplo, int n, int nrhs,
-    const cuDoubleComplex *A, int lda, const int *ipiv, cuDoubleComplex *B,
-    int ldb, cuDoubleComplex *work, int lwork, int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, int, const cuDoubleComplex *,
-      int, const int *, cuDoubleComplex *, int, cuDoubleComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZsytrs");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, nrhs, A, lda, ipiv, B, ldb, work, lwork,
-                  info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSsytri_bufferSize(
-    cusolverDnHandle_t handle, cublasFillMode_t uplo, int n, float *A, int lda,
-    const int *ipiv, int *lwork) {
-  using FuncPtr =
-      cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, cublasFillMode_t, int,
-                                      float *, int, const int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSsytri_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, A, lda, ipiv, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDsytri_bufferSize(
-    cusolverDnHandle_t handle, cublasFillMode_t uplo, int n, double *A, int lda,
-    const int *ipiv, int *lwork) {
-  using FuncPtr =
-      cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, cublasFillMode_t, int,
-                                      double *, int, const int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDsytri_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, A, lda, ipiv, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnCsytri_bufferSize(
-    cusolverDnHandle_t handle, cublasFillMode_t uplo, int n, cuComplex *A,
-    int lda, const int *ipiv, int *lwork) {
-  using FuncPtr =
-      cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, cublasFillMode_t, int,
-                                      cuComplex *, int, const int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnCsytri_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, A, lda, ipiv, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZsytri_bufferSize(
-    cusolverDnHandle_t handle, cublasFillMode_t uplo, int n, cuDoubleComplex *A,
-    int lda, const int *ipiv, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, cuDoubleComplex *, int,
-      const int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZsytri_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, A, lda, ipiv, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSsytri(cusolverDnHandle_t handle,
-                                              cublasFillMode_t uplo, int n,
-                                              float *A, int lda,
-                                              const int *ipiv, float *work,
-                                              int lwork, int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, float *, int, const int *,
-      float *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSsytri");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, A, lda, ipiv, work, lwork, info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDsytri(cusolverDnHandle_t handle,
-                                              cublasFillMode_t uplo, int n,
-                                              double *A, int lda,
-                                              const int *ipiv, double *work,
-                                              int lwork, int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, double *, int, const int *,
-      double *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDsytri");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, A, lda, ipiv, work, lwork, info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnCsytri(cusolverDnHandle_t handle,
-                                              cublasFillMode_t uplo, int n,
-                                              cuComplex *A, int lda,
-                                              const int *ipiv, cuComplex *work,
-                                              int lwork, int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, cuComplex *, int, const int *,
-      cuComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnCsytri");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, A, lda, ipiv, work, lwork, info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZsytri(
-    cusolverDnHandle_t handle, cublasFillMode_t uplo, int n, cuDoubleComplex *A,
-    int lda, const int *ipiv, cuDoubleComplex *work, int lwork, int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, cuDoubleComplex *, int,
-      const int *, cuDoubleComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZsytri");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, A, lda, ipiv, work, lwork, info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSgebrd_bufferSize(
-    cusolverDnHandle_t handle, int m, int n, int *Lwork) {
-  using FuncPtr =
-      cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, int, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSgebrd_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, Lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDgebrd_bufferSize(
-    cusolverDnHandle_t handle, int m, int n, int *Lwork) {
-  using FuncPtr =
-      cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, int, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDgebrd_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, Lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnCgebrd_bufferSize(
-    cusolverDnHandle_t handle, int m, int n, int *Lwork) {
-  using FuncPtr =
-      cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, int, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnCgebrd_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, Lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZgebrd_bufferSize(
-    cusolverDnHandle_t handle, int m, int n, int *Lwork) {
-  using FuncPtr =
-      cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, int, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZgebrd_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, Lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSgebrd(cusolverDnHandle_t handle, int m,
-                                              int n, float *A, int lda,
-                                              float *D, float *E, float *TAUQ,
-                                              float *TAUP, float *Work,
-                                              int Lwork, int *devInfo) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, int, int, float *, int, float *, float *, float *,
-      float *, float *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSgebrd");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, A, lda, D, E, TAUQ, TAUP, Work, Lwork, devInfo);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDgebrd(cusolverDnHandle_t handle, int m,
-                                              int n, double *A, int lda,
-                                              double *D, double *E,
-                                              double *TAUQ, double *TAUP,
-                                              double *Work, int Lwork,
-                                              int *devInfo) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, int, int, double *, int, double *, double *, double *,
-      double *, double *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDgebrd");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, A, lda, D, E, TAUQ, TAUP, Work, Lwork, devInfo);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnCgebrd(cusolverDnHandle_t handle, int m,
-                                              int n, cuComplex *A, int lda,
-                                              float *D, float *E,
-                                              cuComplex *TAUQ, cuComplex *TAUP,
-                                              cuComplex *Work, int Lwork,
-                                              int *devInfo) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, int, int, cuComplex *, int, float *, float *,
-      cuComplex *, cuComplex *, cuComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnCgebrd");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, A, lda, D, E, TAUQ, TAUP, Work, Lwork, devInfo);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZgebrd(
-    cusolverDnHandle_t handle, int m, int n, cuDoubleComplex *A, int lda,
-    double *D, double *E, cuDoubleComplex *TAUQ, cuDoubleComplex *TAUP,
-    cuDoubleComplex *Work, int Lwork, int *devInfo) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, int, int, cuDoubleComplex *, int, double *, double *,
-      cuDoubleComplex *, cuDoubleComplex *, cuDoubleComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZgebrd");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, A, lda, D, E, TAUQ, TAUP, Work, Lwork, devInfo);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSorgbr_bufferSize(
-    cusolverDnHandle_t handle, cublasSideMode_t side, int m, int n, int k,
-    const float *A, int lda, const float *tau, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasSideMode_t, int, int, int, const float *, int,
-      const float *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSorgbr_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, side, m, n, k, A, lda, tau, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDorgbr_bufferSize(
-    cusolverDnHandle_t handle, cublasSideMode_t side, int m, int n, int k,
-    const double *A, int lda, const double *tau, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasSideMode_t, int, int, int, const double *, int,
-      const double *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDorgbr_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, side, m, n, k, A, lda, tau, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnCungbr_bufferSize(
-    cusolverDnHandle_t handle, cublasSideMode_t side, int m, int n, int k,
-    const cuComplex *A, int lda, const cuComplex *tau, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasSideMode_t, int, int, int, const cuComplex *,
-      int, const cuComplex *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnCungbr_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, side, m, n, k, A, lda, tau, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZungbr_bufferSize(
-    cusolverDnHandle_t handle, cublasSideMode_t side, int m, int n, int k,
-    const cuDoubleComplex *A, int lda, const cuDoubleComplex *tau, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasSideMode_t, int, int, int,
-      const cuDoubleComplex *, int, const cuDoubleComplex *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZungbr_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, side, m, n, k, A, lda, tau, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSorgbr(cusolverDnHandle_t handle,
-                                              cublasSideMode_t side, int m,
-                                              int n, int k, float *A, int lda,
-                                              const float *tau, float *work,
-                                              int lwork, int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasSideMode_t, int, int, int, float *, int,
-      const float *, float *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSorgbr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, side, m, n, k, A, lda, tau, work, lwork, info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDorgbr(cusolverDnHandle_t handle,
-                                              cublasSideMode_t side, int m,
-                                              int n, int k, double *A, int lda,
-                                              const double *tau, double *work,
-                                              int lwork, int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasSideMode_t, int, int, int, double *, int,
-      const double *, double *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDorgbr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, side, m, n, k, A, lda, tau, work, lwork, info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnCungbr(cusolverDnHandle_t handle,
-                                              cublasSideMode_t side, int m,
-                                              int n, int k, cuComplex *A,
-                                              int lda, const cuComplex *tau,
-                                              cuComplex *work, int lwork,
-                                              int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasSideMode_t, int, int, int, cuComplex *, int,
-      const cuComplex *, cuComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnCungbr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, side, m, n, k, A, lda, tau, work, lwork, info);
-}
-
-cusolverStatus_t CUSOLVERAPI
-cusolverDnZungbr(cusolverDnHandle_t handle, cublasSideMode_t side, int m, int n,
-                 int k, cuDoubleComplex *A, int lda, const cuDoubleComplex *tau,
-                 cuDoubleComplex *work, int lwork, int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasSideMode_t, int, int, int, cuDoubleComplex *,
-      int, const cuDoubleComplex *, cuDoubleComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZungbr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, side, m, n, k, A, lda, tau, work, lwork, info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSsytrd_bufferSize(
-    cusolverDnHandle_t handle, cublasFillMode_t uplo, int n, const float *A,
-    int lda, const float *d, const float *e, const float *tau, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, const float *, int,
-      const float *, const float *, const float *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSsytrd_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, A, lda, d, e, tau, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDsytrd_bufferSize(
-    cusolverDnHandle_t handle, cublasFillMode_t uplo, int n, const double *A,
-    int lda, const double *d, const double *e, const double *tau, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, const double *, int,
-      const double *, const double *, const double *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDsytrd_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, A, lda, d, e, tau, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnChetrd_bufferSize(
-    cusolverDnHandle_t handle, cublasFillMode_t uplo, int n, const cuComplex *A,
-    int lda, const float *d, const float *e, const cuComplex *tau, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, const cuComplex *, int,
-      const float *, const float *, const cuComplex *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnChetrd_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, A, lda, d, e, tau, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZhetrd_bufferSize(
-    cusolverDnHandle_t handle, cublasFillMode_t uplo, int n,
-    const cuDoubleComplex *A, int lda, const double *d, const double *e,
-    const cuDoubleComplex *tau, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, const cuDoubleComplex *, int,
-      const double *, const double *, const cuDoubleComplex *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZhetrd_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, A, lda, d, e, tau, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSsytrd(cusolverDnHandle_t handle,
-                                              cublasFillMode_t uplo, int n,
-                                              float *A, int lda, float *d,
-                                              float *e, float *tau, float *work,
-                                              int lwork, int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, float *, int, float *, float *,
-      float *, float *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSsytrd");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, A, lda, d, e, tau, work, lwork, info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDsytrd(
-    cusolverDnHandle_t handle, cublasFillMode_t uplo, int n, double *A, int lda,
-    double *d, double *e, double *tau, double *work, int lwork, int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, double *, int, double *,
-      double *, double *, double *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDsytrd");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, A, lda, d, e, tau, work, lwork, info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnChetrd(cusolverDnHandle_t handle,
-                                              cublasFillMode_t uplo, int n,
-                                              cuComplex *A, int lda, float *d,
-                                              float *e, cuComplex *tau,
-                                              cuComplex *work, int lwork,
-                                              int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, cuComplex *, int, float *,
-      float *, cuComplex *, cuComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnChetrd");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, A, lda, d, e, tau, work, lwork, info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZhetrd(
-    cusolverDnHandle_t handle, cublasFillMode_t uplo, int n, cuDoubleComplex *A,
-    int lda, double *d, double *e, cuDoubleComplex *tau, cuDoubleComplex *work,
-    int lwork, int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, cuDoubleComplex *, int,
-      double *, double *, cuDoubleComplex *, cuDoubleComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZhetrd");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, A, lda, d, e, tau, work, lwork, info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSorgtr_bufferSize(
-    cusolverDnHandle_t handle, cublasFillMode_t uplo, int n, const float *A,
-    int lda, const float *tau, int *lwork) {
-  using FuncPtr =
-      cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, cublasFillMode_t, int,
-                                      const float *, int, const float *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSorgtr_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, A, lda, tau, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDorgtr_bufferSize(
-    cusolverDnHandle_t handle, cublasFillMode_t uplo, int n, const double *A,
-    int lda, const double *tau, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, const double *, int,
-      const double *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDorgtr_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, A, lda, tau, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnCungtr_bufferSize(
-    cusolverDnHandle_t handle, cublasFillMode_t uplo, int n, const cuComplex *A,
-    int lda, const cuComplex *tau, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, const cuComplex *, int,
-      const cuComplex *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnCungtr_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, A, lda, tau, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZungtr_bufferSize(
-    cusolverDnHandle_t handle, cublasFillMode_t uplo, int n,
-    const cuDoubleComplex *A, int lda, const cuDoubleComplex *tau, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, const cuDoubleComplex *, int,
-      const cuDoubleComplex *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZungtr_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, A, lda, tau, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSorgtr(cusolverDnHandle_t handle,
-                                              cublasFillMode_t uplo, int n,
-                                              float *A, int lda,
-                                              const float *tau, float *work,
-                                              int lwork, int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, float *, int, const float *,
-      float *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSorgtr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, A, lda, tau, work, lwork, info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDorgtr(cusolverDnHandle_t handle,
-                                              cublasFillMode_t uplo, int n,
-                                              double *A, int lda,
-                                              const double *tau, double *work,
-                                              int lwork, int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, double *, int, const double *,
-      double *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDorgtr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, A, lda, tau, work, lwork, info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnCungtr(
-    cusolverDnHandle_t handle, cublasFillMode_t uplo, int n, cuComplex *A,
-    int lda, const cuComplex *tau, cuComplex *work, int lwork, int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, cuComplex *, int,
-      const cuComplex *, cuComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnCungtr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, A, lda, tau, work, lwork, info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZungtr(cusolverDnHandle_t handle,
-                                              cublasFillMode_t uplo, int n,
-                                              cuDoubleComplex *A, int lda,
-                                              const cuDoubleComplex *tau,
-                                              cuDoubleComplex *work, int lwork,
-                                              int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, cuDoubleComplex *, int,
-      const cuDoubleComplex *, cuDoubleComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZungtr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, A, lda, tau, work, lwork, info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSormtr_bufferSize(
-    cusolverDnHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo,
-    cublasOperation_t trans, int m, int n, const float *A, int lda,
-    const float *tau, const float *C, int ldc, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasSideMode_t, cublasFillMode_t, cublasOperation_t,
-      int, int, const float *, int, const float *, const float *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSormtr_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, side, uplo, trans, m, n, A, lda, tau, C, ldc, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDormtr_bufferSize(
-    cusolverDnHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo,
-    cublasOperation_t trans, int m, int n, const double *A, int lda,
-    const double *tau, const double *C, int ldc, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasSideMode_t, cublasFillMode_t, cublasOperation_t,
-      int, int, const double *, int, const double *, const double *, int,
-      int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDormtr_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, side, uplo, trans, m, n, A, lda, tau, C, ldc, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnCunmtr_bufferSize(
-    cusolverDnHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo,
-    cublasOperation_t trans, int m, int n, const cuComplex *A, int lda,
-    const cuComplex *tau, const cuComplex *C, int ldc, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasSideMode_t, cublasFillMode_t, cublasOperation_t,
-      int, int, const cuComplex *, int, const cuComplex *, const cuComplex *,
-      int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnCunmtr_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, side, uplo, trans, m, n, A, lda, tau, C, ldc, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZunmtr_bufferSize(
-    cusolverDnHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo,
-    cublasOperation_t trans, int m, int n, const cuDoubleComplex *A, int lda,
-    const cuDoubleComplex *tau, const cuDoubleComplex *C, int ldc, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasSideMode_t, cublasFillMode_t, cublasOperation_t,
-      int, int, const cuDoubleComplex *, int, const cuDoubleComplex *,
-      const cuDoubleComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZunmtr_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, side, uplo, trans, m, n, A, lda, tau, C, ldc, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSormtr(
-    cusolverDnHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo,
-    cublasOperation_t trans, int m, int n, float *A, int lda, float *tau,
-    float *C, int ldc, float *work, int lwork, int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasSideMode_t, cublasFillMode_t, cublasOperation_t,
-      int, int, float *, int, float *, float *, int, float *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSormtr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, side, uplo, trans, m, n, A, lda, tau, C, ldc, work,
-                  lwork, info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDormtr(
-    cusolverDnHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo,
-    cublasOperation_t trans, int m, int n, double *A, int lda, double *tau,
-    double *C, int ldc, double *work, int lwork, int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasSideMode_t, cublasFillMode_t, cublasOperation_t,
-      int, int, double *, int, double *, double *, int, double *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDormtr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, side, uplo, trans, m, n, A, lda, tau, C, ldc, work,
-                  lwork, info);
-}
-
-cusolverStatus_t CUSOLVERAPI
-cusolverDnCunmtr(cusolverDnHandle_t handle, cublasSideMode_t side,
-                 cublasFillMode_t uplo, cublasOperation_t trans, int m, int n,
-                 cuComplex *A, int lda, cuComplex *tau, cuComplex *C, int ldc,
-                 cuComplex *work, int lwork, int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasSideMode_t, cublasFillMode_t, cublasOperation_t,
-      int, int, cuComplex *, int, cuComplex *, cuComplex *, int, cuComplex *,
-      int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnCunmtr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, side, uplo, trans, m, n, A, lda, tau, C, ldc, work,
-                  lwork, info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZunmtr(
-    cusolverDnHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo,
-    cublasOperation_t trans, int m, int n, cuDoubleComplex *A, int lda,
-    cuDoubleComplex *tau, cuDoubleComplex *C, int ldc, cuDoubleComplex *work,
-    int lwork, int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasSideMode_t, cublasFillMode_t, cublasOperation_t,
-      int, int, cuDoubleComplex *, int, cuDoubleComplex *, cuDoubleComplex *,
-      int, cuDoubleComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZunmtr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, side, uplo, trans, m, n, A, lda, tau, C, ldc, work,
-                  lwork, info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSgesvd_bufferSize(
-    cusolverDnHandle_t handle, int m, int n, int *lwork) {
-  using FuncPtr =
-      cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, int, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSgesvd_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDgesvd_bufferSize(
-    cusolverDnHandle_t handle, int m, int n, int *lwork) {
-  using FuncPtr =
-      cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, int, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDgesvd_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnCgesvd_bufferSize(
-    cusolverDnHandle_t handle, int m, int n, int *lwork) {
-  using FuncPtr =
-      cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, int, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnCgesvd_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZgesvd_bufferSize(
-    cusolverDnHandle_t handle, int m, int n, int *lwork) {
-  using FuncPtr =
-      cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, int, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZgesvd_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSgesvd(
-    cusolverDnHandle_t handle, signed char jobu, signed char jobvt, int m,
-    int n, float *A, int lda, float *S, float *U, int ldu, float *VT, int ldvt,
-    float *work, int lwork, float *rwork, int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, signed char, signed char, int, int, float *, int,
-      float *, float *, int, float *, int, float *, int, float *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSgesvd");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobu, jobvt, m, n, A, lda, S, U, ldu, VT, ldvt, work,
-                  lwork, rwork, info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDgesvd(
-    cusolverDnHandle_t handle, signed char jobu, signed char jobvt, int m,
-    int n, double *A, int lda, double *S, double *U, int ldu, double *VT,
-    int ldvt, double *work, int lwork, double *rwork, int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, signed char, signed char, int, int, double *, int,
-      double *, double *, int, double *, int, double *, int, double *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDgesvd");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobu, jobvt, m, n, A, lda, S, U, ldu, VT, ldvt, work,
-                  lwork, rwork, info);
-}
-
-cusolverStatus_t CUSOLVERAPI
-cusolverDnCgesvd(cusolverDnHandle_t handle, signed char jobu, signed char jobvt,
-                 int m, int n, cuComplex *A, int lda, float *S, cuComplex *U,
-                 int ldu, cuComplex *VT, int ldvt, cuComplex *work, int lwork,
-                 float *rwork, int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, signed char, signed char, int, int, cuComplex *, int,
-      float *, cuComplex *, int, cuComplex *, int, cuComplex *, int, float *,
-      int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnCgesvd");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobu, jobvt, m, n, A, lda, S, U, ldu, VT, ldvt, work,
-                  lwork, rwork, info);
-}
-
-cusolverStatus_t CUSOLVERAPI
-cusolverDnZgesvd(cusolverDnHandle_t handle, signed char jobu, signed char jobvt,
-                 int m, int n, cuDoubleComplex *A, int lda, double *S,
-                 cuDoubleComplex *U, int ldu, cuDoubleComplex *VT, int ldvt,
-                 cuDoubleComplex *work, int lwork, double *rwork, int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, signed char, signed char, int, int, cuDoubleComplex *,
-      int, double *, cuDoubleComplex *, int, cuDoubleComplex *, int,
-      cuDoubleComplex *, int, double *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZgesvd");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobu, jobvt, m, n, A, lda, S, U, ldu, VT, ldvt, work,
-                  lwork, rwork, info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSsyevd_bufferSize(
-    cusolverDnHandle_t handle, cusolverEigMode_t jobz, cublasFillMode_t uplo,
-    int n, const float *A, int lda, const float *W, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, cublasFillMode_t, int,
-      const float *, int, const float *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSsyevd_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, uplo, n, A, lda, W, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDsyevd_bufferSize(
-    cusolverDnHandle_t handle, cusolverEigMode_t jobz, cublasFillMode_t uplo,
-    int n, const double *A, int lda, const double *W, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, cublasFillMode_t, int,
-      const double *, int, const double *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDsyevd_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, uplo, n, A, lda, W, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnCheevd_bufferSize(
-    cusolverDnHandle_t handle, cusolverEigMode_t jobz, cublasFillMode_t uplo,
-    int n, const cuComplex *A, int lda, const float *W, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, cublasFillMode_t, int,
-      const cuComplex *, int, const float *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnCheevd_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, uplo, n, A, lda, W, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZheevd_bufferSize(
-    cusolverDnHandle_t handle, cusolverEigMode_t jobz, cublasFillMode_t uplo,
-    int n, const cuDoubleComplex *A, int lda, const double *W, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, cublasFillMode_t, int,
-      const cuDoubleComplex *, int, const double *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZheevd_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, uplo, n, A, lda, W, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSsyevd(
-    cusolverDnHandle_t handle, cusolverEigMode_t jobz, cublasFillMode_t uplo,
-    int n, float *A, int lda, float *W, float *work, int lwork, int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, cublasFillMode_t, int, float *,
-      int, float *, float *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSsyevd");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, uplo, n, A, lda, W, work, lwork, info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDsyevd(
-    cusolverDnHandle_t handle, cusolverEigMode_t jobz, cublasFillMode_t uplo,
-    int n, double *A, int lda, double *W, double *work, int lwork, int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, cublasFillMode_t, int, double *,
-      int, double *, double *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDsyevd");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, uplo, n, A, lda, W, work, lwork, info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnCheevd(cusolverDnHandle_t handle,
-                                              cusolverEigMode_t jobz,
-                                              cublasFillMode_t uplo, int n,
-                                              cuComplex *A, int lda, float *W,
-                                              cuComplex *work, int lwork,
-                                              int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, cublasFillMode_t, int, cuComplex *,
-      int, float *, cuComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnCheevd");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, uplo, n, A, lda, W, work, lwork, info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZheevd(cusolverDnHandle_t handle,
-                                              cusolverEigMode_t jobz,
-                                              cublasFillMode_t uplo, int n,
-                                              cuDoubleComplex *A, int lda,
-                                              double *W, cuDoubleComplex *work,
-                                              int lwork, int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, cublasFillMode_t, int,
-      cuDoubleComplex *, int, double *, cuDoubleComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZheevd");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, uplo, n, A, lda, W, work, lwork, info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSsyevdx_bufferSize(
-    cusolverDnHandle_t handle, cusolverEigMode_t jobz, cusolverEigRange_t range,
-    cublasFillMode_t uplo, int n, const float *A, int lda, float vl, float vu,
-    int il, int iu, int *meig, const float *W, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, cusolverEigRange_t,
-      cublasFillMode_t, int, const float *, int, float, float, int, int, int *,
-      const float *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSsyevdx_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, range, uplo, n, A, lda, vl, vu, il, iu, meig, W,
-                  lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDsyevdx_bufferSize(
-    cusolverDnHandle_t handle, cusolverEigMode_t jobz, cusolverEigRange_t range,
-    cublasFillMode_t uplo, int n, const double *A, int lda, double vl,
-    double vu, int il, int iu, int *meig, const double *W, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, cusolverEigRange_t,
-      cublasFillMode_t, int, const double *, int, double, double, int, int,
-      int *, const double *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDsyevdx_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, range, uplo, n, A, lda, vl, vu, il, iu, meig, W,
-                  lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnCheevdx_bufferSize(
-    cusolverDnHandle_t handle, cusolverEigMode_t jobz, cusolverEigRange_t range,
-    cublasFillMode_t uplo, int n, const cuComplex *A, int lda, float vl,
-    float vu, int il, int iu, int *meig, const float *W, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, cusolverEigRange_t,
-      cublasFillMode_t, int, const cuComplex *, int, float, float, int, int,
-      int *, const float *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnCheevdx_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, range, uplo, n, A, lda, vl, vu, il, iu, meig, W,
-                  lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZheevdx_bufferSize(
-    cusolverDnHandle_t handle, cusolverEigMode_t jobz, cusolverEigRange_t range,
-    cublasFillMode_t uplo, int n, const cuDoubleComplex *A, int lda, double vl,
-    double vu, int il, int iu, int *meig, const double *W, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, cusolverEigRange_t,
-      cublasFillMode_t, int, const cuDoubleComplex *, int, double, double, int,
-      int, int *, const double *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZheevdx_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, range, uplo, n, A, lda, vl, vu, il, iu, meig, W,
-                  lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSsyevdx(
-    cusolverDnHandle_t handle, cusolverEigMode_t jobz, cusolverEigRange_t range,
-    cublasFillMode_t uplo, int n, float *A, int lda, float vl, float vu, int il,
-    int iu, int *meig, float *W, float *work, int lwork, int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, cusolverEigRange_t,
-      cublasFillMode_t, int, float *, int, float, float, int, int, int *,
-      float *, float *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSsyevdx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, range, uplo, n, A, lda, vl, vu, il, iu, meig, W,
-                  work, lwork, info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDsyevdx(
-    cusolverDnHandle_t handle, cusolverEigMode_t jobz, cusolverEigRange_t range,
-    cublasFillMode_t uplo, int n, double *A, int lda, double vl, double vu,
-    int il, int iu, int *meig, double *W, double *work, int lwork, int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, cusolverEigRange_t,
-      cublasFillMode_t, int, double *, int, double, double, int, int, int *,
-      double *, double *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDsyevdx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, range, uplo, n, A, lda, vl, vu, il, iu, meig, W,
-                  work, lwork, info);
-}
-
-cusolverStatus_t CUSOLVERAPI
-cusolverDnCheevdx(cusolverDnHandle_t handle, cusolverEigMode_t jobz,
-                  cusolverEigRange_t range, cublasFillMode_t uplo, int n,
-                  cuComplex *A, int lda, float vl, float vu, int il, int iu,
-                  int *meig, float *W, cuComplex *work, int lwork, int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, cusolverEigRange_t,
-      cublasFillMode_t, int, cuComplex *, int, float, float, int, int, int *,
-      float *, cuComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnCheevdx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, range, uplo, n, A, lda, vl, vu, il, iu, meig, W,
-                  work, lwork, info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZheevdx(
-    cusolverDnHandle_t handle, cusolverEigMode_t jobz, cusolverEigRange_t range,
-    cublasFillMode_t uplo, int n, cuDoubleComplex *A, int lda, double vl,
-    double vu, int il, int iu, int *meig, double *W, cuDoubleComplex *work,
-    int lwork, int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, cusolverEigRange_t,
-      cublasFillMode_t, int, cuDoubleComplex *, int, double, double, int, int,
-      int *, double *, cuDoubleComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZheevdx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, range, uplo, n, A, lda, vl, vu, il, iu, meig, W,
-                  work, lwork, info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSsygvdx_bufferSize(
-    cusolverDnHandle_t handle, cusolverEigType_t itype, cusolverEigMode_t jobz,
-    cusolverEigRange_t range, cublasFillMode_t uplo, int n, const float *A,
-    int lda, const float *B, int ldb, float vl, float vu, int il, int iu,
-    int *meig, const float *W, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigType_t, cusolverEigMode_t,
-      cusolverEigRange_t, cublasFillMode_t, int, const float *, int,
-      const float *, int, float, float, int, int, int *, const float *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSsygvdx_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, itype, jobz, range, uplo, n, A, lda, B, ldb, vl, vu,
-                  il, iu, meig, W, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDsygvdx_bufferSize(
-    cusolverDnHandle_t handle, cusolverEigType_t itype, cusolverEigMode_t jobz,
-    cusolverEigRange_t range, cublasFillMode_t uplo, int n, const double *A,
-    int lda, const double *B, int ldb, double vl, double vu, int il, int iu,
-    int *meig, const double *W, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigType_t, cusolverEigMode_t,
-      cusolverEigRange_t, cublasFillMode_t, int, const double *, int,
-      const double *, int, double, double, int, int, int *, const double *,
-      int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDsygvdx_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, itype, jobz, range, uplo, n, A, lda, B, ldb, vl, vu,
-                  il, iu, meig, W, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnChegvdx_bufferSize(
-    cusolverDnHandle_t handle, cusolverEigType_t itype, cusolverEigMode_t jobz,
-    cusolverEigRange_t range, cublasFillMode_t uplo, int n, const cuComplex *A,
-    int lda, const cuComplex *B, int ldb, float vl, float vu, int il, int iu,
-    int *meig, const float *W, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigType_t, cusolverEigMode_t,
-      cusolverEigRange_t, cublasFillMode_t, int, const cuComplex *, int,
-      const cuComplex *, int, float, float, int, int, int *, const float *,
-      int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnChegvdx_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, itype, jobz, range, uplo, n, A, lda, B, ldb, vl, vu,
-                  il, iu, meig, W, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZhegvdx_bufferSize(
-    cusolverDnHandle_t handle, cusolverEigType_t itype, cusolverEigMode_t jobz,
-    cusolverEigRange_t range, cublasFillMode_t uplo, int n,
-    const cuDoubleComplex *A, int lda, const cuDoubleComplex *B, int ldb,
-    double vl, double vu, int il, int iu, int *meig, const double *W,
-    int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigType_t, cusolverEigMode_t,
-      cusolverEigRange_t, cublasFillMode_t, int, const cuDoubleComplex *, int,
-      const cuDoubleComplex *, int, double, double, int, int, int *,
-      const double *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZhegvdx_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, itype, jobz, range, uplo, n, A, lda, B, ldb, vl, vu,
-                  il, iu, meig, W, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSsygvdx(
-    cusolverDnHandle_t handle, cusolverEigType_t itype, cusolverEigMode_t jobz,
-    cusolverEigRange_t range, cublasFillMode_t uplo, int n, float *A, int lda,
-    float *B, int ldb, float vl, float vu, int il, int iu, int *meig, float *W,
-    float *work, int lwork, int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigType_t, cusolverEigMode_t,
-      cusolverEigRange_t, cublasFillMode_t, int, float *, int, float *, int,
-      float, float, int, int, int *, float *, float *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSsygvdx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, itype, jobz, range, uplo, n, A, lda, B, ldb, vl, vu,
-                  il, iu, meig, W, work, lwork, info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDsygvdx(
-    cusolverDnHandle_t handle, cusolverEigType_t itype, cusolverEigMode_t jobz,
-    cusolverEigRange_t range, cublasFillMode_t uplo, int n, double *A, int lda,
-    double *B, int ldb, double vl, double vu, int il, int iu, int *meig,
-    double *W, double *work, int lwork, int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigType_t, cusolverEigMode_t,
-      cusolverEigRange_t, cublasFillMode_t, int, double *, int, double *, int,
-      double, double, int, int, int *, double *, double *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDsygvdx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, itype, jobz, range, uplo, n, A, lda, B, ldb, vl, vu,
-                  il, iu, meig, W, work, lwork, info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnChegvdx(
-    cusolverDnHandle_t handle, cusolverEigType_t itype, cusolverEigMode_t jobz,
-    cusolverEigRange_t range, cublasFillMode_t uplo, int n, cuComplex *A,
-    int lda, cuComplex *B, int ldb, float vl, float vu, int il, int iu,
-    int *meig, float *W, cuComplex *work, int lwork, int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigType_t, cusolverEigMode_t,
-      cusolverEigRange_t, cublasFillMode_t, int, cuComplex *, int, cuComplex *,
-      int, float, float, int, int, int *, float *, cuComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnChegvdx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, itype, jobz, range, uplo, n, A, lda, B, ldb, vl, vu,
-                  il, iu, meig, W, work, lwork, info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZhegvdx(
-    cusolverDnHandle_t handle, cusolverEigType_t itype, cusolverEigMode_t jobz,
-    cusolverEigRange_t range, cublasFillMode_t uplo, int n, cuDoubleComplex *A,
-    int lda, cuDoubleComplex *B, int ldb, double vl, double vu, int il, int iu,
-    int *meig, double *W, cuDoubleComplex *work, int lwork, int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigType_t, cusolverEigMode_t,
-      cusolverEigRange_t, cublasFillMode_t, int, cuDoubleComplex *, int,
-      cuDoubleComplex *, int, double, double, int, int, int *, double *,
-      cuDoubleComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZhegvdx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, itype, jobz, range, uplo, n, A, lda, B, ldb, vl, vu,
-                  il, iu, meig, W, work, lwork, info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSsygvd_bufferSize(
-    cusolverDnHandle_t handle, cusolverEigType_t itype, cusolverEigMode_t jobz,
-    cublasFillMode_t uplo, int n, const float *A, int lda, const float *B,
-    int ldb, const float *W, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigType_t, cusolverEigMode_t,
-      cublasFillMode_t, int, const float *, int, const float *, int,
-      const float *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSsygvd_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, itype, jobz, uplo, n, A, lda, B, ldb, W, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDsygvd_bufferSize(
-    cusolverDnHandle_t handle, cusolverEigType_t itype, cusolverEigMode_t jobz,
-    cublasFillMode_t uplo, int n, const double *A, int lda, const double *B,
-    int ldb, const double *W, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigType_t, cusolverEigMode_t,
-      cublasFillMode_t, int, const double *, int, const double *, int,
-      const double *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDsygvd_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, itype, jobz, uplo, n, A, lda, B, ldb, W, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnChegvd_bufferSize(
-    cusolverDnHandle_t handle, cusolverEigType_t itype, cusolverEigMode_t jobz,
-    cublasFillMode_t uplo, int n, const cuComplex *A, int lda,
-    const cuComplex *B, int ldb, const float *W, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigType_t, cusolverEigMode_t,
-      cublasFillMode_t, int, const cuComplex *, int, const cuComplex *, int,
-      const float *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnChegvd_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, itype, jobz, uplo, n, A, lda, B, ldb, W, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZhegvd_bufferSize(
-    cusolverDnHandle_t handle, cusolverEigType_t itype, cusolverEigMode_t jobz,
-    cublasFillMode_t uplo, int n, const cuDoubleComplex *A, int lda,
-    const cuDoubleComplex *B, int ldb, const double *W, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigType_t, cusolverEigMode_t,
-      cublasFillMode_t, int, const cuDoubleComplex *, int,
-      const cuDoubleComplex *, int, const double *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZhegvd_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, itype, jobz, uplo, n, A, lda, B, ldb, W, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSsygvd(
-    cusolverDnHandle_t handle, cusolverEigType_t itype, cusolverEigMode_t jobz,
-    cublasFillMode_t uplo, int n, float *A, int lda, float *B, int ldb,
-    float *W, float *work, int lwork, int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigType_t, cusolverEigMode_t,
-      cublasFillMode_t, int, float *, int, float *, int, float *, float *, int,
-      int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSsygvd");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, itype, jobz, uplo, n, A, lda, B, ldb, W, work, lwork,
-                  info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDsygvd(
-    cusolverDnHandle_t handle, cusolverEigType_t itype, cusolverEigMode_t jobz,
-    cublasFillMode_t uplo, int n, double *A, int lda, double *B, int ldb,
-    double *W, double *work, int lwork, int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigType_t, cusolverEigMode_t,
-      cublasFillMode_t, int, double *, int, double *, int, double *, double *,
-      int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDsygvd");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, itype, jobz, uplo, n, A, lda, B, ldb, W, work, lwork,
-                  info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnChegvd(
-    cusolverDnHandle_t handle, cusolverEigType_t itype, cusolverEigMode_t jobz,
-    cublasFillMode_t uplo, int n, cuComplex *A, int lda, cuComplex *B, int ldb,
-    float *W, cuComplex *work, int lwork, int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigType_t, cusolverEigMode_t,
-      cublasFillMode_t, int, cuComplex *, int, cuComplex *, int, float *,
-      cuComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnChegvd");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, itype, jobz, uplo, n, A, lda, B, ldb, W, work, lwork,
-                  info);
-}
-
-cusolverStatus_t CUSOLVERAPI
-cusolverDnZhegvd(cusolverDnHandle_t handle, cusolverEigType_t itype,
-                 cusolverEigMode_t jobz, cublasFillMode_t uplo, int n,
-                 cuDoubleComplex *A, int lda, cuDoubleComplex *B, int ldb,
-                 double *W, cuDoubleComplex *work, int lwork, int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigType_t, cusolverEigMode_t,
-      cublasFillMode_t, int, cuDoubleComplex *, int, cuDoubleComplex *, int,
-      double *, cuDoubleComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZhegvd");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, itype, jobz, uplo, n, A, lda, B, ldb, W, work, lwork,
-                  info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnCreateSyevjInfo(syevjInfo_t *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(syevjInfo_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnCreateSyevjInfo");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDestroySyevjInfo(syevjInfo_t info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(syevjInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDestroySyevjInfo");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnXsyevjSetTolerance(syevjInfo_t info,
-                                                          double tolerance) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(syevjInfo_t, double);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnXsyevjSetTolerance");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(info, tolerance);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnXsyevjSetMaxSweeps(syevjInfo_t info,
-                                                          int max_sweeps) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(syevjInfo_t, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnXsyevjSetMaxSweeps");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(info, max_sweeps);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnXsyevjSetSortEig(syevjInfo_t info,
-                                                        int sort_eig) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(syevjInfo_t, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnXsyevjSetSortEig");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(info, sort_eig);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnXsyevjGetResidual(
-    cusolverDnHandle_t handle, syevjInfo_t info, double *residual) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t,
-                                                  syevjInfo_t, double *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnXsyevjGetResidual");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, info, residual);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnXsyevjGetSweeps(
-    cusolverDnHandle_t handle, syevjInfo_t info, int *executed_sweeps) {
-  using FuncPtr =
-      cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, syevjInfo_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnXsyevjGetSweeps");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, info, executed_sweeps);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSsyevjBatched_bufferSize(
-    cusolverDnHandle_t handle, cusolverEigMode_t jobz, cublasFillMode_t uplo,
-    int n, const float *A, int lda, const float *W, int *lwork,
-    syevjInfo_t params, int batchSize) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, cublasFillMode_t, int,
-      const float *, int, const float *, int *, syevjInfo_t, int);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusolverDnSsyevjBatched_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, uplo, n, A, lda, W, lwork, params, batchSize);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDsyevjBatched_bufferSize(
-    cusolverDnHandle_t handle, cusolverEigMode_t jobz, cublasFillMode_t uplo,
-    int n, const double *A, int lda, const double *W, int *lwork,
-    syevjInfo_t params, int batchSize) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, cublasFillMode_t, int,
-      const double *, int, const double *, int *, syevjInfo_t, int);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusolverDnDsyevjBatched_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, uplo, n, A, lda, W, lwork, params, batchSize);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnCheevjBatched_bufferSize(
-    cusolverDnHandle_t handle, cusolverEigMode_t jobz, cublasFillMode_t uplo,
-    int n, const cuComplex *A, int lda, const float *W, int *lwork,
-    syevjInfo_t params, int batchSize) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, cublasFillMode_t, int,
-      const cuComplex *, int, const float *, int *, syevjInfo_t, int);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusolverDnCheevjBatched_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, uplo, n, A, lda, W, lwork, params, batchSize);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZheevjBatched_bufferSize(
-    cusolverDnHandle_t handle, cusolverEigMode_t jobz, cublasFillMode_t uplo,
-    int n, const cuDoubleComplex *A, int lda, const double *W, int *lwork,
-    syevjInfo_t params, int batchSize) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, cublasFillMode_t, int,
-      const cuDoubleComplex *, int, const double *, int *, syevjInfo_t, int);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusolverDnZheevjBatched_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, uplo, n, A, lda, W, lwork, params, batchSize);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSsyevjBatched(
-    cusolverDnHandle_t handle, cusolverEigMode_t jobz, cublasFillMode_t uplo,
-    int n, float *A, int lda, float *W, float *work, int lwork, int *info,
-    syevjInfo_t params, int batchSize) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, cublasFillMode_t, int, float *,
-      int, float *, float *, int, int *, syevjInfo_t, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSsyevjBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, uplo, n, A, lda, W, work, lwork, info, params,
-                  batchSize);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDsyevjBatched(
-    cusolverDnHandle_t handle, cusolverEigMode_t jobz, cublasFillMode_t uplo,
-    int n, double *A, int lda, double *W, double *work, int lwork, int *info,
-    syevjInfo_t params, int batchSize) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, cublasFillMode_t, int, double *,
-      int, double *, double *, int, int *, syevjInfo_t, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDsyevjBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, uplo, n, A, lda, W, work, lwork, info, params,
-                  batchSize);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnCheevjBatched(
-    cusolverDnHandle_t handle, cusolverEigMode_t jobz, cublasFillMode_t uplo,
-    int n, cuComplex *A, int lda, float *W, cuComplex *work, int lwork,
-    int *info, syevjInfo_t params, int batchSize) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, cublasFillMode_t, int, cuComplex *,
-      int, float *, cuComplex *, int, int *, syevjInfo_t, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnCheevjBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, uplo, n, A, lda, W, work, lwork, info, params,
-                  batchSize);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZheevjBatched(
-    cusolverDnHandle_t handle, cusolverEigMode_t jobz, cublasFillMode_t uplo,
-    int n, cuDoubleComplex *A, int lda, double *W, cuDoubleComplex *work,
-    int lwork, int *info, syevjInfo_t params, int batchSize) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, cublasFillMode_t, int,
-      cuDoubleComplex *, int, double *, cuDoubleComplex *, int, int *,
-      syevjInfo_t, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZheevjBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, uplo, n, A, lda, W, work, lwork, info, params,
-                  batchSize);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSsyevj_bufferSize(
-    cusolverDnHandle_t handle, cusolverEigMode_t jobz, cublasFillMode_t uplo,
-    int n, const float *A, int lda, const float *W, int *lwork,
-    syevjInfo_t params) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, cublasFillMode_t, int,
-      const float *, int, const float *, int *, syevjInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSsyevj_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, uplo, n, A, lda, W, lwork, params);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDsyevj_bufferSize(
-    cusolverDnHandle_t handle, cusolverEigMode_t jobz, cublasFillMode_t uplo,
-    int n, const double *A, int lda, const double *W, int *lwork,
-    syevjInfo_t params) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, cublasFillMode_t, int,
-      const double *, int, const double *, int *, syevjInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDsyevj_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, uplo, n, A, lda, W, lwork, params);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnCheevj_bufferSize(
-    cusolverDnHandle_t handle, cusolverEigMode_t jobz, cublasFillMode_t uplo,
-    int n, const cuComplex *A, int lda, const float *W, int *lwork,
-    syevjInfo_t params) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, cublasFillMode_t, int,
-      const cuComplex *, int, const float *, int *, syevjInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnCheevj_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, uplo, n, A, lda, W, lwork, params);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZheevj_bufferSize(
-    cusolverDnHandle_t handle, cusolverEigMode_t jobz, cublasFillMode_t uplo,
-    int n, const cuDoubleComplex *A, int lda, const double *W, int *lwork,
-    syevjInfo_t params) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, cublasFillMode_t, int,
-      const cuDoubleComplex *, int, const double *, int *, syevjInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZheevj_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, uplo, n, A, lda, W, lwork, params);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSsyevj(cusolverDnHandle_t handle,
-                                              cusolverEigMode_t jobz,
-                                              cublasFillMode_t uplo, int n,
-                                              float *A, int lda, float *W,
-                                              float *work, int lwork, int *info,
-                                              syevjInfo_t params) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, cublasFillMode_t, int, float *,
-      int, float *, float *, int, int *, syevjInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSsyevj");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, uplo, n, A, lda, W, work, lwork, info, params);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDsyevj(cusolverDnHandle_t handle,
-                                              cusolverEigMode_t jobz,
-                                              cublasFillMode_t uplo, int n,
-                                              double *A, int lda, double *W,
-                                              double *work, int lwork,
-                                              int *info, syevjInfo_t params) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, cublasFillMode_t, int, double *,
-      int, double *, double *, int, int *, syevjInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDsyevj");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, uplo, n, A, lda, W, work, lwork, info, params);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnCheevj(cusolverDnHandle_t handle,
-                                              cusolverEigMode_t jobz,
-                                              cublasFillMode_t uplo, int n,
-                                              cuComplex *A, int lda, float *W,
-                                              cuComplex *work, int lwork,
-                                              int *info, syevjInfo_t params) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, cublasFillMode_t, int, cuComplex *,
-      int, float *, cuComplex *, int, int *, syevjInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnCheevj");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, uplo, n, A, lda, W, work, lwork, info, params);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZheevj(
-    cusolverDnHandle_t handle, cusolverEigMode_t jobz, cublasFillMode_t uplo,
-    int n, cuDoubleComplex *A, int lda, double *W, cuDoubleComplex *work,
-    int lwork, int *info, syevjInfo_t params) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, cublasFillMode_t, int,
-      cuDoubleComplex *, int, double *, cuDoubleComplex *, int, int *,
-      syevjInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZheevj");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, uplo, n, A, lda, W, work, lwork, info, params);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSsygvj_bufferSize(
-    cusolverDnHandle_t handle, cusolverEigType_t itype, cusolverEigMode_t jobz,
-    cublasFillMode_t uplo, int n, const float *A, int lda, const float *B,
-    int ldb, const float *W, int *lwork, syevjInfo_t params) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigType_t, cusolverEigMode_t,
-      cublasFillMode_t, int, const float *, int, const float *, int,
-      const float *, int *, syevjInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSsygvj_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, itype, jobz, uplo, n, A, lda, B, ldb, W, lwork,
-                  params);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDsygvj_bufferSize(
-    cusolverDnHandle_t handle, cusolverEigType_t itype, cusolverEigMode_t jobz,
-    cublasFillMode_t uplo, int n, const double *A, int lda, const double *B,
-    int ldb, const double *W, int *lwork, syevjInfo_t params) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigType_t, cusolverEigMode_t,
-      cublasFillMode_t, int, const double *, int, const double *, int,
-      const double *, int *, syevjInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDsygvj_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, itype, jobz, uplo, n, A, lda, B, ldb, W, lwork,
-                  params);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnChegvj_bufferSize(
-    cusolverDnHandle_t handle, cusolverEigType_t itype, cusolverEigMode_t jobz,
-    cublasFillMode_t uplo, int n, const cuComplex *A, int lda,
-    const cuComplex *B, int ldb, const float *W, int *lwork,
-    syevjInfo_t params) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigType_t, cusolverEigMode_t,
-      cublasFillMode_t, int, const cuComplex *, int, const cuComplex *, int,
-      const float *, int *, syevjInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnChegvj_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, itype, jobz, uplo, n, A, lda, B, ldb, W, lwork,
-                  params);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZhegvj_bufferSize(
-    cusolverDnHandle_t handle, cusolverEigType_t itype, cusolverEigMode_t jobz,
-    cublasFillMode_t uplo, int n, const cuDoubleComplex *A, int lda,
-    const cuDoubleComplex *B, int ldb, const double *W, int *lwork,
-    syevjInfo_t params) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigType_t, cusolverEigMode_t,
-      cublasFillMode_t, int, const cuDoubleComplex *, int,
-      const cuDoubleComplex *, int, const double *, int *, syevjInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZhegvj_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, itype, jobz, uplo, n, A, lda, B, ldb, W, lwork,
-                  params);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSsygvj(
-    cusolverDnHandle_t handle, cusolverEigType_t itype, cusolverEigMode_t jobz,
-    cublasFillMode_t uplo, int n, float *A, int lda, float *B, int ldb,
-    float *W, float *work, int lwork, int *info, syevjInfo_t params) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigType_t, cusolverEigMode_t,
-      cublasFillMode_t, int, float *, int, float *, int, float *, float *, int,
-      int *, syevjInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSsygvj");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, itype, jobz, uplo, n, A, lda, B, ldb, W, work, lwork,
-                  info, params);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDsygvj(
-    cusolverDnHandle_t handle, cusolverEigType_t itype, cusolverEigMode_t jobz,
-    cublasFillMode_t uplo, int n, double *A, int lda, double *B, int ldb,
-    double *W, double *work, int lwork, int *info, syevjInfo_t params) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigType_t, cusolverEigMode_t,
-      cublasFillMode_t, int, double *, int, double *, int, double *, double *,
-      int, int *, syevjInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDsygvj");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, itype, jobz, uplo, n, A, lda, B, ldb, W, work, lwork,
-                  info, params);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnChegvj(
-    cusolverDnHandle_t handle, cusolverEigType_t itype, cusolverEigMode_t jobz,
-    cublasFillMode_t uplo, int n, cuComplex *A, int lda, cuComplex *B, int ldb,
-    float *W, cuComplex *work, int lwork, int *info, syevjInfo_t params) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigType_t, cusolverEigMode_t,
-      cublasFillMode_t, int, cuComplex *, int, cuComplex *, int, float *,
-      cuComplex *, int, int *, syevjInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnChegvj");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, itype, jobz, uplo, n, A, lda, B, ldb, W, work, lwork,
-                  info, params);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZhegvj(
-    cusolverDnHandle_t handle, cusolverEigType_t itype, cusolverEigMode_t jobz,
-    cublasFillMode_t uplo, int n, cuDoubleComplex *A, int lda,
-    cuDoubleComplex *B, int ldb, double *W, cuDoubleComplex *work, int lwork,
-    int *info, syevjInfo_t params) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigType_t, cusolverEigMode_t,
-      cublasFillMode_t, int, cuDoubleComplex *, int, cuDoubleComplex *, int,
-      double *, cuDoubleComplex *, int, int *, syevjInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZhegvj");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, itype, jobz, uplo, n, A, lda, B, ldb, W, work, lwork,
-                  info, params);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnCreateGesvdjInfo(gesvdjInfo_t *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(gesvdjInfo_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnCreateGesvdjInfo");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDestroyGesvdjInfo(gesvdjInfo_t info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(gesvdjInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDestroyGesvdjInfo");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnXgesvdjSetTolerance(gesvdjInfo_t info,
-                                                           double tolerance) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(gesvdjInfo_t, double);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnXgesvdjSetTolerance");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(info, tolerance);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnXgesvdjSetMaxSweeps(gesvdjInfo_t info,
-                                                           int max_sweeps) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(gesvdjInfo_t, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnXgesvdjSetMaxSweeps");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(info, max_sweeps);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnXgesvdjSetSortEig(gesvdjInfo_t info,
-                                                         int sort_svd) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(gesvdjInfo_t, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnXgesvdjSetSortEig");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(info, sort_svd);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnXgesvdjGetResidual(
-    cusolverDnHandle_t handle, gesvdjInfo_t info, double *residual) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t,
-                                                  gesvdjInfo_t, double *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnXgesvdjGetResidual");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, info, residual);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnXgesvdjGetSweeps(
-    cusolverDnHandle_t handle, gesvdjInfo_t info, int *executed_sweeps) {
-  using FuncPtr =
-      cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, gesvdjInfo_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnXgesvdjGetSweeps");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, info, executed_sweeps);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSgesvdjBatched_bufferSize(
-    cusolverDnHandle_t handle, cusolverEigMode_t jobz, int m, int n,
-    const float *A, int lda, const float *S, const float *U, int ldu,
-    const float *V, int ldv, int *lwork, gesvdjInfo_t params, int batchSize) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, int, int, const float *, int,
-      const float *, const float *, int, const float *, int, int *,
-      gesvdjInfo_t, int);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusolverDnSgesvdjBatched_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, m, n, A, lda, S, U, ldu, V, ldv, lwork, params,
-                  batchSize);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDgesvdjBatched_bufferSize(
-    cusolverDnHandle_t handle, cusolverEigMode_t jobz, int m, int n,
-    const double *A, int lda, const double *S, const double *U, int ldu,
-    const double *V, int ldv, int *lwork, gesvdjInfo_t params, int batchSize) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, int, int, const double *, int,
-      const double *, const double *, int, const double *, int, int *,
-      gesvdjInfo_t, int);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusolverDnDgesvdjBatched_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, m, n, A, lda, S, U, ldu, V, ldv, lwork, params,
-                  batchSize);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnCgesvdjBatched_bufferSize(
-    cusolverDnHandle_t handle, cusolverEigMode_t jobz, int m, int n,
-    const cuComplex *A, int lda, const float *S, const cuComplex *U, int ldu,
-    const cuComplex *V, int ldv, int *lwork, gesvdjInfo_t params,
-    int batchSize) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, int, int, const cuComplex *, int,
-      const float *, const cuComplex *, int, const cuComplex *, int, int *,
-      gesvdjInfo_t, int);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusolverDnCgesvdjBatched_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, m, n, A, lda, S, U, ldu, V, ldv, lwork, params,
-                  batchSize);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZgesvdjBatched_bufferSize(
-    cusolverDnHandle_t handle, cusolverEigMode_t jobz, int m, int n,
-    const cuDoubleComplex *A, int lda, const double *S,
-    const cuDoubleComplex *U, int ldu, const cuDoubleComplex *V, int ldv,
-    int *lwork, gesvdjInfo_t params, int batchSize) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, int, int, const cuDoubleComplex *,
-      int, const double *, const cuDoubleComplex *, int,
-      const cuDoubleComplex *, int, int *, gesvdjInfo_t, int);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusolverDnZgesvdjBatched_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, m, n, A, lda, S, U, ldu, V, ldv, lwork, params,
-                  batchSize);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSgesvdjBatched(
-    cusolverDnHandle_t handle, cusolverEigMode_t jobz, int m, int n, float *A,
-    int lda, float *S, float *U, int ldu, float *V, int ldv, float *work,
-    int lwork, int *info, gesvdjInfo_t params, int batchSize) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, int, int, float *, int, float *,
-      float *, int, float *, int, float *, int, int *, gesvdjInfo_t, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSgesvdjBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, m, n, A, lda, S, U, ldu, V, ldv, work, lwork,
-                  info, params, batchSize);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDgesvdjBatched(
-    cusolverDnHandle_t handle, cusolverEigMode_t jobz, int m, int n, double *A,
-    int lda, double *S, double *U, int ldu, double *V, int ldv, double *work,
-    int lwork, int *info, gesvdjInfo_t params, int batchSize) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, int, int, double *, int, double *,
-      double *, int, double *, int, double *, int, int *, gesvdjInfo_t, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDgesvdjBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, m, n, A, lda, S, U, ldu, V, ldv, work, lwork,
-                  info, params, batchSize);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnCgesvdjBatched(
-    cusolverDnHandle_t handle, cusolverEigMode_t jobz, int m, int n,
-    cuComplex *A, int lda, float *S, cuComplex *U, int ldu, cuComplex *V,
-    int ldv, cuComplex *work, int lwork, int *info, gesvdjInfo_t params,
-    int batchSize) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, int, int, cuComplex *, int,
-      float *, cuComplex *, int, cuComplex *, int, cuComplex *, int, int *,
-      gesvdjInfo_t, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnCgesvdjBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, m, n, A, lda, S, U, ldu, V, ldv, work, lwork,
-                  info, params, batchSize);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZgesvdjBatched(
-    cusolverDnHandle_t handle, cusolverEigMode_t jobz, int m, int n,
-    cuDoubleComplex *A, int lda, double *S, cuDoubleComplex *U, int ldu,
-    cuDoubleComplex *V, int ldv, cuDoubleComplex *work, int lwork, int *info,
-    gesvdjInfo_t params, int batchSize) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, int, int, cuDoubleComplex *, int,
-      double *, cuDoubleComplex *, int, cuDoubleComplex *, int,
-      cuDoubleComplex *, int, int *, gesvdjInfo_t, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZgesvdjBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, m, n, A, lda, S, U, ldu, V, ldv, work, lwork,
-                  info, params, batchSize);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSgesvdj_bufferSize(
-    cusolverDnHandle_t handle, cusolverEigMode_t jobz, int econ, int m, int n,
-    const float *A, int lda, const float *S, const float *U, int ldu,
-    const float *V, int ldv, int *lwork, gesvdjInfo_t params) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, int, int, int, const float *, int,
-      const float *, const float *, int, const float *, int, int *,
-      gesvdjInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSgesvdj_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, econ, m, n, A, lda, S, U, ldu, V, ldv, lwork,
-                  params);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDgesvdj_bufferSize(
-    cusolverDnHandle_t handle, cusolverEigMode_t jobz, int econ, int m, int n,
-    const double *A, int lda, const double *S, const double *U, int ldu,
-    const double *V, int ldv, int *lwork, gesvdjInfo_t params) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, int, int, int, const double *, int,
-      const double *, const double *, int, const double *, int, int *,
-      gesvdjInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDgesvdj_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, econ, m, n, A, lda, S, U, ldu, V, ldv, lwork,
-                  params);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnCgesvdj_bufferSize(
-    cusolverDnHandle_t handle, cusolverEigMode_t jobz, int econ, int m, int n,
-    const cuComplex *A, int lda, const float *S, const cuComplex *U, int ldu,
-    const cuComplex *V, int ldv, int *lwork, gesvdjInfo_t params) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, int, int, int, const cuComplex *,
-      int, const float *, const cuComplex *, int, const cuComplex *, int, int *,
-      gesvdjInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnCgesvdj_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, econ, m, n, A, lda, S, U, ldu, V, ldv, lwork,
-                  params);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZgesvdj_bufferSize(
-    cusolverDnHandle_t handle, cusolverEigMode_t jobz, int econ, int m, int n,
-    const cuDoubleComplex *A, int lda, const double *S,
-    const cuDoubleComplex *U, int ldu, const cuDoubleComplex *V, int ldv,
-    int *lwork, gesvdjInfo_t params) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, int, int, int,
-      const cuDoubleComplex *, int, const double *, const cuDoubleComplex *,
-      int, const cuDoubleComplex *, int, int *, gesvdjInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZgesvdj_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, econ, m, n, A, lda, S, U, ldu, V, ldv, lwork,
-                  params);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSgesvdj(
-    cusolverDnHandle_t handle, cusolverEigMode_t jobz, int econ, int m, int n,
-    float *A, int lda, float *S, float *U, int ldu, float *V, int ldv,
-    float *work, int lwork, int *info, gesvdjInfo_t params) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, int, int, int, float *, int,
-      float *, float *, int, float *, int, float *, int, int *, gesvdjInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSgesvdj");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, econ, m, n, A, lda, S, U, ldu, V, ldv, work,
-                  lwork, info, params);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDgesvdj(
-    cusolverDnHandle_t handle, cusolverEigMode_t jobz, int econ, int m, int n,
-    double *A, int lda, double *S, double *U, int ldu, double *V, int ldv,
-    double *work, int lwork, int *info, gesvdjInfo_t params) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, int, int, int, double *, int,
-      double *, double *, int, double *, int, double *, int, int *,
-      gesvdjInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDgesvdj");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, econ, m, n, A, lda, S, U, ldu, V, ldv, work,
-                  lwork, info, params);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnCgesvdj(
-    cusolverDnHandle_t handle, cusolverEigMode_t jobz, int econ, int m, int n,
-    cuComplex *A, int lda, float *S, cuComplex *U, int ldu, cuComplex *V,
-    int ldv, cuComplex *work, int lwork, int *info, gesvdjInfo_t params) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, int, int, int, cuComplex *, int,
-      float *, cuComplex *, int, cuComplex *, int, cuComplex *, int, int *,
-      gesvdjInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnCgesvdj");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, econ, m, n, A, lda, S, U, ldu, V, ldv, work,
-                  lwork, info, params);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZgesvdj(
-    cusolverDnHandle_t handle, cusolverEigMode_t jobz, int econ, int m, int n,
-    cuDoubleComplex *A, int lda, double *S, cuDoubleComplex *U, int ldu,
-    cuDoubleComplex *V, int ldv, cuDoubleComplex *work, int lwork, int *info,
-    gesvdjInfo_t params) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, int, int, int, cuDoubleComplex *,
-      int, double *, cuDoubleComplex *, int, cuDoubleComplex *, int,
-      cuDoubleComplex *, int, int *, gesvdjInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZgesvdj");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, econ, m, n, A, lda, S, U, ldu, V, ldv, work,
-                  lwork, info, params);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSgesvdaStridedBatched_bufferSize(
-    cusolverDnHandle_t handle, cusolverEigMode_t jobz, int rank, int m, int n,
-    const float *d_A, int lda, long long int strideA, const float *d_S,
-    long long int strideS, const float *d_U, int ldu, long long int strideU,
-    const float *d_V, int ldv, long long int strideV, int *lwork,
-    int batchSize) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, int, int, int, const float *, int,
-      long long, const float *, long long, const float *, int, long long,
-      const float *, int, long long, int *, int);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusolverDnSgesvdaStridedBatched_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, rank, m, n, d_A, lda, strideA, d_S, strideS,
-                  d_U, ldu, strideU, d_V, ldv, strideV, lwork, batchSize);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDgesvdaStridedBatched_bufferSize(
-    cusolverDnHandle_t handle, cusolverEigMode_t jobz, int rank, int m, int n,
-    const double *d_A, int lda, long long int strideA, const double *d_S,
-    long long int strideS, const double *d_U, int ldu, long long int strideU,
-    const double *d_V, int ldv, long long int strideV, int *lwork,
-    int batchSize) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, int, int, int, const double *, int,
-      long long, const double *, long long, const double *, int, long long,
-      const double *, int, long long, int *, int);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusolverDnDgesvdaStridedBatched_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, rank, m, n, d_A, lda, strideA, d_S, strideS,
-                  d_U, ldu, strideU, d_V, ldv, strideV, lwork, batchSize);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnCgesvdaStridedBatched_bufferSize(
-    cusolverDnHandle_t handle, cusolverEigMode_t jobz, int rank, int m, int n,
-    const cuComplex *d_A, int lda, long long int strideA, const float *d_S,
-    long long int strideS, const cuComplex *d_U, int ldu, long long int strideU,
-    const cuComplex *d_V, int ldv, long long int strideV, int *lwork,
-    int batchSize) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, int, int, int, const cuComplex *,
-      int, long long, const float *, long long, const cuComplex *, int,
-      long long, const cuComplex *, int, long long, int *, int);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusolverDnCgesvdaStridedBatched_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, rank, m, n, d_A, lda, strideA, d_S, strideS,
-                  d_U, ldu, strideU, d_V, ldv, strideV, lwork, batchSize);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZgesvdaStridedBatched_bufferSize(
-    cusolverDnHandle_t handle, cusolverEigMode_t jobz, int rank, int m, int n,
-    const cuDoubleComplex *d_A, int lda, long long int strideA,
-    const double *d_S, long long int strideS, const cuDoubleComplex *d_U,
-    int ldu, long long int strideU, const cuDoubleComplex *d_V, int ldv,
-    long long int strideV, int *lwork, int batchSize) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, int, int, int,
-      const cuDoubleComplex *, int, long long, const double *, long long,
-      const cuDoubleComplex *, int, long long, const cuDoubleComplex *, int,
-      long long, int *, int);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusolverDnZgesvdaStridedBatched_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, rank, m, n, d_A, lda, strideA, d_S, strideS,
-                  d_U, ldu, strideU, d_V, ldv, strideV, lwork, batchSize);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSgesvdaStridedBatched(
-    cusolverDnHandle_t handle, cusolverEigMode_t jobz, int rank, int m, int n,
-    const float *d_A, int lda, long long int strideA, float *d_S,
-    long long int strideS, float *d_U, int ldu, long long int strideU,
-    float *d_V, int ldv, long long int strideV, float *d_work, int lwork,
-    int *d_info, double *h_R_nrmF, int batchSize) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, int, int, int, const float *, int,
-      long long, float *, long long, float *, int, long long, float *, int,
-      long long, float *, int, int *, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSgesvdaStridedBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, rank, m, n, d_A, lda, strideA, d_S, strideS,
-                  d_U, ldu, strideU, d_V, ldv, strideV, d_work, lwork, d_info,
-                  h_R_nrmF, batchSize);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDgesvdaStridedBatched(
-    cusolverDnHandle_t handle, cusolverEigMode_t jobz, int rank, int m, int n,
-    const double *d_A, int lda, long long int strideA, double *d_S,
-    long long int strideS, double *d_U, int ldu, long long int strideU,
-    double *d_V, int ldv, long long int strideV, double *d_work, int lwork,
-    int *d_info, double *h_R_nrmF, int batchSize) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, int, int, int, const double *, int,
-      long long, double *, long long, double *, int, long long, double *, int,
-      long long, double *, int, int *, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDgesvdaStridedBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, rank, m, n, d_A, lda, strideA, d_S, strideS,
-                  d_U, ldu, strideU, d_V, ldv, strideV, d_work, lwork, d_info,
-                  h_R_nrmF, batchSize);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnCgesvdaStridedBatched(
-    cusolverDnHandle_t handle, cusolverEigMode_t jobz, int rank, int m, int n,
-    const cuComplex *d_A, int lda, long long int strideA, float *d_S,
-    long long int strideS, cuComplex *d_U, int ldu, long long int strideU,
-    cuComplex *d_V, int ldv, long long int strideV, cuComplex *d_work,
-    int lwork, int *d_info, double *h_R_nrmF, int batchSize) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, int, int, int, const cuComplex *,
-      int, long long, float *, long long, cuComplex *, int, long long,
-      cuComplex *, int, long long, cuComplex *, int, int *, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnCgesvdaStridedBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, rank, m, n, d_A, lda, strideA, d_S, strideS,
-                  d_U, ldu, strideU, d_V, ldv, strideV, d_work, lwork, d_info,
-                  h_R_nrmF, batchSize);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZgesvdaStridedBatched(
-    cusolverDnHandle_t handle, cusolverEigMode_t jobz, int rank, int m, int n,
-    const cuDoubleComplex *d_A, int lda, long long int strideA, double *d_S,
-    long long int strideS, cuDoubleComplex *d_U, int ldu, long long int strideU,
-    cuDoubleComplex *d_V, int ldv, long long int strideV,
-    cuDoubleComplex *d_work, int lwork, int *d_info, double *h_R_nrmF,
-    int batchSize) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, int, int, int,
-      const cuDoubleComplex *, int, long long, double *, long long,
-      cuDoubleComplex *, int, long long, cuDoubleComplex *, int, long long,
-      cuDoubleComplex *, int, int *, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZgesvdaStridedBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, rank, m, n, d_A, lda, strideA, d_S, strideS,
-                  d_U, ldu, strideU, d_V, ldv, strideV, d_work, lwork, d_info,
-                  h_R_nrmF, batchSize);
-}
-
-}  // extern "C"
diff --git a/third_party/xla/third_party/tsl/tsl/cuda/cusolver_dense_11_0.inc b/third_party/xla/third_party/tsl/tsl/cuda/cusolver_dense_11_0.inc
deleted file mode 100644
index 89db3b965cfc1a..00000000000000
--- a/third_party/xla/third_party/tsl/tsl/cuda/cusolver_dense_11_0.inc
+++ /dev/null
@@ -1,5149 +0,0 @@
-// Auto-generated, do not edit.
-
-extern "C" {
-
-cusolverStatus_t CUSOLVERAPI cusolverGetProperty(libraryPropertyType type,
-                                                 int *value) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(libraryPropertyType, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverGetProperty");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(type, value);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverGetVersion(int *version) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverGetVersion");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(version);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnCreate(cusolverDnHandle_t *handle) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnCreate");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverSpCreate(cusolverSpHandle_t *handle) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(cusolverSpHandle_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverSpCreate");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDestroy(cusolverDnHandle_t handle) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDestroy");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverSpDestroy(cusolverSpHandle_t handle) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(cusolverSpHandle_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverSpDestroy");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSetStream(cusolverDnHandle_t handle,
-                                                 cudaStream_t streamId) {
-  using FuncPtr =
-      cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSetStream");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, streamId);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverSpSetStream(cusolverSpHandle_t handle,
-                                                 cudaStream_t streamId) {
-  using FuncPtr =
-      cusolverStatus_t(CUSOLVERAPI *)(cusolverSpHandle_t, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverSpSetStream");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, streamId);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnGetStream(cusolverDnHandle_t handle,
-                                                 cudaStream_t *streamId) {
-  using FuncPtr =
-      cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, cudaStream_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnGetStream");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, streamId);
-}
-
-cusolverStatus_t CUSOLVERAPI
-cusolverDnIRSParamsCreate(cusolverDnIRSParams_t *params_ptr) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(cusolverDnIRSParams_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnIRSParamsCreate");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(params_ptr);
-}
-
-cusolverStatus_t CUSOLVERAPI
-cusolverDnIRSParamsDestroy(cusolverDnIRSParams_t params) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(cusolverDnIRSParams_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnIRSParamsDestroy");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(params);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnIRSParamsSetRefinementSolver(
-    cusolverDnIRSParams_t params, cusolverIRSRefinement_t refinement_solver) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(cusolverDnIRSParams_t,
-                                                  cusolverIRSRefinement_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusolverDnIRSParamsSetRefinementSolver");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(params, refinement_solver);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnIRSParamsSetSolverMainPrecision(
-    cusolverDnIRSParams_t params, cusolverPrecType_t solver_main_precision) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(cusolverDnIRSParams_t,
-                                                  cusolverPrecType_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusolverDnIRSParamsSetSolverMainPrecision");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(params, solver_main_precision);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnIRSParamsSetSolverLowestPrecision(
-    cusolverDnIRSParams_t params, cusolverPrecType_t solver_lowest_precision) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(cusolverDnIRSParams_t,
-                                                  cusolverPrecType_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusolverDnIRSParamsSetSolverLowestPrecision");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(params, solver_lowest_precision);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnIRSParamsSetSolverPrecisions(
-    cusolverDnIRSParams_t params, cusolverPrecType_t solver_main_precision,
-    cusolverPrecType_t solver_lowest_precision) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnIRSParams_t, cusolverPrecType_t, cusolverPrecType_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusolverDnIRSParamsSetSolverPrecisions");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(params, solver_main_precision, solver_lowest_precision);
-}
-
-cusolverStatus_t CUSOLVERAPI
-cusolverDnIRSParamsSetTol(cusolverDnIRSParams_t params, double val) {
-  using FuncPtr =
-      cusolverStatus_t(CUSOLVERAPI *)(cusolverDnIRSParams_t, double);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnIRSParamsSetTol");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(params, val);
-}
-
-cusolverStatus_t CUSOLVERAPI
-cusolverDnIRSParamsSetTolInner(cusolverDnIRSParams_t params, double val) {
-  using FuncPtr =
-      cusolverStatus_t(CUSOLVERAPI *)(cusolverDnIRSParams_t, double);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnIRSParamsSetTolInner");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(params, val);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnIRSParamsSetMaxIters(
-    cusolverDnIRSParams_t params, cusolver_int_t maxiters) {
-  using FuncPtr =
-      cusolverStatus_t(CUSOLVERAPI *)(cusolverDnIRSParams_t, cusolver_int_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnIRSParamsSetMaxIters");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(params, maxiters);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnIRSParamsSetMaxItersInner(
-    cusolverDnIRSParams_t params, cusolver_int_t maxiters_inner) {
-  using FuncPtr =
-      cusolverStatus_t(CUSOLVERAPI *)(cusolverDnIRSParams_t, cusolver_int_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusolverDnIRSParamsSetMaxItersInner");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(params, maxiters_inner);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnIRSParamsGetMaxIters(
-    cusolverDnIRSParams_t params, cusolver_int_t *maxiters) {
-  using FuncPtr =
-      cusolverStatus_t(CUSOLVERAPI *)(cusolverDnIRSParams_t, cusolver_int_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnIRSParamsGetMaxIters");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(params, maxiters);
-}
-
-cusolverStatus_t CUSOLVERAPI
-cusolverDnIRSParamsEnableFallback(cusolverDnIRSParams_t params) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(cusolverDnIRSParams_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusolverDnIRSParamsEnableFallback");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(params);
-}
-
-cusolverStatus_t CUSOLVERAPI
-cusolverDnIRSParamsDisableFallback(cusolverDnIRSParams_t params) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(cusolverDnIRSParams_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusolverDnIRSParamsDisableFallback");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(params);
-}
-
-cusolverStatus_t CUSOLVERAPI
-cusolverDnIRSInfosDestroy(cusolverDnIRSInfos_t infos) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(cusolverDnIRSInfos_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnIRSInfosDestroy");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(infos);
-}
-
-cusolverStatus_t CUSOLVERAPI
-cusolverDnIRSInfosCreate(cusolverDnIRSInfos_t *infos_ptr) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(cusolverDnIRSInfos_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnIRSInfosCreate");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(infos_ptr);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnIRSInfosGetNiters(
-    cusolverDnIRSInfos_t infos, cusolver_int_t *niters) {
-  using FuncPtr =
-      cusolverStatus_t(CUSOLVERAPI *)(cusolverDnIRSInfos_t, cusolver_int_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnIRSInfosGetNiters");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(infos, niters);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnIRSInfosGetOuterNiters(
-    cusolverDnIRSInfos_t infos, cusolver_int_t *outer_niters) {
-  using FuncPtr =
-      cusolverStatus_t(CUSOLVERAPI *)(cusolverDnIRSInfos_t, cusolver_int_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusolverDnIRSInfosGetOuterNiters");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(infos, outer_niters);
-}
-
-cusolverStatus_t CUSOLVERAPI
-cusolverDnIRSInfosRequestResidual(cusolverDnIRSInfos_t infos) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(cusolverDnIRSInfos_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusolverDnIRSInfosRequestResidual");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(infos);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnIRSInfosGetResidualHistory(
-    cusolverDnIRSInfos_t infos, void **residual_history) {
-  using FuncPtr =
-      cusolverStatus_t(CUSOLVERAPI *)(cusolverDnIRSInfos_t, void **);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusolverDnIRSInfosGetResidualHistory");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(infos, residual_history);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnIRSInfosGetMaxIters(
-    cusolverDnIRSInfos_t infos, cusolver_int_t *maxiters) {
-  using FuncPtr =
-      cusolverStatus_t(CUSOLVERAPI *)(cusolverDnIRSInfos_t, cusolver_int_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnIRSInfosGetMaxIters");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(infos, maxiters);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZZgesv(
-    cusolverDnHandle_t handle, cusolver_int_t n, cusolver_int_t nrhs,
-    cuDoubleComplex *dA, cusolver_int_t ldda, cusolver_int_t *dipiv,
-    cuDoubleComplex *dB, cusolver_int_t lddb, cuDoubleComplex *dX,
-    cusolver_int_t lddx, void *dWorkspace, size_t lwork_bytes,
-    cusolver_int_t *iter, cusolver_int_t *d_info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, cuDoubleComplex *,
-      cusolver_int_t, cusolver_int_t *, cuDoubleComplex *, cusolver_int_t,
-      cuDoubleComplex *, cusolver_int_t, void *, size_t, cusolver_int_t *,
-      cusolver_int_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZZgesv");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, nrhs, dA, ldda, dipiv, dB, lddb, dX, lddx,
-                  dWorkspace, lwork_bytes, iter, d_info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZCgesv(
-    cusolverDnHandle_t handle, cusolver_int_t n, cusolver_int_t nrhs,
-    cuDoubleComplex *dA, cusolver_int_t ldda, cusolver_int_t *dipiv,
-    cuDoubleComplex *dB, cusolver_int_t lddb, cuDoubleComplex *dX,
-    cusolver_int_t lddx, void *dWorkspace, size_t lwork_bytes,
-    cusolver_int_t *iter, cusolver_int_t *d_info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, cuDoubleComplex *,
-      cusolver_int_t, cusolver_int_t *, cuDoubleComplex *, cusolver_int_t,
-      cuDoubleComplex *, cusolver_int_t, void *, size_t, cusolver_int_t *,
-      cusolver_int_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZCgesv");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, nrhs, dA, ldda, dipiv, dB, lddb, dX, lddx,
-                  dWorkspace, lwork_bytes, iter, d_info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZKgesv(
-    cusolverDnHandle_t handle, cusolver_int_t n, cusolver_int_t nrhs,
-    cuDoubleComplex *dA, cusolver_int_t ldda, cusolver_int_t *dipiv,
-    cuDoubleComplex *dB, cusolver_int_t lddb, cuDoubleComplex *dX,
-    cusolver_int_t lddx, void *dWorkspace, size_t lwork_bytes,
-    cusolver_int_t *iter, cusolver_int_t *d_info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, cuDoubleComplex *,
-      cusolver_int_t, cusolver_int_t *, cuDoubleComplex *, cusolver_int_t,
-      cuDoubleComplex *, cusolver_int_t, void *, size_t, cusolver_int_t *,
-      cusolver_int_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZKgesv");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, nrhs, dA, ldda, dipiv, dB, lddb, dX, lddx,
-                  dWorkspace, lwork_bytes, iter, d_info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZEgesv(
-    cusolverDnHandle_t handle, cusolver_int_t n, cusolver_int_t nrhs,
-    cuDoubleComplex *dA, cusolver_int_t ldda, cusolver_int_t *dipiv,
-    cuDoubleComplex *dB, cusolver_int_t lddb, cuDoubleComplex *dX,
-    cusolver_int_t lddx, void *dWorkspace, size_t lwork_bytes,
-    cusolver_int_t *iter, cusolver_int_t *d_info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, cuDoubleComplex *,
-      cusolver_int_t, cusolver_int_t *, cuDoubleComplex *, cusolver_int_t,
-      cuDoubleComplex *, cusolver_int_t, void *, size_t, cusolver_int_t *,
-      cusolver_int_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZEgesv");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, nrhs, dA, ldda, dipiv, dB, lddb, dX, lddx,
-                  dWorkspace, lwork_bytes, iter, d_info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZYgesv(
-    cusolverDnHandle_t handle, cusolver_int_t n, cusolver_int_t nrhs,
-    cuDoubleComplex *dA, cusolver_int_t ldda, cusolver_int_t *dipiv,
-    cuDoubleComplex *dB, cusolver_int_t lddb, cuDoubleComplex *dX,
-    cusolver_int_t lddx, void *dWorkspace, size_t lwork_bytes,
-    cusolver_int_t *iter, cusolver_int_t *d_info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, cuDoubleComplex *,
-      cusolver_int_t, cusolver_int_t *, cuDoubleComplex *, cusolver_int_t,
-      cuDoubleComplex *, cusolver_int_t, void *, size_t, cusolver_int_t *,
-      cusolver_int_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZYgesv");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, nrhs, dA, ldda, dipiv, dB, lddb, dX, lddx,
-                  dWorkspace, lwork_bytes, iter, d_info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnCCgesv(
-    cusolverDnHandle_t handle, cusolver_int_t n, cusolver_int_t nrhs,
-    cuComplex *dA, cusolver_int_t ldda, cusolver_int_t *dipiv, cuComplex *dB,
-    cusolver_int_t lddb, cuComplex *dX, cusolver_int_t lddx, void *dWorkspace,
-    size_t lwork_bytes, cusolver_int_t *iter, cusolver_int_t *d_info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, cuComplex *,
-      cusolver_int_t, cusolver_int_t *, cuComplex *, cusolver_int_t,
-      cuComplex *, cusolver_int_t, void *, size_t, cusolver_int_t *,
-      cusolver_int_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnCCgesv");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, nrhs, dA, ldda, dipiv, dB, lddb, dX, lddx,
-                  dWorkspace, lwork_bytes, iter, d_info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnCEgesv(
-    cusolverDnHandle_t handle, cusolver_int_t n, cusolver_int_t nrhs,
-    cuComplex *dA, cusolver_int_t ldda, cusolver_int_t *dipiv, cuComplex *dB,
-    cusolver_int_t lddb, cuComplex *dX, cusolver_int_t lddx, void *dWorkspace,
-    size_t lwork_bytes, cusolver_int_t *iter, cusolver_int_t *d_info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, cuComplex *,
-      cusolver_int_t, cusolver_int_t *, cuComplex *, cusolver_int_t,
-      cuComplex *, cusolver_int_t, void *, size_t, cusolver_int_t *,
-      cusolver_int_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnCEgesv");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, nrhs, dA, ldda, dipiv, dB, lddb, dX, lddx,
-                  dWorkspace, lwork_bytes, iter, d_info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnCKgesv(
-    cusolverDnHandle_t handle, cusolver_int_t n, cusolver_int_t nrhs,
-    cuComplex *dA, cusolver_int_t ldda, cusolver_int_t *dipiv, cuComplex *dB,
-    cusolver_int_t lddb, cuComplex *dX, cusolver_int_t lddx, void *dWorkspace,
-    size_t lwork_bytes, cusolver_int_t *iter, cusolver_int_t *d_info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, cuComplex *,
-      cusolver_int_t, cusolver_int_t *, cuComplex *, cusolver_int_t,
-      cuComplex *, cusolver_int_t, void *, size_t, cusolver_int_t *,
-      cusolver_int_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnCKgesv");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, nrhs, dA, ldda, dipiv, dB, lddb, dX, lddx,
-                  dWorkspace, lwork_bytes, iter, d_info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnCYgesv(
-    cusolverDnHandle_t handle, cusolver_int_t n, cusolver_int_t nrhs,
-    cuComplex *dA, cusolver_int_t ldda, cusolver_int_t *dipiv, cuComplex *dB,
-    cusolver_int_t lddb, cuComplex *dX, cusolver_int_t lddx, void *dWorkspace,
-    size_t lwork_bytes, cusolver_int_t *iter, cusolver_int_t *d_info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, cuComplex *,
-      cusolver_int_t, cusolver_int_t *, cuComplex *, cusolver_int_t,
-      cuComplex *, cusolver_int_t, void *, size_t, cusolver_int_t *,
-      cusolver_int_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnCYgesv");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, nrhs, dA, ldda, dipiv, dB, lddb, dX, lddx,
-                  dWorkspace, lwork_bytes, iter, d_info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDDgesv(
-    cusolverDnHandle_t handle, cusolver_int_t n, cusolver_int_t nrhs,
-    double *dA, cusolver_int_t ldda, cusolver_int_t *dipiv, double *dB,
-    cusolver_int_t lddb, double *dX, cusolver_int_t lddx, void *dWorkspace,
-    size_t lwork_bytes, cusolver_int_t *iter, cusolver_int_t *d_info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, double *,
-      cusolver_int_t, cusolver_int_t *, double *, cusolver_int_t, double *,
-      cusolver_int_t, void *, size_t, cusolver_int_t *, cusolver_int_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDDgesv");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, nrhs, dA, ldda, dipiv, dB, lddb, dX, lddx,
-                  dWorkspace, lwork_bytes, iter, d_info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDSgesv(
-    cusolverDnHandle_t handle, cusolver_int_t n, cusolver_int_t nrhs,
-    double *dA, cusolver_int_t ldda, cusolver_int_t *dipiv, double *dB,
-    cusolver_int_t lddb, double *dX, cusolver_int_t lddx, void *dWorkspace,
-    size_t lwork_bytes, cusolver_int_t *iter, cusolver_int_t *d_info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, double *,
-      cusolver_int_t, cusolver_int_t *, double *, cusolver_int_t, double *,
-      cusolver_int_t, void *, size_t, cusolver_int_t *, cusolver_int_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDSgesv");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, nrhs, dA, ldda, dipiv, dB, lddb, dX, lddx,
-                  dWorkspace, lwork_bytes, iter, d_info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDHgesv(
-    cusolverDnHandle_t handle, cusolver_int_t n, cusolver_int_t nrhs,
-    double *dA, cusolver_int_t ldda, cusolver_int_t *dipiv, double *dB,
-    cusolver_int_t lddb, double *dX, cusolver_int_t lddx, void *dWorkspace,
-    size_t lwork_bytes, cusolver_int_t *iter, cusolver_int_t *d_info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, double *,
-      cusolver_int_t, cusolver_int_t *, double *, cusolver_int_t, double *,
-      cusolver_int_t, void *, size_t, cusolver_int_t *, cusolver_int_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDHgesv");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, nrhs, dA, ldda, dipiv, dB, lddb, dX, lddx,
-                  dWorkspace, lwork_bytes, iter, d_info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDBgesv(
-    cusolverDnHandle_t handle, cusolver_int_t n, cusolver_int_t nrhs,
-    double *dA, cusolver_int_t ldda, cusolver_int_t *dipiv, double *dB,
-    cusolver_int_t lddb, double *dX, cusolver_int_t lddx, void *dWorkspace,
-    size_t lwork_bytes, cusolver_int_t *iter, cusolver_int_t *d_info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, double *,
-      cusolver_int_t, cusolver_int_t *, double *, cusolver_int_t, double *,
-      cusolver_int_t, void *, size_t, cusolver_int_t *, cusolver_int_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDBgesv");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, nrhs, dA, ldda, dipiv, dB, lddb, dX, lddx,
-                  dWorkspace, lwork_bytes, iter, d_info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDXgesv(
-    cusolverDnHandle_t handle, cusolver_int_t n, cusolver_int_t nrhs,
-    double *dA, cusolver_int_t ldda, cusolver_int_t *dipiv, double *dB,
-    cusolver_int_t lddb, double *dX, cusolver_int_t lddx, void *dWorkspace,
-    size_t lwork_bytes, cusolver_int_t *iter, cusolver_int_t *d_info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, double *,
-      cusolver_int_t, cusolver_int_t *, double *, cusolver_int_t, double *,
-      cusolver_int_t, void *, size_t, cusolver_int_t *, cusolver_int_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDXgesv");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, nrhs, dA, ldda, dipiv, dB, lddb, dX, lddx,
-                  dWorkspace, lwork_bytes, iter, d_info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSSgesv(
-    cusolverDnHandle_t handle, cusolver_int_t n, cusolver_int_t nrhs, float *dA,
-    cusolver_int_t ldda, cusolver_int_t *dipiv, float *dB, cusolver_int_t lddb,
-    float *dX, cusolver_int_t lddx, void *dWorkspace, size_t lwork_bytes,
-    cusolver_int_t *iter, cusolver_int_t *d_info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, float *,
-      cusolver_int_t, cusolver_int_t *, float *, cusolver_int_t, float *,
-      cusolver_int_t, void *, size_t, cusolver_int_t *, cusolver_int_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSSgesv");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, nrhs, dA, ldda, dipiv, dB, lddb, dX, lddx,
-                  dWorkspace, lwork_bytes, iter, d_info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSHgesv(
-    cusolverDnHandle_t handle, cusolver_int_t n, cusolver_int_t nrhs, float *dA,
-    cusolver_int_t ldda, cusolver_int_t *dipiv, float *dB, cusolver_int_t lddb,
-    float *dX, cusolver_int_t lddx, void *dWorkspace, size_t lwork_bytes,
-    cusolver_int_t *iter, cusolver_int_t *d_info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, float *,
-      cusolver_int_t, cusolver_int_t *, float *, cusolver_int_t, float *,
-      cusolver_int_t, void *, size_t, cusolver_int_t *, cusolver_int_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSHgesv");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, nrhs, dA, ldda, dipiv, dB, lddb, dX, lddx,
-                  dWorkspace, lwork_bytes, iter, d_info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSBgesv(
-    cusolverDnHandle_t handle, cusolver_int_t n, cusolver_int_t nrhs, float *dA,
-    cusolver_int_t ldda, cusolver_int_t *dipiv, float *dB, cusolver_int_t lddb,
-    float *dX, cusolver_int_t lddx, void *dWorkspace, size_t lwork_bytes,
-    cusolver_int_t *iter, cusolver_int_t *d_info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, float *,
-      cusolver_int_t, cusolver_int_t *, float *, cusolver_int_t, float *,
-      cusolver_int_t, void *, size_t, cusolver_int_t *, cusolver_int_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSBgesv");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, nrhs, dA, ldda, dipiv, dB, lddb, dX, lddx,
-                  dWorkspace, lwork_bytes, iter, d_info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSXgesv(
-    cusolverDnHandle_t handle, cusolver_int_t n, cusolver_int_t nrhs, float *dA,
-    cusolver_int_t ldda, cusolver_int_t *dipiv, float *dB, cusolver_int_t lddb,
-    float *dX, cusolver_int_t lddx, void *dWorkspace, size_t lwork_bytes,
-    cusolver_int_t *iter, cusolver_int_t *d_info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, float *,
-      cusolver_int_t, cusolver_int_t *, float *, cusolver_int_t, float *,
-      cusolver_int_t, void *, size_t, cusolver_int_t *, cusolver_int_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSXgesv");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, nrhs, dA, ldda, dipiv, dB, lddb, dX, lddx,
-                  dWorkspace, lwork_bytes, iter, d_info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZZgesv_bufferSize(
-    cusolverDnHandle_t handle, cusolver_int_t n, cusolver_int_t nrhs,
-    cuDoubleComplex *dA, cusolver_int_t ldda, cusolver_int_t *dipiv,
-    cuDoubleComplex *dB, cusolver_int_t lddb, cuDoubleComplex *dX,
-    cusolver_int_t lddx, void *dWorkspace, size_t *lwork_bytes) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, cuDoubleComplex *,
-      cusolver_int_t, cusolver_int_t *, cuDoubleComplex *, cusolver_int_t,
-      cuDoubleComplex *, cusolver_int_t, void *, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZZgesv_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, nrhs, dA, ldda, dipiv, dB, lddb, dX, lddx,
-                  dWorkspace, lwork_bytes);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZCgesv_bufferSize(
-    cusolverDnHandle_t handle, cusolver_int_t n, cusolver_int_t nrhs,
-    cuDoubleComplex *dA, cusolver_int_t ldda, cusolver_int_t *dipiv,
-    cuDoubleComplex *dB, cusolver_int_t lddb, cuDoubleComplex *dX,
-    cusolver_int_t lddx, void *dWorkspace, size_t *lwork_bytes) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, cuDoubleComplex *,
-      cusolver_int_t, cusolver_int_t *, cuDoubleComplex *, cusolver_int_t,
-      cuDoubleComplex *, cusolver_int_t, void *, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZCgesv_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, nrhs, dA, ldda, dipiv, dB, lddb, dX, lddx,
-                  dWorkspace, lwork_bytes);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZKgesv_bufferSize(
-    cusolverDnHandle_t handle, cusolver_int_t n, cusolver_int_t nrhs,
-    cuDoubleComplex *dA, cusolver_int_t ldda, cusolver_int_t *dipiv,
-    cuDoubleComplex *dB, cusolver_int_t lddb, cuDoubleComplex *dX,
-    cusolver_int_t lddx, void *dWorkspace, size_t *lwork_bytes) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, cuDoubleComplex *,
-      cusolver_int_t, cusolver_int_t *, cuDoubleComplex *, cusolver_int_t,
-      cuDoubleComplex *, cusolver_int_t, void *, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZKgesv_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, nrhs, dA, ldda, dipiv, dB, lddb, dX, lddx,
-                  dWorkspace, lwork_bytes);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZEgesv_bufferSize(
-    cusolverDnHandle_t handle, cusolver_int_t n, cusolver_int_t nrhs,
-    cuDoubleComplex *dA, cusolver_int_t ldda, cusolver_int_t *dipiv,
-    cuDoubleComplex *dB, cusolver_int_t lddb, cuDoubleComplex *dX,
-    cusolver_int_t lddx, void *dWorkspace, size_t *lwork_bytes) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, cuDoubleComplex *,
-      cusolver_int_t, cusolver_int_t *, cuDoubleComplex *, cusolver_int_t,
-      cuDoubleComplex *, cusolver_int_t, void *, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZEgesv_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, nrhs, dA, ldda, dipiv, dB, lddb, dX, lddx,
-                  dWorkspace, lwork_bytes);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZYgesv_bufferSize(
-    cusolverDnHandle_t handle, cusolver_int_t n, cusolver_int_t nrhs,
-    cuDoubleComplex *dA, cusolver_int_t ldda, cusolver_int_t *dipiv,
-    cuDoubleComplex *dB, cusolver_int_t lddb, cuDoubleComplex *dX,
-    cusolver_int_t lddx, void *dWorkspace, size_t *lwork_bytes) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, cuDoubleComplex *,
-      cusolver_int_t, cusolver_int_t *, cuDoubleComplex *, cusolver_int_t,
-      cuDoubleComplex *, cusolver_int_t, void *, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZYgesv_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, nrhs, dA, ldda, dipiv, dB, lddb, dX, lddx,
-                  dWorkspace, lwork_bytes);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnCCgesv_bufferSize(
-    cusolverDnHandle_t handle, cusolver_int_t n, cusolver_int_t nrhs,
-    cuComplex *dA, cusolver_int_t ldda, cusolver_int_t *dipiv, cuComplex *dB,
-    cusolver_int_t lddb, cuComplex *dX, cusolver_int_t lddx, void *dWorkspace,
-    size_t *lwork_bytes) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, cuComplex *,
-      cusolver_int_t, cusolver_int_t *, cuComplex *, cusolver_int_t,
-      cuComplex *, cusolver_int_t, void *, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnCCgesv_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, nrhs, dA, ldda, dipiv, dB, lddb, dX, lddx,
-                  dWorkspace, lwork_bytes);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnCKgesv_bufferSize(
-    cusolverDnHandle_t handle, cusolver_int_t n, cusolver_int_t nrhs,
-    cuComplex *dA, cusolver_int_t ldda, cusolver_int_t *dipiv, cuComplex *dB,
-    cusolver_int_t lddb, cuComplex *dX, cusolver_int_t lddx, void *dWorkspace,
-    size_t *lwork_bytes) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, cuComplex *,
-      cusolver_int_t, cusolver_int_t *, cuComplex *, cusolver_int_t,
-      cuComplex *, cusolver_int_t, void *, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnCKgesv_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, nrhs, dA, ldda, dipiv, dB, lddb, dX, lddx,
-                  dWorkspace, lwork_bytes);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnCEgesv_bufferSize(
-    cusolverDnHandle_t handle, cusolver_int_t n, cusolver_int_t nrhs,
-    cuComplex *dA, cusolver_int_t ldda, cusolver_int_t *dipiv, cuComplex *dB,
-    cusolver_int_t lddb, cuComplex *dX, cusolver_int_t lddx, void *dWorkspace,
-    size_t *lwork_bytes) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, cuComplex *,
-      cusolver_int_t, cusolver_int_t *, cuComplex *, cusolver_int_t,
-      cuComplex *, cusolver_int_t, void *, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnCEgesv_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, nrhs, dA, ldda, dipiv, dB, lddb, dX, lddx,
-                  dWorkspace, lwork_bytes);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnCYgesv_bufferSize(
-    cusolverDnHandle_t handle, cusolver_int_t n, cusolver_int_t nrhs,
-    cuComplex *dA, cusolver_int_t ldda, cusolver_int_t *dipiv, cuComplex *dB,
-    cusolver_int_t lddb, cuComplex *dX, cusolver_int_t lddx, void *dWorkspace,
-    size_t *lwork_bytes) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, cuComplex *,
-      cusolver_int_t, cusolver_int_t *, cuComplex *, cusolver_int_t,
-      cuComplex *, cusolver_int_t, void *, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnCYgesv_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, nrhs, dA, ldda, dipiv, dB, lddb, dX, lddx,
-                  dWorkspace, lwork_bytes);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDDgesv_bufferSize(
-    cusolverDnHandle_t handle, cusolver_int_t n, cusolver_int_t nrhs,
-    double *dA, cusolver_int_t ldda, cusolver_int_t *dipiv, double *dB,
-    cusolver_int_t lddb, double *dX, cusolver_int_t lddx, void *dWorkspace,
-    size_t *lwork_bytes) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, double *,
-      cusolver_int_t, cusolver_int_t *, double *, cusolver_int_t, double *,
-      cusolver_int_t, void *, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDDgesv_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, nrhs, dA, ldda, dipiv, dB, lddb, dX, lddx,
-                  dWorkspace, lwork_bytes);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDSgesv_bufferSize(
-    cusolverDnHandle_t handle, cusolver_int_t n, cusolver_int_t nrhs,
-    double *dA, cusolver_int_t ldda, cusolver_int_t *dipiv, double *dB,
-    cusolver_int_t lddb, double *dX, cusolver_int_t lddx, void *dWorkspace,
-    size_t *lwork_bytes) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, double *,
-      cusolver_int_t, cusolver_int_t *, double *, cusolver_int_t, double *,
-      cusolver_int_t, void *, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDSgesv_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, nrhs, dA, ldda, dipiv, dB, lddb, dX, lddx,
-                  dWorkspace, lwork_bytes);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDHgesv_bufferSize(
-    cusolverDnHandle_t handle, cusolver_int_t n, cusolver_int_t nrhs,
-    double *dA, cusolver_int_t ldda, cusolver_int_t *dipiv, double *dB,
-    cusolver_int_t lddb, double *dX, cusolver_int_t lddx, void *dWorkspace,
-    size_t *lwork_bytes) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, double *,
-      cusolver_int_t, cusolver_int_t *, double *, cusolver_int_t, double *,
-      cusolver_int_t, void *, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDHgesv_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, nrhs, dA, ldda, dipiv, dB, lddb, dX, lddx,
-                  dWorkspace, lwork_bytes);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDBgesv_bufferSize(
-    cusolverDnHandle_t handle, cusolver_int_t n, cusolver_int_t nrhs,
-    double *dA, cusolver_int_t ldda, cusolver_int_t *dipiv, double *dB,
-    cusolver_int_t lddb, double *dX, cusolver_int_t lddx, void *dWorkspace,
-    size_t *lwork_bytes) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, double *,
-      cusolver_int_t, cusolver_int_t *, double *, cusolver_int_t, double *,
-      cusolver_int_t, void *, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDBgesv_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, nrhs, dA, ldda, dipiv, dB, lddb, dX, lddx,
-                  dWorkspace, lwork_bytes);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDXgesv_bufferSize(
-    cusolverDnHandle_t handle, cusolver_int_t n, cusolver_int_t nrhs,
-    double *dA, cusolver_int_t ldda, cusolver_int_t *dipiv, double *dB,
-    cusolver_int_t lddb, double *dX, cusolver_int_t lddx, void *dWorkspace,
-    size_t *lwork_bytes) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, double *,
-      cusolver_int_t, cusolver_int_t *, double *, cusolver_int_t, double *,
-      cusolver_int_t, void *, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDXgesv_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, nrhs, dA, ldda, dipiv, dB, lddb, dX, lddx,
-                  dWorkspace, lwork_bytes);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSSgesv_bufferSize(
-    cusolverDnHandle_t handle, cusolver_int_t n, cusolver_int_t nrhs, float *dA,
-    cusolver_int_t ldda, cusolver_int_t *dipiv, float *dB, cusolver_int_t lddb,
-    float *dX, cusolver_int_t lddx, void *dWorkspace, size_t *lwork_bytes) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, float *,
-      cusolver_int_t, cusolver_int_t *, float *, cusolver_int_t, float *,
-      cusolver_int_t, void *, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSSgesv_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, nrhs, dA, ldda, dipiv, dB, lddb, dX, lddx,
-                  dWorkspace, lwork_bytes);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSHgesv_bufferSize(
-    cusolverDnHandle_t handle, cusolver_int_t n, cusolver_int_t nrhs, float *dA,
-    cusolver_int_t ldda, cusolver_int_t *dipiv, float *dB, cusolver_int_t lddb,
-    float *dX, cusolver_int_t lddx, void *dWorkspace, size_t *lwork_bytes) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, float *,
-      cusolver_int_t, cusolver_int_t *, float *, cusolver_int_t, float *,
-      cusolver_int_t, void *, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSHgesv_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, nrhs, dA, ldda, dipiv, dB, lddb, dX, lddx,
-                  dWorkspace, lwork_bytes);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSBgesv_bufferSize(
-    cusolverDnHandle_t handle, cusolver_int_t n, cusolver_int_t nrhs, float *dA,
-    cusolver_int_t ldda, cusolver_int_t *dipiv, float *dB, cusolver_int_t lddb,
-    float *dX, cusolver_int_t lddx, void *dWorkspace, size_t *lwork_bytes) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, float *,
-      cusolver_int_t, cusolver_int_t *, float *, cusolver_int_t, float *,
-      cusolver_int_t, void *, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSBgesv_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, nrhs, dA, ldda, dipiv, dB, lddb, dX, lddx,
-                  dWorkspace, lwork_bytes);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSXgesv_bufferSize(
-    cusolverDnHandle_t handle, cusolver_int_t n, cusolver_int_t nrhs, float *dA,
-    cusolver_int_t ldda, cusolver_int_t *dipiv, float *dB, cusolver_int_t lddb,
-    float *dX, cusolver_int_t lddx, void *dWorkspace, size_t *lwork_bytes) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, float *,
-      cusolver_int_t, cusolver_int_t *, float *, cusolver_int_t, float *,
-      cusolver_int_t, void *, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSXgesv_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, nrhs, dA, ldda, dipiv, dB, lddb, dX, lddx,
-                  dWorkspace, lwork_bytes);
-}
-
-cusolverStatus_t CUSOLVERAPI
-cusolverDnZZgels(cusolverDnHandle_t handle, cusolver_int_t m, cusolver_int_t n,
-                 cusolver_int_t nrhs, cuDoubleComplex *dA, cusolver_int_t ldda,
-                 cuDoubleComplex *dB, cusolver_int_t lddb, cuDoubleComplex *dX,
-                 cusolver_int_t lddx, void *dWorkspace, size_t lwork_bytes,
-                 cusolver_int_t *iter, cusolver_int_t *d_info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, cusolver_int_t,
-      cuDoubleComplex *, cusolver_int_t, cuDoubleComplex *, cusolver_int_t,
-      cuDoubleComplex *, cusolver_int_t, void *, size_t, cusolver_int_t *,
-      cusolver_int_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZZgels");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nrhs, dA, ldda, dB, lddb, dX, lddx, dWorkspace,
-                  lwork_bytes, iter, d_info);
-}
-
-cusolverStatus_t CUSOLVERAPI
-cusolverDnZCgels(cusolverDnHandle_t handle, cusolver_int_t m, cusolver_int_t n,
-                 cusolver_int_t nrhs, cuDoubleComplex *dA, cusolver_int_t ldda,
-                 cuDoubleComplex *dB, cusolver_int_t lddb, cuDoubleComplex *dX,
-                 cusolver_int_t lddx, void *dWorkspace, size_t lwork_bytes,
-                 cusolver_int_t *iter, cusolver_int_t *d_info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, cusolver_int_t,
-      cuDoubleComplex *, cusolver_int_t, cuDoubleComplex *, cusolver_int_t,
-      cuDoubleComplex *, cusolver_int_t, void *, size_t, cusolver_int_t *,
-      cusolver_int_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZCgels");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nrhs, dA, ldda, dB, lddb, dX, lddx, dWorkspace,
-                  lwork_bytes, iter, d_info);
-}
-
-cusolverStatus_t CUSOLVERAPI
-cusolverDnZKgels(cusolverDnHandle_t handle, cusolver_int_t m, cusolver_int_t n,
-                 cusolver_int_t nrhs, cuDoubleComplex *dA, cusolver_int_t ldda,
-                 cuDoubleComplex *dB, cusolver_int_t lddb, cuDoubleComplex *dX,
-                 cusolver_int_t lddx, void *dWorkspace, size_t lwork_bytes,
-                 cusolver_int_t *iter, cusolver_int_t *d_info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, cusolver_int_t,
-      cuDoubleComplex *, cusolver_int_t, cuDoubleComplex *, cusolver_int_t,
-      cuDoubleComplex *, cusolver_int_t, void *, size_t, cusolver_int_t *,
-      cusolver_int_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZKgels");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nrhs, dA, ldda, dB, lddb, dX, lddx, dWorkspace,
-                  lwork_bytes, iter, d_info);
-}
-
-cusolverStatus_t CUSOLVERAPI
-cusolverDnZEgels(cusolverDnHandle_t handle, cusolver_int_t m, cusolver_int_t n,
-                 cusolver_int_t nrhs, cuDoubleComplex *dA, cusolver_int_t ldda,
-                 cuDoubleComplex *dB, cusolver_int_t lddb, cuDoubleComplex *dX,
-                 cusolver_int_t lddx, void *dWorkspace, size_t lwork_bytes,
-                 cusolver_int_t *iter, cusolver_int_t *d_info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, cusolver_int_t,
-      cuDoubleComplex *, cusolver_int_t, cuDoubleComplex *, cusolver_int_t,
-      cuDoubleComplex *, cusolver_int_t, void *, size_t, cusolver_int_t *,
-      cusolver_int_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZEgels");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nrhs, dA, ldda, dB, lddb, dX, lddx, dWorkspace,
-                  lwork_bytes, iter, d_info);
-}
-
-cusolverStatus_t CUSOLVERAPI
-cusolverDnZYgels(cusolverDnHandle_t handle, cusolver_int_t m, cusolver_int_t n,
-                 cusolver_int_t nrhs, cuDoubleComplex *dA, cusolver_int_t ldda,
-                 cuDoubleComplex *dB, cusolver_int_t lddb, cuDoubleComplex *dX,
-                 cusolver_int_t lddx, void *dWorkspace, size_t lwork_bytes,
-                 cusolver_int_t *iter, cusolver_int_t *d_info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, cusolver_int_t,
-      cuDoubleComplex *, cusolver_int_t, cuDoubleComplex *, cusolver_int_t,
-      cuDoubleComplex *, cusolver_int_t, void *, size_t, cusolver_int_t *,
-      cusolver_int_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZYgels");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nrhs, dA, ldda, dB, lddb, dX, lddx, dWorkspace,
-                  lwork_bytes, iter, d_info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnCCgels(
-    cusolverDnHandle_t handle, cusolver_int_t m, cusolver_int_t n,
-    cusolver_int_t nrhs, cuComplex *dA, cusolver_int_t ldda, cuComplex *dB,
-    cusolver_int_t lddb, cuComplex *dX, cusolver_int_t lddx, void *dWorkspace,
-    size_t lwork_bytes, cusolver_int_t *iter, cusolver_int_t *d_info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, cusolver_int_t,
-      cuComplex *, cusolver_int_t, cuComplex *, cusolver_int_t, cuComplex *,
-      cusolver_int_t, void *, size_t, cusolver_int_t *, cusolver_int_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnCCgels");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nrhs, dA, ldda, dB, lddb, dX, lddx, dWorkspace,
-                  lwork_bytes, iter, d_info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnCKgels(
-    cusolverDnHandle_t handle, cusolver_int_t m, cusolver_int_t n,
-    cusolver_int_t nrhs, cuComplex *dA, cusolver_int_t ldda, cuComplex *dB,
-    cusolver_int_t lddb, cuComplex *dX, cusolver_int_t lddx, void *dWorkspace,
-    size_t lwork_bytes, cusolver_int_t *iter, cusolver_int_t *d_info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, cusolver_int_t,
-      cuComplex *, cusolver_int_t, cuComplex *, cusolver_int_t, cuComplex *,
-      cusolver_int_t, void *, size_t, cusolver_int_t *, cusolver_int_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnCKgels");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nrhs, dA, ldda, dB, lddb, dX, lddx, dWorkspace,
-                  lwork_bytes, iter, d_info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnCEgels(
-    cusolverDnHandle_t handle, cusolver_int_t m, cusolver_int_t n,
-    cusolver_int_t nrhs, cuComplex *dA, cusolver_int_t ldda, cuComplex *dB,
-    cusolver_int_t lddb, cuComplex *dX, cusolver_int_t lddx, void *dWorkspace,
-    size_t lwork_bytes, cusolver_int_t *iter, cusolver_int_t *d_info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, cusolver_int_t,
-      cuComplex *, cusolver_int_t, cuComplex *, cusolver_int_t, cuComplex *,
-      cusolver_int_t, void *, size_t, cusolver_int_t *, cusolver_int_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnCEgels");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nrhs, dA, ldda, dB, lddb, dX, lddx, dWorkspace,
-                  lwork_bytes, iter, d_info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnCYgels(
-    cusolverDnHandle_t handle, cusolver_int_t m, cusolver_int_t n,
-    cusolver_int_t nrhs, cuComplex *dA, cusolver_int_t ldda, cuComplex *dB,
-    cusolver_int_t lddb, cuComplex *dX, cusolver_int_t lddx, void *dWorkspace,
-    size_t lwork_bytes, cusolver_int_t *iter, cusolver_int_t *d_info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, cusolver_int_t,
-      cuComplex *, cusolver_int_t, cuComplex *, cusolver_int_t, cuComplex *,
-      cusolver_int_t, void *, size_t, cusolver_int_t *, cusolver_int_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnCYgels");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nrhs, dA, ldda, dB, lddb, dX, lddx, dWorkspace,
-                  lwork_bytes, iter, d_info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDDgels(
-    cusolverDnHandle_t handle, cusolver_int_t m, cusolver_int_t n,
-    cusolver_int_t nrhs, double *dA, cusolver_int_t ldda, double *dB,
-    cusolver_int_t lddb, double *dX, cusolver_int_t lddx, void *dWorkspace,
-    size_t lwork_bytes, cusolver_int_t *iter, cusolver_int_t *d_info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, cusolver_int_t,
-      double *, cusolver_int_t, double *, cusolver_int_t, double *,
-      cusolver_int_t, void *, size_t, cusolver_int_t *, cusolver_int_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDDgels");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nrhs, dA, ldda, dB, lddb, dX, lddx, dWorkspace,
-                  lwork_bytes, iter, d_info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDSgels(
-    cusolverDnHandle_t handle, cusolver_int_t m, cusolver_int_t n,
-    cusolver_int_t nrhs, double *dA, cusolver_int_t ldda, double *dB,
-    cusolver_int_t lddb, double *dX, cusolver_int_t lddx, void *dWorkspace,
-    size_t lwork_bytes, cusolver_int_t *iter, cusolver_int_t *d_info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, cusolver_int_t,
-      double *, cusolver_int_t, double *, cusolver_int_t, double *,
-      cusolver_int_t, void *, size_t, cusolver_int_t *, cusolver_int_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDSgels");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nrhs, dA, ldda, dB, lddb, dX, lddx, dWorkspace,
-                  lwork_bytes, iter, d_info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDHgels(
-    cusolverDnHandle_t handle, cusolver_int_t m, cusolver_int_t n,
-    cusolver_int_t nrhs, double *dA, cusolver_int_t ldda, double *dB,
-    cusolver_int_t lddb, double *dX, cusolver_int_t lddx, void *dWorkspace,
-    size_t lwork_bytes, cusolver_int_t *iter, cusolver_int_t *d_info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, cusolver_int_t,
-      double *, cusolver_int_t, double *, cusolver_int_t, double *,
-      cusolver_int_t, void *, size_t, cusolver_int_t *, cusolver_int_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDHgels");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nrhs, dA, ldda, dB, lddb, dX, lddx, dWorkspace,
-                  lwork_bytes, iter, d_info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDBgels(
-    cusolverDnHandle_t handle, cusolver_int_t m, cusolver_int_t n,
-    cusolver_int_t nrhs, double *dA, cusolver_int_t ldda, double *dB,
-    cusolver_int_t lddb, double *dX, cusolver_int_t lddx, void *dWorkspace,
-    size_t lwork_bytes, cusolver_int_t *iter, cusolver_int_t *d_info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, cusolver_int_t,
-      double *, cusolver_int_t, double *, cusolver_int_t, double *,
-      cusolver_int_t, void *, size_t, cusolver_int_t *, cusolver_int_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDBgels");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nrhs, dA, ldda, dB, lddb, dX, lddx, dWorkspace,
-                  lwork_bytes, iter, d_info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDXgels(
-    cusolverDnHandle_t handle, cusolver_int_t m, cusolver_int_t n,
-    cusolver_int_t nrhs, double *dA, cusolver_int_t ldda, double *dB,
-    cusolver_int_t lddb, double *dX, cusolver_int_t lddx, void *dWorkspace,
-    size_t lwork_bytes, cusolver_int_t *iter, cusolver_int_t *d_info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, cusolver_int_t,
-      double *, cusolver_int_t, double *, cusolver_int_t, double *,
-      cusolver_int_t, void *, size_t, cusolver_int_t *, cusolver_int_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDXgels");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nrhs, dA, ldda, dB, lddb, dX, lddx, dWorkspace,
-                  lwork_bytes, iter, d_info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSSgels(
-    cusolverDnHandle_t handle, cusolver_int_t m, cusolver_int_t n,
-    cusolver_int_t nrhs, float *dA, cusolver_int_t ldda, float *dB,
-    cusolver_int_t lddb, float *dX, cusolver_int_t lddx, void *dWorkspace,
-    size_t lwork_bytes, cusolver_int_t *iter, cusolver_int_t *d_info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, cusolver_int_t,
-      float *, cusolver_int_t, float *, cusolver_int_t, float *, cusolver_int_t,
-      void *, size_t, cusolver_int_t *, cusolver_int_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSSgels");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nrhs, dA, ldda, dB, lddb, dX, lddx, dWorkspace,
-                  lwork_bytes, iter, d_info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSHgels(
-    cusolverDnHandle_t handle, cusolver_int_t m, cusolver_int_t n,
-    cusolver_int_t nrhs, float *dA, cusolver_int_t ldda, float *dB,
-    cusolver_int_t lddb, float *dX, cusolver_int_t lddx, void *dWorkspace,
-    size_t lwork_bytes, cusolver_int_t *iter, cusolver_int_t *d_info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, cusolver_int_t,
-      float *, cusolver_int_t, float *, cusolver_int_t, float *, cusolver_int_t,
-      void *, size_t, cusolver_int_t *, cusolver_int_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSHgels");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nrhs, dA, ldda, dB, lddb, dX, lddx, dWorkspace,
-                  lwork_bytes, iter, d_info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSBgels(
-    cusolverDnHandle_t handle, cusolver_int_t m, cusolver_int_t n,
-    cusolver_int_t nrhs, float *dA, cusolver_int_t ldda, float *dB,
-    cusolver_int_t lddb, float *dX, cusolver_int_t lddx, void *dWorkspace,
-    size_t lwork_bytes, cusolver_int_t *iter, cusolver_int_t *d_info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, cusolver_int_t,
-      float *, cusolver_int_t, float *, cusolver_int_t, float *, cusolver_int_t,
-      void *, size_t, cusolver_int_t *, cusolver_int_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSBgels");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nrhs, dA, ldda, dB, lddb, dX, lddx, dWorkspace,
-                  lwork_bytes, iter, d_info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSXgels(
-    cusolverDnHandle_t handle, cusolver_int_t m, cusolver_int_t n,
-    cusolver_int_t nrhs, float *dA, cusolver_int_t ldda, float *dB,
-    cusolver_int_t lddb, float *dX, cusolver_int_t lddx, void *dWorkspace,
-    size_t lwork_bytes, cusolver_int_t *iter, cusolver_int_t *d_info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, cusolver_int_t,
-      float *, cusolver_int_t, float *, cusolver_int_t, float *, cusolver_int_t,
-      void *, size_t, cusolver_int_t *, cusolver_int_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSXgels");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nrhs, dA, ldda, dB, lddb, dX, lddx, dWorkspace,
-                  lwork_bytes, iter, d_info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZZgels_bufferSize(
-    cusolverDnHandle_t handle, cusolver_int_t m, cusolver_int_t n,
-    cusolver_int_t nrhs, cuDoubleComplex *dA, cusolver_int_t ldda,
-    cuDoubleComplex *dB, cusolver_int_t lddb, cuDoubleComplex *dX,
-    cusolver_int_t lddx, void *dWorkspace, size_t *lwork_bytes) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, cusolver_int_t,
-      cuDoubleComplex *, cusolver_int_t, cuDoubleComplex *, cusolver_int_t,
-      cuDoubleComplex *, cusolver_int_t, void *, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZZgels_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nrhs, dA, ldda, dB, lddb, dX, lddx, dWorkspace,
-                  lwork_bytes);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZCgels_bufferSize(
-    cusolverDnHandle_t handle, cusolver_int_t m, cusolver_int_t n,
-    cusolver_int_t nrhs, cuDoubleComplex *dA, cusolver_int_t ldda,
-    cuDoubleComplex *dB, cusolver_int_t lddb, cuDoubleComplex *dX,
-    cusolver_int_t lddx, void *dWorkspace, size_t *lwork_bytes) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, cusolver_int_t,
-      cuDoubleComplex *, cusolver_int_t, cuDoubleComplex *, cusolver_int_t,
-      cuDoubleComplex *, cusolver_int_t, void *, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZCgels_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nrhs, dA, ldda, dB, lddb, dX, lddx, dWorkspace,
-                  lwork_bytes);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZKgels_bufferSize(
-    cusolverDnHandle_t handle, cusolver_int_t m, cusolver_int_t n,
-    cusolver_int_t nrhs, cuDoubleComplex *dA, cusolver_int_t ldda,
-    cuDoubleComplex *dB, cusolver_int_t lddb, cuDoubleComplex *dX,
-    cusolver_int_t lddx, void *dWorkspace, size_t *lwork_bytes) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, cusolver_int_t,
-      cuDoubleComplex *, cusolver_int_t, cuDoubleComplex *, cusolver_int_t,
-      cuDoubleComplex *, cusolver_int_t, void *, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZKgels_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nrhs, dA, ldda, dB, lddb, dX, lddx, dWorkspace,
-                  lwork_bytes);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZEgels_bufferSize(
-    cusolverDnHandle_t handle, cusolver_int_t m, cusolver_int_t n,
-    cusolver_int_t nrhs, cuDoubleComplex *dA, cusolver_int_t ldda,
-    cuDoubleComplex *dB, cusolver_int_t lddb, cuDoubleComplex *dX,
-    cusolver_int_t lddx, void *dWorkspace, size_t *lwork_bytes) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, cusolver_int_t,
-      cuDoubleComplex *, cusolver_int_t, cuDoubleComplex *, cusolver_int_t,
-      cuDoubleComplex *, cusolver_int_t, void *, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZEgels_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nrhs, dA, ldda, dB, lddb, dX, lddx, dWorkspace,
-                  lwork_bytes);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZYgels_bufferSize(
-    cusolverDnHandle_t handle, cusolver_int_t m, cusolver_int_t n,
-    cusolver_int_t nrhs, cuDoubleComplex *dA, cusolver_int_t ldda,
-    cuDoubleComplex *dB, cusolver_int_t lddb, cuDoubleComplex *dX,
-    cusolver_int_t lddx, void *dWorkspace, size_t *lwork_bytes) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, cusolver_int_t,
-      cuDoubleComplex *, cusolver_int_t, cuDoubleComplex *, cusolver_int_t,
-      cuDoubleComplex *, cusolver_int_t, void *, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZYgels_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nrhs, dA, ldda, dB, lddb, dX, lddx, dWorkspace,
-                  lwork_bytes);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnCCgels_bufferSize(
-    cusolverDnHandle_t handle, cusolver_int_t m, cusolver_int_t n,
-    cusolver_int_t nrhs, cuComplex *dA, cusolver_int_t ldda, cuComplex *dB,
-    cusolver_int_t lddb, cuComplex *dX, cusolver_int_t lddx, void *dWorkspace,
-    size_t *lwork_bytes) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, cusolver_int_t,
-      cuComplex *, cusolver_int_t, cuComplex *, cusolver_int_t, cuComplex *,
-      cusolver_int_t, void *, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnCCgels_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nrhs, dA, ldda, dB, lddb, dX, lddx, dWorkspace,
-                  lwork_bytes);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnCKgels_bufferSize(
-    cusolverDnHandle_t handle, cusolver_int_t m, cusolver_int_t n,
-    cusolver_int_t nrhs, cuComplex *dA, cusolver_int_t ldda, cuComplex *dB,
-    cusolver_int_t lddb, cuComplex *dX, cusolver_int_t lddx, void *dWorkspace,
-    size_t *lwork_bytes) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, cusolver_int_t,
-      cuComplex *, cusolver_int_t, cuComplex *, cusolver_int_t, cuComplex *,
-      cusolver_int_t, void *, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnCKgels_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nrhs, dA, ldda, dB, lddb, dX, lddx, dWorkspace,
-                  lwork_bytes);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnCEgels_bufferSize(
-    cusolverDnHandle_t handle, cusolver_int_t m, cusolver_int_t n,
-    cusolver_int_t nrhs, cuComplex *dA, cusolver_int_t ldda, cuComplex *dB,
-    cusolver_int_t lddb, cuComplex *dX, cusolver_int_t lddx, void *dWorkspace,
-    size_t *lwork_bytes) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, cusolver_int_t,
-      cuComplex *, cusolver_int_t, cuComplex *, cusolver_int_t, cuComplex *,
-      cusolver_int_t, void *, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnCEgels_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nrhs, dA, ldda, dB, lddb, dX, lddx, dWorkspace,
-                  lwork_bytes);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnCYgels_bufferSize(
-    cusolverDnHandle_t handle, cusolver_int_t m, cusolver_int_t n,
-    cusolver_int_t nrhs, cuComplex *dA, cusolver_int_t ldda, cuComplex *dB,
-    cusolver_int_t lddb, cuComplex *dX, cusolver_int_t lddx, void *dWorkspace,
-    size_t *lwork_bytes) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, cusolver_int_t,
-      cuComplex *, cusolver_int_t, cuComplex *, cusolver_int_t, cuComplex *,
-      cusolver_int_t, void *, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnCYgels_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nrhs, dA, ldda, dB, lddb, dX, lddx, dWorkspace,
-                  lwork_bytes);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDDgels_bufferSize(
-    cusolverDnHandle_t handle, cusolver_int_t m, cusolver_int_t n,
-    cusolver_int_t nrhs, double *dA, cusolver_int_t ldda, double *dB,
-    cusolver_int_t lddb, double *dX, cusolver_int_t lddx, void *dWorkspace,
-    size_t *lwork_bytes) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, cusolver_int_t,
-      double *, cusolver_int_t, double *, cusolver_int_t, double *,
-      cusolver_int_t, void *, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDDgels_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nrhs, dA, ldda, dB, lddb, dX, lddx, dWorkspace,
-                  lwork_bytes);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDSgels_bufferSize(
-    cusolverDnHandle_t handle, cusolver_int_t m, cusolver_int_t n,
-    cusolver_int_t nrhs, double *dA, cusolver_int_t ldda, double *dB,
-    cusolver_int_t lddb, double *dX, cusolver_int_t lddx, void *dWorkspace,
-    size_t *lwork_bytes) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, cusolver_int_t,
-      double *, cusolver_int_t, double *, cusolver_int_t, double *,
-      cusolver_int_t, void *, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDSgels_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nrhs, dA, ldda, dB, lddb, dX, lddx, dWorkspace,
-                  lwork_bytes);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDHgels_bufferSize(
-    cusolverDnHandle_t handle, cusolver_int_t m, cusolver_int_t n,
-    cusolver_int_t nrhs, double *dA, cusolver_int_t ldda, double *dB,
-    cusolver_int_t lddb, double *dX, cusolver_int_t lddx, void *dWorkspace,
-    size_t *lwork_bytes) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, cusolver_int_t,
-      double *, cusolver_int_t, double *, cusolver_int_t, double *,
-      cusolver_int_t, void *, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDHgels_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nrhs, dA, ldda, dB, lddb, dX, lddx, dWorkspace,
-                  lwork_bytes);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDBgels_bufferSize(
-    cusolverDnHandle_t handle, cusolver_int_t m, cusolver_int_t n,
-    cusolver_int_t nrhs, double *dA, cusolver_int_t ldda, double *dB,
-    cusolver_int_t lddb, double *dX, cusolver_int_t lddx, void *dWorkspace,
-    size_t *lwork_bytes) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, cusolver_int_t,
-      double *, cusolver_int_t, double *, cusolver_int_t, double *,
-      cusolver_int_t, void *, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDBgels_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nrhs, dA, ldda, dB, lddb, dX, lddx, dWorkspace,
-                  lwork_bytes);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDXgels_bufferSize(
-    cusolverDnHandle_t handle, cusolver_int_t m, cusolver_int_t n,
-    cusolver_int_t nrhs, double *dA, cusolver_int_t ldda, double *dB,
-    cusolver_int_t lddb, double *dX, cusolver_int_t lddx, void *dWorkspace,
-    size_t *lwork_bytes) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, cusolver_int_t,
-      double *, cusolver_int_t, double *, cusolver_int_t, double *,
-      cusolver_int_t, void *, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDXgels_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nrhs, dA, ldda, dB, lddb, dX, lddx, dWorkspace,
-                  lwork_bytes);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSSgels_bufferSize(
-    cusolverDnHandle_t handle, cusolver_int_t m, cusolver_int_t n,
-    cusolver_int_t nrhs, float *dA, cusolver_int_t ldda, float *dB,
-    cusolver_int_t lddb, float *dX, cusolver_int_t lddx, void *dWorkspace,
-    size_t *lwork_bytes) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, cusolver_int_t,
-      float *, cusolver_int_t, float *, cusolver_int_t, float *, cusolver_int_t,
-      void *, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSSgels_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nrhs, dA, ldda, dB, lddb, dX, lddx, dWorkspace,
-                  lwork_bytes);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSHgels_bufferSize(
-    cusolverDnHandle_t handle, cusolver_int_t m, cusolver_int_t n,
-    cusolver_int_t nrhs, float *dA, cusolver_int_t ldda, float *dB,
-    cusolver_int_t lddb, float *dX, cusolver_int_t lddx, void *dWorkspace,
-    size_t *lwork_bytes) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, cusolver_int_t,
-      float *, cusolver_int_t, float *, cusolver_int_t, float *, cusolver_int_t,
-      void *, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSHgels_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nrhs, dA, ldda, dB, lddb, dX, lddx, dWorkspace,
-                  lwork_bytes);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSBgels_bufferSize(
-    cusolverDnHandle_t handle, cusolver_int_t m, cusolver_int_t n,
-    cusolver_int_t nrhs, float *dA, cusolver_int_t ldda, float *dB,
-    cusolver_int_t lddb, float *dX, cusolver_int_t lddx, void *dWorkspace,
-    size_t *lwork_bytes) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, cusolver_int_t,
-      float *, cusolver_int_t, float *, cusolver_int_t, float *, cusolver_int_t,
-      void *, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSBgels_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nrhs, dA, ldda, dB, lddb, dX, lddx, dWorkspace,
-                  lwork_bytes);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSXgels_bufferSize(
-    cusolverDnHandle_t handle, cusolver_int_t m, cusolver_int_t n,
-    cusolver_int_t nrhs, float *dA, cusolver_int_t ldda, float *dB,
-    cusolver_int_t lddb, float *dX, cusolver_int_t lddx, void *dWorkspace,
-    size_t *lwork_bytes) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, cusolver_int_t,
-      float *, cusolver_int_t, float *, cusolver_int_t, float *, cusolver_int_t,
-      void *, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSXgels_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nrhs, dA, ldda, dB, lddb, dX, lddx, dWorkspace,
-                  lwork_bytes);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnIRSXgesv(
-    cusolverDnHandle_t handle, cusolverDnIRSParams_t gesv_irs_params,
-    cusolverDnIRSInfos_t gesv_irs_infos, cusolver_int_t n, cusolver_int_t nrhs,
-    void *dA, cusolver_int_t ldda, void *dB, cusolver_int_t lddb, void *dX,
-    cusolver_int_t lddx, void *dWorkspace, size_t lwork_bytes,
-    cusolver_int_t *niters, cusolver_int_t *d_info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverDnIRSParams_t, cusolverDnIRSInfos_t,
-      cusolver_int_t, cusolver_int_t, void *, cusolver_int_t, void *,
-      cusolver_int_t, void *, cusolver_int_t, void *, size_t, cusolver_int_t *,
-      cusolver_int_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnIRSXgesv");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, gesv_irs_params, gesv_irs_infos, n, nrhs, dA, ldda,
-                  dB, lddb, dX, lddx, dWorkspace, lwork_bytes, niters, d_info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnIRSXgesv_bufferSize(
-    cusolverDnHandle_t handle, cusolverDnIRSParams_t params, cusolver_int_t n,
-    cusolver_int_t nrhs, size_t *lwork_bytes) {
-  using FuncPtr =
-      cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, cusolverDnIRSParams_t,
-                                      cusolver_int_t, cusolver_int_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnIRSXgesv_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, params, n, nrhs, lwork_bytes);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnIRSXgels(
-    cusolverDnHandle_t handle, cusolverDnIRSParams_t gels_irs_params,
-    cusolverDnIRSInfos_t gels_irs_infos, cusolver_int_t m, cusolver_int_t n,
-    cusolver_int_t nrhs, void *dA, cusolver_int_t ldda, void *dB,
-    cusolver_int_t lddb, void *dX, cusolver_int_t lddx, void *dWorkspace,
-    size_t lwork_bytes, cusolver_int_t *niters, cusolver_int_t *d_info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverDnIRSParams_t, cusolverDnIRSInfos_t,
-      cusolver_int_t, cusolver_int_t, cusolver_int_t, void *, cusolver_int_t,
-      void *, cusolver_int_t, void *, cusolver_int_t, void *, size_t,
-      cusolver_int_t *, cusolver_int_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnIRSXgels");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, gels_irs_params, gels_irs_infos, m, n, nrhs, dA, ldda,
-                  dB, lddb, dX, lddx, dWorkspace, lwork_bytes, niters, d_info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnIRSXgels_bufferSize(
-    cusolverDnHandle_t handle, cusolverDnIRSParams_t params, cusolver_int_t m,
-    cusolver_int_t n, cusolver_int_t nrhs, size_t *lwork_bytes) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverDnIRSParams_t, cusolver_int_t, cusolver_int_t,
-      cusolver_int_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnIRSXgels_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, params, m, n, nrhs, lwork_bytes);
-}
-
-cusolverStatus_t CUSOLVERAPI
-cusolverDnSpotrf_bufferSize(cusolverDnHandle_t handle, cublasFillMode_t uplo,
-                            int n, float *A, int lda, int *Lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, float *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSpotrf_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, A, lda, Lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI
-cusolverDnDpotrf_bufferSize(cusolverDnHandle_t handle, cublasFillMode_t uplo,
-                            int n, double *A, int lda, int *Lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, double *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDpotrf_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, A, lda, Lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI
-cusolverDnCpotrf_bufferSize(cusolverDnHandle_t handle, cublasFillMode_t uplo,
-                            int n, cuComplex *A, int lda, int *Lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, cuComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnCpotrf_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, A, lda, Lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI
-cusolverDnZpotrf_bufferSize(cusolverDnHandle_t handle, cublasFillMode_t uplo,
-                            int n, cuDoubleComplex *A, int lda, int *Lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, cuDoubleComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZpotrf_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, A, lda, Lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSpotrf(cusolverDnHandle_t handle,
-                                              cublasFillMode_t uplo, int n,
-                                              float *A, int lda,
-                                              float *Workspace, int Lwork,
-                                              int *devInfo) {
-  using FuncPtr =
-      cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, cublasFillMode_t, int,
-                                      float *, int, float *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSpotrf");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, A, lda, Workspace, Lwork, devInfo);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDpotrf(cusolverDnHandle_t handle,
-                                              cublasFillMode_t uplo, int n,
-                                              double *A, int lda,
-                                              double *Workspace, int Lwork,
-                                              int *devInfo) {
-  using FuncPtr =
-      cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, cublasFillMode_t, int,
-                                      double *, int, double *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDpotrf");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, A, lda, Workspace, Lwork, devInfo);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnCpotrf(cusolverDnHandle_t handle,
-                                              cublasFillMode_t uplo, int n,
-                                              cuComplex *A, int lda,
-                                              cuComplex *Workspace, int Lwork,
-                                              int *devInfo) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, cuComplex *, int, cuComplex *,
-      int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnCpotrf");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, A, lda, Workspace, Lwork, devInfo);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZpotrf(cusolverDnHandle_t handle,
-                                              cublasFillMode_t uplo, int n,
-                                              cuDoubleComplex *A, int lda,
-                                              cuDoubleComplex *Workspace,
-                                              int Lwork, int *devInfo) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, cuDoubleComplex *, int,
-      cuDoubleComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZpotrf");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, A, lda, Workspace, Lwork, devInfo);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSpotrs(cusolverDnHandle_t handle,
-                                              cublasFillMode_t uplo, int n,
-                                              int nrhs, const float *A, int lda,
-                                              float *B, int ldb, int *devInfo) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, int, const float *, int,
-      float *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSpotrs");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, nrhs, A, lda, B, ldb, devInfo);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDpotrs(cusolverDnHandle_t handle,
-                                              cublasFillMode_t uplo, int n,
-                                              int nrhs, const double *A,
-                                              int lda, double *B, int ldb,
-                                              int *devInfo) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, int, const double *, int,
-      double *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDpotrs");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, nrhs, A, lda, B, ldb, devInfo);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnCpotrs(cusolverDnHandle_t handle,
-                                              cublasFillMode_t uplo, int n,
-                                              int nrhs, const cuComplex *A,
-                                              int lda, cuComplex *B, int ldb,
-                                              int *devInfo) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, int, const cuComplex *, int,
-      cuComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnCpotrs");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, nrhs, A, lda, B, ldb, devInfo);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZpotrs(cusolverDnHandle_t handle,
-                                              cublasFillMode_t uplo, int n,
-                                              int nrhs,
-                                              const cuDoubleComplex *A, int lda,
-                                              cuDoubleComplex *B, int ldb,
-                                              int *devInfo) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, int, const cuDoubleComplex *,
-      int, cuDoubleComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZpotrs");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, nrhs, A, lda, B, ldb, devInfo);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSpotrfBatched(cusolverDnHandle_t handle,
-                                                     cublasFillMode_t uplo,
-                                                     int n, float *Aarray[],
-                                                     int lda, int *infoArray,
-                                                     int batchSize) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, float *[], int, int *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSpotrfBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, Aarray, lda, infoArray, batchSize);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDpotrfBatched(cusolverDnHandle_t handle,
-                                                     cublasFillMode_t uplo,
-                                                     int n, double *Aarray[],
-                                                     int lda, int *infoArray,
-                                                     int batchSize) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, double *[], int, int *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDpotrfBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, Aarray, lda, infoArray, batchSize);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnCpotrfBatched(cusolverDnHandle_t handle,
-                                                     cublasFillMode_t uplo,
-                                                     int n, cuComplex *Aarray[],
-                                                     int lda, int *infoArray,
-                                                     int batchSize) {
-  using FuncPtr =
-      cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, cublasFillMode_t, int,
-                                      cuComplex *[], int, int *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnCpotrfBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, Aarray, lda, infoArray, batchSize);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZpotrfBatched(
-    cusolverDnHandle_t handle, cublasFillMode_t uplo, int n,
-    cuDoubleComplex *Aarray[], int lda, int *infoArray, int batchSize) {
-  using FuncPtr =
-      cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, cublasFillMode_t, int,
-                                      cuDoubleComplex *[], int, int *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZpotrfBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, Aarray, lda, infoArray, batchSize);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSpotrsBatched(
-    cusolverDnHandle_t handle, cublasFillMode_t uplo, int n,
-    int nrhs, /* only support rhs = 1*/
-    float *A[], int lda, float *B[], int ldb, int *d_info, int batchSize) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, int, float *[], int, float *[],
-      int, int *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSpotrsBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, nrhs, A, lda, B, ldb, d_info, batchSize);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDpotrsBatched(
-    cusolverDnHandle_t handle, cublasFillMode_t uplo, int n,
-    int nrhs, /* only support rhs = 1*/
-    double *A[], int lda, double *B[], int ldb, int *d_info, int batchSize) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, int, double *[], int,
-      double *[], int, int *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDpotrsBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, nrhs, A, lda, B, ldb, d_info, batchSize);
-}
-
-cusolverStatus_t CUSOLVERAPI
-cusolverDnCpotrsBatched(cusolverDnHandle_t handle, cublasFillMode_t uplo, int n,
-                        int nrhs, /* only support rhs = 1*/
-                        cuComplex *A[], int lda, cuComplex *B[], int ldb,
-                        int *d_info, int batchSize) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, int, cuComplex *[], int,
-      cuComplex *[], int, int *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnCpotrsBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, nrhs, A, lda, B, ldb, d_info, batchSize);
-}
-
-cusolverStatus_t CUSOLVERAPI
-cusolverDnZpotrsBatched(cusolverDnHandle_t handle, cublasFillMode_t uplo, int n,
-                        int nrhs, /* only support rhs = 1*/
-                        cuDoubleComplex *A[], int lda, cuDoubleComplex *B[],
-                        int ldb, int *d_info, int batchSize) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, int, cuDoubleComplex *[], int,
-      cuDoubleComplex *[], int, int *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZpotrsBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, nrhs, A, lda, B, ldb, d_info, batchSize);
-}
-
-cusolverStatus_t CUSOLVERAPI
-cusolverDnSpotri_bufferSize(cusolverDnHandle_t handle, cublasFillMode_t uplo,
-                            int n, float *A, int lda, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, float *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSpotri_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, A, lda, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI
-cusolverDnDpotri_bufferSize(cusolverDnHandle_t handle, cublasFillMode_t uplo,
-                            int n, double *A, int lda, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, double *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDpotri_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, A, lda, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI
-cusolverDnCpotri_bufferSize(cusolverDnHandle_t handle, cublasFillMode_t uplo,
-                            int n, cuComplex *A, int lda, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, cuComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnCpotri_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, A, lda, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI
-cusolverDnZpotri_bufferSize(cusolverDnHandle_t handle, cublasFillMode_t uplo,
-                            int n, cuDoubleComplex *A, int lda, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, cuDoubleComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZpotri_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, A, lda, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSpotri(cusolverDnHandle_t handle,
-                                              cublasFillMode_t uplo, int n,
-                                              float *A, int lda, float *work,
-                                              int lwork, int *devInfo) {
-  using FuncPtr =
-      cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, cublasFillMode_t, int,
-                                      float *, int, float *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSpotri");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, A, lda, work, lwork, devInfo);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDpotri(cusolverDnHandle_t handle,
-                                              cublasFillMode_t uplo, int n,
-                                              double *A, int lda, double *work,
-                                              int lwork, int *devInfo) {
-  using FuncPtr =
-      cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, cublasFillMode_t, int,
-                                      double *, int, double *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDpotri");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, A, lda, work, lwork, devInfo);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnCpotri(cusolverDnHandle_t handle,
-                                              cublasFillMode_t uplo, int n,
-                                              cuComplex *A, int lda,
-                                              cuComplex *work, int lwork,
-                                              int *devInfo) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, cuComplex *, int, cuComplex *,
-      int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnCpotri");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, A, lda, work, lwork, devInfo);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZpotri(cusolverDnHandle_t handle,
-                                              cublasFillMode_t uplo, int n,
-                                              cuDoubleComplex *A, int lda,
-                                              cuDoubleComplex *work, int lwork,
-                                              int *devInfo) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, cuDoubleComplex *, int,
-      cuDoubleComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZpotri");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, A, lda, work, lwork, devInfo);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnStrtri_bufferSize(
-    cusolverDnHandle_t handle, cublasFillMode_t uplo, cublasDiagType_t diag,
-    int n, float *A, int lda, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, cublasDiagType_t, int, float *, int,
-      int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnStrtri_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, diag, n, A, lda, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDtrtri_bufferSize(
-    cusolverDnHandle_t handle, cublasFillMode_t uplo, cublasDiagType_t diag,
-    int n, double *A, int lda, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, cublasDiagType_t, int, double *,
-      int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDtrtri_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, diag, n, A, lda, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnCtrtri_bufferSize(
-    cusolverDnHandle_t handle, cublasFillMode_t uplo, cublasDiagType_t diag,
-    int n, cuComplex *A, int lda, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, cublasDiagType_t, int, cuComplex *,
-      int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnCtrtri_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, diag, n, A, lda, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZtrtri_bufferSize(
-    cusolverDnHandle_t handle, cublasFillMode_t uplo, cublasDiagType_t diag,
-    int n, cuDoubleComplex *A, int lda, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, cublasDiagType_t, int,
-      cuDoubleComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZtrtri_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, diag, n, A, lda, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnStrtri(cusolverDnHandle_t handle,
-                                              cublasFillMode_t uplo,
-                                              cublasDiagType_t diag, int n,
-                                              float *A, int lda, float *work,
-                                              int lwork, int *devInfo) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, cublasDiagType_t, int, float *, int,
-      float *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnStrtri");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, diag, n, A, lda, work, lwork, devInfo);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDtrtri(cusolverDnHandle_t handle,
-                                              cublasFillMode_t uplo,
-                                              cublasDiagType_t diag, int n,
-                                              double *A, int lda, double *work,
-                                              int lwork, int *devInfo) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, cublasDiagType_t, int, double *,
-      int, double *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDtrtri");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, diag, n, A, lda, work, lwork, devInfo);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnCtrtri(
-    cusolverDnHandle_t handle, cublasFillMode_t uplo, cublasDiagType_t diag,
-    int n, cuComplex *A, int lda, cuComplex *work, int lwork, int *devInfo) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, cublasDiagType_t, int, cuComplex *,
-      int, cuComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnCtrtri");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, diag, n, A, lda, work, lwork, devInfo);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZtrtri(cusolverDnHandle_t handle,
-                                              cublasFillMode_t uplo,
-                                              cublasDiagType_t diag, int n,
-                                              cuDoubleComplex *A, int lda,
-                                              cuDoubleComplex *work, int lwork,
-                                              int *devInfo) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, cublasDiagType_t, int,
-      cuDoubleComplex *, int, cuDoubleComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZtrtri");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, diag, n, A, lda, work, lwork, devInfo);
-}
-
-cusolverStatus_t CUSOLVERAPI
-cusolverDnSlauum_bufferSize(cusolverDnHandle_t handle, cublasFillMode_t uplo,
-                            int n, float *A, int lda, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, float *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSlauum_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, A, lda, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI
-cusolverDnDlauum_bufferSize(cusolverDnHandle_t handle, cublasFillMode_t uplo,
-                            int n, double *A, int lda, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, double *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDlauum_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, A, lda, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI
-cusolverDnClauum_bufferSize(cusolverDnHandle_t handle, cublasFillMode_t uplo,
-                            int n, cuComplex *A, int lda, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, cuComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnClauum_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, A, lda, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI
-cusolverDnZlauum_bufferSize(cusolverDnHandle_t handle, cublasFillMode_t uplo,
-                            int n, cuDoubleComplex *A, int lda, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, cuDoubleComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZlauum_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, A, lda, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSlauum(cusolverDnHandle_t handle,
-                                              cublasFillMode_t uplo, int n,
-                                              float *A, int lda, float *work,
-                                              int lwork, int *devInfo) {
-  using FuncPtr =
-      cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, cublasFillMode_t, int,
-                                      float *, int, float *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSlauum");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, A, lda, work, lwork, devInfo);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDlauum(cusolverDnHandle_t handle,
-                                              cublasFillMode_t uplo, int n,
-                                              double *A, int lda, double *work,
-                                              int lwork, int *devInfo) {
-  using FuncPtr =
-      cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, cublasFillMode_t, int,
-                                      double *, int, double *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDlauum");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, A, lda, work, lwork, devInfo);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnClauum(cusolverDnHandle_t handle,
-                                              cublasFillMode_t uplo, int n,
-                                              cuComplex *A, int lda,
-                                              cuComplex *work, int lwork,
-                                              int *devInfo) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, cuComplex *, int, cuComplex *,
-      int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnClauum");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, A, lda, work, lwork, devInfo);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZlauum(cusolverDnHandle_t handle,
-                                              cublasFillMode_t uplo, int n,
-                                              cuDoubleComplex *A, int lda,
-                                              cuDoubleComplex *work, int lwork,
-                                              int *devInfo) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, cuDoubleComplex *, int,
-      cuDoubleComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZlauum");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, A, lda, work, lwork, devInfo);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSgetrf_bufferSize(
-    cusolverDnHandle_t handle, int m, int n, float *A, int lda, int *Lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, int, int,
-                                                  float *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSgetrf_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, A, lda, Lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDgetrf_bufferSize(
-    cusolverDnHandle_t handle, int m, int n, double *A, int lda, int *Lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, int, int,
-                                                  double *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDgetrf_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, A, lda, Lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI
-cusolverDnCgetrf_bufferSize(cusolverDnHandle_t handle, int m, int n,
-                            cuComplex *A, int lda, int *Lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, int, int,
-                                                  cuComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnCgetrf_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, A, lda, Lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI
-cusolverDnZgetrf_bufferSize(cusolverDnHandle_t handle, int m, int n,
-                            cuDoubleComplex *A, int lda, int *Lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, int, int, cuDoubleComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZgetrf_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, A, lda, Lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSgetrf(cusolverDnHandle_t handle, int m,
-                                              int n, float *A, int lda,
-                                              float *Workspace, int *devIpiv,
-                                              int *devInfo) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, int, int, float *, int, float *, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSgetrf");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, A, lda, Workspace, devIpiv, devInfo);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDgetrf(cusolverDnHandle_t handle, int m,
-                                              int n, double *A, int lda,
-                                              double *Workspace, int *devIpiv,
-                                              int *devInfo) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, int, int, double *, int, double *, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDgetrf");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, A, lda, Workspace, devIpiv, devInfo);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnCgetrf(cusolverDnHandle_t handle, int m,
-                                              int n, cuComplex *A, int lda,
-                                              cuComplex *Workspace,
-                                              int *devIpiv, int *devInfo) {
-  using FuncPtr =
-      cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, int, int, cuComplex *,
-                                      int, cuComplex *, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnCgetrf");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, A, lda, Workspace, devIpiv, devInfo);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZgetrf(cusolverDnHandle_t handle, int m,
-                                              int n, cuDoubleComplex *A,
-                                              int lda,
-                                              cuDoubleComplex *Workspace,
-                                              int *devIpiv, int *devInfo) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, int, int, cuDoubleComplex *, int, cuDoubleComplex *,
-      int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZgetrf");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, A, lda, Workspace, devIpiv, devInfo);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSlaswp(cusolverDnHandle_t handle, int n,
-                                              float *A, int lda, int k1, int k2,
-                                              const int *devIpiv, int incx) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, int, float *, int, int, int, const int *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSlaswp");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, A, lda, k1, k2, devIpiv, incx);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDlaswp(cusolverDnHandle_t handle, int n,
-                                              double *A, int lda, int k1,
-                                              int k2, const int *devIpiv,
-                                              int incx) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, int, double *, int, int, int, const int *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDlaswp");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, A, lda, k1, k2, devIpiv, incx);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnClaswp(cusolverDnHandle_t handle, int n,
-                                              cuComplex *A, int lda, int k1,
-                                              int k2, const int *devIpiv,
-                                              int incx) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, int, cuComplex *, int, int, int, const int *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnClaswp");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, A, lda, k1, k2, devIpiv, incx);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZlaswp(cusolverDnHandle_t handle, int n,
-                                              cuDoubleComplex *A, int lda,
-                                              int k1, int k2,
-                                              const int *devIpiv, int incx) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, int,
-                                                  cuDoubleComplex *, int, int,
-                                                  int, const int *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZlaswp");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, A, lda, k1, k2, devIpiv, incx);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSgetrs(cusolverDnHandle_t handle,
-                                              cublasOperation_t trans, int n,
-                                              int nrhs, const float *A, int lda,
-                                              const int *devIpiv, float *B,
-                                              int ldb, int *devInfo) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasOperation_t, int, int, const float *, int,
-      const int *, float *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSgetrs");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, trans, n, nrhs, A, lda, devIpiv, B, ldb, devInfo);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDgetrs(cusolverDnHandle_t handle,
-                                              cublasOperation_t trans, int n,
-                                              int nrhs, const double *A,
-                                              int lda, const int *devIpiv,
-                                              double *B, int ldb,
-                                              int *devInfo) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasOperation_t, int, int, const double *, int,
-      const int *, double *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDgetrs");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, trans, n, nrhs, A, lda, devIpiv, B, ldb, devInfo);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnCgetrs(cusolverDnHandle_t handle,
-                                              cublasOperation_t trans, int n,
-                                              int nrhs, const cuComplex *A,
-                                              int lda, const int *devIpiv,
-                                              cuComplex *B, int ldb,
-                                              int *devInfo) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasOperation_t, int, int, const cuComplex *, int,
-      const int *, cuComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnCgetrs");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, trans, n, nrhs, A, lda, devIpiv, B, ldb, devInfo);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZgetrs(
-    cusolverDnHandle_t handle, cublasOperation_t trans, int n, int nrhs,
-    const cuDoubleComplex *A, int lda, const int *devIpiv, cuDoubleComplex *B,
-    int ldb, int *devInfo) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasOperation_t, int, int, const cuDoubleComplex *,
-      int, const int *, cuDoubleComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZgetrs");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, trans, n, nrhs, A, lda, devIpiv, B, ldb, devInfo);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSgeqrf_bufferSize(
-    cusolverDnHandle_t handle, int m, int n, float *A, int lda, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, int, int,
-                                                  float *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSgeqrf_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, A, lda, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDgeqrf_bufferSize(
-    cusolverDnHandle_t handle, int m, int n, double *A, int lda, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, int, int,
-                                                  double *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDgeqrf_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, A, lda, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI
-cusolverDnCgeqrf_bufferSize(cusolverDnHandle_t handle, int m, int n,
-                            cuComplex *A, int lda, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, int, int,
-                                                  cuComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnCgeqrf_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, A, lda, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI
-cusolverDnZgeqrf_bufferSize(cusolverDnHandle_t handle, int m, int n,
-                            cuDoubleComplex *A, int lda, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, int, int, cuDoubleComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZgeqrf_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, A, lda, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSgeqrf(cusolverDnHandle_t handle, int m,
-                                              int n, float *A, int lda,
-                                              float *TAU, float *Workspace,
-                                              int Lwork, int *devInfo) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, int, int, float *, int, float *, float *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSgeqrf");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, A, lda, TAU, Workspace, Lwork, devInfo);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDgeqrf(cusolverDnHandle_t handle, int m,
-                                              int n, double *A, int lda,
-                                              double *TAU, double *Workspace,
-                                              int Lwork, int *devInfo) {
-  using FuncPtr =
-      cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, int, int, double *,
-                                      int, double *, double *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDgeqrf");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, A, lda, TAU, Workspace, Lwork, devInfo);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnCgeqrf(cusolverDnHandle_t handle, int m,
-                                              int n, cuComplex *A, int lda,
-                                              cuComplex *TAU,
-                                              cuComplex *Workspace, int Lwork,
-                                              int *devInfo) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, int, int,
-                                                  cuComplex *, int, cuComplex *,
-                                                  cuComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnCgeqrf");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, A, lda, TAU, Workspace, Lwork, devInfo);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZgeqrf(cusolverDnHandle_t handle, int m,
-                                              int n, cuDoubleComplex *A,
-                                              int lda, cuDoubleComplex *TAU,
-                                              cuDoubleComplex *Workspace,
-                                              int Lwork, int *devInfo) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, int, int, cuDoubleComplex *, int, cuDoubleComplex *,
-      cuDoubleComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZgeqrf");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, A, lda, TAU, Workspace, Lwork, devInfo);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverSpScsrlsvqr(cusolverSpHandle_t handle,
-    int m, int nnz,
-    const cusparseMatDescr_t descrA,
-    const float *csrValA,
-    const int *csrRowPtrA,
-    const int *csrColIndA,
-    const float *b, float tol,
-    int reorder, float *x,
-    int *singularity) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-    cusolverSpHandle_t, int, int, const cusparseMatDescr_t, const float *,
-    const int *, const int *, const float *, float, int, float *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverSpScsrlsvqr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrValA, csrRowPtrA, csrColIndA,
-                  b, tol, reorder, x, singularity);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverSpDcsrlsvqr(cusolverSpHandle_t handle,
-    int m, int nnz,
-    const cusparseMatDescr_t descrA,
-    const double *csrValA,
-    const int *csrRowPtrA,
-    const int *csrColIndA,
-    const double *b, double tol,
-    int reorder, double *x,
-    int *singularity) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-    cusolverSpHandle_t, int, int, const cusparseMatDescr_t, const double *,
-    const int *, const int *, const double *, double, int, double *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverSpDcsrlsvqr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrValA, csrRowPtrA, csrColIndA,
-                  b, tol, reorder, x, singularity);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverSpCcsrlsvqr(cusolverSpHandle_t handle,
-    int m, int nnz,
-    const cusparseMatDescr_t descrA,
-    const cuComplex *csrValA,
-    const int *csrRowPtrA,
-    const int *csrColIndA,
-    const cuComplex *b, float tol,
-    int reorder, cuComplex *x,
-    int *singularity) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-    cusolverSpHandle_t, int, int, const cusparseMatDescr_t, const cuComplex *,
-    const int *, const int *, const cuComplex *, float, int,
-    cuComplex *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverSpCcsrlsvqr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrValA, csrRowPtrA, csrColIndA,
-                  b, tol, reorder, x, singularity);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverSpZcsrlsvqr(cusolverSpHandle_t handle,
-    int m, int nnz,
-    const cusparseMatDescr_t descrA,
-    const cuDoubleComplex *csrValA,
-    const int *csrRowPtrA,
-    const int *csrColIndA,
-    const cuDoubleComplex *b,
-    double tol,
-    int reorder,
-    cuDoubleComplex *x,
-    int *singularity) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-    cusolverSpHandle_t, int, int, const cusparseMatDescr_t,
-    const cuDoubleComplex *, const int *, const int *, const cuDoubleComplex *,
-    double, int, cuDoubleComplex *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverSpZcsrlsvqr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrValA, csrRowPtrA, csrColIndA,
-                  b, tol, reorder, x, singularity);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSorgqr_bufferSize(
-    cusolverDnHandle_t handle, int m, int n, int k, const float *A, int lda,
-    const float *tau, int *lwork) {
-  using FuncPtr =
-      cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, int, int, int,
-                                      const float *, int, const float *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSorgqr_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, k, A, lda, tau, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDorgqr_bufferSize(
-    cusolverDnHandle_t handle, int m, int n, int k, const double *A, int lda,
-    const double *tau, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, int, int,
-                                                  int, const double *, int,
-                                                  const double *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDorgqr_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, k, A, lda, tau, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnCungqr_bufferSize(
-    cusolverDnHandle_t handle, int m, int n, int k, const cuComplex *A, int lda,
-    const cuComplex *tau, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, int, int,
-                                                  int, const cuComplex *, int,
-                                                  const cuComplex *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnCungqr_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, k, A, lda, tau, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZungqr_bufferSize(
-    cusolverDnHandle_t handle, int m, int n, int k, const cuDoubleComplex *A,
-    int lda, const cuDoubleComplex *tau, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, int, int, int, const cuDoubleComplex *, int,
-      const cuDoubleComplex *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZungqr_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, k, A, lda, tau, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSorgqr(cusolverDnHandle_t handle, int m,
-                                              int n, int k, float *A, int lda,
-                                              const float *tau, float *work,
-                                              int lwork, int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, int, int, int, float *, int, const float *, float *,
-      int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSorgqr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, k, A, lda, tau, work, lwork, info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDorgqr(cusolverDnHandle_t handle, int m,
-                                              int n, int k, double *A, int lda,
-                                              const double *tau, double *work,
-                                              int lwork, int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, int, int, int, double *, int, const double *,
-      double *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDorgqr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, k, A, lda, tau, work, lwork, info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnCungqr(cusolverDnHandle_t handle, int m,
-                                              int n, int k, cuComplex *A,
-                                              int lda, const cuComplex *tau,
-                                              cuComplex *work, int lwork,
-                                              int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, int, int, int, cuComplex *, int, const cuComplex *,
-      cuComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnCungqr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, k, A, lda, tau, work, lwork, info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZungqr(
-    cusolverDnHandle_t handle, int m, int n, int k, cuDoubleComplex *A, int lda,
-    const cuDoubleComplex *tau, cuDoubleComplex *work, int lwork, int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, int, int, int, cuDoubleComplex *, int,
-      const cuDoubleComplex *, cuDoubleComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZungqr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, k, A, lda, tau, work, lwork, info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSormqr_bufferSize(
-    cusolverDnHandle_t handle, cublasSideMode_t side, cublasOperation_t trans,
-    int m, int n, int k, const float *A, int lda, const float *tau,
-    const float *C, int ldc, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasSideMode_t, cublasOperation_t, int, int, int,
-      const float *, int, const float *, const float *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSormqr_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, side, trans, m, n, k, A, lda, tau, C, ldc, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDormqr_bufferSize(
-    cusolverDnHandle_t handle, cublasSideMode_t side, cublasOperation_t trans,
-    int m, int n, int k, const double *A, int lda, const double *tau,
-    const double *C, int ldc, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasSideMode_t, cublasOperation_t, int, int, int,
-      const double *, int, const double *, const double *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDormqr_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, side, trans, m, n, k, A, lda, tau, C, ldc, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnCunmqr_bufferSize(
-    cusolverDnHandle_t handle, cublasSideMode_t side, cublasOperation_t trans,
-    int m, int n, int k, const cuComplex *A, int lda, const cuComplex *tau,
-    const cuComplex *C, int ldc, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasSideMode_t, cublasOperation_t, int, int, int,
-      const cuComplex *, int, const cuComplex *, const cuComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnCunmqr_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, side, trans, m, n, k, A, lda, tau, C, ldc, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZunmqr_bufferSize(
-    cusolverDnHandle_t handle, cublasSideMode_t side, cublasOperation_t trans,
-    int m, int n, int k, const cuDoubleComplex *A, int lda,
-    const cuDoubleComplex *tau, const cuDoubleComplex *C, int ldc, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasSideMode_t, cublasOperation_t, int, int, int,
-      const cuDoubleComplex *, int, const cuDoubleComplex *,
-      const cuDoubleComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZunmqr_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, side, trans, m, n, k, A, lda, tau, C, ldc, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSormqr(
-    cusolverDnHandle_t handle, cublasSideMode_t side, cublasOperation_t trans,
-    int m, int n, int k, const float *A, int lda, const float *tau, float *C,
-    int ldc, float *work, int lwork, int *devInfo) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasSideMode_t, cublasOperation_t, int, int, int,
-      const float *, int, const float *, float *, int, float *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSormqr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, side, trans, m, n, k, A, lda, tau, C, ldc, work,
-                  lwork, devInfo);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDormqr(
-    cusolverDnHandle_t handle, cublasSideMode_t side, cublasOperation_t trans,
-    int m, int n, int k, const double *A, int lda, const double *tau, double *C,
-    int ldc, double *work, int lwork, int *devInfo) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasSideMode_t, cublasOperation_t, int, int, int,
-      const double *, int, const double *, double *, int, double *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDormqr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, side, trans, m, n, k, A, lda, tau, C, ldc, work,
-                  lwork, devInfo);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnCunmqr(
-    cusolverDnHandle_t handle, cublasSideMode_t side, cublasOperation_t trans,
-    int m, int n, int k, const cuComplex *A, int lda, const cuComplex *tau,
-    cuComplex *C, int ldc, cuComplex *work, int lwork, int *devInfo) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasSideMode_t, cublasOperation_t, int, int, int,
-      const cuComplex *, int, const cuComplex *, cuComplex *, int, cuComplex *,
-      int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnCunmqr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, side, trans, m, n, k, A, lda, tau, C, ldc, work,
-                  lwork, devInfo);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZunmqr(
-    cusolverDnHandle_t handle, cublasSideMode_t side, cublasOperation_t trans,
-    int m, int n, int k, const cuDoubleComplex *A, int lda,
-    const cuDoubleComplex *tau, cuDoubleComplex *C, int ldc,
-    cuDoubleComplex *work, int lwork, int *devInfo) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasSideMode_t, cublasOperation_t, int, int, int,
-      const cuDoubleComplex *, int, const cuDoubleComplex *, cuDoubleComplex *,
-      int, cuDoubleComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZunmqr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, side, trans, m, n, k, A, lda, tau, C, ldc, work,
-                  lwork, devInfo);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSsytrf_bufferSize(
-    cusolverDnHandle_t handle, int n, float *A, int lda, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, int,
-                                                  float *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSsytrf_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, A, lda, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDsytrf_bufferSize(
-    cusolverDnHandle_t handle, int n, double *A, int lda, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, int,
-                                                  double *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDsytrf_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, A, lda, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnCsytrf_bufferSize(
-    cusolverDnHandle_t handle, int n, cuComplex *A, int lda, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, int,
-                                                  cuComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnCsytrf_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, A, lda, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZsytrf_bufferSize(
-    cusolverDnHandle_t handle, int n, cuDoubleComplex *A, int lda, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, int, cuDoubleComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZsytrf_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, A, lda, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSsytrf(cusolverDnHandle_t handle,
-                                              cublasFillMode_t uplo, int n,
-                                              float *A, int lda, int *ipiv,
-                                              float *work, int lwork,
-                                              int *info) {
-  using FuncPtr =
-      cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, cublasFillMode_t, int,
-                                      float *, int, int *, float *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSsytrf");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, A, lda, ipiv, work, lwork, info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDsytrf(cusolverDnHandle_t handle,
-                                              cublasFillMode_t uplo, int n,
-                                              double *A, int lda, int *ipiv,
-                                              double *work, int lwork,
-                                              int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, double *, int, int *, double *,
-      int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDsytrf");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, A, lda, ipiv, work, lwork, info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnCsytrf(cusolverDnHandle_t handle,
-                                              cublasFillMode_t uplo, int n,
-                                              cuComplex *A, int lda, int *ipiv,
-                                              cuComplex *work, int lwork,
-                                              int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, cuComplex *, int, int *,
-      cuComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnCsytrf");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, A, lda, ipiv, work, lwork, info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZsytrf(cusolverDnHandle_t handle,
-                                              cublasFillMode_t uplo, int n,
-                                              cuDoubleComplex *A, int lda,
-                                              int *ipiv, cuDoubleComplex *work,
-                                              int lwork, int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, cuDoubleComplex *, int, int *,
-      cuDoubleComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZsytrf");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, A, lda, ipiv, work, lwork, info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSsytrs_bufferSize(
-    cusolverDnHandle_t handle, cublasFillMode_t uplo, int n, int nrhs,
-    const float *A, int lda, const int *ipiv, float *B, int ldb, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, int, const float *, int,
-      const int *, float *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSsytrs_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, nrhs, A, lda, ipiv, B, ldb, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDsytrs_bufferSize(
-    cusolverDnHandle_t handle, cublasFillMode_t uplo, int n, int nrhs,
-    const double *A, int lda, const int *ipiv, double *B, int ldb, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, int, const double *, int,
-      const int *, double *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDsytrs_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, nrhs, A, lda, ipiv, B, ldb, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnCsytrs_bufferSize(
-    cusolverDnHandle_t handle, cublasFillMode_t uplo, int n, int nrhs,
-    const cuComplex *A, int lda, const int *ipiv, cuComplex *B, int ldb,
-    int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, int, const cuComplex *, int,
-      const int *, cuComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnCsytrs_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, nrhs, A, lda, ipiv, B, ldb, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZsytrs_bufferSize(
-    cusolverDnHandle_t handle, cublasFillMode_t uplo, int n, int nrhs,
-    const cuDoubleComplex *A, int lda, const int *ipiv, cuDoubleComplex *B,
-    int ldb, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, int, const cuDoubleComplex *,
-      int, const int *, cuDoubleComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZsytrs_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, nrhs, A, lda, ipiv, B, ldb, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSsytrs(cusolverDnHandle_t handle,
-                                              cublasFillMode_t uplo, int n,
-                                              int nrhs, const float *A, int lda,
-                                              const int *ipiv, float *B,
-                                              int ldb, float *work, int lwork,
-                                              int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, int, const float *, int,
-      const int *, float *, int, float *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSsytrs");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, nrhs, A, lda, ipiv, B, ldb, work, lwork,
-                  info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDsytrs(cusolverDnHandle_t handle,
-                                              cublasFillMode_t uplo, int n,
-                                              int nrhs, const double *A,
-                                              int lda, const int *ipiv,
-                                              double *B, int ldb, double *work,
-                                              int lwork, int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, int, const double *, int,
-      const int *, double *, int, double *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDsytrs");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, nrhs, A, lda, ipiv, B, ldb, work, lwork,
-                  info);
-}
-
-cusolverStatus_t CUSOLVERAPI
-cusolverDnCsytrs(cusolverDnHandle_t handle, cublasFillMode_t uplo, int n,
-                 int nrhs, const cuComplex *A, int lda, const int *ipiv,
-                 cuComplex *B, int ldb, cuComplex *work, int lwork, int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, int, const cuComplex *, int,
-      const int *, cuComplex *, int, cuComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnCsytrs");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, nrhs, A, lda, ipiv, B, ldb, work, lwork,
-                  info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZsytrs(
-    cusolverDnHandle_t handle, cublasFillMode_t uplo, int n, int nrhs,
-    const cuDoubleComplex *A, int lda, const int *ipiv, cuDoubleComplex *B,
-    int ldb, cuDoubleComplex *work, int lwork, int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, int, const cuDoubleComplex *,
-      int, const int *, cuDoubleComplex *, int, cuDoubleComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZsytrs");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, nrhs, A, lda, ipiv, B, ldb, work, lwork,
-                  info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSsytri_bufferSize(
-    cusolverDnHandle_t handle, cublasFillMode_t uplo, int n, float *A, int lda,
-    const int *ipiv, int *lwork) {
-  using FuncPtr =
-      cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, cublasFillMode_t, int,
-                                      float *, int, const int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSsytri_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, A, lda, ipiv, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDsytri_bufferSize(
-    cusolverDnHandle_t handle, cublasFillMode_t uplo, int n, double *A, int lda,
-    const int *ipiv, int *lwork) {
-  using FuncPtr =
-      cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, cublasFillMode_t, int,
-                                      double *, int, const int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDsytri_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, A, lda, ipiv, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnCsytri_bufferSize(
-    cusolverDnHandle_t handle, cublasFillMode_t uplo, int n, cuComplex *A,
-    int lda, const int *ipiv, int *lwork) {
-  using FuncPtr =
-      cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, cublasFillMode_t, int,
-                                      cuComplex *, int, const int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnCsytri_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, A, lda, ipiv, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZsytri_bufferSize(
-    cusolverDnHandle_t handle, cublasFillMode_t uplo, int n, cuDoubleComplex *A,
-    int lda, const int *ipiv, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, cuDoubleComplex *, int,
-      const int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZsytri_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, A, lda, ipiv, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSsytri(cusolverDnHandle_t handle,
-                                              cublasFillMode_t uplo, int n,
-                                              float *A, int lda,
-                                              const int *ipiv, float *work,
-                                              int lwork, int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, float *, int, const int *,
-      float *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSsytri");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, A, lda, ipiv, work, lwork, info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDsytri(cusolverDnHandle_t handle,
-                                              cublasFillMode_t uplo, int n,
-                                              double *A, int lda,
-                                              const int *ipiv, double *work,
-                                              int lwork, int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, double *, int, const int *,
-      double *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDsytri");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, A, lda, ipiv, work, lwork, info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnCsytri(cusolverDnHandle_t handle,
-                                              cublasFillMode_t uplo, int n,
-                                              cuComplex *A, int lda,
-                                              const int *ipiv, cuComplex *work,
-                                              int lwork, int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, cuComplex *, int, const int *,
-      cuComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnCsytri");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, A, lda, ipiv, work, lwork, info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZsytri(
-    cusolverDnHandle_t handle, cublasFillMode_t uplo, int n, cuDoubleComplex *A,
-    int lda, const int *ipiv, cuDoubleComplex *work, int lwork, int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, cuDoubleComplex *, int,
-      const int *, cuDoubleComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZsytri");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, A, lda, ipiv, work, lwork, info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSgebrd_bufferSize(
-    cusolverDnHandle_t handle, int m, int n, int *Lwork) {
-  using FuncPtr =
-      cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, int, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSgebrd_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, Lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDgebrd_bufferSize(
-    cusolverDnHandle_t handle, int m, int n, int *Lwork) {
-  using FuncPtr =
-      cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, int, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDgebrd_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, Lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnCgebrd_bufferSize(
-    cusolverDnHandle_t handle, int m, int n, int *Lwork) {
-  using FuncPtr =
-      cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, int, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnCgebrd_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, Lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZgebrd_bufferSize(
-    cusolverDnHandle_t handle, int m, int n, int *Lwork) {
-  using FuncPtr =
-      cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, int, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZgebrd_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, Lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSgebrd(cusolverDnHandle_t handle, int m,
-                                              int n, float *A, int lda,
-                                              float *D, float *E, float *TAUQ,
-                                              float *TAUP, float *Work,
-                                              int Lwork, int *devInfo) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, int, int, float *, int, float *, float *, float *,
-      float *, float *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSgebrd");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, A, lda, D, E, TAUQ, TAUP, Work, Lwork, devInfo);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDgebrd(cusolverDnHandle_t handle, int m,
-                                              int n, double *A, int lda,
-                                              double *D, double *E,
-                                              double *TAUQ, double *TAUP,
-                                              double *Work, int Lwork,
-                                              int *devInfo) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, int, int, double *, int, double *, double *, double *,
-      double *, double *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDgebrd");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, A, lda, D, E, TAUQ, TAUP, Work, Lwork, devInfo);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnCgebrd(cusolverDnHandle_t handle, int m,
-                                              int n, cuComplex *A, int lda,
-                                              float *D, float *E,
-                                              cuComplex *TAUQ, cuComplex *TAUP,
-                                              cuComplex *Work, int Lwork,
-                                              int *devInfo) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, int, int, cuComplex *, int, float *, float *,
-      cuComplex *, cuComplex *, cuComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnCgebrd");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, A, lda, D, E, TAUQ, TAUP, Work, Lwork, devInfo);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZgebrd(
-    cusolverDnHandle_t handle, int m, int n, cuDoubleComplex *A, int lda,
-    double *D, double *E, cuDoubleComplex *TAUQ, cuDoubleComplex *TAUP,
-    cuDoubleComplex *Work, int Lwork, int *devInfo) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, int, int, cuDoubleComplex *, int, double *, double *,
-      cuDoubleComplex *, cuDoubleComplex *, cuDoubleComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZgebrd");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, A, lda, D, E, TAUQ, TAUP, Work, Lwork, devInfo);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSorgbr_bufferSize(
-    cusolverDnHandle_t handle, cublasSideMode_t side, int m, int n, int k,
-    const float *A, int lda, const float *tau, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasSideMode_t, int, int, int, const float *, int,
-      const float *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSorgbr_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, side, m, n, k, A, lda, tau, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDorgbr_bufferSize(
-    cusolverDnHandle_t handle, cublasSideMode_t side, int m, int n, int k,
-    const double *A, int lda, const double *tau, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasSideMode_t, int, int, int, const double *, int,
-      const double *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDorgbr_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, side, m, n, k, A, lda, tau, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnCungbr_bufferSize(
-    cusolverDnHandle_t handle, cublasSideMode_t side, int m, int n, int k,
-    const cuComplex *A, int lda, const cuComplex *tau, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasSideMode_t, int, int, int, const cuComplex *,
-      int, const cuComplex *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnCungbr_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, side, m, n, k, A, lda, tau, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZungbr_bufferSize(
-    cusolverDnHandle_t handle, cublasSideMode_t side, int m, int n, int k,
-    const cuDoubleComplex *A, int lda, const cuDoubleComplex *tau, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasSideMode_t, int, int, int,
-      const cuDoubleComplex *, int, const cuDoubleComplex *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZungbr_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, side, m, n, k, A, lda, tau, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSorgbr(cusolverDnHandle_t handle,
-                                              cublasSideMode_t side, int m,
-                                              int n, int k, float *A, int lda,
-                                              const float *tau, float *work,
-                                              int lwork, int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasSideMode_t, int, int, int, float *, int,
-      const float *, float *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSorgbr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, side, m, n, k, A, lda, tau, work, lwork, info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDorgbr(cusolverDnHandle_t handle,
-                                              cublasSideMode_t side, int m,
-                                              int n, int k, double *A, int lda,
-                                              const double *tau, double *work,
-                                              int lwork, int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasSideMode_t, int, int, int, double *, int,
-      const double *, double *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDorgbr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, side, m, n, k, A, lda, tau, work, lwork, info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnCungbr(cusolverDnHandle_t handle,
-                                              cublasSideMode_t side, int m,
-                                              int n, int k, cuComplex *A,
-                                              int lda, const cuComplex *tau,
-                                              cuComplex *work, int lwork,
-                                              int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasSideMode_t, int, int, int, cuComplex *, int,
-      const cuComplex *, cuComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnCungbr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, side, m, n, k, A, lda, tau, work, lwork, info);
-}
-
-cusolverStatus_t CUSOLVERAPI
-cusolverDnZungbr(cusolverDnHandle_t handle, cublasSideMode_t side, int m, int n,
-                 int k, cuDoubleComplex *A, int lda, const cuDoubleComplex *tau,
-                 cuDoubleComplex *work, int lwork, int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasSideMode_t, int, int, int, cuDoubleComplex *,
-      int, const cuDoubleComplex *, cuDoubleComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZungbr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, side, m, n, k, A, lda, tau, work, lwork, info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSsytrd_bufferSize(
-    cusolverDnHandle_t handle, cublasFillMode_t uplo, int n, const float *A,
-    int lda, const float *d, const float *e, const float *tau, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, const float *, int,
-      const float *, const float *, const float *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSsytrd_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, A, lda, d, e, tau, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDsytrd_bufferSize(
-    cusolverDnHandle_t handle, cublasFillMode_t uplo, int n, const double *A,
-    int lda, const double *d, const double *e, const double *tau, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, const double *, int,
-      const double *, const double *, const double *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDsytrd_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, A, lda, d, e, tau, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnChetrd_bufferSize(
-    cusolverDnHandle_t handle, cublasFillMode_t uplo, int n, const cuComplex *A,
-    int lda, const float *d, const float *e, const cuComplex *tau, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, const cuComplex *, int,
-      const float *, const float *, const cuComplex *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnChetrd_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, A, lda, d, e, tau, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZhetrd_bufferSize(
-    cusolverDnHandle_t handle, cublasFillMode_t uplo, int n,
-    const cuDoubleComplex *A, int lda, const double *d, const double *e,
-    const cuDoubleComplex *tau, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, const cuDoubleComplex *, int,
-      const double *, const double *, const cuDoubleComplex *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZhetrd_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, A, lda, d, e, tau, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSsytrd(cusolverDnHandle_t handle,
-                                              cublasFillMode_t uplo, int n,
-                                              float *A, int lda, float *d,
-                                              float *e, float *tau, float *work,
-                                              int lwork, int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, float *, int, float *, float *,
-      float *, float *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSsytrd");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, A, lda, d, e, tau, work, lwork, info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDsytrd(
-    cusolverDnHandle_t handle, cublasFillMode_t uplo, int n, double *A, int lda,
-    double *d, double *e, double *tau, double *work, int lwork, int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, double *, int, double *,
-      double *, double *, double *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDsytrd");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, A, lda, d, e, tau, work, lwork, info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnChetrd(cusolverDnHandle_t handle,
-                                              cublasFillMode_t uplo, int n,
-                                              cuComplex *A, int lda, float *d,
-                                              float *e, cuComplex *tau,
-                                              cuComplex *work, int lwork,
-                                              int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, cuComplex *, int, float *,
-      float *, cuComplex *, cuComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnChetrd");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, A, lda, d, e, tau, work, lwork, info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZhetrd(
-    cusolverDnHandle_t handle, cublasFillMode_t uplo, int n, cuDoubleComplex *A,
-    int lda, double *d, double *e, cuDoubleComplex *tau, cuDoubleComplex *work,
-    int lwork, int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, cuDoubleComplex *, int,
-      double *, double *, cuDoubleComplex *, cuDoubleComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZhetrd");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, A, lda, d, e, tau, work, lwork, info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSorgtr_bufferSize(
-    cusolverDnHandle_t handle, cublasFillMode_t uplo, int n, const float *A,
-    int lda, const float *tau, int *lwork) {
-  using FuncPtr =
-      cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, cublasFillMode_t, int,
-                                      const float *, int, const float *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSorgtr_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, A, lda, tau, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDorgtr_bufferSize(
-    cusolverDnHandle_t handle, cublasFillMode_t uplo, int n, const double *A,
-    int lda, const double *tau, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, const double *, int,
-      const double *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDorgtr_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, A, lda, tau, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnCungtr_bufferSize(
-    cusolverDnHandle_t handle, cublasFillMode_t uplo, int n, const cuComplex *A,
-    int lda, const cuComplex *tau, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, const cuComplex *, int,
-      const cuComplex *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnCungtr_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, A, lda, tau, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZungtr_bufferSize(
-    cusolverDnHandle_t handle, cublasFillMode_t uplo, int n,
-    const cuDoubleComplex *A, int lda, const cuDoubleComplex *tau, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, const cuDoubleComplex *, int,
-      const cuDoubleComplex *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZungtr_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, A, lda, tau, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSorgtr(cusolverDnHandle_t handle,
-                                              cublasFillMode_t uplo, int n,
-                                              float *A, int lda,
-                                              const float *tau, float *work,
-                                              int lwork, int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, float *, int, const float *,
-      float *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSorgtr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, A, lda, tau, work, lwork, info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDorgtr(cusolverDnHandle_t handle,
-                                              cublasFillMode_t uplo, int n,
-                                              double *A, int lda,
-                                              const double *tau, double *work,
-                                              int lwork, int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, double *, int, const double *,
-      double *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDorgtr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, A, lda, tau, work, lwork, info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnCungtr(
-    cusolverDnHandle_t handle, cublasFillMode_t uplo, int n, cuComplex *A,
-    int lda, const cuComplex *tau, cuComplex *work, int lwork, int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, cuComplex *, int,
-      const cuComplex *, cuComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnCungtr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, A, lda, tau, work, lwork, info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZungtr(cusolverDnHandle_t handle,
-                                              cublasFillMode_t uplo, int n,
-                                              cuDoubleComplex *A, int lda,
-                                              const cuDoubleComplex *tau,
-                                              cuDoubleComplex *work, int lwork,
-                                              int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasFillMode_t, int, cuDoubleComplex *, int,
-      const cuDoubleComplex *, cuDoubleComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZungtr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, uplo, n, A, lda, tau, work, lwork, info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSormtr_bufferSize(
-    cusolverDnHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo,
-    cublasOperation_t trans, int m, int n, const float *A, int lda,
-    const float *tau, const float *C, int ldc, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasSideMode_t, cublasFillMode_t, cublasOperation_t,
-      int, int, const float *, int, const float *, const float *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSormtr_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, side, uplo, trans, m, n, A, lda, tau, C, ldc, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDormtr_bufferSize(
-    cusolverDnHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo,
-    cublasOperation_t trans, int m, int n, const double *A, int lda,
-    const double *tau, const double *C, int ldc, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasSideMode_t, cublasFillMode_t, cublasOperation_t,
-      int, int, const double *, int, const double *, const double *, int,
-      int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDormtr_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, side, uplo, trans, m, n, A, lda, tau, C, ldc, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnCunmtr_bufferSize(
-    cusolverDnHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo,
-    cublasOperation_t trans, int m, int n, const cuComplex *A, int lda,
-    const cuComplex *tau, const cuComplex *C, int ldc, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasSideMode_t, cublasFillMode_t, cublasOperation_t,
-      int, int, const cuComplex *, int, const cuComplex *, const cuComplex *,
-      int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnCunmtr_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, side, uplo, trans, m, n, A, lda, tau, C, ldc, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZunmtr_bufferSize(
-    cusolverDnHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo,
-    cublasOperation_t trans, int m, int n, const cuDoubleComplex *A, int lda,
-    const cuDoubleComplex *tau, const cuDoubleComplex *C, int ldc, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasSideMode_t, cublasFillMode_t, cublasOperation_t,
-      int, int, const cuDoubleComplex *, int, const cuDoubleComplex *,
-      const cuDoubleComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZunmtr_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, side, uplo, trans, m, n, A, lda, tau, C, ldc, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSormtr(
-    cusolverDnHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo,
-    cublasOperation_t trans, int m, int n, float *A, int lda, float *tau,
-    float *C, int ldc, float *work, int lwork, int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasSideMode_t, cublasFillMode_t, cublasOperation_t,
-      int, int, float *, int, float *, float *, int, float *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSormtr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, side, uplo, trans, m, n, A, lda, tau, C, ldc, work,
-                  lwork, info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDormtr(
-    cusolverDnHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo,
-    cublasOperation_t trans, int m, int n, double *A, int lda, double *tau,
-    double *C, int ldc, double *work, int lwork, int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasSideMode_t, cublasFillMode_t, cublasOperation_t,
-      int, int, double *, int, double *, double *, int, double *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDormtr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, side, uplo, trans, m, n, A, lda, tau, C, ldc, work,
-                  lwork, info);
-}
-
-cusolverStatus_t CUSOLVERAPI
-cusolverDnCunmtr(cusolverDnHandle_t handle, cublasSideMode_t side,
-                 cublasFillMode_t uplo, cublasOperation_t trans, int m, int n,
-                 cuComplex *A, int lda, cuComplex *tau, cuComplex *C, int ldc,
-                 cuComplex *work, int lwork, int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasSideMode_t, cublasFillMode_t, cublasOperation_t,
-      int, int, cuComplex *, int, cuComplex *, cuComplex *, int, cuComplex *,
-      int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnCunmtr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, side, uplo, trans, m, n, A, lda, tau, C, ldc, work,
-                  lwork, info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZunmtr(
-    cusolverDnHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo,
-    cublasOperation_t trans, int m, int n, cuDoubleComplex *A, int lda,
-    cuDoubleComplex *tau, cuDoubleComplex *C, int ldc, cuDoubleComplex *work,
-    int lwork, int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cublasSideMode_t, cublasFillMode_t, cublasOperation_t,
-      int, int, cuDoubleComplex *, int, cuDoubleComplex *, cuDoubleComplex *,
-      int, cuDoubleComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZunmtr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, side, uplo, trans, m, n, A, lda, tau, C, ldc, work,
-                  lwork, info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSgesvd_bufferSize(
-    cusolverDnHandle_t handle, int m, int n, int *lwork) {
-  using FuncPtr =
-      cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, int, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSgesvd_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDgesvd_bufferSize(
-    cusolverDnHandle_t handle, int m, int n, int *lwork) {
-  using FuncPtr =
-      cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, int, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDgesvd_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnCgesvd_bufferSize(
-    cusolverDnHandle_t handle, int m, int n, int *lwork) {
-  using FuncPtr =
-      cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, int, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnCgesvd_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZgesvd_bufferSize(
-    cusolverDnHandle_t handle, int m, int n, int *lwork) {
-  using FuncPtr =
-      cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, int, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZgesvd_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSgesvd(
-    cusolverDnHandle_t handle, signed char jobu, signed char jobvt, int m,
-    int n, float *A, int lda, float *S, float *U, int ldu, float *VT, int ldvt,
-    float *work, int lwork, float *rwork, int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, signed char, signed char, int, int, float *, int,
-      float *, float *, int, float *, int, float *, int, float *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSgesvd");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobu, jobvt, m, n, A, lda, S, U, ldu, VT, ldvt, work,
-                  lwork, rwork, info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDgesvd(
-    cusolverDnHandle_t handle, signed char jobu, signed char jobvt, int m,
-    int n, double *A, int lda, double *S, double *U, int ldu, double *VT,
-    int ldvt, double *work, int lwork, double *rwork, int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, signed char, signed char, int, int, double *, int,
-      double *, double *, int, double *, int, double *, int, double *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDgesvd");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobu, jobvt, m, n, A, lda, S, U, ldu, VT, ldvt, work,
-                  lwork, rwork, info);
-}
-
-cusolverStatus_t CUSOLVERAPI
-cusolverDnCgesvd(cusolverDnHandle_t handle, signed char jobu, signed char jobvt,
-                 int m, int n, cuComplex *A, int lda, float *S, cuComplex *U,
-                 int ldu, cuComplex *VT, int ldvt, cuComplex *work, int lwork,
-                 float *rwork, int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, signed char, signed char, int, int, cuComplex *, int,
-      float *, cuComplex *, int, cuComplex *, int, cuComplex *, int, float *,
-      int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnCgesvd");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobu, jobvt, m, n, A, lda, S, U, ldu, VT, ldvt, work,
-                  lwork, rwork, info);
-}
-
-cusolverStatus_t CUSOLVERAPI
-cusolverDnZgesvd(cusolverDnHandle_t handle, signed char jobu, signed char jobvt,
-                 int m, int n, cuDoubleComplex *A, int lda, double *S,
-                 cuDoubleComplex *U, int ldu, cuDoubleComplex *VT, int ldvt,
-                 cuDoubleComplex *work, int lwork, double *rwork, int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, signed char, signed char, int, int, cuDoubleComplex *,
-      int, double *, cuDoubleComplex *, int, cuDoubleComplex *, int,
-      cuDoubleComplex *, int, double *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZgesvd");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobu, jobvt, m, n, A, lda, S, U, ldu, VT, ldvt, work,
-                  lwork, rwork, info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSsyevd_bufferSize(
-    cusolverDnHandle_t handle, cusolverEigMode_t jobz, cublasFillMode_t uplo,
-    int n, const float *A, int lda, const float *W, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, cublasFillMode_t, int,
-      const float *, int, const float *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSsyevd_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, uplo, n, A, lda, W, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDsyevd_bufferSize(
-    cusolverDnHandle_t handle, cusolverEigMode_t jobz, cublasFillMode_t uplo,
-    int n, const double *A, int lda, const double *W, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, cublasFillMode_t, int,
-      const double *, int, const double *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDsyevd_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, uplo, n, A, lda, W, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnCheevd_bufferSize(
-    cusolverDnHandle_t handle, cusolverEigMode_t jobz, cublasFillMode_t uplo,
-    int n, const cuComplex *A, int lda, const float *W, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, cublasFillMode_t, int,
-      const cuComplex *, int, const float *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnCheevd_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, uplo, n, A, lda, W, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZheevd_bufferSize(
-    cusolverDnHandle_t handle, cusolverEigMode_t jobz, cublasFillMode_t uplo,
-    int n, const cuDoubleComplex *A, int lda, const double *W, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, cublasFillMode_t, int,
-      const cuDoubleComplex *, int, const double *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZheevd_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, uplo, n, A, lda, W, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSsyevd(
-    cusolverDnHandle_t handle, cusolverEigMode_t jobz, cublasFillMode_t uplo,
-    int n, float *A, int lda, float *W, float *work, int lwork, int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, cublasFillMode_t, int, float *,
-      int, float *, float *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSsyevd");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, uplo, n, A, lda, W, work, lwork, info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDsyevd(
-    cusolverDnHandle_t handle, cusolverEigMode_t jobz, cublasFillMode_t uplo,
-    int n, double *A, int lda, double *W, double *work, int lwork, int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, cublasFillMode_t, int, double *,
-      int, double *, double *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDsyevd");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, uplo, n, A, lda, W, work, lwork, info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnCheevd(cusolverDnHandle_t handle,
-                                              cusolverEigMode_t jobz,
-                                              cublasFillMode_t uplo, int n,
-                                              cuComplex *A, int lda, float *W,
-                                              cuComplex *work, int lwork,
-                                              int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, cublasFillMode_t, int, cuComplex *,
-      int, float *, cuComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnCheevd");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, uplo, n, A, lda, W, work, lwork, info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZheevd(cusolverDnHandle_t handle,
-                                              cusolverEigMode_t jobz,
-                                              cublasFillMode_t uplo, int n,
-                                              cuDoubleComplex *A, int lda,
-                                              double *W, cuDoubleComplex *work,
-                                              int lwork, int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, cublasFillMode_t, int,
-      cuDoubleComplex *, int, double *, cuDoubleComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZheevd");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, uplo, n, A, lda, W, work, lwork, info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSsyevdx_bufferSize(
-    cusolverDnHandle_t handle, cusolverEigMode_t jobz, cusolverEigRange_t range,
-    cublasFillMode_t uplo, int n, const float *A, int lda, float vl, float vu,
-    int il, int iu, int *meig, const float *W, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, cusolverEigRange_t,
-      cublasFillMode_t, int, const float *, int, float, float, int, int, int *,
-      const float *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSsyevdx_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, range, uplo, n, A, lda, vl, vu, il, iu, meig, W,
-                  lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDsyevdx_bufferSize(
-    cusolverDnHandle_t handle, cusolverEigMode_t jobz, cusolverEigRange_t range,
-    cublasFillMode_t uplo, int n, const double *A, int lda, double vl,
-    double vu, int il, int iu, int *meig, const double *W, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, cusolverEigRange_t,
-      cublasFillMode_t, int, const double *, int, double, double, int, int,
-      int *, const double *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDsyevdx_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, range, uplo, n, A, lda, vl, vu, il, iu, meig, W,
-                  lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnCheevdx_bufferSize(
-    cusolverDnHandle_t handle, cusolverEigMode_t jobz, cusolverEigRange_t range,
-    cublasFillMode_t uplo, int n, const cuComplex *A, int lda, float vl,
-    float vu, int il, int iu, int *meig, const float *W, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, cusolverEigRange_t,
-      cublasFillMode_t, int, const cuComplex *, int, float, float, int, int,
-      int *, const float *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnCheevdx_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, range, uplo, n, A, lda, vl, vu, il, iu, meig, W,
-                  lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZheevdx_bufferSize(
-    cusolverDnHandle_t handle, cusolverEigMode_t jobz, cusolverEigRange_t range,
-    cublasFillMode_t uplo, int n, const cuDoubleComplex *A, int lda, double vl,
-    double vu, int il, int iu, int *meig, const double *W, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, cusolverEigRange_t,
-      cublasFillMode_t, int, const cuDoubleComplex *, int, double, double, int,
-      int, int *, const double *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZheevdx_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, range, uplo, n, A, lda, vl, vu, il, iu, meig, W,
-                  lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSsyevdx(
-    cusolverDnHandle_t handle, cusolverEigMode_t jobz, cusolverEigRange_t range,
-    cublasFillMode_t uplo, int n, float *A, int lda, float vl, float vu, int il,
-    int iu, int *meig, float *W, float *work, int lwork, int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, cusolverEigRange_t,
-      cublasFillMode_t, int, float *, int, float, float, int, int, int *,
-      float *, float *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSsyevdx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, range, uplo, n, A, lda, vl, vu, il, iu, meig, W,
-                  work, lwork, info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDsyevdx(
-    cusolverDnHandle_t handle, cusolverEigMode_t jobz, cusolverEigRange_t range,
-    cublasFillMode_t uplo, int n, double *A, int lda, double vl, double vu,
-    int il, int iu, int *meig, double *W, double *work, int lwork, int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, cusolverEigRange_t,
-      cublasFillMode_t, int, double *, int, double, double, int, int, int *,
-      double *, double *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDsyevdx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, range, uplo, n, A, lda, vl, vu, il, iu, meig, W,
-                  work, lwork, info);
-}
-
-cusolverStatus_t CUSOLVERAPI
-cusolverDnCheevdx(cusolverDnHandle_t handle, cusolverEigMode_t jobz,
-                  cusolverEigRange_t range, cublasFillMode_t uplo, int n,
-                  cuComplex *A, int lda, float vl, float vu, int il, int iu,
-                  int *meig, float *W, cuComplex *work, int lwork, int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, cusolverEigRange_t,
-      cublasFillMode_t, int, cuComplex *, int, float, float, int, int, int *,
-      float *, cuComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnCheevdx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, range, uplo, n, A, lda, vl, vu, il, iu, meig, W,
-                  work, lwork, info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZheevdx(
-    cusolverDnHandle_t handle, cusolverEigMode_t jobz, cusolverEigRange_t range,
-    cublasFillMode_t uplo, int n, cuDoubleComplex *A, int lda, double vl,
-    double vu, int il, int iu, int *meig, double *W, cuDoubleComplex *work,
-    int lwork, int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, cusolverEigRange_t,
-      cublasFillMode_t, int, cuDoubleComplex *, int, double, double, int, int,
-      int *, double *, cuDoubleComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZheevdx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, range, uplo, n, A, lda, vl, vu, il, iu, meig, W,
-                  work, lwork, info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSsygvdx_bufferSize(
-    cusolverDnHandle_t handle, cusolverEigType_t itype, cusolverEigMode_t jobz,
-    cusolverEigRange_t range, cublasFillMode_t uplo, int n, const float *A,
-    int lda, const float *B, int ldb, float vl, float vu, int il, int iu,
-    int *meig, const float *W, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigType_t, cusolverEigMode_t,
-      cusolverEigRange_t, cublasFillMode_t, int, const float *, int,
-      const float *, int, float, float, int, int, int *, const float *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSsygvdx_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, itype, jobz, range, uplo, n, A, lda, B, ldb, vl, vu,
-                  il, iu, meig, W, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDsygvdx_bufferSize(
-    cusolverDnHandle_t handle, cusolverEigType_t itype, cusolverEigMode_t jobz,
-    cusolverEigRange_t range, cublasFillMode_t uplo, int n, const double *A,
-    int lda, const double *B, int ldb, double vl, double vu, int il, int iu,
-    int *meig, const double *W, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigType_t, cusolverEigMode_t,
-      cusolverEigRange_t, cublasFillMode_t, int, const double *, int,
-      const double *, int, double, double, int, int, int *, const double *,
-      int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDsygvdx_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, itype, jobz, range, uplo, n, A, lda, B, ldb, vl, vu,
-                  il, iu, meig, W, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnChegvdx_bufferSize(
-    cusolverDnHandle_t handle, cusolverEigType_t itype, cusolverEigMode_t jobz,
-    cusolverEigRange_t range, cublasFillMode_t uplo, int n, const cuComplex *A,
-    int lda, const cuComplex *B, int ldb, float vl, float vu, int il, int iu,
-    int *meig, const float *W, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigType_t, cusolverEigMode_t,
-      cusolverEigRange_t, cublasFillMode_t, int, const cuComplex *, int,
-      const cuComplex *, int, float, float, int, int, int *, const float *,
-      int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnChegvdx_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, itype, jobz, range, uplo, n, A, lda, B, ldb, vl, vu,
-                  il, iu, meig, W, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZhegvdx_bufferSize(
-    cusolverDnHandle_t handle, cusolverEigType_t itype, cusolverEigMode_t jobz,
-    cusolverEigRange_t range, cublasFillMode_t uplo, int n,
-    const cuDoubleComplex *A, int lda, const cuDoubleComplex *B, int ldb,
-    double vl, double vu, int il, int iu, int *meig, const double *W,
-    int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigType_t, cusolverEigMode_t,
-      cusolverEigRange_t, cublasFillMode_t, int, const cuDoubleComplex *, int,
-      const cuDoubleComplex *, int, double, double, int, int, int *,
-      const double *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZhegvdx_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, itype, jobz, range, uplo, n, A, lda, B, ldb, vl, vu,
-                  il, iu, meig, W, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSsygvdx(
-    cusolverDnHandle_t handle, cusolverEigType_t itype, cusolverEigMode_t jobz,
-    cusolverEigRange_t range, cublasFillMode_t uplo, int n, float *A, int lda,
-    float *B, int ldb, float vl, float vu, int il, int iu, int *meig, float *W,
-    float *work, int lwork, int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigType_t, cusolverEigMode_t,
-      cusolverEigRange_t, cublasFillMode_t, int, float *, int, float *, int,
-      float, float, int, int, int *, float *, float *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSsygvdx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, itype, jobz, range, uplo, n, A, lda, B, ldb, vl, vu,
-                  il, iu, meig, W, work, lwork, info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDsygvdx(
-    cusolverDnHandle_t handle, cusolverEigType_t itype, cusolverEigMode_t jobz,
-    cusolverEigRange_t range, cublasFillMode_t uplo, int n, double *A, int lda,
-    double *B, int ldb, double vl, double vu, int il, int iu, int *meig,
-    double *W, double *work, int lwork, int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigType_t, cusolverEigMode_t,
-      cusolverEigRange_t, cublasFillMode_t, int, double *, int, double *, int,
-      double, double, int, int, int *, double *, double *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDsygvdx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, itype, jobz, range, uplo, n, A, lda, B, ldb, vl, vu,
-                  il, iu, meig, W, work, lwork, info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnChegvdx(
-    cusolverDnHandle_t handle, cusolverEigType_t itype, cusolverEigMode_t jobz,
-    cusolverEigRange_t range, cublasFillMode_t uplo, int n, cuComplex *A,
-    int lda, cuComplex *B, int ldb, float vl, float vu, int il, int iu,
-    int *meig, float *W, cuComplex *work, int lwork, int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigType_t, cusolverEigMode_t,
-      cusolverEigRange_t, cublasFillMode_t, int, cuComplex *, int, cuComplex *,
-      int, float, float, int, int, int *, float *, cuComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnChegvdx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, itype, jobz, range, uplo, n, A, lda, B, ldb, vl, vu,
-                  il, iu, meig, W, work, lwork, info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZhegvdx(
-    cusolverDnHandle_t handle, cusolverEigType_t itype, cusolverEigMode_t jobz,
-    cusolverEigRange_t range, cublasFillMode_t uplo, int n, cuDoubleComplex *A,
-    int lda, cuDoubleComplex *B, int ldb, double vl, double vu, int il, int iu,
-    int *meig, double *W, cuDoubleComplex *work, int lwork, int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigType_t, cusolverEigMode_t,
-      cusolverEigRange_t, cublasFillMode_t, int, cuDoubleComplex *, int,
-      cuDoubleComplex *, int, double, double, int, int, int *, double *,
-      cuDoubleComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZhegvdx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, itype, jobz, range, uplo, n, A, lda, B, ldb, vl, vu,
-                  il, iu, meig, W, work, lwork, info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSsygvd_bufferSize(
-    cusolverDnHandle_t handle, cusolverEigType_t itype, cusolverEigMode_t jobz,
-    cublasFillMode_t uplo, int n, const float *A, int lda, const float *B,
-    int ldb, const float *W, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigType_t, cusolverEigMode_t,
-      cublasFillMode_t, int, const float *, int, const float *, int,
-      const float *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSsygvd_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, itype, jobz, uplo, n, A, lda, B, ldb, W, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDsygvd_bufferSize(
-    cusolverDnHandle_t handle, cusolverEigType_t itype, cusolverEigMode_t jobz,
-    cublasFillMode_t uplo, int n, const double *A, int lda, const double *B,
-    int ldb, const double *W, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigType_t, cusolverEigMode_t,
-      cublasFillMode_t, int, const double *, int, const double *, int,
-      const double *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDsygvd_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, itype, jobz, uplo, n, A, lda, B, ldb, W, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnChegvd_bufferSize(
-    cusolverDnHandle_t handle, cusolverEigType_t itype, cusolverEigMode_t jobz,
-    cublasFillMode_t uplo, int n, const cuComplex *A, int lda,
-    const cuComplex *B, int ldb, const float *W, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigType_t, cusolverEigMode_t,
-      cublasFillMode_t, int, const cuComplex *, int, const cuComplex *, int,
-      const float *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnChegvd_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, itype, jobz, uplo, n, A, lda, B, ldb, W, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZhegvd_bufferSize(
-    cusolverDnHandle_t handle, cusolverEigType_t itype, cusolverEigMode_t jobz,
-    cublasFillMode_t uplo, int n, const cuDoubleComplex *A, int lda,
-    const cuDoubleComplex *B, int ldb, const double *W, int *lwork) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigType_t, cusolverEigMode_t,
-      cublasFillMode_t, int, const cuDoubleComplex *, int,
-      const cuDoubleComplex *, int, const double *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZhegvd_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, itype, jobz, uplo, n, A, lda, B, ldb, W, lwork);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSsygvd(
-    cusolverDnHandle_t handle, cusolverEigType_t itype, cusolverEigMode_t jobz,
-    cublasFillMode_t uplo, int n, float *A, int lda, float *B, int ldb,
-    float *W, float *work, int lwork, int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigType_t, cusolverEigMode_t,
-      cublasFillMode_t, int, float *, int, float *, int, float *, float *, int,
-      int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSsygvd");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, itype, jobz, uplo, n, A, lda, B, ldb, W, work, lwork,
-                  info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDsygvd(
-    cusolverDnHandle_t handle, cusolverEigType_t itype, cusolverEigMode_t jobz,
-    cublasFillMode_t uplo, int n, double *A, int lda, double *B, int ldb,
-    double *W, double *work, int lwork, int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigType_t, cusolverEigMode_t,
-      cublasFillMode_t, int, double *, int, double *, int, double *, double *,
-      int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDsygvd");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, itype, jobz, uplo, n, A, lda, B, ldb, W, work, lwork,
-                  info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnChegvd(
-    cusolverDnHandle_t handle, cusolverEigType_t itype, cusolverEigMode_t jobz,
-    cublasFillMode_t uplo, int n, cuComplex *A, int lda, cuComplex *B, int ldb,
-    float *W, cuComplex *work, int lwork, int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigType_t, cusolverEigMode_t,
-      cublasFillMode_t, int, cuComplex *, int, cuComplex *, int, float *,
-      cuComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnChegvd");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, itype, jobz, uplo, n, A, lda, B, ldb, W, work, lwork,
-                  info);
-}
-
-cusolverStatus_t CUSOLVERAPI
-cusolverDnZhegvd(cusolverDnHandle_t handle, cusolverEigType_t itype,
-                 cusolverEigMode_t jobz, cublasFillMode_t uplo, int n,
-                 cuDoubleComplex *A, int lda, cuDoubleComplex *B, int ldb,
-                 double *W, cuDoubleComplex *work, int lwork, int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigType_t, cusolverEigMode_t,
-      cublasFillMode_t, int, cuDoubleComplex *, int, cuDoubleComplex *, int,
-      double *, cuDoubleComplex *, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZhegvd");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, itype, jobz, uplo, n, A, lda, B, ldb, W, work, lwork,
-                  info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnCreateSyevjInfo(syevjInfo_t *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(syevjInfo_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnCreateSyevjInfo");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDestroySyevjInfo(syevjInfo_t info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(syevjInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDestroySyevjInfo");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnXsyevjSetTolerance(syevjInfo_t info,
-                                                          double tolerance) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(syevjInfo_t, double);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnXsyevjSetTolerance");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(info, tolerance);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnXsyevjSetMaxSweeps(syevjInfo_t info,
-                                                          int max_sweeps) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(syevjInfo_t, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnXsyevjSetMaxSweeps");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(info, max_sweeps);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnXsyevjSetSortEig(syevjInfo_t info,
-                                                        int sort_eig) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(syevjInfo_t, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnXsyevjSetSortEig");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(info, sort_eig);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnXsyevjGetResidual(
-    cusolverDnHandle_t handle, syevjInfo_t info, double *residual) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t,
-                                                  syevjInfo_t, double *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnXsyevjGetResidual");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, info, residual);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnXsyevjGetSweeps(
-    cusolverDnHandle_t handle, syevjInfo_t info, int *executed_sweeps) {
-  using FuncPtr =
-      cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, syevjInfo_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnXsyevjGetSweeps");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, info, executed_sweeps);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSsyevjBatched_bufferSize(
-    cusolverDnHandle_t handle, cusolverEigMode_t jobz, cublasFillMode_t uplo,
-    int n, const float *A, int lda, const float *W, int *lwork,
-    syevjInfo_t params, int batchSize) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, cublasFillMode_t, int,
-      const float *, int, const float *, int *, syevjInfo_t, int);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusolverDnSsyevjBatched_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, uplo, n, A, lda, W, lwork, params, batchSize);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDsyevjBatched_bufferSize(
-    cusolverDnHandle_t handle, cusolverEigMode_t jobz, cublasFillMode_t uplo,
-    int n, const double *A, int lda, const double *W, int *lwork,
-    syevjInfo_t params, int batchSize) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, cublasFillMode_t, int,
-      const double *, int, const double *, int *, syevjInfo_t, int);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusolverDnDsyevjBatched_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, uplo, n, A, lda, W, lwork, params, batchSize);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnCheevjBatched_bufferSize(
-    cusolverDnHandle_t handle, cusolverEigMode_t jobz, cublasFillMode_t uplo,
-    int n, const cuComplex *A, int lda, const float *W, int *lwork,
-    syevjInfo_t params, int batchSize) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, cublasFillMode_t, int,
-      const cuComplex *, int, const float *, int *, syevjInfo_t, int);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusolverDnCheevjBatched_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, uplo, n, A, lda, W, lwork, params, batchSize);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZheevjBatched_bufferSize(
-    cusolverDnHandle_t handle, cusolverEigMode_t jobz, cublasFillMode_t uplo,
-    int n, const cuDoubleComplex *A, int lda, const double *W, int *lwork,
-    syevjInfo_t params, int batchSize) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, cublasFillMode_t, int,
-      const cuDoubleComplex *, int, const double *, int *, syevjInfo_t, int);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusolverDnZheevjBatched_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, uplo, n, A, lda, W, lwork, params, batchSize);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSsyevjBatched(
-    cusolverDnHandle_t handle, cusolverEigMode_t jobz, cublasFillMode_t uplo,
-    int n, float *A, int lda, float *W, float *work, int lwork, int *info,
-    syevjInfo_t params, int batchSize) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, cublasFillMode_t, int, float *,
-      int, float *, float *, int, int *, syevjInfo_t, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSsyevjBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, uplo, n, A, lda, W, work, lwork, info, params,
-                  batchSize);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDsyevjBatched(
-    cusolverDnHandle_t handle, cusolverEigMode_t jobz, cublasFillMode_t uplo,
-    int n, double *A, int lda, double *W, double *work, int lwork, int *info,
-    syevjInfo_t params, int batchSize) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, cublasFillMode_t, int, double *,
-      int, double *, double *, int, int *, syevjInfo_t, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDsyevjBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, uplo, n, A, lda, W, work, lwork, info, params,
-                  batchSize);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnCheevjBatched(
-    cusolverDnHandle_t handle, cusolverEigMode_t jobz, cublasFillMode_t uplo,
-    int n, cuComplex *A, int lda, float *W, cuComplex *work, int lwork,
-    int *info, syevjInfo_t params, int batchSize) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, cublasFillMode_t, int, cuComplex *,
-      int, float *, cuComplex *, int, int *, syevjInfo_t, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnCheevjBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, uplo, n, A, lda, W, work, lwork, info, params,
-                  batchSize);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZheevjBatched(
-    cusolverDnHandle_t handle, cusolverEigMode_t jobz, cublasFillMode_t uplo,
-    int n, cuDoubleComplex *A, int lda, double *W, cuDoubleComplex *work,
-    int lwork, int *info, syevjInfo_t params, int batchSize) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, cublasFillMode_t, int,
-      cuDoubleComplex *, int, double *, cuDoubleComplex *, int, int *,
-      syevjInfo_t, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZheevjBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, uplo, n, A, lda, W, work, lwork, info, params,
-                  batchSize);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSsyevj_bufferSize(
-    cusolverDnHandle_t handle, cusolverEigMode_t jobz, cublasFillMode_t uplo,
-    int n, const float *A, int lda, const float *W, int *lwork,
-    syevjInfo_t params) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, cublasFillMode_t, int,
-      const float *, int, const float *, int *, syevjInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSsyevj_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, uplo, n, A, lda, W, lwork, params);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDsyevj_bufferSize(
-    cusolverDnHandle_t handle, cusolverEigMode_t jobz, cublasFillMode_t uplo,
-    int n, const double *A, int lda, const double *W, int *lwork,
-    syevjInfo_t params) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, cublasFillMode_t, int,
-      const double *, int, const double *, int *, syevjInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDsyevj_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, uplo, n, A, lda, W, lwork, params);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnCheevj_bufferSize(
-    cusolverDnHandle_t handle, cusolverEigMode_t jobz, cublasFillMode_t uplo,
-    int n, const cuComplex *A, int lda, const float *W, int *lwork,
-    syevjInfo_t params) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, cublasFillMode_t, int,
-      const cuComplex *, int, const float *, int *, syevjInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnCheevj_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, uplo, n, A, lda, W, lwork, params);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZheevj_bufferSize(
-    cusolverDnHandle_t handle, cusolverEigMode_t jobz, cublasFillMode_t uplo,
-    int n, const cuDoubleComplex *A, int lda, const double *W, int *lwork,
-    syevjInfo_t params) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, cublasFillMode_t, int,
-      const cuDoubleComplex *, int, const double *, int *, syevjInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZheevj_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, uplo, n, A, lda, W, lwork, params);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSsyevj(cusolverDnHandle_t handle,
-                                              cusolverEigMode_t jobz,
-                                              cublasFillMode_t uplo, int n,
-                                              float *A, int lda, float *W,
-                                              float *work, int lwork, int *info,
-                                              syevjInfo_t params) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, cublasFillMode_t, int, float *,
-      int, float *, float *, int, int *, syevjInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSsyevj");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, uplo, n, A, lda, W, work, lwork, info, params);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDsyevj(cusolverDnHandle_t handle,
-                                              cusolverEigMode_t jobz,
-                                              cublasFillMode_t uplo, int n,
-                                              double *A, int lda, double *W,
-                                              double *work, int lwork,
-                                              int *info, syevjInfo_t params) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, cublasFillMode_t, int, double *,
-      int, double *, double *, int, int *, syevjInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDsyevj");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, uplo, n, A, lda, W, work, lwork, info, params);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnCheevj(cusolverDnHandle_t handle,
-                                              cusolverEigMode_t jobz,
-                                              cublasFillMode_t uplo, int n,
-                                              cuComplex *A, int lda, float *W,
-                                              cuComplex *work, int lwork,
-                                              int *info, syevjInfo_t params) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, cublasFillMode_t, int, cuComplex *,
-      int, float *, cuComplex *, int, int *, syevjInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnCheevj");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, uplo, n, A, lda, W, work, lwork, info, params);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZheevj(
-    cusolverDnHandle_t handle, cusolverEigMode_t jobz, cublasFillMode_t uplo,
-    int n, cuDoubleComplex *A, int lda, double *W, cuDoubleComplex *work,
-    int lwork, int *info, syevjInfo_t params) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, cublasFillMode_t, int,
-      cuDoubleComplex *, int, double *, cuDoubleComplex *, int, int *,
-      syevjInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZheevj");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, uplo, n, A, lda, W, work, lwork, info, params);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSsygvj_bufferSize(
-    cusolverDnHandle_t handle, cusolverEigType_t itype, cusolverEigMode_t jobz,
-    cublasFillMode_t uplo, int n, const float *A, int lda, const float *B,
-    int ldb, const float *W, int *lwork, syevjInfo_t params) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigType_t, cusolverEigMode_t,
-      cublasFillMode_t, int, const float *, int, const float *, int,
-      const float *, int *, syevjInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSsygvj_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, itype, jobz, uplo, n, A, lda, B, ldb, W, lwork,
-                  params);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDsygvj_bufferSize(
-    cusolverDnHandle_t handle, cusolverEigType_t itype, cusolverEigMode_t jobz,
-    cublasFillMode_t uplo, int n, const double *A, int lda, const double *B,
-    int ldb, const double *W, int *lwork, syevjInfo_t params) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigType_t, cusolverEigMode_t,
-      cublasFillMode_t, int, const double *, int, const double *, int,
-      const double *, int *, syevjInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDsygvj_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, itype, jobz, uplo, n, A, lda, B, ldb, W, lwork,
-                  params);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnChegvj_bufferSize(
-    cusolverDnHandle_t handle, cusolverEigType_t itype, cusolverEigMode_t jobz,
-    cublasFillMode_t uplo, int n, const cuComplex *A, int lda,
-    const cuComplex *B, int ldb, const float *W, int *lwork,
-    syevjInfo_t params) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigType_t, cusolverEigMode_t,
-      cublasFillMode_t, int, const cuComplex *, int, const cuComplex *, int,
-      const float *, int *, syevjInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnChegvj_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, itype, jobz, uplo, n, A, lda, B, ldb, W, lwork,
-                  params);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZhegvj_bufferSize(
-    cusolverDnHandle_t handle, cusolverEigType_t itype, cusolverEigMode_t jobz,
-    cublasFillMode_t uplo, int n, const cuDoubleComplex *A, int lda,
-    const cuDoubleComplex *B, int ldb, const double *W, int *lwork,
-    syevjInfo_t params) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigType_t, cusolverEigMode_t,
-      cublasFillMode_t, int, const cuDoubleComplex *, int,
-      const cuDoubleComplex *, int, const double *, int *, syevjInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZhegvj_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, itype, jobz, uplo, n, A, lda, B, ldb, W, lwork,
-                  params);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSsygvj(
-    cusolverDnHandle_t handle, cusolverEigType_t itype, cusolverEigMode_t jobz,
-    cublasFillMode_t uplo, int n, float *A, int lda, float *B, int ldb,
-    float *W, float *work, int lwork, int *info, syevjInfo_t params) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigType_t, cusolverEigMode_t,
-      cublasFillMode_t, int, float *, int, float *, int, float *, float *, int,
-      int *, syevjInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSsygvj");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, itype, jobz, uplo, n, A, lda, B, ldb, W, work, lwork,
-                  info, params);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDsygvj(
-    cusolverDnHandle_t handle, cusolverEigType_t itype, cusolverEigMode_t jobz,
-    cublasFillMode_t uplo, int n, double *A, int lda, double *B, int ldb,
-    double *W, double *work, int lwork, int *info, syevjInfo_t params) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigType_t, cusolverEigMode_t,
-      cublasFillMode_t, int, double *, int, double *, int, double *, double *,
-      int, int *, syevjInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDsygvj");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, itype, jobz, uplo, n, A, lda, B, ldb, W, work, lwork,
-                  info, params);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnChegvj(
-    cusolverDnHandle_t handle, cusolverEigType_t itype, cusolverEigMode_t jobz,
-    cublasFillMode_t uplo, int n, cuComplex *A, int lda, cuComplex *B, int ldb,
-    float *W, cuComplex *work, int lwork, int *info, syevjInfo_t params) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigType_t, cusolverEigMode_t,
-      cublasFillMode_t, int, cuComplex *, int, cuComplex *, int, float *,
-      cuComplex *, int, int *, syevjInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnChegvj");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, itype, jobz, uplo, n, A, lda, B, ldb, W, work, lwork,
-                  info, params);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZhegvj(
-    cusolverDnHandle_t handle, cusolverEigType_t itype, cusolverEigMode_t jobz,
-    cublasFillMode_t uplo, int n, cuDoubleComplex *A, int lda,
-    cuDoubleComplex *B, int ldb, double *W, cuDoubleComplex *work, int lwork,
-    int *info, syevjInfo_t params) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigType_t, cusolverEigMode_t,
-      cublasFillMode_t, int, cuDoubleComplex *, int, cuDoubleComplex *, int,
-      double *, cuDoubleComplex *, int, int *, syevjInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZhegvj");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, itype, jobz, uplo, n, A, lda, B, ldb, W, work, lwork,
-                  info, params);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnCreateGesvdjInfo(gesvdjInfo_t *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(gesvdjInfo_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnCreateGesvdjInfo");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDestroyGesvdjInfo(gesvdjInfo_t info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(gesvdjInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDestroyGesvdjInfo");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnXgesvdjSetTolerance(gesvdjInfo_t info,
-                                                           double tolerance) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(gesvdjInfo_t, double);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnXgesvdjSetTolerance");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(info, tolerance);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnXgesvdjSetMaxSweeps(gesvdjInfo_t info,
-                                                           int max_sweeps) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(gesvdjInfo_t, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnXgesvdjSetMaxSweeps");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(info, max_sweeps);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnXgesvdjSetSortEig(gesvdjInfo_t info,
-                                                         int sort_svd) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(gesvdjInfo_t, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnXgesvdjSetSortEig");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(info, sort_svd);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnXgesvdjGetResidual(
-    cusolverDnHandle_t handle, gesvdjInfo_t info, double *residual) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t,
-                                                  gesvdjInfo_t, double *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnXgesvdjGetResidual");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, info, residual);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnXgesvdjGetSweeps(
-    cusolverDnHandle_t handle, gesvdjInfo_t info, int *executed_sweeps) {
-  using FuncPtr =
-      cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, gesvdjInfo_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnXgesvdjGetSweeps");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, info, executed_sweeps);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSgesvdjBatched_bufferSize(
-    cusolverDnHandle_t handle, cusolverEigMode_t jobz, int m, int n,
-    const float *A, int lda, const float *S, const float *U, int ldu,
-    const float *V, int ldv, int *lwork, gesvdjInfo_t params, int batchSize) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, int, int, const float *, int,
-      const float *, const float *, int, const float *, int, int *,
-      gesvdjInfo_t, int);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusolverDnSgesvdjBatched_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, m, n, A, lda, S, U, ldu, V, ldv, lwork, params,
-                  batchSize);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDgesvdjBatched_bufferSize(
-    cusolverDnHandle_t handle, cusolverEigMode_t jobz, int m, int n,
-    const double *A, int lda, const double *S, const double *U, int ldu,
-    const double *V, int ldv, int *lwork, gesvdjInfo_t params, int batchSize) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, int, int, const double *, int,
-      const double *, const double *, int, const double *, int, int *,
-      gesvdjInfo_t, int);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusolverDnDgesvdjBatched_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, m, n, A, lda, S, U, ldu, V, ldv, lwork, params,
-                  batchSize);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnCgesvdjBatched_bufferSize(
-    cusolverDnHandle_t handle, cusolverEigMode_t jobz, int m, int n,
-    const cuComplex *A, int lda, const float *S, const cuComplex *U, int ldu,
-    const cuComplex *V, int ldv, int *lwork, gesvdjInfo_t params,
-    int batchSize) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, int, int, const cuComplex *, int,
-      const float *, const cuComplex *, int, const cuComplex *, int, int *,
-      gesvdjInfo_t, int);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusolverDnCgesvdjBatched_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, m, n, A, lda, S, U, ldu, V, ldv, lwork, params,
-                  batchSize);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZgesvdjBatched_bufferSize(
-    cusolverDnHandle_t handle, cusolverEigMode_t jobz, int m, int n,
-    const cuDoubleComplex *A, int lda, const double *S,
-    const cuDoubleComplex *U, int ldu, const cuDoubleComplex *V, int ldv,
-    int *lwork, gesvdjInfo_t params, int batchSize) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, int, int, const cuDoubleComplex *,
-      int, const double *, const cuDoubleComplex *, int,
-      const cuDoubleComplex *, int, int *, gesvdjInfo_t, int);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusolverDnZgesvdjBatched_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, m, n, A, lda, S, U, ldu, V, ldv, lwork, params,
-                  batchSize);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSgesvdjBatched(
-    cusolverDnHandle_t handle, cusolverEigMode_t jobz, int m, int n, float *A,
-    int lda, float *S, float *U, int ldu, float *V, int ldv, float *work,
-    int lwork, int *info, gesvdjInfo_t params, int batchSize) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, int, int, float *, int, float *,
-      float *, int, float *, int, float *, int, int *, gesvdjInfo_t, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSgesvdjBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, m, n, A, lda, S, U, ldu, V, ldv, work, lwork,
-                  info, params, batchSize);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDgesvdjBatched(
-    cusolverDnHandle_t handle, cusolverEigMode_t jobz, int m, int n, double *A,
-    int lda, double *S, double *U, int ldu, double *V, int ldv, double *work,
-    int lwork, int *info, gesvdjInfo_t params, int batchSize) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, int, int, double *, int, double *,
-      double *, int, double *, int, double *, int, int *, gesvdjInfo_t, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDgesvdjBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, m, n, A, lda, S, U, ldu, V, ldv, work, lwork,
-                  info, params, batchSize);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnCgesvdjBatched(
-    cusolverDnHandle_t handle, cusolverEigMode_t jobz, int m, int n,
-    cuComplex *A, int lda, float *S, cuComplex *U, int ldu, cuComplex *V,
-    int ldv, cuComplex *work, int lwork, int *info, gesvdjInfo_t params,
-    int batchSize) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, int, int, cuComplex *, int,
-      float *, cuComplex *, int, cuComplex *, int, cuComplex *, int, int *,
-      gesvdjInfo_t, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnCgesvdjBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, m, n, A, lda, S, U, ldu, V, ldv, work, lwork,
-                  info, params, batchSize);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZgesvdjBatched(
-    cusolverDnHandle_t handle, cusolverEigMode_t jobz, int m, int n,
-    cuDoubleComplex *A, int lda, double *S, cuDoubleComplex *U, int ldu,
-    cuDoubleComplex *V, int ldv, cuDoubleComplex *work, int lwork, int *info,
-    gesvdjInfo_t params, int batchSize) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, int, int, cuDoubleComplex *, int,
-      double *, cuDoubleComplex *, int, cuDoubleComplex *, int,
-      cuDoubleComplex *, int, int *, gesvdjInfo_t, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZgesvdjBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, m, n, A, lda, S, U, ldu, V, ldv, work, lwork,
-                  info, params, batchSize);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSgesvdj_bufferSize(
-    cusolverDnHandle_t handle, cusolverEigMode_t jobz, int econ, int m, int n,
-    const float *A, int lda, const float *S, const float *U, int ldu,
-    const float *V, int ldv, int *lwork, gesvdjInfo_t params) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, int, int, int, const float *, int,
-      const float *, const float *, int, const float *, int, int *,
-      gesvdjInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSgesvdj_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, econ, m, n, A, lda, S, U, ldu, V, ldv, lwork,
-                  params);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDgesvdj_bufferSize(
-    cusolverDnHandle_t handle, cusolverEigMode_t jobz, int econ, int m, int n,
-    const double *A, int lda, const double *S, const double *U, int ldu,
-    const double *V, int ldv, int *lwork, gesvdjInfo_t params) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, int, int, int, const double *, int,
-      const double *, const double *, int, const double *, int, int *,
-      gesvdjInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDgesvdj_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, econ, m, n, A, lda, S, U, ldu, V, ldv, lwork,
-                  params);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnCgesvdj_bufferSize(
-    cusolverDnHandle_t handle, cusolverEigMode_t jobz, int econ, int m, int n,
-    const cuComplex *A, int lda, const float *S, const cuComplex *U, int ldu,
-    const cuComplex *V, int ldv, int *lwork, gesvdjInfo_t params) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, int, int, int, const cuComplex *,
-      int, const float *, const cuComplex *, int, const cuComplex *, int, int *,
-      gesvdjInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnCgesvdj_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, econ, m, n, A, lda, S, U, ldu, V, ldv, lwork,
-                  params);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZgesvdj_bufferSize(
-    cusolverDnHandle_t handle, cusolverEigMode_t jobz, int econ, int m, int n,
-    const cuDoubleComplex *A, int lda, const double *S,
-    const cuDoubleComplex *U, int ldu, const cuDoubleComplex *V, int ldv,
-    int *lwork, gesvdjInfo_t params) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, int, int, int,
-      const cuDoubleComplex *, int, const double *, const cuDoubleComplex *,
-      int, const cuDoubleComplex *, int, int *, gesvdjInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZgesvdj_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, econ, m, n, A, lda, S, U, ldu, V, ldv, lwork,
-                  params);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSgesvdj(
-    cusolverDnHandle_t handle, cusolverEigMode_t jobz, int econ, int m, int n,
-    float *A, int lda, float *S, float *U, int ldu, float *V, int ldv,
-    float *work, int lwork, int *info, gesvdjInfo_t params) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, int, int, int, float *, int,
-      float *, float *, int, float *, int, float *, int, int *, gesvdjInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSgesvdj");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, econ, m, n, A, lda, S, U, ldu, V, ldv, work,
-                  lwork, info, params);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDgesvdj(
-    cusolverDnHandle_t handle, cusolverEigMode_t jobz, int econ, int m, int n,
-    double *A, int lda, double *S, double *U, int ldu, double *V, int ldv,
-    double *work, int lwork, int *info, gesvdjInfo_t params) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, int, int, int, double *, int,
-      double *, double *, int, double *, int, double *, int, int *,
-      gesvdjInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDgesvdj");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, econ, m, n, A, lda, S, U, ldu, V, ldv, work,
-                  lwork, info, params);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnCgesvdj(
-    cusolverDnHandle_t handle, cusolverEigMode_t jobz, int econ, int m, int n,
-    cuComplex *A, int lda, float *S, cuComplex *U, int ldu, cuComplex *V,
-    int ldv, cuComplex *work, int lwork, int *info, gesvdjInfo_t params) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, int, int, int, cuComplex *, int,
-      float *, cuComplex *, int, cuComplex *, int, cuComplex *, int, int *,
-      gesvdjInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnCgesvdj");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, econ, m, n, A, lda, S, U, ldu, V, ldv, work,
-                  lwork, info, params);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZgesvdj(
-    cusolverDnHandle_t handle, cusolverEigMode_t jobz, int econ, int m, int n,
-    cuDoubleComplex *A, int lda, double *S, cuDoubleComplex *U, int ldu,
-    cuDoubleComplex *V, int ldv, cuDoubleComplex *work, int lwork, int *info,
-    gesvdjInfo_t params) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, int, int, int, cuDoubleComplex *,
-      int, double *, cuDoubleComplex *, int, cuDoubleComplex *, int,
-      cuDoubleComplex *, int, int *, gesvdjInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZgesvdj");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, econ, m, n, A, lda, S, U, ldu, V, ldv, work,
-                  lwork, info, params);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSgesvdaStridedBatched_bufferSize(
-    cusolverDnHandle_t handle, cusolverEigMode_t jobz, int rank, int m, int n,
-    const float *d_A, int lda, long long int strideA, const float *d_S,
-    long long int strideS, const float *d_U, int ldu, long long int strideU,
-    const float *d_V, int ldv, long long int strideV, int *lwork,
-    int batchSize) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, int, int, int, const float *, int,
-      long long, const float *, long long, const float *, int, long long,
-      const float *, int, long long, int *, int);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusolverDnSgesvdaStridedBatched_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, rank, m, n, d_A, lda, strideA, d_S, strideS,
-                  d_U, ldu, strideU, d_V, ldv, strideV, lwork, batchSize);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDgesvdaStridedBatched_bufferSize(
-    cusolverDnHandle_t handle, cusolverEigMode_t jobz, int rank, int m, int n,
-    const double *d_A, int lda, long long int strideA, const double *d_S,
-    long long int strideS, const double *d_U, int ldu, long long int strideU,
-    const double *d_V, int ldv, long long int strideV, int *lwork,
-    int batchSize) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, int, int, int, const double *, int,
-      long long, const double *, long long, const double *, int, long long,
-      const double *, int, long long, int *, int);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusolverDnDgesvdaStridedBatched_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, rank, m, n, d_A, lda, strideA, d_S, strideS,
-                  d_U, ldu, strideU, d_V, ldv, strideV, lwork, batchSize);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnCgesvdaStridedBatched_bufferSize(
-    cusolverDnHandle_t handle, cusolverEigMode_t jobz, int rank, int m, int n,
-    const cuComplex *d_A, int lda, long long int strideA, const float *d_S,
-    long long int strideS, const cuComplex *d_U, int ldu, long long int strideU,
-    const cuComplex *d_V, int ldv, long long int strideV, int *lwork,
-    int batchSize) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, int, int, int, const cuComplex *,
-      int, long long, const float *, long long, const cuComplex *, int,
-      long long, const cuComplex *, int, long long, int *, int);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusolverDnCgesvdaStridedBatched_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, rank, m, n, d_A, lda, strideA, d_S, strideS,
-                  d_U, ldu, strideU, d_V, ldv, strideV, lwork, batchSize);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZgesvdaStridedBatched_bufferSize(
-    cusolverDnHandle_t handle, cusolverEigMode_t jobz, int rank, int m, int n,
-    const cuDoubleComplex *d_A, int lda, long long int strideA,
-    const double *d_S, long long int strideS, const cuDoubleComplex *d_U,
-    int ldu, long long int strideU, const cuDoubleComplex *d_V, int ldv,
-    long long int strideV, int *lwork, int batchSize) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, int, int, int,
-      const cuDoubleComplex *, int, long long, const double *, long long,
-      const cuDoubleComplex *, int, long long, const cuDoubleComplex *, int,
-      long long, int *, int);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusolverDnZgesvdaStridedBatched_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, rank, m, n, d_A, lda, strideA, d_S, strideS,
-                  d_U, ldu, strideU, d_V, ldv, strideV, lwork, batchSize);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSgesvdaStridedBatched(
-    cusolverDnHandle_t handle, cusolverEigMode_t jobz, int rank, int m, int n,
-    const float *d_A, int lda, long long int strideA, float *d_S,
-    long long int strideS, float *d_U, int ldu, long long int strideU,
-    float *d_V, int ldv, long long int strideV, float *d_work, int lwork,
-    int *d_info, double *h_R_nrmF, int batchSize) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, int, int, int, const float *, int,
-      long long, float *, long long, float *, int, long long, float *, int,
-      long long, float *, int, int *, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSgesvdaStridedBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, rank, m, n, d_A, lda, strideA, d_S, strideS,
-                  d_U, ldu, strideU, d_V, ldv, strideV, d_work, lwork, d_info,
-                  h_R_nrmF, batchSize);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnDgesvdaStridedBatched(
-    cusolverDnHandle_t handle, cusolverEigMode_t jobz, int rank, int m, int n,
-    const double *d_A, int lda, long long int strideA, double *d_S,
-    long long int strideS, double *d_U, int ldu, long long int strideU,
-    double *d_V, int ldv, long long int strideV, double *d_work, int lwork,
-    int *d_info, double *h_R_nrmF, int batchSize) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, int, int, int, const double *, int,
-      long long, double *, long long, double *, int, long long, double *, int,
-      long long, double *, int, int *, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDgesvdaStridedBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, rank, m, n, d_A, lda, strideA, d_S, strideS,
-                  d_U, ldu, strideU, d_V, ldv, strideV, d_work, lwork, d_info,
-                  h_R_nrmF, batchSize);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnCgesvdaStridedBatched(
-    cusolverDnHandle_t handle, cusolverEigMode_t jobz, int rank, int m, int n,
-    const cuComplex *d_A, int lda, long long int strideA, float *d_S,
-    long long int strideS, cuComplex *d_U, int ldu, long long int strideU,
-    cuComplex *d_V, int ldv, long long int strideV, cuComplex *d_work,
-    int lwork, int *d_info, double *h_R_nrmF, int batchSize) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, int, int, int, const cuComplex *,
-      int, long long, float *, long long, cuComplex *, int, long long,
-      cuComplex *, int, long long, cuComplex *, int, int *, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnCgesvdaStridedBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, rank, m, n, d_A, lda, strideA, d_S, strideS,
-                  d_U, ldu, strideU, d_V, ldv, strideV, d_work, lwork, d_info,
-                  h_R_nrmF, batchSize);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnZgesvdaStridedBatched(
-    cusolverDnHandle_t handle, cusolverEigMode_t jobz, int rank, int m, int n,
-    const cuDoubleComplex *d_A, int lda, long long int strideA, double *d_S,
-    long long int strideS, cuDoubleComplex *d_U, int ldu, long long int strideU,
-    cuDoubleComplex *d_V, int ldv, long long int strideV,
-    cuDoubleComplex *d_work, int lwork, int *d_info, double *h_R_nrmF,
-    int batchSize) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverEigMode_t, int, int, int,
-      const cuDoubleComplex *, int, long long, double *, long long,
-      cuDoubleComplex *, int, long long, cuDoubleComplex *, int, long long,
-      cuDoubleComplex *, int, int *, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnZgesvdaStridedBatched");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, jobz, rank, m, n, d_A, lda, strideA, d_S, strideS,
-                  d_U, ldu, strideU, d_V, ldv, strideV, d_work, lwork, d_info,
-                  h_R_nrmF, batchSize);
-}
-
-cusolverStatus_t CUSOLVERAPI
-cusolverDnCreateParams(cusolverDnParams_t *params) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(cusolverDnParams_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnCreateParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(params);
-}
-
-cusolverStatus_t CUSOLVERAPI
-cusolverDnDestroyParams(cusolverDnParams_t params) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(cusolverDnParams_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnDestroyParams");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(params);
-}
-
-cusolverStatus_t CUSOLVERAPI
-cusolverDnSetAdvOptions(cusolverDnParams_t params,
-                        cusolverDnFunction_t function, cusolverAlgMode_t algo) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnParams_t, cusolverDnFunction_t, cusolverAlgMode_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSetAdvOptions");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(params, function, algo);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnPotrf_bufferSize(
-    cusolverDnHandle_t handle, cusolverDnParams_t params, cublasFillMode_t uplo,
-    int64_t n, cudaDataType dataTypeA, const void *A, int64_t lda,
-    cudaDataType computeType, size_t *workspaceInBytes) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverDnParams_t, cublasFillMode_t, int64_t,
-      cudaDataType, const void *, int64_t, cudaDataType, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnPotrf_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, params, uplo, n, dataTypeA, A, lda, computeType,
-                  workspaceInBytes);
-}
-
-cusolverStatus_t CUSOLVERAPI
-cusolverDnPotrf(cusolverDnHandle_t handle, cusolverDnParams_t params,
-                cublasFillMode_t uplo, int64_t n, cudaDataType dataTypeA,
-                void *A, int64_t lda, cudaDataType computeType, void *pBuffer,
-                size_t workspaceInBytes, int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverDnParams_t, cublasFillMode_t, int64_t,
-      cudaDataType, void *, int64_t, cudaDataType, void *, size_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnPotrf");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, params, uplo, n, dataTypeA, A, lda, computeType,
-                  pBuffer, workspaceInBytes, info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnPotrs(
-    cusolverDnHandle_t handle, cusolverDnParams_t params, cublasFillMode_t uplo,
-    int64_t n, int64_t nrhs, cudaDataType dataTypeA, const void *A, int64_t lda,
-    cudaDataType dataTypeB, void *B, int64_t ldb, int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverDnParams_t, cublasFillMode_t, int64_t,
-      int64_t, cudaDataType, const void *, int64_t, cudaDataType, void *,
-      int64_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnPotrs");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, params, uplo, n, nrhs, dataTypeA, A, lda, dataTypeB,
-                  B, ldb, info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnGeqrf_bufferSize(
-    cusolverDnHandle_t handle, cusolverDnParams_t params, int64_t m, int64_t n,
-    cudaDataType dataTypeA, const void *A, int64_t lda,
-    cudaDataType dataTypeTau, const void *tau, cudaDataType computeType,
-    size_t *workspaceInBytes) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverDnParams_t, int64_t, int64_t, cudaDataType,
-      const void *, int64_t, cudaDataType, const void *, cudaDataType,
-      size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnGeqrf_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, params, m, n, dataTypeA, A, lda, dataTypeTau, tau,
-                  computeType, workspaceInBytes);
-}
-
-cusolverStatus_t CUSOLVERAPI
-cusolverDnGeqrf(cusolverDnHandle_t handle, cusolverDnParams_t params, int64_t m,
-                int64_t n, cudaDataType dataTypeA, void *A, int64_t lda,
-                cudaDataType dataTypeTau, void *tau, cudaDataType computeType,
-                void *pBuffer, size_t workspaceInBytes, int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverDnParams_t, int64_t, int64_t, cudaDataType,
-      void *, int64_t, cudaDataType, void *, cudaDataType, void *, size_t,
-      int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnGeqrf");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, params, m, n, dataTypeA, A, lda, dataTypeTau, tau,
-                  computeType, pBuffer, workspaceInBytes, info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnGetrf_bufferSize(
-    cusolverDnHandle_t handle, cusolverDnParams_t params, int64_t m, int64_t n,
-    cudaDataType dataTypeA, const void *A, int64_t lda,
-    cudaDataType computeType, size_t *workspaceInBytes) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverDnParams_t, int64_t, int64_t, cudaDataType,
-      const void *, int64_t, cudaDataType, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnGetrf_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, params, m, n, dataTypeA, A, lda, computeType,
-                  workspaceInBytes);
-}
-
-cusolverStatus_t CUSOLVERAPI
-cusolverDnGetrf(cusolverDnHandle_t handle, cusolverDnParams_t params, int64_t m,
-                int64_t n, cudaDataType dataTypeA, void *A, int64_t lda,
-                int64_t *ipiv, cudaDataType computeType, void *pBuffer,
-                size_t workspaceInBytes, int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverDnParams_t, int64_t, int64_t, cudaDataType,
-      void *, int64_t, int64_t *, cudaDataType, void *, size_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnGetrf");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, params, m, n, dataTypeA, A, lda, ipiv, computeType,
-                  pBuffer, workspaceInBytes, info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnGetrs(
-    cusolverDnHandle_t handle, cusolverDnParams_t params,
-    cublasOperation_t trans, int64_t n, int64_t nrhs, cudaDataType dataTypeA,
-    const void *A, int64_t lda, const int64_t *ipiv, cudaDataType dataTypeB,
-    void *B, int64_t ldb, int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverDnParams_t, cublasOperation_t, int64_t,
-      int64_t, cudaDataType, const void *, int64_t, const int64_t *,
-      cudaDataType, void *, int64_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnGetrs");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, params, trans, n, nrhs, dataTypeA, A, lda, ipiv,
-                  dataTypeB, B, ldb, info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSyevd_bufferSize(
-    cusolverDnHandle_t handle, cusolverDnParams_t params,
-    cusolverEigMode_t jobz, cublasFillMode_t uplo, int64_t n,
-    cudaDataType dataTypeA, const void *A, int64_t lda, cudaDataType dataTypeW,
-    const void *W, cudaDataType computeType, size_t *workspaceInBytes) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverDnParams_t, cusolverEigMode_t,
-      cublasFillMode_t, int64_t, cudaDataType, const void *, int64_t,
-      cudaDataType, const void *, cudaDataType, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSyevd_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, params, jobz, uplo, n, dataTypeA, A, lda, dataTypeW,
-                  W, computeType, workspaceInBytes);
-}
-
-cusolverStatus_t CUSOLVERAPI
-cusolverDnSyevd(cusolverDnHandle_t handle, cusolverDnParams_t params,
-                cusolverEigMode_t jobz, cublasFillMode_t uplo, int64_t n,
-                cudaDataType dataTypeA, void *A, int64_t lda,
-                cudaDataType dataTypeW, void *W, cudaDataType computeType,
-                void *pBuffer, size_t workspaceInBytes, int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverDnParams_t, cusolverEigMode_t,
-      cublasFillMode_t, int64_t, cudaDataType, void *, int64_t, cudaDataType,
-      void *, cudaDataType, void *, size_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSyevd");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, params, jobz, uplo, n, dataTypeA, A, lda, dataTypeW,
-                  W, computeType, pBuffer, workspaceInBytes, info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSyevdx_bufferSize(
-    cusolverDnHandle_t handle, cusolverDnParams_t params,
-    cusolverEigMode_t jobz, cusolverEigRange_t range, cublasFillMode_t uplo,
-    int64_t n, cudaDataType dataTypeA, const void *A, int64_t lda, void *vl,
-    void *vu, int64_t il, int64_t iu, int64_t *h_meig, cudaDataType dataTypeW,
-    const void *W, cudaDataType computeType, size_t *workspaceInBytes) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverDnParams_t, cusolverEigMode_t,
-      cusolverEigRange_t, cublasFillMode_t, int64_t, cudaDataType, const void *,
-      int64_t, void *, void *, int64_t, int64_t, int64_t *, cudaDataType,
-      const void *, cudaDataType, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSyevdx_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, params, jobz, range, uplo, n, dataTypeA, A, lda, vl,
-                  vu, il, iu, h_meig, dataTypeW, W, computeType,
-                  workspaceInBytes);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnSyevdx(
-    cusolverDnHandle_t handle, cusolverDnParams_t params,
-    cusolverEigMode_t jobz, cusolverEigRange_t range, cublasFillMode_t uplo,
-    int64_t n, cudaDataType dataTypeA, void *A, int64_t lda, void *vl, void *vu,
-    int64_t il, int64_t iu, int64_t *meig64, cudaDataType dataTypeW, void *W,
-    cudaDataType computeType, void *pBuffer, size_t workspaceInBytes,
-    int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverDnParams_t, cusolverEigMode_t,
-      cusolverEigRange_t, cublasFillMode_t, int64_t, cudaDataType, void *,
-      int64_t, void *, void *, int64_t, int64_t, int64_t *, cudaDataType,
-      void *, cudaDataType, void *, size_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnSyevdx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, params, jobz, range, uplo, n, dataTypeA, A, lda, vl,
-                  vu, il, iu, meig64, dataTypeW, W, computeType, pBuffer,
-                  workspaceInBytes, info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnGesvd_bufferSize(
-    cusolverDnHandle_t handle, cusolverDnParams_t params, signed char jobu,
-    signed char jobvt, int64_t m, int64_t n, cudaDataType dataTypeA,
-    const void *A, int64_t lda, cudaDataType dataTypeS, const void *S,
-    cudaDataType dataTypeU, const void *U, int64_t ldu, cudaDataType dataTypeVT,
-    const void *VT, int64_t ldvt, cudaDataType computeType,
-    size_t *workspaceInBytes) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverDnParams_t, signed char, signed char, int64_t,
-      int64_t, cudaDataType, const void *, int64_t, cudaDataType, const void *,
-      cudaDataType, const void *, int64_t, cudaDataType, const void *, int64_t,
-      cudaDataType, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnGesvd_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, params, jobu, jobvt, m, n, dataTypeA, A, lda,
-                  dataTypeS, S, dataTypeU, U, ldu, dataTypeVT, VT, ldvt,
-                  computeType, workspaceInBytes);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnGesvd(
-    cusolverDnHandle_t handle, cusolverDnParams_t params, signed char jobu,
-    signed char jobvt, int64_t m, int64_t n, cudaDataType dataTypeA, void *A,
-    int64_t lda, cudaDataType dataTypeS, void *S, cudaDataType dataTypeU,
-    void *U, int64_t ldu, cudaDataType dataTypeVT, void *VT, int64_t ldvt,
-    cudaDataType computeType, void *pBuffer, size_t workspaceInBytes,
-    int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverDnParams_t, signed char, signed char, int64_t,
-      int64_t, cudaDataType, void *, int64_t, cudaDataType, void *,
-      cudaDataType, void *, int64_t, cudaDataType, void *, int64_t,
-      cudaDataType, void *, size_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnGesvd");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, params, jobu, jobvt, m, n, dataTypeA, A, lda,
-                  dataTypeS, S, dataTypeU, U, ldu, dataTypeVT, VT, ldvt,
-                  computeType, pBuffer, workspaceInBytes, info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnXpotrf_bufferSize(
-    cusolverDnHandle_t handle, cusolverDnParams_t params, cublasFillMode_t uplo,
-    int64_t n, cudaDataType dataTypeA, const void *A, int64_t lda,
-    cudaDataType computeType, size_t *workspaceInBytesOnDevice,
-    size_t *workspaceInBytesOnHost) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverDnParams_t, cublasFillMode_t, int64_t,
-      cudaDataType, const void *, int64_t, cudaDataType, size_t *, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnXpotrf_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, params, uplo, n, dataTypeA, A, lda, computeType,
-                  workspaceInBytesOnDevice, workspaceInBytesOnHost);
-}
-
-cusolverStatus_t CUSOLVERAPI
-cusolverDnXpotrf(cusolverDnHandle_t handle, cusolverDnParams_t params,
-                 cublasFillMode_t uplo, int64_t n, cudaDataType dataTypeA,
-                 void *A, int64_t lda, cudaDataType computeType,
-                 void *bufferOnDevice, size_t workspaceInBytesOnDevice,
-                 void *bufferOnHost, size_t workspaceInBytesOnHost, int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverDnParams_t, cublasFillMode_t, int64_t,
-      cudaDataType, void *, int64_t, cudaDataType, void *, size_t, void *,
-      size_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnXpotrf");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, params, uplo, n, dataTypeA, A, lda, computeType,
-                  bufferOnDevice, workspaceInBytesOnDevice, bufferOnHost,
-                  workspaceInBytesOnHost, info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnXpotrs(
-    cusolverDnHandle_t handle, cusolverDnParams_t params, cublasFillMode_t uplo,
-    int64_t n, int64_t nrhs, cudaDataType dataTypeA, const void *A, int64_t lda,
-    cudaDataType dataTypeB, void *B, int64_t ldb, int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverDnParams_t, cublasFillMode_t, int64_t,
-      int64_t, cudaDataType, const void *, int64_t, cudaDataType, void *,
-      int64_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnXpotrs");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, params, uplo, n, nrhs, dataTypeA, A, lda, dataTypeB,
-                  B, ldb, info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnXgeqrf_bufferSize(
-    cusolverDnHandle_t handle, cusolverDnParams_t params, int64_t m, int64_t n,
-    cudaDataType dataTypeA, const void *A, int64_t lda,
-    cudaDataType dataTypeTau, const void *tau, cudaDataType computeType,
-    size_t *workspaceInBytesOnDevice, size_t *workspaceInBytesOnHost) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverDnParams_t, int64_t, int64_t, cudaDataType,
-      const void *, int64_t, cudaDataType, const void *, cudaDataType, size_t *,
-      size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnXgeqrf_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, params, m, n, dataTypeA, A, lda, dataTypeTau, tau,
-                  computeType, workspaceInBytesOnDevice,
-                  workspaceInBytesOnHost);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnXgeqrf(
-    cusolverDnHandle_t handle, cusolverDnParams_t params, int64_t m, int64_t n,
-    cudaDataType dataTypeA, void *A, int64_t lda, cudaDataType dataTypeTau,
-    void *tau, cudaDataType computeType, void *bufferOnDevice,
-    size_t workspaceInBytesOnDevice, void *bufferOnHost,
-    size_t workspaceInBytesOnHost, int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverDnParams_t, int64_t, int64_t, cudaDataType,
-      void *, int64_t, cudaDataType, void *, cudaDataType, void *, size_t,
-      void *, size_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnXgeqrf");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, params, m, n, dataTypeA, A, lda, dataTypeTau, tau,
-                  computeType, bufferOnDevice, workspaceInBytesOnDevice,
-                  bufferOnHost, workspaceInBytesOnHost, info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnXgetrf_bufferSize(
-    cusolverDnHandle_t handle, cusolverDnParams_t params, int64_t m, int64_t n,
-    cudaDataType dataTypeA, const void *A, int64_t lda,
-    cudaDataType computeType, size_t *workspaceInBytesOnDevice,
-    size_t *workspaceInBytesOnHost) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverDnParams_t, int64_t, int64_t, cudaDataType,
-      const void *, int64_t, cudaDataType, size_t *, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnXgetrf_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, params, m, n, dataTypeA, A, lda, computeType,
-                  workspaceInBytesOnDevice, workspaceInBytesOnHost);
-}
-
-cusolverStatus_t CUSOLVERAPI
-cusolverDnXgetrf(cusolverDnHandle_t handle, cusolverDnParams_t params,
-                 int64_t m, int64_t n, cudaDataType dataTypeA, void *A,
-                 int64_t lda, int64_t *ipiv, cudaDataType computeType,
-                 void *bufferOnDevice, size_t workspaceInBytesOnDevice,
-                 void *bufferOnHost, size_t workspaceInBytesOnHost, int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverDnParams_t, int64_t, int64_t, cudaDataType,
-      void *, int64_t, int64_t *, cudaDataType, void *, size_t, void *, size_t,
-      int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnXgetrf");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, params, m, n, dataTypeA, A, lda, ipiv, computeType,
-                  bufferOnDevice, workspaceInBytesOnDevice, bufferOnHost,
-                  workspaceInBytesOnHost, info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnXgetrs(
-    cusolverDnHandle_t handle, cusolverDnParams_t params,
-    cublasOperation_t trans, int64_t n, int64_t nrhs, cudaDataType dataTypeA,
-    const void *A, int64_t lda, const int64_t *ipiv, cudaDataType dataTypeB,
-    void *B, int64_t ldb, int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverDnParams_t, cublasOperation_t, int64_t,
-      int64_t, cudaDataType, const void *, int64_t, const int64_t *,
-      cudaDataType, void *, int64_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnXgetrs");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, params, trans, n, nrhs, dataTypeA, A, lda, ipiv,
-                  dataTypeB, B, ldb, info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnXsyevd_bufferSize(
-    cusolverDnHandle_t handle, cusolverDnParams_t params,
-    cusolverEigMode_t jobz, cublasFillMode_t uplo, int64_t n,
-    cudaDataType dataTypeA, const void *A, int64_t lda, cudaDataType dataTypeW,
-    const void *W, cudaDataType computeType, size_t *workspaceInBytesOnDevice,
-    size_t *workspaceInBytesOnHost) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverDnParams_t, cusolverEigMode_t,
-      cublasFillMode_t, int64_t, cudaDataType, const void *, int64_t,
-      cudaDataType, const void *, cudaDataType, size_t *, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnXsyevd_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, params, jobz, uplo, n, dataTypeA, A, lda, dataTypeW,
-                  W, computeType, workspaceInBytesOnDevice,
-                  workspaceInBytesOnHost);
-}
-
-cusolverStatus_t CUSOLVERAPI
-cusolverDnXsyevd(cusolverDnHandle_t handle, cusolverDnParams_t params,
-                 cusolverEigMode_t jobz, cublasFillMode_t uplo, int64_t n,
-                 cudaDataType dataTypeA, void *A, int64_t lda,
-                 cudaDataType dataTypeW, void *W, cudaDataType computeType,
-                 void *bufferOnDevice, size_t workspaceInBytesOnDevice,
-                 void *bufferOnHost, size_t workspaceInBytesOnHost, int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverDnParams_t, cusolverEigMode_t,
-      cublasFillMode_t, int64_t, cudaDataType, void *, int64_t, cudaDataType,
-      void *, cudaDataType, void *, size_t, void *, size_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnXsyevd");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, params, jobz, uplo, n, dataTypeA, A, lda, dataTypeW,
-                  W, computeType, bufferOnDevice, workspaceInBytesOnDevice,
-                  bufferOnHost, workspaceInBytesOnHost, info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnXsyevdx_bufferSize(
-    cusolverDnHandle_t handle, cusolverDnParams_t params,
-    cusolverEigMode_t jobz, cusolverEigRange_t range, cublasFillMode_t uplo,
-    int64_t n, cudaDataType dataTypeA, const void *A, int64_t lda, void *vl,
-    void *vu, int64_t il, int64_t iu, int64_t *h_meig, cudaDataType dataTypeW,
-    const void *W, cudaDataType computeType, size_t *workspaceInBytesOnDevice,
-    size_t *workspaceInBytesOnHost) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverDnParams_t, cusolverEigMode_t,
-      cusolverEigRange_t, cublasFillMode_t, int64_t, cudaDataType, const void *,
-      int64_t, void *, void *, int64_t, int64_t, int64_t *, cudaDataType,
-      const void *, cudaDataType, size_t *, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnXsyevdx_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, params, jobz, range, uplo, n, dataTypeA, A, lda, vl,
-                  vu, il, iu, h_meig, dataTypeW, W, computeType,
-                  workspaceInBytesOnDevice, workspaceInBytesOnHost);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnXsyevdx(
-    cusolverDnHandle_t handle, cusolverDnParams_t params,
-    cusolverEigMode_t jobz, cusolverEigRange_t range, cublasFillMode_t uplo,
-    int64_t n, cudaDataType dataTypeA, void *A, int64_t lda, void *vl, void *vu,
-    int64_t il, int64_t iu, int64_t *meig64, cudaDataType dataTypeW, void *W,
-    cudaDataType computeType, void *bufferOnDevice,
-    size_t workspaceInBytesOnDevice, void *bufferOnHost,
-    size_t workspaceInBytesOnHost, int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverDnParams_t, cusolverEigMode_t,
-      cusolverEigRange_t, cublasFillMode_t, int64_t, cudaDataType, void *,
-      int64_t, void *, void *, int64_t, int64_t, int64_t *, cudaDataType,
-      void *, cudaDataType, void *, size_t, void *, size_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnXsyevdx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, params, jobz, range, uplo, n, dataTypeA, A, lda, vl,
-                  vu, il, iu, meig64, dataTypeW, W, computeType, bufferOnDevice,
-                  workspaceInBytesOnDevice, bufferOnHost,
-                  workspaceInBytesOnHost, info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnXgesvd_bufferSize(
-    cusolverDnHandle_t handle, cusolverDnParams_t params, signed char jobu,
-    signed char jobvt, int64_t m, int64_t n, cudaDataType dataTypeA,
-    const void *A, int64_t lda, cudaDataType dataTypeS, const void *S,
-    cudaDataType dataTypeU, const void *U, int64_t ldu, cudaDataType dataTypeVT,
-    const void *VT, int64_t ldvt, cudaDataType computeType,
-    size_t *workspaceInBytesOnDevice, size_t *workspaceInBytesOnHost) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverDnParams_t, signed char, signed char, int64_t,
-      int64_t, cudaDataType, const void *, int64_t, cudaDataType, const void *,
-      cudaDataType, const void *, int64_t, cudaDataType, const void *, int64_t,
-      cudaDataType, size_t *, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnXgesvd_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, params, jobu, jobvt, m, n, dataTypeA, A, lda,
-                  dataTypeS, S, dataTypeU, U, ldu, dataTypeVT, VT, ldvt,
-                  computeType, workspaceInBytesOnDevice,
-                  workspaceInBytesOnHost);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnXgesvd(
-    cusolverDnHandle_t handle, cusolverDnParams_t params, signed char jobu,
-    signed char jobvt, int64_t m, int64_t n, cudaDataType dataTypeA, void *A,
-    int64_t lda, cudaDataType dataTypeS, void *S, cudaDataType dataTypeU,
-    void *U, int64_t ldu, cudaDataType dataTypeVT, void *VT, int64_t ldvt,
-    cudaDataType computeType, void *bufferOnDevice,
-    size_t workspaceInBytesOnDevice, void *bufferOnHost,
-    size_t workspaceInBytesOnHost, int *info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverDnParams_t, signed char, signed char, int64_t,
-      int64_t, cudaDataType, void *, int64_t, cudaDataType, void *,
-      cudaDataType, void *, int64_t, cudaDataType, void *, int64_t,
-      cudaDataType, void *, size_t, void *, size_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnXgesvd");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, params, jobu, jobvt, m, n, dataTypeA, A, lda,
-                  dataTypeS, S, dataTypeU, U, ldu, dataTypeVT, VT, ldvt,
-                  computeType, bufferOnDevice, workspaceInBytesOnDevice,
-                  bufferOnHost, workspaceInBytesOnHost, info);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnXgesvdp_bufferSize(
-    cusolverDnHandle_t handle, cusolverDnParams_t params,
-    cusolverEigMode_t jobz, int econ, int64_t m, int64_t n,
-    cudaDataType dataTypeA, const void *A, int64_t lda, cudaDataType dataTypeS,
-    const void *S, cudaDataType dataTypeU, const void *U, int64_t ldu,
-    cudaDataType dataTypeV, const void *V, int64_t ldv,
-    cudaDataType computeType, size_t *workspaceInBytesOnDevice,
-    size_t *workspaceInBytesOnHost) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverDnParams_t, cusolverEigMode_t, int, int64_t,
-      int64_t, cudaDataType, const void *, int64_t, cudaDataType, const void *,
-      cudaDataType, const void *, int64_t, cudaDataType, const void *, int64_t,
-      cudaDataType, size_t *, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnXgesvdp_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, params, jobz, econ, m, n, dataTypeA, A, lda,
-                  dataTypeS, S, dataTypeU, U, ldu, dataTypeV, V, ldv,
-                  computeType, workspaceInBytesOnDevice,
-                  workspaceInBytesOnHost);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnXgesvdp(
-    cusolverDnHandle_t handle, cusolverDnParams_t params,
-    cusolverEigMode_t jobz, int econ, int64_t m, int64_t n,
-    cudaDataType dataTypeA, void *A, int64_t lda, cudaDataType dataTypeS,
-    void *S, cudaDataType dataTypeU, void *U, int64_t ldu,
-    cudaDataType dataTypeV, void *V, int64_t ldv, cudaDataType computeType,
-    void *bufferOnDevice, size_t workspaceInBytesOnDevice, void *bufferOnHost,
-    size_t workspaceInBytesOnHost, int *d_info, double *h_err_sigma) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverDnParams_t, cusolverEigMode_t, int, int64_t,
-      int64_t, cudaDataType, void *, int64_t, cudaDataType, void *,
-      cudaDataType, void *, int64_t, cudaDataType, void *, int64_t,
-      cudaDataType, void *, size_t, void *, size_t, int *, double *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnXgesvdp");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, params, jobz, econ, m, n, dataTypeA, A, lda,
-                  dataTypeS, S, dataTypeU, U, ldu, dataTypeV, V, ldv,
-                  computeType, bufferOnDevice, workspaceInBytesOnDevice,
-                  bufferOnHost, workspaceInBytesOnHost, d_info, h_err_sigma);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnXgesvdr_bufferSize(
-    cusolverDnHandle_t handle, cusolverDnParams_t params, signed char jobu,
-    signed char jobv, int64_t m, int64_t n, int64_t k, int64_t p,
-    int64_t niters, cudaDataType dataTypeA, const void *A, int64_t lda,
-    cudaDataType dataTypeSrand, const void *Srand, cudaDataType dataTypeUrand,
-    const void *Urand, int64_t ldUrand, cudaDataType dataTypeVrand,
-    const void *Vrand, int64_t ldVrand, cudaDataType computeType,
-    size_t *workspaceInBytesOnDevice, size_t *workspaceInBytesOnHost) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverDnParams_t, signed char, signed char, int64_t,
-      int64_t, int64_t, int64_t, int64_t, cudaDataType, const void *, int64_t,
-      cudaDataType, const void *, cudaDataType, const void *, int64_t,
-      cudaDataType, const void *, int64_t, cudaDataType, size_t *, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnXgesvdr_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, params, jobu, jobv, m, n, k, p, niters, dataTypeA, A,
-                  lda, dataTypeSrand, Srand, dataTypeUrand, Urand, ldUrand,
-                  dataTypeVrand, Vrand, ldVrand, computeType,
-                  workspaceInBytesOnDevice, workspaceInBytesOnHost);
-}
-
-cusolverStatus_t CUSOLVERAPI cusolverDnXgesvdr(
-    cusolverDnHandle_t handle, cusolverDnParams_t params, signed char jobu,
-    signed char jobv, int64_t m, int64_t n, int64_t k, int64_t p,
-    int64_t niters, cudaDataType dataTypeA, void *A, int64_t lda,
-    cudaDataType dataTypeSrand, void *Srand, cudaDataType dataTypeUrand,
-    void *Urand, int64_t ldUrand, cudaDataType dataTypeVrand, void *Vrand,
-    int64_t ldVrand, cudaDataType computeType, void *bufferOnDevice,
-    size_t workspaceInBytesOnDevice, void *bufferOnHost,
-    size_t workspaceInBytesOnHost, int *d_info) {
-  using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(
-      cusolverDnHandle_t, cusolverDnParams_t, signed char, signed char, int64_t,
-      int64_t, int64_t, int64_t, int64_t, cudaDataType, void *, int64_t,
-      cudaDataType, void *, cudaDataType, void *, int64_t, cudaDataType, void *,
-      int64_t, cudaDataType, void *, size_t, void *, size_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusolverDnXgesvdr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, params, jobu, jobv, m, n, k, p, niters, dataTypeA, A,
-                  lda, dataTypeSrand, Srand, dataTypeUrand, Urand, ldUrand,
-                  dataTypeVrand, Vrand, ldVrand, computeType, bufferOnDevice,
-                  workspaceInBytesOnDevice, bufferOnHost,
-                  workspaceInBytesOnHost, d_info);
-}
-
-}  // extern "C"
diff --git a/third_party/xla/third_party/tsl/tsl/cuda/cusolver_stub.cc b/third_party/xla/third_party/tsl/tsl/cuda/cusolver_stub.cc
index f441d7ebe7da8a..d11601b3bd4217 100644
--- a/third_party/xla/third_party/tsl/tsl/cuda/cusolver_stub.cc
+++ b/third_party/xla/third_party/tsl/tsl/cuda/cusolver_stub.cc
@@ -35,28 +35,40 @@ void* GetDsoHandle() {
 #endif
 }
 
-template <typename T>
-T LoadSymbol(const char* symbol_name) {
+void* LoadSymbol(const char* symbol_name) {
   void* symbol = nullptr;
   if (auto handle = GetDsoHandle()) {
     tsl::Env::Default()
         ->GetSymbolFromLibrary(handle, symbol_name, &symbol)
         .IgnoreError();
   }
-  return reinterpret_cast<T>(symbol);
+  return symbol;
 }
 
-cusolverStatus_t GetSymbolNotFoundError() {
+const char* kSymbols[] = {
+#include "tsl/cuda/cusolver.inc"
+};
+
+constexpr size_t kNumSymbols = sizeof(kSymbols) / sizeof(const char*);
+
+}  // namespace
+
+extern "C" {
+
+static cusolverStatus_t GetSymbolNotFoundError() {
   return CUSOLVER_STATUS_INTERNAL_ERROR;
 }
-}  // namespace
 
-#if CUDA_VERSION < 10010
-#include "tsl/cuda/cusolver_dense_10_0.inc"
-#elif CUDA_VERSION < 10020
-#include "tsl/cuda/cusolver_dense_10_1.inc"
-#elif CUDA_VERSION < 11000
-#include "tsl/cuda/cusolver_dense_10_2.inc"
-#else
-#include "tsl/cuda/cusolver_dense_11_0.inc"
-#endif
+extern void* _cusolver_tramp_table[];
+
+void _cusolver_tramp_resolve(int i) {
+  CHECK_LE(0, i);
+  CHECK_LT(i, kNumSymbols);
+  void* p = LoadSymbol(kSymbols[i]);
+  if (!p) {
+    p = reinterpret_cast<void*>(&GetSymbolNotFoundError);
+  }
+  _cusolver_tramp_table[i] = p;
+}
+
+}  // extern "C"
diff --git a/third_party/xla/third_party/tsl/tsl/cuda/cusparse.symbols b/third_party/xla/third_party/tsl/tsl/cuda/cusparse.symbols
new file mode 100644
index 00000000000000..0b7ff187600793
--- /dev/null
+++ b/third_party/xla/third_party/tsl/tsl/cuda/cusparse.symbols
@@ -0,0 +1,421 @@
+cusparseAxpby
+cusparseBlockedEllGet
+cusparseBsrSetStridedBatch
+cusparseCbsr2csr
+cusparseCbsric02
+cusparseCbsric02_analysis
+cusparseCbsric02_bufferSize
+cusparseCbsrilu02
+cusparseCbsrilu02_analysis
+cusparseCbsrilu02_bufferSize
+cusparseCbsrilu02_numericBoost
+cusparseCbsrmm
+cusparseCbsrmv
+cusparseCbsrsm2_analysis
+cusparseCbsrsm2_bufferSize
+cusparseCbsrsm2_solve
+cusparseCbsrsv2_analysis
+cusparseCbsrsv2_bufferSize
+cusparseCbsrsv2_solve
+cusparseCbsrxmv
+cusparseCcsr2bsr
+cusparseCcsr2csr_compress
+cusparseCcsr2csru
+cusparseCcsr2gebsr
+cusparseCcsr2gebsr_bufferSize
+cusparseCcsrcolor
+cusparseCcsrgeam2
+cusparseCcsrgeam2_bufferSizeExt
+cusparseCcsric02
+cusparseCcsric02_analysis
+cusparseCcsric02_bufferSize
+cusparseCcsrilu02
+cusparseCcsrilu02_analysis
+cusparseCcsrilu02_bufferSize
+cusparseCcsrilu02_numericBoost
+cusparseCcsru2csr
+cusparseCcsru2csr_bufferSizeExt
+cusparseCgebsr2csr
+cusparseCgebsr2gebsc
+cusparseCgebsr2gebsc_bufferSize
+cusparseCgebsr2gebsr
+cusparseCgebsr2gebsr_bufferSize
+cusparseCgemvi
+cusparseCgemvi_bufferSize
+cusparseCgpsvInterleavedBatch
+cusparseCgpsvInterleavedBatch_bufferSizeExt
+cusparseCgtsv2
+cusparseCgtsv2StridedBatch
+cusparseCgtsv2StridedBatch_bufferSizeExt
+cusparseCgtsv2_bufferSizeExt
+cusparseCgtsv2_nopivot
+cusparseCgtsv2_nopivot_bufferSizeExt
+cusparseCgtsvInterleavedBatch
+cusparseCgtsvInterleavedBatch_bufferSizeExt
+cusparseCnnz
+cusparseCnnz_compress
+cusparseConstBlockedEllGet
+cusparseConstCooGet
+cusparseConstCscGet
+cusparseConstCsrGet
+cusparseConstDnMatGet
+cusparseConstDnMatGetValues
+cusparseConstDnVecGet
+cusparseConstDnVecGetValues
+cusparseConstSpMatGetValues
+cusparseConstSpVecGet
+cusparseConstSpVecGetValues
+cusparseCooGet
+cusparseCooSetPointers
+cusparseCooSetStridedBatch
+cusparseCreate
+cusparseCreateBlockedEll
+cusparseCreateBsr
+cusparseCreateBsric02Info
+cusparseCreateBsrilu02Info
+cusparseCreateBsrsm2Info
+cusparseCreateBsrsv2Info
+cusparseCreateColorInfo
+cusparseCreateConstBlockedEll
+cusparseCreateConstBsr
+cusparseCreateConstCoo
+cusparseCreateConstCsc
+cusparseCreateConstCsr
+cusparseCreateConstDnMat
+cusparseCreateConstDnVec
+cusparseCreateConstSlicedEll
+cusparseCreateConstSpVec
+cusparseCreateCoo
+cusparseCreateCsc
+cusparseCreateCsr
+cusparseCreateCsric02Info
+cusparseCreateCsrilu02Info
+cusparseCreateCsru2csrInfo
+cusparseCreateDnMat
+cusparseCreateDnVec
+cusparseCreateIdentityPermutation
+cusparseCreateMatDescr
+cusparseCreatePruneInfo
+cusparseCreateSlicedEll
+cusparseCreateSpVec
+cusparseCscGet
+cusparseCscSetPointers
+cusparseCsr2cscEx2
+cusparseCsr2cscEx2_bufferSize
+cusparseCsrGet
+cusparseCsrSetPointers
+cusparseCsrSetStridedBatch
+cusparseDbsr2csr
+cusparseDbsric02
+cusparseDbsric02_analysis
+cusparseDbsric02_bufferSize
+cusparseDbsrilu02
+cusparseDbsrilu02_analysis
+cusparseDbsrilu02_bufferSize
+cusparseDbsrilu02_numericBoost
+cusparseDbsrmm
+cusparseDbsrmv
+cusparseDbsrsm2_analysis
+cusparseDbsrsm2_bufferSize
+cusparseDbsrsm2_solve
+cusparseDbsrsv2_analysis
+cusparseDbsrsv2_bufferSize
+cusparseDbsrsv2_solve
+cusparseDbsrxmv
+cusparseDcsr2bsr
+cusparseDcsr2csr_compress
+cusparseDcsr2csru
+cusparseDcsr2gebsr
+cusparseDcsr2gebsr_bufferSize
+cusparseDcsrcolor
+cusparseDcsrgeam2
+cusparseDcsrgeam2_bufferSizeExt
+cusparseDcsric02
+cusparseDcsric02_analysis
+cusparseDcsric02_bufferSize
+cusparseDcsrilu02
+cusparseDcsrilu02_analysis
+cusparseDcsrilu02_bufferSize
+cusparseDcsrilu02_numericBoost
+cusparseDcsru2csr
+cusparseDcsru2csr_bufferSizeExt
+cusparseDenseToSparse_analysis
+cusparseDenseToSparse_bufferSize
+cusparseDenseToSparse_convert
+cusparseDestroy
+cusparseDestroyBsric02Info
+cusparseDestroyBsrilu02Info
+cusparseDestroyBsrsm2Info
+cusparseDestroyBsrsv2Info
+cusparseDestroyColorInfo
+cusparseDestroyCsric02Info
+cusparseDestroyCsrilu02Info
+cusparseDestroyCsru2csrInfo
+cusparseDestroyDnMat
+cusparseDestroyDnVec
+cusparseDestroyMatDescr
+cusparseDestroyPruneInfo
+cusparseDestroySpMat
+cusparseDestroySpVec
+cusparseDgebsr2csr
+cusparseDgebsr2gebsc
+cusparseDgebsr2gebsc_bufferSize
+cusparseDgebsr2gebsr
+cusparseDgebsr2gebsr_bufferSize
+cusparseDgemvi
+cusparseDgemvi_bufferSize
+cusparseDgpsvInterleavedBatch
+cusparseDgpsvInterleavedBatch_bufferSizeExt
+cusparseDgtsv2
+cusparseDgtsv2StridedBatch
+cusparseDgtsv2StridedBatch_bufferSizeExt
+cusparseDgtsv2_bufferSizeExt
+cusparseDgtsv2_nopivot
+cusparseDgtsv2_nopivot_bufferSizeExt
+cusparseDgtsvInterleavedBatch
+cusparseDgtsvInterleavedBatch_bufferSizeExt
+cusparseDnMatGet
+cusparseDnMatGetStridedBatch
+cusparseDnMatGetValues
+cusparseDnMatSetStridedBatch
+cusparseDnMatSetValues
+cusparseDnVecGet
+cusparseDnVecGetValues
+cusparseDnVecSetValues
+cusparseDnnz
+cusparseDnnz_compress
+cusparseDpruneCsr2csr
+cusparseDpruneCsr2csrByPercentage
+cusparseDpruneCsr2csrByPercentage_bufferSizeExt
+cusparseDpruneCsr2csrNnz
+cusparseDpruneCsr2csrNnzByPercentage
+cusparseDpruneCsr2csr_bufferSizeExt
+cusparseDpruneDense2csr
+cusparseDpruneDense2csrByPercentage
+cusparseDpruneDense2csrByPercentage_bufferSizeExt
+cusparseDpruneDense2csrNnz
+cusparseDpruneDense2csrNnzByPercentage
+cusparseDpruneDense2csr_bufferSizeExt
+cusparseGather
+cusparseGetErrorName
+cusparseGetErrorString
+cusparseGetMatDiagType
+cusparseGetMatFillMode
+cusparseGetMatIndexBase
+cusparseGetMatType
+cusparseGetPointerMode
+cusparseGetProperty
+cusparseGetStream
+cusparseGetVersion
+cusparseHpruneCsr2csr
+cusparseHpruneCsr2csrByPercentage
+cusparseHpruneCsr2csrByPercentage_bufferSizeExt
+cusparseHpruneCsr2csrNnz
+cusparseHpruneCsr2csrNnzByPercentage
+cusparseHpruneCsr2csr_bufferSizeExt
+cusparseHpruneDense2csr
+cusparseHpruneDense2csrByPercentage
+cusparseHpruneDense2csrByPercentage_bufferSizeExt
+cusparseHpruneDense2csrNnz
+cusparseHpruneDense2csrNnzByPercentage
+cusparseHpruneDense2csr_bufferSizeExt
+cusparseLoggerForceDisable
+cusparseLoggerOpenFile
+cusparseLoggerSetCallback
+cusparseLoggerSetFile
+cusparseLoggerSetLevel
+cusparseLoggerSetMask
+cusparseRot
+cusparseSDDMM
+cusparseSDDMM_bufferSize
+cusparseSDDMM_preprocess
+cusparseSbsr2csr
+cusparseSbsric02
+cusparseSbsric02_analysis
+cusparseSbsric02_bufferSize
+cusparseSbsrilu02
+cusparseSbsrilu02_analysis
+cusparseSbsrilu02_bufferSize
+cusparseSbsrilu02_numericBoost
+cusparseSbsrmm
+cusparseSbsrmv
+cusparseSbsrsm2_analysis
+cusparseSbsrsm2_bufferSize
+cusparseSbsrsm2_solve
+cusparseSbsrsv2_analysis
+cusparseSbsrsv2_bufferSize
+cusparseSbsrsv2_solve
+cusparseSbsrxmv
+cusparseScatter
+cusparseScsr2bsr
+cusparseScsr2csr_compress
+cusparseScsr2csru
+cusparseScsr2gebsr
+cusparseScsr2gebsr_bufferSize
+cusparseScsrcolor
+cusparseScsrgeam2
+cusparseScsrgeam2_bufferSizeExt
+cusparseScsric02
+cusparseScsric02_analysis
+cusparseScsric02_bufferSize
+cusparseScsrilu02
+cusparseScsrilu02_analysis
+cusparseScsrilu02_bufferSize
+cusparseScsrilu02_numericBoost
+cusparseScsru2csr
+cusparseScsru2csr_bufferSizeExt
+cusparseSetMatDiagType
+cusparseSetMatFillMode
+cusparseSetMatIndexBase
+cusparseSetMatType
+cusparseSetPointerMode
+cusparseSetStream
+cusparseSgebsr2csr
+cusparseSgebsr2gebsc
+cusparseSgebsr2gebsc_bufferSize
+cusparseSgebsr2gebsr
+cusparseSgebsr2gebsr_bufferSize
+cusparseSgemvi
+cusparseSgemvi_bufferSize
+cusparseSgpsvInterleavedBatch
+cusparseSgpsvInterleavedBatch_bufferSizeExt
+cusparseSgtsv2
+cusparseSgtsv2StridedBatch
+cusparseSgtsv2StridedBatch_bufferSizeExt
+cusparseSgtsv2_bufferSizeExt
+cusparseSgtsv2_nopivot
+cusparseSgtsv2_nopivot_bufferSizeExt
+cusparseSgtsvInterleavedBatch
+cusparseSgtsvInterleavedBatch_bufferSizeExt
+cusparseSnnz
+cusparseSnnz_compress
+cusparseSpGEMM_compute
+cusparseSpGEMM_copy
+cusparseSpGEMM_createDescr
+cusparseSpGEMM_destroyDescr
+cusparseSpGEMM_estimateMemory
+cusparseSpGEMM_getNumProducts
+cusparseSpGEMM_workEstimation
+cusparseSpGEMMreuse_compute
+cusparseSpGEMMreuse_copy
+cusparseSpGEMMreuse_nnz
+cusparseSpGEMMreuse_workEstimation
+cusparseSpMM
+cusparseSpMMOp
+cusparseSpMMOp_createPlan
+cusparseSpMMOp_destroyPlan
+cusparseSpMM_bufferSize
+cusparseSpMM_preprocess
+cusparseSpMV
+cusparseSpMV_bufferSize
+cusparseSpMatGetAttribute
+cusparseSpMatGetFormat
+cusparseSpMatGetIndexBase
+cusparseSpMatGetSize
+cusparseSpMatGetStridedBatch
+cusparseSpMatGetValues
+cusparseSpMatSetAttribute
+cusparseSpMatSetValues
+cusparseSpSM_analysis
+cusparseSpSM_bufferSize
+cusparseSpSM_createDescr
+cusparseSpSM_destroyDescr
+cusparseSpSM_solve
+cusparseSpSV_analysis
+cusparseSpSV_bufferSize
+cusparseSpSV_createDescr
+cusparseSpSV_destroyDescr
+cusparseSpSV_solve
+cusparseSpSV_updateMatrix
+cusparseSpVV
+cusparseSpVV_bufferSize
+cusparseSpVecGet
+cusparseSpVecGetIndexBase
+cusparseSpVecGetValues
+cusparseSpVecSetValues
+cusparseSparseToDense
+cusparseSparseToDense_bufferSize
+cusparseSpruneCsr2csr
+cusparseSpruneCsr2csrByPercentage
+cusparseSpruneCsr2csrByPercentage_bufferSizeExt
+cusparseSpruneCsr2csrNnz
+cusparseSpruneCsr2csrNnzByPercentage
+cusparseSpruneCsr2csr_bufferSizeExt
+cusparseSpruneDense2csr
+cusparseSpruneDense2csrByPercentage
+cusparseSpruneDense2csrByPercentage_bufferSizeExt
+cusparseSpruneDense2csrNnz
+cusparseSpruneDense2csrNnzByPercentage
+cusparseSpruneDense2csr_bufferSizeExt
+cusparseXbsric02_zeroPivot
+cusparseXbsrilu02_zeroPivot
+cusparseXbsrsm2_zeroPivot
+cusparseXbsrsv2_zeroPivot
+cusparseXcoo2csr
+cusparseXcoosortByColumn
+cusparseXcoosortByRow
+cusparseXcoosort_bufferSizeExt
+cusparseXcscsort
+cusparseXcscsort_bufferSizeExt
+cusparseXcsr2bsrNnz
+cusparseXcsr2coo
+cusparseXcsr2gebsrNnz
+cusparseXcsrgeam2Nnz
+cusparseXcsric02_zeroPivot
+cusparseXcsrilu02_zeroPivot
+cusparseXcsrsort
+cusparseXcsrsort_bufferSizeExt
+cusparseXgebsr2gebsrNnz
+cusparseZbsr2csr
+cusparseZbsric02
+cusparseZbsric02_analysis
+cusparseZbsric02_bufferSize
+cusparseZbsrilu02
+cusparseZbsrilu02_analysis
+cusparseZbsrilu02_bufferSize
+cusparseZbsrilu02_numericBoost
+cusparseZbsrmm
+cusparseZbsrmv
+cusparseZbsrsm2_analysis
+cusparseZbsrsm2_bufferSize
+cusparseZbsrsm2_solve
+cusparseZbsrsv2_analysis
+cusparseZbsrsv2_bufferSize
+cusparseZbsrsv2_solve
+cusparseZbsrxmv
+cusparseZcsr2bsr
+cusparseZcsr2csr_compress
+cusparseZcsr2csru
+cusparseZcsr2gebsr
+cusparseZcsr2gebsr_bufferSize
+cusparseZcsrcolor
+cusparseZcsrgeam2
+cusparseZcsrgeam2_bufferSizeExt
+cusparseZcsric02
+cusparseZcsric02_analysis
+cusparseZcsric02_bufferSize
+cusparseZcsrilu02
+cusparseZcsrilu02_analysis
+cusparseZcsrilu02_bufferSize
+cusparseZcsrilu02_numericBoost
+cusparseZcsru2csr
+cusparseZcsru2csr_bufferSizeExt
+cusparseZgebsr2csr
+cusparseZgebsr2gebsc
+cusparseZgebsr2gebsc_bufferSize
+cusparseZgebsr2gebsr
+cusparseZgebsr2gebsr_bufferSize
+cusparseZgemvi
+cusparseZgemvi_bufferSize
+cusparseZgpsvInterleavedBatch
+cusparseZgpsvInterleavedBatch_bufferSizeExt
+cusparseZgtsv2
+cusparseZgtsv2StridedBatch
+cusparseZgtsv2StridedBatch_bufferSizeExt
+cusparseZgtsv2_bufferSizeExt
+cusparseZgtsv2_nopivot
+cusparseZgtsv2_nopivot_bufferSizeExt
+cusparseZgtsvInterleavedBatch
+cusparseZgtsvInterleavedBatch_bufferSizeExt
+cusparseZnnz
+cusparseZnnz_compress
diff --git a/third_party/xla/third_party/tsl/tsl/cuda/cusparse_10_0.inc b/third_party/xla/third_party/tsl/tsl/cuda/cusparse_10_0.inc
deleted file mode 100644
index 71851e05d36cc0..00000000000000
--- a/third_party/xla/third_party/tsl/tsl/cuda/cusparse_10_0.inc
+++ /dev/null
@@ -1,7832 +0,0 @@
-// Auto-generated, do not edit.
-
-extern "C" {
-
-cusparseStatus_t CUSPARSEAPI cusparseCreate(cusparseHandle_t *handle) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(cusparseHandle_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCreate");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDestroy(cusparseHandle_t handle) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(cusparseHandle_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDestroy");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseGetVersion(cusparseHandle_t handle,
-                                                int *version) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(cusparseHandle_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseGetVersion");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, version);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseGetProperty(libraryPropertyType type,
-                                                 int *value) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(libraryPropertyType, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseGetProperty");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(type, value);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSetStream(cusparseHandle_t handle,
-                                               cudaStream_t streamId) {
-  using FuncPtr =
-      cusparseStatus_t(CUSPARSEAPI *)(cusparseHandle_t, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSetStream");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, streamId);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseGetStream(cusparseHandle_t handle,
-                                               cudaStream_t *streamId) {
-  using FuncPtr =
-      cusparseStatus_t(CUSPARSEAPI *)(cusparseHandle_t, cudaStream_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseGetStream");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, streamId);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseGetPointerMode(cusparseHandle_t handle, cusparsePointerMode_t *mode) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(cusparseHandle_t,
-                                                  cusparsePointerMode_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseGetPointerMode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, mode);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseSetPointerMode(cusparseHandle_t handle, cusparsePointerMode_t mode) {
-  using FuncPtr =
-      cusparseStatus_t(CUSPARSEAPI *)(cusparseHandle_t, cusparsePointerMode_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSetPointerMode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, mode);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseCreateMatDescr(cusparseMatDescr_t *descrA) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(cusparseMatDescr_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCreateMatDescr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(descrA);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseDestroyMatDescr(cusparseMatDescr_t descrA) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(cusparseMatDescr_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDestroyMatDescr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(descrA);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseCopyMatDescr(cusparseMatDescr_t dest, const cusparseMatDescr_t src) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(cusparseMatDescr_t,
-                                                  const cusparseMatDescr_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCopyMatDescr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dest, src);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSetMatType(cusparseMatDescr_t descrA,
-                                                cusparseMatrixType_t type) {
-  using FuncPtr =
-      cusparseStatus_t(CUSPARSEAPI *)(cusparseMatDescr_t, cusparseMatrixType_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSetMatType");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(descrA, type);
-}
-
-cusparseMatrixType_t CUSPARSEAPI
-cusparseGetMatType(const cusparseMatDescr_t descrA) {
-  using FuncPtr = cusparseMatrixType_t(CUSPARSEAPI *)(const cusparseMatDescr_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseGetMatType");
-  if (!func_ptr) return cusparseMatrixType_t(-1);
-  return func_ptr(descrA);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseSetMatFillMode(cusparseMatDescr_t descrA, cusparseFillMode_t fillMode) {
-  using FuncPtr =
-      cusparseStatus_t(CUSPARSEAPI *)(cusparseMatDescr_t, cusparseFillMode_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSetMatFillMode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(descrA, fillMode);
-}
-
-cusparseFillMode_t CUSPARSEAPI
-cusparseGetMatFillMode(const cusparseMatDescr_t descrA) {
-  using FuncPtr = cusparseFillMode_t(CUSPARSEAPI *)(const cusparseMatDescr_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseGetMatFillMode");
-  if (!func_ptr) return cusparseFillMode_t(-1);
-  return func_ptr(descrA);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseSetMatDiagType(cusparseMatDescr_t descrA, cusparseDiagType_t diagType) {
-  using FuncPtr =
-      cusparseStatus_t(CUSPARSEAPI *)(cusparseMatDescr_t, cusparseDiagType_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSetMatDiagType");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(descrA, diagType);
-}
-
-cusparseDiagType_t CUSPARSEAPI
-cusparseGetMatDiagType(const cusparseMatDescr_t descrA) {
-  using FuncPtr = cusparseDiagType_t(CUSPARSEAPI *)(const cusparseMatDescr_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseGetMatDiagType");
-  if (!func_ptr) return cusparseDiagType_t(-1);
-  return func_ptr(descrA);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSetMatIndexBase(cusparseMatDescr_t descrA,
-                                                     cusparseIndexBase_t base) {
-  using FuncPtr =
-      cusparseStatus_t(CUSPARSEAPI *)(cusparseMatDescr_t, cusparseIndexBase_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSetMatIndexBase");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(descrA, base);
-}
-
-cusparseIndexBase_t CUSPARSEAPI
-cusparseGetMatIndexBase(const cusparseMatDescr_t descrA) {
-  using FuncPtr = cusparseIndexBase_t(CUSPARSEAPI *)(const cusparseMatDescr_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseGetMatIndexBase");
-  if (!func_ptr) return cusparseIndexBase_t(-1);
-  return func_ptr(descrA);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseCreateSolveAnalysisInfo(cusparseSolveAnalysisInfo_t *info) {
-  using FuncPtr =
-      cusparseStatus_t(CUSPARSEAPI *)(cusparseSolveAnalysisInfo_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCreateSolveAnalysisInfo");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(info);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseDestroySolveAnalysisInfo(cusparseSolveAnalysisInfo_t info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(cusparseSolveAnalysisInfo_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseDestroySolveAnalysisInfo");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(info);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseGetLevelInfo(cusparseHandle_t handle, cusparseSolveAnalysisInfo_t info,
-                     int *nlevels, int **levelPtr, int **levelInd) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseSolveAnalysisInfo_t, int *, int **, int **);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseGetLevelInfo");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, info, nlevels, levelPtr, levelInd);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCreateCsrsv2Info(csrsv2Info_t *info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(csrsv2Info_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCreateCsrsv2Info");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(info);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDestroyCsrsv2Info(csrsv2Info_t info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(csrsv2Info_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDestroyCsrsv2Info");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(info);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCreateCsric02Info(csric02Info_t *info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(csric02Info_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCreateCsric02Info");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(info);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDestroyCsric02Info(csric02Info_t info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(csric02Info_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDestroyCsric02Info");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(info);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCreateBsric02Info(bsric02Info_t *info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(bsric02Info_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCreateBsric02Info");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(info);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDestroyBsric02Info(bsric02Info_t info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(bsric02Info_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDestroyBsric02Info");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(info);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCreateCsrilu02Info(csrilu02Info_t *info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(csrilu02Info_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCreateCsrilu02Info");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(info);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDestroyCsrilu02Info(csrilu02Info_t info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(csrilu02Info_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDestroyCsrilu02Info");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(info);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCreateBsrilu02Info(bsrilu02Info_t *info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(bsrilu02Info_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCreateBsrilu02Info");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(info);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDestroyBsrilu02Info(bsrilu02Info_t info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(bsrilu02Info_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDestroyBsrilu02Info");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(info);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCreateBsrsv2Info(bsrsv2Info_t *info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(bsrsv2Info_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCreateBsrsv2Info");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(info);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDestroyBsrsv2Info(bsrsv2Info_t info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(bsrsv2Info_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDestroyBsrsv2Info");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(info);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCreateBsrsm2Info(bsrsm2Info_t *info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(bsrsm2Info_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCreateBsrsm2Info");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(info);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDestroyBsrsm2Info(bsrsm2Info_t info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(bsrsm2Info_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDestroyBsrsm2Info");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(info);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCreateHybMat(cusparseHybMat_t *hybA) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(cusparseHybMat_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCreateHybMat");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hybA);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDestroyHybMat(cusparseHybMat_t hybA) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(cusparseHybMat_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDestroyHybMat");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hybA);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCreateCsru2csrInfo(csru2csrInfo_t *info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(csru2csrInfo_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCreateCsru2csrInfo");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(info);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDestroyCsru2csrInfo(csru2csrInfo_t info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(csru2csrInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDestroyCsru2csrInfo");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(info);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseCreateColorInfo(cusparseColorInfo_t *info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(cusparseColorInfo_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCreateColorInfo");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(info);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseDestroyColorInfo(cusparseColorInfo_t info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(cusparseColorInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDestroyColorInfo");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(info);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSetColorAlgs(cusparseColorInfo_t info,
-                                                  cusparseColorAlg_t alg) {
-  using FuncPtr =
-      cusparseStatus_t(CUSPARSEAPI *)(cusparseColorInfo_t, cusparseColorAlg_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSetColorAlgs");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(info, alg);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseGetColorAlgs(cusparseColorInfo_t info,
-                                                  cusparseColorAlg_t *alg) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(cusparseColorInfo_t,
-                                                  cusparseColorAlg_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseGetColorAlgs");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(info, alg);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCreatePruneInfo(pruneInfo_t *info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(pruneInfo_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCreatePruneInfo");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(info);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDestroyPruneInfo(pruneInfo_t info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(pruneInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDestroyPruneInfo");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(info);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSaxpyi(cusparseHandle_t handle, int nnz,
-                                            const float *alpha,
-                                            const float *xVal, const int *xInd,
-                                            float *y,
-                                            cusparseIndexBase_t idxBase) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, const float *, const float *, const int *, float *,
-      cusparseIndexBase_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSaxpyi");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, nnz, alpha, xVal, xInd, y, idxBase);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDaxpyi(cusparseHandle_t handle, int nnz,
-                                            const double *alpha,
-                                            const double *xVal, const int *xInd,
-                                            double *y,
-                                            cusparseIndexBase_t idxBase) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, const double *, const double *, const int *,
-      double *, cusparseIndexBase_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDaxpyi");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, nnz, alpha, xVal, xInd, y, idxBase);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCaxpyi(cusparseHandle_t handle, int nnz,
-                                            const cuComplex *alpha,
-                                            const cuComplex *xVal,
-                                            const int *xInd, cuComplex *y,
-                                            cusparseIndexBase_t idxBase) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, const cuComplex *, const cuComplex *, const int *,
-      cuComplex *, cusparseIndexBase_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCaxpyi");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, nnz, alpha, xVal, xInd, y, idxBase);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZaxpyi(cusparseHandle_t handle, int nnz,
-                                            const cuDoubleComplex *alpha,
-                                            const cuDoubleComplex *xVal,
-                                            const int *xInd, cuDoubleComplex *y,
-                                            cusparseIndexBase_t idxBase) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, const cuDoubleComplex *, const cuDoubleComplex *,
-      const int *, cuDoubleComplex *, cusparseIndexBase_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZaxpyi");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, nnz, alpha, xVal, xInd, y, idxBase);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSdoti(cusparseHandle_t handle, int nnz,
-                                           const float *xVal, const int *xInd,
-                                           const float *y,
-                                           float *resultDevHostPtr,
-                                           cusparseIndexBase_t idxBase) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, const float *, const int *, const float *, float *,
-      cusparseIndexBase_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSdoti");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, nnz, xVal, xInd, y, resultDevHostPtr, idxBase);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDdoti(cusparseHandle_t handle, int nnz,
-                                           const double *xVal, const int *xInd,
-                                           const double *y,
-                                           double *resultDevHostPtr,
-                                           cusparseIndexBase_t idxBase) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, const double *, const int *, const double *,
-      double *, cusparseIndexBase_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDdoti");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, nnz, xVal, xInd, y, resultDevHostPtr, idxBase);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCdoti(cusparseHandle_t handle, int nnz,
-                                           const cuComplex *xVal,
-                                           const int *xInd, const cuComplex *y,
-                                           cuComplex *resultDevHostPtr,
-                                           cusparseIndexBase_t idxBase) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, const cuComplex *, const int *, const cuComplex *,
-      cuComplex *, cusparseIndexBase_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCdoti");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, nnz, xVal, xInd, y, resultDevHostPtr, idxBase);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZdoti(cusparseHandle_t handle, int nnz,
-                                           const cuDoubleComplex *xVal,
-                                           const int *xInd,
-                                           const cuDoubleComplex *y,
-                                           cuDoubleComplex *resultDevHostPtr,
-                                           cusparseIndexBase_t idxBase) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, const cuDoubleComplex *, const int *,
-      const cuDoubleComplex *, cuDoubleComplex *, cusparseIndexBase_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZdoti");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, nnz, xVal, xInd, y, resultDevHostPtr, idxBase);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCdotci(cusparseHandle_t handle, int nnz,
-                                            const cuComplex *xVal,
-                                            const int *xInd, const cuComplex *y,
-                                            cuComplex *resultDevHostPtr,
-                                            cusparseIndexBase_t idxBase) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, const cuComplex *, const int *, const cuComplex *,
-      cuComplex *, cusparseIndexBase_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCdotci");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, nnz, xVal, xInd, y, resultDevHostPtr, idxBase);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZdotci(cusparseHandle_t handle, int nnz,
-                                            const cuDoubleComplex *xVal,
-                                            const int *xInd,
-                                            const cuDoubleComplex *y,
-                                            cuDoubleComplex *resultDevHostPtr,
-                                            cusparseIndexBase_t idxBase) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, const cuDoubleComplex *, const int *,
-      const cuDoubleComplex *, cuDoubleComplex *, cusparseIndexBase_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZdotci");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, nnz, xVal, xInd, y, resultDevHostPtr, idxBase);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSgthr(cusparseHandle_t handle, int nnz,
-                                           const float *y, float *xVal,
-                                           const int *xInd,
-                                           cusparseIndexBase_t idxBase) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, const float *, float *, const int *,
-      cusparseIndexBase_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSgthr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, nnz, y, xVal, xInd, idxBase);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDgthr(cusparseHandle_t handle, int nnz,
-                                           const double *y, double *xVal,
-                                           const int *xInd,
-                                           cusparseIndexBase_t idxBase) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, const double *, double *, const int *,
-      cusparseIndexBase_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDgthr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, nnz, y, xVal, xInd, idxBase);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCgthr(cusparseHandle_t handle, int nnz,
-                                           const cuComplex *y, cuComplex *xVal,
-                                           const int *xInd,
-                                           cusparseIndexBase_t idxBase) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, const cuComplex *, cuComplex *, const int *,
-      cusparseIndexBase_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCgthr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, nnz, y, xVal, xInd, idxBase);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZgthr(cusparseHandle_t handle, int nnz,
-                                           const cuDoubleComplex *y,
-                                           cuDoubleComplex *xVal,
-                                           const int *xInd,
-                                           cusparseIndexBase_t idxBase) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, const cuDoubleComplex *, cuDoubleComplex *,
-      const int *, cusparseIndexBase_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZgthr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, nnz, y, xVal, xInd, idxBase);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSgthrz(cusparseHandle_t handle, int nnz,
-                                            float *y, float *xVal,
-                                            const int *xInd,
-                                            cusparseIndexBase_t idxBase) {
-  using FuncPtr =
-      cusparseStatus_t(CUSPARSEAPI *)(cusparseHandle_t, int, float *, float *,
-                                      const int *, cusparseIndexBase_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSgthrz");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, nnz, y, xVal, xInd, idxBase);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDgthrz(cusparseHandle_t handle, int nnz,
-                                            double *y, double *xVal,
-                                            const int *xInd,
-                                            cusparseIndexBase_t idxBase) {
-  using FuncPtr =
-      cusparseStatus_t(CUSPARSEAPI *)(cusparseHandle_t, int, double *, double *,
-                                      const int *, cusparseIndexBase_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDgthrz");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, nnz, y, xVal, xInd, idxBase);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCgthrz(cusparseHandle_t handle, int nnz,
-                                            cuComplex *y, cuComplex *xVal,
-                                            const int *xInd,
-                                            cusparseIndexBase_t idxBase) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, cuComplex *, cuComplex *, const int *,
-      cusparseIndexBase_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCgthrz");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, nnz, y, xVal, xInd, idxBase);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZgthrz(cusparseHandle_t handle, int nnz,
-                                            cuDoubleComplex *y,
-                                            cuDoubleComplex *xVal,
-                                            const int *xInd,
-                                            cusparseIndexBase_t idxBase) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, cuDoubleComplex *, cuDoubleComplex *, const int *,
-      cusparseIndexBase_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZgthrz");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, nnz, y, xVal, xInd, idxBase);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSsctr(cusparseHandle_t handle, int nnz,
-                                           const float *xVal, const int *xInd,
-                                           float *y,
-                                           cusparseIndexBase_t idxBase) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(cusparseHandle_t, int,
-                                                  const float *, const int *,
-                                                  float *, cusparseIndexBase_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSsctr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, nnz, xVal, xInd, y, idxBase);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDsctr(cusparseHandle_t handle, int nnz,
-                                           const double *xVal, const int *xInd,
-                                           double *y,
-                                           cusparseIndexBase_t idxBase) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, const double *, const int *, double *,
-      cusparseIndexBase_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDsctr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, nnz, xVal, xInd, y, idxBase);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCsctr(cusparseHandle_t handle, int nnz,
-                                           const cuComplex *xVal,
-                                           const int *xInd, cuComplex *y,
-                                           cusparseIndexBase_t idxBase) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, const cuComplex *, const int *, cuComplex *,
-      cusparseIndexBase_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCsctr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, nnz, xVal, xInd, y, idxBase);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZsctr(cusparseHandle_t handle, int nnz,
-                                           const cuDoubleComplex *xVal,
-                                           const int *xInd, cuDoubleComplex *y,
-                                           cusparseIndexBase_t idxBase) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, const cuDoubleComplex *, const int *,
-      cuDoubleComplex *, cusparseIndexBase_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZsctr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, nnz, xVal, xInd, y, idxBase);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSroti(cusparseHandle_t handle, int nnz,
-                                           float *xVal, const int *xInd,
-                                           float *y, const float *c,
-                                           const float *s,
-                                           cusparseIndexBase_t idxBase) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, float *, const int *, float *, const float *,
-      const float *, cusparseIndexBase_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSroti");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, nnz, xVal, xInd, y, c, s, idxBase);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDroti(cusparseHandle_t handle, int nnz,
-                                           double *xVal, const int *xInd,
-                                           double *y, const double *c,
-                                           const double *s,
-                                           cusparseIndexBase_t idxBase) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, double *, const int *, double *, const double *,
-      const double *, cusparseIndexBase_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDroti");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, nnz, xVal, xInd, y, c, s, idxBase);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseSgemvi(cusparseHandle_t handle, cusparseOperation_t transA, int m,
-               int n, const float *alpha, /* host or device pointer */
-               const float *A, int lda, int nnz, const float *xVal,
-               const int *xInd, const float *beta, /* host or device pointer */
-               float *y, cusparseIndexBase_t idxBase, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, const float *,
-      const float *, int, int, const float *, const int *, const float *,
-      float *, cusparseIndexBase_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSgemvi");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, n, alpha, A, lda, nnz, xVal, xInd, beta, y,
-                  idxBase, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseSgemvi_bufferSize(cusparseHandle_t handle, cusparseOperation_t transA,
-                          int m, int n, int nnz, int *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSgemvi_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, n, nnz, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseDgemvi(cusparseHandle_t handle, cusparseOperation_t transA, int m,
-               int n, const double *alpha, /* host or device pointer */
-               const double *A, int lda, int nnz, const double *xVal,
-               const int *xInd, const double *beta, /* host or device pointer */
-               double *y, cusparseIndexBase_t idxBase, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, const double *,
-      const double *, int, int, const double *, const int *, const double *,
-      double *, cusparseIndexBase_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDgemvi");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, n, alpha, A, lda, nnz, xVal, xInd, beta, y,
-                  idxBase, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseDgemvi_bufferSize(cusparseHandle_t handle, cusparseOperation_t transA,
-                          int m, int n, int nnz, int *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDgemvi_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, n, nnz, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCgemvi(
-    cusparseHandle_t handle, cusparseOperation_t transA, int m, int n,
-    const cuComplex *alpha, /* host or device pointer */
-    const cuComplex *A, int lda, int nnz, const cuComplex *xVal,
-    const int *xInd, const cuComplex *beta, /* host or device pointer */
-    cuComplex *y, cusparseIndexBase_t idxBase, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, const cuComplex *,
-      const cuComplex *, int, int, const cuComplex *, const int *,
-      const cuComplex *, cuComplex *, cusparseIndexBase_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCgemvi");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, n, alpha, A, lda, nnz, xVal, xInd, beta, y,
-                  idxBase, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseCgemvi_bufferSize(cusparseHandle_t handle, cusparseOperation_t transA,
-                          int m, int n, int nnz, int *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCgemvi_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, n, nnz, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZgemvi(
-    cusparseHandle_t handle, cusparseOperation_t transA, int m, int n,
-    const cuDoubleComplex *alpha, /* host or device pointer */
-    const cuDoubleComplex *A, int lda, int nnz, const cuDoubleComplex *xVal,
-    const int *xInd, const cuDoubleComplex *beta, /* host or device pointer */
-    cuDoubleComplex *y, cusparseIndexBase_t idxBase, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, const cuDoubleComplex *,
-      const cuDoubleComplex *, int, int, const cuDoubleComplex *, const int *,
-      const cuDoubleComplex *, cuDoubleComplex *, cusparseIndexBase_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZgemvi");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, n, alpha, A, lda, nnz, xVal, xInd, beta, y,
-                  idxBase, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseZgemvi_bufferSize(cusparseHandle_t handle, cusparseOperation_t transA,
-                          int m, int n, int nnz, int *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZgemvi_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, n, nnz, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsrmv(
-    cusparseHandle_t handle, cusparseOperation_t transA, int m, int n, int nnz,
-    const float *alpha, const cusparseMatDescr_t descrA,
-    const float *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, const float *x, const float *beta, float *y) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, int, const float *,
-      const cusparseMatDescr_t, const float *, const int *, const int *,
-      const float *, const float *, float *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsrmv");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, n, nnz, alpha, descrA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, x, beta, y);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseDcsrmv(cusparseHandle_t handle, cusparseOperation_t transA, int m,
-               int n, int nnz, const double *alpha,
-               const cusparseMatDescr_t descrA, const double *csrSortedValA,
-               const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-               const double *x, const double *beta, double *y) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, int, const double *,
-      const cusparseMatDescr_t, const double *, const int *, const int *,
-      const double *, const double *, double *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsrmv");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, n, nnz, alpha, descrA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, x, beta, y);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseCcsrmv(cusparseHandle_t handle, cusparseOperation_t transA, int m,
-               int n, int nnz, const cuComplex *alpha,
-               const cusparseMatDescr_t descrA, const cuComplex *csrSortedValA,
-               const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-               const cuComplex *x, const cuComplex *beta, cuComplex *y) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, int, const cuComplex *,
-      const cusparseMatDescr_t, const cuComplex *, const int *, const int *,
-      const cuComplex *, const cuComplex *, cuComplex *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsrmv");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, n, nnz, alpha, descrA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, x, beta, y);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsrmv(
-    cusparseHandle_t handle, cusparseOperation_t transA, int m, int n, int nnz,
-    const cuDoubleComplex *alpha, const cusparseMatDescr_t descrA,
-    const cuDoubleComplex *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, const cuDoubleComplex *x,
-    const cuDoubleComplex *beta, cuDoubleComplex *y) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, int,
-      const cuDoubleComplex *, const cusparseMatDescr_t,
-      const cuDoubleComplex *, const int *, const int *,
-      const cuDoubleComplex *, const cuDoubleComplex *, cuDoubleComplex *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsrmv");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, n, nnz, alpha, descrA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, x, beta, y);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCsrmvEx_bufferSize(
-    cusparseHandle_t handle, cusparseAlgMode_t alg, cusparseOperation_t transA,
-    int m, int n, int nnz, const void *alpha, cudaDataType alphatype,
-    const cusparseMatDescr_t descrA, const void *csrValA,
-    cudaDataType csrValAtype, const int *csrRowPtrA, const int *csrColIndA,
-    const void *x, cudaDataType xtype, const void *beta, cudaDataType betatype,
-    void *y, cudaDataType ytype, cudaDataType executiontype,
-    size_t *bufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseAlgMode_t, cusparseOperation_t, int, int, int,
-      const void *, cudaDataType, const cusparseMatDescr_t, const void *,
-      cudaDataType, const int *, const int *, const void *, cudaDataType,
-      const void *, cudaDataType, void *, cudaDataType, cudaDataType, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCsrmvEx_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, alg, transA, m, n, nnz, alpha, alphatype, descrA,
-                  csrValA, csrValAtype, csrRowPtrA, csrColIndA, x, xtype, beta,
-                  betatype, y, ytype, executiontype, bufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCsrmvEx(
-    cusparseHandle_t handle, cusparseAlgMode_t alg, cusparseOperation_t transA,
-    int m, int n, int nnz, const void *alpha, cudaDataType alphatype,
-    const cusparseMatDescr_t descrA, const void *csrValA,
-    cudaDataType csrValAtype, const int *csrRowPtrA, const int *csrColIndA,
-    const void *x, cudaDataType xtype, const void *beta, cudaDataType betatype,
-    void *y, cudaDataType ytype, cudaDataType executiontype, void *buffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseAlgMode_t, cusparseOperation_t, int, int, int,
-      const void *, cudaDataType, const cusparseMatDescr_t, const void *,
-      cudaDataType, const int *, const int *, const void *, cudaDataType,
-      const void *, cudaDataType, void *, cudaDataType, cudaDataType, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCsrmvEx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, alg, transA, m, n, nnz, alpha, alphatype, descrA,
-                  csrValA, csrValAtype, csrRowPtrA, csrColIndA, x, xtype, beta,
-                  betatype, y, ytype, executiontype, buffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsrmv_mp(
-    cusparseHandle_t handle, cusparseOperation_t transA, int m, int n, int nnz,
-    const float *alpha, const cusparseMatDescr_t descrA,
-    const float *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, const float *x, const float *beta, float *y) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, int, const float *,
-      const cusparseMatDescr_t, const float *, const int *, const int *,
-      const float *, const float *, float *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsrmv_mp");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, n, nnz, alpha, descrA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, x, beta, y);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseDcsrmv_mp(cusparseHandle_t handle, cusparseOperation_t transA, int m,
-                  int n, int nnz, const double *alpha,
-                  const cusparseMatDescr_t descrA, const double *csrSortedValA,
-                  const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-                  const double *x, const double *beta, double *y) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, int, const double *,
-      const cusparseMatDescr_t, const double *, const int *, const int *,
-      const double *, const double *, double *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsrmv_mp");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, n, nnz, alpha, descrA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, x, beta, y);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsrmv_mp(
-    cusparseHandle_t handle, cusparseOperation_t transA, int m, int n, int nnz,
-    const cuComplex *alpha, const cusparseMatDescr_t descrA,
-    const cuComplex *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, const cuComplex *x, const cuComplex *beta,
-    cuComplex *y) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, int, const cuComplex *,
-      const cusparseMatDescr_t, const cuComplex *, const int *, const int *,
-      const cuComplex *, const cuComplex *, cuComplex *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsrmv_mp");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, n, nnz, alpha, descrA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, x, beta, y);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsrmv_mp(
-    cusparseHandle_t handle, cusparseOperation_t transA, int m, int n, int nnz,
-    const cuDoubleComplex *alpha, const cusparseMatDescr_t descrA,
-    const cuDoubleComplex *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, const cuDoubleComplex *x,
-    const cuDoubleComplex *beta, cuDoubleComplex *y) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, int,
-      const cuDoubleComplex *, const cusparseMatDescr_t,
-      const cuDoubleComplex *, const int *, const int *,
-      const cuDoubleComplex *, const cuDoubleComplex *, cuDoubleComplex *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsrmv_mp");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, n, nnz, alpha, descrA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, x, beta, y);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseShybmv(
-    cusparseHandle_t handle, cusparseOperation_t transA, const float *alpha,
-    const cusparseMatDescr_t descrA, const cusparseHybMat_t hybA,
-    const float *x, const float *beta, float *y) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, const float *,
-      const cusparseMatDescr_t, const cusparseHybMat_t, const float *,
-      const float *, float *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseShybmv");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, alpha, descrA, hybA, x, beta, y);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDhybmv(
-    cusparseHandle_t handle, cusparseOperation_t transA, const double *alpha,
-    const cusparseMatDescr_t descrA, const cusparseHybMat_t hybA,
-    const double *x, const double *beta, double *y) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, const double *,
-      const cusparseMatDescr_t, const cusparseHybMat_t, const double *,
-      const double *, double *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDhybmv");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, alpha, descrA, hybA, x, beta, y);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseChybmv(
-    cusparseHandle_t handle, cusparseOperation_t transA, const cuComplex *alpha,
-    const cusparseMatDescr_t descrA, const cusparseHybMat_t hybA,
-    const cuComplex *x, const cuComplex *beta, cuComplex *y) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, const cuComplex *,
-      const cusparseMatDescr_t, const cusparseHybMat_t, const cuComplex *,
-      const cuComplex *, cuComplex *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseChybmv");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, alpha, descrA, hybA, x, beta, y);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseZhybmv(cusparseHandle_t handle, cusparseOperation_t transA,
-               const cuDoubleComplex *alpha, const cusparseMatDescr_t descrA,
-               const cusparseHybMat_t hybA, const cuDoubleComplex *x,
-               const cuDoubleComplex *beta, cuDoubleComplex *y) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, const cuDoubleComplex *,
-      const cusparseMatDescr_t, const cusparseHybMat_t, const cuDoubleComplex *,
-      const cuDoubleComplex *, cuDoubleComplex *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZhybmv");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, alpha, descrA, hybA, x, beta, y);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSbsrmv(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, int mb, int nb, int nnzb, const float *alpha,
-    const cusparseMatDescr_t descrA, const float *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int blockDim,
-    const float *x, const float *beta, float *y) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, int, int, int,
-      const float *, const cusparseMatDescr_t, const float *, const int *,
-      const int *, int, const float *, const float *, float *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSbsrmv");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, mb, nb, nnzb, alpha, descrA,
-                  bsrSortedValA, bsrSortedRowPtrA, bsrSortedColIndA, blockDim,
-                  x, beta, y);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDbsrmv(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, int mb, int nb, int nnzb, const double *alpha,
-    const cusparseMatDescr_t descrA, const double *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int blockDim,
-    const double *x, const double *beta, double *y) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, int, int, int,
-      const double *, const cusparseMatDescr_t, const double *, const int *,
-      const int *, int, const double *, const double *, double *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDbsrmv");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, mb, nb, nnzb, alpha, descrA,
-                  bsrSortedValA, bsrSortedRowPtrA, bsrSortedColIndA, blockDim,
-                  x, beta, y);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseCbsrmv(cusparseHandle_t handle, cusparseDirection_t dirA,
-               cusparseOperation_t transA, int mb, int nb, int nnzb,
-               const cuComplex *alpha, const cusparseMatDescr_t descrA,
-               const cuComplex *bsrSortedValA, const int *bsrSortedRowPtrA,
-               const int *bsrSortedColIndA, int blockDim, const cuComplex *x,
-               const cuComplex *beta, cuComplex *y) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, int, int, int,
-      const cuComplex *, const cusparseMatDescr_t, const cuComplex *,
-      const int *, const int *, int, const cuComplex *, const cuComplex *,
-      cuComplex *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCbsrmv");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, mb, nb, nnzb, alpha, descrA,
-                  bsrSortedValA, bsrSortedRowPtrA, bsrSortedColIndA, blockDim,
-                  x, beta, y);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZbsrmv(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, int mb, int nb, int nnzb,
-    const cuDoubleComplex *alpha, const cusparseMatDescr_t descrA,
-    const cuDoubleComplex *bsrSortedValA, const int *bsrSortedRowPtrA,
-    const int *bsrSortedColIndA, int blockDim, const cuDoubleComplex *x,
-    const cuDoubleComplex *beta, cuDoubleComplex *y) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, int, int, int,
-      const cuDoubleComplex *, const cusparseMatDescr_t,
-      const cuDoubleComplex *, const int *, const int *, int,
-      const cuDoubleComplex *, const cuDoubleComplex *, cuDoubleComplex *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZbsrmv");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, mb, nb, nnzb, alpha, descrA,
-                  bsrSortedValA, bsrSortedRowPtrA, bsrSortedColIndA, blockDim,
-                  x, beta, y);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseSbsrxmv(cusparseHandle_t handle, cusparseDirection_t dirA,
-                cusparseOperation_t transA, int sizeOfMask, int mb, int nb,
-                int nnzb, const float *alpha, const cusparseMatDescr_t descrA,
-                const float *bsrSortedValA, const int *bsrSortedMaskPtrA,
-                const int *bsrSortedRowPtrA, const int *bsrSortedEndPtrA,
-                const int *bsrSortedColIndA, int blockDim, const float *x,
-                const float *beta, float *y) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, int, int, int,
-      int, const float *, const cusparseMatDescr_t, const float *, const int *,
-      const int *, const int *, const int *, int, const float *, const float *,
-      float *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSbsrxmv");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, sizeOfMask, mb, nb, nnzb, alpha, descrA,
-                  bsrSortedValA, bsrSortedMaskPtrA, bsrSortedRowPtrA,
-                  bsrSortedEndPtrA, bsrSortedColIndA, blockDim, x, beta, y);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseDbsrxmv(cusparseHandle_t handle, cusparseDirection_t dirA,
-                cusparseOperation_t transA, int sizeOfMask, int mb, int nb,
-                int nnzb, const double *alpha, const cusparseMatDescr_t descrA,
-                const double *bsrSortedValA, const int *bsrSortedMaskPtrA,
-                const int *bsrSortedRowPtrA, const int *bsrSortedEndPtrA,
-                const int *bsrSortedColIndA, int blockDim, const double *x,
-                const double *beta, double *y) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, int, int, int,
-      int, const double *, const cusparseMatDescr_t, const double *,
-      const int *, const int *, const int *, const int *, int, const double *,
-      const double *, double *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDbsrxmv");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, sizeOfMask, mb, nb, nnzb, alpha, descrA,
-                  bsrSortedValA, bsrSortedMaskPtrA, bsrSortedRowPtrA,
-                  bsrSortedEndPtrA, bsrSortedColIndA, blockDim, x, beta, y);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCbsrxmv(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, int sizeOfMask, int mb, int nb, int nnzb,
-    const cuComplex *alpha, const cusparseMatDescr_t descrA,
-    const cuComplex *bsrSortedValA, const int *bsrSortedMaskPtrA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedEndPtrA,
-    const int *bsrSortedColIndA, int blockDim, const cuComplex *x,
-    const cuComplex *beta, cuComplex *y) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, int, int, int,
-      int, const cuComplex *, const cusparseMatDescr_t, const cuComplex *,
-      const int *, const int *, const int *, const int *, int,
-      const cuComplex *, const cuComplex *, cuComplex *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCbsrxmv");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, sizeOfMask, mb, nb, nnzb, alpha, descrA,
-                  bsrSortedValA, bsrSortedMaskPtrA, bsrSortedRowPtrA,
-                  bsrSortedEndPtrA, bsrSortedColIndA, blockDim, x, beta, y);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZbsrxmv(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, int sizeOfMask, int mb, int nb, int nnzb,
-    const cuDoubleComplex *alpha, const cusparseMatDescr_t descrA,
-    const cuDoubleComplex *bsrSortedValA, const int *bsrSortedMaskPtrA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedEndPtrA,
-    const int *bsrSortedColIndA, int blockDim, const cuDoubleComplex *x,
-    const cuDoubleComplex *beta, cuDoubleComplex *y) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, int, int, int,
-      int, const cuDoubleComplex *, const cusparseMatDescr_t,
-      const cuDoubleComplex *, const int *, const int *, const int *,
-      const int *, int, const cuDoubleComplex *, const cuDoubleComplex *,
-      cuDoubleComplex *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZbsrxmv");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, sizeOfMask, mb, nb, nnzb, alpha, descrA,
-                  bsrSortedValA, bsrSortedMaskPtrA, bsrSortedRowPtrA,
-                  bsrSortedEndPtrA, bsrSortedColIndA, blockDim, x, beta, y);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCsrsv_analysisEx(
-    cusparseHandle_t handle, cusparseOperation_t transA, int m, int nnz,
-    const cusparseMatDescr_t descrA, const void *csrSortedValA,
-    cudaDataType csrSortedValAtype, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, cusparseSolveAnalysisInfo_t info,
-    cudaDataType executiontype) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, const cusparseMatDescr_t,
-      const void *, cudaDataType, const int *, const int *,
-      cusparseSolveAnalysisInfo_t, cudaDataType);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCsrsv_analysisEx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, nnz, descrA, csrSortedValA,
-                  csrSortedValAtype, csrSortedRowPtrA, csrSortedColIndA, info,
-                  executiontype);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsrsv_analysis(
-    cusparseHandle_t handle, cusparseOperation_t transA, int m, int nnz,
-    const cusparseMatDescr_t descrA, const float *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-    cusparseSolveAnalysisInfo_t info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, const cusparseMatDescr_t,
-      const float *, const int *, const int *, cusparseSolveAnalysisInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsrsv_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, nnz, descrA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, info);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsrsv_analysis(
-    cusparseHandle_t handle, cusparseOperation_t transA, int m, int nnz,
-    const cusparseMatDescr_t descrA, const double *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-    cusparseSolveAnalysisInfo_t info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, const cusparseMatDescr_t,
-      const double *, const int *, const int *, cusparseSolveAnalysisInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsrsv_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, nnz, descrA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, info);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsrsv_analysis(
-    cusparseHandle_t handle, cusparseOperation_t transA, int m, int nnz,
-    const cusparseMatDescr_t descrA, const cuComplex *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-    cusparseSolveAnalysisInfo_t info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, const cusparseMatDescr_t,
-      const cuComplex *, const int *, const int *, cusparseSolveAnalysisInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsrsv_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, nnz, descrA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, info);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsrsv_analysis(
-    cusparseHandle_t handle, cusparseOperation_t transA, int m, int nnz,
-    const cusparseMatDescr_t descrA, const cuDoubleComplex *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-    cusparseSolveAnalysisInfo_t info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, const cusparseMatDescr_t,
-      const cuDoubleComplex *, const int *, const int *,
-      cusparseSolveAnalysisInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsrsv_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, nnz, descrA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, info);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCsrsv_solveEx(
-    cusparseHandle_t handle, cusparseOperation_t transA, int m,
-    const void *alpha, cudaDataType alphatype, const cusparseMatDescr_t descrA,
-    const void *csrSortedValA, cudaDataType csrSortedValAtype,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-    cusparseSolveAnalysisInfo_t info, const void *f, cudaDataType ftype,
-    void *x, cudaDataType xtype, cudaDataType executiontype) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, const void *, cudaDataType,
-      const cusparseMatDescr_t, const void *, cudaDataType, const int *,
-      const int *, cusparseSolveAnalysisInfo_t, const void *, cudaDataType,
-      void *, cudaDataType, cudaDataType);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCsrsv_solveEx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, alpha, alphatype, descrA, csrSortedValA,
-                  csrSortedValAtype, csrSortedRowPtrA, csrSortedColIndA, info,
-                  f, ftype, x, xtype, executiontype);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsrsv_solve(
-    cusparseHandle_t handle, cusparseOperation_t transA, int m,
-    const float *alpha, const cusparseMatDescr_t descrA,
-    const float *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, cusparseSolveAnalysisInfo_t info,
-    const float *f, float *x) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, const float *,
-      const cusparseMatDescr_t, const float *, const int *, const int *,
-      cusparseSolveAnalysisInfo_t, const float *, float *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsrsv_solve");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, alpha, descrA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, info, f, x);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsrsv_solve(
-    cusparseHandle_t handle, cusparseOperation_t transA, int m,
-    const double *alpha, const cusparseMatDescr_t descrA,
-    const double *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, cusparseSolveAnalysisInfo_t info,
-    const double *f, double *x) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, const double *,
-      const cusparseMatDescr_t, const double *, const int *, const int *,
-      cusparseSolveAnalysisInfo_t, const double *, double *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsrsv_solve");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, alpha, descrA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, info, f, x);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsrsv_solve(
-    cusparseHandle_t handle, cusparseOperation_t transA, int m,
-    const cuComplex *alpha, const cusparseMatDescr_t descrA,
-    const cuComplex *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, cusparseSolveAnalysisInfo_t info,
-    const cuComplex *f, cuComplex *x) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, const cuComplex *,
-      const cusparseMatDescr_t, const cuComplex *, const int *, const int *,
-      cusparseSolveAnalysisInfo_t, const cuComplex *, cuComplex *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsrsv_solve");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, alpha, descrA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, info, f, x);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsrsv_solve(
-    cusparseHandle_t handle, cusparseOperation_t transA, int m,
-    const cuDoubleComplex *alpha, const cusparseMatDescr_t descrA,
-    const cuDoubleComplex *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, cusparseSolveAnalysisInfo_t info,
-    const cuDoubleComplex *f, cuDoubleComplex *x) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, const cuDoubleComplex *,
-      const cusparseMatDescr_t, const cuDoubleComplex *, const int *,
-      const int *, cusparseSolveAnalysisInfo_t, const cuDoubleComplex *,
-      cuDoubleComplex *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsrsv_solve");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, alpha, descrA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, info, f, x);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseXcsrsv2_zeroPivot(cusparseHandle_t handle,
-                                                       csrsv2Info_t info,
-                                                       int *position) {
-  using FuncPtr =
-      cusparseStatus_t(CUSPARSEAPI *)(cusparseHandle_t, csrsv2Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseXcsrsv2_zeroPivot");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, info, position);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsrsv2_bufferSize(
-    cusparseHandle_t handle, cusparseOperation_t transA, int m, int nnz,
-    const cusparseMatDescr_t descrA, float *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA, csrsv2Info_t info,
-    int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, const cusparseMatDescr_t,
-      float *, const int *, const int *, csrsv2Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsrsv2_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, nnz, descrA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, info, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsrsv2_bufferSize(
-    cusparseHandle_t handle, cusparseOperation_t transA, int m, int nnz,
-    const cusparseMatDescr_t descrA, double *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA, csrsv2Info_t info,
-    int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, const cusparseMatDescr_t,
-      double *, const int *, const int *, csrsv2Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsrsv2_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, nnz, descrA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, info, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsrsv2_bufferSize(
-    cusparseHandle_t handle, cusparseOperation_t transA, int m, int nnz,
-    const cusparseMatDescr_t descrA, cuComplex *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA, csrsv2Info_t info,
-    int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, const cusparseMatDescr_t,
-      cuComplex *, const int *, const int *, csrsv2Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsrsv2_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, nnz, descrA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, info, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsrsv2_bufferSize(
-    cusparseHandle_t handle, cusparseOperation_t transA, int m, int nnz,
-    const cusparseMatDescr_t descrA, cuDoubleComplex *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA, csrsv2Info_t info,
-    int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, const cusparseMatDescr_t,
-      cuDoubleComplex *, const int *, const int *, csrsv2Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsrsv2_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, nnz, descrA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, info, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsrsv2_bufferSizeExt(
-    cusparseHandle_t handle, cusparseOperation_t transA, int m, int nnz,
-    const cusparseMatDescr_t descrA, float *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA, csrsv2Info_t info,
-    size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, const cusparseMatDescr_t,
-      float *, const int *, const int *, csrsv2Info_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsrsv2_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, nnz, descrA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, info, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsrsv2_bufferSizeExt(
-    cusparseHandle_t handle, cusparseOperation_t transA, int m, int nnz,
-    const cusparseMatDescr_t descrA, double *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA, csrsv2Info_t info,
-    size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, const cusparseMatDescr_t,
-      double *, const int *, const int *, csrsv2Info_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsrsv2_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, nnz, descrA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, info, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsrsv2_bufferSizeExt(
-    cusparseHandle_t handle, cusparseOperation_t transA, int m, int nnz,
-    const cusparseMatDescr_t descrA, cuComplex *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA, csrsv2Info_t info,
-    size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, const cusparseMatDescr_t,
-      cuComplex *, const int *, const int *, csrsv2Info_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsrsv2_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, nnz, descrA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, info, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsrsv2_bufferSizeExt(
-    cusparseHandle_t handle, cusparseOperation_t transA, int m, int nnz,
-    const cusparseMatDescr_t descrA, cuDoubleComplex *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA, csrsv2Info_t info,
-    size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, const cusparseMatDescr_t,
-      cuDoubleComplex *, const int *, const int *, csrsv2Info_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsrsv2_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, nnz, descrA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, info, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsrsv2_analysis(
-    cusparseHandle_t handle, cusparseOperation_t transA, int m, int nnz,
-    const cusparseMatDescr_t descrA, const float *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA, csrsv2Info_t info,
-    cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, const cusparseMatDescr_t,
-      const float *, const int *, const int *, csrsv2Info_t,
-      cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsrsv2_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, nnz, descrA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsrsv2_analysis(
-    cusparseHandle_t handle, cusparseOperation_t transA, int m, int nnz,
-    const cusparseMatDescr_t descrA, const double *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA, csrsv2Info_t info,
-    cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, const cusparseMatDescr_t,
-      const double *, const int *, const int *, csrsv2Info_t,
-      cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsrsv2_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, nnz, descrA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsrsv2_analysis(
-    cusparseHandle_t handle, cusparseOperation_t transA, int m, int nnz,
-    const cusparseMatDescr_t descrA, const cuComplex *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA, csrsv2Info_t info,
-    cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, const cusparseMatDescr_t,
-      const cuComplex *, const int *, const int *, csrsv2Info_t,
-      cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsrsv2_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, nnz, descrA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsrsv2_analysis(
-    cusparseHandle_t handle, cusparseOperation_t transA, int m, int nnz,
-    const cusparseMatDescr_t descrA, const cuDoubleComplex *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA, csrsv2Info_t info,
-    cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, const cusparseMatDescr_t,
-      const cuDoubleComplex *, const int *, const int *, csrsv2Info_t,
-      cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsrsv2_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, nnz, descrA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsrsv2_solve(
-    cusparseHandle_t handle, cusparseOperation_t transA, int m, int nnz,
-    const float *alpha, const cusparseMatDescr_t descrA,
-    const float *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, csrsv2Info_t info, const float *f, float *x,
-    cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, const float *,
-      const cusparseMatDescr_t, const float *, const int *, const int *,
-      csrsv2Info_t, const float *, float *, cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsrsv2_solve");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, nnz, alpha, descrA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, info, f, x, policy,
-                  pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsrsv2_solve(
-    cusparseHandle_t handle, cusparseOperation_t transA, int m, int nnz,
-    const double *alpha, const cusparseMatDescr_t descrA,
-    const double *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, csrsv2Info_t info, const double *f, double *x,
-    cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, const double *,
-      const cusparseMatDescr_t, const double *, const int *, const int *,
-      csrsv2Info_t, const double *, double *, cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsrsv2_solve");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, nnz, alpha, descrA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, info, f, x, policy,
-                  pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsrsv2_solve(
-    cusparseHandle_t handle, cusparseOperation_t transA, int m, int nnz,
-    const cuComplex *alpha, const cusparseMatDescr_t descrA,
-    const cuComplex *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, csrsv2Info_t info, const cuComplex *f,
-    cuComplex *x, cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, const cuComplex *,
-      const cusparseMatDescr_t, const cuComplex *, const int *, const int *,
-      csrsv2Info_t, const cuComplex *, cuComplex *, cusparseSolvePolicy_t,
-      void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsrsv2_solve");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, nnz, alpha, descrA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, info, f, x, policy,
-                  pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsrsv2_solve(
-    cusparseHandle_t handle, cusparseOperation_t transA, int m, int nnz,
-    const cuDoubleComplex *alpha, const cusparseMatDescr_t descrA,
-    const cuDoubleComplex *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, csrsv2Info_t info, const cuDoubleComplex *f,
-    cuDoubleComplex *x, cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, const cuDoubleComplex *,
-      const cusparseMatDescr_t, const cuDoubleComplex *, const int *,
-      const int *, csrsv2Info_t, const cuDoubleComplex *, cuDoubleComplex *,
-      cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsrsv2_solve");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, nnz, alpha, descrA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, info, f, x, policy,
-                  pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseXbsrsv2_zeroPivot(cusparseHandle_t handle,
-                                                       bsrsv2Info_t info,
-                                                       int *position) {
-  using FuncPtr =
-      cusparseStatus_t(CUSPARSEAPI *)(cusparseHandle_t, bsrsv2Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseXbsrsv2_zeroPivot");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, info, position);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSbsrsv2_bufferSize(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, float *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int blockDim,
-    bsrsv2Info_t info, int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, int, int,
-      const cusparseMatDescr_t, float *, const int *, const int *, int,
-      bsrsv2Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSbsrsv2_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, mb, nnzb, descrA, bsrSortedValA,
-                  bsrSortedRowPtrA, bsrSortedColIndA, blockDim, info,
-                  pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDbsrsv2_bufferSize(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, double *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int blockDim,
-    bsrsv2Info_t info, int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, int, int,
-      const cusparseMatDescr_t, double *, const int *, const int *, int,
-      bsrsv2Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDbsrsv2_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, mb, nnzb, descrA, bsrSortedValA,
-                  bsrSortedRowPtrA, bsrSortedColIndA, blockDim, info,
-                  pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCbsrsv2_bufferSize(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, cuComplex *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int blockDim,
-    bsrsv2Info_t info, int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, int, int,
-      const cusparseMatDescr_t, cuComplex *, const int *, const int *, int,
-      bsrsv2Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCbsrsv2_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, mb, nnzb, descrA, bsrSortedValA,
-                  bsrSortedRowPtrA, bsrSortedColIndA, blockDim, info,
-                  pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZbsrsv2_bufferSize(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, cuDoubleComplex *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int blockDim,
-    bsrsv2Info_t info, int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, int, int,
-      const cusparseMatDescr_t, cuDoubleComplex *, const int *, const int *,
-      int, bsrsv2Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZbsrsv2_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, mb, nnzb, descrA, bsrSortedValA,
-                  bsrSortedRowPtrA, bsrSortedColIndA, blockDim, info,
-                  pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSbsrsv2_bufferSizeExt(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, float *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int blockSize,
-    bsrsv2Info_t info, size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, int, int,
-      const cusparseMatDescr_t, float *, const int *, const int *, int,
-      bsrsv2Info_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSbsrsv2_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, mb, nnzb, descrA, bsrSortedValA,
-                  bsrSortedRowPtrA, bsrSortedColIndA, blockSize, info,
-                  pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDbsrsv2_bufferSizeExt(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, double *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int blockSize,
-    bsrsv2Info_t info, size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, int, int,
-      const cusparseMatDescr_t, double *, const int *, const int *, int,
-      bsrsv2Info_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDbsrsv2_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, mb, nnzb, descrA, bsrSortedValA,
-                  bsrSortedRowPtrA, bsrSortedColIndA, blockSize, info,
-                  pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCbsrsv2_bufferSizeExt(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, cuComplex *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int blockSize,
-    bsrsv2Info_t info, size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, int, int,
-      const cusparseMatDescr_t, cuComplex *, const int *, const int *, int,
-      bsrsv2Info_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCbsrsv2_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, mb, nnzb, descrA, bsrSortedValA,
-                  bsrSortedRowPtrA, bsrSortedColIndA, blockSize, info,
-                  pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZbsrsv2_bufferSizeExt(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, cuDoubleComplex *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int blockSize,
-    bsrsv2Info_t info, size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, int, int,
-      const cusparseMatDescr_t, cuDoubleComplex *, const int *, const int *,
-      int, bsrsv2Info_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZbsrsv2_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, mb, nnzb, descrA, bsrSortedValA,
-                  bsrSortedRowPtrA, bsrSortedColIndA, blockSize, info,
-                  pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSbsrsv2_analysis(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, const float *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int blockDim,
-    bsrsv2Info_t info, cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, int, int,
-      const cusparseMatDescr_t, const float *, const int *, const int *, int,
-      bsrsv2Info_t, cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSbsrsv2_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, mb, nnzb, descrA, bsrSortedValA,
-                  bsrSortedRowPtrA, bsrSortedColIndA, blockDim, info, policy,
-                  pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDbsrsv2_analysis(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, const double *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int blockDim,
-    bsrsv2Info_t info, cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, int, int,
-      const cusparseMatDescr_t, const double *, const int *, const int *, int,
-      bsrsv2Info_t, cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDbsrsv2_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, mb, nnzb, descrA, bsrSortedValA,
-                  bsrSortedRowPtrA, bsrSortedColIndA, blockDim, info, policy,
-                  pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCbsrsv2_analysis(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, const cuComplex *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int blockDim,
-    bsrsv2Info_t info, cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, int, int,
-      const cusparseMatDescr_t, const cuComplex *, const int *, const int *,
-      int, bsrsv2Info_t, cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCbsrsv2_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, mb, nnzb, descrA, bsrSortedValA,
-                  bsrSortedRowPtrA, bsrSortedColIndA, blockDim, info, policy,
-                  pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZbsrsv2_analysis(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, const cuDoubleComplex *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int blockDim,
-    bsrsv2Info_t info, cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, int, int,
-      const cusparseMatDescr_t, const cuDoubleComplex *, const int *,
-      const int *, int, bsrsv2Info_t, cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZbsrsv2_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, mb, nnzb, descrA, bsrSortedValA,
-                  bsrSortedRowPtrA, bsrSortedColIndA, blockDim, info, policy,
-                  pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSbsrsv2_solve(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, int mb, int nnzb, const float *alpha,
-    const cusparseMatDescr_t descrA, const float *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int blockDim,
-    bsrsv2Info_t info, const float *f, float *x, cusparseSolvePolicy_t policy,
-    void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, int, int,
-      const float *, const cusparseMatDescr_t, const float *, const int *,
-      const int *, int, bsrsv2Info_t, const float *, float *,
-      cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSbsrsv2_solve");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, mb, nnzb, alpha, descrA, bsrSortedValA,
-                  bsrSortedRowPtrA, bsrSortedColIndA, blockDim, info, f, x,
-                  policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDbsrsv2_solve(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, int mb, int nnzb, const double *alpha,
-    const cusparseMatDescr_t descrA, const double *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int blockDim,
-    bsrsv2Info_t info, const double *f, double *x, cusparseSolvePolicy_t policy,
-    void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, int, int,
-      const double *, const cusparseMatDescr_t, const double *, const int *,
-      const int *, int, bsrsv2Info_t, const double *, double *,
-      cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDbsrsv2_solve");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, mb, nnzb, alpha, descrA, bsrSortedValA,
-                  bsrSortedRowPtrA, bsrSortedColIndA, blockDim, info, f, x,
-                  policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCbsrsv2_solve(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, int mb, int nnzb, const cuComplex *alpha,
-    const cusparseMatDescr_t descrA, const cuComplex *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int blockDim,
-    bsrsv2Info_t info, const cuComplex *f, cuComplex *x,
-    cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, int, int,
-      const cuComplex *, const cusparseMatDescr_t, const cuComplex *,
-      const int *, const int *, int, bsrsv2Info_t, const cuComplex *,
-      cuComplex *, cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCbsrsv2_solve");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, mb, nnzb, alpha, descrA, bsrSortedValA,
-                  bsrSortedRowPtrA, bsrSortedColIndA, blockDim, info, f, x,
-                  policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZbsrsv2_solve(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, int mb, int nnzb, const cuDoubleComplex *alpha,
-    const cusparseMatDescr_t descrA, const cuDoubleComplex *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int blockDim,
-    bsrsv2Info_t info, const cuDoubleComplex *f, cuDoubleComplex *x,
-    cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, int, int,
-      const cuDoubleComplex *, const cusparseMatDescr_t,
-      const cuDoubleComplex *, const int *, const int *, int, bsrsv2Info_t,
-      const cuDoubleComplex *, cuDoubleComplex *, cusparseSolvePolicy_t,
-      void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZbsrsv2_solve");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, mb, nnzb, alpha, descrA, bsrSortedValA,
-                  bsrSortedRowPtrA, bsrSortedColIndA, blockDim, info, f, x,
-                  policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseShybsv_analysis(cusparseHandle_t handle, cusparseOperation_t transA,
-                        const cusparseMatDescr_t descrA, cusparseHybMat_t hybA,
-                        cusparseSolveAnalysisInfo_t info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, const cusparseMatDescr_t,
-      cusparseHybMat_t, cusparseSolveAnalysisInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseShybsv_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, descrA, hybA, info);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseDhybsv_analysis(cusparseHandle_t handle, cusparseOperation_t transA,
-                        const cusparseMatDescr_t descrA, cusparseHybMat_t hybA,
-                        cusparseSolveAnalysisInfo_t info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, const cusparseMatDescr_t,
-      cusparseHybMat_t, cusparseSolveAnalysisInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDhybsv_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, descrA, hybA, info);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseChybsv_analysis(cusparseHandle_t handle, cusparseOperation_t transA,
-                        const cusparseMatDescr_t descrA, cusparseHybMat_t hybA,
-                        cusparseSolveAnalysisInfo_t info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, const cusparseMatDescr_t,
-      cusparseHybMat_t, cusparseSolveAnalysisInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseChybsv_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, descrA, hybA, info);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseZhybsv_analysis(cusparseHandle_t handle, cusparseOperation_t transA,
-                        const cusparseMatDescr_t descrA, cusparseHybMat_t hybA,
-                        cusparseSolveAnalysisInfo_t info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, const cusparseMatDescr_t,
-      cusparseHybMat_t, cusparseSolveAnalysisInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZhybsv_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, descrA, hybA, info);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseShybsv_solve(
-    cusparseHandle_t handle, cusparseOperation_t trans, const float *alpha,
-    const cusparseMatDescr_t descrA, const cusparseHybMat_t hybA,
-    cusparseSolveAnalysisInfo_t info, const float *f, float *x) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, const float *,
-      const cusparseMatDescr_t, const cusparseHybMat_t,
-      cusparseSolveAnalysisInfo_t, const float *, float *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseShybsv_solve");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, trans, alpha, descrA, hybA, info, f, x);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseChybsv_solve(
-    cusparseHandle_t handle, cusparseOperation_t trans, const cuComplex *alpha,
-    const cusparseMatDescr_t descrA, const cusparseHybMat_t hybA,
-    cusparseSolveAnalysisInfo_t info, const cuComplex *f, cuComplex *x) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, const cuComplex *,
-      const cusparseMatDescr_t, const cusparseHybMat_t,
-      cusparseSolveAnalysisInfo_t, const cuComplex *, cuComplex *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseChybsv_solve");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, trans, alpha, descrA, hybA, info, f, x);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDhybsv_solve(
-    cusparseHandle_t handle, cusparseOperation_t trans, const double *alpha,
-    const cusparseMatDescr_t descrA, const cusparseHybMat_t hybA,
-    cusparseSolveAnalysisInfo_t info, const double *f, double *x) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, const double *,
-      const cusparseMatDescr_t, const cusparseHybMat_t,
-      cusparseSolveAnalysisInfo_t, const double *, double *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDhybsv_solve");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, trans, alpha, descrA, hybA, info, f, x);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZhybsv_solve(
-    cusparseHandle_t handle, cusparseOperation_t trans,
-    const cuDoubleComplex *alpha, const cusparseMatDescr_t descrA,
-    const cusparseHybMat_t hybA, cusparseSolveAnalysisInfo_t info,
-    const cuDoubleComplex *f, cuDoubleComplex *x) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, const cuDoubleComplex *,
-      const cusparseMatDescr_t, const cusparseHybMat_t,
-      cusparseSolveAnalysisInfo_t, const cuDoubleComplex *, cuDoubleComplex *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZhybsv_solve");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, trans, alpha, descrA, hybA, info, f, x);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseScsrmm(cusparseHandle_t handle, cusparseOperation_t transA, int m,
-               int n, int k, int nnz, const float *alpha,
-               const cusparseMatDescr_t descrA, const float *csrSortedValA,
-               const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-               const float *B, int ldb, const float *beta, float *C, int ldc) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, int, int, const float *,
-      const cusparseMatDescr_t, const float *, const int *, const int *,
-      const float *, int, const float *, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsrmm");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, n, k, nnz, alpha, descrA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, B, ldb, beta, C, ldc);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsrmm(
-    cusparseHandle_t handle, cusparseOperation_t transA, int m, int n, int k,
-    int nnz, const double *alpha, const cusparseMatDescr_t descrA,
-    const double *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, const double *B, int ldb, const double *beta,
-    double *C, int ldc) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, int, int, const double *,
-      const cusparseMatDescr_t, const double *, const int *, const int *,
-      const double *, int, const double *, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsrmm");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, n, k, nnz, alpha, descrA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, B, ldb, beta, C, ldc);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsrmm(
-    cusparseHandle_t handle, cusparseOperation_t transA, int m, int n, int k,
-    int nnz, const cuComplex *alpha, const cusparseMatDescr_t descrA,
-    const cuComplex *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, const cuComplex *B, int ldb,
-    const cuComplex *beta, cuComplex *C, int ldc) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, int, int,
-      const cuComplex *, const cusparseMatDescr_t, const cuComplex *,
-      const int *, const int *, const cuComplex *, int, const cuComplex *,
-      cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsrmm");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, n, k, nnz, alpha, descrA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, B, ldb, beta, C, ldc);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsrmm(
-    cusparseHandle_t handle, cusparseOperation_t transA, int m, int n, int k,
-    int nnz, const cuDoubleComplex *alpha, const cusparseMatDescr_t descrA,
-    const cuDoubleComplex *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, const cuDoubleComplex *B, int ldb,
-    const cuDoubleComplex *beta, cuDoubleComplex *C, int ldc) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, int, int,
-      const cuDoubleComplex *, const cusparseMatDescr_t,
-      const cuDoubleComplex *, const int *, const int *,
-      const cuDoubleComplex *, int, const cuDoubleComplex *, cuDoubleComplex *,
-      int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsrmm");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, n, k, nnz, alpha, descrA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, B, ldb, beta, C, ldc);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseScsrmm2(cusparseHandle_t handle, cusparseOperation_t transA,
-                cusparseOperation_t transB, int m, int n, int k, int nnz,
-                const float *alpha, const cusparseMatDescr_t descrA,
-                const float *csrSortedValA, const int *csrSortedRowPtrA,
-                const int *csrSortedColIndA, const float *B, int ldb,
-                const float *beta, float *C, int ldc) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, cusparseOperation_t, int, int, int,
-      int, const float *, const cusparseMatDescr_t, const float *, const int *,
-      const int *, const float *, int, const float *, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsrmm2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, transB, m, n, k, nnz, alpha, descrA,
-                  csrSortedValA, csrSortedRowPtrA, csrSortedColIndA, B, ldb,
-                  beta, C, ldc);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseDcsrmm2(cusparseHandle_t handle, cusparseOperation_t transA,
-                cusparseOperation_t transB, int m, int n, int k, int nnz,
-                const double *alpha, const cusparseMatDescr_t descrA,
-                const double *csrSortedValA, const int *csrSortedRowPtrA,
-                const int *csrSortedColIndA, const double *B, int ldb,
-                const double *beta, double *C, int ldc) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, cusparseOperation_t, int, int, int,
-      int, const double *, const cusparseMatDescr_t, const double *,
-      const int *, const int *, const double *, int, const double *, double *,
-      int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsrmm2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, transB, m, n, k, nnz, alpha, descrA,
-                  csrSortedValA, csrSortedRowPtrA, csrSortedColIndA, B, ldb,
-                  beta, C, ldc);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseCcsrmm2(cusparseHandle_t handle, cusparseOperation_t transA,
-                cusparseOperation_t transB, int m, int n, int k, int nnz,
-                const cuComplex *alpha, const cusparseMatDescr_t descrA,
-                const cuComplex *csrSortedValA, const int *csrSortedRowPtrA,
-                const int *csrSortedColIndA, const cuComplex *B, int ldb,
-                const cuComplex *beta, cuComplex *C, int ldc) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, cusparseOperation_t, int, int, int,
-      int, const cuComplex *, const cusparseMatDescr_t, const cuComplex *,
-      const int *, const int *, const cuComplex *, int, const cuComplex *,
-      cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsrmm2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, transB, m, n, k, nnz, alpha, descrA,
-                  csrSortedValA, csrSortedRowPtrA, csrSortedColIndA, B, ldb,
-                  beta, C, ldc);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsrmm2(
-    cusparseHandle_t handle, cusparseOperation_t transA,
-    cusparseOperation_t transB, int m, int n, int k, int nnz,
-    const cuDoubleComplex *alpha, const cusparseMatDescr_t descrA,
-    const cuDoubleComplex *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, const cuDoubleComplex *B, int ldb,
-    const cuDoubleComplex *beta, cuDoubleComplex *C, int ldc) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, cusparseOperation_t, int, int, int,
-      int, const cuDoubleComplex *, const cusparseMatDescr_t,
-      const cuDoubleComplex *, const int *, const int *,
-      const cuDoubleComplex *, int, const cuDoubleComplex *, cuDoubleComplex *,
-      int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsrmm2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, transB, m, n, k, nnz, alpha, descrA,
-                  csrSortedValA, csrSortedRowPtrA, csrSortedColIndA, B, ldb,
-                  beta, C, ldc);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSbsrmm(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, cusparseOperation_t transB, int mb, int n,
-    int kb, int nnzb, const float *alpha, const cusparseMatDescr_t descrA,
-    const float *bsrSortedValA, const int *bsrSortedRowPtrA,
-    const int *bsrSortedColIndA, const int blockSize, const float *B,
-    const int ldb, const float *beta, float *C, int ldc) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t,
-      cusparseOperation_t, int, int, int, int, const float *,
-      const cusparseMatDescr_t, const float *, const int *, const int *,
-      const int, const float *, const int, const float *, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSbsrmm");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, transB, mb, n, kb, nnzb, alpha, descrA,
-                  bsrSortedValA, bsrSortedRowPtrA, bsrSortedColIndA, blockSize,
-                  B, ldb, beta, C, ldc);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDbsrmm(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, cusparseOperation_t transB, int mb, int n,
-    int kb, int nnzb, const double *alpha, const cusparseMatDescr_t descrA,
-    const double *bsrSortedValA, const int *bsrSortedRowPtrA,
-    const int *bsrSortedColIndA, const int blockSize, const double *B,
-    const int ldb, const double *beta, double *C, int ldc) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t,
-      cusparseOperation_t, int, int, int, int, const double *,
-      const cusparseMatDescr_t, const double *, const int *, const int *,
-      const int, const double *, const int, const double *, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDbsrmm");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, transB, mb, n, kb, nnzb, alpha, descrA,
-                  bsrSortedValA, bsrSortedRowPtrA, bsrSortedColIndA, blockSize,
-                  B, ldb, beta, C, ldc);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCbsrmm(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, cusparseOperation_t transB, int mb, int n,
-    int kb, int nnzb, const cuComplex *alpha, const cusparseMatDescr_t descrA,
-    const cuComplex *bsrSortedValA, const int *bsrSortedRowPtrA,
-    const int *bsrSortedColIndA, const int blockSize, const cuComplex *B,
-    const int ldb, const cuComplex *beta, cuComplex *C, int ldc) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t,
-      cusparseOperation_t, int, int, int, int, const cuComplex *,
-      const cusparseMatDescr_t, const cuComplex *, const int *, const int *,
-      const int, const cuComplex *, const int, const cuComplex *, cuComplex *,
-      int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCbsrmm");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, transB, mb, n, kb, nnzb, alpha, descrA,
-                  bsrSortedValA, bsrSortedRowPtrA, bsrSortedColIndA, blockSize,
-                  B, ldb, beta, C, ldc);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZbsrmm(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, cusparseOperation_t transB, int mb, int n,
-    int kb, int nnzb, const cuDoubleComplex *alpha,
-    const cusparseMatDescr_t descrA, const cuDoubleComplex *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA,
-    const int blockSize, const cuDoubleComplex *B, const int ldb,
-    const cuDoubleComplex *beta, cuDoubleComplex *C, int ldc) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t,
-      cusparseOperation_t, int, int, int, int, const cuDoubleComplex *,
-      const cusparseMatDescr_t, const cuDoubleComplex *, const int *,
-      const int *, const int, const cuDoubleComplex *, const int,
-      const cuDoubleComplex *, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZbsrmm");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, transB, mb, n, kb, nnzb, alpha, descrA,
-                  bsrSortedValA, bsrSortedRowPtrA, bsrSortedColIndA, blockSize,
-                  B, ldb, beta, C, ldc);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSgemmi(
-    cusparseHandle_t handle, int m, int n, int k, int nnz,
-    const float *alpha, /* host or device pointer */
-    const float *A, int lda, const float *cscValB, const int *cscColPtrB,
-    const int *cscRowIndB, const float *beta, /* host or device pointer */
-    float *C, int ldc) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, int, const float *, const float *, int,
-      const float *, const int *, const int *, const float *, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSgemmi");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, k, nnz, alpha, A, lda, cscValB, cscColPtrB,
-                  cscRowIndB, beta, C, ldc);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDgemmi(
-    cusparseHandle_t handle, int m, int n, int k, int nnz,
-    const double *alpha, /* host or device pointer */
-    const double *A, int lda, const double *cscValB, const int *cscColPtrB,
-    const int *cscRowIndB, const double *beta, /* host or device pointer */
-    double *C, int ldc) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, int, const double *, const double *, int,
-      const double *, const int *, const int *, const double *, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDgemmi");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, k, nnz, alpha, A, lda, cscValB, cscColPtrB,
-                  cscRowIndB, beta, C, ldc);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseCgemmi(cusparseHandle_t handle, int m, int n, int k, int nnz,
-               const cuComplex *alpha, /* host or device pointer */
-               const cuComplex *A, int lda, const cuComplex *cscValB,
-               const int *cscColPtrB, const int *cscRowIndB,
-               const cuComplex *beta, /* host or device pointer */
-               cuComplex *C, int ldc) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, int, const cuComplex *,
-      const cuComplex *, int, const cuComplex *, const int *, const int *,
-      const cuComplex *, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCgemmi");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, k, nnz, alpha, A, lda, cscValB, cscColPtrB,
-                  cscRowIndB, beta, C, ldc);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZgemmi(
-    cusparseHandle_t handle, int m, int n, int k, int nnz,
-    const cuDoubleComplex *alpha, /* host or device pointer */
-    const cuDoubleComplex *A, int lda, const cuDoubleComplex *cscValB,
-    const int *cscColPtrB, const int *cscRowIndB,
-    const cuDoubleComplex *beta, /* host or device pointer */
-    cuDoubleComplex *C, int ldc) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, int, const cuDoubleComplex *,
-      const cuDoubleComplex *, int, const cuDoubleComplex *, const int *,
-      const int *, const cuDoubleComplex *, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZgemmi");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, k, nnz, alpha, A, lda, cscValB, cscColPtrB,
-                  cscRowIndB, beta, C, ldc);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsrsm_analysis(
-    cusparseHandle_t handle, cusparseOperation_t transA, int m, int nnz,
-    const cusparseMatDescr_t descrA, const float *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-    cusparseSolveAnalysisInfo_t info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, const cusparseMatDescr_t,
-      const float *, const int *, const int *, cusparseSolveAnalysisInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsrsm_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, nnz, descrA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, info);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsrsm_analysis(
-    cusparseHandle_t handle, cusparseOperation_t transA, int m, int nnz,
-    const cusparseMatDescr_t descrA, const double *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-    cusparseSolveAnalysisInfo_t info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, const cusparseMatDescr_t,
-      const double *, const int *, const int *, cusparseSolveAnalysisInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsrsm_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, nnz, descrA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, info);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsrsm_analysis(
-    cusparseHandle_t handle, cusparseOperation_t transA, int m, int nnz,
-    const cusparseMatDescr_t descrA, const cuComplex *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-    cusparseSolveAnalysisInfo_t info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, const cusparseMatDescr_t,
-      const cuComplex *, const int *, const int *, cusparseSolveAnalysisInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsrsm_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, nnz, descrA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, info);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsrsm_analysis(
-    cusparseHandle_t handle, cusparseOperation_t transA, int m, int nnz,
-    const cusparseMatDescr_t descrA, const cuDoubleComplex *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-    cusparseSolveAnalysisInfo_t info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, const cusparseMatDescr_t,
-      const cuDoubleComplex *, const int *, const int *,
-      cusparseSolveAnalysisInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsrsm_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, nnz, descrA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, info);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsrsm_solve(
-    cusparseHandle_t handle, cusparseOperation_t transA, int m, int n,
-    const float *alpha, const cusparseMatDescr_t descrA,
-    const float *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, cusparseSolveAnalysisInfo_t info,
-    const float *B, int ldb, float *X, int ldx) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, const float *,
-      const cusparseMatDescr_t, const float *, const int *, const int *,
-      cusparseSolveAnalysisInfo_t, const float *, int, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsrsm_solve");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, n, alpha, descrA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, info, B, ldb, X, ldx);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsrsm_solve(
-    cusparseHandle_t handle, cusparseOperation_t transA, int m, int n,
-    const double *alpha, const cusparseMatDescr_t descrA,
-    const double *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, cusparseSolveAnalysisInfo_t info,
-    const double *B, int ldb, double *X, int ldx) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, const double *,
-      const cusparseMatDescr_t, const double *, const int *, const int *,
-      cusparseSolveAnalysisInfo_t, const double *, int, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsrsm_solve");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, n, alpha, descrA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, info, B, ldb, X, ldx);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsrsm_solve(
-    cusparseHandle_t handle, cusparseOperation_t transA, int m, int n,
-    const cuComplex *alpha, const cusparseMatDescr_t descrA,
-    const cuComplex *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, cusparseSolveAnalysisInfo_t info,
-    const cuComplex *B, int ldb, cuComplex *X, int ldx) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, const cuComplex *,
-      const cusparseMatDescr_t, const cuComplex *, const int *, const int *,
-      cusparseSolveAnalysisInfo_t, const cuComplex *, int, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsrsm_solve");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, n, alpha, descrA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, info, B, ldb, X, ldx);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsrsm_solve(
-    cusparseHandle_t handle, cusparseOperation_t transA, int m, int n,
-    const cuDoubleComplex *alpha, const cusparseMatDescr_t descrA,
-    const cuDoubleComplex *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, cusparseSolveAnalysisInfo_t info,
-    const cuDoubleComplex *B, int ldb, cuDoubleComplex *X, int ldx) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, const cuDoubleComplex *,
-      const cusparseMatDescr_t, const cuDoubleComplex *, const int *,
-      const int *, cusparseSolveAnalysisInfo_t, const cuDoubleComplex *, int,
-      cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsrsm_solve");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, n, alpha, descrA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, info, B, ldb, X, ldx);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCreateCsrsm2Info(csrsm2Info_t *info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(csrsm2Info_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCreateCsrsm2Info");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(info);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDestroyCsrsm2Info(csrsm2Info_t info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(csrsm2Info_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDestroyCsrsm2Info");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(info);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseXcsrsm2_zeroPivot(cusparseHandle_t handle,
-                                                       csrsm2Info_t info,
-                                                       int *position) {
-  using FuncPtr =
-      cusparseStatus_t(CUSPARSEAPI *)(cusparseHandle_t, csrsm2Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseXcsrsm2_zeroPivot");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, info, position);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsrsm2_bufferSizeExt(
-    cusparseHandle_t handle, int algo, /* algo = 0, 1 */
-    cusparseOperation_t transA, cusparseOperation_t transB, int m, int nrhs,
-    int nnz, const float *alpha, const cusparseMatDescr_t descrA,
-    const float *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, const float *B, int ldb, csrsm2Info_t info,
-    cusparseSolvePolicy_t policy, size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, cusparseOperation_t, cusparseOperation_t, int, int,
-      int, const float *, const cusparseMatDescr_t, const float *, const int *,
-      const int *, const float *, int, csrsm2Info_t, cusparseSolvePolicy_t,
-      size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsrsm2_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, algo, transA, transB, m, nrhs, nnz, alpha, descrA,
-                  csrSortedValA, csrSortedRowPtrA, csrSortedColIndA, B, ldb,
-                  info, policy, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsrsm2_bufferSizeExt(
-    cusparseHandle_t handle, int algo, /* algo = 0, 1 */
-    cusparseOperation_t transA, cusparseOperation_t transB, int m, int nrhs,
-    int nnz, const double *alpha, const cusparseMatDescr_t descrA,
-    const double *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, const double *B, int ldb, csrsm2Info_t info,
-    cusparseSolvePolicy_t policy, size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, cusparseOperation_t, cusparseOperation_t, int, int,
-      int, const double *, const cusparseMatDescr_t, const double *,
-      const int *, const int *, const double *, int, csrsm2Info_t,
-      cusparseSolvePolicy_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsrsm2_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, algo, transA, transB, m, nrhs, nnz, alpha, descrA,
-                  csrSortedValA, csrSortedRowPtrA, csrSortedColIndA, B, ldb,
-                  info, policy, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsrsm2_bufferSizeExt(
-    cusparseHandle_t handle, int algo, /* algo = 0, 1 */
-    cusparseOperation_t transA, cusparseOperation_t transB, int m, int nrhs,
-    int nnz, const cuComplex *alpha, const cusparseMatDescr_t descrA,
-    const cuComplex *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, const cuComplex *B, int ldb, csrsm2Info_t info,
-    cusparseSolvePolicy_t policy, size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, cusparseOperation_t, cusparseOperation_t, int, int,
-      int, const cuComplex *, const cusparseMatDescr_t, const cuComplex *,
-      const int *, const int *, const cuComplex *, int, csrsm2Info_t,
-      cusparseSolvePolicy_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsrsm2_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, algo, transA, transB, m, nrhs, nnz, alpha, descrA,
-                  csrSortedValA, csrSortedRowPtrA, csrSortedColIndA, B, ldb,
-                  info, policy, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsrsm2_bufferSizeExt(
-    cusparseHandle_t handle, int algo, /* algo = 0, 1 */
-    cusparseOperation_t transA, cusparseOperation_t transB, int m, int nrhs,
-    int nnz, const cuDoubleComplex *alpha, const cusparseMatDescr_t descrA,
-    const cuDoubleComplex *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, const cuDoubleComplex *B, int ldb,
-    csrsm2Info_t info, cusparseSolvePolicy_t policy, size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, cusparseOperation_t, cusparseOperation_t, int, int,
-      int, const cuDoubleComplex *, const cusparseMatDescr_t,
-      const cuDoubleComplex *, const int *, const int *,
-      const cuDoubleComplex *, int, csrsm2Info_t, cusparseSolvePolicy_t,
-      size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsrsm2_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, algo, transA, transB, m, nrhs, nnz, alpha, descrA,
-                  csrSortedValA, csrSortedRowPtrA, csrSortedColIndA, B, ldb,
-                  info, policy, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsrsm2_analysis(
-    cusparseHandle_t handle, int algo, /* algo = 0, 1 */
-    cusparseOperation_t transA, cusparseOperation_t transB, int m, int nrhs,
-    int nnz, const float *alpha, const cusparseMatDescr_t descrA,
-    const float *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, const float *B, int ldb, csrsm2Info_t info,
-    cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, cusparseOperation_t, cusparseOperation_t, int, int,
-      int, const float *, const cusparseMatDescr_t, const float *, const int *,
-      const int *, const float *, int, csrsm2Info_t, cusparseSolvePolicy_t,
-      void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsrsm2_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, algo, transA, transB, m, nrhs, nnz, alpha, descrA,
-                  csrSortedValA, csrSortedRowPtrA, csrSortedColIndA, B, ldb,
-                  info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsrsm2_analysis(
-    cusparseHandle_t handle, int algo, /* algo = 0, 1 */
-    cusparseOperation_t transA, cusparseOperation_t transB, int m, int nrhs,
-    int nnz, const double *alpha, const cusparseMatDescr_t descrA,
-    const double *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, const double *B, int ldb, csrsm2Info_t info,
-    cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, cusparseOperation_t, cusparseOperation_t, int, int,
-      int, const double *, const cusparseMatDescr_t, const double *,
-      const int *, const int *, const double *, int, csrsm2Info_t,
-      cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsrsm2_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, algo, transA, transB, m, nrhs, nnz, alpha, descrA,
-                  csrSortedValA, csrSortedRowPtrA, csrSortedColIndA, B, ldb,
-                  info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsrsm2_analysis(
-    cusparseHandle_t handle, int algo, /* algo = 0, 1 */
-    cusparseOperation_t transA, cusparseOperation_t transB, int m, int nrhs,
-    int nnz, const cuComplex *alpha, const cusparseMatDescr_t descrA,
-    const cuComplex *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, const cuComplex *B, int ldb, csrsm2Info_t info,
-    cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, cusparseOperation_t, cusparseOperation_t, int, int,
-      int, const cuComplex *, const cusparseMatDescr_t, const cuComplex *,
-      const int *, const int *, const cuComplex *, int, csrsm2Info_t,
-      cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsrsm2_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, algo, transA, transB, m, nrhs, nnz, alpha, descrA,
-                  csrSortedValA, csrSortedRowPtrA, csrSortedColIndA, B, ldb,
-                  info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsrsm2_analysis(
-    cusparseHandle_t handle, int algo, /* algo = 0, 1 */
-    cusparseOperation_t transA, cusparseOperation_t transB, int m, int nrhs,
-    int nnz, const cuDoubleComplex *alpha, const cusparseMatDescr_t descrA,
-    const cuDoubleComplex *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, const cuDoubleComplex *B, int ldb,
-    csrsm2Info_t info, cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, cusparseOperation_t, cusparseOperation_t, int, int,
-      int, const cuDoubleComplex *, const cusparseMatDescr_t,
-      const cuDoubleComplex *, const int *, const int *,
-      const cuDoubleComplex *, int, csrsm2Info_t, cusparseSolvePolicy_t,
-      void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsrsm2_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, algo, transA, transB, m, nrhs, nnz, alpha, descrA,
-                  csrSortedValA, csrSortedRowPtrA, csrSortedColIndA, B, ldb,
-                  info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsrsm2_solve(
-    cusparseHandle_t handle, int algo, /* algo = 0, 1 */
-    cusparseOperation_t transA, cusparseOperation_t transB, int m, int nrhs,
-    int nnz, const float *alpha, const cusparseMatDescr_t descrA,
-    const float *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, float *B, int ldb, csrsm2Info_t info,
-    cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, cusparseOperation_t, cusparseOperation_t, int, int,
-      int, const float *, const cusparseMatDescr_t, const float *, const int *,
-      const int *, float *, int, csrsm2Info_t, cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsrsm2_solve");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, algo, transA, transB, m, nrhs, nnz, alpha, descrA,
-                  csrSortedValA, csrSortedRowPtrA, csrSortedColIndA, B, ldb,
-                  info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsrsm2_solve(
-    cusparseHandle_t handle, int algo, /* algo = 0, 1 */
-    cusparseOperation_t transA, cusparseOperation_t transB, int m, int nrhs,
-    int nnz, const double *alpha, const cusparseMatDescr_t descrA,
-    const double *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, double *B, int ldb, csrsm2Info_t info,
-    cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, cusparseOperation_t, cusparseOperation_t, int, int,
-      int, const double *, const cusparseMatDescr_t, const double *,
-      const int *, const int *, double *, int, csrsm2Info_t,
-      cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsrsm2_solve");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, algo, transA, transB, m, nrhs, nnz, alpha, descrA,
-                  csrSortedValA, csrSortedRowPtrA, csrSortedColIndA, B, ldb,
-                  info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsrsm2_solve(
-    cusparseHandle_t handle, int algo, /* algo = 0, 1 */
-    cusparseOperation_t transA, cusparseOperation_t transB, int m, int nrhs,
-    int nnz, const cuComplex *alpha, const cusparseMatDescr_t descrA,
-    const cuComplex *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, cuComplex *B, int ldb, csrsm2Info_t info,
-    cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, cusparseOperation_t, cusparseOperation_t, int, int,
-      int, const cuComplex *, const cusparseMatDescr_t, const cuComplex *,
-      const int *, const int *, cuComplex *, int, csrsm2Info_t,
-      cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsrsm2_solve");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, algo, transA, transB, m, nrhs, nnz, alpha, descrA,
-                  csrSortedValA, csrSortedRowPtrA, csrSortedColIndA, B, ldb,
-                  info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsrsm2_solve(
-    cusparseHandle_t handle, int algo, /* algo = 0, 1 */
-    cusparseOperation_t transA, cusparseOperation_t transB, int m, int nrhs,
-    int nnz, const cuDoubleComplex *alpha, const cusparseMatDescr_t descrA,
-    const cuDoubleComplex *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, cuDoubleComplex *B, int ldb, csrsm2Info_t info,
-    cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, cusparseOperation_t, cusparseOperation_t, int, int,
-      int, const cuDoubleComplex *, const cusparseMatDescr_t,
-      const cuDoubleComplex *, const int *, const int *, cuDoubleComplex *, int,
-      csrsm2Info_t, cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsrsm2_solve");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, algo, transA, transB, m, nrhs, nnz, alpha, descrA,
-                  csrSortedValA, csrSortedRowPtrA, csrSortedColIndA, B, ldb,
-                  info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseXbsrsm2_zeroPivot(cusparseHandle_t handle,
-                                                       bsrsm2Info_t info,
-                                                       int *position) {
-  using FuncPtr =
-      cusparseStatus_t(CUSPARSEAPI *)(cusparseHandle_t, bsrsm2Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseXbsrsm2_zeroPivot");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, info, position);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSbsrsm2_bufferSize(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, cusparseOperation_t transXY, int mb, int n,
-    int nnzb, const cusparseMatDescr_t descrA, float *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockSize,
-    bsrsm2Info_t info, int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t,
-      cusparseOperation_t, int, int, int, const cusparseMatDescr_t, float *,
-      const int *, const int *, int, bsrsm2Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSbsrsm2_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, transXY, mb, n, nnzb, descrA,
-                  bsrSortedVal, bsrSortedRowPtr, bsrSortedColInd, blockSize,
-                  info, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDbsrsm2_bufferSize(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, cusparseOperation_t transXY, int mb, int n,
-    int nnzb, const cusparseMatDescr_t descrA, double *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockSize,
-    bsrsm2Info_t info, int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t,
-      cusparseOperation_t, int, int, int, const cusparseMatDescr_t, double *,
-      const int *, const int *, int, bsrsm2Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDbsrsm2_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, transXY, mb, n, nnzb, descrA,
-                  bsrSortedVal, bsrSortedRowPtr, bsrSortedColInd, blockSize,
-                  info, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCbsrsm2_bufferSize(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, cusparseOperation_t transXY, int mb, int n,
-    int nnzb, const cusparseMatDescr_t descrA, cuComplex *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockSize,
-    bsrsm2Info_t info, int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t,
-      cusparseOperation_t, int, int, int, const cusparseMatDescr_t, cuComplex *,
-      const int *, const int *, int, bsrsm2Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCbsrsm2_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, transXY, mb, n, nnzb, descrA,
-                  bsrSortedVal, bsrSortedRowPtr, bsrSortedColInd, blockSize,
-                  info, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZbsrsm2_bufferSize(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, cusparseOperation_t transXY, int mb, int n,
-    int nnzb, const cusparseMatDescr_t descrA, cuDoubleComplex *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockSize,
-    bsrsm2Info_t info, int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t,
-      cusparseOperation_t, int, int, int, const cusparseMatDescr_t,
-      cuDoubleComplex *, const int *, const int *, int, bsrsm2Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZbsrsm2_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, transXY, mb, n, nnzb, descrA,
-                  bsrSortedVal, bsrSortedRowPtr, bsrSortedColInd, blockSize,
-                  info, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSbsrsm2_bufferSizeExt(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, cusparseOperation_t transB, int mb, int n,
-    int nnzb, const cusparseMatDescr_t descrA, float *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockSize,
-    bsrsm2Info_t info, size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t,
-      cusparseOperation_t, int, int, int, const cusparseMatDescr_t, float *,
-      const int *, const int *, int, bsrsm2Info_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSbsrsm2_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, transB, mb, n, nnzb, descrA,
-                  bsrSortedVal, bsrSortedRowPtr, bsrSortedColInd, blockSize,
-                  info, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDbsrsm2_bufferSizeExt(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, cusparseOperation_t transB, int mb, int n,
-    int nnzb, const cusparseMatDescr_t descrA, double *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockSize,
-    bsrsm2Info_t info, size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t,
-      cusparseOperation_t, int, int, int, const cusparseMatDescr_t, double *,
-      const int *, const int *, int, bsrsm2Info_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDbsrsm2_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, transB, mb, n, nnzb, descrA,
-                  bsrSortedVal, bsrSortedRowPtr, bsrSortedColInd, blockSize,
-                  info, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCbsrsm2_bufferSizeExt(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, cusparseOperation_t transB, int mb, int n,
-    int nnzb, const cusparseMatDescr_t descrA, cuComplex *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockSize,
-    bsrsm2Info_t info, size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t,
-      cusparseOperation_t, int, int, int, const cusparseMatDescr_t, cuComplex *,
-      const int *, const int *, int, bsrsm2Info_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCbsrsm2_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, transB, mb, n, nnzb, descrA,
-                  bsrSortedVal, bsrSortedRowPtr, bsrSortedColInd, blockSize,
-                  info, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZbsrsm2_bufferSizeExt(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, cusparseOperation_t transB, int mb, int n,
-    int nnzb, const cusparseMatDescr_t descrA, cuDoubleComplex *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockSize,
-    bsrsm2Info_t info, size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t,
-      cusparseOperation_t, int, int, int, const cusparseMatDescr_t,
-      cuDoubleComplex *, const int *, const int *, int, bsrsm2Info_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZbsrsm2_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, transB, mb, n, nnzb, descrA,
-                  bsrSortedVal, bsrSortedRowPtr, bsrSortedColInd, blockSize,
-                  info, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSbsrsm2_analysis(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, cusparseOperation_t transXY, int mb, int n,
-    int nnzb, const cusparseMatDescr_t descrA, const float *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockSize,
-    bsrsm2Info_t info, cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t,
-      cusparseOperation_t, int, int, int, const cusparseMatDescr_t,
-      const float *, const int *, const int *, int, bsrsm2Info_t,
-      cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSbsrsm2_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, transXY, mb, n, nnzb, descrA,
-                  bsrSortedVal, bsrSortedRowPtr, bsrSortedColInd, blockSize,
-                  info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDbsrsm2_analysis(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, cusparseOperation_t transXY, int mb, int n,
-    int nnzb, const cusparseMatDescr_t descrA, const double *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockSize,
-    bsrsm2Info_t info, cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t,
-      cusparseOperation_t, int, int, int, const cusparseMatDescr_t,
-      const double *, const int *, const int *, int, bsrsm2Info_t,
-      cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDbsrsm2_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, transXY, mb, n, nnzb, descrA,
-                  bsrSortedVal, bsrSortedRowPtr, bsrSortedColInd, blockSize,
-                  info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCbsrsm2_analysis(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, cusparseOperation_t transXY, int mb, int n,
-    int nnzb, const cusparseMatDescr_t descrA, const cuComplex *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockSize,
-    bsrsm2Info_t info, cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t,
-      cusparseOperation_t, int, int, int, const cusparseMatDescr_t,
-      const cuComplex *, const int *, const int *, int, bsrsm2Info_t,
-      cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCbsrsm2_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, transXY, mb, n, nnzb, descrA,
-                  bsrSortedVal, bsrSortedRowPtr, bsrSortedColInd, blockSize,
-                  info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZbsrsm2_analysis(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, cusparseOperation_t transXY, int mb, int n,
-    int nnzb, const cusparseMatDescr_t descrA,
-    const cuDoubleComplex *bsrSortedVal, const int *bsrSortedRowPtr,
-    const int *bsrSortedColInd, int blockSize, bsrsm2Info_t info,
-    cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t,
-      cusparseOperation_t, int, int, int, const cusparseMatDescr_t,
-      const cuDoubleComplex *, const int *, const int *, int, bsrsm2Info_t,
-      cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZbsrsm2_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, transXY, mb, n, nnzb, descrA,
-                  bsrSortedVal, bsrSortedRowPtr, bsrSortedColInd, blockSize,
-                  info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSbsrsm2_solve(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, cusparseOperation_t transXY, int mb, int n,
-    int nnzb, const float *alpha, const cusparseMatDescr_t descrA,
-    const float *bsrSortedVal, const int *bsrSortedRowPtr,
-    const int *bsrSortedColInd, int blockSize, bsrsm2Info_t info,
-    const float *B, int ldb, float *X, int ldx, cusparseSolvePolicy_t policy,
-    void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t,
-      cusparseOperation_t, int, int, int, const float *,
-      const cusparseMatDescr_t, const float *, const int *, const int *, int,
-      bsrsm2Info_t, const float *, int, float *, int, cusparseSolvePolicy_t,
-      void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSbsrsm2_solve");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, transXY, mb, n, nnzb, alpha, descrA,
-                  bsrSortedVal, bsrSortedRowPtr, bsrSortedColInd, blockSize,
-                  info, B, ldb, X, ldx, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDbsrsm2_solve(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, cusparseOperation_t transXY, int mb, int n,
-    int nnzb, const double *alpha, const cusparseMatDescr_t descrA,
-    const double *bsrSortedVal, const int *bsrSortedRowPtr,
-    const int *bsrSortedColInd, int blockSize, bsrsm2Info_t info,
-    const double *B, int ldb, double *X, int ldx, cusparseSolvePolicy_t policy,
-    void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t,
-      cusparseOperation_t, int, int, int, const double *,
-      const cusparseMatDescr_t, const double *, const int *, const int *, int,
-      bsrsm2Info_t, const double *, int, double *, int, cusparseSolvePolicy_t,
-      void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDbsrsm2_solve");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, transXY, mb, n, nnzb, alpha, descrA,
-                  bsrSortedVal, bsrSortedRowPtr, bsrSortedColInd, blockSize,
-                  info, B, ldb, X, ldx, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCbsrsm2_solve(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, cusparseOperation_t transXY, int mb, int n,
-    int nnzb, const cuComplex *alpha, const cusparseMatDescr_t descrA,
-    const cuComplex *bsrSortedVal, const int *bsrSortedRowPtr,
-    const int *bsrSortedColInd, int blockSize, bsrsm2Info_t info,
-    const cuComplex *B, int ldb, cuComplex *X, int ldx,
-    cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t,
-      cusparseOperation_t, int, int, int, const cuComplex *,
-      const cusparseMatDescr_t, const cuComplex *, const int *, const int *,
-      int, bsrsm2Info_t, const cuComplex *, int, cuComplex *, int,
-      cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCbsrsm2_solve");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, transXY, mb, n, nnzb, alpha, descrA,
-                  bsrSortedVal, bsrSortedRowPtr, bsrSortedColInd, blockSize,
-                  info, B, ldb, X, ldx, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZbsrsm2_solve(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, cusparseOperation_t transXY, int mb, int n,
-    int nnzb, const cuDoubleComplex *alpha, const cusparseMatDescr_t descrA,
-    const cuDoubleComplex *bsrSortedVal, const int *bsrSortedRowPtr,
-    const int *bsrSortedColInd, int blockSize, bsrsm2Info_t info,
-    const cuDoubleComplex *B, int ldb, cuDoubleComplex *X, int ldx,
-    cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t,
-      cusparseOperation_t, int, int, int, const cuDoubleComplex *,
-      const cusparseMatDescr_t, const cuDoubleComplex *, const int *,
-      const int *, int, bsrsm2Info_t, const cuDoubleComplex *, int,
-      cuDoubleComplex *, int, cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZbsrsm2_solve");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, transXY, mb, n, nnzb, alpha, descrA,
-                  bsrSortedVal, bsrSortedRowPtr, bsrSortedColInd, blockSize,
-                  info, B, ldb, X, ldx, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCsrilu0Ex(
-    cusparseHandle_t handle, cusparseOperation_t trans, int m,
-    const cusparseMatDescr_t descrA, void *csrSortedValA_ValM,
-    cudaDataType csrSortedValA_ValMtype,
-    /* matrix A values are updated inplace
-       to be the preconditioner M values */
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-    cusparseSolveAnalysisInfo_t info, cudaDataType executiontype) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, const cusparseMatDescr_t,
-      void *, cudaDataType, const int *, const int *,
-      cusparseSolveAnalysisInfo_t, cudaDataType);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCsrilu0Ex");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, trans, m, descrA, csrSortedValA_ValM,
-                  csrSortedValA_ValMtype, csrSortedRowPtrA, csrSortedColIndA,
-                  info, executiontype);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseScsrilu0(cusparseHandle_t handle, cusparseOperation_t trans, int m,
-                 const cusparseMatDescr_t descrA, float *csrSortedValA_ValM,
-                 /* matrix A values are updated inplace
-                    to be the preconditioner M values */
-                 const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-                 cusparseSolveAnalysisInfo_t info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, const cusparseMatDescr_t,
-      float *, const int *, const int *, cusparseSolveAnalysisInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsrilu0");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, trans, m, descrA, csrSortedValA_ValM,
-                  csrSortedRowPtrA, csrSortedColIndA, info);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseDcsrilu0(cusparseHandle_t handle, cusparseOperation_t trans, int m,
-                 const cusparseMatDescr_t descrA, double *csrSortedValA_ValM,
-                 /* matrix A values are updated inplace
-                    to be the preconditioner M values */
-                 const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-                 cusparseSolveAnalysisInfo_t info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, const cusparseMatDescr_t,
-      double *, const int *, const int *, cusparseSolveAnalysisInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsrilu0");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, trans, m, descrA, csrSortedValA_ValM,
-                  csrSortedRowPtrA, csrSortedColIndA, info);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseCcsrilu0(cusparseHandle_t handle, cusparseOperation_t trans, int m,
-                 const cusparseMatDescr_t descrA, cuComplex *csrSortedValA_ValM,
-                 /* matrix A values are updated inplace
-                    to be the preconditioner M values */
-                 const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-                 cusparseSolveAnalysisInfo_t info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, const cusparseMatDescr_t,
-      cuComplex *, const int *, const int *, cusparseSolveAnalysisInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsrilu0");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, trans, m, descrA, csrSortedValA_ValM,
-                  csrSortedRowPtrA, csrSortedColIndA, info);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsrilu0(
-    cusparseHandle_t handle, cusparseOperation_t trans, int m,
-    const cusparseMatDescr_t descrA, cuDoubleComplex *csrSortedValA_ValM,
-    /* matrix A values are updated inplace
-       to be the preconditioner M values */
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-    cusparseSolveAnalysisInfo_t info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, const cusparseMatDescr_t,
-      cuDoubleComplex *, const int *, const int *, cusparseSolveAnalysisInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsrilu0");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, trans, m, descrA, csrSortedValA_ValM,
-                  csrSortedRowPtrA, csrSortedColIndA, info);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsrilu02_numericBoost(
-    cusparseHandle_t handle, csrilu02Info_t info, int enable_boost, double *tol,
-    float *boost_val) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, csrilu02Info_t, int, double *, float *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsrilu02_numericBoost");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, info, enable_boost, tol, boost_val);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsrilu02_numericBoost(
-    cusparseHandle_t handle, csrilu02Info_t info, int enable_boost, double *tol,
-    double *boost_val) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, csrilu02Info_t, int, double *, double *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsrilu02_numericBoost");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, info, enable_boost, tol, boost_val);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsrilu02_numericBoost(
-    cusparseHandle_t handle, csrilu02Info_t info, int enable_boost, double *tol,
-    cuComplex *boost_val) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, csrilu02Info_t, int, double *, cuComplex *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsrilu02_numericBoost");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, info, enable_boost, tol, boost_val);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsrilu02_numericBoost(
-    cusparseHandle_t handle, csrilu02Info_t info, int enable_boost, double *tol,
-    cuDoubleComplex *boost_val) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, csrilu02Info_t, int, double *, cuDoubleComplex *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsrilu02_numericBoost");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, info, enable_boost, tol, boost_val);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseXcsrilu02_zeroPivot(
-    cusparseHandle_t handle, csrilu02Info_t info, int *position) {
-  using FuncPtr =
-      cusparseStatus_t(CUSPARSEAPI *)(cusparseHandle_t, csrilu02Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseXcsrilu02_zeroPivot");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, info, position);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsrilu02_bufferSize(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    float *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, csrilu02Info_t info, int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, float *,
-      const int *, const int *, csrilu02Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsrilu02_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, info, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsrilu02_bufferSize(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    double *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, csrilu02Info_t info, int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, double *,
-      const int *, const int *, csrilu02Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsrilu02_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, info, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsrilu02_bufferSize(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    cuComplex *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, csrilu02Info_t info, int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, cuComplex *,
-      const int *, const int *, csrilu02Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsrilu02_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, info, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsrilu02_bufferSize(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    cuDoubleComplex *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, csrilu02Info_t info, int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, cuDoubleComplex *,
-      const int *, const int *, csrilu02Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsrilu02_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, info, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsrilu02_bufferSizeExt(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    float *csrSortedVal, const int *csrSortedRowPtr, const int *csrSortedColInd,
-    csrilu02Info_t info, size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, float *,
-      const int *, const int *, csrilu02Info_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsrilu02_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedVal, csrSortedRowPtr,
-                  csrSortedColInd, info, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsrilu02_bufferSizeExt(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    double *csrSortedVal, const int *csrSortedRowPtr,
-    const int *csrSortedColInd, csrilu02Info_t info, size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, double *,
-      const int *, const int *, csrilu02Info_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsrilu02_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedVal, csrSortedRowPtr,
-                  csrSortedColInd, info, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsrilu02_bufferSizeExt(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    cuComplex *csrSortedVal, const int *csrSortedRowPtr,
-    const int *csrSortedColInd, csrilu02Info_t info, size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, cuComplex *,
-      const int *, const int *, csrilu02Info_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsrilu02_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedVal, csrSortedRowPtr,
-                  csrSortedColInd, info, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsrilu02_bufferSizeExt(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    cuDoubleComplex *csrSortedVal, const int *csrSortedRowPtr,
-    const int *csrSortedColInd, csrilu02Info_t info, size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, cuDoubleComplex *,
-      const int *, const int *, csrilu02Info_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsrilu02_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedVal, csrSortedRowPtr,
-                  csrSortedColInd, info, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsrilu02_analysis(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    const float *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, csrilu02Info_t info,
-    cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, const float *,
-      const int *, const int *, csrilu02Info_t, cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsrilu02_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsrilu02_analysis(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    const double *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, csrilu02Info_t info,
-    cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, const double *,
-      const int *, const int *, csrilu02Info_t, cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsrilu02_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsrilu02_analysis(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    const cuComplex *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, csrilu02Info_t info,
-    cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, const cuComplex *,
-      const int *, const int *, csrilu02Info_t, cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsrilu02_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsrilu02_analysis(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    const cuDoubleComplex *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, csrilu02Info_t info,
-    cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t,
-      const cuDoubleComplex *, const int *, const int *, csrilu02Info_t,
-      cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsrilu02_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsrilu02(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    float *csrSortedValA_valM,
-    /* matrix A values are updated inplace
-       to be the preconditioner M values */
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-    csrilu02Info_t info, cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, float *,
-      const int *, const int *, csrilu02Info_t, cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsrilu02");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedValA_valM, csrSortedRowPtrA,
-                  csrSortedColIndA, info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsrilu02(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    double *csrSortedValA_valM,
-    /* matrix A values are updated inplace
-       to be the preconditioner M values */
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-    csrilu02Info_t info, cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, double *,
-      const int *, const int *, csrilu02Info_t, cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsrilu02");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedValA_valM, csrSortedRowPtrA,
-                  csrSortedColIndA, info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsrilu02(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    cuComplex *csrSortedValA_valM,
-    /* matrix A values are updated inplace
-       to be the preconditioner M values */
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-    csrilu02Info_t info, cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, cuComplex *,
-      const int *, const int *, csrilu02Info_t, cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsrilu02");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedValA_valM, csrSortedRowPtrA,
-                  csrSortedColIndA, info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsrilu02(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    cuDoubleComplex *csrSortedValA_valM,
-    /* matrix A values are updated inplace
-       to be the preconditioner M values */
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-    csrilu02Info_t info, cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, cuDoubleComplex *,
-      const int *, const int *, csrilu02Info_t, cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsrilu02");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedValA_valM, csrSortedRowPtrA,
-                  csrSortedColIndA, info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSbsrilu02_numericBoost(
-    cusparseHandle_t handle, bsrilu02Info_t info, int enable_boost, double *tol,
-    float *boost_val) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, bsrilu02Info_t, int, double *, float *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSbsrilu02_numericBoost");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, info, enable_boost, tol, boost_val);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDbsrilu02_numericBoost(
-    cusparseHandle_t handle, bsrilu02Info_t info, int enable_boost, double *tol,
-    double *boost_val) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, bsrilu02Info_t, int, double *, double *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDbsrilu02_numericBoost");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, info, enable_boost, tol, boost_val);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCbsrilu02_numericBoost(
-    cusparseHandle_t handle, bsrilu02Info_t info, int enable_boost, double *tol,
-    cuComplex *boost_val) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, bsrilu02Info_t, int, double *, cuComplex *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCbsrilu02_numericBoost");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, info, enable_boost, tol, boost_val);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZbsrilu02_numericBoost(
-    cusparseHandle_t handle, bsrilu02Info_t info, int enable_boost, double *tol,
-    cuDoubleComplex *boost_val) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, bsrilu02Info_t, int, double *, cuDoubleComplex *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZbsrilu02_numericBoost");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, info, enable_boost, tol, boost_val);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseXbsrilu02_zeroPivot(
-    cusparseHandle_t handle, bsrilu02Info_t info, int *position) {
-  using FuncPtr =
-      cusparseStatus_t(CUSPARSEAPI *)(cusparseHandle_t, bsrilu02Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseXbsrilu02_zeroPivot");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, info, position);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSbsrilu02_bufferSize(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, float *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockDim,
-    bsrilu02Info_t info, int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      float *, const int *, const int *, int, bsrilu02Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSbsrilu02_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, blockDim, info, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDbsrilu02_bufferSize(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, double *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockDim,
-    bsrilu02Info_t info, int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      double *, const int *, const int *, int, bsrilu02Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDbsrilu02_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, blockDim, info, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCbsrilu02_bufferSize(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, cuComplex *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockDim,
-    bsrilu02Info_t info, int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      cuComplex *, const int *, const int *, int, bsrilu02Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCbsrilu02_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, blockDim, info, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZbsrilu02_bufferSize(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, cuDoubleComplex *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockDim,
-    bsrilu02Info_t info, int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      cuDoubleComplex *, const int *, const int *, int, bsrilu02Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZbsrilu02_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, blockDim, info, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSbsrilu02_bufferSizeExt(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, float *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockSize,
-    bsrilu02Info_t info, size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      float *, const int *, const int *, int, bsrilu02Info_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSbsrilu02_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, blockSize, info, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDbsrilu02_bufferSizeExt(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, double *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockSize,
-    bsrilu02Info_t info, size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      double *, const int *, const int *, int, bsrilu02Info_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDbsrilu02_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, blockSize, info, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCbsrilu02_bufferSizeExt(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, cuComplex *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockSize,
-    bsrilu02Info_t info, size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      cuComplex *, const int *, const int *, int, bsrilu02Info_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCbsrilu02_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, blockSize, info, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZbsrilu02_bufferSizeExt(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, cuDoubleComplex *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockSize,
-    bsrilu02Info_t info, size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      cuDoubleComplex *, const int *, const int *, int, bsrilu02Info_t,
-      size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZbsrilu02_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, blockSize, info, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSbsrilu02_analysis(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, float *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockDim,
-    bsrilu02Info_t info, cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      float *, const int *, const int *, int, bsrilu02Info_t,
-      cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSbsrilu02_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, blockDim, info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDbsrilu02_analysis(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, double *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockDim,
-    bsrilu02Info_t info, cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      double *, const int *, const int *, int, bsrilu02Info_t,
-      cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDbsrilu02_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, blockDim, info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCbsrilu02_analysis(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, cuComplex *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockDim,
-    bsrilu02Info_t info, cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      cuComplex *, const int *, const int *, int, bsrilu02Info_t,
-      cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCbsrilu02_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, blockDim, info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZbsrilu02_analysis(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, cuDoubleComplex *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockDim,
-    bsrilu02Info_t info, cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      cuDoubleComplex *, const int *, const int *, int, bsrilu02Info_t,
-      cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZbsrilu02_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, blockDim, info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSbsrilu02(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, float *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockDim,
-    bsrilu02Info_t info, cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      float *, const int *, const int *, int, bsrilu02Info_t,
-      cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSbsrilu02");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, blockDim, info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDbsrilu02(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, double *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockDim,
-    bsrilu02Info_t info, cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      double *, const int *, const int *, int, bsrilu02Info_t,
-      cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDbsrilu02");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, blockDim, info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCbsrilu02(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, cuComplex *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockDim,
-    bsrilu02Info_t info, cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      cuComplex *, const int *, const int *, int, bsrilu02Info_t,
-      cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCbsrilu02");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, blockDim, info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZbsrilu02(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, cuDoubleComplex *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockDim,
-    bsrilu02Info_t info, cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      cuDoubleComplex *, const int *, const int *, int, bsrilu02Info_t,
-      cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZbsrilu02");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, blockDim, info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseScsric0(cusparseHandle_t handle, cusparseOperation_t trans, int m,
-                const cusparseMatDescr_t descrA, float *csrSortedValA_ValM,
-                /* matrix A values are updated inplace
-                   to be the preconditioner M values */
-                const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-                cusparseSolveAnalysisInfo_t info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, const cusparseMatDescr_t,
-      float *, const int *, const int *, cusparseSolveAnalysisInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsric0");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, trans, m, descrA, csrSortedValA_ValM,
-                  csrSortedRowPtrA, csrSortedColIndA, info);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseDcsric0(cusparseHandle_t handle, cusparseOperation_t trans, int m,
-                const cusparseMatDescr_t descrA, double *csrSortedValA_ValM,
-                /* matrix A values are updated inplace
-                   to be the preconditioner M values */
-                const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-                cusparseSolveAnalysisInfo_t info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, const cusparseMatDescr_t,
-      double *, const int *, const int *, cusparseSolveAnalysisInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsric0");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, trans, m, descrA, csrSortedValA_ValM,
-                  csrSortedRowPtrA, csrSortedColIndA, info);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseCcsric0(cusparseHandle_t handle, cusparseOperation_t trans, int m,
-                const cusparseMatDescr_t descrA, cuComplex *csrSortedValA_ValM,
-                /* matrix A values are updated inplace
-                   to be the preconditioner M values */
-                const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-                cusparseSolveAnalysisInfo_t info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, const cusparseMatDescr_t,
-      cuComplex *, const int *, const int *, cusparseSolveAnalysisInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsric0");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, trans, m, descrA, csrSortedValA_ValM,
-                  csrSortedRowPtrA, csrSortedColIndA, info);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsric0(
-    cusparseHandle_t handle, cusparseOperation_t trans, int m,
-    const cusparseMatDescr_t descrA, cuDoubleComplex *csrSortedValA_ValM,
-    /* matrix A values are updated inplace
-       to be the preconditioner M values */
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-    cusparseSolveAnalysisInfo_t info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, const cusparseMatDescr_t,
-      cuDoubleComplex *, const int *, const int *, cusparseSolveAnalysisInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsric0");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, trans, m, descrA, csrSortedValA_ValM,
-                  csrSortedRowPtrA, csrSortedColIndA, info);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseXcsric02_zeroPivot(cusparseHandle_t handle,
-                                                        csric02Info_t info,
-                                                        int *position) {
-  using FuncPtr =
-      cusparseStatus_t(CUSPARSEAPI *)(cusparseHandle_t, csric02Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseXcsric02_zeroPivot");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, info, position);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsric02_bufferSize(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    float *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, csric02Info_t info, int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, float *,
-      const int *, const int *, csric02Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsric02_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, info, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsric02_bufferSize(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    double *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, csric02Info_t info, int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, double *,
-      const int *, const int *, csric02Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsric02_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, info, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsric02_bufferSize(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    cuComplex *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, csric02Info_t info, int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, cuComplex *,
-      const int *, const int *, csric02Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsric02_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, info, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsric02_bufferSize(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    cuDoubleComplex *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, csric02Info_t info, int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, cuDoubleComplex *,
-      const int *, const int *, csric02Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsric02_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, info, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsric02_bufferSizeExt(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    float *csrSortedVal, const int *csrSortedRowPtr, const int *csrSortedColInd,
-    csric02Info_t info, size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, float *,
-      const int *, const int *, csric02Info_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsric02_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedVal, csrSortedRowPtr,
-                  csrSortedColInd, info, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsric02_bufferSizeExt(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    double *csrSortedVal, const int *csrSortedRowPtr,
-    const int *csrSortedColInd, csric02Info_t info, size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, double *,
-      const int *, const int *, csric02Info_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsric02_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedVal, csrSortedRowPtr,
-                  csrSortedColInd, info, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsric02_bufferSizeExt(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    cuComplex *csrSortedVal, const int *csrSortedRowPtr,
-    const int *csrSortedColInd, csric02Info_t info, size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, cuComplex *,
-      const int *, const int *, csric02Info_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsric02_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedVal, csrSortedRowPtr,
-                  csrSortedColInd, info, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsric02_bufferSizeExt(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    cuDoubleComplex *csrSortedVal, const int *csrSortedRowPtr,
-    const int *csrSortedColInd, csric02Info_t info, size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, cuDoubleComplex *,
-      const int *, const int *, csric02Info_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsric02_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedVal, csrSortedRowPtr,
-                  csrSortedColInd, info, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsric02_analysis(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    const float *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, csric02Info_t info,
-    cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, const float *,
-      const int *, const int *, csric02Info_t, cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsric02_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsric02_analysis(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    const double *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, csric02Info_t info,
-    cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, const double *,
-      const int *, const int *, csric02Info_t, cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsric02_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsric02_analysis(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    const cuComplex *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, csric02Info_t info,
-    cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, const cuComplex *,
-      const int *, const int *, csric02Info_t, cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsric02_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsric02_analysis(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    const cuDoubleComplex *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, csric02Info_t info,
-    cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t,
-      const cuDoubleComplex *, const int *, const int *, csric02Info_t,
-      cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsric02_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsric02(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    float *csrSortedValA_valM,
-    /* matrix A values are updated inplace
-       to be the preconditioner M values */
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-    csric02Info_t info, cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, float *,
-      const int *, const int *, csric02Info_t, cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsric02");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedValA_valM, csrSortedRowPtrA,
-                  csrSortedColIndA, info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsric02(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    double *csrSortedValA_valM,
-    /* matrix A values are updated inplace
-       to be the preconditioner M values */
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-    csric02Info_t info, cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, double *,
-      const int *, const int *, csric02Info_t, cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsric02");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedValA_valM, csrSortedRowPtrA,
-                  csrSortedColIndA, info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsric02(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    cuComplex *csrSortedValA_valM,
-    /* matrix A values are updated inplace
-       to be the preconditioner M values */
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-    csric02Info_t info, cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, cuComplex *,
-      const int *, const int *, csric02Info_t, cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsric02");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedValA_valM, csrSortedRowPtrA,
-                  csrSortedColIndA, info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsric02(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    cuDoubleComplex *csrSortedValA_valM,
-    /* matrix A values are updated inplace
-       to be the preconditioner M values */
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-    csric02Info_t info, cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, cuDoubleComplex *,
-      const int *, const int *, csric02Info_t, cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsric02");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedValA_valM, csrSortedRowPtrA,
-                  csrSortedColIndA, info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseXbsric02_zeroPivot(cusparseHandle_t handle,
-                                                        bsric02Info_t info,
-                                                        int *position) {
-  using FuncPtr =
-      cusparseStatus_t(CUSPARSEAPI *)(cusparseHandle_t, bsric02Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseXbsric02_zeroPivot");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, info, position);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSbsric02_bufferSize(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, float *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockDim,
-    bsric02Info_t info, int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      float *, const int *, const int *, int, bsric02Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSbsric02_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, blockDim, info, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDbsric02_bufferSize(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, double *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockDim,
-    bsric02Info_t info, int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      double *, const int *, const int *, int, bsric02Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDbsric02_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, blockDim, info, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCbsric02_bufferSize(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, cuComplex *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockDim,
-    bsric02Info_t info, int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      cuComplex *, const int *, const int *, int, bsric02Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCbsric02_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, blockDim, info, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZbsric02_bufferSize(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, cuDoubleComplex *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockDim,
-    bsric02Info_t info, int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      cuDoubleComplex *, const int *, const int *, int, bsric02Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZbsric02_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, blockDim, info, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSbsric02_bufferSizeExt(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, float *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockSize,
-    bsric02Info_t info, size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      float *, const int *, const int *, int, bsric02Info_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSbsric02_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, blockSize, info, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDbsric02_bufferSizeExt(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, double *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockSize,
-    bsric02Info_t info, size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      double *, const int *, const int *, int, bsric02Info_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDbsric02_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, blockSize, info, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCbsric02_bufferSizeExt(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, cuComplex *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockSize,
-    bsric02Info_t info, size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      cuComplex *, const int *, const int *, int, bsric02Info_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCbsric02_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, blockSize, info, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZbsric02_bufferSizeExt(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, cuDoubleComplex *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockSize,
-    bsric02Info_t info, size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      cuDoubleComplex *, const int *, const int *, int, bsric02Info_t,
-      size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZbsric02_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, blockSize, info, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSbsric02_analysis(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, const float *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockDim,
-    bsric02Info_t info, cusparseSolvePolicy_t policy, void *pInputBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const float *, const int *, const int *, int, bsric02Info_t,
-      cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSbsric02_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, blockDim, info, policy, pInputBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDbsric02_analysis(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, const double *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockDim,
-    bsric02Info_t info, cusparseSolvePolicy_t policy, void *pInputBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const double *, const int *, const int *, int, bsric02Info_t,
-      cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDbsric02_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, blockDim, info, policy, pInputBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCbsric02_analysis(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, const cuComplex *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockDim,
-    bsric02Info_t info, cusparseSolvePolicy_t policy, void *pInputBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const cuComplex *, const int *, const int *, int, bsric02Info_t,
-      cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCbsric02_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, blockDim, info, policy, pInputBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZbsric02_analysis(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, const cuDoubleComplex *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockDim,
-    bsric02Info_t info, cusparseSolvePolicy_t policy, void *pInputBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const cuDoubleComplex *, const int *, const int *, int, bsric02Info_t,
-      cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZbsric02_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, blockDim, info, policy, pInputBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSbsric02(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, float *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockDim,
-    bsric02Info_t info, cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      float *, const int *, const int *, int, bsric02Info_t,
-      cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSbsric02");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, blockDim, info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDbsric02(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, double *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockDim,
-    bsric02Info_t info, cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      double *, const int *, const int *, int, bsric02Info_t,
-      cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDbsric02");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, blockDim, info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCbsric02(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, cuComplex *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockDim,
-    bsric02Info_t info, cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      cuComplex *, const int *, const int *, int, bsric02Info_t,
-      cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCbsric02");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, blockDim, info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZbsric02(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, cuDoubleComplex *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockDim,
-    bsric02Info_t info, cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      cuDoubleComplex *, const int *, const int *, int, bsric02Info_t,
-      cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZbsric02");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, blockDim, info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSgtsv(cusparseHandle_t handle, int m,
-                                           int n, const float *dl,
-                                           const float *d, const float *du,
-                                           float *B, int ldb) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(cusparseHandle_t, int, int,
-                                                  const float *, const float *,
-                                                  const float *, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSgtsv");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, dl, d, du, B, ldb);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDgtsv(cusparseHandle_t handle, int m,
-                                           int n, const double *dl,
-                                           const double *d, const double *du,
-                                           double *B, int ldb) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const double *, const double *,
-      const double *, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDgtsv");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, dl, d, du, B, ldb);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCgtsv(cusparseHandle_t handle, int m,
-                                           int n, const cuComplex *dl,
-                                           const cuComplex *d,
-                                           const cuComplex *du, cuComplex *B,
-                                           int ldb) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cuComplex *, const cuComplex *,
-      const cuComplex *, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCgtsv");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, dl, d, du, B, ldb);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZgtsv(cusparseHandle_t handle, int m,
-                                           int n, const cuDoubleComplex *dl,
-                                           const cuDoubleComplex *d,
-                                           const cuDoubleComplex *du,
-                                           cuDoubleComplex *B, int ldb) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cuDoubleComplex *,
-      const cuDoubleComplex *, const cuDoubleComplex *, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZgtsv");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, dl, d, du, B, ldb);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSgtsv2_bufferSizeExt(
-    cusparseHandle_t handle, int m, int n, const float *dl, const float *d,
-    const float *du, const float *B, int ldb, size_t *bufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const float *, const float *, const float *,
-      const float *, int, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSgtsv2_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, dl, d, du, B, ldb, bufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDgtsv2_bufferSizeExt(
-    cusparseHandle_t handle, int m, int n, const double *dl, const double *d,
-    const double *du, const double *B, int ldb, size_t *bufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const double *, const double *,
-      const double *, const double *, int, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDgtsv2_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, dl, d, du, B, ldb, bufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCgtsv2_bufferSizeExt(
-    cusparseHandle_t handle, int m, int n, const cuComplex *dl,
-    const cuComplex *d, const cuComplex *du, const cuComplex *B, int ldb,
-    size_t *bufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cuComplex *, const cuComplex *,
-      const cuComplex *, const cuComplex *, int, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCgtsv2_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, dl, d, du, B, ldb, bufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZgtsv2_bufferSizeExt(
-    cusparseHandle_t handle, int m, int n, const cuDoubleComplex *dl,
-    const cuDoubleComplex *d, const cuDoubleComplex *du,
-    const cuDoubleComplex *B, int ldb, size_t *bufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cuDoubleComplex *,
-      const cuDoubleComplex *, const cuDoubleComplex *, const cuDoubleComplex *,
-      int, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZgtsv2_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, dl, d, du, B, ldb, bufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSgtsv2(cusparseHandle_t handle, int m,
-                                            int n, const float *dl,
-                                            const float *d, const float *du,
-                                            float *B, int ldb, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const float *, const float *, const float *,
-      float *, int, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSgtsv2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, dl, d, du, B, ldb, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDgtsv2(cusparseHandle_t handle, int m,
-                                            int n, const double *dl,
-                                            const double *d, const double *du,
-                                            double *B, int ldb, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const double *, const double *,
-      const double *, double *, int, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDgtsv2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, dl, d, du, B, ldb, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCgtsv2(cusparseHandle_t handle, int m,
-                                            int n, const cuComplex *dl,
-                                            const cuComplex *d,
-                                            const cuComplex *du, cuComplex *B,
-                                            int ldb, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cuComplex *, const cuComplex *,
-      const cuComplex *, cuComplex *, int, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCgtsv2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, dl, d, du, B, ldb, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZgtsv2(cusparseHandle_t handle, int m,
-                                            int n, const cuDoubleComplex *dl,
-                                            const cuDoubleComplex *d,
-                                            const cuDoubleComplex *du,
-                                            cuDoubleComplex *B, int ldb,
-                                            void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cuDoubleComplex *,
-      const cuDoubleComplex *, const cuDoubleComplex *, cuDoubleComplex *, int,
-      void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZgtsv2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, dl, d, du, B, ldb, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseSgtsv_nopivot(cusparseHandle_t handle, int m, int n, const float *dl,
-                      const float *d, const float *du, float *B, int ldb) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(cusparseHandle_t, int, int,
-                                                  const float *, const float *,
-                                                  const float *, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSgtsv_nopivot");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, dl, d, du, B, ldb);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseDgtsv_nopivot(cusparseHandle_t handle, int m, int n, const double *dl,
-                      const double *d, const double *du, double *B, int ldb) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const double *, const double *,
-      const double *, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDgtsv_nopivot");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, dl, d, du, B, ldb);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCgtsv_nopivot(
-    cusparseHandle_t handle, int m, int n, const cuComplex *dl,
-    const cuComplex *d, const cuComplex *du, cuComplex *B, int ldb) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cuComplex *, const cuComplex *,
-      const cuComplex *, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCgtsv_nopivot");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, dl, d, du, B, ldb);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseZgtsv_nopivot(cusparseHandle_t handle, int m, int n,
-                      const cuDoubleComplex *dl, const cuDoubleComplex *d,
-                      const cuDoubleComplex *du, cuDoubleComplex *B, int ldb) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cuDoubleComplex *,
-      const cuDoubleComplex *, const cuDoubleComplex *, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZgtsv_nopivot");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, dl, d, du, B, ldb);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSgtsv2_nopivot_bufferSizeExt(
-    cusparseHandle_t handle, int m, int n, const float *dl, const float *d,
-    const float *du, const float *B, int ldb, size_t *bufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const float *, const float *, const float *,
-      const float *, int, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseSgtsv2_nopivot_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, dl, d, du, B, ldb, bufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDgtsv2_nopivot_bufferSizeExt(
-    cusparseHandle_t handle, int m, int n, const double *dl, const double *d,
-    const double *du, const double *B, int ldb, size_t *bufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const double *, const double *,
-      const double *, const double *, int, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseDgtsv2_nopivot_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, dl, d, du, B, ldb, bufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCgtsv2_nopivot_bufferSizeExt(
-    cusparseHandle_t handle, int m, int n, const cuComplex *dl,
-    const cuComplex *d, const cuComplex *du, const cuComplex *B, int ldb,
-    size_t *bufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cuComplex *, const cuComplex *,
-      const cuComplex *, const cuComplex *, int, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseCgtsv2_nopivot_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, dl, d, du, B, ldb, bufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZgtsv2_nopivot_bufferSizeExt(
-    cusparseHandle_t handle, int m, int n, const cuDoubleComplex *dl,
-    const cuDoubleComplex *d, const cuDoubleComplex *du,
-    const cuDoubleComplex *B, int ldb, size_t *bufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cuDoubleComplex *,
-      const cuDoubleComplex *, const cuDoubleComplex *, const cuDoubleComplex *,
-      int, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseZgtsv2_nopivot_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, dl, d, du, B, ldb, bufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSgtsv2_nopivot(
-    cusparseHandle_t handle, int m, int n, const float *dl, const float *d,
-    const float *du, float *B, int ldb, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const float *, const float *, const float *,
-      float *, int, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSgtsv2_nopivot");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, dl, d, du, B, ldb, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDgtsv2_nopivot(
-    cusparseHandle_t handle, int m, int n, const double *dl, const double *d,
-    const double *du, double *B, int ldb, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const double *, const double *,
-      const double *, double *, int, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDgtsv2_nopivot");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, dl, d, du, B, ldb, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCgtsv2_nopivot(
-    cusparseHandle_t handle, int m, int n, const cuComplex *dl,
-    const cuComplex *d, const cuComplex *du, cuComplex *B, int ldb,
-    void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cuComplex *, const cuComplex *,
-      const cuComplex *, cuComplex *, int, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCgtsv2_nopivot");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, dl, d, du, B, ldb, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZgtsv2_nopivot(
-    cusparseHandle_t handle, int m, int n, const cuDoubleComplex *dl,
-    const cuDoubleComplex *d, const cuDoubleComplex *du, cuDoubleComplex *B,
-    int ldb, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cuDoubleComplex *,
-      const cuDoubleComplex *, const cuDoubleComplex *, cuDoubleComplex *, int,
-      void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZgtsv2_nopivot");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, dl, d, du, B, ldb, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSgtsvStridedBatch(
-    cusparseHandle_t handle, int m, const float *dl, const float *d,
-    const float *du, float *x, int batchCount, int batchStride) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, const float *, const float *, const float *,
-      float *, int, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSgtsvStridedBatch");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, dl, d, du, x, batchCount, batchStride);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDgtsvStridedBatch(
-    cusparseHandle_t handle, int m, const double *dl, const double *d,
-    const double *du, double *x, int batchCount, int batchStride) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, const double *, const double *, const double *,
-      double *, int, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDgtsvStridedBatch");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, dl, d, du, x, batchCount, batchStride);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCgtsvStridedBatch(
-    cusparseHandle_t handle, int m, const cuComplex *dl, const cuComplex *d,
-    const cuComplex *du, cuComplex *x, int batchCount, int batchStride) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, const cuComplex *, const cuComplex *,
-      const cuComplex *, cuComplex *, int, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCgtsvStridedBatch");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, dl, d, du, x, batchCount, batchStride);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZgtsvStridedBatch(
-    cusparseHandle_t handle, int m, const cuDoubleComplex *dl,
-    const cuDoubleComplex *d, const cuDoubleComplex *du, cuDoubleComplex *x,
-    int batchCount, int batchStride) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, const cuDoubleComplex *, const cuDoubleComplex *,
-      const cuDoubleComplex *, cuDoubleComplex *, int, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZgtsvStridedBatch");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, dl, d, du, x, batchCount, batchStride);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSgtsv2StridedBatch_bufferSizeExt(
-    cusparseHandle_t handle, int m, const float *dl, const float *d,
-    const float *du, const float *x, int batchCount, int batchStride,
-    size_t *bufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, const float *, const float *, const float *,
-      const float *, int, int, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseSgtsv2StridedBatch_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, dl, d, du, x, batchCount, batchStride,
-                  bufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDgtsv2StridedBatch_bufferSizeExt(
-    cusparseHandle_t handle, int m, const double *dl, const double *d,
-    const double *du, const double *x, int batchCount, int batchStride,
-    size_t *bufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, const double *, const double *, const double *,
-      const double *, int, int, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseDgtsv2StridedBatch_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, dl, d, du, x, batchCount, batchStride,
-                  bufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCgtsv2StridedBatch_bufferSizeExt(
-    cusparseHandle_t handle, int m, const cuComplex *dl, const cuComplex *d,
-    const cuComplex *du, const cuComplex *x, int batchCount, int batchStride,
-    size_t *bufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, const cuComplex *, const cuComplex *,
-      const cuComplex *, const cuComplex *, int, int, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseCgtsv2StridedBatch_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, dl, d, du, x, batchCount, batchStride,
-                  bufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZgtsv2StridedBatch_bufferSizeExt(
-    cusparseHandle_t handle, int m, const cuDoubleComplex *dl,
-    const cuDoubleComplex *d, const cuDoubleComplex *du,
-    const cuDoubleComplex *x, int batchCount, int batchStride,
-    size_t *bufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, const cuDoubleComplex *, const cuDoubleComplex *,
-      const cuDoubleComplex *, const cuDoubleComplex *, int, int, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseZgtsv2StridedBatch_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, dl, d, du, x, batchCount, batchStride,
-                  bufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSgtsv2StridedBatch(
-    cusparseHandle_t handle, int m, const float *dl, const float *d,
-    const float *du, float *x, int batchCount, int batchStride, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, const float *, const float *, const float *,
-      float *, int, int, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSgtsv2StridedBatch");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, dl, d, du, x, batchCount, batchStride, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseDgtsv2StridedBatch(cusparseHandle_t handle, int m, const double *dl,
-                           const double *d, const double *du, double *x,
-                           int batchCount, int batchStride, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, const double *, const double *, const double *,
-      double *, int, int, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDgtsv2StridedBatch");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, dl, d, du, x, batchCount, batchStride, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCgtsv2StridedBatch(
-    cusparseHandle_t handle, int m, const cuComplex *dl, const cuComplex *d,
-    const cuComplex *du, cuComplex *x, int batchCount, int batchStride,
-    void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, const cuComplex *, const cuComplex *,
-      const cuComplex *, cuComplex *, int, int, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCgtsv2StridedBatch");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, dl, d, du, x, batchCount, batchStride, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZgtsv2StridedBatch(
-    cusparseHandle_t handle, int m, const cuDoubleComplex *dl,
-    const cuDoubleComplex *d, const cuDoubleComplex *du, cuDoubleComplex *x,
-    int batchCount, int batchStride, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, const cuDoubleComplex *, const cuDoubleComplex *,
-      const cuDoubleComplex *, cuDoubleComplex *, int, int, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZgtsv2StridedBatch");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, dl, d, du, x, batchCount, batchStride, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSgtsvInterleavedBatch_bufferSizeExt(
-    cusparseHandle_t handle, int algo, int m, const float *dl, const float *d,
-    const float *du, const float *x, int batchCount,
-    size_t *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const float *, const float *, const float *,
-      const float *, int, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseSgtsvInterleavedBatch_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, algo, m, dl, d, du, x, batchCount,
-                  pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDgtsvInterleavedBatch_bufferSizeExt(
-    cusparseHandle_t handle, int algo, int m, const double *dl, const double *d,
-    const double *du, const double *x, int batchCount,
-    size_t *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const double *, const double *,
-      const double *, const double *, int, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseDgtsvInterleavedBatch_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, algo, m, dl, d, du, x, batchCount,
-                  pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCgtsvInterleavedBatch_bufferSizeExt(
-    cusparseHandle_t handle, int algo, int m, const cuComplex *dl,
-    const cuComplex *d, const cuComplex *du, const cuComplex *x, int batchCount,
-    size_t *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cuComplex *, const cuComplex *,
-      const cuComplex *, const cuComplex *, int, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseCgtsvInterleavedBatch_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, algo, m, dl, d, du, x, batchCount,
-                  pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZgtsvInterleavedBatch_bufferSizeExt(
-    cusparseHandle_t handle, int algo, int m, const cuDoubleComplex *dl,
-    const cuDoubleComplex *d, const cuDoubleComplex *du,
-    const cuDoubleComplex *x, int batchCount, size_t *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cuDoubleComplex *,
-      const cuDoubleComplex *, const cuDoubleComplex *, const cuDoubleComplex *,
-      int, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseZgtsvInterleavedBatch_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, algo, m, dl, d, du, x, batchCount,
-                  pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSgtsvInterleavedBatch(
-    cusparseHandle_t handle, int algo, int m, float *dl, float *d, float *du,
-    float *x, int batchCount, void *pBuffer) {
-  using FuncPtr =
-      cusparseStatus_t(CUSPARSEAPI *)(cusparseHandle_t, int, int, float *,
-                                      float *, float *, float *, int, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSgtsvInterleavedBatch");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, algo, m, dl, d, du, x, batchCount, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDgtsvInterleavedBatch(
-    cusparseHandle_t handle, int algo, int m, double *dl, double *d, double *du,
-    double *x, int batchCount, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(cusparseHandle_t, int, int,
-                                                  double *, double *, double *,
-                                                  double *, int, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDgtsvInterleavedBatch");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, algo, m, dl, d, du, x, batchCount, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCgtsvInterleavedBatch(
-    cusparseHandle_t handle, int algo, int m, cuComplex *dl, cuComplex *d,
-    cuComplex *du, cuComplex *x, int batchCount, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, cuComplex *, cuComplex *, cuComplex *,
-      cuComplex *, int, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCgtsvInterleavedBatch");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, algo, m, dl, d, du, x, batchCount, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZgtsvInterleavedBatch(
-    cusparseHandle_t handle, int algo, int m, cuDoubleComplex *dl,
-    cuDoubleComplex *d, cuDoubleComplex *du, cuDoubleComplex *x, int batchCount,
-    void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, cuDoubleComplex *, cuDoubleComplex *,
-      cuDoubleComplex *, cuDoubleComplex *, int, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZgtsvInterleavedBatch");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, algo, m, dl, d, du, x, batchCount, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSgpsvInterleavedBatch_bufferSizeExt(
-    cusparseHandle_t handle, int algo, int m, const float *ds, const float *dl,
-    const float *d, const float *du, const float *dw, const float *x,
-    int batchCount, size_t *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const float *, const float *, const float *,
-      const float *, const float *, const float *, int, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseSgpsvInterleavedBatch_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, algo, m, ds, dl, d, du, dw, x, batchCount,
-                  pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDgpsvInterleavedBatch_bufferSizeExt(
-    cusparseHandle_t handle, int algo, int m, const double *ds,
-    const double *dl, const double *d, const double *du, const double *dw,
-    const double *x, int batchCount, size_t *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const double *, const double *,
-      const double *, const double *, const double *, const double *, int,
-      size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseDgpsvInterleavedBatch_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, algo, m, ds, dl, d, du, dw, x, batchCount,
-                  pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCgpsvInterleavedBatch_bufferSizeExt(
-    cusparseHandle_t handle, int algo, int m, const cuComplex *ds,
-    const cuComplex *dl, const cuComplex *d, const cuComplex *du,
-    const cuComplex *dw, const cuComplex *x, int batchCount,
-    size_t *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cuComplex *, const cuComplex *,
-      const cuComplex *, const cuComplex *, const cuComplex *,
-      const cuComplex *, int, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseCgpsvInterleavedBatch_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, algo, m, ds, dl, d, du, dw, x, batchCount,
-                  pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZgpsvInterleavedBatch_bufferSizeExt(
-    cusparseHandle_t handle, int algo, int m, const cuDoubleComplex *ds,
-    const cuDoubleComplex *dl, const cuDoubleComplex *d,
-    const cuDoubleComplex *du, const cuDoubleComplex *dw,
-    const cuDoubleComplex *x, int batchCount, size_t *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cuDoubleComplex *,
-      const cuDoubleComplex *, const cuDoubleComplex *, const cuDoubleComplex *,
-      const cuDoubleComplex *, const cuDoubleComplex *, int, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseZgpsvInterleavedBatch_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, algo, m, ds, dl, d, du, dw, x, batchCount,
-                  pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSgpsvInterleavedBatch(
-    cusparseHandle_t handle, int algo, int m, float *ds, float *dl, float *d,
-    float *du, float *dw, float *x, int batchCount, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, float *, float *, float *, float *, float *,
-      float *, int, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSgpsvInterleavedBatch");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, algo, m, ds, dl, d, du, dw, x, batchCount, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDgpsvInterleavedBatch(
-    cusparseHandle_t handle, int algo, int m, double *ds, double *dl, double *d,
-    double *du, double *dw, double *x, int batchCount, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, double *, double *, double *, double *,
-      double *, double *, int, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDgpsvInterleavedBatch");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, algo, m, ds, dl, d, du, dw, x, batchCount, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCgpsvInterleavedBatch(
-    cusparseHandle_t handle, int algo, int m, cuComplex *ds, cuComplex *dl,
-    cuComplex *d, cuComplex *du, cuComplex *dw, cuComplex *x, int batchCount,
-    void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, cuComplex *, cuComplex *, cuComplex *,
-      cuComplex *, cuComplex *, cuComplex *, int, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCgpsvInterleavedBatch");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, algo, m, ds, dl, d, du, dw, x, batchCount, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZgpsvInterleavedBatch(
-    cusparseHandle_t handle, int algo, int m, cuDoubleComplex *ds,
-    cuDoubleComplex *dl, cuDoubleComplex *d, cuDoubleComplex *du,
-    cuDoubleComplex *dw, cuDoubleComplex *x, int batchCount, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, cuDoubleComplex *, cuDoubleComplex *,
-      cuDoubleComplex *, cuDoubleComplex *, cuDoubleComplex *,
-      cuDoubleComplex *, int, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZgpsvInterleavedBatch");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, algo, m, ds, dl, d, du, dw, x, batchCount, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseXcsrgemmNnz(cusparseHandle_t handle, cusparseOperation_t transA,
-                    cusparseOperation_t transB, int m, int n, int k,
-                    const cusparseMatDescr_t descrA, const int nnzA,
-                    const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-                    const cusparseMatDescr_t descrB, const int nnzB,
-                    const int *csrSortedRowPtrB, const int *csrSortedColIndB,
-                    const cusparseMatDescr_t descrC, int *csrSortedRowPtrC,
-                    int *nnzTotalDevHostPtr) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, cusparseOperation_t, int, int, int,
-      const cusparseMatDescr_t, const int, const int *, const int *,
-      const cusparseMatDescr_t, const int, const int *, const int *,
-      const cusparseMatDescr_t, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseXcsrgemmNnz");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, transB, m, n, k, descrA, nnzA,
-                  csrSortedRowPtrA, csrSortedColIndA, descrB, nnzB,
-                  csrSortedRowPtrB, csrSortedColIndB, descrC, csrSortedRowPtrC,
-                  nnzTotalDevHostPtr);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsrgemm(
-    cusparseHandle_t handle, cusparseOperation_t transA,
-    cusparseOperation_t transB, int m, int n, int k,
-    const cusparseMatDescr_t descrA, const int nnzA, const float *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-    const cusparseMatDescr_t descrB, const int nnzB, const float *csrSortedValB,
-    const int *csrSortedRowPtrB, const int *csrSortedColIndB,
-    const cusparseMatDescr_t descrC, float *csrSortedValC,
-    const int *csrSortedRowPtrC, int *csrSortedColIndC) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, cusparseOperation_t, int, int, int,
-      const cusparseMatDescr_t, const int, const float *, const int *,
-      const int *, const cusparseMatDescr_t, const int, const float *,
-      const int *, const int *, const cusparseMatDescr_t, float *, const int *,
-      int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsrgemm");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, transB, m, n, k, descrA, nnzA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, descrB, nnzB,
-                  csrSortedValB, csrSortedRowPtrB, csrSortedColIndB, descrC,
-                  csrSortedValC, csrSortedRowPtrC, csrSortedColIndC);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsrgemm(
-    cusparseHandle_t handle, cusparseOperation_t transA,
-    cusparseOperation_t transB, int m, int n, int k,
-    const cusparseMatDescr_t descrA, int nnzA, const double *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-    const cusparseMatDescr_t descrB, int nnzB, const double *csrSortedValB,
-    const int *csrSortedRowPtrB, const int *csrSortedColIndB,
-    const cusparseMatDescr_t descrC, double *csrSortedValC,
-    const int *csrSortedRowPtrC, int *csrSortedColIndC) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, cusparseOperation_t, int, int, int,
-      const cusparseMatDescr_t, int, const double *, const int *, const int *,
-      const cusparseMatDescr_t, int, const double *, const int *, const int *,
-      const cusparseMatDescr_t, double *, const int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsrgemm");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, transB, m, n, k, descrA, nnzA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, descrB, nnzB,
-                  csrSortedValB, csrSortedRowPtrB, csrSortedColIndB, descrC,
-                  csrSortedValC, csrSortedRowPtrC, csrSortedColIndC);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsrgemm(
-    cusparseHandle_t handle, cusparseOperation_t transA,
-    cusparseOperation_t transB, int m, int n, int k,
-    const cusparseMatDescr_t descrA, int nnzA, const cuComplex *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-    const cusparseMatDescr_t descrB, int nnzB, const cuComplex *csrSortedValB,
-    const int *csrSortedRowPtrB, const int *csrSortedColIndB,
-    const cusparseMatDescr_t descrC, cuComplex *csrSortedValC,
-    const int *csrSortedRowPtrC, int *csrSortedColIndC) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, cusparseOperation_t, int, int, int,
-      const cusparseMatDescr_t, int, const cuComplex *, const int *,
-      const int *, const cusparseMatDescr_t, int, const cuComplex *,
-      const int *, const int *, const cusparseMatDescr_t, cuComplex *,
-      const int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsrgemm");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, transB, m, n, k, descrA, nnzA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, descrB, nnzB,
-                  csrSortedValB, csrSortedRowPtrB, csrSortedColIndB, descrC,
-                  csrSortedValC, csrSortedRowPtrC, csrSortedColIndC);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsrgemm(
-    cusparseHandle_t handle, cusparseOperation_t transA,
-    cusparseOperation_t transB, int m, int n, int k,
-    const cusparseMatDescr_t descrA, int nnzA,
-    const cuDoubleComplex *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, const cusparseMatDescr_t descrB, int nnzB,
-    const cuDoubleComplex *csrSortedValB, const int *csrSortedRowPtrB,
-    const int *csrSortedColIndB, const cusparseMatDescr_t descrC,
-    cuDoubleComplex *csrSortedValC, const int *csrSortedRowPtrC,
-    int *csrSortedColIndC) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, cusparseOperation_t, int, int, int,
-      const cusparseMatDescr_t, int, const cuDoubleComplex *, const int *,
-      const int *, const cusparseMatDescr_t, int, const cuDoubleComplex *,
-      const int *, const int *, const cusparseMatDescr_t, cuDoubleComplex *,
-      const int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsrgemm");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, transB, m, n, k, descrA, nnzA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, descrB, nnzB,
-                  csrSortedValB, csrSortedRowPtrB, csrSortedColIndB, descrC,
-                  csrSortedValC, csrSortedRowPtrC, csrSortedColIndC);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCreateCsrgemm2Info(csrgemm2Info_t *info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(csrgemm2Info_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCreateCsrgemm2Info");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(info);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDestroyCsrgemm2Info(csrgemm2Info_t info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(csrgemm2Info_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDestroyCsrgemm2Info");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(info);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsrgemm2_bufferSizeExt(
-    cusparseHandle_t handle, int m, int n, int k, const float *alpha,
-    const cusparseMatDescr_t descrA, int nnzA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, const cusparseMatDescr_t descrB, int nnzB,
-    const int *csrSortedRowPtrB, const int *csrSortedColIndB, const float *beta,
-    const cusparseMatDescr_t descrD, int nnzD, const int *csrSortedRowPtrD,
-    const int *csrSortedColIndD, csrgemm2Info_t info,
-    size_t *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const float *, const cusparseMatDescr_t,
-      int, const int *, const int *, const cusparseMatDescr_t, int, const int *,
-      const int *, const float *, const cusparseMatDescr_t, int, const int *,
-      const int *, csrgemm2Info_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsrgemm2_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, k, alpha, descrA, nnzA, csrSortedRowPtrA,
-                  csrSortedColIndA, descrB, nnzB, csrSortedRowPtrB,
-                  csrSortedColIndB, beta, descrD, nnzD, csrSortedRowPtrD,
-                  csrSortedColIndD, info, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsrgemm2_bufferSizeExt(
-    cusparseHandle_t handle, int m, int n, int k, const double *alpha,
-    const cusparseMatDescr_t descrA, int nnzA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, const cusparseMatDescr_t descrB, int nnzB,
-    const int *csrSortedRowPtrB, const int *csrSortedColIndB,
-    const double *beta, const cusparseMatDescr_t descrD, int nnzD,
-    const int *csrSortedRowPtrD, const int *csrSortedColIndD,
-    csrgemm2Info_t info, size_t *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const double *, const cusparseMatDescr_t,
-      int, const int *, const int *, const cusparseMatDescr_t, int, const int *,
-      const int *, const double *, const cusparseMatDescr_t, int, const int *,
-      const int *, csrgemm2Info_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsrgemm2_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, k, alpha, descrA, nnzA, csrSortedRowPtrA,
-                  csrSortedColIndA, descrB, nnzB, csrSortedRowPtrB,
-                  csrSortedColIndB, beta, descrD, nnzD, csrSortedRowPtrD,
-                  csrSortedColIndD, info, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsrgemm2_bufferSizeExt(
-    cusparseHandle_t handle, int m, int n, int k, const cuComplex *alpha,
-    const cusparseMatDescr_t descrA, int nnzA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, const cusparseMatDescr_t descrB, int nnzB,
-    const int *csrSortedRowPtrB, const int *csrSortedColIndB,
-    const cuComplex *beta, const cusparseMatDescr_t descrD, int nnzD,
-    const int *csrSortedRowPtrD, const int *csrSortedColIndD,
-    csrgemm2Info_t info, size_t *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const cuComplex *,
-      const cusparseMatDescr_t, int, const int *, const int *,
-      const cusparseMatDescr_t, int, const int *, const int *,
-      const cuComplex *, const cusparseMatDescr_t, int, const int *,
-      const int *, csrgemm2Info_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsrgemm2_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, k, alpha, descrA, nnzA, csrSortedRowPtrA,
-                  csrSortedColIndA, descrB, nnzB, csrSortedRowPtrB,
-                  csrSortedColIndB, beta, descrD, nnzD, csrSortedRowPtrD,
-                  csrSortedColIndD, info, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsrgemm2_bufferSizeExt(
-    cusparseHandle_t handle, int m, int n, int k, const cuDoubleComplex *alpha,
-    const cusparseMatDescr_t descrA, int nnzA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, const cusparseMatDescr_t descrB, int nnzB,
-    const int *csrSortedRowPtrB, const int *csrSortedColIndB,
-    const cuDoubleComplex *beta, const cusparseMatDescr_t descrD, int nnzD,
-    const int *csrSortedRowPtrD, const int *csrSortedColIndD,
-    csrgemm2Info_t info, size_t *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const cuDoubleComplex *,
-      const cusparseMatDescr_t, int, const int *, const int *,
-      const cusparseMatDescr_t, int, const int *, const int *,
-      const cuDoubleComplex *, const cusparseMatDescr_t, int, const int *,
-      const int *, csrgemm2Info_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsrgemm2_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, k, alpha, descrA, nnzA, csrSortedRowPtrA,
-                  csrSortedColIndA, descrB, nnzB, csrSortedRowPtrB,
-                  csrSortedColIndB, beta, descrD, nnzD, csrSortedRowPtrD,
-                  csrSortedColIndD, info, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseXcsrgemm2Nnz(
-    cusparseHandle_t handle, int m, int n, int k,
-    const cusparseMatDescr_t descrA, int nnzA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, const cusparseMatDescr_t descrB, int nnzB,
-    const int *csrSortedRowPtrB, const int *csrSortedColIndB,
-    const cusparseMatDescr_t descrD, int nnzD, const int *csrSortedRowPtrD,
-    const int *csrSortedColIndD, const cusparseMatDescr_t descrC,
-    int *csrSortedRowPtrC, int *nnzTotalDevHostPtr, const csrgemm2Info_t info,
-    void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const cusparseMatDescr_t, int,
-      const int *, const int *, const cusparseMatDescr_t, int, const int *,
-      const int *, const cusparseMatDescr_t, int, const int *, const int *,
-      const cusparseMatDescr_t, int *, int *, const csrgemm2Info_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseXcsrgemm2Nnz");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, k, descrA, nnzA, csrSortedRowPtrA,
-                  csrSortedColIndA, descrB, nnzB, csrSortedRowPtrB,
-                  csrSortedColIndB, descrD, nnzD, csrSortedRowPtrD,
-                  csrSortedColIndD, descrC, csrSortedRowPtrC,
-                  nnzTotalDevHostPtr, info, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsrgemm2(
-    cusparseHandle_t handle, int m, int n, int k, const float *alpha,
-    const cusparseMatDescr_t descrA, int nnzA, const float *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-    const cusparseMatDescr_t descrB, int nnzB, const float *csrSortedValB,
-    const int *csrSortedRowPtrB, const int *csrSortedColIndB, const float *beta,
-    const cusparseMatDescr_t descrD, int nnzD, const float *csrSortedValD,
-    const int *csrSortedRowPtrD, const int *csrSortedColIndD,
-    const cusparseMatDescr_t descrC, float *csrSortedValC,
-    const int *csrSortedRowPtrC, int *csrSortedColIndC,
-    const csrgemm2Info_t info, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const float *, const cusparseMatDescr_t,
-      int, const float *, const int *, const int *, const cusparseMatDescr_t,
-      int, const float *, const int *, const int *, const float *,
-      const cusparseMatDescr_t, int, const float *, const int *, const int *,
-      const cusparseMatDescr_t, float *, const int *, int *,
-      const csrgemm2Info_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsrgemm2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, k, alpha, descrA, nnzA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, descrB, nnzB,
-                  csrSortedValB, csrSortedRowPtrB, csrSortedColIndB, beta,
-                  descrD, nnzD, csrSortedValD, csrSortedRowPtrD,
-                  csrSortedColIndD, descrC, csrSortedValC, csrSortedRowPtrC,
-                  csrSortedColIndC, info, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsrgemm2(
-    cusparseHandle_t handle, int m, int n, int k, const double *alpha,
-    const cusparseMatDescr_t descrA, int nnzA, const double *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-    const cusparseMatDescr_t descrB, int nnzB, const double *csrSortedValB,
-    const int *csrSortedRowPtrB, const int *csrSortedColIndB,
-    const double *beta, const cusparseMatDescr_t descrD, int nnzD,
-    const double *csrSortedValD, const int *csrSortedRowPtrD,
-    const int *csrSortedColIndD, const cusparseMatDescr_t descrC,
-    double *csrSortedValC, const int *csrSortedRowPtrC, int *csrSortedColIndC,
-    const csrgemm2Info_t info, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const double *, const cusparseMatDescr_t,
-      int, const double *, const int *, const int *, const cusparseMatDescr_t,
-      int, const double *, const int *, const int *, const double *,
-      const cusparseMatDescr_t, int, const double *, const int *, const int *,
-      const cusparseMatDescr_t, double *, const int *, int *,
-      const csrgemm2Info_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsrgemm2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, k, alpha, descrA, nnzA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, descrB, nnzB,
-                  csrSortedValB, csrSortedRowPtrB, csrSortedColIndB, beta,
-                  descrD, nnzD, csrSortedValD, csrSortedRowPtrD,
-                  csrSortedColIndD, descrC, csrSortedValC, csrSortedRowPtrC,
-                  csrSortedColIndC, info, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsrgemm2(
-    cusparseHandle_t handle, int m, int n, int k, const cuComplex *alpha,
-    const cusparseMatDescr_t descrA, int nnzA, const cuComplex *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-    const cusparseMatDescr_t descrB, int nnzB, const cuComplex *csrSortedValB,
-    const int *csrSortedRowPtrB, const int *csrSortedColIndB,
-    const cuComplex *beta, const cusparseMatDescr_t descrD, int nnzD,
-    const cuComplex *csrSortedValD, const int *csrSortedRowPtrD,
-    const int *csrSortedColIndD, const cusparseMatDescr_t descrC,
-    cuComplex *csrSortedValC, const int *csrSortedRowPtrC,
-    int *csrSortedColIndC, const csrgemm2Info_t info, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const cuComplex *,
-      const cusparseMatDescr_t, int, const cuComplex *, const int *,
-      const int *, const cusparseMatDescr_t, int, const cuComplex *,
-      const int *, const int *, const cuComplex *, const cusparseMatDescr_t,
-      int, const cuComplex *, const int *, const int *,
-      const cusparseMatDescr_t, cuComplex *, const int *, int *,
-      const csrgemm2Info_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsrgemm2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, k, alpha, descrA, nnzA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, descrB, nnzB,
-                  csrSortedValB, csrSortedRowPtrB, csrSortedColIndB, beta,
-                  descrD, nnzD, csrSortedValD, csrSortedRowPtrD,
-                  csrSortedColIndD, descrC, csrSortedValC, csrSortedRowPtrC,
-                  csrSortedColIndC, info, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsrgemm2(
-    cusparseHandle_t handle, int m, int n, int k, const cuDoubleComplex *alpha,
-    const cusparseMatDescr_t descrA, int nnzA,
-    const cuDoubleComplex *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, const cusparseMatDescr_t descrB, int nnzB,
-    const cuDoubleComplex *csrSortedValB, const int *csrSortedRowPtrB,
-    const int *csrSortedColIndB, const cuDoubleComplex *beta,
-    const cusparseMatDescr_t descrD, int nnzD,
-    const cuDoubleComplex *csrSortedValD, const int *csrSortedRowPtrD,
-    const int *csrSortedColIndD, const cusparseMatDescr_t descrC,
-    cuDoubleComplex *csrSortedValC, const int *csrSortedRowPtrC,
-    int *csrSortedColIndC, const csrgemm2Info_t info, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const cuDoubleComplex *,
-      const cusparseMatDescr_t, int, const cuDoubleComplex *, const int *,
-      const int *, const cusparseMatDescr_t, int, const cuDoubleComplex *,
-      const int *, const int *, const cuDoubleComplex *,
-      const cusparseMatDescr_t, int, const cuDoubleComplex *, const int *,
-      const int *, const cusparseMatDescr_t, cuDoubleComplex *, const int *,
-      int *, const csrgemm2Info_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsrgemm2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, k, alpha, descrA, nnzA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, descrB, nnzB,
-                  csrSortedValB, csrSortedRowPtrB, csrSortedColIndB, beta,
-                  descrD, nnzD, csrSortedValD, csrSortedRowPtrD,
-                  csrSortedColIndD, descrC, csrSortedValC, csrSortedRowPtrC,
-                  csrSortedColIndC, info, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseXcsrgeamNnz(
-    cusparseHandle_t handle, int m, int n, const cusparseMatDescr_t descrA,
-    int nnzA, const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-    const cusparseMatDescr_t descrB, int nnzB, const int *csrSortedRowPtrB,
-    const int *csrSortedColIndB, const cusparseMatDescr_t descrC,
-    int *csrSortedRowPtrC, int *nnzTotalDevHostPtr) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, int, const int *,
-      const int *, const cusparseMatDescr_t, int, const int *, const int *,
-      const cusparseMatDescr_t, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseXcsrgeamNnz");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, descrA, nnzA, csrSortedRowPtrA,
-                  csrSortedColIndA, descrB, nnzB, csrSortedRowPtrB,
-                  csrSortedColIndB, descrC, csrSortedRowPtrC,
-                  nnzTotalDevHostPtr);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsrgeam(
-    cusparseHandle_t handle, int m, int n, const float *alpha,
-    const cusparseMatDescr_t descrA, int nnzA, const float *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA, const float *beta,
-    const cusparseMatDescr_t descrB, int nnzB, const float *csrSortedValB,
-    const int *csrSortedRowPtrB, const int *csrSortedColIndB,
-    const cusparseMatDescr_t descrC, float *csrSortedValC,
-    int *csrSortedRowPtrC, int *csrSortedColIndC) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const float *, const cusparseMatDescr_t, int,
-      const float *, const int *, const int *, const float *,
-      const cusparseMatDescr_t, int, const float *, const int *, const int *,
-      const cusparseMatDescr_t, float *, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsrgeam");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, alpha, descrA, nnzA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, beta, descrB, nnzB,
-                  csrSortedValB, csrSortedRowPtrB, csrSortedColIndB, descrC,
-                  csrSortedValC, csrSortedRowPtrC, csrSortedColIndC);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsrgeam(
-    cusparseHandle_t handle, int m, int n, const double *alpha,
-    const cusparseMatDescr_t descrA, int nnzA, const double *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-    const double *beta, const cusparseMatDescr_t descrB, int nnzB,
-    const double *csrSortedValB, const int *csrSortedRowPtrB,
-    const int *csrSortedColIndB, const cusparseMatDescr_t descrC,
-    double *csrSortedValC, int *csrSortedRowPtrC, int *csrSortedColIndC) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const double *, const cusparseMatDescr_t, int,
-      const double *, const int *, const int *, const double *,
-      const cusparseMatDescr_t, int, const double *, const int *, const int *,
-      const cusparseMatDescr_t, double *, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsrgeam");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, alpha, descrA, nnzA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, beta, descrB, nnzB,
-                  csrSortedValB, csrSortedRowPtrB, csrSortedColIndB, descrC,
-                  csrSortedValC, csrSortedRowPtrC, csrSortedColIndC);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsrgeam(
-    cusparseHandle_t handle, int m, int n, const cuComplex *alpha,
-    const cusparseMatDescr_t descrA, int nnzA, const cuComplex *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-    const cuComplex *beta, const cusparseMatDescr_t descrB, int nnzB,
-    const cuComplex *csrSortedValB, const int *csrSortedRowPtrB,
-    const int *csrSortedColIndB, const cusparseMatDescr_t descrC,
-    cuComplex *csrSortedValC, int *csrSortedRowPtrC, int *csrSortedColIndC) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cuComplex *, const cusparseMatDescr_t,
-      int, const cuComplex *, const int *, const int *, const cuComplex *,
-      const cusparseMatDescr_t, int, const cuComplex *, const int *,
-      const int *, const cusparseMatDescr_t, cuComplex *, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsrgeam");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, alpha, descrA, nnzA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, beta, descrB, nnzB,
-                  csrSortedValB, csrSortedRowPtrB, csrSortedColIndB, descrC,
-                  csrSortedValC, csrSortedRowPtrC, csrSortedColIndC);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsrgeam(
-    cusparseHandle_t handle, int m, int n, const cuDoubleComplex *alpha,
-    const cusparseMatDescr_t descrA, int nnzA,
-    const cuDoubleComplex *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, const cuDoubleComplex *beta,
-    const cusparseMatDescr_t descrB, int nnzB,
-    const cuDoubleComplex *csrSortedValB, const int *csrSortedRowPtrB,
-    const int *csrSortedColIndB, const cusparseMatDescr_t descrC,
-    cuDoubleComplex *csrSortedValC, int *csrSortedRowPtrC,
-    int *csrSortedColIndC) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cuDoubleComplex *,
-      const cusparseMatDescr_t, int, const cuDoubleComplex *, const int *,
-      const int *, const cuDoubleComplex *, const cusparseMatDescr_t, int,
-      const cuDoubleComplex *, const int *, const int *,
-      const cusparseMatDescr_t, cuDoubleComplex *, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsrgeam");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, alpha, descrA, nnzA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, beta, descrB, nnzB,
-                  csrSortedValB, csrSortedRowPtrB, csrSortedColIndB, descrC,
-                  csrSortedValC, csrSortedRowPtrC, csrSortedColIndC);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsrgeam2_bufferSizeExt(
-    cusparseHandle_t handle, int m, int n, const float *alpha,
-    const cusparseMatDescr_t descrA, int nnzA, const float *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA, const float *beta,
-    const cusparseMatDescr_t descrB, int nnzB, const float *csrSortedValB,
-    const int *csrSortedRowPtrB, const int *csrSortedColIndB,
-    const cusparseMatDescr_t descrC, const float *csrSortedValC,
-    const int *csrSortedRowPtrC, const int *csrSortedColIndC,
-    size_t *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const float *, const cusparseMatDescr_t, int,
-      const float *, const int *, const int *, const float *,
-      const cusparseMatDescr_t, int, const float *, const int *, const int *,
-      const cusparseMatDescr_t, const float *, const int *, const int *,
-      size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsrgeam2_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, alpha, descrA, nnzA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, beta, descrB, nnzB,
-                  csrSortedValB, csrSortedRowPtrB, csrSortedColIndB, descrC,
-                  csrSortedValC, csrSortedRowPtrC, csrSortedColIndC,
-                  pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsrgeam2_bufferSizeExt(
-    cusparseHandle_t handle, int m, int n, const double *alpha,
-    const cusparseMatDescr_t descrA, int nnzA, const double *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-    const double *beta, const cusparseMatDescr_t descrB, int nnzB,
-    const double *csrSortedValB, const int *csrSortedRowPtrB,
-    const int *csrSortedColIndB, const cusparseMatDescr_t descrC,
-    const double *csrSortedValC, const int *csrSortedRowPtrC,
-    const int *csrSortedColIndC, size_t *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const double *, const cusparseMatDescr_t, int,
-      const double *, const int *, const int *, const double *,
-      const cusparseMatDescr_t, int, const double *, const int *, const int *,
-      const cusparseMatDescr_t, const double *, const int *, const int *,
-      size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsrgeam2_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, alpha, descrA, nnzA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, beta, descrB, nnzB,
-                  csrSortedValB, csrSortedRowPtrB, csrSortedColIndB, descrC,
-                  csrSortedValC, csrSortedRowPtrC, csrSortedColIndC,
-                  pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsrgeam2_bufferSizeExt(
-    cusparseHandle_t handle, int m, int n, const cuComplex *alpha,
-    const cusparseMatDescr_t descrA, int nnzA, const cuComplex *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-    const cuComplex *beta, const cusparseMatDescr_t descrB, int nnzB,
-    const cuComplex *csrSortedValB, const int *csrSortedRowPtrB,
-    const int *csrSortedColIndB, const cusparseMatDescr_t descrC,
-    const cuComplex *csrSortedValC, const int *csrSortedRowPtrC,
-    const int *csrSortedColIndC, size_t *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cuComplex *, const cusparseMatDescr_t,
-      int, const cuComplex *, const int *, const int *, const cuComplex *,
-      const cusparseMatDescr_t, int, const cuComplex *, const int *,
-      const int *, const cusparseMatDescr_t, const cuComplex *, const int *,
-      const int *, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsrgeam2_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, alpha, descrA, nnzA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, beta, descrB, nnzB,
-                  csrSortedValB, csrSortedRowPtrB, csrSortedColIndB, descrC,
-                  csrSortedValC, csrSortedRowPtrC, csrSortedColIndC,
-                  pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsrgeam2_bufferSizeExt(
-    cusparseHandle_t handle, int m, int n, const cuDoubleComplex *alpha,
-    const cusparseMatDescr_t descrA, int nnzA,
-    const cuDoubleComplex *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, const cuDoubleComplex *beta,
-    const cusparseMatDescr_t descrB, int nnzB,
-    const cuDoubleComplex *csrSortedValB, const int *csrSortedRowPtrB,
-    const int *csrSortedColIndB, const cusparseMatDescr_t descrC,
-    const cuDoubleComplex *csrSortedValC, const int *csrSortedRowPtrC,
-    const int *csrSortedColIndC, size_t *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cuDoubleComplex *,
-      const cusparseMatDescr_t, int, const cuDoubleComplex *, const int *,
-      const int *, const cuDoubleComplex *, const cusparseMatDescr_t, int,
-      const cuDoubleComplex *, const int *, const int *,
-      const cusparseMatDescr_t, const cuDoubleComplex *, const int *,
-      const int *, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsrgeam2_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, alpha, descrA, nnzA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, beta, descrB, nnzB,
-                  csrSortedValB, csrSortedRowPtrB, csrSortedColIndB, descrC,
-                  csrSortedValC, csrSortedRowPtrC, csrSortedColIndC,
-                  pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseXcsrgeam2Nnz(
-    cusparseHandle_t handle, int m, int n, const cusparseMatDescr_t descrA,
-    int nnzA, const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-    const cusparseMatDescr_t descrB, int nnzB, const int *csrSortedRowPtrB,
-    const int *csrSortedColIndB, const cusparseMatDescr_t descrC,
-    int *csrSortedRowPtrC, int *nnzTotalDevHostPtr, void *workspace) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, int, const int *,
-      const int *, const cusparseMatDescr_t, int, const int *, const int *,
-      const cusparseMatDescr_t, int *, int *, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseXcsrgeam2Nnz");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, descrA, nnzA, csrSortedRowPtrA,
-                  csrSortedColIndA, descrB, nnzB, csrSortedRowPtrB,
-                  csrSortedColIndB, descrC, csrSortedRowPtrC,
-                  nnzTotalDevHostPtr, workspace);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsrgeam2(
-    cusparseHandle_t handle, int m, int n, const float *alpha,
-    const cusparseMatDescr_t descrA, int nnzA, const float *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA, const float *beta,
-    const cusparseMatDescr_t descrB, int nnzB, const float *csrSortedValB,
-    const int *csrSortedRowPtrB, const int *csrSortedColIndB,
-    const cusparseMatDescr_t descrC, float *csrSortedValC,
-    int *csrSortedRowPtrC, int *csrSortedColIndC, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const float *, const cusparseMatDescr_t, int,
-      const float *, const int *, const int *, const float *,
-      const cusparseMatDescr_t, int, const float *, const int *, const int *,
-      const cusparseMatDescr_t, float *, int *, int *, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsrgeam2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, alpha, descrA, nnzA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, beta, descrB, nnzB,
-                  csrSortedValB, csrSortedRowPtrB, csrSortedColIndB, descrC,
-                  csrSortedValC, csrSortedRowPtrC, csrSortedColIndC, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsrgeam2(
-    cusparseHandle_t handle, int m, int n, const double *alpha,
-    const cusparseMatDescr_t descrA, int nnzA, const double *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-    const double *beta, const cusparseMatDescr_t descrB, int nnzB,
-    const double *csrSortedValB, const int *csrSortedRowPtrB,
-    const int *csrSortedColIndB, const cusparseMatDescr_t descrC,
-    double *csrSortedValC, int *csrSortedRowPtrC, int *csrSortedColIndC,
-    void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const double *, const cusparseMatDescr_t, int,
-      const double *, const int *, const int *, const double *,
-      const cusparseMatDescr_t, int, const double *, const int *, const int *,
-      const cusparseMatDescr_t, double *, int *, int *, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsrgeam2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, alpha, descrA, nnzA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, beta, descrB, nnzB,
-                  csrSortedValB, csrSortedRowPtrB, csrSortedColIndB, descrC,
-                  csrSortedValC, csrSortedRowPtrC, csrSortedColIndC, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsrgeam2(
-    cusparseHandle_t handle, int m, int n, const cuComplex *alpha,
-    const cusparseMatDescr_t descrA, int nnzA, const cuComplex *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-    const cuComplex *beta, const cusparseMatDescr_t descrB, int nnzB,
-    const cuComplex *csrSortedValB, const int *csrSortedRowPtrB,
-    const int *csrSortedColIndB, const cusparseMatDescr_t descrC,
-    cuComplex *csrSortedValC, int *csrSortedRowPtrC, int *csrSortedColIndC,
-    void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cuComplex *, const cusparseMatDescr_t,
-      int, const cuComplex *, const int *, const int *, const cuComplex *,
-      const cusparseMatDescr_t, int, const cuComplex *, const int *,
-      const int *, const cusparseMatDescr_t, cuComplex *, int *, int *, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsrgeam2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, alpha, descrA, nnzA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, beta, descrB, nnzB,
-                  csrSortedValB, csrSortedRowPtrB, csrSortedColIndB, descrC,
-                  csrSortedValC, csrSortedRowPtrC, csrSortedColIndC, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsrgeam2(
-    cusparseHandle_t handle, int m, int n, const cuDoubleComplex *alpha,
-    const cusparseMatDescr_t descrA, int nnzA,
-    const cuDoubleComplex *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, const cuDoubleComplex *beta,
-    const cusparseMatDescr_t descrB, int nnzB,
-    const cuDoubleComplex *csrSortedValB, const int *csrSortedRowPtrB,
-    const int *csrSortedColIndB, const cusparseMatDescr_t descrC,
-    cuDoubleComplex *csrSortedValC, int *csrSortedRowPtrC,
-    int *csrSortedColIndC, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cuDoubleComplex *,
-      const cusparseMatDescr_t, int, const cuDoubleComplex *, const int *,
-      const int *, const cuDoubleComplex *, const cusparseMatDescr_t, int,
-      const cuDoubleComplex *, const int *, const int *,
-      const cusparseMatDescr_t, cuDoubleComplex *, int *, int *, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsrgeam2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, alpha, descrA, nnzA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, beta, descrB, nnzB,
-                  csrSortedValB, csrSortedRowPtrB, csrSortedColIndB, descrC,
-                  csrSortedValC, csrSortedRowPtrC, csrSortedColIndC, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsrcolor(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    const float *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, const float *fractionToColor, int *ncolors,
-    int *coloring, int *reordering, const cusparseColorInfo_t info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, const float *,
-      const int *, const int *, const float *, int *, int *, int *,
-      const cusparseColorInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsrcolor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, fractionToColor, ncolors, coloring,
-                  reordering, info);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsrcolor(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    const double *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, const double *fractionToColor, int *ncolors,
-    int *coloring, int *reordering, const cusparseColorInfo_t info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, const double *,
-      const int *, const int *, const double *, int *, int *, int *,
-      const cusparseColorInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsrcolor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, fractionToColor, ncolors, coloring,
-                  reordering, info);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsrcolor(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    const cuComplex *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, const float *fractionToColor, int *ncolors,
-    int *coloring, int *reordering, const cusparseColorInfo_t info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, const cuComplex *,
-      const int *, const int *, const float *, int *, int *, int *,
-      const cusparseColorInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsrcolor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, fractionToColor, ncolors, coloring,
-                  reordering, info);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsrcolor(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    const cuDoubleComplex *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, const double *fractionToColor, int *ncolors,
-    int *coloring, int *reordering, const cusparseColorInfo_t info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t,
-      const cuDoubleComplex *, const int *, const int *, const double *, int *,
-      int *, int *, const cusparseColorInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsrcolor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, fractionToColor, ncolors, coloring,
-                  reordering, info);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseSnnz(cusparseHandle_t handle, cusparseDirection_t dirA, int m, int n,
-             const cusparseMatDescr_t descrA, const float *A, int lda,
-             int *nnzPerRowCol, int *nnzTotalDevHostPtr) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const float *, int, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSnnz");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, m, n, descrA, A, lda, nnzPerRowCol,
-                  nnzTotalDevHostPtr);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseDnnz(cusparseHandle_t handle, cusparseDirection_t dirA, int m, int n,
-             const cusparseMatDescr_t descrA, const double *A, int lda,
-             int *nnzPerRowCol, int *nnzTotalDevHostPtr) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const double *, int, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDnnz");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, m, n, descrA, A, lda, nnzPerRowCol,
-                  nnzTotalDevHostPtr);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseCnnz(cusparseHandle_t handle, cusparseDirection_t dirA, int m, int n,
-             const cusparseMatDescr_t descrA, const cuComplex *A, int lda,
-             int *nnzPerRowCol, int *nnzTotalDevHostPtr) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const cuComplex *, int, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCnnz");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, m, n, descrA, A, lda, nnzPerRowCol,
-                  nnzTotalDevHostPtr);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseZnnz(cusparseHandle_t handle, cusparseDirection_t dirA, int m, int n,
-             const cusparseMatDescr_t descrA, const cuDoubleComplex *A, int lda,
-             int *nnzPerRowCol, int *nnzTotalDevHostPtr) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const cuDoubleComplex *, int, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZnnz");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, m, n, descrA, A, lda, nnzPerRowCol,
-                  nnzTotalDevHostPtr);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSnnz_compress(
-    cusparseHandle_t handle, int m, const cusparseMatDescr_t descr,
-    const float *csrSortedValA, const int *csrSortedRowPtrA, int *nnzPerRow,
-    int *nnzC, float tol) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, const cusparseMatDescr_t, const float *,
-      const int *, int *, int *, float);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSnnz_compress");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, descr, csrSortedValA, csrSortedRowPtrA, nnzPerRow,
-                  nnzC, tol);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDnnz_compress(
-    cusparseHandle_t handle, int m, const cusparseMatDescr_t descr,
-    const double *csrSortedValA, const int *csrSortedRowPtrA, int *nnzPerRow,
-    int *nnzC, double tol) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, const cusparseMatDescr_t, const double *,
-      const int *, int *, int *, double);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDnnz_compress");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, descr, csrSortedValA, csrSortedRowPtrA, nnzPerRow,
-                  nnzC, tol);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCnnz_compress(
-    cusparseHandle_t handle, int m, const cusparseMatDescr_t descr,
-    const cuComplex *csrSortedValA, const int *csrSortedRowPtrA, int *nnzPerRow,
-    int *nnzC, cuComplex tol) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, const cusparseMatDescr_t, const cuComplex *,
-      const int *, int *, int *, cuComplex);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCnnz_compress");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, descr, csrSortedValA, csrSortedRowPtrA, nnzPerRow,
-                  nnzC, tol);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZnnz_compress(
-    cusparseHandle_t handle, int m, const cusparseMatDescr_t descr,
-    const cuDoubleComplex *csrSortedValA, const int *csrSortedRowPtrA,
-    int *nnzPerRow, int *nnzC, cuDoubleComplex tol) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, const cusparseMatDescr_t, const cuDoubleComplex *,
-      const int *, int *, int *, cuDoubleComplex);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZnnz_compress");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, descr, csrSortedValA, csrSortedRowPtrA, nnzPerRow,
-                  nnzC, tol);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsr2csr_compress(
-    cusparseHandle_t handle, int m, int n, const cusparseMatDescr_t descrA,
-    const float *csrSortedValA, const int *csrSortedColIndA,
-    const int *csrSortedRowPtrA, int nnzA, const int *nnzPerRow,
-    float *csrSortedValC, int *csrSortedColIndC, int *csrSortedRowPtrC,
-    float tol) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, const float *,
-      const int *, const int *, int, const int *, float *, int *, int *, float);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsr2csr_compress");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, descrA, csrSortedValA, csrSortedColIndA,
-                  csrSortedRowPtrA, nnzA, nnzPerRow, csrSortedValC,
-                  csrSortedColIndC, csrSortedRowPtrC, tol);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsr2csr_compress(
-    cusparseHandle_t handle, int m, int n, const cusparseMatDescr_t descrA,
-    const double *csrSortedValA, const int *csrSortedColIndA,
-    const int *csrSortedRowPtrA, int nnzA, const int *nnzPerRow,
-    double *csrSortedValC, int *csrSortedColIndC, int *csrSortedRowPtrC,
-    double tol) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, const double *,
-      const int *, const int *, int, const int *, double *, int *, int *,
-      double);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsr2csr_compress");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, descrA, csrSortedValA, csrSortedColIndA,
-                  csrSortedRowPtrA, nnzA, nnzPerRow, csrSortedValC,
-                  csrSortedColIndC, csrSortedRowPtrC, tol);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsr2csr_compress(
-    cusparseHandle_t handle, int m, int n, const cusparseMatDescr_t descrA,
-    const cuComplex *csrSortedValA, const int *csrSortedColIndA,
-    const int *csrSortedRowPtrA, int nnzA, const int *nnzPerRow,
-    cuComplex *csrSortedValC, int *csrSortedColIndC, int *csrSortedRowPtrC,
-    cuComplex tol) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, const cuComplex *,
-      const int *, const int *, int, const int *, cuComplex *, int *, int *,
-      cuComplex);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsr2csr_compress");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, descrA, csrSortedValA, csrSortedColIndA,
-                  csrSortedRowPtrA, nnzA, nnzPerRow, csrSortedValC,
-                  csrSortedColIndC, csrSortedRowPtrC, tol);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsr2csr_compress(
-    cusparseHandle_t handle, int m, int n, const cusparseMatDescr_t descrA,
-    const cuDoubleComplex *csrSortedValA, const int *csrSortedColIndA,
-    const int *csrSortedRowPtrA, int nnzA, const int *nnzPerRow,
-    cuDoubleComplex *csrSortedValC, int *csrSortedColIndC,
-    int *csrSortedRowPtrC, cuDoubleComplex tol) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t,
-      const cuDoubleComplex *, const int *, const int *, int, const int *,
-      cuDoubleComplex *, int *, int *, cuDoubleComplex);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsr2csr_compress");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, descrA, csrSortedValA, csrSortedColIndA,
-                  csrSortedRowPtrA, nnzA, nnzPerRow, csrSortedValC,
-                  csrSortedColIndC, csrSortedRowPtrC, tol);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSdense2csr(
-    cusparseHandle_t handle, int m, int n, const cusparseMatDescr_t descrA,
-    const float *A, int lda, const int *nnzPerRow, float *csrSortedValA,
-    int *csrSortedRowPtrA, int *csrSortedColIndA) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, const float *, int,
-      const int *, float *, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSdense2csr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, descrA, A, lda, nnzPerRow, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDdense2csr(
-    cusparseHandle_t handle, int m, int n, const cusparseMatDescr_t descrA,
-    const double *A, int lda, const int *nnzPerRow, double *csrSortedValA,
-    int *csrSortedRowPtrA, int *csrSortedColIndA) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, const double *, int,
-      const int *, double *, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDdense2csr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, descrA, A, lda, nnzPerRow, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCdense2csr(
-    cusparseHandle_t handle, int m, int n, const cusparseMatDescr_t descrA,
-    const cuComplex *A, int lda, const int *nnzPerRow, cuComplex *csrSortedValA,
-    int *csrSortedRowPtrA, int *csrSortedColIndA) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, const cuComplex *,
-      int, const int *, cuComplex *, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCdense2csr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, descrA, A, lda, nnzPerRow, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZdense2csr(
-    cusparseHandle_t handle, int m, int n, const cusparseMatDescr_t descrA,
-    const cuDoubleComplex *A, int lda, const int *nnzPerRow,
-    cuDoubleComplex *csrSortedValA, int *csrSortedRowPtrA,
-    int *csrSortedColIndA) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t,
-      const cuDoubleComplex *, int, const int *, cuDoubleComplex *, int *,
-      int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZdense2csr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, descrA, A, lda, nnzPerRow, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsr2dense(
-    cusparseHandle_t handle, int m, int n, const cusparseMatDescr_t descrA,
-    const float *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, float *A, int lda) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, const float *,
-      const int *, const int *, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsr2dense");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, A, lda);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsr2dense(
-    cusparseHandle_t handle, int m, int n, const cusparseMatDescr_t descrA,
-    const double *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, double *A, int lda) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, const double *,
-      const int *, const int *, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsr2dense");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, A, lda);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsr2dense(
-    cusparseHandle_t handle, int m, int n, const cusparseMatDescr_t descrA,
-    const cuComplex *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, cuComplex *A, int lda) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, const cuComplex *,
-      const int *, const int *, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsr2dense");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, A, lda);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsr2dense(
-    cusparseHandle_t handle, int m, int n, const cusparseMatDescr_t descrA,
-    const cuDoubleComplex *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, cuDoubleComplex *A, int lda) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t,
-      const cuDoubleComplex *, const int *, const int *, cuDoubleComplex *,
-      int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsr2dense");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, A, lda);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSdense2csc(
-    cusparseHandle_t handle, int m, int n, const cusparseMatDescr_t descrA,
-    const float *A, int lda, const int *nnzPerCol, float *cscSortedValA,
-    int *cscSortedRowIndA, int *cscSortedColPtrA) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, const float *, int,
-      const int *, float *, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSdense2csc");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, descrA, A, lda, nnzPerCol, cscSortedValA,
-                  cscSortedRowIndA, cscSortedColPtrA);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDdense2csc(
-    cusparseHandle_t handle, int m, int n, const cusparseMatDescr_t descrA,
-    const double *A, int lda, const int *nnzPerCol, double *cscSortedValA,
-    int *cscSortedRowIndA, int *cscSortedColPtrA) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, const double *, int,
-      const int *, double *, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDdense2csc");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, descrA, A, lda, nnzPerCol, cscSortedValA,
-                  cscSortedRowIndA, cscSortedColPtrA);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCdense2csc(
-    cusparseHandle_t handle, int m, int n, const cusparseMatDescr_t descrA,
-    const cuComplex *A, int lda, const int *nnzPerCol, cuComplex *cscSortedValA,
-    int *cscSortedRowIndA, int *cscSortedColPtrA) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, const cuComplex *,
-      int, const int *, cuComplex *, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCdense2csc");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, descrA, A, lda, nnzPerCol, cscSortedValA,
-                  cscSortedRowIndA, cscSortedColPtrA);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZdense2csc(
-    cusparseHandle_t handle, int m, int n, const cusparseMatDescr_t descrA,
-    const cuDoubleComplex *A, int lda, const int *nnzPerCol,
-    cuDoubleComplex *cscSortedValA, int *cscSortedRowIndA,
-    int *cscSortedColPtrA) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t,
-      const cuDoubleComplex *, int, const int *, cuDoubleComplex *, int *,
-      int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZdense2csc");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, descrA, A, lda, nnzPerCol, cscSortedValA,
-                  cscSortedRowIndA, cscSortedColPtrA);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsc2dense(
-    cusparseHandle_t handle, int m, int n, const cusparseMatDescr_t descrA,
-    const float *cscSortedValA, const int *cscSortedRowIndA,
-    const int *cscSortedColPtrA, float *A, int lda) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, const float *,
-      const int *, const int *, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsc2dense");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, descrA, cscSortedValA, cscSortedRowIndA,
-                  cscSortedColPtrA, A, lda);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsc2dense(
-    cusparseHandle_t handle, int m, int n, const cusparseMatDescr_t descrA,
-    const double *cscSortedValA, const int *cscSortedRowIndA,
-    const int *cscSortedColPtrA, double *A, int lda) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, const double *,
-      const int *, const int *, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsc2dense");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, descrA, cscSortedValA, cscSortedRowIndA,
-                  cscSortedColPtrA, A, lda);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsc2dense(
-    cusparseHandle_t handle, int m, int n, const cusparseMatDescr_t descrA,
-    const cuComplex *cscSortedValA, const int *cscSortedRowIndA,
-    const int *cscSortedColPtrA, cuComplex *A, int lda) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, const cuComplex *,
-      const int *, const int *, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsc2dense");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, descrA, cscSortedValA, cscSortedRowIndA,
-                  cscSortedColPtrA, A, lda);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsc2dense(
-    cusparseHandle_t handle, int m, int n, const cusparseMatDescr_t descrA,
-    const cuDoubleComplex *cscSortedValA, const int *cscSortedRowIndA,
-    const int *cscSortedColPtrA, cuDoubleComplex *A, int lda) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t,
-      const cuDoubleComplex *, const int *, const int *, cuDoubleComplex *,
-      int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsc2dense");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, descrA, cscSortedValA, cscSortedRowIndA,
-                  cscSortedColPtrA, A, lda);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseXcoo2csr(cusparseHandle_t handle,
-                                              const int *cooRowInd, int nnz,
-                                              int m, int *csrSortedRowPtr,
-                                              cusparseIndexBase_t idxBase) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, const int *, int, int, int *, cusparseIndexBase_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseXcoo2csr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, cooRowInd, nnz, m, csrSortedRowPtr, idxBase);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseXcsr2coo(cusparseHandle_t handle,
-                                              const int *csrSortedRowPtr,
-                                              int nnz, int m, int *cooRowInd,
-                                              cusparseIndexBase_t idxBase) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, const int *, int, int, int *, cusparseIndexBase_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseXcsr2coo");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, csrSortedRowPtr, nnz, m, cooRowInd, idxBase);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCsr2cscEx(
-    cusparseHandle_t handle, int m, int n, int nnz, const void *csrSortedVal,
-    cudaDataType csrSortedValtype, const int *csrSortedRowPtr,
-    const int *csrSortedColInd, void *cscSortedVal,
-    cudaDataType cscSortedValtype, int *cscSortedRowInd, int *cscSortedColPtr,
-    cusparseAction_t copyValues, cusparseIndexBase_t idxBase,
-    cudaDataType executiontype) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const void *, cudaDataType, const int *,
-      const int *, void *, cudaDataType, int *, int *, cusparseAction_t,
-      cusparseIndexBase_t, cudaDataType);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCsr2cscEx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnz, csrSortedVal, csrSortedValtype,
-                  csrSortedRowPtr, csrSortedColInd, cscSortedVal,
-                  cscSortedValtype, cscSortedRowInd, cscSortedColPtr,
-                  copyValues, idxBase, executiontype);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsr2csc(
-    cusparseHandle_t handle, int m, int n, int nnz, const float *csrSortedVal,
-    const int *csrSortedRowPtr, const int *csrSortedColInd, float *cscSortedVal,
-    int *cscSortedRowInd, int *cscSortedColPtr, cusparseAction_t copyValues,
-    cusparseIndexBase_t idxBase) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const float *, const int *, const int *,
-      float *, int *, int *, cusparseAction_t, cusparseIndexBase_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsr2csc");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnz, csrSortedVal, csrSortedRowPtr,
-                  csrSortedColInd, cscSortedVal, cscSortedRowInd,
-                  cscSortedColPtr, copyValues, idxBase);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsr2csc(
-    cusparseHandle_t handle, int m, int n, int nnz, const double *csrSortedVal,
-    const int *csrSortedRowPtr, const int *csrSortedColInd,
-    double *cscSortedVal, int *cscSortedRowInd, int *cscSortedColPtr,
-    cusparseAction_t copyValues, cusparseIndexBase_t idxBase) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const double *, const int *, const int *,
-      double *, int *, int *, cusparseAction_t, cusparseIndexBase_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsr2csc");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnz, csrSortedVal, csrSortedRowPtr,
-                  csrSortedColInd, cscSortedVal, cscSortedRowInd,
-                  cscSortedColPtr, copyValues, idxBase);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseCcsr2csc(cusparseHandle_t handle, int m, int n, int nnz,
-                 const cuComplex *csrSortedVal, const int *csrSortedRowPtr,
-                 const int *csrSortedColInd, cuComplex *cscSortedVal,
-                 int *cscSortedRowInd, int *cscSortedColPtr,
-                 cusparseAction_t copyValues, cusparseIndexBase_t idxBase) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const cuComplex *, const int *,
-      const int *, cuComplex *, int *, int *, cusparseAction_t,
-      cusparseIndexBase_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsr2csc");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnz, csrSortedVal, csrSortedRowPtr,
-                  csrSortedColInd, cscSortedVal, cscSortedRowInd,
-                  cscSortedColPtr, copyValues, idxBase);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsr2csc(
-    cusparseHandle_t handle, int m, int n, int nnz,
-    const cuDoubleComplex *csrSortedVal, const int *csrSortedRowPtr,
-    const int *csrSortedColInd, cuDoubleComplex *cscSortedVal,
-    int *cscSortedRowInd, int *cscSortedColPtr, cusparseAction_t copyValues,
-    cusparseIndexBase_t idxBase) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const cuDoubleComplex *, const int *,
-      const int *, cuDoubleComplex *, int *, int *, cusparseAction_t,
-      cusparseIndexBase_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsr2csc");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnz, csrSortedVal, csrSortedRowPtr,
-                  csrSortedColInd, cscSortedVal, cscSortedRowInd,
-                  cscSortedColPtr, copyValues, idxBase);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSdense2hyb(
-    cusparseHandle_t handle, int m, int n, const cusparseMatDescr_t descrA,
-    const float *A, int lda, const int *nnzPerRow, cusparseHybMat_t hybA,
-    int userEllWidth, cusparseHybPartition_t partitionType) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, const float *, int,
-      const int *, cusparseHybMat_t, int, cusparseHybPartition_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSdense2hyb");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, descrA, A, lda, nnzPerRow, hybA, userEllWidth,
-                  partitionType);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDdense2hyb(
-    cusparseHandle_t handle, int m, int n, const cusparseMatDescr_t descrA,
-    const double *A, int lda, const int *nnzPerRow, cusparseHybMat_t hybA,
-    int userEllWidth, cusparseHybPartition_t partitionType) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, const double *, int,
-      const int *, cusparseHybMat_t, int, cusparseHybPartition_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDdense2hyb");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, descrA, A, lda, nnzPerRow, hybA, userEllWidth,
-                  partitionType);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCdense2hyb(
-    cusparseHandle_t handle, int m, int n, const cusparseMatDescr_t descrA,
-    const cuComplex *A, int lda, const int *nnzPerRow, cusparseHybMat_t hybA,
-    int userEllWidth, cusparseHybPartition_t partitionType) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, const cuComplex *,
-      int, const int *, cusparseHybMat_t, int, cusparseHybPartition_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCdense2hyb");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, descrA, A, lda, nnzPerRow, hybA, userEllWidth,
-                  partitionType);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseZdense2hyb(cusparseHandle_t handle, int m, int n,
-                   const cusparseMatDescr_t descrA, const cuDoubleComplex *A,
-                   int lda, const int *nnzPerRow, cusparseHybMat_t hybA,
-                   int userEllWidth, cusparseHybPartition_t partitionType) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t,
-      const cuDoubleComplex *, int, const int *, cusparseHybMat_t, int,
-      cusparseHybPartition_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZdense2hyb");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, descrA, A, lda, nnzPerRow, hybA, userEllWidth,
-                  partitionType);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseShyb2dense(cusparseHandle_t handle,
-                                                const cusparseMatDescr_t descrA,
-                                                const cusparseHybMat_t hybA,
-                                                float *A, int lda) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, const cusparseMatDescr_t, const cusparseHybMat_t,
-      float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseShyb2dense");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, descrA, hybA, A, lda);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDhyb2dense(cusparseHandle_t handle,
-                                                const cusparseMatDescr_t descrA,
-                                                const cusparseHybMat_t hybA,
-                                                double *A, int lda) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, const cusparseMatDescr_t, const cusparseHybMat_t,
-      double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDhyb2dense");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, descrA, hybA, A, lda);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseChyb2dense(cusparseHandle_t handle,
-                                                const cusparseMatDescr_t descrA,
-                                                const cusparseHybMat_t hybA,
-                                                cuComplex *A, int lda) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, const cusparseMatDescr_t, const cusparseHybMat_t,
-      cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseChyb2dense");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, descrA, hybA, A, lda);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZhyb2dense(cusparseHandle_t handle,
-                                                const cusparseMatDescr_t descrA,
-                                                const cusparseHybMat_t hybA,
-                                                cuDoubleComplex *A, int lda) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, const cusparseMatDescr_t, const cusparseHybMat_t,
-      cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZhyb2dense");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, descrA, hybA, A, lda);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsr2hyb(
-    cusparseHandle_t handle, int m, int n, const cusparseMatDescr_t descrA,
-    const float *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, cusparseHybMat_t hybA, int userEllWidth,
-    cusparseHybPartition_t partitionType) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, const float *,
-      const int *, const int *, cusparseHybMat_t, int, cusparseHybPartition_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsr2hyb");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, hybA, userEllWidth, partitionType);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsr2hyb(
-    cusparseHandle_t handle, int m, int n, const cusparseMatDescr_t descrA,
-    const double *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, cusparseHybMat_t hybA, int userEllWidth,
-    cusparseHybPartition_t partitionType) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, const double *,
-      const int *, const int *, cusparseHybMat_t, int, cusparseHybPartition_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsr2hyb");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, hybA, userEllWidth, partitionType);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsr2hyb(
-    cusparseHandle_t handle, int m, int n, const cusparseMatDescr_t descrA,
-    const cuComplex *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, cusparseHybMat_t hybA, int userEllWidth,
-    cusparseHybPartition_t partitionType) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, const cuComplex *,
-      const int *, const int *, cusparseHybMat_t, int, cusparseHybPartition_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsr2hyb");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, hybA, userEllWidth, partitionType);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsr2hyb(
-    cusparseHandle_t handle, int m, int n, const cusparseMatDescr_t descrA,
-    const cuDoubleComplex *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, cusparseHybMat_t hybA, int userEllWidth,
-    cusparseHybPartition_t partitionType) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t,
-      const cuDoubleComplex *, const int *, const int *, cusparseHybMat_t, int,
-      cusparseHybPartition_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsr2hyb");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, hybA, userEllWidth, partitionType);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseShyb2csr(cusparseHandle_t handle,
-                                              const cusparseMatDescr_t descrA,
-                                              const cusparseHybMat_t hybA,
-                                              float *csrSortedValA,
-                                              int *csrSortedRowPtrA,
-                                              int *csrSortedColIndA) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, const cusparseMatDescr_t, const cusparseHybMat_t,
-      float *, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseShyb2csr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, descrA, hybA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDhyb2csr(cusparseHandle_t handle,
-                                              const cusparseMatDescr_t descrA,
-                                              const cusparseHybMat_t hybA,
-                                              double *csrSortedValA,
-                                              int *csrSortedRowPtrA,
-                                              int *csrSortedColIndA) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, const cusparseMatDescr_t, const cusparseHybMat_t,
-      double *, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDhyb2csr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, descrA, hybA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseChyb2csr(cusparseHandle_t handle,
-                                              const cusparseMatDescr_t descrA,
-                                              const cusparseHybMat_t hybA,
-                                              cuComplex *csrSortedValA,
-                                              int *csrSortedRowPtrA,
-                                              int *csrSortedColIndA) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, const cusparseMatDescr_t, const cusparseHybMat_t,
-      cuComplex *, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseChyb2csr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, descrA, hybA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZhyb2csr(cusparseHandle_t handle,
-                                              const cusparseMatDescr_t descrA,
-                                              const cusparseHybMat_t hybA,
-                                              cuDoubleComplex *csrSortedValA,
-                                              int *csrSortedRowPtrA,
-                                              int *csrSortedColIndA) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, const cusparseMatDescr_t, const cusparseHybMat_t,
-      cuDoubleComplex *, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZhyb2csr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, descrA, hybA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsc2hyb(
-    cusparseHandle_t handle, int m, int n, const cusparseMatDescr_t descrA,
-    const float *cscSortedValA, const int *cscSortedRowIndA,
-    const int *cscSortedColPtrA, cusparseHybMat_t hybA, int userEllWidth,
-    cusparseHybPartition_t partitionType) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, const float *,
-      const int *, const int *, cusparseHybMat_t, int, cusparseHybPartition_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsc2hyb");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, descrA, cscSortedValA, cscSortedRowIndA,
-                  cscSortedColPtrA, hybA, userEllWidth, partitionType);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsc2hyb(
-    cusparseHandle_t handle, int m, int n, const cusparseMatDescr_t descrA,
-    const double *cscSortedValA, const int *cscSortedRowIndA,
-    const int *cscSortedColPtrA, cusparseHybMat_t hybA, int userEllWidth,
-    cusparseHybPartition_t partitionType) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, const double *,
-      const int *, const int *, cusparseHybMat_t, int, cusparseHybPartition_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsc2hyb");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, descrA, cscSortedValA, cscSortedRowIndA,
-                  cscSortedColPtrA, hybA, userEllWidth, partitionType);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsc2hyb(
-    cusparseHandle_t handle, int m, int n, const cusparseMatDescr_t descrA,
-    const cuComplex *cscSortedValA, const int *cscSortedRowIndA,
-    const int *cscSortedColPtrA, cusparseHybMat_t hybA, int userEllWidth,
-    cusparseHybPartition_t partitionType) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, const cuComplex *,
-      const int *, const int *, cusparseHybMat_t, int, cusparseHybPartition_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsc2hyb");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, descrA, cscSortedValA, cscSortedRowIndA,
-                  cscSortedColPtrA, hybA, userEllWidth, partitionType);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsc2hyb(
-    cusparseHandle_t handle, int m, int n, const cusparseMatDescr_t descrA,
-    const cuDoubleComplex *cscSortedValA, const int *cscSortedRowIndA,
-    const int *cscSortedColPtrA, cusparseHybMat_t hybA, int userEllWidth,
-    cusparseHybPartition_t partitionType) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t,
-      const cuDoubleComplex *, const int *, const int *, cusparseHybMat_t, int,
-      cusparseHybPartition_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsc2hyb");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, descrA, cscSortedValA, cscSortedRowIndA,
-                  cscSortedColPtrA, hybA, userEllWidth, partitionType);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseShyb2csc(cusparseHandle_t handle,
-                                              const cusparseMatDescr_t descrA,
-                                              const cusparseHybMat_t hybA,
-                                              float *cscSortedVal,
-                                              int *cscSortedRowInd,
-                                              int *cscSortedColPtr) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, const cusparseMatDescr_t, const cusparseHybMat_t,
-      float *, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseShyb2csc");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, descrA, hybA, cscSortedVal, cscSortedRowInd,
-                  cscSortedColPtr);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDhyb2csc(cusparseHandle_t handle,
-                                              const cusparseMatDescr_t descrA,
-                                              const cusparseHybMat_t hybA,
-                                              double *cscSortedVal,
-                                              int *cscSortedRowInd,
-                                              int *cscSortedColPtr) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, const cusparseMatDescr_t, const cusparseHybMat_t,
-      double *, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDhyb2csc");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, descrA, hybA, cscSortedVal, cscSortedRowInd,
-                  cscSortedColPtr);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseChyb2csc(cusparseHandle_t handle,
-                                              const cusparseMatDescr_t descrA,
-                                              const cusparseHybMat_t hybA,
-                                              cuComplex *cscSortedVal,
-                                              int *cscSortedRowInd,
-                                              int *cscSortedColPtr) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, const cusparseMatDescr_t, const cusparseHybMat_t,
-      cuComplex *, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseChyb2csc");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, descrA, hybA, cscSortedVal, cscSortedRowInd,
-                  cscSortedColPtr);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZhyb2csc(cusparseHandle_t handle,
-                                              const cusparseMatDescr_t descrA,
-                                              const cusparseHybMat_t hybA,
-                                              cuDoubleComplex *cscSortedVal,
-                                              int *cscSortedRowInd,
-                                              int *cscSortedColPtr) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, const cusparseMatDescr_t, const cusparseHybMat_t,
-      cuDoubleComplex *, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZhyb2csc");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, descrA, hybA, cscSortedVal, cscSortedRowInd,
-                  cscSortedColPtr);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseXcsr2bsrNnz(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int m, int n,
-    const cusparseMatDescr_t descrA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, int blockDim, const cusparseMatDescr_t descrC,
-    int *bsrSortedRowPtrC, int *nnzTotalDevHostPtr) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const int *, const int *, int, const cusparseMatDescr_t, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseXcsr2bsrNnz");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, m, n, descrA, csrSortedRowPtrA,
-                  csrSortedColIndA, blockDim, descrC, bsrSortedRowPtrC,
-                  nnzTotalDevHostPtr);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsr2bsr(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int m, int n,
-    const cusparseMatDescr_t descrA, const float *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA, int blockDim,
-    const cusparseMatDescr_t descrC, float *bsrSortedValC,
-    int *bsrSortedRowPtrC, int *bsrSortedColIndC) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const float *, const int *, const int *, int, const cusparseMatDescr_t,
-      float *, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsr2bsr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, m, n, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, blockDim, descrC, bsrSortedValC,
-                  bsrSortedRowPtrC, bsrSortedColIndC);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsr2bsr(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int m, int n,
-    const cusparseMatDescr_t descrA, const double *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA, int blockDim,
-    const cusparseMatDescr_t descrC, double *bsrSortedValC,
-    int *bsrSortedRowPtrC, int *bsrSortedColIndC) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const double *, const int *, const int *, int, const cusparseMatDescr_t,
-      double *, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsr2bsr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, m, n, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, blockDim, descrC, bsrSortedValC,
-                  bsrSortedRowPtrC, bsrSortedColIndC);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsr2bsr(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int m, int n,
-    const cusparseMatDescr_t descrA, const cuComplex *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA, int blockDim,
-    const cusparseMatDescr_t descrC, cuComplex *bsrSortedValC,
-    int *bsrSortedRowPtrC, int *bsrSortedColIndC) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const cuComplex *, const int *, const int *, int,
-      const cusparseMatDescr_t, cuComplex *, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsr2bsr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, m, n, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, blockDim, descrC, bsrSortedValC,
-                  bsrSortedRowPtrC, bsrSortedColIndC);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsr2bsr(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int m, int n,
-    const cusparseMatDescr_t descrA, const cuDoubleComplex *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA, int blockDim,
-    const cusparseMatDescr_t descrC, cuDoubleComplex *bsrSortedValC,
-    int *bsrSortedRowPtrC, int *bsrSortedColIndC) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const cuDoubleComplex *, const int *, const int *, int,
-      const cusparseMatDescr_t, cuDoubleComplex *, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsr2bsr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, m, n, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, blockDim, descrC, bsrSortedValC,
-                  bsrSortedRowPtrC, bsrSortedColIndC);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSbsr2csr(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nb,
-    const cusparseMatDescr_t descrA, const float *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int blockDim,
-    const cusparseMatDescr_t descrC, float *csrSortedValC,
-    int *csrSortedRowPtrC, int *csrSortedColIndC) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const float *, const int *, const int *, int, const cusparseMatDescr_t,
-      float *, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSbsr2csr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nb, descrA, bsrSortedValA, bsrSortedRowPtrA,
-                  bsrSortedColIndA, blockDim, descrC, csrSortedValC,
-                  csrSortedRowPtrC, csrSortedColIndC);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDbsr2csr(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nb,
-    const cusparseMatDescr_t descrA, const double *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int blockDim,
-    const cusparseMatDescr_t descrC, double *csrSortedValC,
-    int *csrSortedRowPtrC, int *csrSortedColIndC) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const double *, const int *, const int *, int, const cusparseMatDescr_t,
-      double *, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDbsr2csr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nb, descrA, bsrSortedValA, bsrSortedRowPtrA,
-                  bsrSortedColIndA, blockDim, descrC, csrSortedValC,
-                  csrSortedRowPtrC, csrSortedColIndC);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCbsr2csr(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nb,
-    const cusparseMatDescr_t descrA, const cuComplex *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int blockDim,
-    const cusparseMatDescr_t descrC, cuComplex *csrSortedValC,
-    int *csrSortedRowPtrC, int *csrSortedColIndC) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const cuComplex *, const int *, const int *, int,
-      const cusparseMatDescr_t, cuComplex *, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCbsr2csr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nb, descrA, bsrSortedValA, bsrSortedRowPtrA,
-                  bsrSortedColIndA, blockDim, descrC, csrSortedValC,
-                  csrSortedRowPtrC, csrSortedColIndC);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZbsr2csr(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nb,
-    const cusparseMatDescr_t descrA, const cuDoubleComplex *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int blockDim,
-    const cusparseMatDescr_t descrC, cuDoubleComplex *csrSortedValC,
-    int *csrSortedRowPtrC, int *csrSortedColIndC) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const cuDoubleComplex *, const int *, const int *, int,
-      const cusparseMatDescr_t, cuDoubleComplex *, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZbsr2csr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nb, descrA, bsrSortedValA, bsrSortedRowPtrA,
-                  bsrSortedColIndA, blockDim, descrC, csrSortedValC,
-                  csrSortedRowPtrC, csrSortedColIndC);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSgebsr2gebsc_bufferSize(
-    cusparseHandle_t handle, int mb, int nb, int nnzb,
-    const float *bsrSortedVal, const int *bsrSortedRowPtr,
-    const int *bsrSortedColInd, int rowBlockDim, int colBlockDim,
-    int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const float *, const int *, const int *,
-      int, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSgebsr2gebsc_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, mb, nb, nnzb, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, rowBlockDim, colBlockDim,
-                  pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDgebsr2gebsc_bufferSize(
-    cusparseHandle_t handle, int mb, int nb, int nnzb,
-    const double *bsrSortedVal, const int *bsrSortedRowPtr,
-    const int *bsrSortedColInd, int rowBlockDim, int colBlockDim,
-    int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const double *, const int *, const int *,
-      int, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDgebsr2gebsc_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, mb, nb, nnzb, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, rowBlockDim, colBlockDim,
-                  pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCgebsr2gebsc_bufferSize(
-    cusparseHandle_t handle, int mb, int nb, int nnzb,
-    const cuComplex *bsrSortedVal, const int *bsrSortedRowPtr,
-    const int *bsrSortedColInd, int rowBlockDim, int colBlockDim,
-    int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const cuComplex *, const int *,
-      const int *, int, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCgebsr2gebsc_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, mb, nb, nnzb, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, rowBlockDim, colBlockDim,
-                  pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZgebsr2gebsc_bufferSize(
-    cusparseHandle_t handle, int mb, int nb, int nnzb,
-    const cuDoubleComplex *bsrSortedVal, const int *bsrSortedRowPtr,
-    const int *bsrSortedColInd, int rowBlockDim, int colBlockDim,
-    int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const cuDoubleComplex *, const int *,
-      const int *, int, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZgebsr2gebsc_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, mb, nb, nnzb, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, rowBlockDim, colBlockDim,
-                  pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSgebsr2gebsc_bufferSizeExt(
-    cusparseHandle_t handle, int mb, int nb, int nnzb,
-    const float *bsrSortedVal, const int *bsrSortedRowPtr,
-    const int *bsrSortedColInd, int rowBlockDim, int colBlockDim,
-    size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const float *, const int *, const int *,
-      int, int, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseSgebsr2gebsc_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, mb, nb, nnzb, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, rowBlockDim, colBlockDim, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDgebsr2gebsc_bufferSizeExt(
-    cusparseHandle_t handle, int mb, int nb, int nnzb,
-    const double *bsrSortedVal, const int *bsrSortedRowPtr,
-    const int *bsrSortedColInd, int rowBlockDim, int colBlockDim,
-    size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const double *, const int *, const int *,
-      int, int, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseDgebsr2gebsc_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, mb, nb, nnzb, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, rowBlockDim, colBlockDim, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCgebsr2gebsc_bufferSizeExt(
-    cusparseHandle_t handle, int mb, int nb, int nnzb,
-    const cuComplex *bsrSortedVal, const int *bsrSortedRowPtr,
-    const int *bsrSortedColInd, int rowBlockDim, int colBlockDim,
-    size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const cuComplex *, const int *,
-      const int *, int, int, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseCgebsr2gebsc_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, mb, nb, nnzb, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, rowBlockDim, colBlockDim, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZgebsr2gebsc_bufferSizeExt(
-    cusparseHandle_t handle, int mb, int nb, int nnzb,
-    const cuDoubleComplex *bsrSortedVal, const int *bsrSortedRowPtr,
-    const int *bsrSortedColInd, int rowBlockDim, int colBlockDim,
-    size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const cuDoubleComplex *, const int *,
-      const int *, int, int, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseZgebsr2gebsc_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, mb, nb, nnzb, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, rowBlockDim, colBlockDim, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSgebsr2gebsc(
-    cusparseHandle_t handle, int mb, int nb, int nnzb,
-    const float *bsrSortedVal, const int *bsrSortedRowPtr,
-    const int *bsrSortedColInd, int rowBlockDim, int colBlockDim, float *bscVal,
-    int *bscRowInd, int *bscColPtr, cusparseAction_t copyValues,
-    cusparseIndexBase_t baseIdx, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const float *, const int *, const int *,
-      int, int, float *, int *, int *, cusparseAction_t, cusparseIndexBase_t,
-      void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSgebsr2gebsc");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, mb, nb, nnzb, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, rowBlockDim, colBlockDim, bscVal, bscRowInd,
-                  bscColPtr, copyValues, baseIdx, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDgebsr2gebsc(
-    cusparseHandle_t handle, int mb, int nb, int nnzb,
-    const double *bsrSortedVal, const int *bsrSortedRowPtr,
-    const int *bsrSortedColInd, int rowBlockDim, int colBlockDim,
-    double *bscVal, int *bscRowInd, int *bscColPtr, cusparseAction_t copyValues,
-    cusparseIndexBase_t baseIdx, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const double *, const int *, const int *,
-      int, int, double *, int *, int *, cusparseAction_t, cusparseIndexBase_t,
-      void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDgebsr2gebsc");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, mb, nb, nnzb, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, rowBlockDim, colBlockDim, bscVal, bscRowInd,
-                  bscColPtr, copyValues, baseIdx, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCgebsr2gebsc(
-    cusparseHandle_t handle, int mb, int nb, int nnzb,
-    const cuComplex *bsrSortedVal, const int *bsrSortedRowPtr,
-    const int *bsrSortedColInd, int rowBlockDim, int colBlockDim,
-    cuComplex *bscVal, int *bscRowInd, int *bscColPtr,
-    cusparseAction_t copyValues, cusparseIndexBase_t baseIdx, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const cuComplex *, const int *,
-      const int *, int, int, cuComplex *, int *, int *, cusparseAction_t,
-      cusparseIndexBase_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCgebsr2gebsc");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, mb, nb, nnzb, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, rowBlockDim, colBlockDim, bscVal, bscRowInd,
-                  bscColPtr, copyValues, baseIdx, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZgebsr2gebsc(
-    cusparseHandle_t handle, int mb, int nb, int nnzb,
-    const cuDoubleComplex *bsrSortedVal, const int *bsrSortedRowPtr,
-    const int *bsrSortedColInd, int rowBlockDim, int colBlockDim,
-    cuDoubleComplex *bscVal, int *bscRowInd, int *bscColPtr,
-    cusparseAction_t copyValues, cusparseIndexBase_t baseIdx, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const cuDoubleComplex *, const int *,
-      const int *, int, int, cuDoubleComplex *, int *, int *, cusparseAction_t,
-      cusparseIndexBase_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZgebsr2gebsc");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, mb, nb, nnzb, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, rowBlockDim, colBlockDim, bscVal, bscRowInd,
-                  bscColPtr, copyValues, baseIdx, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseXgebsr2csr(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nb,
-    const cusparseMatDescr_t descrA, const int *bsrSortedRowPtrA,
-    const int *bsrSortedColIndA, int rowBlockDim, int colBlockDim,
-    const cusparseMatDescr_t descrC, int *csrSortedRowPtrC,
-    int *csrSortedColIndC) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const int *, const int *, int, int, const cusparseMatDescr_t, int *,
-      int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseXgebsr2csr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nb, descrA, bsrSortedRowPtrA,
-                  bsrSortedColIndA, rowBlockDim, colBlockDim, descrC,
-                  csrSortedRowPtrC, csrSortedColIndC);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSgebsr2csr(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nb,
-    const cusparseMatDescr_t descrA, const float *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int rowBlockDim,
-    int colBlockDim, const cusparseMatDescr_t descrC, float *csrSortedValC,
-    int *csrSortedRowPtrC, int *csrSortedColIndC) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const float *, const int *, const int *, int, int,
-      const cusparseMatDescr_t, float *, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSgebsr2csr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nb, descrA, bsrSortedValA, bsrSortedRowPtrA,
-                  bsrSortedColIndA, rowBlockDim, colBlockDim, descrC,
-                  csrSortedValC, csrSortedRowPtrC, csrSortedColIndC);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDgebsr2csr(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nb,
-    const cusparseMatDescr_t descrA, const double *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int rowBlockDim,
-    int colBlockDim, const cusparseMatDescr_t descrC, double *csrSortedValC,
-    int *csrSortedRowPtrC, int *csrSortedColIndC) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const double *, const int *, const int *, int, int,
-      const cusparseMatDescr_t, double *, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDgebsr2csr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nb, descrA, bsrSortedValA, bsrSortedRowPtrA,
-                  bsrSortedColIndA, rowBlockDim, colBlockDim, descrC,
-                  csrSortedValC, csrSortedRowPtrC, csrSortedColIndC);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCgebsr2csr(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nb,
-    const cusparseMatDescr_t descrA, const cuComplex *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int rowBlockDim,
-    int colBlockDim, const cusparseMatDescr_t descrC, cuComplex *csrSortedValC,
-    int *csrSortedRowPtrC, int *csrSortedColIndC) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const cuComplex *, const int *, const int *, int, int,
-      const cusparseMatDescr_t, cuComplex *, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCgebsr2csr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nb, descrA, bsrSortedValA, bsrSortedRowPtrA,
-                  bsrSortedColIndA, rowBlockDim, colBlockDim, descrC,
-                  csrSortedValC, csrSortedRowPtrC, csrSortedColIndC);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZgebsr2csr(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nb,
-    const cusparseMatDescr_t descrA, const cuDoubleComplex *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int rowBlockDim,
-    int colBlockDim, const cusparseMatDescr_t descrC,
-    cuDoubleComplex *csrSortedValC, int *csrSortedRowPtrC,
-    int *csrSortedColIndC) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const cuDoubleComplex *, const int *, const int *, int, int,
-      const cusparseMatDescr_t, cuDoubleComplex *, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZgebsr2csr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nb, descrA, bsrSortedValA, bsrSortedRowPtrA,
-                  bsrSortedColIndA, rowBlockDim, colBlockDim, descrC,
-                  csrSortedValC, csrSortedRowPtrC, csrSortedColIndC);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsr2gebsr_bufferSize(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int m, int n,
-    const cusparseMatDescr_t descrA, const float *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA, int rowBlockDim,
-    int colBlockDim, int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const float *, const int *, const int *, int, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsr2gebsr_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, m, n, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, rowBlockDim, colBlockDim,
-                  pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsr2gebsr_bufferSize(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int m, int n,
-    const cusparseMatDescr_t descrA, const double *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA, int rowBlockDim,
-    int colBlockDim, int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const double *, const int *, const int *, int, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsr2gebsr_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, m, n, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, rowBlockDim, colBlockDim,
-                  pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsr2gebsr_bufferSize(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int m, int n,
-    const cusparseMatDescr_t descrA, const cuComplex *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA, int rowBlockDim,
-    int colBlockDim, int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const cuComplex *, const int *, const int *, int, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsr2gebsr_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, m, n, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, rowBlockDim, colBlockDim,
-                  pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsr2gebsr_bufferSize(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int m, int n,
-    const cusparseMatDescr_t descrA, const cuDoubleComplex *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA, int rowBlockDim,
-    int colBlockDim, int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const cuDoubleComplex *, const int *, const int *, int, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsr2gebsr_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, m, n, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, rowBlockDim, colBlockDim,
-                  pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsr2gebsr_bufferSizeExt(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int m, int n,
-    const cusparseMatDescr_t descrA, const float *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA, int rowBlockDim,
-    int colBlockDim, size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const float *, const int *, const int *, int, int, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseScsr2gebsr_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, m, n, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, rowBlockDim, colBlockDim, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsr2gebsr_bufferSizeExt(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int m, int n,
-    const cusparseMatDescr_t descrA, const double *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA, int rowBlockDim,
-    int colBlockDim, size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const double *, const int *, const int *, int, int, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseDcsr2gebsr_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, m, n, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, rowBlockDim, colBlockDim, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsr2gebsr_bufferSizeExt(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int m, int n,
-    const cusparseMatDescr_t descrA, const cuComplex *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA, int rowBlockDim,
-    int colBlockDim, size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const cuComplex *, const int *, const int *, int, int, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseCcsr2gebsr_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, m, n, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, rowBlockDim, colBlockDim, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsr2gebsr_bufferSizeExt(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int m, int n,
-    const cusparseMatDescr_t descrA, const cuDoubleComplex *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA, int rowBlockDim,
-    int colBlockDim, size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const cuDoubleComplex *, const int *, const int *, int, int, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseZcsr2gebsr_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, m, n, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, rowBlockDim, colBlockDim, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseXcsr2gebsrNnz(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int m, int n,
-    const cusparseMatDescr_t descrA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, const cusparseMatDescr_t descrC,
-    int *bsrSortedRowPtrC, int rowBlockDim, int colBlockDim,
-    int *nnzTotalDevHostPtr, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const int *, const int *, const cusparseMatDescr_t, int *, int, int,
-      int *, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseXcsr2gebsrNnz");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, m, n, descrA, csrSortedRowPtrA,
-                  csrSortedColIndA, descrC, bsrSortedRowPtrC, rowBlockDim,
-                  colBlockDim, nnzTotalDevHostPtr, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsr2gebsr(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int m, int n,
-    const cusparseMatDescr_t descrA, const float *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-    const cusparseMatDescr_t descrC, float *bsrSortedValC,
-    int *bsrSortedRowPtrC, int *bsrSortedColIndC, int rowBlockDim,
-    int colBlockDim, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const float *, const int *, const int *, const cusparseMatDescr_t,
-      float *, int *, int *, int, int, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsr2gebsr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, m, n, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, descrC, bsrSortedValC, bsrSortedRowPtrC,
-                  bsrSortedColIndC, rowBlockDim, colBlockDim, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsr2gebsr(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int m, int n,
-    const cusparseMatDescr_t descrA, const double *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-    const cusparseMatDescr_t descrC, double *bsrSortedValC,
-    int *bsrSortedRowPtrC, int *bsrSortedColIndC, int rowBlockDim,
-    int colBlockDim, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const double *, const int *, const int *, const cusparseMatDescr_t,
-      double *, int *, int *, int, int, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsr2gebsr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, m, n, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, descrC, bsrSortedValC, bsrSortedRowPtrC,
-                  bsrSortedColIndC, rowBlockDim, colBlockDim, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsr2gebsr(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int m, int n,
-    const cusparseMatDescr_t descrA, const cuComplex *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-    const cusparseMatDescr_t descrC, cuComplex *bsrSortedValC,
-    int *bsrSortedRowPtrC, int *bsrSortedColIndC, int rowBlockDim,
-    int colBlockDim, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const cuComplex *, const int *, const int *, const cusparseMatDescr_t,
-      cuComplex *, int *, int *, int, int, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsr2gebsr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, m, n, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, descrC, bsrSortedValC, bsrSortedRowPtrC,
-                  bsrSortedColIndC, rowBlockDim, colBlockDim, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsr2gebsr(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int m, int n,
-    const cusparseMatDescr_t descrA, const cuDoubleComplex *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-    const cusparseMatDescr_t descrC, cuDoubleComplex *bsrSortedValC,
-    int *bsrSortedRowPtrC, int *bsrSortedColIndC, int rowBlockDim,
-    int colBlockDim, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const cuDoubleComplex *, const int *, const int *,
-      const cusparseMatDescr_t, cuDoubleComplex *, int *, int *, int, int,
-      void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsr2gebsr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, m, n, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, descrC, bsrSortedValC, bsrSortedRowPtrC,
-                  bsrSortedColIndC, rowBlockDim, colBlockDim, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSgebsr2gebsr_bufferSize(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nb, int nnzb,
-    const cusparseMatDescr_t descrA, const float *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int rowBlockDimA,
-    int colBlockDimA, int rowBlockDimC, int colBlockDimC,
-    int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, int,
-      const cusparseMatDescr_t, const float *, const int *, const int *, int,
-      int, int, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSgebsr2gebsr_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nb, nnzb, descrA, bsrSortedValA,
-                  bsrSortedRowPtrA, bsrSortedColIndA, rowBlockDimA,
-                  colBlockDimA, rowBlockDimC, colBlockDimC, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDgebsr2gebsr_bufferSize(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nb, int nnzb,
-    const cusparseMatDescr_t descrA, const double *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int rowBlockDimA,
-    int colBlockDimA, int rowBlockDimC, int colBlockDimC,
-    int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, int,
-      const cusparseMatDescr_t, const double *, const int *, const int *, int,
-      int, int, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDgebsr2gebsr_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nb, nnzb, descrA, bsrSortedValA,
-                  bsrSortedRowPtrA, bsrSortedColIndA, rowBlockDimA,
-                  colBlockDimA, rowBlockDimC, colBlockDimC, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCgebsr2gebsr_bufferSize(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nb, int nnzb,
-    const cusparseMatDescr_t descrA, const cuComplex *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int rowBlockDimA,
-    int colBlockDimA, int rowBlockDimC, int colBlockDimC,
-    int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, int,
-      const cusparseMatDescr_t, const cuComplex *, const int *, const int *,
-      int, int, int, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCgebsr2gebsr_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nb, nnzb, descrA, bsrSortedValA,
-                  bsrSortedRowPtrA, bsrSortedColIndA, rowBlockDimA,
-                  colBlockDimA, rowBlockDimC, colBlockDimC, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZgebsr2gebsr_bufferSize(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nb, int nnzb,
-    const cusparseMatDescr_t descrA, const cuDoubleComplex *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int rowBlockDimA,
-    int colBlockDimA, int rowBlockDimC, int colBlockDimC,
-    int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, int,
-      const cusparseMatDescr_t, const cuDoubleComplex *, const int *,
-      const int *, int, int, int, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZgebsr2gebsr_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nb, nnzb, descrA, bsrSortedValA,
-                  bsrSortedRowPtrA, bsrSortedColIndA, rowBlockDimA,
-                  colBlockDimA, rowBlockDimC, colBlockDimC, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSgebsr2gebsr_bufferSizeExt(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nb, int nnzb,
-    const cusparseMatDescr_t descrA, const float *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int rowBlockDimA,
-    int colBlockDimA, int rowBlockDimC, int colBlockDimC, size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, int,
-      const cusparseMatDescr_t, const float *, const int *, const int *, int,
-      int, int, int, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseSgebsr2gebsr_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nb, nnzb, descrA, bsrSortedValA,
-                  bsrSortedRowPtrA, bsrSortedColIndA, rowBlockDimA,
-                  colBlockDimA, rowBlockDimC, colBlockDimC, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDgebsr2gebsr_bufferSizeExt(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nb, int nnzb,
-    const cusparseMatDescr_t descrA, const double *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int rowBlockDimA,
-    int colBlockDimA, int rowBlockDimC, int colBlockDimC, size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, int,
-      const cusparseMatDescr_t, const double *, const int *, const int *, int,
-      int, int, int, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseDgebsr2gebsr_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nb, nnzb, descrA, bsrSortedValA,
-                  bsrSortedRowPtrA, bsrSortedColIndA, rowBlockDimA,
-                  colBlockDimA, rowBlockDimC, colBlockDimC, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCgebsr2gebsr_bufferSizeExt(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nb, int nnzb,
-    const cusparseMatDescr_t descrA, const cuComplex *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int rowBlockDimA,
-    int colBlockDimA, int rowBlockDimC, int colBlockDimC, size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, int,
-      const cusparseMatDescr_t, const cuComplex *, const int *, const int *,
-      int, int, int, int, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseCgebsr2gebsr_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nb, nnzb, descrA, bsrSortedValA,
-                  bsrSortedRowPtrA, bsrSortedColIndA, rowBlockDimA,
-                  colBlockDimA, rowBlockDimC, colBlockDimC, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZgebsr2gebsr_bufferSizeExt(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nb, int nnzb,
-    const cusparseMatDescr_t descrA, const cuDoubleComplex *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int rowBlockDimA,
-    int colBlockDimA, int rowBlockDimC, int colBlockDimC, size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, int,
-      const cusparseMatDescr_t, const cuDoubleComplex *, const int *,
-      const int *, int, int, int, int, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseZgebsr2gebsr_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nb, nnzb, descrA, bsrSortedValA,
-                  bsrSortedRowPtrA, bsrSortedColIndA, rowBlockDimA,
-                  colBlockDimA, rowBlockDimC, colBlockDimC, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseXgebsr2gebsrNnz(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nb, int nnzb,
-    const cusparseMatDescr_t descrA, const int *bsrSortedRowPtrA,
-    const int *bsrSortedColIndA, int rowBlockDimA, int colBlockDimA,
-    const cusparseMatDescr_t descrC, int *bsrSortedRowPtrC, int rowBlockDimC,
-    int colBlockDimC, int *nnzTotalDevHostPtr, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, int,
-      const cusparseMatDescr_t, const int *, const int *, int, int,
-      const cusparseMatDescr_t, int *, int, int, int *, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseXgebsr2gebsrNnz");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nb, nnzb, descrA, bsrSortedRowPtrA,
-                  bsrSortedColIndA, rowBlockDimA, colBlockDimA, descrC,
-                  bsrSortedRowPtrC, rowBlockDimC, colBlockDimC,
-                  nnzTotalDevHostPtr, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSgebsr2gebsr(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nb, int nnzb,
-    const cusparseMatDescr_t descrA, const float *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int rowBlockDimA,
-    int colBlockDimA, const cusparseMatDescr_t descrC, float *bsrSortedValC,
-    int *bsrSortedRowPtrC, int *bsrSortedColIndC, int rowBlockDimC,
-    int colBlockDimC, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, int,
-      const cusparseMatDescr_t, const float *, const int *, const int *, int,
-      int, const cusparseMatDescr_t, float *, int *, int *, int, int, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSgebsr2gebsr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nb, nnzb, descrA, bsrSortedValA,
-                  bsrSortedRowPtrA, bsrSortedColIndA, rowBlockDimA,
-                  colBlockDimA, descrC, bsrSortedValC, bsrSortedRowPtrC,
-                  bsrSortedColIndC, rowBlockDimC, colBlockDimC, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDgebsr2gebsr(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nb, int nnzb,
-    const cusparseMatDescr_t descrA, const double *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int rowBlockDimA,
-    int colBlockDimA, const cusparseMatDescr_t descrC, double *bsrSortedValC,
-    int *bsrSortedRowPtrC, int *bsrSortedColIndC, int rowBlockDimC,
-    int colBlockDimC, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, int,
-      const cusparseMatDescr_t, const double *, const int *, const int *, int,
-      int, const cusparseMatDescr_t, double *, int *, int *, int, int, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDgebsr2gebsr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nb, nnzb, descrA, bsrSortedValA,
-                  bsrSortedRowPtrA, bsrSortedColIndA, rowBlockDimA,
-                  colBlockDimA, descrC, bsrSortedValC, bsrSortedRowPtrC,
-                  bsrSortedColIndC, rowBlockDimC, colBlockDimC, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCgebsr2gebsr(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nb, int nnzb,
-    const cusparseMatDescr_t descrA, const cuComplex *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int rowBlockDimA,
-    int colBlockDimA, const cusparseMatDescr_t descrC, cuComplex *bsrSortedValC,
-    int *bsrSortedRowPtrC, int *bsrSortedColIndC, int rowBlockDimC,
-    int colBlockDimC, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, int,
-      const cusparseMatDescr_t, const cuComplex *, const int *, const int *,
-      int, int, const cusparseMatDescr_t, cuComplex *, int *, int *, int, int,
-      void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCgebsr2gebsr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nb, nnzb, descrA, bsrSortedValA,
-                  bsrSortedRowPtrA, bsrSortedColIndA, rowBlockDimA,
-                  colBlockDimA, descrC, bsrSortedValC, bsrSortedRowPtrC,
-                  bsrSortedColIndC, rowBlockDimC, colBlockDimC, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZgebsr2gebsr(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nb, int nnzb,
-    const cusparseMatDescr_t descrA, const cuDoubleComplex *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int rowBlockDimA,
-    int colBlockDimA, const cusparseMatDescr_t descrC,
-    cuDoubleComplex *bsrSortedValC, int *bsrSortedRowPtrC,
-    int *bsrSortedColIndC, int rowBlockDimC, int colBlockDimC, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, int,
-      const cusparseMatDescr_t, const cuDoubleComplex *, const int *,
-      const int *, int, int, const cusparseMatDescr_t, cuDoubleComplex *, int *,
-      int *, int, int, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZgebsr2gebsr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nb, nnzb, descrA, bsrSortedValA,
-                  bsrSortedRowPtrA, bsrSortedColIndA, rowBlockDimA,
-                  colBlockDimA, descrC, bsrSortedValC, bsrSortedRowPtrC,
-                  bsrSortedColIndC, rowBlockDimC, colBlockDimC, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseCreateIdentityPermutation(cusparseHandle_t handle, int n, int *p) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(cusparseHandle_t, int, int *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseCreateIdentityPermutation");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, p);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseXcoosort_bufferSizeExt(
-    cusparseHandle_t handle, int m, int n, int nnz, const int *cooRowsA,
-    const int *cooColsA, size_t *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const int *, const int *, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseXcoosort_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnz, cooRowsA, cooColsA, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseXcoosortByRow(cusparseHandle_t handle,
-                                                   int m, int n, int nnz,
-                                                   int *cooRowsA, int *cooColsA,
-                                                   int *P, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, int *, int *, int *, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseXcoosortByRow");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnz, cooRowsA, cooColsA, P, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseXcoosortByColumn(cusparseHandle_t handle,
-                                                      int m, int n, int nnz,
-                                                      int *cooRowsA,
-                                                      int *cooColsA, int *P,
-                                                      void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, int *, int *, int *, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseXcoosortByColumn");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnz, cooRowsA, cooColsA, P, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseXcsrsort_bufferSizeExt(
-    cusparseHandle_t handle, int m, int n, int nnz, const int *csrRowPtrA,
-    const int *csrColIndA, size_t *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const int *, const int *, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseXcsrsort_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnz, csrRowPtrA, csrColIndA,
-                  pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseXcsrsort(cusparseHandle_t handle, int m,
-                                              int n, int nnz,
-                                              const cusparseMatDescr_t descrA,
-                                              const int *csrRowPtrA,
-                                              int *csrColIndA, int *P,
-                                              void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const cusparseMatDescr_t, const int *,
-      int *, int *, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseXcsrsort");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnz, descrA, csrRowPtrA, csrColIndA, P,
-                  pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseXcscsort_bufferSizeExt(
-    cusparseHandle_t handle, int m, int n, int nnz, const int *cscColPtrA,
-    const int *cscRowIndA, size_t *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const int *, const int *, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseXcscsort_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnz, cscColPtrA, cscRowIndA,
-                  pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseXcscsort(cusparseHandle_t handle, int m,
-                                              int n, int nnz,
-                                              const cusparseMatDescr_t descrA,
-                                              const int *cscColPtrA,
-                                              int *cscRowIndA, int *P,
-                                              void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const cusparseMatDescr_t, const int *,
-      int *, int *, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseXcscsort");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnz, descrA, cscColPtrA, cscRowIndA, P,
-                  pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsru2csr_bufferSizeExt(
-    cusparseHandle_t handle, int m, int n, int nnz, float *csrVal,
-    const int *csrRowPtr, int *csrColInd, csru2csrInfo_t info,
-    size_t *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, float *, const int *, int *,
-      csru2csrInfo_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsru2csr_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnz, csrVal, csrRowPtr, csrColInd, info,
-                  pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsru2csr_bufferSizeExt(
-    cusparseHandle_t handle, int m, int n, int nnz, double *csrVal,
-    const int *csrRowPtr, int *csrColInd, csru2csrInfo_t info,
-    size_t *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, double *, const int *, int *,
-      csru2csrInfo_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsru2csr_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnz, csrVal, csrRowPtr, csrColInd, info,
-                  pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsru2csr_bufferSizeExt(
-    cusparseHandle_t handle, int m, int n, int nnz, cuComplex *csrVal,
-    const int *csrRowPtr, int *csrColInd, csru2csrInfo_t info,
-    size_t *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, cuComplex *, const int *, int *,
-      csru2csrInfo_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsru2csr_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnz, csrVal, csrRowPtr, csrColInd, info,
-                  pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsru2csr_bufferSizeExt(
-    cusparseHandle_t handle, int m, int n, int nnz, cuDoubleComplex *csrVal,
-    const int *csrRowPtr, int *csrColInd, csru2csrInfo_t info,
-    size_t *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, cuDoubleComplex *, const int *, int *,
-      csru2csrInfo_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsru2csr_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnz, csrVal, csrRowPtr, csrColInd, info,
-                  pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsru2csr(
-    cusparseHandle_t handle, int m, int n, int nnz,
-    const cusparseMatDescr_t descrA, float *csrVal, const int *csrRowPtr,
-    int *csrColInd, csru2csrInfo_t info, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const cusparseMatDescr_t, float *,
-      const int *, int *, csru2csrInfo_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsru2csr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnz, descrA, csrVal, csrRowPtr, csrColInd, info,
-                  pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsru2csr(
-    cusparseHandle_t handle, int m, int n, int nnz,
-    const cusparseMatDescr_t descrA, double *csrVal, const int *csrRowPtr,
-    int *csrColInd, csru2csrInfo_t info, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const cusparseMatDescr_t, double *,
-      const int *, int *, csru2csrInfo_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsru2csr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnz, descrA, csrVal, csrRowPtr, csrColInd, info,
-                  pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsru2csr(
-    cusparseHandle_t handle, int m, int n, int nnz,
-    const cusparseMatDescr_t descrA, cuComplex *csrVal, const int *csrRowPtr,
-    int *csrColInd, csru2csrInfo_t info, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const cusparseMatDescr_t, cuComplex *,
-      const int *, int *, csru2csrInfo_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsru2csr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnz, descrA, csrVal, csrRowPtr, csrColInd, info,
-                  pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsru2csr(
-    cusparseHandle_t handle, int m, int n, int nnz,
-    const cusparseMatDescr_t descrA, cuDoubleComplex *csrVal,
-    const int *csrRowPtr, int *csrColInd, csru2csrInfo_t info, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const cusparseMatDescr_t,
-      cuDoubleComplex *, const int *, int *, csru2csrInfo_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsru2csr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnz, descrA, csrVal, csrRowPtr, csrColInd, info,
-                  pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsr2csru(
-    cusparseHandle_t handle, int m, int n, int nnz,
-    const cusparseMatDescr_t descrA, float *csrVal, const int *csrRowPtr,
-    int *csrColInd, csru2csrInfo_t info, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const cusparseMatDescr_t, float *,
-      const int *, int *, csru2csrInfo_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsr2csru");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnz, descrA, csrVal, csrRowPtr, csrColInd, info,
-                  pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsr2csru(
-    cusparseHandle_t handle, int m, int n, int nnz,
-    const cusparseMatDescr_t descrA, double *csrVal, const int *csrRowPtr,
-    int *csrColInd, csru2csrInfo_t info, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const cusparseMatDescr_t, double *,
-      const int *, int *, csru2csrInfo_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsr2csru");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnz, descrA, csrVal, csrRowPtr, csrColInd, info,
-                  pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsr2csru(
-    cusparseHandle_t handle, int m, int n, int nnz,
-    const cusparseMatDescr_t descrA, cuComplex *csrVal, const int *csrRowPtr,
-    int *csrColInd, csru2csrInfo_t info, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const cusparseMatDescr_t, cuComplex *,
-      const int *, int *, csru2csrInfo_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsr2csru");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnz, descrA, csrVal, csrRowPtr, csrColInd, info,
-                  pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsr2csru(
-    cusparseHandle_t handle, int m, int n, int nnz,
-    const cusparseMatDescr_t descrA, cuDoubleComplex *csrVal,
-    const int *csrRowPtr, int *csrColInd, csru2csrInfo_t info, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const cusparseMatDescr_t,
-      cuDoubleComplex *, const int *, int *, csru2csrInfo_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsr2csru");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnz, descrA, csrVal, csrRowPtr, csrColInd, info,
-                  pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSpruneDense2csr_bufferSizeExt(
-    cusparseHandle_t handle, int m, int n, const float *A, int lda,
-    const float *threshold, const cusparseMatDescr_t descrC,
-    const float *csrSortedValC, const int *csrSortedRowPtrC,
-    const int *csrSortedColIndC, size_t *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const float *, int, const float *,
-      const cusparseMatDescr_t, const float *, const int *, const int *,
-      size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseSpruneDense2csr_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, A, lda, threshold, descrC, csrSortedValC,
-                  csrSortedRowPtrC, csrSortedColIndC, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDpruneDense2csr_bufferSizeExt(
-    cusparseHandle_t handle, int m, int n, const double *A, int lda,
-    const double *threshold, const cusparseMatDescr_t descrC,
-    const double *csrSortedValC, const int *csrSortedRowPtrC,
-    const int *csrSortedColIndC, size_t *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const double *, int, const double *,
-      const cusparseMatDescr_t, const double *, const int *, const int *,
-      size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseDpruneDense2csr_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, A, lda, threshold, descrC, csrSortedValC,
-                  csrSortedRowPtrC, csrSortedColIndC, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSpruneDense2csrNnz(
-    cusparseHandle_t handle, int m, int n, const float *A, int lda,
-    const float *threshold, const cusparseMatDescr_t descrC, int *csrRowPtrC,
-    int *nnzTotalDevHostPtr, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const float *, int, const float *,
-      const cusparseMatDescr_t, int *, int *, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSpruneDense2csrNnz");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, A, lda, threshold, descrC, csrRowPtrC,
-                  nnzTotalDevHostPtr, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDpruneDense2csrNnz(
-    cusparseHandle_t handle, int m, int n, const double *A, int lda,
-    const double *threshold, const cusparseMatDescr_t descrC,
-    int *csrSortedRowPtrC, int *nnzTotalDevHostPtr, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const double *, int, const double *,
-      const cusparseMatDescr_t, int *, int *, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDpruneDense2csrNnz");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, A, lda, threshold, descrC, csrSortedRowPtrC,
-                  nnzTotalDevHostPtr, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSpruneDense2csr(
-    cusparseHandle_t handle, int m, int n, const float *A, int lda,
-    const float *threshold, const cusparseMatDescr_t descrC,
-    float *csrSortedValC, const int *csrSortedRowPtrC, int *csrSortedColIndC,
-    void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const float *, int, const float *,
-      const cusparseMatDescr_t, float *, const int *, int *, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSpruneDense2csr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, A, lda, threshold, descrC, csrSortedValC,
-                  csrSortedRowPtrC, csrSortedColIndC, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDpruneDense2csr(
-    cusparseHandle_t handle, int m, int n, const double *A, int lda,
-    const double *threshold, const cusparseMatDescr_t descrC,
-    double *csrSortedValC, const int *csrSortedRowPtrC, int *csrSortedColIndC,
-    void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const double *, int, const double *,
-      const cusparseMatDescr_t, double *, const int *, int *, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDpruneDense2csr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, A, lda, threshold, descrC, csrSortedValC,
-                  csrSortedRowPtrC, csrSortedColIndC, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSpruneCsr2csr_bufferSizeExt(
-    cusparseHandle_t handle, int m, int n, int nnzA,
-    const cusparseMatDescr_t descrA, const float *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-    const float *threshold, const cusparseMatDescr_t descrC,
-    const float *csrSortedValC, const int *csrSortedRowPtrC,
-    const int *csrSortedColIndC, size_t *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const cusparseMatDescr_t, const float *,
-      const int *, const int *, const float *, const cusparseMatDescr_t,
-      const float *, const int *, const int *, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseSpruneCsr2csr_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnzA, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, threshold, descrC, csrSortedValC,
-                  csrSortedRowPtrC, csrSortedColIndC, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDpruneCsr2csr_bufferSizeExt(
-    cusparseHandle_t handle, int m, int n, int nnzA,
-    const cusparseMatDescr_t descrA, const double *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-    const double *threshold, const cusparseMatDescr_t descrC,
-    const double *csrSortedValC, const int *csrSortedRowPtrC,
-    const int *csrSortedColIndC, size_t *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const cusparseMatDescr_t, const double *,
-      const int *, const int *, const double *, const cusparseMatDescr_t,
-      const double *, const int *, const int *, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseDpruneCsr2csr_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnzA, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, threshold, descrC, csrSortedValC,
-                  csrSortedRowPtrC, csrSortedColIndC, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSpruneCsr2csrNnz(
-    cusparseHandle_t handle, int m, int n, int nnzA,
-    const cusparseMatDescr_t descrA, const float *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-    const float *threshold, const cusparseMatDescr_t descrC,
-    int *csrSortedRowPtrC,
-    int *nnzTotalDevHostPtr, /* can be on host or device */
-    void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const cusparseMatDescr_t, const float *,
-      const int *, const int *, const float *, const cusparseMatDescr_t, int *,
-      int *, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSpruneCsr2csrNnz");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnzA, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, threshold, descrC, csrSortedRowPtrC,
-                  nnzTotalDevHostPtr, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDpruneCsr2csrNnz(
-    cusparseHandle_t handle, int m, int n, int nnzA,
-    const cusparseMatDescr_t descrA, const double *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-    const double *threshold, const cusparseMatDescr_t descrC,
-    int *csrSortedRowPtrC,
-    int *nnzTotalDevHostPtr, /* can be on host or device */
-    void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const cusparseMatDescr_t, const double *,
-      const int *, const int *, const double *, const cusparseMatDescr_t, int *,
-      int *, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDpruneCsr2csrNnz");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnzA, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, threshold, descrC, csrSortedRowPtrC,
-                  nnzTotalDevHostPtr, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSpruneCsr2csr(
-    cusparseHandle_t handle, int m, int n, int nnzA,
-    const cusparseMatDescr_t descrA, const float *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-    const float *threshold, const cusparseMatDescr_t descrC,
-    float *csrSortedValC, const int *csrSortedRowPtrC, int *csrSortedColIndC,
-    void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const cusparseMatDescr_t, const float *,
-      const int *, const int *, const float *, const cusparseMatDescr_t,
-      float *, const int *, int *, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSpruneCsr2csr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnzA, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, threshold, descrC, csrSortedValC,
-                  csrSortedRowPtrC, csrSortedColIndC, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDpruneCsr2csr(
-    cusparseHandle_t handle, int m, int n, int nnzA,
-    const cusparseMatDescr_t descrA, const double *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-    const double *threshold, const cusparseMatDescr_t descrC,
-    double *csrSortedValC, const int *csrSortedRowPtrC, int *csrSortedColIndC,
-    void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const cusparseMatDescr_t, const double *,
-      const int *, const int *, const double *, const cusparseMatDescr_t,
-      double *, const int *, int *, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDpruneCsr2csr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnzA, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, threshold, descrC, csrSortedValC,
-                  csrSortedRowPtrC, csrSortedColIndC, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSpruneDense2csrByPercentage_bufferSizeExt(
-    cusparseHandle_t handle, int m, int n, const float *A, int lda,
-    float percentage, /* between 0 to 100 */
-    const cusparseMatDescr_t descrC, const float *csrSortedValC,
-    const int *csrSortedRowPtrC, const int *csrSortedColIndC, pruneInfo_t info,
-    size_t *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const float *, int, float,
-      const cusparseMatDescr_t, const float *, const int *, const int *,
-      pruneInfo_t, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseSpruneDense2csrByPercentage_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, A, lda, percentage, descrC, csrSortedValC,
-                  csrSortedRowPtrC, csrSortedColIndC, info, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDpruneDense2csrByPercentage_bufferSizeExt(
-    cusparseHandle_t handle, int m, int n, const double *A, int lda,
-    float percentage, /* between 0 to 100 */
-    const cusparseMatDescr_t descrC, const double *csrSortedValC,
-    const int *csrSortedRowPtrC, const int *csrSortedColIndC, pruneInfo_t info,
-    size_t *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const double *, int, float,
-      const cusparseMatDescr_t, const double *, const int *, const int *,
-      pruneInfo_t, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseDpruneDense2csrByPercentage_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, A, lda, percentage, descrC, csrSortedValC,
-                  csrSortedRowPtrC, csrSortedColIndC, info, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSpruneDense2csrNnzByPercentage(
-    cusparseHandle_t handle, int m, int n, const float *A, int lda,
-    float percentage, /* between 0 to 100 */
-    const cusparseMatDescr_t descrC, int *csrRowPtrC,
-    int *nnzTotalDevHostPtr, /* can be on host or device */
-    pruneInfo_t info, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const float *, int, float,
-      const cusparseMatDescr_t, int *, int *, pruneInfo_t, void *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseSpruneDense2csrNnzByPercentage");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, A, lda, percentage, descrC, csrRowPtrC,
-                  nnzTotalDevHostPtr, info, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDpruneDense2csrNnzByPercentage(
-    cusparseHandle_t handle, int m, int n, const double *A, int lda,
-    float percentage, /* between 0 to 100 */
-    const cusparseMatDescr_t descrC, int *csrRowPtrC,
-    int *nnzTotalDevHostPtr, /* can be on host or device */
-    pruneInfo_t info, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const double *, int, float,
-      const cusparseMatDescr_t, int *, int *, pruneInfo_t, void *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseDpruneDense2csrNnzByPercentage");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, A, lda, percentage, descrC, csrRowPtrC,
-                  nnzTotalDevHostPtr, info, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSpruneDense2csrByPercentage(
-    cusparseHandle_t handle, int m, int n, const float *A, int lda,
-    float percentage, /* between 0 to 100 */
-    const cusparseMatDescr_t descrC, float *csrSortedValC,
-    const int *csrSortedRowPtrC, int *csrSortedColIndC, pruneInfo_t info,
-    void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const float *, int, float,
-      const cusparseMatDescr_t, float *, const int *, int *, pruneInfo_t,
-      void *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseSpruneDense2csrByPercentage");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, A, lda, percentage, descrC, csrSortedValC,
-                  csrSortedRowPtrC, csrSortedColIndC, info, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDpruneDense2csrByPercentage(
-    cusparseHandle_t handle, int m, int n, const double *A, int lda,
-    float percentage, /* between 0 to 100 */
-    const cusparseMatDescr_t descrC, double *csrSortedValC,
-    const int *csrSortedRowPtrC, int *csrSortedColIndC, pruneInfo_t info,
-    void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const double *, int, float,
-      const cusparseMatDescr_t, double *, const int *, int *, pruneInfo_t,
-      void *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseDpruneDense2csrByPercentage");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, A, lda, percentage, descrC, csrSortedValC,
-                  csrSortedRowPtrC, csrSortedColIndC, info, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSpruneCsr2csrByPercentage_bufferSizeExt(
-    cusparseHandle_t handle, int m, int n, int nnzA,
-    const cusparseMatDescr_t descrA, const float *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-    float percentage, /* between 0 to 100 */
-    const cusparseMatDescr_t descrC, const float *csrSortedValC,
-    const int *csrSortedRowPtrC, const int *csrSortedColIndC, pruneInfo_t info,
-    size_t *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const cusparseMatDescr_t, const float *,
-      const int *, const int *, float, const cusparseMatDescr_t, const float *,
-      const int *, const int *, pruneInfo_t, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseSpruneCsr2csrByPercentage_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnzA, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, percentage, descrC, csrSortedValC,
-                  csrSortedRowPtrC, csrSortedColIndC, info, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDpruneCsr2csrByPercentage_bufferSizeExt(
-    cusparseHandle_t handle, int m, int n, int nnzA,
-    const cusparseMatDescr_t descrA, const double *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-    float percentage, /* between 0 to 100 */
-    const cusparseMatDescr_t descrC, const double *csrSortedValC,
-    const int *csrSortedRowPtrC, const int *csrSortedColIndC, pruneInfo_t info,
-    size_t *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const cusparseMatDescr_t, const double *,
-      const int *, const int *, float, const cusparseMatDescr_t, const double *,
-      const int *, const int *, pruneInfo_t, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseDpruneCsr2csrByPercentage_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnzA, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, percentage, descrC, csrSortedValC,
-                  csrSortedRowPtrC, csrSortedColIndC, info, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSpruneCsr2csrNnzByPercentage(
-    cusparseHandle_t handle, int m, int n, int nnzA,
-    const cusparseMatDescr_t descrA, const float *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-    float percentage, /* between 0 to 100 */
-    const cusparseMatDescr_t descrC, int *csrSortedRowPtrC,
-    int *nnzTotalDevHostPtr, /* can be on host or device */
-    pruneInfo_t info, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const cusparseMatDescr_t, const float *,
-      const int *, const int *, float, const cusparseMatDescr_t, int *, int *,
-      pruneInfo_t, void *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseSpruneCsr2csrNnzByPercentage");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnzA, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, percentage, descrC, csrSortedRowPtrC,
-                  nnzTotalDevHostPtr, info, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDpruneCsr2csrNnzByPercentage(
-    cusparseHandle_t handle, int m, int n, int nnzA,
-    const cusparseMatDescr_t descrA, const double *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-    float percentage, /* between 0 to 100 */
-    const cusparseMatDescr_t descrC, int *csrSortedRowPtrC,
-    int *nnzTotalDevHostPtr, /* can be on host or device */
-    pruneInfo_t info, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const cusparseMatDescr_t, const double *,
-      const int *, const int *, float, const cusparseMatDescr_t, int *, int *,
-      pruneInfo_t, void *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseDpruneCsr2csrNnzByPercentage");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnzA, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, percentage, descrC, csrSortedRowPtrC,
-                  nnzTotalDevHostPtr, info, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSpruneCsr2csrByPercentage(
-    cusparseHandle_t handle, int m, int n, int nnzA,
-    const cusparseMatDescr_t descrA, const float *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-    float percentage, /* between 0 to 100 */
-    const cusparseMatDescr_t descrC, float *csrSortedValC,
-    const int *csrSortedRowPtrC, int *csrSortedColIndC, pruneInfo_t info,
-    void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const cusparseMatDescr_t, const float *,
-      const int *, const int *, float, const cusparseMatDescr_t, float *,
-      const int *, int *, pruneInfo_t, void *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseSpruneCsr2csrByPercentage");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnzA, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, percentage, descrC, csrSortedValC,
-                  csrSortedRowPtrC, csrSortedColIndC, info, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDpruneCsr2csrByPercentage(
-    cusparseHandle_t handle, int m, int n, int nnzA,
-    const cusparseMatDescr_t descrA, const double *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-    float percentage, /* between 0 to 100 */
-    const cusparseMatDescr_t descrC, double *csrSortedValC,
-    const int *csrSortedRowPtrC, int *csrSortedColIndC, pruneInfo_t info,
-    void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const cusparseMatDescr_t, const double *,
-      const int *, const int *, float, const cusparseMatDescr_t, double *,
-      const int *, int *, pruneInfo_t, void *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseDpruneCsr2csrByPercentage");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnzA, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, percentage, descrC, csrSortedValC,
-                  csrSortedRowPtrC, csrSortedColIndC, info, pBuffer);
-}
-
-}  // extern "C"
diff --git a/third_party/xla/third_party/tsl/tsl/cuda/cusparse_10_1.inc b/third_party/xla/third_party/tsl/tsl/cuda/cusparse_10_1.inc
deleted file mode 100644
index 03d6d0c20d5223..00000000000000
--- a/third_party/xla/third_party/tsl/tsl/cuda/cusparse_10_1.inc
+++ /dev/null
@@ -1,8262 +0,0 @@
-// Auto-generated, do not edit.
-
-extern "C" {
-
-cusparseStatus_t CUSPARSEAPI cusparseCreate(cusparseHandle_t *handle) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(cusparseHandle_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCreate");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDestroy(cusparseHandle_t handle) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(cusparseHandle_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDestroy");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseGetVersion(cusparseHandle_t handle,
-                                                int *version) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(cusparseHandle_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseGetVersion");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, version);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseGetProperty(libraryPropertyType type,
-                                                 int *value) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(libraryPropertyType, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseGetProperty");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(type, value);
-}
-
-const char *CUSPARSEAPI cusparseGetErrorName(cusparseStatus_t status) {
-  using FuncPtr = const char *(CUSPARSEAPI *)(cusparseStatus_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseGetErrorName");
-  if (!func_ptr) return "cusparseGetErrorName symbol not found.";
-  return func_ptr(status);
-}
-
-const char *CUSPARSEAPI cusparseGetErrorString(cusparseStatus_t status) {
-  using FuncPtr = const char *(CUSPARSEAPI *)(cusparseStatus_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseGetErrorString");
-  if (!func_ptr) return "cusparseGetErrorString symbol not found.";
-  return func_ptr(status);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSetStream(cusparseHandle_t handle,
-                                               cudaStream_t streamId) {
-  using FuncPtr =
-      cusparseStatus_t(CUSPARSEAPI *)(cusparseHandle_t, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSetStream");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, streamId);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseGetStream(cusparseHandle_t handle,
-                                               cudaStream_t *streamId) {
-  using FuncPtr =
-      cusparseStatus_t(CUSPARSEAPI *)(cusparseHandle_t, cudaStream_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseGetStream");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, streamId);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseGetPointerMode(cusparseHandle_t handle, cusparsePointerMode_t *mode) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(cusparseHandle_t,
-                                                  cusparsePointerMode_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseGetPointerMode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, mode);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseSetPointerMode(cusparseHandle_t handle, cusparsePointerMode_t mode) {
-  using FuncPtr =
-      cusparseStatus_t(CUSPARSEAPI *)(cusparseHandle_t, cusparsePointerMode_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSetPointerMode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, mode);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseCreateMatDescr(cusparseMatDescr_t *descrA) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(cusparseMatDescr_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCreateMatDescr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(descrA);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseDestroyMatDescr(cusparseMatDescr_t descrA) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(cusparseMatDescr_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDestroyMatDescr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(descrA);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseCopyMatDescr(cusparseMatDescr_t dest, const cusparseMatDescr_t src) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(cusparseMatDescr_t,
-                                                  const cusparseMatDescr_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCopyMatDescr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dest, src);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSetMatType(cusparseMatDescr_t descrA,
-                                                cusparseMatrixType_t type) {
-  using FuncPtr =
-      cusparseStatus_t(CUSPARSEAPI *)(cusparseMatDescr_t, cusparseMatrixType_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSetMatType");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(descrA, type);
-}
-
-cusparseMatrixType_t CUSPARSEAPI
-cusparseGetMatType(const cusparseMatDescr_t descrA) {
-  using FuncPtr = cusparseMatrixType_t(CUSPARSEAPI *)(const cusparseMatDescr_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseGetMatType");
-  if (!func_ptr) return cusparseMatrixType_t(-1);
-  return func_ptr(descrA);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseSetMatFillMode(cusparseMatDescr_t descrA, cusparseFillMode_t fillMode) {
-  using FuncPtr =
-      cusparseStatus_t(CUSPARSEAPI *)(cusparseMatDescr_t, cusparseFillMode_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSetMatFillMode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(descrA, fillMode);
-}
-
-cusparseFillMode_t CUSPARSEAPI
-cusparseGetMatFillMode(const cusparseMatDescr_t descrA) {
-  using FuncPtr = cusparseFillMode_t(CUSPARSEAPI *)(const cusparseMatDescr_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseGetMatFillMode");
-  if (!func_ptr) return cusparseFillMode_t(-1);
-  return func_ptr(descrA);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseSetMatDiagType(cusparseMatDescr_t descrA, cusparseDiagType_t diagType) {
-  using FuncPtr =
-      cusparseStatus_t(CUSPARSEAPI *)(cusparseMatDescr_t, cusparseDiagType_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSetMatDiagType");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(descrA, diagType);
-}
-
-cusparseDiagType_t CUSPARSEAPI
-cusparseGetMatDiagType(const cusparseMatDescr_t descrA) {
-  using FuncPtr = cusparseDiagType_t(CUSPARSEAPI *)(const cusparseMatDescr_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseGetMatDiagType");
-  if (!func_ptr) return cusparseDiagType_t(-1);
-  return func_ptr(descrA);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSetMatIndexBase(cusparseMatDescr_t descrA,
-                                                     cusparseIndexBase_t base) {
-  using FuncPtr =
-      cusparseStatus_t(CUSPARSEAPI *)(cusparseMatDescr_t, cusparseIndexBase_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSetMatIndexBase");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(descrA, base);
-}
-
-cusparseIndexBase_t CUSPARSEAPI
-cusparseGetMatIndexBase(const cusparseMatDescr_t descrA) {
-  using FuncPtr = cusparseIndexBase_t(CUSPARSEAPI *)(const cusparseMatDescr_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseGetMatIndexBase");
-  if (!func_ptr) return cusparseIndexBase_t(-1);
-  return func_ptr(descrA);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseCreateSolveAnalysisInfo(cusparseSolveAnalysisInfo_t *info) {
-  using FuncPtr =
-      cusparseStatus_t(CUSPARSEAPI *)(cusparseSolveAnalysisInfo_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCreateSolveAnalysisInfo");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(info);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseDestroySolveAnalysisInfo(cusparseSolveAnalysisInfo_t info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(cusparseSolveAnalysisInfo_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseDestroySolveAnalysisInfo");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(info);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseGetLevelInfo(cusparseHandle_t handle, cusparseSolveAnalysisInfo_t info,
-                     int *nlevels, int **levelPtr, int **levelInd) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseSolveAnalysisInfo_t, int *, int **, int **);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseGetLevelInfo");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, info, nlevels, levelPtr, levelInd);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCreateCsrsv2Info(csrsv2Info_t *info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(csrsv2Info_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCreateCsrsv2Info");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(info);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDestroyCsrsv2Info(csrsv2Info_t info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(csrsv2Info_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDestroyCsrsv2Info");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(info);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCreateCsric02Info(csric02Info_t *info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(csric02Info_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCreateCsric02Info");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(info);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDestroyCsric02Info(csric02Info_t info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(csric02Info_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDestroyCsric02Info");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(info);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCreateBsric02Info(bsric02Info_t *info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(bsric02Info_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCreateBsric02Info");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(info);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDestroyBsric02Info(bsric02Info_t info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(bsric02Info_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDestroyBsric02Info");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(info);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCreateCsrilu02Info(csrilu02Info_t *info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(csrilu02Info_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCreateCsrilu02Info");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(info);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDestroyCsrilu02Info(csrilu02Info_t info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(csrilu02Info_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDestroyCsrilu02Info");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(info);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCreateBsrilu02Info(bsrilu02Info_t *info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(bsrilu02Info_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCreateBsrilu02Info");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(info);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDestroyBsrilu02Info(bsrilu02Info_t info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(bsrilu02Info_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDestroyBsrilu02Info");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(info);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCreateBsrsv2Info(bsrsv2Info_t *info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(bsrsv2Info_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCreateBsrsv2Info");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(info);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDestroyBsrsv2Info(bsrsv2Info_t info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(bsrsv2Info_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDestroyBsrsv2Info");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(info);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCreateBsrsm2Info(bsrsm2Info_t *info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(bsrsm2Info_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCreateBsrsm2Info");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(info);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDestroyBsrsm2Info(bsrsm2Info_t info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(bsrsm2Info_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDestroyBsrsm2Info");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(info);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCreateHybMat(cusparseHybMat_t *hybA) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(cusparseHybMat_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCreateHybMat");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hybA);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDestroyHybMat(cusparseHybMat_t hybA) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(cusparseHybMat_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDestroyHybMat");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hybA);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCreateCsru2csrInfo(csru2csrInfo_t *info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(csru2csrInfo_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCreateCsru2csrInfo");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(info);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDestroyCsru2csrInfo(csru2csrInfo_t info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(csru2csrInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDestroyCsru2csrInfo");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(info);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseCreateColorInfo(cusparseColorInfo_t *info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(cusparseColorInfo_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCreateColorInfo");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(info);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseDestroyColorInfo(cusparseColorInfo_t info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(cusparseColorInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDestroyColorInfo");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(info);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSetColorAlgs(cusparseColorInfo_t info,
-                                                  cusparseColorAlg_t alg) {
-  using FuncPtr =
-      cusparseStatus_t(CUSPARSEAPI *)(cusparseColorInfo_t, cusparseColorAlg_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSetColorAlgs");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(info, alg);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseGetColorAlgs(cusparseColorInfo_t info,
-                                                  cusparseColorAlg_t *alg) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(cusparseColorInfo_t,
-                                                  cusparseColorAlg_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseGetColorAlgs");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(info, alg);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCreatePruneInfo(pruneInfo_t *info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(pruneInfo_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCreatePruneInfo");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(info);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDestroyPruneInfo(pruneInfo_t info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(pruneInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDestroyPruneInfo");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(info);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSaxpyi(cusparseHandle_t handle, int nnz,
-                                            const float *alpha,
-                                            const float *xVal, const int *xInd,
-                                            float *y,
-                                            cusparseIndexBase_t idxBase) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, const float *, const float *, const int *, float *,
-      cusparseIndexBase_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSaxpyi");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, nnz, alpha, xVal, xInd, y, idxBase);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDaxpyi(cusparseHandle_t handle, int nnz,
-                                            const double *alpha,
-                                            const double *xVal, const int *xInd,
-                                            double *y,
-                                            cusparseIndexBase_t idxBase) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, const double *, const double *, const int *,
-      double *, cusparseIndexBase_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDaxpyi");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, nnz, alpha, xVal, xInd, y, idxBase);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCaxpyi(cusparseHandle_t handle, int nnz,
-                                            const cuComplex *alpha,
-                                            const cuComplex *xVal,
-                                            const int *xInd, cuComplex *y,
-                                            cusparseIndexBase_t idxBase) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, const cuComplex *, const cuComplex *, const int *,
-      cuComplex *, cusparseIndexBase_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCaxpyi");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, nnz, alpha, xVal, xInd, y, idxBase);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZaxpyi(cusparseHandle_t handle, int nnz,
-                                            const cuDoubleComplex *alpha,
-                                            const cuDoubleComplex *xVal,
-                                            const int *xInd, cuDoubleComplex *y,
-                                            cusparseIndexBase_t idxBase) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, const cuDoubleComplex *, const cuDoubleComplex *,
-      const int *, cuDoubleComplex *, cusparseIndexBase_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZaxpyi");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, nnz, alpha, xVal, xInd, y, idxBase);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSdoti(cusparseHandle_t handle, int nnz,
-                                           const float *xVal, const int *xInd,
-                                           const float *y,
-                                           float *resultDevHostPtr,
-                                           cusparseIndexBase_t idxBase) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, const float *, const int *, const float *, float *,
-      cusparseIndexBase_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSdoti");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, nnz, xVal, xInd, y, resultDevHostPtr, idxBase);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDdoti(cusparseHandle_t handle, int nnz,
-                                           const double *xVal, const int *xInd,
-                                           const double *y,
-                                           double *resultDevHostPtr,
-                                           cusparseIndexBase_t idxBase) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, const double *, const int *, const double *,
-      double *, cusparseIndexBase_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDdoti");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, nnz, xVal, xInd, y, resultDevHostPtr, idxBase);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCdoti(cusparseHandle_t handle, int nnz,
-                                           const cuComplex *xVal,
-                                           const int *xInd, const cuComplex *y,
-                                           cuComplex *resultDevHostPtr,
-                                           cusparseIndexBase_t idxBase) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, const cuComplex *, const int *, const cuComplex *,
-      cuComplex *, cusparseIndexBase_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCdoti");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, nnz, xVal, xInd, y, resultDevHostPtr, idxBase);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZdoti(cusparseHandle_t handle, int nnz,
-                                           const cuDoubleComplex *xVal,
-                                           const int *xInd,
-                                           const cuDoubleComplex *y,
-                                           cuDoubleComplex *resultDevHostPtr,
-                                           cusparseIndexBase_t idxBase) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, const cuDoubleComplex *, const int *,
-      const cuDoubleComplex *, cuDoubleComplex *, cusparseIndexBase_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZdoti");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, nnz, xVal, xInd, y, resultDevHostPtr, idxBase);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCdotci(cusparseHandle_t handle, int nnz,
-                                            const cuComplex *xVal,
-                                            const int *xInd, const cuComplex *y,
-                                            cuComplex *resultDevHostPtr,
-                                            cusparseIndexBase_t idxBase) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, const cuComplex *, const int *, const cuComplex *,
-      cuComplex *, cusparseIndexBase_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCdotci");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, nnz, xVal, xInd, y, resultDevHostPtr, idxBase);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZdotci(cusparseHandle_t handle, int nnz,
-                                            const cuDoubleComplex *xVal,
-                                            const int *xInd,
-                                            const cuDoubleComplex *y,
-                                            cuDoubleComplex *resultDevHostPtr,
-                                            cusparseIndexBase_t idxBase) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, const cuDoubleComplex *, const int *,
-      const cuDoubleComplex *, cuDoubleComplex *, cusparseIndexBase_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZdotci");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, nnz, xVal, xInd, y, resultDevHostPtr, idxBase);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSgthr(cusparseHandle_t handle, int nnz,
-                                           const float *y, float *xVal,
-                                           const int *xInd,
-                                           cusparseIndexBase_t idxBase) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, const float *, float *, const int *,
-      cusparseIndexBase_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSgthr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, nnz, y, xVal, xInd, idxBase);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDgthr(cusparseHandle_t handle, int nnz,
-                                           const double *y, double *xVal,
-                                           const int *xInd,
-                                           cusparseIndexBase_t idxBase) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, const double *, double *, const int *,
-      cusparseIndexBase_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDgthr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, nnz, y, xVal, xInd, idxBase);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCgthr(cusparseHandle_t handle, int nnz,
-                                           const cuComplex *y, cuComplex *xVal,
-                                           const int *xInd,
-                                           cusparseIndexBase_t idxBase) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, const cuComplex *, cuComplex *, const int *,
-      cusparseIndexBase_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCgthr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, nnz, y, xVal, xInd, idxBase);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZgthr(cusparseHandle_t handle, int nnz,
-                                           const cuDoubleComplex *y,
-                                           cuDoubleComplex *xVal,
-                                           const int *xInd,
-                                           cusparseIndexBase_t idxBase) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, const cuDoubleComplex *, cuDoubleComplex *,
-      const int *, cusparseIndexBase_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZgthr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, nnz, y, xVal, xInd, idxBase);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSgthrz(cusparseHandle_t handle, int nnz,
-                                            float *y, float *xVal,
-                                            const int *xInd,
-                                            cusparseIndexBase_t idxBase) {
-  using FuncPtr =
-      cusparseStatus_t(CUSPARSEAPI *)(cusparseHandle_t, int, float *, float *,
-                                      const int *, cusparseIndexBase_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSgthrz");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, nnz, y, xVal, xInd, idxBase);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDgthrz(cusparseHandle_t handle, int nnz,
-                                            double *y, double *xVal,
-                                            const int *xInd,
-                                            cusparseIndexBase_t idxBase) {
-  using FuncPtr =
-      cusparseStatus_t(CUSPARSEAPI *)(cusparseHandle_t, int, double *, double *,
-                                      const int *, cusparseIndexBase_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDgthrz");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, nnz, y, xVal, xInd, idxBase);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCgthrz(cusparseHandle_t handle, int nnz,
-                                            cuComplex *y, cuComplex *xVal,
-                                            const int *xInd,
-                                            cusparseIndexBase_t idxBase) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, cuComplex *, cuComplex *, const int *,
-      cusparseIndexBase_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCgthrz");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, nnz, y, xVal, xInd, idxBase);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZgthrz(cusparseHandle_t handle, int nnz,
-                                            cuDoubleComplex *y,
-                                            cuDoubleComplex *xVal,
-                                            const int *xInd,
-                                            cusparseIndexBase_t idxBase) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, cuDoubleComplex *, cuDoubleComplex *, const int *,
-      cusparseIndexBase_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZgthrz");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, nnz, y, xVal, xInd, idxBase);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSsctr(cusparseHandle_t handle, int nnz,
-                                           const float *xVal, const int *xInd,
-                                           float *y,
-                                           cusparseIndexBase_t idxBase) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(cusparseHandle_t, int,
-                                                  const float *, const int *,
-                                                  float *, cusparseIndexBase_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSsctr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, nnz, xVal, xInd, y, idxBase);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDsctr(cusparseHandle_t handle, int nnz,
-                                           const double *xVal, const int *xInd,
-                                           double *y,
-                                           cusparseIndexBase_t idxBase) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, const double *, const int *, double *,
-      cusparseIndexBase_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDsctr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, nnz, xVal, xInd, y, idxBase);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCsctr(cusparseHandle_t handle, int nnz,
-                                           const cuComplex *xVal,
-                                           const int *xInd, cuComplex *y,
-                                           cusparseIndexBase_t idxBase) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, const cuComplex *, const int *, cuComplex *,
-      cusparseIndexBase_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCsctr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, nnz, xVal, xInd, y, idxBase);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZsctr(cusparseHandle_t handle, int nnz,
-                                           const cuDoubleComplex *xVal,
-                                           const int *xInd, cuDoubleComplex *y,
-                                           cusparseIndexBase_t idxBase) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, const cuDoubleComplex *, const int *,
-      cuDoubleComplex *, cusparseIndexBase_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZsctr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, nnz, xVal, xInd, y, idxBase);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSroti(cusparseHandle_t handle, int nnz,
-                                           float *xVal, const int *xInd,
-                                           float *y, const float *c,
-                                           const float *s,
-                                           cusparseIndexBase_t idxBase) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, float *, const int *, float *, const float *,
-      const float *, cusparseIndexBase_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSroti");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, nnz, xVal, xInd, y, c, s, idxBase);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDroti(cusparseHandle_t handle, int nnz,
-                                           double *xVal, const int *xInd,
-                                           double *y, const double *c,
-                                           const double *s,
-                                           cusparseIndexBase_t idxBase) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, double *, const int *, double *, const double *,
-      const double *, cusparseIndexBase_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDroti");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, nnz, xVal, xInd, y, c, s, idxBase);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseSgemvi(cusparseHandle_t handle, cusparseOperation_t transA, int m,
-               int n, const float *alpha, const float *A, int lda, int nnz,
-               const float *xVal, const int *xInd, const float *beta, float *y,
-               cusparseIndexBase_t idxBase, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, const float *,
-      const float *, int, int, const float *, const int *, const float *,
-      float *, cusparseIndexBase_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSgemvi");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, n, alpha, A, lda, nnz, xVal, xInd, beta, y,
-                  idxBase, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseSgemvi_bufferSize(cusparseHandle_t handle, cusparseOperation_t transA,
-                          int m, int n, int nnz, int *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSgemvi_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, n, nnz, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseDgemvi(cusparseHandle_t handle, cusparseOperation_t transA, int m,
-               int n, const double *alpha, const double *A, int lda, int nnz,
-               const double *xVal, const int *xInd, const double *beta,
-               double *y, cusparseIndexBase_t idxBase, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, const double *,
-      const double *, int, int, const double *, const int *, const double *,
-      double *, cusparseIndexBase_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDgemvi");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, n, alpha, A, lda, nnz, xVal, xInd, beta, y,
-                  idxBase, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseDgemvi_bufferSize(cusparseHandle_t handle, cusparseOperation_t transA,
-                          int m, int n, int nnz, int *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDgemvi_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, n, nnz, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCgemvi(
-    cusparseHandle_t handle, cusparseOperation_t transA, int m, int n,
-    const cuComplex *alpha, const cuComplex *A, int lda, int nnz,
-    const cuComplex *xVal, const int *xInd, const cuComplex *beta, cuComplex *y,
-    cusparseIndexBase_t idxBase, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, const cuComplex *,
-      const cuComplex *, int, int, const cuComplex *, const int *,
-      const cuComplex *, cuComplex *, cusparseIndexBase_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCgemvi");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, n, alpha, A, lda, nnz, xVal, xInd, beta, y,
-                  idxBase, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseCgemvi_bufferSize(cusparseHandle_t handle, cusparseOperation_t transA,
-                          int m, int n, int nnz, int *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCgemvi_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, n, nnz, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZgemvi(
-    cusparseHandle_t handle, cusparseOperation_t transA, int m, int n,
-    const cuDoubleComplex *alpha, const cuDoubleComplex *A, int lda, int nnz,
-    const cuDoubleComplex *xVal, const int *xInd, const cuDoubleComplex *beta,
-    cuDoubleComplex *y, cusparseIndexBase_t idxBase, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, const cuDoubleComplex *,
-      const cuDoubleComplex *, int, int, const cuDoubleComplex *, const int *,
-      const cuDoubleComplex *, cuDoubleComplex *, cusparseIndexBase_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZgemvi");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, n, alpha, A, lda, nnz, xVal, xInd, beta, y,
-                  idxBase, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseZgemvi_bufferSize(cusparseHandle_t handle, cusparseOperation_t transA,
-                          int m, int n, int nnz, int *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZgemvi_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, n, nnz, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsrmv(
-    cusparseHandle_t handle, cusparseOperation_t transA, int m, int n, int nnz,
-    const float *alpha, const cusparseMatDescr_t descrA,
-    const float *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, const float *x, const float *beta, float *y) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, int, const float *,
-      const cusparseMatDescr_t, const float *, const int *, const int *,
-      const float *, const float *, float *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsrmv");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, n, nnz, alpha, descrA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, x, beta, y);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseDcsrmv(cusparseHandle_t handle, cusparseOperation_t transA, int m,
-               int n, int nnz, const double *alpha,
-               const cusparseMatDescr_t descrA, const double *csrSortedValA,
-               const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-               const double *x, const double *beta, double *y) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, int, const double *,
-      const cusparseMatDescr_t, const double *, const int *, const int *,
-      const double *, const double *, double *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsrmv");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, n, nnz, alpha, descrA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, x, beta, y);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseCcsrmv(cusparseHandle_t handle, cusparseOperation_t transA, int m,
-               int n, int nnz, const cuComplex *alpha,
-               const cusparseMatDescr_t descrA, const cuComplex *csrSortedValA,
-               const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-               const cuComplex *x, const cuComplex *beta, cuComplex *y) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, int, const cuComplex *,
-      const cusparseMatDescr_t, const cuComplex *, const int *, const int *,
-      const cuComplex *, const cuComplex *, cuComplex *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsrmv");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, n, nnz, alpha, descrA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, x, beta, y);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsrmv(
-    cusparseHandle_t handle, cusparseOperation_t transA, int m, int n, int nnz,
-    const cuDoubleComplex *alpha, const cusparseMatDescr_t descrA,
-    const cuDoubleComplex *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, const cuDoubleComplex *x,
-    const cuDoubleComplex *beta, cuDoubleComplex *y) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, int,
-      const cuDoubleComplex *, const cusparseMatDescr_t,
-      const cuDoubleComplex *, const int *, const int *,
-      const cuDoubleComplex *, const cuDoubleComplex *, cuDoubleComplex *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsrmv");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, n, nnz, alpha, descrA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, x, beta, y);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCsrmvEx_bufferSize(
-    cusparseHandle_t handle, cusparseAlgMode_t alg, cusparseOperation_t transA,
-    int m, int n, int nnz, const void *alpha, cudaDataType alphatype,
-    const cusparseMatDescr_t descrA, const void *csrValA,
-    cudaDataType csrValAtype, const int *csrRowPtrA, const int *csrColIndA,
-    const void *x, cudaDataType xtype, const void *beta, cudaDataType betatype,
-    void *y, cudaDataType ytype, cudaDataType executiontype,
-    size_t *bufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseAlgMode_t, cusparseOperation_t, int, int, int,
-      const void *, cudaDataType, const cusparseMatDescr_t, const void *,
-      cudaDataType, const int *, const int *, const void *, cudaDataType,
-      const void *, cudaDataType, void *, cudaDataType, cudaDataType, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCsrmvEx_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, alg, transA, m, n, nnz, alpha, alphatype, descrA,
-                  csrValA, csrValAtype, csrRowPtrA, csrColIndA, x, xtype, beta,
-                  betatype, y, ytype, executiontype, bufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCsrmvEx(
-    cusparseHandle_t handle, cusparseAlgMode_t alg, cusparseOperation_t transA,
-    int m, int n, int nnz, const void *alpha, cudaDataType alphatype,
-    const cusparseMatDescr_t descrA, const void *csrValA,
-    cudaDataType csrValAtype, const int *csrRowPtrA, const int *csrColIndA,
-    const void *x, cudaDataType xtype, const void *beta, cudaDataType betatype,
-    void *y, cudaDataType ytype, cudaDataType executiontype, void *buffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseAlgMode_t, cusparseOperation_t, int, int, int,
-      const void *, cudaDataType, const cusparseMatDescr_t, const void *,
-      cudaDataType, const int *, const int *, const void *, cudaDataType,
-      const void *, cudaDataType, void *, cudaDataType, cudaDataType, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCsrmvEx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, alg, transA, m, n, nnz, alpha, alphatype, descrA,
-                  csrValA, csrValAtype, csrRowPtrA, csrColIndA, x, xtype, beta,
-                  betatype, y, ytype, executiontype, buffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsrmv_mp(
-    cusparseHandle_t handle, cusparseOperation_t transA, int m, int n, int nnz,
-    const float *alpha, const cusparseMatDescr_t descrA,
-    const float *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, const float *x, const float *beta, float *y) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, int, const float *,
-      const cusparseMatDescr_t, const float *, const int *, const int *,
-      const float *, const float *, float *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsrmv_mp");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, n, nnz, alpha, descrA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, x, beta, y);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseDcsrmv_mp(cusparseHandle_t handle, cusparseOperation_t transA, int m,
-                  int n, int nnz, const double *alpha,
-                  const cusparseMatDescr_t descrA, const double *csrSortedValA,
-                  const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-                  const double *x, const double *beta, double *y) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, int, const double *,
-      const cusparseMatDescr_t, const double *, const int *, const int *,
-      const double *, const double *, double *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsrmv_mp");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, n, nnz, alpha, descrA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, x, beta, y);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsrmv_mp(
-    cusparseHandle_t handle, cusparseOperation_t transA, int m, int n, int nnz,
-    const cuComplex *alpha, const cusparseMatDescr_t descrA,
-    const cuComplex *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, const cuComplex *x, const cuComplex *beta,
-    cuComplex *y) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, int, const cuComplex *,
-      const cusparseMatDescr_t, const cuComplex *, const int *, const int *,
-      const cuComplex *, const cuComplex *, cuComplex *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsrmv_mp");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, n, nnz, alpha, descrA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, x, beta, y);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsrmv_mp(
-    cusparseHandle_t handle, cusparseOperation_t transA, int m, int n, int nnz,
-    const cuDoubleComplex *alpha, const cusparseMatDescr_t descrA,
-    const cuDoubleComplex *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, const cuDoubleComplex *x,
-    const cuDoubleComplex *beta, cuDoubleComplex *y) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, int,
-      const cuDoubleComplex *, const cusparseMatDescr_t,
-      const cuDoubleComplex *, const int *, const int *,
-      const cuDoubleComplex *, const cuDoubleComplex *, cuDoubleComplex *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsrmv_mp");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, n, nnz, alpha, descrA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, x, beta, y);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseShybmv(
-    cusparseHandle_t handle, cusparseOperation_t transA, const float *alpha,
-    const cusparseMatDescr_t descrA, const cusparseHybMat_t hybA,
-    const float *x, const float *beta, float *y) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, const float *,
-      const cusparseMatDescr_t, const cusparseHybMat_t, const float *,
-      const float *, float *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseShybmv");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, alpha, descrA, hybA, x, beta, y);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDhybmv(
-    cusparseHandle_t handle, cusparseOperation_t transA, const double *alpha,
-    const cusparseMatDescr_t descrA, const cusparseHybMat_t hybA,
-    const double *x, const double *beta, double *y) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, const double *,
-      const cusparseMatDescr_t, const cusparseHybMat_t, const double *,
-      const double *, double *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDhybmv");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, alpha, descrA, hybA, x, beta, y);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseChybmv(
-    cusparseHandle_t handle, cusparseOperation_t transA, const cuComplex *alpha,
-    const cusparseMatDescr_t descrA, const cusparseHybMat_t hybA,
-    const cuComplex *x, const cuComplex *beta, cuComplex *y) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, const cuComplex *,
-      const cusparseMatDescr_t, const cusparseHybMat_t, const cuComplex *,
-      const cuComplex *, cuComplex *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseChybmv");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, alpha, descrA, hybA, x, beta, y);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseZhybmv(cusparseHandle_t handle, cusparseOperation_t transA,
-               const cuDoubleComplex *alpha, const cusparseMatDescr_t descrA,
-               const cusparseHybMat_t hybA, const cuDoubleComplex *x,
-               const cuDoubleComplex *beta, cuDoubleComplex *y) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, const cuDoubleComplex *,
-      const cusparseMatDescr_t, const cusparseHybMat_t, const cuDoubleComplex *,
-      const cuDoubleComplex *, cuDoubleComplex *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZhybmv");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, alpha, descrA, hybA, x, beta, y);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSbsrmv(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, int mb, int nb, int nnzb, const float *alpha,
-    const cusparseMatDescr_t descrA, const float *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int blockDim,
-    const float *x, const float *beta, float *y) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, int, int, int,
-      const float *, const cusparseMatDescr_t, const float *, const int *,
-      const int *, int, const float *, const float *, float *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSbsrmv");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, mb, nb, nnzb, alpha, descrA,
-                  bsrSortedValA, bsrSortedRowPtrA, bsrSortedColIndA, blockDim,
-                  x, beta, y);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDbsrmv(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, int mb, int nb, int nnzb, const double *alpha,
-    const cusparseMatDescr_t descrA, const double *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int blockDim,
-    const double *x, const double *beta, double *y) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, int, int, int,
-      const double *, const cusparseMatDescr_t, const double *, const int *,
-      const int *, int, const double *, const double *, double *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDbsrmv");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, mb, nb, nnzb, alpha, descrA,
-                  bsrSortedValA, bsrSortedRowPtrA, bsrSortedColIndA, blockDim,
-                  x, beta, y);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseCbsrmv(cusparseHandle_t handle, cusparseDirection_t dirA,
-               cusparseOperation_t transA, int mb, int nb, int nnzb,
-               const cuComplex *alpha, const cusparseMatDescr_t descrA,
-               const cuComplex *bsrSortedValA, const int *bsrSortedRowPtrA,
-               const int *bsrSortedColIndA, int blockDim, const cuComplex *x,
-               const cuComplex *beta, cuComplex *y) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, int, int, int,
-      const cuComplex *, const cusparseMatDescr_t, const cuComplex *,
-      const int *, const int *, int, const cuComplex *, const cuComplex *,
-      cuComplex *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCbsrmv");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, mb, nb, nnzb, alpha, descrA,
-                  bsrSortedValA, bsrSortedRowPtrA, bsrSortedColIndA, blockDim,
-                  x, beta, y);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZbsrmv(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, int mb, int nb, int nnzb,
-    const cuDoubleComplex *alpha, const cusparseMatDescr_t descrA,
-    const cuDoubleComplex *bsrSortedValA, const int *bsrSortedRowPtrA,
-    const int *bsrSortedColIndA, int blockDim, const cuDoubleComplex *x,
-    const cuDoubleComplex *beta, cuDoubleComplex *y) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, int, int, int,
-      const cuDoubleComplex *, const cusparseMatDescr_t,
-      const cuDoubleComplex *, const int *, const int *, int,
-      const cuDoubleComplex *, const cuDoubleComplex *, cuDoubleComplex *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZbsrmv");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, mb, nb, nnzb, alpha, descrA,
-                  bsrSortedValA, bsrSortedRowPtrA, bsrSortedColIndA, blockDim,
-                  x, beta, y);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseSbsrxmv(cusparseHandle_t handle, cusparseDirection_t dirA,
-                cusparseOperation_t transA, int sizeOfMask, int mb, int nb,
-                int nnzb, const float *alpha, const cusparseMatDescr_t descrA,
-                const float *bsrSortedValA, const int *bsrSortedMaskPtrA,
-                const int *bsrSortedRowPtrA, const int *bsrSortedEndPtrA,
-                const int *bsrSortedColIndA, int blockDim, const float *x,
-                const float *beta, float *y) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, int, int, int,
-      int, const float *, const cusparseMatDescr_t, const float *, const int *,
-      const int *, const int *, const int *, int, const float *, const float *,
-      float *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSbsrxmv");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, sizeOfMask, mb, nb, nnzb, alpha, descrA,
-                  bsrSortedValA, bsrSortedMaskPtrA, bsrSortedRowPtrA,
-                  bsrSortedEndPtrA, bsrSortedColIndA, blockDim, x, beta, y);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseDbsrxmv(cusparseHandle_t handle, cusparseDirection_t dirA,
-                cusparseOperation_t transA, int sizeOfMask, int mb, int nb,
-                int nnzb, const double *alpha, const cusparseMatDescr_t descrA,
-                const double *bsrSortedValA, const int *bsrSortedMaskPtrA,
-                const int *bsrSortedRowPtrA, const int *bsrSortedEndPtrA,
-                const int *bsrSortedColIndA, int blockDim, const double *x,
-                const double *beta, double *y) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, int, int, int,
-      int, const double *, const cusparseMatDescr_t, const double *,
-      const int *, const int *, const int *, const int *, int, const double *,
-      const double *, double *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDbsrxmv");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, sizeOfMask, mb, nb, nnzb, alpha, descrA,
-                  bsrSortedValA, bsrSortedMaskPtrA, bsrSortedRowPtrA,
-                  bsrSortedEndPtrA, bsrSortedColIndA, blockDim, x, beta, y);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCbsrxmv(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, int sizeOfMask, int mb, int nb, int nnzb,
-    const cuComplex *alpha, const cusparseMatDescr_t descrA,
-    const cuComplex *bsrSortedValA, const int *bsrSortedMaskPtrA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedEndPtrA,
-    const int *bsrSortedColIndA, int blockDim, const cuComplex *x,
-    const cuComplex *beta, cuComplex *y) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, int, int, int,
-      int, const cuComplex *, const cusparseMatDescr_t, const cuComplex *,
-      const int *, const int *, const int *, const int *, int,
-      const cuComplex *, const cuComplex *, cuComplex *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCbsrxmv");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, sizeOfMask, mb, nb, nnzb, alpha, descrA,
-                  bsrSortedValA, bsrSortedMaskPtrA, bsrSortedRowPtrA,
-                  bsrSortedEndPtrA, bsrSortedColIndA, blockDim, x, beta, y);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZbsrxmv(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, int sizeOfMask, int mb, int nb, int nnzb,
-    const cuDoubleComplex *alpha, const cusparseMatDescr_t descrA,
-    const cuDoubleComplex *bsrSortedValA, const int *bsrSortedMaskPtrA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedEndPtrA,
-    const int *bsrSortedColIndA, int blockDim, const cuDoubleComplex *x,
-    const cuDoubleComplex *beta, cuDoubleComplex *y) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, int, int, int,
-      int, const cuDoubleComplex *, const cusparseMatDescr_t,
-      const cuDoubleComplex *, const int *, const int *, const int *,
-      const int *, int, const cuDoubleComplex *, const cuDoubleComplex *,
-      cuDoubleComplex *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZbsrxmv");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, sizeOfMask, mb, nb, nnzb, alpha, descrA,
-                  bsrSortedValA, bsrSortedMaskPtrA, bsrSortedRowPtrA,
-                  bsrSortedEndPtrA, bsrSortedColIndA, blockDim, x, beta, y);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCsrsv_analysisEx(
-    cusparseHandle_t handle, cusparseOperation_t transA, int m, int nnz,
-    const cusparseMatDescr_t descrA, const void *csrSortedValA,
-    cudaDataType csrSortedValAtype, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, cusparseSolveAnalysisInfo_t info,
-    cudaDataType executiontype) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, const cusparseMatDescr_t,
-      const void *, cudaDataType, const int *, const int *,
-      cusparseSolveAnalysisInfo_t, cudaDataType);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCsrsv_analysisEx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, nnz, descrA, csrSortedValA,
-                  csrSortedValAtype, csrSortedRowPtrA, csrSortedColIndA, info,
-                  executiontype);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsrsv_analysis(
-    cusparseHandle_t handle, cusparseOperation_t transA, int m, int nnz,
-    const cusparseMatDescr_t descrA, const float *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-    cusparseSolveAnalysisInfo_t info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, const cusparseMatDescr_t,
-      const float *, const int *, const int *, cusparseSolveAnalysisInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsrsv_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, nnz, descrA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, info);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsrsv_analysis(
-    cusparseHandle_t handle, cusparseOperation_t transA, int m, int nnz,
-    const cusparseMatDescr_t descrA, const double *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-    cusparseSolveAnalysisInfo_t info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, const cusparseMatDescr_t,
-      const double *, const int *, const int *, cusparseSolveAnalysisInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsrsv_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, nnz, descrA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, info);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsrsv_analysis(
-    cusparseHandle_t handle, cusparseOperation_t transA, int m, int nnz,
-    const cusparseMatDescr_t descrA, const cuComplex *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-    cusparseSolveAnalysisInfo_t info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, const cusparseMatDescr_t,
-      const cuComplex *, const int *, const int *, cusparseSolveAnalysisInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsrsv_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, nnz, descrA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, info);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsrsv_analysis(
-    cusparseHandle_t handle, cusparseOperation_t transA, int m, int nnz,
-    const cusparseMatDescr_t descrA, const cuDoubleComplex *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-    cusparseSolveAnalysisInfo_t info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, const cusparseMatDescr_t,
-      const cuDoubleComplex *, const int *, const int *,
-      cusparseSolveAnalysisInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsrsv_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, nnz, descrA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, info);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCsrsv_solveEx(
-    cusparseHandle_t handle, cusparseOperation_t transA, int m,
-    const void *alpha, cudaDataType alphatype, const cusparseMatDescr_t descrA,
-    const void *csrSortedValA, cudaDataType csrSortedValAtype,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-    cusparseSolveAnalysisInfo_t info, const void *f, cudaDataType ftype,
-    void *x, cudaDataType xtype, cudaDataType executiontype) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, const void *, cudaDataType,
-      const cusparseMatDescr_t, const void *, cudaDataType, const int *,
-      const int *, cusparseSolveAnalysisInfo_t, const void *, cudaDataType,
-      void *, cudaDataType, cudaDataType);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCsrsv_solveEx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, alpha, alphatype, descrA, csrSortedValA,
-                  csrSortedValAtype, csrSortedRowPtrA, csrSortedColIndA, info,
-                  f, ftype, x, xtype, executiontype);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsrsv_solve(
-    cusparseHandle_t handle, cusparseOperation_t transA, int m,
-    const float *alpha, const cusparseMatDescr_t descrA,
-    const float *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, cusparseSolveAnalysisInfo_t info,
-    const float *f, float *x) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, const float *,
-      const cusparseMatDescr_t, const float *, const int *, const int *,
-      cusparseSolveAnalysisInfo_t, const float *, float *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsrsv_solve");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, alpha, descrA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, info, f, x);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsrsv_solve(
-    cusparseHandle_t handle, cusparseOperation_t transA, int m,
-    const double *alpha, const cusparseMatDescr_t descrA,
-    const double *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, cusparseSolveAnalysisInfo_t info,
-    const double *f, double *x) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, const double *,
-      const cusparseMatDescr_t, const double *, const int *, const int *,
-      cusparseSolveAnalysisInfo_t, const double *, double *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsrsv_solve");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, alpha, descrA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, info, f, x);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsrsv_solve(
-    cusparseHandle_t handle, cusparseOperation_t transA, int m,
-    const cuComplex *alpha, const cusparseMatDescr_t descrA,
-    const cuComplex *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, cusparseSolveAnalysisInfo_t info,
-    const cuComplex *f, cuComplex *x) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, const cuComplex *,
-      const cusparseMatDescr_t, const cuComplex *, const int *, const int *,
-      cusparseSolveAnalysisInfo_t, const cuComplex *, cuComplex *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsrsv_solve");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, alpha, descrA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, info, f, x);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsrsv_solve(
-    cusparseHandle_t handle, cusparseOperation_t transA, int m,
-    const cuDoubleComplex *alpha, const cusparseMatDescr_t descrA,
-    const cuDoubleComplex *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, cusparseSolveAnalysisInfo_t info,
-    const cuDoubleComplex *f, cuDoubleComplex *x) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, const cuDoubleComplex *,
-      const cusparseMatDescr_t, const cuDoubleComplex *, const int *,
-      const int *, cusparseSolveAnalysisInfo_t, const cuDoubleComplex *,
-      cuDoubleComplex *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsrsv_solve");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, alpha, descrA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, info, f, x);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseXcsrsv2_zeroPivot(cusparseHandle_t handle,
-                                                       csrsv2Info_t info,
-                                                       int *position) {
-  using FuncPtr =
-      cusparseStatus_t(CUSPARSEAPI *)(cusparseHandle_t, csrsv2Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseXcsrsv2_zeroPivot");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, info, position);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsrsv2_bufferSize(
-    cusparseHandle_t handle, cusparseOperation_t transA, int m, int nnz,
-    const cusparseMatDescr_t descrA, float *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA, csrsv2Info_t info,
-    int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, const cusparseMatDescr_t,
-      float *, const int *, const int *, csrsv2Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsrsv2_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, nnz, descrA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, info, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsrsv2_bufferSize(
-    cusparseHandle_t handle, cusparseOperation_t transA, int m, int nnz,
-    const cusparseMatDescr_t descrA, double *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA, csrsv2Info_t info,
-    int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, const cusparseMatDescr_t,
-      double *, const int *, const int *, csrsv2Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsrsv2_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, nnz, descrA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, info, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsrsv2_bufferSize(
-    cusparseHandle_t handle, cusparseOperation_t transA, int m, int nnz,
-    const cusparseMatDescr_t descrA, cuComplex *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA, csrsv2Info_t info,
-    int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, const cusparseMatDescr_t,
-      cuComplex *, const int *, const int *, csrsv2Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsrsv2_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, nnz, descrA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, info, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsrsv2_bufferSize(
-    cusparseHandle_t handle, cusparseOperation_t transA, int m, int nnz,
-    const cusparseMatDescr_t descrA, cuDoubleComplex *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA, csrsv2Info_t info,
-    int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, const cusparseMatDescr_t,
-      cuDoubleComplex *, const int *, const int *, csrsv2Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsrsv2_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, nnz, descrA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, info, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsrsv2_bufferSizeExt(
-    cusparseHandle_t handle, cusparseOperation_t transA, int m, int nnz,
-    const cusparseMatDescr_t descrA, float *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA, csrsv2Info_t info,
-    size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, const cusparseMatDescr_t,
-      float *, const int *, const int *, csrsv2Info_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsrsv2_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, nnz, descrA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, info, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsrsv2_bufferSizeExt(
-    cusparseHandle_t handle, cusparseOperation_t transA, int m, int nnz,
-    const cusparseMatDescr_t descrA, double *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA, csrsv2Info_t info,
-    size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, const cusparseMatDescr_t,
-      double *, const int *, const int *, csrsv2Info_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsrsv2_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, nnz, descrA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, info, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsrsv2_bufferSizeExt(
-    cusparseHandle_t handle, cusparseOperation_t transA, int m, int nnz,
-    const cusparseMatDescr_t descrA, cuComplex *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA, csrsv2Info_t info,
-    size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, const cusparseMatDescr_t,
-      cuComplex *, const int *, const int *, csrsv2Info_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsrsv2_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, nnz, descrA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, info, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsrsv2_bufferSizeExt(
-    cusparseHandle_t handle, cusparseOperation_t transA, int m, int nnz,
-    const cusparseMatDescr_t descrA, cuDoubleComplex *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA, csrsv2Info_t info,
-    size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, const cusparseMatDescr_t,
-      cuDoubleComplex *, const int *, const int *, csrsv2Info_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsrsv2_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, nnz, descrA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, info, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsrsv2_analysis(
-    cusparseHandle_t handle, cusparseOperation_t transA, int m, int nnz,
-    const cusparseMatDescr_t descrA, const float *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA, csrsv2Info_t info,
-    cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, const cusparseMatDescr_t,
-      const float *, const int *, const int *, csrsv2Info_t,
-      cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsrsv2_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, nnz, descrA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsrsv2_analysis(
-    cusparseHandle_t handle, cusparseOperation_t transA, int m, int nnz,
-    const cusparseMatDescr_t descrA, const double *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA, csrsv2Info_t info,
-    cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, const cusparseMatDescr_t,
-      const double *, const int *, const int *, csrsv2Info_t,
-      cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsrsv2_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, nnz, descrA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsrsv2_analysis(
-    cusparseHandle_t handle, cusparseOperation_t transA, int m, int nnz,
-    const cusparseMatDescr_t descrA, const cuComplex *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA, csrsv2Info_t info,
-    cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, const cusparseMatDescr_t,
-      const cuComplex *, const int *, const int *, csrsv2Info_t,
-      cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsrsv2_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, nnz, descrA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsrsv2_analysis(
-    cusparseHandle_t handle, cusparseOperation_t transA, int m, int nnz,
-    const cusparseMatDescr_t descrA, const cuDoubleComplex *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA, csrsv2Info_t info,
-    cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, const cusparseMatDescr_t,
-      const cuDoubleComplex *, const int *, const int *, csrsv2Info_t,
-      cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsrsv2_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, nnz, descrA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsrsv2_solve(
-    cusparseHandle_t handle, cusparseOperation_t transA, int m, int nnz,
-    const float *alpha, const cusparseMatDescr_t descrA,
-    const float *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, csrsv2Info_t info, const float *f, float *x,
-    cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, const float *,
-      const cusparseMatDescr_t, const float *, const int *, const int *,
-      csrsv2Info_t, const float *, float *, cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsrsv2_solve");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, nnz, alpha, descrA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, info, f, x, policy,
-                  pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsrsv2_solve(
-    cusparseHandle_t handle, cusparseOperation_t transA, int m, int nnz,
-    const double *alpha, const cusparseMatDescr_t descrA,
-    const double *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, csrsv2Info_t info, const double *f, double *x,
-    cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, const double *,
-      const cusparseMatDescr_t, const double *, const int *, const int *,
-      csrsv2Info_t, const double *, double *, cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsrsv2_solve");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, nnz, alpha, descrA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, info, f, x, policy,
-                  pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsrsv2_solve(
-    cusparseHandle_t handle, cusparseOperation_t transA, int m, int nnz,
-    const cuComplex *alpha, const cusparseMatDescr_t descrA,
-    const cuComplex *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, csrsv2Info_t info, const cuComplex *f,
-    cuComplex *x, cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, const cuComplex *,
-      const cusparseMatDescr_t, const cuComplex *, const int *, const int *,
-      csrsv2Info_t, const cuComplex *, cuComplex *, cusparseSolvePolicy_t,
-      void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsrsv2_solve");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, nnz, alpha, descrA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, info, f, x, policy,
-                  pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsrsv2_solve(
-    cusparseHandle_t handle, cusparseOperation_t transA, int m, int nnz,
-    const cuDoubleComplex *alpha, const cusparseMatDescr_t descrA,
-    const cuDoubleComplex *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, csrsv2Info_t info, const cuDoubleComplex *f,
-    cuDoubleComplex *x, cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, const cuDoubleComplex *,
-      const cusparseMatDescr_t, const cuDoubleComplex *, const int *,
-      const int *, csrsv2Info_t, const cuDoubleComplex *, cuDoubleComplex *,
-      cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsrsv2_solve");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, nnz, alpha, descrA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, info, f, x, policy,
-                  pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseXbsrsv2_zeroPivot(cusparseHandle_t handle,
-                                                       bsrsv2Info_t info,
-                                                       int *position) {
-  using FuncPtr =
-      cusparseStatus_t(CUSPARSEAPI *)(cusparseHandle_t, bsrsv2Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseXbsrsv2_zeroPivot");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, info, position);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSbsrsv2_bufferSize(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, float *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int blockDim,
-    bsrsv2Info_t info, int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, int, int,
-      const cusparseMatDescr_t, float *, const int *, const int *, int,
-      bsrsv2Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSbsrsv2_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, mb, nnzb, descrA, bsrSortedValA,
-                  bsrSortedRowPtrA, bsrSortedColIndA, blockDim, info,
-                  pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDbsrsv2_bufferSize(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, double *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int blockDim,
-    bsrsv2Info_t info, int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, int, int,
-      const cusparseMatDescr_t, double *, const int *, const int *, int,
-      bsrsv2Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDbsrsv2_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, mb, nnzb, descrA, bsrSortedValA,
-                  bsrSortedRowPtrA, bsrSortedColIndA, blockDim, info,
-                  pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCbsrsv2_bufferSize(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, cuComplex *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int blockDim,
-    bsrsv2Info_t info, int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, int, int,
-      const cusparseMatDescr_t, cuComplex *, const int *, const int *, int,
-      bsrsv2Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCbsrsv2_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, mb, nnzb, descrA, bsrSortedValA,
-                  bsrSortedRowPtrA, bsrSortedColIndA, blockDim, info,
-                  pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZbsrsv2_bufferSize(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, cuDoubleComplex *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int blockDim,
-    bsrsv2Info_t info, int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, int, int,
-      const cusparseMatDescr_t, cuDoubleComplex *, const int *, const int *,
-      int, bsrsv2Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZbsrsv2_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, mb, nnzb, descrA, bsrSortedValA,
-                  bsrSortedRowPtrA, bsrSortedColIndA, blockDim, info,
-                  pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSbsrsv2_bufferSizeExt(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, float *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int blockSize,
-    bsrsv2Info_t info, size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, int, int,
-      const cusparseMatDescr_t, float *, const int *, const int *, int,
-      bsrsv2Info_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSbsrsv2_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, mb, nnzb, descrA, bsrSortedValA,
-                  bsrSortedRowPtrA, bsrSortedColIndA, blockSize, info,
-                  pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDbsrsv2_bufferSizeExt(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, double *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int blockSize,
-    bsrsv2Info_t info, size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, int, int,
-      const cusparseMatDescr_t, double *, const int *, const int *, int,
-      bsrsv2Info_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDbsrsv2_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, mb, nnzb, descrA, bsrSortedValA,
-                  bsrSortedRowPtrA, bsrSortedColIndA, blockSize, info,
-                  pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCbsrsv2_bufferSizeExt(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, cuComplex *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int blockSize,
-    bsrsv2Info_t info, size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, int, int,
-      const cusparseMatDescr_t, cuComplex *, const int *, const int *, int,
-      bsrsv2Info_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCbsrsv2_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, mb, nnzb, descrA, bsrSortedValA,
-                  bsrSortedRowPtrA, bsrSortedColIndA, blockSize, info,
-                  pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZbsrsv2_bufferSizeExt(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, cuDoubleComplex *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int blockSize,
-    bsrsv2Info_t info, size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, int, int,
-      const cusparseMatDescr_t, cuDoubleComplex *, const int *, const int *,
-      int, bsrsv2Info_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZbsrsv2_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, mb, nnzb, descrA, bsrSortedValA,
-                  bsrSortedRowPtrA, bsrSortedColIndA, blockSize, info,
-                  pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSbsrsv2_analysis(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, const float *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int blockDim,
-    bsrsv2Info_t info, cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, int, int,
-      const cusparseMatDescr_t, const float *, const int *, const int *, int,
-      bsrsv2Info_t, cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSbsrsv2_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, mb, nnzb, descrA, bsrSortedValA,
-                  bsrSortedRowPtrA, bsrSortedColIndA, blockDim, info, policy,
-                  pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDbsrsv2_analysis(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, const double *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int blockDim,
-    bsrsv2Info_t info, cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, int, int,
-      const cusparseMatDescr_t, const double *, const int *, const int *, int,
-      bsrsv2Info_t, cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDbsrsv2_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, mb, nnzb, descrA, bsrSortedValA,
-                  bsrSortedRowPtrA, bsrSortedColIndA, blockDim, info, policy,
-                  pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCbsrsv2_analysis(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, const cuComplex *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int blockDim,
-    bsrsv2Info_t info, cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, int, int,
-      const cusparseMatDescr_t, const cuComplex *, const int *, const int *,
-      int, bsrsv2Info_t, cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCbsrsv2_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, mb, nnzb, descrA, bsrSortedValA,
-                  bsrSortedRowPtrA, bsrSortedColIndA, blockDim, info, policy,
-                  pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZbsrsv2_analysis(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, const cuDoubleComplex *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int blockDim,
-    bsrsv2Info_t info, cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, int, int,
-      const cusparseMatDescr_t, const cuDoubleComplex *, const int *,
-      const int *, int, bsrsv2Info_t, cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZbsrsv2_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, mb, nnzb, descrA, bsrSortedValA,
-                  bsrSortedRowPtrA, bsrSortedColIndA, blockDim, info, policy,
-                  pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSbsrsv2_solve(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, int mb, int nnzb, const float *alpha,
-    const cusparseMatDescr_t descrA, const float *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int blockDim,
-    bsrsv2Info_t info, const float *f, float *x, cusparseSolvePolicy_t policy,
-    void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, int, int,
-      const float *, const cusparseMatDescr_t, const float *, const int *,
-      const int *, int, bsrsv2Info_t, const float *, float *,
-      cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSbsrsv2_solve");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, mb, nnzb, alpha, descrA, bsrSortedValA,
-                  bsrSortedRowPtrA, bsrSortedColIndA, blockDim, info, f, x,
-                  policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDbsrsv2_solve(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, int mb, int nnzb, const double *alpha,
-    const cusparseMatDescr_t descrA, const double *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int blockDim,
-    bsrsv2Info_t info, const double *f, double *x, cusparseSolvePolicy_t policy,
-    void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, int, int,
-      const double *, const cusparseMatDescr_t, const double *, const int *,
-      const int *, int, bsrsv2Info_t, const double *, double *,
-      cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDbsrsv2_solve");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, mb, nnzb, alpha, descrA, bsrSortedValA,
-                  bsrSortedRowPtrA, bsrSortedColIndA, blockDim, info, f, x,
-                  policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCbsrsv2_solve(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, int mb, int nnzb, const cuComplex *alpha,
-    const cusparseMatDescr_t descrA, const cuComplex *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int blockDim,
-    bsrsv2Info_t info, const cuComplex *f, cuComplex *x,
-    cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, int, int,
-      const cuComplex *, const cusparseMatDescr_t, const cuComplex *,
-      const int *, const int *, int, bsrsv2Info_t, const cuComplex *,
-      cuComplex *, cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCbsrsv2_solve");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, mb, nnzb, alpha, descrA, bsrSortedValA,
-                  bsrSortedRowPtrA, bsrSortedColIndA, blockDim, info, f, x,
-                  policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZbsrsv2_solve(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, int mb, int nnzb, const cuDoubleComplex *alpha,
-    const cusparseMatDescr_t descrA, const cuDoubleComplex *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int blockDim,
-    bsrsv2Info_t info, const cuDoubleComplex *f, cuDoubleComplex *x,
-    cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, int, int,
-      const cuDoubleComplex *, const cusparseMatDescr_t,
-      const cuDoubleComplex *, const int *, const int *, int, bsrsv2Info_t,
-      const cuDoubleComplex *, cuDoubleComplex *, cusparseSolvePolicy_t,
-      void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZbsrsv2_solve");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, mb, nnzb, alpha, descrA, bsrSortedValA,
-                  bsrSortedRowPtrA, bsrSortedColIndA, blockDim, info, f, x,
-                  policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseShybsv_analysis(cusparseHandle_t handle, cusparseOperation_t transA,
-                        const cusparseMatDescr_t descrA, cusparseHybMat_t hybA,
-                        cusparseSolveAnalysisInfo_t info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, const cusparseMatDescr_t,
-      cusparseHybMat_t, cusparseSolveAnalysisInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseShybsv_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, descrA, hybA, info);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseDhybsv_analysis(cusparseHandle_t handle, cusparseOperation_t transA,
-                        const cusparseMatDescr_t descrA, cusparseHybMat_t hybA,
-                        cusparseSolveAnalysisInfo_t info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, const cusparseMatDescr_t,
-      cusparseHybMat_t, cusparseSolveAnalysisInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDhybsv_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, descrA, hybA, info);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseChybsv_analysis(cusparseHandle_t handle, cusparseOperation_t transA,
-                        const cusparseMatDescr_t descrA, cusparseHybMat_t hybA,
-                        cusparseSolveAnalysisInfo_t info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, const cusparseMatDescr_t,
-      cusparseHybMat_t, cusparseSolveAnalysisInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseChybsv_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, descrA, hybA, info);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseZhybsv_analysis(cusparseHandle_t handle, cusparseOperation_t transA,
-                        const cusparseMatDescr_t descrA, cusparseHybMat_t hybA,
-                        cusparseSolveAnalysisInfo_t info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, const cusparseMatDescr_t,
-      cusparseHybMat_t, cusparseSolveAnalysisInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZhybsv_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, descrA, hybA, info);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseShybsv_solve(
-    cusparseHandle_t handle, cusparseOperation_t trans, const float *alpha,
-    const cusparseMatDescr_t descrA, const cusparseHybMat_t hybA,
-    cusparseSolveAnalysisInfo_t info, const float *f, float *x) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, const float *,
-      const cusparseMatDescr_t, const cusparseHybMat_t,
-      cusparseSolveAnalysisInfo_t, const float *, float *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseShybsv_solve");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, trans, alpha, descrA, hybA, info, f, x);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseChybsv_solve(
-    cusparseHandle_t handle, cusparseOperation_t trans, const cuComplex *alpha,
-    const cusparseMatDescr_t descrA, const cusparseHybMat_t hybA,
-    cusparseSolveAnalysisInfo_t info, const cuComplex *f, cuComplex *x) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, const cuComplex *,
-      const cusparseMatDescr_t, const cusparseHybMat_t,
-      cusparseSolveAnalysisInfo_t, const cuComplex *, cuComplex *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseChybsv_solve");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, trans, alpha, descrA, hybA, info, f, x);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDhybsv_solve(
-    cusparseHandle_t handle, cusparseOperation_t trans, const double *alpha,
-    const cusparseMatDescr_t descrA, const cusparseHybMat_t hybA,
-    cusparseSolveAnalysisInfo_t info, const double *f, double *x) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, const double *,
-      const cusparseMatDescr_t, const cusparseHybMat_t,
-      cusparseSolveAnalysisInfo_t, const double *, double *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDhybsv_solve");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, trans, alpha, descrA, hybA, info, f, x);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZhybsv_solve(
-    cusparseHandle_t handle, cusparseOperation_t trans,
-    const cuDoubleComplex *alpha, const cusparseMatDescr_t descrA,
-    const cusparseHybMat_t hybA, cusparseSolveAnalysisInfo_t info,
-    const cuDoubleComplex *f, cuDoubleComplex *x) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, const cuDoubleComplex *,
-      const cusparseMatDescr_t, const cusparseHybMat_t,
-      cusparseSolveAnalysisInfo_t, const cuDoubleComplex *, cuDoubleComplex *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZhybsv_solve");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, trans, alpha, descrA, hybA, info, f, x);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseScsrmm(cusparseHandle_t handle, cusparseOperation_t transA, int m,
-               int n, int k, int nnz, const float *alpha,
-               const cusparseMatDescr_t descrA, const float *csrSortedValA,
-               const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-               const float *B, int ldb, const float *beta, float *C, int ldc) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, int, int, const float *,
-      const cusparseMatDescr_t, const float *, const int *, const int *,
-      const float *, int, const float *, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsrmm");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, n, k, nnz, alpha, descrA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, B, ldb, beta, C, ldc);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsrmm(
-    cusparseHandle_t handle, cusparseOperation_t transA, int m, int n, int k,
-    int nnz, const double *alpha, const cusparseMatDescr_t descrA,
-    const double *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, const double *B, int ldb, const double *beta,
-    double *C, int ldc) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, int, int, const double *,
-      const cusparseMatDescr_t, const double *, const int *, const int *,
-      const double *, int, const double *, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsrmm");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, n, k, nnz, alpha, descrA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, B, ldb, beta, C, ldc);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsrmm(
-    cusparseHandle_t handle, cusparseOperation_t transA, int m, int n, int k,
-    int nnz, const cuComplex *alpha, const cusparseMatDescr_t descrA,
-    const cuComplex *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, const cuComplex *B, int ldb,
-    const cuComplex *beta, cuComplex *C, int ldc) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, int, int,
-      const cuComplex *, const cusparseMatDescr_t, const cuComplex *,
-      const int *, const int *, const cuComplex *, int, const cuComplex *,
-      cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsrmm");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, n, k, nnz, alpha, descrA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, B, ldb, beta, C, ldc);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsrmm(
-    cusparseHandle_t handle, cusparseOperation_t transA, int m, int n, int k,
-    int nnz, const cuDoubleComplex *alpha, const cusparseMatDescr_t descrA,
-    const cuDoubleComplex *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, const cuDoubleComplex *B, int ldb,
-    const cuDoubleComplex *beta, cuDoubleComplex *C, int ldc) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, int, int,
-      const cuDoubleComplex *, const cusparseMatDescr_t,
-      const cuDoubleComplex *, const int *, const int *,
-      const cuDoubleComplex *, int, const cuDoubleComplex *, cuDoubleComplex *,
-      int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsrmm");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, n, k, nnz, alpha, descrA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, B, ldb, beta, C, ldc);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseScsrmm2(cusparseHandle_t handle, cusparseOperation_t transA,
-                cusparseOperation_t transB, int m, int n, int k, int nnz,
-                const float *alpha, const cusparseMatDescr_t descrA,
-                const float *csrSortedValA, const int *csrSortedRowPtrA,
-                const int *csrSortedColIndA, const float *B, int ldb,
-                const float *beta, float *C, int ldc) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, cusparseOperation_t, int, int, int,
-      int, const float *, const cusparseMatDescr_t, const float *, const int *,
-      const int *, const float *, int, const float *, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsrmm2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, transB, m, n, k, nnz, alpha, descrA,
-                  csrSortedValA, csrSortedRowPtrA, csrSortedColIndA, B, ldb,
-                  beta, C, ldc);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseDcsrmm2(cusparseHandle_t handle, cusparseOperation_t transA,
-                cusparseOperation_t transB, int m, int n, int k, int nnz,
-                const double *alpha, const cusparseMatDescr_t descrA,
-                const double *csrSortedValA, const int *csrSortedRowPtrA,
-                const int *csrSortedColIndA, const double *B, int ldb,
-                const double *beta, double *C, int ldc) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, cusparseOperation_t, int, int, int,
-      int, const double *, const cusparseMatDescr_t, const double *,
-      const int *, const int *, const double *, int, const double *, double *,
-      int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsrmm2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, transB, m, n, k, nnz, alpha, descrA,
-                  csrSortedValA, csrSortedRowPtrA, csrSortedColIndA, B, ldb,
-                  beta, C, ldc);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseCcsrmm2(cusparseHandle_t handle, cusparseOperation_t transA,
-                cusparseOperation_t transB, int m, int n, int k, int nnz,
-                const cuComplex *alpha, const cusparseMatDescr_t descrA,
-                const cuComplex *csrSortedValA, const int *csrSortedRowPtrA,
-                const int *csrSortedColIndA, const cuComplex *B, int ldb,
-                const cuComplex *beta, cuComplex *C, int ldc) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, cusparseOperation_t, int, int, int,
-      int, const cuComplex *, const cusparseMatDescr_t, const cuComplex *,
-      const int *, const int *, const cuComplex *, int, const cuComplex *,
-      cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsrmm2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, transB, m, n, k, nnz, alpha, descrA,
-                  csrSortedValA, csrSortedRowPtrA, csrSortedColIndA, B, ldb,
-                  beta, C, ldc);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsrmm2(
-    cusparseHandle_t handle, cusparseOperation_t transA,
-    cusparseOperation_t transB, int m, int n, int k, int nnz,
-    const cuDoubleComplex *alpha, const cusparseMatDescr_t descrA,
-    const cuDoubleComplex *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, const cuDoubleComplex *B, int ldb,
-    const cuDoubleComplex *beta, cuDoubleComplex *C, int ldc) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, cusparseOperation_t, int, int, int,
-      int, const cuDoubleComplex *, const cusparseMatDescr_t,
-      const cuDoubleComplex *, const int *, const int *,
-      const cuDoubleComplex *, int, const cuDoubleComplex *, cuDoubleComplex *,
-      int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsrmm2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, transB, m, n, k, nnz, alpha, descrA,
-                  csrSortedValA, csrSortedRowPtrA, csrSortedColIndA, B, ldb,
-                  beta, C, ldc);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSbsrmm(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, cusparseOperation_t transB, int mb, int n,
-    int kb, int nnzb, const float *alpha, const cusparseMatDescr_t descrA,
-    const float *bsrSortedValA, const int *bsrSortedRowPtrA,
-    const int *bsrSortedColIndA, const int blockSize, const float *B,
-    const int ldb, const float *beta, float *C, int ldc) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t,
-      cusparseOperation_t, int, int, int, int, const float *,
-      const cusparseMatDescr_t, const float *, const int *, const int *,
-      const int, const float *, const int, const float *, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSbsrmm");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, transB, mb, n, kb, nnzb, alpha, descrA,
-                  bsrSortedValA, bsrSortedRowPtrA, bsrSortedColIndA, blockSize,
-                  B, ldb, beta, C, ldc);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDbsrmm(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, cusparseOperation_t transB, int mb, int n,
-    int kb, int nnzb, const double *alpha, const cusparseMatDescr_t descrA,
-    const double *bsrSortedValA, const int *bsrSortedRowPtrA,
-    const int *bsrSortedColIndA, const int blockSize, const double *B,
-    const int ldb, const double *beta, double *C, int ldc) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t,
-      cusparseOperation_t, int, int, int, int, const double *,
-      const cusparseMatDescr_t, const double *, const int *, const int *,
-      const int, const double *, const int, const double *, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDbsrmm");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, transB, mb, n, kb, nnzb, alpha, descrA,
-                  bsrSortedValA, bsrSortedRowPtrA, bsrSortedColIndA, blockSize,
-                  B, ldb, beta, C, ldc);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCbsrmm(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, cusparseOperation_t transB, int mb, int n,
-    int kb, int nnzb, const cuComplex *alpha, const cusparseMatDescr_t descrA,
-    const cuComplex *bsrSortedValA, const int *bsrSortedRowPtrA,
-    const int *bsrSortedColIndA, const int blockSize, const cuComplex *B,
-    const int ldb, const cuComplex *beta, cuComplex *C, int ldc) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t,
-      cusparseOperation_t, int, int, int, int, const cuComplex *,
-      const cusparseMatDescr_t, const cuComplex *, const int *, const int *,
-      const int, const cuComplex *, const int, const cuComplex *, cuComplex *,
-      int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCbsrmm");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, transB, mb, n, kb, nnzb, alpha, descrA,
-                  bsrSortedValA, bsrSortedRowPtrA, bsrSortedColIndA, blockSize,
-                  B, ldb, beta, C, ldc);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZbsrmm(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, cusparseOperation_t transB, int mb, int n,
-    int kb, int nnzb, const cuDoubleComplex *alpha,
-    const cusparseMatDescr_t descrA, const cuDoubleComplex *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA,
-    const int blockSize, const cuDoubleComplex *B, const int ldb,
-    const cuDoubleComplex *beta, cuDoubleComplex *C, int ldc) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t,
-      cusparseOperation_t, int, int, int, int, const cuDoubleComplex *,
-      const cusparseMatDescr_t, const cuDoubleComplex *, const int *,
-      const int *, const int, const cuDoubleComplex *, const int,
-      const cuDoubleComplex *, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZbsrmm");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, transB, mb, n, kb, nnzb, alpha, descrA,
-                  bsrSortedValA, bsrSortedRowPtrA, bsrSortedColIndA, blockSize,
-                  B, ldb, beta, C, ldc);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSgemmi(
-    cusparseHandle_t handle, int m, int n, int k, int nnz, const float *alpha,
-    const float *A, int lda, const float *cscValB, const int *cscColPtrB,
-    const int *cscRowIndB, const float *beta, float *C, int ldc) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, int, const float *, const float *, int,
-      const float *, const int *, const int *, const float *, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSgemmi");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, k, nnz, alpha, A, lda, cscValB, cscColPtrB,
-                  cscRowIndB, beta, C, ldc);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDgemmi(
-    cusparseHandle_t handle, int m, int n, int k, int nnz, const double *alpha,
-    const double *A, int lda, const double *cscValB, const int *cscColPtrB,
-    const int *cscRowIndB, const double *beta, double *C, int ldc) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, int, const double *, const double *, int,
-      const double *, const int *, const int *, const double *, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDgemmi");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, k, nnz, alpha, A, lda, cscValB, cscColPtrB,
-                  cscRowIndB, beta, C, ldc);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCgemmi(
-    cusparseHandle_t handle, int m, int n, int k, int nnz,
-    const cuComplex *alpha, const cuComplex *A, int lda,
-    const cuComplex *cscValB, const int *cscColPtrB, const int *cscRowIndB,
-    const cuComplex *beta, cuComplex *C, int ldc) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, int, const cuComplex *,
-      const cuComplex *, int, const cuComplex *, const int *, const int *,
-      const cuComplex *, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCgemmi");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, k, nnz, alpha, A, lda, cscValB, cscColPtrB,
-                  cscRowIndB, beta, C, ldc);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseZgemmi(cusparseHandle_t handle, int m, int n, int k, int nnz,
-               const cuDoubleComplex *alpha, const cuDoubleComplex *A, int lda,
-               const cuDoubleComplex *cscValB, const int *cscColPtrB,
-               const int *cscRowIndB, const cuDoubleComplex *beta,
-               cuDoubleComplex *C, int ldc) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, int, const cuDoubleComplex *,
-      const cuDoubleComplex *, int, const cuDoubleComplex *, const int *,
-      const int *, const cuDoubleComplex *, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZgemmi");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, k, nnz, alpha, A, lda, cscValB, cscColPtrB,
-                  cscRowIndB, beta, C, ldc);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsrsm_analysis(
-    cusparseHandle_t handle, cusparseOperation_t transA, int m, int nnz,
-    const cusparseMatDescr_t descrA, const float *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-    cusparseSolveAnalysisInfo_t info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, const cusparseMatDescr_t,
-      const float *, const int *, const int *, cusparseSolveAnalysisInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsrsm_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, nnz, descrA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, info);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsrsm_analysis(
-    cusparseHandle_t handle, cusparseOperation_t transA, int m, int nnz,
-    const cusparseMatDescr_t descrA, const double *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-    cusparseSolveAnalysisInfo_t info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, const cusparseMatDescr_t,
-      const double *, const int *, const int *, cusparseSolveAnalysisInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsrsm_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, nnz, descrA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, info);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsrsm_analysis(
-    cusparseHandle_t handle, cusparseOperation_t transA, int m, int nnz,
-    const cusparseMatDescr_t descrA, const cuComplex *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-    cusparseSolveAnalysisInfo_t info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, const cusparseMatDescr_t,
-      const cuComplex *, const int *, const int *, cusparseSolveAnalysisInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsrsm_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, nnz, descrA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, info);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsrsm_analysis(
-    cusparseHandle_t handle, cusparseOperation_t transA, int m, int nnz,
-    const cusparseMatDescr_t descrA, const cuDoubleComplex *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-    cusparseSolveAnalysisInfo_t info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, const cusparseMatDescr_t,
-      const cuDoubleComplex *, const int *, const int *,
-      cusparseSolveAnalysisInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsrsm_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, nnz, descrA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, info);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsrsm_solve(
-    cusparseHandle_t handle, cusparseOperation_t transA, int m, int n,
-    const float *alpha, const cusparseMatDescr_t descrA,
-    const float *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, cusparseSolveAnalysisInfo_t info,
-    const float *B, int ldb, float *X, int ldx) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, const float *,
-      const cusparseMatDescr_t, const float *, const int *, const int *,
-      cusparseSolveAnalysisInfo_t, const float *, int, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsrsm_solve");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, n, alpha, descrA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, info, B, ldb, X, ldx);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsrsm_solve(
-    cusparseHandle_t handle, cusparseOperation_t transA, int m, int n,
-    const double *alpha, const cusparseMatDescr_t descrA,
-    const double *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, cusparseSolveAnalysisInfo_t info,
-    const double *B, int ldb, double *X, int ldx) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, const double *,
-      const cusparseMatDescr_t, const double *, const int *, const int *,
-      cusparseSolveAnalysisInfo_t, const double *, int, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsrsm_solve");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, n, alpha, descrA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, info, B, ldb, X, ldx);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsrsm_solve(
-    cusparseHandle_t handle, cusparseOperation_t transA, int m, int n,
-    const cuComplex *alpha, const cusparseMatDescr_t descrA,
-    const cuComplex *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, cusparseSolveAnalysisInfo_t info,
-    const cuComplex *B, int ldb, cuComplex *X, int ldx) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, const cuComplex *,
-      const cusparseMatDescr_t, const cuComplex *, const int *, const int *,
-      cusparseSolveAnalysisInfo_t, const cuComplex *, int, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsrsm_solve");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, n, alpha, descrA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, info, B, ldb, X, ldx);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsrsm_solve(
-    cusparseHandle_t handle, cusparseOperation_t transA, int m, int n,
-    const cuDoubleComplex *alpha, const cusparseMatDescr_t descrA,
-    const cuDoubleComplex *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, cusparseSolveAnalysisInfo_t info,
-    const cuDoubleComplex *B, int ldb, cuDoubleComplex *X, int ldx) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, const cuDoubleComplex *,
-      const cusparseMatDescr_t, const cuDoubleComplex *, const int *,
-      const int *, cusparseSolveAnalysisInfo_t, const cuDoubleComplex *, int,
-      cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsrsm_solve");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, n, alpha, descrA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, info, B, ldb, X, ldx);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCreateCsrsm2Info(csrsm2Info_t *info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(csrsm2Info_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCreateCsrsm2Info");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(info);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDestroyCsrsm2Info(csrsm2Info_t info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(csrsm2Info_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDestroyCsrsm2Info");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(info);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseXcsrsm2_zeroPivot(cusparseHandle_t handle,
-                                                       csrsm2Info_t info,
-                                                       int *position) {
-  using FuncPtr =
-      cusparseStatus_t(CUSPARSEAPI *)(cusparseHandle_t, csrsm2Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseXcsrsm2_zeroPivot");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, info, position);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsrsm2_bufferSizeExt(
-    cusparseHandle_t handle, int algo, cusparseOperation_t transA,
-    cusparseOperation_t transB, int m, int nrhs, int nnz, const float *alpha,
-    const cusparseMatDescr_t descrA, const float *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA, const float *B,
-    int ldb, csrsm2Info_t info, cusparseSolvePolicy_t policy,
-    size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, cusparseOperation_t, cusparseOperation_t, int, int,
-      int, const float *, const cusparseMatDescr_t, const float *, const int *,
-      const int *, const float *, int, csrsm2Info_t, cusparseSolvePolicy_t,
-      size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsrsm2_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, algo, transA, transB, m, nrhs, nnz, alpha, descrA,
-                  csrSortedValA, csrSortedRowPtrA, csrSortedColIndA, B, ldb,
-                  info, policy, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsrsm2_bufferSizeExt(
-    cusparseHandle_t handle, int algo, cusparseOperation_t transA,
-    cusparseOperation_t transB, int m, int nrhs, int nnz, const double *alpha,
-    const cusparseMatDescr_t descrA, const double *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA, const double *B,
-    int ldb, csrsm2Info_t info, cusparseSolvePolicy_t policy,
-    size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, cusparseOperation_t, cusparseOperation_t, int, int,
-      int, const double *, const cusparseMatDescr_t, const double *,
-      const int *, const int *, const double *, int, csrsm2Info_t,
-      cusparseSolvePolicy_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsrsm2_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, algo, transA, transB, m, nrhs, nnz, alpha, descrA,
-                  csrSortedValA, csrSortedRowPtrA, csrSortedColIndA, B, ldb,
-                  info, policy, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsrsm2_bufferSizeExt(
-    cusparseHandle_t handle, int algo, cusparseOperation_t transA,
-    cusparseOperation_t transB, int m, int nrhs, int nnz,
-    const cuComplex *alpha, const cusparseMatDescr_t descrA,
-    const cuComplex *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, const cuComplex *B, int ldb, csrsm2Info_t info,
-    cusparseSolvePolicy_t policy, size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, cusparseOperation_t, cusparseOperation_t, int, int,
-      int, const cuComplex *, const cusparseMatDescr_t, const cuComplex *,
-      const int *, const int *, const cuComplex *, int, csrsm2Info_t,
-      cusparseSolvePolicy_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsrsm2_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, algo, transA, transB, m, nrhs, nnz, alpha, descrA,
-                  csrSortedValA, csrSortedRowPtrA, csrSortedColIndA, B, ldb,
-                  info, policy, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsrsm2_bufferSizeExt(
-    cusparseHandle_t handle, int algo, cusparseOperation_t transA,
-    cusparseOperation_t transB, int m, int nrhs, int nnz,
-    const cuDoubleComplex *alpha, const cusparseMatDescr_t descrA,
-    const cuDoubleComplex *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, const cuDoubleComplex *B, int ldb,
-    csrsm2Info_t info, cusparseSolvePolicy_t policy, size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, cusparseOperation_t, cusparseOperation_t, int, int,
-      int, const cuDoubleComplex *, const cusparseMatDescr_t,
-      const cuDoubleComplex *, const int *, const int *,
-      const cuDoubleComplex *, int, csrsm2Info_t, cusparseSolvePolicy_t,
-      size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsrsm2_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, algo, transA, transB, m, nrhs, nnz, alpha, descrA,
-                  csrSortedValA, csrSortedRowPtrA, csrSortedColIndA, B, ldb,
-                  info, policy, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsrsm2_analysis(
-    cusparseHandle_t handle, int algo, cusparseOperation_t transA,
-    cusparseOperation_t transB, int m, int nrhs, int nnz, const float *alpha,
-    const cusparseMatDescr_t descrA, const float *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA, const float *B,
-    int ldb, csrsm2Info_t info, cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, cusparseOperation_t, cusparseOperation_t, int, int,
-      int, const float *, const cusparseMatDescr_t, const float *, const int *,
-      const int *, const float *, int, csrsm2Info_t, cusparseSolvePolicy_t,
-      void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsrsm2_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, algo, transA, transB, m, nrhs, nnz, alpha, descrA,
-                  csrSortedValA, csrSortedRowPtrA, csrSortedColIndA, B, ldb,
-                  info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsrsm2_analysis(
-    cusparseHandle_t handle, int algo, cusparseOperation_t transA,
-    cusparseOperation_t transB, int m, int nrhs, int nnz, const double *alpha,
-    const cusparseMatDescr_t descrA, const double *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA, const double *B,
-    int ldb, csrsm2Info_t info, cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, cusparseOperation_t, cusparseOperation_t, int, int,
-      int, const double *, const cusparseMatDescr_t, const double *,
-      const int *, const int *, const double *, int, csrsm2Info_t,
-      cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsrsm2_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, algo, transA, transB, m, nrhs, nnz, alpha, descrA,
-                  csrSortedValA, csrSortedRowPtrA, csrSortedColIndA, B, ldb,
-                  info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsrsm2_analysis(
-    cusparseHandle_t handle, int algo, cusparseOperation_t transA,
-    cusparseOperation_t transB, int m, int nrhs, int nnz,
-    const cuComplex *alpha, const cusparseMatDescr_t descrA,
-    const cuComplex *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, const cuComplex *B, int ldb, csrsm2Info_t info,
-    cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, cusparseOperation_t, cusparseOperation_t, int, int,
-      int, const cuComplex *, const cusparseMatDescr_t, const cuComplex *,
-      const int *, const int *, const cuComplex *, int, csrsm2Info_t,
-      cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsrsm2_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, algo, transA, transB, m, nrhs, nnz, alpha, descrA,
-                  csrSortedValA, csrSortedRowPtrA, csrSortedColIndA, B, ldb,
-                  info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsrsm2_analysis(
-    cusparseHandle_t handle, int algo, cusparseOperation_t transA,
-    cusparseOperation_t transB, int m, int nrhs, int nnz,
-    const cuDoubleComplex *alpha, const cusparseMatDescr_t descrA,
-    const cuDoubleComplex *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, const cuDoubleComplex *B, int ldb,
-    csrsm2Info_t info, cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, cusparseOperation_t, cusparseOperation_t, int, int,
-      int, const cuDoubleComplex *, const cusparseMatDescr_t,
-      const cuDoubleComplex *, const int *, const int *,
-      const cuDoubleComplex *, int, csrsm2Info_t, cusparseSolvePolicy_t,
-      void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsrsm2_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, algo, transA, transB, m, nrhs, nnz, alpha, descrA,
-                  csrSortedValA, csrSortedRowPtrA, csrSortedColIndA, B, ldb,
-                  info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsrsm2_solve(
-    cusparseHandle_t handle, int algo, cusparseOperation_t transA,
-    cusparseOperation_t transB, int m, int nrhs, int nnz, const float *alpha,
-    const cusparseMatDescr_t descrA, const float *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA, float *B, int ldb,
-    csrsm2Info_t info, cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, cusparseOperation_t, cusparseOperation_t, int, int,
-      int, const float *, const cusparseMatDescr_t, const float *, const int *,
-      const int *, float *, int, csrsm2Info_t, cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsrsm2_solve");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, algo, transA, transB, m, nrhs, nnz, alpha, descrA,
-                  csrSortedValA, csrSortedRowPtrA, csrSortedColIndA, B, ldb,
-                  info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsrsm2_solve(
-    cusparseHandle_t handle, int algo, cusparseOperation_t transA,
-    cusparseOperation_t transB, int m, int nrhs, int nnz, const double *alpha,
-    const cusparseMatDescr_t descrA, const double *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA, double *B,
-    int ldb, csrsm2Info_t info, cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, cusparseOperation_t, cusparseOperation_t, int, int,
-      int, const double *, const cusparseMatDescr_t, const double *,
-      const int *, const int *, double *, int, csrsm2Info_t,
-      cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsrsm2_solve");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, algo, transA, transB, m, nrhs, nnz, alpha, descrA,
-                  csrSortedValA, csrSortedRowPtrA, csrSortedColIndA, B, ldb,
-                  info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsrsm2_solve(
-    cusparseHandle_t handle, int algo, cusparseOperation_t transA,
-    cusparseOperation_t transB, int m, int nrhs, int nnz,
-    const cuComplex *alpha, const cusparseMatDescr_t descrA,
-    const cuComplex *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, cuComplex *B, int ldb, csrsm2Info_t info,
-    cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, cusparseOperation_t, cusparseOperation_t, int, int,
-      int, const cuComplex *, const cusparseMatDescr_t, const cuComplex *,
-      const int *, const int *, cuComplex *, int, csrsm2Info_t,
-      cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsrsm2_solve");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, algo, transA, transB, m, nrhs, nnz, alpha, descrA,
-                  csrSortedValA, csrSortedRowPtrA, csrSortedColIndA, B, ldb,
-                  info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsrsm2_solve(
-    cusparseHandle_t handle, int algo, cusparseOperation_t transA,
-    cusparseOperation_t transB, int m, int nrhs, int nnz,
-    const cuDoubleComplex *alpha, const cusparseMatDescr_t descrA,
-    const cuDoubleComplex *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, cuDoubleComplex *B, int ldb, csrsm2Info_t info,
-    cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, cusparseOperation_t, cusparseOperation_t, int, int,
-      int, const cuDoubleComplex *, const cusparseMatDescr_t,
-      const cuDoubleComplex *, const int *, const int *, cuDoubleComplex *, int,
-      csrsm2Info_t, cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsrsm2_solve");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, algo, transA, transB, m, nrhs, nnz, alpha, descrA,
-                  csrSortedValA, csrSortedRowPtrA, csrSortedColIndA, B, ldb,
-                  info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseXbsrsm2_zeroPivot(cusparseHandle_t handle,
-                                                       bsrsm2Info_t info,
-                                                       int *position) {
-  using FuncPtr =
-      cusparseStatus_t(CUSPARSEAPI *)(cusparseHandle_t, bsrsm2Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseXbsrsm2_zeroPivot");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, info, position);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSbsrsm2_bufferSize(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, cusparseOperation_t transXY, int mb, int n,
-    int nnzb, const cusparseMatDescr_t descrA, float *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockSize,
-    bsrsm2Info_t info, int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t,
-      cusparseOperation_t, int, int, int, const cusparseMatDescr_t, float *,
-      const int *, const int *, int, bsrsm2Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSbsrsm2_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, transXY, mb, n, nnzb, descrA,
-                  bsrSortedVal, bsrSortedRowPtr, bsrSortedColInd, blockSize,
-                  info, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDbsrsm2_bufferSize(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, cusparseOperation_t transXY, int mb, int n,
-    int nnzb, const cusparseMatDescr_t descrA, double *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockSize,
-    bsrsm2Info_t info, int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t,
-      cusparseOperation_t, int, int, int, const cusparseMatDescr_t, double *,
-      const int *, const int *, int, bsrsm2Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDbsrsm2_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, transXY, mb, n, nnzb, descrA,
-                  bsrSortedVal, bsrSortedRowPtr, bsrSortedColInd, blockSize,
-                  info, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCbsrsm2_bufferSize(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, cusparseOperation_t transXY, int mb, int n,
-    int nnzb, const cusparseMatDescr_t descrA, cuComplex *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockSize,
-    bsrsm2Info_t info, int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t,
-      cusparseOperation_t, int, int, int, const cusparseMatDescr_t, cuComplex *,
-      const int *, const int *, int, bsrsm2Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCbsrsm2_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, transXY, mb, n, nnzb, descrA,
-                  bsrSortedVal, bsrSortedRowPtr, bsrSortedColInd, blockSize,
-                  info, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZbsrsm2_bufferSize(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, cusparseOperation_t transXY, int mb, int n,
-    int nnzb, const cusparseMatDescr_t descrA, cuDoubleComplex *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockSize,
-    bsrsm2Info_t info, int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t,
-      cusparseOperation_t, int, int, int, const cusparseMatDescr_t,
-      cuDoubleComplex *, const int *, const int *, int, bsrsm2Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZbsrsm2_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, transXY, mb, n, nnzb, descrA,
-                  bsrSortedVal, bsrSortedRowPtr, bsrSortedColInd, blockSize,
-                  info, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSbsrsm2_bufferSizeExt(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, cusparseOperation_t transB, int mb, int n,
-    int nnzb, const cusparseMatDescr_t descrA, float *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockSize,
-    bsrsm2Info_t info, size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t,
-      cusparseOperation_t, int, int, int, const cusparseMatDescr_t, float *,
-      const int *, const int *, int, bsrsm2Info_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSbsrsm2_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, transB, mb, n, nnzb, descrA,
-                  bsrSortedVal, bsrSortedRowPtr, bsrSortedColInd, blockSize,
-                  info, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDbsrsm2_bufferSizeExt(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, cusparseOperation_t transB, int mb, int n,
-    int nnzb, const cusparseMatDescr_t descrA, double *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockSize,
-    bsrsm2Info_t info, size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t,
-      cusparseOperation_t, int, int, int, const cusparseMatDescr_t, double *,
-      const int *, const int *, int, bsrsm2Info_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDbsrsm2_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, transB, mb, n, nnzb, descrA,
-                  bsrSortedVal, bsrSortedRowPtr, bsrSortedColInd, blockSize,
-                  info, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCbsrsm2_bufferSizeExt(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, cusparseOperation_t transB, int mb, int n,
-    int nnzb, const cusparseMatDescr_t descrA, cuComplex *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockSize,
-    bsrsm2Info_t info, size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t,
-      cusparseOperation_t, int, int, int, const cusparseMatDescr_t, cuComplex *,
-      const int *, const int *, int, bsrsm2Info_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCbsrsm2_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, transB, mb, n, nnzb, descrA,
-                  bsrSortedVal, bsrSortedRowPtr, bsrSortedColInd, blockSize,
-                  info, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZbsrsm2_bufferSizeExt(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, cusparseOperation_t transB, int mb, int n,
-    int nnzb, const cusparseMatDescr_t descrA, cuDoubleComplex *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockSize,
-    bsrsm2Info_t info, size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t,
-      cusparseOperation_t, int, int, int, const cusparseMatDescr_t,
-      cuDoubleComplex *, const int *, const int *, int, bsrsm2Info_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZbsrsm2_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, transB, mb, n, nnzb, descrA,
-                  bsrSortedVal, bsrSortedRowPtr, bsrSortedColInd, blockSize,
-                  info, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSbsrsm2_analysis(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, cusparseOperation_t transXY, int mb, int n,
-    int nnzb, const cusparseMatDescr_t descrA, const float *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockSize,
-    bsrsm2Info_t info, cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t,
-      cusparseOperation_t, int, int, int, const cusparseMatDescr_t,
-      const float *, const int *, const int *, int, bsrsm2Info_t,
-      cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSbsrsm2_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, transXY, mb, n, nnzb, descrA,
-                  bsrSortedVal, bsrSortedRowPtr, bsrSortedColInd, blockSize,
-                  info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDbsrsm2_analysis(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, cusparseOperation_t transXY, int mb, int n,
-    int nnzb, const cusparseMatDescr_t descrA, const double *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockSize,
-    bsrsm2Info_t info, cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t,
-      cusparseOperation_t, int, int, int, const cusparseMatDescr_t,
-      const double *, const int *, const int *, int, bsrsm2Info_t,
-      cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDbsrsm2_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, transXY, mb, n, nnzb, descrA,
-                  bsrSortedVal, bsrSortedRowPtr, bsrSortedColInd, blockSize,
-                  info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCbsrsm2_analysis(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, cusparseOperation_t transXY, int mb, int n,
-    int nnzb, const cusparseMatDescr_t descrA, const cuComplex *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockSize,
-    bsrsm2Info_t info, cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t,
-      cusparseOperation_t, int, int, int, const cusparseMatDescr_t,
-      const cuComplex *, const int *, const int *, int, bsrsm2Info_t,
-      cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCbsrsm2_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, transXY, mb, n, nnzb, descrA,
-                  bsrSortedVal, bsrSortedRowPtr, bsrSortedColInd, blockSize,
-                  info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZbsrsm2_analysis(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, cusparseOperation_t transXY, int mb, int n,
-    int nnzb, const cusparseMatDescr_t descrA,
-    const cuDoubleComplex *bsrSortedVal, const int *bsrSortedRowPtr,
-    const int *bsrSortedColInd, int blockSize, bsrsm2Info_t info,
-    cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t,
-      cusparseOperation_t, int, int, int, const cusparseMatDescr_t,
-      const cuDoubleComplex *, const int *, const int *, int, bsrsm2Info_t,
-      cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZbsrsm2_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, transXY, mb, n, nnzb, descrA,
-                  bsrSortedVal, bsrSortedRowPtr, bsrSortedColInd, blockSize,
-                  info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSbsrsm2_solve(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, cusparseOperation_t transXY, int mb, int n,
-    int nnzb, const float *alpha, const cusparseMatDescr_t descrA,
-    const float *bsrSortedVal, const int *bsrSortedRowPtr,
-    const int *bsrSortedColInd, int blockSize, bsrsm2Info_t info,
-    const float *B, int ldb, float *X, int ldx, cusparseSolvePolicy_t policy,
-    void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t,
-      cusparseOperation_t, int, int, int, const float *,
-      const cusparseMatDescr_t, const float *, const int *, const int *, int,
-      bsrsm2Info_t, const float *, int, float *, int, cusparseSolvePolicy_t,
-      void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSbsrsm2_solve");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, transXY, mb, n, nnzb, alpha, descrA,
-                  bsrSortedVal, bsrSortedRowPtr, bsrSortedColInd, blockSize,
-                  info, B, ldb, X, ldx, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDbsrsm2_solve(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, cusparseOperation_t transXY, int mb, int n,
-    int nnzb, const double *alpha, const cusparseMatDescr_t descrA,
-    const double *bsrSortedVal, const int *bsrSortedRowPtr,
-    const int *bsrSortedColInd, int blockSize, bsrsm2Info_t info,
-    const double *B, int ldb, double *X, int ldx, cusparseSolvePolicy_t policy,
-    void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t,
-      cusparseOperation_t, int, int, int, const double *,
-      const cusparseMatDescr_t, const double *, const int *, const int *, int,
-      bsrsm2Info_t, const double *, int, double *, int, cusparseSolvePolicy_t,
-      void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDbsrsm2_solve");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, transXY, mb, n, nnzb, alpha, descrA,
-                  bsrSortedVal, bsrSortedRowPtr, bsrSortedColInd, blockSize,
-                  info, B, ldb, X, ldx, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCbsrsm2_solve(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, cusparseOperation_t transXY, int mb, int n,
-    int nnzb, const cuComplex *alpha, const cusparseMatDescr_t descrA,
-    const cuComplex *bsrSortedVal, const int *bsrSortedRowPtr,
-    const int *bsrSortedColInd, int blockSize, bsrsm2Info_t info,
-    const cuComplex *B, int ldb, cuComplex *X, int ldx,
-    cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t,
-      cusparseOperation_t, int, int, int, const cuComplex *,
-      const cusparseMatDescr_t, const cuComplex *, const int *, const int *,
-      int, bsrsm2Info_t, const cuComplex *, int, cuComplex *, int,
-      cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCbsrsm2_solve");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, transXY, mb, n, nnzb, alpha, descrA,
-                  bsrSortedVal, bsrSortedRowPtr, bsrSortedColInd, blockSize,
-                  info, B, ldb, X, ldx, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZbsrsm2_solve(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, cusparseOperation_t transXY, int mb, int n,
-    int nnzb, const cuDoubleComplex *alpha, const cusparseMatDescr_t descrA,
-    const cuDoubleComplex *bsrSortedVal, const int *bsrSortedRowPtr,
-    const int *bsrSortedColInd, int blockSize, bsrsm2Info_t info,
-    const cuDoubleComplex *B, int ldb, cuDoubleComplex *X, int ldx,
-    cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t,
-      cusparseOperation_t, int, int, int, const cuDoubleComplex *,
-      const cusparseMatDescr_t, const cuDoubleComplex *, const int *,
-      const int *, int, bsrsm2Info_t, const cuDoubleComplex *, int,
-      cuDoubleComplex *, int, cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZbsrsm2_solve");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, transXY, mb, n, nnzb, alpha, descrA,
-                  bsrSortedVal, bsrSortedRowPtr, bsrSortedColInd, blockSize,
-                  info, B, ldb, X, ldx, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCsrilu0Ex(
-    cusparseHandle_t handle, cusparseOperation_t trans, int m,
-    const cusparseMatDescr_t descrA, void *csrSortedValA_ValM,
-    cudaDataType csrSortedValA_ValMtype, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, cusparseSolveAnalysisInfo_t info,
-    cudaDataType executiontype) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, const cusparseMatDescr_t,
-      void *, cudaDataType, const int *, const int *,
-      cusparseSolveAnalysisInfo_t, cudaDataType);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCsrilu0Ex");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, trans, m, descrA, csrSortedValA_ValM,
-                  csrSortedValA_ValMtype, csrSortedRowPtrA, csrSortedColIndA,
-                  info, executiontype);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseScsrilu0(cusparseHandle_t handle, cusparseOperation_t trans, int m,
-                 const cusparseMatDescr_t descrA, float *csrSortedValA_ValM,
-                 const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-                 cusparseSolveAnalysisInfo_t info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, const cusparseMatDescr_t,
-      float *, const int *, const int *, cusparseSolveAnalysisInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsrilu0");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, trans, m, descrA, csrSortedValA_ValM,
-                  csrSortedRowPtrA, csrSortedColIndA, info);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseDcsrilu0(cusparseHandle_t handle, cusparseOperation_t trans, int m,
-                 const cusparseMatDescr_t descrA, double *csrSortedValA_ValM,
-                 const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-                 cusparseSolveAnalysisInfo_t info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, const cusparseMatDescr_t,
-      double *, const int *, const int *, cusparseSolveAnalysisInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsrilu0");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, trans, m, descrA, csrSortedValA_ValM,
-                  csrSortedRowPtrA, csrSortedColIndA, info);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseCcsrilu0(cusparseHandle_t handle, cusparseOperation_t trans, int m,
-                 const cusparseMatDescr_t descrA, cuComplex *csrSortedValA_ValM,
-                 const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-                 cusparseSolveAnalysisInfo_t info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, const cusparseMatDescr_t,
-      cuComplex *, const int *, const int *, cusparseSolveAnalysisInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsrilu0");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, trans, m, descrA, csrSortedValA_ValM,
-                  csrSortedRowPtrA, csrSortedColIndA, info);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsrilu0(
-    cusparseHandle_t handle, cusparseOperation_t trans, int m,
-    const cusparseMatDescr_t descrA, cuDoubleComplex *csrSortedValA_ValM,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-    cusparseSolveAnalysisInfo_t info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, const cusparseMatDescr_t,
-      cuDoubleComplex *, const int *, const int *, cusparseSolveAnalysisInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsrilu0");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, trans, m, descrA, csrSortedValA_ValM,
-                  csrSortedRowPtrA, csrSortedColIndA, info);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsrilu02_numericBoost(
-    cusparseHandle_t handle, csrilu02Info_t info, int enable_boost, double *tol,
-    float *boost_val) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, csrilu02Info_t, int, double *, float *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsrilu02_numericBoost");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, info, enable_boost, tol, boost_val);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsrilu02_numericBoost(
-    cusparseHandle_t handle, csrilu02Info_t info, int enable_boost, double *tol,
-    double *boost_val) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, csrilu02Info_t, int, double *, double *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsrilu02_numericBoost");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, info, enable_boost, tol, boost_val);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsrilu02_numericBoost(
-    cusparseHandle_t handle, csrilu02Info_t info, int enable_boost, double *tol,
-    cuComplex *boost_val) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, csrilu02Info_t, int, double *, cuComplex *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsrilu02_numericBoost");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, info, enable_boost, tol, boost_val);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsrilu02_numericBoost(
-    cusparseHandle_t handle, csrilu02Info_t info, int enable_boost, double *tol,
-    cuDoubleComplex *boost_val) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, csrilu02Info_t, int, double *, cuDoubleComplex *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsrilu02_numericBoost");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, info, enable_boost, tol, boost_val);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseXcsrilu02_zeroPivot(
-    cusparseHandle_t handle, csrilu02Info_t info, int *position) {
-  using FuncPtr =
-      cusparseStatus_t(CUSPARSEAPI *)(cusparseHandle_t, csrilu02Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseXcsrilu02_zeroPivot");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, info, position);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsrilu02_bufferSize(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    float *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, csrilu02Info_t info, int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, float *,
-      const int *, const int *, csrilu02Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsrilu02_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, info, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsrilu02_bufferSize(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    double *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, csrilu02Info_t info, int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, double *,
-      const int *, const int *, csrilu02Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsrilu02_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, info, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsrilu02_bufferSize(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    cuComplex *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, csrilu02Info_t info, int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, cuComplex *,
-      const int *, const int *, csrilu02Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsrilu02_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, info, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsrilu02_bufferSize(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    cuDoubleComplex *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, csrilu02Info_t info, int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, cuDoubleComplex *,
-      const int *, const int *, csrilu02Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsrilu02_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, info, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsrilu02_bufferSizeExt(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    float *csrSortedVal, const int *csrSortedRowPtr, const int *csrSortedColInd,
-    csrilu02Info_t info, size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, float *,
-      const int *, const int *, csrilu02Info_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsrilu02_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedVal, csrSortedRowPtr,
-                  csrSortedColInd, info, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsrilu02_bufferSizeExt(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    double *csrSortedVal, const int *csrSortedRowPtr,
-    const int *csrSortedColInd, csrilu02Info_t info, size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, double *,
-      const int *, const int *, csrilu02Info_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsrilu02_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedVal, csrSortedRowPtr,
-                  csrSortedColInd, info, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsrilu02_bufferSizeExt(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    cuComplex *csrSortedVal, const int *csrSortedRowPtr,
-    const int *csrSortedColInd, csrilu02Info_t info, size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, cuComplex *,
-      const int *, const int *, csrilu02Info_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsrilu02_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedVal, csrSortedRowPtr,
-                  csrSortedColInd, info, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsrilu02_bufferSizeExt(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    cuDoubleComplex *csrSortedVal, const int *csrSortedRowPtr,
-    const int *csrSortedColInd, csrilu02Info_t info, size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, cuDoubleComplex *,
-      const int *, const int *, csrilu02Info_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsrilu02_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedVal, csrSortedRowPtr,
-                  csrSortedColInd, info, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsrilu02_analysis(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    const float *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, csrilu02Info_t info,
-    cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, const float *,
-      const int *, const int *, csrilu02Info_t, cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsrilu02_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsrilu02_analysis(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    const double *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, csrilu02Info_t info,
-    cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, const double *,
-      const int *, const int *, csrilu02Info_t, cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsrilu02_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsrilu02_analysis(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    const cuComplex *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, csrilu02Info_t info,
-    cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, const cuComplex *,
-      const int *, const int *, csrilu02Info_t, cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsrilu02_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsrilu02_analysis(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    const cuDoubleComplex *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, csrilu02Info_t info,
-    cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t,
-      const cuDoubleComplex *, const int *, const int *, csrilu02Info_t,
-      cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsrilu02_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsrilu02(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    float *csrSortedValA_valM, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, csrilu02Info_t info,
-    cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, float *,
-      const int *, const int *, csrilu02Info_t, cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsrilu02");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedValA_valM, csrSortedRowPtrA,
-                  csrSortedColIndA, info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsrilu02(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    double *csrSortedValA_valM, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, csrilu02Info_t info,
-    cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, double *,
-      const int *, const int *, csrilu02Info_t, cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsrilu02");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedValA_valM, csrSortedRowPtrA,
-                  csrSortedColIndA, info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsrilu02(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    cuComplex *csrSortedValA_valM, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, csrilu02Info_t info,
-    cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, cuComplex *,
-      const int *, const int *, csrilu02Info_t, cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsrilu02");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedValA_valM, csrSortedRowPtrA,
-                  csrSortedColIndA, info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsrilu02(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    cuDoubleComplex *csrSortedValA_valM, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, csrilu02Info_t info,
-    cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, cuDoubleComplex *,
-      const int *, const int *, csrilu02Info_t, cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsrilu02");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedValA_valM, csrSortedRowPtrA,
-                  csrSortedColIndA, info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSbsrilu02_numericBoost(
-    cusparseHandle_t handle, bsrilu02Info_t info, int enable_boost, double *tol,
-    float *boost_val) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, bsrilu02Info_t, int, double *, float *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSbsrilu02_numericBoost");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, info, enable_boost, tol, boost_val);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDbsrilu02_numericBoost(
-    cusparseHandle_t handle, bsrilu02Info_t info, int enable_boost, double *tol,
-    double *boost_val) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, bsrilu02Info_t, int, double *, double *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDbsrilu02_numericBoost");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, info, enable_boost, tol, boost_val);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCbsrilu02_numericBoost(
-    cusparseHandle_t handle, bsrilu02Info_t info, int enable_boost, double *tol,
-    cuComplex *boost_val) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, bsrilu02Info_t, int, double *, cuComplex *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCbsrilu02_numericBoost");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, info, enable_boost, tol, boost_val);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZbsrilu02_numericBoost(
-    cusparseHandle_t handle, bsrilu02Info_t info, int enable_boost, double *tol,
-    cuDoubleComplex *boost_val) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, bsrilu02Info_t, int, double *, cuDoubleComplex *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZbsrilu02_numericBoost");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, info, enable_boost, tol, boost_val);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseXbsrilu02_zeroPivot(
-    cusparseHandle_t handle, bsrilu02Info_t info, int *position) {
-  using FuncPtr =
-      cusparseStatus_t(CUSPARSEAPI *)(cusparseHandle_t, bsrilu02Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseXbsrilu02_zeroPivot");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, info, position);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSbsrilu02_bufferSize(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, float *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockDim,
-    bsrilu02Info_t info, int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      float *, const int *, const int *, int, bsrilu02Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSbsrilu02_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, blockDim, info, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDbsrilu02_bufferSize(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, double *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockDim,
-    bsrilu02Info_t info, int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      double *, const int *, const int *, int, bsrilu02Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDbsrilu02_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, blockDim, info, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCbsrilu02_bufferSize(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, cuComplex *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockDim,
-    bsrilu02Info_t info, int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      cuComplex *, const int *, const int *, int, bsrilu02Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCbsrilu02_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, blockDim, info, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZbsrilu02_bufferSize(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, cuDoubleComplex *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockDim,
-    bsrilu02Info_t info, int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      cuDoubleComplex *, const int *, const int *, int, bsrilu02Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZbsrilu02_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, blockDim, info, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSbsrilu02_bufferSizeExt(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, float *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockSize,
-    bsrilu02Info_t info, size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      float *, const int *, const int *, int, bsrilu02Info_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSbsrilu02_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, blockSize, info, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDbsrilu02_bufferSizeExt(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, double *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockSize,
-    bsrilu02Info_t info, size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      double *, const int *, const int *, int, bsrilu02Info_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDbsrilu02_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, blockSize, info, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCbsrilu02_bufferSizeExt(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, cuComplex *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockSize,
-    bsrilu02Info_t info, size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      cuComplex *, const int *, const int *, int, bsrilu02Info_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCbsrilu02_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, blockSize, info, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZbsrilu02_bufferSizeExt(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, cuDoubleComplex *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockSize,
-    bsrilu02Info_t info, size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      cuDoubleComplex *, const int *, const int *, int, bsrilu02Info_t,
-      size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZbsrilu02_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, blockSize, info, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSbsrilu02_analysis(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, float *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockDim,
-    bsrilu02Info_t info, cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      float *, const int *, const int *, int, bsrilu02Info_t,
-      cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSbsrilu02_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, blockDim, info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDbsrilu02_analysis(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, double *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockDim,
-    bsrilu02Info_t info, cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      double *, const int *, const int *, int, bsrilu02Info_t,
-      cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDbsrilu02_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, blockDim, info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCbsrilu02_analysis(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, cuComplex *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockDim,
-    bsrilu02Info_t info, cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      cuComplex *, const int *, const int *, int, bsrilu02Info_t,
-      cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCbsrilu02_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, blockDim, info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZbsrilu02_analysis(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, cuDoubleComplex *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockDim,
-    bsrilu02Info_t info, cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      cuDoubleComplex *, const int *, const int *, int, bsrilu02Info_t,
-      cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZbsrilu02_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, blockDim, info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSbsrilu02(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, float *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockDim,
-    bsrilu02Info_t info, cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      float *, const int *, const int *, int, bsrilu02Info_t,
-      cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSbsrilu02");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, blockDim, info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDbsrilu02(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, double *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockDim,
-    bsrilu02Info_t info, cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      double *, const int *, const int *, int, bsrilu02Info_t,
-      cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDbsrilu02");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, blockDim, info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCbsrilu02(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, cuComplex *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockDim,
-    bsrilu02Info_t info, cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      cuComplex *, const int *, const int *, int, bsrilu02Info_t,
-      cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCbsrilu02");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, blockDim, info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZbsrilu02(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, cuDoubleComplex *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockDim,
-    bsrilu02Info_t info, cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      cuDoubleComplex *, const int *, const int *, int, bsrilu02Info_t,
-      cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZbsrilu02");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, blockDim, info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsric0(cusparseHandle_t handle,
-                                             cusparseOperation_t trans, int m,
-                                             const cusparseMatDescr_t descrA,
-                                             float *csrSortedValA_ValM,
-                                             const int *csrSortedRowPtrA,
-                                             const int *csrSortedColIndA,
-                                             cusparseSolveAnalysisInfo_t info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, const cusparseMatDescr_t,
-      float *, const int *, const int *, cusparseSolveAnalysisInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsric0");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, trans, m, descrA, csrSortedValA_ValM,
-                  csrSortedRowPtrA, csrSortedColIndA, info);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsric0(cusparseHandle_t handle,
-                                             cusparseOperation_t trans, int m,
-                                             const cusparseMatDescr_t descrA,
-                                             double *csrSortedValA_ValM,
-                                             const int *csrSortedRowPtrA,
-                                             const int *csrSortedColIndA,
-                                             cusparseSolveAnalysisInfo_t info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, const cusparseMatDescr_t,
-      double *, const int *, const int *, cusparseSolveAnalysisInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsric0");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, trans, m, descrA, csrSortedValA_ValM,
-                  csrSortedRowPtrA, csrSortedColIndA, info);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsric0(cusparseHandle_t handle,
-                                             cusparseOperation_t trans, int m,
-                                             const cusparseMatDescr_t descrA,
-                                             cuComplex *csrSortedValA_ValM,
-                                             const int *csrSortedRowPtrA,
-                                             const int *csrSortedColIndA,
-                                             cusparseSolveAnalysisInfo_t info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, const cusparseMatDescr_t,
-      cuComplex *, const int *, const int *, cusparseSolveAnalysisInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsric0");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, trans, m, descrA, csrSortedValA_ValM,
-                  csrSortedRowPtrA, csrSortedColIndA, info);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsric0(
-    cusparseHandle_t handle, cusparseOperation_t trans, int m,
-    const cusparseMatDescr_t descrA, cuDoubleComplex *csrSortedValA_ValM,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-    cusparseSolveAnalysisInfo_t info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, const cusparseMatDescr_t,
-      cuDoubleComplex *, const int *, const int *, cusparseSolveAnalysisInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsric0");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, trans, m, descrA, csrSortedValA_ValM,
-                  csrSortedRowPtrA, csrSortedColIndA, info);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseXcsric02_zeroPivot(cusparseHandle_t handle,
-                                                        csric02Info_t info,
-                                                        int *position) {
-  using FuncPtr =
-      cusparseStatus_t(CUSPARSEAPI *)(cusparseHandle_t, csric02Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseXcsric02_zeroPivot");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, info, position);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsric02_bufferSize(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    float *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, csric02Info_t info, int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, float *,
-      const int *, const int *, csric02Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsric02_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, info, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsric02_bufferSize(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    double *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, csric02Info_t info, int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, double *,
-      const int *, const int *, csric02Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsric02_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, info, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsric02_bufferSize(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    cuComplex *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, csric02Info_t info, int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, cuComplex *,
-      const int *, const int *, csric02Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsric02_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, info, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsric02_bufferSize(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    cuDoubleComplex *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, csric02Info_t info, int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, cuDoubleComplex *,
-      const int *, const int *, csric02Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsric02_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, info, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsric02_bufferSizeExt(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    float *csrSortedVal, const int *csrSortedRowPtr, const int *csrSortedColInd,
-    csric02Info_t info, size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, float *,
-      const int *, const int *, csric02Info_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsric02_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedVal, csrSortedRowPtr,
-                  csrSortedColInd, info, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsric02_bufferSizeExt(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    double *csrSortedVal, const int *csrSortedRowPtr,
-    const int *csrSortedColInd, csric02Info_t info, size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, double *,
-      const int *, const int *, csric02Info_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsric02_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedVal, csrSortedRowPtr,
-                  csrSortedColInd, info, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsric02_bufferSizeExt(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    cuComplex *csrSortedVal, const int *csrSortedRowPtr,
-    const int *csrSortedColInd, csric02Info_t info, size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, cuComplex *,
-      const int *, const int *, csric02Info_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsric02_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedVal, csrSortedRowPtr,
-                  csrSortedColInd, info, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsric02_bufferSizeExt(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    cuDoubleComplex *csrSortedVal, const int *csrSortedRowPtr,
-    const int *csrSortedColInd, csric02Info_t info, size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, cuDoubleComplex *,
-      const int *, const int *, csric02Info_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsric02_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedVal, csrSortedRowPtr,
-                  csrSortedColInd, info, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsric02_analysis(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    const float *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, csric02Info_t info,
-    cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, const float *,
-      const int *, const int *, csric02Info_t, cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsric02_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsric02_analysis(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    const double *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, csric02Info_t info,
-    cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, const double *,
-      const int *, const int *, csric02Info_t, cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsric02_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsric02_analysis(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    const cuComplex *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, csric02Info_t info,
-    cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, const cuComplex *,
-      const int *, const int *, csric02Info_t, cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsric02_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsric02_analysis(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    const cuDoubleComplex *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, csric02Info_t info,
-    cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t,
-      const cuDoubleComplex *, const int *, const int *, csric02Info_t,
-      cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsric02_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsric02(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    float *csrSortedValA_valM, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, csric02Info_t info,
-    cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, float *,
-      const int *, const int *, csric02Info_t, cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsric02");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedValA_valM, csrSortedRowPtrA,
-                  csrSortedColIndA, info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsric02(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    double *csrSortedValA_valM, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, csric02Info_t info,
-    cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, double *,
-      const int *, const int *, csric02Info_t, cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsric02");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedValA_valM, csrSortedRowPtrA,
-                  csrSortedColIndA, info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsric02(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    cuComplex *csrSortedValA_valM, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, csric02Info_t info,
-    cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, cuComplex *,
-      const int *, const int *, csric02Info_t, cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsric02");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedValA_valM, csrSortedRowPtrA,
-                  csrSortedColIndA, info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsric02(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    cuDoubleComplex *csrSortedValA_valM, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, csric02Info_t info,
-    cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, cuDoubleComplex *,
-      const int *, const int *, csric02Info_t, cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsric02");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedValA_valM, csrSortedRowPtrA,
-                  csrSortedColIndA, info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseXbsric02_zeroPivot(cusparseHandle_t handle,
-                                                        bsric02Info_t info,
-                                                        int *position) {
-  using FuncPtr =
-      cusparseStatus_t(CUSPARSEAPI *)(cusparseHandle_t, bsric02Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseXbsric02_zeroPivot");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, info, position);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSbsric02_bufferSize(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, float *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockDim,
-    bsric02Info_t info, int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      float *, const int *, const int *, int, bsric02Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSbsric02_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, blockDim, info, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDbsric02_bufferSize(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, double *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockDim,
-    bsric02Info_t info, int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      double *, const int *, const int *, int, bsric02Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDbsric02_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, blockDim, info, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCbsric02_bufferSize(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, cuComplex *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockDim,
-    bsric02Info_t info, int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      cuComplex *, const int *, const int *, int, bsric02Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCbsric02_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, blockDim, info, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZbsric02_bufferSize(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, cuDoubleComplex *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockDim,
-    bsric02Info_t info, int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      cuDoubleComplex *, const int *, const int *, int, bsric02Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZbsric02_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, blockDim, info, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSbsric02_bufferSizeExt(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, float *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockSize,
-    bsric02Info_t info, size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      float *, const int *, const int *, int, bsric02Info_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSbsric02_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, blockSize, info, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDbsric02_bufferSizeExt(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, double *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockSize,
-    bsric02Info_t info, size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      double *, const int *, const int *, int, bsric02Info_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDbsric02_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, blockSize, info, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCbsric02_bufferSizeExt(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, cuComplex *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockSize,
-    bsric02Info_t info, size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      cuComplex *, const int *, const int *, int, bsric02Info_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCbsric02_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, blockSize, info, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZbsric02_bufferSizeExt(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, cuDoubleComplex *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockSize,
-    bsric02Info_t info, size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      cuDoubleComplex *, const int *, const int *, int, bsric02Info_t,
-      size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZbsric02_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, blockSize, info, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSbsric02_analysis(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, const float *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockDim,
-    bsric02Info_t info, cusparseSolvePolicy_t policy, void *pInputBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const float *, const int *, const int *, int, bsric02Info_t,
-      cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSbsric02_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, blockDim, info, policy, pInputBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDbsric02_analysis(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, const double *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockDim,
-    bsric02Info_t info, cusparseSolvePolicy_t policy, void *pInputBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const double *, const int *, const int *, int, bsric02Info_t,
-      cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDbsric02_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, blockDim, info, policy, pInputBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCbsric02_analysis(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, const cuComplex *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockDim,
-    bsric02Info_t info, cusparseSolvePolicy_t policy, void *pInputBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const cuComplex *, const int *, const int *, int, bsric02Info_t,
-      cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCbsric02_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, blockDim, info, policy, pInputBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZbsric02_analysis(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, const cuDoubleComplex *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockDim,
-    bsric02Info_t info, cusparseSolvePolicy_t policy, void *pInputBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const cuDoubleComplex *, const int *, const int *, int, bsric02Info_t,
-      cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZbsric02_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, blockDim, info, policy, pInputBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSbsric02(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, float *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockDim,
-    bsric02Info_t info, cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      float *, const int *, const int *, int, bsric02Info_t,
-      cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSbsric02");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, blockDim, info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDbsric02(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, double *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockDim,
-    bsric02Info_t info, cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      double *, const int *, const int *, int, bsric02Info_t,
-      cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDbsric02");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, blockDim, info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCbsric02(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, cuComplex *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockDim,
-    bsric02Info_t info, cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      cuComplex *, const int *, const int *, int, bsric02Info_t,
-      cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCbsric02");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, blockDim, info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZbsric02(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, cuDoubleComplex *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockDim,
-    bsric02Info_t info, cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      cuDoubleComplex *, const int *, const int *, int, bsric02Info_t,
-      cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZbsric02");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, blockDim, info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSgtsv(cusparseHandle_t handle, int m,
-                                           int n, const float *dl,
-                                           const float *d, const float *du,
-                                           float *B, int ldb) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(cusparseHandle_t, int, int,
-                                                  const float *, const float *,
-                                                  const float *, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSgtsv");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, dl, d, du, B, ldb);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDgtsv(cusparseHandle_t handle, int m,
-                                           int n, const double *dl,
-                                           const double *d, const double *du,
-                                           double *B, int ldb) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const double *, const double *,
-      const double *, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDgtsv");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, dl, d, du, B, ldb);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCgtsv(cusparseHandle_t handle, int m,
-                                           int n, const cuComplex *dl,
-                                           const cuComplex *d,
-                                           const cuComplex *du, cuComplex *B,
-                                           int ldb) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cuComplex *, const cuComplex *,
-      const cuComplex *, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCgtsv");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, dl, d, du, B, ldb);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZgtsv(cusparseHandle_t handle, int m,
-                                           int n, const cuDoubleComplex *dl,
-                                           const cuDoubleComplex *d,
-                                           const cuDoubleComplex *du,
-                                           cuDoubleComplex *B, int ldb) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cuDoubleComplex *,
-      const cuDoubleComplex *, const cuDoubleComplex *, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZgtsv");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, dl, d, du, B, ldb);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSgtsv2_bufferSizeExt(
-    cusparseHandle_t handle, int m, int n, const float *dl, const float *d,
-    const float *du, const float *B, int ldb, size_t *bufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const float *, const float *, const float *,
-      const float *, int, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSgtsv2_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, dl, d, du, B, ldb, bufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDgtsv2_bufferSizeExt(
-    cusparseHandle_t handle, int m, int n, const double *dl, const double *d,
-    const double *du, const double *B, int ldb, size_t *bufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const double *, const double *,
-      const double *, const double *, int, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDgtsv2_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, dl, d, du, B, ldb, bufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCgtsv2_bufferSizeExt(
-    cusparseHandle_t handle, int m, int n, const cuComplex *dl,
-    const cuComplex *d, const cuComplex *du, const cuComplex *B, int ldb,
-    size_t *bufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cuComplex *, const cuComplex *,
-      const cuComplex *, const cuComplex *, int, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCgtsv2_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, dl, d, du, B, ldb, bufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZgtsv2_bufferSizeExt(
-    cusparseHandle_t handle, int m, int n, const cuDoubleComplex *dl,
-    const cuDoubleComplex *d, const cuDoubleComplex *du,
-    const cuDoubleComplex *B, int ldb, size_t *bufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cuDoubleComplex *,
-      const cuDoubleComplex *, const cuDoubleComplex *, const cuDoubleComplex *,
-      int, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZgtsv2_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, dl, d, du, B, ldb, bufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSgtsv2(cusparseHandle_t handle, int m,
-                                            int n, const float *dl,
-                                            const float *d, const float *du,
-                                            float *B, int ldb, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const float *, const float *, const float *,
-      float *, int, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSgtsv2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, dl, d, du, B, ldb, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDgtsv2(cusparseHandle_t handle, int m,
-                                            int n, const double *dl,
-                                            const double *d, const double *du,
-                                            double *B, int ldb, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const double *, const double *,
-      const double *, double *, int, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDgtsv2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, dl, d, du, B, ldb, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCgtsv2(cusparseHandle_t handle, int m,
-                                            int n, const cuComplex *dl,
-                                            const cuComplex *d,
-                                            const cuComplex *du, cuComplex *B,
-                                            int ldb, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cuComplex *, const cuComplex *,
-      const cuComplex *, cuComplex *, int, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCgtsv2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, dl, d, du, B, ldb, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZgtsv2(cusparseHandle_t handle, int m,
-                                            int n, const cuDoubleComplex *dl,
-                                            const cuDoubleComplex *d,
-                                            const cuDoubleComplex *du,
-                                            cuDoubleComplex *B, int ldb,
-                                            void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cuDoubleComplex *,
-      const cuDoubleComplex *, const cuDoubleComplex *, cuDoubleComplex *, int,
-      void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZgtsv2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, dl, d, du, B, ldb, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseSgtsv_nopivot(cusparseHandle_t handle, int m, int n, const float *dl,
-                      const float *d, const float *du, float *B, int ldb) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(cusparseHandle_t, int, int,
-                                                  const float *, const float *,
-                                                  const float *, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSgtsv_nopivot");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, dl, d, du, B, ldb);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseDgtsv_nopivot(cusparseHandle_t handle, int m, int n, const double *dl,
-                      const double *d, const double *du, double *B, int ldb) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const double *, const double *,
-      const double *, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDgtsv_nopivot");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, dl, d, du, B, ldb);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCgtsv_nopivot(
-    cusparseHandle_t handle, int m, int n, const cuComplex *dl,
-    const cuComplex *d, const cuComplex *du, cuComplex *B, int ldb) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cuComplex *, const cuComplex *,
-      const cuComplex *, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCgtsv_nopivot");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, dl, d, du, B, ldb);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseZgtsv_nopivot(cusparseHandle_t handle, int m, int n,
-                      const cuDoubleComplex *dl, const cuDoubleComplex *d,
-                      const cuDoubleComplex *du, cuDoubleComplex *B, int ldb) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cuDoubleComplex *,
-      const cuDoubleComplex *, const cuDoubleComplex *, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZgtsv_nopivot");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, dl, d, du, B, ldb);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSgtsv2_nopivot_bufferSizeExt(
-    cusparseHandle_t handle, int m, int n, const float *dl, const float *d,
-    const float *du, const float *B, int ldb, size_t *bufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const float *, const float *, const float *,
-      const float *, int, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseSgtsv2_nopivot_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, dl, d, du, B, ldb, bufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDgtsv2_nopivot_bufferSizeExt(
-    cusparseHandle_t handle, int m, int n, const double *dl, const double *d,
-    const double *du, const double *B, int ldb, size_t *bufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const double *, const double *,
-      const double *, const double *, int, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseDgtsv2_nopivot_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, dl, d, du, B, ldb, bufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCgtsv2_nopivot_bufferSizeExt(
-    cusparseHandle_t handle, int m, int n, const cuComplex *dl,
-    const cuComplex *d, const cuComplex *du, const cuComplex *B, int ldb,
-    size_t *bufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cuComplex *, const cuComplex *,
-      const cuComplex *, const cuComplex *, int, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseCgtsv2_nopivot_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, dl, d, du, B, ldb, bufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZgtsv2_nopivot_bufferSizeExt(
-    cusparseHandle_t handle, int m, int n, const cuDoubleComplex *dl,
-    const cuDoubleComplex *d, const cuDoubleComplex *du,
-    const cuDoubleComplex *B, int ldb, size_t *bufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cuDoubleComplex *,
-      const cuDoubleComplex *, const cuDoubleComplex *, const cuDoubleComplex *,
-      int, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseZgtsv2_nopivot_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, dl, d, du, B, ldb, bufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSgtsv2_nopivot(
-    cusparseHandle_t handle, int m, int n, const float *dl, const float *d,
-    const float *du, float *B, int ldb, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const float *, const float *, const float *,
-      float *, int, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSgtsv2_nopivot");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, dl, d, du, B, ldb, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDgtsv2_nopivot(
-    cusparseHandle_t handle, int m, int n, const double *dl, const double *d,
-    const double *du, double *B, int ldb, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const double *, const double *,
-      const double *, double *, int, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDgtsv2_nopivot");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, dl, d, du, B, ldb, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCgtsv2_nopivot(
-    cusparseHandle_t handle, int m, int n, const cuComplex *dl,
-    const cuComplex *d, const cuComplex *du, cuComplex *B, int ldb,
-    void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cuComplex *, const cuComplex *,
-      const cuComplex *, cuComplex *, int, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCgtsv2_nopivot");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, dl, d, du, B, ldb, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZgtsv2_nopivot(
-    cusparseHandle_t handle, int m, int n, const cuDoubleComplex *dl,
-    const cuDoubleComplex *d, const cuDoubleComplex *du, cuDoubleComplex *B,
-    int ldb, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cuDoubleComplex *,
-      const cuDoubleComplex *, const cuDoubleComplex *, cuDoubleComplex *, int,
-      void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZgtsv2_nopivot");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, dl, d, du, B, ldb, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSgtsvStridedBatch(
-    cusparseHandle_t handle, int m, const float *dl, const float *d,
-    const float *du, float *x, int batchCount, int batchStride) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, const float *, const float *, const float *,
-      float *, int, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSgtsvStridedBatch");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, dl, d, du, x, batchCount, batchStride);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDgtsvStridedBatch(
-    cusparseHandle_t handle, int m, const double *dl, const double *d,
-    const double *du, double *x, int batchCount, int batchStride) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, const double *, const double *, const double *,
-      double *, int, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDgtsvStridedBatch");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, dl, d, du, x, batchCount, batchStride);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCgtsvStridedBatch(
-    cusparseHandle_t handle, int m, const cuComplex *dl, const cuComplex *d,
-    const cuComplex *du, cuComplex *x, int batchCount, int batchStride) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, const cuComplex *, const cuComplex *,
-      const cuComplex *, cuComplex *, int, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCgtsvStridedBatch");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, dl, d, du, x, batchCount, batchStride);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZgtsvStridedBatch(
-    cusparseHandle_t handle, int m, const cuDoubleComplex *dl,
-    const cuDoubleComplex *d, const cuDoubleComplex *du, cuDoubleComplex *x,
-    int batchCount, int batchStride) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, const cuDoubleComplex *, const cuDoubleComplex *,
-      const cuDoubleComplex *, cuDoubleComplex *, int, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZgtsvStridedBatch");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, dl, d, du, x, batchCount, batchStride);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSgtsv2StridedBatch_bufferSizeExt(
-    cusparseHandle_t handle, int m, const float *dl, const float *d,
-    const float *du, const float *x, int batchCount, int batchStride,
-    size_t *bufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, const float *, const float *, const float *,
-      const float *, int, int, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseSgtsv2StridedBatch_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, dl, d, du, x, batchCount, batchStride,
-                  bufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDgtsv2StridedBatch_bufferSizeExt(
-    cusparseHandle_t handle, int m, const double *dl, const double *d,
-    const double *du, const double *x, int batchCount, int batchStride,
-    size_t *bufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, const double *, const double *, const double *,
-      const double *, int, int, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseDgtsv2StridedBatch_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, dl, d, du, x, batchCount, batchStride,
-                  bufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCgtsv2StridedBatch_bufferSizeExt(
-    cusparseHandle_t handle, int m, const cuComplex *dl, const cuComplex *d,
-    const cuComplex *du, const cuComplex *x, int batchCount, int batchStride,
-    size_t *bufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, const cuComplex *, const cuComplex *,
-      const cuComplex *, const cuComplex *, int, int, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseCgtsv2StridedBatch_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, dl, d, du, x, batchCount, batchStride,
-                  bufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZgtsv2StridedBatch_bufferSizeExt(
-    cusparseHandle_t handle, int m, const cuDoubleComplex *dl,
-    const cuDoubleComplex *d, const cuDoubleComplex *du,
-    const cuDoubleComplex *x, int batchCount, int batchStride,
-    size_t *bufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, const cuDoubleComplex *, const cuDoubleComplex *,
-      const cuDoubleComplex *, const cuDoubleComplex *, int, int, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseZgtsv2StridedBatch_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, dl, d, du, x, batchCount, batchStride,
-                  bufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSgtsv2StridedBatch(
-    cusparseHandle_t handle, int m, const float *dl, const float *d,
-    const float *du, float *x, int batchCount, int batchStride, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, const float *, const float *, const float *,
-      float *, int, int, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSgtsv2StridedBatch");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, dl, d, du, x, batchCount, batchStride, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseDgtsv2StridedBatch(cusparseHandle_t handle, int m, const double *dl,
-                           const double *d, const double *du, double *x,
-                           int batchCount, int batchStride, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, const double *, const double *, const double *,
-      double *, int, int, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDgtsv2StridedBatch");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, dl, d, du, x, batchCount, batchStride, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCgtsv2StridedBatch(
-    cusparseHandle_t handle, int m, const cuComplex *dl, const cuComplex *d,
-    const cuComplex *du, cuComplex *x, int batchCount, int batchStride,
-    void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, const cuComplex *, const cuComplex *,
-      const cuComplex *, cuComplex *, int, int, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCgtsv2StridedBatch");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, dl, d, du, x, batchCount, batchStride, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZgtsv2StridedBatch(
-    cusparseHandle_t handle, int m, const cuDoubleComplex *dl,
-    const cuDoubleComplex *d, const cuDoubleComplex *du, cuDoubleComplex *x,
-    int batchCount, int batchStride, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, const cuDoubleComplex *, const cuDoubleComplex *,
-      const cuDoubleComplex *, cuDoubleComplex *, int, int, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZgtsv2StridedBatch");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, dl, d, du, x, batchCount, batchStride, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSgtsvInterleavedBatch_bufferSizeExt(
-    cusparseHandle_t handle, int algo, int m, const float *dl, const float *d,
-    const float *du, const float *x, int batchCount,
-    size_t *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const float *, const float *, const float *,
-      const float *, int, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseSgtsvInterleavedBatch_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, algo, m, dl, d, du, x, batchCount,
-                  pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDgtsvInterleavedBatch_bufferSizeExt(
-    cusparseHandle_t handle, int algo, int m, const double *dl, const double *d,
-    const double *du, const double *x, int batchCount,
-    size_t *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const double *, const double *,
-      const double *, const double *, int, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseDgtsvInterleavedBatch_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, algo, m, dl, d, du, x, batchCount,
-                  pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCgtsvInterleavedBatch_bufferSizeExt(
-    cusparseHandle_t handle, int algo, int m, const cuComplex *dl,
-    const cuComplex *d, const cuComplex *du, const cuComplex *x, int batchCount,
-    size_t *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cuComplex *, const cuComplex *,
-      const cuComplex *, const cuComplex *, int, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseCgtsvInterleavedBatch_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, algo, m, dl, d, du, x, batchCount,
-                  pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZgtsvInterleavedBatch_bufferSizeExt(
-    cusparseHandle_t handle, int algo, int m, const cuDoubleComplex *dl,
-    const cuDoubleComplex *d, const cuDoubleComplex *du,
-    const cuDoubleComplex *x, int batchCount, size_t *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cuDoubleComplex *,
-      const cuDoubleComplex *, const cuDoubleComplex *, const cuDoubleComplex *,
-      int, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseZgtsvInterleavedBatch_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, algo, m, dl, d, du, x, batchCount,
-                  pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSgtsvInterleavedBatch(
-    cusparseHandle_t handle, int algo, int m, float *dl, float *d, float *du,
-    float *x, int batchCount, void *pBuffer) {
-  using FuncPtr =
-      cusparseStatus_t(CUSPARSEAPI *)(cusparseHandle_t, int, int, float *,
-                                      float *, float *, float *, int, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSgtsvInterleavedBatch");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, algo, m, dl, d, du, x, batchCount, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDgtsvInterleavedBatch(
-    cusparseHandle_t handle, int algo, int m, double *dl, double *d, double *du,
-    double *x, int batchCount, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(cusparseHandle_t, int, int,
-                                                  double *, double *, double *,
-                                                  double *, int, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDgtsvInterleavedBatch");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, algo, m, dl, d, du, x, batchCount, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCgtsvInterleavedBatch(
-    cusparseHandle_t handle, int algo, int m, cuComplex *dl, cuComplex *d,
-    cuComplex *du, cuComplex *x, int batchCount, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, cuComplex *, cuComplex *, cuComplex *,
-      cuComplex *, int, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCgtsvInterleavedBatch");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, algo, m, dl, d, du, x, batchCount, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZgtsvInterleavedBatch(
-    cusparseHandle_t handle, int algo, int m, cuDoubleComplex *dl,
-    cuDoubleComplex *d, cuDoubleComplex *du, cuDoubleComplex *x, int batchCount,
-    void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, cuDoubleComplex *, cuDoubleComplex *,
-      cuDoubleComplex *, cuDoubleComplex *, int, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZgtsvInterleavedBatch");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, algo, m, dl, d, du, x, batchCount, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSgpsvInterleavedBatch_bufferSizeExt(
-    cusparseHandle_t handle, int algo, int m, const float *ds, const float *dl,
-    const float *d, const float *du, const float *dw, const float *x,
-    int batchCount, size_t *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const float *, const float *, const float *,
-      const float *, const float *, const float *, int, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseSgpsvInterleavedBatch_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, algo, m, ds, dl, d, du, dw, x, batchCount,
-                  pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDgpsvInterleavedBatch_bufferSizeExt(
-    cusparseHandle_t handle, int algo, int m, const double *ds,
-    const double *dl, const double *d, const double *du, const double *dw,
-    const double *x, int batchCount, size_t *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const double *, const double *,
-      const double *, const double *, const double *, const double *, int,
-      size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseDgpsvInterleavedBatch_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, algo, m, ds, dl, d, du, dw, x, batchCount,
-                  pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCgpsvInterleavedBatch_bufferSizeExt(
-    cusparseHandle_t handle, int algo, int m, const cuComplex *ds,
-    const cuComplex *dl, const cuComplex *d, const cuComplex *du,
-    const cuComplex *dw, const cuComplex *x, int batchCount,
-    size_t *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cuComplex *, const cuComplex *,
-      const cuComplex *, const cuComplex *, const cuComplex *,
-      const cuComplex *, int, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseCgpsvInterleavedBatch_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, algo, m, ds, dl, d, du, dw, x, batchCount,
-                  pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZgpsvInterleavedBatch_bufferSizeExt(
-    cusparseHandle_t handle, int algo, int m, const cuDoubleComplex *ds,
-    const cuDoubleComplex *dl, const cuDoubleComplex *d,
-    const cuDoubleComplex *du, const cuDoubleComplex *dw,
-    const cuDoubleComplex *x, int batchCount, size_t *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cuDoubleComplex *,
-      const cuDoubleComplex *, const cuDoubleComplex *, const cuDoubleComplex *,
-      const cuDoubleComplex *, const cuDoubleComplex *, int, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseZgpsvInterleavedBatch_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, algo, m, ds, dl, d, du, dw, x, batchCount,
-                  pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSgpsvInterleavedBatch(
-    cusparseHandle_t handle, int algo, int m, float *ds, float *dl, float *d,
-    float *du, float *dw, float *x, int batchCount, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, float *, float *, float *, float *, float *,
-      float *, int, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSgpsvInterleavedBatch");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, algo, m, ds, dl, d, du, dw, x, batchCount, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDgpsvInterleavedBatch(
-    cusparseHandle_t handle, int algo, int m, double *ds, double *dl, double *d,
-    double *du, double *dw, double *x, int batchCount, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, double *, double *, double *, double *,
-      double *, double *, int, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDgpsvInterleavedBatch");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, algo, m, ds, dl, d, du, dw, x, batchCount, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCgpsvInterleavedBatch(
-    cusparseHandle_t handle, int algo, int m, cuComplex *ds, cuComplex *dl,
-    cuComplex *d, cuComplex *du, cuComplex *dw, cuComplex *x, int batchCount,
-    void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, cuComplex *, cuComplex *, cuComplex *,
-      cuComplex *, cuComplex *, cuComplex *, int, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCgpsvInterleavedBatch");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, algo, m, ds, dl, d, du, dw, x, batchCount, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZgpsvInterleavedBatch(
-    cusparseHandle_t handle, int algo, int m, cuDoubleComplex *ds,
-    cuDoubleComplex *dl, cuDoubleComplex *d, cuDoubleComplex *du,
-    cuDoubleComplex *dw, cuDoubleComplex *x, int batchCount, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, cuDoubleComplex *, cuDoubleComplex *,
-      cuDoubleComplex *, cuDoubleComplex *, cuDoubleComplex *,
-      cuDoubleComplex *, int, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZgpsvInterleavedBatch");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, algo, m, ds, dl, d, du, dw, x, batchCount, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseXcsrgemmNnz(cusparseHandle_t handle, cusparseOperation_t transA,
-                    cusparseOperation_t transB, int m, int n, int k,
-                    const cusparseMatDescr_t descrA, const int nnzA,
-                    const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-                    const cusparseMatDescr_t descrB, const int nnzB,
-                    const int *csrSortedRowPtrB, const int *csrSortedColIndB,
-                    const cusparseMatDescr_t descrC, int *csrSortedRowPtrC,
-                    int *nnzTotalDevHostPtr) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, cusparseOperation_t, int, int, int,
-      const cusparseMatDescr_t, const int, const int *, const int *,
-      const cusparseMatDescr_t, const int, const int *, const int *,
-      const cusparseMatDescr_t, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseXcsrgemmNnz");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, transB, m, n, k, descrA, nnzA,
-                  csrSortedRowPtrA, csrSortedColIndA, descrB, nnzB,
-                  csrSortedRowPtrB, csrSortedColIndB, descrC, csrSortedRowPtrC,
-                  nnzTotalDevHostPtr);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsrgemm(
-    cusparseHandle_t handle, cusparseOperation_t transA,
-    cusparseOperation_t transB, int m, int n, int k,
-    const cusparseMatDescr_t descrA, const int nnzA, const float *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-    const cusparseMatDescr_t descrB, const int nnzB, const float *csrSortedValB,
-    const int *csrSortedRowPtrB, const int *csrSortedColIndB,
-    const cusparseMatDescr_t descrC, float *csrSortedValC,
-    const int *csrSortedRowPtrC, int *csrSortedColIndC) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, cusparseOperation_t, int, int, int,
-      const cusparseMatDescr_t, const int, const float *, const int *,
-      const int *, const cusparseMatDescr_t, const int, const float *,
-      const int *, const int *, const cusparseMatDescr_t, float *, const int *,
-      int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsrgemm");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, transB, m, n, k, descrA, nnzA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, descrB, nnzB,
-                  csrSortedValB, csrSortedRowPtrB, csrSortedColIndB, descrC,
-                  csrSortedValC, csrSortedRowPtrC, csrSortedColIndC);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsrgemm(
-    cusparseHandle_t handle, cusparseOperation_t transA,
-    cusparseOperation_t transB, int m, int n, int k,
-    const cusparseMatDescr_t descrA, int nnzA, const double *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-    const cusparseMatDescr_t descrB, int nnzB, const double *csrSortedValB,
-    const int *csrSortedRowPtrB, const int *csrSortedColIndB,
-    const cusparseMatDescr_t descrC, double *csrSortedValC,
-    const int *csrSortedRowPtrC, int *csrSortedColIndC) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, cusparseOperation_t, int, int, int,
-      const cusparseMatDescr_t, int, const double *, const int *, const int *,
-      const cusparseMatDescr_t, int, const double *, const int *, const int *,
-      const cusparseMatDescr_t, double *, const int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsrgemm");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, transB, m, n, k, descrA, nnzA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, descrB, nnzB,
-                  csrSortedValB, csrSortedRowPtrB, csrSortedColIndB, descrC,
-                  csrSortedValC, csrSortedRowPtrC, csrSortedColIndC);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsrgemm(
-    cusparseHandle_t handle, cusparseOperation_t transA,
-    cusparseOperation_t transB, int m, int n, int k,
-    const cusparseMatDescr_t descrA, int nnzA, const cuComplex *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-    const cusparseMatDescr_t descrB, int nnzB, const cuComplex *csrSortedValB,
-    const int *csrSortedRowPtrB, const int *csrSortedColIndB,
-    const cusparseMatDescr_t descrC, cuComplex *csrSortedValC,
-    const int *csrSortedRowPtrC, int *csrSortedColIndC) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, cusparseOperation_t, int, int, int,
-      const cusparseMatDescr_t, int, const cuComplex *, const int *,
-      const int *, const cusparseMatDescr_t, int, const cuComplex *,
-      const int *, const int *, const cusparseMatDescr_t, cuComplex *,
-      const int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsrgemm");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, transB, m, n, k, descrA, nnzA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, descrB, nnzB,
-                  csrSortedValB, csrSortedRowPtrB, csrSortedColIndB, descrC,
-                  csrSortedValC, csrSortedRowPtrC, csrSortedColIndC);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsrgemm(
-    cusparseHandle_t handle, cusparseOperation_t transA,
-    cusparseOperation_t transB, int m, int n, int k,
-    const cusparseMatDescr_t descrA, int nnzA,
-    const cuDoubleComplex *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, const cusparseMatDescr_t descrB, int nnzB,
-    const cuDoubleComplex *csrSortedValB, const int *csrSortedRowPtrB,
-    const int *csrSortedColIndB, const cusparseMatDescr_t descrC,
-    cuDoubleComplex *csrSortedValC, const int *csrSortedRowPtrC,
-    int *csrSortedColIndC) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, cusparseOperation_t, int, int, int,
-      const cusparseMatDescr_t, int, const cuDoubleComplex *, const int *,
-      const int *, const cusparseMatDescr_t, int, const cuDoubleComplex *,
-      const int *, const int *, const cusparseMatDescr_t, cuDoubleComplex *,
-      const int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsrgemm");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, transB, m, n, k, descrA, nnzA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, descrB, nnzB,
-                  csrSortedValB, csrSortedRowPtrB, csrSortedColIndB, descrC,
-                  csrSortedValC, csrSortedRowPtrC, csrSortedColIndC);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCreateCsrgemm2Info(csrgemm2Info_t *info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(csrgemm2Info_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCreateCsrgemm2Info");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(info);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDestroyCsrgemm2Info(csrgemm2Info_t info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(csrgemm2Info_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDestroyCsrgemm2Info");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(info);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsrgemm2_bufferSizeExt(
-    cusparseHandle_t handle, int m, int n, int k, const float *alpha,
-    const cusparseMatDescr_t descrA, int nnzA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, const cusparseMatDescr_t descrB, int nnzB,
-    const int *csrSortedRowPtrB, const int *csrSortedColIndB, const float *beta,
-    const cusparseMatDescr_t descrD, int nnzD, const int *csrSortedRowPtrD,
-    const int *csrSortedColIndD, csrgemm2Info_t info,
-    size_t *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const float *, const cusparseMatDescr_t,
-      int, const int *, const int *, const cusparseMatDescr_t, int, const int *,
-      const int *, const float *, const cusparseMatDescr_t, int, const int *,
-      const int *, csrgemm2Info_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsrgemm2_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, k, alpha, descrA, nnzA, csrSortedRowPtrA,
-                  csrSortedColIndA, descrB, nnzB, csrSortedRowPtrB,
-                  csrSortedColIndB, beta, descrD, nnzD, csrSortedRowPtrD,
-                  csrSortedColIndD, info, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsrgemm2_bufferSizeExt(
-    cusparseHandle_t handle, int m, int n, int k, const double *alpha,
-    const cusparseMatDescr_t descrA, int nnzA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, const cusparseMatDescr_t descrB, int nnzB,
-    const int *csrSortedRowPtrB, const int *csrSortedColIndB,
-    const double *beta, const cusparseMatDescr_t descrD, int nnzD,
-    const int *csrSortedRowPtrD, const int *csrSortedColIndD,
-    csrgemm2Info_t info, size_t *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const double *, const cusparseMatDescr_t,
-      int, const int *, const int *, const cusparseMatDescr_t, int, const int *,
-      const int *, const double *, const cusparseMatDescr_t, int, const int *,
-      const int *, csrgemm2Info_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsrgemm2_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, k, alpha, descrA, nnzA, csrSortedRowPtrA,
-                  csrSortedColIndA, descrB, nnzB, csrSortedRowPtrB,
-                  csrSortedColIndB, beta, descrD, nnzD, csrSortedRowPtrD,
-                  csrSortedColIndD, info, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsrgemm2_bufferSizeExt(
-    cusparseHandle_t handle, int m, int n, int k, const cuComplex *alpha,
-    const cusparseMatDescr_t descrA, int nnzA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, const cusparseMatDescr_t descrB, int nnzB,
-    const int *csrSortedRowPtrB, const int *csrSortedColIndB,
-    const cuComplex *beta, const cusparseMatDescr_t descrD, int nnzD,
-    const int *csrSortedRowPtrD, const int *csrSortedColIndD,
-    csrgemm2Info_t info, size_t *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const cuComplex *,
-      const cusparseMatDescr_t, int, const int *, const int *,
-      const cusparseMatDescr_t, int, const int *, const int *,
-      const cuComplex *, const cusparseMatDescr_t, int, const int *,
-      const int *, csrgemm2Info_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsrgemm2_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, k, alpha, descrA, nnzA, csrSortedRowPtrA,
-                  csrSortedColIndA, descrB, nnzB, csrSortedRowPtrB,
-                  csrSortedColIndB, beta, descrD, nnzD, csrSortedRowPtrD,
-                  csrSortedColIndD, info, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsrgemm2_bufferSizeExt(
-    cusparseHandle_t handle, int m, int n, int k, const cuDoubleComplex *alpha,
-    const cusparseMatDescr_t descrA, int nnzA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, const cusparseMatDescr_t descrB, int nnzB,
-    const int *csrSortedRowPtrB, const int *csrSortedColIndB,
-    const cuDoubleComplex *beta, const cusparseMatDescr_t descrD, int nnzD,
-    const int *csrSortedRowPtrD, const int *csrSortedColIndD,
-    csrgemm2Info_t info, size_t *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const cuDoubleComplex *,
-      const cusparseMatDescr_t, int, const int *, const int *,
-      const cusparseMatDescr_t, int, const int *, const int *,
-      const cuDoubleComplex *, const cusparseMatDescr_t, int, const int *,
-      const int *, csrgemm2Info_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsrgemm2_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, k, alpha, descrA, nnzA, csrSortedRowPtrA,
-                  csrSortedColIndA, descrB, nnzB, csrSortedRowPtrB,
-                  csrSortedColIndB, beta, descrD, nnzD, csrSortedRowPtrD,
-                  csrSortedColIndD, info, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseXcsrgemm2Nnz(
-    cusparseHandle_t handle, int m, int n, int k,
-    const cusparseMatDescr_t descrA, int nnzA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, const cusparseMatDescr_t descrB, int nnzB,
-    const int *csrSortedRowPtrB, const int *csrSortedColIndB,
-    const cusparseMatDescr_t descrD, int nnzD, const int *csrSortedRowPtrD,
-    const int *csrSortedColIndD, const cusparseMatDescr_t descrC,
-    int *csrSortedRowPtrC, int *nnzTotalDevHostPtr, const csrgemm2Info_t info,
-    void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const cusparseMatDescr_t, int,
-      const int *, const int *, const cusparseMatDescr_t, int, const int *,
-      const int *, const cusparseMatDescr_t, int, const int *, const int *,
-      const cusparseMatDescr_t, int *, int *, const csrgemm2Info_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseXcsrgemm2Nnz");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, k, descrA, nnzA, csrSortedRowPtrA,
-                  csrSortedColIndA, descrB, nnzB, csrSortedRowPtrB,
-                  csrSortedColIndB, descrD, nnzD, csrSortedRowPtrD,
-                  csrSortedColIndD, descrC, csrSortedRowPtrC,
-                  nnzTotalDevHostPtr, info, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsrgemm2(
-    cusparseHandle_t handle, int m, int n, int k, const float *alpha,
-    const cusparseMatDescr_t descrA, int nnzA, const float *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-    const cusparseMatDescr_t descrB, int nnzB, const float *csrSortedValB,
-    const int *csrSortedRowPtrB, const int *csrSortedColIndB, const float *beta,
-    const cusparseMatDescr_t descrD, int nnzD, const float *csrSortedValD,
-    const int *csrSortedRowPtrD, const int *csrSortedColIndD,
-    const cusparseMatDescr_t descrC, float *csrSortedValC,
-    const int *csrSortedRowPtrC, int *csrSortedColIndC,
-    const csrgemm2Info_t info, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const float *, const cusparseMatDescr_t,
-      int, const float *, const int *, const int *, const cusparseMatDescr_t,
-      int, const float *, const int *, const int *, const float *,
-      const cusparseMatDescr_t, int, const float *, const int *, const int *,
-      const cusparseMatDescr_t, float *, const int *, int *,
-      const csrgemm2Info_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsrgemm2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, k, alpha, descrA, nnzA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, descrB, nnzB,
-                  csrSortedValB, csrSortedRowPtrB, csrSortedColIndB, beta,
-                  descrD, nnzD, csrSortedValD, csrSortedRowPtrD,
-                  csrSortedColIndD, descrC, csrSortedValC, csrSortedRowPtrC,
-                  csrSortedColIndC, info, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsrgemm2(
-    cusparseHandle_t handle, int m, int n, int k, const double *alpha,
-    const cusparseMatDescr_t descrA, int nnzA, const double *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-    const cusparseMatDescr_t descrB, int nnzB, const double *csrSortedValB,
-    const int *csrSortedRowPtrB, const int *csrSortedColIndB,
-    const double *beta, const cusparseMatDescr_t descrD, int nnzD,
-    const double *csrSortedValD, const int *csrSortedRowPtrD,
-    const int *csrSortedColIndD, const cusparseMatDescr_t descrC,
-    double *csrSortedValC, const int *csrSortedRowPtrC, int *csrSortedColIndC,
-    const csrgemm2Info_t info, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const double *, const cusparseMatDescr_t,
-      int, const double *, const int *, const int *, const cusparseMatDescr_t,
-      int, const double *, const int *, const int *, const double *,
-      const cusparseMatDescr_t, int, const double *, const int *, const int *,
-      const cusparseMatDescr_t, double *, const int *, int *,
-      const csrgemm2Info_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsrgemm2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, k, alpha, descrA, nnzA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, descrB, nnzB,
-                  csrSortedValB, csrSortedRowPtrB, csrSortedColIndB, beta,
-                  descrD, nnzD, csrSortedValD, csrSortedRowPtrD,
-                  csrSortedColIndD, descrC, csrSortedValC, csrSortedRowPtrC,
-                  csrSortedColIndC, info, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsrgemm2(
-    cusparseHandle_t handle, int m, int n, int k, const cuComplex *alpha,
-    const cusparseMatDescr_t descrA, int nnzA, const cuComplex *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-    const cusparseMatDescr_t descrB, int nnzB, const cuComplex *csrSortedValB,
-    const int *csrSortedRowPtrB, const int *csrSortedColIndB,
-    const cuComplex *beta, const cusparseMatDescr_t descrD, int nnzD,
-    const cuComplex *csrSortedValD, const int *csrSortedRowPtrD,
-    const int *csrSortedColIndD, const cusparseMatDescr_t descrC,
-    cuComplex *csrSortedValC, const int *csrSortedRowPtrC,
-    int *csrSortedColIndC, const csrgemm2Info_t info, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const cuComplex *,
-      const cusparseMatDescr_t, int, const cuComplex *, const int *,
-      const int *, const cusparseMatDescr_t, int, const cuComplex *,
-      const int *, const int *, const cuComplex *, const cusparseMatDescr_t,
-      int, const cuComplex *, const int *, const int *,
-      const cusparseMatDescr_t, cuComplex *, const int *, int *,
-      const csrgemm2Info_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsrgemm2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, k, alpha, descrA, nnzA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, descrB, nnzB,
-                  csrSortedValB, csrSortedRowPtrB, csrSortedColIndB, beta,
-                  descrD, nnzD, csrSortedValD, csrSortedRowPtrD,
-                  csrSortedColIndD, descrC, csrSortedValC, csrSortedRowPtrC,
-                  csrSortedColIndC, info, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsrgemm2(
-    cusparseHandle_t handle, int m, int n, int k, const cuDoubleComplex *alpha,
-    const cusparseMatDescr_t descrA, int nnzA,
-    const cuDoubleComplex *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, const cusparseMatDescr_t descrB, int nnzB,
-    const cuDoubleComplex *csrSortedValB, const int *csrSortedRowPtrB,
-    const int *csrSortedColIndB, const cuDoubleComplex *beta,
-    const cusparseMatDescr_t descrD, int nnzD,
-    const cuDoubleComplex *csrSortedValD, const int *csrSortedRowPtrD,
-    const int *csrSortedColIndD, const cusparseMatDescr_t descrC,
-    cuDoubleComplex *csrSortedValC, const int *csrSortedRowPtrC,
-    int *csrSortedColIndC, const csrgemm2Info_t info, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const cuDoubleComplex *,
-      const cusparseMatDescr_t, int, const cuDoubleComplex *, const int *,
-      const int *, const cusparseMatDescr_t, int, const cuDoubleComplex *,
-      const int *, const int *, const cuDoubleComplex *,
-      const cusparseMatDescr_t, int, const cuDoubleComplex *, const int *,
-      const int *, const cusparseMatDescr_t, cuDoubleComplex *, const int *,
-      int *, const csrgemm2Info_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsrgemm2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, k, alpha, descrA, nnzA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, descrB, nnzB,
-                  csrSortedValB, csrSortedRowPtrB, csrSortedColIndB, beta,
-                  descrD, nnzD, csrSortedValD, csrSortedRowPtrD,
-                  csrSortedColIndD, descrC, csrSortedValC, csrSortedRowPtrC,
-                  csrSortedColIndC, info, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseXcsrgeamNnz(
-    cusparseHandle_t handle, int m, int n, const cusparseMatDescr_t descrA,
-    int nnzA, const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-    const cusparseMatDescr_t descrB, int nnzB, const int *csrSortedRowPtrB,
-    const int *csrSortedColIndB, const cusparseMatDescr_t descrC,
-    int *csrSortedRowPtrC, int *nnzTotalDevHostPtr) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, int, const int *,
-      const int *, const cusparseMatDescr_t, int, const int *, const int *,
-      const cusparseMatDescr_t, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseXcsrgeamNnz");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, descrA, nnzA, csrSortedRowPtrA,
-                  csrSortedColIndA, descrB, nnzB, csrSortedRowPtrB,
-                  csrSortedColIndB, descrC, csrSortedRowPtrC,
-                  nnzTotalDevHostPtr);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsrgeam(
-    cusparseHandle_t handle, int m, int n, const float *alpha,
-    const cusparseMatDescr_t descrA, int nnzA, const float *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA, const float *beta,
-    const cusparseMatDescr_t descrB, int nnzB, const float *csrSortedValB,
-    const int *csrSortedRowPtrB, const int *csrSortedColIndB,
-    const cusparseMatDescr_t descrC, float *csrSortedValC,
-    int *csrSortedRowPtrC, int *csrSortedColIndC) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const float *, const cusparseMatDescr_t, int,
-      const float *, const int *, const int *, const float *,
-      const cusparseMatDescr_t, int, const float *, const int *, const int *,
-      const cusparseMatDescr_t, float *, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsrgeam");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, alpha, descrA, nnzA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, beta, descrB, nnzB,
-                  csrSortedValB, csrSortedRowPtrB, csrSortedColIndB, descrC,
-                  csrSortedValC, csrSortedRowPtrC, csrSortedColIndC);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsrgeam(
-    cusparseHandle_t handle, int m, int n, const double *alpha,
-    const cusparseMatDescr_t descrA, int nnzA, const double *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-    const double *beta, const cusparseMatDescr_t descrB, int nnzB,
-    const double *csrSortedValB, const int *csrSortedRowPtrB,
-    const int *csrSortedColIndB, const cusparseMatDescr_t descrC,
-    double *csrSortedValC, int *csrSortedRowPtrC, int *csrSortedColIndC) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const double *, const cusparseMatDescr_t, int,
-      const double *, const int *, const int *, const double *,
-      const cusparseMatDescr_t, int, const double *, const int *, const int *,
-      const cusparseMatDescr_t, double *, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsrgeam");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, alpha, descrA, nnzA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, beta, descrB, nnzB,
-                  csrSortedValB, csrSortedRowPtrB, csrSortedColIndB, descrC,
-                  csrSortedValC, csrSortedRowPtrC, csrSortedColIndC);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsrgeam(
-    cusparseHandle_t handle, int m, int n, const cuComplex *alpha,
-    const cusparseMatDescr_t descrA, int nnzA, const cuComplex *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-    const cuComplex *beta, const cusparseMatDescr_t descrB, int nnzB,
-    const cuComplex *csrSortedValB, const int *csrSortedRowPtrB,
-    const int *csrSortedColIndB, const cusparseMatDescr_t descrC,
-    cuComplex *csrSortedValC, int *csrSortedRowPtrC, int *csrSortedColIndC) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cuComplex *, const cusparseMatDescr_t,
-      int, const cuComplex *, const int *, const int *, const cuComplex *,
-      const cusparseMatDescr_t, int, const cuComplex *, const int *,
-      const int *, const cusparseMatDescr_t, cuComplex *, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsrgeam");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, alpha, descrA, nnzA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, beta, descrB, nnzB,
-                  csrSortedValB, csrSortedRowPtrB, csrSortedColIndB, descrC,
-                  csrSortedValC, csrSortedRowPtrC, csrSortedColIndC);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsrgeam(
-    cusparseHandle_t handle, int m, int n, const cuDoubleComplex *alpha,
-    const cusparseMatDescr_t descrA, int nnzA,
-    const cuDoubleComplex *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, const cuDoubleComplex *beta,
-    const cusparseMatDescr_t descrB, int nnzB,
-    const cuDoubleComplex *csrSortedValB, const int *csrSortedRowPtrB,
-    const int *csrSortedColIndB, const cusparseMatDescr_t descrC,
-    cuDoubleComplex *csrSortedValC, int *csrSortedRowPtrC,
-    int *csrSortedColIndC) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cuDoubleComplex *,
-      const cusparseMatDescr_t, int, const cuDoubleComplex *, const int *,
-      const int *, const cuDoubleComplex *, const cusparseMatDescr_t, int,
-      const cuDoubleComplex *, const int *, const int *,
-      const cusparseMatDescr_t, cuDoubleComplex *, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsrgeam");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, alpha, descrA, nnzA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, beta, descrB, nnzB,
-                  csrSortedValB, csrSortedRowPtrB, csrSortedColIndB, descrC,
-                  csrSortedValC, csrSortedRowPtrC, csrSortedColIndC);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsrgeam2_bufferSizeExt(
-    cusparseHandle_t handle, int m, int n, const float *alpha,
-    const cusparseMatDescr_t descrA, int nnzA, const float *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA, const float *beta,
-    const cusparseMatDescr_t descrB, int nnzB, const float *csrSortedValB,
-    const int *csrSortedRowPtrB, const int *csrSortedColIndB,
-    const cusparseMatDescr_t descrC, const float *csrSortedValC,
-    const int *csrSortedRowPtrC, const int *csrSortedColIndC,
-    size_t *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const float *, const cusparseMatDescr_t, int,
-      const float *, const int *, const int *, const float *,
-      const cusparseMatDescr_t, int, const float *, const int *, const int *,
-      const cusparseMatDescr_t, const float *, const int *, const int *,
-      size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsrgeam2_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, alpha, descrA, nnzA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, beta, descrB, nnzB,
-                  csrSortedValB, csrSortedRowPtrB, csrSortedColIndB, descrC,
-                  csrSortedValC, csrSortedRowPtrC, csrSortedColIndC,
-                  pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsrgeam2_bufferSizeExt(
-    cusparseHandle_t handle, int m, int n, const double *alpha,
-    const cusparseMatDescr_t descrA, int nnzA, const double *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-    const double *beta, const cusparseMatDescr_t descrB, int nnzB,
-    const double *csrSortedValB, const int *csrSortedRowPtrB,
-    const int *csrSortedColIndB, const cusparseMatDescr_t descrC,
-    const double *csrSortedValC, const int *csrSortedRowPtrC,
-    const int *csrSortedColIndC, size_t *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const double *, const cusparseMatDescr_t, int,
-      const double *, const int *, const int *, const double *,
-      const cusparseMatDescr_t, int, const double *, const int *, const int *,
-      const cusparseMatDescr_t, const double *, const int *, const int *,
-      size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsrgeam2_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, alpha, descrA, nnzA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, beta, descrB, nnzB,
-                  csrSortedValB, csrSortedRowPtrB, csrSortedColIndB, descrC,
-                  csrSortedValC, csrSortedRowPtrC, csrSortedColIndC,
-                  pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsrgeam2_bufferSizeExt(
-    cusparseHandle_t handle, int m, int n, const cuComplex *alpha,
-    const cusparseMatDescr_t descrA, int nnzA, const cuComplex *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-    const cuComplex *beta, const cusparseMatDescr_t descrB, int nnzB,
-    const cuComplex *csrSortedValB, const int *csrSortedRowPtrB,
-    const int *csrSortedColIndB, const cusparseMatDescr_t descrC,
-    const cuComplex *csrSortedValC, const int *csrSortedRowPtrC,
-    const int *csrSortedColIndC, size_t *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cuComplex *, const cusparseMatDescr_t,
-      int, const cuComplex *, const int *, const int *, const cuComplex *,
-      const cusparseMatDescr_t, int, const cuComplex *, const int *,
-      const int *, const cusparseMatDescr_t, const cuComplex *, const int *,
-      const int *, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsrgeam2_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, alpha, descrA, nnzA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, beta, descrB, nnzB,
-                  csrSortedValB, csrSortedRowPtrB, csrSortedColIndB, descrC,
-                  csrSortedValC, csrSortedRowPtrC, csrSortedColIndC,
-                  pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsrgeam2_bufferSizeExt(
-    cusparseHandle_t handle, int m, int n, const cuDoubleComplex *alpha,
-    const cusparseMatDescr_t descrA, int nnzA,
-    const cuDoubleComplex *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, const cuDoubleComplex *beta,
-    const cusparseMatDescr_t descrB, int nnzB,
-    const cuDoubleComplex *csrSortedValB, const int *csrSortedRowPtrB,
-    const int *csrSortedColIndB, const cusparseMatDescr_t descrC,
-    const cuDoubleComplex *csrSortedValC, const int *csrSortedRowPtrC,
-    const int *csrSortedColIndC, size_t *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cuDoubleComplex *,
-      const cusparseMatDescr_t, int, const cuDoubleComplex *, const int *,
-      const int *, const cuDoubleComplex *, const cusparseMatDescr_t, int,
-      const cuDoubleComplex *, const int *, const int *,
-      const cusparseMatDescr_t, const cuDoubleComplex *, const int *,
-      const int *, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsrgeam2_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, alpha, descrA, nnzA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, beta, descrB, nnzB,
-                  csrSortedValB, csrSortedRowPtrB, csrSortedColIndB, descrC,
-                  csrSortedValC, csrSortedRowPtrC, csrSortedColIndC,
-                  pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseXcsrgeam2Nnz(
-    cusparseHandle_t handle, int m, int n, const cusparseMatDescr_t descrA,
-    int nnzA, const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-    const cusparseMatDescr_t descrB, int nnzB, const int *csrSortedRowPtrB,
-    const int *csrSortedColIndB, const cusparseMatDescr_t descrC,
-    int *csrSortedRowPtrC, int *nnzTotalDevHostPtr, void *workspace) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, int, const int *,
-      const int *, const cusparseMatDescr_t, int, const int *, const int *,
-      const cusparseMatDescr_t, int *, int *, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseXcsrgeam2Nnz");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, descrA, nnzA, csrSortedRowPtrA,
-                  csrSortedColIndA, descrB, nnzB, csrSortedRowPtrB,
-                  csrSortedColIndB, descrC, csrSortedRowPtrC,
-                  nnzTotalDevHostPtr, workspace);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsrgeam2(
-    cusparseHandle_t handle, int m, int n, const float *alpha,
-    const cusparseMatDescr_t descrA, int nnzA, const float *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA, const float *beta,
-    const cusparseMatDescr_t descrB, int nnzB, const float *csrSortedValB,
-    const int *csrSortedRowPtrB, const int *csrSortedColIndB,
-    const cusparseMatDescr_t descrC, float *csrSortedValC,
-    int *csrSortedRowPtrC, int *csrSortedColIndC, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const float *, const cusparseMatDescr_t, int,
-      const float *, const int *, const int *, const float *,
-      const cusparseMatDescr_t, int, const float *, const int *, const int *,
-      const cusparseMatDescr_t, float *, int *, int *, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsrgeam2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, alpha, descrA, nnzA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, beta, descrB, nnzB,
-                  csrSortedValB, csrSortedRowPtrB, csrSortedColIndB, descrC,
-                  csrSortedValC, csrSortedRowPtrC, csrSortedColIndC, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsrgeam2(
-    cusparseHandle_t handle, int m, int n, const double *alpha,
-    const cusparseMatDescr_t descrA, int nnzA, const double *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-    const double *beta, const cusparseMatDescr_t descrB, int nnzB,
-    const double *csrSortedValB, const int *csrSortedRowPtrB,
-    const int *csrSortedColIndB, const cusparseMatDescr_t descrC,
-    double *csrSortedValC, int *csrSortedRowPtrC, int *csrSortedColIndC,
-    void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const double *, const cusparseMatDescr_t, int,
-      const double *, const int *, const int *, const double *,
-      const cusparseMatDescr_t, int, const double *, const int *, const int *,
-      const cusparseMatDescr_t, double *, int *, int *, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsrgeam2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, alpha, descrA, nnzA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, beta, descrB, nnzB,
-                  csrSortedValB, csrSortedRowPtrB, csrSortedColIndB, descrC,
-                  csrSortedValC, csrSortedRowPtrC, csrSortedColIndC, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsrgeam2(
-    cusparseHandle_t handle, int m, int n, const cuComplex *alpha,
-    const cusparseMatDescr_t descrA, int nnzA, const cuComplex *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-    const cuComplex *beta, const cusparseMatDescr_t descrB, int nnzB,
-    const cuComplex *csrSortedValB, const int *csrSortedRowPtrB,
-    const int *csrSortedColIndB, const cusparseMatDescr_t descrC,
-    cuComplex *csrSortedValC, int *csrSortedRowPtrC, int *csrSortedColIndC,
-    void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cuComplex *, const cusparseMatDescr_t,
-      int, const cuComplex *, const int *, const int *, const cuComplex *,
-      const cusparseMatDescr_t, int, const cuComplex *, const int *,
-      const int *, const cusparseMatDescr_t, cuComplex *, int *, int *, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsrgeam2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, alpha, descrA, nnzA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, beta, descrB, nnzB,
-                  csrSortedValB, csrSortedRowPtrB, csrSortedColIndB, descrC,
-                  csrSortedValC, csrSortedRowPtrC, csrSortedColIndC, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsrgeam2(
-    cusparseHandle_t handle, int m, int n, const cuDoubleComplex *alpha,
-    const cusparseMatDescr_t descrA, int nnzA,
-    const cuDoubleComplex *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, const cuDoubleComplex *beta,
-    const cusparseMatDescr_t descrB, int nnzB,
-    const cuDoubleComplex *csrSortedValB, const int *csrSortedRowPtrB,
-    const int *csrSortedColIndB, const cusparseMatDescr_t descrC,
-    cuDoubleComplex *csrSortedValC, int *csrSortedRowPtrC,
-    int *csrSortedColIndC, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cuDoubleComplex *,
-      const cusparseMatDescr_t, int, const cuDoubleComplex *, const int *,
-      const int *, const cuDoubleComplex *, const cusparseMatDescr_t, int,
-      const cuDoubleComplex *, const int *, const int *,
-      const cusparseMatDescr_t, cuDoubleComplex *, int *, int *, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsrgeam2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, alpha, descrA, nnzA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, beta, descrB, nnzB,
-                  csrSortedValB, csrSortedRowPtrB, csrSortedColIndB, descrC,
-                  csrSortedValC, csrSortedRowPtrC, csrSortedColIndC, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsrcolor(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    const float *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, const float *fractionToColor, int *ncolors,
-    int *coloring, int *reordering, const cusparseColorInfo_t info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, const float *,
-      const int *, const int *, const float *, int *, int *, int *,
-      const cusparseColorInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsrcolor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, fractionToColor, ncolors, coloring,
-                  reordering, info);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsrcolor(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    const double *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, const double *fractionToColor, int *ncolors,
-    int *coloring, int *reordering, const cusparseColorInfo_t info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, const double *,
-      const int *, const int *, const double *, int *, int *, int *,
-      const cusparseColorInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsrcolor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, fractionToColor, ncolors, coloring,
-                  reordering, info);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsrcolor(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    const cuComplex *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, const float *fractionToColor, int *ncolors,
-    int *coloring, int *reordering, const cusparseColorInfo_t info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, const cuComplex *,
-      const int *, const int *, const float *, int *, int *, int *,
-      const cusparseColorInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsrcolor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, fractionToColor, ncolors, coloring,
-                  reordering, info);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsrcolor(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    const cuDoubleComplex *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, const double *fractionToColor, int *ncolors,
-    int *coloring, int *reordering, const cusparseColorInfo_t info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t,
-      const cuDoubleComplex *, const int *, const int *, const double *, int *,
-      int *, int *, const cusparseColorInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsrcolor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, fractionToColor, ncolors, coloring,
-                  reordering, info);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseSnnz(cusparseHandle_t handle, cusparseDirection_t dirA, int m, int n,
-             const cusparseMatDescr_t descrA, const float *A, int lda,
-             int *nnzPerRowCol, int *nnzTotalDevHostPtr) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const float *, int, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSnnz");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, m, n, descrA, A, lda, nnzPerRowCol,
-                  nnzTotalDevHostPtr);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseDnnz(cusparseHandle_t handle, cusparseDirection_t dirA, int m, int n,
-             const cusparseMatDescr_t descrA, const double *A, int lda,
-             int *nnzPerRowCol, int *nnzTotalDevHostPtr) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const double *, int, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDnnz");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, m, n, descrA, A, lda, nnzPerRowCol,
-                  nnzTotalDevHostPtr);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseCnnz(cusparseHandle_t handle, cusparseDirection_t dirA, int m, int n,
-             const cusparseMatDescr_t descrA, const cuComplex *A, int lda,
-             int *nnzPerRowCol, int *nnzTotalDevHostPtr) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const cuComplex *, int, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCnnz");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, m, n, descrA, A, lda, nnzPerRowCol,
-                  nnzTotalDevHostPtr);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseZnnz(cusparseHandle_t handle, cusparseDirection_t dirA, int m, int n,
-             const cusparseMatDescr_t descrA, const cuDoubleComplex *A, int lda,
-             int *nnzPerRowCol, int *nnzTotalDevHostPtr) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const cuDoubleComplex *, int, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZnnz");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, m, n, descrA, A, lda, nnzPerRowCol,
-                  nnzTotalDevHostPtr);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSnnz_compress(
-    cusparseHandle_t handle, int m, const cusparseMatDescr_t descr,
-    const float *csrSortedValA, const int *csrSortedRowPtrA, int *nnzPerRow,
-    int *nnzC, float tol) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, const cusparseMatDescr_t, const float *,
-      const int *, int *, int *, float);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSnnz_compress");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, descr, csrSortedValA, csrSortedRowPtrA, nnzPerRow,
-                  nnzC, tol);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDnnz_compress(
-    cusparseHandle_t handle, int m, const cusparseMatDescr_t descr,
-    const double *csrSortedValA, const int *csrSortedRowPtrA, int *nnzPerRow,
-    int *nnzC, double tol) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, const cusparseMatDescr_t, const double *,
-      const int *, int *, int *, double);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDnnz_compress");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, descr, csrSortedValA, csrSortedRowPtrA, nnzPerRow,
-                  nnzC, tol);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCnnz_compress(
-    cusparseHandle_t handle, int m, const cusparseMatDescr_t descr,
-    const cuComplex *csrSortedValA, const int *csrSortedRowPtrA, int *nnzPerRow,
-    int *nnzC, cuComplex tol) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, const cusparseMatDescr_t, const cuComplex *,
-      const int *, int *, int *, cuComplex);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCnnz_compress");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, descr, csrSortedValA, csrSortedRowPtrA, nnzPerRow,
-                  nnzC, tol);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZnnz_compress(
-    cusparseHandle_t handle, int m, const cusparseMatDescr_t descr,
-    const cuDoubleComplex *csrSortedValA, const int *csrSortedRowPtrA,
-    int *nnzPerRow, int *nnzC, cuDoubleComplex tol) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, const cusparseMatDescr_t, const cuDoubleComplex *,
-      const int *, int *, int *, cuDoubleComplex);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZnnz_compress");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, descr, csrSortedValA, csrSortedRowPtrA, nnzPerRow,
-                  nnzC, tol);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsr2csr_compress(
-    cusparseHandle_t handle, int m, int n, const cusparseMatDescr_t descrA,
-    const float *csrSortedValA, const int *csrSortedColIndA,
-    const int *csrSortedRowPtrA, int nnzA, const int *nnzPerRow,
-    float *csrSortedValC, int *csrSortedColIndC, int *csrSortedRowPtrC,
-    float tol) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, const float *,
-      const int *, const int *, int, const int *, float *, int *, int *, float);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsr2csr_compress");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, descrA, csrSortedValA, csrSortedColIndA,
-                  csrSortedRowPtrA, nnzA, nnzPerRow, csrSortedValC,
-                  csrSortedColIndC, csrSortedRowPtrC, tol);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsr2csr_compress(
-    cusparseHandle_t handle, int m, int n, const cusparseMatDescr_t descrA,
-    const double *csrSortedValA, const int *csrSortedColIndA,
-    const int *csrSortedRowPtrA, int nnzA, const int *nnzPerRow,
-    double *csrSortedValC, int *csrSortedColIndC, int *csrSortedRowPtrC,
-    double tol) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, const double *,
-      const int *, const int *, int, const int *, double *, int *, int *,
-      double);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsr2csr_compress");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, descrA, csrSortedValA, csrSortedColIndA,
-                  csrSortedRowPtrA, nnzA, nnzPerRow, csrSortedValC,
-                  csrSortedColIndC, csrSortedRowPtrC, tol);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsr2csr_compress(
-    cusparseHandle_t handle, int m, int n, const cusparseMatDescr_t descrA,
-    const cuComplex *csrSortedValA, const int *csrSortedColIndA,
-    const int *csrSortedRowPtrA, int nnzA, const int *nnzPerRow,
-    cuComplex *csrSortedValC, int *csrSortedColIndC, int *csrSortedRowPtrC,
-    cuComplex tol) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, const cuComplex *,
-      const int *, const int *, int, const int *, cuComplex *, int *, int *,
-      cuComplex);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsr2csr_compress");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, descrA, csrSortedValA, csrSortedColIndA,
-                  csrSortedRowPtrA, nnzA, nnzPerRow, csrSortedValC,
-                  csrSortedColIndC, csrSortedRowPtrC, tol);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsr2csr_compress(
-    cusparseHandle_t handle, int m, int n, const cusparseMatDescr_t descrA,
-    const cuDoubleComplex *csrSortedValA, const int *csrSortedColIndA,
-    const int *csrSortedRowPtrA, int nnzA, const int *nnzPerRow,
-    cuDoubleComplex *csrSortedValC, int *csrSortedColIndC,
-    int *csrSortedRowPtrC, cuDoubleComplex tol) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t,
-      const cuDoubleComplex *, const int *, const int *, int, const int *,
-      cuDoubleComplex *, int *, int *, cuDoubleComplex);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsr2csr_compress");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, descrA, csrSortedValA, csrSortedColIndA,
-                  csrSortedRowPtrA, nnzA, nnzPerRow, csrSortedValC,
-                  csrSortedColIndC, csrSortedRowPtrC, tol);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSdense2csr(
-    cusparseHandle_t handle, int m, int n, const cusparseMatDescr_t descrA,
-    const float *A, int lda, const int *nnzPerRow, float *csrSortedValA,
-    int *csrSortedRowPtrA, int *csrSortedColIndA) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, const float *, int,
-      const int *, float *, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSdense2csr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, descrA, A, lda, nnzPerRow, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDdense2csr(
-    cusparseHandle_t handle, int m, int n, const cusparseMatDescr_t descrA,
-    const double *A, int lda, const int *nnzPerRow, double *csrSortedValA,
-    int *csrSortedRowPtrA, int *csrSortedColIndA) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, const double *, int,
-      const int *, double *, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDdense2csr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, descrA, A, lda, nnzPerRow, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCdense2csr(
-    cusparseHandle_t handle, int m, int n, const cusparseMatDescr_t descrA,
-    const cuComplex *A, int lda, const int *nnzPerRow, cuComplex *csrSortedValA,
-    int *csrSortedRowPtrA, int *csrSortedColIndA) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, const cuComplex *,
-      int, const int *, cuComplex *, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCdense2csr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, descrA, A, lda, nnzPerRow, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZdense2csr(
-    cusparseHandle_t handle, int m, int n, const cusparseMatDescr_t descrA,
-    const cuDoubleComplex *A, int lda, const int *nnzPerRow,
-    cuDoubleComplex *csrSortedValA, int *csrSortedRowPtrA,
-    int *csrSortedColIndA) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t,
-      const cuDoubleComplex *, int, const int *, cuDoubleComplex *, int *,
-      int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZdense2csr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, descrA, A, lda, nnzPerRow, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsr2dense(
-    cusparseHandle_t handle, int m, int n, const cusparseMatDescr_t descrA,
-    const float *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, float *A, int lda) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, const float *,
-      const int *, const int *, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsr2dense");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, A, lda);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsr2dense(
-    cusparseHandle_t handle, int m, int n, const cusparseMatDescr_t descrA,
-    const double *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, double *A, int lda) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, const double *,
-      const int *, const int *, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsr2dense");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, A, lda);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsr2dense(
-    cusparseHandle_t handle, int m, int n, const cusparseMatDescr_t descrA,
-    const cuComplex *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, cuComplex *A, int lda) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, const cuComplex *,
-      const int *, const int *, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsr2dense");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, A, lda);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsr2dense(
-    cusparseHandle_t handle, int m, int n, const cusparseMatDescr_t descrA,
-    const cuDoubleComplex *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, cuDoubleComplex *A, int lda) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t,
-      const cuDoubleComplex *, const int *, const int *, cuDoubleComplex *,
-      int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsr2dense");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, A, lda);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSdense2csc(
-    cusparseHandle_t handle, int m, int n, const cusparseMatDescr_t descrA,
-    const float *A, int lda, const int *nnzPerCol, float *cscSortedValA,
-    int *cscSortedRowIndA, int *cscSortedColPtrA) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, const float *, int,
-      const int *, float *, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSdense2csc");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, descrA, A, lda, nnzPerCol, cscSortedValA,
-                  cscSortedRowIndA, cscSortedColPtrA);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDdense2csc(
-    cusparseHandle_t handle, int m, int n, const cusparseMatDescr_t descrA,
-    const double *A, int lda, const int *nnzPerCol, double *cscSortedValA,
-    int *cscSortedRowIndA, int *cscSortedColPtrA) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, const double *, int,
-      const int *, double *, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDdense2csc");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, descrA, A, lda, nnzPerCol, cscSortedValA,
-                  cscSortedRowIndA, cscSortedColPtrA);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCdense2csc(
-    cusparseHandle_t handle, int m, int n, const cusparseMatDescr_t descrA,
-    const cuComplex *A, int lda, const int *nnzPerCol, cuComplex *cscSortedValA,
-    int *cscSortedRowIndA, int *cscSortedColPtrA) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, const cuComplex *,
-      int, const int *, cuComplex *, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCdense2csc");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, descrA, A, lda, nnzPerCol, cscSortedValA,
-                  cscSortedRowIndA, cscSortedColPtrA);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZdense2csc(
-    cusparseHandle_t handle, int m, int n, const cusparseMatDescr_t descrA,
-    const cuDoubleComplex *A, int lda, const int *nnzPerCol,
-    cuDoubleComplex *cscSortedValA, int *cscSortedRowIndA,
-    int *cscSortedColPtrA) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t,
-      const cuDoubleComplex *, int, const int *, cuDoubleComplex *, int *,
-      int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZdense2csc");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, descrA, A, lda, nnzPerCol, cscSortedValA,
-                  cscSortedRowIndA, cscSortedColPtrA);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsc2dense(
-    cusparseHandle_t handle, int m, int n, const cusparseMatDescr_t descrA,
-    const float *cscSortedValA, const int *cscSortedRowIndA,
-    const int *cscSortedColPtrA, float *A, int lda) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, const float *,
-      const int *, const int *, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsc2dense");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, descrA, cscSortedValA, cscSortedRowIndA,
-                  cscSortedColPtrA, A, lda);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsc2dense(
-    cusparseHandle_t handle, int m, int n, const cusparseMatDescr_t descrA,
-    const double *cscSortedValA, const int *cscSortedRowIndA,
-    const int *cscSortedColPtrA, double *A, int lda) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, const double *,
-      const int *, const int *, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsc2dense");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, descrA, cscSortedValA, cscSortedRowIndA,
-                  cscSortedColPtrA, A, lda);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsc2dense(
-    cusparseHandle_t handle, int m, int n, const cusparseMatDescr_t descrA,
-    const cuComplex *cscSortedValA, const int *cscSortedRowIndA,
-    const int *cscSortedColPtrA, cuComplex *A, int lda) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, const cuComplex *,
-      const int *, const int *, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsc2dense");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, descrA, cscSortedValA, cscSortedRowIndA,
-                  cscSortedColPtrA, A, lda);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsc2dense(
-    cusparseHandle_t handle, int m, int n, const cusparseMatDescr_t descrA,
-    const cuDoubleComplex *cscSortedValA, const int *cscSortedRowIndA,
-    const int *cscSortedColPtrA, cuDoubleComplex *A, int lda) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t,
-      const cuDoubleComplex *, const int *, const int *, cuDoubleComplex *,
-      int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsc2dense");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, descrA, cscSortedValA, cscSortedRowIndA,
-                  cscSortedColPtrA, A, lda);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseXcoo2csr(cusparseHandle_t handle,
-                                              const int *cooRowInd, int nnz,
-                                              int m, int *csrSortedRowPtr,
-                                              cusparseIndexBase_t idxBase) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, const int *, int, int, int *, cusparseIndexBase_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseXcoo2csr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, cooRowInd, nnz, m, csrSortedRowPtr, idxBase);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseXcsr2coo(cusparseHandle_t handle,
-                                              const int *csrSortedRowPtr,
-                                              int nnz, int m, int *cooRowInd,
-                                              cusparseIndexBase_t idxBase) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, const int *, int, int, int *, cusparseIndexBase_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseXcsr2coo");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, csrSortedRowPtr, nnz, m, cooRowInd, idxBase);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCsr2cscEx(
-    cusparseHandle_t handle, int m, int n, int nnz, const void *csrSortedVal,
-    cudaDataType csrSortedValtype, const int *csrSortedRowPtr,
-    const int *csrSortedColInd, void *cscSortedVal,
-    cudaDataType cscSortedValtype, int *cscSortedRowInd, int *cscSortedColPtr,
-    cusparseAction_t copyValues, cusparseIndexBase_t idxBase,
-    cudaDataType executiontype) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const void *, cudaDataType, const int *,
-      const int *, void *, cudaDataType, int *, int *, cusparseAction_t,
-      cusparseIndexBase_t, cudaDataType);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCsr2cscEx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnz, csrSortedVal, csrSortedValtype,
-                  csrSortedRowPtr, csrSortedColInd, cscSortedVal,
-                  cscSortedValtype, cscSortedRowInd, cscSortedColPtr,
-                  copyValues, idxBase, executiontype);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsr2csc(
-    cusparseHandle_t handle, int m, int n, int nnz, const float *csrSortedVal,
-    const int *csrSortedRowPtr, const int *csrSortedColInd, float *cscSortedVal,
-    int *cscSortedRowInd, int *cscSortedColPtr, cusparseAction_t copyValues,
-    cusparseIndexBase_t idxBase) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const float *, const int *, const int *,
-      float *, int *, int *, cusparseAction_t, cusparseIndexBase_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsr2csc");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnz, csrSortedVal, csrSortedRowPtr,
-                  csrSortedColInd, cscSortedVal, cscSortedRowInd,
-                  cscSortedColPtr, copyValues, idxBase);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsr2csc(
-    cusparseHandle_t handle, int m, int n, int nnz, const double *csrSortedVal,
-    const int *csrSortedRowPtr, const int *csrSortedColInd,
-    double *cscSortedVal, int *cscSortedRowInd, int *cscSortedColPtr,
-    cusparseAction_t copyValues, cusparseIndexBase_t idxBase) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const double *, const int *, const int *,
-      double *, int *, int *, cusparseAction_t, cusparseIndexBase_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsr2csc");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnz, csrSortedVal, csrSortedRowPtr,
-                  csrSortedColInd, cscSortedVal, cscSortedRowInd,
-                  cscSortedColPtr, copyValues, idxBase);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseCcsr2csc(cusparseHandle_t handle, int m, int n, int nnz,
-                 const cuComplex *csrSortedVal, const int *csrSortedRowPtr,
-                 const int *csrSortedColInd, cuComplex *cscSortedVal,
-                 int *cscSortedRowInd, int *cscSortedColPtr,
-                 cusparseAction_t copyValues, cusparseIndexBase_t idxBase) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const cuComplex *, const int *,
-      const int *, cuComplex *, int *, int *, cusparseAction_t,
-      cusparseIndexBase_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsr2csc");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnz, csrSortedVal, csrSortedRowPtr,
-                  csrSortedColInd, cscSortedVal, cscSortedRowInd,
-                  cscSortedColPtr, copyValues, idxBase);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsr2csc(
-    cusparseHandle_t handle, int m, int n, int nnz,
-    const cuDoubleComplex *csrSortedVal, const int *csrSortedRowPtr,
-    const int *csrSortedColInd, cuDoubleComplex *cscSortedVal,
-    int *cscSortedRowInd, int *cscSortedColPtr, cusparseAction_t copyValues,
-    cusparseIndexBase_t idxBase) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const cuDoubleComplex *, const int *,
-      const int *, cuDoubleComplex *, int *, int *, cusparseAction_t,
-      cusparseIndexBase_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsr2csc");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnz, csrSortedVal, csrSortedRowPtr,
-                  csrSortedColInd, cscSortedVal, cscSortedRowInd,
-                  cscSortedColPtr, copyValues, idxBase);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSdense2hyb(
-    cusparseHandle_t handle, int m, int n, const cusparseMatDescr_t descrA,
-    const float *A, int lda, const int *nnzPerRow, cusparseHybMat_t hybA,
-    int userEllWidth, cusparseHybPartition_t partitionType) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, const float *, int,
-      const int *, cusparseHybMat_t, int, cusparseHybPartition_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSdense2hyb");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, descrA, A, lda, nnzPerRow, hybA, userEllWidth,
-                  partitionType);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDdense2hyb(
-    cusparseHandle_t handle, int m, int n, const cusparseMatDescr_t descrA,
-    const double *A, int lda, const int *nnzPerRow, cusparseHybMat_t hybA,
-    int userEllWidth, cusparseHybPartition_t partitionType) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, const double *, int,
-      const int *, cusparseHybMat_t, int, cusparseHybPartition_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDdense2hyb");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, descrA, A, lda, nnzPerRow, hybA, userEllWidth,
-                  partitionType);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCdense2hyb(
-    cusparseHandle_t handle, int m, int n, const cusparseMatDescr_t descrA,
-    const cuComplex *A, int lda, const int *nnzPerRow, cusparseHybMat_t hybA,
-    int userEllWidth, cusparseHybPartition_t partitionType) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, const cuComplex *,
-      int, const int *, cusparseHybMat_t, int, cusparseHybPartition_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCdense2hyb");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, descrA, A, lda, nnzPerRow, hybA, userEllWidth,
-                  partitionType);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseZdense2hyb(cusparseHandle_t handle, int m, int n,
-                   const cusparseMatDescr_t descrA, const cuDoubleComplex *A,
-                   int lda, const int *nnzPerRow, cusparseHybMat_t hybA,
-                   int userEllWidth, cusparseHybPartition_t partitionType) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t,
-      const cuDoubleComplex *, int, const int *, cusparseHybMat_t, int,
-      cusparseHybPartition_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZdense2hyb");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, descrA, A, lda, nnzPerRow, hybA, userEllWidth,
-                  partitionType);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseShyb2dense(cusparseHandle_t handle,
-                                                const cusparseMatDescr_t descrA,
-                                                const cusparseHybMat_t hybA,
-                                                float *A, int lda) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, const cusparseMatDescr_t, const cusparseHybMat_t,
-      float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseShyb2dense");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, descrA, hybA, A, lda);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDhyb2dense(cusparseHandle_t handle,
-                                                const cusparseMatDescr_t descrA,
-                                                const cusparseHybMat_t hybA,
-                                                double *A, int lda) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, const cusparseMatDescr_t, const cusparseHybMat_t,
-      double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDhyb2dense");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, descrA, hybA, A, lda);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseChyb2dense(cusparseHandle_t handle,
-                                                const cusparseMatDescr_t descrA,
-                                                const cusparseHybMat_t hybA,
-                                                cuComplex *A, int lda) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, const cusparseMatDescr_t, const cusparseHybMat_t,
-      cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseChyb2dense");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, descrA, hybA, A, lda);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZhyb2dense(cusparseHandle_t handle,
-                                                const cusparseMatDescr_t descrA,
-                                                const cusparseHybMat_t hybA,
-                                                cuDoubleComplex *A, int lda) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, const cusparseMatDescr_t, const cusparseHybMat_t,
-      cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZhyb2dense");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, descrA, hybA, A, lda);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsr2hyb(
-    cusparseHandle_t handle, int m, int n, const cusparseMatDescr_t descrA,
-    const float *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, cusparseHybMat_t hybA, int userEllWidth,
-    cusparseHybPartition_t partitionType) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, const float *,
-      const int *, const int *, cusparseHybMat_t, int, cusparseHybPartition_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsr2hyb");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, hybA, userEllWidth, partitionType);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsr2hyb(
-    cusparseHandle_t handle, int m, int n, const cusparseMatDescr_t descrA,
-    const double *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, cusparseHybMat_t hybA, int userEllWidth,
-    cusparseHybPartition_t partitionType) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, const double *,
-      const int *, const int *, cusparseHybMat_t, int, cusparseHybPartition_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsr2hyb");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, hybA, userEllWidth, partitionType);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsr2hyb(
-    cusparseHandle_t handle, int m, int n, const cusparseMatDescr_t descrA,
-    const cuComplex *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, cusparseHybMat_t hybA, int userEllWidth,
-    cusparseHybPartition_t partitionType) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, const cuComplex *,
-      const int *, const int *, cusparseHybMat_t, int, cusparseHybPartition_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsr2hyb");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, hybA, userEllWidth, partitionType);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsr2hyb(
-    cusparseHandle_t handle, int m, int n, const cusparseMatDescr_t descrA,
-    const cuDoubleComplex *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, cusparseHybMat_t hybA, int userEllWidth,
-    cusparseHybPartition_t partitionType) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t,
-      const cuDoubleComplex *, const int *, const int *, cusparseHybMat_t, int,
-      cusparseHybPartition_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsr2hyb");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, hybA, userEllWidth, partitionType);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseShyb2csr(cusparseHandle_t handle,
-                                              const cusparseMatDescr_t descrA,
-                                              const cusparseHybMat_t hybA,
-                                              float *csrSortedValA,
-                                              int *csrSortedRowPtrA,
-                                              int *csrSortedColIndA) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, const cusparseMatDescr_t, const cusparseHybMat_t,
-      float *, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseShyb2csr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, descrA, hybA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDhyb2csr(cusparseHandle_t handle,
-                                              const cusparseMatDescr_t descrA,
-                                              const cusparseHybMat_t hybA,
-                                              double *csrSortedValA,
-                                              int *csrSortedRowPtrA,
-                                              int *csrSortedColIndA) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, const cusparseMatDescr_t, const cusparseHybMat_t,
-      double *, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDhyb2csr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, descrA, hybA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseChyb2csr(cusparseHandle_t handle,
-                                              const cusparseMatDescr_t descrA,
-                                              const cusparseHybMat_t hybA,
-                                              cuComplex *csrSortedValA,
-                                              int *csrSortedRowPtrA,
-                                              int *csrSortedColIndA) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, const cusparseMatDescr_t, const cusparseHybMat_t,
-      cuComplex *, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseChyb2csr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, descrA, hybA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZhyb2csr(cusparseHandle_t handle,
-                                              const cusparseMatDescr_t descrA,
-                                              const cusparseHybMat_t hybA,
-                                              cuDoubleComplex *csrSortedValA,
-                                              int *csrSortedRowPtrA,
-                                              int *csrSortedColIndA) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, const cusparseMatDescr_t, const cusparseHybMat_t,
-      cuDoubleComplex *, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZhyb2csr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, descrA, hybA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsc2hyb(
-    cusparseHandle_t handle, int m, int n, const cusparseMatDescr_t descrA,
-    const float *cscSortedValA, const int *cscSortedRowIndA,
-    const int *cscSortedColPtrA, cusparseHybMat_t hybA, int userEllWidth,
-    cusparseHybPartition_t partitionType) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, const float *,
-      const int *, const int *, cusparseHybMat_t, int, cusparseHybPartition_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsc2hyb");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, descrA, cscSortedValA, cscSortedRowIndA,
-                  cscSortedColPtrA, hybA, userEllWidth, partitionType);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsc2hyb(
-    cusparseHandle_t handle, int m, int n, const cusparseMatDescr_t descrA,
-    const double *cscSortedValA, const int *cscSortedRowIndA,
-    const int *cscSortedColPtrA, cusparseHybMat_t hybA, int userEllWidth,
-    cusparseHybPartition_t partitionType) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, const double *,
-      const int *, const int *, cusparseHybMat_t, int, cusparseHybPartition_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsc2hyb");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, descrA, cscSortedValA, cscSortedRowIndA,
-                  cscSortedColPtrA, hybA, userEllWidth, partitionType);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsc2hyb(
-    cusparseHandle_t handle, int m, int n, const cusparseMatDescr_t descrA,
-    const cuComplex *cscSortedValA, const int *cscSortedRowIndA,
-    const int *cscSortedColPtrA, cusparseHybMat_t hybA, int userEllWidth,
-    cusparseHybPartition_t partitionType) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, const cuComplex *,
-      const int *, const int *, cusparseHybMat_t, int, cusparseHybPartition_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsc2hyb");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, descrA, cscSortedValA, cscSortedRowIndA,
-                  cscSortedColPtrA, hybA, userEllWidth, partitionType);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsc2hyb(
-    cusparseHandle_t handle, int m, int n, const cusparseMatDescr_t descrA,
-    const cuDoubleComplex *cscSortedValA, const int *cscSortedRowIndA,
-    const int *cscSortedColPtrA, cusparseHybMat_t hybA, int userEllWidth,
-    cusparseHybPartition_t partitionType) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t,
-      const cuDoubleComplex *, const int *, const int *, cusparseHybMat_t, int,
-      cusparseHybPartition_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsc2hyb");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, descrA, cscSortedValA, cscSortedRowIndA,
-                  cscSortedColPtrA, hybA, userEllWidth, partitionType);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseShyb2csc(cusparseHandle_t handle,
-                                              const cusparseMatDescr_t descrA,
-                                              const cusparseHybMat_t hybA,
-                                              float *cscSortedVal,
-                                              int *cscSortedRowInd,
-                                              int *cscSortedColPtr) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, const cusparseMatDescr_t, const cusparseHybMat_t,
-      float *, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseShyb2csc");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, descrA, hybA, cscSortedVal, cscSortedRowInd,
-                  cscSortedColPtr);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDhyb2csc(cusparseHandle_t handle,
-                                              const cusparseMatDescr_t descrA,
-                                              const cusparseHybMat_t hybA,
-                                              double *cscSortedVal,
-                                              int *cscSortedRowInd,
-                                              int *cscSortedColPtr) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, const cusparseMatDescr_t, const cusparseHybMat_t,
-      double *, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDhyb2csc");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, descrA, hybA, cscSortedVal, cscSortedRowInd,
-                  cscSortedColPtr);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseChyb2csc(cusparseHandle_t handle,
-                                              const cusparseMatDescr_t descrA,
-                                              const cusparseHybMat_t hybA,
-                                              cuComplex *cscSortedVal,
-                                              int *cscSortedRowInd,
-                                              int *cscSortedColPtr) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, const cusparseMatDescr_t, const cusparseHybMat_t,
-      cuComplex *, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseChyb2csc");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, descrA, hybA, cscSortedVal, cscSortedRowInd,
-                  cscSortedColPtr);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZhyb2csc(cusparseHandle_t handle,
-                                              const cusparseMatDescr_t descrA,
-                                              const cusparseHybMat_t hybA,
-                                              cuDoubleComplex *cscSortedVal,
-                                              int *cscSortedRowInd,
-                                              int *cscSortedColPtr) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, const cusparseMatDescr_t, const cusparseHybMat_t,
-      cuDoubleComplex *, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZhyb2csc");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, descrA, hybA, cscSortedVal, cscSortedRowInd,
-                  cscSortedColPtr);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseXcsr2bsrNnz(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int m, int n,
-    const cusparseMatDescr_t descrA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, int blockDim, const cusparseMatDescr_t descrC,
-    int *bsrSortedRowPtrC, int *nnzTotalDevHostPtr) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const int *, const int *, int, const cusparseMatDescr_t, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseXcsr2bsrNnz");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, m, n, descrA, csrSortedRowPtrA,
-                  csrSortedColIndA, blockDim, descrC, bsrSortedRowPtrC,
-                  nnzTotalDevHostPtr);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsr2bsr(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int m, int n,
-    const cusparseMatDescr_t descrA, const float *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA, int blockDim,
-    const cusparseMatDescr_t descrC, float *bsrSortedValC,
-    int *bsrSortedRowPtrC, int *bsrSortedColIndC) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const float *, const int *, const int *, int, const cusparseMatDescr_t,
-      float *, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsr2bsr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, m, n, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, blockDim, descrC, bsrSortedValC,
-                  bsrSortedRowPtrC, bsrSortedColIndC);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsr2bsr(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int m, int n,
-    const cusparseMatDescr_t descrA, const double *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA, int blockDim,
-    const cusparseMatDescr_t descrC, double *bsrSortedValC,
-    int *bsrSortedRowPtrC, int *bsrSortedColIndC) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const double *, const int *, const int *, int, const cusparseMatDescr_t,
-      double *, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsr2bsr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, m, n, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, blockDim, descrC, bsrSortedValC,
-                  bsrSortedRowPtrC, bsrSortedColIndC);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsr2bsr(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int m, int n,
-    const cusparseMatDescr_t descrA, const cuComplex *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA, int blockDim,
-    const cusparseMatDescr_t descrC, cuComplex *bsrSortedValC,
-    int *bsrSortedRowPtrC, int *bsrSortedColIndC) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const cuComplex *, const int *, const int *, int,
-      const cusparseMatDescr_t, cuComplex *, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsr2bsr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, m, n, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, blockDim, descrC, bsrSortedValC,
-                  bsrSortedRowPtrC, bsrSortedColIndC);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsr2bsr(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int m, int n,
-    const cusparseMatDescr_t descrA, const cuDoubleComplex *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA, int blockDim,
-    const cusparseMatDescr_t descrC, cuDoubleComplex *bsrSortedValC,
-    int *bsrSortedRowPtrC, int *bsrSortedColIndC) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const cuDoubleComplex *, const int *, const int *, int,
-      const cusparseMatDescr_t, cuDoubleComplex *, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsr2bsr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, m, n, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, blockDim, descrC, bsrSortedValC,
-                  bsrSortedRowPtrC, bsrSortedColIndC);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSbsr2csr(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nb,
-    const cusparseMatDescr_t descrA, const float *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int blockDim,
-    const cusparseMatDescr_t descrC, float *csrSortedValC,
-    int *csrSortedRowPtrC, int *csrSortedColIndC) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const float *, const int *, const int *, int, const cusparseMatDescr_t,
-      float *, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSbsr2csr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nb, descrA, bsrSortedValA, bsrSortedRowPtrA,
-                  bsrSortedColIndA, blockDim, descrC, csrSortedValC,
-                  csrSortedRowPtrC, csrSortedColIndC);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDbsr2csr(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nb,
-    const cusparseMatDescr_t descrA, const double *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int blockDim,
-    const cusparseMatDescr_t descrC, double *csrSortedValC,
-    int *csrSortedRowPtrC, int *csrSortedColIndC) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const double *, const int *, const int *, int, const cusparseMatDescr_t,
-      double *, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDbsr2csr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nb, descrA, bsrSortedValA, bsrSortedRowPtrA,
-                  bsrSortedColIndA, blockDim, descrC, csrSortedValC,
-                  csrSortedRowPtrC, csrSortedColIndC);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCbsr2csr(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nb,
-    const cusparseMatDescr_t descrA, const cuComplex *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int blockDim,
-    const cusparseMatDescr_t descrC, cuComplex *csrSortedValC,
-    int *csrSortedRowPtrC, int *csrSortedColIndC) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const cuComplex *, const int *, const int *, int,
-      const cusparseMatDescr_t, cuComplex *, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCbsr2csr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nb, descrA, bsrSortedValA, bsrSortedRowPtrA,
-                  bsrSortedColIndA, blockDim, descrC, csrSortedValC,
-                  csrSortedRowPtrC, csrSortedColIndC);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZbsr2csr(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nb,
-    const cusparseMatDescr_t descrA, const cuDoubleComplex *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int blockDim,
-    const cusparseMatDescr_t descrC, cuDoubleComplex *csrSortedValC,
-    int *csrSortedRowPtrC, int *csrSortedColIndC) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const cuDoubleComplex *, const int *, const int *, int,
-      const cusparseMatDescr_t, cuDoubleComplex *, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZbsr2csr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nb, descrA, bsrSortedValA, bsrSortedRowPtrA,
-                  bsrSortedColIndA, blockDim, descrC, csrSortedValC,
-                  csrSortedRowPtrC, csrSortedColIndC);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSgebsr2gebsc_bufferSize(
-    cusparseHandle_t handle, int mb, int nb, int nnzb,
-    const float *bsrSortedVal, const int *bsrSortedRowPtr,
-    const int *bsrSortedColInd, int rowBlockDim, int colBlockDim,
-    int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const float *, const int *, const int *,
-      int, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSgebsr2gebsc_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, mb, nb, nnzb, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, rowBlockDim, colBlockDim,
-                  pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDgebsr2gebsc_bufferSize(
-    cusparseHandle_t handle, int mb, int nb, int nnzb,
-    const double *bsrSortedVal, const int *bsrSortedRowPtr,
-    const int *bsrSortedColInd, int rowBlockDim, int colBlockDim,
-    int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const double *, const int *, const int *,
-      int, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDgebsr2gebsc_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, mb, nb, nnzb, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, rowBlockDim, colBlockDim,
-                  pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCgebsr2gebsc_bufferSize(
-    cusparseHandle_t handle, int mb, int nb, int nnzb,
-    const cuComplex *bsrSortedVal, const int *bsrSortedRowPtr,
-    const int *bsrSortedColInd, int rowBlockDim, int colBlockDim,
-    int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const cuComplex *, const int *,
-      const int *, int, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCgebsr2gebsc_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, mb, nb, nnzb, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, rowBlockDim, colBlockDim,
-                  pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZgebsr2gebsc_bufferSize(
-    cusparseHandle_t handle, int mb, int nb, int nnzb,
-    const cuDoubleComplex *bsrSortedVal, const int *bsrSortedRowPtr,
-    const int *bsrSortedColInd, int rowBlockDim, int colBlockDim,
-    int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const cuDoubleComplex *, const int *,
-      const int *, int, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZgebsr2gebsc_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, mb, nb, nnzb, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, rowBlockDim, colBlockDim,
-                  pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSgebsr2gebsc_bufferSizeExt(
-    cusparseHandle_t handle, int mb, int nb, int nnzb,
-    const float *bsrSortedVal, const int *bsrSortedRowPtr,
-    const int *bsrSortedColInd, int rowBlockDim, int colBlockDim,
-    size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const float *, const int *, const int *,
-      int, int, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseSgebsr2gebsc_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, mb, nb, nnzb, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, rowBlockDim, colBlockDim, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDgebsr2gebsc_bufferSizeExt(
-    cusparseHandle_t handle, int mb, int nb, int nnzb,
-    const double *bsrSortedVal, const int *bsrSortedRowPtr,
-    const int *bsrSortedColInd, int rowBlockDim, int colBlockDim,
-    size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const double *, const int *, const int *,
-      int, int, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseDgebsr2gebsc_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, mb, nb, nnzb, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, rowBlockDim, colBlockDim, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCgebsr2gebsc_bufferSizeExt(
-    cusparseHandle_t handle, int mb, int nb, int nnzb,
-    const cuComplex *bsrSortedVal, const int *bsrSortedRowPtr,
-    const int *bsrSortedColInd, int rowBlockDim, int colBlockDim,
-    size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const cuComplex *, const int *,
-      const int *, int, int, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseCgebsr2gebsc_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, mb, nb, nnzb, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, rowBlockDim, colBlockDim, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZgebsr2gebsc_bufferSizeExt(
-    cusparseHandle_t handle, int mb, int nb, int nnzb,
-    const cuDoubleComplex *bsrSortedVal, const int *bsrSortedRowPtr,
-    const int *bsrSortedColInd, int rowBlockDim, int colBlockDim,
-    size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const cuDoubleComplex *, const int *,
-      const int *, int, int, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseZgebsr2gebsc_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, mb, nb, nnzb, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, rowBlockDim, colBlockDim, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSgebsr2gebsc(
-    cusparseHandle_t handle, int mb, int nb, int nnzb,
-    const float *bsrSortedVal, const int *bsrSortedRowPtr,
-    const int *bsrSortedColInd, int rowBlockDim, int colBlockDim, float *bscVal,
-    int *bscRowInd, int *bscColPtr, cusparseAction_t copyValues,
-    cusparseIndexBase_t idxBase, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const float *, const int *, const int *,
-      int, int, float *, int *, int *, cusparseAction_t, cusparseIndexBase_t,
-      void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSgebsr2gebsc");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, mb, nb, nnzb, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, rowBlockDim, colBlockDim, bscVal, bscRowInd,
-                  bscColPtr, copyValues, idxBase, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDgebsr2gebsc(
-    cusparseHandle_t handle, int mb, int nb, int nnzb,
-    const double *bsrSortedVal, const int *bsrSortedRowPtr,
-    const int *bsrSortedColInd, int rowBlockDim, int colBlockDim,
-    double *bscVal, int *bscRowInd, int *bscColPtr, cusparseAction_t copyValues,
-    cusparseIndexBase_t idxBase, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const double *, const int *, const int *,
-      int, int, double *, int *, int *, cusparseAction_t, cusparseIndexBase_t,
-      void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDgebsr2gebsc");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, mb, nb, nnzb, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, rowBlockDim, colBlockDim, bscVal, bscRowInd,
-                  bscColPtr, copyValues, idxBase, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCgebsr2gebsc(
-    cusparseHandle_t handle, int mb, int nb, int nnzb,
-    const cuComplex *bsrSortedVal, const int *bsrSortedRowPtr,
-    const int *bsrSortedColInd, int rowBlockDim, int colBlockDim,
-    cuComplex *bscVal, int *bscRowInd, int *bscColPtr,
-    cusparseAction_t copyValues, cusparseIndexBase_t idxBase, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const cuComplex *, const int *,
-      const int *, int, int, cuComplex *, int *, int *, cusparseAction_t,
-      cusparseIndexBase_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCgebsr2gebsc");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, mb, nb, nnzb, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, rowBlockDim, colBlockDim, bscVal, bscRowInd,
-                  bscColPtr, copyValues, idxBase, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZgebsr2gebsc(
-    cusparseHandle_t handle, int mb, int nb, int nnzb,
-    const cuDoubleComplex *bsrSortedVal, const int *bsrSortedRowPtr,
-    const int *bsrSortedColInd, int rowBlockDim, int colBlockDim,
-    cuDoubleComplex *bscVal, int *bscRowInd, int *bscColPtr,
-    cusparseAction_t copyValues, cusparseIndexBase_t idxBase, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const cuDoubleComplex *, const int *,
-      const int *, int, int, cuDoubleComplex *, int *, int *, cusparseAction_t,
-      cusparseIndexBase_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZgebsr2gebsc");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, mb, nb, nnzb, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, rowBlockDim, colBlockDim, bscVal, bscRowInd,
-                  bscColPtr, copyValues, idxBase, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseXgebsr2csr(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nb,
-    const cusparseMatDescr_t descrA, const int *bsrSortedRowPtrA,
-    const int *bsrSortedColIndA, int rowBlockDim, int colBlockDim,
-    const cusparseMatDescr_t descrC, int *csrSortedRowPtrC,
-    int *csrSortedColIndC) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const int *, const int *, int, int, const cusparseMatDescr_t, int *,
-      int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseXgebsr2csr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nb, descrA, bsrSortedRowPtrA,
-                  bsrSortedColIndA, rowBlockDim, colBlockDim, descrC,
-                  csrSortedRowPtrC, csrSortedColIndC);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSgebsr2csr(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nb,
-    const cusparseMatDescr_t descrA, const float *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int rowBlockDim,
-    int colBlockDim, const cusparseMatDescr_t descrC, float *csrSortedValC,
-    int *csrSortedRowPtrC, int *csrSortedColIndC) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const float *, const int *, const int *, int, int,
-      const cusparseMatDescr_t, float *, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSgebsr2csr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nb, descrA, bsrSortedValA, bsrSortedRowPtrA,
-                  bsrSortedColIndA, rowBlockDim, colBlockDim, descrC,
-                  csrSortedValC, csrSortedRowPtrC, csrSortedColIndC);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDgebsr2csr(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nb,
-    const cusparseMatDescr_t descrA, const double *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int rowBlockDim,
-    int colBlockDim, const cusparseMatDescr_t descrC, double *csrSortedValC,
-    int *csrSortedRowPtrC, int *csrSortedColIndC) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const double *, const int *, const int *, int, int,
-      const cusparseMatDescr_t, double *, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDgebsr2csr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nb, descrA, bsrSortedValA, bsrSortedRowPtrA,
-                  bsrSortedColIndA, rowBlockDim, colBlockDim, descrC,
-                  csrSortedValC, csrSortedRowPtrC, csrSortedColIndC);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCgebsr2csr(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nb,
-    const cusparseMatDescr_t descrA, const cuComplex *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int rowBlockDim,
-    int colBlockDim, const cusparseMatDescr_t descrC, cuComplex *csrSortedValC,
-    int *csrSortedRowPtrC, int *csrSortedColIndC) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const cuComplex *, const int *, const int *, int, int,
-      const cusparseMatDescr_t, cuComplex *, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCgebsr2csr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nb, descrA, bsrSortedValA, bsrSortedRowPtrA,
-                  bsrSortedColIndA, rowBlockDim, colBlockDim, descrC,
-                  csrSortedValC, csrSortedRowPtrC, csrSortedColIndC);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZgebsr2csr(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nb,
-    const cusparseMatDescr_t descrA, const cuDoubleComplex *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int rowBlockDim,
-    int colBlockDim, const cusparseMatDescr_t descrC,
-    cuDoubleComplex *csrSortedValC, int *csrSortedRowPtrC,
-    int *csrSortedColIndC) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const cuDoubleComplex *, const int *, const int *, int, int,
-      const cusparseMatDescr_t, cuDoubleComplex *, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZgebsr2csr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nb, descrA, bsrSortedValA, bsrSortedRowPtrA,
-                  bsrSortedColIndA, rowBlockDim, colBlockDim, descrC,
-                  csrSortedValC, csrSortedRowPtrC, csrSortedColIndC);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsr2gebsr_bufferSize(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int m, int n,
-    const cusparseMatDescr_t descrA, const float *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA, int rowBlockDim,
-    int colBlockDim, int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const float *, const int *, const int *, int, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsr2gebsr_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, m, n, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, rowBlockDim, colBlockDim,
-                  pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsr2gebsr_bufferSize(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int m, int n,
-    const cusparseMatDescr_t descrA, const double *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA, int rowBlockDim,
-    int colBlockDim, int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const double *, const int *, const int *, int, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsr2gebsr_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, m, n, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, rowBlockDim, colBlockDim,
-                  pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsr2gebsr_bufferSize(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int m, int n,
-    const cusparseMatDescr_t descrA, const cuComplex *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA, int rowBlockDim,
-    int colBlockDim, int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const cuComplex *, const int *, const int *, int, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsr2gebsr_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, m, n, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, rowBlockDim, colBlockDim,
-                  pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsr2gebsr_bufferSize(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int m, int n,
-    const cusparseMatDescr_t descrA, const cuDoubleComplex *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA, int rowBlockDim,
-    int colBlockDim, int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const cuDoubleComplex *, const int *, const int *, int, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsr2gebsr_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, m, n, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, rowBlockDim, colBlockDim,
-                  pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsr2gebsr_bufferSizeExt(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int m, int n,
-    const cusparseMatDescr_t descrA, const float *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA, int rowBlockDim,
-    int colBlockDim, size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const float *, const int *, const int *, int, int, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseScsr2gebsr_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, m, n, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, rowBlockDim, colBlockDim, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsr2gebsr_bufferSizeExt(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int m, int n,
-    const cusparseMatDescr_t descrA, const double *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA, int rowBlockDim,
-    int colBlockDim, size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const double *, const int *, const int *, int, int, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseDcsr2gebsr_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, m, n, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, rowBlockDim, colBlockDim, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsr2gebsr_bufferSizeExt(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int m, int n,
-    const cusparseMatDescr_t descrA, const cuComplex *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA, int rowBlockDim,
-    int colBlockDim, size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const cuComplex *, const int *, const int *, int, int, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseCcsr2gebsr_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, m, n, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, rowBlockDim, colBlockDim, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsr2gebsr_bufferSizeExt(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int m, int n,
-    const cusparseMatDescr_t descrA, const cuDoubleComplex *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA, int rowBlockDim,
-    int colBlockDim, size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const cuDoubleComplex *, const int *, const int *, int, int, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseZcsr2gebsr_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, m, n, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, rowBlockDim, colBlockDim, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseXcsr2gebsrNnz(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int m, int n,
-    const cusparseMatDescr_t descrA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, const cusparseMatDescr_t descrC,
-    int *bsrSortedRowPtrC, int rowBlockDim, int colBlockDim,
-    int *nnzTotalDevHostPtr, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const int *, const int *, const cusparseMatDescr_t, int *, int, int,
-      int *, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseXcsr2gebsrNnz");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, m, n, descrA, csrSortedRowPtrA,
-                  csrSortedColIndA, descrC, bsrSortedRowPtrC, rowBlockDim,
-                  colBlockDim, nnzTotalDevHostPtr, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsr2gebsr(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int m, int n,
-    const cusparseMatDescr_t descrA, const float *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-    const cusparseMatDescr_t descrC, float *bsrSortedValC,
-    int *bsrSortedRowPtrC, int *bsrSortedColIndC, int rowBlockDim,
-    int colBlockDim, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const float *, const int *, const int *, const cusparseMatDescr_t,
-      float *, int *, int *, int, int, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsr2gebsr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, m, n, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, descrC, bsrSortedValC, bsrSortedRowPtrC,
-                  bsrSortedColIndC, rowBlockDim, colBlockDim, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsr2gebsr(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int m, int n,
-    const cusparseMatDescr_t descrA, const double *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-    const cusparseMatDescr_t descrC, double *bsrSortedValC,
-    int *bsrSortedRowPtrC, int *bsrSortedColIndC, int rowBlockDim,
-    int colBlockDim, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const double *, const int *, const int *, const cusparseMatDescr_t,
-      double *, int *, int *, int, int, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsr2gebsr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, m, n, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, descrC, bsrSortedValC, bsrSortedRowPtrC,
-                  bsrSortedColIndC, rowBlockDim, colBlockDim, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsr2gebsr(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int m, int n,
-    const cusparseMatDescr_t descrA, const cuComplex *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-    const cusparseMatDescr_t descrC, cuComplex *bsrSortedValC,
-    int *bsrSortedRowPtrC, int *bsrSortedColIndC, int rowBlockDim,
-    int colBlockDim, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const cuComplex *, const int *, const int *, const cusparseMatDescr_t,
-      cuComplex *, int *, int *, int, int, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsr2gebsr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, m, n, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, descrC, bsrSortedValC, bsrSortedRowPtrC,
-                  bsrSortedColIndC, rowBlockDim, colBlockDim, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsr2gebsr(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int m, int n,
-    const cusparseMatDescr_t descrA, const cuDoubleComplex *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-    const cusparseMatDescr_t descrC, cuDoubleComplex *bsrSortedValC,
-    int *bsrSortedRowPtrC, int *bsrSortedColIndC, int rowBlockDim,
-    int colBlockDim, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const cuDoubleComplex *, const int *, const int *,
-      const cusparseMatDescr_t, cuDoubleComplex *, int *, int *, int, int,
-      void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsr2gebsr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, m, n, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, descrC, bsrSortedValC, bsrSortedRowPtrC,
-                  bsrSortedColIndC, rowBlockDim, colBlockDim, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSgebsr2gebsr_bufferSize(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nb, int nnzb,
-    const cusparseMatDescr_t descrA, const float *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int rowBlockDimA,
-    int colBlockDimA, int rowBlockDimC, int colBlockDimC,
-    int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, int,
-      const cusparseMatDescr_t, const float *, const int *, const int *, int,
-      int, int, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSgebsr2gebsr_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nb, nnzb, descrA, bsrSortedValA,
-                  bsrSortedRowPtrA, bsrSortedColIndA, rowBlockDimA,
-                  colBlockDimA, rowBlockDimC, colBlockDimC, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDgebsr2gebsr_bufferSize(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nb, int nnzb,
-    const cusparseMatDescr_t descrA, const double *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int rowBlockDimA,
-    int colBlockDimA, int rowBlockDimC, int colBlockDimC,
-    int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, int,
-      const cusparseMatDescr_t, const double *, const int *, const int *, int,
-      int, int, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDgebsr2gebsr_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nb, nnzb, descrA, bsrSortedValA,
-                  bsrSortedRowPtrA, bsrSortedColIndA, rowBlockDimA,
-                  colBlockDimA, rowBlockDimC, colBlockDimC, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCgebsr2gebsr_bufferSize(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nb, int nnzb,
-    const cusparseMatDescr_t descrA, const cuComplex *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int rowBlockDimA,
-    int colBlockDimA, int rowBlockDimC, int colBlockDimC,
-    int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, int,
-      const cusparseMatDescr_t, const cuComplex *, const int *, const int *,
-      int, int, int, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCgebsr2gebsr_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nb, nnzb, descrA, bsrSortedValA,
-                  bsrSortedRowPtrA, bsrSortedColIndA, rowBlockDimA,
-                  colBlockDimA, rowBlockDimC, colBlockDimC, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZgebsr2gebsr_bufferSize(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nb, int nnzb,
-    const cusparseMatDescr_t descrA, const cuDoubleComplex *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int rowBlockDimA,
-    int colBlockDimA, int rowBlockDimC, int colBlockDimC,
-    int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, int,
-      const cusparseMatDescr_t, const cuDoubleComplex *, const int *,
-      const int *, int, int, int, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZgebsr2gebsr_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nb, nnzb, descrA, bsrSortedValA,
-                  bsrSortedRowPtrA, bsrSortedColIndA, rowBlockDimA,
-                  colBlockDimA, rowBlockDimC, colBlockDimC, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSgebsr2gebsr_bufferSizeExt(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nb, int nnzb,
-    const cusparseMatDescr_t descrA, const float *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int rowBlockDimA,
-    int colBlockDimA, int rowBlockDimC, int colBlockDimC, size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, int,
-      const cusparseMatDescr_t, const float *, const int *, const int *, int,
-      int, int, int, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseSgebsr2gebsr_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nb, nnzb, descrA, bsrSortedValA,
-                  bsrSortedRowPtrA, bsrSortedColIndA, rowBlockDimA,
-                  colBlockDimA, rowBlockDimC, colBlockDimC, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDgebsr2gebsr_bufferSizeExt(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nb, int nnzb,
-    const cusparseMatDescr_t descrA, const double *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int rowBlockDimA,
-    int colBlockDimA, int rowBlockDimC, int colBlockDimC, size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, int,
-      const cusparseMatDescr_t, const double *, const int *, const int *, int,
-      int, int, int, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseDgebsr2gebsr_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nb, nnzb, descrA, bsrSortedValA,
-                  bsrSortedRowPtrA, bsrSortedColIndA, rowBlockDimA,
-                  colBlockDimA, rowBlockDimC, colBlockDimC, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCgebsr2gebsr_bufferSizeExt(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nb, int nnzb,
-    const cusparseMatDescr_t descrA, const cuComplex *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int rowBlockDimA,
-    int colBlockDimA, int rowBlockDimC, int colBlockDimC, size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, int,
-      const cusparseMatDescr_t, const cuComplex *, const int *, const int *,
-      int, int, int, int, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseCgebsr2gebsr_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nb, nnzb, descrA, bsrSortedValA,
-                  bsrSortedRowPtrA, bsrSortedColIndA, rowBlockDimA,
-                  colBlockDimA, rowBlockDimC, colBlockDimC, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZgebsr2gebsr_bufferSizeExt(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nb, int nnzb,
-    const cusparseMatDescr_t descrA, const cuDoubleComplex *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int rowBlockDimA,
-    int colBlockDimA, int rowBlockDimC, int colBlockDimC, size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, int,
-      const cusparseMatDescr_t, const cuDoubleComplex *, const int *,
-      const int *, int, int, int, int, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseZgebsr2gebsr_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nb, nnzb, descrA, bsrSortedValA,
-                  bsrSortedRowPtrA, bsrSortedColIndA, rowBlockDimA,
-                  colBlockDimA, rowBlockDimC, colBlockDimC, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseXgebsr2gebsrNnz(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nb, int nnzb,
-    const cusparseMatDescr_t descrA, const int *bsrSortedRowPtrA,
-    const int *bsrSortedColIndA, int rowBlockDimA, int colBlockDimA,
-    const cusparseMatDescr_t descrC, int *bsrSortedRowPtrC, int rowBlockDimC,
-    int colBlockDimC, int *nnzTotalDevHostPtr, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, int,
-      const cusparseMatDescr_t, const int *, const int *, int, int,
-      const cusparseMatDescr_t, int *, int, int, int *, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseXgebsr2gebsrNnz");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nb, nnzb, descrA, bsrSortedRowPtrA,
-                  bsrSortedColIndA, rowBlockDimA, colBlockDimA, descrC,
-                  bsrSortedRowPtrC, rowBlockDimC, colBlockDimC,
-                  nnzTotalDevHostPtr, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSgebsr2gebsr(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nb, int nnzb,
-    const cusparseMatDescr_t descrA, const float *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int rowBlockDimA,
-    int colBlockDimA, const cusparseMatDescr_t descrC, float *bsrSortedValC,
-    int *bsrSortedRowPtrC, int *bsrSortedColIndC, int rowBlockDimC,
-    int colBlockDimC, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, int,
-      const cusparseMatDescr_t, const float *, const int *, const int *, int,
-      int, const cusparseMatDescr_t, float *, int *, int *, int, int, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSgebsr2gebsr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nb, nnzb, descrA, bsrSortedValA,
-                  bsrSortedRowPtrA, bsrSortedColIndA, rowBlockDimA,
-                  colBlockDimA, descrC, bsrSortedValC, bsrSortedRowPtrC,
-                  bsrSortedColIndC, rowBlockDimC, colBlockDimC, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDgebsr2gebsr(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nb, int nnzb,
-    const cusparseMatDescr_t descrA, const double *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int rowBlockDimA,
-    int colBlockDimA, const cusparseMatDescr_t descrC, double *bsrSortedValC,
-    int *bsrSortedRowPtrC, int *bsrSortedColIndC, int rowBlockDimC,
-    int colBlockDimC, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, int,
-      const cusparseMatDescr_t, const double *, const int *, const int *, int,
-      int, const cusparseMatDescr_t, double *, int *, int *, int, int, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDgebsr2gebsr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nb, nnzb, descrA, bsrSortedValA,
-                  bsrSortedRowPtrA, bsrSortedColIndA, rowBlockDimA,
-                  colBlockDimA, descrC, bsrSortedValC, bsrSortedRowPtrC,
-                  bsrSortedColIndC, rowBlockDimC, colBlockDimC, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCgebsr2gebsr(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nb, int nnzb,
-    const cusparseMatDescr_t descrA, const cuComplex *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int rowBlockDimA,
-    int colBlockDimA, const cusparseMatDescr_t descrC, cuComplex *bsrSortedValC,
-    int *bsrSortedRowPtrC, int *bsrSortedColIndC, int rowBlockDimC,
-    int colBlockDimC, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, int,
-      const cusparseMatDescr_t, const cuComplex *, const int *, const int *,
-      int, int, const cusparseMatDescr_t, cuComplex *, int *, int *, int, int,
-      void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCgebsr2gebsr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nb, nnzb, descrA, bsrSortedValA,
-                  bsrSortedRowPtrA, bsrSortedColIndA, rowBlockDimA,
-                  colBlockDimA, descrC, bsrSortedValC, bsrSortedRowPtrC,
-                  bsrSortedColIndC, rowBlockDimC, colBlockDimC, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZgebsr2gebsr(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nb, int nnzb,
-    const cusparseMatDescr_t descrA, const cuDoubleComplex *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int rowBlockDimA,
-    int colBlockDimA, const cusparseMatDescr_t descrC,
-    cuDoubleComplex *bsrSortedValC, int *bsrSortedRowPtrC,
-    int *bsrSortedColIndC, int rowBlockDimC, int colBlockDimC, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, int,
-      const cusparseMatDescr_t, const cuDoubleComplex *, const int *,
-      const int *, int, int, const cusparseMatDescr_t, cuDoubleComplex *, int *,
-      int *, int, int, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZgebsr2gebsr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nb, nnzb, descrA, bsrSortedValA,
-                  bsrSortedRowPtrA, bsrSortedColIndA, rowBlockDimA,
-                  colBlockDimA, descrC, bsrSortedValC, bsrSortedRowPtrC,
-                  bsrSortedColIndC, rowBlockDimC, colBlockDimC, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseCreateIdentityPermutation(cusparseHandle_t handle, int n, int *p) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(cusparseHandle_t, int, int *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseCreateIdentityPermutation");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, p);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseXcoosort_bufferSizeExt(
-    cusparseHandle_t handle, int m, int n, int nnz, const int *cooRowsA,
-    const int *cooColsA, size_t *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const int *, const int *, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseXcoosort_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnz, cooRowsA, cooColsA, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseXcoosortByRow(cusparseHandle_t handle,
-                                                   int m, int n, int nnz,
-                                                   int *cooRowsA, int *cooColsA,
-                                                   int *P, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, int *, int *, int *, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseXcoosortByRow");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnz, cooRowsA, cooColsA, P, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseXcoosortByColumn(cusparseHandle_t handle,
-                                                      int m, int n, int nnz,
-                                                      int *cooRowsA,
-                                                      int *cooColsA, int *P,
-                                                      void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, int *, int *, int *, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseXcoosortByColumn");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnz, cooRowsA, cooColsA, P, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseXcsrsort_bufferSizeExt(
-    cusparseHandle_t handle, int m, int n, int nnz, const int *csrRowPtrA,
-    const int *csrColIndA, size_t *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const int *, const int *, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseXcsrsort_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnz, csrRowPtrA, csrColIndA,
-                  pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseXcsrsort(cusparseHandle_t handle, int m,
-                                              int n, int nnz,
-                                              const cusparseMatDescr_t descrA,
-                                              const int *csrRowPtrA,
-                                              int *csrColIndA, int *P,
-                                              void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const cusparseMatDescr_t, const int *,
-      int *, int *, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseXcsrsort");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnz, descrA, csrRowPtrA, csrColIndA, P,
-                  pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseXcscsort_bufferSizeExt(
-    cusparseHandle_t handle, int m, int n, int nnz, const int *cscColPtrA,
-    const int *cscRowIndA, size_t *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const int *, const int *, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseXcscsort_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnz, cscColPtrA, cscRowIndA,
-                  pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseXcscsort(cusparseHandle_t handle, int m,
-                                              int n, int nnz,
-                                              const cusparseMatDescr_t descrA,
-                                              const int *cscColPtrA,
-                                              int *cscRowIndA, int *P,
-                                              void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const cusparseMatDescr_t, const int *,
-      int *, int *, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseXcscsort");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnz, descrA, cscColPtrA, cscRowIndA, P,
-                  pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsru2csr_bufferSizeExt(
-    cusparseHandle_t handle, int m, int n, int nnz, float *csrVal,
-    const int *csrRowPtr, int *csrColInd, csru2csrInfo_t info,
-    size_t *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, float *, const int *, int *,
-      csru2csrInfo_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsru2csr_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnz, csrVal, csrRowPtr, csrColInd, info,
-                  pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsru2csr_bufferSizeExt(
-    cusparseHandle_t handle, int m, int n, int nnz, double *csrVal,
-    const int *csrRowPtr, int *csrColInd, csru2csrInfo_t info,
-    size_t *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, double *, const int *, int *,
-      csru2csrInfo_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsru2csr_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnz, csrVal, csrRowPtr, csrColInd, info,
-                  pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsru2csr_bufferSizeExt(
-    cusparseHandle_t handle, int m, int n, int nnz, cuComplex *csrVal,
-    const int *csrRowPtr, int *csrColInd, csru2csrInfo_t info,
-    size_t *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, cuComplex *, const int *, int *,
-      csru2csrInfo_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsru2csr_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnz, csrVal, csrRowPtr, csrColInd, info,
-                  pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsru2csr_bufferSizeExt(
-    cusparseHandle_t handle, int m, int n, int nnz, cuDoubleComplex *csrVal,
-    const int *csrRowPtr, int *csrColInd, csru2csrInfo_t info,
-    size_t *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, cuDoubleComplex *, const int *, int *,
-      csru2csrInfo_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsru2csr_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnz, csrVal, csrRowPtr, csrColInd, info,
-                  pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsru2csr(
-    cusparseHandle_t handle, int m, int n, int nnz,
-    const cusparseMatDescr_t descrA, float *csrVal, const int *csrRowPtr,
-    int *csrColInd, csru2csrInfo_t info, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const cusparseMatDescr_t, float *,
-      const int *, int *, csru2csrInfo_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsru2csr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnz, descrA, csrVal, csrRowPtr, csrColInd, info,
-                  pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsru2csr(
-    cusparseHandle_t handle, int m, int n, int nnz,
-    const cusparseMatDescr_t descrA, double *csrVal, const int *csrRowPtr,
-    int *csrColInd, csru2csrInfo_t info, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const cusparseMatDescr_t, double *,
-      const int *, int *, csru2csrInfo_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsru2csr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnz, descrA, csrVal, csrRowPtr, csrColInd, info,
-                  pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsru2csr(
-    cusparseHandle_t handle, int m, int n, int nnz,
-    const cusparseMatDescr_t descrA, cuComplex *csrVal, const int *csrRowPtr,
-    int *csrColInd, csru2csrInfo_t info, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const cusparseMatDescr_t, cuComplex *,
-      const int *, int *, csru2csrInfo_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsru2csr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnz, descrA, csrVal, csrRowPtr, csrColInd, info,
-                  pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsru2csr(
-    cusparseHandle_t handle, int m, int n, int nnz,
-    const cusparseMatDescr_t descrA, cuDoubleComplex *csrVal,
-    const int *csrRowPtr, int *csrColInd, csru2csrInfo_t info, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const cusparseMatDescr_t,
-      cuDoubleComplex *, const int *, int *, csru2csrInfo_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsru2csr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnz, descrA, csrVal, csrRowPtr, csrColInd, info,
-                  pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsr2csru(
-    cusparseHandle_t handle, int m, int n, int nnz,
-    const cusparseMatDescr_t descrA, float *csrVal, const int *csrRowPtr,
-    int *csrColInd, csru2csrInfo_t info, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const cusparseMatDescr_t, float *,
-      const int *, int *, csru2csrInfo_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsr2csru");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnz, descrA, csrVal, csrRowPtr, csrColInd, info,
-                  pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsr2csru(
-    cusparseHandle_t handle, int m, int n, int nnz,
-    const cusparseMatDescr_t descrA, double *csrVal, const int *csrRowPtr,
-    int *csrColInd, csru2csrInfo_t info, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const cusparseMatDescr_t, double *,
-      const int *, int *, csru2csrInfo_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsr2csru");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnz, descrA, csrVal, csrRowPtr, csrColInd, info,
-                  pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsr2csru(
-    cusparseHandle_t handle, int m, int n, int nnz,
-    const cusparseMatDescr_t descrA, cuComplex *csrVal, const int *csrRowPtr,
-    int *csrColInd, csru2csrInfo_t info, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const cusparseMatDescr_t, cuComplex *,
-      const int *, int *, csru2csrInfo_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsr2csru");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnz, descrA, csrVal, csrRowPtr, csrColInd, info,
-                  pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsr2csru(
-    cusparseHandle_t handle, int m, int n, int nnz,
-    const cusparseMatDescr_t descrA, cuDoubleComplex *csrVal,
-    const int *csrRowPtr, int *csrColInd, csru2csrInfo_t info, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const cusparseMatDescr_t,
-      cuDoubleComplex *, const int *, int *, csru2csrInfo_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsr2csru");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnz, descrA, csrVal, csrRowPtr, csrColInd, info,
-                  pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSpruneDense2csr_bufferSizeExt(
-    cusparseHandle_t handle, int m, int n, const float *A, int lda,
-    const float *threshold, const cusparseMatDescr_t descrC,
-    const float *csrSortedValC, const int *csrSortedRowPtrC,
-    const int *csrSortedColIndC, size_t *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const float *, int, const float *,
-      const cusparseMatDescr_t, const float *, const int *, const int *,
-      size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseSpruneDense2csr_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, A, lda, threshold, descrC, csrSortedValC,
-                  csrSortedRowPtrC, csrSortedColIndC, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDpruneDense2csr_bufferSizeExt(
-    cusparseHandle_t handle, int m, int n, const double *A, int lda,
-    const double *threshold, const cusparseMatDescr_t descrC,
-    const double *csrSortedValC, const int *csrSortedRowPtrC,
-    const int *csrSortedColIndC, size_t *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const double *, int, const double *,
-      const cusparseMatDescr_t, const double *, const int *, const int *,
-      size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseDpruneDense2csr_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, A, lda, threshold, descrC, csrSortedValC,
-                  csrSortedRowPtrC, csrSortedColIndC, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSpruneDense2csrNnz(
-    cusparseHandle_t handle, int m, int n, const float *A, int lda,
-    const float *threshold, const cusparseMatDescr_t descrC, int *csrRowPtrC,
-    int *nnzTotalDevHostPtr, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const float *, int, const float *,
-      const cusparseMatDescr_t, int *, int *, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSpruneDense2csrNnz");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, A, lda, threshold, descrC, csrRowPtrC,
-                  nnzTotalDevHostPtr, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDpruneDense2csrNnz(
-    cusparseHandle_t handle, int m, int n, const double *A, int lda,
-    const double *threshold, const cusparseMatDescr_t descrC,
-    int *csrSortedRowPtrC, int *nnzTotalDevHostPtr, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const double *, int, const double *,
-      const cusparseMatDescr_t, int *, int *, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDpruneDense2csrNnz");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, A, lda, threshold, descrC, csrSortedRowPtrC,
-                  nnzTotalDevHostPtr, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSpruneDense2csr(
-    cusparseHandle_t handle, int m, int n, const float *A, int lda,
-    const float *threshold, const cusparseMatDescr_t descrC,
-    float *csrSortedValC, const int *csrSortedRowPtrC, int *csrSortedColIndC,
-    void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const float *, int, const float *,
-      const cusparseMatDescr_t, float *, const int *, int *, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSpruneDense2csr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, A, lda, threshold, descrC, csrSortedValC,
-                  csrSortedRowPtrC, csrSortedColIndC, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDpruneDense2csr(
-    cusparseHandle_t handle, int m, int n, const double *A, int lda,
-    const double *threshold, const cusparseMatDescr_t descrC,
-    double *csrSortedValC, const int *csrSortedRowPtrC, int *csrSortedColIndC,
-    void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const double *, int, const double *,
-      const cusparseMatDescr_t, double *, const int *, int *, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDpruneDense2csr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, A, lda, threshold, descrC, csrSortedValC,
-                  csrSortedRowPtrC, csrSortedColIndC, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSpruneCsr2csr_bufferSizeExt(
-    cusparseHandle_t handle, int m, int n, int nnzA,
-    const cusparseMatDescr_t descrA, const float *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-    const float *threshold, const cusparseMatDescr_t descrC,
-    const float *csrSortedValC, const int *csrSortedRowPtrC,
-    const int *csrSortedColIndC, size_t *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const cusparseMatDescr_t, const float *,
-      const int *, const int *, const float *, const cusparseMatDescr_t,
-      const float *, const int *, const int *, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseSpruneCsr2csr_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnzA, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, threshold, descrC, csrSortedValC,
-                  csrSortedRowPtrC, csrSortedColIndC, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDpruneCsr2csr_bufferSizeExt(
-    cusparseHandle_t handle, int m, int n, int nnzA,
-    const cusparseMatDescr_t descrA, const double *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-    const double *threshold, const cusparseMatDescr_t descrC,
-    const double *csrSortedValC, const int *csrSortedRowPtrC,
-    const int *csrSortedColIndC, size_t *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const cusparseMatDescr_t, const double *,
-      const int *, const int *, const double *, const cusparseMatDescr_t,
-      const double *, const int *, const int *, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseDpruneCsr2csr_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnzA, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, threshold, descrC, csrSortedValC,
-                  csrSortedRowPtrC, csrSortedColIndC, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSpruneCsr2csrNnz(
-    cusparseHandle_t handle, int m, int n, int nnzA,
-    const cusparseMatDescr_t descrA, const float *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-    const float *threshold, const cusparseMatDescr_t descrC,
-    int *csrSortedRowPtrC, int *nnzTotalDevHostPtr, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const cusparseMatDescr_t, const float *,
-      const int *, const int *, const float *, const cusparseMatDescr_t, int *,
-      int *, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSpruneCsr2csrNnz");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnzA, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, threshold, descrC, csrSortedRowPtrC,
-                  nnzTotalDevHostPtr, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDpruneCsr2csrNnz(
-    cusparseHandle_t handle, int m, int n, int nnzA,
-    const cusparseMatDescr_t descrA, const double *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-    const double *threshold, const cusparseMatDescr_t descrC,
-    int *csrSortedRowPtrC, int *nnzTotalDevHostPtr, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const cusparseMatDescr_t, const double *,
-      const int *, const int *, const double *, const cusparseMatDescr_t, int *,
-      int *, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDpruneCsr2csrNnz");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnzA, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, threshold, descrC, csrSortedRowPtrC,
-                  nnzTotalDevHostPtr, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSpruneCsr2csr(
-    cusparseHandle_t handle, int m, int n, int nnzA,
-    const cusparseMatDescr_t descrA, const float *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-    const float *threshold, const cusparseMatDescr_t descrC,
-    float *csrSortedValC, const int *csrSortedRowPtrC, int *csrSortedColIndC,
-    void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const cusparseMatDescr_t, const float *,
-      const int *, const int *, const float *, const cusparseMatDescr_t,
-      float *, const int *, int *, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSpruneCsr2csr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnzA, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, threshold, descrC, csrSortedValC,
-                  csrSortedRowPtrC, csrSortedColIndC, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDpruneCsr2csr(
-    cusparseHandle_t handle, int m, int n, int nnzA,
-    const cusparseMatDescr_t descrA, const double *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-    const double *threshold, const cusparseMatDescr_t descrC,
-    double *csrSortedValC, const int *csrSortedRowPtrC, int *csrSortedColIndC,
-    void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const cusparseMatDescr_t, const double *,
-      const int *, const int *, const double *, const cusparseMatDescr_t,
-      double *, const int *, int *, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDpruneCsr2csr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnzA, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, threshold, descrC, csrSortedValC,
-                  csrSortedRowPtrC, csrSortedColIndC, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSpruneDense2csrByPercentage_bufferSizeExt(
-    cusparseHandle_t handle, int m, int n, const float *A, int lda,
-    float percentage, const cusparseMatDescr_t descrC,
-    const float *csrSortedValC, const int *csrSortedRowPtrC,
-    const int *csrSortedColIndC, pruneInfo_t info, size_t *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const float *, int, float,
-      const cusparseMatDescr_t, const float *, const int *, const int *,
-      pruneInfo_t, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseSpruneDense2csrByPercentage_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, A, lda, percentage, descrC, csrSortedValC,
-                  csrSortedRowPtrC, csrSortedColIndC, info, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDpruneDense2csrByPercentage_bufferSizeExt(
-    cusparseHandle_t handle, int m, int n, const double *A, int lda,
-    float percentage, const cusparseMatDescr_t descrC,
-    const double *csrSortedValC, const int *csrSortedRowPtrC,
-    const int *csrSortedColIndC, pruneInfo_t info, size_t *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const double *, int, float,
-      const cusparseMatDescr_t, const double *, const int *, const int *,
-      pruneInfo_t, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseDpruneDense2csrByPercentage_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, A, lda, percentage, descrC, csrSortedValC,
-                  csrSortedRowPtrC, csrSortedColIndC, info, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSpruneDense2csrNnzByPercentage(
-    cusparseHandle_t handle, int m, int n, const float *A, int lda,
-    float percentage, const cusparseMatDescr_t descrC, int *csrRowPtrC,
-    int *nnzTotalDevHostPtr, pruneInfo_t info, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const float *, int, float,
-      const cusparseMatDescr_t, int *, int *, pruneInfo_t, void *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseSpruneDense2csrNnzByPercentage");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, A, lda, percentage, descrC, csrRowPtrC,
-                  nnzTotalDevHostPtr, info, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDpruneDense2csrNnzByPercentage(
-    cusparseHandle_t handle, int m, int n, const double *A, int lda,
-    float percentage, const cusparseMatDescr_t descrC, int *csrRowPtrC,
-    int *nnzTotalDevHostPtr, pruneInfo_t info, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const double *, int, float,
-      const cusparseMatDescr_t, int *, int *, pruneInfo_t, void *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseDpruneDense2csrNnzByPercentage");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, A, lda, percentage, descrC, csrRowPtrC,
-                  nnzTotalDevHostPtr, info, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSpruneDense2csrByPercentage(
-    cusparseHandle_t handle, int m, int n, const float *A, int lda,
-    float percentage, const cusparseMatDescr_t descrC, float *csrSortedValC,
-    const int *csrSortedRowPtrC, int *csrSortedColIndC, pruneInfo_t info,
-    void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const float *, int, float,
-      const cusparseMatDescr_t, float *, const int *, int *, pruneInfo_t,
-      void *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseSpruneDense2csrByPercentage");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, A, lda, percentage, descrC, csrSortedValC,
-                  csrSortedRowPtrC, csrSortedColIndC, info, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDpruneDense2csrByPercentage(
-    cusparseHandle_t handle, int m, int n, const double *A, int lda,
-    float percentage, const cusparseMatDescr_t descrC, double *csrSortedValC,
-    const int *csrSortedRowPtrC, int *csrSortedColIndC, pruneInfo_t info,
-    void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const double *, int, float,
-      const cusparseMatDescr_t, double *, const int *, int *, pruneInfo_t,
-      void *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseDpruneDense2csrByPercentage");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, A, lda, percentage, descrC, csrSortedValC,
-                  csrSortedRowPtrC, csrSortedColIndC, info, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSpruneCsr2csrByPercentage_bufferSizeExt(
-    cusparseHandle_t handle, int m, int n, int nnzA,
-    const cusparseMatDescr_t descrA, const float *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA, float percentage,
-    const cusparseMatDescr_t descrC, const float *csrSortedValC,
-    const int *csrSortedRowPtrC, const int *csrSortedColIndC, pruneInfo_t info,
-    size_t *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const cusparseMatDescr_t, const float *,
-      const int *, const int *, float, const cusparseMatDescr_t, const float *,
-      const int *, const int *, pruneInfo_t, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseSpruneCsr2csrByPercentage_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnzA, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, percentage, descrC, csrSortedValC,
-                  csrSortedRowPtrC, csrSortedColIndC, info, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDpruneCsr2csrByPercentage_bufferSizeExt(
-    cusparseHandle_t handle, int m, int n, int nnzA,
-    const cusparseMatDescr_t descrA, const double *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA, float percentage,
-    const cusparseMatDescr_t descrC, const double *csrSortedValC,
-    const int *csrSortedRowPtrC, const int *csrSortedColIndC, pruneInfo_t info,
-    size_t *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const cusparseMatDescr_t, const double *,
-      const int *, const int *, float, const cusparseMatDescr_t, const double *,
-      const int *, const int *, pruneInfo_t, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseDpruneCsr2csrByPercentage_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnzA, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, percentage, descrC, csrSortedValC,
-                  csrSortedRowPtrC, csrSortedColIndC, info, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSpruneCsr2csrNnzByPercentage(
-    cusparseHandle_t handle, int m, int n, int nnzA,
-    const cusparseMatDescr_t descrA, const float *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA, float percentage,
-    const cusparseMatDescr_t descrC, int *csrSortedRowPtrC,
-    int *nnzTotalDevHostPtr, pruneInfo_t info, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const cusparseMatDescr_t, const float *,
-      const int *, const int *, float, const cusparseMatDescr_t, int *, int *,
-      pruneInfo_t, void *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseSpruneCsr2csrNnzByPercentage");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnzA, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, percentage, descrC, csrSortedRowPtrC,
-                  nnzTotalDevHostPtr, info, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDpruneCsr2csrNnzByPercentage(
-    cusparseHandle_t handle, int m, int n, int nnzA,
-    const cusparseMatDescr_t descrA, const double *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA, float percentage,
-    const cusparseMatDescr_t descrC, int *csrSortedRowPtrC,
-    int *nnzTotalDevHostPtr, pruneInfo_t info, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const cusparseMatDescr_t, const double *,
-      const int *, const int *, float, const cusparseMatDescr_t, int *, int *,
-      pruneInfo_t, void *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseDpruneCsr2csrNnzByPercentage");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnzA, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, percentage, descrC, csrSortedRowPtrC,
-                  nnzTotalDevHostPtr, info, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSpruneCsr2csrByPercentage(
-    cusparseHandle_t handle, int m, int n, int nnzA,
-    const cusparseMatDescr_t descrA, const float *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA, float percentage,
-    const cusparseMatDescr_t descrC, float *csrSortedValC,
-    const int *csrSortedRowPtrC, int *csrSortedColIndC, pruneInfo_t info,
-    void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const cusparseMatDescr_t, const float *,
-      const int *, const int *, float, const cusparseMatDescr_t, float *,
-      const int *, int *, pruneInfo_t, void *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseSpruneCsr2csrByPercentage");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnzA, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, percentage, descrC, csrSortedValC,
-                  csrSortedRowPtrC, csrSortedColIndC, info, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDpruneCsr2csrByPercentage(
-    cusparseHandle_t handle, int m, int n, int nnzA,
-    const cusparseMatDescr_t descrA, const double *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA, float percentage,
-    const cusparseMatDescr_t descrC, double *csrSortedValC,
-    const int *csrSortedRowPtrC, int *csrSortedColIndC, pruneInfo_t info,
-    void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const cusparseMatDescr_t, const double *,
-      const int *, const int *, float, const cusparseMatDescr_t, double *,
-      const int *, int *, pruneInfo_t, void *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseDpruneCsr2csrByPercentage");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnzA, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, percentage, descrC, csrSortedValC,
-                  csrSortedRowPtrC, csrSortedColIndC, info, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCsr2cscEx2(
-    cusparseHandle_t handle, int m, int n, int nnz, const void *csrVal,
-    const int *csrRowPtr, const int *csrColInd, void *cscVal, int *cscColPtr,
-    int *cscRowInd, cudaDataType valType, cusparseAction_t copyValues,
-    cusparseIndexBase_t idxBase, cusparseCsr2CscAlg_t alg, void *buffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const void *, const int *, const int *,
-      void *, int *, int *, cudaDataType, cusparseAction_t, cusparseIndexBase_t,
-      cusparseCsr2CscAlg_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCsr2cscEx2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnz, csrVal, csrRowPtr, csrColInd, cscVal,
-                  cscColPtr, cscRowInd, valType, copyValues, idxBase, alg,
-                  buffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCsr2cscEx2_bufferSize(
-    cusparseHandle_t handle, int m, int n, int nnz, const void *csrVal,
-    const int *csrRowPtr, const int *csrColInd, void *cscVal, int *cscColPtr,
-    int *cscRowInd, cudaDataType valType, cusparseAction_t copyValues,
-    cusparseIndexBase_t idxBase, cusparseCsr2CscAlg_t alg, size_t *bufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const void *, const int *, const int *,
-      void *, int *, int *, cudaDataType, cusparseAction_t, cusparseIndexBase_t,
-      cusparseCsr2CscAlg_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCsr2cscEx2_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnz, csrVal, csrRowPtr, csrColInd, cscVal,
-                  cscColPtr, cscRowInd, valType, copyValues, idxBase, alg,
-                  bufferSize);
-}
-
-#if !defined(_WIN32)
-
-cusparseStatus_t CUSPARSEAPI
-cusparseCreateSpVec(cusparseSpVecDescr_t *spVecDescr, int64_t size, int64_t nnz,
-                    void *indices, void *values, cusparseIndexType_t idxType,
-                    cusparseIndexBase_t idxBase, cudaDataType valueType) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseSpVecDescr_t *, int64_t, int64_t, void *, void *,
-      cusparseIndexType_t, cusparseIndexBase_t, cudaDataType);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCreateSpVec");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(spVecDescr, size, nnz, indices, values, idxType, idxBase,
-                  valueType);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseDestroySpVec(cusparseSpVecDescr_t spVecDescr) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(cusparseSpVecDescr_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDestroySpVec");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(spVecDescr);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSpVecGet(
-    const cusparseSpVecDescr_t spVecDescr, int64_t *size, int64_t *nnz,
-    void **indices, void **values, cusparseIndexType_t *idxType,
-    cusparseIndexBase_t *idxBase, cudaDataType *valueType) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      const cusparseSpVecDescr_t, int64_t *, int64_t *, void **, void **,
-      cusparseIndexType_t *, cusparseIndexBase_t *, cudaDataType *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSpVecGet");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(spVecDescr, size, nnz, indices, values, idxType, idxBase,
-                  valueType);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSpVecGetIndexBase(
-    const cusparseSpVecDescr_t spVecDescr, cusparseIndexBase_t *idxBase) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(const cusparseSpVecDescr_t,
-                                                  cusparseIndexBase_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSpVecGetIndexBase");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(spVecDescr, idxBase);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseSpVecGetValues(const cusparseSpVecDescr_t spVecDescr, void **values) {
-  using FuncPtr =
-      cusparseStatus_t(CUSPARSEAPI *)(const cusparseSpVecDescr_t, void **);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSpVecGetValues");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(spVecDescr, values);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseSpVecSetValues(cusparseSpVecDescr_t spVecDescr, void *values) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(cusparseSpVecDescr_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSpVecSetValues");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(spVecDescr, values);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseCreateDnVec(cusparseDnVecDescr_t *dnVecDescr, int64_t size,
-                    void *values, cudaDataType valueType) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseDnVecDescr_t *, int64_t, void *, cudaDataType);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCreateDnVec");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dnVecDescr, size, values, valueType);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseDestroyDnVec(cusparseDnVecDescr_t dnVecDescr) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(cusparseDnVecDescr_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDestroyDnVec");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dnVecDescr);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseDnVecGet(const cusparseDnVecDescr_t dnVecDescr, int64_t *size,
-                 void **values, cudaDataType *valueType) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      const cusparseDnVecDescr_t, int64_t *, void **, cudaDataType *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDnVecGet");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dnVecDescr, size, values, valueType);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseDnVecGetValues(const cusparseDnVecDescr_t dnVecDescr, void **values) {
-  using FuncPtr =
-      cusparseStatus_t(CUSPARSEAPI *)(const cusparseDnVecDescr_t, void **);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDnVecGetValues");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dnVecDescr, values);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseDnVecSetValues(cusparseDnVecDescr_t dnVecDescr, void *values) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(cusparseDnVecDescr_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDnVecSetValues");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dnVecDescr, values);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCreateCoo(cusparseSpMatDescr_t *spMatDescr,
-                                               int64_t rows, int64_t cols,
-                                               int64_t nnz, void *cooRowInd,
-                                               void *cooColInd, void *cooValues,
-                                               cusparseIndexType_t cooIdxType,
-                                               cusparseIndexBase_t idxBase,
-                                               cudaDataType valueType) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseSpMatDescr_t *, int64_t, int64_t, int64_t, void *, void *, void *,
-      cusparseIndexType_t, cusparseIndexBase_t, cudaDataType);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCreateCoo");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(spMatDescr, rows, cols, nnz, cooRowInd, cooColInd, cooValues,
-                  cooIdxType, idxBase, valueType);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCreateCsr(
-    cusparseSpMatDescr_t *spMatDescr, int64_t rows, int64_t cols, int64_t nnz,
-    void *csrRowOffsets, void *csrColInd, void *csrValues,
-    cusparseIndexType_t csrRowOffsetsType, cusparseIndexType_t csrColIndType,
-    cusparseIndexBase_t idxBase, cudaDataType valueType) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseSpMatDescr_t *, int64_t, int64_t, int64_t, void *, void *, void *,
-      cusparseIndexType_t, cusparseIndexType_t, cusparseIndexBase_t,
-      cudaDataType);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCreateCsr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(spMatDescr, rows, cols, nnz, csrRowOffsets, csrColInd,
-                  csrValues, csrRowOffsetsType, csrColIndType, idxBase,
-                  valueType);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCreateCooAoS(
-    cusparseSpMatDescr_t *spMatDescr, int64_t rows, int64_t cols, int64_t nnz,
-    void *cooInd, void *cooValues, cusparseIndexType_t cooIdxType,
-    cusparseIndexBase_t idxBase, cudaDataType valueType) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseSpMatDescr_t *, int64_t, int64_t, int64_t, void *, void *,
-      cusparseIndexType_t, cusparseIndexBase_t, cudaDataType);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCreateCooAoS");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(spMatDescr, rows, cols, nnz, cooInd, cooValues, cooIdxType,
-                  idxBase, valueType);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseDestroySpMat(cusparseSpMatDescr_t spMatDescr) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(cusparseSpMatDescr_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDestroySpMat");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(spMatDescr);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseCooGet(const cusparseSpMatDescr_t spMatDescr, int64_t *rows,
-               int64_t *cols, int64_t *nnz,
-               void **cooRowInd,  // COO row indices
-               void **cooColInd,  // COO column indices
-               void **cooValues,  // COO values
-               cusparseIndexType_t *idxType, cusparseIndexBase_t *idxBase,
-               cudaDataType *valueType) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      const cusparseSpMatDescr_t, int64_t *, int64_t *, int64_t *, void **,
-      void **, void **, cusparseIndexType_t *, cusparseIndexBase_t *,
-      cudaDataType *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCooGet");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(spMatDescr, rows, cols, nnz, cooRowInd, cooColInd, cooValues,
-                  idxType, idxBase, valueType);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseCooAoSGet(const cusparseSpMatDescr_t spMatDescr, int64_t *rows,
-                  int64_t *cols, int64_t *nnz,
-                  void **cooInd,     // COO indices
-                  void **cooValues,  // COO values
-                  cusparseIndexType_t *idxType, cusparseIndexBase_t *idxBase,
-                  cudaDataType *valueType) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      const cusparseSpMatDescr_t, int64_t *, int64_t *, int64_t *, void **,
-      void **, cusparseIndexType_t *, cusparseIndexBase_t *, cudaDataType *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCooAoSGet");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(spMatDescr, rows, cols, nnz, cooInd, cooValues, idxType,
-                  idxBase, valueType);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCsrGet(
-    const cusparseSpMatDescr_t spMatDescr, int64_t *rows, int64_t *cols,
-    int64_t *nnz, void **csrRowOffsets, void **csrColInd, void **csrValues,
-    cusparseIndexType_t *csrRowOffsetsType, cusparseIndexType_t *csrColIndType,
-    cusparseIndexBase_t *idxBase, cudaDataType *valueType) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      const cusparseSpMatDescr_t, int64_t *, int64_t *, int64_t *, void **,
-      void **, void **, cusparseIndexType_t *, cusparseIndexType_t *,
-      cusparseIndexBase_t *, cudaDataType *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCsrGet");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(spMatDescr, rows, cols, nnz, csrRowOffsets, csrColInd,
-                  csrValues, csrRowOffsetsType, csrColIndType, idxBase,
-                  valueType);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSpMatGetFormat(
-    const cusparseSpMatDescr_t spMatDescr, cusparseFormat_t *format) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(const cusparseSpMatDescr_t,
-                                                  cusparseFormat_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSpMatGetFormat");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(spMatDescr, format);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSpMatGetIndexBase(
-    const cusparseSpMatDescr_t spMatDescr, cusparseIndexBase_t *idxBase) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(const cusparseSpMatDescr_t,
-                                                  cusparseIndexBase_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSpMatGetIndexBase");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(spMatDescr, idxBase);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseSpMatGetValues(const cusparseSpMatDescr_t spMatDescr, void **values) {
-  using FuncPtr =
-      cusparseStatus_t(CUSPARSEAPI *)(const cusparseSpMatDescr_t, void **);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSpMatGetValues");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(spMatDescr, values);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseSpMatSetValues(cusparseSpMatDescr_t spMatDescr, void *values) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(cusparseSpMatDescr_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSpMatSetValues");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(spMatDescr, values);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseSpMatSetStridedBatch(cusparseSpMatDescr_t spMatDescr, int batchCount) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(cusparseSpMatDescr_t, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSpMatSetStridedBatch");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(spMatDescr, batchCount);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSpMatGetStridedBatch(
-    const cusparseSpMatDescr_t spMatDescr, int *batchCount) {
-  using FuncPtr =
-      cusparseStatus_t(CUSPARSEAPI *)(const cusparseSpMatDescr_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSpMatGetStridedBatch");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(spMatDescr, batchCount);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCreateDnMat(
-    cusparseDnMatDescr_t *dnMatDescr, int64_t rows, int64_t cols, int64_t ld,
-    void *values, cudaDataType valueType, cusparseOrder_t order) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseDnMatDescr_t *, int64_t, int64_t, int64_t, void *, cudaDataType,
-      cusparseOrder_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCreateDnMat");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dnMatDescr, rows, cols, ld, values, valueType, order);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseDestroyDnMat(cusparseDnMatDescr_t dnMatDescr) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(cusparseDnMatDescr_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDestroyDnMat");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dnMatDescr);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDnMatGet(
-    const cusparseDnMatDescr_t dnMatDescr, int64_t *rows, int64_t *cols,
-    int64_t *ld, void **values, cudaDataType *type, cusparseOrder_t *order) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      const cusparseDnMatDescr_t, int64_t *, int64_t *, int64_t *, void **,
-      cudaDataType *, cusparseOrder_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDnMatGet");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dnMatDescr, rows, cols, ld, values, type, order);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseDnMatGetValues(const cusparseDnMatDescr_t dnMatDescr, void **values) {
-  using FuncPtr =
-      cusparseStatus_t(CUSPARSEAPI *)(const cusparseDnMatDescr_t, void **);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDnMatGetValues");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dnMatDescr, values);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseDnMatSetValues(cusparseDnMatDescr_t dnMatDescr, void *values) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(cusparseDnMatDescr_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDnMatSetValues");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dnMatDescr, values);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDnMatSetStridedBatch(
-    cusparseDnMatDescr_t dnMatDescr, int batchCount, int64_t batchStride) {
-  using FuncPtr =
-      cusparseStatus_t(CUSPARSEAPI *)(cusparseDnMatDescr_t, int, int64_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDnMatSetStridedBatch");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dnMatDescr, batchCount, batchStride);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseDnMatGetStridedBatch(const cusparseDnMatDescr_t dnMatDescr,
-                             int *batchCount, int64_t *batchStride) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(const cusparseDnMatDescr_t,
-                                                  int *, int64_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDnMatGetStridedBatch");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dnMatDescr, batchCount, batchStride);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseSpVV(cusparseHandle_t handle, cusparseOperation_t opX,
-             const cusparseSpVecDescr_t vecX, const cusparseDnVecDescr_t vecY,
-             void *result, cudaDataType computeType, void *externalBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, const cusparseSpVecDescr_t,
-      const cusparseDnVecDescr_t, void *, cudaDataType, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSpVV");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, opX, vecX, vecY, result, computeType, externalBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSpVV_bufferSize(
-    cusparseHandle_t handle, cusparseOperation_t opX,
-    const cusparseSpVecDescr_t vecX, const cusparseDnVecDescr_t vecY,
-    const void *result, cudaDataType computeType, size_t *bufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, const cusparseSpVecDescr_t,
-      const cusparseDnVecDescr_t, const void *, cudaDataType, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSpVV_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, opX, vecX, vecY, result, computeType, bufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSpMV(
-    cusparseHandle_t handle, cusparseOperation_t opA, const void *alpha,
-    const cusparseSpMatDescr_t matA, const cusparseDnVecDescr_t vecX,
-    const void *beta, const cusparseDnVecDescr_t vecY, cudaDataType computeType,
-    cusparseSpMVAlg_t alg, void *externalBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, const void *,
-      const cusparseSpMatDescr_t, const cusparseDnVecDescr_t, const void *,
-      const cusparseDnVecDescr_t, cudaDataType, cusparseSpMVAlg_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSpMV");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, opA, alpha, matA, vecX, beta, vecY, computeType, alg,
-                  externalBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSpMV_bufferSize(
-    cusparseHandle_t handle, cusparseOperation_t opA, const void *alpha,
-    const cusparseSpMatDescr_t matA, const cusparseDnVecDescr_t vecX,
-    const void *beta, const cusparseDnVecDescr_t vecY, cudaDataType computeType,
-    cusparseSpMVAlg_t alg, size_t *bufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, const void *,
-      const cusparseSpMatDescr_t, const cusparseDnVecDescr_t, const void *,
-      const cusparseDnVecDescr_t, cudaDataType, cusparseSpMVAlg_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSpMV_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, opA, alpha, matA, vecX, beta, vecY, computeType, alg,
-                  bufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSpMM(
-    cusparseHandle_t handle, cusparseOperation_t opA, cusparseOperation_t opB,
-    const void *alpha, const cusparseSpMatDescr_t matA,
-    const cusparseDnMatDescr_t matB, const void *beta,
-    cusparseDnMatDescr_t matC, cudaDataType computeType, cusparseSpMMAlg_t alg,
-    void *externalBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, cusparseOperation_t, const void *,
-      const cusparseSpMatDescr_t, const cusparseDnMatDescr_t, const void *,
-      cusparseDnMatDescr_t, cudaDataType, cusparseSpMMAlg_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSpMM");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, opA, opB, alpha, matA, matB, beta, matC, computeType,
-                  alg, externalBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSpMM_bufferSize(
-    cusparseHandle_t handle, cusparseOperation_t opA, cusparseOperation_t opB,
-    const void *alpha, const cusparseSpMatDescr_t matA,
-    const cusparseDnMatDescr_t matB, const void *beta,
-    cusparseDnMatDescr_t matC, cudaDataType computeType, cusparseSpMMAlg_t alg,
-    size_t *bufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, cusparseOperation_t, const void *,
-      const cusparseSpMatDescr_t, const cusparseDnMatDescr_t, const void *,
-      cusparseDnMatDescr_t, cudaDataType, cusparseSpMMAlg_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSpMM_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, opA, opB, alpha, matA, matB, beta, matC, computeType,
-                  alg, bufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseConstrainedGeMM(
-    cusparseHandle_t handle, cusparseOperation_t opA, cusparseOperation_t opB,
-    const void *alpha, const cusparseDnMatDescr_t matA,
-    const cusparseDnMatDescr_t matB, const void *beta,
-    cusparseSpMatDescr_t matC, cudaDataType computeType, void *externalBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, cusparseOperation_t, const void *,
-      const cusparseDnMatDescr_t, const cusparseDnMatDescr_t, const void *,
-      cusparseSpMatDescr_t, cudaDataType, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseConstrainedGeMM");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, opA, opB, alpha, matA, matB, beta, matC, computeType,
-                  externalBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseConstrainedGeMM_bufferSize(
-    cusparseHandle_t handle, cusparseOperation_t opA, cusparseOperation_t opB,
-    const void *alpha, const cusparseDnMatDescr_t matA,
-    const cusparseDnMatDescr_t matB, const void *beta,
-    cusparseSpMatDescr_t matC, cudaDataType computeType, size_t *bufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, cusparseOperation_t, const void *,
-      const cusparseDnMatDescr_t, const cusparseDnMatDescr_t, const void *,
-      cusparseSpMatDescr_t, cudaDataType, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseConstrainedGeMM_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, opA, opB, alpha, matA, matB, beta, matC, computeType,
-                  bufferSize);
-}
-
-#endif  // _WIN32
-
-}  // extern "C"
diff --git a/third_party/xla/third_party/tsl/tsl/cuda/cusparse_10_2.inc b/third_party/xla/third_party/tsl/tsl/cuda/cusparse_10_2.inc
deleted file mode 100644
index 03d6d0c20d5223..00000000000000
--- a/third_party/xla/third_party/tsl/tsl/cuda/cusparse_10_2.inc
+++ /dev/null
@@ -1,8262 +0,0 @@
-// Auto-generated, do not edit.
-
-extern "C" {
-
-cusparseStatus_t CUSPARSEAPI cusparseCreate(cusparseHandle_t *handle) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(cusparseHandle_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCreate");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDestroy(cusparseHandle_t handle) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(cusparseHandle_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDestroy");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseGetVersion(cusparseHandle_t handle,
-                                                int *version) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(cusparseHandle_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseGetVersion");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, version);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseGetProperty(libraryPropertyType type,
-                                                 int *value) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(libraryPropertyType, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseGetProperty");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(type, value);
-}
-
-const char *CUSPARSEAPI cusparseGetErrorName(cusparseStatus_t status) {
-  using FuncPtr = const char *(CUSPARSEAPI *)(cusparseStatus_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseGetErrorName");
-  if (!func_ptr) return "cusparseGetErrorName symbol not found.";
-  return func_ptr(status);
-}
-
-const char *CUSPARSEAPI cusparseGetErrorString(cusparseStatus_t status) {
-  using FuncPtr = const char *(CUSPARSEAPI *)(cusparseStatus_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseGetErrorString");
-  if (!func_ptr) return "cusparseGetErrorString symbol not found.";
-  return func_ptr(status);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSetStream(cusparseHandle_t handle,
-                                               cudaStream_t streamId) {
-  using FuncPtr =
-      cusparseStatus_t(CUSPARSEAPI *)(cusparseHandle_t, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSetStream");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, streamId);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseGetStream(cusparseHandle_t handle,
-                                               cudaStream_t *streamId) {
-  using FuncPtr =
-      cusparseStatus_t(CUSPARSEAPI *)(cusparseHandle_t, cudaStream_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseGetStream");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, streamId);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseGetPointerMode(cusparseHandle_t handle, cusparsePointerMode_t *mode) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(cusparseHandle_t,
-                                                  cusparsePointerMode_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseGetPointerMode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, mode);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseSetPointerMode(cusparseHandle_t handle, cusparsePointerMode_t mode) {
-  using FuncPtr =
-      cusparseStatus_t(CUSPARSEAPI *)(cusparseHandle_t, cusparsePointerMode_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSetPointerMode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, mode);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseCreateMatDescr(cusparseMatDescr_t *descrA) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(cusparseMatDescr_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCreateMatDescr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(descrA);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseDestroyMatDescr(cusparseMatDescr_t descrA) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(cusparseMatDescr_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDestroyMatDescr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(descrA);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseCopyMatDescr(cusparseMatDescr_t dest, const cusparseMatDescr_t src) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(cusparseMatDescr_t,
-                                                  const cusparseMatDescr_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCopyMatDescr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dest, src);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSetMatType(cusparseMatDescr_t descrA,
-                                                cusparseMatrixType_t type) {
-  using FuncPtr =
-      cusparseStatus_t(CUSPARSEAPI *)(cusparseMatDescr_t, cusparseMatrixType_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSetMatType");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(descrA, type);
-}
-
-cusparseMatrixType_t CUSPARSEAPI
-cusparseGetMatType(const cusparseMatDescr_t descrA) {
-  using FuncPtr = cusparseMatrixType_t(CUSPARSEAPI *)(const cusparseMatDescr_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseGetMatType");
-  if (!func_ptr) return cusparseMatrixType_t(-1);
-  return func_ptr(descrA);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseSetMatFillMode(cusparseMatDescr_t descrA, cusparseFillMode_t fillMode) {
-  using FuncPtr =
-      cusparseStatus_t(CUSPARSEAPI *)(cusparseMatDescr_t, cusparseFillMode_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSetMatFillMode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(descrA, fillMode);
-}
-
-cusparseFillMode_t CUSPARSEAPI
-cusparseGetMatFillMode(const cusparseMatDescr_t descrA) {
-  using FuncPtr = cusparseFillMode_t(CUSPARSEAPI *)(const cusparseMatDescr_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseGetMatFillMode");
-  if (!func_ptr) return cusparseFillMode_t(-1);
-  return func_ptr(descrA);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseSetMatDiagType(cusparseMatDescr_t descrA, cusparseDiagType_t diagType) {
-  using FuncPtr =
-      cusparseStatus_t(CUSPARSEAPI *)(cusparseMatDescr_t, cusparseDiagType_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSetMatDiagType");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(descrA, diagType);
-}
-
-cusparseDiagType_t CUSPARSEAPI
-cusparseGetMatDiagType(const cusparseMatDescr_t descrA) {
-  using FuncPtr = cusparseDiagType_t(CUSPARSEAPI *)(const cusparseMatDescr_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseGetMatDiagType");
-  if (!func_ptr) return cusparseDiagType_t(-1);
-  return func_ptr(descrA);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSetMatIndexBase(cusparseMatDescr_t descrA,
-                                                     cusparseIndexBase_t base) {
-  using FuncPtr =
-      cusparseStatus_t(CUSPARSEAPI *)(cusparseMatDescr_t, cusparseIndexBase_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSetMatIndexBase");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(descrA, base);
-}
-
-cusparseIndexBase_t CUSPARSEAPI
-cusparseGetMatIndexBase(const cusparseMatDescr_t descrA) {
-  using FuncPtr = cusparseIndexBase_t(CUSPARSEAPI *)(const cusparseMatDescr_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseGetMatIndexBase");
-  if (!func_ptr) return cusparseIndexBase_t(-1);
-  return func_ptr(descrA);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseCreateSolveAnalysisInfo(cusparseSolveAnalysisInfo_t *info) {
-  using FuncPtr =
-      cusparseStatus_t(CUSPARSEAPI *)(cusparseSolveAnalysisInfo_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCreateSolveAnalysisInfo");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(info);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseDestroySolveAnalysisInfo(cusparseSolveAnalysisInfo_t info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(cusparseSolveAnalysisInfo_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseDestroySolveAnalysisInfo");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(info);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseGetLevelInfo(cusparseHandle_t handle, cusparseSolveAnalysisInfo_t info,
-                     int *nlevels, int **levelPtr, int **levelInd) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseSolveAnalysisInfo_t, int *, int **, int **);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseGetLevelInfo");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, info, nlevels, levelPtr, levelInd);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCreateCsrsv2Info(csrsv2Info_t *info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(csrsv2Info_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCreateCsrsv2Info");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(info);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDestroyCsrsv2Info(csrsv2Info_t info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(csrsv2Info_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDestroyCsrsv2Info");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(info);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCreateCsric02Info(csric02Info_t *info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(csric02Info_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCreateCsric02Info");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(info);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDestroyCsric02Info(csric02Info_t info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(csric02Info_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDestroyCsric02Info");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(info);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCreateBsric02Info(bsric02Info_t *info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(bsric02Info_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCreateBsric02Info");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(info);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDestroyBsric02Info(bsric02Info_t info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(bsric02Info_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDestroyBsric02Info");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(info);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCreateCsrilu02Info(csrilu02Info_t *info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(csrilu02Info_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCreateCsrilu02Info");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(info);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDestroyCsrilu02Info(csrilu02Info_t info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(csrilu02Info_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDestroyCsrilu02Info");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(info);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCreateBsrilu02Info(bsrilu02Info_t *info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(bsrilu02Info_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCreateBsrilu02Info");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(info);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDestroyBsrilu02Info(bsrilu02Info_t info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(bsrilu02Info_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDestroyBsrilu02Info");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(info);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCreateBsrsv2Info(bsrsv2Info_t *info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(bsrsv2Info_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCreateBsrsv2Info");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(info);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDestroyBsrsv2Info(bsrsv2Info_t info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(bsrsv2Info_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDestroyBsrsv2Info");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(info);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCreateBsrsm2Info(bsrsm2Info_t *info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(bsrsm2Info_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCreateBsrsm2Info");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(info);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDestroyBsrsm2Info(bsrsm2Info_t info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(bsrsm2Info_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDestroyBsrsm2Info");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(info);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCreateHybMat(cusparseHybMat_t *hybA) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(cusparseHybMat_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCreateHybMat");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hybA);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDestroyHybMat(cusparseHybMat_t hybA) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(cusparseHybMat_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDestroyHybMat");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hybA);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCreateCsru2csrInfo(csru2csrInfo_t *info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(csru2csrInfo_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCreateCsru2csrInfo");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(info);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDestroyCsru2csrInfo(csru2csrInfo_t info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(csru2csrInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDestroyCsru2csrInfo");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(info);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseCreateColorInfo(cusparseColorInfo_t *info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(cusparseColorInfo_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCreateColorInfo");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(info);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseDestroyColorInfo(cusparseColorInfo_t info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(cusparseColorInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDestroyColorInfo");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(info);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSetColorAlgs(cusparseColorInfo_t info,
-                                                  cusparseColorAlg_t alg) {
-  using FuncPtr =
-      cusparseStatus_t(CUSPARSEAPI *)(cusparseColorInfo_t, cusparseColorAlg_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSetColorAlgs");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(info, alg);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseGetColorAlgs(cusparseColorInfo_t info,
-                                                  cusparseColorAlg_t *alg) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(cusparseColorInfo_t,
-                                                  cusparseColorAlg_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseGetColorAlgs");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(info, alg);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCreatePruneInfo(pruneInfo_t *info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(pruneInfo_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCreatePruneInfo");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(info);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDestroyPruneInfo(pruneInfo_t info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(pruneInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDestroyPruneInfo");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(info);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSaxpyi(cusparseHandle_t handle, int nnz,
-                                            const float *alpha,
-                                            const float *xVal, const int *xInd,
-                                            float *y,
-                                            cusparseIndexBase_t idxBase) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, const float *, const float *, const int *, float *,
-      cusparseIndexBase_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSaxpyi");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, nnz, alpha, xVal, xInd, y, idxBase);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDaxpyi(cusparseHandle_t handle, int nnz,
-                                            const double *alpha,
-                                            const double *xVal, const int *xInd,
-                                            double *y,
-                                            cusparseIndexBase_t idxBase) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, const double *, const double *, const int *,
-      double *, cusparseIndexBase_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDaxpyi");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, nnz, alpha, xVal, xInd, y, idxBase);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCaxpyi(cusparseHandle_t handle, int nnz,
-                                            const cuComplex *alpha,
-                                            const cuComplex *xVal,
-                                            const int *xInd, cuComplex *y,
-                                            cusparseIndexBase_t idxBase) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, const cuComplex *, const cuComplex *, const int *,
-      cuComplex *, cusparseIndexBase_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCaxpyi");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, nnz, alpha, xVal, xInd, y, idxBase);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZaxpyi(cusparseHandle_t handle, int nnz,
-                                            const cuDoubleComplex *alpha,
-                                            const cuDoubleComplex *xVal,
-                                            const int *xInd, cuDoubleComplex *y,
-                                            cusparseIndexBase_t idxBase) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, const cuDoubleComplex *, const cuDoubleComplex *,
-      const int *, cuDoubleComplex *, cusparseIndexBase_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZaxpyi");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, nnz, alpha, xVal, xInd, y, idxBase);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSdoti(cusparseHandle_t handle, int nnz,
-                                           const float *xVal, const int *xInd,
-                                           const float *y,
-                                           float *resultDevHostPtr,
-                                           cusparseIndexBase_t idxBase) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, const float *, const int *, const float *, float *,
-      cusparseIndexBase_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSdoti");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, nnz, xVal, xInd, y, resultDevHostPtr, idxBase);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDdoti(cusparseHandle_t handle, int nnz,
-                                           const double *xVal, const int *xInd,
-                                           const double *y,
-                                           double *resultDevHostPtr,
-                                           cusparseIndexBase_t idxBase) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, const double *, const int *, const double *,
-      double *, cusparseIndexBase_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDdoti");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, nnz, xVal, xInd, y, resultDevHostPtr, idxBase);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCdoti(cusparseHandle_t handle, int nnz,
-                                           const cuComplex *xVal,
-                                           const int *xInd, const cuComplex *y,
-                                           cuComplex *resultDevHostPtr,
-                                           cusparseIndexBase_t idxBase) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, const cuComplex *, const int *, const cuComplex *,
-      cuComplex *, cusparseIndexBase_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCdoti");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, nnz, xVal, xInd, y, resultDevHostPtr, idxBase);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZdoti(cusparseHandle_t handle, int nnz,
-                                           const cuDoubleComplex *xVal,
-                                           const int *xInd,
-                                           const cuDoubleComplex *y,
-                                           cuDoubleComplex *resultDevHostPtr,
-                                           cusparseIndexBase_t idxBase) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, const cuDoubleComplex *, const int *,
-      const cuDoubleComplex *, cuDoubleComplex *, cusparseIndexBase_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZdoti");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, nnz, xVal, xInd, y, resultDevHostPtr, idxBase);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCdotci(cusparseHandle_t handle, int nnz,
-                                            const cuComplex *xVal,
-                                            const int *xInd, const cuComplex *y,
-                                            cuComplex *resultDevHostPtr,
-                                            cusparseIndexBase_t idxBase) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, const cuComplex *, const int *, const cuComplex *,
-      cuComplex *, cusparseIndexBase_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCdotci");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, nnz, xVal, xInd, y, resultDevHostPtr, idxBase);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZdotci(cusparseHandle_t handle, int nnz,
-                                            const cuDoubleComplex *xVal,
-                                            const int *xInd,
-                                            const cuDoubleComplex *y,
-                                            cuDoubleComplex *resultDevHostPtr,
-                                            cusparseIndexBase_t idxBase) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, const cuDoubleComplex *, const int *,
-      const cuDoubleComplex *, cuDoubleComplex *, cusparseIndexBase_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZdotci");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, nnz, xVal, xInd, y, resultDevHostPtr, idxBase);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSgthr(cusparseHandle_t handle, int nnz,
-                                           const float *y, float *xVal,
-                                           const int *xInd,
-                                           cusparseIndexBase_t idxBase) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, const float *, float *, const int *,
-      cusparseIndexBase_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSgthr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, nnz, y, xVal, xInd, idxBase);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDgthr(cusparseHandle_t handle, int nnz,
-                                           const double *y, double *xVal,
-                                           const int *xInd,
-                                           cusparseIndexBase_t idxBase) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, const double *, double *, const int *,
-      cusparseIndexBase_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDgthr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, nnz, y, xVal, xInd, idxBase);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCgthr(cusparseHandle_t handle, int nnz,
-                                           const cuComplex *y, cuComplex *xVal,
-                                           const int *xInd,
-                                           cusparseIndexBase_t idxBase) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, const cuComplex *, cuComplex *, const int *,
-      cusparseIndexBase_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCgthr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, nnz, y, xVal, xInd, idxBase);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZgthr(cusparseHandle_t handle, int nnz,
-                                           const cuDoubleComplex *y,
-                                           cuDoubleComplex *xVal,
-                                           const int *xInd,
-                                           cusparseIndexBase_t idxBase) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, const cuDoubleComplex *, cuDoubleComplex *,
-      const int *, cusparseIndexBase_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZgthr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, nnz, y, xVal, xInd, idxBase);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSgthrz(cusparseHandle_t handle, int nnz,
-                                            float *y, float *xVal,
-                                            const int *xInd,
-                                            cusparseIndexBase_t idxBase) {
-  using FuncPtr =
-      cusparseStatus_t(CUSPARSEAPI *)(cusparseHandle_t, int, float *, float *,
-                                      const int *, cusparseIndexBase_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSgthrz");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, nnz, y, xVal, xInd, idxBase);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDgthrz(cusparseHandle_t handle, int nnz,
-                                            double *y, double *xVal,
-                                            const int *xInd,
-                                            cusparseIndexBase_t idxBase) {
-  using FuncPtr =
-      cusparseStatus_t(CUSPARSEAPI *)(cusparseHandle_t, int, double *, double *,
-                                      const int *, cusparseIndexBase_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDgthrz");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, nnz, y, xVal, xInd, idxBase);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCgthrz(cusparseHandle_t handle, int nnz,
-                                            cuComplex *y, cuComplex *xVal,
-                                            const int *xInd,
-                                            cusparseIndexBase_t idxBase) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, cuComplex *, cuComplex *, const int *,
-      cusparseIndexBase_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCgthrz");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, nnz, y, xVal, xInd, idxBase);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZgthrz(cusparseHandle_t handle, int nnz,
-                                            cuDoubleComplex *y,
-                                            cuDoubleComplex *xVal,
-                                            const int *xInd,
-                                            cusparseIndexBase_t idxBase) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, cuDoubleComplex *, cuDoubleComplex *, const int *,
-      cusparseIndexBase_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZgthrz");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, nnz, y, xVal, xInd, idxBase);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSsctr(cusparseHandle_t handle, int nnz,
-                                           const float *xVal, const int *xInd,
-                                           float *y,
-                                           cusparseIndexBase_t idxBase) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(cusparseHandle_t, int,
-                                                  const float *, const int *,
-                                                  float *, cusparseIndexBase_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSsctr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, nnz, xVal, xInd, y, idxBase);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDsctr(cusparseHandle_t handle, int nnz,
-                                           const double *xVal, const int *xInd,
-                                           double *y,
-                                           cusparseIndexBase_t idxBase) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, const double *, const int *, double *,
-      cusparseIndexBase_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDsctr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, nnz, xVal, xInd, y, idxBase);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCsctr(cusparseHandle_t handle, int nnz,
-                                           const cuComplex *xVal,
-                                           const int *xInd, cuComplex *y,
-                                           cusparseIndexBase_t idxBase) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, const cuComplex *, const int *, cuComplex *,
-      cusparseIndexBase_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCsctr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, nnz, xVal, xInd, y, idxBase);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZsctr(cusparseHandle_t handle, int nnz,
-                                           const cuDoubleComplex *xVal,
-                                           const int *xInd, cuDoubleComplex *y,
-                                           cusparseIndexBase_t idxBase) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, const cuDoubleComplex *, const int *,
-      cuDoubleComplex *, cusparseIndexBase_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZsctr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, nnz, xVal, xInd, y, idxBase);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSroti(cusparseHandle_t handle, int nnz,
-                                           float *xVal, const int *xInd,
-                                           float *y, const float *c,
-                                           const float *s,
-                                           cusparseIndexBase_t idxBase) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, float *, const int *, float *, const float *,
-      const float *, cusparseIndexBase_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSroti");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, nnz, xVal, xInd, y, c, s, idxBase);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDroti(cusparseHandle_t handle, int nnz,
-                                           double *xVal, const int *xInd,
-                                           double *y, const double *c,
-                                           const double *s,
-                                           cusparseIndexBase_t idxBase) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, double *, const int *, double *, const double *,
-      const double *, cusparseIndexBase_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDroti");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, nnz, xVal, xInd, y, c, s, idxBase);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseSgemvi(cusparseHandle_t handle, cusparseOperation_t transA, int m,
-               int n, const float *alpha, const float *A, int lda, int nnz,
-               const float *xVal, const int *xInd, const float *beta, float *y,
-               cusparseIndexBase_t idxBase, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, const float *,
-      const float *, int, int, const float *, const int *, const float *,
-      float *, cusparseIndexBase_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSgemvi");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, n, alpha, A, lda, nnz, xVal, xInd, beta, y,
-                  idxBase, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseSgemvi_bufferSize(cusparseHandle_t handle, cusparseOperation_t transA,
-                          int m, int n, int nnz, int *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSgemvi_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, n, nnz, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseDgemvi(cusparseHandle_t handle, cusparseOperation_t transA, int m,
-               int n, const double *alpha, const double *A, int lda, int nnz,
-               const double *xVal, const int *xInd, const double *beta,
-               double *y, cusparseIndexBase_t idxBase, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, const double *,
-      const double *, int, int, const double *, const int *, const double *,
-      double *, cusparseIndexBase_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDgemvi");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, n, alpha, A, lda, nnz, xVal, xInd, beta, y,
-                  idxBase, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseDgemvi_bufferSize(cusparseHandle_t handle, cusparseOperation_t transA,
-                          int m, int n, int nnz, int *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDgemvi_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, n, nnz, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCgemvi(
-    cusparseHandle_t handle, cusparseOperation_t transA, int m, int n,
-    const cuComplex *alpha, const cuComplex *A, int lda, int nnz,
-    const cuComplex *xVal, const int *xInd, const cuComplex *beta, cuComplex *y,
-    cusparseIndexBase_t idxBase, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, const cuComplex *,
-      const cuComplex *, int, int, const cuComplex *, const int *,
-      const cuComplex *, cuComplex *, cusparseIndexBase_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCgemvi");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, n, alpha, A, lda, nnz, xVal, xInd, beta, y,
-                  idxBase, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseCgemvi_bufferSize(cusparseHandle_t handle, cusparseOperation_t transA,
-                          int m, int n, int nnz, int *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCgemvi_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, n, nnz, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZgemvi(
-    cusparseHandle_t handle, cusparseOperation_t transA, int m, int n,
-    const cuDoubleComplex *alpha, const cuDoubleComplex *A, int lda, int nnz,
-    const cuDoubleComplex *xVal, const int *xInd, const cuDoubleComplex *beta,
-    cuDoubleComplex *y, cusparseIndexBase_t idxBase, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, const cuDoubleComplex *,
-      const cuDoubleComplex *, int, int, const cuDoubleComplex *, const int *,
-      const cuDoubleComplex *, cuDoubleComplex *, cusparseIndexBase_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZgemvi");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, n, alpha, A, lda, nnz, xVal, xInd, beta, y,
-                  idxBase, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseZgemvi_bufferSize(cusparseHandle_t handle, cusparseOperation_t transA,
-                          int m, int n, int nnz, int *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZgemvi_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, n, nnz, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsrmv(
-    cusparseHandle_t handle, cusparseOperation_t transA, int m, int n, int nnz,
-    const float *alpha, const cusparseMatDescr_t descrA,
-    const float *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, const float *x, const float *beta, float *y) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, int, const float *,
-      const cusparseMatDescr_t, const float *, const int *, const int *,
-      const float *, const float *, float *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsrmv");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, n, nnz, alpha, descrA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, x, beta, y);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseDcsrmv(cusparseHandle_t handle, cusparseOperation_t transA, int m,
-               int n, int nnz, const double *alpha,
-               const cusparseMatDescr_t descrA, const double *csrSortedValA,
-               const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-               const double *x, const double *beta, double *y) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, int, const double *,
-      const cusparseMatDescr_t, const double *, const int *, const int *,
-      const double *, const double *, double *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsrmv");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, n, nnz, alpha, descrA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, x, beta, y);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseCcsrmv(cusparseHandle_t handle, cusparseOperation_t transA, int m,
-               int n, int nnz, const cuComplex *alpha,
-               const cusparseMatDescr_t descrA, const cuComplex *csrSortedValA,
-               const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-               const cuComplex *x, const cuComplex *beta, cuComplex *y) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, int, const cuComplex *,
-      const cusparseMatDescr_t, const cuComplex *, const int *, const int *,
-      const cuComplex *, const cuComplex *, cuComplex *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsrmv");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, n, nnz, alpha, descrA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, x, beta, y);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsrmv(
-    cusparseHandle_t handle, cusparseOperation_t transA, int m, int n, int nnz,
-    const cuDoubleComplex *alpha, const cusparseMatDescr_t descrA,
-    const cuDoubleComplex *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, const cuDoubleComplex *x,
-    const cuDoubleComplex *beta, cuDoubleComplex *y) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, int,
-      const cuDoubleComplex *, const cusparseMatDescr_t,
-      const cuDoubleComplex *, const int *, const int *,
-      const cuDoubleComplex *, const cuDoubleComplex *, cuDoubleComplex *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsrmv");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, n, nnz, alpha, descrA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, x, beta, y);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCsrmvEx_bufferSize(
-    cusparseHandle_t handle, cusparseAlgMode_t alg, cusparseOperation_t transA,
-    int m, int n, int nnz, const void *alpha, cudaDataType alphatype,
-    const cusparseMatDescr_t descrA, const void *csrValA,
-    cudaDataType csrValAtype, const int *csrRowPtrA, const int *csrColIndA,
-    const void *x, cudaDataType xtype, const void *beta, cudaDataType betatype,
-    void *y, cudaDataType ytype, cudaDataType executiontype,
-    size_t *bufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseAlgMode_t, cusparseOperation_t, int, int, int,
-      const void *, cudaDataType, const cusparseMatDescr_t, const void *,
-      cudaDataType, const int *, const int *, const void *, cudaDataType,
-      const void *, cudaDataType, void *, cudaDataType, cudaDataType, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCsrmvEx_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, alg, transA, m, n, nnz, alpha, alphatype, descrA,
-                  csrValA, csrValAtype, csrRowPtrA, csrColIndA, x, xtype, beta,
-                  betatype, y, ytype, executiontype, bufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCsrmvEx(
-    cusparseHandle_t handle, cusparseAlgMode_t alg, cusparseOperation_t transA,
-    int m, int n, int nnz, const void *alpha, cudaDataType alphatype,
-    const cusparseMatDescr_t descrA, const void *csrValA,
-    cudaDataType csrValAtype, const int *csrRowPtrA, const int *csrColIndA,
-    const void *x, cudaDataType xtype, const void *beta, cudaDataType betatype,
-    void *y, cudaDataType ytype, cudaDataType executiontype, void *buffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseAlgMode_t, cusparseOperation_t, int, int, int,
-      const void *, cudaDataType, const cusparseMatDescr_t, const void *,
-      cudaDataType, const int *, const int *, const void *, cudaDataType,
-      const void *, cudaDataType, void *, cudaDataType, cudaDataType, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCsrmvEx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, alg, transA, m, n, nnz, alpha, alphatype, descrA,
-                  csrValA, csrValAtype, csrRowPtrA, csrColIndA, x, xtype, beta,
-                  betatype, y, ytype, executiontype, buffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsrmv_mp(
-    cusparseHandle_t handle, cusparseOperation_t transA, int m, int n, int nnz,
-    const float *alpha, const cusparseMatDescr_t descrA,
-    const float *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, const float *x, const float *beta, float *y) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, int, const float *,
-      const cusparseMatDescr_t, const float *, const int *, const int *,
-      const float *, const float *, float *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsrmv_mp");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, n, nnz, alpha, descrA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, x, beta, y);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseDcsrmv_mp(cusparseHandle_t handle, cusparseOperation_t transA, int m,
-                  int n, int nnz, const double *alpha,
-                  const cusparseMatDescr_t descrA, const double *csrSortedValA,
-                  const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-                  const double *x, const double *beta, double *y) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, int, const double *,
-      const cusparseMatDescr_t, const double *, const int *, const int *,
-      const double *, const double *, double *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsrmv_mp");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, n, nnz, alpha, descrA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, x, beta, y);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsrmv_mp(
-    cusparseHandle_t handle, cusparseOperation_t transA, int m, int n, int nnz,
-    const cuComplex *alpha, const cusparseMatDescr_t descrA,
-    const cuComplex *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, const cuComplex *x, const cuComplex *beta,
-    cuComplex *y) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, int, const cuComplex *,
-      const cusparseMatDescr_t, const cuComplex *, const int *, const int *,
-      const cuComplex *, const cuComplex *, cuComplex *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsrmv_mp");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, n, nnz, alpha, descrA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, x, beta, y);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsrmv_mp(
-    cusparseHandle_t handle, cusparseOperation_t transA, int m, int n, int nnz,
-    const cuDoubleComplex *alpha, const cusparseMatDescr_t descrA,
-    const cuDoubleComplex *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, const cuDoubleComplex *x,
-    const cuDoubleComplex *beta, cuDoubleComplex *y) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, int,
-      const cuDoubleComplex *, const cusparseMatDescr_t,
-      const cuDoubleComplex *, const int *, const int *,
-      const cuDoubleComplex *, const cuDoubleComplex *, cuDoubleComplex *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsrmv_mp");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, n, nnz, alpha, descrA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, x, beta, y);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseShybmv(
-    cusparseHandle_t handle, cusparseOperation_t transA, const float *alpha,
-    const cusparseMatDescr_t descrA, const cusparseHybMat_t hybA,
-    const float *x, const float *beta, float *y) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, const float *,
-      const cusparseMatDescr_t, const cusparseHybMat_t, const float *,
-      const float *, float *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseShybmv");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, alpha, descrA, hybA, x, beta, y);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDhybmv(
-    cusparseHandle_t handle, cusparseOperation_t transA, const double *alpha,
-    const cusparseMatDescr_t descrA, const cusparseHybMat_t hybA,
-    const double *x, const double *beta, double *y) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, const double *,
-      const cusparseMatDescr_t, const cusparseHybMat_t, const double *,
-      const double *, double *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDhybmv");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, alpha, descrA, hybA, x, beta, y);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseChybmv(
-    cusparseHandle_t handle, cusparseOperation_t transA, const cuComplex *alpha,
-    const cusparseMatDescr_t descrA, const cusparseHybMat_t hybA,
-    const cuComplex *x, const cuComplex *beta, cuComplex *y) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, const cuComplex *,
-      const cusparseMatDescr_t, const cusparseHybMat_t, const cuComplex *,
-      const cuComplex *, cuComplex *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseChybmv");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, alpha, descrA, hybA, x, beta, y);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseZhybmv(cusparseHandle_t handle, cusparseOperation_t transA,
-               const cuDoubleComplex *alpha, const cusparseMatDescr_t descrA,
-               const cusparseHybMat_t hybA, const cuDoubleComplex *x,
-               const cuDoubleComplex *beta, cuDoubleComplex *y) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, const cuDoubleComplex *,
-      const cusparseMatDescr_t, const cusparseHybMat_t, const cuDoubleComplex *,
-      const cuDoubleComplex *, cuDoubleComplex *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZhybmv");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, alpha, descrA, hybA, x, beta, y);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSbsrmv(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, int mb, int nb, int nnzb, const float *alpha,
-    const cusparseMatDescr_t descrA, const float *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int blockDim,
-    const float *x, const float *beta, float *y) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, int, int, int,
-      const float *, const cusparseMatDescr_t, const float *, const int *,
-      const int *, int, const float *, const float *, float *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSbsrmv");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, mb, nb, nnzb, alpha, descrA,
-                  bsrSortedValA, bsrSortedRowPtrA, bsrSortedColIndA, blockDim,
-                  x, beta, y);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDbsrmv(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, int mb, int nb, int nnzb, const double *alpha,
-    const cusparseMatDescr_t descrA, const double *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int blockDim,
-    const double *x, const double *beta, double *y) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, int, int, int,
-      const double *, const cusparseMatDescr_t, const double *, const int *,
-      const int *, int, const double *, const double *, double *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDbsrmv");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, mb, nb, nnzb, alpha, descrA,
-                  bsrSortedValA, bsrSortedRowPtrA, bsrSortedColIndA, blockDim,
-                  x, beta, y);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseCbsrmv(cusparseHandle_t handle, cusparseDirection_t dirA,
-               cusparseOperation_t transA, int mb, int nb, int nnzb,
-               const cuComplex *alpha, const cusparseMatDescr_t descrA,
-               const cuComplex *bsrSortedValA, const int *bsrSortedRowPtrA,
-               const int *bsrSortedColIndA, int blockDim, const cuComplex *x,
-               const cuComplex *beta, cuComplex *y) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, int, int, int,
-      const cuComplex *, const cusparseMatDescr_t, const cuComplex *,
-      const int *, const int *, int, const cuComplex *, const cuComplex *,
-      cuComplex *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCbsrmv");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, mb, nb, nnzb, alpha, descrA,
-                  bsrSortedValA, bsrSortedRowPtrA, bsrSortedColIndA, blockDim,
-                  x, beta, y);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZbsrmv(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, int mb, int nb, int nnzb,
-    const cuDoubleComplex *alpha, const cusparseMatDescr_t descrA,
-    const cuDoubleComplex *bsrSortedValA, const int *bsrSortedRowPtrA,
-    const int *bsrSortedColIndA, int blockDim, const cuDoubleComplex *x,
-    const cuDoubleComplex *beta, cuDoubleComplex *y) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, int, int, int,
-      const cuDoubleComplex *, const cusparseMatDescr_t,
-      const cuDoubleComplex *, const int *, const int *, int,
-      const cuDoubleComplex *, const cuDoubleComplex *, cuDoubleComplex *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZbsrmv");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, mb, nb, nnzb, alpha, descrA,
-                  bsrSortedValA, bsrSortedRowPtrA, bsrSortedColIndA, blockDim,
-                  x, beta, y);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseSbsrxmv(cusparseHandle_t handle, cusparseDirection_t dirA,
-                cusparseOperation_t transA, int sizeOfMask, int mb, int nb,
-                int nnzb, const float *alpha, const cusparseMatDescr_t descrA,
-                const float *bsrSortedValA, const int *bsrSortedMaskPtrA,
-                const int *bsrSortedRowPtrA, const int *bsrSortedEndPtrA,
-                const int *bsrSortedColIndA, int blockDim, const float *x,
-                const float *beta, float *y) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, int, int, int,
-      int, const float *, const cusparseMatDescr_t, const float *, const int *,
-      const int *, const int *, const int *, int, const float *, const float *,
-      float *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSbsrxmv");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, sizeOfMask, mb, nb, nnzb, alpha, descrA,
-                  bsrSortedValA, bsrSortedMaskPtrA, bsrSortedRowPtrA,
-                  bsrSortedEndPtrA, bsrSortedColIndA, blockDim, x, beta, y);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseDbsrxmv(cusparseHandle_t handle, cusparseDirection_t dirA,
-                cusparseOperation_t transA, int sizeOfMask, int mb, int nb,
-                int nnzb, const double *alpha, const cusparseMatDescr_t descrA,
-                const double *bsrSortedValA, const int *bsrSortedMaskPtrA,
-                const int *bsrSortedRowPtrA, const int *bsrSortedEndPtrA,
-                const int *bsrSortedColIndA, int blockDim, const double *x,
-                const double *beta, double *y) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, int, int, int,
-      int, const double *, const cusparseMatDescr_t, const double *,
-      const int *, const int *, const int *, const int *, int, const double *,
-      const double *, double *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDbsrxmv");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, sizeOfMask, mb, nb, nnzb, alpha, descrA,
-                  bsrSortedValA, bsrSortedMaskPtrA, bsrSortedRowPtrA,
-                  bsrSortedEndPtrA, bsrSortedColIndA, blockDim, x, beta, y);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCbsrxmv(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, int sizeOfMask, int mb, int nb, int nnzb,
-    const cuComplex *alpha, const cusparseMatDescr_t descrA,
-    const cuComplex *bsrSortedValA, const int *bsrSortedMaskPtrA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedEndPtrA,
-    const int *bsrSortedColIndA, int blockDim, const cuComplex *x,
-    const cuComplex *beta, cuComplex *y) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, int, int, int,
-      int, const cuComplex *, const cusparseMatDescr_t, const cuComplex *,
-      const int *, const int *, const int *, const int *, int,
-      const cuComplex *, const cuComplex *, cuComplex *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCbsrxmv");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, sizeOfMask, mb, nb, nnzb, alpha, descrA,
-                  bsrSortedValA, bsrSortedMaskPtrA, bsrSortedRowPtrA,
-                  bsrSortedEndPtrA, bsrSortedColIndA, blockDim, x, beta, y);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZbsrxmv(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, int sizeOfMask, int mb, int nb, int nnzb,
-    const cuDoubleComplex *alpha, const cusparseMatDescr_t descrA,
-    const cuDoubleComplex *bsrSortedValA, const int *bsrSortedMaskPtrA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedEndPtrA,
-    const int *bsrSortedColIndA, int blockDim, const cuDoubleComplex *x,
-    const cuDoubleComplex *beta, cuDoubleComplex *y) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, int, int, int,
-      int, const cuDoubleComplex *, const cusparseMatDescr_t,
-      const cuDoubleComplex *, const int *, const int *, const int *,
-      const int *, int, const cuDoubleComplex *, const cuDoubleComplex *,
-      cuDoubleComplex *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZbsrxmv");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, sizeOfMask, mb, nb, nnzb, alpha, descrA,
-                  bsrSortedValA, bsrSortedMaskPtrA, bsrSortedRowPtrA,
-                  bsrSortedEndPtrA, bsrSortedColIndA, blockDim, x, beta, y);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCsrsv_analysisEx(
-    cusparseHandle_t handle, cusparseOperation_t transA, int m, int nnz,
-    const cusparseMatDescr_t descrA, const void *csrSortedValA,
-    cudaDataType csrSortedValAtype, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, cusparseSolveAnalysisInfo_t info,
-    cudaDataType executiontype) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, const cusparseMatDescr_t,
-      const void *, cudaDataType, const int *, const int *,
-      cusparseSolveAnalysisInfo_t, cudaDataType);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCsrsv_analysisEx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, nnz, descrA, csrSortedValA,
-                  csrSortedValAtype, csrSortedRowPtrA, csrSortedColIndA, info,
-                  executiontype);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsrsv_analysis(
-    cusparseHandle_t handle, cusparseOperation_t transA, int m, int nnz,
-    const cusparseMatDescr_t descrA, const float *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-    cusparseSolveAnalysisInfo_t info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, const cusparseMatDescr_t,
-      const float *, const int *, const int *, cusparseSolveAnalysisInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsrsv_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, nnz, descrA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, info);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsrsv_analysis(
-    cusparseHandle_t handle, cusparseOperation_t transA, int m, int nnz,
-    const cusparseMatDescr_t descrA, const double *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-    cusparseSolveAnalysisInfo_t info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, const cusparseMatDescr_t,
-      const double *, const int *, const int *, cusparseSolveAnalysisInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsrsv_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, nnz, descrA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, info);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsrsv_analysis(
-    cusparseHandle_t handle, cusparseOperation_t transA, int m, int nnz,
-    const cusparseMatDescr_t descrA, const cuComplex *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-    cusparseSolveAnalysisInfo_t info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, const cusparseMatDescr_t,
-      const cuComplex *, const int *, const int *, cusparseSolveAnalysisInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsrsv_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, nnz, descrA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, info);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsrsv_analysis(
-    cusparseHandle_t handle, cusparseOperation_t transA, int m, int nnz,
-    const cusparseMatDescr_t descrA, const cuDoubleComplex *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-    cusparseSolveAnalysisInfo_t info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, const cusparseMatDescr_t,
-      const cuDoubleComplex *, const int *, const int *,
-      cusparseSolveAnalysisInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsrsv_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, nnz, descrA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, info);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCsrsv_solveEx(
-    cusparseHandle_t handle, cusparseOperation_t transA, int m,
-    const void *alpha, cudaDataType alphatype, const cusparseMatDescr_t descrA,
-    const void *csrSortedValA, cudaDataType csrSortedValAtype,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-    cusparseSolveAnalysisInfo_t info, const void *f, cudaDataType ftype,
-    void *x, cudaDataType xtype, cudaDataType executiontype) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, const void *, cudaDataType,
-      const cusparseMatDescr_t, const void *, cudaDataType, const int *,
-      const int *, cusparseSolveAnalysisInfo_t, const void *, cudaDataType,
-      void *, cudaDataType, cudaDataType);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCsrsv_solveEx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, alpha, alphatype, descrA, csrSortedValA,
-                  csrSortedValAtype, csrSortedRowPtrA, csrSortedColIndA, info,
-                  f, ftype, x, xtype, executiontype);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsrsv_solve(
-    cusparseHandle_t handle, cusparseOperation_t transA, int m,
-    const float *alpha, const cusparseMatDescr_t descrA,
-    const float *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, cusparseSolveAnalysisInfo_t info,
-    const float *f, float *x) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, const float *,
-      const cusparseMatDescr_t, const float *, const int *, const int *,
-      cusparseSolveAnalysisInfo_t, const float *, float *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsrsv_solve");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, alpha, descrA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, info, f, x);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsrsv_solve(
-    cusparseHandle_t handle, cusparseOperation_t transA, int m,
-    const double *alpha, const cusparseMatDescr_t descrA,
-    const double *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, cusparseSolveAnalysisInfo_t info,
-    const double *f, double *x) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, const double *,
-      const cusparseMatDescr_t, const double *, const int *, const int *,
-      cusparseSolveAnalysisInfo_t, const double *, double *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsrsv_solve");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, alpha, descrA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, info, f, x);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsrsv_solve(
-    cusparseHandle_t handle, cusparseOperation_t transA, int m,
-    const cuComplex *alpha, const cusparseMatDescr_t descrA,
-    const cuComplex *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, cusparseSolveAnalysisInfo_t info,
-    const cuComplex *f, cuComplex *x) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, const cuComplex *,
-      const cusparseMatDescr_t, const cuComplex *, const int *, const int *,
-      cusparseSolveAnalysisInfo_t, const cuComplex *, cuComplex *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsrsv_solve");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, alpha, descrA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, info, f, x);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsrsv_solve(
-    cusparseHandle_t handle, cusparseOperation_t transA, int m,
-    const cuDoubleComplex *alpha, const cusparseMatDescr_t descrA,
-    const cuDoubleComplex *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, cusparseSolveAnalysisInfo_t info,
-    const cuDoubleComplex *f, cuDoubleComplex *x) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, const cuDoubleComplex *,
-      const cusparseMatDescr_t, const cuDoubleComplex *, const int *,
-      const int *, cusparseSolveAnalysisInfo_t, const cuDoubleComplex *,
-      cuDoubleComplex *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsrsv_solve");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, alpha, descrA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, info, f, x);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseXcsrsv2_zeroPivot(cusparseHandle_t handle,
-                                                       csrsv2Info_t info,
-                                                       int *position) {
-  using FuncPtr =
-      cusparseStatus_t(CUSPARSEAPI *)(cusparseHandle_t, csrsv2Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseXcsrsv2_zeroPivot");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, info, position);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsrsv2_bufferSize(
-    cusparseHandle_t handle, cusparseOperation_t transA, int m, int nnz,
-    const cusparseMatDescr_t descrA, float *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA, csrsv2Info_t info,
-    int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, const cusparseMatDescr_t,
-      float *, const int *, const int *, csrsv2Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsrsv2_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, nnz, descrA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, info, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsrsv2_bufferSize(
-    cusparseHandle_t handle, cusparseOperation_t transA, int m, int nnz,
-    const cusparseMatDescr_t descrA, double *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA, csrsv2Info_t info,
-    int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, const cusparseMatDescr_t,
-      double *, const int *, const int *, csrsv2Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsrsv2_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, nnz, descrA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, info, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsrsv2_bufferSize(
-    cusparseHandle_t handle, cusparseOperation_t transA, int m, int nnz,
-    const cusparseMatDescr_t descrA, cuComplex *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA, csrsv2Info_t info,
-    int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, const cusparseMatDescr_t,
-      cuComplex *, const int *, const int *, csrsv2Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsrsv2_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, nnz, descrA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, info, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsrsv2_bufferSize(
-    cusparseHandle_t handle, cusparseOperation_t transA, int m, int nnz,
-    const cusparseMatDescr_t descrA, cuDoubleComplex *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA, csrsv2Info_t info,
-    int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, const cusparseMatDescr_t,
-      cuDoubleComplex *, const int *, const int *, csrsv2Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsrsv2_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, nnz, descrA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, info, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsrsv2_bufferSizeExt(
-    cusparseHandle_t handle, cusparseOperation_t transA, int m, int nnz,
-    const cusparseMatDescr_t descrA, float *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA, csrsv2Info_t info,
-    size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, const cusparseMatDescr_t,
-      float *, const int *, const int *, csrsv2Info_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsrsv2_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, nnz, descrA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, info, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsrsv2_bufferSizeExt(
-    cusparseHandle_t handle, cusparseOperation_t transA, int m, int nnz,
-    const cusparseMatDescr_t descrA, double *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA, csrsv2Info_t info,
-    size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, const cusparseMatDescr_t,
-      double *, const int *, const int *, csrsv2Info_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsrsv2_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, nnz, descrA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, info, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsrsv2_bufferSizeExt(
-    cusparseHandle_t handle, cusparseOperation_t transA, int m, int nnz,
-    const cusparseMatDescr_t descrA, cuComplex *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA, csrsv2Info_t info,
-    size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, const cusparseMatDescr_t,
-      cuComplex *, const int *, const int *, csrsv2Info_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsrsv2_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, nnz, descrA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, info, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsrsv2_bufferSizeExt(
-    cusparseHandle_t handle, cusparseOperation_t transA, int m, int nnz,
-    const cusparseMatDescr_t descrA, cuDoubleComplex *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA, csrsv2Info_t info,
-    size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, const cusparseMatDescr_t,
-      cuDoubleComplex *, const int *, const int *, csrsv2Info_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsrsv2_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, nnz, descrA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, info, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsrsv2_analysis(
-    cusparseHandle_t handle, cusparseOperation_t transA, int m, int nnz,
-    const cusparseMatDescr_t descrA, const float *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA, csrsv2Info_t info,
-    cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, const cusparseMatDescr_t,
-      const float *, const int *, const int *, csrsv2Info_t,
-      cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsrsv2_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, nnz, descrA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsrsv2_analysis(
-    cusparseHandle_t handle, cusparseOperation_t transA, int m, int nnz,
-    const cusparseMatDescr_t descrA, const double *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA, csrsv2Info_t info,
-    cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, const cusparseMatDescr_t,
-      const double *, const int *, const int *, csrsv2Info_t,
-      cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsrsv2_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, nnz, descrA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsrsv2_analysis(
-    cusparseHandle_t handle, cusparseOperation_t transA, int m, int nnz,
-    const cusparseMatDescr_t descrA, const cuComplex *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA, csrsv2Info_t info,
-    cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, const cusparseMatDescr_t,
-      const cuComplex *, const int *, const int *, csrsv2Info_t,
-      cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsrsv2_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, nnz, descrA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsrsv2_analysis(
-    cusparseHandle_t handle, cusparseOperation_t transA, int m, int nnz,
-    const cusparseMatDescr_t descrA, const cuDoubleComplex *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA, csrsv2Info_t info,
-    cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, const cusparseMatDescr_t,
-      const cuDoubleComplex *, const int *, const int *, csrsv2Info_t,
-      cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsrsv2_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, nnz, descrA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsrsv2_solve(
-    cusparseHandle_t handle, cusparseOperation_t transA, int m, int nnz,
-    const float *alpha, const cusparseMatDescr_t descrA,
-    const float *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, csrsv2Info_t info, const float *f, float *x,
-    cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, const float *,
-      const cusparseMatDescr_t, const float *, const int *, const int *,
-      csrsv2Info_t, const float *, float *, cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsrsv2_solve");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, nnz, alpha, descrA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, info, f, x, policy,
-                  pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsrsv2_solve(
-    cusparseHandle_t handle, cusparseOperation_t transA, int m, int nnz,
-    const double *alpha, const cusparseMatDescr_t descrA,
-    const double *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, csrsv2Info_t info, const double *f, double *x,
-    cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, const double *,
-      const cusparseMatDescr_t, const double *, const int *, const int *,
-      csrsv2Info_t, const double *, double *, cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsrsv2_solve");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, nnz, alpha, descrA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, info, f, x, policy,
-                  pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsrsv2_solve(
-    cusparseHandle_t handle, cusparseOperation_t transA, int m, int nnz,
-    const cuComplex *alpha, const cusparseMatDescr_t descrA,
-    const cuComplex *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, csrsv2Info_t info, const cuComplex *f,
-    cuComplex *x, cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, const cuComplex *,
-      const cusparseMatDescr_t, const cuComplex *, const int *, const int *,
-      csrsv2Info_t, const cuComplex *, cuComplex *, cusparseSolvePolicy_t,
-      void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsrsv2_solve");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, nnz, alpha, descrA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, info, f, x, policy,
-                  pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsrsv2_solve(
-    cusparseHandle_t handle, cusparseOperation_t transA, int m, int nnz,
-    const cuDoubleComplex *alpha, const cusparseMatDescr_t descrA,
-    const cuDoubleComplex *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, csrsv2Info_t info, const cuDoubleComplex *f,
-    cuDoubleComplex *x, cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, const cuDoubleComplex *,
-      const cusparseMatDescr_t, const cuDoubleComplex *, const int *,
-      const int *, csrsv2Info_t, const cuDoubleComplex *, cuDoubleComplex *,
-      cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsrsv2_solve");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, nnz, alpha, descrA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, info, f, x, policy,
-                  pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseXbsrsv2_zeroPivot(cusparseHandle_t handle,
-                                                       bsrsv2Info_t info,
-                                                       int *position) {
-  using FuncPtr =
-      cusparseStatus_t(CUSPARSEAPI *)(cusparseHandle_t, bsrsv2Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseXbsrsv2_zeroPivot");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, info, position);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSbsrsv2_bufferSize(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, float *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int blockDim,
-    bsrsv2Info_t info, int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, int, int,
-      const cusparseMatDescr_t, float *, const int *, const int *, int,
-      bsrsv2Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSbsrsv2_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, mb, nnzb, descrA, bsrSortedValA,
-                  bsrSortedRowPtrA, bsrSortedColIndA, blockDim, info,
-                  pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDbsrsv2_bufferSize(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, double *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int blockDim,
-    bsrsv2Info_t info, int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, int, int,
-      const cusparseMatDescr_t, double *, const int *, const int *, int,
-      bsrsv2Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDbsrsv2_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, mb, nnzb, descrA, bsrSortedValA,
-                  bsrSortedRowPtrA, bsrSortedColIndA, blockDim, info,
-                  pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCbsrsv2_bufferSize(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, cuComplex *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int blockDim,
-    bsrsv2Info_t info, int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, int, int,
-      const cusparseMatDescr_t, cuComplex *, const int *, const int *, int,
-      bsrsv2Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCbsrsv2_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, mb, nnzb, descrA, bsrSortedValA,
-                  bsrSortedRowPtrA, bsrSortedColIndA, blockDim, info,
-                  pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZbsrsv2_bufferSize(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, cuDoubleComplex *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int blockDim,
-    bsrsv2Info_t info, int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, int, int,
-      const cusparseMatDescr_t, cuDoubleComplex *, const int *, const int *,
-      int, bsrsv2Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZbsrsv2_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, mb, nnzb, descrA, bsrSortedValA,
-                  bsrSortedRowPtrA, bsrSortedColIndA, blockDim, info,
-                  pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSbsrsv2_bufferSizeExt(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, float *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int blockSize,
-    bsrsv2Info_t info, size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, int, int,
-      const cusparseMatDescr_t, float *, const int *, const int *, int,
-      bsrsv2Info_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSbsrsv2_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, mb, nnzb, descrA, bsrSortedValA,
-                  bsrSortedRowPtrA, bsrSortedColIndA, blockSize, info,
-                  pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDbsrsv2_bufferSizeExt(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, double *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int blockSize,
-    bsrsv2Info_t info, size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, int, int,
-      const cusparseMatDescr_t, double *, const int *, const int *, int,
-      bsrsv2Info_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDbsrsv2_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, mb, nnzb, descrA, bsrSortedValA,
-                  bsrSortedRowPtrA, bsrSortedColIndA, blockSize, info,
-                  pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCbsrsv2_bufferSizeExt(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, cuComplex *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int blockSize,
-    bsrsv2Info_t info, size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, int, int,
-      const cusparseMatDescr_t, cuComplex *, const int *, const int *, int,
-      bsrsv2Info_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCbsrsv2_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, mb, nnzb, descrA, bsrSortedValA,
-                  bsrSortedRowPtrA, bsrSortedColIndA, blockSize, info,
-                  pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZbsrsv2_bufferSizeExt(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, cuDoubleComplex *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int blockSize,
-    bsrsv2Info_t info, size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, int, int,
-      const cusparseMatDescr_t, cuDoubleComplex *, const int *, const int *,
-      int, bsrsv2Info_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZbsrsv2_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, mb, nnzb, descrA, bsrSortedValA,
-                  bsrSortedRowPtrA, bsrSortedColIndA, blockSize, info,
-                  pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSbsrsv2_analysis(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, const float *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int blockDim,
-    bsrsv2Info_t info, cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, int, int,
-      const cusparseMatDescr_t, const float *, const int *, const int *, int,
-      bsrsv2Info_t, cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSbsrsv2_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, mb, nnzb, descrA, bsrSortedValA,
-                  bsrSortedRowPtrA, bsrSortedColIndA, blockDim, info, policy,
-                  pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDbsrsv2_analysis(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, const double *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int blockDim,
-    bsrsv2Info_t info, cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, int, int,
-      const cusparseMatDescr_t, const double *, const int *, const int *, int,
-      bsrsv2Info_t, cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDbsrsv2_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, mb, nnzb, descrA, bsrSortedValA,
-                  bsrSortedRowPtrA, bsrSortedColIndA, blockDim, info, policy,
-                  pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCbsrsv2_analysis(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, const cuComplex *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int blockDim,
-    bsrsv2Info_t info, cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, int, int,
-      const cusparseMatDescr_t, const cuComplex *, const int *, const int *,
-      int, bsrsv2Info_t, cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCbsrsv2_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, mb, nnzb, descrA, bsrSortedValA,
-                  bsrSortedRowPtrA, bsrSortedColIndA, blockDim, info, policy,
-                  pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZbsrsv2_analysis(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, const cuDoubleComplex *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int blockDim,
-    bsrsv2Info_t info, cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, int, int,
-      const cusparseMatDescr_t, const cuDoubleComplex *, const int *,
-      const int *, int, bsrsv2Info_t, cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZbsrsv2_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, mb, nnzb, descrA, bsrSortedValA,
-                  bsrSortedRowPtrA, bsrSortedColIndA, blockDim, info, policy,
-                  pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSbsrsv2_solve(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, int mb, int nnzb, const float *alpha,
-    const cusparseMatDescr_t descrA, const float *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int blockDim,
-    bsrsv2Info_t info, const float *f, float *x, cusparseSolvePolicy_t policy,
-    void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, int, int,
-      const float *, const cusparseMatDescr_t, const float *, const int *,
-      const int *, int, bsrsv2Info_t, const float *, float *,
-      cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSbsrsv2_solve");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, mb, nnzb, alpha, descrA, bsrSortedValA,
-                  bsrSortedRowPtrA, bsrSortedColIndA, blockDim, info, f, x,
-                  policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDbsrsv2_solve(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, int mb, int nnzb, const double *alpha,
-    const cusparseMatDescr_t descrA, const double *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int blockDim,
-    bsrsv2Info_t info, const double *f, double *x, cusparseSolvePolicy_t policy,
-    void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, int, int,
-      const double *, const cusparseMatDescr_t, const double *, const int *,
-      const int *, int, bsrsv2Info_t, const double *, double *,
-      cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDbsrsv2_solve");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, mb, nnzb, alpha, descrA, bsrSortedValA,
-                  bsrSortedRowPtrA, bsrSortedColIndA, blockDim, info, f, x,
-                  policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCbsrsv2_solve(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, int mb, int nnzb, const cuComplex *alpha,
-    const cusparseMatDescr_t descrA, const cuComplex *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int blockDim,
-    bsrsv2Info_t info, const cuComplex *f, cuComplex *x,
-    cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, int, int,
-      const cuComplex *, const cusparseMatDescr_t, const cuComplex *,
-      const int *, const int *, int, bsrsv2Info_t, const cuComplex *,
-      cuComplex *, cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCbsrsv2_solve");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, mb, nnzb, alpha, descrA, bsrSortedValA,
-                  bsrSortedRowPtrA, bsrSortedColIndA, blockDim, info, f, x,
-                  policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZbsrsv2_solve(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, int mb, int nnzb, const cuDoubleComplex *alpha,
-    const cusparseMatDescr_t descrA, const cuDoubleComplex *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int blockDim,
-    bsrsv2Info_t info, const cuDoubleComplex *f, cuDoubleComplex *x,
-    cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, int, int,
-      const cuDoubleComplex *, const cusparseMatDescr_t,
-      const cuDoubleComplex *, const int *, const int *, int, bsrsv2Info_t,
-      const cuDoubleComplex *, cuDoubleComplex *, cusparseSolvePolicy_t,
-      void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZbsrsv2_solve");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, mb, nnzb, alpha, descrA, bsrSortedValA,
-                  bsrSortedRowPtrA, bsrSortedColIndA, blockDim, info, f, x,
-                  policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseShybsv_analysis(cusparseHandle_t handle, cusparseOperation_t transA,
-                        const cusparseMatDescr_t descrA, cusparseHybMat_t hybA,
-                        cusparseSolveAnalysisInfo_t info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, const cusparseMatDescr_t,
-      cusparseHybMat_t, cusparseSolveAnalysisInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseShybsv_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, descrA, hybA, info);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseDhybsv_analysis(cusparseHandle_t handle, cusparseOperation_t transA,
-                        const cusparseMatDescr_t descrA, cusparseHybMat_t hybA,
-                        cusparseSolveAnalysisInfo_t info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, const cusparseMatDescr_t,
-      cusparseHybMat_t, cusparseSolveAnalysisInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDhybsv_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, descrA, hybA, info);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseChybsv_analysis(cusparseHandle_t handle, cusparseOperation_t transA,
-                        const cusparseMatDescr_t descrA, cusparseHybMat_t hybA,
-                        cusparseSolveAnalysisInfo_t info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, const cusparseMatDescr_t,
-      cusparseHybMat_t, cusparseSolveAnalysisInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseChybsv_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, descrA, hybA, info);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseZhybsv_analysis(cusparseHandle_t handle, cusparseOperation_t transA,
-                        const cusparseMatDescr_t descrA, cusparseHybMat_t hybA,
-                        cusparseSolveAnalysisInfo_t info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, const cusparseMatDescr_t,
-      cusparseHybMat_t, cusparseSolveAnalysisInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZhybsv_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, descrA, hybA, info);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseShybsv_solve(
-    cusparseHandle_t handle, cusparseOperation_t trans, const float *alpha,
-    const cusparseMatDescr_t descrA, const cusparseHybMat_t hybA,
-    cusparseSolveAnalysisInfo_t info, const float *f, float *x) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, const float *,
-      const cusparseMatDescr_t, const cusparseHybMat_t,
-      cusparseSolveAnalysisInfo_t, const float *, float *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseShybsv_solve");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, trans, alpha, descrA, hybA, info, f, x);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseChybsv_solve(
-    cusparseHandle_t handle, cusparseOperation_t trans, const cuComplex *alpha,
-    const cusparseMatDescr_t descrA, const cusparseHybMat_t hybA,
-    cusparseSolveAnalysisInfo_t info, const cuComplex *f, cuComplex *x) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, const cuComplex *,
-      const cusparseMatDescr_t, const cusparseHybMat_t,
-      cusparseSolveAnalysisInfo_t, const cuComplex *, cuComplex *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseChybsv_solve");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, trans, alpha, descrA, hybA, info, f, x);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDhybsv_solve(
-    cusparseHandle_t handle, cusparseOperation_t trans, const double *alpha,
-    const cusparseMatDescr_t descrA, const cusparseHybMat_t hybA,
-    cusparseSolveAnalysisInfo_t info, const double *f, double *x) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, const double *,
-      const cusparseMatDescr_t, const cusparseHybMat_t,
-      cusparseSolveAnalysisInfo_t, const double *, double *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDhybsv_solve");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, trans, alpha, descrA, hybA, info, f, x);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZhybsv_solve(
-    cusparseHandle_t handle, cusparseOperation_t trans,
-    const cuDoubleComplex *alpha, const cusparseMatDescr_t descrA,
-    const cusparseHybMat_t hybA, cusparseSolveAnalysisInfo_t info,
-    const cuDoubleComplex *f, cuDoubleComplex *x) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, const cuDoubleComplex *,
-      const cusparseMatDescr_t, const cusparseHybMat_t,
-      cusparseSolveAnalysisInfo_t, const cuDoubleComplex *, cuDoubleComplex *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZhybsv_solve");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, trans, alpha, descrA, hybA, info, f, x);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseScsrmm(cusparseHandle_t handle, cusparseOperation_t transA, int m,
-               int n, int k, int nnz, const float *alpha,
-               const cusparseMatDescr_t descrA, const float *csrSortedValA,
-               const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-               const float *B, int ldb, const float *beta, float *C, int ldc) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, int, int, const float *,
-      const cusparseMatDescr_t, const float *, const int *, const int *,
-      const float *, int, const float *, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsrmm");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, n, k, nnz, alpha, descrA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, B, ldb, beta, C, ldc);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsrmm(
-    cusparseHandle_t handle, cusparseOperation_t transA, int m, int n, int k,
-    int nnz, const double *alpha, const cusparseMatDescr_t descrA,
-    const double *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, const double *B, int ldb, const double *beta,
-    double *C, int ldc) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, int, int, const double *,
-      const cusparseMatDescr_t, const double *, const int *, const int *,
-      const double *, int, const double *, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsrmm");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, n, k, nnz, alpha, descrA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, B, ldb, beta, C, ldc);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsrmm(
-    cusparseHandle_t handle, cusparseOperation_t transA, int m, int n, int k,
-    int nnz, const cuComplex *alpha, const cusparseMatDescr_t descrA,
-    const cuComplex *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, const cuComplex *B, int ldb,
-    const cuComplex *beta, cuComplex *C, int ldc) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, int, int,
-      const cuComplex *, const cusparseMatDescr_t, const cuComplex *,
-      const int *, const int *, const cuComplex *, int, const cuComplex *,
-      cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsrmm");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, n, k, nnz, alpha, descrA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, B, ldb, beta, C, ldc);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsrmm(
-    cusparseHandle_t handle, cusparseOperation_t transA, int m, int n, int k,
-    int nnz, const cuDoubleComplex *alpha, const cusparseMatDescr_t descrA,
-    const cuDoubleComplex *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, const cuDoubleComplex *B, int ldb,
-    const cuDoubleComplex *beta, cuDoubleComplex *C, int ldc) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, int, int,
-      const cuDoubleComplex *, const cusparseMatDescr_t,
-      const cuDoubleComplex *, const int *, const int *,
-      const cuDoubleComplex *, int, const cuDoubleComplex *, cuDoubleComplex *,
-      int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsrmm");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, n, k, nnz, alpha, descrA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, B, ldb, beta, C, ldc);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseScsrmm2(cusparseHandle_t handle, cusparseOperation_t transA,
-                cusparseOperation_t transB, int m, int n, int k, int nnz,
-                const float *alpha, const cusparseMatDescr_t descrA,
-                const float *csrSortedValA, const int *csrSortedRowPtrA,
-                const int *csrSortedColIndA, const float *B, int ldb,
-                const float *beta, float *C, int ldc) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, cusparseOperation_t, int, int, int,
-      int, const float *, const cusparseMatDescr_t, const float *, const int *,
-      const int *, const float *, int, const float *, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsrmm2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, transB, m, n, k, nnz, alpha, descrA,
-                  csrSortedValA, csrSortedRowPtrA, csrSortedColIndA, B, ldb,
-                  beta, C, ldc);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseDcsrmm2(cusparseHandle_t handle, cusparseOperation_t transA,
-                cusparseOperation_t transB, int m, int n, int k, int nnz,
-                const double *alpha, const cusparseMatDescr_t descrA,
-                const double *csrSortedValA, const int *csrSortedRowPtrA,
-                const int *csrSortedColIndA, const double *B, int ldb,
-                const double *beta, double *C, int ldc) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, cusparseOperation_t, int, int, int,
-      int, const double *, const cusparseMatDescr_t, const double *,
-      const int *, const int *, const double *, int, const double *, double *,
-      int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsrmm2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, transB, m, n, k, nnz, alpha, descrA,
-                  csrSortedValA, csrSortedRowPtrA, csrSortedColIndA, B, ldb,
-                  beta, C, ldc);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseCcsrmm2(cusparseHandle_t handle, cusparseOperation_t transA,
-                cusparseOperation_t transB, int m, int n, int k, int nnz,
-                const cuComplex *alpha, const cusparseMatDescr_t descrA,
-                const cuComplex *csrSortedValA, const int *csrSortedRowPtrA,
-                const int *csrSortedColIndA, const cuComplex *B, int ldb,
-                const cuComplex *beta, cuComplex *C, int ldc) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, cusparseOperation_t, int, int, int,
-      int, const cuComplex *, const cusparseMatDescr_t, const cuComplex *,
-      const int *, const int *, const cuComplex *, int, const cuComplex *,
-      cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsrmm2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, transB, m, n, k, nnz, alpha, descrA,
-                  csrSortedValA, csrSortedRowPtrA, csrSortedColIndA, B, ldb,
-                  beta, C, ldc);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsrmm2(
-    cusparseHandle_t handle, cusparseOperation_t transA,
-    cusparseOperation_t transB, int m, int n, int k, int nnz,
-    const cuDoubleComplex *alpha, const cusparseMatDescr_t descrA,
-    const cuDoubleComplex *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, const cuDoubleComplex *B, int ldb,
-    const cuDoubleComplex *beta, cuDoubleComplex *C, int ldc) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, cusparseOperation_t, int, int, int,
-      int, const cuDoubleComplex *, const cusparseMatDescr_t,
-      const cuDoubleComplex *, const int *, const int *,
-      const cuDoubleComplex *, int, const cuDoubleComplex *, cuDoubleComplex *,
-      int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsrmm2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, transB, m, n, k, nnz, alpha, descrA,
-                  csrSortedValA, csrSortedRowPtrA, csrSortedColIndA, B, ldb,
-                  beta, C, ldc);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSbsrmm(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, cusparseOperation_t transB, int mb, int n,
-    int kb, int nnzb, const float *alpha, const cusparseMatDescr_t descrA,
-    const float *bsrSortedValA, const int *bsrSortedRowPtrA,
-    const int *bsrSortedColIndA, const int blockSize, const float *B,
-    const int ldb, const float *beta, float *C, int ldc) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t,
-      cusparseOperation_t, int, int, int, int, const float *,
-      const cusparseMatDescr_t, const float *, const int *, const int *,
-      const int, const float *, const int, const float *, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSbsrmm");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, transB, mb, n, kb, nnzb, alpha, descrA,
-                  bsrSortedValA, bsrSortedRowPtrA, bsrSortedColIndA, blockSize,
-                  B, ldb, beta, C, ldc);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDbsrmm(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, cusparseOperation_t transB, int mb, int n,
-    int kb, int nnzb, const double *alpha, const cusparseMatDescr_t descrA,
-    const double *bsrSortedValA, const int *bsrSortedRowPtrA,
-    const int *bsrSortedColIndA, const int blockSize, const double *B,
-    const int ldb, const double *beta, double *C, int ldc) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t,
-      cusparseOperation_t, int, int, int, int, const double *,
-      const cusparseMatDescr_t, const double *, const int *, const int *,
-      const int, const double *, const int, const double *, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDbsrmm");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, transB, mb, n, kb, nnzb, alpha, descrA,
-                  bsrSortedValA, bsrSortedRowPtrA, bsrSortedColIndA, blockSize,
-                  B, ldb, beta, C, ldc);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCbsrmm(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, cusparseOperation_t transB, int mb, int n,
-    int kb, int nnzb, const cuComplex *alpha, const cusparseMatDescr_t descrA,
-    const cuComplex *bsrSortedValA, const int *bsrSortedRowPtrA,
-    const int *bsrSortedColIndA, const int blockSize, const cuComplex *B,
-    const int ldb, const cuComplex *beta, cuComplex *C, int ldc) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t,
-      cusparseOperation_t, int, int, int, int, const cuComplex *,
-      const cusparseMatDescr_t, const cuComplex *, const int *, const int *,
-      const int, const cuComplex *, const int, const cuComplex *, cuComplex *,
-      int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCbsrmm");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, transB, mb, n, kb, nnzb, alpha, descrA,
-                  bsrSortedValA, bsrSortedRowPtrA, bsrSortedColIndA, blockSize,
-                  B, ldb, beta, C, ldc);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZbsrmm(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, cusparseOperation_t transB, int mb, int n,
-    int kb, int nnzb, const cuDoubleComplex *alpha,
-    const cusparseMatDescr_t descrA, const cuDoubleComplex *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA,
-    const int blockSize, const cuDoubleComplex *B, const int ldb,
-    const cuDoubleComplex *beta, cuDoubleComplex *C, int ldc) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t,
-      cusparseOperation_t, int, int, int, int, const cuDoubleComplex *,
-      const cusparseMatDescr_t, const cuDoubleComplex *, const int *,
-      const int *, const int, const cuDoubleComplex *, const int,
-      const cuDoubleComplex *, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZbsrmm");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, transB, mb, n, kb, nnzb, alpha, descrA,
-                  bsrSortedValA, bsrSortedRowPtrA, bsrSortedColIndA, blockSize,
-                  B, ldb, beta, C, ldc);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSgemmi(
-    cusparseHandle_t handle, int m, int n, int k, int nnz, const float *alpha,
-    const float *A, int lda, const float *cscValB, const int *cscColPtrB,
-    const int *cscRowIndB, const float *beta, float *C, int ldc) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, int, const float *, const float *, int,
-      const float *, const int *, const int *, const float *, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSgemmi");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, k, nnz, alpha, A, lda, cscValB, cscColPtrB,
-                  cscRowIndB, beta, C, ldc);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDgemmi(
-    cusparseHandle_t handle, int m, int n, int k, int nnz, const double *alpha,
-    const double *A, int lda, const double *cscValB, const int *cscColPtrB,
-    const int *cscRowIndB, const double *beta, double *C, int ldc) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, int, const double *, const double *, int,
-      const double *, const int *, const int *, const double *, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDgemmi");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, k, nnz, alpha, A, lda, cscValB, cscColPtrB,
-                  cscRowIndB, beta, C, ldc);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCgemmi(
-    cusparseHandle_t handle, int m, int n, int k, int nnz,
-    const cuComplex *alpha, const cuComplex *A, int lda,
-    const cuComplex *cscValB, const int *cscColPtrB, const int *cscRowIndB,
-    const cuComplex *beta, cuComplex *C, int ldc) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, int, const cuComplex *,
-      const cuComplex *, int, const cuComplex *, const int *, const int *,
-      const cuComplex *, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCgemmi");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, k, nnz, alpha, A, lda, cscValB, cscColPtrB,
-                  cscRowIndB, beta, C, ldc);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseZgemmi(cusparseHandle_t handle, int m, int n, int k, int nnz,
-               const cuDoubleComplex *alpha, const cuDoubleComplex *A, int lda,
-               const cuDoubleComplex *cscValB, const int *cscColPtrB,
-               const int *cscRowIndB, const cuDoubleComplex *beta,
-               cuDoubleComplex *C, int ldc) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, int, const cuDoubleComplex *,
-      const cuDoubleComplex *, int, const cuDoubleComplex *, const int *,
-      const int *, const cuDoubleComplex *, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZgemmi");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, k, nnz, alpha, A, lda, cscValB, cscColPtrB,
-                  cscRowIndB, beta, C, ldc);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsrsm_analysis(
-    cusparseHandle_t handle, cusparseOperation_t transA, int m, int nnz,
-    const cusparseMatDescr_t descrA, const float *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-    cusparseSolveAnalysisInfo_t info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, const cusparseMatDescr_t,
-      const float *, const int *, const int *, cusparseSolveAnalysisInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsrsm_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, nnz, descrA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, info);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsrsm_analysis(
-    cusparseHandle_t handle, cusparseOperation_t transA, int m, int nnz,
-    const cusparseMatDescr_t descrA, const double *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-    cusparseSolveAnalysisInfo_t info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, const cusparseMatDescr_t,
-      const double *, const int *, const int *, cusparseSolveAnalysisInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsrsm_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, nnz, descrA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, info);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsrsm_analysis(
-    cusparseHandle_t handle, cusparseOperation_t transA, int m, int nnz,
-    const cusparseMatDescr_t descrA, const cuComplex *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-    cusparseSolveAnalysisInfo_t info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, const cusparseMatDescr_t,
-      const cuComplex *, const int *, const int *, cusparseSolveAnalysisInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsrsm_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, nnz, descrA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, info);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsrsm_analysis(
-    cusparseHandle_t handle, cusparseOperation_t transA, int m, int nnz,
-    const cusparseMatDescr_t descrA, const cuDoubleComplex *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-    cusparseSolveAnalysisInfo_t info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, const cusparseMatDescr_t,
-      const cuDoubleComplex *, const int *, const int *,
-      cusparseSolveAnalysisInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsrsm_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, nnz, descrA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, info);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsrsm_solve(
-    cusparseHandle_t handle, cusparseOperation_t transA, int m, int n,
-    const float *alpha, const cusparseMatDescr_t descrA,
-    const float *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, cusparseSolveAnalysisInfo_t info,
-    const float *B, int ldb, float *X, int ldx) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, const float *,
-      const cusparseMatDescr_t, const float *, const int *, const int *,
-      cusparseSolveAnalysisInfo_t, const float *, int, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsrsm_solve");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, n, alpha, descrA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, info, B, ldb, X, ldx);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsrsm_solve(
-    cusparseHandle_t handle, cusparseOperation_t transA, int m, int n,
-    const double *alpha, const cusparseMatDescr_t descrA,
-    const double *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, cusparseSolveAnalysisInfo_t info,
-    const double *B, int ldb, double *X, int ldx) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, const double *,
-      const cusparseMatDescr_t, const double *, const int *, const int *,
-      cusparseSolveAnalysisInfo_t, const double *, int, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsrsm_solve");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, n, alpha, descrA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, info, B, ldb, X, ldx);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsrsm_solve(
-    cusparseHandle_t handle, cusparseOperation_t transA, int m, int n,
-    const cuComplex *alpha, const cusparseMatDescr_t descrA,
-    const cuComplex *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, cusparseSolveAnalysisInfo_t info,
-    const cuComplex *B, int ldb, cuComplex *X, int ldx) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, const cuComplex *,
-      const cusparseMatDescr_t, const cuComplex *, const int *, const int *,
-      cusparseSolveAnalysisInfo_t, const cuComplex *, int, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsrsm_solve");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, n, alpha, descrA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, info, B, ldb, X, ldx);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsrsm_solve(
-    cusparseHandle_t handle, cusparseOperation_t transA, int m, int n,
-    const cuDoubleComplex *alpha, const cusparseMatDescr_t descrA,
-    const cuDoubleComplex *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, cusparseSolveAnalysisInfo_t info,
-    const cuDoubleComplex *B, int ldb, cuDoubleComplex *X, int ldx) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, const cuDoubleComplex *,
-      const cusparseMatDescr_t, const cuDoubleComplex *, const int *,
-      const int *, cusparseSolveAnalysisInfo_t, const cuDoubleComplex *, int,
-      cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsrsm_solve");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, n, alpha, descrA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, info, B, ldb, X, ldx);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCreateCsrsm2Info(csrsm2Info_t *info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(csrsm2Info_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCreateCsrsm2Info");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(info);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDestroyCsrsm2Info(csrsm2Info_t info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(csrsm2Info_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDestroyCsrsm2Info");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(info);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseXcsrsm2_zeroPivot(cusparseHandle_t handle,
-                                                       csrsm2Info_t info,
-                                                       int *position) {
-  using FuncPtr =
-      cusparseStatus_t(CUSPARSEAPI *)(cusparseHandle_t, csrsm2Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseXcsrsm2_zeroPivot");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, info, position);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsrsm2_bufferSizeExt(
-    cusparseHandle_t handle, int algo, cusparseOperation_t transA,
-    cusparseOperation_t transB, int m, int nrhs, int nnz, const float *alpha,
-    const cusparseMatDescr_t descrA, const float *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA, const float *B,
-    int ldb, csrsm2Info_t info, cusparseSolvePolicy_t policy,
-    size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, cusparseOperation_t, cusparseOperation_t, int, int,
-      int, const float *, const cusparseMatDescr_t, const float *, const int *,
-      const int *, const float *, int, csrsm2Info_t, cusparseSolvePolicy_t,
-      size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsrsm2_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, algo, transA, transB, m, nrhs, nnz, alpha, descrA,
-                  csrSortedValA, csrSortedRowPtrA, csrSortedColIndA, B, ldb,
-                  info, policy, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsrsm2_bufferSizeExt(
-    cusparseHandle_t handle, int algo, cusparseOperation_t transA,
-    cusparseOperation_t transB, int m, int nrhs, int nnz, const double *alpha,
-    const cusparseMatDescr_t descrA, const double *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA, const double *B,
-    int ldb, csrsm2Info_t info, cusparseSolvePolicy_t policy,
-    size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, cusparseOperation_t, cusparseOperation_t, int, int,
-      int, const double *, const cusparseMatDescr_t, const double *,
-      const int *, const int *, const double *, int, csrsm2Info_t,
-      cusparseSolvePolicy_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsrsm2_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, algo, transA, transB, m, nrhs, nnz, alpha, descrA,
-                  csrSortedValA, csrSortedRowPtrA, csrSortedColIndA, B, ldb,
-                  info, policy, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsrsm2_bufferSizeExt(
-    cusparseHandle_t handle, int algo, cusparseOperation_t transA,
-    cusparseOperation_t transB, int m, int nrhs, int nnz,
-    const cuComplex *alpha, const cusparseMatDescr_t descrA,
-    const cuComplex *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, const cuComplex *B, int ldb, csrsm2Info_t info,
-    cusparseSolvePolicy_t policy, size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, cusparseOperation_t, cusparseOperation_t, int, int,
-      int, const cuComplex *, const cusparseMatDescr_t, const cuComplex *,
-      const int *, const int *, const cuComplex *, int, csrsm2Info_t,
-      cusparseSolvePolicy_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsrsm2_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, algo, transA, transB, m, nrhs, nnz, alpha, descrA,
-                  csrSortedValA, csrSortedRowPtrA, csrSortedColIndA, B, ldb,
-                  info, policy, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsrsm2_bufferSizeExt(
-    cusparseHandle_t handle, int algo, cusparseOperation_t transA,
-    cusparseOperation_t transB, int m, int nrhs, int nnz,
-    const cuDoubleComplex *alpha, const cusparseMatDescr_t descrA,
-    const cuDoubleComplex *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, const cuDoubleComplex *B, int ldb,
-    csrsm2Info_t info, cusparseSolvePolicy_t policy, size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, cusparseOperation_t, cusparseOperation_t, int, int,
-      int, const cuDoubleComplex *, const cusparseMatDescr_t,
-      const cuDoubleComplex *, const int *, const int *,
-      const cuDoubleComplex *, int, csrsm2Info_t, cusparseSolvePolicy_t,
-      size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsrsm2_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, algo, transA, transB, m, nrhs, nnz, alpha, descrA,
-                  csrSortedValA, csrSortedRowPtrA, csrSortedColIndA, B, ldb,
-                  info, policy, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsrsm2_analysis(
-    cusparseHandle_t handle, int algo, cusparseOperation_t transA,
-    cusparseOperation_t transB, int m, int nrhs, int nnz, const float *alpha,
-    const cusparseMatDescr_t descrA, const float *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA, const float *B,
-    int ldb, csrsm2Info_t info, cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, cusparseOperation_t, cusparseOperation_t, int, int,
-      int, const float *, const cusparseMatDescr_t, const float *, const int *,
-      const int *, const float *, int, csrsm2Info_t, cusparseSolvePolicy_t,
-      void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsrsm2_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, algo, transA, transB, m, nrhs, nnz, alpha, descrA,
-                  csrSortedValA, csrSortedRowPtrA, csrSortedColIndA, B, ldb,
-                  info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsrsm2_analysis(
-    cusparseHandle_t handle, int algo, cusparseOperation_t transA,
-    cusparseOperation_t transB, int m, int nrhs, int nnz, const double *alpha,
-    const cusparseMatDescr_t descrA, const double *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA, const double *B,
-    int ldb, csrsm2Info_t info, cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, cusparseOperation_t, cusparseOperation_t, int, int,
-      int, const double *, const cusparseMatDescr_t, const double *,
-      const int *, const int *, const double *, int, csrsm2Info_t,
-      cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsrsm2_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, algo, transA, transB, m, nrhs, nnz, alpha, descrA,
-                  csrSortedValA, csrSortedRowPtrA, csrSortedColIndA, B, ldb,
-                  info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsrsm2_analysis(
-    cusparseHandle_t handle, int algo, cusparseOperation_t transA,
-    cusparseOperation_t transB, int m, int nrhs, int nnz,
-    const cuComplex *alpha, const cusparseMatDescr_t descrA,
-    const cuComplex *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, const cuComplex *B, int ldb, csrsm2Info_t info,
-    cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, cusparseOperation_t, cusparseOperation_t, int, int,
-      int, const cuComplex *, const cusparseMatDescr_t, const cuComplex *,
-      const int *, const int *, const cuComplex *, int, csrsm2Info_t,
-      cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsrsm2_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, algo, transA, transB, m, nrhs, nnz, alpha, descrA,
-                  csrSortedValA, csrSortedRowPtrA, csrSortedColIndA, B, ldb,
-                  info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsrsm2_analysis(
-    cusparseHandle_t handle, int algo, cusparseOperation_t transA,
-    cusparseOperation_t transB, int m, int nrhs, int nnz,
-    const cuDoubleComplex *alpha, const cusparseMatDescr_t descrA,
-    const cuDoubleComplex *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, const cuDoubleComplex *B, int ldb,
-    csrsm2Info_t info, cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, cusparseOperation_t, cusparseOperation_t, int, int,
-      int, const cuDoubleComplex *, const cusparseMatDescr_t,
-      const cuDoubleComplex *, const int *, const int *,
-      const cuDoubleComplex *, int, csrsm2Info_t, cusparseSolvePolicy_t,
-      void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsrsm2_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, algo, transA, transB, m, nrhs, nnz, alpha, descrA,
-                  csrSortedValA, csrSortedRowPtrA, csrSortedColIndA, B, ldb,
-                  info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsrsm2_solve(
-    cusparseHandle_t handle, int algo, cusparseOperation_t transA,
-    cusparseOperation_t transB, int m, int nrhs, int nnz, const float *alpha,
-    const cusparseMatDescr_t descrA, const float *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA, float *B, int ldb,
-    csrsm2Info_t info, cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, cusparseOperation_t, cusparseOperation_t, int, int,
-      int, const float *, const cusparseMatDescr_t, const float *, const int *,
-      const int *, float *, int, csrsm2Info_t, cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsrsm2_solve");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, algo, transA, transB, m, nrhs, nnz, alpha, descrA,
-                  csrSortedValA, csrSortedRowPtrA, csrSortedColIndA, B, ldb,
-                  info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsrsm2_solve(
-    cusparseHandle_t handle, int algo, cusparseOperation_t transA,
-    cusparseOperation_t transB, int m, int nrhs, int nnz, const double *alpha,
-    const cusparseMatDescr_t descrA, const double *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA, double *B,
-    int ldb, csrsm2Info_t info, cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, cusparseOperation_t, cusparseOperation_t, int, int,
-      int, const double *, const cusparseMatDescr_t, const double *,
-      const int *, const int *, double *, int, csrsm2Info_t,
-      cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsrsm2_solve");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, algo, transA, transB, m, nrhs, nnz, alpha, descrA,
-                  csrSortedValA, csrSortedRowPtrA, csrSortedColIndA, B, ldb,
-                  info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsrsm2_solve(
-    cusparseHandle_t handle, int algo, cusparseOperation_t transA,
-    cusparseOperation_t transB, int m, int nrhs, int nnz,
-    const cuComplex *alpha, const cusparseMatDescr_t descrA,
-    const cuComplex *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, cuComplex *B, int ldb, csrsm2Info_t info,
-    cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, cusparseOperation_t, cusparseOperation_t, int, int,
-      int, const cuComplex *, const cusparseMatDescr_t, const cuComplex *,
-      const int *, const int *, cuComplex *, int, csrsm2Info_t,
-      cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsrsm2_solve");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, algo, transA, transB, m, nrhs, nnz, alpha, descrA,
-                  csrSortedValA, csrSortedRowPtrA, csrSortedColIndA, B, ldb,
-                  info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsrsm2_solve(
-    cusparseHandle_t handle, int algo, cusparseOperation_t transA,
-    cusparseOperation_t transB, int m, int nrhs, int nnz,
-    const cuDoubleComplex *alpha, const cusparseMatDescr_t descrA,
-    const cuDoubleComplex *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, cuDoubleComplex *B, int ldb, csrsm2Info_t info,
-    cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, cusparseOperation_t, cusparseOperation_t, int, int,
-      int, const cuDoubleComplex *, const cusparseMatDescr_t,
-      const cuDoubleComplex *, const int *, const int *, cuDoubleComplex *, int,
-      csrsm2Info_t, cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsrsm2_solve");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, algo, transA, transB, m, nrhs, nnz, alpha, descrA,
-                  csrSortedValA, csrSortedRowPtrA, csrSortedColIndA, B, ldb,
-                  info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseXbsrsm2_zeroPivot(cusparseHandle_t handle,
-                                                       bsrsm2Info_t info,
-                                                       int *position) {
-  using FuncPtr =
-      cusparseStatus_t(CUSPARSEAPI *)(cusparseHandle_t, bsrsm2Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseXbsrsm2_zeroPivot");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, info, position);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSbsrsm2_bufferSize(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, cusparseOperation_t transXY, int mb, int n,
-    int nnzb, const cusparseMatDescr_t descrA, float *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockSize,
-    bsrsm2Info_t info, int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t,
-      cusparseOperation_t, int, int, int, const cusparseMatDescr_t, float *,
-      const int *, const int *, int, bsrsm2Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSbsrsm2_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, transXY, mb, n, nnzb, descrA,
-                  bsrSortedVal, bsrSortedRowPtr, bsrSortedColInd, blockSize,
-                  info, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDbsrsm2_bufferSize(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, cusparseOperation_t transXY, int mb, int n,
-    int nnzb, const cusparseMatDescr_t descrA, double *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockSize,
-    bsrsm2Info_t info, int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t,
-      cusparseOperation_t, int, int, int, const cusparseMatDescr_t, double *,
-      const int *, const int *, int, bsrsm2Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDbsrsm2_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, transXY, mb, n, nnzb, descrA,
-                  bsrSortedVal, bsrSortedRowPtr, bsrSortedColInd, blockSize,
-                  info, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCbsrsm2_bufferSize(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, cusparseOperation_t transXY, int mb, int n,
-    int nnzb, const cusparseMatDescr_t descrA, cuComplex *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockSize,
-    bsrsm2Info_t info, int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t,
-      cusparseOperation_t, int, int, int, const cusparseMatDescr_t, cuComplex *,
-      const int *, const int *, int, bsrsm2Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCbsrsm2_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, transXY, mb, n, nnzb, descrA,
-                  bsrSortedVal, bsrSortedRowPtr, bsrSortedColInd, blockSize,
-                  info, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZbsrsm2_bufferSize(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, cusparseOperation_t transXY, int mb, int n,
-    int nnzb, const cusparseMatDescr_t descrA, cuDoubleComplex *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockSize,
-    bsrsm2Info_t info, int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t,
-      cusparseOperation_t, int, int, int, const cusparseMatDescr_t,
-      cuDoubleComplex *, const int *, const int *, int, bsrsm2Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZbsrsm2_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, transXY, mb, n, nnzb, descrA,
-                  bsrSortedVal, bsrSortedRowPtr, bsrSortedColInd, blockSize,
-                  info, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSbsrsm2_bufferSizeExt(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, cusparseOperation_t transB, int mb, int n,
-    int nnzb, const cusparseMatDescr_t descrA, float *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockSize,
-    bsrsm2Info_t info, size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t,
-      cusparseOperation_t, int, int, int, const cusparseMatDescr_t, float *,
-      const int *, const int *, int, bsrsm2Info_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSbsrsm2_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, transB, mb, n, nnzb, descrA,
-                  bsrSortedVal, bsrSortedRowPtr, bsrSortedColInd, blockSize,
-                  info, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDbsrsm2_bufferSizeExt(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, cusparseOperation_t transB, int mb, int n,
-    int nnzb, const cusparseMatDescr_t descrA, double *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockSize,
-    bsrsm2Info_t info, size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t,
-      cusparseOperation_t, int, int, int, const cusparseMatDescr_t, double *,
-      const int *, const int *, int, bsrsm2Info_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDbsrsm2_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, transB, mb, n, nnzb, descrA,
-                  bsrSortedVal, bsrSortedRowPtr, bsrSortedColInd, blockSize,
-                  info, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCbsrsm2_bufferSizeExt(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, cusparseOperation_t transB, int mb, int n,
-    int nnzb, const cusparseMatDescr_t descrA, cuComplex *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockSize,
-    bsrsm2Info_t info, size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t,
-      cusparseOperation_t, int, int, int, const cusparseMatDescr_t, cuComplex *,
-      const int *, const int *, int, bsrsm2Info_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCbsrsm2_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, transB, mb, n, nnzb, descrA,
-                  bsrSortedVal, bsrSortedRowPtr, bsrSortedColInd, blockSize,
-                  info, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZbsrsm2_bufferSizeExt(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, cusparseOperation_t transB, int mb, int n,
-    int nnzb, const cusparseMatDescr_t descrA, cuDoubleComplex *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockSize,
-    bsrsm2Info_t info, size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t,
-      cusparseOperation_t, int, int, int, const cusparseMatDescr_t,
-      cuDoubleComplex *, const int *, const int *, int, bsrsm2Info_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZbsrsm2_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, transB, mb, n, nnzb, descrA,
-                  bsrSortedVal, bsrSortedRowPtr, bsrSortedColInd, blockSize,
-                  info, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSbsrsm2_analysis(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, cusparseOperation_t transXY, int mb, int n,
-    int nnzb, const cusparseMatDescr_t descrA, const float *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockSize,
-    bsrsm2Info_t info, cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t,
-      cusparseOperation_t, int, int, int, const cusparseMatDescr_t,
-      const float *, const int *, const int *, int, bsrsm2Info_t,
-      cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSbsrsm2_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, transXY, mb, n, nnzb, descrA,
-                  bsrSortedVal, bsrSortedRowPtr, bsrSortedColInd, blockSize,
-                  info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDbsrsm2_analysis(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, cusparseOperation_t transXY, int mb, int n,
-    int nnzb, const cusparseMatDescr_t descrA, const double *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockSize,
-    bsrsm2Info_t info, cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t,
-      cusparseOperation_t, int, int, int, const cusparseMatDescr_t,
-      const double *, const int *, const int *, int, bsrsm2Info_t,
-      cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDbsrsm2_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, transXY, mb, n, nnzb, descrA,
-                  bsrSortedVal, bsrSortedRowPtr, bsrSortedColInd, blockSize,
-                  info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCbsrsm2_analysis(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, cusparseOperation_t transXY, int mb, int n,
-    int nnzb, const cusparseMatDescr_t descrA, const cuComplex *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockSize,
-    bsrsm2Info_t info, cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t,
-      cusparseOperation_t, int, int, int, const cusparseMatDescr_t,
-      const cuComplex *, const int *, const int *, int, bsrsm2Info_t,
-      cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCbsrsm2_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, transXY, mb, n, nnzb, descrA,
-                  bsrSortedVal, bsrSortedRowPtr, bsrSortedColInd, blockSize,
-                  info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZbsrsm2_analysis(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, cusparseOperation_t transXY, int mb, int n,
-    int nnzb, const cusparseMatDescr_t descrA,
-    const cuDoubleComplex *bsrSortedVal, const int *bsrSortedRowPtr,
-    const int *bsrSortedColInd, int blockSize, bsrsm2Info_t info,
-    cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t,
-      cusparseOperation_t, int, int, int, const cusparseMatDescr_t,
-      const cuDoubleComplex *, const int *, const int *, int, bsrsm2Info_t,
-      cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZbsrsm2_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, transXY, mb, n, nnzb, descrA,
-                  bsrSortedVal, bsrSortedRowPtr, bsrSortedColInd, blockSize,
-                  info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSbsrsm2_solve(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, cusparseOperation_t transXY, int mb, int n,
-    int nnzb, const float *alpha, const cusparseMatDescr_t descrA,
-    const float *bsrSortedVal, const int *bsrSortedRowPtr,
-    const int *bsrSortedColInd, int blockSize, bsrsm2Info_t info,
-    const float *B, int ldb, float *X, int ldx, cusparseSolvePolicy_t policy,
-    void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t,
-      cusparseOperation_t, int, int, int, const float *,
-      const cusparseMatDescr_t, const float *, const int *, const int *, int,
-      bsrsm2Info_t, const float *, int, float *, int, cusparseSolvePolicy_t,
-      void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSbsrsm2_solve");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, transXY, mb, n, nnzb, alpha, descrA,
-                  bsrSortedVal, bsrSortedRowPtr, bsrSortedColInd, blockSize,
-                  info, B, ldb, X, ldx, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDbsrsm2_solve(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, cusparseOperation_t transXY, int mb, int n,
-    int nnzb, const double *alpha, const cusparseMatDescr_t descrA,
-    const double *bsrSortedVal, const int *bsrSortedRowPtr,
-    const int *bsrSortedColInd, int blockSize, bsrsm2Info_t info,
-    const double *B, int ldb, double *X, int ldx, cusparseSolvePolicy_t policy,
-    void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t,
-      cusparseOperation_t, int, int, int, const double *,
-      const cusparseMatDescr_t, const double *, const int *, const int *, int,
-      bsrsm2Info_t, const double *, int, double *, int, cusparseSolvePolicy_t,
-      void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDbsrsm2_solve");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, transXY, mb, n, nnzb, alpha, descrA,
-                  bsrSortedVal, bsrSortedRowPtr, bsrSortedColInd, blockSize,
-                  info, B, ldb, X, ldx, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCbsrsm2_solve(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, cusparseOperation_t transXY, int mb, int n,
-    int nnzb, const cuComplex *alpha, const cusparseMatDescr_t descrA,
-    const cuComplex *bsrSortedVal, const int *bsrSortedRowPtr,
-    const int *bsrSortedColInd, int blockSize, bsrsm2Info_t info,
-    const cuComplex *B, int ldb, cuComplex *X, int ldx,
-    cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t,
-      cusparseOperation_t, int, int, int, const cuComplex *,
-      const cusparseMatDescr_t, const cuComplex *, const int *, const int *,
-      int, bsrsm2Info_t, const cuComplex *, int, cuComplex *, int,
-      cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCbsrsm2_solve");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, transXY, mb, n, nnzb, alpha, descrA,
-                  bsrSortedVal, bsrSortedRowPtr, bsrSortedColInd, blockSize,
-                  info, B, ldb, X, ldx, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZbsrsm2_solve(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, cusparseOperation_t transXY, int mb, int n,
-    int nnzb, const cuDoubleComplex *alpha, const cusparseMatDescr_t descrA,
-    const cuDoubleComplex *bsrSortedVal, const int *bsrSortedRowPtr,
-    const int *bsrSortedColInd, int blockSize, bsrsm2Info_t info,
-    const cuDoubleComplex *B, int ldb, cuDoubleComplex *X, int ldx,
-    cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t,
-      cusparseOperation_t, int, int, int, const cuDoubleComplex *,
-      const cusparseMatDescr_t, const cuDoubleComplex *, const int *,
-      const int *, int, bsrsm2Info_t, const cuDoubleComplex *, int,
-      cuDoubleComplex *, int, cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZbsrsm2_solve");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, transXY, mb, n, nnzb, alpha, descrA,
-                  bsrSortedVal, bsrSortedRowPtr, bsrSortedColInd, blockSize,
-                  info, B, ldb, X, ldx, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCsrilu0Ex(
-    cusparseHandle_t handle, cusparseOperation_t trans, int m,
-    const cusparseMatDescr_t descrA, void *csrSortedValA_ValM,
-    cudaDataType csrSortedValA_ValMtype, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, cusparseSolveAnalysisInfo_t info,
-    cudaDataType executiontype) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, const cusparseMatDescr_t,
-      void *, cudaDataType, const int *, const int *,
-      cusparseSolveAnalysisInfo_t, cudaDataType);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCsrilu0Ex");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, trans, m, descrA, csrSortedValA_ValM,
-                  csrSortedValA_ValMtype, csrSortedRowPtrA, csrSortedColIndA,
-                  info, executiontype);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseScsrilu0(cusparseHandle_t handle, cusparseOperation_t trans, int m,
-                 const cusparseMatDescr_t descrA, float *csrSortedValA_ValM,
-                 const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-                 cusparseSolveAnalysisInfo_t info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, const cusparseMatDescr_t,
-      float *, const int *, const int *, cusparseSolveAnalysisInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsrilu0");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, trans, m, descrA, csrSortedValA_ValM,
-                  csrSortedRowPtrA, csrSortedColIndA, info);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseDcsrilu0(cusparseHandle_t handle, cusparseOperation_t trans, int m,
-                 const cusparseMatDescr_t descrA, double *csrSortedValA_ValM,
-                 const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-                 cusparseSolveAnalysisInfo_t info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, const cusparseMatDescr_t,
-      double *, const int *, const int *, cusparseSolveAnalysisInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsrilu0");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, trans, m, descrA, csrSortedValA_ValM,
-                  csrSortedRowPtrA, csrSortedColIndA, info);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseCcsrilu0(cusparseHandle_t handle, cusparseOperation_t trans, int m,
-                 const cusparseMatDescr_t descrA, cuComplex *csrSortedValA_ValM,
-                 const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-                 cusparseSolveAnalysisInfo_t info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, const cusparseMatDescr_t,
-      cuComplex *, const int *, const int *, cusparseSolveAnalysisInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsrilu0");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, trans, m, descrA, csrSortedValA_ValM,
-                  csrSortedRowPtrA, csrSortedColIndA, info);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsrilu0(
-    cusparseHandle_t handle, cusparseOperation_t trans, int m,
-    const cusparseMatDescr_t descrA, cuDoubleComplex *csrSortedValA_ValM,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-    cusparseSolveAnalysisInfo_t info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, const cusparseMatDescr_t,
-      cuDoubleComplex *, const int *, const int *, cusparseSolveAnalysisInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsrilu0");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, trans, m, descrA, csrSortedValA_ValM,
-                  csrSortedRowPtrA, csrSortedColIndA, info);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsrilu02_numericBoost(
-    cusparseHandle_t handle, csrilu02Info_t info, int enable_boost, double *tol,
-    float *boost_val) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, csrilu02Info_t, int, double *, float *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsrilu02_numericBoost");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, info, enable_boost, tol, boost_val);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsrilu02_numericBoost(
-    cusparseHandle_t handle, csrilu02Info_t info, int enable_boost, double *tol,
-    double *boost_val) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, csrilu02Info_t, int, double *, double *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsrilu02_numericBoost");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, info, enable_boost, tol, boost_val);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsrilu02_numericBoost(
-    cusparseHandle_t handle, csrilu02Info_t info, int enable_boost, double *tol,
-    cuComplex *boost_val) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, csrilu02Info_t, int, double *, cuComplex *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsrilu02_numericBoost");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, info, enable_boost, tol, boost_val);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsrilu02_numericBoost(
-    cusparseHandle_t handle, csrilu02Info_t info, int enable_boost, double *tol,
-    cuDoubleComplex *boost_val) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, csrilu02Info_t, int, double *, cuDoubleComplex *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsrilu02_numericBoost");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, info, enable_boost, tol, boost_val);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseXcsrilu02_zeroPivot(
-    cusparseHandle_t handle, csrilu02Info_t info, int *position) {
-  using FuncPtr =
-      cusparseStatus_t(CUSPARSEAPI *)(cusparseHandle_t, csrilu02Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseXcsrilu02_zeroPivot");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, info, position);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsrilu02_bufferSize(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    float *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, csrilu02Info_t info, int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, float *,
-      const int *, const int *, csrilu02Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsrilu02_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, info, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsrilu02_bufferSize(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    double *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, csrilu02Info_t info, int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, double *,
-      const int *, const int *, csrilu02Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsrilu02_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, info, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsrilu02_bufferSize(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    cuComplex *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, csrilu02Info_t info, int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, cuComplex *,
-      const int *, const int *, csrilu02Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsrilu02_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, info, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsrilu02_bufferSize(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    cuDoubleComplex *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, csrilu02Info_t info, int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, cuDoubleComplex *,
-      const int *, const int *, csrilu02Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsrilu02_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, info, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsrilu02_bufferSizeExt(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    float *csrSortedVal, const int *csrSortedRowPtr, const int *csrSortedColInd,
-    csrilu02Info_t info, size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, float *,
-      const int *, const int *, csrilu02Info_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsrilu02_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedVal, csrSortedRowPtr,
-                  csrSortedColInd, info, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsrilu02_bufferSizeExt(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    double *csrSortedVal, const int *csrSortedRowPtr,
-    const int *csrSortedColInd, csrilu02Info_t info, size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, double *,
-      const int *, const int *, csrilu02Info_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsrilu02_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedVal, csrSortedRowPtr,
-                  csrSortedColInd, info, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsrilu02_bufferSizeExt(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    cuComplex *csrSortedVal, const int *csrSortedRowPtr,
-    const int *csrSortedColInd, csrilu02Info_t info, size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, cuComplex *,
-      const int *, const int *, csrilu02Info_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsrilu02_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedVal, csrSortedRowPtr,
-                  csrSortedColInd, info, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsrilu02_bufferSizeExt(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    cuDoubleComplex *csrSortedVal, const int *csrSortedRowPtr,
-    const int *csrSortedColInd, csrilu02Info_t info, size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, cuDoubleComplex *,
-      const int *, const int *, csrilu02Info_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsrilu02_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedVal, csrSortedRowPtr,
-                  csrSortedColInd, info, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsrilu02_analysis(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    const float *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, csrilu02Info_t info,
-    cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, const float *,
-      const int *, const int *, csrilu02Info_t, cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsrilu02_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsrilu02_analysis(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    const double *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, csrilu02Info_t info,
-    cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, const double *,
-      const int *, const int *, csrilu02Info_t, cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsrilu02_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsrilu02_analysis(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    const cuComplex *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, csrilu02Info_t info,
-    cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, const cuComplex *,
-      const int *, const int *, csrilu02Info_t, cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsrilu02_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsrilu02_analysis(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    const cuDoubleComplex *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, csrilu02Info_t info,
-    cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t,
-      const cuDoubleComplex *, const int *, const int *, csrilu02Info_t,
-      cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsrilu02_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsrilu02(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    float *csrSortedValA_valM, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, csrilu02Info_t info,
-    cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, float *,
-      const int *, const int *, csrilu02Info_t, cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsrilu02");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedValA_valM, csrSortedRowPtrA,
-                  csrSortedColIndA, info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsrilu02(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    double *csrSortedValA_valM, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, csrilu02Info_t info,
-    cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, double *,
-      const int *, const int *, csrilu02Info_t, cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsrilu02");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedValA_valM, csrSortedRowPtrA,
-                  csrSortedColIndA, info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsrilu02(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    cuComplex *csrSortedValA_valM, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, csrilu02Info_t info,
-    cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, cuComplex *,
-      const int *, const int *, csrilu02Info_t, cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsrilu02");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedValA_valM, csrSortedRowPtrA,
-                  csrSortedColIndA, info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsrilu02(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    cuDoubleComplex *csrSortedValA_valM, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, csrilu02Info_t info,
-    cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, cuDoubleComplex *,
-      const int *, const int *, csrilu02Info_t, cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsrilu02");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedValA_valM, csrSortedRowPtrA,
-                  csrSortedColIndA, info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSbsrilu02_numericBoost(
-    cusparseHandle_t handle, bsrilu02Info_t info, int enable_boost, double *tol,
-    float *boost_val) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, bsrilu02Info_t, int, double *, float *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSbsrilu02_numericBoost");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, info, enable_boost, tol, boost_val);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDbsrilu02_numericBoost(
-    cusparseHandle_t handle, bsrilu02Info_t info, int enable_boost, double *tol,
-    double *boost_val) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, bsrilu02Info_t, int, double *, double *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDbsrilu02_numericBoost");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, info, enable_boost, tol, boost_val);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCbsrilu02_numericBoost(
-    cusparseHandle_t handle, bsrilu02Info_t info, int enable_boost, double *tol,
-    cuComplex *boost_val) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, bsrilu02Info_t, int, double *, cuComplex *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCbsrilu02_numericBoost");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, info, enable_boost, tol, boost_val);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZbsrilu02_numericBoost(
-    cusparseHandle_t handle, bsrilu02Info_t info, int enable_boost, double *tol,
-    cuDoubleComplex *boost_val) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, bsrilu02Info_t, int, double *, cuDoubleComplex *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZbsrilu02_numericBoost");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, info, enable_boost, tol, boost_val);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseXbsrilu02_zeroPivot(
-    cusparseHandle_t handle, bsrilu02Info_t info, int *position) {
-  using FuncPtr =
-      cusparseStatus_t(CUSPARSEAPI *)(cusparseHandle_t, bsrilu02Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseXbsrilu02_zeroPivot");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, info, position);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSbsrilu02_bufferSize(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, float *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockDim,
-    bsrilu02Info_t info, int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      float *, const int *, const int *, int, bsrilu02Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSbsrilu02_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, blockDim, info, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDbsrilu02_bufferSize(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, double *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockDim,
-    bsrilu02Info_t info, int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      double *, const int *, const int *, int, bsrilu02Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDbsrilu02_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, blockDim, info, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCbsrilu02_bufferSize(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, cuComplex *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockDim,
-    bsrilu02Info_t info, int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      cuComplex *, const int *, const int *, int, bsrilu02Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCbsrilu02_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, blockDim, info, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZbsrilu02_bufferSize(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, cuDoubleComplex *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockDim,
-    bsrilu02Info_t info, int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      cuDoubleComplex *, const int *, const int *, int, bsrilu02Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZbsrilu02_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, blockDim, info, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSbsrilu02_bufferSizeExt(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, float *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockSize,
-    bsrilu02Info_t info, size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      float *, const int *, const int *, int, bsrilu02Info_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSbsrilu02_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, blockSize, info, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDbsrilu02_bufferSizeExt(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, double *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockSize,
-    bsrilu02Info_t info, size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      double *, const int *, const int *, int, bsrilu02Info_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDbsrilu02_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, blockSize, info, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCbsrilu02_bufferSizeExt(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, cuComplex *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockSize,
-    bsrilu02Info_t info, size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      cuComplex *, const int *, const int *, int, bsrilu02Info_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCbsrilu02_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, blockSize, info, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZbsrilu02_bufferSizeExt(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, cuDoubleComplex *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockSize,
-    bsrilu02Info_t info, size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      cuDoubleComplex *, const int *, const int *, int, bsrilu02Info_t,
-      size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZbsrilu02_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, blockSize, info, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSbsrilu02_analysis(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, float *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockDim,
-    bsrilu02Info_t info, cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      float *, const int *, const int *, int, bsrilu02Info_t,
-      cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSbsrilu02_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, blockDim, info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDbsrilu02_analysis(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, double *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockDim,
-    bsrilu02Info_t info, cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      double *, const int *, const int *, int, bsrilu02Info_t,
-      cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDbsrilu02_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, blockDim, info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCbsrilu02_analysis(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, cuComplex *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockDim,
-    bsrilu02Info_t info, cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      cuComplex *, const int *, const int *, int, bsrilu02Info_t,
-      cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCbsrilu02_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, blockDim, info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZbsrilu02_analysis(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, cuDoubleComplex *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockDim,
-    bsrilu02Info_t info, cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      cuDoubleComplex *, const int *, const int *, int, bsrilu02Info_t,
-      cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZbsrilu02_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, blockDim, info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSbsrilu02(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, float *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockDim,
-    bsrilu02Info_t info, cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      float *, const int *, const int *, int, bsrilu02Info_t,
-      cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSbsrilu02");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, blockDim, info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDbsrilu02(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, double *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockDim,
-    bsrilu02Info_t info, cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      double *, const int *, const int *, int, bsrilu02Info_t,
-      cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDbsrilu02");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, blockDim, info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCbsrilu02(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, cuComplex *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockDim,
-    bsrilu02Info_t info, cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      cuComplex *, const int *, const int *, int, bsrilu02Info_t,
-      cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCbsrilu02");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, blockDim, info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZbsrilu02(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, cuDoubleComplex *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockDim,
-    bsrilu02Info_t info, cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      cuDoubleComplex *, const int *, const int *, int, bsrilu02Info_t,
-      cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZbsrilu02");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, blockDim, info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsric0(cusparseHandle_t handle,
-                                             cusparseOperation_t trans, int m,
-                                             const cusparseMatDescr_t descrA,
-                                             float *csrSortedValA_ValM,
-                                             const int *csrSortedRowPtrA,
-                                             const int *csrSortedColIndA,
-                                             cusparseSolveAnalysisInfo_t info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, const cusparseMatDescr_t,
-      float *, const int *, const int *, cusparseSolveAnalysisInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsric0");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, trans, m, descrA, csrSortedValA_ValM,
-                  csrSortedRowPtrA, csrSortedColIndA, info);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsric0(cusparseHandle_t handle,
-                                             cusparseOperation_t trans, int m,
-                                             const cusparseMatDescr_t descrA,
-                                             double *csrSortedValA_ValM,
-                                             const int *csrSortedRowPtrA,
-                                             const int *csrSortedColIndA,
-                                             cusparseSolveAnalysisInfo_t info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, const cusparseMatDescr_t,
-      double *, const int *, const int *, cusparseSolveAnalysisInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsric0");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, trans, m, descrA, csrSortedValA_ValM,
-                  csrSortedRowPtrA, csrSortedColIndA, info);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsric0(cusparseHandle_t handle,
-                                             cusparseOperation_t trans, int m,
-                                             const cusparseMatDescr_t descrA,
-                                             cuComplex *csrSortedValA_ValM,
-                                             const int *csrSortedRowPtrA,
-                                             const int *csrSortedColIndA,
-                                             cusparseSolveAnalysisInfo_t info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, const cusparseMatDescr_t,
-      cuComplex *, const int *, const int *, cusparseSolveAnalysisInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsric0");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, trans, m, descrA, csrSortedValA_ValM,
-                  csrSortedRowPtrA, csrSortedColIndA, info);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsric0(
-    cusparseHandle_t handle, cusparseOperation_t trans, int m,
-    const cusparseMatDescr_t descrA, cuDoubleComplex *csrSortedValA_ValM,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-    cusparseSolveAnalysisInfo_t info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, const cusparseMatDescr_t,
-      cuDoubleComplex *, const int *, const int *, cusparseSolveAnalysisInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsric0");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, trans, m, descrA, csrSortedValA_ValM,
-                  csrSortedRowPtrA, csrSortedColIndA, info);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseXcsric02_zeroPivot(cusparseHandle_t handle,
-                                                        csric02Info_t info,
-                                                        int *position) {
-  using FuncPtr =
-      cusparseStatus_t(CUSPARSEAPI *)(cusparseHandle_t, csric02Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseXcsric02_zeroPivot");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, info, position);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsric02_bufferSize(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    float *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, csric02Info_t info, int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, float *,
-      const int *, const int *, csric02Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsric02_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, info, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsric02_bufferSize(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    double *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, csric02Info_t info, int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, double *,
-      const int *, const int *, csric02Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsric02_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, info, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsric02_bufferSize(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    cuComplex *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, csric02Info_t info, int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, cuComplex *,
-      const int *, const int *, csric02Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsric02_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, info, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsric02_bufferSize(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    cuDoubleComplex *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, csric02Info_t info, int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, cuDoubleComplex *,
-      const int *, const int *, csric02Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsric02_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, info, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsric02_bufferSizeExt(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    float *csrSortedVal, const int *csrSortedRowPtr, const int *csrSortedColInd,
-    csric02Info_t info, size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, float *,
-      const int *, const int *, csric02Info_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsric02_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedVal, csrSortedRowPtr,
-                  csrSortedColInd, info, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsric02_bufferSizeExt(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    double *csrSortedVal, const int *csrSortedRowPtr,
-    const int *csrSortedColInd, csric02Info_t info, size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, double *,
-      const int *, const int *, csric02Info_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsric02_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedVal, csrSortedRowPtr,
-                  csrSortedColInd, info, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsric02_bufferSizeExt(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    cuComplex *csrSortedVal, const int *csrSortedRowPtr,
-    const int *csrSortedColInd, csric02Info_t info, size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, cuComplex *,
-      const int *, const int *, csric02Info_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsric02_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedVal, csrSortedRowPtr,
-                  csrSortedColInd, info, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsric02_bufferSizeExt(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    cuDoubleComplex *csrSortedVal, const int *csrSortedRowPtr,
-    const int *csrSortedColInd, csric02Info_t info, size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, cuDoubleComplex *,
-      const int *, const int *, csric02Info_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsric02_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedVal, csrSortedRowPtr,
-                  csrSortedColInd, info, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsric02_analysis(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    const float *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, csric02Info_t info,
-    cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, const float *,
-      const int *, const int *, csric02Info_t, cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsric02_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsric02_analysis(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    const double *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, csric02Info_t info,
-    cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, const double *,
-      const int *, const int *, csric02Info_t, cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsric02_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsric02_analysis(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    const cuComplex *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, csric02Info_t info,
-    cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, const cuComplex *,
-      const int *, const int *, csric02Info_t, cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsric02_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsric02_analysis(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    const cuDoubleComplex *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, csric02Info_t info,
-    cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t,
-      const cuDoubleComplex *, const int *, const int *, csric02Info_t,
-      cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsric02_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsric02(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    float *csrSortedValA_valM, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, csric02Info_t info,
-    cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, float *,
-      const int *, const int *, csric02Info_t, cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsric02");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedValA_valM, csrSortedRowPtrA,
-                  csrSortedColIndA, info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsric02(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    double *csrSortedValA_valM, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, csric02Info_t info,
-    cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, double *,
-      const int *, const int *, csric02Info_t, cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsric02");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedValA_valM, csrSortedRowPtrA,
-                  csrSortedColIndA, info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsric02(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    cuComplex *csrSortedValA_valM, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, csric02Info_t info,
-    cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, cuComplex *,
-      const int *, const int *, csric02Info_t, cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsric02");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedValA_valM, csrSortedRowPtrA,
-                  csrSortedColIndA, info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsric02(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    cuDoubleComplex *csrSortedValA_valM, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, csric02Info_t info,
-    cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, cuDoubleComplex *,
-      const int *, const int *, csric02Info_t, cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsric02");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedValA_valM, csrSortedRowPtrA,
-                  csrSortedColIndA, info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseXbsric02_zeroPivot(cusparseHandle_t handle,
-                                                        bsric02Info_t info,
-                                                        int *position) {
-  using FuncPtr =
-      cusparseStatus_t(CUSPARSEAPI *)(cusparseHandle_t, bsric02Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseXbsric02_zeroPivot");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, info, position);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSbsric02_bufferSize(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, float *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockDim,
-    bsric02Info_t info, int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      float *, const int *, const int *, int, bsric02Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSbsric02_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, blockDim, info, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDbsric02_bufferSize(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, double *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockDim,
-    bsric02Info_t info, int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      double *, const int *, const int *, int, bsric02Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDbsric02_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, blockDim, info, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCbsric02_bufferSize(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, cuComplex *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockDim,
-    bsric02Info_t info, int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      cuComplex *, const int *, const int *, int, bsric02Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCbsric02_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, blockDim, info, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZbsric02_bufferSize(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, cuDoubleComplex *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockDim,
-    bsric02Info_t info, int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      cuDoubleComplex *, const int *, const int *, int, bsric02Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZbsric02_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, blockDim, info, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSbsric02_bufferSizeExt(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, float *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockSize,
-    bsric02Info_t info, size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      float *, const int *, const int *, int, bsric02Info_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSbsric02_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, blockSize, info, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDbsric02_bufferSizeExt(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, double *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockSize,
-    bsric02Info_t info, size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      double *, const int *, const int *, int, bsric02Info_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDbsric02_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, blockSize, info, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCbsric02_bufferSizeExt(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, cuComplex *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockSize,
-    bsric02Info_t info, size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      cuComplex *, const int *, const int *, int, bsric02Info_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCbsric02_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, blockSize, info, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZbsric02_bufferSizeExt(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, cuDoubleComplex *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockSize,
-    bsric02Info_t info, size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      cuDoubleComplex *, const int *, const int *, int, bsric02Info_t,
-      size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZbsric02_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, blockSize, info, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSbsric02_analysis(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, const float *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockDim,
-    bsric02Info_t info, cusparseSolvePolicy_t policy, void *pInputBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const float *, const int *, const int *, int, bsric02Info_t,
-      cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSbsric02_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, blockDim, info, policy, pInputBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDbsric02_analysis(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, const double *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockDim,
-    bsric02Info_t info, cusparseSolvePolicy_t policy, void *pInputBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const double *, const int *, const int *, int, bsric02Info_t,
-      cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDbsric02_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, blockDim, info, policy, pInputBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCbsric02_analysis(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, const cuComplex *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockDim,
-    bsric02Info_t info, cusparseSolvePolicy_t policy, void *pInputBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const cuComplex *, const int *, const int *, int, bsric02Info_t,
-      cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCbsric02_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, blockDim, info, policy, pInputBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZbsric02_analysis(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, const cuDoubleComplex *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockDim,
-    bsric02Info_t info, cusparseSolvePolicy_t policy, void *pInputBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const cuDoubleComplex *, const int *, const int *, int, bsric02Info_t,
-      cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZbsric02_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, blockDim, info, policy, pInputBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSbsric02(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, float *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockDim,
-    bsric02Info_t info, cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      float *, const int *, const int *, int, bsric02Info_t,
-      cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSbsric02");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, blockDim, info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDbsric02(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, double *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockDim,
-    bsric02Info_t info, cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      double *, const int *, const int *, int, bsric02Info_t,
-      cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDbsric02");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, blockDim, info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCbsric02(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, cuComplex *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockDim,
-    bsric02Info_t info, cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      cuComplex *, const int *, const int *, int, bsric02Info_t,
-      cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCbsric02");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, blockDim, info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZbsric02(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, cuDoubleComplex *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockDim,
-    bsric02Info_t info, cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      cuDoubleComplex *, const int *, const int *, int, bsric02Info_t,
-      cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZbsric02");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, blockDim, info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSgtsv(cusparseHandle_t handle, int m,
-                                           int n, const float *dl,
-                                           const float *d, const float *du,
-                                           float *B, int ldb) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(cusparseHandle_t, int, int,
-                                                  const float *, const float *,
-                                                  const float *, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSgtsv");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, dl, d, du, B, ldb);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDgtsv(cusparseHandle_t handle, int m,
-                                           int n, const double *dl,
-                                           const double *d, const double *du,
-                                           double *B, int ldb) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const double *, const double *,
-      const double *, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDgtsv");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, dl, d, du, B, ldb);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCgtsv(cusparseHandle_t handle, int m,
-                                           int n, const cuComplex *dl,
-                                           const cuComplex *d,
-                                           const cuComplex *du, cuComplex *B,
-                                           int ldb) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cuComplex *, const cuComplex *,
-      const cuComplex *, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCgtsv");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, dl, d, du, B, ldb);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZgtsv(cusparseHandle_t handle, int m,
-                                           int n, const cuDoubleComplex *dl,
-                                           const cuDoubleComplex *d,
-                                           const cuDoubleComplex *du,
-                                           cuDoubleComplex *B, int ldb) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cuDoubleComplex *,
-      const cuDoubleComplex *, const cuDoubleComplex *, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZgtsv");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, dl, d, du, B, ldb);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSgtsv2_bufferSizeExt(
-    cusparseHandle_t handle, int m, int n, const float *dl, const float *d,
-    const float *du, const float *B, int ldb, size_t *bufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const float *, const float *, const float *,
-      const float *, int, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSgtsv2_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, dl, d, du, B, ldb, bufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDgtsv2_bufferSizeExt(
-    cusparseHandle_t handle, int m, int n, const double *dl, const double *d,
-    const double *du, const double *B, int ldb, size_t *bufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const double *, const double *,
-      const double *, const double *, int, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDgtsv2_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, dl, d, du, B, ldb, bufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCgtsv2_bufferSizeExt(
-    cusparseHandle_t handle, int m, int n, const cuComplex *dl,
-    const cuComplex *d, const cuComplex *du, const cuComplex *B, int ldb,
-    size_t *bufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cuComplex *, const cuComplex *,
-      const cuComplex *, const cuComplex *, int, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCgtsv2_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, dl, d, du, B, ldb, bufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZgtsv2_bufferSizeExt(
-    cusparseHandle_t handle, int m, int n, const cuDoubleComplex *dl,
-    const cuDoubleComplex *d, const cuDoubleComplex *du,
-    const cuDoubleComplex *B, int ldb, size_t *bufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cuDoubleComplex *,
-      const cuDoubleComplex *, const cuDoubleComplex *, const cuDoubleComplex *,
-      int, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZgtsv2_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, dl, d, du, B, ldb, bufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSgtsv2(cusparseHandle_t handle, int m,
-                                            int n, const float *dl,
-                                            const float *d, const float *du,
-                                            float *B, int ldb, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const float *, const float *, const float *,
-      float *, int, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSgtsv2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, dl, d, du, B, ldb, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDgtsv2(cusparseHandle_t handle, int m,
-                                            int n, const double *dl,
-                                            const double *d, const double *du,
-                                            double *B, int ldb, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const double *, const double *,
-      const double *, double *, int, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDgtsv2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, dl, d, du, B, ldb, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCgtsv2(cusparseHandle_t handle, int m,
-                                            int n, const cuComplex *dl,
-                                            const cuComplex *d,
-                                            const cuComplex *du, cuComplex *B,
-                                            int ldb, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cuComplex *, const cuComplex *,
-      const cuComplex *, cuComplex *, int, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCgtsv2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, dl, d, du, B, ldb, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZgtsv2(cusparseHandle_t handle, int m,
-                                            int n, const cuDoubleComplex *dl,
-                                            const cuDoubleComplex *d,
-                                            const cuDoubleComplex *du,
-                                            cuDoubleComplex *B, int ldb,
-                                            void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cuDoubleComplex *,
-      const cuDoubleComplex *, const cuDoubleComplex *, cuDoubleComplex *, int,
-      void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZgtsv2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, dl, d, du, B, ldb, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseSgtsv_nopivot(cusparseHandle_t handle, int m, int n, const float *dl,
-                      const float *d, const float *du, float *B, int ldb) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(cusparseHandle_t, int, int,
-                                                  const float *, const float *,
-                                                  const float *, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSgtsv_nopivot");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, dl, d, du, B, ldb);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseDgtsv_nopivot(cusparseHandle_t handle, int m, int n, const double *dl,
-                      const double *d, const double *du, double *B, int ldb) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const double *, const double *,
-      const double *, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDgtsv_nopivot");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, dl, d, du, B, ldb);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCgtsv_nopivot(
-    cusparseHandle_t handle, int m, int n, const cuComplex *dl,
-    const cuComplex *d, const cuComplex *du, cuComplex *B, int ldb) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cuComplex *, const cuComplex *,
-      const cuComplex *, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCgtsv_nopivot");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, dl, d, du, B, ldb);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseZgtsv_nopivot(cusparseHandle_t handle, int m, int n,
-                      const cuDoubleComplex *dl, const cuDoubleComplex *d,
-                      const cuDoubleComplex *du, cuDoubleComplex *B, int ldb) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cuDoubleComplex *,
-      const cuDoubleComplex *, const cuDoubleComplex *, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZgtsv_nopivot");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, dl, d, du, B, ldb);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSgtsv2_nopivot_bufferSizeExt(
-    cusparseHandle_t handle, int m, int n, const float *dl, const float *d,
-    const float *du, const float *B, int ldb, size_t *bufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const float *, const float *, const float *,
-      const float *, int, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseSgtsv2_nopivot_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, dl, d, du, B, ldb, bufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDgtsv2_nopivot_bufferSizeExt(
-    cusparseHandle_t handle, int m, int n, const double *dl, const double *d,
-    const double *du, const double *B, int ldb, size_t *bufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const double *, const double *,
-      const double *, const double *, int, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseDgtsv2_nopivot_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, dl, d, du, B, ldb, bufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCgtsv2_nopivot_bufferSizeExt(
-    cusparseHandle_t handle, int m, int n, const cuComplex *dl,
-    const cuComplex *d, const cuComplex *du, const cuComplex *B, int ldb,
-    size_t *bufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cuComplex *, const cuComplex *,
-      const cuComplex *, const cuComplex *, int, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseCgtsv2_nopivot_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, dl, d, du, B, ldb, bufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZgtsv2_nopivot_bufferSizeExt(
-    cusparseHandle_t handle, int m, int n, const cuDoubleComplex *dl,
-    const cuDoubleComplex *d, const cuDoubleComplex *du,
-    const cuDoubleComplex *B, int ldb, size_t *bufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cuDoubleComplex *,
-      const cuDoubleComplex *, const cuDoubleComplex *, const cuDoubleComplex *,
-      int, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseZgtsv2_nopivot_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, dl, d, du, B, ldb, bufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSgtsv2_nopivot(
-    cusparseHandle_t handle, int m, int n, const float *dl, const float *d,
-    const float *du, float *B, int ldb, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const float *, const float *, const float *,
-      float *, int, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSgtsv2_nopivot");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, dl, d, du, B, ldb, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDgtsv2_nopivot(
-    cusparseHandle_t handle, int m, int n, const double *dl, const double *d,
-    const double *du, double *B, int ldb, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const double *, const double *,
-      const double *, double *, int, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDgtsv2_nopivot");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, dl, d, du, B, ldb, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCgtsv2_nopivot(
-    cusparseHandle_t handle, int m, int n, const cuComplex *dl,
-    const cuComplex *d, const cuComplex *du, cuComplex *B, int ldb,
-    void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cuComplex *, const cuComplex *,
-      const cuComplex *, cuComplex *, int, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCgtsv2_nopivot");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, dl, d, du, B, ldb, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZgtsv2_nopivot(
-    cusparseHandle_t handle, int m, int n, const cuDoubleComplex *dl,
-    const cuDoubleComplex *d, const cuDoubleComplex *du, cuDoubleComplex *B,
-    int ldb, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cuDoubleComplex *,
-      const cuDoubleComplex *, const cuDoubleComplex *, cuDoubleComplex *, int,
-      void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZgtsv2_nopivot");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, dl, d, du, B, ldb, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSgtsvStridedBatch(
-    cusparseHandle_t handle, int m, const float *dl, const float *d,
-    const float *du, float *x, int batchCount, int batchStride) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, const float *, const float *, const float *,
-      float *, int, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSgtsvStridedBatch");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, dl, d, du, x, batchCount, batchStride);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDgtsvStridedBatch(
-    cusparseHandle_t handle, int m, const double *dl, const double *d,
-    const double *du, double *x, int batchCount, int batchStride) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, const double *, const double *, const double *,
-      double *, int, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDgtsvStridedBatch");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, dl, d, du, x, batchCount, batchStride);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCgtsvStridedBatch(
-    cusparseHandle_t handle, int m, const cuComplex *dl, const cuComplex *d,
-    const cuComplex *du, cuComplex *x, int batchCount, int batchStride) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, const cuComplex *, const cuComplex *,
-      const cuComplex *, cuComplex *, int, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCgtsvStridedBatch");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, dl, d, du, x, batchCount, batchStride);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZgtsvStridedBatch(
-    cusparseHandle_t handle, int m, const cuDoubleComplex *dl,
-    const cuDoubleComplex *d, const cuDoubleComplex *du, cuDoubleComplex *x,
-    int batchCount, int batchStride) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, const cuDoubleComplex *, const cuDoubleComplex *,
-      const cuDoubleComplex *, cuDoubleComplex *, int, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZgtsvStridedBatch");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, dl, d, du, x, batchCount, batchStride);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSgtsv2StridedBatch_bufferSizeExt(
-    cusparseHandle_t handle, int m, const float *dl, const float *d,
-    const float *du, const float *x, int batchCount, int batchStride,
-    size_t *bufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, const float *, const float *, const float *,
-      const float *, int, int, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseSgtsv2StridedBatch_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, dl, d, du, x, batchCount, batchStride,
-                  bufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDgtsv2StridedBatch_bufferSizeExt(
-    cusparseHandle_t handle, int m, const double *dl, const double *d,
-    const double *du, const double *x, int batchCount, int batchStride,
-    size_t *bufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, const double *, const double *, const double *,
-      const double *, int, int, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseDgtsv2StridedBatch_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, dl, d, du, x, batchCount, batchStride,
-                  bufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCgtsv2StridedBatch_bufferSizeExt(
-    cusparseHandle_t handle, int m, const cuComplex *dl, const cuComplex *d,
-    const cuComplex *du, const cuComplex *x, int batchCount, int batchStride,
-    size_t *bufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, const cuComplex *, const cuComplex *,
-      const cuComplex *, const cuComplex *, int, int, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseCgtsv2StridedBatch_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, dl, d, du, x, batchCount, batchStride,
-                  bufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZgtsv2StridedBatch_bufferSizeExt(
-    cusparseHandle_t handle, int m, const cuDoubleComplex *dl,
-    const cuDoubleComplex *d, const cuDoubleComplex *du,
-    const cuDoubleComplex *x, int batchCount, int batchStride,
-    size_t *bufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, const cuDoubleComplex *, const cuDoubleComplex *,
-      const cuDoubleComplex *, const cuDoubleComplex *, int, int, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseZgtsv2StridedBatch_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, dl, d, du, x, batchCount, batchStride,
-                  bufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSgtsv2StridedBatch(
-    cusparseHandle_t handle, int m, const float *dl, const float *d,
-    const float *du, float *x, int batchCount, int batchStride, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, const float *, const float *, const float *,
-      float *, int, int, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSgtsv2StridedBatch");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, dl, d, du, x, batchCount, batchStride, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseDgtsv2StridedBatch(cusparseHandle_t handle, int m, const double *dl,
-                           const double *d, const double *du, double *x,
-                           int batchCount, int batchStride, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, const double *, const double *, const double *,
-      double *, int, int, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDgtsv2StridedBatch");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, dl, d, du, x, batchCount, batchStride, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCgtsv2StridedBatch(
-    cusparseHandle_t handle, int m, const cuComplex *dl, const cuComplex *d,
-    const cuComplex *du, cuComplex *x, int batchCount, int batchStride,
-    void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, const cuComplex *, const cuComplex *,
-      const cuComplex *, cuComplex *, int, int, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCgtsv2StridedBatch");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, dl, d, du, x, batchCount, batchStride, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZgtsv2StridedBatch(
-    cusparseHandle_t handle, int m, const cuDoubleComplex *dl,
-    const cuDoubleComplex *d, const cuDoubleComplex *du, cuDoubleComplex *x,
-    int batchCount, int batchStride, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, const cuDoubleComplex *, const cuDoubleComplex *,
-      const cuDoubleComplex *, cuDoubleComplex *, int, int, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZgtsv2StridedBatch");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, dl, d, du, x, batchCount, batchStride, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSgtsvInterleavedBatch_bufferSizeExt(
-    cusparseHandle_t handle, int algo, int m, const float *dl, const float *d,
-    const float *du, const float *x, int batchCount,
-    size_t *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const float *, const float *, const float *,
-      const float *, int, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseSgtsvInterleavedBatch_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, algo, m, dl, d, du, x, batchCount,
-                  pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDgtsvInterleavedBatch_bufferSizeExt(
-    cusparseHandle_t handle, int algo, int m, const double *dl, const double *d,
-    const double *du, const double *x, int batchCount,
-    size_t *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const double *, const double *,
-      const double *, const double *, int, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseDgtsvInterleavedBatch_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, algo, m, dl, d, du, x, batchCount,
-                  pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCgtsvInterleavedBatch_bufferSizeExt(
-    cusparseHandle_t handle, int algo, int m, const cuComplex *dl,
-    const cuComplex *d, const cuComplex *du, const cuComplex *x, int batchCount,
-    size_t *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cuComplex *, const cuComplex *,
-      const cuComplex *, const cuComplex *, int, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseCgtsvInterleavedBatch_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, algo, m, dl, d, du, x, batchCount,
-                  pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZgtsvInterleavedBatch_bufferSizeExt(
-    cusparseHandle_t handle, int algo, int m, const cuDoubleComplex *dl,
-    const cuDoubleComplex *d, const cuDoubleComplex *du,
-    const cuDoubleComplex *x, int batchCount, size_t *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cuDoubleComplex *,
-      const cuDoubleComplex *, const cuDoubleComplex *, const cuDoubleComplex *,
-      int, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseZgtsvInterleavedBatch_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, algo, m, dl, d, du, x, batchCount,
-                  pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSgtsvInterleavedBatch(
-    cusparseHandle_t handle, int algo, int m, float *dl, float *d, float *du,
-    float *x, int batchCount, void *pBuffer) {
-  using FuncPtr =
-      cusparseStatus_t(CUSPARSEAPI *)(cusparseHandle_t, int, int, float *,
-                                      float *, float *, float *, int, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSgtsvInterleavedBatch");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, algo, m, dl, d, du, x, batchCount, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDgtsvInterleavedBatch(
-    cusparseHandle_t handle, int algo, int m, double *dl, double *d, double *du,
-    double *x, int batchCount, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(cusparseHandle_t, int, int,
-                                                  double *, double *, double *,
-                                                  double *, int, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDgtsvInterleavedBatch");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, algo, m, dl, d, du, x, batchCount, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCgtsvInterleavedBatch(
-    cusparseHandle_t handle, int algo, int m, cuComplex *dl, cuComplex *d,
-    cuComplex *du, cuComplex *x, int batchCount, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, cuComplex *, cuComplex *, cuComplex *,
-      cuComplex *, int, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCgtsvInterleavedBatch");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, algo, m, dl, d, du, x, batchCount, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZgtsvInterleavedBatch(
-    cusparseHandle_t handle, int algo, int m, cuDoubleComplex *dl,
-    cuDoubleComplex *d, cuDoubleComplex *du, cuDoubleComplex *x, int batchCount,
-    void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, cuDoubleComplex *, cuDoubleComplex *,
-      cuDoubleComplex *, cuDoubleComplex *, int, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZgtsvInterleavedBatch");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, algo, m, dl, d, du, x, batchCount, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSgpsvInterleavedBatch_bufferSizeExt(
-    cusparseHandle_t handle, int algo, int m, const float *ds, const float *dl,
-    const float *d, const float *du, const float *dw, const float *x,
-    int batchCount, size_t *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const float *, const float *, const float *,
-      const float *, const float *, const float *, int, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseSgpsvInterleavedBatch_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, algo, m, ds, dl, d, du, dw, x, batchCount,
-                  pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDgpsvInterleavedBatch_bufferSizeExt(
-    cusparseHandle_t handle, int algo, int m, const double *ds,
-    const double *dl, const double *d, const double *du, const double *dw,
-    const double *x, int batchCount, size_t *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const double *, const double *,
-      const double *, const double *, const double *, const double *, int,
-      size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseDgpsvInterleavedBatch_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, algo, m, ds, dl, d, du, dw, x, batchCount,
-                  pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCgpsvInterleavedBatch_bufferSizeExt(
-    cusparseHandle_t handle, int algo, int m, const cuComplex *ds,
-    const cuComplex *dl, const cuComplex *d, const cuComplex *du,
-    const cuComplex *dw, const cuComplex *x, int batchCount,
-    size_t *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cuComplex *, const cuComplex *,
-      const cuComplex *, const cuComplex *, const cuComplex *,
-      const cuComplex *, int, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseCgpsvInterleavedBatch_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, algo, m, ds, dl, d, du, dw, x, batchCount,
-                  pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZgpsvInterleavedBatch_bufferSizeExt(
-    cusparseHandle_t handle, int algo, int m, const cuDoubleComplex *ds,
-    const cuDoubleComplex *dl, const cuDoubleComplex *d,
-    const cuDoubleComplex *du, const cuDoubleComplex *dw,
-    const cuDoubleComplex *x, int batchCount, size_t *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cuDoubleComplex *,
-      const cuDoubleComplex *, const cuDoubleComplex *, const cuDoubleComplex *,
-      const cuDoubleComplex *, const cuDoubleComplex *, int, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseZgpsvInterleavedBatch_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, algo, m, ds, dl, d, du, dw, x, batchCount,
-                  pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSgpsvInterleavedBatch(
-    cusparseHandle_t handle, int algo, int m, float *ds, float *dl, float *d,
-    float *du, float *dw, float *x, int batchCount, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, float *, float *, float *, float *, float *,
-      float *, int, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSgpsvInterleavedBatch");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, algo, m, ds, dl, d, du, dw, x, batchCount, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDgpsvInterleavedBatch(
-    cusparseHandle_t handle, int algo, int m, double *ds, double *dl, double *d,
-    double *du, double *dw, double *x, int batchCount, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, double *, double *, double *, double *,
-      double *, double *, int, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDgpsvInterleavedBatch");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, algo, m, ds, dl, d, du, dw, x, batchCount, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCgpsvInterleavedBatch(
-    cusparseHandle_t handle, int algo, int m, cuComplex *ds, cuComplex *dl,
-    cuComplex *d, cuComplex *du, cuComplex *dw, cuComplex *x, int batchCount,
-    void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, cuComplex *, cuComplex *, cuComplex *,
-      cuComplex *, cuComplex *, cuComplex *, int, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCgpsvInterleavedBatch");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, algo, m, ds, dl, d, du, dw, x, batchCount, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZgpsvInterleavedBatch(
-    cusparseHandle_t handle, int algo, int m, cuDoubleComplex *ds,
-    cuDoubleComplex *dl, cuDoubleComplex *d, cuDoubleComplex *du,
-    cuDoubleComplex *dw, cuDoubleComplex *x, int batchCount, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, cuDoubleComplex *, cuDoubleComplex *,
-      cuDoubleComplex *, cuDoubleComplex *, cuDoubleComplex *,
-      cuDoubleComplex *, int, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZgpsvInterleavedBatch");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, algo, m, ds, dl, d, du, dw, x, batchCount, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseXcsrgemmNnz(cusparseHandle_t handle, cusparseOperation_t transA,
-                    cusparseOperation_t transB, int m, int n, int k,
-                    const cusparseMatDescr_t descrA, const int nnzA,
-                    const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-                    const cusparseMatDescr_t descrB, const int nnzB,
-                    const int *csrSortedRowPtrB, const int *csrSortedColIndB,
-                    const cusparseMatDescr_t descrC, int *csrSortedRowPtrC,
-                    int *nnzTotalDevHostPtr) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, cusparseOperation_t, int, int, int,
-      const cusparseMatDescr_t, const int, const int *, const int *,
-      const cusparseMatDescr_t, const int, const int *, const int *,
-      const cusparseMatDescr_t, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseXcsrgemmNnz");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, transB, m, n, k, descrA, nnzA,
-                  csrSortedRowPtrA, csrSortedColIndA, descrB, nnzB,
-                  csrSortedRowPtrB, csrSortedColIndB, descrC, csrSortedRowPtrC,
-                  nnzTotalDevHostPtr);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsrgemm(
-    cusparseHandle_t handle, cusparseOperation_t transA,
-    cusparseOperation_t transB, int m, int n, int k,
-    const cusparseMatDescr_t descrA, const int nnzA, const float *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-    const cusparseMatDescr_t descrB, const int nnzB, const float *csrSortedValB,
-    const int *csrSortedRowPtrB, const int *csrSortedColIndB,
-    const cusparseMatDescr_t descrC, float *csrSortedValC,
-    const int *csrSortedRowPtrC, int *csrSortedColIndC) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, cusparseOperation_t, int, int, int,
-      const cusparseMatDescr_t, const int, const float *, const int *,
-      const int *, const cusparseMatDescr_t, const int, const float *,
-      const int *, const int *, const cusparseMatDescr_t, float *, const int *,
-      int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsrgemm");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, transB, m, n, k, descrA, nnzA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, descrB, nnzB,
-                  csrSortedValB, csrSortedRowPtrB, csrSortedColIndB, descrC,
-                  csrSortedValC, csrSortedRowPtrC, csrSortedColIndC);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsrgemm(
-    cusparseHandle_t handle, cusparseOperation_t transA,
-    cusparseOperation_t transB, int m, int n, int k,
-    const cusparseMatDescr_t descrA, int nnzA, const double *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-    const cusparseMatDescr_t descrB, int nnzB, const double *csrSortedValB,
-    const int *csrSortedRowPtrB, const int *csrSortedColIndB,
-    const cusparseMatDescr_t descrC, double *csrSortedValC,
-    const int *csrSortedRowPtrC, int *csrSortedColIndC) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, cusparseOperation_t, int, int, int,
-      const cusparseMatDescr_t, int, const double *, const int *, const int *,
-      const cusparseMatDescr_t, int, const double *, const int *, const int *,
-      const cusparseMatDescr_t, double *, const int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsrgemm");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, transB, m, n, k, descrA, nnzA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, descrB, nnzB,
-                  csrSortedValB, csrSortedRowPtrB, csrSortedColIndB, descrC,
-                  csrSortedValC, csrSortedRowPtrC, csrSortedColIndC);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsrgemm(
-    cusparseHandle_t handle, cusparseOperation_t transA,
-    cusparseOperation_t transB, int m, int n, int k,
-    const cusparseMatDescr_t descrA, int nnzA, const cuComplex *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-    const cusparseMatDescr_t descrB, int nnzB, const cuComplex *csrSortedValB,
-    const int *csrSortedRowPtrB, const int *csrSortedColIndB,
-    const cusparseMatDescr_t descrC, cuComplex *csrSortedValC,
-    const int *csrSortedRowPtrC, int *csrSortedColIndC) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, cusparseOperation_t, int, int, int,
-      const cusparseMatDescr_t, int, const cuComplex *, const int *,
-      const int *, const cusparseMatDescr_t, int, const cuComplex *,
-      const int *, const int *, const cusparseMatDescr_t, cuComplex *,
-      const int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsrgemm");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, transB, m, n, k, descrA, nnzA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, descrB, nnzB,
-                  csrSortedValB, csrSortedRowPtrB, csrSortedColIndB, descrC,
-                  csrSortedValC, csrSortedRowPtrC, csrSortedColIndC);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsrgemm(
-    cusparseHandle_t handle, cusparseOperation_t transA,
-    cusparseOperation_t transB, int m, int n, int k,
-    const cusparseMatDescr_t descrA, int nnzA,
-    const cuDoubleComplex *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, const cusparseMatDescr_t descrB, int nnzB,
-    const cuDoubleComplex *csrSortedValB, const int *csrSortedRowPtrB,
-    const int *csrSortedColIndB, const cusparseMatDescr_t descrC,
-    cuDoubleComplex *csrSortedValC, const int *csrSortedRowPtrC,
-    int *csrSortedColIndC) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, cusparseOperation_t, int, int, int,
-      const cusparseMatDescr_t, int, const cuDoubleComplex *, const int *,
-      const int *, const cusparseMatDescr_t, int, const cuDoubleComplex *,
-      const int *, const int *, const cusparseMatDescr_t, cuDoubleComplex *,
-      const int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsrgemm");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, transB, m, n, k, descrA, nnzA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, descrB, nnzB,
-                  csrSortedValB, csrSortedRowPtrB, csrSortedColIndB, descrC,
-                  csrSortedValC, csrSortedRowPtrC, csrSortedColIndC);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCreateCsrgemm2Info(csrgemm2Info_t *info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(csrgemm2Info_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCreateCsrgemm2Info");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(info);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDestroyCsrgemm2Info(csrgemm2Info_t info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(csrgemm2Info_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDestroyCsrgemm2Info");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(info);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsrgemm2_bufferSizeExt(
-    cusparseHandle_t handle, int m, int n, int k, const float *alpha,
-    const cusparseMatDescr_t descrA, int nnzA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, const cusparseMatDescr_t descrB, int nnzB,
-    const int *csrSortedRowPtrB, const int *csrSortedColIndB, const float *beta,
-    const cusparseMatDescr_t descrD, int nnzD, const int *csrSortedRowPtrD,
-    const int *csrSortedColIndD, csrgemm2Info_t info,
-    size_t *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const float *, const cusparseMatDescr_t,
-      int, const int *, const int *, const cusparseMatDescr_t, int, const int *,
-      const int *, const float *, const cusparseMatDescr_t, int, const int *,
-      const int *, csrgemm2Info_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsrgemm2_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, k, alpha, descrA, nnzA, csrSortedRowPtrA,
-                  csrSortedColIndA, descrB, nnzB, csrSortedRowPtrB,
-                  csrSortedColIndB, beta, descrD, nnzD, csrSortedRowPtrD,
-                  csrSortedColIndD, info, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsrgemm2_bufferSizeExt(
-    cusparseHandle_t handle, int m, int n, int k, const double *alpha,
-    const cusparseMatDescr_t descrA, int nnzA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, const cusparseMatDescr_t descrB, int nnzB,
-    const int *csrSortedRowPtrB, const int *csrSortedColIndB,
-    const double *beta, const cusparseMatDescr_t descrD, int nnzD,
-    const int *csrSortedRowPtrD, const int *csrSortedColIndD,
-    csrgemm2Info_t info, size_t *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const double *, const cusparseMatDescr_t,
-      int, const int *, const int *, const cusparseMatDescr_t, int, const int *,
-      const int *, const double *, const cusparseMatDescr_t, int, const int *,
-      const int *, csrgemm2Info_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsrgemm2_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, k, alpha, descrA, nnzA, csrSortedRowPtrA,
-                  csrSortedColIndA, descrB, nnzB, csrSortedRowPtrB,
-                  csrSortedColIndB, beta, descrD, nnzD, csrSortedRowPtrD,
-                  csrSortedColIndD, info, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsrgemm2_bufferSizeExt(
-    cusparseHandle_t handle, int m, int n, int k, const cuComplex *alpha,
-    const cusparseMatDescr_t descrA, int nnzA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, const cusparseMatDescr_t descrB, int nnzB,
-    const int *csrSortedRowPtrB, const int *csrSortedColIndB,
-    const cuComplex *beta, const cusparseMatDescr_t descrD, int nnzD,
-    const int *csrSortedRowPtrD, const int *csrSortedColIndD,
-    csrgemm2Info_t info, size_t *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const cuComplex *,
-      const cusparseMatDescr_t, int, const int *, const int *,
-      const cusparseMatDescr_t, int, const int *, const int *,
-      const cuComplex *, const cusparseMatDescr_t, int, const int *,
-      const int *, csrgemm2Info_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsrgemm2_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, k, alpha, descrA, nnzA, csrSortedRowPtrA,
-                  csrSortedColIndA, descrB, nnzB, csrSortedRowPtrB,
-                  csrSortedColIndB, beta, descrD, nnzD, csrSortedRowPtrD,
-                  csrSortedColIndD, info, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsrgemm2_bufferSizeExt(
-    cusparseHandle_t handle, int m, int n, int k, const cuDoubleComplex *alpha,
-    const cusparseMatDescr_t descrA, int nnzA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, const cusparseMatDescr_t descrB, int nnzB,
-    const int *csrSortedRowPtrB, const int *csrSortedColIndB,
-    const cuDoubleComplex *beta, const cusparseMatDescr_t descrD, int nnzD,
-    const int *csrSortedRowPtrD, const int *csrSortedColIndD,
-    csrgemm2Info_t info, size_t *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const cuDoubleComplex *,
-      const cusparseMatDescr_t, int, const int *, const int *,
-      const cusparseMatDescr_t, int, const int *, const int *,
-      const cuDoubleComplex *, const cusparseMatDescr_t, int, const int *,
-      const int *, csrgemm2Info_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsrgemm2_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, k, alpha, descrA, nnzA, csrSortedRowPtrA,
-                  csrSortedColIndA, descrB, nnzB, csrSortedRowPtrB,
-                  csrSortedColIndB, beta, descrD, nnzD, csrSortedRowPtrD,
-                  csrSortedColIndD, info, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseXcsrgemm2Nnz(
-    cusparseHandle_t handle, int m, int n, int k,
-    const cusparseMatDescr_t descrA, int nnzA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, const cusparseMatDescr_t descrB, int nnzB,
-    const int *csrSortedRowPtrB, const int *csrSortedColIndB,
-    const cusparseMatDescr_t descrD, int nnzD, const int *csrSortedRowPtrD,
-    const int *csrSortedColIndD, const cusparseMatDescr_t descrC,
-    int *csrSortedRowPtrC, int *nnzTotalDevHostPtr, const csrgemm2Info_t info,
-    void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const cusparseMatDescr_t, int,
-      const int *, const int *, const cusparseMatDescr_t, int, const int *,
-      const int *, const cusparseMatDescr_t, int, const int *, const int *,
-      const cusparseMatDescr_t, int *, int *, const csrgemm2Info_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseXcsrgemm2Nnz");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, k, descrA, nnzA, csrSortedRowPtrA,
-                  csrSortedColIndA, descrB, nnzB, csrSortedRowPtrB,
-                  csrSortedColIndB, descrD, nnzD, csrSortedRowPtrD,
-                  csrSortedColIndD, descrC, csrSortedRowPtrC,
-                  nnzTotalDevHostPtr, info, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsrgemm2(
-    cusparseHandle_t handle, int m, int n, int k, const float *alpha,
-    const cusparseMatDescr_t descrA, int nnzA, const float *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-    const cusparseMatDescr_t descrB, int nnzB, const float *csrSortedValB,
-    const int *csrSortedRowPtrB, const int *csrSortedColIndB, const float *beta,
-    const cusparseMatDescr_t descrD, int nnzD, const float *csrSortedValD,
-    const int *csrSortedRowPtrD, const int *csrSortedColIndD,
-    const cusparseMatDescr_t descrC, float *csrSortedValC,
-    const int *csrSortedRowPtrC, int *csrSortedColIndC,
-    const csrgemm2Info_t info, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const float *, const cusparseMatDescr_t,
-      int, const float *, const int *, const int *, const cusparseMatDescr_t,
-      int, const float *, const int *, const int *, const float *,
-      const cusparseMatDescr_t, int, const float *, const int *, const int *,
-      const cusparseMatDescr_t, float *, const int *, int *,
-      const csrgemm2Info_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsrgemm2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, k, alpha, descrA, nnzA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, descrB, nnzB,
-                  csrSortedValB, csrSortedRowPtrB, csrSortedColIndB, beta,
-                  descrD, nnzD, csrSortedValD, csrSortedRowPtrD,
-                  csrSortedColIndD, descrC, csrSortedValC, csrSortedRowPtrC,
-                  csrSortedColIndC, info, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsrgemm2(
-    cusparseHandle_t handle, int m, int n, int k, const double *alpha,
-    const cusparseMatDescr_t descrA, int nnzA, const double *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-    const cusparseMatDescr_t descrB, int nnzB, const double *csrSortedValB,
-    const int *csrSortedRowPtrB, const int *csrSortedColIndB,
-    const double *beta, const cusparseMatDescr_t descrD, int nnzD,
-    const double *csrSortedValD, const int *csrSortedRowPtrD,
-    const int *csrSortedColIndD, const cusparseMatDescr_t descrC,
-    double *csrSortedValC, const int *csrSortedRowPtrC, int *csrSortedColIndC,
-    const csrgemm2Info_t info, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const double *, const cusparseMatDescr_t,
-      int, const double *, const int *, const int *, const cusparseMatDescr_t,
-      int, const double *, const int *, const int *, const double *,
-      const cusparseMatDescr_t, int, const double *, const int *, const int *,
-      const cusparseMatDescr_t, double *, const int *, int *,
-      const csrgemm2Info_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsrgemm2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, k, alpha, descrA, nnzA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, descrB, nnzB,
-                  csrSortedValB, csrSortedRowPtrB, csrSortedColIndB, beta,
-                  descrD, nnzD, csrSortedValD, csrSortedRowPtrD,
-                  csrSortedColIndD, descrC, csrSortedValC, csrSortedRowPtrC,
-                  csrSortedColIndC, info, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsrgemm2(
-    cusparseHandle_t handle, int m, int n, int k, const cuComplex *alpha,
-    const cusparseMatDescr_t descrA, int nnzA, const cuComplex *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-    const cusparseMatDescr_t descrB, int nnzB, const cuComplex *csrSortedValB,
-    const int *csrSortedRowPtrB, const int *csrSortedColIndB,
-    const cuComplex *beta, const cusparseMatDescr_t descrD, int nnzD,
-    const cuComplex *csrSortedValD, const int *csrSortedRowPtrD,
-    const int *csrSortedColIndD, const cusparseMatDescr_t descrC,
-    cuComplex *csrSortedValC, const int *csrSortedRowPtrC,
-    int *csrSortedColIndC, const csrgemm2Info_t info, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const cuComplex *,
-      const cusparseMatDescr_t, int, const cuComplex *, const int *,
-      const int *, const cusparseMatDescr_t, int, const cuComplex *,
-      const int *, const int *, const cuComplex *, const cusparseMatDescr_t,
-      int, const cuComplex *, const int *, const int *,
-      const cusparseMatDescr_t, cuComplex *, const int *, int *,
-      const csrgemm2Info_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsrgemm2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, k, alpha, descrA, nnzA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, descrB, nnzB,
-                  csrSortedValB, csrSortedRowPtrB, csrSortedColIndB, beta,
-                  descrD, nnzD, csrSortedValD, csrSortedRowPtrD,
-                  csrSortedColIndD, descrC, csrSortedValC, csrSortedRowPtrC,
-                  csrSortedColIndC, info, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsrgemm2(
-    cusparseHandle_t handle, int m, int n, int k, const cuDoubleComplex *alpha,
-    const cusparseMatDescr_t descrA, int nnzA,
-    const cuDoubleComplex *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, const cusparseMatDescr_t descrB, int nnzB,
-    const cuDoubleComplex *csrSortedValB, const int *csrSortedRowPtrB,
-    const int *csrSortedColIndB, const cuDoubleComplex *beta,
-    const cusparseMatDescr_t descrD, int nnzD,
-    const cuDoubleComplex *csrSortedValD, const int *csrSortedRowPtrD,
-    const int *csrSortedColIndD, const cusparseMatDescr_t descrC,
-    cuDoubleComplex *csrSortedValC, const int *csrSortedRowPtrC,
-    int *csrSortedColIndC, const csrgemm2Info_t info, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const cuDoubleComplex *,
-      const cusparseMatDescr_t, int, const cuDoubleComplex *, const int *,
-      const int *, const cusparseMatDescr_t, int, const cuDoubleComplex *,
-      const int *, const int *, const cuDoubleComplex *,
-      const cusparseMatDescr_t, int, const cuDoubleComplex *, const int *,
-      const int *, const cusparseMatDescr_t, cuDoubleComplex *, const int *,
-      int *, const csrgemm2Info_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsrgemm2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, k, alpha, descrA, nnzA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, descrB, nnzB,
-                  csrSortedValB, csrSortedRowPtrB, csrSortedColIndB, beta,
-                  descrD, nnzD, csrSortedValD, csrSortedRowPtrD,
-                  csrSortedColIndD, descrC, csrSortedValC, csrSortedRowPtrC,
-                  csrSortedColIndC, info, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseXcsrgeamNnz(
-    cusparseHandle_t handle, int m, int n, const cusparseMatDescr_t descrA,
-    int nnzA, const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-    const cusparseMatDescr_t descrB, int nnzB, const int *csrSortedRowPtrB,
-    const int *csrSortedColIndB, const cusparseMatDescr_t descrC,
-    int *csrSortedRowPtrC, int *nnzTotalDevHostPtr) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, int, const int *,
-      const int *, const cusparseMatDescr_t, int, const int *, const int *,
-      const cusparseMatDescr_t, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseXcsrgeamNnz");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, descrA, nnzA, csrSortedRowPtrA,
-                  csrSortedColIndA, descrB, nnzB, csrSortedRowPtrB,
-                  csrSortedColIndB, descrC, csrSortedRowPtrC,
-                  nnzTotalDevHostPtr);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsrgeam(
-    cusparseHandle_t handle, int m, int n, const float *alpha,
-    const cusparseMatDescr_t descrA, int nnzA, const float *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA, const float *beta,
-    const cusparseMatDescr_t descrB, int nnzB, const float *csrSortedValB,
-    const int *csrSortedRowPtrB, const int *csrSortedColIndB,
-    const cusparseMatDescr_t descrC, float *csrSortedValC,
-    int *csrSortedRowPtrC, int *csrSortedColIndC) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const float *, const cusparseMatDescr_t, int,
-      const float *, const int *, const int *, const float *,
-      const cusparseMatDescr_t, int, const float *, const int *, const int *,
-      const cusparseMatDescr_t, float *, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsrgeam");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, alpha, descrA, nnzA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, beta, descrB, nnzB,
-                  csrSortedValB, csrSortedRowPtrB, csrSortedColIndB, descrC,
-                  csrSortedValC, csrSortedRowPtrC, csrSortedColIndC);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsrgeam(
-    cusparseHandle_t handle, int m, int n, const double *alpha,
-    const cusparseMatDescr_t descrA, int nnzA, const double *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-    const double *beta, const cusparseMatDescr_t descrB, int nnzB,
-    const double *csrSortedValB, const int *csrSortedRowPtrB,
-    const int *csrSortedColIndB, const cusparseMatDescr_t descrC,
-    double *csrSortedValC, int *csrSortedRowPtrC, int *csrSortedColIndC) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const double *, const cusparseMatDescr_t, int,
-      const double *, const int *, const int *, const double *,
-      const cusparseMatDescr_t, int, const double *, const int *, const int *,
-      const cusparseMatDescr_t, double *, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsrgeam");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, alpha, descrA, nnzA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, beta, descrB, nnzB,
-                  csrSortedValB, csrSortedRowPtrB, csrSortedColIndB, descrC,
-                  csrSortedValC, csrSortedRowPtrC, csrSortedColIndC);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsrgeam(
-    cusparseHandle_t handle, int m, int n, const cuComplex *alpha,
-    const cusparseMatDescr_t descrA, int nnzA, const cuComplex *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-    const cuComplex *beta, const cusparseMatDescr_t descrB, int nnzB,
-    const cuComplex *csrSortedValB, const int *csrSortedRowPtrB,
-    const int *csrSortedColIndB, const cusparseMatDescr_t descrC,
-    cuComplex *csrSortedValC, int *csrSortedRowPtrC, int *csrSortedColIndC) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cuComplex *, const cusparseMatDescr_t,
-      int, const cuComplex *, const int *, const int *, const cuComplex *,
-      const cusparseMatDescr_t, int, const cuComplex *, const int *,
-      const int *, const cusparseMatDescr_t, cuComplex *, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsrgeam");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, alpha, descrA, nnzA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, beta, descrB, nnzB,
-                  csrSortedValB, csrSortedRowPtrB, csrSortedColIndB, descrC,
-                  csrSortedValC, csrSortedRowPtrC, csrSortedColIndC);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsrgeam(
-    cusparseHandle_t handle, int m, int n, const cuDoubleComplex *alpha,
-    const cusparseMatDescr_t descrA, int nnzA,
-    const cuDoubleComplex *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, const cuDoubleComplex *beta,
-    const cusparseMatDescr_t descrB, int nnzB,
-    const cuDoubleComplex *csrSortedValB, const int *csrSortedRowPtrB,
-    const int *csrSortedColIndB, const cusparseMatDescr_t descrC,
-    cuDoubleComplex *csrSortedValC, int *csrSortedRowPtrC,
-    int *csrSortedColIndC) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cuDoubleComplex *,
-      const cusparseMatDescr_t, int, const cuDoubleComplex *, const int *,
-      const int *, const cuDoubleComplex *, const cusparseMatDescr_t, int,
-      const cuDoubleComplex *, const int *, const int *,
-      const cusparseMatDescr_t, cuDoubleComplex *, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsrgeam");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, alpha, descrA, nnzA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, beta, descrB, nnzB,
-                  csrSortedValB, csrSortedRowPtrB, csrSortedColIndB, descrC,
-                  csrSortedValC, csrSortedRowPtrC, csrSortedColIndC);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsrgeam2_bufferSizeExt(
-    cusparseHandle_t handle, int m, int n, const float *alpha,
-    const cusparseMatDescr_t descrA, int nnzA, const float *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA, const float *beta,
-    const cusparseMatDescr_t descrB, int nnzB, const float *csrSortedValB,
-    const int *csrSortedRowPtrB, const int *csrSortedColIndB,
-    const cusparseMatDescr_t descrC, const float *csrSortedValC,
-    const int *csrSortedRowPtrC, const int *csrSortedColIndC,
-    size_t *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const float *, const cusparseMatDescr_t, int,
-      const float *, const int *, const int *, const float *,
-      const cusparseMatDescr_t, int, const float *, const int *, const int *,
-      const cusparseMatDescr_t, const float *, const int *, const int *,
-      size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsrgeam2_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, alpha, descrA, nnzA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, beta, descrB, nnzB,
-                  csrSortedValB, csrSortedRowPtrB, csrSortedColIndB, descrC,
-                  csrSortedValC, csrSortedRowPtrC, csrSortedColIndC,
-                  pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsrgeam2_bufferSizeExt(
-    cusparseHandle_t handle, int m, int n, const double *alpha,
-    const cusparseMatDescr_t descrA, int nnzA, const double *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-    const double *beta, const cusparseMatDescr_t descrB, int nnzB,
-    const double *csrSortedValB, const int *csrSortedRowPtrB,
-    const int *csrSortedColIndB, const cusparseMatDescr_t descrC,
-    const double *csrSortedValC, const int *csrSortedRowPtrC,
-    const int *csrSortedColIndC, size_t *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const double *, const cusparseMatDescr_t, int,
-      const double *, const int *, const int *, const double *,
-      const cusparseMatDescr_t, int, const double *, const int *, const int *,
-      const cusparseMatDescr_t, const double *, const int *, const int *,
-      size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsrgeam2_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, alpha, descrA, nnzA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, beta, descrB, nnzB,
-                  csrSortedValB, csrSortedRowPtrB, csrSortedColIndB, descrC,
-                  csrSortedValC, csrSortedRowPtrC, csrSortedColIndC,
-                  pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsrgeam2_bufferSizeExt(
-    cusparseHandle_t handle, int m, int n, const cuComplex *alpha,
-    const cusparseMatDescr_t descrA, int nnzA, const cuComplex *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-    const cuComplex *beta, const cusparseMatDescr_t descrB, int nnzB,
-    const cuComplex *csrSortedValB, const int *csrSortedRowPtrB,
-    const int *csrSortedColIndB, const cusparseMatDescr_t descrC,
-    const cuComplex *csrSortedValC, const int *csrSortedRowPtrC,
-    const int *csrSortedColIndC, size_t *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cuComplex *, const cusparseMatDescr_t,
-      int, const cuComplex *, const int *, const int *, const cuComplex *,
-      const cusparseMatDescr_t, int, const cuComplex *, const int *,
-      const int *, const cusparseMatDescr_t, const cuComplex *, const int *,
-      const int *, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsrgeam2_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, alpha, descrA, nnzA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, beta, descrB, nnzB,
-                  csrSortedValB, csrSortedRowPtrB, csrSortedColIndB, descrC,
-                  csrSortedValC, csrSortedRowPtrC, csrSortedColIndC,
-                  pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsrgeam2_bufferSizeExt(
-    cusparseHandle_t handle, int m, int n, const cuDoubleComplex *alpha,
-    const cusparseMatDescr_t descrA, int nnzA,
-    const cuDoubleComplex *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, const cuDoubleComplex *beta,
-    const cusparseMatDescr_t descrB, int nnzB,
-    const cuDoubleComplex *csrSortedValB, const int *csrSortedRowPtrB,
-    const int *csrSortedColIndB, const cusparseMatDescr_t descrC,
-    const cuDoubleComplex *csrSortedValC, const int *csrSortedRowPtrC,
-    const int *csrSortedColIndC, size_t *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cuDoubleComplex *,
-      const cusparseMatDescr_t, int, const cuDoubleComplex *, const int *,
-      const int *, const cuDoubleComplex *, const cusparseMatDescr_t, int,
-      const cuDoubleComplex *, const int *, const int *,
-      const cusparseMatDescr_t, const cuDoubleComplex *, const int *,
-      const int *, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsrgeam2_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, alpha, descrA, nnzA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, beta, descrB, nnzB,
-                  csrSortedValB, csrSortedRowPtrB, csrSortedColIndB, descrC,
-                  csrSortedValC, csrSortedRowPtrC, csrSortedColIndC,
-                  pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseXcsrgeam2Nnz(
-    cusparseHandle_t handle, int m, int n, const cusparseMatDescr_t descrA,
-    int nnzA, const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-    const cusparseMatDescr_t descrB, int nnzB, const int *csrSortedRowPtrB,
-    const int *csrSortedColIndB, const cusparseMatDescr_t descrC,
-    int *csrSortedRowPtrC, int *nnzTotalDevHostPtr, void *workspace) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, int, const int *,
-      const int *, const cusparseMatDescr_t, int, const int *, const int *,
-      const cusparseMatDescr_t, int *, int *, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseXcsrgeam2Nnz");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, descrA, nnzA, csrSortedRowPtrA,
-                  csrSortedColIndA, descrB, nnzB, csrSortedRowPtrB,
-                  csrSortedColIndB, descrC, csrSortedRowPtrC,
-                  nnzTotalDevHostPtr, workspace);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsrgeam2(
-    cusparseHandle_t handle, int m, int n, const float *alpha,
-    const cusparseMatDescr_t descrA, int nnzA, const float *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA, const float *beta,
-    const cusparseMatDescr_t descrB, int nnzB, const float *csrSortedValB,
-    const int *csrSortedRowPtrB, const int *csrSortedColIndB,
-    const cusparseMatDescr_t descrC, float *csrSortedValC,
-    int *csrSortedRowPtrC, int *csrSortedColIndC, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const float *, const cusparseMatDescr_t, int,
-      const float *, const int *, const int *, const float *,
-      const cusparseMatDescr_t, int, const float *, const int *, const int *,
-      const cusparseMatDescr_t, float *, int *, int *, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsrgeam2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, alpha, descrA, nnzA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, beta, descrB, nnzB,
-                  csrSortedValB, csrSortedRowPtrB, csrSortedColIndB, descrC,
-                  csrSortedValC, csrSortedRowPtrC, csrSortedColIndC, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsrgeam2(
-    cusparseHandle_t handle, int m, int n, const double *alpha,
-    const cusparseMatDescr_t descrA, int nnzA, const double *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-    const double *beta, const cusparseMatDescr_t descrB, int nnzB,
-    const double *csrSortedValB, const int *csrSortedRowPtrB,
-    const int *csrSortedColIndB, const cusparseMatDescr_t descrC,
-    double *csrSortedValC, int *csrSortedRowPtrC, int *csrSortedColIndC,
-    void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const double *, const cusparseMatDescr_t, int,
-      const double *, const int *, const int *, const double *,
-      const cusparseMatDescr_t, int, const double *, const int *, const int *,
-      const cusparseMatDescr_t, double *, int *, int *, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsrgeam2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, alpha, descrA, nnzA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, beta, descrB, nnzB,
-                  csrSortedValB, csrSortedRowPtrB, csrSortedColIndB, descrC,
-                  csrSortedValC, csrSortedRowPtrC, csrSortedColIndC, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsrgeam2(
-    cusparseHandle_t handle, int m, int n, const cuComplex *alpha,
-    const cusparseMatDescr_t descrA, int nnzA, const cuComplex *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-    const cuComplex *beta, const cusparseMatDescr_t descrB, int nnzB,
-    const cuComplex *csrSortedValB, const int *csrSortedRowPtrB,
-    const int *csrSortedColIndB, const cusparseMatDescr_t descrC,
-    cuComplex *csrSortedValC, int *csrSortedRowPtrC, int *csrSortedColIndC,
-    void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cuComplex *, const cusparseMatDescr_t,
-      int, const cuComplex *, const int *, const int *, const cuComplex *,
-      const cusparseMatDescr_t, int, const cuComplex *, const int *,
-      const int *, const cusparseMatDescr_t, cuComplex *, int *, int *, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsrgeam2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, alpha, descrA, nnzA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, beta, descrB, nnzB,
-                  csrSortedValB, csrSortedRowPtrB, csrSortedColIndB, descrC,
-                  csrSortedValC, csrSortedRowPtrC, csrSortedColIndC, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsrgeam2(
-    cusparseHandle_t handle, int m, int n, const cuDoubleComplex *alpha,
-    const cusparseMatDescr_t descrA, int nnzA,
-    const cuDoubleComplex *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, const cuDoubleComplex *beta,
-    const cusparseMatDescr_t descrB, int nnzB,
-    const cuDoubleComplex *csrSortedValB, const int *csrSortedRowPtrB,
-    const int *csrSortedColIndB, const cusparseMatDescr_t descrC,
-    cuDoubleComplex *csrSortedValC, int *csrSortedRowPtrC,
-    int *csrSortedColIndC, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cuDoubleComplex *,
-      const cusparseMatDescr_t, int, const cuDoubleComplex *, const int *,
-      const int *, const cuDoubleComplex *, const cusparseMatDescr_t, int,
-      const cuDoubleComplex *, const int *, const int *,
-      const cusparseMatDescr_t, cuDoubleComplex *, int *, int *, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsrgeam2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, alpha, descrA, nnzA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, beta, descrB, nnzB,
-                  csrSortedValB, csrSortedRowPtrB, csrSortedColIndB, descrC,
-                  csrSortedValC, csrSortedRowPtrC, csrSortedColIndC, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsrcolor(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    const float *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, const float *fractionToColor, int *ncolors,
-    int *coloring, int *reordering, const cusparseColorInfo_t info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, const float *,
-      const int *, const int *, const float *, int *, int *, int *,
-      const cusparseColorInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsrcolor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, fractionToColor, ncolors, coloring,
-                  reordering, info);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsrcolor(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    const double *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, const double *fractionToColor, int *ncolors,
-    int *coloring, int *reordering, const cusparseColorInfo_t info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, const double *,
-      const int *, const int *, const double *, int *, int *, int *,
-      const cusparseColorInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsrcolor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, fractionToColor, ncolors, coloring,
-                  reordering, info);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsrcolor(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    const cuComplex *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, const float *fractionToColor, int *ncolors,
-    int *coloring, int *reordering, const cusparseColorInfo_t info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, const cuComplex *,
-      const int *, const int *, const float *, int *, int *, int *,
-      const cusparseColorInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsrcolor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, fractionToColor, ncolors, coloring,
-                  reordering, info);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsrcolor(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    const cuDoubleComplex *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, const double *fractionToColor, int *ncolors,
-    int *coloring, int *reordering, const cusparseColorInfo_t info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t,
-      const cuDoubleComplex *, const int *, const int *, const double *, int *,
-      int *, int *, const cusparseColorInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsrcolor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, fractionToColor, ncolors, coloring,
-                  reordering, info);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseSnnz(cusparseHandle_t handle, cusparseDirection_t dirA, int m, int n,
-             const cusparseMatDescr_t descrA, const float *A, int lda,
-             int *nnzPerRowCol, int *nnzTotalDevHostPtr) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const float *, int, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSnnz");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, m, n, descrA, A, lda, nnzPerRowCol,
-                  nnzTotalDevHostPtr);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseDnnz(cusparseHandle_t handle, cusparseDirection_t dirA, int m, int n,
-             const cusparseMatDescr_t descrA, const double *A, int lda,
-             int *nnzPerRowCol, int *nnzTotalDevHostPtr) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const double *, int, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDnnz");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, m, n, descrA, A, lda, nnzPerRowCol,
-                  nnzTotalDevHostPtr);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseCnnz(cusparseHandle_t handle, cusparseDirection_t dirA, int m, int n,
-             const cusparseMatDescr_t descrA, const cuComplex *A, int lda,
-             int *nnzPerRowCol, int *nnzTotalDevHostPtr) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const cuComplex *, int, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCnnz");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, m, n, descrA, A, lda, nnzPerRowCol,
-                  nnzTotalDevHostPtr);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseZnnz(cusparseHandle_t handle, cusparseDirection_t dirA, int m, int n,
-             const cusparseMatDescr_t descrA, const cuDoubleComplex *A, int lda,
-             int *nnzPerRowCol, int *nnzTotalDevHostPtr) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const cuDoubleComplex *, int, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZnnz");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, m, n, descrA, A, lda, nnzPerRowCol,
-                  nnzTotalDevHostPtr);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSnnz_compress(
-    cusparseHandle_t handle, int m, const cusparseMatDescr_t descr,
-    const float *csrSortedValA, const int *csrSortedRowPtrA, int *nnzPerRow,
-    int *nnzC, float tol) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, const cusparseMatDescr_t, const float *,
-      const int *, int *, int *, float);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSnnz_compress");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, descr, csrSortedValA, csrSortedRowPtrA, nnzPerRow,
-                  nnzC, tol);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDnnz_compress(
-    cusparseHandle_t handle, int m, const cusparseMatDescr_t descr,
-    const double *csrSortedValA, const int *csrSortedRowPtrA, int *nnzPerRow,
-    int *nnzC, double tol) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, const cusparseMatDescr_t, const double *,
-      const int *, int *, int *, double);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDnnz_compress");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, descr, csrSortedValA, csrSortedRowPtrA, nnzPerRow,
-                  nnzC, tol);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCnnz_compress(
-    cusparseHandle_t handle, int m, const cusparseMatDescr_t descr,
-    const cuComplex *csrSortedValA, const int *csrSortedRowPtrA, int *nnzPerRow,
-    int *nnzC, cuComplex tol) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, const cusparseMatDescr_t, const cuComplex *,
-      const int *, int *, int *, cuComplex);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCnnz_compress");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, descr, csrSortedValA, csrSortedRowPtrA, nnzPerRow,
-                  nnzC, tol);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZnnz_compress(
-    cusparseHandle_t handle, int m, const cusparseMatDescr_t descr,
-    const cuDoubleComplex *csrSortedValA, const int *csrSortedRowPtrA,
-    int *nnzPerRow, int *nnzC, cuDoubleComplex tol) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, const cusparseMatDescr_t, const cuDoubleComplex *,
-      const int *, int *, int *, cuDoubleComplex);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZnnz_compress");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, descr, csrSortedValA, csrSortedRowPtrA, nnzPerRow,
-                  nnzC, tol);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsr2csr_compress(
-    cusparseHandle_t handle, int m, int n, const cusparseMatDescr_t descrA,
-    const float *csrSortedValA, const int *csrSortedColIndA,
-    const int *csrSortedRowPtrA, int nnzA, const int *nnzPerRow,
-    float *csrSortedValC, int *csrSortedColIndC, int *csrSortedRowPtrC,
-    float tol) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, const float *,
-      const int *, const int *, int, const int *, float *, int *, int *, float);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsr2csr_compress");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, descrA, csrSortedValA, csrSortedColIndA,
-                  csrSortedRowPtrA, nnzA, nnzPerRow, csrSortedValC,
-                  csrSortedColIndC, csrSortedRowPtrC, tol);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsr2csr_compress(
-    cusparseHandle_t handle, int m, int n, const cusparseMatDescr_t descrA,
-    const double *csrSortedValA, const int *csrSortedColIndA,
-    const int *csrSortedRowPtrA, int nnzA, const int *nnzPerRow,
-    double *csrSortedValC, int *csrSortedColIndC, int *csrSortedRowPtrC,
-    double tol) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, const double *,
-      const int *, const int *, int, const int *, double *, int *, int *,
-      double);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsr2csr_compress");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, descrA, csrSortedValA, csrSortedColIndA,
-                  csrSortedRowPtrA, nnzA, nnzPerRow, csrSortedValC,
-                  csrSortedColIndC, csrSortedRowPtrC, tol);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsr2csr_compress(
-    cusparseHandle_t handle, int m, int n, const cusparseMatDescr_t descrA,
-    const cuComplex *csrSortedValA, const int *csrSortedColIndA,
-    const int *csrSortedRowPtrA, int nnzA, const int *nnzPerRow,
-    cuComplex *csrSortedValC, int *csrSortedColIndC, int *csrSortedRowPtrC,
-    cuComplex tol) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, const cuComplex *,
-      const int *, const int *, int, const int *, cuComplex *, int *, int *,
-      cuComplex);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsr2csr_compress");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, descrA, csrSortedValA, csrSortedColIndA,
-                  csrSortedRowPtrA, nnzA, nnzPerRow, csrSortedValC,
-                  csrSortedColIndC, csrSortedRowPtrC, tol);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsr2csr_compress(
-    cusparseHandle_t handle, int m, int n, const cusparseMatDescr_t descrA,
-    const cuDoubleComplex *csrSortedValA, const int *csrSortedColIndA,
-    const int *csrSortedRowPtrA, int nnzA, const int *nnzPerRow,
-    cuDoubleComplex *csrSortedValC, int *csrSortedColIndC,
-    int *csrSortedRowPtrC, cuDoubleComplex tol) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t,
-      const cuDoubleComplex *, const int *, const int *, int, const int *,
-      cuDoubleComplex *, int *, int *, cuDoubleComplex);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsr2csr_compress");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, descrA, csrSortedValA, csrSortedColIndA,
-                  csrSortedRowPtrA, nnzA, nnzPerRow, csrSortedValC,
-                  csrSortedColIndC, csrSortedRowPtrC, tol);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSdense2csr(
-    cusparseHandle_t handle, int m, int n, const cusparseMatDescr_t descrA,
-    const float *A, int lda, const int *nnzPerRow, float *csrSortedValA,
-    int *csrSortedRowPtrA, int *csrSortedColIndA) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, const float *, int,
-      const int *, float *, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSdense2csr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, descrA, A, lda, nnzPerRow, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDdense2csr(
-    cusparseHandle_t handle, int m, int n, const cusparseMatDescr_t descrA,
-    const double *A, int lda, const int *nnzPerRow, double *csrSortedValA,
-    int *csrSortedRowPtrA, int *csrSortedColIndA) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, const double *, int,
-      const int *, double *, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDdense2csr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, descrA, A, lda, nnzPerRow, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCdense2csr(
-    cusparseHandle_t handle, int m, int n, const cusparseMatDescr_t descrA,
-    const cuComplex *A, int lda, const int *nnzPerRow, cuComplex *csrSortedValA,
-    int *csrSortedRowPtrA, int *csrSortedColIndA) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, const cuComplex *,
-      int, const int *, cuComplex *, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCdense2csr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, descrA, A, lda, nnzPerRow, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZdense2csr(
-    cusparseHandle_t handle, int m, int n, const cusparseMatDescr_t descrA,
-    const cuDoubleComplex *A, int lda, const int *nnzPerRow,
-    cuDoubleComplex *csrSortedValA, int *csrSortedRowPtrA,
-    int *csrSortedColIndA) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t,
-      const cuDoubleComplex *, int, const int *, cuDoubleComplex *, int *,
-      int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZdense2csr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, descrA, A, lda, nnzPerRow, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsr2dense(
-    cusparseHandle_t handle, int m, int n, const cusparseMatDescr_t descrA,
-    const float *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, float *A, int lda) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, const float *,
-      const int *, const int *, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsr2dense");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, A, lda);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsr2dense(
-    cusparseHandle_t handle, int m, int n, const cusparseMatDescr_t descrA,
-    const double *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, double *A, int lda) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, const double *,
-      const int *, const int *, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsr2dense");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, A, lda);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsr2dense(
-    cusparseHandle_t handle, int m, int n, const cusparseMatDescr_t descrA,
-    const cuComplex *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, cuComplex *A, int lda) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, const cuComplex *,
-      const int *, const int *, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsr2dense");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, A, lda);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsr2dense(
-    cusparseHandle_t handle, int m, int n, const cusparseMatDescr_t descrA,
-    const cuDoubleComplex *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, cuDoubleComplex *A, int lda) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t,
-      const cuDoubleComplex *, const int *, const int *, cuDoubleComplex *,
-      int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsr2dense");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, A, lda);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSdense2csc(
-    cusparseHandle_t handle, int m, int n, const cusparseMatDescr_t descrA,
-    const float *A, int lda, const int *nnzPerCol, float *cscSortedValA,
-    int *cscSortedRowIndA, int *cscSortedColPtrA) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, const float *, int,
-      const int *, float *, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSdense2csc");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, descrA, A, lda, nnzPerCol, cscSortedValA,
-                  cscSortedRowIndA, cscSortedColPtrA);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDdense2csc(
-    cusparseHandle_t handle, int m, int n, const cusparseMatDescr_t descrA,
-    const double *A, int lda, const int *nnzPerCol, double *cscSortedValA,
-    int *cscSortedRowIndA, int *cscSortedColPtrA) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, const double *, int,
-      const int *, double *, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDdense2csc");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, descrA, A, lda, nnzPerCol, cscSortedValA,
-                  cscSortedRowIndA, cscSortedColPtrA);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCdense2csc(
-    cusparseHandle_t handle, int m, int n, const cusparseMatDescr_t descrA,
-    const cuComplex *A, int lda, const int *nnzPerCol, cuComplex *cscSortedValA,
-    int *cscSortedRowIndA, int *cscSortedColPtrA) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, const cuComplex *,
-      int, const int *, cuComplex *, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCdense2csc");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, descrA, A, lda, nnzPerCol, cscSortedValA,
-                  cscSortedRowIndA, cscSortedColPtrA);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZdense2csc(
-    cusparseHandle_t handle, int m, int n, const cusparseMatDescr_t descrA,
-    const cuDoubleComplex *A, int lda, const int *nnzPerCol,
-    cuDoubleComplex *cscSortedValA, int *cscSortedRowIndA,
-    int *cscSortedColPtrA) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t,
-      const cuDoubleComplex *, int, const int *, cuDoubleComplex *, int *,
-      int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZdense2csc");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, descrA, A, lda, nnzPerCol, cscSortedValA,
-                  cscSortedRowIndA, cscSortedColPtrA);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsc2dense(
-    cusparseHandle_t handle, int m, int n, const cusparseMatDescr_t descrA,
-    const float *cscSortedValA, const int *cscSortedRowIndA,
-    const int *cscSortedColPtrA, float *A, int lda) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, const float *,
-      const int *, const int *, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsc2dense");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, descrA, cscSortedValA, cscSortedRowIndA,
-                  cscSortedColPtrA, A, lda);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsc2dense(
-    cusparseHandle_t handle, int m, int n, const cusparseMatDescr_t descrA,
-    const double *cscSortedValA, const int *cscSortedRowIndA,
-    const int *cscSortedColPtrA, double *A, int lda) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, const double *,
-      const int *, const int *, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsc2dense");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, descrA, cscSortedValA, cscSortedRowIndA,
-                  cscSortedColPtrA, A, lda);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsc2dense(
-    cusparseHandle_t handle, int m, int n, const cusparseMatDescr_t descrA,
-    const cuComplex *cscSortedValA, const int *cscSortedRowIndA,
-    const int *cscSortedColPtrA, cuComplex *A, int lda) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, const cuComplex *,
-      const int *, const int *, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsc2dense");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, descrA, cscSortedValA, cscSortedRowIndA,
-                  cscSortedColPtrA, A, lda);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsc2dense(
-    cusparseHandle_t handle, int m, int n, const cusparseMatDescr_t descrA,
-    const cuDoubleComplex *cscSortedValA, const int *cscSortedRowIndA,
-    const int *cscSortedColPtrA, cuDoubleComplex *A, int lda) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t,
-      const cuDoubleComplex *, const int *, const int *, cuDoubleComplex *,
-      int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsc2dense");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, descrA, cscSortedValA, cscSortedRowIndA,
-                  cscSortedColPtrA, A, lda);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseXcoo2csr(cusparseHandle_t handle,
-                                              const int *cooRowInd, int nnz,
-                                              int m, int *csrSortedRowPtr,
-                                              cusparseIndexBase_t idxBase) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, const int *, int, int, int *, cusparseIndexBase_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseXcoo2csr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, cooRowInd, nnz, m, csrSortedRowPtr, idxBase);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseXcsr2coo(cusparseHandle_t handle,
-                                              const int *csrSortedRowPtr,
-                                              int nnz, int m, int *cooRowInd,
-                                              cusparseIndexBase_t idxBase) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, const int *, int, int, int *, cusparseIndexBase_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseXcsr2coo");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, csrSortedRowPtr, nnz, m, cooRowInd, idxBase);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCsr2cscEx(
-    cusparseHandle_t handle, int m, int n, int nnz, const void *csrSortedVal,
-    cudaDataType csrSortedValtype, const int *csrSortedRowPtr,
-    const int *csrSortedColInd, void *cscSortedVal,
-    cudaDataType cscSortedValtype, int *cscSortedRowInd, int *cscSortedColPtr,
-    cusparseAction_t copyValues, cusparseIndexBase_t idxBase,
-    cudaDataType executiontype) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const void *, cudaDataType, const int *,
-      const int *, void *, cudaDataType, int *, int *, cusparseAction_t,
-      cusparseIndexBase_t, cudaDataType);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCsr2cscEx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnz, csrSortedVal, csrSortedValtype,
-                  csrSortedRowPtr, csrSortedColInd, cscSortedVal,
-                  cscSortedValtype, cscSortedRowInd, cscSortedColPtr,
-                  copyValues, idxBase, executiontype);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsr2csc(
-    cusparseHandle_t handle, int m, int n, int nnz, const float *csrSortedVal,
-    const int *csrSortedRowPtr, const int *csrSortedColInd, float *cscSortedVal,
-    int *cscSortedRowInd, int *cscSortedColPtr, cusparseAction_t copyValues,
-    cusparseIndexBase_t idxBase) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const float *, const int *, const int *,
-      float *, int *, int *, cusparseAction_t, cusparseIndexBase_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsr2csc");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnz, csrSortedVal, csrSortedRowPtr,
-                  csrSortedColInd, cscSortedVal, cscSortedRowInd,
-                  cscSortedColPtr, copyValues, idxBase);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsr2csc(
-    cusparseHandle_t handle, int m, int n, int nnz, const double *csrSortedVal,
-    const int *csrSortedRowPtr, const int *csrSortedColInd,
-    double *cscSortedVal, int *cscSortedRowInd, int *cscSortedColPtr,
-    cusparseAction_t copyValues, cusparseIndexBase_t idxBase) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const double *, const int *, const int *,
-      double *, int *, int *, cusparseAction_t, cusparseIndexBase_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsr2csc");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnz, csrSortedVal, csrSortedRowPtr,
-                  csrSortedColInd, cscSortedVal, cscSortedRowInd,
-                  cscSortedColPtr, copyValues, idxBase);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseCcsr2csc(cusparseHandle_t handle, int m, int n, int nnz,
-                 const cuComplex *csrSortedVal, const int *csrSortedRowPtr,
-                 const int *csrSortedColInd, cuComplex *cscSortedVal,
-                 int *cscSortedRowInd, int *cscSortedColPtr,
-                 cusparseAction_t copyValues, cusparseIndexBase_t idxBase) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const cuComplex *, const int *,
-      const int *, cuComplex *, int *, int *, cusparseAction_t,
-      cusparseIndexBase_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsr2csc");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnz, csrSortedVal, csrSortedRowPtr,
-                  csrSortedColInd, cscSortedVal, cscSortedRowInd,
-                  cscSortedColPtr, copyValues, idxBase);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsr2csc(
-    cusparseHandle_t handle, int m, int n, int nnz,
-    const cuDoubleComplex *csrSortedVal, const int *csrSortedRowPtr,
-    const int *csrSortedColInd, cuDoubleComplex *cscSortedVal,
-    int *cscSortedRowInd, int *cscSortedColPtr, cusparseAction_t copyValues,
-    cusparseIndexBase_t idxBase) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const cuDoubleComplex *, const int *,
-      const int *, cuDoubleComplex *, int *, int *, cusparseAction_t,
-      cusparseIndexBase_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsr2csc");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnz, csrSortedVal, csrSortedRowPtr,
-                  csrSortedColInd, cscSortedVal, cscSortedRowInd,
-                  cscSortedColPtr, copyValues, idxBase);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSdense2hyb(
-    cusparseHandle_t handle, int m, int n, const cusparseMatDescr_t descrA,
-    const float *A, int lda, const int *nnzPerRow, cusparseHybMat_t hybA,
-    int userEllWidth, cusparseHybPartition_t partitionType) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, const float *, int,
-      const int *, cusparseHybMat_t, int, cusparseHybPartition_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSdense2hyb");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, descrA, A, lda, nnzPerRow, hybA, userEllWidth,
-                  partitionType);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDdense2hyb(
-    cusparseHandle_t handle, int m, int n, const cusparseMatDescr_t descrA,
-    const double *A, int lda, const int *nnzPerRow, cusparseHybMat_t hybA,
-    int userEllWidth, cusparseHybPartition_t partitionType) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, const double *, int,
-      const int *, cusparseHybMat_t, int, cusparseHybPartition_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDdense2hyb");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, descrA, A, lda, nnzPerRow, hybA, userEllWidth,
-                  partitionType);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCdense2hyb(
-    cusparseHandle_t handle, int m, int n, const cusparseMatDescr_t descrA,
-    const cuComplex *A, int lda, const int *nnzPerRow, cusparseHybMat_t hybA,
-    int userEllWidth, cusparseHybPartition_t partitionType) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, const cuComplex *,
-      int, const int *, cusparseHybMat_t, int, cusparseHybPartition_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCdense2hyb");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, descrA, A, lda, nnzPerRow, hybA, userEllWidth,
-                  partitionType);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseZdense2hyb(cusparseHandle_t handle, int m, int n,
-                   const cusparseMatDescr_t descrA, const cuDoubleComplex *A,
-                   int lda, const int *nnzPerRow, cusparseHybMat_t hybA,
-                   int userEllWidth, cusparseHybPartition_t partitionType) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t,
-      const cuDoubleComplex *, int, const int *, cusparseHybMat_t, int,
-      cusparseHybPartition_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZdense2hyb");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, descrA, A, lda, nnzPerRow, hybA, userEllWidth,
-                  partitionType);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseShyb2dense(cusparseHandle_t handle,
-                                                const cusparseMatDescr_t descrA,
-                                                const cusparseHybMat_t hybA,
-                                                float *A, int lda) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, const cusparseMatDescr_t, const cusparseHybMat_t,
-      float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseShyb2dense");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, descrA, hybA, A, lda);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDhyb2dense(cusparseHandle_t handle,
-                                                const cusparseMatDescr_t descrA,
-                                                const cusparseHybMat_t hybA,
-                                                double *A, int lda) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, const cusparseMatDescr_t, const cusparseHybMat_t,
-      double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDhyb2dense");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, descrA, hybA, A, lda);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseChyb2dense(cusparseHandle_t handle,
-                                                const cusparseMatDescr_t descrA,
-                                                const cusparseHybMat_t hybA,
-                                                cuComplex *A, int lda) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, const cusparseMatDescr_t, const cusparseHybMat_t,
-      cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseChyb2dense");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, descrA, hybA, A, lda);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZhyb2dense(cusparseHandle_t handle,
-                                                const cusparseMatDescr_t descrA,
-                                                const cusparseHybMat_t hybA,
-                                                cuDoubleComplex *A, int lda) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, const cusparseMatDescr_t, const cusparseHybMat_t,
-      cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZhyb2dense");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, descrA, hybA, A, lda);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsr2hyb(
-    cusparseHandle_t handle, int m, int n, const cusparseMatDescr_t descrA,
-    const float *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, cusparseHybMat_t hybA, int userEllWidth,
-    cusparseHybPartition_t partitionType) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, const float *,
-      const int *, const int *, cusparseHybMat_t, int, cusparseHybPartition_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsr2hyb");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, hybA, userEllWidth, partitionType);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsr2hyb(
-    cusparseHandle_t handle, int m, int n, const cusparseMatDescr_t descrA,
-    const double *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, cusparseHybMat_t hybA, int userEllWidth,
-    cusparseHybPartition_t partitionType) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, const double *,
-      const int *, const int *, cusparseHybMat_t, int, cusparseHybPartition_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsr2hyb");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, hybA, userEllWidth, partitionType);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsr2hyb(
-    cusparseHandle_t handle, int m, int n, const cusparseMatDescr_t descrA,
-    const cuComplex *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, cusparseHybMat_t hybA, int userEllWidth,
-    cusparseHybPartition_t partitionType) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, const cuComplex *,
-      const int *, const int *, cusparseHybMat_t, int, cusparseHybPartition_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsr2hyb");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, hybA, userEllWidth, partitionType);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsr2hyb(
-    cusparseHandle_t handle, int m, int n, const cusparseMatDescr_t descrA,
-    const cuDoubleComplex *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, cusparseHybMat_t hybA, int userEllWidth,
-    cusparseHybPartition_t partitionType) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t,
-      const cuDoubleComplex *, const int *, const int *, cusparseHybMat_t, int,
-      cusparseHybPartition_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsr2hyb");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, hybA, userEllWidth, partitionType);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseShyb2csr(cusparseHandle_t handle,
-                                              const cusparseMatDescr_t descrA,
-                                              const cusparseHybMat_t hybA,
-                                              float *csrSortedValA,
-                                              int *csrSortedRowPtrA,
-                                              int *csrSortedColIndA) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, const cusparseMatDescr_t, const cusparseHybMat_t,
-      float *, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseShyb2csr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, descrA, hybA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDhyb2csr(cusparseHandle_t handle,
-                                              const cusparseMatDescr_t descrA,
-                                              const cusparseHybMat_t hybA,
-                                              double *csrSortedValA,
-                                              int *csrSortedRowPtrA,
-                                              int *csrSortedColIndA) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, const cusparseMatDescr_t, const cusparseHybMat_t,
-      double *, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDhyb2csr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, descrA, hybA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseChyb2csr(cusparseHandle_t handle,
-                                              const cusparseMatDescr_t descrA,
-                                              const cusparseHybMat_t hybA,
-                                              cuComplex *csrSortedValA,
-                                              int *csrSortedRowPtrA,
-                                              int *csrSortedColIndA) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, const cusparseMatDescr_t, const cusparseHybMat_t,
-      cuComplex *, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseChyb2csr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, descrA, hybA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZhyb2csr(cusparseHandle_t handle,
-                                              const cusparseMatDescr_t descrA,
-                                              const cusparseHybMat_t hybA,
-                                              cuDoubleComplex *csrSortedValA,
-                                              int *csrSortedRowPtrA,
-                                              int *csrSortedColIndA) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, const cusparseMatDescr_t, const cusparseHybMat_t,
-      cuDoubleComplex *, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZhyb2csr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, descrA, hybA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsc2hyb(
-    cusparseHandle_t handle, int m, int n, const cusparseMatDescr_t descrA,
-    const float *cscSortedValA, const int *cscSortedRowIndA,
-    const int *cscSortedColPtrA, cusparseHybMat_t hybA, int userEllWidth,
-    cusparseHybPartition_t partitionType) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, const float *,
-      const int *, const int *, cusparseHybMat_t, int, cusparseHybPartition_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsc2hyb");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, descrA, cscSortedValA, cscSortedRowIndA,
-                  cscSortedColPtrA, hybA, userEllWidth, partitionType);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsc2hyb(
-    cusparseHandle_t handle, int m, int n, const cusparseMatDescr_t descrA,
-    const double *cscSortedValA, const int *cscSortedRowIndA,
-    const int *cscSortedColPtrA, cusparseHybMat_t hybA, int userEllWidth,
-    cusparseHybPartition_t partitionType) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, const double *,
-      const int *, const int *, cusparseHybMat_t, int, cusparseHybPartition_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsc2hyb");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, descrA, cscSortedValA, cscSortedRowIndA,
-                  cscSortedColPtrA, hybA, userEllWidth, partitionType);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsc2hyb(
-    cusparseHandle_t handle, int m, int n, const cusparseMatDescr_t descrA,
-    const cuComplex *cscSortedValA, const int *cscSortedRowIndA,
-    const int *cscSortedColPtrA, cusparseHybMat_t hybA, int userEllWidth,
-    cusparseHybPartition_t partitionType) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, const cuComplex *,
-      const int *, const int *, cusparseHybMat_t, int, cusparseHybPartition_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsc2hyb");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, descrA, cscSortedValA, cscSortedRowIndA,
-                  cscSortedColPtrA, hybA, userEllWidth, partitionType);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsc2hyb(
-    cusparseHandle_t handle, int m, int n, const cusparseMatDescr_t descrA,
-    const cuDoubleComplex *cscSortedValA, const int *cscSortedRowIndA,
-    const int *cscSortedColPtrA, cusparseHybMat_t hybA, int userEllWidth,
-    cusparseHybPartition_t partitionType) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t,
-      const cuDoubleComplex *, const int *, const int *, cusparseHybMat_t, int,
-      cusparseHybPartition_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsc2hyb");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, descrA, cscSortedValA, cscSortedRowIndA,
-                  cscSortedColPtrA, hybA, userEllWidth, partitionType);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseShyb2csc(cusparseHandle_t handle,
-                                              const cusparseMatDescr_t descrA,
-                                              const cusparseHybMat_t hybA,
-                                              float *cscSortedVal,
-                                              int *cscSortedRowInd,
-                                              int *cscSortedColPtr) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, const cusparseMatDescr_t, const cusparseHybMat_t,
-      float *, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseShyb2csc");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, descrA, hybA, cscSortedVal, cscSortedRowInd,
-                  cscSortedColPtr);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDhyb2csc(cusparseHandle_t handle,
-                                              const cusparseMatDescr_t descrA,
-                                              const cusparseHybMat_t hybA,
-                                              double *cscSortedVal,
-                                              int *cscSortedRowInd,
-                                              int *cscSortedColPtr) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, const cusparseMatDescr_t, const cusparseHybMat_t,
-      double *, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDhyb2csc");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, descrA, hybA, cscSortedVal, cscSortedRowInd,
-                  cscSortedColPtr);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseChyb2csc(cusparseHandle_t handle,
-                                              const cusparseMatDescr_t descrA,
-                                              const cusparseHybMat_t hybA,
-                                              cuComplex *cscSortedVal,
-                                              int *cscSortedRowInd,
-                                              int *cscSortedColPtr) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, const cusparseMatDescr_t, const cusparseHybMat_t,
-      cuComplex *, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseChyb2csc");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, descrA, hybA, cscSortedVal, cscSortedRowInd,
-                  cscSortedColPtr);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZhyb2csc(cusparseHandle_t handle,
-                                              const cusparseMatDescr_t descrA,
-                                              const cusparseHybMat_t hybA,
-                                              cuDoubleComplex *cscSortedVal,
-                                              int *cscSortedRowInd,
-                                              int *cscSortedColPtr) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, const cusparseMatDescr_t, const cusparseHybMat_t,
-      cuDoubleComplex *, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZhyb2csc");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, descrA, hybA, cscSortedVal, cscSortedRowInd,
-                  cscSortedColPtr);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseXcsr2bsrNnz(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int m, int n,
-    const cusparseMatDescr_t descrA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, int blockDim, const cusparseMatDescr_t descrC,
-    int *bsrSortedRowPtrC, int *nnzTotalDevHostPtr) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const int *, const int *, int, const cusparseMatDescr_t, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseXcsr2bsrNnz");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, m, n, descrA, csrSortedRowPtrA,
-                  csrSortedColIndA, blockDim, descrC, bsrSortedRowPtrC,
-                  nnzTotalDevHostPtr);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsr2bsr(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int m, int n,
-    const cusparseMatDescr_t descrA, const float *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA, int blockDim,
-    const cusparseMatDescr_t descrC, float *bsrSortedValC,
-    int *bsrSortedRowPtrC, int *bsrSortedColIndC) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const float *, const int *, const int *, int, const cusparseMatDescr_t,
-      float *, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsr2bsr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, m, n, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, blockDim, descrC, bsrSortedValC,
-                  bsrSortedRowPtrC, bsrSortedColIndC);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsr2bsr(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int m, int n,
-    const cusparseMatDescr_t descrA, const double *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA, int blockDim,
-    const cusparseMatDescr_t descrC, double *bsrSortedValC,
-    int *bsrSortedRowPtrC, int *bsrSortedColIndC) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const double *, const int *, const int *, int, const cusparseMatDescr_t,
-      double *, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsr2bsr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, m, n, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, blockDim, descrC, bsrSortedValC,
-                  bsrSortedRowPtrC, bsrSortedColIndC);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsr2bsr(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int m, int n,
-    const cusparseMatDescr_t descrA, const cuComplex *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA, int blockDim,
-    const cusparseMatDescr_t descrC, cuComplex *bsrSortedValC,
-    int *bsrSortedRowPtrC, int *bsrSortedColIndC) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const cuComplex *, const int *, const int *, int,
-      const cusparseMatDescr_t, cuComplex *, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsr2bsr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, m, n, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, blockDim, descrC, bsrSortedValC,
-                  bsrSortedRowPtrC, bsrSortedColIndC);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsr2bsr(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int m, int n,
-    const cusparseMatDescr_t descrA, const cuDoubleComplex *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA, int blockDim,
-    const cusparseMatDescr_t descrC, cuDoubleComplex *bsrSortedValC,
-    int *bsrSortedRowPtrC, int *bsrSortedColIndC) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const cuDoubleComplex *, const int *, const int *, int,
-      const cusparseMatDescr_t, cuDoubleComplex *, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsr2bsr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, m, n, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, blockDim, descrC, bsrSortedValC,
-                  bsrSortedRowPtrC, bsrSortedColIndC);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSbsr2csr(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nb,
-    const cusparseMatDescr_t descrA, const float *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int blockDim,
-    const cusparseMatDescr_t descrC, float *csrSortedValC,
-    int *csrSortedRowPtrC, int *csrSortedColIndC) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const float *, const int *, const int *, int, const cusparseMatDescr_t,
-      float *, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSbsr2csr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nb, descrA, bsrSortedValA, bsrSortedRowPtrA,
-                  bsrSortedColIndA, blockDim, descrC, csrSortedValC,
-                  csrSortedRowPtrC, csrSortedColIndC);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDbsr2csr(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nb,
-    const cusparseMatDescr_t descrA, const double *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int blockDim,
-    const cusparseMatDescr_t descrC, double *csrSortedValC,
-    int *csrSortedRowPtrC, int *csrSortedColIndC) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const double *, const int *, const int *, int, const cusparseMatDescr_t,
-      double *, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDbsr2csr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nb, descrA, bsrSortedValA, bsrSortedRowPtrA,
-                  bsrSortedColIndA, blockDim, descrC, csrSortedValC,
-                  csrSortedRowPtrC, csrSortedColIndC);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCbsr2csr(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nb,
-    const cusparseMatDescr_t descrA, const cuComplex *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int blockDim,
-    const cusparseMatDescr_t descrC, cuComplex *csrSortedValC,
-    int *csrSortedRowPtrC, int *csrSortedColIndC) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const cuComplex *, const int *, const int *, int,
-      const cusparseMatDescr_t, cuComplex *, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCbsr2csr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nb, descrA, bsrSortedValA, bsrSortedRowPtrA,
-                  bsrSortedColIndA, blockDim, descrC, csrSortedValC,
-                  csrSortedRowPtrC, csrSortedColIndC);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZbsr2csr(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nb,
-    const cusparseMatDescr_t descrA, const cuDoubleComplex *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int blockDim,
-    const cusparseMatDescr_t descrC, cuDoubleComplex *csrSortedValC,
-    int *csrSortedRowPtrC, int *csrSortedColIndC) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const cuDoubleComplex *, const int *, const int *, int,
-      const cusparseMatDescr_t, cuDoubleComplex *, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZbsr2csr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nb, descrA, bsrSortedValA, bsrSortedRowPtrA,
-                  bsrSortedColIndA, blockDim, descrC, csrSortedValC,
-                  csrSortedRowPtrC, csrSortedColIndC);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSgebsr2gebsc_bufferSize(
-    cusparseHandle_t handle, int mb, int nb, int nnzb,
-    const float *bsrSortedVal, const int *bsrSortedRowPtr,
-    const int *bsrSortedColInd, int rowBlockDim, int colBlockDim,
-    int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const float *, const int *, const int *,
-      int, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSgebsr2gebsc_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, mb, nb, nnzb, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, rowBlockDim, colBlockDim,
-                  pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDgebsr2gebsc_bufferSize(
-    cusparseHandle_t handle, int mb, int nb, int nnzb,
-    const double *bsrSortedVal, const int *bsrSortedRowPtr,
-    const int *bsrSortedColInd, int rowBlockDim, int colBlockDim,
-    int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const double *, const int *, const int *,
-      int, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDgebsr2gebsc_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, mb, nb, nnzb, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, rowBlockDim, colBlockDim,
-                  pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCgebsr2gebsc_bufferSize(
-    cusparseHandle_t handle, int mb, int nb, int nnzb,
-    const cuComplex *bsrSortedVal, const int *bsrSortedRowPtr,
-    const int *bsrSortedColInd, int rowBlockDim, int colBlockDim,
-    int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const cuComplex *, const int *,
-      const int *, int, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCgebsr2gebsc_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, mb, nb, nnzb, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, rowBlockDim, colBlockDim,
-                  pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZgebsr2gebsc_bufferSize(
-    cusparseHandle_t handle, int mb, int nb, int nnzb,
-    const cuDoubleComplex *bsrSortedVal, const int *bsrSortedRowPtr,
-    const int *bsrSortedColInd, int rowBlockDim, int colBlockDim,
-    int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const cuDoubleComplex *, const int *,
-      const int *, int, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZgebsr2gebsc_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, mb, nb, nnzb, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, rowBlockDim, colBlockDim,
-                  pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSgebsr2gebsc_bufferSizeExt(
-    cusparseHandle_t handle, int mb, int nb, int nnzb,
-    const float *bsrSortedVal, const int *bsrSortedRowPtr,
-    const int *bsrSortedColInd, int rowBlockDim, int colBlockDim,
-    size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const float *, const int *, const int *,
-      int, int, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseSgebsr2gebsc_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, mb, nb, nnzb, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, rowBlockDim, colBlockDim, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDgebsr2gebsc_bufferSizeExt(
-    cusparseHandle_t handle, int mb, int nb, int nnzb,
-    const double *bsrSortedVal, const int *bsrSortedRowPtr,
-    const int *bsrSortedColInd, int rowBlockDim, int colBlockDim,
-    size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const double *, const int *, const int *,
-      int, int, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseDgebsr2gebsc_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, mb, nb, nnzb, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, rowBlockDim, colBlockDim, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCgebsr2gebsc_bufferSizeExt(
-    cusparseHandle_t handle, int mb, int nb, int nnzb,
-    const cuComplex *bsrSortedVal, const int *bsrSortedRowPtr,
-    const int *bsrSortedColInd, int rowBlockDim, int colBlockDim,
-    size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const cuComplex *, const int *,
-      const int *, int, int, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseCgebsr2gebsc_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, mb, nb, nnzb, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, rowBlockDim, colBlockDim, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZgebsr2gebsc_bufferSizeExt(
-    cusparseHandle_t handle, int mb, int nb, int nnzb,
-    const cuDoubleComplex *bsrSortedVal, const int *bsrSortedRowPtr,
-    const int *bsrSortedColInd, int rowBlockDim, int colBlockDim,
-    size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const cuDoubleComplex *, const int *,
-      const int *, int, int, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseZgebsr2gebsc_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, mb, nb, nnzb, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, rowBlockDim, colBlockDim, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSgebsr2gebsc(
-    cusparseHandle_t handle, int mb, int nb, int nnzb,
-    const float *bsrSortedVal, const int *bsrSortedRowPtr,
-    const int *bsrSortedColInd, int rowBlockDim, int colBlockDim, float *bscVal,
-    int *bscRowInd, int *bscColPtr, cusparseAction_t copyValues,
-    cusparseIndexBase_t idxBase, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const float *, const int *, const int *,
-      int, int, float *, int *, int *, cusparseAction_t, cusparseIndexBase_t,
-      void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSgebsr2gebsc");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, mb, nb, nnzb, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, rowBlockDim, colBlockDim, bscVal, bscRowInd,
-                  bscColPtr, copyValues, idxBase, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDgebsr2gebsc(
-    cusparseHandle_t handle, int mb, int nb, int nnzb,
-    const double *bsrSortedVal, const int *bsrSortedRowPtr,
-    const int *bsrSortedColInd, int rowBlockDim, int colBlockDim,
-    double *bscVal, int *bscRowInd, int *bscColPtr, cusparseAction_t copyValues,
-    cusparseIndexBase_t idxBase, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const double *, const int *, const int *,
-      int, int, double *, int *, int *, cusparseAction_t, cusparseIndexBase_t,
-      void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDgebsr2gebsc");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, mb, nb, nnzb, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, rowBlockDim, colBlockDim, bscVal, bscRowInd,
-                  bscColPtr, copyValues, idxBase, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCgebsr2gebsc(
-    cusparseHandle_t handle, int mb, int nb, int nnzb,
-    const cuComplex *bsrSortedVal, const int *bsrSortedRowPtr,
-    const int *bsrSortedColInd, int rowBlockDim, int colBlockDim,
-    cuComplex *bscVal, int *bscRowInd, int *bscColPtr,
-    cusparseAction_t copyValues, cusparseIndexBase_t idxBase, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const cuComplex *, const int *,
-      const int *, int, int, cuComplex *, int *, int *, cusparseAction_t,
-      cusparseIndexBase_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCgebsr2gebsc");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, mb, nb, nnzb, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, rowBlockDim, colBlockDim, bscVal, bscRowInd,
-                  bscColPtr, copyValues, idxBase, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZgebsr2gebsc(
-    cusparseHandle_t handle, int mb, int nb, int nnzb,
-    const cuDoubleComplex *bsrSortedVal, const int *bsrSortedRowPtr,
-    const int *bsrSortedColInd, int rowBlockDim, int colBlockDim,
-    cuDoubleComplex *bscVal, int *bscRowInd, int *bscColPtr,
-    cusparseAction_t copyValues, cusparseIndexBase_t idxBase, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const cuDoubleComplex *, const int *,
-      const int *, int, int, cuDoubleComplex *, int *, int *, cusparseAction_t,
-      cusparseIndexBase_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZgebsr2gebsc");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, mb, nb, nnzb, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, rowBlockDim, colBlockDim, bscVal, bscRowInd,
-                  bscColPtr, copyValues, idxBase, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseXgebsr2csr(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nb,
-    const cusparseMatDescr_t descrA, const int *bsrSortedRowPtrA,
-    const int *bsrSortedColIndA, int rowBlockDim, int colBlockDim,
-    const cusparseMatDescr_t descrC, int *csrSortedRowPtrC,
-    int *csrSortedColIndC) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const int *, const int *, int, int, const cusparseMatDescr_t, int *,
-      int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseXgebsr2csr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nb, descrA, bsrSortedRowPtrA,
-                  bsrSortedColIndA, rowBlockDim, colBlockDim, descrC,
-                  csrSortedRowPtrC, csrSortedColIndC);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSgebsr2csr(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nb,
-    const cusparseMatDescr_t descrA, const float *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int rowBlockDim,
-    int colBlockDim, const cusparseMatDescr_t descrC, float *csrSortedValC,
-    int *csrSortedRowPtrC, int *csrSortedColIndC) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const float *, const int *, const int *, int, int,
-      const cusparseMatDescr_t, float *, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSgebsr2csr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nb, descrA, bsrSortedValA, bsrSortedRowPtrA,
-                  bsrSortedColIndA, rowBlockDim, colBlockDim, descrC,
-                  csrSortedValC, csrSortedRowPtrC, csrSortedColIndC);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDgebsr2csr(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nb,
-    const cusparseMatDescr_t descrA, const double *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int rowBlockDim,
-    int colBlockDim, const cusparseMatDescr_t descrC, double *csrSortedValC,
-    int *csrSortedRowPtrC, int *csrSortedColIndC) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const double *, const int *, const int *, int, int,
-      const cusparseMatDescr_t, double *, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDgebsr2csr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nb, descrA, bsrSortedValA, bsrSortedRowPtrA,
-                  bsrSortedColIndA, rowBlockDim, colBlockDim, descrC,
-                  csrSortedValC, csrSortedRowPtrC, csrSortedColIndC);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCgebsr2csr(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nb,
-    const cusparseMatDescr_t descrA, const cuComplex *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int rowBlockDim,
-    int colBlockDim, const cusparseMatDescr_t descrC, cuComplex *csrSortedValC,
-    int *csrSortedRowPtrC, int *csrSortedColIndC) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const cuComplex *, const int *, const int *, int, int,
-      const cusparseMatDescr_t, cuComplex *, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCgebsr2csr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nb, descrA, bsrSortedValA, bsrSortedRowPtrA,
-                  bsrSortedColIndA, rowBlockDim, colBlockDim, descrC,
-                  csrSortedValC, csrSortedRowPtrC, csrSortedColIndC);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZgebsr2csr(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nb,
-    const cusparseMatDescr_t descrA, const cuDoubleComplex *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int rowBlockDim,
-    int colBlockDim, const cusparseMatDescr_t descrC,
-    cuDoubleComplex *csrSortedValC, int *csrSortedRowPtrC,
-    int *csrSortedColIndC) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const cuDoubleComplex *, const int *, const int *, int, int,
-      const cusparseMatDescr_t, cuDoubleComplex *, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZgebsr2csr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nb, descrA, bsrSortedValA, bsrSortedRowPtrA,
-                  bsrSortedColIndA, rowBlockDim, colBlockDim, descrC,
-                  csrSortedValC, csrSortedRowPtrC, csrSortedColIndC);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsr2gebsr_bufferSize(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int m, int n,
-    const cusparseMatDescr_t descrA, const float *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA, int rowBlockDim,
-    int colBlockDim, int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const float *, const int *, const int *, int, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsr2gebsr_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, m, n, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, rowBlockDim, colBlockDim,
-                  pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsr2gebsr_bufferSize(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int m, int n,
-    const cusparseMatDescr_t descrA, const double *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA, int rowBlockDim,
-    int colBlockDim, int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const double *, const int *, const int *, int, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsr2gebsr_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, m, n, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, rowBlockDim, colBlockDim,
-                  pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsr2gebsr_bufferSize(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int m, int n,
-    const cusparseMatDescr_t descrA, const cuComplex *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA, int rowBlockDim,
-    int colBlockDim, int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const cuComplex *, const int *, const int *, int, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsr2gebsr_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, m, n, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, rowBlockDim, colBlockDim,
-                  pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsr2gebsr_bufferSize(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int m, int n,
-    const cusparseMatDescr_t descrA, const cuDoubleComplex *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA, int rowBlockDim,
-    int colBlockDim, int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const cuDoubleComplex *, const int *, const int *, int, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsr2gebsr_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, m, n, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, rowBlockDim, colBlockDim,
-                  pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsr2gebsr_bufferSizeExt(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int m, int n,
-    const cusparseMatDescr_t descrA, const float *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA, int rowBlockDim,
-    int colBlockDim, size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const float *, const int *, const int *, int, int, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseScsr2gebsr_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, m, n, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, rowBlockDim, colBlockDim, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsr2gebsr_bufferSizeExt(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int m, int n,
-    const cusparseMatDescr_t descrA, const double *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA, int rowBlockDim,
-    int colBlockDim, size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const double *, const int *, const int *, int, int, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseDcsr2gebsr_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, m, n, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, rowBlockDim, colBlockDim, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsr2gebsr_bufferSizeExt(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int m, int n,
-    const cusparseMatDescr_t descrA, const cuComplex *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA, int rowBlockDim,
-    int colBlockDim, size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const cuComplex *, const int *, const int *, int, int, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseCcsr2gebsr_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, m, n, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, rowBlockDim, colBlockDim, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsr2gebsr_bufferSizeExt(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int m, int n,
-    const cusparseMatDescr_t descrA, const cuDoubleComplex *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA, int rowBlockDim,
-    int colBlockDim, size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const cuDoubleComplex *, const int *, const int *, int, int, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseZcsr2gebsr_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, m, n, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, rowBlockDim, colBlockDim, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseXcsr2gebsrNnz(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int m, int n,
-    const cusparseMatDescr_t descrA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, const cusparseMatDescr_t descrC,
-    int *bsrSortedRowPtrC, int rowBlockDim, int colBlockDim,
-    int *nnzTotalDevHostPtr, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const int *, const int *, const cusparseMatDescr_t, int *, int, int,
-      int *, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseXcsr2gebsrNnz");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, m, n, descrA, csrSortedRowPtrA,
-                  csrSortedColIndA, descrC, bsrSortedRowPtrC, rowBlockDim,
-                  colBlockDim, nnzTotalDevHostPtr, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsr2gebsr(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int m, int n,
-    const cusparseMatDescr_t descrA, const float *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-    const cusparseMatDescr_t descrC, float *bsrSortedValC,
-    int *bsrSortedRowPtrC, int *bsrSortedColIndC, int rowBlockDim,
-    int colBlockDim, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const float *, const int *, const int *, const cusparseMatDescr_t,
-      float *, int *, int *, int, int, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsr2gebsr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, m, n, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, descrC, bsrSortedValC, bsrSortedRowPtrC,
-                  bsrSortedColIndC, rowBlockDim, colBlockDim, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsr2gebsr(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int m, int n,
-    const cusparseMatDescr_t descrA, const double *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-    const cusparseMatDescr_t descrC, double *bsrSortedValC,
-    int *bsrSortedRowPtrC, int *bsrSortedColIndC, int rowBlockDim,
-    int colBlockDim, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const double *, const int *, const int *, const cusparseMatDescr_t,
-      double *, int *, int *, int, int, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsr2gebsr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, m, n, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, descrC, bsrSortedValC, bsrSortedRowPtrC,
-                  bsrSortedColIndC, rowBlockDim, colBlockDim, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsr2gebsr(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int m, int n,
-    const cusparseMatDescr_t descrA, const cuComplex *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-    const cusparseMatDescr_t descrC, cuComplex *bsrSortedValC,
-    int *bsrSortedRowPtrC, int *bsrSortedColIndC, int rowBlockDim,
-    int colBlockDim, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const cuComplex *, const int *, const int *, const cusparseMatDescr_t,
-      cuComplex *, int *, int *, int, int, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsr2gebsr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, m, n, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, descrC, bsrSortedValC, bsrSortedRowPtrC,
-                  bsrSortedColIndC, rowBlockDim, colBlockDim, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsr2gebsr(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int m, int n,
-    const cusparseMatDescr_t descrA, const cuDoubleComplex *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-    const cusparseMatDescr_t descrC, cuDoubleComplex *bsrSortedValC,
-    int *bsrSortedRowPtrC, int *bsrSortedColIndC, int rowBlockDim,
-    int colBlockDim, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const cuDoubleComplex *, const int *, const int *,
-      const cusparseMatDescr_t, cuDoubleComplex *, int *, int *, int, int,
-      void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsr2gebsr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, m, n, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, descrC, bsrSortedValC, bsrSortedRowPtrC,
-                  bsrSortedColIndC, rowBlockDim, colBlockDim, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSgebsr2gebsr_bufferSize(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nb, int nnzb,
-    const cusparseMatDescr_t descrA, const float *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int rowBlockDimA,
-    int colBlockDimA, int rowBlockDimC, int colBlockDimC,
-    int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, int,
-      const cusparseMatDescr_t, const float *, const int *, const int *, int,
-      int, int, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSgebsr2gebsr_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nb, nnzb, descrA, bsrSortedValA,
-                  bsrSortedRowPtrA, bsrSortedColIndA, rowBlockDimA,
-                  colBlockDimA, rowBlockDimC, colBlockDimC, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDgebsr2gebsr_bufferSize(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nb, int nnzb,
-    const cusparseMatDescr_t descrA, const double *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int rowBlockDimA,
-    int colBlockDimA, int rowBlockDimC, int colBlockDimC,
-    int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, int,
-      const cusparseMatDescr_t, const double *, const int *, const int *, int,
-      int, int, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDgebsr2gebsr_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nb, nnzb, descrA, bsrSortedValA,
-                  bsrSortedRowPtrA, bsrSortedColIndA, rowBlockDimA,
-                  colBlockDimA, rowBlockDimC, colBlockDimC, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCgebsr2gebsr_bufferSize(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nb, int nnzb,
-    const cusparseMatDescr_t descrA, const cuComplex *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int rowBlockDimA,
-    int colBlockDimA, int rowBlockDimC, int colBlockDimC,
-    int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, int,
-      const cusparseMatDescr_t, const cuComplex *, const int *, const int *,
-      int, int, int, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCgebsr2gebsr_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nb, nnzb, descrA, bsrSortedValA,
-                  bsrSortedRowPtrA, bsrSortedColIndA, rowBlockDimA,
-                  colBlockDimA, rowBlockDimC, colBlockDimC, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZgebsr2gebsr_bufferSize(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nb, int nnzb,
-    const cusparseMatDescr_t descrA, const cuDoubleComplex *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int rowBlockDimA,
-    int colBlockDimA, int rowBlockDimC, int colBlockDimC,
-    int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, int,
-      const cusparseMatDescr_t, const cuDoubleComplex *, const int *,
-      const int *, int, int, int, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZgebsr2gebsr_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nb, nnzb, descrA, bsrSortedValA,
-                  bsrSortedRowPtrA, bsrSortedColIndA, rowBlockDimA,
-                  colBlockDimA, rowBlockDimC, colBlockDimC, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSgebsr2gebsr_bufferSizeExt(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nb, int nnzb,
-    const cusparseMatDescr_t descrA, const float *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int rowBlockDimA,
-    int colBlockDimA, int rowBlockDimC, int colBlockDimC, size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, int,
-      const cusparseMatDescr_t, const float *, const int *, const int *, int,
-      int, int, int, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseSgebsr2gebsr_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nb, nnzb, descrA, bsrSortedValA,
-                  bsrSortedRowPtrA, bsrSortedColIndA, rowBlockDimA,
-                  colBlockDimA, rowBlockDimC, colBlockDimC, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDgebsr2gebsr_bufferSizeExt(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nb, int nnzb,
-    const cusparseMatDescr_t descrA, const double *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int rowBlockDimA,
-    int colBlockDimA, int rowBlockDimC, int colBlockDimC, size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, int,
-      const cusparseMatDescr_t, const double *, const int *, const int *, int,
-      int, int, int, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseDgebsr2gebsr_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nb, nnzb, descrA, bsrSortedValA,
-                  bsrSortedRowPtrA, bsrSortedColIndA, rowBlockDimA,
-                  colBlockDimA, rowBlockDimC, colBlockDimC, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCgebsr2gebsr_bufferSizeExt(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nb, int nnzb,
-    const cusparseMatDescr_t descrA, const cuComplex *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int rowBlockDimA,
-    int colBlockDimA, int rowBlockDimC, int colBlockDimC, size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, int,
-      const cusparseMatDescr_t, const cuComplex *, const int *, const int *,
-      int, int, int, int, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseCgebsr2gebsr_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nb, nnzb, descrA, bsrSortedValA,
-                  bsrSortedRowPtrA, bsrSortedColIndA, rowBlockDimA,
-                  colBlockDimA, rowBlockDimC, colBlockDimC, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZgebsr2gebsr_bufferSizeExt(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nb, int nnzb,
-    const cusparseMatDescr_t descrA, const cuDoubleComplex *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int rowBlockDimA,
-    int colBlockDimA, int rowBlockDimC, int colBlockDimC, size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, int,
-      const cusparseMatDescr_t, const cuDoubleComplex *, const int *,
-      const int *, int, int, int, int, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseZgebsr2gebsr_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nb, nnzb, descrA, bsrSortedValA,
-                  bsrSortedRowPtrA, bsrSortedColIndA, rowBlockDimA,
-                  colBlockDimA, rowBlockDimC, colBlockDimC, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseXgebsr2gebsrNnz(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nb, int nnzb,
-    const cusparseMatDescr_t descrA, const int *bsrSortedRowPtrA,
-    const int *bsrSortedColIndA, int rowBlockDimA, int colBlockDimA,
-    const cusparseMatDescr_t descrC, int *bsrSortedRowPtrC, int rowBlockDimC,
-    int colBlockDimC, int *nnzTotalDevHostPtr, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, int,
-      const cusparseMatDescr_t, const int *, const int *, int, int,
-      const cusparseMatDescr_t, int *, int, int, int *, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseXgebsr2gebsrNnz");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nb, nnzb, descrA, bsrSortedRowPtrA,
-                  bsrSortedColIndA, rowBlockDimA, colBlockDimA, descrC,
-                  bsrSortedRowPtrC, rowBlockDimC, colBlockDimC,
-                  nnzTotalDevHostPtr, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSgebsr2gebsr(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nb, int nnzb,
-    const cusparseMatDescr_t descrA, const float *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int rowBlockDimA,
-    int colBlockDimA, const cusparseMatDescr_t descrC, float *bsrSortedValC,
-    int *bsrSortedRowPtrC, int *bsrSortedColIndC, int rowBlockDimC,
-    int colBlockDimC, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, int,
-      const cusparseMatDescr_t, const float *, const int *, const int *, int,
-      int, const cusparseMatDescr_t, float *, int *, int *, int, int, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSgebsr2gebsr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nb, nnzb, descrA, bsrSortedValA,
-                  bsrSortedRowPtrA, bsrSortedColIndA, rowBlockDimA,
-                  colBlockDimA, descrC, bsrSortedValC, bsrSortedRowPtrC,
-                  bsrSortedColIndC, rowBlockDimC, colBlockDimC, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDgebsr2gebsr(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nb, int nnzb,
-    const cusparseMatDescr_t descrA, const double *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int rowBlockDimA,
-    int colBlockDimA, const cusparseMatDescr_t descrC, double *bsrSortedValC,
-    int *bsrSortedRowPtrC, int *bsrSortedColIndC, int rowBlockDimC,
-    int colBlockDimC, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, int,
-      const cusparseMatDescr_t, const double *, const int *, const int *, int,
-      int, const cusparseMatDescr_t, double *, int *, int *, int, int, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDgebsr2gebsr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nb, nnzb, descrA, bsrSortedValA,
-                  bsrSortedRowPtrA, bsrSortedColIndA, rowBlockDimA,
-                  colBlockDimA, descrC, bsrSortedValC, bsrSortedRowPtrC,
-                  bsrSortedColIndC, rowBlockDimC, colBlockDimC, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCgebsr2gebsr(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nb, int nnzb,
-    const cusparseMatDescr_t descrA, const cuComplex *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int rowBlockDimA,
-    int colBlockDimA, const cusparseMatDescr_t descrC, cuComplex *bsrSortedValC,
-    int *bsrSortedRowPtrC, int *bsrSortedColIndC, int rowBlockDimC,
-    int colBlockDimC, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, int,
-      const cusparseMatDescr_t, const cuComplex *, const int *, const int *,
-      int, int, const cusparseMatDescr_t, cuComplex *, int *, int *, int, int,
-      void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCgebsr2gebsr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nb, nnzb, descrA, bsrSortedValA,
-                  bsrSortedRowPtrA, bsrSortedColIndA, rowBlockDimA,
-                  colBlockDimA, descrC, bsrSortedValC, bsrSortedRowPtrC,
-                  bsrSortedColIndC, rowBlockDimC, colBlockDimC, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZgebsr2gebsr(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nb, int nnzb,
-    const cusparseMatDescr_t descrA, const cuDoubleComplex *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int rowBlockDimA,
-    int colBlockDimA, const cusparseMatDescr_t descrC,
-    cuDoubleComplex *bsrSortedValC, int *bsrSortedRowPtrC,
-    int *bsrSortedColIndC, int rowBlockDimC, int colBlockDimC, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, int,
-      const cusparseMatDescr_t, const cuDoubleComplex *, const int *,
-      const int *, int, int, const cusparseMatDescr_t, cuDoubleComplex *, int *,
-      int *, int, int, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZgebsr2gebsr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nb, nnzb, descrA, bsrSortedValA,
-                  bsrSortedRowPtrA, bsrSortedColIndA, rowBlockDimA,
-                  colBlockDimA, descrC, bsrSortedValC, bsrSortedRowPtrC,
-                  bsrSortedColIndC, rowBlockDimC, colBlockDimC, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseCreateIdentityPermutation(cusparseHandle_t handle, int n, int *p) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(cusparseHandle_t, int, int *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseCreateIdentityPermutation");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, p);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseXcoosort_bufferSizeExt(
-    cusparseHandle_t handle, int m, int n, int nnz, const int *cooRowsA,
-    const int *cooColsA, size_t *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const int *, const int *, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseXcoosort_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnz, cooRowsA, cooColsA, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseXcoosortByRow(cusparseHandle_t handle,
-                                                   int m, int n, int nnz,
-                                                   int *cooRowsA, int *cooColsA,
-                                                   int *P, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, int *, int *, int *, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseXcoosortByRow");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnz, cooRowsA, cooColsA, P, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseXcoosortByColumn(cusparseHandle_t handle,
-                                                      int m, int n, int nnz,
-                                                      int *cooRowsA,
-                                                      int *cooColsA, int *P,
-                                                      void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, int *, int *, int *, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseXcoosortByColumn");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnz, cooRowsA, cooColsA, P, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseXcsrsort_bufferSizeExt(
-    cusparseHandle_t handle, int m, int n, int nnz, const int *csrRowPtrA,
-    const int *csrColIndA, size_t *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const int *, const int *, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseXcsrsort_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnz, csrRowPtrA, csrColIndA,
-                  pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseXcsrsort(cusparseHandle_t handle, int m,
-                                              int n, int nnz,
-                                              const cusparseMatDescr_t descrA,
-                                              const int *csrRowPtrA,
-                                              int *csrColIndA, int *P,
-                                              void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const cusparseMatDescr_t, const int *,
-      int *, int *, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseXcsrsort");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnz, descrA, csrRowPtrA, csrColIndA, P,
-                  pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseXcscsort_bufferSizeExt(
-    cusparseHandle_t handle, int m, int n, int nnz, const int *cscColPtrA,
-    const int *cscRowIndA, size_t *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const int *, const int *, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseXcscsort_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnz, cscColPtrA, cscRowIndA,
-                  pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseXcscsort(cusparseHandle_t handle, int m,
-                                              int n, int nnz,
-                                              const cusparseMatDescr_t descrA,
-                                              const int *cscColPtrA,
-                                              int *cscRowIndA, int *P,
-                                              void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const cusparseMatDescr_t, const int *,
-      int *, int *, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseXcscsort");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnz, descrA, cscColPtrA, cscRowIndA, P,
-                  pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsru2csr_bufferSizeExt(
-    cusparseHandle_t handle, int m, int n, int nnz, float *csrVal,
-    const int *csrRowPtr, int *csrColInd, csru2csrInfo_t info,
-    size_t *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, float *, const int *, int *,
-      csru2csrInfo_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsru2csr_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnz, csrVal, csrRowPtr, csrColInd, info,
-                  pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsru2csr_bufferSizeExt(
-    cusparseHandle_t handle, int m, int n, int nnz, double *csrVal,
-    const int *csrRowPtr, int *csrColInd, csru2csrInfo_t info,
-    size_t *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, double *, const int *, int *,
-      csru2csrInfo_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsru2csr_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnz, csrVal, csrRowPtr, csrColInd, info,
-                  pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsru2csr_bufferSizeExt(
-    cusparseHandle_t handle, int m, int n, int nnz, cuComplex *csrVal,
-    const int *csrRowPtr, int *csrColInd, csru2csrInfo_t info,
-    size_t *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, cuComplex *, const int *, int *,
-      csru2csrInfo_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsru2csr_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnz, csrVal, csrRowPtr, csrColInd, info,
-                  pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsru2csr_bufferSizeExt(
-    cusparseHandle_t handle, int m, int n, int nnz, cuDoubleComplex *csrVal,
-    const int *csrRowPtr, int *csrColInd, csru2csrInfo_t info,
-    size_t *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, cuDoubleComplex *, const int *, int *,
-      csru2csrInfo_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsru2csr_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnz, csrVal, csrRowPtr, csrColInd, info,
-                  pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsru2csr(
-    cusparseHandle_t handle, int m, int n, int nnz,
-    const cusparseMatDescr_t descrA, float *csrVal, const int *csrRowPtr,
-    int *csrColInd, csru2csrInfo_t info, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const cusparseMatDescr_t, float *,
-      const int *, int *, csru2csrInfo_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsru2csr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnz, descrA, csrVal, csrRowPtr, csrColInd, info,
-                  pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsru2csr(
-    cusparseHandle_t handle, int m, int n, int nnz,
-    const cusparseMatDescr_t descrA, double *csrVal, const int *csrRowPtr,
-    int *csrColInd, csru2csrInfo_t info, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const cusparseMatDescr_t, double *,
-      const int *, int *, csru2csrInfo_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsru2csr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnz, descrA, csrVal, csrRowPtr, csrColInd, info,
-                  pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsru2csr(
-    cusparseHandle_t handle, int m, int n, int nnz,
-    const cusparseMatDescr_t descrA, cuComplex *csrVal, const int *csrRowPtr,
-    int *csrColInd, csru2csrInfo_t info, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const cusparseMatDescr_t, cuComplex *,
-      const int *, int *, csru2csrInfo_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsru2csr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnz, descrA, csrVal, csrRowPtr, csrColInd, info,
-                  pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsru2csr(
-    cusparseHandle_t handle, int m, int n, int nnz,
-    const cusparseMatDescr_t descrA, cuDoubleComplex *csrVal,
-    const int *csrRowPtr, int *csrColInd, csru2csrInfo_t info, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const cusparseMatDescr_t,
-      cuDoubleComplex *, const int *, int *, csru2csrInfo_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsru2csr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnz, descrA, csrVal, csrRowPtr, csrColInd, info,
-                  pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsr2csru(
-    cusparseHandle_t handle, int m, int n, int nnz,
-    const cusparseMatDescr_t descrA, float *csrVal, const int *csrRowPtr,
-    int *csrColInd, csru2csrInfo_t info, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const cusparseMatDescr_t, float *,
-      const int *, int *, csru2csrInfo_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsr2csru");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnz, descrA, csrVal, csrRowPtr, csrColInd, info,
-                  pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsr2csru(
-    cusparseHandle_t handle, int m, int n, int nnz,
-    const cusparseMatDescr_t descrA, double *csrVal, const int *csrRowPtr,
-    int *csrColInd, csru2csrInfo_t info, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const cusparseMatDescr_t, double *,
-      const int *, int *, csru2csrInfo_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsr2csru");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnz, descrA, csrVal, csrRowPtr, csrColInd, info,
-                  pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsr2csru(
-    cusparseHandle_t handle, int m, int n, int nnz,
-    const cusparseMatDescr_t descrA, cuComplex *csrVal, const int *csrRowPtr,
-    int *csrColInd, csru2csrInfo_t info, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const cusparseMatDescr_t, cuComplex *,
-      const int *, int *, csru2csrInfo_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsr2csru");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnz, descrA, csrVal, csrRowPtr, csrColInd, info,
-                  pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsr2csru(
-    cusparseHandle_t handle, int m, int n, int nnz,
-    const cusparseMatDescr_t descrA, cuDoubleComplex *csrVal,
-    const int *csrRowPtr, int *csrColInd, csru2csrInfo_t info, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const cusparseMatDescr_t,
-      cuDoubleComplex *, const int *, int *, csru2csrInfo_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsr2csru");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnz, descrA, csrVal, csrRowPtr, csrColInd, info,
-                  pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSpruneDense2csr_bufferSizeExt(
-    cusparseHandle_t handle, int m, int n, const float *A, int lda,
-    const float *threshold, const cusparseMatDescr_t descrC,
-    const float *csrSortedValC, const int *csrSortedRowPtrC,
-    const int *csrSortedColIndC, size_t *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const float *, int, const float *,
-      const cusparseMatDescr_t, const float *, const int *, const int *,
-      size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseSpruneDense2csr_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, A, lda, threshold, descrC, csrSortedValC,
-                  csrSortedRowPtrC, csrSortedColIndC, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDpruneDense2csr_bufferSizeExt(
-    cusparseHandle_t handle, int m, int n, const double *A, int lda,
-    const double *threshold, const cusparseMatDescr_t descrC,
-    const double *csrSortedValC, const int *csrSortedRowPtrC,
-    const int *csrSortedColIndC, size_t *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const double *, int, const double *,
-      const cusparseMatDescr_t, const double *, const int *, const int *,
-      size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseDpruneDense2csr_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, A, lda, threshold, descrC, csrSortedValC,
-                  csrSortedRowPtrC, csrSortedColIndC, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSpruneDense2csrNnz(
-    cusparseHandle_t handle, int m, int n, const float *A, int lda,
-    const float *threshold, const cusparseMatDescr_t descrC, int *csrRowPtrC,
-    int *nnzTotalDevHostPtr, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const float *, int, const float *,
-      const cusparseMatDescr_t, int *, int *, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSpruneDense2csrNnz");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, A, lda, threshold, descrC, csrRowPtrC,
-                  nnzTotalDevHostPtr, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDpruneDense2csrNnz(
-    cusparseHandle_t handle, int m, int n, const double *A, int lda,
-    const double *threshold, const cusparseMatDescr_t descrC,
-    int *csrSortedRowPtrC, int *nnzTotalDevHostPtr, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const double *, int, const double *,
-      const cusparseMatDescr_t, int *, int *, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDpruneDense2csrNnz");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, A, lda, threshold, descrC, csrSortedRowPtrC,
-                  nnzTotalDevHostPtr, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSpruneDense2csr(
-    cusparseHandle_t handle, int m, int n, const float *A, int lda,
-    const float *threshold, const cusparseMatDescr_t descrC,
-    float *csrSortedValC, const int *csrSortedRowPtrC, int *csrSortedColIndC,
-    void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const float *, int, const float *,
-      const cusparseMatDescr_t, float *, const int *, int *, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSpruneDense2csr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, A, lda, threshold, descrC, csrSortedValC,
-                  csrSortedRowPtrC, csrSortedColIndC, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDpruneDense2csr(
-    cusparseHandle_t handle, int m, int n, const double *A, int lda,
-    const double *threshold, const cusparseMatDescr_t descrC,
-    double *csrSortedValC, const int *csrSortedRowPtrC, int *csrSortedColIndC,
-    void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const double *, int, const double *,
-      const cusparseMatDescr_t, double *, const int *, int *, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDpruneDense2csr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, A, lda, threshold, descrC, csrSortedValC,
-                  csrSortedRowPtrC, csrSortedColIndC, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSpruneCsr2csr_bufferSizeExt(
-    cusparseHandle_t handle, int m, int n, int nnzA,
-    const cusparseMatDescr_t descrA, const float *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-    const float *threshold, const cusparseMatDescr_t descrC,
-    const float *csrSortedValC, const int *csrSortedRowPtrC,
-    const int *csrSortedColIndC, size_t *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const cusparseMatDescr_t, const float *,
-      const int *, const int *, const float *, const cusparseMatDescr_t,
-      const float *, const int *, const int *, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseSpruneCsr2csr_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnzA, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, threshold, descrC, csrSortedValC,
-                  csrSortedRowPtrC, csrSortedColIndC, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDpruneCsr2csr_bufferSizeExt(
-    cusparseHandle_t handle, int m, int n, int nnzA,
-    const cusparseMatDescr_t descrA, const double *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-    const double *threshold, const cusparseMatDescr_t descrC,
-    const double *csrSortedValC, const int *csrSortedRowPtrC,
-    const int *csrSortedColIndC, size_t *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const cusparseMatDescr_t, const double *,
-      const int *, const int *, const double *, const cusparseMatDescr_t,
-      const double *, const int *, const int *, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseDpruneCsr2csr_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnzA, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, threshold, descrC, csrSortedValC,
-                  csrSortedRowPtrC, csrSortedColIndC, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSpruneCsr2csrNnz(
-    cusparseHandle_t handle, int m, int n, int nnzA,
-    const cusparseMatDescr_t descrA, const float *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-    const float *threshold, const cusparseMatDescr_t descrC,
-    int *csrSortedRowPtrC, int *nnzTotalDevHostPtr, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const cusparseMatDescr_t, const float *,
-      const int *, const int *, const float *, const cusparseMatDescr_t, int *,
-      int *, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSpruneCsr2csrNnz");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnzA, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, threshold, descrC, csrSortedRowPtrC,
-                  nnzTotalDevHostPtr, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDpruneCsr2csrNnz(
-    cusparseHandle_t handle, int m, int n, int nnzA,
-    const cusparseMatDescr_t descrA, const double *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-    const double *threshold, const cusparseMatDescr_t descrC,
-    int *csrSortedRowPtrC, int *nnzTotalDevHostPtr, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const cusparseMatDescr_t, const double *,
-      const int *, const int *, const double *, const cusparseMatDescr_t, int *,
-      int *, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDpruneCsr2csrNnz");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnzA, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, threshold, descrC, csrSortedRowPtrC,
-                  nnzTotalDevHostPtr, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSpruneCsr2csr(
-    cusparseHandle_t handle, int m, int n, int nnzA,
-    const cusparseMatDescr_t descrA, const float *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-    const float *threshold, const cusparseMatDescr_t descrC,
-    float *csrSortedValC, const int *csrSortedRowPtrC, int *csrSortedColIndC,
-    void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const cusparseMatDescr_t, const float *,
-      const int *, const int *, const float *, const cusparseMatDescr_t,
-      float *, const int *, int *, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSpruneCsr2csr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnzA, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, threshold, descrC, csrSortedValC,
-                  csrSortedRowPtrC, csrSortedColIndC, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDpruneCsr2csr(
-    cusparseHandle_t handle, int m, int n, int nnzA,
-    const cusparseMatDescr_t descrA, const double *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-    const double *threshold, const cusparseMatDescr_t descrC,
-    double *csrSortedValC, const int *csrSortedRowPtrC, int *csrSortedColIndC,
-    void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const cusparseMatDescr_t, const double *,
-      const int *, const int *, const double *, const cusparseMatDescr_t,
-      double *, const int *, int *, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDpruneCsr2csr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnzA, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, threshold, descrC, csrSortedValC,
-                  csrSortedRowPtrC, csrSortedColIndC, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSpruneDense2csrByPercentage_bufferSizeExt(
-    cusparseHandle_t handle, int m, int n, const float *A, int lda,
-    float percentage, const cusparseMatDescr_t descrC,
-    const float *csrSortedValC, const int *csrSortedRowPtrC,
-    const int *csrSortedColIndC, pruneInfo_t info, size_t *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const float *, int, float,
-      const cusparseMatDescr_t, const float *, const int *, const int *,
-      pruneInfo_t, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseSpruneDense2csrByPercentage_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, A, lda, percentage, descrC, csrSortedValC,
-                  csrSortedRowPtrC, csrSortedColIndC, info, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDpruneDense2csrByPercentage_bufferSizeExt(
-    cusparseHandle_t handle, int m, int n, const double *A, int lda,
-    float percentage, const cusparseMatDescr_t descrC,
-    const double *csrSortedValC, const int *csrSortedRowPtrC,
-    const int *csrSortedColIndC, pruneInfo_t info, size_t *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const double *, int, float,
-      const cusparseMatDescr_t, const double *, const int *, const int *,
-      pruneInfo_t, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseDpruneDense2csrByPercentage_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, A, lda, percentage, descrC, csrSortedValC,
-                  csrSortedRowPtrC, csrSortedColIndC, info, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSpruneDense2csrNnzByPercentage(
-    cusparseHandle_t handle, int m, int n, const float *A, int lda,
-    float percentage, const cusparseMatDescr_t descrC, int *csrRowPtrC,
-    int *nnzTotalDevHostPtr, pruneInfo_t info, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const float *, int, float,
-      const cusparseMatDescr_t, int *, int *, pruneInfo_t, void *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseSpruneDense2csrNnzByPercentage");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, A, lda, percentage, descrC, csrRowPtrC,
-                  nnzTotalDevHostPtr, info, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDpruneDense2csrNnzByPercentage(
-    cusparseHandle_t handle, int m, int n, const double *A, int lda,
-    float percentage, const cusparseMatDescr_t descrC, int *csrRowPtrC,
-    int *nnzTotalDevHostPtr, pruneInfo_t info, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const double *, int, float,
-      const cusparseMatDescr_t, int *, int *, pruneInfo_t, void *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseDpruneDense2csrNnzByPercentage");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, A, lda, percentage, descrC, csrRowPtrC,
-                  nnzTotalDevHostPtr, info, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSpruneDense2csrByPercentage(
-    cusparseHandle_t handle, int m, int n, const float *A, int lda,
-    float percentage, const cusparseMatDescr_t descrC, float *csrSortedValC,
-    const int *csrSortedRowPtrC, int *csrSortedColIndC, pruneInfo_t info,
-    void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const float *, int, float,
-      const cusparseMatDescr_t, float *, const int *, int *, pruneInfo_t,
-      void *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseSpruneDense2csrByPercentage");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, A, lda, percentage, descrC, csrSortedValC,
-                  csrSortedRowPtrC, csrSortedColIndC, info, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDpruneDense2csrByPercentage(
-    cusparseHandle_t handle, int m, int n, const double *A, int lda,
-    float percentage, const cusparseMatDescr_t descrC, double *csrSortedValC,
-    const int *csrSortedRowPtrC, int *csrSortedColIndC, pruneInfo_t info,
-    void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const double *, int, float,
-      const cusparseMatDescr_t, double *, const int *, int *, pruneInfo_t,
-      void *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseDpruneDense2csrByPercentage");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, A, lda, percentage, descrC, csrSortedValC,
-                  csrSortedRowPtrC, csrSortedColIndC, info, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSpruneCsr2csrByPercentage_bufferSizeExt(
-    cusparseHandle_t handle, int m, int n, int nnzA,
-    const cusparseMatDescr_t descrA, const float *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA, float percentage,
-    const cusparseMatDescr_t descrC, const float *csrSortedValC,
-    const int *csrSortedRowPtrC, const int *csrSortedColIndC, pruneInfo_t info,
-    size_t *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const cusparseMatDescr_t, const float *,
-      const int *, const int *, float, const cusparseMatDescr_t, const float *,
-      const int *, const int *, pruneInfo_t, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseSpruneCsr2csrByPercentage_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnzA, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, percentage, descrC, csrSortedValC,
-                  csrSortedRowPtrC, csrSortedColIndC, info, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDpruneCsr2csrByPercentage_bufferSizeExt(
-    cusparseHandle_t handle, int m, int n, int nnzA,
-    const cusparseMatDescr_t descrA, const double *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA, float percentage,
-    const cusparseMatDescr_t descrC, const double *csrSortedValC,
-    const int *csrSortedRowPtrC, const int *csrSortedColIndC, pruneInfo_t info,
-    size_t *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const cusparseMatDescr_t, const double *,
-      const int *, const int *, float, const cusparseMatDescr_t, const double *,
-      const int *, const int *, pruneInfo_t, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseDpruneCsr2csrByPercentage_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnzA, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, percentage, descrC, csrSortedValC,
-                  csrSortedRowPtrC, csrSortedColIndC, info, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSpruneCsr2csrNnzByPercentage(
-    cusparseHandle_t handle, int m, int n, int nnzA,
-    const cusparseMatDescr_t descrA, const float *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA, float percentage,
-    const cusparseMatDescr_t descrC, int *csrSortedRowPtrC,
-    int *nnzTotalDevHostPtr, pruneInfo_t info, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const cusparseMatDescr_t, const float *,
-      const int *, const int *, float, const cusparseMatDescr_t, int *, int *,
-      pruneInfo_t, void *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseSpruneCsr2csrNnzByPercentage");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnzA, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, percentage, descrC, csrSortedRowPtrC,
-                  nnzTotalDevHostPtr, info, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDpruneCsr2csrNnzByPercentage(
-    cusparseHandle_t handle, int m, int n, int nnzA,
-    const cusparseMatDescr_t descrA, const double *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA, float percentage,
-    const cusparseMatDescr_t descrC, int *csrSortedRowPtrC,
-    int *nnzTotalDevHostPtr, pruneInfo_t info, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const cusparseMatDescr_t, const double *,
-      const int *, const int *, float, const cusparseMatDescr_t, int *, int *,
-      pruneInfo_t, void *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseDpruneCsr2csrNnzByPercentage");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnzA, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, percentage, descrC, csrSortedRowPtrC,
-                  nnzTotalDevHostPtr, info, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSpruneCsr2csrByPercentage(
-    cusparseHandle_t handle, int m, int n, int nnzA,
-    const cusparseMatDescr_t descrA, const float *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA, float percentage,
-    const cusparseMatDescr_t descrC, float *csrSortedValC,
-    const int *csrSortedRowPtrC, int *csrSortedColIndC, pruneInfo_t info,
-    void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const cusparseMatDescr_t, const float *,
-      const int *, const int *, float, const cusparseMatDescr_t, float *,
-      const int *, int *, pruneInfo_t, void *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseSpruneCsr2csrByPercentage");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnzA, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, percentage, descrC, csrSortedValC,
-                  csrSortedRowPtrC, csrSortedColIndC, info, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDpruneCsr2csrByPercentage(
-    cusparseHandle_t handle, int m, int n, int nnzA,
-    const cusparseMatDescr_t descrA, const double *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA, float percentage,
-    const cusparseMatDescr_t descrC, double *csrSortedValC,
-    const int *csrSortedRowPtrC, int *csrSortedColIndC, pruneInfo_t info,
-    void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const cusparseMatDescr_t, const double *,
-      const int *, const int *, float, const cusparseMatDescr_t, double *,
-      const int *, int *, pruneInfo_t, void *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseDpruneCsr2csrByPercentage");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnzA, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, percentage, descrC, csrSortedValC,
-                  csrSortedRowPtrC, csrSortedColIndC, info, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCsr2cscEx2(
-    cusparseHandle_t handle, int m, int n, int nnz, const void *csrVal,
-    const int *csrRowPtr, const int *csrColInd, void *cscVal, int *cscColPtr,
-    int *cscRowInd, cudaDataType valType, cusparseAction_t copyValues,
-    cusparseIndexBase_t idxBase, cusparseCsr2CscAlg_t alg, void *buffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const void *, const int *, const int *,
-      void *, int *, int *, cudaDataType, cusparseAction_t, cusparseIndexBase_t,
-      cusparseCsr2CscAlg_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCsr2cscEx2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnz, csrVal, csrRowPtr, csrColInd, cscVal,
-                  cscColPtr, cscRowInd, valType, copyValues, idxBase, alg,
-                  buffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCsr2cscEx2_bufferSize(
-    cusparseHandle_t handle, int m, int n, int nnz, const void *csrVal,
-    const int *csrRowPtr, const int *csrColInd, void *cscVal, int *cscColPtr,
-    int *cscRowInd, cudaDataType valType, cusparseAction_t copyValues,
-    cusparseIndexBase_t idxBase, cusparseCsr2CscAlg_t alg, size_t *bufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const void *, const int *, const int *,
-      void *, int *, int *, cudaDataType, cusparseAction_t, cusparseIndexBase_t,
-      cusparseCsr2CscAlg_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCsr2cscEx2_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnz, csrVal, csrRowPtr, csrColInd, cscVal,
-                  cscColPtr, cscRowInd, valType, copyValues, idxBase, alg,
-                  bufferSize);
-}
-
-#if !defined(_WIN32)
-
-cusparseStatus_t CUSPARSEAPI
-cusparseCreateSpVec(cusparseSpVecDescr_t *spVecDescr, int64_t size, int64_t nnz,
-                    void *indices, void *values, cusparseIndexType_t idxType,
-                    cusparseIndexBase_t idxBase, cudaDataType valueType) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseSpVecDescr_t *, int64_t, int64_t, void *, void *,
-      cusparseIndexType_t, cusparseIndexBase_t, cudaDataType);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCreateSpVec");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(spVecDescr, size, nnz, indices, values, idxType, idxBase,
-                  valueType);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseDestroySpVec(cusparseSpVecDescr_t spVecDescr) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(cusparseSpVecDescr_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDestroySpVec");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(spVecDescr);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSpVecGet(
-    const cusparseSpVecDescr_t spVecDescr, int64_t *size, int64_t *nnz,
-    void **indices, void **values, cusparseIndexType_t *idxType,
-    cusparseIndexBase_t *idxBase, cudaDataType *valueType) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      const cusparseSpVecDescr_t, int64_t *, int64_t *, void **, void **,
-      cusparseIndexType_t *, cusparseIndexBase_t *, cudaDataType *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSpVecGet");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(spVecDescr, size, nnz, indices, values, idxType, idxBase,
-                  valueType);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSpVecGetIndexBase(
-    const cusparseSpVecDescr_t spVecDescr, cusparseIndexBase_t *idxBase) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(const cusparseSpVecDescr_t,
-                                                  cusparseIndexBase_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSpVecGetIndexBase");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(spVecDescr, idxBase);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseSpVecGetValues(const cusparseSpVecDescr_t spVecDescr, void **values) {
-  using FuncPtr =
-      cusparseStatus_t(CUSPARSEAPI *)(const cusparseSpVecDescr_t, void **);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSpVecGetValues");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(spVecDescr, values);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseSpVecSetValues(cusparseSpVecDescr_t spVecDescr, void *values) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(cusparseSpVecDescr_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSpVecSetValues");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(spVecDescr, values);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseCreateDnVec(cusparseDnVecDescr_t *dnVecDescr, int64_t size,
-                    void *values, cudaDataType valueType) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseDnVecDescr_t *, int64_t, void *, cudaDataType);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCreateDnVec");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dnVecDescr, size, values, valueType);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseDestroyDnVec(cusparseDnVecDescr_t dnVecDescr) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(cusparseDnVecDescr_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDestroyDnVec");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dnVecDescr);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseDnVecGet(const cusparseDnVecDescr_t dnVecDescr, int64_t *size,
-                 void **values, cudaDataType *valueType) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      const cusparseDnVecDescr_t, int64_t *, void **, cudaDataType *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDnVecGet");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dnVecDescr, size, values, valueType);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseDnVecGetValues(const cusparseDnVecDescr_t dnVecDescr, void **values) {
-  using FuncPtr =
-      cusparseStatus_t(CUSPARSEAPI *)(const cusparseDnVecDescr_t, void **);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDnVecGetValues");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dnVecDescr, values);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseDnVecSetValues(cusparseDnVecDescr_t dnVecDescr, void *values) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(cusparseDnVecDescr_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDnVecSetValues");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dnVecDescr, values);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCreateCoo(cusparseSpMatDescr_t *spMatDescr,
-                                               int64_t rows, int64_t cols,
-                                               int64_t nnz, void *cooRowInd,
-                                               void *cooColInd, void *cooValues,
-                                               cusparseIndexType_t cooIdxType,
-                                               cusparseIndexBase_t idxBase,
-                                               cudaDataType valueType) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseSpMatDescr_t *, int64_t, int64_t, int64_t, void *, void *, void *,
-      cusparseIndexType_t, cusparseIndexBase_t, cudaDataType);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCreateCoo");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(spMatDescr, rows, cols, nnz, cooRowInd, cooColInd, cooValues,
-                  cooIdxType, idxBase, valueType);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCreateCsr(
-    cusparseSpMatDescr_t *spMatDescr, int64_t rows, int64_t cols, int64_t nnz,
-    void *csrRowOffsets, void *csrColInd, void *csrValues,
-    cusparseIndexType_t csrRowOffsetsType, cusparseIndexType_t csrColIndType,
-    cusparseIndexBase_t idxBase, cudaDataType valueType) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseSpMatDescr_t *, int64_t, int64_t, int64_t, void *, void *, void *,
-      cusparseIndexType_t, cusparseIndexType_t, cusparseIndexBase_t,
-      cudaDataType);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCreateCsr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(spMatDescr, rows, cols, nnz, csrRowOffsets, csrColInd,
-                  csrValues, csrRowOffsetsType, csrColIndType, idxBase,
-                  valueType);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCreateCooAoS(
-    cusparseSpMatDescr_t *spMatDescr, int64_t rows, int64_t cols, int64_t nnz,
-    void *cooInd, void *cooValues, cusparseIndexType_t cooIdxType,
-    cusparseIndexBase_t idxBase, cudaDataType valueType) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseSpMatDescr_t *, int64_t, int64_t, int64_t, void *, void *,
-      cusparseIndexType_t, cusparseIndexBase_t, cudaDataType);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCreateCooAoS");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(spMatDescr, rows, cols, nnz, cooInd, cooValues, cooIdxType,
-                  idxBase, valueType);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseDestroySpMat(cusparseSpMatDescr_t spMatDescr) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(cusparseSpMatDescr_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDestroySpMat");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(spMatDescr);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseCooGet(const cusparseSpMatDescr_t spMatDescr, int64_t *rows,
-               int64_t *cols, int64_t *nnz,
-               void **cooRowInd,  // COO row indices
-               void **cooColInd,  // COO column indices
-               void **cooValues,  // COO values
-               cusparseIndexType_t *idxType, cusparseIndexBase_t *idxBase,
-               cudaDataType *valueType) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      const cusparseSpMatDescr_t, int64_t *, int64_t *, int64_t *, void **,
-      void **, void **, cusparseIndexType_t *, cusparseIndexBase_t *,
-      cudaDataType *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCooGet");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(spMatDescr, rows, cols, nnz, cooRowInd, cooColInd, cooValues,
-                  idxType, idxBase, valueType);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseCooAoSGet(const cusparseSpMatDescr_t spMatDescr, int64_t *rows,
-                  int64_t *cols, int64_t *nnz,
-                  void **cooInd,     // COO indices
-                  void **cooValues,  // COO values
-                  cusparseIndexType_t *idxType, cusparseIndexBase_t *idxBase,
-                  cudaDataType *valueType) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      const cusparseSpMatDescr_t, int64_t *, int64_t *, int64_t *, void **,
-      void **, cusparseIndexType_t *, cusparseIndexBase_t *, cudaDataType *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCooAoSGet");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(spMatDescr, rows, cols, nnz, cooInd, cooValues, idxType,
-                  idxBase, valueType);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCsrGet(
-    const cusparseSpMatDescr_t spMatDescr, int64_t *rows, int64_t *cols,
-    int64_t *nnz, void **csrRowOffsets, void **csrColInd, void **csrValues,
-    cusparseIndexType_t *csrRowOffsetsType, cusparseIndexType_t *csrColIndType,
-    cusparseIndexBase_t *idxBase, cudaDataType *valueType) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      const cusparseSpMatDescr_t, int64_t *, int64_t *, int64_t *, void **,
-      void **, void **, cusparseIndexType_t *, cusparseIndexType_t *,
-      cusparseIndexBase_t *, cudaDataType *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCsrGet");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(spMatDescr, rows, cols, nnz, csrRowOffsets, csrColInd,
-                  csrValues, csrRowOffsetsType, csrColIndType, idxBase,
-                  valueType);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSpMatGetFormat(
-    const cusparseSpMatDescr_t spMatDescr, cusparseFormat_t *format) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(const cusparseSpMatDescr_t,
-                                                  cusparseFormat_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSpMatGetFormat");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(spMatDescr, format);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSpMatGetIndexBase(
-    const cusparseSpMatDescr_t spMatDescr, cusparseIndexBase_t *idxBase) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(const cusparseSpMatDescr_t,
-                                                  cusparseIndexBase_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSpMatGetIndexBase");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(spMatDescr, idxBase);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseSpMatGetValues(const cusparseSpMatDescr_t spMatDescr, void **values) {
-  using FuncPtr =
-      cusparseStatus_t(CUSPARSEAPI *)(const cusparseSpMatDescr_t, void **);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSpMatGetValues");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(spMatDescr, values);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseSpMatSetValues(cusparseSpMatDescr_t spMatDescr, void *values) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(cusparseSpMatDescr_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSpMatSetValues");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(spMatDescr, values);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseSpMatSetStridedBatch(cusparseSpMatDescr_t spMatDescr, int batchCount) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(cusparseSpMatDescr_t, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSpMatSetStridedBatch");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(spMatDescr, batchCount);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSpMatGetStridedBatch(
-    const cusparseSpMatDescr_t spMatDescr, int *batchCount) {
-  using FuncPtr =
-      cusparseStatus_t(CUSPARSEAPI *)(const cusparseSpMatDescr_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSpMatGetStridedBatch");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(spMatDescr, batchCount);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCreateDnMat(
-    cusparseDnMatDescr_t *dnMatDescr, int64_t rows, int64_t cols, int64_t ld,
-    void *values, cudaDataType valueType, cusparseOrder_t order) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseDnMatDescr_t *, int64_t, int64_t, int64_t, void *, cudaDataType,
-      cusparseOrder_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCreateDnMat");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dnMatDescr, rows, cols, ld, values, valueType, order);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseDestroyDnMat(cusparseDnMatDescr_t dnMatDescr) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(cusparseDnMatDescr_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDestroyDnMat");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dnMatDescr);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDnMatGet(
-    const cusparseDnMatDescr_t dnMatDescr, int64_t *rows, int64_t *cols,
-    int64_t *ld, void **values, cudaDataType *type, cusparseOrder_t *order) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      const cusparseDnMatDescr_t, int64_t *, int64_t *, int64_t *, void **,
-      cudaDataType *, cusparseOrder_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDnMatGet");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dnMatDescr, rows, cols, ld, values, type, order);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseDnMatGetValues(const cusparseDnMatDescr_t dnMatDescr, void **values) {
-  using FuncPtr =
-      cusparseStatus_t(CUSPARSEAPI *)(const cusparseDnMatDescr_t, void **);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDnMatGetValues");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dnMatDescr, values);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseDnMatSetValues(cusparseDnMatDescr_t dnMatDescr, void *values) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(cusparseDnMatDescr_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDnMatSetValues");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dnMatDescr, values);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDnMatSetStridedBatch(
-    cusparseDnMatDescr_t dnMatDescr, int batchCount, int64_t batchStride) {
-  using FuncPtr =
-      cusparseStatus_t(CUSPARSEAPI *)(cusparseDnMatDescr_t, int, int64_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDnMatSetStridedBatch");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dnMatDescr, batchCount, batchStride);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseDnMatGetStridedBatch(const cusparseDnMatDescr_t dnMatDescr,
-                             int *batchCount, int64_t *batchStride) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(const cusparseDnMatDescr_t,
-                                                  int *, int64_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDnMatGetStridedBatch");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dnMatDescr, batchCount, batchStride);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseSpVV(cusparseHandle_t handle, cusparseOperation_t opX,
-             const cusparseSpVecDescr_t vecX, const cusparseDnVecDescr_t vecY,
-             void *result, cudaDataType computeType, void *externalBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, const cusparseSpVecDescr_t,
-      const cusparseDnVecDescr_t, void *, cudaDataType, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSpVV");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, opX, vecX, vecY, result, computeType, externalBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSpVV_bufferSize(
-    cusparseHandle_t handle, cusparseOperation_t opX,
-    const cusparseSpVecDescr_t vecX, const cusparseDnVecDescr_t vecY,
-    const void *result, cudaDataType computeType, size_t *bufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, const cusparseSpVecDescr_t,
-      const cusparseDnVecDescr_t, const void *, cudaDataType, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSpVV_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, opX, vecX, vecY, result, computeType, bufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSpMV(
-    cusparseHandle_t handle, cusparseOperation_t opA, const void *alpha,
-    const cusparseSpMatDescr_t matA, const cusparseDnVecDescr_t vecX,
-    const void *beta, const cusparseDnVecDescr_t vecY, cudaDataType computeType,
-    cusparseSpMVAlg_t alg, void *externalBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, const void *,
-      const cusparseSpMatDescr_t, const cusparseDnVecDescr_t, const void *,
-      const cusparseDnVecDescr_t, cudaDataType, cusparseSpMVAlg_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSpMV");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, opA, alpha, matA, vecX, beta, vecY, computeType, alg,
-                  externalBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSpMV_bufferSize(
-    cusparseHandle_t handle, cusparseOperation_t opA, const void *alpha,
-    const cusparseSpMatDescr_t matA, const cusparseDnVecDescr_t vecX,
-    const void *beta, const cusparseDnVecDescr_t vecY, cudaDataType computeType,
-    cusparseSpMVAlg_t alg, size_t *bufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, const void *,
-      const cusparseSpMatDescr_t, const cusparseDnVecDescr_t, const void *,
-      const cusparseDnVecDescr_t, cudaDataType, cusparseSpMVAlg_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSpMV_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, opA, alpha, matA, vecX, beta, vecY, computeType, alg,
-                  bufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSpMM(
-    cusparseHandle_t handle, cusparseOperation_t opA, cusparseOperation_t opB,
-    const void *alpha, const cusparseSpMatDescr_t matA,
-    const cusparseDnMatDescr_t matB, const void *beta,
-    cusparseDnMatDescr_t matC, cudaDataType computeType, cusparseSpMMAlg_t alg,
-    void *externalBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, cusparseOperation_t, const void *,
-      const cusparseSpMatDescr_t, const cusparseDnMatDescr_t, const void *,
-      cusparseDnMatDescr_t, cudaDataType, cusparseSpMMAlg_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSpMM");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, opA, opB, alpha, matA, matB, beta, matC, computeType,
-                  alg, externalBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSpMM_bufferSize(
-    cusparseHandle_t handle, cusparseOperation_t opA, cusparseOperation_t opB,
-    const void *alpha, const cusparseSpMatDescr_t matA,
-    const cusparseDnMatDescr_t matB, const void *beta,
-    cusparseDnMatDescr_t matC, cudaDataType computeType, cusparseSpMMAlg_t alg,
-    size_t *bufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, cusparseOperation_t, const void *,
-      const cusparseSpMatDescr_t, const cusparseDnMatDescr_t, const void *,
-      cusparseDnMatDescr_t, cudaDataType, cusparseSpMMAlg_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSpMM_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, opA, opB, alpha, matA, matB, beta, matC, computeType,
-                  alg, bufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseConstrainedGeMM(
-    cusparseHandle_t handle, cusparseOperation_t opA, cusparseOperation_t opB,
-    const void *alpha, const cusparseDnMatDescr_t matA,
-    const cusparseDnMatDescr_t matB, const void *beta,
-    cusparseSpMatDescr_t matC, cudaDataType computeType, void *externalBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, cusparseOperation_t, const void *,
-      const cusparseDnMatDescr_t, const cusparseDnMatDescr_t, const void *,
-      cusparseSpMatDescr_t, cudaDataType, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseConstrainedGeMM");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, opA, opB, alpha, matA, matB, beta, matC, computeType,
-                  externalBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseConstrainedGeMM_bufferSize(
-    cusparseHandle_t handle, cusparseOperation_t opA, cusparseOperation_t opB,
-    const void *alpha, const cusparseDnMatDescr_t matA,
-    const cusparseDnMatDescr_t matB, const void *beta,
-    cusparseSpMatDescr_t matC, cudaDataType computeType, size_t *bufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, cusparseOperation_t, const void *,
-      const cusparseDnMatDescr_t, const cusparseDnMatDescr_t, const void *,
-      cusparseSpMatDescr_t, cudaDataType, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseConstrainedGeMM_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, opA, opB, alpha, matA, matB, beta, matC, computeType,
-                  bufferSize);
-}
-
-#endif  // _WIN32
-
-}  // extern "C"
diff --git a/third_party/xla/third_party/tsl/tsl/cuda/cusparse_11_0.inc b/third_party/xla/third_party/tsl/tsl/cuda/cusparse_11_0.inc
deleted file mode 100644
index f762efb1932edb..00000000000000
--- a/third_party/xla/third_party/tsl/tsl/cuda/cusparse_11_0.inc
+++ /dev/null
@@ -1,7025 +0,0 @@
-// Auto-generated, do not edit.
-
-#define CUSPARSE_DEPRECATED(new_func)
-
-extern "C" {
-
-cusparseStatus_t CUSPARSEAPI cusparseCreate(cusparseHandle_t *handle) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(cusparseHandle_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCreate");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDestroy(cusparseHandle_t handle) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(cusparseHandle_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDestroy");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseGetVersion(cusparseHandle_t handle,
-                                                int *version) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(cusparseHandle_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseGetVersion");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, version);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseGetProperty(libraryPropertyType type,
-                                                 int *value) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(libraryPropertyType, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseGetProperty");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(type, value);
-}
-
-const char *CUSPARSEAPI cusparseGetErrorName(cusparseStatus_t status) {
-  using FuncPtr = const char *(CUSPARSEAPI *)(cusparseStatus_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseGetErrorName");
-  if (!func_ptr) return "cusparseGetErrorName symbol not found.";
-  return func_ptr(status);
-}
-
-const char *CUSPARSEAPI cusparseGetErrorString(cusparseStatus_t status) {
-  using FuncPtr = const char *(CUSPARSEAPI *)(cusparseStatus_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseGetErrorString");
-  if (!func_ptr) return "cusparseGetErrorString symbol not found.";
-  return func_ptr(status);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSetStream(cusparseHandle_t handle,
-                                               cudaStream_t streamId) {
-  using FuncPtr =
-      cusparseStatus_t(CUSPARSEAPI *)(cusparseHandle_t, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSetStream");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, streamId);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseGetStream(cusparseHandle_t handle,
-                                               cudaStream_t *streamId) {
-  using FuncPtr =
-      cusparseStatus_t(CUSPARSEAPI *)(cusparseHandle_t, cudaStream_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseGetStream");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, streamId);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseGetPointerMode(cusparseHandle_t handle, cusparsePointerMode_t *mode) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(cusparseHandle_t,
-                                                  cusparsePointerMode_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseGetPointerMode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, mode);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseSetPointerMode(cusparseHandle_t handle, cusparsePointerMode_t mode) {
-  using FuncPtr =
-      cusparseStatus_t(CUSPARSEAPI *)(cusparseHandle_t, cusparsePointerMode_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSetPointerMode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, mode);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseCreateMatDescr(cusparseMatDescr_t *descrA) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(cusparseMatDescr_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCreateMatDescr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(descrA);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseDestroyMatDescr(cusparseMatDescr_t descrA) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(cusparseMatDescr_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDestroyMatDescr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(descrA);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseCopyMatDescr(cusparseMatDescr_t dest, const cusparseMatDescr_t src) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(cusparseMatDescr_t,
-                                                  const cusparseMatDescr_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCopyMatDescr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dest, src);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSetMatType(cusparseMatDescr_t descrA,
-                                                cusparseMatrixType_t type) {
-  using FuncPtr =
-      cusparseStatus_t(CUSPARSEAPI *)(cusparseMatDescr_t, cusparseMatrixType_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSetMatType");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(descrA, type);
-}
-
-cusparseMatrixType_t CUSPARSEAPI
-cusparseGetMatType(const cusparseMatDescr_t descrA) {
-  using FuncPtr = cusparseMatrixType_t(CUSPARSEAPI *)(const cusparseMatDescr_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseGetMatType");
-  if (!func_ptr) return cusparseMatrixType_t(-1);
-  return func_ptr(descrA);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseSetMatFillMode(cusparseMatDescr_t descrA, cusparseFillMode_t fillMode) {
-  using FuncPtr =
-      cusparseStatus_t(CUSPARSEAPI *)(cusparseMatDescr_t, cusparseFillMode_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSetMatFillMode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(descrA, fillMode);
-}
-
-cusparseFillMode_t CUSPARSEAPI
-cusparseGetMatFillMode(const cusparseMatDescr_t descrA) {
-  using FuncPtr = cusparseFillMode_t(CUSPARSEAPI *)(const cusparseMatDescr_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseGetMatFillMode");
-  if (!func_ptr) return cusparseFillMode_t(-1);
-  return func_ptr(descrA);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseSetMatDiagType(cusparseMatDescr_t descrA, cusparseDiagType_t diagType) {
-  using FuncPtr =
-      cusparseStatus_t(CUSPARSEAPI *)(cusparseMatDescr_t, cusparseDiagType_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSetMatDiagType");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(descrA, diagType);
-}
-
-cusparseDiagType_t CUSPARSEAPI
-cusparseGetMatDiagType(const cusparseMatDescr_t descrA) {
-  using FuncPtr = cusparseDiagType_t(CUSPARSEAPI *)(const cusparseMatDescr_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseGetMatDiagType");
-  if (!func_ptr) return cusparseDiagType_t(-1);
-  return func_ptr(descrA);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSetMatIndexBase(cusparseMatDescr_t descrA,
-                                                     cusparseIndexBase_t base) {
-  using FuncPtr =
-      cusparseStatus_t(CUSPARSEAPI *)(cusparseMatDescr_t, cusparseIndexBase_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSetMatIndexBase");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(descrA, base);
-}
-
-cusparseIndexBase_t CUSPARSEAPI
-cusparseGetMatIndexBase(const cusparseMatDescr_t descrA) {
-  using FuncPtr = cusparseIndexBase_t(CUSPARSEAPI *)(const cusparseMatDescr_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseGetMatIndexBase");
-  if (!func_ptr) return cusparseIndexBase_t(-1);
-  return func_ptr(descrA);
-}
-
-CUSPARSE_DEPRECATED(cusparseSpSV)
-cusparseStatus_t CUSPARSEAPI cusparseCreateCsrsv2Info(csrsv2Info_t *info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(csrsv2Info_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCreateCsrsv2Info");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(info);
-}
-
-CUSPARSE_DEPRECATED(cusparseSpSV)
-cusparseStatus_t CUSPARSEAPI cusparseDestroyCsrsv2Info(csrsv2Info_t info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(csrsv2Info_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDestroyCsrsv2Info");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(info);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCreateCsric02Info(csric02Info_t *info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(csric02Info_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCreateCsric02Info");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(info);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDestroyCsric02Info(csric02Info_t info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(csric02Info_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDestroyCsric02Info");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(info);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCreateBsric02Info(bsric02Info_t *info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(bsric02Info_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCreateBsric02Info");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(info);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDestroyBsric02Info(bsric02Info_t info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(bsric02Info_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDestroyBsric02Info");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(info);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCreateCsrilu02Info(csrilu02Info_t *info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(csrilu02Info_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCreateCsrilu02Info");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(info);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDestroyCsrilu02Info(csrilu02Info_t info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(csrilu02Info_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDestroyCsrilu02Info");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(info);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCreateBsrilu02Info(bsrilu02Info_t *info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(bsrilu02Info_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCreateBsrilu02Info");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(info);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDestroyBsrilu02Info(bsrilu02Info_t info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(bsrilu02Info_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDestroyBsrilu02Info");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(info);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCreateBsrsv2Info(bsrsv2Info_t *info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(bsrsv2Info_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCreateBsrsv2Info");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(info);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDestroyBsrsv2Info(bsrsv2Info_t info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(bsrsv2Info_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDestroyBsrsv2Info");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(info);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCreateBsrsm2Info(bsrsm2Info_t *info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(bsrsm2Info_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCreateBsrsm2Info");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(info);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDestroyBsrsm2Info(bsrsm2Info_t info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(bsrsm2Info_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDestroyBsrsm2Info");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(info);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCreateCsru2csrInfo(csru2csrInfo_t *info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(csru2csrInfo_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCreateCsru2csrInfo");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(info);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDestroyCsru2csrInfo(csru2csrInfo_t info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(csru2csrInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDestroyCsru2csrInfo");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(info);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseCreateColorInfo(cusparseColorInfo_t *info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(cusparseColorInfo_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCreateColorInfo");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(info);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseDestroyColorInfo(cusparseColorInfo_t info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(cusparseColorInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDestroyColorInfo");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(info);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSetColorAlgs(cusparseColorInfo_t info,
-                                                  cusparseColorAlg_t alg) {
-  using FuncPtr =
-      cusparseStatus_t(CUSPARSEAPI *)(cusparseColorInfo_t, cusparseColorAlg_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSetColorAlgs");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(info, alg);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseGetColorAlgs(cusparseColorInfo_t info,
-                                                  cusparseColorAlg_t *alg) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(cusparseColorInfo_t,
-                                                  cusparseColorAlg_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseGetColorAlgs");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(info, alg);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCreatePruneInfo(pruneInfo_t *info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(pruneInfo_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCreatePruneInfo");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(info);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDestroyPruneInfo(pruneInfo_t info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(pruneInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDestroyPruneInfo");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(info);
-}
-
-CUSPARSE_DEPRECATED(cusparseAxpby)
-cusparseStatus_t CUSPARSEAPI cusparseSaxpyi(cusparseHandle_t handle, int nnz,
-                                            const float *alpha,
-                                            const float *xVal, const int *xInd,
-                                            float *y,
-                                            cusparseIndexBase_t idxBase) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, const float *, const float *, const int *, float *,
-      cusparseIndexBase_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSaxpyi");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, nnz, alpha, xVal, xInd, y, idxBase);
-}
-
-CUSPARSE_DEPRECATED(cusparseAxpby)
-cusparseStatus_t CUSPARSEAPI cusparseDaxpyi(cusparseHandle_t handle, int nnz,
-                                            const double *alpha,
-                                            const double *xVal, const int *xInd,
-                                            double *y,
-                                            cusparseIndexBase_t idxBase) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, const double *, const double *, const int *,
-      double *, cusparseIndexBase_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDaxpyi");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, nnz, alpha, xVal, xInd, y, idxBase);
-}
-
-CUSPARSE_DEPRECATED(cusparseAxpby)
-cusparseStatus_t CUSPARSEAPI cusparseCaxpyi(cusparseHandle_t handle, int nnz,
-                                            const cuComplex *alpha,
-                                            const cuComplex *xVal,
-                                            const int *xInd, cuComplex *y,
-                                            cusparseIndexBase_t idxBase) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, const cuComplex *, const cuComplex *, const int *,
-      cuComplex *, cusparseIndexBase_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCaxpyi");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, nnz, alpha, xVal, xInd, y, idxBase);
-}
-
-CUSPARSE_DEPRECATED(cusparseAxpby)
-cusparseStatus_t CUSPARSEAPI cusparseZaxpyi(cusparseHandle_t handle, int nnz,
-                                            const cuDoubleComplex *alpha,
-                                            const cuDoubleComplex *xVal,
-                                            const int *xInd, cuDoubleComplex *y,
-                                            cusparseIndexBase_t idxBase) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, const cuDoubleComplex *, const cuDoubleComplex *,
-      const int *, cuDoubleComplex *, cusparseIndexBase_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZaxpyi");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, nnz, alpha, xVal, xInd, y, idxBase);
-}
-
-CUSPARSE_DEPRECATED(cusparseGather)
-cusparseStatus_t CUSPARSEAPI cusparseSgthr(cusparseHandle_t handle, int nnz,
-                                           const float *y, float *xVal,
-                                           const int *xInd,
-                                           cusparseIndexBase_t idxBase) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, const float *, float *, const int *,
-      cusparseIndexBase_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSgthr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, nnz, y, xVal, xInd, idxBase);
-}
-
-CUSPARSE_DEPRECATED(cusparseGather)
-cusparseStatus_t CUSPARSEAPI cusparseDgthr(cusparseHandle_t handle, int nnz,
-                                           const double *y, double *xVal,
-                                           const int *xInd,
-                                           cusparseIndexBase_t idxBase) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, const double *, double *, const int *,
-      cusparseIndexBase_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDgthr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, nnz, y, xVal, xInd, idxBase);
-}
-
-CUSPARSE_DEPRECATED(cusparseGather)
-cusparseStatus_t CUSPARSEAPI cusparseCgthr(cusparseHandle_t handle, int nnz,
-                                           const cuComplex *y, cuComplex *xVal,
-                                           const int *xInd,
-                                           cusparseIndexBase_t idxBase) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, const cuComplex *, cuComplex *, const int *,
-      cusparseIndexBase_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCgthr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, nnz, y, xVal, xInd, idxBase);
-}
-
-CUSPARSE_DEPRECATED(cusparseGather)
-cusparseStatus_t CUSPARSEAPI cusparseZgthr(cusparseHandle_t handle, int nnz,
-                                           const cuDoubleComplex *y,
-                                           cuDoubleComplex *xVal,
-                                           const int *xInd,
-                                           cusparseIndexBase_t idxBase) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, const cuDoubleComplex *, cuDoubleComplex *,
-      const int *, cusparseIndexBase_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZgthr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, nnz, y, xVal, xInd, idxBase);
-}
-
-CUSPARSE_DEPRECATED(cusparseGather)
-cusparseStatus_t CUSPARSEAPI cusparseSgthrz(cusparseHandle_t handle, int nnz,
-                                            float *y, float *xVal,
-                                            const int *xInd,
-                                            cusparseIndexBase_t idxBase) {
-  using FuncPtr =
-      cusparseStatus_t(CUSPARSEAPI *)(cusparseHandle_t, int, float *, float *,
-                                      const int *, cusparseIndexBase_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSgthrz");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, nnz, y, xVal, xInd, idxBase);
-}
-
-CUSPARSE_DEPRECATED(cusparseGather)
-cusparseStatus_t CUSPARSEAPI cusparseDgthrz(cusparseHandle_t handle, int nnz,
-                                            double *y, double *xVal,
-                                            const int *xInd,
-                                            cusparseIndexBase_t idxBase) {
-  using FuncPtr =
-      cusparseStatus_t(CUSPARSEAPI *)(cusparseHandle_t, int, double *, double *,
-                                      const int *, cusparseIndexBase_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDgthrz");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, nnz, y, xVal, xInd, idxBase);
-}
-
-CUSPARSE_DEPRECATED(cusparseGather)
-cusparseStatus_t CUSPARSEAPI cusparseCgthrz(cusparseHandle_t handle, int nnz,
-                                            cuComplex *y, cuComplex *xVal,
-                                            const int *xInd,
-                                            cusparseIndexBase_t idxBase) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, cuComplex *, cuComplex *, const int *,
-      cusparseIndexBase_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCgthrz");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, nnz, y, xVal, xInd, idxBase);
-}
-
-CUSPARSE_DEPRECATED(cusparseGather)
-cusparseStatus_t CUSPARSEAPI cusparseZgthrz(cusparseHandle_t handle, int nnz,
-                                            cuDoubleComplex *y,
-                                            cuDoubleComplex *xVal,
-                                            const int *xInd,
-                                            cusparseIndexBase_t idxBase) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, cuDoubleComplex *, cuDoubleComplex *, const int *,
-      cusparseIndexBase_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZgthrz");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, nnz, y, xVal, xInd, idxBase);
-}
-
-CUSPARSE_DEPRECATED(cusparseScatter)
-cusparseStatus_t CUSPARSEAPI cusparseSsctr(cusparseHandle_t handle, int nnz,
-                                           const float *xVal, const int *xInd,
-                                           float *y,
-                                           cusparseIndexBase_t idxBase) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(cusparseHandle_t, int,
-                                                  const float *, const int *,
-                                                  float *, cusparseIndexBase_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSsctr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, nnz, xVal, xInd, y, idxBase);
-}
-
-CUSPARSE_DEPRECATED(cusparseScatter)
-cusparseStatus_t CUSPARSEAPI cusparseDsctr(cusparseHandle_t handle, int nnz,
-                                           const double *xVal, const int *xInd,
-                                           double *y,
-                                           cusparseIndexBase_t idxBase) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, const double *, const int *, double *,
-      cusparseIndexBase_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDsctr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, nnz, xVal, xInd, y, idxBase);
-}
-
-CUSPARSE_DEPRECATED(cusparseScatter)
-cusparseStatus_t CUSPARSEAPI cusparseCsctr(cusparseHandle_t handle, int nnz,
-                                           const cuComplex *xVal,
-                                           const int *xInd, cuComplex *y,
-                                           cusparseIndexBase_t idxBase) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, const cuComplex *, const int *, cuComplex *,
-      cusparseIndexBase_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCsctr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, nnz, xVal, xInd, y, idxBase);
-}
-
-CUSPARSE_DEPRECATED(cusparseScatter)
-cusparseStatus_t CUSPARSEAPI cusparseZsctr(cusparseHandle_t handle, int nnz,
-                                           const cuDoubleComplex *xVal,
-                                           const int *xInd, cuDoubleComplex *y,
-                                           cusparseIndexBase_t idxBase) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, const cuDoubleComplex *, const int *,
-      cuDoubleComplex *, cusparseIndexBase_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZsctr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, nnz, xVal, xInd, y, idxBase);
-}
-
-CUSPARSE_DEPRECATED(cusparseRot)
-cusparseStatus_t CUSPARSEAPI cusparseSroti(cusparseHandle_t handle, int nnz,
-                                           float *xVal, const int *xInd,
-                                           float *y, const float *c,
-                                           const float *s,
-                                           cusparseIndexBase_t idxBase) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, float *, const int *, float *, const float *,
-      const float *, cusparseIndexBase_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSroti");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, nnz, xVal, xInd, y, c, s, idxBase);
-}
-
-CUSPARSE_DEPRECATED(cusparseRot)
-cusparseStatus_t CUSPARSEAPI cusparseDroti(cusparseHandle_t handle, int nnz,
-                                           double *xVal, const int *xInd,
-                                           double *y, const double *c,
-                                           const double *s,
-                                           cusparseIndexBase_t idxBase) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, double *, const int *, double *, const double *,
-      const double *, cusparseIndexBase_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDroti");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, nnz, xVal, xInd, y, c, s, idxBase);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseSgemvi(cusparseHandle_t handle, cusparseOperation_t transA, int m,
-               int n, const float *alpha, const float *A, int lda, int nnz,
-               const float *xVal, const int *xInd, const float *beta, float *y,
-               cusparseIndexBase_t idxBase, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, const float *,
-      const float *, int, int, const float *, const int *, const float *,
-      float *, cusparseIndexBase_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSgemvi");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, n, alpha, A, lda, nnz, xVal, xInd, beta, y,
-                  idxBase, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseSgemvi_bufferSize(cusparseHandle_t handle, cusparseOperation_t transA,
-                          int m, int n, int nnz, int *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSgemvi_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, n, nnz, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseDgemvi(cusparseHandle_t handle, cusparseOperation_t transA, int m,
-               int n, const double *alpha, const double *A, int lda, int nnz,
-               const double *xVal, const int *xInd, const double *beta,
-               double *y, cusparseIndexBase_t idxBase, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, const double *,
-      const double *, int, int, const double *, const int *, const double *,
-      double *, cusparseIndexBase_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDgemvi");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, n, alpha, A, lda, nnz, xVal, xInd, beta, y,
-                  idxBase, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseDgemvi_bufferSize(cusparseHandle_t handle, cusparseOperation_t transA,
-                          int m, int n, int nnz, int *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDgemvi_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, n, nnz, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCgemvi(
-    cusparseHandle_t handle, cusparseOperation_t transA, int m, int n,
-    const cuComplex *alpha, const cuComplex *A, int lda, int nnz,
-    const cuComplex *xVal, const int *xInd, const cuComplex *beta, cuComplex *y,
-    cusparseIndexBase_t idxBase, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, const cuComplex *,
-      const cuComplex *, int, int, const cuComplex *, const int *,
-      const cuComplex *, cuComplex *, cusparseIndexBase_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCgemvi");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, n, alpha, A, lda, nnz, xVal, xInd, beta, y,
-                  idxBase, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseCgemvi_bufferSize(cusparseHandle_t handle, cusparseOperation_t transA,
-                          int m, int n, int nnz, int *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCgemvi_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, n, nnz, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZgemvi(
-    cusparseHandle_t handle, cusparseOperation_t transA, int m, int n,
-    const cuDoubleComplex *alpha, const cuDoubleComplex *A, int lda, int nnz,
-    const cuDoubleComplex *xVal, const int *xInd, const cuDoubleComplex *beta,
-    cuDoubleComplex *y, cusparseIndexBase_t idxBase, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, const cuDoubleComplex *,
-      const cuDoubleComplex *, int, int, const cuDoubleComplex *, const int *,
-      const cuDoubleComplex *, cuDoubleComplex *, cusparseIndexBase_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZgemvi");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, n, alpha, A, lda, nnz, xVal, xInd, beta, y,
-                  idxBase, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseZgemvi_bufferSize(cusparseHandle_t handle, cusparseOperation_t transA,
-                          int m, int n, int nnz, int *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZgemvi_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, n, nnz, pBufferSize);
-}
-
-CUSPARSE_DEPRECATED(cusparseSpMV)
-cusparseStatus_t CUSPARSEAPI cusparseCsrmvEx_bufferSize(
-    cusparseHandle_t handle, cusparseAlgMode_t alg, cusparseOperation_t transA,
-    int m, int n, int nnz, const void *alpha, cudaDataType alphatype,
-    const cusparseMatDescr_t descrA, const void *csrValA,
-    cudaDataType csrValAtype, const int *csrRowPtrA, const int *csrColIndA,
-    const void *x, cudaDataType xtype, const void *beta, cudaDataType betatype,
-    void *y, cudaDataType ytype, cudaDataType executiontype,
-    size_t *bufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseAlgMode_t, cusparseOperation_t, int, int, int,
-      const void *, cudaDataType, const cusparseMatDescr_t, const void *,
-      cudaDataType, const int *, const int *, const void *, cudaDataType,
-      const void *, cudaDataType, void *, cudaDataType, cudaDataType, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCsrmvEx_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, alg, transA, m, n, nnz, alpha, alphatype, descrA,
-                  csrValA, csrValAtype, csrRowPtrA, csrColIndA, x, xtype, beta,
-                  betatype, y, ytype, executiontype, bufferSizeInBytes);
-}
-
-CUSPARSE_DEPRECATED(cusparseSpMV)
-cusparseStatus_t CUSPARSEAPI cusparseCsrmvEx(
-    cusparseHandle_t handle, cusparseAlgMode_t alg, cusparseOperation_t transA,
-    int m, int n, int nnz, const void *alpha, cudaDataType alphatype,
-    const cusparseMatDescr_t descrA, const void *csrValA,
-    cudaDataType csrValAtype, const int *csrRowPtrA, const int *csrColIndA,
-    const void *x, cudaDataType xtype, const void *beta, cudaDataType betatype,
-    void *y, cudaDataType ytype, cudaDataType executiontype, void *buffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseAlgMode_t, cusparseOperation_t, int, int, int,
-      const void *, cudaDataType, const cusparseMatDescr_t, const void *,
-      cudaDataType, const int *, const int *, const void *, cudaDataType,
-      const void *, cudaDataType, void *, cudaDataType, cudaDataType, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCsrmvEx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, alg, transA, m, n, nnz, alpha, alphatype, descrA,
-                  csrValA, csrValAtype, csrRowPtrA, csrColIndA, x, xtype, beta,
-                  betatype, y, ytype, executiontype, buffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSbsrmv(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, int mb, int nb, int nnzb, const float *alpha,
-    const cusparseMatDescr_t descrA, const float *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int blockDim,
-    const float *x, const float *beta, float *y) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, int, int, int,
-      const float *, const cusparseMatDescr_t, const float *, const int *,
-      const int *, int, const float *, const float *, float *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSbsrmv");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, mb, nb, nnzb, alpha, descrA,
-                  bsrSortedValA, bsrSortedRowPtrA, bsrSortedColIndA, blockDim,
-                  x, beta, y);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDbsrmv(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, int mb, int nb, int nnzb, const double *alpha,
-    const cusparseMatDescr_t descrA, const double *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int blockDim,
-    const double *x, const double *beta, double *y) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, int, int, int,
-      const double *, const cusparseMatDescr_t, const double *, const int *,
-      const int *, int, const double *, const double *, double *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDbsrmv");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, mb, nb, nnzb, alpha, descrA,
-                  bsrSortedValA, bsrSortedRowPtrA, bsrSortedColIndA, blockDim,
-                  x, beta, y);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseCbsrmv(cusparseHandle_t handle, cusparseDirection_t dirA,
-               cusparseOperation_t transA, int mb, int nb, int nnzb,
-               const cuComplex *alpha, const cusparseMatDescr_t descrA,
-               const cuComplex *bsrSortedValA, const int *bsrSortedRowPtrA,
-               const int *bsrSortedColIndA, int blockDim, const cuComplex *x,
-               const cuComplex *beta, cuComplex *y) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, int, int, int,
-      const cuComplex *, const cusparseMatDescr_t, const cuComplex *,
-      const int *, const int *, int, const cuComplex *, const cuComplex *,
-      cuComplex *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCbsrmv");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, mb, nb, nnzb, alpha, descrA,
-                  bsrSortedValA, bsrSortedRowPtrA, bsrSortedColIndA, blockDim,
-                  x, beta, y);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZbsrmv(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, int mb, int nb, int nnzb,
-    const cuDoubleComplex *alpha, const cusparseMatDescr_t descrA,
-    const cuDoubleComplex *bsrSortedValA, const int *bsrSortedRowPtrA,
-    const int *bsrSortedColIndA, int blockDim, const cuDoubleComplex *x,
-    const cuDoubleComplex *beta, cuDoubleComplex *y) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, int, int, int,
-      const cuDoubleComplex *, const cusparseMatDescr_t,
-      const cuDoubleComplex *, const int *, const int *, int,
-      const cuDoubleComplex *, const cuDoubleComplex *, cuDoubleComplex *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZbsrmv");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, mb, nb, nnzb, alpha, descrA,
-                  bsrSortedValA, bsrSortedRowPtrA, bsrSortedColIndA, blockDim,
-                  x, beta, y);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseSbsrxmv(cusparseHandle_t handle, cusparseDirection_t dirA,
-                cusparseOperation_t transA, int sizeOfMask, int mb, int nb,
-                int nnzb, const float *alpha, const cusparseMatDescr_t descrA,
-                const float *bsrSortedValA, const int *bsrSortedMaskPtrA,
-                const int *bsrSortedRowPtrA, const int *bsrSortedEndPtrA,
-                const int *bsrSortedColIndA, int blockDim, const float *x,
-                const float *beta, float *y) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, int, int, int,
-      int, const float *, const cusparseMatDescr_t, const float *, const int *,
-      const int *, const int *, const int *, int, const float *, const float *,
-      float *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSbsrxmv");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, sizeOfMask, mb, nb, nnzb, alpha, descrA,
-                  bsrSortedValA, bsrSortedMaskPtrA, bsrSortedRowPtrA,
-                  bsrSortedEndPtrA, bsrSortedColIndA, blockDim, x, beta, y);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseDbsrxmv(cusparseHandle_t handle, cusparseDirection_t dirA,
-                cusparseOperation_t transA, int sizeOfMask, int mb, int nb,
-                int nnzb, const double *alpha, const cusparseMatDescr_t descrA,
-                const double *bsrSortedValA, const int *bsrSortedMaskPtrA,
-                const int *bsrSortedRowPtrA, const int *bsrSortedEndPtrA,
-                const int *bsrSortedColIndA, int blockDim, const double *x,
-                const double *beta, double *y) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, int, int, int,
-      int, const double *, const cusparseMatDescr_t, const double *,
-      const int *, const int *, const int *, const int *, int, const double *,
-      const double *, double *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDbsrxmv");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, sizeOfMask, mb, nb, nnzb, alpha, descrA,
-                  bsrSortedValA, bsrSortedMaskPtrA, bsrSortedRowPtrA,
-                  bsrSortedEndPtrA, bsrSortedColIndA, blockDim, x, beta, y);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCbsrxmv(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, int sizeOfMask, int mb, int nb, int nnzb,
-    const cuComplex *alpha, const cusparseMatDescr_t descrA,
-    const cuComplex *bsrSortedValA, const int *bsrSortedMaskPtrA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedEndPtrA,
-    const int *bsrSortedColIndA, int blockDim, const cuComplex *x,
-    const cuComplex *beta, cuComplex *y) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, int, int, int,
-      int, const cuComplex *, const cusparseMatDescr_t, const cuComplex *,
-      const int *, const int *, const int *, const int *, int,
-      const cuComplex *, const cuComplex *, cuComplex *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCbsrxmv");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, sizeOfMask, mb, nb, nnzb, alpha, descrA,
-                  bsrSortedValA, bsrSortedMaskPtrA, bsrSortedRowPtrA,
-                  bsrSortedEndPtrA, bsrSortedColIndA, blockDim, x, beta, y);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZbsrxmv(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, int sizeOfMask, int mb, int nb, int nnzb,
-    const cuDoubleComplex *alpha, const cusparseMatDescr_t descrA,
-    const cuDoubleComplex *bsrSortedValA, const int *bsrSortedMaskPtrA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedEndPtrA,
-    const int *bsrSortedColIndA, int blockDim, const cuDoubleComplex *x,
-    const cuDoubleComplex *beta, cuDoubleComplex *y) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, int, int, int,
-      int, const cuDoubleComplex *, const cusparseMatDescr_t,
-      const cuDoubleComplex *, const int *, const int *, const int *,
-      const int *, int, const cuDoubleComplex *, const cuDoubleComplex *,
-      cuDoubleComplex *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZbsrxmv");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, sizeOfMask, mb, nb, nnzb, alpha, descrA,
-                  bsrSortedValA, bsrSortedMaskPtrA, bsrSortedRowPtrA,
-                  bsrSortedEndPtrA, bsrSortedColIndA, blockDim, x, beta, y);
-}
-
-CUSPARSE_DEPRECATED(cusparseSpSV)
-cusparseStatus_t CUSPARSEAPI cusparseXcsrsv2_zeroPivot(cusparseHandle_t handle,
-                                                       csrsv2Info_t info,
-                                                       int *position) {
-  using FuncPtr =
-      cusparseStatus_t(CUSPARSEAPI *)(cusparseHandle_t, csrsv2Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseXcsrsv2_zeroPivot");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, info, position);
-}
-
-CUSPARSE_DEPRECATED(cusparseSpSV)
-cusparseStatus_t CUSPARSEAPI cusparseScsrsv2_bufferSize(
-    cusparseHandle_t handle, cusparseOperation_t transA, int m, int nnz,
-    const cusparseMatDescr_t descrA, float *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA, csrsv2Info_t info,
-    int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, const cusparseMatDescr_t,
-      float *, const int *, const int *, csrsv2Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsrsv2_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, nnz, descrA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, info, pBufferSizeInBytes);
-}
-
-CUSPARSE_DEPRECATED(cusparseSpSV)
-cusparseStatus_t CUSPARSEAPI cusparseDcsrsv2_bufferSize(
-    cusparseHandle_t handle, cusparseOperation_t transA, int m, int nnz,
-    const cusparseMatDescr_t descrA, double *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA, csrsv2Info_t info,
-    int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, const cusparseMatDescr_t,
-      double *, const int *, const int *, csrsv2Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsrsv2_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, nnz, descrA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, info, pBufferSizeInBytes);
-}
-
-CUSPARSE_DEPRECATED(cusparseSpSV)
-cusparseStatus_t CUSPARSEAPI cusparseCcsrsv2_bufferSize(
-    cusparseHandle_t handle, cusparseOperation_t transA, int m, int nnz,
-    const cusparseMatDescr_t descrA, cuComplex *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA, csrsv2Info_t info,
-    int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, const cusparseMatDescr_t,
-      cuComplex *, const int *, const int *, csrsv2Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsrsv2_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, nnz, descrA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, info, pBufferSizeInBytes);
-}
-
-CUSPARSE_DEPRECATED(cusparseSpSV)
-cusparseStatus_t CUSPARSEAPI cusparseZcsrsv2_bufferSize(
-    cusparseHandle_t handle, cusparseOperation_t transA, int m, int nnz,
-    const cusparseMatDescr_t descrA, cuDoubleComplex *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA, csrsv2Info_t info,
-    int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, const cusparseMatDescr_t,
-      cuDoubleComplex *, const int *, const int *, csrsv2Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsrsv2_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, nnz, descrA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, info, pBufferSizeInBytes);
-}
-
-CUSPARSE_DEPRECATED(cusparseSpSV)
-cusparseStatus_t CUSPARSEAPI cusparseScsrsv2_bufferSizeExt(
-    cusparseHandle_t handle, cusparseOperation_t transA, int m, int nnz,
-    const cusparseMatDescr_t descrA, float *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA, csrsv2Info_t info,
-    size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, const cusparseMatDescr_t,
-      float *, const int *, const int *, csrsv2Info_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsrsv2_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, nnz, descrA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, info, pBufferSize);
-}
-
-CUSPARSE_DEPRECATED(cusparseSpSV)
-cusparseStatus_t CUSPARSEAPI cusparseDcsrsv2_bufferSizeExt(
-    cusparseHandle_t handle, cusparseOperation_t transA, int m, int nnz,
-    const cusparseMatDescr_t descrA, double *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA, csrsv2Info_t info,
-    size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, const cusparseMatDescr_t,
-      double *, const int *, const int *, csrsv2Info_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsrsv2_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, nnz, descrA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, info, pBufferSize);
-}
-
-CUSPARSE_DEPRECATED(cusparseSpSV)
-cusparseStatus_t CUSPARSEAPI cusparseCcsrsv2_bufferSizeExt(
-    cusparseHandle_t handle, cusparseOperation_t transA, int m, int nnz,
-    const cusparseMatDescr_t descrA, cuComplex *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA, csrsv2Info_t info,
-    size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, const cusparseMatDescr_t,
-      cuComplex *, const int *, const int *, csrsv2Info_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsrsv2_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, nnz, descrA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, info, pBufferSize);
-}
-
-CUSPARSE_DEPRECATED(cusparseSpSV)
-cusparseStatus_t CUSPARSEAPI cusparseZcsrsv2_bufferSizeExt(
-    cusparseHandle_t handle, cusparseOperation_t transA, int m, int nnz,
-    const cusparseMatDescr_t descrA, cuDoubleComplex *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA, csrsv2Info_t info,
-    size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, const cusparseMatDescr_t,
-      cuDoubleComplex *, const int *, const int *, csrsv2Info_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsrsv2_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, nnz, descrA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, info, pBufferSize);
-}
-
-CUSPARSE_DEPRECATED(cusparseSpSV)
-cusparseStatus_t CUSPARSEAPI cusparseScsrsv2_analysis(
-    cusparseHandle_t handle, cusparseOperation_t transA, int m, int nnz,
-    const cusparseMatDescr_t descrA, const float *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA, csrsv2Info_t info,
-    cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, const cusparseMatDescr_t,
-      const float *, const int *, const int *, csrsv2Info_t,
-      cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsrsv2_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, nnz, descrA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, info, policy, pBuffer);
-}
-
-CUSPARSE_DEPRECATED(cusparseSpSV)
-cusparseStatus_t CUSPARSEAPI cusparseDcsrsv2_analysis(
-    cusparseHandle_t handle, cusparseOperation_t transA, int m, int nnz,
-    const cusparseMatDescr_t descrA, const double *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA, csrsv2Info_t info,
-    cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, const cusparseMatDescr_t,
-      const double *, const int *, const int *, csrsv2Info_t,
-      cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsrsv2_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, nnz, descrA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, info, policy, pBuffer);
-}
-
-CUSPARSE_DEPRECATED(cusparseSpSV)
-cusparseStatus_t CUSPARSEAPI cusparseCcsrsv2_analysis(
-    cusparseHandle_t handle, cusparseOperation_t transA, int m, int nnz,
-    const cusparseMatDescr_t descrA, const cuComplex *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA, csrsv2Info_t info,
-    cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, const cusparseMatDescr_t,
-      const cuComplex *, const int *, const int *, csrsv2Info_t,
-      cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsrsv2_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, nnz, descrA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, info, policy, pBuffer);
-}
-
-CUSPARSE_DEPRECATED(cusparseSpSV)
-cusparseStatus_t CUSPARSEAPI cusparseZcsrsv2_analysis(
-    cusparseHandle_t handle, cusparseOperation_t transA, int m, int nnz,
-    const cusparseMatDescr_t descrA, const cuDoubleComplex *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA, csrsv2Info_t info,
-    cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, const cusparseMatDescr_t,
-      const cuDoubleComplex *, const int *, const int *, csrsv2Info_t,
-      cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsrsv2_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, nnz, descrA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, info, policy, pBuffer);
-}
-
-CUSPARSE_DEPRECATED(cusparseSpSV)
-cusparseStatus_t CUSPARSEAPI cusparseScsrsv2_solve(
-    cusparseHandle_t handle, cusparseOperation_t transA, int m, int nnz,
-    const float *alpha, const cusparseMatDescr_t descrA,
-    const float *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, csrsv2Info_t info, const float *f, float *x,
-    cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, const float *,
-      const cusparseMatDescr_t, const float *, const int *, const int *,
-      csrsv2Info_t, const float *, float *, cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsrsv2_solve");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, nnz, alpha, descrA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, info, f, x, policy,
-                  pBuffer);
-}
-
-CUSPARSE_DEPRECATED(cusparseSpSV)
-cusparseStatus_t CUSPARSEAPI cusparseDcsrsv2_solve(
-    cusparseHandle_t handle, cusparseOperation_t transA, int m, int nnz,
-    const double *alpha, const cusparseMatDescr_t descrA,
-    const double *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, csrsv2Info_t info, const double *f, double *x,
-    cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, const double *,
-      const cusparseMatDescr_t, const double *, const int *, const int *,
-      csrsv2Info_t, const double *, double *, cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsrsv2_solve");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, nnz, alpha, descrA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, info, f, x, policy,
-                  pBuffer);
-}
-
-CUSPARSE_DEPRECATED(cusparseSpSV)
-cusparseStatus_t CUSPARSEAPI cusparseCcsrsv2_solve(
-    cusparseHandle_t handle, cusparseOperation_t transA, int m, int nnz,
-    const cuComplex *alpha, const cusparseMatDescr_t descrA,
-    const cuComplex *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, csrsv2Info_t info, const cuComplex *f,
-    cuComplex *x, cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, const cuComplex *,
-      const cusparseMatDescr_t, const cuComplex *, const int *, const int *,
-      csrsv2Info_t, const cuComplex *, cuComplex *, cusparseSolvePolicy_t,
-      void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsrsv2_solve");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, nnz, alpha, descrA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, info, f, x, policy,
-                  pBuffer);
-}
-
-CUSPARSE_DEPRECATED(cusparseSpSV)
-cusparseStatus_t CUSPARSEAPI cusparseZcsrsv2_solve(
-    cusparseHandle_t handle, cusparseOperation_t transA, int m, int nnz,
-    const cuDoubleComplex *alpha, const cusparseMatDescr_t descrA,
-    const cuDoubleComplex *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, csrsv2Info_t info, const cuDoubleComplex *f,
-    cuDoubleComplex *x, cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, const cuDoubleComplex *,
-      const cusparseMatDescr_t, const cuDoubleComplex *, const int *,
-      const int *, csrsv2Info_t, const cuDoubleComplex *, cuDoubleComplex *,
-      cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsrsv2_solve");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, nnz, alpha, descrA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, info, f, x, policy,
-                  pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseXbsrsv2_zeroPivot(cusparseHandle_t handle,
-                                                       bsrsv2Info_t info,
-                                                       int *position) {
-  using FuncPtr =
-      cusparseStatus_t(CUSPARSEAPI *)(cusparseHandle_t, bsrsv2Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseXbsrsv2_zeroPivot");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, info, position);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSbsrsv2_bufferSize(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, float *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int blockDim,
-    bsrsv2Info_t info, int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, int, int,
-      const cusparseMatDescr_t, float *, const int *, const int *, int,
-      bsrsv2Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSbsrsv2_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, mb, nnzb, descrA, bsrSortedValA,
-                  bsrSortedRowPtrA, bsrSortedColIndA, blockDim, info,
-                  pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDbsrsv2_bufferSize(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, double *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int blockDim,
-    bsrsv2Info_t info, int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, int, int,
-      const cusparseMatDescr_t, double *, const int *, const int *, int,
-      bsrsv2Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDbsrsv2_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, mb, nnzb, descrA, bsrSortedValA,
-                  bsrSortedRowPtrA, bsrSortedColIndA, blockDim, info,
-                  pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCbsrsv2_bufferSize(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, cuComplex *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int blockDim,
-    bsrsv2Info_t info, int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, int, int,
-      const cusparseMatDescr_t, cuComplex *, const int *, const int *, int,
-      bsrsv2Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCbsrsv2_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, mb, nnzb, descrA, bsrSortedValA,
-                  bsrSortedRowPtrA, bsrSortedColIndA, blockDim, info,
-                  pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZbsrsv2_bufferSize(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, cuDoubleComplex *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int blockDim,
-    bsrsv2Info_t info, int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, int, int,
-      const cusparseMatDescr_t, cuDoubleComplex *, const int *, const int *,
-      int, bsrsv2Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZbsrsv2_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, mb, nnzb, descrA, bsrSortedValA,
-                  bsrSortedRowPtrA, bsrSortedColIndA, blockDim, info,
-                  pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSbsrsv2_bufferSizeExt(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, float *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int blockSize,
-    bsrsv2Info_t info, size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, int, int,
-      const cusparseMatDescr_t, float *, const int *, const int *, int,
-      bsrsv2Info_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSbsrsv2_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, mb, nnzb, descrA, bsrSortedValA,
-                  bsrSortedRowPtrA, bsrSortedColIndA, blockSize, info,
-                  pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDbsrsv2_bufferSizeExt(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, double *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int blockSize,
-    bsrsv2Info_t info, size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, int, int,
-      const cusparseMatDescr_t, double *, const int *, const int *, int,
-      bsrsv2Info_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDbsrsv2_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, mb, nnzb, descrA, bsrSortedValA,
-                  bsrSortedRowPtrA, bsrSortedColIndA, blockSize, info,
-                  pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCbsrsv2_bufferSizeExt(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, cuComplex *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int blockSize,
-    bsrsv2Info_t info, size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, int, int,
-      const cusparseMatDescr_t, cuComplex *, const int *, const int *, int,
-      bsrsv2Info_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCbsrsv2_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, mb, nnzb, descrA, bsrSortedValA,
-                  bsrSortedRowPtrA, bsrSortedColIndA, blockSize, info,
-                  pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZbsrsv2_bufferSizeExt(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, cuDoubleComplex *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int blockSize,
-    bsrsv2Info_t info, size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, int, int,
-      const cusparseMatDescr_t, cuDoubleComplex *, const int *, const int *,
-      int, bsrsv2Info_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZbsrsv2_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, mb, nnzb, descrA, bsrSortedValA,
-                  bsrSortedRowPtrA, bsrSortedColIndA, blockSize, info,
-                  pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSbsrsv2_analysis(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, const float *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int blockDim,
-    bsrsv2Info_t info, cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, int, int,
-      const cusparseMatDescr_t, const float *, const int *, const int *, int,
-      bsrsv2Info_t, cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSbsrsv2_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, mb, nnzb, descrA, bsrSortedValA,
-                  bsrSortedRowPtrA, bsrSortedColIndA, blockDim, info, policy,
-                  pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDbsrsv2_analysis(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, const double *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int blockDim,
-    bsrsv2Info_t info, cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, int, int,
-      const cusparseMatDescr_t, const double *, const int *, const int *, int,
-      bsrsv2Info_t, cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDbsrsv2_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, mb, nnzb, descrA, bsrSortedValA,
-                  bsrSortedRowPtrA, bsrSortedColIndA, blockDim, info, policy,
-                  pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCbsrsv2_analysis(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, const cuComplex *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int blockDim,
-    bsrsv2Info_t info, cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, int, int,
-      const cusparseMatDescr_t, const cuComplex *, const int *, const int *,
-      int, bsrsv2Info_t, cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCbsrsv2_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, mb, nnzb, descrA, bsrSortedValA,
-                  bsrSortedRowPtrA, bsrSortedColIndA, blockDim, info, policy,
-                  pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZbsrsv2_analysis(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, const cuDoubleComplex *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int blockDim,
-    bsrsv2Info_t info, cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, int, int,
-      const cusparseMatDescr_t, const cuDoubleComplex *, const int *,
-      const int *, int, bsrsv2Info_t, cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZbsrsv2_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, mb, nnzb, descrA, bsrSortedValA,
-                  bsrSortedRowPtrA, bsrSortedColIndA, blockDim, info, policy,
-                  pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSbsrsv2_solve(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, int mb, int nnzb, const float *alpha,
-    const cusparseMatDescr_t descrA, const float *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int blockDim,
-    bsrsv2Info_t info, const float *f, float *x, cusparseSolvePolicy_t policy,
-    void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, int, int,
-      const float *, const cusparseMatDescr_t, const float *, const int *,
-      const int *, int, bsrsv2Info_t, const float *, float *,
-      cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSbsrsv2_solve");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, mb, nnzb, alpha, descrA, bsrSortedValA,
-                  bsrSortedRowPtrA, bsrSortedColIndA, blockDim, info, f, x,
-                  policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDbsrsv2_solve(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, int mb, int nnzb, const double *alpha,
-    const cusparseMatDescr_t descrA, const double *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int blockDim,
-    bsrsv2Info_t info, const double *f, double *x, cusparseSolvePolicy_t policy,
-    void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, int, int,
-      const double *, const cusparseMatDescr_t, const double *, const int *,
-      const int *, int, bsrsv2Info_t, const double *, double *,
-      cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDbsrsv2_solve");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, mb, nnzb, alpha, descrA, bsrSortedValA,
-                  bsrSortedRowPtrA, bsrSortedColIndA, blockDim, info, f, x,
-                  policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCbsrsv2_solve(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, int mb, int nnzb, const cuComplex *alpha,
-    const cusparseMatDescr_t descrA, const cuComplex *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int blockDim,
-    bsrsv2Info_t info, const cuComplex *f, cuComplex *x,
-    cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, int, int,
-      const cuComplex *, const cusparseMatDescr_t, const cuComplex *,
-      const int *, const int *, int, bsrsv2Info_t, const cuComplex *,
-      cuComplex *, cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCbsrsv2_solve");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, mb, nnzb, alpha, descrA, bsrSortedValA,
-                  bsrSortedRowPtrA, bsrSortedColIndA, blockDim, info, f, x,
-                  policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZbsrsv2_solve(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, int mb, int nnzb, const cuDoubleComplex *alpha,
-    const cusparseMatDescr_t descrA, const cuDoubleComplex *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int blockDim,
-    bsrsv2Info_t info, const cuDoubleComplex *f, cuDoubleComplex *x,
-    cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, int, int,
-      const cuDoubleComplex *, const cusparseMatDescr_t,
-      const cuDoubleComplex *, const int *, const int *, int, bsrsv2Info_t,
-      const cuDoubleComplex *, cuDoubleComplex *, cusparseSolvePolicy_t,
-      void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZbsrsv2_solve");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, mb, nnzb, alpha, descrA, bsrSortedValA,
-                  bsrSortedRowPtrA, bsrSortedColIndA, blockDim, info, f, x,
-                  policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSbsrmm(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, cusparseOperation_t transB, int mb, int n,
-    int kb, int nnzb, const float *alpha, const cusparseMatDescr_t descrA,
-    const float *bsrSortedValA, const int *bsrSortedRowPtrA,
-    const int *bsrSortedColIndA, const int blockSize, const float *B,
-    const int ldb, const float *beta, float *C, int ldc) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t,
-      cusparseOperation_t, int, int, int, int, const float *,
-      const cusparseMatDescr_t, const float *, const int *, const int *,
-      const int, const float *, const int, const float *, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSbsrmm");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, transB, mb, n, kb, nnzb, alpha, descrA,
-                  bsrSortedValA, bsrSortedRowPtrA, bsrSortedColIndA, blockSize,
-                  B, ldb, beta, C, ldc);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDbsrmm(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, cusparseOperation_t transB, int mb, int n,
-    int kb, int nnzb, const double *alpha, const cusparseMatDescr_t descrA,
-    const double *bsrSortedValA, const int *bsrSortedRowPtrA,
-    const int *bsrSortedColIndA, const int blockSize, const double *B,
-    const int ldb, const double *beta, double *C, int ldc) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t,
-      cusparseOperation_t, int, int, int, int, const double *,
-      const cusparseMatDescr_t, const double *, const int *, const int *,
-      const int, const double *, const int, const double *, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDbsrmm");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, transB, mb, n, kb, nnzb, alpha, descrA,
-                  bsrSortedValA, bsrSortedRowPtrA, bsrSortedColIndA, blockSize,
-                  B, ldb, beta, C, ldc);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCbsrmm(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, cusparseOperation_t transB, int mb, int n,
-    int kb, int nnzb, const cuComplex *alpha, const cusparseMatDescr_t descrA,
-    const cuComplex *bsrSortedValA, const int *bsrSortedRowPtrA,
-    const int *bsrSortedColIndA, const int blockSize, const cuComplex *B,
-    const int ldb, const cuComplex *beta, cuComplex *C, int ldc) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t,
-      cusparseOperation_t, int, int, int, int, const cuComplex *,
-      const cusparseMatDescr_t, const cuComplex *, const int *, const int *,
-      const int, const cuComplex *, const int, const cuComplex *, cuComplex *,
-      int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCbsrmm");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, transB, mb, n, kb, nnzb, alpha, descrA,
-                  bsrSortedValA, bsrSortedRowPtrA, bsrSortedColIndA, blockSize,
-                  B, ldb, beta, C, ldc);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZbsrmm(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, cusparseOperation_t transB, int mb, int n,
-    int kb, int nnzb, const cuDoubleComplex *alpha,
-    const cusparseMatDescr_t descrA, const cuDoubleComplex *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA,
-    const int blockSize, const cuDoubleComplex *B, const int ldb,
-    const cuDoubleComplex *beta, cuDoubleComplex *C, int ldc) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t,
-      cusparseOperation_t, int, int, int, int, const cuDoubleComplex *,
-      const cusparseMatDescr_t, const cuDoubleComplex *, const int *,
-      const int *, const int, const cuDoubleComplex *, const int,
-      const cuDoubleComplex *, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZbsrmm");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, transB, mb, n, kb, nnzb, alpha, descrA,
-                  bsrSortedValA, bsrSortedRowPtrA, bsrSortedColIndA, blockSize,
-                  B, ldb, beta, C, ldc);
-}
-
-CUSPARSE_DEPRECATED(cusparseSpMM)
-cusparseStatus_t CUSPARSEAPI cusparseSgemmi(
-    cusparseHandle_t handle, int m, int n, int k, int nnz, const float *alpha,
-    const float *A, int lda, const float *cscValB, const int *cscColPtrB,
-    const int *cscRowIndB, const float *beta, float *C, int ldc) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, int, const float *, const float *, int,
-      const float *, const int *, const int *, const float *, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSgemmi");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, k, nnz, alpha, A, lda, cscValB, cscColPtrB,
-                  cscRowIndB, beta, C, ldc);
-}
-
-CUSPARSE_DEPRECATED(cusparseSpMM)
-cusparseStatus_t CUSPARSEAPI cusparseDgemmi(
-    cusparseHandle_t handle, int m, int n, int k, int nnz, const double *alpha,
-    const double *A, int lda, const double *cscValB, const int *cscColPtrB,
-    const int *cscRowIndB, const double *beta, double *C, int ldc) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, int, const double *, const double *, int,
-      const double *, const int *, const int *, const double *, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDgemmi");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, k, nnz, alpha, A, lda, cscValB, cscColPtrB,
-                  cscRowIndB, beta, C, ldc);
-}
-
-CUSPARSE_DEPRECATED(cusparseSpMM)
-cusparseStatus_t CUSPARSEAPI cusparseCgemmi(
-    cusparseHandle_t handle, int m, int n, int k, int nnz,
-    const cuComplex *alpha, const cuComplex *A, int lda,
-    const cuComplex *cscValB, const int *cscColPtrB, const int *cscRowIndB,
-    const cuComplex *beta, cuComplex *C, int ldc) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, int, const cuComplex *,
-      const cuComplex *, int, const cuComplex *, const int *, const int *,
-      const cuComplex *, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCgemmi");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, k, nnz, alpha, A, lda, cscValB, cscColPtrB,
-                  cscRowIndB, beta, C, ldc);
-}
-
-CUSPARSE_DEPRECATED(cusparseSpMM)
-cusparseStatus_t CUSPARSEAPI
-cusparseZgemmi(cusparseHandle_t handle, int m, int n, int k, int nnz,
-               const cuDoubleComplex *alpha, const cuDoubleComplex *A, int lda,
-               const cuDoubleComplex *cscValB, const int *cscColPtrB,
-               const int *cscRowIndB, const cuDoubleComplex *beta,
-               cuDoubleComplex *C, int ldc) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, int, const cuDoubleComplex *,
-      const cuDoubleComplex *, int, const cuDoubleComplex *, const int *,
-      const int *, const cuDoubleComplex *, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZgemmi");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, k, nnz, alpha, A, lda, cscValB, cscColPtrB,
-                  cscRowIndB, beta, C, ldc);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCreateCsrsm2Info(csrsm2Info_t *info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(csrsm2Info_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCreateCsrsm2Info");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(info);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDestroyCsrsm2Info(csrsm2Info_t info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(csrsm2Info_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDestroyCsrsm2Info");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(info);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseXcsrsm2_zeroPivot(cusparseHandle_t handle,
-                                                       csrsm2Info_t info,
-                                                       int *position) {
-  using FuncPtr =
-      cusparseStatus_t(CUSPARSEAPI *)(cusparseHandle_t, csrsm2Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseXcsrsm2_zeroPivot");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, info, position);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsrsm2_bufferSizeExt(
-    cusparseHandle_t handle, int algo, cusparseOperation_t transA,
-    cusparseOperation_t transB, int m, int nrhs, int nnz, const float *alpha,
-    const cusparseMatDescr_t descrA, const float *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA, const float *B,
-    int ldb, csrsm2Info_t info, cusparseSolvePolicy_t policy,
-    size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, cusparseOperation_t, cusparseOperation_t, int, int,
-      int, const float *, const cusparseMatDescr_t, const float *, const int *,
-      const int *, const float *, int, csrsm2Info_t, cusparseSolvePolicy_t,
-      size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsrsm2_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, algo, transA, transB, m, nrhs, nnz, alpha, descrA,
-                  csrSortedValA, csrSortedRowPtrA, csrSortedColIndA, B, ldb,
-                  info, policy, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsrsm2_bufferSizeExt(
-    cusparseHandle_t handle, int algo, cusparseOperation_t transA,
-    cusparseOperation_t transB, int m, int nrhs, int nnz, const double *alpha,
-    const cusparseMatDescr_t descrA, const double *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA, const double *B,
-    int ldb, csrsm2Info_t info, cusparseSolvePolicy_t policy,
-    size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, cusparseOperation_t, cusparseOperation_t, int, int,
-      int, const double *, const cusparseMatDescr_t, const double *,
-      const int *, const int *, const double *, int, csrsm2Info_t,
-      cusparseSolvePolicy_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsrsm2_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, algo, transA, transB, m, nrhs, nnz, alpha, descrA,
-                  csrSortedValA, csrSortedRowPtrA, csrSortedColIndA, B, ldb,
-                  info, policy, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsrsm2_bufferSizeExt(
-    cusparseHandle_t handle, int algo, cusparseOperation_t transA,
-    cusparseOperation_t transB, int m, int nrhs, int nnz,
-    const cuComplex *alpha, const cusparseMatDescr_t descrA,
-    const cuComplex *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, const cuComplex *B, int ldb, csrsm2Info_t info,
-    cusparseSolvePolicy_t policy, size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, cusparseOperation_t, cusparseOperation_t, int, int,
-      int, const cuComplex *, const cusparseMatDescr_t, const cuComplex *,
-      const int *, const int *, const cuComplex *, int, csrsm2Info_t,
-      cusparseSolvePolicy_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsrsm2_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, algo, transA, transB, m, nrhs, nnz, alpha, descrA,
-                  csrSortedValA, csrSortedRowPtrA, csrSortedColIndA, B, ldb,
-                  info, policy, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsrsm2_bufferSizeExt(
-    cusparseHandle_t handle, int algo, cusparseOperation_t transA,
-    cusparseOperation_t transB, int m, int nrhs, int nnz,
-    const cuDoubleComplex *alpha, const cusparseMatDescr_t descrA,
-    const cuDoubleComplex *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, const cuDoubleComplex *B, int ldb,
-    csrsm2Info_t info, cusparseSolvePolicy_t policy, size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, cusparseOperation_t, cusparseOperation_t, int, int,
-      int, const cuDoubleComplex *, const cusparseMatDescr_t,
-      const cuDoubleComplex *, const int *, const int *,
-      const cuDoubleComplex *, int, csrsm2Info_t, cusparseSolvePolicy_t,
-      size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsrsm2_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, algo, transA, transB, m, nrhs, nnz, alpha, descrA,
-                  csrSortedValA, csrSortedRowPtrA, csrSortedColIndA, B, ldb,
-                  info, policy, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsrsm2_analysis(
-    cusparseHandle_t handle, int algo, cusparseOperation_t transA,
-    cusparseOperation_t transB, int m, int nrhs, int nnz, const float *alpha,
-    const cusparseMatDescr_t descrA, const float *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA, const float *B,
-    int ldb, csrsm2Info_t info, cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, cusparseOperation_t, cusparseOperation_t, int, int,
-      int, const float *, const cusparseMatDescr_t, const float *, const int *,
-      const int *, const float *, int, csrsm2Info_t, cusparseSolvePolicy_t,
-      void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsrsm2_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, algo, transA, transB, m, nrhs, nnz, alpha, descrA,
-                  csrSortedValA, csrSortedRowPtrA, csrSortedColIndA, B, ldb,
-                  info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsrsm2_analysis(
-    cusparseHandle_t handle, int algo, cusparseOperation_t transA,
-    cusparseOperation_t transB, int m, int nrhs, int nnz, const double *alpha,
-    const cusparseMatDescr_t descrA, const double *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA, const double *B,
-    int ldb, csrsm2Info_t info, cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, cusparseOperation_t, cusparseOperation_t, int, int,
-      int, const double *, const cusparseMatDescr_t, const double *,
-      const int *, const int *, const double *, int, csrsm2Info_t,
-      cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsrsm2_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, algo, transA, transB, m, nrhs, nnz, alpha, descrA,
-                  csrSortedValA, csrSortedRowPtrA, csrSortedColIndA, B, ldb,
-                  info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsrsm2_analysis(
-    cusparseHandle_t handle, int algo, cusparseOperation_t transA,
-    cusparseOperation_t transB, int m, int nrhs, int nnz,
-    const cuComplex *alpha, const cusparseMatDescr_t descrA,
-    const cuComplex *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, const cuComplex *B, int ldb, csrsm2Info_t info,
-    cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, cusparseOperation_t, cusparseOperation_t, int, int,
-      int, const cuComplex *, const cusparseMatDescr_t, const cuComplex *,
-      const int *, const int *, const cuComplex *, int, csrsm2Info_t,
-      cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsrsm2_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, algo, transA, transB, m, nrhs, nnz, alpha, descrA,
-                  csrSortedValA, csrSortedRowPtrA, csrSortedColIndA, B, ldb,
-                  info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsrsm2_analysis(
-    cusparseHandle_t handle, int algo, cusparseOperation_t transA,
-    cusparseOperation_t transB, int m, int nrhs, int nnz,
-    const cuDoubleComplex *alpha, const cusparseMatDescr_t descrA,
-    const cuDoubleComplex *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, const cuDoubleComplex *B, int ldb,
-    csrsm2Info_t info, cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, cusparseOperation_t, cusparseOperation_t, int, int,
-      int, const cuDoubleComplex *, const cusparseMatDescr_t,
-      const cuDoubleComplex *, const int *, const int *,
-      const cuDoubleComplex *, int, csrsm2Info_t, cusparseSolvePolicy_t,
-      void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsrsm2_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, algo, transA, transB, m, nrhs, nnz, alpha, descrA,
-                  csrSortedValA, csrSortedRowPtrA, csrSortedColIndA, B, ldb,
-                  info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsrsm2_solve(
-    cusparseHandle_t handle, int algo, cusparseOperation_t transA,
-    cusparseOperation_t transB, int m, int nrhs, int nnz, const float *alpha,
-    const cusparseMatDescr_t descrA, const float *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA, float *B, int ldb,
-    csrsm2Info_t info, cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, cusparseOperation_t, cusparseOperation_t, int, int,
-      int, const float *, const cusparseMatDescr_t, const float *, const int *,
-      const int *, float *, int, csrsm2Info_t, cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsrsm2_solve");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, algo, transA, transB, m, nrhs, nnz, alpha, descrA,
-                  csrSortedValA, csrSortedRowPtrA, csrSortedColIndA, B, ldb,
-                  info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsrsm2_solve(
-    cusparseHandle_t handle, int algo, cusparseOperation_t transA,
-    cusparseOperation_t transB, int m, int nrhs, int nnz, const double *alpha,
-    const cusparseMatDescr_t descrA, const double *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA, double *B,
-    int ldb, csrsm2Info_t info, cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, cusparseOperation_t, cusparseOperation_t, int, int,
-      int, const double *, const cusparseMatDescr_t, const double *,
-      const int *, const int *, double *, int, csrsm2Info_t,
-      cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsrsm2_solve");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, algo, transA, transB, m, nrhs, nnz, alpha, descrA,
-                  csrSortedValA, csrSortedRowPtrA, csrSortedColIndA, B, ldb,
-                  info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsrsm2_solve(
-    cusparseHandle_t handle, int algo, cusparseOperation_t transA,
-    cusparseOperation_t transB, int m, int nrhs, int nnz,
-    const cuComplex *alpha, const cusparseMatDescr_t descrA,
-    const cuComplex *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, cuComplex *B, int ldb, csrsm2Info_t info,
-    cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, cusparseOperation_t, cusparseOperation_t, int, int,
-      int, const cuComplex *, const cusparseMatDescr_t, const cuComplex *,
-      const int *, const int *, cuComplex *, int, csrsm2Info_t,
-      cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsrsm2_solve");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, algo, transA, transB, m, nrhs, nnz, alpha, descrA,
-                  csrSortedValA, csrSortedRowPtrA, csrSortedColIndA, B, ldb,
-                  info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsrsm2_solve(
-    cusparseHandle_t handle, int algo, cusparseOperation_t transA,
-    cusparseOperation_t transB, int m, int nrhs, int nnz,
-    const cuDoubleComplex *alpha, const cusparseMatDescr_t descrA,
-    const cuDoubleComplex *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, cuDoubleComplex *B, int ldb, csrsm2Info_t info,
-    cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, cusparseOperation_t, cusparseOperation_t, int, int,
-      int, const cuDoubleComplex *, const cusparseMatDescr_t,
-      const cuDoubleComplex *, const int *, const int *, cuDoubleComplex *, int,
-      csrsm2Info_t, cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsrsm2_solve");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, algo, transA, transB, m, nrhs, nnz, alpha, descrA,
-                  csrSortedValA, csrSortedRowPtrA, csrSortedColIndA, B, ldb,
-                  info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseXbsrsm2_zeroPivot(cusparseHandle_t handle,
-                                                       bsrsm2Info_t info,
-                                                       int *position) {
-  using FuncPtr =
-      cusparseStatus_t(CUSPARSEAPI *)(cusparseHandle_t, bsrsm2Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseXbsrsm2_zeroPivot");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, info, position);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSbsrsm2_bufferSize(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, cusparseOperation_t transXY, int mb, int n,
-    int nnzb, const cusparseMatDescr_t descrA, float *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockSize,
-    bsrsm2Info_t info, int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t,
-      cusparseOperation_t, int, int, int, const cusparseMatDescr_t, float *,
-      const int *, const int *, int, bsrsm2Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSbsrsm2_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, transXY, mb, n, nnzb, descrA,
-                  bsrSortedVal, bsrSortedRowPtr, bsrSortedColInd, blockSize,
-                  info, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDbsrsm2_bufferSize(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, cusparseOperation_t transXY, int mb, int n,
-    int nnzb, const cusparseMatDescr_t descrA, double *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockSize,
-    bsrsm2Info_t info, int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t,
-      cusparseOperation_t, int, int, int, const cusparseMatDescr_t, double *,
-      const int *, const int *, int, bsrsm2Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDbsrsm2_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, transXY, mb, n, nnzb, descrA,
-                  bsrSortedVal, bsrSortedRowPtr, bsrSortedColInd, blockSize,
-                  info, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCbsrsm2_bufferSize(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, cusparseOperation_t transXY, int mb, int n,
-    int nnzb, const cusparseMatDescr_t descrA, cuComplex *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockSize,
-    bsrsm2Info_t info, int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t,
-      cusparseOperation_t, int, int, int, const cusparseMatDescr_t, cuComplex *,
-      const int *, const int *, int, bsrsm2Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCbsrsm2_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, transXY, mb, n, nnzb, descrA,
-                  bsrSortedVal, bsrSortedRowPtr, bsrSortedColInd, blockSize,
-                  info, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZbsrsm2_bufferSize(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, cusparseOperation_t transXY, int mb, int n,
-    int nnzb, const cusparseMatDescr_t descrA, cuDoubleComplex *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockSize,
-    bsrsm2Info_t info, int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t,
-      cusparseOperation_t, int, int, int, const cusparseMatDescr_t,
-      cuDoubleComplex *, const int *, const int *, int, bsrsm2Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZbsrsm2_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, transXY, mb, n, nnzb, descrA,
-                  bsrSortedVal, bsrSortedRowPtr, bsrSortedColInd, blockSize,
-                  info, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSbsrsm2_bufferSizeExt(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, cusparseOperation_t transB, int mb, int n,
-    int nnzb, const cusparseMatDescr_t descrA, float *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockSize,
-    bsrsm2Info_t info, size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t,
-      cusparseOperation_t, int, int, int, const cusparseMatDescr_t, float *,
-      const int *, const int *, int, bsrsm2Info_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSbsrsm2_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, transB, mb, n, nnzb, descrA,
-                  bsrSortedVal, bsrSortedRowPtr, bsrSortedColInd, blockSize,
-                  info, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDbsrsm2_bufferSizeExt(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, cusparseOperation_t transB, int mb, int n,
-    int nnzb, const cusparseMatDescr_t descrA, double *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockSize,
-    bsrsm2Info_t info, size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t,
-      cusparseOperation_t, int, int, int, const cusparseMatDescr_t, double *,
-      const int *, const int *, int, bsrsm2Info_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDbsrsm2_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, transB, mb, n, nnzb, descrA,
-                  bsrSortedVal, bsrSortedRowPtr, bsrSortedColInd, blockSize,
-                  info, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCbsrsm2_bufferSizeExt(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, cusparseOperation_t transB, int mb, int n,
-    int nnzb, const cusparseMatDescr_t descrA, cuComplex *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockSize,
-    bsrsm2Info_t info, size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t,
-      cusparseOperation_t, int, int, int, const cusparseMatDescr_t, cuComplex *,
-      const int *, const int *, int, bsrsm2Info_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCbsrsm2_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, transB, mb, n, nnzb, descrA,
-                  bsrSortedVal, bsrSortedRowPtr, bsrSortedColInd, blockSize,
-                  info, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZbsrsm2_bufferSizeExt(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, cusparseOperation_t transB, int mb, int n,
-    int nnzb, const cusparseMatDescr_t descrA, cuDoubleComplex *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockSize,
-    bsrsm2Info_t info, size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t,
-      cusparseOperation_t, int, int, int, const cusparseMatDescr_t,
-      cuDoubleComplex *, const int *, const int *, int, bsrsm2Info_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZbsrsm2_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, transB, mb, n, nnzb, descrA,
-                  bsrSortedVal, bsrSortedRowPtr, bsrSortedColInd, blockSize,
-                  info, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSbsrsm2_analysis(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, cusparseOperation_t transXY, int mb, int n,
-    int nnzb, const cusparseMatDescr_t descrA, const float *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockSize,
-    bsrsm2Info_t info, cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t,
-      cusparseOperation_t, int, int, int, const cusparseMatDescr_t,
-      const float *, const int *, const int *, int, bsrsm2Info_t,
-      cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSbsrsm2_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, transXY, mb, n, nnzb, descrA,
-                  bsrSortedVal, bsrSortedRowPtr, bsrSortedColInd, blockSize,
-                  info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDbsrsm2_analysis(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, cusparseOperation_t transXY, int mb, int n,
-    int nnzb, const cusparseMatDescr_t descrA, const double *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockSize,
-    bsrsm2Info_t info, cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t,
-      cusparseOperation_t, int, int, int, const cusparseMatDescr_t,
-      const double *, const int *, const int *, int, bsrsm2Info_t,
-      cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDbsrsm2_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, transXY, mb, n, nnzb, descrA,
-                  bsrSortedVal, bsrSortedRowPtr, bsrSortedColInd, blockSize,
-                  info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCbsrsm2_analysis(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, cusparseOperation_t transXY, int mb, int n,
-    int nnzb, const cusparseMatDescr_t descrA, const cuComplex *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockSize,
-    bsrsm2Info_t info, cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t,
-      cusparseOperation_t, int, int, int, const cusparseMatDescr_t,
-      const cuComplex *, const int *, const int *, int, bsrsm2Info_t,
-      cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCbsrsm2_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, transXY, mb, n, nnzb, descrA,
-                  bsrSortedVal, bsrSortedRowPtr, bsrSortedColInd, blockSize,
-                  info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZbsrsm2_analysis(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, cusparseOperation_t transXY, int mb, int n,
-    int nnzb, const cusparseMatDescr_t descrA,
-    const cuDoubleComplex *bsrSortedVal, const int *bsrSortedRowPtr,
-    const int *bsrSortedColInd, int blockSize, bsrsm2Info_t info,
-    cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t,
-      cusparseOperation_t, int, int, int, const cusparseMatDescr_t,
-      const cuDoubleComplex *, const int *, const int *, int, bsrsm2Info_t,
-      cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZbsrsm2_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, transXY, mb, n, nnzb, descrA,
-                  bsrSortedVal, bsrSortedRowPtr, bsrSortedColInd, blockSize,
-                  info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSbsrsm2_solve(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, cusparseOperation_t transXY, int mb, int n,
-    int nnzb, const float *alpha, const cusparseMatDescr_t descrA,
-    const float *bsrSortedVal, const int *bsrSortedRowPtr,
-    const int *bsrSortedColInd, int blockSize, bsrsm2Info_t info,
-    const float *B, int ldb, float *X, int ldx, cusparseSolvePolicy_t policy,
-    void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t,
-      cusparseOperation_t, int, int, int, const float *,
-      const cusparseMatDescr_t, const float *, const int *, const int *, int,
-      bsrsm2Info_t, const float *, int, float *, int, cusparseSolvePolicy_t,
-      void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSbsrsm2_solve");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, transXY, mb, n, nnzb, alpha, descrA,
-                  bsrSortedVal, bsrSortedRowPtr, bsrSortedColInd, blockSize,
-                  info, B, ldb, X, ldx, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDbsrsm2_solve(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, cusparseOperation_t transXY, int mb, int n,
-    int nnzb, const double *alpha, const cusparseMatDescr_t descrA,
-    const double *bsrSortedVal, const int *bsrSortedRowPtr,
-    const int *bsrSortedColInd, int blockSize, bsrsm2Info_t info,
-    const double *B, int ldb, double *X, int ldx, cusparseSolvePolicy_t policy,
-    void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t,
-      cusparseOperation_t, int, int, int, const double *,
-      const cusparseMatDescr_t, const double *, const int *, const int *, int,
-      bsrsm2Info_t, const double *, int, double *, int, cusparseSolvePolicy_t,
-      void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDbsrsm2_solve");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, transXY, mb, n, nnzb, alpha, descrA,
-                  bsrSortedVal, bsrSortedRowPtr, bsrSortedColInd, blockSize,
-                  info, B, ldb, X, ldx, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCbsrsm2_solve(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, cusparseOperation_t transXY, int mb, int n,
-    int nnzb, const cuComplex *alpha, const cusparseMatDescr_t descrA,
-    const cuComplex *bsrSortedVal, const int *bsrSortedRowPtr,
-    const int *bsrSortedColInd, int blockSize, bsrsm2Info_t info,
-    const cuComplex *B, int ldb, cuComplex *X, int ldx,
-    cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t,
-      cusparseOperation_t, int, int, int, const cuComplex *,
-      const cusparseMatDescr_t, const cuComplex *, const int *, const int *,
-      int, bsrsm2Info_t, const cuComplex *, int, cuComplex *, int,
-      cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCbsrsm2_solve");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, transXY, mb, n, nnzb, alpha, descrA,
-                  bsrSortedVal, bsrSortedRowPtr, bsrSortedColInd, blockSize,
-                  info, B, ldb, X, ldx, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZbsrsm2_solve(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, cusparseOperation_t transXY, int mb, int n,
-    int nnzb, const cuDoubleComplex *alpha, const cusparseMatDescr_t descrA,
-    const cuDoubleComplex *bsrSortedVal, const int *bsrSortedRowPtr,
-    const int *bsrSortedColInd, int blockSize, bsrsm2Info_t info,
-    const cuDoubleComplex *B, int ldb, cuDoubleComplex *X, int ldx,
-    cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t,
-      cusparseOperation_t, int, int, int, const cuDoubleComplex *,
-      const cusparseMatDescr_t, const cuDoubleComplex *, const int *,
-      const int *, int, bsrsm2Info_t, const cuDoubleComplex *, int,
-      cuDoubleComplex *, int, cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZbsrsm2_solve");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, transXY, mb, n, nnzb, alpha, descrA,
-                  bsrSortedVal, bsrSortedRowPtr, bsrSortedColInd, blockSize,
-                  info, B, ldb, X, ldx, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsrilu02_numericBoost(
-    cusparseHandle_t handle, csrilu02Info_t info, int enable_boost, double *tol,
-    float *boost_val) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, csrilu02Info_t, int, double *, float *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsrilu02_numericBoost");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, info, enable_boost, tol, boost_val);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsrilu02_numericBoost(
-    cusparseHandle_t handle, csrilu02Info_t info, int enable_boost, double *tol,
-    double *boost_val) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, csrilu02Info_t, int, double *, double *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsrilu02_numericBoost");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, info, enable_boost, tol, boost_val);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsrilu02_numericBoost(
-    cusparseHandle_t handle, csrilu02Info_t info, int enable_boost, double *tol,
-    cuComplex *boost_val) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, csrilu02Info_t, int, double *, cuComplex *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsrilu02_numericBoost");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, info, enable_boost, tol, boost_val);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsrilu02_numericBoost(
-    cusparseHandle_t handle, csrilu02Info_t info, int enable_boost, double *tol,
-    cuDoubleComplex *boost_val) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, csrilu02Info_t, int, double *, cuDoubleComplex *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsrilu02_numericBoost");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, info, enable_boost, tol, boost_val);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseXcsrilu02_zeroPivot(
-    cusparseHandle_t handle, csrilu02Info_t info, int *position) {
-  using FuncPtr =
-      cusparseStatus_t(CUSPARSEAPI *)(cusparseHandle_t, csrilu02Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseXcsrilu02_zeroPivot");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, info, position);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsrilu02_bufferSize(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    float *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, csrilu02Info_t info, int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, float *,
-      const int *, const int *, csrilu02Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsrilu02_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, info, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsrilu02_bufferSize(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    double *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, csrilu02Info_t info, int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, double *,
-      const int *, const int *, csrilu02Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsrilu02_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, info, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsrilu02_bufferSize(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    cuComplex *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, csrilu02Info_t info, int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, cuComplex *,
-      const int *, const int *, csrilu02Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsrilu02_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, info, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsrilu02_bufferSize(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    cuDoubleComplex *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, csrilu02Info_t info, int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, cuDoubleComplex *,
-      const int *, const int *, csrilu02Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsrilu02_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, info, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsrilu02_bufferSizeExt(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    float *csrSortedVal, const int *csrSortedRowPtr, const int *csrSortedColInd,
-    csrilu02Info_t info, size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, float *,
-      const int *, const int *, csrilu02Info_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsrilu02_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedVal, csrSortedRowPtr,
-                  csrSortedColInd, info, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsrilu02_bufferSizeExt(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    double *csrSortedVal, const int *csrSortedRowPtr,
-    const int *csrSortedColInd, csrilu02Info_t info, size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, double *,
-      const int *, const int *, csrilu02Info_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsrilu02_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedVal, csrSortedRowPtr,
-                  csrSortedColInd, info, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsrilu02_bufferSizeExt(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    cuComplex *csrSortedVal, const int *csrSortedRowPtr,
-    const int *csrSortedColInd, csrilu02Info_t info, size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, cuComplex *,
-      const int *, const int *, csrilu02Info_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsrilu02_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedVal, csrSortedRowPtr,
-                  csrSortedColInd, info, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsrilu02_bufferSizeExt(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    cuDoubleComplex *csrSortedVal, const int *csrSortedRowPtr,
-    const int *csrSortedColInd, csrilu02Info_t info, size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, cuDoubleComplex *,
-      const int *, const int *, csrilu02Info_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsrilu02_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedVal, csrSortedRowPtr,
-                  csrSortedColInd, info, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsrilu02_analysis(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    const float *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, csrilu02Info_t info,
-    cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, const float *,
-      const int *, const int *, csrilu02Info_t, cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsrilu02_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsrilu02_analysis(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    const double *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, csrilu02Info_t info,
-    cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, const double *,
-      const int *, const int *, csrilu02Info_t, cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsrilu02_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsrilu02_analysis(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    const cuComplex *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, csrilu02Info_t info,
-    cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, const cuComplex *,
-      const int *, const int *, csrilu02Info_t, cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsrilu02_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsrilu02_analysis(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    const cuDoubleComplex *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, csrilu02Info_t info,
-    cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t,
-      const cuDoubleComplex *, const int *, const int *, csrilu02Info_t,
-      cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsrilu02_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsrilu02(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    float *csrSortedValA_valM, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, csrilu02Info_t info,
-    cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, float *,
-      const int *, const int *, csrilu02Info_t, cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsrilu02");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedValA_valM, csrSortedRowPtrA,
-                  csrSortedColIndA, info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsrilu02(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    double *csrSortedValA_valM, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, csrilu02Info_t info,
-    cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, double *,
-      const int *, const int *, csrilu02Info_t, cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsrilu02");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedValA_valM, csrSortedRowPtrA,
-                  csrSortedColIndA, info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsrilu02(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    cuComplex *csrSortedValA_valM, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, csrilu02Info_t info,
-    cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, cuComplex *,
-      const int *, const int *, csrilu02Info_t, cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsrilu02");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedValA_valM, csrSortedRowPtrA,
-                  csrSortedColIndA, info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsrilu02(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    cuDoubleComplex *csrSortedValA_valM, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, csrilu02Info_t info,
-    cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, cuDoubleComplex *,
-      const int *, const int *, csrilu02Info_t, cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsrilu02");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedValA_valM, csrSortedRowPtrA,
-                  csrSortedColIndA, info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSbsrilu02_numericBoost(
-    cusparseHandle_t handle, bsrilu02Info_t info, int enable_boost, double *tol,
-    float *boost_val) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, bsrilu02Info_t, int, double *, float *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSbsrilu02_numericBoost");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, info, enable_boost, tol, boost_val);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDbsrilu02_numericBoost(
-    cusparseHandle_t handle, bsrilu02Info_t info, int enable_boost, double *tol,
-    double *boost_val) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, bsrilu02Info_t, int, double *, double *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDbsrilu02_numericBoost");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, info, enable_boost, tol, boost_val);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCbsrilu02_numericBoost(
-    cusparseHandle_t handle, bsrilu02Info_t info, int enable_boost, double *tol,
-    cuComplex *boost_val) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, bsrilu02Info_t, int, double *, cuComplex *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCbsrilu02_numericBoost");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, info, enable_boost, tol, boost_val);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZbsrilu02_numericBoost(
-    cusparseHandle_t handle, bsrilu02Info_t info, int enable_boost, double *tol,
-    cuDoubleComplex *boost_val) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, bsrilu02Info_t, int, double *, cuDoubleComplex *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZbsrilu02_numericBoost");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, info, enable_boost, tol, boost_val);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseXbsrilu02_zeroPivot(
-    cusparseHandle_t handle, bsrilu02Info_t info, int *position) {
-  using FuncPtr =
-      cusparseStatus_t(CUSPARSEAPI *)(cusparseHandle_t, bsrilu02Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseXbsrilu02_zeroPivot");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, info, position);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSbsrilu02_bufferSize(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, float *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockDim,
-    bsrilu02Info_t info, int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      float *, const int *, const int *, int, bsrilu02Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSbsrilu02_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, blockDim, info, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDbsrilu02_bufferSize(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, double *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockDim,
-    bsrilu02Info_t info, int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      double *, const int *, const int *, int, bsrilu02Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDbsrilu02_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, blockDim, info, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCbsrilu02_bufferSize(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, cuComplex *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockDim,
-    bsrilu02Info_t info, int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      cuComplex *, const int *, const int *, int, bsrilu02Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCbsrilu02_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, blockDim, info, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZbsrilu02_bufferSize(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, cuDoubleComplex *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockDim,
-    bsrilu02Info_t info, int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      cuDoubleComplex *, const int *, const int *, int, bsrilu02Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZbsrilu02_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, blockDim, info, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSbsrilu02_bufferSizeExt(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, float *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockSize,
-    bsrilu02Info_t info, size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      float *, const int *, const int *, int, bsrilu02Info_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSbsrilu02_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, blockSize, info, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDbsrilu02_bufferSizeExt(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, double *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockSize,
-    bsrilu02Info_t info, size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      double *, const int *, const int *, int, bsrilu02Info_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDbsrilu02_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, blockSize, info, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCbsrilu02_bufferSizeExt(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, cuComplex *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockSize,
-    bsrilu02Info_t info, size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      cuComplex *, const int *, const int *, int, bsrilu02Info_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCbsrilu02_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, blockSize, info, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZbsrilu02_bufferSizeExt(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, cuDoubleComplex *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockSize,
-    bsrilu02Info_t info, size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      cuDoubleComplex *, const int *, const int *, int, bsrilu02Info_t,
-      size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZbsrilu02_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, blockSize, info, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSbsrilu02_analysis(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, float *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockDim,
-    bsrilu02Info_t info, cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      float *, const int *, const int *, int, bsrilu02Info_t,
-      cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSbsrilu02_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, blockDim, info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDbsrilu02_analysis(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, double *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockDim,
-    bsrilu02Info_t info, cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      double *, const int *, const int *, int, bsrilu02Info_t,
-      cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDbsrilu02_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, blockDim, info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCbsrilu02_analysis(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, cuComplex *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockDim,
-    bsrilu02Info_t info, cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      cuComplex *, const int *, const int *, int, bsrilu02Info_t,
-      cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCbsrilu02_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, blockDim, info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZbsrilu02_analysis(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, cuDoubleComplex *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockDim,
-    bsrilu02Info_t info, cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      cuDoubleComplex *, const int *, const int *, int, bsrilu02Info_t,
-      cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZbsrilu02_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, blockDim, info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSbsrilu02(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, float *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockDim,
-    bsrilu02Info_t info, cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      float *, const int *, const int *, int, bsrilu02Info_t,
-      cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSbsrilu02");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, blockDim, info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDbsrilu02(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, double *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockDim,
-    bsrilu02Info_t info, cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      double *, const int *, const int *, int, bsrilu02Info_t,
-      cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDbsrilu02");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, blockDim, info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCbsrilu02(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, cuComplex *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockDim,
-    bsrilu02Info_t info, cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      cuComplex *, const int *, const int *, int, bsrilu02Info_t,
-      cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCbsrilu02");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, blockDim, info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZbsrilu02(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, cuDoubleComplex *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockDim,
-    bsrilu02Info_t info, cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      cuDoubleComplex *, const int *, const int *, int, bsrilu02Info_t,
-      cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZbsrilu02");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, blockDim, info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseXcsric02_zeroPivot(cusparseHandle_t handle,
-                                                        csric02Info_t info,
-                                                        int *position) {
-  using FuncPtr =
-      cusparseStatus_t(CUSPARSEAPI *)(cusparseHandle_t, csric02Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseXcsric02_zeroPivot");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, info, position);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsric02_bufferSize(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    float *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, csric02Info_t info, int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, float *,
-      const int *, const int *, csric02Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsric02_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, info, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsric02_bufferSize(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    double *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, csric02Info_t info, int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, double *,
-      const int *, const int *, csric02Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsric02_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, info, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsric02_bufferSize(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    cuComplex *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, csric02Info_t info, int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, cuComplex *,
-      const int *, const int *, csric02Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsric02_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, info, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsric02_bufferSize(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    cuDoubleComplex *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, csric02Info_t info, int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, cuDoubleComplex *,
-      const int *, const int *, csric02Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsric02_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, info, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsric02_bufferSizeExt(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    float *csrSortedVal, const int *csrSortedRowPtr, const int *csrSortedColInd,
-    csric02Info_t info, size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, float *,
-      const int *, const int *, csric02Info_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsric02_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedVal, csrSortedRowPtr,
-                  csrSortedColInd, info, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsric02_bufferSizeExt(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    double *csrSortedVal, const int *csrSortedRowPtr,
-    const int *csrSortedColInd, csric02Info_t info, size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, double *,
-      const int *, const int *, csric02Info_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsric02_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedVal, csrSortedRowPtr,
-                  csrSortedColInd, info, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsric02_bufferSizeExt(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    cuComplex *csrSortedVal, const int *csrSortedRowPtr,
-    const int *csrSortedColInd, csric02Info_t info, size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, cuComplex *,
-      const int *, const int *, csric02Info_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsric02_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedVal, csrSortedRowPtr,
-                  csrSortedColInd, info, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsric02_bufferSizeExt(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    cuDoubleComplex *csrSortedVal, const int *csrSortedRowPtr,
-    const int *csrSortedColInd, csric02Info_t info, size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, cuDoubleComplex *,
-      const int *, const int *, csric02Info_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsric02_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedVal, csrSortedRowPtr,
-                  csrSortedColInd, info, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsric02_analysis(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    const float *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, csric02Info_t info,
-    cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, const float *,
-      const int *, const int *, csric02Info_t, cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsric02_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsric02_analysis(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    const double *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, csric02Info_t info,
-    cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, const double *,
-      const int *, const int *, csric02Info_t, cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsric02_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsric02_analysis(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    const cuComplex *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, csric02Info_t info,
-    cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, const cuComplex *,
-      const int *, const int *, csric02Info_t, cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsric02_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsric02_analysis(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    const cuDoubleComplex *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, csric02Info_t info,
-    cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t,
-      const cuDoubleComplex *, const int *, const int *, csric02Info_t,
-      cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsric02_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsric02(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    float *csrSortedValA_valM, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, csric02Info_t info,
-    cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, float *,
-      const int *, const int *, csric02Info_t, cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsric02");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedValA_valM, csrSortedRowPtrA,
-                  csrSortedColIndA, info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsric02(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    double *csrSortedValA_valM, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, csric02Info_t info,
-    cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, double *,
-      const int *, const int *, csric02Info_t, cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsric02");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedValA_valM, csrSortedRowPtrA,
-                  csrSortedColIndA, info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsric02(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    cuComplex *csrSortedValA_valM, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, csric02Info_t info,
-    cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, cuComplex *,
-      const int *, const int *, csric02Info_t, cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsric02");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedValA_valM, csrSortedRowPtrA,
-                  csrSortedColIndA, info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsric02(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    cuDoubleComplex *csrSortedValA_valM, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, csric02Info_t info,
-    cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, cuDoubleComplex *,
-      const int *, const int *, csric02Info_t, cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsric02");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedValA_valM, csrSortedRowPtrA,
-                  csrSortedColIndA, info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseXbsric02_zeroPivot(cusparseHandle_t handle,
-                                                        bsric02Info_t info,
-                                                        int *position) {
-  using FuncPtr =
-      cusparseStatus_t(CUSPARSEAPI *)(cusparseHandle_t, bsric02Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseXbsric02_zeroPivot");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, info, position);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSbsric02_bufferSize(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, float *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockDim,
-    bsric02Info_t info, int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      float *, const int *, const int *, int, bsric02Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSbsric02_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, blockDim, info, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDbsric02_bufferSize(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, double *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockDim,
-    bsric02Info_t info, int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      double *, const int *, const int *, int, bsric02Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDbsric02_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, blockDim, info, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCbsric02_bufferSize(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, cuComplex *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockDim,
-    bsric02Info_t info, int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      cuComplex *, const int *, const int *, int, bsric02Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCbsric02_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, blockDim, info, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZbsric02_bufferSize(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, cuDoubleComplex *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockDim,
-    bsric02Info_t info, int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      cuDoubleComplex *, const int *, const int *, int, bsric02Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZbsric02_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, blockDim, info, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSbsric02_bufferSizeExt(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, float *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockSize,
-    bsric02Info_t info, size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      float *, const int *, const int *, int, bsric02Info_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSbsric02_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, blockSize, info, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDbsric02_bufferSizeExt(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, double *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockSize,
-    bsric02Info_t info, size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      double *, const int *, const int *, int, bsric02Info_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDbsric02_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, blockSize, info, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCbsric02_bufferSizeExt(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, cuComplex *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockSize,
-    bsric02Info_t info, size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      cuComplex *, const int *, const int *, int, bsric02Info_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCbsric02_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, blockSize, info, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZbsric02_bufferSizeExt(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, cuDoubleComplex *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockSize,
-    bsric02Info_t info, size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      cuDoubleComplex *, const int *, const int *, int, bsric02Info_t,
-      size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZbsric02_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, blockSize, info, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSbsric02_analysis(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, const float *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockDim,
-    bsric02Info_t info, cusparseSolvePolicy_t policy, void *pInputBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const float *, const int *, const int *, int, bsric02Info_t,
-      cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSbsric02_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, blockDim, info, policy, pInputBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDbsric02_analysis(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, const double *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockDim,
-    bsric02Info_t info, cusparseSolvePolicy_t policy, void *pInputBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const double *, const int *, const int *, int, bsric02Info_t,
-      cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDbsric02_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, blockDim, info, policy, pInputBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCbsric02_analysis(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, const cuComplex *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockDim,
-    bsric02Info_t info, cusparseSolvePolicy_t policy, void *pInputBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const cuComplex *, const int *, const int *, int, bsric02Info_t,
-      cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCbsric02_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, blockDim, info, policy, pInputBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZbsric02_analysis(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, const cuDoubleComplex *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockDim,
-    bsric02Info_t info, cusparseSolvePolicy_t policy, void *pInputBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const cuDoubleComplex *, const int *, const int *, int, bsric02Info_t,
-      cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZbsric02_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, blockDim, info, policy, pInputBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSbsric02(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, float *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockDim,
-    bsric02Info_t info, cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      float *, const int *, const int *, int, bsric02Info_t,
-      cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSbsric02");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, blockDim, info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDbsric02(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, double *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockDim,
-    bsric02Info_t info, cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      double *, const int *, const int *, int, bsric02Info_t,
-      cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDbsric02");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, blockDim, info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCbsric02(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, cuComplex *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockDim,
-    bsric02Info_t info, cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      cuComplex *, const int *, const int *, int, bsric02Info_t,
-      cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCbsric02");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, blockDim, info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZbsric02(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, cuDoubleComplex *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockDim,
-    bsric02Info_t info, cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      cuDoubleComplex *, const int *, const int *, int, bsric02Info_t,
-      cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZbsric02");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, blockDim, info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSgtsv2_bufferSizeExt(
-    cusparseHandle_t handle, int m, int n, const float *dl, const float *d,
-    const float *du, const float *B, int ldb, size_t *bufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const float *, const float *, const float *,
-      const float *, int, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSgtsv2_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, dl, d, du, B, ldb, bufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDgtsv2_bufferSizeExt(
-    cusparseHandle_t handle, int m, int n, const double *dl, const double *d,
-    const double *du, const double *B, int ldb, size_t *bufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const double *, const double *,
-      const double *, const double *, int, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDgtsv2_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, dl, d, du, B, ldb, bufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCgtsv2_bufferSizeExt(
-    cusparseHandle_t handle, int m, int n, const cuComplex *dl,
-    const cuComplex *d, const cuComplex *du, const cuComplex *B, int ldb,
-    size_t *bufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cuComplex *, const cuComplex *,
-      const cuComplex *, const cuComplex *, int, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCgtsv2_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, dl, d, du, B, ldb, bufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZgtsv2_bufferSizeExt(
-    cusparseHandle_t handle, int m, int n, const cuDoubleComplex *dl,
-    const cuDoubleComplex *d, const cuDoubleComplex *du,
-    const cuDoubleComplex *B, int ldb, size_t *bufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cuDoubleComplex *,
-      const cuDoubleComplex *, const cuDoubleComplex *, const cuDoubleComplex *,
-      int, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZgtsv2_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, dl, d, du, B, ldb, bufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSgtsv2(cusparseHandle_t handle, int m,
-                                            int n, const float *dl,
-                                            const float *d, const float *du,
-                                            float *B, int ldb, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const float *, const float *, const float *,
-      float *, int, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSgtsv2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, dl, d, du, B, ldb, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDgtsv2(cusparseHandle_t handle, int m,
-                                            int n, const double *dl,
-                                            const double *d, const double *du,
-                                            double *B, int ldb, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const double *, const double *,
-      const double *, double *, int, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDgtsv2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, dl, d, du, B, ldb, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCgtsv2(cusparseHandle_t handle, int m,
-                                            int n, const cuComplex *dl,
-                                            const cuComplex *d,
-                                            const cuComplex *du, cuComplex *B,
-                                            int ldb, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cuComplex *, const cuComplex *,
-      const cuComplex *, cuComplex *, int, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCgtsv2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, dl, d, du, B, ldb, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZgtsv2(cusparseHandle_t handle, int m,
-                                            int n, const cuDoubleComplex *dl,
-                                            const cuDoubleComplex *d,
-                                            const cuDoubleComplex *du,
-                                            cuDoubleComplex *B, int ldb,
-                                            void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cuDoubleComplex *,
-      const cuDoubleComplex *, const cuDoubleComplex *, cuDoubleComplex *, int,
-      void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZgtsv2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, dl, d, du, B, ldb, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSgtsv2_nopivot_bufferSizeExt(
-    cusparseHandle_t handle, int m, int n, const float *dl, const float *d,
-    const float *du, const float *B, int ldb, size_t *bufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const float *, const float *, const float *,
-      const float *, int, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseSgtsv2_nopivot_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, dl, d, du, B, ldb, bufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDgtsv2_nopivot_bufferSizeExt(
-    cusparseHandle_t handle, int m, int n, const double *dl, const double *d,
-    const double *du, const double *B, int ldb, size_t *bufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const double *, const double *,
-      const double *, const double *, int, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseDgtsv2_nopivot_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, dl, d, du, B, ldb, bufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCgtsv2_nopivot_bufferSizeExt(
-    cusparseHandle_t handle, int m, int n, const cuComplex *dl,
-    const cuComplex *d, const cuComplex *du, const cuComplex *B, int ldb,
-    size_t *bufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cuComplex *, const cuComplex *,
-      const cuComplex *, const cuComplex *, int, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseCgtsv2_nopivot_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, dl, d, du, B, ldb, bufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZgtsv2_nopivot_bufferSizeExt(
-    cusparseHandle_t handle, int m, int n, const cuDoubleComplex *dl,
-    const cuDoubleComplex *d, const cuDoubleComplex *du,
-    const cuDoubleComplex *B, int ldb, size_t *bufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cuDoubleComplex *,
-      const cuDoubleComplex *, const cuDoubleComplex *, const cuDoubleComplex *,
-      int, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseZgtsv2_nopivot_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, dl, d, du, B, ldb, bufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSgtsv2_nopivot(
-    cusparseHandle_t handle, int m, int n, const float *dl, const float *d,
-    const float *du, float *B, int ldb, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const float *, const float *, const float *,
-      float *, int, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSgtsv2_nopivot");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, dl, d, du, B, ldb, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDgtsv2_nopivot(
-    cusparseHandle_t handle, int m, int n, const double *dl, const double *d,
-    const double *du, double *B, int ldb, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const double *, const double *,
-      const double *, double *, int, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDgtsv2_nopivot");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, dl, d, du, B, ldb, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCgtsv2_nopivot(
-    cusparseHandle_t handle, int m, int n, const cuComplex *dl,
-    const cuComplex *d, const cuComplex *du, cuComplex *B, int ldb,
-    void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cuComplex *, const cuComplex *,
-      const cuComplex *, cuComplex *, int, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCgtsv2_nopivot");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, dl, d, du, B, ldb, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZgtsv2_nopivot(
-    cusparseHandle_t handle, int m, int n, const cuDoubleComplex *dl,
-    const cuDoubleComplex *d, const cuDoubleComplex *du, cuDoubleComplex *B,
-    int ldb, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cuDoubleComplex *,
-      const cuDoubleComplex *, const cuDoubleComplex *, cuDoubleComplex *, int,
-      void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZgtsv2_nopivot");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, dl, d, du, B, ldb, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSgtsv2StridedBatch_bufferSizeExt(
-    cusparseHandle_t handle, int m, const float *dl, const float *d,
-    const float *du, const float *x, int batchCount, int batchStride,
-    size_t *bufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, const float *, const float *, const float *,
-      const float *, int, int, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseSgtsv2StridedBatch_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, dl, d, du, x, batchCount, batchStride,
-                  bufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDgtsv2StridedBatch_bufferSizeExt(
-    cusparseHandle_t handle, int m, const double *dl, const double *d,
-    const double *du, const double *x, int batchCount, int batchStride,
-    size_t *bufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, const double *, const double *, const double *,
-      const double *, int, int, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseDgtsv2StridedBatch_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, dl, d, du, x, batchCount, batchStride,
-                  bufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCgtsv2StridedBatch_bufferSizeExt(
-    cusparseHandle_t handle, int m, const cuComplex *dl, const cuComplex *d,
-    const cuComplex *du, const cuComplex *x, int batchCount, int batchStride,
-    size_t *bufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, const cuComplex *, const cuComplex *,
-      const cuComplex *, const cuComplex *, int, int, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseCgtsv2StridedBatch_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, dl, d, du, x, batchCount, batchStride,
-                  bufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZgtsv2StridedBatch_bufferSizeExt(
-    cusparseHandle_t handle, int m, const cuDoubleComplex *dl,
-    const cuDoubleComplex *d, const cuDoubleComplex *du,
-    const cuDoubleComplex *x, int batchCount, int batchStride,
-    size_t *bufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, const cuDoubleComplex *, const cuDoubleComplex *,
-      const cuDoubleComplex *, const cuDoubleComplex *, int, int, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseZgtsv2StridedBatch_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, dl, d, du, x, batchCount, batchStride,
-                  bufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSgtsv2StridedBatch(
-    cusparseHandle_t handle, int m, const float *dl, const float *d,
-    const float *du, float *x, int batchCount, int batchStride, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, const float *, const float *, const float *,
-      float *, int, int, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSgtsv2StridedBatch");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, dl, d, du, x, batchCount, batchStride, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseDgtsv2StridedBatch(cusparseHandle_t handle, int m, const double *dl,
-                           const double *d, const double *du, double *x,
-                           int batchCount, int batchStride, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, const double *, const double *, const double *,
-      double *, int, int, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDgtsv2StridedBatch");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, dl, d, du, x, batchCount, batchStride, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCgtsv2StridedBatch(
-    cusparseHandle_t handle, int m, const cuComplex *dl, const cuComplex *d,
-    const cuComplex *du, cuComplex *x, int batchCount, int batchStride,
-    void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, const cuComplex *, const cuComplex *,
-      const cuComplex *, cuComplex *, int, int, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCgtsv2StridedBatch");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, dl, d, du, x, batchCount, batchStride, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZgtsv2StridedBatch(
-    cusparseHandle_t handle, int m, const cuDoubleComplex *dl,
-    const cuDoubleComplex *d, const cuDoubleComplex *du, cuDoubleComplex *x,
-    int batchCount, int batchStride, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, const cuDoubleComplex *, const cuDoubleComplex *,
-      const cuDoubleComplex *, cuDoubleComplex *, int, int, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZgtsv2StridedBatch");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, dl, d, du, x, batchCount, batchStride, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSgtsvInterleavedBatch_bufferSizeExt(
-    cusparseHandle_t handle, int algo, int m, const float *dl, const float *d,
-    const float *du, const float *x, int batchCount,
-    size_t *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const float *, const float *, const float *,
-      const float *, int, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseSgtsvInterleavedBatch_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, algo, m, dl, d, du, x, batchCount,
-                  pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDgtsvInterleavedBatch_bufferSizeExt(
-    cusparseHandle_t handle, int algo, int m, const double *dl, const double *d,
-    const double *du, const double *x, int batchCount,
-    size_t *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const double *, const double *,
-      const double *, const double *, int, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseDgtsvInterleavedBatch_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, algo, m, dl, d, du, x, batchCount,
-                  pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCgtsvInterleavedBatch_bufferSizeExt(
-    cusparseHandle_t handle, int algo, int m, const cuComplex *dl,
-    const cuComplex *d, const cuComplex *du, const cuComplex *x, int batchCount,
-    size_t *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cuComplex *, const cuComplex *,
-      const cuComplex *, const cuComplex *, int, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseCgtsvInterleavedBatch_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, algo, m, dl, d, du, x, batchCount,
-                  pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZgtsvInterleavedBatch_bufferSizeExt(
-    cusparseHandle_t handle, int algo, int m, const cuDoubleComplex *dl,
-    const cuDoubleComplex *d, const cuDoubleComplex *du,
-    const cuDoubleComplex *x, int batchCount, size_t *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cuDoubleComplex *,
-      const cuDoubleComplex *, const cuDoubleComplex *, const cuDoubleComplex *,
-      int, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseZgtsvInterleavedBatch_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, algo, m, dl, d, du, x, batchCount,
-                  pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSgtsvInterleavedBatch(
-    cusparseHandle_t handle, int algo, int m, float *dl, float *d, float *du,
-    float *x, int batchCount, void *pBuffer) {
-  using FuncPtr =
-      cusparseStatus_t(CUSPARSEAPI *)(cusparseHandle_t, int, int, float *,
-                                      float *, float *, float *, int, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSgtsvInterleavedBatch");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, algo, m, dl, d, du, x, batchCount, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDgtsvInterleavedBatch(
-    cusparseHandle_t handle, int algo, int m, double *dl, double *d, double *du,
-    double *x, int batchCount, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(cusparseHandle_t, int, int,
-                                                  double *, double *, double *,
-                                                  double *, int, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDgtsvInterleavedBatch");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, algo, m, dl, d, du, x, batchCount, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCgtsvInterleavedBatch(
-    cusparseHandle_t handle, int algo, int m, cuComplex *dl, cuComplex *d,
-    cuComplex *du, cuComplex *x, int batchCount, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, cuComplex *, cuComplex *, cuComplex *,
-      cuComplex *, int, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCgtsvInterleavedBatch");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, algo, m, dl, d, du, x, batchCount, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZgtsvInterleavedBatch(
-    cusparseHandle_t handle, int algo, int m, cuDoubleComplex *dl,
-    cuDoubleComplex *d, cuDoubleComplex *du, cuDoubleComplex *x, int batchCount,
-    void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, cuDoubleComplex *, cuDoubleComplex *,
-      cuDoubleComplex *, cuDoubleComplex *, int, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZgtsvInterleavedBatch");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, algo, m, dl, d, du, x, batchCount, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSgpsvInterleavedBatch_bufferSizeExt(
-    cusparseHandle_t handle, int algo, int m, const float *ds, const float *dl,
-    const float *d, const float *du, const float *dw, const float *x,
-    int batchCount, size_t *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const float *, const float *, const float *,
-      const float *, const float *, const float *, int, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseSgpsvInterleavedBatch_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, algo, m, ds, dl, d, du, dw, x, batchCount,
-                  pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDgpsvInterleavedBatch_bufferSizeExt(
-    cusparseHandle_t handle, int algo, int m, const double *ds,
-    const double *dl, const double *d, const double *du, const double *dw,
-    const double *x, int batchCount, size_t *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const double *, const double *,
-      const double *, const double *, const double *, const double *, int,
-      size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseDgpsvInterleavedBatch_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, algo, m, ds, dl, d, du, dw, x, batchCount,
-                  pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCgpsvInterleavedBatch_bufferSizeExt(
-    cusparseHandle_t handle, int algo, int m, const cuComplex *ds,
-    const cuComplex *dl, const cuComplex *d, const cuComplex *du,
-    const cuComplex *dw, const cuComplex *x, int batchCount,
-    size_t *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cuComplex *, const cuComplex *,
-      const cuComplex *, const cuComplex *, const cuComplex *,
-      const cuComplex *, int, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseCgpsvInterleavedBatch_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, algo, m, ds, dl, d, du, dw, x, batchCount,
-                  pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZgpsvInterleavedBatch_bufferSizeExt(
-    cusparseHandle_t handle, int algo, int m, const cuDoubleComplex *ds,
-    const cuDoubleComplex *dl, const cuDoubleComplex *d,
-    const cuDoubleComplex *du, const cuDoubleComplex *dw,
-    const cuDoubleComplex *x, int batchCount, size_t *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cuDoubleComplex *,
-      const cuDoubleComplex *, const cuDoubleComplex *, const cuDoubleComplex *,
-      const cuDoubleComplex *, const cuDoubleComplex *, int, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseZgpsvInterleavedBatch_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, algo, m, ds, dl, d, du, dw, x, batchCount,
-                  pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSgpsvInterleavedBatch(
-    cusparseHandle_t handle, int algo, int m, float *ds, float *dl, float *d,
-    float *du, float *dw, float *x, int batchCount, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, float *, float *, float *, float *, float *,
-      float *, int, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSgpsvInterleavedBatch");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, algo, m, ds, dl, d, du, dw, x, batchCount, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDgpsvInterleavedBatch(
-    cusparseHandle_t handle, int algo, int m, double *ds, double *dl, double *d,
-    double *du, double *dw, double *x, int batchCount, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, double *, double *, double *, double *,
-      double *, double *, int, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDgpsvInterleavedBatch");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, algo, m, ds, dl, d, du, dw, x, batchCount, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCgpsvInterleavedBatch(
-    cusparseHandle_t handle, int algo, int m, cuComplex *ds, cuComplex *dl,
-    cuComplex *d, cuComplex *du, cuComplex *dw, cuComplex *x, int batchCount,
-    void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, cuComplex *, cuComplex *, cuComplex *,
-      cuComplex *, cuComplex *, cuComplex *, int, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCgpsvInterleavedBatch");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, algo, m, ds, dl, d, du, dw, x, batchCount, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZgpsvInterleavedBatch(
-    cusparseHandle_t handle, int algo, int m, cuDoubleComplex *ds,
-    cuDoubleComplex *dl, cuDoubleComplex *d, cuDoubleComplex *du,
-    cuDoubleComplex *dw, cuDoubleComplex *x, int batchCount, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, cuDoubleComplex *, cuDoubleComplex *,
-      cuDoubleComplex *, cuDoubleComplex *, cuDoubleComplex *,
-      cuDoubleComplex *, int, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZgpsvInterleavedBatch");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, algo, m, ds, dl, d, du, dw, x, batchCount, pBuffer);
-}
-
-CUSPARSE_DEPRECATED(cusparseSpGEMM)
-cusparseStatus_t CUSPARSEAPI cusparseCreateCsrgemm2Info(csrgemm2Info_t *info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(csrgemm2Info_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCreateCsrgemm2Info");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(info);
-}
-
-CUSPARSE_DEPRECATED(cusparseSpGEMM)
-cusparseStatus_t CUSPARSEAPI cusparseDestroyCsrgemm2Info(csrgemm2Info_t info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(csrgemm2Info_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDestroyCsrgemm2Info");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(info);
-}
-
-CUSPARSE_DEPRECATED(cusparseSpGEMM)
-cusparseStatus_t CUSPARSEAPI cusparseScsrgemm2_bufferSizeExt(
-    cusparseHandle_t handle, int m, int n, int k, const float *alpha,
-    const cusparseMatDescr_t descrA, int nnzA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, const cusparseMatDescr_t descrB, int nnzB,
-    const int *csrSortedRowPtrB, const int *csrSortedColIndB, const float *beta,
-    const cusparseMatDescr_t descrD, int nnzD, const int *csrSortedRowPtrD,
-    const int *csrSortedColIndD, csrgemm2Info_t info,
-    size_t *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const float *, const cusparseMatDescr_t,
-      int, const int *, const int *, const cusparseMatDescr_t, int, const int *,
-      const int *, const float *, const cusparseMatDescr_t, int, const int *,
-      const int *, csrgemm2Info_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsrgemm2_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, k, alpha, descrA, nnzA, csrSortedRowPtrA,
-                  csrSortedColIndA, descrB, nnzB, csrSortedRowPtrB,
-                  csrSortedColIndB, beta, descrD, nnzD, csrSortedRowPtrD,
-                  csrSortedColIndD, info, pBufferSizeInBytes);
-}
-
-CUSPARSE_DEPRECATED(cusparseSpGEMM)
-cusparseStatus_t CUSPARSEAPI cusparseDcsrgemm2_bufferSizeExt(
-    cusparseHandle_t handle, int m, int n, int k, const double *alpha,
-    const cusparseMatDescr_t descrA, int nnzA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, const cusparseMatDescr_t descrB, int nnzB,
-    const int *csrSortedRowPtrB, const int *csrSortedColIndB,
-    const double *beta, const cusparseMatDescr_t descrD, int nnzD,
-    const int *csrSortedRowPtrD, const int *csrSortedColIndD,
-    csrgemm2Info_t info, size_t *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const double *, const cusparseMatDescr_t,
-      int, const int *, const int *, const cusparseMatDescr_t, int, const int *,
-      const int *, const double *, const cusparseMatDescr_t, int, const int *,
-      const int *, csrgemm2Info_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsrgemm2_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, k, alpha, descrA, nnzA, csrSortedRowPtrA,
-                  csrSortedColIndA, descrB, nnzB, csrSortedRowPtrB,
-                  csrSortedColIndB, beta, descrD, nnzD, csrSortedRowPtrD,
-                  csrSortedColIndD, info, pBufferSizeInBytes);
-}
-
-CUSPARSE_DEPRECATED(cusparseSpGEMM)
-cusparseStatus_t CUSPARSEAPI cusparseCcsrgemm2_bufferSizeExt(
-    cusparseHandle_t handle, int m, int n, int k, const cuComplex *alpha,
-    const cusparseMatDescr_t descrA, int nnzA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, const cusparseMatDescr_t descrB, int nnzB,
-    const int *csrSortedRowPtrB, const int *csrSortedColIndB,
-    const cuComplex *beta, const cusparseMatDescr_t descrD, int nnzD,
-    const int *csrSortedRowPtrD, const int *csrSortedColIndD,
-    csrgemm2Info_t info, size_t *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const cuComplex *,
-      const cusparseMatDescr_t, int, const int *, const int *,
-      const cusparseMatDescr_t, int, const int *, const int *,
-      const cuComplex *, const cusparseMatDescr_t, int, const int *,
-      const int *, csrgemm2Info_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsrgemm2_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, k, alpha, descrA, nnzA, csrSortedRowPtrA,
-                  csrSortedColIndA, descrB, nnzB, csrSortedRowPtrB,
-                  csrSortedColIndB, beta, descrD, nnzD, csrSortedRowPtrD,
-                  csrSortedColIndD, info, pBufferSizeInBytes);
-}
-
-CUSPARSE_DEPRECATED(cusparseSpGEMM)
-cusparseStatus_t CUSPARSEAPI cusparseZcsrgemm2_bufferSizeExt(
-    cusparseHandle_t handle, int m, int n, int k, const cuDoubleComplex *alpha,
-    const cusparseMatDescr_t descrA, int nnzA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, const cusparseMatDescr_t descrB, int nnzB,
-    const int *csrSortedRowPtrB, const int *csrSortedColIndB,
-    const cuDoubleComplex *beta, const cusparseMatDescr_t descrD, int nnzD,
-    const int *csrSortedRowPtrD, const int *csrSortedColIndD,
-    csrgemm2Info_t info, size_t *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const cuDoubleComplex *,
-      const cusparseMatDescr_t, int, const int *, const int *,
-      const cusparseMatDescr_t, int, const int *, const int *,
-      const cuDoubleComplex *, const cusparseMatDescr_t, int, const int *,
-      const int *, csrgemm2Info_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsrgemm2_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, k, alpha, descrA, nnzA, csrSortedRowPtrA,
-                  csrSortedColIndA, descrB, nnzB, csrSortedRowPtrB,
-                  csrSortedColIndB, beta, descrD, nnzD, csrSortedRowPtrD,
-                  csrSortedColIndD, info, pBufferSizeInBytes);
-}
-
-CUSPARSE_DEPRECATED(cusparseSpGEMM)
-cusparseStatus_t CUSPARSEAPI cusparseXcsrgemm2Nnz(
-    cusparseHandle_t handle, int m, int n, int k,
-    const cusparseMatDescr_t descrA, int nnzA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, const cusparseMatDescr_t descrB, int nnzB,
-    const int *csrSortedRowPtrB, const int *csrSortedColIndB,
-    const cusparseMatDescr_t descrD, int nnzD, const int *csrSortedRowPtrD,
-    const int *csrSortedColIndD, const cusparseMatDescr_t descrC,
-    int *csrSortedRowPtrC, int *nnzTotalDevHostPtr, const csrgemm2Info_t info,
-    void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const cusparseMatDescr_t, int,
-      const int *, const int *, const cusparseMatDescr_t, int, const int *,
-      const int *, const cusparseMatDescr_t, int, const int *, const int *,
-      const cusparseMatDescr_t, int *, int *, const csrgemm2Info_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseXcsrgemm2Nnz");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, k, descrA, nnzA, csrSortedRowPtrA,
-                  csrSortedColIndA, descrB, nnzB, csrSortedRowPtrB,
-                  csrSortedColIndB, descrD, nnzD, csrSortedRowPtrD,
-                  csrSortedColIndD, descrC, csrSortedRowPtrC,
-                  nnzTotalDevHostPtr, info, pBuffer);
-}
-
-CUSPARSE_DEPRECATED(cusparseSpGEMM)
-cusparseStatus_t CUSPARSEAPI cusparseScsrgemm2(
-    cusparseHandle_t handle, int m, int n, int k, const float *alpha,
-    const cusparseMatDescr_t descrA, int nnzA, const float *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-    const cusparseMatDescr_t descrB, int nnzB, const float *csrSortedValB,
-    const int *csrSortedRowPtrB, const int *csrSortedColIndB, const float *beta,
-    const cusparseMatDescr_t descrD, int nnzD, const float *csrSortedValD,
-    const int *csrSortedRowPtrD, const int *csrSortedColIndD,
-    const cusparseMatDescr_t descrC, float *csrSortedValC,
-    const int *csrSortedRowPtrC, int *csrSortedColIndC,
-    const csrgemm2Info_t info, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const float *, const cusparseMatDescr_t,
-      int, const float *, const int *, const int *, const cusparseMatDescr_t,
-      int, const float *, const int *, const int *, const float *,
-      const cusparseMatDescr_t, int, const float *, const int *, const int *,
-      const cusparseMatDescr_t, float *, const int *, int *,
-      const csrgemm2Info_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsrgemm2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, k, alpha, descrA, nnzA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, descrB, nnzB,
-                  csrSortedValB, csrSortedRowPtrB, csrSortedColIndB, beta,
-                  descrD, nnzD, csrSortedValD, csrSortedRowPtrD,
-                  csrSortedColIndD, descrC, csrSortedValC, csrSortedRowPtrC,
-                  csrSortedColIndC, info, pBuffer);
-}
-
-CUSPARSE_DEPRECATED(cusparseSpGEMM)
-cusparseStatus_t CUSPARSEAPI cusparseDcsrgemm2(
-    cusparseHandle_t handle, int m, int n, int k, const double *alpha,
-    const cusparseMatDescr_t descrA, int nnzA, const double *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-    const cusparseMatDescr_t descrB, int nnzB, const double *csrSortedValB,
-    const int *csrSortedRowPtrB, const int *csrSortedColIndB,
-    const double *beta, const cusparseMatDescr_t descrD, int nnzD,
-    const double *csrSortedValD, const int *csrSortedRowPtrD,
-    const int *csrSortedColIndD, const cusparseMatDescr_t descrC,
-    double *csrSortedValC, const int *csrSortedRowPtrC, int *csrSortedColIndC,
-    const csrgemm2Info_t info, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const double *, const cusparseMatDescr_t,
-      int, const double *, const int *, const int *, const cusparseMatDescr_t,
-      int, const double *, const int *, const int *, const double *,
-      const cusparseMatDescr_t, int, const double *, const int *, const int *,
-      const cusparseMatDescr_t, double *, const int *, int *,
-      const csrgemm2Info_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsrgemm2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, k, alpha, descrA, nnzA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, descrB, nnzB,
-                  csrSortedValB, csrSortedRowPtrB, csrSortedColIndB, beta,
-                  descrD, nnzD, csrSortedValD, csrSortedRowPtrD,
-                  csrSortedColIndD, descrC, csrSortedValC, csrSortedRowPtrC,
-                  csrSortedColIndC, info, pBuffer);
-}
-
-CUSPARSE_DEPRECATED(cusparseSpGEMM)
-cusparseStatus_t CUSPARSEAPI cusparseCcsrgemm2(
-    cusparseHandle_t handle, int m, int n, int k, const cuComplex *alpha,
-    const cusparseMatDescr_t descrA, int nnzA, const cuComplex *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-    const cusparseMatDescr_t descrB, int nnzB, const cuComplex *csrSortedValB,
-    const int *csrSortedRowPtrB, const int *csrSortedColIndB,
-    const cuComplex *beta, const cusparseMatDescr_t descrD, int nnzD,
-    const cuComplex *csrSortedValD, const int *csrSortedRowPtrD,
-    const int *csrSortedColIndD, const cusparseMatDescr_t descrC,
-    cuComplex *csrSortedValC, const int *csrSortedRowPtrC,
-    int *csrSortedColIndC, const csrgemm2Info_t info, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const cuComplex *,
-      const cusparseMatDescr_t, int, const cuComplex *, const int *,
-      const int *, const cusparseMatDescr_t, int, const cuComplex *,
-      const int *, const int *, const cuComplex *, const cusparseMatDescr_t,
-      int, const cuComplex *, const int *, const int *,
-      const cusparseMatDescr_t, cuComplex *, const int *, int *,
-      const csrgemm2Info_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsrgemm2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, k, alpha, descrA, nnzA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, descrB, nnzB,
-                  csrSortedValB, csrSortedRowPtrB, csrSortedColIndB, beta,
-                  descrD, nnzD, csrSortedValD, csrSortedRowPtrD,
-                  csrSortedColIndD, descrC, csrSortedValC, csrSortedRowPtrC,
-                  csrSortedColIndC, info, pBuffer);
-}
-
-CUSPARSE_DEPRECATED(cusparseSpGEMM)
-cusparseStatus_t CUSPARSEAPI cusparseZcsrgemm2(
-    cusparseHandle_t handle, int m, int n, int k, const cuDoubleComplex *alpha,
-    const cusparseMatDescr_t descrA, int nnzA,
-    const cuDoubleComplex *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, const cusparseMatDescr_t descrB, int nnzB,
-    const cuDoubleComplex *csrSortedValB, const int *csrSortedRowPtrB,
-    const int *csrSortedColIndB, const cuDoubleComplex *beta,
-    const cusparseMatDescr_t descrD, int nnzD,
-    const cuDoubleComplex *csrSortedValD, const int *csrSortedRowPtrD,
-    const int *csrSortedColIndD, const cusparseMatDescr_t descrC,
-    cuDoubleComplex *csrSortedValC, const int *csrSortedRowPtrC,
-    int *csrSortedColIndC, const csrgemm2Info_t info, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const cuDoubleComplex *,
-      const cusparseMatDescr_t, int, const cuDoubleComplex *, const int *,
-      const int *, const cusparseMatDescr_t, int, const cuDoubleComplex *,
-      const int *, const int *, const cuDoubleComplex *,
-      const cusparseMatDescr_t, int, const cuDoubleComplex *, const int *,
-      const int *, const cusparseMatDescr_t, cuDoubleComplex *, const int *,
-      int *, const csrgemm2Info_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsrgemm2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, k, alpha, descrA, nnzA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, descrB, nnzB,
-                  csrSortedValB, csrSortedRowPtrB, csrSortedColIndB, beta,
-                  descrD, nnzD, csrSortedValD, csrSortedRowPtrD,
-                  csrSortedColIndD, descrC, csrSortedValC, csrSortedRowPtrC,
-                  csrSortedColIndC, info, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsrgeam2_bufferSizeExt(
-    cusparseHandle_t handle, int m, int n, const float *alpha,
-    const cusparseMatDescr_t descrA, int nnzA, const float *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA, const float *beta,
-    const cusparseMatDescr_t descrB, int nnzB, const float *csrSortedValB,
-    const int *csrSortedRowPtrB, const int *csrSortedColIndB,
-    const cusparseMatDescr_t descrC, const float *csrSortedValC,
-    const int *csrSortedRowPtrC, const int *csrSortedColIndC,
-    size_t *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const float *, const cusparseMatDescr_t, int,
-      const float *, const int *, const int *, const float *,
-      const cusparseMatDescr_t, int, const float *, const int *, const int *,
-      const cusparseMatDescr_t, const float *, const int *, const int *,
-      size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsrgeam2_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, alpha, descrA, nnzA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, beta, descrB, nnzB,
-                  csrSortedValB, csrSortedRowPtrB, csrSortedColIndB, descrC,
-                  csrSortedValC, csrSortedRowPtrC, csrSortedColIndC,
-                  pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsrgeam2_bufferSizeExt(
-    cusparseHandle_t handle, int m, int n, const double *alpha,
-    const cusparseMatDescr_t descrA, int nnzA, const double *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-    const double *beta, const cusparseMatDescr_t descrB, int nnzB,
-    const double *csrSortedValB, const int *csrSortedRowPtrB,
-    const int *csrSortedColIndB, const cusparseMatDescr_t descrC,
-    const double *csrSortedValC, const int *csrSortedRowPtrC,
-    const int *csrSortedColIndC, size_t *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const double *, const cusparseMatDescr_t, int,
-      const double *, const int *, const int *, const double *,
-      const cusparseMatDescr_t, int, const double *, const int *, const int *,
-      const cusparseMatDescr_t, const double *, const int *, const int *,
-      size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsrgeam2_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, alpha, descrA, nnzA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, beta, descrB, nnzB,
-                  csrSortedValB, csrSortedRowPtrB, csrSortedColIndB, descrC,
-                  csrSortedValC, csrSortedRowPtrC, csrSortedColIndC,
-                  pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsrgeam2_bufferSizeExt(
-    cusparseHandle_t handle, int m, int n, const cuComplex *alpha,
-    const cusparseMatDescr_t descrA, int nnzA, const cuComplex *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-    const cuComplex *beta, const cusparseMatDescr_t descrB, int nnzB,
-    const cuComplex *csrSortedValB, const int *csrSortedRowPtrB,
-    const int *csrSortedColIndB, const cusparseMatDescr_t descrC,
-    const cuComplex *csrSortedValC, const int *csrSortedRowPtrC,
-    const int *csrSortedColIndC, size_t *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cuComplex *, const cusparseMatDescr_t,
-      int, const cuComplex *, const int *, const int *, const cuComplex *,
-      const cusparseMatDescr_t, int, const cuComplex *, const int *,
-      const int *, const cusparseMatDescr_t, const cuComplex *, const int *,
-      const int *, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsrgeam2_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, alpha, descrA, nnzA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, beta, descrB, nnzB,
-                  csrSortedValB, csrSortedRowPtrB, csrSortedColIndB, descrC,
-                  csrSortedValC, csrSortedRowPtrC, csrSortedColIndC,
-                  pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsrgeam2_bufferSizeExt(
-    cusparseHandle_t handle, int m, int n, const cuDoubleComplex *alpha,
-    const cusparseMatDescr_t descrA, int nnzA,
-    const cuDoubleComplex *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, const cuDoubleComplex *beta,
-    const cusparseMatDescr_t descrB, int nnzB,
-    const cuDoubleComplex *csrSortedValB, const int *csrSortedRowPtrB,
-    const int *csrSortedColIndB, const cusparseMatDescr_t descrC,
-    const cuDoubleComplex *csrSortedValC, const int *csrSortedRowPtrC,
-    const int *csrSortedColIndC, size_t *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cuDoubleComplex *,
-      const cusparseMatDescr_t, int, const cuDoubleComplex *, const int *,
-      const int *, const cuDoubleComplex *, const cusparseMatDescr_t, int,
-      const cuDoubleComplex *, const int *, const int *,
-      const cusparseMatDescr_t, const cuDoubleComplex *, const int *,
-      const int *, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsrgeam2_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, alpha, descrA, nnzA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, beta, descrB, nnzB,
-                  csrSortedValB, csrSortedRowPtrB, csrSortedColIndB, descrC,
-                  csrSortedValC, csrSortedRowPtrC, csrSortedColIndC,
-                  pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseXcsrgeam2Nnz(
-    cusparseHandle_t handle, int m, int n, const cusparseMatDescr_t descrA,
-    int nnzA, const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-    const cusparseMatDescr_t descrB, int nnzB, const int *csrSortedRowPtrB,
-    const int *csrSortedColIndB, const cusparseMatDescr_t descrC,
-    int *csrSortedRowPtrC, int *nnzTotalDevHostPtr, void *workspace) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, int, const int *,
-      const int *, const cusparseMatDescr_t, int, const int *, const int *,
-      const cusparseMatDescr_t, int *, int *, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseXcsrgeam2Nnz");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, descrA, nnzA, csrSortedRowPtrA,
-                  csrSortedColIndA, descrB, nnzB, csrSortedRowPtrB,
-                  csrSortedColIndB, descrC, csrSortedRowPtrC,
-                  nnzTotalDevHostPtr, workspace);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsrgeam2(
-    cusparseHandle_t handle, int m, int n, const float *alpha,
-    const cusparseMatDescr_t descrA, int nnzA, const float *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA, const float *beta,
-    const cusparseMatDescr_t descrB, int nnzB, const float *csrSortedValB,
-    const int *csrSortedRowPtrB, const int *csrSortedColIndB,
-    const cusparseMatDescr_t descrC, float *csrSortedValC,
-    int *csrSortedRowPtrC, int *csrSortedColIndC, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const float *, const cusparseMatDescr_t, int,
-      const float *, const int *, const int *, const float *,
-      const cusparseMatDescr_t, int, const float *, const int *, const int *,
-      const cusparseMatDescr_t, float *, int *, int *, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsrgeam2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, alpha, descrA, nnzA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, beta, descrB, nnzB,
-                  csrSortedValB, csrSortedRowPtrB, csrSortedColIndB, descrC,
-                  csrSortedValC, csrSortedRowPtrC, csrSortedColIndC, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsrgeam2(
-    cusparseHandle_t handle, int m, int n, const double *alpha,
-    const cusparseMatDescr_t descrA, int nnzA, const double *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-    const double *beta, const cusparseMatDescr_t descrB, int nnzB,
-    const double *csrSortedValB, const int *csrSortedRowPtrB,
-    const int *csrSortedColIndB, const cusparseMatDescr_t descrC,
-    double *csrSortedValC, int *csrSortedRowPtrC, int *csrSortedColIndC,
-    void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const double *, const cusparseMatDescr_t, int,
-      const double *, const int *, const int *, const double *,
-      const cusparseMatDescr_t, int, const double *, const int *, const int *,
-      const cusparseMatDescr_t, double *, int *, int *, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsrgeam2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, alpha, descrA, nnzA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, beta, descrB, nnzB,
-                  csrSortedValB, csrSortedRowPtrB, csrSortedColIndB, descrC,
-                  csrSortedValC, csrSortedRowPtrC, csrSortedColIndC, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsrgeam2(
-    cusparseHandle_t handle, int m, int n, const cuComplex *alpha,
-    const cusparseMatDescr_t descrA, int nnzA, const cuComplex *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-    const cuComplex *beta, const cusparseMatDescr_t descrB, int nnzB,
-    const cuComplex *csrSortedValB, const int *csrSortedRowPtrB,
-    const int *csrSortedColIndB, const cusparseMatDescr_t descrC,
-    cuComplex *csrSortedValC, int *csrSortedRowPtrC, int *csrSortedColIndC,
-    void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cuComplex *, const cusparseMatDescr_t,
-      int, const cuComplex *, const int *, const int *, const cuComplex *,
-      const cusparseMatDescr_t, int, const cuComplex *, const int *,
-      const int *, const cusparseMatDescr_t, cuComplex *, int *, int *, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsrgeam2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, alpha, descrA, nnzA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, beta, descrB, nnzB,
-                  csrSortedValB, csrSortedRowPtrB, csrSortedColIndB, descrC,
-                  csrSortedValC, csrSortedRowPtrC, csrSortedColIndC, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsrgeam2(
-    cusparseHandle_t handle, int m, int n, const cuDoubleComplex *alpha,
-    const cusparseMatDescr_t descrA, int nnzA,
-    const cuDoubleComplex *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, const cuDoubleComplex *beta,
-    const cusparseMatDescr_t descrB, int nnzB,
-    const cuDoubleComplex *csrSortedValB, const int *csrSortedRowPtrB,
-    const int *csrSortedColIndB, const cusparseMatDescr_t descrC,
-    cuDoubleComplex *csrSortedValC, int *csrSortedRowPtrC,
-    int *csrSortedColIndC, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cuDoubleComplex *,
-      const cusparseMatDescr_t, int, const cuDoubleComplex *, const int *,
-      const int *, const cuDoubleComplex *, const cusparseMatDescr_t, int,
-      const cuDoubleComplex *, const int *, const int *,
-      const cusparseMatDescr_t, cuDoubleComplex *, int *, int *, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsrgeam2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, alpha, descrA, nnzA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, beta, descrB, nnzB,
-                  csrSortedValB, csrSortedRowPtrB, csrSortedColIndB, descrC,
-                  csrSortedValC, csrSortedRowPtrC, csrSortedColIndC, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsrcolor(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    const float *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, const float *fractionToColor, int *ncolors,
-    int *coloring, int *reordering, const cusparseColorInfo_t info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, const float *,
-      const int *, const int *, const float *, int *, int *, int *,
-      const cusparseColorInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsrcolor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, fractionToColor, ncolors, coloring,
-                  reordering, info);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsrcolor(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    const double *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, const double *fractionToColor, int *ncolors,
-    int *coloring, int *reordering, const cusparseColorInfo_t info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, const double *,
-      const int *, const int *, const double *, int *, int *, int *,
-      const cusparseColorInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsrcolor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, fractionToColor, ncolors, coloring,
-                  reordering, info);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsrcolor(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    const cuComplex *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, const float *fractionToColor, int *ncolors,
-    int *coloring, int *reordering, const cusparseColorInfo_t info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, const cuComplex *,
-      const int *, const int *, const float *, int *, int *, int *,
-      const cusparseColorInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsrcolor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, fractionToColor, ncolors, coloring,
-                  reordering, info);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsrcolor(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    const cuDoubleComplex *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, const double *fractionToColor, int *ncolors,
-    int *coloring, int *reordering, const cusparseColorInfo_t info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t,
-      const cuDoubleComplex *, const int *, const int *, const double *, int *,
-      int *, int *, const cusparseColorInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsrcolor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, fractionToColor, ncolors, coloring,
-                  reordering, info);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseSnnz(cusparseHandle_t handle, cusparseDirection_t dirA, int m, int n,
-             const cusparseMatDescr_t descrA, const float *A, int lda,
-             int *nnzPerRowCol, int *nnzTotalDevHostPtr) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const float *, int, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSnnz");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, m, n, descrA, A, lda, nnzPerRowCol,
-                  nnzTotalDevHostPtr);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseDnnz(cusparseHandle_t handle, cusparseDirection_t dirA, int m, int n,
-             const cusparseMatDescr_t descrA, const double *A, int lda,
-             int *nnzPerRowCol, int *nnzTotalDevHostPtr) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const double *, int, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDnnz");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, m, n, descrA, A, lda, nnzPerRowCol,
-                  nnzTotalDevHostPtr);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseCnnz(cusparseHandle_t handle, cusparseDirection_t dirA, int m, int n,
-             const cusparseMatDescr_t descrA, const cuComplex *A, int lda,
-             int *nnzPerRowCol, int *nnzTotalDevHostPtr) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const cuComplex *, int, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCnnz");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, m, n, descrA, A, lda, nnzPerRowCol,
-                  nnzTotalDevHostPtr);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseZnnz(cusparseHandle_t handle, cusparseDirection_t dirA, int m, int n,
-             const cusparseMatDescr_t descrA, const cuDoubleComplex *A, int lda,
-             int *nnzPerRowCol, int *nnzTotalDevHostPtr) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const cuDoubleComplex *, int, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZnnz");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, m, n, descrA, A, lda, nnzPerRowCol,
-                  nnzTotalDevHostPtr);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSnnz_compress(
-    cusparseHandle_t handle, int m, const cusparseMatDescr_t descr,
-    const float *csrSortedValA, const int *csrSortedRowPtrA, int *nnzPerRow,
-    int *nnzC, float tol) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, const cusparseMatDescr_t, const float *,
-      const int *, int *, int *, float);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSnnz_compress");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, descr, csrSortedValA, csrSortedRowPtrA, nnzPerRow,
-                  nnzC, tol);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDnnz_compress(
-    cusparseHandle_t handle, int m, const cusparseMatDescr_t descr,
-    const double *csrSortedValA, const int *csrSortedRowPtrA, int *nnzPerRow,
-    int *nnzC, double tol) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, const cusparseMatDescr_t, const double *,
-      const int *, int *, int *, double);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDnnz_compress");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, descr, csrSortedValA, csrSortedRowPtrA, nnzPerRow,
-                  nnzC, tol);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCnnz_compress(
-    cusparseHandle_t handle, int m, const cusparseMatDescr_t descr,
-    const cuComplex *csrSortedValA, const int *csrSortedRowPtrA, int *nnzPerRow,
-    int *nnzC, cuComplex tol) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, const cusparseMatDescr_t, const cuComplex *,
-      const int *, int *, int *, cuComplex);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCnnz_compress");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, descr, csrSortedValA, csrSortedRowPtrA, nnzPerRow,
-                  nnzC, tol);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZnnz_compress(
-    cusparseHandle_t handle, int m, const cusparseMatDescr_t descr,
-    const cuDoubleComplex *csrSortedValA, const int *csrSortedRowPtrA,
-    int *nnzPerRow, int *nnzC, cuDoubleComplex tol) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, const cusparseMatDescr_t, const cuDoubleComplex *,
-      const int *, int *, int *, cuDoubleComplex);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZnnz_compress");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, descr, csrSortedValA, csrSortedRowPtrA, nnzPerRow,
-                  nnzC, tol);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsr2csr_compress(
-    cusparseHandle_t handle, int m, int n, const cusparseMatDescr_t descrA,
-    const float *csrSortedValA, const int *csrSortedColIndA,
-    const int *csrSortedRowPtrA, int nnzA, const int *nnzPerRow,
-    float *csrSortedValC, int *csrSortedColIndC, int *csrSortedRowPtrC,
-    float tol) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, const float *,
-      const int *, const int *, int, const int *, float *, int *, int *, float);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsr2csr_compress");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, descrA, csrSortedValA, csrSortedColIndA,
-                  csrSortedRowPtrA, nnzA, nnzPerRow, csrSortedValC,
-                  csrSortedColIndC, csrSortedRowPtrC, tol);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsr2csr_compress(
-    cusparseHandle_t handle, int m, int n, const cusparseMatDescr_t descrA,
-    const double *csrSortedValA, const int *csrSortedColIndA,
-    const int *csrSortedRowPtrA, int nnzA, const int *nnzPerRow,
-    double *csrSortedValC, int *csrSortedColIndC, int *csrSortedRowPtrC,
-    double tol) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, const double *,
-      const int *, const int *, int, const int *, double *, int *, int *,
-      double);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsr2csr_compress");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, descrA, csrSortedValA, csrSortedColIndA,
-                  csrSortedRowPtrA, nnzA, nnzPerRow, csrSortedValC,
-                  csrSortedColIndC, csrSortedRowPtrC, tol);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsr2csr_compress(
-    cusparseHandle_t handle, int m, int n, const cusparseMatDescr_t descrA,
-    const cuComplex *csrSortedValA, const int *csrSortedColIndA,
-    const int *csrSortedRowPtrA, int nnzA, const int *nnzPerRow,
-    cuComplex *csrSortedValC, int *csrSortedColIndC, int *csrSortedRowPtrC,
-    cuComplex tol) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, const cuComplex *,
-      const int *, const int *, int, const int *, cuComplex *, int *, int *,
-      cuComplex);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsr2csr_compress");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, descrA, csrSortedValA, csrSortedColIndA,
-                  csrSortedRowPtrA, nnzA, nnzPerRow, csrSortedValC,
-                  csrSortedColIndC, csrSortedRowPtrC, tol);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsr2csr_compress(
-    cusparseHandle_t handle, int m, int n, const cusparseMatDescr_t descrA,
-    const cuDoubleComplex *csrSortedValA, const int *csrSortedColIndA,
-    const int *csrSortedRowPtrA, int nnzA, const int *nnzPerRow,
-    cuDoubleComplex *csrSortedValC, int *csrSortedColIndC,
-    int *csrSortedRowPtrC, cuDoubleComplex tol) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t,
-      const cuDoubleComplex *, const int *, const int *, int, const int *,
-      cuDoubleComplex *, int *, int *, cuDoubleComplex);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsr2csr_compress");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, descrA, csrSortedValA, csrSortedColIndA,
-                  csrSortedRowPtrA, nnzA, nnzPerRow, csrSortedValC,
-                  csrSortedColIndC, csrSortedRowPtrC, tol);
-}
-
-CUSPARSE_DEPRECATED(cusparseDenseToSparse)
-cusparseStatus_t CUSPARSEAPI cusparseSdense2csr(
-    cusparseHandle_t handle, int m, int n, const cusparseMatDescr_t descrA,
-    const float *A, int lda, const int *nnzPerRow, float *csrSortedValA,
-    int *csrSortedRowPtrA, int *csrSortedColIndA) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, const float *, int,
-      const int *, float *, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSdense2csr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, descrA, A, lda, nnzPerRow, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA);
-}
-
-CUSPARSE_DEPRECATED(cusparseDenseToSparse)
-cusparseStatus_t CUSPARSEAPI cusparseDdense2csr(
-    cusparseHandle_t handle, int m, int n, const cusparseMatDescr_t descrA,
-    const double *A, int lda, const int *nnzPerRow, double *csrSortedValA,
-    int *csrSortedRowPtrA, int *csrSortedColIndA) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, const double *, int,
-      const int *, double *, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDdense2csr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, descrA, A, lda, nnzPerRow, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA);
-}
-
-CUSPARSE_DEPRECATED(cusparseDenseToSparse)
-cusparseStatus_t CUSPARSEAPI cusparseCdense2csr(
-    cusparseHandle_t handle, int m, int n, const cusparseMatDescr_t descrA,
-    const cuComplex *A, int lda, const int *nnzPerRow, cuComplex *csrSortedValA,
-    int *csrSortedRowPtrA, int *csrSortedColIndA) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, const cuComplex *,
-      int, const int *, cuComplex *, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCdense2csr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, descrA, A, lda, nnzPerRow, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA);
-}
-
-CUSPARSE_DEPRECATED(cusparseDenseToSparse)
-cusparseStatus_t CUSPARSEAPI cusparseZdense2csr(
-    cusparseHandle_t handle, int m, int n, const cusparseMatDescr_t descrA,
-    const cuDoubleComplex *A, int lda, const int *nnzPerRow,
-    cuDoubleComplex *csrSortedValA, int *csrSortedRowPtrA,
-    int *csrSortedColIndA) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t,
-      const cuDoubleComplex *, int, const int *, cuDoubleComplex *, int *,
-      int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZdense2csr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, descrA, A, lda, nnzPerRow, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA);
-}
-
-CUSPARSE_DEPRECATED(cusparseSparseToDense)
-cusparseStatus_t CUSPARSEAPI cusparseScsr2dense(
-    cusparseHandle_t handle, int m, int n, const cusparseMatDescr_t descrA,
-    const float *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, float *A, int lda) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, const float *,
-      const int *, const int *, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsr2dense");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, A, lda);
-}
-
-CUSPARSE_DEPRECATED(cusparseSparseToDense)
-cusparseStatus_t CUSPARSEAPI cusparseDcsr2dense(
-    cusparseHandle_t handle, int m, int n, const cusparseMatDescr_t descrA,
-    const double *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, double *A, int lda) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, const double *,
-      const int *, const int *, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsr2dense");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, A, lda);
-}
-
-CUSPARSE_DEPRECATED(cusparseSparseToDense)
-cusparseStatus_t CUSPARSEAPI cusparseCcsr2dense(
-    cusparseHandle_t handle, int m, int n, const cusparseMatDescr_t descrA,
-    const cuComplex *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, cuComplex *A, int lda) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, const cuComplex *,
-      const int *, const int *, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsr2dense");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, A, lda);
-}
-
-CUSPARSE_DEPRECATED(cusparseSparseToDense)
-cusparseStatus_t CUSPARSEAPI cusparseZcsr2dense(
-    cusparseHandle_t handle, int m, int n, const cusparseMatDescr_t descrA,
-    const cuDoubleComplex *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, cuDoubleComplex *A, int lda) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t,
-      const cuDoubleComplex *, const int *, const int *, cuDoubleComplex *,
-      int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsr2dense");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, A, lda);
-}
-
-CUSPARSE_DEPRECATED(cusparseDenseToSparse)
-cusparseStatus_t CUSPARSEAPI cusparseSdense2csc(
-    cusparseHandle_t handle, int m, int n, const cusparseMatDescr_t descrA,
-    const float *A, int lda, const int *nnzPerCol, float *cscSortedValA,
-    int *cscSortedRowIndA, int *cscSortedColPtrA) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, const float *, int,
-      const int *, float *, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSdense2csc");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, descrA, A, lda, nnzPerCol, cscSortedValA,
-                  cscSortedRowIndA, cscSortedColPtrA);
-}
-
-CUSPARSE_DEPRECATED(cusparseDenseToSparse)
-cusparseStatus_t CUSPARSEAPI cusparseDdense2csc(
-    cusparseHandle_t handle, int m, int n, const cusparseMatDescr_t descrA,
-    const double *A, int lda, const int *nnzPerCol, double *cscSortedValA,
-    int *cscSortedRowIndA, int *cscSortedColPtrA) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, const double *, int,
-      const int *, double *, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDdense2csc");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, descrA, A, lda, nnzPerCol, cscSortedValA,
-                  cscSortedRowIndA, cscSortedColPtrA);
-}
-
-CUSPARSE_DEPRECATED(cusparseDenseToSparse)
-cusparseStatus_t CUSPARSEAPI cusparseCdense2csc(
-    cusparseHandle_t handle, int m, int n, const cusparseMatDescr_t descrA,
-    const cuComplex *A, int lda, const int *nnzPerCol, cuComplex *cscSortedValA,
-    int *cscSortedRowIndA, int *cscSortedColPtrA) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, const cuComplex *,
-      int, const int *, cuComplex *, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCdense2csc");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, descrA, A, lda, nnzPerCol, cscSortedValA,
-                  cscSortedRowIndA, cscSortedColPtrA);
-}
-
-CUSPARSE_DEPRECATED(cusparseDenseToSparse)
-cusparseStatus_t CUSPARSEAPI cusparseZdense2csc(
-    cusparseHandle_t handle, int m, int n, const cusparseMatDescr_t descrA,
-    const cuDoubleComplex *A, int lda, const int *nnzPerCol,
-    cuDoubleComplex *cscSortedValA, int *cscSortedRowIndA,
-    int *cscSortedColPtrA) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t,
-      const cuDoubleComplex *, int, const int *, cuDoubleComplex *, int *,
-      int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZdense2csc");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, descrA, A, lda, nnzPerCol, cscSortedValA,
-                  cscSortedRowIndA, cscSortedColPtrA);
-}
-
-CUSPARSE_DEPRECATED(cusparseSparseToDense)
-cusparseStatus_t CUSPARSEAPI cusparseScsc2dense(
-    cusparseHandle_t handle, int m, int n, const cusparseMatDescr_t descrA,
-    const float *cscSortedValA, const int *cscSortedRowIndA,
-    const int *cscSortedColPtrA, float *A, int lda) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, const float *,
-      const int *, const int *, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsc2dense");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, descrA, cscSortedValA, cscSortedRowIndA,
-                  cscSortedColPtrA, A, lda);
-}
-
-CUSPARSE_DEPRECATED(cusparseSparseToDense)
-cusparseStatus_t CUSPARSEAPI cusparseDcsc2dense(
-    cusparseHandle_t handle, int m, int n, const cusparseMatDescr_t descrA,
-    const double *cscSortedValA, const int *cscSortedRowIndA,
-    const int *cscSortedColPtrA, double *A, int lda) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, const double *,
-      const int *, const int *, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsc2dense");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, descrA, cscSortedValA, cscSortedRowIndA,
-                  cscSortedColPtrA, A, lda);
-}
-
-CUSPARSE_DEPRECATED(cusparseSparseToDense)
-cusparseStatus_t CUSPARSEAPI cusparseCcsc2dense(
-    cusparseHandle_t handle, int m, int n, const cusparseMatDescr_t descrA,
-    const cuComplex *cscSortedValA, const int *cscSortedRowIndA,
-    const int *cscSortedColPtrA, cuComplex *A, int lda) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, const cuComplex *,
-      const int *, const int *, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsc2dense");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, descrA, cscSortedValA, cscSortedRowIndA,
-                  cscSortedColPtrA, A, lda);
-}
-
-CUSPARSE_DEPRECATED(cusparseSparseToDense)
-cusparseStatus_t CUSPARSEAPI cusparseZcsc2dense(
-    cusparseHandle_t handle, int m, int n, const cusparseMatDescr_t descrA,
-    const cuDoubleComplex *cscSortedValA, const int *cscSortedRowIndA,
-    const int *cscSortedColPtrA, cuDoubleComplex *A, int lda) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t,
-      const cuDoubleComplex *, const int *, const int *, cuDoubleComplex *,
-      int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsc2dense");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, descrA, cscSortedValA, cscSortedRowIndA,
-                  cscSortedColPtrA, A, lda);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseXcoo2csr(cusparseHandle_t handle,
-                                              const int *cooRowInd, int nnz,
-                                              int m, int *csrSortedRowPtr,
-                                              cusparseIndexBase_t idxBase) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, const int *, int, int, int *, cusparseIndexBase_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseXcoo2csr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, cooRowInd, nnz, m, csrSortedRowPtr, idxBase);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseXcsr2coo(cusparseHandle_t handle,
-                                              const int *csrSortedRowPtr,
-                                              int nnz, int m, int *cooRowInd,
-                                              cusparseIndexBase_t idxBase) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, const int *, int, int, int *, cusparseIndexBase_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseXcsr2coo");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, csrSortedRowPtr, nnz, m, cooRowInd, idxBase);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseXcsr2bsrNnz(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int m, int n,
-    const cusparseMatDescr_t descrA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, int blockDim, const cusparseMatDescr_t descrC,
-    int *bsrSortedRowPtrC, int *nnzTotalDevHostPtr) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const int *, const int *, int, const cusparseMatDescr_t, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseXcsr2bsrNnz");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, m, n, descrA, csrSortedRowPtrA,
-                  csrSortedColIndA, blockDim, descrC, bsrSortedRowPtrC,
-                  nnzTotalDevHostPtr);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsr2bsr(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int m, int n,
-    const cusparseMatDescr_t descrA, const float *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA, int blockDim,
-    const cusparseMatDescr_t descrC, float *bsrSortedValC,
-    int *bsrSortedRowPtrC, int *bsrSortedColIndC) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const float *, const int *, const int *, int, const cusparseMatDescr_t,
-      float *, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsr2bsr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, m, n, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, blockDim, descrC, bsrSortedValC,
-                  bsrSortedRowPtrC, bsrSortedColIndC);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsr2bsr(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int m, int n,
-    const cusparseMatDescr_t descrA, const double *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA, int blockDim,
-    const cusparseMatDescr_t descrC, double *bsrSortedValC,
-    int *bsrSortedRowPtrC, int *bsrSortedColIndC) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const double *, const int *, const int *, int, const cusparseMatDescr_t,
-      double *, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsr2bsr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, m, n, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, blockDim, descrC, bsrSortedValC,
-                  bsrSortedRowPtrC, bsrSortedColIndC);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsr2bsr(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int m, int n,
-    const cusparseMatDescr_t descrA, const cuComplex *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA, int blockDim,
-    const cusparseMatDescr_t descrC, cuComplex *bsrSortedValC,
-    int *bsrSortedRowPtrC, int *bsrSortedColIndC) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const cuComplex *, const int *, const int *, int,
-      const cusparseMatDescr_t, cuComplex *, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsr2bsr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, m, n, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, blockDim, descrC, bsrSortedValC,
-                  bsrSortedRowPtrC, bsrSortedColIndC);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsr2bsr(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int m, int n,
-    const cusparseMatDescr_t descrA, const cuDoubleComplex *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA, int blockDim,
-    const cusparseMatDescr_t descrC, cuDoubleComplex *bsrSortedValC,
-    int *bsrSortedRowPtrC, int *bsrSortedColIndC) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const cuDoubleComplex *, const int *, const int *, int,
-      const cusparseMatDescr_t, cuDoubleComplex *, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsr2bsr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, m, n, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, blockDim, descrC, bsrSortedValC,
-                  bsrSortedRowPtrC, bsrSortedColIndC);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSbsr2csr(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nb,
-    const cusparseMatDescr_t descrA, const float *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int blockDim,
-    const cusparseMatDescr_t descrC, float *csrSortedValC,
-    int *csrSortedRowPtrC, int *csrSortedColIndC) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const float *, const int *, const int *, int, const cusparseMatDescr_t,
-      float *, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSbsr2csr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nb, descrA, bsrSortedValA, bsrSortedRowPtrA,
-                  bsrSortedColIndA, blockDim, descrC, csrSortedValC,
-                  csrSortedRowPtrC, csrSortedColIndC);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDbsr2csr(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nb,
-    const cusparseMatDescr_t descrA, const double *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int blockDim,
-    const cusparseMatDescr_t descrC, double *csrSortedValC,
-    int *csrSortedRowPtrC, int *csrSortedColIndC) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const double *, const int *, const int *, int, const cusparseMatDescr_t,
-      double *, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDbsr2csr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nb, descrA, bsrSortedValA, bsrSortedRowPtrA,
-                  bsrSortedColIndA, blockDim, descrC, csrSortedValC,
-                  csrSortedRowPtrC, csrSortedColIndC);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCbsr2csr(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nb,
-    const cusparseMatDescr_t descrA, const cuComplex *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int blockDim,
-    const cusparseMatDescr_t descrC, cuComplex *csrSortedValC,
-    int *csrSortedRowPtrC, int *csrSortedColIndC) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const cuComplex *, const int *, const int *, int,
-      const cusparseMatDescr_t, cuComplex *, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCbsr2csr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nb, descrA, bsrSortedValA, bsrSortedRowPtrA,
-                  bsrSortedColIndA, blockDim, descrC, csrSortedValC,
-                  csrSortedRowPtrC, csrSortedColIndC);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZbsr2csr(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nb,
-    const cusparseMatDescr_t descrA, const cuDoubleComplex *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int blockDim,
-    const cusparseMatDescr_t descrC, cuDoubleComplex *csrSortedValC,
-    int *csrSortedRowPtrC, int *csrSortedColIndC) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const cuDoubleComplex *, const int *, const int *, int,
-      const cusparseMatDescr_t, cuDoubleComplex *, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZbsr2csr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nb, descrA, bsrSortedValA, bsrSortedRowPtrA,
-                  bsrSortedColIndA, blockDim, descrC, csrSortedValC,
-                  csrSortedRowPtrC, csrSortedColIndC);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSgebsr2gebsc_bufferSize(
-    cusparseHandle_t handle, int mb, int nb, int nnzb,
-    const float *bsrSortedVal, const int *bsrSortedRowPtr,
-    const int *bsrSortedColInd, int rowBlockDim, int colBlockDim,
-    int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const float *, const int *, const int *,
-      int, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSgebsr2gebsc_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, mb, nb, nnzb, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, rowBlockDim, colBlockDim,
-                  pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDgebsr2gebsc_bufferSize(
-    cusparseHandle_t handle, int mb, int nb, int nnzb,
-    const double *bsrSortedVal, const int *bsrSortedRowPtr,
-    const int *bsrSortedColInd, int rowBlockDim, int colBlockDim,
-    int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const double *, const int *, const int *,
-      int, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDgebsr2gebsc_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, mb, nb, nnzb, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, rowBlockDim, colBlockDim,
-                  pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCgebsr2gebsc_bufferSize(
-    cusparseHandle_t handle, int mb, int nb, int nnzb,
-    const cuComplex *bsrSortedVal, const int *bsrSortedRowPtr,
-    const int *bsrSortedColInd, int rowBlockDim, int colBlockDim,
-    int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const cuComplex *, const int *,
-      const int *, int, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCgebsr2gebsc_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, mb, nb, nnzb, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, rowBlockDim, colBlockDim,
-                  pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZgebsr2gebsc_bufferSize(
-    cusparseHandle_t handle, int mb, int nb, int nnzb,
-    const cuDoubleComplex *bsrSortedVal, const int *bsrSortedRowPtr,
-    const int *bsrSortedColInd, int rowBlockDim, int colBlockDim,
-    int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const cuDoubleComplex *, const int *,
-      const int *, int, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZgebsr2gebsc_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, mb, nb, nnzb, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, rowBlockDim, colBlockDim,
-                  pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSgebsr2gebsc_bufferSizeExt(
-    cusparseHandle_t handle, int mb, int nb, int nnzb,
-    const float *bsrSortedVal, const int *bsrSortedRowPtr,
-    const int *bsrSortedColInd, int rowBlockDim, int colBlockDim,
-    size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const float *, const int *, const int *,
-      int, int, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseSgebsr2gebsc_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, mb, nb, nnzb, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, rowBlockDim, colBlockDim, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDgebsr2gebsc_bufferSizeExt(
-    cusparseHandle_t handle, int mb, int nb, int nnzb,
-    const double *bsrSortedVal, const int *bsrSortedRowPtr,
-    const int *bsrSortedColInd, int rowBlockDim, int colBlockDim,
-    size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const double *, const int *, const int *,
-      int, int, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseDgebsr2gebsc_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, mb, nb, nnzb, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, rowBlockDim, colBlockDim, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCgebsr2gebsc_bufferSizeExt(
-    cusparseHandle_t handle, int mb, int nb, int nnzb,
-    const cuComplex *bsrSortedVal, const int *bsrSortedRowPtr,
-    const int *bsrSortedColInd, int rowBlockDim, int colBlockDim,
-    size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const cuComplex *, const int *,
-      const int *, int, int, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseCgebsr2gebsc_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, mb, nb, nnzb, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, rowBlockDim, colBlockDim, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZgebsr2gebsc_bufferSizeExt(
-    cusparseHandle_t handle, int mb, int nb, int nnzb,
-    const cuDoubleComplex *bsrSortedVal, const int *bsrSortedRowPtr,
-    const int *bsrSortedColInd, int rowBlockDim, int colBlockDim,
-    size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const cuDoubleComplex *, const int *,
-      const int *, int, int, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseZgebsr2gebsc_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, mb, nb, nnzb, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, rowBlockDim, colBlockDim, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSgebsr2gebsc(
-    cusparseHandle_t handle, int mb, int nb, int nnzb,
-    const float *bsrSortedVal, const int *bsrSortedRowPtr,
-    const int *bsrSortedColInd, int rowBlockDim, int colBlockDim, float *bscVal,
-    int *bscRowInd, int *bscColPtr, cusparseAction_t copyValues,
-    cusparseIndexBase_t idxBase, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const float *, const int *, const int *,
-      int, int, float *, int *, int *, cusparseAction_t, cusparseIndexBase_t,
-      void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSgebsr2gebsc");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, mb, nb, nnzb, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, rowBlockDim, colBlockDim, bscVal, bscRowInd,
-                  bscColPtr, copyValues, idxBase, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDgebsr2gebsc(
-    cusparseHandle_t handle, int mb, int nb, int nnzb,
-    const double *bsrSortedVal, const int *bsrSortedRowPtr,
-    const int *bsrSortedColInd, int rowBlockDim, int colBlockDim,
-    double *bscVal, int *bscRowInd, int *bscColPtr, cusparseAction_t copyValues,
-    cusparseIndexBase_t idxBase, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const double *, const int *, const int *,
-      int, int, double *, int *, int *, cusparseAction_t, cusparseIndexBase_t,
-      void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDgebsr2gebsc");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, mb, nb, nnzb, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, rowBlockDim, colBlockDim, bscVal, bscRowInd,
-                  bscColPtr, copyValues, idxBase, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCgebsr2gebsc(
-    cusparseHandle_t handle, int mb, int nb, int nnzb,
-    const cuComplex *bsrSortedVal, const int *bsrSortedRowPtr,
-    const int *bsrSortedColInd, int rowBlockDim, int colBlockDim,
-    cuComplex *bscVal, int *bscRowInd, int *bscColPtr,
-    cusparseAction_t copyValues, cusparseIndexBase_t idxBase, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const cuComplex *, const int *,
-      const int *, int, int, cuComplex *, int *, int *, cusparseAction_t,
-      cusparseIndexBase_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCgebsr2gebsc");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, mb, nb, nnzb, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, rowBlockDim, colBlockDim, bscVal, bscRowInd,
-                  bscColPtr, copyValues, idxBase, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZgebsr2gebsc(
-    cusparseHandle_t handle, int mb, int nb, int nnzb,
-    const cuDoubleComplex *bsrSortedVal, const int *bsrSortedRowPtr,
-    const int *bsrSortedColInd, int rowBlockDim, int colBlockDim,
-    cuDoubleComplex *bscVal, int *bscRowInd, int *bscColPtr,
-    cusparseAction_t copyValues, cusparseIndexBase_t idxBase, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const cuDoubleComplex *, const int *,
-      const int *, int, int, cuDoubleComplex *, int *, int *, cusparseAction_t,
-      cusparseIndexBase_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZgebsr2gebsc");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, mb, nb, nnzb, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, rowBlockDim, colBlockDim, bscVal, bscRowInd,
-                  bscColPtr, copyValues, idxBase, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseXgebsr2csr(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nb,
-    const cusparseMatDescr_t descrA, const int *bsrSortedRowPtrA,
-    const int *bsrSortedColIndA, int rowBlockDim, int colBlockDim,
-    const cusparseMatDescr_t descrC, int *csrSortedRowPtrC,
-    int *csrSortedColIndC) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const int *, const int *, int, int, const cusparseMatDescr_t, int *,
-      int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseXgebsr2csr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nb, descrA, bsrSortedRowPtrA,
-                  bsrSortedColIndA, rowBlockDim, colBlockDim, descrC,
-                  csrSortedRowPtrC, csrSortedColIndC);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSgebsr2csr(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nb,
-    const cusparseMatDescr_t descrA, const float *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int rowBlockDim,
-    int colBlockDim, const cusparseMatDescr_t descrC, float *csrSortedValC,
-    int *csrSortedRowPtrC, int *csrSortedColIndC) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const float *, const int *, const int *, int, int,
-      const cusparseMatDescr_t, float *, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSgebsr2csr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nb, descrA, bsrSortedValA, bsrSortedRowPtrA,
-                  bsrSortedColIndA, rowBlockDim, colBlockDim, descrC,
-                  csrSortedValC, csrSortedRowPtrC, csrSortedColIndC);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDgebsr2csr(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nb,
-    const cusparseMatDescr_t descrA, const double *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int rowBlockDim,
-    int colBlockDim, const cusparseMatDescr_t descrC, double *csrSortedValC,
-    int *csrSortedRowPtrC, int *csrSortedColIndC) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const double *, const int *, const int *, int, int,
-      const cusparseMatDescr_t, double *, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDgebsr2csr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nb, descrA, bsrSortedValA, bsrSortedRowPtrA,
-                  bsrSortedColIndA, rowBlockDim, colBlockDim, descrC,
-                  csrSortedValC, csrSortedRowPtrC, csrSortedColIndC);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCgebsr2csr(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nb,
-    const cusparseMatDescr_t descrA, const cuComplex *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int rowBlockDim,
-    int colBlockDim, const cusparseMatDescr_t descrC, cuComplex *csrSortedValC,
-    int *csrSortedRowPtrC, int *csrSortedColIndC) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const cuComplex *, const int *, const int *, int, int,
-      const cusparseMatDescr_t, cuComplex *, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCgebsr2csr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nb, descrA, bsrSortedValA, bsrSortedRowPtrA,
-                  bsrSortedColIndA, rowBlockDim, colBlockDim, descrC,
-                  csrSortedValC, csrSortedRowPtrC, csrSortedColIndC);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZgebsr2csr(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nb,
-    const cusparseMatDescr_t descrA, const cuDoubleComplex *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int rowBlockDim,
-    int colBlockDim, const cusparseMatDescr_t descrC,
-    cuDoubleComplex *csrSortedValC, int *csrSortedRowPtrC,
-    int *csrSortedColIndC) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const cuDoubleComplex *, const int *, const int *, int, int,
-      const cusparseMatDescr_t, cuDoubleComplex *, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZgebsr2csr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nb, descrA, bsrSortedValA, bsrSortedRowPtrA,
-                  bsrSortedColIndA, rowBlockDim, colBlockDim, descrC,
-                  csrSortedValC, csrSortedRowPtrC, csrSortedColIndC);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsr2gebsr_bufferSize(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int m, int n,
-    const cusparseMatDescr_t descrA, const float *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA, int rowBlockDim,
-    int colBlockDim, int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const float *, const int *, const int *, int, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsr2gebsr_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, m, n, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, rowBlockDim, colBlockDim,
-                  pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsr2gebsr_bufferSize(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int m, int n,
-    const cusparseMatDescr_t descrA, const double *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA, int rowBlockDim,
-    int colBlockDim, int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const double *, const int *, const int *, int, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsr2gebsr_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, m, n, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, rowBlockDim, colBlockDim,
-                  pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsr2gebsr_bufferSize(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int m, int n,
-    const cusparseMatDescr_t descrA, const cuComplex *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA, int rowBlockDim,
-    int colBlockDim, int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const cuComplex *, const int *, const int *, int, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsr2gebsr_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, m, n, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, rowBlockDim, colBlockDim,
-                  pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsr2gebsr_bufferSize(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int m, int n,
-    const cusparseMatDescr_t descrA, const cuDoubleComplex *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA, int rowBlockDim,
-    int colBlockDim, int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const cuDoubleComplex *, const int *, const int *, int, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsr2gebsr_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, m, n, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, rowBlockDim, colBlockDim,
-                  pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsr2gebsr_bufferSizeExt(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int m, int n,
-    const cusparseMatDescr_t descrA, const float *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA, int rowBlockDim,
-    int colBlockDim, size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const float *, const int *, const int *, int, int, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseScsr2gebsr_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, m, n, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, rowBlockDim, colBlockDim, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsr2gebsr_bufferSizeExt(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int m, int n,
-    const cusparseMatDescr_t descrA, const double *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA, int rowBlockDim,
-    int colBlockDim, size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const double *, const int *, const int *, int, int, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseDcsr2gebsr_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, m, n, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, rowBlockDim, colBlockDim, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsr2gebsr_bufferSizeExt(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int m, int n,
-    const cusparseMatDescr_t descrA, const cuComplex *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA, int rowBlockDim,
-    int colBlockDim, size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const cuComplex *, const int *, const int *, int, int, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseCcsr2gebsr_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, m, n, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, rowBlockDim, colBlockDim, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsr2gebsr_bufferSizeExt(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int m, int n,
-    const cusparseMatDescr_t descrA, const cuDoubleComplex *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA, int rowBlockDim,
-    int colBlockDim, size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const cuDoubleComplex *, const int *, const int *, int, int, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseZcsr2gebsr_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, m, n, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, rowBlockDim, colBlockDim, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseXcsr2gebsrNnz(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int m, int n,
-    const cusparseMatDescr_t descrA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, const cusparseMatDescr_t descrC,
-    int *bsrSortedRowPtrC, int rowBlockDim, int colBlockDim,
-    int *nnzTotalDevHostPtr, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const int *, const int *, const cusparseMatDescr_t, int *, int, int,
-      int *, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseXcsr2gebsrNnz");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, m, n, descrA, csrSortedRowPtrA,
-                  csrSortedColIndA, descrC, bsrSortedRowPtrC, rowBlockDim,
-                  colBlockDim, nnzTotalDevHostPtr, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsr2gebsr(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int m, int n,
-    const cusparseMatDescr_t descrA, const float *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-    const cusparseMatDescr_t descrC, float *bsrSortedValC,
-    int *bsrSortedRowPtrC, int *bsrSortedColIndC, int rowBlockDim,
-    int colBlockDim, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const float *, const int *, const int *, const cusparseMatDescr_t,
-      float *, int *, int *, int, int, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsr2gebsr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, m, n, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, descrC, bsrSortedValC, bsrSortedRowPtrC,
-                  bsrSortedColIndC, rowBlockDim, colBlockDim, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsr2gebsr(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int m, int n,
-    const cusparseMatDescr_t descrA, const double *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-    const cusparseMatDescr_t descrC, double *bsrSortedValC,
-    int *bsrSortedRowPtrC, int *bsrSortedColIndC, int rowBlockDim,
-    int colBlockDim, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const double *, const int *, const int *, const cusparseMatDescr_t,
-      double *, int *, int *, int, int, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsr2gebsr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, m, n, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, descrC, bsrSortedValC, bsrSortedRowPtrC,
-                  bsrSortedColIndC, rowBlockDim, colBlockDim, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsr2gebsr(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int m, int n,
-    const cusparseMatDescr_t descrA, const cuComplex *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-    const cusparseMatDescr_t descrC, cuComplex *bsrSortedValC,
-    int *bsrSortedRowPtrC, int *bsrSortedColIndC, int rowBlockDim,
-    int colBlockDim, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const cuComplex *, const int *, const int *, const cusparseMatDescr_t,
-      cuComplex *, int *, int *, int, int, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsr2gebsr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, m, n, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, descrC, bsrSortedValC, bsrSortedRowPtrC,
-                  bsrSortedColIndC, rowBlockDim, colBlockDim, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsr2gebsr(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int m, int n,
-    const cusparseMatDescr_t descrA, const cuDoubleComplex *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-    const cusparseMatDescr_t descrC, cuDoubleComplex *bsrSortedValC,
-    int *bsrSortedRowPtrC, int *bsrSortedColIndC, int rowBlockDim,
-    int colBlockDim, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const cuDoubleComplex *, const int *, const int *,
-      const cusparseMatDescr_t, cuDoubleComplex *, int *, int *, int, int,
-      void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsr2gebsr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, m, n, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, descrC, bsrSortedValC, bsrSortedRowPtrC,
-                  bsrSortedColIndC, rowBlockDim, colBlockDim, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSgebsr2gebsr_bufferSize(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nb, int nnzb,
-    const cusparseMatDescr_t descrA, const float *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int rowBlockDimA,
-    int colBlockDimA, int rowBlockDimC, int colBlockDimC,
-    int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, int,
-      const cusparseMatDescr_t, const float *, const int *, const int *, int,
-      int, int, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSgebsr2gebsr_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nb, nnzb, descrA, bsrSortedValA,
-                  bsrSortedRowPtrA, bsrSortedColIndA, rowBlockDimA,
-                  colBlockDimA, rowBlockDimC, colBlockDimC, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDgebsr2gebsr_bufferSize(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nb, int nnzb,
-    const cusparseMatDescr_t descrA, const double *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int rowBlockDimA,
-    int colBlockDimA, int rowBlockDimC, int colBlockDimC,
-    int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, int,
-      const cusparseMatDescr_t, const double *, const int *, const int *, int,
-      int, int, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDgebsr2gebsr_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nb, nnzb, descrA, bsrSortedValA,
-                  bsrSortedRowPtrA, bsrSortedColIndA, rowBlockDimA,
-                  colBlockDimA, rowBlockDimC, colBlockDimC, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCgebsr2gebsr_bufferSize(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nb, int nnzb,
-    const cusparseMatDescr_t descrA, const cuComplex *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int rowBlockDimA,
-    int colBlockDimA, int rowBlockDimC, int colBlockDimC,
-    int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, int,
-      const cusparseMatDescr_t, const cuComplex *, const int *, const int *,
-      int, int, int, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCgebsr2gebsr_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nb, nnzb, descrA, bsrSortedValA,
-                  bsrSortedRowPtrA, bsrSortedColIndA, rowBlockDimA,
-                  colBlockDimA, rowBlockDimC, colBlockDimC, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZgebsr2gebsr_bufferSize(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nb, int nnzb,
-    const cusparseMatDescr_t descrA, const cuDoubleComplex *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int rowBlockDimA,
-    int colBlockDimA, int rowBlockDimC, int colBlockDimC,
-    int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, int,
-      const cusparseMatDescr_t, const cuDoubleComplex *, const int *,
-      const int *, int, int, int, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZgebsr2gebsr_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nb, nnzb, descrA, bsrSortedValA,
-                  bsrSortedRowPtrA, bsrSortedColIndA, rowBlockDimA,
-                  colBlockDimA, rowBlockDimC, colBlockDimC, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSgebsr2gebsr_bufferSizeExt(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nb, int nnzb,
-    const cusparseMatDescr_t descrA, const float *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int rowBlockDimA,
-    int colBlockDimA, int rowBlockDimC, int colBlockDimC, size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, int,
-      const cusparseMatDescr_t, const float *, const int *, const int *, int,
-      int, int, int, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseSgebsr2gebsr_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nb, nnzb, descrA, bsrSortedValA,
-                  bsrSortedRowPtrA, bsrSortedColIndA, rowBlockDimA,
-                  colBlockDimA, rowBlockDimC, colBlockDimC, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDgebsr2gebsr_bufferSizeExt(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nb, int nnzb,
-    const cusparseMatDescr_t descrA, const double *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int rowBlockDimA,
-    int colBlockDimA, int rowBlockDimC, int colBlockDimC, size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, int,
-      const cusparseMatDescr_t, const double *, const int *, const int *, int,
-      int, int, int, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseDgebsr2gebsr_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nb, nnzb, descrA, bsrSortedValA,
-                  bsrSortedRowPtrA, bsrSortedColIndA, rowBlockDimA,
-                  colBlockDimA, rowBlockDimC, colBlockDimC, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCgebsr2gebsr_bufferSizeExt(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nb, int nnzb,
-    const cusparseMatDescr_t descrA, const cuComplex *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int rowBlockDimA,
-    int colBlockDimA, int rowBlockDimC, int colBlockDimC, size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, int,
-      const cusparseMatDescr_t, const cuComplex *, const int *, const int *,
-      int, int, int, int, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseCgebsr2gebsr_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nb, nnzb, descrA, bsrSortedValA,
-                  bsrSortedRowPtrA, bsrSortedColIndA, rowBlockDimA,
-                  colBlockDimA, rowBlockDimC, colBlockDimC, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZgebsr2gebsr_bufferSizeExt(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nb, int nnzb,
-    const cusparseMatDescr_t descrA, const cuDoubleComplex *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int rowBlockDimA,
-    int colBlockDimA, int rowBlockDimC, int colBlockDimC, size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, int,
-      const cusparseMatDescr_t, const cuDoubleComplex *, const int *,
-      const int *, int, int, int, int, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseZgebsr2gebsr_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nb, nnzb, descrA, bsrSortedValA,
-                  bsrSortedRowPtrA, bsrSortedColIndA, rowBlockDimA,
-                  colBlockDimA, rowBlockDimC, colBlockDimC, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseXgebsr2gebsrNnz(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nb, int nnzb,
-    const cusparseMatDescr_t descrA, const int *bsrSortedRowPtrA,
-    const int *bsrSortedColIndA, int rowBlockDimA, int colBlockDimA,
-    const cusparseMatDescr_t descrC, int *bsrSortedRowPtrC, int rowBlockDimC,
-    int colBlockDimC, int *nnzTotalDevHostPtr, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, int,
-      const cusparseMatDescr_t, const int *, const int *, int, int,
-      const cusparseMatDescr_t, int *, int, int, int *, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseXgebsr2gebsrNnz");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nb, nnzb, descrA, bsrSortedRowPtrA,
-                  bsrSortedColIndA, rowBlockDimA, colBlockDimA, descrC,
-                  bsrSortedRowPtrC, rowBlockDimC, colBlockDimC,
-                  nnzTotalDevHostPtr, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSgebsr2gebsr(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nb, int nnzb,
-    const cusparseMatDescr_t descrA, const float *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int rowBlockDimA,
-    int colBlockDimA, const cusparseMatDescr_t descrC, float *bsrSortedValC,
-    int *bsrSortedRowPtrC, int *bsrSortedColIndC, int rowBlockDimC,
-    int colBlockDimC, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, int,
-      const cusparseMatDescr_t, const float *, const int *, const int *, int,
-      int, const cusparseMatDescr_t, float *, int *, int *, int, int, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSgebsr2gebsr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nb, nnzb, descrA, bsrSortedValA,
-                  bsrSortedRowPtrA, bsrSortedColIndA, rowBlockDimA,
-                  colBlockDimA, descrC, bsrSortedValC, bsrSortedRowPtrC,
-                  bsrSortedColIndC, rowBlockDimC, colBlockDimC, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDgebsr2gebsr(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nb, int nnzb,
-    const cusparseMatDescr_t descrA, const double *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int rowBlockDimA,
-    int colBlockDimA, const cusparseMatDescr_t descrC, double *bsrSortedValC,
-    int *bsrSortedRowPtrC, int *bsrSortedColIndC, int rowBlockDimC,
-    int colBlockDimC, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, int,
-      const cusparseMatDescr_t, const double *, const int *, const int *, int,
-      int, const cusparseMatDescr_t, double *, int *, int *, int, int, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDgebsr2gebsr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nb, nnzb, descrA, bsrSortedValA,
-                  bsrSortedRowPtrA, bsrSortedColIndA, rowBlockDimA,
-                  colBlockDimA, descrC, bsrSortedValC, bsrSortedRowPtrC,
-                  bsrSortedColIndC, rowBlockDimC, colBlockDimC, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCgebsr2gebsr(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nb, int nnzb,
-    const cusparseMatDescr_t descrA, const cuComplex *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int rowBlockDimA,
-    int colBlockDimA, const cusparseMatDescr_t descrC, cuComplex *bsrSortedValC,
-    int *bsrSortedRowPtrC, int *bsrSortedColIndC, int rowBlockDimC,
-    int colBlockDimC, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, int,
-      const cusparseMatDescr_t, const cuComplex *, const int *, const int *,
-      int, int, const cusparseMatDescr_t, cuComplex *, int *, int *, int, int,
-      void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCgebsr2gebsr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nb, nnzb, descrA, bsrSortedValA,
-                  bsrSortedRowPtrA, bsrSortedColIndA, rowBlockDimA,
-                  colBlockDimA, descrC, bsrSortedValC, bsrSortedRowPtrC,
-                  bsrSortedColIndC, rowBlockDimC, colBlockDimC, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZgebsr2gebsr(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nb, int nnzb,
-    const cusparseMatDescr_t descrA, const cuDoubleComplex *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int rowBlockDimA,
-    int colBlockDimA, const cusparseMatDescr_t descrC,
-    cuDoubleComplex *bsrSortedValC, int *bsrSortedRowPtrC,
-    int *bsrSortedColIndC, int rowBlockDimC, int colBlockDimC, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, int,
-      const cusparseMatDescr_t, const cuDoubleComplex *, const int *,
-      const int *, int, int, const cusparseMatDescr_t, cuDoubleComplex *, int *,
-      int *, int, int, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZgebsr2gebsr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nb, nnzb, descrA, bsrSortedValA,
-                  bsrSortedRowPtrA, bsrSortedColIndA, rowBlockDimA,
-                  colBlockDimA, descrC, bsrSortedValC, bsrSortedRowPtrC,
-                  bsrSortedColIndC, rowBlockDimC, colBlockDimC, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseCreateIdentityPermutation(cusparseHandle_t handle, int n, int *p) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(cusparseHandle_t, int, int *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseCreateIdentityPermutation");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, p);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseXcoosort_bufferSizeExt(
-    cusparseHandle_t handle, int m, int n, int nnz, const int *cooRowsA,
-    const int *cooColsA, size_t *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const int *, const int *, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseXcoosort_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnz, cooRowsA, cooColsA, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseXcoosortByRow(cusparseHandle_t handle,
-                                                   int m, int n, int nnz,
-                                                   int *cooRowsA, int *cooColsA,
-                                                   int *P, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, int *, int *, int *, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseXcoosortByRow");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnz, cooRowsA, cooColsA, P, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseXcoosortByColumn(cusparseHandle_t handle,
-                                                      int m, int n, int nnz,
-                                                      int *cooRowsA,
-                                                      int *cooColsA, int *P,
-                                                      void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, int *, int *, int *, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseXcoosortByColumn");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnz, cooRowsA, cooColsA, P, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseXcsrsort_bufferSizeExt(
-    cusparseHandle_t handle, int m, int n, int nnz, const int *csrRowPtrA,
-    const int *csrColIndA, size_t *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const int *, const int *, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseXcsrsort_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnz, csrRowPtrA, csrColIndA,
-                  pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseXcsrsort(cusparseHandle_t handle, int m,
-                                              int n, int nnz,
-                                              const cusparseMatDescr_t descrA,
-                                              const int *csrRowPtrA,
-                                              int *csrColIndA, int *P,
-                                              void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const cusparseMatDescr_t, const int *,
-      int *, int *, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseXcsrsort");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnz, descrA, csrRowPtrA, csrColIndA, P,
-                  pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseXcscsort_bufferSizeExt(
-    cusparseHandle_t handle, int m, int n, int nnz, const int *cscColPtrA,
-    const int *cscRowIndA, size_t *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const int *, const int *, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseXcscsort_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnz, cscColPtrA, cscRowIndA,
-                  pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseXcscsort(cusparseHandle_t handle, int m,
-                                              int n, int nnz,
-                                              const cusparseMatDescr_t descrA,
-                                              const int *cscColPtrA,
-                                              int *cscRowIndA, int *P,
-                                              void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const cusparseMatDescr_t, const int *,
-      int *, int *, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseXcscsort");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnz, descrA, cscColPtrA, cscRowIndA, P,
-                  pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsru2csr_bufferSizeExt(
-    cusparseHandle_t handle, int m, int n, int nnz, float *csrVal,
-    const int *csrRowPtr, int *csrColInd, csru2csrInfo_t info,
-    size_t *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, float *, const int *, int *,
-      csru2csrInfo_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsru2csr_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnz, csrVal, csrRowPtr, csrColInd, info,
-                  pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsru2csr_bufferSizeExt(
-    cusparseHandle_t handle, int m, int n, int nnz, double *csrVal,
-    const int *csrRowPtr, int *csrColInd, csru2csrInfo_t info,
-    size_t *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, double *, const int *, int *,
-      csru2csrInfo_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsru2csr_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnz, csrVal, csrRowPtr, csrColInd, info,
-                  pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsru2csr_bufferSizeExt(
-    cusparseHandle_t handle, int m, int n, int nnz, cuComplex *csrVal,
-    const int *csrRowPtr, int *csrColInd, csru2csrInfo_t info,
-    size_t *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, cuComplex *, const int *, int *,
-      csru2csrInfo_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsru2csr_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnz, csrVal, csrRowPtr, csrColInd, info,
-                  pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsru2csr_bufferSizeExt(
-    cusparseHandle_t handle, int m, int n, int nnz, cuDoubleComplex *csrVal,
-    const int *csrRowPtr, int *csrColInd, csru2csrInfo_t info,
-    size_t *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, cuDoubleComplex *, const int *, int *,
-      csru2csrInfo_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsru2csr_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnz, csrVal, csrRowPtr, csrColInd, info,
-                  pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsru2csr(
-    cusparseHandle_t handle, int m, int n, int nnz,
-    const cusparseMatDescr_t descrA, float *csrVal, const int *csrRowPtr,
-    int *csrColInd, csru2csrInfo_t info, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const cusparseMatDescr_t, float *,
-      const int *, int *, csru2csrInfo_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsru2csr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnz, descrA, csrVal, csrRowPtr, csrColInd, info,
-                  pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsru2csr(
-    cusparseHandle_t handle, int m, int n, int nnz,
-    const cusparseMatDescr_t descrA, double *csrVal, const int *csrRowPtr,
-    int *csrColInd, csru2csrInfo_t info, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const cusparseMatDescr_t, double *,
-      const int *, int *, csru2csrInfo_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsru2csr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnz, descrA, csrVal, csrRowPtr, csrColInd, info,
-                  pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsru2csr(
-    cusparseHandle_t handle, int m, int n, int nnz,
-    const cusparseMatDescr_t descrA, cuComplex *csrVal, const int *csrRowPtr,
-    int *csrColInd, csru2csrInfo_t info, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const cusparseMatDescr_t, cuComplex *,
-      const int *, int *, csru2csrInfo_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsru2csr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnz, descrA, csrVal, csrRowPtr, csrColInd, info,
-                  pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsru2csr(
-    cusparseHandle_t handle, int m, int n, int nnz,
-    const cusparseMatDescr_t descrA, cuDoubleComplex *csrVal,
-    const int *csrRowPtr, int *csrColInd, csru2csrInfo_t info, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const cusparseMatDescr_t,
-      cuDoubleComplex *, const int *, int *, csru2csrInfo_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsru2csr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnz, descrA, csrVal, csrRowPtr, csrColInd, info,
-                  pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsr2csru(
-    cusparseHandle_t handle, int m, int n, int nnz,
-    const cusparseMatDescr_t descrA, float *csrVal, const int *csrRowPtr,
-    int *csrColInd, csru2csrInfo_t info, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const cusparseMatDescr_t, float *,
-      const int *, int *, csru2csrInfo_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsr2csru");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnz, descrA, csrVal, csrRowPtr, csrColInd, info,
-                  pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsr2csru(
-    cusparseHandle_t handle, int m, int n, int nnz,
-    const cusparseMatDescr_t descrA, double *csrVal, const int *csrRowPtr,
-    int *csrColInd, csru2csrInfo_t info, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const cusparseMatDescr_t, double *,
-      const int *, int *, csru2csrInfo_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsr2csru");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnz, descrA, csrVal, csrRowPtr, csrColInd, info,
-                  pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsr2csru(
-    cusparseHandle_t handle, int m, int n, int nnz,
-    const cusparseMatDescr_t descrA, cuComplex *csrVal, const int *csrRowPtr,
-    int *csrColInd, csru2csrInfo_t info, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const cusparseMatDescr_t, cuComplex *,
-      const int *, int *, csru2csrInfo_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsr2csru");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnz, descrA, csrVal, csrRowPtr, csrColInd, info,
-                  pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsr2csru(
-    cusparseHandle_t handle, int m, int n, int nnz,
-    const cusparseMatDescr_t descrA, cuDoubleComplex *csrVal,
-    const int *csrRowPtr, int *csrColInd, csru2csrInfo_t info, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const cusparseMatDescr_t,
-      cuDoubleComplex *, const int *, int *, csru2csrInfo_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsr2csru");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnz, descrA, csrVal, csrRowPtr, csrColInd, info,
-                  pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSpruneDense2csr_bufferSizeExt(
-    cusparseHandle_t handle, int m, int n, const float *A, int lda,
-    const float *threshold, const cusparseMatDescr_t descrC,
-    const float *csrSortedValC, const int *csrSortedRowPtrC,
-    const int *csrSortedColIndC, size_t *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const float *, int, const float *,
-      const cusparseMatDescr_t, const float *, const int *, const int *,
-      size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseSpruneDense2csr_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, A, lda, threshold, descrC, csrSortedValC,
-                  csrSortedRowPtrC, csrSortedColIndC, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDpruneDense2csr_bufferSizeExt(
-    cusparseHandle_t handle, int m, int n, const double *A, int lda,
-    const double *threshold, const cusparseMatDescr_t descrC,
-    const double *csrSortedValC, const int *csrSortedRowPtrC,
-    const int *csrSortedColIndC, size_t *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const double *, int, const double *,
-      const cusparseMatDescr_t, const double *, const int *, const int *,
-      size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseDpruneDense2csr_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, A, lda, threshold, descrC, csrSortedValC,
-                  csrSortedRowPtrC, csrSortedColIndC, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSpruneDense2csrNnz(
-    cusparseHandle_t handle, int m, int n, const float *A, int lda,
-    const float *threshold, const cusparseMatDescr_t descrC, int *csrRowPtrC,
-    int *nnzTotalDevHostPtr, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const float *, int, const float *,
-      const cusparseMatDescr_t, int *, int *, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSpruneDense2csrNnz");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, A, lda, threshold, descrC, csrRowPtrC,
-                  nnzTotalDevHostPtr, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDpruneDense2csrNnz(
-    cusparseHandle_t handle, int m, int n, const double *A, int lda,
-    const double *threshold, const cusparseMatDescr_t descrC,
-    int *csrSortedRowPtrC, int *nnzTotalDevHostPtr, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const double *, int, const double *,
-      const cusparseMatDescr_t, int *, int *, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDpruneDense2csrNnz");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, A, lda, threshold, descrC, csrSortedRowPtrC,
-                  nnzTotalDevHostPtr, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSpruneDense2csr(
-    cusparseHandle_t handle, int m, int n, const float *A, int lda,
-    const float *threshold, const cusparseMatDescr_t descrC,
-    float *csrSortedValC, const int *csrSortedRowPtrC, int *csrSortedColIndC,
-    void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const float *, int, const float *,
-      const cusparseMatDescr_t, float *, const int *, int *, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSpruneDense2csr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, A, lda, threshold, descrC, csrSortedValC,
-                  csrSortedRowPtrC, csrSortedColIndC, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDpruneDense2csr(
-    cusparseHandle_t handle, int m, int n, const double *A, int lda,
-    const double *threshold, const cusparseMatDescr_t descrC,
-    double *csrSortedValC, const int *csrSortedRowPtrC, int *csrSortedColIndC,
-    void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const double *, int, const double *,
-      const cusparseMatDescr_t, double *, const int *, int *, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDpruneDense2csr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, A, lda, threshold, descrC, csrSortedValC,
-                  csrSortedRowPtrC, csrSortedColIndC, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSpruneCsr2csr_bufferSizeExt(
-    cusparseHandle_t handle, int m, int n, int nnzA,
-    const cusparseMatDescr_t descrA, const float *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-    const float *threshold, const cusparseMatDescr_t descrC,
-    const float *csrSortedValC, const int *csrSortedRowPtrC,
-    const int *csrSortedColIndC, size_t *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const cusparseMatDescr_t, const float *,
-      const int *, const int *, const float *, const cusparseMatDescr_t,
-      const float *, const int *, const int *, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseSpruneCsr2csr_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnzA, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, threshold, descrC, csrSortedValC,
-                  csrSortedRowPtrC, csrSortedColIndC, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDpruneCsr2csr_bufferSizeExt(
-    cusparseHandle_t handle, int m, int n, int nnzA,
-    const cusparseMatDescr_t descrA, const double *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-    const double *threshold, const cusparseMatDescr_t descrC,
-    const double *csrSortedValC, const int *csrSortedRowPtrC,
-    const int *csrSortedColIndC, size_t *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const cusparseMatDescr_t, const double *,
-      const int *, const int *, const double *, const cusparseMatDescr_t,
-      const double *, const int *, const int *, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseDpruneCsr2csr_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnzA, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, threshold, descrC, csrSortedValC,
-                  csrSortedRowPtrC, csrSortedColIndC, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSpruneCsr2csrNnz(
-    cusparseHandle_t handle, int m, int n, int nnzA,
-    const cusparseMatDescr_t descrA, const float *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-    const float *threshold, const cusparseMatDescr_t descrC,
-    int *csrSortedRowPtrC, int *nnzTotalDevHostPtr, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const cusparseMatDescr_t, const float *,
-      const int *, const int *, const float *, const cusparseMatDescr_t, int *,
-      int *, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSpruneCsr2csrNnz");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnzA, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, threshold, descrC, csrSortedRowPtrC,
-                  nnzTotalDevHostPtr, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDpruneCsr2csrNnz(
-    cusparseHandle_t handle, int m, int n, int nnzA,
-    const cusparseMatDescr_t descrA, const double *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-    const double *threshold, const cusparseMatDescr_t descrC,
-    int *csrSortedRowPtrC, int *nnzTotalDevHostPtr, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const cusparseMatDescr_t, const double *,
-      const int *, const int *, const double *, const cusparseMatDescr_t, int *,
-      int *, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDpruneCsr2csrNnz");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnzA, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, threshold, descrC, csrSortedRowPtrC,
-                  nnzTotalDevHostPtr, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSpruneCsr2csr(
-    cusparseHandle_t handle, int m, int n, int nnzA,
-    const cusparseMatDescr_t descrA, const float *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-    const float *threshold, const cusparseMatDescr_t descrC,
-    float *csrSortedValC, const int *csrSortedRowPtrC, int *csrSortedColIndC,
-    void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const cusparseMatDescr_t, const float *,
-      const int *, const int *, const float *, const cusparseMatDescr_t,
-      float *, const int *, int *, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSpruneCsr2csr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnzA, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, threshold, descrC, csrSortedValC,
-                  csrSortedRowPtrC, csrSortedColIndC, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDpruneCsr2csr(
-    cusparseHandle_t handle, int m, int n, int nnzA,
-    const cusparseMatDescr_t descrA, const double *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-    const double *threshold, const cusparseMatDescr_t descrC,
-    double *csrSortedValC, const int *csrSortedRowPtrC, int *csrSortedColIndC,
-    void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const cusparseMatDescr_t, const double *,
-      const int *, const int *, const double *, const cusparseMatDescr_t,
-      double *, const int *, int *, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDpruneCsr2csr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnzA, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, threshold, descrC, csrSortedValC,
-                  csrSortedRowPtrC, csrSortedColIndC, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSpruneDense2csrByPercentage_bufferSizeExt(
-    cusparseHandle_t handle, int m, int n, const float *A, int lda,
-    float percentage, const cusparseMatDescr_t descrC,
-    const float *csrSortedValC, const int *csrSortedRowPtrC,
-    const int *csrSortedColIndC, pruneInfo_t info, size_t *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const float *, int, float,
-      const cusparseMatDescr_t, const float *, const int *, const int *,
-      pruneInfo_t, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseSpruneDense2csrByPercentage_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, A, lda, percentage, descrC, csrSortedValC,
-                  csrSortedRowPtrC, csrSortedColIndC, info, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDpruneDense2csrByPercentage_bufferSizeExt(
-    cusparseHandle_t handle, int m, int n, const double *A, int lda,
-    float percentage, const cusparseMatDescr_t descrC,
-    const double *csrSortedValC, const int *csrSortedRowPtrC,
-    const int *csrSortedColIndC, pruneInfo_t info, size_t *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const double *, int, float,
-      const cusparseMatDescr_t, const double *, const int *, const int *,
-      pruneInfo_t, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseDpruneDense2csrByPercentage_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, A, lda, percentage, descrC, csrSortedValC,
-                  csrSortedRowPtrC, csrSortedColIndC, info, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSpruneDense2csrNnzByPercentage(
-    cusparseHandle_t handle, int m, int n, const float *A, int lda,
-    float percentage, const cusparseMatDescr_t descrC, int *csrRowPtrC,
-    int *nnzTotalDevHostPtr, pruneInfo_t info, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const float *, int, float,
-      const cusparseMatDescr_t, int *, int *, pruneInfo_t, void *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseSpruneDense2csrNnzByPercentage");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, A, lda, percentage, descrC, csrRowPtrC,
-                  nnzTotalDevHostPtr, info, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDpruneDense2csrNnzByPercentage(
-    cusparseHandle_t handle, int m, int n, const double *A, int lda,
-    float percentage, const cusparseMatDescr_t descrC, int *csrRowPtrC,
-    int *nnzTotalDevHostPtr, pruneInfo_t info, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const double *, int, float,
-      const cusparseMatDescr_t, int *, int *, pruneInfo_t, void *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseDpruneDense2csrNnzByPercentage");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, A, lda, percentage, descrC, csrRowPtrC,
-                  nnzTotalDevHostPtr, info, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSpruneDense2csrByPercentage(
-    cusparseHandle_t handle, int m, int n, const float *A, int lda,
-    float percentage, const cusparseMatDescr_t descrC, float *csrSortedValC,
-    const int *csrSortedRowPtrC, int *csrSortedColIndC, pruneInfo_t info,
-    void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const float *, int, float,
-      const cusparseMatDescr_t, float *, const int *, int *, pruneInfo_t,
-      void *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseSpruneDense2csrByPercentage");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, A, lda, percentage, descrC, csrSortedValC,
-                  csrSortedRowPtrC, csrSortedColIndC, info, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDpruneDense2csrByPercentage(
-    cusparseHandle_t handle, int m, int n, const double *A, int lda,
-    float percentage, const cusparseMatDescr_t descrC, double *csrSortedValC,
-    const int *csrSortedRowPtrC, int *csrSortedColIndC, pruneInfo_t info,
-    void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const double *, int, float,
-      const cusparseMatDescr_t, double *, const int *, int *, pruneInfo_t,
-      void *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseDpruneDense2csrByPercentage");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, A, lda, percentage, descrC, csrSortedValC,
-                  csrSortedRowPtrC, csrSortedColIndC, info, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSpruneCsr2csrByPercentage_bufferSizeExt(
-    cusparseHandle_t handle, int m, int n, int nnzA,
-    const cusparseMatDescr_t descrA, const float *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA, float percentage,
-    const cusparseMatDescr_t descrC, const float *csrSortedValC,
-    const int *csrSortedRowPtrC, const int *csrSortedColIndC, pruneInfo_t info,
-    size_t *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const cusparseMatDescr_t, const float *,
-      const int *, const int *, float, const cusparseMatDescr_t, const float *,
-      const int *, const int *, pruneInfo_t, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseSpruneCsr2csrByPercentage_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnzA, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, percentage, descrC, csrSortedValC,
-                  csrSortedRowPtrC, csrSortedColIndC, info, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDpruneCsr2csrByPercentage_bufferSizeExt(
-    cusparseHandle_t handle, int m, int n, int nnzA,
-    const cusparseMatDescr_t descrA, const double *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA, float percentage,
-    const cusparseMatDescr_t descrC, const double *csrSortedValC,
-    const int *csrSortedRowPtrC, const int *csrSortedColIndC, pruneInfo_t info,
-    size_t *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const cusparseMatDescr_t, const double *,
-      const int *, const int *, float, const cusparseMatDescr_t, const double *,
-      const int *, const int *, pruneInfo_t, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseDpruneCsr2csrByPercentage_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnzA, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, percentage, descrC, csrSortedValC,
-                  csrSortedRowPtrC, csrSortedColIndC, info, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSpruneCsr2csrNnzByPercentage(
-    cusparseHandle_t handle, int m, int n, int nnzA,
-    const cusparseMatDescr_t descrA, const float *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA, float percentage,
-    const cusparseMatDescr_t descrC, int *csrSortedRowPtrC,
-    int *nnzTotalDevHostPtr, pruneInfo_t info, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const cusparseMatDescr_t, const float *,
-      const int *, const int *, float, const cusparseMatDescr_t, int *, int *,
-      pruneInfo_t, void *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseSpruneCsr2csrNnzByPercentage");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnzA, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, percentage, descrC, csrSortedRowPtrC,
-                  nnzTotalDevHostPtr, info, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDpruneCsr2csrNnzByPercentage(
-    cusparseHandle_t handle, int m, int n, int nnzA,
-    const cusparseMatDescr_t descrA, const double *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA, float percentage,
-    const cusparseMatDescr_t descrC, int *csrSortedRowPtrC,
-    int *nnzTotalDevHostPtr, pruneInfo_t info, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const cusparseMatDescr_t, const double *,
-      const int *, const int *, float, const cusparseMatDescr_t, int *, int *,
-      pruneInfo_t, void *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseDpruneCsr2csrNnzByPercentage");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnzA, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, percentage, descrC, csrSortedRowPtrC,
-                  nnzTotalDevHostPtr, info, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSpruneCsr2csrByPercentage(
-    cusparseHandle_t handle, int m, int n, int nnzA,
-    const cusparseMatDescr_t descrA, const float *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA, float percentage,
-    const cusparseMatDescr_t descrC, float *csrSortedValC,
-    const int *csrSortedRowPtrC, int *csrSortedColIndC, pruneInfo_t info,
-    void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const cusparseMatDescr_t, const float *,
-      const int *, const int *, float, const cusparseMatDescr_t, float *,
-      const int *, int *, pruneInfo_t, void *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseSpruneCsr2csrByPercentage");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnzA, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, percentage, descrC, csrSortedValC,
-                  csrSortedRowPtrC, csrSortedColIndC, info, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDpruneCsr2csrByPercentage(
-    cusparseHandle_t handle, int m, int n, int nnzA,
-    const cusparseMatDescr_t descrA, const double *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA, float percentage,
-    const cusparseMatDescr_t descrC, double *csrSortedValC,
-    const int *csrSortedRowPtrC, int *csrSortedColIndC, pruneInfo_t info,
-    void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const cusparseMatDescr_t, const double *,
-      const int *, const int *, float, const cusparseMatDescr_t, double *,
-      const int *, int *, pruneInfo_t, void *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseDpruneCsr2csrByPercentage");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnzA, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, percentage, descrC, csrSortedValC,
-                  csrSortedRowPtrC, csrSortedColIndC, info, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCsr2cscEx2(
-    cusparseHandle_t handle, int m, int n, int nnz, const void *csrVal,
-    const int *csrRowPtr, const int *csrColInd, void *cscVal, int *cscColPtr,
-    int *cscRowInd, cudaDataType valType, cusparseAction_t copyValues,
-    cusparseIndexBase_t idxBase, cusparseCsr2CscAlg_t alg, void *buffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const void *, const int *, const int *,
-      void *, int *, int *, cudaDataType, cusparseAction_t, cusparseIndexBase_t,
-      cusparseCsr2CscAlg_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCsr2cscEx2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnz, csrVal, csrRowPtr, csrColInd, cscVal,
-                  cscColPtr, cscRowInd, valType, copyValues, idxBase, alg,
-                  buffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCsr2cscEx2_bufferSize(
-    cusparseHandle_t handle, int m, int n, int nnz, const void *csrVal,
-    const int *csrRowPtr, const int *csrColInd, void *cscVal, int *cscColPtr,
-    int *cscRowInd, cudaDataType valType, cusparseAction_t copyValues,
-    cusparseIndexBase_t idxBase, cusparseCsr2CscAlg_t alg, size_t *bufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const void *, const int *, const int *,
-      void *, int *, int *, cudaDataType, cusparseAction_t, cusparseIndexBase_t,
-      cusparseCsr2CscAlg_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCsr2cscEx2_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnz, csrVal, csrRowPtr, csrColInd, cscVal,
-                  cscColPtr, cscRowInd, valType, copyValues, idxBase, alg,
-                  bufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseCreateSpVec(cusparseSpVecDescr_t *spVecDescr, int64_t size, int64_t nnz,
-                    void *indices, void *values, cusparseIndexType_t idxType,
-                    cusparseIndexBase_t idxBase, cudaDataType valueType) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseSpVecDescr_t *, int64_t, int64_t, void *, void *,
-      cusparseIndexType_t, cusparseIndexBase_t, cudaDataType);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCreateSpVec");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(spVecDescr, size, nnz, indices, values, idxType, idxBase,
-                  valueType);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseDestroySpVec(cusparseSpVecDescr_t spVecDescr) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(cusparseSpVecDescr_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDestroySpVec");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(spVecDescr);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSpVecGet(cusparseSpVecDescr_t spVecDescr,
-                                              int64_t *size, int64_t *nnz,
-                                              void **indices, void **values,
-                                              cusparseIndexType_t *idxType,
-                                              cusparseIndexBase_t *idxBase,
-                                              cudaDataType *valueType) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseSpVecDescr_t, int64_t *, int64_t *, void **, void **,
-      cusparseIndexType_t *, cusparseIndexBase_t *, cudaDataType *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSpVecGet");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(spVecDescr, size, nnz, indices, values, idxType, idxBase,
-                  valueType);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSpVecGetIndexBase(
-    cusparseSpVecDescr_t spVecDescr, cusparseIndexBase_t *idxBase) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(cusparseSpVecDescr_t,
-                                                  cusparseIndexBase_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSpVecGetIndexBase");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(spVecDescr, idxBase);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseSpVecGetValues(cusparseSpVecDescr_t spVecDescr, void **values) {
-  using FuncPtr =
-      cusparseStatus_t(CUSPARSEAPI *)(cusparseSpVecDescr_t, void **);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSpVecGetValues");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(spVecDescr, values);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseSpVecSetValues(cusparseSpVecDescr_t spVecDescr, void *values) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(cusparseSpVecDescr_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSpVecSetValues");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(spVecDescr, values);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseCreateDnVec(cusparseDnVecDescr_t *dnVecDescr, int64_t size,
-                    void *values, cudaDataType valueType) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseDnVecDescr_t *, int64_t, void *, cudaDataType);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCreateDnVec");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dnVecDescr, size, values, valueType);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseDestroyDnVec(cusparseDnVecDescr_t dnVecDescr) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(cusparseDnVecDescr_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDestroyDnVec");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dnVecDescr);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDnVecGet(cusparseDnVecDescr_t dnVecDescr,
-                                              int64_t *size, void **values,
-                                              cudaDataType *valueType) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseDnVecDescr_t, int64_t *, void **, cudaDataType *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDnVecGet");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dnVecDescr, size, values, valueType);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseDnVecGetValues(cusparseDnVecDescr_t dnVecDescr, void **values) {
-  using FuncPtr =
-      cusparseStatus_t(CUSPARSEAPI *)(cusparseDnVecDescr_t, void **);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDnVecGetValues");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dnVecDescr, values);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseDnVecSetValues(cusparseDnVecDescr_t dnVecDescr, void *values) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(cusparseDnVecDescr_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDnVecSetValues");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dnVecDescr, values);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseDestroySpMat(cusparseSpMatDescr_t spMatDescr) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(cusparseSpMatDescr_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDestroySpMat");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(spMatDescr);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSpMatGetFormat(
-    cusparseSpMatDescr_t spMatDescr, cusparseFormat_t *format) {
-  using FuncPtr =
-      cusparseStatus_t(CUSPARSEAPI *)(cusparseSpMatDescr_t, cusparseFormat_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSpMatGetFormat");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(spMatDescr, format);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSpMatGetIndexBase(
-    cusparseSpMatDescr_t spMatDescr, cusparseIndexBase_t *idxBase) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(cusparseSpMatDescr_t,
-                                                  cusparseIndexBase_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSpMatGetIndexBase");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(spMatDescr, idxBase);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseSpMatGetValues(cusparseSpMatDescr_t spMatDescr, void **values) {
-  using FuncPtr =
-      cusparseStatus_t(CUSPARSEAPI *)(cusparseSpMatDescr_t, void **);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSpMatGetValues");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(spMatDescr, values);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseSpMatSetValues(cusparseSpMatDescr_t spMatDescr, void *values) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(cusparseSpMatDescr_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSpMatSetValues");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(spMatDescr, values);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseSpMatGetSize(cusparseSpMatDescr_t spMatDescr, int64_t *rows,
-                     int64_t *cols, int64_t *nnz) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseSpMatDescr_t, int64_t *, int64_t *, int64_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSpMatGetSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(spMatDescr, rows, cols, nnz);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseSpMatSetStridedBatch(cusparseSpMatDescr_t spMatDescr, int batchCount) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(cusparseSpMatDescr_t, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSpMatSetStridedBatch");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(spMatDescr, batchCount);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseSpMatGetStridedBatch(cusparseSpMatDescr_t spMatDescr, int *batchCount) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(cusparseSpMatDescr_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSpMatGetStridedBatch");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(spMatDescr, batchCount);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCooSetStridedBatch(
-    cusparseSpMatDescr_t spMatDescr, int batchCount, int64_t batchStride) {
-  using FuncPtr =
-      cusparseStatus_t(CUSPARSEAPI *)(cusparseSpMatDescr_t, int, int64_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCooSetStridedBatch");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(spMatDescr, batchCount, batchStride);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCsrSetStridedBatch(
-    cusparseSpMatDescr_t spMatDescr, int batchCount, int64_t offsetsBatchStride,
-    int64_t columnsValuesBatchStride) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(cusparseSpMatDescr_t, int,
-                                                  int64_t, int64_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCsrSetStridedBatch");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(spMatDescr, batchCount, offsetsBatchStride,
-                  columnsValuesBatchStride);
-}
-
-#if CUDA_VERSION >= 11030
-
-cusparseStatus_t CUSPARSEAPI cusparseSpMatGetAttribute(
-    cusparseSpMatDescr_t spMatDescr, cusparseSpMatAttribute_t attribute,
-    void *data, size_t dataSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseSpMatDescr_t, cusparseSpMatAttribute_t, void *, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSpMatGetAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(spMatDescr, attribute, data, dataSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSpMatSetAttribute(
-    cusparseSpMatDescr_t spMatDescr, cusparseSpMatAttribute_t attribute,
-    void *data, size_t dataSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseSpMatDescr_t, cusparseSpMatAttribute_t, void *, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSpMatSetAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(spMatDescr, attribute, data, dataSize);
-}
-
-#endif  // CUDA_VERSION >= 11030
-
-cusparseStatus_t CUSPARSEAPI cusparseCreateCsr(
-    cusparseSpMatDescr_t *spMatDescr, int64_t rows, int64_t cols, int64_t nnz,
-    void *csrRowOffsets, void *csrColInd, void *csrValues,
-    cusparseIndexType_t csrRowOffsetsType, cusparseIndexType_t csrColIndType,
-    cusparseIndexBase_t idxBase, cudaDataType valueType) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseSpMatDescr_t *, int64_t, int64_t, int64_t, void *, void *, void *,
-      cusparseIndexType_t, cusparseIndexType_t, cusparseIndexBase_t,
-      cudaDataType);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCreateCsr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(spMatDescr, rows, cols, nnz, csrRowOffsets, csrColInd,
-                  csrValues, csrRowOffsetsType, csrColIndType, idxBase,
-                  valueType);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCreateCsc(
-    cusparseSpMatDescr_t *spMatDescr, int64_t rows, int64_t cols, int64_t nnz,
-    void *cscColOffsets, void *cscRowInd, void *cscValues,
-    cusparseIndexType_t cscColOffsetsType, cusparseIndexType_t cscRowIndType,
-    cusparseIndexBase_t idxBase, cudaDataType valueType) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseSpMatDescr_t *, int64_t, int64_t, int64_t, void *, void *, void *,
-      cusparseIndexType_t, cusparseIndexType_t, cusparseIndexBase_t,
-      cudaDataType);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCreateCsc");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(spMatDescr, rows, cols, nnz, cscColOffsets, cscRowInd,
-                  cscValues, cscColOffsetsType, cscRowIndType, idxBase,
-                  valueType);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCsrGet(
-    cusparseSpMatDescr_t spMatDescr, int64_t *rows, int64_t *cols, int64_t *nnz,
-    void **csrRowOffsets, void **csrColInd, void **csrValues,
-    cusparseIndexType_t *csrRowOffsetsType, cusparseIndexType_t *csrColIndType,
-    cusparseIndexBase_t *idxBase, cudaDataType *valueType) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseSpMatDescr_t, int64_t *, int64_t *, int64_t *, void **, void **,
-      void **, cusparseIndexType_t *, cusparseIndexType_t *,
-      cusparseIndexBase_t *, cudaDataType *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCsrGet");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(spMatDescr, rows, cols, nnz, csrRowOffsets, csrColInd,
-                  csrValues, csrRowOffsetsType, csrColIndType, idxBase,
-                  valueType);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseCsrSetPointers(cusparseSpMatDescr_t spMatDescr, void *csrRowOffsets,
-                       void *csrColInd, void *csrValues) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(cusparseSpMatDescr_t, void *,
-                                                  void *, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCsrSetPointers");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(spMatDescr, csrRowOffsets, csrColInd, csrValues);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseCscSetPointers(cusparseSpMatDescr_t spMatDescr, void *cscColOffsets,
-                       void *cscRowInd, void *cscValues) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(cusparseSpMatDescr_t, void *,
-                                                  void *, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCscSetPointers");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(spMatDescr, cscColOffsets, cscRowInd, cscValues);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCreateCoo(cusparseSpMatDescr_t *spMatDescr,
-                                               int64_t rows, int64_t cols,
-                                               int64_t nnz, void *cooRowInd,
-                                               void *cooColInd, void *cooValues,
-                                               cusparseIndexType_t cooIdxType,
-                                               cusparseIndexBase_t idxBase,
-                                               cudaDataType valueType) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseSpMatDescr_t *, int64_t, int64_t, int64_t, void *, void *, void *,
-      cusparseIndexType_t, cusparseIndexBase_t, cudaDataType);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCreateCoo");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(spMatDescr, rows, cols, nnz, cooRowInd, cooColInd, cooValues,
-                  cooIdxType, idxBase, valueType);
-}
-
-CUSPARSE_DEPRECATED(cusparseCreateCoo)
-cusparseStatus_t CUSPARSEAPI cusparseCreateCooAoS(
-    cusparseSpMatDescr_t *spMatDescr, int64_t rows, int64_t cols, int64_t nnz,
-    void *cooInd, void *cooValues, cusparseIndexType_t cooIdxType,
-    cusparseIndexBase_t idxBase, cudaDataType valueType) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseSpMatDescr_t *, int64_t, int64_t, int64_t, void *, void *,
-      cusparseIndexType_t, cusparseIndexBase_t, cudaDataType);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCreateCooAoS");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(spMatDescr, rows, cols, nnz, cooInd, cooValues, cooIdxType,
-                  idxBase, valueType);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCooGet(
-    cusparseSpMatDescr_t spMatDescr, int64_t *rows, int64_t *cols, int64_t *nnz,
-    void **cooRowInd,  // COO row indices
-    void **cooColInd,  // COO column indices
-    void **cooValues,  // COO values
-    cusparseIndexType_t *idxType, cusparseIndexBase_t *idxBase,
-    cudaDataType *valueType) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseSpMatDescr_t, int64_t *, int64_t *, int64_t *, void **, void **,
-      void **, cusparseIndexType_t *, cusparseIndexBase_t *, cudaDataType *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCooGet");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(spMatDescr, rows, cols, nnz, cooRowInd, cooColInd, cooValues,
-                  idxType, idxBase, valueType);
-}
-
-CUSPARSE_DEPRECATED(cusparseCooGet)
-cusparseStatus_t CUSPARSEAPI cusparseCooAoSGet(cusparseSpMatDescr_t spMatDescr,
-                                               int64_t *rows, int64_t *cols,
-                                               int64_t *nnz,
-                                               void **cooInd,     // COO indices
-                                               void **cooValues,  // COO values
-                                               cusparseIndexType_t *idxType,
-                                               cusparseIndexBase_t *idxBase,
-                                               cudaDataType *valueType) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseSpMatDescr_t, int64_t *, int64_t *, int64_t *, void **, void **,
-      cusparseIndexType_t *, cusparseIndexBase_t *, cudaDataType *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCooAoSGet");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(spMatDescr, rows, cols, nnz, cooInd, cooValues, idxType,
-                  idxBase, valueType);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseCooSetPointers(cusparseSpMatDescr_t spMatDescr, void *cooRows,
-                       void *cooColumns, void *cooValues) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(cusparseSpMatDescr_t, void *,
-                                                  void *, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCooSetPointers");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(spMatDescr, cooRows, cooColumns, cooValues);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCreateBlockedEll(
-    cusparseSpMatDescr_t *spMatDescr, int64_t rows, int64_t cols,
-    int64_t ellBlockSize, int64_t ellCols, void *ellColInd, void *ellValue,
-    cusparseIndexType_t ellIdxType, cusparseIndexBase_t idxBase,
-    cudaDataType valueType) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseSpMatDescr_t *, int64_t, int64_t, int64_t, int64_t, void *,
-      void *, cusparseIndexType_t, cusparseIndexBase_t, cudaDataType);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCreateBlockedEll");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(spMatDescr, rows, cols, ellBlockSize, ellCols, ellColInd,
-                  ellValue, ellIdxType, idxBase, valueType);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseBlockedEllGet(
-    cusparseSpMatDescr_t spMatDescr, int64_t *rows, int64_t *cols,
-    int64_t *ellBlockSize, int64_t *ellCols, void **ellColInd, void **ellValue,
-    cusparseIndexType_t *ellIdxType, cusparseIndexBase_t *idxBase,
-    cudaDataType *valueType) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseSpMatDescr_t, int64_t *, int64_t *, int64_t *, int64_t *, void **,
-      void **, cusparseIndexType_t *, cusparseIndexBase_t *, cudaDataType *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseBlockedEllGet");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(spMatDescr, rows, cols, ellBlockSize, ellCols, ellColInd,
-                  ellValue, ellIdxType, idxBase, valueType);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCreateDnMat(
-    cusparseDnMatDescr_t *dnMatDescr, int64_t rows, int64_t cols, int64_t ld,
-    void *values, cudaDataType valueType, cusparseOrder_t order) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseDnMatDescr_t *, int64_t, int64_t, int64_t, void *, cudaDataType,
-      cusparseOrder_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCreateDnMat");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dnMatDescr, rows, cols, ld, values, valueType, order);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseDestroyDnMat(cusparseDnMatDescr_t dnMatDescr) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(cusparseDnMatDescr_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDestroyDnMat");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dnMatDescr);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDnMatGet(cusparseDnMatDescr_t dnMatDescr,
-                                              int64_t *rows, int64_t *cols,
-                                              int64_t *ld, void **values,
-                                              cudaDataType *type,
-                                              cusparseOrder_t *order) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseDnMatDescr_t, int64_t *, int64_t *, int64_t *, void **,
-      cudaDataType *, cusparseOrder_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDnMatGet");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dnMatDescr, rows, cols, ld, values, type, order);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseDnMatGetValues(cusparseDnMatDescr_t dnMatDescr, void **values) {
-  using FuncPtr =
-      cusparseStatus_t(CUSPARSEAPI *)(cusparseDnMatDescr_t, void **);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDnMatGetValues");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dnMatDescr, values);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseDnMatSetValues(cusparseDnMatDescr_t dnMatDescr, void *values) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(cusparseDnMatDescr_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDnMatSetValues");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dnMatDescr, values);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDnMatSetStridedBatch(
-    cusparseDnMatDescr_t dnMatDescr, int batchCount, int64_t batchStride) {
-  using FuncPtr =
-      cusparseStatus_t(CUSPARSEAPI *)(cusparseDnMatDescr_t, int, int64_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDnMatSetStridedBatch");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dnMatDescr, batchCount, batchStride);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDnMatGetStridedBatch(
-    cusparseDnMatDescr_t dnMatDescr, int *batchCount, int64_t *batchStride) {
-  using FuncPtr =
-      cusparseStatus_t(CUSPARSEAPI *)(cusparseDnMatDescr_t, int *, int64_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDnMatGetStridedBatch");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dnMatDescr, batchCount, batchStride);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseAxpby(cusparseHandle_t handle,
-                                           const void *alpha,
-                                           cusparseSpVecDescr_t vecX,
-                                           const void *beta,
-                                           cusparseDnVecDescr_t vecY) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, const void *, cusparseSpVecDescr_t, const void *,
-      cusparseDnVecDescr_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseAxpby");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, alpha, vecX, beta, vecY);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseGather(cusparseHandle_t handle,
-                                            cusparseDnVecDescr_t vecY,
-                                            cusparseSpVecDescr_t vecX) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDnVecDescr_t, cusparseSpVecDescr_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseGather");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, vecY, vecX);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScatter(cusparseHandle_t handle,
-                                             cusparseSpVecDescr_t vecX,
-                                             cusparseDnVecDescr_t vecY) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseSpVecDescr_t, cusparseDnVecDescr_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScatter");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, vecX, vecY);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseRot(cusparseHandle_t handle,
-                                         const void *c_coeff,
-                                         const void *s_coeff,
-                                         cusparseSpVecDescr_t vecX,
-                                         cusparseDnVecDescr_t vecY) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, const void *, const void *, cusparseSpVecDescr_t,
-      cusparseDnVecDescr_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseRot");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, c_coeff, s_coeff, vecX, vecY);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSpVV_bufferSize(
-    cusparseHandle_t handle, cusparseOperation_t opX, cusparseSpVecDescr_t vecX,
-    cusparseDnVecDescr_t vecY, const void *result, cudaDataType computeType,
-    size_t *bufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, cusparseSpVecDescr_t,
-      cusparseDnVecDescr_t, const void *, cudaDataType, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSpVV_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, opX, vecX, vecY, result, computeType, bufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseSpVV(cusparseHandle_t handle, cusparseOperation_t opX,
-             cusparseSpVecDescr_t vecX, cusparseDnVecDescr_t vecY, void *result,
-             cudaDataType computeType, void *externalBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, cusparseSpVecDescr_t,
-      cusparseDnVecDescr_t, void *, cudaDataType, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSpVV");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, opX, vecX, vecY, result, computeType, externalBuffer);
-}
-
-#if CUSPARSE_VERSION >= 11300
-
-cusparseStatus_t CUSPARSEAPI cusparseSparseToDense_bufferSize(
-    cusparseHandle_t handle, cusparseSpMatDescr_t matA,
-    cusparseDnMatDescr_t matB, cusparseSparseToDenseAlg_t alg,
-    size_t *bufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseSpMatDescr_t, cusparseDnMatDescr_t,
-      cusparseSparseToDenseAlg_t, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseSparseToDense_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, matA, matB, alg, bufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSparseToDense(
-    cusparseHandle_t handle, cusparseSpMatDescr_t matA,
-    cusparseDnMatDescr_t matB, cusparseSparseToDenseAlg_t alg, void *buffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseSpMatDescr_t, cusparseDnMatDescr_t,
-      cusparseSparseToDenseAlg_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSparseToDense");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, matA, matB, alg, buffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDenseToSparse_bufferSize(
-    cusparseHandle_t handle, cusparseDnMatDescr_t matA,
-    cusparseSpMatDescr_t matB, cusparseDenseToSparseAlg_t alg,
-    size_t *bufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDnMatDescr_t, cusparseSpMatDescr_t,
-      cusparseDenseToSparseAlg_t, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseDenseToSparse_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, matA, matB, alg, bufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDenseToSparse_analysis(
-    cusparseHandle_t handle, cusparseDnMatDescr_t matA,
-    cusparseSpMatDescr_t matB, cusparseDenseToSparseAlg_t alg, void *buffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDnMatDescr_t, cusparseSpMatDescr_t,
-      cusparseDenseToSparseAlg_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDenseToSparse_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, matA, matB, alg, buffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDenseToSparse_convert(
-    cusparseHandle_t handle, cusparseDnMatDescr_t matA,
-    cusparseSpMatDescr_t matB, cusparseDenseToSparseAlg_t alg, void *buffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDnMatDescr_t, cusparseSpMatDescr_t,
-      cusparseDenseToSparseAlg_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDenseToSparse_convert");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, matA, matB, alg, buffer);
-}
-
-#endif  // CUSPARSE_VERSION >= 11300
-
-cusparseStatus_t CUSPARSEAPI cusparseSpMV(
-    cusparseHandle_t handle, cusparseOperation_t opA, const void *alpha,
-    cusparseSpMatDescr_t matA, cusparseDnVecDescr_t vecX, const void *beta,
-    cusparseDnVecDescr_t vecY, cudaDataType computeType, cusparseSpMVAlg_t alg,
-    void *externalBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, const void *, cusparseSpMatDescr_t,
-      cusparseDnVecDescr_t, const void *, cusparseDnVecDescr_t, cudaDataType,
-      cusparseSpMVAlg_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSpMV");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, opA, alpha, matA, vecX, beta, vecY, computeType, alg,
-                  externalBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSpMV_bufferSize(
-    cusparseHandle_t handle, cusparseOperation_t opA, const void *alpha,
-    cusparseSpMatDescr_t matA, cusparseDnVecDescr_t vecX, const void *beta,
-    cusparseDnVecDescr_t vecY, cudaDataType computeType, cusparseSpMVAlg_t alg,
-    size_t *bufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, const void *, cusparseSpMatDescr_t,
-      cusparseDnVecDescr_t, const void *, cusparseDnVecDescr_t, cudaDataType,
-      cusparseSpMVAlg_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSpMV_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, opA, alpha, matA, vecX, beta, vecY, computeType, alg,
-                  bufferSize);
-}
-
-#if CUDA_VERSION >= 11030
-
-cusparseStatus_t CUSPARSEAPI
-cusparseSpSV_createDescr(cusparseSpSVDescr_t *descr) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(cusparseSpSVDescr_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSpSV_createDescr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(descr);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseSpSV_destroyDescr(cusparseSpSVDescr_t descr) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(cusparseSpSVDescr_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSpSV_destroyDescr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(descr);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSpSV_bufferSize(
-    cusparseHandle_t handle, cusparseOperation_t opA, const void *alpha,
-    cusparseSpMatDescr_t matA, cusparseDnVecDescr_t vecX,
-    cusparseDnVecDescr_t vecY, cudaDataType computeType, cusparseSpSVAlg_t alg,
-    cusparseSpSVDescr_t spsvDescr, size_t *bufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, const void *, cusparseSpMatDescr_t,
-      cusparseDnVecDescr_t, cusparseDnVecDescr_t, cudaDataType,
-      cusparseSpSVAlg_t, cusparseSpSVDescr_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSpSV_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, opA, alpha, matA, vecX, vecY, computeType, alg,
-                  spsvDescr, bufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSpSV_analysis(
-    cusparseHandle_t handle, cusparseOperation_t opA, const void *alpha,
-    cusparseSpMatDescr_t matA, cusparseDnVecDescr_t vecX,
-    cusparseDnVecDescr_t vecY, cudaDataType computeType, cusparseSpSVAlg_t alg,
-    cusparseSpSVDescr_t spsvDescr, void *externalBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, const void *, cusparseSpMatDescr_t,
-      cusparseDnVecDescr_t, cusparseDnVecDescr_t, cudaDataType,
-      cusparseSpSVAlg_t, cusparseSpSVDescr_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSpSV_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, opA, alpha, matA, vecX, vecY, computeType, alg,
-                  spsvDescr, externalBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSpSV_solve(
-    cusparseHandle_t handle, cusparseOperation_t opA, const void *alpha,
-    cusparseSpMatDescr_t matA, cusparseDnVecDescr_t vecX,
-    cusparseDnVecDescr_t vecY, cudaDataType computeType, cusparseSpSVAlg_t alg,
-    cusparseSpSVDescr_t spsvDescr) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, const void *, cusparseSpMatDescr_t,
-      cusparseDnVecDescr_t, cusparseDnVecDescr_t, cudaDataType,
-      cusparseSpSVAlg_t, cusparseSpSVDescr_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSpSV_solve");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, opA, alpha, matA, vecX, vecY, computeType, alg,
-                  spsvDescr);
-}
-
-#endif  // CUDA_VERSION >= 11030
-
-cusparseStatus_t CUSPARSEAPI cusparseSpMM_bufferSize(
-    cusparseHandle_t handle, cusparseOperation_t opA, cusparseOperation_t opB,
-    const void *alpha, cusparseSpMatDescr_t matA, cusparseDnMatDescr_t matB,
-    const void *beta, cusparseDnMatDescr_t matC, cudaDataType computeType,
-    cusparseSpMMAlg_t alg, size_t *bufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, cusparseOperation_t, const void *,
-      cusparseSpMatDescr_t, cusparseDnMatDescr_t, const void *,
-      cusparseDnMatDescr_t, cudaDataType, cusparseSpMMAlg_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSpMM_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, opA, opB, alpha, matA, matB, beta, matC, computeType,
-                  alg, bufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSpMM_preprocess(
-    cusparseHandle_t handle, cusparseOperation_t opA, cusparseOperation_t opB,
-    const void *alpha, cusparseSpMatDescr_t matA, cusparseDnMatDescr_t matB,
-    const void *beta, cusparseDnMatDescr_t matC, cudaDataType computeType,
-    cusparseSpMMAlg_t alg, void *externalBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, cusparseOperation_t, const void *,
-      cusparseSpMatDescr_t, cusparseDnMatDescr_t, const void *,
-      cusparseDnMatDescr_t, cudaDataType, cusparseSpMMAlg_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSpMM_preprocess");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, opA, opB, alpha, matA, matB, beta, matC, computeType,
-                  alg, externalBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSpMM(
-    cusparseHandle_t handle, cusparseOperation_t opA, cusparseOperation_t opB,
-    const void *alpha, cusparseSpMatDescr_t matA, cusparseDnMatDescr_t matB,
-    const void *beta, cusparseDnMatDescr_t matC, cudaDataType computeType,
-    cusparseSpMMAlg_t alg, void *externalBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, cusparseOperation_t, const void *,
-      cusparseSpMatDescr_t, cusparseDnMatDescr_t, const void *,
-      cusparseDnMatDescr_t, cudaDataType, cusparseSpMMAlg_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSpMM");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, opA, opB, alpha, matA, matB, beta, matC, computeType,
-                  alg, externalBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseSpGEMM_createDescr(cusparseSpGEMMDescr_t *descr) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(cusparseSpGEMMDescr_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSpGEMM_createDescr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(descr);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseSpGEMM_destroyDescr(cusparseSpGEMMDescr_t descr) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(cusparseSpGEMMDescr_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSpGEMM_destroyDescr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(descr);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSpGEMM_workEstimation(
-    cusparseHandle_t handle, cusparseOperation_t opA, cusparseOperation_t opB,
-    const void *alpha, cusparseSpMatDescr_t matA, cusparseSpMatDescr_t matB,
-    const void *beta, cusparseSpMatDescr_t matC, cudaDataType computeType,
-    cusparseSpGEMMAlg_t alg, cusparseSpGEMMDescr_t spgemmDescr,
-    size_t *bufferSize1, void *externalBuffer1) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, cusparseOperation_t, const void *,
-      cusparseSpMatDescr_t, cusparseSpMatDescr_t, const void *,
-      cusparseSpMatDescr_t, cudaDataType, cusparseSpGEMMAlg_t,
-      cusparseSpGEMMDescr_t, size_t *, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSpGEMM_workEstimation");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, opA, opB, alpha, matA, matB, beta, matC, computeType,
-                  alg, spgemmDescr, bufferSize1, externalBuffer1);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSpGEMM_compute(
-    cusparseHandle_t handle, cusparseOperation_t opA, cusparseOperation_t opB,
-    const void *alpha, cusparseSpMatDescr_t matA, cusparseSpMatDescr_t matB,
-    const void *beta, cusparseSpMatDescr_t matC, cudaDataType computeType,
-    cusparseSpGEMMAlg_t alg, cusparseSpGEMMDescr_t spgemmDescr,
-    size_t *bufferSize2, void *externalBuffer2) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, cusparseOperation_t, const void *,
-      cusparseSpMatDescr_t, cusparseSpMatDescr_t, const void *,
-      cusparseSpMatDescr_t, cudaDataType, cusparseSpGEMMAlg_t,
-      cusparseSpGEMMDescr_t, size_t *, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSpGEMM_compute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, opA, opB, alpha, matA, matB, beta, matC, computeType,
-                  alg, spgemmDescr, bufferSize2, externalBuffer2);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSpGEMM_copy(
-    cusparseHandle_t handle, cusparseOperation_t opA, cusparseOperation_t opB,
-    const void *alpha, cusparseSpMatDescr_t matA, cusparseSpMatDescr_t matB,
-    const void *beta, cusparseSpMatDescr_t matC, cudaDataType computeType,
-    cusparseSpGEMMAlg_t alg, cusparseSpGEMMDescr_t spgemmDescr) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, cusparseOperation_t, const void *,
-      cusparseSpMatDescr_t, cusparseSpMatDescr_t, const void *,
-      cusparseSpMatDescr_t, cudaDataType, cusparseSpGEMMAlg_t,
-      cusparseSpGEMMDescr_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSpGEMM_copy");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, opA, opB, alpha, matA, matB, beta, matC, computeType,
-                  alg, spgemmDescr);
-}
-
-CUSPARSE_DEPRECATED(cusparseSDDMM)
-cusparseStatus_t CUSPARSEAPI cusparseConstrainedGeMM(
-    cusparseHandle_t handle, cusparseOperation_t opA, cusparseOperation_t opB,
-    const void *alpha, cusparseDnMatDescr_t matA, cusparseDnMatDescr_t matB,
-    const void *beta, cusparseSpMatDescr_t matC, cudaDataType computeType,
-    void *externalBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, cusparseOperation_t, const void *,
-      cusparseDnMatDescr_t, cusparseDnMatDescr_t, const void *,
-      cusparseSpMatDescr_t, cudaDataType, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseConstrainedGeMM");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, opA, opB, alpha, matA, matB, beta, matC, computeType,
-                  externalBuffer);
-}
-
-CUSPARSE_DEPRECATED(cusparseSDDMM)
-cusparseStatus_t CUSPARSEAPI cusparseConstrainedGeMM_bufferSize(
-    cusparseHandle_t handle, cusparseOperation_t opA, cusparseOperation_t opB,
-    const void *alpha, cusparseDnMatDescr_t matA, cusparseDnMatDescr_t matB,
-    const void *beta, cusparseSpMatDescr_t matC, cudaDataType computeType,
-    size_t *bufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, cusparseOperation_t, const void *,
-      cusparseDnMatDescr_t, cusparseDnMatDescr_t, const void *,
-      cusparseSpMatDescr_t, cudaDataType, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseConstrainedGeMM_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, opA, opB, alpha, matA, matB, beta, matC, computeType,
-                  bufferSize);
-}
-
-#if CUSPARSE_VERSION >= 11400
-
-cusparseStatus_t CUSPARSEAPI cusparseSDDMM_bufferSize(
-    cusparseHandle_t handle, cusparseOperation_t opA, cusparseOperation_t opB,
-    const void *alpha, cusparseDnMatDescr_t matA, cusparseDnMatDescr_t matB,
-    const void *beta, cusparseSpMatDescr_t matC, cudaDataType computeType,
-    cusparseSDDMMAlg_t alg, size_t *bufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, cusparseOperation_t, const void *,
-      cusparseDnMatDescr_t, cusparseDnMatDescr_t, const void *,
-      cusparseSpMatDescr_t, cudaDataType, cusparseSDDMMAlg_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSDDMM_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, opA, opB, alpha, matA, matB, beta, matC, computeType,
-                  alg, bufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSDDMM_preprocess(
-    cusparseHandle_t handle, cusparseOperation_t opA, cusparseOperation_t opB,
-    const void *alpha, cusparseDnMatDescr_t matA, cusparseDnMatDescr_t matB,
-    const void *beta, cusparseSpMatDescr_t matC, cudaDataType computeType,
-    cusparseSDDMMAlg_t alg, void *externalBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, cusparseOperation_t, const void *,
-      cusparseDnMatDescr_t, cusparseDnMatDescr_t, const void *,
-      cusparseSpMatDescr_t, cudaDataType, cusparseSDDMMAlg_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSDDMM_preprocess");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, opA, opB, alpha, matA, matB, beta, matC, computeType,
-                  alg, externalBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSDDMM(
-    cusparseHandle_t handle, cusparseOperation_t opA, cusparseOperation_t opB,
-    const void *alpha, cusparseDnMatDescr_t matA, cusparseDnMatDescr_t matB,
-    const void *beta, cusparseSpMatDescr_t matC, cudaDataType computeType,
-    cusparseSDDMMAlg_t alg, void *externalBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, cusparseOperation_t, const void *,
-      cusparseDnMatDescr_t, cusparseDnMatDescr_t, const void *,
-      cusparseSpMatDescr_t, cudaDataType, cusparseSDDMMAlg_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSDDMM");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, opA, opB, alpha, matA, matB, beta, matC, computeType,
-                  alg, externalBuffer);
-}
-
-#endif  // CUSPARSE_VERSION >= 11400
-
-}  // extern "C"
diff --git a/third_party/xla/third_party/tsl/tsl/cuda/cusparse_12_0.inc b/third_party/xla/third_party/tsl/tsl/cuda/cusparse_12_0.inc
deleted file mode 100644
index 91641482860d90..00000000000000
--- a/third_party/xla/third_party/tsl/tsl/cuda/cusparse_12_0.inc
+++ /dev/null
@@ -1,6080 +0,0 @@
-// Auto-generated, do not edit.
-
-extern "C" {
-cusparseStatus_t CUSPARSEAPI cusparseCreate(cusparseHandle_t *handle) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(cusparseHandle_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCreate");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDestroy(cusparseHandle_t handle) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(cusparseHandle_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDestroy");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseGetVersion(cusparseHandle_t handle,
-                                                int *version) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(cusparseHandle_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseGetVersion");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, version);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseGetProperty(libraryPropertyType type,
-                                                 int *value) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(libraryPropertyType, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseGetProperty");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(type, value);
-}
-
-const char *CUSPARSEAPI cusparseGetErrorName(cusparseStatus_t status) {
-  using FuncPtr = const char *(CUSPARSEAPI *)(cusparseStatus_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseGetErrorName");
-  if (!func_ptr) return "cusparseGetErrorName symbol not found.";
-  return func_ptr(status);
-}
-
-const char *CUSPARSEAPI cusparseGetErrorString(cusparseStatus_t status) {
-  using FuncPtr = const char *(CUSPARSEAPI *)(cusparseStatus_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseGetErrorString");
-  if (!func_ptr) return "cusparseGetErrorString symbol not found.";
-  return func_ptr(status);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSetStream(cusparseHandle_t handle,
-                                               cudaStream_t streamId) {
-  using FuncPtr =
-      cusparseStatus_t(CUSPARSEAPI *)(cusparseHandle_t, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSetStream");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, streamId);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseGetStream(cusparseHandle_t handle,
-                                               cudaStream_t *streamId) {
-  using FuncPtr =
-      cusparseStatus_t(CUSPARSEAPI *)(cusparseHandle_t, cudaStream_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseGetStream");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, streamId);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseGetPointerMode(cusparseHandle_t handle, cusparsePointerMode_t *mode) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(cusparseHandle_t,
-                                                  cusparsePointerMode_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseGetPointerMode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, mode);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseSetPointerMode(cusparseHandle_t handle, cusparsePointerMode_t mode) {
-  using FuncPtr =
-      cusparseStatus_t(CUSPARSEAPI *)(cusparseHandle_t, cusparsePointerMode_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSetPointerMode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, mode);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseLoggerSetCallback(cusparseLoggerCallback_t callback) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(cusparseLoggerCallback_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseLoggerSetCallback");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(callback);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseLoggerSetFile(FILE *file) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(FILE *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseLoggerSetFile");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(file);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseLoggerOpenFile(const char *logFile) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(const char *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseLoggerOpenFile");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(logFile);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseLoggerSetLevel(int level) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseLoggerSetLevel");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(level);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseLoggerSetMask(int mask) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseLoggerSetMask");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(mask);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseLoggerForceDisable(void) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)();
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseLoggerForceDisable");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr();
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseCreateMatDescr(cusparseMatDescr_t *descrA) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(cusparseMatDescr_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCreateMatDescr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(descrA);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseDestroyMatDescr(cusparseMatDescr_t descrA) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(cusparseMatDescr_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDestroyMatDescr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(descrA);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSetMatType(cusparseMatDescr_t descrA,
-                                                cusparseMatrixType_t type) {
-  using FuncPtr =
-      cusparseStatus_t(CUSPARSEAPI *)(cusparseMatDescr_t, cusparseMatrixType_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSetMatType");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(descrA, type);
-}
-
-cusparseMatrixType_t CUSPARSEAPI
-cusparseGetMatType(const cusparseMatDescr_t descrA) {
-  using FuncPtr = cusparseMatrixType_t(CUSPARSEAPI *)(const cusparseMatDescr_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseGetMatType");
-  return func_ptr(descrA);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseSetMatFillMode(cusparseMatDescr_t descrA, cusparseFillMode_t fillMode) {
-  using FuncPtr =
-      cusparseStatus_t(CUSPARSEAPI *)(cusparseMatDescr_t, cusparseFillMode_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSetMatFillMode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(descrA, fillMode);
-}
-
-cusparseFillMode_t CUSPARSEAPI
-cusparseGetMatFillMode(const cusparseMatDescr_t descrA) {
-  using FuncPtr = cusparseFillMode_t(CUSPARSEAPI *)(const cusparseMatDescr_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseGetMatFillMode");
-  return func_ptr(descrA);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseSetMatDiagType(cusparseMatDescr_t descrA, cusparseDiagType_t diagType) {
-  using FuncPtr =
-      cusparseStatus_t(CUSPARSEAPI *)(cusparseMatDescr_t, cusparseDiagType_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSetMatDiagType");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(descrA, diagType);
-}
-
-cusparseDiagType_t CUSPARSEAPI
-cusparseGetMatDiagType(const cusparseMatDescr_t descrA) {
-  using FuncPtr = cusparseDiagType_t(CUSPARSEAPI *)(const cusparseMatDescr_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseGetMatDiagType");
-  return func_ptr(descrA);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSetMatIndexBase(cusparseMatDescr_t descrA,
-                                                     cusparseIndexBase_t base) {
-  using FuncPtr =
-      cusparseStatus_t(CUSPARSEAPI *)(cusparseMatDescr_t, cusparseIndexBase_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSetMatIndexBase");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(descrA, base);
-}
-
-cusparseIndexBase_t CUSPARSEAPI
-cusparseGetMatIndexBase(const cusparseMatDescr_t descrA) {
-  using FuncPtr = cusparseIndexBase_t(CUSPARSEAPI *)(const cusparseMatDescr_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseGetMatIndexBase");
-  return func_ptr(descrA);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCreateCsric02Info(csric02Info_t *info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(csric02Info_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCreateCsric02Info");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(info);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDestroyCsric02Info(csric02Info_t info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(csric02Info_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDestroyCsric02Info");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(info);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCreateBsric02Info(bsric02Info_t *info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(bsric02Info_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCreateBsric02Info");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(info);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDestroyBsric02Info(bsric02Info_t info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(bsric02Info_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDestroyBsric02Info");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(info);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCreateCsrilu02Info(csrilu02Info_t *info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(csrilu02Info_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCreateCsrilu02Info");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(info);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDestroyCsrilu02Info(csrilu02Info_t info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(csrilu02Info_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDestroyCsrilu02Info");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(info);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCreateBsrilu02Info(bsrilu02Info_t *info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(bsrilu02Info_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCreateBsrilu02Info");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(info);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDestroyBsrilu02Info(bsrilu02Info_t info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(bsrilu02Info_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDestroyBsrilu02Info");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(info);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCreateBsrsv2Info(bsrsv2Info_t *info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(bsrsv2Info_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCreateBsrsv2Info");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(info);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDestroyBsrsv2Info(bsrsv2Info_t info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(bsrsv2Info_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDestroyBsrsv2Info");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(info);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCreateBsrsm2Info(bsrsm2Info_t *info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(bsrsm2Info_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCreateBsrsm2Info");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(info);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDestroyBsrsm2Info(bsrsm2Info_t info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(bsrsm2Info_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDestroyBsrsm2Info");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(info);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCreateCsru2csrInfo(csru2csrInfo_t *info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(csru2csrInfo_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCreateCsru2csrInfo");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(info);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDestroyCsru2csrInfo(csru2csrInfo_t info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(csru2csrInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDestroyCsru2csrInfo");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(info);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseCreateColorInfo(cusparseColorInfo_t *info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(cusparseColorInfo_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCreateColorInfo");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(info);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseDestroyColorInfo(cusparseColorInfo_t info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(cusparseColorInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDestroyColorInfo");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(info);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSetColorAlgs(cusparseColorInfo_t info,
-                                                  cusparseColorAlg_t alg) {
-  using FuncPtr =
-      cusparseStatus_t(CUSPARSEAPI *)(cusparseColorInfo_t, cusparseColorAlg_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSetColorAlgs");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(info, alg);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseGetColorAlgs(cusparseColorInfo_t info,
-                                                  cusparseColorAlg_t *alg) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(cusparseColorInfo_t,
-                                                  cusparseColorAlg_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseGetColorAlgs");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(info, alg);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCreatePruneInfo(pruneInfo_t *info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(pruneInfo_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCreatePruneInfo");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(info);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDestroyPruneInfo(pruneInfo_t info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(pruneInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDestroyPruneInfo");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(info);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseSgemvi(cusparseHandle_t handle, cusparseOperation_t transA, int m,
-               int n, const float *alpha, const float *A, int lda, int nnz,
-               const float *xVal, const int *xInd, const float *beta, float *y,
-               cusparseIndexBase_t idxBase, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, const float *,
-      const float *, int, int, const float *, const int *, const float *,
-      float *, cusparseIndexBase_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSgemvi");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, n, alpha, A, lda, nnz, xVal, xInd, beta, y,
-                  idxBase, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseSgemvi_bufferSize(cusparseHandle_t handle, cusparseOperation_t transA,
-                          int m, int n, int nnz, int *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSgemvi_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, n, nnz, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseDgemvi(cusparseHandle_t handle, cusparseOperation_t transA, int m,
-               int n, const double *alpha, const double *A, int lda, int nnz,
-               const double *xVal, const int *xInd, const double *beta,
-               double *y, cusparseIndexBase_t idxBase, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, const double *,
-      const double *, int, int, const double *, const int *, const double *,
-      double *, cusparseIndexBase_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDgemvi");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, n, alpha, A, lda, nnz, xVal, xInd, beta, y,
-                  idxBase, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseDgemvi_bufferSize(cusparseHandle_t handle, cusparseOperation_t transA,
-                          int m, int n, int nnz, int *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDgemvi_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, n, nnz, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCgemvi(
-    cusparseHandle_t handle, cusparseOperation_t transA, int m, int n,
-    const cuComplex *alpha, const cuComplex *A, int lda, int nnz,
-    const cuComplex *xVal, const int *xInd, const cuComplex *beta, cuComplex *y,
-    cusparseIndexBase_t idxBase, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, const cuComplex *,
-      const cuComplex *, int, int, const cuComplex *, const int *,
-      const cuComplex *, cuComplex *, cusparseIndexBase_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCgemvi");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, n, alpha, A, lda, nnz, xVal, xInd, beta, y,
-                  idxBase, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseCgemvi_bufferSize(cusparseHandle_t handle, cusparseOperation_t transA,
-                          int m, int n, int nnz, int *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCgemvi_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, n, nnz, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZgemvi(
-    cusparseHandle_t handle, cusparseOperation_t transA, int m, int n,
-    const cuDoubleComplex *alpha, const cuDoubleComplex *A, int lda, int nnz,
-    const cuDoubleComplex *xVal, const int *xInd, const cuDoubleComplex *beta,
-    cuDoubleComplex *y, cusparseIndexBase_t idxBase, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, const cuDoubleComplex *,
-      const cuDoubleComplex *, int, int, const cuDoubleComplex *, const int *,
-      const cuDoubleComplex *, cuDoubleComplex *, cusparseIndexBase_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZgemvi");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, n, alpha, A, lda, nnz, xVal, xInd, beta, y,
-                  idxBase, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseZgemvi_bufferSize(cusparseHandle_t handle, cusparseOperation_t transA,
-                          int m, int n, int nnz, int *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZgemvi_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, n, nnz, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSbsrmv(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, int mb, int nb, int nnzb, const float *alpha,
-    const cusparseMatDescr_t descrA, const float *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int blockDim,
-    const float *x, const float *beta, float *y) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, int, int, int,
-      const float *, const cusparseMatDescr_t, const float *, const int *,
-      const int *, int, const float *, const float *, float *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSbsrmv");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, mb, nb, nnzb, alpha, descrA,
-                  bsrSortedValA, bsrSortedRowPtrA, bsrSortedColIndA, blockDim,
-                  x, beta, y);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDbsrmv(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, int mb, int nb, int nnzb, const double *alpha,
-    const cusparseMatDescr_t descrA, const double *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int blockDim,
-    const double *x, const double *beta, double *y) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, int, int, int,
-      const double *, const cusparseMatDescr_t, const double *, const int *,
-      const int *, int, const double *, const double *, double *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDbsrmv");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, mb, nb, nnzb, alpha, descrA,
-                  bsrSortedValA, bsrSortedRowPtrA, bsrSortedColIndA, blockDim,
-                  x, beta, y);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseCbsrmv(cusparseHandle_t handle, cusparseDirection_t dirA,
-               cusparseOperation_t transA, int mb, int nb, int nnzb,
-               const cuComplex *alpha, const cusparseMatDescr_t descrA,
-               const cuComplex *bsrSortedValA, const int *bsrSortedRowPtrA,
-               const int *bsrSortedColIndA, int blockDim, const cuComplex *x,
-               const cuComplex *beta, cuComplex *y) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, int, int, int,
-      const cuComplex *, const cusparseMatDescr_t, const cuComplex *,
-      const int *, const int *, int, const cuComplex *, const cuComplex *,
-      cuComplex *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCbsrmv");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, mb, nb, nnzb, alpha, descrA,
-                  bsrSortedValA, bsrSortedRowPtrA, bsrSortedColIndA, blockDim,
-                  x, beta, y);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZbsrmv(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, int mb, int nb, int nnzb,
-    const cuDoubleComplex *alpha, const cusparseMatDescr_t descrA,
-    const cuDoubleComplex *bsrSortedValA, const int *bsrSortedRowPtrA,
-    const int *bsrSortedColIndA, int blockDim, const cuDoubleComplex *x,
-    const cuDoubleComplex *beta, cuDoubleComplex *y) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, int, int, int,
-      const cuDoubleComplex *, const cusparseMatDescr_t,
-      const cuDoubleComplex *, const int *, const int *, int,
-      const cuDoubleComplex *, const cuDoubleComplex *, cuDoubleComplex *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZbsrmv");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, mb, nb, nnzb, alpha, descrA,
-                  bsrSortedValA, bsrSortedRowPtrA, bsrSortedColIndA, blockDim,
-                  x, beta, y);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseSbsrxmv(cusparseHandle_t handle, cusparseDirection_t dirA,
-                cusparseOperation_t transA, int sizeOfMask, int mb, int nb,
-                int nnzb, const float *alpha, const cusparseMatDescr_t descrA,
-                const float *bsrSortedValA, const int *bsrSortedMaskPtrA,
-                const int *bsrSortedRowPtrA, const int *bsrSortedEndPtrA,
-                const int *bsrSortedColIndA, int blockDim, const float *x,
-                const float *beta, float *y) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, int, int, int,
-      int, const float *, const cusparseMatDescr_t, const float *, const int *,
-      const int *, const int *, const int *, int, const float *, const float *,
-      float *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSbsrxmv");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, sizeOfMask, mb, nb, nnzb, alpha, descrA,
-                  bsrSortedValA, bsrSortedMaskPtrA, bsrSortedRowPtrA,
-                  bsrSortedEndPtrA, bsrSortedColIndA, blockDim, x, beta, y);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseDbsrxmv(cusparseHandle_t handle, cusparseDirection_t dirA,
-                cusparseOperation_t transA, int sizeOfMask, int mb, int nb,
-                int nnzb, const double *alpha, const cusparseMatDescr_t descrA,
-                const double *bsrSortedValA, const int *bsrSortedMaskPtrA,
-                const int *bsrSortedRowPtrA, const int *bsrSortedEndPtrA,
-                const int *bsrSortedColIndA, int blockDim, const double *x,
-                const double *beta, double *y) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, int, int, int,
-      int, const double *, const cusparseMatDescr_t, const double *,
-      const int *, const int *, const int *, const int *, int, const double *,
-      const double *, double *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDbsrxmv");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, sizeOfMask, mb, nb, nnzb, alpha, descrA,
-                  bsrSortedValA, bsrSortedMaskPtrA, bsrSortedRowPtrA,
-                  bsrSortedEndPtrA, bsrSortedColIndA, blockDim, x, beta, y);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCbsrxmv(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, int sizeOfMask, int mb, int nb, int nnzb,
-    const cuComplex *alpha, const cusparseMatDescr_t descrA,
-    const cuComplex *bsrSortedValA, const int *bsrSortedMaskPtrA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedEndPtrA,
-    const int *bsrSortedColIndA, int blockDim, const cuComplex *x,
-    const cuComplex *beta, cuComplex *y) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, int, int, int,
-      int, const cuComplex *, const cusparseMatDescr_t, const cuComplex *,
-      const int *, const int *, const int *, const int *, int,
-      const cuComplex *, const cuComplex *, cuComplex *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCbsrxmv");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, sizeOfMask, mb, nb, nnzb, alpha, descrA,
-                  bsrSortedValA, bsrSortedMaskPtrA, bsrSortedRowPtrA,
-                  bsrSortedEndPtrA, bsrSortedColIndA, blockDim, x, beta, y);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZbsrxmv(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, int sizeOfMask, int mb, int nb, int nnzb,
-    const cuDoubleComplex *alpha, const cusparseMatDescr_t descrA,
-    const cuDoubleComplex *bsrSortedValA, const int *bsrSortedMaskPtrA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedEndPtrA,
-    const int *bsrSortedColIndA, int blockDim, const cuDoubleComplex *x,
-    const cuDoubleComplex *beta, cuDoubleComplex *y) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, int, int, int,
-      int, const cuDoubleComplex *, const cusparseMatDescr_t,
-      const cuDoubleComplex *, const int *, const int *, const int *,
-      const int *, int, const cuDoubleComplex *, const cuDoubleComplex *,
-      cuDoubleComplex *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZbsrxmv");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, sizeOfMask, mb, nb, nnzb, alpha, descrA,
-                  bsrSortedValA, bsrSortedMaskPtrA, bsrSortedRowPtrA,
-                  bsrSortedEndPtrA, bsrSortedColIndA, blockDim, x, beta, y);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseXbsrsv2_zeroPivot(cusparseHandle_t handle,
-                                                       bsrsv2Info_t info,
-                                                       int *position) {
-  using FuncPtr =
-      cusparseStatus_t(CUSPARSEAPI *)(cusparseHandle_t, bsrsv2Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseXbsrsv2_zeroPivot");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, info, position);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSbsrsv2_bufferSize(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, float *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int blockDim,
-    bsrsv2Info_t info, int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, int, int,
-      const cusparseMatDescr_t, float *, const int *, const int *, int,
-      bsrsv2Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSbsrsv2_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, mb, nnzb, descrA, bsrSortedValA,
-                  bsrSortedRowPtrA, bsrSortedColIndA, blockDim, info,
-                  pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDbsrsv2_bufferSize(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, double *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int blockDim,
-    bsrsv2Info_t info, int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, int, int,
-      const cusparseMatDescr_t, double *, const int *, const int *, int,
-      bsrsv2Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDbsrsv2_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, mb, nnzb, descrA, bsrSortedValA,
-                  bsrSortedRowPtrA, bsrSortedColIndA, blockDim, info,
-                  pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCbsrsv2_bufferSize(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, cuComplex *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int blockDim,
-    bsrsv2Info_t info, int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, int, int,
-      const cusparseMatDescr_t, cuComplex *, const int *, const int *, int,
-      bsrsv2Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCbsrsv2_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, mb, nnzb, descrA, bsrSortedValA,
-                  bsrSortedRowPtrA, bsrSortedColIndA, blockDim, info,
-                  pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZbsrsv2_bufferSize(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, cuDoubleComplex *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int blockDim,
-    bsrsv2Info_t info, int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, int, int,
-      const cusparseMatDescr_t, cuDoubleComplex *, const int *, const int *,
-      int, bsrsv2Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZbsrsv2_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, mb, nnzb, descrA, bsrSortedValA,
-                  bsrSortedRowPtrA, bsrSortedColIndA, blockDim, info,
-                  pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSbsrsv2_bufferSizeExt(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, float *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int blockSize,
-    bsrsv2Info_t info, size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, int, int,
-      const cusparseMatDescr_t, float *, const int *, const int *, int,
-      bsrsv2Info_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSbsrsv2_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, mb, nnzb, descrA, bsrSortedValA,
-                  bsrSortedRowPtrA, bsrSortedColIndA, blockSize, info,
-                  pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDbsrsv2_bufferSizeExt(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, double *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int blockSize,
-    bsrsv2Info_t info, size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, int, int,
-      const cusparseMatDescr_t, double *, const int *, const int *, int,
-      bsrsv2Info_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDbsrsv2_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, mb, nnzb, descrA, bsrSortedValA,
-                  bsrSortedRowPtrA, bsrSortedColIndA, blockSize, info,
-                  pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCbsrsv2_bufferSizeExt(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, cuComplex *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int blockSize,
-    bsrsv2Info_t info, size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, int, int,
-      const cusparseMatDescr_t, cuComplex *, const int *, const int *, int,
-      bsrsv2Info_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCbsrsv2_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, mb, nnzb, descrA, bsrSortedValA,
-                  bsrSortedRowPtrA, bsrSortedColIndA, blockSize, info,
-                  pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZbsrsv2_bufferSizeExt(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, cuDoubleComplex *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int blockSize,
-    bsrsv2Info_t info, size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, int, int,
-      const cusparseMatDescr_t, cuDoubleComplex *, const int *, const int *,
-      int, bsrsv2Info_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZbsrsv2_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, mb, nnzb, descrA, bsrSortedValA,
-                  bsrSortedRowPtrA, bsrSortedColIndA, blockSize, info,
-                  pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSbsrsv2_analysis(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, const float *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int blockDim,
-    bsrsv2Info_t info, cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, int, int,
-      const cusparseMatDescr_t, const float *, const int *, const int *, int,
-      bsrsv2Info_t, cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSbsrsv2_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, mb, nnzb, descrA, bsrSortedValA,
-                  bsrSortedRowPtrA, bsrSortedColIndA, blockDim, info, policy,
-                  pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDbsrsv2_analysis(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, const double *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int blockDim,
-    bsrsv2Info_t info, cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, int, int,
-      const cusparseMatDescr_t, const double *, const int *, const int *, int,
-      bsrsv2Info_t, cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDbsrsv2_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, mb, nnzb, descrA, bsrSortedValA,
-                  bsrSortedRowPtrA, bsrSortedColIndA, blockDim, info, policy,
-                  pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCbsrsv2_analysis(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, const cuComplex *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int blockDim,
-    bsrsv2Info_t info, cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, int, int,
-      const cusparseMatDescr_t, const cuComplex *, const int *, const int *,
-      int, bsrsv2Info_t, cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCbsrsv2_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, mb, nnzb, descrA, bsrSortedValA,
-                  bsrSortedRowPtrA, bsrSortedColIndA, blockDim, info, policy,
-                  pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZbsrsv2_analysis(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, const cuDoubleComplex *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int blockDim,
-    bsrsv2Info_t info, cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, int, int,
-      const cusparseMatDescr_t, const cuDoubleComplex *, const int *,
-      const int *, int, bsrsv2Info_t, cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZbsrsv2_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, mb, nnzb, descrA, bsrSortedValA,
-                  bsrSortedRowPtrA, bsrSortedColIndA, blockDim, info, policy,
-                  pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSbsrsv2_solve(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, int mb, int nnzb, const float *alpha,
-    const cusparseMatDescr_t descrA, const float *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int blockDim,
-    bsrsv2Info_t info, const float *f, float *x, cusparseSolvePolicy_t policy,
-    void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, int, int,
-      const float *, const cusparseMatDescr_t, const float *, const int *,
-      const int *, int, bsrsv2Info_t, const float *, float *,
-      cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSbsrsv2_solve");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, mb, nnzb, alpha, descrA, bsrSortedValA,
-                  bsrSortedRowPtrA, bsrSortedColIndA, blockDim, info, f, x,
-                  policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDbsrsv2_solve(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, int mb, int nnzb, const double *alpha,
-    const cusparseMatDescr_t descrA, const double *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int blockDim,
-    bsrsv2Info_t info, const double *f, double *x, cusparseSolvePolicy_t policy,
-    void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, int, int,
-      const double *, const cusparseMatDescr_t, const double *, const int *,
-      const int *, int, bsrsv2Info_t, const double *, double *,
-      cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDbsrsv2_solve");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, mb, nnzb, alpha, descrA, bsrSortedValA,
-                  bsrSortedRowPtrA, bsrSortedColIndA, blockDim, info, f, x,
-                  policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCbsrsv2_solve(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, int mb, int nnzb, const cuComplex *alpha,
-    const cusparseMatDescr_t descrA, const cuComplex *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int blockDim,
-    bsrsv2Info_t info, const cuComplex *f, cuComplex *x,
-    cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, int, int,
-      const cuComplex *, const cusparseMatDescr_t, const cuComplex *,
-      const int *, const int *, int, bsrsv2Info_t, const cuComplex *,
-      cuComplex *, cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCbsrsv2_solve");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, mb, nnzb, alpha, descrA, bsrSortedValA,
-                  bsrSortedRowPtrA, bsrSortedColIndA, blockDim, info, f, x,
-                  policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZbsrsv2_solve(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, int mb, int nnzb, const cuDoubleComplex *alpha,
-    const cusparseMatDescr_t descrA, const cuDoubleComplex *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int blockDim,
-    bsrsv2Info_t info, const cuDoubleComplex *f, cuDoubleComplex *x,
-    cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, int, int,
-      const cuDoubleComplex *, const cusparseMatDescr_t,
-      const cuDoubleComplex *, const int *, const int *, int, bsrsv2Info_t,
-      const cuDoubleComplex *, cuDoubleComplex *, cusparseSolvePolicy_t,
-      void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZbsrsv2_solve");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, mb, nnzb, alpha, descrA, bsrSortedValA,
-                  bsrSortedRowPtrA, bsrSortedColIndA, blockDim, info, f, x,
-                  policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSbsrmm(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, cusparseOperation_t transB, int mb, int n,
-    int kb, int nnzb, const float *alpha, const cusparseMatDescr_t descrA,
-    const float *bsrSortedValA, const int *bsrSortedRowPtrA,
-    const int *bsrSortedColIndA, const int blockSize, const float *B,
-    const int ldb, const float *beta, float *C, int ldc) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t,
-      cusparseOperation_t, int, int, int, int, const float *,
-      const cusparseMatDescr_t, const float *, const int *, const int *,
-      const int, const float *, const int, const float *, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSbsrmm");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, transB, mb, n, kb, nnzb, alpha, descrA,
-                  bsrSortedValA, bsrSortedRowPtrA, bsrSortedColIndA, blockSize,
-                  B, ldb, beta, C, ldc);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDbsrmm(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, cusparseOperation_t transB, int mb, int n,
-    int kb, int nnzb, const double *alpha, const cusparseMatDescr_t descrA,
-    const double *bsrSortedValA, const int *bsrSortedRowPtrA,
-    const int *bsrSortedColIndA, const int blockSize, const double *B,
-    const int ldb, const double *beta, double *C, int ldc) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t,
-      cusparseOperation_t, int, int, int, int, const double *,
-      const cusparseMatDescr_t, const double *, const int *, const int *,
-      const int, const double *, const int, const double *, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDbsrmm");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, transB, mb, n, kb, nnzb, alpha, descrA,
-                  bsrSortedValA, bsrSortedRowPtrA, bsrSortedColIndA, blockSize,
-                  B, ldb, beta, C, ldc);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCbsrmm(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, cusparseOperation_t transB, int mb, int n,
-    int kb, int nnzb, const cuComplex *alpha, const cusparseMatDescr_t descrA,
-    const cuComplex *bsrSortedValA, const int *bsrSortedRowPtrA,
-    const int *bsrSortedColIndA, const int blockSize, const cuComplex *B,
-    const int ldb, const cuComplex *beta, cuComplex *C, int ldc) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t,
-      cusparseOperation_t, int, int, int, int, const cuComplex *,
-      const cusparseMatDescr_t, const cuComplex *, const int *, const int *,
-      const int, const cuComplex *, const int, const cuComplex *, cuComplex *,
-      int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCbsrmm");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, transB, mb, n, kb, nnzb, alpha, descrA,
-                  bsrSortedValA, bsrSortedRowPtrA, bsrSortedColIndA, blockSize,
-                  B, ldb, beta, C, ldc);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZbsrmm(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, cusparseOperation_t transB, int mb, int n,
-    int kb, int nnzb, const cuDoubleComplex *alpha,
-    const cusparseMatDescr_t descrA, const cuDoubleComplex *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA,
-    const int blockSize, const cuDoubleComplex *B, const int ldb,
-    const cuDoubleComplex *beta, cuDoubleComplex *C, int ldc) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t,
-      cusparseOperation_t, int, int, int, int, const cuDoubleComplex *,
-      const cusparseMatDescr_t, const cuDoubleComplex *, const int *,
-      const int *, const int, const cuDoubleComplex *, const int,
-      const cuDoubleComplex *, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZbsrmm");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, transB, mb, n, kb, nnzb, alpha, descrA,
-                  bsrSortedValA, bsrSortedRowPtrA, bsrSortedColIndA, blockSize,
-                  B, ldb, beta, C, ldc);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseXbsrsm2_zeroPivot(cusparseHandle_t handle,
-                                                       bsrsm2Info_t info,
-                                                       int *position) {
-  using FuncPtr =
-      cusparseStatus_t(CUSPARSEAPI *)(cusparseHandle_t, bsrsm2Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseXbsrsm2_zeroPivot");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, info, position);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSbsrsm2_bufferSize(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, cusparseOperation_t transXY, int mb, int n,
-    int nnzb, const cusparseMatDescr_t descrA, float *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockSize,
-    bsrsm2Info_t info, int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t,
-      cusparseOperation_t, int, int, int, const cusparseMatDescr_t, float *,
-      const int *, const int *, int, bsrsm2Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSbsrsm2_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, transXY, mb, n, nnzb, descrA,
-                  bsrSortedVal, bsrSortedRowPtr, bsrSortedColInd, blockSize,
-                  info, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDbsrsm2_bufferSize(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, cusparseOperation_t transXY, int mb, int n,
-    int nnzb, const cusparseMatDescr_t descrA, double *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockSize,
-    bsrsm2Info_t info, int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t,
-      cusparseOperation_t, int, int, int, const cusparseMatDescr_t, double *,
-      const int *, const int *, int, bsrsm2Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDbsrsm2_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, transXY, mb, n, nnzb, descrA,
-                  bsrSortedVal, bsrSortedRowPtr, bsrSortedColInd, blockSize,
-                  info, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCbsrsm2_bufferSize(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, cusparseOperation_t transXY, int mb, int n,
-    int nnzb, const cusparseMatDescr_t descrA, cuComplex *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockSize,
-    bsrsm2Info_t info, int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t,
-      cusparseOperation_t, int, int, int, const cusparseMatDescr_t, cuComplex *,
-      const int *, const int *, int, bsrsm2Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCbsrsm2_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, transXY, mb, n, nnzb, descrA,
-                  bsrSortedVal, bsrSortedRowPtr, bsrSortedColInd, blockSize,
-                  info, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZbsrsm2_bufferSize(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, cusparseOperation_t transXY, int mb, int n,
-    int nnzb, const cusparseMatDescr_t descrA, cuDoubleComplex *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockSize,
-    bsrsm2Info_t info, int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t,
-      cusparseOperation_t, int, int, int, const cusparseMatDescr_t,
-      cuDoubleComplex *, const int *, const int *, int, bsrsm2Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZbsrsm2_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, transXY, mb, n, nnzb, descrA,
-                  bsrSortedVal, bsrSortedRowPtr, bsrSortedColInd, blockSize,
-                  info, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSbsrsm2_bufferSizeExt(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, cusparseOperation_t transB, int mb, int n,
-    int nnzb, const cusparseMatDescr_t descrA, float *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockSize,
-    bsrsm2Info_t info, size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t,
-      cusparseOperation_t, int, int, int, const cusparseMatDescr_t, float *,
-      const int *, const int *, int, bsrsm2Info_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSbsrsm2_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, transB, mb, n, nnzb, descrA,
-                  bsrSortedVal, bsrSortedRowPtr, bsrSortedColInd, blockSize,
-                  info, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDbsrsm2_bufferSizeExt(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, cusparseOperation_t transB, int mb, int n,
-    int nnzb, const cusparseMatDescr_t descrA, double *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockSize,
-    bsrsm2Info_t info, size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t,
-      cusparseOperation_t, int, int, int, const cusparseMatDescr_t, double *,
-      const int *, const int *, int, bsrsm2Info_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDbsrsm2_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, transB, mb, n, nnzb, descrA,
-                  bsrSortedVal, bsrSortedRowPtr, bsrSortedColInd, blockSize,
-                  info, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCbsrsm2_bufferSizeExt(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, cusparseOperation_t transB, int mb, int n,
-    int nnzb, const cusparseMatDescr_t descrA, cuComplex *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockSize,
-    bsrsm2Info_t info, size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t,
-      cusparseOperation_t, int, int, int, const cusparseMatDescr_t, cuComplex *,
-      const int *, const int *, int, bsrsm2Info_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCbsrsm2_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, transB, mb, n, nnzb, descrA,
-                  bsrSortedVal, bsrSortedRowPtr, bsrSortedColInd, blockSize,
-                  info, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZbsrsm2_bufferSizeExt(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, cusparseOperation_t transB, int mb, int n,
-    int nnzb, const cusparseMatDescr_t descrA, cuDoubleComplex *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockSize,
-    bsrsm2Info_t info, size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t,
-      cusparseOperation_t, int, int, int, const cusparseMatDescr_t,
-      cuDoubleComplex *, const int *, const int *, int, bsrsm2Info_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZbsrsm2_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, transB, mb, n, nnzb, descrA,
-                  bsrSortedVal, bsrSortedRowPtr, bsrSortedColInd, blockSize,
-                  info, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSbsrsm2_analysis(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, cusparseOperation_t transXY, int mb, int n,
-    int nnzb, const cusparseMatDescr_t descrA, const float *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockSize,
-    bsrsm2Info_t info, cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t,
-      cusparseOperation_t, int, int, int, const cusparseMatDescr_t,
-      const float *, const int *, const int *, int, bsrsm2Info_t,
-      cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSbsrsm2_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, transXY, mb, n, nnzb, descrA,
-                  bsrSortedVal, bsrSortedRowPtr, bsrSortedColInd, blockSize,
-                  info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDbsrsm2_analysis(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, cusparseOperation_t transXY, int mb, int n,
-    int nnzb, const cusparseMatDescr_t descrA, const double *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockSize,
-    bsrsm2Info_t info, cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t,
-      cusparseOperation_t, int, int, int, const cusparseMatDescr_t,
-      const double *, const int *, const int *, int, bsrsm2Info_t,
-      cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDbsrsm2_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, transXY, mb, n, nnzb, descrA,
-                  bsrSortedVal, bsrSortedRowPtr, bsrSortedColInd, blockSize,
-                  info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCbsrsm2_analysis(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, cusparseOperation_t transXY, int mb, int n,
-    int nnzb, const cusparseMatDescr_t descrA, const cuComplex *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockSize,
-    bsrsm2Info_t info, cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t,
-      cusparseOperation_t, int, int, int, const cusparseMatDescr_t,
-      const cuComplex *, const int *, const int *, int, bsrsm2Info_t,
-      cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCbsrsm2_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, transXY, mb, n, nnzb, descrA,
-                  bsrSortedVal, bsrSortedRowPtr, bsrSortedColInd, blockSize,
-                  info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZbsrsm2_analysis(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, cusparseOperation_t transXY, int mb, int n,
-    int nnzb, const cusparseMatDescr_t descrA,
-    const cuDoubleComplex *bsrSortedVal, const int *bsrSortedRowPtr,
-    const int *bsrSortedColInd, int blockSize, bsrsm2Info_t info,
-    cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t,
-      cusparseOperation_t, int, int, int, const cusparseMatDescr_t,
-      const cuDoubleComplex *, const int *, const int *, int, bsrsm2Info_t,
-      cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZbsrsm2_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, transXY, mb, n, nnzb, descrA,
-                  bsrSortedVal, bsrSortedRowPtr, bsrSortedColInd, blockSize,
-                  info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSbsrsm2_solve(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, cusparseOperation_t transXY, int mb, int n,
-    int nnzb, const float *alpha, const cusparseMatDescr_t descrA,
-    const float *bsrSortedVal, const int *bsrSortedRowPtr,
-    const int *bsrSortedColInd, int blockSize, bsrsm2Info_t info,
-    const float *B, int ldb, float *X, int ldx, cusparseSolvePolicy_t policy,
-    void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t,
-      cusparseOperation_t, int, int, int, const float *,
-      const cusparseMatDescr_t, const float *, const int *, const int *, int,
-      bsrsm2Info_t, const float *, int, float *, int, cusparseSolvePolicy_t,
-      void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSbsrsm2_solve");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, transXY, mb, n, nnzb, alpha, descrA,
-                  bsrSortedVal, bsrSortedRowPtr, bsrSortedColInd, blockSize,
-                  info, B, ldb, X, ldx, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDbsrsm2_solve(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, cusparseOperation_t transXY, int mb, int n,
-    int nnzb, const double *alpha, const cusparseMatDescr_t descrA,
-    const double *bsrSortedVal, const int *bsrSortedRowPtr,
-    const int *bsrSortedColInd, int blockSize, bsrsm2Info_t info,
-    const double *B, int ldb, double *X, int ldx, cusparseSolvePolicy_t policy,
-    void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t,
-      cusparseOperation_t, int, int, int, const double *,
-      const cusparseMatDescr_t, const double *, const int *, const int *, int,
-      bsrsm2Info_t, const double *, int, double *, int, cusparseSolvePolicy_t,
-      void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDbsrsm2_solve");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, transXY, mb, n, nnzb, alpha, descrA,
-                  bsrSortedVal, bsrSortedRowPtr, bsrSortedColInd, blockSize,
-                  info, B, ldb, X, ldx, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCbsrsm2_solve(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, cusparseOperation_t transXY, int mb, int n,
-    int nnzb, const cuComplex *alpha, const cusparseMatDescr_t descrA,
-    const cuComplex *bsrSortedVal, const int *bsrSortedRowPtr,
-    const int *bsrSortedColInd, int blockSize, bsrsm2Info_t info,
-    const cuComplex *B, int ldb, cuComplex *X, int ldx,
-    cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t,
-      cusparseOperation_t, int, int, int, const cuComplex *,
-      const cusparseMatDescr_t, const cuComplex *, const int *, const int *,
-      int, bsrsm2Info_t, const cuComplex *, int, cuComplex *, int,
-      cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCbsrsm2_solve");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, transXY, mb, n, nnzb, alpha, descrA,
-                  bsrSortedVal, bsrSortedRowPtr, bsrSortedColInd, blockSize,
-                  info, B, ldb, X, ldx, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZbsrsm2_solve(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, cusparseOperation_t transXY, int mb, int n,
-    int nnzb, const cuDoubleComplex *alpha, const cusparseMatDescr_t descrA,
-    const cuDoubleComplex *bsrSortedVal, const int *bsrSortedRowPtr,
-    const int *bsrSortedColInd, int blockSize, bsrsm2Info_t info,
-    const cuDoubleComplex *B, int ldb, cuDoubleComplex *X, int ldx,
-    cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t,
-      cusparseOperation_t, int, int, int, const cuDoubleComplex *,
-      const cusparseMatDescr_t, const cuDoubleComplex *, const int *,
-      const int *, int, bsrsm2Info_t, const cuDoubleComplex *, int,
-      cuDoubleComplex *, int, cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZbsrsm2_solve");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, transXY, mb, n, nnzb, alpha, descrA,
-                  bsrSortedVal, bsrSortedRowPtr, bsrSortedColInd, blockSize,
-                  info, B, ldb, X, ldx, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsrilu02_numericBoost(
-    cusparseHandle_t handle, csrilu02Info_t info, int enable_boost, double *tol,
-    float *boost_val) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, csrilu02Info_t, int, double *, float *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsrilu02_numericBoost");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, info, enable_boost, tol, boost_val);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsrilu02_numericBoost(
-    cusparseHandle_t handle, csrilu02Info_t info, int enable_boost, double *tol,
-    double *boost_val) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, csrilu02Info_t, int, double *, double *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsrilu02_numericBoost");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, info, enable_boost, tol, boost_val);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsrilu02_numericBoost(
-    cusparseHandle_t handle, csrilu02Info_t info, int enable_boost, double *tol,
-    cuComplex *boost_val) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, csrilu02Info_t, int, double *, cuComplex *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsrilu02_numericBoost");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, info, enable_boost, tol, boost_val);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsrilu02_numericBoost(
-    cusparseHandle_t handle, csrilu02Info_t info, int enable_boost, double *tol,
-    cuDoubleComplex *boost_val) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, csrilu02Info_t, int, double *, cuDoubleComplex *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsrilu02_numericBoost");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, info, enable_boost, tol, boost_val);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseXcsrilu02_zeroPivot(
-    cusparseHandle_t handle, csrilu02Info_t info, int *position) {
-  using FuncPtr =
-      cusparseStatus_t(CUSPARSEAPI *)(cusparseHandle_t, csrilu02Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseXcsrilu02_zeroPivot");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, info, position);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsrilu02_bufferSize(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    float *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, csrilu02Info_t info, int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, float *,
-      const int *, const int *, csrilu02Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsrilu02_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, info, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsrilu02_bufferSize(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    double *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, csrilu02Info_t info, int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, double *,
-      const int *, const int *, csrilu02Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsrilu02_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, info, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsrilu02_bufferSize(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    cuComplex *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, csrilu02Info_t info, int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, cuComplex *,
-      const int *, const int *, csrilu02Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsrilu02_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, info, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsrilu02_bufferSize(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    cuDoubleComplex *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, csrilu02Info_t info, int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, cuDoubleComplex *,
-      const int *, const int *, csrilu02Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsrilu02_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, info, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsrilu02_bufferSizeExt(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    float *csrSortedVal, const int *csrSortedRowPtr, const int *csrSortedColInd,
-    csrilu02Info_t info, size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, float *,
-      const int *, const int *, csrilu02Info_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsrilu02_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedVal, csrSortedRowPtr,
-                  csrSortedColInd, info, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsrilu02_bufferSizeExt(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    double *csrSortedVal, const int *csrSortedRowPtr,
-    const int *csrSortedColInd, csrilu02Info_t info, size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, double *,
-      const int *, const int *, csrilu02Info_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsrilu02_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedVal, csrSortedRowPtr,
-                  csrSortedColInd, info, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsrilu02_bufferSizeExt(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    cuComplex *csrSortedVal, const int *csrSortedRowPtr,
-    const int *csrSortedColInd, csrilu02Info_t info, size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, cuComplex *,
-      const int *, const int *, csrilu02Info_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsrilu02_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedVal, csrSortedRowPtr,
-                  csrSortedColInd, info, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsrilu02_bufferSizeExt(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    cuDoubleComplex *csrSortedVal, const int *csrSortedRowPtr,
-    const int *csrSortedColInd, csrilu02Info_t info, size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, cuDoubleComplex *,
-      const int *, const int *, csrilu02Info_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsrilu02_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedVal, csrSortedRowPtr,
-                  csrSortedColInd, info, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsrilu02_analysis(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    const float *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, csrilu02Info_t info,
-    cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, const float *,
-      const int *, const int *, csrilu02Info_t, cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsrilu02_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsrilu02_analysis(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    const double *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, csrilu02Info_t info,
-    cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, const double *,
-      const int *, const int *, csrilu02Info_t, cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsrilu02_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsrilu02_analysis(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    const cuComplex *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, csrilu02Info_t info,
-    cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, const cuComplex *,
-      const int *, const int *, csrilu02Info_t, cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsrilu02_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsrilu02_analysis(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    const cuDoubleComplex *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, csrilu02Info_t info,
-    cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t,
-      const cuDoubleComplex *, const int *, const int *, csrilu02Info_t,
-      cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsrilu02_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsrilu02(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    float *csrSortedValA_valM, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, csrilu02Info_t info,
-    cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, float *,
-      const int *, const int *, csrilu02Info_t, cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsrilu02");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedValA_valM, csrSortedRowPtrA,
-                  csrSortedColIndA, info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsrilu02(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    double *csrSortedValA_valM, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, csrilu02Info_t info,
-    cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, double *,
-      const int *, const int *, csrilu02Info_t, cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsrilu02");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedValA_valM, csrSortedRowPtrA,
-                  csrSortedColIndA, info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsrilu02(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    cuComplex *csrSortedValA_valM, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, csrilu02Info_t info,
-    cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, cuComplex *,
-      const int *, const int *, csrilu02Info_t, cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsrilu02");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedValA_valM, csrSortedRowPtrA,
-                  csrSortedColIndA, info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsrilu02(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    cuDoubleComplex *csrSortedValA_valM, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, csrilu02Info_t info,
-    cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, cuDoubleComplex *,
-      const int *, const int *, csrilu02Info_t, cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsrilu02");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedValA_valM, csrSortedRowPtrA,
-                  csrSortedColIndA, info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSbsrilu02_numericBoost(
-    cusparseHandle_t handle, bsrilu02Info_t info, int enable_boost, double *tol,
-    float *boost_val) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, bsrilu02Info_t, int, double *, float *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSbsrilu02_numericBoost");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, info, enable_boost, tol, boost_val);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDbsrilu02_numericBoost(
-    cusparseHandle_t handle, bsrilu02Info_t info, int enable_boost, double *tol,
-    double *boost_val) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, bsrilu02Info_t, int, double *, double *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDbsrilu02_numericBoost");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, info, enable_boost, tol, boost_val);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCbsrilu02_numericBoost(
-    cusparseHandle_t handle, bsrilu02Info_t info, int enable_boost, double *tol,
-    cuComplex *boost_val) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, bsrilu02Info_t, int, double *, cuComplex *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCbsrilu02_numericBoost");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, info, enable_boost, tol, boost_val);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZbsrilu02_numericBoost(
-    cusparseHandle_t handle, bsrilu02Info_t info, int enable_boost, double *tol,
-    cuDoubleComplex *boost_val) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, bsrilu02Info_t, int, double *, cuDoubleComplex *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZbsrilu02_numericBoost");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, info, enable_boost, tol, boost_val);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseXbsrilu02_zeroPivot(
-    cusparseHandle_t handle, bsrilu02Info_t info, int *position) {
-  using FuncPtr =
-      cusparseStatus_t(CUSPARSEAPI *)(cusparseHandle_t, bsrilu02Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseXbsrilu02_zeroPivot");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, info, position);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSbsrilu02_bufferSize(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, float *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockDim,
-    bsrilu02Info_t info, int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      float *, const int *, const int *, int, bsrilu02Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSbsrilu02_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, blockDim, info, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDbsrilu02_bufferSize(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, double *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockDim,
-    bsrilu02Info_t info, int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      double *, const int *, const int *, int, bsrilu02Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDbsrilu02_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, blockDim, info, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCbsrilu02_bufferSize(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, cuComplex *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockDim,
-    bsrilu02Info_t info, int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      cuComplex *, const int *, const int *, int, bsrilu02Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCbsrilu02_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, blockDim, info, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZbsrilu02_bufferSize(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, cuDoubleComplex *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockDim,
-    bsrilu02Info_t info, int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      cuDoubleComplex *, const int *, const int *, int, bsrilu02Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZbsrilu02_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, blockDim, info, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSbsrilu02_bufferSizeExt(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, float *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockSize,
-    bsrilu02Info_t info, size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      float *, const int *, const int *, int, bsrilu02Info_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSbsrilu02_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, blockSize, info, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDbsrilu02_bufferSizeExt(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, double *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockSize,
-    bsrilu02Info_t info, size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      double *, const int *, const int *, int, bsrilu02Info_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDbsrilu02_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, blockSize, info, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCbsrilu02_bufferSizeExt(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, cuComplex *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockSize,
-    bsrilu02Info_t info, size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      cuComplex *, const int *, const int *, int, bsrilu02Info_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCbsrilu02_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, blockSize, info, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZbsrilu02_bufferSizeExt(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, cuDoubleComplex *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockSize,
-    bsrilu02Info_t info, size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      cuDoubleComplex *, const int *, const int *, int, bsrilu02Info_t,
-      size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZbsrilu02_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, blockSize, info, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSbsrilu02_analysis(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, float *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockDim,
-    bsrilu02Info_t info, cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      float *, const int *, const int *, int, bsrilu02Info_t,
-      cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSbsrilu02_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, blockDim, info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDbsrilu02_analysis(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, double *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockDim,
-    bsrilu02Info_t info, cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      double *, const int *, const int *, int, bsrilu02Info_t,
-      cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDbsrilu02_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, blockDim, info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCbsrilu02_analysis(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, cuComplex *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockDim,
-    bsrilu02Info_t info, cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      cuComplex *, const int *, const int *, int, bsrilu02Info_t,
-      cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCbsrilu02_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, blockDim, info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZbsrilu02_analysis(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, cuDoubleComplex *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockDim,
-    bsrilu02Info_t info, cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      cuDoubleComplex *, const int *, const int *, int, bsrilu02Info_t,
-      cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZbsrilu02_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, blockDim, info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSbsrilu02(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, float *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockDim,
-    bsrilu02Info_t info, cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      float *, const int *, const int *, int, bsrilu02Info_t,
-      cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSbsrilu02");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, blockDim, info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDbsrilu02(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, double *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockDim,
-    bsrilu02Info_t info, cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      double *, const int *, const int *, int, bsrilu02Info_t,
-      cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDbsrilu02");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, blockDim, info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCbsrilu02(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, cuComplex *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockDim,
-    bsrilu02Info_t info, cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      cuComplex *, const int *, const int *, int, bsrilu02Info_t,
-      cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCbsrilu02");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, blockDim, info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZbsrilu02(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, cuDoubleComplex *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockDim,
-    bsrilu02Info_t info, cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      cuDoubleComplex *, const int *, const int *, int, bsrilu02Info_t,
-      cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZbsrilu02");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, blockDim, info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseXcsric02_zeroPivot(cusparseHandle_t handle,
-                                                        csric02Info_t info,
-                                                        int *position) {
-  using FuncPtr =
-      cusparseStatus_t(CUSPARSEAPI *)(cusparseHandle_t, csric02Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseXcsric02_zeroPivot");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, info, position);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsric02_bufferSize(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    float *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, csric02Info_t info, int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, float *,
-      const int *, const int *, csric02Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsric02_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, info, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsric02_bufferSize(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    double *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, csric02Info_t info, int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, double *,
-      const int *, const int *, csric02Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsric02_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, info, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsric02_bufferSize(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    cuComplex *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, csric02Info_t info, int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, cuComplex *,
-      const int *, const int *, csric02Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsric02_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, info, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsric02_bufferSize(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    cuDoubleComplex *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, csric02Info_t info, int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, cuDoubleComplex *,
-      const int *, const int *, csric02Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsric02_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, info, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsric02_bufferSizeExt(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    float *csrSortedVal, const int *csrSortedRowPtr, const int *csrSortedColInd,
-    csric02Info_t info, size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, float *,
-      const int *, const int *, csric02Info_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsric02_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedVal, csrSortedRowPtr,
-                  csrSortedColInd, info, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsric02_bufferSizeExt(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    double *csrSortedVal, const int *csrSortedRowPtr,
-    const int *csrSortedColInd, csric02Info_t info, size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, double *,
-      const int *, const int *, csric02Info_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsric02_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedVal, csrSortedRowPtr,
-                  csrSortedColInd, info, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsric02_bufferSizeExt(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    cuComplex *csrSortedVal, const int *csrSortedRowPtr,
-    const int *csrSortedColInd, csric02Info_t info, size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, cuComplex *,
-      const int *, const int *, csric02Info_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsric02_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedVal, csrSortedRowPtr,
-                  csrSortedColInd, info, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsric02_bufferSizeExt(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    cuDoubleComplex *csrSortedVal, const int *csrSortedRowPtr,
-    const int *csrSortedColInd, csric02Info_t info, size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, cuDoubleComplex *,
-      const int *, const int *, csric02Info_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsric02_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedVal, csrSortedRowPtr,
-                  csrSortedColInd, info, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsric02_analysis(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    const float *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, csric02Info_t info,
-    cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, const float *,
-      const int *, const int *, csric02Info_t, cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsric02_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsric02_analysis(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    const double *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, csric02Info_t info,
-    cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, const double *,
-      const int *, const int *, csric02Info_t, cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsric02_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsric02_analysis(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    const cuComplex *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, csric02Info_t info,
-    cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, const cuComplex *,
-      const int *, const int *, csric02Info_t, cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsric02_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsric02_analysis(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    const cuDoubleComplex *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, csric02Info_t info,
-    cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t,
-      const cuDoubleComplex *, const int *, const int *, csric02Info_t,
-      cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsric02_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsric02(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    float *csrSortedValA_valM, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, csric02Info_t info,
-    cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, float *,
-      const int *, const int *, csric02Info_t, cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsric02");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedValA_valM, csrSortedRowPtrA,
-                  csrSortedColIndA, info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsric02(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    double *csrSortedValA_valM, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, csric02Info_t info,
-    cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, double *,
-      const int *, const int *, csric02Info_t, cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsric02");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedValA_valM, csrSortedRowPtrA,
-                  csrSortedColIndA, info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsric02(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    cuComplex *csrSortedValA_valM, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, csric02Info_t info,
-    cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, cuComplex *,
-      const int *, const int *, csric02Info_t, cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsric02");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedValA_valM, csrSortedRowPtrA,
-                  csrSortedColIndA, info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsric02(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    cuDoubleComplex *csrSortedValA_valM, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, csric02Info_t info,
-    cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, cuDoubleComplex *,
-      const int *, const int *, csric02Info_t, cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsric02");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedValA_valM, csrSortedRowPtrA,
-                  csrSortedColIndA, info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseXbsric02_zeroPivot(cusparseHandle_t handle,
-                                                        bsric02Info_t info,
-                                                        int *position) {
-  using FuncPtr =
-      cusparseStatus_t(CUSPARSEAPI *)(cusparseHandle_t, bsric02Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseXbsric02_zeroPivot");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, info, position);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSbsric02_bufferSize(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, float *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockDim,
-    bsric02Info_t info, int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      float *, const int *, const int *, int, bsric02Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSbsric02_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, blockDim, info, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDbsric02_bufferSize(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, double *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockDim,
-    bsric02Info_t info, int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      double *, const int *, const int *, int, bsric02Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDbsric02_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, blockDim, info, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCbsric02_bufferSize(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, cuComplex *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockDim,
-    bsric02Info_t info, int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      cuComplex *, const int *, const int *, int, bsric02Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCbsric02_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, blockDim, info, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZbsric02_bufferSize(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, cuDoubleComplex *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockDim,
-    bsric02Info_t info, int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      cuDoubleComplex *, const int *, const int *, int, bsric02Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZbsric02_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, blockDim, info, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSbsric02_bufferSizeExt(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, float *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockSize,
-    bsric02Info_t info, size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      float *, const int *, const int *, int, bsric02Info_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSbsric02_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, blockSize, info, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDbsric02_bufferSizeExt(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, double *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockSize,
-    bsric02Info_t info, size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      double *, const int *, const int *, int, bsric02Info_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDbsric02_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, blockSize, info, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCbsric02_bufferSizeExt(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, cuComplex *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockSize,
-    bsric02Info_t info, size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      cuComplex *, const int *, const int *, int, bsric02Info_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCbsric02_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, blockSize, info, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZbsric02_bufferSizeExt(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, cuDoubleComplex *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockSize,
-    bsric02Info_t info, size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      cuDoubleComplex *, const int *, const int *, int, bsric02Info_t,
-      size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZbsric02_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, blockSize, info, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSbsric02_analysis(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, const float *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockDim,
-    bsric02Info_t info, cusparseSolvePolicy_t policy, void *pInputBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const float *, const int *, const int *, int, bsric02Info_t,
-      cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSbsric02_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, blockDim, info, policy, pInputBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDbsric02_analysis(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, const double *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockDim,
-    bsric02Info_t info, cusparseSolvePolicy_t policy, void *pInputBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const double *, const int *, const int *, int, bsric02Info_t,
-      cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDbsric02_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, blockDim, info, policy, pInputBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCbsric02_analysis(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, const cuComplex *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockDim,
-    bsric02Info_t info, cusparseSolvePolicy_t policy, void *pInputBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const cuComplex *, const int *, const int *, int, bsric02Info_t,
-      cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCbsric02_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, blockDim, info, policy, pInputBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZbsric02_analysis(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, const cuDoubleComplex *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockDim,
-    bsric02Info_t info, cusparseSolvePolicy_t policy, void *pInputBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const cuDoubleComplex *, const int *, const int *, int, bsric02Info_t,
-      cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZbsric02_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, blockDim, info, policy, pInputBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSbsric02(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, float *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockDim,
-    bsric02Info_t info, cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      float *, const int *, const int *, int, bsric02Info_t,
-      cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSbsric02");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, blockDim, info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDbsric02(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, double *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockDim,
-    bsric02Info_t info, cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      double *, const int *, const int *, int, bsric02Info_t,
-      cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDbsric02");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, blockDim, info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCbsric02(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, cuComplex *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockDim,
-    bsric02Info_t info, cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      cuComplex *, const int *, const int *, int, bsric02Info_t,
-      cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCbsric02");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, blockDim, info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZbsric02(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, cuDoubleComplex *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockDim,
-    bsric02Info_t info, cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      cuDoubleComplex *, const int *, const int *, int, bsric02Info_t,
-      cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZbsric02");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, blockDim, info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSgtsv2_bufferSizeExt(
-    cusparseHandle_t handle, int m, int n, const float *dl, const float *d,
-    const float *du, const float *B, int ldb, size_t *bufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const float *, const float *, const float *,
-      const float *, int, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSgtsv2_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, dl, d, du, B, ldb, bufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDgtsv2_bufferSizeExt(
-    cusparseHandle_t handle, int m, int n, const double *dl, const double *d,
-    const double *du, const double *B, int ldb, size_t *bufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const double *, const double *,
-      const double *, const double *, int, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDgtsv2_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, dl, d, du, B, ldb, bufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCgtsv2_bufferSizeExt(
-    cusparseHandle_t handle, int m, int n, const cuComplex *dl,
-    const cuComplex *d, const cuComplex *du, const cuComplex *B, int ldb,
-    size_t *bufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cuComplex *, const cuComplex *,
-      const cuComplex *, const cuComplex *, int, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCgtsv2_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, dl, d, du, B, ldb, bufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZgtsv2_bufferSizeExt(
-    cusparseHandle_t handle, int m, int n, const cuDoubleComplex *dl,
-    const cuDoubleComplex *d, const cuDoubleComplex *du,
-    const cuDoubleComplex *B, int ldb, size_t *bufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cuDoubleComplex *,
-      const cuDoubleComplex *, const cuDoubleComplex *, const cuDoubleComplex *,
-      int, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZgtsv2_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, dl, d, du, B, ldb, bufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSgtsv2(cusparseHandle_t handle, int m,
-                                            int n, const float *dl,
-                                            const float *d, const float *du,
-                                            float *B, int ldb, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const float *, const float *, const float *,
-      float *, int, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSgtsv2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, dl, d, du, B, ldb, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDgtsv2(cusparseHandle_t handle, int m,
-                                            int n, const double *dl,
-                                            const double *d, const double *du,
-                                            double *B, int ldb, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const double *, const double *,
-      const double *, double *, int, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDgtsv2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, dl, d, du, B, ldb, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCgtsv2(cusparseHandle_t handle, int m,
-                                            int n, const cuComplex *dl,
-                                            const cuComplex *d,
-                                            const cuComplex *du, cuComplex *B,
-                                            int ldb, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cuComplex *, const cuComplex *,
-      const cuComplex *, cuComplex *, int, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCgtsv2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, dl, d, du, B, ldb, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZgtsv2(cusparseHandle_t handle, int m,
-                                            int n, const cuDoubleComplex *dl,
-                                            const cuDoubleComplex *d,
-                                            const cuDoubleComplex *du,
-                                            cuDoubleComplex *B, int ldb,
-                                            void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cuDoubleComplex *,
-      const cuDoubleComplex *, const cuDoubleComplex *, cuDoubleComplex *, int,
-      void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZgtsv2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, dl, d, du, B, ldb, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSgtsv2_nopivot_bufferSizeExt(
-    cusparseHandle_t handle, int m, int n, const float *dl, const float *d,
-    const float *du, const float *B, int ldb, size_t *bufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const float *, const float *, const float *,
-      const float *, int, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseSgtsv2_nopivot_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, dl, d, du, B, ldb, bufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDgtsv2_nopivot_bufferSizeExt(
-    cusparseHandle_t handle, int m, int n, const double *dl, const double *d,
-    const double *du, const double *B, int ldb, size_t *bufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const double *, const double *,
-      const double *, const double *, int, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseDgtsv2_nopivot_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, dl, d, du, B, ldb, bufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCgtsv2_nopivot_bufferSizeExt(
-    cusparseHandle_t handle, int m, int n, const cuComplex *dl,
-    const cuComplex *d, const cuComplex *du, const cuComplex *B, int ldb,
-    size_t *bufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cuComplex *, const cuComplex *,
-      const cuComplex *, const cuComplex *, int, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseCgtsv2_nopivot_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, dl, d, du, B, ldb, bufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZgtsv2_nopivot_bufferSizeExt(
-    cusparseHandle_t handle, int m, int n, const cuDoubleComplex *dl,
-    const cuDoubleComplex *d, const cuDoubleComplex *du,
-    const cuDoubleComplex *B, int ldb, size_t *bufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cuDoubleComplex *,
-      const cuDoubleComplex *, const cuDoubleComplex *, const cuDoubleComplex *,
-      int, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseZgtsv2_nopivot_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, dl, d, du, B, ldb, bufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSgtsv2_nopivot(
-    cusparseHandle_t handle, int m, int n, const float *dl, const float *d,
-    const float *du, float *B, int ldb, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const float *, const float *, const float *,
-      float *, int, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSgtsv2_nopivot");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, dl, d, du, B, ldb, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDgtsv2_nopivot(
-    cusparseHandle_t handle, int m, int n, const double *dl, const double *d,
-    const double *du, double *B, int ldb, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const double *, const double *,
-      const double *, double *, int, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDgtsv2_nopivot");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, dl, d, du, B, ldb, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCgtsv2_nopivot(
-    cusparseHandle_t handle, int m, int n, const cuComplex *dl,
-    const cuComplex *d, const cuComplex *du, cuComplex *B, int ldb,
-    void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cuComplex *, const cuComplex *,
-      const cuComplex *, cuComplex *, int, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCgtsv2_nopivot");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, dl, d, du, B, ldb, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZgtsv2_nopivot(
-    cusparseHandle_t handle, int m, int n, const cuDoubleComplex *dl,
-    const cuDoubleComplex *d, const cuDoubleComplex *du, cuDoubleComplex *B,
-    int ldb, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cuDoubleComplex *,
-      const cuDoubleComplex *, const cuDoubleComplex *, cuDoubleComplex *, int,
-      void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZgtsv2_nopivot");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, dl, d, du, B, ldb, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSgtsv2StridedBatch_bufferSizeExt(
-    cusparseHandle_t handle, int m, const float *dl, const float *d,
-    const float *du, const float *x, int batchCount, int batchStride,
-    size_t *bufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, const float *, const float *, const float *,
-      const float *, int, int, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseSgtsv2StridedBatch_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, dl, d, du, x, batchCount, batchStride,
-                  bufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDgtsv2StridedBatch_bufferSizeExt(
-    cusparseHandle_t handle, int m, const double *dl, const double *d,
-    const double *du, const double *x, int batchCount, int batchStride,
-    size_t *bufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, const double *, const double *, const double *,
-      const double *, int, int, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseDgtsv2StridedBatch_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, dl, d, du, x, batchCount, batchStride,
-                  bufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCgtsv2StridedBatch_bufferSizeExt(
-    cusparseHandle_t handle, int m, const cuComplex *dl, const cuComplex *d,
-    const cuComplex *du, const cuComplex *x, int batchCount, int batchStride,
-    size_t *bufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, const cuComplex *, const cuComplex *,
-      const cuComplex *, const cuComplex *, int, int, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseCgtsv2StridedBatch_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, dl, d, du, x, batchCount, batchStride,
-                  bufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZgtsv2StridedBatch_bufferSizeExt(
-    cusparseHandle_t handle, int m, const cuDoubleComplex *dl,
-    const cuDoubleComplex *d, const cuDoubleComplex *du,
-    const cuDoubleComplex *x, int batchCount, int batchStride,
-    size_t *bufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, const cuDoubleComplex *, const cuDoubleComplex *,
-      const cuDoubleComplex *, const cuDoubleComplex *, int, int, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseZgtsv2StridedBatch_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, dl, d, du, x, batchCount, batchStride,
-                  bufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSgtsv2StridedBatch(
-    cusparseHandle_t handle, int m, const float *dl, const float *d,
-    const float *du, float *x, int batchCount, int batchStride, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, const float *, const float *, const float *,
-      float *, int, int, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSgtsv2StridedBatch");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, dl, d, du, x, batchCount, batchStride, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseDgtsv2StridedBatch(cusparseHandle_t handle, int m, const double *dl,
-                           const double *d, const double *du, double *x,
-                           int batchCount, int batchStride, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, const double *, const double *, const double *,
-      double *, int, int, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDgtsv2StridedBatch");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, dl, d, du, x, batchCount, batchStride, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCgtsv2StridedBatch(
-    cusparseHandle_t handle, int m, const cuComplex *dl, const cuComplex *d,
-    const cuComplex *du, cuComplex *x, int batchCount, int batchStride,
-    void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, const cuComplex *, const cuComplex *,
-      const cuComplex *, cuComplex *, int, int, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCgtsv2StridedBatch");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, dl, d, du, x, batchCount, batchStride, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZgtsv2StridedBatch(
-    cusparseHandle_t handle, int m, const cuDoubleComplex *dl,
-    const cuDoubleComplex *d, const cuDoubleComplex *du, cuDoubleComplex *x,
-    int batchCount, int batchStride, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, const cuDoubleComplex *, const cuDoubleComplex *,
-      const cuDoubleComplex *, cuDoubleComplex *, int, int, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZgtsv2StridedBatch");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, dl, d, du, x, batchCount, batchStride, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSgtsvInterleavedBatch_bufferSizeExt(
-    cusparseHandle_t handle, int algo, int m, const float *dl, const float *d,
-    const float *du, const float *x, int batchCount,
-    size_t *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const float *, const float *, const float *,
-      const float *, int, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseSgtsvInterleavedBatch_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, algo, m, dl, d, du, x, batchCount,
-                  pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDgtsvInterleavedBatch_bufferSizeExt(
-    cusparseHandle_t handle, int algo, int m, const double *dl, const double *d,
-    const double *du, const double *x, int batchCount,
-    size_t *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const double *, const double *,
-      const double *, const double *, int, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseDgtsvInterleavedBatch_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, algo, m, dl, d, du, x, batchCount,
-                  pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCgtsvInterleavedBatch_bufferSizeExt(
-    cusparseHandle_t handle, int algo, int m, const cuComplex *dl,
-    const cuComplex *d, const cuComplex *du, const cuComplex *x, int batchCount,
-    size_t *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cuComplex *, const cuComplex *,
-      const cuComplex *, const cuComplex *, int, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseCgtsvInterleavedBatch_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, algo, m, dl, d, du, x, batchCount,
-                  pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZgtsvInterleavedBatch_bufferSizeExt(
-    cusparseHandle_t handle, int algo, int m, const cuDoubleComplex *dl,
-    const cuDoubleComplex *d, const cuDoubleComplex *du,
-    const cuDoubleComplex *x, int batchCount, size_t *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cuDoubleComplex *,
-      const cuDoubleComplex *, const cuDoubleComplex *, const cuDoubleComplex *,
-      int, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseZgtsvInterleavedBatch_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, algo, m, dl, d, du, x, batchCount,
-                  pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSgtsvInterleavedBatch(
-    cusparseHandle_t handle, int algo, int m, float *dl, float *d, float *du,
-    float *x, int batchCount, void *pBuffer) {
-  using FuncPtr =
-      cusparseStatus_t(CUSPARSEAPI *)(cusparseHandle_t, int, int, float *,
-                                      float *, float *, float *, int, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSgtsvInterleavedBatch");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, algo, m, dl, d, du, x, batchCount, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDgtsvInterleavedBatch(
-    cusparseHandle_t handle, int algo, int m, double *dl, double *d, double *du,
-    double *x, int batchCount, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(cusparseHandle_t, int, int,
-                                                  double *, double *, double *,
-                                                  double *, int, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDgtsvInterleavedBatch");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, algo, m, dl, d, du, x, batchCount, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCgtsvInterleavedBatch(
-    cusparseHandle_t handle, int algo, int m, cuComplex *dl, cuComplex *d,
-    cuComplex *du, cuComplex *x, int batchCount, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, cuComplex *, cuComplex *, cuComplex *,
-      cuComplex *, int, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCgtsvInterleavedBatch");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, algo, m, dl, d, du, x, batchCount, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZgtsvInterleavedBatch(
-    cusparseHandle_t handle, int algo, int m, cuDoubleComplex *dl,
-    cuDoubleComplex *d, cuDoubleComplex *du, cuDoubleComplex *x, int batchCount,
-    void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, cuDoubleComplex *, cuDoubleComplex *,
-      cuDoubleComplex *, cuDoubleComplex *, int, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZgtsvInterleavedBatch");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, algo, m, dl, d, du, x, batchCount, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSgpsvInterleavedBatch_bufferSizeExt(
-    cusparseHandle_t handle, int algo, int m, const float *ds, const float *dl,
-    const float *d, const float *du, const float *dw, const float *x,
-    int batchCount, size_t *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const float *, const float *, const float *,
-      const float *, const float *, const float *, int, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseSgpsvInterleavedBatch_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, algo, m, ds, dl, d, du, dw, x, batchCount,
-                  pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDgpsvInterleavedBatch_bufferSizeExt(
-    cusparseHandle_t handle, int algo, int m, const double *ds,
-    const double *dl, const double *d, const double *du, const double *dw,
-    const double *x, int batchCount, size_t *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const double *, const double *,
-      const double *, const double *, const double *, const double *, int,
-      size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseDgpsvInterleavedBatch_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, algo, m, ds, dl, d, du, dw, x, batchCount,
-                  pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCgpsvInterleavedBatch_bufferSizeExt(
-    cusparseHandle_t handle, int algo, int m, const cuComplex *ds,
-    const cuComplex *dl, const cuComplex *d, const cuComplex *du,
-    const cuComplex *dw, const cuComplex *x, int batchCount,
-    size_t *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cuComplex *, const cuComplex *,
-      const cuComplex *, const cuComplex *, const cuComplex *,
-      const cuComplex *, int, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseCgpsvInterleavedBatch_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, algo, m, ds, dl, d, du, dw, x, batchCount,
-                  pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZgpsvInterleavedBatch_bufferSizeExt(
-    cusparseHandle_t handle, int algo, int m, const cuDoubleComplex *ds,
-    const cuDoubleComplex *dl, const cuDoubleComplex *d,
-    const cuDoubleComplex *du, const cuDoubleComplex *dw,
-    const cuDoubleComplex *x, int batchCount, size_t *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cuDoubleComplex *,
-      const cuDoubleComplex *, const cuDoubleComplex *, const cuDoubleComplex *,
-      const cuDoubleComplex *, const cuDoubleComplex *, int, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseZgpsvInterleavedBatch_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, algo, m, ds, dl, d, du, dw, x, batchCount,
-                  pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSgpsvInterleavedBatch(
-    cusparseHandle_t handle, int algo, int m, float *ds, float *dl, float *d,
-    float *du, float *dw, float *x, int batchCount, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, float *, float *, float *, float *, float *,
-      float *, int, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSgpsvInterleavedBatch");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, algo, m, ds, dl, d, du, dw, x, batchCount, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDgpsvInterleavedBatch(
-    cusparseHandle_t handle, int algo, int m, double *ds, double *dl, double *d,
-    double *du, double *dw, double *x, int batchCount, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, double *, double *, double *, double *,
-      double *, double *, int, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDgpsvInterleavedBatch");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, algo, m, ds, dl, d, du, dw, x, batchCount, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCgpsvInterleavedBatch(
-    cusparseHandle_t handle, int algo, int m, cuComplex *ds, cuComplex *dl,
-    cuComplex *d, cuComplex *du, cuComplex *dw, cuComplex *x, int batchCount,
-    void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, cuComplex *, cuComplex *, cuComplex *,
-      cuComplex *, cuComplex *, cuComplex *, int, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCgpsvInterleavedBatch");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, algo, m, ds, dl, d, du, dw, x, batchCount, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZgpsvInterleavedBatch(
-    cusparseHandle_t handle, int algo, int m, cuDoubleComplex *ds,
-    cuDoubleComplex *dl, cuDoubleComplex *d, cuDoubleComplex *du,
-    cuDoubleComplex *dw, cuDoubleComplex *x, int batchCount, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, cuDoubleComplex *, cuDoubleComplex *,
-      cuDoubleComplex *, cuDoubleComplex *, cuDoubleComplex *,
-      cuDoubleComplex *, int, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZgpsvInterleavedBatch");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, algo, m, ds, dl, d, du, dw, x, batchCount, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsrgeam2_bufferSizeExt(
-    cusparseHandle_t handle, int m, int n, const float *alpha,
-    const cusparseMatDescr_t descrA, int nnzA, const float *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA, const float *beta,
-    const cusparseMatDescr_t descrB, int nnzB, const float *csrSortedValB,
-    const int *csrSortedRowPtrB, const int *csrSortedColIndB,
-    const cusparseMatDescr_t descrC, const float *csrSortedValC,
-    const int *csrSortedRowPtrC, const int *csrSortedColIndC,
-    size_t *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const float *, const cusparseMatDescr_t, int,
-      const float *, const int *, const int *, const float *,
-      const cusparseMatDescr_t, int, const float *, const int *, const int *,
-      const cusparseMatDescr_t, const float *, const int *, const int *,
-      size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsrgeam2_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, alpha, descrA, nnzA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, beta, descrB, nnzB,
-                  csrSortedValB, csrSortedRowPtrB, csrSortedColIndB, descrC,
-                  csrSortedValC, csrSortedRowPtrC, csrSortedColIndC,
-                  pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsrgeam2_bufferSizeExt(
-    cusparseHandle_t handle, int m, int n, const double *alpha,
-    const cusparseMatDescr_t descrA, int nnzA, const double *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-    const double *beta, const cusparseMatDescr_t descrB, int nnzB,
-    const double *csrSortedValB, const int *csrSortedRowPtrB,
-    const int *csrSortedColIndB, const cusparseMatDescr_t descrC,
-    const double *csrSortedValC, const int *csrSortedRowPtrC,
-    const int *csrSortedColIndC, size_t *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const double *, const cusparseMatDescr_t, int,
-      const double *, const int *, const int *, const double *,
-      const cusparseMatDescr_t, int, const double *, const int *, const int *,
-      const cusparseMatDescr_t, const double *, const int *, const int *,
-      size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsrgeam2_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, alpha, descrA, nnzA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, beta, descrB, nnzB,
-                  csrSortedValB, csrSortedRowPtrB, csrSortedColIndB, descrC,
-                  csrSortedValC, csrSortedRowPtrC, csrSortedColIndC,
-                  pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsrgeam2_bufferSizeExt(
-    cusparseHandle_t handle, int m, int n, const cuComplex *alpha,
-    const cusparseMatDescr_t descrA, int nnzA, const cuComplex *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-    const cuComplex *beta, const cusparseMatDescr_t descrB, int nnzB,
-    const cuComplex *csrSortedValB, const int *csrSortedRowPtrB,
-    const int *csrSortedColIndB, const cusparseMatDescr_t descrC,
-    const cuComplex *csrSortedValC, const int *csrSortedRowPtrC,
-    const int *csrSortedColIndC, size_t *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cuComplex *, const cusparseMatDescr_t,
-      int, const cuComplex *, const int *, const int *, const cuComplex *,
-      const cusparseMatDescr_t, int, const cuComplex *, const int *,
-      const int *, const cusparseMatDescr_t, const cuComplex *, const int *,
-      const int *, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsrgeam2_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, alpha, descrA, nnzA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, beta, descrB, nnzB,
-                  csrSortedValB, csrSortedRowPtrB, csrSortedColIndB, descrC,
-                  csrSortedValC, csrSortedRowPtrC, csrSortedColIndC,
-                  pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsrgeam2_bufferSizeExt(
-    cusparseHandle_t handle, int m, int n, const cuDoubleComplex *alpha,
-    const cusparseMatDescr_t descrA, int nnzA,
-    const cuDoubleComplex *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, const cuDoubleComplex *beta,
-    const cusparseMatDescr_t descrB, int nnzB,
-    const cuDoubleComplex *csrSortedValB, const int *csrSortedRowPtrB,
-    const int *csrSortedColIndB, const cusparseMatDescr_t descrC,
-    const cuDoubleComplex *csrSortedValC, const int *csrSortedRowPtrC,
-    const int *csrSortedColIndC, size_t *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cuDoubleComplex *,
-      const cusparseMatDescr_t, int, const cuDoubleComplex *, const int *,
-      const int *, const cuDoubleComplex *, const cusparseMatDescr_t, int,
-      const cuDoubleComplex *, const int *, const int *,
-      const cusparseMatDescr_t, const cuDoubleComplex *, const int *,
-      const int *, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsrgeam2_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, alpha, descrA, nnzA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, beta, descrB, nnzB,
-                  csrSortedValB, csrSortedRowPtrB, csrSortedColIndB, descrC,
-                  csrSortedValC, csrSortedRowPtrC, csrSortedColIndC,
-                  pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseXcsrgeam2Nnz(
-    cusparseHandle_t handle, int m, int n, const cusparseMatDescr_t descrA,
-    int nnzA, const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-    const cusparseMatDescr_t descrB, int nnzB, const int *csrSortedRowPtrB,
-    const int *csrSortedColIndB, const cusparseMatDescr_t descrC,
-    int *csrSortedRowPtrC, int *nnzTotalDevHostPtr, void *workspace) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, int, const int *,
-      const int *, const cusparseMatDescr_t, int, const int *, const int *,
-      const cusparseMatDescr_t, int *, int *, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseXcsrgeam2Nnz");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, descrA, nnzA, csrSortedRowPtrA,
-                  csrSortedColIndA, descrB, nnzB, csrSortedRowPtrB,
-                  csrSortedColIndB, descrC, csrSortedRowPtrC,
-                  nnzTotalDevHostPtr, workspace);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsrgeam2(
-    cusparseHandle_t handle, int m, int n, const float *alpha,
-    const cusparseMatDescr_t descrA, int nnzA, const float *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA, const float *beta,
-    const cusparseMatDescr_t descrB, int nnzB, const float *csrSortedValB,
-    const int *csrSortedRowPtrB, const int *csrSortedColIndB,
-    const cusparseMatDescr_t descrC, float *csrSortedValC,
-    int *csrSortedRowPtrC, int *csrSortedColIndC, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const float *, const cusparseMatDescr_t, int,
-      const float *, const int *, const int *, const float *,
-      const cusparseMatDescr_t, int, const float *, const int *, const int *,
-      const cusparseMatDescr_t, float *, int *, int *, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsrgeam2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, alpha, descrA, nnzA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, beta, descrB, nnzB,
-                  csrSortedValB, csrSortedRowPtrB, csrSortedColIndB, descrC,
-                  csrSortedValC, csrSortedRowPtrC, csrSortedColIndC, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsrgeam2(
-    cusparseHandle_t handle, int m, int n, const double *alpha,
-    const cusparseMatDescr_t descrA, int nnzA, const double *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-    const double *beta, const cusparseMatDescr_t descrB, int nnzB,
-    const double *csrSortedValB, const int *csrSortedRowPtrB,
-    const int *csrSortedColIndB, const cusparseMatDescr_t descrC,
-    double *csrSortedValC, int *csrSortedRowPtrC, int *csrSortedColIndC,
-    void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const double *, const cusparseMatDescr_t, int,
-      const double *, const int *, const int *, const double *,
-      const cusparseMatDescr_t, int, const double *, const int *, const int *,
-      const cusparseMatDescr_t, double *, int *, int *, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsrgeam2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, alpha, descrA, nnzA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, beta, descrB, nnzB,
-                  csrSortedValB, csrSortedRowPtrB, csrSortedColIndB, descrC,
-                  csrSortedValC, csrSortedRowPtrC, csrSortedColIndC, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsrgeam2(
-    cusparseHandle_t handle, int m, int n, const cuComplex *alpha,
-    const cusparseMatDescr_t descrA, int nnzA, const cuComplex *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-    const cuComplex *beta, const cusparseMatDescr_t descrB, int nnzB,
-    const cuComplex *csrSortedValB, const int *csrSortedRowPtrB,
-    const int *csrSortedColIndB, const cusparseMatDescr_t descrC,
-    cuComplex *csrSortedValC, int *csrSortedRowPtrC, int *csrSortedColIndC,
-    void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cuComplex *, const cusparseMatDescr_t,
-      int, const cuComplex *, const int *, const int *, const cuComplex *,
-      const cusparseMatDescr_t, int, const cuComplex *, const int *,
-      const int *, const cusparseMatDescr_t, cuComplex *, int *, int *, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsrgeam2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, alpha, descrA, nnzA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, beta, descrB, nnzB,
-                  csrSortedValB, csrSortedRowPtrB, csrSortedColIndB, descrC,
-                  csrSortedValC, csrSortedRowPtrC, csrSortedColIndC, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsrgeam2(
-    cusparseHandle_t handle, int m, int n, const cuDoubleComplex *alpha,
-    const cusparseMatDescr_t descrA, int nnzA,
-    const cuDoubleComplex *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, const cuDoubleComplex *beta,
-    const cusparseMatDescr_t descrB, int nnzB,
-    const cuDoubleComplex *csrSortedValB, const int *csrSortedRowPtrB,
-    const int *csrSortedColIndB, const cusparseMatDescr_t descrC,
-    cuDoubleComplex *csrSortedValC, int *csrSortedRowPtrC,
-    int *csrSortedColIndC, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cuDoubleComplex *,
-      const cusparseMatDescr_t, int, const cuDoubleComplex *, const int *,
-      const int *, const cuDoubleComplex *, const cusparseMatDescr_t, int,
-      const cuDoubleComplex *, const int *, const int *,
-      const cusparseMatDescr_t, cuDoubleComplex *, int *, int *, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsrgeam2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, alpha, descrA, nnzA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, beta, descrB, nnzB,
-                  csrSortedValB, csrSortedRowPtrB, csrSortedColIndB, descrC,
-                  csrSortedValC, csrSortedRowPtrC, csrSortedColIndC, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsrcolor(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    const float *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, const float *fractionToColor, int *ncolors,
-    int *coloring, int *reordering, const cusparseColorInfo_t info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, const float *,
-      const int *, const int *, const float *, int *, int *, int *,
-      const cusparseColorInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsrcolor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, fractionToColor, ncolors, coloring,
-                  reordering, info);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsrcolor(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    const double *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, const double *fractionToColor, int *ncolors,
-    int *coloring, int *reordering, const cusparseColorInfo_t info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, const double *,
-      const int *, const int *, const double *, int *, int *, int *,
-      const cusparseColorInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsrcolor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, fractionToColor, ncolors, coloring,
-                  reordering, info);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsrcolor(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    const cuComplex *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, const float *fractionToColor, int *ncolors,
-    int *coloring, int *reordering, const cusparseColorInfo_t info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, const cuComplex *,
-      const int *, const int *, const float *, int *, int *, int *,
-      const cusparseColorInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsrcolor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, fractionToColor, ncolors, coloring,
-                  reordering, info);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsrcolor(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    const cuDoubleComplex *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, const double *fractionToColor, int *ncolors,
-    int *coloring, int *reordering, const cusparseColorInfo_t info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t,
-      const cuDoubleComplex *, const int *, const int *, const double *, int *,
-      int *, int *, const cusparseColorInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsrcolor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, fractionToColor, ncolors, coloring,
-                  reordering, info);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseSnnz(cusparseHandle_t handle, cusparseDirection_t dirA, int m, int n,
-             const cusparseMatDescr_t descrA, const float *A, int lda,
-             int *nnzPerRowCol, int *nnzTotalDevHostPtr) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const float *, int, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSnnz");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, m, n, descrA, A, lda, nnzPerRowCol,
-                  nnzTotalDevHostPtr);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseDnnz(cusparseHandle_t handle, cusparseDirection_t dirA, int m, int n,
-             const cusparseMatDescr_t descrA, const double *A, int lda,
-             int *nnzPerRowCol, int *nnzTotalDevHostPtr) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const double *, int, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDnnz");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, m, n, descrA, A, lda, nnzPerRowCol,
-                  nnzTotalDevHostPtr);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseCnnz(cusparseHandle_t handle, cusparseDirection_t dirA, int m, int n,
-             const cusparseMatDescr_t descrA, const cuComplex *A, int lda,
-             int *nnzPerRowCol, int *nnzTotalDevHostPtr) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const cuComplex *, int, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCnnz");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, m, n, descrA, A, lda, nnzPerRowCol,
-                  nnzTotalDevHostPtr);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseZnnz(cusparseHandle_t handle, cusparseDirection_t dirA, int m, int n,
-             const cusparseMatDescr_t descrA, const cuDoubleComplex *A, int lda,
-             int *nnzPerRowCol, int *nnzTotalDevHostPtr) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const cuDoubleComplex *, int, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZnnz");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, m, n, descrA, A, lda, nnzPerRowCol,
-                  nnzTotalDevHostPtr);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSnnz_compress(
-    cusparseHandle_t handle, int m, const cusparseMatDescr_t descr,
-    const float *csrSortedValA, const int *csrSortedRowPtrA, int *nnzPerRow,
-    int *nnzC, float tol) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, const cusparseMatDescr_t, const float *,
-      const int *, int *, int *, float);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSnnz_compress");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, descr, csrSortedValA, csrSortedRowPtrA, nnzPerRow,
-                  nnzC, tol);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDnnz_compress(
-    cusparseHandle_t handle, int m, const cusparseMatDescr_t descr,
-    const double *csrSortedValA, const int *csrSortedRowPtrA, int *nnzPerRow,
-    int *nnzC, double tol) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, const cusparseMatDescr_t, const double *,
-      const int *, int *, int *, double);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDnnz_compress");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, descr, csrSortedValA, csrSortedRowPtrA, nnzPerRow,
-                  nnzC, tol);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCnnz_compress(
-    cusparseHandle_t handle, int m, const cusparseMatDescr_t descr,
-    const cuComplex *csrSortedValA, const int *csrSortedRowPtrA, int *nnzPerRow,
-    int *nnzC, cuComplex tol) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, const cusparseMatDescr_t, const cuComplex *,
-      const int *, int *, int *, cuComplex);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCnnz_compress");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, descr, csrSortedValA, csrSortedRowPtrA, nnzPerRow,
-                  nnzC, tol);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZnnz_compress(
-    cusparseHandle_t handle, int m, const cusparseMatDescr_t descr,
-    const cuDoubleComplex *csrSortedValA, const int *csrSortedRowPtrA,
-    int *nnzPerRow, int *nnzC, cuDoubleComplex tol) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, const cusparseMatDescr_t, const cuDoubleComplex *,
-      const int *, int *, int *, cuDoubleComplex);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZnnz_compress");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, descr, csrSortedValA, csrSortedRowPtrA, nnzPerRow,
-                  nnzC, tol);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsr2csr_compress(
-    cusparseHandle_t handle, int m, int n, const cusparseMatDescr_t descrA,
-    const float *csrSortedValA, const int *csrSortedColIndA,
-    const int *csrSortedRowPtrA, int nnzA, const int *nnzPerRow,
-    float *csrSortedValC, int *csrSortedColIndC, int *csrSortedRowPtrC,
-    float tol) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, const float *,
-      const int *, const int *, int, const int *, float *, int *, int *, float);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsr2csr_compress");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, descrA, csrSortedValA, csrSortedColIndA,
-                  csrSortedRowPtrA, nnzA, nnzPerRow, csrSortedValC,
-                  csrSortedColIndC, csrSortedRowPtrC, tol);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsr2csr_compress(
-    cusparseHandle_t handle, int m, int n, const cusparseMatDescr_t descrA,
-    const double *csrSortedValA, const int *csrSortedColIndA,
-    const int *csrSortedRowPtrA, int nnzA, const int *nnzPerRow,
-    double *csrSortedValC, int *csrSortedColIndC, int *csrSortedRowPtrC,
-    double tol) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, const double *,
-      const int *, const int *, int, const int *, double *, int *, int *,
-      double);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsr2csr_compress");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, descrA, csrSortedValA, csrSortedColIndA,
-                  csrSortedRowPtrA, nnzA, nnzPerRow, csrSortedValC,
-                  csrSortedColIndC, csrSortedRowPtrC, tol);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsr2csr_compress(
-    cusparseHandle_t handle, int m, int n, const cusparseMatDescr_t descrA,
-    const cuComplex *csrSortedValA, const int *csrSortedColIndA,
-    const int *csrSortedRowPtrA, int nnzA, const int *nnzPerRow,
-    cuComplex *csrSortedValC, int *csrSortedColIndC, int *csrSortedRowPtrC,
-    cuComplex tol) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, const cuComplex *,
-      const int *, const int *, int, const int *, cuComplex *, int *, int *,
-      cuComplex);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsr2csr_compress");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, descrA, csrSortedValA, csrSortedColIndA,
-                  csrSortedRowPtrA, nnzA, nnzPerRow, csrSortedValC,
-                  csrSortedColIndC, csrSortedRowPtrC, tol);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsr2csr_compress(
-    cusparseHandle_t handle, int m, int n, const cusparseMatDescr_t descrA,
-    const cuDoubleComplex *csrSortedValA, const int *csrSortedColIndA,
-    const int *csrSortedRowPtrA, int nnzA, const int *nnzPerRow,
-    cuDoubleComplex *csrSortedValC, int *csrSortedColIndC,
-    int *csrSortedRowPtrC, cuDoubleComplex tol) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t,
-      const cuDoubleComplex *, const int *, const int *, int, const int *,
-      cuDoubleComplex *, int *, int *, cuDoubleComplex);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsr2csr_compress");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, descrA, csrSortedValA, csrSortedColIndA,
-                  csrSortedRowPtrA, nnzA, nnzPerRow, csrSortedValC,
-                  csrSortedColIndC, csrSortedRowPtrC, tol);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseXcoo2csr(cusparseHandle_t handle,
-                                              const int *cooRowInd, int nnz,
-                                              int m, int *csrSortedRowPtr,
-                                              cusparseIndexBase_t idxBase) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, const int *, int, int, int *, cusparseIndexBase_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseXcoo2csr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, cooRowInd, nnz, m, csrSortedRowPtr, idxBase);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseXcsr2coo(cusparseHandle_t handle,
-                                              const int *csrSortedRowPtr,
-                                              int nnz, int m, int *cooRowInd,
-                                              cusparseIndexBase_t idxBase) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, const int *, int, int, int *, cusparseIndexBase_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseXcsr2coo");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, csrSortedRowPtr, nnz, m, cooRowInd, idxBase);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseXcsr2bsrNnz(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int m, int n,
-    const cusparseMatDescr_t descrA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, int blockDim, const cusparseMatDescr_t descrC,
-    int *bsrSortedRowPtrC, int *nnzTotalDevHostPtr) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const int *, const int *, int, const cusparseMatDescr_t, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseXcsr2bsrNnz");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, m, n, descrA, csrSortedRowPtrA,
-                  csrSortedColIndA, blockDim, descrC, bsrSortedRowPtrC,
-                  nnzTotalDevHostPtr);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsr2bsr(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int m, int n,
-    const cusparseMatDescr_t descrA, const float *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA, int blockDim,
-    const cusparseMatDescr_t descrC, float *bsrSortedValC,
-    int *bsrSortedRowPtrC, int *bsrSortedColIndC) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const float *, const int *, const int *, int, const cusparseMatDescr_t,
-      float *, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsr2bsr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, m, n, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, blockDim, descrC, bsrSortedValC,
-                  bsrSortedRowPtrC, bsrSortedColIndC);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsr2bsr(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int m, int n,
-    const cusparseMatDescr_t descrA, const double *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA, int blockDim,
-    const cusparseMatDescr_t descrC, double *bsrSortedValC,
-    int *bsrSortedRowPtrC, int *bsrSortedColIndC) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const double *, const int *, const int *, int, const cusparseMatDescr_t,
-      double *, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsr2bsr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, m, n, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, blockDim, descrC, bsrSortedValC,
-                  bsrSortedRowPtrC, bsrSortedColIndC);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsr2bsr(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int m, int n,
-    const cusparseMatDescr_t descrA, const cuComplex *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA, int blockDim,
-    const cusparseMatDescr_t descrC, cuComplex *bsrSortedValC,
-    int *bsrSortedRowPtrC, int *bsrSortedColIndC) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const cuComplex *, const int *, const int *, int,
-      const cusparseMatDescr_t, cuComplex *, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsr2bsr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, m, n, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, blockDim, descrC, bsrSortedValC,
-                  bsrSortedRowPtrC, bsrSortedColIndC);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsr2bsr(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int m, int n,
-    const cusparseMatDescr_t descrA, const cuDoubleComplex *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA, int blockDim,
-    const cusparseMatDescr_t descrC, cuDoubleComplex *bsrSortedValC,
-    int *bsrSortedRowPtrC, int *bsrSortedColIndC) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const cuDoubleComplex *, const int *, const int *, int,
-      const cusparseMatDescr_t, cuDoubleComplex *, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsr2bsr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, m, n, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, blockDim, descrC, bsrSortedValC,
-                  bsrSortedRowPtrC, bsrSortedColIndC);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSbsr2csr(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nb,
-    const cusparseMatDescr_t descrA, const float *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int blockDim,
-    const cusparseMatDescr_t descrC, float *csrSortedValC,
-    int *csrSortedRowPtrC, int *csrSortedColIndC) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const float *, const int *, const int *, int, const cusparseMatDescr_t,
-      float *, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSbsr2csr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nb, descrA, bsrSortedValA, bsrSortedRowPtrA,
-                  bsrSortedColIndA, blockDim, descrC, csrSortedValC,
-                  csrSortedRowPtrC, csrSortedColIndC);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDbsr2csr(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nb,
-    const cusparseMatDescr_t descrA, const double *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int blockDim,
-    const cusparseMatDescr_t descrC, double *csrSortedValC,
-    int *csrSortedRowPtrC, int *csrSortedColIndC) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const double *, const int *, const int *, int, const cusparseMatDescr_t,
-      double *, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDbsr2csr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nb, descrA, bsrSortedValA, bsrSortedRowPtrA,
-                  bsrSortedColIndA, blockDim, descrC, csrSortedValC,
-                  csrSortedRowPtrC, csrSortedColIndC);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCbsr2csr(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nb,
-    const cusparseMatDescr_t descrA, const cuComplex *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int blockDim,
-    const cusparseMatDescr_t descrC, cuComplex *csrSortedValC,
-    int *csrSortedRowPtrC, int *csrSortedColIndC) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const cuComplex *, const int *, const int *, int,
-      const cusparseMatDescr_t, cuComplex *, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCbsr2csr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nb, descrA, bsrSortedValA, bsrSortedRowPtrA,
-                  bsrSortedColIndA, blockDim, descrC, csrSortedValC,
-                  csrSortedRowPtrC, csrSortedColIndC);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZbsr2csr(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nb,
-    const cusparseMatDescr_t descrA, const cuDoubleComplex *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int blockDim,
-    const cusparseMatDescr_t descrC, cuDoubleComplex *csrSortedValC,
-    int *csrSortedRowPtrC, int *csrSortedColIndC) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const cuDoubleComplex *, const int *, const int *, int,
-      const cusparseMatDescr_t, cuDoubleComplex *, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZbsr2csr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nb, descrA, bsrSortedValA, bsrSortedRowPtrA,
-                  bsrSortedColIndA, blockDim, descrC, csrSortedValC,
-                  csrSortedRowPtrC, csrSortedColIndC);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSgebsr2gebsc_bufferSize(
-    cusparseHandle_t handle, int mb, int nb, int nnzb,
-    const float *bsrSortedVal, const int *bsrSortedRowPtr,
-    const int *bsrSortedColInd, int rowBlockDim, int colBlockDim,
-    int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const float *, const int *, const int *,
-      int, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSgebsr2gebsc_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, mb, nb, nnzb, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, rowBlockDim, colBlockDim,
-                  pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDgebsr2gebsc_bufferSize(
-    cusparseHandle_t handle, int mb, int nb, int nnzb,
-    const double *bsrSortedVal, const int *bsrSortedRowPtr,
-    const int *bsrSortedColInd, int rowBlockDim, int colBlockDim,
-    int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const double *, const int *, const int *,
-      int, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDgebsr2gebsc_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, mb, nb, nnzb, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, rowBlockDim, colBlockDim,
-                  pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCgebsr2gebsc_bufferSize(
-    cusparseHandle_t handle, int mb, int nb, int nnzb,
-    const cuComplex *bsrSortedVal, const int *bsrSortedRowPtr,
-    const int *bsrSortedColInd, int rowBlockDim, int colBlockDim,
-    int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const cuComplex *, const int *,
-      const int *, int, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCgebsr2gebsc_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, mb, nb, nnzb, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, rowBlockDim, colBlockDim,
-                  pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZgebsr2gebsc_bufferSize(
-    cusparseHandle_t handle, int mb, int nb, int nnzb,
-    const cuDoubleComplex *bsrSortedVal, const int *bsrSortedRowPtr,
-    const int *bsrSortedColInd, int rowBlockDim, int colBlockDim,
-    int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const cuDoubleComplex *, const int *,
-      const int *, int, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZgebsr2gebsc_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, mb, nb, nnzb, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, rowBlockDim, colBlockDim,
-                  pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSgebsr2gebsc_bufferSizeExt(
-    cusparseHandle_t handle, int mb, int nb, int nnzb,
-    const float *bsrSortedVal, const int *bsrSortedRowPtr,
-    const int *bsrSortedColInd, int rowBlockDim, int colBlockDim,
-    size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const float *, const int *, const int *,
-      int, int, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseSgebsr2gebsc_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, mb, nb, nnzb, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, rowBlockDim, colBlockDim, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDgebsr2gebsc_bufferSizeExt(
-    cusparseHandle_t handle, int mb, int nb, int nnzb,
-    const double *bsrSortedVal, const int *bsrSortedRowPtr,
-    const int *bsrSortedColInd, int rowBlockDim, int colBlockDim,
-    size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const double *, const int *, const int *,
-      int, int, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseDgebsr2gebsc_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, mb, nb, nnzb, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, rowBlockDim, colBlockDim, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCgebsr2gebsc_bufferSizeExt(
-    cusparseHandle_t handle, int mb, int nb, int nnzb,
-    const cuComplex *bsrSortedVal, const int *bsrSortedRowPtr,
-    const int *bsrSortedColInd, int rowBlockDim, int colBlockDim,
-    size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const cuComplex *, const int *,
-      const int *, int, int, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseCgebsr2gebsc_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, mb, nb, nnzb, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, rowBlockDim, colBlockDim, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZgebsr2gebsc_bufferSizeExt(
-    cusparseHandle_t handle, int mb, int nb, int nnzb,
-    const cuDoubleComplex *bsrSortedVal, const int *bsrSortedRowPtr,
-    const int *bsrSortedColInd, int rowBlockDim, int colBlockDim,
-    size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const cuDoubleComplex *, const int *,
-      const int *, int, int, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseZgebsr2gebsc_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, mb, nb, nnzb, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, rowBlockDim, colBlockDim, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSgebsr2gebsc(
-    cusparseHandle_t handle, int mb, int nb, int nnzb,
-    const float *bsrSortedVal, const int *bsrSortedRowPtr,
-    const int *bsrSortedColInd, int rowBlockDim, int colBlockDim, float *bscVal,
-    int *bscRowInd, int *bscColPtr, cusparseAction_t copyValues,
-    cusparseIndexBase_t idxBase, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const float *, const int *, const int *,
-      int, int, float *, int *, int *, cusparseAction_t, cusparseIndexBase_t,
-      void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSgebsr2gebsc");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, mb, nb, nnzb, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, rowBlockDim, colBlockDim, bscVal, bscRowInd,
-                  bscColPtr, copyValues, idxBase, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDgebsr2gebsc(
-    cusparseHandle_t handle, int mb, int nb, int nnzb,
-    const double *bsrSortedVal, const int *bsrSortedRowPtr,
-    const int *bsrSortedColInd, int rowBlockDim, int colBlockDim,
-    double *bscVal, int *bscRowInd, int *bscColPtr, cusparseAction_t copyValues,
-    cusparseIndexBase_t idxBase, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const double *, const int *, const int *,
-      int, int, double *, int *, int *, cusparseAction_t, cusparseIndexBase_t,
-      void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDgebsr2gebsc");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, mb, nb, nnzb, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, rowBlockDim, colBlockDim, bscVal, bscRowInd,
-                  bscColPtr, copyValues, idxBase, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCgebsr2gebsc(
-    cusparseHandle_t handle, int mb, int nb, int nnzb,
-    const cuComplex *bsrSortedVal, const int *bsrSortedRowPtr,
-    const int *bsrSortedColInd, int rowBlockDim, int colBlockDim,
-    cuComplex *bscVal, int *bscRowInd, int *bscColPtr,
-    cusparseAction_t copyValues, cusparseIndexBase_t idxBase, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const cuComplex *, const int *,
-      const int *, int, int, cuComplex *, int *, int *, cusparseAction_t,
-      cusparseIndexBase_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCgebsr2gebsc");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, mb, nb, nnzb, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, rowBlockDim, colBlockDim, bscVal, bscRowInd,
-                  bscColPtr, copyValues, idxBase, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZgebsr2gebsc(
-    cusparseHandle_t handle, int mb, int nb, int nnzb,
-    const cuDoubleComplex *bsrSortedVal, const int *bsrSortedRowPtr,
-    const int *bsrSortedColInd, int rowBlockDim, int colBlockDim,
-    cuDoubleComplex *bscVal, int *bscRowInd, int *bscColPtr,
-    cusparseAction_t copyValues, cusparseIndexBase_t idxBase, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const cuDoubleComplex *, const int *,
-      const int *, int, int, cuDoubleComplex *, int *, int *, cusparseAction_t,
-      cusparseIndexBase_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZgebsr2gebsc");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, mb, nb, nnzb, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, rowBlockDim, colBlockDim, bscVal, bscRowInd,
-                  bscColPtr, copyValues, idxBase, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseXgebsr2csr(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nb,
-    const cusparseMatDescr_t descrA, const int *bsrSortedRowPtrA,
-    const int *bsrSortedColIndA, int rowBlockDim, int colBlockDim,
-    const cusparseMatDescr_t descrC, int *csrSortedRowPtrC,
-    int *csrSortedColIndC) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const int *, const int *, int, int, const cusparseMatDescr_t, int *,
-      int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseXgebsr2csr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nb, descrA, bsrSortedRowPtrA,
-                  bsrSortedColIndA, rowBlockDim, colBlockDim, descrC,
-                  csrSortedRowPtrC, csrSortedColIndC);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSgebsr2csr(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nb,
-    const cusparseMatDescr_t descrA, const float *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int rowBlockDim,
-    int colBlockDim, const cusparseMatDescr_t descrC, float *csrSortedValC,
-    int *csrSortedRowPtrC, int *csrSortedColIndC) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const float *, const int *, const int *, int, int,
-      const cusparseMatDescr_t, float *, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSgebsr2csr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nb, descrA, bsrSortedValA, bsrSortedRowPtrA,
-                  bsrSortedColIndA, rowBlockDim, colBlockDim, descrC,
-                  csrSortedValC, csrSortedRowPtrC, csrSortedColIndC);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDgebsr2csr(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nb,
-    const cusparseMatDescr_t descrA, const double *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int rowBlockDim,
-    int colBlockDim, const cusparseMatDescr_t descrC, double *csrSortedValC,
-    int *csrSortedRowPtrC, int *csrSortedColIndC) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const double *, const int *, const int *, int, int,
-      const cusparseMatDescr_t, double *, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDgebsr2csr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nb, descrA, bsrSortedValA, bsrSortedRowPtrA,
-                  bsrSortedColIndA, rowBlockDim, colBlockDim, descrC,
-                  csrSortedValC, csrSortedRowPtrC, csrSortedColIndC);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCgebsr2csr(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nb,
-    const cusparseMatDescr_t descrA, const cuComplex *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int rowBlockDim,
-    int colBlockDim, const cusparseMatDescr_t descrC, cuComplex *csrSortedValC,
-    int *csrSortedRowPtrC, int *csrSortedColIndC) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const cuComplex *, const int *, const int *, int, int,
-      const cusparseMatDescr_t, cuComplex *, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCgebsr2csr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nb, descrA, bsrSortedValA, bsrSortedRowPtrA,
-                  bsrSortedColIndA, rowBlockDim, colBlockDim, descrC,
-                  csrSortedValC, csrSortedRowPtrC, csrSortedColIndC);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZgebsr2csr(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nb,
-    const cusparseMatDescr_t descrA, const cuDoubleComplex *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int rowBlockDim,
-    int colBlockDim, const cusparseMatDescr_t descrC,
-    cuDoubleComplex *csrSortedValC, int *csrSortedRowPtrC,
-    int *csrSortedColIndC) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const cuDoubleComplex *, const int *, const int *, int, int,
-      const cusparseMatDescr_t, cuDoubleComplex *, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZgebsr2csr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nb, descrA, bsrSortedValA, bsrSortedRowPtrA,
-                  bsrSortedColIndA, rowBlockDim, colBlockDim, descrC,
-                  csrSortedValC, csrSortedRowPtrC, csrSortedColIndC);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsr2gebsr_bufferSize(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int m, int n,
-    const cusparseMatDescr_t descrA, const float *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA, int rowBlockDim,
-    int colBlockDim, int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const float *, const int *, const int *, int, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsr2gebsr_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, m, n, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, rowBlockDim, colBlockDim,
-                  pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsr2gebsr_bufferSize(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int m, int n,
-    const cusparseMatDescr_t descrA, const double *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA, int rowBlockDim,
-    int colBlockDim, int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const double *, const int *, const int *, int, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsr2gebsr_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, m, n, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, rowBlockDim, colBlockDim,
-                  pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsr2gebsr_bufferSize(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int m, int n,
-    const cusparseMatDescr_t descrA, const cuComplex *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA, int rowBlockDim,
-    int colBlockDim, int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const cuComplex *, const int *, const int *, int, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsr2gebsr_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, m, n, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, rowBlockDim, colBlockDim,
-                  pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsr2gebsr_bufferSize(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int m, int n,
-    const cusparseMatDescr_t descrA, const cuDoubleComplex *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA, int rowBlockDim,
-    int colBlockDim, int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const cuDoubleComplex *, const int *, const int *, int, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsr2gebsr_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, m, n, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, rowBlockDim, colBlockDim,
-                  pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsr2gebsr_bufferSizeExt(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int m, int n,
-    const cusparseMatDescr_t descrA, const float *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA, int rowBlockDim,
-    int colBlockDim, size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const float *, const int *, const int *, int, int, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseScsr2gebsr_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, m, n, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, rowBlockDim, colBlockDim, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsr2gebsr_bufferSizeExt(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int m, int n,
-    const cusparseMatDescr_t descrA, const double *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA, int rowBlockDim,
-    int colBlockDim, size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const double *, const int *, const int *, int, int, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseDcsr2gebsr_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, m, n, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, rowBlockDim, colBlockDim, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsr2gebsr_bufferSizeExt(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int m, int n,
-    const cusparseMatDescr_t descrA, const cuComplex *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA, int rowBlockDim,
-    int colBlockDim, size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const cuComplex *, const int *, const int *, int, int, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseCcsr2gebsr_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, m, n, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, rowBlockDim, colBlockDim, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsr2gebsr_bufferSizeExt(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int m, int n,
-    const cusparseMatDescr_t descrA, const cuDoubleComplex *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA, int rowBlockDim,
-    int colBlockDim, size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const cuDoubleComplex *, const int *, const int *, int, int, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseZcsr2gebsr_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, m, n, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, rowBlockDim, colBlockDim, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseXcsr2gebsrNnz(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int m, int n,
-    const cusparseMatDescr_t descrA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, const cusparseMatDescr_t descrC,
-    int *bsrSortedRowPtrC, int rowBlockDim, int colBlockDim,
-    int *nnzTotalDevHostPtr, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const int *, const int *, const cusparseMatDescr_t, int *, int, int,
-      int *, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseXcsr2gebsrNnz");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, m, n, descrA, csrSortedRowPtrA,
-                  csrSortedColIndA, descrC, bsrSortedRowPtrC, rowBlockDim,
-                  colBlockDim, nnzTotalDevHostPtr, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsr2gebsr(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int m, int n,
-    const cusparseMatDescr_t descrA, const float *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-    const cusparseMatDescr_t descrC, float *bsrSortedValC,
-    int *bsrSortedRowPtrC, int *bsrSortedColIndC, int rowBlockDim,
-    int colBlockDim, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const float *, const int *, const int *, const cusparseMatDescr_t,
-      float *, int *, int *, int, int, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsr2gebsr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, m, n, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, descrC, bsrSortedValC, bsrSortedRowPtrC,
-                  bsrSortedColIndC, rowBlockDim, colBlockDim, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsr2gebsr(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int m, int n,
-    const cusparseMatDescr_t descrA, const double *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-    const cusparseMatDescr_t descrC, double *bsrSortedValC,
-    int *bsrSortedRowPtrC, int *bsrSortedColIndC, int rowBlockDim,
-    int colBlockDim, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const double *, const int *, const int *, const cusparseMatDescr_t,
-      double *, int *, int *, int, int, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsr2gebsr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, m, n, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, descrC, bsrSortedValC, bsrSortedRowPtrC,
-                  bsrSortedColIndC, rowBlockDim, colBlockDim, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsr2gebsr(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int m, int n,
-    const cusparseMatDescr_t descrA, const cuComplex *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-    const cusparseMatDescr_t descrC, cuComplex *bsrSortedValC,
-    int *bsrSortedRowPtrC, int *bsrSortedColIndC, int rowBlockDim,
-    int colBlockDim, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const cuComplex *, const int *, const int *, const cusparseMatDescr_t,
-      cuComplex *, int *, int *, int, int, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsr2gebsr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, m, n, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, descrC, bsrSortedValC, bsrSortedRowPtrC,
-                  bsrSortedColIndC, rowBlockDim, colBlockDim, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsr2gebsr(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int m, int n,
-    const cusparseMatDescr_t descrA, const cuDoubleComplex *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-    const cusparseMatDescr_t descrC, cuDoubleComplex *bsrSortedValC,
-    int *bsrSortedRowPtrC, int *bsrSortedColIndC, int rowBlockDim,
-    int colBlockDim, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const cuDoubleComplex *, const int *, const int *,
-      const cusparseMatDescr_t, cuDoubleComplex *, int *, int *, int, int,
-      void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsr2gebsr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, m, n, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, descrC, bsrSortedValC, bsrSortedRowPtrC,
-                  bsrSortedColIndC, rowBlockDim, colBlockDim, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSgebsr2gebsr_bufferSize(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nb, int nnzb,
-    const cusparseMatDescr_t descrA, const float *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int rowBlockDimA,
-    int colBlockDimA, int rowBlockDimC, int colBlockDimC,
-    int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, int,
-      const cusparseMatDescr_t, const float *, const int *, const int *, int,
-      int, int, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSgebsr2gebsr_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nb, nnzb, descrA, bsrSortedValA,
-                  bsrSortedRowPtrA, bsrSortedColIndA, rowBlockDimA,
-                  colBlockDimA, rowBlockDimC, colBlockDimC, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDgebsr2gebsr_bufferSize(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nb, int nnzb,
-    const cusparseMatDescr_t descrA, const double *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int rowBlockDimA,
-    int colBlockDimA, int rowBlockDimC, int colBlockDimC,
-    int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, int,
-      const cusparseMatDescr_t, const double *, const int *, const int *, int,
-      int, int, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDgebsr2gebsr_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nb, nnzb, descrA, bsrSortedValA,
-                  bsrSortedRowPtrA, bsrSortedColIndA, rowBlockDimA,
-                  colBlockDimA, rowBlockDimC, colBlockDimC, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCgebsr2gebsr_bufferSize(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nb, int nnzb,
-    const cusparseMatDescr_t descrA, const cuComplex *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int rowBlockDimA,
-    int colBlockDimA, int rowBlockDimC, int colBlockDimC,
-    int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, int,
-      const cusparseMatDescr_t, const cuComplex *, const int *, const int *,
-      int, int, int, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCgebsr2gebsr_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nb, nnzb, descrA, bsrSortedValA,
-                  bsrSortedRowPtrA, bsrSortedColIndA, rowBlockDimA,
-                  colBlockDimA, rowBlockDimC, colBlockDimC, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZgebsr2gebsr_bufferSize(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nb, int nnzb,
-    const cusparseMatDescr_t descrA, const cuDoubleComplex *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int rowBlockDimA,
-    int colBlockDimA, int rowBlockDimC, int colBlockDimC,
-    int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, int,
-      const cusparseMatDescr_t, const cuDoubleComplex *, const int *,
-      const int *, int, int, int, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZgebsr2gebsr_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nb, nnzb, descrA, bsrSortedValA,
-                  bsrSortedRowPtrA, bsrSortedColIndA, rowBlockDimA,
-                  colBlockDimA, rowBlockDimC, colBlockDimC, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSgebsr2gebsr_bufferSizeExt(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nb, int nnzb,
-    const cusparseMatDescr_t descrA, const float *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int rowBlockDimA,
-    int colBlockDimA, int rowBlockDimC, int colBlockDimC, size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, int,
-      const cusparseMatDescr_t, const float *, const int *, const int *, int,
-      int, int, int, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseSgebsr2gebsr_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nb, nnzb, descrA, bsrSortedValA,
-                  bsrSortedRowPtrA, bsrSortedColIndA, rowBlockDimA,
-                  colBlockDimA, rowBlockDimC, colBlockDimC, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDgebsr2gebsr_bufferSizeExt(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nb, int nnzb,
-    const cusparseMatDescr_t descrA, const double *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int rowBlockDimA,
-    int colBlockDimA, int rowBlockDimC, int colBlockDimC, size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, int,
-      const cusparseMatDescr_t, const double *, const int *, const int *, int,
-      int, int, int, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseDgebsr2gebsr_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nb, nnzb, descrA, bsrSortedValA,
-                  bsrSortedRowPtrA, bsrSortedColIndA, rowBlockDimA,
-                  colBlockDimA, rowBlockDimC, colBlockDimC, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCgebsr2gebsr_bufferSizeExt(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nb, int nnzb,
-    const cusparseMatDescr_t descrA, const cuComplex *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int rowBlockDimA,
-    int colBlockDimA, int rowBlockDimC, int colBlockDimC, size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, int,
-      const cusparseMatDescr_t, const cuComplex *, const int *, const int *,
-      int, int, int, int, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseCgebsr2gebsr_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nb, nnzb, descrA, bsrSortedValA,
-                  bsrSortedRowPtrA, bsrSortedColIndA, rowBlockDimA,
-                  colBlockDimA, rowBlockDimC, colBlockDimC, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZgebsr2gebsr_bufferSizeExt(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nb, int nnzb,
-    const cusparseMatDescr_t descrA, const cuDoubleComplex *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int rowBlockDimA,
-    int colBlockDimA, int rowBlockDimC, int colBlockDimC, size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, int,
-      const cusparseMatDescr_t, const cuDoubleComplex *, const int *,
-      const int *, int, int, int, int, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseZgebsr2gebsr_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nb, nnzb, descrA, bsrSortedValA,
-                  bsrSortedRowPtrA, bsrSortedColIndA, rowBlockDimA,
-                  colBlockDimA, rowBlockDimC, colBlockDimC, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseXgebsr2gebsrNnz(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nb, int nnzb,
-    const cusparseMatDescr_t descrA, const int *bsrSortedRowPtrA,
-    const int *bsrSortedColIndA, int rowBlockDimA, int colBlockDimA,
-    const cusparseMatDescr_t descrC, int *bsrSortedRowPtrC, int rowBlockDimC,
-    int colBlockDimC, int *nnzTotalDevHostPtr, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, int,
-      const cusparseMatDescr_t, const int *, const int *, int, int,
-      const cusparseMatDescr_t, int *, int, int, int *, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseXgebsr2gebsrNnz");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nb, nnzb, descrA, bsrSortedRowPtrA,
-                  bsrSortedColIndA, rowBlockDimA, colBlockDimA, descrC,
-                  bsrSortedRowPtrC, rowBlockDimC, colBlockDimC,
-                  nnzTotalDevHostPtr, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSgebsr2gebsr(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nb, int nnzb,
-    const cusparseMatDescr_t descrA, const float *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int rowBlockDimA,
-    int colBlockDimA, const cusparseMatDescr_t descrC, float *bsrSortedValC,
-    int *bsrSortedRowPtrC, int *bsrSortedColIndC, int rowBlockDimC,
-    int colBlockDimC, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, int,
-      const cusparseMatDescr_t, const float *, const int *, const int *, int,
-      int, const cusparseMatDescr_t, float *, int *, int *, int, int, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSgebsr2gebsr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nb, nnzb, descrA, bsrSortedValA,
-                  bsrSortedRowPtrA, bsrSortedColIndA, rowBlockDimA,
-                  colBlockDimA, descrC, bsrSortedValC, bsrSortedRowPtrC,
-                  bsrSortedColIndC, rowBlockDimC, colBlockDimC, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDgebsr2gebsr(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nb, int nnzb,
-    const cusparseMatDescr_t descrA, const double *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int rowBlockDimA,
-    int colBlockDimA, const cusparseMatDescr_t descrC, double *bsrSortedValC,
-    int *bsrSortedRowPtrC, int *bsrSortedColIndC, int rowBlockDimC,
-    int colBlockDimC, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, int,
-      const cusparseMatDescr_t, const double *, const int *, const int *, int,
-      int, const cusparseMatDescr_t, double *, int *, int *, int, int, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDgebsr2gebsr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nb, nnzb, descrA, bsrSortedValA,
-                  bsrSortedRowPtrA, bsrSortedColIndA, rowBlockDimA,
-                  colBlockDimA, descrC, bsrSortedValC, bsrSortedRowPtrC,
-                  bsrSortedColIndC, rowBlockDimC, colBlockDimC, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCgebsr2gebsr(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nb, int nnzb,
-    const cusparseMatDescr_t descrA, const cuComplex *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int rowBlockDimA,
-    int colBlockDimA, const cusparseMatDescr_t descrC, cuComplex *bsrSortedValC,
-    int *bsrSortedRowPtrC, int *bsrSortedColIndC, int rowBlockDimC,
-    int colBlockDimC, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, int,
-      const cusparseMatDescr_t, const cuComplex *, const int *, const int *,
-      int, int, const cusparseMatDescr_t, cuComplex *, int *, int *, int, int,
-      void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCgebsr2gebsr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nb, nnzb, descrA, bsrSortedValA,
-                  bsrSortedRowPtrA, bsrSortedColIndA, rowBlockDimA,
-                  colBlockDimA, descrC, bsrSortedValC, bsrSortedRowPtrC,
-                  bsrSortedColIndC, rowBlockDimC, colBlockDimC, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZgebsr2gebsr(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nb, int nnzb,
-    const cusparseMatDescr_t descrA, const cuDoubleComplex *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int rowBlockDimA,
-    int colBlockDimA, const cusparseMatDescr_t descrC,
-    cuDoubleComplex *bsrSortedValC, int *bsrSortedRowPtrC,
-    int *bsrSortedColIndC, int rowBlockDimC, int colBlockDimC, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, int,
-      const cusparseMatDescr_t, const cuDoubleComplex *, const int *,
-      const int *, int, int, const cusparseMatDescr_t, cuDoubleComplex *, int *,
-      int *, int, int, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZgebsr2gebsr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nb, nnzb, descrA, bsrSortedValA,
-                  bsrSortedRowPtrA, bsrSortedColIndA, rowBlockDimA,
-                  colBlockDimA, descrC, bsrSortedValC, bsrSortedRowPtrC,
-                  bsrSortedColIndC, rowBlockDimC, colBlockDimC, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseCreateIdentityPermutation(cusparseHandle_t handle, int n, int *p) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(cusparseHandle_t, int, int *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseCreateIdentityPermutation");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, p);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseXcoosort_bufferSizeExt(
-    cusparseHandle_t handle, int m, int n, int nnz, const int *cooRowsA,
-    const int *cooColsA, size_t *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const int *, const int *, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseXcoosort_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnz, cooRowsA, cooColsA, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseXcoosortByRow(cusparseHandle_t handle,
-                                                   int m, int n, int nnz,
-                                                   int *cooRowsA, int *cooColsA,
-                                                   int *P, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, int *, int *, int *, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseXcoosortByRow");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnz, cooRowsA, cooColsA, P, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseXcoosortByColumn(cusparseHandle_t handle,
-                                                      int m, int n, int nnz,
-                                                      int *cooRowsA,
-                                                      int *cooColsA, int *P,
-                                                      void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, int *, int *, int *, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseXcoosortByColumn");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnz, cooRowsA, cooColsA, P, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseXcsrsort_bufferSizeExt(
-    cusparseHandle_t handle, int m, int n, int nnz, const int *csrRowPtrA,
-    const int *csrColIndA, size_t *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const int *, const int *, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseXcsrsort_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnz, csrRowPtrA, csrColIndA,
-                  pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseXcsrsort(cusparseHandle_t handle, int m,
-                                              int n, int nnz,
-                                              const cusparseMatDescr_t descrA,
-                                              const int *csrRowPtrA,
-                                              int *csrColIndA, int *P,
-                                              void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const cusparseMatDescr_t, const int *,
-      int *, int *, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseXcsrsort");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnz, descrA, csrRowPtrA, csrColIndA, P,
-                  pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseXcscsort_bufferSizeExt(
-    cusparseHandle_t handle, int m, int n, int nnz, const int *cscColPtrA,
-    const int *cscRowIndA, size_t *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const int *, const int *, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseXcscsort_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnz, cscColPtrA, cscRowIndA,
-                  pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseXcscsort(cusparseHandle_t handle, int m,
-                                              int n, int nnz,
-                                              const cusparseMatDescr_t descrA,
-                                              const int *cscColPtrA,
-                                              int *cscRowIndA, int *P,
-                                              void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const cusparseMatDescr_t, const int *,
-      int *, int *, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseXcscsort");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnz, descrA, cscColPtrA, cscRowIndA, P,
-                  pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsru2csr_bufferSizeExt(
-    cusparseHandle_t handle, int m, int n, int nnz, float *csrVal,
-    const int *csrRowPtr, int *csrColInd, csru2csrInfo_t info,
-    size_t *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, float *, const int *, int *,
-      csru2csrInfo_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsru2csr_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnz, csrVal, csrRowPtr, csrColInd, info,
-                  pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsru2csr_bufferSizeExt(
-    cusparseHandle_t handle, int m, int n, int nnz, double *csrVal,
-    const int *csrRowPtr, int *csrColInd, csru2csrInfo_t info,
-    size_t *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, double *, const int *, int *,
-      csru2csrInfo_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsru2csr_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnz, csrVal, csrRowPtr, csrColInd, info,
-                  pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsru2csr_bufferSizeExt(
-    cusparseHandle_t handle, int m, int n, int nnz, cuComplex *csrVal,
-    const int *csrRowPtr, int *csrColInd, csru2csrInfo_t info,
-    size_t *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, cuComplex *, const int *, int *,
-      csru2csrInfo_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsru2csr_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnz, csrVal, csrRowPtr, csrColInd, info,
-                  pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsru2csr_bufferSizeExt(
-    cusparseHandle_t handle, int m, int n, int nnz, cuDoubleComplex *csrVal,
-    const int *csrRowPtr, int *csrColInd, csru2csrInfo_t info,
-    size_t *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, cuDoubleComplex *, const int *, int *,
-      csru2csrInfo_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsru2csr_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnz, csrVal, csrRowPtr, csrColInd, info,
-                  pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsru2csr(
-    cusparseHandle_t handle, int m, int n, int nnz,
-    const cusparseMatDescr_t descrA, float *csrVal, const int *csrRowPtr,
-    int *csrColInd, csru2csrInfo_t info, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const cusparseMatDescr_t, float *,
-      const int *, int *, csru2csrInfo_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsru2csr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnz, descrA, csrVal, csrRowPtr, csrColInd, info,
-                  pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsru2csr(
-    cusparseHandle_t handle, int m, int n, int nnz,
-    const cusparseMatDescr_t descrA, double *csrVal, const int *csrRowPtr,
-    int *csrColInd, csru2csrInfo_t info, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const cusparseMatDescr_t, double *,
-      const int *, int *, csru2csrInfo_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsru2csr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnz, descrA, csrVal, csrRowPtr, csrColInd, info,
-                  pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsru2csr(
-    cusparseHandle_t handle, int m, int n, int nnz,
-    const cusparseMatDescr_t descrA, cuComplex *csrVal, const int *csrRowPtr,
-    int *csrColInd, csru2csrInfo_t info, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const cusparseMatDescr_t, cuComplex *,
-      const int *, int *, csru2csrInfo_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsru2csr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnz, descrA, csrVal, csrRowPtr, csrColInd, info,
-                  pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsru2csr(
-    cusparseHandle_t handle, int m, int n, int nnz,
-    const cusparseMatDescr_t descrA, cuDoubleComplex *csrVal,
-    const int *csrRowPtr, int *csrColInd, csru2csrInfo_t info, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const cusparseMatDescr_t,
-      cuDoubleComplex *, const int *, int *, csru2csrInfo_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsru2csr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnz, descrA, csrVal, csrRowPtr, csrColInd, info,
-                  pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsr2csru(
-    cusparseHandle_t handle, int m, int n, int nnz,
-    const cusparseMatDescr_t descrA, float *csrVal, const int *csrRowPtr,
-    int *csrColInd, csru2csrInfo_t info, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const cusparseMatDescr_t, float *,
-      const int *, int *, csru2csrInfo_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsr2csru");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnz, descrA, csrVal, csrRowPtr, csrColInd, info,
-                  pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsr2csru(
-    cusparseHandle_t handle, int m, int n, int nnz,
-    const cusparseMatDescr_t descrA, double *csrVal, const int *csrRowPtr,
-    int *csrColInd, csru2csrInfo_t info, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const cusparseMatDescr_t, double *,
-      const int *, int *, csru2csrInfo_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsr2csru");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnz, descrA, csrVal, csrRowPtr, csrColInd, info,
-                  pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsr2csru(
-    cusparseHandle_t handle, int m, int n, int nnz,
-    const cusparseMatDescr_t descrA, cuComplex *csrVal, const int *csrRowPtr,
-    int *csrColInd, csru2csrInfo_t info, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const cusparseMatDescr_t, cuComplex *,
-      const int *, int *, csru2csrInfo_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsr2csru");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnz, descrA, csrVal, csrRowPtr, csrColInd, info,
-                  pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsr2csru(
-    cusparseHandle_t handle, int m, int n, int nnz,
-    const cusparseMatDescr_t descrA, cuDoubleComplex *csrVal,
-    const int *csrRowPtr, int *csrColInd, csru2csrInfo_t info, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const cusparseMatDescr_t,
-      cuDoubleComplex *, const int *, int *, csru2csrInfo_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsr2csru");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnz, descrA, csrVal, csrRowPtr, csrColInd, info,
-                  pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSpruneDense2csr_bufferSizeExt(
-    cusparseHandle_t handle, int m, int n, const float *A, int lda,
-    const float *threshold, const cusparseMatDescr_t descrC,
-    const float *csrSortedValC, const int *csrSortedRowPtrC,
-    const int *csrSortedColIndC, size_t *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const float *, int, const float *,
-      const cusparseMatDescr_t, const float *, const int *, const int *,
-      size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseSpruneDense2csr_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, A, lda, threshold, descrC, csrSortedValC,
-                  csrSortedRowPtrC, csrSortedColIndC, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDpruneDense2csr_bufferSizeExt(
-    cusparseHandle_t handle, int m, int n, const double *A, int lda,
-    const double *threshold, const cusparseMatDescr_t descrC,
-    const double *csrSortedValC, const int *csrSortedRowPtrC,
-    const int *csrSortedColIndC, size_t *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const double *, int, const double *,
-      const cusparseMatDescr_t, const double *, const int *, const int *,
-      size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseDpruneDense2csr_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, A, lda, threshold, descrC, csrSortedValC,
-                  csrSortedRowPtrC, csrSortedColIndC, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSpruneDense2csrNnz(
-    cusparseHandle_t handle, int m, int n, const float *A, int lda,
-    const float *threshold, const cusparseMatDescr_t descrC, int *csrRowPtrC,
-    int *nnzTotalDevHostPtr, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const float *, int, const float *,
-      const cusparseMatDescr_t, int *, int *, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSpruneDense2csrNnz");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, A, lda, threshold, descrC, csrRowPtrC,
-                  nnzTotalDevHostPtr, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDpruneDense2csrNnz(
-    cusparseHandle_t handle, int m, int n, const double *A, int lda,
-    const double *threshold, const cusparseMatDescr_t descrC,
-    int *csrSortedRowPtrC, int *nnzTotalDevHostPtr, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const double *, int, const double *,
-      const cusparseMatDescr_t, int *, int *, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDpruneDense2csrNnz");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, A, lda, threshold, descrC, csrSortedRowPtrC,
-                  nnzTotalDevHostPtr, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSpruneDense2csr(
-    cusparseHandle_t handle, int m, int n, const float *A, int lda,
-    const float *threshold, const cusparseMatDescr_t descrC,
-    float *csrSortedValC, const int *csrSortedRowPtrC, int *csrSortedColIndC,
-    void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const float *, int, const float *,
-      const cusparseMatDescr_t, float *, const int *, int *, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSpruneDense2csr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, A, lda, threshold, descrC, csrSortedValC,
-                  csrSortedRowPtrC, csrSortedColIndC, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDpruneDense2csr(
-    cusparseHandle_t handle, int m, int n, const double *A, int lda,
-    const double *threshold, const cusparseMatDescr_t descrC,
-    double *csrSortedValC, const int *csrSortedRowPtrC, int *csrSortedColIndC,
-    void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const double *, int, const double *,
-      const cusparseMatDescr_t, double *, const int *, int *, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDpruneDense2csr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, A, lda, threshold, descrC, csrSortedValC,
-                  csrSortedRowPtrC, csrSortedColIndC, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSpruneCsr2csr_bufferSizeExt(
-    cusparseHandle_t handle, int m, int n, int nnzA,
-    const cusparseMatDescr_t descrA, const float *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-    const float *threshold, const cusparseMatDescr_t descrC,
-    const float *csrSortedValC, const int *csrSortedRowPtrC,
-    const int *csrSortedColIndC, size_t *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const cusparseMatDescr_t, const float *,
-      const int *, const int *, const float *, const cusparseMatDescr_t,
-      const float *, const int *, const int *, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseSpruneCsr2csr_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnzA, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, threshold, descrC, csrSortedValC,
-                  csrSortedRowPtrC, csrSortedColIndC, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDpruneCsr2csr_bufferSizeExt(
-    cusparseHandle_t handle, int m, int n, int nnzA,
-    const cusparseMatDescr_t descrA, const double *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-    const double *threshold, const cusparseMatDescr_t descrC,
-    const double *csrSortedValC, const int *csrSortedRowPtrC,
-    const int *csrSortedColIndC, size_t *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const cusparseMatDescr_t, const double *,
-      const int *, const int *, const double *, const cusparseMatDescr_t,
-      const double *, const int *, const int *, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseDpruneCsr2csr_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnzA, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, threshold, descrC, csrSortedValC,
-                  csrSortedRowPtrC, csrSortedColIndC, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSpruneCsr2csrNnz(
-    cusparseHandle_t handle, int m, int n, int nnzA,
-    const cusparseMatDescr_t descrA, const float *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-    const float *threshold, const cusparseMatDescr_t descrC,
-    int *csrSortedRowPtrC, int *nnzTotalDevHostPtr, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const cusparseMatDescr_t, const float *,
-      const int *, const int *, const float *, const cusparseMatDescr_t, int *,
-      int *, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSpruneCsr2csrNnz");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnzA, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, threshold, descrC, csrSortedRowPtrC,
-                  nnzTotalDevHostPtr, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDpruneCsr2csrNnz(
-    cusparseHandle_t handle, int m, int n, int nnzA,
-    const cusparseMatDescr_t descrA, const double *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-    const double *threshold, const cusparseMatDescr_t descrC,
-    int *csrSortedRowPtrC, int *nnzTotalDevHostPtr, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const cusparseMatDescr_t, const double *,
-      const int *, const int *, const double *, const cusparseMatDescr_t, int *,
-      int *, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDpruneCsr2csrNnz");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnzA, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, threshold, descrC, csrSortedRowPtrC,
-                  nnzTotalDevHostPtr, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSpruneCsr2csr(
-    cusparseHandle_t handle, int m, int n, int nnzA,
-    const cusparseMatDescr_t descrA, const float *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-    const float *threshold, const cusparseMatDescr_t descrC,
-    float *csrSortedValC, const int *csrSortedRowPtrC, int *csrSortedColIndC,
-    void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const cusparseMatDescr_t, const float *,
-      const int *, const int *, const float *, const cusparseMatDescr_t,
-      float *, const int *, int *, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSpruneCsr2csr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnzA, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, threshold, descrC, csrSortedValC,
-                  csrSortedRowPtrC, csrSortedColIndC, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDpruneCsr2csr(
-    cusparseHandle_t handle, int m, int n, int nnzA,
-    const cusparseMatDescr_t descrA, const double *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-    const double *threshold, const cusparseMatDescr_t descrC,
-    double *csrSortedValC, const int *csrSortedRowPtrC, int *csrSortedColIndC,
-    void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const cusparseMatDescr_t, const double *,
-      const int *, const int *, const double *, const cusparseMatDescr_t,
-      double *, const int *, int *, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDpruneCsr2csr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnzA, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, threshold, descrC, csrSortedValC,
-                  csrSortedRowPtrC, csrSortedColIndC, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSpruneDense2csrByPercentage_bufferSizeExt(
-    cusparseHandle_t handle, int m, int n, const float *A, int lda,
-    float percentage, const cusparseMatDescr_t descrC,
-    const float *csrSortedValC, const int *csrSortedRowPtrC,
-    const int *csrSortedColIndC, pruneInfo_t info, size_t *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const float *, int, float,
-      const cusparseMatDescr_t, const float *, const int *, const int *,
-      pruneInfo_t, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseSpruneDense2csrByPercentage_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, A, lda, percentage, descrC, csrSortedValC,
-                  csrSortedRowPtrC, csrSortedColIndC, info, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDpruneDense2csrByPercentage_bufferSizeExt(
-    cusparseHandle_t handle, int m, int n, const double *A, int lda,
-    float percentage, const cusparseMatDescr_t descrC,
-    const double *csrSortedValC, const int *csrSortedRowPtrC,
-    const int *csrSortedColIndC, pruneInfo_t info, size_t *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const double *, int, float,
-      const cusparseMatDescr_t, const double *, const int *, const int *,
-      pruneInfo_t, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseDpruneDense2csrByPercentage_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, A, lda, percentage, descrC, csrSortedValC,
-                  csrSortedRowPtrC, csrSortedColIndC, info, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSpruneDense2csrNnzByPercentage(
-    cusparseHandle_t handle, int m, int n, const float *A, int lda,
-    float percentage, const cusparseMatDescr_t descrC, int *csrRowPtrC,
-    int *nnzTotalDevHostPtr, pruneInfo_t info, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const float *, int, float,
-      const cusparseMatDescr_t, int *, int *, pruneInfo_t, void *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseSpruneDense2csrNnzByPercentage");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, A, lda, percentage, descrC, csrRowPtrC,
-                  nnzTotalDevHostPtr, info, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDpruneDense2csrNnzByPercentage(
-    cusparseHandle_t handle, int m, int n, const double *A, int lda,
-    float percentage, const cusparseMatDescr_t descrC, int *csrRowPtrC,
-    int *nnzTotalDevHostPtr, pruneInfo_t info, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const double *, int, float,
-      const cusparseMatDescr_t, int *, int *, pruneInfo_t, void *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseDpruneDense2csrNnzByPercentage");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, A, lda, percentage, descrC, csrRowPtrC,
-                  nnzTotalDevHostPtr, info, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSpruneDense2csrByPercentage(
-    cusparseHandle_t handle, int m, int n, const float *A, int lda,
-    float percentage, const cusparseMatDescr_t descrC, float *csrSortedValC,
-    const int *csrSortedRowPtrC, int *csrSortedColIndC, pruneInfo_t info,
-    void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const float *, int, float,
-      const cusparseMatDescr_t, float *, const int *, int *, pruneInfo_t,
-      void *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseSpruneDense2csrByPercentage");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, A, lda, percentage, descrC, csrSortedValC,
-                  csrSortedRowPtrC, csrSortedColIndC, info, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDpruneDense2csrByPercentage(
-    cusparseHandle_t handle, int m, int n, const double *A, int lda,
-    float percentage, const cusparseMatDescr_t descrC, double *csrSortedValC,
-    const int *csrSortedRowPtrC, int *csrSortedColIndC, pruneInfo_t info,
-    void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const double *, int, float,
-      const cusparseMatDescr_t, double *, const int *, int *, pruneInfo_t,
-      void *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseDpruneDense2csrByPercentage");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, A, lda, percentage, descrC, csrSortedValC,
-                  csrSortedRowPtrC, csrSortedColIndC, info, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSpruneCsr2csrByPercentage_bufferSizeExt(
-    cusparseHandle_t handle, int m, int n, int nnzA,
-    const cusparseMatDescr_t descrA, const float *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA, float percentage,
-    const cusparseMatDescr_t descrC, const float *csrSortedValC,
-    const int *csrSortedRowPtrC, const int *csrSortedColIndC, pruneInfo_t info,
-    size_t *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const cusparseMatDescr_t, const float *,
-      const int *, const int *, float, const cusparseMatDescr_t, const float *,
-      const int *, const int *, pruneInfo_t, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseSpruneCsr2csrByPercentage_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnzA, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, percentage, descrC, csrSortedValC,
-                  csrSortedRowPtrC, csrSortedColIndC, info, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDpruneCsr2csrByPercentage_bufferSizeExt(
-    cusparseHandle_t handle, int m, int n, int nnzA,
-    const cusparseMatDescr_t descrA, const double *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA, float percentage,
-    const cusparseMatDescr_t descrC, const double *csrSortedValC,
-    const int *csrSortedRowPtrC, const int *csrSortedColIndC, pruneInfo_t info,
-    size_t *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const cusparseMatDescr_t, const double *,
-      const int *, const int *, float, const cusparseMatDescr_t, const double *,
-      const int *, const int *, pruneInfo_t, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseDpruneCsr2csrByPercentage_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnzA, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, percentage, descrC, csrSortedValC,
-                  csrSortedRowPtrC, csrSortedColIndC, info, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSpruneCsr2csrNnzByPercentage(
-    cusparseHandle_t handle, int m, int n, int nnzA,
-    const cusparseMatDescr_t descrA, const float *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA, float percentage,
-    const cusparseMatDescr_t descrC, int *csrSortedRowPtrC,
-    int *nnzTotalDevHostPtr, pruneInfo_t info, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const cusparseMatDescr_t, const float *,
-      const int *, const int *, float, const cusparseMatDescr_t, int *, int *,
-      pruneInfo_t, void *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseSpruneCsr2csrNnzByPercentage");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnzA, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, percentage, descrC, csrSortedRowPtrC,
-                  nnzTotalDevHostPtr, info, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDpruneCsr2csrNnzByPercentage(
-    cusparseHandle_t handle, int m, int n, int nnzA,
-    const cusparseMatDescr_t descrA, const double *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA, float percentage,
-    const cusparseMatDescr_t descrC, int *csrSortedRowPtrC,
-    int *nnzTotalDevHostPtr, pruneInfo_t info, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const cusparseMatDescr_t, const double *,
-      const int *, const int *, float, const cusparseMatDescr_t, int *, int *,
-      pruneInfo_t, void *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseDpruneCsr2csrNnzByPercentage");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnzA, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, percentage, descrC, csrSortedRowPtrC,
-                  nnzTotalDevHostPtr, info, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSpruneCsr2csrByPercentage(
-    cusparseHandle_t handle, int m, int n, int nnzA,
-    const cusparseMatDescr_t descrA, const float *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA, float percentage,
-    const cusparseMatDescr_t descrC, float *csrSortedValC,
-    const int *csrSortedRowPtrC, int *csrSortedColIndC, pruneInfo_t info,
-    void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const cusparseMatDescr_t, const float *,
-      const int *, const int *, float, const cusparseMatDescr_t, float *,
-      const int *, int *, pruneInfo_t, void *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseSpruneCsr2csrByPercentage");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnzA, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, percentage, descrC, csrSortedValC,
-                  csrSortedRowPtrC, csrSortedColIndC, info, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDpruneCsr2csrByPercentage(
-    cusparseHandle_t handle, int m, int n, int nnzA,
-    const cusparseMatDescr_t descrA, const double *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA, float percentage,
-    const cusparseMatDescr_t descrC, double *csrSortedValC,
-    const int *csrSortedRowPtrC, int *csrSortedColIndC, pruneInfo_t info,
-    void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const cusparseMatDescr_t, const double *,
-      const int *, const int *, float, const cusparseMatDescr_t, double *,
-      const int *, int *, pruneInfo_t, void *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseDpruneCsr2csrByPercentage");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnzA, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, percentage, descrC, csrSortedValC,
-                  csrSortedRowPtrC, csrSortedColIndC, info, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCsr2cscEx2(
-    cusparseHandle_t handle, int m, int n, int nnz, const void *csrVal,
-    const int *csrRowPtr, const int *csrColInd, void *cscVal, int *cscColPtr,
-    int *cscRowInd, cudaDataType valType, cusparseAction_t copyValues,
-    cusparseIndexBase_t idxBase, cusparseCsr2CscAlg_t alg, void *buffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const void *, const int *, const int *,
-      void *, int *, int *, cudaDataType, cusparseAction_t, cusparseIndexBase_t,
-      cusparseCsr2CscAlg_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCsr2cscEx2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnz, csrVal, csrRowPtr, csrColInd, cscVal,
-                  cscColPtr, cscRowInd, valType, copyValues, idxBase, alg,
-                  buffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCsr2cscEx2_bufferSize(
-    cusparseHandle_t handle, int m, int n, int nnz, const void *csrVal,
-    const int *csrRowPtr, const int *csrColInd, void *cscVal, int *cscColPtr,
-    int *cscRowInd, cudaDataType valType, cusparseAction_t copyValues,
-    cusparseIndexBase_t idxBase, cusparseCsr2CscAlg_t alg, size_t *bufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const void *, const int *, const int *,
-      void *, int *, int *, cudaDataType, cusparseAction_t, cusparseIndexBase_t,
-      cusparseCsr2CscAlg_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCsr2cscEx2_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnz, csrVal, csrRowPtr, csrColInd, cscVal,
-                  cscColPtr, cscRowInd, valType, copyValues, idxBase, alg,
-                  bufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseCreateSpVec(cusparseSpVecDescr_t *spVecDescr, int64_t size, int64_t nnz,
-                    void *indices, void *values, cusparseIndexType_t idxType,
-                    cusparseIndexBase_t idxBase, cudaDataType valueType) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseSpVecDescr_t *, int64_t, int64_t, void *, void *,
-      cusparseIndexType_t, cusparseIndexBase_t, cudaDataType);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCreateSpVec");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(spVecDescr, size, nnz, indices, values, idxType, idxBase,
-                  valueType);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCreateConstSpVec(
-    cusparseConstSpVecDescr_t *spVecDescr, int64_t size, int64_t nnz,
-    const void *indices, const void *values, cusparseIndexType_t idxType,
-    cusparseIndexBase_t idxBase, cudaDataType valueType) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseConstSpVecDescr_t *, int64_t, int64_t, const void *, const void *,
-      cusparseIndexType_t, cusparseIndexBase_t, cudaDataType);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCreateConstSpVec");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(spVecDescr, size, nnz, indices, values, idxType, idxBase,
-                  valueType);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseDestroySpVec(cusparseConstSpVecDescr_t spVecDescr) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(cusparseConstSpVecDescr_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDestroySpVec");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(spVecDescr);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSpVecGet(cusparseSpVecDescr_t spVecDescr,
-                                              int64_t *size, int64_t *nnz,
-                                              void **indices, void **values,
-                                              cusparseIndexType_t *idxType,
-                                              cusparseIndexBase_t *idxBase,
-                                              cudaDataType *valueType) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseSpVecDescr_t, int64_t *, int64_t *, void **, void **,
-      cusparseIndexType_t *, cusparseIndexBase_t *, cudaDataType *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSpVecGet");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(spVecDescr, size, nnz, indices, values, idxType, idxBase,
-                  valueType);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseConstSpVecGet(
-    cusparseConstSpVecDescr_t spVecDescr, int64_t *size, int64_t *nnz,
-    const void **indices, const void **values, cusparseIndexType_t *idxType,
-    cusparseIndexBase_t *idxBase, cudaDataType *valueType) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseConstSpVecDescr_t, int64_t *, int64_t *, const void **,
-      const void **, cusparseIndexType_t *, cusparseIndexBase_t *,
-      cudaDataType *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseConstSpVecGet");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(spVecDescr, size, nnz, indices, values, idxType, idxBase,
-                  valueType);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSpVecGetIndexBase(
-    cusparseConstSpVecDescr_t spVecDescr, cusparseIndexBase_t *idxBase) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(cusparseConstSpVecDescr_t,
-                                                  cusparseIndexBase_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSpVecGetIndexBase");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(spVecDescr, idxBase);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseSpVecGetValues(cusparseSpVecDescr_t spVecDescr, void **values) {
-  using FuncPtr =
-      cusparseStatus_t(CUSPARSEAPI *)(cusparseSpVecDescr_t, void **);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSpVecGetValues");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(spVecDescr, values);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseConstSpVecGetValues(
-    cusparseConstSpVecDescr_t spVecDescr, const void **values) {
-  using FuncPtr =
-      cusparseStatus_t(CUSPARSEAPI *)(cusparseConstSpVecDescr_t, const void **);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseConstSpVecGetValues");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(spVecDescr, values);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseSpVecSetValues(cusparseSpVecDescr_t spVecDescr, void *values) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(cusparseSpVecDescr_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSpVecSetValues");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(spVecDescr, values);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseCreateDnVec(cusparseDnVecDescr_t *dnVecDescr, int64_t size,
-                    void *values, cudaDataType valueType) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseDnVecDescr_t *, int64_t, void *, cudaDataType);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCreateDnVec");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dnVecDescr, size, values, valueType);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseCreateConstDnVec(cusparseConstDnVecDescr_t *dnVecDescr, int64_t size,
-                         const void *values, cudaDataType valueType) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseConstDnVecDescr_t *, int64_t, const void *, cudaDataType);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCreateConstDnVec");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dnVecDescr, size, values, valueType);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseDestroyDnVec(cusparseConstDnVecDescr_t dnVecDescr) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(cusparseConstDnVecDescr_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDestroyDnVec");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dnVecDescr);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDnVecGet(cusparseDnVecDescr_t dnVecDescr,
-                                              int64_t *size, void **values,
-                                              cudaDataType *valueType) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseDnVecDescr_t, int64_t *, void **, cudaDataType *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDnVecGet");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dnVecDescr, size, values, valueType);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseConstDnVecGet(cusparseConstDnVecDescr_t dnVecDescr, int64_t *size,
-                      const void **values, cudaDataType *valueType) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseConstDnVecDescr_t, int64_t *, const void **, cudaDataType *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseConstDnVecGet");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dnVecDescr, size, values, valueType);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseDnVecGetValues(cusparseDnVecDescr_t dnVecDescr, void **values) {
-  using FuncPtr =
-      cusparseStatus_t(CUSPARSEAPI *)(cusparseDnVecDescr_t, void **);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDnVecGetValues");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dnVecDescr, values);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseConstDnVecGetValues(
-    cusparseConstDnVecDescr_t dnVecDescr, const void **values) {
-  using FuncPtr =
-      cusparseStatus_t(CUSPARSEAPI *)(cusparseConstDnVecDescr_t, const void **);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseConstDnVecGetValues");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dnVecDescr, values);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseDnVecSetValues(cusparseDnVecDescr_t dnVecDescr, void *values) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(cusparseDnVecDescr_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDnVecSetValues");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dnVecDescr, values);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseDestroySpMat(cusparseConstSpMatDescr_t spMatDescr) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(cusparseConstSpMatDescr_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDestroySpMat");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(spMatDescr);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSpMatGetFormat(
-    cusparseConstSpMatDescr_t spMatDescr, cusparseFormat_t *format) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(cusparseConstSpMatDescr_t,
-                                                  cusparseFormat_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSpMatGetFormat");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(spMatDescr, format);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSpMatGetIndexBase(
-    cusparseConstSpMatDescr_t spMatDescr, cusparseIndexBase_t *idxBase) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(cusparseConstSpMatDescr_t,
-                                                  cusparseIndexBase_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSpMatGetIndexBase");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(spMatDescr, idxBase);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseSpMatGetValues(cusparseSpMatDescr_t spMatDescr, void **values) {
-  using FuncPtr =
-      cusparseStatus_t(CUSPARSEAPI *)(cusparseSpMatDescr_t, void **);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSpMatGetValues");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(spMatDescr, values);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseConstSpMatGetValues(
-    cusparseConstSpMatDescr_t spMatDescr, const void **values) {
-  using FuncPtr =
-      cusparseStatus_t(CUSPARSEAPI *)(cusparseConstSpMatDescr_t, const void **);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseConstSpMatGetValues");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(spMatDescr, values);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseSpMatSetValues(cusparseSpMatDescr_t spMatDescr, void *values) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(cusparseSpMatDescr_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSpMatSetValues");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(spMatDescr, values);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseSpMatGetSize(cusparseConstSpMatDescr_t spMatDescr, int64_t *rows,
-                     int64_t *cols, int64_t *nnz) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseConstSpMatDescr_t, int64_t *, int64_t *, int64_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSpMatGetSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(spMatDescr, rows, cols, nnz);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSpMatGetStridedBatch(
-    cusparseConstSpMatDescr_t spMatDescr, int *batchCount) {
-  using FuncPtr =
-      cusparseStatus_t(CUSPARSEAPI *)(cusparseConstSpMatDescr_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSpMatGetStridedBatch");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(spMatDescr, batchCount);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCooSetStridedBatch(
-    cusparseSpMatDescr_t spMatDescr, int batchCount, int64_t batchStride) {
-  using FuncPtr =
-      cusparseStatus_t(CUSPARSEAPI *)(cusparseSpMatDescr_t, int, int64_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCooSetStridedBatch");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(spMatDescr, batchCount, batchStride);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCsrSetStridedBatch(
-    cusparseSpMatDescr_t spMatDescr, int batchCount, int64_t offsetsBatchStride,
-    int64_t columnsValuesBatchStride) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(cusparseSpMatDescr_t, int,
-                                                  int64_t, int64_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCsrSetStridedBatch");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(spMatDescr, batchCount, offsetsBatchStride,
-                  columnsValuesBatchStride);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSpMatGetAttribute(
-    cusparseConstSpMatDescr_t spMatDescr, cusparseSpMatAttribute_t attribute,
-    void *data, size_t dataSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseConstSpMatDescr_t, cusparseSpMatAttribute_t, void *, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSpMatGetAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(spMatDescr, attribute, data, dataSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSpMatSetAttribute(
-    cusparseSpMatDescr_t spMatDescr, cusparseSpMatAttribute_t attribute,
-    void *data, size_t dataSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseSpMatDescr_t, cusparseSpMatAttribute_t, void *, size_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSpMatSetAttribute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(spMatDescr, attribute, data, dataSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCreateCsr(
-    cusparseSpMatDescr_t *spMatDescr, int64_t rows, int64_t cols, int64_t nnz,
-    void *csrRowOffsets, void *csrColInd, void *csrValues,
-    cusparseIndexType_t csrRowOffsetsType, cusparseIndexType_t csrColIndType,
-    cusparseIndexBase_t idxBase, cudaDataType valueType) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseSpMatDescr_t *, int64_t, int64_t, int64_t, void *, void *, void *,
-      cusparseIndexType_t, cusparseIndexType_t, cusparseIndexBase_t,
-      cudaDataType);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCreateCsr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(spMatDescr, rows, cols, nnz, csrRowOffsets, csrColInd,
-                  csrValues, csrRowOffsetsType, csrColIndType, idxBase,
-                  valueType);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCreateConstCsr(
-    cusparseConstSpMatDescr_t *spMatDescr, int64_t rows, int64_t cols,
-    int64_t nnz, const void *csrRowOffsets, const void *csrColInd,
-    const void *csrValues, cusparseIndexType_t csrRowOffsetsType,
-    cusparseIndexType_t csrColIndType, cusparseIndexBase_t idxBase,
-    cudaDataType valueType) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseConstSpMatDescr_t *, int64_t, int64_t, int64_t, const void *,
-      const void *, const void *, cusparseIndexType_t, cusparseIndexType_t,
-      cusparseIndexBase_t, cudaDataType);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCreateConstCsr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(spMatDescr, rows, cols, nnz, csrRowOffsets, csrColInd,
-                  csrValues, csrRowOffsetsType, csrColIndType, idxBase,
-                  valueType);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCreateCsc(
-    cusparseSpMatDescr_t *spMatDescr, int64_t rows, int64_t cols, int64_t nnz,
-    void *cscColOffsets, void *cscRowInd, void *cscValues,
-    cusparseIndexType_t cscColOffsetsType, cusparseIndexType_t cscRowIndType,
-    cusparseIndexBase_t idxBase, cudaDataType valueType) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseSpMatDescr_t *, int64_t, int64_t, int64_t, void *, void *, void *,
-      cusparseIndexType_t, cusparseIndexType_t, cusparseIndexBase_t,
-      cudaDataType);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCreateCsc");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(spMatDescr, rows, cols, nnz, cscColOffsets, cscRowInd,
-                  cscValues, cscColOffsetsType, cscRowIndType, idxBase,
-                  valueType);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCreateConstCsc(
-    cusparseConstSpMatDescr_t *spMatDescr, int64_t rows, int64_t cols,
-    int64_t nnz, const void *cscColOffsets, const void *cscRowInd,
-    const void *cscValues, cusparseIndexType_t cscColOffsetsType,
-    cusparseIndexType_t cscRowIndType, cusparseIndexBase_t idxBase,
-    cudaDataType valueType) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseConstSpMatDescr_t *, int64_t, int64_t, int64_t, const void *,
-      const void *, const void *, cusparseIndexType_t, cusparseIndexType_t,
-      cusparseIndexBase_t, cudaDataType);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCreateConstCsc");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(spMatDescr, rows, cols, nnz, cscColOffsets, cscRowInd,
-                  cscValues, cscColOffsetsType, cscRowIndType, idxBase,
-                  valueType);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCsrGet(
-    cusparseSpMatDescr_t spMatDescr, int64_t *rows, int64_t *cols, int64_t *nnz,
-    void **csrRowOffsets, void **csrColInd, void **csrValues,
-    cusparseIndexType_t *csrRowOffsetsType, cusparseIndexType_t *csrColIndType,
-    cusparseIndexBase_t *idxBase, cudaDataType *valueType) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseSpMatDescr_t, int64_t *, int64_t *, int64_t *, void **, void **,
-      void **, cusparseIndexType_t *, cusparseIndexType_t *,
-      cusparseIndexBase_t *, cudaDataType *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCsrGet");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(spMatDescr, rows, cols, nnz, csrRowOffsets, csrColInd,
-                  csrValues, csrRowOffsetsType, csrColIndType, idxBase,
-                  valueType);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseConstCsrGet(
-    cusparseConstSpMatDescr_t spMatDescr, int64_t *rows, int64_t *cols,
-    int64_t *nnz, const void **csrRowOffsets, const void **csrColInd,
-    const void **csrValues, cusparseIndexType_t *csrRowOffsetsType,
-    cusparseIndexType_t *csrColIndType, cusparseIndexBase_t *idxBase,
-    cudaDataType *valueType) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseConstSpMatDescr_t, int64_t *, int64_t *, int64_t *, const void **,
-      const void **, const void **, cusparseIndexType_t *,
-      cusparseIndexType_t *, cusparseIndexBase_t *, cudaDataType *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseConstCsrGet");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(spMatDescr, rows, cols, nnz, csrRowOffsets, csrColInd,
-                  csrValues, csrRowOffsetsType, csrColIndType, idxBase,
-                  valueType);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCscGet(
-    cusparseSpMatDescr_t spMatDescr, int64_t *rows, int64_t *cols, int64_t *nnz,
-    void **cscColOffsets, void **cscRowInd, void **cscValues,
-    cusparseIndexType_t *cscColOffsetsType, cusparseIndexType_t *cscRowIndType,
-    cusparseIndexBase_t *idxBase, cudaDataType *valueType) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseSpMatDescr_t, int64_t *, int64_t *, int64_t *, void **, void **,
-      void **, cusparseIndexType_t *, cusparseIndexType_t *,
-      cusparseIndexBase_t *, cudaDataType *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCscGet");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(spMatDescr, rows, cols, nnz, cscColOffsets, cscRowInd,
-                  cscValues, cscColOffsetsType, cscRowIndType, idxBase,
-                  valueType);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseConstCscGet(
-    cusparseConstSpMatDescr_t spMatDescr, int64_t *rows, int64_t *cols,
-    int64_t *nnz, const void **cscColOffsets, const void **cscRowInd,
-    const void **cscValues, cusparseIndexType_t *cscColOffsetsType,
-    cusparseIndexType_t *cscRowIndType, cusparseIndexBase_t *idxBase,
-    cudaDataType *valueType) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseConstSpMatDescr_t, int64_t *, int64_t *, int64_t *, const void **,
-      const void **, const void **, cusparseIndexType_t *,
-      cusparseIndexType_t *, cusparseIndexBase_t *, cudaDataType *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseConstCscGet");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(spMatDescr, rows, cols, nnz, cscColOffsets, cscRowInd,
-                  cscValues, cscColOffsetsType, cscRowIndType, idxBase,
-                  valueType);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseCsrSetPointers(cusparseSpMatDescr_t spMatDescr, void *csrRowOffsets,
-                       void *csrColInd, void *csrValues) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(cusparseSpMatDescr_t, void *,
-                                                  void *, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCsrSetPointers");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(spMatDescr, csrRowOffsets, csrColInd, csrValues);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseCscSetPointers(cusparseSpMatDescr_t spMatDescr, void *cscColOffsets,
-                       void *cscRowInd, void *cscValues) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(cusparseSpMatDescr_t, void *,
-                                                  void *, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCscSetPointers");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(spMatDescr, cscColOffsets, cscRowInd, cscValues);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCreateCoo(cusparseSpMatDescr_t *spMatDescr,
-                                               int64_t rows, int64_t cols,
-                                               int64_t nnz, void *cooRowInd,
-                                               void *cooColInd, void *cooValues,
-                                               cusparseIndexType_t cooIdxType,
-                                               cusparseIndexBase_t idxBase,
-                                               cudaDataType valueType) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseSpMatDescr_t *, int64_t, int64_t, int64_t, void *, void *, void *,
-      cusparseIndexType_t, cusparseIndexBase_t, cudaDataType);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCreateCoo");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(spMatDescr, rows, cols, nnz, cooRowInd, cooColInd, cooValues,
-                  cooIdxType, idxBase, valueType);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCreateConstCoo(
-    cusparseConstSpMatDescr_t *spMatDescr, int64_t rows, int64_t cols,
-    int64_t nnz, const void *cooRowInd, const void *cooColInd,
-    const void *cooValues, cusparseIndexType_t cooIdxType,
-    cusparseIndexBase_t idxBase, cudaDataType valueType) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseConstSpMatDescr_t *, int64_t, int64_t, int64_t, const void *,
-      const void *, const void *, cusparseIndexType_t, cusparseIndexBase_t,
-      cudaDataType);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCreateConstCoo");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(spMatDescr, rows, cols, nnz, cooRowInd, cooColInd, cooValues,
-                  cooIdxType, idxBase, valueType);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCooGet(
-    cusparseSpMatDescr_t spMatDescr, int64_t *rows, int64_t *cols, int64_t *nnz,
-    void **cooRowInd,  // COO row indices
-    void **cooColInd,  // COO column indices
-    void **cooValues,  // COO values
-    cusparseIndexType_t *idxType, cusparseIndexBase_t *idxBase,
-    cudaDataType *valueType) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseSpMatDescr_t, int64_t *, int64_t *, int64_t *,
-      void **,  // COO row indices
-      void **,  // COO column indices
-      void **,  // COO values
-      cusparseIndexType_t *, cusparseIndexBase_t *, cudaDataType *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCooGet");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(spMatDescr, rows, cols, nnz, cooRowInd, cooColInd, cooValues,
-                  idxType, idxBase, valueType);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseConstCooGet(cusparseConstSpMatDescr_t spMatDescr, int64_t *rows,
-                    int64_t *cols, int64_t *nnz,
-                    const void **cooRowInd,  // COO row indices
-                    const void **cooColInd,  // COO column indices
-                    const void **cooValues,  // COO values
-                    cusparseIndexType_t *idxType, cusparseIndexBase_t *idxBase,
-                    cudaDataType *valueType) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseConstSpMatDescr_t, int64_t *, int64_t *, int64_t *,
-      const void **,  // COO row indices
-      const void **,  // COO column indices
-      const void **,  // COO values
-      cusparseIndexType_t *, cusparseIndexBase_t *, cudaDataType *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseConstCooGet");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(spMatDescr, rows, cols, nnz, cooRowInd, cooColInd, cooValues,
-                  idxType, idxBase, valueType);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseCooSetPointers(cusparseSpMatDescr_t spMatDescr, void *cooRows,
-                       void *cooColumns, void *cooValues) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(cusparseSpMatDescr_t, void *,
-                                                  void *, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCooSetPointers");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(spMatDescr, cooRows, cooColumns, cooValues);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCreateBlockedEll(
-    cusparseSpMatDescr_t *spMatDescr, int64_t rows, int64_t cols,
-    int64_t ellBlockSize, int64_t ellCols, void *ellColInd, void *ellValue,
-    cusparseIndexType_t ellIdxType, cusparseIndexBase_t idxBase,
-    cudaDataType valueType) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseSpMatDescr_t *, int64_t, int64_t, int64_t, int64_t, void *,
-      void *, cusparseIndexType_t, cusparseIndexBase_t, cudaDataType);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCreateBlockedEll");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(spMatDescr, rows, cols, ellBlockSize, ellCols, ellColInd,
-                  ellValue, ellIdxType, idxBase, valueType);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCreateConstBlockedEll(
-    cusparseConstSpMatDescr_t *spMatDescr, int64_t rows, int64_t cols,
-    int64_t ellBlockSize, int64_t ellCols, const void *ellColInd,
-    const void *ellValue, cusparseIndexType_t ellIdxType,
-    cusparseIndexBase_t idxBase, cudaDataType valueType) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseConstSpMatDescr_t *, int64_t, int64_t, int64_t, int64_t,
-      const void *, const void *, cusparseIndexType_t, cusparseIndexBase_t,
-      cudaDataType);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCreateConstBlockedEll");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(spMatDescr, rows, cols, ellBlockSize, ellCols, ellColInd,
-                  ellValue, ellIdxType, idxBase, valueType);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseBlockedEllGet(
-    cusparseSpMatDescr_t spMatDescr, int64_t *rows, int64_t *cols,
-    int64_t *ellBlockSize, int64_t *ellCols, void **ellColInd, void **ellValue,
-    cusparseIndexType_t *ellIdxType, cusparseIndexBase_t *idxBase,
-    cudaDataType *valueType) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseSpMatDescr_t, int64_t *, int64_t *, int64_t *, int64_t *, void **,
-      void **, cusparseIndexType_t *, cusparseIndexBase_t *, cudaDataType *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseBlockedEllGet");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(spMatDescr, rows, cols, ellBlockSize, ellCols, ellColInd,
-                  ellValue, ellIdxType, idxBase, valueType);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseConstBlockedEllGet(
-    cusparseConstSpMatDescr_t spMatDescr, int64_t *rows, int64_t *cols,
-    int64_t *ellBlockSize, int64_t *ellCols, const void **ellColInd,
-    const void **ellValue, cusparseIndexType_t *ellIdxType,
-    cusparseIndexBase_t *idxBase, cudaDataType *valueType) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseConstSpMatDescr_t, int64_t *, int64_t *, int64_t *, int64_t *,
-      const void **, const void **, cusparseIndexType_t *,
-      cusparseIndexBase_t *, cudaDataType *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseConstBlockedEllGet");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(spMatDescr, rows, cols, ellBlockSize, ellCols, ellColInd,
-                  ellValue, ellIdxType, idxBase, valueType);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCreateDnMat(
-    cusparseDnMatDescr_t *dnMatDescr, int64_t rows, int64_t cols, int64_t ld,
-    void *values, cudaDataType valueType, cusparseOrder_t order) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseDnMatDescr_t *, int64_t, int64_t, int64_t, void *, cudaDataType,
-      cusparseOrder_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCreateDnMat");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dnMatDescr, rows, cols, ld, values, valueType, order);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseCreateConstDnMat(cusparseConstDnMatDescr_t *dnMatDescr, int64_t rows,
-                         int64_t cols, int64_t ld, const void *values,
-                         cudaDataType valueType, cusparseOrder_t order) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseConstDnMatDescr_t *, int64_t, int64_t, int64_t, const void *,
-      cudaDataType, cusparseOrder_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCreateConstDnMat");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dnMatDescr, rows, cols, ld, values, valueType, order);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseDestroyDnMat(cusparseConstDnMatDescr_t dnMatDescr) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(cusparseConstDnMatDescr_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDestroyDnMat");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dnMatDescr);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDnMatGet(cusparseDnMatDescr_t dnMatDescr,
-                                              int64_t *rows, int64_t *cols,
-                                              int64_t *ld, void **values,
-                                              cudaDataType *type,
-                                              cusparseOrder_t *order) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseDnMatDescr_t, int64_t *, int64_t *, int64_t *, void **,
-      cudaDataType *, cusparseOrder_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDnMatGet");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dnMatDescr, rows, cols, ld, values, type, order);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseConstDnMatGet(cusparseConstDnMatDescr_t dnMatDescr, int64_t *rows,
-                      int64_t *cols, int64_t *ld, const void **values,
-                      cudaDataType *type, cusparseOrder_t *order) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseConstDnMatDescr_t, int64_t *, int64_t *, int64_t *, const void **,
-      cudaDataType *, cusparseOrder_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseConstDnMatGet");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dnMatDescr, rows, cols, ld, values, type, order);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseDnMatGetValues(cusparseDnMatDescr_t dnMatDescr, void **values) {
-  using FuncPtr =
-      cusparseStatus_t(CUSPARSEAPI *)(cusparseDnMatDescr_t, void **);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDnMatGetValues");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dnMatDescr, values);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseConstDnMatGetValues(
-    cusparseConstDnMatDescr_t dnMatDescr, const void **values) {
-  using FuncPtr =
-      cusparseStatus_t(CUSPARSEAPI *)(cusparseConstDnMatDescr_t, const void **);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseConstDnMatGetValues");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dnMatDescr, values);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseDnMatSetValues(cusparseDnMatDescr_t dnMatDescr, void *values) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(cusparseDnMatDescr_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDnMatSetValues");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dnMatDescr, values);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDnMatSetStridedBatch(
-    cusparseDnMatDescr_t dnMatDescr, int batchCount, int64_t batchStride) {
-  using FuncPtr =
-      cusparseStatus_t(CUSPARSEAPI *)(cusparseDnMatDescr_t, int, int64_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDnMatSetStridedBatch");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dnMatDescr, batchCount, batchStride);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseDnMatGetStridedBatch(cusparseConstDnMatDescr_t dnMatDescr,
-                             int *batchCount, int64_t *batchStride) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(cusparseConstDnMatDescr_t,
-                                                  int *, int64_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDnMatGetStridedBatch");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dnMatDescr, batchCount, batchStride);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseAxpby(cusparseHandle_t handle,
-                                           const void *alpha,
-                                           cusparseConstSpVecDescr_t vecX,
-                                           const void *beta,
-                                           cusparseDnVecDescr_t vecY) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, const void *, cusparseConstSpVecDescr_t, const void *,
-      cusparseDnVecDescr_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseAxpby");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, alpha, vecX, beta, vecY);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseGather(cusparseHandle_t handle,
-                                            cusparseConstDnVecDescr_t vecY,
-                                            cusparseSpVecDescr_t vecX) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseConstDnVecDescr_t, cusparseSpVecDescr_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseGather");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, vecY, vecX);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScatter(cusparseHandle_t handle,
-                                             cusparseConstSpVecDescr_t vecX,
-                                             cusparseDnVecDescr_t vecY) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseConstSpVecDescr_t, cusparseDnVecDescr_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScatter");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, vecX, vecY);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseRot(cusparseHandle_t handle,
-                                         const void *c_coeff,
-                                         const void *s_coeff,
-                                         cusparseSpVecDescr_t vecX,
-                                         cusparseDnVecDescr_t vecY) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, const void *, const void *, cusparseSpVecDescr_t,
-      cusparseDnVecDescr_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseRot");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, c_coeff, s_coeff, vecX, vecY);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSpVV_bufferSize(
-    cusparseHandle_t handle, cusparseOperation_t opX,
-    cusparseConstSpVecDescr_t vecX, cusparseConstDnVecDescr_t vecY,
-    const void *result, cudaDataType computeType, size_t *bufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, cusparseConstSpVecDescr_t,
-      cusparseConstDnVecDescr_t, const void *, cudaDataType, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSpVV_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, opX, vecX, vecY, result, computeType, bufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseSpVV(cusparseHandle_t handle, cusparseOperation_t opX,
-             cusparseConstSpVecDescr_t vecX, cusparseConstDnVecDescr_t vecY,
-             void *result, cudaDataType computeType, void *externalBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, cusparseConstSpVecDescr_t,
-      cusparseConstDnVecDescr_t, void *, cudaDataType, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSpVV");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, opX, vecX, vecY, result, computeType, externalBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSparseToDense_bufferSize(
-    cusparseHandle_t handle, cusparseConstSpMatDescr_t matA,
-    cusparseDnMatDescr_t matB, cusparseSparseToDenseAlg_t alg,
-    size_t *bufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseConstSpMatDescr_t, cusparseDnMatDescr_t,
-      cusparseSparseToDenseAlg_t, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseSparseToDense_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, matA, matB, alg, bufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseSparseToDense(cusparseHandle_t handle, cusparseConstSpMatDescr_t matA,
-                      cusparseDnMatDescr_t matB, cusparseSparseToDenseAlg_t alg,
-                      void *externalBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseConstSpMatDescr_t, cusparseDnMatDescr_t,
-      cusparseSparseToDenseAlg_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSparseToDense");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, matA, matB, alg, externalBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDenseToSparse_bufferSize(
-    cusparseHandle_t handle, cusparseConstDnMatDescr_t matA,
-    cusparseSpMatDescr_t matB, cusparseDenseToSparseAlg_t alg,
-    size_t *bufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseConstDnMatDescr_t, cusparseSpMatDescr_t,
-      cusparseDenseToSparseAlg_t, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseDenseToSparse_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, matA, matB, alg, bufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDenseToSparse_analysis(
-    cusparseHandle_t handle, cusparseConstDnMatDescr_t matA,
-    cusparseSpMatDescr_t matB, cusparseDenseToSparseAlg_t alg,
-    void *externalBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseConstDnMatDescr_t, cusparseSpMatDescr_t,
-      cusparseDenseToSparseAlg_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDenseToSparse_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, matA, matB, alg, externalBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDenseToSparse_convert(
-    cusparseHandle_t handle, cusparseConstDnMatDescr_t matA,
-    cusparseSpMatDescr_t matB, cusparseDenseToSparseAlg_t alg,
-    void *externalBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseConstDnMatDescr_t, cusparseSpMatDescr_t,
-      cusparseDenseToSparseAlg_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDenseToSparse_convert");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, matA, matB, alg, externalBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSpMV(
-    cusparseHandle_t handle, cusparseOperation_t opA, const void *alpha,
-    cusparseConstSpMatDescr_t matA, cusparseConstDnVecDescr_t vecX,
-    const void *beta, cusparseDnVecDescr_t vecY, cudaDataType computeType,
-    cusparseSpMVAlg_t alg, void *externalBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, const void *,
-      cusparseConstSpMatDescr_t, cusparseConstDnVecDescr_t, const void *,
-      cusparseDnVecDescr_t, cudaDataType, cusparseSpMVAlg_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSpMV");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, opA, alpha, matA, vecX, beta, vecY, computeType, alg,
-                  externalBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSpMV_bufferSize(
-    cusparseHandle_t handle, cusparseOperation_t opA, const void *alpha,
-    cusparseConstSpMatDescr_t matA, cusparseConstDnVecDescr_t vecX,
-    const void *beta, cusparseDnVecDescr_t vecY, cudaDataType computeType,
-    cusparseSpMVAlg_t alg, size_t *bufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, const void *,
-      cusparseConstSpMatDescr_t, cusparseConstDnVecDescr_t, const void *,
-      cusparseDnVecDescr_t, cudaDataType, cusparseSpMVAlg_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSpMV_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, opA, alpha, matA, vecX, beta, vecY, computeType, alg,
-                  bufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseSpSV_createDescr(cusparseSpSVDescr_t *descr) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(cusparseSpSVDescr_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSpSV_createDescr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(descr);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseSpSV_destroyDescr(cusparseSpSVDescr_t descr) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(cusparseSpSVDescr_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSpSV_destroyDescr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(descr);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSpSV_bufferSize(
-    cusparseHandle_t handle, cusparseOperation_t opA, const void *alpha,
-    cusparseConstSpMatDescr_t matA, cusparseConstDnVecDescr_t vecX,
-    cusparseDnVecDescr_t vecY, cudaDataType computeType, cusparseSpSVAlg_t alg,
-    cusparseSpSVDescr_t spsvDescr, size_t *bufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, const void *,
-      cusparseConstSpMatDescr_t, cusparseConstDnVecDescr_t,
-      cusparseDnVecDescr_t, cudaDataType, cusparseSpSVAlg_t,
-      cusparseSpSVDescr_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSpSV_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, opA, alpha, matA, vecX, vecY, computeType, alg,
-                  spsvDescr, bufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSpSV_analysis(
-    cusparseHandle_t handle, cusparseOperation_t opA, const void *alpha,
-    cusparseConstSpMatDescr_t matA, cusparseConstDnVecDescr_t vecX,
-    cusparseDnVecDescr_t vecY, cudaDataType computeType, cusparseSpSVAlg_t alg,
-    cusparseSpSVDescr_t spsvDescr, void *externalBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, const void *,
-      cusparseConstSpMatDescr_t, cusparseConstDnVecDescr_t,
-      cusparseDnVecDescr_t, cudaDataType, cusparseSpSVAlg_t,
-      cusparseSpSVDescr_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSpSV_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, opA, alpha, matA, vecX, vecY, computeType, alg,
-                  spsvDescr, externalBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSpSV_solve(
-    cusparseHandle_t handle, cusparseOperation_t opA, const void *alpha,
-    cusparseConstSpMatDescr_t matA, cusparseConstDnVecDescr_t vecX,
-    cusparseDnVecDescr_t vecY, cudaDataType computeType, cusparseSpSVAlg_t alg,
-    cusparseSpSVDescr_t spsvDescr) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, const void *,
-      cusparseConstSpMatDescr_t, cusparseConstDnVecDescr_t,
-      cusparseDnVecDescr_t, cudaDataType, cusparseSpSVAlg_t,
-      cusparseSpSVDescr_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSpSV_solve");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, opA, alpha, matA, vecX, vecY, computeType, alg,
-                  spsvDescr);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseSpSM_createDescr(cusparseSpSMDescr_t *descr) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(cusparseSpSMDescr_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSpSM_createDescr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(descr);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseSpSM_destroyDescr(cusparseSpSMDescr_t descr) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(cusparseSpSMDescr_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSpSM_destroyDescr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(descr);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSpSM_bufferSize(
-    cusparseHandle_t handle, cusparseOperation_t opA, cusparseOperation_t opB,
-    const void *alpha, cusparseConstSpMatDescr_t matA,
-    cusparseConstDnMatDescr_t matB, cusparseDnMatDescr_t matC,
-    cudaDataType computeType, cusparseSpSMAlg_t alg,
-    cusparseSpSMDescr_t spsmDescr, size_t *bufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, cusparseOperation_t, const void *,
-      cusparseConstSpMatDescr_t, cusparseConstDnMatDescr_t,
-      cusparseDnMatDescr_t, cudaDataType, cusparseSpSMAlg_t,
-      cusparseSpSMDescr_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSpSM_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, opA, opB, alpha, matA, matB, matC, computeType, alg,
-                  spsmDescr, bufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSpSM_analysis(
-    cusparseHandle_t handle, cusparseOperation_t opA, cusparseOperation_t opB,
-    const void *alpha, cusparseConstSpMatDescr_t matA,
-    cusparseConstDnMatDescr_t matB, cusparseDnMatDescr_t matC,
-    cudaDataType computeType, cusparseSpSMAlg_t alg,
-    cusparseSpSMDescr_t spsmDescr, void *externalBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, cusparseOperation_t, const void *,
-      cusparseConstSpMatDescr_t, cusparseConstDnMatDescr_t,
-      cusparseDnMatDescr_t, cudaDataType, cusparseSpSMAlg_t,
-      cusparseSpSMDescr_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSpSM_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, opA, opB, alpha, matA, matB, matC, computeType, alg,
-                  spsmDescr, externalBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSpSM_solve(
-    cusparseHandle_t handle, cusparseOperation_t opA, cusparseOperation_t opB,
-    const void *alpha, cusparseConstSpMatDescr_t matA,
-    cusparseConstDnMatDescr_t matB, cusparseDnMatDescr_t matC,
-    cudaDataType computeType, cusparseSpSMAlg_t alg,
-    cusparseSpSMDescr_t spsmDescr) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, cusparseOperation_t, const void *,
-      cusparseConstSpMatDescr_t, cusparseConstDnMatDescr_t,
-      cusparseDnMatDescr_t, cudaDataType, cusparseSpSMAlg_t,
-      cusparseSpSMDescr_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSpSM_solve");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, opA, opB, alpha, matA, matB, matC, computeType, alg,
-                  spsmDescr);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSpMM_bufferSize(
-    cusparseHandle_t handle, cusparseOperation_t opA, cusparseOperation_t opB,
-    const void *alpha, cusparseConstSpMatDescr_t matA,
-    cusparseConstDnMatDescr_t matB, const void *beta, cusparseDnMatDescr_t matC,
-    cudaDataType computeType, cusparseSpMMAlg_t alg, size_t *bufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, cusparseOperation_t, const void *,
-      cusparseConstSpMatDescr_t, cusparseConstDnMatDescr_t, const void *,
-      cusparseDnMatDescr_t, cudaDataType, cusparseSpMMAlg_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSpMM_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, opA, opB, alpha, matA, matB, beta, matC, computeType,
-                  alg, bufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSpMM_preprocess(
-    cusparseHandle_t handle, cusparseOperation_t opA, cusparseOperation_t opB,
-    const void *alpha, cusparseConstSpMatDescr_t matA,
-    cusparseConstDnMatDescr_t matB, const void *beta, cusparseDnMatDescr_t matC,
-    cudaDataType computeType, cusparseSpMMAlg_t alg, void *externalBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, cusparseOperation_t, const void *,
-      cusparseConstSpMatDescr_t, cusparseConstDnMatDescr_t, const void *,
-      cusparseDnMatDescr_t, cudaDataType, cusparseSpMMAlg_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSpMM_preprocess");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, opA, opB, alpha, matA, matB, beta, matC, computeType,
-                  alg, externalBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSpMM(
-    cusparseHandle_t handle, cusparseOperation_t opA, cusparseOperation_t opB,
-    const void *alpha, cusparseConstSpMatDescr_t matA,
-    cusparseConstDnMatDescr_t matB, const void *beta, cusparseDnMatDescr_t matC,
-    cudaDataType computeType, cusparseSpMMAlg_t alg, void *externalBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, cusparseOperation_t, const void *,
-      cusparseConstSpMatDescr_t, cusparseConstDnMatDescr_t, const void *,
-      cusparseDnMatDescr_t, cudaDataType, cusparseSpMMAlg_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSpMM");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, opA, opB, alpha, matA, matB, beta, matC, computeType,
-                  alg, externalBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseSpGEMM_createDescr(cusparseSpGEMMDescr_t *descr) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(cusparseSpGEMMDescr_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSpGEMM_createDescr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(descr);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseSpGEMM_destroyDescr(cusparseSpGEMMDescr_t descr) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(cusparseSpGEMMDescr_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSpGEMM_destroyDescr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(descr);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSpGEMM_workEstimation(
-    cusparseHandle_t handle, cusparseOperation_t opA, cusparseOperation_t opB,
-    const void *alpha, cusparseConstSpMatDescr_t matA,
-    cusparseConstSpMatDescr_t matB, const void *beta, cusparseSpMatDescr_t matC,
-    cudaDataType computeType, cusparseSpGEMMAlg_t alg,
-    cusparseSpGEMMDescr_t spgemmDescr, size_t *bufferSize1,
-    void *externalBuffer1) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, cusparseOperation_t, const void *,
-      cusparseConstSpMatDescr_t, cusparseConstSpMatDescr_t, const void *,
-      cusparseSpMatDescr_t, cudaDataType, cusparseSpGEMMAlg_t,
-      cusparseSpGEMMDescr_t, size_t *, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSpGEMM_workEstimation");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, opA, opB, alpha, matA, matB, beta, matC, computeType,
-                  alg, spgemmDescr, bufferSize1, externalBuffer1);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSpGEMM_getNumProducts(
-    cusparseSpGEMMDescr_t spgemmDescr, int64_t *num_prods) {
-  using FuncPtr =
-      cusparseStatus_t(CUSPARSEAPI *)(cusparseSpGEMMDescr_t, int64_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSpGEMM_getNumProducts");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(spgemmDescr, num_prods);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSpGEMM_estimateMemory(
-    cusparseHandle_t handle, cusparseOperation_t opA, cusparseOperation_t opB,
-    const void *alpha, cusparseConstSpMatDescr_t matA,
-    cusparseConstSpMatDescr_t matB, const void *beta, cusparseSpMatDescr_t matC,
-    cudaDataType computeType, cusparseSpGEMMAlg_t alg,
-    cusparseSpGEMMDescr_t spgemmDescr, float chunk_fraction,
-    size_t *bufferSize3, void *externalBuffer3, size_t *bufferSize2) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, cusparseOperation_t, const void *,
-      cusparseConstSpMatDescr_t, cusparseConstSpMatDescr_t, const void *,
-      cusparseSpMatDescr_t, cudaDataType, cusparseSpGEMMAlg_t,
-      cusparseSpGEMMDescr_t, float, size_t *, void *, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSpGEMM_estimateMemory");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, opA, opB, alpha, matA, matB, beta, matC, computeType,
-                  alg, spgemmDescr, chunk_fraction, bufferSize3,
-                  externalBuffer3, bufferSize2);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSpGEMM_compute(
-    cusparseHandle_t handle, cusparseOperation_t opA, cusparseOperation_t opB,
-    const void *alpha, cusparseConstSpMatDescr_t matA,
-    cusparseConstSpMatDescr_t matB, const void *beta, cusparseSpMatDescr_t matC,
-    cudaDataType computeType, cusparseSpGEMMAlg_t alg,
-    cusparseSpGEMMDescr_t spgemmDescr, size_t *bufferSize2,
-    void *externalBuffer2) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, cusparseOperation_t, const void *,
-      cusparseConstSpMatDescr_t, cusparseConstSpMatDescr_t, const void *,
-      cusparseSpMatDescr_t, cudaDataType, cusparseSpGEMMAlg_t,
-      cusparseSpGEMMDescr_t, size_t *, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSpGEMM_compute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, opA, opB, alpha, matA, matB, beta, matC, computeType,
-                  alg, spgemmDescr, bufferSize2, externalBuffer2);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSpGEMM_copy(
-    cusparseHandle_t handle, cusparseOperation_t opA, cusparseOperation_t opB,
-    const void *alpha, cusparseConstSpMatDescr_t matA,
-    cusparseConstSpMatDescr_t matB, const void *beta, cusparseSpMatDescr_t matC,
-    cudaDataType computeType, cusparseSpGEMMAlg_t alg,
-    cusparseSpGEMMDescr_t spgemmDescr) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, cusparseOperation_t, const void *,
-      cusparseConstSpMatDescr_t, cusparseConstSpMatDescr_t, const void *,
-      cusparseSpMatDescr_t, cudaDataType, cusparseSpGEMMAlg_t,
-      cusparseSpGEMMDescr_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSpGEMM_copy");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, opA, opB, alpha, matA, matB, beta, matC, computeType,
-                  alg, spgemmDescr);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSpGEMMreuse_workEstimation(
-    cusparseHandle_t handle, cusparseOperation_t opA, cusparseOperation_t opB,
-    cusparseConstSpMatDescr_t matA, cusparseConstSpMatDescr_t matB,
-    cusparseSpMatDescr_t matC, cusparseSpGEMMAlg_t alg,
-    cusparseSpGEMMDescr_t spgemmDescr, size_t *bufferSize1,
-    void *externalBuffer1) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, cusparseOperation_t,
-      cusparseConstSpMatDescr_t, cusparseConstSpMatDescr_t,
-      cusparseSpMatDescr_t, cusparseSpGEMMAlg_t, cusparseSpGEMMDescr_t,
-      size_t *, void *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseSpGEMMreuse_workEstimation");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, opA, opB, matA, matB, matC, alg, spgemmDescr,
-                  bufferSize1, externalBuffer1);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSpGEMMreuse_nnz(
-    cusparseHandle_t handle, cusparseOperation_t opA, cusparseOperation_t opB,
-    cusparseConstSpMatDescr_t matA, cusparseConstSpMatDescr_t matB,
-    cusparseSpMatDescr_t matC, cusparseSpGEMMAlg_t alg,
-    cusparseSpGEMMDescr_t spgemmDescr, size_t *bufferSize2,
-    void *externalBuffer2, size_t *bufferSize3, void *externalBuffer3,
-    size_t *bufferSize4, void *externalBuffer4) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, cusparseOperation_t,
-      cusparseConstSpMatDescr_t, cusparseConstSpMatDescr_t,
-      cusparseSpMatDescr_t, cusparseSpGEMMAlg_t, cusparseSpGEMMDescr_t,
-      size_t *, void *, size_t *, void *, size_t *, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSpGEMMreuse_nnz");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, opA, opB, matA, matB, matC, alg, spgemmDescr,
-                  bufferSize2, externalBuffer2, bufferSize3, externalBuffer3,
-                  bufferSize4, externalBuffer4);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSpGEMMreuse_copy(
-    cusparseHandle_t handle, cusparseOperation_t opA, cusparseOperation_t opB,
-    cusparseConstSpMatDescr_t matA, cusparseConstSpMatDescr_t matB,
-    cusparseSpMatDescr_t matC, cusparseSpGEMMAlg_t alg,
-    cusparseSpGEMMDescr_t spgemmDescr, size_t *bufferSize5,
-    void *externalBuffer5) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, cusparseOperation_t,
-      cusparseConstSpMatDescr_t, cusparseConstSpMatDescr_t,
-      cusparseSpMatDescr_t, cusparseSpGEMMAlg_t, cusparseSpGEMMDescr_t,
-      size_t *, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSpGEMMreuse_copy");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, opA, opB, matA, matB, matC, alg, spgemmDescr,
-                  bufferSize5, externalBuffer5);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSpGEMMreuse_compute(
-    cusparseHandle_t handle, cusparseOperation_t opA, cusparseOperation_t opB,
-    const void *alpha, cusparseConstSpMatDescr_t matA,
-    cusparseConstSpMatDescr_t matB, const void *beta, cusparseSpMatDescr_t matC,
-    cudaDataType computeType, cusparseSpGEMMAlg_t alg,
-    cusparseSpGEMMDescr_t spgemmDescr) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, cusparseOperation_t, const void *,
-      cusparseConstSpMatDescr_t, cusparseConstSpMatDescr_t, const void *,
-      cusparseSpMatDescr_t, cudaDataType, cusparseSpGEMMAlg_t,
-      cusparseSpGEMMDescr_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSpGEMMreuse_compute");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, opA, opB, alpha, matA, matB, beta, matC, computeType,
-                  alg, spgemmDescr);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSDDMM_bufferSize(
-    cusparseHandle_t handle, cusparseOperation_t opA, cusparseOperation_t opB,
-    const void *alpha, cusparseConstDnMatDescr_t matA,
-    cusparseConstDnMatDescr_t matB, const void *beta, cusparseSpMatDescr_t matC,
-    cudaDataType computeType, cusparseSDDMMAlg_t alg, size_t *bufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, cusparseOperation_t, const void *,
-      cusparseConstDnMatDescr_t, cusparseConstDnMatDescr_t, const void *,
-      cusparseSpMatDescr_t, cudaDataType, cusparseSDDMMAlg_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSDDMM_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, opA, opB, alpha, matA, matB, beta, matC, computeType,
-                  alg, bufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSDDMM_preprocess(
-    cusparseHandle_t handle, cusparseOperation_t opA, cusparseOperation_t opB,
-    const void *alpha, cusparseConstDnMatDescr_t matA,
-    cusparseConstDnMatDescr_t matB, const void *beta, cusparseSpMatDescr_t matC,
-    cudaDataType computeType, cusparseSDDMMAlg_t alg, void *externalBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, cusparseOperation_t, const void *,
-      cusparseConstDnMatDescr_t, cusparseConstDnMatDescr_t, const void *,
-      cusparseSpMatDescr_t, cudaDataType, cusparseSDDMMAlg_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSDDMM_preprocess");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, opA, opB, alpha, matA, matB, beta, matC, computeType,
-                  alg, externalBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSDDMM(
-    cusparseHandle_t handle, cusparseOperation_t opA, cusparseOperation_t opB,
-    const void *alpha, cusparseConstDnMatDescr_t matA,
-    cusparseConstDnMatDescr_t matB, const void *beta, cusparseSpMatDescr_t matC,
-    cudaDataType computeType, cusparseSDDMMAlg_t alg, void *externalBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, cusparseOperation_t, const void *,
-      cusparseConstDnMatDescr_t, cusparseConstDnMatDescr_t, const void *,
-      cusparseSpMatDescr_t, cudaDataType, cusparseSDDMMAlg_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSDDMM");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, opA, opB, alpha, matA, matB, beta, matC, computeType,
-                  alg, externalBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSpMMOp_createPlan(
-    cusparseHandle_t handle, cusparseSpMMOpPlan_t *plan,
-    cusparseOperation_t opA, cusparseOperation_t opB,
-    cusparseConstSpMatDescr_t matA, cusparseConstDnMatDescr_t matB,
-    cusparseDnMatDescr_t matC, cudaDataType computeType,
-    cusparseSpMMOpAlg_t alg, const void *addOperationNvvmBuffer,
-    size_t addOperationBufferSize, const void *mulOperationNvvmBuffer,
-    size_t mulOperationBufferSize, const void *epilogueNvvmBuffer,
-    size_t epilogueBufferSize, size_t *SpMMWorkspaceSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseSpMMOpPlan_t *, cusparseOperation_t,
-      cusparseOperation_t, cusparseConstSpMatDescr_t, cusparseConstDnMatDescr_t,
-      cusparseDnMatDescr_t, cudaDataType, cusparseSpMMOpAlg_t, const void *,
-      size_t, const void *, size_t, const void *, size_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSpMMOp_createPlan");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, plan, opA, opB, matA, matB, matC, computeType, alg,
-                  addOperationNvvmBuffer, addOperationBufferSize,
-                  mulOperationNvvmBuffer, mulOperationBufferSize,
-                  epilogueNvvmBuffer, epilogueBufferSize, SpMMWorkspaceSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSpMMOp(cusparseSpMMOpPlan_t plan,
-                                            void *externalBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(cusparseSpMMOpPlan_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSpMMOp");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(plan, externalBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseSpMMOp_destroyPlan(cusparseSpMMOpPlan_t plan) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(cusparseSpMMOpPlan_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSpMMOp_destroyPlan");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(plan);
-}
-
-}  // extern "C"
diff --git a/third_party/xla/third_party/tsl/tsl/cuda/cusparse_9_0.inc b/third_party/xla/third_party/tsl/tsl/cuda/cusparse_9_0.inc
deleted file mode 100644
index 22d0105b1bea5a..00000000000000
--- a/third_party/xla/third_party/tsl/tsl/cuda/cusparse_9_0.inc
+++ /dev/null
@@ -1,7152 +0,0 @@
-// Auto-generated, do not edit.
-
-extern "C" {
-
-cusparseStatus_t CUSPARSEAPI cusparseCreate(cusparseHandle_t *handle) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(cusparseHandle_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCreate");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDestroy(cusparseHandle_t handle) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(cusparseHandle_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDestroy");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseGetVersion(cusparseHandle_t handle,
-                                                int *version) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(cusparseHandle_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseGetVersion");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, version);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseGetProperty(libraryPropertyType type,
-                                                 int *value) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(libraryPropertyType, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseGetProperty");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(type, value);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSetStream(cusparseHandle_t handle,
-                                               cudaStream_t streamId) {
-  using FuncPtr =
-      cusparseStatus_t(CUSPARSEAPI *)(cusparseHandle_t, cudaStream_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSetStream");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, streamId);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseGetStream(cusparseHandle_t handle,
-                                               cudaStream_t *streamId) {
-  using FuncPtr =
-      cusparseStatus_t(CUSPARSEAPI *)(cusparseHandle_t, cudaStream_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseGetStream");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, streamId);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseGetPointerMode(cusparseHandle_t handle, cusparsePointerMode_t *mode) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(cusparseHandle_t,
-                                                  cusparsePointerMode_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseGetPointerMode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, mode);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseSetPointerMode(cusparseHandle_t handle, cusparsePointerMode_t mode) {
-  using FuncPtr =
-      cusparseStatus_t(CUSPARSEAPI *)(cusparseHandle_t, cusparsePointerMode_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSetPointerMode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, mode);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseCreateMatDescr(cusparseMatDescr_t *descrA) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(cusparseMatDescr_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCreateMatDescr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(descrA);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseDestroyMatDescr(cusparseMatDescr_t descrA) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(cusparseMatDescr_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDestroyMatDescr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(descrA);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseCopyMatDescr(cusparseMatDescr_t dest, const cusparseMatDescr_t src) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(cusparseMatDescr_t,
-                                                  const cusparseMatDescr_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCopyMatDescr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(dest, src);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSetMatType(cusparseMatDescr_t descrA,
-                                                cusparseMatrixType_t type) {
-  using FuncPtr =
-      cusparseStatus_t(CUSPARSEAPI *)(cusparseMatDescr_t, cusparseMatrixType_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSetMatType");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(descrA, type);
-}
-
-cusparseMatrixType_t CUSPARSEAPI
-cusparseGetMatType(const cusparseMatDescr_t descrA) {
-  using FuncPtr = cusparseMatrixType_t(CUSPARSEAPI *)(const cusparseMatDescr_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseGetMatType");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(descrA);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseSetMatFillMode(cusparseMatDescr_t descrA, cusparseFillMode_t fillMode) {
-  using FuncPtr =
-      cusparseStatus_t(CUSPARSEAPI *)(cusparseMatDescr_t, cusparseFillMode_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSetMatFillMode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(descrA, fillMode);
-}
-
-cusparseFillMode_t CUSPARSEAPI
-cusparseGetMatFillMode(const cusparseMatDescr_t descrA) {
-  using FuncPtr = cusparseFillMode_t(CUSPARSEAPI *)(const cusparseMatDescr_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseGetMatFillMode");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(descrA);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseSetMatDiagType(cusparseMatDescr_t descrA, cusparseDiagType_t diagType) {
-  using FuncPtr =
-      cusparseStatus_t(CUSPARSEAPI *)(cusparseMatDescr_t, cusparseDiagType_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSetMatDiagType");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(descrA, diagType);
-}
-
-cusparseDiagType_t CUSPARSEAPI
-cusparseGetMatDiagType(const cusparseMatDescr_t descrA) {
-  using FuncPtr = cusparseDiagType_t(CUSPARSEAPI *)(const cusparseMatDescr_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseGetMatDiagType");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(descrA);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSetMatIndexBase(cusparseMatDescr_t descrA,
-                                                     cusparseIndexBase_t base) {
-  using FuncPtr =
-      cusparseStatus_t(CUSPARSEAPI *)(cusparseMatDescr_t, cusparseIndexBase_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSetMatIndexBase");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(descrA, base);
-}
-
-cusparseIndexBase_t CUSPARSEAPI
-cusparseGetMatIndexBase(const cusparseMatDescr_t descrA) {
-  using FuncPtr = cusparseIndexBase_t(CUSPARSEAPI *)(const cusparseMatDescr_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseGetMatIndexBase");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(descrA);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseCreateSolveAnalysisInfo(cusparseSolveAnalysisInfo_t *info) {
-  using FuncPtr =
-      cusparseStatus_t(CUSPARSEAPI *)(cusparseSolveAnalysisInfo_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCreateSolveAnalysisInfo");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(info);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseDestroySolveAnalysisInfo(cusparseSolveAnalysisInfo_t info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(cusparseSolveAnalysisInfo_t);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseDestroySolveAnalysisInfo");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(info);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseGetLevelInfo(cusparseHandle_t handle, cusparseSolveAnalysisInfo_t info,
-                     int *nlevels, int **levelPtr, int **levelInd) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseSolveAnalysisInfo_t, int *, int **, int **);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseGetLevelInfo");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, info, nlevels, levelPtr, levelInd);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCreateCsrsv2Info(csrsv2Info_t *info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(csrsv2Info_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCreateCsrsv2Info");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(info);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDestroyCsrsv2Info(csrsv2Info_t info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(csrsv2Info_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDestroyCsrsv2Info");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(info);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCreateCsric02Info(csric02Info_t *info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(csric02Info_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCreateCsric02Info");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(info);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDestroyCsric02Info(csric02Info_t info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(csric02Info_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDestroyCsric02Info");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(info);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCreateBsric02Info(bsric02Info_t *info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(bsric02Info_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCreateBsric02Info");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(info);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDestroyBsric02Info(bsric02Info_t info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(bsric02Info_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDestroyBsric02Info");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(info);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCreateCsrilu02Info(csrilu02Info_t *info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(csrilu02Info_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCreateCsrilu02Info");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(info);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDestroyCsrilu02Info(csrilu02Info_t info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(csrilu02Info_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDestroyCsrilu02Info");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(info);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCreateBsrilu02Info(bsrilu02Info_t *info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(bsrilu02Info_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCreateBsrilu02Info");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(info);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDestroyBsrilu02Info(bsrilu02Info_t info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(bsrilu02Info_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDestroyBsrilu02Info");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(info);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCreateBsrsv2Info(bsrsv2Info_t *info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(bsrsv2Info_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCreateBsrsv2Info");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(info);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDestroyBsrsv2Info(bsrsv2Info_t info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(bsrsv2Info_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDestroyBsrsv2Info");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(info);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCreateBsrsm2Info(bsrsm2Info_t *info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(bsrsm2Info_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCreateBsrsm2Info");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(info);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDestroyBsrsm2Info(bsrsm2Info_t info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(bsrsm2Info_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDestroyBsrsm2Info");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(info);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCreateHybMat(cusparseHybMat_t *hybA) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(cusparseHybMat_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCreateHybMat");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hybA);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDestroyHybMat(cusparseHybMat_t hybA) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(cusparseHybMat_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDestroyHybMat");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(hybA);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCreateCsru2csrInfo(csru2csrInfo_t *info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(csru2csrInfo_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCreateCsru2csrInfo");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(info);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDestroyCsru2csrInfo(csru2csrInfo_t info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(csru2csrInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDestroyCsru2csrInfo");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(info);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseCreateColorInfo(cusparseColorInfo_t *info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(cusparseColorInfo_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCreateColorInfo");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(info);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseDestroyColorInfo(cusparseColorInfo_t info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(cusparseColorInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDestroyColorInfo");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(info);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSetColorAlgs(cusparseColorInfo_t info,
-                                                  cusparseColorAlg_t alg) {
-  using FuncPtr =
-      cusparseStatus_t(CUSPARSEAPI *)(cusparseColorInfo_t, cusparseColorAlg_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSetColorAlgs");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(info, alg);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseGetColorAlgs(cusparseColorInfo_t info,
-                                                  cusparseColorAlg_t *alg) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(cusparseColorInfo_t,
-                                                  cusparseColorAlg_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseGetColorAlgs");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(info, alg);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCreatePruneInfo(pruneInfo_t *info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(pruneInfo_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCreatePruneInfo");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(info);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDestroyPruneInfo(pruneInfo_t info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(pruneInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDestroyPruneInfo");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(info);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSaxpyi(cusparseHandle_t handle, int nnz,
-                                            const float *alpha,
-                                            const float *xVal, const int *xInd,
-                                            float *y,
-                                            cusparseIndexBase_t idxBase) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, const float *, const float *, const int *, float *,
-      cusparseIndexBase_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSaxpyi");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, nnz, alpha, xVal, xInd, y, idxBase);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDaxpyi(cusparseHandle_t handle, int nnz,
-                                            const double *alpha,
-                                            const double *xVal, const int *xInd,
-                                            double *y,
-                                            cusparseIndexBase_t idxBase) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, const double *, const double *, const int *,
-      double *, cusparseIndexBase_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDaxpyi");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, nnz, alpha, xVal, xInd, y, idxBase);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCaxpyi(cusparseHandle_t handle, int nnz,
-                                            const cuComplex *alpha,
-                                            const cuComplex *xVal,
-                                            const int *xInd, cuComplex *y,
-                                            cusparseIndexBase_t idxBase) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, const cuComplex *, const cuComplex *, const int *,
-      cuComplex *, cusparseIndexBase_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCaxpyi");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, nnz, alpha, xVal, xInd, y, idxBase);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZaxpyi(cusparseHandle_t handle, int nnz,
-                                            const cuDoubleComplex *alpha,
-                                            const cuDoubleComplex *xVal,
-                                            const int *xInd, cuDoubleComplex *y,
-                                            cusparseIndexBase_t idxBase) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, const cuDoubleComplex *, const cuDoubleComplex *,
-      const int *, cuDoubleComplex *, cusparseIndexBase_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZaxpyi");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, nnz, alpha, xVal, xInd, y, idxBase);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSdoti(cusparseHandle_t handle, int nnz,
-                                           const float *xVal, const int *xInd,
-                                           const float *y,
-                                           float *resultDevHostPtr,
-                                           cusparseIndexBase_t idxBase) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, const float *, const int *, const float *, float *,
-      cusparseIndexBase_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSdoti");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, nnz, xVal, xInd, y, resultDevHostPtr, idxBase);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDdoti(cusparseHandle_t handle, int nnz,
-                                           const double *xVal, const int *xInd,
-                                           const double *y,
-                                           double *resultDevHostPtr,
-                                           cusparseIndexBase_t idxBase) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, const double *, const int *, const double *,
-      double *, cusparseIndexBase_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDdoti");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, nnz, xVal, xInd, y, resultDevHostPtr, idxBase);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCdoti(cusparseHandle_t handle, int nnz,
-                                           const cuComplex *xVal,
-                                           const int *xInd, const cuComplex *y,
-                                           cuComplex *resultDevHostPtr,
-                                           cusparseIndexBase_t idxBase) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, const cuComplex *, const int *, const cuComplex *,
-      cuComplex *, cusparseIndexBase_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCdoti");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, nnz, xVal, xInd, y, resultDevHostPtr, idxBase);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZdoti(cusparseHandle_t handle, int nnz,
-                                           const cuDoubleComplex *xVal,
-                                           const int *xInd,
-                                           const cuDoubleComplex *y,
-                                           cuDoubleComplex *resultDevHostPtr,
-                                           cusparseIndexBase_t idxBase) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, const cuDoubleComplex *, const int *,
-      const cuDoubleComplex *, cuDoubleComplex *, cusparseIndexBase_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZdoti");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, nnz, xVal, xInd, y, resultDevHostPtr, idxBase);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCdotci(cusparseHandle_t handle, int nnz,
-                                            const cuComplex *xVal,
-                                            const int *xInd, const cuComplex *y,
-                                            cuComplex *resultDevHostPtr,
-                                            cusparseIndexBase_t idxBase) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, const cuComplex *, const int *, const cuComplex *,
-      cuComplex *, cusparseIndexBase_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCdotci");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, nnz, xVal, xInd, y, resultDevHostPtr, idxBase);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZdotci(cusparseHandle_t handle, int nnz,
-                                            const cuDoubleComplex *xVal,
-                                            const int *xInd,
-                                            const cuDoubleComplex *y,
-                                            cuDoubleComplex *resultDevHostPtr,
-                                            cusparseIndexBase_t idxBase) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, const cuDoubleComplex *, const int *,
-      const cuDoubleComplex *, cuDoubleComplex *, cusparseIndexBase_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZdotci");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, nnz, xVal, xInd, y, resultDevHostPtr, idxBase);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSgthr(cusparseHandle_t handle, int nnz,
-                                           const float *y, float *xVal,
-                                           const int *xInd,
-                                           cusparseIndexBase_t idxBase) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, const float *, float *, const int *,
-      cusparseIndexBase_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSgthr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, nnz, y, xVal, xInd, idxBase);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDgthr(cusparseHandle_t handle, int nnz,
-                                           const double *y, double *xVal,
-                                           const int *xInd,
-                                           cusparseIndexBase_t idxBase) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, const double *, double *, const int *,
-      cusparseIndexBase_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDgthr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, nnz, y, xVal, xInd, idxBase);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCgthr(cusparseHandle_t handle, int nnz,
-                                           const cuComplex *y, cuComplex *xVal,
-                                           const int *xInd,
-                                           cusparseIndexBase_t idxBase) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, const cuComplex *, cuComplex *, const int *,
-      cusparseIndexBase_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCgthr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, nnz, y, xVal, xInd, idxBase);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZgthr(cusparseHandle_t handle, int nnz,
-                                           const cuDoubleComplex *y,
-                                           cuDoubleComplex *xVal,
-                                           const int *xInd,
-                                           cusparseIndexBase_t idxBase) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, const cuDoubleComplex *, cuDoubleComplex *,
-      const int *, cusparseIndexBase_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZgthr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, nnz, y, xVal, xInd, idxBase);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSgthrz(cusparseHandle_t handle, int nnz,
-                                            float *y, float *xVal,
-                                            const int *xInd,
-                                            cusparseIndexBase_t idxBase) {
-  using FuncPtr =
-      cusparseStatus_t(CUSPARSEAPI *)(cusparseHandle_t, int, float *, float *,
-                                      const int *, cusparseIndexBase_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSgthrz");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, nnz, y, xVal, xInd, idxBase);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDgthrz(cusparseHandle_t handle, int nnz,
-                                            double *y, double *xVal,
-                                            const int *xInd,
-                                            cusparseIndexBase_t idxBase) {
-  using FuncPtr =
-      cusparseStatus_t(CUSPARSEAPI *)(cusparseHandle_t, int, double *, double *,
-                                      const int *, cusparseIndexBase_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDgthrz");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, nnz, y, xVal, xInd, idxBase);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCgthrz(cusparseHandle_t handle, int nnz,
-                                            cuComplex *y, cuComplex *xVal,
-                                            const int *xInd,
-                                            cusparseIndexBase_t idxBase) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, cuComplex *, cuComplex *, const int *,
-      cusparseIndexBase_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCgthrz");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, nnz, y, xVal, xInd, idxBase);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZgthrz(cusparseHandle_t handle, int nnz,
-                                            cuDoubleComplex *y,
-                                            cuDoubleComplex *xVal,
-                                            const int *xInd,
-                                            cusparseIndexBase_t idxBase) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, cuDoubleComplex *, cuDoubleComplex *, const int *,
-      cusparseIndexBase_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZgthrz");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, nnz, y, xVal, xInd, idxBase);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSsctr(cusparseHandle_t handle, int nnz,
-                                           const float *xVal, const int *xInd,
-                                           float *y,
-                                           cusparseIndexBase_t idxBase) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(cusparseHandle_t, int,
-                                                  const float *, const int *,
-                                                  float *, cusparseIndexBase_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSsctr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, nnz, xVal, xInd, y, idxBase);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDsctr(cusparseHandle_t handle, int nnz,
-                                           const double *xVal, const int *xInd,
-                                           double *y,
-                                           cusparseIndexBase_t idxBase) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, const double *, const int *, double *,
-      cusparseIndexBase_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDsctr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, nnz, xVal, xInd, y, idxBase);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCsctr(cusparseHandle_t handle, int nnz,
-                                           const cuComplex *xVal,
-                                           const int *xInd, cuComplex *y,
-                                           cusparseIndexBase_t idxBase) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, const cuComplex *, const int *, cuComplex *,
-      cusparseIndexBase_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCsctr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, nnz, xVal, xInd, y, idxBase);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZsctr(cusparseHandle_t handle, int nnz,
-                                           const cuDoubleComplex *xVal,
-                                           const int *xInd, cuDoubleComplex *y,
-                                           cusparseIndexBase_t idxBase) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, const cuDoubleComplex *, const int *,
-      cuDoubleComplex *, cusparseIndexBase_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZsctr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, nnz, xVal, xInd, y, idxBase);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSroti(cusparseHandle_t handle, int nnz,
-                                           float *xVal, const int *xInd,
-                                           float *y, const float *c,
-                                           const float *s,
-                                           cusparseIndexBase_t idxBase) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, float *, const int *, float *, const float *,
-      const float *, cusparseIndexBase_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSroti");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, nnz, xVal, xInd, y, c, s, idxBase);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDroti(cusparseHandle_t handle, int nnz,
-                                           double *xVal, const int *xInd,
-                                           double *y, const double *c,
-                                           const double *s,
-                                           cusparseIndexBase_t idxBase) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, double *, const int *, double *, const double *,
-      const double *, cusparseIndexBase_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDroti");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, nnz, xVal, xInd, y, c, s, idxBase);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseSgemvi(cusparseHandle_t handle, cusparseOperation_t transA, int m,
-               int n, const float *alpha, /* host or device pointer */
-               const float *A, int lda, int nnz, const float *xVal,
-               const int *xInd, const float *beta, /* host or device pointer */
-               float *y, cusparseIndexBase_t idxBase, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, const float *,
-      const float *, int, int, const float *, const int *, const float *,
-      float *, cusparseIndexBase_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSgemvi");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, n, alpha, A, lda, nnz, xVal, xInd, beta, y,
-                  idxBase, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseSgemvi_bufferSize(cusparseHandle_t handle, cusparseOperation_t transA,
-                          int m, int n, int nnz, int *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSgemvi_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, n, nnz, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseDgemvi(cusparseHandle_t handle, cusparseOperation_t transA, int m,
-               int n, const double *alpha, /* host or device pointer */
-               const double *A, int lda, int nnz, const double *xVal,
-               const int *xInd, const double *beta, /* host or device pointer */
-               double *y, cusparseIndexBase_t idxBase, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, const double *,
-      const double *, int, int, const double *, const int *, const double *,
-      double *, cusparseIndexBase_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDgemvi");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, n, alpha, A, lda, nnz, xVal, xInd, beta, y,
-                  idxBase, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseDgemvi_bufferSize(cusparseHandle_t handle, cusparseOperation_t transA,
-                          int m, int n, int nnz, int *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDgemvi_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, n, nnz, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCgemvi(
-    cusparseHandle_t handle, cusparseOperation_t transA, int m, int n,
-    const cuComplex *alpha, /* host or device pointer */
-    const cuComplex *A, int lda, int nnz, const cuComplex *xVal,
-    const int *xInd, const cuComplex *beta, /* host or device pointer */
-    cuComplex *y, cusparseIndexBase_t idxBase, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, const cuComplex *,
-      const cuComplex *, int, int, const cuComplex *, const int *,
-      const cuComplex *, cuComplex *, cusparseIndexBase_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCgemvi");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, n, alpha, A, lda, nnz, xVal, xInd, beta, y,
-                  idxBase, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseCgemvi_bufferSize(cusparseHandle_t handle, cusparseOperation_t transA,
-                          int m, int n, int nnz, int *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCgemvi_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, n, nnz, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZgemvi(
-    cusparseHandle_t handle, cusparseOperation_t transA, int m, int n,
-    const cuDoubleComplex *alpha, /* host or device pointer */
-    const cuDoubleComplex *A, int lda, int nnz, const cuDoubleComplex *xVal,
-    const int *xInd, const cuDoubleComplex *beta, /* host or device pointer */
-    cuDoubleComplex *y, cusparseIndexBase_t idxBase, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, const cuDoubleComplex *,
-      const cuDoubleComplex *, int, int, const cuDoubleComplex *, const int *,
-      const cuDoubleComplex *, cuDoubleComplex *, cusparseIndexBase_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZgemvi");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, n, alpha, A, lda, nnz, xVal, xInd, beta, y,
-                  idxBase, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseZgemvi_bufferSize(cusparseHandle_t handle, cusparseOperation_t transA,
-                          int m, int n, int nnz, int *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZgemvi_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, n, nnz, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsrmv(
-    cusparseHandle_t handle, cusparseOperation_t transA, int m, int n, int nnz,
-    const float *alpha, const cusparseMatDescr_t descrA,
-    const float *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, const float *x, const float *beta, float *y) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, int, const float *,
-      const cusparseMatDescr_t, const float *, const int *, const int *,
-      const float *, const float *, float *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsrmv");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, n, nnz, alpha, descrA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, x, beta, y);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseDcsrmv(cusparseHandle_t handle, cusparseOperation_t transA, int m,
-               int n, int nnz, const double *alpha,
-               const cusparseMatDescr_t descrA, const double *csrSortedValA,
-               const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-               const double *x, const double *beta, double *y) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, int, const double *,
-      const cusparseMatDescr_t, const double *, const int *, const int *,
-      const double *, const double *, double *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsrmv");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, n, nnz, alpha, descrA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, x, beta, y);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseCcsrmv(cusparseHandle_t handle, cusparseOperation_t transA, int m,
-               int n, int nnz, const cuComplex *alpha,
-               const cusparseMatDescr_t descrA, const cuComplex *csrSortedValA,
-               const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-               const cuComplex *x, const cuComplex *beta, cuComplex *y) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, int, const cuComplex *,
-      const cusparseMatDescr_t, const cuComplex *, const int *, const int *,
-      const cuComplex *, const cuComplex *, cuComplex *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsrmv");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, n, nnz, alpha, descrA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, x, beta, y);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsrmv(
-    cusparseHandle_t handle, cusparseOperation_t transA, int m, int n, int nnz,
-    const cuDoubleComplex *alpha, const cusparseMatDescr_t descrA,
-    const cuDoubleComplex *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, const cuDoubleComplex *x,
-    const cuDoubleComplex *beta, cuDoubleComplex *y) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, int,
-      const cuDoubleComplex *, const cusparseMatDescr_t,
-      const cuDoubleComplex *, const int *, const int *,
-      const cuDoubleComplex *, const cuDoubleComplex *, cuDoubleComplex *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsrmv");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, n, nnz, alpha, descrA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, x, beta, y);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCsrmvEx_bufferSize(
-    cusparseHandle_t handle, cusparseAlgMode_t alg, cusparseOperation_t transA,
-    int m, int n, int nnz, const void *alpha, cudaDataType alphatype,
-    const cusparseMatDescr_t descrA, const void *csrValA,
-    cudaDataType csrValAtype, const int *csrRowPtrA, const int *csrColIndA,
-    const void *x, cudaDataType xtype, const void *beta, cudaDataType betatype,
-    void *y, cudaDataType ytype, cudaDataType executiontype,
-    size_t *bufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseAlgMode_t, cusparseOperation_t, int, int, int,
-      const void *, cudaDataType, const cusparseMatDescr_t, const void *,
-      cudaDataType, const int *, const int *, const void *, cudaDataType,
-      const void *, cudaDataType, void *, cudaDataType, cudaDataType, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCsrmvEx_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, alg, transA, m, n, nnz, alpha, alphatype, descrA,
-                  csrValA, csrValAtype, csrRowPtrA, csrColIndA, x, xtype, beta,
-                  betatype, y, ytype, executiontype, bufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCsrmvEx(
-    cusparseHandle_t handle, cusparseAlgMode_t alg, cusparseOperation_t transA,
-    int m, int n, int nnz, const void *alpha, cudaDataType alphatype,
-    const cusparseMatDescr_t descrA, const void *csrValA,
-    cudaDataType csrValAtype, const int *csrRowPtrA, const int *csrColIndA,
-    const void *x, cudaDataType xtype, const void *beta, cudaDataType betatype,
-    void *y, cudaDataType ytype, cudaDataType executiontype, void *buffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseAlgMode_t, cusparseOperation_t, int, int, int,
-      const void *, cudaDataType, const cusparseMatDescr_t, const void *,
-      cudaDataType, const int *, const int *, const void *, cudaDataType,
-      const void *, cudaDataType, void *, cudaDataType, cudaDataType, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCsrmvEx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, alg, transA, m, n, nnz, alpha, alphatype, descrA,
-                  csrValA, csrValAtype, csrRowPtrA, csrColIndA, x, xtype, beta,
-                  betatype, y, ytype, executiontype, buffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsrmv_mp(
-    cusparseHandle_t handle, cusparseOperation_t transA, int m, int n, int nnz,
-    const float *alpha, const cusparseMatDescr_t descrA,
-    const float *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, const float *x, const float *beta, float *y) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, int, const float *,
-      const cusparseMatDescr_t, const float *, const int *, const int *,
-      const float *, const float *, float *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsrmv_mp");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, n, nnz, alpha, descrA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, x, beta, y);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseDcsrmv_mp(cusparseHandle_t handle, cusparseOperation_t transA, int m,
-                  int n, int nnz, const double *alpha,
-                  const cusparseMatDescr_t descrA, const double *csrSortedValA,
-                  const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-                  const double *x, const double *beta, double *y) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, int, const double *,
-      const cusparseMatDescr_t, const double *, const int *, const int *,
-      const double *, const double *, double *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsrmv_mp");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, n, nnz, alpha, descrA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, x, beta, y);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsrmv_mp(
-    cusparseHandle_t handle, cusparseOperation_t transA, int m, int n, int nnz,
-    const cuComplex *alpha, const cusparseMatDescr_t descrA,
-    const cuComplex *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, const cuComplex *x, const cuComplex *beta,
-    cuComplex *y) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, int, const cuComplex *,
-      const cusparseMatDescr_t, const cuComplex *, const int *, const int *,
-      const cuComplex *, const cuComplex *, cuComplex *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsrmv_mp");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, n, nnz, alpha, descrA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, x, beta, y);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsrmv_mp(
-    cusparseHandle_t handle, cusparseOperation_t transA, int m, int n, int nnz,
-    const cuDoubleComplex *alpha, const cusparseMatDescr_t descrA,
-    const cuDoubleComplex *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, const cuDoubleComplex *x,
-    const cuDoubleComplex *beta, cuDoubleComplex *y) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, int,
-      const cuDoubleComplex *, const cusparseMatDescr_t,
-      const cuDoubleComplex *, const int *, const int *,
-      const cuDoubleComplex *, const cuDoubleComplex *, cuDoubleComplex *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsrmv_mp");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, n, nnz, alpha, descrA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, x, beta, y);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseShybmv(
-    cusparseHandle_t handle, cusparseOperation_t transA, const float *alpha,
-    const cusparseMatDescr_t descrA, const cusparseHybMat_t hybA,
-    const float *x, const float *beta, float *y) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, const float *,
-      const cusparseMatDescr_t, const cusparseHybMat_t, const float *,
-      const float *, float *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseShybmv");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, alpha, descrA, hybA, x, beta, y);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDhybmv(
-    cusparseHandle_t handle, cusparseOperation_t transA, const double *alpha,
-    const cusparseMatDescr_t descrA, const cusparseHybMat_t hybA,
-    const double *x, const double *beta, double *y) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, const double *,
-      const cusparseMatDescr_t, const cusparseHybMat_t, const double *,
-      const double *, double *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDhybmv");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, alpha, descrA, hybA, x, beta, y);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseChybmv(
-    cusparseHandle_t handle, cusparseOperation_t transA, const cuComplex *alpha,
-    const cusparseMatDescr_t descrA, const cusparseHybMat_t hybA,
-    const cuComplex *x, const cuComplex *beta, cuComplex *y) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, const cuComplex *,
-      const cusparseMatDescr_t, const cusparseHybMat_t, const cuComplex *,
-      const cuComplex *, cuComplex *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseChybmv");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, alpha, descrA, hybA, x, beta, y);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseZhybmv(cusparseHandle_t handle, cusparseOperation_t transA,
-               const cuDoubleComplex *alpha, const cusparseMatDescr_t descrA,
-               const cusparseHybMat_t hybA, const cuDoubleComplex *x,
-               const cuDoubleComplex *beta, cuDoubleComplex *y) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, const cuDoubleComplex *,
-      const cusparseMatDescr_t, const cusparseHybMat_t, const cuDoubleComplex *,
-      const cuDoubleComplex *, cuDoubleComplex *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZhybmv");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, alpha, descrA, hybA, x, beta, y);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSbsrmv(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, int mb, int nb, int nnzb, const float *alpha,
-    const cusparseMatDescr_t descrA, const float *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int blockDim,
-    const float *x, const float *beta, float *y) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, int, int, int,
-      const float *, const cusparseMatDescr_t, const float *, const int *,
-      const int *, int, const float *, const float *, float *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSbsrmv");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, mb, nb, nnzb, alpha, descrA,
-                  bsrSortedValA, bsrSortedRowPtrA, bsrSortedColIndA, blockDim,
-                  x, beta, y);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDbsrmv(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, int mb, int nb, int nnzb, const double *alpha,
-    const cusparseMatDescr_t descrA, const double *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int blockDim,
-    const double *x, const double *beta, double *y) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, int, int, int,
-      const double *, const cusparseMatDescr_t, const double *, const int *,
-      const int *, int, const double *, const double *, double *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDbsrmv");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, mb, nb, nnzb, alpha, descrA,
-                  bsrSortedValA, bsrSortedRowPtrA, bsrSortedColIndA, blockDim,
-                  x, beta, y);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseCbsrmv(cusparseHandle_t handle, cusparseDirection_t dirA,
-               cusparseOperation_t transA, int mb, int nb, int nnzb,
-               const cuComplex *alpha, const cusparseMatDescr_t descrA,
-               const cuComplex *bsrSortedValA, const int *bsrSortedRowPtrA,
-               const int *bsrSortedColIndA, int blockDim, const cuComplex *x,
-               const cuComplex *beta, cuComplex *y) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, int, int, int,
-      const cuComplex *, const cusparseMatDescr_t, const cuComplex *,
-      const int *, const int *, int, const cuComplex *, const cuComplex *,
-      cuComplex *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCbsrmv");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, mb, nb, nnzb, alpha, descrA,
-                  bsrSortedValA, bsrSortedRowPtrA, bsrSortedColIndA, blockDim,
-                  x, beta, y);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZbsrmv(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, int mb, int nb, int nnzb,
-    const cuDoubleComplex *alpha, const cusparseMatDescr_t descrA,
-    const cuDoubleComplex *bsrSortedValA, const int *bsrSortedRowPtrA,
-    const int *bsrSortedColIndA, int blockDim, const cuDoubleComplex *x,
-    const cuDoubleComplex *beta, cuDoubleComplex *y) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, int, int, int,
-      const cuDoubleComplex *, const cusparseMatDescr_t,
-      const cuDoubleComplex *, const int *, const int *, int,
-      const cuDoubleComplex *, const cuDoubleComplex *, cuDoubleComplex *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZbsrmv");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, mb, nb, nnzb, alpha, descrA,
-                  bsrSortedValA, bsrSortedRowPtrA, bsrSortedColIndA, blockDim,
-                  x, beta, y);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseSbsrxmv(cusparseHandle_t handle, cusparseDirection_t dirA,
-                cusparseOperation_t transA, int sizeOfMask, int mb, int nb,
-                int nnzb, const float *alpha, const cusparseMatDescr_t descrA,
-                const float *bsrSortedValA, const int *bsrSortedMaskPtrA,
-                const int *bsrSortedRowPtrA, const int *bsrSortedEndPtrA,
-                const int *bsrSortedColIndA, int blockDim, const float *x,
-                const float *beta, float *y) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, int, int, int,
-      int, const float *, const cusparseMatDescr_t, const float *, const int *,
-      const int *, const int *, const int *, int, const float *, const float *,
-      float *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSbsrxmv");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, sizeOfMask, mb, nb, nnzb, alpha, descrA,
-                  bsrSortedValA, bsrSortedMaskPtrA, bsrSortedRowPtrA,
-                  bsrSortedEndPtrA, bsrSortedColIndA, blockDim, x, beta, y);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseDbsrxmv(cusparseHandle_t handle, cusparseDirection_t dirA,
-                cusparseOperation_t transA, int sizeOfMask, int mb, int nb,
-                int nnzb, const double *alpha, const cusparseMatDescr_t descrA,
-                const double *bsrSortedValA, const int *bsrSortedMaskPtrA,
-                const int *bsrSortedRowPtrA, const int *bsrSortedEndPtrA,
-                const int *bsrSortedColIndA, int blockDim, const double *x,
-                const double *beta, double *y) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, int, int, int,
-      int, const double *, const cusparseMatDescr_t, const double *,
-      const int *, const int *, const int *, const int *, int, const double *,
-      const double *, double *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDbsrxmv");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, sizeOfMask, mb, nb, nnzb, alpha, descrA,
-                  bsrSortedValA, bsrSortedMaskPtrA, bsrSortedRowPtrA,
-                  bsrSortedEndPtrA, bsrSortedColIndA, blockDim, x, beta, y);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCbsrxmv(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, int sizeOfMask, int mb, int nb, int nnzb,
-    const cuComplex *alpha, const cusparseMatDescr_t descrA,
-    const cuComplex *bsrSortedValA, const int *bsrSortedMaskPtrA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedEndPtrA,
-    const int *bsrSortedColIndA, int blockDim, const cuComplex *x,
-    const cuComplex *beta, cuComplex *y) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, int, int, int,
-      int, const cuComplex *, const cusparseMatDescr_t, const cuComplex *,
-      const int *, const int *, const int *, const int *, int,
-      const cuComplex *, const cuComplex *, cuComplex *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCbsrxmv");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, sizeOfMask, mb, nb, nnzb, alpha, descrA,
-                  bsrSortedValA, bsrSortedMaskPtrA, bsrSortedRowPtrA,
-                  bsrSortedEndPtrA, bsrSortedColIndA, blockDim, x, beta, y);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZbsrxmv(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, int sizeOfMask, int mb, int nb, int nnzb,
-    const cuDoubleComplex *alpha, const cusparseMatDescr_t descrA,
-    const cuDoubleComplex *bsrSortedValA, const int *bsrSortedMaskPtrA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedEndPtrA,
-    const int *bsrSortedColIndA, int blockDim, const cuDoubleComplex *x,
-    const cuDoubleComplex *beta, cuDoubleComplex *y) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, int, int, int,
-      int, const cuDoubleComplex *, const cusparseMatDescr_t,
-      const cuDoubleComplex *, const int *, const int *, const int *,
-      const int *, int, const cuDoubleComplex *, const cuDoubleComplex *,
-      cuDoubleComplex *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZbsrxmv");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, sizeOfMask, mb, nb, nnzb, alpha, descrA,
-                  bsrSortedValA, bsrSortedMaskPtrA, bsrSortedRowPtrA,
-                  bsrSortedEndPtrA, bsrSortedColIndA, blockDim, x, beta, y);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCsrsv_analysisEx(
-    cusparseHandle_t handle, cusparseOperation_t transA, int m, int nnz,
-    const cusparseMatDescr_t descrA, const void *csrSortedValA,
-    cudaDataType csrSortedValAtype, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, cusparseSolveAnalysisInfo_t info,
-    cudaDataType executiontype) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, const cusparseMatDescr_t,
-      const void *, cudaDataType, const int *, const int *,
-      cusparseSolveAnalysisInfo_t, cudaDataType);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCsrsv_analysisEx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, nnz, descrA, csrSortedValA,
-                  csrSortedValAtype, csrSortedRowPtrA, csrSortedColIndA, info,
-                  executiontype);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsrsv_analysis(
-    cusparseHandle_t handle, cusparseOperation_t transA, int m, int nnz,
-    const cusparseMatDescr_t descrA, const float *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-    cusparseSolveAnalysisInfo_t info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, const cusparseMatDescr_t,
-      const float *, const int *, const int *, cusparseSolveAnalysisInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsrsv_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, nnz, descrA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, info);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsrsv_analysis(
-    cusparseHandle_t handle, cusparseOperation_t transA, int m, int nnz,
-    const cusparseMatDescr_t descrA, const double *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-    cusparseSolveAnalysisInfo_t info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, const cusparseMatDescr_t,
-      const double *, const int *, const int *, cusparseSolveAnalysisInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsrsv_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, nnz, descrA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, info);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsrsv_analysis(
-    cusparseHandle_t handle, cusparseOperation_t transA, int m, int nnz,
-    const cusparseMatDescr_t descrA, const cuComplex *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-    cusparseSolveAnalysisInfo_t info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, const cusparseMatDescr_t,
-      const cuComplex *, const int *, const int *, cusparseSolveAnalysisInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsrsv_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, nnz, descrA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, info);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsrsv_analysis(
-    cusparseHandle_t handle, cusparseOperation_t transA, int m, int nnz,
-    const cusparseMatDescr_t descrA, const cuDoubleComplex *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-    cusparseSolveAnalysisInfo_t info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, const cusparseMatDescr_t,
-      const cuDoubleComplex *, const int *, const int *,
-      cusparseSolveAnalysisInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsrsv_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, nnz, descrA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, info);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCsrsv_solveEx(
-    cusparseHandle_t handle, cusparseOperation_t transA, int m,
-    const void *alpha, cudaDataType alphatype, const cusparseMatDescr_t descrA,
-    const void *csrSortedValA, cudaDataType csrSortedValAtype,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-    cusparseSolveAnalysisInfo_t info, const void *f, cudaDataType ftype,
-    void *x, cudaDataType xtype, cudaDataType executiontype) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, const void *, cudaDataType,
-      const cusparseMatDescr_t, const void *, cudaDataType, const int *,
-      const int *, cusparseSolveAnalysisInfo_t, const void *, cudaDataType,
-      void *, cudaDataType, cudaDataType);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCsrsv_solveEx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, alpha, alphatype, descrA, csrSortedValA,
-                  csrSortedValAtype, csrSortedRowPtrA, csrSortedColIndA, info,
-                  f, ftype, x, xtype, executiontype);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsrsv_solve(
-    cusparseHandle_t handle, cusparseOperation_t transA, int m,
-    const float *alpha, const cusparseMatDescr_t descrA,
-    const float *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, cusparseSolveAnalysisInfo_t info,
-    const float *f, float *x) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, const float *,
-      const cusparseMatDescr_t, const float *, const int *, const int *,
-      cusparseSolveAnalysisInfo_t, const float *, float *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsrsv_solve");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, alpha, descrA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, info, f, x);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsrsv_solve(
-    cusparseHandle_t handle, cusparseOperation_t transA, int m,
-    const double *alpha, const cusparseMatDescr_t descrA,
-    const double *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, cusparseSolveAnalysisInfo_t info,
-    const double *f, double *x) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, const double *,
-      const cusparseMatDescr_t, const double *, const int *, const int *,
-      cusparseSolveAnalysisInfo_t, const double *, double *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsrsv_solve");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, alpha, descrA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, info, f, x);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsrsv_solve(
-    cusparseHandle_t handle, cusparseOperation_t transA, int m,
-    const cuComplex *alpha, const cusparseMatDescr_t descrA,
-    const cuComplex *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, cusparseSolveAnalysisInfo_t info,
-    const cuComplex *f, cuComplex *x) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, const cuComplex *,
-      const cusparseMatDescr_t, const cuComplex *, const int *, const int *,
-      cusparseSolveAnalysisInfo_t, const cuComplex *, cuComplex *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsrsv_solve");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, alpha, descrA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, info, f, x);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsrsv_solve(
-    cusparseHandle_t handle, cusparseOperation_t transA, int m,
-    const cuDoubleComplex *alpha, const cusparseMatDescr_t descrA,
-    const cuDoubleComplex *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, cusparseSolveAnalysisInfo_t info,
-    const cuDoubleComplex *f, cuDoubleComplex *x) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, const cuDoubleComplex *,
-      const cusparseMatDescr_t, const cuDoubleComplex *, const int *,
-      const int *, cusparseSolveAnalysisInfo_t, const cuDoubleComplex *,
-      cuDoubleComplex *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsrsv_solve");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, alpha, descrA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, info, f, x);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseXcsrsv2_zeroPivot(cusparseHandle_t handle,
-                                                       csrsv2Info_t info,
-                                                       int *position) {
-  using FuncPtr =
-      cusparseStatus_t(CUSPARSEAPI *)(cusparseHandle_t, csrsv2Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseXcsrsv2_zeroPivot");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, info, position);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsrsv2_bufferSize(
-    cusparseHandle_t handle, cusparseOperation_t transA, int m, int nnz,
-    const cusparseMatDescr_t descrA, float *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA, csrsv2Info_t info,
-    int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, const cusparseMatDescr_t,
-      float *, const int *, const int *, csrsv2Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsrsv2_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, nnz, descrA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, info, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsrsv2_bufferSize(
-    cusparseHandle_t handle, cusparseOperation_t transA, int m, int nnz,
-    const cusparseMatDescr_t descrA, double *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA, csrsv2Info_t info,
-    int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, const cusparseMatDescr_t,
-      double *, const int *, const int *, csrsv2Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsrsv2_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, nnz, descrA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, info, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsrsv2_bufferSize(
-    cusparseHandle_t handle, cusparseOperation_t transA, int m, int nnz,
-    const cusparseMatDescr_t descrA, cuComplex *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA, csrsv2Info_t info,
-    int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, const cusparseMatDescr_t,
-      cuComplex *, const int *, const int *, csrsv2Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsrsv2_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, nnz, descrA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, info, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsrsv2_bufferSize(
-    cusparseHandle_t handle, cusparseOperation_t transA, int m, int nnz,
-    const cusparseMatDescr_t descrA, cuDoubleComplex *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA, csrsv2Info_t info,
-    int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, const cusparseMatDescr_t,
-      cuDoubleComplex *, const int *, const int *, csrsv2Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsrsv2_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, nnz, descrA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, info, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsrsv2_bufferSizeExt(
-    cusparseHandle_t handle, cusparseOperation_t transA, int m, int nnz,
-    const cusparseMatDescr_t descrA, float *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA, csrsv2Info_t info,
-    size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, const cusparseMatDescr_t,
-      float *, const int *, const int *, csrsv2Info_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsrsv2_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, nnz, descrA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, info, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsrsv2_bufferSizeExt(
-    cusparseHandle_t handle, cusparseOperation_t transA, int m, int nnz,
-    const cusparseMatDescr_t descrA, double *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA, csrsv2Info_t info,
-    size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, const cusparseMatDescr_t,
-      double *, const int *, const int *, csrsv2Info_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsrsv2_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, nnz, descrA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, info, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsrsv2_bufferSizeExt(
-    cusparseHandle_t handle, cusparseOperation_t transA, int m, int nnz,
-    const cusparseMatDescr_t descrA, cuComplex *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA, csrsv2Info_t info,
-    size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, const cusparseMatDescr_t,
-      cuComplex *, const int *, const int *, csrsv2Info_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsrsv2_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, nnz, descrA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, info, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsrsv2_bufferSizeExt(
-    cusparseHandle_t handle, cusparseOperation_t transA, int m, int nnz,
-    const cusparseMatDescr_t descrA, cuDoubleComplex *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA, csrsv2Info_t info,
-    size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, const cusparseMatDescr_t,
-      cuDoubleComplex *, const int *, const int *, csrsv2Info_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsrsv2_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, nnz, descrA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, info, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsrsv2_analysis(
-    cusparseHandle_t handle, cusparseOperation_t transA, int m, int nnz,
-    const cusparseMatDescr_t descrA, const float *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA, csrsv2Info_t info,
-    cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, const cusparseMatDescr_t,
-      const float *, const int *, const int *, csrsv2Info_t,
-      cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsrsv2_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, nnz, descrA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsrsv2_analysis(
-    cusparseHandle_t handle, cusparseOperation_t transA, int m, int nnz,
-    const cusparseMatDescr_t descrA, const double *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA, csrsv2Info_t info,
-    cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, const cusparseMatDescr_t,
-      const double *, const int *, const int *, csrsv2Info_t,
-      cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsrsv2_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, nnz, descrA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsrsv2_analysis(
-    cusparseHandle_t handle, cusparseOperation_t transA, int m, int nnz,
-    const cusparseMatDescr_t descrA, const cuComplex *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA, csrsv2Info_t info,
-    cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, const cusparseMatDescr_t,
-      const cuComplex *, const int *, const int *, csrsv2Info_t,
-      cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsrsv2_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, nnz, descrA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsrsv2_analysis(
-    cusparseHandle_t handle, cusparseOperation_t transA, int m, int nnz,
-    const cusparseMatDescr_t descrA, const cuDoubleComplex *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA, csrsv2Info_t info,
-    cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, const cusparseMatDescr_t,
-      const cuDoubleComplex *, const int *, const int *, csrsv2Info_t,
-      cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsrsv2_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, nnz, descrA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsrsv2_solve(
-    cusparseHandle_t handle, cusparseOperation_t transA, int m, int nnz,
-    const float *alpha, const cusparseMatDescr_t descrA,
-    const float *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, csrsv2Info_t info, const float *f, float *x,
-    cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, const float *,
-      const cusparseMatDescr_t, const float *, const int *, const int *,
-      csrsv2Info_t, const float *, float *, cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsrsv2_solve");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, nnz, alpha, descrA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, info, f, x, policy,
-                  pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsrsv2_solve(
-    cusparseHandle_t handle, cusparseOperation_t transA, int m, int nnz,
-    const double *alpha, const cusparseMatDescr_t descrA,
-    const double *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, csrsv2Info_t info, const double *f, double *x,
-    cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, const double *,
-      const cusparseMatDescr_t, const double *, const int *, const int *,
-      csrsv2Info_t, const double *, double *, cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsrsv2_solve");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, nnz, alpha, descrA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, info, f, x, policy,
-                  pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsrsv2_solve(
-    cusparseHandle_t handle, cusparseOperation_t transA, int m, int nnz,
-    const cuComplex *alpha, const cusparseMatDescr_t descrA,
-    const cuComplex *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, csrsv2Info_t info, const cuComplex *f,
-    cuComplex *x, cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, const cuComplex *,
-      const cusparseMatDescr_t, const cuComplex *, const int *, const int *,
-      csrsv2Info_t, const cuComplex *, cuComplex *, cusparseSolvePolicy_t,
-      void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsrsv2_solve");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, nnz, alpha, descrA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, info, f, x, policy,
-                  pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsrsv2_solve(
-    cusparseHandle_t handle, cusparseOperation_t transA, int m, int nnz,
-    const cuDoubleComplex *alpha, const cusparseMatDescr_t descrA,
-    const cuDoubleComplex *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, csrsv2Info_t info, const cuDoubleComplex *f,
-    cuDoubleComplex *x, cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, const cuDoubleComplex *,
-      const cusparseMatDescr_t, const cuDoubleComplex *, const int *,
-      const int *, csrsv2Info_t, const cuDoubleComplex *, cuDoubleComplex *,
-      cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsrsv2_solve");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, nnz, alpha, descrA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, info, f, x, policy,
-                  pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseXbsrsv2_zeroPivot(cusparseHandle_t handle,
-                                                       bsrsv2Info_t info,
-                                                       int *position) {
-  using FuncPtr =
-      cusparseStatus_t(CUSPARSEAPI *)(cusparseHandle_t, bsrsv2Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseXbsrsv2_zeroPivot");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, info, position);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSbsrsv2_bufferSize(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, float *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int blockDim,
-    bsrsv2Info_t info, int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, int, int,
-      const cusparseMatDescr_t, float *, const int *, const int *, int,
-      bsrsv2Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSbsrsv2_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, mb, nnzb, descrA, bsrSortedValA,
-                  bsrSortedRowPtrA, bsrSortedColIndA, blockDim, info,
-                  pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDbsrsv2_bufferSize(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, double *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int blockDim,
-    bsrsv2Info_t info, int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, int, int,
-      const cusparseMatDescr_t, double *, const int *, const int *, int,
-      bsrsv2Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDbsrsv2_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, mb, nnzb, descrA, bsrSortedValA,
-                  bsrSortedRowPtrA, bsrSortedColIndA, blockDim, info,
-                  pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCbsrsv2_bufferSize(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, cuComplex *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int blockDim,
-    bsrsv2Info_t info, int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, int, int,
-      const cusparseMatDescr_t, cuComplex *, const int *, const int *, int,
-      bsrsv2Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCbsrsv2_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, mb, nnzb, descrA, bsrSortedValA,
-                  bsrSortedRowPtrA, bsrSortedColIndA, blockDim, info,
-                  pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZbsrsv2_bufferSize(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, cuDoubleComplex *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int blockDim,
-    bsrsv2Info_t info, int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, int, int,
-      const cusparseMatDescr_t, cuDoubleComplex *, const int *, const int *,
-      int, bsrsv2Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZbsrsv2_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, mb, nnzb, descrA, bsrSortedValA,
-                  bsrSortedRowPtrA, bsrSortedColIndA, blockDim, info,
-                  pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSbsrsv2_bufferSizeExt(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, float *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int blockSize,
-    bsrsv2Info_t info, size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, int, int,
-      const cusparseMatDescr_t, float *, const int *, const int *, int,
-      bsrsv2Info_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSbsrsv2_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, mb, nnzb, descrA, bsrSortedValA,
-                  bsrSortedRowPtrA, bsrSortedColIndA, blockSize, info,
-                  pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDbsrsv2_bufferSizeExt(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, double *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int blockSize,
-    bsrsv2Info_t info, size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, int, int,
-      const cusparseMatDescr_t, double *, const int *, const int *, int,
-      bsrsv2Info_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDbsrsv2_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, mb, nnzb, descrA, bsrSortedValA,
-                  bsrSortedRowPtrA, bsrSortedColIndA, blockSize, info,
-                  pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCbsrsv2_bufferSizeExt(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, cuComplex *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int blockSize,
-    bsrsv2Info_t info, size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, int, int,
-      const cusparseMatDescr_t, cuComplex *, const int *, const int *, int,
-      bsrsv2Info_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCbsrsv2_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, mb, nnzb, descrA, bsrSortedValA,
-                  bsrSortedRowPtrA, bsrSortedColIndA, blockSize, info,
-                  pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZbsrsv2_bufferSizeExt(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, cuDoubleComplex *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int blockSize,
-    bsrsv2Info_t info, size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, int, int,
-      const cusparseMatDescr_t, cuDoubleComplex *, const int *, const int *,
-      int, bsrsv2Info_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZbsrsv2_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, mb, nnzb, descrA, bsrSortedValA,
-                  bsrSortedRowPtrA, bsrSortedColIndA, blockSize, info,
-                  pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSbsrsv2_analysis(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, const float *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int blockDim,
-    bsrsv2Info_t info, cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, int, int,
-      const cusparseMatDescr_t, const float *, const int *, const int *, int,
-      bsrsv2Info_t, cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSbsrsv2_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, mb, nnzb, descrA, bsrSortedValA,
-                  bsrSortedRowPtrA, bsrSortedColIndA, blockDim, info, policy,
-                  pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDbsrsv2_analysis(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, const double *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int blockDim,
-    bsrsv2Info_t info, cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, int, int,
-      const cusparseMatDescr_t, const double *, const int *, const int *, int,
-      bsrsv2Info_t, cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDbsrsv2_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, mb, nnzb, descrA, bsrSortedValA,
-                  bsrSortedRowPtrA, bsrSortedColIndA, blockDim, info, policy,
-                  pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCbsrsv2_analysis(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, const cuComplex *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int blockDim,
-    bsrsv2Info_t info, cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, int, int,
-      const cusparseMatDescr_t, const cuComplex *, const int *, const int *,
-      int, bsrsv2Info_t, cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCbsrsv2_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, mb, nnzb, descrA, bsrSortedValA,
-                  bsrSortedRowPtrA, bsrSortedColIndA, blockDim, info, policy,
-                  pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZbsrsv2_analysis(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, const cuDoubleComplex *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int blockDim,
-    bsrsv2Info_t info, cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, int, int,
-      const cusparseMatDescr_t, const cuDoubleComplex *, const int *,
-      const int *, int, bsrsv2Info_t, cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZbsrsv2_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, mb, nnzb, descrA, bsrSortedValA,
-                  bsrSortedRowPtrA, bsrSortedColIndA, blockDim, info, policy,
-                  pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSbsrsv2_solve(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, int mb, int nnzb, const float *alpha,
-    const cusparseMatDescr_t descrA, const float *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int blockDim,
-    bsrsv2Info_t info, const float *f, float *x, cusparseSolvePolicy_t policy,
-    void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, int, int,
-      const float *, const cusparseMatDescr_t, const float *, const int *,
-      const int *, int, bsrsv2Info_t, const float *, float *,
-      cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSbsrsv2_solve");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, mb, nnzb, alpha, descrA, bsrSortedValA,
-                  bsrSortedRowPtrA, bsrSortedColIndA, blockDim, info, f, x,
-                  policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDbsrsv2_solve(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, int mb, int nnzb, const double *alpha,
-    const cusparseMatDescr_t descrA, const double *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int blockDim,
-    bsrsv2Info_t info, const double *f, double *x, cusparseSolvePolicy_t policy,
-    void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, int, int,
-      const double *, const cusparseMatDescr_t, const double *, const int *,
-      const int *, int, bsrsv2Info_t, const double *, double *,
-      cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDbsrsv2_solve");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, mb, nnzb, alpha, descrA, bsrSortedValA,
-                  bsrSortedRowPtrA, bsrSortedColIndA, blockDim, info, f, x,
-                  policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCbsrsv2_solve(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, int mb, int nnzb, const cuComplex *alpha,
-    const cusparseMatDescr_t descrA, const cuComplex *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int blockDim,
-    bsrsv2Info_t info, const cuComplex *f, cuComplex *x,
-    cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, int, int,
-      const cuComplex *, const cusparseMatDescr_t, const cuComplex *,
-      const int *, const int *, int, bsrsv2Info_t, const cuComplex *,
-      cuComplex *, cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCbsrsv2_solve");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, mb, nnzb, alpha, descrA, bsrSortedValA,
-                  bsrSortedRowPtrA, bsrSortedColIndA, blockDim, info, f, x,
-                  policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZbsrsv2_solve(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, int mb, int nnzb, const cuDoubleComplex *alpha,
-    const cusparseMatDescr_t descrA, const cuDoubleComplex *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int blockDim,
-    bsrsv2Info_t info, const cuDoubleComplex *f, cuDoubleComplex *x,
-    cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, int, int,
-      const cuDoubleComplex *, const cusparseMatDescr_t,
-      const cuDoubleComplex *, const int *, const int *, int, bsrsv2Info_t,
-      const cuDoubleComplex *, cuDoubleComplex *, cusparseSolvePolicy_t,
-      void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZbsrsv2_solve");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, mb, nnzb, alpha, descrA, bsrSortedValA,
-                  bsrSortedRowPtrA, bsrSortedColIndA, blockDim, info, f, x,
-                  policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseShybsv_analysis(cusparseHandle_t handle, cusparseOperation_t transA,
-                        const cusparseMatDescr_t descrA, cusparseHybMat_t hybA,
-                        cusparseSolveAnalysisInfo_t info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, const cusparseMatDescr_t,
-      cusparseHybMat_t, cusparseSolveAnalysisInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseShybsv_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, descrA, hybA, info);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseDhybsv_analysis(cusparseHandle_t handle, cusparseOperation_t transA,
-                        const cusparseMatDescr_t descrA, cusparseHybMat_t hybA,
-                        cusparseSolveAnalysisInfo_t info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, const cusparseMatDescr_t,
-      cusparseHybMat_t, cusparseSolveAnalysisInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDhybsv_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, descrA, hybA, info);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseChybsv_analysis(cusparseHandle_t handle, cusparseOperation_t transA,
-                        const cusparseMatDescr_t descrA, cusparseHybMat_t hybA,
-                        cusparseSolveAnalysisInfo_t info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, const cusparseMatDescr_t,
-      cusparseHybMat_t, cusparseSolveAnalysisInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseChybsv_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, descrA, hybA, info);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseZhybsv_analysis(cusparseHandle_t handle, cusparseOperation_t transA,
-                        const cusparseMatDescr_t descrA, cusparseHybMat_t hybA,
-                        cusparseSolveAnalysisInfo_t info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, const cusparseMatDescr_t,
-      cusparseHybMat_t, cusparseSolveAnalysisInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZhybsv_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, descrA, hybA, info);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseShybsv_solve(
-    cusparseHandle_t handle, cusparseOperation_t trans, const float *alpha,
-    const cusparseMatDescr_t descra, const cusparseHybMat_t hybA,
-    cusparseSolveAnalysisInfo_t info, const float *f, float *x) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, const float *,
-      const cusparseMatDescr_t, const cusparseHybMat_t,
-      cusparseSolveAnalysisInfo_t, const float *, float *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseShybsv_solve");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, trans, alpha, descra, hybA, info, f, x);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseChybsv_solve(
-    cusparseHandle_t handle, cusparseOperation_t trans, const cuComplex *alpha,
-    const cusparseMatDescr_t descra, const cusparseHybMat_t hybA,
-    cusparseSolveAnalysisInfo_t info, const cuComplex *f, cuComplex *x) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, const cuComplex *,
-      const cusparseMatDescr_t, const cusparseHybMat_t,
-      cusparseSolveAnalysisInfo_t, const cuComplex *, cuComplex *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseChybsv_solve");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, trans, alpha, descra, hybA, info, f, x);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDhybsv_solve(
-    cusparseHandle_t handle, cusparseOperation_t trans, const double *alpha,
-    const cusparseMatDescr_t descra, const cusparseHybMat_t hybA,
-    cusparseSolveAnalysisInfo_t info, const double *f, double *x) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, const double *,
-      const cusparseMatDescr_t, const cusparseHybMat_t,
-      cusparseSolveAnalysisInfo_t, const double *, double *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDhybsv_solve");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, trans, alpha, descra, hybA, info, f, x);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZhybsv_solve(
-    cusparseHandle_t handle, cusparseOperation_t trans,
-    const cuDoubleComplex *alpha, const cusparseMatDescr_t descra,
-    const cusparseHybMat_t hybA, cusparseSolveAnalysisInfo_t info,
-    const cuDoubleComplex *f, cuDoubleComplex *x) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, const cuDoubleComplex *,
-      const cusparseMatDescr_t, const cusparseHybMat_t,
-      cusparseSolveAnalysisInfo_t, const cuDoubleComplex *, cuDoubleComplex *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZhybsv_solve");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, trans, alpha, descra, hybA, info, f, x);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseScsrmm(cusparseHandle_t handle, cusparseOperation_t transA, int m,
-               int n, int k, int nnz, const float *alpha,
-               const cusparseMatDescr_t descrA, const float *csrSortedValA,
-               const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-               const float *B, int ldb, const float *beta, float *C, int ldc) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, int, int, const float *,
-      const cusparseMatDescr_t, const float *, const int *, const int *,
-      const float *, int, const float *, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsrmm");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, n, k, nnz, alpha, descrA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, B, ldb, beta, C, ldc);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsrmm(
-    cusparseHandle_t handle, cusparseOperation_t transA, int m, int n, int k,
-    int nnz, const double *alpha, const cusparseMatDescr_t descrA,
-    const double *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, const double *B, int ldb, const double *beta,
-    double *C, int ldc) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, int, int, const double *,
-      const cusparseMatDescr_t, const double *, const int *, const int *,
-      const double *, int, const double *, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsrmm");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, n, k, nnz, alpha, descrA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, B, ldb, beta, C, ldc);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsrmm(
-    cusparseHandle_t handle, cusparseOperation_t transA, int m, int n, int k,
-    int nnz, const cuComplex *alpha, const cusparseMatDescr_t descrA,
-    const cuComplex *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, const cuComplex *B, int ldb,
-    const cuComplex *beta, cuComplex *C, int ldc) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, int, int,
-      const cuComplex *, const cusparseMatDescr_t, const cuComplex *,
-      const int *, const int *, const cuComplex *, int, const cuComplex *,
-      cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsrmm");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, n, k, nnz, alpha, descrA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, B, ldb, beta, C, ldc);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsrmm(
-    cusparseHandle_t handle, cusparseOperation_t transA, int m, int n, int k,
-    int nnz, const cuDoubleComplex *alpha, const cusparseMatDescr_t descrA,
-    const cuDoubleComplex *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, const cuDoubleComplex *B, int ldb,
-    const cuDoubleComplex *beta, cuDoubleComplex *C, int ldc) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, int, int,
-      const cuDoubleComplex *, const cusparseMatDescr_t,
-      const cuDoubleComplex *, const int *, const int *,
-      const cuDoubleComplex *, int, const cuDoubleComplex *, cuDoubleComplex *,
-      int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsrmm");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, n, k, nnz, alpha, descrA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, B, ldb, beta, C, ldc);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseScsrmm2(cusparseHandle_t handle, cusparseOperation_t transA,
-                cusparseOperation_t transB, int m, int n, int k, int nnz,
-                const float *alpha, const cusparseMatDescr_t descrA,
-                const float *csrSortedValA, const int *csrSortedRowPtrA,
-                const int *csrSortedColIndA, const float *B, int ldb,
-                const float *beta, float *C, int ldc) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, cusparseOperation_t, int, int, int,
-      int, const float *, const cusparseMatDescr_t, const float *, const int *,
-      const int *, const float *, int, const float *, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsrmm2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, transB, m, n, k, nnz, alpha, descrA,
-                  csrSortedValA, csrSortedRowPtrA, csrSortedColIndA, B, ldb,
-                  beta, C, ldc);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseDcsrmm2(cusparseHandle_t handle, cusparseOperation_t transA,
-                cusparseOperation_t transB, int m, int n, int k, int nnz,
-                const double *alpha, const cusparseMatDescr_t descrA,
-                const double *csrSortedValA, const int *csrSortedRowPtrA,
-                const int *csrSortedColIndA, const double *B, int ldb,
-                const double *beta, double *C, int ldc) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, cusparseOperation_t, int, int, int,
-      int, const double *, const cusparseMatDescr_t, const double *,
-      const int *, const int *, const double *, int, const double *, double *,
-      int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsrmm2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, transB, m, n, k, nnz, alpha, descrA,
-                  csrSortedValA, csrSortedRowPtrA, csrSortedColIndA, B, ldb,
-                  beta, C, ldc);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseCcsrmm2(cusparseHandle_t handle, cusparseOperation_t transA,
-                cusparseOperation_t transB, int m, int n, int k, int nnz,
-                const cuComplex *alpha, const cusparseMatDescr_t descrA,
-                const cuComplex *csrSortedValA, const int *csrSortedRowPtrA,
-                const int *csrSortedColIndA, const cuComplex *B, int ldb,
-                const cuComplex *beta, cuComplex *C, int ldc) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, cusparseOperation_t, int, int, int,
-      int, const cuComplex *, const cusparseMatDescr_t, const cuComplex *,
-      const int *, const int *, const cuComplex *, int, const cuComplex *,
-      cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsrmm2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, transB, m, n, k, nnz, alpha, descrA,
-                  csrSortedValA, csrSortedRowPtrA, csrSortedColIndA, B, ldb,
-                  beta, C, ldc);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsrmm2(
-    cusparseHandle_t handle, cusparseOperation_t transA,
-    cusparseOperation_t transB, int m, int n, int k, int nnz,
-    const cuDoubleComplex *alpha, const cusparseMatDescr_t descrA,
-    const cuDoubleComplex *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, const cuDoubleComplex *B, int ldb,
-    const cuDoubleComplex *beta, cuDoubleComplex *C, int ldc) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, cusparseOperation_t, int, int, int,
-      int, const cuDoubleComplex *, const cusparseMatDescr_t,
-      const cuDoubleComplex *, const int *, const int *,
-      const cuDoubleComplex *, int, const cuDoubleComplex *, cuDoubleComplex *,
-      int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsrmm2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, transB, m, n, k, nnz, alpha, descrA,
-                  csrSortedValA, csrSortedRowPtrA, csrSortedColIndA, B, ldb,
-                  beta, C, ldc);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSbsrmm(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, cusparseOperation_t transB, int mb, int n,
-    int kb, int nnzb, const float *alpha, const cusparseMatDescr_t descrA,
-    const float *bsrSortedValA, const int *bsrSortedRowPtrA,
-    const int *bsrSortedColIndA, const int blockSize, const float *B,
-    const int ldb, const float *beta, float *C, int ldc) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t,
-      cusparseOperation_t, int, int, int, int, const float *,
-      const cusparseMatDescr_t, const float *, const int *, const int *,
-      const int, const float *, const int, const float *, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSbsrmm");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, transB, mb, n, kb, nnzb, alpha, descrA,
-                  bsrSortedValA, bsrSortedRowPtrA, bsrSortedColIndA, blockSize,
-                  B, ldb, beta, C, ldc);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDbsrmm(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, cusparseOperation_t transB, int mb, int n,
-    int kb, int nnzb, const double *alpha, const cusparseMatDescr_t descrA,
-    const double *bsrSortedValA, const int *bsrSortedRowPtrA,
-    const int *bsrSortedColIndA, const int blockSize, const double *B,
-    const int ldb, const double *beta, double *C, int ldc) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t,
-      cusparseOperation_t, int, int, int, int, const double *,
-      const cusparseMatDescr_t, const double *, const int *, const int *,
-      const int, const double *, const int, const double *, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDbsrmm");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, transB, mb, n, kb, nnzb, alpha, descrA,
-                  bsrSortedValA, bsrSortedRowPtrA, bsrSortedColIndA, blockSize,
-                  B, ldb, beta, C, ldc);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCbsrmm(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, cusparseOperation_t transB, int mb, int n,
-    int kb, int nnzb, const cuComplex *alpha, const cusparseMatDescr_t descrA,
-    const cuComplex *bsrSortedValA, const int *bsrSortedRowPtrA,
-    const int *bsrSortedColIndA, const int blockSize, const cuComplex *B,
-    const int ldb, const cuComplex *beta, cuComplex *C, int ldc) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t,
-      cusparseOperation_t, int, int, int, int, const cuComplex *,
-      const cusparseMatDescr_t, const cuComplex *, const int *, const int *,
-      const int, const cuComplex *, const int, const cuComplex *, cuComplex *,
-      int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCbsrmm");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, transB, mb, n, kb, nnzb, alpha, descrA,
-                  bsrSortedValA, bsrSortedRowPtrA, bsrSortedColIndA, blockSize,
-                  B, ldb, beta, C, ldc);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZbsrmm(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, cusparseOperation_t transB, int mb, int n,
-    int kb, int nnzb, const cuDoubleComplex *alpha,
-    const cusparseMatDescr_t descrA, const cuDoubleComplex *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA,
-    const int blockSize, const cuDoubleComplex *B, const int ldb,
-    const cuDoubleComplex *beta, cuDoubleComplex *C, int ldc) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t,
-      cusparseOperation_t, int, int, int, int, const cuDoubleComplex *,
-      const cusparseMatDescr_t, const cuDoubleComplex *, const int *,
-      const int *, const int, const cuDoubleComplex *, const int,
-      const cuDoubleComplex *, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZbsrmm");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, transB, mb, n, kb, nnzb, alpha, descrA,
-                  bsrSortedValA, bsrSortedRowPtrA, bsrSortedColIndA, blockSize,
-                  B, ldb, beta, C, ldc);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSgemmi(
-    cusparseHandle_t handle, int m, int n, int k, int nnz,
-    const float *alpha, /* host or device pointer */
-    const float *A, int lda, const float *cscValB, const int *cscColPtrB,
-    const int *cscRowIndB, const float *beta, /* host or device pointer */
-    float *C, int ldc) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, int, const float *, const float *, int,
-      const float *, const int *, const int *, const float *, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSgemmi");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, k, nnz, alpha, A, lda, cscValB, cscColPtrB,
-                  cscRowIndB, beta, C, ldc);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDgemmi(
-    cusparseHandle_t handle, int m, int n, int k, int nnz,
-    const double *alpha, /* host or device pointer */
-    const double *A, int lda, const double *cscValB, const int *cscColPtrB,
-    const int *cscRowIndB, const double *beta, /* host or device pointer */
-    double *C, int ldc) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, int, const double *, const double *, int,
-      const double *, const int *, const int *, const double *, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDgemmi");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, k, nnz, alpha, A, lda, cscValB, cscColPtrB,
-                  cscRowIndB, beta, C, ldc);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseCgemmi(cusparseHandle_t handle, int m, int n, int k, int nnz,
-               const cuComplex *alpha, /* host or device pointer */
-               const cuComplex *A, int lda, const cuComplex *cscValB,
-               const int *cscColPtrB, const int *cscRowIndB,
-               const cuComplex *beta, /* host or device pointer */
-               cuComplex *C, int ldc) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, int, const cuComplex *,
-      const cuComplex *, int, const cuComplex *, const int *, const int *,
-      const cuComplex *, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCgemmi");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, k, nnz, alpha, A, lda, cscValB, cscColPtrB,
-                  cscRowIndB, beta, C, ldc);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZgemmi(
-    cusparseHandle_t handle, int m, int n, int k, int nnz,
-    const cuDoubleComplex *alpha, /* host or device pointer */
-    const cuDoubleComplex *A, int lda, const cuDoubleComplex *cscValB,
-    const int *cscColPtrB, const int *cscRowIndB,
-    const cuDoubleComplex *beta, /* host or device pointer */
-    cuDoubleComplex *C, int ldc) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, int, const cuDoubleComplex *,
-      const cuDoubleComplex *, int, const cuDoubleComplex *, const int *,
-      const int *, const cuDoubleComplex *, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZgemmi");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, k, nnz, alpha, A, lda, cscValB, cscColPtrB,
-                  cscRowIndB, beta, C, ldc);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsrsm_analysis(
-    cusparseHandle_t handle, cusparseOperation_t transA, int m, int nnz,
-    const cusparseMatDescr_t descrA, const float *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-    cusparseSolveAnalysisInfo_t info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, const cusparseMatDescr_t,
-      const float *, const int *, const int *, cusparseSolveAnalysisInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsrsm_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, nnz, descrA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, info);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsrsm_analysis(
-    cusparseHandle_t handle, cusparseOperation_t transA, int m, int nnz,
-    const cusparseMatDescr_t descrA, const double *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-    cusparseSolveAnalysisInfo_t info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, const cusparseMatDescr_t,
-      const double *, const int *, const int *, cusparseSolveAnalysisInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsrsm_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, nnz, descrA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, info);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsrsm_analysis(
-    cusparseHandle_t handle, cusparseOperation_t transA, int m, int nnz,
-    const cusparseMatDescr_t descrA, const cuComplex *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-    cusparseSolveAnalysisInfo_t info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, const cusparseMatDescr_t,
-      const cuComplex *, const int *, const int *, cusparseSolveAnalysisInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsrsm_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, nnz, descrA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, info);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsrsm_analysis(
-    cusparseHandle_t handle, cusparseOperation_t transA, int m, int nnz,
-    const cusparseMatDescr_t descrA, const cuDoubleComplex *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-    cusparseSolveAnalysisInfo_t info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, const cusparseMatDescr_t,
-      const cuDoubleComplex *, const int *, const int *,
-      cusparseSolveAnalysisInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsrsm_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, nnz, descrA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, info);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsrsm_solve(
-    cusparseHandle_t handle, cusparseOperation_t transA, int m, int n,
-    const float *alpha, const cusparseMatDescr_t descrA,
-    const float *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, cusparseSolveAnalysisInfo_t info,
-    const float *F, int ldf, float *X, int ldx) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, const float *,
-      const cusparseMatDescr_t, const float *, const int *, const int *,
-      cusparseSolveAnalysisInfo_t, const float *, int, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsrsm_solve");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, n, alpha, descrA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, info, F, ldf, X, ldx);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsrsm_solve(
-    cusparseHandle_t handle, cusparseOperation_t transA, int m, int n,
-    const double *alpha, const cusparseMatDescr_t descrA,
-    const double *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, cusparseSolveAnalysisInfo_t info,
-    const double *F, int ldf, double *X, int ldx) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, const double *,
-      const cusparseMatDescr_t, const double *, const int *, const int *,
-      cusparseSolveAnalysisInfo_t, const double *, int, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsrsm_solve");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, n, alpha, descrA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, info, F, ldf, X, ldx);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsrsm_solve(
-    cusparseHandle_t handle, cusparseOperation_t transA, int m, int n,
-    const cuComplex *alpha, const cusparseMatDescr_t descrA,
-    const cuComplex *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, cusparseSolveAnalysisInfo_t info,
-    const cuComplex *F, int ldf, cuComplex *X, int ldx) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, const cuComplex *,
-      const cusparseMatDescr_t, const cuComplex *, const int *, const int *,
-      cusparseSolveAnalysisInfo_t, const cuComplex *, int, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsrsm_solve");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, n, alpha, descrA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, info, F, ldf, X, ldx);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsrsm_solve(
-    cusparseHandle_t handle, cusparseOperation_t transA, int m, int n,
-    const cuDoubleComplex *alpha, const cusparseMatDescr_t descrA,
-    const cuDoubleComplex *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, cusparseSolveAnalysisInfo_t info,
-    const cuDoubleComplex *F, int ldf, cuDoubleComplex *X, int ldx) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, int, const cuDoubleComplex *,
-      const cusparseMatDescr_t, const cuDoubleComplex *, const int *,
-      const int *, cusparseSolveAnalysisInfo_t, const cuDoubleComplex *, int,
-      cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsrsm_solve");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, m, n, alpha, descrA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, info, F, ldf, X, ldx);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseXbsrsm2_zeroPivot(cusparseHandle_t handle,
-                                                       bsrsm2Info_t info,
-                                                       int *position) {
-  using FuncPtr =
-      cusparseStatus_t(CUSPARSEAPI *)(cusparseHandle_t, bsrsm2Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseXbsrsm2_zeroPivot");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, info, position);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSbsrsm2_bufferSize(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, cusparseOperation_t transXY, int mb, int n,
-    int nnzb, const cusparseMatDescr_t descrA, float *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockSize,
-    bsrsm2Info_t info, int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t,
-      cusparseOperation_t, int, int, int, const cusparseMatDescr_t, float *,
-      const int *, const int *, int, bsrsm2Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSbsrsm2_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, transXY, mb, n, nnzb, descrA,
-                  bsrSortedVal, bsrSortedRowPtr, bsrSortedColInd, blockSize,
-                  info, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDbsrsm2_bufferSize(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, cusparseOperation_t transXY, int mb, int n,
-    int nnzb, const cusparseMatDescr_t descrA, double *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockSize,
-    bsrsm2Info_t info, int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t,
-      cusparseOperation_t, int, int, int, const cusparseMatDescr_t, double *,
-      const int *, const int *, int, bsrsm2Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDbsrsm2_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, transXY, mb, n, nnzb, descrA,
-                  bsrSortedVal, bsrSortedRowPtr, bsrSortedColInd, blockSize,
-                  info, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCbsrsm2_bufferSize(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, cusparseOperation_t transXY, int mb, int n,
-    int nnzb, const cusparseMatDescr_t descrA, cuComplex *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockSize,
-    bsrsm2Info_t info, int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t,
-      cusparseOperation_t, int, int, int, const cusparseMatDescr_t, cuComplex *,
-      const int *, const int *, int, bsrsm2Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCbsrsm2_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, transXY, mb, n, nnzb, descrA,
-                  bsrSortedVal, bsrSortedRowPtr, bsrSortedColInd, blockSize,
-                  info, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZbsrsm2_bufferSize(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, cusparseOperation_t transXY, int mb, int n,
-    int nnzb, const cusparseMatDescr_t descrA, cuDoubleComplex *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockSize,
-    bsrsm2Info_t info, int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t,
-      cusparseOperation_t, int, int, int, const cusparseMatDescr_t,
-      cuDoubleComplex *, const int *, const int *, int, bsrsm2Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZbsrsm2_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, transXY, mb, n, nnzb, descrA,
-                  bsrSortedVal, bsrSortedRowPtr, bsrSortedColInd, blockSize,
-                  info, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSbsrsm2_bufferSizeExt(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, cusparseOperation_t transB, int mb, int n,
-    int nnzb, const cusparseMatDescr_t descrA, float *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockSize,
-    bsrsm2Info_t info, size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t,
-      cusparseOperation_t, int, int, int, const cusparseMatDescr_t, float *,
-      const int *, const int *, int, bsrsm2Info_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSbsrsm2_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, transB, mb, n, nnzb, descrA,
-                  bsrSortedVal, bsrSortedRowPtr, bsrSortedColInd, blockSize,
-                  info, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDbsrsm2_bufferSizeExt(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, cusparseOperation_t transB, int mb, int n,
-    int nnzb, const cusparseMatDescr_t descrA, double *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockSize,
-    bsrsm2Info_t info, size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t,
-      cusparseOperation_t, int, int, int, const cusparseMatDescr_t, double *,
-      const int *, const int *, int, bsrsm2Info_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDbsrsm2_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, transB, mb, n, nnzb, descrA,
-                  bsrSortedVal, bsrSortedRowPtr, bsrSortedColInd, blockSize,
-                  info, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCbsrsm2_bufferSizeExt(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, cusparseOperation_t transB, int mb, int n,
-    int nnzb, const cusparseMatDescr_t descrA, cuComplex *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockSize,
-    bsrsm2Info_t info, size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t,
-      cusparseOperation_t, int, int, int, const cusparseMatDescr_t, cuComplex *,
-      const int *, const int *, int, bsrsm2Info_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCbsrsm2_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, transB, mb, n, nnzb, descrA,
-                  bsrSortedVal, bsrSortedRowPtr, bsrSortedColInd, blockSize,
-                  info, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZbsrsm2_bufferSizeExt(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, cusparseOperation_t transB, int mb, int n,
-    int nnzb, const cusparseMatDescr_t descrA, cuDoubleComplex *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockSize,
-    bsrsm2Info_t info, size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t,
-      cusparseOperation_t, int, int, int, const cusparseMatDescr_t,
-      cuDoubleComplex *, const int *, const int *, int, bsrsm2Info_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZbsrsm2_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, transB, mb, n, nnzb, descrA,
-                  bsrSortedVal, bsrSortedRowPtr, bsrSortedColInd, blockSize,
-                  info, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSbsrsm2_analysis(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, cusparseOperation_t transXY, int mb, int n,
-    int nnzb, const cusparseMatDescr_t descrA, const float *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockSize,
-    bsrsm2Info_t info, cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t,
-      cusparseOperation_t, int, int, int, const cusparseMatDescr_t,
-      const float *, const int *, const int *, int, bsrsm2Info_t,
-      cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSbsrsm2_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, transXY, mb, n, nnzb, descrA,
-                  bsrSortedVal, bsrSortedRowPtr, bsrSortedColInd, blockSize,
-                  info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDbsrsm2_analysis(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, cusparseOperation_t transXY, int mb, int n,
-    int nnzb, const cusparseMatDescr_t descrA, const double *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockSize,
-    bsrsm2Info_t info, cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t,
-      cusparseOperation_t, int, int, int, const cusparseMatDescr_t,
-      const double *, const int *, const int *, int, bsrsm2Info_t,
-      cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDbsrsm2_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, transXY, mb, n, nnzb, descrA,
-                  bsrSortedVal, bsrSortedRowPtr, bsrSortedColInd, blockSize,
-                  info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCbsrsm2_analysis(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, cusparseOperation_t transXY, int mb, int n,
-    int nnzb, const cusparseMatDescr_t descrA, const cuComplex *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockSize,
-    bsrsm2Info_t info, cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t,
-      cusparseOperation_t, int, int, int, const cusparseMatDescr_t,
-      const cuComplex *, const int *, const int *, int, bsrsm2Info_t,
-      cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCbsrsm2_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, transXY, mb, n, nnzb, descrA,
-                  bsrSortedVal, bsrSortedRowPtr, bsrSortedColInd, blockSize,
-                  info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZbsrsm2_analysis(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, cusparseOperation_t transXY, int mb, int n,
-    int nnzb, const cusparseMatDescr_t descrA,
-    const cuDoubleComplex *bsrSortedVal, const int *bsrSortedRowPtr,
-    const int *bsrSortedColInd, int blockSize, bsrsm2Info_t info,
-    cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t,
-      cusparseOperation_t, int, int, int, const cusparseMatDescr_t,
-      const cuDoubleComplex *, const int *, const int *, int, bsrsm2Info_t,
-      cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZbsrsm2_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, transXY, mb, n, nnzb, descrA,
-                  bsrSortedVal, bsrSortedRowPtr, bsrSortedColInd, blockSize,
-                  info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSbsrsm2_solve(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, cusparseOperation_t transXY, int mb, int n,
-    int nnzb, const float *alpha, const cusparseMatDescr_t descrA,
-    const float *bsrSortedVal, const int *bsrSortedRowPtr,
-    const int *bsrSortedColInd, int blockSize, bsrsm2Info_t info,
-    const float *F, int ldf, float *X, int ldx, cusparseSolvePolicy_t policy,
-    void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t,
-      cusparseOperation_t, int, int, int, const float *,
-      const cusparseMatDescr_t, const float *, const int *, const int *, int,
-      bsrsm2Info_t, const float *, int, float *, int, cusparseSolvePolicy_t,
-      void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSbsrsm2_solve");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, transXY, mb, n, nnzb, alpha, descrA,
-                  bsrSortedVal, bsrSortedRowPtr, bsrSortedColInd, blockSize,
-                  info, F, ldf, X, ldx, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDbsrsm2_solve(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, cusparseOperation_t transXY, int mb, int n,
-    int nnzb, const double *alpha, const cusparseMatDescr_t descrA,
-    const double *bsrSortedVal, const int *bsrSortedRowPtr,
-    const int *bsrSortedColInd, int blockSize, bsrsm2Info_t info,
-    const double *F, int ldf, double *X, int ldx, cusparseSolvePolicy_t policy,
-    void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t,
-      cusparseOperation_t, int, int, int, const double *,
-      const cusparseMatDescr_t, const double *, const int *, const int *, int,
-      bsrsm2Info_t, const double *, int, double *, int, cusparseSolvePolicy_t,
-      void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDbsrsm2_solve");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, transXY, mb, n, nnzb, alpha, descrA,
-                  bsrSortedVal, bsrSortedRowPtr, bsrSortedColInd, blockSize,
-                  info, F, ldf, X, ldx, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCbsrsm2_solve(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, cusparseOperation_t transXY, int mb, int n,
-    int nnzb, const cuComplex *alpha, const cusparseMatDescr_t descrA,
-    const cuComplex *bsrSortedVal, const int *bsrSortedRowPtr,
-    const int *bsrSortedColInd, int blockSize, bsrsm2Info_t info,
-    const cuComplex *F, int ldf, cuComplex *X, int ldx,
-    cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t,
-      cusparseOperation_t, int, int, int, const cuComplex *,
-      const cusparseMatDescr_t, const cuComplex *, const int *, const int *,
-      int, bsrsm2Info_t, const cuComplex *, int, cuComplex *, int,
-      cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCbsrsm2_solve");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, transXY, mb, n, nnzb, alpha, descrA,
-                  bsrSortedVal, bsrSortedRowPtr, bsrSortedColInd, blockSize,
-                  info, F, ldf, X, ldx, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZbsrsm2_solve(
-    cusparseHandle_t handle, cusparseDirection_t dirA,
-    cusparseOperation_t transA, cusparseOperation_t transXY, int mb, int n,
-    int nnzb, const cuDoubleComplex *alpha, const cusparseMatDescr_t descrA,
-    const cuDoubleComplex *bsrSortedVal, const int *bsrSortedRowPtr,
-    const int *bsrSortedColInd, int blockSize, bsrsm2Info_t info,
-    const cuDoubleComplex *F, int ldf, cuDoubleComplex *X, int ldx,
-    cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, cusparseOperation_t,
-      cusparseOperation_t, int, int, int, const cuDoubleComplex *,
-      const cusparseMatDescr_t, const cuDoubleComplex *, const int *,
-      const int *, int, bsrsm2Info_t, const cuDoubleComplex *, int,
-      cuDoubleComplex *, int, cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZbsrsm2_solve");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, transA, transXY, mb, n, nnzb, alpha, descrA,
-                  bsrSortedVal, bsrSortedRowPtr, bsrSortedColInd, blockSize,
-                  info, F, ldf, X, ldx, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCsrilu0Ex(
-    cusparseHandle_t handle, cusparseOperation_t trans, int m,
-    const cusparseMatDescr_t descrA, void *csrSortedValA_ValM,
-    cudaDataType csrSortedValA_ValMtype,
-    /* matrix A values are updated inplace
-       to be the preconditioner M values */
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-    cusparseSolveAnalysisInfo_t info, cudaDataType executiontype) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, const cusparseMatDescr_t,
-      void *, cudaDataType, const int *, const int *,
-      cusparseSolveAnalysisInfo_t, cudaDataType);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCsrilu0Ex");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, trans, m, descrA, csrSortedValA_ValM,
-                  csrSortedValA_ValMtype, csrSortedRowPtrA, csrSortedColIndA,
-                  info, executiontype);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseScsrilu0(cusparseHandle_t handle, cusparseOperation_t trans, int m,
-                 const cusparseMatDescr_t descrA, float *csrSortedValA_ValM,
-                 /* matrix A values are updated inplace
-                    to be the preconditioner M values */
-                 const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-                 cusparseSolveAnalysisInfo_t info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, const cusparseMatDescr_t,
-      float *, const int *, const int *, cusparseSolveAnalysisInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsrilu0");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, trans, m, descrA, csrSortedValA_ValM,
-                  csrSortedRowPtrA, csrSortedColIndA, info);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseDcsrilu0(cusparseHandle_t handle, cusparseOperation_t trans, int m,
-                 const cusparseMatDescr_t descrA, double *csrSortedValA_ValM,
-                 /* matrix A values are updated inplace
-                    to be the preconditioner M values */
-                 const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-                 cusparseSolveAnalysisInfo_t info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, const cusparseMatDescr_t,
-      double *, const int *, const int *, cusparseSolveAnalysisInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsrilu0");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, trans, m, descrA, csrSortedValA_ValM,
-                  csrSortedRowPtrA, csrSortedColIndA, info);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseCcsrilu0(cusparseHandle_t handle, cusparseOperation_t trans, int m,
-                 const cusparseMatDescr_t descrA, cuComplex *csrSortedValA_ValM,
-                 /* matrix A values are updated inplace
-                    to be the preconditioner M values */
-                 const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-                 cusparseSolveAnalysisInfo_t info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, const cusparseMatDescr_t,
-      cuComplex *, const int *, const int *, cusparseSolveAnalysisInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsrilu0");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, trans, m, descrA, csrSortedValA_ValM,
-                  csrSortedRowPtrA, csrSortedColIndA, info);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsrilu0(
-    cusparseHandle_t handle, cusparseOperation_t trans, int m,
-    const cusparseMatDescr_t descrA, cuDoubleComplex *csrSortedValA_ValM,
-    /* matrix A values are updated inplace
-       to be the preconditioner M values */
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-    cusparseSolveAnalysisInfo_t info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, const cusparseMatDescr_t,
-      cuDoubleComplex *, const int *, const int *, cusparseSolveAnalysisInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsrilu0");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, trans, m, descrA, csrSortedValA_ValM,
-                  csrSortedRowPtrA, csrSortedColIndA, info);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsrilu02_numericBoost(
-    cusparseHandle_t handle, csrilu02Info_t info, int enable_boost, double *tol,
-    float *boost_val) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, csrilu02Info_t, int, double *, float *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsrilu02_numericBoost");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, info, enable_boost, tol, boost_val);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsrilu02_numericBoost(
-    cusparseHandle_t handle, csrilu02Info_t info, int enable_boost, double *tol,
-    double *boost_val) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, csrilu02Info_t, int, double *, double *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsrilu02_numericBoost");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, info, enable_boost, tol, boost_val);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsrilu02_numericBoost(
-    cusparseHandle_t handle, csrilu02Info_t info, int enable_boost, double *tol,
-    cuComplex *boost_val) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, csrilu02Info_t, int, double *, cuComplex *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsrilu02_numericBoost");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, info, enable_boost, tol, boost_val);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsrilu02_numericBoost(
-    cusparseHandle_t handle, csrilu02Info_t info, int enable_boost, double *tol,
-    cuDoubleComplex *boost_val) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, csrilu02Info_t, int, double *, cuDoubleComplex *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsrilu02_numericBoost");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, info, enable_boost, tol, boost_val);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseXcsrilu02_zeroPivot(
-    cusparseHandle_t handle, csrilu02Info_t info, int *position) {
-  using FuncPtr =
-      cusparseStatus_t(CUSPARSEAPI *)(cusparseHandle_t, csrilu02Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseXcsrilu02_zeroPivot");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, info, position);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsrilu02_bufferSize(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    float *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, csrilu02Info_t info, int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, float *,
-      const int *, const int *, csrilu02Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsrilu02_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, info, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsrilu02_bufferSize(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    double *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, csrilu02Info_t info, int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, double *,
-      const int *, const int *, csrilu02Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsrilu02_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, info, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsrilu02_bufferSize(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    cuComplex *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, csrilu02Info_t info, int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, cuComplex *,
-      const int *, const int *, csrilu02Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsrilu02_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, info, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsrilu02_bufferSize(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    cuDoubleComplex *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, csrilu02Info_t info, int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, cuDoubleComplex *,
-      const int *, const int *, csrilu02Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsrilu02_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, info, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsrilu02_bufferSizeExt(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    float *csrSortedVal, const int *csrSortedRowPtr, const int *csrSortedColInd,
-    csrilu02Info_t info, size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, float *,
-      const int *, const int *, csrilu02Info_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsrilu02_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedVal, csrSortedRowPtr,
-                  csrSortedColInd, info, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsrilu02_bufferSizeExt(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    double *csrSortedVal, const int *csrSortedRowPtr,
-    const int *csrSortedColInd, csrilu02Info_t info, size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, double *,
-      const int *, const int *, csrilu02Info_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsrilu02_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedVal, csrSortedRowPtr,
-                  csrSortedColInd, info, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsrilu02_bufferSizeExt(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    cuComplex *csrSortedVal, const int *csrSortedRowPtr,
-    const int *csrSortedColInd, csrilu02Info_t info, size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, cuComplex *,
-      const int *, const int *, csrilu02Info_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsrilu02_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedVal, csrSortedRowPtr,
-                  csrSortedColInd, info, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsrilu02_bufferSizeExt(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    cuDoubleComplex *csrSortedVal, const int *csrSortedRowPtr,
-    const int *csrSortedColInd, csrilu02Info_t info, size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, cuDoubleComplex *,
-      const int *, const int *, csrilu02Info_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsrilu02_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedVal, csrSortedRowPtr,
-                  csrSortedColInd, info, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsrilu02_analysis(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    const float *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, csrilu02Info_t info,
-    cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, const float *,
-      const int *, const int *, csrilu02Info_t, cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsrilu02_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsrilu02_analysis(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    const double *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, csrilu02Info_t info,
-    cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, const double *,
-      const int *, const int *, csrilu02Info_t, cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsrilu02_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsrilu02_analysis(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    const cuComplex *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, csrilu02Info_t info,
-    cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, const cuComplex *,
-      const int *, const int *, csrilu02Info_t, cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsrilu02_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsrilu02_analysis(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    const cuDoubleComplex *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, csrilu02Info_t info,
-    cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t,
-      const cuDoubleComplex *, const int *, const int *, csrilu02Info_t,
-      cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsrilu02_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsrilu02(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    float *csrSortedValA_valM,
-    /* matrix A values are updated inplace
-       to be the preconditioner M values */
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-    csrilu02Info_t info, cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, float *,
-      const int *, const int *, csrilu02Info_t, cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsrilu02");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedValA_valM, csrSortedRowPtrA,
-                  csrSortedColIndA, info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsrilu02(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    double *csrSortedValA_valM,
-    /* matrix A values are updated inplace
-       to be the preconditioner M values */
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-    csrilu02Info_t info, cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, double *,
-      const int *, const int *, csrilu02Info_t, cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsrilu02");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedValA_valM, csrSortedRowPtrA,
-                  csrSortedColIndA, info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsrilu02(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    cuComplex *csrSortedValA_valM,
-    /* matrix A values are updated inplace
-       to be the preconditioner M values */
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-    csrilu02Info_t info, cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, cuComplex *,
-      const int *, const int *, csrilu02Info_t, cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsrilu02");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedValA_valM, csrSortedRowPtrA,
-                  csrSortedColIndA, info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsrilu02(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    cuDoubleComplex *csrSortedValA_valM,
-    /* matrix A values are updated inplace
-       to be the preconditioner M values */
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-    csrilu02Info_t info, cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, cuDoubleComplex *,
-      const int *, const int *, csrilu02Info_t, cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsrilu02");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedValA_valM, csrSortedRowPtrA,
-                  csrSortedColIndA, info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSbsrilu02_numericBoost(
-    cusparseHandle_t handle, bsrilu02Info_t info, int enable_boost, double *tol,
-    float *boost_val) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, bsrilu02Info_t, int, double *, float *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSbsrilu02_numericBoost");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, info, enable_boost, tol, boost_val);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDbsrilu02_numericBoost(
-    cusparseHandle_t handle, bsrilu02Info_t info, int enable_boost, double *tol,
-    double *boost_val) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, bsrilu02Info_t, int, double *, double *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDbsrilu02_numericBoost");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, info, enable_boost, tol, boost_val);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCbsrilu02_numericBoost(
-    cusparseHandle_t handle, bsrilu02Info_t info, int enable_boost, double *tol,
-    cuComplex *boost_val) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, bsrilu02Info_t, int, double *, cuComplex *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCbsrilu02_numericBoost");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, info, enable_boost, tol, boost_val);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZbsrilu02_numericBoost(
-    cusparseHandle_t handle, bsrilu02Info_t info, int enable_boost, double *tol,
-    cuDoubleComplex *boost_val) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, bsrilu02Info_t, int, double *, cuDoubleComplex *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZbsrilu02_numericBoost");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, info, enable_boost, tol, boost_val);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseXbsrilu02_zeroPivot(
-    cusparseHandle_t handle, bsrilu02Info_t info, int *position) {
-  using FuncPtr =
-      cusparseStatus_t(CUSPARSEAPI *)(cusparseHandle_t, bsrilu02Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseXbsrilu02_zeroPivot");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, info, position);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSbsrilu02_bufferSize(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, float *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockDim,
-    bsrilu02Info_t info, int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      float *, const int *, const int *, int, bsrilu02Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSbsrilu02_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, blockDim, info, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDbsrilu02_bufferSize(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, double *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockDim,
-    bsrilu02Info_t info, int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      double *, const int *, const int *, int, bsrilu02Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDbsrilu02_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, blockDim, info, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCbsrilu02_bufferSize(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, cuComplex *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockDim,
-    bsrilu02Info_t info, int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      cuComplex *, const int *, const int *, int, bsrilu02Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCbsrilu02_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, blockDim, info, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZbsrilu02_bufferSize(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, cuDoubleComplex *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockDim,
-    bsrilu02Info_t info, int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      cuDoubleComplex *, const int *, const int *, int, bsrilu02Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZbsrilu02_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, blockDim, info, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSbsrilu02_bufferSizeExt(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, float *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockSize,
-    bsrilu02Info_t info, size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      float *, const int *, const int *, int, bsrilu02Info_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSbsrilu02_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, blockSize, info, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDbsrilu02_bufferSizeExt(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, double *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockSize,
-    bsrilu02Info_t info, size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      double *, const int *, const int *, int, bsrilu02Info_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDbsrilu02_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, blockSize, info, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCbsrilu02_bufferSizeExt(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, cuComplex *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockSize,
-    bsrilu02Info_t info, size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      cuComplex *, const int *, const int *, int, bsrilu02Info_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCbsrilu02_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, blockSize, info, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZbsrilu02_bufferSizeExt(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, cuDoubleComplex *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockSize,
-    bsrilu02Info_t info, size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      cuDoubleComplex *, const int *, const int *, int, bsrilu02Info_t,
-      size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZbsrilu02_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, blockSize, info, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSbsrilu02_analysis(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, float *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockDim,
-    bsrilu02Info_t info, cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      float *, const int *, const int *, int, bsrilu02Info_t,
-      cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSbsrilu02_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, blockDim, info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDbsrilu02_analysis(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, double *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockDim,
-    bsrilu02Info_t info, cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      double *, const int *, const int *, int, bsrilu02Info_t,
-      cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDbsrilu02_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, blockDim, info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCbsrilu02_analysis(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, cuComplex *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockDim,
-    bsrilu02Info_t info, cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      cuComplex *, const int *, const int *, int, bsrilu02Info_t,
-      cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCbsrilu02_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, blockDim, info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZbsrilu02_analysis(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, cuDoubleComplex *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockDim,
-    bsrilu02Info_t info, cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      cuDoubleComplex *, const int *, const int *, int, bsrilu02Info_t,
-      cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZbsrilu02_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, blockDim, info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSbsrilu02(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb,
-    const cusparseMatDescr_t descra, float *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockDim,
-    bsrilu02Info_t info, cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      float *, const int *, const int *, int, bsrilu02Info_t,
-      cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSbsrilu02");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nnzb, descra, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, blockDim, info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDbsrilu02(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb,
-    const cusparseMatDescr_t descra, double *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockDim,
-    bsrilu02Info_t info, cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      double *, const int *, const int *, int, bsrilu02Info_t,
-      cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDbsrilu02");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nnzb, descra, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, blockDim, info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCbsrilu02(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb,
-    const cusparseMatDescr_t descra, cuComplex *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockDim,
-    bsrilu02Info_t info, cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      cuComplex *, const int *, const int *, int, bsrilu02Info_t,
-      cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCbsrilu02");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nnzb, descra, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, blockDim, info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZbsrilu02(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb,
-    const cusparseMatDescr_t descra, cuDoubleComplex *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockDim,
-    bsrilu02Info_t info, cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      cuDoubleComplex *, const int *, const int *, int, bsrilu02Info_t,
-      cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZbsrilu02");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nnzb, descra, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, blockDim, info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseScsric0(cusparseHandle_t handle, cusparseOperation_t trans, int m,
-                const cusparseMatDescr_t descrA, float *csrSortedValA_ValM,
-                /* matrix A values are updated inplace
-                   to be the preconditioner M values */
-                const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-                cusparseSolveAnalysisInfo_t info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, const cusparseMatDescr_t,
-      float *, const int *, const int *, cusparseSolveAnalysisInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsric0");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, trans, m, descrA, csrSortedValA_ValM,
-                  csrSortedRowPtrA, csrSortedColIndA, info);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseDcsric0(cusparseHandle_t handle, cusparseOperation_t trans, int m,
-                const cusparseMatDescr_t descrA, double *csrSortedValA_ValM,
-                /* matrix A values are updated inplace
-                   to be the preconditioner M values */
-                const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-                cusparseSolveAnalysisInfo_t info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, const cusparseMatDescr_t,
-      double *, const int *, const int *, cusparseSolveAnalysisInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsric0");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, trans, m, descrA, csrSortedValA_ValM,
-                  csrSortedRowPtrA, csrSortedColIndA, info);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseCcsric0(cusparseHandle_t handle, cusparseOperation_t trans, int m,
-                const cusparseMatDescr_t descrA, cuComplex *csrSortedValA_ValM,
-                /* matrix A values are updated inplace
-                   to be the preconditioner M values */
-                const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-                cusparseSolveAnalysisInfo_t info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, const cusparseMatDescr_t,
-      cuComplex *, const int *, const int *, cusparseSolveAnalysisInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsric0");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, trans, m, descrA, csrSortedValA_ValM,
-                  csrSortedRowPtrA, csrSortedColIndA, info);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsric0(
-    cusparseHandle_t handle, cusparseOperation_t trans, int m,
-    const cusparseMatDescr_t descrA, cuDoubleComplex *csrSortedValA_ValM,
-    /* matrix A values are updated inplace
-       to be the preconditioner M values */
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-    cusparseSolveAnalysisInfo_t info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, int, const cusparseMatDescr_t,
-      cuDoubleComplex *, const int *, const int *, cusparseSolveAnalysisInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsric0");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, trans, m, descrA, csrSortedValA_ValM,
-                  csrSortedRowPtrA, csrSortedColIndA, info);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseXcsric02_zeroPivot(cusparseHandle_t handle,
-                                                        csric02Info_t info,
-                                                        int *position) {
-  using FuncPtr =
-      cusparseStatus_t(CUSPARSEAPI *)(cusparseHandle_t, csric02Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseXcsric02_zeroPivot");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, info, position);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsric02_bufferSize(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    float *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, csric02Info_t info, int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, float *,
-      const int *, const int *, csric02Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsric02_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, info, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsric02_bufferSize(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    double *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, csric02Info_t info, int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, double *,
-      const int *, const int *, csric02Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsric02_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, info, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsric02_bufferSize(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    cuComplex *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, csric02Info_t info, int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, cuComplex *,
-      const int *, const int *, csric02Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsric02_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, info, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsric02_bufferSize(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    cuDoubleComplex *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, csric02Info_t info, int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, cuDoubleComplex *,
-      const int *, const int *, csric02Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsric02_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, info, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsric02_bufferSizeExt(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    float *csrSortedVal, const int *csrSortedRowPtr, const int *csrSortedColInd,
-    csric02Info_t info, size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, float *,
-      const int *, const int *, csric02Info_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsric02_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedVal, csrSortedRowPtr,
-                  csrSortedColInd, info, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsric02_bufferSizeExt(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    double *csrSortedVal, const int *csrSortedRowPtr,
-    const int *csrSortedColInd, csric02Info_t info, size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, double *,
-      const int *, const int *, csric02Info_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsric02_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedVal, csrSortedRowPtr,
-                  csrSortedColInd, info, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsric02_bufferSizeExt(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    cuComplex *csrSortedVal, const int *csrSortedRowPtr,
-    const int *csrSortedColInd, csric02Info_t info, size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, cuComplex *,
-      const int *, const int *, csric02Info_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsric02_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedVal, csrSortedRowPtr,
-                  csrSortedColInd, info, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsric02_bufferSizeExt(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    cuDoubleComplex *csrSortedVal, const int *csrSortedRowPtr,
-    const int *csrSortedColInd, csric02Info_t info, size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, cuDoubleComplex *,
-      const int *, const int *, csric02Info_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsric02_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedVal, csrSortedRowPtr,
-                  csrSortedColInd, info, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsric02_analysis(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    const float *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, csric02Info_t info,
-    cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, const float *,
-      const int *, const int *, csric02Info_t, cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsric02_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsric02_analysis(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    const double *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, csric02Info_t info,
-    cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, const double *,
-      const int *, const int *, csric02Info_t, cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsric02_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsric02_analysis(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    const cuComplex *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, csric02Info_t info,
-    cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, const cuComplex *,
-      const int *, const int *, csric02Info_t, cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsric02_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsric02_analysis(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    const cuDoubleComplex *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, csric02Info_t info,
-    cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t,
-      const cuDoubleComplex *, const int *, const int *, csric02Info_t,
-      cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsric02_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsric02(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    float *csrSortedValA_valM,
-    /* matrix A values are updated inplace
-       to be the preconditioner M values */
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-    csric02Info_t info, cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, float *,
-      const int *, const int *, csric02Info_t, cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsric02");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedValA_valM, csrSortedRowPtrA,
-                  csrSortedColIndA, info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsric02(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    double *csrSortedValA_valM,
-    /* matrix A values are updated inplace
-       to be the preconditioner M values */
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-    csric02Info_t info, cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, double *,
-      const int *, const int *, csric02Info_t, cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsric02");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedValA_valM, csrSortedRowPtrA,
-                  csrSortedColIndA, info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsric02(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    cuComplex *csrSortedValA_valM,
-    /* matrix A values are updated inplace
-       to be the preconditioner M values */
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-    csric02Info_t info, cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, cuComplex *,
-      const int *, const int *, csric02Info_t, cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsric02");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedValA_valM, csrSortedRowPtrA,
-                  csrSortedColIndA, info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsric02(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    cuDoubleComplex *csrSortedValA_valM,
-    /* matrix A values are updated inplace
-       to be the preconditioner M values */
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-    csric02Info_t info, cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, cuDoubleComplex *,
-      const int *, const int *, csric02Info_t, cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsric02");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedValA_valM, csrSortedRowPtrA,
-                  csrSortedColIndA, info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseXbsric02_zeroPivot(cusparseHandle_t handle,
-                                                        bsric02Info_t info,
-                                                        int *position) {
-  using FuncPtr =
-      cusparseStatus_t(CUSPARSEAPI *)(cusparseHandle_t, bsric02Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseXbsric02_zeroPivot");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, info, position);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSbsric02_bufferSize(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, float *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockDim,
-    bsric02Info_t info, int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      float *, const int *, const int *, int, bsric02Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSbsric02_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, blockDim, info, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDbsric02_bufferSize(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, double *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockDim,
-    bsric02Info_t info, int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      double *, const int *, const int *, int, bsric02Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDbsric02_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, blockDim, info, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCbsric02_bufferSize(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, cuComplex *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockDim,
-    bsric02Info_t info, int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      cuComplex *, const int *, const int *, int, bsric02Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCbsric02_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, blockDim, info, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZbsric02_bufferSize(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, cuDoubleComplex *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockDim,
-    bsric02Info_t info, int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      cuDoubleComplex *, const int *, const int *, int, bsric02Info_t, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZbsric02_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, blockDim, info, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSbsric02_bufferSizeExt(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, float *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockSize,
-    bsric02Info_t info, size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      float *, const int *, const int *, int, bsric02Info_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSbsric02_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, blockSize, info, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDbsric02_bufferSizeExt(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, double *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockSize,
-    bsric02Info_t info, size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      double *, const int *, const int *, int, bsric02Info_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDbsric02_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, blockSize, info, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCbsric02_bufferSizeExt(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, cuComplex *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockSize,
-    bsric02Info_t info, size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      cuComplex *, const int *, const int *, int, bsric02Info_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCbsric02_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, blockSize, info, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZbsric02_bufferSizeExt(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, cuDoubleComplex *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockSize,
-    bsric02Info_t info, size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      cuDoubleComplex *, const int *, const int *, int, bsric02Info_t,
-      size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZbsric02_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, blockSize, info, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSbsric02_analysis(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, const float *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockDim,
-    bsric02Info_t info, cusparseSolvePolicy_t policy, void *pInputBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const float *, const int *, const int *, int, bsric02Info_t,
-      cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSbsric02_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, blockDim, info, policy, pInputBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDbsric02_analysis(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, const double *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockDim,
-    bsric02Info_t info, cusparseSolvePolicy_t policy, void *pInputBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const double *, const int *, const int *, int, bsric02Info_t,
-      cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDbsric02_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, blockDim, info, policy, pInputBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCbsric02_analysis(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, const cuComplex *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockDim,
-    bsric02Info_t info, cusparseSolvePolicy_t policy, void *pInputBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const cuComplex *, const int *, const int *, int, bsric02Info_t,
-      cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCbsric02_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, blockDim, info, policy, pInputBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZbsric02_analysis(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, const cuDoubleComplex *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockDim,
-    bsric02Info_t info, cusparseSolvePolicy_t policy, void *pInputBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const cuDoubleComplex *, const int *, const int *, int, bsric02Info_t,
-      cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZbsric02_analysis");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, blockDim, info, policy, pInputBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSbsric02(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, float *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockDim,
-    bsric02Info_t info, cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      float *, const int *, const int *, int, bsric02Info_t,
-      cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSbsric02");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, blockDim, info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDbsric02(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, double *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockDim,
-    bsric02Info_t info, cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      double *, const int *, const int *, int, bsric02Info_t,
-      cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDbsric02");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, blockDim, info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCbsric02(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, cuComplex *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockDim,
-    bsric02Info_t info, cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      cuComplex *, const int *, const int *, int, bsric02Info_t,
-      cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCbsric02");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, blockDim, info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZbsric02(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb,
-    const cusparseMatDescr_t descrA, cuDoubleComplex *bsrSortedVal,
-    const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockDim,
-    bsric02Info_t info, cusparseSolvePolicy_t policy, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      cuDoubleComplex *, const int *, const int *, int, bsric02Info_t,
-      cusparseSolvePolicy_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZbsric02");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, blockDim, info, policy, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSgtsv(cusparseHandle_t handle, int m,
-                                           int n, const float *dl,
-                                           const float *d, const float *du,
-                                           float *B, int ldb) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(cusparseHandle_t, int, int,
-                                                  const float *, const float *,
-                                                  const float *, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSgtsv");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, dl, d, du, B, ldb);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDgtsv(cusparseHandle_t handle, int m,
-                                           int n, const double *dl,
-                                           const double *d, const double *du,
-                                           double *B, int ldb) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const double *, const double *,
-      const double *, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDgtsv");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, dl, d, du, B, ldb);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCgtsv(cusparseHandle_t handle, int m,
-                                           int n, const cuComplex *dl,
-                                           const cuComplex *d,
-                                           const cuComplex *du, cuComplex *B,
-                                           int ldb) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cuComplex *, const cuComplex *,
-      const cuComplex *, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCgtsv");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, dl, d, du, B, ldb);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZgtsv(cusparseHandle_t handle, int m,
-                                           int n, const cuDoubleComplex *dl,
-                                           const cuDoubleComplex *d,
-                                           const cuDoubleComplex *du,
-                                           cuDoubleComplex *B, int ldb) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cuDoubleComplex *,
-      const cuDoubleComplex *, const cuDoubleComplex *, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZgtsv");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, dl, d, du, B, ldb);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSgtsv2_bufferSizeExt(
-    cusparseHandle_t handle, int m, int n, const float *dl, const float *d,
-    const float *du, const float *B, int ldb, size_t *bufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const float *, const float *, const float *,
-      const float *, int, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSgtsv2_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, dl, d, du, B, ldb, bufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDgtsv2_bufferSizeExt(
-    cusparseHandle_t handle, int m, int n, const double *dl, const double *d,
-    const double *du, const double *B, int ldb, size_t *bufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const double *, const double *,
-      const double *, const double *, int, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDgtsv2_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, dl, d, du, B, ldb, bufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCgtsv2_bufferSizeExt(
-    cusparseHandle_t handle, int m, int n, const cuComplex *dl,
-    const cuComplex *d, const cuComplex *du, const cuComplex *B, int ldb,
-    size_t *bufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cuComplex *, const cuComplex *,
-      const cuComplex *, const cuComplex *, int, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCgtsv2_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, dl, d, du, B, ldb, bufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZgtsv2_bufferSizeExt(
-    cusparseHandle_t handle, int m, int n, const cuDoubleComplex *dl,
-    const cuDoubleComplex *d, const cuDoubleComplex *du,
-    const cuDoubleComplex *B, int ldb, size_t *bufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cuDoubleComplex *,
-      const cuDoubleComplex *, const cuDoubleComplex *, const cuDoubleComplex *,
-      int, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZgtsv2_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, dl, d, du, B, ldb, bufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSgtsv2(cusparseHandle_t handle, int m,
-                                            int n, const float *dl,
-                                            const float *d, const float *du,
-                                            float *B, int ldb, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const float *, const float *, const float *,
-      float *, int, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSgtsv2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, dl, d, du, B, ldb, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDgtsv2(cusparseHandle_t handle, int m,
-                                            int n, const double *dl,
-                                            const double *d, const double *du,
-                                            double *B, int ldb, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const double *, const double *,
-      const double *, double *, int, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDgtsv2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, dl, d, du, B, ldb, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCgtsv2(cusparseHandle_t handle, int m,
-                                            int n, const cuComplex *dl,
-                                            const cuComplex *d,
-                                            const cuComplex *du, cuComplex *B,
-                                            int ldb, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cuComplex *, const cuComplex *,
-      const cuComplex *, cuComplex *, int, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCgtsv2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, dl, d, du, B, ldb, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZgtsv2(cusparseHandle_t handle, int m,
-                                            int n, const cuDoubleComplex *dl,
-                                            const cuDoubleComplex *d,
-                                            const cuDoubleComplex *du,
-                                            cuDoubleComplex *B, int ldb,
-                                            void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cuDoubleComplex *,
-      const cuDoubleComplex *, const cuDoubleComplex *, cuDoubleComplex *, int,
-      void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZgtsv2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, dl, d, du, B, ldb, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseSgtsv_nopivot(cusparseHandle_t handle, int m, int n, const float *dl,
-                      const float *d, const float *du, float *B, int ldb) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(cusparseHandle_t, int, int,
-                                                  const float *, const float *,
-                                                  const float *, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSgtsv_nopivot");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, dl, d, du, B, ldb);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseDgtsv_nopivot(cusparseHandle_t handle, int m, int n, const double *dl,
-                      const double *d, const double *du, double *B, int ldb) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const double *, const double *,
-      const double *, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDgtsv_nopivot");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, dl, d, du, B, ldb);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCgtsv_nopivot(
-    cusparseHandle_t handle, int m, int n, const cuComplex *dl,
-    const cuComplex *d, const cuComplex *du, cuComplex *B, int ldb) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cuComplex *, const cuComplex *,
-      const cuComplex *, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCgtsv_nopivot");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, dl, d, du, B, ldb);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseZgtsv_nopivot(cusparseHandle_t handle, int m, int n,
-                      const cuDoubleComplex *dl, const cuDoubleComplex *d,
-                      const cuDoubleComplex *du, cuDoubleComplex *B, int ldb) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cuDoubleComplex *,
-      const cuDoubleComplex *, const cuDoubleComplex *, cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZgtsv_nopivot");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, dl, d, du, B, ldb);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSgtsv2_nopivot_bufferSizeExt(
-    cusparseHandle_t handle, int m, int n, const float *dl, const float *d,
-    const float *du, const float *B, int ldb, size_t *bufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const float *, const float *, const float *,
-      const float *, int, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseSgtsv2_nopivot_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, dl, d, du, B, ldb, bufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDgtsv2_nopivot_bufferSizeExt(
-    cusparseHandle_t handle, int m, int n, const double *dl, const double *d,
-    const double *du, const double *B, int ldb, size_t *bufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const double *, const double *,
-      const double *, const double *, int, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseDgtsv2_nopivot_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, dl, d, du, B, ldb, bufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCgtsv2_nopivot_bufferSizeExt(
-    cusparseHandle_t handle, int m, int n, const cuComplex *dl,
-    const cuComplex *d, const cuComplex *du, const cuComplex *B, int ldb,
-    size_t *bufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cuComplex *, const cuComplex *,
-      const cuComplex *, const cuComplex *, int, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseCgtsv2_nopivot_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, dl, d, du, B, ldb, bufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZgtsv2_nopivot_bufferSizeExt(
-    cusparseHandle_t handle, int m, int n, const cuDoubleComplex *dl,
-    const cuDoubleComplex *d, const cuDoubleComplex *du,
-    const cuDoubleComplex *B, int ldb, size_t *bufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cuDoubleComplex *,
-      const cuDoubleComplex *, const cuDoubleComplex *, const cuDoubleComplex *,
-      int, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseZgtsv2_nopivot_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, dl, d, du, B, ldb, bufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSgtsv2_nopivot(
-    cusparseHandle_t handle, int m, int n, const float *dl, const float *d,
-    const float *du, float *B, int ldb, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const float *, const float *, const float *,
-      float *, int, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSgtsv2_nopivot");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, dl, d, du, B, ldb, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDgtsv2_nopivot(
-    cusparseHandle_t handle, int m, int n, const double *dl, const double *d,
-    const double *du, double *B, int ldb, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const double *, const double *,
-      const double *, double *, int, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDgtsv2_nopivot");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, dl, d, du, B, ldb, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCgtsv2_nopivot(
-    cusparseHandle_t handle, int m, int n, const cuComplex *dl,
-    const cuComplex *d, const cuComplex *du, cuComplex *B, int ldb,
-    void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cuComplex *, const cuComplex *,
-      const cuComplex *, cuComplex *, int, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCgtsv2_nopivot");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, dl, d, du, B, ldb, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZgtsv2_nopivot(
-    cusparseHandle_t handle, int m, int n, const cuDoubleComplex *dl,
-    const cuDoubleComplex *d, const cuDoubleComplex *du, cuDoubleComplex *B,
-    int ldb, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cuDoubleComplex *,
-      const cuDoubleComplex *, const cuDoubleComplex *, cuDoubleComplex *, int,
-      void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZgtsv2_nopivot");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, dl, d, du, B, ldb, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSgtsvStridedBatch(
-    cusparseHandle_t handle, int m, const float *dl, const float *d,
-    const float *du, float *x, int batchCount, int batchStride) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, const float *, const float *, const float *,
-      float *, int, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSgtsvStridedBatch");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, dl, d, du, x, batchCount, batchStride);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDgtsvStridedBatch(
-    cusparseHandle_t handle, int m, const double *dl, const double *d,
-    const double *du, double *x, int batchCount, int batchStride) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, const double *, const double *, const double *,
-      double *, int, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDgtsvStridedBatch");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, dl, d, du, x, batchCount, batchStride);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCgtsvStridedBatch(
-    cusparseHandle_t handle, int m, const cuComplex *dl, const cuComplex *d,
-    const cuComplex *du, cuComplex *x, int batchCount, int batchStride) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, const cuComplex *, const cuComplex *,
-      const cuComplex *, cuComplex *, int, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCgtsvStridedBatch");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, dl, d, du, x, batchCount, batchStride);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZgtsvStridedBatch(
-    cusparseHandle_t handle, int m, const cuDoubleComplex *dl,
-    const cuDoubleComplex *d, const cuDoubleComplex *du, cuDoubleComplex *x,
-    int batchCount, int batchStride) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, const cuDoubleComplex *, const cuDoubleComplex *,
-      const cuDoubleComplex *, cuDoubleComplex *, int, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZgtsvStridedBatch");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, dl, d, du, x, batchCount, batchStride);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSgtsv2StridedBatch_bufferSizeExt(
-    cusparseHandle_t handle, int m, const float *dl, const float *d,
-    const float *du, const float *x, int batchCount, int batchStride,
-    size_t *bufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, const float *, const float *, const float *,
-      const float *, int, int, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseSgtsv2StridedBatch_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, dl, d, du, x, batchCount, batchStride,
-                  bufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDgtsv2StridedBatch_bufferSizeExt(
-    cusparseHandle_t handle, int m, const double *dl, const double *d,
-    const double *du, const double *x, int batchCount, int batchStride,
-    size_t *bufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, const double *, const double *, const double *,
-      const double *, int, int, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseDgtsv2StridedBatch_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, dl, d, du, x, batchCount, batchStride,
-                  bufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCgtsv2StridedBatch_bufferSizeExt(
-    cusparseHandle_t handle, int m, const cuComplex *dl, const cuComplex *d,
-    const cuComplex *du, const cuComplex *x, int batchCount, int batchStride,
-    size_t *bufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, const cuComplex *, const cuComplex *,
-      const cuComplex *, const cuComplex *, int, int, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseCgtsv2StridedBatch_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, dl, d, du, x, batchCount, batchStride,
-                  bufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZgtsv2StridedBatch_bufferSizeExt(
-    cusparseHandle_t handle, int m, const cuDoubleComplex *dl,
-    const cuDoubleComplex *d, const cuDoubleComplex *du,
-    const cuDoubleComplex *x, int batchCount, int batchStride,
-    size_t *bufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, const cuDoubleComplex *, const cuDoubleComplex *,
-      const cuDoubleComplex *, const cuDoubleComplex *, int, int, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseZgtsv2StridedBatch_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, dl, d, du, x, batchCount, batchStride,
-                  bufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSgtsv2StridedBatch(
-    cusparseHandle_t handle, int m, const float *dl, const float *d,
-    const float *du, float *x, int batchCount, int batchStride, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, const float *, const float *, const float *,
-      float *, int, int, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSgtsv2StridedBatch");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, dl, d, du, x, batchCount, batchStride, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseDgtsv2StridedBatch(cusparseHandle_t handle, int m, const double *dl,
-                           const double *d, const double *du, double *x,
-                           int batchCount, int batchStride, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, const double *, const double *, const double *,
-      double *, int, int, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDgtsv2StridedBatch");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, dl, d, du, x, batchCount, batchStride, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCgtsv2StridedBatch(
-    cusparseHandle_t handle, int m, const cuComplex *dl, const cuComplex *d,
-    const cuComplex *du, cuComplex *x, int batchCount, int batchStride,
-    void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, const cuComplex *, const cuComplex *,
-      const cuComplex *, cuComplex *, int, int, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCgtsv2StridedBatch");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, dl, d, du, x, batchCount, batchStride, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZgtsv2StridedBatch(
-    cusparseHandle_t handle, int m, const cuDoubleComplex *dl,
-    const cuDoubleComplex *d, const cuDoubleComplex *du, cuDoubleComplex *x,
-    int batchCount, int batchStride, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, const cuDoubleComplex *, const cuDoubleComplex *,
-      const cuDoubleComplex *, cuDoubleComplex *, int, int, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZgtsv2StridedBatch");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, dl, d, du, x, batchCount, batchStride, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseXcsrgemmNnz(cusparseHandle_t handle, cusparseOperation_t transA,
-                    cusparseOperation_t transB, int m, int n, int k,
-                    const cusparseMatDescr_t descrA, const int nnzA,
-                    const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-                    const cusparseMatDescr_t descrB, const int nnzB,
-                    const int *csrSortedRowPtrB, const int *csrSortedColIndB,
-                    const cusparseMatDescr_t descrC, int *csrSortedRowPtrC,
-                    int *nnzTotalDevHostPtr) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, cusparseOperation_t, int, int, int,
-      const cusparseMatDescr_t, const int, const int *, const int *,
-      const cusparseMatDescr_t, const int, const int *, const int *,
-      const cusparseMatDescr_t, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseXcsrgemmNnz");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, transB, m, n, k, descrA, nnzA,
-                  csrSortedRowPtrA, csrSortedColIndA, descrB, nnzB,
-                  csrSortedRowPtrB, csrSortedColIndB, descrC, csrSortedRowPtrC,
-                  nnzTotalDevHostPtr);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsrgemm(
-    cusparseHandle_t handle, cusparseOperation_t transA,
-    cusparseOperation_t transB, int m, int n, int k,
-    const cusparseMatDescr_t descrA, const int nnzA, const float *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-    const cusparseMatDescr_t descrB, const int nnzB, const float *csrSortedValB,
-    const int *csrSortedRowPtrB, const int *csrSortedColIndB,
-    const cusparseMatDescr_t descrC, float *csrSortedValC,
-    const int *csrSortedRowPtrC, int *csrSortedColIndC) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, cusparseOperation_t, int, int, int,
-      const cusparseMatDescr_t, const int, const float *, const int *,
-      const int *, const cusparseMatDescr_t, const int, const float *,
-      const int *, const int *, const cusparseMatDescr_t, float *, const int *,
-      int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsrgemm");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, transB, m, n, k, descrA, nnzA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, descrB, nnzB,
-                  csrSortedValB, csrSortedRowPtrB, csrSortedColIndB, descrC,
-                  csrSortedValC, csrSortedRowPtrC, csrSortedColIndC);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsrgemm(
-    cusparseHandle_t handle, cusparseOperation_t transA,
-    cusparseOperation_t transB, int m, int n, int k,
-    const cusparseMatDescr_t descrA, int nnzA, const double *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-    const cusparseMatDescr_t descrB, int nnzB, const double *csrSortedValB,
-    const int *csrSortedRowPtrB, const int *csrSortedColIndB,
-    const cusparseMatDescr_t descrC, double *csrSortedValC,
-    const int *csrSortedRowPtrC, int *csrSortedColIndC) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, cusparseOperation_t, int, int, int,
-      const cusparseMatDescr_t, int, const double *, const int *, const int *,
-      const cusparseMatDescr_t, int, const double *, const int *, const int *,
-      const cusparseMatDescr_t, double *, const int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsrgemm");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, transB, m, n, k, descrA, nnzA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, descrB, nnzB,
-                  csrSortedValB, csrSortedRowPtrB, csrSortedColIndB, descrC,
-                  csrSortedValC, csrSortedRowPtrC, csrSortedColIndC);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsrgemm(
-    cusparseHandle_t handle, cusparseOperation_t transA,
-    cusparseOperation_t transB, int m, int n, int k,
-    const cusparseMatDescr_t descrA, int nnzA, const cuComplex *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-    const cusparseMatDescr_t descrB, int nnzB, const cuComplex *csrSortedValB,
-    const int *csrSortedRowPtrB, const int *csrSortedColIndB,
-    const cusparseMatDescr_t descrC, cuComplex *csrSortedValC,
-    const int *csrSortedRowPtrC, int *csrSortedColIndC) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, cusparseOperation_t, int, int, int,
-      const cusparseMatDescr_t, int, const cuComplex *, const int *,
-      const int *, const cusparseMatDescr_t, int, const cuComplex *,
-      const int *, const int *, const cusparseMatDescr_t, cuComplex *,
-      const int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsrgemm");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, transB, m, n, k, descrA, nnzA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, descrB, nnzB,
-                  csrSortedValB, csrSortedRowPtrB, csrSortedColIndB, descrC,
-                  csrSortedValC, csrSortedRowPtrC, csrSortedColIndC);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsrgemm(
-    cusparseHandle_t handle, cusparseOperation_t transA,
-    cusparseOperation_t transB, int m, int n, int k,
-    const cusparseMatDescr_t descrA, int nnzA,
-    const cuDoubleComplex *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, const cusparseMatDescr_t descrB, int nnzB,
-    const cuDoubleComplex *csrSortedValB, const int *csrSortedRowPtrB,
-    const int *csrSortedColIndB, const cusparseMatDescr_t descrC,
-    cuDoubleComplex *csrSortedValC, const int *csrSortedRowPtrC,
-    int *csrSortedColIndC) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseOperation_t, cusparseOperation_t, int, int, int,
-      const cusparseMatDescr_t, int, const cuDoubleComplex *, const int *,
-      const int *, const cusparseMatDescr_t, int, const cuDoubleComplex *,
-      const int *, const int *, const cusparseMatDescr_t, cuDoubleComplex *,
-      const int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsrgemm");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, transA, transB, m, n, k, descrA, nnzA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, descrB, nnzB,
-                  csrSortedValB, csrSortedRowPtrB, csrSortedColIndB, descrC,
-                  csrSortedValC, csrSortedRowPtrC, csrSortedColIndC);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCreateCsrgemm2Info(csrgemm2Info_t *info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(csrgemm2Info_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCreateCsrgemm2Info");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(info);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDestroyCsrgemm2Info(csrgemm2Info_t info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(csrgemm2Info_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDestroyCsrgemm2Info");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(info);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsrgemm2_bufferSizeExt(
-    cusparseHandle_t handle, int m, int n, int k, const float *alpha,
-    const cusparseMatDescr_t descrA, int nnzA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, const cusparseMatDescr_t descrB, int nnzB,
-    const int *csrSortedRowPtrB, const int *csrSortedColIndB, const float *beta,
-    const cusparseMatDescr_t descrD, int nnzD, const int *csrSortedRowPtrD,
-    const int *csrSortedColIndD, csrgemm2Info_t info,
-    size_t *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const float *, const cusparseMatDescr_t,
-      int, const int *, const int *, const cusparseMatDescr_t, int, const int *,
-      const int *, const float *, const cusparseMatDescr_t, int, const int *,
-      const int *, csrgemm2Info_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsrgemm2_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, k, alpha, descrA, nnzA, csrSortedRowPtrA,
-                  csrSortedColIndA, descrB, nnzB, csrSortedRowPtrB,
-                  csrSortedColIndB, beta, descrD, nnzD, csrSortedRowPtrD,
-                  csrSortedColIndD, info, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsrgemm2_bufferSizeExt(
-    cusparseHandle_t handle, int m, int n, int k, const double *alpha,
-    const cusparseMatDescr_t descrA, int nnzA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, const cusparseMatDescr_t descrB, int nnzB,
-    const int *csrSortedRowPtrB, const int *csrSortedColIndB,
-    const double *beta, const cusparseMatDescr_t descrD, int nnzD,
-    const int *csrSortedRowPtrD, const int *csrSortedColIndD,
-    csrgemm2Info_t info, size_t *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const double *, const cusparseMatDescr_t,
-      int, const int *, const int *, const cusparseMatDescr_t, int, const int *,
-      const int *, const double *, const cusparseMatDescr_t, int, const int *,
-      const int *, csrgemm2Info_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsrgemm2_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, k, alpha, descrA, nnzA, csrSortedRowPtrA,
-                  csrSortedColIndA, descrB, nnzB, csrSortedRowPtrB,
-                  csrSortedColIndB, beta, descrD, nnzD, csrSortedRowPtrD,
-                  csrSortedColIndD, info, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsrgemm2_bufferSizeExt(
-    cusparseHandle_t handle, int m, int n, int k, const cuComplex *alpha,
-    const cusparseMatDescr_t descrA, int nnzA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, const cusparseMatDescr_t descrB, int nnzB,
-    const int *csrSortedRowPtrB, const int *csrSortedColIndB,
-    const cuComplex *beta, const cusparseMatDescr_t descrD, int nnzD,
-    const int *csrSortedRowPtrD, const int *csrSortedColIndD,
-    csrgemm2Info_t info, size_t *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const cuComplex *,
-      const cusparseMatDescr_t, int, const int *, const int *,
-      const cusparseMatDescr_t, int, const int *, const int *,
-      const cuComplex *, const cusparseMatDescr_t, int, const int *,
-      const int *, csrgemm2Info_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsrgemm2_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, k, alpha, descrA, nnzA, csrSortedRowPtrA,
-                  csrSortedColIndA, descrB, nnzB, csrSortedRowPtrB,
-                  csrSortedColIndB, beta, descrD, nnzD, csrSortedRowPtrD,
-                  csrSortedColIndD, info, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsrgemm2_bufferSizeExt(
-    cusparseHandle_t handle, int m, int n, int k, const cuDoubleComplex *alpha,
-    const cusparseMatDescr_t descrA, int nnzA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, const cusparseMatDescr_t descrB, int nnzB,
-    const int *csrSortedRowPtrB, const int *csrSortedColIndB,
-    const cuDoubleComplex *beta, const cusparseMatDescr_t descrD, int nnzD,
-    const int *csrSortedRowPtrD, const int *csrSortedColIndD,
-    csrgemm2Info_t info, size_t *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const cuDoubleComplex *,
-      const cusparseMatDescr_t, int, const int *, const int *,
-      const cusparseMatDescr_t, int, const int *, const int *,
-      const cuDoubleComplex *, const cusparseMatDescr_t, int, const int *,
-      const int *, csrgemm2Info_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsrgemm2_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, k, alpha, descrA, nnzA, csrSortedRowPtrA,
-                  csrSortedColIndA, descrB, nnzB, csrSortedRowPtrB,
-                  csrSortedColIndB, beta, descrD, nnzD, csrSortedRowPtrD,
-                  csrSortedColIndD, info, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseXcsrgemm2Nnz(
-    cusparseHandle_t handle, int m, int n, int k,
-    const cusparseMatDescr_t descrA, int nnzA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, const cusparseMatDescr_t descrB, int nnzB,
-    const int *csrSortedRowPtrB, const int *csrSortedColIndB,
-    const cusparseMatDescr_t descrD, int nnzD, const int *csrSortedRowPtrD,
-    const int *csrSortedColIndD, const cusparseMatDescr_t descrC,
-    int *csrSortedRowPtrC, int *nnzTotalDevHostPtr, const csrgemm2Info_t info,
-    void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const cusparseMatDescr_t, int,
-      const int *, const int *, const cusparseMatDescr_t, int, const int *,
-      const int *, const cusparseMatDescr_t, int, const int *, const int *,
-      const cusparseMatDescr_t, int *, int *, const csrgemm2Info_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseXcsrgemm2Nnz");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, k, descrA, nnzA, csrSortedRowPtrA,
-                  csrSortedColIndA, descrB, nnzB, csrSortedRowPtrB,
-                  csrSortedColIndB, descrD, nnzD, csrSortedRowPtrD,
-                  csrSortedColIndD, descrC, csrSortedRowPtrC,
-                  nnzTotalDevHostPtr, info, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsrgemm2(
-    cusparseHandle_t handle, int m, int n, int k, const float *alpha,
-    const cusparseMatDescr_t descrA, int nnzA, const float *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-    const cusparseMatDescr_t descrB, int nnzB, const float *csrSortedValB,
-    const int *csrSortedRowPtrB, const int *csrSortedColIndB, const float *beta,
-    const cusparseMatDescr_t descrD, int nnzD, const float *csrSortedValD,
-    const int *csrSortedRowPtrD, const int *csrSortedColIndD,
-    const cusparseMatDescr_t descrC, float *csrSortedValC,
-    const int *csrSortedRowPtrC, int *csrSortedColIndC,
-    const csrgemm2Info_t info, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const float *, const cusparseMatDescr_t,
-      int, const float *, const int *, const int *, const cusparseMatDescr_t,
-      int, const float *, const int *, const int *, const float *,
-      const cusparseMatDescr_t, int, const float *, const int *, const int *,
-      const cusparseMatDescr_t, float *, const int *, int *,
-      const csrgemm2Info_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsrgemm2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, k, alpha, descrA, nnzA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, descrB, nnzB,
-                  csrSortedValB, csrSortedRowPtrB, csrSortedColIndB, beta,
-                  descrD, nnzD, csrSortedValD, csrSortedRowPtrD,
-                  csrSortedColIndD, descrC, csrSortedValC, csrSortedRowPtrC,
-                  csrSortedColIndC, info, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsrgemm2(
-    cusparseHandle_t handle, int m, int n, int k, const double *alpha,
-    const cusparseMatDescr_t descrA, int nnzA, const double *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-    const cusparseMatDescr_t descrB, int nnzB, const double *csrSortedValB,
-    const int *csrSortedRowPtrB, const int *csrSortedColIndB,
-    const double *beta, const cusparseMatDescr_t descrD, int nnzD,
-    const double *csrSortedValD, const int *csrSortedRowPtrD,
-    const int *csrSortedColIndD, const cusparseMatDescr_t descrC,
-    double *csrSortedValC, const int *csrSortedRowPtrC, int *csrSortedColIndC,
-    const csrgemm2Info_t info, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const double *, const cusparseMatDescr_t,
-      int, const double *, const int *, const int *, const cusparseMatDescr_t,
-      int, const double *, const int *, const int *, const double *,
-      const cusparseMatDescr_t, int, const double *, const int *, const int *,
-      const cusparseMatDescr_t, double *, const int *, int *,
-      const csrgemm2Info_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsrgemm2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, k, alpha, descrA, nnzA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, descrB, nnzB,
-                  csrSortedValB, csrSortedRowPtrB, csrSortedColIndB, beta,
-                  descrD, nnzD, csrSortedValD, csrSortedRowPtrD,
-                  csrSortedColIndD, descrC, csrSortedValC, csrSortedRowPtrC,
-                  csrSortedColIndC, info, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsrgemm2(
-    cusparseHandle_t handle, int m, int n, int k, const cuComplex *alpha,
-    const cusparseMatDescr_t descrA, int nnzA, const cuComplex *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-    const cusparseMatDescr_t descrB, int nnzB, const cuComplex *csrSortedValB,
-    const int *csrSortedRowPtrB, const int *csrSortedColIndB,
-    const cuComplex *beta, const cusparseMatDescr_t descrD, int nnzD,
-    const cuComplex *csrSortedValD, const int *csrSortedRowPtrD,
-    const int *csrSortedColIndD, const cusparseMatDescr_t descrC,
-    cuComplex *csrSortedValC, const int *csrSortedRowPtrC,
-    int *csrSortedColIndC, const csrgemm2Info_t info, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const cuComplex *,
-      const cusparseMatDescr_t, int, const cuComplex *, const int *,
-      const int *, const cusparseMatDescr_t, int, const cuComplex *,
-      const int *, const int *, const cuComplex *, const cusparseMatDescr_t,
-      int, const cuComplex *, const int *, const int *,
-      const cusparseMatDescr_t, cuComplex *, const int *, int *,
-      const csrgemm2Info_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsrgemm2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, k, alpha, descrA, nnzA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, descrB, nnzB,
-                  csrSortedValB, csrSortedRowPtrB, csrSortedColIndB, beta,
-                  descrD, nnzD, csrSortedValD, csrSortedRowPtrD,
-                  csrSortedColIndD, descrC, csrSortedValC, csrSortedRowPtrC,
-                  csrSortedColIndC, info, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsrgemm2(
-    cusparseHandle_t handle, int m, int n, int k, const cuDoubleComplex *alpha,
-    const cusparseMatDescr_t descrA, int nnzA,
-    const cuDoubleComplex *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, const cusparseMatDescr_t descrB, int nnzB,
-    const cuDoubleComplex *csrSortedValB, const int *csrSortedRowPtrB,
-    const int *csrSortedColIndB, const cuDoubleComplex *beta,
-    const cusparseMatDescr_t descrD, int nnzD,
-    const cuDoubleComplex *csrSortedValD, const int *csrSortedRowPtrD,
-    const int *csrSortedColIndD, const cusparseMatDescr_t descrC,
-    cuDoubleComplex *csrSortedValC, const int *csrSortedRowPtrC,
-    int *csrSortedColIndC, const csrgemm2Info_t info, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const cuDoubleComplex *,
-      const cusparseMatDescr_t, int, const cuDoubleComplex *, const int *,
-      const int *, const cusparseMatDescr_t, int, const cuDoubleComplex *,
-      const int *, const int *, const cuDoubleComplex *,
-      const cusparseMatDescr_t, int, const cuDoubleComplex *, const int *,
-      const int *, const cusparseMatDescr_t, cuDoubleComplex *, const int *,
-      int *, const csrgemm2Info_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsrgemm2");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, k, alpha, descrA, nnzA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, descrB, nnzB,
-                  csrSortedValB, csrSortedRowPtrB, csrSortedColIndB, beta,
-                  descrD, nnzD, csrSortedValD, csrSortedRowPtrD,
-                  csrSortedColIndD, descrC, csrSortedValC, csrSortedRowPtrC,
-                  csrSortedColIndC, info, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseXcsrgeamNnz(
-    cusparseHandle_t handle, int m, int n, const cusparseMatDescr_t descrA,
-    int nnzA, const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-    const cusparseMatDescr_t descrB, int nnzB, const int *csrSortedRowPtrB,
-    const int *csrSortedColIndB, const cusparseMatDescr_t descrC,
-    int *csrSortedRowPtrC, int *nnzTotalDevHostPtr) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, int, const int *,
-      const int *, const cusparseMatDescr_t, int, const int *, const int *,
-      const cusparseMatDescr_t, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseXcsrgeamNnz");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, descrA, nnzA, csrSortedRowPtrA,
-                  csrSortedColIndA, descrB, nnzB, csrSortedRowPtrB,
-                  csrSortedColIndB, descrC, csrSortedRowPtrC,
-                  nnzTotalDevHostPtr);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsrgeam(
-    cusparseHandle_t handle, int m, int n, const float *alpha,
-    const cusparseMatDescr_t descrA, int nnzA, const float *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA, const float *beta,
-    const cusparseMatDescr_t descrB, int nnzB, const float *csrSortedValB,
-    const int *csrSortedRowPtrB, const int *csrSortedColIndB,
-    const cusparseMatDescr_t descrC, float *csrSortedValC,
-    int *csrSortedRowPtrC, int *csrSortedColIndC) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const float *, const cusparseMatDescr_t, int,
-      const float *, const int *, const int *, const float *,
-      const cusparseMatDescr_t, int, const float *, const int *, const int *,
-      const cusparseMatDescr_t, float *, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsrgeam");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, alpha, descrA, nnzA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, beta, descrB, nnzB,
-                  csrSortedValB, csrSortedRowPtrB, csrSortedColIndB, descrC,
-                  csrSortedValC, csrSortedRowPtrC, csrSortedColIndC);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsrgeam(
-    cusparseHandle_t handle, int m, int n, const double *alpha,
-    const cusparseMatDescr_t descrA, int nnzA, const double *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-    const double *beta, const cusparseMatDescr_t descrB, int nnzB,
-    const double *csrSortedValB, const int *csrSortedRowPtrB,
-    const int *csrSortedColIndB, const cusparseMatDescr_t descrC,
-    double *csrSortedValC, int *csrSortedRowPtrC, int *csrSortedColIndC) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const double *, const cusparseMatDescr_t, int,
-      const double *, const int *, const int *, const double *,
-      const cusparseMatDescr_t, int, const double *, const int *, const int *,
-      const cusparseMatDescr_t, double *, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsrgeam");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, alpha, descrA, nnzA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, beta, descrB, nnzB,
-                  csrSortedValB, csrSortedRowPtrB, csrSortedColIndB, descrC,
-                  csrSortedValC, csrSortedRowPtrC, csrSortedColIndC);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsrgeam(
-    cusparseHandle_t handle, int m, int n, const cuComplex *alpha,
-    const cusparseMatDescr_t descrA, int nnzA, const cuComplex *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-    const cuComplex *beta, const cusparseMatDescr_t descrB, int nnzB,
-    const cuComplex *csrSortedValB, const int *csrSortedRowPtrB,
-    const int *csrSortedColIndB, const cusparseMatDescr_t descrC,
-    cuComplex *csrSortedValC, int *csrSortedRowPtrC, int *csrSortedColIndC) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cuComplex *, const cusparseMatDescr_t,
-      int, const cuComplex *, const int *, const int *, const cuComplex *,
-      const cusparseMatDescr_t, int, const cuComplex *, const int *,
-      const int *, const cusparseMatDescr_t, cuComplex *, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsrgeam");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, alpha, descrA, nnzA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, beta, descrB, nnzB,
-                  csrSortedValB, csrSortedRowPtrB, csrSortedColIndB, descrC,
-                  csrSortedValC, csrSortedRowPtrC, csrSortedColIndC);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsrgeam(
-    cusparseHandle_t handle, int m, int n, const cuDoubleComplex *alpha,
-    const cusparseMatDescr_t descrA, int nnzA,
-    const cuDoubleComplex *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, const cuDoubleComplex *beta,
-    const cusparseMatDescr_t descrB, int nnzB,
-    const cuDoubleComplex *csrSortedValB, const int *csrSortedRowPtrB,
-    const int *csrSortedColIndB, const cusparseMatDescr_t descrC,
-    cuDoubleComplex *csrSortedValC, int *csrSortedRowPtrC,
-    int *csrSortedColIndC) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cuDoubleComplex *,
-      const cusparseMatDescr_t, int, const cuDoubleComplex *, const int *,
-      const int *, const cuDoubleComplex *, const cusparseMatDescr_t, int,
-      const cuDoubleComplex *, const int *, const int *,
-      const cusparseMatDescr_t, cuDoubleComplex *, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsrgeam");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, alpha, descrA, nnzA, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA, beta, descrB, nnzB,
-                  csrSortedValB, csrSortedRowPtrB, csrSortedColIndB, descrC,
-                  csrSortedValC, csrSortedRowPtrC, csrSortedColIndC);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsrcolor(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    const float *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, const float *fractionToColor, int *ncolors,
-    int *coloring, int *reordering, const cusparseColorInfo_t info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, const float *,
-      const int *, const int *, const float *, int *, int *, int *,
-      const cusparseColorInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsrcolor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, fractionToColor, ncolors, coloring,
-                  reordering, info);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsrcolor(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    const double *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, const double *fractionToColor, int *ncolors,
-    int *coloring, int *reordering, const cusparseColorInfo_t info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, const double *,
-      const int *, const int *, const double *, int *, int *, int *,
-      const cusparseColorInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsrcolor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, fractionToColor, ncolors, coloring,
-                  reordering, info);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsrcolor(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    const cuComplex *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, const float *fractionToColor, int *ncolors,
-    int *coloring, int *reordering, const cusparseColorInfo_t info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, const cuComplex *,
-      const int *, const int *, const float *, int *, int *, int *,
-      const cusparseColorInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsrcolor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, fractionToColor, ncolors, coloring,
-                  reordering, info);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsrcolor(
-    cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA,
-    const cuDoubleComplex *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, const double *fractionToColor, int *ncolors,
-    int *coloring, int *reordering, const cusparseColorInfo_t info) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t,
-      const cuDoubleComplex *, const int *, const int *, const double *, int *,
-      int *, int *, const cusparseColorInfo_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsrcolor");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, nnz, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, fractionToColor, ncolors, coloring,
-                  reordering, info);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseSnnz(cusparseHandle_t handle, cusparseDirection_t dirA, int m, int n,
-             const cusparseMatDescr_t descrA, const float *A, int lda,
-             int *nnzPerRowCol, int *nnzTotalDevHostPtr) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const float *, int, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSnnz");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, m, n, descrA, A, lda, nnzPerRowCol,
-                  nnzTotalDevHostPtr);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseDnnz(cusparseHandle_t handle, cusparseDirection_t dirA, int m, int n,
-             const cusparseMatDescr_t descrA, const double *A, int lda,
-             int *nnzPerRowCol, int *nnzTotalDevHostPtr) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const double *, int, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDnnz");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, m, n, descrA, A, lda, nnzPerRowCol,
-                  nnzTotalDevHostPtr);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseCnnz(cusparseHandle_t handle, cusparseDirection_t dirA, int m, int n,
-             const cusparseMatDescr_t descrA, const cuComplex *A, int lda,
-             int *nnzPerRowCol, int *nnzTotalDevHostPtr) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const cuComplex *, int, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCnnz");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, m, n, descrA, A, lda, nnzPerRowCol,
-                  nnzTotalDevHostPtr);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseZnnz(cusparseHandle_t handle, cusparseDirection_t dirA, int m, int n,
-             const cusparseMatDescr_t descrA, const cuDoubleComplex *A, int lda,
-             int *nnzPerRowCol, int *nnzTotalDevHostPtr) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const cuDoubleComplex *, int, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZnnz");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, m, n, descrA, A, lda, nnzPerRowCol,
-                  nnzTotalDevHostPtr);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSnnz_compress(
-    cusparseHandle_t handle, int m, const cusparseMatDescr_t descr,
-    const float *csrValA, const int *csrRowPtrA, int *nnzPerRow, int *nnzC,
-    float tol) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, const cusparseMatDescr_t, const float *,
-      const int *, int *, int *, float);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSnnz_compress");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, descr, csrValA, csrRowPtrA, nnzPerRow, nnzC, tol);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDnnz_compress(
-    cusparseHandle_t handle, int m, const cusparseMatDescr_t descr,
-    const double *csrValA, const int *csrRowPtrA, int *nnzPerRow, int *nnzC,
-    double tol) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, const cusparseMatDescr_t, const double *,
-      const int *, int *, int *, double);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDnnz_compress");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, descr, csrValA, csrRowPtrA, nnzPerRow, nnzC, tol);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCnnz_compress(
-    cusparseHandle_t handle, int m, const cusparseMatDescr_t descr,
-    const cuComplex *csrValA, const int *csrRowPtrA, int *nnzPerRow, int *nnzC,
-    cuComplex tol) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, const cusparseMatDescr_t, const cuComplex *,
-      const int *, int *, int *, cuComplex);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCnnz_compress");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, descr, csrValA, csrRowPtrA, nnzPerRow, nnzC, tol);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZnnz_compress(
-    cusparseHandle_t handle, int m, const cusparseMatDescr_t descr,
-    const cuDoubleComplex *csrValA, const int *csrRowPtrA, int *nnzPerRow,
-    int *nnzC, cuDoubleComplex tol) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, const cusparseMatDescr_t, const cuDoubleComplex *,
-      const int *, int *, int *, cuDoubleComplex);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZnnz_compress");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, descr, csrValA, csrRowPtrA, nnzPerRow, nnzC, tol);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsr2csr_compress(
-    cusparseHandle_t handle, int m, int n, const cusparseMatDescr_t descra,
-    const float *csrValA, const int *csrColIndA, const int *csrRowPtrA,
-    int nnzA, const int *nnzPerRow, float *csrValC, int *csrColIndC,
-    int *csrRowPtrC, float tol) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, const float *,
-      const int *, const int *, int, const int *, float *, int *, int *, float);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsr2csr_compress");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, descra, csrValA, csrColIndA, csrRowPtrA, nnzA,
-                  nnzPerRow, csrValC, csrColIndC, csrRowPtrC, tol);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsr2csr_compress(
-    cusparseHandle_t handle,
-    int m,  // number of rows
-    int n, const cusparseMatDescr_t descra,
-    const double *csrValA,  // csr values array-the elements which are below a
-                            // certain tolerance will be remvoed
-    const int *csrColIndA,
-    const int *csrRowPtrA,  // corresponding input noncompressed row pointer
-    int nnzA, const int *nnzPerRow, double *csrValC, int *csrColIndC,
-    int *csrRowPtrC, double tol) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, const double *,
-      const int *, const int *, int, const int *, double *, int *, int *,
-      double);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsr2csr_compress");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, descra, csrValA, csrColIndA, csrRowPtrA, nnzA,
-                  nnzPerRow, csrValC, csrColIndC, csrRowPtrC, tol);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsr2csr_compress(
-    cusparseHandle_t handle,
-    int m,  // number of rows
-    int n, const cusparseMatDescr_t descra,
-    const cuComplex *csrValA,  // csr values array-the elements which are below
-                               // a certain tolerance will be remvoed
-    const int *csrColIndA,
-    const int *csrRowPtrA,  // corresponding input noncompressed row pointer
-    int nnzA, const int *nnzPerRow, cuComplex *csrValC, int *csrColIndC,
-    int *csrRowPtrC, cuComplex tol) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, const cuComplex *,
-      const int *, const int *, int, const int *, cuComplex *, int *, int *,
-      cuComplex);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsr2csr_compress");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, descra, csrValA, csrColIndA, csrRowPtrA, nnzA,
-                  nnzPerRow, csrValC, csrColIndC, csrRowPtrC, tol);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsr2csr_compress(
-    cusparseHandle_t handle,
-    int m,  // number of rows
-    int n, const cusparseMatDescr_t descra,
-    const cuDoubleComplex
-        *csrValA,  // csr values array-the elements which are below a certain
-                   // tolerance will be remvoed
-    const int *csrColIndA,
-    const int *csrRowPtrA,  // corresponding input noncompressed row pointer
-    int nnzA, const int *nnzPerRow, cuDoubleComplex *csrValC, int *csrColIndC,
-    int *csrRowPtrC, cuDoubleComplex tol) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t,
-      const cuDoubleComplex *, const int *, const int *, int, const int *,
-      cuDoubleComplex *, int *, int *, cuDoubleComplex);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsr2csr_compress");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, descra, csrValA, csrColIndA, csrRowPtrA, nnzA,
-                  nnzPerRow, csrValC, csrColIndC, csrRowPtrC, tol);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSdense2csr(
-    cusparseHandle_t handle, int m, int n, const cusparseMatDescr_t descrA,
-    const float *A, int lda, const int *nnzPerRow, float *csrSortedValA,
-    int *csrSortedRowPtrA, int *csrSortedColIndA) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, const float *, int,
-      const int *, float *, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSdense2csr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, descrA, A, lda, nnzPerRow, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDdense2csr(
-    cusparseHandle_t handle, int m, int n, const cusparseMatDescr_t descrA,
-    const double *A, int lda, const int *nnzPerRow, double *csrSortedValA,
-    int *csrSortedRowPtrA, int *csrSortedColIndA) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, const double *, int,
-      const int *, double *, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDdense2csr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, descrA, A, lda, nnzPerRow, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCdense2csr(
-    cusparseHandle_t handle, int m, int n, const cusparseMatDescr_t descrA,
-    const cuComplex *A, int lda, const int *nnzPerRow, cuComplex *csrSortedValA,
-    int *csrSortedRowPtrA, int *csrSortedColIndA) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, const cuComplex *,
-      int, const int *, cuComplex *, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCdense2csr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, descrA, A, lda, nnzPerRow, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZdense2csr(
-    cusparseHandle_t handle, int m, int n, const cusparseMatDescr_t descrA,
-    const cuDoubleComplex *A, int lda, const int *nnzPerRow,
-    cuDoubleComplex *csrSortedValA, int *csrSortedRowPtrA,
-    int *csrSortedColIndA) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t,
-      const cuDoubleComplex *, int, const int *, cuDoubleComplex *, int *,
-      int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZdense2csr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, descrA, A, lda, nnzPerRow, csrSortedValA,
-                  csrSortedRowPtrA, csrSortedColIndA);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsr2dense(
-    cusparseHandle_t handle, int m, int n, const cusparseMatDescr_t descrA,
-    const float *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, float *A, int lda) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, const float *,
-      const int *, const int *, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsr2dense");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, A, lda);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsr2dense(
-    cusparseHandle_t handle, int m, int n, const cusparseMatDescr_t descrA,
-    const double *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, double *A, int lda) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, const double *,
-      const int *, const int *, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsr2dense");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, A, lda);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsr2dense(
-    cusparseHandle_t handle, int m, int n, const cusparseMatDescr_t descrA,
-    const cuComplex *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, cuComplex *A, int lda) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, const cuComplex *,
-      const int *, const int *, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsr2dense");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, A, lda);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsr2dense(
-    cusparseHandle_t handle, int m, int n, const cusparseMatDescr_t descrA,
-    const cuDoubleComplex *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, cuDoubleComplex *A, int lda) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t,
-      const cuDoubleComplex *, const int *, const int *, cuDoubleComplex *,
-      int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsr2dense");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, A, lda);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSdense2csc(
-    cusparseHandle_t handle, int m, int n, const cusparseMatDescr_t descrA,
-    const float *A, int lda, const int *nnzPerCol, float *cscSortedValA,
-    int *cscSortedRowIndA, int *cscSortedColPtrA) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, const float *, int,
-      const int *, float *, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSdense2csc");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, descrA, A, lda, nnzPerCol, cscSortedValA,
-                  cscSortedRowIndA, cscSortedColPtrA);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDdense2csc(
-    cusparseHandle_t handle, int m, int n, const cusparseMatDescr_t descrA,
-    const double *A, int lda, const int *nnzPerCol, double *cscSortedValA,
-    int *cscSortedRowIndA, int *cscSortedColPtrA) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, const double *, int,
-      const int *, double *, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDdense2csc");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, descrA, A, lda, nnzPerCol, cscSortedValA,
-                  cscSortedRowIndA, cscSortedColPtrA);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCdense2csc(
-    cusparseHandle_t handle, int m, int n, const cusparseMatDescr_t descrA,
-    const cuComplex *A, int lda, const int *nnzPerCol, cuComplex *cscSortedValA,
-    int *cscSortedRowIndA, int *cscSortedColPtrA) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, const cuComplex *,
-      int, const int *, cuComplex *, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCdense2csc");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, descrA, A, lda, nnzPerCol, cscSortedValA,
-                  cscSortedRowIndA, cscSortedColPtrA);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZdense2csc(
-    cusparseHandle_t handle, int m, int n, const cusparseMatDescr_t descrA,
-    const cuDoubleComplex *A, int lda, const int *nnzPerCol,
-    cuDoubleComplex *cscSortedValA, int *cscSortedRowIndA,
-    int *cscSortedColPtrA) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t,
-      const cuDoubleComplex *, int, const int *, cuDoubleComplex *, int *,
-      int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZdense2csc");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, descrA, A, lda, nnzPerCol, cscSortedValA,
-                  cscSortedRowIndA, cscSortedColPtrA);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsc2dense(
-    cusparseHandle_t handle, int m, int n, const cusparseMatDescr_t descrA,
-    const float *cscSortedValA, const int *cscSortedRowIndA,
-    const int *cscSortedColPtrA, float *A, int lda) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, const float *,
-      const int *, const int *, float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsc2dense");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, descrA, cscSortedValA, cscSortedRowIndA,
-                  cscSortedColPtrA, A, lda);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsc2dense(
-    cusparseHandle_t handle, int m, int n, const cusparseMatDescr_t descrA,
-    const double *cscSortedValA, const int *cscSortedRowIndA,
-    const int *cscSortedColPtrA, double *A, int lda) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, const double *,
-      const int *, const int *, double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsc2dense");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, descrA, cscSortedValA, cscSortedRowIndA,
-                  cscSortedColPtrA, A, lda);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsc2dense(
-    cusparseHandle_t handle, int m, int n, const cusparseMatDescr_t descrA,
-    const cuComplex *cscSortedValA, const int *cscSortedRowIndA,
-    const int *cscSortedColPtrA, cuComplex *A, int lda) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, const cuComplex *,
-      const int *, const int *, cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsc2dense");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, descrA, cscSortedValA, cscSortedRowIndA,
-                  cscSortedColPtrA, A, lda);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsc2dense(
-    cusparseHandle_t handle, int m, int n, const cusparseMatDescr_t descrA,
-    const cuDoubleComplex *cscSortedValA, const int *cscSortedRowIndA,
-    const int *cscSortedColPtrA, cuDoubleComplex *A, int lda) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t,
-      const cuDoubleComplex *, const int *, const int *, cuDoubleComplex *,
-      int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsc2dense");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, descrA, cscSortedValA, cscSortedRowIndA,
-                  cscSortedColPtrA, A, lda);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseXcoo2csr(cusparseHandle_t handle,
-                                              const int *cooRowInd, int nnz,
-                                              int m, int *csrSortedRowPtr,
-                                              cusparseIndexBase_t idxBase) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, const int *, int, int, int *, cusparseIndexBase_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseXcoo2csr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, cooRowInd, nnz, m, csrSortedRowPtr, idxBase);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseXcsr2coo(cusparseHandle_t handle,
-                                              const int *csrSortedRowPtr,
-                                              int nnz, int m, int *cooRowInd,
-                                              cusparseIndexBase_t idxBase) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, const int *, int, int, int *, cusparseIndexBase_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseXcsr2coo");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, csrSortedRowPtr, nnz, m, cooRowInd, idxBase);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCsr2cscEx(
-    cusparseHandle_t handle, int m, int n, int nnz, const void *csrSortedVal,
-    cudaDataType csrSortedValtype, const int *csrSortedRowPtr,
-    const int *csrSortedColInd, void *cscSortedVal,
-    cudaDataType cscSortedValtype, int *cscSortedRowInd, int *cscSortedColPtr,
-    cusparseAction_t copyValues, cusparseIndexBase_t idxBase,
-    cudaDataType executiontype) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const void *, cudaDataType, const int *,
-      const int *, void *, cudaDataType, int *, int *, cusparseAction_t,
-      cusparseIndexBase_t, cudaDataType);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCsr2cscEx");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnz, csrSortedVal, csrSortedValtype,
-                  csrSortedRowPtr, csrSortedColInd, cscSortedVal,
-                  cscSortedValtype, cscSortedRowInd, cscSortedColPtr,
-                  copyValues, idxBase, executiontype);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsr2csc(
-    cusparseHandle_t handle, int m, int n, int nnz, const float *csrSortedVal,
-    const int *csrSortedRowPtr, const int *csrSortedColInd, float *cscSortedVal,
-    int *cscSortedRowInd, int *cscSortedColPtr, cusparseAction_t copyValues,
-    cusparseIndexBase_t idxBase) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const float *, const int *, const int *,
-      float *, int *, int *, cusparseAction_t, cusparseIndexBase_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsr2csc");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnz, csrSortedVal, csrSortedRowPtr,
-                  csrSortedColInd, cscSortedVal, cscSortedRowInd,
-                  cscSortedColPtr, copyValues, idxBase);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsr2csc(
-    cusparseHandle_t handle, int m, int n, int nnz, const double *csrSortedVal,
-    const int *csrSortedRowPtr, const int *csrSortedColInd,
-    double *cscSortedVal, int *cscSortedRowInd, int *cscSortedColPtr,
-    cusparseAction_t copyValues, cusparseIndexBase_t idxBase) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const double *, const int *, const int *,
-      double *, int *, int *, cusparseAction_t, cusparseIndexBase_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsr2csc");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnz, csrSortedVal, csrSortedRowPtr,
-                  csrSortedColInd, cscSortedVal, cscSortedRowInd,
-                  cscSortedColPtr, copyValues, idxBase);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseCcsr2csc(cusparseHandle_t handle, int m, int n, int nnz,
-                 const cuComplex *csrSortedVal, const int *csrSortedRowPtr,
-                 const int *csrSortedColInd, cuComplex *cscSortedVal,
-                 int *cscSortedRowInd, int *cscSortedColPtr,
-                 cusparseAction_t copyValues, cusparseIndexBase_t idxBase) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const cuComplex *, const int *,
-      const int *, cuComplex *, int *, int *, cusparseAction_t,
-      cusparseIndexBase_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsr2csc");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnz, csrSortedVal, csrSortedRowPtr,
-                  csrSortedColInd, cscSortedVal, cscSortedRowInd,
-                  cscSortedColPtr, copyValues, idxBase);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsr2csc(
-    cusparseHandle_t handle, int m, int n, int nnz,
-    const cuDoubleComplex *csrSortedVal, const int *csrSortedRowPtr,
-    const int *csrSortedColInd, cuDoubleComplex *cscSortedVal,
-    int *cscSortedRowInd, int *cscSortedColPtr, cusparseAction_t copyValues,
-    cusparseIndexBase_t idxBase) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const cuDoubleComplex *, const int *,
-      const int *, cuDoubleComplex *, int *, int *, cusparseAction_t,
-      cusparseIndexBase_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsr2csc");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnz, csrSortedVal, csrSortedRowPtr,
-                  csrSortedColInd, cscSortedVal, cscSortedRowInd,
-                  cscSortedColPtr, copyValues, idxBase);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSdense2hyb(
-    cusparseHandle_t handle, int m, int n, const cusparseMatDescr_t descrA,
-    const float *A, int lda, const int *nnzPerRow, cusparseHybMat_t hybA,
-    int userEllWidth, cusparseHybPartition_t partitionType) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, const float *, int,
-      const int *, cusparseHybMat_t, int, cusparseHybPartition_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSdense2hyb");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, descrA, A, lda, nnzPerRow, hybA, userEllWidth,
-                  partitionType);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDdense2hyb(
-    cusparseHandle_t handle, int m, int n, const cusparseMatDescr_t descrA,
-    const double *A, int lda, const int *nnzPerRow, cusparseHybMat_t hybA,
-    int userEllWidth, cusparseHybPartition_t partitionType) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, const double *, int,
-      const int *, cusparseHybMat_t, int, cusparseHybPartition_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDdense2hyb");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, descrA, A, lda, nnzPerRow, hybA, userEllWidth,
-                  partitionType);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCdense2hyb(
-    cusparseHandle_t handle, int m, int n, const cusparseMatDescr_t descrA,
-    const cuComplex *A, int lda, const int *nnzPerRow, cusparseHybMat_t hybA,
-    int userEllWidth, cusparseHybPartition_t partitionType) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, const cuComplex *,
-      int, const int *, cusparseHybMat_t, int, cusparseHybPartition_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCdense2hyb");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, descrA, A, lda, nnzPerRow, hybA, userEllWidth,
-                  partitionType);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseZdense2hyb(cusparseHandle_t handle, int m, int n,
-                   const cusparseMatDescr_t descrA, const cuDoubleComplex *A,
-                   int lda, const int *nnzPerRow, cusparseHybMat_t hybA,
-                   int userEllWidth, cusparseHybPartition_t partitionType) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t,
-      const cuDoubleComplex *, int, const int *, cusparseHybMat_t, int,
-      cusparseHybPartition_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZdense2hyb");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, descrA, A, lda, nnzPerRow, hybA, userEllWidth,
-                  partitionType);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseShyb2dense(cusparseHandle_t handle,
-                                                const cusparseMatDescr_t descrA,
-                                                const cusparseHybMat_t hybA,
-                                                float *A, int lda) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, const cusparseMatDescr_t, const cusparseHybMat_t,
-      float *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseShyb2dense");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, descrA, hybA, A, lda);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDhyb2dense(cusparseHandle_t handle,
-                                                const cusparseMatDescr_t descrA,
-                                                const cusparseHybMat_t hybA,
-                                                double *A, int lda) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, const cusparseMatDescr_t, const cusparseHybMat_t,
-      double *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDhyb2dense");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, descrA, hybA, A, lda);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseChyb2dense(cusparseHandle_t handle,
-                                                const cusparseMatDescr_t descrA,
-                                                const cusparseHybMat_t hybA,
-                                                cuComplex *A, int lda) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, const cusparseMatDescr_t, const cusparseHybMat_t,
-      cuComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseChyb2dense");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, descrA, hybA, A, lda);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZhyb2dense(cusparseHandle_t handle,
-                                                const cusparseMatDescr_t descrA,
-                                                const cusparseHybMat_t hybA,
-                                                cuDoubleComplex *A, int lda) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, const cusparseMatDescr_t, const cusparseHybMat_t,
-      cuDoubleComplex *, int);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZhyb2dense");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, descrA, hybA, A, lda);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsr2hyb(
-    cusparseHandle_t handle, int m, int n, const cusparseMatDescr_t descrA,
-    const float *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, cusparseHybMat_t hybA, int userEllWidth,
-    cusparseHybPartition_t partitionType) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, const float *,
-      const int *, const int *, cusparseHybMat_t, int, cusparseHybPartition_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsr2hyb");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, hybA, userEllWidth, partitionType);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsr2hyb(
-    cusparseHandle_t handle, int m, int n, const cusparseMatDescr_t descrA,
-    const double *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, cusparseHybMat_t hybA, int userEllWidth,
-    cusparseHybPartition_t partitionType) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, const double *,
-      const int *, const int *, cusparseHybMat_t, int, cusparseHybPartition_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsr2hyb");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, hybA, userEllWidth, partitionType);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsr2hyb(
-    cusparseHandle_t handle, int m, int n, const cusparseMatDescr_t descrA,
-    const cuComplex *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, cusparseHybMat_t hybA, int userEllWidth,
-    cusparseHybPartition_t partitionType) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, const cuComplex *,
-      const int *, const int *, cusparseHybMat_t, int, cusparseHybPartition_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsr2hyb");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, hybA, userEllWidth, partitionType);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsr2hyb(
-    cusparseHandle_t handle, int m, int n, const cusparseMatDescr_t descrA,
-    const cuDoubleComplex *csrSortedValA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, cusparseHybMat_t hybA, int userEllWidth,
-    cusparseHybPartition_t partitionType) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t,
-      const cuDoubleComplex *, const int *, const int *, cusparseHybMat_t, int,
-      cusparseHybPartition_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsr2hyb");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, hybA, userEllWidth, partitionType);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseShyb2csr(cusparseHandle_t handle,
-                                              const cusparseMatDescr_t descrA,
-                                              const cusparseHybMat_t hybA,
-                                              float *csrSortedValA,
-                                              int *csrSortedRowPtrA,
-                                              int *csrSortedColIndA) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, const cusparseMatDescr_t, const cusparseHybMat_t,
-      float *, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseShyb2csr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, descrA, hybA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDhyb2csr(cusparseHandle_t handle,
-                                              const cusparseMatDescr_t descrA,
-                                              const cusparseHybMat_t hybA,
-                                              double *csrSortedValA,
-                                              int *csrSortedRowPtrA,
-                                              int *csrSortedColIndA) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, const cusparseMatDescr_t, const cusparseHybMat_t,
-      double *, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDhyb2csr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, descrA, hybA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseChyb2csr(cusparseHandle_t handle,
-                                              const cusparseMatDescr_t descrA,
-                                              const cusparseHybMat_t hybA,
-                                              cuComplex *csrSortedValA,
-                                              int *csrSortedRowPtrA,
-                                              int *csrSortedColIndA) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, const cusparseMatDescr_t, const cusparseHybMat_t,
-      cuComplex *, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseChyb2csr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, descrA, hybA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZhyb2csr(cusparseHandle_t handle,
-                                              const cusparseMatDescr_t descrA,
-                                              const cusparseHybMat_t hybA,
-                                              cuDoubleComplex *csrSortedValA,
-                                              int *csrSortedRowPtrA,
-                                              int *csrSortedColIndA) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, const cusparseMatDescr_t, const cusparseHybMat_t,
-      cuDoubleComplex *, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZhyb2csr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, descrA, hybA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsc2hyb(
-    cusparseHandle_t handle, int m, int n, const cusparseMatDescr_t descrA,
-    const float *cscSortedValA, const int *cscSortedRowIndA,
-    const int *cscSortedColPtrA, cusparseHybMat_t hybA, int userEllWidth,
-    cusparseHybPartition_t partitionType) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, const float *,
-      const int *, const int *, cusparseHybMat_t, int, cusparseHybPartition_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsc2hyb");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, descrA, cscSortedValA, cscSortedRowIndA,
-                  cscSortedColPtrA, hybA, userEllWidth, partitionType);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsc2hyb(
-    cusparseHandle_t handle, int m, int n, const cusparseMatDescr_t descrA,
-    const double *cscSortedValA, const int *cscSortedRowIndA,
-    const int *cscSortedColPtrA, cusparseHybMat_t hybA, int userEllWidth,
-    cusparseHybPartition_t partitionType) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, const double *,
-      const int *, const int *, cusparseHybMat_t, int, cusparseHybPartition_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsc2hyb");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, descrA, cscSortedValA, cscSortedRowIndA,
-                  cscSortedColPtrA, hybA, userEllWidth, partitionType);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsc2hyb(
-    cusparseHandle_t handle, int m, int n, const cusparseMatDescr_t descrA,
-    const cuComplex *cscSortedValA, const int *cscSortedRowIndA,
-    const int *cscSortedColPtrA, cusparseHybMat_t hybA, int userEllWidth,
-    cusparseHybPartition_t partitionType) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t, const cuComplex *,
-      const int *, const int *, cusparseHybMat_t, int, cusparseHybPartition_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsc2hyb");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, descrA, cscSortedValA, cscSortedRowIndA,
-                  cscSortedColPtrA, hybA, userEllWidth, partitionType);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsc2hyb(
-    cusparseHandle_t handle, int m, int n, const cusparseMatDescr_t descrA,
-    const cuDoubleComplex *cscSortedValA, const int *cscSortedRowIndA,
-    const int *cscSortedColPtrA, cusparseHybMat_t hybA, int userEllWidth,
-    cusparseHybPartition_t partitionType) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const cusparseMatDescr_t,
-      const cuDoubleComplex *, const int *, const int *, cusparseHybMat_t, int,
-      cusparseHybPartition_t);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsc2hyb");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, descrA, cscSortedValA, cscSortedRowIndA,
-                  cscSortedColPtrA, hybA, userEllWidth, partitionType);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseShyb2csc(cusparseHandle_t handle,
-                                              const cusparseMatDescr_t descrA,
-                                              const cusparseHybMat_t hybA,
-                                              float *cscSortedVal,
-                                              int *cscSortedRowInd,
-                                              int *cscSortedColPtr) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, const cusparseMatDescr_t, const cusparseHybMat_t,
-      float *, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseShyb2csc");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, descrA, hybA, cscSortedVal, cscSortedRowInd,
-                  cscSortedColPtr);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDhyb2csc(cusparseHandle_t handle,
-                                              const cusparseMatDescr_t descrA,
-                                              const cusparseHybMat_t hybA,
-                                              double *cscSortedVal,
-                                              int *cscSortedRowInd,
-                                              int *cscSortedColPtr) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, const cusparseMatDescr_t, const cusparseHybMat_t,
-      double *, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDhyb2csc");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, descrA, hybA, cscSortedVal, cscSortedRowInd,
-                  cscSortedColPtr);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseChyb2csc(cusparseHandle_t handle,
-                                              const cusparseMatDescr_t descrA,
-                                              const cusparseHybMat_t hybA,
-                                              cuComplex *cscSortedVal,
-                                              int *cscSortedRowInd,
-                                              int *cscSortedColPtr) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, const cusparseMatDescr_t, const cusparseHybMat_t,
-      cuComplex *, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseChyb2csc");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, descrA, hybA, cscSortedVal, cscSortedRowInd,
-                  cscSortedColPtr);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZhyb2csc(cusparseHandle_t handle,
-                                              const cusparseMatDescr_t descrA,
-                                              const cusparseHybMat_t hybA,
-                                              cuDoubleComplex *cscSortedVal,
-                                              int *cscSortedRowInd,
-                                              int *cscSortedColPtr) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, const cusparseMatDescr_t, const cusparseHybMat_t,
-      cuDoubleComplex *, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZhyb2csc");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, descrA, hybA, cscSortedVal, cscSortedRowInd,
-                  cscSortedColPtr);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseXcsr2bsrNnz(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int m, int n,
-    const cusparseMatDescr_t descrA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, int blockDim, const cusparseMatDescr_t descrC,
-    int *bsrSortedRowPtrC, int *nnzTotalDevHostPtr) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const int *, const int *, int, const cusparseMatDescr_t, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseXcsr2bsrNnz");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, m, n, descrA, csrSortedRowPtrA,
-                  csrSortedColIndA, blockDim, descrC, bsrSortedRowPtrC,
-                  nnzTotalDevHostPtr);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsr2bsr(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int m, int n,
-    const cusparseMatDescr_t descrA, const float *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA, int blockDim,
-    const cusparseMatDescr_t descrC, float *bsrSortedValC,
-    int *bsrSortedRowPtrC, int *bsrSortedColIndC) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const float *, const int *, const int *, int, const cusparseMatDescr_t,
-      float *, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsr2bsr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, m, n, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, blockDim, descrC, bsrSortedValC,
-                  bsrSortedRowPtrC, bsrSortedColIndC);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsr2bsr(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int m, int n,
-    const cusparseMatDescr_t descrA, const double *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA, int blockDim,
-    const cusparseMatDescr_t descrC, double *bsrSortedValC,
-    int *bsrSortedRowPtrC, int *bsrSortedColIndC) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const double *, const int *, const int *, int, const cusparseMatDescr_t,
-      double *, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsr2bsr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, m, n, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, blockDim, descrC, bsrSortedValC,
-                  bsrSortedRowPtrC, bsrSortedColIndC);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsr2bsr(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int m, int n,
-    const cusparseMatDescr_t descrA, const cuComplex *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA, int blockDim,
-    const cusparseMatDescr_t descrC, cuComplex *bsrSortedValC,
-    int *bsrSortedRowPtrC, int *bsrSortedColIndC) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const cuComplex *, const int *, const int *, int,
-      const cusparseMatDescr_t, cuComplex *, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsr2bsr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, m, n, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, blockDim, descrC, bsrSortedValC,
-                  bsrSortedRowPtrC, bsrSortedColIndC);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsr2bsr(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int m, int n,
-    const cusparseMatDescr_t descrA, const cuDoubleComplex *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA, int blockDim,
-    const cusparseMatDescr_t descrC, cuDoubleComplex *bsrSortedValC,
-    int *bsrSortedRowPtrC, int *bsrSortedColIndC) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const cuDoubleComplex *, const int *, const int *, int,
-      const cusparseMatDescr_t, cuDoubleComplex *, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsr2bsr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, m, n, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, blockDim, descrC, bsrSortedValC,
-                  bsrSortedRowPtrC, bsrSortedColIndC);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSbsr2csr(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nb,
-    const cusparseMatDescr_t descrA, const float *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int blockDim,
-    const cusparseMatDescr_t descrC, float *csrSortedValC,
-    int *csrSortedRowPtrC, int *csrSortedColIndC) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const float *, const int *, const int *, int, const cusparseMatDescr_t,
-      float *, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSbsr2csr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nb, descrA, bsrSortedValA, bsrSortedRowPtrA,
-                  bsrSortedColIndA, blockDim, descrC, csrSortedValC,
-                  csrSortedRowPtrC, csrSortedColIndC);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDbsr2csr(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nb,
-    const cusparseMatDescr_t descrA, const double *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int blockDim,
-    const cusparseMatDescr_t descrC, double *csrSortedValC,
-    int *csrSortedRowPtrC, int *csrSortedColIndC) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const double *, const int *, const int *, int, const cusparseMatDescr_t,
-      double *, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDbsr2csr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nb, descrA, bsrSortedValA, bsrSortedRowPtrA,
-                  bsrSortedColIndA, blockDim, descrC, csrSortedValC,
-                  csrSortedRowPtrC, csrSortedColIndC);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCbsr2csr(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nb,
-    const cusparseMatDescr_t descrA, const cuComplex *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int blockDim,
-    const cusparseMatDescr_t descrC, cuComplex *csrSortedValC,
-    int *csrSortedRowPtrC, int *csrSortedColIndC) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const cuComplex *, const int *, const int *, int,
-      const cusparseMatDescr_t, cuComplex *, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCbsr2csr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nb, descrA, bsrSortedValA, bsrSortedRowPtrA,
-                  bsrSortedColIndA, blockDim, descrC, csrSortedValC,
-                  csrSortedRowPtrC, csrSortedColIndC);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZbsr2csr(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nb,
-    const cusparseMatDescr_t descrA, const cuDoubleComplex *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int blockDim,
-    const cusparseMatDescr_t descrC, cuDoubleComplex *csrSortedValC,
-    int *csrSortedRowPtrC, int *csrSortedColIndC) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const cuDoubleComplex *, const int *, const int *, int,
-      const cusparseMatDescr_t, cuDoubleComplex *, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZbsr2csr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nb, descrA, bsrSortedValA, bsrSortedRowPtrA,
-                  bsrSortedColIndA, blockDim, descrC, csrSortedValC,
-                  csrSortedRowPtrC, csrSortedColIndC);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSgebsr2gebsc_bufferSize(
-    cusparseHandle_t handle, int mb, int nb, int nnzb,
-    const float *bsrSortedVal, const int *bsrSortedRowPtr,
-    const int *bsrSortedColInd, int rowBlockDim, int colBlockDim,
-    int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const float *, const int *, const int *,
-      int, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSgebsr2gebsc_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, mb, nb, nnzb, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, rowBlockDim, colBlockDim,
-                  pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDgebsr2gebsc_bufferSize(
-    cusparseHandle_t handle, int mb, int nb, int nnzb,
-    const double *bsrSortedVal, const int *bsrSortedRowPtr,
-    const int *bsrSortedColInd, int rowBlockDim, int colBlockDim,
-    int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const double *, const int *, const int *,
-      int, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDgebsr2gebsc_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, mb, nb, nnzb, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, rowBlockDim, colBlockDim,
-                  pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCgebsr2gebsc_bufferSize(
-    cusparseHandle_t handle, int mb, int nb, int nnzb,
-    const cuComplex *bsrSortedVal, const int *bsrSortedRowPtr,
-    const int *bsrSortedColInd, int rowBlockDim, int colBlockDim,
-    int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const cuComplex *, const int *,
-      const int *, int, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCgebsr2gebsc_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, mb, nb, nnzb, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, rowBlockDim, colBlockDim,
-                  pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZgebsr2gebsc_bufferSize(
-    cusparseHandle_t handle, int mb, int nb, int nnzb,
-    const cuDoubleComplex *bsrSortedVal, const int *bsrSortedRowPtr,
-    const int *bsrSortedColInd, int rowBlockDim, int colBlockDim,
-    int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const cuDoubleComplex *, const int *,
-      const int *, int, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZgebsr2gebsc_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, mb, nb, nnzb, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, rowBlockDim, colBlockDim,
-                  pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSgebsr2gebsc_bufferSizeExt(
-    cusparseHandle_t handle, int mb, int nb, int nnzb,
-    const float *bsrSortedVal, const int *bsrSortedRowPtr,
-    const int *bsrSortedColInd, int rowBlockDim, int colBlockDim,
-    size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const float *, const int *, const int *,
-      int, int, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseSgebsr2gebsc_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, mb, nb, nnzb, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, rowBlockDim, colBlockDim, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDgebsr2gebsc_bufferSizeExt(
-    cusparseHandle_t handle, int mb, int nb, int nnzb,
-    const double *bsrSortedVal, const int *bsrSortedRowPtr,
-    const int *bsrSortedColInd, int rowBlockDim, int colBlockDim,
-    size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const double *, const int *, const int *,
-      int, int, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseDgebsr2gebsc_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, mb, nb, nnzb, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, rowBlockDim, colBlockDim, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCgebsr2gebsc_bufferSizeExt(
-    cusparseHandle_t handle, int mb, int nb, int nnzb,
-    const cuComplex *bsrSortedVal, const int *bsrSortedRowPtr,
-    const int *bsrSortedColInd, int rowBlockDim, int colBlockDim,
-    size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const cuComplex *, const int *,
-      const int *, int, int, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseCgebsr2gebsc_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, mb, nb, nnzb, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, rowBlockDim, colBlockDim, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZgebsr2gebsc_bufferSizeExt(
-    cusparseHandle_t handle, int mb, int nb, int nnzb,
-    const cuDoubleComplex *bsrSortedVal, const int *bsrSortedRowPtr,
-    const int *bsrSortedColInd, int rowBlockDim, int colBlockDim,
-    size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const cuDoubleComplex *, const int *,
-      const int *, int, int, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseZgebsr2gebsc_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, mb, nb, nnzb, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, rowBlockDim, colBlockDim, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSgebsr2gebsc(
-    cusparseHandle_t handle, int mb, int nb, int nnzb,
-    const float *bsrSortedVal, const int *bsrSortedRowPtr,
-    const int *bsrSortedColInd, int rowBlockDim, int colBlockDim, float *bscVal,
-    int *bscRowInd, int *bscColPtr, cusparseAction_t copyValues,
-    cusparseIndexBase_t baseIdx, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const float *, const int *, const int *,
-      int, int, float *, int *, int *, cusparseAction_t, cusparseIndexBase_t,
-      void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSgebsr2gebsc");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, mb, nb, nnzb, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, rowBlockDim, colBlockDim, bscVal, bscRowInd,
-                  bscColPtr, copyValues, baseIdx, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDgebsr2gebsc(
-    cusparseHandle_t handle, int mb, int nb, int nnzb,
-    const double *bsrSortedVal, const int *bsrSortedRowPtr,
-    const int *bsrSortedColInd, int rowBlockDim, int colBlockDim,
-    double *bscVal, int *bscRowInd, int *bscColPtr, cusparseAction_t copyValues,
-    cusparseIndexBase_t baseIdx, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const double *, const int *, const int *,
-      int, int, double *, int *, int *, cusparseAction_t, cusparseIndexBase_t,
-      void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDgebsr2gebsc");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, mb, nb, nnzb, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, rowBlockDim, colBlockDim, bscVal, bscRowInd,
-                  bscColPtr, copyValues, baseIdx, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCgebsr2gebsc(
-    cusparseHandle_t handle, int mb, int nb, int nnzb,
-    const cuComplex *bsrSortedVal, const int *bsrSortedRowPtr,
-    const int *bsrSortedColInd, int rowBlockDim, int colBlockDim,
-    cuComplex *bscVal, int *bscRowInd, int *bscColPtr,
-    cusparseAction_t copyValues, cusparseIndexBase_t baseIdx, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const cuComplex *, const int *,
-      const int *, int, int, cuComplex *, int *, int *, cusparseAction_t,
-      cusparseIndexBase_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCgebsr2gebsc");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, mb, nb, nnzb, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, rowBlockDim, colBlockDim, bscVal, bscRowInd,
-                  bscColPtr, copyValues, baseIdx, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZgebsr2gebsc(
-    cusparseHandle_t handle, int mb, int nb, int nnzb,
-    const cuDoubleComplex *bsrSortedVal, const int *bsrSortedRowPtr,
-    const int *bsrSortedColInd, int rowBlockDim, int colBlockDim,
-    cuDoubleComplex *bscVal, int *bscRowInd, int *bscColPtr,
-    cusparseAction_t copyValues, cusparseIndexBase_t baseIdx, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const cuDoubleComplex *, const int *,
-      const int *, int, int, cuDoubleComplex *, int *, int *, cusparseAction_t,
-      cusparseIndexBase_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZgebsr2gebsc");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, mb, nb, nnzb, bsrSortedVal, bsrSortedRowPtr,
-                  bsrSortedColInd, rowBlockDim, colBlockDim, bscVal, bscRowInd,
-                  bscColPtr, copyValues, baseIdx, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseXgebsr2csr(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nb,
-    const cusparseMatDescr_t descrA, const int *bsrSortedRowPtrA,
-    const int *bsrSortedColIndA, int rowBlockDim, int colBlockDim,
-    const cusparseMatDescr_t descrC, int *csrSortedRowPtrC,
-    int *csrSortedColIndC) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const int *, const int *, int, int, const cusparseMatDescr_t, int *,
-      int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseXgebsr2csr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nb, descrA, bsrSortedRowPtrA,
-                  bsrSortedColIndA, rowBlockDim, colBlockDim, descrC,
-                  csrSortedRowPtrC, csrSortedColIndC);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSgebsr2csr(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nb,
-    const cusparseMatDescr_t descrA, const float *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int rowBlockDim,
-    int colBlockDim, const cusparseMatDescr_t descrC, float *csrSortedValC,
-    int *csrSortedRowPtrC, int *csrSortedColIndC) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const float *, const int *, const int *, int, int,
-      const cusparseMatDescr_t, float *, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSgebsr2csr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nb, descrA, bsrSortedValA, bsrSortedRowPtrA,
-                  bsrSortedColIndA, rowBlockDim, colBlockDim, descrC,
-                  csrSortedValC, csrSortedRowPtrC, csrSortedColIndC);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDgebsr2csr(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nb,
-    const cusparseMatDescr_t descrA, const double *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int rowBlockDim,
-    int colBlockDim, const cusparseMatDescr_t descrC, double *csrSortedValC,
-    int *csrSortedRowPtrC, int *csrSortedColIndC) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const double *, const int *, const int *, int, int,
-      const cusparseMatDescr_t, double *, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDgebsr2csr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nb, descrA, bsrSortedValA, bsrSortedRowPtrA,
-                  bsrSortedColIndA, rowBlockDim, colBlockDim, descrC,
-                  csrSortedValC, csrSortedRowPtrC, csrSortedColIndC);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCgebsr2csr(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nb,
-    const cusparseMatDescr_t descrA, const cuComplex *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int rowBlockDim,
-    int colBlockDim, const cusparseMatDescr_t descrC, cuComplex *csrSortedValC,
-    int *csrSortedRowPtrC, int *csrSortedColIndC) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const cuComplex *, const int *, const int *, int, int,
-      const cusparseMatDescr_t, cuComplex *, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCgebsr2csr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nb, descrA, bsrSortedValA, bsrSortedRowPtrA,
-                  bsrSortedColIndA, rowBlockDim, colBlockDim, descrC,
-                  csrSortedValC, csrSortedRowPtrC, csrSortedColIndC);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZgebsr2csr(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nb,
-    const cusparseMatDescr_t descrA, const cuDoubleComplex *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int rowBlockDim,
-    int colBlockDim, const cusparseMatDescr_t descrC,
-    cuDoubleComplex *csrSortedValC, int *csrSortedRowPtrC,
-    int *csrSortedColIndC) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const cuDoubleComplex *, const int *, const int *, int, int,
-      const cusparseMatDescr_t, cuDoubleComplex *, int *, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZgebsr2csr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nb, descrA, bsrSortedValA, bsrSortedRowPtrA,
-                  bsrSortedColIndA, rowBlockDim, colBlockDim, descrC,
-                  csrSortedValC, csrSortedRowPtrC, csrSortedColIndC);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsr2gebsr_bufferSize(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int m, int n,
-    const cusparseMatDescr_t descrA, const float *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA, int rowBlockDim,
-    int colBlockDim, int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const float *, const int *, const int *, int, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsr2gebsr_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, m, n, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, rowBlockDim, colBlockDim,
-                  pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsr2gebsr_bufferSize(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int m, int n,
-    const cusparseMatDescr_t descrA, const double *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA, int rowBlockDim,
-    int colBlockDim, int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const double *, const int *, const int *, int, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsr2gebsr_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, m, n, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, rowBlockDim, colBlockDim,
-                  pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsr2gebsr_bufferSize(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int m, int n,
-    const cusparseMatDescr_t descrA, const cuComplex *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA, int rowBlockDim,
-    int colBlockDim, int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const cuComplex *, const int *, const int *, int, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsr2gebsr_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, m, n, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, rowBlockDim, colBlockDim,
-                  pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsr2gebsr_bufferSize(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int m, int n,
-    const cusparseMatDescr_t descrA, const cuDoubleComplex *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA, int rowBlockDim,
-    int colBlockDim, int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const cuDoubleComplex *, const int *, const int *, int, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsr2gebsr_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, m, n, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, rowBlockDim, colBlockDim,
-                  pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsr2gebsr_bufferSizeExt(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int m, int n,
-    const cusparseMatDescr_t descrA, const float *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA, int rowBlockDim,
-    int colBlockDim, size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const float *, const int *, const int *, int, int, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseScsr2gebsr_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, m, n, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, rowBlockDim, colBlockDim, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsr2gebsr_bufferSizeExt(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int m, int n,
-    const cusparseMatDescr_t descrA, const double *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA, int rowBlockDim,
-    int colBlockDim, size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const double *, const int *, const int *, int, int, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseDcsr2gebsr_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, m, n, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, rowBlockDim, colBlockDim, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsr2gebsr_bufferSizeExt(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int m, int n,
-    const cusparseMatDescr_t descrA, const cuComplex *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA, int rowBlockDim,
-    int colBlockDim, size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const cuComplex *, const int *, const int *, int, int, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseCcsr2gebsr_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, m, n, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, rowBlockDim, colBlockDim, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsr2gebsr_bufferSizeExt(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int m, int n,
-    const cusparseMatDescr_t descrA, const cuDoubleComplex *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA, int rowBlockDim,
-    int colBlockDim, size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const cuDoubleComplex *, const int *, const int *, int, int, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseZcsr2gebsr_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, m, n, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, rowBlockDim, colBlockDim, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseXcsr2gebsrNnz(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int m, int n,
-    const cusparseMatDescr_t descrA, const int *csrSortedRowPtrA,
-    const int *csrSortedColIndA, const cusparseMatDescr_t descrC,
-    int *bsrSortedRowPtrC, int rowBlockDim, int colBlockDim,
-    int *nnzTotalDevHostPtr, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const int *, const int *, const cusparseMatDescr_t, int *, int, int,
-      int *, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseXcsr2gebsrNnz");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, m, n, descrA, csrSortedRowPtrA,
-                  csrSortedColIndA, descrC, bsrSortedRowPtrC, rowBlockDim,
-                  colBlockDim, nnzTotalDevHostPtr, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsr2gebsr(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int m, int n,
-    const cusparseMatDescr_t descrA, const float *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-    const cusparseMatDescr_t descrC, float *bsrSortedValC,
-    int *bsrSortedRowPtrC, int *bsrSortedColIndC, int rowBlockDim,
-    int colBlockDim, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const float *, const int *, const int *, const cusparseMatDescr_t,
-      float *, int *, int *, int, int, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsr2gebsr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, m, n, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, descrC, bsrSortedValC, bsrSortedRowPtrC,
-                  bsrSortedColIndC, rowBlockDim, colBlockDim, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsr2gebsr(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int m, int n,
-    const cusparseMatDescr_t descrA, const double *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-    const cusparseMatDescr_t descrC, double *bsrSortedValC,
-    int *bsrSortedRowPtrC, int *bsrSortedColIndC, int rowBlockDim,
-    int colBlockDim, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const double *, const int *, const int *, const cusparseMatDescr_t,
-      double *, int *, int *, int, int, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsr2gebsr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, m, n, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, descrC, bsrSortedValC, bsrSortedRowPtrC,
-                  bsrSortedColIndC, rowBlockDim, colBlockDim, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsr2gebsr(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int m, int n,
-    const cusparseMatDescr_t descrA, const cuComplex *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-    const cusparseMatDescr_t descrC, cuComplex *bsrSortedValC,
-    int *bsrSortedRowPtrC, int *bsrSortedColIndC, int rowBlockDim,
-    int colBlockDim, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const cuComplex *, const int *, const int *, const cusparseMatDescr_t,
-      cuComplex *, int *, int *, int, int, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsr2gebsr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, m, n, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, descrC, bsrSortedValC, bsrSortedRowPtrC,
-                  bsrSortedColIndC, rowBlockDim, colBlockDim, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsr2gebsr(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int m, int n,
-    const cusparseMatDescr_t descrA, const cuDoubleComplex *csrSortedValA,
-    const int *csrSortedRowPtrA, const int *csrSortedColIndA,
-    const cusparseMatDescr_t descrC, cuDoubleComplex *bsrSortedValC,
-    int *bsrSortedRowPtrC, int *bsrSortedColIndC, int rowBlockDim,
-    int colBlockDim, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t,
-      const cuDoubleComplex *, const int *, const int *,
-      const cusparseMatDescr_t, cuDoubleComplex *, int *, int *, int, int,
-      void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsr2gebsr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, m, n, descrA, csrSortedValA, csrSortedRowPtrA,
-                  csrSortedColIndA, descrC, bsrSortedValC, bsrSortedRowPtrC,
-                  bsrSortedColIndC, rowBlockDim, colBlockDim, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSgebsr2gebsr_bufferSize(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nb, int nnzb,
-    const cusparseMatDescr_t descrA, const float *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int rowBlockDimA,
-    int colBlockDimA, int rowBlockDimC, int colBlockDimC,
-    int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, int,
-      const cusparseMatDescr_t, const float *, const int *, const int *, int,
-      int, int, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSgebsr2gebsr_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nb, nnzb, descrA, bsrSortedValA,
-                  bsrSortedRowPtrA, bsrSortedColIndA, rowBlockDimA,
-                  colBlockDimA, rowBlockDimC, colBlockDimC, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDgebsr2gebsr_bufferSize(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nb, int nnzb,
-    const cusparseMatDescr_t descrA, const double *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int rowBlockDimA,
-    int colBlockDimA, int rowBlockDimC, int colBlockDimC,
-    int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, int,
-      const cusparseMatDescr_t, const double *, const int *, const int *, int,
-      int, int, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDgebsr2gebsr_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nb, nnzb, descrA, bsrSortedValA,
-                  bsrSortedRowPtrA, bsrSortedColIndA, rowBlockDimA,
-                  colBlockDimA, rowBlockDimC, colBlockDimC, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCgebsr2gebsr_bufferSize(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nb, int nnzb,
-    const cusparseMatDescr_t descrA, const cuComplex *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int rowBlockDimA,
-    int colBlockDimA, int rowBlockDimC, int colBlockDimC,
-    int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, int,
-      const cusparseMatDescr_t, const cuComplex *, const int *, const int *,
-      int, int, int, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCgebsr2gebsr_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nb, nnzb, descrA, bsrSortedValA,
-                  bsrSortedRowPtrA, bsrSortedColIndA, rowBlockDimA,
-                  colBlockDimA, rowBlockDimC, colBlockDimC, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZgebsr2gebsr_bufferSize(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nb, int nnzb,
-    const cusparseMatDescr_t descrA, const cuDoubleComplex *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int rowBlockDimA,
-    int colBlockDimA, int rowBlockDimC, int colBlockDimC,
-    int *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, int,
-      const cusparseMatDescr_t, const cuDoubleComplex *, const int *,
-      const int *, int, int, int, int, int *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZgebsr2gebsr_bufferSize");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nb, nnzb, descrA, bsrSortedValA,
-                  bsrSortedRowPtrA, bsrSortedColIndA, rowBlockDimA,
-                  colBlockDimA, rowBlockDimC, colBlockDimC, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSgebsr2gebsr_bufferSizeExt(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nb, int nnzb,
-    const cusparseMatDescr_t descrA, const float *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int rowBlockDimA,
-    int colBlockDimA, int rowBlockDimC, int colBlockDimC, size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, int,
-      const cusparseMatDescr_t, const float *, const int *, const int *, int,
-      int, int, int, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseSgebsr2gebsr_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nb, nnzb, descrA, bsrSortedValA,
-                  bsrSortedRowPtrA, bsrSortedColIndA, rowBlockDimA,
-                  colBlockDimA, rowBlockDimC, colBlockDimC, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDgebsr2gebsr_bufferSizeExt(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nb, int nnzb,
-    const cusparseMatDescr_t descrA, const double *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int rowBlockDimA,
-    int colBlockDimA, int rowBlockDimC, int colBlockDimC, size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, int,
-      const cusparseMatDescr_t, const double *, const int *, const int *, int,
-      int, int, int, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseDgebsr2gebsr_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nb, nnzb, descrA, bsrSortedValA,
-                  bsrSortedRowPtrA, bsrSortedColIndA, rowBlockDimA,
-                  colBlockDimA, rowBlockDimC, colBlockDimC, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCgebsr2gebsr_bufferSizeExt(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nb, int nnzb,
-    const cusparseMatDescr_t descrA, const cuComplex *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int rowBlockDimA,
-    int colBlockDimA, int rowBlockDimC, int colBlockDimC, size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, int,
-      const cusparseMatDescr_t, const cuComplex *, const int *, const int *,
-      int, int, int, int, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseCgebsr2gebsr_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nb, nnzb, descrA, bsrSortedValA,
-                  bsrSortedRowPtrA, bsrSortedColIndA, rowBlockDimA,
-                  colBlockDimA, rowBlockDimC, colBlockDimC, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZgebsr2gebsr_bufferSizeExt(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nb, int nnzb,
-    const cusparseMatDescr_t descrA, const cuDoubleComplex *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int rowBlockDimA,
-    int colBlockDimA, int rowBlockDimC, int colBlockDimC, size_t *pBufferSize) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, int,
-      const cusparseMatDescr_t, const cuDoubleComplex *, const int *,
-      const int *, int, int, int, int, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseZgebsr2gebsr_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nb, nnzb, descrA, bsrSortedValA,
-                  bsrSortedRowPtrA, bsrSortedColIndA, rowBlockDimA,
-                  colBlockDimA, rowBlockDimC, colBlockDimC, pBufferSize);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseXgebsr2gebsrNnz(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nb, int nnzb,
-    const cusparseMatDescr_t descrA, const int *bsrSortedRowPtrA,
-    const int *bsrSortedColIndA, int rowBlockDimA, int colBlockDimA,
-    const cusparseMatDescr_t descrC, int *bsrSortedRowPtrC, int rowBlockDimC,
-    int colBlockDimC, int *nnzTotalDevHostPtr, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, int,
-      const cusparseMatDescr_t, const int *, const int *, int, int,
-      const cusparseMatDescr_t, int *, int, int, int *, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseXgebsr2gebsrNnz");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nb, nnzb, descrA, bsrSortedRowPtrA,
-                  bsrSortedColIndA, rowBlockDimA, colBlockDimA, descrC,
-                  bsrSortedRowPtrC, rowBlockDimC, colBlockDimC,
-                  nnzTotalDevHostPtr, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSgebsr2gebsr(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nb, int nnzb,
-    const cusparseMatDescr_t descrA, const float *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int rowBlockDimA,
-    int colBlockDimA, const cusparseMatDescr_t descrC, float *bsrSortedValC,
-    int *bsrSortedRowPtrC, int *bsrSortedColIndC, int rowBlockDimC,
-    int colBlockDimC, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, int,
-      const cusparseMatDescr_t, const float *, const int *, const int *, int,
-      int, const cusparseMatDescr_t, float *, int *, int *, int, int, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSgebsr2gebsr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nb, nnzb, descrA, bsrSortedValA,
-                  bsrSortedRowPtrA, bsrSortedColIndA, rowBlockDimA,
-                  colBlockDimA, descrC, bsrSortedValC, bsrSortedRowPtrC,
-                  bsrSortedColIndC, rowBlockDimC, colBlockDimC, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDgebsr2gebsr(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nb, int nnzb,
-    const cusparseMatDescr_t descrA, const double *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int rowBlockDimA,
-    int colBlockDimA, const cusparseMatDescr_t descrC, double *bsrSortedValC,
-    int *bsrSortedRowPtrC, int *bsrSortedColIndC, int rowBlockDimC,
-    int colBlockDimC, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, int,
-      const cusparseMatDescr_t, const double *, const int *, const int *, int,
-      int, const cusparseMatDescr_t, double *, int *, int *, int, int, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDgebsr2gebsr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nb, nnzb, descrA, bsrSortedValA,
-                  bsrSortedRowPtrA, bsrSortedColIndA, rowBlockDimA,
-                  colBlockDimA, descrC, bsrSortedValC, bsrSortedRowPtrC,
-                  bsrSortedColIndC, rowBlockDimC, colBlockDimC, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCgebsr2gebsr(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nb, int nnzb,
-    const cusparseMatDescr_t descrA, const cuComplex *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int rowBlockDimA,
-    int colBlockDimA, const cusparseMatDescr_t descrC, cuComplex *bsrSortedValC,
-    int *bsrSortedRowPtrC, int *bsrSortedColIndC, int rowBlockDimC,
-    int colBlockDimC, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, int,
-      const cusparseMatDescr_t, const cuComplex *, const int *, const int *,
-      int, int, const cusparseMatDescr_t, cuComplex *, int *, int *, int, int,
-      void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCgebsr2gebsr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nb, nnzb, descrA, bsrSortedValA,
-                  bsrSortedRowPtrA, bsrSortedColIndA, rowBlockDimA,
-                  colBlockDimA, descrC, bsrSortedValC, bsrSortedRowPtrC,
-                  bsrSortedColIndC, rowBlockDimC, colBlockDimC, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZgebsr2gebsr(
-    cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nb, int nnzb,
-    const cusparseMatDescr_t descrA, const cuDoubleComplex *bsrSortedValA,
-    const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int rowBlockDimA,
-    int colBlockDimA, const cusparseMatDescr_t descrC,
-    cuDoubleComplex *bsrSortedValC, int *bsrSortedRowPtrC,
-    int *bsrSortedColIndC, int rowBlockDimC, int colBlockDimC, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, cusparseDirection_t, int, int, int,
-      const cusparseMatDescr_t, const cuDoubleComplex *, const int *,
-      const int *, int, int, const cusparseMatDescr_t, cuDoubleComplex *, int *,
-      int *, int, int, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZgebsr2gebsr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, dirA, mb, nb, nnzb, descrA, bsrSortedValA,
-                  bsrSortedRowPtrA, bsrSortedColIndA, rowBlockDimA,
-                  colBlockDimA, descrC, bsrSortedValC, bsrSortedRowPtrC,
-                  bsrSortedColIndC, rowBlockDimC, colBlockDimC, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI
-cusparseCreateIdentityPermutation(cusparseHandle_t handle, int n, int *p) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(cusparseHandle_t, int, int *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseCreateIdentityPermutation");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, n, p);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseXcoosort_bufferSizeExt(
-    cusparseHandle_t handle, int m, int n, int nnz, const int *cooRowsA,
-    const int *cooColsA, size_t *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const int *, const int *, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseXcoosort_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnz, cooRowsA, cooColsA, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseXcoosortByRow(cusparseHandle_t handle,
-                                                   int m, int n, int nnz,
-                                                   int *cooRowsA, int *cooColsA,
-                                                   int *P, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, int *, int *, int *, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseXcoosortByRow");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnz, cooRowsA, cooColsA, P, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseXcoosortByColumn(cusparseHandle_t handle,
-                                                      int m, int n, int nnz,
-                                                      int *cooRowsA,
-                                                      int *cooColsA, int *P,
-                                                      void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, int *, int *, int *, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseXcoosortByColumn");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnz, cooRowsA, cooColsA, P, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseXcsrsort_bufferSizeExt(
-    cusparseHandle_t handle, int m, int n, int nnz, const int *csrRowPtrA,
-    const int *csrColIndA, size_t *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const int *, const int *, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseXcsrsort_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnz, csrRowPtrA, csrColIndA,
-                  pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseXcsrsort(cusparseHandle_t handle, int m,
-                                              int n, int nnz,
-                                              const cusparseMatDescr_t descrA,
-                                              const int *csrRowPtrA,
-                                              int *csrColIndA, int *P,
-                                              void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const cusparseMatDescr_t, const int *,
-      int *, int *, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseXcsrsort");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnz, descrA, csrRowPtrA, csrColIndA, P,
-                  pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseXcscsort_bufferSizeExt(
-    cusparseHandle_t handle, int m, int n, int nnz, const int *cscColPtrA,
-    const int *cscRowIndA, size_t *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const int *, const int *, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseXcscsort_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnz, cscColPtrA, cscRowIndA,
-                  pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseXcscsort(cusparseHandle_t handle, int m,
-                                              int n, int nnz,
-                                              const cusparseMatDescr_t descrA,
-                                              const int *cscColPtrA,
-                                              int *cscRowIndA, int *P,
-                                              void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const cusparseMatDescr_t, const int *,
-      int *, int *, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseXcscsort");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnz, descrA, cscColPtrA, cscRowIndA, P,
-                  pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsru2csr_bufferSizeExt(
-    cusparseHandle_t handle, int m, int n, int nnz, float *csrVal,
-    const int *csrRowPtr, int *csrColInd, csru2csrInfo_t info,
-    size_t *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, float *, const int *, int *,
-      csru2csrInfo_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsru2csr_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnz, csrVal, csrRowPtr, csrColInd, info,
-                  pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsru2csr_bufferSizeExt(
-    cusparseHandle_t handle, int m, int n, int nnz, double *csrVal,
-    const int *csrRowPtr, int *csrColInd, csru2csrInfo_t info,
-    size_t *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, double *, const int *, int *,
-      csru2csrInfo_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsru2csr_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnz, csrVal, csrRowPtr, csrColInd, info,
-                  pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsru2csr_bufferSizeExt(
-    cusparseHandle_t handle, int m, int n, int nnz, cuComplex *csrVal,
-    const int *csrRowPtr, int *csrColInd, csru2csrInfo_t info,
-    size_t *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, cuComplex *, const int *, int *,
-      csru2csrInfo_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsru2csr_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnz, csrVal, csrRowPtr, csrColInd, info,
-                  pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsru2csr_bufferSizeExt(
-    cusparseHandle_t handle, int m, int n, int nnz, cuDoubleComplex *csrVal,
-    const int *csrRowPtr, int *csrColInd, csru2csrInfo_t info,
-    size_t *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, cuDoubleComplex *, const int *, int *,
-      csru2csrInfo_t, size_t *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsru2csr_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnz, csrVal, csrRowPtr, csrColInd, info,
-                  pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsru2csr(
-    cusparseHandle_t handle, int m, int n, int nnz,
-    const cusparseMatDescr_t descrA, float *csrVal, const int *csrRowPtr,
-    int *csrColInd, csru2csrInfo_t info, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const cusparseMatDescr_t, float *,
-      const int *, int *, csru2csrInfo_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsru2csr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnz, descrA, csrVal, csrRowPtr, csrColInd, info,
-                  pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsru2csr(
-    cusparseHandle_t handle, int m, int n, int nnz,
-    const cusparseMatDescr_t descrA, double *csrVal, const int *csrRowPtr,
-    int *csrColInd, csru2csrInfo_t info, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const cusparseMatDescr_t, double *,
-      const int *, int *, csru2csrInfo_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsru2csr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnz, descrA, csrVal, csrRowPtr, csrColInd, info,
-                  pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsru2csr(
-    cusparseHandle_t handle, int m, int n, int nnz,
-    const cusparseMatDescr_t descrA, cuComplex *csrVal, const int *csrRowPtr,
-    int *csrColInd, csru2csrInfo_t info, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const cusparseMatDescr_t, cuComplex *,
-      const int *, int *, csru2csrInfo_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsru2csr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnz, descrA, csrVal, csrRowPtr, csrColInd, info,
-                  pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsru2csr(
-    cusparseHandle_t handle, int m, int n, int nnz,
-    const cusparseMatDescr_t descrA, cuDoubleComplex *csrVal,
-    const int *csrRowPtr, int *csrColInd, csru2csrInfo_t info, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const cusparseMatDescr_t,
-      cuDoubleComplex *, const int *, int *, csru2csrInfo_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsru2csr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnz, descrA, csrVal, csrRowPtr, csrColInd, info,
-                  pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseScsr2csru(
-    cusparseHandle_t handle, int m, int n, int nnz,
-    const cusparseMatDescr_t descrA, float *csrVal, const int *csrRowPtr,
-    int *csrColInd, csru2csrInfo_t info, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const cusparseMatDescr_t, float *,
-      const int *, int *, csru2csrInfo_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseScsr2csru");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnz, descrA, csrVal, csrRowPtr, csrColInd, info,
-                  pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDcsr2csru(
-    cusparseHandle_t handle, int m, int n, int nnz,
-    const cusparseMatDescr_t descrA, double *csrVal, const int *csrRowPtr,
-    int *csrColInd, csru2csrInfo_t info, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const cusparseMatDescr_t, double *,
-      const int *, int *, csru2csrInfo_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDcsr2csru");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnz, descrA, csrVal, csrRowPtr, csrColInd, info,
-                  pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseCcsr2csru(
-    cusparseHandle_t handle, int m, int n, int nnz,
-    const cusparseMatDescr_t descrA, cuComplex *csrVal, const int *csrRowPtr,
-    int *csrColInd, csru2csrInfo_t info, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const cusparseMatDescr_t, cuComplex *,
-      const int *, int *, csru2csrInfo_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseCcsr2csru");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnz, descrA, csrVal, csrRowPtr, csrColInd, info,
-                  pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseZcsr2csru(
-    cusparseHandle_t handle, int m, int n, int nnz,
-    const cusparseMatDescr_t descrA, cuDoubleComplex *csrVal,
-    const int *csrRowPtr, int *csrColInd, csru2csrInfo_t info, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const cusparseMatDescr_t,
-      cuDoubleComplex *, const int *, int *, csru2csrInfo_t, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseZcsr2csru");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnz, descrA, csrVal, csrRowPtr, csrColInd, info,
-                  pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSpruneDense2csr_bufferSizeExt(
-    cusparseHandle_t handle, int m, int n, const float *A, int lda,
-    const float *threshold, const cusparseMatDescr_t descrC,
-    const float *csrValC, const int *csrRowPtrC, const int *csrColIndC,
-    size_t *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const float *, int, const float *,
-      const cusparseMatDescr_t, const float *, const int *, const int *,
-      size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseSpruneDense2csr_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, A, lda, threshold, descrC, csrValC, csrRowPtrC,
-                  csrColIndC, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDpruneDense2csr_bufferSizeExt(
-    cusparseHandle_t handle, int m, int n, const double *A, int lda,
-    const double *threshold, const cusparseMatDescr_t descrC,
-    const double *csrValC, const int *csrRowPtrC, const int *csrColIndC,
-    size_t *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const double *, int, const double *,
-      const cusparseMatDescr_t, const double *, const int *, const int *,
-      size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseDpruneDense2csr_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, A, lda, threshold, descrC, csrValC, csrRowPtrC,
-                  csrColIndC, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSpruneDense2csrNnz(
-    cusparseHandle_t handle, int m, int n, const float *A, int lda,
-    const float *threshold, const cusparseMatDescr_t descrC, int *csrRowPtrC,
-    int *nnzTotalDevHostPtr, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const float *, int, const float *,
-      const cusparseMatDescr_t, int *, int *, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSpruneDense2csrNnz");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, A, lda, threshold, descrC, csrRowPtrC,
-                  nnzTotalDevHostPtr, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDpruneDense2csrNnz(
-    cusparseHandle_t handle, int m, int n, const double *A, int lda,
-    const double *threshold, const cusparseMatDescr_t descrC, int *csrRowPtrC,
-    int *nnzTotalDevHostPtr, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const double *, int, const double *,
-      const cusparseMatDescr_t, int *, int *, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDpruneDense2csrNnz");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, A, lda, threshold, descrC, csrRowPtrC,
-                  nnzTotalDevHostPtr, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSpruneDense2csr(
-    cusparseHandle_t handle, int m, int n, const float *A, int lda,
-    const float *threshold, const cusparseMatDescr_t descrC, float *csrValC,
-    const int *csrRowPtrC, int *csrColIndC, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const float *, int, const float *,
-      const cusparseMatDescr_t, float *, const int *, int *, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSpruneDense2csr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, A, lda, threshold, descrC, csrValC, csrRowPtrC,
-                  csrColIndC, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDpruneDense2csr(
-    cusparseHandle_t handle, int m, int n, const double *A, int lda,
-    const double *threshold, const cusparseMatDescr_t descrC, double *csrValC,
-    const int *csrRowPtrC, int *csrColIndC, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const double *, int, const double *,
-      const cusparseMatDescr_t, double *, const int *, int *, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDpruneDense2csr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, A, lda, threshold, descrC, csrValC, csrRowPtrC,
-                  csrColIndC, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSpruneCsr2csr_bufferSizeExt(
-    cusparseHandle_t handle, int m, int n, int nnzA,
-    const cusparseMatDescr_t descrA, const float *csrValA,
-    const int *csrRowPtrA, const int *csrColIndA, const float *threshold,
-    const cusparseMatDescr_t descrC, const float *csrValC,
-    const int *csrRowPtrC, const int *csrColIndC, size_t *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const cusparseMatDescr_t, const float *,
-      const int *, const int *, const float *, const cusparseMatDescr_t,
-      const float *, const int *, const int *, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseSpruneCsr2csr_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnzA, descrA, csrValA, csrRowPtrA, csrColIndA,
-                  threshold, descrC, csrValC, csrRowPtrC, csrColIndC,
-                  pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDpruneCsr2csr_bufferSizeExt(
-    cusparseHandle_t handle, int m, int n, int nnzA,
-    const cusparseMatDescr_t descrA, const double *csrValA,
-    const int *csrRowPtrA, const int *csrColIndA, const double *threshold,
-    const cusparseMatDescr_t descrC, const double *csrValC,
-    const int *csrRowPtrC, const int *csrColIndC, size_t *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const cusparseMatDescr_t, const double *,
-      const int *, const int *, const double *, const cusparseMatDescr_t,
-      const double *, const int *, const int *, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseDpruneCsr2csr_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnzA, descrA, csrValA, csrRowPtrA, csrColIndA,
-                  threshold, descrC, csrValC, csrRowPtrC, csrColIndC,
-                  pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSpruneCsr2csrNnz(
-    cusparseHandle_t handle, int m, int n, int nnzA,
-    const cusparseMatDescr_t descrA, const float *csrValA,
-    const int *csrRowPtrA, const int *csrColIndA, const float *threshold,
-    const cusparseMatDescr_t descrC, int *csrRowPtrC,
-    int *nnzTotalDevHostPtr, /* can be on host or device */
-    void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const cusparseMatDescr_t, const float *,
-      const int *, const int *, const float *, const cusparseMatDescr_t, int *,
-      int *, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSpruneCsr2csrNnz");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnzA, descrA, csrValA, csrRowPtrA, csrColIndA,
-                  threshold, descrC, csrRowPtrC, nnzTotalDevHostPtr, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDpruneCsr2csrNnz(
-    cusparseHandle_t handle, int m, int n, int nnzA,
-    const cusparseMatDescr_t descrA, const double *csrValA,
-    const int *csrRowPtrA, const int *csrColIndA, const double *threshold,
-    const cusparseMatDescr_t descrC, int *csrRowPtrC,
-    int *nnzTotalDevHostPtr, /* can be on host or device */
-    void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const cusparseMatDescr_t, const double *,
-      const int *, const int *, const double *, const cusparseMatDescr_t, int *,
-      int *, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDpruneCsr2csrNnz");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnzA, descrA, csrValA, csrRowPtrA, csrColIndA,
-                  threshold, descrC, csrRowPtrC, nnzTotalDevHostPtr, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSpruneCsr2csr(
-    cusparseHandle_t handle, int m, int n, int nnzA,
-    const cusparseMatDescr_t descrA, const float *csrValA,
-    const int *csrRowPtrA, const int *csrColIndA, const float *threshold,
-    const cusparseMatDescr_t descrC, float *csrValC, const int *csrRowPtrC,
-    int *csrColIndC, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const cusparseMatDescr_t, const float *,
-      const int *, const int *, const float *, const cusparseMatDescr_t,
-      float *, const int *, int *, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSpruneCsr2csr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnzA, descrA, csrValA, csrRowPtrA, csrColIndA,
-                  threshold, descrC, csrValC, csrRowPtrC, csrColIndC, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDpruneCsr2csr(
-    cusparseHandle_t handle, int m, int n, int nnzA,
-    const cusparseMatDescr_t descrA, const double *csrValA,
-    const int *csrRowPtrA, const int *csrColIndA, const double *threshold,
-    const cusparseMatDescr_t descrC, double *csrValC, const int *csrRowPtrC,
-    int *csrColIndC, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const cusparseMatDescr_t, const double *,
-      const int *, const int *, const double *, const cusparseMatDescr_t,
-      double *, const int *, int *, void *);
-  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseDpruneCsr2csr");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnzA, descrA, csrValA, csrRowPtrA, csrColIndA,
-                  threshold, descrC, csrValC, csrRowPtrC, csrColIndC, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSpruneDense2csrByPercentage_bufferSizeExt(
-    cusparseHandle_t handle, int m, int n, const float *A, int lda,
-    float percentage, /* between 0 to 100 */
-    const cusparseMatDescr_t descrC, const float *csrValC,
-    const int *csrRowPtrC, const int *csrColIndC, pruneInfo_t info,
-    size_t *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const float *, int, float,
-      const cusparseMatDescr_t, const float *, const int *, const int *,
-      pruneInfo_t, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseSpruneDense2csrByPercentage_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, A, lda, percentage, descrC, csrValC, csrRowPtrC,
-                  csrColIndC, info, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDpruneDense2csrByPercentage_bufferSizeExt(
-    cusparseHandle_t handle, int m, int n, const double *A, int lda,
-    float percentage, /* between 0 to 100 */
-    const cusparseMatDescr_t descrC, const double *csrValC,
-    const int *csrRowPtrC, const int *csrColIndC, pruneInfo_t info,
-    size_t *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const double *, int, float,
-      const cusparseMatDescr_t, const double *, const int *, const int *,
-      pruneInfo_t, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseDpruneDense2csrByPercentage_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, A, lda, percentage, descrC, csrValC, csrRowPtrC,
-                  csrColIndC, info, pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSpruneDense2csrNnzByPercentage(
-    cusparseHandle_t handle, int m, int n, const float *A, int lda,
-    float percentage, /* between 0 to 100 */
-    const cusparseMatDescr_t descrC, int *csrRowPtrC,
-    int *nnzTotalDevHostPtr, /* can be on host or device */
-    pruneInfo_t info, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const float *, int, float,
-      const cusparseMatDescr_t, int *, int *, pruneInfo_t, void *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseSpruneDense2csrNnzByPercentage");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, A, lda, percentage, descrC, csrRowPtrC,
-                  nnzTotalDevHostPtr, info, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDpruneDense2csrNnzByPercentage(
-    cusparseHandle_t handle, int m, int n, const double *A, int lda,
-    float percentage, /* between 0 to 100 */
-    const cusparseMatDescr_t descrC, int *csrRowPtrC,
-    int *nnzTotalDevHostPtr, /* can be on host or device */
-    pruneInfo_t info, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const double *, int, float,
-      const cusparseMatDescr_t, int *, int *, pruneInfo_t, void *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseDpruneDense2csrNnzByPercentage");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, A, lda, percentage, descrC, csrRowPtrC,
-                  nnzTotalDevHostPtr, info, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSpruneDense2csrByPercentage(
-    cusparseHandle_t handle, int m, int n, const float *A, int lda,
-    float percentage, /* between 0 to 100 */
-    const cusparseMatDescr_t descrC, float *csrValC, const int *csrRowPtrC,
-    int *csrColIndC, pruneInfo_t info, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const float *, int, float,
-      const cusparseMatDescr_t, float *, const int *, int *, pruneInfo_t,
-      void *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseSpruneDense2csrByPercentage");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, A, lda, percentage, descrC, csrValC, csrRowPtrC,
-                  csrColIndC, info, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDpruneDense2csrByPercentage(
-    cusparseHandle_t handle, int m, int n, const double *A, int lda,
-    float percentage, /* between 0 to 100 */
-    const cusparseMatDescr_t descrC, double *csrValC, const int *csrRowPtrC,
-    int *csrColIndC, pruneInfo_t info, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, const double *, int, float,
-      const cusparseMatDescr_t, double *, const int *, int *, pruneInfo_t,
-      void *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseDpruneDense2csrByPercentage");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, A, lda, percentage, descrC, csrValC, csrRowPtrC,
-                  csrColIndC, info, pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSpruneCsr2csrByPercentage_bufferSizeExt(
-    cusparseHandle_t handle, int m, int n, int nnzA,
-    const cusparseMatDescr_t descrA, const float *csrValA,
-    const int *csrRowPtrA, const int *csrColIndA,
-    float percentage, /* between 0 to 100 */
-    const cusparseMatDescr_t descrC, const float *csrValC,
-    const int *csrRowPtrC, const int *csrColIndC, pruneInfo_t info,
-    size_t *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const cusparseMatDescr_t, const float *,
-      const int *, const int *, float, const cusparseMatDescr_t, const float *,
-      const int *, const int *, pruneInfo_t, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseSpruneCsr2csrByPercentage_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnzA, descrA, csrValA, csrRowPtrA, csrColIndA,
-                  percentage, descrC, csrValC, csrRowPtrC, csrColIndC, info,
-                  pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDpruneCsr2csrByPercentage_bufferSizeExt(
-    cusparseHandle_t handle, int m, int n, int nnzA,
-    const cusparseMatDescr_t descrA, const double *csrValA,
-    const int *csrRowPtrA, const int *csrColIndA,
-    float percentage, /* between 0 to 100 */
-    const cusparseMatDescr_t descrC, const double *csrValC,
-    const int *csrRowPtrC, const int *csrColIndC, pruneInfo_t info,
-    size_t *pBufferSizeInBytes) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const cusparseMatDescr_t, const double *,
-      const int *, const int *, float, const cusparseMatDescr_t, const double *,
-      const int *, const int *, pruneInfo_t, size_t *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseDpruneCsr2csrByPercentage_bufferSizeExt");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnzA, descrA, csrValA, csrRowPtrA, csrColIndA,
-                  percentage, descrC, csrValC, csrRowPtrC, csrColIndC, info,
-                  pBufferSizeInBytes);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSpruneCsr2csrNnzByPercentage(
-    cusparseHandle_t handle, int m, int n, int nnzA,
-    const cusparseMatDescr_t descrA, const float *csrValA,
-    const int *csrRowPtrA, const int *csrColIndA,
-    float percentage, /* between 0 to 100 */
-    const cusparseMatDescr_t descrC, int *csrRowPtrC,
-    int *nnzTotalDevHostPtr, /* can be on host or device */
-    pruneInfo_t info, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const cusparseMatDescr_t, const float *,
-      const int *, const int *, float, const cusparseMatDescr_t, int *, int *,
-      pruneInfo_t, void *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseSpruneCsr2csrNnzByPercentage");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnzA, descrA, csrValA, csrRowPtrA, csrColIndA,
-                  percentage, descrC, csrRowPtrC, nnzTotalDevHostPtr, info,
-                  pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDpruneCsr2csrNnzByPercentage(
-    cusparseHandle_t handle, int m, int n, int nnzA,
-    const cusparseMatDescr_t descrA, const double *csrValA,
-    const int *csrRowPtrA, const int *csrColIndA,
-    float percentage, /* between 0 to 100 */
-    const cusparseMatDescr_t descrC, int *csrRowPtrC,
-    int *nnzTotalDevHostPtr, /* can be on host or device */
-    pruneInfo_t info, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const cusparseMatDescr_t, const double *,
-      const int *, const int *, float, const cusparseMatDescr_t, int *, int *,
-      pruneInfo_t, void *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseDpruneCsr2csrNnzByPercentage");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnzA, descrA, csrValA, csrRowPtrA, csrColIndA,
-                  percentage, descrC, csrRowPtrC, nnzTotalDevHostPtr, info,
-                  pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseSpruneCsr2csrByPercentage(
-    cusparseHandle_t handle, int m, int n, int nnzA,
-    const cusparseMatDescr_t descrA, const float *csrValA,
-    const int *csrRowPtrA, const int *csrColIndA,
-    float percentage, /* between 0 to 100 */
-    const cusparseMatDescr_t descrC, float *csrValC, const int *csrRowPtrC,
-    int *csrColIndC, pruneInfo_t info, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const cusparseMatDescr_t, const float *,
-      const int *, const int *, float, const cusparseMatDescr_t, float *,
-      const int *, int *, pruneInfo_t, void *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseSpruneCsr2csrByPercentage");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnzA, descrA, csrValA, csrRowPtrA, csrColIndA,
-                  percentage, descrC, csrValC, csrRowPtrC, csrColIndC, info,
-                  pBuffer);
-}
-
-cusparseStatus_t CUSPARSEAPI cusparseDpruneCsr2csrByPercentage(
-    cusparseHandle_t handle, int m, int n, int nnzA,
-    const cusparseMatDescr_t descrA, const double *csrValA,
-    const int *csrRowPtrA, const int *csrColIndA,
-    float percentage, /* between 0 to 100 */
-    const cusparseMatDescr_t descrC, double *csrValC, const int *csrRowPtrC,
-    int *csrColIndC, pruneInfo_t info, void *pBuffer) {
-  using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(
-      cusparseHandle_t, int, int, int, const cusparseMatDescr_t, const double *,
-      const int *, const int *, float, const cusparseMatDescr_t, double *,
-      const int *, int *, pruneInfo_t, void *);
-  static auto func_ptr =
-      LoadSymbol<FuncPtr>("cusparseDpruneCsr2csrByPercentage");
-  if (!func_ptr) return GetSymbolNotFoundError();
-  return func_ptr(handle, m, n, nnzA, descrA, csrValA, csrRowPtrA, csrColIndA,
-                  percentage, descrC, csrValC, csrRowPtrC, csrColIndC, info,
-                  pBuffer);
-}
-
-}  // extern "C"
diff --git a/third_party/xla/third_party/tsl/tsl/cuda/cusparse_stub.cc b/third_party/xla/third_party/tsl/tsl/cuda/cusparse_stub.cc
index a138c014248904..16141e51e2613b 100644
--- a/third_party/xla/third_party/tsl/tsl/cuda/cusparse_stub.cc
+++ b/third_party/xla/third_party/tsl/tsl/cuda/cusparse_stub.cc
@@ -26,8 +26,7 @@ void* GetDsoHandle() {
   return nullptr;
 #else
   static auto handle = []() -> void* {
-    auto handle_or =
-        tsl::internal::DsoLoader::GetCusparseDsoHandle();
+    auto handle_or = tsl::internal::DsoLoader::GetCusparseDsoHandle();
     if (!handle_or.ok()) return nullptr;
     return handle_or.value();
   }();
@@ -35,32 +34,40 @@ void* GetDsoHandle() {
 #endif
 }
 
-template <typename T>
-T LoadSymbol(const char* symbol_name) {
+void* LoadSymbol(const char* symbol_name) {
   void* symbol = nullptr;
   if (auto handle = GetDsoHandle()) {
     tsl::Env::Default()
         ->GetSymbolFromLibrary(handle, symbol_name, &symbol)
         .IgnoreError();
   }
-  return reinterpret_cast<T>(symbol);
+  return symbol;
 }
 
-cusparseStatus_t GetSymbolNotFoundError() {
+const char* kSymbols[] = {
+#include "tsl/cuda/cusparse.inc"
+};
+
+constexpr size_t kNumSymbols = sizeof(kSymbols) / sizeof(const char*);
+
+}  // namespace
+
+extern "C" {
+
+static cusparseStatus_t GetSymbolNotFoundError() {
   return CUSPARSE_STATUS_INTERNAL_ERROR;
 }
-}  // namespace
 
-#if CUDA_VERSION < 10000
-#include "tsl/cuda/cusparse_9_0.inc"
-#elif CUDA_VERSION < 10010
-#include "tsl/cuda/cusparse_10_0.inc"
-#elif CUDA_VERSION < 10020
-#include "tsl/cuda/cusparse_10_1.inc"
-#elif CUDA_VERSION < 11000
-#include "tsl/cuda/cusparse_10_2.inc"
-#elif CUDA_VERSION < 12000
-#include "tsl/cuda/cusparse_11_0.inc"
-#else
-#include "tsl/cuda/cusparse_12_0.inc"
-#endif
+extern void* _cusparse_tramp_table[];
+
+void _cusparse_tramp_resolve(int i) {
+  CHECK_LE(0, i);
+  CHECK_LT(i, kNumSymbols);
+  void* p = LoadSymbol(kSymbols[i]);
+  if (!p) {
+    p = reinterpret_cast<void*>(&GetSymbolNotFoundError);
+  }
+  _cusparse_tramp_table[i] = p;
+}
+
+}  // extern "C"
diff --git a/third_party/xla/third_party/tsl/tsl/cuda/stub.bzl b/third_party/xla/third_party/tsl/tsl/cuda/stub.bzl
new file mode 100644
index 00000000000000..0dbfad09658083
--- /dev/null
+++ b/third_party/xla/third_party/tsl/tsl/cuda/stub.bzl
@@ -0,0 +1,26 @@
+"""Macros to generate CUDA library stubs from a list of symbols."""
+
+def cuda_stub(name, srcs):
+    """Generates a CUDA stub from a list of symbols.
+
+    Generates two files:
+    * library.inc, which contains a list of symbols suitable for inclusion by
+        C++, and
+    * library.tramp.S, which contains assembly-language trampolines for each
+      symbol.
+    """
+    native.genrule(
+        name = "{}_stub_gen".format(name),
+        srcs = srcs,
+        tools = ["//third_party/implib_so:make_stub"],
+        outs = [
+            "{}.inc".format(name),
+            "{}.tramp.S".format(name),
+        ],
+        tags = ["gpu"],
+        cmd = select({
+            "//tsl:linux_aarch64": "$(location //third_party/implib_so:make_stub) $< --outdir $(RULEDIR) --target aarch64",
+            "//tsl:linux_x86_64": "$(location //third_party/implib_so:make_stub) $< --outdir $(RULEDIR) --target x86_64",
+            "//conditions:default": "NOT_IMPLEMENTED_FOR_THIS_PLATFORM_OR_ARCHITECTURE",
+        }),
+    )
diff --git a/third_party/xla/third_party/tsl/workspace2.bzl b/third_party/xla/third_party/tsl/workspace2.bzl
index ac600c35390300..f0c95504df6757 100644
--- a/third_party/xla/third_party/tsl/workspace2.bzl
+++ b/third_party/xla/third_party/tsl/workspace2.bzl
@@ -2,42 +2,43 @@
 
 # Import third party config rules.
 load("@bazel_skylib//lib:versions.bzl", "versions")
-load("//third_party/gpus:cuda_configure.bzl", "cuda_configure")
-load("//third_party/gpus:rocm_configure.bzl", "rocm_configure")
-load("//third_party/tensorrt:tensorrt_configure.bzl", "tensorrt_configure")
-load("//third_party/nccl:nccl_configure.bzl", "nccl_configure")
-load("//third_party/git:git_configure.bzl", "git_configure")
-load("//third_party/py:python_configure.bzl", "python_configure")
-load("//third_party/systemlibs:syslibs_configure.bzl", "syslibs_configure")
-load("//tools/toolchains:cpus/aarch64/aarch64_compiler_configure.bzl", "aarch64_compiler_configure")
-load("//tools/toolchains:cpus/arm/arm_compiler_configure.bzl", "arm_compiler_configure")
-load("//tools/toolchains/embedded/arm-linux:arm_linux_toolchain_configure.bzl", "arm_linux_toolchain_configure")
+
+# Import external repository rules.
+load("@bazel_tools//tools/build_defs/repo:java.bzl", "java_import_external")
+load("@io_bazel_rules_closure//closure:defs.bzl", "filegroup_external")
+load("@tf_runtime//:dependencies.bzl", "tfrt_dependencies")
 load("//third_party:repo.bzl", "tf_http_archive", "tf_mirror_urls")
-load("//third_party/clang_toolchain:cc_configure_clang.bzl", "cc_download_clang_toolchain")
-load("//tools/def_file_filter:def_file_filter_configure.bzl", "def_file_filter_configure")
-load("//third_party/llvm:setup.bzl", "llvm_setup")
 
 # Import third party repository rules. See go/tfbr-thirdparty.
 load("//third_party/absl:workspace.bzl", absl = "repo")
 load("//third_party/benchmark:workspace.bzl", benchmark = "repo")
+load("//third_party/clang_toolchain:cc_configure_clang.bzl", "cc_download_clang_toolchain")
 load("//third_party/eigen3:workspace.bzl", eigen3 = "repo")
 load("//third_party/farmhash:workspace.bzl", farmhash = "repo")
 load("//third_party/gemmlowp:workspace.bzl", gemmlowp = "repo")
+load("//third_party/git:git_configure.bzl", "git_configure")
+load("//third_party/gpus:cuda_configure.bzl", "cuda_configure")
+load("//third_party/gpus:rocm_configure.bzl", "rocm_configure")
 load("//third_party/hwloc:workspace.bzl", hwloc = "repo")
+load("//third_party/implib_so:workspace.bzl", implib_so = "repo")
 load("//third_party/jpeg:workspace.bzl", jpeg = "repo")
+load("//third_party/llvm:setup.bzl", "llvm_setup")
 load("//third_party/nasm:workspace.bzl", nasm = "repo")
+load("//third_party/nccl:nccl_configure.bzl", "nccl_configure")
+load("//third_party/py:python_configure.bzl", "python_configure")
 load("//third_party/py/ml_dtypes:workspace.bzl", ml_dtypes = "repo")
 load("//third_party/pybind11_abseil:workspace.bzl", pybind11_abseil = "repo")
 load("//third_party/pybind11_bazel:workspace.bzl", pybind11_bazel = "repo")
+load("//third_party/systemlibs:syslibs_configure.bzl", "syslibs_configure")
+load("//third_party/tensorrt:tensorrt_configure.bzl", "tensorrt_configure")
 load("//third_party/tensorrt:workspace.bzl", tensorrt = "repo")
-
-# Import external repository rules.
-load("@bazel_tools//tools/build_defs/repo:java.bzl", "java_import_external")
-load("@io_bazel_rules_closure//closure:defs.bzl", "filegroup_external")
-load("@tf_runtime//:dependencies.bzl", "tfrt_dependencies")
-load("//tools/toolchains/remote_config:configs.bzl", "initialize_rbe_configs")
-load("//tools/toolchains/remote:configure.bzl", "remote_execution_configure")
+load("//tools/def_file_filter:def_file_filter_configure.bzl", "def_file_filter_configure")
+load("//tools/toolchains:cpus/aarch64/aarch64_compiler_configure.bzl", "aarch64_compiler_configure")
+load("//tools/toolchains:cpus/arm/arm_compiler_configure.bzl", "arm_compiler_configure")
 load("//tools/toolchains/clang6:repo.bzl", "clang6_configure")
+load("//tools/toolchains/embedded/arm-linux:arm_linux_toolchain_configure.bzl", "arm_linux_toolchain_configure")
+load("//tools/toolchains/remote:configure.bzl", "remote_execution_configure")
+load("//tools/toolchains/remote_config:configs.bzl", "initialize_rbe_configs")
 
 def _initialize_third_party():
     """ Load third party repositories.  See above load() statements. """
@@ -47,6 +48,7 @@ def _initialize_third_party():
     farmhash()
     gemmlowp()
     hwloc()
+    implib_so()
     jpeg()
     ml_dtypes()
     nasm()

From 04c49a0785d25d0f1878605a719748d6e22d0bcc Mon Sep 17 00:00:00 2001
From: Derek Murray <mrry@google.com>
Date: Fri, 22 Sep 2023 09:03:24 -0700
Subject: [PATCH 147/567] Cache buffer sizes in `BufferAssignment` when they
 are accessed.

In some cases, the buffer-size computation involves a non-trivial amount of work, and executing it multiple times per value can be very expensive.

PiperOrigin-RevId: 567636486
---
 third_party/xla/xla/service/buffer_assignment.h | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/third_party/xla/xla/service/buffer_assignment.h b/third_party/xla/xla/service/buffer_assignment.h
index 8c43b47fe9ded5..da46122da3b6f0 100644
--- a/third_party/xla/xla/service/buffer_assignment.h
+++ b/third_party/xla/xla/service/buffer_assignment.h
@@ -551,10 +551,13 @@ class BufferAssignment {
   BufferAllocation* GetMutableAllocation(BufferAllocation::Index index);
 
   int64_t HloBufferSize(const HloBuffer& buffer) {
+    auto iter = cached_buffer_sizes_.find(buffer.id());
+    if (iter != cached_buffer_sizes_.end()) return iter->second;
     int64_t result = 0;
     for (const HloValue* value : buffer.values()) {
       result = std::max(result, buffer_size_(*value));
     }
+    cached_buffer_sizes_.insert({buffer.id(), result});
     return result;
   }
 
@@ -593,6 +596,8 @@ class BufferAssignment {
 
   Stats stats_;
 
+  absl::flat_hash_map<HloBuffer::Id, int64_t> cached_buffer_sizes_;
+
   BufferAssignment(const BufferAssignment&) = delete;
   BufferAssignment& operator=(const BufferAssignment&) = delete;
 };

From 6b81dd15bd52110374f287b9d54ab7660fb0ce20 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 22 Sep 2023 09:10:54 -0700
Subject: [PATCH 148/567] Update TFRT dependency to use revision
 http://github.com/tensorflow/runtime/commit/3b7012849fde349a679c495f5bd421935d6599f9.

PiperOrigin-RevId: 567638210
---
 third_party/tf_runtime/workspace.bzl                          | 4 ++--
 third_party/xla/third_party/tf_runtime/workspace.bzl          | 4 ++--
 .../xla/third_party/tsl/third_party/tf_runtime/workspace.bzl  | 4 ++--
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/third_party/tf_runtime/workspace.bzl b/third_party/tf_runtime/workspace.bzl
index 3671380535a7b8..7fb69c7a3ed804 100644
--- a/third_party/tf_runtime/workspace.bzl
+++ b/third_party/tf_runtime/workspace.bzl
@@ -6,8 +6,8 @@ def repo():
     """Imports TFRT."""
 
     # Attention: tools parse and update these lines.
-    TFRT_COMMIT = "e2e1f9f197af799cbf558eacd26221695a7971ce"
-    TFRT_SHA256 = "ef6ca7d0ab5fce018a8ae64500de96dffd838d1ed4bef514d797ba2b99bd8908"
+    TFRT_COMMIT = "3b7012849fde349a679c495f5bd421935d6599f9"
+    TFRT_SHA256 = "aff0363d40d564af9ad599f40b4b24b819e84549eb7ea9d5fb8875817c76a4bf"
 
     tf_http_archive(
         name = "tf_runtime",
diff --git a/third_party/xla/third_party/tf_runtime/workspace.bzl b/third_party/xla/third_party/tf_runtime/workspace.bzl
index 3671380535a7b8..7fb69c7a3ed804 100644
--- a/third_party/xla/third_party/tf_runtime/workspace.bzl
+++ b/third_party/xla/third_party/tf_runtime/workspace.bzl
@@ -6,8 +6,8 @@ def repo():
     """Imports TFRT."""
 
     # Attention: tools parse and update these lines.
-    TFRT_COMMIT = "e2e1f9f197af799cbf558eacd26221695a7971ce"
-    TFRT_SHA256 = "ef6ca7d0ab5fce018a8ae64500de96dffd838d1ed4bef514d797ba2b99bd8908"
+    TFRT_COMMIT = "3b7012849fde349a679c495f5bd421935d6599f9"
+    TFRT_SHA256 = "aff0363d40d564af9ad599f40b4b24b819e84549eb7ea9d5fb8875817c76a4bf"
 
     tf_http_archive(
         name = "tf_runtime",
diff --git a/third_party/xla/third_party/tsl/third_party/tf_runtime/workspace.bzl b/third_party/xla/third_party/tsl/third_party/tf_runtime/workspace.bzl
index 3671380535a7b8..7fb69c7a3ed804 100644
--- a/third_party/xla/third_party/tsl/third_party/tf_runtime/workspace.bzl
+++ b/third_party/xla/third_party/tsl/third_party/tf_runtime/workspace.bzl
@@ -6,8 +6,8 @@ def repo():
     """Imports TFRT."""
 
     # Attention: tools parse and update these lines.
-    TFRT_COMMIT = "e2e1f9f197af799cbf558eacd26221695a7971ce"
-    TFRT_SHA256 = "ef6ca7d0ab5fce018a8ae64500de96dffd838d1ed4bef514d797ba2b99bd8908"
+    TFRT_COMMIT = "3b7012849fde349a679c495f5bd421935d6599f9"
+    TFRT_SHA256 = "aff0363d40d564af9ad599f40b4b24b819e84549eb7ea9d5fb8875817c76a4bf"
 
     tf_http_archive(
         name = "tf_runtime",

From f697bba3b44fbb5230714acaf2a8412bcae4ec2a Mon Sep 17 00:00:00 2001
From: Ramesh Sampath <rameshsampath@google.com>
Date: Fri, 22 Sep 2023 10:06:25 -0700
Subject: [PATCH 149/567] Remove allow_multiple_exports param from tf_export.

PiperOrigin-RevId: 567651676
---
 tensorflow/python/util/tf_export.py | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/tensorflow/python/util/tf_export.py b/tensorflow/python/util/tf_export.py
index 7caaa07319f77e..ade7504ca75fae 100644
--- a/tensorflow/python/util/tf_export.py
+++ b/tensorflow/python/util/tf_export.py
@@ -279,6 +279,7 @@ def __init__(
       *args: str,
       api_name: str = TENSORFLOW_API_NAME,
       v1: Optional[Sequence[str]] = None,
+      allow_multiple_exports: bool = True,  # pylint: disable=unused-argument
   ):
     """Export under the names *args (first one is considered canonical).
 
@@ -288,6 +289,7 @@ def __init__(
         `estimator`). Default is `tensorflow`.
       v1: Names for the TensorFlow V1 API. If not set, we will use V2 API names
         both for TensorFlow V1 and V2 APIs.
+      allow_multiple_exports: Deprecated.
     """
     self._names = args
     self._names_v1 = v1 if v1 is not None else args
@@ -318,11 +320,13 @@ def _validate_symbol_names(self) -> None:
               '@tf_export is not allowed to export symbols under %s.*'
               % (subpackage)
           )
-    elif not all(n.startswith(self._api_name) for n in all_symbol_names):
-      raise InvalidSymbolNameError(
-          'Can only export symbols under package name of component. e.g.'
-          ' tensorflow_estimator must export all symbols under tf.estimator'
-      )
+    else:
+      if not all(n.startswith(self._api_name) for n in all_symbol_names):
+        raise InvalidSymbolNameError(
+            'Can only export symbols under package name of component. '
+            'e.g. tensorflow_estimator must export all symbols under '
+            'tf.estimator'
+        )
 
   def __call__(self, func: T) -> T:
     """Calls this decorator.
@@ -407,6 +411,7 @@ def __call__(
       self,
       *v2: str,
       v1: Optional[Sequence[str]] = None,
+      allow_multiple_exports: bool = True,  # Deprecated, no-op
   ) -> api_export:
     ...
 

From 6cfebd9843e5944f49eb937ead055bec0708c080 Mon Sep 17 00:00:00 2001
From: Shibo Wang <shibow@google.com>
Date: Fri, 22 Sep 2023 10:21:36 -0700
Subject: [PATCH 150/567] Improve shardings if the users of a const broadcast
 have conflicting shardings.

PiperOrigin-RevId: 567655772
---
 .../xla/xla/hlo/transforms/hlo_constant_splitter.cc        | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/third_party/xla/xla/hlo/transforms/hlo_constant_splitter.cc b/third_party/xla/xla/hlo/transforms/hlo_constant_splitter.cc
index 461824e53d5eb3..c434e9509e4908 100644
--- a/third_party/xla/xla/hlo/transforms/hlo_constant_splitter.cc
+++ b/third_party/xla/xla/hlo/transforms/hlo_constant_splitter.cc
@@ -42,6 +42,9 @@ bool IsSupportedConstantExpression(const HloInstruction* instruction) {
 StatusOr<bool> DuplicateConstantExpressionPerUser(HloComputation* computation,
                                                   HloInstruction* to_clone,
                                                   HloInstruction* user) {
+  if (to_clone->user_count() == 1) {
+    return false;
+  }
   absl::InlinedVector<std::pair<const HloInstruction*, int>, 8> worklist(
       1, std::make_pair(to_clone, 0));
   absl::InlinedVector<const HloInstruction*, 8> to_clone_vec;
@@ -153,7 +156,7 @@ StatusOr<bool> HloConstantSplitter::Run(
 
     // Perform duplication of the constants/constant expressions.
     for (HloInstruction* instruction : constants_list) {
-      if (instruction->user_count() == 0) {
+      if (instruction->user_count() <= 1) {
         continue;
       }
       absl::InlinedVector<HloInstruction*, 8> users;
@@ -161,7 +164,7 @@ StatusOr<bool> HloConstantSplitter::Run(
       // Consider for splitting only leaf expressions (not constants in the
       // middle of a constant expression). Also only split for non-constant
       // users for expressions.
-      for (int i = 1; i < instruction->user_count(); ++i) {
+      for (int i = 0; i < instruction->user_count(); ++i) {
         if (instruction->opcode() == HloOpcode::kConstant ||
             !constants_set.contains(instruction->users()[i])) {
           users.push_back(instruction->users()[i]);

From ac8da081f7c9af3ce661a4e84dc699762d6830f7 Mon Sep 17 00:00:00 2001
From: Gabriel Rasskin <grasskin@google.com>
Date: Fri, 22 Sep 2023 10:24:37 -0700
Subject: [PATCH 151/567] Internal Code Change

PiperOrigin-RevId: 567656609
---
 tensorflow/lite/python/authoring/BUILD | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tensorflow/lite/python/authoring/BUILD b/tensorflow/lite/python/authoring/BUILD
index c51e9f249e9414..cf1b46790f5112 100644
--- a/tensorflow/lite/python/authoring/BUILD
+++ b/tensorflow/lite/python/authoring/BUILD
@@ -6,7 +6,6 @@ package(
         "//tensorflow:internal",
         "//tensorflow_estimator:__subpackages__",
         "//tensorflow_federated:__subpackages__",
-        "//third_party/py/keras:__subpackages__",
         "//third_party/py/tensorflow:__subpackages__",
     ],
     licenses = ["notice"],

From 9a2a49232ca43b6254ecdce2f5549f1d71339b44 Mon Sep 17 00:00:00 2001
From: Sagun Bajra <sagunb@google.com>
Date: Fri, 22 Sep 2023 10:43:20 -0700
Subject: [PATCH 152/567] Avoid deleting the boolean inputs from the top level
 FunctionDef's input signature when applying small_constants_optimizer.

PiperOrigin-RevId: 567661561
---
 tensorflow/core/common_runtime/eager/execute.cc | 17 +++++++++++------
 .../eager/small_constants_optimizer.cc          |  4 ++--
 2 files changed, 13 insertions(+), 8 deletions(-)

diff --git a/tensorflow/core/common_runtime/eager/execute.cc b/tensorflow/core/common_runtime/eager/execute.cc
index 60c65e60d5c61b..2d37223de46e90 100644
--- a/tensorflow/core/common_runtime/eager/execute.cc
+++ b/tensorflow/core/common_runtime/eager/execute.cc
@@ -80,6 +80,7 @@ limitations under the License.
 #include "tensorflow/core/protobuf/error_codes.pb.h"
 #include "tensorflow/core/util/device_name_utils.h"
 #include "tsl/platform/fingerprint.h"
+#include "tsl/platform/statusor.h"
 #if !defined(IS_MOBILE_PLATFORM)
 #include "tensorflow/core/distributed_runtime/eager/eager_client.h"
 #include "tensorflow/core/distributed_runtime/eager/remote_copy_node.h"
@@ -1063,11 +1064,13 @@ bool IntArgsAndRetvalsOnDevice(EagerOperation* op,
 
 using BoolTensorInputs = std::vector<std::pair<std::string, bool>>;
 
-// Removes boolean tensor inputs from the EagerOperation and returns them.
-// Currently this is only useful to invoke when small_constants_optimizer is
-// enabled because the runtime will have equivalent FunctionDefs of the original
-// tf.function without the boolean tensor input.
-StatusOr<BoolTensorInputs> RemoveBoolInputs(EagerOperation* op) {
+// Identifies boolean tensor inputs from the EagerOperation and returns them. If
+// delete_inputs is set to true then it will also delete them from the
+// function's input signature. Currently this is only useful to invoke when
+// small_constants_optimizer is enabled because the runtime will have equivalent
+// FunctionDefs of the original tf.function without the boolean tensor input.
+StatusOr<BoolTensorInputs> GetBoolInputs(EagerOperation* op,
+                                         bool delete_inputs) {
   BoolTensorInputs result;
   if (!op->is_function()) return result;
   // Extract tensor inputs.
@@ -1108,6 +1111,7 @@ StatusOr<BoolTensorInputs> RemoveBoolInputs(EagerOperation* op) {
     result.emplace_back(input_arg.name(), input_value);
   }
 
+  if (!delete_inputs) return result;
   // If we were able to identify all boolean inputs, update the op's inputs.
   op->Clear();
   for (auto* input : stripped_inputs) {
@@ -1327,7 +1331,8 @@ Status GetOrCreateKernelAndDevice(
   // Update the EagerOperation with information about the boolean input tensors
   // when small constant optimization is enabled.
   if (IsSmallConstantOptimizationEnabled(*op)) {
-    TF_ASSIGN_OR_RETURN(BoolTensorInputs bool_inputs, RemoveBoolInputs(op));
+    TF_ASSIGN_OR_RETURN(BoolTensorInputs bool_inputs,
+                        GetBoolInputs(op, /*delete_inputs=*/false));
     string folded_name = op->Name();
     for (const auto& [input_name, input_value] : bool_inputs) {
       folded_name = small_constants_optimizer::FoldedFunctionName(
diff --git a/tensorflow/core/common_runtime/eager/small_constants_optimizer.cc b/tensorflow/core/common_runtime/eager/small_constants_optimizer.cc
index 7d43f29029f0ce..29f2fe833d6e60 100644
--- a/tensorflow/core/common_runtime/eager/small_constants_optimizer.cc
+++ b/tensorflow/core/common_runtime/eager/small_constants_optimizer.cc
@@ -244,14 +244,14 @@ void GenerateTrueAndFalseFunctions(const FunctionDef& fdef,
   // Add f_true(s).
   auto true_fdefs =
       FoldBoolInputTensor(fdef, input_name_to_fold, /*input_value=*/true,
-                          /*delete_input=*/true, flib, folded_functions);
+                          /*delete_input=*/false, flib, folded_functions);
   for (FunctionDef& fdef : true_fdefs) DisableBoolInputFolding(fdef);
   result.insert(result.end(), std::make_move_iterator(true_fdefs.begin()),
                 std::make_move_iterator(true_fdefs.end()));
   // Add f_false(s).
   auto false_fdefs =
       FoldBoolInputTensor(fdef, input_name_to_fold, /*input_value=*/false,
-                          /*delete_input=*/true, flib, folded_functions);
+                          /*delete_input=*/false, flib, folded_functions);
   for (FunctionDef& fdef : false_fdefs) DisableBoolInputFolding(fdef);
   result.insert(result.end(), std::make_move_iterator(false_fdefs.begin()),
                 std::make_move_iterator(false_fdefs.end()));

From 189556e2126cf36015cd33f2e3fa3f6a7abd0687 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 22 Sep 2023 10:52:57 -0700
Subject: [PATCH 153/567] Check if op kernel is XlaLocalLaunchOp before op
 kernel GetOrCreate inside ExecuteOpConversion for MLRT

PiperOrigin-RevId: 567664326
---
 .../mlir/tfrt/tests/mlrt/tf_to_mlrt.mlir      | 22 +++++
 .../compiler/mlir/tfrt/transforms/mlrt/BUILD  |  2 +
 .../mlir/tfrt/transforms/mlrt/tf_to_mlrt.cc   | 95 ++++++++++++-------
 3 files changed, 83 insertions(+), 36 deletions(-)

diff --git a/tensorflow/compiler/mlir/tfrt/tests/mlrt/tf_to_mlrt.mlir b/tensorflow/compiler/mlir/tfrt/tests/mlrt/tf_to_mlrt.mlir
index b468f8f88f9b1f..94a28091c7235f 100644
--- a/tensorflow/compiler/mlir/tfrt/tests/mlrt/tf_to_mlrt.mlir
+++ b/tensorflow/compiler/mlir/tfrt/tests/mlrt/tf_to_mlrt.mlir
@@ -436,3 +436,25 @@ func.func @unused_future(%x: tensor<i32>) -> tensor<i32> {
   // CHECK: mlrt.await_all_control [[unused]]
   return %x : tensor<i32>
 }
+
+// -----
+
+// Test for XlaLaunch
+
+func.func private @xla_func_0(%arg0: tensor<1x3xf32>, %arg1: tensor<1x3xf32>) -> tensor<1x3xf32> attributes {tf._XlaMustCompile = true, tf._noinline = true, tf._original_func_name = "should_not_be_used"} {
+  %1 = "tf.AddV2"(%arg0, %arg1) {__op_key = 0: i32} : (tensor<1x3xf32>, tensor<1x3xf32>) -> tensor<1x3xf32>
+  func.return %1 : tensor<1x3xf32>
+}
+
+// CHECK-LABEL: func @xla_func
+func.func @xla_func(%arg0: tensor<1x3xf32>) -> tensor<*xf32> attributes {tf.entry_function = {control_outputs = "", inputs = "input:0", outputs = "output:0"}} {
+  %0 = "tf.VarHandleOp"() {__op_key = 1: i32, device = "/device:CPU:0", container = "", shared_name = "variable"} : () -> tensor<!tf_type.resource<tensor<1x3xf32>>>
+  %1 = "tf.ReadVariableOp"(%0) {__op_key = 2: i32, device = "/device:CPU:0"} : (tensor<!tf_type.resource<tensor<1x3xf32>>>) -> tensor<1x3xf32>
+  // CHECK: tf_mlrt.executeop
+  // CHECK: tf_mlrt.async_executeop{{.*}}op: \22XlaLaunch\22\0A
+  // CHECK: tf_mlrt.await
+  // CHECK: return
+  // CHECK-SAME: !tf_mlrt.tensor
+  %2 = "tf.XlaLaunch"(%arg0, %1) {__op_key = 3: i32, _noinline = true, _xla_compile_device_type = "GPU", device = "/device:GPU:0", function = @xla_func_0, operandSegmentSizes = array<i32: 0, 2, 0>} : (tensor<1x3xf32>, tensor<1x3xf32>) -> tensor<*xf32>
+  func.return %2 : tensor<*xf32>
+}
diff --git a/tensorflow/compiler/mlir/tfrt/transforms/mlrt/BUILD b/tensorflow/compiler/mlir/tfrt/transforms/mlrt/BUILD
index 2efb225e93b6e4..03558438ac6f6b 100644
--- a/tensorflow/compiler/mlir/tfrt/transforms/mlrt/BUILD
+++ b/tensorflow/compiler/mlir/tfrt/transforms/mlrt/BUILD
@@ -72,6 +72,8 @@ cc_library(
         "//tensorflow/compiler/mlir/tfrt/ir/mlrt:mlrt_ops",
         "//tensorflow/compiler/mlir/tfrt/ir/mlrt:tf_mlrt_ops",
         "//tensorflow/compiler/mlir/tfrt/ir/mlrt:tf_mlrt_tpu_ops",
+        "//tensorflow/core:framework",
+        "//tensorflow/core/platform:status",
         "//tensorflow/core/tfrt/fallback:fallback_state",
         "//tensorflow/core/tfrt/fallback:op_kernel_runner_cache",
         "@com_google_protobuf//:protobuf_headers",
diff --git a/tensorflow/compiler/mlir/tfrt/transforms/mlrt/tf_to_mlrt.cc b/tensorflow/compiler/mlir/tfrt/transforms/mlrt/tf_to_mlrt.cc
index 603167cb0f5ef8..5f689640e39bd4 100644
--- a/tensorflow/compiler/mlir/tfrt/transforms/mlrt/tf_to_mlrt.cc
+++ b/tensorflow/compiler/mlir/tfrt/transforms/mlrt/tf_to_mlrt.cc
@@ -47,6 +47,8 @@ limitations under the License.
 #include "tensorflow/compiler/mlir/tfrt/transforms/mlrt/tpu_conversion_patterns.h"
 #include "tensorflow/compiler/mlir/tfrt/transforms/mlrt/util.h"
 #include "tensorflow/compiler/mlir/tfrt/transforms/utils.h"
+#include "tensorflow/core/framework/node_def_util.h"
+#include "tensorflow/core/platform/status.h"
 #include "tensorflow/core/tfrt/fallback/fallback_state.h"
 #include "tensorflow/core/tfrt/fallback/op_kernel_runner_cache.h"
 
@@ -54,7 +56,8 @@ namespace tensorflow {
 namespace mlrt_compiler {
 namespace {
 
-// TODO(chky): Add registration interface for custom device
+constexpr char kXlaLaunchOp[] = "XlaLaunch";
+
 mlir::Value CreateCustomDevice(mlir::Location loc, llvm::StringRef device_name,
                                mlir::ConversionPatternRewriter &rewriter) {
   if (device_name == kTpuHostDevice) {
@@ -421,20 +424,6 @@ class ExecuteOpConversion final : public mlir::ConversionPattern {
     std::string node_def_text;
     google::protobuf::TextFormat::PrintToString(node_def, &node_def_text);
 
-    auto op_kernel_runner = op_kernel_cache_.GetOrCreate(
-        tfrt::Location(nullptr, execute_key), node_def.op(), node_def.device(),
-        op->getNumOperands(),
-        [&](tensorflow::AttrValueMap *attr_value_map) {
-          *attr_value_map = node_def.attr();
-          return OkStatus();
-        },
-        fallback_state_.device_manager(),
-        fallback_state_.process_function_library_runtime());
-    // TODO(290630314): Use LOG_IF when absl logging is available
-    if (!op_kernel_runner.ok()) {
-      std::cerr << op_kernel_runner.status() << "\n";
-    }
-
     mlir::Value device;
     if (auto custom_device =
             op->getAttrOfType<mlir::StringAttr>(kTfMlrtCustomDevice)) {
@@ -444,11 +433,8 @@ class ExecuteOpConversion final : public mlir::ConversionPattern {
     }
 
     mlir::Operation *new_op = nullptr;
-    if (op_kernel_runner.ok() && (*op_kernel_runner)->IsAsync()) {
-      // If it is an AsyncOpKernel, we lower it to tf_mlrt.async_executeop,
-      // which return !mlrt.futures. These results will be converted as
-      // necessary through the target materialization hook in the type
-      // converter.
+
+    auto create_async_execute_ops = [&]() -> mlir::LogicalResult {
       llvm::SmallVector<mlir::Type, 4> result_types(
           op->getNumResults(), rewriter.getType<mlrt::compiler::FutureType>());
       if (device) {
@@ -462,25 +448,62 @@ class ExecuteOpConversion final : public mlir::ConversionPattern {
               execute_op_registry_.RegisterExecuteOp(new_op, execute_key))) {
         return op->emitWarning("Fail to register async op");
       }
+      return mlir::success();
+    };
+
+    // TODO(b/300999257): check whether to clean up for AoT mockGpu case later
+
+    if (node_def.op() == kXlaLaunchOp) {
+      // The XlaLaunch op is an AsyncOpKernel, so we lower it to
+      // tf_mlrt.async_executeop, which returns !mlrt.futures. These results
+      // will be converted as necessary through the target materialization
+      // hook in the type converter.
+      if (mlir::failed(create_async_execute_ops())) {
+        return mlir::failure();
+      }
     } else {
-      // Otherwise, lower to tf_mlrt.executeop.
-      llvm::SmallVector<mlir::Type, 4> result_types(
-          op->getNumResults(), rewriter.getType<tf_mlrt::TFTensorType>());
-      if (device) {
-        new_op = rewriter.replaceOpWithNewOp<tf_mlrt::ExecuteOpWithDevice>(
-            op, result_types, device, operands, node_def_text, execute_key);
-      } else {
-        new_op = rewriter.replaceOpWithNewOp<tf_mlrt::ExecuteOp>(
-            op, result_types, operands, node_def_text, execute_key);
+      auto op_kernel_runner = op_kernel_cache_.GetOrCreate(
+          tfrt::Location(nullptr, execute_key), node_def.op(),
+          node_def.device(), op->getNumOperands(),
+          [&](tensorflow::AttrValueMap *attr_value_map) {
+            *attr_value_map = node_def.attr();
+            return OkStatus();
+          },
+          fallback_state_.device_manager(),
+          fallback_state_.process_function_library_runtime());
+      // TODO(290630314): Use LOG_IF when absl logging is available
+      if (!op_kernel_runner.ok()) {
+        std::cerr << op_kernel_runner.status() << "\n";
       }
 
-      if (op_kernel_runner.ok()) {
-        // Only register this executeop if its opkernel can be created.
-        // Otherwise, it is an unused op so we don't need to create them at
-        // runtime.
-        if (mlir::failed(
-                execute_op_registry_.RegisterExecuteOp(new_op, execute_key))) {
-          return op->emitWarning("Fail to register sync op");
+      if (op_kernel_runner.ok() && (*op_kernel_runner)->IsAsync()) {
+        // If it is an AsyncOpKernel, we lower it to tf_mlrt.async_executeop,
+        // which return !mlrt.futures. These results will be converted as
+        // necessary through the target materialization hook in the type
+        // converter.
+        if (mlir::failed(create_async_execute_ops())) {
+          return mlir::failure();
+        }
+      } else {
+        // Otherwise, lower to tf_mlrt.executeop.
+        llvm::SmallVector<mlir::Type, 4> result_types(
+            op->getNumResults(), rewriter.getType<tf_mlrt::TFTensorType>());
+        if (device) {
+          new_op = rewriter.replaceOpWithNewOp<tf_mlrt::ExecuteOpWithDevice>(
+              op, result_types, device, operands, node_def_text, execute_key);
+        } else {
+          new_op = rewriter.replaceOpWithNewOp<tf_mlrt::ExecuteOp>(
+              op, result_types, operands, node_def_text, execute_key);
+        }
+
+        if (op_kernel_runner.ok()) {
+          // Only register this executeop if its opkernel can be created.
+          // Otherwise, it is an unused op so we don't need to create them at
+          // runtime.
+          if (mlir::failed(execute_op_registry_.RegisterExecuteOp(
+                  new_op, execute_key))) {
+            return op->emitWarning("Fail to register sync op");
+          }
         }
       }
     }

From 85ee33e2395f0881c5f20be651ef3fc56a246347 Mon Sep 17 00:00:00 2001
From: Neel Kovelamudi <nkovela@google.com>
Date: Fri, 22 Sep 2023 11:00:23 -0700
Subject: [PATCH 154/567] Replicate small constants so they don't need to be
 sent to their successors. A small constant is replicated to each of its
 successo...

PiperOrigin-RevId: 567666332
---
 tensorflow/core/common_runtime/BUILD          |  50 +--
 .../replicate_constants_pass.cc               | 189 ----------
 .../common_runtime/replicate_constants_pass.h |  50 ---
 .../replicate_constants_pass_test.cc          | 334 ------------------
 tensorflow/core/config/flag_defs.h            |   4 -
 tensorflow/core/config/flags_api_wrapper.cc   |   1 -
 tensorflow/python/flags_pybind.pyi            |   1 -
 7 files changed, 9 insertions(+), 620 deletions(-)
 delete mode 100644 tensorflow/core/common_runtime/replicate_constants_pass.cc
 delete mode 100644 tensorflow/core/common_runtime/replicate_constants_pass.h
 delete mode 100644 tensorflow/core/common_runtime/replicate_constants_pass_test.cc

diff --git a/tensorflow/core/common_runtime/BUILD b/tensorflow/core/common_runtime/BUILD
index 1db3c0d12ed9f5..4b7c5eb8b7c29d 100644
--- a/tensorflow/core/common_runtime/BUILD
+++ b/tensorflow/core/common_runtime/BUILD
@@ -1,9 +1,7 @@
-load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda")
 load(
     "//tensorflow:tensorflow.bzl",
     "if_google",
     "if_libtpu",
-    "if_macos",
     "if_oss",
     "if_zendnn",
     "tf_cc_test",
@@ -23,24 +21,25 @@ load(
     "tf_protos_all",
     "tf_protos_grappler",
 )
-load(
-    "//tensorflow/core/platform:build_config_root.bzl",
-    "if_static",
-    "tf_cuda_tests_tags",
-)
 load(
     "//tensorflow/core/platform:rules_cc.bzl",
     "cc_library",
 )
 load(
-    "//tensorflow/security/fuzzing:tf_fuzzing.bzl",
-    "tf_cc_fuzz_test",
+    "//tensorflow/core/platform:build_config_root.bzl",
+    "if_static",
+    "tf_cuda_tests_tags",
 )
+load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda")
 load(
     "//third_party/mkl:build_defs.bzl",
     "if_mkl",
     "if_mkl_ml",
 )
+load(
+    "//tensorflow/security/fuzzing:tf_fuzzing.bzl",
+    "tf_cc_fuzz_test",
+)
 
 default_package_visibility = [
     "//tensorflow:internal",
@@ -300,7 +299,6 @@ filegroup(
         "renamed_device.h",
         "rendezvous_mgr.h",
         "rendezvous_util.h",
-        "replicate_constants_pass.h",
         "replicate_per_replica_nodes.h",
         "ring_alg.h",
         "ring_gatherer.h",
@@ -1134,31 +1132,6 @@ cc_library(
     alwayslink = 1,
 )
 
-cc_library(
-    name = "replicate_constants_pass",
-    srcs = ["replicate_constants_pass.cc"],
-    hdrs = ["replicate_constants_pass.h"],
-    copts = tf_copts(),
-    deps = [
-        ":optimization_registry",
-        "//tensorflow/core:core_cpu_base",
-        "//tensorflow/core:framework",
-        "//tensorflow/core:portable_gif_internal",
-        "//tensorflow/core/config:flag_defs",
-        "//tensorflow/core/config:flags",
-        "//tensorflow/core/framework:node_def_util",
-        "//tensorflow/core/framework:tensor_proto_cc",
-        "//tensorflow/core/framework:tensor_shape_proto_cc",
-        "@com_google_absl//absl/container:btree",
-        "@com_google_absl//absl/status",
-        "@com_google_absl//absl/strings",
-        "@local_tsl//tsl/platform:errors",
-        "@local_tsl//tsl/platform:status",
-        "@local_tsl//tsl/platform:statusor",
-    ],
-    alwayslink = 1,
-)
-
 cc_library(
     name = "local_device",
     srcs = ["local_device.cc"],
@@ -1975,10 +1948,7 @@ tf_cuda_library(
         ":step_stats_collector",
         ":threadpool_device",
         ":threadpool_device_factory",
-    ] + if_zendnn([":zen_layout_pass"]) + if_macos(
-        [],
-        [":replicate_constants_pass"],
-    ),
+    ] + if_zendnn([":zen_layout_pass"]),
 )
 
 tf_cuda_library(
@@ -2342,7 +2312,6 @@ tf_cc_tests(
         "optimization_registry_test.cc",
         "pending_counts_test.cc",
         "placer_inspection_required_ops_utils_test.cc",
-        "replicate_constants_pass_test.cc",
         "session_test.cc",
         "threadpool_device_test.cc",
     ],
@@ -2373,7 +2342,6 @@ tf_cc_tests(
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
         "//tensorflow/core:testlib",
-        "//tensorflow/core/config:flag_defs",
         "//tensorflow/core/kernels:ops_util",
         "//tensorflow/core/nccl:collective_communicator",
         "//tensorflow/core/platform:regexp",
diff --git a/tensorflow/core/common_runtime/replicate_constants_pass.cc b/tensorflow/core/common_runtime/replicate_constants_pass.cc
deleted file mode 100644
index 73f96d66f940bb..00000000000000
--- a/tensorflow/core/common_runtime/replicate_constants_pass.cc
+++ /dev/null
@@ -1,189 +0,0 @@
-/* Copyright 2023 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "tensorflow/core/common_runtime/replicate_constants_pass.h"
-
-#include <algorithm>
-#include <cstdint>
-#include <limits>
-#include <string>
-#include <vector>
-
-#include "absl/container/btree_map.h"
-#include "absl/status/status.h"
-#include "absl/strings/str_cat.h"
-#include "tensorflow/core/common_runtime/optimization_registry.h"
-#include "tensorflow/core/config/flag_defs.h"
-#include "tensorflow/core/config/flags.h"
-#include "tensorflow/core/framework/node_def_util.h"
-#include "tensorflow/core/framework/tensor.h"
-#include "tensorflow/core/framework/tensor.pb.h"
-#include "tensorflow/core/framework/tensor_shape.h"
-#include "tensorflow/core/framework/tensor_shape.pb.h"
-#include "tensorflow/core/graph/graph.h"
-#include "tensorflow/core/platform/logging.h"
-#include "tensorflow/core/util/device_name_utils.h"
-#include "tensorflow/core/util/dump_graph.h"
-#include "tsl/platform/errors.h"
-#include "tsl/platform/status.h"
-#include "tsl/platform/statusor.h"
-
-namespace tensorflow {
-namespace {
-
-// Maximum size constant to replicate.
-constexpr int64_t kMaxSize = 16;
-
-// Set `node`'s name to <original-name>/replicate/_<unique-index>
-void SetUniqueName(Graph* graph, Node* node) {
-  node->set_name(graph->NewName(absl::StrCat(node->name(), "/replicate")));
-}
-
-// `node` has an output control edge.
-bool HasControlOut(Node* node) {
-  auto control_out_it =
-      std::find_if(node->out_edges().begin(), node->out_edges().end(),
-                   [](const auto& e) { return e->IsControlEdge(); });
-  return control_out_it != node->out_edges().end();
-}
-
-// `node`'s device is a CPU.
-bool HasCpuDevice(const Node* node) {
-  DeviceNameUtils::ParsedName device;
-  if (!DeviceNameUtils::ParseFullName(node->assigned_device_name(), &device))
-    return false;
-  return device.type == "CPU";
-}
-
-// Get the CPU device on the same host as dst.
-Status GetDestinationCpuDevice(const Node* dst, std::string* device) {
-  if (!dst->has_assigned_device_name())
-    return absl::AbortedError(
-        absl::StrCat("Node name: ", dst->name(), " has no assigned device."));
-  return DeviceNameUtils::DeviceNameToCpuDeviceName(dst->assigned_device_name(),
-                                                    device);
-}
-
-// Collect the successor edges of the constant. Group them by the device of the
-// successor.
-Status GetSuccessorEdges(
-    Node* node,
-    absl::btree_map<std::string, std::vector<const Edge*>>& device_to_edges) {
-  for (const auto& edge : node->out_edges()) {
-    const Node* dst = edge->dst();
-    std::string device;
-    TF_RETURN_IF_ERROR(GetDestinationCpuDevice(dst, &device));
-    if (!device_to_edges.count(device)) device_to_edges.insert({device, {}});
-    device_to_edges[device].push_back(edge);
-  }
-  return OkStatus();
-}
-
-// Replicate the constant to each successor device.
-void ReplicateToEachDevice(
-    Graph* graph, Node* node,
-    absl::btree_map<std::string, std::vector<const Edge*>>& device_to_edges) {
-  for (const auto& pair : device_to_edges) {
-    Node* copy = graph->CopyNode(node);
-    SetUniqueName(graph, copy);
-    const std::string device = pair.first;
-    copy->set_assigned_device_name(device);
-    // Set the successor edges to ops on this device.
-    for (const Edge* edge : pair.second) {
-      graph->AddEdge(copy, edge->src_output(), edge->dst(), edge->dst_input());
-    }
-    // Replicate in edges that are control.
-    for (Node* src : node->in_nodes()) {
-      graph->AddControlEdge(src, copy, true);
-    }
-  }
-  graph->RemoveNode(node);
-}
-
-}  // namespace
-
-Status ReplicateConstantsPass::Run(
-    const GraphOptimizationPassOptions& options) {
-  if (!flags::Global().replicate_small_constants.value()) {
-    VLOG(1) << "replicate_constants_pass not enabled";
-    return OkStatus();
-  }
-  VLOG(1) << "replicate_constants_pass will replicate constants with "
-             "number-of-elements <= "
-          << kMaxSize;
-
-  Graph* graph = options.graph->get();
-  if (VLOG_IS_ON(1)) {
-    VLOG(1) << DumpGraphToFile("before_replicate_constants_pass", *graph,
-                               options.flib_def);
-  }
-  int64_t min_skipped = std::numeric_limits<int64_t>::max();
-  int64_t max_skipped = std::numeric_limits<int64_t>::min();
-  for (Node* node : graph->nodes()) {
-    if (!node->IsConstant()) continue;
-
-    // For performance, skip when there is at most one successor.
-    if (node->out_edges().size() <= 1) continue;
-
-    // Skip if the constant has a control successor. Replicating constants with
-    // control successors would require relpicating these control edges, which
-    // could result in even more message passing.
-    if (HasControlOut(node)) continue;
-
-    // Skip if the constant is too large.
-    const TensorProto* value = nullptr;
-    TF_RETURN_IF_ERROR(GetNodeAttr(node->attrs(), "value", &value));
-    TF_ASSIGN_OR_RETURN(TensorShape shape,
-                        TensorShape::BuildTensorShape(value->tensor_shape()));
-    if (shape.num_elements() > kMaxSize) {
-      min_skipped = std::min(min_skipped, shape.num_elements());
-      max_skipped = std::max(max_skipped, shape.num_elements());
-      continue;
-    }
-
-    // Skip if there is no assigned device.
-    if (!node->has_assigned_device_name()) continue;
-
-    // Skip when the original constant is not on a CPU, because is not clear
-    // whether replicating from non-CPU to CPU is valid.
-    if (!HasCpuDevice(node)) continue;
-
-    // Collect successor edges, per device.
-    absl::btree_map<std::string, std::vector<const Edge*>> device_to_edges;
-    TF_RETURN_IF_ERROR(GetSuccessorEdges(node, device_to_edges));
-
-    // Skip if all successors are on the same device.
-    if (device_to_edges.size() <= 1) continue;
-
-    // Replicate the constant to each successor device.
-    ReplicateToEachDevice(graph, node, device_to_edges);
-  }
-  if (min_skipped != std::numeric_limits<int64_t>::max()) {
-    VLOG(1) << "replicate_constants_pass skipped replicating constants with "
-               "number of elements in the range "
-            << min_skipped << " to " << max_skipped << ".";
-  }
-
-  if (VLOG_IS_ON(1)) {
-    VLOG(1) << DumpGraphToFile("after_replicate_constants_pass", *graph,
-                               options.flib_def);
-  }
-  return OkStatus();
-}
-
-REGISTER_OPTIMIZATION(OptimizationPassRegistry::POST_PLACEMENT, 3,
-                      ReplicateConstantsPass);
-
-}  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/replicate_constants_pass.h b/tensorflow/core/common_runtime/replicate_constants_pass.h
deleted file mode 100644
index b7b2f0fe98c0d2..00000000000000
--- a/tensorflow/core/common_runtime/replicate_constants_pass.h
+++ /dev/null
@@ -1,50 +0,0 @@
-/* Copyright 2023 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_REPLICATE_CONSTANTS_PASS_H_
-#define TENSORFLOW_CORE_COMMON_RUNTIME_REPLICATE_CONSTANTS_PASS_H_
-
-#include "tensorflow/core/common_runtime/optimization_registry.h"
-
-// Small constants are replicated to the hosts of their successors. This pass
-// only applies when there are multiple successors.
-//
-// For example, the graph:
-//   C -> {Op0, Op1, Op2, Op3}
-//   C's assigned_device is /job:tpu_host_worker/replica:0/task:0/device:CPU:0
-//   Op0's assigned_device is /job:tpu_host_worker/replica:0/task:0/device:TPU:0
-//   Op1's assigned_device is /job:tpu_host_worker/replica:0/task:0/device:TPU:1
-//   Op2's assigned_device is /job:tpu_host_worker/replica:0/task:1/device:TPU:0
-//   Op3's assigned_device is /job:tpu_host_worker/replica:0/task:1/device:TPU:1
-// is rewritten to:
-//   C0 -> {Op0, Op1}
-//   C1 -> {Op2, Op3}
-//   C0's assigned_device is /job:tpu_host_worker/replica:0/task:0/device:CPU:0
-//   C1's assigned_device is /job:tpu_host_worker/replica:0/task:1/device:CPU:0
-//   Op0's assigned_device is /job:tpu_host_worker/replica:0/task:0/device:TPU:0
-//   Op1's assigned_device is /job:tpu_host_worker/replica:0/task:0/device:TPU:1
-//   Op2's assigned_device is /job:tpu_host_worker/replica:0/task:1/device:TPU:0
-//   Op3's assigned_device is /job:tpu_host_worker/replica:0/task:1/device:TPU:1
-
-namespace tensorflow {
-
-class ReplicateConstantsPass : public GraphOptimizationPass {
- public:
-  Status Run(const GraphOptimizationPassOptions& options) override;
-};
-
-}  // namespace tensorflow
-
-#endif  // TENSORFLOW_CORE_COMMON_RUNTIME_REPLICATE_CONSTANTS_PASS_H_
diff --git a/tensorflow/core/common_runtime/replicate_constants_pass_test.cc b/tensorflow/core/common_runtime/replicate_constants_pass_test.cc
deleted file mode 100644
index dae22012bfba8b..00000000000000
--- a/tensorflow/core/common_runtime/replicate_constants_pass_test.cc
+++ /dev/null
@@ -1,334 +0,0 @@
-/* Copyright 2023 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "tensorflow/core/common_runtime/replicate_constants_pass.h"
-
-#include <memory>
-#include <string>
-
-#include "tensorflow/cc/framework/ops.h"
-#include "tensorflow/cc/framework/scope.h"
-#include "tensorflow/cc/ops/const_op.h"
-#include "tensorflow/cc/ops/math_ops.h"
-#include "tensorflow/core/common_runtime/optimization_registry.h"
-#include "tensorflow/core/config/flag_defs.h"
-#include "tensorflow/core/config/flags.h"
-#include "tensorflow/core/framework/op.h"
-#include "tensorflow/core/framework/tensor_shape.h"
-#include "tensorflow/core/graph/graph.h"
-#include "tensorflow/core/platform/test.h"
-#include "tsl/lib/core/status_test_util.h"
-#include "tsl/platform/status.h"
-#include "tsl/platform/test.h"
-
-namespace tensorflow {
-
-const char kCpu0[] = "/job:tpu_host_worker/replica:0/task:0/device:CPU:0";
-const char kCpu1[] = "/job:tpu_host_worker/replica:0/task:1/device:CPU:0";
-const char kTpu00[] = "/job:tpu_host_worker/replica:0/task:0/device:TPU:0";
-const char kTpu01[] = "/job:tpu_host_worker/replica:0/task:0/device:TPU:1";
-const char kTpu10[] = "/job:tpu_host_worker/replica:0/task:1/device:TPU:0";
-const char kTpu11[] = "/job:tpu_host_worker/replica:0/task:1/device:TPU:1";
-
-// Return the node with name `name`.
-Node* GetNode(const Graph& graph, const std::string& name) {
-  for (Node* node : graph.nodes()) {
-    if (node->name() == name) return node;
-  }
-  CHECK(false) << "Unknown node name: " << name;
-  return nullptr;
-}
-
-// Return the first predecessor of `node`.
-Node* GetPredecessor(Node* node) {
-  auto it = node->in_nodes().begin();
-  CHECK(it != node->in_nodes().end())
-      << "No predecessor for " << node->name() << "\n";
-  return *it;
-}
-
-// There exists an edge from `src` to `dst`.
-bool IsEdge(Node* src, Node* dst) {
-  for (Node* node : src->out_nodes()) {
-    if (node == dst) return true;
-  }
-  return false;
-}
-
-// Test that a small constant is replicated to each successor's device.
-TEST(ReplicateConstantsPassTest, TestSmallConstant) {
-  std::unique_ptr<Graph> graph(new Graph(OpRegistry::Global()));
-  {
-    Scope scope = Scope::NewRootScope().ExitOnError();
-    Output const0 =
-        ops::Const(scope.WithOpName("const"), 1.0f, TensorShape({}));
-    ops::Negate dst0(scope.WithOpName("dst0"), const0);
-    ops::Negate dst1(scope.WithOpName("dst1"), const0);
-    ops::Negate dst2(scope.WithOpName("dst2"), const0);
-    TF_CHECK_OK(scope.ToGraph(graph.get()));
-  }
-  GetNode(*graph, "const")->set_assigned_device_name(kCpu0);
-  GetNode(*graph, "dst0")->set_assigned_device_name(kCpu0);
-  GetNode(*graph, "dst1")->set_assigned_device_name(kCpu1);
-  GetNode(*graph, "dst2")->set_assigned_device_name(kCpu1);
-
-  // Enable the pass.
-  flags::Global().replicate_small_constants.reset(true);
-
-  GraphDef before;
-  graph->ToGraphDef(&before);
-  GraphOptimizationPassOptions options;
-  options.graph = &graph;
-  ReplicateConstantsPass pass;
-  TF_ASSERT_OK(pass.Run(options));
-  GraphDef actual;
-  graph->ToGraphDef(&actual);
-
-  Node* dst0 = GetNode(*graph, "dst0");
-  Node* dst1 = GetNode(*graph, "dst1");
-  Node* dst2 = GetNode(*graph, "dst2");
-  EXPECT_EQ(dst0->assigned_device_name(),
-            GetPredecessor(dst0)->assigned_device_name());
-  EXPECT_EQ(dst1->assigned_device_name(),
-            GetPredecessor(dst1)->assigned_device_name());
-  EXPECT_EQ(dst2->assigned_device_name(),
-            GetPredecessor(dst2)->assigned_device_name());
-}
-
-// Test that a large constant is ignored.
-TEST(ReplicateConstantsPassTest, TestLargeConstant) {
-  std::unique_ptr<Graph> graph(new Graph(OpRegistry::Global()));
-  {
-    Scope scope = Scope::NewRootScope().ExitOnError();
-    Output const0 =
-        ops::Const(scope.WithOpName("const"),
-                   {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16});
-    ops::Negate dst0(scope.WithOpName("dst0"), const0);
-    ops::Negate dst1(scope.WithOpName("dst1"), const0);
-    ops::Negate dst2(scope.WithOpName("dst2"), const0);
-    TF_CHECK_OK(scope.ToGraph(graph.get()));
-  }
-  GetNode(*graph, "const")->set_assigned_device_name(kCpu0);
-  GetNode(*graph, "dst0")->set_assigned_device_name(kCpu0);
-  GetNode(*graph, "dst1")->set_assigned_device_name(kCpu1);
-  GetNode(*graph, "dst2")->set_assigned_device_name(kCpu1);
-
-  // Enable the pass.
-  flags::Global().replicate_small_constants.reset(true);
-
-  GraphDef before;
-  graph->ToGraphDef(&before);
-  GraphOptimizationPassOptions options;
-  options.graph = &graph;
-  ReplicateConstantsPass pass;
-  TF_ASSERT_OK(pass.Run(options));
-  GraphDef actual;
-  graph->ToGraphDef(&actual);
-
-  Node* dst0 = GetNode(*graph, "dst0");
-  Node* dst1 = GetNode(*graph, "dst1");
-  Node* dst2 = GetNode(*graph, "dst2");
-  EXPECT_EQ(dst0->assigned_device_name(),
-            GetPredecessor(dst0)->assigned_device_name());
-  EXPECT_NE(dst1->assigned_device_name(),
-            GetPredecessor(dst1)->assigned_device_name());
-  EXPECT_NE(dst2->assigned_device_name(),
-            GetPredecessor(dst2)->assigned_device_name());
-}
-
-// Test that a constant with a control successor is ignored.
-TEST(ReplicateConstantsPassTest, TestControlOut) {
-  std::unique_ptr<Graph> graph(new Graph(OpRegistry::Global()));
-  {
-    Scope scope = Scope::NewRootScope().ExitOnError();
-    Output const0 =
-        ops::Const(scope.WithOpName("const0"), 1.0f, TensorShape({}));
-    Output ctrl_succ =
-        ops::Const(scope.WithOpName("ctrl_succ"), 1.0f, TensorShape({}));
-    ops::Negate dst0(scope.WithOpName("dst0"), const0);
-    ops::Negate dst1(scope.WithOpName("dst1"), const0);
-    ops::Negate dst2(scope.WithOpName("dst2"), const0);
-    TF_CHECK_OK(scope.ToGraph(graph.get()));
-  }
-  GetNode(*graph, "const0")->set_assigned_device_name(kCpu0);
-  GetNode(*graph, "ctrl_succ")->set_assigned_device_name(kCpu0);
-  GetNode(*graph, "dst0")->set_assigned_device_name(kCpu0);
-  GetNode(*graph, "dst1")->set_assigned_device_name(kCpu1);
-  GetNode(*graph, "dst2")->set_assigned_device_name(kCpu1);
-  graph->AddControlEdge(GetNode(*graph, "const0"),
-                        GetNode(*graph, "ctrl_succ"));
-
-  // Enable the pass.
-  flags::Global().replicate_small_constants.reset(true);
-
-  GraphDef before;
-  graph->ToGraphDef(&before);
-  GraphOptimizationPassOptions options;
-  options.graph = &graph;
-  ReplicateConstantsPass pass;
-  TF_ASSERT_OK(pass.Run(options));
-  GraphDef actual;
-  graph->ToGraphDef(&actual);
-
-  Node* dst0 = GetNode(*graph, "dst0");
-  Node* dst1 = GetNode(*graph, "dst1");
-  Node* dst2 = GetNode(*graph, "dst2");
-  EXPECT_EQ(dst0->assigned_device_name(),
-            GetPredecessor(dst0)->assigned_device_name());
-  EXPECT_NE(dst1->assigned_device_name(),
-            GetPredecessor(dst1)->assigned_device_name());
-  EXPECT_NE(dst2->assigned_device_name(),
-            GetPredecessor(dst2)->assigned_device_name());
-}
-
-// Test that a constant on a TPU is ignored.
-TEST(ReplicateConstantsPassTest, TestTpuConst) {
-  std::unique_ptr<Graph> graph(new Graph(OpRegistry::Global()));
-  {
-    Scope scope = Scope::NewRootScope().ExitOnError();
-    Output const0 =
-        ops::Const(scope.WithOpName("const0"), 1.0f, TensorShape({}));
-    ops::Negate dst0(scope.WithOpName("dst0"), const0);
-    ops::Negate dst1(scope.WithOpName("dst1"), const0);
-    ops::Negate dst2(scope.WithOpName("dst2"), const0);
-    TF_CHECK_OK(scope.ToGraph(graph.get()));
-  }
-  GetNode(*graph, "const0")->set_assigned_device_name(kTpu00);
-  GetNode(*graph, "dst0")->set_assigned_device_name(kTpu00);
-  GetNode(*graph, "dst1")->set_assigned_device_name(kTpu10);
-  GetNode(*graph, "dst2")->set_assigned_device_name(kTpu10);
-
-  // Enable the pass.
-  flags::Global().replicate_small_constants.reset(true);
-
-  GraphDef before;
-  graph->ToGraphDef(&before);
-  GraphOptimizationPassOptions options;
-  options.graph = &graph;
-  ReplicateConstantsPass pass;
-  TF_ASSERT_OK(pass.Run(options));
-  GraphDef actual;
-  graph->ToGraphDef(&actual);
-
-  Node* dst0 = GetNode(*graph, "dst0");
-  Node* dst1 = GetNode(*graph, "dst1");
-  Node* dst2 = GetNode(*graph, "dst2");
-  EXPECT_EQ(dst0->assigned_device_name(),
-            GetPredecessor(dst0)->assigned_device_name());
-  EXPECT_NE(dst1->assigned_device_name(),
-            GetPredecessor(dst1)->assigned_device_name());
-  EXPECT_NE(dst2->assigned_device_name(),
-            GetPredecessor(dst2)->assigned_device_name());
-}
-
-TEST(ReplicateConstantsPassTest, TestSmallAndLargeConstants) {
-  std::unique_ptr<Graph> graph(new Graph(OpRegistry::Global()));
-  {
-    Scope scope = Scope::NewRootScope().ExitOnError();
-    Output small = ops::Const(scope.WithOpName("small"), 1.0f, TensorShape({}));
-    Output large =
-        ops::Const(scope.WithOpName("large"),
-                   {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f,
-                    10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f, 16.0f});
-    ops::Add dst0(scope.WithOpName("dst0"), small, large);
-    ops::Add dst1(scope.WithOpName("dst1"), small, large);
-    ops::Add dst2(scope.WithOpName("dst2"), small, large);
-    TF_CHECK_OK(scope.ToGraph(graph.get()));
-  }
-  GetNode(*graph, "small")->set_assigned_device_name(kCpu0);
-  GetNode(*graph, "large")->set_assigned_device_name(kCpu0);
-  GetNode(*graph, "dst0")->set_assigned_device_name(kCpu0);
-  GetNode(*graph, "dst1")->set_assigned_device_name(kCpu1);
-  GetNode(*graph, "dst2")->set_assigned_device_name(kCpu1);
-
-  // Enable the pass.
-  flags::Global().replicate_small_constants.reset(true);
-
-  GraphDef before;
-  graph->ToGraphDef(&before);
-  GraphOptimizationPassOptions options;
-  options.graph = &graph;
-  ReplicateConstantsPass pass;
-  TF_ASSERT_OK(pass.Run(options));
-  GraphDef actual;
-  graph->ToGraphDef(&actual);
-
-  Node* small0 = GetNode(*graph, "small/replicate/_0");
-  Node* small1 = GetNode(*graph, "small/replicate/_1");
-  Node* large = GetNode(*graph, "large");
-  Node* dst0 = GetNode(*graph, "dst0");
-  Node* dst1 = GetNode(*graph, "dst1");
-  Node* dst2 = GetNode(*graph, "dst2");
-  EXPECT_EQ(small0->assigned_device_name(), kCpu0);
-  EXPECT_EQ(small1->assigned_device_name(), kCpu1);
-  EXPECT_EQ(large->assigned_device_name(), kCpu0);
-  EXPECT_EQ(dst0->assigned_device_name(), kCpu0);
-  EXPECT_EQ(dst1->assigned_device_name(), kCpu1);
-  EXPECT_EQ(dst1->assigned_device_name(), kCpu1);
-  EXPECT_TRUE(IsEdge(small0, dst0));
-  EXPECT_TRUE(IsEdge(large, dst0));
-  EXPECT_TRUE(IsEdge(small1, dst1));
-  EXPECT_TRUE(IsEdge(large, dst1));
-  EXPECT_TRUE(IsEdge(small1, dst2));
-  EXPECT_TRUE(IsEdge(large, dst2));
-}
-
-// Test that a constant at a CPU with TPU successors is replicated to the
-// TPUs' host CPUs.
-TEST(ReplicateConstantsPassTest, TestTpuDestinations) {
-  std::unique_ptr<Graph> graph(new Graph(OpRegistry::Global()));
-  {
-    Scope scope = Scope::NewRootScope().ExitOnError();
-    Output const0 =
-        ops::Const(scope.WithOpName("const"), 1.0f, TensorShape({}));
-    ops::Negate dst00(scope.WithOpName("dst00"), const0);
-    ops::Negate dst01(scope.WithOpName("dst01"), const0);
-    ops::Negate dst10(scope.WithOpName("dst10"), const0);
-    ops::Negate dst11(scope.WithOpName("dst11"), const0);
-    TF_CHECK_OK(scope.ToGraph(graph.get()));
-  }
-  GetNode(*graph, "const")->set_assigned_device_name(kCpu0);
-  GetNode(*graph, "dst00")->set_assigned_device_name(kTpu00);
-  GetNode(*graph, "dst01")->set_assigned_device_name(kTpu01);
-  GetNode(*graph, "dst10")->set_assigned_device_name(kTpu10);
-  GetNode(*graph, "dst11")->set_assigned_device_name(kTpu11);
-
-  // Enable the pass.
-  flags::Global().replicate_small_constants.reset(true);
-
-  GraphDef before;
-  graph->ToGraphDef(&before);
-  GraphOptimizationPassOptions options;
-  options.graph = &graph;
-  ReplicateConstantsPass pass;
-  TF_ASSERT_OK(pass.Run(options));
-  GraphDef actual;
-  graph->ToGraphDef(&actual);
-
-  Node* const0 = GetNode(*graph, "const/replicate/_0");
-  Node* const1 = GetNode(*graph, "const/replicate/_1");
-  Node* dst00 = GetNode(*graph, "dst00");
-  Node* dst01 = GetNode(*graph, "dst01");
-  Node* dst10 = GetNode(*graph, "dst10");
-  Node* dst11 = GetNode(*graph, "dst11");
-  EXPECT_EQ(const0->assigned_device_name(), kCpu0);
-  EXPECT_EQ(const1->assigned_device_name(), kCpu1);
-  EXPECT_TRUE(IsEdge(const0, dst00));
-  EXPECT_TRUE(IsEdge(const0, dst01));
-  EXPECT_TRUE(IsEdge(const1, dst10));
-  EXPECT_TRUE(IsEdge(const1, dst11));
-}
-
-}  // namespace tensorflow
diff --git a/tensorflow/core/config/flag_defs.h b/tensorflow/core/config/flag_defs.h
index 89ea6a9b73bbf1..4ab5fb4750de46 100644
--- a/tensorflow/core/config/flag_defs.h
+++ b/tensorflow/core/config/flag_defs.h
@@ -49,10 +49,6 @@ class Flags {
   TF_DECLARE_FLAG(more_stack_traces, false,
                   "Enable experimental code that preserves and propagates "
                   "graph node stack traces in C++.");
-  TF_DECLARE_FLAG(replicate_small_constants, false,
-                  "Enable a graph optimization pass that replicate each small "
-                  "constant to its successors' devices. This can decrease "
-                  "message passing.");
   // LINT.ThenChange(//tensorflow/core/config/flags_api_wrapper.cc)
 };
 
diff --git a/tensorflow/core/config/flags_api_wrapper.cc b/tensorflow/core/config/flags_api_wrapper.cc
index 974581e931f7ec..58074fb06257d1 100644
--- a/tensorflow/core/config/flags_api_wrapper.cc
+++ b/tensorflow/core/config/flags_api_wrapper.cc
@@ -51,6 +51,5 @@ PYBIND11_MODULE(flags_pybind, m) {
   TF_PY_DECLARE_FLAG(saved_model_fingerprinting);
   TF_PY_DECLARE_FLAG(tf_shape_default_int64);
   TF_PY_DECLARE_FLAG(more_stack_traces);
-  TF_PY_DECLARE_FLAG(replicate_small_constants);
   // LINT.ThenChange(//tensorflow/core/config/flag_defs.h)
 };
diff --git a/tensorflow/python/flags_pybind.pyi b/tensorflow/python/flags_pybind.pyi
index 90aa0a7d76114b..34b0a0c5666eb8 100644
--- a/tensorflow/python/flags_pybind.pyi
+++ b/tensorflow/python/flags_pybind.pyi
@@ -24,7 +24,6 @@ class Flags:
     graph_building_optimization: Flag
     more_stack_traces: Flag
     op_building_optimization: Flag
-    replicate_small_constants: Flag
     saved_model_fingerprinting: Flag
     test_only_experiment_1: Flag
     test_only_experiment_2: Flag

From da49f51718a58759cd6d54d03bd7572aeef8c81c Mon Sep 17 00:00:00 2001
From: Chao <cchen104@amd.com>
Date: Fri, 22 Sep 2023 11:09:49 -0700
Subject: [PATCH 155/567] PR #5822: [ROCm] fixes kernel name in rocm gpu
 executor

Imported from GitHub PR https://github.com/openxla/xla/pull/5822

ROCm: fix the kernel name handling after the change in https://github.com/openxla/xla/commit/d04387fd4075af9d70858b2910cfa19a28a28fee

@akuegel @ezhulenev @ddunl Thanks in advance!
Copybara import of the project:

--
4cb2176989b2d89669caae1a26ea7f2506161f4a by Chao Chen <cchen104@amd.com>:

rocm fixes kernel name

Merging this change closes #5822

PiperOrigin-RevId: 567668989
---
 .../xla/xla/stream_executor/rocm/rocm_gpu_executor.cc  | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/third_party/xla/xla/stream_executor/rocm/rocm_gpu_executor.cc b/third_party/xla/xla/stream_executor/rocm/rocm_gpu_executor.cc
index 9c0dd98791a6a5..618514de604493 100644
--- a/third_party/xla/xla/stream_executor/rocm/rocm_gpu_executor.cc
+++ b/third_party/xla/xla/stream_executor/rocm/rocm_gpu_executor.cc
@@ -200,7 +200,7 @@ tsl::Status GpuExecutor::GetKernel(const MultiKernelLoaderSpec& spec,
                                    KernelBase* kernel) {
   GpuKernel* rocm_kernel = AsGpuKernel(kernel);
   hipModule_t module = nullptr;
-  const string* kernelname;
+  const string* kernel_name;
 
   const OnDiskKernelLoaderSpec* on_disk_spec = nullptr;
   bool has_cubin = spec.has_cuda_cubin_on_disk();
@@ -212,7 +212,7 @@ tsl::Status GpuExecutor::GetKernel(const MultiKernelLoaderSpec& spec,
     return tsl::errors::Internal(
         "Loading ROCM kernel from disk is not supported");
   } else if (spec.has_cuda_cubin_in_memory()) {
-    kernelname = &spec.cuda_cubin_in_memory().kernelname();
+    kernel_name = &spec.cuda_cubin_in_memory().kernel_name();
 
     const char* hsaco = spec.cuda_cubin_in_memory().bytes();
     absl::MutexLock lock{&in_memory_modules_mu_};
@@ -226,9 +226,9 @@ tsl::Status GpuExecutor::GetKernel(const MultiKernelLoaderSpec& spec,
     return tsl::errors::Internal("No method of loading ROCM kernel provided");
   }
 
-  VLOG(2) << "getting function " << *kernelname << " from module " << module;
+  VLOG(2) << "getting function " << *kernel_name << " from module " << module;
   TF_RETURN_IF_ERROR(GpuDriver::GetModuleFunction(
-      context_, module, kernelname->c_str(), rocm_kernel->gpu_function_ptr()));
+      context_, module, kernel_name->c_str(), rocm_kernel->gpu_function_ptr()));
 
   // We have to trust the kernel loader spec arity because there doesn't appear
   // to be a way to reflect on the number of expected arguments w/the ROCM API.
@@ -237,7 +237,7 @@ tsl::Status GpuExecutor::GetKernel(const MultiKernelLoaderSpec& spec,
   KernelMetadata kernel_metadata;
   TF_RETURN_IF_ERROR(GetKernelMetadata(rocm_kernel, &kernel_metadata));
   kernel->set_metadata(kernel_metadata);
-  kernel->set_name(*kernelname);
+  kernel->set_name(*kernel_name);
   return tsl::OkStatus();
 }
 

From 34a5499d01293ee05c3c36d67aa2cc2891820489 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 22 Sep 2023 11:11:28 -0700
Subject: [PATCH 156/567] Add type hints for ops.Operation.

PiperOrigin-RevId: 567669383
---
 tensorflow/python/framework/function.py              |  6 +++---
 .../kernel_tests/random/stateless_random_ops_test.py | 11 ++++++-----
 tensorflow/python/ops/tensor_array_grad.py           | 12 ++++++------
 3 files changed, 15 insertions(+), 14 deletions(-)

diff --git a/tensorflow/python/framework/function.py b/tensorflow/python/framework/function.py
index 502d0805012afa..068a685184245a 100644
--- a/tensorflow/python/framework/function.py
+++ b/tensorflow/python/framework/function.py
@@ -914,7 +914,7 @@ def _add_tensor_and_parents(self, tensor):
     op = self._add_op_and_parents(tensor.op)
     return op.outputs[tensor.value_index]
 
-  def _add_op_and_parents(self, op):
+  def _add_op_and_parents(self, op: ops.Operation):
     # pylint: disable=protected-access
     op_def = graph_to_function_def._get_op_def(op)
     if op._is_stateful and op not in self._allowlisted_stateful_ops:
@@ -1049,13 +1049,13 @@ def _is_guaranteed_const(tensor):
 
   class Work(object):
 
-    def __init__(self, op, leaving):
+    def __init__(self, op: ops.Operation, leaving):
       self.op = op
       self.leaving = leaving
 
   is_guaranteed_const = lambda op: op.node_def.op == "GuaranteeConst"
   constants = set([])
-  def all_inputs_const(op):
+  def all_inputs_const(op: ops.Operation):
     # If all inputs of an op are guaranteed constants, then we can infer that
     # the op produces a constant as well.
     return op.inputs and all(inp.op in constants for inp in op.inputs)
diff --git a/tensorflow/python/kernel_tests/random/stateless_random_ops_test.py b/tensorflow/python/kernel_tests/random/stateless_random_ops_test.py
index 08c8b53cbf7303..c5c3aeb50a05e4 100644
--- a/tensorflow/python/kernel_tests/random/stateless_random_ops_test.py
+++ b/tensorflow/python/kernel_tests/random/stateless_random_ops_test.py
@@ -102,7 +102,7 @@ def float_cases(shape_dtypes=(None,)):
   )
   # Explicitly passing in params because capturing cell variable from loop is
   # problematic in Python
-  def wrap(op, dtype, shape, shape_dtype, seed, **kwargs):
+  def wrap(op: ops.Operation, dtype, shape, shape_dtype, seed, **kwargs):
     device_type = get_device().device_type
     # Some dtypes are not supported on some devices
     if (dtype == dtypes.bfloat16 and device_type == 'GPU' and
@@ -134,7 +134,8 @@ def _name(a):
 
 def int_cases(shape_dtypes=(None,), minval_maxval=None):
 
-  def wrap(op, minval, maxval, shape, shape_dtype, dtype, seed, **kwargs):
+  def wrap(op: ops.Operation, minval, maxval, shape, shape_dtype, dtype,
+           seed, **kwargs):
     shape_ = (constant_op.constant(shape, dtype=shape_dtype)
               if shape_dtype is not None else shape)
     return op(
@@ -156,7 +157,7 @@ def wrap(op, minval, maxval, shape, shape_dtype, dtype, seed, **kwargs):
 
 def multinomial_cases():
   num_samples = 10
-  def wrap(op, logits, logits_dtype, output_dtype, seed):
+  def wrap(op: ops.Operation, logits, logits_dtype, output_dtype, seed):
     device_type = get_device().device_type
     # Some dtypes are not supported on some devices
     if (logits_dtype == dtypes.bfloat16 and device_type == 'GPU' and
@@ -183,7 +184,7 @@ def wrap(op, logits, logits_dtype, output_dtype, seed):
 
 
 def gamma_cases():
-  def wrap(op, alpha, dtype, shape, seed):
+  def wrap(op: ops.Operation, alpha, dtype, shape, seed):
     return op(seed=seed, shape=shape,
               alpha=constant_op.constant(alpha, dtype=dtype), dtype=dtype)
   for dtype in np.float16, np.float32, np.float64:
@@ -196,7 +197,7 @@ def wrap(op, alpha, dtype, shape, seed):
 
 
 def poisson_cases():
-  def wrap(op, lam, lam_dtype, out_dtype, shape, seed):
+  def wrap(op: ops.Operation, lam, lam_dtype, out_dtype, shape, seed):
     return op(seed=seed, shape=shape,
               lam=constant_op.constant(lam_dtype(lam), dtype=lam_dtype),
               dtype=out_dtype)
diff --git a/tensorflow/python/ops/tensor_array_grad.py b/tensorflow/python/ops/tensor_array_grad.py
index 994ec609bcf5f1..25ca3495416b7b 100644
--- a/tensorflow/python/ops/tensor_array_grad.py
+++ b/tensorflow/python/ops/tensor_array_grad.py
@@ -80,7 +80,7 @@ def _GetGradSource(op_or_tensor):
 @ops.RegisterGradient("TensorArrayRead")
 @ops.RegisterGradient("TensorArrayReadV2")
 @ops.RegisterGradient("TensorArrayReadV3")
-def _TensorArrayReadGrad(op, grad):
+def _TensorArrayReadGrad(op: ops.Operation, grad):
   """Gradient for TensorArrayRead.
 
   Args:
@@ -111,7 +111,7 @@ def _TensorArrayReadGrad(op, grad):
 @ops.RegisterGradient("TensorArrayWrite")
 @ops.RegisterGradient("TensorArrayWriteV2")
 @ops.RegisterGradient("TensorArrayWriteV3")
-def _TensorArrayWriteGrad(op, flow):
+def _TensorArrayWriteGrad(op: ops.Operation, flow):
   """Gradient for TensorArrayWrite.
 
   Args:
@@ -143,7 +143,7 @@ def _TensorArrayWriteGrad(op, flow):
 @ops.RegisterGradient("TensorArrayGather")
 @ops.RegisterGradient("TensorArrayGatherV2")
 @ops.RegisterGradient("TensorArrayGatherV3")
-def _TensorArrayGatherGrad(op, grad):
+def _TensorArrayGatherGrad(op: ops.Operation, grad):
   """Gradient for TensorArrayGather.
 
   Args:
@@ -174,7 +174,7 @@ def _TensorArrayGatherGrad(op, grad):
 @ops.RegisterGradient("TensorArrayScatter")
 @ops.RegisterGradient("TensorArrayScatterV2")
 @ops.RegisterGradient("TensorArrayScatterV3")
-def _TensorArrayScatterGrad(op, flow):
+def _TensorArrayScatterGrad(op: ops.Operation, flow):
   """Gradient for TensorArrayScatter.
 
   Args:
@@ -204,7 +204,7 @@ def _TensorArrayScatterGrad(op, flow):
 @ops.RegisterGradient("TensorArrayConcat")
 @ops.RegisterGradient("TensorArrayConcatV2")
 @ops.RegisterGradient("TensorArrayConcatV3")
-def _TensorArrayConcatGrad(op, grad, unused_lengths_grad):
+def _TensorArrayConcatGrad(op: ops.Operation, grad, unused_lengths_grad):
   """Gradient for TensorArrayConcat.
 
   Args:
@@ -236,7 +236,7 @@ def _TensorArrayConcatGrad(op, grad, unused_lengths_grad):
 @ops.RegisterGradient("TensorArraySplit")
 @ops.RegisterGradient("TensorArraySplitV2")
 @ops.RegisterGradient("TensorArraySplitV3")
-def _TensorArraySplitGrad(op, flow):
+def _TensorArraySplitGrad(op: ops.Operation, flow):
   """Gradient for TensorArraySplit.
 
   Args:

From efbc945baa1744389fba62202bf6dc6782914453 Mon Sep 17 00:00:00 2001
From: Jieying Luo <jieying@google.com>
Date: Fri, 22 Sep 2023 11:14:10 -0700
Subject: [PATCH 157/567] [PJRT C API] Register "InspectSharding" custom
 partitioning handler in GPU plugin.

PiperOrigin-RevId: 567670200
---
 third_party/xla/xla/pjrt/c/BUILD | 1 +
 1 file changed, 1 insertion(+)

diff --git a/third_party/xla/xla/pjrt/c/BUILD b/third_party/xla/xla/pjrt/c/BUILD
index d0a7a55ea82099..ba1adb05c17794 100644
--- a/third_party/xla/xla/pjrt/c/BUILD
+++ b/third_party/xla/xla/pjrt/c/BUILD
@@ -141,6 +141,7 @@ cc_library(
         "//xla/pjrt:pjrt_common",
         "//xla/pjrt/gpu:gpu_helpers",
         "//xla/pjrt/gpu:se_gpu_pjrt_client",
+        "//xla/python:inspect_sharding",  # To register "InspectSharding" custom partitioning handler.
         "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/status",
         "@com_google_absl//absl/strings:str_format",

From 3de450f2ebb7eca8b92ae24548ed11b53821c129 Mon Sep 17 00:00:00 2001
From: Ilia Sergachev <sergachev@google.com>
Date: Fri, 22 Sep 2023 11:25:03 -0700
Subject: [PATCH 158/567] [XLA:GPU] Fix accidentally broken Triton emitter
 test.

Addressing the test case just added in cl/567619889.

PiperOrigin-RevId: 567673155
---
 .../xla/xla/service/gpu/ir_emitter_triton_parametrized_test.cc   | 1 +
 1 file changed, 1 insertion(+)

diff --git a/third_party/xla/xla/service/gpu/ir_emitter_triton_parametrized_test.cc b/third_party/xla/xla/service/gpu/ir_emitter_triton_parametrized_test.cc
index 832c7a295b981a..09af76884ead91 100644
--- a/third_party/xla/xla/service/gpu/ir_emitter_triton_parametrized_test.cc
+++ b/third_party/xla/xla/service/gpu/ir_emitter_triton_parametrized_test.cc
@@ -719,6 +719,7 @@ ENTRY e {
   float tolerance;
   switch (data_type) {
     case F32:
+    case BF16:
       tolerance = 1e-6;
       break;
     case F16:

From f60a692345e570d51b4dcb3578b475d229586933 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 22 Sep 2023 11:34:19 -0700
Subject: [PATCH 159/567] Add type hints under third_party/tensorflow.

PiperOrigin-RevId: 567675462
---
 tensorflow/python/ops/random_grad.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tensorflow/python/ops/random_grad.py b/tensorflow/python/ops/random_grad.py
index 32d2b35abe9aa8..a7984ff7cd5fca 100644
--- a/tensorflow/python/ops/random_grad.py
+++ b/tensorflow/python/ops/random_grad.py
@@ -34,7 +34,7 @@ def add_leading_unit_dimensions(x, num_dimensions):  # pylint: disable=invalid-n
 
 
 @ops.RegisterGradient("RandomGamma")
-def _RandomGammaGrad(op, grad):  # pylint: disable=invalid-name
+def _RandomGammaGrad(op: ops.Operation, grad):  # pylint: disable=invalid-name
   """Returns the gradient of a Gamma sample w.r.t. alpha.
 
   The gradient is computed using implicit differentiation
@@ -74,7 +74,7 @@ def _RandomGammaGrad(op, grad):  # pylint: disable=invalid-name
 
 
 @ops.RegisterGradient("StatelessRandomGammaV2")
-def _StatelessRandomGammaV2Grad(op, grad):  # pylint: disable=invalid-name
+def _StatelessRandomGammaV2Grad(op: ops.Operation, grad):  # pylint: disable=invalid-name
   """Returns the gradient of a Gamma sample w.r.t. alpha.
 
   The gradient is computed using implicit differentiation
@@ -106,7 +106,7 @@ def _StatelessRandomGammaV2Grad(op, grad):  # pylint: disable=invalid-name
 
 
 @ops.RegisterGradient("StatelessRandomGammaV3")
-def _StatelessRandomGammaV3Grad(op, grad):  # pylint: disable=invalid-name
+def _StatelessRandomGammaV3Grad(op: ops.Operation, grad):  # pylint: disable=invalid-name
   """Returns the gradient of a Gamma sample w.r.t. alpha.
 
   The gradient is computed using implicit differentiation
@@ -169,7 +169,7 @@ def _Ndtr(x):
 
 
 @ops.RegisterGradient("StatelessParameterizedTruncatedNormal")
-def _StatelessParameterizedTruncatedNormalGrad(op, grad):  # pylint: disable=invalid-name
+def _StatelessParameterizedTruncatedNormalGrad(op: ops.Operation, grad):  # pylint: disable=invalid-name
   """Returns the gradient of a TruncatedNormal sample w.r.t. parameters.
 
   The gradient is computed using implicit differentiation

From f2d3345f00add8504ef3297495c288e896b0212c Mon Sep 17 00:00:00 2001
From: Derek Murray <mrry@google.com>
Date: Fri, 22 Sep 2023 11:38:04 -0700
Subject: [PATCH 160/567] Replicate small constants so they don't need to be
 sent to their successors. A small constant is replicated to each of its
 successors' devices. The maximum size of a constant to be replicated is 16
 elements.

This pass is disabled by default and can be enabled with the flag replicate_small_constants.

The previous version broke the CI for macOS. This version rolls forward with a split-out C++ test target.

PiperOrigin-RevId: 567676442
---
 tensorflow/core/common_runtime/BUILD          |  91 ++++-
 .../replicate_constants_pass.cc               | 189 ++++++++++
 .../common_runtime/replicate_constants_pass.h |  50 +++
 .../replicate_constants_pass_test.cc          | 334 ++++++++++++++++++
 tensorflow/core/config/flag_defs.h            |   4 +
 tensorflow/core/config/flags_api_wrapper.cc   |   1 +
 tensorflow/python/flags_pybind.pyi            |   1 +
 7 files changed, 661 insertions(+), 9 deletions(-)
 create mode 100644 tensorflow/core/common_runtime/replicate_constants_pass.cc
 create mode 100644 tensorflow/core/common_runtime/replicate_constants_pass.h
 create mode 100644 tensorflow/core/common_runtime/replicate_constants_pass_test.cc

diff --git a/tensorflow/core/common_runtime/BUILD b/tensorflow/core/common_runtime/BUILD
index 4b7c5eb8b7c29d..21537de1b5765f 100644
--- a/tensorflow/core/common_runtime/BUILD
+++ b/tensorflow/core/common_runtime/BUILD
@@ -1,7 +1,9 @@
+load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda")
 load(
     "//tensorflow:tensorflow.bzl",
     "if_google",
     "if_libtpu",
+    "if_macos",
     "if_oss",
     "if_zendnn",
     "tf_cc_test",
@@ -21,25 +23,24 @@ load(
     "tf_protos_all",
     "tf_protos_grappler",
 )
-load(
-    "//tensorflow/core/platform:rules_cc.bzl",
-    "cc_library",
-)
 load(
     "//tensorflow/core/platform:build_config_root.bzl",
     "if_static",
     "tf_cuda_tests_tags",
 )
-load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda")
 load(
-    "//third_party/mkl:build_defs.bzl",
-    "if_mkl",
-    "if_mkl_ml",
+    "//tensorflow/core/platform:rules_cc.bzl",
+    "cc_library",
 )
 load(
     "//tensorflow/security/fuzzing:tf_fuzzing.bzl",
     "tf_cc_fuzz_test",
 )
+load(
+    "//third_party/mkl:build_defs.bzl",
+    "if_mkl",
+    "if_mkl_ml",
+)
 
 default_package_visibility = [
     "//tensorflow:internal",
@@ -299,6 +300,7 @@ filegroup(
         "renamed_device.h",
         "rendezvous_mgr.h",
         "rendezvous_util.h",
+        "replicate_constants_pass.h",
         "replicate_per_replica_nodes.h",
         "ring_alg.h",
         "ring_gatherer.h",
@@ -1132,6 +1134,31 @@ cc_library(
     alwayslink = 1,
 )
 
+cc_library(
+    name = "replicate_constants_pass",
+    srcs = ["replicate_constants_pass.cc"],
+    hdrs = ["replicate_constants_pass.h"],
+    copts = tf_copts(),
+    deps = [
+        ":optimization_registry",
+        "//tensorflow/core:core_cpu_base",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:portable_gif_internal",
+        "//tensorflow/core/config:flag_defs",
+        "//tensorflow/core/config:flags",
+        "//tensorflow/core/framework:node_def_util",
+        "//tensorflow/core/framework:tensor_proto_cc",
+        "//tensorflow/core/framework:tensor_shape_proto_cc",
+        "@com_google_absl//absl/container:btree",
+        "@com_google_absl//absl/status",
+        "@com_google_absl//absl/strings",
+        "@local_tsl//tsl/platform:errors",
+        "@local_tsl//tsl/platform:status",
+        "@local_tsl//tsl/platform:statusor",
+    ],
+    alwayslink = 1,
+)
+
 cc_library(
     name = "local_device",
     srcs = ["local_device.cc"],
@@ -1948,7 +1975,10 @@ tf_cuda_library(
         ":step_stats_collector",
         ":threadpool_device",
         ":threadpool_device_factory",
-    ] + if_zendnn([":zen_layout_pass"]),
+    ] + if_zendnn([":zen_layout_pass"]) + if_macos(
+        [],
+        [":replicate_constants_pass"],  # TODO(b/301469885): Remove.
+    ),
 )
 
 tf_cuda_library(
@@ -2342,6 +2372,7 @@ tf_cc_tests(
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
         "//tensorflow/core:testlib",
+        "//tensorflow/core/config:flag_defs",
         "//tensorflow/core/kernels:ops_util",
         "//tensorflow/core/nccl:collective_communicator",
         "//tensorflow/core/platform:regexp",
@@ -2353,6 +2384,48 @@ tf_cc_tests(
     ],
 )
 
+tf_cc_tests(
+    name = "replicate_constants_pass_test",
+    size = "small",
+    srcs = [
+        "replicate_constants_pass_test.cc",
+    ],
+    deps = [
+        ":core",
+        ":core_cpu",
+        ":core_cpu_internal",
+        ":direct_session_internal",
+        ":pending_counts",
+        "//tensorflow/cc:cc_ops",
+        "//tensorflow/cc:cc_ops_internal",
+        "//tensorflow/cc:function_ops",
+        "//tensorflow/cc:ops",
+        "//tensorflow/cc:scope",
+        "//tensorflow/cc:sendrecv_ops",
+        "//tensorflow/cc:while_loop",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:framework_internal",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core:ops",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core:testlib",
+        "//tensorflow/core/config:flag_defs",
+        "//tensorflow/core/kernels:ops_util",
+        "//tensorflow/core/nccl:collective_communicator",
+        "//tensorflow/core/platform:regexp",
+        "//tensorflow/core/util:protos_test_cc",
+        "@com_google_absl//absl/base",
+        "@com_google_absl//absl/memory",
+        "@com_google_absl//absl/strings",
+        "@eigen_archive//:eigen3",
+    ] + if_macos([  # macOS only; presumably other platforms get it via the if_macos([], [...]) library dep - confirm.
+        ":replicate_constants_pass",  # TODO(b/301469885): Remove.
+    ]),
+)
+
 tf_cc_tests(
     name = "higher_level_tests_needing_kernels",
     size = "small",
diff --git a/tensorflow/core/common_runtime/replicate_constants_pass.cc b/tensorflow/core/common_runtime/replicate_constants_pass.cc
new file mode 100644
index 00000000000000..73f96d66f940bb
--- /dev/null
+++ b/tensorflow/core/common_runtime/replicate_constants_pass.cc
@@ -0,0 +1,189 @@
+/* Copyright 2023 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/common_runtime/replicate_constants_pass.h"
+
+#include <algorithm>
+#include <cstdint>
+#include <limits>
+#include <string>
+#include <vector>
+
+#include "absl/container/btree_map.h"
+#include "absl/status/status.h"
+#include "absl/strings/str_cat.h"
+#include "tensorflow/core/common_runtime/optimization_registry.h"
+#include "tensorflow/core/config/flag_defs.h"
+#include "tensorflow/core/config/flags.h"
+#include "tensorflow/core/framework/node_def_util.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor.pb.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/framework/tensor_shape.pb.h"
+#include "tensorflow/core/graph/graph.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/util/device_name_utils.h"
+#include "tensorflow/core/util/dump_graph.h"
+#include "tsl/platform/errors.h"
+#include "tsl/platform/status.h"
+#include "tsl/platform/statusor.h"
+
+namespace tensorflow {
+namespace {
+
+// Maximum size constant to replicate.
+constexpr int64_t kMaxSize = 16;
+
+// Renames `node` to <original-name>/replicate/_<unique-index>.
+void SetUniqueName(Graph* graph, Node* node) {
+  node->set_name(graph->NewName(absl::StrCat(node->name(), "/replicate")));
+}
+
+// Returns true if at least one of `node`'s out-edges is a control edge.
+bool HasControlOut(Node* node) {
+  const auto& out_edges = node->out_edges();
+  return std::any_of(
+      out_edges.begin(), out_edges.end(),
+      [](const Edge* edge) { return edge->IsControlEdge(); });
+}
+
+// Returns true if `node`'s assigned device name parses and has type "CPU".
+bool HasCpuDevice(const Node* node) {
+  DeviceNameUtils::ParsedName parsed;
+  const bool parse_ok =
+      DeviceNameUtils::ParseFullName(node->assigned_device_name(), &parsed);
+  return parse_ok && parsed.type == "CPU";
+}
+
+// Sets `*device` to the name of the CPU device on the same host as `dst`.
+Status GetDestinationCpuDevice(const Node* dst, std::string* device) {
+  if (dst->has_assigned_device_name())
+    return DeviceNameUtils::DeviceNameToCpuDeviceName(
+        dst->assigned_device_name(), device);
+  return absl::AbortedError(
+      absl::StrCat("Node name: ", dst->name(), " has no assigned device."));
+}
+
+// Groups `node`'s out-edges in `device_to_edges`, keyed by the CPU device on
+// the host of each edge's destination. Returns the first lookup error, if any.
+Status GetSuccessorEdges(
+    Node* node,
+    absl::btree_map<std::string, std::vector<const Edge*>>& device_to_edges) {
+  for (const Edge* edge : node->out_edges()) {
+    const Node* dst = edge->dst();
+    std::string device;
+    TF_RETURN_IF_ERROR(GetDestinationCpuDevice(dst, &device));
+    // operator[] creates the empty vector the first time `device` is seen.
+    device_to_edges[device].push_back(edge);
+  }
+  return OkStatus();
+}
+
+// Replaces `node` with one copy per device key in `device_to_edges`.
+void ReplicateToEachDevice(
+    Graph* graph, Node* node,
+    absl::btree_map<std::string, std::vector<const Edge*>>& device_to_edges) {
+  for (const auto& pair : device_to_edges) {
+    Node* copy = graph->CopyNode(node);
+    SetUniqueName(graph, copy);
+    const std::string& device = pair.first;  // Bind; no need to copy the key.
+    copy->set_assigned_device_name(device);
+    // Feed this device's successors from the copy, keeping output/input slots.
+    for (const Edge* edge : pair.second) {
+      graph->AddEdge(copy, edge->src_output(), edge->dst(), edge->dst_input());
+    }
+    // Mirror every in-node of `node` onto the copy as a control edge.
+    for (Node* src : node->in_nodes()) {
+      graph->AddControlEdge(src, copy, /*allow_duplicates=*/true);
+    }
+  }
+  graph->RemoveNode(node);
+}
+
+}  // namespace
+
+Status ReplicateConstantsPass::Run(
+    const GraphOptimizationPassOptions& options) {
+  if (!flags::Global().replicate_small_constants.value()) {
+    VLOG(1) << "replicate_constants_pass not enabled";
+    return OkStatus();
+  }
+  VLOG(1) << "replicate_constants_pass will replicate constants with "
+             "number-of-elements <= "
+          << kMaxSize;
+
+  Graph* graph = options.graph->get();
+  if (VLOG_IS_ON(1)) {
+    VLOG(1) << DumpGraphToFile("before_replicate_constants_pass", *graph,
+                               options.flib_def);
+  }
+  int64_t min_skipped = std::numeric_limits<int64_t>::max();
+  int64_t max_skipped = std::numeric_limits<int64_t>::min();
+  for (Node* node : graph->nodes()) {
+    if (!node->IsConstant()) continue;
+
+    // For performance, skip when there is at most one successor.
+    if (node->out_edges().size() <= 1) continue;
+
+    // Skip if the constant has a control successor. Replicating constants with
+    // control successors would require replicating these control edges, which
+    // could result in even more message passing.
+    if (HasControlOut(node)) continue;
+
+    // Skip if the constant has more than kMaxSize elements; track the range.
+    const TensorProto* value = nullptr;
+    TF_RETURN_IF_ERROR(GetNodeAttr(node->attrs(), "value", &value));
+    TF_ASSIGN_OR_RETURN(TensorShape shape,
+                        TensorShape::BuildTensorShape(value->tensor_shape()));
+    if (shape.num_elements() > kMaxSize) {
+      min_skipped = std::min(min_skipped, shape.num_elements());
+      max_skipped = std::max(max_skipped, shape.num_elements());
+      continue;
+    }
+
+    // Skip if there is no assigned device.
+    if (!node->has_assigned_device_name()) continue;
+
+    // Skip when the original constant is not on a CPU, because it is not clear
+    // whether replicating from non-CPU to CPU is valid.
+    if (!HasCpuDevice(node)) continue;
+
+    // Collect successor edges, keyed by the successor's host CPU device.
+    absl::btree_map<std::string, std::vector<const Edge*>> device_to_edges;
+    TF_RETURN_IF_ERROR(GetSuccessorEdges(node, device_to_edges));
+
+    // Skip if all successors map to the same host CPU device.
+    if (device_to_edges.size() <= 1) continue;
+
+    // Replicate the constant to each successor device; this removes `node`.
+    ReplicateToEachDevice(graph, node, device_to_edges);
+  }
+  if (min_skipped != std::numeric_limits<int64_t>::max()) {
+    VLOG(1) << "replicate_constants_pass skipped replicating constants with "
+               "number of elements in the range "
+            << min_skipped << " to " << max_skipped << ".";
+  }
+
+  if (VLOG_IS_ON(1)) {
+    VLOG(1) << DumpGraphToFile("after_replicate_constants_pass", *graph,
+                               options.flib_def);
+  }
+  return OkStatus();
+}
+
+REGISTER_OPTIMIZATION(OptimizationPassRegistry::POST_PLACEMENT, 3,
+                      ReplicateConstantsPass);
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/replicate_constants_pass.h b/tensorflow/core/common_runtime/replicate_constants_pass.h
new file mode 100644
index 00000000000000..b7b2f0fe98c0d2
--- /dev/null
+++ b/tensorflow/core/common_runtime/replicate_constants_pass.h
@@ -0,0 +1,50 @@
+/* Copyright 2023 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_REPLICATE_CONSTANTS_PASS_H_
+#define TENSORFLOW_CORE_COMMON_RUNTIME_REPLICATE_CONSTANTS_PASS_H_
+
+#include "tensorflow/core/common_runtime/optimization_registry.h"
+
+// Small constants are replicated to the hosts of their successors. This pass
+// only applies when there are multiple successors.
+//
+// For example, the graph:
+//   C -> {Op0, Op1, Op2, Op3}
+//   C's assigned_device is /job:tpu_host_worker/replica:0/task:0/device:CPU:0
+//   Op0's assigned_device is /job:tpu_host_worker/replica:0/task:0/device:TPU:0
+//   Op1's assigned_device is /job:tpu_host_worker/replica:0/task:0/device:TPU:1
+//   Op2's assigned_device is /job:tpu_host_worker/replica:0/task:1/device:TPU:0
+//   Op3's assigned_device is /job:tpu_host_worker/replica:0/task:1/device:TPU:1
+// is rewritten to:
+//   C0 -> {Op0, Op1}
+//   C1 -> {Op2, Op3}
+//   C0's assigned_device is /job:tpu_host_worker/replica:0/task:0/device:CPU:0
+//   C1's assigned_device is /job:tpu_host_worker/replica:0/task:1/device:CPU:0
+//   Op0's assigned_device is /job:tpu_host_worker/replica:0/task:0/device:TPU:0
+//   Op1's assigned_device is /job:tpu_host_worker/replica:0/task:0/device:TPU:1
+//   Op2's assigned_device is /job:tpu_host_worker/replica:0/task:1/device:TPU:0
+//   Op3's assigned_device is /job:tpu_host_worker/replica:0/task:1/device:TPU:1
+
+namespace tensorflow {
+
+class ReplicateConstantsPass : public GraphOptimizationPass {
+ public:
+  Status Run(const GraphOptimizationPassOptions& options) override;
+};
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_COMMON_RUNTIME_REPLICATE_CONSTANTS_PASS_H_
diff --git a/tensorflow/core/common_runtime/replicate_constants_pass_test.cc b/tensorflow/core/common_runtime/replicate_constants_pass_test.cc
new file mode 100644
index 00000000000000..dae22012bfba8b
--- /dev/null
+++ b/tensorflow/core/common_runtime/replicate_constants_pass_test.cc
@@ -0,0 +1,334 @@
+/* Copyright 2023 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/common_runtime/replicate_constants_pass.h"
+
+#include <memory>
+#include <string>
+
+#include "tensorflow/cc/framework/ops.h"
+#include "tensorflow/cc/framework/scope.h"
+#include "tensorflow/cc/ops/const_op.h"
+#include "tensorflow/cc/ops/math_ops.h"
+#include "tensorflow/core/common_runtime/optimization_registry.h"
+#include "tensorflow/core/config/flag_defs.h"
+#include "tensorflow/core/config/flags.h"
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/graph/graph.h"
+#include "tensorflow/core/platform/test.h"
+#include "tsl/lib/core/status_test_util.h"
+#include "tsl/platform/status.h"
+#include "tsl/platform/test.h"
+
+namespace tensorflow {
+
+const char kCpu0[] = "/job:tpu_host_worker/replica:0/task:0/device:CPU:0";
+const char kCpu1[] = "/job:tpu_host_worker/replica:0/task:1/device:CPU:0";
+const char kTpu00[] = "/job:tpu_host_worker/replica:0/task:0/device:TPU:0";
+const char kTpu01[] = "/job:tpu_host_worker/replica:0/task:0/device:TPU:1";
+const char kTpu10[] = "/job:tpu_host_worker/replica:0/task:1/device:TPU:0";
+const char kTpu11[] = "/job:tpu_host_worker/replica:0/task:1/device:TPU:1";
+
+// Returns the node named `name`; CHECK-fails if `graph` has no such node.
+Node* GetNode(const Graph& graph, const std::string& name) {
+  for (Node* candidate : graph.nodes()) {
+    if (candidate->name() == name) return candidate;
+  }
+  CHECK(false) << "Unknown node name: " << name;
+  return nullptr;
+}
+
+// Returns the first node in `node->in_nodes()`; CHECK-fails if it is empty.
+Node* GetPredecessor(Node* node) {
+  auto preds = node->in_nodes();
+  CHECK(preds.begin() != preds.end())
+      << "No predecessor for " << node->name() << "\n";
+  return *preds.begin();
+}
+
+// Returns true if `dst` appears among the out-nodes of `src`.
+bool IsEdge(Node* src, Node* dst) {
+  for (Node* successor : src->out_nodes()) {
+    if (successor == dst) return true;
+  }
+  return false;
+}
+
+// A scalar constant feeding ops on two hosts gets a replica on each host.
+TEST(ReplicateConstantsPassTest, TestSmallConstant) {
+  std::unique_ptr<Graph> graph(new Graph(OpRegistry::Global()));
+  {
+    Scope scope = Scope::NewRootScope().ExitOnError();
+    Output const0 =
+        ops::Const(scope.WithOpName("const"), 1.0f, TensorShape({}));
+    ops::Negate dst0(scope.WithOpName("dst0"), const0);
+    ops::Negate dst1(scope.WithOpName("dst1"), const0);
+    ops::Negate dst2(scope.WithOpName("dst2"), const0);
+    TF_CHECK_OK(scope.ToGraph(graph.get()));
+  }
+  GetNode(*graph, "const")->set_assigned_device_name(kCpu0);
+  GetNode(*graph, "dst0")->set_assigned_device_name(kCpu0);
+  GetNode(*graph, "dst1")->set_assigned_device_name(kCpu1);
+  GetNode(*graph, "dst2")->set_assigned_device_name(kCpu1);
+
+  // Enable the pass; this flags::Global() setting is not restored afterwards.
+  flags::Global().replicate_small_constants.reset(true);
+
+  GraphDef before;
+  graph->ToGraphDef(&before);
+  GraphOptimizationPassOptions options;
+  options.graph = &graph;
+  ReplicateConstantsPass pass;
+  TF_ASSERT_OK(pass.Run(options));
+  GraphDef actual;
+  graph->ToGraphDef(&actual);
+
+  Node* dst0 = GetNode(*graph, "dst0");
+  Node* dst1 = GetNode(*graph, "dst1");
+  Node* dst2 = GetNode(*graph, "dst2");
+  EXPECT_EQ(dst0->assigned_device_name(),
+            GetPredecessor(dst0)->assigned_device_name());
+  EXPECT_EQ(dst1->assigned_device_name(),
+            GetPredecessor(dst1)->assigned_device_name());
+  EXPECT_EQ(dst2->assigned_device_name(),
+            GetPredecessor(dst2)->assigned_device_name());
+}
+
+// A 17-element constant exceeds the size limit and must be left in place.
+TEST(ReplicateConstantsPassTest, TestLargeConstant) {
+  std::unique_ptr<Graph> graph(new Graph(OpRegistry::Global()));
+  {
+    Scope scope = Scope::NewRootScope().ExitOnError();
+    Output const0 =
+        ops::Const(scope.WithOpName("const"),
+                   {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16});
+    ops::Negate dst0(scope.WithOpName("dst0"), const0);
+    ops::Negate dst1(scope.WithOpName("dst1"), const0);
+    ops::Negate dst2(scope.WithOpName("dst2"), const0);
+    TF_CHECK_OK(scope.ToGraph(graph.get()));
+  }
+  GetNode(*graph, "const")->set_assigned_device_name(kCpu0);
+  GetNode(*graph, "dst0")->set_assigned_device_name(kCpu0);
+  GetNode(*graph, "dst1")->set_assigned_device_name(kCpu1);
+  GetNode(*graph, "dst2")->set_assigned_device_name(kCpu1);
+
+  // Enable the pass; this flags::Global() setting is not restored afterwards.
+  flags::Global().replicate_small_constants.reset(true);
+
+  GraphDef before;
+  graph->ToGraphDef(&before);
+  GraphOptimizationPassOptions options;
+  options.graph = &graph;
+  ReplicateConstantsPass pass;
+  TF_ASSERT_OK(pass.Run(options));
+  GraphDef actual;
+  graph->ToGraphDef(&actual);
+
+  Node* dst0 = GetNode(*graph, "dst0");
+  Node* dst1 = GetNode(*graph, "dst1");
+  Node* dst2 = GetNode(*graph, "dst2");
+  EXPECT_EQ(dst0->assigned_device_name(),
+            GetPredecessor(dst0)->assigned_device_name());
+  EXPECT_NE(dst1->assigned_device_name(),
+            GetPredecessor(dst1)->assigned_device_name());
+  EXPECT_NE(dst2->assigned_device_name(),
+            GetPredecessor(dst2)->assigned_device_name());
+}
+
+// A constant with an outgoing control edge must be left in place.
+TEST(ReplicateConstantsPassTest, TestControlOut) {
+  std::unique_ptr<Graph> graph(new Graph(OpRegistry::Global()));
+  {
+    Scope scope = Scope::NewRootScope().ExitOnError();
+    Output const0 =
+        ops::Const(scope.WithOpName("const0"), 1.0f, TensorShape({}));
+    Output ctrl_succ =
+        ops::Const(scope.WithOpName("ctrl_succ"), 1.0f, TensorShape({}));
+    ops::Negate dst0(scope.WithOpName("dst0"), const0);
+    ops::Negate dst1(scope.WithOpName("dst1"), const0);
+    ops::Negate dst2(scope.WithOpName("dst2"), const0);
+    TF_CHECK_OK(scope.ToGraph(graph.get()));
+  }
+  GetNode(*graph, "const0")->set_assigned_device_name(kCpu0);
+  GetNode(*graph, "ctrl_succ")->set_assigned_device_name(kCpu0);
+  GetNode(*graph, "dst0")->set_assigned_device_name(kCpu0);
+  GetNode(*graph, "dst1")->set_assigned_device_name(kCpu1);
+  GetNode(*graph, "dst2")->set_assigned_device_name(kCpu1);
+  graph->AddControlEdge(GetNode(*graph, "const0"),
+                        GetNode(*graph, "ctrl_succ"));
+
+  // Enable the pass; this flags::Global() setting is not restored afterwards.
+  flags::Global().replicate_small_constants.reset(true);
+
+  GraphDef before;
+  graph->ToGraphDef(&before);
+  GraphOptimizationPassOptions options;
+  options.graph = &graph;
+  ReplicateConstantsPass pass;
+  TF_ASSERT_OK(pass.Run(options));
+  GraphDef actual;
+  graph->ToGraphDef(&actual);
+
+  Node* dst0 = GetNode(*graph, "dst0");
+  Node* dst1 = GetNode(*graph, "dst1");
+  Node* dst2 = GetNode(*graph, "dst2");
+  EXPECT_EQ(dst0->assigned_device_name(),
+            GetPredecessor(dst0)->assigned_device_name());
+  EXPECT_NE(dst1->assigned_device_name(),
+            GetPredecessor(dst1)->assigned_device_name());
+  EXPECT_NE(dst2->assigned_device_name(),
+            GetPredecessor(dst2)->assigned_device_name());
+}
+
+// A constant assigned to a TPU device must be left in place.
+TEST(ReplicateConstantsPassTest, TestTpuConst) {
+  std::unique_ptr<Graph> graph(new Graph(OpRegistry::Global()));
+  {
+    Scope scope = Scope::NewRootScope().ExitOnError();
+    Output const0 =
+        ops::Const(scope.WithOpName("const0"), 1.0f, TensorShape({}));
+    ops::Negate dst0(scope.WithOpName("dst0"), const0);
+    ops::Negate dst1(scope.WithOpName("dst1"), const0);
+    ops::Negate dst2(scope.WithOpName("dst2"), const0);
+    TF_CHECK_OK(scope.ToGraph(graph.get()));
+  }
+  GetNode(*graph, "const0")->set_assigned_device_name(kTpu00);
+  GetNode(*graph, "dst0")->set_assigned_device_name(kTpu00);
+  GetNode(*graph, "dst1")->set_assigned_device_name(kTpu10);
+  GetNode(*graph, "dst2")->set_assigned_device_name(kTpu10);
+
+  // Enable the pass; this flags::Global() setting is not restored afterwards.
+  flags::Global().replicate_small_constants.reset(true);
+
+  GraphDef before;
+  graph->ToGraphDef(&before);
+  GraphOptimizationPassOptions options;
+  options.graph = &graph;
+  ReplicateConstantsPass pass;
+  TF_ASSERT_OK(pass.Run(options));
+  GraphDef actual;
+  graph->ToGraphDef(&actual);
+
+  Node* dst0 = GetNode(*graph, "dst0");
+  Node* dst1 = GetNode(*graph, "dst1");
+  Node* dst2 = GetNode(*graph, "dst2");
+  EXPECT_EQ(dst0->assigned_device_name(),
+            GetPredecessor(dst0)->assigned_device_name());
+  EXPECT_NE(dst1->assigned_device_name(),
+            GetPredecessor(dst1)->assigned_device_name());
+  EXPECT_NE(dst2->assigned_device_name(),
+            GetPredecessor(dst2)->assigned_device_name());
+}
+
+TEST(ReplicateConstantsPassTest, TestSmallAndLargeConstants) {
+  std::unique_ptr<Graph> graph(new Graph(OpRegistry::Global()));
+  {
+    Scope scope = Scope::NewRootScope().ExitOnError();
+    Output small = ops::Const(scope.WithOpName("small"), 1.0f, TensorShape({}));
+    Output large =
+        ops::Const(scope.WithOpName("large"),
+                   {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f,
+                    10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f, 16.0f});
+    ops::Add dst0(scope.WithOpName("dst0"), small, large);
+    ops::Add dst1(scope.WithOpName("dst1"), small, large);
+    ops::Add dst2(scope.WithOpName("dst2"), small, large);
+    TF_CHECK_OK(scope.ToGraph(graph.get()));
+  }
+  GetNode(*graph, "small")->set_assigned_device_name(kCpu0);
+  GetNode(*graph, "large")->set_assigned_device_name(kCpu0);
+  GetNode(*graph, "dst0")->set_assigned_device_name(kCpu0);
+  GetNode(*graph, "dst1")->set_assigned_device_name(kCpu1);
+  GetNode(*graph, "dst2")->set_assigned_device_name(kCpu1);
+
+  // Enable the pass; this flags::Global() setting is not restored afterwards.
+  flags::Global().replicate_small_constants.reset(true);
+
+  GraphDef before;
+  graph->ToGraphDef(&before);
+  GraphOptimizationPassOptions options;
+  options.graph = &graph;
+  ReplicateConstantsPass pass;
+  TF_ASSERT_OK(pass.Run(options));
+  GraphDef actual;
+  graph->ToGraphDef(&actual);
+  // Expect `small` split into one replica per host; `large` stays one node.
+  Node* small0 = GetNode(*graph, "small/replicate/_0");
+  Node* small1 = GetNode(*graph, "small/replicate/_1");
+  Node* large = GetNode(*graph, "large");
+  Node* dst0 = GetNode(*graph, "dst0");
+  Node* dst1 = GetNode(*graph, "dst1");
+  Node* dst2 = GetNode(*graph, "dst2");
+  EXPECT_EQ(small0->assigned_device_name(), kCpu0);
+  EXPECT_EQ(small1->assigned_device_name(), kCpu1);
+  EXPECT_EQ(large->assigned_device_name(), kCpu0);
+  EXPECT_EQ(dst0->assigned_device_name(), kCpu0);
+  EXPECT_EQ(dst1->assigned_device_name(), kCpu1);
+  EXPECT_EQ(dst2->assigned_device_name(), kCpu1);
+  EXPECT_TRUE(IsEdge(small0, dst0));
+  EXPECT_TRUE(IsEdge(large, dst0));
+  EXPECT_TRUE(IsEdge(small1, dst1));
+  EXPECT_TRUE(IsEdge(large, dst1));
+  EXPECT_TRUE(IsEdge(small1, dst2));
+  EXPECT_TRUE(IsEdge(large, dst2));
+}
+
+// A CPU constant feeding TPU ops on two tasks gets one replica on each task's
+// host CPU (kCpu0 for task 0, kCpu1 for task 1).
+TEST(ReplicateConstantsPassTest, TestTpuDestinations) {
+  std::unique_ptr<Graph> graph(new Graph(OpRegistry::Global()));
+  {
+    Scope scope = Scope::NewRootScope().ExitOnError();
+    Output const0 =
+        ops::Const(scope.WithOpName("const"), 1.0f, TensorShape({}));
+    ops::Negate dst00(scope.WithOpName("dst00"), const0);
+    ops::Negate dst01(scope.WithOpName("dst01"), const0);
+    ops::Negate dst10(scope.WithOpName("dst10"), const0);
+    ops::Negate dst11(scope.WithOpName("dst11"), const0);
+    TF_CHECK_OK(scope.ToGraph(graph.get()));
+  }
+  GetNode(*graph, "const")->set_assigned_device_name(kCpu0);
+  GetNode(*graph, "dst00")->set_assigned_device_name(kTpu00);
+  GetNode(*graph, "dst01")->set_assigned_device_name(kTpu01);
+  GetNode(*graph, "dst10")->set_assigned_device_name(kTpu10);
+  GetNode(*graph, "dst11")->set_assigned_device_name(kTpu11);
+
+  // Enable the pass; this flags::Global() setting is not restored afterwards.
+  flags::Global().replicate_small_constants.reset(true);
+
+  GraphDef before;
+  graph->ToGraphDef(&before);
+  GraphOptimizationPassOptions options;
+  options.graph = &graph;
+  ReplicateConstantsPass pass;
+  TF_ASSERT_OK(pass.Run(options));
+  GraphDef actual;
+  graph->ToGraphDef(&actual);
+
+  Node* const0 = GetNode(*graph, "const/replicate/_0");
+  Node* const1 = GetNode(*graph, "const/replicate/_1");
+  Node* dst00 = GetNode(*graph, "dst00");
+  Node* dst01 = GetNode(*graph, "dst01");
+  Node* dst10 = GetNode(*graph, "dst10");
+  Node* dst11 = GetNode(*graph, "dst11");
+  EXPECT_EQ(const0->assigned_device_name(), kCpu0);
+  EXPECT_EQ(const1->assigned_device_name(), kCpu1);
+  EXPECT_TRUE(IsEdge(const0, dst00));
+  EXPECT_TRUE(IsEdge(const0, dst01));
+  EXPECT_TRUE(IsEdge(const1, dst10));
+  EXPECT_TRUE(IsEdge(const1, dst11));
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/config/flag_defs.h b/tensorflow/core/config/flag_defs.h
index 4ab5fb4750de46..89ea6a9b73bbf1 100644
--- a/tensorflow/core/config/flag_defs.h
+++ b/tensorflow/core/config/flag_defs.h
@@ -49,6 +49,10 @@ class Flags {
   TF_DECLARE_FLAG(more_stack_traces, false,
                   "Enable experimental code that preserves and propagates "
                   "graph node stack traces in C++.");
+  TF_DECLARE_FLAG(replicate_small_constants, false,
+                  "Enable a graph optimization pass that replicate each small "
+                  "constant to its successors' devices. This can decrease "
+                  "message passing.");
   // LINT.ThenChange(//tensorflow/core/config/flags_api_wrapper.cc)
 };
 
diff --git a/tensorflow/core/config/flags_api_wrapper.cc b/tensorflow/core/config/flags_api_wrapper.cc
index 58074fb06257d1..974581e931f7ec 100644
--- a/tensorflow/core/config/flags_api_wrapper.cc
+++ b/tensorflow/core/config/flags_api_wrapper.cc
@@ -51,5 +51,6 @@ PYBIND11_MODULE(flags_pybind, m) {
   TF_PY_DECLARE_FLAG(saved_model_fingerprinting);
   TF_PY_DECLARE_FLAG(tf_shape_default_int64);
   TF_PY_DECLARE_FLAG(more_stack_traces);
+  TF_PY_DECLARE_FLAG(replicate_small_constants);
   // LINT.ThenChange(//tensorflow/core/config/flag_defs.h)
 };
diff --git a/tensorflow/python/flags_pybind.pyi b/tensorflow/python/flags_pybind.pyi
index 34b0a0c5666eb8..90aa0a7d76114b 100644
--- a/tensorflow/python/flags_pybind.pyi
+++ b/tensorflow/python/flags_pybind.pyi
@@ -24,6 +24,7 @@ class Flags:
     graph_building_optimization: Flag
     more_stack_traces: Flag
     op_building_optimization: Flag
+    replicate_small_constants: Flag
     saved_model_fingerprinting: Flag
     test_only_experiment_1: Flag
     test_only_experiment_2: Flag

From cdfaf71501eda4df974e01f13b8af9dc335f03f6 Mon Sep 17 00:00:00 2001
From: Anlun Xu <anlunx@google.com>
Date: Fri, 22 Sep 2023 11:44:34 -0700
Subject: [PATCH 161/567] [xla:gpu] Set cublas workspace after calling
 cublasSetStream

We need to set the workspace because new memory is allocated every time a cuBLAS call is captured in a CUDA graph: https://docs.nvidia.com/cuda/cublas/index.html#cuda-graphs-support

We need to call it after setting the stream, because cublasSetStream resets the workspace:
https://docs.nvidia.com/cuda/cublas/index.html#cuda-graphs-support

PiperOrigin-RevId: 567678072
---
 .../xla/xla/stream_executor/cuda/cuda_blas.cc | 39 +++++++++++++++++++
 .../xla/xla/stream_executor/cuda/cuda_blas.h  |  5 +++
 .../xla/stream_executor/gpu/gpu_executor.h    |  2 +
 3 files changed, 46 insertions(+)

diff --git a/third_party/xla/xla/stream_executor/cuda/cuda_blas.cc b/third_party/xla/xla/stream_executor/cuda/cuda_blas.cc
index d9d5451eb912a7..b56fb1236b45f5 100644
--- a/third_party/xla/xla/stream_executor/cuda/cuda_blas.cc
+++ b/third_party/xla/xla/stream_executor/cuda/cuda_blas.cc
@@ -20,6 +20,7 @@ limitations under the License.
 
 #include "absl/strings/str_cat.h"
 #include "absl/strings/str_format.h"
+#include "absl/synchronization/mutex.h"
 #include "Eigen/Core"  // from @eigen_archive
 #include "third_party/gpus/cuda/include/cublas_v2.h"
 #include "third_party/gpus/cuda/include/cuda.h"
@@ -198,6 +199,19 @@ bool CUDABlas::Init() {
   }
 #endif  // CUDA_VERSION >= 11000
 
+  // Initialize cuBLAS workspace memory on device. The workspace size is
+  // determined by the GPU architecture:
+  // https://docs.nvidia.com/cuda/cublas/index.html#cublassetworkspace
+  absl::MutexLock lock(&mu_);
+  uint64_t workspace_size =
+      parent_->cc_major() >= 9 ? 1 << 25 /*32 MiB*/ : 1 << 22 /*4 MiB*/;
+  workspace_ = parent_->Allocate(workspace_size, /*memory_space=*/0);
+
+  if (workspace_.is_null()) {
+    LOG(ERROR) << "Failed to allocate workspace memory";
+    return false;
+  }
+
   return true;
 }
 
@@ -214,6 +228,9 @@ CUDABlas::CUDABlas(gpu::GpuExecutor *parent)
 CUDABlas::~CUDABlas() {
   if (blas_ != nullptr) {
     gpu::ScopedActivateExecutorContext sac{parent_};
+    if (!workspace_.is_null()) {
+      parent_->Deallocate(&workspace_);
+    }
     cublasDestroy(blas_);
   }
 }
@@ -232,6 +249,24 @@ bool CUDABlas::SetStream(Stream *stream) {
   return true;
 }
 
+bool CUDABlas::SetWorkspace() {
+  CHECK(blas_ != nullptr);
+  gpu::ScopedActivateExecutorContext sac{parent_};
+  // workspace_ is allocated once in Init(); without it there is nothing to set.
+  if (workspace_.is_null()) {
+    LOG(ERROR) << "cuBLAS workspace is not allocated";
+    return false;
+  }
+  // Point the cuBLAS handle at the preallocated buffer for subsequent calls.
+  cublasStatus_t ret =
+      cublasSetWorkspace(blas_, workspace_.opaque(), workspace_.size());
+  if (ret != CUBLAS_STATUS_SUCCESS) {
+    LOG(ERROR) << "failed to set workspace for cuBLAS calls: " << ToString(ret);
+    return false;
+  }
+  return true;
+}
+
 cudaStream_t CUDABlas::CUDAStream(Stream *stream) {
   CHECK(stream != nullptr);
   CHECK(AsGpuStreamValue(stream) != nullptr);
@@ -359,6 +394,10 @@ tsl::Status CUDABlas::DoBlasInternalImpl(FuncT cublas_func, Stream *stream,
     return tsl::errors::Internal("Failed setting stream");
   }
 
+  if (!SetWorkspace()) {
+    return tsl::errors::Internal("Failed setting workspace");
+  }
+
   ScopedCublasMathMode math_mode{blas_};
 #if CUBLAS_VER_MAJOR >= 11
   if (math_type == CUBLAS_TF32_TENSOR_OP_MATH &&
diff --git a/third_party/xla/xla/stream_executor/cuda/cuda_blas.h b/third_party/xla/xla/stream_executor/cuda/cuda_blas.h
index 8869b284ce487a..1e98ab1787d240 100644
--- a/third_party/xla/xla/stream_executor/cuda/cuda_blas.h
+++ b/third_party/xla/xla/stream_executor/cuda/cuda_blas.h
@@ -25,6 +25,7 @@ limitations under the License.
 #include "third_party/gpus/cuda/include/cublas_v2.h"
 #include "xla/stream_executor/blas.h"
 #include "xla/stream_executor/cuda/cuda_blas_lt.h"
+#include "xla/stream_executor/device_memory.h"
 #include "xla/stream_executor/platform/port.h"
 #include "xla/stream_executor/plugin_registry.h"
 
@@ -70,6 +71,8 @@ class CUDABlas : public blas::BlasSupport {
   // invoked before calling into cuBLAS.
   bool SetStream(Stream *stream) ABSL_EXCLUSIVE_LOCKS_REQUIRED(mu_);
 
+  bool SetWorkspace() ABSL_EXCLUSIVE_LOCKS_REQUIRED(mu_);
+
   // Returns the underlying CUDA stream.
   cudaStream_t CUDAStream(Stream *stream);
 
@@ -121,6 +124,8 @@ class CUDABlas : public blas::BlasSupport {
 
   BlasLt blas_lt_;
 
+  DeviceMemoryBase workspace_ ABSL_GUARDED_BY(mu_);
+
   SE_DISALLOW_COPY_AND_ASSIGN(CUDABlas);
 };
 
diff --git a/third_party/xla/xla/stream_executor/gpu/gpu_executor.h b/third_party/xla/xla/stream_executor/gpu/gpu_executor.h
index f5ed9093451926..06b2c014e88629 100644
--- a/third_party/xla/xla/stream_executor/gpu/gpu_executor.h
+++ b/third_party/xla/xla/stream_executor/gpu/gpu_executor.h
@@ -284,6 +284,8 @@ class GpuExecutor : public internal::StreamExecutorInterface {
     return it->second;
   }
 
+  int cc_major() const { return cc_major_; }
+
  private:
   // Host callback landing routine invoked by CUDA.
   // data: User-provided callback provided to HostCallback() above, captured

From 2ae6018db0ebfb4d1f8af0e6534a13296b3ecb5e Mon Sep 17 00:00:00 2001
From: Gabriel Rasskin <grasskin@google.com>
Date: Fri, 22 Sep 2023 11:54:16 -0700
Subject: [PATCH 162/567] Internal change only

PiperOrigin-RevId: 567680498
---
 tensorflow/dtensor/BUILD | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tensorflow/dtensor/BUILD b/tensorflow/dtensor/BUILD
index 97e263fb766bfb..b8117e927a2523 100644
--- a/tensorflow/dtensor/BUILD
+++ b/tensorflow/dtensor/BUILD
@@ -12,7 +12,6 @@ package_group(
         "//tensorflow_models/google/...",
         "//third_party/aimee/clara2_labs/...",
         "//third_party/py/jax_tpu_embedding/...",
-        "//third_party/py/keras/dtensor/...",
         "//third_party/py/tf_keras/dtensor/...",
     ],
 )

From 75ebcea4fed412394bd99acd85d37c29fc9a1fc5 Mon Sep 17 00:00:00 2001
From: Adam Cogdell <adamcogdell@google.com>
Date: Fri, 22 Sep 2023 12:26:42 -0700
Subject: [PATCH 163/567] Add found_fingerprint_on_load metric.

PiperOrigin-RevId: 567688573
---
 tensorflow/cc/saved_model/metrics.cc          | 16 +++++++++++
 tensorflow/cc/saved_model/metrics.h           | 26 +++++++++++------
 tensorflow/cc/saved_model/metrics_test.cc     | 11 ++++++++
 tensorflow/python/saved_model/load.py         |  7 ++++-
 .../pywrap_saved_model/metrics.pyi            |  6 ++++
 .../saved_model/pywrap_saved_model_metrics.cc | 28 +++++++++++++++++++
 .../pywrap_saved_model_metrics_test.py        | 17 +++++++++++
 .../tools/def_file_filter/symbols_pybind.txt  |  1 +
 .../tools/def_file_filter/symbols_pybind.txt  |  1 +
 9 files changed, 103 insertions(+), 10 deletions(-)

diff --git a/tensorflow/cc/saved_model/metrics.cc b/tensorflow/cc/saved_model/metrics.cc
index ca48bd70e61e3c..c21c0c8f68dcf6 100644
--- a/tensorflow/cc/saved_model/metrics.cc
+++ b/tensorflow/cc/saved_model/metrics.cc
@@ -107,6 +107,18 @@ auto* saved_model_read_path_and_singleprint =
         "graph_def_program_hash, signature_def_hash, saved_object_graph_hash, "
         "and checkpoint_hash) of the loaded SavedModel.");
 
+// Gauge that marks whether or not the fingerprint.pb file was found when
+// loading the SavedModel.
+// Can hold one of the following string values:
+//  - "FOUND"
+//  - "NOT_FOUND"
+//  - "ERROR"
+auto* saved_model_found_fingerprint_on_load =
+    monitoring::Gauge<std::string, 0>::New(
+        "/tensorflow/core/saved_model/found_fingerprint_on_load",
+        "Whether or not the fingerprint.pb file was found when loading the "
+        "SavedModel.");
+
 // Distribution of checkpoint write durations.
 auto* checkpoint_write_durations = monitoring::Sampler<1>::New(
     {
@@ -251,6 +263,10 @@ ParseSavedModelPathAndSingleprint(std::string path_and_singleprint) {
   return std::pair<std::string, std::string>(path, singleprint);
 }
 
+monitoring::GaugeCell<std::string>& SavedModelFoundFingerprintOnLoad() {
+  return *saved_model_found_fingerprint_on_load->GetCell();
+}
+
 monitoring::SamplerCell& CheckpointReadDuration(absl::string_view api_label) {
   return *checkpoint_read_durations->GetCell(std::string(api_label));
 }
diff --git a/tensorflow/cc/saved_model/metrics.h b/tensorflow/cc/saved_model/metrics.h
index 8a77be00bd22cf..21461206f76956 100644
--- a/tensorflow/cc/saved_model/metrics.h
+++ b/tensorflow/cc/saved_model/metrics.h
@@ -35,6 +35,10 @@ limitations under the License.
 namespace tensorflow {
 namespace metrics {
 
+const char kFingerprintFound[] = "FOUND";
+const char kFingerprintNotFound[] = "NOT_FOUND";
+const char kFingerprintError[] = "ERROR";
+
 // Returns "/tensorflow/core/saved_model/write/count" cell. This metric
 // has 1 field "write_version", which is equal to the
 // `tensorflow::libexport::GetWriteVersion` of the protobuf and should be
@@ -47,6 +51,16 @@ monitoring::CounterCell& SavedModelWriteCount(absl::string_view write_version);
 // incremented when a SavedModel has been successfully read.
 monitoring::CounterCell& SavedModelReadCount(absl::string_view write_version);
 
+// Returns "/tensorflow/core/saved_model/write/api" cell. This metric has 1
+// field "api_label" which corresponds to a SavedModel write API. The cell for
+// `foo` should be incremented when the write API `foo` is called.
+monitoring::CounterCell& SavedModelWriteApi(absl::string_view api_label);
+
+// Returns "/tensorflow/core/saved_model/read/api" cell. This metric has 1
+// field "api_label" which corresponds to a SavedModel read API. The cell for
+// `foo` should be incremented when the read API `foo` is called.
+monitoring::CounterCell& SavedModelReadApi(absl::string_view api_label);
+
 // Returns "/tensorflow/core/saved_model/write/fingerprint" cell, which contains
 // the saved_model_checksum of the SM's fingerprint when it is exported.
 monitoring::GaugeCell<std::string>& SavedModelWriteFingerprint();
@@ -87,15 +101,9 @@ absl::StatusOr<std::string> MakeSavedModelPathAndSingleprint(
 absl::StatusOr<std::pair<std::string, std::string>>
 ParseSavedModelPathAndSingleprint(std::string path_and_singleprint);
 
-// Returns "/tensorflow/core/saved_model/write/api" cell. This metric has 1
-// field "api_label" which corresponds to a SavedModel write API. The cell for
-// `foo` should be incremented when the write API `foo` is called.
-monitoring::CounterCell& SavedModelWriteApi(absl::string_view api_label);
-
-// Returns "/tensorflow/core/saved_model/read/api" cell. This metric has 1
-// field "api_label" which corresponds to a SavedModel read API. The cell for
-// `foo` should be incremented when the read API `foo` is called.
-monitoring::CounterCell& SavedModelReadApi(absl::string_view api_label);
+// Returns the gauge cell holding a string status that indicates whether or not
+// the fingerprint.pb file was found when loading the SavedModel.
+monitoring::GaugeCell<std::string>& SavedModelFoundFingerprintOnLoad();
 
 // Returns "/tensorflow/core/checkpoint/read/read_durations" cell belonging to
 // field `api_label`.
diff --git a/tensorflow/cc/saved_model/metrics_test.cc b/tensorflow/cc/saved_model/metrics_test.cc
index 4ab3174881fd8b..76cb0c940da82a 100644
--- a/tensorflow/cc/saved_model/metrics_test.cc
+++ b/tensorflow/cc/saved_model/metrics_test.cc
@@ -175,5 +175,16 @@ TEST(MetricsTest, TestMakeFingerprintJson) {
   EXPECT_EQ(fingerprint_json["checkpoint_hash"].asUInt64(), 5);
 }
 
+TEST(MetricsTest, TestFoundFingerprintOnLoad) {
+  EXPECT_EQ(SavedModelFoundFingerprintOnLoad().value(), "");
+
+  SavedModelFoundFingerprintOnLoad().Set(kFingerprintFound);
+  EXPECT_EQ(SavedModelFoundFingerprintOnLoad().value(), "FOUND");
+  SavedModelFoundFingerprintOnLoad().Set(kFingerprintNotFound);
+  EXPECT_EQ(SavedModelFoundFingerprintOnLoad().value(), "NOT_FOUND");
+  SavedModelFoundFingerprintOnLoad().Set(kFingerprintError);
+  EXPECT_EQ(SavedModelFoundFingerprintOnLoad().value(), "ERROR");
+}
+
 }  // namespace metrics
 }  // namespace tensorflow
diff --git a/tensorflow/python/saved_model/load.py b/tensorflow/python/saved_model/load.py
index 71ef550783763d..2be4ee77db7e62 100644
--- a/tensorflow/python/saved_model/load.py
+++ b/tensorflow/python/saved_model/load.py
@@ -1080,18 +1080,23 @@ def load_partial(export_dir, filters, tags=None, options=None):
   try:
     fingerprint = fingerprinting.read_fingerprint(export_dir)
   except FileNotFoundError:
+    metrics.SetFoundFingerprintOnLoad(found_status=metrics.kFingerprintNotFound)
     logging.info(
         "Fingerprint not found. Saved model loading will continue.")
     singleprint = ""
   except RuntimeError:
+    metrics.SetFoundFingerprintOnLoad(found_status=metrics.kFingerprintError)
     logging.exception(
-        "Fingerprint was found, but there was an error when reading the proto.")
+        "Fingerprint was found, but there was an error when reading the proto. "
+        "Saved model loading will continue.")
     singleprint = ""
   else:
+    metrics.SetFoundFingerprintOnLoad(found_status=metrics.kFingerprintFound)
     metrics.SetReadFingerprint(
         fingerprint=fingerprinting_utils.to_proto(
             fingerprint).SerializeToString())
     singleprint = fingerprint.singleprint()
+
   try:
     metrics.SetReadPathAndSingleprint(path=export_dir, singleprint=singleprint)
   except metrics.MetricException:
diff --git a/tensorflow/python/saved_model/pywrap_saved_model/metrics.pyi b/tensorflow/python/saved_model/pywrap_saved_model/metrics.pyi
index e73e6dc2fb3b1e..21f00a6396e129 100644
--- a/tensorflow/python/saved_model/pywrap_saved_model/metrics.pyi
+++ b/tensorflow/python/saved_model/pywrap_saved_model/metrics.pyi
@@ -15,6 +15,10 @@
 
 from typing import Any, Tuple
 
+kFingerprintError: str
+kFingerprintFound: str
+kFingerprintNotFound: str
+
 class MetricException(Exception): ...
 
 def AddAsyncCheckpointWriteDuration(*args, **kwargs) -> Any: ...
@@ -26,6 +30,7 @@ def GetAsyncCheckpointWriteDurations(*args, **kwargs) -> Any: ...
 def GetCheckpointReadDurations(*args, **kwargs) -> Any: ...
 def GetCheckpointSize(*args, **kwargs) -> Any: ...
 def GetCheckpointWriteDurations(*args, **kwargs) -> Any: ...
+def GetFoundFingerprintOnLoad() -> str: ...
 def GetRead(*args, **kwargs) -> Any: ...
 def GetReadApi(arg0: str) -> int: ...
 def GetReadFingerprint() -> str: ...
@@ -42,6 +47,7 @@ def IncrementReadApi(arg0: str) -> None: ...
 def IncrementWrite(*args, **kwargs) -> Any: ...
 def IncrementWriteApi(arg0: str) -> None: ...
 def RecordCheckpointSize(*args, **kwargs) -> Any: ...
+def SetFoundFingerprintOnLoad(*args, **kwargs) -> Any: ...
 def SetReadFingerprint(*args, **kwargs) -> Any: ...
 def SetReadPath(*args, **kwargs) -> Any: ...
 def SetReadPathAndSingleprint(*args, **kwargs) -> Any: ...
diff --git a/tensorflow/python/saved_model/pywrap_saved_model_metrics.cc b/tensorflow/python/saved_model/pywrap_saved_model_metrics.cc
index e6325f7598d12a..eac5bdc30660cb 100644
--- a/tensorflow/python/saved_model/pywrap_saved_model_metrics.cc
+++ b/tensorflow/python/saved_model/pywrap_saved_model_metrics.cc
@@ -261,6 +261,34 @@ void DefineMetricsModule(py::module main_module) {
           "Get tuple of `path` and `singleprint` values of "
           "'/tensorflow/core/saved_model/write/path_and_singleprint' gauge."));
 
+  m.attr("kFingerprintFound") = metrics::kFingerprintFound;
+  m.attr("kFingerprintNotFound") = metrics::kFingerprintNotFound;
+  m.attr("kFingerprintError") = metrics::kFingerprintError;
+
+  m.def(
+      "GetFoundFingerprintOnLoad",
+      []() { return metrics::SavedModelFoundFingerprintOnLoad().value(); },
+      py::doc(
+          "Get value of "
+          "'/tensorflow/core/saved_model/found_fingerprint_on_load' gauge."));
+
+  m.def(
+      "SetFoundFingerprintOnLoad",
+      [](std::string found_status) {
+        if (found_status == metrics::kFingerprintFound ||
+            found_status == metrics::kFingerprintNotFound ||
+            found_status == metrics::kFingerprintError) {
+          metrics::SavedModelFoundFingerprintOnLoad().Set(found_status);
+        } else {
+          metrics::SavedModelFoundFingerprintOnLoad().Set("");
+        }
+      },
+      py::kw_only(), py::arg("found_status"),
+      py::doc("Set value of "
+              "'/tensorflow/core/saved_model/found_fingerprint_on_load' gauge "
+              "with 'found_status' if status is one of { \"FOUND\", "
+              "\"NOT_FOUND\", \"ERROR\" }."));
+
   m.def(
       "AddCheckpointReadDuration",
       [](const char* api_label, double microseconds) {
diff --git a/tensorflow/python/saved_model/pywrap_saved_model_metrics_test.py b/tensorflow/python/saved_model/pywrap_saved_model_metrics_test.py
index 9920a0302fc5ca..dcf057ffd976ea 100644
--- a/tensorflow/python/saved_model/pywrap_saved_model_metrics_test.py
+++ b/tensorflow/python/saved_model/pywrap_saved_model_metrics_test.py
@@ -195,6 +195,23 @@ def test_SM_write_invalid_path_and_singleprint(self):
         str(excinfo.exception),
         "Invalid path_and_singleprint argument. Empty singleprint.")
 
+  def test_SM_found_fingerprint_on_load(self):
+    metrics.SetFoundFingerprintOnLoad(found_status=metrics.kFingerprintFound)
+    self.assertEqual(metrics.GetFoundFingerprintOnLoad(), "FOUND")
+
+    metrics.SetFoundFingerprintOnLoad(found_status=metrics.kFingerprintNotFound)
+    self.assertEqual(metrics.GetFoundFingerprintOnLoad(), "NOT_FOUND")
+
+    metrics.SetFoundFingerprintOnLoad(found_status=metrics.kFingerprintError)
+    self.assertEqual(metrics.GetFoundFingerprintOnLoad(), "ERROR")
+
+  def test_invalid_SM_found_fingerprint_on_load(self):
+    metrics.SetFoundFingerprintOnLoad(found_status="absolute nonsense")
+    self.assertEqual(metrics.GetFoundFingerprintOnLoad(), "")
+
+    metrics.SetFoundFingerprintOnLoad(found_status="found")
+    self.assertEqual(metrics.GetFoundFingerprintOnLoad(), "")
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/tools/def_file_filter/symbols_pybind.txt b/tensorflow/tools/def_file_filter/symbols_pybind.txt
index 2dedeaa23d5ba2..0f52e9fa804dae 100644
--- a/tensorflow/tools/def_file_filter/symbols_pybind.txt
+++ b/tensorflow/tools/def_file_filter/symbols_pybind.txt
@@ -462,6 +462,7 @@ tensorflow::metrics::SavedModelReadPathAndSingleprint
 tensorflow::metrics::MakeFingerprintJson
 tensorflow::metrics::MakeSavedModelPathAndSingleprint
 tensorflow::metrics::ParseSavedModelPathAndSingleprint
+tensorflow::metrics::SavedModelFoundFingerprintOnLoad
 tensorflow::metrics::CheckpointReadDuration
 tensorflow::metrics::CheckpointWriteDuration
 tensorflow::metrics::AsyncCheckpointWriteDuration
diff --git a/third_party/xla/third_party/tsl/tools/def_file_filter/symbols_pybind.txt b/third_party/xla/third_party/tsl/tools/def_file_filter/symbols_pybind.txt
index 2dedeaa23d5ba2..0f52e9fa804dae 100644
--- a/third_party/xla/third_party/tsl/tools/def_file_filter/symbols_pybind.txt
+++ b/third_party/xla/third_party/tsl/tools/def_file_filter/symbols_pybind.txt
@@ -462,6 +462,7 @@ tensorflow::metrics::SavedModelReadPathAndSingleprint
 tensorflow::metrics::MakeFingerprintJson
 tensorflow::metrics::MakeSavedModelPathAndSingleprint
 tensorflow::metrics::ParseSavedModelPathAndSingleprint
+tensorflow::metrics::SavedModelFoundFingerprintOnLoad
 tensorflow::metrics::CheckpointReadDuration
 tensorflow::metrics::CheckpointWriteDuration
 tensorflow::metrics::AsyncCheckpointWriteDuration

From 11d6dacde62d48bdd9ba8af063ffbcb2df6ce0a7 Mon Sep 17 00:00:00 2001
From: Fangrui Song <maskray@google.com>
Date: Fri, 22 Sep 2023 12:37:38 -0700
Subject: [PATCH 164/567] Integrate LLVM at llvm/llvm-project@6e3827af98fa

Updates LLVM usage to match
[6e3827af98fa](https://github.com/llvm/llvm-project/commit/6e3827af98fa)

PiperOrigin-RevId: 567691274
---
 third_party/llvm/workspace.bzl | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/third_party/llvm/workspace.bzl b/third_party/llvm/workspace.bzl
index 288b1748010f35..eb146861cd0cb1 100644
--- a/third_party/llvm/workspace.bzl
+++ b/third_party/llvm/workspace.bzl
@@ -4,8 +4,8 @@ load("//third_party:repo.bzl", "tf_http_archive")
 
 def repo(name):
     """Imports LLVM."""
-    LLVM_COMMIT = "058222b2316615194c089f2bc68d11341f39d26e"
-    LLVM_SHA256 = "99d3c38eb11dee8f00bd74b69152d961ab73cf4488842f6120e81342eeb94a3b"
+    LLVM_COMMIT = "6e3827af98fa59d5147598972625a5317936c31f"
+    LLVM_SHA256 = "bda0e24e2b92f19d2929237101edc1f66fa64f5407d32cbabaf44f878ff0827c"
 
     tf_http_archive(
         name = name,

From 56a9f7f8404b95d62c1bae880fdaf88893166316 Mon Sep 17 00:00:00 2001
From: Edward Schwartz <schwartzedward@google.com>
Date: Fri, 22 Sep 2023 12:40:55 -0700
Subject: [PATCH 165/567] Bincount with weights testing and documentation
 improvements

* Enable tests for Bincount with weights for `xla_gpu` and only skip for `gpu`.

* Update error message in GPU Bincount kernel to add `tf.function(jit_compile=True)` suggestion for using weights.

* Document equivalence between bincount and unsorted_segment_sum and possible use as workaround on GPU when not using XLA (for Tensor and RaggedTensor but not SparseTensor).

* Update examples in documentation to use doctest and other documentation improvements.

(No behavior changes.)

PiperOrigin-RevId: 567692049
---
 tensorflow/core/kernels/bincount_op_gpu.cu.cc |  3 +-
 tensorflow/python/ops/bincount_ops.py         | 46 ++++++++++++-------
 tensorflow/python/ops/bincount_ops_test.py    |  3 +-
 .../python/ops/ragged/ragged_bincount_ops.py  | 27 ++++++++---
 .../ops/ragged/ragged_bincount_ops_test.py    |  3 +-
 .../python/ops/sparse_bincount_ops_test.py    |  3 +-
 tensorflow/python/ops/sparse_ops.py           | 20 +++++---
 7 files changed, 72 insertions(+), 33 deletions(-)

diff --git a/tensorflow/core/kernels/bincount_op_gpu.cu.cc b/tensorflow/core/kernels/bincount_op_gpu.cu.cc
index cb3522c1c015ec..8389fb2a8e1180 100644
--- a/tensorflow/core/kernels/bincount_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/bincount_op_gpu.cu.cc
@@ -44,7 +44,8 @@ struct BincountFunctor<GPUDevice, Tidx, T, false> {
     if (weights.size() != 0) {
       return errors::Unimplemented(
           "Weights are not yet supported by the GPU implementation of Bincount."
-          " Please use unsorted_segment_sum instead.");
+          " Please use unsorted_segment_sum instead or put Bincount inside"
+          " tf.function(jit_compile=True).");
     }
     if (output.size() == 0) {
       return OkStatus();
diff --git a/tensorflow/python/ops/bincount_ops.py b/tensorflow/python/ops/bincount_ops.py
index 7d19fc3a4cbffd..f04866a223e907 100644
--- a/tensorflow/python/ops/bincount_ops.py
+++ b/tensorflow/python/ops/bincount_ops.py
@@ -40,14 +40,11 @@ def bincount(arr,
 
   If `minlength` and `maxlength` are not given, returns a vector with length
   `tf.reduce_max(arr) + 1` if `arr` is non-empty, and length 0 otherwise.
-  If `weights` are non-None, then index `i` of the output stores the sum of the
-  value in `weights` at each index where the corresponding value in `arr` is
-  `i`.
 
-  ```python
-  values = tf.constant([1,1,2,3,2,4,4,5])
-  tf.math.bincount(values) #[0 2 2 1 2 1]
-  ```
+  >>> values = tf.constant([1,1,2,3,2,4,4,5])
+  >>> tf.math.bincount(values)
+  <tf.Tensor: ... numpy=array([0, 2, 2, 1, 2, 1], dtype=int32)>
+
   Vector length = Maximum element in vector `values` is 5. Adding 1, which is 6
                   will be the vector length.
 
@@ -55,14 +52,32 @@ def bincount(arr,
   index. Here, index 1 in output has a value 2. This indicates value 1 occurs
   two times in `values`.
 
-  ```python
-  values = tf.constant([1,1,2,3,2,4,4,5])
-  weights = tf.constant([1,5,0,1,0,5,4,5])
-  tf.math.bincount(values, weights=weights) #[0 6 0 1 9 5]
-  ```
-  Bin will be incremented by the corresponding weight instead of 1.
-  Here, index 1 in output has a value 6. This is the summation of weights
-  corresponding to the value in `values`.
+  **Bin-counting with weights**
+
+  >>> values = tf.constant([1,1,2,3,2,4,4,5])
+  >>> weights = tf.constant([1,5,0,1,0,5,4,5])
+  >>> tf.math.bincount(values, weights=weights)
+  <tf.Tensor: ... numpy=array([0, 6, 0, 1, 9, 5], dtype=int32)>
+
+  When `weights` is specified, bins will be incremented by the corresponding
+  weight instead of 1. Here, index 1 in output has a value 6. This is the
+  summation of `weights` corresponding to the value in `values` (i.e. for index
+  1, the first two values are 1 so the first two weights, 1 and 5, are
+  summed).
+
+  There is an equivalence between bin-counting with weights and
+  `unsorted_segment_sum` where `data` is the weights and `segment_ids` are the
+  values.
+
+  >>> values = tf.constant([1,1,2,3,2,4,4,5])
+  >>> weights = tf.constant([1,5,0,1,0,5,4,5])
+  >>> tf.math.unsorted_segment_sum(weights, values, num_segments=6).numpy()
+  array([0, 6, 0, 1, 9, 5], dtype=int32)
+
+  On GPU, `bincount` with weights is only supported when XLA is enabled
+  (typically in a function decorated with `@tf.function(jit_compile=True)`).
+  `unsorted_segment_sum` can be used as a workaround for the non-XLA case on
+  GPU.
 
   **Bin-counting matrix rows independently**
 
@@ -76,7 +91,6 @@ def bincount(arr,
     array([[1, 1, 1, 1],
            [2, 1, 1, 0]], dtype=int32)>
 
-
   **Bin-counting with binary_output**
 
   This example gives binary output instead of counting the occurrence.
diff --git a/tensorflow/python/ops/bincount_ops_test.py b/tensorflow/python/ops/bincount_ops_test.py
index 4e14b1b8a45ef8..d1eff521883077 100644
--- a/tensorflow/python/ops/bincount_ops_test.py
+++ b/tensorflow/python/ops/bincount_ops_test.py
@@ -319,7 +319,8 @@ def test_weights(
       expected,
       axis=None,
   ):
-    if "GPU" in set([d.device_type for d in tf_config.list_physical_devices()]):
+    device_set = set([d.device_type for d in tf_config.list_physical_devices()])
+    if "GPU" in device_set and not test_util.is_xla_enabled():
       self.skipTest(
           "b/263004039 The DenseBincount GPU kernel does not support weights."
           " unsorted_segment_sum should be used instead on GPU."
diff --git a/tensorflow/python/ops/ragged/ragged_bincount_ops.py b/tensorflow/python/ops/ragged/ragged_bincount_ops.py
index 1e4cc273701d60..7cae73ba66db8c 100644
--- a/tensorflow/python/ops/ragged/ragged_bincount_ops.py
+++ b/tensorflow/python/ops/ragged/ragged_bincount_ops.py
@@ -41,9 +41,6 @@ def bincount(arr: ragged_tensor.RaggedTensor,
 
   If `minlength` and `maxlength` are not given, returns a vector with length
   `tf.reduce_max(arr) + 1` if `arr` is non-empty, and length 0 otherwise.
-  If `weights` are non-None, then index `i` of the output stores the sum of the
-  value in `weights` at each index where the corresponding value in `arr` is
-  `i`.
 
   >>> data = tf.ragged.constant([[1, 1], [2, 3, 2, 4, 4, 5]])
   >>> tf.math.bincount(data)
@@ -56,14 +53,32 @@ def bincount(arr: ragged_tensor.RaggedTensor,
   index. Here, index 1 in output has a value 2. This indicates value 1 occurs
   two times in `values`.
 
+  **Bin-counting with weights**
+
   >>> data = tf.ragged.constant([[1, 1], [2, 3, 2, 4, 4, 5]])
   >>> weights = tf.ragged.constant([[1, 5], [0, 1, 0, 5, 4, 5]])
   >>> tf.math.bincount(data, weights=weights)
   <tf.Tensor: ... numpy=array([0, 6, 0, 1, 9, 5], dtype=int32)>
 
-  Bin will be incremented by the corresponding weight instead of 1.
-  Here, index 1 in output has a value 6. This is the summation of weights
-  corresponding to the value in `values`.
+  When `weights` is specified, bins will be incremented by the corresponding
+  weight instead of 1. Here, index 1 in output has a value 6. This is the
+  summation of `weights` corresponding to the value in `arr` (i.e. for index
+  1, the first two values in `arr` are 1 so the first two weights, 1 and 5, are
+  summed).
+
+  There is an equivalence between bin-counting with weights and
+  `unsorted_segment_sum` where `data` is the weights and `segment_ids` are the
+  values.
+
+  >>> data = tf.ragged.constant([[1, 1], [2, 3, 2, 4, 4, 5]])
+  >>> weights = tf.ragged.constant([[1, 5], [0, 1, 0, 5, 4, 5]])
+  >>> tf.math.unsorted_segment_sum(weights, data, num_segments=6).numpy()
+  array([0, 6, 0, 1, 9, 5], dtype=int32)
+
+  On GPU, `bincount` with weights is only supported when XLA is enabled
+  (typically in a function decorated with `@tf.function(jit_compile=True)`).
+  `unsorted_segment_sum` can be used as a workaround for the non-XLA case on
+  GPU.
 
   **Bin-counting matrix rows independently**
 
diff --git a/tensorflow/python/ops/ragged/ragged_bincount_ops_test.py b/tensorflow/python/ops/ragged/ragged_bincount_ops_test.py
index bda3dc32c6eedd..79b046d3072a53 100644
--- a/tensorflow/python/ops/ragged/ragged_bincount_ops_test.py
+++ b/tensorflow/python/ops/ragged/ragged_bincount_ops_test.py
@@ -281,7 +281,8 @@ def test_weights(
       expected,
       axis,
   ):
-    if "GPU" in set([d.device_type for d in tf_config.list_physical_devices()]):
+    device_set = set([d.device_type for d in tf_config.list_physical_devices()])
+    if "GPU" in device_set and not test_util.is_xla_enabled():
       self.skipTest(
           "b/263004039 The DenseBincount GPU kernel does not support weights."
           " unsorted_segment_sum should be used instead on GPU."
diff --git a/tensorflow/python/ops/sparse_bincount_ops_test.py b/tensorflow/python/ops/sparse_bincount_ops_test.py
index f5f86603bfd060..e6cc1c1fb747e8 100644
--- a/tensorflow/python/ops/sparse_bincount_ops_test.py
+++ b/tensorflow/python/ops/sparse_bincount_ops_test.py
@@ -589,7 +589,8 @@ def test_weights(
       expected,
       axis,
   ):
-    if "GPU" in set([d.device_type for d in tf_config.list_physical_devices()]):
+    device_set = set([d.device_type for d in tf_config.list_physical_devices()])
+    if "GPU" in device_set and not test_util.is_xla_enabled():
       self.skipTest(
           "b/263004039 The DenseBincount GPU kernel does not support weights."
           " unsorted_segment_sum should be used instead on GPU."
diff --git a/tensorflow/python/ops/sparse_ops.py b/tensorflow/python/ops/sparse_ops.py
index 82cee2d96e0866..74cc5e4eddfd0a 100644
--- a/tensorflow/python/ops/sparse_ops.py
+++ b/tensorflow/python/ops/sparse_ops.py
@@ -3012,9 +3012,6 @@ def bincount(arr: sparse_tensor.SparseTensor,
 
   If `minlength` and `maxlength` are not given, returns a vector with length
   `tf.reduce_max(arr) + 1` if `arr` is non-empty, and length 0 otherwise.
-  If `weights` are non-None, then index `i` of the output stores the sum of the
-  value in `weights` at each index where the corresponding value in `arr` is
-  `i`.
 
   >>> data = tf.sparse.SparseTensor(
   ...     indices=[[0, 3], [1, 7], [2, 4], [3, 0],
@@ -3031,6 +3028,8 @@ def bincount(arr: sparse_tensor.SparseTensor,
   index. Here, index 1 in output has a value 2. This indicates value 1 occurs
   two times in `values`.
 
+  **Bin-counting with weights**
+
   >>> indices=[[0, 3], [1, 7], [2, 4], [3, 0], [4, 9], [5, 1], [6, 8], [7, 2]]
   >>> data = tf.sparse.SparseTensor(
   ...     indices=indices,
@@ -3043,9 +3042,15 @@ def bincount(arr: sparse_tensor.SparseTensor,
   >>> tf.math.bincount(data, weights=weights)
   <tf.Tensor: ... numpy=array([0, 6, 0, 1, 9, 5], dtype=int32)>
 
-  Bin will be incremented by the corresponding weight instead of 1.
-  Here, index 1 in output has a value 6. This is the summation of weights
-  corresponding to the value in `values`.
+  When `weights` is specified, bins will be incremented by the corresponding
+  weight instead of 1. Here, index 1 in output has a value 6. This is the
+  summation of `weights` corresponding to the value in `values` (i.e. for index
+  1, the first two data values are 1 so the first two weights, 1 and 5, are
+  summed).
+
+  On GPU, `bincount` with weights is only supported when `axis=0` and XLA is
+  enabled (typically in a function decorated with
+  `@tf.function(jit_compile=True)`).
 
   **Bin-counting matrix rows independently**
 
@@ -3128,7 +3133,8 @@ def bincount(arr: sparse_tensor.SparseTensor,
     name: A name scope for the associated operations (optional).
     axis: The axis to slice over. Axes at and below `axis` will be flattened
       before bin counting. Currently, only `0`, and `-1` are supported. If None,
-      all axes will be flattened (identical to passing `0`).
+      all axes will be flattened (identical to passing `0`). XLA does not
+      support `axis=-1`.
     binary_output: If True, this op will output 1 instead of the number of times
       a token appears (equivalent to one_hot + reduce_any instead of one_hot +
       reduce_add). Defaults to False.

From 5e2d5c50c2e9a7ecce00c8859d8f7fa1fddb752c Mon Sep 17 00:00:00 2001
From: Gabriel Rasskin <grasskin@google.com>
Date: Fri, 22 Sep 2023 12:51:01 -0700
Subject: [PATCH 166/567] Internal Code Change

PiperOrigin-RevId: 567694546
---
 tensorflow/BUILD                          | 1 -
 tensorflow/compiler/tf2xla/BUILD          | 1 -
 tensorflow/python/BUILD                   | 2 --
 tensorflow/python/distribute/BUILD        | 3 ---
 tensorflow/python/framework/BUILD         | 2 --
 tensorflow/python/layers/BUILD            | 2 +-
 tensorflow/python/lib/core/BUILD          | 1 -
 tensorflow/python/ops/BUILD               | 1 -
 tensorflow/python/util/BUILD              | 1 -
 tensorflow/python/util/keras_deps.py      | 2 +-
 tensorflow/python/util/protobuf/BUILD     | 1 -
 third_party/xla/third_party/tsl/tsl/BUILD | 1 -
 12 files changed, 2 insertions(+), 16 deletions(-)

diff --git a/tensorflow/BUILD b/tensorflow/BUILD
index 3bd7e8564bee16..840e3d4c174664 100644
--- a/tensorflow/BUILD
+++ b/tensorflow/BUILD
@@ -1054,7 +1054,6 @@ package_group(
         "//third_party/py/cloud_ml_autoflow/...",
         "//third_party/py/envlogger/...",
         "//third_party/py/gldm/...",
-        "//third_party/py/keras/...",
         "//third_party/py/tf_keras/...",
         "//third_party/yggdrasil_decision_forests/...",
         "//waymo/ml/cn/...",
diff --git a/tensorflow/compiler/tf2xla/BUILD b/tensorflow/compiler/tf2xla/BUILD
index 44ecb3ee0e2b27..08a499f25c2083 100644
--- a/tensorflow/compiler/tf2xla/BUILD
+++ b/tensorflow/compiler/tf2xla/BUILD
@@ -50,7 +50,6 @@ package_group(
         "//third_party/deepmind/deepmind_research/density_functional_approximation_dm21/...",
         "//third_party/mlir_edge/model_curriculum/iree/...",
         "//third_party/mlperf/submissions/training/v0_7/models/...",
-        "//third_party/py/keras/...",
         "//third_party/py/keras_cv/...",
         "//third_party/py/tf_keras/...",
         "//waymo/ml/deploy/benchmark/...",
diff --git a/tensorflow/python/BUILD b/tensorflow/python/BUILD
index 4f7b1427e37ce1..4d79a70ca67e2c 100644
--- a/tensorflow/python/BUILD
+++ b/tensorflow/python/BUILD
@@ -47,7 +47,6 @@ visibility = [
     "//third_party/py/tensorflow_recommenders:__subpackages__",
     "//third_party/py/tf_agents:__subpackages__",  # For benchmarks.
     "//third_party/py/tf_slim:__subpackages__",
-    "//third_party/py/keras:__subpackages__",
     "//third_party/py/tf_keras:__subpackages__",
     "//third_party/py/starcraft2:__subpackages__",
     "//third_party/py/tensorflow_gnn:__subpackages__",
@@ -129,7 +128,6 @@ py_strict_library(
         "//tensorflow/python/tools/api/generator:__pkg__",
         "//tensorflow/tools/api/tests:__pkg__",
         "//tensorflow/tools/compatibility/update:__pkg__",
-        "//third_party/py/keras:__subpackages__",
         "//third_party/py/tensorflow_core:__subpackages__",
     ],
     deps = [
diff --git a/tensorflow/python/distribute/BUILD b/tensorflow/python/distribute/BUILD
index 62662abd82517c..0c233b2b4a455b 100644
--- a/tensorflow/python/distribute/BUILD
+++ b/tensorflow/python/distribute/BUILD
@@ -8,7 +8,6 @@ package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
     default_visibility = [
         "//tensorflow:internal",
-        "//third_party/py/keras:__subpackages__",  # TODO(scottzhu): remove this once keras is relying on tf.__internal__.
     ],
     licenses = ["notice"],
 )
@@ -1107,7 +1106,6 @@ py_strict_library(
     visibility = [
         "//tensorflow:internal",
         "//tensorflow_models:__subpackages__",
-        "//third_party/py/keras:__subpackages__",
     ],
     deps = [
         ":collective_all_reduce_strategy",
@@ -1154,7 +1152,6 @@ py_strict_library(
     visibility = [
         "//tensorflow:internal",
         "//tensorflow_models:__subpackages__",
-        "//third_party/py/keras:__subpackages__",
     ],
     deps = [
         ":central_storage_strategy",
diff --git a/tensorflow/python/framework/BUILD b/tensorflow/python/framework/BUILD
index 09f576bce4a250..a5ca3b6ea44acc 100644
--- a/tensorflow/python/framework/BUILD
+++ b/tensorflow/python/framework/BUILD
@@ -2035,7 +2035,6 @@ py_strict_library(
         "//tensorflow_estimator/python/estimator:__subpackages__",
         "//tensorflow_model_optimization:__subpackages__",
         "//third_party/cloud_tpu/convergence_tools:__subpackages__",
-        "//third_party/py/keras:__subpackages__",
         "//third_party/py/neural_structured_learning:__subpackages__",
         "//third_party/py/tf_agents:__subpackages__",
         "//third_party/py/tf_keras:__subpackages__",
@@ -2109,7 +2108,6 @@ py_strict_library(
     visibility = visibility + [
         "//tensorflow:internal",
         "//tensorflow_models:__subpackages__",
-        "//third_party/py/keras:__subpackages__",
         "//third_party/py/keras_cv:__subpackages__",
         "//third_party/py/tf_keras:__subpackages__",
     ],
diff --git a/tensorflow/python/layers/BUILD b/tensorflow/python/layers/BUILD
index d1a56ff4bc9506..0a80d5731f7c13 100644
--- a/tensorflow/python/layers/BUILD
+++ b/tensorflow/python/layers/BUILD
@@ -64,7 +64,7 @@ py_strict_library(
         "//tensorflow/python/keras/legacy_tf_layers:core",
         "//tensorflow/python/keras/legacy_tf_layers:pooling",
         "//tensorflow/python/util:lazy_loader",
-        # Normalization layer will need //third_party/py/keras/legacy_tf_layers:normalization
+        # Normalization layer will need //third_party/py/tf_keras/legacy_tf_layers:normalization
         # Client lib should import that, since this target can't import it due to
         # circular dependency.
     ],
diff --git a/tensorflow/python/lib/core/BUILD b/tensorflow/python/lib/core/BUILD
index 67385a894a37d9..a6904053c60b76 100644
--- a/tensorflow/python/lib/core/BUILD
+++ b/tensorflow/python/lib/core/BUILD
@@ -18,7 +18,6 @@ visibility = [
     "//third_party/py/tf_agents:__subpackages__",  # For benchmarks.
     "//third_party/py/tf_slim:__subpackages__",
     "//third_party/py/tensorflow_docs:__subpackages__",
-    "//third_party/py/keras:__subpackages__",
 ]
 
 package(
diff --git a/tensorflow/python/ops/BUILD b/tensorflow/python/ops/BUILD
index 00679cbf1d36ec..cba5c7c60e69ac 100644
--- a/tensorflow/python/ops/BUILD
+++ b/tensorflow/python/ops/BUILD
@@ -25,7 +25,6 @@ visibility = [
     "//third_party/py/tf_agents:__subpackages__",  # For benchmarks.
     "//third_party/py/tf_slim:__subpackages__",
     "//third_party/py/tf_keras:__subpackages__",
-    "//third_party/py/keras:__subpackages__",
     "//third_party/py/starcraft2:__subpackages__",
     "//third_party/py/tensorflow_gnn:__subpackages__",
     "//third_party/py/tensorflow_numerics:__subpackages__",
diff --git a/tensorflow/python/util/BUILD b/tensorflow/python/util/BUILD
index 9e0eb80d2fb1dd..d1e7d626a1d3e7 100644
--- a/tensorflow/python/util/BUILD
+++ b/tensorflow/python/util/BUILD
@@ -23,7 +23,6 @@ visibility = [
     "//third_party/py/tf_agents:__subpackages__",  # For benchmarks.
     "//third_party/py/tf_slim:__subpackages__",
     "//third_party/py/tensorflow_docs:__subpackages__",
-    "//third_party/py/keras:__subpackages__",
     "//third_party/py/tf_keras:__subpackages__",
 ]
 
diff --git a/tensorflow/python/util/keras_deps.py b/tensorflow/python/util/keras_deps.py
index 837b121f5bd669..99daeaa2378634 100644
--- a/tensorflow/python/util/keras_deps.py
+++ b/tensorflow/python/util/keras_deps.py
@@ -61,7 +61,7 @@ def register_load_model_function(func):
 
 
 # This is used to register the in_load_context function in
-# third_party/py/keras/saving/saved_model/load_context.py for use in
+# third_party/py/tf_keras/saving/saved_model/load_context.py for use in
 # third_party/tensorflow library.
 @tf_export('__internal__.register_load_context_function', v1=[])
 def register_load_context_function(func):
diff --git a/tensorflow/python/util/protobuf/BUILD b/tensorflow/python/util/protobuf/BUILD
index f026e3734f33f9..783e6a64707329 100644
--- a/tensorflow/python/util/protobuf/BUILD
+++ b/tensorflow/python/util/protobuf/BUILD
@@ -21,7 +21,6 @@ visibility = [
     "//third_party/py/tf_agents:__subpackages__",  # For benchmarks.
     "//third_party/py/tf_slim:__subpackages__",
     "//third_party/py/tensorflow_docs:__subpackages__",
-    "//third_party/py/keras:__subpackages__",
 ]
 
 package(
diff --git a/third_party/xla/third_party/tsl/tsl/BUILD b/third_party/xla/third_party/tsl/tsl/BUILD
index 7045fe93da7e0e..e40e9900022b1a 100644
--- a/third_party/xla/third_party/tsl/tsl/BUILD
+++ b/third_party/xla/third_party/tsl/tsl/BUILD
@@ -441,7 +441,6 @@ package_group(
         "//third_party/cloud_tpu/inference_converter/...",
         "//third_party/py/cloud_ml_autoflow/...",
         "//third_party/py/envlogger/...",
-        "//third_party/py/keras/...",
         "//third_party/yggdrasil_decision_forests/...",
         "//tsl/...",
     ],

From 7c8a95f2ab9b8996eccf5c33729018a45af467cb Mon Sep 17 00:00:00 2001
From: Shixin Li <shixinli@google.com>
Date: Fri, 22 Sep 2023 13:05:26 -0700
Subject: [PATCH 167/567] Enable cross compilation for PJRT GPU compiler: 1.
 StreamExecutorGpuCompiler compiles w/o client. 2. Add
 StreamExecutorUnloadedExecutable (the unloaded PjRt executable). 3. Load
 StreamExecutorUnloadedExecutable to PjRtLoadedExecutable through `Load` API.

PiperOrigin-RevId: 567697879
---
 third_party/xla/xla/client/local_client.h     |   2 +
 third_party/xla/xla/pjrt/BUILD                |  16 ++
 third_party/xla/xla/pjrt/gpu/BUILD            |  95 +++++++++-
 .../xla/xla/pjrt/gpu/se_gpu_pjrt_client.cc    |  45 +++++
 .../xla/xla/pjrt/gpu/se_gpu_pjrt_client.h     |   5 +
 .../xla/xla/pjrt/gpu/se_gpu_pjrt_compiler.cc  | 108 +++++++++++
 .../xla/xla/pjrt/gpu/se_gpu_pjrt_compiler.h   |  15 ++
 .../pjrt/gpu/se_gpu_pjrt_compiler_aot_test.cc | 167 ++++++++++++++++++
 .../xla/pjrt/gpu/se_gpu_pjrt_compiler_test.cc |   1 +
 .../xla/pjrt/pjrt_stream_executor_client.cc   |  39 ++--
 .../xla/pjrt/pjrt_stream_executor_client.h    |   1 +
 .../stream_executor_unloaded_executable.cc    |  31 ++++
 .../stream_executor_unloaded_executable.h     |  78 ++++++++
 .../stream_executor_unloaded_executable.proto |  28 +++
 third_party/xla/xla/service/gpu/BUILD         |  14 ++
 .../xla/xla/service/gpu/gpu_compiler.cc       |  15 --
 .../xla/xla/service/gpu/gpu_compiler.h        |  13 +-
 .../xla/xla/service/gpu/gpu_target_config.cc  |  38 ++++
 .../xla/xla/service/gpu/gpu_target_config.h   |  41 +++++
 19 files changed, 705 insertions(+), 47 deletions(-)
 create mode 100644 third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_compiler_aot_test.cc
 create mode 100644 third_party/xla/xla/pjrt/stream_executor_unloaded_executable.cc
 create mode 100644 third_party/xla/xla/pjrt/stream_executor_unloaded_executable.h
 create mode 100644 third_party/xla/xla/pjrt/stream_executor_unloaded_executable.proto
 create mode 100644 third_party/xla/xla/service/gpu/gpu_target_config.cc
 create mode 100644 third_party/xla/xla/service/gpu/gpu_target_config.h

diff --git a/third_party/xla/xla/client/local_client.h b/third_party/xla/xla/client/local_client.h
index 3b657b5113a00f..f14b98581a9fb2 100644
--- a/third_party/xla/xla/client/local_client.h
+++ b/third_party/xla/xla/client/local_client.h
@@ -225,6 +225,8 @@ class LocalClient : public Client {
   const Backend& backend() const;
   Backend* mutable_backend();
 
+  LocalService* local_service() { return local_service_; }
+
  private:
   LocalService* local_service_;
 };
diff --git a/third_party/xla/xla/pjrt/BUILD b/third_party/xla/xla/pjrt/BUILD
index a2aae828c0ddd3..dac1259700f4ee 100644
--- a/third_party/xla/xla/pjrt/BUILD
+++ b/third_party/xla/xla/pjrt/BUILD
@@ -372,6 +372,20 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "stream_executor_unloaded_executable",
+    srcs = ["stream_executor_unloaded_executable.cc"],
+    hdrs = ["stream_executor_unloaded_executable.h"],
+    visibility = ["//visibility:public"],
+    deps = [
+        ":pjrt_executable",
+        "//xla:statusor",
+        "//xla/hlo/ir:hlo",
+        "//xla/service:compiler",
+        "@com_google_absl//absl/status",
+    ],
+)
+
 cc_library(
     name = "pjrt_stream_executor_client",
     srcs = ["pjrt_stream_executor_client.cc"],
@@ -383,6 +397,7 @@ cc_library(
         ":metrics",
         ":mlir_to_hlo",
         ":pjrt_client",
+        ":pjrt_executable",
         ":pjrt_future",
         ":tracked_device_buffer",
         ":transpose",
@@ -417,6 +432,7 @@ cc_library(
         "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/container:flat_hash_set",
         "@com_google_absl//absl/container:inlined_vector",
+        "@com_google_absl//absl/memory",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/strings:str_format",
         "@com_google_absl//absl/synchronization",
diff --git a/third_party/xla/xla/pjrt/gpu/BUILD b/third_party/xla/xla/pjrt/gpu/BUILD
index 931a48d4888d3a..12c50d4c918a80 100644
--- a/third_party/xla/xla/pjrt/gpu/BUILD
+++ b/third_party/xla/xla/pjrt/gpu/BUILD
@@ -45,15 +45,22 @@ cc_library(
         ":gpu_topology",
         "//xla:statusor",
         "//xla:util",
+        "//xla:xla_proto_cc",
         "//xla/client:client_library",
+        "//xla/client:local_client",
+        "//xla/pjrt:compile_options_proto_cc",
         "//xla/pjrt:mlir_to_hlo",
         "//xla/pjrt:pjrt_client",
         "//xla/pjrt:pjrt_compiler",
+        "//xla/pjrt:pjrt_executable",
         "//xla/pjrt:pjrt_stream_executor_client",
+        "//xla/pjrt:stream_executor_unloaded_executable",
         "//xla/pjrt:tracked_device_buffer",
         "//xla/pjrt:utils",
         "//xla/pjrt/distributed:client",
         "//xla/pjrt/distributed:topology_util",
+        "//xla/service:compiler",
+        "//xla/service:executable",
         "//xla/service:platform_util",
         "//xla/service/gpu:gpu_executable_run_options",
         "//xla/stream_executor:device_mem_allocator",
@@ -63,6 +70,7 @@ cc_library(
         "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/container:inlined_vector",
         "@com_google_absl//absl/log:check",
+        "@com_google_absl//absl/memory",
         "@com_google_absl//absl/status",
         "@com_google_absl//absl/status:statusor",
         "@com_google_absl//absl/strings",
@@ -82,9 +90,11 @@ cc_library(
         ":nccl_id_store_cuda",
         "@local_config_cuda//cuda:cuda_headers",
         "//xla/stream_executor/gpu:gpu_cudamallocasync_allocator",
+        "//xla/service/gpu:gpu_compiler",
     ]) + if_rocm([
         ":nccl_id_store_rocm",
         "@local_config_rocm//rocm:rocm_headers",
+        "//xla/service/gpu:gpu_compiler",
     ]),
 )
 
@@ -202,15 +212,44 @@ cc_library(
     name = "se_gpu_pjrt_compiler",
     srcs = ["se_gpu_pjrt_compiler.cc"],
     hdrs = ["se_gpu_pjrt_compiler.h"],
+    defines = if_cuda(["GOOGLE_CUDA=1"]) + if_rocm(["TENSORFLOW_USE_ROCM=1"]),
     visibility = ["//visibility:public"],
     deps = [
         ":se_gpu_pjrt_client",
         "//xla:status_macros",
+        "//xla/client:local_client",
+        "//xla/client:xla_computation",
+        "//xla/pjrt:mlir_to_hlo",
         "//xla/pjrt:pjrt_client",
         "//xla/pjrt:pjrt_compiler",
         "//xla/pjrt:pjrt_executable",
+        "//xla/pjrt:pjrt_stream_executor_client",
+        "//xla/pjrt:stream_executor_unloaded_executable",
+        "//xla/pjrt:utils",
+        "//xla/service:compiler",
+        "//xla/service:dump",
+        "//xla/service:hlo_module_util",
+        "//xla/service:hlo_proto_cc",
+        "//xla/service:hlo_proto_util",
+        "//xla/service:local_service",
+        "//xla/service/gpu:executable_proto_cc",
+        "//xla/service/gpu:gpu_target_config",
+        "//xla/stream_executor/cuda:cuda_platform_id",
         "@com_google_absl//absl/status",
-    ],
+        "@local_tsl//tsl/platform:errors",
+    ] + if_cuda([
+        ":nccl_id_store_cuda",
+        "@local_config_cuda//cuda:cuda_headers",
+        "//xla/stream_executor/cuda:cuda_activation_header",
+        "//xla/stream_executor/gpu:gpu_cudamallocasync_allocator",
+        "//xla/service/gpu:gpu_compiler",
+        "//xla/service/gpu:nvptx_compiler_impl",
+    ]) + if_rocm([
+        ":nccl_id_store_rocm",
+        "@local_config_rocm//rocm:rocm_headers",
+        "//xla/service/gpu:gpu_compiler",
+        "//xla/service/gpu:amdgpu_compiler_impl",
+    ]),
     alwayslink = True,
 )
 
@@ -223,10 +262,12 @@ xla_cc_test(
         ":se_gpu_pjrt_compiler",
         "//xla/client:xla_computation",
         "//xla/mlir_hlo",
+        "//xla/pjrt:pjrt_client",
         "//xla/service:gpu_plugin",
         "//xla/service:hlo_parser",
         "//xla/stream_executor/cuda:cublas_plugin",
         "//xla/tests:literal_test_util",
+        "@com_google_absl//absl/memory",
         "@com_google_absl//absl/status",
         "@llvm-project//mlir:FuncDialect",
         "@llvm-project//mlir:Parser",
@@ -234,3 +275,55 @@ xla_cc_test(
         "@local_tsl//tsl/platform:test_main",
     ],
 )
+
+xla_cc_test(
+    name = "se_gpu_pjrt_compiler_aot_test",
+    srcs = if_gpu_is_configured(["se_gpu_pjrt_compiler_aot_test.cc"]),
+    local_defines = if_cuda_is_configured(["GOOGLE_CUDA=1"]),
+    tags = [
+        "config-cuda-only",
+        "gpu",
+        "no_oss",
+        "no_tap",
+        "requires-gpu-nvidia",
+    ],
+    deps = [
+        ":se_gpu_pjrt_client",
+        ":se_gpu_pjrt_compiler",
+        "//tensorflow/core/platform:path",
+        "//third_party/protobuf",
+        "//xla:literal",
+        "//xla:literal_util",
+        "//xla/client:xla_computation",
+        "//xla/mlir_hlo",
+        "//xla/pjrt:pjrt_client",
+        "//xla/pjrt:pjrt_compiler",
+        "//xla/pjrt:pjrt_executable",
+        "//xla/service:gpu_plugin",
+        "//xla/service:hlo_parser",
+        "//xla/service/gpu:gpu_target_config",
+        "//xla/stream_executor/cuda:cublas_plugin",
+        "//xla/tests:literal_test_util",
+        "@com_google_absl//absl/memory",
+        "@com_google_absl//absl/status",
+        "@com_google_absl//absl/status:statusor",
+        "@com_google_absl//absl/strings:string_view",
+        "@com_google_googletest//:gtest",
+        "@llvm-project//mlir:FuncDialect",
+        "@llvm-project//mlir:IR",
+        "@llvm-project//mlir:Parser",
+        "@local_tsl//tsl/lib/core:status_test_util",
+        "@local_tsl//tsl/platform:casts",
+        "@local_tsl//tsl/platform:status_matchers",
+        "@local_tsl//tsl/platform:statusor",
+        "@local_tsl//tsl/platform:test_main",
+    ] + if_cuda([
+        ":nccl_id_store_cuda",
+        "@local_config_cuda//cuda:cuda_headers",
+        "//xla/stream_executor/cuda:cuda_activation_header",
+        "//xla/stream_executor/gpu:gpu_cudamallocasync_allocator",
+    ]) + if_rocm([
+        ":nccl_id_store_rocm",
+        "@local_config_rocm//rocm:rocm_headers",
+    ]),
+)
diff --git a/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_client.cc b/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_client.cc
index 567e4e00194975..6df22004640e1b 100644
--- a/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_client.cc
+++ b/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_client.cc
@@ -28,6 +28,7 @@ limitations under the License.
 #include "absl/container/flat_hash_map.h"
 #include "absl/container/inlined_vector.h"
 #include "absl/log/check.h"
+#include "absl/memory/memory.h"
 #include "absl/status/status.h"
 #include "absl/status/statusor.h"
 #include "absl/strings/ascii.h"
@@ -35,12 +36,17 @@ limitations under the License.
 #include "absl/strings/str_join.h"
 #include "absl/synchronization/blocking_counter.h"
 #include "absl/time/time.h"
+#include "xla/client/local_client.h"
 #include "xla/pjrt/distributed/topology_util.h"
 #include "xla/pjrt/pjrt_client.h"
 #include "xla/pjrt/pjrt_compiler.h"
+#include "xla/pjrt/pjrt_executable.h"
 #include "xla/pjrt/pjrt_stream_executor_client.h"
+#include "xla/pjrt/stream_executor_unloaded_executable.h"
 #include "xla/pjrt/tracked_device_buffer.h"
 #include "xla/pjrt/utils.h"
+#include "xla/service/compiler.h"
+#include "xla/service/executable.h"
 #include "xla/stream_executor/device_memory.h"
 #include "tsl/framework/allocator.h"
 #include "tsl/framework/bfc_allocator.h"
@@ -533,6 +539,45 @@ std::vector<std::unique_ptr<PjRtStreamExecutorDevice>> BuildLocalDevices(
   return devices;
 }
 
+// Loads an AOT-compiled StreamExecutorUnloadedExecutable onto this client.
+StatusOr<std::unique_ptr<PjRtLoadedExecutable>> StreamExecutorGpuClient::Load(
+    std::unique_ptr<PjRtExecutable> executable) {
+  auto se_executable =
+      absl::WrapUnique(tensorflow::down_cast<StreamExecutorUnloadedExecutable*>(
+          executable.release()));
+
+  CompileOptions compile_options = se_executable->compile_options();
+  TF_RETURN_IF_ERROR(compile_options.ApplyAllOptionOverrides());
+  TF_ASSIGN_OR_RETURN(ExecutableExtras extras,
+                      GetExecutableExtras(&compile_options));
+  TF_ASSIGN_OR_RETURN(
+      auto se_executor,
+      client()->backend().stream_executor(
+          compile_options.executable_build_options.device_ordinal()));
+
+  // Load one LocalExecutable per AOT compilation result.
+  std::vector<std::unique_ptr<LocalExecutable>> local_executables;
+  local_executables.reserve(se_executable->aot_executables().size());
+  for (std::unique_ptr<xla::AotCompilationResult>& aot_executable :
+       se_executable->aot_executables()) {
+    TF_ASSIGN_OR_RETURN(std::unique_ptr<Executable> loaded,
+                        aot_executable->LoadExecutable(
+                            client()->backend().compiler(), se_executor));
+    local_executables.push_back(std::make_unique<LocalExecutable>(
+        std::move(loaded), client()->local_service()->mutable_backend(),
+        compile_options.executable_build_options));
+  }
+  // Read before `compile_options` is moved into the executable below.
+  const bool tupled_args = compile_options.parameter_is_tupled_arguments;
+  auto ret = std::make_unique<PjRtStreamExecutorExecutable>(
+      std::move(local_executables), tupled_args,
+      std::move(extras.device_assignment), std::move(compile_options),
+      std::move(extras.addressable_device_logical_ids),
+      std::move(extras.addressable_devices), this);
+  TF_RETURN_IF_ERROR(ret->SetUpDonation(tupled_args));
+  return std::unique_ptr<PjRtLoadedExecutable>(std::move(ret));
+}
+
 namespace {
 
 #if defined(GOOGLE_CUDA) && CUDA_VERSION >= 11020
diff --git a/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_client.h b/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_client.h
index 00342d028df944..7b2f6da962daa7 100644
--- a/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_client.h
+++ b/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_client.h
@@ -195,6 +195,11 @@ class StreamExecutorGpuClient : public xla::PjRtStreamExecutorClient {
         tensorflow::down_cast<PjRtLoadedExecutable*>(executable.release()));
   }
 
+  // TODO(b/296466237): Unify `Load` method after (de)serialization and tests on
+  // existing use cases are done.
+  StatusOr<std::unique_ptr<PjRtLoadedExecutable>> Load(
+      std::unique_ptr<PjRtExecutable> executable);
+
  private:
   xla::StreamExecutorGpuTopologyDescription topology_;
 };
diff --git a/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_compiler.cc b/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_compiler.cc
index 795a5bd41bc9fe..2535b5d5308ada 100644
--- a/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_compiler.cc
+++ b/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_compiler.cc
@@ -16,12 +16,36 @@ limitations under the License.
 #include "xla/pjrt/gpu/se_gpu_pjrt_compiler.h"
 
 #include <memory>
+#include <optional>
 
 #include "absl/status/status.h"
+#include "xla/client/xla_computation.h"
 #include "xla/pjrt/gpu/se_gpu_pjrt_client.h"
 #include "xla/pjrt/pjrt_client.h"
 #include "xla/pjrt/pjrt_compiler.h"
+#include "xla/pjrt/pjrt_executable.h"
 #include "xla/status_macros.h"
+#include "tsl/platform/errors.h"
+
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
+#include "xla/client/local_client.h"
+#include "xla/pjrt/mlir_to_hlo.h"
+#include "xla/pjrt/stream_executor_unloaded_executable.h"
+#include "xla/pjrt/utils.h"
+#include "xla/service/dump.h"
+#include "xla/service/gpu/executable.pb.h"
+#include "xla/service/gpu/gpu_compiler.h"
+#include "xla/service/hlo_module_util.h"
+#include "xla/service/hlo_proto_util.h"
+#include "xla/service/local_service.h"
+#include "xla/stream_executor/cuda/cuda_platform_id.h"
+#endif
+
+#if GOOGLE_CUDA
+#include "xla/service/gpu/nvptx_compiler.h"
+#elif TENSORFLOW_USE_ROCM
+#include "xla/service/gpu/amdgpu_compiler.h"
+#endif
 
 namespace xla {
 namespace {
@@ -59,6 +83,67 @@ absl::Status IsValidTopologyAndClientForCompile(
   }
   return absl::OkStatus();
 }
+
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
+// Compiles `computation` ahead of time for `gpu_target_config` with no device
+// or client attached, returning a StreamExecutorUnloadedExecutable.
+absl::StatusOr<std::unique_ptr<PjRtExecutable>> AotCompile(
+    CompileOptions options, const XlaComputation& computation,
+    gpu::GpuTargetConfig& gpu_target_config) {
+  CompileOptions input_options = options;  // Pre-override copy for the result.
+  TF_RETURN_IF_ERROR(options.ApplyAllOptionOverrides());
+
+  std::vector<const Shape*> argument_layout_pointers;
+  TF_RETURN_IF_ERROR(DetermineArgumentLayoutsFromCompileOptions(
+      computation,
+      [](Shape shape) { return LayoutUtil::GetWithDefaultLayout(shape); },
+      options.argument_layouts, &options.executable_build_options,
+      &argument_layout_pointers));
+
+  // TODO(b/300657649): Call `UpdateBuildOptions` like in LocalClient::Compile.
+  // TODO(b/300657649): Get HloModuleConfig from `GetHloModuleConfig` like in
+  // LocalService::CompileExecutables.
+  TF_ASSIGN_OR_RETURN(ProgramShape shape, computation.GetProgramShape());
+  HloModuleConfig config(shape);
+  config.set_debug_options(DefaultDebugOptionsIgnoringFlags());
+
+  TF_ASSIGN_OR_RETURN(std::unique_ptr<HloModule> hlo_module,
+                      HloModule::CreateFromProto(computation.proto(), config));
+#if GOOGLE_CUDA
+  auto gpu_compiler = gpu::NVPTXCompiler();
+#elif TENSORFLOW_USE_ROCM
+  auto gpu_compiler = gpu::AMDGPUCompiler();
+#endif
+
+  UpdateEntryComputationLayout(hlo_module.get(), [&](const Shape& in_shape) {
+    return gpu_compiler.DefaultDeviceShapeRepresentation(in_shape);
+  });
+  DumpHloModuleIfEnabled(*hlo_module, kBeforeOptimizationsDumpName);
+
+  if (!options.executable_build_options.run_backend_only()) {
+    TF_ASSIGN_OR_RETURN(hlo_module,
+                        gpu_compiler.RunHloPassesWithoutDevice(
+                            std::move(hlo_module), Compiler::CompileOptions{},
+                            gpu_target_config, AutotuneResults()));
+  }
+
+  AotCompilationOptions aot_options(gpu_compiler.PlatformId());
+  aot_options.set_target_config(gpu_target_config);
+
+  const int num_replicas = hlo_module->config().replica_count();
+  const int num_partitions = hlo_module->config().num_partitions();
+  const std::string name = hlo_module->name();
+  auto unique_module_group =
+      std::make_unique<HloModuleGroup>(std::move(hlo_module));
+  TF_ASSIGN_OR_RETURN(
+      std::vector<std::unique_ptr<AotCompilationResult>> aot_results,
+      gpu_compiler.CompileAheadOfTime(std::move(unique_module_group),
+                                      aot_options));
+  return std::make_unique<StreamExecutorUnloadedExecutable>(
+      std::move(input_options), std::move(aot_results), num_replicas,
+      num_partitions, name);
+}
+#endif
 }  // namespace
 
 // TODO(b/285385306): Enable compilation on provided `topology`.
@@ -67,6 +152,15 @@ StreamExecutorGpuCompiler::Compile(CompileOptions options,
                                    const XlaComputation& computation,
                                    const PjRtTopologyDescription& topology,
                                    PjRtClient* client) {
+  if (client == nullptr && gpu_target_config_ != std::nullopt) {
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
+    return AotCompile(options, computation, *gpu_target_config_);
+#else   // Built without CUDA/ROCm: there is no GPU compiler to run.
+    return absl::InternalError("GPU AOT compilation requires the target to be "
+                               "built with CUDA or ROCm.");
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
+  }
+  // TODO(b/296466237): Remove client dependency.
   TF_RETURN_IF_ERROR(IsValidTopologyAndClientForCompile(topology, client));
   return client->Compile(computation, options);
 }
@@ -76,6 +170,20 @@ StreamExecutorGpuCompiler::Compile(CompileOptions options,
                                    mlir::ModuleOp module,
                                    const PjRtTopologyDescription& topology,
                                    PjRtClient* client) {
+  if (client == nullptr && gpu_target_config_ != std::nullopt) {
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
+    XlaComputation xla_computation;
+    TF_RETURN_IF_ERROR(MlirToXlaComputation(
+        module, xla_computation,
+        /*use_tuple_args=*/options.parameter_is_tupled_arguments,
+        /*return_tuple=*/false));
+    return AotCompile(options, xla_computation, *gpu_target_config_);
+#else   // Built without CUDA/ROCm: there is no GPU compiler to run.
+    return absl::InternalError("GPU AOT compilation requires the target to be "
+                               "built with CUDA or ROCm.");
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
+  }
+  // TODO(b/296466237): Remove client dependency.
   TF_RETURN_IF_ERROR(IsValidTopologyAndClientForCompile(topology, client));
   return client->Compile(module, options);
 }
diff --git a/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_compiler.h b/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_compiler.h
index 6d44d28b29442b..fe3a67eb5849ef 100644
--- a/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_compiler.h
+++ b/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_compiler.h
@@ -17,9 +17,13 @@ limitations under the License.
 #define XLA_PJRT_GPU_SE_GPU_PJRT_COMPILER_H_
 
 #include <memory>
+#include <optional>
 
+#include "absl/status/status.h"
 #include "xla/pjrt/pjrt_compiler.h"
 #include "xla/pjrt/pjrt_executable.h"
+#include "xla/service/compiler.h"
+#include "xla/service/gpu/gpu_target_config.h"
 
 namespace xla {
 // Implements the interfaces that are needed for the registered compiler.
@@ -27,6 +31,12 @@ namespace xla {
 // Compile() functions and ignores the `topology` parameter.
 class StreamExecutorGpuCompiler : public PjRtCompiler {
  public:
+  // With a `gpu_target_config`, `Compile` called with a null client compiles
+  // ahead of time for that target. With nullopt (the default), `Compile`
+  // always delegates to the passed-in client.
+  explicit StreamExecutorGpuCompiler(const std::optional<gpu::GpuTargetConfig>&
+                                         gpu_target_config = std::nullopt)
+      : gpu_target_config_(gpu_target_config) {}
   absl::StatusOr<std::unique_ptr<PjRtExecutable>> Compile(
       CompileOptions options, const XlaComputation& computation,
       const PjRtTopologyDescription& topology, PjRtClient* client) override;
@@ -34,6 +44,11 @@ class StreamExecutorGpuCompiler : public PjRtCompiler {
   absl::StatusOr<std::unique_ptr<PjRtExecutable>> Compile(
       CompileOptions options, mlir::ModuleOp module,
       const PjRtTopologyDescription& topology, PjRtClient* client) override;
+
+ private:
+  // GpuTargetConfig is used by GPU compiler for ahead-of-time (AOT) compilation
+  // without device.
+  std::optional<gpu::GpuTargetConfig> gpu_target_config_;
 };
 }  // namespace xla
 #endif  // XLA_PJRT_GPU_SE_GPU_PJRT_COMPILER_H_
diff --git a/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_compiler_aot_test.cc b/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_compiler_aot_test.cc
new file mode 100644
index 00000000000000..91c063300117da
--- /dev/null
+++ b/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_compiler_aot_test.cc
@@ -0,0 +1,167 @@
+/* Copyright 2023 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <memory>
+#include <utility>
+#include <vector>
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+#include "absl/memory/memory.h"
+#include "absl/status/status.h"
+#include "absl/status/statusor.h"
+#include "absl/strings/string_view.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"  // from @llvm-project
+#include "mlir/IR/BuiltinOps.h"  // from @llvm-project
+#include "mlir/IR/MLIRContext.h"  // from @llvm-project
+#include "mlir/Parser/Parser.h"  // from @llvm-project
+#include "third_party/protobuf/text_format.h"
+#include "xla/client/xla_computation.h"
+#include "xla/literal.h"
+#include "xla/literal_util.h"
+#include "xla/mlir_hlo/mhlo/IR/hlo_ops.h"
+#include "xla/pjrt/gpu/se_gpu_pjrt_client.h"
+#include "xla/pjrt/gpu/se_gpu_pjrt_compiler.h"
+#include "xla/pjrt/pjrt_client.h"
+#include "xla/pjrt/pjrt_executable.h"
+#include "xla/service/gpu/gpu_target_config.h"
+#include "xla/service/hlo_parser.h"
+#include "xla/tests/literal_test_util.h"
+#include "tsl/platform/casts.h"
+#include "tsl/platform/statusor.h"
+
+namespace xla {
+namespace {
+
+constexpr absl::string_view kProgram = R"(HloModule Computation
+
+ENTRY Computation() -> s32[] {
+  ROOT result = s32[] constant(2)
+})";
+
+constexpr absl::string_view mlir_str = R"mlir(
+  module {
+    func.func @main() -> tensor<i32> {
+      %0 = mhlo.constant dense<2> : tensor<i32>
+      return %0 : tensor<i32>
+    }
+  })mlir";
+
+constexpr absl::string_view kGpuTargetConfig =
+    R"pb(
+  gpu_device_info {
+    threads_per_block_limit: 1024
+    threads_per_warp: 32
+    shared_memory_per_block: 49152
+    shared_memory_per_core: 65536
+    threads_per_core_limit: 2048
+    core_count: 56
+    fpus_per_core: 64
+    block_dim_limit_x: 2147483647
+    block_dim_limit_y: 65535
+    block_dim_limit_z: 65535
+    memory_bandwidth: 732160000000
+    l2_cache_size: 4194304
+    clock_rate_ghz: 1.4805
+    device_memory_size: 17066622976
+    shared_memory_per_block_optin: 49152
+    cuda_compute_capability { major: 6 }
+  }
+  platform_name: "CUDA"
+  dnn_version_info {}
+  device_description_str: "sm_6.0 with 17071734784B RAM, 56 cores, 1480500KHz clock, 715000KHz mem clock, 4194304B L2$"
+    )pb";
+
+// Parses the HLO text in `program` and wraps its proto in an XlaComputation.
+absl::StatusOr<xla::XlaComputation> GetXlaComputation(
+    absl::string_view program) {
+  TF_ASSIGN_OR_RETURN(auto parsed_module,
+                      xla::ParseAndReturnUnverifiedModule(program, {}));
+  return XlaComputation(parsed_module->ToProto());
+}
+
+// Asserts `result` holds exactly one buffer whose literal equals scalar 2.
+void ValidateResult(
+    std::vector<std::vector<std::unique_ptr<PjRtBuffer>>>& result) {
+  ASSERT_EQ(result.size(), 1);
+  auto& buffers = result.front();
+  ASSERT_EQ(buffers.size(), 1);
+  TF_ASSERT_OK_AND_ASSIGN(std::shared_ptr<xla::Literal> literal,
+                          buffers.front()->ToLiteralSync());
+  EXPECT_TRUE(LiteralTestUtil::Equal(LiteralUtil::CreateR0(2), *literal));
+}
+
+// Builds a GpuTargetConfig from the `kGpuTargetConfig` text proto.
+absl::StatusOr<gpu::GpuTargetConfig> GetGpuTargetConfig() {
+  stream_executor::GpuTargetConfigProto proto;
+  if (!proto2::TextFormat::ParseFromString(kGpuTargetConfig, &proto)) {
+    return absl::InvalidArgumentError("Failed to parse GpuTargetConfigProto");
+  }
+  return gpu::GpuTargetConfig(proto);
+}
+
+// AOT-compiles the MLIR module with no client, then loads and runs it.
+TEST(StreamExecutorGpuCompilerTest, SuccessAotCompileMlirAndLoad) {
+  TF_ASSERT_OK_AND_ASSIGN(const gpu::GpuTargetConfig gpu_config,
+                          GetGpuTargetConfig());
+  StreamExecutorGpuCompiler compiler(gpu_config);
+
+  TF_ASSERT_OK_AND_ASSIGN(
+      auto client, GetStreamExecutorGpuClient(true, /*allocator_config=*/{},
+                                              /*node_id=*/0));
+  auto se_client = absl::WrapUnique(
+      tensorflow::down_cast<StreamExecutorGpuClient*>(client.release()));
+  mlir::MLIRContext context;
+  context.loadDialect<mlir::mhlo::MhloDialect, mlir::func::FuncDialect>();
+  auto mlir_module =
+      mlir::parseSourceString<mlir::ModuleOp>(mlir_str, &context);
+  TF_ASSERT_OK_AND_ASSIGN(auto topology, se_client->GetTopologyDescription());
+  TF_ASSERT_OK_AND_ASSIGN(
+      auto executable, compiler.Compile(xla::CompileOptions(),
+                                        mlir_module.get(), *topology, nullptr));
+  TF_ASSERT_OK_AND_ASSIGN(auto loaded_executable,
+                          se_client->Load(std::move(executable)));
+
+  TF_ASSERT_OK_AND_ASSIGN(
+      auto result, loaded_executable->Execute(/*argument_handles=*/{{}}, {}));
+  ValidateResult(result);
+}
+
+// AOT-compiles the HLO computation with no client, then loads and runs it.
+TEST(StreamExecutorGpuCompilerTest, SuccessAotCompileXlaAndLoad) {
+  TF_ASSERT_OK_AND_ASSIGN(const gpu::GpuTargetConfig gpu_config,
+                          GetGpuTargetConfig());
+  StreamExecutorGpuCompiler compiler(gpu_config);
+
+  TF_ASSERT_OK_AND_ASSIGN(
+      auto client, GetStreamExecutorGpuClient(true, /*allocator_config=*/{},
+                                              /*node_id=*/0));
+  auto se_client = absl::WrapUnique(
+      tensorflow::down_cast<StreamExecutorGpuClient*>(client.release()));
+
+  TF_ASSERT_OK_AND_ASSIGN(auto computation, GetXlaComputation(kProgram));
+  TF_ASSERT_OK_AND_ASSIGN(auto topology, se_client->GetTopologyDescription());
+  TF_ASSERT_OK_AND_ASSIGN(
+      auto executable,
+      compiler.Compile(xla::CompileOptions(), computation, *topology, nullptr));
+  TF_ASSERT_OK_AND_ASSIGN(auto loaded_executable,
+                          se_client->Load(std::move(executable)));
+  TF_ASSERT_OK_AND_ASSIGN(
+      auto result, loaded_executable->Execute(/*argument_handles=*/{{}}, {}));
+  ValidateResult(result);
+}
+
+}  // namespace
+}  // namespace xla
diff --git a/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_compiler_test.cc b/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_compiler_test.cc
index e238b1a1a8ad38..d4bc1beb3f74e4 100644
--- a/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_compiler_test.cc
+++ b/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_compiler_test.cc
@@ -24,6 +24,7 @@ limitations under the License.
 #include "xla/client/xla_computation.h"
 #include "xla/mlir_hlo/mhlo/IR/hlo_ops.h"
 #include "xla/pjrt/gpu/se_gpu_pjrt_client.h"
+#include "xla/pjrt/pjrt_client.h"
 #include "xla/service/hlo_parser.h"
 #include "xla/tests/literal_test_util.h"
 #include "tsl/platform/status_matchers.h"
diff --git a/third_party/xla/xla/pjrt/pjrt_stream_executor_client.cc b/third_party/xla/xla/pjrt/pjrt_stream_executor_client.cc
index c5075678754062..f9fa35debcb7b0 100644
--- a/third_party/xla/xla/pjrt/pjrt_stream_executor_client.cc
+++ b/third_party/xla/xla/pjrt/pjrt_stream_executor_client.cc
@@ -80,6 +80,7 @@ limitations under the License.
 
 #include "absl/algorithm/container.h"
 #include "absl/base/casts.h"
+#include "absl/container/flat_hash_map.h"
 #include "absl/container/flat_hash_set.h"
 #include "absl/container/inlined_vector.h"
 #include "absl/strings/match.h"
@@ -100,6 +101,7 @@ limitations under the License.
 #include "xla/pjrt/metrics.h"
 #include "xla/pjrt/mlir_to_hlo.h"
 #include "xla/pjrt/pjrt_client.h"
+#include "xla/pjrt/pjrt_executable.h"
 #include "xla/pjrt/pjrt_future.h"
 #include "xla/pjrt/tracked_device_buffer.h"
 #include "xla/pjrt/utils.h"
@@ -1550,7 +1552,6 @@ PjRtStreamExecutorBuffer::CopyToDeviceHelper(
                           dst_device, dst_local_device, transfer_stream,
                           /*is_uninitialized_create=*/false, client_));
 
-
   ScopedHold dst_device_buffer(py_buffer->GetBufferWithUsageHold());
   CHECK(dst_device_buffer.ok());
 
@@ -2150,15 +2151,15 @@ static SendDeviceMemoryFunction ConvertSendCallbacksToSendFunction(
     tsl::thread::ThreadPool* thread_pool) {
   // Check if we have callbacks registered for the given device ordinal.
   if (device_ordinal >= options.send_callbacks.size()) {
-    return [device_ordinal](
-               int64_t channel_id, se::Stream*, const Shape&,
-               const se::DeviceMemoryBase&,
-               const absl::flat_hash_map<std::string, std::string>&) {
-      return InvalidArgument(
-          "Failed to send a buffer to the channel_id=%d, there was no send "
-          "callbacks registered for the device_ordinal=%d",
-          channel_id, device_ordinal);
-    };
+    return
+        [device_ordinal](int64_t channel_id, se::Stream*, const Shape&,
+                         const se::DeviceMemoryBase&,
+                         const absl::flat_hash_map<std::string, std::string>&) {
+          return InvalidArgument(
+              "Failed to send a buffer to the channel_id=%d, there was no send "
+              "callbacks registered for the device_ordinal=%d",
+              channel_id, device_ordinal);
+        };
   }
 
   // SendCallbacks registered for a device ordinal. Can be empty.
@@ -2305,15 +2306,15 @@ static RecvDeviceMemoryFunction ConvertRecvCallbacksToRecvFunction(
     int device_ordinal, const ExecuteOptions& options) {
   // Check if we have callbacks registered for the given device ordinal.
   if (device_ordinal >= options.send_callbacks.size()) {
-    return [device_ordinal](
-               int64_t channel_id, se::Stream*, const Shape&,
-               se::DeviceMemoryBase*,
-               const absl::flat_hash_map<std::string, std::string>&) {
-      return InvalidArgument(
-          "Failed to receive a buffer from the channel_id=%d, there was no "
-          "recv callbacks registered for the device_ordinal=%d",
-          channel_id, device_ordinal);
-    };
+    return
+        [device_ordinal](int64_t channel_id, se::Stream*, const Shape&,
+                         se::DeviceMemoryBase*,
+                         const absl::flat_hash_map<std::string, std::string>&) {
+          return InvalidArgument(
+              "Failed to receive a buffer from the channel_id=%d, there was no "
+              "recv callbacks registered for the device_ordinal=%d",
+              channel_id, device_ordinal);
+        };
   }
 
   // RecvCallbacks registered for a device ordinal. Can be empty.
diff --git a/third_party/xla/xla/pjrt/pjrt_stream_executor_client.h b/third_party/xla/xla/pjrt/pjrt_stream_executor_client.h
index 27fcc20386598b..a8225d46f4d030 100644
--- a/third_party/xla/xla/pjrt/pjrt_stream_executor_client.h
+++ b/third_party/xla/xla/pjrt/pjrt_stream_executor_client.h
@@ -880,6 +880,7 @@ class PjRtStreamExecutorExecutable : public PjRtLoadedExecutable {
   friend class PjRtStreamExecutorClient;
   friend class PjRtTpuClient;
   friend class InternalPjRtTpuClient;
+  friend class StreamExecutorGpuClient;
   // Initializes information about which arguments to which executables must be
   // donated due to aliases that were specified by the computation.
   Status SetUpDonation(bool tuple_inputs);
diff --git a/third_party/xla/xla/pjrt/stream_executor_unloaded_executable.cc b/third_party/xla/xla/pjrt/stream_executor_unloaded_executable.cc
new file mode 100644
index 00000000000000..460737315f3343
--- /dev/null
+++ b/third_party/xla/xla/pjrt/stream_executor_unloaded_executable.cc
@@ -0,0 +1,31 @@
+/* Copyright 2023 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "xla/pjrt/stream_executor_unloaded_executable.h"
+
+#include <string>
+
+#include "absl/status/status.h"
+#include "xla/statusor.h"
+
+namespace xla {
+// TODO(b/296466237): Add serialization; for now this returns Unimplemented.
+StatusOr<std::string> StreamExecutorUnloadedExecutable::SerializeExecutable()
+    const {
+  return absl::UnimplementedError(
+      "StreamExecutorUnloadedExecutable::SerializeExecutable() not "
+      "implemented");
+}
+}  // namespace xla
diff --git a/third_party/xla/xla/pjrt/stream_executor_unloaded_executable.h b/third_party/xla/xla/pjrt/stream_executor_unloaded_executable.h
new file mode 100644
index 00000000000000..a44660e9579b68
--- /dev/null
+++ b/third_party/xla/xla/pjrt/stream_executor_unloaded_executable.h
@@ -0,0 +1,78 @@
+/* Copyright 2023 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef XLA_PJRT_STREAM_EXECUTOR_UNLOADED_EXECUTABLE_H_
+#define XLA_PJRT_STREAM_EXECUTOR_UNLOADED_EXECUTABLE_H_
+
+#include "absl/status/status.h"
+#include "xla/hlo/ir/hlo_module.h"
+#include "xla/pjrt/pjrt_executable.h"
+#include "xla/service/compiler.h"
+
+namespace xla {
+// TODO(b/300657649): Rename existing PjRtStreamExecutorExecutable to
+// PjRtStreamExecutorLoadedExecutable, and this one to
+// PjRtStreamExecutorExecutable.
+class StreamExecutorUnloadedExecutable : public PjRtExecutable {
+ public:
+  StreamExecutorUnloadedExecutable(
+      const CompileOptions& compile_options,
+      std::vector<std::unique_ptr<xla::AotCompilationResult>> executables,
+      int num_replicas, int num_partitions, absl::string_view name)
+      : compile_options_(compile_options),
+        aot_executables_(std::move(executables)),
+        num_replicas_(num_replicas),
+        num_partitions_(num_partitions),
+        name_(name) {}
+  // Currently always returns an Unimplemented error (see the .cc file).
+  StatusOr<std::string> SerializeExecutable() const override;
+  // Trivial accessors for the values captured at construction.
+  absl::string_view name() const override { return name_; }
+  int num_replicas() const override { return num_replicas_; }
+  int num_partitions() const override { return num_partitions_; }
+  absl::StatusOr<CompileOptions> GetCompileOptions() const override {
+    return compile_options_;
+  }
+  absl::StatusOr<std::vector<std::shared_ptr<HloModule>>> GetHloModules()
+      const override {
+    return absl::UnimplementedError("GetHloModules is not supported.");
+  }
+
+  absl::StatusOr<std::vector<std::vector<absl::string_view>>>
+  GetOutputMemoryKinds() const override {
+    return absl::UnimplementedError("GetOutputMemoryKinds is not supported.");
+  }
+  StatusOr<absl::flat_hash_map<std::string, PjRtValueType>> GetCostAnalysis()
+      const override {
+    return absl::UnimplementedError("GetCostAnalysis is not supported.");
+  }
+  // Always reports 0; no generated-code size is tracked by this class.
+  int64_t SizeOfGeneratedCodeInBytes() const override { return 0; }
+  // Non-override accessors; aot_executables() exposes the vector mutably.
+  const CompileOptions& compile_options() const { return compile_options_; }
+  std::vector<std::unique_ptr<xla::AotCompilationResult>>& aot_executables() {
+    return aot_executables_;
+  }
+  // NOTE(review): <memory>, <string>, <vector> are not included directly.
+ private:
+  CompileOptions compile_options_;
+  std::vector<std::unique_ptr<xla::AotCompilationResult>> aot_executables_;
+  int num_replicas_;
+  int num_partitions_;
+  std::string name_;
+};
+}  // namespace xla
+
+#endif  // XLA_PJRT_STREAM_EXECUTOR_UNLOADED_EXECUTABLE_H_
diff --git a/third_party/xla/xla/pjrt/stream_executor_unloaded_executable.proto b/third_party/xla/xla/pjrt/stream_executor_unloaded_executable.proto
new file mode 100644
index 00000000000000..28b3e21204c766
--- /dev/null
+++ b/third_party/xla/xla/pjrt/stream_executor_unloaded_executable.proto
@@ -0,0 +1,28 @@
+/* Copyright 2023 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+syntax = "proto3";
+
+package xla;
+
+import "xla/pjrt/compile_options.proto";
+
+message StreamExecutorGpuExecutableProto {
+  CompileOptionsProto compile_options = 1;
+  repeated bytes executables = 2;  // serialized AOT results? TODO confirm
+  int32 num_replicas = 3;
+  int32 num_partitions = 4;
+  string name = 5;
+}  // NOTE(review): name says Gpu, file says unloaded -- intentional?
diff --git a/third_party/xla/xla/service/gpu/BUILD b/third_party/xla/xla/service/gpu/BUILD
index 17eb3ead3fd0a4..faf60063e796dc 100644
--- a/third_party/xla/xla/service/gpu/BUILD
+++ b/third_party/xla/xla/service/gpu/BUILD
@@ -2559,6 +2559,19 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "gpu_target_config",  # GpuTargetConfig struct + proto conversion
+    srcs = ["gpu_target_config.cc"],
+    hdrs = ["gpu_target_config.h"],
+    compatible_with = get_compatible_with_portable(),
+    visibility = ["//visibility:public"],
+    deps = [
+        ":gpu_device_info",
+        "//xla/stream_executor:device_description_proto_cc_impl",
+        "//xla/stream_executor:dnn",
+    ],
+)
+
 cc_library(
     name = "gpu_compiler",
     srcs = if_gpu_is_configured([
@@ -2599,6 +2612,7 @@ cc_library(
         ":gpu_sanitize_constant_names",
         ":gpu_scatter_expander",
         ":gpu_shape_verifier",
+        ":gpu_target_config",
         ":hlo_fusion_stats",
         ":horizontal_input_fusion",
         ":horizontal_loop_fusion",
diff --git a/third_party/xla/xla/service/gpu/gpu_compiler.cc b/third_party/xla/xla/service/gpu/gpu_compiler.cc
index ec61161d3b5ef0..f025ef98cc0af9 100644
--- a/third_party/xla/xla/service/gpu/gpu_compiler.cc
+++ b/third_party/xla/xla/service/gpu/gpu_compiler.cc
@@ -257,21 +257,6 @@ GpuXlaRuntimeAotCompilationResult::LoadExecutable(
       gpu_compiler->GetGpuVersion(executor), executor);
 }
 
-GpuTargetConfig::GpuTargetConfig(const se::GpuTargetConfigProto& proto)
-    : gpu_device_info(proto.gpu_device_info()),
-      platform_name(proto.platform_name()),
-      dnn_version_info(proto.dnn_version_info()),
-      device_description_str(proto.device_description_str()) {}
-
-se::GpuTargetConfigProto GpuTargetConfig::ToProto() const {
-  se::GpuTargetConfigProto proto;
-  *proto.mutable_gpu_device_info() = gpu_device_info.ToProto();
-  proto.set_platform_name(platform_name);
-  *proto.mutable_dnn_version_info() = dnn_version_info.ToProto();
-  proto.set_device_description_str(device_description_str);
-  return proto;
-}
-
 GpuCompiler::GpuCompiler(se::Platform::Id platform_id,
                          const char* target_triple, const char* data_layout)
     : platform_id_(platform_id),
diff --git a/third_party/xla/xla/service/gpu/gpu_compiler.h b/third_party/xla/xla/service/gpu/gpu_compiler.h
index 2a34ee7b7f86e5..bef6f651119e62 100644
--- a/third_party/xla/xla/service/gpu/gpu_compiler.h
+++ b/third_party/xla/xla/service/gpu/gpu_compiler.h
@@ -31,6 +31,7 @@ limitations under the License.
 #include "xla/service/gpu/executable.pb.h"
 #include "xla/service/gpu/gpu_device_info.h"
 #include "xla/service/gpu/gpu_executable.h"
+#include "xla/service/gpu/gpu_target_config.h"
 #include "xla/service/hlo.pb.h"
 #include "xla/service/hlo_dataflow_analysis.h"
 #include "xla/service/hlo_pass_pipeline.h"
@@ -97,18 +98,6 @@ class GpuXlaRuntimeAotCompilationResult : public AotCompilationResult {
   XlaRuntimeGpuExecutableProto xla_runtime_gpu_executable_;
 };
 
-struct GpuTargetConfig {
-  GpuTargetConfig() = default;
-  explicit GpuTargetConfig(const stream_executor::GpuTargetConfigProto& proto);
-
-  se::GpuTargetConfigProto ToProto() const;
-
-  GpuDeviceInfo gpu_device_info;
-  std::string platform_name;
-  se::dnn::VersionInfo dnn_version_info;
-  std::string device_description_str;
-};
-
 // The GPU compiler generates efficient GPU executables.
 class GpuCompiler : public LLVMCompiler {
  public:
diff --git a/third_party/xla/xla/service/gpu/gpu_target_config.cc b/third_party/xla/xla/service/gpu/gpu_target_config.cc
new file mode 100644
index 00000000000000..44a5ceff8534f8
--- /dev/null
+++ b/third_party/xla/xla/service/gpu/gpu_target_config.cc
@@ -0,0 +1,38 @@
+/* Copyright 2023 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "xla/service/gpu/gpu_target_config.h"
+
+namespace xla {
+namespace gpu {
+// Initializes each member from the same-named field of `proto`.
+GpuTargetConfig::GpuTargetConfig(
+    const stream_executor::GpuTargetConfigProto& proto)
+    : gpu_device_info(proto.gpu_device_info()),
+      platform_name(proto.platform_name()),
+      dnn_version_info(proto.dnn_version_info()),
+      device_description_str(proto.device_description_str()) {}
+// Builds a GpuTargetConfigProto holding the same four fields.
+stream_executor::GpuTargetConfigProto GpuTargetConfig::ToProto() const {
+  stream_executor::GpuTargetConfigProto proto;
+  *proto.mutable_gpu_device_info() = gpu_device_info.ToProto();
+  proto.set_platform_name(platform_name);
+  *proto.mutable_dnn_version_info() = dnn_version_info.ToProto();
+  proto.set_device_description_str(device_description_str);
+  return proto;
+}
+
+}  // namespace gpu
+}  // namespace xla
diff --git a/third_party/xla/xla/service/gpu/gpu_target_config.h b/third_party/xla/xla/service/gpu/gpu_target_config.h
new file mode 100644
index 00000000000000..4c5e40f58fda83
--- /dev/null
+++ b/third_party/xla/xla/service/gpu/gpu_target_config.h
@@ -0,0 +1,41 @@
+/* Copyright 2023 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef XLA_SERVICE_GPU_GPU_TARGET_CONFIG_H_
+#define XLA_SERVICE_GPU_GPU_TARGET_CONFIG_H_
+
+#include "xla/service/gpu/gpu_device_info.h"
+#include "xla/stream_executor/device_description.pb.h"
+#include "xla/stream_executor/dnn.h"
+
+namespace xla {
+namespace gpu {
+
+struct GpuTargetConfig {
+  GpuTargetConfig() = default;
+  explicit GpuTargetConfig(const stream_executor::GpuTargetConfigProto& proto);
+  // Writes the four fields below into a GpuTargetConfigProto.
+  stream_executor::GpuTargetConfigProto ToProto() const;
+  // NOTE(review): std::string is used but <string> is not included here.
+  GpuDeviceInfo gpu_device_info;
+  std::string platform_name;
+  stream_executor::dnn::VersionInfo dnn_version_info;
+  std::string device_description_str;
+};
+
+}  // namespace gpu
+}  // namespace xla
+
+#endif  // XLA_SERVICE_GPU_GPU_TARGET_CONFIG_H_

From 2779ea10a44c93635e9f59f60cd320a293636440 Mon Sep 17 00:00:00 2001
From: Matthias Kramm <kramm@google.com>
Date: Fri, 22 Sep 2023 13:31:51 -0700
Subject: [PATCH 168/567] For embedding program key argument patching, allow
 ops with mini_batch_splits to be outside of a tf_device.launch.

PiperOrigin-RevId: 567704317
---
 .../tests/embedding_program_key.mlir          | 131 +++++++++++--
 .../transforms/embedding_program_key.cc       | 173 ++++++++++++------
 2 files changed, 232 insertions(+), 72 deletions(-)

diff --git a/tensorflow/compiler/mlir/tensorflow/tests/embedding_program_key.mlir b/tensorflow/compiler/mlir/tensorflow/tests/embedding_program_key.mlir
index 078c59ba67f335..6cf7cc91e73f4a 100644
--- a/tensorflow/compiler/mlir/tensorflow/tests/embedding_program_key.mlir
+++ b/tensorflow/compiler/mlir/tensorflow/tests/embedding_program_key.mlir
@@ -17,6 +17,8 @@ func.func @single_op_program_key() {
   return
 }
 
+// -----
+
 // CHECK-LABEL: func @multiple_ops_program_key
 func.func @multiple_ops_program_key() {
   // CHECK: %[[COMPILE_LAUNCH:[0-9]*]]:2 = "tf_device.launch"
@@ -36,6 +38,8 @@ func.func @multiple_ops_program_key() {
   return
 }
 
+// -----
+
 // CHECK-LABEL: func @reorder_single_op_program_key
 func.func @reorder_single_op_program_key() {
   // CHECK: %[[COMPILE_LAUNCH:[0-9]*]]:2 = "tf_device.launch"
@@ -53,6 +57,8 @@ func.func @reorder_single_op_program_key() {
   return
 }
 
+// -----
+
 // CHECK-LABEL: func @reorder_multiple_ops_program_key
 func.func @reorder_multiple_ops_program_key() {
   // CHECK: %[[COMPILE_LAUNCH:[0-9]*]]:2 = "tf_device.launch"
@@ -72,6 +78,8 @@ func.func @reorder_multiple_ops_program_key() {
   return
 }
 
+// -----
+
 // CHECK-LABEL: func @reorder_multiple_ops_with_successors_program_key
 func.func @reorder_multiple_ops_with_successors_program_key() {
   // CHECK: %[[COMPILE_LAUNCH:[0-9]*]]:2 = "tf_device.launch"
@@ -95,6 +103,8 @@ func.func @reorder_multiple_ops_with_successors_program_key() {
   return
 }
 
+// -----
+
 // CHECK-LABEL: func @launch_intermediate_usage
 func.func @launch_intermediate_usage() {
   // CHECK: %[[ORIG_LAUNCH:[0-9]*]]:2 = "tf_device.launch"
@@ -122,12 +132,29 @@ func.func @launch_intermediate_usage() {
   return
 }
 
-// CHECK-LABEL: func @preprocess_not_in_launch
-func.func @preprocess_not_in_launch() {
-  // CHECK: %[[COMPILE_LAUNCH:[0-9]*]]:2 = "tf_device.launch"
+// -----
+
+// CHECK-LABEL: func @compile_not_in_launch
+func.func @compile_not_in_launch() {
   // CHECK: TPUCompileMlir
   // CHECK: %[[CONSTANT:[a-z0-9]*]] = "tf.Const"
   // CHECK: "tf.OpA"(%[[CONSTANT]]
+   %compilation_status, %program = "tf._TPUCompileMlir"() { metadata = "...", mlir_module = "..." } : () -> (tensor<!tf_type.string>, tensor<3x!tf_type.string>)
+  "tf_device.launch"() ({
+    %cst_0 = "tf.Const"() {value = dense<""> : tensor<1x!tf_type.string>} : () -> tensor<1x!tf_type.string>
+    "tf.OpA"(%cst_0) { mini_batch_splits = ""} : (tensor<1x!tf_type.string>) -> ()
+    tf_device.return
+  }) {device = "/job:localhost/replica:0/task:0/device:CPU:0"} : () -> ()
+  return
+}
+
+// -----
+
+// CHECK-LABEL: func @preprocess_not_in_launch
+func.func @preprocess_not_in_launch() {
+  // CHECK: [[COMPILE_LAUNCH:%[0-9]*]]:2 = "tf_device.launch"
+  // CHECK: TPUCompileMlir
+  // CHECK: "tf.OpA"([[COMPILE_LAUNCH]]#1
   %0:2 = "tf_device.launch"() ({
     %compilation_status, %program = "tf._TPUCompileMlir"() { metadata = "...", mlir_module = "..." } : () -> (tensor<!tf_type.string>, tensor<3x!tf_type.string>)
     tf_device.return %compilation_status, %program : tensor<!tf_type.string>, tensor<3x!tf_type.string>
@@ -137,17 +164,99 @@ func.func @preprocess_not_in_launch() {
   return
 }
 
-// CHECK-LABEL: func @compile_not_in_launch
-func.func @compile_not_in_launch() {
+// -----
+
+// CHECK-LABEL: func @preprocess_not_in_launch_and_needs_moving
+func.func @preprocess_not_in_launch_and_needs_moving() {
+  // CHECK: [[COMPILE_LAUNCH:%[0-9]*]]:2 = "tf_device.launch"
   // CHECK: TPUCompileMlir
-  // CHECK: %[[CONSTANT:[a-z0-9]*]] = "tf.Const"
-  // CHECK: "tf.OpA"(%[[CONSTANT]]
-   %compilation_status, %program = "tf._TPUCompileMlir"() { metadata = "...", mlir_module = "..." } : () -> (tensor<!tf_type.string>, tensor<3x!tf_type.string>)
-  "tf_device.launch"() ({
+  // CHECK: "tf.OpA"([[COMPILE_LAUNCH]]#1
+  %cst_0 = "tf.Const"() {value = dense<""> : tensor<1x!tf_type.string>} : () -> tensor<1x!tf_type.string>
+  "tf.OpA"(%cst_0) { mini_batch_splits = ""} : (tensor<1x!tf_type.string>) -> ()
+  %0:2 = "tf_device.launch"() ({
+    %compilation_status, %program = "tf._TPUCompileMlir"() { metadata = "...", mlir_module = "..." } : () -> (tensor<!tf_type.string>, tensor<3x!tf_type.string>)
+    tf_device.return %compilation_status, %program : tensor<!tf_type.string>, tensor<3x!tf_type.string>
+  }) {device = "/job:localhost/replica:0/task:0/device:CPU:0"} : () -> (tensor<!tf_type.string>, tensor<3x!tf_type.string>)
+  return
+}
+
+// -----
+
+// CHECK-LABEL: func @only_compile_under_replicate
+func.func @only_compile_under_replicate() {
+  // CHECK-DAG: [[COMPILE_REPLICATE:%[0-9]*]]:4 = tf_device.replicate
+  // CHECK-NEXT: [[COMPILE_LAUNCH:%[0-9]*]]:2 = "tf_device.launch"
+  // CHECK-DAG: _TPUCompileMlir
+  // CHECK-DAG: %[[CONSTANT:[a-z0-9]*]] = "tf.Const"
+  // CHECK: "tf.OpA"
+  %0:4 = "tf_device.replicate"() ({
+    %0:2 = "tf_device.launch"() ({
+      %compilation_status, %program = "tf._TPUCompileMlir"() { metadata = "...", mlir_module = "..." } : () -> (tensor<!tf_type.string>, tensor<3x!tf_type.string>)
+      tf_device.return %compilation_status, %program : tensor<!tf_type.string>, tensor<3x!tf_type.string>
+    }) {device = "/job:localhost/replica:0/task:0/device:CPU:0"} : () -> (tensor<!tf_type.string>, tensor<3x!tf_type.string>)
+    tf_device.return %0#0, %0#1: tensor<!tf_type.string>, tensor<3x!tf_type.string>
+  }) {n = 2: i32, operandSegmentSizes = array<i32: 0, 0>} : () -> (
+      tensor<!tf_type.string>,
+      tensor<!tf_type.string>,
+      tensor<3x!tf_type.string>,
+      tensor<3x!tf_type.string>
+      )
+  %cst_0 = "tf.Const"() {value = dense<""> : tensor<1x!tf_type.string>} : () -> tensor<1x!tf_type.string>
+  "tf.OpA"(%cst_0) { mini_batch_splits = ""} : (tensor<1x!tf_type.string>) -> ()
+  return
+}
+
+// -----
+
+// CHECK-LABEL: func @compile_and_op_under_replicate
+func.func @compile_and_op_under_replicate() {
+  // CHECK-DAG: [[COMPILE_REPLICATE:%[0-9]*]]:4 = tf_device.replicate
+  // CHECK-NEXT: [[COMPILE_LAUNCH:%[0-9]*]]:2 = "tf_device.launch"
+  // CHECK-DAG: _TPUCompileMlir
+  // CHECK-DAG: %[[CONSTANT:[a-z0-9]*]] = "tf.Const"
+  // CHECK: "tf.OpA"([[COMPILE_LAUNCH]]#1
+  %0:4 = "tf_device.replicate"() ({
+    %0:2 = "tf_device.launch"() ({
+      %compilation_status, %program = "tf._TPUCompileMlir"() { metadata = "...", mlir_module = "..." } : () -> (tensor<!tf_type.string>, tensor<3x!tf_type.string>)
+      tf_device.return %compilation_status, %program : tensor<!tf_type.string>, tensor<3x!tf_type.string>
+    }) {device = "/job:localhost/replica:0/task:0/device:CPU:0"} : () -> (tensor<!tf_type.string>, tensor<3x!tf_type.string>)
     %cst_0 = "tf.Const"() {value = dense<""> : tensor<1x!tf_type.string>} : () -> tensor<1x!tf_type.string>
     "tf.OpA"(%cst_0) { mini_batch_splits = ""} : (tensor<1x!tf_type.string>) -> ()
-    tf_device.return
-  }) {device = "/job:localhost/replica:0/task:0/device:CPU:0"} : () -> ()
+    tf_device.return %0#0, %0#1: tensor<!tf_type.string>, tensor<3x!tf_type.string>
+  }) {n = 2: i32, operandSegmentSizes = array<i32: 0, 0>} : () -> (
+      tensor<!tf_type.string>,
+      tensor<!tf_type.string>,
+      tensor<3x!tf_type.string>,
+      tensor<3x!tf_type.string>
+      )
   return
 }
 
+// -----
+
+// CHECK-LABEL: func @compile_and_op_under_replicate_and_launch
+func.func @compile_and_op_under_replicate_and_launch() {
+  // CHECK-DAG: [[COMPILE_REPLICATE:%[0-9]*]]:4 = tf_device.replicate
+  // CHECK-NEXT: [[COMPILE_LAUNCH:%[0-9]*]]:2 = "tf_device.launch"
+  // CHECK-DAG: _TPUCompileMlir
+  // CHECK-DAG: %[[CONSTANT:[a-z0-9]*]] = "tf.Const"
+  // CHECK: "tf.OpA"([[COMPILE_LAUNCH]]#1
+  %0:4 = "tf_device.replicate"() ({
+    %0:2 = "tf_device.launch"() ({
+      %compilation_status, %program = "tf._TPUCompileMlir"() { metadata = "...", mlir_module = "..." } : () -> (tensor<!tf_type.string>, tensor<3x!tf_type.string>)
+      tf_device.return %compilation_status, %program : tensor<!tf_type.string>, tensor<3x!tf_type.string>
+    }) {device = "/job:localhost/replica:0/task:0/device:CPU:0"} : () -> (tensor<!tf_type.string>, tensor<3x!tf_type.string>)
+    "tf_device.launch"() ({
+      %cst_0 = "tf.Const"() {value = dense<""> : tensor<1x!tf_type.string>} : () -> tensor<1x!tf_type.string>
+      "tf.OpA"(%cst_0) { mini_batch_splits = ""} : (tensor<1x!tf_type.string>) -> ()
+      tf_device.return
+    }) {device = "/job:localhost/replica:0/task:0/device:CPU:0"} : () -> ()
+    tf_device.return %0#0, %0#1: tensor<!tf_type.string>, tensor<3x!tf_type.string>
+  }) {n = 2: i32, operandSegmentSizes = array<i32: 0, 0>} : () -> (
+      tensor<!tf_type.string>,
+      tensor<!tf_type.string>,
+      tensor<3x!tf_type.string>,
+      tensor<3x!tf_type.string>
+      )
+  return
+}
diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/embedding_program_key.cc b/tensorflow/compiler/mlir/tensorflow/transforms/embedding_program_key.cc
index 829a5f6080a9c6..73bc4d13f028c9 100644
--- a/tensorflow/compiler/mlir/tensorflow/transforms/embedding_program_key.cc
+++ b/tensorflow/compiler/mlir/tensorflow/transforms/embedding_program_key.cc
@@ -15,21 +15,23 @@ limitations under the License.
 
 #include <memory>
 #include <queue>
-#include <vector>
 
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/Support/Casting.h"
 #include "mlir/Dialect/Func/IR/FuncOps.h"  // from @llvm-project
 #include "mlir/IR/Attributes.h"  // from @llvm-project
 #include "mlir/IR/Dialect.h"  // from @llvm-project
 #include "mlir/IR/Operation.h"  // from @llvm-project
+#include "mlir/IR/Value.h"  // from @llvm-project
 #include "mlir/IR/Visitors.h"  // from @llvm-project
 #include "mlir/Pass/Pass.h"  // from @llvm-project
+#include "mlir/Support/LLVM.h"  // from @llvm-project
 #include "mlir/Support/LogicalResult.h"  // from @llvm-project
 #include "tensorflow/compiler/mlir/tensorflow/ir/tf_device.h"
-#include "tensorflow/compiler/mlir/tensorflow/ir/tf_executor.h"
 #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h"
 #include "tensorflow/compiler/mlir/tensorflow/transforms/passes.h"
-#include "tensorflow/compiler/mlir/tensorflow/translate/split_into_island_per_op_pass.h"
-#include "tensorflow/compiler/mlir/tensorflow/utils/attribute_utils.h"
 
 namespace mlir {
 namespace TFDevice {
@@ -60,22 +62,63 @@ bool OpInBlock(Operation* op, Block* block) {
   return false;
 }
 
-// Checks if there is a precedecessor TPUCOmpileMlir op of `preprocess_op` in
-// `func_op`.  Assumes both ops are wrapped in a tf_device.launch.
-bool HasTPUCompilePredecessor(func::FuncOp func_op, Operation* preprocess_op) {
-  bool has_tpu_compile_predecessor = false;
-  func_op.walk([&](TF::_TPUCompileMlirOp compile_op) {
-    if (compile_op->getParentOp()->getBlock() ==
-            preprocess_op->getParentOp()->getBlock() &&
-        compile_op->getParentOp()->isBeforeInBlock(
-            preprocess_op->getParentOp())) {
-      has_tpu_compile_predecessor = true;
+// Returns a _TPUCompileMlirOp under `func_op` whose tf_device.launch follows
+// `preprocess_op` (or its enclosing launch) in the same block, else nullptr.
+Operation* FindCompileSuccessor(Operation* func_op, Operation* preprocess_op) {
+  bool in_launch = isa<tf_device::LaunchOp>(preprocess_op->getParentOp());
+  Operation* preprocess_or_launch =
+      in_launch ? preprocess_op->getParentOp() : preprocess_op;
+  // First matching compile op wins; ones not directly in a launch are skipped.
+  Operation* tpu_compile_successor = nullptr;
+  func_op->walk([&](TF::_TPUCompileMlirOp compile_op) {
+    if (compile_op->getParentOp() == nullptr ||
+        !isa<tf_device::LaunchOp>(compile_op->getParentOp()))
+      return WalkResult::advance();
+    Operation* compile_launch_op = compile_op->getParentOp();
+
+    if (compile_launch_op->getBlock() == preprocess_or_launch->getBlock() &&
+        preprocess_or_launch->isBeforeInBlock(compile_launch_op)) {
+      tpu_compile_successor = compile_op;
+      return WalkResult::interrupt();
+    }
+    return WalkResult::advance();
+  });
+  return tpu_compile_successor;
+}
 
-  return has_tpu_compile_predecessor;
+// Returns a _TPUCompileMlirOp under `func_op` whose tf_device.launch (or the
+// tf_device.replicate wrapping that launch) precedes `preprocess_op` (or its
+// enclosing launch) in the same block. Returns nullptr if there is none.
+Operation* FindCompilePredecessor(Operation* func_op,
+                                  Operation* preprocess_op) {
+  bool in_launch = isa<tf_device::LaunchOp>(preprocess_op->getParentOp());
+  Operation* preprocess_or_launch =
+      in_launch ? preprocess_op->getParentOp() : preprocess_op;
+
+  Operation* tpu_compile_predecessor = nullptr;
+  func_op->walk([&](TF::_TPUCompileMlirOp compile_op) {
+    if (compile_op->getParentOp() == nullptr ||
+        !isa<tf_device::LaunchOp>(compile_op->getParentOp()))
+      return WalkResult::advance();
+    Operation* compile_launch_op = compile_op->getParentOp();
+    if (compile_launch_op->getBlock() == preprocess_or_launch->getBlock() &&
+        compile_launch_op->isBeforeInBlock(preprocess_or_launch)) {
+      tpu_compile_predecessor = compile_op;
+      return WalkResult::interrupt();
+    }
+    // The launch op might be underneath a replicate op. If the preprocess_op is
+    // in the same block as said replicate, that's OK, too.
+    if (auto replicate_op = llvm::dyn_cast_or_null<tf_device::ReplicateOp>(
+            compile_launch_op->getParentOp())) {
+      if (replicate_op->getBlock() == preprocess_or_launch->getBlock() &&
+          replicate_op->isBeforeInBlock(preprocess_or_launch)) {
+        tpu_compile_predecessor = compile_op;
+        return WalkResult::interrupt();
+      }
+    }
+    return WalkResult::advance();
+  });
+  return tpu_compile_predecessor;
 }
 
 // Get all of the successor ops of `root_op` in the same block.
@@ -233,11 +276,12 @@ void CreateReducedLaunchOp(OpBuilder* builder, Block* old_block,
 // ops before `preprocess_op`.  This actually creates a new launch op after
 // _TPUCompileMlir and moves `preprocess_op` and its successors that are input
 // to TPUExecute to it.
-LogicalResult MovePreprocessingOps(OpBuilder* builder, func::FuncOp func_op,
-                                   Operation* preprocess_op) {
+LogicalResult MovePreprocessingOpInLaunch(OpBuilder* builder,
+                                          func::FuncOp func_op,
+                                          Operation* preprocess_op) {
   // If this is already a TPUCompile predecessor, no need to move the
   // preprocessing ops.
-  if (HasTPUCompilePredecessor(func_op, preprocess_op)) return success();
+  if (FindCompilePredecessor(func_op, preprocess_op)) return success();
 
   auto original_launch_op =
       llvm::dyn_cast<tf_device::LaunchOp>(preprocess_op->getParentOp());
@@ -252,19 +296,8 @@ LogicalResult MovePreprocessingOps(OpBuilder* builder, func::FuncOp func_op,
   }
 
   // Find the TPUCompile successor.
-  Operation* tpu_compile_successor = nullptr;
-  func_op.walk([&](TF::_TPUCompileMlirOp compile_op) {
-    if (compile_op->getParentOp() == nullptr ||
-        !isa<tf_device::LaunchOp>(compile_op->getParentOp()))
-      return WalkResult::advance();
-    if (compile_op->getParentOp()->getBlock() ==
-            original_launch_op->getBlock() &&
-        original_launch_op->isBeforeInBlock(compile_op->getParentOp())) {
-      tpu_compile_successor = compile_op;
-      return WalkResult::interrupt();
-    }
-    return WalkResult::advance();
-  });
+  Operation* tpu_compile_successor =
+      FindCompileSuccessor(func_op, preprocess_op);
 
   // Return early if can't find TPUCompile successor.
   if (tpu_compile_successor == nullptr) return success();
@@ -287,35 +320,51 @@ LogicalResult MovePreprocessingOps(OpBuilder* builder, func::FuncOp func_op,
   return success();
 }
 
+LogicalResult MoveStandalonePreprocessingOp(OpBuilder* builder,
+                                            func::FuncOp func_op,
+                                            Operation* preprocess_op) {
+  if (FindCompilePredecessor(func_op, preprocess_op)) return success();
+
+  // Find the TPUCompile successor we want to move upwards. We're moving the
+  // compile, not the preprocess_op, since it's easier to move (because it
+  // doesn't typically have any dependencies).
+  Operation* tpu_compile_successor =
+      FindCompileSuccessor(func_op, preprocess_op);
+  if (tpu_compile_successor == nullptr) return success();
+
+  Operation* compile_launch_op = tpu_compile_successor->getParentOp();
+
+  // If the launch isn't in the same block as the preprocess op, abort.
+  if (compile_launch_op->getBlock() != preprocess_op->getBlock())
+    return success();
+
+  // Move the compile op launch right before our op.
+  compile_launch_op->moveBefore(preprocess_op);
+
+  return success();
+}
+
 // Rewrites the program_key input of `preprocess_op` to use the output of
 // _TPUCompileMlir.
 void RewritePreprocessInputs(OpBuilder* builder, func::FuncOp func_op,
                              Operation* preprocess_op) {
-  if (preprocess_op->getParentOp() == nullptr ||
-      !isa<tf_device::LaunchOp>(preprocess_op->getParentOp()))
-    return;
+  if (preprocess_op->getParentOp() == nullptr) return;
 
   // Find predecessor TPUCompile Op and rewrite the program key.
-  func_op.walk([&](TF::_TPUCompileMlirOp compile_op) {
-    if (compile_op->getParentOp() == nullptr ||
-        !isa<tf_device::LaunchOp>(compile_op->getParentOp()))
-      return WalkResult::advance();
-    if (compile_op->getParentOp()->getBlock() ==
-            preprocess_op->getParentOp()->getBlock() &&
-        compile_op->getParentOp()->isBeforeInBlock(
-            preprocess_op->getParentOp())) {
-      // Find the corresponding result of the _TPUCompileMlirOp in launch return
-      // op.
-      for (OpOperand& operand : compile_op->getResult(1).getUses()) {
-        if (llvm::isa<tf_device::ReturnOp>(operand.getOwner())) {
-          preprocess_op->setOperand(0, compile_op->getParentOp()->getResult(
-                                           operand.getOperandNumber()));
-        }
-      }
-      return WalkResult::interrupt();
+  Operation* tpu_compile_predecessor =
+      FindCompilePredecessor(func_op, preprocess_op);
+  if (tpu_compile_predecessor == nullptr) return;
+
+  for (OpOperand& operand : tpu_compile_predecessor->getResult(1).getUses()) {
+    if (llvm::isa<tf_device::ReturnOp>(operand.getOwner()) &&
+        tpu_compile_predecessor->getParentOp()
+            ->getBlock()
+            ->findAncestorOpInBlock(*preprocess_op)) {
+      preprocess_op->setOperand(
+          0, tpu_compile_predecessor->getParentOp()->getResult(
+                 operand.getOperandNumber()));
     }
-    return WalkResult::advance();
-  });
+  }
 }
 
 void EmbeddingProgramKeyPass::runOnOperation() {
@@ -325,18 +374,20 @@ void EmbeddingProgramKeyPass::runOnOperation() {
   // Handle ops with mini_batch_splits attribute first since all preprocessing
   // ops may need to be moved.
   getOperation().walk([&](Operation* op) {
-    if (op->hasAttr(kMiniBatchSplitsAttr) &&
-        isa<tf_device::LaunchOp>(op->getParentOp())) {
-      preprocess_ops.push_back(op);
-    }
+    if (op->hasAttr(kMiniBatchSplitsAttr)) preprocess_ops.push_back(op);
   });
 
   OpBuilder builder(&getContext());
 
-  for (Operation* preprocess_op : preprocess_ops) {
-    if (failed(MovePreprocessingOps(&builder, getOperation(), preprocess_op)))
-      return signalPassFailure();
-    RewritePreprocessInputs(&builder, getOperation(), preprocess_op);
+  for (Operation* op : preprocess_ops) {
+    if (isa<tf_device::LaunchOp>(op->getParentOp())) {
+      if (failed(MovePreprocessingOpInLaunch(&builder, getOperation(), op)))
+        return signalPassFailure();
+    } else {
+      if (failed(MoveStandalonePreprocessingOp(&builder, getOperation(), op)))
+        return signalPassFailure();
+    }
+    RewritePreprocessInputs(&builder, getOperation(), op);
   }
 
   // Handle ops with mini_batch_in_csr attribute.

From c273d635119a8b2082d6ace0bbcb0c5d614c84d2 Mon Sep 17 00:00:00 2001
From: Logan Chien <loganchien@google.com>
Date: Fri, 22 Sep 2023 13:38:04 -0700
Subject: [PATCH 169/567] Fix PadOp lowering with negative padding amount

PiperOrigin-RevId: 567705787
---
 .../lite/stablehlo/tests/legalize_hlo.mlir    | 19 +++++
 .../lite/stablehlo/transforms/legalize_hlo.cc | 75 +++++++++++++++----
 .../transforms/legalize_hlo_patterns.td       |  5 +-
 3 files changed, 84 insertions(+), 15 deletions(-)

diff --git a/tensorflow/compiler/mlir/lite/stablehlo/tests/legalize_hlo.mlir b/tensorflow/compiler/mlir/lite/stablehlo/tests/legalize_hlo.mlir
index 6d95e62c79a452..ddc6848686f5a6 100644
--- a/tensorflow/compiler/mlir/lite/stablehlo/tests/legalize_hlo.mlir
+++ b/tensorflow/compiler/mlir/lite/stablehlo/tests/legalize_hlo.mlir
@@ -3113,6 +3113,25 @@ func.func @convert_pad(%arg0: tensor<8x128xf32>, %arg1: tensor<f32>) -> tensor<1
   func.return %0 : tensor<11x131xf32>
 }
 
+// CHECK-LABEL:   func @convert_pad_negative_amount(
+// CHECK-SAME:                      %[[VAL_0:.*]]: tensor<8x128xf32>,
+// CHECK-SAME:                      %[[VAL_1:.*]]: tensor<f32>) -> tensor<7x128xf32> {
+// CHECK:           %[[VAL_2:.*]] = arith.constant dense<{{\[\[}}0, 0], [0, 1]]> : tensor<2x2xi64>
+// CHECK:           %[[VAL_3:.*]] = "tf.PadV2"(%[[VAL_0]], %[[VAL_2]], %[[VAL_1]]) : (tensor<8x128xf32>, tensor<2x2xi64>, tensor<f32>) -> tensor<8x129xf32>
+// CHECK:           %[[VAL_4:.*]] = arith.constant dense<[0, 1]> : tensor<2xi64>
+// CHECK:           %[[VAL_5:.*]] = arith.constant dense<[7, 128]> : tensor<2xi64>
+// CHECK:           %[[VAL_6:.*]] = "tf.Slice"(%[[VAL_3]], %[[VAL_4]], %[[VAL_5]]) : (tensor<8x129xf32>, tensor<2xi64>, tensor<2xi64>) -> tensor<7x128xf32>
+// CHECK:           return %[[VAL_6]] : tensor<7x128xf32>
+// CHECK:         }
+func.func @convert_pad_negative_amount(%arg0: tensor<8x128xf32>, %arg1: tensor<f32>) -> tensor<7x128xf32> {
+  %0 = "mhlo.pad"(%arg0, %arg1) {
+    edge_padding_low = dense<[0, -1]> : tensor<2xi64>,
+    edge_padding_high = dense<[-1, 1]> : tensor<2xi64>,
+    interior_padding = dense<0> : tensor<2xi64>
+  } : (tensor<8x128xf32>, tensor<f32>) -> tensor<7x128xf32>
+  func.return %0 : tensor<7x128xf32>
+}
+
 // CHECK-LABEL:   func @convert_round(
 // CHECK-SAME:                        %[[VAL_0:.*]]: tensor<8x128xbf16>) -> tensor<8x128xbf16>
 // CHECK:           %[[VAL_1:.*]] = "tf.Round"(%[[VAL_0]]) : (tensor<8x128xbf16>) -> tensor<8x128xbf16>
diff --git a/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo.cc b/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo.cc
index d424a1e9c3137b..0c1e157af27d3d 100644
--- a/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo.cc
+++ b/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo.cc
@@ -3344,20 +3344,69 @@ Value ConvertPadOp(PatternRewriter& rewriter, Operation* old_op) {
   auto pad_op = cast<mhlo::PadOp>(old_op);
   mlir::Location loc = pad_op.getLoc();
 
-  llvm::SmallVector<APInt, 8> padding;
-  for (auto p : llvm::zip(pad_op.getEdgePaddingLow().getValues<APInt>(),
+  // Calculates non-negative padding amount and slice begins/sizes.
+  llvm::SmallVector<int64_t, 8> padding;
+  llvm::SmallVector<int64_t, 8> pad_output_shape;
+
+  bool has_negative_padding_amount = false;
+  llvm::SmallVector<int64_t, 8> slice_begins;
+  llvm::SmallVector<int64_t, 8> slice_sizes;
+
+  for (auto p : llvm::zip(pad_op.getOperand().getType().getShape(),
+                          pad_op.getEdgePaddingLow().getValues<APInt>(),
                           pad_op.getEdgePaddingHigh().getValues<APInt>())) {
-    padding.push_back(std::get<0>(p));
-    padding.push_back(std::get<1>(p));
-  }
-  auto attr_type = RankedTensorType::get({pad_op.getEdgePaddingLow().size(), 2},
-                                         rewriter.getI64Type());
-  auto padding_attr = DenseIntElementsAttr::get(attr_type, padding);
-  auto padding_op =
-      rewriter.create<arith::ConstantOp>(loc, attr_type, padding_attr);
-  return rewriter.create<TF::PadV2Op>(loc, pad_op.getType(),
-                                      pad_op.getOperand(), padding_op,
-                                      pad_op.getPaddingValue());
+    const int64_t input_dim_size = std::get<0>(p);
+    int64_t pad_output_dim_size = input_dim_size;
+
+    const int64_t pad_low = std::get<1>(p).getSExtValue();
+    if (pad_low < 0) {
+      has_negative_padding_amount = true;
+      padding.push_back(0);
+    } else {
+      padding.push_back(pad_low);
+      pad_output_dim_size += pad_low;
+    }
+
+    const int64_t pad_high = std::get<2>(p).getSExtValue();
+    if (pad_high < 0) {
+      has_negative_padding_amount = true;
+      padding.push_back(0);
+    } else {
+      padding.push_back(pad_high);
+      pad_output_dim_size += pad_high;
+    }
+
+    pad_output_shape.push_back(pad_output_dim_size);
+
+    slice_begins.push_back(pad_low < 0 ? -pad_low : 0);
+    slice_sizes.push_back(input_dim_size + pad_low + pad_high);
+  }
+
+  // Convert to PadV2.
+  auto padding_attr_type = RankedTensorType::get(
+      {pad_op.getEdgePaddingLow().size(), 2}, rewriter.getI64Type());
+  auto padding_attr = DenseIntElementsAttr::get(padding_attr_type, padding);
+  auto padding_amount_const_op =
+      rewriter.create<arith::ConstantOp>(loc, padding_attr_type, padding_attr);
+  auto new_pad_op = rewriter.create<TF::PadV2Op>(
+      loc, pad_op.getType().clone(pad_output_shape), pad_op.getOperand(),
+      padding_amount_const_op, pad_op.getPaddingValue());
+  if (!has_negative_padding_amount) {
+    return new_pad_op;
+  }
+
+  // Convert negative padding amount into slice.
+  auto slice_attr_type = RankedTensorType::get(
+      {pad_op.getEdgePaddingLow().size()}, rewriter.getI64Type());
+  auto slice_begins_const_op = rewriter.create<arith::ConstantOp>(
+      loc, slice_attr_type,
+      DenseIntElementsAttr::get(slice_attr_type, slice_begins));
+  auto slice_sizes_const_op = rewriter.create<arith::ConstantOp>(
+      loc, slice_attr_type,
+      DenseIntElementsAttr::get(slice_attr_type, slice_sizes));
+  return rewriter.create<TF::SliceOp>(loc, pad_op.getType(), new_pad_op,
+                                      slice_begins_const_op,
+                                      slice_sizes_const_op);
 }
 
 class ConvertPopulationCountOp
diff --git a/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_patterns.td b/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_patterns.td
index 5030777490c5d8..3ba9d1911ffab6 100644
--- a/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_patterns.td
+++ b/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_patterns.td
@@ -309,8 +309,9 @@ def IsZero : Constraint<CPred<
   "$0.isSplat() && $0.getSplatValue<APInt>() == 0">>;
 def ConvertPadOp : NativeCodeCall<
   "ConvertPadOp($_builder, $0.getDefiningOp())">;
-def : Pat<(MHLO_PadOp:$old_value $input, $pad_value, $pad_low, $pad_high,
-               $pad_interior),
+def : Pat<(MHLO_PadOp:$old_value
+               StaticShapeTensorOf<[TF_ElementType]>:$input,
+               $pad_value, $pad_low, $pad_high, $pad_interior),
           (ConvertPadOp $old_value),
           [(IsZero $pad_interior)]>;
 

From af243fde7e770badda114281532a3cbd372becee Mon Sep 17 00:00:00 2001
From: Derek Murray <mrry@google.com>
Date: Fri, 22 Sep 2023 13:53:50 -0700
Subject: [PATCH 170/567] Remove unused `std::to_string()` in
 `GetMinibatchSplitsWithPhysicalReplicas` kernel.

PiperOrigin-RevId: 567709583
---
 tensorflow/core/tpu/kernels/sparse_core_preprocess_ops.cc | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tensorflow/core/tpu/kernels/sparse_core_preprocess_ops.cc b/tensorflow/core/tpu/kernels/sparse_core_preprocess_ops.cc
index d4cd8035fdc7dd..38df331827907a 100644
--- a/tensorflow/core/tpu/kernels/sparse_core_preprocess_ops.cc
+++ b/tensorflow/core/tpu/kernels/sparse_core_preprocess_ops.cc
@@ -878,7 +878,6 @@ void GetMinibatchSplitsWithPhysicalReplicaOp::Compute(OpKernelContext* ctx) {
         continue;
       }
       int32_t col_id = item >> 32;
-      std::string col_id_str = std::to_string(col_id);
       int32_t replica_id = col_id % num_physical_replica;
       int32_t bucket_id;
       int32_t main_index;

From 4be8c8d2d20c7297fd3fc6a036a21098b830c9bc Mon Sep 17 00:00:00 2001
From: Francois Chollet <fchollet@google.com>
Date: Fri, 22 Sep 2023 14:10:17 -0700
Subject: [PATCH 171/567] Rewire TensorFlow OSS to use Keras 3 once the v3
 release is live.

PiperOrigin-RevId: 567713807
---
 tensorflow/api_template.__init__.py       | 98 +++++++++++++++++------
 tensorflow/api_template_v1.__init__.py    | 93 ++++++++++++++-------
 tensorflow/compat_template_v1.__init__.py | 39 ++++++---
 3 files changed, 165 insertions(+), 65 deletions(-)

diff --git a/tensorflow/api_template.__init__.py b/tensorflow/api_template.__init__.py
index 57676e93a68db1..8b789f7202db6d 100644
--- a/tensorflow/api_template.__init__.py
+++ b/tensorflow/api_template.__init__.py
@@ -25,6 +25,7 @@
 only a placeholder to enable test cases to run. The TensorFlow build replaces
 this file with a file generated from [`api_template.__init__.py`](https://www.github.com/tensorflow/tensorflow/blob/master/tensorflow/api_template.__init__.py)
 """
+# pylint: disable=g-bad-import-order,protected-access,g-import-not-at-top
 
 import distutils as _distutils
 import importlib
@@ -74,11 +75,9 @@
       "Limited tf.summary API due to missing TensorBoard installation.")
 
 # Load tensorflow-io-gcs-filesystem if enabled
-# pylint: disable=g-import-not-at-top
 if (_os.getenv("TF_USE_MODULAR_FILESYSTEM", "0") == "true" or
     _os.getenv("TF_USE_MODULAR_FILESYSTEM", "0") == "1"):
   import tensorflow_io_gcs_filesystem as _tensorflow_io_gcs_filesystem
-# pylint: enable=g-import-not-at-top
 
 # Lazy-load estimator.
 _estimator_module = "tensorflow_estimator.python.estimator.api._v2.estimator"
@@ -88,16 +87,51 @@
   _current_module.__path__ = [_module_dir] + _current_module.__path__
 setattr(_current_module, "estimator", estimator)
 
-_keras_module = "keras.api._v2.keras"
-_keras = _LazyLoader("keras", globals(), _keras_module)
-_module_dir = _module_util.get_parent_dir_for_name(_keras_module)
-if _module_dir:
-  _current_module.__path__ = [_module_dir] + _current_module.__path__
-setattr(_current_module, "keras", _keras)
+# Keras v2 loading.
+_keras_to_use = None
+_keras_package_name = None
+_keras_version = None
+if _os.environ.get("TF_USE_LEGACY_KERAS", None) in ("true", "True", "1"):
+  # Users can opt out of Keras 3 with this environment variable.
+  try:
+    import tf_keras.api._v2.keras as _keras_to_use
+
+    _keras_package_name = "tf_keras.api._v2.keras"
+    _keras_version = "tf_keras"
+  except ImportError:
+    _logging.warning(
+        "Your environment has TF_USE_LEGACY_KERAS set to True, but you "
+        "do not have the tf_keras package installed. You must install it "
+        "in order to use the legacy tf.keras. Install it via: "
+        "`pip install tf_keras`"
+    )
+else:
+  try:
+    import keras as _keras_module
+
+    if _keras_module.__version__.startswith("3."):
+      # This is the Keras 3.x case.
+      _keras_to_use = _keras_module._tf_keras
+      _keras_package_name = "keras._tf_keras"
+      _keras_version = "keras_3"
+    else:
+      # This is the Keras 2.x case.
+      import keras.api._v2.keras as _keras_to_use
+      _keras_package_name = "keras.api._v2.keras"
+      _keras_version = "keras_2"
+  except (ImportError, AttributeError):
+    pass
+
+if _keras_to_use is not None:
+  setattr(_current_module, "keras", _keras_to_use)
+else:
+  # TF will not have `tf.keras` in this case. This should not be silent.
+  _logging.warning("Unable to load `tf.keras`. Check that the `keras` package "
+                   "is installed.")
 
 
 # Enable TF2 behaviors
-from tensorflow.python.compat import v2_compat as _compat  # pylint: disable=g-import-not-at-top
+from tensorflow.python.compat import v2_compat as _compat
 _compat.enable_v2_behavior()
 _major_api_version = 2
 
@@ -156,13 +190,13 @@ def _running_from_pip_package():
   # actually trying to import it. Have a Try-Catch to make sure it doesn't break
   # when it doing some very initial loading, like tf.compat.v2, etc.
   try:
-    _keras_package = "keras.api._v2.keras."
-    _losses = _LazyLoader("losses", globals(), _keras_package + "losses")
-    _metrics = _LazyLoader("metrics", globals(), _keras_package + "metrics")
+    _losses = _LazyLoader("losses", globals(), _keras_package_name + ".losses")
+    _metrics = _LazyLoader(
+        "metrics", globals(), _keras_package_name + ".metrics")
     _optimizers = _LazyLoader(
-        "optimizers", globals(), _keras_package + "optimizers")
+        "optimizers", globals(), _keras_package_name + ".optimizers")
     _initializers = _LazyLoader(
-        "initializers", globals(), _keras_package + "initializers")
+        "initializers", globals(), _keras_package_name + ".initializers")
     setattr(_current_module, "losses", _losses)
     setattr(_current_module, "metrics", _metrics)
     setattr(_current_module, "optimizers", _optimizers)
@@ -175,25 +209,37 @@ def _running_from_pip_package():
   # SavedModel registry.
   # See b/196254385 for more details.
   try:
-    importlib.import_module("keras.optimizers")
-  except (ImportError, AttributeError):
-    pass
-  try:
-    importlib.import_module("keras.src.optimizers")
+    if _keras_version == "keras_2":
+      importlib.import_module("keras.src.optimizers")
+    elif _keras_version == "tf_keras":
+      importlib.import_module("tf_keras.src.optimizers")
   except (ImportError, AttributeError):
     pass
+
 del importlib
 
 # Explicitly import lazy-loaded modules to support autocompletion.
-# pylint: disable=g-import-not-at-top
 if _typing.TYPE_CHECKING:
   from tensorflow_estimator.python.estimator.api._v2 import estimator as estimator
-  from keras.api._v2 import keras
-  from keras.api._v2.keras import losses
-  from keras.api._v2.keras import metrics
-  from keras.api._v2.keras import optimizers
-  from keras.api._v2.keras import initializers
-# pylint: enable=g-import-not-at-top
+
+  if _keras_version == "keras_2":
+    from keras.api._v2 import keras
+    from keras.api._v2.keras import losses
+    from keras.api._v2.keras import metrics
+    from keras.api._v2.keras import optimizers
+    from keras.api._v2.keras import initializers
+  elif _keras_version == "tf_keras":
+    from tf_keras.api._v2 import keras
+    from tf_keras.api._v2.keras import losses
+    from tf_keras.api._v2.keras import metrics
+    from tf_keras.api._v2.keras import optimizers
+    from tf_keras.api._v2.keras import initializers
+  elif _keras_version == "keras_3":
+    from keras import _tf_keras as keras
+    from keras._tf_keras import losses
+    from keras._tf_keras import metrics
+    from keras._tf_keras import optimizers
+    from keras._tf_keras import initializers
 
 # pylint: enable=undefined-variable
 
diff --git a/tensorflow/api_template_v1.__init__.py b/tensorflow/api_template_v1.__init__.py
index 9684275e572f31..fe250ed3df42f5 100644
--- a/tensorflow/api_template_v1.__init__.py
+++ b/tensorflow/api_template_v1.__init__.py
@@ -22,7 +22,7 @@
 import sys as _sys
 import typing as _typing
 
-# pylint: disable=g-bad-import-order
+# pylint: disable=g-bad-import-order,protected-access,g-import-not-at-top
 from tensorflow.python import pywrap_tensorflow  # pylint: disable=unused-import
 from tensorflow.python.tools import module_util as _module_util
 from tensorflow.python.platform import tf_logging as _logging
@@ -65,11 +65,9 @@
 _current_module.compat.v2  # pylint: disable=pointless-statement
 
 # Load tensorflow-io-gcs-filesystem if enabled
-# pylint: disable=g-import-not-at-top
 if (_os.getenv("TF_USE_MODULAR_FILESYSTEM", "0") == "true" or
     _os.getenv("TF_USE_MODULAR_FILESYSTEM", "0") == "1"):
   import tensorflow_io_gcs_filesystem as _tensorflow_io_gcs_filesystem
-# pylint: enable=g-import-not-at-top
 
 # Lazy-load estimator.
 _estimator_module = "tensorflow_estimator.python.estimator.api._v1.estimator"
@@ -79,15 +77,48 @@
   _current_module.__path__ = [_module_dir] + _current_module.__path__
 setattr(_current_module, "estimator", estimator)
 
-_keras_module = "keras.api._v1.keras"
-keras = _LazyLoader("keras", globals(), _keras_module)
-_module_dir = _module_util.get_parent_dir_for_name(_keras_module)
-if _module_dir:
-  _current_module.__path__ = [_module_dir] + _current_module.__path__
-setattr(_current_module, "keras", keras)
+# Keras v1 loading.
+_keras_to_use = None
+_keras_package_name = None
+_keras_version = None
+if _os.environ.get("TF_USE_LEGACY_KERAS", None) in ("true", "True", "1"):
+  # Users can opt out of Keras 3 with this environment variable.
+  try:
+    import tf_keras.api._v1.keras as _keras_to_use
+    _keras_package_name = "tf_keras.api._v1.keras"
+    _keras_version = "tf_keras"
+  except ImportError:
+    _logging.warning(
+        "Your environment has TF_USE_LEGACY_KERAS set to True, but you "
+        "do not have the tf_keras package installed. You must install it "
+        "in order to use the legacy tf.keras. Install it via: "
+        "`pip install tf_keras`"
+    )
+else:
+  try:
+    import keras as _keras_module
+
+    if _keras_module.__version__.startswith("3."):
+      # This is the Keras 3.x case. It does not have v1 compatibility.
+      _keras_to_use = None
+      _keras_package_name = None
+      _keras_version = "keras_3"
+    else:
+      # This is the Keras 2.x case.
+      import keras.api._v1.keras as _keras_to_use
+      _keras_package_name = "keras.api._v1.keras"
+      _keras_version = "keras_2"
+  except (ImportError, AttributeError):
+    pass
+
+if _keras_to_use is not None:
+  setattr(_current_module, "keras", _keras_to_use)
+else:
+  # TF will not have `tf.keras` in this case. This should not be silent.
+  _logging.warning("Unable to load `tf.keras`. Check that the `keras` package "
+                   "is installed.")
 
 
-from tensorflow.python.util.lazy_loader import LazyLoader  # pylint: disable=g-import-not-at-top
 _CONTRIB_WARNING = """
 The TensorFlow contrib module will not be included in TensorFlow 2.0.
 For more information, please see:
@@ -96,16 +127,15 @@
   * https://github.com/tensorflow/io (for I/O related ops)
 If you depend on functionality not listed there, please file an issue.
 """
-contrib = LazyLoader("contrib", globals(), "tensorflow.contrib",
-                     _CONTRIB_WARNING)
-del LazyLoader
+contrib = _LazyLoader("contrib", globals(), "tensorflow.contrib",
+                      _CONTRIB_WARNING)
 # The templated code that replaces the placeholder above sometimes
 # sets the __all__ variable. If it does, we have to be sure to add
 # "contrib".
 if "__all__" in vars():
   vars()["__all__"].append("contrib")
 
-from tensorflow.python.platform import flags  # pylint: disable=g-import-not-at-top
+from tensorflow.python.platform import flags
 # The "app" module will be imported as part of the placeholder section above.
 _current_module.app.flags = flags  # pylint: disable=undefined-variable
 setattr(_current_module, "flags", flags)
@@ -114,19 +144,20 @@
 
 # Add module aliases from Keras to TF.
 # Some tf endpoints actually lives under Keras.
-if hasattr(_current_module, "keras"):
+if (hasattr(_current_module, "keras") and
+    _keras_version in ("tf_keras", "keras_2")):
   # It is possible that keras is a lazily loaded module, which might break when
   # actually trying to import it. Have a Try-Catch to make sure it doesn't break
   # when it doing some very initial loading, like tf.compat.v2, etc.
   try:
-    _layer_package = "keras.api._v1.keras.__internal__.legacy.layers"
+    _layer_package = f"{_keras_package_name}.__internal__.legacy.layers"
     layers = _LazyLoader("layers", globals(), _layer_package)
     _module_dir = _module_util.get_parent_dir_for_name(_layer_package)
     if _module_dir:
       _current_module.__path__ = [_module_dir] + _current_module.__path__
     setattr(_current_module, "layers", layers)
 
-    _legacy_rnn_package = "keras.api._v1.keras.__internal__.legacy.rnn_cell"
+    _legacy_rnn_package = f"{_keras_package_name}.__internal__.legacy.rnn_cell"
     _rnn_cell = _LazyLoader("legacy_rnn", globals(), _legacy_rnn_package)
     _module_dir = _module_util.get_parent_dir_for_name(_legacy_rnn_package)
     if _module_dir:
@@ -140,11 +171,10 @@
   # SavedModel registry.
   # See b/196254385 for more details.
   try:
-    importlib.import_module("keras.optimizers")
-  except (ImportError, AttributeError):
-    pass
-  try:
-    importlib.import_module("keras.src.optimizers")
+    if _keras_version == "keras_2":
+      importlib.import_module("keras.src.optimizers")
+    elif _keras_version == "tf_keras":
+      importlib.import_module("tf_keras.src.optimizers")
   except (ImportError, AttributeError):
     pass
 del importlib
@@ -197,15 +227,20 @@ def _running_from_pip_package():
   )
 
 # Explicitly import lazy-loaded modules to support autocompletion.
-# pylint: disable=g-import-not-at-top
 if _typing.TYPE_CHECKING:
   from tensorflow_estimator.python.estimator.api._v1 import estimator as estimator
-  from keras.api._v1 import keras
-  from keras.api._v1.keras import losses
-  from keras.api._v1.keras import metrics
-  from keras.api._v1.keras import optimizers
-  from keras.api._v1.keras import initializers
-# pylint: enable=g-import-not-at-top
+  if _keras_version == "keras_2":
+    from keras.api._v1 import keras
+    from keras.api._v1.keras import losses
+    from keras.api._v1.keras import metrics
+    from keras.api._v1.keras import optimizers
+    from keras.api._v1.keras import initializers
+  elif _keras_version == "tf_keras":
+    from tf_keras.api._v1 import keras
+    from tf_keras.api._v1.keras import losses
+    from tf_keras.api._v1.keras import metrics
+    from tf_keras.api._v1.keras import optimizers
+    from tf_keras.api._v1.keras import initializers
 
 # Delete modules that should be hidden from dir().
 # Don't fail if these modules are not available.
diff --git a/tensorflow/compat_template_v1.__init__.py b/tensorflow/compat_template_v1.__init__.py
index 28b8ffa40ec051..ec1ac65e5ad03a 100644
--- a/tensorflow/compat_template_v1.__init__.py
+++ b/tensorflow/compat_template_v1.__init__.py
@@ -38,9 +38,18 @@
   _current_module.__path__ = [_module_dir] + _current_module.__path__
 setattr(_current_module, "estimator", estimator)
 
-_keras_module = "keras.api._v1.keras"
-keras = _LazyLoader("keras", globals(), _keras_module)
-_module_dir = _module_util.get_parent_dir_for_name(_keras_module)
+_keras_package_name = None
+_keras_version = None
+if _os.environ.get("TF_USE_LEGACY_KERAS", None) in ("true", "True", "1"):
+  # Users can opt out of Keras 3 with this environment variable.
+  _keras_package_name = "tf_keras.api._v1.keras"
+  _keras_version = "tf_keras"
+else:
+  _keras_package_name = "keras.api._v1.keras"
+  _keras_version = "keras_2"
+
+keras = _LazyLoader("keras", globals(), _keras_package_name)
+_module_dir = _module_util.get_parent_dir_for_name(_keras_package_name)
 if _module_dir:
   _current_module.__path__ = [_module_dir] + _current_module.__path__
 setattr(_current_module, "keras", keras)
@@ -56,14 +65,14 @@
   # actually trying to import it. Have a Try-Catch to make sure it doesn't break
   # when it doing some very initial loading, like tf.compat.v2, etc.
   try:
-    _layer_package = "keras.api._v1.keras.__internal__.legacy.layers"
+    _layer_package = f"{_keras_package_name}.__internal__.legacy.layers"
     layers = _LazyLoader("layers", globals(), _layer_package)
     _module_dir = _module_util.get_parent_dir_for_name(_layer_package)
     if _module_dir:
       _current_module.__path__ = [_module_dir] + _current_module.__path__
     setattr(_current_module, "layers", layers)
 
-    _legacy_rnn_package = "keras.api._v1.keras.__internal__.legacy.rnn_cell"
+    _legacy_rnn_package = f"{_keras_package_name}.__internal__.legacy.rnn_cell"
     _rnn_cell = _LazyLoader("legacy_rnn", globals(), _legacy_rnn_package)
     _module_dir = _module_util.get_parent_dir_for_name(_legacy_rnn_package)
     if _module_dir:
@@ -76,9 +85,19 @@
 # pylint: disable=g-import-not-at-top
 if _typing.TYPE_CHECKING:
   from tensorflow_estimator.python.estimator.api._v1 import estimator as estimator
-  from keras.api._v1 import keras
-  from keras.api._v1.keras import losses
-  from keras.api._v1.keras import metrics
-  from keras.api._v1.keras import optimizers
-  from keras.api._v1.keras import initializers
+  try:
+    if _keras_version == "keras_2":
+      from keras.api._v1 import keras
+      from keras.api._v1.keras import losses
+      from keras.api._v1.keras import metrics
+      from keras.api._v1.keras import optimizers
+      from keras.api._v1.keras import initializers
+    elif _keras_version == "tf_keras":
+      from tf_keras.api._v1 import keras
+      from tf_keras.api._v1.keras import losses
+      from tf_keras.api._v1.keras import metrics
+      from tf_keras.api._v1.keras import optimizers
+      from tf_keras.api._v1.keras import initializers
+  except (ImportError, AttributeError):
+    pass
 # pylint: enable=g-import-not-at-top

From 86c255ac098919ef6e72c957175a44f00d92e0c5 Mon Sep 17 00:00:00 2001
From: Dragan Mladjenovic <Dragan.Mladjenovic@amd.com>
Date: Fri, 22 Sep 2023 14:12:25 -0700
Subject: [PATCH 172/567] PR #5830: [ROCm] Fix matching in
 launch_dimensions.hlo and slice_to_dynamic.hlo

Imported from GitHub PR https://github.com/openxla/xla/pull/5830

Copybara import of the project:

--
31c155a3110ac874912060f6cbc455b51f0c4858 by Dragan Mladjenovic <Dragan.Mladjenovic@amd.com>:

[ROCm] Fix matching in launch_dimensions.hlo and slice_to_dynamic.hlo

Merging this change closes #5830

PiperOrigin-RevId: 567714298
---
 .../xla/xla/service/gpu/tests/launch_dimensions.hlo    | 10 +++++-----
 .../xla/xla/service/gpu/tests/slice_to_dynamic.hlo     |  2 +-
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/third_party/xla/xla/service/gpu/tests/launch_dimensions.hlo b/third_party/xla/xla/service/gpu/tests/launch_dimensions.hlo
index ef46dc686e9039..c2d36d0ad94042 100644
--- a/third_party/xla/xla/service/gpu/tests/launch_dimensions.hlo
+++ b/third_party/xla/xla/service/gpu/tests/launch_dimensions.hlo
@@ -134,8 +134,8 @@ ENTRY main {
 // CHECK-PTX:   ![[tid_range]] = !{i32 0, i32 128}
 // CHECK-GCN:   call i32 @llvm.amdgcn.workgroup.id.x(), !range ![[ctaid_range:[0-9]+]]
 // CHECK-GCN:   call i32 @llvm.amdgcn.workitem.id.x(), !range ![[tid_range:[0-9]+]]
-// CHECK-GCN:   ![[ctaid_range]] = !{i32 0, i32 195313}
-// CHECK-GCN:   ![[tid_range]] = !{i32 0, i32 128}
+// CHECK-GCN:   ![[ctaid_range]] = !{i32 0, i32 97657}
+// CHECK-GCN:   ![[tid_range]] = !{i32 0, i32 256}
 
 HloModule NonElementwise
 
@@ -168,8 +168,8 @@ ENTRY main {
 // CHECK-PTX:   ![[tid_range]] = !{i32 0, i32 128}
 // CHECK-GCN:   call i32 @llvm.amdgcn.workgroup.id.x(), !range ![[ctaid_range:[0-9]+]]
 // CHECK-GCN:   call i32 @llvm.amdgcn.workitem.id.x(), !range ![[tid_range:[0-9]+]]
-// CHECK-GCN:   ![[ctaid_range]] = !{i32 0, i32 7813}
-// CHECK-GCN:   ![[tid_range]] = !{i32 0, i32 128}
+// CHECK-GCN:   ![[ctaid_range]] = !{i32 0, i32 3907}
+// CHECK-GCN:   ![[tid_range]] = !{i32 0, i32 256}
 
 HloModule NoFewWaves
 
@@ -253,7 +253,7 @@ ENTRY main {
 // CHECK-PTX:   ![[tid_range]] = !{i32 0, i32 128}
 // CHECK-GCN:   call i32 @llvm.amdgcn.workgroup.id.x(), !range ![[ctaid_range:[0-9]+]]
 // CHECK-GCN:   call i32 @llvm.amdgcn.workitem.id.x(), !range ![[tid_range:[0-9]+]]
-// CHECK-GCN:   ![[ctaid_range]] = !{i32 0, i32 1008}
+// CHECK-GCN:   ![[ctaid_range]] = !{i32 0, i32 1664}
 // CHECK-GCN:   ![[tid_range]] = !{i32 0, i32 128}
 
 HloModule ScalarBroadcastFourInputs
diff --git a/third_party/xla/xla/service/gpu/tests/slice_to_dynamic.hlo b/third_party/xla/xla/service/gpu/tests/slice_to_dynamic.hlo
index 54aa009ef75a93..8c2b1144e951f9 100644
--- a/third_party/xla/xla/service/gpu/tests/slice_to_dynamic.hlo
+++ b/third_party/xla/xla/service/gpu/tests/slice_to_dynamic.hlo
@@ -59,7 +59,7 @@
 // CHECK:         %[[VAL_47:.*]] = mul i32 %[[VAL_44]], %[[VAL_0]]
 // CHECK:         %[[VAL_48:.*]] = udiv i32 %[[VAL_39]], %[[VAL_47]]
 // CHECK:         %[[VAL_49:.*]] = getelementptr inbounds [2 x [2 x [2 x i32]]], ptr %[[VAL_50:.*]], i32 0, i32 %[[VAL_48]], i32 %[[VAL_46]], i32 %[[VAL_43]]
-// CHECK:         %[[VAL_51:.*]] = load i32, ptr %[[VAL_49]], align 4, !invariant.load !4
+// CHECK:         %[[VAL_51:.*]] = load i32, ptr %[[VAL_49]], align 4, !invariant.load
 // CHECK:         %[[VAL_52:.*]] = getelementptr inbounds i32, ptr %[[VAL_31]], i32 %[[VAL_19]]
 // CHECK:         store i32 %[[VAL_51]], ptr %[[VAL_52]], align 4
 // CHECK:         br label %[[VAL_29]]

From 4d076257f554c47c402549dec52ba2a10c8f85bc Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 22 Sep 2023 14:14:42 -0700
Subject: [PATCH 173/567] This is an automatic update to an allowlist, work in
 progress.

PiperOrigin-RevId: 567714872
---
 .../compatibility/gpu_compatibility.bin       | Bin 30028 -> 32104 bytes
 1 file changed, 0 insertions(+), 0 deletions(-)

diff --git a/tensorflow/lite/experimental/acceleration/compatibility/gpu_compatibility.bin b/tensorflow/lite/experimental/acceleration/compatibility/gpu_compatibility.bin
index 9a985e1f5b878f589b8c6b177918e4aee4393876..9851273fc396d07380cb23f908b0326e1286be3d 100644
GIT binary patch
literal 32104
zcmZ{t4_sYmn%7S<G{a<=43lBAY?h5om&wvDo4s5vaFfNz)Vo=%n8hqpF=Bz_LLj-o
z9k?MhVn2K;Myyz|VvG?hMvSpy#EKOoR;(DYV#SCRD@LqXam0ubM~v9|ea?ArdC$YK
zpLymUzUMi==Q+=L&-=dT&%McV<i7(>rjx<{JN&=nd<{$dqT|$Md4IqE2fqB$`TgGV
z_gS~~O~)C>YOxG#>Y(Gqu^epf8;+C03b3V2#~HvP*xJ_}X9SC3s{|UxDzVMy9A^xx
z#8wF2i{)TbBvFH{?{}PASQWOt&vEWyF>H%G#<5xqcc)_D%P+@L-ZHCQ_BM>geH*{e
z&u_qIuJ~CWY#hH~Y>?%wtN!(mjn<gbdIV1@t$A1-elcu`<vgYJ;ve+#s#RLY-~~!6
z2bP83_I1a}XSt%qpI493%2ZmH;meo(zKyV1bSAI_%T=aM>3V%=v}RZ@HBIyP;X_Jm
z0+zzB5u=$n?M;cchD+t;4eb|C@~5-F`i*A4&ht0C{jOK~N8m+DKMR(FpM&MHyrZ-_
zj8?YNx&U8o^T(}&&A&#Q!s0Avs~SEqT63(o_xnD4L}`t~`tWPTCRxr`S_l4+w~xI_
z>lwaPN^27q!Y>coVY!Cgwd0>NT186h3_MTyR>0PJ9cK~aRB=kej>9RKJ}$)}mK*WW
z$NYU0W1j8o*zdyUFZ$UStRKGwwtLLWq8#%G^3H64{*ghi9<56MAbj!n`t==HzG!1~
z87B9#sA%W0PUqfbw8h`fy#=17a<7NYUE`QxySbN2dv$Z~3($XL^e5QH)_(}^yX0px
zusD7-d-Y2s|CIp!@E`VSQ?2xm!>cd&_48mE_-*#=$)6)l|Hpb=pSO+v2HV*A-+<4k
zcH0LV$8Q+ht$k^^Xy0Ppma69yqb>fn_Gj=$rM(QR!Y_iYv78zAZ*%HRuMRP#bpjr1
z^7|IRcIZ#nu@K9-O6!Kv+E#t;b@;sMLz1vb{6?`pmJ5~EBcnB;v>w9yl-3L^j$aKn
z&$2!i$Nq@7kBv&}Fg&ERGGV#mhkcv+H^sY}J31Q*Q|&J_ltoIMLe{US<MF!Dca(k?
zd{OOJD{PvvavV#sys5N4GFnrt&mew!9QXh}ptPo73H<7?S(Y=@`XfW;{8w5B;Zf!5
z!1D3S5|d@M{({lUQ(CXUH`G2<!<M?q39Dl{qO|TCtwq*nSo?SfKBly8!3Ob5VdE@U
zDXnKlYd~o|f!8XnHCP3Hh1e#`^-Al^ANA@LRa&RunMx}J+w7um!75lzDy_GT)&}eC
zeY^pmQT<CFY#hH~Y>;JroF5ylF{Skgo>W@%usZx=*b>WYDz6vc^7gS-X&r+XD6Je=
z7Jl15V*bN&tlhsaJw_{2X<dddtG=TVHjB;#mSA~X<@KS_nqj?NulM0YN^1g^!mkmV
zW;sip!-xKuw~tAswI5!ow6@v45WgJpW&LczAOF13Do|Qy;aiP<-xzEqNgssOu)L(S
z?i#IS*4z7d8-7n|4a0`<>%&G_URT>ZHCjW8e-5u#S}U*^ej)6@Fzv%^bAWSi{pYwQ
zU_!J0v()n_=h@GAxn<+Gp~l&lU=u&|*8d9IZspQuu_A2#r<vbj4mNtkaiZAjPdUyI
z7Q&WtIPO>`HvV1KV~dBG%VAmAMmB99E5H`$c;i?mHhGBa3~c*bFszyh#M(D8u6KH|
zbtYPUSS~in;8}}pbDcAc6<|~BWDU0c9C5J<Y?a~<V7b`D*SP+{R`;WW<ziEGx^kVf
z#O_ck_gLls@qjrZY>4{V$Fc>c=ZE#MxeoHfNXbc-mlU^zLk_#;Tu(JOTu3y<n;R0*
zuB+4l-<XL>3a<78Sb>W3F>GD=&B5x4QH6cSr`=T2kZ4PBnmFtD)Tr%_zD<54ZSThk
znBI>9*baTbI_B=j$&zqOX=rb(aEQI*Sm3W?-GJ4pShoXW4a0_s(`Uy*m1bkrjQt9S
ziceCw*??G2VH>n-d%q6+iM{)^gI@u0S}73C!)d+NbA@TYyN@h2Btj>he74j3at=16
z^jiY->tS>B1)0107v)$;4SOzf<ila7KS2LISdr3y7@$7`i=%%pp3c9!BdyO;L!vkA
ztm0EmJi8W$N51@Wfpgww$6@-o=fN`2-{kt3Q^mQI>U4AOZf{71PnJ3rtgltt9k8U*
zej`A;7j_R_nL}{uIv2Y-uXQBcrt1DpjE<9KA!fClzX5SRfn};VKMshq3acc}fQfTC
z`dV?+ALnwkICM$kWc|s#`*RvLX8OkTd3g?|YgGl4YyUhGr#scu752xubh50}-JcY`
zbLIZ}--Sh0oCg7MCSgh9OfhE5wI<Wlrgjr(fpsyMeO$kQt<q21?C?#-QRSBj%SE>y
zT{#}qzo{%#>ei)zb^0851vaR3FT-?Q8ey|-#7CFz*l8+Dx>d1axHTEXZ}XS^c|L#@
zs2Cpy#F&HC6JxB&Yo|>mA!iw;YjN;TF*Y+!*|j(Z8zC{9<-oGgt3glB)vo5Ao<!Gc
zP7z$6ORvKEm1Y-A*P#_Q-O4?OMlZ+Lx)W}lyPD(eZ4KRF!Z<_t==13VSiaJK6rev3
zt3!VjeQ6h+iMF2RYn`2OmcNC0aiUEP@vBXo($023tQT*6`Q;Sjw_S_VFugAkn9LDE
zjaDo(&d701$70cG#8<bSw_(*P&fS1Gqp$(u%r|&BCEMKY&8k~DM~M5M_{I3N^Zp5J
zo%x5&4*qHSJ>}=X@`({AhUA{?Oi91VnZUWsI<0#aHmr0n!1Vc42b-s5XZduC#mz~v
zPS@lvEUI)Lz;x{=VM%nSUiI2$>ov)^)ylott;upgjDxpn|2h7e9E0f?Ij}5Z)DXiP
z7n2tyaXA-QV%$3EMDeNRZ@a#iVJW5mdVqckHo-u33p<ELJeg?dPIh%(C5W`$j?OD>
zO${X{<fIrAO%>-6EX%}U+U7*e`X2|xT7^{-XTaR|O0>1R^*QoqzWj2OjqUuiU^)2K
z;VW~r_Rdsi!==LV7aEd@CME_&tk><~Raifh6I-VXR;jdGVbgKW|9Zz^Xt%UGlc~0L
z?lknKicgk?<B1oX5q#IoKJ7D~4`I0`j>Ni!A@|k3f6jV+99Lj5oCb;c9KM|wo12&-
zI)_L1`ewr9zD~XIz1*B??!xzUfNumQ<8}yNIbKZ(*SkTqI|BUUuo?Uv{G|;Uf4jzI
z=^78?8^b39OYi%KqRF*_%|3xeaIo0|Os@TJ{T{CC*xnpJwB6!{OQpE{w}qdG+uI)h
zvm75L&%&fu)7T(ZhZSPNU&g0EZC43fWo#=T=4F=ijBh&kM%XOtGgvRWU3AM**P8*p
zL$DEim(P23y4c#((b1f6J`C`khPC25fv@PwqX4P@lK|f}SOvcQ_<qgvb&uOu^?_x+
zLp$cvuE{Cbc(%v(vo3v}orCNBtb)n4e;zRyPMz+nT`e8_{d$0J3O2#I&EM^5x3pbu
zV-vS+xH&bKIk)hM!R&lKg5{We()E1|>*WvIZwXe7{xJH|KHHnR=+jG@nCd&50lLrs
zIoc=JoVG?8EMMss!L}N?hDBG#3QyM?E2>zh^Sus}>sed(E!aHU+H3$O<Mt|zP0kOa
z>x{y+=EtyXrTG~wr8JjeRcH>OxtnWoD{jsf>-6|@e2n%_r)TGN3Z{>92)4<X7C~3q
zUF$Wn<<p8mOXIx3Pv`z7Os?te81KN6_}gp*HbjiY-{sXK@miBR=1jnK-j89@k=WZm
zh3UN4VNo>u(Dd>~mAF!GSD$OWtUte3|1@lj36!mW4yN_1V9Sh`dAs_Zaz48HogHre
z34F3tn|cRUY4jzpIcx-rW9jql!+?0xuvX$tyyERodwXl7mu;Q|_^!b!@a@M}_CtL6
zaE{*DyC2!GY<z3ezMiaeCcw7>w#Halh_9UU$TlQt3|#@ft*~i)cYf&gL6U-7oA!p@
z2-o+6tdF4~`B5VG^8u_N%g;WB>9#%xt4Dhb?QgPeqP^iq<;9^BEtfi+Ro3Y?bNDYX
z55&=qbsVOTX&x*C-70j&_hOGX-;_DaZVc`j;iqHtz*<#|Hv?ky!^Y~luT^V~<Hep*
zX_x63W$u1(jUN!>Q<%I9VDHD1fEa7A3S#sV!`lxxg!@cR_JbIO)XE(jv&Z-D%S*6P
z60+mG0@L-XhAlD9<q}7ZZR@qm-5tzLWm%RI;SwhqpnnHeq4eJm(4T;%(4YPdue~LI
z6bXfjy}DiQe64{+XE8v3|6k<(ECYd^{}GtZKMR(Fex1=L|9D5(>)T|>y&*)52!1g>
zZM_y)uZnR!AVv~4$rw3`$uZ{cT%sXSS{f>)uMW`t0G6+G9|h>n!|KqCVR8-_-B2i;
zbk+iNkKAP(qSLnb;YFCPNj}U$bAvHY<_>K<8jisB>_4nm>0J-dOTs46+WMW|F>Jlo
z5GuaHTnnzxp$D)6rTKAy<{YdZ%`r5+cGueVnrU~kB=^DA1N4vnCGNwpfn9?WFkOQJ
z*bd`eCHmgj(du4AxUt&3v8f6lo&R-MT<O0Rpg#Z`MSm3m=|?-0kqajoMMeK&vNbG?
z{9b_mr?3pA|0F<v4OW4EKl)z&7v=Gxr=Q?@E);Px@9oY16l|P??E0LA>H5TAD~x;j
zU+G`CBKH_1f0o?SrZqtSHY}#}?+56Q!(`l^{q^+mZR(P1UeT8&xgMPl(0>M7V~6eh
z5B=r6`DX}2zY%?}K23?F$)6=xzYrhY=Fh_hlzvBmejGN#80nxd$G54JVX~9C2)wY`
zpWl60WrfFN&fSPf{`cTI{->}E75@oLAD=Z?1@ZgOdh_8_^F<zQw{?`E>%6=7xQAgA
z1eLhyvG@#3>sG+l7#j=Gb@!{|QbI0nWG>jz+<UdVp-jr?#PKWo9e%x=u)1IIn8cBJ
z(G;KcnEZW<^)a)Y&fz1Nyr(AX)9f)!=d}c@Ca+;)nscZ)8ZLJ>0yK~P6|T?d*lf)(
zOluaxwi(wdEX{J-T_s%S*#%1|%^LxleXwyf*J%UkJj=tS&PagfLs+iT{4_vw5mtld
zh)=V`S%<g&HvfJeonX8^>1WxnX;t@^VCj1$q~OT89W_3$vQF2k7Pi0`D&w@Yg{D+%
zGmj%uEIMxm_z#FazJq@M7V-Zez<&ysz&AZkd-~pZ>Rr<|6h>a)89jdW_}Rzy&|l?x
zo#chd`6WroaweRz`Fu~o#%XBw_Ge*we`BzfbKIxIWPh)BcD6V0JcC8|-fMz4NqRaP
zuAdAwMMAJtK&-p4sEYL<Al4)-Nt}8Ui<|V}P@=>=mA{R)iS-p;#a~}@9KKJ!<-0YI
z<0<=?1*Z%5?eA&Wm}2K(x&~FSWyZ%jGe-85v^Vv#oy>81+Sq#olK8bMy*sc9#oiBy
zF#$`V-)Lf7Y3{gAH|iY5QY~@=`b50Br;R6T9Zl`c><`?#7v0gx<JJ=XbdNcY-JBzT
zjq@HI*>CCd<CwH**=G9;eKVh@@X`Ajf^AZy2<MWV3neE)&V>NKI@mmZS;jAn-<tt`
z{jf3o)~Y<c;uFQr2rNMkcDs8Bleoe-bk6p#I?fnYg-yn&36_mb{8kvY{Stiu7Q<G5
z!2V#l*i<F4Xr$GAuKpG{mWxfD#+Obgx0vl^`8d~rMa<#Z$uZjH@^9w3JC=h@;kOK*
zKZFmttYKpuzzS@IKD`&q#U@YjHz!#=pA{s%K`R}_s<FQBGhbl)$`F2B|870U`Fw!u
zL$pUG8Sl}|zzFSrK7<uAN4D7vjI#OI;lJ*^1Cn5@wS5o3hkwn_cG$iEzif=~PEWWv
z65*4y%zKbrMe?$uyvSL|_Jb<cdDsXW*l}uL93~&T3j^(2KI`YV;ktbd!-nze!zi6o
zg1*e>lWo0RC`8?f!#DWby!%^BNt`*hsiaVL9zTI?)8d73sGL@87L&iXSf}gu@_Sy6
z5tZX<_^8oIABzZVgK=;f%S4Z7h21AZ(F-g(HLTOxca66A+d1BbS1Rpcn2g(fKJ8*x
zdm=!4|KITP7Jpm&C-|;$p4)5#7Qrvyr(Nc1{}Q=3$^50Eu)C?f`)bFfhTe;bwq$p6
z!^v>yg$8cn^ju9`Y3gX;3_Vd?&amlJ5Ie^CXUBNk%V+;awV&UB7smXo4>pe9I((4j
zB$~F?1K6U{nS>3!?Dsn`<<+29`98yURK?tch49P6zAgJ~z2{iY=#-6CEe+4k<0aSx
z{emz~QAh59&+#euUE_Q%n6_7qpFZ|?Ol~W`-LH95c+Ag+U?ceTW8V1~=1DHk{{wWN
z8QlS;`viWE{!emEx7RgT1%8DX-HUroRl;kwI{~`q{-#&UBBgr<K0-3KZUt<OW3h<I
zIT;RzBE{k2vNo3MSf^`o-)M`!t#b#y^^%|6f(_!AQref$o(j-D_`a97_}kiF;Jfn6
zezpZG!Y>y)=;d8b-r*+1v(eFcw-{~lx3$m1`;~SLY>_c<220mE!hN4_5y$JZxZ6aZ
z$`Ii61;l#f#o9l?_I9j?qWc3sn}Nmgy9dufFWGsmxvQs(sn?~e%?+2@PevjV##sx9
zbLwxo_1Yg(aZbQ18UO6q1+X28Qc0Yj*&8Rtc@yFh;ni-|)!^IDr@j3fu*3;J>xJn$
ze+*;OeUq%PxBn16srt!jSSx<DSSA~_hl{z7a#EVdp7Xja!u=2YhgfIp9-U_XPseAo
zY#5S0)&c|V%VmY_dmi4ewy%LL{u;+0lVc+D@vhE9Q$ut75`8Ogh=pGO?`aS0o-@R@
zQC0iTyx99k(X(TJ0*^6ovDpHw7Qb8YWm!)rVy{2=w>e*nJa&k9!`L`|o%AuYI99N{
zN~cl7@&YW%z$x>WJXjLz=Wm(!M9|SSdK1Pb`?k*bW4;dGGJPi!Zayj41b(-$ufw><
z!=v9)XDC4TGow4IbU%S7Xx;YyEx>B=tF&~Z<>k&=fbOXWUcND<djcL)`4+%-80Xfp
zZ;-F|d_Kxdl~}CPwYX`t#ox~FP58R%L;GPej~&9K7G3SV#V5nX&aD95Cq{Qz>3#-p
zRCB{+*&e@$ty^B~YzF9_`8!@MqDuD^JX7h0V4E@81t$6O^3KVI?q;V7uIuoQ(Ot#Y
zuEX2#Ni}aAgx$h#0FxTKy5^ml@c`{-MqB)C?I-X7)hDmPD)5u}$5-vGnDH+__uPzE
ziz20a2419eD`0DkwTswS$BuiIwQyaFca826zIMLvz?W3tI075Ouh-IzmKHk`0lNGD
zu9t70()|g()vC|jfJN}j$1<rg^VUnzXn6;VP9{1!=U0uk_}e+3gXgQ9t6<BF;d7WA
zKlhrq_+*LnYw-Z>2S!``ZSDK;5jAfehxOstihUQoulm$Z?y9t0?QY{q<9tAjV;^|6
zX;d)|!!x-iv*TvMa>Wn(s*OZT_KqdRWljWNU8^^}So^nr;MaX!bXBYrYy!VcCeZ=o
zc5}2uuF$zw42bj8#2Hd?K8H`Lds!>67=DBQ-#8)Z`kjow#~kU1KkqWw^l^`I$?H^-
z=rEI|VLr2HJhc>Q^*L|+knhdbVsQ$(xnBesQ_#yhz%tv!Xb?NFyzk(%kL%%|;`$su
z#_CCYRv3)r-IK&&boeYnHvwB={Z<xz!Z(?7V|D1%(*W|anHOSmaO%jX6}=*IO5mH%
zL~99KMynT_K~HiX!nX*kXI(zA<zAD#Ge5}ZDmnC_KTq66KDXJIe)cbKcJFcSf)RON
zR;9mQE%2;g^s{={9Aoh$Cg+5_Q(D41r5!By255h5w8h_Un;*g_U-YwSSSx<D7`x$W
zM?$6DEILa8+AscnFK_X;wU5DTm39s+3%_mVBI&%tq0+0xXopx=#iy-t!)S}Y&9B3^
zU+}XeY!bgwOwKcElH>u8GZ>)z*wfuVrgR^{$CU0otWI>96PTR4i$lClMvS!p?NcAR
z_1GU%+9%*KrCk8q;UuWs)ow!Dtw$y6bUkhw?RB=X>+vRhz0lA4VPp8MG0vusW3hSf
zcr-x!snH%%+MmORl=ccNhTouHyM>ZC+X32V|AAMNkkWn$9#Yy-*gE&SihSB(_nvPp
z>#B%h@B6z(dx>pq{tkTU7yN7lHiX||#8;Dc$$K(D`@o!+cdycZhHtOZ-h_qlOQCJr
z3CFUuMb61AbadVqjCP*Veg&SVw5wrDa_<jq)3#Hk+@XpG=sxgt_s!vN*W*5X?zo?g
z!}{=R#WKlL`Vr1Y_gtS1&_41Hy>ngsZS8~bR;BI0^6|^U()FMpX^~>*u};^c%hTR3
z{<d}tJWFZU!{!+KGD~*zmh%Q_r#C?RV^4dZ+&8tgKZH+y-_NFDt@zc7{#P{#UlHx4
z0PPnadNmP$Tl*NiR%z$Jvhdqx+%(#$l2EyucZhYmCO3?>_}kjo;oAj%mW0VTJ&L94
zEn`B8yoUm`KQr3mZ)<-7A6425uv+{oF@`dy`(jUoMQ1HQ_tZb~>Jd}AC*U!qTL9bP
z;lMg(a_*-6xqWLT>vTPC8g22nbAA)P{_}p;4;#a82utToH|yLA(0yWbhn4PU@L{F9
zEZgH3!P3W&b5TxnXDdMa<$13j;&11C8lF+;XA#&2_w$!AQ;$m}C!=m3Qo}l3kGn=&
z{B7;q@a5<IY#26-Ums@r5XQ$+=U#yB7e=>V={|)Qs%w>XSQNhk%+y2rkV{gJjDNg0
z=kqXD?Gt}H=d<vI6Mhzht#Huhv9xx1iTB;5#sKa2jkfsP+V8@XN_z~}kDpxYr?tu3
z9m8e<v=4ve<*n}x9DwI2?H#r+KsOsZz|MAGYzT$v_;T>o$MCYz&B52s`BnJLFZo$5
zY=LocdUtH>I9VEU+g>t2`=QYme_Q(jyisXS!jkya?`peqN;!@T0ou?16R#fPZ)+cc
zXDIC~SPp)UrCnMQx+dC%tkX4l-Dr!yt=$E`SLA1{uxZA{aV&it(ROPx5TO03(H4JO
z`(t>u(w>9W<5z{H`=^q!@*bJ{tp;e9EqFB%e_Q)S_@<h}=Zi0X8;oJQV|vJm!gW2~
zGP;}i+PZJRM_%x=Uf4bSMs~->OSBQUU5^B4|JZ1Yzpeck9#z^)FuC^U^3|@jtGyYZ
zedeEf^$>qs`xJbo(9c4!O|DK?FmoKq+dChtS*L4q$7qYct$h>Tue66?a_!&0t1ah^
zoA<o{?JtbB_}kh~;WesXU57>SE7;Y(TpsqcGd|v%_jwr2XP@}n+GpVls@^f!3KvK7
zm}!5!=-|yM8w0f8H`?NFYrhLmD(x{?KYj^Io4JTP*31NGAO2@v-r{d-AAnbU&(C()
zz5u^$%pA+6@>1`+pt<<#W7%P}#oyMx0AJ1bvpU#3cNeBGqutIthu)RBcYyXsMqB)C
z?GNAsN_z^Hz^~4)?Y5J}0PW+8UQNW`);=m+wZUvyHhw#dpH@v=?IPCcn!I7O#oyNM
zfsd(qWdb&Z-x!u|C)C?LmV*J>kBzqY+uD!d$=~E>^RPPnVpuwF#>uNv@3jE!Q~%tn
ziTK;vC*V0sy8yPs*tw3S^QH~DbJj}M>6+X$+Tw3(zX=~c?q~h5G5m(Gbl##Zd5;EY
zKQ-FoZ)<-JuRrEzE3g=Tp<V6v^qglqK>O??uO{MeYrh0vR>v|5TW9=~=SOx;I>@`0
zb-E_+8g2c&=^gm^&-vqxz=qK6-PM-7-F7k=pnc$9czKJzo%b_*^HtuPun>NEn5nm`
zE%nYqN9TRPXp6tC{R(^|&(Er1ON`C4So&DfzAuV)Yk>9#MqB)C?f2lrO8XwH7r!`m
z06llEmLOJu?vW+09<56EAUvvc9auhoS=e)~ZV$h9#5Fuz*P_GdW-Hwb@F~@|*1_hP
zOH5(u_91<%w2xGP_D4or{OwwN03Y~SKbwNNzm<rubqw2)b`}G)kN-=r9(79lsBm@N
zlnu+qZ-;w=>0?O0?e%}G(=~a+Xp6s{cMn|u#zF!%h2NN^O--aE&R~G{W1}tpw)P|O
zRpa|StPa1JrOm@uH*eXG^!q{*Ybzku%b&P4+8<G|PQypP?T;UUZM;CgLJm3Tspsy^
z{GI|M2y=A&c4$<R`+jVb1zCO@UT-wN#ov9vTil11-<#avvk#V!EnxDl$q-zAld}42
z)DGLirqR&H;?TeHj<cRa?1zth&%Y1bY+uMe<Y3b7+vN8|_{_#v>$Vu(Tzn;eQgT1f
z!}Xk^2DVtpeRoXe%kus{{khW`p!<Q*oyFJIeGguE$j|P<dhv^6nds7K6&GI#yW_-M
zfcDX4uNLBOYafCyP;{GR!16>F`-agDUnwpY@qE@*@oD?_7;W*l`DJ)gX*a@Ve}VTB
zv2UW&)Xi(*Z6WTSG&y|%+K-I3_}khK;frcqo`J>jtHHjGw$!?~yu?`w(0%b=d-bSQ
zy2s$XN;e0Vh2J(~^+7A=vN&y?b-ErmjJEjOIbVl2X8BnXHi_RT#wzn2Wu5>uc=sZP
z1N1*P`nQz+r^1!~BCH0#3hbwi{_gu1C86~5!tH=KXFv676jE_sg5Toc+4~lStuwYR
zVF!rQ-P%wpZ|T**b$#v{T|I}m4c}2?|1fMAO?f}m)$R0j?*-_7VRZZPwR3w4*K>+>
zSQNhkOSgo4oqw}8-&bI)*;lA^UxpWE`(stYRtxB>FgbU44X3=gxTO0M%hy?#Z8Yez
ze7~LB%es73(~n^icM>goe?Ejysy=uc){0*(Hpg=B^Zsp)u6SB;rF96dzmuE+%fl}d
z%Vv3nBr+rs^Vwpwa+TJ3ct1OB`_{k~f1dsiBW2(Fg=J+SC&~IK8^|_v=}rzoWce<9
zjcsIEY9;5!1XhoYvrf0EXE0Xn8!+3XbA1Be$n@v52CKlY5R-m_$L8+aT08JjmH)Ya
z>**FL-81kJrCR}8qwb4X70X-s{<!ZLtyOhw-iB8xtwGo={06Z1SgtVZ`EUx^xdlGu
zH<=4Cc^7F8lXX&`Inm=&YL~9V(Z}Zaqi6F&@WG>gmI2GdFB6mc@D5H%3GZ>_!L@Fe
z(alo2E%1=it%uEhk7rg``dp4&Wd0YR`;pPr^T-e2c_d};!xSumW*sK^O1~c|E^!tD
zbf5osULET3wKa~w`+vsIvS2y*IhHPSKc@h$>u}xZW+>ebc%jmb!)Ee%PZ5(kbTo&<
z#oR;a3($RJbSLn&^L+?k%=NPwSRB6^TbIfEQh@G@t6m*ymF_Y4%yWL-99R~9+l-6+
zZ})u@_Z>btbr`G~u3k&O&KJX9*XK5jP4}%KVDHCU@M;xr05*!>ASUz4uikHQ<1uEr
z-<I3kpKy=enGJ}2<lnoo_qVFp2jN2uigrE@EFVAl4a2~>Y>E4mc1|IFeJ1by<T!<h
z=)4LaHp|jy6|mfjLm|sDkCk(G3md@h!FMqEjrDPCiuL_$uWR=Sj8*&Ysq6WV;0s6m
z`Om@{@vFv2&z;Be2*+6t&^_^)r(2_RpNFTEZZ0enzb(dRPq#SCt$!y3*ZJNsy4z|j
zzYfn+x=Gk1expX0?_BU^7soz8_p#9(Q@W4fnbc3}Pg&j1c~~8OF^nNvwIS~qZUn@5
z=|6ZisZcS>;PP&$9is@gmFKv>n@LxvV(gw0#V^n|;j8O(-^5x(!`8h6kE&R=V1xLj
zu)X`^z83Gy1jIPJ=Iu{H#W(<8qF8p^9kwqJeN6h%7G5Xo=soG=;Hz`LY;<#!?yK-V
zrCSSI;F^6J6WxwftE-y`(EZTp&Zukr_u+EwXXiTsOX1gOblq`^ImKLn?$Q6~)gi8Q
z55Z+DwRJOKdH7}ebW5B(xUNH&(alo2E%21mt%uFgP$n^{gX9}2D|S-w8s+<u(VbFb
z=m&60?#p)ymcXwLquDqm=6-Zjs7!9%xbF!1zYo{m8E;NF2R~=bGy1Y!Cg$b=kE*fd
zJdD-*3RFI4;RUMyj=@$K^X9P{mUYeU8m(nDF5ZSOb0F+vF$^2VuMd;4vbU|Nv%Sqb
z2kh^v@J@Sz)y^#b`nREu{3kE>Ru%6cyjn2_mJiDkW4Y+CKc5RmD^G3r3OrX`qgTV0
zsN*bF$8v?zx^J}fIDQ8{rRskRCcm$k!p2$F{oN<lHgX(?IiL}2nR7<YM|o#`2HuEC
zeb@2V$K~|8SEou9I}9II*YSn0ZN|toGk#ucyV8lQQwi6)H;wMP8f)K#N0n|rYz)8j
zxZK^`o)q0%0lH6&?y%DR48F?XYqz6i*&e?LCiO{ncDJSY9;#?>1!%wgpS@a$zpZl`
zKA^NCunoqwWlZi5v|ml6+7eCO^4<3u*6DiOHQM5DYu|<!D(zv|Fn)cQjJxg4ZP(=?
z#=QXDFN|)#(tRqrs%@^r<hPj%jBlc;rwLid`SIR-UxBe^U!l@{87}Xk+WSxmlizz@
zz}$I4N2<A(jq3t*?;G7kd~J<8@Lts}ZovleOBvq_KkB+F>2kT|-OEfx-9dd8zmU2%
zKJs78`LE&}gv-B~W5;%2`S@jtv8?;$3q~tXX}tp1{c<%-eqV7GbNl6x*Y?c4*Qn&5
z#7DP}hbG1}+u6B2fDZy}HVI4OSC2`%HrnN)y%3=N{DxNx@wau3z%x|a%Yx<LClg^-
zrM1h_|6WHC>visL82z2^`t^I@<EpJBU{m;wVOH*ClKWtQ_G6<h{&wz<;F}28Y#vsJ
zUktNy4>z<YOGJM?K>zfA^=eb8^uuuZj<l^`2-~JVU&E~2!=fEyT@|0U|81i!{x*LL
zKBU^<0BjV$L2NhoQgXlO>W>HLKQsCRO8*I5e}8igCSy?{wwrsYtL;45oBJy;c4wdX
z+qu6C?`0xmvr5?NcW8T<r7cr>r#?XYJ)<rDw$5GnC}Wn*MqvZ^C9ysClKB1cnE?I6
z|IN!iq4W>HbLc#5{T;S1z%M&UpYOuvv5h{iT}D4k>9@e;U4C1?9yXWF{14lcKU4U=
z0R2Zse}Zjnt%vYF)y8IEar|n6^ywW}SYM;I32%D!saE>O;krM|gJs~iN%GR4@pC78
z_k*V<G5mC$Zksr3s&9S^KBU^v08E}S4PyMyf6uTu>Wl~IKQsCRO8*JGNa?S^D)1`|
z)Gu{1{`=ni&%;P%UxCs;3tv#jIR=yOaLDf=;I8ryJMjSh2S$HRjnntx38g;{>%*@#
zP@n6c0R5v+z5L@!{}8-UT{~yM^6<+H&@bgYC}906C%4@muN(agrQac3>BnI+jPLib
zB+I(3JTzL<Y7X!KzC}<w?j%f}G0FE>bgfH^!_Fe>^?lmo|HG?Aozg!F*L`I+EE~Ta
zI^sN*b*^1TD@$p$!0S0l>|E<%bLdQBdvfLXCi?>P9~u1#wz0Jy!q-(@XJB#sY6A76
z&WdcK&V}%nR}1;~G;ICj@G8~+=D}o+u*o=H$g=!4hrP|~MoR`Qn|FyX=Yh>yVbkcy
zoFK)r?#u4N*3^7q6sGI{6ej28kn;T;9#QpKfyM9(VR9|${Wea}vEcX5{wF!9{d*NA
zvGjAAb8y{PR>77T*XJ;|uZ++}VY>eB8O;SX2e=!cISL!VFPYY4mNyAoQrjN*iI=DR
zyC`<ep9N@c!b14vr8Sv-Q8s?ZChTOPGp;mF!_vA;lN{+==P71f3W|*~>5pPd1Wsbv
z*d&7Uv?Y07zd<qN`F#;4?Q5CMl9=S#hqgYxkGy@^KcU9zhwx#=W?*sHJ#=N-s@_BB
zEekVA;J>wiIH&%XTeJQ0-8{(!3HS2^yx%ORSpjS(lWSdo)Ii1Ip9c#$Rru7X?cOml
zbl>?lT(_S=*eznLpy9Tk(%@L~U9qWvSO>Sg{Yjx=@6Q(jv9@6HeL*=6ZY-`$g7&MJ
zAHK;)w^8j^kBOD3VqJz8Dc?retU0eZ4V@pJ{f!^i)cpFn+Fy;ivE>W090GGq#~Z0E
zkFZUZ()qC$bD#WsruP0lhU+=$6091(D7LE|jf(bWfcBaH?VUg3Z)=}|>t|0P*e30M
z1+%od3rTxtozDA?(H4JO`zCx=u_2iJTNM3%ZTDvHy#VbmjJEjO+E3wfe%fpu7KIh~
zw9DPRGoJ0u`#g+Y+9&?D&RKY-VlmhXeSmy_bhjpDVY%RK4A6exXp6tC{Vse$u`yUb
zETOdNUY(f$?Zf}a%Uk?y?E~-{#dg?U?qz0Ud&Xy~p2s%&cy<~6ETz=~Pru)k?l<aT
zb0jf|?a}Al>I=|+Wc2m#FFu41qhs&W3{1XrToa@pDsxr>^us$|eX5n#ad=3vJXi*7
zlYV?p{yZ10V7;!-+eUwbZP@gye)R@?QL#RlJO>*N(3d`SG(i8U(H~M;pTm=it-xZi
zP>?=7%}#*+x&Q0cr$}j?foCgL0b8R3UBq;KB3!rBv%XMm^A=2=0ZMJ?Zk$4v<@b|P
zn7(H_im$HkQ<#)h##Fn$pTm=it-xZi5XLF%ymmFw*>Q>cp8>irf8puMJ!D(=G+h2|
zG@C_W@}8dDYnFX5KbYlI!*#xQjP44)w#H5PtYSm35m-N#iEjEiFpJK;0PQb~w)orH
zPvLRJ)?rbYJX7M3@SE=~&Q5^lxi8J}SKFU~XDe0#TSIFRb2TqDJ2fy}gS$pk=00|w
zx8XC24a0_EeV81x)~ij|n%mfa__XDl&9dBm&ZbgdX9CW&P5#@L;{Vby(mq)($#i|H
zt@TXX*;>|Sy7kUA>!c6g9i_d0Q_eh3x|F<nf`8g7<^I+Y&BEcQog5w!rGJ01sUzOi
z*%oi;j$diuAG&aVn!=4P@BQKC&NGZwdsD~NmZpoTt6khU-*f)ZzexVOaT~mcSTLz|
Y3BNug`I_~e$!31Mt)aPlH>%_OKkBwMiU0rr

literal 30028
zcmZ{t0aRUAy4N=uCc|WS8K#4IOdqeqL|*2$j~DN~T;Qe;`<TkBG4lGbkBPj90g?-W
z<N|l#hR`fl7psfal~plfjS(Y`7%|3*F;=Wtv0}xD5i7<Rv0}uC6)Q%pSYyPFzwexV
z%f27SuKDJk{?2#)-?zWB&))l-d$`Rv^3PtAXL9*}!~YxeH7xnd#x&;p|9<a}{Oo6I
zX@C2d*thj9V<xc%EEk*JXG{Vs#OA+gOd2c2?&ldZjK#3EZy0k0tHo9cG>X+=n_o9(
z0;|DR2tI%nV$&p1kFD=9W(+%uJ;^cVCRU4Wk;f#~fZ=W$M}GFR;iSLKZjar9v6-{f
zm(8!kCr)RxG1w*i(%2;1n@a1c)f!e>kKqkUYYkS7UkKZj<GQkAz4Ax>yy8mhMR=al
z3d1&gsSQ@mcFD<Xt#_=}2K%KBnYO<LpFNe$24R!<jbfMBK5FZfIp(3&nowHz;3=iG
z0Bgjr7NeV(uGVB{^O>ru=C0F6_*b*e{-r;h9si}ve!FXweiUAy^h2;GKcU}XF}7Eg
z*4tKVi~Vl9-+<4zX0tSG3coRIi0v(<^}uS~R9bi814?TS)`DLhw#as_Iv)rBnBT@G
zrF8&arnK^5h4>k)i0yhd-L`aFtpcTW3SO*y8(|A6t{aR?#gs>l;S$W8mq?iHL45Q%
zzh`4iahx0b4t(klWwTjW0>64}=bTr@Ip;Ct%}RiNbkwg$ozg!9pZ;;SelaW;zs(-Q
zc$CS#GA`P2_UYU&TW#@obAJ=Qc_Nz)z;5Dq1>4EJLi($h`&fYfW2--^^dG@nnzQwn
z<#_yJyY<T@|0e<Zul{krHesdzGCZvEkHglx8B;NpKWCcpkNvtnx2*mW$GG*m4xdo{
zb_{k2zcjW}`-&>jo@U>&>I?hc@pBh{SNltRYnApEtPH;*%y8U>(mHOnLQ3lxJoMYy
z`Zcgs#?wWtj_qxwb=_*Ms4@2{d`eySBd{y@4Pj$!=Qm}?dTg~umDVG8i_%(#9mOw(
zt;se`b?frtpYZEdtF#Wo3zb$WY&*#qihUcM)<kbxcTaOD-SvEPWvtxPvVTss`<m5X
z;}|#YW%#se*D!1xzf0IC+xL~$r&eoBX?+6kP+Ci{di<)f6}H!vR`iO!{*~4tc(Kwd
zhUMb7d5*bLw$=VOtd>z)z3?R!w*xkV&Loy*yGUu>wOZ5cckA^bd{}8s!;<(lVsmVl
zDXjy4(r;sj(%J`)D=h;n!7pEo?HZ+Z%4!uWt>f?wH73--?w=(mtdZ>orFF|{EwSHi
z<8}Ci(i($Z!Y_?YvaQeaQ>!(sv>wA7l-3%o8ov;>$@YTE>y@j1z2ZviMR=al3d1%#
z>04Me+o87Xw!CAtHrVgB@h$kQ8aoDIllYBdm)KrWc|EjR6H4nIJf*Z2V2$|IV)xnJ
zR9Y|mDZh;kO6xgzsnRNh<>U9H!<Z7bCllFi>9<;WO6x3qxiQ<f1vZDy6qaOrMrnOy
zwPxAx*6V%vh|-#ZrSWUQX4qa-$Lt^T+n7>(54=WcJ>mEeeudaU_E+0u<ep>w7r7@W
z=JOYb^(=kxdwy<Z_${e<_9SfdH~js-%CTEToKLI_TljhILokDlA7IUZt^N*UMzAn;
zzmS?>dD!H4*^ezfV@wB@k8KptFR)T<k%2dX<zct?bDx1d`8gO?m(LjgEo>k6E7<xs
zSzBR6*zG)gu_xT;jAEtOG>xpso^X#diB)5(6n_{i!lt;-slirRxD8@O*ffiBxzD*z
zb6662=c@Wo2doicBh=45mv6!-XBHcP-E3$5gOQR+Rh38DqhUj{%*AwD^QmNOqOCa@
z?>$eB_|~f9AHYggoX=qE%5N1`LyR))J6YPT<;}^?G?%Z*y|%j@m9Tk|cH40brrS{i
zTSa6M^V)HwJlb9n-W@9%rggM4Al7wQql)!@K&&a4B;4Z0LX~dg)QomT!;vEtZY3br
z{y)RGO}}>Abr7c8RRqf;P91B3ucO~_q5s_Ld`cVHYEFiam}-3L`FB^V8#bc!-we<n
zfZa^c$D6*=KTSAa|1@Vl8a3kq`X9r}l>Vat{bkru^hdtZ@9WO!v(=m&h?;GD>Qw%R
z#~D|+=G`{E4Ab?A!`8VszK?OKm^0}fFZaH#=5+K(g=u7egVKH*mQvb34bZ*<8$nmr
z5L~+EbZ^gv?xfdMy?-Z0_mRplJ({^TAkOEoJQZi}pWWStZTw1!(_!PBjlUj=XU91k
zkA%-ioDj$B_MCuC*s+nJ!K+z!K%4|j?){C8)0b}Tjb_I=bELAuYtJaYO_A*Se*}xG
zI1d8iEW#RyGj_ti*JPR6+GPW+v#-|Xn923opK|RoPP^y&Ff6V7N@3frjEU&V`Jn!-
zmEj7nE;a1a*FZ1qlG1$(ru)z!Y!cn|-<Bs{tg9<iUR9hJUQH%N@A>RJAHYggjL!mM
ztio!DF-#2Ur>*5-vjx+&IP~XOyD?9>wRjPBg~VJIhHbWRUxA*C1-)(k{mI_f%~80%
zmM+4El;&lauEQ{F9L?26KgZYml3tyA+Y()!&3)ee<V}2Z|N8`1qVyjJ=&!-5(H}ye
zDbe&KJNw%%^z<ayX8NQJ5+~l;oH*agQr;9!?2h#cY?}Got;GqLZc9B()(C}eEIZH0
zdCkOP(+uLP`_3&`or?2OK%5y^2XUtUpr2E!(;MELx>a%3sVRQ7e7bo*^XD16Sbw<e
z5NuKT6~l6gag-R6d#Wcb<0h+Q)++4Nx@TddO7{(zzMj&sDelQPkGr}NaWkXr(>3`B
z7FW6tV7m5;um*I;(3QT~aX~WfaB`1$HQ5S?ap*ezzc9NdFT!+;Fl>`KF-8o3UQC^q
z#3deE%v(oH13nG>yIbG4U}>d)EkJ)1Hj4fr#!}8CQpx7NRBz9Df=J)(?m5@l+FX8E
zE{a*vRB;}|@@*WJZ6;>-@BItA+p>*sDRDY}?B|;7?DFap#dn;8-TXtaCrzvs@s%}N
zS5La9`An$l`Q}uz^>z9P`*nY~2peK?;_6(6)hO*@*f`p&4aP9FJK8;|bY~Y&8V1sl
zBNfp^@_BO`-!<E&9Q*kQR%GKytT8NsW!k=Ha(COGa7>7pNn*-*>^a@m+Rf+T0N+xW
zJlCnQzGvIgZN2!O2=J|k$-G^Nubi*er03l%+HVK=55XqzU;8b;4_SY=#+8{GPvKjO
zj<hk;_J^X$y@Ja=hsAJk**Z+_{RfFt$np04q3snnS|P>dpMt;0J(N23W!MahXVH^d
z&0v=>hBy<#gty~Us*Y=d&C8rg%(vLq*U}(tlKtzi`MSLf%Tm|(0(@`62JoH5SFW|w
z9j)EnZAtSm!1q3^4&PC$D_;dj{rBD2or8gu;G4kLn;(;%{&Q--e^*LTO7R)u)2-)m
z*kpmv-0Q46AVvZv_x^?$GSBy&?``jHHRf7??<i~(-}`^S*0u9&Cx>`#_Rd*brJ2U3
z7Ut&jDXh@u!)fsLJ%tUZbFv8wqn}1!`e#>bFJpRnYZu%3e`&YwOR!n)Io(*V!Ag|w
zQJCEO7o#h4g|F++6^Q`dt1!8rb#>o`E!f;MZJ2<`ygkpPAm`ocnwbF2&tL^g^GjG-
zX>P&F&`hDZlWU{{H&bwP_x$rRY=%M4&Gk4;_vbp;edfv{bp5%3Y|(ULoRS!6{A&4h
z^=`oAp5A32z*5TZwlHE$|E6D$<m;{8oU;HYY}WBlVKR`on*0Cq?!0q_q1l3_pEs&#
zNxeON?)5_WX#Eqg2^LVUes_R=0yfKhX?FB`<a+e<d%C^+FX5A~`qT%o8mljP&0|-v
z1Qx?2?!$n1_hEI!8-3MpPghq*YyjVV@9u89ftBE!z*qFdmk(1J;9CaUV*ZO~e0^DS
zD!{i9wt#OwzH-eY+nk~^Tn_LZhK=L9`WyZjBq?~c>1rN`aeqI_{#rC7zsz;<09Klx
z%|3(azP<{pL3<c&9ub*jSMyJ*BH_dBXS&Tc`}8?I{8v~9;^@YD8K(Ps9Jb!Ta}{*O
z_jJF%-jp@VP7L}KemcfGunraDMnH^l*f22`xUrD)-*vjbLi%MUMy1yd`b9vDFJSU6
zfZL9JQ@h(?U?s#z5W{bW7sC5aOWHw<5Vi8=#xi_$TTa5pNyv@U8xW@hHp4u){rmlX
z-*Mq=UpH%0InT0{jFy{`0R0bP)k^<ffc^rk5&d!WrN5<q5(|eTe%;RYyxz>F*$B{o
z?yvIPnTf#7KMK?5I|O^e{8erB$v@E@^~W~Z@*W5gqaMFnK3%;xVFN10)qofyuq(tE
z!sHzDbS~MPtf&ZA@Sa70?kBJkrTaKQcMVpJW(cE8WOT#fXv&ydyYr3078tbMHoOAU
zHK~TJG2bm=vUcd?tKs?py*FV4O7Cib-U#dpTFbxBKZhL`n!}NEJpY92e)j-Ysx&_f
z&|HPppgD}D^tFx)9lfvHekWV<94z;*?bfe^&2xZTgJUpVgBsW>$(5q-&mA4!O@tS#
z%b%MP0s2>A38nvTfc^w*2>tn^eocB(u~SEwMMeK~sv|0$d_F+`3s|nw-*<a={svZp
zegb_z|I<ksLp}W@_jBQxDaA+E|2S+?JsUY2px*+UW8TaCO8?Zkq}6B3yKII7^xuco
zD*d|w`g1Uuw<mvZ=KQwy$~~{-FI#dyx)z{+;IGqnX|Q{KUx4ZJTLP1LyB2-FKCQ`=
z&7Un#zZRcbn5)$f8&>*n2j~yMCYU4FUh&UwYX#F}5BG5Jd_-N%yRg!j&t%QrVq?z3
zb^I@2xhnp?_jb3>z)FapAU;EhNw=NmtL@J2O16vHr_cMVuql#q^Ewrv+X!1=Zp_cL
ztNTn+Zf>|_O?TVC`M%~#DWe$@J(73z-h)+_`%Jj3i>CQ(!sOql+0P+a$9xKt_ta#6
zW{vO^rqAalEKFYMU;pYFio~N;Chu?T)_f5*$H3-lz6#TtwXhZD&k{$oihkD=pm`aV
zRGQZVG{<0<&|IXU-nE$Oh*U)@%<TZpN3bHL`9*-{1}ui=K$d2?$(?4s_@nH0R>Ec?
z*{n>Ms{2V;=2;0T$g2P*ZhbDYPuD61yUiRb^R)DZ)^tZ3Uq{H0W8Mw$pMVYFo6Poa
z7ynNJ{Fh+$_-5v5U*Dfk{d@lAQ0#fW`@^pWKlj|e@HZLr!#<PiOOlfPrSJy3o#|W0
zVUzred;D3LZf^^0?ikN0F=_9`o}R8|zRzIOd-j^-O_Khe=8H$dtuaEI(STSV!Qv{`
zgMe6zax8IbY%CtqN5jc-xs<*4R&1=V@HYPXp5yTQ<ommJY9QxR+8BcCvET%3!tT%1
zr5mPekbuoHKi;%+WPf>At9L)bvHhL2-hvVQI+WgruxiEb1;ki@HKJc@W1MU2zQ{0Y
zp25=X@&Nj9qOHG^Z`Qh7yV@92;Qq7d?jF8wWwi0%)8lJ56a8D9TXdw|458lrmh@>k
zX5}TGA@O+}AKlJ6*!`p2|Nh8d50xJdn>Pad(y%G~HfwypD1J8r{KjF!_$^q!$l-{&
z4XY;yx8FU2NnBwp70t@;X1#-zVOJ<#Ew=Tm^ewCmTX~T&01ILBN0>*jZ5lX+)ic<H
z`JAtUW83teG3+*l*bZ?#+biEE_fq`0NIHl&OVB3lNsKXYuQ3zwWr{VzzlX2|s+*AU
z92=*;Q*@T2e1_;}%ZGWEhQ+aBbmJUXMx2Fjp-qDpu@ss&XSiO~HSh_n@K^kH%h)5i
z%wbZ8W%eDl+nK(3_;35~N7OTyi%*6hgwK>^vm#g?ep|2j*Dc?9^&JVvPjQ%uu}^Ei
zZMDVU)qVrsq_oqpDg4H=v?HGOXn^*oR$KgC?N5X&?Il<}e$`ppm7X@GGOaQXH;4LK
zyZX*|pJ^UAo$O5YwKX4!hM#ZdQAGdw<hj=FW_sG;NEO|}lzhPYO7*vX7`yi5seI1D
zSLthRjxDe`=EYnzl5ESQ@9zI8Y(w_LGw0(9?B?&uW}m^t<%;rs2%n*!yT{*$)!`S%
zzRf}I`+8N3vRmwzch}ux4*eb0D=+x$>m0LyNq;KFFOSa@pJn*y^Zq7`-8tK}*?Rr(
z5|vjHHjUo|X6N*1Ij?jM2k3rcbtjeX$MAfcXQp2BuqOPDV&5d0Xfzy)L?e}*Y_A4r
zSI+u5i+`>pkICl?@T59_CE|<U2Is@oK7)TX`*c0tvD)JAYQF_<cs`pA!Y1(>#rB~`
z-c{rsZAE-6K>M-P7Jpa!5&TIgn=Q-n_{FeHtz$f6_&MVE@5b|FgO`nEa?Jg^yW4RB
z#;!f3D%Pvw%NXj$uZ69=%(Vh9<e*f~g|^=QUZ(jo=i8dkbRCJsB#cQ0#QDg^ndNv_
z^L_XX<FCu6U}^kj7`MN-J5HLu9p>w(^L^}F#&?KM_xODua?kU_Y-V8k93F>pXiiAw
zbPOJ{<87wT)WBAM#5j%RaZpz@!ae&D=^neTy~<eBr0^eMpPScpSe?pi45ssX4CBz8
zQDV8rKY~9wl+Bjqc>H3RoD*65_4XuNo7)m+82@-1DN1kj*I;gcV2Uz@e~&e)s{L6Q
zyY>{S*eBt$%$IIH4Y0);o~6s&%Jw=t-Tm*tnryz(ADJS(zo(J;ZY_uT5PR}9fBm_-
zhq)Y^$EOG$f)8OcXz2RvpY!XJBDNcE54^^%YmxBwJ>mEeeubF5J|gj|DpP>3)@`@C
zMN0Psyy%y+W7Wf!evLH&CgX+weQlhT3;jPp`y;C@{%+3i!)MhvHw8=M*MdpTy<G#5
zBT=4z1n55d_x)NVl<t0bLh0thit)>Hb*mz#7_Mv4YjyLLZaaL1HH_PaCfGdl-fc|s
z<t2+F&3$bq9iV&H>Q1Y<_Cxrv(w&AS@oU7S#-6Ty$6_f!`_M;zEyUl=?^$@G8j}lP
z1^8_<cV+s;kw}#Z!F4U(u)0R+_6k?J9Wa@vC#|k`?s#@J6rlUS>fTiI=Uw>CP<9*U
zU@iF7IlA$Rh*=KMJ^T;+I@BxOgYXhHt`))Z@Y`ZO%%jGv|IWnYRo!fw82fZB-nQD}
z@7DMY_|^}zSsFHl-xwz6&$|bT94VKvXEZ?jQ>!ihuJ$MJ4!!<^)#F!<eHXp2#=Rcy
zvfI!1b@I*ARzQqn^L}k&D#lCj6%upf#$X%Fb<5aSeI#DKd%TZSnihO@t={)z?OEV>
zSNC1fRk0>uL-@_Z1LoGYc)8rMv0e*^^V~o5+O(%d#d!uktm5Rsituat|HcW+?YRl#
zmv|;S?^Cdm5}(O9x4@#aXfIa4{TE~P3gdSN1Mkvz7#I1RV1c<t!^iQR&FBB%<M=nq
zSkFSJg-M}sKi44p?z3C|j&p)e7^c`u_{noUc@J9dkxKBHVt+2N3ej06UJ<^F*btp-
z7@bnA1I=oD;vBP#e;)f5(a9x`E5sG;L2?`BGcFo;cAvLy80q8`u>Nq*+bMX#FJ-ev
z*aGw4G)5}kJA=`1MIW0c9iV;JYKy<C{ULn1G@DJslK3@Za{b8rh_P@*dl%bF0osTD
zk>3x*-_?E=-l((-U@~tva3V8#N5awbl6M*Vj`Hbhyy<J_$o;d+`{CQaGn*x0)A&tb
z8EqCK=S2Hbfc8VHE&i_dJ@|yuUVt^?SG%K~E)Q3Ed9MX%zqsI^OYwKL55sGfb}4L|
z_0T$&sdqW=f2PU1hJCswSFN`AyV`HV*AHg1A=m_dBN*M$^quaHQGPQPp!?YBjw;<p
z@KL3^EXU&)!)(rd^uI(B?X3XqWB=H%hxofWzXXpd?HFu>75y@n$ypZob#Psa>sEIK
z2UqtheB}q(Yy@@%zah-_k90+(!kcq$255h2wZ-4n{v1A}wAW#A{7Nxfk2B>*;_V`y
zdv|x<Ct&Q#5r0?vRd}h=u7$1e{AU4^`GD&&MXgO!fbM%%cM)G#_Xa%uE7@!uHjG~i
z+qqu+_l{=*wD<oLKWFiGwfDe_l=c&j58+qnXjhf{@01tfug~LIt1bSn_DT3`SvG5c
zEwZSe!JNF?DUC@5Xy38g;_qsI0B=#+w_z##nlPpUu9xO;l!0#{K=-*tzaA}0_ZfJe
z(#?Yvi63Tj?mkix_WE8a`*b}nT5a)nbMA&use6J1Y?gWTCYGs(j2}{up#bd%R$KgC
z?Yr=Lr9B60!LQEIt|$**5bc!!?dU)CYa;%x_96Jz3)!p~CeJK4Sz%=IM%$}NoPD|`
zm#wz=yV`HU$CUN}>?VF!u*~>WURl*I<I|M@?a!>X_`BK<;nhm}KCBMEIA*U$#*whu
z2+)1`W4|8NO1Bce`orv4Ww0&g#nqj;@eF;$>(@uwr|a=kt1bR+&hNm7mG&jr7=FV$
z+O5>u)1C~_ermPF-_?E$4=L?6ST%m39c}XV|K7s<v)y?ghjAKm#NX9E246gs&1zt)
ztPB@1+ehTO@$zm8(0<Qqi@&RV13q4o&BkHF_@#EV&sIf!?U?}W{Y!q{;_qtjfft3c
z*%OWr;a7-d=5$#@N#`!aU!TjfR$KgC?UV3XRqqDaBG2|`u#7fq5l=f6pnb<`i@&S=
z0sPYUv)OG}3cn`I*1NT;!v7nc#Q^Pt|J<*M_`BK%;AKiXA6AH;+0pJ|oRVdw39(Pt
z<PEDW{;qZ}eCfGt)&ZM2%>6HB^A>Hdp9}|Re`2-8-_`yYUZ?utJgf=7qdVGC6Hj|J
zK)dpuUlZ|nwO@d5s&#yc_~N(0{FynIqTNkRs@bP&@{ZLOe^>i0`0Oucvq9J-exq3C
zT(WLBFZCV^(0**S#oyI_1aDE=%W^z^F)Wieeb8I8ZUtx``xkyq#NX9^3BFRA&0??(
z=Fep;lecI~-u3L$HMwE6#oyJw2ESRH%|>CP_zmu8cTp2hdn!PC&wW2{@prYqz_(Q8
zy#b5iSAw}Uk#m`cj?Vj})fRtO`!)FHFJ!Z$uw~}vdCY1{-d;aR1ZaO?wZ-4nz6DRJ
z`;ST3AbuT~t+%Hw^_~mRKJYL7nux!vy$>E&c^g;>e)&7va_yWJ?PB)nn)F(2@prY`
z;VaK(vnJR)bMtL%FM8fuEy?|FfbLzZJFVvO58=Z~cN&(&uMx}Kf3Qq3ivhX^ANaLs
zQo09(mt^Og4=cpaV440QW2^L!5c_m3-mu!@@7AIhzVy9p)&ZM&p79TJ&S4kQW;j6m
z6RR!$uJ*_9I(7ds4{O5j=#Dmj=I4*Cs{z`T|H`k4_`BLKz>8GgCE|<UM(C@&rGH3C
zOf~y-P2REE;_qs|1;0{~%?4qU_>DT+JP+{3?L74DZ#c0EVeK3c>q|e@o*@<MbNKqt
zWye~F#qk>?heGt!cT{csohvgt-&rivUOJWJ&SN1HXG6AMf#0;6-{Rlrzs5R{&(R#l
zSd3xJdtYXS?KlU@J3*UREq)1(J<5K!9v?^?)>!ju?QjcTrgEQz4dT~<d3E9MyCdhK
zXE@%>1!y1m*S@y+yD|5{m-4fjftBEwk9`w829`+lTqGjm#q860_gZc7ceUH$X{Fr+
zn`f@RjY-W~`*`iGGt5(nRx=Qw{jt>+e^>h>_<gmWn1OZR*MNNkZFv_oQdMr20(1|3
z;@6{5={^e|RJsMQ0{pgF2kdilu1xU!i+#EtZ(42fcXRHCCup?GlCWv~CNN56{|1X!
zSeyO%Z6rYdk<}kp`VZjx{q{vz1AaBw&s+VS@BGTcUp@a0h;!`U_%({DI4{BV`|UB<
zMk(_@wwE}49nBT;_C_6C*XO#`)oX~W!qu8(1a<|@p^R>iuR9r_`_$?V<C{wkTz1~)
zW4K;Ztih`B3pu*w<hvc9``U_Mhcc!63cT#w*|Dl&Ys}S4m|Q!2^;;E*l=sQC)6c#_
ztHCf}8f<<k_~H+;b?(5}oil?@u54J}58(Rmj&8$J_%&g(Z0mYI^KX4Ey@uEem)}2h
z^|v{`6rBQ0t|7i6@xDE$nOX0QR91#fh~wpZS2<qVGlR+YNq8B@XLzQajqn<Z;>NoL
z<B*)CgW3E#Jg#yZgI&TejZLyWR+4?pQ>!(sv>wAZXp9?o4OWd`2;2XV`&GMN+PsLR
zu<AWLL!$77n0!|u#~0(P>)Q)sS5Cf)+YS$_bxIR#{s)YIn0KAfpJS(KQ-JPWt2>Ra
zoBN0GV%45$SQ5WROxhr0Uo28?76Wt-{yRV4CZ&4-KBVUQd{`lV#?fURVM^h;4i~L%
zuF~y>hm>vtHe13Rj!7N5+oI73v-)6w?meqJg|AzOJMg8VY&Hu^;8*YJvJku<p!>qA
zUxx;z`y71s>)E=6uzdWUeBT)UxAPsM_wIyTA`IBg=PzVlqo~DS*XJ6HLvz*;aNBVi
zUZ>&>!^ZKGdwp4(eD$5B7msn<dn11L`y@QCHFE*65Bz&C_MQ$Edmns+iNVdsz)J9w
z-zf@Q7nOTITVq1_4cff9RLnFYqH_{HYPV%hD`mR_hY;I?SSW|@8?a&QCVU%{-%*{!
zrrAHl@w#>oVC>4dspjUp@Wlh!`Om>x@T<c}&vdeW;VT`p9H4vnk*`~?bPvMQO1B7>
zhu;=+l&>3!@>PHd!*#xIS=}dUKD!7nR=O$JZT!ZqZhthwxAmO+0NsaHcS7mjgXd8{
zsXt})J{Mq(_|;+su<SnMpTmuS7%%?^zb4fxMkRcbbLGY;gKe?UmEXo<s8ca^E{VwV
zj7|9JI^D9dmN?$ky$+W(v&+U{m+(ttyW2y@=bzbt7|*Qv?MbQ_d*SyfmK$T6<4Z*!
zlX0}2R|L8Tj+jDxb?#@aZlTgW377AUUEK!QBKO%dnCNz=J3QTFfbK_DcUIk(zYo`c
zPd^1q<JV$!d55IDoVj^EK=;}I=+_~kboaw$&U5q4g%#tMm!(^7is8Bry;e70>9)hu
zO1B9%Pe-|pNgX8LSY^bd;q}V*uGO7ZbLfX~N}iSPG%Sf<BSyC|<y=70p4M=sJO=RI
z`^bLhy{jj|Zw2i8lsV7p%W-*_mj^tq=9UvMcIT9;d|ri@s`0xPw!)mZfYr0DYj(qG
zEvtF)8hn|=-E%Pt8^v!Bleu!Bv$dzI)4vAX_jq}iFv)H+hrj;)hXen~&%Hy%+Xt^x
z%)m-u`C@FBJ(Hc!DXUeijyn!7+Mn%P2fI%l=deb$tCiL*tEK1h>+oq+|1p^ShEf`v
zWLuAS51eD<JdScgW7sm+qFj%~*etvSllrdXug}X%>wcYTRO~2xRNaGzU{9DM*X;ax
zq4Qi1vZe;Eb+208bv4(%4Ua3`A=m_dnR&Udtt%zEV*$F4t?sDOeFR@+@^$;svK)_J
z43qk#dipxk{H=*-Zv|){`_FzY#NX9<2|lc}W3Ub8wPj51k-N?()1ArIKKc8$diLph
z+_2i>?`mIzhm`gxY!tsiOy=FLw$6+4K>TKa?w3|~Na=nqx~gxk!{qlDORaCRwZ9cv
zV?NuR?{OG=azaY?7`#;7)7QY{Hysx-Z=KMcZX4j>#sJ-0R(A<sw+`3g1FBz)!7kyK
zw!Wu+(tBRg<(A5SE|!jallmNfVRc`6;J?`GU&Yx6m*2&AV;fire)(c->v8#%)hbq6
z$KiThu7k;MD9vHsxE%KTo_&rNm;6)s=>BoX#+czaH@6SqmjEui4NKwIgh{`)+Et>x
z7@&P{!>@(-yE+HpxvKBw!wT_}g)qA^+Lf8#<St{s&izfRzx~~8{eJkQ>MKduG=3A9
zlY6D)eknlvq16_DH}`w+O$1!F0Bgjr7ISisHg~1UMSnd&|E2%x*QQ44N8xKqKLmTi
zc)o@?xkp92mVHP0bo04pwZ-4%m*FF-4-Uh|@w<fW<X%DUr#=120R5*{e^}{1hBwgp
z+?Z=HnTtZ$PVN<+wt2ie_v0{{lOz7F&N27^3mKQyz*fIQ-@_bjS<;)P0PXjzw)nd`
zH{j#USuPug4da)>c0H5fH%4az^q=`}e(p)7zZYJ};Nj|Tb9^a&1ws1!olr5y==0iZ
z_4Ac}JNz~a7FWLsHebN{58IVLOZdS6{d-n_iep@@JMcl($7W#({OW`B868*HU$2gd
zZu<49Q~HPCdORzJ$$O=nBroF`f6~0W9g!oZ7C&95Yc|fB8k;Y}M^qmghK=KQ3FCj+
z&)&G3oea=_YW0Ve{$qHV(qDsB;};6luQ0j)eRuvRV5E{$s`OulFRJrg3tM5nUBGZx
z`A1D6K>q`)Kd<KLTkxdPpM(wK*Ab}C{ZD}Yv!DC<CzSqvc#FDs&V?1@mlvR4!F5o|
z{#7n+w?AIA`ngKKTe#9sz-F1><@=))+q$pZv05`~4e$Xx^K6Bo!Ha(zmcp+I)3vUM
zM9mWWuc%xP{SUtujY|JnxE?DDU<LTeyVu2R>s))SR=(0|hc~Hlw+S|n&TVX0uKcFd
zV1WKTt3SmtZoTiohgDr?VF~=|1NGx(h5hU5T8M7>wUFP;cJ&XzkE-#v7$$3kP3G|s
z+uKU(4XY)SmaEk(zV>`&&RYj;1|3-wq}kSE*$vp5S}%;lbp1bv$#prRd_RT9RDD)p
zwfKcGxtH|+{ypei#7y4*Bq!CrlQ4-@qGG)U*JI^Tn7mg%k9lKdj6Mp}^?%Q5E~+)a
zjR4JYnEX9JDx=9N?>6kdI(F|DexCB%=k9rU8lbre3*%Ru(PZ^S*^K<6W<ENTO5-J1
zMwexhk+F4wVkV@Z*f@*+ICh`FDXajyji7v&P{-#6#TdcLFzH{b9G1c)&q1{H`Mu}2
zWzUqFtM9-^6`O@6U^mf~Wvlv5d!RDPB7uL_@T=!jVrKf|i~q~3*`8XZc^IzGZ7FP<
zF{Orsy>naPKVwnx_)md^O+CK4O*d?;Wi;FzuEA#&8-<O+?vtz3TgTwrQZtQ@ZqL3a
zetXhN?@N4jd$wR@_{n+j+QXemP^<{=jFzC=f}fjrzm1irVx5JTDc=^@oV~8NjLeT;
z{jDF@*Z<~g4ZmLN#g;#M;uP3>I^J$%`wGV#RXU&fF>~a<S?1>Q5U$sx_hEJT#jzdj
zcwDqM1GHcH-~RO@{;u|maQ)p=7`92jU%?!0o<h>!*{AcqZneeV)xHX!Q)~n#zc)RU
zt?fPRy&0hWrPUUHSNn6gT%Rsmhs9x~S=v=z-nmbA=Y0Z3lXAr0)p-@3r&uj)g)v|O
z+o?%qRBm`%0<_<^+T!nOzYCvIYyvg}ODb)KS2G)+{mlRI^A>+sdoR3Rv2Bi*XPE`q
zuKAg&7juk0pS@N;Uum_&Hwoy*Y=X@LZezRjxwZxa^zT`H{ToYn;G;@^7M6h32kD0^
z%}RiNbla~_ozgl44=Yv-lfMt!WE|g>Ki`X1vtQTe9jm{=F&z5Uc>5N7NwGngd=EAn
zpf6+Tc!2)rR)0ikeF{%0wgQv)kHSIvj5ON;`mg<8zdmJ3>lJu`V%4xU2GAu;*C)n(
zOB4I$@3q}JT!zVafWjDVOo;78=9M(2pV^M%tLytYOv);As;m1cJf+wQtQHo=xMa=i
z=aW6%XL$Y@pnL2~Uss+XySgvI<-dR8vKVZGeC1iQw88$Eh^d3?e6L&G6?|QdtMEC+
zMqpQ9Ls%ZVneXt}G&cjZzqH!o?`nSzPbjtyi^JqQB~A&y>)LL%12kXznLYpN_*dWs
zidDnb&|1Pg%`<JL9;Rz>!)h+$=jM40KC9R$Y!o(#$vNvd-+G~~llH@B9Nz-A<>_+)
zmHGw?aF%WIPfnWu%f!g|<a;E~^Qm+8vux*R*_-FpyU6a7F??s1_W!#v_WPtWsq=^V
z&sU_q-<Y6V7=BX2`!3Y(Z|zR>_H-ti`x57x`7vwn&yBt4^6ne%-a5njcC~h&Z*M)F
tKHtlO^Ig{u<BR0K6Svv_5(_4^F6S2oBwxF~C)LIe+cdZJ?L;-^{{ejuk8l70


From e2513c5f96a0504317960ad9a5df9dd381b278cb Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 22 Sep 2023 14:20:02 -0700
Subject: [PATCH 174/567] Update TFRT dependency to use revision
 http://github.com/tensorflow/runtime/commit/2b29f4d9ec1bcbfaabc1d552c8e7aac9dc2809a1.

PiperOrigin-RevId: 567716100
---
 third_party/tf_runtime/workspace.bzl                          | 4 ++--
 third_party/xla/third_party/tf_runtime/workspace.bzl          | 4 ++--
 .../xla/third_party/tsl/third_party/tf_runtime/workspace.bzl  | 4 ++--
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/third_party/tf_runtime/workspace.bzl b/third_party/tf_runtime/workspace.bzl
index 7fb69c7a3ed804..feaebb3685d3ba 100644
--- a/third_party/tf_runtime/workspace.bzl
+++ b/third_party/tf_runtime/workspace.bzl
@@ -6,8 +6,8 @@ def repo():
     """Imports TFRT."""
 
     # Attention: tools parse and update these lines.
-    TFRT_COMMIT = "3b7012849fde349a679c495f5bd421935d6599f9"
-    TFRT_SHA256 = "aff0363d40d564af9ad599f40b4b24b819e84549eb7ea9d5fb8875817c76a4bf"
+    TFRT_COMMIT = "2b29f4d9ec1bcbfaabc1d552c8e7aac9dc2809a1"
+    TFRT_SHA256 = "c767bf0e2da2a767a8a4d51fc0e1f314f7c46df6097b3ac03ca3612cdb5889c3"
 
     tf_http_archive(
         name = "tf_runtime",
diff --git a/third_party/xla/third_party/tf_runtime/workspace.bzl b/third_party/xla/third_party/tf_runtime/workspace.bzl
index 7fb69c7a3ed804..feaebb3685d3ba 100644
--- a/third_party/xla/third_party/tf_runtime/workspace.bzl
+++ b/third_party/xla/third_party/tf_runtime/workspace.bzl
@@ -6,8 +6,8 @@ def repo():
     """Imports TFRT."""
 
     # Attention: tools parse and update these lines.
-    TFRT_COMMIT = "3b7012849fde349a679c495f5bd421935d6599f9"
-    TFRT_SHA256 = "aff0363d40d564af9ad599f40b4b24b819e84549eb7ea9d5fb8875817c76a4bf"
+    TFRT_COMMIT = "2b29f4d9ec1bcbfaabc1d552c8e7aac9dc2809a1"
+    TFRT_SHA256 = "c767bf0e2da2a767a8a4d51fc0e1f314f7c46df6097b3ac03ca3612cdb5889c3"
 
     tf_http_archive(
         name = "tf_runtime",
diff --git a/third_party/xla/third_party/tsl/third_party/tf_runtime/workspace.bzl b/third_party/xla/third_party/tsl/third_party/tf_runtime/workspace.bzl
index 7fb69c7a3ed804..feaebb3685d3ba 100644
--- a/third_party/xla/third_party/tsl/third_party/tf_runtime/workspace.bzl
+++ b/third_party/xla/third_party/tsl/third_party/tf_runtime/workspace.bzl
@@ -6,8 +6,8 @@ def repo():
     """Imports TFRT."""
 
     # Attention: tools parse and update these lines.
-    TFRT_COMMIT = "3b7012849fde349a679c495f5bd421935d6599f9"
-    TFRT_SHA256 = "aff0363d40d564af9ad599f40b4b24b819e84549eb7ea9d5fb8875817c76a4bf"
+    TFRT_COMMIT = "2b29f4d9ec1bcbfaabc1d552c8e7aac9dc2809a1"
+    TFRT_SHA256 = "c767bf0e2da2a767a8a4d51fc0e1f314f7c46df6097b3ac03ca3612cdb5889c3"
 
     tf_http_archive(
         name = "tf_runtime",

From 11ead767cba2489056fc577fee350025b2fc768a Mon Sep 17 00:00:00 2001
From: Jake Harmon <jakeharmon@google.com>
Date: Fri, 22 Sep 2023 14:22:36 -0700
Subject: [PATCH 175/567] Fix build error on windows

PiperOrigin-RevId: 567716676
---
 tensorflow/tools/pip_package/build_pip_package.sh | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/tensorflow/tools/pip_package/build_pip_package.sh b/tensorflow/tools/pip_package/build_pip_package.sh
index 07f755ac6137cc..ef6f655ba6039b 100755
--- a/tensorflow/tools/pip_package/build_pip_package.sh
+++ b/tensorflow/tools/pip_package/build_pip_package.sh
@@ -259,8 +259,13 @@ function prepare_src() {
   # We copy from bazel-bin/tensorflow instead of bazel-bin/internal to copy
   # headers from TSL/XLA into tensorflow so that InstallHeaders can move
   # them back into tensorflow/include
-  cp -RLn bazel-bin/tensorflow/tools/pip_package/build_pip_package.runfiles/local_tsl/tsl/ ${TMPDIR}/tensorflow
-  cp -RLn bazel-bin/tensorflow/tools/pip_package/build_pip_package.runfiles/local_xla/xla/ ${TMPDIR}/tensorflow/compiler
+  if is_windows; then
+    cp -RLn bazel-bin/tensorflow/tools/pip_package/build_pip_package.exe.runfiles/local_tsl/tsl/ ${TMPDIR}/tensorflow
+    cp -RLn bazel-bin/tensorflow/tools/pip_package/build_pip_package.exe.runfiles/local_xla/xla/ ${TMPDIR}/tensorflow/compiler
+  else
+    cp -RLn bazel-bin/tensorflow/tools/pip_package/build_pip_package.runfiles/local_tsl/tsl/ ${TMPDIR}/tensorflow
+    cp -RLn bazel-bin/tensorflow/tools/pip_package/build_pip_package.runfiles/local_xla/xla/ ${TMPDIR}/tensorflow/compiler
+  fi
   # Fix the proto stubs
   if is_macos; then
     find ${TMPDIR}/tensorflow/ -name "*.py" -type f -exec sed -i '' 's/from tsl\./from tensorflow.tsl./' {} \;

From 5f7a3ed8d9672da3307865ec76102fead5296df8 Mon Sep 17 00:00:00 2001
From: Kanglan Tang <kanglan@google.com>
Date: Fri, 22 Sep 2023 14:42:34 -0700
Subject: [PATCH 176/567] Set the default clang version to 17 or 16

PiperOrigin-RevId: 567721179
---
 configure.py | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/configure.py b/configure.py
index e1e1326826e3be..cbb74beb00c22f 100644
--- a/configure.py
+++ b/configure.py
@@ -610,7 +610,9 @@ def set_clang_cuda_compiler_path(environ_cp):
   """Set CLANG_CUDA_COMPILER_PATH."""
   default_clang_path = '/usr/lib/llvm-17/bin/clang'
   if not os.path.exists(default_clang_path):
-    default_clang_path = which('clang') or ''
+    default_clang_path = '/usr/lib/llvm-16/bin/clang'
+    if not os.path.exists(default_clang_path):
+      default_clang_path = which('clang') or ''
 
   clang_cuda_compiler_path = prompt_loop_or_load_from_env(
       environ_cp,
@@ -819,9 +821,11 @@ def set_clang_compiler_path(environ_cp):
     string value for clang_compiler_path.
   """
   # Default path if clang-16 is installed by using apt-get install
-  default_clang_path = '/usr/lib/llvm-16/bin/clang'
+  default_clang_path = '/usr/lib/llvm-17/bin/clang'
   if not os.path.exists(default_clang_path):
-    default_clang_path = which('clang') or ''
+    default_clang_path = '/usr/lib/llvm-16/bin/clang'
+    if not os.path.exists(default_clang_path):
+      default_clang_path = which('clang') or ''
 
   clang_compiler_path = prompt_loop_or_load_from_env(
       environ_cp,

From 5f771ea1907d0974a2c78891de7c4f3fede8244c Mon Sep 17 00:00:00 2001
From: Berkin Ilbeyi <berkin@google.com>
Date: Fri, 22 Sep 2023 14:56:25 -0700
Subject: [PATCH 177/567] [XLA] Initialize tuple shapes of async-done in
 dataflow analysis.

PiperOrigin-RevId: 567724401
---
 third_party/xla/xla/python/xla_client.py      |  2 +-
 .../xla/xla/service/hlo_dataflow_analysis.cc  |  7 ++++-
 .../xla/service/hlo_dataflow_analysis_test.cc | 29 +++++++++++++++++++
 3 files changed, 36 insertions(+), 2 deletions(-)

diff --git a/third_party/xla/xla/python/xla_client.py b/third_party/xla/xla/python/xla_client.py
index 0c19224f787919..d6172122d20e4f 100644
--- a/third_party/xla/xla/python/xla_client.py
+++ b/third_party/xla/xla/python/xla_client.py
@@ -44,7 +44,7 @@
 
 # Just an internal arbitrary increasing number to help with backward-compatible
 # changes. In JAX, reference this via jax._src.lib.xla_extension_version.
-_version = 196
+_version = 197
 
 # Version number for MLIR:Python components.
 mlir_api_version = 54
diff --git a/third_party/xla/xla/service/hlo_dataflow_analysis.cc b/third_party/xla/xla/service/hlo_dataflow_analysis.cc
index f1bb8f07cbfd2d..1f72360f3b4bfd 100644
--- a/third_party/xla/xla/service/hlo_dataflow_analysis.cc
+++ b/third_party/xla/xla/service/hlo_dataflow_analysis.cc
@@ -1503,7 +1503,12 @@ Status HloDataflowAnalysis::InitializeInstructionValueSets() {
           });
           break;
         case HloOpcode::kAsyncDone:
-          // AsyncDone's output aliases its output.
+          // AsyncDone's output aliases its output. It defines all remaining
+          // tuple-shaped values.
+          define_all_values([&](const ShapeIndex& index) {
+            return ShapeUtil::GetSubshape(instruction->shape(), index)
+                .IsTuple();
+          });
           break;
         case HloOpcode::kCopyStart:
           // CopyStart produces a tuple of {destination buffer, aliased operand,
diff --git a/third_party/xla/xla/service/hlo_dataflow_analysis_test.cc b/third_party/xla/xla/service/hlo_dataflow_analysis_test.cc
index 98ce55b9d4ab5b..2a01eee2626286 100644
--- a/third_party/xla/xla/service/hlo_dataflow_analysis_test.cc
+++ b/third_party/xla/xla/service/hlo_dataflow_analysis_test.cc
@@ -1183,6 +1183,35 @@ ENTRY %main (a: f32[4096], b: f32[4096]) -> f32[4096] {
   }
 }
 
+TEST_P(HloDataflowAnalysisTest, TupleShapedAsyncOp) {
+  std::string hlo_str = R"(
+  HloModule module
+
+  ENTRY entry {
+    p0 = f32[2,3] parameter(0)
+    async-start = ((f32[2,3]), (f32[2,3], f32[2,3]), u32[]) custom-call-start(p0), custom_call_target="foo"
+    async-update = ((f32[2,3]), (f32[2,3], f32[2,3]), u32[]) custom-call-update(async-start), custom_call_target="foo"
+    ROOT async-done = (f32[2,3], f32[2,3]) custom-call-done(async-update), custom_call_target="foo"
+  }
+)";
+  TF_ASSERT_OK_AND_ASSIGN(
+      module_, ParseAndReturnVerifiedModule(hlo_str, GetModuleConfigForTest()));
+
+  bool ssa_form = GetParam();
+  const HloDataflowAnalysis& analysis = RunAnalysis(ssa_form);
+
+  const HloInstruction* async_start =
+      FindInstruction(module_.get(), "async-start");
+  const HloInstruction* async_update =
+      FindInstruction(module_.get(), "async-update");
+  const HloInstruction* async_done =
+      FindInstruction(module_.get(), "async-done");
+
+  EXPECT_TRUE(analysis.ValueIsDefinedAt(async_start, /*index=*/{1}));
+  EXPECT_TRUE(analysis.ValueIsDefinedAt(async_update, /*index=*/{1}));
+  EXPECT_TRUE(analysis.ValueIsDefinedAt(async_done));
+}
+
 TEST_P(HloDataflowAnalysisTest, SendAndSendDone) {
   // Test that a Send forwards its operand to the output tuple at {0}.
   auto builder = HloComputation::Builder(TestName());

From 44251d54e6c5f58e4909af0d948123d6634ec1ea Mon Sep 17 00:00:00 2001
From: Fangrui Song <maskray@google.com>
Date: Fri, 22 Sep 2023 15:13:02 -0700
Subject: [PATCH 178/567] Integrate LLVM at llvm/llvm-project@46d5d264fc66

Updates LLVM usage to match
[46d5d264fc66](https://github.com/llvm/llvm-project/commit/46d5d264fc66)

PiperOrigin-RevId: 567728405
---
 third_party/llvm/workspace.bzl | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/third_party/llvm/workspace.bzl b/third_party/llvm/workspace.bzl
index eb146861cd0cb1..f24443e14ea083 100644
--- a/third_party/llvm/workspace.bzl
+++ b/third_party/llvm/workspace.bzl
@@ -4,8 +4,8 @@ load("//third_party:repo.bzl", "tf_http_archive")
 
 def repo(name):
     """Imports LLVM."""
-    LLVM_COMMIT = "6e3827af98fa59d5147598972625a5317936c31f"
-    LLVM_SHA256 = "bda0e24e2b92f19d2929237101edc1f66fa64f5407d32cbabaf44f878ff0827c"
+    LLVM_COMMIT = "46d5d264fc66a017bbd0182b2b5fcc0f3f23d3be"
+    LLVM_SHA256 = "f31d546ecdcd07971f7f8f5f6f83ee2e101bf677975fe96d6ba837628eba7e24"
 
     tf_http_archive(
         name = name,

From 745a7fae7eafa9fb16849fd5ee9f24e0c2db3b7e Mon Sep 17 00:00:00 2001
From: Kanglan Tang <kanglan@google.com>
Date: Fri, 22 Sep 2023 15:21:10 -0700
Subject: [PATCH 179/567] Fix TF-TPU build flags

PiperOrigin-RevId: 567730240
---
 ci/official/envs/nightly_linux_x86_tpu_py310 | 2 +-
 ci/official/envs/nightly_linux_x86_tpu_py311 | 2 +-
 ci/official/envs/nightly_linux_x86_tpu_py39  | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/ci/official/envs/nightly_linux_x86_tpu_py310 b/ci/official/envs/nightly_linux_x86_tpu_py310
index 96e6967b3e774e..d802791f6c9f8d 100644
--- a/ci/official/envs/nightly_linux_x86_tpu_py310
+++ b/ci/official/envs/nightly_linux_x86_tpu_py310
@@ -1,7 +1,7 @@
 source ci/official/envs/ci_default
 source ci/official/envs/ci_nightly_uploads
 TFCI_BAZEL_BAZELRC_ARGS=(--bazelrc ./ci/official/bazelrcs/cpu.bazelrc)
-TFCI_BAZEL_COMMON_ARGS=(--config sigbuild_remote_cache_push --config resultstore --repo_env=TF_PYTHON_VERSION=3.10 --define=with_tpu_support=true --define=enable_mlir_bridge=true --config=opt --define=framework_shared_object=true --copt=-DLIBTPU_ON_GCE)
+TFCI_BAZEL_COMMON_ARGS=(--config sigbuild_remote_cache_push --config resultstore --repo_env=TF_PYTHON_VERSION=3.10 --config=tpu)
 TFCI_BUILD_PIP_PACKAGE_ARGS=(--tpu --nightly_flag)
 TFCI_DOCKER_IMAGE=tensorflow/build:latest-python3.10
 TFCI_NIGHTLY_UPDATE_VERSION_ENABLE=1
diff --git a/ci/official/envs/nightly_linux_x86_tpu_py311 b/ci/official/envs/nightly_linux_x86_tpu_py311
index 6e15a1ba0064ed..c797ad33cac6a7 100644
--- a/ci/official/envs/nightly_linux_x86_tpu_py311
+++ b/ci/official/envs/nightly_linux_x86_tpu_py311
@@ -1,7 +1,7 @@
 source ci/official/envs/ci_default
 source ci/official/envs/ci_nightly_uploads
 TFCI_BAZEL_BAZELRC_ARGS=(--bazelrc ./ci/official/bazelrcs/cpu.bazelrc)
-TFCI_BAZEL_COMMON_ARGS=(--config sigbuild_remote_cache_push --config resultstore --repo_env=TF_PYTHON_VERSION=3.11 --define=with_tpu_support=true --define=enable_mlir_bridge=true --config=opt --define=framework_shared_object=true --copt=-DLIBTPU_ON_GCE)
+TFCI_BAZEL_COMMON_ARGS=(--config sigbuild_remote_cache_push --config resultstore --repo_env=TF_PYTHON_VERSION=3.11 --config=tpu)
 TFCI_BUILD_PIP_PACKAGE_ARGS=(--tpu --nightly_flag)
 TFCI_DOCKER_IMAGE=tensorflow/build:latest-python3.11
 TFCI_NIGHTLY_UPDATE_VERSION_ENABLE=1
diff --git a/ci/official/envs/nightly_linux_x86_tpu_py39 b/ci/official/envs/nightly_linux_x86_tpu_py39
index fb40176dbb24b1..f738c2cb75c216 100644
--- a/ci/official/envs/nightly_linux_x86_tpu_py39
+++ b/ci/official/envs/nightly_linux_x86_tpu_py39
@@ -1,7 +1,7 @@
 source ci/official/envs/ci_default
 source ci/official/envs/ci_nightly_uploads
 TFCI_BAZEL_BAZELRC_ARGS=(--bazelrc ./ci/official/bazelrcs/cpu.bazelrc)
-TFCI_BAZEL_COMMON_ARGS=(--config sigbuild_remote_cache_push --config resultstore --repo_env=TF_PYTHON_VERSION=3.9 --define=with_tpu_support=true --define=enable_mlir_bridge=true --config=opt --define=framework_shared_object=true --copt=-DLIBTPU_ON_GCE)
+TFCI_BAZEL_COMMON_ARGS=(--config sigbuild_remote_cache_push --config resultstore --repo_env=TF_PYTHON_VERSION=3.9 --config=tpu)
 TFCI_BUILD_PIP_PACKAGE_ARGS=(--tpu --nightly_flag)
 TFCI_DOCKER_IMAGE=tensorflow/build:latest-python3.9
 TFCI_NIGHTLY_UPDATE_VERSION_ENABLE=1

From 38c38ded948b16381b21b60d8cb9a53952bc8f03 Mon Sep 17 00:00:00 2001
From: Yang Chen <yangchen@google.com>
Date: Fri, 22 Sep 2023 15:40:49 -0700
Subject: [PATCH 180/567] #tf-data-service Clean up log for distributed
 snapshot.

PiperOrigin-RevId: 567734740
---
 tensorflow/core/data/service/dispatcher_impl.cc       | 11 ++---------
 .../core/data/service/snapshot/snapshot_manager.cc    |  6 ------
 2 files changed, 2 insertions(+), 15 deletions(-)

diff --git a/tensorflow/core/data/service/dispatcher_impl.cc b/tensorflow/core/data/service/dispatcher_impl.cc
index d4070af6b565bb..89fcb140272c79 100644
--- a/tensorflow/core/data/service/dispatcher_impl.cc
+++ b/tensorflow/core/data/service/dispatcher_impl.cc
@@ -389,7 +389,6 @@ void DataServiceDispatcherImpl::ReportProcessingTimesFromActiveTasks(
 
 Status DataServiceDispatcherImpl::WorkerHeartbeat(
     const WorkerHeartbeatRequest* request, WorkerHeartbeatResponse* response) {
-  absl::Time start_time = absl::FromUnixMicros(env_->NowMicros());
   TF_RETURN_IF_ERROR(CheckStarted());
   VLOG(3) << "Received worker heartbeat request from worker "
           << request->worker_address();
@@ -443,9 +442,7 @@ Status DataServiceDispatcherImpl::WorkerHeartbeat(
   }
 
   VLOG(3) << "Finished worker heartbeat for worker at address "
-          << request->worker_address() << " in "
-          << (absl::ToDoubleSeconds(absl::FromUnixMicros(env_->NowMicros()) -
-                                    start_time));
+          << request->worker_address();
   return OkStatus();
 }
 
@@ -1179,7 +1176,6 @@ Status DataServiceDispatcherImpl::GetSnapshotStreams(
 Status DataServiceDispatcherImpl::GetSnapshotSplit(
     const GetSnapshotSplitRequest* request,
     GetSnapshotSplitResponse* response) {
-  absl::Time start_time = absl::FromUnixMicros(env_->NowMicros());
   TF_RETURN_IF_ERROR(CheckStarted());
 
   absl::flat_hash_map<std::string, std::unique_ptr<SnapshotManager>>::iterator
@@ -1193,10 +1189,7 @@ Status DataServiceDispatcherImpl::GetSnapshotSplit(
           request->base_path());
     }
   }
-  auto status = it->second->GetSnapshotSplit(*request, *response);
-  LOG(INFO) << "[tf.data snapshot] GetSnapshotSplit took "
-            << absl::FromUnixMicros(env_->NowMicros()) - start_time;
-  return status;
+  return it->second->GetSnapshotSplit(*request, *response);
 }
 
 Status DataServiceDispatcherImpl::DisableCompressionAtRuntime(
diff --git a/tensorflow/core/data/service/snapshot/snapshot_manager.cc b/tensorflow/core/data/service/snapshot/snapshot_manager.cc
index 7644ee8c6f6850..39d08b0cc2ae4e 100644
--- a/tensorflow/core/data/service/snapshot/snapshot_manager.cc
+++ b/tensorflow/core/data/service/snapshot/snapshot_manager.cc
@@ -580,10 +580,7 @@ absl::Status SnapshotManager::GetSnapshotSplit(
       TF_RETURN_IF_ERROR(ResetSource(source, request.source_index()));
     }
 
-    absl::Time start_time = absl::FromUnixMicros(env_->NowMicros());
     TF_RETURN_IF_ERROR(source.split_provider->GetNext(&split, &end_of_splits));
-    LOG(INFO) << "[tf.data SnapshotManager] GetNext took "
-              << absl::FromUnixMicros(env_->NowMicros()) - start_time;
     if (end_of_splits) {
       response.set_end_of_splits(true);
       return absl::OkStatus();
@@ -592,15 +589,12 @@ absl::Status SnapshotManager::GetSnapshotSplit(
     ++stream.num_assigned_splits_per_source[request.source_index()];
     ++num_assigned_splits_;
   }
-  absl::Time start_time = absl::FromUnixMicros(env_->NowMicros());
   std::string split_path = SplitPath(
       path_, request.stream_index(), request.source_index(),
       request.repetition_index(), local_split_index, global_split_index);
   TF_RETURN_IF_ERROR(AtomicallyWriteTFRecords(
       split_path, {split}, tsl::io::compression::kNone, env_));
   split.AsProtoTensorContent(response.mutable_split());
-  LOG(INFO) << "[tf.data SnapshotManager] Writing split took "
-            << absl::FromUnixMicros(env_->NowMicros()) - start_time;
   return absl::OkStatus();
 }
 

From ca20c978199289c147d0942861aac76b48cce33c Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 22 Sep 2023 15:52:05 -0700
Subject: [PATCH 181/567] Update TFRT dependency to use revision
 http://github.com/tensorflow/runtime/commit/e8d8c19d9314439fb3e1c08936ba0dc6863d1ccc.

PiperOrigin-RevId: 567737020
---
 third_party/tf_runtime/workspace.bzl                          | 4 ++--
 third_party/xla/third_party/tf_runtime/workspace.bzl          | 4 ++--
 .../xla/third_party/tsl/third_party/tf_runtime/workspace.bzl  | 4 ++--
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/third_party/tf_runtime/workspace.bzl b/third_party/tf_runtime/workspace.bzl
index feaebb3685d3ba..321212b36339e1 100644
--- a/third_party/tf_runtime/workspace.bzl
+++ b/third_party/tf_runtime/workspace.bzl
@@ -6,8 +6,8 @@ def repo():
     """Imports TFRT."""
 
     # Attention: tools parse and update these lines.
-    TFRT_COMMIT = "2b29f4d9ec1bcbfaabc1d552c8e7aac9dc2809a1"
-    TFRT_SHA256 = "c767bf0e2da2a767a8a4d51fc0e1f314f7c46df6097b3ac03ca3612cdb5889c3"
+    TFRT_COMMIT = "e8d8c19d9314439fb3e1c08936ba0dc6863d1ccc"
+    TFRT_SHA256 = "775221f0d876c5d5df52f8c6fdd072bc52352c65276908d371fb164888e0c0d4"
 
     tf_http_archive(
         name = "tf_runtime",
diff --git a/third_party/xla/third_party/tf_runtime/workspace.bzl b/third_party/xla/third_party/tf_runtime/workspace.bzl
index feaebb3685d3ba..321212b36339e1 100644
--- a/third_party/xla/third_party/tf_runtime/workspace.bzl
+++ b/third_party/xla/third_party/tf_runtime/workspace.bzl
@@ -6,8 +6,8 @@ def repo():
     """Imports TFRT."""
 
     # Attention: tools parse and update these lines.
-    TFRT_COMMIT = "2b29f4d9ec1bcbfaabc1d552c8e7aac9dc2809a1"
-    TFRT_SHA256 = "c767bf0e2da2a767a8a4d51fc0e1f314f7c46df6097b3ac03ca3612cdb5889c3"
+    TFRT_COMMIT = "e8d8c19d9314439fb3e1c08936ba0dc6863d1ccc"
+    TFRT_SHA256 = "775221f0d876c5d5df52f8c6fdd072bc52352c65276908d371fb164888e0c0d4"
 
     tf_http_archive(
         name = "tf_runtime",
diff --git a/third_party/xla/third_party/tsl/third_party/tf_runtime/workspace.bzl b/third_party/xla/third_party/tsl/third_party/tf_runtime/workspace.bzl
index feaebb3685d3ba..321212b36339e1 100644
--- a/third_party/xla/third_party/tsl/third_party/tf_runtime/workspace.bzl
+++ b/third_party/xla/third_party/tsl/third_party/tf_runtime/workspace.bzl
@@ -6,8 +6,8 @@ def repo():
     """Imports TFRT."""
 
     # Attention: tools parse and update these lines.
-    TFRT_COMMIT = "2b29f4d9ec1bcbfaabc1d552c8e7aac9dc2809a1"
-    TFRT_SHA256 = "c767bf0e2da2a767a8a4d51fc0e1f314f7c46df6097b3ac03ca3612cdb5889c3"
+    TFRT_COMMIT = "e8d8c19d9314439fb3e1c08936ba0dc6863d1ccc"
+    TFRT_SHA256 = "775221f0d876c5d5df52f8c6fdd072bc52352c65276908d371fb164888e0c0d4"
 
     tf_http_archive(
         name = "tf_runtime",

From d7027bd713999cdb3def5d365a0bca49b1b0f440 Mon Sep 17 00:00:00 2001
From: Ziyin Huang <ziyinh@google.com>
Date: Fri, 22 Sep 2023 15:59:11 -0700
Subject: [PATCH 182/567] Add a warning for Feature/table config names if not
 set. This field is required for running embedding lookup on latest tpu
 hardware.

PiperOrigin-RevId: 567738526
---
 tensorflow/python/tpu/tpu_embedding_v2_utils.py | 17 +++++++++++++++--
 1 file changed, 15 insertions(+), 2 deletions(-)

diff --git a/tensorflow/python/tpu/tpu_embedding_v2_utils.py b/tensorflow/python/tpu/tpu_embedding_v2_utils.py
index e04ba6760528fc..53fec7c18f619d 100644
--- a/tensorflow/python/tpu/tpu_embedding_v2_utils.py
+++ b/tensorflow/python/tpu/tpu_embedding_v2_utils.py
@@ -1046,7 +1046,8 @@ def __init__(self,
         'mean' the default. 'sqrtn' often achieves good accuracy, in particular
         with bag-of-words columns. For more information, see
         `tf.nn.embedding_lookup_sparse`.
-      name: An optional string used to name the table. Useful for debugging.
+      name: An optional string used to name the table. Must be defined if
+        running on SparseCore.
       quantization_config: The simulated quantization config. An instance of
         `tf.tpu.experimental.embedding.QuantizationConfig`. See the class for
         more documentation.
@@ -1083,6 +1084,12 @@ def __init__(self,
           f"Argument `combiner` must be one of {accepted_combiners}. "
           f"Received: {combiner}")
 
+    if name is None:
+      logging.warning(
+          "Name of the table config must be specified for running on"
+          " SparseCore. Different table configs must have unique names."
+      )
+
     self.vocabulary_size = vocabulary_size
     self.dim = dim
     self.initializer = initializer
@@ -1224,7 +1231,8 @@ def __init__(self,
         has to match the shape (for ragged tensor, the input shape and output
         shape can mismatch). If not provided, the shape can be either provided
         to the `embedding.build` or auto detected at the runtime.
-      name: An optional name for the feature, useful for debugging.
+      name: An optional string used to name the feature. Must be defined if
+        running on SparseCore.
 
     Returns:
       `FeatureConfig`.
@@ -1242,6 +1250,11 @@ def __init__(self,
       raise ValueError(
           f"Argument `max_sequence_length` must be an int and must be >= 0. "
           f"Received: {max_sequence_length}")
+    if name is None:
+      logging.warning(
+          "Name of the Feature config must be specified for running on"
+          " SparseCore. Different feature configs must have unique names."
+      )
 
     self.table = table
     self.max_sequence_length = max_sequence_length

From 8803725ae8ae13147e306b510e59caf2dbab58bd Mon Sep 17 00:00:00 2001
From: Jieying Luo <jieying@google.com>
Date: Fri, 22 Sep 2023 16:08:19 -0700
Subject: [PATCH 183/567] [PJRT C API] Add a custom call C API to the priv of
 GPU PJRT_Api* and implement it in GPU plugin.

The priv field of the GPU PJRT_Api* holds a chain of PJRT_Structure_Base structs, each of which contains a type and a pointer to the next PJRT_Structure_Base. Rename priv to extension_start.

PiperOrigin-RevId: 567740522
---
 third_party/xla/xla/pjrt/c/BUILD              | 13 +++++
 third_party/xla/xla/pjrt/c/pjrt_c_api.h       | 19 +++++--
 .../xla/xla/pjrt/c/pjrt_c_api_gpu_extension.h | 52 +++++++++++++++++++
 .../xla/xla/pjrt/c/pjrt_c_api_gpu_internal.cc | 29 +++++++++--
 .../xla/xla/pjrt/c/pjrt_c_api_gpu_test.cc     | 31 +++++++++++
 .../xla/xla/pjrt/c/pjrt_c_api_wrapper_impl.h  |  5 +-
 6 files changed, 140 insertions(+), 9 deletions(-)
 create mode 100644 third_party/xla/xla/pjrt/c/pjrt_c_api_gpu_extension.h

diff --git a/third_party/xla/xla/pjrt/c/BUILD b/third_party/xla/xla/pjrt/c/BUILD
index ba1adb05c17794..83c8c354d38231 100644
--- a/third_party/xla/xla/pjrt/c/BUILD
+++ b/third_party/xla/xla/pjrt/c/BUILD
@@ -34,6 +34,15 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "pjrt_c_api_gpu_extension_hdrs",
+    hdrs = ["pjrt_c_api_gpu_extension.h"],
+    visibility = ["//visibility:public"],
+    deps = [
+        ":pjrt_c_api_hdrs",
+    ],
+)
+
 cc_library(
     name = "pjrt_c_api_wrapper_impl",
     srcs = ["pjrt_c_api_wrapper_impl.cc"],
@@ -134,6 +143,7 @@ cc_library(
     hdrs = ["pjrt_c_api_gpu_internal.h"],
     visibility = ["//visibility:public"],
     deps = [
+        ":pjrt_c_api_gpu_extension_hdrs",
         ":pjrt_c_api_hdrs",
         ":pjrt_c_api_helpers",
         ":pjrt_c_api_wrapper_impl",
@@ -142,6 +152,7 @@ cc_library(
         "//xla/pjrt/gpu:gpu_helpers",
         "//xla/pjrt/gpu:se_gpu_pjrt_client",
         "//xla/python:inspect_sharding",  # To register "InspectSharding" custom partitioning handler.
+        "//xla/service:custom_call_target_registry",
         "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/status",
         "@com_google_absl//absl/strings:str_format",
@@ -204,6 +215,7 @@ xla_cc_test(
     tags = tf_cuda_tests_tags(),
     deps = [
         ":pjrt_c_api_gpu",
+        ":pjrt_c_api_gpu_extension_hdrs",
         ":pjrt_c_api_hdrs",
         ":pjrt_c_api_helpers",
         ":pjrt_c_api_test_base",
@@ -213,6 +225,7 @@ xla_cc_test(
         "//xla:statusor",
         "//xla/pjrt:pjrt_client",
         "//xla/pjrt:pjrt_common",
+        "//xla/service:custom_call_target_registry",
         "//xla/service:gpu_plugin",
         "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/status",
diff --git a/third_party/xla/xla/pjrt/c/pjrt_c_api.h b/third_party/xla/xla/pjrt/c/pjrt_c_api.h
index 3bbc4cea4827a9..f3064d1fe41b13 100644
--- a/third_party/xla/xla/pjrt/c/pjrt_c_api.h
+++ b/third_party/xla/xla/pjrt/c/pjrt_c_api.h
@@ -53,7 +53,7 @@ extern "C" {
 // Changes include:
 // * Adding a new field to the PJRT_Api or argument structs
 // * Renaming a method or argument (doesn't affect ABI)
-#define PJRT_API_MINOR 30
+#define PJRT_API_MINOR 31
 
 // The plugin should set the major_version and minor_version of
 // PJRT_Api.pjrt_api_version to be the `PJRT_API_MAJOR` and `PJRT_API_MINOR` in
@@ -1901,6 +1901,20 @@ PJRT_DEFINE_STRUCT_TRAITS(PJRT_Compile_Args, executable);
 // PJRT_Client before execution.
 typedef PJRT_Error* PJRT_Compile(PJRT_Compile_Args* args);
 
+// -------------------------------- Extension ----------------------------------
+
+typedef enum {
+  PJRT_Structure_Type_Gpu_Custom_Call = 0,
+} PJRT_Structure_Type;
+
+// PJRT_Structure_Base contains a type and a pointer to the next
+// PJRT_Structure_Base. The framework can walk this chain to find a
+// structure, identifying each one by its type.
+typedef struct PJRT_Structure_Base {
+  PJRT_Structure_Type type;
+  const struct PJRT_Structure_Base* next;
+} PJRT_Structure_Base;
+
 // -------------------------------- API access ---------------------------------
 
 #define _PJRT_API_STRUCT_FIELD(fn_type) fn_type* fn_type
@@ -1908,7 +1922,7 @@ typedef PJRT_Error* PJRT_Compile(PJRT_Compile_Args* args);
 // Please modify PJRT_Api_STRUCT_SIZE if the last field of PJRT_Api is changed.
 typedef struct {
   size_t struct_size;
-  void* priv;
+  void* extension_start;
 
   PJRT_Api_Version pjrt_api_version;
 
@@ -2025,7 +2039,6 @@ const size_t PJRT_Api_STRUCT_SIZE =
     PJRT_STRUCT_SIZE(PJRT_Api, PJRT_Executable_OutputDimensions);
 
 #undef _PJRT_API_STRUCT_FIELD
-#undef PJRT_DEFINE_STRUCT_TRAITS
 
 #ifdef __cplusplus
 }
diff --git a/third_party/xla/xla/pjrt/c/pjrt_c_api_gpu_extension.h b/third_party/xla/xla/pjrt/c/pjrt_c_api_gpu_extension.h
new file mode 100644
index 00000000000000..f86803a2ae38be
--- /dev/null
+++ b/third_party/xla/xla/pjrt/c/pjrt_c_api_gpu_extension.h
@@ -0,0 +1,52 @@
+/* Copyright 2023 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef XLA_PJRT_C_PJRT_C_API_GPU_EXTENSION_H_
+#define XLA_PJRT_C_PJRT_C_API_GPU_EXTENSION_H_
+
+#include <cstddef>
+
+#include "xla/pjrt/c/pjrt_c_api.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define PJRT_API_GPU_EXTENSION_VERSION 0
+
+struct PJRT_Gpu_Register_Custom_Call_Args {
+  size_t struct_size;
+  const char* function_name;
+  size_t function_name_size;
+  void* custom_call_function;
+};
+PJRT_DEFINE_STRUCT_TRAITS(PJRT_Gpu_Register_Custom_Call_Args,
+                          custom_call_function);
+
+// Registers a custom call.
+typedef PJRT_Error* PJRT_Gpu_Register_Custom_Call(
+    PJRT_Gpu_Register_Custom_Call_Args* args);
+
+typedef struct PJRT_Gpu_Custom_Call {
+  PJRT_Structure_Type type;
+  const void* next;
+  PJRT_Gpu_Register_Custom_Call* custom_call;
+} PJRT_Gpu_Custom_Call;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif  // XLA_PJRT_C_PJRT_C_API_GPU_EXTENSION_H_
diff --git a/third_party/xla/xla/pjrt/c/pjrt_c_api_gpu_internal.cc b/third_party/xla/xla/pjrt/c/pjrt_c_api_gpu_internal.cc
index 75a14f42118b4d..e14bf2b20f5768 100644
--- a/third_party/xla/xla/pjrt/c/pjrt_c_api_gpu_internal.cc
+++ b/third_party/xla/xla/pjrt/c/pjrt_c_api_gpu_internal.cc
@@ -25,17 +25,21 @@ limitations under the License.
 #include "absl/status/status.h"
 #include "absl/strings/str_format.h"
 #include "xla/pjrt/c/pjrt_c_api.h"
+#include "xla/pjrt/c/pjrt_c_api_gpu_extension.h"
 #include "xla/pjrt/c/pjrt_c_api_helpers.h"
 #include "xla/pjrt/c/pjrt_c_api_wrapper_impl.h"
 #include "xla/pjrt/gpu/gpu_helpers.h"
 #include "xla/pjrt/gpu/se_gpu_pjrt_client.h"
 #include "xla/pjrt/pjrt_client.h"
 #include "xla/pjrt/pjrt_common.h"
+#include "xla/service/custom_call_target_registry.h"
 #include "tsl/platform/errors.h"
 
 namespace pjrt {
 namespace gpu_plugin {
 
+#define PJRT_GPU_PLUGIN_PLATFORM_NAME "CUDA"
+
 PJRT_Error* PJRT_Client_Create(PJRT_Client_Create_Args* args) {
   PJRT_RETURN_IF_ERROR(CheckMatchingStructSizes(
       "PJRT_Client_Create_Args", PJRT_Client_Create_Args_STRUCT_SIZE,
@@ -116,10 +120,27 @@ PJRT_Error* PJRT_GpuDeviceTopology_Create(
       "Topology not supported for GPU compilation.")};
 }
 
-constexpr PJRT_Api pjrt_api =
-    pjrt::CreatePjrtApi(pjrt::gpu_plugin::PJRT_Client_Create,
-                        pjrt::gpu_plugin::PJRT_GpuDeviceTopology_Create,
-                        pjrt::PJRT_Plugin_Initialize_NoOp);
+PJRT_Error* PJRT_Gpu_Register_Custom_Call(
+    PJRT_Gpu_Register_Custom_Call_Args* args) {
+  PJRT_RETURN_IF_ERROR(CheckMatchingStructSizes(
+      "PJRT_Gpu_Register_Custom_Call_Args",
+      PJRT_Gpu_Register_Custom_Call_Args_STRUCT_SIZE, args->struct_size));
+  std::string function_name(args->function_name, args->function_name_size);
+  xla::CustomCallTargetRegistry::Global()->Register(
+      function_name, args->custom_call_function, PJRT_GPU_PLUGIN_PLATFORM_NAME);
+  return nullptr;
+}
+
+PJRT_Gpu_Custom_Call custom_call{
+    /*type=*/PJRT_Structure_Type::PJRT_Structure_Type_Gpu_Custom_Call,
+    /*next=*/nullptr,
+    /*custom_call=*/PJRT_Gpu_Register_Custom_Call,
+};
+
+constexpr PJRT_Api pjrt_api = pjrt::CreatePjrtApi(
+    pjrt::gpu_plugin::PJRT_Client_Create,
+    pjrt::gpu_plugin::PJRT_GpuDeviceTopology_Create,
+    pjrt::PJRT_Plugin_Initialize_NoOp, static_cast<void*>(&custom_call));
 
 const PJRT_Api* GetGpuPjrtApi() { return &pjrt_api; }
 
diff --git a/third_party/xla/xla/pjrt/c/pjrt_c_api_gpu_test.cc b/third_party/xla/xla/pjrt/c/pjrt_c_api_gpu_test.cc
index 63102691defd0c..319ee8cec112db 100644
--- a/third_party/xla/xla/pjrt/c/pjrt_c_api_gpu_test.cc
+++ b/third_party/xla/xla/pjrt/c/pjrt_c_api_gpu_test.cc
@@ -33,12 +33,14 @@ limitations under the License.
 #include "absl/time/clock.h"
 #include "absl/time/time.h"
 #include "xla/pjrt/c/pjrt_c_api.h"
+#include "xla/pjrt/c/pjrt_c_api_gpu_extension.h"
 #include "xla/pjrt/c/pjrt_c_api_helpers.h"
 #include "xla/pjrt/c/pjrt_c_api_test.h"
 #include "xla/pjrt/c/pjrt_c_api_test_base.h"
 #include "xla/pjrt/c/pjrt_c_api_wrapper_impl.h"
 #include "xla/pjrt/pjrt_client.h"
 #include "xla/pjrt/pjrt_common.h"
+#include "xla/service/custom_call_target_registry.h"
 #include "xla/status.h"
 #include "xla/statusor.h"
 #include "tsl/platform/status.h"
@@ -235,5 +237,34 @@ TEST(PjrtCApiGpuAllocatorTest, InvalidAllocatorOptionsParsing) {
 
   api->PJRT_Error_Destroy(&error_destroy_args);
 }
+
+void TestCustomCall() {}
+
+TEST(PjrtCApiGpuPrivTest, CustomCall) {
+  PJRT_Gpu_Register_Custom_Call_Args args;
+  args.struct_size = PJRT_Gpu_Register_Custom_Call_Args_STRUCT_SIZE;
+  std::string function_name = "function_name";
+  args.function_name = function_name.c_str();
+  args.function_name_size = function_name.size();
+  args.custom_call_function = reinterpret_cast<void*>(&TestCustomCall);
+  auto api = GetPjrtApi();
+  const PJRT_Structure_Base* next =
+      reinterpret_cast<const PJRT_Structure_Base*>(api->extension_start);
+  while (next != nullptr &&
+         next->type !=
+             PJRT_Structure_Type::PJRT_Structure_Type_Gpu_Custom_Call) {
+    next = next->next;
+  }
+  ASSERT_NE(next, nullptr);
+  // `next` now points at the GPU custom-call extension found in the chain.
+  PJRT_Error* error =
+      reinterpret_cast<const PJRT_Gpu_Custom_Call*>(next)->custom_call(&args);
+  // ASSERT (not CHECK) so a failure is reported by gtest instead of aborting.
+  ASSERT_EQ(error, nullptr);
+  void* custom_call =
+      xla::CustomCallTargetRegistry::Global()->Lookup(function_name, "CUDA");
+  EXPECT_EQ(custom_call, reinterpret_cast<void*>(&TestCustomCall));
+}
+
 }  // namespace
 }  // namespace pjrt
diff --git a/third_party/xla/xla/pjrt/c/pjrt_c_api_wrapper_impl.h b/third_party/xla/xla/pjrt/c/pjrt_c_api_wrapper_impl.h
index 841725c595554a..7003f9c24ebac0 100644
--- a/third_party/xla/xla/pjrt/c/pjrt_c_api_wrapper_impl.h
+++ b/third_party/xla/xla/pjrt/c/pjrt_c_api_wrapper_impl.h
@@ -399,10 +399,11 @@ PJRT_Error* PJRT_Plugin_Initialize_NoOp(PJRT_Plugin_Initialize_Args* args);
 constexpr PJRT_Api CreatePjrtApi(
     PJRT_Client_Create* create_fn,
     PJRT_TopologyDescription_Create* topology_create_fn,
-    PJRT_Plugin_Initialize* plugin_initialize_fn) {
+    PJRT_Plugin_Initialize* plugin_initialize_fn,
+    void* extension_start = nullptr) {
   return PJRT_Api{
       /*struct_size=*/PJRT_Api_STRUCT_SIZE,
-      /*priv=*/nullptr,
+      /*extension_start=*/extension_start,
 
       /*pjrt_api_version=*/
       PJRT_Api_Version{/*struct_size=*/PJRT_Api_Version_STRUCT_SIZE,

From 5c845a9e76a76474302bda77ec9c7f93579e8227 Mon Sep 17 00:00:00 2001
From: Kanglan Tang <kanglan@google.com>
Date: Fri, 22 Sep 2023 16:08:53 -0700
Subject: [PATCH 184/567] Internal changes only

PiperOrigin-RevId: 567740639
---
 .../tools/pip_package/build_pip_package.sh      | 17 +++++++++++++++++
 tensorflow/tools/pip_package/setup.py           |  4 ++++
 2 files changed, 21 insertions(+)

diff --git a/tensorflow/tools/pip_package/build_pip_package.sh b/tensorflow/tools/pip_package/build_pip_package.sh
index ef6f655ba6039b..f7b4a655a67beb 100755
--- a/tensorflow/tools/pip_package/build_pip_package.sh
+++ b/tensorflow/tools/pip_package/build_pip_package.sh
@@ -210,6 +210,23 @@ function prepare_src() {
     cp -L \
       bazel-bin/tensorflow/tools/pip_package/build_pip_package.runfiles/org_tensorflow/LICENSE \
       "${TMPDIR}"
+    # Check if it is a tpu build
+    if [[ ${TPU_BUILD} == "1" ]]; then
+      # Check if libtpu.so exists
+      if [[ -f "./tensorflow/lib/libtpu.so" ]]; then
+        if [[ ! -L "${RUNFILES}/tensorflow/lib/libtpu.so" ]]; then
+          mkdir -p "$(real_path "${RUNFILES}/tensorflow/lib")"
+          ln -s "$(real_path ./tensorflow/lib/libtpu.so)" "$(real_path "${RUNFILES}/tensorflow/lib/libtpu.so")"
+          echo "Created symlink: $(real_path ./tensorflow/lib/libtpu.so) -> \
+            $(real_path ${RUNFILES}/tensorflow/lib/libtpu.so)"
+        else
+          echo "Symlink already exists: ${RUNFILES}/tensorflow/lib/libtpu.so"
+        fi
+      else
+        echo "Libtpu.so is not found in $(real_path ./tensorflow/lib/)"
+        exit 1
+      fi
+    fi
     cp -LR \
       bazel-bin/tensorflow/tools/pip_package/build_pip_package.runfiles/org_tensorflow/tensorflow \
       "${TMPDIR}"
diff --git a/tensorflow/tools/pip_package/setup.py b/tensorflow/tools/pip_package/setup.py
index 80b6e28ed1fc1c..a3f46675f70633 100644
--- a/tensorflow/tools/pip_package/setup.py
+++ b/tensorflow/tools/pip_package/setup.py
@@ -316,6 +316,10 @@ def find_files(pattern, root):
 for path in so_lib_paths:
   matches.extend(['../' + x for x in find_files('*', path) if '.py' not in x])
 
+# If building a tpu package, bundle libtpu.so as part of the wheel
+if '_tpu' in project_name:
+  matches.append('tensorflow/lib/libtpu.so')
+
 if os.name == 'nt':
   EXTENSION_NAME = 'python/_pywrap_tensorflow_internal.pyd'
 else:

From 8bffb7a2070abb941a12ea21d6f079a617cdaa58 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 22 Sep 2023 16:42:27 -0700
Subject: [PATCH 185/567] Fix a typo in `array_impl_test_lib`.

PiperOrigin-RevId: 567747222
---
 third_party/xla/xla/python/ifrt/array_impl_test_lib.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/third_party/xla/xla/python/ifrt/array_impl_test_lib.cc b/third_party/xla/xla/python/ifrt/array_impl_test_lib.cc
index ad9571ac05d3fd..b29a4707e23d4c 100644
--- a/third_party/xla/xla/python/ifrt/array_impl_test_lib.cc
+++ b/third_party/xla/xla/python/ifrt/array_impl_test_lib.cc
@@ -488,7 +488,7 @@ TEST(ArrayImplTest, ReshardToDifferentDevice) {
 
   TF_ASSERT_OK_AND_ASSIGN(
       auto reshared_array,
-      array->Reshard(sharding, ArrayCopySemantics::kAlwaysCopy));
+      array->Reshard(new_sharding, ArrayCopySemantics::kAlwaysCopy));
 
   std::vector<float> out_data(6);
   auto future = reshared_array->CopyToHostBuffer(

From 5cd0a8fd0ef40b95726cb73ffb6484e4078a3a5e Mon Sep 17 00:00:00 2001
From: Fiona Lang <flang@google.com>
Date: Fri, 22 Sep 2023 16:53:45 -0700
Subject: [PATCH 186/567] Import ops/gradients_impl.py in eager/backprop.py to
 ensure that registered gradients are available for gradient lookup.

Remove nn_grad.py's dependency on eager/backprop.py to remove the cycle that would block this, by moving the nn gradients that depend on backprop to a new file.

PiperOrigin-RevId: 567749407
---
 tensorflow/python/eager/BUILD                 |   1 +
 tensorflow/python/eager/backprop.py           |   1 +
 tensorflow/python/ops/BUILD                   |  17 +-
 tensorflow/python/ops/gradients_impl.py       |   1 -
 tensorflow/python/ops/nn.py                   |   1 +
 .../python/ops/nn_fused_batch_norm_grad.py    | 166 ++++++++++++++++
 .../python/ops/nn_fused_batchnorm_test.py     |   6 +-
 tensorflow/python/ops/nn_grad.py              | 180 ++----------------
 tensorflow/python/ops/nn_ops.py               |   2 +
 9 files changed, 205 insertions(+), 170 deletions(-)
 create mode 100644 tensorflow/python/ops/nn_fused_batch_norm_grad.py

diff --git a/tensorflow/python/eager/BUILD b/tensorflow/python/eager/BUILD
index 2ce65e6ab4530c..e72f54c48fd553 100644
--- a/tensorflow/python/eager/BUILD
+++ b/tensorflow/python/eager/BUILD
@@ -707,6 +707,7 @@ py_strict_library(
         "//tensorflow/python/ops:check_ops",
         "//tensorflow/python/ops:control_flow_util",
         "//tensorflow/python/ops:default_gradient",
+        "//tensorflow/python/ops:gradients_impl",
         "//tensorflow/python/ops:math_ops_gen",
         "//tensorflow/python/ops:resource_variable_ops",
         "//tensorflow/python/ops:unconnected_gradients",
diff --git a/tensorflow/python/eager/backprop.py b/tensorflow/python/eager/backprop.py
index 16fc829ee6ca20..57593eac09b26d 100644
--- a/tensorflow/python/eager/backprop.py
+++ b/tensorflow/python/eager/backprop.py
@@ -42,6 +42,7 @@
 from tensorflow.python.ops import default_gradient
 from tensorflow.python.ops import gen_array_ops
 from tensorflow.python.ops import gen_math_ops
+from tensorflow.python.ops import gradients_impl  # pylint: disable=unused-import
 from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops.parallel_for import control_flow_ops as pfor_ops
 from tensorflow.python.ops.unconnected_gradients import UnconnectedGradients
diff --git a/tensorflow/python/ops/BUILD b/tensorflow/python/ops/BUILD
index cba5c7c60e69ac..ca03c9485561b1 100644
--- a/tensorflow/python/ops/BUILD
+++ b/tensorflow/python/ops/BUILD
@@ -1627,7 +1627,6 @@ py_strict_library(
         ":manip_grad",
         ":math_grad",
         ":math_ops",
-        ":nn_grad",
         ":optional_grad",
         ":random_grad",
         ":rnn_grad",
@@ -2182,6 +2181,7 @@ py_strict_library(
         ":ctc_ops",
         ":embedding_ops",
         ":math_ops",
+        ":nn_fused_batch_norm_grad",
         ":nn_grad",
         ":nn_impl",
         ":nn_impl_distribute",
@@ -2197,8 +2197,18 @@ py_strict_library(
         ":array_ops",
         ":array_ops_stack",
         ":math_ops",
-        ":nn_ops",
         ":nn_ops_gen",
+        "//tensorflow/python/framework:dtypes",
+        "//tensorflow/python/framework:ops",
+    ],
+)
+
+py_strict_library(
+    name = "nn_fused_batch_norm_grad",
+    srcs = ["nn_fused_batch_norm_grad.py"],
+    deps = [
+        ":array_ops",
+        ":math_ops",
         "//tensorflow/python/eager:backprop",
         "//tensorflow/python/framework:dtypes",
         "//tensorflow/python/framework:ops",
@@ -2215,6 +2225,7 @@ py_strict_library(
         ":check_ops",
         ":math_ops",
         ":math_ops_gen",
+        ":nn_grad",
         ":nn_ops_gen",
         ":random_ops",
         ":stateless_random_ops",
@@ -3684,7 +3695,7 @@ cuda_py_strict_test(
         ":gradient_checker",
         ":gradients_impl",
         ":math_ops",
-        ":nn_grad",
+        ":nn_fused_batch_norm_grad",
         ":nn_impl",
         ":nn_ops",
         ":nn_ops_gen",
diff --git a/tensorflow/python/ops/gradients_impl.py b/tensorflow/python/ops/gradients_impl.py
index b47f8d8bf9e595..9812b382093768 100644
--- a/tensorflow/python/ops/gradients_impl.py
+++ b/tensorflow/python/ops/gradients_impl.py
@@ -30,7 +30,6 @@
 from tensorflow.python.ops import manip_grad  # pylint: disable=unused-import
 from tensorflow.python.ops import math_grad  # pylint: disable=unused-import
 from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import nn_grad  # pylint: disable=unused-import
 from tensorflow.python.ops import optional_grad  # pylint: disable=unused-import
 from tensorflow.python.ops import random_grad  # pylint: disable=unused-import
 from tensorflow.python.ops import rnn_grad  # pylint: disable=unused-import
diff --git a/tensorflow/python/ops/nn.py b/tensorflow/python/ops/nn.py
index 8aa4a2fd74cef8..753f1708494d63 100644
--- a/tensorflow/python/ops/nn.py
+++ b/tensorflow/python/ops/nn.py
@@ -24,6 +24,7 @@
 from tensorflow.python.ops import ctc_ops as _ctc_ops
 from tensorflow.python.ops import embedding_ops as _embedding_ops
 from tensorflow.python.ops import nn_grad as _nn_grad
+from tensorflow.python.ops import nn_fused_batch_norm_grad as _nn_fused_batch_norm_grad
 from tensorflow.python.ops import nn_ops as _nn_ops
 from tensorflow.python.ops.math_ops import sigmoid
 from tensorflow.python.ops.math_ops import tanh
diff --git a/tensorflow/python/ops/nn_fused_batch_norm_grad.py b/tensorflow/python/ops/nn_fused_batch_norm_grad.py
new file mode 100644
index 00000000000000..245f65cdf28752
--- /dev/null
+++ b/tensorflow/python/ops/nn_fused_batch_norm_grad.py
@@ -0,0 +1,166 @@
+# Copyright 2023 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Batch norm gradients for operators defined in nn_ops.py."""
+
+from tensorflow.python.eager import backprop
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+
+
+def _BatchNormGrad(grad_y,
+                   x,
+                   scale,
+                   pop_mean,
+                   pop_var,
+                   epsilon,
+                   data_format,
+                   is_training=True):
+  """Returns the gradients for the 3 inputs of BatchNorm.
+
+  Args:
+    grad_y: A `Tensor` of 4 or 5 dimensions for gradient for y.
+    x: A `Tensor` of 4 or 5 dimensions for x.
+    scale: A `Tensor` of 1 dimension for scaling.
+    pop_mean: A `Tensor` of 1 dimension for the population mean. Only used when
+      is_training=False.
+    pop_var: A `Tensor` of 1 dimension for the population variance. Only used
+      when is_training=False.
+    epsilon: A small float number added to the variance of x.
+    data_format: The data format for input. Either b"NHWC" or b"NCHW".
+    is_training: A bool value to indicate the operation is for training
+      (default) or inference.
+
+  Returns:
+    A tuple (grad_x, grad_scale, grad_offset), where grad_x is the gradient
+    for x, grad_scale the gradient for scale, and grad_offset the gradient
+    for offset.
+  """
+  x_dtype = x.dtype.base_dtype
+  if x_dtype == dtypes.float16 or x_dtype == dtypes.bfloat16:
+    # float16 math is too imprecise, so we do the batch norm gradient
+    # computations in float32.
+    x = math_ops.cast(x, dtypes.float32)
+    grad_y = math_ops.cast(grad_y, dtypes.float32)
+  if is_training:
+    if data_format == b"NHWC":
+      keepdims = False
+      reduce_axis = [0, 1, 2]
+    elif data_format == b"NDHWC":
+      keepdims = False
+      reduce_axis = [0, 1, 2, 3]
+    elif data_format == b"NCHW":
+      keepdims = True
+      reduce_axis = [0, 2, 3]
+      shape = [1, array_ops.size(scale), 1, 1]
+      scale = array_ops.reshape(scale, shape)
+    else:
+      keepdims = True
+      reduce_axis = [0, 2, 3, 4]
+      shape = [1, array_ops.size(scale), 1, 1, 1]
+      scale = array_ops.reshape(scale, shape)
+    mean_grad_y = math_ops.reduce_mean(grad_y, reduce_axis, keepdims=keepdims)
+    mean_x = math_ops.reduce_mean(x, reduce_axis, keepdims=keepdims)
+    var_x = math_ops.reduce_mean(
+        math_ops.squared_difference(x, array_ops.stop_gradient(mean_x)),
+        reduce_axis,
+        keepdims=keepdims)
+    grad_y_offset = grad_y - mean_grad_y
+    x_offset = x - mean_x
+    mean = math_ops.reduce_mean(
+        grad_y * x_offset, axis=reduce_axis, keepdims=keepdims)
+    grad_x = scale * math_ops.rsqrt(var_x + epsilon) * (
+        grad_y_offset - math_ops.reciprocal(var_x + epsilon) * mean * x_offset)
+    grad_scale = math_ops.rsqrt(var_x + epsilon) * math_ops.reduce_sum(
+        grad_y * x_offset, axis=reduce_axis, keepdims=keepdims)
+    if data_format == b"NCHW" or data_format == b"NCDHW":
+      grad_scale = array_ops.squeeze(grad_scale)
+    grad_offset = math_ops.reduce_sum(grad_y, axis=reduce_axis)
+    return math_ops.cast(grad_x, x_dtype), grad_scale, grad_offset
+  else:
+    if data_format == b"NHWC":
+      reduce_axis = [0, 1, 2]
+    elif data_format == b"NDHWC":
+      reduce_axis = [0, 1, 2, 3]
+    elif data_format == b"NCHW":
+      reduce_axis = [0, 2, 3]
+      shape = [1, array_ops.size(pop_mean), 1, 1]
+      pop_mean = array_ops.reshape(pop_mean, shape)
+      pop_var = array_ops.reshape(pop_var, shape)
+      scale = array_ops.reshape(scale, shape)
+    else:
+      reduce_axis = [0, 2, 3, 4]
+      shape = [1, array_ops.size(pop_mean), 1, 1, 1]
+      pop_mean = array_ops.reshape(pop_mean, shape)
+      pop_var = array_ops.reshape(pop_var, shape)
+      scale = array_ops.reshape(scale, shape)
+
+    grad_offset = math_ops.reduce_sum(grad_y, axis=reduce_axis)
+    var_rsqrt = math_ops.rsqrt(pop_var + epsilon)
+    grad_scale = math_ops.reduce_sum(
+        grad_y * (x - pop_mean) * var_rsqrt, axis=reduce_axis)
+    grad_x = grad_y * scale * var_rsqrt
+    return math_ops.cast(grad_x, x_dtype), grad_scale, grad_offset
+
+
+@ops.RegisterGradient("FusedBatchNormGrad")
+def _FusedBatchNormGradGrad(op: ops.Operation, *grad):
+  """Returns the gradients for the 3 inputs of FusedBatchNormGrad.
+
+  Args:
+    op: The FusedBatchNormGradOp for which we need to compute gradients.
+    *grad: An argument list for tensors of gradients wrt the outputs with
+      grad[0] as grad_grad_x, grad[1] as grad_grad_scale, grad[2] as
+      grad_grad_offset.
+
+  Returns:
+    A tuple (grad_grad_y, grad_x, grad_scale, None, None): the gradients for
+    the grad_y, x and scale inputs, followed by None for the pop_mean and
+    pop_var inputs (op.inputs[3] and op.inputs[4]).
+  """
+  data_format = op.get_attr("data_format")
+  epsilon = op.get_attr("epsilon")
+  is_training = op.get_attr("is_training")
+  grad_y = op.inputs[0]
+  x = op.inputs[1]
+  scale = op.inputs[2]
+  pop_mean = op.inputs[3]
+  pop_var = op.inputs[4]
+  grad_grad_x = grad[0]
+  grad_grad_scale = grad[1]
+  grad_grad_offset = grad[2]
+  with backprop.GradientTape() as tape:  # records _BatchNormGrad below
+    tape.watch(grad_y)
+    tape.watch(x)
+    tape.watch(scale)
+    grad_x, grad_scale, grad_offset = _BatchNormGrad(
+        grad_y, x, scale, pop_mean, pop_var, epsilon, data_format, is_training)
+    grad_initial = [grad_grad_x, grad_grad_scale, grad_grad_offset]
+  grad_grad_y, grad_x, grad_scale = tape.gradient(
+      [grad_x, grad_scale, grad_offset], [grad_y, x, scale], grad_initial)
+  return grad_grad_y, grad_x, grad_scale, None, None
+
+
+@ops.RegisterGradient("FusedBatchNormGradV2")
+def _FusedBatchNormGradGradV2(op: ops.Operation, *grad):
+  return _FusedBatchNormGradGrad(op, *grad)  # reuse the base gradient
+
+
+@ops.RegisterGradient("FusedBatchNormGradV3")
+def _FusedBatchNormGradGradV3(op: ops.Operation, *grad):
+  grad_grad_y, grad_x, grad_scale, _, _ = _FusedBatchNormGradGrad(op, *grad)
+  return grad_grad_y, grad_x, grad_scale, None, None, None  # 6 slots, not 5
+
diff --git a/tensorflow/python/ops/nn_fused_batchnorm_test.py b/tensorflow/python/ops/nn_fused_batchnorm_test.py
index 4dd5202f6cdb2c..1131ec377fac18 100644
--- a/tensorflow/python/ops/nn_fused_batchnorm_test.py
+++ b/tensorflow/python/ops/nn_fused_batchnorm_test.py
@@ -26,7 +26,7 @@
 from tensorflow.python.ops import gradient_checker
 from tensorflow.python.ops import gradients_impl
 from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import nn_grad
+from tensorflow.python.ops import nn_fused_batch_norm_grad
 from tensorflow.python.ops import nn_impl
 from tensorflow.python.ops import nn_ops
 from tensorflow.python.platform import test
@@ -336,8 +336,8 @@ def _test_grad_grad(self,
         epsilon = y.op.get_attr('epsilon')
         data_format = y.op.get_attr('data_format')
         grad_vals = self.evaluate([grad_x, grad_scale, grad_offset])
-        grad_internal = nn_grad._BatchNormGrad(grad_y, x, scale, pop_mean,
-                                               pop_var, epsilon, data_format)
+        grad_internal = nn_fused_batch_norm_grad._BatchNormGrad(
+            grad_y, x, scale, pop_mean, pop_var, epsilon, data_format)
         grad_internal_vals = self.evaluate(list(grad_internal))
         for grad_val, grad_internal_val in zip(grad_vals, grad_internal_vals):
           self.assertAllClose(grad_val, grad_internal_val, atol=err_tolerance)
diff --git a/tensorflow/python/ops/nn_grad.py b/tensorflow/python/ops/nn_grad.py
index 258cdd147ada3e..8260caf0787fd2 100644
--- a/tensorflow/python/ops/nn_grad.py
+++ b/tensorflow/python/ops/nn_grad.py
@@ -18,14 +18,12 @@
 import itertools
 import operator
 
-from tensorflow.python.eager import backprop
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import array_ops_stack
 from tensorflow.python.ops import gen_nn_ops
 from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import nn_ops
 
 
 @ops.RegisterGradient("Conv2DBackpropInput")
@@ -153,7 +151,7 @@ def _Conv3DGrad(op: ops.Operation, grad):
   data_format = op.get_attr("data_format").decode()
   shape_0, shape_1 = array_ops.shape_n([op.inputs[0], op.inputs[1]])
   return [
-      nn_ops.conv3d_backprop_input_v2(
+      gen_nn_ops.conv3d_backprop_input_v2(
           shape_0,
           op.inputs[1],
           grad,
@@ -161,7 +159,7 @@ def _Conv3DGrad(op: ops.Operation, grad):
           strides=op.get_attr("strides"),
           padding=op.get_attr("padding"),
           data_format=data_format),
-      nn_ops.conv3d_backprop_filter_v2(
+      gen_nn_ops.conv3d_backprop_filter_v2(
           op.inputs[0],
           shape_1,
           grad,
@@ -178,7 +176,7 @@ def _Conv3DBackpropInputGrad(op: ops.Operation, grad):
   data_format = op.get_attr("data_format").decode()
   return [
       None,
-      nn_ops.conv3d_backprop_filter_v2(
+      gen_nn_ops.conv3d_backprop_filter_v2(
           grad,
           array_ops.shape(op.inputs[1]),
           op.inputs[2],
@@ -186,7 +184,7 @@ def _Conv3DBackpropInputGrad(op: ops.Operation, grad):
           strides=op.get_attr("strides"),
           padding=op.get_attr("padding"),
           data_format=data_format),
-      nn_ops.conv3d(
+      gen_nn_ops.conv3d(
           grad,
           op.inputs[1],
           dilations=op.get_attr("dilations"),
@@ -200,7 +198,7 @@ def _Conv3DBackpropInputGrad(op: ops.Operation, grad):
 def _Conv3DBackpropFilterGrad(op: ops.Operation, grad):
   data_format = op.get_attr("data_format").decode()
   return [
-      nn_ops.conv3d_backprop_input_v2(
+      gen_nn_ops.conv3d_backprop_input_v2(
           array_ops.shape(op.inputs[0]),
           grad,
           op.inputs[2],
@@ -208,7 +206,7 @@ def _Conv3DBackpropFilterGrad(op: ops.Operation, grad):
           strides=op.get_attr("strides"),
           padding=op.get_attr("padding"),
           data_format=data_format), None,
-      nn_ops.conv3d(
+      gen_nn_ops.conv3d(
           op.inputs[0],
           grad,
           dilations=op.get_attr("dilations"),
@@ -526,7 +524,7 @@ def _SoftmaxCrossEntropyWithLogitsGrad(op: ops.Operation, grad_loss, grad_grad):
   logits = op.inputs[0]
   if (grad_grad is not None and
       not getattr(grad_grad, "_is_zeros_tensor", False)):
-    softmax = nn_ops.softmax(logits)
+    softmax = gen_nn_ops.softmax(logits)
 
     grad += ((grad_grad - array_ops.squeeze(
         math_ops.matmul(
@@ -534,7 +532,7 @@ def _SoftmaxCrossEntropyWithLogitsGrad(op: ops.Operation, grad_loss, grad_grad):
             array_ops.expand_dims(softmax, 2)),
         axis=1)) * softmax)
 
-  return grad, _BroadcastMul(grad_loss, -nn_ops.log_softmax(logits))  # pylint: disable=invalid-unary-operand-type
+  return grad, _BroadcastMul(grad_loss, -gen_nn_ops.log_softmax(logits))  # pylint: disable=invalid-unary-operand-type
 
 
 @ops.RegisterGradient("SparseSoftmaxCrossEntropyWithLogits")
@@ -554,7 +552,7 @@ def _SparseSoftmaxCrossEntropyWithLogitsGrad(op: ops.Operation,
   logits = op.inputs[0]
   if (grad_grad is not None and
       not getattr(grad_grad, "_is_zeros_tensor", False)):
-    softmax = nn_ops.softmax(logits)
+    softmax = gen_nn_ops.softmax(logits)
 
     grad += ((grad_grad - array_ops.squeeze(
         math_ops.matmul(
@@ -633,14 +631,14 @@ def _DepthwiseConv2dNativeGrad(op: ops.Operation, grad):
 @ops.RegisterGradient("Dilation2D")
 def _Dilation2DGrad(op: ops.Operation, grad):
   return [
-      nn_ops.dilation2d_backprop_input(op.inputs[0], op.inputs[1], grad,
-                                       op.get_attr("strides"),
-                                       op.get_attr("rates"),
-                                       op.get_attr("padding")),
-      nn_ops.dilation2d_backprop_filter(op.inputs[0], op.inputs[1], grad,
-                                        op.get_attr("strides"),
-                                        op.get_attr("rates"),
-                                        op.get_attr("padding"))
+      gen_nn_ops.dilation2d_backprop_input(op.inputs[0], op.inputs[1], grad,
+                                           op.get_attr("strides"),
+                                           op.get_attr("rates"),
+                                           op.get_attr("padding")),
+      gen_nn_ops.dilation2d_backprop_filter(op.inputs[0], op.inputs[1], grad,
+                                            op.get_attr("strides"),
+                                            op.get_attr("rates"),
+                                            op.get_attr("padding"))
   ]
 
 
@@ -951,150 +949,6 @@ def _FusedBatchNormV3Grad(op: ops.Operation, *grad):
   return _BaseFusedBatchNormGrad(op, 2, *grad)
 
 
-def _BatchNormGrad(grad_y,
-                   x,
-                   scale,
-                   pop_mean,
-                   pop_var,
-                   epsilon,
-                   data_format,
-                   is_training=True):
-  """Returns the gradients for the 3 inputs of BatchNorm.
-
-  Args:
-    grad_y: A `Tensor` of 4 or 5 dimensions for gradient for y.
-    x: A `Tensor` of 4 or 5 dimensions for x.
-    scale: A `Tensor` of 1 dimension for scaling.
-    pop_mean: A `Tensor` of 1 dimension for the population mean. Only used when
-      is_training=False.
-    pop_var: A `Tensor` of 1 dimension for the population variance. Only used
-      when is_training=False.
-    epsilon: A small float number added to the variance of x.
-    data_format: The data format for input. Either b"NHWC" or b"NCHW".
-    is_training: A bool value to indicate the operation is for training
-      (default) or inference.
-
-  Returns:
-    A tuple (grad_x, grad_scale, grad_offset), where grad_x is the gradient
-    for x, grad_scale the gradient for scale, and grad_offset the gradient
-    for offset.
-  """
-  x_dtype = x.dtype.base_dtype
-  if x_dtype == dtypes.float16 or x_dtype == dtypes.bfloat16:
-    # float16 math is too imprecise, so we do the batch norm gradient
-    # computations in float32.
-    x = math_ops.cast(x, dtypes.float32)
-    grad_y = math_ops.cast(grad_y, dtypes.float32)
-  if is_training:
-    if data_format == b"NHWC":
-      keepdims = False
-      reduce_axis = [0, 1, 2]
-    elif data_format == b"NDHWC":
-      keepdims = False
-      reduce_axis = [0, 1, 2, 3]
-    elif data_format == b"NCHW":
-      keepdims = True
-      reduce_axis = [0, 2, 3]
-      shape = [1, array_ops.size(scale), 1, 1]
-      scale = array_ops.reshape(scale, shape)
-    else:
-      keepdims = True
-      reduce_axis = [0, 2, 3, 4]
-      shape = [1, array_ops.size(scale), 1, 1, 1]
-      scale = array_ops.reshape(scale, shape)
-    mean_grad_y = math_ops.reduce_mean(grad_y, reduce_axis, keepdims=keepdims)
-    mean_x = math_ops.reduce_mean(x, reduce_axis, keepdims=keepdims)
-    var_x = math_ops.reduce_mean(
-        math_ops.squared_difference(x, array_ops.stop_gradient(mean_x)),
-        reduce_axis,
-        keepdims=keepdims)
-    grad_y_offset = grad_y - mean_grad_y
-    x_offset = x - mean_x
-    mean = math_ops.reduce_mean(
-        grad_y * x_offset, axis=reduce_axis, keepdims=keepdims)
-    grad_x = scale * math_ops.rsqrt(var_x + epsilon) * (
-        grad_y_offset - math_ops.reciprocal(var_x + epsilon) * mean * x_offset)
-    grad_scale = math_ops.rsqrt(var_x + epsilon) * math_ops.reduce_sum(
-        grad_y * x_offset, axis=reduce_axis, keepdims=keepdims)
-    if data_format == b"NCHW" or data_format == b"NCDHW":
-      grad_scale = array_ops.squeeze(grad_scale)
-    grad_offset = math_ops.reduce_sum(grad_y, axis=reduce_axis)
-    return math_ops.cast(grad_x, x_dtype), grad_scale, grad_offset
-  else:
-    if data_format == b"NHWC":
-      reduce_axis = [0, 1, 2]
-    elif data_format == b"NDHWC":
-      reduce_axis = [0, 1, 2, 3]
-    elif data_format == b"NCHW":
-      reduce_axis = [0, 2, 3]
-      shape = [1, array_ops.size(pop_mean), 1, 1]
-      pop_mean = array_ops.reshape(pop_mean, shape)
-      pop_var = array_ops.reshape(pop_var, shape)
-      scale = array_ops.reshape(scale, shape)
-    else:
-      reduce_axis = [0, 2, 3, 4]
-      shape = [1, array_ops.size(pop_mean), 1, 1, 1]
-      pop_mean = array_ops.reshape(pop_mean, shape)
-      pop_var = array_ops.reshape(pop_var, shape)
-      scale = array_ops.reshape(scale, shape)
-
-    grad_offset = math_ops.reduce_sum(grad_y, axis=reduce_axis)
-    var_rsqrt = math_ops.rsqrt(pop_var + epsilon)
-    grad_scale = math_ops.reduce_sum(
-        grad_y * (x - pop_mean) * var_rsqrt, axis=reduce_axis)
-    grad_x = grad_y * scale * var_rsqrt
-    return math_ops.cast(grad_x, x_dtype), grad_scale, grad_offset
-
-
-@ops.RegisterGradient("FusedBatchNormGrad")
-def _FusedBatchNormGradGrad(op: ops.Operation, *grad):
-  """Returns the gradients for the 3 inputs of FusedBatchNormGrad.
-
-  Args:
-    op: The FusedBatchNormGradOp for which we need to compute gradients.
-    *grad: An argument list for tensors of gradients wrt the outputs with
-      grad[0] as grad_grad_x, grad[1] as grad_grad_scale, grad[2] as
-      grad_grad_offset.
-
-  Returns:
-    A tuple (grad_grad_y, grad_x, grad_scale, None, None), where grad_grad_y
-    is the gradient for grad_y, grad_x the gradient for x, grad_scale the
-    gradient for scale.
-  """
-  data_format = op.get_attr("data_format")
-  epsilon = op.get_attr("epsilon")
-  is_training = op.get_attr("is_training")
-  grad_y = op.inputs[0]
-  x = op.inputs[1]
-  scale = op.inputs[2]
-  pop_mean = op.inputs[3]
-  pop_var = op.inputs[4]
-  grad_grad_x = grad[0]
-  grad_grad_scale = grad[1]
-  grad_grad_offset = grad[2]
-  with backprop.GradientTape() as tape:
-    tape.watch(grad_y)
-    tape.watch(x)
-    tape.watch(scale)
-    grad_x, grad_scale, grad_offset = _BatchNormGrad(
-        grad_y, x, scale, pop_mean, pop_var, epsilon, data_format, is_training)
-    grad_initial = [grad_grad_x, grad_grad_scale, grad_grad_offset]
-  grad_grad_y, grad_x, grad_scale = tape.gradient(
-      [grad_x, grad_scale, grad_offset], [grad_y, x, scale], grad_initial)
-  return grad_grad_y, grad_x, grad_scale, None, None
-
-
-@ops.RegisterGradient("FusedBatchNormGradV2")
-def _FusedBatchNormGradGradV2(op: ops.Operation, *grad):
-  return _FusedBatchNormGradGrad(op, *grad)
-
-
-@ops.RegisterGradient("FusedBatchNormGradV3")
-def _FusedBatchNormGradGradV3(op: ops.Operation, *grad):
-  grad_grad_y, grad_x, grad_scale, _, _ = _FusedBatchNormGradGrad(op, *grad)
-  return grad_grad_y, grad_x, grad_scale, None, None, None
-
-
 @ops.RegisterGradient("L2Loss")
 def _L2LossGrad(op: ops.Operation, grad):
   """Return the gradients for L2Loss.
diff --git a/tensorflow/python/ops/nn_ops.py b/tensorflow/python/ops/nn_ops.py
index 0146f18a5f8767..b2f624e106bf93 100644
--- a/tensorflow/python/ops/nn_ops.py
+++ b/tensorflow/python/ops/nn_ops.py
@@ -187,6 +187,8 @@
 from tensorflow.python.ops import gen_math_ops
 from tensorflow.python.ops import gen_nn_ops
 from tensorflow.python.ops import math_ops
+# Ensure all gradients are registered for nn_ops
+from tensorflow.python.ops import nn_grad  # pylint: disable=unused-import
 from tensorflow.python.ops import random_ops
 from tensorflow.python.ops import stateless_random_ops
 from tensorflow.python.ops import variables as variables_lib

From f236ae3bca673d66d70e443dc8432a3e4e01bae2 Mon Sep 17 00:00:00 2001
From: Eugene Zhulenev <ezhulenev@google.com>
Date: Fri, 22 Sep 2023 18:15:08 -0700
Subject: [PATCH 187/567] [stream_executor] NFC: Clean up
 xla/stream_executor:kernel build dependencies

PiperOrigin-RevId: 567763730
---
 third_party/xla/xla/pjrt/gpu/BUILD            |  4 +-
 third_party/xla/xla/stream_executor/BUILD     | 44 ++-----------------
 .../xla/xla/stream_executor/allocator_stats.h |  1 -
 third_party/xla/xla/stream_executor/dnn.h     |  1 -
 .../xla/xla/stream_executor/host/BUILD        |  1 +
 third_party/xla/xla/stream_executor/kernel.cc |  4 +-
 third_party/xla/xla/stream_executor/kernel.h  |  6 +--
 .../stream_executor/stream_executor_pimpl.h   |  1 -
 .../xla/xla/tools/multihost_hlo_runner/BUILD  |  1 +
 9 files changed, 11 insertions(+), 52 deletions(-)

diff --git a/third_party/xla/xla/pjrt/gpu/BUILD b/third_party/xla/xla/pjrt/gpu/BUILD
index 12c50d4c918a80..5de730b142b637 100644
--- a/third_party/xla/xla/pjrt/gpu/BUILD
+++ b/third_party/xla/xla/pjrt/gpu/BUILD
@@ -25,8 +25,8 @@ cc_library(
         "//xla/client:client_library",
         "//xla/client:local_client",
         "//xla/service:platform_util",
+        "//xla/stream_executor",
         "//xla/stream_executor:device_mem_allocator",
-        "//xla/stream_executor:kernel",
         "@com_google_absl//absl/types:span",
         "@local_tsl//tsl/framework:bfc_allocator",
         "@local_tsl//tsl/framework:device_id_impl",
@@ -63,8 +63,8 @@ cc_library(
         "//xla/service:executable",
         "//xla/service:platform_util",
         "//xla/service/gpu:gpu_executable_run_options",
+        "//xla/stream_executor",
         "//xla/stream_executor:device_mem_allocator",
-        "//xla/stream_executor:device_memory",
         "//xla/stream_executor:tf_allocator_adapter",
         "@com_google_absl//absl/base:core_headers",
         "@com_google_absl//absl/container:flat_hash_map",
diff --git a/third_party/xla/xla/stream_executor/BUILD b/third_party/xla/xla/stream_executor/BUILD
index 6783578a4bc150..03f2d6ca7538d3 100644
--- a/third_party/xla/xla/stream_executor/BUILD
+++ b/third_party/xla/xla/stream_executor/BUILD
@@ -469,6 +469,7 @@ cc_library(
         "@com_google_absl//absl/algorithm:container",
         "@com_google_absl//absl/base:core_headers",
         "@com_google_absl//absl/functional:any_invocable",
+        "@com_google_absl//absl/log:check",
         "@com_google_absl//absl/memory",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/synchronization",
@@ -501,6 +502,7 @@ cc_library(
         ":fft",
         ":host_or_device_scalar",
         ":kernel",
+        ":kernel_spec",
         ":launch_dim",
         ":platform",
         ":plugin_registry",
@@ -531,64 +533,25 @@ cc_library(
 
 cc_library(
     name = "kernel",
-    srcs = [
-        "fft.h",
-        "kernel.cc",
-        "plugin.h",
-        "stream.h",
-        "stream_executor_pimpl.h",
-        "temporary_device_memory.h",
-        "temporary_memory_manager.h",
-    ],
+    srcs = ["kernel.cc"],
     hdrs = [
-        "blas.h",
-        "device_description.h",
-        "device_options.h",
-        "event.h",
         "kernel.h",
-        "kernel_spec.h",
-        "launch_dim.h",
         "multi_platform_manager.h",
-        "platform.h",
-        "plugin_registry.h",
-        "stream_executor.h",
-        "stream_executor_internal.h",
-        "trace_listener.h",
     ],
     visibility = ["//visibility:public"],
     deps = [
-        ":allocator_stats",
-        ":data_type",
-        ":device_description",
-        ":device_description_proto_cc",
         ":device_memory",
-        ":device_options",
-        ":fft",
-        ":kernel_cache_config",
-        ":kernel_spec",
-        ":launch_dim",
         ":platform",
-        ":plugin",
-        ":plugin_registry",
         ":stream_executor_headers",
         "//xla/stream_executor/platform",
-        "@com_google_absl//absl/algorithm:container",
-        "@com_google_absl//absl/base:core_headers",
-        "@com_google_absl//absl/functional:any_invocable",
         "@com_google_absl//absl/log:check",
-        "@com_google_absl//absl/memory",
-        "@com_google_absl//absl/status",
         "@com_google_absl//absl/strings",
-        "@com_google_absl//absl/synchronization",
-        "@com_google_absl//absl/types:optional",
         "@com_google_absl//absl/types:span",
-        "@local_tsl//tsl/platform:env",
         "@local_tsl//tsl/platform:errors",
         "@local_tsl//tsl/platform:logging",
         "@local_tsl//tsl/platform:platform_port",
         "@local_tsl//tsl/platform:status",
         "@local_tsl//tsl/platform:statusor",
-        "@local_tsl//tsl/protobuf:dnn_proto_cc",
     ],
 )
 
@@ -788,6 +751,7 @@ cc_library(
         ":dnn_proto_cc",
         ":event",
         ":kernel",
+        ":kernel_spec",
         ":launch_dim",
         ":multi_platform_manager",
         ":platform",
diff --git a/third_party/xla/xla/stream_executor/allocator_stats.h b/third_party/xla/xla/stream_executor/allocator_stats.h
index d073d08cb92d5b..1e15d0c51e64b9 100644
--- a/third_party/xla/xla/stream_executor/allocator_stats.h
+++ b/third_party/xla/xla/stream_executor/allocator_stats.h
@@ -18,7 +18,6 @@ limitations under the License.
 
 #include <string>
 
-#include "absl/types/optional.h"
 #include "xla/stream_executor/platform/port.h"
 
 namespace stream_executor {
diff --git a/third_party/xla/xla/stream_executor/dnn.h b/third_party/xla/xla/stream_executor/dnn.h
index 1cf598e7f37141..68f18f635048bc 100644
--- a/third_party/xla/xla/stream_executor/dnn.h
+++ b/third_party/xla/xla/stream_executor/dnn.h
@@ -35,7 +35,6 @@ limitations under the License.
 #include <vector>
 
 #include "google/protobuf/wrappers.pb.h"
-#include "absl/types/optional.h"
 #include "absl/types/span.h"
 #include "xla/stream_executor/data_type.h"
 #include "xla/stream_executor/device_description.h"
diff --git a/third_party/xla/xla/stream_executor/host/BUILD b/third_party/xla/xla/stream_executor/host/BUILD
index 1cea4b7e5485eb..21095c57941840 100644
--- a/third_party/xla/xla/stream_executor/host/BUILD
+++ b/third_party/xla/xla/stream_executor/host/BUILD
@@ -64,6 +64,7 @@ cc_library(
     visibility = ["//visibility:public"],
     deps = [
         "//xla/stream_executor:kernel",
+        "//xla/stream_executor:stream_executor_internal",
         "@com_google_absl//absl/functional:any_invocable",
         "@com_google_absl//absl/synchronization",
         "@local_tsl//tsl/platform:denormal",
diff --git a/third_party/xla/xla/stream_executor/kernel.cc b/third_party/xla/xla/stream_executor/kernel.cc
index d09f25873c8555..64e6f0191f22f4 100644
--- a/third_party/xla/xla/stream_executor/kernel.cc
+++ b/third_party/xla/xla/stream_executor/kernel.cc
@@ -19,13 +19,13 @@ limitations under the License.
 
 #include "xla/stream_executor/kernel.h"
 
+#include <utility>
+
 #include "absl/strings/string_view.h"
 #include "absl/strings/strip.h"
 #include "xla/stream_executor/platform.h"
-#include "xla/stream_executor/platform/port.h"
 #include "xla/stream_executor/stream_executor.h"
 #include "tsl/platform/demangle.h"
-#include "tsl/platform/logging.h"
 
 namespace stream_executor {
 
diff --git a/third_party/xla/xla/stream_executor/kernel.h b/third_party/xla/xla/stream_executor/kernel.h
index 8991fcaa8404bf..61ba6b62f1fd85 100644
--- a/third_party/xla/xla/stream_executor/kernel.h
+++ b/third_party/xla/xla/stream_executor/kernel.h
@@ -75,20 +75,16 @@ limitations under the License.
 #include <string>
 #include <tuple>
 #include <type_traits>
-#include <vector>
 
+#include "absl/log/check.h"
 #include "absl/strings/string_view.h"
 #include "absl/types/span.h"
 #include "xla/stream_executor/device_memory.h"
 #include "xla/stream_executor/kernel_cache_config.h"
 #include "xla/stream_executor/platform/port.h"
-#include "tsl/platform/logging.h"
 
 namespace stream_executor {
 
-class DeviceMemoryBase;
-template <typename ElemT>
-class DeviceMemory;
 class StreamExecutor;
 
 namespace internal {
diff --git a/third_party/xla/xla/stream_executor/stream_executor_pimpl.h b/third_party/xla/xla/stream_executor/stream_executor_pimpl.h
index 1a208a303e22ae..c41ef6db37cc61 100644
--- a/third_party/xla/xla/stream_executor/stream_executor_pimpl.h
+++ b/third_party/xla/xla/stream_executor/stream_executor_pimpl.h
@@ -28,7 +28,6 @@ limitations under the License.
 #include "absl/base/thread_annotations.h"
 #include "absl/functional/any_invocable.h"
 #include "absl/synchronization/mutex.h"
-#include "absl/types/optional.h"
 #include "absl/types/span.h"
 #include "xla/stream_executor/command_buffer.h"
 #include "xla/stream_executor/device_memory_allocator.h"
diff --git a/third_party/xla/xla/tools/multihost_hlo_runner/BUILD b/third_party/xla/xla/tools/multihost_hlo_runner/BUILD
index 01bff3c03b4f2f..dc1723bcce97ab 100644
--- a/third_party/xla/xla/tools/multihost_hlo_runner/BUILD
+++ b/third_party/xla/xla/tools/multihost_hlo_runner/BUILD
@@ -35,6 +35,7 @@ xla_cc_binary(
         ":hlo_runner_flags",
         "//xla:debug_options_flags",
         "//xla:status",
+        "//xla/stream_executor",
         "@com_google_absl//absl/strings",
         "@local_tsl//tsl/platform:logging",
         "@local_tsl//tsl/platform:platform_port",

From b0f08aa26133f3373d626381a58c04246dd35a11 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 22 Sep 2023 20:27:03 -0700
Subject: [PATCH 188/567] Add convenience method to get memory spaces by kind

PiperOrigin-RevId: 567781230
---
 third_party/xla/xla/pjrt/BUILD         |  7 +++++++
 third_party/xla/xla/pjrt/pjrt_client.h | 18 ++++++++++++++++++
 2 files changed, 25 insertions(+)

diff --git a/third_party/xla/xla/pjrt/BUILD b/third_party/xla/xla/pjrt/BUILD
index dac1259700f4ee..bf8cce6944164a 100644
--- a/third_party/xla/xla/pjrt/BUILD
+++ b/third_party/xla/xla/pjrt/BUILD
@@ -169,6 +169,7 @@ cc_library(
     hdrs = ["pjrt_client.h"],
     visibility = ["//visibility:public"],
     deps = [
+        ":pjrt_common",
         ":pjrt_compiler",
         ":pjrt_device_description",
         ":pjrt_executable",
@@ -182,13 +183,19 @@ cc_library(
         "//xla:xla_data_proto_cc",
         "//xla/client:xla_computation",
         "//xla/hlo/ir:hlo",
+        "//xla/service:computation_placer_hdr",
         "//xla/service:hlo_cost_analysis",
         "@com_google_absl//absl/base",
         "@com_google_absl//absl/base:core_headers",
         "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/container:inlined_vector",
+        "@com_google_absl//absl/functional:any_invocable",
+        "@com_google_absl//absl/log",
+        "@com_google_absl//absl/log:check",
+        "@com_google_absl//absl/status:statusor",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/synchronization",
+        "@com_google_absl//absl/time",
         "@com_google_absl//absl/types:span",
         "@llvm-project//mlir:IR",
         "@local_tsl//tsl/framework:allocator",
diff --git a/third_party/xla/xla/pjrt/pjrt_client.h b/third_party/xla/xla/pjrt/pjrt_client.h
index 5038a3e69dd399..4c32a1b81fd180 100644
--- a/third_party/xla/xla/pjrt/pjrt_client.h
+++ b/third_party/xla/xla/pjrt/pjrt_client.h
@@ -16,7 +16,9 @@ limitations under the License.
 #ifndef XLA_PJRT_PJRT_CLIENT_H_
 #define XLA_PJRT_PJRT_CLIENT_H_
 
+#include <cstddef>
 #include <cstdint>
+#include <cstdlib>
 #include <functional>
 #include <memory>
 #include <optional>
@@ -25,20 +27,31 @@ limitations under the License.
 #include <vector>
 
 #include "absl/base/attributes.h"
+#include "absl/base/thread_annotations.h"
 #include "absl/container/flat_hash_map.h"
 #include "absl/container/inlined_vector.h"
+#include "absl/functional/any_invocable.h"
+#include "absl/log/check.h"
+#include "absl/log/log.h"
+#include "absl/status/statusor.h"
 #include "absl/strings/string_view.h"
+#include "absl/synchronization/mutex.h"
 #include "absl/synchronization/notification.h"
+#include "absl/time/time.h"
 #include "absl/types/span.h"
 #include "mlir/IR/BuiltinOps.h"  // from @llvm-project
 #include "xla/client/xla_computation.h"
+#include "xla/layout.h"
 #include "xla/literal.h"
+#include "xla/pjrt/pjrt_common.h"
 #include "xla/pjrt/pjrt_compiler.h"
 #include "xla/pjrt/pjrt_device_description.h"
 #include "xla/pjrt/pjrt_executable.h"
 #include "xla/pjrt/pjrt_future.h"
+#include "xla/service/computation_placer.h"
 #include "xla/service/hlo_cost_analysis.h"
 #include "xla/shape.h"
+#include "xla/shape_util.h"
 #include "xla/status.h"
 #include "xla/statusor.h"
 #include "xla/util.h"
@@ -177,6 +190,11 @@ class PjRtDevice {
   // Returns the default memory space attached to this device.
   virtual StatusOr<PjRtMemorySpace*> default_memory_space() const = 0;
 
+  virtual absl::StatusOr<PjRtMemorySpace*> memory_space_by_kind(
+      absl::string_view memory_space_kind) const {
+    return Unimplemented("memory_space_by_kind not implemented");
+  }
+
   // Returns a platform-specific stream handle that should be used to track when
   // an externally-managed buffer is ready to use on this device. This is
   // intended to support dlpack on GPU and is not expected to be implemented for

From c2dfd70f7a3fa079dc53857571e4c406b254105c Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 22 Sep 2023 22:10:53 -0700
Subject: [PATCH 189/567] [stream_executor] NFC: Clean up
 xla/stream_executor:kernel build dependencies

PiperOrigin-RevId: 567794680
---
 third_party/xla/xla/pjrt/gpu/BUILD            |  4 +-
 third_party/xla/xla/stream_executor/BUILD     | 44 +++++++++++++++++--
 .../xla/xla/stream_executor/allocator_stats.h |  1 +
 third_party/xla/xla/stream_executor/dnn.h     |  1 +
 .../xla/xla/stream_executor/host/BUILD        |  1 -
 third_party/xla/xla/stream_executor/kernel.cc |  4 +-
 third_party/xla/xla/stream_executor/kernel.h  |  6 ++-
 .../stream_executor/stream_executor_pimpl.h   |  1 +
 .../xla/xla/tools/multihost_hlo_runner/BUILD  |  1 -
 9 files changed, 52 insertions(+), 11 deletions(-)

diff --git a/third_party/xla/xla/pjrt/gpu/BUILD b/third_party/xla/xla/pjrt/gpu/BUILD
index 5de730b142b637..12c50d4c918a80 100644
--- a/third_party/xla/xla/pjrt/gpu/BUILD
+++ b/third_party/xla/xla/pjrt/gpu/BUILD
@@ -25,8 +25,8 @@ cc_library(
         "//xla/client:client_library",
         "//xla/client:local_client",
         "//xla/service:platform_util",
-        "//xla/stream_executor",
         "//xla/stream_executor:device_mem_allocator",
+        "//xla/stream_executor:kernel",
         "@com_google_absl//absl/types:span",
         "@local_tsl//tsl/framework:bfc_allocator",
         "@local_tsl//tsl/framework:device_id_impl",
@@ -63,8 +63,8 @@ cc_library(
         "//xla/service:executable",
         "//xla/service:platform_util",
         "//xla/service/gpu:gpu_executable_run_options",
-        "//xla/stream_executor",
         "//xla/stream_executor:device_mem_allocator",
+        "//xla/stream_executor:device_memory",
         "//xla/stream_executor:tf_allocator_adapter",
         "@com_google_absl//absl/base:core_headers",
         "@com_google_absl//absl/container:flat_hash_map",
diff --git a/third_party/xla/xla/stream_executor/BUILD b/third_party/xla/xla/stream_executor/BUILD
index 03f2d6ca7538d3..6783578a4bc150 100644
--- a/third_party/xla/xla/stream_executor/BUILD
+++ b/third_party/xla/xla/stream_executor/BUILD
@@ -469,7 +469,6 @@ cc_library(
         "@com_google_absl//absl/algorithm:container",
         "@com_google_absl//absl/base:core_headers",
         "@com_google_absl//absl/functional:any_invocable",
-        "@com_google_absl//absl/log:check",
         "@com_google_absl//absl/memory",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/synchronization",
@@ -502,7 +501,6 @@ cc_library(
         ":fft",
         ":host_or_device_scalar",
         ":kernel",
-        ":kernel_spec",
         ":launch_dim",
         ":platform",
         ":plugin_registry",
@@ -533,25 +531,64 @@ cc_library(
 
 cc_library(
     name = "kernel",
-    srcs = ["kernel.cc"],
+    srcs = [
+        "fft.h",
+        "kernel.cc",
+        "plugin.h",
+        "stream.h",
+        "stream_executor_pimpl.h",
+        "temporary_device_memory.h",
+        "temporary_memory_manager.h",
+    ],
     hdrs = [
+        "blas.h",
+        "device_description.h",
+        "device_options.h",
+        "event.h",
         "kernel.h",
+        "kernel_spec.h",
+        "launch_dim.h",
         "multi_platform_manager.h",
+        "platform.h",
+        "plugin_registry.h",
+        "stream_executor.h",
+        "stream_executor_internal.h",
+        "trace_listener.h",
     ],
     visibility = ["//visibility:public"],
     deps = [
+        ":allocator_stats",
+        ":data_type",
+        ":device_description",
+        ":device_description_proto_cc",
         ":device_memory",
+        ":device_options",
+        ":fft",
+        ":kernel_cache_config",
+        ":kernel_spec",
+        ":launch_dim",
         ":platform",
+        ":plugin",
+        ":plugin_registry",
         ":stream_executor_headers",
         "//xla/stream_executor/platform",
+        "@com_google_absl//absl/algorithm:container",
+        "@com_google_absl//absl/base:core_headers",
+        "@com_google_absl//absl/functional:any_invocable",
         "@com_google_absl//absl/log:check",
+        "@com_google_absl//absl/memory",
+        "@com_google_absl//absl/status",
         "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/synchronization",
+        "@com_google_absl//absl/types:optional",
         "@com_google_absl//absl/types:span",
+        "@local_tsl//tsl/platform:env",
         "@local_tsl//tsl/platform:errors",
         "@local_tsl//tsl/platform:logging",
         "@local_tsl//tsl/platform:platform_port",
         "@local_tsl//tsl/platform:status",
         "@local_tsl//tsl/platform:statusor",
+        "@local_tsl//tsl/protobuf:dnn_proto_cc",
     ],
 )
 
@@ -751,7 +788,6 @@ cc_library(
         ":dnn_proto_cc",
         ":event",
         ":kernel",
-        ":kernel_spec",
         ":launch_dim",
         ":multi_platform_manager",
         ":platform",
diff --git a/third_party/xla/xla/stream_executor/allocator_stats.h b/third_party/xla/xla/stream_executor/allocator_stats.h
index 1e15d0c51e64b9..d073d08cb92d5b 100644
--- a/third_party/xla/xla/stream_executor/allocator_stats.h
+++ b/third_party/xla/xla/stream_executor/allocator_stats.h
@@ -18,6 +18,7 @@ limitations under the License.
 
 #include <string>
 
+#include "absl/types/optional.h"
 #include "xla/stream_executor/platform/port.h"
 
 namespace stream_executor {
diff --git a/third_party/xla/xla/stream_executor/dnn.h b/third_party/xla/xla/stream_executor/dnn.h
index 68f18f635048bc..1cf598e7f37141 100644
--- a/third_party/xla/xla/stream_executor/dnn.h
+++ b/third_party/xla/xla/stream_executor/dnn.h
@@ -35,6 +35,7 @@ limitations under the License.
 #include <vector>
 
 #include "google/protobuf/wrappers.pb.h"
+#include "absl/types/optional.h"
 #include "absl/types/span.h"
 #include "xla/stream_executor/data_type.h"
 #include "xla/stream_executor/device_description.h"
diff --git a/third_party/xla/xla/stream_executor/host/BUILD b/third_party/xla/xla/stream_executor/host/BUILD
index 21095c57941840..1cea4b7e5485eb 100644
--- a/third_party/xla/xla/stream_executor/host/BUILD
+++ b/third_party/xla/xla/stream_executor/host/BUILD
@@ -64,7 +64,6 @@ cc_library(
     visibility = ["//visibility:public"],
     deps = [
         "//xla/stream_executor:kernel",
-        "//xla/stream_executor:stream_executor_internal",
         "@com_google_absl//absl/functional:any_invocable",
         "@com_google_absl//absl/synchronization",
         "@local_tsl//tsl/platform:denormal",
diff --git a/third_party/xla/xla/stream_executor/kernel.cc b/third_party/xla/xla/stream_executor/kernel.cc
index 64e6f0191f22f4..d09f25873c8555 100644
--- a/third_party/xla/xla/stream_executor/kernel.cc
+++ b/third_party/xla/xla/stream_executor/kernel.cc
@@ -19,13 +19,13 @@ limitations under the License.
 
 #include "xla/stream_executor/kernel.h"
 
-#include <utility>
-
 #include "absl/strings/string_view.h"
 #include "absl/strings/strip.h"
 #include "xla/stream_executor/platform.h"
+#include "xla/stream_executor/platform/port.h"
 #include "xla/stream_executor/stream_executor.h"
 #include "tsl/platform/demangle.h"
+#include "tsl/platform/logging.h"
 
 namespace stream_executor {
 
diff --git a/third_party/xla/xla/stream_executor/kernel.h b/third_party/xla/xla/stream_executor/kernel.h
index 61ba6b62f1fd85..8991fcaa8404bf 100644
--- a/third_party/xla/xla/stream_executor/kernel.h
+++ b/third_party/xla/xla/stream_executor/kernel.h
@@ -75,16 +75,20 @@ limitations under the License.
 #include <string>
 #include <tuple>
 #include <type_traits>
+#include <vector>
 
-#include "absl/log/check.h"
 #include "absl/strings/string_view.h"
 #include "absl/types/span.h"
 #include "xla/stream_executor/device_memory.h"
 #include "xla/stream_executor/kernel_cache_config.h"
 #include "xla/stream_executor/platform/port.h"
+#include "tsl/platform/logging.h"
 
 namespace stream_executor {
 
+class DeviceMemoryBase;
+template <typename ElemT>
+class DeviceMemory;
 class StreamExecutor;
 
 namespace internal {
diff --git a/third_party/xla/xla/stream_executor/stream_executor_pimpl.h b/third_party/xla/xla/stream_executor/stream_executor_pimpl.h
index c41ef6db37cc61..1a208a303e22ae 100644
--- a/third_party/xla/xla/stream_executor/stream_executor_pimpl.h
+++ b/third_party/xla/xla/stream_executor/stream_executor_pimpl.h
@@ -28,6 +28,7 @@ limitations under the License.
 #include "absl/base/thread_annotations.h"
 #include "absl/functional/any_invocable.h"
 #include "absl/synchronization/mutex.h"
+#include "absl/types/optional.h"
 #include "absl/types/span.h"
 #include "xla/stream_executor/command_buffer.h"
 #include "xla/stream_executor/device_memory_allocator.h"
diff --git a/third_party/xla/xla/tools/multihost_hlo_runner/BUILD b/third_party/xla/xla/tools/multihost_hlo_runner/BUILD
index dc1723bcce97ab..01bff3c03b4f2f 100644
--- a/third_party/xla/xla/tools/multihost_hlo_runner/BUILD
+++ b/third_party/xla/xla/tools/multihost_hlo_runner/BUILD
@@ -35,7 +35,6 @@ xla_cc_binary(
         ":hlo_runner_flags",
         "//xla:debug_options_flags",
         "//xla:status",
-        "//xla/stream_executor",
         "@com_google_absl//absl/strings",
         "@local_tsl//tsl/platform:logging",
         "@local_tsl//tsl/platform:platform_port",

From 9a4bde88bf5779ef0f39ef46eda4dc4e6090cf37 Mon Sep 17 00:00:00 2001
From: Jared Junyoung Lim <jaredlim@google.com>
Date: Sat, 23 Sep 2023 00:09:02 -0700
Subject: [PATCH 190/567] Sub supports broadcasting up to 6 dimensions

PiperOrigin-RevId: 567812015
---
 RELEASE.md                                    |   2 +-
 tensorflow/compiler/mlir/lite/ir/tfl_ops.cc   |   4 +-
 .../compiler/mlir/lite/tests/legalize-tf.mlir |  25 +-
 tensorflow/compiler/mlir/lite/tests/ops.mlir  |  11 +-
 tensorflow/lite/kernels/BUILD                 |   3 +-
 .../lite/kernels/internal/reference/sub.h     | 350 ++++-----
 tensorflow/lite/kernels/sub_test.cc           | 672 +++++++++++++++++-
 7 files changed, 825 insertions(+), 242 deletions(-)

diff --git a/RELEASE.md b/RELEASE.md
index 465dc46bed5cf1..5e6cc62a627a62 100644
--- a/RELEASE.md
+++ b/RELEASE.md
@@ -45,7 +45,7 @@
 
 *   `tf.lite`:
 
-    *   `mul_op` supports broadcasting up to 6 dimensions.
+    *   `sub_op` and `mul_op` support broadcasting up to 6 dimensions.
 
     *  The `tflite::SignatureRunner` class, which provides support for named
        parameters and for multiple named computations within a single TF Lite
diff --git a/tensorflow/compiler/mlir/lite/ir/tfl_ops.cc b/tensorflow/compiler/mlir/lite/ir/tfl_ops.cc
index 19fb5b3b4e0d41..939d840f404445 100644
--- a/tensorflow/compiler/mlir/lite/ir/tfl_ops.cc
+++ b/tensorflow/compiler/mlir/lite/ir/tfl_ops.cc
@@ -364,7 +364,7 @@ bool VerifySubOpShapeConstraints(SubOp op) {
       IsQI16Type(element_type)) {
     return VerifyOperandsHaveSameShapesOrBroadcastableShape(
         /*op=*/op.getOperation(), /*indices=*/ArrayRef<unsigned>{0, 1},
-        /*max_bcast_rank=*/5);
+        /*max_bcast_rank=*/6);
   }
 
   // Allows QI8 output when the operands have valid shapes, which are
@@ -372,7 +372,7 @@ bool VerifySubOpShapeConstraints(SubOp op) {
   if (IsQI8Type(element_type)) {
     return VerifyOperandsHaveSameShapesOrBroadcastableShape(
         /*op=*/op.getOperation(), /*indices=*/ArrayRef<unsigned>{0, 1},
-        /*max_bcast_rank=*/4);
+        /*max_bcast_rank=*/6);
   }
   return false;
 }
diff --git a/tensorflow/compiler/mlir/lite/tests/legalize-tf.mlir b/tensorflow/compiler/mlir/lite/tests/legalize-tf.mlir
index 953bd4a455f0dc..444a494f73769a 100644
--- a/tensorflow/compiler/mlir/lite/tests/legalize-tf.mlir
+++ b/tensorflow/compiler/mlir/lite/tests/legalize-tf.mlir
@@ -1925,16 +1925,29 @@ func.func @add_with_int32_7d_inputs(%arg0: tensor<1x1x1x1x1x3x1xi32>, %arg1 : te
 // CHECK: %2 = tfl.add %0, %1 {fused_activation_function = "NONE"} : tensor<1x1x1x1x1x3x4xi32>
 }
 
-// CHECK-LABEL: testSubWithBroadcastToOps
-func.func @testSubWithBroadcastToOps(%arg0: tensor<1x2x1x4x5x6xi32>, %arg1: tensor<1x2x3x4x5x1xi32>) -> tensor<1x2x3x4x5x6xi32> {
-  // CHECK: [[CST:%.*]] = arith.constant dense<[1, 2, 3, 4, 5, 6]> : tensor<6xi64>
-  // CHECK: [[BCAST:%.*]] = "tfl.broadcast_to"(%arg0, [[CST]])
-  // CHECK: [[BCAST_1:%.*]] = "tfl.broadcast_to"(%arg1, [[CST]])
-  // CHECK: tfl.sub [[BCAST]], [[BCAST_1]] {fused_activation_function = "NONE"} : tensor<1x2x3x4x5x6xi32>
+func.func @test5DSubWithImplicitBroadcast(%arg0: tensor<1x1x1x3x1xi32>, %arg1 : tensor<1x1x1x1x4xi32>) -> tensor<1x1x1x3x4xi32> {
+  %0 = "tf.Sub"(%arg0, %arg1): (tensor<1x1x1x3x1xi32>, tensor<1x1x1x1x4xi32>) -> tensor<1x1x1x3x4xi32>
+  func.return %0 : tensor<1x1x1x3x4xi32>
+// CHECK-LABEL: test5DSubWithImplicitBroadcast
+// CHECK: %0 = tfl.sub(%arg0, %arg1) {fused_activation_function = "NONE"} : (tensor<1x1x1x3x1xi32>, tensor<1x1x1x1x4xi32>) -> tensor<1x1x1x3x4xi32>
+}
+
+func.func @test6DSubWithImplicitBroadcast(%arg0: tensor<1x2x1x4x5x6xi32>, %arg1: tensor<1x2x3x4x5x1xi32>) -> tensor<1x2x3x4x5x6xi32> {
+// CHECK-LABEL: test6DSubWithImplicitBroadcast
+// CHECK:  %0 = tfl.sub(%arg0, %arg1) {fused_activation_function = "NONE"} : (tensor<1x2x1x4x5x6xi32>, tensor<1x2x3x4x5x1xi32>) -> tensor<1x2x3x4x5x6xi32>
   %0 = "tf.Sub"(%arg0, %arg1) : (tensor<1x2x1x4x5x6xi32>, tensor<1x2x3x4x5x1xi32>) -> tensor<1x2x3x4x5x6xi32>
   func.return %0 : tensor<1x2x3x4x5x6xi32>
 }
 
+func.func @sub_with_int32_7d_inputs(%arg0: tensor<1x1x1x1x1x3x1xi32>, %arg1 : tensor<1x1x1x1x1x1x4xi32>) -> tensor<1x1x1x1x1x3x4xi32> {
+  %0 = "tf.Sub"(%arg0, %arg1): (tensor<1x1x1x1x1x3x1xi32>, tensor<1x1x1x1x1x1x4xi32>) -> tensor<1x1x1x1x1x3x4xi32>
+  func.return %0 : tensor<1x1x1x1x1x3x4xi32>
+// CHECK-LABEL: sub_with_int32_7d_inputs
+// CHECK: %0 = "tfl.broadcast_to"(%arg0, %cst) : (tensor<1x1x1x1x1x3x1xi32>, tensor<7xi64>) -> tensor<1x1x1x1x1x3x4xi32>
+// CHECK: %1 = "tfl.broadcast_to"(%arg1, %cst) : (tensor<1x1x1x1x1x1x4xi32>, tensor<7xi64>) -> tensor<1x1x1x1x1x3x4xi32>
+// CHECK: %2 = tfl.sub %0, %1 {fused_activation_function = "NONE"} : tensor<1x1x1x1x1x3x4xi32>
+}
+
 func.func @test5DMulWithImplicitBroadcast(%arg0: tensor<1x1x1x3x1xi32>, %arg1 : tensor<1x1x1x1x4xi32>) -> tensor<1x1x1x3x4xi32> {
   %0 = "tf.Mul"(%arg0, %arg1): (tensor<1x1x1x3x1xi32>, tensor<1x1x1x1x4xi32>) -> tensor<1x1x1x3x4xi32>
   func.return %0 : tensor<1x1x1x3x4xi32>
diff --git a/tensorflow/compiler/mlir/lite/tests/ops.mlir b/tensorflow/compiler/mlir/lite/tests/ops.mlir
index 28ff3abdc67b65..a2ca1158c5405f 100644
--- a/tensorflow/compiler/mlir/lite/tests/ops.mlir
+++ b/tensorflow/compiler/mlir/lite/tests/ops.mlir
@@ -411,9 +411,18 @@ func.func @add_with_quantized_i16_broadcasting(tensor<2x2xf32>, tensor<1xf32>) -
 
 // -----
 
+// CHECK-LABEL: sub_with_i32_five_dim_broadcasting
+func.func @sub_with_i32_five_dim_broadcasting(tensor<1x1x1x1x1xi32>, tensor<1xi32>) -> tensor<1x1x1x1x1xi32> {
+^bb0(%arg0: tensor<1x1x1x1x1xi32>, %arg1: tensor<1xi32>):
+  // CHECK: tfl.sub(%arg0, %arg1) {fused_activation_function = "RELU6"}
+  %0 = "tfl.sub"(%arg0, %arg1) {fused_activation_function = "RELU6"} : (tensor<1x1x1x1x1xi32>, tensor<1xi32>) -> tensor<1x1x1x1x1xi32>
+  func.return %0#0 : tensor<1x1x1x1x1xi32>
+}
+
+// -----
+
 func.func @sub_with_quantized_i8_five_dim_broadcasting(tensor<1x1x1x1x1xf32>, tensor<1xf32>) -> tensor<1x1x1x1x1x!quant.any<i8:f32>> {
 ^bb0(%arg0: tensor<1x1x1x1x1xf32>, %arg1: tensor<1xf32>):
-  // expected-error @+1 {{Operands do not have valid shapes}}
   %0 = "tfl.sub"(%arg0, %arg1) {fused_activation_function = "RELU6"} : (tensor<1x1x1x1x1xf32>, tensor<1xf32>) -> tensor<1x1x1x1x1x!quant.any<i8:f32>>
   func.return %0#0 : tensor<1x1x1x1x1x!quant.any<i8:f32>>
 }
diff --git a/tensorflow/lite/kernels/BUILD b/tensorflow/lite/kernels/BUILD
index f69a197d0b1eb0..b641ecdeadae14 100644
--- a/tensorflow/lite/kernels/BUILD
+++ b/tensorflow/lite/kernels/BUILD
@@ -1181,8 +1181,9 @@ cc_test(
 
 cc_test(
     name = "sub_test",
-    size = "small",
+    size = "medium",
     srcs = ["sub_test.cc"],
+    shard_count = 10,
     tags = ["tflite_nnapi"],
     deps = [
         ":test_main",
diff --git a/tensorflow/lite/kernels/internal/reference/sub.h b/tensorflow/lite/kernels/internal/reference/sub.h
index d0ebc95ada0851..50e8ec1ee06f53 100644
--- a/tensorflow/lite/kernels/internal/reference/sub.h
+++ b/tensorflow/lite/kernels/internal/reference/sub.h
@@ -29,6 +29,9 @@ namespace tflite {
 
 namespace reference_ops {
 
+// Maximum dimension supported by the broadcast sub operation.
+constexpr int kMaxSubBroadcastDim = 6;
+
 inline void SubNonBroadcast(const ArithmeticParams& params,
                             const RuntimeShape& input1_shape,
                             const float* input1_data,
@@ -61,144 +64,61 @@ inline void SubNonBroadcast(const ArithmeticParams& params,
   }
 }
 
+template <typename T, typename F>
+void BroadcastSubRecursiveDimensions(
+    const ArithmeticParams& params, int dimension, const T* input1_data,
+    const T* input2_data, T* output_data, size_t* input1_offset_p,
+    size_t* input2_offset_p, size_t* output_offset,
+    const NdArrayDesc<kMaxSubBroadcastDim>& desc1,
+    const NdArrayDesc<kMaxSubBroadcastDim>& desc2,
+    const int32_t extended_output_shape_dims[kMaxSubBroadcastDim],
+    F binary_func) {
+  if (dimension == kMaxSubBroadcastDim - 1) {
+    for (int c = 0; c < extended_output_shape_dims[dimension]; ++c) {
+      const T input1_val = input1_data[*input1_offset_p];
+      const T input2_val = input2_data[*input2_offset_p];
+      output_data[*output_offset] = binary_func(params, input1_val, input2_val);
+      *input1_offset_p += desc1.strides[dimension];
+      *input2_offset_p += desc2.strides[dimension];
+      ++(*output_offset);
+    }
+  } else {
+    for (int a = 0; a < extended_output_shape_dims[dimension]; ++a) {
+      size_t input1_offset_c = *input1_offset_p;
+      size_t input2_offset_c = *input2_offset_p;
+      BroadcastSubRecursiveDimensions(
+          params, dimension + 1, input1_data, input2_data, output_data,
+          &input1_offset_c, &input2_offset_c, output_offset, desc1, desc2,
+          extended_output_shape_dims, binary_func);
+      *input1_offset_p += desc1.strides[dimension];
+      *input2_offset_p += desc2.strides[dimension];
+    }
+  }
+}
+
 // TODO(b/151345304): We can implement BroadcastSub on buffers of arbitrary
 // dimensionality if the runtime code does a single loop over one dimension
 // that handles broadcasting as the base case. The code generator would then
 // generate max(D1, D2) nested for loops.
-template <int N = 5>
-inline void BroadcastSubSlow(const ArithmeticParams& params,
-                             const RuntimeShape& input1_shape,
-                             const float* input1_data,
-                             const RuntimeShape& input2_shape,
-                             const float* input2_data,
-                             const RuntimeShape& output_shape,
-                             float* output_data) {
-  ruy::profiler::ScopeLabel label("BroadcastSubSlow/float");
-  TFLITE_DCHECK_LE(input1_shape.DimensionsCount(), N);
-  TFLITE_DCHECK_LE(input2_shape.DimensionsCount(), N);
-  TFLITE_DCHECK_LE(output_shape.DimensionsCount(), N);
-  NdArrayDesc<N> desc1;
-  NdArrayDesc<N> desc2;
-  NdArrayDesc<N> output_desc;
-  NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
-                                      &desc2);
-  CopyDimsToDesc(RuntimeShape::ExtendedShape(N, output_shape), &output_desc);
-
-  // In Tensorflow, the dimensions are canonically named (batch_number, row,
-  // col, channel), with extents (batches, height, width, depth), with the
-  // trailing dimension changing most rapidly (channels has the smallest stride,
-  // typically 1 element).
-  //
-  // In generated C code, we store arrays with the dimensions reversed. The
-  // first dimension has smallest stride.
-  //
-  // We name our variables by their Tensorflow convention, but generate C code
-  // nesting loops such that the innermost loop has the smallest stride for the
-  // best cache behavior.
-  auto sub_func = [&](int indexes[N]) {
-    output_data[SubscriptToIndex(output_desc, indexes)] =
-        ActivationFunctionWithMinMax(
-            input1_data[SubscriptToIndex(desc1, indexes)] -
-                input2_data[SubscriptToIndex(desc2, indexes)],
-            params.float_activation_min, params.float_activation_max);
-  };
-  NDOpsHelper<N>(output_desc, sub_func);
-}
-
-template <int N = 5>
-inline void BroadcastSubSlow(const ArithmeticParams& params,
-                             const RuntimeShape& input1_shape,
-                             const int32_t* input1_data,
-                             const RuntimeShape& input2_shape,
-                             const int32_t* input2_data,
-                             const RuntimeShape& output_shape,
-                             int32_t* output_data) {
-  ruy::profiler::ScopeLabel label("BroadcastSubSlow/int32_t");
-  TFLITE_DCHECK_LE(input1_shape.DimensionsCount(), N);
-  TFLITE_DCHECK_LE(input2_shape.DimensionsCount(), N);
-  TFLITE_DCHECK_LE(output_shape.DimensionsCount(), N);
-  NdArrayDesc<N> desc1;
-  NdArrayDesc<N> desc2;
-  NdArrayDesc<N> output_desc;
-  NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
-                                      &desc2);
-  CopyDimsToDesc(RuntimeShape::ExtendedShape(N, output_shape), &output_desc);
-
-  // In Tensorflow, the dimensions are canonically named (batch_number, row,
-  // col, channel), with extents (batches, height, width, depth), with the
-  // trailing dimension changing most rapidly (channels has the smallest stride,
-  // typically 1 element).
-  //
-  // In generated C code, we store arrays with the dimensions reversed. The
-  // first dimension has smallest stride.
-  //
-  // We name our variables by their Tensorflow convention, but generate C code
-  // nesting loops such that the innermost loop has the smallest stride for the
-  // best cache behavior.
-  auto sub_func = [&](int indexes[N]) {
-    output_data[SubscriptToIndex(output_desc, indexes)] =
-        ActivationFunctionWithMinMax(
-            input1_data[SubscriptToIndex(desc1, indexes)] -
-                input2_data[SubscriptToIndex(desc2, indexes)],
-            params.quantized_activation_min, params.quantized_activation_max);
-  };
-  NDOpsHelper<N>(output_desc, sub_func);
-}
-
-template <int N = 5>
-void BroadcastSubSlow(const ArithmeticParams& params,
-                      const RuntimeShape& input1_shape,
-                      const int64_t* input1_data,
-                      const RuntimeShape& input2_shape,
-                      const int64_t* input2_data,
-                      const RuntimeShape& output_shape, int64_t* output_data) {
-  ruy::profiler::ScopeLabel label("BroadcastSubSlow/int64_t");
-  TFLITE_DCHECK_LE(input1_shape.DimensionsCount(), N);
-  TFLITE_DCHECK_LE(input2_shape.DimensionsCount(), N);
-  TFLITE_DCHECK_LE(output_shape.DimensionsCount(), N);
-  NdArrayDesc<N> desc1;
-  NdArrayDesc<N> desc2;
-  NdArrayDesc<N> output_desc;
-  NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
-                                      &desc2);
-  CopyDimsToDesc(RuntimeShape::ExtendedShape(N, output_shape), &output_desc);
-
-  // In Tensorflow, the dimensions are canonically named (batch_number, row,
-  // col, channel), with extents (batches, height, width, depth), with the
-  // trailing dimension changing most rapidly (channels has the smallest stride,
-  // typically 1 element).
-  //
-  // In generated C code, we store arrays with the dimensions reversed. The
-  // first dimension has smallest stride.
-  //
-  // We name our variables by their Tensorflow convention, but generate C code
-  // nesting loops such that the innermost loop has the smallest stride for the
-  // best cache behavior.
-  auto sub_func = [&](int indexes[N]) {
-    output_data[SubscriptToIndex(output_desc, indexes)] =
-        ActivationFunctionWithMinMax(
-            input1_data[SubscriptToIndex(desc1, indexes)] -
-                input2_data[SubscriptToIndex(desc2, indexes)],
-            params.int64_activation_min, params.int64_activation_max);
-  };
-  NDOpsHelper<N>(output_desc, sub_func);
-}
-
-template <typename T, int N = 5>
+template <typename T>
 void BroadcastSubSlow(const ArithmeticParams& params,
                       const RuntimeShape& input1_shape, const T* input1_data,
                       const RuntimeShape& input2_shape, const T* input2_data,
                       const RuntimeShape& output_shape, T* output_data) {
-  ruy::profiler::ScopeLabel label("BroadcastSubSlow/templated");
-  TFLITE_DCHECK_LE(input1_shape.DimensionsCount(), N);
-  TFLITE_DCHECK_LE(input2_shape.DimensionsCount(), N);
-  TFLITE_DCHECK_LE(output_shape.DimensionsCount(), N);
-  NdArrayDesc<N> desc1;
-  NdArrayDesc<N> desc2;
-  NdArrayDesc<N> output_desc;
+  ruy::profiler::ScopeLabel label("BroadcastSubSlow/T");
+  TFLITE_DCHECK_LE(input1_shape.DimensionsCount(), kMaxSubBroadcastDim);
+  TFLITE_DCHECK_LE(input2_shape.DimensionsCount(), kMaxSubBroadcastDim);
+  TFLITE_DCHECK_LE(output_shape.DimensionsCount(), kMaxSubBroadcastDim);
+  NdArrayDesc<kMaxSubBroadcastDim> desc1;
+  NdArrayDesc<kMaxSubBroadcastDim> desc2;
   NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
                                       &desc2);
-  CopyDimsToDesc(RuntimeShape::ExtendedShape(N, output_shape), &output_desc);
+  const RuntimeShape extended_output_shape =
+      RuntimeShape::ExtendedShape(kMaxSubBroadcastDim, output_shape);
+  // Cache output shape dimensions.
+  int32_t extended_output_shape_dims[kMaxSubBroadcastDim];
+  std::memcpy(extended_output_shape_dims, extended_output_shape.DimsData(),
+              sizeof(extended_output_shape_dims));
 
   // In Tensorflow, the dimensions are canonically named (batch_number, row,
   // col, channel), with extents (batches, height, width, depth), with the
@@ -211,17 +131,21 @@ void BroadcastSubSlow(const ArithmeticParams& params,
   // We name our variables by their Tensorflow convention, but generate C code
   // nesting loops such that the innermost loop has the smallest stride for the
   // best cache behavior.
-  auto sub_func = [&](int indexes[N]) {
-    output_data[SubscriptToIndex(output_desc, indexes)] =
-        ActivationFunctionWithMinMax(
-            input1_data[SubscriptToIndex(desc1, indexes)] -
-                input2_data[SubscriptToIndex(desc2, indexes)],
-            params.quantized_activation_min, params.quantized_activation_max);
-  };
-  NDOpsHelper<N>(output_desc, sub_func);
+  size_t input1_offset = 0;
+  size_t input2_offset = 0;
+  size_t output_offset = 0;
+  BroadcastSubRecursiveDimensions(
+      params, 0, input1_data, input2_data, output_data, &input1_offset,
+      &input2_offset, &output_offset, desc1, desc2, extended_output_shape_dims,
+      [](const ArithmeticParams& params, const T input1_val,
+         const T input2_val) {
+        T activation_min, activation_max;
+        GetActivationParams(params, &activation_min, &activation_max);
+        return ActivationFunctionWithMinMax<T>(input1_val - input2_val,
+                                               activation_min, activation_max);
+      });
 }
 
-template <int N = 5>
 inline void BroadcastSub16POTSlow(const ArithmeticParams& params,
                                   const RuntimeShape& input1_shape,
                                   const int16_t* input1_data,
@@ -230,12 +154,19 @@ inline void BroadcastSub16POTSlow(const ArithmeticParams& params,
                                   const RuntimeShape& output_shape,
                                   int16_t* output_data) {
   ruy::profiler::ScopeLabel label("BroadcastSub16POTSlow/int16_t");
-  NdArrayDesc<N> desc1;
-  NdArrayDesc<N> desc2;
-  NdArrayDesc<N> output_desc;
+  TFLITE_DCHECK_LE(input1_shape.DimensionsCount(), kMaxSubBroadcastDim);
+  TFLITE_DCHECK_LE(input2_shape.DimensionsCount(), kMaxSubBroadcastDim);
+  TFLITE_DCHECK_LE(output_shape.DimensionsCount(), kMaxSubBroadcastDim);
+  NdArrayDesc<kMaxSubBroadcastDim> desc1;
+  NdArrayDesc<kMaxSubBroadcastDim> desc2;
   NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
                                       &desc2);
-  CopyDimsToDesc(RuntimeShape::ExtendedShape(N, output_shape), &output_desc);
+  const RuntimeShape extended_output_shape =
+      RuntimeShape::ExtendedShape(kMaxSubBroadcastDim, output_shape);
+  // Cache output shape dimensions.
+  int32_t extended_output_shape_dims[kMaxSubBroadcastDim];
+  std::memcpy(extended_output_shape_dims, extended_output_shape.DimsData(),
+              sizeof(extended_output_shape_dims));
 
   // In Tensorflow, the dimensions are canonically named (batch_number, row,
   // col, channel), with extents (batches, height, width, depth), with the
@@ -248,24 +179,27 @@ inline void BroadcastSub16POTSlow(const ArithmeticParams& params,
   // We name our variables by their Tensorflow convention, but generate C code
   // nesting loops such that the innermost loop has the smallest stride for the
   // best cache behavior.
-  auto sub_func = [&](int indexes[N]) {
-    const int32_t input1_val = input1_data[SubscriptToIndex(desc1, indexes)];
-    const int32_t input2_val = input2_data[SubscriptToIndex(desc2, indexes)];
-    const int32_t scaled_input1_val =
-        gemmlowp::RoundingDivideByPOT(input1_val, -params.input1_shift);
-    const int32_t scaled_input2_val =
-        gemmlowp::RoundingDivideByPOT(input2_val, -params.input2_shift);
-    const int32_t raw_output = scaled_input1_val - scaled_input2_val;
-    const int32_t clamped_output =
-        std::min(params.quantized_activation_max,
-                 std::max(params.quantized_activation_min, raw_output));
-    output_data[SubscriptToIndex(output_desc, indexes)] =
-        static_cast<int16_t>(clamped_output);
-  };
-  NDOpsHelper<N>(output_desc, sub_func);
+  size_t input1_offset = 0;
+  size_t input2_offset = 0;
+  size_t output_offset = 0;
+  BroadcastSubRecursiveDimensions(
+      params, 0, input1_data, input2_data, output_data, &input1_offset,
+      &input2_offset, &output_offset, desc1, desc2, extended_output_shape_dims,
+      [](const ArithmeticParams& params, const int16_t input1_val,
+         const int16_t input2_val) {
+        const int32_t scaled_input1_val =
+            gemmlowp::RoundingDivideByPOT(input1_val, -params.input1_shift);
+        const int32_t scaled_input2_val =
+            gemmlowp::RoundingDivideByPOT(input2_val, -params.input2_shift);
+        const int32_t raw_output = scaled_input1_val - scaled_input2_val;
+        const int32_t clamped_output =
+            std::min(params.quantized_activation_max,
+                     std::max(params.quantized_activation_min, raw_output));
+        return static_cast<int16_t>(clamped_output);
+      });
 }
 
-template <typename T, int N = 5>
+template <typename T>
 void BroadcastQuantSubSlow(const ArithmeticParams& params,
                            const RuntimeShape& input1_shape,
                            const T* input1_data,
@@ -273,15 +207,19 @@ void BroadcastQuantSubSlow(const ArithmeticParams& params,
                            const T* input2_data,
                            const RuntimeShape& output_shape, T* output_data) {
   ruy::profiler::ScopeLabel label("BroadcastQuantSubSlow/T");
-  TFLITE_DCHECK_LE(input1_shape.DimensionsCount(), N);
-  TFLITE_DCHECK_LE(input2_shape.DimensionsCount(), N);
-  TFLITE_DCHECK_LE(output_shape.DimensionsCount(), N);
-  NdArrayDesc<N> desc1;
-  NdArrayDesc<N> desc2;
-  NdArrayDesc<N> output_desc;
+  TFLITE_DCHECK_LE(input1_shape.DimensionsCount(), kMaxSubBroadcastDim);
+  TFLITE_DCHECK_LE(input2_shape.DimensionsCount(), kMaxSubBroadcastDim);
+  TFLITE_DCHECK_LE(output_shape.DimensionsCount(), kMaxSubBroadcastDim);
+  NdArrayDesc<kMaxSubBroadcastDim> desc1;
+  NdArrayDesc<kMaxSubBroadcastDim> desc2;
   NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
                                       &desc2);
-  CopyDimsToDesc(RuntimeShape::ExtendedShape(N, output_shape), &output_desc);
+  const RuntimeShape extended_output_shape =
+      RuntimeShape::ExtendedShape(kMaxSubBroadcastDim, output_shape);
+  // Cache output shape dimensions.
+  int32_t extended_output_shape_dims[kMaxSubBroadcastDim];
+  std::memcpy(extended_output_shape_dims, extended_output_shape.DimsData(),
+              sizeof(extended_output_shape_dims));
 
   // In Tensorflow, the dimensions are canonically named (batch_number, row,
   // col, channel), with extents (batches, height, width, depth), with the
@@ -294,31 +232,36 @@ void BroadcastQuantSubSlow(const ArithmeticParams& params,
   // We name our variables by their Tensorflow convention, but generate C code
   // nesting loops such that the innermost loop has the smallest stride for the
   // best cache behavior.
-  auto sub_func = [&](int indexes[N]) {
-    const int32_t input1_val =
-        params.input1_offset + input1_data[SubscriptToIndex(desc1, indexes)];
-    const int32_t input2_val =
-        params.input2_offset + input2_data[SubscriptToIndex(desc2, indexes)];
-    const int32_t shifted_input1_val = input1_val * (1 << params.left_shift);
-    const int32_t shifted_input2_val = input2_val * (1 << params.left_shift);
-    const int32_t scaled_input1_val =
-        MultiplyByQuantizedMultiplierSmallerThanOneExp(
-            shifted_input1_val, params.input1_multiplier, params.input1_shift);
-    const int32_t scaled_input2_val =
-        MultiplyByQuantizedMultiplierSmallerThanOneExp(
-            shifted_input2_val, params.input2_multiplier, params.input2_shift);
-    const int32_t raw_sub = scaled_input1_val - scaled_input2_val;
-    const int32_t raw_output =
-        MultiplyByQuantizedMultiplierSmallerThanOneExp(
-            raw_sub, params.output_multiplier, params.output_shift) +
-        params.output_offset;
-    const int32_t clamped_output =
-        std::min(params.quantized_activation_max,
-                 std::max(params.quantized_activation_min, raw_output));
-    output_data[SubscriptToIndex(output_desc, indexes)] =
-        static_cast<T>(clamped_output);
-  };
-  NDOpsHelper<N>(output_desc, sub_func);
+  size_t input1_offset = 0;
+  size_t input2_offset = 0;
+  size_t output_offset = 0;
+  BroadcastSubRecursiveDimensions(
+      params, 0, input1_data, input2_data, output_data, &input1_offset,
+      &input2_offset, &output_offset, desc1, desc2, extended_output_shape_dims,
+      [](const ArithmeticParams& params, const T input1_val,
+         const T input2_val) {
+        const int32_t shifted_input1_val =
+            (params.input1_offset + input1_val) * (1 << params.left_shift);
+        const int32_t shifted_input2_val =
+            (params.input2_offset + input2_val) * (1 << params.left_shift);
+        const int32_t scaled_input1_val =
+            MultiplyByQuantizedMultiplierSmallerThanOneExp(
+                shifted_input1_val, params.input1_multiplier,
+                params.input1_shift);
+        const int32_t scaled_input2_val =
+            MultiplyByQuantizedMultiplierSmallerThanOneExp(
+                shifted_input2_val, params.input2_multiplier,
+                params.input2_shift);
+        const int32_t raw_sub = scaled_input1_val - scaled_input2_val;
+        const int32_t raw_output =
+            MultiplyByQuantizedMultiplierSmallerThanOneExp(
+                raw_sub, params.output_multiplier, params.output_shift) +
+            params.output_offset;
+        const int32_t clamped_output =
+            std::min(params.quantized_activation_max,
+                     std::max(params.quantized_activation_min, raw_output));
+        return static_cast<T>(clamped_output);
+      });
 }
 
 // Element-wise add that can often be used for inner loop of broadcast add as
@@ -405,12 +348,16 @@ void Sub(const ArithmeticParams& params, const RuntimeShape& input1_shape,
          const T* input1_data, const RuntimeShape& input2_shape,
          const T* input2_data, const RuntimeShape& output_shape,
          T* output_data) {
-  NdArrayDesc<4> desc1;
-  NdArrayDesc<4> desc2;
+  NdArrayDesc<kMaxSubBroadcastDim> desc1;
+  NdArrayDesc<kMaxSubBroadcastDim> desc2;
   NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
                                       &desc2);
   const RuntimeShape extended_output_shape =
-      RuntimeShape::ExtendedShape(4, output_shape);
+      RuntimeShape::ExtendedShape(kMaxSubBroadcastDim, output_shape);
+  // Cache output shape dimensions.
+  int32_t extended_output_shape_dims[kMaxSubBroadcastDim];
+  std::memcpy(extended_output_shape_dims, extended_output_shape.DimsData(),
+              sizeof(extended_output_shape_dims));
 
   // In Tensorflow, the dimensions are canonically named (batch_number, row,
   // col, channel), with extents (batches, height, width, depth), with the
@@ -423,17 +370,14 @@ void Sub(const ArithmeticParams& params, const RuntimeShape& input1_shape,
   // We name our variables by their Tensorflow convention, but generate C code
   // nesting loops such that the innermost loop has the smallest stride for the
   // best cache behavior.
-  for (int b = 0; b < extended_output_shape.Dims(0); ++b) {
-    for (int y = 0; y < extended_output_shape.Dims(1); ++y) {
-      for (int x = 0; x < extended_output_shape.Dims(2); ++x) {
-        for (int c = 0; c < extended_output_shape.Dims(3); ++c) {
-          output_data[Offset(extended_output_shape, b, y, x, c)] =
-              input1_data[SubscriptToIndex(desc1, b, y, x, c)] -
-              input2_data[SubscriptToIndex(desc2, b, y, x, c)];
-        }
-      }
-    }
-  }
+  size_t input1_offset = 0;
+  size_t input2_offset = 0;
+  size_t output_offset = 0;
+  BroadcastSubRecursiveDimensions(
+      params, 0, input1_data, input2_data, output_data, &input1_offset,
+      &input2_offset, &output_offset, desc1, desc2, extended_output_shape_dims,
+      [](const ArithmeticParams& params, const T input1_val,
+         const T input2_val) { return input1_val - input2_val; });
 }
 
 inline void SetActivationMinMax(const ArithmeticParams& params,
diff --git a/tensorflow/lite/kernels/sub_test.cc b/tensorflow/lite/kernels/sub_test.cc
index 5821fd302aa4e9..88ede0f922aa06 100644
--- a/tensorflow/lite/kernels/sub_test.cc
+++ b/tensorflow/lite/kernels/sub_test.cc
@@ -14,10 +14,17 @@ limitations under the License.
 ==============================================================================*/
 #include <stdint.h>
 
+#include <algorithm>
+#include <array>
+#include <cstddef>
+#include <functional>
 #include <limits>
+#include <numeric>
+#include <random>
 #include <utility>
 #include <vector>
 
+#include <gmock/gmock.h>
 #include <gtest/gtest.h>
 #include "flatbuffers/flatbuffers.h"  // from @flatbuffers
 #include "tensorflow/lite/kernels/test_util.h"
@@ -38,6 +45,7 @@ class BaseSubOpModel : public SingleOpModel {
     output_ = AddOutput(output);
     SetBuiltinOp(BuiltinOperator_SUB, BuiltinOptions_SubOptions,
                  CreateSubOptions(builder_, activation_type).Union());
+    SetBypassDefaultDelegates();
     BuildInterpreter({GetShape(input1_), GetShape(input2_)});
   }
 
@@ -63,14 +71,10 @@ class IntegerSubOpModel : public BaseSubOpModel {
  public:
   using BaseSubOpModel::BaseSubOpModel;
 
-  std::vector<int32_t> GetOutput() { return ExtractVector<int32_t>(output_); }
-};
-
-class Int64SubOpModel : public BaseSubOpModel {
- public:
-  using BaseSubOpModel::BaseSubOpModel;
-
-  std::vector<int64_t> GetOutput() { return ExtractVector<int64_t>(output_); }
+  template <typename T>
+  std::vector<T> GetOutput() {
+    return ExtractVector<T>(output_);
+  }
 };
 
 class QuantizedSubOpModel : public BaseSubOpModel {
@@ -250,7 +254,7 @@ TEST(IntegerSubOpModel, NoActivation) {
   m.PopulateTensor<int32_t>(m.input1(), {-20, 2, 7, 8});
   m.PopulateTensor<int32_t>(m.input2(), {1, 2, 3, 5});
   ASSERT_EQ(m.Invoke(), kTfLiteOk);
-  EXPECT_THAT(m.GetOutput(), ElementsAreArray({-21, 0, 4, 3}));
+  EXPECT_THAT(m.GetOutput<int32_t>(), ElementsAreArray({-21, 0, 4, 3}));
 }
 
 TEST(IntegerSubOpModel, ActivationRELU_N1_TO_1) {
@@ -260,7 +264,7 @@ TEST(IntegerSubOpModel, ActivationRELU_N1_TO_1) {
   m.PopulateTensor<int32_t>(m.input1(), {-20, 2, 7, 8});
   m.PopulateTensor<int32_t>(m.input2(), {1, 2, 3, 5});
   ASSERT_EQ(m.Invoke(), kTfLiteOk);
-  EXPECT_THAT(m.GetOutput(), ElementsAreArray({-1, 0, 1, 1}));
+  EXPECT_THAT(m.GetOutput<int32_t>(), ElementsAreArray({-1, 0, 1, 1}));
 }
 
 TEST(IntegerSubOpModel, VariousInputShapes) {
@@ -273,7 +277,7 @@ TEST(IntegerSubOpModel, VariousInputShapes) {
     m.PopulateTensor<int32_t>(m.input1(), {-20, 2, 7, 8, 11, 20});
     m.PopulateTensor<int32_t>(m.input2(), {1, 2, 3, 5, 11, 1});
     ASSERT_EQ(m.Invoke(), kTfLiteOk);
-    EXPECT_THAT(m.GetOutput(), ElementsAreArray({-21, 0, 4, 3, 0, 19}))
+    EXPECT_THAT(m.GetOutput<int32_t>(), ElementsAreArray({-21, 0, 4, 3, 0, 19}))
         << "With shape number " << i;
   }
 }
@@ -288,43 +292,43 @@ TEST(IntegerSubOpModel, WithBroadcast) {
     m.PopulateTensor<int32_t>(m.input1(), {-20, 2, 7, 8, 11, 20});
     m.PopulateTensor<int32_t>(m.input2(), {1});
     ASSERT_EQ(m.Invoke(), kTfLiteOk);
-    EXPECT_THAT(m.GetOutput(),
+    EXPECT_THAT(m.GetOutput<int32_t>(),
                 ElementsAreArray(ArrayFloatNear({-21, 1, 6, 7, 10, 19})))
         << "With shape number " << i;
   }
 }
 
 TEST(Int64SubOpModel, NoActivation) {
-  Int64SubOpModel m({TensorType_INT64, {1, 2, 2, 1}},
-                    {TensorType_INT64, {1, 2, 2, 1}}, {TensorType_INT64, {}},
-                    ActivationFunctionType_NONE);
+  IntegerSubOpModel m({TensorType_INT64, {1, 2, 2, 1}},
+                      {TensorType_INT64, {1, 2, 2, 1}}, {TensorType_INT64, {}},
+                      ActivationFunctionType_NONE);
   m.PopulateTensor<int64_t>(m.input1(), {-20, 2, 7, 8});
   m.PopulateTensor<int64_t>(m.input2(), {1, 2, 3, 5});
   ASSERT_EQ(m.Invoke(), kTfLiteOk);
-  EXPECT_THAT(m.GetOutput(), ElementsAreArray({-21, 0, 4, 3}));
+  EXPECT_THAT(m.GetOutput<int64_t>(), ElementsAreArray({-21, 0, 4, 3}));
 }
 
 TEST(Int64SubOpModel, ActivationRELU_N1_TO_1) {
-  Int64SubOpModel m({TensorType_INT64, {1, 2, 2, 1}},
-                    {TensorType_INT64, {1, 2, 2, 1}}, {TensorType_INT64, {}},
-                    ActivationFunctionType_RELU_N1_TO_1);
+  IntegerSubOpModel m({TensorType_INT64, {1, 2, 2, 1}},
+                      {TensorType_INT64, {1, 2, 2, 1}}, {TensorType_INT64, {}},
+                      ActivationFunctionType_RELU_N1_TO_1);
   m.PopulateTensor<int64_t>(m.input1(), {-20, 2, 7, 8});
   m.PopulateTensor<int64_t>(m.input2(), {1, 2, 3, 5});
   ASSERT_EQ(m.Invoke(), kTfLiteOk);
-  EXPECT_THAT(m.GetOutput(), ElementsAreArray({-1, 0, 1, 1}));
+  EXPECT_THAT(m.GetOutput<int64_t>(), ElementsAreArray({-1, 0, 1, 1}));
 }
 
 TEST(Int64SubOpModel, VariousInputShapes) {
   std::vector<std::vector<int>> test_shapes = {
       {6}, {2, 3}, {2, 1, 3}, {1, 3, 1, 2}};
   for (int i = 0; i < test_shapes.size(); ++i) {
-    Int64SubOpModel m({TensorType_INT64, test_shapes[i]},
-                      {TensorType_INT64, test_shapes[i]},
-                      {TensorType_INT64, {}}, ActivationFunctionType_NONE);
+    IntegerSubOpModel m({TensorType_INT64, test_shapes[i]},
+                        {TensorType_INT64, test_shapes[i]},
+                        {TensorType_INT64, {}}, ActivationFunctionType_NONE);
     m.PopulateTensor<int64_t>(m.input1(), {-20, 2, 7, 8, 11, 20});
     m.PopulateTensor<int64_t>(m.input2(), {1, 2, 3, 5, 11, 1});
     ASSERT_EQ(m.Invoke(), kTfLiteOk);
-    EXPECT_THAT(m.GetOutput(), ElementsAreArray({-21, 0, 4, 3, 0, 19}))
+    EXPECT_THAT(m.GetOutput<int64_t>(), ElementsAreArray({-21, 0, 4, 3, 0, 19}))
         << "With shape number " << i;
   }
 }
@@ -333,13 +337,13 @@ TEST(Int64SubOpModel, WithBroadcast) {
   std::vector<std::vector<int>> test_shapes = {
       {6}, {2, 3}, {2, 1, 3}, {1, 3, 1, 2}, {1, 3, 1, 2, 1}};
   for (int i = 0; i < test_shapes.size(); ++i) {
-    Int64SubOpModel m({TensorType_INT64, test_shapes[i]},
-                      {TensorType_INT64, {}},  // always a scalar
-                      {TensorType_INT64, {}}, ActivationFunctionType_NONE);
+    IntegerSubOpModel m({TensorType_INT64, test_shapes[i]},
+                        {TensorType_INT64, {}},  // always a scalar
+                        {TensorType_INT64, {}}, ActivationFunctionType_NONE);
     m.PopulateTensor<int64_t>(m.input1(), {-20, 2, 7, 8, 11, 20});
     m.PopulateTensor<int64_t>(m.input2(), {1});
     ASSERT_EQ(m.Invoke(), kTfLiteOk);
-    EXPECT_THAT(m.GetOutput(),
+    EXPECT_THAT(m.GetOutput<int64_t>(),
                 ElementsAreArray(ArrayFloatNear({-21, 1, 6, 7, 10, 19})))
         << "With shape number " << i;
   }
@@ -592,5 +596,617 @@ TEST(QuantizedSubOpModel, QuantizedTestsReluActivationBroadcastInt16) {
   }
 }
 
+constexpr int kDim1 = 2;
+constexpr int kDim2 = 3;
+constexpr int kDim3 = 4;
+constexpr int kDim4 = 5;
+constexpr int kDim5 = 6;
+constexpr int kDim6 = 7;
+
+constexpr int kMaxBroadcastDim = 6;
+
+void TestFloatBroadcast(std::vector<int> input1_shape,
+                        std::vector<int> input2_shape) {
+  std::array<int, kMaxBroadcastDim> input1_dims;
+  std::array<int, kMaxBroadcastDim> input2_dims;
+  std::array<int, kMaxBroadcastDim> output_dims;
+  std::array<int, kMaxBroadcastDim> input1_strides;
+  std::array<int, kMaxBroadcastDim> input2_strides;
+  std::array<int, kMaxBroadcastDim> output_strides;
+  std::fill(input1_dims.begin(), input1_dims.end(), 1);
+  std::fill(input2_dims.begin(), input2_dims.end(), 1);
+  std::fill(output_dims.begin(), output_dims.end(), 1);
+  std::copy(input1_shape.cbegin(), input1_shape.cend(),
+            input1_dims.end() - input1_shape.size());
+  std::copy(input2_shape.cbegin(), input2_shape.cend(),
+            input2_dims.end() - input2_shape.size());
+
+  for (size_t i = 0; i < kMaxBroadcastDim; i++) {
+    if (input1_dims[i] != 1 && input2_dims[i] != 1) {
+      ASSERT_EQ(input1_dims[i], input2_dims[i]);
+    }
+    output_dims[i] = std::max(input1_dims[i], input2_dims[i]);
+  }
+  // Compute generalized strides.
+  size_t input1_stride = 1, input2_stride = 1, output_stride = 1;
+  for (size_t i = kMaxBroadcastDim; i != 0; i--) {
+    input1_strides[i - 1] = input1_dims[i - 1] == 1 ? 0 : input1_stride;
+    input2_strides[i - 1] = input2_dims[i - 1] == 1 ? 0 : input2_stride;
+    output_strides[i - 1] = output_stride;
+    input1_stride *= input1_dims[i - 1];
+    input2_stride *= input2_dims[i - 1];
+    output_stride *= output_dims[i - 1];
+  }
+  const int num_input1_elements = std::accumulate(
+      input1_dims.begin(), input1_dims.end(), 1, std::multiplies<int>());
+  const int num_input2_elements = std::accumulate(
+      input2_dims.begin(), input2_dims.end(), 1, std::multiplies<int>());
+  const int num_output_elements = std::accumulate(
+      output_dims.begin(), output_dims.end(), 1, std::multiplies<int>());
+  std::vector<float> input1(num_input1_elements);
+  std::vector<float> input2(num_input2_elements);
+  std::vector<float> output_ref(num_output_elements);
+
+  std::random_device random_device;
+  auto rng = std::mt19937(random_device());
+  std::uniform_real_distribution<float> f32dist(0.01f, 1.0f);
+
+  std::generate(input1.begin(), input1.end(), [&]() { return f32dist(rng); });
+  std::generate(input2.begin(), input2.end(), [&]() { return f32dist(rng); });
+
+  // Compute reference results.
+  for (size_t i = 0; i < output_dims[0]; i++) {
+    for (size_t j = 0; j < output_dims[1]; j++) {
+      for (size_t k = 0; k < output_dims[2]; k++) {
+        for (size_t l = 0; l < output_dims[3]; l++) {
+          for (size_t m = 0; m < output_dims[4]; m++) {
+            for (size_t n = 0; n < output_dims[5]; n++) {
+              output_ref[i * output_strides[0] + j * output_strides[1] +
+                         k * output_strides[2] + l * output_strides[3] +
+                         m * output_strides[4] + n * output_strides[5]] =
+                  input1[i * input1_strides[0] + j * input1_strides[1] +
+                         k * input1_strides[2] + l * input1_strides[3] +
+                         m * input1_strides[4] + n * input1_strides[5]] -
+                  input2[i * input2_strides[0] + j * input2_strides[1] +
+                         k * input2_strides[2] + l * input2_strides[3] +
+                         m * input2_strides[4] + n * input2_strides[5]];
+            }
+          }
+        }
+      }
+    }
+  }
+
+  FloatSubOpModel m({TensorType_FLOAT32, input1_shape},
+                    {TensorType_FLOAT32, input2_shape},
+                    {TensorType_FLOAT32, {}}, ActivationFunctionType_NONE);
+  m.PopulateTensor<float>(m.input1(), input1);
+  m.PopulateTensor<float>(m.input2(), input2);
+  ASSERT_EQ(m.Invoke(), kTfLiteOk);
+  EXPECT_THAT(m.GetOutput(), testing::ContainerEq(output_ref));
+}
+
+template <typename IntegerType>
+void TestIntegerBroadcast(std::vector<int> input1_shape,
+                          std::vector<int> input2_shape) {
+  std::array<int, kMaxBroadcastDim> input1_dims;
+  std::array<int, kMaxBroadcastDim> input2_dims;
+  std::array<int, kMaxBroadcastDim> output_dims;
+  std::array<int, kMaxBroadcastDim> input1_strides;
+  std::array<int, kMaxBroadcastDim> input2_strides;
+  std::array<int, kMaxBroadcastDim> output_strides;
+  std::fill(input1_dims.begin(), input1_dims.end(), 1);
+  std::fill(input2_dims.begin(), input2_dims.end(), 1);
+  std::fill(output_dims.begin(), output_dims.end(), 1);
+  std::copy(input1_shape.cbegin(), input1_shape.cend(),
+            input1_dims.end() - input1_shape.size());
+  std::copy(input2_shape.cbegin(), input2_shape.cend(),
+            input2_dims.end() - input2_shape.size());
+
+  for (size_t i = 0; i < kMaxBroadcastDim; i++) {
+    if (input1_dims[i] != 1 && input2_dims[i] != 1) {
+      ASSERT_EQ(input1_dims[i], input2_dims[i]);
+    }
+    output_dims[i] = std::max(input1_dims[i], input2_dims[i]);
+  }
+  // Compute generalized strides.
+  size_t input1_stride = 1, input2_stride = 1, output_stride = 1;
+  for (size_t i = kMaxBroadcastDim; i != 0; i--) {
+    input1_strides[i - 1] = input1_dims[i - 1] == 1 ? 0 : input1_stride;
+    input2_strides[i - 1] = input2_dims[i - 1] == 1 ? 0 : input2_stride;
+    output_strides[i - 1] = output_stride;
+    input1_stride *= input1_dims[i - 1];
+    input2_stride *= input2_dims[i - 1];
+    output_stride *= output_dims[i - 1];
+  }
+  const int num_input1_elements = std::accumulate(
+      input1_dims.begin(), input1_dims.end(), 1, std::multiplies<int>());
+  const int num_input2_elements = std::accumulate(
+      input2_dims.begin(), input2_dims.end(), 1, std::multiplies<int>());
+  const int num_output_elements = std::accumulate(
+      output_dims.begin(), output_dims.end(), 1, std::multiplies<int>());
+  std::vector<IntegerType> input1(num_input1_elements);
+  std::vector<IntegerType> input2(num_input2_elements);
+  std::vector<IntegerType> output_ref(num_output_elements);
+
+  std::random_device random_device;
+  auto rng = std::mt19937(random_device());
+  std::uniform_int_distribution<IntegerType> dist(0, 256);
+
+  std::generate(input1.begin(), input1.end(), [&]() { return dist(rng); });
+  std::generate(input2.begin(), input2.end(), [&]() { return dist(rng); });
+
+  // Compute reference results.
+  for (size_t i = 0; i < output_dims[0]; i++) {
+    for (size_t j = 0; j < output_dims[1]; j++) {
+      for (size_t k = 0; k < output_dims[2]; k++) {
+        for (size_t l = 0; l < output_dims[3]; l++) {
+          for (size_t m = 0; m < output_dims[4]; m++) {
+            for (size_t n = 0; n < output_dims[5]; n++) {
+              output_ref[i * output_strides[0] + j * output_strides[1] +
+                         k * output_strides[2] + l * output_strides[3] +
+                         m * output_strides[4] + n * output_strides[5]] =
+                  input1[i * input1_strides[0] + j * input1_strides[1] +
+                         k * input1_strides[2] + l * input1_strides[3] +
+                         m * input1_strides[4] + n * input1_strides[5]] -
+                  input2[i * input2_strides[0] + j * input2_strides[1] +
+                         k * input2_strides[2] + l * input2_strides[3] +
+                         m * input2_strides[4] + n * input2_strides[5]];
+            }
+          }
+        }
+      }
+    }
+  }
+
+  IntegerSubOpModel m({GetTensorType<IntegerType>(), input1_shape},
+                      {GetTensorType<IntegerType>(), input2_shape},
+                      {GetTensorType<IntegerType>(), {}},
+                      ActivationFunctionType_NONE);
+  m.PopulateTensor<IntegerType>(m.input1(), input1);
+  m.PopulateTensor<IntegerType>(m.input2(), input2);
+  ASSERT_EQ(m.Invoke(), kTfLiteOk);
+  EXPECT_THAT(m.GetOutput<IntegerType>(), testing::ContainerEq(output_ref));
+}
+
+// To improve automatic test sharding (via shard_count in the BUILD file),
+// we need to ensure that each individual test case runs in a reasonable time,
+// otherwise we end up being limited by the performance of the longest shard.
+// Since TestFloat32MultiDimBroadcast has 2^12 iterations, it takes a
+// long time (over 30 seconds) to execute all iterations -- too long for a
+// single shard.  So we split it into a few "subshards" and have a separate
+// TEST macro invocation for each subshard.
+
+void TestFloat32MultiDimBroadcast(int selected_subshard, int subshard_count) {
+  int iteration = 0;
+  for (uint32_t bm1 = 0; bm1 < (static_cast<uint32_t>(1) << kMaxBroadcastDim);
+       bm1++) {
+    for (uint32_t bm2 = 0; bm2 < (static_cast<uint32_t>(1) << kMaxBroadcastDim);
+         bm2++) {
+      if (iteration++ % subshard_count != selected_subshard) {
+        continue;  // This iteration of the loop is not part of this subshard.
+      }
+      const bool input1_broadcast_dim1 = bm1 & (static_cast<uint32_t>(1) << 0);
+      const bool input1_broadcast_dim2 = bm1 & (static_cast<uint32_t>(1) << 1);
+      const bool input1_broadcast_dim3 = bm1 & (static_cast<uint32_t>(1) << 2);
+      const bool input1_broadcast_dim4 = bm1 & (static_cast<uint32_t>(1) << 3);
+      const bool input1_broadcast_dim5 = bm1 & (static_cast<uint32_t>(1) << 4);
+      const bool input1_broadcast_dim6 = bm1 & (static_cast<uint32_t>(1) << 5);
+      const bool input2_broadcast_dim1 = bm2 & (static_cast<uint32_t>(1) << 0);
+      const bool input2_broadcast_dim2 = bm2 & (static_cast<uint32_t>(1) << 1);
+      const bool input2_broadcast_dim3 = bm2 & (static_cast<uint32_t>(1) << 2);
+      const bool input2_broadcast_dim4 = bm2 & (static_cast<uint32_t>(1) << 3);
+      const bool input2_broadcast_dim5 = bm2 & (static_cast<uint32_t>(1) << 4);
+      const bool input2_broadcast_dim6 = bm2 & (static_cast<uint32_t>(1) << 5);
+      const int input1_dim1 = input1_broadcast_dim1 ? 1 : kDim1;
+      const int input1_dim2 = input1_broadcast_dim2 ? 1 : kDim2;
+      const int input1_dim3 = input1_broadcast_dim3 ? 1 : kDim3;
+      const int input1_dim4 = input1_broadcast_dim4 ? 1 : kDim4;
+      const int input1_dim5 = input1_broadcast_dim5 ? 1 : kDim5;
+      const int input1_dim6 = input1_broadcast_dim6 ? 1 : kDim6;
+      const int input2_dim1 = input2_broadcast_dim1 ? 1 : kDim1;
+      const int input2_dim2 = input2_broadcast_dim2 ? 1 : kDim2;
+      const int input2_dim3 = input2_broadcast_dim3 ? 1 : kDim3;
+      const int input2_dim4 = input2_broadcast_dim4 ? 1 : kDim4;
+      const int input2_dim5 = input2_broadcast_dim5 ? 1 : kDim5;
+      const int input2_dim6 = input2_broadcast_dim6 ? 1 : kDim6;
+      std::vector<int> input1_full_shape{input1_dim1, input1_dim2, input1_dim3,
+                                         input1_dim4, input1_dim5, input1_dim6};
+      std::vector<int> input2_full_shape{input2_dim1, input2_dim2, input2_dim3,
+                                         input2_dim4, input2_dim5, input2_dim6};
+      for (int input1_dims = 1; input1_dims <= kMaxBroadcastDim;
+           ++input1_dims) {
+        for (int input2_dims = 1; input2_dims <= kMaxBroadcastDim;
+             ++input2_dims) {
+          std::vector<int> input1_shape(input1_dims), input2_shape(input2_dims);
+          std::copy(input1_full_shape.end() - input1_dims,
+                    input1_full_shape.end(), input1_shape.data());
+          std::copy(input2_full_shape.end() - input2_dims,
+                    input2_full_shape.end(), input2_shape.data());
+          TestFloatBroadcast(input1_shape, input2_shape);
+        }
+      }
+    }
+  }
+}
+
+// Should match the number of TEST or TYPED_TEST invocations for each of
+// Float32MultiDimBroadcastSubshard*,
+// IntegerMultiDimBroadcastSubshard*,
+// Int8QuantizedMultiDimBroadcastSubshard*, and
+// Uint8QuantizedMultiDimBroadcastSubshard* below.
+constexpr int kMultiDimBroadcastSubshardCount = 10;
+
+TEST(FloatSubOpModel, Float32MultiDimBroadcastSubshard0) {
+  TestFloat32MultiDimBroadcast(0, kMultiDimBroadcastSubshardCount);
+}
+TEST(FloatSubOpModel, Float32MultiDimBroadcastSubshard1) {
+  TestFloat32MultiDimBroadcast(1, kMultiDimBroadcastSubshardCount);
+}
+TEST(FloatSubOpModel, Float32MultiDimBroadcastSubshard2) {
+  TestFloat32MultiDimBroadcast(2, kMultiDimBroadcastSubshardCount);
+}
+TEST(FloatSubOpModel, Float32MultiDimBroadcastSubshard3) {
+  TestFloat32MultiDimBroadcast(3, kMultiDimBroadcastSubshardCount);
+}
+TEST(FloatSubOpModel, Float32MultiDimBroadcastSubshard4) {
+  TestFloat32MultiDimBroadcast(4, kMultiDimBroadcastSubshardCount);
+}
+TEST(FloatSubOpModel, Float32MultiDimBroadcastSubshard5) {
+  TestFloat32MultiDimBroadcast(5, kMultiDimBroadcastSubshardCount);
+}
+TEST(FloatSubOpModel, Float32MultiDimBroadcastSubshard6) {
+  TestFloat32MultiDimBroadcast(6, kMultiDimBroadcastSubshardCount);
+}
+TEST(FloatSubOpModel, Float32MultiDimBroadcastSubshard7) {
+  TestFloat32MultiDimBroadcast(7, kMultiDimBroadcastSubshardCount);
+}
+TEST(FloatSubOpModel, Float32MultiDimBroadcastSubshard8) {
+  TestFloat32MultiDimBroadcast(8, kMultiDimBroadcastSubshardCount);
+}
+TEST(FloatSubOpModel, Float32MultiDimBroadcastSubshard9) {
+  TestFloat32MultiDimBroadcast(9, kMultiDimBroadcastSubshardCount);
+}
+
+template <typename T>
+class IntegerSubOpTest : public ::testing::Test {};
+
+using Int32Or64Types = ::testing::Types<int32_t, int64_t>;
+TYPED_TEST_SUITE(IntegerSubOpTest, Int32Or64Types);
+
+// To improve automatic test sharding (via shard_count in the BUILD file),
+// we need to ensure that each individual test case runs in a reasonable time,
+// otherwise we end up being limited by the performance of the longest shard.
+// Since TestIntegerMultiDimBroadcast has 2^12 iterations, it takes a
+// long time (over 30 seconds) to execute all iterations -- too long for a
+// single shard.  So we split it into a few "subshards" and have a separate
+// TYPED_TEST macro invocation for each subshard.
+
+template <class TypeParam>
+void TestIntegerMultiDimBroadcast(int selected_subshard, int subshard_count) {
+  ASSERT_LT(selected_subshard, subshard_count);
+  int iteration = 0;
+  for (uint32_t bm1 = 0; bm1 < (static_cast<uint32_t>(1) << kMaxBroadcastDim);
+       bm1++) {
+    for (uint32_t bm2 = 0; bm2 < (static_cast<uint32_t>(1) << kMaxBroadcastDim);
+         bm2++) {
+      if (iteration++ % subshard_count != selected_subshard) {
+        continue;  // This iteration of the loop is not part of this subshard.
+      }
+      const bool input1_broadcast_dim1 = bm1 & (static_cast<uint32_t>(1) << 0);
+      const bool input1_broadcast_dim2 = bm1 & (static_cast<uint32_t>(1) << 1);
+      const bool input1_broadcast_dim3 = bm1 & (static_cast<uint32_t>(1) << 2);
+      const bool input1_broadcast_dim4 = bm1 & (static_cast<uint32_t>(1) << 3);
+      const bool input1_broadcast_dim5 = bm1 & (static_cast<uint32_t>(1) << 4);
+      const bool input1_broadcast_dim6 = bm1 & (static_cast<uint32_t>(1) << 5);
+      const bool input2_broadcast_dim1 = bm2 & (static_cast<uint32_t>(1) << 0);
+      const bool input2_broadcast_dim2 = bm2 & (static_cast<uint32_t>(1) << 1);
+      const bool input2_broadcast_dim3 = bm2 & (static_cast<uint32_t>(1) << 2);
+      const bool input2_broadcast_dim4 = bm2 & (static_cast<uint32_t>(1) << 3);
+      const bool input2_broadcast_dim5 = bm2 & (static_cast<uint32_t>(1) << 4);
+      const bool input2_broadcast_dim6 = bm2 & (static_cast<uint32_t>(1) << 5);
+      const int input1_dim1 = input1_broadcast_dim1 ? 1 : kDim1;
+      const int input1_dim2 = input1_broadcast_dim2 ? 1 : kDim2;
+      const int input1_dim3 = input1_broadcast_dim3 ? 1 : kDim3;
+      const int input1_dim4 = input1_broadcast_dim4 ? 1 : kDim4;
+      const int input1_dim5 = input1_broadcast_dim5 ? 1 : kDim5;
+      const int input1_dim6 = input1_broadcast_dim6 ? 1 : kDim6;
+      const int input2_dim1 = input2_broadcast_dim1 ? 1 : kDim1;
+      const int input2_dim2 = input2_broadcast_dim2 ? 1 : kDim2;
+      const int input2_dim3 = input2_broadcast_dim3 ? 1 : kDim3;
+      const int input2_dim4 = input2_broadcast_dim4 ? 1 : kDim4;
+      const int input2_dim5 = input2_broadcast_dim5 ? 1 : kDim5;
+      const int input2_dim6 = input2_broadcast_dim6 ? 1 : kDim6;
+      std::vector<int> input1_full_shape{input1_dim1, input1_dim2, input1_dim3,
+                                         input1_dim4, input1_dim5, input1_dim6};
+      std::vector<int> input2_full_shape{input2_dim1, input2_dim2, input2_dim3,
+                                         input2_dim4, input2_dim5, input2_dim6};
+      for (int input1_dims = 1; input1_dims <= kMaxBroadcastDim;
+           ++input1_dims) {
+        for (int input2_dims = 1; input2_dims <= kMaxBroadcastDim;
+             ++input2_dims) {
+          std::vector<int> input1_shape(input1_dims), input2_shape(input2_dims);
+          std::copy(input1_full_shape.end() - input1_dims,
+                    input1_full_shape.end(), input1_shape.data());
+          std::copy(input2_full_shape.end() - input2_dims,
+                    input2_full_shape.end(), input2_shape.data());
+          TestIntegerBroadcast<TypeParam>(input1_shape, input2_shape);
+        }
+      }
+    }
+  }
+}
+
+TYPED_TEST(IntegerSubOpTest, IntegerMultiDimBroadcastSubshard0) {
+  TestIntegerMultiDimBroadcast<TypeParam>(0, kMultiDimBroadcastSubshardCount);
+}
+TYPED_TEST(IntegerSubOpTest, IntegerMultiDimBroadcastSubshard1) {
+  TestIntegerMultiDimBroadcast<TypeParam>(1, kMultiDimBroadcastSubshardCount);
+}
+TYPED_TEST(IntegerSubOpTest, IntegerMultiDimBroadcastSubshard2) {
+  TestIntegerMultiDimBroadcast<TypeParam>(2, kMultiDimBroadcastSubshardCount);
+}
+TYPED_TEST(IntegerSubOpTest, IntegerMultiDimBroadcastSubshard3) {
+  TestIntegerMultiDimBroadcast<TypeParam>(3, kMultiDimBroadcastSubshardCount);
+}
+TYPED_TEST(IntegerSubOpTest, IntegerMultiDimBroadcastSubshard4) {
+  TestIntegerMultiDimBroadcast<TypeParam>(4, kMultiDimBroadcastSubshardCount);
+}
+TYPED_TEST(IntegerSubOpTest, IntegerMultiDimBroadcastSubshard5) {
+  TestIntegerMultiDimBroadcast<TypeParam>(5, kMultiDimBroadcastSubshardCount);
+}
+TYPED_TEST(IntegerSubOpTest, IntegerMultiDimBroadcastSubshard6) {
+  TestIntegerMultiDimBroadcast<TypeParam>(6, kMultiDimBroadcastSubshardCount);
+}
+TYPED_TEST(IntegerSubOpTest, IntegerMultiDimBroadcastSubshard7) {
+  TestIntegerMultiDimBroadcast<TypeParam>(7, kMultiDimBroadcastSubshardCount);
+}
+TYPED_TEST(IntegerSubOpTest, IntegerMultiDimBroadcastSubshard8) {
+  TestIntegerMultiDimBroadcast<TypeParam>(8, kMultiDimBroadcastSubshardCount);
+}
+TYPED_TEST(IntegerSubOpTest, IntegerMultiDimBroadcastSubshard9) {
+  TestIntegerMultiDimBroadcast<TypeParam>(9, kMultiDimBroadcastSubshardCount);
+}
+
+template <typename QuantizedType>
+void TestQuantizedBroadcast(std::vector<int> input1_shape,
+                            std::vector<int> input2_shape) {
+  std::array<int, kMaxBroadcastDim> input1_dims;
+  std::array<int, kMaxBroadcastDim> input2_dims;
+  std::array<int, kMaxBroadcastDim> output_dims;
+  std::array<int, kMaxBroadcastDim> input1_strides;
+  std::array<int, kMaxBroadcastDim> input2_strides;
+  std::array<int, kMaxBroadcastDim> output_strides;
+  std::fill(input1_dims.begin(), input1_dims.end(), 1);
+  std::fill(input2_dims.begin(), input2_dims.end(), 1);
+  std::fill(output_dims.begin(), output_dims.end(), 1);
+  std::copy(input1_shape.cbegin(), input1_shape.cend(),
+            input1_dims.end() - input1_shape.size());
+  std::copy(input2_shape.cbegin(), input2_shape.cend(),
+            input2_dims.end() - input2_shape.size());
+
+  for (size_t i = 0; i < kMaxBroadcastDim; i++) {
+    if (input1_dims[i] != 1 && input2_dims[i] != 1) {
+      ASSERT_EQ(input1_dims[i], input2_dims[i]);
+    }
+    output_dims[i] = std::max(input1_dims[i], input2_dims[i]);
+  }
+  // Compute generalized strides.
+  size_t input1_stride = 1, input2_stride = 1, output_stride = 1;
+  for (size_t i = kMaxBroadcastDim; i != 0; i--) {
+    input1_strides[i - 1] = input1_dims[i - 1] == 1 ? 0 : input1_stride;
+    input2_strides[i - 1] = input2_dims[i - 1] == 1 ? 0 : input2_stride;
+    output_strides[i - 1] = output_stride;
+    input1_stride *= input1_dims[i - 1];
+    input2_stride *= input2_dims[i - 1];
+    output_stride *= output_dims[i - 1];
+  }
+  const int num_input1_elements = std::accumulate(
+      input1_dims.begin(), input1_dims.end(), 1, std::multiplies<int>());
+  const int num_input2_elements = std::accumulate(
+      input2_dims.begin(), input2_dims.end(), 1, std::multiplies<int>());
+  const int num_output_elements = std::accumulate(
+      output_dims.begin(), output_dims.end(), 1, std::multiplies<int>());
+  std::vector<float> input1(num_input1_elements);
+  std::vector<float> input2(num_input2_elements);
+  std::vector<float> output_ref(num_output_elements);
+
+  std::random_device random_device;
+  auto rng = std::mt19937(random_device());
+
+  std::uniform_real_distribution<float> dist(-0.5f, 0.5f);
+
+  std::generate(input1.begin(), input1.end(), [&]() { return dist(rng); });
+  std::generate(input2.begin(), input2.end(), [&]() { return dist(rng); });
+
+  QuantizedSubOpModel m(
+      {GetTensorType<QuantizedType>(), input1_shape, -0.5f, 0.5f},
+      {GetTensorType<QuantizedType>(), input2_shape, -0.5f, 0.5f},
+      {GetTensorType<QuantizedType>(), {}, -0.5f, 0.5f},
+      ActivationFunctionType_NONE);
+  m.QuantizeAndPopulate<QuantizedType>(m.input1(), input1);
+  m.QuantizeAndPopulate<QuantizedType>(m.input2(), input2);
+  // Compute reference results.
+  for (size_t i = 0; i < output_dims[0]; i++) {
+    for (size_t j = 0; j < output_dims[1]; j++) {
+      for (size_t k = 0; k < output_dims[2]; k++) {
+        for (size_t l = 0; l < output_dims[3]; l++) {
+          for (size_t m = 0; m < output_dims[4]; m++) {
+            for (size_t n = 0; n < output_dims[5]; n++) {
+              float x = input1[i * input1_strides[0] + j * input1_strides[1] +
+                               k * input1_strides[2] + l * input1_strides[3] +
+                               m * input1_strides[4] + n * input1_strides[5]];
+              float y = input2[i * input2_strides[0] + j * input2_strides[1] +
+                               k * input2_strides[2] + l * input2_strides[3] +
+                               m * input2_strides[4] + n * input2_strides[5]];
+              output_ref[i * output_strides[0] + j * output_strides[1] +
+                         k * output_strides[2] + l * output_strides[3] +
+                         m * output_strides[4] + n * output_strides[5]] = x - y;
+            }
+          }
+        }
+      }
+    }
+  }
+
+  for (float& output_value : output_ref) {
+    output_value = std::max<float>(output_value, -1.0f);
+    output_value = std::min<float>(output_value, 1.0f);
+  }
+
+  ASSERT_EQ(m.Invoke(), kTfLiteOk);
+  std::vector<float> output = m.GetDequantizedOutput<QuantizedType>();
+  for (size_t i = 0; i < output_dims[0]; i++) {
+    for (size_t j = 0; j < output_dims[1]; j++) {
+      for (size_t k = 0; k < output_dims[2]; k++) {
+        for (size_t l = 0; l < output_dims[3]; l++) {
+          for (size_t m = 0; m < output_dims[4]; m++) {
+            for (size_t n = 0; n < output_dims[5]; n++) {
+              const size_t index =
+                  i * output_strides[0] + j * output_strides[1] +
+                  k * output_strides[2] + l * output_strides[3] +
+                  m * output_strides[4] + n * output_strides[5];
+              EXPECT_NEAR(output[index], output_ref[index], 0.6f)
+                  << "(i, j, k, l, m, n) = (" << i << ", " << j << ", " << k
+                  << ", " << l << ", " << m << ", " << n << ")";
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
+template <typename T>
+class QuantizedSubOpTest : public ::testing::Test {};
+
+using Int8OrUInt8OrInt16Types = ::testing::Types<int8_t, uint8_t, int16_t>;
+TYPED_TEST_SUITE(QuantizedSubOpTest, Int8OrUInt8OrInt16Types);
+
+// To improve automatic test sharding (via shard_count in the BUILD file),
+// we need to ensure that each individual test case runs in a reasonable time,
+// otherwise we end up being limited by the performance of the longest shard.
+// Since TestQuantizedMultiDimBroadcast has 2^12 iterations, it takes a
+// long time (over 30 seconds) to execute all iterations -- too long for a
+// single shard.  So we split it into a few "subshards" and have a separate
+// TEST macro invocation for each subshard.
+
+template <class T>
+void TestQuantizedMultiDimBroadcast(int selected_subshard, int subshard_count) {
+  ASSERT_LT(selected_subshard, subshard_count);
+  int iteration = 0;
+  for (uint32_t bm1 = 0; bm1 < (static_cast<uint32_t>(1) << kMaxBroadcastDim);
+       bm1++) {
+    for (uint32_t bm2 = 0; bm2 < (static_cast<uint32_t>(1) << kMaxBroadcastDim);
+         bm2++) {
+      if (iteration++ % subshard_count != selected_subshard) {
+        continue;  // This iteration of the loop is not part of this subshard.
+      }
+      const bool input1_broadcast_dim1 = bm1 & (static_cast<uint32_t>(1) << 0);
+      const bool input1_broadcast_dim2 = bm1 & (static_cast<uint32_t>(1) << 1);
+      const bool input1_broadcast_dim3 = bm1 & (static_cast<uint32_t>(1) << 2);
+      const bool input1_broadcast_dim4 = bm1 & (static_cast<uint32_t>(1) << 3);
+      const bool input1_broadcast_dim5 = bm1 & (static_cast<uint32_t>(1) << 4);
+      const bool input1_broadcast_dim6 = bm1 & (static_cast<uint32_t>(1) << 5);
+      const bool input2_broadcast_dim1 = bm2 & (static_cast<uint32_t>(1) << 0);
+      const bool input2_broadcast_dim2 = bm2 & (static_cast<uint32_t>(1) << 1);
+      const bool input2_broadcast_dim3 = bm2 & (static_cast<uint32_t>(1) << 2);
+      const bool input2_broadcast_dim4 = bm2 & (static_cast<uint32_t>(1) << 3);
+      const bool input2_broadcast_dim5 = bm2 & (static_cast<uint32_t>(1) << 4);
+      const bool input2_broadcast_dim6 = bm2 & (static_cast<uint32_t>(1) << 5);
+      const int input1_dim1 = input1_broadcast_dim1 ? 1 : kDim1;
+      const int input1_dim2 = input1_broadcast_dim2 ? 1 : kDim2;
+      const int input1_dim3 = input1_broadcast_dim3 ? 1 : kDim3;
+      const int input1_dim4 = input1_broadcast_dim4 ? 1 : kDim4;
+      const int input1_dim5 = input1_broadcast_dim5 ? 1 : kDim5;
+      const int input1_dim6 = input1_broadcast_dim6 ? 1 : kDim6;
+      const int input2_dim1 = input2_broadcast_dim1 ? 1 : kDim1;
+      const int input2_dim2 = input2_broadcast_dim2 ? 1 : kDim2;
+      const int input2_dim3 = input2_broadcast_dim3 ? 1 : kDim3;
+      const int input2_dim4 = input2_broadcast_dim4 ? 1 : kDim4;
+      const int input2_dim5 = input2_broadcast_dim5 ? 1 : kDim5;
+      const int input2_dim6 = input2_broadcast_dim6 ? 1 : kDim6;
+      std::vector<int> input1_full_shape{input1_dim1, input1_dim2, input1_dim3,
+                                         input1_dim4, input1_dim5, input1_dim6};
+      std::vector<int> input2_full_shape{input2_dim1, input2_dim2, input2_dim3,
+                                         input2_dim4, input2_dim5, input2_dim6};
+      for (int input1_dims = 1; input1_dims <= kMaxBroadcastDim;
+           ++input1_dims) {
+        for (int input2_dims = 1; input2_dims <= kMaxBroadcastDim;
+             ++input2_dims) {
+          std::vector<int> input1_shape(input1_dims), input2_shape(input2_dims);
+          std::copy(input1_full_shape.end() - input1_dims,
+                    input1_full_shape.end(), input1_shape.data());
+          std::copy(input2_full_shape.end() - input2_dims,
+                    input2_full_shape.end(), input2_shape.data());
+          TestQuantizedBroadcast<T>(input1_shape, input2_shape);
+        }
+      }
+    }
+  }
+}
+
+TEST(QuantizedSubOpModel, Int8QuantizedMultiDimBroadcastSubshard0) {
+  TestQuantizedMultiDimBroadcast<int8_t>(0, kMultiDimBroadcastSubshardCount);
+}
+TEST(QuantizedSubOpModel, Int8QuantizedMultiDimBroadcastSubshard1) {
+  TestQuantizedMultiDimBroadcast<int8_t>(1, kMultiDimBroadcastSubshardCount);
+}
+TEST(QuantizedSubOpModel, Int8QuantizedMultiDimBroadcastSubshard2) {
+  TestQuantizedMultiDimBroadcast<int8_t>(2, kMultiDimBroadcastSubshardCount);
+}
+TEST(QuantizedSubOpModel, Int8QuantizedMultiDimBroadcastSubshard3) {
+  TestQuantizedMultiDimBroadcast<int8_t>(3, kMultiDimBroadcastSubshardCount);
+}
+TEST(QuantizedSubOpModel, Int8QuantizedMultiDimBroadcastSubshard4) {
+  TestQuantizedMultiDimBroadcast<int8_t>(4, kMultiDimBroadcastSubshardCount);
+}
+TEST(QuantizedSubOpModel, Int8QuantizedMultiDimBroadcastSubshard5) {
+  TestQuantizedMultiDimBroadcast<int8_t>(5, kMultiDimBroadcastSubshardCount);
+}
+TEST(QuantizedSubOpModel, Int8QuantizedMultiDimBroadcastSubshard6) {
+  TestQuantizedMultiDimBroadcast<int8_t>(6, kMultiDimBroadcastSubshardCount);
+}
+TEST(QuantizedSubOpModel, Int8QuantizedMultiDimBroadcastSubshard7) {
+  TestQuantizedMultiDimBroadcast<int8_t>(7, kMultiDimBroadcastSubshardCount);
+}
+TEST(QuantizedSubOpModel, Int8QuantizedMultiDimBroadcastSubshard8) {
+  TestQuantizedMultiDimBroadcast<int8_t>(8, kMultiDimBroadcastSubshardCount);
+}
+TEST(QuantizedSubOpModel, Int8QuantizedMultiDimBroadcastSubshard9) {
+  TestQuantizedMultiDimBroadcast<int8_t>(9, kMultiDimBroadcastSubshardCount);
+}
+
+TEST(QuantizedSubOpModel, Uint8QuantizedMultiDimBroadcastSubshard0) {
+  TestQuantizedMultiDimBroadcast<uint8_t>(0, kMultiDimBroadcastSubshardCount);
+}
+TEST(QuantizedSubOpModel, Uint8QuantizedMultiDimBroadcastSubshard1) {
+  TestQuantizedMultiDimBroadcast<uint8_t>(1, kMultiDimBroadcastSubshardCount);
+}
+TEST(QuantizedSubOpModel, Uint8QuantizedMultiDimBroadcastSubshard2) {
+  TestQuantizedMultiDimBroadcast<uint8_t>(2, kMultiDimBroadcastSubshardCount);
+}
+TEST(QuantizedSubOpModel, Uint8QuantizedMultiDimBroadcastSubshard3) {
+  TestQuantizedMultiDimBroadcast<uint8_t>(3, kMultiDimBroadcastSubshardCount);
+}
+TEST(QuantizedSubOpModel, Uint8QuantizedMultiDimBroadcastSubshard4) {
+  TestQuantizedMultiDimBroadcast<uint8_t>(4, kMultiDimBroadcastSubshardCount);
+}
+TEST(QuantizedSubOpModel, Uint8QuantizedMultiDimBroadcastSubshard5) {
+  TestQuantizedMultiDimBroadcast<uint8_t>(5, kMultiDimBroadcastSubshardCount);
+}
+TEST(QuantizedSubOpModel, Uint8QuantizedMultiDimBroadcastSubshard6) {
+  TestQuantizedMultiDimBroadcast<uint8_t>(6, kMultiDimBroadcastSubshardCount);
+}
+TEST(QuantizedSubOpModel, Uint8QuantizedMultiDimBroadcastSubshard7) {
+  TestQuantizedMultiDimBroadcast<uint8_t>(7, kMultiDimBroadcastSubshardCount);
+}
+TEST(QuantizedSubOpModel, Uint8QuantizedMultiDimBroadcastSubshard8) {
+  TestQuantizedMultiDimBroadcast<uint8_t>(8, kMultiDimBroadcastSubshardCount);
+}
+TEST(QuantizedSubOpModel, Uint8QuantizedMultiDimBroadcastSubshard9) {
+  TestQuantizedMultiDimBroadcast<uint8_t>(9, kMultiDimBroadcastSubshardCount);
+}
+
 }  // namespace
 }  // namespace tflite

From 1b257eecced46265a7f3782c0cfaf70a32d73abd Mon Sep 17 00:00:00 2001
From: Jared Junyoung Lim <jaredlim@google.com>
Date: Sat, 23 Sep 2023 01:57:49 -0700
Subject: [PATCH 191/567] Optimize BroadcastSub functions in TFLite reference
 kernel.

It removes redundant dimensions, compresses compressible dimensions, and handles broadcasting. It also vectorizes int32 element-wise subtraction.

PiperOrigin-RevId: 567825531
---
 tensorflow/lite/kernels/internal/common.h     |   4 +-
 .../lite/kernels/internal/reference/sub.h     | 396 ++++++++++--------
 2 files changed, 221 insertions(+), 179 deletions(-)

diff --git a/tensorflow/lite/kernels/internal/common.h b/tensorflow/lite/kernels/internal/common.h
index 3e95c54def7aa8..14d859917522f7 100644
--- a/tensorflow/lite/kernels/internal/common.h
+++ b/tensorflow/lite/kernels/internal/common.h
@@ -17,6 +17,7 @@ limitations under the License.
 
 #include <algorithm>
 #include <cstddef>
+#include <cstdint>
 
 #include "tensorflow/lite/kernels/internal/runtime_shape.h"
 #ifndef ALLOW_SLOW_GENERIC_DEPTHWISECONV_FALLBACK
@@ -60,8 +61,7 @@ bool ReduceDimensionsForBroadcast(const RuntimeShape& input1_shape,
   const size_t num_input2_dims = input2_shape.DimensionsCount();
   const int32_t* input1_dims = input1_shape.DimsData();
   const int32_t* input2_dims = input2_shape.DimsData();
-  const size_t num_common_dims =
-      (num_input1_dims < num_input2_dims) ? num_input1_dims : num_input2_dims;
+  const size_t num_common_dims = std::min(num_input1_dims, num_input2_dims);
   for (size_t i = 1; i <= num_common_dims; i++) {
     const size_t input1_dim = input1_dims[num_input1_dims - i];
     const size_t input2_dim = input2_dims[num_input2_dims - i];
diff --git a/tensorflow/lite/kernels/internal/reference/sub.h b/tensorflow/lite/kernels/internal/reference/sub.h
index 50e8ec1ee06f53..862bee149a5a69 100644
--- a/tensorflow/lite/kernels/internal/reference/sub.h
+++ b/tensorflow/lite/kernels/internal/reference/sub.h
@@ -18,6 +18,7 @@ limitations under the License.
 #include <stdint.h>
 
 #include <algorithm>
+#include <cstddef>
 #include <limits>
 
 #include "ruy/profiler/instrumentation.h"  // from @ruy
@@ -29,96 +30,179 @@ namespace tflite {
 
 namespace reference_ops {
 
-// Maximum dimension supported by the broadcast sub operation.
-constexpr int kMaxSubBroadcastDim = 6;
+template <class T>
+struct SubImpl {
+  template <class F>
+  static void BroadcastInput1(const ArithmeticParams& params,
+                              const T* input1_data, const T* input2_data,
+                              T* output_data, size_t size, F binary_func) {
+    for (int c = 0; c < size; ++c) {
+      output_data[c] = binary_func(input1_data[0], input2_data[c], params);
+    }
+  }
 
-inline void SubNonBroadcast(const ArithmeticParams& params,
-                            const RuntimeShape& input1_shape,
-                            const float* input1_data,
-                            const RuntimeShape& input2_shape,
-                            const float* input2_data,
-                            const RuntimeShape& output_shape,
-                            float* output_data) {
-  const int flat_size =
-      MatchingElementsSize(input1_shape, input2_shape, output_shape);
-  for (int i = 0; i < flat_size; ++i) {
-    output_data[i] = ActivationFunctionWithMinMax(
-        input1_data[i] - input2_data[i], params.float_activation_min,
-        params.float_activation_max);
+  template <class F>
+  static void BroadcastInput2(const ArithmeticParams& params,
+                              const T* input1_data, const T* input2_data,
+                              T* output_data, size_t size, F binary_func) {
+    for (int c = 0; c < size; ++c) {
+      output_data[c] = binary_func(input1_data[c], input2_data[0], params);
+    }
   }
-}
 
-inline void SubNonBroadcast(const ArithmeticParams& params,
-                            const RuntimeShape& input1_shape,
-                            const int32_t* input1_data,
-                            const RuntimeShape& input2_shape,
-                            const int32_t* input2_data,
-                            const RuntimeShape& output_shape,
-                            int32_t* output_data) {
-  const int flat_size =
-      MatchingElementsSize(input1_shape, input2_shape, output_shape);
-  for (int i = 0; i < flat_size; ++i) {
-    output_data[i] = ActivationFunctionWithMinMax(
-        input1_data[i] - input2_data[i], params.quantized_activation_min,
-        params.quantized_activation_max);
+  template <class F>
+  static void ElementWise(const ArithmeticParams& params, const T* input1_data,
+                          const T* input2_data, T* output_data, size_t size,
+                          F binary_func) {
+    for (int c = 0; c < size; ++c) {
+      output_data[c] = binary_func(input1_data[c], input2_data[c], params);
+    }
   }
-}
+};
+
+template <>
+struct SubImpl<int32_t> {
+  template <class F>
+  static void BroadcastInput1(const ArithmeticParams& params,
+                              const int32_t* input1_data,
+                              const int32_t* input2_data, int32_t* output_data,
+                              size_t size, F binary_func) {
+    size_t c = 0;
+    int32_t activation_min, activation_max;
+    GetActivationParams(params, &activation_min, &activation_max);
+#ifdef USE_NEON
+    const int32x4_t vmax = vdupq_n_s32(activation_max);
+    const int32x4_t vmin = vdupq_n_s32(activation_min);
+    const int32x4_t va = vdupq_n_s32(input1_data[0]);
+    for (; c + 4 <= size; c += 4) {
+      const int32x4_t vb = vld1q_s32(&input2_data[c]);
+      int32x4_t vres = vsubq_s32(va, vb);
+      vres = vmaxq_s32(vmin, vres);
+      vres = vminq_s32(vmax, vres);
+      vst1q_s32(&output_data[c], vres);
+    }
+#endif
+    for (; c < size; ++c) {
+      output_data[c] = binary_func(input1_data[0], input2_data[c], params);
+    }
+  }
+
+  template <class F>
+  static void BroadcastInput2(const ArithmeticParams& params,
+                              const int32_t* input1_data,
+                              const int32_t* input2_data, int32_t* output_data,
+                              size_t size, F binary_func) {
+    size_t c = 0;
+    int32_t activation_min, activation_max;
+    GetActivationParams(params, &activation_min, &activation_max);
+#ifdef USE_NEON
+    const int32x4_t vmax = vdupq_n_s32(activation_max);
+    const int32x4_t vmin = vdupq_n_s32(activation_min);
+    const int32x4_t vb = vdupq_n_s32(input2_data[0]);
+    for (; c + 4 <= size; c += 4) {
+      const int32x4_t va = vld1q_s32(&input1_data[c]);
+      int32x4_t vres = vsubq_s32(va, vb);
+      vres = vmaxq_s32(vmin, vres);
+      vres = vminq_s32(vmax, vres);
+      vst1q_s32(&output_data[c], vres);
+    }
+#endif
+    for (; c < size; ++c) {
+      output_data[c] = binary_func(input1_data[c], input2_data[0], params);
+    }
+  }
+
+  template <class F>
+  static void ElementWise(const ArithmeticParams& params,
+                          const int32_t* input1_data,
+                          const int32_t* input2_data, int32_t* output_data,
+                          size_t size, F binary_func) {
+    size_t c = 0;
+    int32_t activation_min, activation_max;
+    GetActivationParams(params, &activation_min, &activation_max);
+#ifdef USE_NEON
+    int32x4_t vmax = vdupq_n_s32(activation_max);
+    int32x4_t vmin = vdupq_n_s32(activation_min);
+    for (; c + 4 <= size; c += 4) {
+      const int32x4_t va = vld1q_s32(&input1_data[c]);
+      const int32x4_t vb = vld1q_s32(&input2_data[c]);
+      int32x4_t vres = vsubq_s32(va, vb);
+      vres = vmaxq_s32(vmin, vres);
+      vres = vminq_s32(vmax, vres);
+      vst1q_s32(&output_data[c], vres);
+    }
+#endif
+    for (; c < size; ++c) {
+      output_data[c] = binary_func(input1_data[c], input2_data[c], params);
+    }
+  }
+};
 
 template <typename T, typename F>
-void BroadcastSubRecursiveDimensions(
-    const ArithmeticParams& params, int dimension, const T* input1_data,
+inline void BroadcastSubRecursiveDimensions(
+    int dimension, const ArithmeticParams& params, const T* input1_data,
     const T* input2_data, T* output_data, size_t* input1_offset_p,
     size_t* input2_offset_p, size_t* output_offset,
-    const NdArrayDesc<kMaxSubBroadcastDim>& desc1,
-    const NdArrayDesc<kMaxSubBroadcastDim>& desc2,
-    const int32_t extended_output_shape_dims[kMaxSubBroadcastDim],
-    F binary_func) {
-  if (dimension == kMaxSubBroadcastDim - 1) {
-    for (int c = 0; c < extended_output_shape_dims[dimension]; ++c) {
-      const T input1_val = input1_data[*input1_offset_p];
-      const T input2_val = input2_data[*input2_offset_p];
-      output_data[*output_offset] = binary_func(params, input1_val, input2_val);
-      *input1_offset_p += desc1.strides[dimension];
-      *input2_offset_p += desc2.strides[dimension];
-      ++(*output_offset);
-    }
-  } else {
-    for (int a = 0; a < extended_output_shape_dims[dimension]; ++a) {
+    size_t* compressed_input1_stride, size_t* compressed_input2_stride,
+    size_t* compressed_output_shape, F binary_func) {
+  if (dimension > 0) {
+    for (int c = 0; c < compressed_output_shape[dimension]; ++c) {
       size_t input1_offset_c = *input1_offset_p;
       size_t input2_offset_c = *input2_offset_p;
       BroadcastSubRecursiveDimensions(
-          params, dimension + 1, input1_data, input2_data, output_data,
-          &input1_offset_c, &input2_offset_c, output_offset, desc1, desc2,
-          extended_output_shape_dims, binary_func);
-      *input1_offset_p += desc1.strides[dimension];
-      *input2_offset_p += desc2.strides[dimension];
+          dimension - 1, params, input1_data, input2_data, output_data,
+          &input1_offset_c, &input2_offset_c, output_offset,
+          compressed_input1_stride, compressed_input2_stride,
+          compressed_output_shape, binary_func);
+      *input1_offset_p += compressed_input1_stride[dimension];
+      *input2_offset_p += compressed_input2_stride[dimension];
+    }
+  } else {
+    TFLITE_DCHECK(dimension == 0);
+    bool input1_is_broadcast = compressed_input1_stride[dimension] == 0;
+    bool input2_is_broadcast = compressed_input2_stride[dimension] == 0;
+    TFLITE_DCHECK(!(input1_is_broadcast && input2_is_broadcast));
+    const T* input1_data_ptr = input1_data + *input1_offset_p;
+    const T* input2_data_ptr = input2_data + *input2_offset_p;
+    T* output_data_ptr = output_data + *output_offset;
+    if (input1_is_broadcast) {
+      // input1 is broadcast.
+      SubImpl<T>::BroadcastInput1(
+          params, input1_data_ptr, input2_data_ptr, output_data_ptr,
+          compressed_output_shape[dimension], binary_func);
+      *input2_offset_p += compressed_output_shape[dimension];
+    } else if (input2_is_broadcast) {
+      // input2 is broadcast.
+      SubImpl<T>::BroadcastInput2(
+          params, input1_data_ptr, input2_data_ptr, output_data_ptr,
+          compressed_output_shape[dimension], binary_func);
+      *input1_offset_p += compressed_output_shape[dimension];
+    } else {
+      // Subtract element-wise.
+      SubImpl<T>::ElementWise(params, input1_data_ptr, input2_data_ptr,
+                              output_data_ptr,
+                              compressed_output_shape[dimension], binary_func);
+      *input1_offset_p += compressed_output_shape[dimension];
+      *input2_offset_p += compressed_output_shape[dimension];
     }
+    *output_offset += compressed_output_shape[dimension];
   }
 }
 
-// TODO(b/151345304): We can implement BroadcastSub on buffers of arbitrary
-// dimensionality if the runtime code does a single loop over one dimension
-// that handles broadcasting as the base case. The code generator would then
-// generate max(D1, D2) nested for loops.
-template <typename T>
-void BroadcastSubSlow(const ArithmeticParams& params,
-                      const RuntimeShape& input1_shape, const T* input1_data,
-                      const RuntimeShape& input2_shape, const T* input2_data,
-                      const RuntimeShape& output_shape, T* output_data) {
-  ruy::profiler::ScopeLabel label("BroadcastSubSlow/T");
-  TFLITE_DCHECK_LE(input1_shape.DimensionsCount(), kMaxSubBroadcastDim);
-  TFLITE_DCHECK_LE(input2_shape.DimensionsCount(), kMaxSubBroadcastDim);
-  TFLITE_DCHECK_LE(output_shape.DimensionsCount(), kMaxSubBroadcastDim);
-  NdArrayDesc<kMaxSubBroadcastDim> desc1;
-  NdArrayDesc<kMaxSubBroadcastDim> desc2;
-  NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
-                                      &desc2);
-  const RuntimeShape extended_output_shape =
-      RuntimeShape::ExtendedShape(kMaxSubBroadcastDim, output_shape);
-  // Cache output shape dimensions.
-  int32_t extended_output_shape_dims[kMaxSubBroadcastDim];
-  std::memcpy(extended_output_shape_dims, extended_output_shape.DimsData(),
-              sizeof(extended_output_shape_dims));
+// TODO: b/296510380 - we may be able to factor out this to common.h for all
+// binary arithmetic ops (add, sub, mul).
+template <typename T, typename F>
+inline void BroadcastSubCommon(const ArithmeticParams& params,
+                               const RuntimeShape& input1_shape,
+                               const T* input1_data,
+                               const RuntimeShape& input2_shape,
+                               const T* input2_data,
+                               const RuntimeShape& output_shape, T* output_data,
+                               F binary_func) {
+  constexpr int kMaxBroadcastDim = 6;
+  TFLITE_DCHECK_LE(input1_shape.DimensionsCount(), kMaxBroadcastDim);
+  TFLITE_DCHECK_LE(input2_shape.DimensionsCount(), kMaxBroadcastDim);
+  TFLITE_DCHECK_LE(output_shape.DimensionsCount(), kMaxBroadcastDim);
 
   // In Tensorflow, the dimensions are canonically named (batch_number, row,
   // col, channel), with extents (batches, height, width, depth), with the
@@ -131,18 +215,57 @@ void BroadcastSubSlow(const ArithmeticParams& params,
   // We name our variables by their Tensorflow convention, but generate C code
   // nesting loops such that the innermost loop has the smallest stride for the
   // best cache behavior.
+
+  // In Tensorflow, the dimensions are canonically named (batch_number, row,
+  // col, channel), with extents (batches, height, width, depth), with the
+  // trailing dimension changing most rapidly (channels has the smallest stride,
+  // typically 1 element).
+  //
+  // In generated C code, we store arrays with the dimensions reversed. The
+  // first dimension has smallest stride.
+  //
+  // We name our variables by their Tensorflow convention, but generate C code
+  // nesting loops such that the innermost loop has the smallest stride for the
+  // best cache behavior.
+
+  size_t compressed_input1_stride[kMaxBroadcastDim];
+  size_t compressed_input2_stride[kMaxBroadcastDim];
+  size_t compressed_output_shape[kMaxBroadcastDim];
+  bool broadcastable_shape = ReduceDimensionsForBroadcast<kMaxBroadcastDim>(
+      input1_shape, input2_shape, compressed_input1_stride,
+      compressed_input2_stride, compressed_output_shape);
+  // Skip broadcasting for degenerate shapes.
+  if (!broadcastable_shape) {
+    return;
+  }
+
   size_t input1_offset = 0;
   size_t input2_offset = 0;
   size_t output_offset = 0;
   BroadcastSubRecursiveDimensions(
-      params, 0, input1_data, input2_data, output_data, &input1_offset,
-      &input2_offset, &output_offset, desc1, desc2, extended_output_shape_dims,
-      [](const ArithmeticParams& params, const T input1_val,
-         const T input2_val) {
+      kMaxBroadcastDim - 1, params, input1_data, input2_data, output_data,
+      &input1_offset, &input2_offset, &output_offset, compressed_input1_stride,
+      compressed_input2_stride, compressed_output_shape, binary_func);
+}
+
+// TODO(b/151345304): We can implement BroadcastSub on buffers of arbitrary
+// dimensionality if the runtime code does a single loop over one dimension
+// that handles broadcasting as the base case. The code generator would then
+// generate max(D1, D2) nested for loops.
+template <typename T>
+void BroadcastSubSlow(const ArithmeticParams& params,
+                      const RuntimeShape& input1_shape, const T* input1_data,
+                      const RuntimeShape& input2_shape, const T* input2_data,
+                      const RuntimeShape& output_shape, T* output_data) {
+  ruy::profiler::ScopeLabel label("BroadcastSubSlow/T");
+  BroadcastSubCommon<T>(
+      params, input1_shape, input1_data, input2_shape, input2_data,
+      output_shape, output_data,
+      [](T input1_val, T input2_val, const ArithmeticParams& params) {
         T activation_min, activation_max;
         GetActivationParams(params, &activation_min, &activation_max);
-        return ActivationFunctionWithMinMax<T>(input1_val - input2_val,
-                                               activation_min, activation_max);
+        return ActivationFunctionWithMinMax(input1_val - input2_val,
+                                            activation_min, activation_max);
       });
 }
 
@@ -154,39 +277,11 @@ inline void BroadcastSub16POTSlow(const ArithmeticParams& params,
                                   const RuntimeShape& output_shape,
                                   int16_t* output_data) {
   ruy::profiler::ScopeLabel label("BroadcastSub16POTSlow/int16_t");
-  TFLITE_DCHECK_LE(input1_shape.DimensionsCount(), kMaxSubBroadcastDim);
-  TFLITE_DCHECK_LE(input2_shape.DimensionsCount(), kMaxSubBroadcastDim);
-  TFLITE_DCHECK_LE(output_shape.DimensionsCount(), kMaxSubBroadcastDim);
-  NdArrayDesc<kMaxSubBroadcastDim> desc1;
-  NdArrayDesc<kMaxSubBroadcastDim> desc2;
-  NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
-                                      &desc2);
-  const RuntimeShape extended_output_shape =
-      RuntimeShape::ExtendedShape(kMaxSubBroadcastDim, output_shape);
-  // Cache output shape dimensions.
-  int32_t extended_output_shape_dims[kMaxSubBroadcastDim];
-  std::memcpy(extended_output_shape_dims, extended_output_shape.DimsData(),
-              sizeof(extended_output_shape_dims));
-
-  // In Tensorflow, the dimensions are canonically named (batch_number, row,
-  // col, channel), with extents (batches, height, width, depth), with the
-  // trailing dimension changing most rapidly (channels has the smallest stride,
-  // typically 1 element).
-  //
-  // In generated C code, we store arrays with the dimensions reversed. The
-  // first dimension has smallest stride.
-  //
-  // We name our variables by their Tensorflow convention, but generate C code
-  // nesting loops such that the innermost loop has the smallest stride for the
-  // best cache behavior.
-  size_t input1_offset = 0;
-  size_t input2_offset = 0;
-  size_t output_offset = 0;
-  BroadcastSubRecursiveDimensions(
-      params, 0, input1_data, input2_data, output_data, &input1_offset,
-      &input2_offset, &output_offset, desc1, desc2, extended_output_shape_dims,
-      [](const ArithmeticParams& params, const int16_t input1_val,
-         const int16_t input2_val) {
+  BroadcastSubCommon<int16_t>(
+      params, input1_shape, input1_data, input2_shape, input2_data,
+      output_shape, output_data,
+      [](int16_t input1_val, int16_t input2_val,
+         const ArithmeticParams& params) {
         const int32_t scaled_input1_val =
             gemmlowp::RoundingDivideByPOT(input1_val, -params.input1_shift);
         const int32_t scaled_input2_val =
@@ -207,39 +302,10 @@ void BroadcastQuantSubSlow(const ArithmeticParams& params,
                            const T* input2_data,
                            const RuntimeShape& output_shape, T* output_data) {
   ruy::profiler::ScopeLabel label("BroadcastQuantSubSlow/T");
-  TFLITE_DCHECK_LE(input1_shape.DimensionsCount(), kMaxSubBroadcastDim);
-  TFLITE_DCHECK_LE(input2_shape.DimensionsCount(), kMaxSubBroadcastDim);
-  TFLITE_DCHECK_LE(output_shape.DimensionsCount(), kMaxSubBroadcastDim);
-  NdArrayDesc<kMaxSubBroadcastDim> desc1;
-  NdArrayDesc<kMaxSubBroadcastDim> desc2;
-  NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
-                                      &desc2);
-  const RuntimeShape extended_output_shape =
-      RuntimeShape::ExtendedShape(kMaxSubBroadcastDim, output_shape);
-  // Cache output shape dimensions.
-  int32_t extended_output_shape_dims[kMaxSubBroadcastDim];
-  std::memcpy(extended_output_shape_dims, extended_output_shape.DimsData(),
-              sizeof(extended_output_shape_dims));
-
-  // In Tensorflow, the dimensions are canonically named (batch_number, row,
-  // col, channel), with extents (batches, height, width, depth), with the
-  // trailing dimension changing most rapidly (channels has the smallest stride,
-  // typically 1 element).
-  //
-  // In generated C code, we store arrays with the dimensions reversed. The
-  // first dimension has smallest stride.
-  //
-  // We name our variables by their Tensorflow convention, but generate C code
-  // nesting loops such that the innermost loop has the smallest stride for the
-  // best cache behavior.
-  size_t input1_offset = 0;
-  size_t input2_offset = 0;
-  size_t output_offset = 0;
-  BroadcastSubRecursiveDimensions(
-      params, 0, input1_data, input2_data, output_data, &input1_offset,
-      &input2_offset, &output_offset, desc1, desc2, extended_output_shape_dims,
-      [](const ArithmeticParams& params, const T input1_val,
-         const T input2_val) {
+  BroadcastSubCommon<T>(
+      params, input1_shape, input1_data, input2_shape, input2_data,
+      output_shape, output_data,
+      [](T input1_val, T input2_val, const ArithmeticParams& params) {
         const int32_t shifted_input1_val =
             (params.input1_offset + input1_val) * (1 << params.left_shift);
         const int32_t shifted_input2_val =
@@ -348,36 +414,12 @@ void Sub(const ArithmeticParams& params, const RuntimeShape& input1_shape,
          const T* input1_data, const RuntimeShape& input2_shape,
          const T* input2_data, const RuntimeShape& output_shape,
          T* output_data) {
-  NdArrayDesc<kMaxSubBroadcastDim> desc1;
-  NdArrayDesc<kMaxSubBroadcastDim> desc2;
-  NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
-                                      &desc2);
-  const RuntimeShape extended_output_shape =
-      RuntimeShape::ExtendedShape(kMaxSubBroadcastDim, output_shape);
-  // Cache output shape dimensions.
-  int32_t extended_output_shape_dims[kMaxSubBroadcastDim];
-  std::memcpy(extended_output_shape_dims, extended_output_shape.DimsData(),
-              sizeof(extended_output_shape_dims));
-
-  // In Tensorflow, the dimensions are canonically named (batch_number, row,
-  // col, channel), with extents (batches, height, width, depth), with the
-  // trailing dimension changing most rapidly (channels has the smallest stride,
-  // typically 1 element).
-  //
-  // In generated C code, we store arrays with the dimensions reversed. The
-  // first dimension has smallest stride.
-  //
-  // We name our variables by their Tensorflow convention, but generate C code
-  // nesting loops such that the innermost loop has the smallest stride for the
-  // best cache behavior.
-  size_t input1_offset = 0;
-  size_t input2_offset = 0;
-  size_t output_offset = 0;
-  BroadcastSubRecursiveDimensions(
-      params, 0, input1_data, input2_data, output_data, &input1_offset,
-      &input2_offset, &output_offset, desc1, desc2, extended_output_shape_dims,
-      [](const ArithmeticParams& params, const T input1_val,
-         const T input2_val) { return input1_val - input2_val; });
+  BroadcastSubCommon<T>(
+      params, input1_shape, input1_data, input2_shape, input2_data,
+      output_shape, output_data,
+      [](T input1_val, T input2_val, const ArithmeticParams& params) {
+        return input1_val - input2_val;
+      });
 }
 
 inline void SetActivationMinMax(const ArithmeticParams& params,

From c8fafddb7a53d779aa842ce075a2a0bea84711ee Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Sat, 23 Sep 2023 02:01:58 -0700
Subject: [PATCH 192/567] compat: Update forward compatibility horizon to
 2023-09-23

PiperOrigin-RevId: 567826030
---
 tensorflow/python/compat/compat.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/python/compat/compat.py b/tensorflow/python/compat/compat.py
index 4145a22481f391..ba7e2ef349203f 100644
--- a/tensorflow/python/compat/compat.py
+++ b/tensorflow/python/compat/compat.py
@@ -29,7 +29,7 @@
 # This value changes every day with an automatic CL. It can be modified in code
 # via `forward_compatibility_horizon()` or with the environment variable
 # TF_FORWARD_COMPATIBILITY_DELTA_DAYS, which is added to the compatibility date.
-_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2023, 9, 22)
+_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2023, 9, 23)
 _FORWARD_COMPATIBILITY_DELTA_DAYS_VAR_NAME = "TF_FORWARD_COMPATIBILITY_DELTA_DAYS"
 _FORWARD_COMPATIBILITY_DATE_NUMBER = None
 

From 35f4b9497cd316842d2d433b38be5ad28cb0e997 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Sat, 23 Sep 2023 02:04:35 -0700
Subject: [PATCH 193/567] Update GraphDef version to 1628.

PiperOrigin-RevId: 567826408
---
 tensorflow/core/public/version.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/core/public/version.h b/tensorflow/core/public/version.h
index 470b11e06459b2..5419c15d3aec47 100644
--- a/tensorflow/core/public/version.h
+++ b/tensorflow/core/public/version.h
@@ -108,7 +108,7 @@ limitations under the License.
 
 #define TF_GRAPH_DEF_VERSION_MIN_PRODUCER 0
 #define TF_GRAPH_DEF_VERSION_MIN_CONSUMER 0
-#define TF_GRAPH_DEF_VERSION 1627  // Updated: 2023/9/22
+#define TF_GRAPH_DEF_VERSION 1628  // Updated: 2023/9/23
 
 // Checkpoint compatibility versions (the versions field in SavedSliceMeta).
 //

From bb5c4abf1078ec2fde15f34052f7ba7d9251ab09 Mon Sep 17 00:00:00 2001
From: David Majnemer <majnemer@google.com>
Date: Sat, 23 Sep 2023 07:51:38 -0700
Subject: [PATCH 194/567] [XLA] Fix build

A bad dep added in 198144efc4235162423939b2256505765f78e145 broke the build.

PiperOrigin-RevId: 567864100
---
 third_party/xla/xla/BUILD             | 1 -
 third_party/xla/xla/comparison_util.h | 1 -
 2 files changed, 2 deletions(-)

diff --git a/third_party/xla/xla/BUILD b/third_party/xla/xla/BUILD
index 184e312a72ede1..1b728600ea6a37 100644
--- a/third_party/xla/xla/BUILD
+++ b/third_party/xla/xla/BUILD
@@ -144,7 +144,6 @@ cc_library(
         ":xla_data_proto_cc",
         "@com_google_absl//absl/base:core_headers",
         "@com_google_absl//absl/container:flat_hash_map",
-        "@com_google_absl//absl/log:check",
         "@com_google_absl//absl/strings",
         "@local_tsl//tsl/platform:float8",
         "@local_tsl//tsl/platform:logging",
diff --git a/third_party/xla/xla/comparison_util.h b/third_party/xla/xla/comparison_util.h
index 364e503ff7c0c1..0c11edf302d464 100644
--- a/third_party/xla/xla/comparison_util.h
+++ b/third_party/xla/xla/comparison_util.h
@@ -22,7 +22,6 @@ limitations under the License.
 #include <ostream>
 #include <string>
 
-#include "absl/log/check.h"
 #include "absl/strings/string_view.h"
 #include "xla/primitive_util.h"
 #include "xla/statusor.h"

From 51cc9dc4d9131e832bfc858dc59159e63ea8f116 Mon Sep 17 00:00:00 2001
From: Ramesh Sampath <rameshsampath@google.com>
Date: Sat, 23 Sep 2023 11:33:30 -0700
Subject: [PATCH 195/567] Adds back `allow_multiple_exports` which is a no-op
 to maintain `keras_export` compatibility with TF 2.13 and TF Nightly.

PiperOrigin-RevId: 567886699
---
 tensorflow/python/tools/api/generator2/extractor/extractor.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/tensorflow/python/tools/api/generator2/extractor/extractor.py b/tensorflow/python/tools/api/generator2/extractor/extractor.py
index 6e859ffef4aae5..ccdd13099d13c4 100644
--- a/tensorflow/python/tools/api/generator2/extractor/extractor.py
+++ b/tensorflow/python/tools/api/generator2/extractor/extractor.py
@@ -335,6 +335,9 @@ def _add_exported_symbol(self, node: ast.Call, symbol_name: str) -> None:
         v1_apis = tuple(
             f'{self._api_name}.{v}' for v in self._literal_value(kw.value)
         )
+      elif kw.arg == 'allow_multiple_exports':
+        # no-op kept for backward compatibility of `tf-keras` with TF 2.13
+        pass
       else:
         raise BadExportError(
             f'{self._current_file}:{node.lineno} export called'

From 39447256a55bf5cd69907ae261eaa6ac98f80a1c Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Sat, 23 Sep 2023 12:01:50 -0700
Subject: [PATCH 196/567] Internal Code Change

PiperOrigin-RevId: 567889304
---
 tensorflow/compiler/mlir/tfrt/ir/mlrt/BUILD | 9 ++-------
 1 file changed, 2 insertions(+), 7 deletions(-)

diff --git a/tensorflow/compiler/mlir/tfrt/ir/mlrt/BUILD b/tensorflow/compiler/mlir/tfrt/ir/mlrt/BUILD
index 2001bfc7186ca9..3e7d5db414a542 100644
--- a/tensorflow/compiler/mlir/tfrt/ir/mlrt/BUILD
+++ b/tensorflow/compiler/mlir/tfrt/ir/mlrt/BUILD
@@ -16,7 +16,6 @@ td_library(
         # copybara:uncomment "//learning/brain/tfrt/mlir:__subpackages__",
         "//learning/infra/mira/distributed:__subpackages__",
         "//tensorflow/compiler/mlir/tfrt:__subpackages__",
-        "//tensorflow/core/tfrt/mlrt:__subpackages__",
     ],
     deps = [
         "@llvm-project//mlir:OpBaseTdFiles",
@@ -91,7 +90,7 @@ td_library(
     ],
     includes = ["."],
     visibility = [
-        "//tensorflow/core/tfrt/mlrt:__subpackages__",
+        "//visibility:private",  # Only private by automation, not intent. Owner may accept CLs adding visibility. See go/scheuklappen#explicit-private.
     ],
     deps = [
         ":mlrt_td_files",
@@ -161,7 +160,6 @@ cc_library(
         # copybara:uncomment "//learning/brain/experimental/tfrt/mlrt/application/tensorflow/tests:__subpackages__",
         # copybara:uncomment "//learning/infra/mira/distributed:__subpackages__",
         "//tensorflow/compiler/mlir/tfrt:__subpackages__",
-        "//tensorflow/core/tfrt/mlrt:__subpackages__",
     ],
     deps = [
         ":mlrt_ops",
@@ -180,10 +178,7 @@ cc_library(
     name = "tf_mlrt_tpu_ops",
     srcs = ["tf_mlrt_tpu_ops.cc"],
     hdrs = ["tf_mlrt_tpu_ops.h"],
-    visibility = [
-        "//tensorflow/compiler/mlir/tfrt/transforms/mlrt:__subpackages__",
-        "//tensorflow/core/tfrt/mlrt:__subpackages__",
-    ],
+    visibility = ["//tensorflow/compiler/mlir/tfrt/transforms/mlrt:__subpackages__"],
     deps = [
         ":mlrt_ops",
         ":tf_mlrt_ops",

From e1538f18a95d9fed68a12a5aa2d46d1148588809 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Sat, 23 Sep 2023 18:48:03 -0700
Subject: [PATCH 197/567] Remove absl logging includes that are breaking build.

PiperOrigin-RevId: 567927507
---
 third_party/xla/xla/pjrt/BUILD         | 2 --
 third_party/xla/xla/pjrt/pjrt_client.h | 2 --
 2 files changed, 4 deletions(-)

diff --git a/third_party/xla/xla/pjrt/BUILD b/third_party/xla/xla/pjrt/BUILD
index bf8cce6944164a..8666c726b89af4 100644
--- a/third_party/xla/xla/pjrt/BUILD
+++ b/third_party/xla/xla/pjrt/BUILD
@@ -190,8 +190,6 @@ cc_library(
         "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/container:inlined_vector",
         "@com_google_absl//absl/functional:any_invocable",
-        "@com_google_absl//absl/log",
-        "@com_google_absl//absl/log:check",
         "@com_google_absl//absl/status:statusor",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/synchronization",
diff --git a/third_party/xla/xla/pjrt/pjrt_client.h b/third_party/xla/xla/pjrt/pjrt_client.h
index 4c32a1b81fd180..c4b73edea0529b 100644
--- a/third_party/xla/xla/pjrt/pjrt_client.h
+++ b/third_party/xla/xla/pjrt/pjrt_client.h
@@ -31,8 +31,6 @@ limitations under the License.
 #include "absl/container/flat_hash_map.h"
 #include "absl/container/inlined_vector.h"
 #include "absl/functional/any_invocable.h"
-#include "absl/log/check.h"
-#include "absl/log/log.h"
 #include "absl/status/statusor.h"
 #include "absl/strings/string_view.h"
 #include "absl/synchronization/mutex.h"

From 359cf4552e205f0fb9cba966932273f1df3c8fd0 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Sat, 23 Sep 2023 18:52:04 -0700
Subject: [PATCH 198/567] Having live range dcheck respect threads to avoid
 false positive.

PiperOrigin-RevId: 567927880
---
 third_party/xla/xla/service/copy_insertion.cc | 18 +++++++++++++++---
 1 file changed, 15 insertions(+), 3 deletions(-)

diff --git a/third_party/xla/xla/service/copy_insertion.cc b/third_party/xla/xla/service/copy_insertion.cc
index 77e10c3f21508e..a49fb6914dc566 100644
--- a/third_party/xla/xla/service/copy_insertion.cc
+++ b/third_party/xla/xla/service/copy_insertion.cc
@@ -1188,7 +1188,8 @@ class CopyRemover {
   };
 
   CopyRemover(const HloModule& module, const HloAliasAnalysis& alias_analysis,
-              HloOrdering* ordering, bool check_live_range_ordering)
+              HloOrdering* ordering, bool check_live_range_ordering,
+              const absl::flat_hash_set<absl::string_view>& execution_threads)
       : dataflow_(alias_analysis.dataflow_analysis()), ordering_(ordering) {
     // Construct a list for each HLO buffer in the alias analysis. Maintain a
     // map from HloValue to the respective list element representing that
@@ -1203,6 +1204,14 @@ class CopyRemover {
         continue;
       }
       if (check_live_range_ordering) {
+        // Skip checking if execution thread is not included.
+        auto should_skip_value = [&execution_threads](const HloValue* value) {
+          return value->defining_instruction()->parent() != nullptr &&
+                 !HloInstruction::IsThreadIncluded(value->defining_instruction()
+                                                       ->parent()
+                                                       ->execution_thread(),
+                                                   execution_threads);
+        };
         // Verify values contained in the buffer are strictly ordered. This
         // should always be the case after adding copies to eliminate
         // interference. Specifically, the addition of the control flow edges
@@ -1213,8 +1222,11 @@ class CopyRemover {
             // Token values have no representation and cannot interfere.
             continue;
           }
+          if (should_skip_value(value_a)) {
+            continue;
+          }
           for (const HloValue* value_b : buffer.values()) {
-            if (value_a != value_b) {
+            if (!should_skip_value(value_b) && value_a != value_b) {
               DCHECK(ordering_->LiveRangeStrictlyBefore(
                          *value_a, *value_b, dataflow_,
                          /*use_is_always_before_def_in_same_instr=*/true) ||
@@ -2111,7 +2123,7 @@ Status CopyInsertion::RemoveUnnecessaryCopies(
   TF_ASSIGN_OR_RETURN(std::unique_ptr<HloAliasAnalysis> alias_analysis,
                       HloAliasAnalysis::Run(module, can_share_buffer_));
   CopyRemover copy_remover(*module, *alias_analysis, ordering.get(),
-                           check_live_range_ordering);
+                           check_live_range_ordering, execution_threads);
   if (VLOG_IS_ON(3)) {
     LOG(INFO) << "Removing unnecessary copies in " << module->name();
     LOG(INFO) << "Buffer values, in dependency order: ";

From c7eaebc0792ca5748bf85ec35c10f79851b78370 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Sun, 24 Sep 2023 02:02:26 -0700
Subject: [PATCH 199/567] compat: Update forward compatibility horizon to
 2023-09-24

PiperOrigin-RevId: 567978384
---
 tensorflow/python/compat/compat.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/python/compat/compat.py b/tensorflow/python/compat/compat.py
index ba7e2ef349203f..0ee9625866060c 100644
--- a/tensorflow/python/compat/compat.py
+++ b/tensorflow/python/compat/compat.py
@@ -29,7 +29,7 @@
 # This value changes every day with an automatic CL. It can be modified in code
 # via `forward_compatibility_horizon()` or with the environment variable
 # TF_FORWARD_COMPATIBILITY_DELTA_DAYS, which is added to the compatibility date.
-_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2023, 9, 23)
+_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2023, 9, 24)
 _FORWARD_COMPATIBILITY_DELTA_DAYS_VAR_NAME = "TF_FORWARD_COMPATIBILITY_DELTA_DAYS"
 _FORWARD_COMPATIBILITY_DATE_NUMBER = None
 

From fa4274041302f039f47527af1863e6574e73a42c Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Sun, 24 Sep 2023 02:03:15 -0700
Subject: [PATCH 200/567] Update GraphDef version to 1629.

PiperOrigin-RevId: 567978540
---
 tensorflow/core/public/version.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/core/public/version.h b/tensorflow/core/public/version.h
index 5419c15d3aec47..aae48690687289 100644
--- a/tensorflow/core/public/version.h
+++ b/tensorflow/core/public/version.h
@@ -108,7 +108,7 @@ limitations under the License.
 
 #define TF_GRAPH_DEF_VERSION_MIN_PRODUCER 0
 #define TF_GRAPH_DEF_VERSION_MIN_CONSUMER 0
-#define TF_GRAPH_DEF_VERSION 1628  // Updated: 2023/9/23
+#define TF_GRAPH_DEF_VERSION 1629  // Updated: 2023/9/24
 
 // Checkpoint compatibility versions (the versions field in SavedSliceMeta).
 //

From bbb1f8e55b8c738b20a1d06b792c43157ac8c1cd Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Sun, 24 Sep 2023 10:40:59 -0700
Subject: [PATCH 201/567] [xla:gpu] Set cublas workspace after calling
 cublasSetStream

We need to set the workspace because new memory is allocated every time a cuBLAS call is captured in a CUDA graph: https://docs.nvidia.com/cuda/cublas/index.html#cuda-graphs-support

We need to call it after setting the stream because cublasSetStream resets the workspace:
https://docs.nvidia.com/cuda/cublas/index.html#cuda-graphs-support

PiperOrigin-RevId: 568032523
---
 .../xla/xla/stream_executor/cuda/cuda_blas.cc | 39 -------------------
 .../xla/xla/stream_executor/cuda/cuda_blas.h  |  5 ---
 .../xla/stream_executor/gpu/gpu_executor.h    |  2 -
 3 files changed, 46 deletions(-)

diff --git a/third_party/xla/xla/stream_executor/cuda/cuda_blas.cc b/third_party/xla/xla/stream_executor/cuda/cuda_blas.cc
index b56fb1236b45f5..d9d5451eb912a7 100644
--- a/third_party/xla/xla/stream_executor/cuda/cuda_blas.cc
+++ b/third_party/xla/xla/stream_executor/cuda/cuda_blas.cc
@@ -20,7 +20,6 @@ limitations under the License.
 
 #include "absl/strings/str_cat.h"
 #include "absl/strings/str_format.h"
-#include "absl/synchronization/mutex.h"
 #include "Eigen/Core"  // from @eigen_archive
 #include "third_party/gpus/cuda/include/cublas_v2.h"
 #include "third_party/gpus/cuda/include/cuda.h"
@@ -199,19 +198,6 @@ bool CUDABlas::Init() {
   }
 #endif  // CUDA_VERSION >= 11000
 
-  // Initialize cuBLAS workspace memory on device. The workspace size is
-  // determined by the GPU architecture:
-  // https://docs.nvidia.com/cuda/cublas/index.html#cublassetworkspace
-  absl::MutexLock lock(&mu_);
-  uint64_t workspace_size =
-      parent_->cc_major() >= 9 ? 1 << 25 /*32 MiB*/ : 1 << 22 /*4 MiB*/;
-  workspace_ = parent_->Allocate(workspace_size, /*memory_space=*/0);
-
-  if (workspace_.is_null()) {
-    LOG(ERROR) << "Failed to allocate workspace memory";
-    return false;
-  }
-
   return true;
 }
 
@@ -228,9 +214,6 @@ CUDABlas::CUDABlas(gpu::GpuExecutor *parent)
 CUDABlas::~CUDABlas() {
   if (blas_ != nullptr) {
     gpu::ScopedActivateExecutorContext sac{parent_};
-    if (!workspace_.is_null()) {
-      parent_->Deallocate(&workspace_);
-    }
     cublasDestroy(blas_);
   }
 }
@@ -249,24 +232,6 @@ bool CUDABlas::SetStream(Stream *stream) {
   return true;
 }
 
-bool CUDABlas::SetWorkspace() {
-  CHECK(blas_ != nullptr);
-  gpu::ScopedActivateExecutorContext sac{parent_};
-
-  if (workspace_.is_null()) {
-    LOG(ERROR) << "cuBLAS workspace is not allocated";
-    return false;
-  }
-
-  cublasStatus_t ret =
-      cublasSetWorkspace(blas_, workspace_.opaque(), workspace_.size());
-  if (ret != CUBLAS_STATUS_SUCCESS) {
-    LOG(ERROR) << "failed to set workspace for cuBLAS calls: " << ToString(ret);
-    return false;
-  }
-  return true;
-}
-
 cudaStream_t CUDABlas::CUDAStream(Stream *stream) {
   CHECK(stream != nullptr);
   CHECK(AsGpuStreamValue(stream) != nullptr);
@@ -394,10 +359,6 @@ tsl::Status CUDABlas::DoBlasInternalImpl(FuncT cublas_func, Stream *stream,
     return tsl::errors::Internal("Failed setting stream");
   }
 
-  if (!SetWorkspace()) {
-    return tsl::errors::Internal("Failed setting workspace");
-  }
-
   ScopedCublasMathMode math_mode{blas_};
 #if CUBLAS_VER_MAJOR >= 11
   if (math_type == CUBLAS_TF32_TENSOR_OP_MATH &&
diff --git a/third_party/xla/xla/stream_executor/cuda/cuda_blas.h b/third_party/xla/xla/stream_executor/cuda/cuda_blas.h
index 1e98ab1787d240..8869b284ce487a 100644
--- a/third_party/xla/xla/stream_executor/cuda/cuda_blas.h
+++ b/third_party/xla/xla/stream_executor/cuda/cuda_blas.h
@@ -25,7 +25,6 @@ limitations under the License.
 #include "third_party/gpus/cuda/include/cublas_v2.h"
 #include "xla/stream_executor/blas.h"
 #include "xla/stream_executor/cuda/cuda_blas_lt.h"
-#include "xla/stream_executor/device_memory.h"
 #include "xla/stream_executor/platform/port.h"
 #include "xla/stream_executor/plugin_registry.h"
 
@@ -71,8 +70,6 @@ class CUDABlas : public blas::BlasSupport {
   // invoked before calling into cuBLAS.
   bool SetStream(Stream *stream) ABSL_EXCLUSIVE_LOCKS_REQUIRED(mu_);
 
-  bool SetWorkspace() ABSL_EXCLUSIVE_LOCKS_REQUIRED(mu_);
-
   // Returns the underlying CUDA stream.
   cudaStream_t CUDAStream(Stream *stream);
 
@@ -124,8 +121,6 @@ class CUDABlas : public blas::BlasSupport {
 
   BlasLt blas_lt_;
 
-  DeviceMemoryBase workspace_ ABSL_GUARDED_BY(mu_);
-
   SE_DISALLOW_COPY_AND_ASSIGN(CUDABlas);
 };
 
diff --git a/third_party/xla/xla/stream_executor/gpu/gpu_executor.h b/third_party/xla/xla/stream_executor/gpu/gpu_executor.h
index 06b2c014e88629..f5ed9093451926 100644
--- a/third_party/xla/xla/stream_executor/gpu/gpu_executor.h
+++ b/third_party/xla/xla/stream_executor/gpu/gpu_executor.h
@@ -284,8 +284,6 @@ class GpuExecutor : public internal::StreamExecutorInterface {
     return it->second;
   }
 
-  int cc_major() const { return cc_major_; }
-
  private:
   // Host callback landing routine invoked by CUDA.
   // data: User-provided callback provided to HostCallback() above, captured

From 03813792494acc2512659e17773ec01969f9e633 Mon Sep 17 00:00:00 2001
From: David Majnemer <majnemer@google.com>
Date: Sun, 24 Sep 2023 21:31:44 -0700
Subject: [PATCH 202/567] [XLA] Fix build for graph dumper for OSS builds

The stream_executor target does not bring in the symbol definitions needed by
the graph dumper, so add an explicit dependency on //xla/stream_executor:dnn.

PiperOrigin-RevId: 568101334
---
 third_party/xla/xla/service/BUILD | 1 +
 1 file changed, 1 insertion(+)

diff --git a/third_party/xla/xla/service/BUILD b/third_party/xla/xla/service/BUILD
index b32c8fb37c070f..5cd69b9ca4f2fd 100644
--- a/third_party/xla/xla/service/BUILD
+++ b/third_party/xla/xla/service/BUILD
@@ -5274,6 +5274,7 @@ cc_library(
         "//xla/service/gpu:backend_configs_cc",
         "//xla/service/gpu:cublas_cudnn",
         "//xla/stream_executor",
+        "//xla/stream_executor:dnn",
         "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/container:flat_hash_set",
         "@com_google_absl//absl/strings",

From a984b89903194afde4cafab4658d1af8b5c5fc15 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Sun, 24 Sep 2023 22:29:04 -0700
Subject: [PATCH 203/567] Internal, build update for lexan.

PiperOrigin-RevId: 568111192
---
 third_party/xla/third_party/tsl/tsl/tsl.bzl | 45 +++++++++++++++------
 1 file changed, 32 insertions(+), 13 deletions(-)

diff --git a/third_party/xla/third_party/tsl/tsl/tsl.bzl b/third_party/xla/third_party/tsl/tsl/tsl.bzl
index adc0cbf00c39c4..e1c2b364fbca78 100644
--- a/third_party/xla/third_party/tsl/tsl/tsl.bzl
+++ b/third_party/xla/third_party/tsl/tsl/tsl.bzl
@@ -1,18 +1,19 @@
 """Provides build configuration for TSL"""
 
+load("@bazel_skylib//lib:new_sets.bzl", "sets")
 load(
     "@local_config_cuda//cuda:build_defs.bzl",
     "if_cuda",
 )
 load(
-    "//tsl/platform:rules_cc.bzl",
-    "cc_binary",
-    "cc_library",
-    "cc_shared_library",
+    "//third_party/compute_library:build_defs.bzl",
+    "if_enable_acl",
 )
 load(
-    "@local_config_tensorrt//:build_defs.bzl",
-    "if_tensorrt",
+    "//third_party/mkl_dnn:build_defs.bzl",
+    "if_mkldnn_aarch64_acl",
+    "if_mkldnn_aarch64_acl_openmp",
+    "if_mkldnn_openmp",
 )
 load(
     "@local_config_rocm//rocm:build_defs.bzl",
@@ -26,16 +27,15 @@ load(
     "onednn_v3_define",
 )
 load(
-    "//third_party/mkl_dnn:build_defs.bzl",
-    "if_mkldnn_aarch64_acl",
-    "if_mkldnn_aarch64_acl_openmp",
-    "if_mkldnn_openmp",
+    "//tsl/platform:rules_cc.bzl",
+    "cc_binary",
+    "cc_library",
+    "cc_shared_library",
 )
 load(
-    "//third_party/compute_library:build_defs.bzl",
-    "if_enable_acl",
+    "@local_config_tensorrt//:build_defs.bzl",
+    "if_tensorrt",
 )
-load("@bazel_skylib//lib:new_sets.bzl", "sets")
 
 two_gpu_tags = ["requires-gpu-nvidia:2", "notap", "manual", "no_pip"]
 
@@ -198,6 +198,14 @@ def if_with_tpu_support(if_true, if_false = []):
 
 def get_win_copts(is_external = False):
     WINDOWS_COPTS = [
+        # copybara:uncomment_begin(no MSVC flags in google)
+        # "-DPLATFORM_WINDOWS",
+        # "-DEIGEN_HAS_C99_MATH",
+        # "-DTENSORFLOW_USE_EIGEN_THREADPOOL",
+        # "-DEIGEN_AVOID_STL_ARRAY",
+        # "-Iexternal/gemmlowp",
+        # "-DNOGDI",
+        # copybara:uncomment_end_and_comment_begin
         "/DPLATFORM_WINDOWS",
         "/DEIGEN_HAS_C99_MATH",
         "/DTENSORFLOW_USE_EIGEN_THREADPOOL",
@@ -212,13 +220,24 @@ def get_win_copts(is_external = False):
         # "/EHs-c-",
         "/wd4577",
         "/DNOGDI",
+        # copybara:comment_end
         # Also see build:windows lines in tensorflow/opensource_only/.bazelrc
         # where we set some other options globally.
     ]
+
     if is_external:
+        # copybara:uncomment_begin(no MSVC flags in google)
+        # return WINDOWS_COPTS + ["-UTF_COMPILE_LIBRARY"]
+        # copybara:uncomment_end_and_comment_begin
         return WINDOWS_COPTS + ["/UTF_COMPILE_LIBRARY"]
+        # copybara:comment_end
+
     else:
+        # copybara:uncomment_begin(no MSVC flags in google)
+        # return WINDOWS_COPTS + ["-DTF_COMPILE_LIBRARY"]
+        # copybara:uncomment_end_and_comment_begin
         return WINDOWS_COPTS + ["/DTF_COMPILE_LIBRARY"]
+        # copybara:comment_end
 
 def tsl_copts(
         android_optimization_level_override = "-O2",

From 535288070845a6fb54e6d2a5b1b25549c80fedd2 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 25 Sep 2023 00:19:53 -0700
Subject: [PATCH 204/567] Update TFRT dependency to use revision
 http://github.com/tensorflow/runtime/commit/549bf94d9643e0a1a9ea71949fde9e0a21dd30ea.

PiperOrigin-RevId: 568131952
---
 third_party/tf_runtime/workspace.bzl                          | 4 ++--
 third_party/xla/third_party/tf_runtime/workspace.bzl          | 4 ++--
 .../xla/third_party/tsl/third_party/tf_runtime/workspace.bzl  | 4 ++--
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/third_party/tf_runtime/workspace.bzl b/third_party/tf_runtime/workspace.bzl
index 321212b36339e1..66c8f974bc5a69 100644
--- a/third_party/tf_runtime/workspace.bzl
+++ b/third_party/tf_runtime/workspace.bzl
@@ -6,8 +6,8 @@ def repo():
     """Imports TFRT."""
 
     # Attention: tools parse and update these lines.
-    TFRT_COMMIT = "e8d8c19d9314439fb3e1c08936ba0dc6863d1ccc"
-    TFRT_SHA256 = "775221f0d876c5d5df52f8c6fdd072bc52352c65276908d371fb164888e0c0d4"
+    TFRT_COMMIT = "549bf94d9643e0a1a9ea71949fde9e0a21dd30ea"
+    TFRT_SHA256 = "d8550a2abb57a78bd786947104d71f7735c96ed68b672e9afe2c45f143488ade"
 
     tf_http_archive(
         name = "tf_runtime",
diff --git a/third_party/xla/third_party/tf_runtime/workspace.bzl b/third_party/xla/third_party/tf_runtime/workspace.bzl
index 321212b36339e1..66c8f974bc5a69 100644
--- a/third_party/xla/third_party/tf_runtime/workspace.bzl
+++ b/third_party/xla/third_party/tf_runtime/workspace.bzl
@@ -6,8 +6,8 @@ def repo():
     """Imports TFRT."""
 
     # Attention: tools parse and update these lines.
-    TFRT_COMMIT = "e8d8c19d9314439fb3e1c08936ba0dc6863d1ccc"
-    TFRT_SHA256 = "775221f0d876c5d5df52f8c6fdd072bc52352c65276908d371fb164888e0c0d4"
+    TFRT_COMMIT = "549bf94d9643e0a1a9ea71949fde9e0a21dd30ea"
+    TFRT_SHA256 = "d8550a2abb57a78bd786947104d71f7735c96ed68b672e9afe2c45f143488ade"
 
     tf_http_archive(
         name = "tf_runtime",
diff --git a/third_party/xla/third_party/tsl/third_party/tf_runtime/workspace.bzl b/third_party/xla/third_party/tsl/third_party/tf_runtime/workspace.bzl
index 321212b36339e1..66c8f974bc5a69 100644
--- a/third_party/xla/third_party/tsl/third_party/tf_runtime/workspace.bzl
+++ b/third_party/xla/third_party/tsl/third_party/tf_runtime/workspace.bzl
@@ -6,8 +6,8 @@ def repo():
     """Imports TFRT."""
 
     # Attention: tools parse and update these lines.
-    TFRT_COMMIT = "e8d8c19d9314439fb3e1c08936ba0dc6863d1ccc"
-    TFRT_SHA256 = "775221f0d876c5d5df52f8c6fdd072bc52352c65276908d371fb164888e0c0d4"
+    TFRT_COMMIT = "549bf94d9643e0a1a9ea71949fde9e0a21dd30ea"
+    TFRT_SHA256 = "d8550a2abb57a78bd786947104d71f7735c96ed68b672e9afe2c45f143488ade"
 
     tf_http_archive(
         name = "tf_runtime",

From 014b270e98a2a0b045f7e9e8c7a917bf3aad898d Mon Sep 17 00:00:00 2001
From: sushreebarsa <84765720+sushreebarsa@users.noreply.github.com>
Date: Mon, 25 Sep 2023 12:59:06 +0530
Subject: [PATCH 205/567] Corrected several typos

Documentation has been updated with the correct words. Please have a look at this and do the needful. Thank you!
---
 tensorflow/python/debug/cli/debugger_cli_common_test.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/python/debug/cli/debugger_cli_common_test.py b/tensorflow/python/debug/cli/debugger_cli_common_test.py
index 0bef5ab78ce1e6..fad304c271e1db 100644
--- a/tensorflow/python/debug/cli/debugger_cli_common_test.py
+++ b/tensorflow/python/debug/cli/debugger_cli_common_test.py
@@ -700,7 +700,7 @@ def testWrappingWithAttrCutoff(self):
     self.assertEqual("shorter wavelength", out.annotations[3])
     self.assertFalse(4 in out.annotations)
 
-    # Chec that the non-row-index field is present in output.
+    # Check that the non-row-index field is present in output.
     self.assertEqual("foo", out.annotations["metadata"])
 
     self.assertEqual(new_line_indices, [0, 1, 3])

From c0c7efcff5bc6bffd2d0668f777196509d6971e8 Mon Sep 17 00:00:00 2001
From: sushreebarsa <84765720+sushreebarsa@users.noreply.github.com>
Date: Mon, 25 Sep 2023 13:06:48 +0530
Subject: [PATCH 206/567] Update event_file_writer.py

---
 tensorflow/python/summary/writer/event_file_writer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/python/summary/writer/event_file_writer.py b/tensorflow/python/summary/writer/event_file_writer.py
index 09479c2cbbb284..7865c4d845ee2f 100644
--- a/tensorflow/python/summary/writer/event_file_writer.py
+++ b/tensorflow/python/summary/writer/event_file_writer.py
@@ -137,7 +137,7 @@ def flush(self):
     disk.
     """
     if not self._closed:
-      # Request a flush operation by enqueing a sentinel and then waiting for
+      # Request a flush operation by enqueuing a sentinel and then waiting for
       # the writer thread to mark the flush as complete.
       self._flush_complete.clear()
       self._try_put(self._flush_sentinel)

From 99605b91f178a68d1eb8aa6bfb8bd9164a43d255 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 25 Sep 2023 01:50:06 -0700
Subject: [PATCH 207/567] Internal Code Change

PiperOrigin-RevId: 568147634
---
 tensorflow/compiler/mlir/tfrt/ir/BUILD | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tensorflow/compiler/mlir/tfrt/ir/BUILD b/tensorflow/compiler/mlir/tfrt/ir/BUILD
index 8e1a0ef9ae750a..80257d4812ecd3 100644
--- a/tensorflow/compiler/mlir/tfrt/ir/BUILD
+++ b/tensorflow/compiler/mlir/tfrt/ir/BUILD
@@ -70,7 +70,6 @@ cc_library(
     visibility = [
         "//tensorflow/compiler/mlir/tfrt:__subpackages__",
         "//tensorflow/core/runtime_fallback:internal",
-        "//tensorflow/core/tfrt/mlrt/application/tensorflow/tests:__subpackages__",
     ],
     deps = [
         ":tfrt_fallback_common",

From bf95ed963c863bfea33de7afad1dc47cafeac1d8 Mon Sep 17 00:00:00 2001
From: sushreebarsa <84765720+sushreebarsa@users.noreply.github.com>
Date: Mon, 25 Sep 2023 14:31:21 +0530
Subject: [PATCH 208/567] Update summary.py

---
 tensorflow/python/summary/summary.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/python/summary/summary.py b/tensorflow/python/summary/summary.py
index da3783b7336ed7..161456a7aecae0 100644
--- a/tensorflow/python/summary/summary.py
+++ b/tensorflow/python/summary/summary.py
@@ -97,7 +97,7 @@ def scalar(name, tensor, collections=None, family=None):
   TF 2.0](https://www.tensorflow.org/tensorboard/migrate#in_tf_1x) for concrete
   steps for migration. `tf.summary.scalar` can also log training metrics in
   Keras, you can check [Logging training metrics in
-  Keras](https://www.tensorflow.org/tensorboard/scalars_and_keras) for detials.
+  Keras](https://www.tensorflow.org/tensorboard/scalars_and_keras) for details.
 
   #### How to Map Arguments
 

From ac05bfd3f4d0733eeb4211f9cbee1d0313aba75a Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 25 Sep 2023 02:01:58 -0700
Subject: [PATCH 209/567] Update GraphDef version to 1630.

PiperOrigin-RevId: 568149848
---
 tensorflow/core/public/version.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/core/public/version.h b/tensorflow/core/public/version.h
index aae48690687289..3f5ae3ed744546 100644
--- a/tensorflow/core/public/version.h
+++ b/tensorflow/core/public/version.h
@@ -108,7 +108,7 @@ limitations under the License.
 
 #define TF_GRAPH_DEF_VERSION_MIN_PRODUCER 0
 #define TF_GRAPH_DEF_VERSION_MIN_CONSUMER 0
-#define TF_GRAPH_DEF_VERSION 1629  // Updated: 2023/9/24
+#define TF_GRAPH_DEF_VERSION 1630  // Updated: 2023/9/25
 
 // Checkpoint compatibility versions (the versions field in SavedSliceMeta).
 //

From 92213dbdb2a8c56b045aac745f466f9d15767435 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 25 Sep 2023 02:02:03 -0700
Subject: [PATCH 210/567] compat: Update forward compatibility horizon to
 2023-09-25

PiperOrigin-RevId: 568149868
---
 tensorflow/python/compat/compat.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/python/compat/compat.py b/tensorflow/python/compat/compat.py
index 0ee9625866060c..dbb8b5b32bce92 100644
--- a/tensorflow/python/compat/compat.py
+++ b/tensorflow/python/compat/compat.py
@@ -29,7 +29,7 @@
 # This value changes every day with an automatic CL. It can be modified in code
 # via `forward_compatibility_horizon()` or with the environment variable
 # TF_FORWARD_COMPATIBILITY_DELTA_DAYS, which is added to the compatibility date.
-_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2023, 9, 24)
+_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2023, 9, 25)
 _FORWARD_COMPATIBILITY_DELTA_DAYS_VAR_NAME = "TF_FORWARD_COMPATIBILITY_DELTA_DAYS"
 _FORWARD_COMPATIBILITY_DATE_NUMBER = None
 

From 2d3445b9b9e01470094174ef83f2d7d700f02259 Mon Sep 17 00:00:00 2001
From: sushreebarsa <84765720+sushreebarsa@users.noreply.github.com>
Date: Mon, 25 Sep 2023 14:47:19 +0530
Subject: [PATCH 211/567] Update run_models.py

---
 tensorflow/python/compiler/tensorrt/model_tests/run_models.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/python/compiler/tensorrt/model_tests/run_models.py b/tensorflow/python/compiler/tensorrt/model_tests/run_models.py
index 8af5b35af9d03b..e1e8556a61aedb 100644
--- a/tensorflow/python/compiler/tensorrt/model_tests/run_models.py
+++ b/tensorflow/python/compiler/tensorrt/model_tests/run_models.py
@@ -105,7 +105,7 @@
 flags.DEFINE_enum("output_format", "CSV", ["CSV", "JSON"],
                   "Output format of analysis results.")
 
-DEFAUL_TRT_CONVERT_PARAMS = trt.DEFAULT_TRT_CONVERSION_PARAMS
+DEFAULT_TRT_CONVERT_PARAMS = trt.DEFAULT_TRT_CONVERSION_PARAMS
 
 
 # pylint: disable=bad-whitespace

From 7ff47e53fb2b3d9567f362dcd70066add7c7721c Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 25 Sep 2023 03:46:16 -0700
Subject: [PATCH 212/567] Integrate LLVM at llvm/llvm-project@f7bf99fb529f

Updates LLVM usage to match
[f7bf99fb529f](https://github.com/llvm/llvm-project/commit/f7bf99fb529f)

PiperOrigin-RevId: 568169974
---
 third_party/llvm/workspace.bzl                            | 4 ++--
 .../mlir/backends/cpu/transforms/sparse_rewrite_passes.cc | 8 ++++----
 third_party/xla/xla/translate/hlo_to_mhlo/hlo_utils.h     | 4 ++--
 3 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/third_party/llvm/workspace.bzl b/third_party/llvm/workspace.bzl
index f24443e14ea083..59331ea1df5291 100644
--- a/third_party/llvm/workspace.bzl
+++ b/third_party/llvm/workspace.bzl
@@ -4,8 +4,8 @@ load("//third_party:repo.bzl", "tf_http_archive")
 
 def repo(name):
     """Imports LLVM."""
-    LLVM_COMMIT = "46d5d264fc66a017bbd0182b2b5fcc0f3f23d3be"
-    LLVM_SHA256 = "f31d546ecdcd07971f7f8f5f6f83ee2e101bf677975fe96d6ba837628eba7e24"
+    LLVM_COMMIT = "f7bf99fb529f9993548bf6fac0b5523c791f2416"
+    LLVM_SHA256 = "567656a6da8e3608a81859dcc57e4279ae468936769177d66a775d6800003434"
 
     tf_http_archive(
         name = name,
diff --git a/third_party/xla/xla/mlir/backends/cpu/transforms/sparse_rewrite_passes.cc b/third_party/xla/xla/mlir/backends/cpu/transforms/sparse_rewrite_passes.cc
index ce2c58cc647f6f..3402f867a3915f 100644
--- a/third_party/xla/xla/mlir/backends/cpu/transforms/sparse_rewrite_passes.cc
+++ b/third_party/xla/xla/mlir/backends/cpu/transforms/sparse_rewrite_passes.cc
@@ -242,8 +242,8 @@ struct SparseDynSliceCallRewriter {
     auto srcEnc =
         retTp.getEncoding().cast<sparse_tensor::SparseTensorEncodingAttr>();
     auto sliceEnc = sparse_tensor::SparseTensorEncodingAttr::get(
-        ctx, srcEnc.getLvlTypes(), srcEnc.getDimToLvl(), srcEnc.getPosWidth(),
-        srcEnc.getCrdWidth(), slice_attrs);
+        ctx, srcEnc.getLvlTypes(), srcEnc.getDimToLvl(), srcEnc.getLvlToDim(),
+        srcEnc.getPosWidth(), srcEnc.getCrdWidth(), slice_attrs);
     auto sliceTp = RankedTensorType::get(retTp.getShape(),
                                          retTp.getElementType(), sliceEnc);
 
@@ -351,8 +351,8 @@ struct SparseSliceCallRewriter {
         retTp.getEncoding().cast<sparse_tensor::SparseTensorEncodingAttr>();
     // TODO(peiming): add a getSliceEncodingFrom into MLIR upstream.
     auto sliceEnc = sparse_tensor::SparseTensorEncodingAttr::get(
-        ctx, srcEnc.getLvlTypes(), srcEnc.getDimToLvl(), srcEnc.getPosWidth(),
-        srcEnc.getCrdWidth(), slice_attrs);
+        ctx, srcEnc.getLvlTypes(), srcEnc.getDimToLvl(), srcEnc.getLvlToDim(),
+        srcEnc.getPosWidth(), srcEnc.getCrdWidth(), slice_attrs);
     auto sliceTp = RankedTensorType::get(retTp.getShape(),
                                          retTp.getElementType(), sliceEnc);
     auto slice = rewriter.create<tensor::ExtractSliceOp>(
diff --git a/third_party/xla/xla/translate/hlo_to_mhlo/hlo_utils.h b/third_party/xla/xla/translate/hlo_to_mhlo/hlo_utils.h
index 9ca495016aeb93..5cc3fc571d281d 100644
--- a/third_party/xla/xla/translate/hlo_to_mhlo/hlo_utils.h
+++ b/third_party/xla/xla/translate/hlo_to_mhlo/hlo_utils.h
@@ -127,8 +127,8 @@ static StatusOr<TypeT> ConvertTensorShapeToType(const Shape& xla_ty,
       auto id_map = mlir::AffineMap::getPermutationMap(major_to_minor,
                                                        builder.getContext());
       // TODO(atondwal): support sizes other than 32 when XLA does
-      encoding = SparseTensorEncodingAttr::get(builder.getContext(), dlts,
-                                               id_map, 32, 32);
+      encoding = SparseTensorEncodingAttr::get(
+          builder.getContext(), dlts, id_map, mlir::AffineMap(), 32, 32);
     }
   }
   return TypeT::get(shape, element_type_or.value(), encoding);

From 9a0c0322dccc8a84a190a76ce9506eacd4d9c456 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 25 Sep 2023 04:27:31 -0700
Subject: [PATCH 213/567] Update TFRT dependency to use revision
 http://github.com/tensorflow/runtime/commit/cf730a32058241ae24fd12fe0a9a0258faa0ca89.

PiperOrigin-RevId: 568177684
---
 third_party/tf_runtime/workspace.bzl                          | 4 ++--
 third_party/xla/third_party/tf_runtime/workspace.bzl          | 4 ++--
 .../xla/third_party/tsl/third_party/tf_runtime/workspace.bzl  | 4 ++--
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/third_party/tf_runtime/workspace.bzl b/third_party/tf_runtime/workspace.bzl
index 66c8f974bc5a69..94cc1beb5b2f9c 100644
--- a/third_party/tf_runtime/workspace.bzl
+++ b/third_party/tf_runtime/workspace.bzl
@@ -6,8 +6,8 @@ def repo():
     """Imports TFRT."""
 
     # Attention: tools parse and update these lines.
-    TFRT_COMMIT = "549bf94d9643e0a1a9ea71949fde9e0a21dd30ea"
-    TFRT_SHA256 = "d8550a2abb57a78bd786947104d71f7735c96ed68b672e9afe2c45f143488ade"
+    TFRT_COMMIT = "cf730a32058241ae24fd12fe0a9a0258faa0ca89"
+    TFRT_SHA256 = "7fcc709032d16aa5f2cf04e0c4cbcdf9e0dbafb8377e141d2c0074f24ede4066"
 
     tf_http_archive(
         name = "tf_runtime",
diff --git a/third_party/xla/third_party/tf_runtime/workspace.bzl b/third_party/xla/third_party/tf_runtime/workspace.bzl
index 66c8f974bc5a69..94cc1beb5b2f9c 100644
--- a/third_party/xla/third_party/tf_runtime/workspace.bzl
+++ b/third_party/xla/third_party/tf_runtime/workspace.bzl
@@ -6,8 +6,8 @@ def repo():
     """Imports TFRT."""
 
     # Attention: tools parse and update these lines.
-    TFRT_COMMIT = "549bf94d9643e0a1a9ea71949fde9e0a21dd30ea"
-    TFRT_SHA256 = "d8550a2abb57a78bd786947104d71f7735c96ed68b672e9afe2c45f143488ade"
+    TFRT_COMMIT = "cf730a32058241ae24fd12fe0a9a0258faa0ca89"
+    TFRT_SHA256 = "7fcc709032d16aa5f2cf04e0c4cbcdf9e0dbafb8377e141d2c0074f24ede4066"
 
     tf_http_archive(
         name = "tf_runtime",
diff --git a/third_party/xla/third_party/tsl/third_party/tf_runtime/workspace.bzl b/third_party/xla/third_party/tsl/third_party/tf_runtime/workspace.bzl
index 66c8f974bc5a69..94cc1beb5b2f9c 100644
--- a/third_party/xla/third_party/tsl/third_party/tf_runtime/workspace.bzl
+++ b/third_party/xla/third_party/tsl/third_party/tf_runtime/workspace.bzl
@@ -6,8 +6,8 @@ def repo():
     """Imports TFRT."""
 
     # Attention: tools parse and update these lines.
-    TFRT_COMMIT = "549bf94d9643e0a1a9ea71949fde9e0a21dd30ea"
-    TFRT_SHA256 = "d8550a2abb57a78bd786947104d71f7735c96ed68b672e9afe2c45f143488ade"
+    TFRT_COMMIT = "cf730a32058241ae24fd12fe0a9a0258faa0ca89"
+    TFRT_SHA256 = "7fcc709032d16aa5f2cf04e0c4cbcdf9e0dbafb8377e141d2c0074f24ede4066"
 
     tf_http_archive(
         name = "tf_runtime",

From 5edb674773776a44e24a73bcab94a38d7e15dab4 Mon Sep 17 00:00:00 2001
From: Mason Chang <masonchang@google.com>
Date: Mon, 25 Sep 2023 07:52:57 -0700
Subject: [PATCH 214/567] Make a TF Dialect to Executor API, which is used in
 the Bridge Phase 1.

We'll separate out the current implementation in bridge.cc which currently always runs as part of the Bridge.

PiperOrigin-RevId: 568218119
---
 tensorflow/compiler/mlir/tf2xla/api/v2/BUILD  | 22 ++++++++++
 .../tf2xla/api/v2/tf_dialect_to_executor.cc   | 34 +++++++++++++++
 .../tf2xla/api/v2/tf_dialect_to_executor.h    | 42 +++++++++++++++++++
 .../api/v2/tf_dialect_to_executor_test.cc     | 37 ++++++++++++++++
 4 files changed, 135 insertions(+)
 create mode 100644 tensorflow/compiler/mlir/tf2xla/api/v2/tf_dialect_to_executor.cc
 create mode 100644 tensorflow/compiler/mlir/tf2xla/api/v2/tf_dialect_to_executor.h
 create mode 100644 tensorflow/compiler/mlir/tf2xla/api/v2/tf_dialect_to_executor_test.cc

diff --git a/tensorflow/compiler/mlir/tf2xla/api/v2/BUILD b/tensorflow/compiler/mlir/tf2xla/api/v2/BUILD
index 0ff72d68d418c6..67a5fba6ccccf0 100644
--- a/tensorflow/compiler/mlir/tf2xla/api/v2/BUILD
+++ b/tensorflow/compiler/mlir/tf2xla/api/v2/BUILD
@@ -116,3 +116,25 @@ tf_cc_test(
         "@local_tsl//tsl/lib/core:status_test_util",
     ],
 )
+
+cc_library(
+    name = "tf_dialect_to_executor",
+    srcs = ["tf_dialect_to_executor.cc"],
+    hdrs = ["tf_dialect_to_executor.h"],
+    deps = [
+        "//tensorflow/core/platform:status",
+        "@llvm-project//mlir:IR",
+        "@local_tsl//tsl/platform:status",
+    ],
+)
+
+tf_cc_test(
+    name = "tf_dialect_to_executor_test",
+    srcs = ["tf_dialect_to_executor_test.cc"],
+    deps = [
+        ":tf_dialect_to_executor",
+        "@com_google_googletest//:gtest_main",
+        "@llvm-project//mlir:IR",
+        "@local_tsl//tsl/lib/core:status_test_util",
+    ],
+)
diff --git a/tensorflow/compiler/mlir/tf2xla/api/v2/tf_dialect_to_executor.cc b/tensorflow/compiler/mlir/tf2xla/api/v2/tf_dialect_to_executor.cc
new file mode 100644
index 00000000000000..df04bfed17fd43
--- /dev/null
+++ b/tensorflow/compiler/mlir/tf2xla/api/v2/tf_dialect_to_executor.cc
@@ -0,0 +1,34 @@
+/* Copyright 2023 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/mlir/tf2xla/api/v2/tf_dialect_to_executor.h"
+
+#include "mlir/IR/BuiltinOps.h"  // from @llvm-project
+#include "tensorflow/core/platform/status.h"
+#include "tsl/platform/status.h"
+
+namespace tensorflow {
+namespace tf2xla {
+namespace v2 {
+
+using mlir::ModuleOp;
+
+tensorflow::Status ExportFromTensorflowDialectToExecutor(ModuleOp module) {
+  return tsl::OkStatus();
+}
+
+}  // namespace v2
+}  // namespace tf2xla
+}  // namespace tensorflow
diff --git a/tensorflow/compiler/mlir/tf2xla/api/v2/tf_dialect_to_executor.h b/tensorflow/compiler/mlir/tf2xla/api/v2/tf_dialect_to_executor.h
new file mode 100644
index 00000000000000..16b9ad252b7c2f
--- /dev/null
+++ b/tensorflow/compiler/mlir/tf2xla/api/v2/tf_dialect_to_executor.h
@@ -0,0 +1,42 @@
+/* Copyright 2023 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_MLIR_TF2XLA_API_V2_TF_DIALECT_TO_EXECUTOR_H_
+#define TENSORFLOW_COMPILER_MLIR_TF2XLA_API_V2_TF_DIALECT_TO_EXECUTOR_H_
+
+#include "mlir/IR/BuiltinOps.h"  // from @llvm-project
+#include "tensorflow/core/platform/status.h"
+
+namespace tensorflow {
+namespace tf2xla {
+namespace v2 {
+
+// Given the input Module op that's in the Tensorflow Dialect, convert the MLIR
+// module in place to the Tensorflow Executor Dialect. Returns an OK Status if
+// success, otherwise failure with an error message.
+// The Tensorflow Executor Dialect is required to export an MLIR module to a
+// Tensorflow GraphDef. This API will add control dependencies and verify that
+// the conversion was successful.
+//
+// Input: A MLIR Module in the Tensorflow Dialect with no
+// `tf_device.cluster_func` ops.
+// Output: A MLIR module in the Tensorflow Executor Dialect.
+tensorflow::Status ExportFromTensorflowDialectToExecutor(mlir::ModuleOp module);
+
+}  // namespace v2
+}  // namespace tf2xla
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_COMPILER_MLIR_TF2XLA_API_V2_TF_DIALECT_TO_EXECUTOR_H_
diff --git a/tensorflow/compiler/mlir/tf2xla/api/v2/tf_dialect_to_executor_test.cc b/tensorflow/compiler/mlir/tf2xla/api/v2/tf_dialect_to_executor_test.cc
new file mode 100644
index 00000000000000..e43e82098066a8
--- /dev/null
+++ b/tensorflow/compiler/mlir/tf2xla/api/v2/tf_dialect_to_executor_test.cc
@@ -0,0 +1,37 @@
+/* Copyright 2023 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/mlir/tf2xla/api/v2/tf_dialect_to_executor.h"
+
+#include <gtest/gtest.h>
+#include "mlir/IR/BuiltinOps.h"  // from @llvm-project
+#include "tsl/lib/core/status_test_util.h"
+
+namespace tensorflow {
+namespace tf2xla {
+namespace v2 {
+namespace {
+
+using mlir::ModuleOp;
+
+TEST(TensorflowDialectToExecutor, ConvertsToExecutor) {
+  ModuleOp module;
+  TF_ASSERT_OK(ExportFromTensorflowDialectToExecutor(module));
+}
+
+}  // namespace
+}  // namespace v2
+}  // namespace tf2xla
+}  // namespace tensorflow

From b2a70ab4944fc4c54e8b6bf1b3bfcfc28d2abaad Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 25 Sep 2023 09:17:16 -0700
Subject: [PATCH 215/567] [XLA:GPU] Allow fusing binary elementwise operations
 where exactly one operand is a splat constant into normalization diamonds.

PiperOrigin-RevId: 568238994
---
 third_party/xla/xla/service/gpu/BUILD         |   1 +
 .../xla/xla/service/gpu/ir_emitter_triton.cc  |  12 +-
 .../ir_emitter_triton_parametrized_test.cc    | 402 ++++++++++++++++++
 .../service/gpu/softmax_rewriter_triton.cc    |  67 ++-
 .../gpu/softmax_rewriter_triton_test.cc       | 320 ++++++++++++++
 5 files changed, 789 insertions(+), 13 deletions(-)

diff --git a/third_party/xla/xla/service/gpu/BUILD b/third_party/xla/xla/service/gpu/BUILD
index faf60063e796dc..9d988927395b17 100644
--- a/third_party/xla/xla/service/gpu/BUILD
+++ b/third_party/xla/xla/service/gpu/BUILD
@@ -1326,6 +1326,7 @@ cc_library(
         "//xla:util",
         "//xla:xla_data_proto_cc",
         "//xla/hlo/ir:hlo",
+        "//xla/hlo/utils:hlo_query",
         "//xla/service:hlo_pass",
         "@com_google_absl//absl/algorithm:container",
         "@com_google_absl//absl/container:flat_hash_map",
diff --git a/third_party/xla/xla/service/gpu/ir_emitter_triton.cc b/third_party/xla/xla/service/gpu/ir_emitter_triton.cc
index 9b7fbf675625ae..bb257683e203e2 100644
--- a/third_party/xla/xla/service/gpu/ir_emitter_triton.cc
+++ b/third_party/xla/xla/service/gpu/ir_emitter_triton.cc
@@ -598,8 +598,16 @@ StatusOr<Value> EmitReduce(ImplicitLocOpBuilder& b,
     b.setInsertionPointAfter(reduction);
   }
 
-  return Cast(b, reduction.getResult().front(),
-              TritonType(b, hlo_reduce.shape().element_type()));
+  Value result = reduction.getResult().front();
+
+  // We want to return a tensor of float32, but the ReturnReduceOp produces an
+  // f32 constant when reducing a single dim. To convert to a tensor we splat
+  // the result.
+  if (!reduction.getResult().front().dyn_cast<TensorValue>()) {
+    result = Splat(b, result, {});
+  }
+
+  return Cast(b, result, TritonType(b, hlo_reduce.shape().element_type()));
 }
 
 // Emit sequence of instructions using compatible tiling ordered producers
diff --git a/third_party/xla/xla/service/gpu/ir_emitter_triton_parametrized_test.cc b/third_party/xla/xla/service/gpu/ir_emitter_triton_parametrized_test.cc
index 09af76884ead91..9be7fdbede4e83 100644
--- a/third_party/xla/xla/service/gpu/ir_emitter_triton_parametrized_test.cc
+++ b/third_party/xla/xla/service/gpu/ir_emitter_triton_parametrized_test.cc
@@ -1893,6 +1893,408 @@ ENTRY main {
   EXPECT_TRUE(RunAndCompare(hlo_text, ErrorSpec(/*aabs=*/0, /*arel=*/0)));
 }
 
+TEST_P(TritonSoftmaxTest, CanFuseAndEmitRMSNormDiamond) {
+  PrimitiveType data_type = GetParam();
+
+  if (data_type == F16) {
+    GTEST_SKIP() << "rsqrt op does not support F16.";
+  } else if (data_type == BF16 && !GetCudaComputeCapability().IsAtLeast(
+                                      se::CudaComputeCapability::AMPERE)) {
+    GTEST_SKIP() << R"(No BF16 before Ampere. Pre-Ampere BF16 behavior is tested
+    in CanFuseAndEmitFirstSoftmaxDiamond, and in SoftmaxRewriterTritonTest.)";
+  }
+
+  const std::string hlo_text_template = R"(
+HloModule rms_norm
+add_computation {
+  arg_0 = $0[] parameter(0)
+  arg_1 = $0[] parameter(1)
+  ROOT add.1 = $0[] add(arg_0, arg_1)
+}
+ENTRY main.30 {
+  param_0 = $0[10,10,10,128]{3,2,1,0} parameter(0)
+  multiply_param = $0[10,10,10,128]{3,2,1,0} multiply(param_0, param_0)
+  constant_0 = $0[] constant(0)
+  reduce = $0[10,10,10]{2,1,0} reduce(multiply_param, constant_0), dimensions={3}, to_apply=add_computation
+  constant_1 = $0[] constant(0.333333343)
+  splat = $0[10,10,10]{2,1,0} broadcast(constant_1), dimensions={}
+  multiply_splat = $0[10,10,10]{2,1,0} multiply(reduce, splat)
+  epsilon = $0[] constant(1e-06)
+  splat_epsilon = $0[10,10,10]{2,1,0} broadcast(epsilon), dimensions={}
+  add = $0[10,10,10]{2,1,0} add(multiply_splat, splat_epsilon)
+  rsqrt = $0[10,10,10]{2,1,0} rsqrt(add)
+  broadcast = $0[10,10,10,128]{3,2,1,0} broadcast(rsqrt), dimensions={0,1,2}
+  ROOT multiply = $0[10,10,10,128]{3,2,1,0} multiply(param_0, broadcast)
+}
+)";
+
+  const std::string hlo_text = absl::Substitute(
+      hlo_text_template, primitive_util::LowercasePrimitiveTypeName(data_type));
+
+  const std::string hlo_ref_template = R"(
+; CHECK:    ENTRY
+; CHECK:      %[[P0:.*]] = $0[10,10,10,128]{3,2,1,0} parameter(0)
+; CHECK:      ROOT
+; CHECK-SAME: fusion(%[[P0]])
+; CHECK-SAME:   kind=kCustom
+; CHECK-SAME:   __triton_softmax
+)";
+
+  const std::string hlo_ref = absl::Substitute(
+      hlo_ref_template, primitive_util::LowercasePrimitiveTypeName(data_type));
+
+  MatchOptimizedHlo(hlo_text, hlo_ref);
+
+  float tolerance;
+  switch (data_type) {
+    case F32:
+      tolerance = 1e-6;
+      break;
+    case F16:
+      tolerance = 2e-4;
+      break;
+    case BF16:
+      tolerance = 4e-2;
+      break;
+    default:
+      ABSL_UNREACHABLE();
+  }
+  EXPECT_TRUE(RunAndCompare(hlo_text,
+                            ErrorSpec(/*aabs=*/tolerance, /*arel=*/tolerance)));
+}
+
+TEST_P(
+    TritonSoftmaxTest,
+    CanFuseAndEmitBinaryElementwiseWhereTheFirstOperandIsASplatConstantBetweenDiamonds) {  // NOLINT(whitespace/line_length)
+  PrimitiveType data_type = GetParam();
+
+  if (data_type == BF16 && !GetCudaComputeCapability().IsAtLeast(
+                               se::CudaComputeCapability::AMPERE)) {
+    GTEST_SKIP() << R"(No BF16 before Ampere. Pre-Ampere BF16 behavior is tested
+    in CanFuseAndEmitFirstSoftmaxDiamond, and in SoftmaxRewriterTritonTest.)";
+  }
+
+  const std::string hlo_text_template = R"(
+HloModule fusible_diamonds
+add_computation {
+  arg_0.1 = $0[] parameter(0)
+  arg_1.1 = $0[] parameter(1)
+  ROOT add = $0[] add(arg_0.1, arg_1.1)
+}
+ENTRY main {
+  param_0 = $0[127,125]{1,0} parameter(0)
+  constant_neg_inf = $0[] constant(-inf)
+  reduce = $0[127]{0} reduce(param_0, constant_neg_inf), dimensions={1}, to_apply=add_computation
+  broadcast = $0[127,125]{1,0} broadcast(reduce), dimensions={0}
+  subtract = $0[127,125]{1,0} subtract(param_0, broadcast)
+  constant = $0[] constant(0.333333343)
+  broadcast_splat = $0[127,125]{1,0} broadcast(constant), dimensions={}
+  multiply = $0[127,125]{1,0} multiply(broadcast_splat, subtract)
+  constant_zero = $0[] constant(0)
+  second_reduce = $0[127]{0} reduce(multiply, constant_zero), dimensions={1}, to_apply=add_computation
+  second_broadcast = $0[127,125]{1,0} broadcast(second_reduce), dimensions={0}
+  ROOT second_subtract = $0[127,125]{1,0} subtract(multiply, second_broadcast)
+}
+)";
+
+  const std::string hlo_text = absl::Substitute(
+      hlo_text_template, primitive_util::LowercasePrimitiveTypeName(data_type));
+
+  const std::string hlo_ref_template = R"(
+; CHECK:    ENTRY
+; CHECK:      %[[P0:.*]] = $0[127,125]{1,0} parameter(0)
+; CHECK:      ROOT
+; CHECK-SAME: fusion(%[[P0]])
+; CHECK-SAME:   kind=kCustom
+; CHECK-SAME:   __triton_softmax
+)";
+
+  const std::string hlo_ref = absl::Substitute(
+      hlo_ref_template, primitive_util::LowercasePrimitiveTypeName(data_type));
+
+  MatchOptimizedHlo(hlo_text, hlo_ref);
+
+  float tolerance;
+  switch (data_type) {
+    case F32:
+      tolerance = 1e-6;
+      break;
+    case F16:
+      tolerance = 2e-4;
+      break;
+    case BF16:
+      tolerance = 2e-2;
+      break;
+    default:
+      ABSL_UNREACHABLE();
+  }
+  EXPECT_TRUE(RunAndCompare(hlo_text,
+                            ErrorSpec(/*aabs=*/tolerance, /*arel=*/tolerance)));
+}
+
+TEST_P(
+    TritonSoftmaxTest,
+    CanFuseAndEmitBinaryElementwiseWhereTheSecondOperandIsASplatConstantBetweenDiamonds) {  // NOLINT(whitespace/line_length)
+  PrimitiveType data_type = GetParam();
+
+  if (data_type == BF16 && !GetCudaComputeCapability().IsAtLeast(
+                               se::CudaComputeCapability::AMPERE)) {
+    GTEST_SKIP() << R"(No BF16 before Ampere. Pre-Ampere BF16 behavior is tested
+    in CanFuseAndEmitFirstSoftmaxDiamond, and in SoftmaxRewriterTritonTest.)";
+  }
+
+  const std::string hlo_text_template = R"(
+HloModule fusible_diamonds
+add_computation {
+  arg_0.1 = $0[] parameter(0)
+  arg_1.1 = $0[] parameter(1)
+  ROOT add = $0[] add(arg_0.1, arg_1.1)
+}
+ENTRY main {
+  param_0 = $0[127,125]{1,0} parameter(0)
+  constant_neg_inf = $0[] constant(-inf)
+  reduce = $0[127]{0} reduce(param_0, constant_neg_inf), dimensions={1}, to_apply=add_computation
+  broadcast = $0[127,125]{1,0} broadcast(reduce), dimensions={0}
+  subtract = $0[127,125]{1,0} subtract(param_0, broadcast)
+  constant = $0[] constant(0.333333343)
+  broadcast_splat = $0[127,125]{1,0} broadcast(constant), dimensions={}
+  multiply = $0[127,125]{1,0} multiply(subtract, broadcast_splat)
+  constant_zero = $0[] constant(0)
+  second_reduce = $0[127]{0} reduce(multiply, constant_zero), dimensions={1}, to_apply=add_computation
+  second_broadcast = $0[127,125]{1,0} broadcast(second_reduce), dimensions={0}
+  ROOT second_subtract = $0[127,125]{1,0} subtract(multiply, second_broadcast)
+}
+)";
+
+  const std::string hlo_text = absl::Substitute(
+      hlo_text_template, primitive_util::LowercasePrimitiveTypeName(data_type));
+
+  const std::string hlo_ref_template = R"(
+; CHECK:    ENTRY
+; CHECK:      %[[P0:.*]] = $0[127,125]{1,0} parameter(0)
+; CHECK:      ROOT
+; CHECK-SAME: fusion(%[[P0]])
+; CHECK-SAME:   kind=kCustom
+; CHECK-SAME:   __triton_softmax
+)";
+
+  const std::string hlo_ref = absl::Substitute(
+      hlo_ref_template, primitive_util::LowercasePrimitiveTypeName(data_type));
+
+  MatchOptimizedHlo(hlo_text, hlo_ref);
+
+  float tolerance;
+  switch (data_type) {
+    case F32:
+      tolerance = 1e-6;
+      break;
+    case F16:
+      tolerance = 2e-4;
+      break;
+    case BF16:
+      tolerance = 2e-2;
+      break;
+    default:
+      ABSL_UNREACHABLE();
+  }
+  EXPECT_TRUE(RunAndCompare(hlo_text,
+                            ErrorSpec(/*aabs=*/tolerance, /*arel=*/tolerance)));
+}
+
+TEST_P(
+    TritonSoftmaxTest,
+    CanFuseAndEmitBinaryElementwiseWhereTheFirstOperandIsASplatConstantWithinDiamond) {  // NOLINT(whitespace/line_length)
+  PrimitiveType data_type = GetParam();
+
+  if (data_type == BF16 && !GetCudaComputeCapability().IsAtLeast(
+                               se::CudaComputeCapability::AMPERE)) {
+    GTEST_SKIP() << R"(No BF16 before Ampere. Pre-Ampere BF16 behavior is tested
+    in CanFuseAndEmitFirstSoftmaxDiamond, and in SoftmaxRewriterTritonTest.)";
+  }
+
+  const std::string hlo_text_template = R"(
+HloModule fusible_diamond
+max_computation {
+  arg_0 = $0[] parameter(0)
+  arg_1 = $0[] parameter(1)
+  ROOT maximum = $0[] maximum(arg_0, arg_1)
+}
+ENTRY main {
+  param_0 = $0[127,125]{1,0} parameter(0)
+  constant_neg_inf = $0[] constant(-inf)
+  reduce = $0[127]{0} reduce(param_0, constant_neg_inf), dimensions={1}, to_apply=max_computation
+  constant = $0[] constant(0.333333343)
+  broadcast_splat = $0[127]{0} broadcast(constant), dimensions={}
+  multiply = $0[127]{0} multiply(broadcast_splat, reduce)
+  broadcast = $0[127,125]{1,0} broadcast(multiply), dimensions={0}
+  ROOT subtract = $0[127,125]{1,0} subtract(param_0, broadcast)
+}
+)";
+
+  const std::string hlo_text = absl::Substitute(
+      hlo_text_template, primitive_util::LowercasePrimitiveTypeName(data_type));
+
+  const std::string hlo_ref_template = R"(
+; CHECK:    ENTRY
+; CHECK:      %[[P0:.*]] = $0[127,125]{1,0} parameter(0)
+; CHECK:      ROOT
+; CHECK-SAME: fusion(%[[P0]])
+; CHECK-SAME:   kind=kCustom
+; CHECK-SAME:   __triton_softmax
+)";
+
+  const std::string hlo_ref = absl::Substitute(
+      hlo_ref_template, primitive_util::LowercasePrimitiveTypeName(data_type));
+
+  MatchOptimizedHlo(hlo_text, hlo_ref);
+
+  float tolerance;
+  switch (data_type) {
+    case F32:
+      tolerance = 1e-6;
+      break;
+    case F16:
+      tolerance = 2e-4;
+      break;
+    case BF16:
+      tolerance = 2e-2;
+      break;
+    default:
+      ABSL_UNREACHABLE();
+  }
+  EXPECT_TRUE(RunAndCompare(hlo_text,
+                            ErrorSpec(/*aabs=*/tolerance, /*arel=*/tolerance)));
+}
+
+TEST_P(
+    TritonSoftmaxTest,
+    CanFuseAndEmitBinaryElementwiseConsumerWhereTheFirstOperandIsASplatConstantIntoDiamond) {  // NOLINT(whitespace/line_length)
+  PrimitiveType data_type = GetParam();
+
+  if (data_type == BF16 && !GetCudaComputeCapability().IsAtLeast(
+                               se::CudaComputeCapability::AMPERE)) {
+    GTEST_SKIP() << R"(No BF16 before Ampere. Pre-Ampere BF16 behavior is tested
+    in CanFuseAndEmitFirstSoftmaxDiamond, and in SoftmaxRewriterTritonTest.)";
+  }
+
+  const std::string hlo_text_template = R"(
+HloModule fusible_diamond
+add_computation {
+  arg_0.1 = $0[] parameter(0)
+  arg_1.1 = $0[] parameter(1)
+  ROOT add = $0[] add(arg_0.1, arg_1.1)
+}
+ENTRY main {
+  param_0 = $0[127,125]{1,0} parameter(0)
+  constant_neg_inf = $0[] constant(-inf)
+  reduce = $0[127]{0} reduce(param_0, constant_neg_inf), dimensions={1}, to_apply=add_computation
+  broadcast = $0[127,125]{1,0} broadcast(reduce), dimensions={0}
+  subtract = $0[127,125]{1,0} subtract(param_0, broadcast)
+  constant = $0[] constant(0.333333343)
+  broadcast_splat = $0[127,125]{1,0} broadcast(constant), dimensions={}
+  ROOT multiply = $0[127,125]{1,0} multiply(broadcast_splat, subtract)
+}
+)";
+
+  const std::string hlo_text = absl::Substitute(
+      hlo_text_template, primitive_util::LowercasePrimitiveTypeName(data_type));
+
+  const std::string hlo_ref_template = R"(
+; CHECK:    ENTRY
+; CHECK:      %[[P0:.*]] = $0[127,125]{1,0} parameter(0)
+; CHECK:      ROOT
+; CHECK-SAME: fusion(%[[P0]])
+; CHECK-SAME:   kind=kCustom
+; CHECK-SAME:   __triton_softmax
+)";
+  const std::string hlo_ref = absl::Substitute(
+      hlo_ref_template, primitive_util::LowercasePrimitiveTypeName(data_type));
+
+  MatchOptimizedHlo(hlo_text, hlo_ref);
+
+  float tolerance;
+  switch (data_type) {
+    case F32:
+      tolerance = 1e-6;
+      break;
+    case F16:
+      tolerance = 2e-4;
+      break;
+    case BF16:
+      tolerance = 2e-2;
+      break;
+    default:
+      ABSL_UNREACHABLE();
+  }
+  EXPECT_TRUE(RunAndCompare(hlo_text,
+                            ErrorSpec(/*aabs=*/tolerance, /*arel=*/tolerance)));
+}
+
+TEST_P(
+    TritonSoftmaxTest,
+    CanFuseAndEmitBinaryElementwiseProducerWhereTheFirstOperandIsASplatConstantIntoDiamond) {  // NOLINT(whitespace/line_length)
+  PrimitiveType data_type = GetParam();
+
+  if (data_type == BF16 && !GetCudaComputeCapability().IsAtLeast(
+                               se::CudaComputeCapability::AMPERE)) {
+    GTEST_SKIP() << R"(No BF16 before Ampere. Pre-Ampere BF16 behavior is tested
+    in CanFuseAndEmitFirstSoftmaxDiamond, and in SoftmaxRewriterTritonTest.)";
+  }
+
+  const std::string hlo_text_template = R"(
+HloModule fusible_diamond
+add_computation {
+  arg_0.1 = $0[] parameter(0)
+  arg_1.1 = $0[] parameter(1)
+  ROOT add = $0[] add(arg_0.1, arg_1.1)
+}
+ENTRY main {
+
+  param_0 = $0[127,125]{1,0} parameter(0)
+  constant = $0[] constant(0.333333343)
+  broadcast_splat = $0[127,125]{1,0} broadcast(constant), dimensions={}
+  multiply = $0[127,125]{1,0} multiply(broadcast_splat, param_0)
+  constant_neg_inf = $0[] constant(-inf)
+  reduce = $0[127]{0} reduce(multiply, constant_neg_inf), dimensions={1}, to_apply=add_computation
+  broadcast = $0[127,125]{1,0} broadcast(reduce), dimensions={0}
+  ROOT subtract = $0[127,125]{1,0} subtract(multiply, broadcast)
+}
+)";
+
+  const std::string hlo_text = absl::Substitute(
+      hlo_text_template, primitive_util::LowercasePrimitiveTypeName(data_type));
+
+  const std::string hlo_ref_template = R"(
+; CHECK:    ENTRY
+; CHECK:      %[[P0:.*]] = $0[127,125]{1,0} parameter(0)
+; CHECK:      ROOT
+; CHECK-SAME: fusion(%[[P0]])
+; CHECK-SAME:   kind=kCustom
+; CHECK-SAME:   __triton_softmax
+)";
+  const std::string hlo_ref = absl::Substitute(
+      hlo_ref_template, primitive_util::LowercasePrimitiveTypeName(data_type));
+
+  MatchOptimizedHlo(hlo_text, hlo_ref);
+
+  float tolerance;
+  switch (data_type) {
+    case F32:
+      tolerance = 1e-6;
+      break;
+    case F16:
+      tolerance = 2e-4;
+      break;
+    case BF16:
+      tolerance = 2e-2;
+      break;
+    default:
+      ABSL_UNREACHABLE();
+  }
+  EXPECT_TRUE(RunAndCompare(hlo_text,
+                            ErrorSpec(/*aabs=*/tolerance, /*arel=*/tolerance)));
+}
+
 INSTANTIATE_TEST_SUITE_P(TritonSoftmaxTestSuite, TritonSoftmaxTest,
                          ::testing::Values(F32, F16, BF16));
 
diff --git a/third_party/xla/xla/service/gpu/softmax_rewriter_triton.cc b/third_party/xla/xla/service/gpu/softmax_rewriter_triton.cc
index ebc87378a7751b..ce3dcacebc23e2 100644
--- a/third_party/xla/xla/service/gpu/softmax_rewriter_triton.cc
+++ b/third_party/xla/xla/service/gpu/softmax_rewriter_triton.cc
@@ -28,6 +28,7 @@ limitations under the License.
 #include "xla/hlo/ir/hlo_instruction.h"
 #include "xla/hlo/ir/hlo_module.h"
 #include "xla/hlo/ir/hlo_opcode.h"
+#include "xla/hlo/utils/hlo_query.h"
 #include "xla/layout_util.h"
 #include "xla/service/gpu/backend_configs.pb.h"
 #include "xla/service/gpu/gemm_rewriter_triton.h"
@@ -123,6 +124,26 @@ bool BitcastIsTilingNoop(HloInstruction* bitcast,
            last_dimension(bitcast->operand(0)) == last_dimension(bitcast)));
 }
 
+inline bool HasOneUse(const HloInstruction* instr) {
+  return instr->user_count() == 1;
+}
+
+using hlo_query::IsBroadcastOfScalarConstant;
+
+// Chooses which operand to use for fusion processing. Taking in a unary or
+// binary instruction, returns the first non-splat operand. If none is
+// present, returns any operand.
+HloInstruction* ChooseOperandForFusionProcessing(HloInstruction* instr) {
+  CHECK_GT(instr->operand_count(), 0);
+  CHECK_LE(instr->operand_count(), 2);
+
+  if (instr->operand_count() > 1 &&
+      IsBroadcastOfScalarConstant(*instr->operand(0))) {
+    return instr->mutable_operand(1);
+  }
+  return instr->mutable_operand(0);
+}
+
 bool IsTriviallyFusible(HloInstruction* instr, const GpuVersion& gpu_version,
                         int num_allowed_users = 1) {
   // Checks whether an op is trivially fusible. An op is said to be trivially
@@ -144,8 +165,36 @@ bool IsTriviallyFusible(HloInstruction* instr, const GpuVersion& gpu_version,
     return IsTritonSupportedInstruction(instr, gpu_version);
   }
 
-  if (instr->IsElementwiseBinary() && instr->operand(0) == instr->operand(1)) {
-    return IsTritonSupportedInstruction(instr, gpu_version);
+  // Elementwise binary ops are trivially fusible if the operands are the same,
+  // or if exactly one of the operands is a splat constant with a single user.
+  if (instr->IsElementwiseBinary()) {
+    const HloInstruction* operand_0 = instr->operand(0);
+    const HloInstruction* operand_1 = instr->operand(1);
+
+    // Elementwise binary ops should be fused if both operands are the same and
+    // if the operand is triton supported.
+    if (operand_0 == operand_1) {
+      return IsTritonSupportedInstruction(instr, gpu_version);
+    }
+
+    // If either operand is a splat constant with multiple users, we should not
+    // fuse.
+    bool operand_0_is_shared_splat_constant =
+        IsBroadcastOfScalarConstant(*operand_0) && !HasOneUse(operand_0);
+    bool operand_1_is_shared_splat_constant =
+        IsBroadcastOfScalarConstant(*operand_1) && !HasOneUse(operand_1);
+
+    if (operand_0_is_shared_splat_constant ||
+        operand_1_is_shared_splat_constant) {
+      return false;
+    }
+
+    // For simplicity we only fuse elementwise binary ops with splat operands
+    // if they contain one non-splat operand.
+    if (IsBroadcastOfScalarConstant(*operand_0) ^
+        IsBroadcastOfScalarConstant(*operand_1)) {
+      return IsTritonSupportedInstruction(instr, gpu_version);
+    }
   }
 
   return false;
@@ -155,7 +204,7 @@ bool TrivialEdge(HloInstruction** producer, HloInstruction* consumer,
                  HloOpcode opcode, const GpuVersion& gpu_version) {
   while (consumer->opcode() != opcode) {
     if (IsTriviallyFusible(consumer, gpu_version)) {
-      consumer = consumer->mutable_operand(0);
+      consumer = ChooseOperandForFusionProcessing(consumer);
     } else {
       return false;
     }
@@ -189,10 +238,6 @@ bool IsTriviallyConnectedProducerOf(HloInstruction* producer,
   return false;
 }
 
-inline bool HasOneUse(const HloInstruction* instr) {
-  return instr->user_count() == 1;
-}
-
 bool IsTritonSupportedComputation(const HloComputation* computation,
                                   const GpuVersion& gpu_version) {
   for (const HloInstruction* instr : computation->instructions()) {
@@ -264,7 +309,7 @@ std::optional<HloInstruction*> MatchesTritonCompatibleClosedReductionDiamond(
   }
 
   while (IsTriviallyFusible(producer, gpu_version)) {
-    producer = producer->mutable_operand(0);
+    producer = ChooseOperandForFusionProcessing(producer);
   }
 
   if (!HasDefaultLayout(producer->shape()) ||
@@ -288,9 +333,9 @@ HloInstruction* FindFirstNonFusibleDiamondProducer(
     HloInstruction* diamond_producer, const GpuVersion& gpu_version) {
   if (IsTriviallyFusible(diamond_producer, gpu_version,
                          /*num_allowed_users=*/2)) {
-    diamond_producer = diamond_producer->mutable_operand(0);
+    diamond_producer = ChooseOperandForFusionProcessing(diamond_producer);
     while (IsTriviallyFusible(diamond_producer, gpu_version)) {
-      diamond_producer = diamond_producer->mutable_operand(0);
+      diamond_producer = ChooseOperandForFusionProcessing(diamond_producer);
     }
   }
 
@@ -393,7 +438,7 @@ SoftmaxRewriterTriton::FindAllFusibleDiamondChains(
       [](HloInstruction* diamond_root) {
         HloInstruction* instr = diamond_root->mutable_operand(1);
         while (instr->opcode() != HloOpcode::kReduce) {
-          instr = instr->mutable_operand(0);
+          instr = ChooseOperandForFusionProcessing(instr);
         }
 
         int operand_rank = instr->operand(0)->shape().rank();
diff --git a/third_party/xla/xla/service/gpu/softmax_rewriter_triton_test.cc b/third_party/xla/xla/service/gpu/softmax_rewriter_triton_test.cc
index 13d98d1823c4c8..c4c506064f983d 100644
--- a/third_party/xla/xla/service/gpu/softmax_rewriter_triton_test.cc
+++ b/third_party/xla/xla/service/gpu/softmax_rewriter_triton_test.cc
@@ -1404,6 +1404,326 @@ ENTRY main {
               GmockMatch(m::Fusion(m::Parameter())));
 }
 
+TEST_F(
+    SoftmaxRewriterTritonTest,
+    DoesNotFuseIntermediateBinaryElementwiseWithBothSplatOperandsIntoDiamond) {
+  const std::string hlo_string = R"(
+HloModule nonfusible_splat
+max_computation {
+  arg_0 = f32[] parameter(0)
+  arg_1 = f32[] parameter(1)
+  ROOT maximum = f32[] maximum(arg_0, arg_1)
+}
+ENTRY main {
+  constant_0 = f32[] constant(0.333333343)
+  splat_0 = f32[127,125]{1,0} broadcast(constant_0), dimensions={}
+  constant_1 = f32[] constant(0.66666)
+  splat_1 = f32[127,125]{1,0} broadcast(constant_1), dimensions={}
+  param_0 = f32[127,125]{1,0} parameter(0)
+  multiply_splats = f32[127,125]{1,0} multiply(splat_0, splat_1)
+  multiply_splat_param = f32[127,125]{1,0} multiply(multiply_splats, param_0)
+  constant_neg_inf = f32[] constant(-inf)
+  reduce = f32[127]{0} reduce(multiply_splat_param, constant_neg_inf), dimensions={1}, to_apply=max_computation
+  broadcast = f32[127,125]{1,0} broadcast(reduce), dimensions={0}
+  ROOT subtract = f32[127,125]{1,0} subtract(param_0, broadcast)
+}
+)";
+
+  auto module = ParseAndReturnVerifiedModule(hlo_string).value();
+  EXPECT_FALSE(
+      SoftmaxRewriterTritonMatchAndRewrite(gpu_version_, module.get()).value());
+}
+
+TEST_F(
+    SoftmaxRewriterTritonTest,
+    DoesNotFuseIntermediateBinaryElementwiseWithSameSplatOperandsIntoDiamond) {
+  const std::string hlo_string = R"(
+HloModule nonfusible_splat_diamond
+max_computation {
+  arg_0 = f32[] parameter(0)
+  arg_1 = f32[] parameter(1)
+  ROOT maximum = f32[] maximum(arg_0, arg_1)
+}
+ENTRY main {
+  constant_0 = f32[] constant(0.333333343)
+  splat = f32[127,125]{1,0} broadcast(constant_0), dimensions={}
+  param_0 = f32[127,125]{1,0} parameter(0)
+  multiply = f32[127,125]{1,0} multiply(splat, splat)
+  add = f32[127,125]{1,0} add(param_0, multiply)
+  constant_neg_inf = f32[] constant(-inf)
+  reduce = f32[127]{0} reduce(add, constant_neg_inf), dimensions={1}, to_apply=max_computation
+  broadcast = f32[127,125]{1,0} broadcast(reduce), dimensions={0}
+  ROOT subtract = f32[127,125]{1,0} subtract(param_0, broadcast)
+}
+)";
+
+  auto module = ParseAndReturnVerifiedModule(hlo_string).value();
+  SoftmaxRewriterTriton fusion_rewriter(gpu_version_);
+  EXPECT_FALSE(
+      SoftmaxRewriterTritonMatchAndRewrite(gpu_version_, module.get()).value());
+}
+
+TEST_P(SoftmaxRewriterTritonTest, CanFuseRMSNormDiamond) {
+  PrimitiveType data_type = GetParam();
+  const std::string hlo_string_template = R"(
+HloModule rms_norm
+add_computation {
+  arg_0 = $0[] parameter(0)
+  arg_1 = $0[] parameter(1)
+  ROOT add.1 = $0[] add(arg_0, arg_1)
+}
+ENTRY main.30 {
+  param_0 = $0[10,10,10,128]{3,2,1,0} parameter(0)
+  multiply_param = $0[10,10,10,128]{3,2,1,0} multiply(param_0, param_0)
+  constant_0 = $0[] constant(0)
+  reduce = $0[10,10,10]{2,1,0} reduce(multiply_param, constant_0), dimensions={3}, to_apply=add_computation
+  constant_1 = $0[] constant(0.333333343)
+  splat = $0[10,10,10]{2,1,0} broadcast(constant_1), dimensions={}
+  multiply_splat = $0[10,10,10]{2,1,0} multiply(reduce, splat)
+  epsilon = $0[] constant(1e-06)
+  splat_epsilon = $0[10,10,10]{2,1,0} broadcast(epsilon), dimensions={}
+  add = $0[10,10,10]{2,1,0} add(multiply_splat, splat_epsilon)
+  rsqrt = $0[10,10,10]{2,1,0} rsqrt(add)
+  broadcast = $0[10,10,10,128]{3,2,1,0} broadcast(rsqrt), dimensions={0,1,2}
+  ROOT multiply = $0[10,10,10,128]{3,2,1,0} multiply(param_0, broadcast)
+}
+)";
+  const std::string hlo_string =
+      absl::Substitute(hlo_string_template,
+                       primitive_util::LowercasePrimitiveTypeName(data_type));
+
+  auto module = ParseAndReturnVerifiedModule(hlo_string).value();
+
+  switch (data_type) {
+    case F32:
+    case BF16:
+      EXPECT_TRUE(
+          SoftmaxRewriterTritonMatchAndRewrite(gpu_version_, module.get())
+              .value());
+      EXPECT_TRUE(verifier().Run(module.get()).status().ok());
+      EXPECT_THAT(module->entry_computation()->root_instruction(),
+                  GmockMatch(m::Fusion(m::Parameter())));
+      break;
+    case F16:
+      // Triton does not support F16 rsqrt.
+      EXPECT_FALSE(
+          SoftmaxRewriterTritonMatchAndRewrite(gpu_version_, module.get())
+              .value());
+      break;
+    default:
+      ABSL_UNREACHABLE();
+  }
+}
+
+TEST_P(
+    SoftmaxRewriterTritonTest,
+    CanFuseAndEmitBinaryElementwiseWhereTheFirstOperandIsASplatConstantBetweenDiamonds) {  // NOLINT(whitespace/line_length)
+  PrimitiveType data_type = GetParam();
+  const std::string hlo_string_template = R"(
+HloModule fusible_diamonds
+add_computation {
+  arg_0.1 = $0[] parameter(0)
+  arg_1.1 = $0[] parameter(1)
+  ROOT add = $0[] add(arg_0.1, arg_1.1)
+}
+ENTRY main {
+  param_0 = $0[127,125]{1,0} parameter(0)
+  constant_neg_inf = $0[] constant(-inf)
+  reduce = $0[127]{0} reduce(param_0, constant_neg_inf), dimensions={1}, to_apply=add_computation
+  broadcast = $0[127,125]{1,0} broadcast(reduce), dimensions={0}
+  subtract = $0[127,125]{1,0} subtract(param_0, broadcast)
+  constant = $0[] constant(0.333333343)
+  broadcast_splat = $0[127,125]{1,0} broadcast(constant), dimensions={}
+  multiply = $0[127,125]{1,0} multiply(broadcast_splat, subtract)
+  constant_zero = $0[] constant(0)
+  second_reduce = $0[127]{0} reduce(multiply, constant_zero), dimensions={1}, to_apply=add_computation
+  second_broadcast = $0[127,125]{1,0} broadcast(second_reduce), dimensions={0}
+  ROOT second_subtract = $0[127,125]{1,0} subtract(multiply, second_broadcast)
+}
+)";
+  const std::string hlo_string =
+      absl::Substitute(hlo_string_template,
+                       primitive_util::LowercasePrimitiveTypeName(data_type));
+
+  auto module = ParseAndReturnVerifiedModule(hlo_string).value();
+  EXPECT_TRUE(
+      SoftmaxRewriterTritonMatchAndRewrite(gpu_version_, module.get()).value());
+  EXPECT_TRUE(verifier().Run(module.get()).status().ok());
+  VLOG(2) << module->ToString();
+  EXPECT_THAT(module->entry_computation()->root_instruction(),
+              GmockMatch(m::Fusion(m::Parameter())));
+}
+
+TEST_P(
+    SoftmaxRewriterTritonTest,
+    CanFuseAndEmitBinaryElementwiseWhereTheSecondOperandIsASplatConstantBetweenDiamonds) {  // NOLINT(whitespace/line_length)
+  PrimitiveType data_type = GetParam();
+  const std::string hlo_string_template = R"(
+HloModule fusible_diamonds
+add_computation {
+  arg_0.1 = $0[] parameter(0)
+  arg_1.1 = $0[] parameter(1)
+  ROOT add = $0[] add(arg_0.1, arg_1.1)
+}
+ENTRY main {
+  param_0 = $0[127,125]{1,0} parameter(0)
+  constant_neg_inf = $0[] constant(-inf)
+  reduce = $0[127]{0} reduce(param_0, constant_neg_inf), dimensions={1}, to_apply=add_computation
+  broadcast = $0[127,125]{1,0} broadcast(reduce), dimensions={0}
+  subtract = $0[127,125]{1,0} subtract(param_0, broadcast)
+  constant = $0[] constant(0.333333343)
+  broadcast_splat = $0[127,125]{1,0} broadcast(constant), dimensions={}
+  multiply = $0[127,125]{1,0} multiply(subtract, broadcast_splat)
+  constant_zero = $0[] constant(0)
+  second_reduce = $0[127]{0} reduce(multiply, constant_zero), dimensions={1}, to_apply=add_computation
+  second_broadcast = $0[127,125]{1,0} broadcast(second_reduce), dimensions={0}
+  ROOT second_subtract = $0[127,125]{1,0} subtract(multiply, second_broadcast)
+}
+)";
+  const std::string hlo_string =
+      absl::Substitute(hlo_string_template,
+                       primitive_util::LowercasePrimitiveTypeName(data_type));
+
+  auto module = ParseAndReturnVerifiedModule(hlo_string).value();
+  EXPECT_TRUE(
+      SoftmaxRewriterTritonMatchAndRewrite(gpu_version_, module.get()).value());
+  EXPECT_TRUE(verifier().Run(module.get()).status().ok());
+  VLOG(2) << module->ToString();
+  EXPECT_THAT(module->entry_computation()->root_instruction(),
+              GmockMatch(m::Fusion(m::Parameter())));
+}
+
+TEST_P(
+    SoftmaxRewriterTritonTest,
+
+    CanFuseBinaryElementwiseWhereTheFirstOperandIsASplatConstantWithinDiamond) {
+  PrimitiveType data_type = GetParam();
+  const std::string hlo_string_template = R"(
+HloModule fusible_diamond
+max_computation {
+  arg_0 = $0[] parameter(0)
+  arg_1 = $0[] parameter(1)
+  ROOT maximum = $0[] maximum(arg_0, arg_1)
+}
+ENTRY main {
+  param_0 = $0[127,125]{1,0} parameter(0)
+  constant_neg_inf = $0[] constant(-inf)
+  reduce = $0[127]{0} reduce(param_0, constant_neg_inf), dimensions={1}, to_apply=max_computation
+  constant = $0[] constant(0.333333343)
+  broadcast_splat = $0[127]{0} broadcast(constant), dimensions={}
+  multiply = $0[127]{0} multiply(broadcast_splat, reduce)
+  broadcast = $0[127,125]{1,0} broadcast(multiply), dimensions={0}
+  ROOT subtract = $0[127,125]{1,0} subtract(param_0, broadcast)
+}
+)";
+  const std::string hlo_string =
+      absl::Substitute(hlo_string_template,
+                       primitive_util::LowercasePrimitiveTypeName(data_type));
+
+  auto module = ParseAndReturnVerifiedModule(hlo_string).value();
+  EXPECT_TRUE(
+      SoftmaxRewriterTritonMatchAndRewrite(gpu_version_, module.get()).value());
+  EXPECT_TRUE(verifier().Run(module.get()).status().ok());
+  VLOG(2) << module->ToString();
+  EXPECT_THAT(module->entry_computation()->root_instruction(),
+              GmockMatch(m::Fusion(m::Parameter())));
+}
+
+TEST_P(SoftmaxRewriterTritonTest,
+       CanFuseBinaryElementwiseConsumerWhereTheFirstOperandIsASplatConstant) {
+  PrimitiveType data_type = GetParam();
+  const std::string hlo_string_template = R"(
+HloModule fusible_diamond
+add_computation {
+  arg_0.1 = $0[] parameter(0)
+  arg_1.1 = $0[] parameter(1)
+  ROOT add = $0[] add(arg_0.1, arg_1.1)
+}
+ENTRY main {
+  param_0 = $0[127,125]{1,0} parameter(0)
+  constant_neg_inf = $0[] constant(-inf)
+  reduce = $0[127]{0} reduce(param_0, constant_neg_inf), dimensions={1}, to_apply=add_computation
+  broadcast = $0[127,125]{1,0} broadcast(reduce), dimensions={0}
+  subtract = $0[127,125]{1,0} subtract(param_0, broadcast)
+  constant = $0[] constant(0.333333343)
+  broadcast_splat = $0[127,125]{1,0} broadcast(constant), dimensions={}
+  ROOT multiply = $0[127,125]{1,0} multiply(broadcast_splat, subtract)
+}
+)";
+  const std::string hlo_string =
+      absl::Substitute(hlo_string_template,
+                       primitive_util::LowercasePrimitiveTypeName(data_type));
+
+  auto module = ParseAndReturnVerifiedModule(hlo_string).value();
+  EXPECT_TRUE(
+      SoftmaxRewriterTritonMatchAndRewrite(gpu_version_, module.get()).value());
+  EXPECT_TRUE(verifier().Run(module.get()).status().ok());
+  VLOG(2) << module->ToString();
+  EXPECT_THAT(module->entry_computation()->root_instruction(),
+              GmockMatch(m::Fusion(m::Parameter())));
+}
+
+TEST_F(
+    SoftmaxRewriterTritonTest,
+    DoesNotFuseBinaryElementwiseOperationWhereOneOperandIsASharedSplatProducer) {  // NOLINT(whitespace/line_length)
+  const std::string hlo_string = R"(
+HloModule nonfusible_diamond
+add_computation {
+  arg_0.1 = f32[] parameter(0)
+  arg_1.1 = f32[] parameter(1)
+  ROOT add = f32[] add(arg_0.1, arg_1.1)
+}
+ENTRY main {
+  param_0 = f32[127,125]{1,0} parameter(0)
+  constant.2 = f32[] constant(0.333333343)
+  broadcast_splat = f32[127,125]{1,0} broadcast(constant.2), dimensions={}
+  param_1 = f32[127,125]{1,0} parameter(1)
+  multiply_splat = f32[127,125]{1,0} multiply(broadcast_splat, param_1)
+  multiply = f32[127,125]{1,0} multiply(param_0, broadcast_splat)
+  constant_neg_inf = f32[] constant(-inf)
+  reduce = f32[127]{0} reduce(multiply, constant_neg_inf), dimensions={1}, to_apply=add_computation
+  broadcast = f32[127,125]{1,0} broadcast(reduce), dimensions={0}
+  ROOT subtract = f32[127,125]{1,0} subtract(param_0, broadcast)
+}
+)";
+
+  auto module = ParseAndReturnVerifiedModule(hlo_string).value();
+  EXPECT_FALSE(
+      SoftmaxRewriterTritonMatchAndRewrite(gpu_version_, module.get()).value());
+}
+
+TEST_F(
+    SoftmaxRewriterTritonTest,
+    DoesNotFuseBinaryElementwiseOperationWhereFirstOperandIsASplatAndSecondOperandIsASharedSplatProducer) {  // NOLINT(whitespace/line_length)
+  const std::string hlo_string = R"(
+HloModule nonfusible_diamond
+add_computation {
+  arg_0.1 = f32[] parameter(0)
+  arg_1.1 = f32[] parameter(1)
+  ROOT add = f32[] add(arg_0.1, arg_1.1)
+}
+ENTRY main {
+  param_0 = f32[127,125]{1,0} parameter(0)
+  constant_2 = f32[] constant(0.333333343)
+  broadcast_splat_shared = f32[127,125]{1,0} broadcast(constant_2), dimensions={}
+  param_1 = f32[127,125]{1,0} parameter(1)
+  multiply_splat_shared = f32[127,125]{1,0} multiply(broadcast_splat_shared, param_1)
+  constant_3 = f32[] constant(0.5)
+  broadcast_splat = f32[127,125]{1,0} broadcast(constant_3), dimensions={}
+  multiply_splat = f32[127,125]{1,0} multiply(broadcast_splat, broadcast_splat_shared)
+  multiply = f32[127,125]{1,0} multiply(param_0, multiply_splat)
+  constant_neg_inf = f32[] constant(-inf)
+  reduce = f32[127]{0} reduce(multiply, constant_neg_inf), dimensions={1}, to_apply=add_computation
+  broadcast = f32[127,125]{1,0} broadcast(reduce), dimensions={0}
+  ROOT subtract = f32[127,125]{1,0} subtract(param_0, broadcast)
+}
+)";
+
+  auto module = ParseAndReturnVerifiedModule(hlo_string).value();
+  EXPECT_FALSE(
+      SoftmaxRewriterTritonMatchAndRewrite(gpu_version_, module.get()).value());
+}
+
 INSTANTIATE_TEST_SUITE_P(SoftmaxRewriterTritonTestSuite,
                          SoftmaxRewriterTritonTest,
                          ::testing::Values(F32, F16, BF16));

From 1ee265c2092b3dc631281965c3aa8967279a3e8e Mon Sep 17 00:00:00 2001
From: Michael Hudgins <michaelhudgins@google.com>
Date: Mon, 25 Sep 2023 09:18:32 -0700
Subject: [PATCH 216/567] Update the remote GCC toolchains to the newest
 container and defines to match the upgrade from CUDA 11.8 to 12.2

PiperOrigin-RevId: 568239285
---
 .../tools/toolchains/remote_config/configs.bzl    | 15 +++++++--------
 .../tools/toolchains/remote_config/configs.bzl    | 15 +++++++--------
 .../tools/toolchains/remote_config/configs.bzl    | 15 +++++++--------
 3 files changed, 21 insertions(+), 24 deletions(-)

diff --git a/tensorflow/tools/toolchains/remote_config/configs.bzl b/tensorflow/tools/toolchains/remote_config/configs.bzl
index 6e37be2447c0ec..497b8e94c7334d 100644
--- a/tensorflow/tools/toolchains/remote_config/configs.bzl
+++ b/tensorflow/tools/toolchains/remote_config/configs.bzl
@@ -580,11 +580,10 @@ def initialize_rbe_configs():
 
     sigbuild_tf_configs(
         name_container_map = {
-            "sigbuild-r2.14": "docker://gcr.io/tensorflow-sigs/build@sha256:c03809a6b4008b430bf241efce78cdcd92c7bc41d11d0ba57216e97d813ac282",
-            "sigbuild-r2.14-python3.8": "docker://gcr.io/tensorflow-sigs/build@sha256:c46d275e5bc760b7af465dc063629b234cfa34aabf0c7fe30581effc0b99648a",
-            "sigbuild-r2.14-python3.9": "docker://gcr.io/tensorflow-sigs/build@sha256:c03809a6b4008b430bf241efce78cdcd92c7bc41d11d0ba57216e97d813ac282",
-            "sigbuild-r2.14-python3.10": "docker://gcr.io/tensorflow-sigs/build@sha256:06b3a97ef247dbb00a9c6d8315e4e035d891ae2f18088de254f15d6ecedadfb9",
-            "sigbuild-r2.14-python3.11": "docker://gcr.io/tensorflow-sigs/build@sha256:00dc9e13130727dcdeb54ca77423e317a79aae84d5783c05b38b7bbdf753f0f6",
+            "sigbuild-r2.14": "docker://gcr.io/tensorflow-sigs/build@sha256:edf0324686ffbf488ed0a6b7d3a29ecc6f7b7ad9c04b2e1a527a6a47c0dc1b4b",
+            "sigbuild-r2.14-python3.9": "docker://gcr.io/tensorflow-sigs/build@sha256:edf0324686ffbf488ed0a6b7d3a29ecc6f7b7ad9c04b2e1a527a6a47c0dc1b4b",
+            "sigbuild-r2.14-python3.10": "docker://gcr.io/tensorflow-sigs/build@sha256:5b06acad335d5c24aa2f63d39d9de230affd04aa982dda0242eeb893b9dae363",
+            "sigbuild-r2.14-python3.11": "docker://gcr.io/tensorflow-sigs/build@sha256:80d991d5cf0ac710b568c71c7790270691749afa19de49d7c1a121ba3f92cd58",
         },
         # Unclear why LIBC is set to 2.19 here, and yet manylinux2010 is 2.12
         # and manylinux2014 is 2.17.
@@ -608,13 +607,13 @@ def initialize_rbe_configs():
             "TENSORRT_INSTALL_PATH": "/usr/lib/x86_64-linux-gnu",
             "TF_CUDA_CLANG": "0",
             "TF_CUDA_COMPUTE_CAPABILITIES": "3.5,6.0",
-            "TF_CUDA_VERSION": "11.8",
-            "TF_CUDNN_VERSION": "8.6",
+            "TF_CUDA_VERSION": "12.2",
+            "TF_CUDNN_VERSION": "8.9",
             "TF_ENABLE_XLA": "1",
             "TF_NEED_CUDA": "1",
             "TF_NEED_TENSORRT": "1",
             "TF_SYSROOT": "/dt9",
-            "TF_TENSORRT_VERSION": "8.4",
+            "TF_TENSORRT_VERSION": "8.6",
         },
     )
 
diff --git a/third_party/xla/third_party/tsl/tools/toolchains/remote_config/configs.bzl b/third_party/xla/third_party/tsl/tools/toolchains/remote_config/configs.bzl
index 666e51de5ffb71..cd829d1a88c42c 100644
--- a/third_party/xla/third_party/tsl/tools/toolchains/remote_config/configs.bzl
+++ b/third_party/xla/third_party/tsl/tools/toolchains/remote_config/configs.bzl
@@ -580,11 +580,10 @@ def initialize_rbe_configs():
 
     sigbuild_tf_configs(
         name_container_map = {
-            "sigbuild-r2.14": "docker://gcr.io/tensorflow-sigs/build@sha256:c03809a6b4008b430bf241efce78cdcd92c7bc41d11d0ba57216e97d813ac282",
-            "sigbuild-r2.14-python3.8": "docker://gcr.io/tensorflow-sigs/build@sha256:c46d275e5bc760b7af465dc063629b234cfa34aabf0c7fe30581effc0b99648a",
-            "sigbuild-r2.14-python3.9": "docker://gcr.io/tensorflow-sigs/build@sha256:c03809a6b4008b430bf241efce78cdcd92c7bc41d11d0ba57216e97d813ac282",
-            "sigbuild-r2.14-python3.10": "docker://gcr.io/tensorflow-sigs/build@sha256:06b3a97ef247dbb00a9c6d8315e4e035d891ae2f18088de254f15d6ecedadfb9",
-            "sigbuild-r2.14-python3.11": "docker://gcr.io/tensorflow-sigs/build@sha256:00dc9e13130727dcdeb54ca77423e317a79aae84d5783c05b38b7bbdf753f0f6",
+            "sigbuild-r2.14": "docker://gcr.io/tensorflow-sigs/build@sha256:edf0324686ffbf488ed0a6b7d3a29ecc6f7b7ad9c04b2e1a527a6a47c0dc1b4b",
+            "sigbuild-r2.14-python3.9": "docker://gcr.io/tensorflow-sigs/build@sha256:edf0324686ffbf488ed0a6b7d3a29ecc6f7b7ad9c04b2e1a527a6a47c0dc1b4b",
+            "sigbuild-r2.14-python3.10": "docker://gcr.io/tensorflow-sigs/build@sha256:5b06acad335d5c24aa2f63d39d9de230affd04aa982dda0242eeb893b9dae363",
+            "sigbuild-r2.14-python3.11": "docker://gcr.io/tensorflow-sigs/build@sha256:80d991d5cf0ac710b568c71c7790270691749afa19de49d7c1a121ba3f92cd58",
         },
         # Unclear why LIBC is set to 2.19 here, and yet manylinux2010 is 2.12
         # and manylinux2014 is 2.17.
@@ -608,13 +607,13 @@ def initialize_rbe_configs():
             "TENSORRT_INSTALL_PATH": "/usr/lib/x86_64-linux-gnu",
             "TF_CUDA_CLANG": "0",
             "TF_CUDA_COMPUTE_CAPABILITIES": "3.5,6.0",
-            "TF_CUDA_VERSION": "11.8",
-            "TF_CUDNN_VERSION": "8.6",
+            "TF_CUDA_VERSION": "12.2",
+            "TF_CUDNN_VERSION": "8.9",
             "TF_ENABLE_XLA": "1",
             "TF_NEED_CUDA": "1",
             "TF_NEED_TENSORRT": "1",
             "TF_SYSROOT": "/dt9",
-            "TF_TENSORRT_VERSION": "8.4",
+            "TF_TENSORRT_VERSION": "8.6",
         },
     )
 
diff --git a/third_party/xla/tools/toolchains/remote_config/configs.bzl b/third_party/xla/tools/toolchains/remote_config/configs.bzl
index 666e51de5ffb71..cd829d1a88c42c 100644
--- a/third_party/xla/tools/toolchains/remote_config/configs.bzl
+++ b/third_party/xla/tools/toolchains/remote_config/configs.bzl
@@ -580,11 +580,10 @@ def initialize_rbe_configs():
 
     sigbuild_tf_configs(
         name_container_map = {
-            "sigbuild-r2.14": "docker://gcr.io/tensorflow-sigs/build@sha256:c03809a6b4008b430bf241efce78cdcd92c7bc41d11d0ba57216e97d813ac282",
-            "sigbuild-r2.14-python3.8": "docker://gcr.io/tensorflow-sigs/build@sha256:c46d275e5bc760b7af465dc063629b234cfa34aabf0c7fe30581effc0b99648a",
-            "sigbuild-r2.14-python3.9": "docker://gcr.io/tensorflow-sigs/build@sha256:c03809a6b4008b430bf241efce78cdcd92c7bc41d11d0ba57216e97d813ac282",
-            "sigbuild-r2.14-python3.10": "docker://gcr.io/tensorflow-sigs/build@sha256:06b3a97ef247dbb00a9c6d8315e4e035d891ae2f18088de254f15d6ecedadfb9",
-            "sigbuild-r2.14-python3.11": "docker://gcr.io/tensorflow-sigs/build@sha256:00dc9e13130727dcdeb54ca77423e317a79aae84d5783c05b38b7bbdf753f0f6",
+            "sigbuild-r2.14": "docker://gcr.io/tensorflow-sigs/build@sha256:edf0324686ffbf488ed0a6b7d3a29ecc6f7b7ad9c04b2e1a527a6a47c0dc1b4b",
+            "sigbuild-r2.14-python3.9": "docker://gcr.io/tensorflow-sigs/build@sha256:edf0324686ffbf488ed0a6b7d3a29ecc6f7b7ad9c04b2e1a527a6a47c0dc1b4b",
+            "sigbuild-r2.14-python3.10": "docker://gcr.io/tensorflow-sigs/build@sha256:5b06acad335d5c24aa2f63d39d9de230affd04aa982dda0242eeb893b9dae363",
+            "sigbuild-r2.14-python3.11": "docker://gcr.io/tensorflow-sigs/build@sha256:80d991d5cf0ac710b568c71c7790270691749afa19de49d7c1a121ba3f92cd58",
         },
         # Unclear why LIBC is set to 2.19 here, and yet manylinux2010 is 2.12
         # and manylinux2014 is 2.17.
@@ -608,13 +607,13 @@ def initialize_rbe_configs():
             "TENSORRT_INSTALL_PATH": "/usr/lib/x86_64-linux-gnu",
             "TF_CUDA_CLANG": "0",
             "TF_CUDA_COMPUTE_CAPABILITIES": "3.5,6.0",
-            "TF_CUDA_VERSION": "11.8",
-            "TF_CUDNN_VERSION": "8.6",
+            "TF_CUDA_VERSION": "12.2",
+            "TF_CUDNN_VERSION": "8.9",
             "TF_ENABLE_XLA": "1",
             "TF_NEED_CUDA": "1",
             "TF_NEED_TENSORRT": "1",
             "TF_SYSROOT": "/dt9",
-            "TF_TENSORRT_VERSION": "8.4",
+            "TF_TENSORRT_VERSION": "8.6",
         },
     )
 

From 6655ca4631791a46df4ea7215aa0321200684216 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 25 Sep 2023 09:19:29 -0700
Subject: [PATCH 217/567] Definition for DFATAL is added to TensorFlow logging.

PiperOrigin-RevId: 568239531
---
 .../xla/third_party/tsl/tsl/platform/default/logging.h      | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/third_party/xla/third_party/tsl/tsl/platform/default/logging.h b/third_party/xla/third_party/tsl/tsl/platform/default/logging.h
index 3dba4b3b8653b7..c8bad4bedefe11 100644
--- a/third_party/xla/third_party/tsl/tsl/platform/default/logging.h
+++ b/third_party/xla/third_party/tsl/tsl/platform/default/logging.h
@@ -156,6 +156,12 @@ class LogMessageNull : public std::basic_ostringstream<char> {
 
 #define _TF_LOG_QFATAL _TF_LOG_FATAL
 
+#ifdef NDEBUG
+#define _TF_LOG_DFATAL _TF_LOG_ERROR
+#else
+#define _TF_LOG_DFATAL _TF_LOG_FATAL
+#endif
+
 #define LOG(severity) _TF_LOG_##severity
 
 #ifdef IS_MOBILE_PLATFORM

From 9aaeb1bb04b12be62cad962f4b3adfcd344b535d Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 25 Sep 2023 09:30:14 -0700
Subject: [PATCH 218/567] Keep dims when reducing gradients of broadcasted
 arguments to reduce size of dynamically-shaped graphs.

Motivation: Consider, for example, `z = x * y` where `x` has shape `[3, 1]` and `y` has shape `[3, 5]`. During backprop the upstream gradient `gz` of `z` with shape `[3, 5]` is propagated as follows:

```
ix, iy = broadcast_gradient_args(x, y)
gx = reshape(reduce_sum(gz * y, ix), shape(x))
gy = reshape(reduce_sum(x * gz, iy), shape(y))
```

Since input shapes are fully-defined, the optimization passes can easily infer `ix = [1]` and `iy = []` and rewrite the above computation as follows (note that `reduce_sum(x, [])` is an identity):

```
gx = reshape(reduce_sum(gz * y, [1]), [3, 1])
gy = x * gz
```

If we set `keepdims=True` in the above reduction, the `reshape` becomes an identity and can be optimized away as well:

```
gx = reduce_sum(gz * y, [1])
gy = x * gz
```

The impact is not limited to fully-defined shapes. When `keepdims=True` the shape refiner is able to infer the rank of `reduce_sum` result which opens up some other opportunities for optimization (see, for example, `_SumGrad` in the same file).

PiperOrigin-RevId: 568242315
---
 tensorflow/python/ops/math_grad.py | 374 +++++++++++------------------
 1 file changed, 139 insertions(+), 235 deletions(-)

diff --git a/tensorflow/python/ops/math_grad.py b/tensorflow/python/ops/math_grad.py
index 794d4ff2ac102f..d702d7fad03c19 100644
--- a/tensorflow/python/ops/math_grad.py
+++ b/tensorflow/python/ops/math_grad.py
@@ -30,11 +30,6 @@
 from tensorflow.python.ops import special_math_ops
 
 
-def _safe_shape_div(x, y):
-  """Divides `x / y` assuming `x, y >= 0`, treating `0 / 0 = 0`."""
-  return x // math_ops.maximum(y, 1)
-
-
 @ops.RegisterGradient("ArgMax")
 def _ArgMaxGrad(op: ops.Operation, grad):
   del op, grad
@@ -133,11 +128,37 @@ def SmartBroadcastGradientArgs(x, y, grad):
         y_shape_tuple, ry_value, y_needs_reduction)
 
 
-_empty_tuple = ()
+def _ReduceGradientArg(grad, shape_axes_must_reduce):
+  """Reduces gradients of one of the arguments of a broadcasting binary op."""
+  shape, axes, must_reduce = shape_axes_must_reduce
+  if grad is not None and must_reduce:
+    # Applying keepdims=True in presence of unknown axes opens up some
+    # opportunities for optimizations. For example, _SumGrad below won't have to
+    # emit extra ops to recover reduced indices for broadcasting.
+    grad = math_ops.reduce_sum(grad, axes, keepdims=True)
+    grad = array_ops.reshape(grad, shape)
+  return grad
+
+
+def _ReduceGradientArgs(x, y, grad, gx, gy):
+  """Reduces gradients of both arguments of a broadcasting binary op."""
+  if gx is not None or gy is not None:
+    bx, by = SmartBroadcastGradientArgs(x, y, grad)
+    gx = _ReduceGradientArg(gx, bx)
+    gy = _ReduceGradientArg(gy, by)
+  return gx, gy
+
+
+_EMPTY_TUPLE = ()
 
 
 def _IsScalar(x):
-  return x._shape_tuple() is _empty_tuple  # pylint: disable=protected-access
+  return x._shape_tuple() is _EMPTY_TUPLE  # pylint: disable=protected-access
+
+
+def _SafeShapeDiv(x, y):
+  """Divides `x / y` assuming `x, y >= 0`, treating `0 / 0 = 0`."""
+  return x // math_ops.maximum(y, 1)
 
 
 @ops.RegisterGradient("Sum")
@@ -193,7 +214,7 @@ def EvaluateAsTuple(t):
           output_shape_kept_dims = EvaluateAsTuple(
               math_ops.reduced_shape(input_0_shape, axes))
           tile_scaling = EvaluateAsTuple(
-              _safe_shape_div(input_0_shape, output_shape_kept_dims))
+              _SafeShapeDiv(input_0_shape, output_shape_kept_dims))
           graph._reduced_shape_cache[(input_0_shape, axes)] = (  # pylint:disable=protected-access
               output_shape_kept_dims, tile_scaling)
 
@@ -1309,24 +1330,16 @@ def _AtanGrad(op: ops.Operation, grad):
 
 @ops.RegisterGradient("Atan2")
 def _Atan2Grad(op: ops.Operation, grad):
-  """Returns grad * x / (x^2 + y^2), grad * -y / (x^2 + y^2)."""
+  """Returns grad * x / (y^2 + x^2), grad * -y / (y^2 + x^2)."""
   y = op.inputs[0]
   x = op.inputs[1]
   with ops.control_dependencies([grad]):
-    (sx, rx, must_reduce_x), (sy, ry, must_reduce_y) = (
-        SmartBroadcastGradientArgs(x, y, grad)
-    )
-
-    grad_inv = grad / (math_ops.square(x) + math_ops.square(y))
-
-    gx = -y * grad_inv
-    if must_reduce_x:
-      gx = array_ops.reshape(math_ops.reduce_sum(gx, rx), sx)
-
+    grad_inv = grad / (math_ops.square(y) + math_ops.square(x))
     gy = x * grad_inv
-    if must_reduce_y:
-      gy = array_ops.reshape(math_ops.reduce_sum(gy, ry), sy)
-    return gy, gx
+    gx = -y * grad_inv
+    # pylint: disable=arguments-out-of-order
+    return _ReduceGradientArgs(y, x, grad, gy, gx)
+    # pylint: enable=arguments-out-of-order
 
 
 @ops.RegisterGradient("AddN")
@@ -1351,109 +1364,80 @@ def _ShapesFullySpecifiedAndEqual(x, y, grad):
 def _AddGrad(op: ops.Operation, grad):
   """Gradient for Add."""
   y = op.inputs[1]
-  skip_input_indices = None
   try:
-    skip_input_indices = op.skip_input_indices
-    if skip_input_indices is not None and 1 in skip_input_indices and _IsScalar(
-        y):
+    skip_input_indices = op.skip_input_indices or ()
+    if 1 in skip_input_indices and _IsScalar(y):
       return grad, None
   except AttributeError:
     # No gradient skipping, so do the full gradient computation
-    pass
+    skip_input_indices = ()
+
   x = op.inputs[0]
-  if (isinstance(grad, tensor.Tensor) and
-      _ShapesFullySpecifiedAndEqual(x, y, grad)):
+  if isinstance(grad, tensor.Tensor) and _ShapesFullySpecifiedAndEqual(
+      x, y, grad
+  ):
     return grad, grad
-  (sx, rx, must_reduce_x), (sy, ry, must_reduce_y) = (
-      SmartBroadcastGradientArgs(x, y, grad))
-  if skip_input_indices is not None and 0 in skip_input_indices:
-    gx = None
-  elif not must_reduce_x:
-    gx = grad
-  else:
-    gx = array_ops.reshape(math_ops.reduce_sum(grad, rx), sx)
-  if skip_input_indices is not None and 1 in skip_input_indices:
-    gy = None
-  elif not must_reduce_y:
-    gy = grad
-  else:
-    gy = array_ops.reshape(math_ops.reduce_sum(grad, ry), sy)
-  return (gx, gy)
+
+  gx = None if 0 in skip_input_indices else grad
+  gy = None if 1 in skip_input_indices else grad
+  return _ReduceGradientArgs(x, y, grad, gx, gy)
 
 
 @ops.RegisterGradient("Sub")
 def _SubGrad(op: ops.Operation, grad):
   """Gradient for Sub."""
   y = op.inputs[1]
-  skip_input_indices = None
   try:
-    skip_input_indices = op.skip_input_indices
-    if skip_input_indices is not None and 1 in skip_input_indices and _IsScalar(
-        y):
+    skip_input_indices = op.skip_input_indices or ()
+    if 1 in skip_input_indices and _IsScalar(y):
       return grad, None
   except AttributeError:
     # No gradient skipping, so do the full gradient computation
-    pass
+    skip_input_indices = ()
+
   x = op.inputs[0]
-  if (isinstance(grad, tensor.Tensor) and
-      _ShapesFullySpecifiedAndEqual(x, y, grad)):
+  if isinstance(grad, tensor.Tensor) and _ShapesFullySpecifiedAndEqual(
+      x, y, grad
+  ):
     return grad, -grad
-  (sx, rx, must_reduce_x), (sy, ry, must_reduce_y) = (
-      SmartBroadcastGradientArgs(x, y, grad))
-  if skip_input_indices is not None and 0 in skip_input_indices:
-    gx = None
-  elif not must_reduce_x:
-    gx = grad
-  else:
-    gx = array_ops.reshape(math_ops.reduce_sum(grad, rx), sx)
-  if skip_input_indices is not None and 1 in skip_input_indices:
-    gy = None
-  elif not must_reduce_y:
-    gy = -grad
-  else:
-    gy = array_ops.reshape(math_ops.reduce_sum(-grad, ry), sy)
-  return (gx, gy)
+
+  gx = None if 0 in skip_input_indices else grad
+  gy = None if 1 in skip_input_indices else -grad
+  return _ReduceGradientArgs(x, y, grad, gx, gy)
 
 
 @ops.RegisterGradient("Mul")
 def _MulGrad(op: ops.Operation, grad):
   """The gradient of scalar multiplication."""
   y = op.inputs[1]
-  skip_input_indices = None
   try:
-    skip_input_indices = op.skip_input_indices
-    if skip_input_indices is not None and 1 in skip_input_indices and _IsScalar(
-        y):
+    skip_input_indices = op.skip_input_indices or ()
+    if 1 in skip_input_indices and _IsScalar(y):
       return gen_math_ops.mul(grad, math_ops.conj(y)), None
   except AttributeError:
     # No gradient skipping, so do the full gradient computation
-    pass
+    skip_input_indices = ()
+
   x = op.inputs[0]
-  if (isinstance(grad, tensor.Tensor) and
-      _ShapesFullySpecifiedAndEqual(x, y, grad) and
-      grad.dtype in (dtypes.int32, dtypes.float32)):
+  if (
+      isinstance(grad, tensor.Tensor)
+      and _ShapesFullySpecifiedAndEqual(x, y, grad)
+      and grad.dtype in (dtypes.int32, dtypes.float32)
+  ):
     return gen_math_ops.mul(grad, y), gen_math_ops.mul(grad, x)
   assert x.dtype.base_dtype == y.dtype.base_dtype, (x.dtype, " vs. ", y.dtype)
 
-  (sx, rx, must_reduce_x), (sy, ry, must_reduce_y) = (
-      SmartBroadcastGradientArgs(x, y, grad))
-  x = math_ops.conj(x)
-  y = math_ops.conj(y)
-  if skip_input_indices is not None and 0 in skip_input_indices:
+  if 0 in skip_input_indices:
     gx = None
-  elif not must_reduce_x:
-    gx = gen_math_ops.mul(grad, y)
   else:
-    gx = array_ops.reshape(
-        math_ops.reduce_sum(gen_math_ops.mul(grad, y), rx), sx)
-  if skip_input_indices is not None and 1 in skip_input_indices:
+    gx = gen_math_ops.mul(grad, math_ops.conj(y))
+
+  if 1 in skip_input_indices:
     gy = None
-  elif not must_reduce_y:
-    gy = gen_math_ops.mul(x, grad)
   else:
-    gy = array_ops.reshape(
-        math_ops.reduce_sum(gen_math_ops.mul(x, grad), ry), sy)
-  return (gx, gy)
+    gy = gen_math_ops.mul(math_ops.conj(x), grad)
+
+  return _ReduceGradientArgs(x, y, grad, gx, gy)
 
 
 @ops.RegisterGradient("MulNoNan")
@@ -1461,17 +1445,15 @@ def _MulNoNanGrad(op: ops.Operation, grad):
   """The gradient of scalar multiplication with NaN-suppression."""
   x = op.inputs[0]
   y = op.inputs[1]
-  if (isinstance(grad, tensor.Tensor) and
-      _ShapesFullySpecifiedAndEqual(x, y, grad)):
+  if isinstance(grad, tensor.Tensor) and _ShapesFullySpecifiedAndEqual(
+      x, y, grad
+  ):
     return gen_math_ops.mul_no_nan(grad, y), gen_math_ops.mul_no_nan(x, grad)
+
   assert x.dtype.base_dtype == y.dtype.base_dtype, (x.dtype, " vs. ", y.dtype)
-  sx = array_ops.shape(x)
-  sy = array_ops.shape(y)
-  rx, ry = gen_array_ops.broadcast_gradient_args(sx, sy)
-  return (array_ops.reshape(
-      math_ops.reduce_sum(gen_math_ops.mul_no_nan(grad, y), rx), sx),
-          array_ops.reshape(
-              math_ops.reduce_sum(gen_math_ops.mul_no_nan(x, grad), ry), sy))
+  gx = gen_math_ops.mul_no_nan(grad, y)
+  gy = gen_math_ops.mul_no_nan(x, grad)
+  return _ReduceGradientArgs(x, y, grad, gx, gy)
 
 
 @ops.RegisterGradient("Div")
@@ -1479,17 +1461,11 @@ def _DivGrad(op: ops.Operation, grad):
   """The gradient for the Div operator."""
   x = op.inputs[0]
   y = op.inputs[1]
-  sx = array_ops.shape(x)
-  sy = array_ops.shape(y)
-  rx, ry = gen_array_ops.broadcast_gradient_args(sx, sy)
-  x = math_ops.conj(x)
-  y = math_ops.conj(y)
-  # pylint: disable=invalid-unary-operand-type
-  return (
-      array_ops.reshape(math_ops.reduce_sum(math_ops.divide(grad, y), rx), sx),
-      array_ops.reshape(
-          math_ops.reduce_sum(grad * math_ops.divide(math_ops.divide(-x, y), y),
-                              ry), sy))
+  cx = math_ops.conj(x)
+  cy = math_ops.conj(y)
+  gx = math_ops.divide(grad, cy)
+  gy = grad * math_ops.divide(math_ops.divide(-cx, cy), cy)
+  return _ReduceGradientArgs(x, y, grad, gx, gy)
 
 
 @ops.RegisterGradient("FloorDiv")
@@ -1503,15 +1479,10 @@ def _FloorModGrad(op: ops.Operation, grad):
   """Returns grad * (1, -floor(x/y))."""
   x = math_ops.conj(op.inputs[0])
   y = math_ops.conj(op.inputs[1])
-
-  sx = array_ops.shape(x)
-  sy = array_ops.shape(y)
-  rx, ry = gen_array_ops.broadcast_gradient_args(sx, sy)
   floor_xy = math_ops.floor_div(x, y)
-  gx = array_ops.reshape(math_ops.reduce_sum(grad, rx), sx)
-  gy = array_ops.reshape(
-      math_ops.reduce_sum(grad * math_ops.negative(floor_xy), ry), sy)
-  return gx, gy
+  gx = grad
+  gy = grad * math_ops.negative(floor_xy)
+  return _ReduceGradientArgs(x, y, grad, gx, gy)
 
 
 @ops.RegisterGradient("TruncateDiv")
@@ -1524,36 +1495,21 @@ def _RealDivGrad(op: ops.Operation, grad):
   """RealDiv op gradient."""
   x = op.inputs[0]
   y = op.inputs[1]
-  sx = array_ops.shape(x)
-  sy = array_ops.shape(y)
-  rx, ry = gen_array_ops.broadcast_gradient_args(sx, sy)
-  x = math_ops.conj(x)
-  y = math_ops.conj(y)
-  return (array_ops.reshape(
-      math_ops.reduce_sum(math_ops.realdiv(grad, y), rx), sx),
-          array_ops.reshape(
-              math_ops.reduce_sum(
-                  grad * math_ops.realdiv(math_ops.realdiv(-x, y), y), ry), sy))  # pylint: disable=invalid-unary-operand-type
+  cx = math_ops.conj(op.inputs[0])
+  cy = math_ops.conj(op.inputs[1])
+  gx = math_ops.realdiv(grad, cy)
+  gy = grad * math_ops.realdiv(math_ops.realdiv(-cx, cy), cy)
+  return _ReduceGradientArgs(x, y, grad, gx, gy)
 
 
 @ops.RegisterGradient("DivNoNan")
 def _DivNoNanGrad(op: ops.Operation, grad):
   """DivNoNan op gradient."""
-  x = op.inputs[0]
-  y = op.inputs[1]
-  sx = array_ops.shape(x)
-  sy = array_ops.shape(y)
-  rx, ry = gen_array_ops.broadcast_gradient_args(sx, sy)
-  x = math_ops.conj(x)
-  y = math_ops.conj(y)
-  return (
-      array_ops.reshape(
-          math_ops.reduce_sum(math_ops.div_no_nan(grad, y), rx), sx),
-      array_ops.reshape(
-          math_ops.reduce_sum(
-              grad * math_ops.div_no_nan(math_ops.div_no_nan(-x, y), y),  # pylint: disable=invalid-unary-operand-type
-              ry),
-          sy))
+  x = math_ops.conj(op.inputs[0])
+  y = math_ops.conj(op.inputs[1])
+  gx = math_ops.div_no_nan(grad, y)
+  gy = grad * math_ops.div_no_nan(math_ops.div_no_nan(-x, y), y)
+  return _ReduceGradientArgs(x, y, grad, gx, gy)
 
 
 @ops.RegisterGradient("Pow")
@@ -1561,52 +1517,36 @@ def _PowGrad(op: ops.Operation, grad):
   """Returns grad * (y*x^(y-1), z*log(x))."""
   x = op.inputs[0]
   y = op.inputs[1]
-  skip_input_indices = None
+  cx = math_ops.conj(x)
+  cy = math_ops.conj(y)
   try:
-    skip_input_indices = op.skip_input_indices
-    # TODO(mrry): If `y` is a constant, we can combine `tf.sub()` and the
-    # constant `1` into a single constant op.
-    if skip_input_indices is not None and 1 in skip_input_indices and _IsScalar(
-        y):
-      x = math_ops.conj(x)
-      y = math_ops.conj(y)
-      return grad * y * math_ops.pow(x, y - 1), None
-
+    skip_input_indices = op.skip_input_indices or ()
+    if 1 in skip_input_indices and _IsScalar(y):
+      return grad * cy * math_ops.pow(cx, cy - 1), None
   except AttributeError:
     # No gradient skipping, so do the full gradient computation
-    pass
-
-  (sx, rx, must_reduce_x), (sy, ry, must_reduce_y) = (
-      SmartBroadcastGradientArgs(x, y, grad))
-  x = math_ops.conj(x)
-  y = math_ops.conj(y)
+    skip_input_indices = ()
 
-  if skip_input_indices is None or 0 not in skip_input_indices:
-    gx = grad * y * math_ops.pow(x, y - 1)
-    if must_reduce_x:
-      gx = array_ops.reshape(math_ops.reduce_sum(gx, rx), sx)
-  else:
+  if 0 in skip_input_indices:
     gx = None
+  else:
+    gx = grad * cy * math_ops.pow(cx, cy - 1)
 
-  if skip_input_indices is None or 1 not in skip_input_indices:
-    z = math_ops.conj(op.outputs[0])
-
+  if 1 in skip_input_indices:
+    gy = None
+  else:
     # Avoid false singularity at x = 0
     if x.dtype.is_complex:
       # real(x) < 0 is fine for the complex case
-      mask = math_ops.not_equal(x, 0)
+      mask = math_ops.not_equal(cx, 0)
     else:
       # There's no sensible real value to return if x < 0, so return 0
-      mask = x > 0
-    safe_x = array_ops.where(mask, x, array_ops.ones_like(x))
+      mask = cx > 0
+    safe_x = array_ops.where(mask, cx, array_ops.ones_like(x))
     log_x = array_ops.where(mask, math_ops.log(safe_x), array_ops.zeros_like(x))
-    gy = grad * z * log_x
-    if must_reduce_y:
-      gy = array_ops.reshape(math_ops.reduce_sum(gy, ry), sy)
-  else:
-    gy = None
+    gy = grad * math_ops.conj(op.outputs[0]) * log_x
 
-  return gx, gy
+  return _ReduceGradientArgs(x, y, grad, gx, gy)
 
 
 def _MaximumMinimumGradInputOnly(op: ops.Operation, grad, selector_op):
@@ -1622,36 +1562,27 @@ def _MaximumMinimumGradInputOnly(op: ops.Operation, grad, selector_op):
 def _MaximumMinimumGrad(op: ops.Operation, grad, selector_op):
   """Factor out the code for the gradient of Maximum or Minimum."""
   y = op.inputs[1]
-  skip_input_indices = None
   try:
-    skip_input_indices = op.skip_input_indices
-    if skip_input_indices is not None and 1 in skip_input_indices and _IsScalar(
-        y):
+    skip_input_indices = op.skip_input_indices or ()
+    if 1 in skip_input_indices and _IsScalar(y):
       # When we want to get gradients for the first input only, and the second
       # input tensor is a scalar, we can do a much simpler calculation
       return _MaximumMinimumGradInputOnly(op, grad, selector_op)
   except AttributeError:
     # No gradient skipping, so do the full gradient computation
-    pass
+    skip_input_indices = ()
   x = op.inputs[0]
-  sx = array_ops.shape(x)
-  sy = array_ops.shape(y)
   zeros = array_ops.zeros_like(grad)
   xmask = selector_op(x, y)
-  rx, ry = gen_array_ops.broadcast_gradient_args(sx, sy)
-  if skip_input_indices is not None and 0 in skip_input_indices:
+  if 0 in skip_input_indices:
     gx = None
   else:
-    xgrad = array_ops.where_v2(xmask, grad, zeros)
-    gx = array_ops.reshape(math_ops.reduce_sum(xgrad, rx), sx)
-
-  if skip_input_indices is not None and 1 in skip_input_indices:
+    gx = array_ops.where_v2(xmask, grad, zeros)
+  if 1 in skip_input_indices:
     gy = None
   else:
-    ygrad = array_ops.where_v2(xmask, zeros, grad)
-    gy = array_ops.reshape(math_ops.reduce_sum(ygrad, ry), sy)
-
-  return (gx, gy)
+    gy = array_ops.where_v2(xmask, zeros, grad)
+  return _ReduceGradientArgs(x, y, grad, gx, gy)
 
 
 @ops.RegisterGradient("Maximum")
@@ -1671,39 +1602,25 @@ def _SquaredDifferenceGrad(op: ops.Operation, grad):
   """Returns the gradient for (x-y)^2."""
   x = op.inputs[0]
   y = op.inputs[1]
-  skip_input_indices = None
   try:
-    skip_input_indices = op.skip_input_indices
+    skip_input_indices = op.skip_input_indices or ()
   except AttributeError:
     # No gradient skipping, so do the full gradient computation
-    pass
+    skip_input_indices = ()
 
   with ops.control_dependencies([grad]):
     # The parens ensure that if grad is IndexedSlices, it'll get multiplied by
     # Tensor (not a number like 2.0) which causes it to convert to Tensor.
     x_grad = math_ops.scalar_mul(2.0, grad) * (x - y)
 
-  if (isinstance(grad, tensor.Tensor) and
-      _ShapesFullySpecifiedAndEqual(x, y, grad)):
+  if isinstance(grad, tensor.Tensor) and _ShapesFullySpecifiedAndEqual(
+      x, y, grad
+  ):
     return x_grad, -x_grad
 
-  (sx, rx, must_reduce_x), (sy, ry, must_reduce_y) = (
-      SmartBroadcastGradientArgs(x, y, grad))
-
-  if skip_input_indices is not None and 0 in skip_input_indices:
-    gx = None
-  elif must_reduce_x:
-    gx = array_ops.reshape(math_ops.reduce_sum(x_grad, rx), sx)
-  else:
-    gx = x_grad
-
-  if skip_input_indices is not None and 1 in skip_input_indices:
-    gy = None
-  elif must_reduce_y:
-    gy = -array_ops.reshape(math_ops.reduce_sum(x_grad, ry), sy)
-  else:
-    gy = -x_grad
-  return (gx, gy)
+  gx = None if 0 in skip_input_indices else x_grad
+  gy = None if 1 in skip_input_indices else -x_grad
+  return _ReduceGradientArgs(x, y, grad, gx, gy)
 
 
 # Logical operations have no gradients.
@@ -1731,29 +1648,18 @@ def _SelectGrad(op: ops.Operation, grad):
   )
 
 
-# pylint: disable=missing-function-docstring
 @ops.RegisterGradient("SelectV2")
 def _SelectGradV2(op: ops.Operation, grad):
   c = op.inputs[0]
   x = op.inputs[1]
   y = op.inputs[2]
+  z = op.outputs[0]
   zeros = array_ops.zeros([], dtype=grad.dtype.base_dtype)
   gx = array_ops.where_v2(c, grad, zeros)
-  x_shape = array_ops.shape(x)
-  output_shape = array_ops.shape(op.outputs[0])
-  # Reduce away broadcasted leading dims.
-  reduce_x, _ = gen_array_ops.broadcast_gradient_args(x_shape, output_shape)
-  gx = math_ops.reduce_sum(gx, keepdims=True, axis=reduce_x)
-  gx = array_ops.reshape(gx, x_shape)
-
   gy = array_ops.where_v2(c, zeros, grad)
-  y_shape = array_ops.shape(y)
-  # Reduce away broadcasted leading dims.
-  reduce_y, _ = gen_array_ops.broadcast_gradient_args(y_shape, output_shape)
-  gy = math_ops.reduce_sum(gy, keepdims=True, axis=reduce_y)
-  gy = array_ops.reshape(gy, y_shape)
-
-  return (None, gx, gy)
+  gx, _ = _ReduceGradientArgs(x, z, grad, gx, None)
+  gy, _ = _ReduceGradientArgs(y, z, grad, gy, None)
+  return None, gx, gy
 
 
 def _MatMulGradAgainstFirstOnly(op: ops.Operation, grad):
@@ -1975,11 +1881,9 @@ def _ComplexGrad(op: ops.Operation, grad):
   """Returns the real and imaginary components of 'grad', respectively."""
   x = op.inputs[0]
   y = op.inputs[1]
-  sx = array_ops.shape(x)
-  sy = array_ops.shape(y)
-  rx, ry = gen_array_ops.broadcast_gradient_args(sx, sy)
-  return (array_ops.reshape(math_ops.reduce_sum(math_ops.real(grad), rx), sx),
-          array_ops.reshape(math_ops.reduce_sum(math_ops.imag(grad), ry), sy))
+  gx = math_ops.real(grad)
+  gy = math_ops.imag(grad)
+  return _ReduceGradientArgs(x, y, grad, gx, gy)
 
 
 @ops.RegisterGradient("Real")
@@ -1998,7 +1902,7 @@ def _ImagGrad(_, grad):
 
 @ops.RegisterGradient("Angle")
 def _AngleGrad(op: ops.Operation, grad):
-  """Returns -grad / (Im(x) + iRe(x))"""
+  """Returns `-grad / (Im(x) + i Re(x))`."""
   x = op.inputs[0]
   with ops.control_dependencies([grad]):
     re = math_ops.real(x)

From ec9e1ee786feb60de67eecc4b4e2ae9c6dca493c Mon Sep 17 00:00:00 2001
From: Yash Katariya <yashkatariya@google.com>
Date: Mon, 25 Sep 2023 09:37:59 -0700
Subject: [PATCH 219/567] Add cuda 12.2 support to JAX

PiperOrigin-RevId: 568244230
---
 ...n8.9-ubuntu20.04-manylinux2014-multipython | 47 +++++++++++++++++++
 .../toolchains/remote_config/configs.bzl      | 22 +++++++++
 .../toolchains/remote_config/containers.bzl   |  8 ++++
 .../toolchains/remote_config/configs.bzl      | 22 +++++++++
 .../toolchains/remote_config/containers.bzl   |  8 ++++
 .../toolchains/remote_config/configs.bzl      | 22 +++++++++
 .../toolchains/remote_config/containers.bzl   |  8 ++++
 7 files changed, 137 insertions(+)
 create mode 100644 tensorflow/tools/ci_build/Dockerfile.rbe.cuda12.2-cudnn8.9-ubuntu20.04-manylinux2014-multipython

diff --git a/tensorflow/tools/ci_build/Dockerfile.rbe.cuda12.2-cudnn8.9-ubuntu20.04-manylinux2014-multipython b/tensorflow/tools/ci_build/Dockerfile.rbe.cuda12.2-cudnn8.9-ubuntu20.04-manylinux2014-multipython
new file mode 100644
index 00000000000000..12e9356664f896
--- /dev/null
+++ b/tensorflow/tools/ci_build/Dockerfile.rbe.cuda12.2-cudnn8.9-ubuntu20.04-manylinux2014-multipython
@@ -0,0 +1,47 @@
+# Dockerfile to build a manylinux2014 compliant cross-compiler.
+#
+# Builds a devtoolset gcc/libstdc++ that targets manylinux2014 compatible
+# glibc (2.17) and system libstdc++ (4.8).
+#
+# To push a new version, run:
+# $ docker build -f Dockerfile.rbe.cuda12.2-cudnn8.9-ubuntu20.04-manylinux2014-multipython \
+#  --tag "gcr.io/tensorflow-testing/nosla-cuda12.2-cudnn8.9-ubuntu20.04-manylinux2014-multipython" .
+# $ docker push gcr.io/tensorflow-testing/nosla-cuda12.2-cudnn8.9-ubuntu20.04-manylinux2014-multipython
+
+FROM gcr.io/tensorflow-sigs/build@sha256:edf0324686ffbf488ed0a6b7d3a29ecc6f7b7ad9c04b2e1a527a6a47c0dc1b4b
+
+ENV DEBIAN_FRONTEND=noninteractive
+
+COPY install/install_bootstrap_deb_packages.sh /install/
+RUN /install/install_bootstrap_deb_packages.sh
+
+COPY install/install_deb_packages.sh /install/
+RUN /install/install_deb_packages.sh
+
+RUN apt-get update && apt-get install -y \
+    libbz2-dev \
+    libffi-dev \
+    libgdbm-dev \
+    libncurses5-dev \
+    libnss3-dev \
+    libreadline-dev \
+    libsqlite3-dev \
+    patchelf \
+      && \
+    rm -rf /var/lib/apt/lists/*
+
+COPY install/install_bazel.sh /install/
+RUN /install/install_bazel.sh
+
+COPY install/build_and_install_python.sh /install/
+RUN /install/build_and_install_python.sh "3.9.4"
+RUN /install/build_and_install_python.sh "3.10.0"
+RUN /install/build_and_install_python.sh "3.11.0"
+RUN /install/build_and_install_python.sh "3.12.0rc3"
+
+COPY install/install_pip_packages_by_version.sh /install/
+# https://github.com/numpy/numpy/issues/22623 for `SETUPTOOLS_USE_DISTUTILS`.
+RUN SETUPTOOLS_USE_DISTUTILS=stdlib /install/install_pip_packages_by_version.sh "/usr/local/bin/pip3.9" "jax"
+RUN SETUPTOOLS_USE_DISTUTILS=stdlib /install/install_pip_packages_by_version.sh "/usr/local/bin/pip3.10" "jax"
+RUN SETUPTOOLS_USE_DISTUTILS=stdlib /install/install_pip_packages_by_version.sh "/usr/local/bin/pip3.11" "jax"
+RUN SETUPTOOLS_USE_DISTUTILS=stdlib /install/install_pip_packages_by_version.sh "/usr/local/bin/pip3.12" "jax"
diff --git a/tensorflow/tools/toolchains/remote_config/configs.bzl b/tensorflow/tools/toolchains/remote_config/configs.bzl
index 497b8e94c7334d..e774cdba1bffd7 100644
--- a/tensorflow/tools/toolchains/remote_config/configs.bzl
+++ b/tensorflow/tools/toolchains/remote_config/configs.bzl
@@ -252,6 +252,28 @@ def initialize_rbe_configs():
         python_install_path = "/usr/local",
     )
 
+    tensorflow_rbe_config(
+        name = "ubuntu20.04-clang_manylinux2014-cuda12.2-cudnn8.9",
+        compiler = "/usr/lib/llvm-17/bin/clang",
+        cuda_version = "12.2",
+        cudnn_version = "8.9",
+        os = "ubuntu20.04-manylinux2014-multipython",
+        python_versions = ["3.9", "3.10", "3.11", "3.12"],
+        sysroot = "/dt9",
+        python_install_path = "/usr/local",
+    )
+
+    tensorflow_rbe_config(
+        name = "ubuntu20.04-gcc9_manylinux2014-cuda12.2-cudnn8.9",
+        compiler = "/dt9/usr/bin/gcc",
+        compiler_prefix = "/usr/bin",
+        cuda_version = "12.2",
+        cudnn_version = "8.9",
+        os = "ubuntu20.04-manylinux2014-multipython",
+        python_versions = ["3.9", "3.10", "3.11", "3.12"],
+        python_install_path = "/usr/local",
+    )
+
     tensorflow_rbe_win_config(
         name = "windows_py37",
         python_bin_path = "C:/Python37/python.exe",
diff --git a/tensorflow/tools/toolchains/remote_config/containers.bzl b/tensorflow/tools/toolchains/remote_config/containers.bzl
index 830b05b0c444b6..2819512e4b902c 100644
--- a/tensorflow/tools/toolchains/remote_config/containers.bzl
+++ b/tensorflow/tools/toolchains/remote_config/containers.bzl
@@ -9,6 +9,7 @@ container_digests = {
     "cuda11.4-cudnn8.2-ubuntu20.04-manylinux2014-multipython": "sha256:d17894a1349a12baea1732cb133f65f08754ed97d0a6647efe23c916a9ab8f1c",
     "cuda11.8-cudnn8.6-ubuntu20.04-manylinux2014-multipython": "sha256:c973a5dd1b335b83f5cc65ab2d1f12e12c0cc5d310a2d9bf676fcdb52cf08285",
     "cuda12.0.1-cudnn8.8-ubuntu20.04-manylinux2014-multipython": "sha256:2551b1587bdd0b63a4dd329eba6416cd07acb25496dde411c376609ce4f076f0",
+    "cuda12.2-cudnn8.9-ubuntu20.04-manylinux2014-multipython": "sha256:55cf36aa54debd7ec7b3aac5a84af1fe3691a186aceae8d1d8eafe886d6a6950",
     # ROCM, probably not all of them still in use
     "rocm-ubuntu18.04-manylinux2010-multipython": "sha256:6e953a09b145df338bcb03e9e36f99b291140c29b72d0a048fb6c5905ccad5eb",
     "rocm-ubuntu20.04-manylinux2014-multipython": "sha256:906faec7765fe5dd067f2b092b5d5f220c1fedde725fb42c83d031b4d6f32204",
@@ -114,6 +115,13 @@ containers = {
         "digest": container_digests["cuda12.0.1-cudnn8.8-ubuntu20.04-manylinux2014-multipython"],
     },
 
+    # Built with //tensorflow/tools/ci_build/Dockerfile.rbe.cuda12.2-cudnn8.9-ubuntu20.04-manylinux2014-multipython.
+    "cuda12.2-cudnn8.9-ubuntu20.04-manylinux2014-multipython": {
+        "registry": "gcr.io",
+        "repository": "tensorflow-testing/nosla-cuda12.2-cudnn8.9-ubuntu20.04-manylinux2014-multipython",
+        "digest": container_digests["cuda12.2-cudnn8.9-ubuntu20.04-manylinux2014-multipython"],
+    },
+
     # Built with //tensorflow/tools/ci_build/Dockerfile.rbe.rocm-ubuntu18.04-manylinux2010-multipython.
     "rocm-ubuntu18.04-manylinux2010-multipython": {
         "registry": "gcr.io",
diff --git a/third_party/xla/third_party/tsl/tools/toolchains/remote_config/configs.bzl b/third_party/xla/third_party/tsl/tools/toolchains/remote_config/configs.bzl
index cd829d1a88c42c..695384b9bb70b3 100644
--- a/third_party/xla/third_party/tsl/tools/toolchains/remote_config/configs.bzl
+++ b/third_party/xla/third_party/tsl/tools/toolchains/remote_config/configs.bzl
@@ -252,6 +252,28 @@ def initialize_rbe_configs():
         python_install_path = "/usr/local",
     )
 
+    tensorflow_rbe_config(
+        name = "ubuntu20.04-clang_manylinux2014-cuda12.2-cudnn8.9",
+        compiler = "/usr/lib/llvm-17/bin/clang",
+        cuda_version = "12.2",
+        cudnn_version = "8.9",
+        os = "ubuntu20.04-manylinux2014-multipython",
+        python_versions = ["3.9", "3.10", "3.11", "3.12"],
+        sysroot = "/dt9",
+        python_install_path = "/usr/local",
+    )
+
+    tensorflow_rbe_config(
+        name = "ubuntu20.04-gcc9_manylinux2014-cuda12.2-cudnn8.9",
+        compiler = "/dt9/usr/bin/gcc",
+        compiler_prefix = "/usr/bin",
+        cuda_version = "12.2",
+        cudnn_version = "8.9",
+        os = "ubuntu20.04-manylinux2014-multipython",
+        python_versions = ["3.9", "3.10", "3.11", "3.12"],
+        python_install_path = "/usr/local",
+    )
+
     tensorflow_rbe_win_config(
         name = "windows_py37",
         python_bin_path = "C:/Python37/python.exe",
diff --git a/third_party/xla/third_party/tsl/tools/toolchains/remote_config/containers.bzl b/third_party/xla/third_party/tsl/tools/toolchains/remote_config/containers.bzl
index 830b05b0c444b6..2819512e4b902c 100644
--- a/third_party/xla/third_party/tsl/tools/toolchains/remote_config/containers.bzl
+++ b/third_party/xla/third_party/tsl/tools/toolchains/remote_config/containers.bzl
@@ -9,6 +9,7 @@ container_digests = {
     "cuda11.4-cudnn8.2-ubuntu20.04-manylinux2014-multipython": "sha256:d17894a1349a12baea1732cb133f65f08754ed97d0a6647efe23c916a9ab8f1c",
     "cuda11.8-cudnn8.6-ubuntu20.04-manylinux2014-multipython": "sha256:c973a5dd1b335b83f5cc65ab2d1f12e12c0cc5d310a2d9bf676fcdb52cf08285",
     "cuda12.0.1-cudnn8.8-ubuntu20.04-manylinux2014-multipython": "sha256:2551b1587bdd0b63a4dd329eba6416cd07acb25496dde411c376609ce4f076f0",
+    "cuda12.2-cudnn8.9-ubuntu20.04-manylinux2014-multipython": "sha256:55cf36aa54debd7ec7b3aac5a84af1fe3691a186aceae8d1d8eafe886d6a6950",
     # ROCM, probably not all of them still in use
     "rocm-ubuntu18.04-manylinux2010-multipython": "sha256:6e953a09b145df338bcb03e9e36f99b291140c29b72d0a048fb6c5905ccad5eb",
     "rocm-ubuntu20.04-manylinux2014-multipython": "sha256:906faec7765fe5dd067f2b092b5d5f220c1fedde725fb42c83d031b4d6f32204",
@@ -114,6 +115,13 @@ containers = {
         "digest": container_digests["cuda12.0.1-cudnn8.8-ubuntu20.04-manylinux2014-multipython"],
     },
 
+    # Built with //tensorflow/tools/ci_build/Dockerfile.rbe.cuda12.2-cudnn8.9-ubuntu20.04-manylinux2014-multipython.
+    "cuda12.2-cudnn8.9-ubuntu20.04-manylinux2014-multipython": {
+        "registry": "gcr.io",
+        "repository": "tensorflow-testing/nosla-cuda12.2-cudnn8.9-ubuntu20.04-manylinux2014-multipython",
+        "digest": container_digests["cuda12.2-cudnn8.9-ubuntu20.04-manylinux2014-multipython"],
+    },
+
     # Built with //tensorflow/tools/ci_build/Dockerfile.rbe.rocm-ubuntu18.04-manylinux2010-multipython.
     "rocm-ubuntu18.04-manylinux2010-multipython": {
         "registry": "gcr.io",
diff --git a/third_party/xla/tools/toolchains/remote_config/configs.bzl b/third_party/xla/tools/toolchains/remote_config/configs.bzl
index cd829d1a88c42c..695384b9bb70b3 100644
--- a/third_party/xla/tools/toolchains/remote_config/configs.bzl
+++ b/third_party/xla/tools/toolchains/remote_config/configs.bzl
@@ -252,6 +252,28 @@ def initialize_rbe_configs():
         python_install_path = "/usr/local",
     )
 
+    tensorflow_rbe_config(
+        name = "ubuntu20.04-clang_manylinux2014-cuda12.2-cudnn8.9",
+        compiler = "/usr/lib/llvm-17/bin/clang",
+        cuda_version = "12.2",
+        cudnn_version = "8.9",
+        os = "ubuntu20.04-manylinux2014-multipython",
+        python_versions = ["3.9", "3.10", "3.11", "3.12"],
+        sysroot = "/dt9",
+        python_install_path = "/usr/local",
+    )
+
+    tensorflow_rbe_config(
+        name = "ubuntu20.04-gcc9_manylinux2014-cuda12.2-cudnn8.9",
+        compiler = "/dt9/usr/bin/gcc",
+        compiler_prefix = "/usr/bin",
+        cuda_version = "12.2",
+        cudnn_version = "8.9",
+        os = "ubuntu20.04-manylinux2014-multipython",
+        python_versions = ["3.9", "3.10", "3.11", "3.12"],
+        python_install_path = "/usr/local",
+    )
+
     tensorflow_rbe_win_config(
         name = "windows_py37",
         python_bin_path = "C:/Python37/python.exe",
diff --git a/third_party/xla/tools/toolchains/remote_config/containers.bzl b/third_party/xla/tools/toolchains/remote_config/containers.bzl
index 830b05b0c444b6..2819512e4b902c 100644
--- a/third_party/xla/tools/toolchains/remote_config/containers.bzl
+++ b/third_party/xla/tools/toolchains/remote_config/containers.bzl
@@ -9,6 +9,7 @@ container_digests = {
     "cuda11.4-cudnn8.2-ubuntu20.04-manylinux2014-multipython": "sha256:d17894a1349a12baea1732cb133f65f08754ed97d0a6647efe23c916a9ab8f1c",
     "cuda11.8-cudnn8.6-ubuntu20.04-manylinux2014-multipython": "sha256:c973a5dd1b335b83f5cc65ab2d1f12e12c0cc5d310a2d9bf676fcdb52cf08285",
     "cuda12.0.1-cudnn8.8-ubuntu20.04-manylinux2014-multipython": "sha256:2551b1587bdd0b63a4dd329eba6416cd07acb25496dde411c376609ce4f076f0",
+    "cuda12.2-cudnn8.9-ubuntu20.04-manylinux2014-multipython": "sha256:55cf36aa54debd7ec7b3aac5a84af1fe3691a186aceae8d1d8eafe886d6a6950",
     # ROCM, probably not all of them still in use
     "rocm-ubuntu18.04-manylinux2010-multipython": "sha256:6e953a09b145df338bcb03e9e36f99b291140c29b72d0a048fb6c5905ccad5eb",
     "rocm-ubuntu20.04-manylinux2014-multipython": "sha256:906faec7765fe5dd067f2b092b5d5f220c1fedde725fb42c83d031b4d6f32204",
@@ -114,6 +115,13 @@ containers = {
         "digest": container_digests["cuda12.0.1-cudnn8.8-ubuntu20.04-manylinux2014-multipython"],
     },
 
+    # Built with //tensorflow/tools/ci_build/Dockerfile.rbe.cuda12.2-cudnn8.9-ubuntu20.04-manylinux2014-multipython.
+    "cuda12.2-cudnn8.9-ubuntu20.04-manylinux2014-multipython": {
+        "registry": "gcr.io",
+        "repository": "tensorflow-testing/nosla-cuda12.2-cudnn8.9-ubuntu20.04-manylinux2014-multipython",
+        "digest": container_digests["cuda12.2-cudnn8.9-ubuntu20.04-manylinux2014-multipython"],
+    },
+
     # Built with //tensorflow/tools/ci_build/Dockerfile.rbe.rocm-ubuntu18.04-manylinux2010-multipython.
     "rocm-ubuntu18.04-manylinux2010-multipython": {
         "registry": "gcr.io",

From 149cf817b927dc03c56ad57c4def9fb18fb5957a Mon Sep 17 00:00:00 2001
From: Yishuang Pang <ypang@google.com>
Date: Mon, 25 Sep 2023 10:06:51 -0700
Subject: [PATCH 220/567] Legalize some MHLO broadcasted binary element-wise
 ops (add, div, max, min, mul, pow, shift left, sub, atan2) to TF ops
 directly. tf.BroadcastTo op folder folds constants by default, which would
 increase the size of models converted from StableHLO because StableHLO
 requires explicit broadcasting. This change helps reduce model size by
 removing unnecessary broadcasts at the legalization stage.

PiperOrigin-RevId: 568252070
---
 .../lite/stablehlo/tests/legalize_hlo.mlir    | 143 +++++++++++++++++-
 .../transforms/legalize_hlo_patterns.td       |  12 ++
 2 files changed, 151 insertions(+), 4 deletions(-)

diff --git a/tensorflow/compiler/mlir/lite/stablehlo/tests/legalize_hlo.mlir b/tensorflow/compiler/mlir/lite/stablehlo/tests/legalize_hlo.mlir
index ddc6848686f5a6..e02ce38908d9ff 100644
--- a/tensorflow/compiler/mlir/lite/stablehlo/tests/legalize_hlo.mlir
+++ b/tensorflow/compiler/mlir/lite/stablehlo/tests/legalize_hlo.mlir
@@ -47,12 +47,27 @@ func.func @add(%arg0: tensor<2xi32>) -> tensor<2xi32> {
 }
 
 // CHECK-LABEL:   func @broadcast_add(
+// CHECK-SAME:                        %[[VAL_0:.*]]: tensor<1x1xf32>,
+// CHECK-SAME:                        %[[VAL_1:.*]]: tensor<1x1000xf32>) -> (tensor<1x1000xf32>, tensor<1x1000xf32>) {
+// CHECK-DAG.       %cst = arith.constant dense<[1, 1000]> : tensor<2xi64>
+// CHECK:           %[[VAL_2:.*]] = "tf.AddV2"(%[[VAL_0]], %[[VAL_1]]) : (tensor<1x1xf32>, tensor<1x1000xf32>) -> tensor<1x1000xf32>
+// CHECK:           %[[VAL_3:.*]] = "tf.AddV2"(%[[VAL_1]], %[[VAL_0]]) : (tensor<1x1000xf32>, tensor<1x1xf32>) -> tensor<1x1000xf32>
+// CHECK:           return %[[VAL_2]], %[[VAL_3]] : tensor<1x1000xf32>, tensor<1x1000xf32>
+// CHECK:         }
+func.func @broadcast_add(%arg0: tensor<1x1xf32>, %arg1: tensor<1x1000xf32>) -> (tensor<1x1000xf32>, tensor<1x1000xf32>) {
+  %0 = "mhlo.broadcast_in_dim"(%arg0) {broadcast_dimensions = dense<[0, 1]> : tensor<2xi64>} : (tensor<1x1xf32>) -> tensor<1x1000xf32>
+  %1 = mhlo.add %0, %arg1 : tensor<1x1000xf32>
+  %2 = mhlo.add %arg1, %0 : tensor<1x1000xf32>
+  func.return %1, %2 : tensor<1x1000xf32>, tensor<1x1000xf32>
+}
+
+// CHECK-LABEL:   func @broadcast_add_chlo(
 // CHECK-SAME:                        %[[VAL_0:.*]]: tensor<1xi32>,
 // CHECK-SAME:                        %[[VAL_1:.*]]: tensor<1x2xi32>) -> tensor<1x2xi32> {
 // CHECK:           %[[VAL_2:.*]] = "tf.AddV2"(%[[VAL_0]], %[[VAL_1]]) : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi32>
 // CHECK:           return %[[VAL_2]] : tensor<1x2xi32>
 // CHECK:         }
-func.func @broadcast_add(%arg0: tensor<1xi32>, %arg1: tensor<1x2xi32>) -> tensor<1x2xi32> {
+func.func @broadcast_add_chlo(%arg0: tensor<1xi32>, %arg1: tensor<1x2xi32>) -> tensor<1x2xi32> {
   %0 = "chlo.broadcast_add"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi32>
   func.return %0 : tensor<1x2xi32>
 }
@@ -86,12 +101,27 @@ func.func @div(%arg0: tensor<2xi32>) -> tensor<2xi32> {
 }
 
 // CHECK-LABEL:   func @broadcast_div(
+// CHECK-SAME:                        %[[VAL_0:.*]]: tensor<1x1xf32>,
+// CHECK-SAME:                        %[[VAL_1:.*]]: tensor<1x1000xf32>) -> (tensor<1x1000xf32>, tensor<1x1000xf32>) {
+// CHECK-DAG.       %cst = arith.constant dense<[1, 1000]> : tensor<2xi64>
+// CHECK:           %[[VAL_2:.*]] = "tf.Div"(%[[VAL_0]], %[[VAL_1]]) : (tensor<1x1xf32>, tensor<1x1000xf32>) -> tensor<1x1000xf32>
+// CHECK:           %[[VAL_3:.*]] = "tf.Div"(%[[VAL_1]], %[[VAL_0]]) : (tensor<1x1000xf32>, tensor<1x1xf32>) -> tensor<1x1000xf32>
+// CHECK:           return %[[VAL_2]], %[[VAL_3]] : tensor<1x1000xf32>, tensor<1x1000xf32>
+// CHECK:         }
+func.func @broadcast_div(%arg0: tensor<1x1xf32>, %arg1: tensor<1x1000xf32>) -> (tensor<1x1000xf32>, tensor<1x1000xf32>) {
+  %0 = "mhlo.broadcast_in_dim"(%arg0) {broadcast_dimensions = dense<[0, 1]> : tensor<2xi64>} : (tensor<1x1xf32>) -> tensor<1x1000xf32>
+  %1 = mhlo.divide %0, %arg1 : tensor<1x1000xf32>
+  %2 = mhlo.divide %arg1, %0 : tensor<1x1000xf32>
+  func.return %1, %2 : tensor<1x1000xf32>, tensor<1x1000xf32>
+}
+
+// CHECK-LABEL:   func @broadcast_div_chlo(
 // CHECK-SAME:                        %[[VAL_0:.*]]: tensor<1xi32>,
 // CHECK-SAME:                        %[[VAL_1:.*]]: tensor<1x2xi32>) -> tensor<1x2xi32> {
 // CHECK:           %[[VAL_2:.*]] = "tf.Div"(%[[VAL_0]], %[[VAL_1]]) : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi32>
 // CHECK:           return %[[VAL_2]] : tensor<1x2xi32>
 // CHECK:         }
-func.func @broadcast_div(%arg0: tensor<1xi32>, %arg1: tensor<1x2xi32>) -> tensor<1x2xi32> {
+func.func @broadcast_div_chlo(%arg0: tensor<1xi32>, %arg1: tensor<1x2xi32>) -> tensor<1x2xi32> {
   %0 = "chlo.broadcast_divide"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi32>
   func.return %0 : tensor<1x2xi32>
 }
@@ -107,6 +137,21 @@ func.func @shift_left(%arg0: tensor<4xi32>, %arg1: tensor<4xi32>) -> tensor<4xi3
   func.return %0 : tensor<4xi32>
 }
 
+// CHECK-LABEL:   func @broadcast_shift_left(
+// CHECK-SAME:                        %[[VAL_0:.*]]: tensor<1xi32>,
+// CHECK-SAME:                        %[[VAL_1:.*]]: tensor<4xi32>) -> (tensor<4xi32>, tensor<4xi32>) {
+// CHECK-DAG.       %cst = arith.constant dense<[4]> : tensor<1xi64>
+// CHECK:           %[[VAL_2:.*]] = "tf.LeftShift"(%[[VAL_0]], %[[VAL_1]]) : (tensor<1xi32>, tensor<4xi32>) -> tensor<4xi32>
+// CHECK:           %[[VAL_3:.*]] = "tf.LeftShift"(%[[VAL_1]], %[[VAL_0]]) : (tensor<4xi32>, tensor<1xi32>) -> tensor<4xi32>
+// CHECK:           return %[[VAL_2]], %[[VAL_3]] : tensor<4xi32>, tensor<4xi32>
+// CHECK:         }
+func.func @broadcast_shift_left(%arg0: tensor<1xi32>, %arg1: tensor<4xi32>) -> (tensor<4xi32>, tensor<4xi32>) {
+  %0 = "mhlo.broadcast_in_dim"(%arg0) {broadcast_dimensions = dense<[0]> : tensor<1xi64>} : (tensor<1xi32>) -> tensor<4xi32>
+  %1 = mhlo.shift_left %0, %arg1 : tensor<4xi32>
+  %2 = mhlo.shift_left %arg1, %0 : tensor<4xi32>
+  func.return %1, %2 : tensor<4xi32>, tensor<4xi32>
+}
+
 // CHECK-LABEL:   func @div_dynamic(
 // CHECK-SAME:                      %[[VAL_0:.*]]: tensor<?xi32>,
 // CHECK-SAME:                      %[[VAL_1:.*]]: tensor<?x?xi32>) -> tensor<?x?xi32> {
@@ -129,6 +174,21 @@ func.func @maximum(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>) -> tensor<4xf32>
   func.return %0 : tensor<4xf32>
 }
 
+// CHECK-LABEL:   func @broadcast_maximum(
+// CHECK-SAME:                        %[[VAL_0:.*]]: tensor<1x1xf32>,
+// CHECK-SAME:                        %[[VAL_1:.*]]: tensor<1x1000xf32>) -> (tensor<1x1000xf32>, tensor<1x1000xf32>) {
+// CHECK-DAG.       %cst = arith.constant dense<[1, 1000]> : tensor<2xi64>
+// CHECK:           %[[VAL_2:.*]] = "tf.Maximum"(%[[VAL_0]], %[[VAL_1]]) : (tensor<1x1xf32>, tensor<1x1000xf32>) -> tensor<1x1000xf32>
+// CHECK:           %[[VAL_3:.*]] = "tf.Maximum"(%[[VAL_1]], %[[VAL_0]]) : (tensor<1x1000xf32>, tensor<1x1xf32>) -> tensor<1x1000xf32>
+// CHECK:           return %[[VAL_2]], %[[VAL_3]] : tensor<1x1000xf32>, tensor<1x1000xf32>
+// CHECK:         }
+func.func @broadcast_maximum(%arg0: tensor<1x1xf32>, %arg1: tensor<1x1000xf32>) -> (tensor<1x1000xf32>, tensor<1x1000xf32>) {
+  %0 = "mhlo.broadcast_in_dim"(%arg0) {broadcast_dimensions = dense<[0, 1]> : tensor<2xi64>} : (tensor<1x1xf32>) -> tensor<1x1000xf32>
+  %1 = mhlo.maximum %0, %arg1 : tensor<1x1000xf32>
+  %2 = mhlo.maximum %arg1, %0 : tensor<1x1000xf32>
+  func.return %1, %2 : tensor<1x1000xf32>, tensor<1x1000xf32>
+}
+
 // CHECK-LABEL:   func @minimum(
 // CHECK-SAME:                  %[[VAL_0:.*]]: tensor<4xf32>,
 // CHECK-SAME:                  %[[VAL_1:.*]]: tensor<4xf32>) -> tensor<4xf32> {
@@ -140,6 +200,21 @@ func.func @minimum(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>) -> tensor<4xf32>
   func.return %0 : tensor<4xf32>
 }
 
+// CHECK-LABEL:   func @broadcast_minimum(
+// CHECK-SAME:                        %[[VAL_0:.*]]: tensor<1x1xf32>,
+// CHECK-SAME:                        %[[VAL_1:.*]]: tensor<1x1000xf32>) -> (tensor<1x1000xf32>, tensor<1x1000xf32>) {
+// CHECK-DAG.       %cst = arith.constant dense<[1, 1000]> : tensor<2xi64>
+// CHECK:           %[[VAL_2:.*]] = "tf.Minimum"(%[[VAL_0]], %[[VAL_1]]) : (tensor<1x1xf32>, tensor<1x1000xf32>) -> tensor<1x1000xf32>
+// CHECK:           %[[VAL_3:.*]] = "tf.Minimum"(%[[VAL_1]], %[[VAL_0]]) : (tensor<1x1000xf32>, tensor<1x1xf32>) -> tensor<1x1000xf32>
+// CHECK:           return %[[VAL_2]], %[[VAL_3]] : tensor<1x1000xf32>, tensor<1x1000xf32>
+// CHECK:         }
+func.func @broadcast_minimum(%arg0: tensor<1x1xf32>, %arg1: tensor<1x1000xf32>) -> (tensor<1x1000xf32>, tensor<1x1000xf32>) {
+  %0 = "mhlo.broadcast_in_dim"(%arg0) {broadcast_dimensions = dense<[0, 1]> : tensor<2xi64>} : (tensor<1x1xf32>) -> tensor<1x1000xf32>
+  %1 = mhlo.minimum %0, %arg1 : tensor<1x1000xf32>
+  %2 = mhlo.minimum %arg1, %0 : tensor<1x1000xf32>
+  func.return %1, %2 : tensor<1x1000xf32>, tensor<1x1000xf32>
+}
+
 // CHECK-LABEL:   func @mul(
 // CHECK-SAME:              %[[VAL_0:.*]]: tensor<2xi32>) -> tensor<2xi32> {
 // CHECK:           %[[VAL_1:.*]] = "tf.Mul"(%[[VAL_0]], %[[VAL_0]]) : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi32>
@@ -151,12 +226,27 @@ func.func @mul(%arg0: tensor<2xi32>) -> tensor<2xi32> {
 }
 
 // CHECK-LABEL:   func @broadcast_mul(
+// CHECK-SAME:                        %[[VAL_0:.*]]: tensor<1x1xf32>,
+// CHECK-SAME:                        %[[VAL_1:.*]]: tensor<1x1000xf32>) -> (tensor<1x1000xf32>, tensor<1x1000xf32>) {
+// CHECK-DAG.       %cst = arith.constant dense<[1, 1000]> : tensor<2xi64>
+// CHECK:           %[[VAL_2:.*]] = "tf.Mul"(%[[VAL_0]], %[[VAL_1]]) : (tensor<1x1xf32>, tensor<1x1000xf32>) -> tensor<1x1000xf32>
+// CHECK:           %[[VAL_3:.*]] = "tf.Mul"(%[[VAL_1]], %[[VAL_0]]) : (tensor<1x1000xf32>, tensor<1x1xf32>) -> tensor<1x1000xf32>
+// CHECK:           return %[[VAL_2]], %[[VAL_3]] : tensor<1x1000xf32>, tensor<1x1000xf32>
+// CHECK:         }
+func.func @broadcast_mul(%arg0: tensor<1x1xf32>, %arg1: tensor<1x1000xf32>) -> (tensor<1x1000xf32>, tensor<1x1000xf32>) {
+  %0 = "mhlo.broadcast_in_dim"(%arg0) {broadcast_dimensions = dense<[0, 1]> : tensor<2xi64>} : (tensor<1x1xf32>) -> tensor<1x1000xf32>
+  %1 = mhlo.multiply %0, %arg1 : tensor<1x1000xf32>
+  %2 = mhlo.multiply %arg1, %0 : tensor<1x1000xf32>
+  func.return %1, %2 : tensor<1x1000xf32>, tensor<1x1000xf32>
+}
+
+// CHECK-LABEL:   func @broadcast_mul_chlo(
 // CHECK-SAME:                        %[[VAL_0:.*]]: tensor<1xi32>,
 // CHECK-SAME:                        %[[VAL_1:.*]]: tensor<1x2xi32>) -> tensor<1x2xi32> {
 // CHECK:           %[[VAL_2:.*]] = "tf.Mul"(%[[VAL_0]], %[[VAL_1]]) : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi32>
 // CHECK:           return %[[VAL_2]] : tensor<1x2xi32>
 // CHECK:         }
-func.func @broadcast_mul(%arg0: tensor<1xi32>, %arg1: tensor<1x2xi32>) -> tensor<1x2xi32> {
+func.func @broadcast_mul_chlo(%arg0: tensor<1xi32>, %arg1: tensor<1x2xi32>) -> tensor<1x2xi32> {
   %0 = "chlo.broadcast_multiply"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi32>
   func.return %0 : tensor<1x2xi32>
 }
@@ -193,16 +283,46 @@ func.func @sub(%arg0: tensor<2xi32>) -> tensor<2xi32> {
 }
 
 // CHECK-LABEL:   func @broadcast_sub(
+// CHECK-SAME:                        %[[VAL_0:.*]]: tensor<1x1xf32>,
+// CHECK-SAME:                        %[[VAL_1:.*]]: tensor<1x1000xf32>) -> (tensor<1x1000xf32>, tensor<1x1000xf32>) {
+// CHECK-DAG.       %cst = arith.constant dense<[1, 1000]> : tensor<2xi64>
+// CHECK:           %[[VAL_2:.*]] = "tf.Sub"(%[[VAL_0]], %[[VAL_1]]) : (tensor<1x1xf32>, tensor<1x1000xf32>) -> tensor<1x1000xf32>
+// CHECK:           %[[VAL_3:.*]] = "tf.Sub"(%[[VAL_1]], %[[VAL_0]]) : (tensor<1x1000xf32>, tensor<1x1xf32>) -> tensor<1x1000xf32>
+// CHECK:           return %[[VAL_2]], %[[VAL_3]] : tensor<1x1000xf32>, tensor<1x1000xf32>
+// CHECK:         }
+func.func @broadcast_sub(%arg0: tensor<1x1xf32>, %arg1: tensor<1x1000xf32>) -> (tensor<1x1000xf32>, tensor<1x1000xf32>) {
+  %0 = "mhlo.broadcast_in_dim"(%arg0) {broadcast_dimensions = dense<[0, 1]> : tensor<2xi64>} : (tensor<1x1xf32>) -> tensor<1x1000xf32>
+  %1 = mhlo.subtract %0, %arg1 : tensor<1x1000xf32>
+  %2 = mhlo.subtract %arg1, %0 : tensor<1x1000xf32>
+  func.return %1, %2 : tensor<1x1000xf32>, tensor<1x1000xf32>
+}
+
+// CHECK-LABEL:   func @broadcast_sub_chlo(
 // CHECK-SAME:                        %[[VAL_0:.*]]: tensor<1xi32>,
 // CHECK-SAME:                        %[[VAL_1:.*]]: tensor<1x2xi32>) -> tensor<1x2xi32> {
 // CHECK:           %[[VAL_2:.*]] = "tf.Sub"(%[[VAL_0]], %[[VAL_1]]) : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi32>
 // CHECK:           return %[[VAL_2]] : tensor<1x2xi32>
 // CHECK:         }
-func.func @broadcast_sub(%arg0: tensor<1xi32>, %arg1: tensor<1x2xi32>) -> tensor<1x2xi32> {
+func.func @broadcast_sub_chlo(%arg0: tensor<1xi32>, %arg1: tensor<1x2xi32>) -> tensor<1x2xi32> {
   %0 = "chlo.broadcast_subtract"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi32>
   func.return %0 : tensor<1x2xi32>
 }
 
+// CHECK-LABEL:   func @broadcast_atan2(
+// CHECK-SAME:                        %[[VAL_0:.*]]: tensor<1xf32>,
+// CHECK-SAME:                        %[[VAL_1:.*]]: tensor<4xf32>) -> (tensor<4xf32>, tensor<4xf32>) {
+// CHECK-DAG.       %cst = arith.constant dense<[4]> : tensor<1xi64>
+// CHECK:           %[[VAL_2:.*]] = "tf.Atan2"(%[[VAL_0]], %[[VAL_1]]) : (tensor<1xf32>, tensor<4xf32>) -> tensor<4xf32>
+// CHECK:           %[[VAL_3:.*]] = "tf.Atan2"(%[[VAL_1]], %[[VAL_0]]) : (tensor<4xf32>, tensor<1xf32>) -> tensor<4xf32>
+// CHECK:           return %[[VAL_2]], %[[VAL_3]] : tensor<4xf32>, tensor<4xf32>
+// CHECK:         }
+func.func @broadcast_atan2(%arg0: tensor<1xf32>, %arg1: tensor<4xf32>) -> (tensor<4xf32>, tensor<4xf32>) {
+  %0 = "mhlo.broadcast_in_dim"(%arg0) {broadcast_dimensions = dense<[0]> : tensor<1xi64>} : (tensor<1xf32>) -> tensor<4xf32>
+  %1 = mhlo.atan2 %0, %arg1 : tensor<4xf32>
+  %2 = mhlo.atan2 %arg1, %0 : tensor<4xf32>
+  func.return %1, %2 : tensor<4xf32>, tensor<4xf32>
+}
+
 // CHECK-LABEL:   func @shift_right(
 // CHECK-SAME:                      %[[VAL_0:.*]]: tensor<4xi32>,
 // CHECK-SAME:                      %[[VAL_1:.*]]: tensor<4xi32>) -> tensor<4xi32> {
@@ -367,6 +487,21 @@ func.func @pow(%arg0: tensor<2xf32>) -> tensor<2xf32> {
   func.return %0 : tensor<2xf32>
 }
 
+// CHECK-LABEL:   func @broadcast_pow(
+// CHECK-SAME:                        %[[VAL_0:.*]]: tensor<1xi32>,
+// CHECK-SAME:                        %[[VAL_1:.*]]: tensor<4xi32>) -> (tensor<4xi32>, tensor<4xi32>) {
+// CHECK-DAG.       %cst = arith.constant dense<[4]> : tensor<1xi64>
+// CHECK:           %[[VAL_2:.*]] = "tf.Pow"(%[[VAL_0]], %[[VAL_1]]) : (tensor<1xi32>, tensor<4xi32>) -> tensor<4xi32>
+// CHECK:           %[[VAL_3:.*]] = "tf.Pow"(%[[VAL_1]], %[[VAL_0]]) : (tensor<4xi32>, tensor<1xi32>) -> tensor<4xi32>
+// CHECK:           return %[[VAL_2]], %[[VAL_3]] : tensor<4xi32>, tensor<4xi32>
+// CHECK:         }
+func.func @broadcast_pow(%arg0: tensor<1xi32>, %arg1: tensor<4xi32>) -> (tensor<4xi32>, tensor<4xi32>) {
+  %0 = "mhlo.broadcast_in_dim"(%arg0) {broadcast_dimensions = dense<[0]> : tensor<1xi64>} : (tensor<1xi32>) -> tensor<4xi32>
+  %1 = mhlo.power %0, %arg1 : tensor<4xi32>
+  %2 = mhlo.power %arg1, %0 : tensor<4xi32>
+  func.return %1, %2 : tensor<4xi32>, tensor<4xi32>
+}
+
 // CHECK-LABEL:   func @pow_dynamic(
 // CHECK-SAME:                      %[[VAL_0:.*]]: tensor<?xf32>) -> tensor<?xf32> {
 // CHECK:           %[[VAL_1:.*]] = "tf.Pow"(%[[VAL_0]], %[[VAL_0]]) : (tensor<?xf32>, tensor<?xf32>) -> tensor<?xf32>
diff --git a/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_patterns.td b/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_patterns.td
index 3ba9d1911ffab6..9cf9aa518849f5 100644
--- a/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_patterns.td
+++ b/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_patterns.td
@@ -64,6 +64,18 @@ foreach fromToBinPair = [[MHLO_AddOp, CHLO_BroadcastAddOp, TF_AddV2Op],
   def : Pat<(fromToBinPair[1] $l, $r, $broadcast_dimensions),
             (fromToBinPair[2] $l, $r),
             [(IsLegalNumpyRankedBroadcast $l, $r, $broadcast_dimensions)]>;
+  def : Pat<(fromToBinPair[0] $l,
+                              (MHLO_BroadcastInDimOp:$output
+                                $bcast_operand,
+                                $broadcast_dimensions)),
+            (fromToBinPair[2] $l, $bcast_operand),
+            [(IsTFStyleBroadcast $broadcast_dimensions, $output)]>;
+  def : Pat<(fromToBinPair[0] (MHLO_BroadcastInDimOp:$output
+                                $bcast_operand,
+                                $broadcast_dimensions),
+                              $r),
+            (fromToBinPair[2] $bcast_operand, $r),
+            [(IsTFStyleBroadcast $broadcast_dimensions, $output)]>;
 }
 
 foreach pair  = [[MHLO_AndOp, CHLO_BroadcastAndOp, TF_BitwiseAndOp],

From 036b68f244770a268ef3025bbeaac89793cf22bf Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 25 Sep 2023 10:08:00 -0700
Subject: [PATCH 221/567] Add dependency from source node to VarHandleOp

This guarantees that the graph's source node is connected to all source nodes in the graph.

PiperOrigin-RevId: 568252463
---
 tensorflow/core/tpu/kernels/tpu_functional_ops.cc | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/tensorflow/core/tpu/kernels/tpu_functional_ops.cc b/tensorflow/core/tpu/kernels/tpu_functional_ops.cc
index a9f1aa375f0b9c..12174b42276a9c 100644
--- a/tensorflow/core/tpu/kernels/tpu_functional_ops.cc
+++ b/tensorflow/core/tpu/kernels/tpu_functional_ops.cc
@@ -1951,6 +1951,12 @@ Status TPUPartitionedCallOp::ReplaceAndPartitionXLAShardingVariable(
     AddNodeAttr("shape", proto, &ndef);
 
     TF_ASSIGN_OR_RETURN(Node * new_node, graph->AddNode(ndef));
+
+    // connect new node to source graph, so it can meet the graph specification
+    for (const Edge* edge : variable->in_edges()) {
+      graph->AddEdge(edge->src(), edge->src_output(), new_node,
+                     edge->dst_input());
+    }
     per_core_vars.push_back(new_node);
   }
 

From edf7215123c67d76199d099779137b974b6e1293 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 25 Sep 2023 10:08:22 -0700
Subject: [PATCH 222/567] [mlir][sparse][xla-cpu] Add rewriting rules to
 legalize sparse_tensor.convert

PiperOrigin-RevId: 568252598
---
 .../legalize_sparse_ops/sparse_ops_to_custom_calls.cc          | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/third_party/xla/xla/mlir_hlo/mhlo/transforms/legalize_sparse_ops/sparse_ops_to_custom_calls.cc b/third_party/xla/xla/mlir_hlo/mhlo/transforms/legalize_sparse_ops/sparse_ops_to_custom_calls.cc
index 69d061517a47a2..fa17f5145fa87f 100644
--- a/third_party/xla/xla/mlir_hlo/mhlo/transforms/legalize_sparse_ops/sparse_ops_to_custom_calls.cc
+++ b/third_party/xla/xla/mlir_hlo/mhlo/transforms/legalize_sparse_ops/sparse_ops_to_custom_calls.cc
@@ -57,7 +57,8 @@ void populateLegalizeSparseOpsToCustomCallPatterns(
     MLIRContext* context, TypeConverter& typeConverter,
     RewritePatternSet* patterns) {
   patterns->add<SparseOpToCustomCallConverter<sparse_tensor::PackOp>,
-                SparseOpToCustomCallConverter<sparse_tensor::UnpackOp>>(
+                SparseOpToCustomCallConverter<sparse_tensor::UnpackOp>,
+                SparseOpToCustomCallConverter<sparse_tensor::ConvertOp>>(
       typeConverter, context);
 }
 

From 8e7844a87b8bf640a3bf27040644060bce23119e Mon Sep 17 00:00:00 2001
From: Peter Hawkins <phawkins@google.com>
Date: Mon, 25 Sep 2023 10:15:18 -0700
Subject: [PATCH 223/567] Add nccl to sigbuild and JAX Dockerfiles.

https://github.com/openxla/xla/pull/5850/files adds the ability to use a binary NCCL via a stub, just as with NVIDIA's other libraries. Make sure that NCCL is available in CI dockerfiles.

PiperOrigin-RevId: 568254770
---
 tensorflow/tools/tf_sig_build_dockerfiles/devel.packages.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tensorflow/tools/tf_sig_build_dockerfiles/devel.packages.txt b/tensorflow/tools/tf_sig_build_dockerfiles/devel.packages.txt
index 50c4eca080a4b5..311a4d905814a0 100644
--- a/tensorflow/tools/tf_sig_build_dockerfiles/devel.packages.txt
+++ b/tensorflow/tools/tf_sig_build_dockerfiles/devel.packages.txt
@@ -11,6 +11,7 @@ libcurand-12-2
 libcusolver-dev-12-2
 libcusparse-dev-12-2
 libcublas-dev-12-2
+libnccl-dev=2.18.5-1+cuda12.2
 # CuDNN: https://docs.nvidia.com/deeplearning/sdk/cudnn-install/index.html#ubuntu-network-installation
 libcudnn8-dev=8.9.4.25-1+cuda12.2
 libcudnn8=8.9.4.25-1+cuda12.2

From 26d09c4465b8d9376d574f7728092793a02deac9 Mon Sep 17 00:00:00 2001
From: Jieying Luo <jieying@google.com>
Date: Mon, 25 Sep 2023 10:51:12 -0700
Subject: [PATCH 224/567] [PJRT C API] Implement framework side change for
 registering a custom call.

- Add a py extension to call the custom call C API.
- Change the implementation of register_custom_call_target to store handlers for the custom call targets and delay the registration until the handler for a xla platform is registered.
- Change register_plugin to load PJRT plugin when register_plugin is called (instead of when a client is created), and let it return the PJRT_Api* loaded.
- Delay calling discover_pjrt_plugins() and register_pjrt_plugin_factories_from_env() until the first time backends() is called.

PiperOrigin-RevId: 568265745
---
 third_party/xla/xla/pjrt/pjrt_api.cc      |  8 ++--
 third_party/xla/xla/pjrt/pjrt_api.h       |  7 +--
 third_party/xla/xla/python/BUILD          |  2 +
 third_party/xla/xla/python/xla.cc         |  8 +++-
 third_party/xla/xla/python/xla_client.py  | 55 ++++++++++++++++++++---
 third_party/xla/xla/python/xla_client.pyi |  8 +++-
 6 files changed, 74 insertions(+), 14 deletions(-)

diff --git a/third_party/xla/xla/pjrt/pjrt_api.cc b/third_party/xla/xla/pjrt/pjrt_api.cc
index 170edcb6f447e3..84dbad9687d048 100644
--- a/third_party/xla/xla/pjrt/pjrt_api.cc
+++ b/third_party/xla/xla/pjrt/pjrt_api.cc
@@ -79,8 +79,8 @@ xla::Status SetPjrtApi(absl::string_view device_type, const PJRT_Api* api) {
 }
 
 typedef const PJRT_Api* (*PjrtApiInitFn)();
-xla::Status LoadPjrtPlugin(absl::string_view device_type,
-                           absl::string_view library_path) {
+xla::StatusOr<const PJRT_Api*> LoadPjrtPlugin(absl::string_view device_type,
+                                              absl::string_view library_path) {
 #ifdef PLATFORM_WINDOWS
   return tsl::errors::Unimplemented(
       "LoadPjrtPlugin is not implemented on windows yet.");
@@ -97,7 +97,9 @@ xla::Status LoadPjrtPlugin(absl::string_view device_type,
   }
   LOG(INFO) << "GetPjrtApi was found for " << device_type << " at "
             << library_path;
-  return SetPjrtApi(device_type, init_fn());
+  const PJRT_Api* api = init_fn();
+  TF_RETURN_IF_ERROR(SetPjrtApi(device_type, api));
+  return api;
 #endif
 }
 
diff --git a/third_party/xla/xla/pjrt/pjrt_api.h b/third_party/xla/xla/pjrt/pjrt_api.h
index 4b089b1214ac48..8a1d3cfe7468c7 100644
--- a/third_party/xla/xla/pjrt/pjrt_api.h
+++ b/third_party/xla/xla/pjrt/pjrt_api.h
@@ -31,9 +31,10 @@ xla::Status SetPjrtApi(absl::string_view device_type, const PJRT_Api* api);
 // Loads a PJRT plugin. The library provided by library_path must export a
 // symbol called `GetPjrtApi` with function signature `const PJRT_Api*
 // GetPjrtApi()`. This method dlopen the plugin library, dlsym `GetPjrtApi`,
-// calls `GetPjrtApi` and `SetPjrtApi`.
-xla::Status LoadPjrtPlugin(absl::string_view device_type,
-                           absl::string_view library_path);
+// calls `GetPjrtApi` and `SetPjrtApi`. Returns the loaded PJRT_Api* if
+// successful.
+xla::StatusOr<const PJRT_Api*> LoadPjrtPlugin(absl::string_view device_type,
+                                              absl::string_view library_path);
 
 // Requires that SetPjrtApi has been successfully called on `device_type` before
 // calling this method.
diff --git a/third_party/xla/xla/python/BUILD b/third_party/xla/xla/python/BUILD
index d5f3d9197fbab5..44852eec3fb6f1 100644
--- a/third_party/xla/xla/python/BUILD
+++ b/third_party/xla/xla/python/BUILD
@@ -1071,6 +1071,7 @@ cc_library(
         ":weakref_lru_cache",
         ":xla_compiler",
         # placeholder for index annotation deps
+        "@com_google_absl//absl/base",
         "@com_google_absl//absl/log:initialize",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/strings:str_format",
@@ -1089,6 +1090,7 @@ cc_library(
         "//xla/pjrt:pjrt_client",
         "//xla/pjrt:pjrt_compiler",
         "//xla/pjrt:tfrt_cpu_pjrt_client",
+        "//xla/pjrt/c:pjrt_c_api_hdrs",
         "//xla/pjrt/distributed",
         "//xla/pjrt/distributed:client",
         "//xla/pjrt/distributed:protocol_proto_cc",
diff --git a/third_party/xla/xla/python/xla.cc b/third_party/xla/xla/python/xla.cc
index c7277c0d501335..46053a1009a0ae 100644
--- a/third_party/xla/xla/python/xla.cc
+++ b/third_party/xla/xla/python/xla.cc
@@ -26,11 +26,13 @@ limitations under the License.
 #include <variant>
 #include <vector>
 
+#include "absl/base/casts.h"
 // clang-format off
 // Must be included first
 #include "absl/strings/str_cat.h"
 #include "absl/strings/str_join.h"
 #include "absl/synchronization/mutex.h"
+#include "xla/pjrt/c/pjrt_c_api.h"
 #include "xla/pjrt/distributed/protocol.pb.h"
 #include "xla/python/py_client.h"
 #include "tsl/python/lib/core/numpy.h"  //NOLINT
@@ -472,8 +474,10 @@ static void Init(py::module_& m) {
     return pjrt_api.ok();
   });
   m.def("load_pjrt_plugin",
-        [](std::string platform_name, std::string library_path) {
-          xla::ThrowIfError(pjrt::LoadPjrtPlugin(platform_name, library_path));
+        [](std::string platform_name, std::string library_path) -> py::capsule {
+          const PJRT_Api* api = xla::ValueOrThrow(
+              pjrt::LoadPjrtPlugin(platform_name, library_path));
+          return py::capsule(absl::bit_cast<void*>(api), "pjrt_c_api");
         });
   m.def("pjrt_plugin_initialized", [](std::string platform_name) -> bool {
     return xla::ValueOrThrow(pjrt::IsPjrtPluginInitialized(platform_name));
diff --git a/third_party/xla/xla/python/xla_client.py b/third_party/xla/xla/python/xla_client.py
index d6172122d20e4f..e0d48c09120db1 100644
--- a/third_party/xla/xla/python/xla_client.py
+++ b/third_party/xla/xla/python/xla_client.py
@@ -21,6 +21,7 @@
 import inspect
 import logging
 import os
+import threading
 from typing import Any, List, Mapping, Optional, Sequence, Tuple, Union
 
 from . import xla_extension as _xla
@@ -44,7 +45,7 @@
 
 # Just an internal arbitrary increasing number to help with backward-compatible
 # changes. In JAX, reference this via jax._src.lib.xla_extension_version.
-_version = 197
+_version = 198
 
 # Version number for MLIR:Python components.
 mlir_api_version = 54
@@ -60,6 +61,9 @@
 
 
 def make_cpu_client() -> ...:
+  register_custom_call_handler(
+      'cpu', _xla.register_custom_call_target
+  )
   return _xla.get_tfrt_cpu_client(asynchronous=True)
 
 
@@ -91,6 +95,12 @@ def make_gpu_client(
   if memory_fraction:
     config.memory_fraction = float(memory_fraction)
   config.preallocate = preallocate not in ('0', 'false', 'False')
+  register_custom_call_handler(
+      'CUDA', _xla.register_custom_call_target
+  )
+  register_custom_call_handler(
+      'ROCM', _xla.register_custom_call_target
+  )
 
   if mock:
     return _xla.get_mock_gpu_client(
@@ -137,8 +147,8 @@ def pjrt_plugin_loaded(plugin_name: str) -> bool:
   return _xla.pjrt_plugin_loaded(plugin_name)
 
 
-def load_pjrt_plugin_dynamically(plugin_name: str, library_path: str) -> None:
-  _xla.load_pjrt_plugin(plugin_name, library_path)
+def load_pjrt_plugin_dynamically(plugin_name: str, library_path: str) -> Any:
+  return _xla.load_pjrt_plugin(plugin_name, library_path)
 
 
 def pjrt_plugin_initialized(plugin_name: str) -> bool:
@@ -509,6 +519,12 @@ def LoadedExecutable_execute_with_token(self, arguments, device=None):
 LoadedExecutable.execute_with_token = LoadedExecutable_execute_with_token
 
 
+_custom_callback_handler: dict[str, Any] = {}
+# Key is xla_platform_name, value is (function_name, function)
+_custom_callback: dict[str, list[Tuple[str, Any]]] = {}
+_custom_callback_lock = threading.Lock()
+
+
 def register_custom_call_target(
     name: str, fn: Any, platform: str = 'cpu'
 ) -> None:
@@ -521,8 +537,37 @@ def register_custom_call_target(
   """
   # To support AMD GPUs, we need to have xla_platform_names["gpu"] == "ROCM"
   # Since that is hardcoded to CUDA, we are using the following as workaround.
-  _xla.register_custom_call_target(name, fn,
-                                   xla_platform_names.get(platform, platform))
+  xla_platform_name = xla_platform_names.get(platform, platform)
+  with _custom_callback_lock:
+    if xla_platform_name in _custom_callback_handler:
+      _custom_callback_handler[xla_platform_name](name, fn, xla_platform_name)
+    else:
+      _custom_callback.setdefault(xla_platform_name, []).append((name, fn))
+
+
+def register_custom_call_handler(platform: str, handler: Any) -> None:
+  """Registers a custom handler and uses it to register existing custom calls.
+
+  If a custom call handler for the platform already exists, calling this method
+  is a no-op and it will not register a new handler.
+  Args:
+    platform: the target platform.
+    handler: the function to register a custom call.
+  """
+  xla_platform_name = xla_platform_names.get(platform, platform)
+  with _custom_callback_lock:
+    if xla_platform_name in _custom_callback_handler:
+      logger.debug(
+          'Custom call handler for %s is already register. Will not register a'
+          ' new one',
+          xla_platform_name,
+      )
+      return
+    _custom_callback_handler[xla_platform_name] = handler
+    if xla_platform_name in _custom_callback:
+      for name, fn in _custom_callback[xla_platform_name]:
+        handler(name, fn, xla_platform_name)
+      del _custom_callback[xla_platform_name]
 
 
 register_custom_call_partitioner = _xla.register_custom_call_partitioner
diff --git a/third_party/xla/xla/python/xla_client.pyi b/third_party/xla/xla/python/xla_client.pyi
index a6f48f7a0d469e..7a01e4817a5364 100644
--- a/third_party/xla/xla/python/xla_client.pyi
+++ b/third_party/xla/xla/python/xla_client.pyi
@@ -119,7 +119,7 @@ def make_c_api_client(
 def pjrt_plugin_loaded(plugin_name: str) -> bool:
   ...
 
-def load_pjrt_plugin_dynamically(plugin_name: str, library_path: str) -> None:
+def load_pjrt_plugin_dynamically(plugin_name: str, library_path: str) -> Any:
   ...
 
 
@@ -249,4 +249,10 @@ def register_custom_call_target(
     name: str, fn: Callable, platform: str = ...
 ) -> None:
   ...
+
+
+def register_custom_call_handler(xla_platform_name: str, handler: Any) -> None:
+  ...
+
+
 def encode_inspect_sharding_callback(handler: Any) -> bytes: ...

From 397c205a3fa7b3c7c481f2224741c8b8a5e3523d Mon Sep 17 00:00:00 2001
From: Francois Chollet <fchollet@google.com>
Date: Mon, 25 Sep 2023 11:00:39 -0700
Subject: [PATCH 225/567] Enable imports such as `from tensorflow.keras import
 layers`

PiperOrigin-RevId: 568268550
---
 tensorflow/api_template.__init__.py    | 19 ++++++++++++-------
 tensorflow/api_template_v1.__init__.py |  7 ++++++-
 2 files changed, 18 insertions(+), 8 deletions(-)

diff --git a/tensorflow/api_template.__init__.py b/tensorflow/api_template.__init__.py
index 8b789f7202db6d..6894ab538894d6 100644
--- a/tensorflow/api_template.__init__.py
+++ b/tensorflow/api_template.__init__.py
@@ -112,7 +112,7 @@
     if _keras_module.__version__.startswith("3."):
       # This is the Keras 3.x case.
       _keras_to_use = _keras_module._tf_keras
-      _keras_package_name = "keras._tf_keras"
+      _keras_package_name = "keras._tf_keras.keras"
       _keras_version = "keras_3"
     else:
       # This is the Keras 2.x case.
@@ -123,7 +123,12 @@
     pass
 
 if _keras_to_use is not None:
-  setattr(_current_module, "keras", _keras_to_use)
+  _module_dir = _module_util.get_parent_dir_for_name(_keras_package_name)
+  if _module_dir:
+    _current_module.__path__ = [_module_dir] + _current_module.__path__
+  setattr(_current_module,
+          "keras",
+          _LazyLoader("keras", globals(), _keras_package_name))
 else:
     # TF will not have `tf.keras` in this case. This should not be silent.
   _logging.warning("Unable to load `tf.keras`. Check that the `keras` package "
@@ -235,11 +240,11 @@ def _running_from_pip_package():
     from tf_keras.api._v2.keras import optimizers
     from tf_keras.api._v2.keras import initializers
   elif _keras_version == "keras_3":
-    from keras import _tf_keras as keras
-    from keras._tf_keras import losses
-    from keras._tf_keras import metrics
-    from keras._tf_keras import optimizers
-    from keras._tf_keras import initializers
+    from keras._tf_keras import keras
+    from keras._tf_keras.keras import losses
+    from keras._tf_keras.keras import metrics
+    from keras._tf_keras.keras import optimizers
+    from keras._tf_keras.keras import initializers
 
 # pylint: enable=undefined-variable
 
diff --git a/tensorflow/api_template_v1.__init__.py b/tensorflow/api_template_v1.__init__.py
index fe250ed3df42f5..82ef4315f4aef1 100644
--- a/tensorflow/api_template_v1.__init__.py
+++ b/tensorflow/api_template_v1.__init__.py
@@ -112,7 +112,12 @@
     pass
 
 if _keras_to_use is not None:
-  setattr(_current_module, "keras", _keras_to_use)
+  _module_dir = _module_util.get_parent_dir_for_name(_keras_package_name)
+  if _module_dir:
+    _current_module.__path__ = [_module_dir] + _current_module.__path__
+  setattr(_current_module,
+          "keras",
+          _LazyLoader("keras", globals(), _keras_package_name))
 else:
     # TF will not have `tf.keras` in this case. This should not be silent.
   _logging.warning("Unable to load `tf.keras`. Check that the `keras` package "

From dc60587c40fcfa7e22111e76b2e4ec0a75d561a3 Mon Sep 17 00:00:00 2001
From: George Karpenkov <cheshire@google.com>
Date: Mon, 25 Sep 2023 11:12:05 -0700
Subject: [PATCH 226/567] [XLA:GPU] Support HLO input for AOT compilation

PiperOrigin-RevId: 568271995
---
 third_party/xla/xla/service/BUILD             | 10 ++++++
 .../xla/service/xla_aot_compile_gpu_test.cc   | 14 ++++++--
 .../xla/xla/service/xla_aot_compile_test.hlo  |  7 ++++
 third_party/xla/xla/service/xla_compile.bzl   |  3 +-
 .../xla/xla/service/xla_compile_main.cc       | 32 +++++++++++++------
 5 files changed, 53 insertions(+), 13 deletions(-)
 create mode 100644 third_party/xla/xla/service/xla_aot_compile_test.hlo

diff --git a/third_party/xla/xla/service/BUILD b/third_party/xla/xla/service/BUILD
index 5cd69b9ca4f2fd..2a4e6d77e45cba 100644
--- a/third_party/xla/xla/service/BUILD
+++ b/third_party/xla/xla/service/BUILD
@@ -6968,11 +6968,13 @@ xla_cc_binary(
         "//xla/service:cpu_plugin",
         "//xla/service/cpu:cpu_compiler",
         "//xla/service/cpu:cpu_executable",
+        "//xla/tools:hlo_module_loader",
         "@llvm-project//mlir:ArithDialect",
         "@llvm-project//mlir:FuncDialect",
         "@llvm-project//mlir:IR",
         "@llvm-project//mlir:Parser",
         "@local_tsl//tsl/platform:env",
+        "@local_tsl//tsl/platform:path",
         "@local_tsl//tsl/platform:platform_port",
         "@local_tsl//tsl/platform:protobuf",
         "@local_tsl//tsl/util:command_line_flags",
@@ -7010,6 +7012,13 @@ xla_aot_compile_gpu(
     module = "xla_aot_compile_test.mlir",
 )
 
+xla_aot_compile_gpu(
+    name = "xla_aot_compile_test_gpu_executable_hlo",
+    autotune_results = "xla_aot_compile_test_autotune_results.prototxt",
+    gpu_target_config = "xla_aot_compile_test_gpu_target_config.prototxt",
+    module = "xla_aot_compile_test.hlo",
+)
+
 xla_aot_compile_gpu(
     name = "xla_aot_compile_test_gpu_executable_constant",
     autotune_results = "xla_aot_compile_test_autotune_results.prototxt",
@@ -7092,6 +7101,7 @@ xla_cc_test(
     srcs = if_cuda_is_configured(["xla_aot_compile_gpu_test.cc"]),
     data = if_cuda_is_configured([
         ":xla_aot_compile_test_gpu_executable",
+        ":xla_aot_compile_test_gpu_executable_hlo",
         ":xla_aot_compile_test_gpu_executable_constant",
         ":xla_aot_compile_test_gpu_executable_gemm",
         ":xla_aot_compile_test_gpu_executable_gemm_runtime_autotuning",
diff --git a/third_party/xla/xla/service/xla_aot_compile_gpu_test.cc b/third_party/xla/xla/service/xla_aot_compile_gpu_test.cc
index d4c8fd91787a7a..2e1283f6d13f0d 100644
--- a/third_party/xla/xla/service/xla_aot_compile_gpu_test.cc
+++ b/third_party/xla/xla/service/xla_aot_compile_gpu_test.cc
@@ -31,9 +31,12 @@ namespace xla {
 namespace xla_compile {
 namespace {
 
-TEST(XlaCompileTest, LoadGpuExecutable) {
-  std::string path = tsl::io::JoinPath(tsl::testing::XlaSrcRoot(), "service",
-                                       "xla_aot_compile_test_gpu_executable");
+class XlaAotCompileTest : public ::testing::TestWithParam<absl::string_view> {};
+
+TEST_P(XlaAotCompileTest, LoadGpuExecutable) {
+  std::string path =
+      tsl::io::JoinPath(tsl::testing::XlaSrcRoot(), "service", GetParam()
+                        /*"xla_aot_compile_test_gpu_executable"*/);
   std::string serialized_aot_result;
   TF_ASSERT_OK(
       tsl::ReadFileToString(tsl::Env::Default(), path, &serialized_aot_result));
@@ -76,6 +79,11 @@ TEST(XlaCompileTest, LoadGpuExecutable) {
   EXPECT_EQ(expected, output);
 }
 
+INSTANTIATE_TEST_SUITE_P(
+    TestingAotFormats, XlaAotCompileTest,
+    testing::Values("xla_aot_compile_test_gpu_executable",
+                    "xla_aot_compile_test_gpu_executable_hlo"));
+
 TEST(XlaCompileTest, LoadGpuExecutableWithConstant) {
   std::string path =
       tsl::io::JoinPath(tsl::testing::XlaSrcRoot(), "service",
diff --git a/third_party/xla/xla/service/xla_aot_compile_test.hlo b/third_party/xla/xla/service/xla_aot_compile_test.hlo
new file mode 100644
index 00000000000000..a1c0d016f0f0f5
--- /dev/null
+++ b/third_party/xla/xla/service/xla_aot_compile_test.hlo
@@ -0,0 +1,7 @@
+HloModule Module
+
+ENTRY computation {
+    p0 = f64[3] parameter(0)
+    p1 = f64[3] parameter(1)
+    ROOT out = add(p0, p1)
+}
diff --git a/third_party/xla/xla/service/xla_compile.bzl b/third_party/xla/xla/service/xla_compile.bzl
index 6704bc3f4b00e6..5e3b96f5d52fa7 100644
--- a/third_party/xla/xla/service/xla_compile.bzl
+++ b/third_party/xla/xla/service/xla_compile.bzl
@@ -45,7 +45,8 @@ def xla_aot_compile_gpu(
         module,
         gpu_target_config,
         autotune_results):
-    """Runs xla_compile to compile an MHLO or StableHLO module into an AotCompilationResult for GPU
+    """Runs xla_compile to compile an MHLO, StableHLO or HLO module into an
+    AotCompilationResult for GPU
 
     Args:
         name: The name of the build rule.
diff --git a/third_party/xla/xla/service/xla_compile_main.cc b/third_party/xla/xla/service/xla_compile_main.cc
index fd8c4461d56517..2b53a4394c4657 100644
--- a/third_party/xla/xla/service/xla_compile_main.cc
+++ b/third_party/xla/xla/service/xla_compile_main.cc
@@ -33,8 +33,10 @@ limitations under the License.
 #include "xla/service/cpu/cpu_compiler.h"
 #include "xla/service/cpu/cpu_executable.h"
 #include "xla/statusor.h"
+#include "xla/tools/hlo_module_loader.h"
 #include "tsl/platform/env.h"
 #include "tsl/platform/init_main.h"
+#include "tsl/platform/path.h"
 #include "tsl/platform/protobuf.h"
 #include "tsl/util/command_line_flags.h"
 
@@ -52,8 +54,9 @@ namespace xla {
 namespace xla_compile {
 
 const char kUsageHeader[] =
-    "xla_compile performs ahead-of-time compilation of an MHLO or StableHLO "
-    "module,\nresulting in an AotCompilationResult compiled for CPU.\n"
+    "xla_compile performs ahead-of-time compilation of an MHLO, StableHLO or "
+    "HLO "
+    "module,\nresulting in an AotCompilationResult compiled for CPU or GPU.\n"
     "A typical invocation looks like this:\n"
     "\n"
     "   $ xla_compile --module_file=mymodule.mlir --output_file=output "
@@ -100,11 +103,14 @@ StatusOr<std::string> AotCompileGpuExecutable(
 }
 #endif
 
-xla::Status XlaCompileMain(const std::string& module_path,
-                           const std::string& output_path,
-                           const std::string& platform,
-                           const std::string& gpu_target_config_path,
-                           const std::string& autotune_results_path) {
+xla::StatusOr<std::unique_ptr<HloModule>> LoadModule(
+    const std::string& module_path) {
+  auto format = std::string(tsl::io::Extension(module_path));
+  if (format == "hlo" || format == "txt") {
+    return LoadModuleFromFile(
+        module_path, hlo_module_loader_details::Config(),
+        /*format=*/"hlo", [&](HloModuleConfig* c) {}, nullptr);
+  }
   std::string module_string;
   TF_RETURN_IF_ERROR(
       tsl::ReadFileToString(tsl::Env::Default(), module_path, &module_string));
@@ -132,8 +138,16 @@ xla::Status XlaCompileMain(const std::string& module_path,
   DebugOptions debug_options = DefaultDebugOptionsIgnoringFlags();
   HloModuleConfig config(shape);
   config.set_debug_options(debug_options);
+  return HloModule::CreateFromProto(hlo_module_proto, config);
+}
+
+xla::Status XlaCompileMain(const std::string& module_path,
+                           const std::string& output_path,
+                           const std::string& platform,
+                           const std::string& gpu_target_config_path,
+                           const std::string& autotune_results_path) {
   TF_ASSIGN_OR_RETURN(std::unique_ptr<HloModule> hlo_module,
-                      HloModule::CreateFromProto(hlo_module_proto, config));
+                      LoadModule(module_path));
 
   // Run AOT compilation.
   std::string result;
@@ -194,7 +208,7 @@ int main(int argc, char* argv[]) {
   std::string autotune_results_path;
   std::vector<tsl::Flag> flag_list = {
       tsl::Flag("module_file", &module_path,
-                "The path to the MHLO or StableHLO file"),
+                "The path to the HLO, MHLO or StableHLO file"),
       tsl::Flag("output_file", &output_path, "The path to the output file"),
       tsl::Flag("platform", &platform,
                 "The platform on which the built executable runs"),

From a914df7952a8ac3cce98266ded227a29a6315fa5 Mon Sep 17 00:00:00 2001
From: Eugene Zhulenev <ezhulenev@google.com>
Date: Mon, 25 Sep 2023 11:23:33 -0700
Subject: [PATCH 227/567] [stream_executor] NFC: Clean up stream executor
 public libraries

Clean up and document stream executor "non-executor" public libraries.

https://github.com/openxla/xla/issues/5761

PiperOrigin-RevId: 568275550
---
 tensorflow/core/common_runtime/gpu/BUILD      |   4 +-
 .../common_runtime/pluggable_device/BUILD     |   4 +-
 tensorflow/core/util/BUILD                    |   2 +-
 .../xla/third_party/tsl/tsl/framework/BUILD   |   2 +-
 third_party/xla/xla/stream_executor/BUILD     | 279 +++++++-----------
 .../xla/xla/stream_executor/data_type.h       |   1 -
 .../xla/stream_executor/device_description.cc |   4 -
 .../xla/stream_executor/device_description.h  |   2 +-
 .../stream_executor/device_memory_allocator.h |   2 +-
 .../xla/xla/stream_executor/device_options.h  |   6 +-
 third_party/xla/xla/stream_executor/gpu/BUILD |   1 -
 .../xla/xla/stream_executor/platform.cc       |  17 +-
 third_party/xla/xla/xla.bzl                   |   1 -
 13 files changed, 133 insertions(+), 192 deletions(-)

diff --git a/tensorflow/core/common_runtime/gpu/BUILD b/tensorflow/core/common_runtime/gpu/BUILD
index 4652e0051cd8fd..e77d1e25ef2620 100644
--- a/tensorflow/core/common_runtime/gpu/BUILD
+++ b/tensorflow/core/common_runtime/gpu/BUILD
@@ -188,7 +188,7 @@ tf_cuda_library(
         "@com_google_absl//absl/container:flat_hash_set",
         "@com_google_absl//absl/strings",
         "@local_tsl//tsl/framework:device_id_utils",
-        "@local_xla//xla/stream_executor:device_id_utils",
+        "@local_xla//xla/stream_executor",
         "@local_xla//xla/stream_executor/gpu:gpu_cudamallocasync_allocator",
         "@local_xla//xla/stream_executor/gpu:gpu_init_impl",
     ] + if_google(
@@ -232,7 +232,7 @@ tf_cuda_library(
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core/platform:stream_executor",
         "@eigen_archive//:eigen3",
-        "@local_xla//xla/stream_executor:device_id_utils",
+        "@local_xla//xla/stream_executor",
         "@local_xla//xla/stream_executor/gpu:gpu_init",
     ] + if_static([":gpu_runtime_impl"]),
 )
diff --git a/tensorflow/core/common_runtime/pluggable_device/BUILD b/tensorflow/core/common_runtime/pluggable_device/BUILD
index 92a6ca4af74fcc..514bad92fc5fec 100644
--- a/tensorflow/core/common_runtime/pluggable_device/BUILD
+++ b/tensorflow/core/common_runtime/pluggable_device/BUILD
@@ -57,10 +57,8 @@ cc_library(
         "@com_google_absl//absl/status",
         "@com_google_absl//absl/strings",
         "@local_tsl//tsl/framework:device_id_utils",
-        "@local_xla//xla/stream_executor:device_id_utils",
+        "@local_xla//xla/stream_executor",
         "@local_xla//xla/stream_executor:device_mem_allocator",
-        "@local_xla//xla/stream_executor:event",
-        "@local_xla//xla/stream_executor:kernel",
     ],
     alwayslink = 1,
 )
diff --git a/tensorflow/core/util/BUILD b/tensorflow/core/util/BUILD
index aa86cb6a0ff568..e01ba6ff6a792e 100644
--- a/tensorflow/core/util/BUILD
+++ b/tensorflow/core/util/BUILD
@@ -779,7 +779,7 @@ tf_kernel_library(
         "//tensorflow/core:lib",
     ] + if_cuda([
         "@local_xla//xla/stream_executor/cuda:cuda_blas_utils",
-        "@local_xla//xla/stream_executor:data_type",
+        "@local_xla//xla/stream_executor",
         "@local_tsl//tsl/cuda:cusparse",
         "@local_config_cuda//cuda:cub_headers",
     ]) + if_rocm([
diff --git a/third_party/xla/third_party/tsl/tsl/framework/BUILD b/third_party/xla/third_party/tsl/tsl/framework/BUILD
index f3c83660c6aece..af0eb46e4ae6e4 100644
--- a/third_party/xla/third_party/tsl/tsl/framework/BUILD
+++ b/third_party/xla/third_party/tsl/tsl/framework/BUILD
@@ -230,7 +230,7 @@ cc_library(
     ],
     visibility = ["//visibility:public"],
     deps = [
-        "//tsl/platform:types",
+        "//tsl/lib/gtl:int_type",
     ] + if_static([
         ":device_id_impl",
     ]),
diff --git a/third_party/xla/xla/stream_executor/BUILD b/third_party/xla/xla/stream_executor/BUILD
index 6783578a4bc150..e9f3b11e0d0912 100644
--- a/third_party/xla/xla/stream_executor/BUILD
+++ b/third_party/xla/xla/stream_executor/BUILD
@@ -44,13 +44,14 @@ package_group(
 # a header-only dependency (this is a very rare exception when we are building dynamic librarires
 # for open source projects, e.g. Tensorflow, internally at Google we almost always link statically),
 # or usually on a `sream_executor` target that will also link implementation.
-
 filegroup(
     name = "stream_executor_public_headers",
     srcs = [
         "allocator_stats.h",
         "command_buffer.h",
+        "data_type.h",
         "device_description.h",
+        "device_id_utils.h",
         "device_memory.h",
         "device_memory_allocator.h",
         "device_options.h",
@@ -87,6 +88,104 @@ filegroup(
     visibility = ["//visibility:public"],
 )
 
+#===--------------------------------------------------------------------------------------------===#
+# StreamExecutor public libraries
+#===--------------------------------------------------------------------------------------------===#
+
+# Some of the StreamExecutor libraries that do not depend on StreamExecutor itself (Stream, Kernel,
+# Event, etc.) exported as standalone libraries (these libraries should not depend on
+# `stream_executor` and `stream_executor_headers` targets). This is mostly a historical artifact of
+# an era when StreamExecutor was a part of Tensorflow.
+
+# TODO(ezhulenev): Consider merging some (all?) of these libraries into StreamExecutor target, e.g.
+# does it really make sense to have a separate `device_memory` library which is not usable without
+# StreamExecutor.
+
+tf_proto_library(
+    name = "device_description_proto",
+    srcs = ["device_description.proto"],
+    cc_api_version = 2,
+    make_default_target_header_only = True,
+    protodeps = ["//xla:autotune_results_proto"],
+    visibility = ["//visibility:public"],
+)
+
+cc_library(
+    name = "device_description",
+    srcs = ["device_description.cc"],
+    hdrs = ["device_description.h"],
+    visibility = ["//visibility:public"],
+    deps = [
+        ":device_description_proto_cc",
+        ":launch_dim",
+        "//xla/stream_executor/platform",
+        "@com_google_absl//absl/algorithm:container",
+        "@com_google_absl//absl/strings",
+        "@local_tsl//tsl/lib/math:math_util",
+        "@local_tsl//tsl/platform:statusor",
+    ],
+)
+
+cc_library(
+    name = "device_memory",
+    hdrs = ["device_memory.h"],
+    visibility = ["//visibility:public"],
+    deps = ["//xla/stream_executor/platform"],
+)
+
+cc_library(
+    name = "device_options",
+    hdrs = ["device_options.h"],
+    visibility = ["//visibility:public"],
+    deps = [
+        "//xla/stream_executor/platform",
+        "@com_google_absl//absl/log:check",
+        "@com_google_absl//absl/strings",
+    ],
+)
+
+cc_library(
+    name = "host_or_device_scalar",
+    hdrs = ["host_or_device_scalar.h"],
+    visibility = ["//visibility:public"],
+    deps = [":device_memory"],
+)
+
+cc_library(
+    name = "launch_dim",
+    hdrs = ["launch_dim.h"],
+    visibility = ["//visibility:public"],
+    deps = ["@com_google_absl//absl/strings"],
+)
+
+cc_library(
+    name = "numeric_options",
+    hdrs = ["numeric_options.h"],
+    visibility = ["//visibility:public"],
+)
+
+cc_library(
+    name = "platform",
+    srcs = ["platform.cc"],
+    hdrs = ["platform.h"],
+    visibility = ["//visibility:public"],
+    deps = [
+        ":device_description",
+        ":device_options",
+        "//xla/stream_executor/platform",
+        "@com_google_absl//absl/log:check",
+        "@com_google_absl//absl/status",
+        "@local_tsl//tsl/platform:status",
+        "@local_tsl//tsl/platform:statusor",
+    ],
+)
+
+cc_library(
+    name = "plugin",
+    hdrs = ["plugin.h"],
+    visibility = ["//visibility:public"],
+)
+
 #===--------------------------------------------------------------------------------------------===#
 # StreamExecutor platform-dependent interfaces
 #===--------------------------------------------------------------------------------------------===#
@@ -165,6 +264,7 @@ cc_library(
         "device_memory.h",
         "device_memory_allocator.h",
         "device_options.h",
+        "device_id_utils.h",
         "dnn.h",
         "event.h",
         "executor_cache.h",
@@ -172,6 +272,7 @@ cc_library(
         "kernel.h",
         "kernel_cache_config.h",
         "kernel_spec.h",
+        "data_type.h",
         "launch_dim.h",
         "numeric_options.h",
         "module_spec.h",
@@ -204,8 +305,11 @@ cc_library(
         "@com_google_absl//absl/synchronization",
         "@com_google_absl//absl/types:optional",
         "@com_google_absl//absl/types:span",
+        "@local_tsl//tsl/framework:device_id",
+        "@local_tsl//tsl/framework:device_type",
         "@local_tsl//tsl/platform:env",
         "@local_tsl//tsl/platform:errors",
+        "@local_tsl//tsl/platform:float8",
         "@local_tsl//tsl/platform:logging",
         "@local_tsl//tsl/platform:status",
         "@local_tsl//tsl/platform:statusor",
@@ -218,77 +322,9 @@ transitive_hdrs(
     deps = [":stream_executor_headers"],
 )
 
-cc_library(
-    name = "launch_dim",
-    hdrs = ["launch_dim.h"],
-    visibility = ["//visibility:public"],
-    deps = ["@com_google_absl//absl/strings"],
-)
-
-cc_library(
-    name = "trace_listener",
-    hdrs = [
-        "trace_listener.h",
-    ],
-    visibility = ["//visibility:public"],
-    deps = [
-        ":device_memory",
-        ":kernel",
-        "//xla/stream_executor/platform",
-        "@com_google_absl//absl/strings",
-        "@local_tsl//tsl/platform:status",
-    ],
-)
-
-tf_proto_library(
-    name = "device_description_proto",
-    srcs = ["device_description.proto"],
-    cc_api_version = 2,
-    make_default_target_header_only = True,
-    protodeps = [
-        "//xla:autotune_results_proto",
-    ],
-    visibility = ["//visibility:public"],
-)
-
-cc_library(
-    name = "device_description",
-    srcs = ["device_description.cc"],
-    hdrs = ["device_description.h"],
-    visibility = ["//visibility:public"],
-    deps = [
-        ":device_description_proto_cc",
-        ":launch_dim",
-        "//xla/stream_executor/platform",
-        "@com_google_absl//absl/algorithm:container",
-        "@com_google_absl//absl/strings",
-        "@local_tsl//tsl/lib/math:math_util",
-        "@local_tsl//tsl/platform:numbers",
-    ],
-)
-
-cc_library(
-    name = "kernel_cache_config",
-    hdrs = ["kernel_cache_config.h"],
-    visibility = ["//visibility:public"],
-)
-
-cc_library(
-    name = "module_spec",
-    hdrs = ["module_spec.h"],
-    visibility = ["//visibility:public"],
-    deps = [
-        "//xla/stream_executor/platform",
-        "@com_google_absl//absl/types:span",
-        "@local_tsl//tsl/platform:logging",
-    ],
-)
-
 cc_library(
     name = "allocator_stats",
-    srcs = [
-        "allocator_stats.cc",
-    ],
+    srcs = ["allocator_stats.cc"],
     hdrs = ["allocator_stats.h"],
     visibility = ["//visibility:public"],
     deps = [
@@ -298,49 +334,6 @@ cc_library(
     ],
 )
 
-cc_library(
-    name = "data_type",
-    hdrs = ["data_type.h"],
-    visibility = ["//visibility:public"],
-    deps = [
-        ":dnn_proto_cc",
-        "@local_tsl//tsl/platform:float8",
-        "@local_tsl//tsl/protobuf:dnn_proto_cc",
-    ],
-)
-
-cc_library(
-    name = "device_id_utils",
-    hdrs = ["device_id_utils.h"],
-    visibility = ["//visibility:public"],
-    deps = [
-        ":platform",
-        ":stream_executor",
-        "@com_google_absl//absl/strings",
-        "@local_tsl//tsl/framework:device_id_impl",
-        "@local_tsl//tsl/lib/gtl:int_type",
-        "@local_tsl//tsl/platform:str_util",
-    ],
-)
-
-cc_library(
-    name = "device_memory",
-    hdrs = ["device_memory.h"],
-    visibility = ["//visibility:public"],
-    deps = ["//xla/stream_executor/platform"],
-)
-
-cc_library(
-    name = "device_options",
-    hdrs = ["device_options.h"],
-    visibility = ["//visibility:public"],
-    deps = [
-        "//xla/stream_executor/platform",
-        "@com_google_absl//absl/strings",
-        "@local_tsl//tsl/platform:logging",
-    ],
-)
-
 tf_proto_library(
     name = "dnn_proto",
     srcs = ["dnn.proto"],
@@ -359,36 +352,6 @@ cc_library(
     ],
 )
 
-cc_library(
-    name = "platform",
-    srcs = ["platform.cc"],
-    hdrs = ["platform.h"],
-    visibility = ["//visibility:public"],
-    deps = [
-        ":plugin",
-        ":stream_executor_headers",
-        "//xla/stream_executor/platform",
-        "@com_google_absl//absl/log:check",
-        "@com_google_absl//absl/strings",
-        "@local_tsl//tsl/platform:errors",
-        "@local_tsl//tsl/platform:logging",
-        "@local_tsl//tsl/platform:status",
-        "@local_tsl//tsl/platform:statusor",
-    ],
-)
-
-cc_library(
-    name = "plugin",
-    hdrs = ["plugin.h"],
-    visibility = ["//visibility:public"],
-)
-
-cc_library(
-    name = "numeric_options",
-    hdrs = ["numeric_options.h"],
-    visibility = ["//visibility:public"],
-)
-
 cc_library(
     name = "device_mem_allocator",
     hdrs = [
@@ -412,7 +375,7 @@ cc_library(
     visibility = ["//visibility:public"],
     deps = [
         ":platform",
-        ":stream_executor_pimpl_header",
+        ":stream_executor_headers",
         "//xla/stream_executor/platform",
         "@com_google_absl//absl/base:core_headers",
         "@com_google_absl//absl/container:node_hash_map",
@@ -459,9 +422,7 @@ cc_library(
     ],
     visibility = ["//visibility:public"],
     deps = [
-        ":device_description",
         ":device_description_proto_cc",
-        ":kernel_cache_config",
         ":platform",
         ":stream_executor_headers",
         ":stream_executor_internal",
@@ -501,7 +462,7 @@ cc_library(
         ":fft",
         ":host_or_device_scalar",
         ":kernel",
-        ":launch_dim",
+        ":kernel_spec",
         ":platform",
         ":plugin_registry",
         ":stream_executor_headers",
@@ -558,17 +519,14 @@ cc_library(
     visibility = ["//visibility:public"],
     deps = [
         ":allocator_stats",
-        ":data_type",
         ":device_description",
         ":device_description_proto_cc",
         ":device_memory",
         ":device_options",
         ":fft",
-        ":kernel_cache_config",
         ":kernel_spec",
         ":launch_dim",
         ":platform",
-        ":plugin",
         ":plugin_registry",
         ":stream_executor_headers",
         "//xla/stream_executor/platform",
@@ -598,7 +556,6 @@ cc_library(
     hdrs = ["blas.h"],
     visibility = ["//visibility:public"],
     deps = [
-        ":data_type",
         ":stream_executor_headers",
         "//xla/stream_executor/platform",
         "@com_google_absl//absl/strings",
@@ -608,17 +565,6 @@ cc_library(
     ],
 )
 
-cc_library(
-    name = "host_or_device_scalar",
-    hdrs = ["host_or_device_scalar.h"],
-    visibility = ["//visibility:public"],
-    deps = [
-        ":data_type",
-        ":device_memory",
-        "//xla/stream_executor/platform",
-    ],
-)
-
 cc_library(
     name = "command_buffer",
     srcs = ["command_buffer.cc"],
@@ -656,7 +602,6 @@ cc_library(
         ":fft",
         ":multi_platform_manager",
         ":platform",
-        ":plugin",
         ":stream_executor_headers",
         "@com_google_absl//absl/base:core_headers",
         "@com_google_absl//absl/strings",
@@ -674,7 +619,6 @@ cc_library(
     hdrs = ["dnn.h"],
     visibility = ["//visibility:public"],
     deps = [
-        ":data_type",
         ":device_description_proto_cc",
         ":device_memory",
         ":dnn_proto_cc",
@@ -741,10 +685,12 @@ cc_library(
     name = "stream_executor",
     textual_hdrs = [
         "blas.h",
+        "data_type.h",
         "device_description.h",
         "device_memory.h",
         "device_memory_allocator.h",
         "device_options.h",
+        "device_id_utils.h",
         "dnn.h",
         "event.h",
         "executor_cache.h",
@@ -771,6 +717,8 @@ cc_library(
     deps = [
         ":stream_executor_headers",
         "@com_google_absl//absl/log:check",
+        "@local_tsl//tsl/framework:device_id",
+        "@local_tsl//tsl/framework:device_type",
         "@local_tsl//tsl/platform:errors",
         "@local_tsl//tsl/platform:logging",
         "@local_tsl//tsl/platform:status",
@@ -782,13 +730,12 @@ cc_library(
     name = "stream_executor_impl",
     visibility = ["//visibility:public"],
     deps = [
-        ":device_description",
         ":device_memory",
         ":dnn",
         ":dnn_proto_cc",
         ":event",
         ":kernel",
-        ":launch_dim",
+        ":kernel_spec",
         ":multi_platform_manager",
         ":platform",
         ":stream_executor_headers",
diff --git a/third_party/xla/xla/stream_executor/data_type.h b/third_party/xla/xla/stream_executor/data_type.h
index 382f87d20fddfb..c0f900d4794e4d 100644
--- a/third_party/xla/xla/stream_executor/data_type.h
+++ b/third_party/xla/xla/stream_executor/data_type.h
@@ -19,7 +19,6 @@ limitations under the License.
 #include <complex>
 #include <cstdint>
 
-#include "xla/stream_executor/dnn.pb.h"
 #include "tsl/platform/float8.h"
 #include "tsl/protobuf/dnn.pb.h"
 
diff --git a/third_party/xla/xla/stream_executor/device_description.cc b/third_party/xla/xla/stream_executor/device_description.cc
index b2b6ffdfbe0851..619a252432e9f4 100644
--- a/third_party/xla/xla/stream_executor/device_description.cc
+++ b/third_party/xla/xla/stream_executor/device_description.cc
@@ -15,15 +15,11 @@ limitations under the License.
 
 #include "xla/stream_executor/device_description.h"
 
-#include <algorithm>
 #include <cstdint>
-#include <map>
 #include <memory>
 #include <string>
 
-#include "absl/strings/str_cat.h"
 #include "tsl/lib/math/math_util.h"
-#include "tsl/platform/numbers.h"
 
 namespace stream_executor {
 
diff --git a/third_party/xla/xla/stream_executor/device_description.h b/third_party/xla/xla/stream_executor/device_description.h
index 936974dd65df27..cbbde07417ba70 100644
--- a/third_party/xla/xla/stream_executor/device_description.h
+++ b/third_party/xla/xla/stream_executor/device_description.h
@@ -20,7 +20,7 @@ limitations under the License.
 #ifndef XLA_STREAM_EXECUTOR_DEVICE_DESCRIPTION_H_
 #define XLA_STREAM_EXECUTOR_DEVICE_DESCRIPTION_H_
 
-#include <map>
+#include <cstdint>
 #include <memory>
 #include <string>
 #include <utility>
diff --git a/third_party/xla/xla/stream_executor/device_memory_allocator.h b/third_party/xla/xla/stream_executor/device_memory_allocator.h
index fe529648c8252f..7b3ee6eb162f6f 100644
--- a/third_party/xla/xla/stream_executor/device_memory_allocator.h
+++ b/third_party/xla/xla/stream_executor/device_memory_allocator.h
@@ -42,7 +42,7 @@ class DeviceMemoryAllocator;
 //
 // We say that an instance of ScopedDeviceMemory is "active" if it currently
 // owns a (possibly empty) slice of memory on the device.  Moving,
-// Release()'ing, Free()'ing, and other actions can deactive an active object.
+// Release()'ing, Free()'ing, and other actions can deactivate an active object.
 template <typename ElemT>
 class ScopedDeviceMemory {
  public:
diff --git a/third_party/xla/xla/stream_executor/device_options.h b/third_party/xla/xla/stream_executor/device_options.h
index 5fc86b25e1c97a..776fa4220813c4 100644
--- a/third_party/xla/xla/stream_executor/device_options.h
+++ b/third_party/xla/xla/stream_executor/device_options.h
@@ -19,11 +19,13 @@ limitations under the License.
 
 #ifndef XLA_STREAM_EXECUTOR_DEVICE_OPTIONS_H_
 #define XLA_STREAM_EXECUTOR_DEVICE_OPTIONS_H_
+
 #include <map>
+#include <string>
+#include <vector>
 
+#include "absl/log/check.h"
 #include "absl/strings/str_join.h"
-#include "xla/stream_executor/platform/port.h"
-#include "tsl/platform/logging.h"
 
 namespace stream_executor {
 
diff --git a/third_party/xla/xla/stream_executor/gpu/BUILD b/third_party/xla/xla/stream_executor/gpu/BUILD
index a790045a3e42f0..aec49df203b217 100644
--- a/third_party/xla/xla/stream_executor/gpu/BUILD
+++ b/third_party/xla/xla/stream_executor/gpu/BUILD
@@ -407,7 +407,6 @@ tsl_gpu_library(
     deps = [
         ":gpu_init_impl",
         "//xla/stream_executor",
-        "//xla/stream_executor:device_id_utils",
         "@com_google_absl//absl/base:core_headers",
         "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/strings",
diff --git a/third_party/xla/xla/stream_executor/platform.cc b/third_party/xla/xla/stream_executor/platform.cc
index 9f2c5c0978f128..801e381ffa8172 100644
--- a/third_party/xla/xla/stream_executor/platform.cc
+++ b/third_party/xla/xla/stream_executor/platform.cc
@@ -15,11 +15,12 @@ limitations under the License.
 
 #include "xla/stream_executor/platform.h"
 
-#include "absl/strings/str_cat.h"
-#include "xla/stream_executor/platform/port.h"
-#include "xla/stream_executor/stream_executor_pimpl.h"
-#include "tsl/platform/errors.h"
-#include "tsl/platform/logging.h"
+#include <map>
+#include <string>
+
+#include "absl/status/status.h"
+#include "xla/stream_executor/device_options.h"
+#include "tsl/platform/status.h"
 
 namespace stream_executor {
 
@@ -47,10 +48,10 @@ bool Platform::Initialized() const { return true; }
 tsl::Status Platform::Initialize(
     const std::map<std::string, std::string> &platform_options) {
   if (!platform_options.empty()) {
-    return tsl::Status(absl::StatusCode::kUnimplemented,
-                       "this platform does not support custom initialization");
+    return absl::UnimplementedError(
+        "this platform does not support custom initialization");
   }
-  return ::tsl::OkStatus();
+  return tsl::OkStatus();
 }
 
 }  // namespace stream_executor
diff --git a/third_party/xla/xla/xla.bzl b/third_party/xla/xla/xla.bzl
index 0cfaa67192e898..3fe599358e2baf 100644
--- a/third_party/xla/xla/xla.bzl
+++ b/third_party/xla/xla/xla.bzl
@@ -96,7 +96,6 @@ def xla_cc_test(
                        clean_dep("//xla/service/gpu:hlo_op_profile_proto_cc_impl"),
                        clean_dep("//xla/stream_executor:dnn_proto_cc_impl"),
                        clean_dep("//xla/stream_executor:stream_executor_impl"),
-                       clean_dep("//xla/stream_executor:device_id_utils"),
                        clean_dep("//xla/stream_executor/gpu:gpu_cudamallocasync_allocator"),
                        clean_dep("//xla/stream_executor/gpu:gpu_init_impl"),
                        clean_dep("@local_tsl//tsl/profiler/utils:time_utils_impl"),

From d00f9eeb3105a7abe75ed7965b5a6b22e9aef596 Mon Sep 17 00:00:00 2001
From: Eugene Zhulenev <ezhulenev@google.com>
Date: Mon, 25 Sep 2023 11:55:03 -0700
Subject: [PATCH 228/567] [stream_executor] Add back device_id_utils library

PiperOrigin-RevId: 568284525
---
 third_party/xla/xla/stream_executor/BUILD | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/third_party/xla/xla/stream_executor/BUILD b/third_party/xla/xla/stream_executor/BUILD
index e9f3b11e0d0912..b0f10c4d83d930 100644
--- a/third_party/xla/xla/stream_executor/BUILD
+++ b/third_party/xla/xla/stream_executor/BUILD
@@ -144,6 +144,17 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "device_id_utils",
+    hdrs = ["device_id_utils.h"],
+    visibility = ["//visibility:public"],
+    deps = [
+        ":platform",
+        ":stream_executor",
+        "@local_tsl//tsl/framework:device_id_impl",
+    ],
+)
+
 cc_library(
     name = "host_or_device_scalar",
     hdrs = ["host_or_device_scalar.h"],

From 3890704374c2a0ff95d4b3e2c684f19275f8071f Mon Sep 17 00:00:00 2001
From: Mason Chang <masonchang@google.com>
Date: Mon, 25 Sep 2023 11:59:56 -0700
Subject: [PATCH 229/567] Move resource hoisting to before TPURewrite.

PiperOrigin-RevId: 568285914
---
 tensorflow/compiler/mlir/tensorflow/transforms/bridge.cc | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/bridge.cc b/tensorflow/compiler/mlir/tensorflow/transforms/bridge.cc
index dfe07231999791..7e56a6496b5292 100644
--- a/tensorflow/compiler/mlir/tensorflow/transforms/bridge.cc
+++ b/tensorflow/compiler/mlir/tensorflow/transforms/bridge.cc
@@ -254,6 +254,9 @@ void CreateTPUBridgePipelineImpl(
   pm.addNestedPass<mlir::func::FuncOp>(
       mlir::TF::CreateRewriteTPUEmbeddingOpsPass());
   pm.addPass(CreateTPUAnnotateDynamicShapeInputsPass());
+  pm.addNestedPass<func::FuncOp>(
+      TF::CreateHoistReplicateInvariantResourceWritesPass());
+
   pm.addPass(CreateTPURewritePass(module_name));
   pm.addPass(createSymbolDCEPass());
   pm.addNestedPass<func::FuncOp>(TFDevice::CreateEmbeddingProgramKeyPass());
@@ -261,8 +264,6 @@ void CreateTPUBridgePipelineImpl(
       TFDevice::CreateReplicateInvariantOpHoistingPass());
   pm.addPass(CreateTPUMergeVariablesWithExecutePass());
   pm.addNestedPass<func::FuncOp>(CreateExtractTPUCopyWithDynamicShapeOpPass());
-  pm.addNestedPass<func::FuncOp>(
-      TF::CreateHoistReplicateInvariantResourceWritesPass());
   pm.addNestedPass<func::FuncOp>(CreateTPUColocateCompositeResourceOps());
   if (tensorflow::GetMlirCommonFlags()
           ->tf_mlir_enable_tpu_variable_runtime_reformatting_pass) {

From 9e29f611891202074bb669f2ffc5c865552644cd Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 25 Sep 2023 12:14:01 -0700
Subject: [PATCH 230/567] Regenerate pyi files after mypy update

PiperOrigin-RevId: 568290052
---
 .../python/client/_pywrap_tf_session.pyi      | 57 ++++++++++++-------
 .../python/util/_pywrap_checkpoint_reader.pyi |  3 +-
 2 files changed, 40 insertions(+), 20 deletions(-)

diff --git a/tensorflow/python/client/_pywrap_tf_session.pyi b/tensorflow/python/client/_pywrap_tf_session.pyi
index 35e1832acb9ac4..2d0c304ff7d8c9 100644
--- a/tensorflow/python/client/_pywrap_tf_session.pyi
+++ b/tensorflow/python/client/_pywrap_tf_session.pyi
@@ -118,13 +118,20 @@ class OpsByName:
 class PyGraph:
     @classmethod
     def __init__(cls, *args, **kwargs) -> None: ...
-    def Dismantle(self, *args, **kwargs) -> Any: ...
-    def _add_op(self, *args, **kwargs) -> Any: ...
-    def _get_operation_by_name(self, *args, **kwargs) -> Any: ...
-    def _op_def_for_type(self, *args, **kwargs) -> Any: ...
-    def get_operations(self, *args, **kwargs) -> Any: ...
-    def new_operations(self, *args, **kwargs) -> Any: ...
-    def num_operations(self, *args, **kwargs) -> Any: ...
+    @classmethod
+    def Dismantle(cls, *args, **kwargs) -> Any: ...
+    @classmethod
+    def _add_op(cls, *args, **kwargs) -> Any: ...
+    @classmethod
+    def _get_operation_by_name(cls, *args, **kwargs) -> Any: ...
+    @classmethod
+    def _op_def_for_type(cls, *args, **kwargs) -> Any: ...
+    @classmethod
+    def get_operations(cls, *args, **kwargs) -> Any: ...
+    @classmethod
+    def new_operations(cls, *args, **kwargs) -> Any: ...
+    @classmethod
+    def num_operations(cls, *args, **kwargs) -> Any: ...
     @property
     def _nodes_by_id(self) -> OpsById: ...
     @property
@@ -140,14 +147,22 @@ class PyOperation:
     graph
     @classmethod
     def __init__(cls, *args, **kwargs) -> None: ...
-    def _add_control_input(self, *args, **kwargs) -> Any: ...
-    def _add_control_inputs(self, *args, **kwargs) -> Any: ...
-    def _add_outputs(self, *args, **kwargs) -> Any: ...
-    def _init_outputs(self, *args, **kwargs) -> Any: ...
-    def _remove_all_control_inputs(self, *args, **kwargs) -> Any: ...
-    def _set_device_from_string(self, *args, **kwargs) -> Any: ...
-    def _tf_input(self, *args, **kwargs) -> Any: ...
-    def _tf_output(self, *args, **kwargs) -> Any: ...
+    @classmethod
+    def _add_control_input(cls, *args, **kwargs) -> Any: ...
+    @classmethod
+    def _add_control_inputs(cls, *args, **kwargs) -> Any: ...
+    @classmethod
+    def _add_outputs(cls, *args, **kwargs) -> Any: ...
+    @classmethod
+    def _init_outputs(cls, *args, **kwargs) -> Any: ...
+    @classmethod
+    def _remove_all_control_inputs(cls, *args, **kwargs) -> Any: ...
+    @classmethod
+    def _set_device_from_string(cls, *args, **kwargs) -> Any: ...
+    @classmethod
+    def _tf_input(cls, *args, **kwargs) -> Any: ...
+    @classmethod
+    def _tf_output(cls, *args, **kwargs) -> Any: ...
     @property
     def _c_op(self) -> TF_Operation: ...
     @property
@@ -175,10 +190,14 @@ class PyTensor:
     _shape_val: object
     @classmethod
     def __init__(cls, *args, **kwargs) -> None: ...
-    def _as_tf_output(self, *args, **kwargs) -> Any: ...
-    def _rank(self, *args, **kwargs) -> Any: ...
-    def _set_shape(self, *args, **kwargs) -> Any: ...
-    def consumers(self, *args, **kwargs) -> Any: ...
+    @classmethod
+    def _as_tf_output(cls, *args, **kwargs) -> Any: ...
+    @classmethod
+    def _rank(cls, *args, **kwargs) -> Any: ...
+    @classmethod
+    def _set_shape(cls, *args, **kwargs) -> Any: ...
+    @classmethod
+    def consumers(cls, *args, **kwargs) -> Any: ...
     @property
     def _dtype(self) -> object: ...
     @property
diff --git a/tensorflow/python/util/_pywrap_checkpoint_reader.pyi b/tensorflow/python/util/_pywrap_checkpoint_reader.pyi
index f194a759355e08..1402d60148afeb 100644
--- a/tensorflow/python/util/_pywrap_checkpoint_reader.pyi
+++ b/tensorflow/python/util/_pywrap_checkpoint_reader.pyi
@@ -17,7 +17,8 @@ from typing import Any
 
 class CheckpointReader:
     def __init__(self, arg0: str) -> None: ...
-    def CheckpointReader_GetTensor(self, *args, **kwargs) -> Any: ...
+    @classmethod
+    def CheckpointReader_GetTensor(cls, arg0: CheckpointReader, arg1: str) -> object: ...
     def _GetVariableToDataTypeMap(self, *args, **kwargs) -> Any: ...
     def _HasTensor(self, arg0: str) -> bool: ...
     def debug_string(self) -> bytes: ...

From e892ce92a745b2dc79349d6aa08eaf33e59e3744 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 25 Sep 2023 12:19:37 -0700
Subject: [PATCH 231/567] Support dynamic shape for zero-point-offset
 calculation for DotGeneral

PiperOrigin-RevId: 568291491
---
 .../bridge/convert_mhlo_quant_to_int.cc       | 229 +++++++---
 .../bridge/convert-mhlo-quant-to-int.mlir     | 422 ++++++++++++++----
 2 files changed, 494 insertions(+), 157 deletions(-)

diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/passes/bridge/convert_mhlo_quant_to_int.cc b/tensorflow/compiler/mlir/quantization/stablehlo/passes/bridge/convert_mhlo_quant_to_int.cc
index 0a081269b4db4e..d3136f00e33fbe 100644
--- a/tensorflow/compiler/mlir/quantization/stablehlo/passes/bridge/convert_mhlo_quant_to_int.cc
+++ b/tensorflow/compiler/mlir/quantization/stablehlo/passes/bridge/convert_mhlo_quant_to_int.cc
@@ -647,18 +647,72 @@ Value CreateZeroPointPartialOffset(OpBuilder &builder, Location loc,
   return mul_op;
 }
 
-llvm::SmallVector<int64_t> CalculateBroadcastDims(
-    Value zp_contribution, llvm::ArrayRef<int64_t> contracting_dims,
-    llvm::ArrayRef<int64_t> batching_dims, int64_t non_batching_starting_idx) {
+Value GetDimValue(OpBuilder &builder, Location loc, Value tensor,
+                  mlir::ShapedType tensor_shape, int64_t idx) {
+  if (tensor_shape.isDynamicDim(idx)) {
+    // Get dynamic dim using GetDimensionSizeOp and convert result from <i32> to
+    // <1xi64>.
+    Value dynamic_dim = builder.create<mhlo::GetDimensionSizeOp>(
+        loc, tensor, builder.getI64IntegerAttr(idx));
+    dynamic_dim = builder.create<mhlo::ConvertOp>(
+        loc, RankedTensorType::get(ArrayRef<int64_t>{}, builder.getI64Type()),
+        dynamic_dim);
+    return builder.create<mhlo::ReshapeOp>(
+        loc, RankedTensorType::get({1}, builder.getI64Type()), dynamic_dim);
+  } else {
+    return builder.create<mhlo::ConstantOp>(
+        loc, DenseIntElementsAttr::get(
+                 RankedTensorType::get({1}, builder.getI64Type()),
+                 {tensor_shape.getDimSize(idx)}));
+  }
+}
+
+Value CalculateDynamicOutputDims(OpBuilder &builder, Location loc, Value lhs,
+                                 Value rhs,
+                                 mhlo::DotDimensionNumbersAttr dims) {
+  mlir::ShapedType lhs_shape = lhs.getType().cast<mlir::ShapedType>();
+  mlir::ShapedType rhs_shape = rhs.getType().cast<mlir::ShapedType>();
+  // Calculate each output dim and concatenate into a 1D tensor.
+  llvm::SmallVector<Value> output_dims;
+  for (int64_t i = 0; i < lhs_shape.getRank(); ++i) {
+    if (absl::c_count(dims.getLhsBatchingDimensions(), i) != 0) {
+      output_dims.push_back(GetDimValue(builder, loc, lhs, lhs_shape, i));
+    }
+  }
+  for (int64_t i = 0; i < lhs_shape.getRank(); ++i) {
+    if (absl::c_count(dims.getLhsContractingDimensions(), i) == 0 &&
+        absl::c_count(dims.getLhsBatchingDimensions(), i) == 0) {
+      output_dims.push_back(GetDimValue(builder, loc, lhs, lhs_shape, i));
+    }
+  }
+  for (int64_t i = 0; i < rhs_shape.getRank(); ++i) {
+    if (absl::c_count(dims.getRhsContractingDimensions(), i) == 0 &&
+        absl::c_count(dims.getRhsBatchingDimensions(), i) == 0) {
+      output_dims.push_back(GetDimValue(builder, loc, rhs, rhs_shape, i));
+    }
+  }
+  return builder.create<mhlo::ConcatenateOp>(loc, output_dims,
+                                             builder.getI64IntegerAttr(0));
+}
+
+Value BroadcastZpContribution(OpBuilder &builder, Location loc,
+                              Value zp_contribution,
+                              llvm::ArrayRef<int64_t> contracting_dims,
+                              llvm::ArrayRef<int64_t> batching_dims,
+                              int64_t non_batching_starting_idx,
+                              RankedTensorType output_tensor_type,
+                              Value &output_dims_value, Value lhs, Value rhs,
+                              mhlo::DotDimensionNumbersAttr dims) {
   // This function calculates the dims for broadcasting from the
-  // zero-point-offset tensor to the final output tensor.
+  // zero-point-offset tensor to the final output tensor, and then performs
+  // the broadcast.
   auto zp_contribution_rank =
       zp_contribution.getType().dyn_cast<ShapedType>().getRank();
   llvm::SmallVector<int64_t> broadcast_dims;
   broadcast_dims.resize(zp_contribution_rank, 0);
   // Result tensor will have batching dims first, then LHS result dims, then
   // RHS result dims. So non-batching result dims index doesn't start from 0.
-  // The arg non_batching_starting_idx is used distinguish LHS and RHS.
+  // The arg non_batching_starting_idx is used to distinguish LHS and RHS.
   int64_t result_batching_idx = 0;
   int64_t result_non_batching_idx = non_batching_starting_idx;
   for (int64_t idx = 0, original_idx = 0; idx < zp_contribution_rank;
@@ -674,7 +728,28 @@ llvm::SmallVector<int64_t> CalculateBroadcastDims(
       broadcast_dims[idx] = result_batching_idx++;
     }
   }
-  return broadcast_dims;
+  // Use broadcast_in_dim or dynamic_broadcast_in_dim based on input shape
+  // dynamism.
+  if (zp_contribution.getType().dyn_cast<ShapedType>().hasStaticShape()) {
+    zp_contribution = builder.create<mhlo::BroadcastInDimOp>(
+        loc, output_tensor_type, zp_contribution,
+        DenseIntElementsAttr::get(
+            RankedTensorType::get({static_cast<int64_t>(broadcast_dims.size())},
+                                  builder.getI64Type()),
+            broadcast_dims));
+  } else {
+    if (!output_dims_value) {
+      output_dims_value =
+          CalculateDynamicOutputDims(builder, loc, lhs, rhs, dims);
+    }
+    zp_contribution = builder.create<mhlo::DynamicBroadcastInDimOp>(
+        loc, output_tensor_type, zp_contribution, output_dims_value,
+        DenseIntElementsAttr::get(
+            RankedTensorType::get({static_cast<int64_t>(broadcast_dims.size())},
+                                  builder.getI64Type()),
+            broadcast_dims));
+  }
+  return zp_contribution;
 }
 
 Value CalculateZeroPointOffset(OpBuilder &builder, Location loc, Value lhs,
@@ -709,57 +784,64 @@ Value CalculateZeroPointOffset(OpBuilder &builder, Location loc, Value lhs,
   auto output_tensor_type =
       RankedTensorType::get(output_dims, output_element_type);
 
-  Value result = builder.create<mhlo::ConstantOp>(
-      loc, DenseIntElementsAttr::get(output_tensor_type, {0}));
-
+  Value result = nullptr;
+  Value output_dims_value = nullptr;
   // Calculate LHS contribution when RHS zp is non-zero.
   if (rhs_zp != 0) {
     Value lhs_zp_contribution = CreateZeroPointPartialOffset(
         builder, loc, lhs, rhs_zp, dims.getLhsContractingDimensions());
     // Broadcast lhs ZP contribution to result tensor shape.
-    llvm::SmallVector<int64_t> broadcast_dims = CalculateBroadcastDims(
-        lhs_zp_contribution, dims.getLhsContractingDimensions(),
-        dims.getLhsBatchingDimensions(),
-        dims.getLhsBatchingDimensions().size());
-    lhs_zp_contribution = builder.create<mhlo::BroadcastInDimOp>(
-        loc, output_tensor_type, lhs_zp_contribution,
-        DenseIntElementsAttr::get(
-            RankedTensorType::get({static_cast<int64_t>(broadcast_dims.size())},
-                                  builder.getI64Type()),
-            broadcast_dims));
-    result = builder.create<mhlo::AddOp>(loc, result, lhs_zp_contribution);
+    lhs_zp_contribution = BroadcastZpContribution(
+        builder, loc, lhs_zp_contribution, dims.getLhsContractingDimensions(),
+        dims.getLhsBatchingDimensions(), dims.getLhsBatchingDimensions().size(),
+        output_tensor_type, output_dims_value, lhs, rhs, dims);
+    result = lhs_zp_contribution;
   }
   // Calculate RHS contribution when LHS zp is non-zero.
   if (lhs_zp != 0) {
     Value rhs_zp_contribution = CreateZeroPointPartialOffset(
         builder, loc, rhs, lhs_zp, dims.getRhsContractingDimensions());
     // Broadcast rhs ZP contribution to result tensor shape.
-    llvm::SmallVector<int64_t> broadcast_dims = CalculateBroadcastDims(
-        rhs_zp_contribution, dims.getRhsContractingDimensions(),
+    rhs_zp_contribution = BroadcastZpContribution(
+        builder, loc, rhs_zp_contribution, dims.getRhsContractingDimensions(),
         dims.getRhsBatchingDimensions(),
-        lhs_shape.getRank() - dims.getLhsContractingDimensions().size());
-
-    rhs_zp_contribution = builder.create<mhlo::BroadcastInDimOp>(
-        loc, output_tensor_type, rhs_zp_contribution,
-        DenseIntElementsAttr::get(
-            RankedTensorType::get({static_cast<int64_t>(broadcast_dims.size())},
-                                  builder.getI64Type()),
-            broadcast_dims));
-    result = builder.create<mhlo::AddOp>(loc, result, rhs_zp_contribution);
+        lhs_shape.getRank() - dims.getLhsContractingDimensions().size(),
+        output_tensor_type, output_dims_value, lhs, rhs, dims);
+    if (result) {
+      result = builder.create<mhlo::AddOp>(loc, result, rhs_zp_contribution);
+    } else {
+      result = rhs_zp_contribution;
+    }
   }
 
   if (lhs_zp != 0 && rhs_zp != 0) {
     // Contributions from LHS_ZP * RHS_ZP.
     // This is multiplied by the product of all contracting dimensions.
-    int32_t contracting_dim_total = 1;
+    int32_t contracting_dim_total_int = 1;
+    bool has_dynamic_contracting_dim = false;
+    Value dynamic_contracting_dim_total = builder.create<mhlo::ConstantOp>(
+        loc, builder.getI32IntegerAttr(static_cast<int32_t>(1)));
+    // Calculate the product for static/dynamic dims separately.
     for (const int64_t rhs_idx : dims.getRhsContractingDimensions()) {
-      contracting_dim_total *= rhs_shape.getDimSize(rhs_idx);
+      if (rhs_shape.isDynamicDim(rhs_idx)) {
+        has_dynamic_contracting_dim = true;
+        auto dim = builder.create<mhlo::GetDimensionSizeOp>(
+            loc, rhs, builder.getI64IntegerAttr(rhs_idx));
+        dynamic_contracting_dim_total = builder.create<mhlo::MulOp>(
+            loc, dynamic_contracting_dim_total, dim);
+      } else {
+        contracting_dim_total_int *= rhs_shape.getDimSize(rhs_idx);
+      }
     }
-    const int32_t zp_constant_offset = static_cast<int32_t>(lhs_zp) *
+    Value zp_offset_value = builder.create<mhlo::ConstantOp>(
+        loc, builder.getI32IntegerAttr(static_cast<int32_t>(lhs_zp) *
                                        static_cast<int32_t>(rhs_zp) *
-                                       contracting_dim_total;
-    auto zp_offset_value = builder.create<mhlo::ConstantOp>(
-        loc, builder.getI32IntegerAttr(zp_constant_offset));
+                                       contracting_dim_total_int));
+    // Multiply the static dims contribution by the dynamic one if needed.
+    if (has_dynamic_contracting_dim) {
+      zp_offset_value = builder.create<mhlo::MulOp>(
+          loc, zp_offset_value, dynamic_contracting_dim_total);
+    }
     result = builder.create<chlo::BroadcastSubOp>(loc, result, zp_offset_value,
                                                   nullptr);
   }
@@ -772,7 +854,7 @@ LogicalResult RewriteDotGeneralOp(DotOp op, DotOpAdaptor adaptor,
                                   const mhlo::DotDimensionNumbersAttr &dims,
                                   ConversionPatternRewriter &rewriter) {
   // Lower Dot/DotGeneral UQ ops to DotGeneral int.
-  // Assumes that operands and results are static-shape tensors of uq types.
+  // Assumes that operands and results are uq types.
   auto lhs_element_quant_type =
       getElementTypeOrSelf(op.getLhs().getType())
           .template dyn_cast<quant::UniformQuantizedType>();
@@ -788,14 +870,14 @@ LogicalResult RewriteDotGeneralOp(DotOp op, DotOpAdaptor adaptor,
       op.getResult().getType().clone(rewriter.getI32Type());
 
   // Dot result
-  //  = dot((lhs - zp_l) * scale_l, (rhs - zp_r) * scale_r) / scale_res
-  //      + zp_res
-  //  = dot(lhs - zp_l, rhs - zp_r) * scale_l * scale_r / scale_res + zp_res
-  //  = dot(lhs, rhs) * combined_scale + combined_zp
+  //   = dot((lhs - zp_l) * scale_l, (rhs - zp_r) * scale_r) / scale_res
+  //       + zp_res
+  //   = dot(lhs - zp_l, rhs - zp_r) * scale_l * scale_r / scale_res + zp_res
+  //   = dot(lhs, rhs) * combined_scale + combined_zp
   // where:
-  //   zp_offset = zp_l*rhs + zp_r*lhs - zp_l*zp_r
   //   combined_scale = scale_l * scale_r / scale_res
   //   combined_zp = res_zp - zp_offset * combined_scale
+  //   zp_offset = zp_l*rhs + zp_r*lhs - zp_l*zp_r
   SmallVector<Value, 2> operands{lhs, rhs};
   Value res_i32 = rewriter.create<mhlo::DotGeneralOp>(
       op->getLoc(), res_int32_tensor_type, operands, attrs);
@@ -805,10 +887,10 @@ LogicalResult RewriteDotGeneralOp(DotOp op, DotOpAdaptor adaptor,
       rhs_element_quant_type.getZeroPoint(), dims);
 
   // Multiply dot result and zp_offset by combined_scale only if it is not 1.0.
-  float combined_scale_fp = lhs_element_quant_type.getScale() *
-                            rhs_element_quant_type.getScale() /
-                            res_element_quant_type.getScale();
-  if (combined_scale_fp != 1.0f) {
+  double combined_scale_fp = lhs_element_quant_type.getScale() *
+                             rhs_element_quant_type.getScale() /
+                             res_element_quant_type.getScale();
+  if (combined_scale_fp != 1.0) {
     Value combined_scale = rewriter.create<mhlo::ConstantOp>(
         op->getLoc(), rewriter.getF32FloatAttr(combined_scale_fp));
 
@@ -822,26 +904,32 @@ LogicalResult RewriteDotGeneralOp(DotOp op, DotOpAdaptor adaptor,
     res_i32 = rewriter.create<mhlo::ConvertOp>(op->getLoc(),
                                                res_int32_tensor_type, res_f32);
 
-    auto zp_offset_float32_tensor_type =
-        zp_offset.getType().dyn_cast<TensorType>().clone(rewriter.getF32Type());
-    zp_offset = rewriter.create<mhlo::ConvertOp>(
-        op->getLoc(), zp_offset_float32_tensor_type, zp_offset);
-    zp_offset = rewriter.create<chlo::BroadcastMulOp>(
-        op->getLoc(), zp_offset_float32_tensor_type, zp_offset, combined_scale,
-        nullptr);
-    zp_offset = rewriter.create<mhlo::ConvertOp>(
-        op->getLoc(),
-        zp_offset_float32_tensor_type.clone(rewriter.getI32Type()), zp_offset);
+    // Skip zp_offset if it is null (no zero-point offset was computed).
+    if (zp_offset) {
+      auto zp_offset_float32_tensor_type =
+          zp_offset.getType().dyn_cast<TensorType>().clone(
+              rewriter.getF32Type());
+      zp_offset = rewriter.create<mhlo::ConvertOp>(
+          op->getLoc(), zp_offset_float32_tensor_type, zp_offset);
+      zp_offset = rewriter.create<chlo::BroadcastMulOp>(
+          op->getLoc(), zp_offset_float32_tensor_type, zp_offset,
+          combined_scale, nullptr);
+      zp_offset = rewriter.create<mhlo::ConvertOp>(
+          op->getLoc(),
+          zp_offset_float32_tensor_type.clone(rewriter.getI32Type()),
+          zp_offset);
+    }
   }
 
-  Value res_zp = rewriter.create<mhlo::ConstantOp>(
+  Value combined_zp = rewriter.create<mhlo::ConstantOp>(
       op->getLoc(),
       rewriter.getI32IntegerAttr(res_element_quant_type.getZeroPoint()));
-  Value combined_zp = rewriter.create<chlo::BroadcastSubOp>(
-      op->getLoc(), res_int32_tensor_type, res_zp, zp_offset, nullptr);
-
-  rewriter.replaceOpWithNewOp<mhlo::AddOp>(op, res_int32_tensor_type, res_i32,
-                                           combined_zp);
+  if (zp_offset) {
+    combined_zp = rewriter.create<chlo::BroadcastSubOp>(
+        op->getLoc(), res_int32_tensor_type, combined_zp, zp_offset, nullptr);
+  }
+  rewriter.replaceOpWithNewOp<chlo::BroadcastAddOp>(
+      op, res_int32_tensor_type, res_i32, combined_zp, nullptr);
   return success();
 }
 
@@ -852,7 +940,7 @@ class ConvertUniformQuantizedDotOp : public OpConversionPattern<mhlo::DotOp> {
   LogicalResult matchAndRewrite(
       mhlo::DotOp op, mhlo::DotOpAdaptor adaptor,
       ConversionPatternRewriter &rewriter) const override {
-    // Use matchAndRewriteDotLikeOp for DotHybrid and dynamic shapes.
+    // Use matchAndRewriteDotLikeOp for DotHybrid.
     if (!op.getLhs()
              .getType()
              .getElementType()
@@ -860,10 +948,7 @@ class ConvertUniformQuantizedDotOp : public OpConversionPattern<mhlo::DotOp> {
         !op.getRhs()
              .getType()
              .getElementType()
-             .isa<quant::UniformQuantizedType>() ||
-        !op.getLhs().getType().cast<ShapedType>().hasStaticShape() ||
-        !op.getRhs().getType().cast<ShapedType>().hasStaticShape() ||
-        !op.getResult().getType().cast<ShapedType>().hasStaticShape()) {
+             .isa<quant::UniformQuantizedType>()) {
       return matchAndRewriteDotLikeOp(op, adaptor, rewriter);
     }
 
@@ -889,8 +974,7 @@ class ConvertUniformQuantizedDotGeneralOp
   LogicalResult matchAndRewrite(
       mhlo::DotGeneralOp op, mhlo::DotGeneralOpAdaptor adaptor,
       ConversionPatternRewriter &rewriter) const override {
-    // Use matchAndRewriteDotLikeOp for DotHybridGeneral case and dynamic
-    // shapes.
+    // Use matchAndRewriteDotLikeOp for DotHybridGeneral.
     if (!op.getLhs()
              .getType()
              .getElementType()
@@ -898,10 +982,7 @@ class ConvertUniformQuantizedDotGeneralOp
         !op.getRhs()
              .getType()
              .getElementType()
-             .isa<quant::UniformQuantizedType>() ||
-        !op.getLhs().getType().cast<ShapedType>().hasStaticShape() ||
-        !op.getRhs().getType().cast<ShapedType>().hasStaticShape() ||
-        !op.getResult().getType().cast<ShapedType>().hasStaticShape()) {
+             .isa<quant::UniformQuantizedType>()) {
       return matchAndRewriteDotLikeOp(op, adaptor, rewriter);
     }
     return RewriteDotGeneralOp(op, adaptor, op->getAttrs(),
diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/tests/bridge/convert-mhlo-quant-to-int.mlir b/tensorflow/compiler/mlir/quantization/stablehlo/tests/bridge/convert-mhlo-quant-to-int.mlir
index 9e00c80fb5aab5..81882209250883 100644
--- a/tensorflow/compiler/mlir/quantization/stablehlo/tests/bridge/convert-mhlo-quant-to-int.mlir
+++ b/tensorflow/compiler/mlir/quantization/stablehlo/tests/bridge/convert-mhlo-quant-to-int.mlir
@@ -303,74 +303,152 @@ func.func @uniform_quantize_requantize_merged_zp_zero_and_dequantize(%arg0: tens
 
 // -----
 
+// CHECK-LABEL: func @uniform_quantize_dot_dequantize
+func.func @uniform_quantize_dot_dequantize(%arg0: tensor<2x2xf32>, %arg1: tensor<2x2xf32>) -> tensor<2x2xf32> {
+  %0 = mhlo.uniform_quantize %arg0 : (tensor<2x2xf32>) -> tensor<2x2x!quant.uniform<i8:f32, 2.000000e+00:3>>
+  %1 = mhlo.uniform_quantize %arg1 : (tensor<2x2xf32>) -> tensor<2x2x!quant.uniform<i8:f32, 1.000000e+00:3>>
+
+  // CHECK: "mhlo.dot_general"
+  // CHECK-SAME: lhs_contracting_dimensions = [1]
+  // CHECK-SAME: rhs_contracting_dimensions = [0]
+  // CHECK-SAME: (tensor<2x2xi8>, tensor<2x2xi8>) -> tensor<2x2xi32>
+  %2 = "mhlo.dot" (%0, %1) : (tensor<2x2x!quant.uniform<i8:f32, 2.000000e+00:3>>, tensor<2x2x!quant.uniform<i8:f32, 1.000000e+00:3>>) -> tensor<2x2x!quant.uniform<i8:f32, 1.000000e+00:3>>
+  %3 = mhlo.uniform_dequantize %2 : (tensor<2x2x!quant.uniform<i8:f32, 1.000000e+00:3>>) -> tensor<2x2xf32>
+  return %3 : tensor<2x2xf32>
+}
+
+// -----
+
+// CHECK-LABEL: func @uniform_quantize_dot_int4
+func.func @uniform_quantize_dot_int4(%arg0: tensor<2x2xf32>, %arg1: tensor<2x2xf32>) {
+  %0 = mhlo.uniform_quantize %arg0 : (tensor<2x2xf32>) -> tensor<2x2x!quant.uniform<i4:f32, 1.000000e+00:3>>
+  %1 = mhlo.uniform_quantize %arg1 : (tensor<2x2xf32>) -> tensor<2x2x!quant.uniform<i4:f32, 1.000000e+00:3>>
+
+  // CHECK: "mhlo.dot_general"
+  // CHECK-SAME: lhs_contracting_dimensions = [1]
+  // CHECK-SAME: rhs_contracting_dimensions = [0]
+  // CHECK-SAME: (tensor<2x2xi4>, tensor<2x2xi4>) -> tensor<2x2xi32>
+  %2 = "mhlo.dot" (%0, %1): (tensor<2x2x!quant.uniform<i4:f32, 1.000000e+00:3>>, tensor<2x2x!quant.uniform<i4:f32, 1.000000e+00:3>>) -> tensor<2x2x!quant.uniform<i4:f32, 1.000000e+00:3>>
+  return
+}
+
+// -----
+
 // CHECK-LABEL: func @uniform_quantize_dot_dequantize_dynamic
 func.func @uniform_quantize_dot_dequantize_dynamic(%arg0: tensor<?x?xf32>, %arg1: tensor<?x?xf32>) -> tensor<?x?xf32> {
   %0 = mhlo.uniform_quantize %arg0 : (tensor<?x?xf32>) -> tensor<?x?x!quant.uniform<i8:f32, 2.000000e+00:3>>
-  %1 = mhlo.uniform_quantize %arg1 : (tensor<?x?xf32>) -> tensor<?x?x!quant.uniform<i8:f32, 1.000000e+00:3>>
+  %1 = mhlo.uniform_quantize %arg1 : (tensor<?x?xf32>) -> tensor<?x?x!quant.uniform<i8:f32, 1.000000e+00:2>>
 
-  // CHECK: %[[VAL1:.*]] = mhlo.convert %[[VAL0:.*]] : (tensor<?x?xi8>) -> tensor<?x?xf32>
-  // CHECK: %[[VAL3:.*]] = chlo.broadcast_subtract %[[VAL1]], %[[VAL2:.*]] : (tensor<?x?xf32>, tensor<f32>) -> tensor<?x?xf32>
-  // CHECK: %[[VAL5:.*]] = mhlo.convert %[[VAL4:.*]] : (tensor<?x?xi8>) -> tensor<?x?xf32>
-  // CHECK: %[[VAL7:.*]] = chlo.broadcast_subtract %[[VAL5]], %[[VAL6:.*]] : (tensor<?x?xf32>, tensor<f32>) -> tensor<?x?xf32>
-  // CHECK: %[[VAL8:.*]] = "mhlo.dot"(%[[VAL3]], %[[VAL7]]) : (tensor<?x?xf32>, tensor<?x?xf32>) -> tensor<?x?xf32>
-  // CHECK: %[[VAL10:.*]] = chlo.broadcast_multiply %[[VAL8]], %[[VAL9:.*]] : (tensor<?x?xf32>, tensor<f32>) -> tensor<?x?xf32>
-  // CHECK: %[[VAL12:.*]] = chlo.broadcast_add %[[VAL10]], %[[VAL11:.*]] : (tensor<?x?xf32>, tensor<f32>) -> tensor<?x?xf32>
-  // CHECK: %[[VAL13:.*]] = mhlo.floor %[[VAL12]] : tensor<?x?xf32>
-  // CHECK: %[[VAL15:.*]] = chlo.broadcast_add %[[VAL13]], %[[VAL14:.*]] : (tensor<?x?xf32>, tensor<f32>) -> tensor<?x?xf32>
-  // CHECK: %[[VAL16:.*]] = mhlo.convert %[[VAL15]] : (tensor<?x?xf32>) -> tensor<?x?xi32>
-  // CHECK: %[[VAL19:.*]] = mhlo.clamp %[[VAL17:.*]], %[[VAL16]], %[[VAL18:.*]] : (tensor<i32>, tensor<?x?xi32>, tensor<i32>) -> tensor<?x?xi32>
-  // CHECK: %[[VAL20:.*]] = mhlo.convert %[[VAL19]] : (tensor<?x?xi32>) -> tensor<?x?xi8>
-  %2 = "mhlo.dot" (%0, %1) : (tensor<?x?x!quant.uniform<i8:f32, 2.000000e+00:3>>, tensor<?x?x!quant.uniform<i8:f32, 1.000000e+00:3>>) -> tensor<?x?x!quant.uniform<i8:f32, 1.000000e+00:3>>
+  // CHECK: mhlo.dot_general
+  // CHECK-SAME: lhs_contracting_dimensions = [1]
+  // CHECK-SAME: rhs_contracting_dimensions = [0]
+  // CHECK-SAME: (tensor<?x?xi8>, tensor<?x?xi8>) -> tensor<?x?xi32>
+
+  // CHECK: mhlo.reduce
+  // CHECK-SAME: applies mhlo.add across dimensions = [1]
+  // CHECK-SAME: (tensor<?x?xi32>, tensor<i32>) -> tensor<?xi32>
+  // CHECK: "mhlo.get_dimension_size"
+  // CHECK-SAME: {dimension = 0 : i64} : (tensor<?x?xi8>) -> tensor<i32>
+  // CHECK: "mhlo.get_dimension_size"
+  // CHECK-SAME: {dimension = 1 : i64} : (tensor<?x?xi8>) -> tensor<i32>
+  // CHECK: %[[DYN_DIMS:.*]] = "mhlo.concatenate"
+  // CHECK-SAME: {dimension = 0 : i64}
+  // CHECK: mhlo.dynamic_broadcast_in_dim
+  // CHECK-SAME: %[[DYN_DIMS]])
+  // CHECK-SAME: broadcast_dimensions = dense<0>
+  // CHECK-SAME: (tensor<?xi32>, tensor<2xi64>) -> tensor<?x?xi32>
+
+  // CHECK: mhlo.reduce
+  // CHECK-SAME: applies mhlo.add across dimensions = [0]
+  // CHECK-SAME: (tensor<?x?xi32>, tensor<i32>) -> tensor<?xi32>
+  // CHECK: mhlo.dynamic_broadcast_in_dim
+  // CHECK-SAME: %[[DYN_DIMS]])
+  // CHECK-SAME: broadcast_dimensions = dense<1>
+  // CHECK-SAME: (tensor<?xi32>, tensor<2xi64>) -> tensor<?x?xi32>
+  %2 = "mhlo.dot" (%0, %1) : (tensor<?x?x!quant.uniform<i8:f32, 2.000000e+00:3>>, tensor<?x?x!quant.uniform<i8:f32, 1.000000e+00:2>>) -> tensor<?x?x!quant.uniform<i8:f32, 1.000000e+00:3>>
   %3 = mhlo.uniform_dequantize %2 : (tensor<?x?x!quant.uniform<i8:f32, 1.000000e+00:3>>) -> tensor<?x?xf32>
   return %3 : tensor<?x?xf32>
 }
 
 // -----
 
-// CHECK-LABEL: func @uniform_quantize_dot_dynamic_int4
-func.func @uniform_quantize_dot_dynamic_int4(%arg0: tensor<?x?xf32>, %arg1: tensor<?x?xf32>) {
-  %0 = mhlo.uniform_quantize %arg0 : (tensor<?x?xf32>) -> tensor<?x?x!quant.uniform<i4:f32, 1.000000e+00:3>>
-  %1 = mhlo.uniform_quantize %arg1 : (tensor<?x?xf32>) -> tensor<?x?x!quant.uniform<i4:f32, 1.000000e+00:3>>
+// CHECK-LABEL: func @uniform_quantize_dot_dequantize_dynamic_int4
+func.func @uniform_quantize_dot_dequantize_dynamic_int4(%arg0: tensor<?x?xf32>, %arg1: tensor<?x?xf32>) -> tensor<?x?xf32> {
+  %0 = mhlo.uniform_quantize %arg0 : (tensor<?x?xf32>) -> tensor<?x?x!quant.uniform<i4:f32, 2.000000e+00:3>>
+  %1 = mhlo.uniform_quantize %arg1 : (tensor<?x?xf32>) -> tensor<?x?x!quant.uniform<i4:f32, 1.000000e+00:2>>
 
-  // CHECK: %[[VAL2:.*]] = "mhlo.dot"(%[[VAL0:.*]], %[[VAL1:.*]]) : (tensor<?x?xf32>, tensor<?x?xf32>) -> tensor<?x?xf32>
-  // CHECK: %[[VAL4:.*]] = mhlo.convert %[[VAL3:.*]] : (tensor<?x?xf32>) -> tensor<?x?xi32>
-  // CHECK-DAG: %[[VAL5:.*]] = mhlo.constant dense<-8> : tensor<i32>
-  // CHECK-DAG: %[[VAL6:.*]] = mhlo.constant dense<7> : tensor<i32>
-  // CHECK: %[[VAL7:.*]] = mhlo.clamp %[[VAL5]], %[[VAL4]], %[[VAL6]] : (tensor<i32>, tensor<?x?xi32>, tensor<i32>) -> tensor<?x?xi32>
-  // CHECK: %[[VAL8:.*]] = mhlo.convert %[[VAL7]] : (tensor<?x?xi32>) -> tensor<?x?xi4>
-  %2 = "mhlo.dot" (%0, %1): (tensor<?x?x!quant.uniform<i4:f32, 1.000000e+00:3>>, tensor<?x?x!quant.uniform<i4:f32, 1.000000e+00:3>>) -> tensor<?x?x!quant.uniform<i4:f32, 1.000000e+00:3>>
-  return
+  // CHECK: mhlo.dot_general
+  // CHECK-SAME: lhs_contracting_dimensions = [1]
+  // CHECK-SAME: rhs_contracting_dimensions = [0]
+  // CHECK-SAME: (tensor<?x?xi4>, tensor<?x?xi4>) -> tensor<?x?xi32>
+  %2 = "mhlo.dot" (%0, %1) : (tensor<?x?x!quant.uniform<i4:f32, 2.000000e+00:3>>, tensor<?x?x!quant.uniform<i4:f32, 1.000000e+00:2>>) -> tensor<?x?x!quant.uniform<i4:f32, 1.000000e+00:3>>
+  %3 = mhlo.uniform_dequantize %2 : (tensor<?x?x!quant.uniform<i4:f32, 1.000000e+00:3>>) -> tensor<?x?xf32>
+  return %3 : tensor<?x?xf32>
 }
 
 // -----
 
-// CHECK-LABEL: func @uniform_quantize_dot_dequantize
-func.func @uniform_quantize_dot_dequantize(%arg0: tensor<2x2xf32>, %arg1: tensor<2x2xf32>) -> tensor<2x2xf32> {
-  %0 = mhlo.uniform_quantize %arg0 : (tensor<2x2xf32>) -> tensor<2x2x!quant.uniform<i8:f32, 2.000000e+00:3>>
-  %1 = mhlo.uniform_quantize %arg1 : (tensor<2x2xf32>) -> tensor<2x2x!quant.uniform<i8:f32, 1.000000e+00:3>>
+// CHECK-LABEL: func @uniform_quantize_dot_dequantize_dynamic_contracting_dim
+func.func @uniform_quantize_dot_dequantize_dynamic_contracting_dim(%arg0: tensor<2x?xf32>, %arg1: tensor<?x2xf32>) -> tensor<2x2xf32> {
+  %0 = mhlo.uniform_quantize %arg0 : (tensor<2x?xf32>) -> tensor<2x?x!quant.uniform<i8:f32, 2.000000e+00:3>>
+  %1 = mhlo.uniform_quantize %arg1 : (tensor<?x2xf32>) -> tensor<?x2x!quant.uniform<i8:f32, 1.000000e+00:3>>
 
   // CHECK: "mhlo.dot_general"
   // CHECK-SAME: lhs_contracting_dimensions = [1]
   // CHECK-SAME: rhs_contracting_dimensions = [0]
-  // CHECK-SAME: (tensor<2x2xi8>, tensor<2x2xi8>) -> tensor<2x2xi32>
-  %2 = "mhlo.dot" (%0, %1) : (tensor<2x2x!quant.uniform<i8:f32, 2.000000e+00:3>>, tensor<2x2x!quant.uniform<i8:f32, 1.000000e+00:3>>) -> tensor<2x2x!quant.uniform<i8:f32, 1.000000e+00:3>>
+  // CHECK-SAME: (tensor<2x?xi8>, tensor<?x2xi8>) -> tensor<2x2xi32>
+
+  // CHECK: mhlo.reduce
+  // CHECK-SAME: applies mhlo.add across dimensions = [1]
+  // CHECK-SAME: (tensor<2x?xi32>, tensor<i32>) -> tensor<2xi32>
+
+  // CHECK: mhlo.reduce
+  // CHECK-SAME: applies mhlo.add across dimensions = [0]
+  // CHECK-SAME: (tensor<?x2xi32>, tensor<i32>) -> tensor<2xi32>
+
+  // CHECK: %[[DYNAMIC_DIM_INIT:.*]] = mhlo.constant dense<1> : tensor<i32>
+  // CHECK: %[[DYNAMIC_DIM:.*]] = "mhlo.get_dimension_size"
+  // CHECK-SAME: {dimension = 0 : i64} : (tensor<?x2xi8>) -> tensor<i32>
+  // CHECK: %[[DYNAMIC_DIM_TOTAL:.*]] = mhlo.multiply
+  // CHECK-SAME: %[[DYNAMIC_DIM_INIT]], %[[DYNAMIC_DIM]]
+  // CHECK: %[[DIMS:.*]] = mhlo.constant dense<9> : tensor<i32>
+  // CHECK: %[[DIMS_1:.*]] = mhlo.multiply %[[DIMS]], %[[DYNAMIC_DIM_TOTAL]]
+  // CHECK: chlo.broadcast_subtract %[[ZP_OFFSET:.*]], %[[DIMS:.*]]
+  %2 = "mhlo.dot" (%0, %1) : (tensor<2x?x!quant.uniform<i8:f32, 2.000000e+00:3>>, tensor<?x2x!quant.uniform<i8:f32, 1.000000e+00:3>>) -> tensor<2x2x!quant.uniform<i8:f32, 1.000000e+00:3>>
   %3 = mhlo.uniform_dequantize %2 : (tensor<2x2x!quant.uniform<i8:f32, 1.000000e+00:3>>) -> tensor<2x2xf32>
   return %3 : tensor<2x2xf32>
 }
 
 // -----
 
-// CHECK-LABEL: func @uniform_quantize_dot_int4
-func.func @uniform_quantize_dot_int4(%arg0: tensor<2x2xf32>, %arg1: tensor<2x2xf32>) {
-  %0 = mhlo.uniform_quantize %arg0 : (tensor<2x2xf32>) -> tensor<2x2x!quant.uniform<i4:f32, 1.000000e+00:3>>
-  %1 = mhlo.uniform_quantize %arg1 : (tensor<2x2xf32>) -> tensor<2x2x!quant.uniform<i4:f32, 1.000000e+00:3>>
+// CHECK-LABEL: func @uniform_quantize_dot_dequantize_dynamic_result_dim
+func.func @uniform_quantize_dot_dequantize_dynamic_result_dim(%arg0: tensor<?x2xf32>, %arg1: tensor<2x?xf32>) -> tensor<?x?xf32> {
+  %0 = mhlo.uniform_quantize %arg0 : (tensor<?x2xf32>) -> tensor<?x2x!quant.uniform<i8:f32, 2.000000e+00:3>>
+  %1 = mhlo.uniform_quantize %arg1 : (tensor<2x?xf32>) -> tensor<2x?x!quant.uniform<i8:f32, 1.000000e+00:3>>
 
   // CHECK: "mhlo.dot_general"
   // CHECK-SAME: lhs_contracting_dimensions = [1]
   // CHECK-SAME: rhs_contracting_dimensions = [0]
-  // CHECK-SAME: (tensor<2x2xi4>, tensor<2x2xi4>) -> tensor<2x2xi32>
-  %2 = "mhlo.dot" (%0, %1): (tensor<2x2x!quant.uniform<i4:f32, 1.000000e+00:3>>, tensor<2x2x!quant.uniform<i4:f32, 1.000000e+00:3>>) -> tensor<2x2x!quant.uniform<i4:f32, 1.000000e+00:3>>
-  return
+  // CHECK-SAME: (tensor<?x2xi8>, tensor<2x?xi8>) -> tensor<?x?xi32>
+
+  // CHECK: mhlo.reduce
+  // CHECK-SAME: applies mhlo.add across dimensions = [1]
+  // CHECK-SAME: (tensor<?x2xi32>, tensor<i32>) -> tensor<?xi32>
+  // CHECK: mhlo.dynamic_broadcast_in_dim
+  // CHECK-SAME: broadcast_dimensions = dense<0>
+  // CHECK-SAME: (tensor<?xi32>, tensor<2xi64>) -> tensor<?x?xi32>
+
+  // CHECK: mhlo.reduce
+  // CHECK-SAME: applies mhlo.add across dimensions = [0]
+  // CHECK-SAME: (tensor<2x?xi32>, tensor<i32>) -> tensor<?xi32>
+  // CHECK: mhlo.dynamic_broadcast_in_dim
+  // CHECK-SAME: broadcast_dimensions = dense<1>
+  // CHECK-SAME: (tensor<?xi32>, tensor<2xi64>) -> tensor<?x?xi32>
+
+
+  %2 = "mhlo.dot" (%0, %1) : (tensor<?x2x!quant.uniform<i8:f32, 2.000000e+00:3>>, tensor<2x?x!quant.uniform<i8:f32, 1.000000e+00:3>>) -> tensor<?x?x!quant.uniform<i8:f32, 1.000000e+00:3>>
+  %3 = mhlo.uniform_dequantize %2 : (tensor<?x?x!quant.uniform<i8:f32, 1.000000e+00:3>>) -> tensor<?x?xf32>
+  return %3 : tensor<?x?xf32>
 }
 
 // -----
@@ -389,8 +467,6 @@ func.func @uniform_quantize_dot_general_dequantize(
   // CHECK-SAME: lhs_contracting_dimensions = [2]
   // CHECK-SAME: rhs_contracting_dimensions = [0]
 
-  // CHECK: %[[ZP_TOTAL_1:.*]] = mhlo.constant dense<0> : tensor<2x5x8xi32>
-
   // Zero point offset contribution from LHS tensor * RHS ZP.
 
   // CHECK: %[[LHS_I32:.*]] = mhlo.convert %[[LHS:.*]] : (tensor<2x5x6xi8>)
@@ -407,7 +483,6 @@ func.func @uniform_quantize_dot_general_dequantize(
   // CHECK: %[[LHS_ZP_BCAST:.*]] = "mhlo.broadcast_in_dim"(%[[LHS_ZP_CONTRIB]])
   // CHECK-SAME: broadcast_dimensions = dense<[0, 1]>
   // CHECK-SAME: (tensor<2x5xi32>) -> tensor<2x5x8xi32>
-  // CHECK: %[[ZP_TOTAL_2:.*]] = mhlo.add %[[ZP_TOTAL_1]], %[[LHS_ZP_BCAST]]
 
   // Zero point offset contribution from RHS tensor * LHS ZP.
 
@@ -425,12 +500,12 @@ func.func @uniform_quantize_dot_general_dequantize(
   // CHECK: %[[RHS_ZP_BCAST:.*]] = "mhlo.broadcast_in_dim"(%[[RHS_ZP_CONTRIB]])
   // CHECK-SAME: broadcast_dimensions = dense<[2, 0]>
   // CHECK-SAME: (tensor<8x2xi32>) -> tensor<2x5x8xi32>
-  // CHECK: %[[ZP_TOTAL_3:.*]] = mhlo.add %[[ZP_TOTAL_2]], %[[RHS_ZP_BCAST]]
+  // CHECK: %[[ZP_TOTAL_1:.*]] = mhlo.add %[[LHS_ZP_BCAST]], %[[RHS_ZP_BCAST]]
 
   // Zero point offset contribution from LHS ZP * RHS ZP.
 
   // CHECK: %[[ZPS:.*]] = mhlo.constant dense<90> : tensor<i32>
-  // CHECK: %[[ZP_TOTAL_4:.*]] = chlo.broadcast_subtract %[[ZP_TOTAL_3]], %[[ZPS]]
+  // CHECK: %[[ZP_TOTAL_2:.*]] = chlo.broadcast_subtract %[[ZP_TOTAL_1]], %[[ZPS]]
   // CHECK-SAME: (tensor<2x5x8xi32>, tensor<i32>) -> tensor<2x5x8xi32>
 
   // Combine dot result with zero point offset and output final result.
@@ -443,17 +518,17 @@ func.func @uniform_quantize_dot_general_dequantize(
   // CHECK: %[[RES_INT:.*]] = mhlo.convert %[[RES_FP_1]]
   // CHECK-SAME: (tensor<2x5x8xf32>) -> tensor<2x5x8xi32>
 
-  // CHECK: %[[ZP_TOTAL_5:.*]] = mhlo.convert %[[ZP_TOTAL_4]]
+  // CHECK: %[[ZP_TOTAL_3:.*]] = mhlo.convert %[[ZP_TOTAL_2]]
   // CHECK-SAME: (tensor<2x5x8xi32>) -> tensor<2x5x8xf32>
-  // CHECK: %[[ZP_TOTAL_6:.*]] = chlo.broadcast_multiply
-  // CHECK-SAME: %[[ZP_TOTAL_5:.*]], %[[COMBINED_SCALE]]
-  // CHECK: %[[ZP_TOTAL_7:.*]] = mhlo.convert %[[ZP_TOTAL_6]]
+  // CHECK: %[[ZP_TOTAL_4:.*]] = chlo.broadcast_multiply
+  // CHECK-SAME: %[[ZP_TOTAL_3:.*]], %[[COMBINED_SCALE]]
+  // CHECK: %[[ZP_TOTAL_5:.*]] = mhlo.convert %[[ZP_TOTAL_4]]
   // CHECK-SAME: (tensor<2x5x8xf32>) -> tensor<2x5x8xi32>
 
   // CHECK: %[[RES_ZP:.*]] = mhlo.constant dense<7> : tensor<i32>
-  // CHECK: %[[ZP_TOTAL_8:.*]] = chlo.broadcast_subtract %[[RES_ZP]], %[[ZP_TOTAL_7]]
+  // CHECK: %[[ZP_TOTAL_6:.*]] = chlo.broadcast_subtract %[[RES_ZP]], %[[ZP_TOTAL_5]]
   // CHECK-SAME: (tensor<i32>, tensor<2x5x8xi32>) -> tensor<2x5x8xi32>
-  // CHECK: mhlo.add %[[RES_INT]], %[[ZP_TOTAL_8]]
+  // CHECK: chlo.broadcast_add %[[RES_INT]], %[[ZP_TOTAL_6]]
 
   %2 = "mhlo.dot_general" (%0, %1) {
     dot_dimension_numbers = #mhlo.dot<
@@ -473,6 +548,87 @@ func.func @uniform_quantize_dot_general_dequantize(
 
 // -----
 
+// CHECK-LABEL: func @uniform_quantize_dot_general_dequantize_combined_scale_1
+func.func @uniform_quantize_dot_general_dequantize_combined_scale_1(
+    %arg0: tensor<2x5x6xf32>, %arg1: tensor<6x8x2xf32>) -> tensor<2x5x8xf32> {
+  %0 = mhlo.uniform_quantize %arg0 : (tensor<2x5x6xf32>)
+      -> tensor<2x5x6x!quant.uniform<i8:f32, 2.000000e+00:3>>
+  %1 = mhlo.uniform_quantize %arg1 : (tensor<6x8x2xf32>)
+      -> tensor<6x8x2x!quant.uniform<i8:f32, 3.000000e+00:5>>
+
+  // CHECK: %[[DOT_RES:.*]] = "mhlo.dot_general"
+  // CHECK-SAME: lhs_batching_dimensions = [0]
+  // CHECK-SAME: rhs_batching_dimensions = [2]
+  // CHECK-SAME: lhs_contracting_dimensions = [2]
+  // CHECK-SAME: rhs_contracting_dimensions = [0]
+
+  // Zero point offset contribution from LHS tensor * RHS ZP.
+
+  // CHECK: %[[LHS_I32:.*]] = mhlo.convert %[[LHS:.*]] : (tensor<2x5x6xi8>)
+  // CHECK-SAME: -> tensor<2x5x6xi32>
+  // CHECK: %[[LHS_REDUCE_INIT:.*]] = mhlo.constant dense<0> : tensor<i32>
+  // CHECK: %[[LHS_REDUCE:.*]] = mhlo.reduce(%[[LHS_I32]] init: %[[LHS_REDUCE_INIT]])
+  // CHECK-SAME: applies mhlo.add across dimensions = [2]
+  // CHECK-SAME: (tensor<2x5x6xi32>, tensor<i32>)
+  // CHECK-SAME: -> tensor<2x5xi32>
+  // CHECK: %[[RHS_ZP:.*]] = mhlo.constant dense<5> : tensor<i32>
+  // CHECK: %[[LHS_ZP_CONTRIB:.*]] = chlo.broadcast_multiply
+  // CHECK-SAME: %[[LHS_REDUCE]], %[[RHS_ZP]] :
+  // CHECK-SAME: (tensor<2x5xi32>, tensor<i32>) -> tensor<2x5xi32>
+  // CHECK: %[[LHS_ZP_BCAST:.*]] = "mhlo.broadcast_in_dim"(%[[LHS_ZP_CONTRIB]])
+  // CHECK-SAME: broadcast_dimensions = dense<[0, 1]>
+  // CHECK-SAME: (tensor<2x5xi32>) -> tensor<2x5x8xi32>
+
+  // Zero point offset contribution from RHS tensor * LHS ZP.
+
+  // CHECK: %[[RHS_I32:.*]] = mhlo.convert %[[RHS:.*]] : (tensor<6x8x2xi8>)
+  // CHECK-SAME: -> tensor<6x8x2xi32>
+  // CHECK: %[[RHS_REDUCE_INIT:.*]] = mhlo.constant dense<0> : tensor<i32>
+  // CHECK: %[[RHS_REDUCE:.*]] = mhlo.reduce(%[[RHS_I32]] init: %[[RHS_REDUCE_INIT]])
+  // CHECK-SAME: applies mhlo.add across dimensions = [0]
+  // CHECK-SAME: (tensor<6x8x2xi32>, tensor<i32>)
+  // CHECK-SAME: -> tensor<8x2xi32>
+  // CHECK: %[[RHS_ZP:.*]] = mhlo.constant dense<3> : tensor<i32>
+  // CHECK: %[[RHS_ZP_CONTRIB:.*]] = chlo.broadcast_multiply
+  // CHECK-SAME: %[[RHS_REDUCE]], %[[RHS_ZP]] :
+  // CHECK-SAME: (tensor<8x2xi32>, tensor<i32>) -> tensor<8x2xi32>
+  // CHECK: %[[RHS_ZP_BCAST:.*]] = "mhlo.broadcast_in_dim"(%[[RHS_ZP_CONTRIB]])
+  // CHECK-SAME: broadcast_dimensions = dense<[2, 0]>
+  // CHECK-SAME: (tensor<8x2xi32>) -> tensor<2x5x8xi32>
+  // CHECK: %[[ZP_TOTAL_1:.*]] = mhlo.add %[[LHS_ZP_BCAST]], %[[RHS_ZP_BCAST]]
+
+  // Zero point offset contribution from LHS ZP * RHS ZP.
+
+  // CHECK: %[[ZPS:.*]] = mhlo.constant dense<90> : tensor<i32>
+  // CHECK: %[[ZP_TOTAL_2:.*]] = chlo.broadcast_subtract %[[ZP_TOTAL_1]], %[[ZPS]]
+  // CHECK-SAME: (tensor<2x5x8xi32>, tensor<i32>) -> tensor<2x5x8xi32>
+
+  // Combine dot result with zero point offset and output final result.
+  // Do not multiply by combined scale since it is 1.0 and thus no-op.
+
+  // CHECK: %[[RES_ZP:.*]] = mhlo.constant dense<7> : tensor<i32>
+  // CHECK: %[[ZP_TOTAL_3:.*]] = chlo.broadcast_subtract %[[RES_ZP]], %[[ZP_TOTAL_2]]
+  // CHECK-SAME: (tensor<i32>, tensor<2x5x8xi32>) -> tensor<2x5x8xi32>
+  // CHECK: chlo.broadcast_add %[[DOT_RES]], %[[ZP_TOTAL_3]]
+
+  %2 = "mhlo.dot_general" (%0, %1) {
+    dot_dimension_numbers = #mhlo.dot<
+      lhs_batching_dimensions = [0],
+      rhs_batching_dimensions = [2],
+      lhs_contracting_dimensions = [2],
+      rhs_contracting_dimensions = [0]
+    >} : (
+      tensor<2x5x6x!quant.uniform<i8:f32, 2.000000e+00:3>>,
+      tensor<6x8x2x!quant.uniform<i8:f32, 3.000000e+00:5>>
+    ) -> tensor<2x5x8x!quant.uniform<i8:f32, 6.000000e+00:7>>
+  %3 = mhlo.uniform_dequantize %2 : (
+    tensor<2x5x8x!quant.uniform<i8:f32, 6.000000e+00:7>>
+  ) -> tensor<2x5x8xf32>
+  return %3 : tensor<2x5x8xf32>
+}
+
+// -----
+
 // CHECK-LABEL: func @uniform_quantize_dot_general_dequantize_multiple_batching_dims
 func.func @uniform_quantize_dot_general_dequantize_multiple_batching_dims(
     %arg0: tensor<2x5x3x7x6xf32>, %arg1: tensor<6x2x7x8x3xf32>) -> tensor<2x3x5x8xf32> {
@@ -487,8 +643,6 @@ func.func @uniform_quantize_dot_general_dequantize_multiple_batching_dims(
   // CHECK-SAME: lhs_contracting_dimensions = [4, 3]
   // CHECK-SAME: rhs_contracting_dimensions = [0, 2]>}
 
-  // CHECK: %[[ZP_TOTAL_1:.*]] = mhlo.constant dense<0> : tensor<2x3x5x8xi32>
-
   // Zero point offset contribution from LHS tensor * RHS ZP.
 
   // CHECK: %[[LHS_I32:.*]] = mhlo.convert %[[LHS:.*]] : (tensor<2x5x3x7x6xi8>)
@@ -505,7 +659,6 @@ func.func @uniform_quantize_dot_general_dequantize_multiple_batching_dims(
   // CHECK: %[[LHS_ZP_BCAST:.*]] = "mhlo.broadcast_in_dim"(%[[LHS_ZP_CONTRIB]])
   // CHECK-SAME: broadcast_dimensions = dense<[0, 2, 1]>
   // CHECK-SAME: (tensor<2x5x3xi32>) -> tensor<2x3x5x8xi32>
-  // CHECK: %[[ZP_TOTAL_2:.*]] = mhlo.add %[[ZP_TOTAL_1]], %[[LHS_ZP_BCAST]]
 
   // Zero point offset contribution from RHS tensor * LHS ZP.
 
@@ -523,12 +676,12 @@ func.func @uniform_quantize_dot_general_dequantize_multiple_batching_dims(
   // CHECK: %[[RHS_ZP_BCAST:.*]] = "mhlo.broadcast_in_dim"(%[[RHS_ZP_CONTRIB]])
   // CHECK-SAME: broadcast_dimensions = dense<[0, 3, 1]>
   // CHECK-SAME: (tensor<2x8x3xi32>) -> tensor<2x3x5x8xi32>
-  // CHECK: %[[ZP_TOTAL_3:.*]] = mhlo.add %[[ZP_TOTAL_2]], %[[RHS_ZP_BCAST]]
+  // CHECK: %[[ZP_TOTAL_1:.*]] = mhlo.add %[[LHS_ZP_BCAST]], %[[RHS_ZP_BCAST]]
 
   // Zero point offset contribution from LHS ZP * RHS ZP.
 
   // CHECK: %[[ZPS:.*]] = mhlo.constant dense<630> : tensor<i32>
-  // CHECK: %[[ZP_TOTAL_4:.*]] = chlo.broadcast_subtract %[[ZP_TOTAL_3]], %[[ZPS]]
+  // CHECK: %[[ZP_TOTAL_2:.*]] = chlo.broadcast_subtract %[[ZP_TOTAL_1]], %[[ZPS]]
   // CHECK-SAME: (tensor<2x3x5x8xi32>, tensor<i32>) -> tensor<2x3x5x8xi32>
 
   // Combine dot result with zero point offset and output final result.
@@ -541,18 +694,18 @@ func.func @uniform_quantize_dot_general_dequantize_multiple_batching_dims(
   // CHECK: %[[RES_INT:.*]] = mhlo.convert %[[RES_FP_1]]
   // CHECK-SAME: (tensor<2x3x5x8xf32>) -> tensor<2x3x5x8xi32>
 
-  // CHECK: %[[ZP_TOTAL_5:.*]] = mhlo.convert %[[ZP_TOTAL_4]]
+  // CHECK: %[[ZP_TOTAL_3:.*]] = mhlo.convert %[[ZP_TOTAL_2]]
   // CHECK-SAME: (tensor<2x3x5x8xi32>) -> tensor<2x3x5x8xf32>
-  // CHECK: %[[ZP_TOTAL_6:.*]] = chlo.broadcast_multiply
-  // CHECK-SAME: %[[ZP_TOTAL_5:.*]], %[[COMBINED_SCALE]]
-  // CHECK: %[[ZP_TOTAL_7:.*]] = mhlo.convert %[[ZP_TOTAL_6]]
+  // CHECK: %[[ZP_TOTAL_4:.*]] = chlo.broadcast_multiply
+  // CHECK-SAME: %[[ZP_TOTAL_3:.*]], %[[COMBINED_SCALE]]
+  // CHECK: %[[ZP_TOTAL_5:.*]] = mhlo.convert %[[ZP_TOTAL_4]]
   // CHECK-SAME: (tensor<2x3x5x8xf32>) -> tensor<2x3x5x8xi32>
 
   // CHECK: %[[RES_ZP:.*]] = mhlo.constant dense<7> : tensor<i32>
-  // CHECK: %[[ZP_TOTAL_8:.*]] = chlo.broadcast_subtract %[[RES_ZP]], %[[ZP_TOTAL_7]]
+  // CHECK: %[[ZP_TOTAL_6:.*]] = chlo.broadcast_subtract %[[RES_ZP]], %[[ZP_TOTAL_5]]
   // CHECK-SAME: (tensor<i32>, tensor<2x3x5x8xi32>) -> tensor<2x3x5x8xi32>
-  // CHECK: mhlo.add %[[RES_INT]], %[[ZP_TOTAL_8]]
-  
+  // CHECK: chlo.broadcast_add %[[RES_INT]], %[[ZP_TOTAL_6]]
+
   %2 = "mhlo.dot_general" (%0, %1) {
     dot_dimension_numbers = #mhlo.dot<
       lhs_batching_dimensions = [0, 2],
@@ -585,8 +738,6 @@ func.func @uniform_quantize_dot_general_dequantize_rhs_zero_zp(
   // CHECK-SAME: lhs_contracting_dimensions = [2]
   // CHECK-SAME: rhs_contracting_dimensions = [0]
 
-  // CHECK: %[[ZP_TOTAL_1:.*]] = mhlo.constant dense<0> : tensor<2x5x8xi32>
-
   // Zero point offset contribution from LHS tensor * RHS ZP is 0 and skipped.
 
   // Zero point offset contribution from RHS tensor * LHS ZP.
@@ -605,7 +756,6 @@ func.func @uniform_quantize_dot_general_dequantize_rhs_zero_zp(
   // CHECK: %[[RHS_ZP_BCAST:.*]] = "mhlo.broadcast_in_dim"(%[[RHS_ZP_CONTRIB]])
   // CHECK-SAME: broadcast_dimensions = dense<[2, 0]>
   // CHECK-SAME: (tensor<8x2xi32>) -> tensor<2x5x8xi32>
-  // CHECK: %[[ZP_TOTAL_2:.*]] = mhlo.add %[[ZP_TOTAL_1]], %[[RHS_ZP_BCAST]]
 
   // Zero point offset contribution from LHS ZP * RHS ZP is 0 and skipped.
 
@@ -619,17 +769,17 @@ func.func @uniform_quantize_dot_general_dequantize_rhs_zero_zp(
   // CHECK: %[[RES_INT:.*]] = mhlo.convert %[[RES_FP_1]]
   // CHECK-SAME: (tensor<2x5x8xf32>) -> tensor<2x5x8xi32>
 
-  // CHECK: %[[ZP_TOTAL_3:.*]] = mhlo.convert %[[ZP_TOTAL_2]]
+  // CHECK: %[[ZP_TOTAL_1:.*]] = mhlo.convert %[[RHS_ZP_BCAST]]
   // CHECK-SAME: (tensor<2x5x8xi32>) -> tensor<2x5x8xf32>
-  // CHECK: %[[ZP_TOTAL_4:.*]] = chlo.broadcast_multiply
-  // CHECK-SAME: %[[ZP_TOTAL_3:.*]], %[[COMBINED_SCALE]]
-  // CHECK: %[[ZP_TOTAL_5:.*]] = mhlo.convert %[[ZP_TOTAL_4]]
+  // CHECK: %[[ZP_TOTAL_2:.*]] = chlo.broadcast_multiply
+  // CHECK-SAME: %[[ZP_TOTAL_1:.*]], %[[COMBINED_SCALE]]
+  // CHECK: %[[ZP_TOTAL_3:.*]] = mhlo.convert %[[ZP_TOTAL_2]]
   // CHECK-SAME: (tensor<2x5x8xf32>) -> tensor<2x5x8xi32>
 
   // CHECK: %[[RES_ZP:.*]] = mhlo.constant dense<7> : tensor<i32>
-  // CHECK: %[[ZP_TOTAL_6:.*]] = chlo.broadcast_subtract %[[RES_ZP]], %[[ZP_TOTAL_5]]
+  // CHECK: %[[ZP_TOTAL_4:.*]] = chlo.broadcast_subtract %[[RES_ZP]], %[[ZP_TOTAL_3]]
   // CHECK-SAME: (tensor<i32>, tensor<2x5x8xi32>) -> tensor<2x5x8xi32>
-  // CHECK: mhlo.add %[[RES_INT]], %[[ZP_TOTAL_6]]
+  // CHECK: chlo.broadcast_add %[[RES_INT]], %[[ZP_TOTAL_4]]
 
   %2 = "mhlo.dot_general" (%0, %1) {
     dot_dimension_numbers = #mhlo.dot<
@@ -665,8 +815,6 @@ func.func @uniform_quantize_dot_general_dequantize_zero_zp(
 
   // Both LHS/RHS have zero zp. No zp contribution.
 
-  // CHECK-DAG: %[[ZP_CONTRIB:.*]] = mhlo.constant dense<0> : tensor<2x5x8xi32>
-
   // CHECK-DAG: %[[COMBINED_SCALE:.*]] = mhlo.constant dense<1.500000e+00> : tensor<f32>
   // CHECK: %[[RES_FP:.*]] = mhlo.convert %[[DOT_RES]] :
   // CHECK-SAME: (tensor<2x5x8xi32>) -> tensor<2x5x8xf32>
@@ -675,17 +823,8 @@ func.func @uniform_quantize_dot_general_dequantize_zero_zp(
   // CHECK: %[[RES_INT:.*]] = mhlo.convert %[[RES_FP_1]]
   // CHECK-SAME: (tensor<2x5x8xf32>) -> tensor<2x5x8xi32>
 
-  // CHECK: %[[ZP_TOTAL_1:.*]] = mhlo.convert %[[ZP_CONTRIB]]
-  // CHECK-SAME: (tensor<2x5x8xi32>) -> tensor<2x5x8xf32>
-  // CHECK: %[[ZP_TOTAL_2:.*]] = chlo.broadcast_multiply
-  // CHECK-SAME: %[[ZP_TOTAL_1:.*]], %[[COMBINED_SCALE]]
-  // CHECK: %[[ZP_TOTAL_3:.*]] = mhlo.convert %[[ZP_TOTAL_2]]
-  // CHECK-SAME: (tensor<2x5x8xf32>) -> tensor<2x5x8xi32>
-
   // CHECK: %[[RES_ZP:.*]] = mhlo.constant dense<7> : tensor<i32>
-  // CHECK: %[[ZP_TOTAL_4:.*]] = chlo.broadcast_subtract %[[RES_ZP]], %[[ZP_TOTAL_3]]
-  // CHECK-SAME: (tensor<i32>, tensor<2x5x8xi32>) -> tensor<2x5x8xi32>
-  // CHECK: mhlo.add %[[RES_INT]], %[[ZP_TOTAL_4]]
+  // CHECK: chlo.broadcast_add %[[RES_INT]], %[[RES_ZP]]
 
   %2 = "mhlo.dot_general" (%0, %1) {
     dot_dimension_numbers = #mhlo.dot<
@@ -705,6 +844,123 @@ func.func @uniform_quantize_dot_general_dequantize_zero_zp(
 
 // -----
 
+// CHECK-LABEL: func @uniform_quantize_dot_general_dequantize_multiple_dynamic_dims
+func.func @uniform_quantize_dot_general_dequantize_multiple_dynamic_dims(
+    %arg0: tensor<?x?x3x?x6xf32>, %arg1: tensor<6x?x?x8x3xf32>) -> tensor<?x3x?x8xf32> {
+  %0 = mhlo.uniform_quantize %arg0 : (tensor<?x?x3x?x6xf32>)
+      -> tensor<?x?x3x?x6x!quant.uniform<i8:f32, 2.000000e+00:3>>
+  %1 = mhlo.uniform_quantize %arg1 : (tensor<6x?x?x8x3xf32>)
+      -> tensor<6x?x?x8x3x!quant.uniform<i8:f32, 1.000000e+00:5>>
+
+  // CHECK: %[[DOT_RES:.*]] = "mhlo.dot_general"
+  // CHECK-SAME: lhs_batching_dimensions = [0, 2]
+  // CHECK-SAME: rhs_batching_dimensions = [1, 4]
+  // CHECK-SAME: lhs_contracting_dimensions = [4, 3]
+  // CHECK-SAME: rhs_contracting_dimensions = [0, 2]>}
+
+  // Zero point offset contribution from LHS tensor * RHS ZP.
+
+  // CHECK: %[[LHS_I32:.*]] = mhlo.convert %[[LHS:.*]] : (tensor<?x?x3x?x6xi8>)
+  // CHECK-SAME: -> tensor<?x?x3x?x6xi32>
+  // CHECK: %[[LHS_REDUCE_INIT:.*]] = mhlo.constant dense<0> : tensor<i32>
+  // CHECK: %[[LHS_REDUCE:.*]] = mhlo.reduce(%[[LHS_I32]] init: %[[LHS_REDUCE_INIT]])
+  // CHECK-SAME: applies mhlo.add across dimensions = [4, 3]
+  // CHECK-SAME: (tensor<?x?x3x?x6xi32>, tensor<i32>)
+  // CHECK-SAME: -> tensor<?x?x3xi32>
+  // CHECK: %[[RHS_ZP:.*]] = mhlo.constant dense<5> : tensor<i32>
+  // CHECK: %[[LHS_ZP_CONTRIB:.*]] = chlo.broadcast_multiply
+  // CHECK-SAME: %[[LHS_REDUCE]], %[[RHS_ZP]] :
+  // CHECK-SAME: (tensor<?x?x3xi32>, tensor<i32>) -> tensor<?x?x3xi32>
+
+  // Calculate output dynamic dims.
+  // CHECK: %[[DIM_1_1:.*]] = "mhlo.get_dimension_size"(%[[LHS]])
+  // CHECK-SAME: {dimension = 0 : i64}
+  // CHECK: %[[DIM_1_2:.*]] = mhlo.convert %[[DIM_1_1]] : (tensor<i32>) -> tensor<i64>
+  // CHECK: %[[DIM_1:.*]] = mhlo.reshape %[[DIM_1_2]] : (tensor<i64>) -> tensor<1xi64>
+  // CHECK: %[[DIM_2:.*]] = mhlo.constant dense<3> : tensor<1xi64>
+  // CHECK: %[[DIM_3_1:.*]] = "mhlo.get_dimension_size"(%[[LHS]])
+  // CHECK-SAME: {dimension = 1 : i64}
+  // CHECK: %[[DIM_3_2:.*]] = mhlo.convert %[[DIM_3_1]] : (tensor<i32>) -> tensor<i64>
+  // CHECK: %[[DIM_3:.*]] = mhlo.reshape %[[DIM_3_2]] : (tensor<i64>) -> tensor<1xi64>
+  // CHECK: %[[DIM_4:.*]] = mhlo.constant dense<8> : tensor<1xi64>
+  // CHECK: %[[OUTPUT_DIMS:.*]] = "mhlo.concatenate"
+  // CHECK-SAME: %[[DIM_1]], %[[DIM_2]], %[[DIM_3]], %[[DIM_4]]
+
+  // CHECK: %[[LHS_ZP_BCAST:.*]] = "mhlo.dynamic_broadcast_in_dim"
+  // CHECK-SAME: (%[[LHS_ZP_CONTRIB]], %[[OUTPUT_DIMS]])
+  // CHECK-SAME: broadcast_dimensions = dense<[0, 2, 1]>
+  // CHECK-SAME: (tensor<?x?x3xi32>, tensor<4xi64>) -> tensor<?x3x?x8xi32>
+
+  // Zero point offset contribution from RHS tensor * LHS ZP.
+
+  // CHECK: %[[RHS_I32:.*]] = mhlo.convert %[[RHS:.*]] : (tensor<6x?x?x8x3xi8>)
+  // CHECK-SAME: -> tensor<6x?x?x8x3xi32>
+  // CHECK: %[[RHS_REDUCE_INIT:.*]] = mhlo.constant dense<0> : tensor<i32>
+  // CHECK: %[[RHS_REDUCE:.*]] = mhlo.reduce(%[[RHS_I32]] init: %[[RHS_REDUCE_INIT]])
+  // CHECK-SAME: applies mhlo.add across dimensions = [0, 2]
+  // CHECK-SAME: (tensor<6x?x?x8x3xi32>, tensor<i32>)
+  // CHECK-SAME: -> tensor<?x8x3xi32>
+  // CHECK: %[[RHS_ZP:.*]] = mhlo.constant dense<3> : tensor<i32>
+  // CHECK: %[[RHS_ZP_CONTRIB:.*]] = chlo.broadcast_multiply
+  // CHECK-SAME: %[[RHS_REDUCE]], %[[RHS_ZP]] :
+  // CHECK-SAME: (tensor<?x8x3xi32>, tensor<i32>) -> tensor<?x8x3xi32>
+
+  // CHECK: %[[RHS_ZP_BCAST:.*]] = "mhlo.dynamic_broadcast_in_dim"
+  // CHECK-SAME: (%[[RHS_ZP_CONTRIB]], %[[OUTPUT_DIMS]])
+  // CHECK-SAME: broadcast_dimensions = dense<[0, 3, 1]>
+  // CHECK-SAME: (tensor<?x8x3xi32>, tensor<4xi64>) -> tensor<?x3x?x8xi32>
+  // CHECK: %[[ZP_TOTAL_1:.*]] = mhlo.add %[[LHS_ZP_BCAST]], %[[RHS_ZP_BCAST]]
+
+  // Zero point offset contribution from LHS ZP * RHS ZP.
+
+  // CHECK: %[[ZPS_INIT:.*]] = mhlo.constant dense<1> : tensor<i32>
+  // CHECK: %[[DYN_DIM:.*]] = "mhlo.get_dimension_size"(%[[RHS]])
+  // CHECK: %[[ZPS_1:.*]] = mhlo.multiply %[[ZPS_INIT]], %[[DYN_DIM]]
+  // CHECK: %[[STATIC_DIM:.*]] = mhlo.constant dense<90> : tensor<i32>
+  // CHECK: %[[ZPS:.*]] = mhlo.multiply %[[STATIC_DIM]], %[[ZPS_1]]
+  // CHECK: %[[ZP_TOTAL_2:.*]] = chlo.broadcast_subtract %[[ZP_TOTAL_1]], %[[ZPS]]
+  // CHECK-SAME: (tensor<?x3x?x8xi32>, tensor<i32>) -> tensor<?x3x?x8xi32>
+
+  // Combine dot result with zero point offset and output final result.
+
+  // CHECK: %[[COMBINED_SCALE:.*]] = mhlo.constant dense<5.000000e-01> : tensor<f32>
+  // CHECK: %[[RES_FP:.*]] = mhlo.convert %[[DOT_RES]]
+  // CHECK-SAME: (tensor<?x3x?x8xi32>) -> tensor<?x3x?x8xf32>
+  // CHECK: %[[RES_FP_1:.*]] = chlo.broadcast_multiply
+  // CHECK-SAME: %[[RES_FP:.*]], %[[COMBINED_SCALE]]
+  // CHECK: %[[RES_INT:.*]] = mhlo.convert %[[RES_FP_1]]
+  // CHECK-SAME: (tensor<?x3x?x8xf32>) -> tensor<?x3x?x8xi32>
+
+  // CHECK: %[[ZP_TOTAL_3:.*]] = mhlo.convert %[[ZP_TOTAL_2]]
+  // CHECK-SAME: (tensor<?x3x?x8xi32>) -> tensor<?x3x?x8xf32>
+  // CHECK: %[[ZP_TOTAL_4:.*]] = chlo.broadcast_multiply
+  // CHECK-SAME: %[[ZP_TOTAL_3:.*]], %[[COMBINED_SCALE]]
+  // CHECK: %[[ZP_TOTAL_5:.*]] = mhlo.convert %[[ZP_TOTAL_4]]
+  // CHECK-SAME: (tensor<?x3x?x8xf32>) -> tensor<?x3x?x8xi32>
+
+  // CHECK: %[[RES_ZP:.*]] = mhlo.constant dense<7> : tensor<i32>
+  // CHECK: %[[ZP_TOTAL_6:.*]] = chlo.broadcast_subtract %[[RES_ZP]], %[[ZP_TOTAL_5]]
+  // CHECK-SAME: (tensor<i32>, tensor<?x3x?x8xi32>) -> tensor<?x3x?x8xi32>
+  // CHECK: chlo.broadcast_add %[[RES_INT]], %[[ZP_TOTAL_6]]
+
+  %2 = "mhlo.dot_general" (%0, %1) {
+    dot_dimension_numbers = #mhlo.dot<
+      lhs_batching_dimensions = [0, 2],
+      rhs_batching_dimensions = [1, 4],
+      lhs_contracting_dimensions = [4, 3],
+      rhs_contracting_dimensions = [0, 2]
+    >} : (
+      tensor<?x?x3x?x6x!quant.uniform<i8:f32, 2.000000e+00:3>>,
+      tensor<6x?x?x8x3x!quant.uniform<i8:f32, 1.000000e+00:5>>
+    ) -> tensor<?x3x?x8x!quant.uniform<i8:f32, 4.000000e+00:7>>
+  %3 = mhlo.uniform_dequantize %2 : (
+    tensor<?x3x?x8x!quant.uniform<i8:f32, 4.000000e+00:7>>
+  ) -> tensor<?x3x?x8xf32>
+  return %3 : tensor<?x3x?x8xf32>
+}
+
+// -----
+
 // CHECK-LABEL: func @uniform_quantized_convolution
 func.func @uniform_quantized_convolution(%arg0: tensor<?x?x?x?xf32>, %arg1: tensor<?x?x?x?xf32>) {
   %0 = mhlo.uniform_quantize %arg0 : (tensor<?x?x?x?xf32>) -> tensor<?x?x?x?x!quant.uniform<i8:f32, 2.000000e+00:4>>

From 676af55749f95c486c6fce5b8b550c5412e28a5b Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 25 Sep 2023 12:49:49 -0700
Subject: [PATCH 232/567] Use MLIR op lowering of FusedBatchNormV3 and
 FusedBatchNormGradV3 for tf2xla

PiperOrigin-RevId: 568299087
---
 tensorflow/compiler/mlir/tf2xla/transforms/legalize_tf.cc | 3 ++-
 tensorflow/compiler/tf2xla/kernels/BUILD                  | 1 +
 tensorflow/compiler/tf2xla/kernels/batch_norm_op.cc       | 5 +++--
 3 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/tensorflow/compiler/mlir/tf2xla/transforms/legalize_tf.cc b/tensorflow/compiler/mlir/tf2xla/transforms/legalize_tf.cc
index 95e6bd8be37c29..d4c0703e5b0093 100644
--- a/tensorflow/compiler/mlir/tf2xla/transforms/legalize_tf.cc
+++ b/tensorflow/compiler/mlir/tf2xla/transforms/legalize_tf.cc
@@ -2304,7 +2304,8 @@ class ConvertFusedBatchNormBase : public OpRewritePattern<FusedBatchNormOpT> {
       // Apply Bessel's correction on the variance.
       int total_input_size = bn_train_input_type_tensor.getNumElements();
       int total_scale_size = scale_type_tensor.getNumElements();
-      int sample_size = total_input_size / total_scale_size;
+      int sample_size =
+          total_scale_size > 0 ? total_input_size / total_scale_size : 0;
       int sample_size_minus_one = std::max(1, sample_size - 1);
       double factor = static_cast<double>(sample_size) /
                       static_cast<double>(sample_size_minus_one);
diff --git a/tensorflow/compiler/tf2xla/kernels/BUILD b/tensorflow/compiler/tf2xla/kernels/BUILD
index a3b082939c1dc8..e5ff2ae36c0c42 100644
--- a/tensorflow/compiler/tf2xla/kernels/BUILD
+++ b/tensorflow/compiler/tf2xla/kernels/BUILD
@@ -3732,6 +3732,7 @@ tf_kernel_library(
         ":while_op",
         ":xla_call_module_op",
         "//tensorflow/compiler/tf2xla:common",
+        "//tensorflow/compiler/tf2xla:mlir_xla_op_kernel",
         "//tensorflow/compiler/tf2xla:xla_compilation_device",
         "//tensorflow/compiler/tf2xla:xla_compiler",
         "//tensorflow/compiler/tf2xla:xla_context",
diff --git a/tensorflow/compiler/tf2xla/kernels/batch_norm_op.cc b/tensorflow/compiler/tf2xla/kernels/batch_norm_op.cc
index f361d39fee92c8..b1878892a2cf79 100644
--- a/tensorflow/compiler/tf2xla/kernels/batch_norm_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/batch_norm_op.cc
@@ -20,6 +20,7 @@ limitations under the License.
 #include <vector>
 
 #include "tensorflow/compiler/tf2xla/kernels/relu_op.h"
+#include "tensorflow/compiler/tf2xla/mlir_xla_op_kernel.h"
 #include "tensorflow/compiler/tf2xla/type_util.h"
 #include "tensorflow/compiler/tf2xla/xla_helpers.h"
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
@@ -237,7 +238,7 @@ class FusedBatchNormOpEx : public FusedBatchNormOp {
 
 REGISTER_XLA_OP(Name("FusedBatchNorm"), FusedBatchNormOp);
 REGISTER_XLA_OP(Name("FusedBatchNormV2"), FusedBatchNormOp);
-REGISTER_XLA_OP(Name("FusedBatchNormV3"), FusedBatchNormOpV3);
+REGISTER_XLA_OP(Name("FusedBatchNormV3"), MlirXlaOpKernel);
 REGISTER_XLA_OP(Name("_FusedBatchNormEx"), FusedBatchNormOpEx);
 
 class FusedBatchNormGradOp : public XlaOpKernel {
@@ -354,7 +355,7 @@ class FusedBatchNormGradOp : public XlaOpKernel {
 
 REGISTER_XLA_OP(Name("FusedBatchNormGrad"), FusedBatchNormGradOp);
 REGISTER_XLA_OP(Name("FusedBatchNormGradV2"), FusedBatchNormGradOp);
-REGISTER_XLA_OP(Name("FusedBatchNormGradV3"), FusedBatchNormGradOp);
+REGISTER_XLA_OP(Name("FusedBatchNormGradV3"), MlirXlaOpKernel);
 
 }  // namespace
 }  // namespace tensorflow

From 2991aeb60931800b8103d6a89bc5d3b669f9ee73 Mon Sep 17 00:00:00 2001
From: RJ Ascani <rjascani@google.com>
Date: Mon, 25 Sep 2023 13:08:44 -0700
Subject: [PATCH 233/567] Use similarly signed types for comparison

In the downstream TFLM project, comparing int and size_t types results in sign-compare compiler warnings.

PiperOrigin-RevId: 568304134
---
 tensorflow/lite/kernels/internal/reference/sub.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tensorflow/lite/kernels/internal/reference/sub.h b/tensorflow/lite/kernels/internal/reference/sub.h
index 862bee149a5a69..1a74aebeafc2f1 100644
--- a/tensorflow/lite/kernels/internal/reference/sub.h
+++ b/tensorflow/lite/kernels/internal/reference/sub.h
@@ -36,7 +36,7 @@ struct SubImpl {
   static void BroadcastInput1(const ArithmeticParams& params,
                               const T* input1_data, const T* input2_data,
                               T* output_data, size_t size, F binary_func) {
-    for (int c = 0; c < size; ++c) {
+    for (size_t c = 0; c < size; ++c) {
       output_data[c] = binary_func(input1_data[0], input2_data[c], params);
     }
   }
@@ -45,7 +45,7 @@ struct SubImpl {
   static void BroadcastInput2(const ArithmeticParams& params,
                               const T* input1_data, const T* input2_data,
                               T* output_data, size_t size, F binary_func) {
-    for (int c = 0; c < size; ++c) {
+    for (size_t c = 0; c < size; ++c) {
       output_data[c] = binary_func(input1_data[c], input2_data[0], params);
     }
   }
@@ -54,7 +54,7 @@ struct SubImpl {
   static void ElementWise(const ArithmeticParams& params, const T* input1_data,
                           const T* input2_data, T* output_data, size_t size,
                           F binary_func) {
-    for (int c = 0; c < size; ++c) {
+    for (size_t c = 0; c < size; ++c) {
       output_data[c] = binary_func(input1_data[c], input2_data[c], params);
     }
   }
@@ -146,7 +146,7 @@ inline void BroadcastSubRecursiveDimensions(
     size_t* compressed_input1_stride, size_t* compressed_input2_stride,
     size_t* compressed_output_shape, F binary_func) {
   if (dimension > 0) {
-    for (int c = 0; c < compressed_output_shape[dimension]; ++c) {
+    for (size_t c = 0; c < compressed_output_shape[dimension]; ++c) {
       size_t input1_offset_c = *input1_offset_p;
       size_t input2_offset_c = *input2_offset_p;
       BroadcastSubRecursiveDimensions(

From 731a6beacd6ca2dfe3a24bbe47c18eb130307a3f Mon Sep 17 00:00:00 2001
From: Peter Hawkins <phawkins@google.com>
Date: Mon, 25 Sep 2023 13:24:59 -0700
Subject: [PATCH 234/567] [TSL:CUDA] Improve the "I didn't find this symbol"
 stub for cudaGetErrorString() and cudaGetErrorName().

These functions return strings; return a generic error here instead of returning an invalid integer value.

Fixes crash in JAX CI.

PiperOrigin-RevId: 568308538
---
 .../xla/third_party/tsl/tsl/cuda/BUILD.bazel  |  1 +
 .../third_party/tsl/tsl/cuda/cudart_stub.cc   | 24 +++++++++++++++++--
 2 files changed, 23 insertions(+), 2 deletions(-)

diff --git a/third_party/xla/third_party/tsl/tsl/cuda/BUILD.bazel b/third_party/xla/third_party/tsl/tsl/cuda/BUILD.bazel
index 71bed13a8bae96..10195390362436 100644
--- a/third_party/xla/third_party/tsl/tsl/cuda/BUILD.bazel
+++ b/third_party/xla/third_party/tsl/tsl/cuda/BUILD.bazel
@@ -123,6 +123,7 @@ cc_library(
             ":cuda",
             "//tsl/platform:dso_loader",
             "//tsl/platform:env",
+            "@com_google_absl//absl/container:flat_hash_set",
             "@local_config_cuda//cuda:cuda_headers",
         ],
         "//conditions:default": [],
diff --git a/third_party/xla/third_party/tsl/tsl/cuda/cudart_stub.cc b/third_party/xla/third_party/tsl/tsl/cuda/cudart_stub.cc
index cfae868dc667fc..a3797b5c751cd8 100644
--- a/third_party/xla/third_party/tsl/tsl/cuda/cudart_stub.cc
+++ b/third_party/xla/third_party/tsl/tsl/cuda/cudart_stub.cc
@@ -16,6 +16,9 @@ limitations under the License.
 // This file wraps cuda runtime calls with dso loader so that we don't need to
 // have explicit linking to libcuda.
 
+#include <string_view>
+
+#include "absl/container/flat_hash_set.h"
 #include "third_party/gpus/cuda/include/cuda_runtime_api.h"
 #include "tsl/platform/dso_loader.h"
 #include "tsl/platform/env.h"
@@ -47,11 +50,23 @@ const char *kSymbols[] = {
 
 constexpr size_t kNumSymbols = sizeof(kSymbols) / sizeof(const char *);
 
+absl::flat_hash_set<std::string_view> const &ErrorStringSymbols() {
+  static auto *syms = new absl::flat_hash_set<std::string_view>{
+      "cudaGetErrorName",
+      "cudaGetErrorString",
+  };
+  return *syms;
+}
+
 }  // namespace
 
 extern "C" {
 
-static cudaError_t CudartGetSymbolNotFoundError() {
+static const char *ReturnStringError() {
+  return "Error loading CUDA libraries. GPU will not be used.";
+}
+
+static cudaError_t GetSymbolNotFoundError() {
   return cudaErrorSharedObjectSymbolNotFound;
 }
 
@@ -62,7 +77,12 @@ void _cudart_tramp_resolve(int i) {
   CHECK_LT(i, kNumSymbols);
   void *p = LoadSymbol(kSymbols[i]);
   if (!p) {
-    p = reinterpret_cast<void *>(&CudartGetSymbolNotFoundError);
+    const auto &error_string_symbols = ErrorStringSymbols();
+    if (error_string_symbols.find(kSymbols[i]) != error_string_symbols.end()) {
+      p = reinterpret_cast<void *>(&ReturnStringError);
+    } else {
+      p = reinterpret_cast<void *>(&GetSymbolNotFoundError);
+    }
   }
   _cudart_tramp_table[i] = p;
 }

From 257a679f7c80046ce6398413d2db767b3e15aa08 Mon Sep 17 00:00:00 2001
From: Emilio Cota <ecg@google.com>
Date: Mon, 25 Sep 2023 13:33:11 -0700
Subject: [PATCH 235/567] [xla:gpu] xfeed: add missing absl/synchronization dep

PiperOrigin-RevId: 568310784
---
 third_party/xla/xla/service/gpu/BUILD         | 1 +
 third_party/xla/xla/service/gpu/xfeed_queue.h | 1 +
 2 files changed, 2 insertions(+)

diff --git a/third_party/xla/xla/service/gpu/BUILD b/third_party/xla/xla/service/gpu/BUILD
index 9d988927395b17..3f0ca3b23d173c 100644
--- a/third_party/xla/xla/service/gpu/BUILD
+++ b/third_party/xla/xla/service/gpu/BUILD
@@ -3048,6 +3048,7 @@ cc_library(
     visibility = ["//visibility:public"],
     deps = [
         "@com_google_absl//absl/base:core_headers",
+        "@com_google_absl//absl/synchronization",
         "@local_tsl//tsl/platform:logging",
     ],
 )
diff --git a/third_party/xla/xla/service/gpu/xfeed_queue.h b/third_party/xla/xla/service/gpu/xfeed_queue.h
index e787ee1aa0b3dc..09fb778d90eb07 100644
--- a/third_party/xla/xla/service/gpu/xfeed_queue.h
+++ b/third_party/xla/xla/service/gpu/xfeed_queue.h
@@ -22,6 +22,7 @@ limitations under the License.
 #include <vector>
 
 #include "absl/base/thread_annotations.h"
+#include "absl/synchronization/mutex.h"
 #include "tsl/platform/logging.h"
 
 namespace xla {

From f1b7a6a604dc847ddf6edb90f25c8a42f2434c58 Mon Sep 17 00:00:00 2001
From: Luke Boyer <lukeboyer@google.com>
Date: Mon, 25 Sep 2023 13:37:11 -0700
Subject: [PATCH 236/567] Legalize EmptyTensorList as custom TensorListReserve
 with length 0.

PiperOrigin-RevId: 568311796
---
 .../mlir/lite/tests/legalize-tensorlist.mlir  | 10 ++++++
 .../lite/transforms/legalize_tensorlist.cc    | 12 +++++--
 .../lite/transforms/legalize_tensorlist.td    |  8 +++++
 .../kernels/variants/py/end_to_end_test.py    | 36 +++++++++++++++++++
 4 files changed, 64 insertions(+), 2 deletions(-)

diff --git a/tensorflow/compiler/mlir/lite/tests/legalize-tensorlist.mlir b/tensorflow/compiler/mlir/lite/tests/legalize-tensorlist.mlir
index f97bab00c89503..9a8434489387fd 100644
--- a/tensorflow/compiler/mlir/lite/tests/legalize-tensorlist.mlir
+++ b/tensorflow/compiler/mlir/lite/tests/legalize-tensorlist.mlir
@@ -100,3 +100,13 @@ func.func @notAllOpsSupportedNotLegalized(%arg0: tensor<!tf_type.variant<tensor<
   // CHECK-MSG: Tried legalizing to tfl custom tensorlist ops, but not all can be supported.
   func.return %1 : tensor<i32>
 }
+
+// -----
+
+// CHECK-LABEL: listEmptyToListReserve
+func.func @listEmptyToListReserve(%arg0: tensor<?xi32>, %arg1: tensor<i32>) -> tensor<!tf_type.variant<tensor<*xi64>>> {
+  %0 = "tf.EmptyTensorList"(%arg0, %arg1) : (tensor<?xi32>, tensor<i32>) -> tensor<!tf_type.variant<tensor<*xi64>>>
+  // CHECK: %cst = arith.constant dense<0> : tensor<i32>
+  // CHECK: %0 = "tfl.custom"(%arg0, %cst) {custom_code = "TensorListReserve", custom_option = #tfl<const_bytes : "0x04">} : (tensor<?xi32>, tensor<i32>) -> tensor<!tf_type.variant<tensor<*xi64>>>
+  func.return %0 : tensor<!tf_type.variant<tensor<*xi64>>>
+}
diff --git a/tensorflow/compiler/mlir/lite/transforms/legalize_tensorlist.cc b/tensorflow/compiler/mlir/lite/transforms/legalize_tensorlist.cc
index 820021284b833b..68ca0e53192a43 100644
--- a/tensorflow/compiler/mlir/lite/transforms/legalize_tensorlist.cc
+++ b/tensorflow/compiler/mlir/lite/transforms/legalize_tensorlist.cc
@@ -81,9 +81,14 @@ std::optional<mlir::TFL::ConstBytesAttr> CustomOptions(
     mlir::MLIRContext* context, mlir::Operation* op) {
   if (auto reserve =
           llvm::dyn_cast_or_null<mlir::TF::TensorListReserveOp>(op)) {
-    mlir::Type element_type = reserve.getElementDtype();
     tflite::TensorType tflite_type =
-        tflite::ConvertTypeToTensorType(element_type);
+        tflite::ConvertTypeToTensorType(reserve.getElementDtype());
+
+    return CreateListReserveOptions(context, tflite_type);
+  }
+  if (auto empty = llvm::dyn_cast_or_null<mlir::TF::EmptyTensorListOp>(op)) {
+    tflite::TensorType tflite_type =
+        tflite::ConvertTypeToTensorType(empty.getElementDtype());
 
     return CreateListReserveOptions(context, tflite_type);
   }
@@ -120,6 +125,9 @@ bool IsOpSupported(mlir::Operation* op) {
   if (auto get_item = llvm::dyn_cast_or_null<TF::TensorListGetItemOp>(op)) {
     element_type = get_item.getElementDtype();
   }
+  if (auto empty = llvm::dyn_cast_or_null<TF::EmptyTensorListOp>(op)) {
+    element_type = empty.getElementDtype();
+  }
 
   if (!element_type.has_value()) return false;
   // TODO(b/288302706) add support for all types handled in the
diff --git a/tensorflow/compiler/mlir/lite/transforms/legalize_tensorlist.td b/tensorflow/compiler/mlir/lite/transforms/legalize_tensorlist.td
index be13cd7c4ef326..5f8757fea2c844 100644
--- a/tensorflow/compiler/mlir/lite/transforms/legalize_tensorlist.td
+++ b/tensorflow/compiler/mlir/lite/transforms/legalize_tensorlist.td
@@ -17,6 +17,10 @@ include "tensorflow/compiler/mlir/lite/ir/tfl_ops.td"
 include "tensorflow/compiler/mlir/lite/ir/tfl_op_enums.td"
 include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.td"
 include "mlir/IR/OpBase.td"
+include "mlir/Dialect/Arith/IR/ArithOps.td"
+
+def ConstDenseElementsI32ZeroAttr
+  : NativeCodeCall<"$_builder.create<TFL::ConstOp>($_loc, DenseElementsAttr::get(RankedTensorType::get({}, $_builder.getI32Type()), {0}))">;
 
 def Size2InputRange : NativeCodeCall<
   "SmallVector<Value, 2>{$0, $1}">;
@@ -52,3 +56,7 @@ def LegalizeTensorListFromTensor : Pat<(TF_TensorListFromTensorOp $input, $eleme
 def LegalizeTensorListGetItem : Pat<(TF_TensorListGetItemOp $input, $index, $element_shape),
     (TFL_CustomOp (Size3InputRange $input, $index, $element_shape),
     (CreateStringAttr<"\"TensorListGetItem\"">), (EmptyCustomOptions))>;
+
+def LegalizeTensorListEmpty : Pat<(TF_EmptyTensorListOp:$tf_op $element_shape, $unused_max_elements),
+    (TFL_CustomOp (Size2InputRange $element_shape, (ConstDenseElementsI32ZeroAttr)),
+    (CreateStringAttr<"\"TensorListReserve\"">), (CustomOptions $tf_op))>;
diff --git a/tensorflow/lite/kernels/variants/py/end_to_end_test.py b/tensorflow/lite/kernels/variants/py/end_to_end_test.py
index 115e799cd5b7a0..dff0d397b3c69d 100644
--- a/tensorflow/lite/kernels/variants/py/end_to_end_test.py
+++ b/tensorflow/lite/kernels/variants/py/end_to_end_test.py
@@ -237,6 +237,42 @@ def reserve_stack(stack_element_shape) -> tf.Tensor:
     self.assertEqual(tf_out.shape, output_tensor.shape)
     self.assertTrue((tf_out == output_tensor).numpy().all())
 
+  def test_empty_tensorlist_set_stack(self):
+    @tf.function(
+        input_signature=[
+            tf.TensorSpec(shape=tf.TensorShape(None), dtype=tf.int32)
+        ]
+    )
+    def empty_tensorlist_set_stack(x) -> tf.Tensor:
+      l = list_ops.empty_tensor_list(tf.TensorShape(None), tf.int32)
+      l2 = list_ops.tensor_list_set_item(l, 0, x, True)
+
+      return list_ops.tensor_list_stack(l2, tf.int32)
+
+    interpreter = self._get_interpreter_from_c_func(empty_tensorlist_set_stack)
+
+    input_index = interpreter.get_input_details()[0]["index"]
+
+    interpreter.resize_tensor_input(input_index, [2, 2])
+
+    interpreter.allocate_tensors()
+
+    input_tensor = np.ndarray(shape=[2, 2], dtype=np.int32)
+    input_tensor.fill(0)
+    interpreter.set_tensor(input_index, input_tensor)
+
+    interpreter.invoke()
+
+    output_tensor = interpreter.get_tensor(
+        interpreter.get_output_details()[0]["index"]
+    )
+
+    tf_out = empty_tensorlist_set_stack(input_tensor)
+
+    self.assertEqual(tf_out.dtype, output_tensor.dtype)
+    self.assertEqual(tf_out.shape, output_tensor.shape)
+    self.assertTrue((tf_out == output_tensor).numpy().all())
+
 
 if __name__ == "__main__":
   googletest.main()

From 3bc2b252352279036c5957b44156d9366fda0474 Mon Sep 17 00:00:00 2001
From: Eugene Zhulenev <ezhulenev@google.com>
Date: Mon, 25 Sep 2023 13:40:07 -0700
Subject: [PATCH 237/567] [stream_executor] Add GPU graph handle to
 GpuCommandBuffer

https://github.com/openxla/xla/issues/5857

PiperOrigin-RevId: 568312557
---
 .../xla/stream_executor/cuda/cuda_driver.cc   |  2 +-
 .../stream_executor/cuda/cuda_gpu_executor.cc |  8 ++-
 third_party/xla/xla/stream_executor/gpu/BUILD |  3 +
 .../stream_executor/gpu/gpu_command_buffer.cc | 57 +++++++++++++++++++
 .../stream_executor/gpu/gpu_command_buffer.h  | 32 ++++++++++-
 .../xla/stream_executor/platform/initialize.h |  4 +-
 6 files changed, 100 insertions(+), 6 deletions(-)

diff --git a/third_party/xla/xla/stream_executor/cuda/cuda_driver.cc b/third_party/xla/xla/stream_executor/cuda/cuda_driver.cc
index bf180cb86486d1..4daea1ebc57fb2 100644
--- a/third_party/xla/xla/stream_executor/cuda/cuda_driver.cc
+++ b/third_party/xla/xla/stream_executor/cuda/cuda_driver.cc
@@ -662,7 +662,7 @@ GpuDriver::GraphNodeGetType(CUgraphNode node) {
 /* static */ tsl::Status GpuDriver::DestroyGraphExec(CUgraphExec exec) {
   VLOG(2) << "Destroying CUDA executable graph " << exec;
   RETURN_IF_CUDA_RES_ERROR(cuGraphExecDestroy(exec),
-                           "Failed to destroy CUDA graph");
+                           "Failed to destroy CUDA executable graph");
   return ::tsl::OkStatus();
 }
 
diff --git a/third_party/xla/xla/stream_executor/cuda/cuda_gpu_executor.cc b/third_party/xla/xla/stream_executor/cuda/cuda_gpu_executor.cc
index 0d882033ee56e5..483f9a973083d1 100644
--- a/third_party/xla/xla/stream_executor/cuda/cuda_gpu_executor.cc
+++ b/third_party/xla/xla/stream_executor/cuda/cuda_gpu_executor.cc
@@ -40,9 +40,11 @@ limitations under the License.
 #include "xla/stream_executor/cuda/cuda_driver.h"
 #include "xla/stream_executor/cuda/cuda_platform_id.h"
 #include "xla/stream_executor/gpu/gpu_command_buffer.h"
+#include "xla/stream_executor/gpu/gpu_driver.h"
 #include "xla/stream_executor/gpu/gpu_event.h"
 #include "xla/stream_executor/gpu/gpu_kernel.h"
 #include "xla/stream_executor/gpu/gpu_timer.h"
+#include "xla/stream_executor/gpu/gpu_types.h"
 #include "xla/stream_executor/kernel_cache_config.h"
 #include "xla/stream_executor/platform.h"
 #include "xla/stream_executor/platform/initialize.h"
@@ -844,8 +846,10 @@ GpuExecutor::GetStreamImplementation() {
 
 tsl::StatusOr<std::unique_ptr<internal::CommandBufferInterface>>
 GpuExecutor::GetCommandBufferImplementation() {
-  return std::unique_ptr<internal::CommandBufferInterface>(
-      new GpuCommandBuffer());
+  VLOG(2) << "Create CUDA command buffer (CUDA graph)";
+  GpuGraphHandle graph = nullptr;
+  TF_RETURN_IF_ERROR(GpuDriver::CreateGraph(&graph));
+  return std::make_unique<GpuCommandBuffer>(graph);
 }
 
 void* GpuExecutor::GpuContextHack() { return context_; }
diff --git a/third_party/xla/xla/stream_executor/gpu/BUILD b/third_party/xla/xla/stream_executor/gpu/BUILD
index aec49df203b217..87a41605b5046d 100644
--- a/third_party/xla/xla/stream_executor/gpu/BUILD
+++ b/third_party/xla/xla/stream_executor/gpu/BUILD
@@ -91,7 +91,10 @@ cc_library(
         ":gpu_driver_header",
         ":gpu_executor_header",
         ":gpu_stream",
+        ":gpu_types_header",
         "//xla/stream_executor:stream_executor_headers",
+        "@com_google_absl//absl/log",
+        "@com_google_absl//absl/log:check",
         "@local_tsl//tsl/platform:status",
         "@local_tsl//tsl/platform:statusor",
     ],
diff --git a/third_party/xla/xla/stream_executor/gpu/gpu_command_buffer.cc b/third_party/xla/xla/stream_executor/gpu/gpu_command_buffer.cc
index 3068e37710a618..9e18de6611234e 100644
--- a/third_party/xla/xla/stream_executor/gpu/gpu_command_buffer.cc
+++ b/third_party/xla/xla/stream_executor/gpu/gpu_command_buffer.cc
@@ -14,3 +14,60 @@ limitations under the License.
 ==============================================================================*/
 
 #include "xla/stream_executor/gpu/gpu_command_buffer.h"
+
+#include <atomic>
+#include <cstdint>
+
+#include "absl/log/check.h"
+#include "absl/log/log.h"
+#include "xla/stream_executor/gpu/gpu_driver.h"
+#include "xla/stream_executor/gpu/gpu_types.h"
+
+namespace stream_executor::gpu {
+
+//===----------------------------------------------------------------------===//
+// GpuCommandBuffer resource usage tracking
+//===----------------------------------------------------------------------===//
+
+static std::atomic<int64_t> allocated_execs(0);
+static std::atomic<int64_t> alive_execs(0);
+
+static int64_t NotifyExecCreated() {
+  alive_execs.fetch_add(1, std::memory_order_relaxed);
+  return allocated_execs.fetch_add(1, std::memory_order_relaxed);
+}
+
+static int64_t NotifyExecDestroyed() {
+  DCHECK_GE(alive_execs.load(std::memory_order_relaxed), 1);
+  return alive_execs.fetch_sub(1, std::memory_order_relaxed) - 1;
+}
+
+/*static*/ int64_t GpuCommandBuffer::AllocatedExecs() {
+  return allocated_execs.load(std::memory_order_relaxed);
+}
+
+/*static*/ int64_t GpuCommandBuffer::AliveExecs() {
+  return alive_execs.load(std::memory_order_relaxed);
+}
+
+//===----------------------------------------------------------------------===//
+// GpuCommandBuffer implementation
+//===----------------------------------------------------------------------===//
+
+GpuCommandBuffer::GpuCommandBuffer(GpuGraphHandle graph)
+    : graph_(graph), exec_(nullptr) {}
+
+GpuCommandBuffer::~GpuCommandBuffer() {
+  if (exec_ != nullptr) {
+    VLOG(5) << "Destroy GPU command buffer graph exec. "
+            << "Remaining alive instances: " << NotifyExecDestroyed();
+    auto st = GpuDriver::DestroyGraphExec(exec_);
+    CHECK(st.ok()) << "Failed to destroy GPU graph exec: " << st.message();
+  }
+  if (graph_ != nullptr) {
+    auto st = GpuDriver::DestroyGraph(graph_);
+    CHECK(st.ok()) << "Failed to destroy GPU graph: " << st.message();
+  }
+}
+
+}  // namespace stream_executor::gpu
diff --git a/third_party/xla/xla/stream_executor/gpu/gpu_command_buffer.h b/third_party/xla/xla/stream_executor/gpu/gpu_command_buffer.h
index 4669dd1abc823b..7ce16607b7a60d 100644
--- a/third_party/xla/xla/stream_executor/gpu/gpu_command_buffer.h
+++ b/third_party/xla/xla/stream_executor/gpu/gpu_command_buffer.h
@@ -16,13 +16,43 @@ limitations under the License.
 #ifndef XLA_STREAM_EXECUTOR_GPU_GPU_COMMAND_BUFFER_H_
 #define XLA_STREAM_EXECUTOR_GPU_GPU_COMMAND_BUFFER_H_
 
+#include <cstdint>
+#include <type_traits>
+
+#include "xla/stream_executor/gpu/gpu_types.h"
 #include "xla/stream_executor/stream_executor_internal.h"
 
 namespace stream_executor::gpu {
 
 // GpuCommandBuffer provides platform-specific CommandBufferInterface
 // implementation (it's backed by CUDA or HIP graphs on NVIDIA and AMD devices).
-class GpuCommandBuffer : public internal::CommandBufferInterface {};
+class GpuCommandBuffer : public internal::CommandBufferInterface {
+ public:
+  explicit GpuCommandBuffer(GpuGraphHandle graph);
+  ~GpuCommandBuffer() override;
+
+  // We track the total number of allocated and alive executable graphs in the
+  // process to track the command buffers resource usage. Executable graph
+  // allocates resources on a GPU device (rule of thumb is ~8kb per node), so
+  // we have to be careful not to keep too many of them alive for too long, or
+  // we have a higher risk of OOM errors.
+  //
+  // TODO(ezhulenev): We need to have a policy for how to evict unused
+  // executable graph instances from a device, currently lifetime of an
+  // executable graph is tied to a parent command buffer, and we can have
+  // thousands of command buffers alive at the same time.
+  static int64_t AllocatedExecs();
+  static int64_t AliveExecs();
+
+ private:
+  static_assert(std::is_pointer_v<GpuGraphHandle>,
+                "GpuGraphHandle must be a pointer");
+  static_assert(std::is_pointer_v<GpuGraphExecHandle>,
+                "GpuGraphExecHandle must be a pointer");
+
+  GpuGraphHandle graph_ = nullptr;     // owned handle
+  GpuGraphExecHandle exec_ = nullptr;  // owned handle
+};
 
 }  // namespace stream_executor::gpu
 
diff --git a/third_party/xla/xla/stream_executor/platform/initialize.h b/third_party/xla/xla/stream_executor/platform/initialize.h
index f0d43b82b23a97..d56bda8383ae2b 100644
--- a/third_party/xla/xla/stream_executor/platform/initialize.h
+++ b/third_party/xla/xla/stream_executor/platform/initialize.h
@@ -19,9 +19,9 @@ limitations under the License.
 #include "xla/stream_executor/platform/platform.h"
 
 #if defined(PLATFORM_GOOGLE) || defined(PLATFORM_CHROMIUMOS)
-#include "xla/stream_executor/platform/google/initialize.h"
+#include "xla/stream_executor/platform/google/initialize.h"  // IWYU pragma: export
 #else
-#include "xla/stream_executor/platform/default/initialize.h"
+#include "xla/stream_executor/platform/default/initialize.h"  // IWYU pragma: export
 #endif
 
 #endif  // XLA_STREAM_EXECUTOR_PLATFORM_INITIALIZE_H_

From bbda501caf7e83e7d390c372647ec23864a9c65c Mon Sep 17 00:00:00 2001
From: Feng Wang <wffw@google.com>
Date: Mon, 25 Sep 2023 14:26:26 -0700
Subject: [PATCH 238/567] Add new context type for serving

PiperOrigin-RevId: 568324654
---
 .../xla/third_party/tsl/tsl/profiler/lib/context_types.cc       | 2 ++
 .../xla/third_party/tsl/tsl/profiler/lib/context_types.h        | 1 +
 2 files changed, 3 insertions(+)

diff --git a/third_party/xla/third_party/tsl/tsl/profiler/lib/context_types.cc b/third_party/xla/third_party/tsl/tsl/profiler/lib/context_types.cc
index f5394ef32016bb..9379885c4de76b 100644
--- a/third_party/xla/third_party/tsl/tsl/profiler/lib/context_types.cc
+++ b/third_party/xla/third_party/tsl/tsl/profiler/lib/context_types.cc
@@ -44,6 +44,8 @@ const char* GetContextTypeString(ContextType context_type) {
       return "tpu_stream";
     case ContextType::kTpuLaunch:
       return "tpu_launch";
+    case ContextType::kPathwaysExecutor:
+      return "pathways_exec";
   }
 }
 
diff --git a/third_party/xla/third_party/tsl/tsl/profiler/lib/context_types.h b/third_party/xla/third_party/tsl/tsl/profiler/lib/context_types.h
index dc4a0ebe1f5a65..6f65454354a1dc 100644
--- a/third_party/xla/third_party/tsl/tsl/profiler/lib/context_types.h
+++ b/third_party/xla/third_party/tsl/tsl/profiler/lib/context_types.h
@@ -35,6 +35,7 @@ enum class ContextType : int {
   kBatcher,
   kTpuStream,
   kTpuLaunch,
+  kPathwaysExecutor,
   kLastContextType = ContextType::kTpuLaunch,
 };
 

From f21782ea1d8b5bb25ce2a5146950677aff650160 Mon Sep 17 00:00:00 2001
From: Peter Hawkins <phawkins@google.com>
Date: Mon, 25 Sep 2023 15:46:42 -0700
Subject: [PATCH 239/567] [tsl] Add the option to obtain NCCL via a stub,
 rather than linking it statically or dynamically.

NCCL via a stub is enabled if the environment variable TF_NCCL_USE_STUB=1 is set.

The intent is to use this option in JAX to reduce the size of the CUDA wheels. NCCL takes up about 80MB in each compiled JAX wheel and takes a significant amount of time to build. It is possible other users of TSL may wish to do this also.

PiperOrigin-RevId: 568344701
---
 third_party/gpus/cuda/cuda_config.h.tpl       |  1 +
 third_party/nccl/archive.BUILD                | 23 +++++
 third_party/nccl/nccl_configure.bzl           | 28 +++++-
 .../third_party/gpus/cuda/cuda_config.h.tpl   |  1 +
 .../xla/third_party/nccl/archive.BUILD        | 23 +++++
 .../xla/third_party/nccl/nccl_configure.bzl   | 28 +++++-
 .../third_party/gpus/cuda/cuda_config.h.tpl   |  1 +
 .../tsl/third_party/nccl/archive.BUILD        | 23 +++++
 .../tsl/third_party/nccl/nccl_configure.bzl   | 28 +++++-
 .../xla/third_party/tsl/tsl/cuda/BUILD.bazel  | 26 ++++++
 .../xla/third_party/tsl/tsl/cuda/nccl.symbols | 54 +++++++++++
 .../xla/third_party/tsl/tsl/cuda/nccl_stub.cc | 93 +++++++++++++++++++
 .../tsl/tsl/platform/default/dso_loader.cc    |  5 +
 .../tsl/tsl/platform/default/dso_loader.h     |  1 +
 third_party/xla/xla/python/BUILD              |  1 +
 15 files changed, 333 insertions(+), 3 deletions(-)
 create mode 100644 third_party/xla/third_party/tsl/tsl/cuda/nccl.symbols
 create mode 100644 third_party/xla/third_party/tsl/tsl/cuda/nccl_stub.cc

diff --git a/third_party/gpus/cuda/cuda_config.h.tpl b/third_party/gpus/cuda/cuda_config.h.tpl
index 03ecd0159f496a..cd2850db146685 100644
--- a/third_party/gpus/cuda/cuda_config.h.tpl
+++ b/third_party/gpus/cuda/cuda_config.h.tpl
@@ -25,6 +25,7 @@ limitations under the License.
 #define TF_CUFFT_VERSION "%{cufft_version}"
 #define TF_CUSPARSE_VERSION "%{cusparse_version}"
 #define TF_CUDNN_VERSION "%{cudnn_version}"
+#define TF_NCCL_VERSION "%{nccl_version}"
 
 #define TF_CUDA_TOOLKIT_PATH "%{cuda_toolkit_path}"
 
diff --git a/third_party/nccl/archive.BUILD b/third_party/nccl/archive.BUILD
index a608faf8a262ef..0a639a560976ae 100644
--- a/third_party/nccl/archive.BUILD
+++ b/third_party/nccl/archive.BUILD
@@ -86,6 +86,29 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "nccl_via_stub",
+    hdrs = ["src/nccl.h"],
+    include_prefix = "third_party/nccl",
+    strip_include_prefix = "src",
+    visibility = ["//visibility:public"],
+    deps = [
+        "@local_config_cuda//cuda:cuda_headers",
+        "@local_tsl//tsl/cuda:nccl_stub",
+    ],
+)
+
+cc_library(
+    name = "nccl_headers",
+    hdrs = ["src/nccl.h"],
+    include_prefix = "third_party/nccl",
+    strip_include_prefix = "src",
+    visibility = ["//visibility:public"],
+    deps = [
+        "@local_config_cuda//cuda:cuda_headers",
+    ],
+)
+
 cc_library(
     name = "nccl",
     srcs = glob(
diff --git a/third_party/nccl/nccl_configure.bzl b/third_party/nccl/nccl_configure.bzl
index 10cc2729ee92e8..acdeac3b9fd647 100644
--- a/third_party/nccl/nccl_configure.bzl
+++ b/third_party/nccl/nccl_configure.bzl
@@ -9,6 +9,8 @@
   * `TF_CUDA_PATHS`: The base paths to look for CUDA and cuDNN. Default is
     `/usr/local/cuda,usr/`.
   * `TF_CUDA_CLANG`: "1" if using Clang, "0" if using NVCC.
+  * `TF_NCCL_USE_STUB`: "1" if a NCCL stub that loads NCCL dynamically should
+    be used, "0" if NCCL should be linked in statically.
 
 """
 
@@ -32,6 +34,7 @@ _TF_NCCL_VERSION = "TF_NCCL_VERSION"
 _TF_NEED_CUDA = "TF_NEED_CUDA"
 _TF_CUDA_PATHS = "TF_CUDA_PATHS"
 _TF_CUDA_CLANG = "TF_CUDA_CLANG"
+_TF_NCCL_USE_STUB = "TF_NCCL_USE_STUB"
 
 _DEFINE_NCCL_MAJOR = "#define NCCL_MAJOR"
 _DEFINE_NCCL_MINOR = "#define NCCL_MINOR"
@@ -63,6 +66,26 @@ alias(
 )
 """
 
+_NCCL_ARCHIVE_STUB_BUILD_CONTENT = """
+filegroup(
+  name = "LICENSE",
+  data = ["@nccl_archive//:LICENSE.txt"],
+  visibility = ["//visibility:public"],
+)
+
+alias(
+  name = "nccl",
+  actual = "@nccl_archive//:nccl_via_stub",
+  visibility = ["//visibility:public"],
+)
+
+alias(
+  name = "nccl_headers",
+  actual = "@nccl_archive//:nccl_headers",
+  visibility = ["//visibility:public"],
+)
+"""
+
 def _label(file):
     return Label("//third_party/nccl:{}".format(file))
 
@@ -82,7 +105,10 @@ def _create_local_nccl_repository(repository_ctx):
 
     if nccl_version == "":
         # Alias to open source build from @nccl_archive.
-        repository_ctx.file("BUILD", _NCCL_ARCHIVE_BUILD_CONTENT)
+        if get_host_environ(repository_ctx, _TF_NCCL_USE_STUB, "0") == "0":
+            repository_ctx.file("BUILD", _NCCL_ARCHIVE_BUILD_CONTENT)
+        else:
+            repository_ctx.file("BUILD", _NCCL_ARCHIVE_STUB_BUILD_CONTENT)
 
         repository_ctx.template(
             "build_defs.bzl",
diff --git a/third_party/xla/third_party/gpus/cuda/cuda_config.h.tpl b/third_party/xla/third_party/gpus/cuda/cuda_config.h.tpl
index 03ecd0159f496a..cd2850db146685 100644
--- a/third_party/xla/third_party/gpus/cuda/cuda_config.h.tpl
+++ b/third_party/xla/third_party/gpus/cuda/cuda_config.h.tpl
@@ -25,6 +25,7 @@ limitations under the License.
 #define TF_CUFFT_VERSION "%{cufft_version}"
 #define TF_CUSPARSE_VERSION "%{cusparse_version}"
 #define TF_CUDNN_VERSION "%{cudnn_version}"
+#define TF_NCCL_VERSION "%{nccl_version}"
 
 #define TF_CUDA_TOOLKIT_PATH "%{cuda_toolkit_path}"
 
diff --git a/third_party/xla/third_party/nccl/archive.BUILD b/third_party/xla/third_party/nccl/archive.BUILD
index a608faf8a262ef..0a639a560976ae 100644
--- a/third_party/xla/third_party/nccl/archive.BUILD
+++ b/third_party/xla/third_party/nccl/archive.BUILD
@@ -86,6 +86,29 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "nccl_via_stub",
+    hdrs = ["src/nccl.h"],
+    include_prefix = "third_party/nccl",
+    strip_include_prefix = "src",
+    visibility = ["//visibility:public"],
+    deps = [
+        "@local_config_cuda//cuda:cuda_headers",
+        "@local_tsl//tsl/cuda:nccl_stub",
+    ],
+)
+
+cc_library(
+    name = "nccl_headers",
+    hdrs = ["src/nccl.h"],
+    include_prefix = "third_party/nccl",
+    strip_include_prefix = "src",
+    visibility = ["//visibility:public"],
+    deps = [
+        "@local_config_cuda//cuda:cuda_headers",
+    ],
+)
+
 cc_library(
     name = "nccl",
     srcs = glob(
diff --git a/third_party/xla/third_party/nccl/nccl_configure.bzl b/third_party/xla/third_party/nccl/nccl_configure.bzl
index 9653ca49deff4a..7a9a9ac6112461 100644
--- a/third_party/xla/third_party/nccl/nccl_configure.bzl
+++ b/third_party/xla/third_party/nccl/nccl_configure.bzl
@@ -9,6 +9,8 @@
   * `TF_CUDA_PATHS`: The base paths to look for CUDA and cuDNN. Default is
     `/usr/local/cuda,usr/`.
   * `TF_CUDA_CLANG`: "1" if using Clang, "0" if using NVCC.
+  * `TF_NCCL_USE_STUB`: "1" if a NCCL stub that loads NCCL dynamically should
+    be used, "0" if NCCL should be linked in statically.
 
 """
 
@@ -32,6 +34,7 @@ _TF_NCCL_VERSION = "TF_NCCL_VERSION"
 _TF_NEED_CUDA = "TF_NEED_CUDA"
 _TF_CUDA_PATHS = "TF_CUDA_PATHS"
 _TF_CUDA_CLANG = "TF_CUDA_CLANG"
+_TF_NCCL_USE_STUB = "TF_NCCL_USE_STUB"
 
 _DEFINE_NCCL_MAJOR = "#define NCCL_MAJOR"
 _DEFINE_NCCL_MINOR = "#define NCCL_MINOR"
@@ -63,6 +66,26 @@ alias(
 )
 """
 
+_NCCL_ARCHIVE_STUB_BUILD_CONTENT = """
+filegroup(
+  name = "LICENSE",
+  data = ["@nccl_archive//:LICENSE.txt"],
+  visibility = ["//visibility:public"],
+)
+
+alias(
+  name = "nccl",
+  actual = "@nccl_archive//:nccl_via_stub",
+  visibility = ["//visibility:public"],
+)
+
+alias(
+  name = "nccl_headers",
+  actual = "@nccl_archive//:nccl_headers",
+  visibility = ["//visibility:public"],
+)
+"""
+
 def _label(file):
     return Label("//third_party/nccl:{}".format(file))
 
@@ -82,7 +105,10 @@ def _create_local_nccl_repository(repository_ctx):
 
     if nccl_version == "":
         # Alias to open source build from @nccl_archive.
-        repository_ctx.file("BUILD", _NCCL_ARCHIVE_BUILD_CONTENT)
+        if get_host_environ(repository_ctx, _TF_NCCL_USE_STUB, "0") == "0":
+            repository_ctx.file("BUILD", _NCCL_ARCHIVE_BUILD_CONTENT)
+        else:
+            repository_ctx.file("BUILD", _NCCL_ARCHIVE_STUB_BUILD_CONTENT)
 
         repository_ctx.template(
             "build_defs.bzl",
diff --git a/third_party/xla/third_party/tsl/third_party/gpus/cuda/cuda_config.h.tpl b/third_party/xla/third_party/tsl/third_party/gpus/cuda/cuda_config.h.tpl
index 03ecd0159f496a..cd2850db146685 100644
--- a/third_party/xla/third_party/tsl/third_party/gpus/cuda/cuda_config.h.tpl
+++ b/third_party/xla/third_party/tsl/third_party/gpus/cuda/cuda_config.h.tpl
@@ -25,6 +25,7 @@ limitations under the License.
 #define TF_CUFFT_VERSION "%{cufft_version}"
 #define TF_CUSPARSE_VERSION "%{cusparse_version}"
 #define TF_CUDNN_VERSION "%{cudnn_version}"
+#define TF_NCCL_VERSION "%{nccl_version}"
 
 #define TF_CUDA_TOOLKIT_PATH "%{cuda_toolkit_path}"
 
diff --git a/third_party/xla/third_party/tsl/third_party/nccl/archive.BUILD b/third_party/xla/third_party/tsl/third_party/nccl/archive.BUILD
index a608faf8a262ef..0a639a560976ae 100644
--- a/third_party/xla/third_party/tsl/third_party/nccl/archive.BUILD
+++ b/third_party/xla/third_party/tsl/third_party/nccl/archive.BUILD
@@ -86,6 +86,29 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "nccl_via_stub",
+    hdrs = ["src/nccl.h"],
+    include_prefix = "third_party/nccl",
+    strip_include_prefix = "src",
+    visibility = ["//visibility:public"],
+    deps = [
+        "@local_config_cuda//cuda:cuda_headers",
+        "@local_tsl//tsl/cuda:nccl_stub",
+    ],
+)
+
+cc_library(
+    name = "nccl_headers",
+    hdrs = ["src/nccl.h"],
+    include_prefix = "third_party/nccl",
+    strip_include_prefix = "src",
+    visibility = ["//visibility:public"],
+    deps = [
+        "@local_config_cuda//cuda:cuda_headers",
+    ],
+)
+
 cc_library(
     name = "nccl",
     srcs = glob(
diff --git a/third_party/xla/third_party/tsl/third_party/nccl/nccl_configure.bzl b/third_party/xla/third_party/tsl/third_party/nccl/nccl_configure.bzl
index ca5c66ea0322fe..5ba212f002e85e 100644
--- a/third_party/xla/third_party/tsl/third_party/nccl/nccl_configure.bzl
+++ b/third_party/xla/third_party/tsl/third_party/nccl/nccl_configure.bzl
@@ -9,6 +9,8 @@
   * `TF_CUDA_PATHS`: The base paths to look for CUDA and cuDNN. Default is
     `/usr/local/cuda,usr/`.
   * `TF_CUDA_CLANG`: "1" if using Clang, "0" if using NVCC.
+  * `TF_NCCL_USE_STUB`: "1" if a NCCL stub that loads NCCL dynamically should
+    be used, "0" if NCCL should be linked in statically.
 
 """
 
@@ -32,6 +34,7 @@ _TF_NCCL_VERSION = "TF_NCCL_VERSION"
 _TF_NEED_CUDA = "TF_NEED_CUDA"
 _TF_CUDA_PATHS = "TF_CUDA_PATHS"
 _TF_CUDA_CLANG = "TF_CUDA_CLANG"
+_TF_NCCL_USE_STUB = "TF_NCCL_USE_STUB"
 
 _DEFINE_NCCL_MAJOR = "#define NCCL_MAJOR"
 _DEFINE_NCCL_MINOR = "#define NCCL_MINOR"
@@ -63,6 +66,26 @@ alias(
 )
 """
 
+_NCCL_ARCHIVE_STUB_BUILD_CONTENT = """
+filegroup(
+  name = "LICENSE",
+  data = ["@nccl_archive//:LICENSE.txt"],
+  visibility = ["//visibility:public"],
+)
+
+alias(
+  name = "nccl",
+  actual = "@nccl_archive//:nccl_via_stub",
+  visibility = ["//visibility:public"],
+)
+
+alias(
+  name = "nccl_headers",
+  actual = "@nccl_archive//:nccl_headers",
+  visibility = ["//visibility:public"],
+)
+"""
+
 def _label(file):
     return Label("//third_party/nccl:{}".format(file))
 
@@ -82,7 +105,10 @@ def _create_local_nccl_repository(repository_ctx):
 
     if nccl_version == "":
         # Alias to open source build from @nccl_archive.
-        repository_ctx.file("BUILD", _NCCL_ARCHIVE_BUILD_CONTENT)
+        if get_host_environ(repository_ctx, _TF_NCCL_USE_STUB, "0") == "0":
+            repository_ctx.file("BUILD", _NCCL_ARCHIVE_BUILD_CONTENT)
+        else:
+            repository_ctx.file("BUILD", _NCCL_ARCHIVE_STUB_BUILD_CONTENT)
 
         repository_ctx.template(
             "build_defs.bzl",
diff --git a/third_party/xla/third_party/tsl/tsl/cuda/BUILD.bazel b/third_party/xla/third_party/tsl/tsl/cuda/BUILD.bazel
index 10195390362436..47291d50c4e2cd 100644
--- a/third_party/xla/third_party/tsl/tsl/cuda/BUILD.bazel
+++ b/third_party/xla/third_party/tsl/tsl/cuda/BUILD.bazel
@@ -285,3 +285,29 @@ cc_library(
         "//tsl/platform:env",
     ]),
 )
+
+cuda_stub(
+    name = "nccl",
+    srcs = ["nccl.symbols"],
+)
+
+cc_library(
+    name = "nccl_stub",
+    srcs = if_cuda_is_configured([
+        "nccl_stub.cc",
+        "nccl.tramp.S",
+    ]),
+    linkopts = if_cuda_is_configured(cuda_rpath_flags("nvidia/nccl/lib")),
+    local_defines = [
+        "IMPLIB_EXPORT_SHIMS=1",
+    ],
+    textual_hdrs = ["nccl.inc"],
+    visibility = ["//visibility:public"],
+    deps = if_cuda_is_configured([
+        "@com_google_absl//absl/container:flat_hash_set",
+        "@local_config_cuda//cuda:cuda_headers",
+        "@local_config_nccl//:nccl_headers",
+        "//tsl/platform:dso_loader",
+        "//tsl/platform:env",
+    ]),
+)
diff --git a/third_party/xla/third_party/tsl/tsl/cuda/nccl.symbols b/third_party/xla/third_party/tsl/tsl/cuda/nccl.symbols
new file mode 100644
index 00000000000000..0d6552dafe0238
--- /dev/null
+++ b/third_party/xla/third_party/tsl/tsl/cuda/nccl.symbols
@@ -0,0 +1,54 @@
+ncclAllGather
+ncclAllReduce
+ncclBcast
+ncclBroadcast
+ncclCommAbort
+ncclCommCount
+ncclCommCuDevice
+ncclCommDestroy
+ncclCommFinalize
+ncclCommGetAsyncError
+ncclCommInitAll
+ncclCommInitRank
+ncclCommInitRankConfig
+ncclCommSplit
+ncclCommUserRank
+ncclGetErrorString
+ncclGetLastError
+ncclGetUniqueId
+ncclGetVersion
+ncclGroupEnd
+ncclGroupStart
+ncclRecv
+ncclRedOpCreatePreMulSum
+ncclRedOpDestroy
+ncclReduce
+ncclReduceScatter
+ncclSend
+pncclAllGather
+pncclAllReduce
+pncclBcast
+pncclBroadcast
+pncclCommAbort
+pncclCommCount
+pncclCommCuDevice
+pncclCommDestroy
+pncclCommFinalize
+pncclCommGetAsyncError
+pncclCommInitAll
+pncclCommInitRank
+pncclCommInitRankConfig
+pncclCommSplit
+pncclCommUserRank
+pncclGetErrorString
+pncclGetLastError
+pncclGetUniqueId
+pncclGetVersion
+pncclGroupEnd
+pncclGroupStart
+pncclRecv
+pncclRedOpCreatePreMulSum
+pncclRedOpDestroy
+pncclReduce
+pncclReduceScatter
+pncclSend
diff --git a/third_party/xla/third_party/tsl/tsl/cuda/nccl_stub.cc b/third_party/xla/third_party/tsl/tsl/cuda/nccl_stub.cc
new file mode 100644
index 00000000000000..0ebae2f3c2b2eb
--- /dev/null
+++ b/third_party/xla/third_party/tsl/tsl/cuda/nccl_stub.cc
@@ -0,0 +1,93 @@
+/* Copyright 2023 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <string_view>
+
+#include "absl/container/flat_hash_set.h"
+#include "third_party/gpus/cuda/include/cuda.h"
+#include "third_party/nccl/nccl.h"
+#include "tsl/platform/dso_loader.h"
+#include "tsl/platform/env.h"
+
+// Implements the nccl API by forwarding to nccl loaded from a DSO.
+
+namespace {
+// Returns DSO handle or null if loading the DSO fails.
+void* GetDsoHandle() {
+#ifdef PLATFORM_GOOGLE
+  return nullptr;
+#else
+  static auto handle = []() -> void* {
+    auto handle_or = tsl::internal::DsoLoader::GetNcclDsoHandle();
+    if (!handle_or.ok()) return nullptr;
+    return handle_or.value();
+  }();
+  return handle;
+#endif
+}
+
+void* LoadSymbol(const char* symbol_name) {
+  void* symbol = nullptr;
+  if (auto handle = GetDsoHandle()) {
+    tsl::Env::Default()
+        ->GetSymbolFromLibrary(handle, symbol_name, &symbol)
+        .IgnoreError();
+  }
+  return symbol;
+}
+
+const char* kSymbols[] = {
+#include "tsl/cuda/nccl.inc"
+};
+
+constexpr size_t kNumSymbols = sizeof(kSymbols) / sizeof(const char*);
+
+absl::flat_hash_set<std::string_view> const& ErrorStringSymbols() {
+  static auto* syms = new absl::flat_hash_set<std::string_view>{
+      "ncclGetErrorString",
+      "pncclGetErrorString",
+      "ncclGetLastError",
+      "pncclGetLastError",
+  };
+  return *syms;
+}
+
+}  // namespace
+
+extern "C" {
+
+static ncclResult_t GetSymbolNotFoundError() { return ncclSystemError; }
+
+static const char* ReturnErrorString() {
+  return "Unable to load NCCL library. Multi-GPU collectives will not work.";
+}
+
+extern void* _nccl_tramp_table[];
+
+void _nccl_tramp_resolve(int i) {
+  CHECK_LE(0, i);
+  CHECK_LT(i, kNumSymbols);
+  void* p = LoadSymbol(kSymbols[i]);
+  if (!p) {
+    const auto& error_string_syms = ErrorStringSymbols();
+    if (error_string_syms.find(kSymbols[i]) != error_string_syms.end()) {
+      p = reinterpret_cast<void*>(&ReturnErrorString);
+    } else {
+      p = reinterpret_cast<void*>(&GetSymbolNotFoundError);
+    }
+  }
+  _nccl_tramp_table[i] = p;
+}
+
+}  // extern "C"
diff --git a/third_party/xla/third_party/tsl/tsl/platform/default/dso_loader.cc b/third_party/xla/third_party/tsl/tsl/platform/default/dso_loader.cc
index d3552daa298b5f..c75806e902cac3 100644
--- a/third_party/xla/third_party/tsl/tsl/platform/default/dso_loader.cc
+++ b/third_party/xla/third_party/tsl/tsl/platform/default/dso_loader.cc
@@ -44,6 +44,7 @@ string GetCublasVersion() { return TF_CUBLAS_VERSION; }
 string GetCusolverVersion() { return TF_CUSOLVER_VERSION; }
 string GetCufftVersion() { return TF_CUFFT_VERSION; }
 string GetCusparseVersion() { return TF_CUSPARSE_VERSION; }
+string GetNcclVersion() { return TF_NCCL_VERSION; }
 string GetTensorRTVersion() { return TF_TENSORRT_VERSION; }
 
 StatusOr<void*> GetDsoHandle(const string& name, const string& version) {
@@ -119,6 +120,10 @@ StatusOr<void*> GetCudnnDsoHandle() {
   return GetDsoHandle("cudnn", GetCudnnVersion());
 }
 
+StatusOr<void*> GetNcclDsoHandle() {
+  return GetDsoHandle("nccl", GetNcclVersion());
+}
+
 StatusOr<void*> GetNvInferDsoHandle() {
 #if defined(PLATFORM_WINDOWS)
   return GetDsoHandle("nvinfer", "");
diff --git a/third_party/xla/third_party/tsl/tsl/platform/default/dso_loader.h b/third_party/xla/third_party/tsl/tsl/platform/default/dso_loader.h
index db20d493336953..ee5b2b28af3486 100644
--- a/third_party/xla/third_party/tsl/tsl/platform/default/dso_loader.h
+++ b/third_party/xla/third_party/tsl/tsl/platform/default/dso_loader.h
@@ -37,6 +37,7 @@ StatusOr<void*> GetCusolverDsoHandle();
 StatusOr<void*> GetCusparseDsoHandle();
 StatusOr<void*> GetCuptiDsoHandle();
 StatusOr<void*> GetCudnnDsoHandle();
+StatusOr<void*> GetNcclDsoHandle();
 StatusOr<void*> GetNvInferDsoHandle();
 StatusOr<void*> GetNvInferPluginDsoHandle();
 
diff --git a/third_party/xla/xla/python/BUILD b/third_party/xla/xla/python/BUILD
index 44852eec3fb6f1..0b4e59bd199c40 100644
--- a/third_party/xla/xla/python/BUILD
+++ b/third_party/xla/xla/python/BUILD
@@ -1013,6 +1013,7 @@ tsl_pybind_extension(
             "-Wl,-rpath,$$ORIGIN/../nvidia/cufft/lib",
             "-Wl,-rpath,$$ORIGIN/../nvidia/cudnn/lib",
             "-Wl,-rpath,$$ORIGIN/../nvidia/cusolver/lib",
+            "-Wl,-rpath,$$ORIGIN/../nvidia/nccl/lib",
         ],
         "//conditions:default": [],
     }),

From 450dec35448a73b3fcb5d4f82108d5fdcb3f59b4 Mon Sep 17 00:00:00 2001
From: Ziyin Huang <ziyinh@google.com>
Date: Mon, 25 Sep 2023 16:06:27 -0700
Subject: [PATCH 240/567] Internal change, add some checks on the sparseTensor
 format checking.

PiperOrigin-RevId: 568349775
---
 .../core/tpu/kernels/sparse_core_preprocess_ops.cc       | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/tensorflow/core/tpu/kernels/sparse_core_preprocess_ops.cc b/tensorflow/core/tpu/kernels/sparse_core_preprocess_ops.cc
index 38df331827907a..da656554186be5 100644
--- a/tensorflow/core/tpu/kernels/sparse_core_preprocess_ops.cc
+++ b/tensorflow/core/tpu/kernels/sparse_core_preprocess_ops.cc
@@ -122,8 +122,15 @@ Status ComputeRowIdsBeforePadding(const Tensor& indices_or_row_splits,
     // The row ids are just the sample ids which is the first dim of the
     // indices.
     auto indices_matrix = indices_or_row_splits.matrix<int32>();
+    int32 previous_row_id = -1;
     for (int32 i = 0; i < total_id_count; ++i) {
-      *(row_ids_before_padding + i) = indices_matrix(i, 0);
+      int32 current_row_id = indices_matrix(i, 0);
+      if (current_row_id < previous_row_id) {
+        return absl::InvalidArgumentError(
+            "Invalid indices_or_row_splits input, indices of SparseTensor need "
+            "to be sorted in ascending order.");
+      }
+      *(row_ids_before_padding + i) = current_row_id;
     }
   } else if (indices_or_row_splits.dims() == 1 &&
              indices_or_row_splits.NumElements() > 0) {

From 0bd205eeb7db15238aa795243cc07e401e97e71e Mon Sep 17 00:00:00 2001
From: Jared Junyoung Lim <jaredlim@google.com>
Date: Mon, 25 Sep 2023 16:14:51 -0700
Subject: [PATCH 241/567] Split long-running test cases in mul_test into
 subshards. This improves parallelism by evening out the sizes of the
 different shards and thus speeds up this sharded test.

PiperOrigin-RevId: 568351990
---
 tensorflow/lite/kernels/mul_test.cc | 230 +++++++++++++++++++++-------
 1 file changed, 176 insertions(+), 54 deletions(-)

diff --git a/tensorflow/lite/kernels/mul_test.cc b/tensorflow/lite/kernels/mul_test.cc
index eff799518435db..34b484a4ca9c2c 100644
--- a/tensorflow/lite/kernels/mul_test.cc
+++ b/tensorflow/lite/kernels/mul_test.cc
@@ -808,8 +808,8 @@ constexpr int kDim6 = 7;
 
 constexpr int kMaxMulBroadcastDim = 6;
 
-void TestBroadcast(std::vector<int> input1_shape,
-                   std::vector<int> input2_shape) {
+void TestFloatBroadcast(std::vector<int> input1_shape,
+                        std::vector<int> input2_shape) {
   std::array<int, kMaxMulBroadcastDim> input1_dims;
   std::array<int, kMaxMulBroadcastDim> input2_dims;
   std::array<int, kMaxMulBroadcastDim> output_dims;
@@ -974,11 +974,23 @@ void TestIntegerBroadcast(std::vector<int> input1_shape,
   EXPECT_THAT(m.GetOutput(), testing::ContainerEq(output_ref));
 }
 
-TEST(FloatMulOpModel, Float32MultiDimBroadcast) {
+// To improve automatic test sharding (via shard_count in the BUILD file),
+// we need to ensure that each individual test case runs in a reasonable time,
+// otherwise we end up being limited by the performance of the longest shard.
+// Since TestFloat32MultiDimBroadcast has 2^12 iterations, it takes a
+// long time (over 30 seconds) to execute all iterations -- too long for a
+// single shard.  So we split it into a few "subshards" and have a separate
+// TEST macro invocation for each subshard.
+
+void TestFloat32MultiDimBroadcast(int selected_subshard, int subshard_count) {
+  int iteration = 0;
   for (uint32_t bm1 = 0;
        bm1 < (static_cast<uint32_t>(1) << kMaxMulBroadcastDim); bm1++) {
     for (uint32_t bm2 = 0;
          bm2 < (static_cast<uint32_t>(1) << kMaxMulBroadcastDim); bm2++) {
+      if (iteration++ % subshard_count != selected_subshard) {
+        continue;  // This iteration of the loop is not part of this subshard.
+      }
       const bool input1_broadcast_dim1 = bm1 & (static_cast<uint32_t>(1) << 0);
       const bool input1_broadcast_dim2 = bm1 & (static_cast<uint32_t>(1) << 1);
       const bool input1_broadcast_dim3 = bm1 & (static_cast<uint32_t>(1) << 2);
@@ -1016,24 +1028,76 @@ TEST(FloatMulOpModel, Float32MultiDimBroadcast) {
                     input1_full_shape.end(), input1_shape.data());
           std::copy(input2_full_shape.end() - input2_dims,
                     input2_full_shape.end(), input2_shape.data());
-          TestBroadcast(input1_shape, input2_shape);
+          TestFloatBroadcast(input1_shape, input2_shape);
         }
       }
     }
   }
 }
 
+// Should match the number of TEST or TYPED_TEST invocations for each of
+// Float32MultiDimBroadcastSubshard*,
+// IntegerMultiDimBroadcastSubshard*,
+// Int8QuantizedMultiDimBroadcastSubshard*, and
+// Uint8QuantizedMultiDimBroadcastSubshard* below.
+constexpr int kMultiDimBroadcastSubshardCount = 10;
+
+TEST(FloatMulOpModel, Float32MultiDimBroadcastSubshard0) {
+  TestFloat32MultiDimBroadcast(0, kMultiDimBroadcastSubshardCount);
+}
+TEST(FloatMulOpModel, Float32MultiDimBroadcastSubshard1) {
+  TestFloat32MultiDimBroadcast(1, kMultiDimBroadcastSubshardCount);
+}
+TEST(FloatMulOpModel, Float32MultiDimBroadcastSubshard2) {
+  TestFloat32MultiDimBroadcast(2, kMultiDimBroadcastSubshardCount);
+}
+TEST(FloatMulOpModel, Float32MultiDimBroadcastSubshard3) {
+  TestFloat32MultiDimBroadcast(3, kMultiDimBroadcastSubshardCount);
+}
+TEST(FloatMulOpModel, Float32MultiDimBroadcastSubshard4) {
+  TestFloat32MultiDimBroadcast(4, kMultiDimBroadcastSubshardCount);
+}
+TEST(FloatMulOpModel, Float32MultiDimBroadcastSubshard5) {
+  TestFloat32MultiDimBroadcast(5, kMultiDimBroadcastSubshardCount);
+}
+TEST(FloatMulOpModel, Float32MultiDimBroadcastSubshard6) {
+  TestFloat32MultiDimBroadcast(6, kMultiDimBroadcastSubshardCount);
+}
+TEST(FloatMulOpModel, Float32MultiDimBroadcastSubshard7) {
+  TestFloat32MultiDimBroadcast(7, kMultiDimBroadcastSubshardCount);
+}
+TEST(FloatMulOpModel, Float32MultiDimBroadcastSubshard8) {
+  TestFloat32MultiDimBroadcast(8, kMultiDimBroadcastSubshardCount);
+}
+TEST(FloatMulOpModel, Float32MultiDimBroadcastSubshard9) {
+  TestFloat32MultiDimBroadcast(9, kMultiDimBroadcastSubshardCount);
+}
+
 template <typename T>
 class IntegerMulOpTest : public ::testing::Test {};
 
 using Int16OrInt32Or64Types = ::testing::Types<int16_t, int32_t, int64_t>;
 TYPED_TEST_SUITE(IntegerMulOpTest, Int16OrInt32Or64Types);
 
-TYPED_TEST(IntegerMulOpTest, IntegerMultiDimBroadcast) {
+// To improve automatic test sharding (via shard_count in the BUILD file),
+// we need to ensure that each individual test case runs in a reasonable time,
+// otherwise we end up being limited by the performance of the longest shard.
+// Since TestIntegerMultiDimBroadcast has 2^12 iterations, it takes a
+// long time (over 30 seconds) to execute all iterations -- too long for a
+// single shard.  So we split it into a few "subshards" and have a separate
+// TYPED_TEST macro invocation for each subshard.
+
+template <class TypeParam>
+void TestIntegerMultiDimBroadcast(int selected_subshard, int subshard_count) {
+  ASSERT_LT(selected_subshard, subshard_count);
+  int iteration = 0;
   for (uint32_t bm1 = 0;
        bm1 < (static_cast<uint32_t>(1) << kMaxMulBroadcastDim); bm1++) {
     for (uint32_t bm2 = 0;
          bm2 < (static_cast<uint32_t>(1) << kMaxMulBroadcastDim); bm2++) {
+      if (iteration++ % subshard_count != selected_subshard) {
+        continue;  // This iteration of the loop is not part of this subshard.
+      }
       const bool input1_broadcast_dim1 = bm1 & (static_cast<uint32_t>(1) << 0);
       const bool input1_broadcast_dim2 = bm1 & (static_cast<uint32_t>(1) << 1);
       const bool input1_broadcast_dim3 = bm1 & (static_cast<uint32_t>(1) << 2);
@@ -1078,6 +1142,37 @@ TYPED_TEST(IntegerMulOpTest, IntegerMultiDimBroadcast) {
   }
 }
 
+TYPED_TEST(IntegerMulOpTest, IntegerMultiDimBroadcastSubshard0) {
+  TestIntegerMultiDimBroadcast<TypeParam>(0, kMultiDimBroadcastSubshardCount);
+}
+TYPED_TEST(IntegerMulOpTest, IntegerMultiDimBroadcastSubshard1) {
+  TestIntegerMultiDimBroadcast<TypeParam>(1, kMultiDimBroadcastSubshardCount);
+}
+TYPED_TEST(IntegerMulOpTest, IntegerMultiDimBroadcastSubshard2) {
+  TestIntegerMultiDimBroadcast<TypeParam>(2, kMultiDimBroadcastSubshardCount);
+}
+TYPED_TEST(IntegerMulOpTest, IntegerMultiDimBroadcastSubshard3) {
+  TestIntegerMultiDimBroadcast<TypeParam>(3, kMultiDimBroadcastSubshardCount);
+}
+TYPED_TEST(IntegerMulOpTest, IntegerMultiDimBroadcastSubshard4) {
+  TestIntegerMultiDimBroadcast<TypeParam>(4, kMultiDimBroadcastSubshardCount);
+}
+TYPED_TEST(IntegerMulOpTest, IntegerMultiDimBroadcastSubshard5) {
+  TestIntegerMultiDimBroadcast<TypeParam>(5, kMultiDimBroadcastSubshardCount);
+}
+TYPED_TEST(IntegerMulOpTest, IntegerMultiDimBroadcastSubshard6) {
+  TestIntegerMultiDimBroadcast<TypeParam>(6, kMultiDimBroadcastSubshardCount);
+}
+TYPED_TEST(IntegerMulOpTest, IntegerMultiDimBroadcastSubshard7) {
+  TestIntegerMultiDimBroadcast<TypeParam>(7, kMultiDimBroadcastSubshardCount);
+}
+TYPED_TEST(IntegerMulOpTest, IntegerMultiDimBroadcastSubshard8) {
+  TestIntegerMultiDimBroadcast<TypeParam>(8, kMultiDimBroadcastSubshardCount);
+}
+TYPED_TEST(IntegerMulOpTest, IntegerMultiDimBroadcastSubshard9) {
+  TestIntegerMultiDimBroadcast<TypeParam>(9, kMultiDimBroadcastSubshardCount);
+}
+
 template <typename QuantizedType>
 void TestQuantizedBroadcast(std::vector<int> input1_shape,
                             std::vector<int> input2_shape) {
@@ -1185,11 +1280,25 @@ void TestQuantizedBroadcast(std::vector<int> input1_shape,
   }
 }
 
-TEST(QuantizedMulOpModel, Int8QuantizedMultiDimBroadcast) {
+// To improve automatic test sharding (via shard_count in the BUILD file),
+// we need to ensure that each individual test case runs in a reasonable time,
+// otherwise we end up being limited by the performance of the longest shard.
+// Since TestQuantizedMultiDimBroadcast has 2^12 iterations, it takes a
+// long time (over 30 seconds) to execute all iterations -- too long for a
+// single shard.  So we split it into a few "subshards" and have a separate
+// TEST macro invocation for each subshard.
+
+template <class T>
+void TestQuantizedMultiDimBroadcast(int selected_subshard, int subshard_count) {
+  ASSERT_LT(selected_subshard, subshard_count);
+  int iteration = 0;
   for (uint32_t bm1 = 0;
        bm1 < (static_cast<uint32_t>(1) << kMaxMulBroadcastDim); bm1++) {
     for (uint32_t bm2 = 0;
          bm2 < (static_cast<int32_t>(1) << kMaxMulBroadcastDim); bm2++) {
+      if (iteration++ % subshard_count != selected_subshard) {
+        continue;  // This iteration of the loop is not part of this subshard.
+      }
       const bool input1_broadcast_dim1 = bm1 & (static_cast<uint32_t>(1) << 0);
       const bool input1_broadcast_dim2 = bm1 & (static_cast<uint32_t>(1) << 1);
       const bool input1_broadcast_dim3 = bm1 & (static_cast<uint32_t>(1) << 2);
@@ -1227,60 +1336,73 @@ TEST(QuantizedMulOpModel, Int8QuantizedMultiDimBroadcast) {
                     input1_full_shape.end(), input1_shape.data());
           std::copy(input2_full_shape.end() - input2_dims,
                     input2_full_shape.end(), input2_shape.data());
-          TestQuantizedBroadcast<int8_t>(input1_shape, input2_shape);
+          TestQuantizedBroadcast<T>(input1_shape, input2_shape);
         }
       }
     }
   }
 }
 
-TEST(QuantizedMulOpModel, Uint8QuantizedMultiDimBroadcast) {
-  for (uint32_t bm1 = 0;
-       bm1 < (static_cast<uint32_t>(1) << kMaxMulBroadcastDim); bm1++) {
-    for (uint32_t bm2 = 0;
-         bm2 < (static_cast<uint32_t>(1) << kMaxMulBroadcastDim); bm2++) {
-      const bool input1_broadcast_dim1 = bm1 & (static_cast<uint32_t>(1) << 0);
-      const bool input1_broadcast_dim2 = bm1 & (static_cast<uint32_t>(1) << 1);
-      const bool input1_broadcast_dim3 = bm1 & (static_cast<uint32_t>(1) << 2);
-      const bool input1_broadcast_dim4 = bm1 & (static_cast<uint32_t>(1) << 3);
-      const bool input1_broadcast_dim5 = bm1 & (static_cast<uint32_t>(1) << 4);
-      const bool input1_broadcast_dim6 = bm1 & (static_cast<uint32_t>(1) << 5);
-      const bool input2_broadcast_dim1 = bm2 & (static_cast<uint32_t>(1) << 0);
-      const bool input2_broadcast_dim2 = bm2 & (static_cast<uint32_t>(1) << 1);
-      const bool input2_broadcast_dim3 = bm2 & (static_cast<uint32_t>(1) << 2);
-      const bool input2_broadcast_dim4 = bm2 & (static_cast<uint32_t>(1) << 3);
-      const bool input2_broadcast_dim5 = bm2 & (static_cast<uint32_t>(1) << 4);
-      const bool input2_broadcast_dim6 = bm2 & (static_cast<uint32_t>(1) << 5);
-      const int input1_dim1 = input1_broadcast_dim1 ? 1 : kDim1;
-      const int input1_dim2 = input1_broadcast_dim2 ? 1 : kDim2;
-      const int input1_dim3 = input1_broadcast_dim3 ? 1 : kDim3;
-      const int input1_dim4 = input1_broadcast_dim4 ? 1 : kDim4;
-      const int input1_dim5 = input1_broadcast_dim5 ? 1 : kDim5;
-      const int input1_dim6 = input1_broadcast_dim6 ? 1 : kDim6;
-      const int input2_dim1 = input2_broadcast_dim1 ? 1 : kDim1;
-      const int input2_dim2 = input2_broadcast_dim2 ? 1 : kDim2;
-      const int input2_dim3 = input2_broadcast_dim3 ? 1 : kDim3;
-      const int input2_dim4 = input2_broadcast_dim4 ? 1 : kDim4;
-      const int input2_dim5 = input2_broadcast_dim5 ? 1 : kDim5;
-      const int input2_dim6 = input2_broadcast_dim6 ? 1 : kDim6;
-      std::vector<int> input1_full_shape{input1_dim1, input1_dim2, input1_dim3,
-                                         input1_dim4, input1_dim5, input1_dim6};
-      std::vector<int> input2_full_shape{input2_dim1, input2_dim2, input2_dim3,
-                                         input2_dim4, input2_dim5, input2_dim6};
-      for (int input1_dims = 1; input1_dims <= kMaxMulBroadcastDim;
-           ++input1_dims) {
-        for (int input2_dims = 1; input2_dims <= kMaxMulBroadcastDim;
-             ++input2_dims) {
-          std::vector<int> input1_shape(input1_dims), input2_shape(input2_dims);
-          std::copy(input1_full_shape.end() - input1_dims,
-                    input1_full_shape.end(), input1_shape.data());
-          std::copy(input2_full_shape.end() - input2_dims,
-                    input2_full_shape.end(), input2_shape.data());
-          TestQuantizedBroadcast<uint8_t>(input1_shape, input2_shape);
-        }
-      }
-    }
-  }
+TEST(QuantizedMulOpModel, Int8QuantizedMultiDimBroadcastSubshard0) {
+  TestQuantizedMultiDimBroadcast<int8_t>(0, kMultiDimBroadcastSubshardCount);
+}
+TEST(QuantizedMulOpModel, Int8QuantizedMultiDimBroadcastSubshard1) {
+  TestQuantizedMultiDimBroadcast<int8_t>(1, kMultiDimBroadcastSubshardCount);
+}
+TEST(QuantizedMulOpModel, Int8QuantizedMultiDimBroadcastSubshard2) {
+  TestQuantizedMultiDimBroadcast<int8_t>(2, kMultiDimBroadcastSubshardCount);
+}
+TEST(QuantizedMulOpModel, Int8QuantizedMultiDimBroadcastSubshard3) {
+  TestQuantizedMultiDimBroadcast<int8_t>(3, kMultiDimBroadcastSubshardCount);
+}
+TEST(QuantizedMulOpModel, Int8QuantizedMultiDimBroadcastSubshard4) {
+  TestQuantizedMultiDimBroadcast<int8_t>(4, kMultiDimBroadcastSubshardCount);
+}
+TEST(QuantizedMulOpModel, Int8QuantizedMultiDimBroadcastSubshard5) {
+  TestQuantizedMultiDimBroadcast<int8_t>(5, kMultiDimBroadcastSubshardCount);
+}
+TEST(QuantizedMulOpModel, Int8QuantizedMultiDimBroadcastSubshard6) {
+  TestQuantizedMultiDimBroadcast<int8_t>(6, kMultiDimBroadcastSubshardCount);
+}
+TEST(QuantizedMulOpModel, Int8QuantizedMultiDimBroadcastSubshard7) {
+  TestQuantizedMultiDimBroadcast<int8_t>(7, kMultiDimBroadcastSubshardCount);
+}
+TEST(QuantizedMulOpModel, Int8QuantizedMultiDimBroadcastSubshard8) {
+  TestQuantizedMultiDimBroadcast<int8_t>(8, kMultiDimBroadcastSubshardCount);
+}
+TEST(QuantizedMulOpModel, Int8QuantizedMultiDimBroadcastSubshard9) {
+  TestQuantizedMultiDimBroadcast<int8_t>(9, kMultiDimBroadcastSubshardCount);
+}
+
+TEST(QuantizedMulOpModel, Uint8QuantizedMultiDimBroadcastSubshard0) {
+  TestQuantizedMultiDimBroadcast<uint8_t>(0, kMultiDimBroadcastSubshardCount);
+}
+TEST(QuantizedMulOpModel, Uint8QuantizedMultiDimBroadcastSubshard1) {
+  TestQuantizedMultiDimBroadcast<uint8_t>(1, kMultiDimBroadcastSubshardCount);
+}
+TEST(QuantizedMulOpModel, Uint8QuantizedMultiDimBroadcastSubshard2) {
+  TestQuantizedMultiDimBroadcast<uint8_t>(2, kMultiDimBroadcastSubshardCount);
+}
+TEST(QuantizedMulOpModel, Uint8QuantizedMultiDimBroadcastSubshard3) {
+  TestQuantizedMultiDimBroadcast<uint8_t>(3, kMultiDimBroadcastSubshardCount);
+}
+TEST(QuantizedMulOpModel, Uint8QuantizedMultiDimBroadcastSubshard4) {
+  TestQuantizedMultiDimBroadcast<uint8_t>(4, kMultiDimBroadcastSubshardCount);
+}
+TEST(QuantizedMulOpModel, Uint8QuantizedMultiDimBroadcastSubshard5) {
+  TestQuantizedMultiDimBroadcast<uint8_t>(5, kMultiDimBroadcastSubshardCount);
+}
+TEST(QuantizedMulOpModel, Uint8QuantizedMultiDimBroadcastSubshard6) {
+  TestQuantizedMultiDimBroadcast<uint8_t>(6, kMultiDimBroadcastSubshardCount);
+}
+TEST(QuantizedMulOpModel, Uint8QuantizedMultiDimBroadcastSubshard7) {
+  TestQuantizedMultiDimBroadcast<uint8_t>(7, kMultiDimBroadcastSubshardCount);
+}
+TEST(QuantizedMulOpModel, Uint8QuantizedMultiDimBroadcastSubshard8) {
+  TestQuantizedMultiDimBroadcast<uint8_t>(8, kMultiDimBroadcastSubshardCount);
+}
+TEST(QuantizedMulOpModel, Uint8QuantizedMultiDimBroadcastSubshard9) {
+  TestQuantizedMultiDimBroadcast<uint8_t>(9, kMultiDimBroadcastSubshardCount);
 }
 
 }  // namespace

From 9668ea0809b56288400d9dadaae57678043046b3 Mon Sep 17 00:00:00 2001
From: Yishuang Pang <ypang@google.com>
Date: Mon, 25 Sep 2023 16:17:09 -0700
Subject: [PATCH 242/567] Legalize some MHLO broadcasted select op to TF
 SelectV2 op directly. tf.BroadcastTo op folder folds constants by default,
 this would increase the size of models converted from StableHLO because
 StableHLO requires explicit broadcasting. This change helps reduce model size
 by removing unnecessary broadcasts at legalization stage.

PiperOrigin-RevId: 568352559
---
 .../lite/stablehlo/tests/legalize_hlo.mlir    | 26 +++++++++++++++++++
 .../transforms/legalize_hlo_patterns.td       | 17 ++++++++++++
 2 files changed, 43 insertions(+)

diff --git a/tensorflow/compiler/mlir/lite/stablehlo/tests/legalize_hlo.mlir b/tensorflow/compiler/mlir/lite/stablehlo/tests/legalize_hlo.mlir
index e02ce38908d9ff..2df637ef62f263 100644
--- a/tensorflow/compiler/mlir/lite/stablehlo/tests/legalize_hlo.mlir
+++ b/tensorflow/compiler/mlir/lite/stablehlo/tests/legalize_hlo.mlir
@@ -986,6 +986,32 @@ func.func @selectv2_pred_scalar(%arg0: tensor<i1>, %arg1: tensor<2xi32>, %arg2:
   func.return %0 : tensor<2xi32>
 }
 
+// CHECK-LABEL:   func @selectv2_broadcasted_operand(
+// CHECK-SAME:                               %[[VAL_0:.*]]: tensor<i1>,
+// CHECK-SAME:                               %[[VAL_1:.*]]: tensor<1x1xi32>,
+// CHECK-SAME:                               %[[VAL_2:.*]]: tensor<1x100xi32>) -> tensor<1x100xi32> {
+// CHECK:           %[[VAL_3:.*]] = "tf.SelectV2"(%[[VAL_0]], %[[VAL_1]], %[[VAL_2]]) : (tensor<i1>, tensor<1x1xi32>, tensor<1x100xi32>) -> tensor<1x100xi32>
+// CHECK:           return %[[VAL_3]] : tensor<1x100xi32>
+// CHECK:         }
+func.func @selectv2_broadcasted_operand(%arg0: tensor<i1>, %arg1: tensor<1x1xi32>, %arg2: tensor<1x100xi32>) -> tensor<1x100xi32> {
+  %0 = "mhlo.broadcast_in_dim"(%arg1) {broadcast_dimensions = dense<[0, 1]> : tensor<2xi64>} : (tensor<1x1xi32>) -> tensor<1x100xi32>
+  %1 = "mhlo.select"(%arg0, %0, %arg2) : (tensor<i1>, tensor<1x100xi32>, tensor<1x100xi32>) -> tensor<1x100xi32>
+  func.return %1 : tensor<1x100xi32>
+}
+
+// CHECK-LABEL:   func @selectv2_broadcasted_condition(
+// CHECK-SAME:                               %[[VAL_0:.*]]: tensor<1x1xi1>,
+// CHECK-SAME:                               %[[VAL_1:.*]]: tensor<1x100xi32>,
+// CHECK-SAME:                               %[[VAL_2:.*]]: tensor<1x100xi32>) -> tensor<1x100xi32> {
+// CHECK:           %[[VAL_3:.*]] = "tf.SelectV2"(%[[VAL_0]], %[[VAL_1]], %[[VAL_2]]) : (tensor<1x1xi1>, tensor<1x100xi32>, tensor<1x100xi32>) -> tensor<1x100xi32>
+// CHECK:           return %[[VAL_3]] : tensor<1x100xi32>
+// CHECK:         }
+func.func @selectv2_broadcasted_condition(%arg0: tensor<1x1xi1>, %arg1: tensor<1x100xi32>, %arg2: tensor<1x100xi32>) -> tensor<1x100xi32> {
+  %0 = "mhlo.broadcast_in_dim"(%arg0) {broadcast_dimensions = dense<[0, 1]> : tensor<2xi64>} : (tensor<1x1xi1>) -> tensor<1x100xi1>
+  %1 = "mhlo.select"(%0, %arg1, %arg2) : (tensor<1x100xi1>, tensor<1x100xi32>, tensor<1x100xi32>) -> tensor<1x100xi32>
+  func.return %1 : tensor<1x100xi32>
+}
+
 // CHECK-LABEL:   func @transpose_2d(
 // CHECK-SAME:                       %[[VAL_0:.*]]: tensor<2x3xf32>) -> tensor<3x2xf32> {
 // CHECK-DAG:       %[[VAL_1:.*]] = "tf.Const"() {value = dense<[1, 0]> : tensor<2xi64>} : () -> tensor<2xi64>
diff --git a/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_patterns.td b/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_patterns.td
index 9cf9aa518849f5..307f3515c68aff 100644
--- a/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_patterns.td
+++ b/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_patterns.td
@@ -225,6 +225,23 @@ def : Pat<(MHLO_RoundNearestEvenOp $input), (TF_RoundOp $input)>;
 def : Pat<(MHLO_ClampOp $min, $arg, $max),
           (TF_MaximumOp (TF_MinimumOp $arg, $max), $min)>;
 def : Pat<(MHLO_SelectOp $cond, $t, $e), (TF_SelectOp $cond, $t, $e)>;
+def : Pat<(MHLO_SelectOp (MHLO_BroadcastInDimOp:$output
+                                  $bcast_cond,
+                                  $broadcast_dimensions), $t, $e),
+          (TF_SelectV2Op $bcast_cond, $t, $e),
+          [(IsTFStyleBroadcast $broadcast_dimensions, $output)]>;
+def : Pat<(MHLO_SelectOp $cond, $t,
+                                (MHLO_BroadcastInDimOp:$output
+                                  $bcast_operand,
+                                  $broadcast_dimensions)),
+          (TF_SelectV2Op $cond, $t, $bcast_operand),
+          [(IsTFStyleBroadcast $broadcast_dimensions, $output)]>;
+def : Pat<(MHLO_SelectOp $cond, (MHLO_BroadcastInDimOp:$output
+                                  $bcast_operand,
+                                  $broadcast_dimensions),
+                                $e),
+          (TF_SelectV2Op $cond, $bcast_operand, $e),
+          [(IsTFStyleBroadcast $broadcast_dimensions, $output)]>;
 
 //===----------------------------------------------------------------------===//
 // Variadic op patterns.

From 55b7276ba525f322d7bd7773f61f0938a13292d1 Mon Sep 17 00:00:00 2001
From: Matt Callanan <mpcallanan@google.com>
Date: Mon, 25 Sep 2023 16:21:43 -0700
Subject: [PATCH 243/567] #tf-data-service Scale up `"no_compression"`
 experiment to 50%.

PiperOrigin-RevId: 568353683
---
 tensorflow/core/data/dataset_utils.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/core/data/dataset_utils.cc b/tensorflow/core/data/dataset_utils.cc
index 7f0eb78fea8b4c..b03527344e3d18 100644
--- a/tensorflow/core/data/dataset_utils.cc
+++ b/tensorflow/core/data/dataset_utils.cc
@@ -988,7 +988,7 @@ REGISTER_DATASET_EXPERIMENT("file_locality", RandomJobSamplePercentage<0>,
                             IndependentHostTasks);
 REGISTER_DATASET_EXPERIMENT("file_locality_v2", RandomJobSamplePercentage<0>,
                             AllTasks);
-REGISTER_DATASET_EXPERIMENT("no_compression", RandomJobSamplePercentage<1>,
+REGISTER_DATASET_EXPERIMENT("no_compression", RandomJobSamplePercentage<50>,
                             AllTasks);
 REGISTER_DATASET_EXPERIMENT("inject_io_prefetch", RandomJobSamplePercentage<50>,
                             AllTasks);

From 882e787f3d989df283811fd8309177601eccd6e9 Mon Sep 17 00:00:00 2001
From: Jieying Luo <jieying@google.com>
Date: Mon, 25 Sep 2023 17:24:28 -0700
Subject: [PATCH 244/567] [PJRT C API] Split callback and py_client_gpu out
 from py_client.

This is to prepare separating GPU into a plugin. The plugin will use py_client_gpu to register custom callback for `xla_python_gpu_callback`.

PiperOrigin-RevId: 568368076
---
 third_party/xla/xla/python/BUILD            | 76 ++++++++++++++++++---
 third_party/xla/xla/python/callback.cc      | 29 ++++----
 third_party/xla/xla/python/callback.h       |  5 +-
 third_party/xla/xla/python/py_client.cc     |  1 +
 third_party/xla/xla/python/py_client_gpu.cc |  1 +
 5 files changed, 88 insertions(+), 24 deletions(-)

diff --git a/third_party/xla/xla/python/BUILD b/third_party/xla/xla/python/BUILD
index 0b4e59bd199c40..7d9b296bd27a9e 100644
--- a/third_party/xla/xla/python/BUILD
+++ b/third_party/xla/xla/python/BUILD
@@ -187,7 +187,6 @@ py_strict_test(
 cc_library(
     name = "exceptions",
     hdrs = ["exceptions.h"],
-    compatible_with = [],
     copts = [
         "-fexceptions",
         "-fno-strict-aliasing",
@@ -314,7 +313,6 @@ cc_library(
 cc_library(
     name = "py_client",
     srcs = [
-        "callback.cc",
         "py_array.cc",
         "py_buffer.cc",
         "py_client.cc",
@@ -324,11 +322,8 @@ cc_library(
         "py_host_callback.cc",
         "py_values.cc",
         "sharding.cc",
-    ] + if_cuda_or_rocm([
-        "py_client_gpu.cc",
-    ]),
+    ],
     hdrs = [
-        "callback.h",
         "py_array.h",
         "py_buffer.h",
         "py_client.h",
@@ -339,9 +334,7 @@ cc_library(
         "py_values.h",
         "sharded_device_array.h",
         "sharding.h",
-    ] + if_cuda_or_rocm([
-        "py_client_gpu.h",
-    ]),
+    ],
     compatible_with = [],
     copts = [
         "-fexceptions",
@@ -353,8 +346,10 @@ cc_library(
     features = ["-use_header_modules"],
     visibility = ["//visibility:public"],
     deps = [
+        ":callback",
         ":exceptions",
         ":pprof_profile_builder",
+        ":py_client_gpu",
         ":py_host_callback_proto_cc",
         ":python_ref_manager",
         ":python_utils",
@@ -368,7 +363,9 @@ cc_library(
         "@com_google_absl//absl/base",
         "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/hash",
+        "@com_google_absl//absl/status",
         "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/strings:str_format",
         "@com_google_absl//absl/synchronization",
         "@com_google_absl//absl/types:span",
         "@com_google_absl//absl/types:variant",
@@ -411,6 +408,67 @@ cc_library(
     ]),
 )
 
+cc_library(
+    name = "callback",
+    srcs = [
+        "callback.cc",
+    ],
+    hdrs = [
+        "callback.h",
+    ],
+    copts = [
+        "-fexceptions",
+        "-fno-strict-aliasing",
+    ],
+    features = ["-use_header_modules"],
+    visibility = ["//visibility:public"],
+    deps = [
+        ":python_ref_manager",
+        "//xla:comparison_util",
+        "//xla:xla_data_proto_cc",
+        "//xla/pjrt:transpose",
+        "//xla/service:custom_call_status",
+        "@com_google_absl//absl/status",
+        "@com_google_absl//absl/strings:str_format",
+        "@com_google_absl//absl/types:span",
+        "@local_tsl//tsl/platform:statusor",
+        "@pybind11",
+    ],
+)
+
+cc_library(
+    name = "py_client_gpu",
+    srcs = if_cuda_or_rocm([
+        "py_client_gpu.cc",
+    ]),
+    hdrs = if_cuda_or_rocm([
+        "py_client_gpu.h",
+    ]),
+    copts = [
+        "-fexceptions",
+        "-fno-strict-aliasing",
+    ],
+    defines = if_cuda(["GOOGLE_CUDA=1"]) + if_rocm([
+        "TENSORFLOW_USE_ROCM=1",
+    ]),
+    features = ["-use_header_modules"],
+    visibility = ["//visibility:public"],
+    deps = [
+        ":callback",
+        ":exceptions",
+        "//xla:comparison_util",
+        "//xla/service:custom_call_status",
+        "@com_google_absl//absl/base",
+        "@com_google_absl//absl/strings",
+        "@local_tsl//tsl/platform:errors",
+        "@pybind11",
+    ] + if_cuda([
+        "@local_config_cuda//cuda:cuda_headers",
+    ]) + if_rocm([
+        "@local_config_rocm//rocm:rocm_headers",
+    ]),
+)
+
 cc_library(
     name = "dlpack",
     srcs = ["dlpack.cc"],
diff --git a/third_party/xla/xla/python/callback.cc b/third_party/xla/xla/python/callback.cc
index 99ad6962549859..fbba7e68412997 100644
--- a/third_party/xla/xla/python/callback.cc
+++ b/third_party/xla/xla/python/callback.cc
@@ -23,11 +23,14 @@ limitations under the License.
 #include <string>
 #include <utility>
 
+#include "absl/status/status.h"
+#include "absl/strings/str_format.h"
 #include "absl/types/span.h"
+#include "pybind11/numpy.h"  // from @pybind11
+#include "pybind11/pytypes.h"  // from @pybind11
 #include "xla/primitive_util.h"
-#include "xla/python/exceptions.h"
 #include "xla/service/custom_call_status.h"
-#include "tsl/profiler/lib/traceme.h"
+#include "tsl/platform/statusor.h"
 
 namespace py = pybind11;
 
@@ -103,16 +106,18 @@ StatusOr<py::tuple> CpuCallback::CallInternal(py::tuple args) {
   } catch (py::error_already_set& e) {
     PyErr_Clear();
     std::string error_message = e.what();
-    return InternalError("CpuCallback error: %s", error_message);
+    return absl::InternalError(
+        absl::StrFormat("CpuCallback error: %s", error_message));
   }
   if (!PyTuple_Check(result_object.ptr())) {
-    return InternalError("CPU callback expected a tuple result, got %s",
-                         static_cast<std::string>(py::repr(result_object)));
+    return absl::InternalError(
+        absl::StrFormat("CPU callback expected a tuple result, got %s",
+                        static_cast<std::string>(py::repr(result_object))));
   }
   if (PyTuple_Size(result_object.ptr()) != results_.size()) {
-    return InternalError(
-        "CPU callback expected a tuple with %d results, got %d",
-        results_.size(), PyTuple_Size(result_object.ptr()));
+    return absl::InternalError(
+        absl::StrFormat("CPU callback expected a tuple with %d results, got %d",
+                        results_.size(), PyTuple_Size(result_object.ptr())));
   }
   py::tuple result_tuple = py::cast<py::tuple>(result_object);
   for (size_t i = 0; i < results_.size(); ++i) {
@@ -120,9 +125,9 @@ StatusOr<py::tuple> CpuCallback::CallInternal(py::tuple args) {
         PyTuple_GetItem(result_tuple.ptr(), i));
     if (results_[i].type == xla::TOKEN) {
       if (!output.is_none()) {
-        return InternalError(
+        return absl::InternalError(absl::StrFormat(
             "Token output from Python callback should be None, got %s",
-            static_cast<std::string>(py::repr(output)));
+            static_cast<std::string>(py::repr(output))));
       }
       continue;
     }
@@ -132,11 +137,11 @@ StatusOr<py::tuple> CpuCallback::CallInternal(py::tuple args) {
     absl::Span<int64_t const> dims(
         reinterpret_cast<const int64_t*>(array.shape()), array.ndim());
     if (dims != results_[i].expected_dims) {
-      return InternalError(
+      return absl::InternalError(absl::StrFormat(
           "Mismatched result shape for %d-th return value from CPU callback; "
           "expected array with dimensions %s, got %s",
           i, absl::StrJoin(results_[i].expected_dims, ","),
-          absl::StrJoin(dims, ","));
+          absl::StrJoin(dims, ",")));
     }
   }
   return result_tuple;
diff --git a/third_party/xla/xla/python/callback.h b/third_party/xla/xla/python/callback.h
index 97e2a0f6633ee9..401554825f0718 100644
--- a/third_party/xla/xla/python/callback.h
+++ b/third_party/xla/xla/python/callback.h
@@ -20,12 +20,11 @@ limitations under the License.
 #include <utility>
 #include <vector>
 
-#include "pybind11/pybind11.h"  // from @pybind11
+#include "pybind11/numpy.h"  // from @pybind11
 #include "xla/pjrt/transpose.h"
-#include "xla/python/py_values.h"
 #include "xla/python/python_ref_manager.h"
 #include "xla/service/custom_call_status.h"
-#include "xla/types.h"
+#include "xla/xla_data.pb.h"
 
 namespace xla {
 
diff --git a/third_party/xla/xla/python/py_client.cc b/third_party/xla/xla/python/py_client.cc
index 8d8f723d6d3cd9..ce6e4dcba1c2e7 100644
--- a/third_party/xla/xla/python/py_client.cc
+++ b/third_party/xla/xla/python/py_client.cc
@@ -39,6 +39,7 @@ limitations under the License.
 #include "xla/python/py_buffer.h"
 #include "xla/python/py_executable.h"
 #include "xla/python/py_host_callback.h"
+#include "xla/python/py_values.h"
 #include "xla/python/python_ref_manager.h"
 #include "xla/python/traceback.h"
 #include "xla/python/transfer_guard_lib.h"
diff --git a/third_party/xla/xla/python/py_client_gpu.cc b/third_party/xla/xla/python/py_client_gpu.cc
index d695098f141ba1..6a5caf680243d0 100644
--- a/third_party/xla/xla/python/py_client_gpu.cc
+++ b/third_party/xla/xla/python/py_client_gpu.cc
@@ -25,6 +25,7 @@ limitations under the License.
 #include "third_party/gpus/cuda/include/cuda_runtime_api.h"
 #endif
 #include "pybind11/pybind11.h"  // from @pybind11
+#include "xla/primitive_util.h"
 #include "xla/python/callback.h"
 #include "xla/python/exceptions.h"
 

From c475f59d1a3c735a94ed8078983fc0119259bef1 Mon Sep 17 00:00:00 2001
From: Shixin Li <shixinli@google.com>
Date: Mon, 25 Sep 2023 18:18:06 -0700
Subject: [PATCH 245/567] Create AotCompileSavedModel function to produce
 AotResult w/o writing to disk.

PiperOrigin-RevId: 568379208
---
 .../mlir/tfrt/tests/saved_model/BUILD         |  1 +
 .../tests/saved_model/saved_model_test.cc     | 35 +++++++
 .../mlir/tfrt/translate/import_model.cc       | 19 ++--
 .../mlir/tfrt/translate/import_model.h        | 22 +++--
 .../core/tfrt/fallback/fallback_state.h       |  4 +
 tensorflow/core/tfrt/saved_model/BUILD        |  3 +
 .../python/saved_model_aot_compile_wrapper.cc |  3 +-
 .../saved_model/saved_model_aot_compile.cc    | 91 ++++++++++++-------
 .../saved_model/saved_model_aot_compile.h     | 24 ++++-
 9 files changed, 151 insertions(+), 51 deletions(-)

diff --git a/tensorflow/compiler/mlir/tfrt/tests/saved_model/BUILD b/tensorflow/compiler/mlir/tfrt/tests/saved_model/BUILD
index 45561d45b3f46d..ae1bfb32a6a09d 100644
--- a/tensorflow/compiler/mlir/tfrt/tests/saved_model/BUILD
+++ b/tensorflow/compiler/mlir/tfrt/tests/saved_model/BUILD
@@ -22,6 +22,7 @@ tf_cc_test(
         "//tensorflow/core:test_main",
         "//tensorflow/core/platform:resource_loader",
         "@com_google_absl//absl/strings",
+        "@com_google_googletest//:gtest_main",
         "@llvm-project//mlir:IR",
         "@llvm-project//mlir:Parser",
     ],
diff --git a/tensorflow/compiler/mlir/tfrt/tests/saved_model/saved_model_test.cc b/tensorflow/compiler/mlir/tfrt/tests/saved_model/saved_model_test.cc
index 4cd8ce4b833ce3..f19e09d7c96d78 100644
--- a/tensorflow/compiler/mlir/tfrt/tests/saved_model/saved_model_test.cc
+++ b/tensorflow/compiler/mlir/tfrt/tests/saved_model/saved_model_test.cc
@@ -19,6 +19,8 @@ limitations under the License.
 #include <utility>
 #include <vector>
 
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
 #include "absl/strings/match.h"
 #include "mlir/IR/Dialect.h"  // from @llvm-project
 #include "mlir/Parser/Parser.h"  // from @llvm-project
@@ -190,6 +192,39 @@ TEST(SavedModelTest, ConvertTfMlirToBefExportingXlaReduceWindow) {
             2);
 }
 
+TEST(SavedModelTest, AddXlaFunctionsOutputFunctionNames) {
+  std::string saved_model_mlir_path = tensorflow::GetDataDependencyFilepath(
+      "tensorflow/compiler/mlir/tfrt/tests/saved_model/testdata/"
+      "xla_launch_xla_reduce_window.mlir");
+
+  mlir::DialectRegistry registry;
+  mlir::RegisterAllTensorFlowDialects(registry);
+  mlir::MLIRContext context(registry);
+  auto module =
+      mlir::parseSourceFile<mlir::ModuleOp>(saved_model_mlir_path, &context);
+  ASSERT_TRUE(module);
+
+  tfrt::BefBuffer bef_buffer;
+  auto runtime =
+      tensorflow::tfrt_stub::Runtime::Create(/*num_inter_op_threads=*/1);
+  tfrt_stub::GraphExecutionOptions options(runtime.get());
+  options.compile_options.device_target = TfrtDeviceInfraTarget::kGpu;
+
+  TF_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<tfrt_stub::FallbackState> fallback_state,
+      tfrt_stub::FallbackState::Create(SessionOptions(), FunctionDefLibrary()));
+
+  tfrt::ResourceContext resource_context;
+  tfrt_stub::ModelRuntimeContext model_context(
+      &options, options.compile_options.saved_model_dir, &resource_context);
+
+  std::vector<std::string> function_names;
+  TF_ASSERT_OK(ConvertTfMlirToBef(options.compile_options, module.get(),
+                                  &bef_buffer, model_context,
+                                  fallback_state.get(), &function_names));
+  EXPECT_THAT(function_names, ::testing::SizeIs(2));
+}
+
 // TODO(b/162442824): Add a SavedModel test that covers the error pass.
 
 }  // namespace
diff --git a/tensorflow/compiler/mlir/tfrt/translate/import_model.cc b/tensorflow/compiler/mlir/tfrt/translate/import_model.cc
index 1d1977ffb74dbc..05837853d2fc4a 100644
--- a/tensorflow/compiler/mlir/tfrt/translate/import_model.cc
+++ b/tensorflow/compiler/mlir/tfrt/translate/import_model.cc
@@ -155,7 +155,8 @@ Status ConvertTfMlirToRuntimeExecutable(
                              const tensorflow::TfrtPipelineOptions& options)>
         emit_executable,
     tfrt_stub::ModelRuntimeContext& model_context,
-    tfrt_stub::FallbackState* fallback_state) {
+    tfrt_stub::FallbackState* fallback_state,
+    std::vector<std::string>* added_xla_function_names) {
   mlir::StatusScopedDiagnosticHandler diag_handler(module.getContext());
 
   {
@@ -224,7 +225,8 @@ Status ConvertTfMlirToRuntimeExecutable(
     // GPU XLA clusters are wrapped in functions, which could be transformed by
     // bridge. Hence, the MLIR functions for XLA clusters are exported and added
     // to the function library.
-    TF_RETURN_IF_ERROR(AddXlaFunctions(fallback_state, module));
+    TF_RETURN_IF_ERROR(
+        AddXlaFunctions(fallback_state, module, added_xla_function_names));
   }
 
   if (VLOG_IS_ON(1)) {
@@ -256,7 +258,8 @@ Status ConvertTfMlirToRuntimeExecutable(
 Status ConvertTfMlirToBef(const TfrtCompileOptions& options,
                           mlir::ModuleOp module, tfrt::BefBuffer* bef_buffer,
                           tfrt_stub::ModelRuntimeContext& model_context,
-                          tfrt_stub::FallbackState* fallback_state) {
+                          tfrt_stub::FallbackState* fallback_state,
+                          std::vector<std::string>* added_xla_function_names) {
   return ConvertTfMlirToRuntimeExecutable(
       options, module,
       [bef_buffer](mlir::PassManager& pm, mlir::ModuleOp module,
@@ -283,7 +286,7 @@ Status ConvertTfMlirToBef(const TfrtCompileOptions& options,
         bef_buffer->shrink_to_fit();
         return OkStatus();
       },
-      model_context, fallback_state);
+      model_context, fallback_state, added_xla_function_names);
 }
 
 std::unique_ptr<tensorflow::TfrtPipelineOptions> GetTfrtPipelineOptions(
@@ -328,13 +331,17 @@ std::unique_ptr<tensorflow::TfrtPipelineOptions> GetTfrtPipelineOptions(
   return pipeline_options;
 }
 
-tensorflow::Status AddXlaFunctions(tfrt_stub::FallbackState* fallback_state,
-                                   mlir::ModuleOp mlir_module) {
+tensorflow::Status AddXlaFunctions(
+    tfrt_stub::FallbackState* fallback_state, mlir::ModuleOp mlir_module,
+    std::vector<std::string>* added_xla_function_names) {
   if (fallback_state != nullptr) {
     TF_ASSIGN_OR_RETURN(const std::vector<FunctionDef> xla_func_defs,
                         ExportXlaFunctions(mlir_module));
     for (const auto& func_def : xla_func_defs) {
       TF_RETURN_IF_ERROR(fallback_state->AddFunctionDef(func_def));
+      if (added_xla_function_names != nullptr) {
+        added_xla_function_names->push_back(func_def.signature().name());
+      }
     }
   }
 
diff --git a/tensorflow/compiler/mlir/tfrt/translate/import_model.h b/tensorflow/compiler/mlir/tfrt/translate/import_model.h
index 467f7ff3a42ae1..09917ea61095fe 100644
--- a/tensorflow/compiler/mlir/tfrt/translate/import_model.h
+++ b/tensorflow/compiler/mlir/tfrt/translate/import_model.h
@@ -17,6 +17,7 @@ limitations under the License.
 #define TENSORFLOW_COMPILER_MLIR_TFRT_TRANSLATE_IMPORT_MODEL_H_
 
 #include <memory>
+#include <string>
 #include <vector>
 
 #include "mlir/IR/BuiltinOps.h"  // from @llvm-project
@@ -50,11 +51,14 @@ Status ConvertFunctionToBef(
 // Converts an MLIR `module` in TF dialect to TFRT's Binary Executable Format.
 // If `fallback_state` is not null, the MLIR functions for XLA clusters in
 // the form of XlaLaunch will be exported and added to the function library when
-// needed. The nested functions will also be exported.
-Status ConvertTfMlirToBef(const TfrtCompileOptions& options,
-                          mlir::ModuleOp module, tfrt::BefBuffer* bef_buffer,
-                          tfrt_stub::ModelRuntimeContext& model_context,
-                          tfrt_stub::FallbackState* fallback_state = nullptr);
+// needed. The nested functions will also be exported. If
+// `added_xla_function_names` is not null, it will be populated with the names
+// of the added XLA functions.
+Status ConvertTfMlirToBef(
+    const TfrtCompileOptions& options, mlir::ModuleOp module,
+    tfrt::BefBuffer* bef_buffer, tfrt_stub::ModelRuntimeContext& model_context,
+    tfrt_stub::FallbackState* fallback_state = nullptr,
+    std::vector<std::string>* added_xla_function_names = nullptr);
 
 Status ConvertTfMlirToRuntimeExecutable(
     const TfrtCompileOptions& options, mlir::ModuleOp module,
@@ -62,14 +66,16 @@ Status ConvertTfMlirToRuntimeExecutable(
                              const tensorflow::TfrtPipelineOptions& options)>
         emit_executable,
     tfrt_stub::ModelRuntimeContext& model_context,
-    tfrt_stub::FallbackState* fallback_state = nullptr);
+    tfrt_stub::FallbackState* fallback_state = nullptr,
+    std::vector<std::string>* added_xla_function_names = nullptr);
 
 std::unique_ptr<tensorflow::TfrtPipelineOptions> GetTfrtPipelineOptions(
     const TfrtCompileOptions& options);
 
 // Adds MLIR functions for XLA clusters to the function library.
-tensorflow::Status AddXlaFunctions(tfrt_stub::FallbackState* fallback_state,
-                                   mlir::ModuleOp mlir_module);
+tensorflow::Status AddXlaFunctions(
+    tfrt_stub::FallbackState* fallback_state, mlir::ModuleOp mlir_module,
+    std::vector<std::string>* added_xla_function_names = nullptr);
 
 }  // namespace tensorflow
 
diff --git a/tensorflow/core/tfrt/fallback/fallback_state.h b/tensorflow/core/tfrt/fallback/fallback_state.h
index 904e04e5245d55..efd96af150fa33 100644
--- a/tensorflow/core/tfrt/fallback/fallback_state.h
+++ b/tensorflow/core/tfrt/fallback/fallback_state.h
@@ -70,6 +70,10 @@ class FallbackState {
     return pflr_;
   }
 
+  const FunctionLibraryDefinition &func_lib_def() const {
+    return func_lib_def_;
+  }
+
  private:
   SessionOptions session_options_;
   StaticDeviceMgr device_manager_;
diff --git a/tensorflow/core/tfrt/saved_model/BUILD b/tensorflow/core/tfrt/saved_model/BUILD
index 5f1d8cc73c1aff..f001b29660fc4d 100644
--- a/tensorflow/core/tfrt/saved_model/BUILD
+++ b/tensorflow/core/tfrt/saved_model/BUILD
@@ -41,7 +41,9 @@ cc_library(
         "//tensorflow/compiler/mlir/tensorflow:translate_lib",
         "//tensorflow/compiler/mlir/tfrt:import_model",
         "//tensorflow/core:core_cpu_base",
+        "//tensorflow/core:framework",
         "//tensorflow/core:lib",
+        "//tensorflow/core/framework:function_proto_cc",
         "//tensorflow/core/ops",
         "//tensorflow/core/platform:enable_tf2_utils",
         "//tensorflow/core/platform:path",
@@ -56,6 +58,7 @@ cc_library(
         "//tensorflow/core/tfrt/utils",
         "@com_google_absl//absl/log",
         "@com_google_absl//absl/status",
+        "@com_google_absl//absl/strings",
         "@local_tsl//tsl/platform:env",
         "@local_tsl//tsl/platform:status",
         "@local_xla//xla/service:compiler",
diff --git a/tensorflow/core/tfrt/saved_model/python/saved_model_aot_compile_wrapper.cc b/tensorflow/core/tfrt/saved_model/python/saved_model_aot_compile_wrapper.cc
index 7c1e31fab55e7e..1dd62575fbfd98 100644
--- a/tensorflow/core/tfrt/saved_model/python/saved_model_aot_compile_wrapper.cc
+++ b/tensorflow/core/tfrt/saved_model/python/saved_model_aot_compile_wrapper.cc
@@ -29,7 +29,8 @@ PYBIND11_MODULE(_pywrap_saved_model_aot_compile, m) {
       .def(py::init<>());
   m.doc() = "pybind11 AotOptions Python - C++ Wrapper";
 
-  m.def("AotCompileSavedModel", &tensorflow::tfrt_stub::AotCompileSavedModel,
+  m.def("AotCompileSavedModel",
+        &tensorflow::tfrt_stub::AotCompileSavedModelAndSaveResult,
         py::arg("input_model_dir") = absl::string_view(),
         py::arg("aot_options") = tensorflow::tfrt_stub::AotOptions(),
         py::arg("output_model_dir") = absl::string_view());
diff --git a/tensorflow/core/tfrt/saved_model/saved_model_aot_compile.cc b/tensorflow/core/tfrt/saved_model/saved_model_aot_compile.cc
index fb3c26e8e16cc0..2bde4e977e5c8d 100644
--- a/tensorflow/core/tfrt/saved_model/saved_model_aot_compile.cc
+++ b/tensorflow/core/tfrt/saved_model/saved_model_aot_compile.cc
@@ -19,14 +19,20 @@ limitations under the License.
 #include <optional>
 #include <string>
 #include <unordered_set>
+#include <utility>
+#include <vector>
 
 #include "absl/log/log.h"
 #include "absl/status/status.h"
+#include "absl/strings/str_cat.h"
+#include "absl/strings/string_view.h"
 #include "tensorflow/cc/saved_model/constants.h"
 #include "tensorflow/compiler/mlir/tensorflow/translate/tf_mlir_translate.h"
 #include "tensorflow/compiler/mlir/tensorflow/utils/serialize_mlir_module_utils.h"
 #include "tensorflow/compiler/mlir/tfrt/translate/import_model.h"
 #include "xla/service/compiler.h"
+#include "tensorflow/core/framework/function.h"
+#include "tensorflow/core/framework/function.pb.h"
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/file_system_helper.h"
 #include "tensorflow/core/platform/path.h"
@@ -43,6 +49,7 @@ limitations under the License.
 #include "tsl/platform/errors.h"
 #include "tsl/platform/file_system_helper.h"
 #include "tsl/platform/status.h"
+#include "tsl/platform/statusor.h"
 #include "tfrt/bef/bef_buffer.h"  // from @tf_runtime
 #include "tfrt/bef_executor/bef_file.h"  // from @tf_runtime
 #include "tfrt/host_context/resource_context.h"  // from @tf_runtime
@@ -64,9 +71,9 @@ void UpdateCompileOptions(AotOptions& options) {
 
 AotOptions::AotOptions() : graph_execution_options(nullptr) {}
 
-Status AotCompileSavedModel(absl::string_view input_model_dir,
-                            AotOptions aot_options,
-                            absl::string_view output_model_dir) {
+Status AotCompileSavedModelAndSaveResult(absl::string_view input_model_dir,
+                                         AotOptions aot_options,
+                                         absl::string_view output_model_dir) {
   // Create aot_packages directory.
   Env* env = Env::Default();
   const bool new_directory = !output_model_dir.empty();
@@ -109,6 +116,42 @@ Status AotCompileSavedModel(absl::string_view input_model_dir,
     aot_options.tags = {"serve", "gpu"};
   }
 
+  TF_ASSIGN_OR_RETURN(AotResult result,
+                      AotCompileSavedModel(input_model_dir, aot_options));
+
+  const std::string warmup_requests_path = io::JoinPath(
+      input_model_dir, "assets.extra", "tf_serving_warmup_requests");
+  TF_RETURN_IF_ERROR(env->FileExists(warmup_requests_path));
+
+  const std::string saved_model_pb_path =
+      io::JoinPath(input_model_dir, kSavedModelFilenamePb);
+  const std::string saved_model_pbtxt_path =
+      io::JoinPath(input_model_dir, kSavedModelFilenamePbTxt);
+  bool pb_found = env->FileExists(saved_model_pb_path).ok();
+  bool pbtxt_found = env->FileExists(saved_model_pbtxt_path).ok();
+  if (!pb_found && !pbtxt_found) {
+    return absl::NotFoundError(absl::StrCat(
+        "saved_model not found in input directory: ", input_model_dir));
+  }
+
+  // Serialize BEF buffer to a file under aot_packages
+  const std::string serialized_bef_path =
+      io::JoinPath(aot_directory, kBefBufferFilenameMLIRBEF);
+  TF_RETURN_IF_ERROR(SerializeBEF(result.bef, serialized_bef_path));
+
+  if (pb_found) {
+    const std::string output_file_directory =
+        io::JoinPath(std::string(output_model_dir), kSavedModelFilenamePb);
+    return env->CopyFile(saved_model_pb_path, output_file_directory);
+  } else {
+    const std::string output_file_directory =
+        io::JoinPath(std::string(output_model_dir), kSavedModelFilenamePbTxt);
+    return env->CopyFile(saved_model_pbtxt_path, output_file_directory);
+  }
+}
+
+StatusOr<AotResult> AotCompileSavedModel(absl::string_view input_model_dir,
+                                         AotOptions aot_options) {
   TF_ASSIGN_OR_RETURN(tensorflow::MetaGraphDef meta_graph_def,
                       ReadSavedModel(input_model_dir, aot_options.tags));
 
@@ -160,43 +203,27 @@ Status AotCompileSavedModel(absl::string_view input_model_dir,
   }
 
   tfrt::BefBuffer bef;
+  std::vector<std::string> xla_function_names;
   RETURN_IF_ERROR_IN_COMPILE(tensorflow::ConvertTfMlirToBef(
       aot_options.graph_execution_options->compile_options, mlir_module.get(),
-      &bef, model_context, fallback_state.get()));
+      &bef, model_context, fallback_state.get(), &xla_function_names));
   if (bef.empty()) {
-    LOG(DFATAL) << "BefBuffer is empty.";
     return absl::InternalError("BefBuffer is empty.");
   }
 
-  const std::string warmup_requests_path = io::JoinPath(
-      input_model_dir, "assets.extra", "tf_serving_warmup_requests");
-  TF_RETURN_IF_ERROR(env->FileExists(warmup_requests_path));
-
-  const std::string saved_model_pb_path =
-      io::JoinPath(input_model_dir, kSavedModelFilenamePb);
-  const std::string saved_model_pbtxt_path =
-      io::JoinPath(input_model_dir, kSavedModelFilenamePbTxt);
-  bool pb_found = env->FileExists(saved_model_pb_path).ok();
-  bool pbtxt_found = env->FileExists(saved_model_pbtxt_path).ok();
-  if (!pb_found && !pbtxt_found) {
-    return absl::NotFoundError(absl::StrCat(
-        "saved_model not found in input directory: ", input_model_dir));
+  const FunctionLibraryDefinition& flib_def = fallback_state->func_lib_def();
+  std::vector<FunctionDef> xla_functions;
+  xla_functions.reserve(xla_function_names.size());
+  for (const std::string& name : xla_function_names) {
+    const FunctionDef* xla_func_def = flib_def.Find(name);
+    if (xla_func_def == nullptr) {
+      return absl::NotFoundError(
+          absl::StrCat("XLA function ", name, " not found in library."));
+    }
+    xla_functions.push_back(*xla_func_def);
   }
 
-  // Serialize BEF buffer to a file under aot_packages
-  const std::string serialized_bef_path =
-      io::JoinPath(aot_directory, kBefBufferFilenameMLIRBEF);
-  TF_RETURN_IF_ERROR(SerializeBEF(bef, serialized_bef_path));
-
-  if (pb_found) {
-    const std::string output_file_directory =
-        io::JoinPath(std::string(output_model_dir), kSavedModelFilenamePb);
-    return env->CopyFile(saved_model_pb_path, output_file_directory);
-  } else {
-    const std::string output_file_directory =
-        io::JoinPath(std::string(output_model_dir), kSavedModelFilenamePbTxt);
-    return env->CopyFile(saved_model_pbtxt_path, output_file_directory);
-  }
+  return AotResult{std::move(bef), std::move(xla_functions)};
 }
 
 // TODO(b/294095043): Create a function (ex Status
diff --git a/tensorflow/core/tfrt/saved_model/saved_model_aot_compile.h b/tensorflow/core/tfrt/saved_model/saved_model_aot_compile.h
index da5e85f2c06f65..d51cda14f302c1 100644
--- a/tensorflow/core/tfrt/saved_model/saved_model_aot_compile.h
+++ b/tensorflow/core/tfrt/saved_model/saved_model_aot_compile.h
@@ -19,9 +19,12 @@ limitations under the License.
 #include <memory>
 #include <string>
 #include <unordered_set>
+#include <vector>
 
 #include "xla/service/compiler.h"
+#include "tensorflow/core/framework/function.pb.h"
 #include "tensorflow/core/tfrt/graph_executor/graph_execution_options.h"
+#include "tfrt/bef/bef_buffer.h"  // from @tf_runtime
 
 namespace tensorflow::tfrt_stub {
 struct AotOptions {
@@ -31,13 +34,26 @@ struct AotOptions {
   std::shared_ptr<GraphExecutionOptions> graph_execution_options;
 };
 
-// AOT Compiles saved_model in input_model_dir, writing output
+struct AotResult {
+  tfrt::BefBuffer bef;
+  // TODO(b/296466237): Investigate whether the whole FunctionDefLibrary should
+  // be put here.
+  // XLA cluster functions generated during bridge and their nested functions.
+  std::vector<FunctionDef> xla_functions;
+};
+
+// AOT compiles saved_model in input_model_dir and returns AotResult, otherwise
+// returns error.
+StatusOr<AotResult> AotCompileSavedModel(absl::string_view input_model_dir,
+                                         AotOptions aot_options = {});
+
+// AOT compiles saved_model in input_model_dir, writing output
 // saved_model and aot packages to output_model_dir, or
 // "{input_model_dir}/aot_packages" if output dir provided. Warmup requests
 // should be present in input_model_dir
-Status AotCompileSavedModel(absl::string_view input_model_dir,
-                            AotOptions aot_options = {},
-                            absl::string_view output_model_dir = "");
+Status AotCompileSavedModelAndSaveResult(
+    absl::string_view input_model_dir, AotOptions aot_options = {},
+    absl::string_view output_model_dir = "");
 
 }  // namespace tensorflow::tfrt_stub
 

From b53353deeecec5c99689c8d63a9e719137612f6a Mon Sep 17 00:00:00 2001
From: Yash Katariya <yashkatariya@google.com>
Date: Mon, 25 Sep 2023 18:33:55 -0700
Subject: [PATCH 246/567] Update the containers to a rebuilt image that
 removes bazel from jax's docker image and uses the bazel preinstalled in
 the sigbuild image

PiperOrigin-RevId: 568381631
---
 tensorflow/tools/toolchains/remote_config/containers.bzl      | 4 ++--
 .../tsl/tools/toolchains/remote_config/containers.bzl         | 4 ++--
 third_party/xla/tools/toolchains/remote_config/containers.bzl | 4 ++--
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/tensorflow/tools/toolchains/remote_config/containers.bzl b/tensorflow/tools/toolchains/remote_config/containers.bzl
index 2819512e4b902c..d2e825200e042d 100644
--- a/tensorflow/tools/toolchains/remote_config/containers.bzl
+++ b/tensorflow/tools/toolchains/remote_config/containers.bzl
@@ -7,9 +7,9 @@ container_digests = {
     # JAX manylinux2014 configs.
     "cuda11.1-cudnn8-ubuntu20.04-manylinux2014-multipython": "sha256:011034978c5f1e5dcecc816b3b964faafc42b243001d9cd09ff7cfe4a6a0f4b9",
     "cuda11.4-cudnn8.2-ubuntu20.04-manylinux2014-multipython": "sha256:d17894a1349a12baea1732cb133f65f08754ed97d0a6647efe23c916a9ab8f1c",
-    "cuda11.8-cudnn8.6-ubuntu20.04-manylinux2014-multipython": "sha256:c973a5dd1b335b83f5cc65ab2d1f12e12c0cc5d310a2d9bf676fcdb52cf08285",
+    "cuda11.8-cudnn8.6-ubuntu20.04-manylinux2014-multipython": "sha256:4d5b528d900d5366800c984a2bf737770b2e4b1da8099de64f0e8c44caa08e0f",
     "cuda12.0.1-cudnn8.8-ubuntu20.04-manylinux2014-multipython": "sha256:2551b1587bdd0b63a4dd329eba6416cd07acb25496dde411c376609ce4f076f0",
-    "cuda12.2-cudnn8.9-ubuntu20.04-manylinux2014-multipython": "sha256:55cf36aa54debd7ec7b3aac5a84af1fe3691a186aceae8d1d8eafe886d6a6950",
+    "cuda12.2-cudnn8.9-ubuntu20.04-manylinux2014-multipython": "sha256:8b55e288ed59e960abd3a3fdcbb19fd7c36183f71766d4733aeb38ab73d1e8d7",
     # ROCM, probably not all of them still in use
     "rocm-ubuntu18.04-manylinux2010-multipython": "sha256:6e953a09b145df338bcb03e9e36f99b291140c29b72d0a048fb6c5905ccad5eb",
     "rocm-ubuntu20.04-manylinux2014-multipython": "sha256:906faec7765fe5dd067f2b092b5d5f220c1fedde725fb42c83d031b4d6f32204",
diff --git a/third_party/xla/third_party/tsl/tools/toolchains/remote_config/containers.bzl b/third_party/xla/third_party/tsl/tools/toolchains/remote_config/containers.bzl
index 2819512e4b902c..d2e825200e042d 100644
--- a/third_party/xla/third_party/tsl/tools/toolchains/remote_config/containers.bzl
+++ b/third_party/xla/third_party/tsl/tools/toolchains/remote_config/containers.bzl
@@ -7,9 +7,9 @@ container_digests = {
     # JAX manylinux2014 configs.
     "cuda11.1-cudnn8-ubuntu20.04-manylinux2014-multipython": "sha256:011034978c5f1e5dcecc816b3b964faafc42b243001d9cd09ff7cfe4a6a0f4b9",
     "cuda11.4-cudnn8.2-ubuntu20.04-manylinux2014-multipython": "sha256:d17894a1349a12baea1732cb133f65f08754ed97d0a6647efe23c916a9ab8f1c",
-    "cuda11.8-cudnn8.6-ubuntu20.04-manylinux2014-multipython": "sha256:c973a5dd1b335b83f5cc65ab2d1f12e12c0cc5d310a2d9bf676fcdb52cf08285",
+    "cuda11.8-cudnn8.6-ubuntu20.04-manylinux2014-multipython": "sha256:4d5b528d900d5366800c984a2bf737770b2e4b1da8099de64f0e8c44caa08e0f",
     "cuda12.0.1-cudnn8.8-ubuntu20.04-manylinux2014-multipython": "sha256:2551b1587bdd0b63a4dd329eba6416cd07acb25496dde411c376609ce4f076f0",
-    "cuda12.2-cudnn8.9-ubuntu20.04-manylinux2014-multipython": "sha256:55cf36aa54debd7ec7b3aac5a84af1fe3691a186aceae8d1d8eafe886d6a6950",
+    "cuda12.2-cudnn8.9-ubuntu20.04-manylinux2014-multipython": "sha256:8b55e288ed59e960abd3a3fdcbb19fd7c36183f71766d4733aeb38ab73d1e8d7",
     # ROCM, probably not all of them still in use
     "rocm-ubuntu18.04-manylinux2010-multipython": "sha256:6e953a09b145df338bcb03e9e36f99b291140c29b72d0a048fb6c5905ccad5eb",
     "rocm-ubuntu20.04-manylinux2014-multipython": "sha256:906faec7765fe5dd067f2b092b5d5f220c1fedde725fb42c83d031b4d6f32204",
diff --git a/third_party/xla/tools/toolchains/remote_config/containers.bzl b/third_party/xla/tools/toolchains/remote_config/containers.bzl
index 2819512e4b902c..d2e825200e042d 100644
--- a/third_party/xla/tools/toolchains/remote_config/containers.bzl
+++ b/third_party/xla/tools/toolchains/remote_config/containers.bzl
@@ -7,9 +7,9 @@ container_digests = {
     # JAX manylinux2014 configs.
     "cuda11.1-cudnn8-ubuntu20.04-manylinux2014-multipython": "sha256:011034978c5f1e5dcecc816b3b964faafc42b243001d9cd09ff7cfe4a6a0f4b9",
     "cuda11.4-cudnn8.2-ubuntu20.04-manylinux2014-multipython": "sha256:d17894a1349a12baea1732cb133f65f08754ed97d0a6647efe23c916a9ab8f1c",
-    "cuda11.8-cudnn8.6-ubuntu20.04-manylinux2014-multipython": "sha256:c973a5dd1b335b83f5cc65ab2d1f12e12c0cc5d310a2d9bf676fcdb52cf08285",
+    "cuda11.8-cudnn8.6-ubuntu20.04-manylinux2014-multipython": "sha256:4d5b528d900d5366800c984a2bf737770b2e4b1da8099de64f0e8c44caa08e0f",
     "cuda12.0.1-cudnn8.8-ubuntu20.04-manylinux2014-multipython": "sha256:2551b1587bdd0b63a4dd329eba6416cd07acb25496dde411c376609ce4f076f0",
-    "cuda12.2-cudnn8.9-ubuntu20.04-manylinux2014-multipython": "sha256:55cf36aa54debd7ec7b3aac5a84af1fe3691a186aceae8d1d8eafe886d6a6950",
+    "cuda12.2-cudnn8.9-ubuntu20.04-manylinux2014-multipython": "sha256:8b55e288ed59e960abd3a3fdcbb19fd7c36183f71766d4733aeb38ab73d1e8d7",
     # ROCM, probably not all of them still in use
     "rocm-ubuntu18.04-manylinux2010-multipython": "sha256:6e953a09b145df338bcb03e9e36f99b291140c29b72d0a048fb6c5905ccad5eb",
     "rocm-ubuntu20.04-manylinux2014-multipython": "sha256:906faec7765fe5dd067f2b092b5d5f220c1fedde725fb42c83d031b4d6f32204",

From cc390d943023314588f939f4a7bf2758ef8e837a Mon Sep 17 00:00:00 2001
From: Eugene Zhulenev <ezhulenev@google.com>
Date: Mon, 25 Sep 2023 18:37:05 -0700
Subject: [PATCH 247/567] [stream_executor] CommandBuffer: add APIs for
 recording kernel launch and memcpy operations

This implementation is based on gpu_graph.{h,cc} and, for the CUDA backend, adds CUDA graph nodes to the graph under construction.

https://github.com/openxla/xla/issues/5857

PiperOrigin-RevId: 568382083
---
 .../xla/xla/stream_executor/command_buffer.h  |  4 ++
 .../stream_executor/cuda/cuda_gpu_executor.cc |  2 +-
 third_party/xla/xla/stream_executor/gpu/BUILD |  2 +
 .../stream_executor/gpu/gpu_command_buffer.cc | 43 ++++++++++++++++++-
 .../stream_executor/gpu/gpu_command_buffer.h  | 15 ++++++-
 .../stream_executor_internal.h                | 41 +++++++++++++++++-
 6 files changed, 102 insertions(+), 5 deletions(-)

diff --git a/third_party/xla/xla/stream_executor/command_buffer.h b/third_party/xla/xla/stream_executor/command_buffer.h
index fba2e1ec99e4da..a22ddb501732fd 100644
--- a/third_party/xla/xla/stream_executor/command_buffer.h
+++ b/third_party/xla/xla/stream_executor/command_buffer.h
@@ -42,6 +42,10 @@ class CommandBuffer {
 
   static tsl::StatusOr<CommandBuffer> Create(StreamExecutor* executor);
 
+  internal::CommandBufferInterface* operator->() {
+    return implementation_.get();
+  }
+
  private:
   std::unique_ptr<internal::CommandBufferInterface> implementation_;
 
diff --git a/third_party/xla/xla/stream_executor/cuda/cuda_gpu_executor.cc b/third_party/xla/xla/stream_executor/cuda/cuda_gpu_executor.cc
index 483f9a973083d1..212cc6643a006f 100644
--- a/third_party/xla/xla/stream_executor/cuda/cuda_gpu_executor.cc
+++ b/third_party/xla/xla/stream_executor/cuda/cuda_gpu_executor.cc
@@ -849,7 +849,7 @@ GpuExecutor::GetCommandBufferImplementation() {
   VLOG(2) << "Create CUDA command buffer (CUDA graph)";
   GpuGraphHandle graph = nullptr;
   TF_RETURN_IF_ERROR(GpuDriver::CreateGraph(&graph));
-  return std::make_unique<GpuCommandBuffer>(graph);
+  return std::make_unique<GpuCommandBuffer>(this, graph);
 }
 
 void* GpuExecutor::GpuContextHack() { return context_; }
diff --git a/third_party/xla/xla/stream_executor/gpu/BUILD b/third_party/xla/xla/stream_executor/gpu/BUILD
index 87a41605b5046d..80bda68ef3f6b7 100644
--- a/third_party/xla/xla/stream_executor/gpu/BUILD
+++ b/third_party/xla/xla/stream_executor/gpu/BUILD
@@ -90,11 +90,13 @@ cc_library(
     deps = [
         ":gpu_driver_header",
         ":gpu_executor_header",
+        ":gpu_kernel_header",
         ":gpu_stream",
         ":gpu_types_header",
         "//xla/stream_executor:stream_executor_headers",
         "@com_google_absl//absl/log",
         "@com_google_absl//absl/log:check",
+        "@local_tsl//tsl/platform:errors",
         "@local_tsl//tsl/platform:status",
         "@local_tsl//tsl/platform:statusor",
     ],
diff --git a/third_party/xla/xla/stream_executor/gpu/gpu_command_buffer.cc b/third_party/xla/xla/stream_executor/gpu/gpu_command_buffer.cc
index 9e18de6611234e..03c4d170bd11ed 100644
--- a/third_party/xla/xla/stream_executor/gpu/gpu_command_buffer.cc
+++ b/third_party/xla/xla/stream_executor/gpu/gpu_command_buffer.cc
@@ -21,7 +21,13 @@ limitations under the License.
 #include "absl/log/check.h"
 #include "absl/log/log.h"
 #include "xla/stream_executor/gpu/gpu_driver.h"
+#include "xla/stream_executor/gpu/gpu_executor.h"
+#include "xla/stream_executor/gpu/gpu_kernel.h"
 #include "xla/stream_executor/gpu/gpu_types.h"
+#include "xla/stream_executor/kernel.h"
+#include "xla/stream_executor/launch_dim.h"
+#include "tsl/platform/errors.h"
+#include "tsl/platform/status.h"
 
 namespace stream_executor::gpu {
 
@@ -54,8 +60,8 @@ static int64_t NotifyExecDestroyed() {
 // GpuCommandBuffer implementation
 //===----------------------------------------------------------------------===//
 
-GpuCommandBuffer::GpuCommandBuffer(GpuGraphHandle graph)
-    : graph_(graph), exec_(nullptr) {}
+GpuCommandBuffer::GpuCommandBuffer(GpuExecutor* parent, GpuGraphHandle graph)
+    : parent_(parent), graph_(graph), exec_(nullptr) {}
 
 GpuCommandBuffer::~GpuCommandBuffer() {
   if (exec_ != nullptr) {
@@ -70,4 +76,37 @@ GpuCommandBuffer::~GpuCommandBuffer() {
   }
 }
 
+static GpuDevicePtr AsDevicePtr(const DeviceMemoryBase& mem) {
+  return reinterpret_cast<GpuDevicePtr>(const_cast<void*>(mem.opaque()));
+}
+
+tsl::Status GpuCommandBuffer::Launch(const ThreadDim& threads,
+                                     const BlockDim& blocks,
+                                     const KernelBase& kernel,
+                                     const KernelArgsArrayBase& args) {
+  const GpuKernel* gpu_kernel = AsGpuKernel(&kernel);
+  GpuFunctionHandle gpu_func = gpu_kernel->AsGpuFunctionHandle();
+
+  void** kernel_params = const_cast<void**>(args.argument_addresses().data());
+
+  GpuGraphNodeHandle node;
+  TF_RETURN_IF_ERROR(GpuDriver::GraphAddKernelNode(
+      &node, graph_, {}, kernel.name(), gpu_func, blocks.x, blocks.y, blocks.z,
+      threads.x, threads.y, threads.z, args.number_of_shared_bytes(),
+      kernel_params, /*extra=*/nullptr));
+
+  return tsl::OkStatus();
+}
+
+tsl::Status GpuCommandBuffer::MemcpyDeviceToDevice(DeviceMemoryBase* dst,
+                                                   const DeviceMemoryBase& src,
+                                                   uint64_t size) {
+  GpuGraphNodeHandle node;
+  TF_RETURN_IF_ERROR(GpuDriver::GraphAddMemcpyD2DNode(
+      parent_->gpu_context(), &node, graph_, {}, AsDevicePtr(*dst),
+      AsDevicePtr(src), size));
+
+  return tsl::OkStatus();
+}
+
 }  // namespace stream_executor::gpu
diff --git a/third_party/xla/xla/stream_executor/gpu/gpu_command_buffer.h b/third_party/xla/xla/stream_executor/gpu/gpu_command_buffer.h
index 7ce16607b7a60d..b744a02e90be1f 100644
--- a/third_party/xla/xla/stream_executor/gpu/gpu_command_buffer.h
+++ b/third_party/xla/xla/stream_executor/gpu/gpu_command_buffer.h
@@ -19,8 +19,12 @@ limitations under the License.
 #include <cstdint>
 #include <type_traits>
 
+#include "xla/stream_executor/gpu/gpu_executor.h"
 #include "xla/stream_executor/gpu/gpu_types.h"
+#include "xla/stream_executor/kernel.h"
+#include "xla/stream_executor/launch_dim.h"
 #include "xla/stream_executor/stream_executor_internal.h"
+#include "tsl/platform/status.h"
 
 namespace stream_executor::gpu {
 
@@ -28,9 +32,17 @@ namespace stream_executor::gpu {
 // implementation (it's backed by CUDA or HIP graphs on NVIDIA and AMD devices).
 class GpuCommandBuffer : public internal::CommandBufferInterface {
  public:
-  explicit GpuCommandBuffer(GpuGraphHandle graph);
+  GpuCommandBuffer(GpuExecutor* parent, GpuGraphHandle graph);
   ~GpuCommandBuffer() override;
 
+  tsl::Status Launch(const ThreadDim& threads, const BlockDim& blocks,
+                     const KernelBase& kernel,
+                     const KernelArgsArrayBase& args) override;
+
+  tsl::Status MemcpyDeviceToDevice(DeviceMemoryBase* dst,
+                                   const DeviceMemoryBase& src,
+                                   uint64_t size) override;
+
   // We track the total number of allocated and alive executable graphs in the
   // process to track the command buffers resource usage. Executable graph
   // allocates resources on a GPU devices (rule of thumb is ~8kb per node), so
@@ -50,6 +62,7 @@ class GpuCommandBuffer : public internal::CommandBufferInterface {
   static_assert(std::is_pointer_v<GpuGraphExecHandle>,
                 "GpuGraphExecHandle must be a pointer");
 
+  GpuExecutor* parent_;                // not owned, must outlive *this
   GpuGraphHandle graph_ = nullptr;     // owned handle
   GpuGraphExecHandle exec_ = nullptr;  // owned handle
 };
diff --git a/third_party/xla/xla/stream_executor/stream_executor_internal.h b/third_party/xla/xla/stream_executor/stream_executor_internal.h
index 9acd28dc84deb5..b57d683a82301d 100644
--- a/third_party/xla/xla/stream_executor/stream_executor_internal.h
+++ b/third_party/xla/xla/stream_executor/stream_executor_internal.h
@@ -52,6 +52,10 @@ namespace stream_executor {
 
 class Stream;
 
+//===----------------------------------------------------------------------===//
+// ModuleHandle
+//===----------------------------------------------------------------------===//
+
 // An opaque handle to a loaded module.
 //
 // An instance of this is returned from StreamExecutor::GetModule.
@@ -71,6 +75,10 @@ class ModuleHandle {
 
 namespace internal {
 
+//===----------------------------------------------------------------------===//
+// EventInterface
+//===----------------------------------------------------------------------===//
+
 // Platform-dependent interface class for the generic Events interface, in
 // the PIMPL style.
 class EventInterface {
@@ -82,6 +90,10 @@ class EventInterface {
   SE_DISALLOW_COPY_AND_ASSIGN(EventInterface);
 };
 
+//===----------------------------------------------------------------------===//
+// KernelInterface
+//===----------------------------------------------------------------------===//
+
 // Pointer-to-implementation object type (i.e. the KernelBase class delegates to
 // this interface) with virtual destruction. This class exists for the
 // platform-dependent code to hang any kernel data/resource info/functionality
@@ -107,16 +119,39 @@ class KernelInterface {
   SE_DISALLOW_COPY_AND_ASSIGN(KernelInterface);
 };
 
-// Platform-dependent interface class implementing generic CommandBuffer.
+//===----------------------------------------------------------------------===//
+// CommandBufferInterface
+//===----------------------------------------------------------------------===//
+
+// Platform-dependent interface class for implementing generic CommandBuffer.
+//
+// TODO(ezhulenev): Currently we assume that all operations between barriers
+// can execute concurrently, and it's up to the caller to insert barriers to
+// guarantee correctness. Consider adding finer grained synchronization
+// mechanism between different commands.
 class CommandBufferInterface {
  public:
   CommandBufferInterface() = default;
   virtual ~CommandBufferInterface() = default;
 
+  // Adds a kernel launch command to the command buffer.
+  virtual tsl::Status Launch(const ThreadDim& threads, const BlockDim& blocks,
+                             const KernelBase& kernel,
+                             const KernelArgsArrayBase& args) = 0;
+
+  // Adds a device-to-device memory copy to the command buffer.
+  virtual tsl::Status MemcpyDeviceToDevice(DeviceMemoryBase* dst,
+                                           const DeviceMemoryBase& src,
+                                           uint64_t size) = 0;
+
  private:
   SE_DISALLOW_COPY_AND_ASSIGN(CommandBufferInterface);
 };
 
+//===----------------------------------------------------------------------===//
+// StreamInterface
+//===----------------------------------------------------------------------===//
+
 // Pointer-to-implementation object type (i.e. the Stream class delegates to
 // this interface) with virtual destruction. This class exists for the
 // platform-dependent code to hang any kernel data/resource info/functionality
@@ -155,6 +190,10 @@ class StreamInterface {
   SE_DISALLOW_COPY_AND_ASSIGN(StreamInterface);
 };
 
+//===----------------------------------------------------------------------===//
+// StreamExecutorInterface
+//===----------------------------------------------------------------------===//
+
 // Interface for the different StreamExecutor platforms (i.e. CUDA, OpenCL).
 //
 // Various platforms will provide an implementation that satisfy this interface.

From d8b5490ded4b24da09d7fe08d8a6b116b3e9d452 Mon Sep 17 00:00:00 2001
From: Yash Katariya <yashkatariya@google.com>
Date: Mon, 25 Sep 2023 18:49:21 -0700
Subject: [PATCH 248/567] Remove bazel from jax's docker image and use the ones
 preinstalled in the sigbuild image

PiperOrigin-RevId: 568383806
---
 ...rbe.cuda11.8-cudnn8.6-ubuntu20.04-manylinux2014-multipython | 3 ---
 ...rbe.cuda12.2-cudnn8.9-ubuntu20.04-manylinux2014-multipython | 3 ---
 2 files changed, 6 deletions(-)

diff --git a/tensorflow/tools/ci_build/Dockerfile.rbe.cuda11.8-cudnn8.6-ubuntu20.04-manylinux2014-multipython b/tensorflow/tools/ci_build/Dockerfile.rbe.cuda11.8-cudnn8.6-ubuntu20.04-manylinux2014-multipython
index 83223fde6e478c..460e54add5d558 100644
--- a/tensorflow/tools/ci_build/Dockerfile.rbe.cuda11.8-cudnn8.6-ubuntu20.04-manylinux2014-multipython
+++ b/tensorflow/tools/ci_build/Dockerfile.rbe.cuda11.8-cudnn8.6-ubuntu20.04-manylinux2014-multipython
@@ -31,9 +31,6 @@ RUN apt-get update && apt-get install -y \
       && \
     rm -rf /var/lib/apt/lists/*
 
-COPY install/install_bazel.sh /install/
-RUN /install/install_bazel.sh
-
 COPY install/build_and_install_python.sh /install/
 RUN /install/build_and_install_python.sh "3.9.4"
 RUN /install/build_and_install_python.sh "3.10.0"
diff --git a/tensorflow/tools/ci_build/Dockerfile.rbe.cuda12.2-cudnn8.9-ubuntu20.04-manylinux2014-multipython b/tensorflow/tools/ci_build/Dockerfile.rbe.cuda12.2-cudnn8.9-ubuntu20.04-manylinux2014-multipython
index 12e9356664f896..308b709bf51471 100644
--- a/tensorflow/tools/ci_build/Dockerfile.rbe.cuda12.2-cudnn8.9-ubuntu20.04-manylinux2014-multipython
+++ b/tensorflow/tools/ci_build/Dockerfile.rbe.cuda12.2-cudnn8.9-ubuntu20.04-manylinux2014-multipython
@@ -30,9 +30,6 @@ RUN apt-get update && apt-get install -y \
       && \
     rm -rf /var/lib/apt/lists/*
 
-COPY install/install_bazel.sh /install/
-RUN /install/install_bazel.sh
-
 COPY install/build_and_install_python.sh /install/
 RUN /install/build_and_install_python.sh "3.9.4"
 RUN /install/build_and_install_python.sh "3.10.0"

From cf5e85663fef097f83f5004f112ae42758e9b035 Mon Sep 17 00:00:00 2001
From: Eugene Zhulenev <ezhulenev@google.com>
Date: Mon, 25 Sep 2023 19:43:50 -0700
Subject: [PATCH 249/567] [stream_executor] NFC: Restrict visibility of
 stream_executor_pimpl target

`stream_executor_pimpl` is an internal implementation detail planned to be removed. Remove all external users of this target and clean up some of the build files to remove dependencies on other internal targets planned to be removed.

https://github.com/openxla/xla/issues/5761

PiperOrigin-RevId: 568390988
---
 tensorflow/c/experimental/stream_executor/BUILD       |  3 ---
 tensorflow/python/BUILD                               |  2 +-
 third_party/xla/xla/backends/interpreter/BUILD        |  2 --
 third_party/xla/xla/client/BUILD                      |  1 -
 third_party/xla/xla/mlir/backends/gpu/BUILD           |  2 +-
 third_party/xla/xla/service/BUILD                     | 11 +++--------
 third_party/xla/xla/service/cpu/BUILD                 |  2 --
 third_party/xla/xla/service/gpu/tests/BUILD           |  4 ++--
 third_party/xla/xla/stream_executor/BUILD             |  6 ++++--
 third_party/xla/xla/stream_executor/cuda/BUILD        |  8 ++++----
 third_party/xla/xla/stream_executor/gpu/BUILD         |  4 ++--
 third_party/xla/xla/stream_executor/host/BUILD        |  1 -
 third_party/xla/xla/stream_executor/rocm/BUILD        |  6 +++---
 .../xla/xla/translate/mhlo_to_lhlo_with_xla/BUILD     |  2 +-
 third_party/xla/xla/xla.bzl                           |  2 ++
 15 files changed, 23 insertions(+), 33 deletions(-)

diff --git a/tensorflow/c/experimental/stream_executor/BUILD b/tensorflow/c/experimental/stream_executor/BUILD
index 5cf55e03aa778e..4c02379a999052 100644
--- a/tensorflow/c/experimental/stream_executor/BUILD
+++ b/tensorflow/c/experimental/stream_executor/BUILD
@@ -46,7 +46,6 @@ cc_library(
         "//tensorflow/core/platform:strcat",
         "@com_google_absl//absl/functional:any_invocable",
         "@local_xla//xla/stream_executor",
-        "@local_xla//xla/stream_executor:multi_platform_manager",
         "@local_xla//xla/stream_executor:platform",
     ],
 )
@@ -80,8 +79,6 @@ tf_cc_test(
         "//tensorflow/core:test_main",
         "//tensorflow/core/protobuf:error_codes_proto_impl_cc",
         "@local_xla//xla/stream_executor",
-        "@local_xla//xla/stream_executor:multi_platform_manager",
-        "@local_xla//xla/stream_executor:stream_executor_pimpl",
     ],
 )
 
diff --git a/tensorflow/python/BUILD b/tensorflow/python/BUILD
index 4d79a70ca67e2c..7ad991d1d5b87b 100644
--- a/tensorflow/python/BUILD
+++ b/tensorflow/python/BUILD
@@ -940,7 +940,7 @@ filegroup(
         "@local_tsl//tsl/profiler/rpc/client:profiler_client_impl",
         "@local_tsl//tsl/python/lib/core:ml_dtypes_lib",  # bfloat16, float8_e4m3fn, float8_e5m2
         "@local_tsl//tsl/python/lib/core:numpy",  # checkpoint_reader
-        "@local_xla//xla/stream_executor:stream_executor_pimpl",  # stat_summarizer
+        "@local_xla//xla/stream_executor",  # stat_summarizer
     ] + if_xla_available([
         "//tensorflow/compiler/aot:tfcompile_lib",  # tfcompile
         "@local_xla//xla:status_macros",  # tfcompile
diff --git a/third_party/xla/xla/backends/interpreter/BUILD b/third_party/xla/xla/backends/interpreter/BUILD
index 938e4ae4af27d4..bba10b2e1fd7ee 100644
--- a/third_party/xla/xla/backends/interpreter/BUILD
+++ b/third_party/xla/xla/backends/interpreter/BUILD
@@ -144,8 +144,6 @@ cc_library(
         ":executor",
         ":platform_id",
         "//xla/stream_executor",
-        "//xla/stream_executor:executor_cache",
-        "//xla/stream_executor:stream_executor_pimpl",
         "//xla/stream_executor/platform",
         "@com_google_absl//absl/strings:str_format",
         "@local_tsl//tsl/platform:status",
diff --git a/third_party/xla/xla/client/BUILD b/third_party/xla/xla/client/BUILD
index 4c47f78ef180ad..047f2590bd3c6e 100644
--- a/third_party/xla/xla/client/BUILD
+++ b/third_party/xla/xla/client/BUILD
@@ -142,7 +142,6 @@ cc_library(
         "//xla/service:stream_pool",
         "//xla/stream_executor",
         "//xla/stream_executor:device_memory_allocator",
-        "//xla/stream_executor:stream_executor_pimpl",  # fixdeps: keep
         "@com_google_absl//absl/types:span",
     ],
 )
diff --git a/third_party/xla/xla/mlir/backends/gpu/BUILD b/third_party/xla/xla/mlir/backends/gpu/BUILD
index 0cbdc1e915e4c6..e97e14560506cd 100644
--- a/third_party/xla/xla/mlir/backends/gpu/BUILD
+++ b/third_party/xla/xla/mlir/backends/gpu/BUILD
@@ -20,7 +20,7 @@ xla_cc_binary(
         "//xla/mlir/backends/gpu/transforms:passes",
         "//xla/mlir_hlo:lhlo",
         "//xla/mlir_hlo:lhlo_gpu",
-        "//xla/stream_executor:stream_executor_impl",
+        "//xla/stream_executor",
         "@llvm-project//mlir:FuncDialect",
         "@llvm-project//mlir:FuncExtensions",
         "@llvm-project//mlir:GPUDialect",
diff --git a/third_party/xla/xla/service/BUILD b/third_party/xla/xla/service/BUILD
index 2a4e6d77e45cba..b29f6d1a832b07 100644
--- a/third_party/xla/xla/service/BUILD
+++ b/third_party/xla/xla/service/BUILD
@@ -1065,7 +1065,6 @@ cc_library(
         "//xla:types",
         "//xla:util",
         "//xla/stream_executor",
-        "//xla/stream_executor:stream_executor_pimpl",
         "//xla/stream_executor/cuda:cuda_platform_id",
         "//xla/stream_executor/host:host_platform_id",
         "//xla/stream_executor/rocm:rocm_platform_id",
@@ -1090,7 +1089,6 @@ cc_library(
         "//xla:util",
         "//xla/stream_executor",
         "//xla/stream_executor:device_memory_allocator",
-        "//xla/stream_executor:stream_executor_pimpl",
         "//xla/stream_executor/host:host_platform_id",
         "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/strings",
@@ -1145,7 +1143,6 @@ cc_library(
         "//xla/hlo/ir:hlo_module_group",
         "//xla/stream_executor",
         "//xla/stream_executor:device_memory_allocator",
-        "//xla/stream_executor:stream_executor_pimpl",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/strings:str_format",
         "@com_google_absl//absl/types:span",
@@ -1335,11 +1332,12 @@ cc_library(
     name = "gpu_plugin_impl",
     compatible_with = get_compatible_with_portable(),
     visibility = ["//visibility:public"],
-    deps = if_gpu_is_configured([
+    deps = [
+        "//xla/stream_executor:stream_executor_impl",
+    ] + if_gpu_is_configured([
         ":service",
         "//xla/service/gpu:gpu_compiler",
         "//xla/service/gpu:gpu_transfer_manager",
-        "//xla/stream_executor",
     ]) + if_cuda_is_configured([
         "//xla/service/gpu:nvptx_compiler",
         "//xla/stream_executor/cuda:stream_executor_cuda",
@@ -1562,7 +1560,6 @@ cc_library(
         "//xla:util",
         "//xla:xla_data_proto_cc",
         "//xla/stream_executor",
-        "//xla/stream_executor:stream_executor_pimpl",
         "@local_tsl//tsl/platform:logging",
     ],
 )
@@ -3901,7 +3898,6 @@ cc_library(
         "//xla:util",
         "//xla:xla_data_proto_cc",
         "//xla/stream_executor",
-        "//xla/stream_executor:stream_executor_pimpl",
         "@local_tsl//tsl/platform:errors",
         "@local_tsl//tsl/platform:logging",
     ],
@@ -5396,7 +5392,6 @@ cc_library(
     visibility = ["//visibility:public"],
     deps = [
         "//xla/stream_executor",
-        "//xla/stream_executor:stream_executor_pimpl",
     ],
 )
 
diff --git a/third_party/xla/xla/service/cpu/BUILD b/third_party/xla/xla/service/cpu/BUILD
index 6bbaf8de88d43f..f4c9fde056b1c2 100644
--- a/third_party/xla/xla/service/cpu/BUILD
+++ b/third_party/xla/xla/service/cpu/BUILD
@@ -344,7 +344,6 @@ cc_library(
         "//xla/service/llvm_ir:llvm_util",
         "//xla/service/spmd:stateful_rng_spmd_partitioner",
         "//xla/stream_executor",
-        "//xla/stream_executor:stream_executor_pimpl",  # fixdeps: keep
         "//xla/stream_executor/host:host_platform_id",
         "//xla/translate/hlo_to_mhlo:hlo_to_mlir_hlo",
         "//xla/translate/hlo_to_mhlo:hlo_utils",
@@ -439,7 +438,6 @@ cc_library(
         "//xla/service:hlo_proto_cc",
         "//xla/service:llvm_compiler",
         "//xla/stream_executor",
-        "//xla/stream_executor:stream_executor_pimpl",  # fixdeps: keep
         "//xla/stream_executor/host:host_platform_id",
         "@llvm-project//llvm:Target",
     ],
diff --git a/third_party/xla/xla/service/gpu/tests/BUILD b/third_party/xla/xla/service/gpu/tests/BUILD
index 854766e620a00f..06d9e3ed3bc01a 100644
--- a/third_party/xla/xla/service/gpu/tests/BUILD
+++ b/third_party/xla/xla/service/gpu/tests/BUILD
@@ -727,10 +727,10 @@ xla_cc_binary(
         "//xla/service/gpu:gpu_device_info_for_tests",
         "//xla/service/gpu:target_constants",
         "//xla/service/gpu/llvm_gpu_backend",
+        "//xla/stream_executor",
         "//xla/stream_executor:device_description",
         "//xla/stream_executor:device_description_proto_cc_impl",
         "//xla/stream_executor:dnn",
-        "//xla/stream_executor:stream_executor_impl",
         "//xla/stream_executor/host:host_platform",
         "//xla/tests:test_utils",
         "//xla/tools:hlo_module_loader",
@@ -891,9 +891,9 @@ xla_cc_test(
         "//xla/client:xla_computation",
         "//xla/hlo/ir:hlo",
         "//xla/service/gpu:cublas_cudnn",
+        "//xla/stream_executor",
         "//xla/stream_executor:device_description",
         "//xla/stream_executor:dnn",
-        "//xla/stream_executor:stream_executor_pimpl",
         "//xla/tests:client_library_test_base",
         "//xla/tests:hlo_test_base",
         "//xla/tests:literal_test_util",
diff --git a/third_party/xla/xla/stream_executor/BUILD b/third_party/xla/xla/stream_executor/BUILD
index b0f10c4d83d930..ab8e0b77ef79f1 100644
--- a/third_party/xla/xla/stream_executor/BUILD
+++ b/third_party/xla/xla/stream_executor/BUILD
@@ -595,8 +595,8 @@ cc_library(
     hdrs = ["event.h"],
     visibility = ["//visibility:public"],
     deps = [
+        ":stream_executor_headers",
         ":stream_executor_internal",
-        ":stream_executor_pimpl_header",
         "//xla/stream_executor/platform",
         "@local_tsl//tsl/platform:status",
     ],
@@ -659,7 +659,6 @@ cc_library(
     deps = [
         ":device_memory",
         ":stream_executor_headers",
-        ":stream_executor_pimpl_header",
         ":temporary_device_memory",
         "//xla/stream_executor/platform",
         "@com_google_absl//absl/base:core_headers",
@@ -695,6 +694,7 @@ cc_library(
 cc_library(
     name = "stream_executor",
     textual_hdrs = [
+        "allocator_stats.h",
         "blas.h",
         "data_type.h",
         "device_description.h",
@@ -741,6 +741,8 @@ cc_library(
     name = "stream_executor_impl",
     visibility = ["//visibility:public"],
     deps = [
+        ":allocator_stats",
+        ":device_description",
         ":device_memory",
         ":dnn",
         ":dnn_proto_cc",
diff --git a/third_party/xla/xla/stream_executor/cuda/BUILD b/third_party/xla/xla/stream_executor/cuda/BUILD
index 22d30bf5fd23f9..3b5e64c27d93ea 100644
--- a/third_party/xla/xla/stream_executor/cuda/BUILD
+++ b/third_party/xla/xla/stream_executor/cuda/BUILD
@@ -61,7 +61,7 @@ cc_library(
             "//xla/stream_executor",  # buildcleaner: keep
             "//xla/stream_executor:executor_cache",
             "//xla/stream_executor:multi_platform_manager",
-            "//xla/stream_executor:stream_executor_pimpl_header",
+            "//xla/stream_executor:stream_executor_headers",
             "//xla/stream_executor/platform",
         ],
     ) + tf_additional_cuda_platform_deps() + [
@@ -360,7 +360,7 @@ cc_library(
         "//xla/stream_executor:event",
         "//xla/stream_executor:plugin_registry",
         "//xla/stream_executor:scratch_allocator",
-        "//xla/stream_executor:stream_executor_pimpl_header",
+        "//xla/stream_executor:stream_executor_headers",
         "//xla/stream_executor:temporary_device_memory",
         "//xla/stream_executor/platform",
         "@local_tsl//tsl/platform:errors",
@@ -381,7 +381,7 @@ cc_library(
         ":cuda_driver",
         "@local_config_cuda//cuda:cuda_headers",
         "//xla/stream_executor:event",
-        "//xla/stream_executor:stream_executor_pimpl_header",
+        "//xla/stream_executor:stream_executor_headers",
         "//xla/stream_executor/gpu:gpu_kernel_header",
         "//xla/stream_executor/platform",
     ]),
@@ -460,7 +460,7 @@ cc_library(
         "//xla/stream_executor:event",
         "//xla/stream_executor:plugin_registry",
         "//xla/stream_executor:stream_executor_internal",
-        "//xla/stream_executor:stream_executor_pimpl_header",
+        "//xla/stream_executor:stream_executor_headers",
         "//xla/stream_executor/gpu:asm_compiler",
         "//xla/stream_executor/gpu:gpu_command_buffer",
         "//xla/stream_executor/gpu:gpu_executor_header",
diff --git a/third_party/xla/xla/stream_executor/gpu/BUILD b/third_party/xla/xla/stream_executor/gpu/BUILD
index 80bda68ef3f6b7..5d35ea3380f44b 100644
--- a/third_party/xla/xla/stream_executor/gpu/BUILD
+++ b/third_party/xla/xla/stream_executor/gpu/BUILD
@@ -137,8 +137,8 @@ cc_library(
         ":gpu_kernel_header",
         "//xla/stream_executor:event",
         "//xla/stream_executor:platform",
+        "//xla/stream_executor:stream_executor_headers",
         "//xla/stream_executor:stream_executor_internal",
-        "//xla/stream_executor:stream_executor_pimpl_header",
         "//xla/stream_executor/platform",
         "@com_google_absl//absl/base:core_headers",
         "@com_google_absl//absl/container:flat_hash_map",
@@ -201,8 +201,8 @@ cc_library(
     deps = [
         ":gpu_driver_header",
         "//xla/stream_executor:event",
+        "//xla/stream_executor:stream_executor_headers",
         "//xla/stream_executor:stream_executor_internal",
-        "//xla/stream_executor:stream_executor_pimpl_header",
         "//xla/stream_executor/platform",
         "@local_tsl//tsl/platform:logging",
     ],
diff --git a/third_party/xla/xla/stream_executor/host/BUILD b/third_party/xla/xla/stream_executor/host/BUILD
index 1cea4b7e5485eb..d6b2d697af45d5 100644
--- a/third_party/xla/xla/stream_executor/host/BUILD
+++ b/third_party/xla/xla/stream_executor/host/BUILD
@@ -88,7 +88,6 @@ cc_library(
         "//xla/stream_executor",
         "//xla/stream_executor:kernel",
         "//xla/stream_executor:stream_executor_internal",
-        "//xla/stream_executor:stream_executor_pimpl",  # fixdeps: keep
         "@com_google_absl//absl/functional:any_invocable",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/synchronization",
diff --git a/third_party/xla/xla/stream_executor/rocm/BUILD b/third_party/xla/xla/stream_executor/rocm/BUILD
index a4c1bf64976212..57826e043149ca 100644
--- a/third_party/xla/xla/stream_executor/rocm/BUILD
+++ b/third_party/xla/xla/stream_executor/rocm/BUILD
@@ -111,7 +111,7 @@ cc_library(
         "//xla/stream_executor:event",
         "//xla/stream_executor:plugin_registry",
         "//xla/stream_executor:stream_executor_internal",
-        "//xla/stream_executor:stream_executor_pimpl_header",
+        "//xla/stream_executor:stream_executor_headers",
         "//xla/stream_executor/gpu:gpu_activation_header",
         "//xla/stream_executor/gpu:gpu_event",
         "//xla/stream_executor/gpu:gpu_kernel_header",
@@ -161,7 +161,7 @@ cc_library(
         "//xla/stream_executor",  # buildcleaner: keep
         "//xla/stream_executor:executor_cache",
         "//xla/stream_executor:multi_platform_manager",
-        "//xla/stream_executor:stream_executor_pimpl_header",
+        "//xla/stream_executor:stream_executor_headers",
         "//xla/stream_executor/platform",
     ]),
     alwayslink = True,  # Registers itself with the MultiPlatformManager.
@@ -318,7 +318,7 @@ cc_library(
         "//xla/stream_executor:event",
         "//xla/stream_executor:plugin_registry",
         "//xla/stream_executor:scratch_allocator",
-        "//xla/stream_executor:stream_executor_pimpl_header",
+        "//xla/stream_executor:stream_executor_headers",
         "//xla/stream_executor:temporary_device_memory",
         "//xla/stream_executor/gpu:gpu_activation_header",
         "//xla/stream_executor/gpu:gpu_stream_header",
diff --git a/third_party/xla/xla/translate/mhlo_to_lhlo_with_xla/BUILD b/third_party/xla/xla/translate/mhlo_to_lhlo_with_xla/BUILD
index f19765af13bb55..165c60944986e9 100644
--- a/third_party/xla/xla/translate/mhlo_to_lhlo_with_xla/BUILD
+++ b/third_party/xla/xla/translate/mhlo_to_lhlo_with_xla/BUILD
@@ -116,7 +116,7 @@ xla_cc_binary(
         "//xla/mlir_hlo:all_passes",
         "//xla/mlir_hlo:hlo_dialect_registration",
         "//xla/service:gpu_plugin",
-        "//xla/stream_executor:stream_executor_impl",
+        "//xla/stream_executor",
         "@llvm-project//llvm:Support",
         "@llvm-project//mlir:AllPassesAndDialects",
         "@llvm-project//mlir:MlirOptLib",
diff --git a/third_party/xla/xla/xla.bzl b/third_party/xla/xla/xla.bzl
index 3fe599358e2baf..ee021b9b78c951 100644
--- a/third_party/xla/xla/xla.bzl
+++ b/third_party/xla/xla/xla.bzl
@@ -63,6 +63,7 @@ def xla_cc_binary(deps = None, copts = tsl_copts(), **kwargs):
         "//xla/service/gpu:backend_configs_cc_impl",
         "//xla/service/gpu:hlo_op_profile_proto_cc_impl",
         "//xla/stream_executor:dnn_proto_cc_impl",
+        "//xla/stream_executor:stream_executor_impl",
         "@local_tsl//tsl/platform:env_impl",
         "@local_tsl//tsl/platform:tensor_float_32_utils",
         "@local_tsl//tsl/profiler/utils:time_utils_impl",
@@ -95,6 +96,7 @@ def xla_cc_test(
                        clean_dep("//xla/service/gpu:backend_configs_cc_impl"),
                        clean_dep("//xla/service/gpu:hlo_op_profile_proto_cc_impl"),
                        clean_dep("//xla/stream_executor:dnn_proto_cc_impl"),
+                       clean_dep("//xla/stream_executor:device_id_utils"),
                        clean_dep("//xla/stream_executor:stream_executor_impl"),
                        clean_dep("//xla/stream_executor/gpu:gpu_cudamallocasync_allocator"),
                        clean_dep("//xla/stream_executor/gpu:gpu_init_impl"),

From f947c9c19700e84c255a344ee0d03cc45cf9fd04 Mon Sep 17 00:00:00 2001
From: Luke Boyer <lukeboyer@google.com>
Date: Mon, 25 Sep 2023 20:09:56 -0700
Subject: [PATCH 250/567] TensorListElementShape kernel implementation.

PiperOrigin-RevId: 568396257
---
 tensorflow/lite/kernels/variants/BUILD        |  15 +++
 .../list_kernels/list_element_shape.cc        | 111 ++++++++++++++++++
 .../list_kernels/list_element_shape_test.cc   |  98 ++++++++++++++++
 .../lite/kernels/variants/list_ops_lib.h      |   2 +
 .../kernels/variants/register_list_ops.cc     |   1 +
 5 files changed, 227 insertions(+)
 create mode 100644 tensorflow/lite/kernels/variants/list_kernels/list_element_shape.cc
 create mode 100644 tensorflow/lite/kernels/variants/list_kernels/list_element_shape_test.cc

diff --git a/tensorflow/lite/kernels/variants/BUILD b/tensorflow/lite/kernels/variants/BUILD
index 55fd1a30ac1c5a..207f570302bffd 100644
--- a/tensorflow/lite/kernels/variants/BUILD
+++ b/tensorflow/lite/kernels/variants/BUILD
@@ -142,6 +142,21 @@ cc_test(
     ],
 )
 
+cc_test(
+    name = "list_element_shape_test",
+    srcs = ["list_kernels/list_element_shape_test.cc"],
+    deps = [
+        ":list_ops_lib",
+        ":tensor_array",
+        ":test_util",
+        "//tensorflow/lite/core/c:c_api_types",
+        "//tensorflow/lite/core/c:common",
+        "//tensorflow/lite/kernels:test_util",
+        "//tensorflow/lite/schema:schema_fbs",
+        "@com_google_googletest//:gtest_main",
+    ],
+)
+
 cc_library(
     name = "list_ops_util",
     srcs = ["list_ops_util.cc"],
diff --git a/tensorflow/lite/kernels/variants/list_kernels/list_element_shape.cc b/tensorflow/lite/kernels/variants/list_kernels/list_element_shape.cc
new file mode 100644
index 00000000000000..b07a72257f4638
--- /dev/null
+++ b/tensorflow/lite/kernels/variants/list_kernels/list_element_shape.cc
@@ -0,0 +1,111 @@
+/* Copyright 2023 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <cstdint>
+#include <cstring>
+
+#include "tensorflow/lite/array.h"
+#include "tensorflow/lite/core/c/c_api_types.h"
+#include "tensorflow/lite/core/c/common.h"
+#include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
+#include "tensorflow/lite/kernels/kernel_util.h"
+#include "tensorflow/lite/kernels/variants/list_ops_lib.h"
+#include "tensorflow/lite/kernels/variants/tensor_array.h"
+
+namespace tflite {
+namespace variants {
+namespace ops {
+namespace list_element_shape {
+namespace {
+
+// This kernel returns a `TfLiteTensor` which represents the shape signature
+// of the tensorlist elements stored in the `TensorArray::ElementShape` field as
+// a `TfLiteIntArray`. Encoding shape signatures in tensors works as follows:
+//
+// An unranked shape signature:
+//    `TfLiteIntArray` : []
+//    `TfLiteTensor` : shape = [], data = [-1]
+//
+// A scalar shape signature:
+//    `TfLiteIntArray` : [0]
+//    `TfLiteTensor` : shape = [0], data = []
+//
+// A ranked tensor shape signature (with possibly dynamic dimensions):
+//    `TfLiteIntArray` : [Dim1, Dim2 ... DimRank] (DimI >= -1)
+//    `TfLiteTensor` : shape = [Rank], data = [Dim1, Dim2 ... DimRank]
+
+using ::tflite::variants::TensorArray;
+
+constexpr int kListInput = 0;
+constexpr int kShapeOut = 0;
+
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+  TF_LITE_ENSURE_EQ(context, NumInputs(node), 1);
+  // Validates the op signature: exactly one input, and it must be a variant.
+  const TfLiteTensor* list_input;
+  TF_LITE_ENSURE_OK(context,
+                    GetInputSafe(context, node, kListInput, &list_input));
+  TF_LITE_ENSURE(context, list_input->type == kTfLiteVariant);
+  // The output that will carry the shape signature must be typed int32.
+  TfLiteTensor* shape_out;
+  TF_LITE_ENSURE_OK(context,
+                    GetOutputSafe(context, node, kShapeOut, &shape_out));
+  TF_LITE_ENSURE_TYPES_EQ(context, shape_out->type, kTfLiteInt32);
+  // The output dims are only chosen in `Eval`, so mark the tensor dynamic.
+  SetTensorToDynamic(shape_out);
+
+  return kTfLiteOk;
+}
+
+TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
+  // Writes the list's element shape signature, encoded as described above.
+  const TfLiteTensor* list_input;
+  TF_LITE_ENSURE_OK(context,
+                    GetInputSafe(context, node, kListInput, &list_input));
+  TfLiteTensor* shape_out;
+  TF_LITE_ENSURE_OK(context,
+                    GetOutputSafe(context, node, kShapeOut, &shape_out));
+
+  const TensorArray* const list =
+      reinterpret_cast<const TensorArray*>(list_input->data.data);
+  const TfLiteIntArray& element_shape = *list->ElementShape();
+  // Only the exact signature [0] is a scalar; e.g. [0, 3] is a ranked shape.
+  const bool is_scalar = element_shape.size == 1 && element_shape.data[0] == 0;
+  // Output dims: unranked -> [], scalar -> [0], ranked -> [rank].
+  auto out_dims = element_shape.size == 0 ? BuildTfLiteArray(0)
+                  : is_scalar             ? BuildTfLiteArray({0})
+                  : BuildTfLiteArray({element_shape.size});
+  // Propagate a failed resize instead of writing into the old buffer.
+  TF_LITE_ENSURE_OK(
+      context, context->ResizeTensor(context, shape_out, out_dims.release()));
+  if (element_shape.size == 0) {
+    GetTensorData<int32_t>(shape_out)[0] = -1;
+  } else if (!is_scalar) {
+    memcpy(GetTensorData<int32_t>(shape_out), element_shape.data,
+           element_shape.size * sizeof(int32_t));
+  }
+  return kTfLiteOk;
+}
+}  // namespace
+}  // namespace list_element_shape
+
+TfLiteRegistration* Register_LIST_ELEMENT_SHAPE() {
+  static TfLiteRegistration registration{  // init/free hooks are left null
+      nullptr, nullptr, list_element_shape::Prepare, list_element_shape::Eval};
+  return &registration;
+}
+
+}  // namespace ops
+}  // namespace variants
+}  // namespace tflite
diff --git a/tensorflow/lite/kernels/variants/list_kernels/list_element_shape_test.cc b/tensorflow/lite/kernels/variants/list_kernels/list_element_shape_test.cc
new file mode 100644
index 00000000000000..0078e047eedd69
--- /dev/null
+++ b/tensorflow/lite/kernels/variants/list_kernels/list_element_shape_test.cc
@@ -0,0 +1,98 @@
+/* Copyright 2023 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <vector>
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+#include "tensorflow/lite/core/c/c_api_types.h"
+#include "tensorflow/lite/core/c/common.h"
+#include "tensorflow/lite/kernels/test_util.h"
+#include "tensorflow/lite/kernels/variants/list_kernels/test_util.h"
+#include "tensorflow/lite/kernels/variants/list_ops_lib.h"
+#include "tensorflow/lite/schema/schema_generated.h"
+
+namespace tflite {
+namespace variants {
+namespace ops {
+namespace {
+
+using ::testing::ElementsAreArray;
+
+class ListElementShapeModel : public ListOpModel {
+ public:
+  ListElementShapeModel()
+      : list_input_(AddInput({TensorType_VARIANT, {}})),
+        shape_output_(AddOutput({TensorType_INT32, {}})) {
+    SetCustomOp("ListElementShape", {}, Register_LIST_ELEMENT_SHAPE);
+    BuildInterpreter({{}});
+  }
+  const TfLiteTensor* GetOutputTensor(int index) {
+    return interpreter_->tensor(index);
+  }
+  int list_input_;    // Tensor index returned by AddInput for the list.
+  int shape_output_;  // Tensor index returned by AddOutput for the shape.
+};
+
+TEST(ListElementShapeTest, MultiDimStaticShape) {
+  ListElementShapeModel m;
+  m.PopulateListTensor(0, {2, 2}, 10, kTfLiteInt32);
+  // NOTE(review): {2, 2} is presumably the list's element shape -- confirm.
+  ASSERT_EQ(m.Invoke(), kTfLiteOk);
+  // Expect a rank-1 output of length 2 whose values are {2, 2}.
+  const TfLiteTensor* const out = m.GetOutputTensor(m.shape_output_);
+  ASSERT_THAT(out, DimsAre({2}));
+  ASSERT_THAT(std::vector<int>(out->data.i32, out->data.i32 + 2),
+              ElementsAreArray({2, 2}));
+}
+
+TEST(ListElementShapeTest, MultiDimWithDynamicDims) {
+  ListElementShapeModel m;
+  m.PopulateListTensor(0, {2, -1, 3}, 10, kTfLiteInt32);
+  // The kernel's encoding notes allow -1 entries (dynamic dimensions).
+  ASSERT_EQ(m.Invoke(), kTfLiteOk);
+  // Expect a rank-1 output of length 3 with the -1 passed through unchanged.
+  const TfLiteTensor* const out = m.GetOutputTensor(m.shape_output_);
+  ASSERT_THAT(out, DimsAre({3}));
+  ASSERT_THAT(std::vector<int>(out->data.i32, out->data.i32 + 3),
+              ElementsAreArray({2, -1, 3}));
+}
+
+TEST(ListElementShapeTest, ScalarShape) {
+  ListElementShapeModel m;
+  m.PopulateListTensor(0, {0}, 10, kTfLiteInt32);
+  // {0} is the scalar signature in the kernel's encoding.
+  ASSERT_EQ(m.Invoke(), kTfLiteOk);
+  // Expect dims [0], i.e. an output tensor that carries no data bytes.
+  const TfLiteTensor* const out = m.GetOutputTensor(m.shape_output_);
+  ASSERT_THAT(out, DimsAre({0}));
+  ASSERT_EQ(out->bytes, 0);
+}
+
+TEST(ListElementShapeTest, UnrankedShape) {
+  ListElementShapeModel m;
+  m.PopulateListTensor(0, {}, 10, kTfLiteInt32);
+  // An empty signature is the unranked case in the kernel's encoding.
+  ASSERT_EQ(m.Invoke(), kTfLiteOk);
+  // Expect a rank-0 output holding exactly one int whose value is -1.
+  const TfLiteTensor* const out = m.GetOutputTensor(m.shape_output_);
+  ASSERT_THAT(out, DimsAre({}));
+  ASSERT_EQ(out->bytes, sizeof(int));
+  ASSERT_EQ(out->data.i32[0], -1);
+}
+
+}  // namespace
+}  // namespace ops
+}  // namespace variants
+}  // namespace tflite
diff --git a/tensorflow/lite/kernels/variants/list_ops_lib.h b/tensorflow/lite/kernels/variants/list_ops_lib.h
index 4bf1b6038661ed..f0e8eff703b3a9 100644
--- a/tensorflow/lite/kernels/variants/list_ops_lib.h
+++ b/tensorflow/lite/kernels/variants/list_ops_lib.h
@@ -41,6 +41,8 @@ TfLiteRegistration* Register_LIST_GET_ITEM();
 
 TfLiteRegistration* Register_LIST_LENGTH();
 
+TfLiteRegistration* Register_LIST_ELEMENT_SHAPE();
+
 }  // namespace ops
 }  // namespace variants
 }  // namespace tflite
diff --git a/tensorflow/lite/kernels/variants/register_list_ops.cc b/tensorflow/lite/kernels/variants/register_list_ops.cc
index 6c9f205377b0c5..cdcc47986158f2 100644
--- a/tensorflow/lite/kernels/variants/register_list_ops.cc
+++ b/tensorflow/lite/kernels/variants/register_list_ops.cc
@@ -28,6 +28,7 @@ void RegisterListOps(MutableOpResolver* resolver) {
   resolver->AddCustom("TensorListFromTensor", Register_LIST_FROM_TENSOR());
   resolver->AddCustom("TensorListGetItem", Register_LIST_GET_ITEM());
   resolver->AddCustom("TensorListLength", Register_LIST_LENGTH());
+  resolver->AddCustom("TensorListElementShape", Register_LIST_ELEMENT_SHAPE());
 }
 
 }  // namespace ops

From c7b5860dbcc8ad8d8236d0a52603643d5886097a Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 25 Sep 2023 22:02:32 -0700
Subject: [PATCH 251/567] Adding helper method to allow setting profile keys
 map.

PiperOrigin-RevId: 568413206
---
 third_party/xla/xla/hlo/ir/hlo_module.h | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/third_party/xla/xla/hlo/ir/hlo_module.h b/third_party/xla/xla/hlo/ir/hlo_module.h
index 00356d955e9a61..7ebc303b1b83e8 100644
--- a/third_party/xla/xla/hlo/ir/hlo_module.h
+++ b/third_party/xla/xla/hlo/ir/hlo_module.h
@@ -563,6 +563,14 @@ class HloModule {
     autofdo_profile_keys_[profile_type] = std::string(profile_key);
   }
 
+  void set_autofdo_profile_keys(
+      const absl::flat_hash_map<HloModuleProto::ProfileType, std::string>&
+          profile_keys) {  // Merged into the stored map; nothing is cleared.
+    for (const auto& [profile_type, profile_key] : profile_keys) {
+      autofdo_profile_keys_[profile_type] = profile_key;  // Insert/overwrite.
+    }
+  }
+
   const absl::flat_hash_map<HloModuleProto::ProfileType, std::string>&
   autofdo_profile_keys() const {
     return autofdo_profile_keys_;

From f0f6eb81e7ed9e835f0ae59194ea389abd4cfe2a Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 25 Sep 2023 23:22:57 -0700
Subject: [PATCH 252/567] Fixes build error with clang compiler.

PiperOrigin-RevId: 568431979
---
 tensorflow/core/example/feature_util.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/core/example/feature_util.h b/tensorflow/core/example/feature_util.h
index 777040a692c4dc..5f55138a94f4b0 100644
--- a/tensorflow/core/example/feature_util.h
+++ b/tensorflow/core/example/feature_util.h
@@ -223,7 +223,7 @@ struct NoneSuch {};
 // True if the Feature map in a tf.Example supports heterogenous lookup.
 // See https://abseil.io/tips/144.
 inline constexpr bool kFeatureMapHasHeterogeneousLookup =
-    Requires<const decltype(Features::default_instance().feature())>(
+    Requires<decltype(Features::default_instance().feature())>(
         [](auto&& c) -> decltype(c.find(NoneSuch{})) {});
 
 // Converts an `absl::string_view` into a string-type compatible for use in the

From 42af713661bf7324c0a8e9b2ee21750ec417cd5e Mon Sep 17 00:00:00 2001
From: Shanbin Ke <shanbinke@gmail.com>
Date: Mon, 25 Sep 2023 23:24:38 -0700
Subject: [PATCH 253/567] PR #5881: [XLA:GPU] remove unused buffers to fix
 cudnn graph

Imported from GitHub PR https://github.com/openxla/xla/pull/5881

This PR is to fix the bug introduced in commit https://github.com/openxla/xla/pull/5184/commits/25aa1baa87abb259e3e0350737c4d39a715183a0 in runner clean up PR: https://github.com/openxla/xla/pull/5184.

Unused buffers in fused attention need to be removed from data_vec so cudnn graph finalization doesn't error out.
Copybara import of the project:

--
0e3302504012794982a2a7acbc157767ccf07fcb by cjkkkk <ske@nvidia.com>:

rm unused buffers to fix cudnn graph

--
b16f756f7f71dd6f1b5959dfe29ff8ec619bae6f by cjkkkk <ske@nvidia.com>:

fix format

Merging this change closes #5881

PiperOrigin-RevId: 568432367
---
 third_party/xla/xla/stream_executor/cuda/cuda_dnn.cc | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/third_party/xla/xla/stream_executor/cuda/cuda_dnn.cc b/third_party/xla/xla/stream_executor/cuda/cuda_dnn.cc
index c6496ca684ab85..4ebad8290a80ee 100644
--- a/third_party/xla/xla/stream_executor/cuda/cuda_dnn.cc
+++ b/third_party/xla/xla/stream_executor/cuda/cuda_dnn.cc
@@ -6734,6 +6734,16 @@ class CudnnExecutionPlanRunner<void(Args...)>
       data_ptrs_vec.pop_back();
     }
 
+    if (sizeof...(Args) == 7 || sizeof...(Args) == 11) {
+      // is fused attention fwd and bwd
+      // remove empty buffers from the list
+      data_ptrs_vec.erase(
+          std::remove(data_ptrs_vec.begin(), data_ptrs_vec.end(), nullptr),
+          data_ptrs_vec.end());
+      // ensure the size is equal after removing useless pointers
+      CHECK(data_ptrs_vec.size() == data_uids_vec.size());
+    }
+
     if (should_add_scalars) {
       data_uids_vec.insert(data_uids_vec.end(), scalar_input_uids_.begin(),
                            scalar_input_uids_.end());

From 8eba8a15c600fea12eb5f403eeed56ce9c315a03 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tam=C3=A1s=20Danyluk?= <tdanyluk@google.com>
Date: Tue, 26 Sep 2023 00:05:36 -0700
Subject: [PATCH 254/567] [XLA:GPU] Fall back to cuBLAS in TritonAutotuner if
 that's faster

PiperOrigin-RevId: 568440756
---
 third_party/xla/xla/debug_options_flags.cc    |  7 +++
 third_party/xla/xla/service/gpu/BUILD         |  3 ++
 .../xla/xla/service/gpu/float_support_test.cc |  1 +
 .../xla/xla/service/gpu/gpu_compiler.cc       | 12 ++++-
 .../gpu/ir_emitter_triton_large_test.cc       | 10 +++-
 .../xla/service/gpu/ir_emitter_triton_test.cc | 27 ++++++----
 .../xla/service/gpu/nvptx_compiler_test.cc    | 12 ++++-
 .../xla/xla/service/gpu/triton_autotuner.cc   | 54 +++++++++++++++----
 .../xla/service/gpu/triton_autotuner_test.cc  |  8 +--
 third_party/xla/xla/xla.proto                 |  6 ++-
 10 files changed, 111 insertions(+), 29 deletions(-)

diff --git a/third_party/xla/xla/debug_options_flags.cc b/third_party/xla/xla/debug_options_flags.cc
index f399ccd33c37a5..549b9e8846142e 100644
--- a/third_party/xla/xla/debug_options_flags.cc
+++ b/third_party/xla/xla/debug_options_flags.cc
@@ -193,6 +193,7 @@ DebugOptions DefaultDebugOptionsIgnoringFlags() {
   opts.set_xla_gpu_single_wave_autotuning(true);
   opts.set_xla_gpu_enable_reduction_epilogue_fusion(true);
   opts.set_xla_gpu_enable_nccl_clique_optimization(false);
+  opts.set_xla_gpu_cublas_fallback(true);
   return opts;
 }
 
@@ -1266,6 +1267,12 @@ void MakeDebugOptionsFlags(std::vector<tsl::Flag>* flag_list,
                     &DebugOptions::set_xla_gpu_enable_nccl_clique_optimization),
                 debug_options->xla_gpu_enable_nccl_clique_optimization(),
                 "Allow early return when acquiring NCCL cliques"));
+  flag_list->push_back(
+      tsl::Flag("xla_gpu_cublas_fallback",
+                bool_setter_for(&DebugOptions::set_xla_gpu_cublas_fallback),
+                debug_options->xla_gpu_cublas_fallback(),
+                "Allow Triton GEMM autotuning to fall back to cuBLAS when that "
+                "is faster."));
 }  // NOLINT(readability/fn_size)
 
 // Allocates flag_values and flag_objects; this function must not be called more
diff --git a/third_party/xla/xla/service/gpu/BUILD b/third_party/xla/xla/service/gpu/BUILD
index 3f0ca3b23d173c..eb65a040ba5708 100644
--- a/third_party/xla/xla/service/gpu/BUILD
+++ b/third_party/xla/xla/service/gpu/BUILD
@@ -605,6 +605,7 @@ xla_test(
         "//xla:error_spec",
         "//xla:xla_proto_cc",
         "//xla/service/gpu/tests:gpu_codegen_test",
+        "//xla/tests:hlo_test_base",
         "//xla/tests:xla_internal_test_main",  # fixdeps: keep
         "@com_google_googletest//:gtest",
     ],
@@ -669,6 +670,7 @@ cc_library(
         "@com_google_absl//absl/synchronization",
         "@com_google_absl//absl/time",
         "@com_google_absl//absl/types:span",
+        "@local_config_cuda//cuda:cuda_headers",
         "//xla:autotuning_proto_cc",
         "//xla:shape_util",
         "//xla:status_macros",
@@ -2918,6 +2920,7 @@ xla_cc_test(
         ":nvptx_compiler_impl",
         "//xla:statusor",
         "//xla:util",
+        "//xla:xla_proto_cc",
         "//xla/hlo/ir:hlo",
         "//xla/service:backend",
         "//xla/service:buffer_assignment",
diff --git a/third_party/xla/xla/service/gpu/float_support_test.cc b/third_party/xla/xla/service/gpu/float_support_test.cc
index 3f0bf03a2b1487..b7e2ffd6990b60 100644
--- a/third_party/xla/xla/service/gpu/float_support_test.cc
+++ b/third_party/xla/xla/service/gpu/float_support_test.cc
@@ -38,6 +38,7 @@ class FloatSupportTestWithTriton : public HloTestBase {
   DebugOptions GetDebugOptionsForTest() override {
     DebugOptions debug_options = HloTestBase::GetDebugOptionsForTest();
     debug_options.set_xla_gpu_triton_gemm_any(true);
+    debug_options.set_xla_gpu_cublas_fallback(false);
     return debug_options;
   }
 };
diff --git a/third_party/xla/xla/service/gpu/gpu_compiler.cc b/third_party/xla/xla/service/gpu/gpu_compiler.cc
index f025ef98cc0af9..cdf2a46ba69ed8 100644
--- a/third_party/xla/xla/service/gpu/gpu_compiler.cc
+++ b/third_party/xla/xla/service/gpu/gpu_compiler.cc
@@ -1000,10 +1000,18 @@ Status GpuCompiler::OptimizeHloPostLayoutAssignment(
   // f32).
   add_float_normalization(pipeline);
 
-  TF_RETURN_IF_ERROR(AddConvAndGemmAutotuningPasses(
-      &pipeline, hlo_module, autotune_config, thread_pool));
   TF_RETURN_IF_ERROR(AddTritonGemmAutotuningPasses(
       &pipeline, hlo_module, autotune_config, thread_pool));
+  // Inline back the calls which have better performance with cuBLAS.
+  pipeline.AddPass<CallInliner>();
+  // TODO(tdanyluk): Apply CublasPadForGemms to the cuBLAS GEMMs generated
+  // here for possibly better cuBLAS performance.
+  pipeline.AddPass<GemmRewriter>(gpu_version);
+  // Rewrite GEMMs with broadcasted inputs as strided GEMMs.
+  pipeline.AddPass<GemmBroadcastFoldingRewriter>();
+
+  TF_RETURN_IF_ERROR(AddConvAndGemmAutotuningPasses(
+      &pipeline, hlo_module, autotune_config, thread_pool));
 
   // The Triton autotuner can insert new bf16 reductions that need to be
   // normalized again.
diff --git a/third_party/xla/xla/service/gpu/ir_emitter_triton_large_test.cc b/third_party/xla/xla/service/gpu/ir_emitter_triton_large_test.cc
index 0ca4051230e49b..2fc8733027ac26 100644
--- a/third_party/xla/xla/service/gpu/ir_emitter_triton_large_test.cc
+++ b/third_party/xla/xla/service/gpu/ir_emitter_triton_large_test.cc
@@ -18,13 +18,21 @@ limitations under the License.
 #include <gtest/gtest.h>
 #include "xla/error_spec.h"
 #include "xla/service/gpu/tests/gpu_codegen_test.h"
+#include "xla/tests/hlo_test_base.h"
 #include "xla/xla.pb.h"
 
 namespace xla {
 namespace gpu {
 namespace {
 
-using TritonGemmTest = GpuCodegenTest;
+class TritonGemmTest : public GpuCodegenTest {
+ public:
+  DebugOptions GetDebugOptionsForTest() override {  // Chains to direct base.
+    DebugOptions debug_options = GpuCodegenTest::GetDebugOptionsForTest();
+    debug_options.set_xla_gpu_cublas_fallback(false);  // No cuBLAS fallback.
+    return debug_options;
+  }
+};
 
 TEST_F(TritonGemmTest, IndexUsing64Bits) {
   const char* kHloTextRef = R"(
diff --git a/third_party/xla/xla/service/gpu/ir_emitter_triton_test.cc b/third_party/xla/xla/service/gpu/ir_emitter_triton_test.cc
index 65f6906475d8e2..014b006b6ab2a7 100644
--- a/third_party/xla/xla/service/gpu/ir_emitter_triton_test.cc
+++ b/third_party/xla/xla/service/gpu/ir_emitter_triton_test.cc
@@ -57,7 +57,22 @@ namespace {
 
 namespace m = ::xla::match;
 
-class TritonGemmNoTF32Test : public GpuCodegenTest {
+class TritonGemmTest : public GpuCodegenTest {
+ public:
+  se::CudaComputeCapability GetCudaComputeCapability() {
+    return backend()
+        .default_stream_executor()
+        ->GetDeviceDescription()
+        .cuda_compute_capability();
+  }
+  DebugOptions GetDebugOptionsForTest() override {  // Chains to direct base.
+    DebugOptions debug_options = GpuCodegenTest::GetDebugOptionsForTest();
+    debug_options.set_xla_gpu_cublas_fallback(false);  // No cuBLAS fallback.
+    return debug_options;
+  }
+};
+
+class TritonGemmNoTF32Test : public TritonGemmTest {
  public:
   void SetUp() override {
     tf32_state_ = tsl::tensor_float_32_execution_enabled();
@@ -100,16 +115,6 @@ CHECK-NOT: mma
 )");
 }
 
-class TritonGemmTest : public GpuCodegenTest {
- public:
-  se::CudaComputeCapability GetCudaComputeCapability() {
-    return backend()
-        .default_stream_executor()
-        ->GetDeviceDescription()
-        .cuda_compute_capability();
-  }
-};
-
 TEST_F(TritonGemmTest, DebugOptionsArePropagated) {
   const std::string kHloText = R"(
 ENTRY e {
diff --git a/third_party/xla/xla/service/gpu/nvptx_compiler_test.cc b/third_party/xla/xla/service/gpu/nvptx_compiler_test.cc
index 9a76f025c9341b..9e61390635a471 100644
--- a/third_party/xla/xla/service/gpu/nvptx_compiler_test.cc
+++ b/third_party/xla/xla/service/gpu/nvptx_compiler_test.cc
@@ -25,6 +25,7 @@ limitations under the License.
 #include "xla/statusor.h"
 #include "xla/tests/hlo_test_base.h"
 #include "xla/util.h"
+#include "xla/xla.pb.h"
 #include "tsl/platform/statusor.h"
 
 namespace xla {
@@ -40,6 +41,15 @@ class NVPTXCompilerTest : public HloTestBase {
   }
 };
 
+class NVPTXCompilerTestTriton : public NVPTXCompilerTest {
+ public:
+  DebugOptions GetDebugOptionsForTest() override {  // Chains to direct base.
+    DebugOptions debug_options = NVPTXCompilerTest::GetDebugOptionsForTest();
+    debug_options.set_xla_gpu_cublas_fallback(false);  // No cuBLAS fallback.
+    return debug_options;
+  }
+};
+
 TEST_F(NVPTXCompilerTest, AllReducePerformedInplace) {
   const absl::string_view hlo_string = R"(
 HloModule Module, input_output_alias={ {}: (0, {}, may-alias) }
@@ -96,7 +106,7 @@ ENTRY entry {
       all_reduce, {1}, all_reduce->operand(1), {}));
 }
 
-TEST_F(NVPTXCompilerTest,
+TEST_F(NVPTXCompilerTestTriton,
        DotDimensionAreSortedBeforePaddingForCublasEnablingTritonFusion) {
   MatchOptimizedHlo(R"(
 ENTRY e {
diff --git a/third_party/xla/xla/service/gpu/triton_autotuner.cc b/third_party/xla/xla/service/gpu/triton_autotuner.cc
index fd5d77258cb907..d0dc5fc2fcbe64 100644
--- a/third_party/xla/xla/service/gpu/triton_autotuner.cc
+++ b/third_party/xla/xla/service/gpu/triton_autotuner.cc
@@ -36,6 +36,7 @@ limitations under the License.
 #include "absl/synchronization/mutex.h"
 #include "absl/time/time.h"
 #include "absl/types/span.h"
+#include "third_party/gpus/cuda/include/cublas_v2.h"
 #include "xla/autotuning.pb.h"
 #include "xla/hlo/ir/dfs_hlo_visitor_with_default.h"
 #include "xla/hlo/ir/hlo_casting_utils.h"
@@ -133,14 +134,28 @@ class TritonAutotunerVisitor : public DfsHloRewriteVisitor {
               }));
       VLOG(2) << "Result: " << autotune_result.ShortDebugString();
 
-      TF_RET_CHECK(autotune_result.has_triton());
-      *backend_config.mutable_triton_gemm_config() = autotune_result.triton();
-      TF_RETURN_IF_ERROR(hlo->set_backend_config(backend_config));
+      if (autotune_result.has_triton()) {
+        *backend_config.mutable_triton_gemm_config() = autotune_result.triton();
+        TF_RETURN_IF_ERROR(hlo->set_backend_config(backend_config));
+      } else {
+        // Falling back to cuBLAS: Converting the fusion to a Call, so that it
+        // can be inlined back again.
+        HloComputation* const computation = hlo->parent();
+        HloInstruction* const call = computation->AddInstruction(
+            HloInstruction::CreateCall(hlo->shape(), hlo->operands(),
+                                       hlo->fused_instructions_computation()));
+        TF_RETURN_IF_ERROR(computation->ReplaceInstruction(hlo, call));
+        hlo = call;
+      }
     }
-    const AutotuneResult::TritonGemmKey& tiling =
-        backend_config.triton_gemm_config();
-    if (tiling.split_k() > 1) {
-      TF_RETURN_IF_ERROR(MakeDotSplitKBatch(hlo, tiling));
+
+    // This cannot be the "else" branch of the previous "if".
+    if (backend_config.has_triton_gemm_config()) {
+      const AutotuneResult::TritonGemmKey& tiling =
+          backend_config.triton_gemm_config();
+      if (tiling.split_k() > 1) {
+        TF_RETURN_IF_ERROR(MakeDotSplitKBatch(hlo, tiling));
+      }
     }
 
     MarkAsChanged();
@@ -372,6 +387,11 @@ StatusOr<std::unique_ptr<HloModule>> CublasGemmAutotuneExtractor(
                                    GetGpuDeviceInfo(config.GetExecutor()));
   TF_RETURN_IF_ERROR(rewriter.Run(new_module.get()).status());
   TF_RETURN_IF_ERROR(fusion_pass.Run(new_module.get()).status());
+  // TODO(tdanyluk): Consider running GemmAlgorithmPicker here for better cuBLAS
+  // performance. It is probably not needed on Ampere and later because cuBLAS
+  // ignores the algorithm parameter for those targets. If we run
+  // GemmAlgorithmPicker, we probably should not run this in parallel with other
+  // compilations.
   return new_module;
 }
 
@@ -668,10 +688,26 @@ StatusOr<AutotuneResult> Execute(const AutotuneConfig& config,
   VLOG(2) << "Done running.";
 
   TF_ASSIGN_OR_RETURN(
-      AutotuneResult best,
+      AutotuneResult best_triton,
       PickBestResult(results, root.ToString(), root.GetModule()->config()));
 
-  return best;
+  if (debug_opts.xla_gpu_cublas_fallback()) {
+    const absl::Duration best_triton_duration =
+        tsl::proto_utils::FromDurationProto(best_triton.run_time());
+    if (cublas_duration < best_triton_duration) {
+      VLOG(1) << "Falling back to cuBLAS for " << fusion->name();
+
+      AutotuneResult cublas;
+      *cublas.mutable_run_time() =
+          tsl::proto_utils::ToDurationProto(cublas_duration);
+      // We will ignore this value anyway.
+      cublas.mutable_gemm()->set_algorithm(CUBLAS_GEMM_DEFAULT);
+
+      return cublas;
+    }
+  }
+
+  return best_triton;
 }
 
 Status DumpAutotunedFusions(const AutotuneConfig& config,
diff --git a/third_party/xla/xla/service/gpu/triton_autotuner_test.cc b/third_party/xla/xla/service/gpu/triton_autotuner_test.cc
index 217f030519719d..c61522ff9fb2ac 100644
--- a/third_party/xla/xla/service/gpu/triton_autotuner_test.cc
+++ b/third_party/xla/xla/service/gpu/triton_autotuner_test.cc
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-
 #include "xla/service/gpu/triton_autotuner.h"
 
 #include <algorithm>
@@ -149,6 +148,7 @@ class TritonAutotunerTest : public HloTestBase {
   DebugOptions GetDebugOptionsForTest() override {
     DebugOptions debug_options = HloTestBase::GetDebugOptionsForTest();
     debug_options.set_xla_gpu_enable_triton_gemm(true);
+    debug_options.set_xla_gpu_cublas_fallback(false);
     return debug_options;
   }
 
@@ -459,6 +459,7 @@ class TritonAutotunerLevelTest : public HloTestBase,
   DebugOptions GetDebugOptionsForTest() override {
     DebugOptions debug_options = HloTestBase::GetDebugOptionsForTest();
     debug_options.set_xla_gpu_autotune_level(GetParam());
+    debug_options.set_xla_gpu_cublas_fallback(false);
     return debug_options;
   }
 };
@@ -498,8 +499,7 @@ INSTANTIATE_TEST_SUITE_P(TritonAutotunerLevelSweep, TritonAutotunerLevelTest,
 class TritonAutotunerExhaustiveTest : public TritonAutotunerTest {
  public:
   DebugOptions GetDebugOptionsForTest() override {
-    DebugOptions debug_options = HloTestBase::GetDebugOptionsForTest();
-    debug_options.set_xla_gpu_enable_triton_gemm(true);
+    DebugOptions debug_options = TritonAutotunerTest::GetDebugOptionsForTest();
     debug_options.set_xla_gpu_exhaustive_tiling_search(true);
     return debug_options;
   }
@@ -543,7 +543,7 @@ ENTRY e {
 class TritonAutotunerDisableSplitK : public TritonAutotunerTest {
  public:
   DebugOptions GetDebugOptionsForTest() override {
-    DebugOptions debug_options = HloTestBase::GetDebugOptionsForTest();
+    DebugOptions debug_options = TritonAutotunerTest::GetDebugOptionsForTest();
     debug_options.set_xla_gpu_enable_split_k_autotuning(false);
     return debug_options;
   }
diff --git a/third_party/xla/xla/xla.proto b/third_party/xla/xla/xla.proto
index c0c3d9aaeeeb2b..b4c617d5db56e6 100644
--- a/third_party/xla/xla/xla.proto
+++ b/third_party/xla/xla/xla.proto
@@ -616,7 +616,11 @@ message DebugOptions {
   // Replace custom calls with noop operations.
   bool xla_gpu_mock_custom_calls = 245;
 
-  // Next id: 247
+  // Allow Triton GEMM autotuning to fall back to cuBLAS when that is
+  // faster.
+  bool xla_gpu_cublas_fallback = 247;
+
+  // Next id: 248
 
   // Extra options to pass to the compilation backend (e.g. LLVM); specific
   // interpretation of these values is left to the backend.

From 8b579914b4b4ebeebdb93728fc06f256fa64bd64 Mon Sep 17 00:00:00 2001
From: Eugene Zhulenev <ezhulenev@google.com>
Date: Tue, 26 Sep 2023 00:31:22 -0700
Subject: [PATCH 255/567] [stream_executor] NFC: Make
 stream_executor:executor_cache implementation private

PiperOrigin-RevId: 568446172
---
 third_party/xla/xla/service/gpu/runtime/BUILD |  3 +-
 third_party/xla/xla/stream_executor/BUILD     | 45 +++++++++----------
 .../xla/xla/stream_executor/cuda/BUILD        |  1 -
 .../xla/xla/stream_executor/executor_cache.cc |  2 +-
 .../xla/xla/stream_executor/host/BUILD        |  7 ++-
 .../xla/xla/stream_executor/rocm/BUILD        |  1 -
 6 files changed, 26 insertions(+), 33 deletions(-)

diff --git a/third_party/xla/xla/service/gpu/runtime/BUILD b/third_party/xla/xla/service/gpu/runtime/BUILD
index 168a6e04b071d4..fc2a57803bafe0 100644
--- a/third_party/xla/xla/service/gpu/runtime/BUILD
+++ b/third_party/xla/xla/service/gpu/runtime/BUILD
@@ -56,8 +56,7 @@ cc_library(
         "//xla/service/gpu:gpu_executable_run_options",
         "//xla/service/gpu:nccl_collective_thunks",
         "//xla/service/gpu:thunk",
-        "//xla/stream_executor:event",
-        "//xla/stream_executor:executor_cache",
+        "//xla/stream_executor",
         "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/container:flat_hash_set",
     ],
diff --git a/third_party/xla/xla/stream_executor/BUILD b/third_party/xla/xla/stream_executor/BUILD
index ab8e0b77ef79f1..3d8b798c4b3389 100644
--- a/third_party/xla/xla/stream_executor/BUILD
+++ b/third_party/xla/xla/stream_executor/BUILD
@@ -247,6 +247,26 @@ filegroup(
 # implementation) or `stream_executor_headers` (only headers, if there is a reason not to link
 # implementation) if they want to use StreamExecutor.
 
+cc_library(
+    name = "executor_cache",
+    srcs = ["executor_cache.cc"],
+    hdrs = ["executor_cache.h"],
+    visibility = ["//visibility:public"],
+    deps = [
+        ":platform",
+        ":stream_executor_headers",
+        "//xla/stream_executor/platform",
+        "@com_google_absl//absl/base:core_headers",
+        "@com_google_absl//absl/container:node_hash_map",
+        "@com_google_absl//absl/log",
+        "@com_google_absl//absl/log:check",
+        "@com_google_absl//absl/status",
+        "@com_google_absl//absl/strings:str_format",
+        "@com_google_absl//absl/synchronization",
+        "@local_tsl//tsl/platform:statusor",
+    ],
+)
+
 cc_library(
     name = "kernel_spec",
     srcs = ["kernel_spec.cc"],
@@ -379,30 +399,6 @@ cc_library(
     ],
 )
 
-cc_library(
-    name = "executor_cache",
-    srcs = ["executor_cache.cc"],
-    hdrs = ["executor_cache.h"],
-    visibility = ["//visibility:public"],
-    deps = [
-        ":platform",
-        ":stream_executor_headers",
-        "//xla/stream_executor/platform",
-        "@com_google_absl//absl/base:core_headers",
-        "@com_google_absl//absl/container:node_hash_map",
-        "@com_google_absl//absl/functional:any_invocable",
-        "@com_google_absl//absl/log",
-        "@com_google_absl//absl/log:check",
-        "@com_google_absl//absl/memory",
-        "@com_google_absl//absl/status",
-        "@com_google_absl//absl/strings:str_format",
-        "@com_google_absl//absl/synchronization",
-        "@com_google_absl//absl/types:optional",
-        "@com_google_absl//absl/types:span",
-        "@local_tsl//tsl/platform:statusor",
-    ],
-)
-
 cc_library(
     name = "multi_platform_manager",
     srcs = ["multi_platform_manager.cc"],
@@ -747,6 +743,7 @@ cc_library(
         ":dnn",
         ":dnn_proto_cc",
         ":event",
+        ":executor_cache",
         ":kernel",
         ":kernel_spec",
         ":multi_platform_manager",
diff --git a/third_party/xla/xla/stream_executor/cuda/BUILD b/third_party/xla/xla/stream_executor/cuda/BUILD
index 3b5e64c27d93ea..01b9bb3c68a497 100644
--- a/third_party/xla/xla/stream_executor/cuda/BUILD
+++ b/third_party/xla/xla/stream_executor/cuda/BUILD
@@ -59,7 +59,6 @@ cc_library(
             ":cuda_platform_id",
             ":cuda_activation",
             "//xla/stream_executor",  # buildcleaner: keep
-            "//xla/stream_executor:executor_cache",
             "//xla/stream_executor:multi_platform_manager",
             "//xla/stream_executor:stream_executor_headers",
             "//xla/stream_executor/platform",
diff --git a/third_party/xla/xla/stream_executor/executor_cache.cc b/third_party/xla/xla/stream_executor/executor_cache.cc
index 10b3627f7aee3a..994a3e44179355 100644
--- a/third_party/xla/xla/stream_executor/executor_cache.cc
+++ b/third_party/xla/xla/stream_executor/executor_cache.cc
@@ -23,7 +23,7 @@ limitations under the License.
 #include "absl/strings/str_format.h"
 #include "absl/synchronization/mutex.h"
 #include "xla/stream_executor/platform.h"
-#include "xla/stream_executor/stream_executor_pimpl.h"
+#include "xla/stream_executor/stream_executor.h"
 #include "tsl/platform/statusor.h"
 
 namespace stream_executor {
diff --git a/third_party/xla/xla/stream_executor/host/BUILD b/third_party/xla/xla/stream_executor/host/BUILD
index d6b2d697af45d5..30bfc2bacdb8ee 100644
--- a/third_party/xla/xla/stream_executor/host/BUILD
+++ b/third_party/xla/xla/stream_executor/host/BUILD
@@ -42,9 +42,8 @@ cc_library(
     deps = [
         ":host_gpu_executor",
         ":host_platform_id",
-        "//xla/stream_executor:executor_cache",
+        "//xla/stream_executor",
         "//xla/stream_executor:multi_platform_manager",
-        "//xla/stream_executor:stream_executor_headers",
         "//xla/stream_executor/platform",
         "@com_google_absl//absl/memory",
         "@com_google_absl//absl/strings:str_format",
@@ -63,7 +62,8 @@ cc_library(
     ],
     visibility = ["//visibility:public"],
     deps = [
-        "//xla/stream_executor:kernel",
+        "//xla/stream_executor",
+        "//xla/stream_executor:stream_executor_internal",
         "@com_google_absl//absl/functional:any_invocable",
         "@com_google_absl//absl/synchronization",
         "@local_tsl//tsl/platform:denormal",
@@ -86,7 +86,6 @@ cc_library(
         ":host_platform_id",
         ":host_stream",
         "//xla/stream_executor",
-        "//xla/stream_executor:kernel",
         "//xla/stream_executor:stream_executor_internal",
         "@com_google_absl//absl/functional:any_invocable",
         "@com_google_absl//absl/strings",
diff --git a/third_party/xla/xla/stream_executor/rocm/BUILD b/third_party/xla/xla/stream_executor/rocm/BUILD
index 57826e043149ca..74f710420f6e40 100644
--- a/third_party/xla/xla/stream_executor/rocm/BUILD
+++ b/third_party/xla/xla/stream_executor/rocm/BUILD
@@ -159,7 +159,6 @@ cc_library(
         "@com_google_absl//absl/base",
         "@com_google_absl//absl/memory",
         "//xla/stream_executor",  # buildcleaner: keep
-        "//xla/stream_executor:executor_cache",
         "//xla/stream_executor:multi_platform_manager",
         "//xla/stream_executor:stream_executor_headers",
         "//xla/stream_executor/platform",

From 2f94ec0c2ed3d16d846f0244c6f8ddcb1ae56e86 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 26 Sep 2023 02:03:33 -0700
Subject: [PATCH 256/567] compat: Update forward compatibility horizon to
 2023-09-26

PiperOrigin-RevId: 568466907
---
 tensorflow/python/compat/compat.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/python/compat/compat.py b/tensorflow/python/compat/compat.py
index dbb8b5b32bce92..bc3240a9edd402 100644
--- a/tensorflow/python/compat/compat.py
+++ b/tensorflow/python/compat/compat.py
@@ -29,7 +29,7 @@
 # This value changes every day with an automatic CL. It can be modified in code
 # via `forward_compatibility_horizon()` or with the environment variable
 # TF_FORWARD_COMPATIBILITY_DELTA_DAYS, which is added to the compatibility date.
-_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2023, 9, 25)
+_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2023, 9, 26)
 _FORWARD_COMPATIBILITY_DELTA_DAYS_VAR_NAME = "TF_FORWARD_COMPATIBILITY_DELTA_DAYS"
 _FORWARD_COMPATIBILITY_DATE_NUMBER = None
 

From a6815d063b081eab127143de6316681847fecfb0 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 26 Sep 2023 02:03:33 -0700
Subject: [PATCH 257/567] Update GraphDef version to 1631.

PiperOrigin-RevId: 568466908
---
 tensorflow/core/public/version.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/core/public/version.h b/tensorflow/core/public/version.h
index 3f5ae3ed744546..06e48524defcb2 100644
--- a/tensorflow/core/public/version.h
+++ b/tensorflow/core/public/version.h
@@ -108,7 +108,7 @@ limitations under the License.
 
 #define TF_GRAPH_DEF_VERSION_MIN_PRODUCER 0
 #define TF_GRAPH_DEF_VERSION_MIN_CONSUMER 0
-#define TF_GRAPH_DEF_VERSION 1630  // Updated: 2023/9/25
+#define TF_GRAPH_DEF_VERSION 1631  // Updated: 2023/9/26
 
 // Checkpoint compatibility versions (the versions field in SavedSliceMeta).
 //

From c2ae731c249b9ad452304a505929a7de8faf12a8 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 26 Sep 2023 02:06:31 -0700
Subject: [PATCH 258/567] Make SimplifyFPConversions check creation pass ids

This adds an additional mode to `SimplifyFPConversions` which will only
simplify chains of converts if they were generated by a previous optimization pass.

The pass checks whether these convert ops were created by an optimization pass
or were part of the input HLO by checking the `logical_creation_pass_id` metadata
field. A pass id of -1 means they were part of the input.

Note that this change only prepares the pass and adds the new scope; it does
not yet change any behaviour in XLA.

PiperOrigin-RevId: 568467656
---
 third_party/xla/xla/service/BUILD             |  6 +-
 .../xla/xla/service/gpu/gpu_compiler.cc       |  3 +-
 .../xla/service/simplify_fp_conversions.cc    | 37 +++++++-
 .../xla/xla/service/simplify_fp_conversions.h | 21 ++++-
 .../service/simplify_fp_conversions_test.cc   | 88 ++++++++++++++++++-
 5 files changed, 144 insertions(+), 11 deletions(-)

diff --git a/third_party/xla/xla/service/BUILD b/third_party/xla/xla/service/BUILD
index b29f6d1a832b07..f20a5fcf8e77b8 100644
--- a/third_party/xla/xla/service/BUILD
+++ b/third_party/xla/xla/service/BUILD
@@ -2525,8 +2525,12 @@ cc_library(
     deps = [
         ":hlo_pass",
         "//xla:comparison_util",
+        "//xla:statusor",
+        "//xla:util",
         "//xla/hlo/ir:hlo",
-        "@local_tsl//tsl/platform:statusor",
+        "@com_google_absl//absl/container:flat_hash_set",
+        "@com_google_absl//absl/strings:str_format",
+        "@local_tsl//tsl/platform:errors",
     ],
 )
 
diff --git a/third_party/xla/xla/service/gpu/gpu_compiler.cc b/third_party/xla/xla/service/gpu/gpu_compiler.cc
index cdf2a46ba69ed8..db3c2cec28c571 100644
--- a/third_party/xla/xla/service/gpu/gpu_compiler.cc
+++ b/third_party/xla/xla/service/gpu/gpu_compiler.cc
@@ -921,7 +921,8 @@ Status GpuCompiler::OptimizeHloPostLayoutAssignment(
     sub_pipeline.AddPass<FloatNormalization>(&f8e4m3fnuz_support);
     // Remove `f32 -> bf16 -> f32` casts inserted by bf16 normalization.
     if (debug_options.xla_gpu_simplify_all_fp_conversions()) {
-      sub_pipeline.AddPass<SimplifyFPConversions>();
+      sub_pipeline.AddPass<SimplifyFPConversions>(
+          SimplifyFPConversions::Scope::kSimplifyAllConversions);
     }
   };
 
diff --git a/third_party/xla/xla/service/simplify_fp_conversions.cc b/third_party/xla/xla/service/simplify_fp_conversions.cc
index 8651a5550c5ba3..22fbf3961506c9 100644
--- a/third_party/xla/xla/service/simplify_fp_conversions.cc
+++ b/third_party/xla/xla/service/simplify_fp_conversions.cc
@@ -15,23 +15,35 @@ limitations under the License.
 
 #include "xla/service/simplify_fp_conversions.h"
 
+#include <cstddef>
+#include <string>
+
+#include "absl/container/flat_hash_set.h"
+#include "absl/strings/str_format.h"
 #include "xla/hlo/ir/hlo_computation.h"
 #include "xla/hlo/ir/hlo_instruction.h"
 #include "xla/hlo/ir/hlo_opcode.h"
 #include "xla/primitive_util.h"
-#include "tsl/platform/statusor.h"
+#include "xla/statusor.h"
+#include "xla/util.h"
+#include "tsl/platform/errors.h"
 
 namespace xla {
 namespace {
 
 // Simplifies floating-point conversions `A -> B -> C -> D` as `A -> D`.
-StatusOr<bool> RunOnComputation(HloComputation& computation) {
+StatusOr<bool> RunOnComputation(HloComputation& computation,
+                                SimplifyFPConversions::Scope scope) {
+  const int minimum_logical_creation_pass_id =
+      (scope == SimplifyFPConversions::Scope::kSimplifyAllConversions) ? -1 : 0;
   bool changed = false;
   for (HloInstruction* instruction : computation.MakeInstructionPostOrder()) {
     HloInstruction* input = instruction;
     size_t convert_chain_length = 0;
 
     while ((input->opcode() == HloOpcode::kConvert) &&
+           (input->metadata().logical_creation_pass_id() >=
+            minimum_logical_creation_pass_id) &&
            primitive_util::IsFloatingPointType(input->shape().element_type())) {
       input = input->mutable_operand(0);
       ++convert_chain_length;
@@ -52,17 +64,36 @@ StatusOr<bool> RunOnComputation(HloComputation& computation) {
   return changed;
 }
 
+std::string ToString(SimplifyFPConversions::Scope scope) {
+  using Scope = SimplifyFPConversions::Scope;
+  switch (scope) {
+    case Scope::kSimplifyAllConversions:
+      return "SimplifyAllConversions";
+    case Scope::kOnlySimplifyCompilerGeneratedConversions:
+      return "OnlySimplifyCompilerGeneratedConversions";
+  }
+}
+
 }  // namespace
 
 StatusOr<bool> SimplifyFPConversions::Run(
     HloModule* module,
     const absl::flat_hash_set<absl::string_view>& execution_threads) {
+  XLA_VLOG_LINES(
+      2,
+      absl::StrFormat("SimplifyFPConversions::Run() with scope=%s, before:\n%s",
+                      ToString(scope_), module->ToString()));
   bool changed = false;
   for (HloComputation* computation :
        module->MakeComputationPostOrder(execution_threads)) {
-    TF_ASSIGN_OR_RETURN(bool comp_changed, RunOnComputation(*computation));
+    TF_ASSIGN_OR_RETURN(bool comp_changed,
+                        RunOnComputation(*computation, scope_));
     changed |= comp_changed;
   }
+  XLA_VLOG_LINES(
+      2,
+      absl::StrFormat("SimplifyFPConversions::Run() with scope=%s, after:\n%s",
+                      ToString(scope_), module->ToString()));
   return changed;
 }
 
diff --git a/third_party/xla/xla/service/simplify_fp_conversions.h b/third_party/xla/xla/service/simplify_fp_conversions.h
index 5951148a54272e..3904fa37505a52 100644
--- a/third_party/xla/xla/service/simplify_fp_conversions.h
+++ b/third_party/xla/xla/service/simplify_fp_conversions.h
@@ -16,9 +16,10 @@ limitations under the License.
 #ifndef XLA_SERVICE_SIMPLIFY_FP_CONVERSIONS_H_
 #define XLA_SERVICE_SIMPLIFY_FP_CONVERSIONS_H_
 
+#include "absl/container/flat_hash_set.h"
 #include "xla/hlo/ir/hlo_module.h"
 #include "xla/service/hlo_pass_interface.h"
-#include "tsl/platform/statusor.h"
+#include "xla/statusor.h"
 
 namespace xla {
 
@@ -26,16 +27,30 @@ namespace xla {
 //
 // The algebraic simplifier will remove convert pairs of the form `X -> Y -> X`,
 // only when they are a no-op (e.g. `bf16 -> f32 -> bf16`). This passes does
-// similar, but will simplify any chain of float conversions, possibly improving
-// accuracy (e.g. `f32 -> bf16 -> f32` is removed).
+// similar, but has two scopes:
+// - kSimplifyAllConversions: Simplify any chain of float conversions, possibly
+//   improving  accuracy (e.g. `f32 -> bf16 -> f32` is removed).
+// - kOnlySimplifyCompilerGeneratedConversions: Only simplify chains of float
+//   conversions generated by the compiler in one of the previous optimization
+//   passes.
 class SimplifyFPConversions : public HloModulePass {
  public:
+  enum class Scope {
+    kOnlySimplifyCompilerGeneratedConversions,
+    kSimplifyAllConversions
+  };
+
+  explicit SimplifyFPConversions(Scope scope) : scope_(scope) {}
+
   absl::string_view name() const override { return "simplify-fp-conversions"; }
 
   using HloPassInterface::Run;
   StatusOr<bool> Run(
       HloModule* module,
       const absl::flat_hash_set<absl::string_view>& execution_threads) override;
+
+ private:
+  Scope scope_;
 };
 
 }  // namespace xla
diff --git a/third_party/xla/xla/service/simplify_fp_conversions_test.cc b/third_party/xla/xla/service/simplify_fp_conversions_test.cc
index 9d60d68e395160..fae975963adb46 100644
--- a/third_party/xla/xla/service/simplify_fp_conversions_test.cc
+++ b/third_party/xla/xla/service/simplify_fp_conversions_test.cc
@@ -15,8 +15,13 @@ limitations under the License.
 
 #include "xla/service/simplify_fp_conversions.h"
 
+#include <memory>
+
 #include "absl/strings/string_view.h"
+#include "xla/hlo/ir/hlo_computation.h"
+#include "xla/hlo/ir/hlo_instruction.h"
 #include "xla/hlo/ir/hlo_module.h"
+#include "xla/hlo/ir/hlo_opcode.h"
 #include "xla/hlo/utils/hlo_matchers.h"
 #include "xla/tests/hlo_test_base.h"
 #include "tsl/platform/status_matchers.h"
@@ -31,6 +36,32 @@ using ::tsl::testing::IsOkAndHolds;
 
 using SimplifyFPConversionsTest = HloTestBase;
 
+// This marks all ops in `module` as user-provided, meaning the
+// simplifier won't remove any of the converts
+static void InitializeCreationPassIds(HloModule* module) {
+  constexpr int kUserSuppliedOpCreationPassId = -1;
+  for (HloComputation* computation : module->computations()) {
+    for (HloInstruction* instruction : computation->instructions()) {
+      instruction->set_creation_pass_id(kUserSuppliedOpCreationPassId);
+      instruction->set_logical_creation_pass_id(kUserSuppliedOpCreationPassId);
+    }
+  }
+}
+
+// This marks all converts ops in `module` as being created by the
+// optimization pass `creation_pass_id`.
+static void SetCreationPassIdInAllConvertOps(HloModule* module,
+                                             int creation_pass_id) {
+  for (HloComputation* computation : module->computations()) {
+    for (HloInstruction* instruction : computation->instructions()) {
+      if (instruction->opcode() == HloOpcode::kConvert) {
+        instruction->set_creation_pass_id(creation_pass_id);
+        instruction->set_logical_creation_pass_id(creation_pass_id);
+      }
+    }
+  }
+}
+
 TEST_F(SimplifyFPConversionsTest, DoesNotChangeSingleConvert) {
   const absl::string_view kModuleStr = R"(
     HloModule test
@@ -43,8 +74,10 @@ TEST_F(SimplifyFPConversionsTest, DoesNotChangeSingleConvert) {
   )";
   TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
                           ParseAndReturnVerifiedModule(kModuleStr));
+  InitializeCreationPassIds(module.get());
 
-  SimplifyFPConversions simplifier;
+  SimplifyFPConversions simplifier{
+      SimplifyFPConversions::Scope::kSimplifyAllConversions};
   EXPECT_THAT(simplifier.Run(module.get()), IsOkAndHolds(false));
 }
 
@@ -61,13 +94,60 @@ TEST_F(SimplifyFPConversionsTest, SimplifiesF32ToBF16ToF32) {
   )";
   TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
                           ParseAndReturnVerifiedModule(kModuleStr));
+  InitializeCreationPassIds(module.get());
+
+  SimplifyFPConversions simplifier{
+      SimplifyFPConversions::Scope::kSimplifyAllConversions};
+  EXPECT_THAT(simplifier.Run(module.get()), IsOkAndHolds(true));
+  EXPECT_THAT(module->entry_computation()->root_instruction(),
+              op::Tuple(op::Parameter(0)));
+}
+
+TEST_F(SimplifyFPConversionsTest, SimplifiesCompilerGeneratedF32ToBF16ToF32) {
+  const absl::string_view kModuleStr = R"(
+    HloModule test
+
+    ENTRY entry {
+      p0 = f32[2,3] parameter(0)
+      c0 = bf16[2,3] convert(p0)
+      c1 = f32[2,3] convert(c0)
+      ROOT ret = (f32[2,3]) tuple(c1)
+    }
+  )";
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          ParseAndReturnVerifiedModule(kModuleStr));
+  InitializeCreationPassIds(module.get());
+
+  constexpr int kRandomCreationPassId = 42;
+  SetCreationPassIdInAllConvertOps(module.get(), kRandomCreationPassId);
 
-  SimplifyFPConversions simplifier;
+  SimplifyFPConversions simplifier{
+      SimplifyFPConversions::Scope::kOnlySimplifyCompilerGeneratedConversions};
   EXPECT_THAT(simplifier.Run(module.get()), IsOkAndHolds(true));
   EXPECT_THAT(module->entry_computation()->root_instruction(),
               op::Tuple(op::Parameter(0)));
 }
 
+TEST_F(SimplifyFPConversionsTest, DoesNotChangeUserInsertedConverts) {
+  const absl::string_view kModuleStr = R"(
+    HloModule test
+
+    ENTRY entry {
+      p0 = f32[2,3] parameter(0)
+      c0 = bf16[2,3] convert(p0)
+      c1 = f32[2,3] convert(c0)
+      ROOT ret = (f32[2,3]) tuple(c1)
+    }
+  )";
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          ParseAndReturnVerifiedModule(kModuleStr));
+  InitializeCreationPassIds(module.get());
+
+  SimplifyFPConversions simplifier{
+      SimplifyFPConversions::Scope::kOnlySimplifyCompilerGeneratedConversions};
+  EXPECT_THAT(simplifier.Run(module.get()), IsOkAndHolds(false));
+}
+
 TEST_F(SimplifyFPConversionsTest, SimplifiesF64ToF16ToF32ToBF16) {
   const absl::string_view kModuleStr = R"(
     HloModule test
@@ -82,8 +162,10 @@ TEST_F(SimplifyFPConversionsTest, SimplifiesF64ToF16ToF32ToBF16) {
   )";
   TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
                           ParseAndReturnVerifiedModule(kModuleStr));
+  InitializeCreationPassIds(module.get());
 
-  SimplifyFPConversions simplifier;
+  SimplifyFPConversions simplifier{
+      SimplifyFPConversions::Scope::kSimplifyAllConversions};
   EXPECT_THAT(simplifier.Run(module.get()), IsOkAndHolds(true));
   EXPECT_THAT(
       module->entry_computation()->root_instruction(),

From a1fa36df2097fed9b3f912ca02b737dde5ba81e2 Mon Sep 17 00:00:00 2001
From: Ilia Sergachev <sergachev@google.com>
Date: Tue, 26 Sep 2023 02:20:28 -0700
Subject: [PATCH 259/567] [XLA:GPU] Fix detection of unsupported transposes in
 Triton GEMM fusions.

PiperOrigin-RevId: 568470689
---
 .../xla/service/gpu/gemm_rewriter_triton.cc   | 37 ++++++++-----------
 .../service/gpu/gemm_rewriter_triton_test.cc  | 14 +++++++
 2 files changed, 30 insertions(+), 21 deletions(-)

diff --git a/third_party/xla/xla/service/gpu/gemm_rewriter_triton.cc b/third_party/xla/xla/service/gpu/gemm_rewriter_triton.cc
index 8f1e0399ddb13a..59cecc4dba6e77 100644
--- a/third_party/xla/xla/service/gpu/gemm_rewriter_triton.cc
+++ b/third_party/xla/xla/service/gpu/gemm_rewriter_triton.cc
@@ -554,54 +554,48 @@ FusionDecision FusionContext::RequireSupportedDimOrder(
     const DimensionOrder& order, int64_t& split_dim_major_part) const {
   VLOG(8) << order.ToString();
   const Fragments& tensor_dim_fragments = order.TensorFragmentsOrder();
-  auto non_major_fragment_is_sliced = [&](int fragment, int distance_to_end) {
-    return distance_to_end > 1 && tensor_dim_fragments[fragment].is_sliced();
-  };
   for (const auto& [dim_index, dim_fragments] : order.DimFragmentsOrders()) {
-    int split_counter = -1;
+    CHECK(!dim_fragments.empty());
+    for (int i = 0; i < dim_fragments.size() - 1; ++i) {
+      if (tensor_dim_fragments[dim_fragments[i]].is_sliced()) {
+        return "Sliced non-major-most fragment.";
+      }
+    }
+    int group_counter = 0;
+    int last_seen_group_last_fragment_index = -1;
     auto fragment_it = dim_fragments.cbegin();
-    // TODO(b/300892934): simplify the logic.
     while (true) {
       if (fragment_it == dim_fragments.cend()) {
         break;
       }
-      if (non_major_fragment_is_sliced(*fragment_it,
-                                       dim_fragments.cend() - fragment_it)) {
-        return "Sliced non-major-most fragment.";
-      }
       int64_t grouped_size = tensor_dim_fragments[*fragment_it].full_size();
-      // Gather contiguous fragments.
+      // Gather contiguous fragments: they have consecutive indices.
       while ((fragment_it + 1) != dim_fragments.cend() &&
              *(fragment_it + 1) == *fragment_it + 1) {
         ++fragment_it;
-        if (non_major_fragment_is_sliced(*fragment_it,
-                                         dim_fragments.cend() - fragment_it)) {
-          return "Sliced non-major-most fragment.";
-        }
         grouped_size *= tensor_dim_fragments[*fragment_it].full_size();
       }
-
+      // Ignore 1-sized groups of fragments.
       if (grouped_size == 1) {
         ++fragment_it;
         continue;
       }
 
-      if (fragment_it != dim_fragments.cbegin() &&
-          *fragment_it < *(fragment_it - 1)) {
+      if (last_seen_group_last_fragment_index > *fragment_it) {
         return "Transpose within a dimension.";
       }
 
-      ++split_counter;
-      if (split_counter > 0) {
+      ++group_counter;
+      if (group_counter > 1) {
         if (dim_index == SplittableDimensionIndex() &&
             IsSupportedSplittableDimensionMajorPartSize(grouped_size)) {
-          if (split_counter == 1) {
+          if (group_counter == 2) {
             if (split_dim_major_part != 0 &&
                 split_dim_major_part != grouped_size) {
               return "Conflicting splits of splittable dimension";
             }
             split_dim_major_part = grouped_size;
-          } else if (split_counter > 1) {
+          } else if (group_counter > 2) {
             return "2nd split of a splittable dimension.";
           }
         } else {
@@ -609,6 +603,7 @@ FusionDecision FusionContext::RequireSupportedDimOrder(
         }
       }
 
+      last_seen_group_last_fragment_index = *fragment_it;
       ++fragment_it;
     }
   }
diff --git a/third_party/xla/xla/service/gpu/gemm_rewriter_triton_test.cc b/third_party/xla/xla/service/gpu/gemm_rewriter_triton_test.cc
index b91abc7f617ffe..c72884cf3e912e 100644
--- a/third_party/xla/xla/service/gpu/gemm_rewriter_triton_test.cc
+++ b/third_party/xla/xla/service/gpu/gemm_rewriter_triton_test.cc
@@ -100,6 +100,20 @@ ENTRY e {
               GmockMatch(m::Fusion(m::Parameter(), m::Parameter())));
 }
 
+TEST_F(GemmRewriterTritonTest, UnsupportedTransposeIsNotFused) {
+  auto module = ParseAndReturnVerifiedModule(R"(
+ENTRY e {
+  p0 = f16[1,512,8,1024]{3,1,0,2} parameter(0)
+  c = f16[1,512,8,1024]{3,2,1,0} copy(p0)
+  b = f16[4096,1024]{1,0} bitcast(c)
+  p1 = f16[128,1024]{1,0} parameter(1)
+  ROOT d = f16[4096,128]{1,0} dot(b, p1),
+    lhs_contracting_dims={1}, rhs_contracting_dims={1}
+})")
+                    .value();
+  EXPECT_FALSE(GemmRewriterTriton(gpu_version_).Run(module.get()).value());
+}
+
 TEST_F(GemmRewriterTritonTest, BitcastChain) {
   // This HLO is artificial because unnecessary reshapes get optimized
   // out during compilation. It tests the ability of GemmRewriterTriton

From 6c4d42bc1f1de15486ef50a54b99543a6ae9c679 Mon Sep 17 00:00:00 2001
From: TJ Xu <tjx@nvidia.com>
Date: Tue, 26 Sep 2023 03:08:47 -0700
Subject: [PATCH 260/567] PR #5473: [NVIDIA XLA GPU] Added a pass to unroll
 loops by a factor of 2 to achieve double buffering

Imported from GitHub PR https://github.com/openxla/xla/pull/5473

This only deals with loops with known trip count.
For loops with even trip count, we unroll directly.
For loops with odd trip count, we take out 1 iteration into the main computation outside of the main loop and unroll the rest by a factor of 2.
Copybara import of the project:

--
ff05af9fb793a1b14ffc8de6ba327c20ef339d38 by TJ <tjx@nvidia.com>:

Add a new experimental pass called test_double_buffer
It's placed before async creator for now to unroll while loops to
achieve double buffering indirectly. However, control dependencies
still need to be injected between two loop iterations to ensure execution
order

--
293481cd42102f2fce8bddf3febffa9a0b6aec50 by TJ <tjx@nvidia.com>:

add support for loops with odd trip count

--
b304ec3f6f9a8ef314da0f302c7e7d527e147e4c by TJ <tjx@nvidia.com>:

added a separate file to host loop double buffer transformer tests
refactored code in loop double buffer transformer
added a new test in copy insertion test to verify double buffer

--
49af20bee3d43208a3c517acae0f488d58fa6b4c by TJ <tjx@nvidia.com>:

simplify copy loop for unrolling and peeling even further

--
0858b06a3128e64129ad91c22f33baf698d743d4 by TJ <tjx@nvidia.com>:

added more tests to cover channel id and control deps

Merging this change closes #5473

PiperOrigin-RevId: 568481154
---
 third_party/xla/xla/debug_options_flags.cc    |   8 +
 third_party/xla/xla/service/gpu/BUILD         |  49 +++
 .../xla/xla/service/gpu/gpu_compiler.cc       |   7 +
 .../service/gpu/gpu_copy_insertion_test.cc    |  65 ++++
 .../gpu/loop_double_buffer_transformer.cc     | 267 +++++++++++++
 .../gpu/loop_double_buffer_transformer.h      |  55 +++
 .../loop_double_buffer_transformer_test.cc    | 361 ++++++++++++++++++
 third_party/xla/xla/xla.proto                 |   5 +-
 8 files changed, 816 insertions(+), 1 deletion(-)
 create mode 100644 third_party/xla/xla/service/gpu/loop_double_buffer_transformer.cc
 create mode 100644 third_party/xla/xla/service/gpu/loop_double_buffer_transformer.h
 create mode 100644 third_party/xla/xla/service/gpu/loop_double_buffer_transformer_test.cc

diff --git a/third_party/xla/xla/debug_options_flags.cc b/third_party/xla/xla/debug_options_flags.cc
index 549b9e8846142e..97d2c5dfab2c74 100644
--- a/third_party/xla/xla/debug_options_flags.cc
+++ b/third_party/xla/xla/debug_options_flags.cc
@@ -194,6 +194,8 @@ DebugOptions DefaultDebugOptionsIgnoringFlags() {
   opts.set_xla_gpu_enable_reduction_epilogue_fusion(true);
   opts.set_xla_gpu_enable_nccl_clique_optimization(false);
   opts.set_xla_gpu_cublas_fallback(true);
+  opts.set_xla_gpu_enable_while_loop_double_buffering(false);
+
   return opts;
 }
 
@@ -1273,6 +1275,12 @@ void MakeDebugOptionsFlags(std::vector<tsl::Flag>* flag_list,
                 debug_options->xla_gpu_cublas_fallback(),
                 "Allow Triton GEMM autotuning to fall back to cuBLAS when that "
                 "is faster."));
+  flag_list->push_back(tsl::Flag(
+      "xla_gpu_enable_while_loop_double_buffering",
+      bool_setter_for(
+          &DebugOptions::set_xla_gpu_enable_while_loop_double_buffering),
+      debug_options->xla_gpu_enable_while_loop_double_buffering(),
+      "Enable double buffering for while loop"));
 }  // NOLINT(readability/fn_size)
 
 // Allocates flag_values and flag_objects; this function must not be called more
diff --git a/third_party/xla/xla/service/gpu/BUILD b/third_party/xla/xla/service/gpu/BUILD
index eb65a040ba5708..1c76f62f78b7b1 100644
--- a/third_party/xla/xla/service/gpu/BUILD
+++ b/third_party/xla/xla/service/gpu/BUILD
@@ -2622,6 +2622,7 @@ cc_library(
         ":instruction_fusion",
         ":ir_emission_utils",
         ":ir_emitter",
+        ":loop_double_buffer_transformer",
         ":matmul_utils",
         ":metrics",
         ":move_copy_to_users",
@@ -4605,3 +4606,51 @@ xla_cc_test(
         "@local_tsl//tsl/lib/core:status_test_util",
     ],
 )
+
+cc_library(
+    name = "loop_double_buffer_transformer",
+    srcs = ["loop_double_buffer_transformer.cc"],
+    hdrs = ["loop_double_buffer_transformer.h"],
+    visibility = ["//visibility:public"],
+    deps = [
+        ":gpu_types",
+        "//xla:status",
+        "//xla:statusor",
+        "//xla:util",
+        "//xla:xla_data_proto_cc",
+        "//xla/hlo/ir:hlo",
+        "//xla/hlo/utils:hlo_query",
+        "//xla/service:collective_ops_utils",
+        "//xla/service:hlo_pass",
+        "@com_google_absl//absl/algorithm:container",
+        "@com_google_absl//absl/container:flat_hash_map",
+        "@com_google_absl//absl/container:flat_hash_set",
+        "@com_google_absl//absl/log",
+        "@com_google_absl//absl/log:check",
+        "@com_google_absl//absl/strings",
+        "@local_tsl//tsl/platform:errors",
+        "@local_tsl//tsl/platform:statusor",
+    ],
+)
+
+xla_cc_test(
+    name = "loop_double_buffer_transformer_test",
+    srcs = if_gpu_is_configured(["loop_double_buffer_transformer_test.cc"]),
+    tags = tf_cuda_tests_tags(),
+    deps = if_gpu_is_configured([
+        ":gpu_compiler",
+        ":loop_double_buffer_transformer",
+        "//xla:test",
+        "//xla/hlo/ir:hlo",
+        "//xla/service:tuple_simplifier",
+        "//xla/tests:hlo_test_base",
+        "//xla/tests:xla_internal_test_main",
+    ]) + [
+        "//xla:test_helpers",
+        "//xla:xla_data_proto_cc",
+        "//xla:xla_proto_cc",
+        "//xla/service:hlo_dce",
+        "@com_google_absl//absl/container:flat_hash_set",
+        "@local_tsl//tsl/platform:statusor",
+    ],
+)
diff --git a/third_party/xla/xla/service/gpu/gpu_compiler.cc b/third_party/xla/xla/service/gpu/gpu_compiler.cc
index db3c2cec28c571..c6b64525105223 100644
--- a/third_party/xla/xla/service/gpu/gpu_compiler.cc
+++ b/third_party/xla/xla/service/gpu/gpu_compiler.cc
@@ -116,6 +116,7 @@ limitations under the License.
 #include "xla/service/gpu/hlo_fusion_stats.h"
 #include "xla/service/gpu/horizontal_loop_fusion.h"
 #include "xla/service/gpu/ir_emission_utils.h"
+#include "xla/service/gpu/loop_double_buffer_transformer.h"
 #include "xla/service/gpu/matmul_utils.h"
 #include "xla/service/gpu/metrics.h"
 #include "xla/service/gpu/move_copy_to_users.h"
@@ -783,6 +784,12 @@ Status GpuCompiler::OptimizeHloModule(HloModule* hlo_module,
       pipeline.AddPass<AllReduceBlueConnect>(blueconnect_num_devices_per_host);
     }
 
+    if (debug_options.xla_gpu_enable_while_loop_double_buffering()) {
+      pipeline.AddPass<LoopDoubleBufferTransformer>();
+      pipeline.AddPass<TupleSimplifier>();
+      pipeline.AddPass<HloDCE>();
+    }
+
     {
       // Convert all collectives to their async form, and then annotate the ones
       // that actually need to run asynchronously with a GPU specific backend
diff --git a/third_party/xla/xla/service/gpu/gpu_copy_insertion_test.cc b/third_party/xla/xla/service/gpu/gpu_copy_insertion_test.cc
index 8fe7547b870d29..3f3bf3c3cf1d74 100644
--- a/third_party/xla/xla/service/gpu/gpu_copy_insertion_test.cc
+++ b/third_party/xla/xla/service/gpu/gpu_copy_insertion_test.cc
@@ -504,6 +504,71 @@ ENTRY main {
   ExpectOptionalFalse(FusionCanShareBufferHint(fusion, fusion->operand(0), {}));
 }
 
+// The module below has the shape of a loop already unrolled for double
+// buffering; CopyInsertion is expected to leave it without any copy.
+TEST_F(GpuCopyInsertionTest, UnrolledLoopShouldNotHaveCopy) {
+  const char* const kModuleString = R"(
+HloModule all_gather_overlapping, entry_computation_layout={(f32[1,128]{1,0}, f32[2,128]{1,0})->(f32[1,128]{1,0}, f32[1,128]{1,0}, f32[2,128]{1,0}, s32[])}
+
+body {
+  input_tuple_while = (f32[1,128]{1,0}, f32[1,128]{1,0}, f32[2,128]{1,0}, s32[]) parameter(0)
+  param_1 = f32[2,128]{1,0} get-tuple-element(input_tuple_while), index=2
+  c1_s32 = s32[] constant(1)
+  c0_s32 = s32[] constant(0)
+  dynamic-slice = f32[1,128]{1,0} dynamic-slice(param_1, c1_s32, c0_s32), dynamic_slice_sizes={1,128}
+  param_0 = f32[1,128]{1,0} get-tuple-element(input_tuple_while), index=0
+  cond.1 = s32[] get-tuple-element(input_tuple_while), index=3
+  cond_plus_1 = s32[] add(cond.1, c1_s32)
+  c0 = f32[] constant(0)
+  splat_c0 = f32[1,128]{1,0} broadcast(c0), dimensions={}
+  add = f32[1,128]{1,0} add(splat_c0, param_0)
+  all-gather-start = (f32[1,128]{1,0}, f32[2,128]{1,0}) all-gather-start(add), channel_id=1337, replica_groups={{0,1}}, dimensions={0}, use_global_device_ids=true
+  all-gather-done = f32[2,128]{1,0} all-gather-done(all-gather-start)
+  dynamic-slice.double_buffer_clone = f32[1,128]{1,0} dynamic-slice(all-gather-done, c1_s32, c0_s32), dynamic_slice_sizes={1,128}
+  splat_c0_unrolled = f32[1,128]{1,0} broadcast(c0), dimensions={}
+  add.double_buffer_clone = f32[1,128]{1,0} add(splat_c0_unrolled, param_0)
+  all-gather-start-unrolled = (f32[1,128]{1,0}, f32[2,128]{1,0}) all-gather-start(add.double_buffer_clone), channel_id=1339, replica_groups={{0,1}}, dimensions={0}, use_global_device_ids=true
+  all-gather-done-unrolled = f32[2,128]{1,0} all-gather-done(all-gather-start-unrolled)
+  one.2 = s32[] constant(1)
+  cond_plus_1.double_buffer_clone = s32[] add(cond_plus_1, one.2)
+  ROOT output_tuple = (f32[1,128]{1,0}, f32[1,128]{1,0}, f32[2,128]{1,0}, s32[]) tuple(param_0, dynamic-slice.double_buffer_clone, all-gather-done-unrolled, cond_plus_1.double_buffer_clone)
+}
+
+condition {
+  input_tuple = (f32[1,128]{1,0}, f32[1,128]{1,0}, f32[2,128]{1,0}, s32[]) parameter(0)
+  cond = s32[] get-tuple-element(input_tuple), index=3
+  trip_count = s32[] constant(10)
+  ROOT done = pred[] compare(cond, trip_count), direction=LT
+}
+
+ENTRY main {
+  input_param_0 = f32[1,128]{1,0} parameter(0)
+  input_param_1 = f32[2,128]{1,0} parameter(1)
+  constant_1 = s32[] constant(1)
+  constant_0 = s32[] constant(0)
+  dynamic-slice-main = f32[1,128]{1,0} dynamic-slice(input_param_1, constant_1, constant_0), dynamic_slice_sizes={1,128}
+  float0 = f32[] constant(0)
+  splat_float0 = f32[1,128]{1,0} broadcast(float0), dimensions={}
+  add.peeled_double_buffer = f32[1,128]{1,0} add(splat_float0, input_param_0)
+  all-gather-start-main = (f32[1,128]{1,0}, f32[2,128]{1,0}) all-gather-start(add.peeled_double_buffer), channel_id=1338, replica_groups={{0,1}}, dimensions={0}, use_global_device_ids=true
+  all-gather-done-main = f32[2,128]{1,0} all-gather-done(all-gather-start-main)
+  param_2 = s32[] constant(0)
+  cond_plus_1.peeled_double_buffer = s32[] add(param_2, constant_1)
+  tuple = (f32[1,128]{1,0}, f32[1,128]{1,0}, f32[2,128]{1,0}, s32[]) tuple(input_param_0, dynamic-slice-main, all-gather-done-main, cond_plus_1.peeled_double_buffer)
+  ROOT while = (f32[1,128]{1,0}, f32[1,128]{1,0}, f32[2,128]{1,0}, s32[]) while(tuple), condition=condition, body=body
+}
+)";
+
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<xla::HloModule> module,
+                          ParseAndReturnVerifiedModule(kModuleString));
+
+  CopyInsertion copy_insertion(FusionCanShareBufferHint,
+                               /*use_region_based_live_range_analysis=*/0);
+  ASSERT_IS_OK(copy_insertion.Run(module.get(), {"foobar"}).status());
+  VLOG(2) << module->ToString();
+  EXPECT_EQ(CountCopies(*module), 0);
+}
+
 }  // namespace
 }  // namespace gpu
 }  // namespace xla
diff --git a/third_party/xla/xla/service/gpu/loop_double_buffer_transformer.cc b/third_party/xla/xla/service/gpu/loop_double_buffer_transformer.cc
new file mode 100644
index 00000000000000..113970552597a4
--- /dev/null
+++ b/third_party/xla/xla/service/gpu/loop_double_buffer_transformer.cc
@@ -0,0 +1,267 @@
+/* Copyright 2023 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "xla/service/gpu/loop_double_buffer_transformer.h"
+
+#include <cstdint>
+#include <iterator>
+#include <optional>
+#include <vector>
+
+#include "absl/algorithm/container.h"
+#include "absl/container/flat_hash_map.h"
+#include "absl/container/flat_hash_set.h"
+#include "absl/log/check.h"
+#include "absl/log/log.h"
+#include "absl/strings/string_view.h"
+#include "xla/hlo/ir/hlo_casting_utils.h"
+#include "xla/hlo/ir/hlo_clone_context.h"
+#include "xla/hlo/ir/hlo_instruction.h"
+#include "xla/hlo/ir/hlo_instructions.h"
+#include "xla/hlo/ir/hlo_module.h"
+#include "xla/hlo/ir/hlo_opcode.h"
+#include "xla/hlo/utils/hlo_query.h"
+#include "xla/service/collective_ops_utils.h"
+#include "xla/status.h"
+#include "xla/statusor.h"
+#include "xla/util.h"
+#include "xla/xla_data.pb.h"
+#include "tsl/platform/errors.h"
+#include "tsl/platform/statusor.h"
+
+namespace xla {
+namespace gpu {
+
+namespace {
+
+// Gives a freshly cloned collective a channel id of its own.
+//
+// new_instr: the clone to update; instructions that are not collectives are
+//            left untouched.
+// module:    module passed to hlo_query::NextChannelId to obtain the new id.
+void SetChannelIdForNewCollective(HloInstruction* new_instr,
+                                  const HloModule* module) {
+  if (HloAsyncInstruction::ClassOf(new_instr) &&
+      hlo_query::IsCollectiveCommunicationOp(
+          DynCast<HloAsyncInstruction>(new_instr)
+              ->async_wrapped_instruction()
+              ->opcode())) {
+    // The collective is wrapped in an HloAsyncInstruction, so the channel id
+    // is set on the wrapped instruction rather than on new_instr itself.
+    HloInstruction* wrapped_instr =
+        DynCast<HloAsyncInstruction>(new_instr)->async_wrapped_instruction();
+
+    // This used to consult two maps local to this call. They were always
+    // empty, and the old id was looked up with operator[] (which inserts a
+    // zero) before find() was called, so the "generate" branch was
+    // unreachable and every wrapped collective ended up with channel id 0.
+    // Ask for a new id directly instead; this also avoids the unchecked
+    // dereference of the wrapped instruction's channel id.
+    // NOTE(review): the matching start/done clones are presumably backed by
+    // the same wrapped instruction when cloned with one HloCloneContext, which
+    // would keep their ids equal -- confirm.
+    const int64_t new_channel_id = hlo_query::NextChannelId(*module);
+    VLOG(2) << "Generated new channel id " << new_channel_id;
+    VLOG(2) << "Setting channel id to " << new_channel_id;
+    wrapped_instr->set_channel_id(new_channel_id);
+  } else if (hlo_query::IsCollectiveCommunicationOp(new_instr->opcode()) ||
+             hlo_query::IsAsyncCollectiveStartOp(new_instr->opcode())) {
+    // Plain collectives and async collective starts carry the id themselves.
+    new_instr->set_channel_id(hlo_query::NextChannelId(*module));
+  }
+}
+
+// Peels one iteration of `while_instr` out of the loop: clones every while
+// body instruction into the parent computation, fed by the loop's current
+// input tuple, and makes the cloned body root the new loop operand. Returns
+// a non-OK status if replacing the operand or a control dependency fails.
+Status PeelInstructionsForOddTripCount(HloModule* module,
+                                       HloInstruction* while_instr) {
+  HloCloneContext context(module, "peeled_double_buffer");
+
+  absl::flat_hash_map<HloInstruction*, HloInstruction*> old_to_new_map;
+  HloComputation* while_body = while_instr->while_body();
+  HloInstruction* input_parameter = while_body->parameter_instruction(0);
+  HloInstruction* input_tuple = while_instr->mutable_operand(0);
+  CHECK(input_tuple->opcode() == HloOpcode::kTuple);
+
+  HloComputation* parent_comp = while_instr->parent();
+  old_to_new_map[input_parameter] = input_tuple;
+
+  for (HloInstruction* old_instr : while_body->MakeInstructionPostOrder()) {
+    if (old_to_new_map.find(old_instr) != old_to_new_map.end()) {
+      continue;
+    }
+    VLOG(2) << "Peeling instruction " << old_instr->ToString();
+    std::vector<HloInstruction*> new_operands(old_instr->operand_count());
+    for (int64_t i = 0; i < old_instr->operand_count(); i++) {
+      new_operands[i] = old_to_new_map[old_instr->mutable_operand(i)];
+    }
+    HloInstruction* new_instr =
+        parent_comp->AddInstruction(old_instr->CloneWithNewOperands(
+            old_instr->shape(), new_operands, &context));
+
+    SetChannelIdForNewCollective(new_instr, module);
+    old_to_new_map[old_instr] = new_instr;
+    VLOG(2) << "Added instruction " << new_instr->ToString()
+            << " to parent computation.";
+  }
+
+  TF_RETURN_IF_ERROR(while_instr->ReplaceOperandWith(
+      0, old_to_new_map[while_body->root_instruction()]));
+  VLOG(2) << "Replaced with new input tuple "
+          << while_instr->operand(0)->ToString();
+
+  // Handle existing control dependencies.
+  for (HloInstruction* old_instr : while_body->MakeInstructionPostOrder()) {
+    if (old_to_new_map.find(old_instr) != old_to_new_map.end()) {
+      HloInstruction* new_instr = old_to_new_map[old_instr];
+      VLOG(2) << "Processing control predecessors for peeled instruction "
+              << new_instr->ToString();
+      // Reserve instead of size-constructing: push_back on a pre-sized vector
+      // would leave leading nullptr entries, which are dereferenced below.
+      std::vector<HloInstruction*> new_control_pred;
+      new_control_pred.reserve(old_instr->control_predecessors().size());
+      for (HloInstruction* pred : old_instr->control_predecessors()) {
+        new_control_pred.push_back(old_to_new_map[pred]);
+      }
+      TF_RETURN_IF_ERROR(new_instr->DropAllControlDeps());
+      for (HloInstruction* new_pred : new_control_pred) {
+        TF_RETURN_IF_ERROR(new_pred->AddControlDependencyTo(new_instr));
+        VLOG(2) << "Adding " << new_pred->ToString()
+                << " to control dependency of peeled instruction: "
+                << new_instr->ToString();
+      }
+    }
+  }
+  return OkStatus();
+}
+}  // namespace
+
+// Unrolls by two every while loop in the entry computation whose backend
+// config carries a known trip count, peeling one iteration first when that
+// count is odd. Returns true iff at least one loop was rewritten.
+StatusOr<bool> LoopDoubleBufferTransformer::Run(
+    HloModule* module,
+    const absl::flat_hash_set<absl::string_view>& execution_threads) {
+  bool changed = false;
+  std::vector<HloInstruction*> while_instrs;
+  absl::c_copy_if(module->entry_computation()->instructions(),
+                  std::back_inserter(while_instrs),
+                  HloPredicateIsOp<HloOpcode::kWhile>);
+  VLOG(2) << "Processing " << while_instrs.size() << " while loops.";
+
+  for (HloInstruction* while_instr : while_instrs) {
+    TF_ASSIGN_OR_RETURN(WhileLoopBackendConfig config,
+                        while_instr->backend_config<WhileLoopBackendConfig>());
+    if (!config.has_known_trip_count()) {
+      VLOG(2) << while_instr->ToString()
+              << " doesn't have exact trip count, skipping double buffering "
+                 "for now";
+      continue;
+    }
+    int64_t exact_trip_count = config.known_trip_count().n();
+    VLOG(2) << "Processing while loop " << while_instr->ToString()
+            << " with trip count: " << exact_trip_count;
+    HloComputation* while_body = while_instr->while_body();
+    CHECK(while_body->root_instruction()->opcode() == HloOpcode::kTuple);
+    VLOG(2) << "Processing root " << while_body->root_instruction()->ToString();
+    auto old_loop_roots = while_body->root_instruction()->mutable_operands();
+    HloInstruction* input_parameter = while_body->parameter_instruction(0);
+    VLOG(2) << "Processing input parameter " << input_parameter->ToString();
+    absl::flat_hash_map<HloInstruction*, HloInstruction*> old_to_new_map;
+    absl::flat_hash_set<HloInstruction*> skip_control_dep_injection;
+    if (exact_trip_count % 2) {
+      VLOG(2) << "Found loops with odd trip count, 1 iteration will be peeled "
+                 "outside of the main body.";
+      TF_RETURN_IF_ERROR(PeelInstructionsForOddTripCount(module, while_instr));
+      exact_trip_count -= 1;
+    }
+    HloCloneContext context(module, "double_buffer_clone");
+    // Clones read the current root wherever the original read the parameter.
+    old_to_new_map[input_parameter] = while_body->root_instruction();
+    for (HloInstruction* old_instr : while_body->MakeInstructionPostOrder()) {
+      if (old_to_new_map.find(old_instr) != old_to_new_map.end()) {
+        continue;
+      }
+      VLOG(2) << "Cloning instruction " << old_instr->ToString();
+      std::vector<HloInstruction*> new_operands;
+      for (HloInstruction* old_operand : old_instr->mutable_operands()) {
+        new_operands.push_back(old_to_new_map[old_operand]);
+      }
+      HloInstruction* new_instr =
+          while_body->AddInstruction(old_instr->CloneWithNewOperands(
+              old_instr->shape(), new_operands, &context));
+
+      // If an elementwise instruction with constant operand is present, we
+      // won't inject control dependency at the end to allow more constant
+      // folding opportunities.
+      if (old_instr->IsElementwiseBinary() && old_instr->HasConstantOperand()) {
+        skip_control_dep_injection.insert(old_instr);
+      }
+      SetChannelIdForNewCollective(new_instr, module);
+      old_to_new_map[old_instr] = new_instr;
+      VLOG(2) << "Added instruction " << new_instr->ToString();
+    }
+
+    while_body->set_root_instruction(
+        old_to_new_map[while_body->root_instruction()]);
+    VLOG(2) << "Replaced with new root "
+            << while_body->root_instruction()->ToString();
+
+    // Handle existing control dependencies.
+    for (HloInstruction* old_instr : while_body->MakeInstructionPostOrder()) {
+      if (old_to_new_map.find(old_instr) != old_to_new_map.end()) {
+        HloInstruction* new_instr = old_to_new_map[old_instr];
+        VLOG(2) << "Processing control predecessors for "
+                << new_instr->ToString();
+        // Reserve instead of size-constructing: push_back on a pre-sized
+        // vector would leave leading nullptr entries, dereferenced below.
+        std::vector<HloInstruction*> new_control_pred;
+        new_control_pred.reserve(old_instr->control_predecessors().size());
+        for (HloInstruction* pred : old_instr->control_predecessors()) {
+          new_control_pred.push_back(old_to_new_map[pred]);
+        }
+        TF_RETURN_IF_ERROR(new_instr->DropAllControlDeps());
+        for (HloInstruction* new_pred : new_control_pred) {
+          TF_RETURN_IF_ERROR(new_pred->AddControlDependencyTo(new_instr));
+          VLOG(2) << "Adding " << new_pred->ToString()
+                  << " to control dependency of " << new_instr->ToString();
+        }
+      }
+    }
+    // Order clones of users of the parameter's users after the old roots.
+    for (HloInstruction* input_consumer : input_parameter->users()) {
+      for (HloInstruction* old_input : input_consumer->users()) {
+        HloInstruction* new_input = old_to_new_map[old_input];
+        if (skip_control_dep_injection.find(old_input) ==
+                skip_control_dep_injection.end() &&
+            !IsCollective(old_input)) {
+          for (HloInstruction* old_root : old_loop_roots) {
+            TF_RETURN_IF_ERROR(old_root->AddControlDependencyTo(new_input));
+          }
+        }
+      }
+    }
+    WhileLoopBackendConfig new_config;
+    new_config.mutable_known_trip_count()->set_n((exact_trip_count / 2));
+    TF_RETURN_IF_ERROR(while_instr->set_backend_config(new_config));
+    changed = true;
+  }
+  VLOG(2) << "LoopDoubleBufferTransformer output: " << module->ToString();
+  return changed;
+}
+
+}  // end namespace gpu
+}  // end namespace xla
diff --git a/third_party/xla/xla/service/gpu/loop_double_buffer_transformer.h b/third_party/xla/xla/service/gpu/loop_double_buffer_transformer.h
new file mode 100644
index 00000000000000..3e10b445afbc49
--- /dev/null
+++ b/third_party/xla/xla/service/gpu/loop_double_buffer_transformer.h
@@ -0,0 +1,55 @@
+/* Copyright 2023 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef XLA_SERVICE_GPU_LOOP_DOUBLE_BUFFER_TRANSFORMER_H_
+#define XLA_SERVICE_GPU_LOOP_DOUBLE_BUFFER_TRANSFORMER_H_
+
+#include <utility>
+
+#include "absl/strings/string_view.h"
+#include "xla/hlo/ir/hlo_module.h"
+#include "xla/service/gpu/gpu_types.h"
+#include "xla/service/hlo_pass_interface.h"
+#include "xla/statusor.h"
+
+namespace xla {
+namespace gpu {
+
+// This pass performs the unrolling-by-2 loop transformation
+// to effectively achieve double buffering between the inputs and outputs
+// of previously rolled iterations.
+// Only while loops in the entry computation with a known trip count in their
+// backend config are transformed; all other loops are left unchanged.
+// For an even number of iterations, unrolling-by-2 is done directly.
+// For an odd number of iterations, the first iteration of the loop is peeled
+// out of the while loop to make the trip count even, and the loop is then
+// unrolled by 2. The loop's known trip count is updated to floor(n/2).
+class LoopDoubleBufferTransformer : public HloModulePass {
+ public:
+  LoopDoubleBufferTransformer() = default;
+  ~LoopDoubleBufferTransformer() override = default;
+  absl::string_view name() const override {
+    return "loop-double-buffer-transformer";
+  }
+
+  using HloPassInterface::Run;
+  StatusOr<bool> Run(
+      HloModule* module,
+      const absl::flat_hash_set<absl::string_view>& execution_threads) override;
+};
+
+}  // end namespace gpu
+}  // end namespace xla
+
+#endif  // XLA_SERVICE_GPU_LOOP_DOUBLE_BUFFER_TRANSFORMER_H_
diff --git a/third_party/xla/xla/service/gpu/loop_double_buffer_transformer_test.cc b/third_party/xla/xla/service/gpu/loop_double_buffer_transformer_test.cc
new file mode 100644
index 00000000000000..46ec880c60e077
--- /dev/null
+++ b/third_party/xla/xla/service/gpu/loop_double_buffer_transformer_test.cc
@@ -0,0 +1,361 @@
+/* Copyright 2023 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "xla/service/gpu/loop_double_buffer_transformer.h"
+
+#include <cstdint>
+#include <memory>
+#include <optional>
+
+#include "absl/container/flat_hash_set.h"
+#include "xla/hlo/ir/hlo_computation.h"
+#include "xla/hlo/ir/hlo_instruction.h"
+#include "xla/hlo/ir/hlo_module.h"
+#include "xla/hlo/ir/hlo_opcode.h"
+#include "xla/service/hlo_dce.h"
+#include "xla/service/tuple_simplifier.h"
+#include "xla/test.h"
+#include "xla/test_helpers.h"
+#include "xla/tests/hlo_test_base.h"
+#include "xla/xla.pb.h"
+#include "xla/xla_data.pb.h"
+#include "tsl/platform/statusor.h"
+
+namespace xla {
+namespace gpu {
+namespace {
+
+// Returns how many instructions of `computation` have opcode `opcode`.
+int64_t CountInstructions(const HloComputation& computation, HloOpcode opcode) {
+  int64_t num_matches = 0;
+  for (const auto& candidate : computation.instructions()) {
+    if (candidate->opcode() == opcode) ++num_matches;
+  }
+  return num_matches;
+}
+
+// Same count, accumulated over every computation in `module`.
+int64_t CountInstructions(const HloModule& module, HloOpcode opcode) {
+  int64_t total = 0;
+  for (const auto& comp : module.computations()) {
+    total += CountInstructions(*comp, opcode);
+  }
+  return total;
+}
+
+class GpuLoopDoubleBufferTransformerTest : public HloTestBase {
+  DebugOptions GetDebugOptionsForTest() override {  // Enables the pass flag.
+    DebugOptions options = HloTestBase::GetDebugOptionsForTest();
+    options.set_xla_gpu_enable_while_loop_double_buffering(true);
+    return options;
+  }
+};
+
+// A loop with an even known trip count (10) is unrolled by two in place.
+TEST_F(GpuLoopDoubleBufferTransformerTest, UnrolledLoopEvenTripCount) {
+  const char* const kModuleString = R"(
+HloModule all_gather_overlapping
+condition {
+  input_tuple = (f32[1,128], f32[1,128], f32[2,128], s32[]) parameter(0)
+  cond = s32[] get-tuple-element(input_tuple), index=3
+  trip_count = s32[] constant(10)
+  ROOT done = pred[] compare(cond, trip_count), direction=LT
+}
+
+body {
+ input_tuple = (f32[1,128], f32[1,128], f32[2,128], s32[]) parameter(0)
+ param_0 = f32[1,128] get-tuple-element(input_tuple), index=0
+ param_1 = f32[2,128] get-tuple-element(input_tuple), index=2
+ cond = s32[] get-tuple-element(input_tuple), index=3
+ c0 = f32[] constant(0)
+ splat_c0 = f32[1,128] broadcast(c0), dimensions={}
+ add = f32[1,128] add(splat_c0, param_0)
+ // Start all-gather communication
+ all-gather-start = (f32[1,128], f32[2,128]) all-gather-start(add), channel_id=1337, replica_groups={{0,1}}, dimensions={0}, use_global_device_ids=true
+ // Intertwined with the all-gather communication, an operation happens which
+ // depends on param_1, but crucially has a different output shape (which
+ // excludes reusing param_1's buffer for its output).
+ c1_s32 = s32[] constant(1)
+ c0_s32 = s32[] constant(0)
+ one = s32[] constant(1)
+ cond_plus_1 = s32[] add(cond, one)
+ dynamic-slice = f32[1,128] dynamic-slice(param_1, c1_s32, c0_s32), dynamic_slice_sizes={1,128}
+ // The all-gather communication finishes
+ all-gather-done = f32[2,128] all-gather-done(all-gather-start)
+ ROOT output_tuple = (f32[1,128], f32[1,128], f32[2,128], s32[]) tuple(param_0, dynamic-slice, all-gather-done, cond_plus_1)
+}
+
+ENTRY main {
+ param_0 = f32[1,128] parameter(0)
+ param_1 = f32[2,128] parameter(1)
+ param_2 = s32[] constant(0)
+ tuple = (f32[1,128], f32[1,128], f32[2,128], s32[]) tuple(param_0, param_0, param_1, param_2)
+ ROOT while = (f32[1,128], f32[1,128], f32[2,128], s32[]) while(tuple), condition=condition, body=body, backend_config={"known_trip_count":{"n":"10"}}
+})";
+
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<xla::HloModule> module,
+                          ParseAndReturnVerifiedModule(kModuleString));
+  LoopDoubleBufferTransformer double_buffer;
+  HloDCE dce;
+  TupleSimplifier tuple_simp;
+  ASSERT_IS_OK(double_buffer.Run(module.get()).status());
+  ASSERT_IS_OK(tuple_simp.Run(module.get()).status());
+  ASSERT_IS_OK(dce.Run(module.get()).status());
+
+  // Start from nullptr so that a missing while loop fails the test cleanly.
+  HloInstruction* while_instruction = nullptr;
+  for (auto instr : module->entry_computation()->instructions()) {
+    if (instr->opcode() == HloOpcode::kWhile) while_instruction = instr;
+  }
+  ASSERT_NE(while_instruction, nullptr);
+  TF_ASSERT_OK_AND_ASSIGN(
+      WhileLoopBackendConfig config,
+      while_instruction->backend_config<WhileLoopBackendConfig>());
+  int64_t exact_trip_count = config.known_trip_count().n();
+  // After unrolling, the trip count is half of the original count.
+  EXPECT_EQ(exact_trip_count, 5);
+  // We expect that after unrolling, there should be 2 allgather starts,
+  // both in while body.
+  EXPECT_EQ(CountInstructions((*while_instruction->while_body()),
+                              HloOpcode::kAllGatherStart),
+            2);
+  EXPECT_EQ(CountInstructions((*module), HloOpcode::kAllGatherStart), 2);
+}
+
+// A loop with an odd known trip count (11) has one iteration peeled first.
+TEST_F(GpuLoopDoubleBufferTransformerTest, UnrolledLoopOddTripCount) {
+  const char* const kModuleString = R"(
+HloModule all_gather_overlapping
+condition {
+  input_tuple = (f32[1,128], f32[1,128], f32[2,128], s32[]) parameter(0)
+  cond = s32[] get-tuple-element(input_tuple), index=3
+  trip_count = s32[] constant(10)
+  ROOT done = pred[] compare(cond, trip_count), direction=LT
+}
+
+body {
+ input_tuple = (f32[1,128], f32[1,128], f32[2,128], s32[]) parameter(0)
+ param_0 = f32[1,128] get-tuple-element(input_tuple), index=0
+ param_1 = f32[2,128] get-tuple-element(input_tuple), index=2
+ cond = s32[] get-tuple-element(input_tuple), index=3
+ c0 = f32[] constant(0)
+ splat_c0 = f32[1,128] broadcast(c0), dimensions={}
+ add = f32[1,128] add(splat_c0, param_0)
+ // Start all-gather communication
+ all-gather-start = (f32[1,128], f32[2,128]) all-gather-start(add), channel_id=1337, replica_groups={{0,1}}, dimensions={0}, use_global_device_ids=true
+ // Intertwined with the all-gather communication, an operation happens which
+ // depends on param_1, but crucially has a different output shape (which
+ // excludes reusing param_1's buffer for its output).
+ c1_s32 = s32[] constant(1)
+ c0_s32 = s32[] constant(0)
+ one = s32[] constant(1)
+ cond_plus_1 = s32[] add(cond, one)
+ dynamic-slice = f32[1,128] dynamic-slice(param_1, c1_s32, c0_s32), dynamic_slice_sizes={1,128}
+ // The all-gather communication finishes
+ all-gather-done = f32[2,128] all-gather-done(all-gather-start)
+ ROOT output_tuple = (f32[1,128], f32[1,128], f32[2,128], s32[]) tuple(param_0, dynamic-slice, all-gather-done, cond_plus_1)
+}
+
+ENTRY main {
+ param_0 = f32[1,128] parameter(0)
+ param_1 = f32[2,128] parameter(1)
+ param_2 = s32[] constant(0)
+ tuple = (f32[1,128], f32[1,128], f32[2,128], s32[]) tuple(param_0, param_0, param_1, param_2)
+ ROOT while = (f32[1,128], f32[1,128], f32[2,128], s32[]) while(tuple), condition=condition, body=body, backend_config={"known_trip_count":{"n":"11"}}
+})";
+
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<xla::HloModule> module,
+                          ParseAndReturnVerifiedModule(kModuleString));
+  LoopDoubleBufferTransformer double_buffer;
+  HloDCE dce;
+  TupleSimplifier tuple_simp;
+  ASSERT_IS_OK(double_buffer.Run(module.get()).status());
+  ASSERT_IS_OK(tuple_simp.Run(module.get()).status());
+  ASSERT_IS_OK(dce.Run(module.get()).status());
+
+  // Locate the while loop. The pointer starts as nullptr so that a missing
+  // loop fails the assertion below instead of being read uninitialized.
+  HloInstruction* while_instruction = nullptr;
+  for (auto instr : module->entry_computation()->instructions()) {
+    if (instr->opcode() == HloOpcode::kWhile) while_instruction = instr;
+  }
+  ASSERT_NE(while_instruction, nullptr);
+  TF_ASSERT_OK_AND_ASSIGN(
+      WhileLoopBackendConfig config,
+      while_instruction->backend_config<WhileLoopBackendConfig>());
+  int64_t exact_trip_count = config.known_trip_count().n();
+  // After peeling one iteration, the remaining 10 iterations are unrolled by
+  // two, leaving a trip count of 5.
+  EXPECT_EQ(exact_trip_count, 5);
+
+  // We expect that after unrolling, there should be 3 allgather starts,
+  // 1 in parent computation, 2 in while body.
+  EXPECT_EQ(CountInstructions((*while_instruction->while_body()),
+                              HloOpcode::kAllGatherStart),
+            2);
+  EXPECT_EQ(CountInstructions((*module), HloOpcode::kAllGatherStart), 3);
+
+  // We expect that after unrolling, the third operand of the input tuple should
+  // be the peeled allgather done.
+  EXPECT_EQ(while_instruction->operand(0)->operand(2)->opcode(),
+            HloOpcode::kAllGatherDone);
+}
+
+// An elementwise add with a constant operand gets no injected control deps.
+TEST_F(GpuLoopDoubleBufferTransformerTest,
+       UnrolledLoopNoControlDepsForConstantAdd) {
+  const char* const kModuleString = R"(
+HloModule loop_unrolling_no_deps
+condition {
+  input_tuple = (f32[], s32[]) parameter(0)
+  cond = s32[] get-tuple-element(input_tuple), index=1
+  trip_count = s32[] constant(10)
+  ROOT done = pred[] compare(cond, trip_count), direction=LT
+}
+
+body {
+ input_tuple = (f32[], s32[]) parameter(0)
+ param_0 = f32[] get-tuple-element(input_tuple), index=0
+ cond = s32[] get-tuple-element(input_tuple), index=1
+ c2 = f32[] constant(2)
+ add = f32[] add(c2, param_0)
+ one = s32[] constant(1)
+ cond_plus_1 = s32[] add(cond, one)
+ ROOT output_tuple = (f32[], s32[]) tuple(add, cond_plus_1)
+}
+
+ENTRY main {
+ param_0 = f32[] parameter(0)
+ param_2 = s32[] constant(0)
+ tuple = (f32[], s32[]) tuple(param_0, param_2)
+ ROOT while = (f32[], s32[]) while(tuple), condition=condition, body=body, backend_config={"known_trip_count":{"n":"11"}}
+})";
+
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<xla::HloModule> module,
+                          ParseAndReturnVerifiedModule(kModuleString));
+  LoopDoubleBufferTransformer double_buffer;
+  HloDCE dce;
+  TupleSimplifier tuple_simp;
+  ASSERT_IS_OK(double_buffer.Run(module.get()).status());
+  ASSERT_IS_OK(tuple_simp.Run(module.get()).status());
+  ASSERT_IS_OK(dce.Run(module.get()).status());
+
+  // Start from nullptr so that a missing while loop fails the test cleanly.
+  HloInstruction* while_instruction = nullptr;
+  for (auto instr : module->entry_computation()->instructions()) {
+    if (instr->opcode() == HloOpcode::kWhile) while_instruction = instr;
+  }
+  ASSERT_NE(while_instruction, nullptr);
+  TF_ASSERT_OK_AND_ASSIGN(
+      WhileLoopBackendConfig config,
+      while_instruction->backend_config<WhileLoopBackendConfig>());
+  int64_t exact_trip_count = config.known_trip_count().n();
+  // One iteration is peeled; the other 10 are unrolled by two, leaving 5.
+  EXPECT_EQ(exact_trip_count, 5);
+
+  // We expect that after unrolling, there should be 4 adds
+  EXPECT_EQ(
+      CountInstructions((*while_instruction->while_body()), HloOpcode::kAdd),
+      4);
+
+  // We expect that after unrolling, the first operand of the output tuple
+  // should not have any control dependency since it's an elementwise add with
+  // a constant operand.
+  EXPECT_EQ(while_instruction->while_body()
+                ->root_instruction()
+                ->operand(0)
+                ->control_predecessors()
+                .size(),
+            0);
+}
+
+TEST_F(GpuLoopDoubleBufferTransformerTest,
+       UnrolledLoopNoControlDepsForCollective) {
+  const char* const kModuleString = R"(
+HloModule loop_unrolling_no_deps
+condition {
+  input_tuple = (f32[], s32[]) parameter(0)
+  cond = s32[] get-tuple-element(input_tuple), index=1
+  trip_count = s32[] constant(10)
+  ROOT done = pred[] compare(cond, trip_count), direction=LT
+}
+
+ar_add {
+  Arg_1 = f32[] parameter(1)
+  Arg_0 = f32[] parameter(0)
+  ROOT add_ar = f32[] add(Arg_1, Arg_0)
+}
+
+body {
+ input_tuple = (f32[], s32[]) parameter(0)
+ param_0 = f32[] get-tuple-element(input_tuple), index=0
+ cond = s32[] get-tuple-element(input_tuple), index=1
+ all-reduce-start = f32[] all-reduce-start(param_0), channel_id=8, replica_groups={{0}}, to_apply=ar_add, backend_config="{\"is_sync\":false}"
+ one = s32[] constant(1)
+ all-reduce-done = f32[] all-reduce-done(all-reduce-start)
+ cond_plus_1 = s32[] add(cond, one)
+ ROOT output_tuple = (f32[], s32[]) tuple(all-reduce-done, cond_plus_1)
+}
+
+ENTRY main {
+ param_0 = f32[] parameter(0)
+ param_2 = s32[] constant(0)
+ tuple = (f32[], s32[]) tuple(param_0, param_2)
+ ROOT while = (f32[], s32[]) while(tuple), condition=condition, body=body, backend_config={"known_trip_count":{"n":"10"}}
+})";
+
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<xla::HloModule> module,
+                          ParseAndReturnVerifiedModule(kModuleString));
+  LoopDoubleBufferTransformer double_buffer;
+  HloDCE dce;
+  TupleSimplifier tuple_simp;
+  ASSERT_IS_OK(double_buffer.Run(module.get()).status());
+  ASSERT_IS_OK(tuple_simp.Run(module.get()).status());
+  ASSERT_IS_OK(dce.Run(module.get()).status());
+
+  HloInstruction* while_instruction;
+  for (auto instr : module->entry_computation()->instructions()) {
+    if (instr->opcode() == HloOpcode::kWhile) {
+      while_instruction = instr;
+    }
+  }
+  TF_ASSERT_OK_AND_ASSIGN(
+      WhileLoopBackendConfig config,
+      while_instruction->backend_config<WhileLoopBackendConfig>());
+  int64_t exact_trip_count = config.known_trip_count().n();
+  // We expect that after unrolling, the total trip count is half of original
+  // count.
+  EXPECT_EQ(exact_trip_count, 5);
+
+  // We expect that after unrolling, there should be 2 all-reduce-starts
+  EXPECT_EQ(CountInstructions((*while_instruction->while_body()),
+                              HloOpcode::kAllReduceStart),
+            2);
+  absl::flat_hash_set<int64_t> channel_ids;
+  for (HloInstruction* ar : while_instruction->while_body()->instructions()) {
+    if (ar->opcode() == HloOpcode::kAllReduceStart) {
+      // We expect that after unrolling, allreduces should not have any control
+      // deps.
+      EXPECT_EQ(ar->control_predecessors().size(), 0);
+      channel_ids.insert(*(ar->channel_id()));
+    }
+  }
+  // we expect that all 2 allreduces will have different channel ids.
+  EXPECT_EQ(channel_ids.size(), 2);
+}
+
+}  // namespace
+}  // namespace gpu
+}  // namespace xla
diff --git a/third_party/xla/xla/xla.proto b/third_party/xla/xla/xla.proto
index b4c617d5db56e6..19a9fd73b5814c 100644
--- a/third_party/xla/xla/xla.proto
+++ b/third_party/xla/xla/xla.proto
@@ -620,7 +620,10 @@ message DebugOptions {
   // faster.
   bool xla_gpu_cublas_fallback = 247;
 
-  // Next id: 248
+  // Enable double buffering for loops.
+  bool xla_gpu_enable_while_loop_double_buffering = 248;
+
+  // Next id: 249
 
   // Extra options to pass to the compilation backend (e.g. LLVM); specific
   // interpretation of these values is left to the backend.

From 87ce74262ac34a46d29113850c643d722c0655a3 Mon Sep 17 00:00:00 2001
From: Chao <cchen104@amd.com>
Date: Tue, 26 Sep 2023 03:30:52 -0700
Subject: [PATCH 261/567] PR #5867: [ROCm] fixed rocm kernel link

Imported from GitHub PR https://github.com/openxla/xla/pull/5867

Fixed a recent build error caused by a missing link dependency in the ROCm GPU executor, and made some related updates to rocm_driver.

@akuegel @ddunl  Thanks in advance!
Copybara import of the project:

--
63801cf1b46aee77ccc272e0eb65624367553fe2 by Chao Chen <cchen104@amd.com>:

fixed rocm kernel link

Merging this change closes #5867

PiperOrigin-RevId: 568485642
---
 .../xla/xla/stream_executor/gpu/gpu_types.h   |  2 +-
 .../xla/xla/stream_executor/rocm/BUILD        |  7 +++---
 .../xla/stream_executor/rocm/rocm_driver.cc   | 10 ++++-----
 .../rocm/rocm_driver_wrapper.h                |  2 +-
 .../stream_executor/rocm/rocm_gpu_executor.cc | 22 ++++++++++++-------
 5 files changed, 25 insertions(+), 18 deletions(-)

diff --git a/third_party/xla/xla/stream_executor/gpu/gpu_types.h b/third_party/xla/xla/stream_executor/gpu/gpu_types.h
index 4a0d643a4f9819..dea81d66a1d59d 100644
--- a/third_party/xla/xla/stream_executor/gpu/gpu_types.h
+++ b/third_party/xla/xla/stream_executor/gpu/gpu_types.h
@@ -42,7 +42,7 @@ using GpuContextHandle = hipCtx_t;
 using GpuStreamHandle = hipStream_t;
 using GpuEventHandle = hipEvent_t;
 using GpuFunctionHandle = hipFunction_t;
-using GpuFunctionAttribute = hipFuncAttribute;
+using GpuFunctionAttribute = hipFunction_attribute;
 using GpuDeviceHandle = hipDevice_t;
 using GpuDevicePtr = hipDeviceptr_t;
 using GpuDeviceAttribute = hipDeviceAttribute_t;
diff --git a/third_party/xla/xla/stream_executor/rocm/BUILD b/third_party/xla/xla/stream_executor/rocm/BUILD
index 74f710420f6e40..b9e5f20ef22af0 100644
--- a/third_party/xla/xla/stream_executor/rocm/BUILD
+++ b/third_party/xla/xla/stream_executor/rocm/BUILD
@@ -108,10 +108,11 @@ cc_library(
         ":rocm_platform_id",
         "@com_google_absl//absl/functional:any_invocable",
         "@com_google_absl//absl/strings",
+        "//xla/stream_executor:kernel",
         "//xla/stream_executor:event",
         "//xla/stream_executor:plugin_registry",
         "//xla/stream_executor:stream_executor_internal",
-        "//xla/stream_executor:stream_executor_headers",
+        "//xla/stream_executor:stream_executor_pimpl_header",
         "//xla/stream_executor/gpu:gpu_activation_header",
         "//xla/stream_executor/gpu:gpu_event",
         "//xla/stream_executor/gpu:gpu_kernel_header",
@@ -160,7 +161,7 @@ cc_library(
         "@com_google_absl//absl/memory",
         "//xla/stream_executor",  # buildcleaner: keep
         "//xla/stream_executor:multi_platform_manager",
-        "//xla/stream_executor:stream_executor_headers",
+        "//xla/stream_executor:stream_executor_pimpl_header",
         "//xla/stream_executor/platform",
     ]),
     alwayslink = True,  # Registers itself with the MultiPlatformManager.
@@ -317,7 +318,7 @@ cc_library(
         "//xla/stream_executor:event",
         "//xla/stream_executor:plugin_registry",
         "//xla/stream_executor:scratch_allocator",
-        "//xla/stream_executor:stream_executor_headers",
+        "//xla/stream_executor:stream_executor_pimpl",
         "//xla/stream_executor:temporary_device_memory",
         "//xla/stream_executor/gpu:gpu_activation_header",
         "//xla/stream_executor/gpu:gpu_stream_header",
diff --git a/third_party/xla/xla/stream_executor/rocm/rocm_driver.cc b/third_party/xla/xla/stream_executor/rocm/rocm_driver.cc
index bde91ca30582c7..d48ed7e3818d92 100644
--- a/third_party/xla/xla/stream_executor/rocm/rocm_driver.cc
+++ b/third_party/xla/xla/stream_executor/rocm/rocm_driver.cc
@@ -425,11 +425,11 @@ bool DeviceOptionsToContextFlags(const DeviceOptions& device_options,
   return context->context();
 }
 
-/* static */ tsl::Status GpuDriver::FuncGetAttribute(hipFuncAttribute attribute,
-                                                     hipFunction_t func,
-                                                     int* attribute_value) {
-  RETURN_IF_ROCM_ERROR(hipFuncSetAttribute(func, attribute, *attribute_value),
-                       "Failed to query kernel attribute: ", attribute);
+/* static */ tsl::Status GpuDriver::FuncGetAttribute(
+    hipFunction_attribute attribute, hipFunction_t func, int* attribute_value) {
+  RETURN_IF_ROCM_ERROR(
+      wrap::hipFuncGetAttribute(attribute_value, attribute, func),
+      "Failed to query kernel attribute: ", attribute);
   return tsl::OkStatus();
 }
 
diff --git a/third_party/xla/xla/stream_executor/rocm/rocm_driver_wrapper.h b/third_party/xla/xla/stream_executor/rocm/rocm_driver_wrapper.h
index 3aa2cf191b4d6f..5808f4c266ce85 100644
--- a/third_party/xla/xla/stream_executor/rocm/rocm_driver_wrapper.h
+++ b/third_party/xla/xla/stream_executor/rocm/rocm_driver_wrapper.h
@@ -93,7 +93,7 @@ namespace wrap {
   __macro(hipEventSynchronize)                      \
   __macro(hipFree)                                  \
   __macro(hipFuncSetCacheConfig)                    \
-  __macro(hipFuncSetAttribute)                      \
+  __macro(hipFuncGetAttribute)                      \
   __macro(hipGetDevice)                             \
   __macro(hipGetDeviceCount)                        \
   __macro(hipGetDeviceProperties)                   \
diff --git a/third_party/xla/xla/stream_executor/rocm/rocm_gpu_executor.cc b/third_party/xla/xla/stream_executor/rocm/rocm_gpu_executor.cc
index 618514de604493..1443714e623719 100644
--- a/third_party/xla/xla/stream_executor/rocm/rocm_gpu_executor.cc
+++ b/third_party/xla/xla/stream_executor/rocm/rocm_gpu_executor.cc
@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
+#include "xla/stream_executor/rocm/rocm_gpu_executor.h"
 
 #include <unistd.h>
 
@@ -24,6 +25,7 @@ limitations under the License.
 #include "absl/strings/str_cat.h"
 #include "absl/strings/str_format.h"
 #include "absl/strings/str_join.h"
+#include "absl/strings/string_view.h"
 #include "xla/stream_executor/gpu/gpu_command_buffer.h"
 #include "xla/stream_executor/gpu/gpu_driver.h"
 #include "xla/stream_executor/gpu/gpu_event.h"
@@ -203,10 +205,10 @@ tsl::Status GpuExecutor::GetKernel(const MultiKernelLoaderSpec& spec,
   const string* kernel_name;
 
   const OnDiskKernelLoaderSpec* on_disk_spec = nullptr;
-  bool has_cubin = spec.has_cuda_cubin_on_disk();
-  if (has_cubin) {
-    on_disk_spec = &spec.cuda_cubin_on_disk();
-  }
+
+  VLOG(3) << "GetKernel on kernel " << kernel << " : " << kernel->name();
+
+  if (spec.has_cuda_cubin_on_disk()) on_disk_spec = &spec.cuda_cubin_on_disk();
 
   if (on_disk_spec != nullptr) {
     return tsl::errors::Internal(
@@ -244,19 +246,23 @@ tsl::Status GpuExecutor::GetKernel(const MultiKernelLoaderSpec& spec,
 tsl::Status GpuExecutor::GetKernelMetadata(GpuKernel* rocm_kernel,
                                            KernelMetadata* kernel_metadata) {
   int value = 0;
-  // TODO(ROCm) implement this feature in HIP
+  TF_RETURN_IF_ERROR(GpuDriver::FuncGetAttribute(
+      HIP_FUNC_ATTRIBUTE_NUM_REGS, *rocm_kernel->gpu_function_ptr(), &value));
   kernel_metadata->set_registers_per_thread(value);
 
-  // TODO(ROCm) implement this feature in HIP
+  TF_RETURN_IF_ERROR(
+      GpuDriver::FuncGetAttribute(HIP_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES,
+                                  *rocm_kernel->gpu_function_ptr(), &value));
   kernel_metadata->set_shared_memory_bytes(value);
-  return tsl::OkStatus();
+  return ::tsl::OkStatus();
 }
 
 tsl::Status GpuExecutor::Launch(Stream* stream, const ThreadDim& thread_dims,
                                 const BlockDim& block_dims,
                                 const KernelBase& kernel,
                                 const KernelArgsArrayBase& args) {
-  CHECK_EQ(kernel.Arity(), args.number_of_arguments());
+  CHECK_EQ(kernel.Arity() + (args.number_of_shared_bytes() > 0),
+           args.number_of_arguments());
   GpuStreamHandle hipstream = AsGpuStreamValue(stream);
   const GpuKernel* rocm_kernel = AsGpuKernel(&kernel);
   hipFunction_t hipfunc = rocm_kernel->AsGpuFunctionHandle();

From ae01629d90f6f8ef3a04de41bb459b1f5e475df6 Mon Sep 17 00:00:00 2001
From: Ilia Sergachev <sergachev@google.com>
Date: Tue, 26 Sep 2023 03:53:52 -0700
Subject: [PATCH 262/567] [XLA:GPU] Trigger Triton GEMM fusions also on kCopy
 input operations.

PiperOrigin-RevId: 568489918
---
 .../xla/service/gpu/gemm_rewriter_triton.cc   | 30 ++++++++++++-------
 .../xla/service/gpu/ir_emitter_triton_test.cc | 22 ++++++++++++++
 2 files changed, 41 insertions(+), 11 deletions(-)

diff --git a/third_party/xla/xla/service/gpu/gemm_rewriter_triton.cc b/third_party/xla/xla/service/gpu/gemm_rewriter_triton.cc
index 59cecc4dba6e77..88be780bf25308 100644
--- a/third_party/xla/xla/service/gpu/gemm_rewriter_triton.cc
+++ b/third_party/xla/xla/service/gpu/gemm_rewriter_triton.cc
@@ -73,6 +73,16 @@ limitations under the License.
 namespace xla {
 namespace gpu {
 
+int GetFusionLevel(const HloInstruction& hlo, const GpuVersion gpu_version) {
+  int level =
+      hlo.GetModule()->config().debug_options().xla_gpu_triton_fusion_level();
+  if (!std::get<se::CudaComputeCapability>(gpu_version)
+           .IsAtLeast(se::CudaComputeCapability::AMPERE)) {
+    level = std::min(level, 1);
+  }
+  return level;
+}
+
 bool HasDivisibleSuffixAllowingSplit(const absl::Span<int64_t const> span,
                                      const int64_t divisor) {
   CHECK_GE(divisor, 1);
@@ -1077,12 +1087,6 @@ DimOrderUpdatesOrError FusionContext::AnalyzeForFusion(
     absl::flat_hash_map<const HloInstruction*, HloInstruction*>&
         old_to_new_mapping,
     const GpuVersion gpu_version) const {
-  int fusion_level =
-      hlo.GetModule()->config().debug_options().xla_gpu_triton_fusion_level();
-  if (!std::get<se::CudaComputeCapability>(gpu_version)
-           .IsAtLeast(se::CudaComputeCapability::AMPERE)) {
-    fusion_level = std::min(fusion_level, 1);
-  }
   if (hlo.opcode() == HloOpcode::kTuple ||
       hlo.opcode() == HloOpcode::kGetTupleElement) {
     return "Unsupported instruction.";
@@ -1103,7 +1107,7 @@ DimOrderUpdatesOrError FusionContext::AnalyzeForFusion(
     return "Unsupported output data type.";
   }
   if (as_input) {
-    if (fusion_level < 2) {
+    if (GetFusionLevel(hlo, gpu_version) < 2) {
       if (hlo.opcode() == HloOpcode::kConvert) {
         if (FusionDecision decision =
                 RequireTritonFusibleConvert(&hlo, gpu_version);
@@ -1119,7 +1123,7 @@ DimOrderUpdatesOrError FusionContext::AnalyzeForFusion(
       }
     }
   } else {
-    if (fusion_level < 2) {
+    if (GetFusionLevel(hlo, gpu_version) < 2) {
       return "Skipping fusing outputs at low fusion levels.";
     }
     for (const HloInstruction* operand : hlo.operands()) {
@@ -1369,10 +1373,14 @@ StatusOr<FusionDecision> FuseDot(HloInstruction& dot,
   if (dot.GetModule()->config().debug_options().xla_gpu_triton_gemm_any()) {
     return FusionDecision{};
   }
+
+  absl::flat_hash_set<HloOpcode> triggers{
+      HloOpcode::kConvert, HloOpcode::kSlice, HloOpcode::kTranspose};
+  if (GetFusionLevel(dot, gpu_version) >= 2) {
+    triggers.insert(HloOpcode::kCopy);
+  }
   for (const auto& iter : old_to_new_mapping) {
-    if (iter.second->opcode() == HloOpcode::kConvert ||
-        iter.second->opcode() == HloOpcode::kSlice ||
-        iter.second->opcode() == HloOpcode::kTranspose) {
+    if (triggers.contains(iter.second->opcode())) {
       return FusionDecision{};
     }
   }
diff --git a/third_party/xla/xla/service/gpu/ir_emitter_triton_test.cc b/third_party/xla/xla/service/gpu/ir_emitter_triton_test.cc
index 014b006b6ab2a7..0e20fc806662dc 100644
--- a/third_party/xla/xla/service/gpu/ir_emitter_triton_test.cc
+++ b/third_party/xla/xla/service/gpu/ir_emitter_triton_test.cc
@@ -1091,6 +1091,28 @@ ENTRY e {
   EXPECT_TRUE(RunAndCompare(kHloText, ErrorSpec{/*aabs=*/2e-3, /*arel=*/2e-3}));
 }
 
+TEST_F(TritonGemmLevel2Test, FuseTransposeWithoutMixedTypes) {
+  const std::string kHloText = R"(
+ENTRY e {
+  p1 = f16[150,32,60]{2,1,0} parameter(1)
+  p0 = f16[75,2,26,60]{3,2,1,0} parameter(0)
+  t = f16[75,2,60,26]{3,2,1,0} transpose(p0), dimensions={0,1,3,2}
+  r = f16[150,60,26]{2,1,0} reshape(t)
+  ROOT tmp_4 = f16[150,32,26]{2,1,0} dot(p1, r),
+    lhs_batch_dims={0}, lhs_contracting_dims={2},
+    rhs_batch_dims={0}, rhs_contracting_dims={1}
+})";
+
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          GetOptimizedModule(kHloText));
+  EXPECT_THAT(
+      module->entry_computation()->root_instruction(),
+      GmockMatch(m::Fusion(m::Parameter(), m::Parameter())
+                     .WithFusionKind(HloInstruction::FusionKind::kCustom)));
+
+  EXPECT_TRUE(RunAndCompare(kHloText, ErrorSpec{/*aabs=*/1e-3, /*arel=*/1e-3}));
+}
+
 TEST_F(TritonGemmTest, SineOutputIsNotFused) {
   const std::string kHloText = R"(
 HloModule m

From 079ab56a5b0c5aefcdffa4f20f79c7bb137c6bd0 Mon Sep 17 00:00:00 2001
From: Johannes Reifferscheid <jreiffers@google.com>
Date: Tue, 26 Sep 2023 04:28:32 -0700
Subject: [PATCH 263/567] Retire MLIR interpreter and related tooling.

This interpreter no longer makes sense inside XLA and isn't actively used, so it is retired here. There is an ongoing discussion with upstream about potentially moving it there after some refactoring; until then, it can be recovered from the commit history if and when needed, rather than being maintained here.

PiperOrigin-RevId: 568496965
---
 .../xla/xla/mlir/tools/mlir_bisect/BUILD      |  69 --
 .../xla/xla/mlir/tools/mlir_bisect/README.md  |  85 --
 .../xla/mlir/tools/mlir_bisect/bisect_lib.cc  |  81 --
 .../xla/mlir/tools/mlir_bisect/bisect_lib.h   |  96 --
 .../xla/mlir/tools/mlir_bisect/mlir_bisect.cc | 363 -------
 .../xla/mlir/tools/mlir_bisect/rewrites/BUILD |  26 -
 .../mlir/tools/mlir_bisect/rewrites/func.cc   |  80 --
 .../tools/mlir_bisect/rewrites/general.cc     | 186 ----
 .../mlir/tools/mlir_bisect/rewrites/scf.cc    | 139 ---
 .../tools/mlir_bisect/rewrites/tests/BUILD    |  24 -
 .../tests/erase-op-without-results.mlir       |  12 -
 .../rewrites/tests/inline-scf-while.mlir      |  40 -
 .../tests/reduce-scf-forall-bounds.mlir       |  16 -
 .../tests/replace-op-with-constant.mlir       |  26 -
 .../rewrites/tests/replace-op-with-value.mlir |  16 -
 .../tests/replace-operand-with-constant.mlir  |  28 -
 ...eturn-operands-of-terminator-operands.mlir |  15 -
 .../rewrites/tests/truncate-function.mlir     |  31 -
 .../xla/mlir/tools/mlir_bisect/test_passes.cc |  48 -
 .../xla/mlir/tools/mlir_bisect/test_passes.h  |  29 -
 .../xla/mlir/tools/mlir_bisect/tests/BUILD    |  26 -
 .../mlir/tools/mlir_bisect/tests/bisect.mlir  |  46 -
 .../mlir/tools/mlir_bisect/tests/no-bug.mlir  |  10 -
 .../tools/mlir_bisect/tests/snapshot.mlir     |  12 -
 .../tools/mlir_bisect/tests/snapshot.mlir.pb  | Bin 68 -> 0 bytes
 .../xla/xla/mlir/tools/mlir_replay/BUILD      |  71 --
 .../xla/xla/mlir/tools/mlir_replay/README.md  |  48 -
 .../xla/mlir/tools/mlir_replay/mlir_replay.cc | 248 -----
 .../mlir/tools/mlir_replay/mlir_replay_lib.cc | 263 ------
 .../mlir/tools/mlir_replay/mlir_replay_lib.h  |  40 -
 .../xla/mlir/tools/mlir_replay/public/BUILD   |  42 -
 .../mlir/tools/mlir_replay/public/README.md   |  11 +-
 .../mlir_replay/public/execution_trace.proto  |  72 --
 .../public/execution_trace_utils.cc           | 447 ---------
 .../public/execution_trace_utils.h            |  76 --
 .../public/execution_trace_utils_test.cc      | 138 ---
 third_party/xla/xla/mlir_hlo/BUILD            | 117 ---
 .../tools/mlir_interpreter/dialects/affine.cc |  51 -
 .../tools/mlir_interpreter/dialects/arith.cc  | 305 ------
 .../dialects/bufferization.cc                 |  69 --
 .../mlir_interpreter/dialects/builtin.cc      |  54 --
 .../mlir_interpreter/dialects/comparators.h   | 104 --
 .../mlir_interpreter/dialects/complex.cc      |  63 --
 .../mlir_interpreter/dialects/cwise_math.h    | 239 -----
 .../mlir_interpreter/dialects/deallocation.cc |  88 --
 .../tools/mlir_interpreter/dialects/func.cc   | 116 ---
 .../tools/mlir_interpreter/dialects/gml_st.cc |  45 -
 .../tools/mlir_interpreter/dialects/linalg.cc | 310 ------
 .../tools/mlir_interpreter/dialects/math.cc   |  64 --
 .../tools/mlir_interpreter/dialects/memref.cc | 247 -----
 .../tools/mlir_interpreter/dialects/mhlo.cc   | 886 ------------------
 .../dialects/mhlo_binary_cwise.cc             |  45 -
 .../dialects/mhlo_unary_cwise.cc              |  84 --
 .../tools/mlir_interpreter/dialects/scf.cc    | 216 -----
 .../tools/mlir_interpreter/dialects/tensor.cc | 256 -----
 .../mlir_interpreter/dialects/tests/BUILD     |  24 -
 .../dialects/tests/affine/apply.mlir          |  63 --
 .../dialects/tests/affine/minmax.mlir         |  36 -
 .../dialects/tests/arith/bitcast.mlir         |  21 -
 .../dialects/tests/arith/cmpf.mlir            | 129 ---
 .../dialects/tests/arith/cmpi.mlir            | 147 ---
 .../dialects/tests/arith/constant.mlir        |  37 -
 .../dialects/tests/arith/extf.mlir            |  11 -
 .../dialects/tests/arith/fptosi.mlir          |  21 -
 .../dialects/tests/arith/index_cast.mlir      |  28 -
 .../dialects/tests/arith/int_math.mlir        | 111 ---
 .../dialects/tests/arith/minmax.mlir          |  25 -
 .../dialects/tests/arith/negf.mlir            |  21 -
 .../dialects/tests/arith/remf.mlir            |  12 -
 .../dialects/tests/arith/select.mlir          |  52 -
 .../dialects/tests/arith/sitofp.mlir          |  31 -
 .../dialects/tests/arith/uitofp.mlir          |  31 -
 .../dialects/tests/arith/vector_math.mlir     |  12 -
 .../tests/bufferization/alloc_tensor.mlir     |  30 -
 .../dialects/tests/bufferization/clone.mlir   |  14 -
 .../tests/bufferization/to_memref.mlir        |  10 -
 .../tests/bufferization/to_tensor.mlir        |  11 -
 .../builtin/unrealized_conversion_cast.mlir   |  21 -
 .../dialects/tests/complex/complex.mlir       | 186 ----
 .../tests/deallocation/deallocation.mlir      |  51 -
 .../dialects/tests/func/call.mlir             |  48 -
 .../dialects/tests/gml_st/fusion.mlir         |  35 -
 .../dialects/tests/linalg/broadcast.mlir      |  30 -
 .../dialects/tests/linalg/dot.mlir            |  14 -
 .../dialects/tests/linalg/fill.mlir           |  24 -
 .../dialects/tests/linalg/generic.mlir        | 113 ---
 .../dialects/tests/linalg/map.mlir            |  74 --
 .../dialects/tests/linalg/matmul.mlir         |  41 -
 .../dialects/tests/linalg/reduce.mlir         |  57 --
 .../dialects/tests/linalg/transpose.mlir      |  27 -
 .../dialects/tests/linalg/vecmat.mlir         |  14 -
 .../dialects/tests/math/math.mlir             | 252 -----
 .../dialects/tests/memref/alloc.mlir          |  57 --
 .../dialects/tests/memref/collapse_shape.mlir |  33 -
 .../dialects/tests/memref/copy.mlir           |  39 -
 .../dialects/tests/memref/dim.mlir            |  12 -
 .../dialects/tests/memref/expand_shape.mlir   |  52 -
 .../dialects/tests/memref/get_global.mlir     |  12 -
 .../dialects/tests/memref/invalid.mlir        |  77 --
 .../dialects/tests/memref/load.mlir           |  12 -
 .../dialects/tests/memref/subview.mlir        | 131 ---
 .../dialects/tests/mhlo/bitcast_convert.mlir  |  11 -
 .../dialects/tests/mhlo/broadcast_in_dim.mlir |  20 -
 .../dialects/tests/mhlo/case.mlir             |  17 -
 .../dialects/tests/mhlo/clamp.mlir            |  27 -
 .../dialects/tests/mhlo/compare.mlir          | 143 ---
 .../dialects/tests/mhlo/complex_math.mlir     | 100 --
 .../tests/mhlo/compute_reshape_shape.mlir     |  26 -
 .../dialects/tests/mhlo/concatenate.mlir      |  37 -
 .../dialects/tests/mhlo/constant.mlir         |  25 -
 .../dialects/tests/mhlo/convert.mlir          |  21 -
 .../dialects/tests/mhlo/dot.mlir              |  37 -
 .../dialects/tests/mhlo/dot_general.mlir      |  73 --
 .../dialects/tests/mhlo/dynamic_slice.mlir    |  32 -
 .../tests/mhlo/dynamic_update_slice.mlir      |  34 -
 .../dialects/tests/mhlo/float_math.mlir       | 199 ----
 .../dialects/tests/mhlo/gather.mlir           |  78 --
 .../dialects/tests/mhlo/int_math.mlir         | 358 -------
 .../dialects/tests/mhlo/iota.mlir             |  30 -
 .../dialects/tests/mhlo/pad.mlir              |  56 --
 .../dialects/tests/mhlo/reduce.mlir           |  17 -
 .../dialects/tests/mhlo/reshape.mlir          |  34 -
 .../dialects/tests/mhlo/scatter.mlir          |  55 --
 .../dialects/tests/mhlo/select.mlir           |  14 -
 .../dialects/tests/mhlo/slice.mlir            |  16 -
 .../dialects/tests/mhlo/sort.mlir             |  25 -
 .../dialects/tests/mhlo/subtract.mlir         |  10 -
 .../dialects/tests/mhlo/transpose.mlir        |  28 -
 .../dialects/tests/mhlo/tuple.mlir            |  30 -
 .../dialects/tests/mhlo/while.mlir            |  25 -
 .../dialects/tests/scf/for.mlir               |  82 --
 .../dialects/tests/scf/forall.mlir            |  62 --
 .../dialects/tests/scf/if.mlir                |  69 --
 .../dialects/tests/scf/parallel.mlir          |  44 -
 .../dialects/tests/scf/while.mlir             |  45 -
 .../dialects/tests/tensor/collapse_shape.mlir |  42 -
 .../dialects/tests/tensor/dim.mlir            |  12 -
 .../dialects/tests/tensor/empty.mlir          |  21 -
 .../dialects/tests/tensor/expand_shape.mlir   |  55 --
 .../dialects/tests/tensor/extract.mlir        |  13 -
 .../dialects/tests/tensor/extract_slice.mlir  |  62 --
 .../dialects/tests/tensor/from_elements.mlir  |  25 -
 .../dialects/tests/tensor/generate.mlir       |  29 -
 .../dialects/tests/tensor/insert.mlir         |  14 -
 .../dialects/tests/tensor/insert_slice.mlir   |  25 -
 .../dialects/tests/tensor/pad.mlir            |  38 -
 .../dialects/tests/thlo/concatenate.mlir      |  16 -
 .../dialects/tests/thlo/reverse.mlir          |  14 -
 .../dialects/tests/thlo/scatter.mlir          |  49 -
 .../dialects/tests/vector/bitcast.mlir        |  32 -
 .../dialects/tests/vector/broadcast.mlir      |  51 -
 .../dialects/tests/vector/compressstore.mlir  |  16 -
 .../dialects/tests/vector/constant_mask.mlir  |  14 -
 .../dialects/tests/vector/contract.mlir       | 141 ---
 .../dialects/tests/vector/create_mask.mlir    |  16 -
 .../dialects/tests/vector/expandload.mlir     |  19 -
 .../dialects/tests/vector/extract.mlir        |  52 -
 .../tests/vector/extract_strided_slice.mlir   |  18 -
 .../dialects/tests/vector/extractelement.mlir |  22 -
 .../dialects/tests/vector/flat_transpose.mlir |  23 -
 .../dialects/tests/vector/fma.mlir            |  13 -
 .../dialects/tests/vector/gather.mlir         |  50 -
 .../dialects/tests/vector/insert.mlir         |  57 --
 .../tests/vector/insert_strided_slice.mlir    |  17 -
 .../dialects/tests/vector/insertelement.mlir  |  24 -
 .../dialects/tests/vector/invalid.mlir        |  27 -
 .../dialects/tests/vector/load.mlir           |  27 -
 .../dialects/tests/vector/maskedload.mlir     |  19 -
 .../dialects/tests/vector/maskedstore.mlir    |  18 -
 .../tests/vector/multi_reduction.mlir         |  46 -
 .../dialects/tests/vector/outerproduct.mlir   | 155 ---
 .../dialects/tests/vector/reduction.mlir      | 235 -----
 .../dialects/tests/vector/shape_cast.mlir     |  23 -
 .../dialects/tests/vector/shuffle.mlir        |  34 -
 .../dialects/tests/vector/splat.mlir          |  11 -
 .../dialects/tests/vector/store.mlir          |  39 -
 .../dialects/tests/vector/transfer_read.mlir  | 118 ---
 .../dialects/tests/vector/transfer_write.mlir |  91 --
 .../dialects/tests/vector/transpose.mlir      |  28 -
 .../dialects/tests/vector/type_cast.mlir      |  11 -
 .../dialects/tests/vector/vscale.mlir         |  12 -
 .../tests/xla_cpu/memref_element_cast.mlir    |  11 -
 .../tools/mlir_interpreter/dialects/thlo.cc   | 121 ---
 .../tools/mlir_interpreter/dialects/util.cc   | 169 ----
 .../tools/mlir_interpreter/dialects/util.h    |  81 --
 .../tools/mlir_interpreter/dialects/vector.cc | 857 -----------------
 .../tools/mlir_interpreter/dialects/xla.cc    |  34 -
 .../mlir_interpreter/framework/interpreter.cc | 142 ---
 .../mlir_interpreter/framework/interpreter.h  | 195 ----
 .../framework/interpreter_value.cc            | 377 --------
 .../framework/interpreter_value.h             | 226 -----
 .../framework/interpreter_value_util.h        | 184 ----
 .../framework/registration.cc                 | 120 ---
 .../mlir_interpreter/framework/registration.h | 225 -----
 .../framework/tensor_or_memref.cc             | 157 ----
 .../framework/tensor_or_memref.h              | 363 -------
 .../mlir_interpreter/framework/tests/BUILD    |  24 -
 .../framework/tests/interpreter_value_test.cc | 235 -----
 .../framework/tests/tensor_or_memref_test.cc  | 104 --
 .../mlir-interpreter-runner.cc                | 142 ---
 200 files changed, 2 insertions(+), 16127 deletions(-)
 delete mode 100644 third_party/xla/xla/mlir/tools/mlir_bisect/BUILD
 delete mode 100644 third_party/xla/xla/mlir/tools/mlir_bisect/README.md
 delete mode 100644 third_party/xla/xla/mlir/tools/mlir_bisect/bisect_lib.cc
 delete mode 100644 third_party/xla/xla/mlir/tools/mlir_bisect/bisect_lib.h
 delete mode 100644 third_party/xla/xla/mlir/tools/mlir_bisect/mlir_bisect.cc
 delete mode 100644 third_party/xla/xla/mlir/tools/mlir_bisect/rewrites/BUILD
 delete mode 100644 third_party/xla/xla/mlir/tools/mlir_bisect/rewrites/func.cc
 delete mode 100644 third_party/xla/xla/mlir/tools/mlir_bisect/rewrites/general.cc
 delete mode 100644 third_party/xla/xla/mlir/tools/mlir_bisect/rewrites/scf.cc
 delete mode 100644 third_party/xla/xla/mlir/tools/mlir_bisect/rewrites/tests/BUILD
 delete mode 100644 third_party/xla/xla/mlir/tools/mlir_bisect/rewrites/tests/erase-op-without-results.mlir
 delete mode 100644 third_party/xla/xla/mlir/tools/mlir_bisect/rewrites/tests/inline-scf-while.mlir
 delete mode 100644 third_party/xla/xla/mlir/tools/mlir_bisect/rewrites/tests/reduce-scf-forall-bounds.mlir
 delete mode 100644 third_party/xla/xla/mlir/tools/mlir_bisect/rewrites/tests/replace-op-with-constant.mlir
 delete mode 100644 third_party/xla/xla/mlir/tools/mlir_bisect/rewrites/tests/replace-op-with-value.mlir
 delete mode 100644 third_party/xla/xla/mlir/tools/mlir_bisect/rewrites/tests/replace-operand-with-constant.mlir
 delete mode 100644 third_party/xla/xla/mlir/tools/mlir_bisect/rewrites/tests/return-operands-of-terminator-operands.mlir
 delete mode 100644 third_party/xla/xla/mlir/tools/mlir_bisect/rewrites/tests/truncate-function.mlir
 delete mode 100644 third_party/xla/xla/mlir/tools/mlir_bisect/test_passes.cc
 delete mode 100644 third_party/xla/xla/mlir/tools/mlir_bisect/test_passes.h
 delete mode 100644 third_party/xla/xla/mlir/tools/mlir_bisect/tests/BUILD
 delete mode 100644 third_party/xla/xla/mlir/tools/mlir_bisect/tests/bisect.mlir
 delete mode 100644 third_party/xla/xla/mlir/tools/mlir_bisect/tests/no-bug.mlir
 delete mode 100644 third_party/xla/xla/mlir/tools/mlir_bisect/tests/snapshot.mlir
 delete mode 100644 third_party/xla/xla/mlir/tools/mlir_bisect/tests/snapshot.mlir.pb
 delete mode 100644 third_party/xla/xla/mlir/tools/mlir_replay/BUILD
 delete mode 100644 third_party/xla/xla/mlir/tools/mlir_replay/README.md
 delete mode 100644 third_party/xla/xla/mlir/tools/mlir_replay/mlir_replay.cc
 delete mode 100644 third_party/xla/xla/mlir/tools/mlir_replay/mlir_replay_lib.cc
 delete mode 100644 third_party/xla/xla/mlir/tools/mlir_replay/mlir_replay_lib.h
 delete mode 100644 third_party/xla/xla/mlir/tools/mlir_replay/public/execution_trace.proto
 delete mode 100644 third_party/xla/xla/mlir/tools/mlir_replay/public/execution_trace_utils.cc
 delete mode 100644 third_party/xla/xla/mlir/tools/mlir_replay/public/execution_trace_utils.h
 delete mode 100644 third_party/xla/xla/mlir/tools/mlir_replay/public/execution_trace_utils_test.cc
 delete mode 100644 third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/affine.cc
 delete mode 100644 third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/arith.cc
 delete mode 100644 third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/bufferization.cc
 delete mode 100644 third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/builtin.cc
 delete mode 100644 third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/comparators.h
 delete mode 100644 third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/complex.cc
 delete mode 100644 third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/cwise_math.h
 delete mode 100644 third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/deallocation.cc
 delete mode 100644 third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/func.cc
 delete mode 100644 third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/gml_st.cc
 delete mode 100644 third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/linalg.cc
 delete mode 100644 third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/math.cc
 delete mode 100644 third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/memref.cc
 delete mode 100644 third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/mhlo.cc
 delete mode 100644 third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/mhlo_binary_cwise.cc
 delete mode 100644 third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/mhlo_unary_cwise.cc
 delete mode 100644 third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/scf.cc
 delete mode 100644 third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tensor.cc
 delete mode 100644 third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/BUILD
 delete mode 100644 third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/affine/apply.mlir
 delete mode 100644 third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/affine/minmax.mlir
 delete mode 100644 third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/arith/bitcast.mlir
 delete mode 100644 third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/arith/cmpf.mlir
 delete mode 100644 third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/arith/cmpi.mlir
 delete mode 100644 third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/arith/constant.mlir
 delete mode 100644 third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/arith/extf.mlir
 delete mode 100644 third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/arith/fptosi.mlir
 delete mode 100644 third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/arith/index_cast.mlir
 delete mode 100644 third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/arith/int_math.mlir
 delete mode 100644 third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/arith/minmax.mlir
 delete mode 100644 third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/arith/negf.mlir
 delete mode 100644 third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/arith/remf.mlir
 delete mode 100644 third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/arith/select.mlir
 delete mode 100644 third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/arith/sitofp.mlir
 delete mode 100644 third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/arith/uitofp.mlir
 delete mode 100644 third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/arith/vector_math.mlir
 delete mode 100644 third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/bufferization/alloc_tensor.mlir
 delete mode 100644 third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/bufferization/clone.mlir
 delete mode 100644 third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/bufferization/to_memref.mlir
 delete mode 100644 third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/bufferization/to_tensor.mlir
 delete mode 100644 third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/builtin/unrealized_conversion_cast.mlir
 delete mode 100644 third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/complex/complex.mlir
 delete mode 100644 third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/deallocation/deallocation.mlir
 delete mode 100644 third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/func/call.mlir
 delete mode 100644 third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/gml_st/fusion.mlir
 delete mode 100644 third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/linalg/broadcast.mlir
 delete mode 100644 third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/linalg/dot.mlir
 delete mode 100644 third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/linalg/fill.mlir
 delete mode 100644 third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/linalg/generic.mlir
 delete mode 100644 third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/linalg/map.mlir
 delete mode 100644 third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/linalg/matmul.mlir
 delete mode 100644 third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/linalg/reduce.mlir
 delete mode 100644 third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/linalg/transpose.mlir
 delete mode 100644 third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/linalg/vecmat.mlir
 delete mode 100644 third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/math/math.mlir
 delete mode 100644 third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/memref/alloc.mlir
 delete mode 100644 third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/memref/collapse_shape.mlir
 delete mode 100644 third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/memref/copy.mlir
 delete mode 100644 third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/memref/dim.mlir
 delete mode 100644 third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/memref/expand_shape.mlir
 delete mode 100644 third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/memref/get_global.mlir
 delete mode 100644 third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/memref/invalid.mlir
 delete mode 100644 third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/memref/load.mlir
 delete mode 100644 third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/memref/subview.mlir
 delete mode 100644 third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/mhlo/bitcast_convert.mlir
 delete mode 100644 third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/mhlo/broadcast_in_dim.mlir
 delete mode 100644 third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/mhlo/case.mlir
 delete mode 100644 third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/mhlo/clamp.mlir
 delete mode 100644 third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/mhlo/compare.mlir
 delete mode 100644 third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/mhlo/complex_math.mlir
 delete mode 100644 third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/mhlo/compute_reshape_shape.mlir
 delete mode 100644 third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/mhlo/concatenate.mlir
 delete mode 100644 third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/mhlo/constant.mlir
 delete mode 100644 third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/mhlo/convert.mlir
 delete mode 100644 third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/mhlo/dot.mlir
 delete mode 100644 third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/mhlo/dot_general.mlir
 delete mode 100644 third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/mhlo/dynamic_slice.mlir
 delete mode 100644 third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/mhlo/dynamic_update_slice.mlir
 delete mode 100644 third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/mhlo/float_math.mlir
 delete mode 100644 third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/mhlo/gather.mlir
 delete mode 100644 third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/mhlo/int_math.mlir
 delete mode 100644 third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/mhlo/iota.mlir
 delete mode 100644 third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/mhlo/pad.mlir
 delete mode 100644 third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/mhlo/reduce.mlir
 delete mode 100644 third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/mhlo/reshape.mlir
 delete mode 100644 third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/mhlo/scatter.mlir
 delete mode 100644 third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/mhlo/select.mlir
 delete mode 100644 third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/mhlo/slice.mlir
 delete mode 100644 third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/mhlo/sort.mlir
 delete mode 100644 third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/mhlo/subtract.mlir
 delete mode 100644 third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/mhlo/transpose.mlir
 delete mode 100644 third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/mhlo/tuple.mlir
 delete mode 100644 third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/mhlo/while.mlir
 delete mode 100644 third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/scf/for.mlir
 delete mode 100644 third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/scf/forall.mlir
 delete mode 100644 third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/scf/if.mlir
 delete mode 100644 third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/scf/parallel.mlir
 delete mode 100644 third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/scf/while.mlir
 delete mode 100644 third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/tensor/collapse_shape.mlir
 delete mode 100644 third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/tensor/dim.mlir
 delete mode 100644 third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/tensor/empty.mlir
 delete mode 100644 third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/tensor/expand_shape.mlir
 delete mode 100644 third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/tensor/extract.mlir
 delete mode 100644 third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/tensor/extract_slice.mlir
 delete mode 100644 third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/tensor/from_elements.mlir
 delete mode 100644 third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/tensor/generate.mlir
 delete mode 100644 third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/tensor/insert.mlir
 delete mode 100644 third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/tensor/insert_slice.mlir
 delete mode 100644 third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/tensor/pad.mlir
 delete mode 100644 third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/thlo/concatenate.mlir
 delete mode 100644 third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/thlo/reverse.mlir
 delete mode 100644 third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/thlo/scatter.mlir
 delete mode 100644 third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/vector/bitcast.mlir
 delete mode 100644 third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/vector/broadcast.mlir
 delete mode 100644 third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/vector/compressstore.mlir
 delete mode 100644 third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/vector/constant_mask.mlir
 delete mode 100644 third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/vector/contract.mlir
 delete mode 100644 third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/vector/create_mask.mlir
 delete mode 100644 third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/vector/expandload.mlir
 delete mode 100644 third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/vector/extract.mlir
 delete mode 100644 third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/vector/extract_strided_slice.mlir
 delete mode 100644 third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/vector/extractelement.mlir
 delete mode 100644 third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/vector/flat_transpose.mlir
 delete mode 100644 third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/vector/fma.mlir
 delete mode 100644 third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/vector/gather.mlir
 delete mode 100644 third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/vector/insert.mlir
 delete mode 100644 third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/vector/insert_strided_slice.mlir
 delete mode 100644 third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/vector/insertelement.mlir
 delete mode 100644 third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/vector/invalid.mlir
 delete mode 100644 third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/vector/load.mlir
 delete mode 100644 third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/vector/maskedload.mlir
 delete mode 100644 third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/vector/maskedstore.mlir
 delete mode 100644 third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/vector/multi_reduction.mlir
 delete mode 100644 third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/vector/outerproduct.mlir
 delete mode 100644 third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/vector/reduction.mlir
 delete mode 100644 third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/vector/shape_cast.mlir
 delete mode 100644 third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/vector/shuffle.mlir
 delete mode 100644 third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/vector/splat.mlir
 delete mode 100644 third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/vector/store.mlir
 delete mode 100644 third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/vector/transfer_read.mlir
 delete mode 100644 third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/vector/transfer_write.mlir
 delete mode 100644 third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/vector/transpose.mlir
 delete mode 100644 third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/vector/type_cast.mlir
 delete mode 100644 third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/vector/vscale.mlir
 delete mode 100644 third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/xla_cpu/memref_element_cast.mlir
 delete mode 100644 third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/thlo.cc
 delete mode 100644 third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/util.cc
 delete mode 100644 third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/util.h
 delete mode 100644 third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/vector.cc
 delete mode 100644 third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/xla.cc
 delete mode 100644 third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/framework/interpreter.cc
 delete mode 100644 third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/framework/interpreter.h
 delete mode 100644 third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/framework/interpreter_value.cc
 delete mode 100644 third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/framework/interpreter_value.h
 delete mode 100644 third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/framework/interpreter_value_util.h
 delete mode 100644 third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/framework/registration.cc
 delete mode 100644 third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/framework/registration.h
 delete mode 100644 third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/framework/tensor_or_memref.cc
 delete mode 100644 third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/framework/tensor_or_memref.h
 delete mode 100644 third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/framework/tests/BUILD
 delete mode 100644 third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/framework/tests/interpreter_value_test.cc
 delete mode 100644 third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/framework/tests/tensor_or_memref_test.cc
 delete mode 100644 third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/mlir-interpreter-runner.cc

diff --git a/third_party/xla/xla/mlir/tools/mlir_bisect/BUILD b/third_party/xla/xla/mlir/tools/mlir_bisect/BUILD
deleted file mode 100644
index d5c2f3747f84b0..00000000000000
--- a/third_party/xla/xla/mlir/tools/mlir_bisect/BUILD
+++ /dev/null
@@ -1,69 +0,0 @@
-load("//xla:xla.bzl", "xla_cc_binary")
-load("@local_tsl//tsl/platform:rules_cc.bzl", "cc_library")
-load("@bazel_skylib//rules:build_test.bzl", "build_test")
-
-# copybara:uncomment package(default_applicable_licenses = ["//tensorflow:license"])
-
-build_test(
-    name = "mlir-bisect_build_test",
-    targets = [
-        ":mlir-bisect",
-    ],
-)
-
-xla_cc_binary(
-    name = "mlir-bisect",
-    srcs = ["mlir_bisect.cc"],
-    visibility = ["//visibility:public"],
-    deps = [
-        ":bisect_lib",
-        "//xla/mlir/runtime/ir:rt",
-        "//xla/mlir/tools/mlir_bisect/rewrites",
-        "//xla/mlir/tools/mlir_replay/public:execution_trace_utils",
-        "//xla/mlir_hlo:deallocation",
-        "//xla/mlir_hlo:deallocation_passes",
-        "//xla/mlir_hlo:gml_st",
-        "//xla/mlir_hlo:gml_st_passes",
-        "//xla/mlir_hlo:gml_st_test_passes",
-        "//xla/mlir_hlo:hlo_dialect_registration",
-        "//xla/mlir_hlo:lhlo",
-        "//xla/mlir_hlo:lmhlo_passes",
-        "//xla/mlir_hlo:mhlo_passes",
-        "//xla/mlir_hlo:mlir_interpreter_dialects",
-        "//xla/mlir_hlo:mlir_interpreter_framework",
-        "//xla/mlir_hlo:thlo",
-        "//xla/mlir_hlo:thlo_passes",
-        "//xla/service:hlo_proto_cc",
-        "@llvm-project//llvm:Support",
-        "@llvm-project//mlir:AllPassesAndDialects",
-        "@llvm-project//mlir:ArithDialect",
-        "@llvm-project//mlir:IR",
-        "@llvm-project//mlir:MlirReduceLib",
-        "@llvm-project//mlir:Pass",
-        "@llvm-project//mlir:Support",
-        "@local_tsl//tsl/platform:env",
-        "@local_tsl//tsl/platform:platform_port",
-    ],
-)
-
-cc_library(
-    name = "bisect_lib",
-    srcs = [
-        "bisect_lib.cc",
-        "test_passes.cc",
-    ],
-    hdrs = [
-        "bisect_lib.h",
-        "test_passes.h",
-    ],
-    visibility = ["//visibility:public"],
-    deps = [
-        "//xla/mlir/tools/mlir_replay/public:execution_trace_proto_cc",
-        "//xla/mlir/tools/mlir_replay/public:execution_trace_proto_cc_impl",
-        "//xla/mlir/tools/mlir_replay/public:execution_trace_utils",
-        "@llvm-project//mlir:IR",
-        "@llvm-project//mlir:LinalgDialect",
-        "@llvm-project//mlir:Pass",
-        "@llvm-project//mlir:Support",
-    ],
-)
diff --git a/third_party/xla/xla/mlir/tools/mlir_bisect/README.md b/third_party/xla/xla/mlir/tools/mlir_bisect/README.md
deleted file mode 100644
index 570e92b6e53802..00000000000000
--- a/third_party/xla/xla/mlir/tools/mlir_bisect/README.md
+++ /dev/null
@@ -1,85 +0,0 @@
-# MLIR HLO mlir_bisect
-
-This is a test case reduction tool, similar in purpose to `mlir-reduce`, but
-specific to the `mlir-interpreter` infrastructure. In particular, reductions can
-depend on concrete values encountered during execution, and reductions can (and
-usually do) generate multiple candidates.
-
-For example, the `ReplaceOpWithConstant` reduction will attempt to replace each
-op with each of its results. If the op is in a loop, each execution will be a
-candidate for replacement.
-
-## Using this tool
-
-1.  Run a JAX test with snapshots enabled:
-
-    ```
-    bazel test some-jax-test
-      --test_env=XLA_FLAGS="--xla_cpu_use_xla_runtime --xla_dump_to=/tmp/dump
-      --xla_dump_hlo_snapshots" --test_filter=SomeSpecific.Test
-      --test_sharding_strategy=disabled --test_strategy=local
-    ```
-
-1.  Figure out the culprit module and pass (sorry, no automation yet):
-
-    ```
-    bazel run tensorflow/compiler/xla/mlir/tools/mlir_replay:mlir_replay -- \
-      --mlir-compilation-trace=/tmp/dump/module_0000.jit__something.mlir-trace.pb \
-      --hlo-snapshot=/tmp/dump/module_0000.jit__something.snapshot.0.pb \
-      --print-changes-only \
-      --execution-trace-dir=/tmp/execution
-    ```
-
-    You should see a pass after which results change. You'll want to use the
-    .mlir file in `/tmp/execution` corresponding to the pass *before* that with
-    the bisect tool.
-
-    Note: If the failing pass is bufferization, you may have to use an earlier
-    snapshot, e.g. before EmptyTensorToAllocTensor.
-1.  Run bisect:
-
-    ```
-    bazel run tensorflow/compiler/xla/mlir/tools/mlir_bisect:mlir-bisect -- \
-      --hlo-snapshot=/tmp/dump/module_0000.jit_something.snapshot.0.pb \
-      --pass-pipeline="builtin.module(empty-tensor-to-alloc-tensor,one-shot-bufferize{allow-return-allocs bufferize-function-boundaries create-deallocs=0})" \
-      /tmp/execution/0052.ScalarizationPass.mlir
-    ```
-
-## Adding a reduction
-
-To add a reduction, create a function that generates the candidates and register
-it:
-
-```
-SmallVector<OwningOpRef<ModuleOp>>
-FrobulateAndDefenestrate(BisectState&, dialect::SomeOp some_op) {
-  auto [cloned_module_1, cloned_op_1] = CloneModuleFor(some_op);
-  Frobulate(cloned_op_1);
-
-  auto [cloned_module_2, cloned_op_2] = CloneModuleFor(some_op);
-  Defenestrate(cloned_op_2);
-
-  return {cloned_module_1, cloned_module_2};
-}
-
-REGISTER_MLIR_REDUCE_STRATEGY(FrobulateAndDefenestrate);
-```
-
-Then, add a test for the strategy. Make sure your strategy is linked into
-mlir-bisect and has `alwayslink` set.
-
-```
-// RUN: mlir-bisect %s --debug-strategy=FrobulateAndDefenestrate | FileCheck %s
-
-func.func @main() {
-  dialect.some_op()
-}
-
-// CHECK: func @main()
-// CHECK-NEXT: frobulated
-
-// CHECK: func @main()
-// CHECK-NEXT: defenestrated
-```
-
-`--debug-strategy` will print all candidates generated by the given strategy.
diff --git a/third_party/xla/xla/mlir/tools/mlir_bisect/bisect_lib.cc b/third_party/xla/xla/mlir/tools/mlir_bisect/bisect_lib.cc
deleted file mode 100644
index 6bf380926fe40b..00000000000000
--- a/third_party/xla/xla/mlir/tools/mlir_bisect/bisect_lib.cc
+++ /dev/null
@@ -1,81 +0,0 @@
-/* Copyright 2023 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "xla/mlir/tools/mlir_bisect/bisect_lib.h"
-
-#include <functional>
-#include <iterator>
-#include <tuple>
-#include <utility>
-
-#include "mlir/Support/LLVM.h"  // from @llvm-project
-
-namespace mlir {
-namespace bisect {
-
-Operation* FindInClone(Operation* op, ModuleOp clone) {
-  if (llvm::isa<ModuleOp>(op)) {
-    return clone;
-  }
-
-  auto* parent_clone = FindInClone(op->getParentOp(), clone);
-  auto cloned_ops =
-      parent_clone->getRegions()[op->getParentRegion()->getRegionNumber()]
-          .getOps();
-  for (auto [original_op, cloned_op] :
-       llvm::zip(op->getParentRegion()->getOps(), cloned_ops)) {
-    if (&original_op == op) {
-      return &cloned_op;
-    }
-  }
-
-  llvm_unreachable("Op not found in clone.");
-}
-
-std::pair<OwningOpRef<ModuleOp>, Operation*> CloneModuleFor(Operation* op) {
-  auto module = op->getParentOfType<ModuleOp>().clone();
-  return {OwningOpRef<ModuleOp>{module}, FindInClone(op, module)};
-}
-
-namespace detail {
-
-DenseMap<StringRef, std::function<CandidateVector(BisectState&, Operation*)>>&
-GetStrategies() {
-  static auto* strategies =
-      new DenseMap<StringRef,
-                   std::function<CandidateVector(BisectState&, Operation*)>>();
-  return *strategies;
-}
-
-void RegisterReduceStrategy(
-    StringRef name,
-    std::function<CandidateVector(BisectState&, Operation*)> fn) {
-  GetStrategies()[name] = fn;
-}
-
-CandidateVector GetCandidates(
-    const std::function<CandidateVector(BisectState&, Operation*)>& strategy,
-    BisectState& state, ModuleOp op) {
-  assert(strategy && "GetCandidates was passed a null strategy");
-  CandidateVector result;
-  op.lookupSymbol("main")->walk([&](Operation* subOp) {
-    llvm::move(strategy(state, subOp), std::back_inserter(result));
-  });
-  return result;
-}
-
-}  // namespace detail
-}  // namespace bisect
-}  // namespace mlir
diff --git a/third_party/xla/xla/mlir/tools/mlir_bisect/bisect_lib.h b/third_party/xla/xla/mlir/tools/mlir_bisect/bisect_lib.h
deleted file mode 100644
index d3d25471e52820..00000000000000
--- a/third_party/xla/xla/mlir/tools/mlir_bisect/bisect_lib.h
+++ /dev/null
@@ -1,96 +0,0 @@
-/* Copyright 2023 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef XLA_MLIR_TOOLS_MLIR_BISECT_BISECT_LIB_H_
-#define XLA_MLIR_TOOLS_MLIR_BISECT_BISECT_LIB_H_
-
-#include <functional>
-#include <tuple>
-#include <utility>
-
-#include "mlir/IR/BuiltinOps.h"  // from @llvm-project
-#include "mlir/Support/LLVM.h"  // from @llvm-project
-#include "xla/mlir/tools/mlir_replay/public/execution_trace.pb.h"
-#include "xla/mlir/tools/mlir_replay/public/execution_trace_utils.h"
-
-#define REGISTER_MLIR_REDUCE_STRATEGY(name)                      \
-  static int name##_init = []() {                                \
-    ::mlir::bisect::detail::RegisterReduceStrategy(#name, name); \
-    return 1;                                                    \
-  }();
-
-namespace mlir {
-namespace bisect {
-
-class BisectState {
- public:
-  void SetTrace(mlir::interpreter::ExecutionTrace trace) {
-    trace_ = std::move(trace);
-  }
-
-  // Returns all executions of the given op.
-  llvm::SmallVector<const interpreter::InstructionTrace*> GetExecutions(
-      mlir::Operation* op) const {
-    return interpreter::FindOpExecutionsInTrace(trace_, op);
-  }
-
- private:
-  mlir::interpreter::ExecutionTrace trace_;
-};
-
-std::pair<OwningOpRef<ModuleOp>, Operation*> CloneModuleFor(Operation* op);
-Operation* FindInClone(Operation* op, ModuleOp clone);
-
-template <typename Op>
-std::pair<OwningOpRef<ModuleOp>, Op> CloneModuleFor(Op op) {
-  auto [module, op_clone] = CloneModuleFor(op.getOperation());
-  return {std::move(module), llvm::cast<Op>(op_clone)};
-}
-
-namespace detail {
-
-using CandidateVector = SmallVector<std::function<OwningOpRef<ModuleOp>()>>;
-
-CandidateVector GetCandidates(
-    const std::function<CandidateVector(BisectState&, Operation*)>& strategy,
-    BisectState& state, ModuleOp op);
-
-DenseMap<StringRef, std::function<CandidateVector(BisectState&, Operation*)>>&
-GetStrategies();
-
-// Registers a strategy that applies to all ops.
-void RegisterReduceStrategy(
-    StringRef name,
-    std::function<CandidateVector(BisectState&, Operation*)> fn);
-
-// Registers a strategy that applies to specific ops.
-template <typename Op>
-void RegisterReduceStrategy(StringRef name,
-                            CandidateVector (*fn)(BisectState&, Op)) {
-  RegisterReduceStrategy(
-      name, [fn](BisectState& state, Operation* op) -> CandidateVector {
-        if (auto cast = llvm::dyn_cast<Op>(op)) {
-          return fn(state, cast);
-        }
-        return {};
-      });
-}
-
-}  // namespace detail
-
-}  // namespace bisect
-}  // namespace mlir
-
-#endif  // XLA_MLIR_TOOLS_MLIR_BISECT_BISECT_LIB_H_
diff --git a/third_party/xla/xla/mlir/tools/mlir_bisect/mlir_bisect.cc b/third_party/xla/xla/mlir/tools/mlir_bisect/mlir_bisect.cc
deleted file mode 100644
index b5fbb3ec930fd7..00000000000000
--- a/third_party/xla/xla/mlir/tools/mlir_bisect/mlir_bisect.cc
+++ /dev/null
@@ -1,363 +0,0 @@
-/* Copyright 2023 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include <algorithm>
-#include <iterator>
-#include <memory>
-#include <optional>
-#include <string>
-#include <utility>
-
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/SourceMgr.h"
-#include "llvm/Support/raw_ostream.h"
-#include "mlir/Dialect/Arith/IR/Arith.h"  // from @llvm-project
-#include "mlir/IR/BuiltinOps.h"  // from @llvm-project
-#include "mlir/IR/DialectRegistry.h"  // from @llvm-project
-#include "mlir/IR/MLIRContext.h"  // from @llvm-project
-#include "mlir/IR/OwningOpRef.h"  // from @llvm-project
-#include "mlir/IR/Verifier.h"  // from @llvm-project
-#include "mlir/InitAllDialects.h"  // from @llvm-project
-#include "mlir/InitAllPasses.h"  // from @llvm-project
-#include "mlir/Pass/PassManager.h"  // from @llvm-project
-#include "mlir/Pass/PassRegistry.h"  // from @llvm-project
-#include "mlir/Support/FileUtilities.h"  // from @llvm-project
-#include "mlir/Tools/ParseUtilities.h"  // from @llvm-project
-#include "xla/mlir/runtime/ir/rt_dialect.h"
-#include "xla/mlir/tools/mlir_bisect/bisect_lib.h"
-#include "xla/mlir/tools/mlir_bisect/test_passes.h"
-#include "xla/mlir/tools/mlir_replay/public/execution_trace_utils.h"
-#include "xla/mlir_hlo/deallocation/IR/deallocation_ops.h"
-#include "xla/mlir_hlo/deallocation/transforms/passes.h"
-#include "xla/mlir_hlo/gml_st/IR/gml_st_ops.h"
-#include "xla/mlir_hlo/gml_st/transforms/passes.h"
-#include "xla/mlir_hlo/gml_st/transforms/test_passes.h"
-#include "xla/mlir_hlo/lhlo/IR/lhlo_ops.h"
-#include "xla/mlir_hlo/lhlo/transforms/passes.h"
-#include "xla/mlir_hlo/mhlo/IR/register.h"
-#include "xla/mlir_hlo/mhlo/transforms/passes.h"
-#include "xla/mlir_hlo/thlo/IR/thlo_ops.h"
-#include "xla/mlir_hlo/thlo/transforms/passes.h"
-#include "xla/mlir_hlo/tools/mlir_interpreter/framework/interpreter.h"
-#include "xla/service/hlo.pb.h"
-#include "tsl/platform/env.h"
-#include "tsl/platform/init_main.h"
-
-struct Options {
-  llvm::cl::opt<std::string> input_filename{llvm::cl::Positional,
-                                            llvm::cl::desc("<input file>"),
-                                            llvm::cl::init("-")};
-  llvm::cl::opt<std::string> hlo_snapshot{
-      "hlo-snapshot",
-      llvm::cl::desc(
-          "If set, get argument values from the given snapshot. If not set, "
-          "the input function must not have any arguments."),
-      llvm::cl::init("")};
-  llvm::cl::opt<std::string> debug_strategy{
-      "debug-strategy",
-      llvm::cl::desc("If set, print all reductions for the given strategy and "
-                     "exit. For testing."),
-      llvm::cl::init("")};
-  llvm::cl::opt<std::string> expected_error{
-      "expected-error",
-      llvm::cl::desc("If set, expect the given error message after applying "
-                     "the pass instead of a successful execution."),
-      llvm::cl::init("")};
-  llvm::cl::opt<int64_t> max_steps_per_run{
-      "max-steps-per-run",
-      llvm::cl::desc("Maximum number of steps to execute for each attempt."),
-      llvm::cl::init(100000)};
-  mlir::PassPipelineCLParser pass_pipeline{"", "Passes to run"};
-  llvm::cl::opt<bool> canonicalize{
-      "enable-canonicalization",
-      llvm::cl::desc("If set, canonicalize candidates before trying them. Set "
-                     "to false if you're bisecting --canonicalize."),
-      llvm::cl::init(true)};
-};
-
-namespace mlir {
-namespace bisect {
-namespace {
-
-OwningOpRef<ModuleOp> ParseMlirInput(llvm::StringRef inputFilename,
-                                     MLIRContext* context) {
-  std::string error_message;
-  auto file = mlir::openInputFile(inputFilename, &error_message);
-  if (!file) {
-    llvm::errs() << error_message << "\n";
-    return {};
-  }
-
-  auto source_mgr = std::make_shared<llvm::SourceMgr>();
-  source_mgr->AddNewSourceBuffer(std::move(file), SMLoc());
-  return parseSourceFile<ModuleOp>(source_mgr, context);
-}
-
-LogicalResult RunPipeline(ModuleOp module, const Options& options) {
-  if (!options.pass_pipeline.hasAnyOccurrences()) {
-    return mlir::success();
-  }
-
-  auto error_handler = [&](const Twine& msg) {
-    llvm::errs() << msg << "\n";
-    return failure();
-  };
-  PassManager pm(module.getContext());
-  if (failed(options.pass_pipeline.addToPipeline(pm, error_handler)) ||
-      failed(pm.run(module))) {
-    llvm::errs() << "pipeline failed\n";
-    return failure();
-  }
-  return success();
-}
-
-LogicalResult Run(mlir::Operation* module, interpreter::ExecutionTrace* trace,
-                  const Options& options) {
-  SymbolTable symbol_table{module};
-  interpreter::ExecutionTraceListener tracer(trace);
-  interpreter::InterpreterOptions interpreter_options;
-  interpreter_options.listener = &tracer;
-  interpreter_options.maxSteps = options.max_steps_per_run;
-  auto results_before_pass = interpreter::runInterpreter(
-      symbol_table, llvm::cast<func::FuncOp>(symbol_table.lookup("main")), {},
-      interpreter_options);
-
-  if (!succeeded(results_before_pass)) {
-    llvm::errs() << "Interpreter failed\n";
-    return failure();
-  }
-
-  if (!options.debug_strategy.empty()) {
-    return success();
-  }
-
-  OwningOpRef<ModuleOp> clone(llvm::cast<ModuleOp>(module).clone());
-  if (!succeeded(RunPipeline(*clone, options))) {
-    return failure();
-  }
-
-  SymbolTable symbol_table_after{*clone};
-  interpreter_options.listener = nullptr;
-  bool found_expected_error = false;
-  if (!options.expected_error.empty()) {
-    auto original_handler = interpreter_options.errorHandler;
-    interpreter_options.errorHandler = [&](llvm::StringRef failure) {
-      found_expected_error |=
-          failure.find(options.expected_error) != std::string::npos;
-      original_handler(failure);
-    };
-  }
-
-  auto results_after_pass = interpreter::runInterpreter(
-      symbol_table_after,
-      llvm::cast<func::FuncOp>(symbol_table_after.lookup("main")), {},
-      interpreter_options);
-
-  if (!succeeded(results_after_pass)) {
-    if (found_expected_error) {
-      return success();
-    }
-    llvm::errs() << "Interpreter failed\n";
-    return failure();
-  } else if (!options.expected_error.empty()) {
-    llvm::errs() << "Expected error not seen\n";
-    return failure();
-  }
-
-  // If the results are the same, the bug is no longer present.
-  if (*results_before_pass == *results_after_pass) {
-    return failure();
-  }
-
-  llvm::errs() << "results before:\n";
-  for (auto& result : *results_before_pass) {
-    llvm::errs() << "  " << result.toString() << "\n";
-  }
-  llvm::errs() << "\nresults after:\n";
-  for (auto& result : *results_after_pass) {
-    llvm::errs() << "  " << result.toString() << "\n";
-  }
-
-  return success();
-}
-
-LogicalResult Canonicalize(ModuleOp module) {
-  PassManager pm(module.getContext());
-  pm.addPass(createCanonicalizerPass());
-  return pm.run(module.getOperation());
-}
-
-OwningOpRef<ModuleOp> ReduceModule(OwningOpRef<ModuleOp> module,
-                                   BisectState& state, const Options& options) {
-  auto strategies = llvm::to_vector(mlir::bisect::detail::GetStrategies());
-
-  auto apply_step = [&]() -> std::optional<OwningOpRef<ModuleOp>> {
-    for (auto it = strategies.begin(); it != strategies.end(); ++it) {
-      for (auto& candidate_fn :
-           detail::GetCandidates(it->second, state, *module)) {
-        auto candidate = candidate_fn();
-        if (!candidate || !mlir::verify(*candidate).succeeded()) {
-          continue;
-        }
-        if (options.canonicalize && !Canonicalize(*candidate).succeeded()) {
-          continue;
-        }
-
-        interpreter::ExecutionTrace trace;
-        // Verify that the candidate is still buggy.
-        if (!Run(*candidate, &trace, options).succeeded()) {
-          continue;
-        }
-
-        // Print the new buggy module.
-        llvm::outs() << "module after " << it->first << ":\n"
-                     << *candidate << "\n\n";
-
-        // Update the trace.
-        state.SetTrace(trace);
-
-        // Move strategies to the end.
-        decltype(strategies) new_strategies;
-        std::copy(it + 1, strategies.end(), std::back_inserter(new_strategies));
-        std::copy(strategies.begin(), it + 1,
-                  std::back_inserter(new_strategies));
-        strategies = new_strategies;
-        return {candidate.release()};
-      }
-    }
-    return std::nullopt;
-  };
-
-  while (auto new_module = apply_step()) {
-    module = std::move(*new_module);
-  }
-  return module;
-}
-
-void ReplaceArgsWithConstants(ModuleOp module,
-                              const xla::HloSnapshot& snapshot) {
-  auto main = llvm::cast<func::FuncOp>(module.lookupSymbol("main"));
-  OpBuilder b(main.getBody());
-  for (auto [arg, bbarg] :
-       llvm::zip(snapshot.arguments(), main.getBody().getArguments())) {
-    auto attr = interpreter::ValueToAttribute(
-        *interpreter::LiteralToValue(*xla::Literal::CreateFromProto(arg)),
-        bbarg.getType());
-    CHECK_EQ(attr.size(), 1) << "unsupported argument";
-
-    auto constant = b.create<arith::ConstantOp>(
-        main.getLoc(), bbarg.getType(), llvm::cast<TypedAttr>(attr.front()));
-    bbarg.replaceAllUsesWith(constant);
-  }
-
-  // The remaining ops are output args, so we replace them with allocs.
-  for (auto arg :
-       main.getBody().getArguments().drop_front(snapshot.arguments().size())) {
-    CHECK(llvm::isa<MemRefType>(arg.getType())) << "unsupported argument";
-    arg.replaceAllUsesWith(b.create<memref::AllocOp>(
-        module.getLoc(), llvm::cast<MemRefType>(arg.getType())));
-  }
-  while (main.getBody().getNumArguments() > 0) {
-    main.getBody().eraseArgument(0);
-  }
-  main.setFunctionType(FunctionType::get(main.getContext(), /*inputs=*/{},
-                                         main.getFunctionType().getResults()));
-  main.setArgAttrsAttr(b.getArrayAttr({}));
-}
-
-}  // namespace
-}  // namespace bisect
-}  // namespace mlir
-
-int main(int argc, char* argv[]) {
-  llvm::errs().tie(&llvm::outs());
-  llvm::outs().tie(&llvm::errs());
-  int dummy_argc = 1;
-  tsl::port::InitMain("", &dummy_argc, &argv);
-
-  Options options;
-  llvm::cl::ParseCommandLineOptions(argc, argv, "MLIR bisect tool\n");
-
-  mlir::DialectRegistry registry;
-  mlir::registerAllDialects(registry);
-  mlir::registerAllPasses();
-  mlir::bisect::test::RegisterTestPasses();
-  mlir::mhlo::registerAllMhloPasses();
-  mlir::lmhlo::registerAllLmhloPasses();
-  mlir::thlo::registerAllThloPasses();
-  mlir::gml_st::registerGmlStPasses();
-  mlir::gml_st::registerGmlStTestPasses();
-  mlir::mhlo::registerAllMhloDialects(registry);
-  mlir::deallocation::registerDeallocationPasses();
-
-  registry.insert<mlir::lmhlo::LmhloDialect, mlir::gml_st::GmlStDialect,
-                  mlir::deallocation::DeallocationDialect,
-                  mlir::arith::ArithDialect, mlir::thlo::THLODialect,
-                  xla::runtime::RuntimeDialect>();
-
-  mlir::MLIRContext context(registry);
-  context.getOrLoadDialect<mlir::arith::ArithDialect>();
-  auto module = mlir::bisect::ParseMlirInput(options.input_filename, &context);
-
-  if (!options.hlo_snapshot.empty()) {
-    xla::HloSnapshot snapshot;
-    TF_CHECK_OK(tsl::ReadBinaryProto(tsl::Env::Default(), options.hlo_snapshot,
-                                     &snapshot));
-    mlir::bisect::ReplaceArgsWithConstants(*module, snapshot);
-  }
-
-  if (options.debug_strategy.empty()) {
-    llvm::outs() << "initial module:\n" << *module << "\n";
-  }
-
-  mlir::interpreter::ExecutionTrace trace;
-  if (!mlir::bisect::Run(*module, &trace, options).succeeded()) {
-    llvm::outs() << "Did not find bug in initial module\n";
-    if (options.pass_pipeline.hasAnyOccurrences() &&
-        mlir::succeeded(mlir::bisect::RunPipeline(*module, options))) {
-      llvm::outs() << "Module after running pipeline:\n" << *module << "\n";
-    }
-    return 1;
-  }
-
-  mlir::bisect::BisectState state;
-  state.SetTrace(trace);
-  if (!options.debug_strategy.empty()) {
-    bool some_failed = false;
-    for (auto& candidate : mlir::bisect::detail::GetCandidates(
-             mlir::bisect::detail::GetStrategies()[options.debug_strategy],
-             state, *module)) {
-      auto new_module = candidate();
-      if (!new_module) {
-        continue;
-      }
-      llvm::outs() << *new_module << "\n\n";
-      if (!mlir::verify(*new_module).succeeded()) {
-        some_failed = true;
-        llvm::errs() << "verification failed\n";
-      }
-    }
-    return some_failed ? 1 : 0;
-  }
-
-  module = mlir::bisect::ReduceModule(std::move(module), state, options);
-
-  llvm::outs() << "Final module:\n" << *module << "\n";
-  if (options.pass_pipeline.hasAnyOccurrences() &&
-      mlir::succeeded(mlir::bisect::RunPipeline(*module, options))) {
-    llvm::outs() << "Final module after running pipeline:\n" << *module << "\n";
-  }
-  return 0;
-}
diff --git a/third_party/xla/xla/mlir/tools/mlir_bisect/rewrites/BUILD b/third_party/xla/xla/mlir/tools/mlir_bisect/rewrites/BUILD
deleted file mode 100644
index ef89338a3671a7..00000000000000
--- a/third_party/xla/xla/mlir/tools/mlir_bisect/rewrites/BUILD
+++ /dev/null
@@ -1,26 +0,0 @@
-load("@local_tsl//tsl/platform:rules_cc.bzl", "cc_library")
-
-# copybara:uncomment package(default_applicable_licenses = ["//tensorflow:license"])
-
-cc_library(
-    name = "rewrites",
-    srcs = [
-        "func.cc",
-        "general.cc",
-        "scf.cc",
-    ],
-    visibility = ["//visibility:public"],
-    deps = [
-        "//xla/mlir/tools/mlir_bisect:bisect_lib",
-        "//xla/mlir/tools/mlir_replay/public:execution_trace_utils",
-        "//xla/mlir_hlo:gml_st",
-        "@llvm-project//llvm:Support",
-        "@llvm-project//mlir:ArithDialect",
-        "@llvm-project//mlir:DialectUtils",
-        "@llvm-project//mlir:FuncDialect",
-        "@llvm-project//mlir:IR",
-        "@llvm-project//mlir:SCFDialect",
-        "@llvm-project//mlir:Support",
-    ],
-    alwayslink = 1,
-)
diff --git a/third_party/xla/xla/mlir/tools/mlir_bisect/rewrites/func.cc b/third_party/xla/xla/mlir/tools/mlir_bisect/rewrites/func.cc
deleted file mode 100644
index d8e6257615d221..00000000000000
--- a/third_party/xla/xla/mlir/tools/mlir_bisect/rewrites/func.cc
+++ /dev/null
@@ -1,80 +0,0 @@
-/* Copyright 2023 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include <functional>
-#include <iterator>
-#include <utility>
-
-#include "mlir/Dialect/Func/IR/FuncOps.h"  // from @llvm-project
-#include "mlir/Support/LLVM.h"  // from @llvm-project
-#include "xla/mlir/tools/mlir_bisect/bisect_lib.h"
-
-namespace mlir {
-namespace bisect {
-namespace {
-
-void SetReturnValues(func::FuncOp func, ValueRange values) {
-  // We only operate on functions without arguments.
-  func.setFunctionType(mlir::FunctionType::get(func.getContext(), /*inputs=*/{},
-                                               values.getTypes()));
-  func.getBody().getBlocks().front().getTerminator()->setOperands(values);
-}
-
-SmallVector<std::function<OwningOpRef<ModuleOp>()>> TruncateFunction(
-    BisectState&, func::FuncOp func) {
-  SmallVector<std::function<OwningOpRef<ModuleOp>()>> result;
-  for (auto& ret : func.getBody().getBlocks().front().without_terminator()) {
-    if (func.getBody().getBlocks().front().getTerminator()->getOperands() ==
-        ret.getResults()) {
-      continue;
-    }
-    auto fun = [r = &ret]() -> OwningOpRef<ModuleOp> {
-      auto [module, ret_clone] = CloneModuleFor(r);
-      SetReturnValues(ret_clone->getParentOfType<func::FuncOp>(),
-                      ret_clone->getResults());
-      return std::move(module);
-    };
-    result.push_back(fun);
-  }
-  return result;
-}
-
-SmallVector<std::function<OwningOpRef<ModuleOp>()>>
-ReturnOperandsOfTerminatorOperands(BisectState&, func::FuncOp func) {
-  SmallVector<std::function<OwningOpRef<ModuleOp>()>> result;
-  result.push_back([func]() -> OwningOpRef<ModuleOp> {
-    auto [module, func_clone] = CloneModuleFor(func);
-    auto* terminator = func_clone.getBody().getBlocks().front().getTerminator();
-    SmallVector<Value> new_operands;
-    for (auto operand : terminator->getOperands()) {
-      if (operand.getDefiningOp()) {
-        llvm::copy(operand.getDefiningOp()->getOperands(),
-                   std::back_inserter(new_operands));
-      } else {
-        return nullptr;
-      }
-    }
-    SetReturnValues(func_clone, new_operands);
-    return std::move(module);
-  });
-  return result;
-}
-
-REGISTER_MLIR_REDUCE_STRATEGY(TruncateFunction);
-REGISTER_MLIR_REDUCE_STRATEGY(ReturnOperandsOfTerminatorOperands);
-
-}  // namespace
-}  // namespace bisect
-}  // namespace mlir
diff --git a/third_party/xla/xla/mlir/tools/mlir_bisect/rewrites/general.cc b/third_party/xla/xla/mlir/tools/mlir_bisect/rewrites/general.cc
deleted file mode 100644
index 7b624016c3e028..00000000000000
--- a/third_party/xla/xla/mlir/tools/mlir_bisect/rewrites/general.cc
+++ /dev/null
@@ -1,186 +0,0 @@
-/* Copyright 2023 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include <functional>
-#include <utility>
-
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SmallVector.h"
-#include "mlir/Dialect/Arith/IR/Arith.h"  // from @llvm-project
-#include "mlir/IR/Builders.h"  // from @llvm-project
-#include "mlir/IR/OpDefinition.h"  // from @llvm-project
-#include "mlir/Interfaces/SideEffectInterfaces.h"  // from @llvm-project
-#include "mlir/Support/LLVM.h"  // from @llvm-project
-#include "xla/mlir/tools/mlir_bisect/bisect_lib.h"
-#include "xla/mlir/tools/mlir_replay/public/execution_trace_utils.h"
-
-namespace mlir {
-namespace bisect {
-namespace {
-
-bool IsTerminator(Operation* op) {
-  return op->hasTrait<OpTrait::IsTerminator>();
-}
-
-bool IsTopLevelOp(Operation* op) {
-  return !op->getBlock()->back().mightHaveTrait<OpTrait::IsTerminator>();
-}
-
-SmallVector<std::function<OwningOpRef<ModuleOp>()>> EraseOpWithoutResults(
-    BisectState& state, Operation* op) {
-  // Only erase ops with results if they're unused.
-  if (op->getNumResults() > 0 && !op->use_empty()) {
-    return {};
-  }
-
-  // Don't erase entire functions, constants, terminators.
-  if (IsTopLevelOp(op) || IsTerminator(op)) {
-    return {};
-  }
-
-  SmallVector<std::function<OwningOpRef<ModuleOp>()>> ret;
-  ret.push_back([op]() {
-    auto [module, cloned_op] = CloneModuleFor(op);
-    cloned_op->erase();
-    return std::move(module);
-  });
-  return ret;
-}
-
-llvm::SmallVector<std::function<OwningOpRef<ModuleOp>()>> ReplaceOpWithConstant(
-    BisectState& state, Operation* op) {
-  llvm::SmallVector<std::function<OwningOpRef<ModuleOp>()>> result;
-  if (op->hasTrait<OpTrait::ConstantLike>() || IsTopLevelOp(op) ||
-      IsTerminator(op) || op->use_empty() || op->getNumResults() == 0) {
-    return result;
-  }
-
-  auto mii = llvm::dyn_cast<MemoryEffectOpInterface>(op);
-  if (mii && mii.hasEffect<MemoryEffects::Allocate>()) {
-    // Don't replace allocs with constants.
-    return result;
-  }
-
-  // Ops that are never executed won't be replaced here, but we have other
-  // strategies that get rid of them (e.g. deleting the entire region).
-  for (auto* execution : state.GetExecutions(op)) {
-    assert(execution->results_size() == op->getNumResults() &&
-           "unexpected number of results");
-
-    result.push_back([execution, op]() -> OwningOpRef<ModuleOp> {
-      auto [module_clone, op_clone] = CloneModuleFor(op);
-      SmallVector<Value> results;
-      OpBuilder b(op_clone);
-      for (int64_t i = 0; i < op->getNumResults(); ++i) {
-        auto type = op->getResultTypes()[i];
-        auto value = *interpreter::TracedValueToValue(
-            execution->results(static_cast<int>(i)));
-        auto attribute = interpreter::ValueToAttribute(value, type);
-        // We don't currently support tuples.
-        if (attribute.size() != 1) {
-          return nullptr;
-        }
-        op_clone->getResults()[i].replaceAllUsesWith(
-            b.create<arith::ConstantOp>(
-                op_clone->getLoc(), type,
-                llvm::cast<TypedAttr>(attribute.front())));
-      }
-      return std::move(module_clone);
-    });
-  }
-  return result;
-}
-
-llvm::SmallVector<std::function<OwningOpRef<ModuleOp>()>>
-ReplaceOperandWithConstant(BisectState& state, Operation* op) {
-  llvm::SmallVector<std::function<OwningOpRef<ModuleOp>()>> result;
-  if (IsTopLevelOp(op) || op->getNumOperands() == 0) {
-    return result;
-  }
-
-  for (auto* execution : state.GetExecutions(op)) {
-    for (int64_t i = 0; i < op->getNumOperands(); ++i) {
-      auto operand = op->getOperand(i);
-      if (operand.getDefiningOp() &&
-          operand.getDefiningOp()->hasTrait<OpTrait::ConstantLike>()) {
-        continue;
-      }
-      result.push_back([execution, i, op]() -> OwningOpRef<ModuleOp> {
-        auto type = op->getOperandTypes()[i];
-        auto value = *interpreter::TracedValueToValue(
-            execution->args(static_cast<int>(i)));
-        auto attribute = interpreter::ValueToAttribute(value, type);
-        if (attribute.size() != 1) {
-          return nullptr;
-        }
-        auto [module_clone, op_clone] = CloneModuleFor(op);
-        OpBuilder b(op_clone);
-        op_clone->setOperand(i, b.create<arith::ConstantOp>(
-                                    op_clone->getLoc(), type,
-                                    llvm::cast<TypedAttr>(attribute.front())));
-        return std::move(module_clone);
-      });
-    }
-  }
-  return result;
-}
-
-// Replaces an op's result with some other value with the same type defined
-// previously in the same region.
-llvm::SmallVector<std::function<OwningOpRef<ModuleOp>()>> ReplaceOpWithValue(
-    BisectState&, Operation* op) {
-  llvm::SmallVector<std::function<OwningOpRef<ModuleOp>()>> ret;
-  if (op->hasTrait<OpTrait::ConstantLike>() || IsTopLevelOp(op) ||
-      IsTerminator(op)) {
-    return ret;
-  }
-
-  // TODO(jreiffers): Consider bbargs.
-  llvm::DenseMap<mlir::Type, llvm::SmallVector<std::pair<Operation*, int64_t>>>
-      candidates_by_type;
-  for (auto* pred = op->getPrevNode(); pred != nullptr;
-       pred = pred->getPrevNode()) {
-    for (auto [index, result] : llvm::enumerate(pred->getResults())) {
-      candidates_by_type[result.getType()].emplace_back(pred, index);
-    }
-  }
-
-  for (auto [index, result] : llvm::enumerate(op->getResults())) {
-    if (result.use_empty()) {
-      continue;
-    }
-
-    for (auto [new_result_op, new_result_index] :
-         candidates_by_type[result.getType()]) {
-      ret.push_back(
-          [op, i = index, j = new_result_index, result_op = new_result_op]() {
-            auto [module_clone, op_clone] = CloneModuleFor(op);
-            op_clone->getResults()[i].replaceAllUsesWith(
-                FindInClone(result_op, module_clone.get())->getResults()[j]);
-            return std::move(module_clone);
-          });
-    }
-  }
-  return ret;
-}
-
-REGISTER_MLIR_REDUCE_STRATEGY(EraseOpWithoutResults);
-REGISTER_MLIR_REDUCE_STRATEGY(ReplaceOpWithConstant);
-REGISTER_MLIR_REDUCE_STRATEGY(ReplaceOpWithValue);
-REGISTER_MLIR_REDUCE_STRATEGY(ReplaceOperandWithConstant);
-
-}  // namespace
-}  // namespace bisect
-}  // namespace mlir
diff --git a/third_party/xla/xla/mlir/tools/mlir_bisect/rewrites/scf.cc b/third_party/xla/xla/mlir/tools/mlir_bisect/rewrites/scf.cc
deleted file mode 100644
index 72f90372b7d5f9..00000000000000
--- a/third_party/xla/xla/mlir/tools/mlir_bisect/rewrites/scf.cc
+++ /dev/null
@@ -1,139 +0,0 @@
-/* Copyright 2023 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "mlir/Dialect/SCF/IR/SCF.h"  // from @llvm-project
-
-#include <functional>  // NOLINT
-#include <utility>     // NOLINT
-
-#include "mlir/Dialect/Utils/StaticValueUtils.h"  // from @llvm-project
-#include "mlir/IR/Builders.h"  // from @llvm-project
-#include "mlir/IR/OwningOpRef.h"  // from @llvm-project
-#include "mlir/IR/PatternMatch.h"  // from @llvm-project
-#include "mlir/Support/LLVM.h"  // from @llvm-project
-#include "xla/mlir/tools/mlir_bisect/bisect_lib.h"
-
-namespace mlir {
-namespace bisect {
-namespace {
-
-constexpr int64_t kMaxWhileIterations = 1;
-
-// Rewrites a while loop to execute its body a fixed number of times. The
-// condition is executed, but its result is ignored.
-// For ease of implementation, this generates scf.execute_region ops. These are
-// subsequently canonicalized away.
-llvm::SmallVector<std::function<OwningOpRef<ModuleOp>()>> InlineScfWhile(
-    BisectState&, scf::WhileOp while_op) {
-  llvm::SmallVector<std::function<OwningOpRef<ModuleOp>()>> result;
-  for (int64_t num_executions = 0; num_executions <= kMaxWhileIterations;
-       ++num_executions) {
-    using ::mlir::scf::ExecuteRegionOp;
-
-    result.push_back([while_op, num_executions]() -> OwningOpRef<ModuleOp> {
-      auto [module, op] = CloneModuleFor(while_op);
-      OpBuilder b(op);
-      llvm::SmallVector<scf::ExecuteRegionOp> regions;
-
-      auto wrap_region_in_execute = [&,
-                                     loc = op.getLoc()](mlir::Region& region) {
-        regions
-            .emplace_back(b.create<ExecuteRegionOp>(
-                loc,
-                region.getBlocks().front().getTerminator()->getOperandTypes(),
-                mlir::ValueRange{}))
-            .getRegion()
-            .takeBody(region);
-      };
-
-      wrap_region_in_execute(op.getBefore());
-      // Replace the condition terminator with a yield terminator.
-      {
-        auto& before_block = regions[0].getRegion().getBlocks().front();
-        OpBuilder before_builder(before_block.getTerminator());
-        IRRewriter before_rewriter(before_builder);
-        before_rewriter.replaceOpWithNewOp<scf::YieldOp>(
-            before_block.getTerminator(),
-            before_block.getTerminator()->getOperands());
-      }
-
-      // Clone the execute region ops the requested number of times.
-      if (num_executions > 0) {
-        wrap_region_in_execute(op.getAfter());
-        for (int64_t i = 0; i < num_executions - 1; ++i) {
-          b.insert(regions.emplace_back(regions[0].clone()));
-          b.insert(regions.emplace_back(regions[1].clone()));
-        }
-        b.insert(regions.emplace_back(regions[0].clone()));
-      }
-
-      // Rewire region arguments and erase them.
-      for (int64_t i = 0; i < regions.size(); ++i) {
-        auto args = i == 0 ? ValueRange{op.getOperands()}
-                           : ValueRange{regions[i - 1].getResults()};
-        bool is_after_region = (i & 1) == 1;
-        auto& region = regions[i].getRegion();
-        for (int64_t arg = static_cast<int64_t>(region.getNumArguments()) - 1;
-             arg >= 0; --arg) {
-          region.getArgument(arg).replaceAllUsesWith(
-              args[is_after_region ? arg + 1 : arg]);
-          region.eraseArgument(arg);
-        }
-      }
-      op->replaceAllUsesWith(regions.back().getResults().drop_front(1));
-      op->erase();
-      return std::move(module);
-    });
-  }
-  return result;
-}
-
-SmallVector<std::function<OwningOpRef<ModuleOp>()>> ReduceScfForallBounds(
-    BisectState&, scf::ForallOp forall_op) {
-  SmallVector<OpFoldResult> new_upper_bound{forall_op.getMixedUpperBound()};
-  OpBuilder b(forall_op);
-  bool any_replaced = false;
-  for (auto& ub : new_upper_bound) {
-    auto constant_or = mlir::getConstantIntValue(ub);
-    if (!constant_or.has_value()) {
-      continue;
-    }
-    any_replaced = true;
-    ub = b.getIndexAttr(*constant_or - 1);
-  }
-  SmallVector<std::function<OwningOpRef<ModuleOp>()>> result;
-  if (!any_replaced) {
-    return result;
-  }
-  result.push_back([=]() -> OwningOpRef<ModuleOp> {
-    auto [module, op] = CloneModuleFor(forall_op);
-    OpBuilder b(op);
-    SmallVector<Value> dynamic_upper_bound;
-    SmallVector<int64_t> static_upper_bound;
-    dispatchIndexOpFoldResults(new_upper_bound, dynamic_upper_bound,
-                               static_upper_bound);
-    op.getDynamicUpperBoundMutable().assign(dynamic_upper_bound);
-    op.setStaticUpperBound(static_upper_bound);
-    return std::move(module);
-  });
-  return result;
-}
-
-REGISTER_MLIR_REDUCE_STRATEGY(ReduceScfForallBounds);
-REGISTER_MLIR_REDUCE_STRATEGY(InlineScfWhile);
-
-}  // namespace
-}  // namespace bisect
-}  // namespace mlir
diff --git a/third_party/xla/xla/mlir/tools/mlir_bisect/rewrites/tests/BUILD b/third_party/xla/xla/mlir/tools/mlir_bisect/rewrites/tests/BUILD
deleted file mode 100644
index 860af275d4726e..00000000000000
--- a/third_party/xla/xla/mlir/tools/mlir_bisect/rewrites/tests/BUILD
+++ /dev/null
@@ -1,24 +0,0 @@
-load("//xla:glob_lit_test.bzl", "glob_lit_tests")
-load("@local_tsl//tsl:tsl.default.bzl", "filegroup")
-
-# copybara:uncomment package(default_applicable_licenses = ["//tensorflow:license"])
-
-glob_lit_tests(
-    name = "all_tests",
-    data = [":test_utilities"],
-    driver = "@llvm-project//mlir:run_lit.sh",
-    test_file_exts = [
-        "mlir",
-    ],
-)
-
-# Bundle together all of the test utilities that are used by tests.
-filegroup(
-    name = "test_utilities",
-    testonly = True,
-    data = [
-        "//xla/mlir/tools/mlir_bisect:mlir-bisect",
-        "@llvm-project//llvm:FileCheck",
-    ],
-    visibility = ["//visibility:public"],
-)
diff --git a/third_party/xla/xla/mlir/tools/mlir_bisect/rewrites/tests/erase-op-without-results.mlir b/third_party/xla/xla/mlir/tools/mlir_bisect/rewrites/tests/erase-op-without-results.mlir
deleted file mode 100644
index e918e112fe46f3..00000000000000
--- a/third_party/xla/xla/mlir/tools/mlir_bisect/rewrites/tests/erase-op-without-results.mlir
+++ /dev/null
@@ -1,12 +0,0 @@
-// RUN: mlir-bisect %s --debug-strategy=EraseOpWithoutResults | FileCheck %s
-
-func.func @main() -> memref<i32> {
-  %a = arith.constant 1 : i32
-  %b = memref.alloc() : memref<i32>
-  memref.store %a, %b[] : memref<i32>
-  func.return %b : memref<i32>
-}
-
-//      CHECK: func.func @main()
-//      CHECK:   %[[ALLOC:.*]] = memref.alloc
-// CHECK-NEXT:   return %[[ALLOC]]
diff --git a/third_party/xla/xla/mlir/tools/mlir_bisect/rewrites/tests/inline-scf-while.mlir b/third_party/xla/xla/mlir/tools/mlir_bisect/rewrites/tests/inline-scf-while.mlir
deleted file mode 100644
index 6c9deddbc37cb5..00000000000000
--- a/third_party/xla/xla/mlir/tools/mlir_bisect/rewrites/tests/inline-scf-while.mlir
+++ /dev/null
@@ -1,40 +0,0 @@
-// RUN: mlir-bisect %s --debug-strategy=InlineScfWhile | FileCheck %s
-
-func.func @main() -> i64 {
-  %c0 = arith.constant 0 : i64
-  %c1 = arith.constant 1 : i64
-  %c4 = arith.constant 4 : i64
-  %alloc = memref.alloc() : memref<i64>
-  memref.store %c0, %alloc[] : memref<i64>
-  %ret = scf.while(%arg0 = %c0): (i64) -> (i64) {
-    %cond = arith.cmpi slt, %arg0, %c4 : i64
-    scf.condition(%cond) %arg0 : i64
-  } do {
-  ^bb0(%arg1: i64):
-    %add = arith.addi %arg1, %c1 : i64
-    scf.yield %add : i64
-  }
-  return %ret : i64
-}
-
-//     CHECK: func @main
-// CHECK-DAG:   %[[C0:.*]] = arith.constant 0
-// CHECK-DAG:   %[[C4:.*]] = arith.constant 4
-//     CHECK:   %[[RET:.*]]:2 = scf.execute_region
-//     CHECK:     arith.cmpi slt, %[[C0]], %[[C4]]
-//     CHECK:     yield {{.*}}, %[[C0]]
-//     CHECK:   return %[[RET]]#1
-
-//     CHECK: func @main
-// CHECK-DAG:   %[[C0:.*]] = arith.constant 0
-// CHECK-DAG:   %[[C1:.*]] = arith.constant 1
-//     CHECK:   %[[BEFORE0:.*]]:2 = scf.execute_region
-//     CHECK:     arith.cmpi
-//     CHECK:     yield {{.*}}, %[[C0]]
-//     CHECK:   %[[AFTER:.*]] = scf.execute_region
-//     CHECK:     %[[ADD:.*]] = arith.addi %[[BEFORE0]]#1, %[[C1]]
-//     CHECK:     yield %[[ADD]]
-//     CHECK:   %[[BEFORE1:.*]]:2 = scf.execute_region
-//     CHECK:     arith.cmpi
-//     CHECK:     yield {{.*}}, %[[AFTER]]
-//     CHECK:   return %[[BEFORE1]]#1
\ No newline at end of file
diff --git a/third_party/xla/xla/mlir/tools/mlir_bisect/rewrites/tests/reduce-scf-forall-bounds.mlir b/third_party/xla/xla/mlir/tools/mlir_bisect/rewrites/tests/reduce-scf-forall-bounds.mlir
deleted file mode 100644
index 61f289d3c5cd6c..00000000000000
--- a/third_party/xla/xla/mlir/tools/mlir_bisect/rewrites/tests/reduce-scf-forall-bounds.mlir
+++ /dev/null
@@ -1,16 +0,0 @@
-// RUN: mlir-bisect %s --debug-strategy=ReduceScfForallBounds | FileCheck %s
-
-func.func @main() -> tensor<8xindex> {
-  %init = tensor.empty() : tensor<8xindex>
-  %iota = scf.forall (%i) = (0) to (8) step (1)
-      shared_outs (%init_ = %init) -> (tensor<8xindex>) {
-    %tensor = tensor.from_elements %i : tensor<1xindex>
-    scf.forall.in_parallel {
-      tensor.parallel_insert_slice %tensor into %init_[%i] [1] [1]
-        : tensor<1xindex> into tensor<8xindex>
-    }
-  }
-  func.return %iota : tensor<8xindex>
-}
-// CHECK: func @main()
-// CHECK:   scf.forall ({{.*}}) in (7)
diff --git a/third_party/xla/xla/mlir/tools/mlir_bisect/rewrites/tests/replace-op-with-constant.mlir b/third_party/xla/xla/mlir/tools/mlir_bisect/rewrites/tests/replace-op-with-constant.mlir
deleted file mode 100644
index 171472ad733642..00000000000000
--- a/third_party/xla/xla/mlir/tools/mlir_bisect/rewrites/tests/replace-op-with-constant.mlir
+++ /dev/null
@@ -1,26 +0,0 @@
-// RUN: mlir-bisect %s --debug-strategy=ReplaceOpWithConstant | FileCheck %s
-
-func.func @main() -> tensor<2xi32> {
-  %a = arith.constant dense<3> : tensor<2xi32>
-  %b = arith.constant dense<2> : tensor<2xi32>
-  %c = mhlo.add %a, %b : tensor<2xi32>
-  %d = mhlo.multiply %b, %c : tensor<2xi32>
-  func.return %d : tensor<2xi32>
-}
-
-//      CHECK: func.func @main()
-// CHECK-NEXT:   arith.constant dense<3>
-// CHECK-NEXT:   arith.constant dense<2>
-// CHECK-NEXT:   arith.constant dense<5>
-// CHECK-NEXT:   %[[ADD:.*]] = mhlo.add
-//  CHECK-NOT:   %[[ADD]]
-// CHECK-NEXT:   mhlo.multiply
-// CHECK-NEXT:   return
-
-//      CHECK: func.func @main()
-// CHECK-NEXT:   arith.constant dense<3>
-// CHECK-NEXT:   arith.constant dense<2>
-// CHECK-NEXT:   mhlo.add
-// CHECK-NEXT:   %[[D:.*]] = arith.constant dense<10>
-// CHECK-NEXT:   mhlo.multiply
-// CHECK-NEXT:   return %[[D]]
diff --git a/third_party/xla/xla/mlir/tools/mlir_bisect/rewrites/tests/replace-op-with-value.mlir b/third_party/xla/xla/mlir/tools/mlir_bisect/rewrites/tests/replace-op-with-value.mlir
deleted file mode 100644
index f89f647f14ddc6..00000000000000
--- a/third_party/xla/xla/mlir/tools/mlir_bisect/rewrites/tests/replace-op-with-value.mlir
+++ /dev/null
@@ -1,16 +0,0 @@
-// RUN: mlir-bisect %s --debug-strategy=ReplaceOpWithValue | FileCheck %s
-
-func.func @main() -> (memref<i32>, memref<i32>) {
-  %a = memref.alloc() : memref<i32>
-  %b = memref.alloc() : memref<i32>
-  %c0 = arith.constant 0 : i32
-  memref.store %c0, %b[] : memref<i32>
-  return %a, %b : memref<i32>, memref<i32>
-}
-
-//      CHECK: func @main()
-//      CHECK:   %[[ALLOC:.*]] = memref.alloc()
-// CHECK-NEXT:   memref.alloc
-// CHECK-NEXT:   constant
-// CHECK-NEXT:   memref.store {{.*}}, %[[ALLOC]]
-// CHECK-NEXT:   return %[[ALLOC]], %[[ALLOC]]
diff --git a/third_party/xla/xla/mlir/tools/mlir_bisect/rewrites/tests/replace-operand-with-constant.mlir b/third_party/xla/xla/mlir/tools/mlir_bisect/rewrites/tests/replace-operand-with-constant.mlir
deleted file mode 100644
index 7619a8a500c5e4..00000000000000
--- a/third_party/xla/xla/mlir/tools/mlir_bisect/rewrites/tests/replace-operand-with-constant.mlir
+++ /dev/null
@@ -1,28 +0,0 @@
-// RUN: mlir-bisect %s --debug-strategy=ReplaceOperandWithConstant | FileCheck %s
-
-func.func @main() -> (tensor<2xi32>, tensor<2xi32>) {
-  %a = arith.constant dense<3> : tensor<2xi32>
-  %b = arith.constant dense<2> : tensor<2xi32>
-  %c = mhlo.add %a, %b : tensor<2xi32>
-  %d = mhlo.multiply %b, %c : tensor<2xi32>
-  func.return %c, %d : tensor<2xi32>, tensor<2xi32>
-}
-
-// CHECK: func @main()
-// CHECK:   %[[C2:.*]] = arith.constant dense<2>
-// CHECK:   %[[ADD:.*]] = mhlo.add
-// CHECK:   %[[C5:.*]] = arith.constant dense<5>
-// CHECK:   %[[MUL:.*]] = mhlo.multiply %[[C2]], %[[C5]] : tensor<2xi32>
-// CHECK:   return %[[ADD]], %[[MUL]]
-
-// CHECK: func @main()
-// CHECK:   mhlo.add
-// CHECK:   %[[MUL:.*]] = mhlo.multiply %cst_0, %0 : tensor<2xi32>
-// CHECK:   %[[C5:.*]] = arith.constant dense<5>
-// CHECK:   return %[[C5]], %[[MUL]]
-
-// CHECK: func @main()
-// CHECK:   %[[ADD:.*]] = mhlo.add
-// CHECK:   mhlo.multiply
-// CHECK:   %[[C10:.*]] = arith.constant dense<10>
-// CHECK:   return %[[ADD]], %[[C10]]
diff --git a/third_party/xla/xla/mlir/tools/mlir_bisect/rewrites/tests/return-operands-of-terminator-operands.mlir b/third_party/xla/xla/mlir/tools/mlir_bisect/rewrites/tests/return-operands-of-terminator-operands.mlir
deleted file mode 100644
index 8584e2a0008fa0..00000000000000
--- a/third_party/xla/xla/mlir/tools/mlir_bisect/rewrites/tests/return-operands-of-terminator-operands.mlir
+++ /dev/null
@@ -1,15 +0,0 @@
-// RUN: mlir-bisect %s --debug-strategy=ReturnOperandsOfTerminatorOperands | FileCheck %s
-
-func.func @main() -> tensor<2xi32> {
-  %a = arith.constant dense<3> : tensor<2xi32>
-  %b = arith.constant dense<2> : tensor<2xi32>
-  %c = mhlo.add %a, %b : tensor<2xi32>
-  %d = mhlo.multiply %b, %c : tensor<2xi32>
-  func.return %d : tensor<2xi32>
-}
-
-// CHECK: @main
-// CHECK:   %[[C2:.*]] = arith.constant dense<2>
-// CHECK:   %[[ADD:.*]] = mhlo.add
-// CHECK:   mhlo.multiply
-// CHECK:   return %[[C2]], %[[ADD]]
\ No newline at end of file
diff --git a/third_party/xla/xla/mlir/tools/mlir_bisect/rewrites/tests/truncate-function.mlir b/third_party/xla/xla/mlir/tools/mlir_bisect/rewrites/tests/truncate-function.mlir
deleted file mode 100644
index af06778bd47c54..00000000000000
--- a/third_party/xla/xla/mlir/tools/mlir_bisect/rewrites/tests/truncate-function.mlir
+++ /dev/null
@@ -1,31 +0,0 @@
-// RUN: mlir-bisect %s --debug-strategy=TruncateFunction | FileCheck %s
-
-// Function to prevent constant folding below.
-func.func private @cst() -> tensor<2xi32> {
-  %cst = arith.constant dense<2> : tensor<2xi32>
-  return %cst : tensor<2xi32>
-}
-
-func.func @main() -> tensor<2xi32> {
-  %a = arith.constant dense<1> : tensor<2xi32>
-  %b = func.call @cst() : () -> tensor<2xi32>
-  %c = mhlo.add %a, %b : tensor<2xi32>
-  %d = mhlo.multiply %b, %c : tensor<2xi32>
-  func.return %d : tensor<2xi32>
-}
-
-//     CHECK: func @main()
-//     CHECK:   %[[A:.*]] = arith.constant dense<1>
-//     CHECK:   return %[[A]]
-
-//     CHECK: func @main()
-//     CHECK:   %[[B:.*]] = call @cst()
-//     CHECK:   return %[[B]]
-
-//     CHECK: func @main()
-//     CHECK:   %[[A:.*]] = arith.constant dense<1>
-//     CHECK:   %[[B:.*]] = call @cst()
-//     CHECK:   %[[ADD:.*]] = mhlo.add
-// CHECK-DAG:   %[[A]]
-// CHECK-DAG:   %[[B]]
-//     CHECK:   return %[[ADD]]
diff --git a/third_party/xla/xla/mlir/tools/mlir_bisect/test_passes.cc b/third_party/xla/xla/mlir/tools/mlir_bisect/test_passes.cc
deleted file mode 100644
index bd253682f3a47b..00000000000000
--- a/third_party/xla/xla/mlir/tools/mlir_bisect/test_passes.cc
+++ /dev/null
@@ -1,48 +0,0 @@
-/* Copyright 2023 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "xla/mlir/tools/mlir_bisect/test_passes.h"
-
-#include "mlir/Dialect/Linalg/IR/Linalg.h"  // from @llvm-project
-#include "mlir/Pass/Pass.h"  // from @llvm-project
-
-namespace mlir {
-namespace bisect {
-namespace test {
-namespace {
-
-struct BreakLinalgTransposePass
-    : public PassWrapper<BreakLinalgTransposePass, OperationPass<ModuleOp>> {
-  MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(BreakLinalgTransposePass)
-
-  StringRef getArgument() const final { return "test-break-linalg-transpose"; }
-  StringRef getDescription() const final { return "breaks linalg transpose"; }
-  BreakLinalgTransposePass() = default;
-
-  void runOnOperation() override {
-    getOperation().walk([](linalg::TransposeOp op) {
-      auto permutation = llvm::to_vector(op.getPermutation());
-      std::swap(permutation[0], permutation[1]);
-      op.setPermutation(permutation);
-    });
-  }
-};
-}  // namespace
-
-void RegisterTestPasses() { PassRegistration<BreakLinalgTransposePass>(); }
-
-}  // namespace test
-}  // namespace bisect
-}  // namespace mlir
diff --git a/third_party/xla/xla/mlir/tools/mlir_bisect/test_passes.h b/third_party/xla/xla/mlir/tools/mlir_bisect/test_passes.h
deleted file mode 100644
index 2903d54c137139..00000000000000
--- a/third_party/xla/xla/mlir/tools/mlir_bisect/test_passes.h
+++ /dev/null
@@ -1,29 +0,0 @@
-/* Copyright 2023 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef XLA_MLIR_TOOLS_MLIR_BISECT_TEST_PASSES_H_
-#define XLA_MLIR_TOOLS_MLIR_BISECT_TEST_PASSES_H_
-
-namespace mlir {
-namespace bisect {
-namespace test {
-
-void RegisterTestPasses();
-
-}
-}  // namespace bisect
-}  // namespace mlir
-
-#endif  // XLA_MLIR_TOOLS_MLIR_BISECT_TEST_PASSES_H_
diff --git a/third_party/xla/xla/mlir/tools/mlir_bisect/tests/BUILD b/third_party/xla/xla/mlir/tools/mlir_bisect/tests/BUILD
deleted file mode 100644
index eed6c662a0f74f..00000000000000
--- a/third_party/xla/xla/mlir/tools/mlir_bisect/tests/BUILD
+++ /dev/null
@@ -1,26 +0,0 @@
-load("//xla:glob_lit_test.bzl", "glob_lit_tests")
-load("@local_tsl//tsl:tsl.default.bzl", "filegroup")
-
-# copybara:uncomment package(default_applicable_licenses = ["//tensorflow:license"])
-
-glob_lit_tests(
-    name = "all_tests",
-    data = [":test_utilities"],
-    driver = "@llvm-project//mlir:run_lit.sh",
-    test_file_exts = [
-        "mlir",
-    ],
-)
-
-# Bundle together all of the test utilities that are used by tests.
-filegroup(
-    name = "test_utilities",
-    testonly = True,
-    data = [
-        "snapshot.mlir.pb",
-        "//xla/mlir/tools/mlir_bisect:mlir-bisect",
-        "@llvm-project//llvm:FileCheck",
-        "@llvm-project//llvm:not",
-    ],
-    visibility = ["//visibility:public"],
-)
diff --git a/third_party/xla/xla/mlir/tools/mlir_bisect/tests/bisect.mlir b/third_party/xla/xla/mlir/tools/mlir_bisect/tests/bisect.mlir
deleted file mode 100644
index ca839d982c416a..00000000000000
--- a/third_party/xla/xla/mlir/tools/mlir_bisect/tests/bisect.mlir
+++ /dev/null
@@ -1,46 +0,0 @@
-// RUN: mlir-bisect %s \
-// RUN: --pass-pipeline="builtin.module(test-break-linalg-transpose)" \
-// RUN: --max-steps-per-run=200 \
-// RUN: | FileCheck %s
-
-func.func @main() -> (memref<2x2xindex>, memref<2x2xindex>) {
-  %a = memref.alloc() : memref<2x2xindex>
-  %b = memref.alloc() : memref<2x2xindex>
-  %c = memref.alloc() : memref<2x2xindex>
-  %c0 = arith.constant 0 : index
-  %c1 = arith.constant 1 : index
-  %c2 = arith.constant 2 : index
-  %c3 = arith.constant 3 : index
-  scf.for %i = %c0 to %c2 step %c1 {
-    scf.for %j = %c0 to %c2 step %c1 {
-      memref.store %i, %a[%i, %j] : memref<2x2xindex>
-      memref.store %j, %b[%i, %j] : memref<2x2xindex>
-    }
-  }
-
-  %i = scf.while: () -> (index) {
-    %value = memref.load %a[%c0, %c0] : memref<2x2xindex>
-    %cond = arith.cmpi slt, %value, %c3 : index
-    scf.condition(%cond) %value : index
-  } do {
-  ^bb0(%_: index):
-    %value = memref.load %a[%c0, %c0] : memref<2x2xindex>
-    %add = arith.addi %value, %c1 : index
-    memref.store %add, %a[%c0, %c0] : memref<2x2xindex>
-    linalg.transpose ins(%b : memref<2x2xindex>) outs(%c : memref<2x2xindex>)
-      permutation = [1, 0]
-    memref.copy %c, %b : memref<2x2xindex> to memref<2x2xindex>
-    scf.yield
-  }
-
-  return %a, %b : memref<2x2xindex>, memref<2x2xindex>
-}
-
-//     CHECK: Final module
-//     CHECK: func @main() -> memref<2x2xindex> {
-// CHECK-NOT: scf.while
-// CHECK-NOT: scf.for
-//     CHECK: linalg.transpose {{.*}} permutation = [1, 0]
-
-//     CHECK: Final module after running pipeline
-//     CHECK: linalg.transpose {{.*}} permutation = [0, 1]
diff --git a/third_party/xla/xla/mlir/tools/mlir_bisect/tests/no-bug.mlir b/third_party/xla/xla/mlir/tools/mlir_bisect/tests/no-bug.mlir
deleted file mode 100644
index df343f3bf8b09f..00000000000000
--- a/third_party/xla/xla/mlir/tools/mlir_bisect/tests/no-bug.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-// RUN: not mlir-bisect %s \
-// RUN: --pass-pipeline="builtin.module(test-break-linalg-transpose)" \
-// RUN: | FileCheck %s
-
-func.func @main() -> memref<2x2xindex> {
-  %a = memref.alloc() : memref<2x2xindex>
-  return %a : memref<2x2xindex>
-}
-
-// CHECK: Did not find bug in initial module
diff --git a/third_party/xla/xla/mlir/tools/mlir_bisect/tests/snapshot.mlir b/third_party/xla/xla/mlir/tools/mlir_bisect/tests/snapshot.mlir
deleted file mode 100644
index 916ca47ab0fd8e..00000000000000
--- a/third_party/xla/xla/mlir/tools/mlir_bisect/tests/snapshot.mlir
+++ /dev/null
@@ -1,12 +0,0 @@
-// RUN: not mlir-bisect %s --hlo-snapshot=%s.pb \
-// RUN: --pass-pipeline="builtin.module(test-break-linalg-transpose)" \
-// RUN: | FileCheck %s
-
-func.func @main(%a: tensor<3x1xi32>, %b: tensor<3x1xi32>) -> tensor<3x1xi32> {
-  return %a : tensor<3x1xi32>
-}
-
-// CHECK: initial module
-// CHECK: func @main() -> tensor<3x1xi32> {
-// CHECK{LITERAL}: arith.constant dense<[[2], [-4], [5]]> : tensor<3x1xi32>
-// CHECK{LITERAL}: arith.constant dense<[[0], [7], [-5]]> : tensor<3x1xi32>
diff --git a/third_party/xla/xla/mlir/tools/mlir_bisect/tests/snapshot.mlir.pb b/third_party/xla/xla/mlir/tools/mlir_bisect/tests/snapshot.mlir.pb
deleted file mode 100644
index ee3c8f759494db153cd7114783124b1cb7fb5da0..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 68
scmWeq;1UpEkz!(I)MDXcVq`F4Vqj3>VfynQ3K&_1u&Q8S{|#3H00LzcG5`Po

diff --git a/third_party/xla/xla/mlir/tools/mlir_replay/BUILD b/third_party/xla/xla/mlir/tools/mlir_replay/BUILD
deleted file mode 100644
index 246676f6526f10..00000000000000
--- a/third_party/xla/xla/mlir/tools/mlir_replay/BUILD
+++ /dev/null
@@ -1,71 +0,0 @@
-load("//xla:xla.bzl", "xla_cc_binary")
-load("@local_tsl//tsl/platform:rules_cc.bzl", "cc_library")
-load("@bazel_skylib//rules:build_test.bzl", "build_test")
-
-# copybara:uncomment package(default_applicable_licenses = ["//tensorflow:license"])
-
-build_test(
-    name = "mlir_replay_build_test",
-    targets = [
-        ":mlir_replay",
-    ],
-)
-
-xla_cc_binary(
-    name = "mlir_replay",
-    srcs = ["mlir_replay.cc"],
-    deps = [
-        ":mlir_replay_lib",
-        "//xla:debug_options_flags",
-        "//xla/mlir/framework/ir:xla_framework",
-        "//xla/mlir/runtime/ir:rt",
-        "//xla/mlir/tools/mlir_replay/public:compiler_trace_proto_cc",
-        "//xla/mlir/tools/mlir_replay/public:compiler_trace_proto_cc_impl",
-        "//xla/mlir/tools/mlir_replay/public:execution_trace_proto_cc",
-        "//xla/mlir/tools/mlir_replay/public:execution_trace_utils",
-        "//xla/mlir/xla_cpu/ir:xla_cpu",
-        "//xla/mlir_hlo:deallocation",
-        "//xla/mlir_hlo:gml_st",
-        "//xla/mlir_hlo:hlo_dialect_registration",
-        "//xla/mlir_hlo:lhlo",
-        "//xla/mlir_hlo:lhlo_gpu",
-        "//xla/mlir_hlo:mlir_interpreter_dialects",
-        "//xla/mlir_hlo:mlir_interpreter_framework",
-        "//xla/mlir_hlo:thlo",
-        "//xla/service:hlo_proto_cc",
-        "@com_google_absl//absl/strings",
-        "@com_google_absl//absl/strings:str_format",
-        "@llvm-project//mlir:AllPassesAndDialects",
-        "@llvm-project//mlir:IR",
-        "@local_tsl//tsl/platform:env",
-        "@local_tsl//tsl/platform:path",
-        "@local_tsl//tsl/platform:platform_port",
-        "@local_tsl//tsl/platform:status",
-        "@local_tsl//tsl/util:command_line_flags",
-    ],
-)
-
-cc_library(
-    name = "mlir_replay_lib",
-    srcs = ["mlir_replay_lib.cc"],
-    hdrs = ["mlir_replay_lib.h"],
-    visibility = ["//visibility:public"],
-    deps = [
-        "//xla:xla_data_proto_cc",
-        "//xla/mlir/framework/ir:xla_framework",
-        "//xla/mlir/tools/mlir_replay/public:execution_trace_proto_cc",
-        "//xla/mlir/tools/mlir_replay/public:execution_trace_utils",
-        "//xla/mlir_hlo:mlir_interpreter_framework",
-        "//xla/service:hlo_proto_cc",
-        "@com_google_absl//absl/container:flat_hash_map",
-        "@com_google_absl//absl/random",
-        "@com_google_absl//absl/random:bit_gen_ref",
-        "@llvm-project//llvm:Support",
-        "@llvm-project//mlir:FuncDialect",
-        "@llvm-project//mlir:IR",
-        "@llvm-project//mlir:MlirReduceLib",
-        "@llvm-project//mlir:Support",
-        "@local_tsl//tsl/platform:errors",
-        "@local_tsl//tsl/platform:statusor",
-    ],
-)
diff --git a/third_party/xla/xla/mlir/tools/mlir_replay/README.md b/third_party/xla/xla/mlir/tools/mlir_replay/README.md
deleted file mode 100644
index 6c2091d526e92f..00000000000000
--- a/third_party/xla/xla/mlir/tools/mlir_replay/README.md
+++ /dev/null
@@ -1,48 +0,0 @@
-# MLIR Replay tool
-
-This tool is mainly intended for helping debug miscompiles. It takes as inputs
-an HLO snapshot proto with input tensors and a compiler trace proto with the
-state of the IR after each pass.
-
-This tool is built on top of
-[mlir-interpreter](https://github.com/tensorflow/mlir-hlo/tree/master/tools/mlir_interpreter/).
-
-Example usage:
-
-```
-# Run a JAX test with debug flags enabled:
-$ bazel test :some_jax_test --compilation_mode=opt \
-  --test_env=XLA_FLAGS="--xla_cpu_use_xla_runtime --xla_dump_to=/tmp/test-dump --xla_dump_hlo_snapshots" \
-  --test_filter=SomeSpecific.TestCase \
-  --test_sharding_strategy=disabled --test_strategy=local
-
-# JAX tends to compile many modules, so first check which one is broken:
-./mlir_replay \
-  --mlir-compilation-trace-dir=/tmp/test-dump
-
-Failures for /tmp/test-dump/module_1234.jit_something.mlir-trace.pb:
-  Result mismatch for /tmp/test-dump/module_1234.jit_something.snapshot.56.pb: TensorOrMemref<3xi32>: [1, 2, 3] != TensorOrMemref<3xi32>: [1, 1, 1]
-  run :mlir_replay -- --mlir-compilation-trace=/tmp/test-dump/module_1234.jit_something.mlir-trace.pb --hlo-snapshot=/tmp/test-dump/module_1234.jit_something.snapshot.56.pb --print-changes-only --stop-after-first-failure
-```
-
-There may be multiple failing modules. You can run the provided command to
-replay a particular one:
-
-```
-# Run the IR after each pass. Note that JAX typically compiles many modules, so
-# you may have check more than one.
-# There is one .mlir-trace.pb file per module (containing the intermediate IR)
-# and one .snapshot.pb file per execution (containing the inputs and outputs).
-$ ./mlir_replay \
-  --mlir-compilation-trace=/tmp/test-dump/module_1234.jit_something.mlir-trace.pb \
-  --hlo-snapshot=/tmp/test-dump/module_1234.jit_something.snapshot.56.pb \
-  --print-changes-only --stop-after-first-failure
-Running IR after APass
-Results: [1, 2, 3]
-
-Running IR after BPass
-Running IR after CPass
-Running IR after BrokenPass
-Results: [1, 1, 1]
-```
-
diff --git a/third_party/xla/xla/mlir/tools/mlir_replay/mlir_replay.cc b/third_party/xla/xla/mlir/tools/mlir_replay/mlir_replay.cc
deleted file mode 100644
index 5d075b6d0ac7f2..00000000000000
--- a/third_party/xla/xla/mlir/tools/mlir_replay/mlir_replay.cc
+++ /dev/null
@@ -1,248 +0,0 @@
-/* Copyright 2023 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include <string>
-#include <vector>
-
-#include "absl/strings/str_format.h"
-#include "absl/strings/str_split.h"
-#include "mlir/IR/DialectRegistry.h"  // from @llvm-project
-#include "mlir/IR/MLIRContext.h"  // from @llvm-project
-#include "mlir/InitAllDialects.h"  // from @llvm-project
-#include "xla/debug_options_flags.h"
-#include "xla/mlir/framework/ir/xla_framework.h"
-#include "xla/mlir/runtime/ir/rt_dialect.h"
-#include "xla/mlir/tools/mlir_replay/mlir_replay_lib.h"
-#include "xla/mlir/tools/mlir_replay/public/compiler_trace.pb.h"
-#include "xla/mlir/tools/mlir_replay/public/execution_trace.pb.h"
-#include "xla/mlir/tools/mlir_replay/public/execution_trace_utils.h"
-#include "xla/mlir/xla_cpu/ir/xla_cpu.h"
-#include "xla/mlir_hlo/deallocation/IR/deallocation_ops.h"
-#include "xla/mlir_hlo/gml_st/IR/gml_st_ops.h"
-#include "xla/mlir_hlo/lhlo/IR/lhlo_ops.h"
-#include "xla/mlir_hlo/lhlo_gpu/IR/lhlo_gpu_ops.h"
-#include "xla/mlir_hlo/mhlo/IR/register.h"
-#include "xla/mlir_hlo/thlo/IR/thlo_ops.h"
-#include "xla/mlir_hlo/tools/mlir_interpreter/framework/interpreter_value.h"
-#include "xla/service/hlo.pb.h"
-#include "tsl/platform/env.h"
-#include "tsl/platform/init_main.h"
-#include "tsl/platform/path.h"
-#include "tsl/platform/status.h"
-#include "tsl/util/command_line_flags.h"
-
-struct ReplayOptions {
-  std::string hlo_snapshot;
-  std::string mlir_compilation_trace;
-  std::string mlir_compilation_trace_dir = "";
-  std::string execution_trace_dir = "";
-  std::vector<std::string> entry_points = {"main", "main_xla_framework"};
-  bool print_changes_only = false;
-  bool stop_after_first_failure = false;
-  bool print_values = true;
-};
-
-bool ResultsMatch(const xla::HloSnapshot& snapshot,
-                  const llvm::SmallVector<mlir::interpreter::InterpreterValue>&
-                      first_pass_results,
-                  std::vector<std::string>& failures,
-                  const ReplayOptions& opts) {
-  auto actual = mlir::interpreter::LiteralToValue(snapshot.result());
-  TF_CHECK_OK(actual.status());
-
-  // We assume this is MHLO, so multiple results will be in a tuple.
-  if (first_pass_results.size() != 1) {
-    failures.push_back("expected one result");
-    return false;
-  }
-
-  if (!(*actual == first_pass_results[0])) {
-    if (opts.print_values) {
-      failures.push_back("result mismatch: " + actual->toString() +
-                         " != " + first_pass_results[0].toString());
-    } else {
-      failures.push_back("result mismatch");
-    }
-    return false;
-  }
-  return true;
-}
-
-void TestAll(mlir::MLIRContext& context, const ReplayOptions& opts) {
-  std::vector<std::string> traces;
-  TF_CHECK_OK(tsl::Env::Default()->GetMatchingPaths(
-      opts.mlir_compilation_trace_dir + "/*.mlir-trace.pb", &traces));
-
-  for (const auto& trace_path : traces) {
-    mlir::interpreter::MlirCompilationTrace trace;
-    TF_CHECK_OK(tsl::ReadBinaryProto(tsl::Env::Default(), trace_path, &trace))
-        << "Failed to load " << trace_path;
-
-    std::vector<std::string> snapshots;
-    std::string prefix =
-        trace_path.substr(0, trace_path.length() - strlen(".mlir-trace.pb"));
-    TF_CHECK_OK(tsl::Env::Default()->GetMatchingPaths(prefix + "*.snapshot.*",
-                                                      &snapshots));
-    CHECK_NE(snapshots.size(), 0)
-        << "No snapshots found for module " << trace_path << ".";
-
-    std::vector<std::string> failures;
-    for (const auto& snapshot_path : snapshots) {
-      xla::HloSnapshot snapshot;
-      TF_CHECK_OK(
-          tsl::ReadBinaryProto(tsl::Env::Default(), snapshot_path, &snapshot));
-
-      auto results =
-          mlir::interpreter::Run(context, trace.passes(0).mlir_module(),
-                                 snapshot, nullptr, opts.entry_points);
-      if (!results.status().ok()) {
-        failures.push_back("Failed to execute " + snapshot_path + ": " +
-                           results.status().ToString());
-      } else {
-        if (!ResultsMatch(snapshot, *results, failures, opts)) {
-          failures.push_back(
-              std::string("run :mlir_replay -- --mlir-compilation-trace=") +
-              trace_path + " --hlo-snapshot=" + snapshot_path +
-              " --print-changes-only --stop-after-first-failure");
-        }
-      }
-    }
-
-    if (!failures.empty()) {
-      llvm::errs() << "Failures for " << trace_path << ":\n  "
-                   << absl::StrJoin(failures, "\n  ") << "\n";
-    }
-  }
-}
-
-int main(int argc, char* argv[]) {
-  // Flush llvm::outs before writing errors.
-  llvm::errs().tie(&llvm::outs());
-
-  std::string entry_points;
-  ReplayOptions opts;
-  std::vector<tsl::Flag> flag_list = {
-      tsl::Flag("hlo-snapshot", &opts.hlo_snapshot,
-                "Filename of an HloSnapshot proto. Only used to read inputs."),
-      tsl::Flag("mlir-compilation-trace", &opts.mlir_compilation_trace,
-                "Filename of an MlirCompilerTrace proto."),
-      tsl::Flag("mlir-compilation-trace-dir", &opts.mlir_compilation_trace_dir,
-                "Directory from which to load MlirCompilerTrace and "
-                "HloSnapshot protos. The tool will run all snapshots and "
-                "report the ones with bugs."),
-      tsl::Flag("execution-trace-dir", &opts.execution_trace_dir,
-                "Directory where to store the execution traces (optional)."),
-      tsl::Flag("entry-point", &entry_points,
-                "Program entry function (optional, defaults to 'main')."),
-      tsl::Flag("print-changes-only", &opts.print_changes_only,
-                "If set, only print changed values"),
-      tsl::Flag("stop-after-first-failure", &opts.stop_after_first_failure,
-                "If set, stop after the first failed invocation."),
-      tsl::Flag("print-values", &opts.print_values, "If set, print values."),
-  };
-  xla::AppendDebugOptionsFlags(&flag_list);
-
-  // The usage string includes the message at the top of the file, the
-  // DebugOptions flags and the flags defined above.
-  std::string usage_string = tsl::Flags::Usage(argv[0], flag_list);
-  if (!tsl::Flags::Parse(&argc, argv, flag_list)) {
-    return 1;
-  }
-
-  if (!entry_points.empty()) {
-    opts.entry_points = absl::StrSplit(entry_points, ',');
-  }
-
-  tsl::port::InitMain(usage_string.c_str(), &argc, &argv);
-
-  CHECK(opts.mlir_compilation_trace.empty() !=
-        opts.mlir_compilation_trace_dir.empty())
-      << "Exactly one of --mlir-compilation-trace and "
-         "--mlir-compilation-trace-dir must be specified.";
-
-  CHECK(opts.mlir_compilation_trace_dir.empty() || opts.hlo_snapshot.empty())
-      << "If --mlir-compilation-trace-dir is set, --hlo-snapshot must not be.";
-
-  mlir::DialectRegistry registry;
-  mlir::registerAllDialects(registry);
-  mlir::mhlo::registerAllMhloDialects(registry);
-  registry.insert<mlir::deallocation::DeallocationDialect,
-                  mlir::lmhlo::LmhloDialect, mlir::lmhlo_gpu::LmhloGpuDialect,
-                  mlir::gml_st::GmlStDialect, mlir::thlo::THLODialect,
-                  xla::runtime::RuntimeDialect, mlir::xla_cpu::XlaCpuDialect,
-                  mlir::xla_framework::XLAFrameworkDialect>();
-
-  mlir::MLIRContext context(registry);
-
-  if (!opts.mlir_compilation_trace_dir.empty()) {
-    TestAll(context, opts);
-    return 0;
-  }
-
-  xla::HloSnapshot snapshot;
-  if (!opts.hlo_snapshot.empty()) {
-    TF_CHECK_OK(tsl::ReadBinaryProto(tsl::Env::Default(), opts.hlo_snapshot,
-                                     &snapshot));
-  }
-  mlir::interpreter::MlirCompilationTrace trace;
-  TF_CHECK_OK(tsl::ReadBinaryProto(tsl::Env::Default(),
-                                   opts.mlir_compilation_trace, &trace));
-
-  llvm::SmallVector<mlir::interpreter::InterpreterValue> previous_results;
-  int pass_id = 0;
-  for (auto& state : trace.passes()) {
-    llvm::outs() << "Running IR after " << state.after_pass() << ".\n";
-    mlir::interpreter::ExecutionTrace execution_trace;
-    auto results = mlir::interpreter::Run(
-        context, state.mlir_module(), snapshot,
-        opts.execution_trace_dir.empty() ? nullptr : &execution_trace,
-        opts.entry_points);
-    if (results.status().ok()) {
-      if (opts.print_values &&
-          (!opts.print_changes_only || (*results != previous_results))) {
-        llvm::outs() << "Results:\n";
-        for (const auto& result : *results) {
-          llvm::outs() << result.toString() << "\n";
-        }
-        previous_results = *results;
-        llvm::outs() << "\n";
-      }
-    } else {
-      llvm::errs() << results.status().ToString() << "\n";
-      if (opts.stop_after_first_failure) {
-        return 1;
-      }
-    }
-
-    if (!opts.execution_trace_dir.empty()) {
-      TF_CHECK_OK(
-          tsl::Env::Default()->RecursivelyCreateDir(opts.execution_trace_dir));
-      std::string filename = tsl::io::JoinPath(
-          opts.execution_trace_dir,
-          absl::StrFormat("%.4d.%s.mlir", pass_id, state.after_pass()));
-      TF_CHECK_OK(tsl::WriteStringToFile(tsl::Env::Default(), filename,
-                                         execution_trace.ir()));
-
-      filename = tsl::io::JoinPath(
-          opts.execution_trace_dir,
-          absl::StrFormat("%.4d.%s.trace.pb", pass_id, state.after_pass()));
-      TF_CHECK_OK(tsl::WriteBinaryProto(tsl::Env::Default(), filename,
-                                        execution_trace));
-    }
-    ++pass_id;
-  }
-
-  return 0;
-}
diff --git a/third_party/xla/xla/mlir/tools/mlir_replay/mlir_replay_lib.cc b/third_party/xla/xla/mlir/tools/mlir_replay/mlir_replay_lib.cc
deleted file mode 100644
index d80b8fc7cb257f..00000000000000
--- a/third_party/xla/xla/mlir/tools/mlir_replay/mlir_replay_lib.cc
+++ /dev/null
@@ -1,263 +0,0 @@
-/* Copyright 2022 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "xla/mlir/tools/mlir_replay/mlir_replay_lib.h"
-
-#include <iterator>
-#include <memory>
-#include <numeric>
-#include <random>
-#include <string>
-#include <utility>
-#include <vector>
-
-#include "absl/container/flat_hash_map.h"
-#include "absl/random/bit_gen_ref.h"
-#include "absl/random/random.h"
-#include "llvm/ADT/APInt.h"
-#include "llvm/Support/SourceMgr.h"
-#include "llvm/Support/raw_ostream.h"
-#include "mlir/Dialect/Func/IR/FuncOps.h"  // from @llvm-project
-#include "mlir/IR/Operation.h"  // from @llvm-project
-#include "mlir/IR/OwningOpRef.h"  // from @llvm-project
-#include "mlir/Support/LLVM.h"  // from @llvm-project
-#include "mlir/Support/LogicalResult.h"  // from @llvm-project
-#include "mlir/Tools/ParseUtilities.h"  // from @llvm-project
-#include "xla/mlir/framework/ir/xla_framework.h"
-#include "xla/mlir/tools/mlir_replay/public/execution_trace_utils.h"
-#include "xla/mlir_hlo/tools/mlir_interpreter/framework/interpreter.h"
-#include "xla/mlir_hlo/tools/mlir_interpreter/framework/interpreter_value.h"
-#include "xla/service/hlo.pb.h"
-#include "tsl/platform/errors.h"
-#include "tsl/platform/statusor.h"
-
-namespace mlir {
-namespace interpreter {
-namespace {
-
-tsl::StatusOr<SmallVector<InterpreterValue>> LoadArgs(
-    const xla::HloSnapshot& snapshot, TypeRange types) {
-  SmallVector<InterpreterValue> result;
-  for (const auto& [arg, type] : llvm::zip(snapshot.arguments(), types)) {
-    TF_ASSIGN_OR_RETURN(auto converted, LiteralToValue(arg, type));
-    result.push_back(std::move(converted));
-  }
-  return result;
-}
-
-namespace {
-template <typename T, template <typename _> class rng_t>
-mlir::interpreter::InterpreterValue RandomTensor(absl::BitGenRef bitgen,
-                                                 mlir::Type type) {
-  llvm::SmallVector<int64_t> shape;
-  auto shaped_ty = type.dyn_cast<mlir::ShapedType>();
-  if (shaped_ty) {
-    shape = llvm::to_vector(shaped_ty.getShape());
-  }
-
-  auto rng = rng_t<T>{};
-  auto result = mlir::interpreter::TensorOrMemref<T>::empty(shape);
-  for (const auto& index : result.view.indices()) {
-    auto& elem = result.at(index) = rng(bitgen);
-    // Ints are typically indices, so scale them down to a more reasonable
-    // range.
-    if constexpr (std::is_same_v<T, int64_t>) {
-      elem >>= 60;
-    }
-  }
-  if (shaped_ty) {
-    return {result};
-  }
-  return {result.at({})};
-}
-}  // namespace
-
-mlir::FailureOr<mlir::interpreter::InterpreterValue> MakeRandomInput(
-    absl::BitGenRef bitgen, mlir::Type type) {
-  auto elem_ty =
-      type.isa<ShapedType>() ? type.cast<ShapedType>().getElementType() : type;
-  if (elem_ty.isF32()) {
-    return RandomTensor<float, absl::gaussian_distribution>(bitgen, type);
-  }
-  if (elem_ty.isF64()) {
-    return RandomTensor<double, absl::gaussian_distribution>(bitgen, type);
-  }
-  if (elem_ty.isInteger(32)) {
-    return RandomTensor<int32_t, absl::uniform_int_distribution>(bitgen, type);
-  }
-  if (elem_ty.isInteger(16)) {
-    return RandomTensor<int16_t, absl::uniform_int_distribution>(bitgen, type);
-  }
-  if (elem_ty.isInteger(64)) {
-    return RandomTensor<int64_t, absl::uniform_int_distribution>(bitgen, type);
-  }
-  if (elem_ty.isInteger(1)) {
-    return {{TensorOrMemref<bool>::empty(type.cast<ShapedType>().getShape())}};
-  }
-
-  llvm::errs() << "Unsupported type: ";
-  type.print(llvm::errs());
-  llvm::errs() << "\n";
-  return failure();
-}
-
-// TODO(jreiffers): Add a flag to intentionally alias as many buffers as
-// possible (in particular, all non-variable inputs).
-// Extracts a mapping from function arguments to allocated buffers.
-// The buffer assignment is only relevant once the program is bufferized and
-// memref results were converted to arguments.
-std::vector<int64_t> extractXlaBufferAssignment(func::FuncOp main) {
-  std::vector<int64_t> buffer_assignment(main.getNumArguments());
-  auto result_mapping =
-      main->getAttrOfType<IntegerAttr>("xla_framework.result_mapping");
-  if (!result_mapping) {
-    // No attribute, fall back to unique buffers for each argument.
-    std::iota(buffer_assignment.begin(), buffer_assignment.end(), 0);
-    return buffer_assignment;
-  }
-
-  std::vector<int64_t> result_to_buffer;
-  if (auto inner_mapping = main->getAttrOfType<ArrayAttr>(
-          "xla_framework.result_inner_mapping")) {
-    llvm::copy(llvm::map_range(inner_mapping.getAsValueRange<IntegerAttr>(),
-                               [](const llvm::APInt& value) {
-                                 return value.getSExtValue();
-                               }),
-               std::back_inserter(result_to_buffer));
-  } else {
-    result_to_buffer = {result_mapping.getInt()};
-  }
-
-  int64_t result_index = 0;
-  for (int64_t arg_index : llvm::seq<int64_t>(0, main.getNumArguments())) {
-    if (auto input_buffer_index = main.getArgAttrOfType<IntegerAttr>(
-            arg_index, "xla_framework.input_mapping")) {
-      buffer_assignment[arg_index] = input_buffer_index.getInt();
-    } else {
-      buffer_assignment[arg_index] = result_to_buffer[result_index++];
-    }
-  }
-
-  return buffer_assignment;
-}
-
-}  // namespace
-
-tsl::StatusOr<SmallVector<InterpreterValue>> Run(
-    MLIRContext& context, const std::string& mlir_ir,
-    const xla::HloSnapshot& snapshot, ExecutionTrace* trace,
-    const std::vector<std::string>& entry) {
-  auto sourceMgr = std::make_shared<llvm::SourceMgr>();
-  sourceMgr->AddNewSourceBuffer(llvm::MemoryBuffer::getMemBuffer(mlir_ir),
-                                mlir::SMLoc());
-  mlir::OwningOpRef<mlir::Operation*> module =
-      mlir::parseSourceFileForTool(sourceMgr, &context, false);
-  if (!module) {
-    return tsl::errors::InvalidArgument("failed to parse MLIR");
-  }
-
-  SymbolTable symbols(*module);
-  func::FuncOp main;
-  for (const std::string& candidate : entry) {
-    main = llvm::dyn_cast_or_null<func::FuncOp>(symbols.lookup(candidate));
-    if (main && !main.getBody().empty()) {
-      break;
-    }
-  }
-
-  if (!main) {
-    return tsl::errors::InvalidArgument("failed to find entry point");
-  }
-
-  if (trace) {
-    llvm::raw_string_ostream os(*trace->mutable_ir());
-    (*module)->print(os, OpPrintingFlags().printGenericOpForm());
-  }
-
-  // After xla-rt-export-functions, we have an execution context as the first
-  // argument. The interpreter currently cannot deal with these things, so we
-  // fail in that case.
-  auto function_args = main.getBody().getBlocks().front().getArguments();
-  auto buffer_type = xla_framework::BufferType::get(main.getContext());
-  if (!llvm::all_of(function_args, [&](Value arg) {
-        return arg.getType().isa<ShapedType>() || arg.getType() == buffer_type;
-      })) {
-    return tsl::errors::InvalidArgument(
-        "expected all function arguments to be shaped types");
-  }
-
-  auto args_to_buffers = extractXlaBufferAssignment(main);
-  TF_ASSIGN_OR_RETURN(auto args,
-                      LoadArgs(snapshot, main.getBody().getArgumentTypes()));
-  auto out_args =
-      main.getBody().getBlocks().front().getArguments().drop_front(args.size());
-
-  absl::flat_hash_map<int64_t, InterpreterValue> buffer_to_value;
-  // None of the input arguments will be statically known to alias.
-  for (auto [index, value] : llvm::enumerate(args)) {
-    buffer_to_value[args_to_buffers[index]] = value;
-  }
-
-  std::seed_seq my_seed_seq({0});
-  absl::BitGen bitgen(my_seed_seq);
-  llvm::SmallVector<InterpreterValue> out_buffers;
-  // Add random inputs for output arguments and unspecified inputs.
-  for (auto arg : out_args) {
-    auto ty = arg.getType();
-    if (ty == buffer_type) {
-      // Buffers are used exactly once, in a buffer_to_mem op.
-      if (!arg.hasOneUse()) {
-        return tsl::errors::InvalidArgument(
-            "expected buffer argument to be used eactly once");
-      }
-      ty = arg.getUsers().begin()->getResultTypes().front();
-    }
-
-    int64_t buffer_index = args_to_buffers[arg.getArgNumber()];
-    // If we already have a buffer for this argument, use it.
-    if (buffer_to_value.contains(buffer_index)) {
-      auto& value = buffer_to_value[buffer_index];
-      out_buffers.push_back(value);
-      args.push_back(value);
-      continue;
-    }
-
-    auto arg_or = MakeRandomInput(bitgen, ty);
-    if (!succeeded(arg_or)) {
-      return tsl::errors::InvalidArgument("failed to create input");
-    }
-    out_buffers.push_back(*arg_or);
-    args.push_back(*arg_or);
-    buffer_to_value[buffer_index] = *arg_or;
-  }
-
-  InterpreterOptions options;
-  ExecutionTraceListener tracer(trace);
-  if (trace) {
-    options.listener = &tracer;
-  }
-  auto results_or = runInterpreter(symbols, main, args, options);
-  if (!succeeded(results_or)) {
-    return tsl::errors::Internal("interpreter failed");
-  }
-
-  if (results_or->empty()) {
-    return out_buffers;
-  }
-  return *results_or;
-}
-
-}  // namespace interpreter
-}  // namespace mlir
diff --git a/third_party/xla/xla/mlir/tools/mlir_replay/mlir_replay_lib.h b/third_party/xla/xla/mlir/tools/mlir_replay/mlir_replay_lib.h
deleted file mode 100644
index e87438b671a2b1..00000000000000
--- a/third_party/xla/xla/mlir/tools/mlir_replay/mlir_replay_lib.h
+++ /dev/null
@@ -1,40 +0,0 @@
-/* Copyright 2022 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef XLA_MLIR_TOOLS_MLIR_REPLAY_MLIR_REPLAY_LIB_H_
-#define XLA_MLIR_TOOLS_MLIR_REPLAY_MLIR_REPLAY_LIB_H_
-
-#include <string>
-#include <vector>
-
-#include "mlir/Support/LLVM.h"  // from @llvm-project
-#include "xla/mlir/tools/mlir_replay/public/execution_trace.pb.h"
-#include "xla/mlir_hlo/tools/mlir_interpreter/framework/interpreter_value.h"
-#include "xla/service/hlo.pb.h"
-#include "tsl/platform/statusor.h"
-
-namespace mlir {
-namespace interpreter {
-
-// Runs the given IR on the inputs from `snapshot` and returns the result.
-tsl::StatusOr<SmallVector<InterpreterValue>> Run(
-    MLIRContext& context, const std::string& mlir_ir,
-    const xla::HloSnapshot& snapshot, ExecutionTrace* trace,
-    const std::vector<std::string>& entry);
-
-}  // namespace interpreter
-}  // namespace mlir
-
-#endif  // XLA_MLIR_TOOLS_MLIR_REPLAY_MLIR_REPLAY_LIB_H_
diff --git a/third_party/xla/xla/mlir/tools/mlir_replay/public/BUILD b/third_party/xla/xla/mlir/tools/mlir_replay/public/BUILD
index e459debb306979..efaa3a84e62d1c 100644
--- a/third_party/xla/xla/mlir/tools/mlir_replay/public/BUILD
+++ b/third_party/xla/xla/mlir/tools/mlir_replay/public/BUILD
@@ -1,6 +1,5 @@
 load("@local_tsl//tsl/platform:build_config.bzl", "tf_proto_library")
 load("@local_tsl//tsl/platform:rules_cc.bzl", "cc_library")
-load("//xla:xla.bzl", "xla_cc_test")
 
 package(
     default_visibility = ["//visibility:public"],
@@ -26,47 +25,6 @@ cc_library(
     ],
 )
 
-cc_library(
-    name = "execution_trace_utils",
-    srcs = ["execution_trace_utils.cc"],
-    hdrs = ["execution_trace_utils.h"],
-    visibility = ["//visibility:public"],
-    deps = [
-        ":execution_trace_proto_cc",
-        ":execution_trace_proto_cc_impl",
-        "//xla:literal",
-        "//xla:xla_data_proto_cc",
-        "//xla/mlir_hlo:mlir_interpreter_framework",
-        "@llvm-project//llvm:Support",
-        "@llvm-project//mlir:FuncDialect",
-        "@llvm-project//mlir:IR",
-        "@llvm-project//mlir:Support",
-        "@local_tsl//tsl/platform:statusor",
-    ],
-)
-
-xla_cc_test(
-    name = "execution_trace_utils_test",
-    srcs = ["execution_trace_utils_test.cc"],
-    deps = [
-        ":execution_trace_utils",
-        "//xla:literal_util",
-        "//xla/mlir_hlo:mlir_interpreter_framework",
-        "@com_google_googletest//:gtest_main",
-        "@llvm-project//llvm:Support",
-        "@llvm-project//mlir:Support",
-        "@local_tsl//tsl/platform:statusor",
-    ],
-)
-
-tf_proto_library(
-    name = "execution_trace_proto",
-    srcs = ["execution_trace.proto"],
-    cc_api_version = 2,
-    make_default_target_header_only = True,
-    visibility = ["//visibility:public"],
-)
-
 tf_proto_library(
     name = "compiler_trace_proto",
     srcs = ["compiler_trace.proto"],
diff --git a/third_party/xla/xla/mlir/tools/mlir_replay/public/README.md b/third_party/xla/xla/mlir/tools/mlir_replay/public/README.md
index c886abf5ffd1d6..553accf2d0543e 100644
--- a/third_party/xla/xla/mlir/tools/mlir_replay/public/README.md
+++ b/third_party/xla/xla/mlir/tools/mlir_replay/public/README.md
@@ -1,10 +1,3 @@
-# Public API of mlir_replay
+# DEPRECATED: Public API of mlir_replay
 
-This contains protocol buffers and utilities that can be reused for other
-debugging tools:
-
-1.  **The compiler trace proto**: A record of the state of the IR after each
-    compilation pass
-1.  A compiler instrumentation to create the above proto.
-1.  **The execution trace proto**: A record of SSA values as the IR is executed
-1.  Utilities for working with the above protos.
+Do not use. This is in the process of being removed.
\ No newline at end of file
diff --git a/third_party/xla/xla/mlir/tools/mlir_replay/public/execution_trace.proto b/third_party/xla/xla/mlir/tools/mlir_replay/public/execution_trace.proto
deleted file mode 100644
index 6be6407505c0bc..00000000000000
--- a/third_party/xla/xla/mlir/tools/mlir_replay/public/execution_trace.proto
+++ /dev/null
@@ -1,72 +0,0 @@
-/* Copyright 2022 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-syntax = "proto2";
-
-package mlir.interpreter;
-
-message TracedValue {
-  // The shape - includes vector dimensions.
-  // TODO(jreiffers): Model vector dimensions separately.
-  repeated int64 shape = 1;
-  optional bool is_scalar = 2;
-
-  enum ElementType {
-    UNKNOWN = 0;
-    INTEGRAL = 1;
-    UNSIGNED = 2;
-    FLOAT = 3;
-    COMPLEX = 4;
-    TUPLE = 5;
-  }
-
-  optional int32 bit_width = 3;
-  optional ElementType element_type = 4;
-
-  repeated float floats = 5 [packed = true];
-  repeated double doubles = 6 [packed = true];
-  repeated int64 ints = 7 [packed = true];
-  repeated uint64 uints = 8 [packed = true];
-  repeated TracedValue tuple_elements = 9;
-}
-
-message InstructionTrace {
-  optional string name = 1;
-  repeated TracedValue args = 2;
-  repeated TracedValue results = 3;
-  // TODO(jreiffers): Model side effects (e.g. memref.store).
-
-  repeated RegionTrace regions = 4;
-}
-
-message RegionTrace {
-  // The number of the region that is being executed (within the parent op).
-  // For example: '1' for an scf.while's `after` region.
-  optional int32 region_number = 1;
-  // The arguments that were passed to the region.
-  repeated TracedValue bbargs = 2;
-  // One instruction per instruction in the region.
-  repeated InstructionTrace instructions = 3;
-  repeated TracedValue results = 4;
-}
-
-message ExecutionTrace {
-  // The IR that was executed. Note: this should always be filled in the generic
-  // format.
-  optional string ir = 1;
-
-  // The trace of the entry function execution.
-  optional RegionTrace trace = 2;
-}
\ No newline at end of file
diff --git a/third_party/xla/xla/mlir/tools/mlir_replay/public/execution_trace_utils.cc b/third_party/xla/xla/mlir/tools/mlir_replay/public/execution_trace_utils.cc
deleted file mode 100644
index 7659bdcf02975f..00000000000000
--- a/third_party/xla/xla/mlir/tools/mlir_replay/public/execution_trace_utils.cc
+++ /dev/null
@@ -1,447 +0,0 @@
-/* Copyright 2022 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "xla/mlir/tools/mlir_replay/public/execution_trace_utils.h"
-
-#include <complex>
-#include <cstdint>
-#include <functional>
-#include <iterator>
-#include <memory>
-#include <type_traits>
-#include <utility>
-#include <variant>
-
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SmallVector.h"
-#include "mlir/Dialect/Func/IR/FuncOps.h"  // from @llvm-project
-#include "mlir/IR/BuiltinAttributes.h"  // from @llvm-project
-#include "mlir/IR/BuiltinTypes.h"  // from @llvm-project
-#include "xla/mlir/tools/mlir_replay/public/execution_trace.pb.h"
-#include "xla/mlir_hlo/tools/mlir_interpreter/framework/interpreter_value.h"
-#include "xla/mlir_hlo/tools/mlir_interpreter/framework/tensor_or_memref.h"
-#include "tsl/platform/statusor.h"
-
-namespace mlir {
-namespace interpreter {
-namespace {
-
-// Visitor for converting an InterpreterValue to a TracedValue.
-struct TraceInterpreterValueVisitor {
-  TracedValue out;
-
-  void Add(float v) { out.add_floats(v); }
-  void Add(double v) { out.add_doubles(v); }
-  void Add(std::complex<float> v) {
-    out.add_floats(v.real());
-    out.add_floats(v.imag());
-  }
-  void Add(std::complex<double> v) {
-    out.add_doubles(v.real());
-    out.add_doubles(v.imag());
-  }
-  void Add(int64_t v) { out.add_ints(v); }
-  void Add(int32_t v) { out.add_ints(v); }
-  void Add(int16_t v) { out.add_ints(v); }
-  void Add(int8_t v) { out.add_ints(v); }
-  void Add(uint64_t v) { out.add_uints(v); }
-  void Add(uint32_t v) { out.add_uints(v); }
-  void Add(uint16_t v) { out.add_uints(v); }
-  void Add(uint8_t v) { out.add_uints(v); }
-  void Add(bool v) { out.add_ints(static_cast<int64_t>(v)); }
-
-  template <typename T>
-  void operator()(T v) {
-    SetElementType<T>();
-    out.set_is_scalar(true);
-    Add(v);
-  }
-
-  void operator()(const Tuple& t) {
-    out.set_element_type(TracedValue::TUPLE);
-    for (const auto& v : t.values) {
-      *out.add_tuple_elements() = ValueToTracedValue(*v);
-    }
-  }
-
-  template <typename T>
-  void operator()(const TensorOrMemref<T>& v) {
-    for (int64_t size : v.view.sizes) {
-      out.add_shape(size);
-    }
-    SetElementType<T>();
-    for (const auto& index : v.view.indices()) {
-      Add(v.at(index));
-    }
-  }
-
-  template <typename T>
-  void SetElementType() {
-    out.set_element_type(GetElementType(T{}));
-    if constexpr (std::is_same_v<T, bool>) {
-      out.set_bit_width(1);
-    } else {
-      out.set_bit_width(sizeof(T) * 8);
-    }
-  }
-
-  template <typename T>
-  static TracedValue::ElementType GetElementType(const T&) {
-    if constexpr (std::is_floating_point_v<T>) {
-      return TracedValue::FLOAT;
-    } else if constexpr (std::is_integral_v<T>) {
-      if constexpr (std::is_unsigned_v<T>) {
-        return TracedValue::UNSIGNED;
-      } else {
-        return TracedValue::INTEGRAL;
-      }
-    } else {
-      T{"invalid type"} + 0;
-      return TracedValue::UNKNOWN;
-    }
-  }
-
-  template <typename T>
-  static TracedValue::ElementType GetElementType(const std::complex<T>&) {
-    return TracedValue::COMPLEX;
-  }
-
-  static TracedValue::ElementType GetElementType(const Tuple&) {
-    return TracedValue::UNKNOWN;
-  }
-};
-
-}  // namespace
-
-void ExecutionTraceListener::beforeOp(ArrayRef<InterpreterValue> args,
-                                      Operation* op) {
-  auto* inst = regions_.back()->add_instructions();
-  inst->set_name(op->getName().getStringRef().str());
-  for (const auto& arg : args) {
-    *inst->add_args() = ValueToTracedValue(arg);
-  }
-}
-
-void ExecutionTraceListener::afterOp(ArrayRef<InterpreterValue> results) {
-  auto* traced_results =
-      regions_.back()->mutable_instructions()->rbegin()->mutable_results();
-  for (const auto& result : results) {
-    *traced_results->Add() = ValueToTracedValue(result);
-  }
-}
-
-void ExecutionTraceListener::enterRegion(ArrayRef<InterpreterValue> bbargs,
-                                         Region& region) {
-  if (regions_.empty()) {
-    regions_.push_back(trace_->mutable_trace());
-  } else {
-    regions_.push_back(
-        regions_.back()->mutable_instructions()->rbegin()->add_regions());
-  }
-
-  auto& traced_region = *regions_.back();
-  traced_region.set_region_number(region.getRegionNumber());
-  for (const auto& bbarg : bbargs) {
-    *traced_region.add_bbargs() = ValueToTracedValue(bbarg);
-  }
-}
-
-void ExecutionTraceListener::leaveRegion(ArrayRef<InterpreterValue> yielded) {
-  for (const auto& result : yielded) {
-    *regions_.back()->add_results() = ValueToTracedValue(result);
-  }
-  regions_.pop_back();
-}
-
-llvm::SmallVector<mlir::Attribute> ValueToAttribute(
-    const InterpreterValue& value, mlir::Type type) {
-  if (std::holds_alternative<Tuple>(value.storage)) {
-    auto types = type.cast<TupleType>().getTypes();
-    const auto& t = std::get<Tuple>(value.storage);
-    llvm::SmallVector<mlir::Attribute> attrs;
-    for (const auto& [v, ty] : llvm::zip(t.values, types)) {
-      auto attr = ValueToAttribute(*v, ty);
-      assert(attr.size() == 1 && "nested tuples not supported");
-      attrs.push_back(attr.front());
-    }
-    return attrs;
-  }
-
-  if (!value.isTensor()) {
-    return {cast<DenseElementsAttr>(
-                ValueToAttribute(value.asUnitTensor(),
-                                 mlir::RankedTensorType::get({}, type))
-                    .front())
-                .getValues<mlir::Attribute>()[0]};
-  }
-
-  if (!type.isa<ShapedType>()) {
-    return {};
-  }
-
-  auto shaped_ty = type.cast<ShapedType>();
-  return {dispatchScalarType(shaped_ty, [&](auto dummy) -> mlir::Attribute {
-    using T = decltype(dummy);
-    auto& t = std::get<TensorOrMemref<T>>(value.storage);
-    SmallVector<T> vals;
-    for (const auto& index : t.view.indices()) {
-      vals.push_back(t.at(index));
-    }
-    auto attr_ty =
-        shaped_ty.cloneWith(/*shape=*/t.view.sizes, shaped_ty.getElementType());
-    if constexpr (std::is_same_v<T, bool>) {
-      return mlir::DenseElementsAttr::get(attr_ty, vals);
-    } else {
-      return mlir::DenseElementsAttr::get<T>(attr_ty, vals);
-    }
-  })};
-}
-
-namespace {
-template <typename T>
-TensorOrMemref<T> ArrayLiteralToTensor(const xla::Literal& literal) {
-  SmallVector<int64_t> layout;
-  if (literal.shape().has_layout()) {
-    llvm::copy(literal.shape().layout().minor_to_major(),
-               std::back_inserter(layout));
-  }
-  SmallVector<int64_t> shape{literal.shape().dimensions().begin(),
-                             literal.shape().dimensions().end()};
-  auto result = TensorOrMemref<T>::empty(shape, layout);
-  assert(literal.size_bytes() == result.buffer->getByteSize() &&
-         "expected buffer sizes to match");
-  memcpy(result.buffer->at(0, 0), literal.untyped_data(),
-         result.buffer->getByteSize());
-  return result;
-}
-}  // namespace
-
-tsl::StatusOr<InterpreterValue> LiteralToValue(const xla::Literal& literal) {
-  if (literal.shape().IsTuple()) {
-    auto elements = literal.Clone().DecomposeTuple();
-    Tuple result;
-    for (auto& element : elements) {
-      TF_ASSIGN_OR_RETURN(auto converted, LiteralToValue(element));
-      result.values.push_back(
-          std::make_shared<InterpreterValue>(std::move(converted)));
-    }
-    return {{result}};
-  }
-
-  if (literal.shape().IsToken()) {
-    return tsl::errors::Unimplemented("token arguments are not implemented");
-  }
-
-  if (literal.shape().IsArray()) {
-    switch (literal.shape().element_type()) {
-      case xla::PRED:
-        return {{ArrayLiteralToTensor<bool>(literal)}};
-      case xla::S8:
-        return {{ArrayLiteralToTensor<int8_t>(literal)}};
-      case xla::S16:
-        return {{ArrayLiteralToTensor<int16_t>(literal)}};
-      case xla::S32:
-        return {{ArrayLiteralToTensor<int32_t>(literal)}};
-      case xla::S64:
-        return {{ArrayLiteralToTensor<int64_t>(literal)}};
-      case xla::U8:
-        return {{ArrayLiteralToTensor<uint8_t>(literal)}};
-      case xla::U16:
-        return {{ArrayLiteralToTensor<uint16_t>(literal)}};
-      case xla::U32:
-        return {{ArrayLiteralToTensor<uint32_t>(literal)}};
-      case xla::U64:
-        return {{ArrayLiteralToTensor<uint64_t>(literal)}};
-      case xla::F16:
-        return tsl::errors::Unimplemented("F16 not implemented");
-      case xla::F32:
-        return {{ArrayLiteralToTensor<float>(literal)}};
-      case xla::BF16:
-        return tsl::errors::Unimplemented("BF16 not implemented");
-      case xla::F64:
-        return {{ArrayLiteralToTensor<double>(literal)}};
-      case xla::F8E5M2:
-        return tsl::errors::Unimplemented("F8E5M2 not implemented");
-      case xla::F8E4M3FN:
-        return tsl::errors::Unimplemented("F8E4M3FN not implemented");
-      case xla::F8E4M3B11FNUZ:
-        return tsl::errors::Unimplemented("F8E4M3B11FNUZ not implemented");
-      case xla::F8E5M2FNUZ:
-        return tsl::errors::Unimplemented("F8E5M2FNUZ not implemented");
-      case xla::F8E4M3FNUZ:
-        return tsl::errors::Unimplemented("F8E4M3FNUZ not implemented");
-      case xla::C64:
-        return {{ArrayLiteralToTensor<std::complex<float>>(literal)}};
-      case xla::C128:
-        return {{ArrayLiteralToTensor<std::complex<double>>(literal)}};
-      default:
-        // Fallthrough intended.
-        break;
-    }
-  }
-
-  return tsl::errors::InvalidArgument("unexpected literal type");
-}
-
-tsl::StatusOr<InterpreterValue> LiteralToValue(
-    const xla::LiteralProto& literal) {
-  TF_ASSIGN_OR_RETURN(auto deserialized,
-                      xla::Literal::CreateFromProto(literal));
-  return LiteralToValue(deserialized);
-}
-
-tsl::StatusOr<InterpreterValue> LiteralToValue(const xla::LiteralProto& literal,
-                                               mlir::Type type) {
-  TF_ASSIGN_OR_RETURN(auto result, LiteralToValue(literal));
-  return {dispatchScalarType(type, [&](auto dummy) -> InterpreterValue {
-    TensorOrMemref<decltype(dummy)> cast;
-    cast.view = result.view();
-    cast.buffer = result.buffer();
-    return {cast};
-  })};
-}
-
-TracedValue ValueToTracedValue(const InterpreterValue& value) {
-  TraceInterpreterValueVisitor visitor;
-  std::visit(visitor, value.storage);
-  return visitor.out;
-}
-
-tsl::StatusOr<InterpreterValue> TracedValueToValue(
-    const TracedValue& traced_value) {
-  auto extract = [&](auto dummy, auto& elements) -> InterpreterValue {
-    using T = decltype(dummy);
-    if (traced_value.is_scalar()) {
-      return {static_cast<T>(elements[0])};
-    }
-
-    auto result =
-        TensorOrMemref<T>::empty(llvm::to_vector(traced_value.shape()));
-    for (auto [index, element] : llvm::zip(result.view.indices(), elements)) {
-      result.at(index) = element;
-    }
-    return {result};
-  };
-  auto extract_complex = [&](auto& elements) -> InterpreterValue {
-    using T = std::complex<std::decay_t<decltype(elements[0])>>;
-    if (traced_value.is_scalar()) {
-      return {T{elements[0], elements[1]}};
-    }
-
-    auto result =
-        TensorOrMemref<T>::empty(llvm::to_vector(traced_value.shape()));
-    int64_t i = 0;
-    for (auto it = result.view.indices().begin(),
-              end = result.view.indices().end();
-         it != end; ++it, i += 2) {
-      result.at(*it) = {elements[i], elements[i + 1]};
-    }
-    return {result};
-  };
-  switch (traced_value.element_type()) {
-    case TracedValue::UNKNOWN:
-      break;
-    case TracedValue::FLOAT:
-      if (traced_value.bit_width() == 32) {
-        return extract(float{}, traced_value.floats());
-      }
-      return extract(double{}, traced_value.doubles());
-    case TracedValue::UNSIGNED:
-      switch (traced_value.bit_width()) {
-        case 1:
-          return extract(bool{}, traced_value.ints());
-        case 8:
-          return extract(uint8_t{}, traced_value.uints());
-        case 16:
-          return extract(uint16_t{}, traced_value.uints());
-        case 32:
-          return extract(uint32_t{}, traced_value.uints());
-        case 64:
-          return extract(uint64_t{}, traced_value.uints());
-      }
-      break;
-    case TracedValue::INTEGRAL:
-      switch (traced_value.bit_width()) {
-        case 8:
-          return extract(int8_t{}, traced_value.ints());
-        case 16:
-          return extract(int16_t{}, traced_value.ints());
-        case 32:
-          return extract(int32_t{}, traced_value.ints());
-        case 64:
-          return extract(int64_t{}, traced_value.ints());
-      }
-      break;
-    case TracedValue::COMPLEX:
-      switch (traced_value.bit_width()) {
-        case 64:
-          return extract_complex(traced_value.floats());
-        case 128:
-          return extract_complex(traced_value.doubles());
-      }
-      break;
-    case TracedValue::TUPLE:
-      Tuple result;
-      for (const auto& elem : traced_value.tuple_elements()) {
-        TF_ASSIGN_OR_RETURN(auto converted, TracedValueToValue(elem));
-        result.values.push_back(
-            std::make_shared<InterpreterValue>(std::move(converted)));
-      }
-      return {{std::move(result)}};
-  }
-  return tsl::errors::InvalidArgument("unexpected type: " +
-                                      traced_value.DebugString());
-}
-
-llvm::SmallVector<const InstructionTrace*> FindOpExecutionsInTrace(
-    const ExecutionTrace& trace, mlir::Operation* op) {
-  llvm::SmallVector<int64_t> region_indices;
-  llvm::SmallVector<int64_t> op_indices;
-
-  std::function<void(mlir::Operation*)> get_op_path;
-  get_op_path = [&](mlir::Operation* op) {
-    auto* parent = op->getParentOp();
-    if (!llvm::isa<func::FuncOp>(parent)) {
-      get_op_path(parent);
-      region_indices.push_back(op->getParentRegion()->getRegionNumber());
-    }
-
-    int64_t index = 0;
-    while ((op = op->getPrevNode()) != nullptr) ++index;
-    op_indices.push_back(index);
-  };
-  get_op_path(op);
-
-  llvm::SmallVector<const InstructionTrace*> result;
-  std::function<void(const RegionTrace& trace, int index)> step;
-  step = [&](const RegionTrace& trace, int index) {
-    auto& instruction_trace = trace.instructions(op_indices[index]);
-    if (region_indices.size() > index) {
-      for (const auto& region : instruction_trace.regions()) {
-        if (region.region_number() == region_indices[index]) {
-          step(region, index + 1);
-        }
-      }
-    } else {
-      result.push_back(&instruction_trace);
-    }
-  };
-  step(trace.trace(), 0);
-
-  return result;
-}
-
-}  // namespace interpreter
-}  // namespace mlir
diff --git a/third_party/xla/xla/mlir/tools/mlir_replay/public/execution_trace_utils.h b/third_party/xla/xla/mlir/tools/mlir_replay/public/execution_trace_utils.h
deleted file mode 100644
index a152e0e8f90652..00000000000000
--- a/third_party/xla/xla/mlir/tools/mlir_replay/public/execution_trace_utils.h
+++ /dev/null
@@ -1,76 +0,0 @@
-/* Copyright 2022 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef XLA_MLIR_TOOLS_MLIR_REPLAY_PUBLIC_EXECUTION_TRACE_UTILS_H_
-#define XLA_MLIR_TOOLS_MLIR_REPLAY_PUBLIC_EXECUTION_TRACE_UTILS_H_
-
-#include "mlir/IR/Attributes.h"  // from @llvm-project
-#include "mlir/IR/Operation.h"  // from @llvm-project
-#include "mlir/IR/Region.h"  // from @llvm-project
-#include "mlir/Support/LLVM.h"  // from @llvm-project
-#include "xla/literal.h"
-#include "xla/mlir/tools/mlir_replay/public/execution_trace.pb.h"
-#include "xla/mlir_hlo/tools/mlir_interpreter/framework/interpreter.h"
-#include "xla/mlir_hlo/tools/mlir_interpreter/framework/interpreter_value.h"
-#include "xla/xla_data.pb.h"
-#include "tsl/platform/statusor.h"
-
-namespace mlir {
-namespace interpreter {
-
-// Interpreter listener that builds a trace of all executed ops and regions.
-class ExecutionTraceListener : public InterpreterListener {
- public:
-  explicit ExecutionTraceListener(ExecutionTrace* trace) : trace_(trace) {}
-
-  void beforeOp(ArrayRef<InterpreterValue> args, Operation* op) override;
-  void afterOp(ArrayRef<InterpreterValue> results) override;
-  void enterRegion(ArrayRef<InterpreterValue> bbargs, Region& region) override;
-  void leaveRegion(ArrayRef<InterpreterValue> yielded) override;
-
- private:
-  ExecutionTrace* trace_;
-  SmallVector<RegionTrace*> regions_;
-};
-
-// Returns an attribute with the given contents and type.
-llvm::SmallVector<mlir::Attribute> ValueToAttribute(
-    const InterpreterValue& value, mlir::Type type);
-
-// Deserializes the given literal.
-tsl::StatusOr<InterpreterValue> LiteralToValue(
-    const xla::LiteralProto& literal);
-// Deserializes the given literal and then casts it to the given type.
-tsl::StatusOr<InterpreterValue> LiteralToValue(const xla::LiteralProto& literal,
-                                               mlir::Type type);
-
-// Deserializes the given literal.
-tsl::StatusOr<InterpreterValue> LiteralToValue(const xla::Literal& literal);
-
-// Serializes the given interpreter value.
-TracedValue ValueToTracedValue(const InterpreterValue& value);
-
-// Deserializes the given traced value.
-tsl::StatusOr<InterpreterValue> TracedValueToValue(
-    const TracedValue& traced_value);
-
-// Returns all executions of the given op in the given trace.
-llvm::SmallVector<const InstructionTrace*> FindOpExecutionsInTrace(
-    const ExecutionTrace& trace, mlir::Operation* op);
-
-}  // namespace interpreter
-}  // namespace mlir
-
-#endif  // XLA_MLIR_TOOLS_MLIR_REPLAY_PUBLIC_EXECUTION_TRACE_UTILS_H_
diff --git a/third_party/xla/xla/mlir/tools/mlir_replay/public/execution_trace_utils_test.cc b/third_party/xla/xla/mlir/tools/mlir_replay/public/execution_trace_utils_test.cc
deleted file mode 100644
index c8cdfe6125412f..00000000000000
--- a/third_party/xla/xla/mlir/tools/mlir_replay/public/execution_trace_utils_test.cc
+++ /dev/null
@@ -1,138 +0,0 @@
-/* Copyright 2022 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "xla/mlir/tools/mlir_replay/public/execution_trace_utils.h"
-
-#include <cmath>
-#include <complex>
-#include <memory>
-#include <string>
-#include <utility>
-#include <vector>
-
-#include <gtest/gtest.h>
-#include "llvm/ADT/STLExtras.h"
-#include "mlir/Support/LLVM.h"  // from @llvm-project
-#include "xla/literal_util.h"
-#include "xla/mlir_hlo/tools/mlir_interpreter/framework/interpreter_value.h"
-#include "tsl/platform/statusor.h"
-
-namespace mlir {
-namespace interpreter {
-namespace {
-
-class TracedValueRoundTripTest
-    : public ::testing::TestWithParam<InterpreterValue> {};
-
-TEST_P(TracedValueRoundTripTest, Run) {
-  auto traced_value = ValueToTracedValue(GetParam());
-  TF_ASSERT_OK_AND_ASSIGN(auto value, TracedValueToValue(traced_value));
-  EXPECT_EQ(GetParam(), value) << GetParam().toString();
-}
-
-template <typename T>
-InterpreterValue MakeTensor(ArrayRef<int64_t> shape, ArrayRef<T> values) {
-  auto result = TensorOrMemref<T>::empty(shape);
-  for (auto [indices, value] : llvm::zip(result.view.indices(), values)) {
-    result.at(indices) = value;
-  }
-  return {result};
-}
-
-template <typename T>
-std::shared_ptr<T> WrapShared(T value) {
-  return std::make_shared<T>(std::move(value));
-}
-
-INSTANTIATE_TEST_SUITE_P(
-    RoundTrip, TracedValueRoundTripTest,
-    ::testing::ValuesIn(std::vector<InterpreterValue>{
-        {uint8_t{42}},
-        {uint16_t{43}},
-        {uint32_t{44}},
-        {uint64_t{45}},
-        {int8_t{-47}},
-        {int16_t{-48}},
-        {int32_t{-49}},
-        {int64_t{-50}},
-        {float{42.0}},
-        {double{42.0}},
-        {std::complex<float>{1.0, 2.0}},
-        {std::complex<double>{3.0, 4.0}},
-        {true},
-        {false},
-        {MakeTensor<int16_t>({1, 2}, {42, 43})},
-        {MakeTensor<double>({2, 2}, {1.0, -INFINITY, INFINITY, NAN})},
-        {MakeTensor<std::complex<double>>({}, {{1.0, 2.0}})},
-        {Tuple{SmallVector<std::shared_ptr<InterpreterValue>>{
-            WrapShared(InterpreterValue{42}),
-            WrapShared(InterpreterValue{43.0}),
-        }}}}));
-
-class FromLiteralTest
-    : public ::testing::TestWithParam<
-          std::pair<std::shared_ptr<xla::Literal>, InterpreterValue>> {};
-
-TEST_P(FromLiteralTest, Run) {
-  TF_ASSERT_OK_AND_ASSIGN(auto value, LiteralToValue(*GetParam().first));
-  EXPECT_EQ(value, GetParam().second)
-      << value.toString() << " vs " << GetParam().second.toString();
-}
-
-std::vector<std::pair<std::shared_ptr<xla::Literal>, InterpreterValue>>
-MakeInputs() {
-  using ::xla::LiteralUtil;
-  return {
-      {WrapShared(LiteralUtil::CreateR2<uint8_t>({{41, 42}})),
-       MakeTensor<uint8_t>({1, 2}, {41, 42})},
-      {WrapShared(LiteralUtil::CreateR0<uint16_t>(43)),
-       MakeTensor<uint16_t>({}, {43})},
-      {WrapShared(LiteralUtil::CreateR0<uint32_t>(44)),
-       MakeTensor<uint32_t>({}, {44})},
-      {WrapShared(LiteralUtil::CreateR0<uint64_t>(45)),
-       MakeTensor<uint64_t>({}, {45})},
-      {WrapShared(LiteralUtil::CreateR0<int8_t>(46)),
-       MakeTensor<int8_t>({}, {46})},
-      {WrapShared(LiteralUtil::CreateR0<int16_t>(47)),
-       MakeTensor<int16_t>({}, {47})},
-      {WrapShared(LiteralUtil::CreateR0<int32_t>(48)),
-       MakeTensor<int32_t>({}, {48})},
-      {WrapShared(LiteralUtil::CreateR0<int64_t>(49)),
-       MakeTensor<int64_t>({}, {49})},
-      {WrapShared(LiteralUtil::CreateR0<float>(50.0)),
-       MakeTensor<float>({}, {50.0})},
-      {WrapShared(LiteralUtil::CreateR0<double>(51.0)),
-       MakeTensor<double>({}, {51.0})},
-      {WrapShared(LiteralUtil::CreateR0<std::complex<float>>({52.0, 53.0})),
-       MakeTensor<std::complex<float>>({}, {{52.0, 53.0}})},
-      {WrapShared(LiteralUtil::CreateR0<std::complex<double>>({54.0, 55.0})),
-       MakeTensor<std::complex<double>>({}, {{54.0, 55.0}})},
-      {WrapShared(LiteralUtil::CreateR1<bool>({true, false})),
-       MakeTensor<bool>({2}, {true, false})},
-      {WrapShared(
-           LiteralUtil::MakeTupleOwned(LiteralUtil::CreateR0<bool>(true),
-                                       LiteralUtil::CreateR0<int8_t>(56))),
-       InterpreterValue{Tuple{SmallVector<std::shared_ptr<InterpreterValue>>{
-           std::make_shared<InterpreterValue>(MakeTensor<bool>({}, {true})),
-           std::make_shared<InterpreterValue>(
-               MakeTensor<int8_t>({}, {56}))}}}}};
-}
-
-INSTANTIATE_TEST_SUITE_P(Test, FromLiteralTest,
-                         ::testing::ValuesIn(MakeInputs()));
-
-}  // namespace
-}  // namespace interpreter
-}  // namespace mlir
diff --git a/third_party/xla/xla/mlir_hlo/BUILD b/third_party/xla/xla/mlir_hlo/BUILD
index 66e05085683514..f55e0dad52f3f6 100644
--- a/third_party/xla/xla/mlir_hlo/BUILD
+++ b/third_party/xla/xla/mlir_hlo/BUILD
@@ -2023,120 +2023,3 @@ cc_binary(
     linkstatic = False,
     deps = ["@llvm-project//mlir:mlir_c_runner_utils"],
 )
-
-cc_library(
-    name = "mlir_interpreter_dialects",
-    srcs = glob(
-        [
-            "tools/mlir_interpreter/dialects/*.cc",
-        ],
-        exclude = ["tools/mlir_interpreter/dialects/util.cc"],
-    ),
-    strip_include_prefix = ".",
-    visibility = ["//visibility:public"],
-    deps = [
-        ":deallocation",
-        ":gml_st",
-        ":mlir_hlo",
-        ":mlir_interpreter_dialect_utils",
-        ":mlir_interpreter_framework",
-        ":thlo",
-        "@com_google_absl//absl/strings",
-        "@llvm-project//llvm:Support",
-        "@llvm-project//mlir:AffineDialect",
-        "@llvm-project//mlir:ArithDialect",
-        "@llvm-project//mlir:BufferizationDialect",
-        "@llvm-project//mlir:ComplexDialect",
-        "@llvm-project//mlir:FuncDialect",
-        "@llvm-project//mlir:IR",
-        "@llvm-project//mlir:LinalgDialect",
-        "@llvm-project//mlir:MemRefDialect",
-        "@llvm-project//mlir:SCFDialect",
-        "@llvm-project//mlir:TensorDialect",
-        "@llvm-project//mlir:VectorDialect",
-        "@llvm-project//mlir:ViewLikeInterface",
-    ],
-    alwayslink = 1,
-)
-
-cc_library(
-    name = "mlir_interpreter_dialect_utils",
-    srcs = [
-        "tools/mlir_interpreter/dialects/util.cc",
-    ],
-    hdrs = [
-        "tools/mlir_interpreter/dialects/comparators.h",
-        "tools/mlir_interpreter/dialects/cwise_math.h",
-        "tools/mlir_interpreter/dialects/util.h",
-    ],
-    strip_include_prefix = ".",
-    visibility = ["//visibility:public"],
-    deps = [
-        ":mlir_interpreter_framework",
-        "@com_google_absl//absl/strings",
-        "@llvm-project//llvm:Support",
-        "@llvm-project//mlir:ArithDialect",
-        "@llvm-project//mlir:FuncDialect",
-        "@llvm-project//mlir:IR",
-        "@llvm-project//mlir:Support",
-        "@llvm-project//mlir:TensorDialect",
-        "@llvm-project//mlir:ViewLikeInterface",
-    ],
-)
-
-cc_library(
-    name = "mlir_interpreter_framework",
-    srcs = [
-        "tools/mlir_interpreter/framework/interpreter.cc",
-        "tools/mlir_interpreter/framework/interpreter_value.cc",
-        "tools/mlir_interpreter/framework/registration.cc",
-        "tools/mlir_interpreter/framework/tensor_or_memref.cc",
-    ],
-    hdrs = [
-        "tools/mlir_interpreter/framework/interpreter.h",
-        "tools/mlir_interpreter/framework/interpreter_value.h",
-        "tools/mlir_interpreter/framework/interpreter_value_util.h",
-        "tools/mlir_interpreter/framework/registration.h",
-        "tools/mlir_interpreter/framework/tensor_or_memref.h",
-    ],
-    strip_include_prefix = ".",
-    visibility = ["//visibility:public"],
-    deps = [
-        "@com_google_absl//absl/strings",
-        "@llvm-project//llvm:Support",
-        "@llvm-project//mlir:DialectUtils",
-        "@llvm-project//mlir:FuncDialect",
-        "@llvm-project//mlir:IR",
-        "@llvm-project//mlir:Support",
-        "@local_tsl//tsl/platform:logging",
-    ],
-)
-
-build_test(
-    name = "mlir-interpreter-runner_build_test",
-    targets = [
-        ":mlir-interpreter-runner",
-    ],
-)
-
-cc_binary(
-    name = "mlir-interpreter-runner",
-    srcs = ["tools/mlir_interpreter/mlir-interpreter-runner.cc"],
-    deps = [
-        ":deallocation",
-        ":gml_st",
-        ":hlo_dialect_registration",
-        ":lhlo",
-        ":lhlo_gpu",
-        ":mhlo_passes",
-        ":mlir_interpreter_dialects",
-        ":mlir_interpreter_framework",
-        ":thlo",
-        "@llvm-project//llvm:Support",
-        "@llvm-project//mlir:AllPassesAndDialects",
-        "@llvm-project//mlir:IR",
-        "@llvm-project//mlir:MlirReduceLib",
-        "@llvm-project//mlir:Pass",
-        "@llvm-project//mlir:Support",
-    ],
-)
diff --git a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/affine.cc b/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/affine.cc
deleted file mode 100644
index 6cb8f0b85465f2..00000000000000
--- a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/affine.cc
+++ /dev/null
@@ -1,51 +0,0 @@
-/* Copyright 2022 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include <algorithm>
-
-#include "mlir/Dialect/Affine/IR/AffineOps.h"
-#include "tools/mlir_interpreter/dialects/util.h"
-#include "tools/mlir_interpreter/framework/interpreter.h"
-#include "tools/mlir_interpreter/framework/interpreter_value_util.h"
-#include "tools/mlir_interpreter/framework/registration.h"
-
-namespace mlir {
-namespace interpreter {
-namespace {
-
-llvm::SmallVector<int64_t> apply(InterpreterState&, affine::AffineApplyOp op,
-                                 ArrayRef<int64_t> operands) {
-  return evalAffineMap(op.getAffineMap(), operands);
-}
-
-int64_t min(InterpreterState&, affine::AffineMinOp op,
-            ArrayRef<int64_t> operands) {
-  auto results = evalAffineMap(op.getAffineMap(), operands);
-  return *std::min_element(results.begin(), results.end());
-}
-
-int64_t max(InterpreterState&, affine::AffineMaxOp op,
-            ArrayRef<int64_t> operands) {
-  auto results = evalAffineMap(op.getAffineMap(), operands);
-  return *std::max_element(results.begin(), results.end());
-}
-
-REGISTER_MLIR_INTERPRETER_OP(apply);
-REGISTER_MLIR_INTERPRETER_OP(max);
-REGISTER_MLIR_INTERPRETER_OP(min);
-
-}  // namespace
-}  // namespace interpreter
-}  // namespace mlir
diff --git a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/arith.cc b/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/arith.cc
deleted file mode 100644
index d11c9d5342d2fb..00000000000000
--- a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/arith.cc
+++ /dev/null
@@ -1,305 +0,0 @@
-/* Copyright 2022 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "mlir/Dialect/Arith/IR/Arith.h"
-
-#include <variant>  // NOLINT
-
-#include "llvm/Support/ErrorHandling.h"
-#include "tools/mlir_interpreter/dialects/comparators.h"
-#include "tools/mlir_interpreter/dialects/cwise_math.h"
-#include "tools/mlir_interpreter/framework/interpreter.h"
-#include "tools/mlir_interpreter/framework/interpreter_value.h"
-#include "tools/mlir_interpreter/framework/interpreter_value_util.h"
-#include "tools/mlir_interpreter/framework/registration.h"
-
-namespace mlir {
-namespace interpreter {
-namespace {
-
-InterpreterValue bitcast(InterpreterState&, arith::BitcastOp op,
-                         const InterpreterValue& in) {
-  Type ty = op->getResultTypes()[0];
-  auto shapedTy = ty.dyn_cast<ShapedType>();
-  auto result = dispatchScalarType(ty, [&](auto dummy) -> InterpreterValue {
-    TensorOrMemref<decltype(dummy)> result;
-    result.view = {};
-    if (shapedTy) {
-      result.buffer = in.clone().buffer();
-    } else {
-      result.buffer = in.asUnitTensor().buffer();
-    }
-    return {result};
-  });
-  if (!shapedTy) {
-    return result.extractElement({});
-  }
-  auto& outView = result.view();
-  outView.strides = BufferView::getDefaultStrides(shapedTy.getShape());
-  outView.sizes = llvm::to_vector(shapedTy.getShape());
-  return result;
-}
-
-InterpreterValue constant(InterpreterState&, arith::ConstantOp constant) {
-  auto ty = constant->getResultTypes()[0];
-  auto shapedType = ty.dyn_cast<ShapedType>();
-  auto elemTy = shapedType ? shapedType.getElementType() : ty;
-  return dispatchScalarType(elemTy, [&](auto dummy) -> InterpreterValue {
-    using T = decltype(dummy);
-    if (shapedType) {
-      auto values =
-          constant.getValue().cast<DenseElementsAttr>().getValues<T>();
-      auto result = TensorOrMemref<T>::empty(shapedType.getShape());
-      auto valueIt = values.begin();
-      result.view.isVector = shapedType.isa<VectorType>();
-      for (const auto& index : result.view.indices(true)) {
-        result.at(index) = *valueIt;
-        ++valueIt;
-      }
-      return {result};
-    }
-
-    auto value = constant.getValue();
-    if (auto integer = value.dyn_cast<IntegerAttr>()) {
-      return {static_cast<T>(integer.getInt())};
-    }
-    if (auto floatValue = value.dyn_cast<FloatAttr>()) {
-      return {static_cast<T>(floatValue.getValueAsDouble())};
-    }
-
-    llvm_unreachable("unsupported constant type");
-  });
-}
-
-template <typename Op>
-InterpreterValue intCast(InterpreterState&, Op op,
-                         const InterpreterValue& arg) {
-  if (arg.isTensor()) {
-    return dispatchScalarType(
-        op->getResultTypes()[0], [&](auto dummy) -> InterpreterValue {
-          auto result = TensorOrMemref<decltype(dummy)>::emptyLike(arg.view());
-          for (const auto& index : result.view.indices()) {
-            result.at(index) =
-                static_cast<decltype(dummy)>(arg.extractElement(index).asInt());
-          }
-          return {result};
-        });
-  }
-
-  return dispatchScalarType(
-      op->getResultTypes()[0], [&](auto dummy) -> InterpreterValue {
-        return {static_cast<decltype(dummy)>(arg.asInt())};
-      });
-}
-
-template <typename Op>
-InterpreterValue floatCast(InterpreterState&, Op op,
-                           const InterpreterValue& arg) {
-  if (arg.isTensor()) {
-    return dispatchScalarType(
-        op->getResultTypes()[0], [&](auto dummy) -> InterpreterValue {
-          auto result = TensorOrMemref<decltype(dummy)>::emptyLike(arg.view());
-          for (const auto& index : result.view.indices()) {
-            result.at(index) = static_cast<decltype(dummy)>(
-                arg.extractElement(index).asDouble());
-          }
-          return {result};
-        });
-  }
-
-  return dispatchScalarType(
-      op->getResultTypes()[0], [&](auto dummy) -> InterpreterValue {
-        return {static_cast<decltype(dummy)>(arg.asDouble())};
-      });
-}
-
-llvm::SmallVector<InterpreterValue> uiToFP(
-    MutableArrayRef<InterpreterValue> args, mlir::Operation* op,
-    InterpreterState&) {
-  if (args[0].isTensor()) {
-    auto ty = op->getResultTypes()[0].cast<ShapedType>();
-    return {dispatchScalarType(
-        ty.getElementType(), [&](auto dummy) -> InterpreterValue {
-          auto result =
-              TensorOrMemref<decltype(dummy)>::emptyLike(args[0].view());
-          for (const auto& index : result.view.indices()) {
-            result.at(index) = static_cast<decltype(dummy)>(
-                args[0].extractElement(index).asUInt());
-          }
-          return {result};
-        })};
-  }
-
-  return {dispatchScalarType(
-      op->getResultTypes()[0], [&](auto dummy) -> InterpreterValue {
-        return {static_cast<decltype(dummy)>(args[0].asUInt())};
-      })};
-}
-
-InterpreterValue cmpI(InterpreterState&, arith::CmpIOp compare,
-                      const InterpreterValue& lhs,
-                      const InterpreterValue& rhs) {
-  switch (compare.getPredicate()) {
-    case arith::CmpIPredicate::eq:
-      return applyCwiseBinaryMap<Foeq>(lhs, rhs);
-    case arith::CmpIPredicate::ne:
-      return applyCwiseBinaryMap<Fone>(lhs, rhs);
-    case arith::CmpIPredicate::slt:
-      return applyCwiseBinaryMap<Folt>(lhs, rhs);
-    case arith::CmpIPredicate::sle:
-      return applyCwiseBinaryMap<Fole>(lhs, rhs);
-    case arith::CmpIPredicate::sgt:
-      return applyCwiseBinaryMap<Fogt>(lhs, rhs);
-    case arith::CmpIPredicate::sge:
-      return applyCwiseBinaryMap<Foge>(lhs, rhs);
-    case arith::CmpIPredicate::ult:
-      return applyCwiseBinaryMap<Iult>(lhs, rhs);
-    case arith::CmpIPredicate::ule:
-      return applyCwiseBinaryMap<Iule>(lhs, rhs);
-    case arith::CmpIPredicate::ugt:
-      return applyCwiseBinaryMap<Iugt>(lhs, rhs);
-    case arith::CmpIPredicate::uge:
-      return applyCwiseBinaryMap<Iuge>(lhs, rhs);
-  }
-}
-
-template <bool value>
-struct ConstFunctor : CwiseAll {
-  template <typename T>
-  static bool apply(T, T) {
-    return value;
-  }
-};
-
-InterpreterValue cmpF(InterpreterState&, arith::CmpFOp compare,
-                      const InterpreterValue& lhs,
-                      const InterpreterValue& rhs) {
-  switch (compare.getPredicate()) {
-    case arith::CmpFPredicate::AlwaysFalse:
-      return applyCwiseBinaryMap<ConstFunctor<false>>(lhs, rhs);
-    case arith::CmpFPredicate::OEQ:
-      return applyCwiseBinaryMap<Foeq>(lhs, rhs);
-    case arith::CmpFPredicate::OGT:
-      return applyCwiseBinaryMap<Fogt>(lhs, rhs);
-    case arith::CmpFPredicate::OGE:
-      return applyCwiseBinaryMap<Foge>(lhs, rhs);
-    case arith::CmpFPredicate::OLT:
-      return applyCwiseBinaryMap<Folt>(lhs, rhs);
-    case arith::CmpFPredicate::OLE:
-      return applyCwiseBinaryMap<Fole>(lhs, rhs);
-    case arith::CmpFPredicate::ONE:
-      return applyCwiseBinaryMap<Fone>(lhs, rhs);
-    case arith::CmpFPredicate::ORD:
-      return applyCwiseBinaryMap<Ford>(lhs, rhs);
-    case arith::CmpFPredicate::UEQ:
-      return applyCwiseBinaryMap<Fueq>(lhs, rhs);
-    case arith::CmpFPredicate::UGT:
-      return applyCwiseBinaryMap<Fugt>(lhs, rhs);
-    case arith::CmpFPredicate::UGE:
-      return applyCwiseBinaryMap<Fuge>(lhs, rhs);
-    case arith::CmpFPredicate::ULT:
-      return applyCwiseBinaryMap<Fult>(lhs, rhs);
-    case arith::CmpFPredicate::ULE:
-      return applyCwiseBinaryMap<Fule>(lhs, rhs);
-    case arith::CmpFPredicate::UNE:
-      return applyCwiseBinaryMap<Fune>(lhs, rhs);
-    case arith::CmpFPredicate::UNO:
-      return applyCwiseBinaryMap<Funo>(lhs, rhs);
-    case arith::CmpFPredicate::AlwaysTrue:
-      return applyCwiseBinaryMap<ConstFunctor<true>>(lhs, rhs);
-  }
-}
-
-InterpreterValue select(InterpreterState& state, arith::SelectOp,
-                        const InterpreterValue& cond,
-                        const InterpreterValue& trueValue,
-                        const InterpreterValue& falseValue) {
-  if (std::holds_alternative<bool>(cond.storage)) {
-    return std::get<bool>(cond.storage) ? trueValue : falseValue;
-  }
-
-  if (!cond.isTensor() && !cond.view().isVector) {
-    state.addFailure("select requires a scalar or vector argument");
-    return {};
-  }
-
-  auto ret = trueValue.clone();
-  for (const auto& index : cond.view().indices(/*includeVectorDims=*/true)) {
-    if (cond.extractElement(index).asInt() == 0) {
-      ret.insertElement(index, falseValue.extractElement(index));
-    }
-  }
-  return ret;
-}
-
-template <typename R>
-struct ExtFFunctor : CwiseFloat {
-  template <typename A>
-  static R apply(A v) {
-    return v;
-  }
-};
-
-InterpreterValue extF(InterpreterState&, arith::ExtFOp op,
-                      const InterpreterValue& in) {
-  return dispatchScalarType(
-      op->getResultTypes()[0], [&](auto dummy) -> InterpreterValue {
-        return applyCwiseMap<ExtFFunctor<decltype(dummy)>>(in);
-      });
-}
-
-REGISTER_MLIR_INTERPRETER_OP("arith.addf", applyCwiseBinaryMap<Plus>);
-REGISTER_MLIR_INTERPRETER_OP("arith.andi", applyCwiseBinaryMap<BitAnd>);
-REGISTER_MLIR_INTERPRETER_OP("arith.divf", applyCwiseBinaryMap<Divide>);
-REGISTER_MLIR_INTERPRETER_OP("arith.extui", uiToFP);
-REGISTER_MLIR_INTERPRETER_OP("arith.maxf", applyCwiseBinaryMap<Max>);
-REGISTER_MLIR_INTERPRETER_OP("arith.minf", applyCwiseBinaryMap<Min>);
-REGISTER_MLIR_INTERPRETER_OP("arith.mulf", applyCwiseBinaryMap<Multiply>);
-REGISTER_MLIR_INTERPRETER_OP("arith.negf", applyCwiseMap<Neg>);
-REGISTER_MLIR_INTERPRETER_OP("arith.ori", applyCwiseBinaryMap<BitOr>);
-REGISTER_MLIR_INTERPRETER_OP("arith.remf", applyCwiseBinaryMap<Remainder>);
-REGISTER_MLIR_INTERPRETER_OP("arith.subf", applyCwiseBinaryMap<Minus>);
-REGISTER_MLIR_INTERPRETER_OP("arith.uitofp", uiToFP);
-REGISTER_MLIR_INTERPRETER_OP("arith.xori", applyCwiseBinaryMap<BitXor>);
-REGISTER_MLIR_INTERPRETER_OP("arith.shrui",
-                             applyCwiseBinaryMap<ShiftRightLogical>);
-REGISTER_MLIR_INTERPRETER_OP("arith.shrsi",
-                             applyCwiseBinaryMap<ShiftRightArith>);
-REGISTER_MLIR_INTERPRETER_OP("arith.shli", applyCwiseBinaryMap<ShiftLeft>);
-
-// The float implementations support ints too.
-REGISTER_MLIR_INTERPRETER_OP("arith.addi", "arith.addf");
-REGISTER_MLIR_INTERPRETER_OP("arith.divsi", "arith.divf");
-REGISTER_MLIR_INTERPRETER_OP("arith.maxsi", "arith.maxf");
-REGISTER_MLIR_INTERPRETER_OP("arith.minsi", "arith.minf");
-REGISTER_MLIR_INTERPRETER_OP("arith.muli", "arith.mulf");
-REGISTER_MLIR_INTERPRETER_OP("arith.remsi", "arith.remf");
-REGISTER_MLIR_INTERPRETER_OP("arith.subi", "arith.subf");
-
-REGISTER_MLIR_INTERPRETER_OP(bitcast);
-REGISTER_MLIR_INTERPRETER_OP(cmpF);
-REGISTER_MLIR_INTERPRETER_OP(cmpI);
-REGISTER_MLIR_INTERPRETER_OP(constant);
-REGISTER_MLIR_INTERPRETER_OP(extF);
-REGISTER_MLIR_INTERPRETER_OP(floatCast<arith::FPToSIOp>);
-REGISTER_MLIR_INTERPRETER_OP(intCast<arith::ExtSIOp>);
-REGISTER_MLIR_INTERPRETER_OP(intCast<arith::IndexCastOp>);
-REGISTER_MLIR_INTERPRETER_OP(intCast<arith::SIToFPOp>);
-REGISTER_MLIR_INTERPRETER_OP(intCast<arith::TruncIOp>);
-REGISTER_MLIR_INTERPRETER_OP(select);
-
-}  // namespace
-}  // namespace interpreter
-}  // namespace mlir
diff --git a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/bufferization.cc b/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/bufferization.cc
deleted file mode 100644
index 1b65950c4d79c7..00000000000000
--- a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/bufferization.cc
+++ /dev/null
@@ -1,69 +0,0 @@
-/* Copyright 2022 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "mlir/Dialect/Bufferization/IR/Bufferization.h"
-
-#include <algorithm>  // NOLINT
-#include <optional>   // NOLINT
-
-#include "tools/mlir_interpreter/dialects/util.h"
-#include "tools/mlir_interpreter/framework/interpreter.h"
-#include "tools/mlir_interpreter/framework/registration.h"
-
-namespace mlir {
-namespace interpreter {
-namespace {
-
-InterpreterValue toTensor(InterpreterState&, bufferization::ToTensorOp,
-                          const InterpreterValue& in) {
-  return in.clone();
-}
-
-InterpreterValue toMemref(InterpreterState&, bufferization::ToMemrefOp,
-                          const InterpreterValue& in) {
-  return in;
-}
-
-InterpreterValue allocTensor(
-    InterpreterState&, bufferization::AllocTensorOp alloc,
-    ArrayRef<int64_t> dynamicSizes, std::optional<InterpreterValue> copy,
-    const std::optional<InterpreterValue>& /*sizeHint*/) {
-  auto ty = alloc->getResultTypes().front().cast<mlir::ShapedType>();
-  auto shape = replaceDynamicVals(ty.getShape(), dynamicSizes);
-
-  if (copy) {
-    return copy->clone();
-  }
-  return InterpreterValue::makeTensor(ty.getElementType(), shape);
-}
-
-InterpreterValue clone(InterpreterState& state, bufferization::CloneOp,
-                       const InterpreterValue& in) {
-  if (auto* stats = state.getOptions().stats) {
-    stats->heapSize += in.buffer()->getByteSize();
-    stats->peakHeapSize = std::max(stats->peakHeapSize, stats->heapSize);
-    ++stats->numAllocations;
-  }
-  return in.clone();
-}
-
-REGISTER_MLIR_INTERPRETER_OP(allocTensor);
-REGISTER_MLIR_INTERPRETER_OP(clone);
-REGISTER_MLIR_INTERPRETER_OP(toMemref);
-REGISTER_MLIR_INTERPRETER_OP(toTensor);
-
-}  // namespace
-}  // namespace interpreter
-}  // namespace mlir
diff --git a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/builtin.cc b/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/builtin.cc
deleted file mode 100644
index 19e039db0c7108..00000000000000
--- a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/builtin.cc
+++ /dev/null
@@ -1,54 +0,0 @@
-/* Copyright 2022 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "tools/mlir_interpreter/framework/interpreter.h"
-#include "tools/mlir_interpreter/framework/registration.h"
-
-namespace mlir {
-namespace interpreter {
-namespace {
-
-llvm::SmallVector<InterpreterValue> unrealizedConversionCast(
-    MutableArrayRef<InterpreterValue> args, mlir::Operation* op,
-    InterpreterState&) {
-  auto resultTy = op->getResultTypes()[0];
-  auto operandTy = op->getOperandTypes()[0];
-  if (resultTy == operandTy) {
-    return {args[0]};
-  }
-
-  if (auto r = llvm::dyn_cast<ShapedType>(resultTy)) {
-    if (auto o = llvm::dyn_cast<ShapedType>(operandTy)) {
-      if (verifyCompatibleShapes({o, r}).succeeded()) {
-        return {dispatchScalarType(r, [&](auto dummy) -> InterpreterValue {
-          TensorOrMemref<decltype(dummy)> result;
-          result.view = args[0].view();
-          result.buffer = args[0].buffer();
-          return {result};
-        })};
-      }
-    }
-  }
-
-  llvm::errs() << "Unimplemented cast: " << *op << "\n";
-  llvm_unreachable("unimplemented cast");
-}
-
-REGISTER_MLIR_INTERPRETER_OP("builtin.unrealized_conversion_cast",
-                             unrealizedConversionCast);
-
-}  // namespace
-}  // namespace interpreter
-}  // namespace mlir
diff --git a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/comparators.h b/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/comparators.h
deleted file mode 100644
index 397f95c36c2a77..00000000000000
--- a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/comparators.h
+++ /dev/null
@@ -1,104 +0,0 @@
-/* Copyright 2022 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef MLIR_HLO_TOOLS_MLIR_INTERPRETER_DIALECTS_COMPARATORS_H_
-#define MLIR_HLO_TOOLS_MLIR_INTERPRETER_DIALECTS_COMPARATORS_H_
-
-#include <complex>
-#include <type_traits>
-
-#include "llvm/Support/ErrorHandling.h"
-#include "tools/mlir_interpreter/framework/interpreter_value_util.h"
-
-namespace mlir {
-namespace interpreter {
-
-// Despite the name, this works on integers and complex too.
-template <int64_t v, bool r, bool nan_result>
-struct FloatCompare : CwiseAll {
-  template <typename T>
-  static bool apply(T a, T b) {
-    if (isnan(a) || isnan(b)) return nan_result;
-    if constexpr (v == 0) {
-      // For complex eq/ne.
-      return (a == b) == r;
-    } else if constexpr (std::is_floating_point_v<T> || std::is_integral_v<T>) {
-      auto cmp = a > b ? 1 : (a < b ? -1 : 0);
-      return (cmp == v) == r;
-    } else {
-      llvm_unreachable("operation not supported for this type");
-    }
-  }
-
-  template <typename T>
-  static bool isnan(T a) {
-    return std::isnan(a);
-  }
-  template <typename T>
-  static bool isnan(std::complex<T> a) {
-    return std::isnan(std::real(a)) || std::isnan(std::imag(a));
-  }
-};
-
-using Foeq = FloatCompare<0, true, false>;
-using Foge = FloatCompare<-1, false, false>;
-using Fogt = FloatCompare<1, true, false>;
-using Fole = FloatCompare<1, false, false>;
-using Folt = FloatCompare<-1, true, false>;
-using Fone = FloatCompare<0, false, false>;
-using Ford = FloatCompare<99, false, false>;
-using Fueq = FloatCompare<0, true, true>;
-using Fuge = FloatCompare<-1, false, true>;
-using Fugt = FloatCompare<1, true, true>;
-using Fule = FloatCompare<1, false, true>;
-using Fult = FloatCompare<-1, true, true>;
-using Fune = FloatCompare<0, false, true>;
-using Funo = FloatCompare<99, true, true>;
-
-template <int64_t v, bool r>
-struct UnsignedCompare : CwiseInt {
-  template <typename T>
-  static bool apply(T a, T b) {
-    using U = std::make_unsigned_t<T>;
-    auto aU = static_cast<U>(a);
-    auto bU = static_cast<U>(b);
-    auto cmp = aU > bU ? 1 : (aU < bU ? -1 : 0);
-    return (cmp == v) == r;
-  }
-};
-
-using Iuge = UnsignedCompare<-1, false>;
-using Iule = UnsignedCompare<1, false>;
-using Iugt = UnsignedCompare<1, true>;
-using Iult = UnsignedCompare<-1, true>;
-
-struct Iumax {
-  template <typename T>
-  static T apply(T a, T b) {
-    return Iuge::apply(a, b) ? a : b;
-  }
-};
-
-struct Iumin {
-  template <typename T>
-  static T apply(T a, T b) {
-    return Iule::apply(a, b) ? a : b;
-  }
-};
-
-}  // namespace interpreter
-}  // namespace mlir
-
-#endif  // MLIR_HLO_TOOLS_MLIR_INTERPRETER_DIALECTS_COMPARATORS_H_
diff --git a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/complex.cc b/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/complex.cc
deleted file mode 100644
index 2f420ab68d1fec..00000000000000
--- a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/complex.cc
+++ /dev/null
@@ -1,63 +0,0 @@
-/* Copyright 2023 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "mlir/Dialect/Complex/IR/Complex.h"
-
-#include "tools/mlir_interpreter/dialects/cwise_math.h"
-#include "tools/mlir_interpreter/framework/interpreter_value_util.h"
-#include "tools/mlir_interpreter/framework/registration.h"
-
-namespace mlir {
-namespace interpreter {
-namespace {
-
-InterpreterValue constant(InterpreterState&, complex::ConstantOp constant) {
-  auto ty = constant->getResultTypes()[0];
-  return dispatchScalarType(ty, [&](auto dummy) -> InterpreterValue {
-    if constexpr (is_complex_v<decltype(dummy)>) {
-      using T = typename decltype(dummy)::value_type;
-      auto values =
-          llvm::to_vector(constant.getValue().getAsValueRange<FloatAttr>());
-      return {decltype(dummy){static_cast<T>(values[0].convertToDouble()),
-                              static_cast<T>(values[1].convertToDouble())}};
-    } else {
-      llvm_unreachable("invalid constant");
-    }
-  });
-}
-
-REGISTER_MLIR_INTERPRETER_OP("complex.abs", "math.absf");
-REGISTER_MLIR_INTERPRETER_OP("complex.add", "arith.addf");
-REGISTER_MLIR_INTERPRETER_OP("complex.cos", applyCwiseMap<Cos>);
-REGISTER_MLIR_INTERPRETER_OP("complex.create", applyCwiseBinaryMap<Complex>);
-REGISTER_MLIR_INTERPRETER_OP("complex.div", applyCwiseBinaryMap<Divide>);
-REGISTER_MLIR_INTERPRETER_OP("complex.exp", applyCwiseMap<Exp>);
-REGISTER_MLIR_INTERPRETER_OP("complex.expm1", applyCwiseMap<ExpM1>);
-REGISTER_MLIR_INTERPRETER_OP("complex.im", applyCwiseMap<Imag>);
-REGISTER_MLIR_INTERPRETER_OP("complex.log", applyCwiseMap<Log>);
-REGISTER_MLIR_INTERPRETER_OP("complex.log1p", applyCwiseMap<Log1P>);
-REGISTER_MLIR_INTERPRETER_OP("complex.mul", applyCwiseBinaryMap<Multiply>);
-REGISTER_MLIR_INTERPRETER_OP("complex.neg", applyCwiseMap<Neg>);
-REGISTER_MLIR_INTERPRETER_OP("complex.pow", applyCwiseBinaryMap<Power>);
-REGISTER_MLIR_INTERPRETER_OP("complex.re", applyCwiseMap<Real>);
-REGISTER_MLIR_INTERPRETER_OP("complex.rsqrt", applyCwiseMap<RSqrt>);
-REGISTER_MLIR_INTERPRETER_OP("complex.sin", applyCwiseMap<Sin>);
-REGISTER_MLIR_INTERPRETER_OP("complex.sqrt", applyCwiseMap<Sqrt>);
-REGISTER_MLIR_INTERPRETER_OP("complex.tanh", applyCwiseMap<TanH>);
-REGISTER_MLIR_INTERPRETER_OP(constant);
-
-}  // namespace
-}  // namespace interpreter
-}  // namespace mlir
diff --git a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/cwise_math.h b/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/cwise_math.h
deleted file mode 100644
index 3f2274116f5dbc..00000000000000
--- a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/cwise_math.h
+++ /dev/null
@@ -1,239 +0,0 @@
-/* Copyright 2023 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef MLIR_HLO_TOOLS_MLIR_INTERPRETER_DIALECTS_CWISE_MATH_H_
-#define MLIR_HLO_TOOLS_MLIR_INTERPRETER_DIALECTS_CWISE_MATH_H_
-
-#include <complex>
-#include <type_traits>
-
-#include "tools/mlir_interpreter/framework/interpreter_value_util.h"
-
-namespace mlir {
-namespace interpreter {
-
-struct ATan2 : CwiseReal {
-  template <typename T>
-  static T apply(T a, T b) {
-    return std::atan2(a, b);
-  }
-};
-
-struct Clz : CwiseInt {
-  template <typename T>
-  static T apply(T a) {
-    if (!a) {
-      // Return something well-defined for zeroes.
-      return sizeof(T{}) * CHAR_BIT;
-    }
-    return __builtin_clzl(
-               static_cast<uint64_t>(static_cast<std::make_unsigned_t<T>>(a))) -
-           (sizeof(uint64_t) - sizeof(T{})) * CHAR_BIT;
-  }
-};
-
-struct Ctz : CwiseInt {
-  template <typename T>
-  static T apply(T a) {
-    if (!a) {
-      // Return something well-defined for zeroes.
-      return sizeof(T{}) * CHAR_BIT;
-    }
-    return __builtin_ctzl(static_cast<uint64_t>(a));
-  }
-};
-
-struct Complex : CwiseFloat {
-  template <typename T>
-  static std::complex<T> apply(T a, T b) {
-    return {a, b};
-  }
-};
-
-struct Max : CwiseReal {
-  template <typename T>
-  static T apply(T a, T b) {
-    return std::max(a, b);
-  }
-};
-
-struct Min : CwiseReal {
-  template <typename T>
-  static T apply(T a, T b) {
-    return std::min(a, b);
-  }
-};
-
-struct Power : CwiseArith {
-  template <typename T>
-  static T apply(T a, T b) {
-    if constexpr (std::is_integral_v<T>) {
-      if constexpr (std::is_signed_v<T>) {
-        if (b < 0) {
-          return a == 1 ? 1 : 0;
-        }
-      }
-      T result = 1;
-      while (b > 0) {
-        if (b & 1) result *= a;
-        b >>= 1;
-        if (b) {
-          a *= a;
-        }
-      }
-      return result;
-    } else {
-      return std::pow(a, b);
-    }
-  }
-};
-
-struct Remainder : CwiseReal {
-  template <typename T>
-  static T apply(T a, T b) {
-    if constexpr (std::is_integral_v<T>) {
-      return a % b;
-    } else {
-      return std::fmod(a, b);
-    }
-  }
-};
-
-struct ShiftRightArith : CwiseInt {
-  template <typename T>
-  static T apply(T a, T b) {
-    return b >= sizeof(T) * CHAR_BIT ? 0 : (a >> b);
-  }
-};
-
-struct ShiftRightLogical : CwiseInt {
-  template <typename T>
-  static T apply(T a, T b) {
-    return b >= sizeof(T) * CHAR_BIT
-               ? 0
-               : static_cast<std::make_unsigned_t<T>>(a) >> b;
-  }
-};
-
-struct ShiftLeft : CwiseInt {
-  template <typename T>
-  static T apply(T a, T b) {
-    return b >= sizeof(T) * CHAR_BIT ? 0 : (a << b);
-  }
-};
-
-namespace detail {
-template <template <typename T> class F, typename trait>
-struct Wrap : trait {
-  template <typename T>
-  static T apply(T a, T b) {
-    return F<T>{}(a, b);
-  }
-};
-}  // namespace detail
-
-using Plus = detail::Wrap<std::plus, CwiseArith>;
-using Divide = detail::Wrap<std::divides, CwiseArith>;
-using Multiply = detail::Wrap<std::multiplies, CwiseArith>;
-using Minus = detail::Wrap<std::minus, CwiseAll>;
-using BitAnd = detail::Wrap<std::bit_and, CwiseIntegral>;
-using BitOr = detail::Wrap<std::bit_or, CwiseIntegral>;
-using BitXor = detail::Wrap<std::bit_xor, CwiseIntegral>;
-
-struct RSqrt : CwiseNonIntegral {
-  template <typename T>
-  static T apply(T a) {
-    return static_cast<T>(T{1} / std::sqrt(a));
-  }
-};
-
-#define DEFINE_WRAPPER(name, std_fun, trait) \
-  struct name : trait {                      \
-    template <typename T>                    \
-    static auto apply(T a) {                 \
-      return std_fun(a);                     \
-    }                                        \
-  };
-
-DEFINE_WRAPPER(ATan, std::atan, CwiseNonIntegral);
-DEFINE_WRAPPER(Abs, std::abs, CwiseSignedOrComplex);
-DEFINE_WRAPPER(Cbrt, std::cbrt, CwiseFloat);
-DEFINE_WRAPPER(Ceil, std::ceil, CwiseFloat);
-DEFINE_WRAPPER(Cos, std::cos, CwiseNonIntegral);
-DEFINE_WRAPPER(Erf, std::erf, CwiseFloat);
-DEFINE_WRAPPER(Exp, std::exp, CwiseNonIntegral);
-DEFINE_WRAPPER(Exp2, std::exp2, CwiseFloat);
-DEFINE_WRAPPER(Floor, std::floor, CwiseFloat);
-DEFINE_WRAPPER(Imag, std::imag, CwiseComplex);
-DEFINE_WRAPPER(IsFinite, std::isfinite, CwiseFloat);
-DEFINE_WRAPPER(Log, std::log, CwiseNonIntegral);
-DEFINE_WRAPPER(Log10, std::log10, CwiseNonIntegral);
-DEFINE_WRAPPER(Log2, std::log2, CwiseFloat);
-DEFINE_WRAPPER(NearbyInt, std::nearbyint, CwiseFloat);
-DEFINE_WRAPPER(Neg, std::negate<T>{}, CwiseSignedOrComplex);
-DEFINE_WRAPPER(Real, std::real, CwiseComplex);
-DEFINE_WRAPPER(Round, std::round, CwiseFloat);
-DEFINE_WRAPPER(Sin, std::sin, CwiseNonIntegral);
-DEFINE_WRAPPER(Sqrt, std::sqrt, CwiseNonIntegral);
-DEFINE_WRAPPER(Tan, std::tan, CwiseNonIntegral);
-DEFINE_WRAPPER(TanH, std::tanh, CwiseNonIntegral);
-DEFINE_WRAPPER(Trunc, std::trunc, CwiseFloat);
-
-#undef DEFINE_WRAPPER
-
-struct ExpM1 : CwiseNonIntegral {
-  template <typename T>
-  static T apply(T a) {
-    if constexpr (std::is_floating_point_v<T>) {
-      return std::expm1(a);
-    } else {
-      auto r = std::real(a);
-      auto i = std::imag(a);
-      auto s = std::sin(i / 2);
-      auto real = std::expm1(r) * std::cos(i) - 2 * s * s;
-      auto imag = std::exp(r) * std::sin(i);
-      return {real, imag};
-    }
-  }
-};
-
-struct Log1P : CwiseNonIntegral {
-  template <typename T>
-  static T apply(T a) {
-    if constexpr (std::is_floating_point_v<T>) {
-      return std::log1p(a);
-    } else {
-      auto r = std::real(a);
-      auto i = std::imag(a);
-      auto l = std::hypot(r + 1, i);
-      auto real = std::log(l);
-      auto imag = std::atan2(i, r + 1);
-      return {real, imag};
-    }
-  }
-};
-
-struct PopCount : CwiseInt {
-  template <typename T>
-  static T apply(T a) {
-    return __builtin_popcountl(
-        static_cast<uint64_t>(static_cast<std::make_unsigned_t<T>>(a)));
-  }
-};
-
-}  // namespace interpreter
-}  // namespace mlir
-
-#endif  // MLIR_HLO_TOOLS_MLIR_INTERPRETER_DIALECTS_CWISE_MATH_H_
diff --git a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/deallocation.cc b/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/deallocation.cc
deleted file mode 100644
index 8db1d3b9d43cbe..00000000000000
--- a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/deallocation.cc
+++ /dev/null
@@ -1,88 +0,0 @@
-/* Copyright 2023 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "deallocation/IR/deallocation_ops.h"
-#include "tools/mlir_interpreter/framework/registration.h"
-#include "tools/mlir_interpreter/framework/tensor_or_memref.h"
-
-namespace mlir {
-namespace interpreter {
-namespace {
-
-InterpreterValue getBuffer(InterpreterState&, deallocation::GetBufferOp,
-                           const InterpreterValue& memref) {
-  if (!memref.buffer()) return {intptr_t{0}};
-  return {reinterpret_cast<intptr_t>(memref.buffer()->at(0, 0))};
-}
-
-InterpreterValue null(InterpreterState&, deallocation::NullOp) {
-  TensorOrMemref<uint8_t> r;
-  r.buffer = nullptr;
-  r.view.sizes = {0};
-  return {r};
-}
-
-InterpreterValue own(InterpreterState&, deallocation::OwnOp,
-                     const InterpreterValue& alloc) {
-  return alloc;
-}
-
-void freeAlloc(InterpreterState& state, deallocation::FreeOp op,
-               const InterpreterValue& alloc) {
-  if (auto* stats = state.getOptions().stats) {
-    stats->heapSize -= alloc.buffer()->getByteSize();
-    ++stats->numDeallocations;
-  }
-  alloc.buffer()->deallocate(op);
-}
-
-SmallVector<InterpreterValue> retain(InterpreterState& state,
-                                     deallocation::RetainOp op,
-                                     ArrayRef<InterpreterValue> values,
-                                     ArrayRef<InterpreterValue> owned) {
-  SmallVector<InterpreterValue> result(values.size(), null(state, {}));
-
-  llvm::SmallBitVector used(owned.size());
-  for (auto [index, v] : llvm::enumerate(values)) {
-    for (auto [ownedIndex, o] : llvm::enumerate(owned)) {
-      if (used[ownedIndex] || o.buffer() == nullptr) continue;
-      if (v.buffer() == o.buffer()) {
-        used[ownedIndex] = true;
-        result[index] = o;
-      }
-    }
-  }
-
-  for (int64_t i = 0; i < owned.size(); ++i) {
-    if (!used[i] && owned[i].buffer()) {
-      if (auto* stats = state.getOptions().stats) {
-        stats->heapSize -= owned[i].buffer()->getByteSize();
-        ++stats->numDeallocations;
-      }
-      owned[i].buffer()->deallocate(op);
-    }
-  }
-  return result;
-}
-
-REGISTER_MLIR_INTERPRETER_OP(getBuffer);
-REGISTER_MLIR_INTERPRETER_OP(own);
-REGISTER_MLIR_INTERPRETER_OP(null);
-REGISTER_MLIR_INTERPRETER_OP(retain);
-REGISTER_MLIR_INTERPRETER_OP(freeAlloc);
-
-}  // namespace
-}  // namespace interpreter
-}  // namespace mlir
diff --git a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/func.cc b/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/func.cc
deleted file mode 100644
index 3852a0edb5eeac..00000000000000
--- a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/func.cc
+++ /dev/null
@@ -1,116 +0,0 @@
-/* Copyright 2023 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include <dlfcn.h>
-
-#include <type_traits>
-
-#include "mlir/Dialect/Func/IR/FuncOps.h"
-#include "tools/mlir_interpreter/dialects/util.h"
-#include "tools/mlir_interpreter/framework/interpreter.h"
-#include "tools/mlir_interpreter/framework/registration.h"
-
-namespace mlir {
-namespace interpreter {
-namespace {
-
-template <typename T>
-bool typeMatches(mlir::Type type) {
-  if constexpr (std::is_same_v<T, float>) {
-    return type.isF32();
-  } else if constexpr (std::is_same_v<T, double>) {
-    return type.isF64();
-  } else {
-    return false;
-  }
-}
-
-template <typename Dummy>
-bool typesMatch(ArrayRef<mlir::Type> types) {
-  return types.empty();
-}
-
-template <typename Dummy, typename T, typename... R>
-bool typesMatch(ArrayRef<mlir::Type> types) {
-  if (types.empty() || !typeMatches<T>(types.front())) return false;
-  return typesMatch<Dummy, R...>(types.drop_front());
-}
-
-template <int n, typename... Args>
-using Arg = std::tuple_element_t<n, std::tuple<Args...>>;
-
-template <typename Ret, typename... Args>
-bool tryCall(void* sym, func::FuncOp callee,
-             MutableArrayRef<InterpreterValue> args, InterpreterValue& ret) {
-  if (args.size() != callee.getNumArguments() || callee.getNumResults() != 1) {
-    return false;
-  }
-
-  if (!typeMatches<Ret>(callee.getResultTypes()[0])) {
-    return false;
-  }
-  if (!typesMatch<void, Args...>(callee.getArgumentTypes())) {
-    return false;
-  }
-
-  static_assert(sizeof...(Args) <= 2);
-  using FnType = Ret (*)(Args...);
-  auto fn = reinterpret_cast<FnType>(sym);
-  constexpr int n = sizeof...(Args);
-
-  if constexpr (n == 1) {
-    ret = {fn(std::get<Arg<0, Args...>>(args[0].storage))};
-  } else {
-    static_assert(n == 2);
-    ret = {fn(std::get<Arg<0, Args...>>(args[0].storage),
-              std::get<Arg<1, Args...>>(args[1].storage))};
-  }
-  return true;
-}
-
-llvm::SmallVector<InterpreterValue> call(MutableArrayRef<InterpreterValue> args,
-                                         mlir::Operation* op,
-                                         InterpreterState& state) {
-  auto call = llvm::cast<func::CallOp>(op);
-  auto callee =
-      llvm::cast<func::FuncOp>(state.getSymbols().lookup(call.getCallee()));
-  if (callee->getRegion(0).hasOneBlock()) {
-    return interpret(state, callee.getRegion(), args);
-  }
-
-  void* sym = dlsym(RTLD_DEFAULT, callee.getSymName().str().c_str());
-  if (sym == nullptr) {
-    state.addFailure("callee not found");
-    return {};
-  }
-
-  InterpreterValue result;
-  if (tryCall<float, float>(sym, callee, args, result) ||
-      tryCall<float, float, float>(sym, callee, args, result) ||
-      tryCall<double, double>(sym, callee, args, result) ||
-      tryCall<double, double, double>(sym, callee, args, result)) {
-    return {result};
-  }
-
-  state.addFailure("unsupported call target");
-  return {};
-}
-
-REGISTER_MLIR_INTERPRETER_OP("func.call", call);
-REGISTER_MLIR_INTERPRETER_OP("func.return", noOpTerminator);
-
-}  // namespace
-}  // namespace interpreter
-}  // namespace mlir
diff --git a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/gml_st.cc b/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/gml_st.cc
deleted file mode 100644
index b5a88b78abb094..00000000000000
--- a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/gml_st.cc
+++ /dev/null
@@ -1,45 +0,0 @@
-/* Copyright 2023 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "gml_st/IR/gml_st_ops.h"
-#include "tools/mlir_interpreter/dialects/util.h"
-#include "tools/mlir_interpreter/framework/interpreter.h"
-#include "tools/mlir_interpreter/framework/interpreter_value.h"
-#include "tools/mlir_interpreter/framework/registration.h"
-
-namespace mlir {
-namespace interpreter {
-namespace {
-
-llvm::SmallVector<InterpreterValue> fusion(InterpreterState& state,
-                                           gml_st::FusionOp op,
-                                           ArrayRef<InterpreterValue> inputs,
-                                           ArrayRef<InterpreterValue> inits) {
-  llvm::SmallVector<InterpreterValue> args;
-  llvm::append_range(args, inputs);
-  llvm::append_range(args, inits);
-  auto result = interpret(state, op.getRegion(), args);
-  if (op.getNumResults() == 0) {
-    result.clear();
-  }
-  return result;
-}
-
-REGISTER_MLIR_INTERPRETER_OP(fusion);
-REGISTER_MLIR_INTERPRETER_OP("gml_st.yield", noOpTerminator);
-
-}  // namespace
-}  // namespace interpreter
-}  // namespace mlir
diff --git a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/linalg.cc b/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/linalg.cc
deleted file mode 100644
index ef04f0ea08dc0a..00000000000000
--- a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/linalg.cc
+++ /dev/null
@@ -1,310 +0,0 @@
-/* Copyright 2022 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "mlir/Dialect/Linalg/IR/Linalg.h"
-
-// clang-format erroneously puts the Linalg header above.
-#include <algorithm>   // NOLINT
-#include <cstdint>     // NOLINT
-#include <functional>  // NOLINT
-#include <memory>      // NOLINT
-
-#include "llvm/ADT/STLExtras.h"
-#include "tools/mlir_interpreter/dialects/util.h"
-#include "tools/mlir_interpreter/framework/interpreter.h"
-#include "tools/mlir_interpreter/framework/interpreter_value.h"
-#include "tools/mlir_interpreter/framework/registration.h"
-
-namespace mlir {
-namespace interpreter {
-namespace {
-
-class IterationIndexSideChannel : public InterpreterSideChannel {
- public:
-  explicit IterationIndexSideChannel(ArrayRef<int64_t> indices)
-      : indices(indices) {}
-  ArrayRef<int64_t> getIndices() const { return indices; }
-
- private:
-  ArrayRef<int64_t> indices;
-};
-
-llvm::SmallVector<InterpreterValue> broadcast(InterpreterState&,
-                                              linalg::BroadcastOp broadcast,
-                                              const InterpreterValue& input,
-                                              InterpreterValue init) {
-  auto broadcastDims = llvm::to_vector(broadcast.getDimensions());
-  llvm::sort(broadcastDims);
-
-  auto outShape = init.view().sizes;
-  auto out = input;
-  auto& outView = out.view();
-  for (int64_t dim : broadcastDims) {
-    outView.sizes.insert(outView.sizes.begin() + dim, outShape[dim]);
-    outView.strides.insert(outView.strides.begin() + dim, 0);
-  }
-
-  if (broadcast.getNumResults() == 1) {
-    return {out};
-  }
-
-  init.fill([&](llvm::ArrayRef<int64_t> indices) {
-    return out.extractElement(indices);
-  });
-  return {};
-}
-
-llvm::SmallVector<InterpreterValue> generic(
-    InterpreterState& state, linalg::GenericOp generic,
-    MutableArrayRef<InterpreterValue> inputs,
-    MutableArrayRef<InterpreterValue> outputsRef) {
-  SmallVector<int64_t> shapes;
-  for (auto& value : llvm::concat<InterpreterValue>(inputs, outputsRef)) {
-    if (value.isTensor() /* (or memref) */) {
-      llvm::append_range(
-          shapes, ArrayRef<int64_t>(value.view().sizes)
-                      .drop_back(value.view().numVectorDims.value_or(0)));
-    }
-  }
-  auto ranges = generic.getShapesToLoopsMap().compose(shapes);
-
-  llvm::SmallVector<InterpreterValue> outputs;
-  for (int64_t output = 0; output < outputsRef.size(); ++output) {
-    outputs.push_back(getInitOperand(generic.getOutputs(), output, outputsRef));
-  }
-
-  llvm::SmallVector<int64_t> ivs(ranges.size());
-  InterpreterScope scope(state);
-  scope.setSideChannel(std::make_shared<IterationIndexSideChannel>(ivs));
-
-  auto indexingMaps = generic.getIndexingMapsArray();
-  auto outputMaps = ArrayRef<AffineMap>(indexingMaps).drop_front(inputs.size());
-  std::function<void(int64_t)> run;
-  run = [&](int64_t loopIndex) {
-    // Abort recursion if we encountered some error previously.
-    if (state.hasFailure()) return;
-
-    if (loopIndex < ranges.size()) {
-      for (int64_t index = 0; index < ranges[loopIndex]; ++index) {
-        ivs[loopIndex] = index;
-        run(loopIndex + 1);
-      }
-    } else {
-      llvm::SmallVector<InterpreterValue> bbargs;
-      // Build bbargs: 1. inputs, 2. outputs.
-      for (auto [input, map] : llvm::zip(inputs, indexingMaps)) {
-        auto indices = evalAffineMap(map, ivs);
-        bbargs.push_back(input.extractElement(indices));
-      }
-      llvm::SmallVector<llvm::SmallVector<int64_t>> outputIndices;
-      for (auto [output, map] : llvm::zip(outputs, outputMaps)) {
-        auto& indices = outputIndices.emplace_back(evalAffineMap(map, ivs));
-        bbargs.push_back(output.extractElement(indices));
-      }
-      // Evaluate region.
-      auto yielded = interpret(state, generic.getRegion(), bbargs);
-      if (state.hasFailure()) return;
-      // Insert yielded values in the outputs.
-      for (auto [output, indices, yield] :
-           llvm::zip(outputs, outputIndices, yielded)) {
-        output.insertElement(indices, yield);
-      }
-    }
-  };
-  run(0);
-
-  if (generic.getNumResults() == 0) return {};
-  return outputs;
-}
-
-llvm::SmallVector<InterpreterValue> map(InterpreterState& state,
-                                        linalg::MapOp op,
-                                        ArrayRef<InterpreterValue> inputs,
-                                        const InterpreterValue& init) {
-  InterpreterValue output =
-      op.getInit().getType().isa<TensorType>() ? init.clone() : init;
-
-  InterpreterScope scope(state);
-  SmallVector<int64_t> ivs(output.view().rank());
-  scope.setSideChannel(std::make_shared<IterationIndexSideChannel>(ivs));
-  for (const auto& indices : output.view().indices()) {
-    std::copy(indices.begin(), indices.end(), ivs.begin());
-    llvm::SmallVector<InterpreterValue> args;
-    for (auto& input : inputs) {
-      args.push_back(input.extractElement(indices));
-    }
-    auto yielded = interpret(state, op.getRegion(), args);
-    if (state.hasFailure()) break;
-    output.insertElement(indices, yielded[0]);
-  }
-
-  if (op.getNumResults() == 0) return {};
-  return {output};
-}
-
-llvm::SmallVector<InterpreterValue> reduce(InterpreterState& state,
-                                           linalg::ReduceOp reduce,
-                                           ArrayRef<InterpreterValue> ins,
-                                           ArrayRef<InterpreterValue> inits) {
-  auto dims = reduce.getDimensions();
-  SmallVector<InterpreterValue> output;
-  for (auto [ty, init] : llvm::zip(reduce.getInits().getTypes(), inits)) {
-    output.push_back(ty.isa<TensorType>() ? init.clone() : init);
-  }
-  for (const auto& index : ins[0].view().indices()) {
-    auto dstIndex = index;
-    for (int64_t dim : llvm::reverse(dims)) {
-      dstIndex.erase(dstIndex.begin() + dim);
-    }
-
-    SmallVector<InterpreterValue> args;
-    for (auto& in : ins) {
-      args.push_back(in.extractElement(index));
-    }
-    for (auto& out : output) {
-      args.push_back(out.extractElement(dstIndex));
-    }
-    auto newValues = interpret(state, reduce.getRegion(), args);
-    if (state.hasFailure()) return {};
-
-    for (auto [out, value] : llvm::zip(output, newValues)) {
-      out.insertElement(dstIndex, value);
-    }
-  }
-  if (reduce->getNumResults() == 0) return {};
-  return output;
-}
-
-llvm::SmallVector<InterpreterValue> fill(InterpreterState&, linalg::FillOp op,
-                                         const InterpreterValue& value,
-                                         const InterpreterValue& init) {
-  // TODO(jreiffers): Support variadic fill.
-  InterpreterValue output = getInitOperand(op.getOutputs(), 0, {init});
-  output.fill([&](llvm::ArrayRef<int64_t>) { return value; });
-  if (op.getNumResults() == 0) return {};
-  return {output};
-}
-
-int64_t index(InterpreterState& state, linalg::IndexOp index) {
-  return state.getTopScope()
-      ->getSideChannel<IterationIndexSideChannel>()
-      ->getIndices()[index.getDim()];
-}
-
-SmallVector<InterpreterValue> matmul(InterpreterState& state,
-                                     linalg::MatmulOp matmul,
-                                     ArrayRef<InterpreterValue> inputs,
-                                     const InterpreterValue& init) {
-  if (inputs.size() != 2) {
-    state.addFailure("Invalid matmul");
-    return {};
-  }
-  const auto& lhs = inputs[0];
-  const auto& rhs = inputs[1];
-  auto ty = matmul.getOutputs()[0].getType();
-  auto result = ty.isa<TensorType>() ? init.clone() : init;
-  dispatchScalarType(ty, [&](auto dummy) {
-    using TT = TensorOrMemref<decltype(dummy)>;
-    auto lhsTensor = std::get<TT>(lhs.storage);
-    auto rhsTensor = std::get<TT>(rhs.storage);
-    auto resultTensor = std::get<TT>(result.storage);
-    for (int64_t i = 0; i < resultTensor.view.sizes[0]; ++i) {
-      for (int64_t j = 0; j < resultTensor.view.sizes[1]; ++j) {
-        for (int64_t k = 0; k < lhsTensor.view.sizes[1]; ++k) {
-          resultTensor.at({i, j}) +=
-              lhsTensor.at({i, k}) * rhsTensor.at({k, j});
-        }
-      }
-    }
-  });
-
-  if (matmul.getNumResults() == 0) return {};
-  return {result};
-}
-
-SmallVector<InterpreterValue> transpose(InterpreterState&,
-                                        linalg::TransposeOp transpose,
-                                        const InterpreterValue& input,
-                                        InterpreterValue init) {
-  auto transposed = transposeImpl(input, transpose.getPermutation());
-  if (transpose.getNumResults() == 1) {
-    return {transposed};
-  }
-
-  init.fill([&](auto index) { return transposed.extractElement(index); });
-  return {};
-}
-
-SmallVector<InterpreterValue> dot(InterpreterState&, linalg::DotOp op,
-                                  ArrayRef<InterpreterValue> inputs,
-                                  InterpreterValue acc) {
-  const auto& lhs = inputs[0];
-  const auto& rhs = inputs[1];
-  if (op.getOutputs()[0].getType().isa<TensorType>()) {
-    acc = acc.clone();
-  }
-  dispatchScalarType(op.getOutputs()[0].getType(), [&](auto dummy) {
-    using TT = TensorOrMemref<decltype(dummy)>;
-    auto lhsTensor = std::get<TT>(lhs.storage);
-    auto rhsTensor = std::get<TT>(rhs.storage);
-    auto resultTensor = std::get<TT>(acc.storage);
-    for (int64_t k = 0; k < lhsTensor.view.sizes[0]; ++k) {
-      resultTensor.at({}) += lhsTensor.at(k) * rhsTensor.at(k);
-    }
-  });
-
-  if (op.getNumResults() == 0) return {};
-  return {acc};
-}
-
-SmallVector<InterpreterValue> vecmat(InterpreterState&, linalg::VecmatOp op,
-                                     ArrayRef<InterpreterValue> inputs,
-                                     InterpreterValue acc) {
-  const auto& lhs = inputs[0];
-  const auto& rhs = inputs[1];
-  if (op.getOutputs()[0].getType().isa<TensorType>()) {
-    acc = acc.clone();
-  }
-  dispatchScalarType(op.getOutputs()[0].getType(), [&](auto dummy) {
-    using TT = TensorOrMemref<decltype(dummy)>;
-    auto lhsTensor = std::get<TT>(lhs.storage);
-    auto rhsTensor = std::get<TT>(rhs.storage);
-    auto resultTensor = std::get<TT>(acc.storage);
-    for (int64_t j = 0; j < resultTensor.view.sizes[0]; ++j) {
-      for (int64_t k = 0; k < lhsTensor.view.sizes[0]; ++k) {
-        resultTensor.at(j) += lhsTensor.at(k) * rhsTensor.at({k, j});
-      }
-    }
-  });
-
-  if (op.getNumResults() == 0) return {};
-  return {acc};
-}
-
-REGISTER_MLIR_INTERPRETER_OP("linalg.yield", noOpTerminator);
-REGISTER_MLIR_INTERPRETER_OP(broadcast);
-REGISTER_MLIR_INTERPRETER_OP(dot);
-REGISTER_MLIR_INTERPRETER_OP(fill);
-REGISTER_MLIR_INTERPRETER_OP(generic);
-REGISTER_MLIR_INTERPRETER_OP(index);
-REGISTER_MLIR_INTERPRETER_OP(map);
-REGISTER_MLIR_INTERPRETER_OP(matmul);
-REGISTER_MLIR_INTERPRETER_OP(reduce);
-REGISTER_MLIR_INTERPRETER_OP(transpose);
-REGISTER_MLIR_INTERPRETER_OP(vecmat);
-
-}  // namespace
-}  // namespace interpreter
-}  // namespace mlir
diff --git a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/math.cc b/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/math.cc
deleted file mode 100644
index 2ff4eaba2e5689..00000000000000
--- a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/math.cc
+++ /dev/null
@@ -1,64 +0,0 @@
-/* Copyright 2022 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "tools/mlir_interpreter/dialects/cwise_math.h"
-#include "tools/mlir_interpreter/framework/interpreter_value_util.h"
-#include "tools/mlir_interpreter/framework/registration.h"
-
-namespace mlir {
-namespace interpreter {
-namespace {
-
-struct CopySign : CwiseFloat {
-  template <typename T>
-  static T apply(T a, T b) {
-    return std::copysign(a, b);
-  }
-};
-
-REGISTER_MLIR_INTERPRETER_OP("math.absf", applyCwiseMap<Abs>);
-REGISTER_MLIR_INTERPRETER_OP("math.absi", applyCwiseMap<Abs>);
-REGISTER_MLIR_INTERPRETER_OP("math.atan", applyCwiseMap<ATan>);
-REGISTER_MLIR_INTERPRETER_OP("math.atan2", applyCwiseBinaryMap<ATan2>);
-REGISTER_MLIR_INTERPRETER_OP("math.cbrt", applyCwiseMap<Cbrt>);
-REGISTER_MLIR_INTERPRETER_OP("math.ceil", applyCwiseMap<Ceil>);
-REGISTER_MLIR_INTERPRETER_OP("math.copysign", applyCwiseBinaryMap<CopySign>);
-REGISTER_MLIR_INTERPRETER_OP("math.cos", applyCwiseMap<Cos>);
-REGISTER_MLIR_INTERPRETER_OP("math.ctlz", applyCwiseMap<Clz>);
-REGISTER_MLIR_INTERPRETER_OP("math.ctpop", applyCwiseMap<PopCount>);
-REGISTER_MLIR_INTERPRETER_OP("math.cttz", applyCwiseMap<Ctz>);
-REGISTER_MLIR_INTERPRETER_OP("math.erf", applyCwiseMap<Erf>);
-REGISTER_MLIR_INTERPRETER_OP("math.exp", applyCwiseMap<Exp>);
-REGISTER_MLIR_INTERPRETER_OP("math.exp2", applyCwiseMap<Exp2>);
-REGISTER_MLIR_INTERPRETER_OP("math.expm1", applyCwiseMap<ExpM1>);
-REGISTER_MLIR_INTERPRETER_OP("math.floor", applyCwiseMap<Floor>);
-REGISTER_MLIR_INTERPRETER_OP("math.ipowi", applyCwiseBinaryMap<Power>);
-REGISTER_MLIR_INTERPRETER_OP("math.log", applyCwiseMap<Log>);
-REGISTER_MLIR_INTERPRETER_OP("math.log10", applyCwiseMap<Log10>);
-REGISTER_MLIR_INTERPRETER_OP("math.log1p", applyCwiseMap<Log1P>);
-REGISTER_MLIR_INTERPRETER_OP("math.log2", applyCwiseMap<Log2>);
-REGISTER_MLIR_INTERPRETER_OP("math.powf", applyCwiseBinaryMap<Power>);
-REGISTER_MLIR_INTERPRETER_OP("math.round", applyCwiseMap<Round>);
-REGISTER_MLIR_INTERPRETER_OP("math.roundeven", applyCwiseMap<NearbyInt>);
-REGISTER_MLIR_INTERPRETER_OP("math.rsqrt", applyCwiseMap<RSqrt>);
-REGISTER_MLIR_INTERPRETER_OP("math.sin", applyCwiseMap<Sin>);
-REGISTER_MLIR_INTERPRETER_OP("math.sqrt", applyCwiseMap<Sqrt>);
-REGISTER_MLIR_INTERPRETER_OP("math.tan", applyCwiseMap<Tan>);
-REGISTER_MLIR_INTERPRETER_OP("math.tanh", applyCwiseMap<TanH>);
-REGISTER_MLIR_INTERPRETER_OP("math.trunc", applyCwiseMap<Trunc>);
-
-}  // namespace
-}  // namespace interpreter
-}  // namespace mlir
diff --git a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/memref.cc b/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/memref.cc
deleted file mode 100644
index 842f5d1076b99e..00000000000000
--- a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/memref.cc
+++ /dev/null
@@ -1,247 +0,0 @@
-/* Copyright 2022 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "mlir/Dialect/MemRef/IR/MemRef.h"
-
-// clang-format erroneously puts the MemRef header above.
-#include <algorithm>  // NOLINT
-#include <iterator>   // NOLINT
-#include <limits>     // NOLINT
-#include <variant>    // NOLINT
-
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/Support/Casting.h"
-#include "mlir/IR/BuiltinTypes.h"
-#include "tools/mlir_interpreter/dialects/util.h"
-#include "tools/mlir_interpreter/framework/interpreter.h"
-#include "tools/mlir_interpreter/framework/registration.h"
-
-namespace mlir {
-namespace interpreter {
-namespace {
-
-InterpreterValue load(InterpreterState& state, memref::LoadOp,
-                      const InterpreterValue& memref,
-                      ArrayRef<int64_t> indices) {
-  if (!memref.buffer()) {
-    state.addFailure("null pointer dereference.");
-    return {};
-  }
-  if (!memref.view().inBounds(indices)) {
-    state.addFailure("array index out of bounds");
-    return {};
-  }
-  return memref.extractElement(indices);
-}
-
-void store(InterpreterState& state, memref::StoreOp,
-           const InterpreterValue& value, InterpreterValue memref,
-           ArrayRef<int64_t> indices) {
-  if (memref.view().inBounds(indices)) {
-    memref.insertElement(indices, value);
-  } else {
-    state.addFailure("array index out of bounds");
-  }
-}
-
-// TODO(jreiffers): Support symbol operands.
-InterpreterValue alloc(InterpreterState& state, memref::AllocOp alloc,
-                       ArrayRef<int64_t> dynamicSizes) {
-  auto ty = alloc->getResultTypes().front().cast<mlir::ShapedType>();
-  auto shape = replaceDynamicVals(ty.getShape(), dynamicSizes);
-  auto result = InterpreterValue::makeTensor(ty.getElementType(), shape);
-  if (auto* stats = state.getOptions().stats) {
-    stats->heapSize += result.buffer()->getByteSize();
-    stats->peakHeapSize = std::max(stats->peakHeapSize, stats->heapSize);
-    ++stats->numAllocations;
-  }
-  result.buffer()->setAllocatedBy(alloc);
-  return result;
-}
-
-InterpreterValue allocA(InterpreterState&, memref::AllocaOp alloc,
-                        ArrayRef<int64_t> dynamicSizes) {
-  auto ty = alloc->getResultTypes().front().cast<mlir::ShapedType>();
-  auto shape = replaceDynamicVals(ty.getShape(), dynamicSizes);
-  auto result = InterpreterValue::makeTensor(ty.getElementType(), shape);
-  result.buffer()->setIsAlloca();
-  result.buffer()->setAllocatedBy(alloc);
-  return result;
-}
-
-void dealloc(InterpreterState& state, memref::DeallocOp op,
-             InterpreterValue memref) {
-  if (!memref.buffer()) {
-    state.addFailure("attempting to deallocate null pointer.");
-    return;
-  }
-  auto buffer = memref.buffer();
-  const auto& view = memref.view();
-  if (auto* stats = state.getOptions().stats) {
-    stats->heapSize -= buffer->getByteSize();
-    ++stats->numDeallocations;
-  }
-  if (view.getNumElements() * memref.getByteSizeOfElement() !=
-      buffer->getByteSize()) {
-    state.addFailure("Attempting to deallocate a subview");
-  } else if (!state.getOptions().disableDeallocations) {
-    buffer->deallocate(op);
-  }
-}
-
-void copy(InterpreterState&, memref::CopyOp, InterpreterValue source,
-          InterpreterValue dest) {
-  dest.fill([&](llvm::ArrayRef<int64_t> indices) {
-    return source.extractElement(indices);
-  });
-}
-
-InterpreterValue subview(InterpreterState& state, memref::SubViewOp subview,
-                         const InterpreterValue& memref,
-                         ArrayRef<int64_t> dynamicOffsets,
-                         ArrayRef<int64_t> dynamicSizes,
-                         ArrayRef<int64_t> dynamicStrides) {
-  auto v = extractOffsetsSizesStrides(dynamicOffsets, dynamicSizes,
-                                      dynamicStrides, subview);
-  auto out = memref;
-  auto& outView = out.view();
-  if (!outView.subview(v.offsets, v.sizes, v.strides).succeeded()) {
-    state.addFailure("subview out of bounds");
-    return {};
-  }
-
-  if (subview.getResult().getType().getRank() == outView.rank()) {
-    return out;
-  }
-
-  auto shape = subview.getResult().getType().getShape();
-  // TODO(jreiffers): Check why subview.getDroppedDims() yields the wrong shape
-  // here for 1x2x2x3 (-> 1x2x1x3) -> 1x2x3 (claiming 0 is dropped).
-  int64_t dim = 0;
-  while (dim < outView.rank() && dim < shape.size()) {
-    if (shape[dim] != 1 && outView.sizes[dim] == 1) {
-      outView.sizes.erase(outView.sizes.begin() + dim);
-      outView.strides.erase(outView.strides.begin() + dim);
-    } else {
-      assert((shape[dim] < 0 || outView.sizes[dim] == shape[dim]) &&
-             "expected static size to match");
-      ++dim;
-    }
-  }
-  while (dim < outView.rank()) {
-    assert(outView.sizes.back() == 1 && "expected remaining dims to be 1");
-    outView.sizes.pop_back();
-    outView.strides.pop_back();
-  }
-  return out;
-}
-
-llvm::SmallVector<InterpreterValue> collapseShape(
-    InterpreterState& state, memref::CollapseShapeOp collapse,
-    const InterpreterValue& memref) {
-  const BufferView& inputView = memref.view();
-  InterpreterValue out = memref;
-  auto& outView = out.view();
-  outView.sizes.clear();
-  outView.strides.clear();
-
-  for (const auto& group : collapse.getReassociationIndices()) {
-    if (auto stride = inputView.getCollapsedStride(group)) {
-      outView.strides.push_back(*stride);
-      int64_t& size = outView.sizes.emplace_back(1);
-      for (int64_t dim : group) size *= inputView.sizes[dim];
-    } else {
-      state.addFailure("cannot collapse dimensions without a common stride");
-      return {};
-    }
-  }
-
-  return {out};
-}
-
-InterpreterValue cast(InterpreterState&, memref::CastOp,
-                      InterpreterValue memref) {
-  return memref;
-}
-
-// TODO(jreiffers): Implement full expand_shape support.
-InterpreterValue expandShape(InterpreterState& state, memref::ExpandShapeOp op,
-                             InterpreterValue memref) {
-  BufferView inputView = memref.view();
-  auto outTy = op->getResultTypes()[0].template cast<MemRefType>();
-  if (outTy.getNumDynamicDims() > 0) {
-    state.addFailure("dynamic dimensions unsupported.");
-    return {};
-  }
-
-  InterpreterValue out = memref;
-  auto& outView = out.view();
-  outView.strides.clear();
-  outView.sizes = llvm::to_vector(outTy.getShape());
-  int64_t dummy;
-  if (!getStridesAndOffset(outTy, outView.strides, dummy).succeeded()) {
-    if (inputView.strides != BufferView::getDefaultStrides(inputView.sizes)) {
-      state.addFailure("unsupported strides");
-      return {};
-    }
-    outView.strides = BufferView::getDefaultStrides(outView.sizes);
-  }
-
-  return out;
-}
-
-InterpreterValue getGlobal(InterpreterState& state,
-                           memref::GetGlobalOp getGlobal) {
-  auto global = llvm::cast<memref::GlobalOp>(
-      state.getSymbols().lookup(getGlobal.getName()));
-
-  auto value = global.getConstantInitValue();
-  assert(value && "mutable globals are not implemented");
-
-  auto ty = getGlobal->getResultTypes()[0].cast<ShapedType>();
-  return dispatchScalarType(ty, [&](auto dummy) -> InterpreterValue {
-    auto values = value.getValues<decltype(dummy)>();
-    auto result = TensorOrMemref<decltype(dummy)>::empty(ty.getShape());
-    auto valueIt = values.begin();
-    for (const auto& index : result.view.indices()) {
-      result.at(index) = *valueIt;
-      ++valueIt;
-    }
-    return {result};
-  });
-}
-
-int64_t dim(InterpreterState& state, memref::DimOp,
-            const InterpreterValue& memref, int64_t dim) {
-  return dimImpl(memref, dim, state);
-}
-
-REGISTER_MLIR_INTERPRETER_OP(alloc);
-REGISTER_MLIR_INTERPRETER_OP(allocA);
-REGISTER_MLIR_INTERPRETER_OP(cast);
-REGISTER_MLIR_INTERPRETER_OP(collapseShape);
-REGISTER_MLIR_INTERPRETER_OP(copy);
-REGISTER_MLIR_INTERPRETER_OP(dealloc);
-REGISTER_MLIR_INTERPRETER_OP(dim);
-REGISTER_MLIR_INTERPRETER_OP(expandShape);
-REGISTER_MLIR_INTERPRETER_OP(getGlobal);
-REGISTER_MLIR_INTERPRETER_OP(load);
-REGISTER_MLIR_INTERPRETER_OP(store);
-REGISTER_MLIR_INTERPRETER_OP(subview);
-
-}  // namespace
-}  // namespace interpreter
-}  // namespace mlir
diff --git a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/mhlo.cc b/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/mhlo.cc
deleted file mode 100644
index efb46584e7751e..00000000000000
--- a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/mhlo.cc
+++ /dev/null
@@ -1,886 +0,0 @@
-/* Copyright 2022 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-// NOTE: These ops aim to match the StableHLO spec and the behavior of the XLA
-// compiler. For op semantics and a reference implementation, check the
-// StableHLO repository (the spec is here:
-// https://github.com/openxla/stablehlo/blob/main/docs/spec.md).
-
-#include <algorithm>
-#include <cmath>
-#include <cstdint>
-#include <functional>
-#include <memory>
-#include <numeric>
-#include <type_traits>
-#include <utility>
-#include <variant>
-
-#include "llvm/ADT/ArrayRef.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SmallVector.h"
-#include "mhlo/IR/hlo_ops.h"
-#include "mlir/IR/BuiltinAttributes.h"
-#include "tools/mlir_interpreter/dialects/comparators.h"
-#include "tools/mlir_interpreter/dialects/cwise_math.h"
-#include "tools/mlir_interpreter/dialects/util.h"
-#include "tools/mlir_interpreter/framework/interpreter.h"
-#include "tools/mlir_interpreter/framework/interpreter_value.h"
-#include "tools/mlir_interpreter/framework/interpreter_value_util.h"
-#include "tools/mlir_interpreter/framework/registration.h"
-
-namespace mlir {
-namespace interpreter {
-namespace {
-
-InterpreterValue makeTuple(MutableArrayRef<InterpreterValue> values) {
-  Tuple result;
-  for (auto& value : values) {
-    result.values.push_back(
-        std::make_shared<InterpreterValue>(std::move(value)));
-  }
-  return {result};
-}
-
-InterpreterValue getTupleElement(InterpreterState&, mhlo::GetTupleElementOp get,
-                                 const InterpreterValue& tuple) {
-  return *std::get<Tuple>(tuple.storage).values[get.getIndex()];
-}
-
-InterpreterValue bitcastConvert(InterpreterState&, mhlo::BitcastConvertOp op,
-                                const InterpreterValue& in) {
-  ShapedType ty = cast<ShapedType>(op->getResultTypes()[0]);
-  auto result = dispatchScalarType(ty, [&](auto dummy) -> InterpreterValue {
-    TensorOrMemref<decltype(dummy)> result;
-    result.view = {};
-    result.buffer = in.clone().buffer();
-    return {result};
-  });
-  auto& outView = result.view();
-  outView.strides = BufferView::getDefaultStrides(ty.getShape());
-  outView.sizes = llvm::to_vector(ty.getShape());
-  return result;
-}
-
-InterpreterValue broadcastInDim(InterpreterState&,
-                                mhlo::BroadcastInDimOp broadcast,
-                                InterpreterValue in) {
-  auto broadcastDims = broadcast.getBroadcastDimensions().getValues<int64_t>();
-  const auto& inSizes = in.view().sizes;
-  auto out = in.typedAlike(
-      broadcast->getResultTypes()[0].cast<ShapedType>().getShape());
-  // TODO(jreiffers): Skip the copy.
-  out.fill([&](llvm::ArrayRef<int64_t> outIndices) {
-    llvm::SmallVector<int64_t> inIndices;
-    for (auto [inDim, outDim] : llvm::enumerate(broadcastDims)) {
-      inIndices.push_back(inSizes[inDim] == 1 ? 0 : outIndices[outDim]);
-    }
-    return in.extractElement(inIndices);
-  });
-  return out;
-}
-
-InterpreterValue clamp(InterpreterState&, mhlo::ClampOp,
-                       const InterpreterValue& lb, const InterpreterValue& arg,
-                       const InterpreterValue& ub) {
-  auto result = arg.clone();
-  for (const auto& index : arg.view().indices()) {
-    auto lbScalar = lb.isTensor() ? lb.extractElement(index) : lb;
-    auto ubScalar = ub.isTensor() ? ub.extractElement(index) : ub;
-    assert(arg.isTensor() && "clamp only bcasts scalar bounds");
-    auto argScalar = arg.extractElement(index);
-    auto resultScalar = applyCwiseBinaryMap<Min>(
-        applyCwiseBinaryMap<Max>(argScalar, lbScalar), ubScalar);
-    result.insertElement(index, resultScalar);
-  }
-  return result;
-}
-
-InterpreterValue concatenate(InterpreterState&, mhlo::ConcatenateOp concat,
-                             ArrayRef<InterpreterValue> vals) {
-  uint64_t dim = concat.getDimension();
-  auto sizes = vals[0].view().sizes;
-  sizes[dim] = 0;
-  for (const auto& val : vals) {
-    sizes[dim] += val.view().sizes[dim];
-  }
-
-  auto result = vals[0].typedAlike(sizes);
-  int64_t offset = 0;
-  for (const auto& val : vals) {
-    for (auto index : val.view().indices()) {
-      auto item = val.extractElement(index);
-      index[dim] += offset;
-      result.insertElement(index, item);
-    }
-    offset += val.view().sizes[dim];
-  }
-  return result;
-}
-
-InterpreterValue reshape(InterpreterState&, mhlo::ReshapeOp reshape,
-                         const InterpreterValue& in) {
-  auto ty = reshape->getResultTypes()[0].cast<mlir::ShapedType>();
-  return reshapeTensor(in, ty.getShape());
-}
-
-llvm::SmallVector<int64_t> clampStarts(ArrayRef<int64_t> starts,
-                                       ArrayRef<int64_t> sliceSizes,
-                                       ArrayRef<int64_t> tensorSizes) {
-  llvm::SmallVector<int64_t> result;
-  for (auto [start, sliceSize, tensorSize] :
-       llvm::zip(starts, sliceSizes, tensorSizes)) {
-    result.push_back(
-        std::max(int64_t{0}, std::min(tensorSize - sliceSize, start)));
-  }
-  return result;
-}
-
-InterpreterValue dynamicSlice(InterpreterState&, mhlo::DynamicSliceOp slice,
-                              const InterpreterValue& in,
-                              ArrayRef<int64_t> starts) {
-  auto result = in.typedAlike(
-      llvm::to_vector(slice.getSliceSizes().getValues<int64_t>()));
-  auto clampedStarts =
-      clampStarts(starts, result.view().sizes, in.view().sizes);
-  // TODO(jreiffers): Skip the copy.
-  result.fill([&](llvm::ArrayRef<int64_t> outIndices) {
-    llvm::SmallVector<int64_t> inIndices;
-    for (auto [start, index] : llvm::zip(clampedStarts, outIndices)) {
-      inIndices.push_back(start + index);
-    }
-    return in.extractElement(inIndices);
-  });
-  return result;
-}
-
-InterpreterValue dynamicUpdateSlice(InterpreterState&,
-                                    mhlo::DynamicUpdateSliceOp,
-                                    const InterpreterValue& in,
-                                    const InterpreterValue& updates,
-                                    ArrayRef<int64_t> starts) {
-  auto result = in.clone();
-  auto clampedStarts =
-      clampStarts(starts, updates.view().sizes, result.view().sizes);
-  for (auto inIndices : updates.view().indices()) {
-    llvm::SmallVector<int64_t> outIndices;
-    for (auto [start, index] : llvm::zip(clampedStarts, inIndices)) {
-      outIndices.push_back(start + index);
-    }
-    result.insertElement(outIndices, updates.extractElement(inIndices));
-  }
-  return result;
-}
-
-InterpreterValue slice(InterpreterState&, mhlo::SliceOp slice,
-                       InterpreterValue in) {
-  auto starts = slice.getStartIndices().getValues<int64_t>();
-  auto limits = slice.getLimitIndices().getValues<int64_t>();
-  auto strides = slice.getStrides().getValues<int64_t>();
-
-  llvm::SmallVector<int64_t> sizes;
-  for (auto [start, limit, stride] : llvm::zip(starts, limits, strides)) {
-    sizes.push_back(((limit - start) + (stride - 1)) / stride);
-  }
-  // TODO(jreiffers): Skip the copy.
-  auto result = in.typedAlike(sizes);
-  result.fill([&](llvm::ArrayRef<int64_t> outIndices) {
-    llvm::SmallVector<int64_t> inIndices;
-    for (auto [start, stride, index] : llvm::zip(starts, strides, outIndices)) {
-      inIndices.push_back(start + stride * index);
-    }
-    return in.extractElement(inIndices);
-  });
-  return result;
-}
-
-llvm::SmallVector<InterpreterValue> constant(InterpreterState&,
-                                             mhlo::ConstantOp constant) {
-  auto ty = constant->getResultTypes()[0].cast<ShapedType>();
-  return {dispatchScalarType(ty, [&](auto dummy) -> InterpreterValue {
-    if (ty.getElementType().isUnsignedInteger()) {
-      if constexpr (!std::is_same_v<decltype(dummy), bool> &&
-                    std::is_integral_v<decltype(dummy)>) {
-        auto values =
-            constant.getValue()
-                .getValues<
-                    typename std::make_unsigned<decltype(dummy)>::type>();
-        auto result = TensorOrMemref<decltype(dummy)>::empty(ty.getShape());
-        auto valueIt = values.begin();
-        for (const auto& index : result.view.indices()) {
-          result.at(index) = *valueIt;
-          ++valueIt;
-        }
-        return {result};
-      } else {
-        llvm_unreachable("invalid input");
-      }
-    } else {
-      auto values = constant.getValue().getValues<decltype(dummy)>();
-      auto result = TensorOrMemref<decltype(dummy)>::empty(ty.getShape());
-      auto valueIt = values.begin();
-      for (const auto& index : result.view.indices()) {
-        result.at(index) = *valueIt;
-        ++valueIt;
-      }
-      return {result};
-    }
-  })};
-}
-
-InterpreterValue pad(InterpreterState&, mhlo::PadOp pad, InterpreterValue arg,
-                     InterpreterValue paddingValue) {
-  paddingValue = paddingValue.extractElement({});
-  auto his = pad.getEdgePaddingHigh().getValues<int64_t>();
-  auto los = pad.getEdgePaddingLow().getValues<int64_t>();
-  auto ins = pad.getInteriorPadding().getValues<int64_t>();
-
-  llvm::SmallVector<int64_t> sizes;
-  for (auto [size, lo, in, hi] : llvm::zip(arg.view().sizes, los, ins, his)) {
-    sizes.push_back(size + lo + hi + (size - 1) * in);
-  }
-
-  auto result = arg.typedAlike(sizes);
-  result.fill([&](llvm::ArrayRef<int64_t>) { return paddingValue; });
-
-  for (const auto& inIndices : arg.view().indices()) {
-    llvm::SmallVector<int64_t> outIndices;
-    for (auto [inIndex, in, lo] : llvm::zip(inIndices, ins, los)) {
-      outIndices.push_back(inIndex * (in + 1) + lo);
-    }
-    if (result.view().inBounds(outIndices)) {
-      result.insertElement(outIndices, arg.extractElement(inIndices));
-    }
-  }
-
-  return result;
-}
-
-InterpreterValue compare(InterpreterState&, mhlo::CompareOp compare,
-                         const InterpreterValue& lhs,
-                         const InterpreterValue& rhs) {
-  switch (compare.getComparisonDirection()) {
-    case mlir::mhlo::ComparisonDirection::EQ:
-      return applyCwiseBinaryMap<Foeq>(lhs, rhs);
-    case mlir::mhlo::ComparisonDirection::NE:
-      return applyCwiseBinaryMap<Fune>(lhs, rhs);
-    case mlir::mhlo::ComparisonDirection::GE:
-      return applyCwiseBinaryMap<Foge>(lhs, rhs);
-    case mlir::mhlo::ComparisonDirection::GT:
-      return applyCwiseBinaryMap<Fogt>(lhs, rhs);
-    case mlir::mhlo::ComparisonDirection::LE:
-      return applyCwiseBinaryMap<Fole>(lhs, rhs);
-    case mlir::mhlo::ComparisonDirection::LT:
-      return applyCwiseBinaryMap<Folt>(lhs, rhs);
-  }
-}
-
-llvm::SmallVector<InterpreterValue> gather(
-    MutableArrayRef<InterpreterValue> args, mlir::Operation* op,
-    InterpreterState& state) {
-  auto gather = llvm::dyn_cast<mhlo::GatherOp>(op);
-  auto dynamicGather = llvm::dyn_cast<mhlo::DynamicGatherOp>(op);
-
-  if (!gather && !dynamicGather) {
-    state.addFailure("invalid gather");
-    return {};
-  }
-
-  const auto& dims = gather ? gather.getDimensionNumbers()
-                            : dynamicGather.getDimensionNumbers();
-
-  auto indexVectorDim = dims.getIndexVectorDim();
-  auto startIndexMap = dims.getStartIndexMap();
-  auto offsetDims = dims.getOffsetDims();
-  auto collapsedSliceDims = dims.getCollapsedSliceDims();
-  auto sliceSizes =
-      gather ? llvm::to_vector(gather.getSliceSizes().getValues<int64_t>())
-             : llvm::to_vector(llvm::map_range(
-                   args[2].view().indices(), [&](const auto& indices) {
-                     return args[2].extractElement(indices).asInt();
-                   }));
-
-  auto& operand = args[0];
-  auto& startIndices = args[1];
-  const auto& operandView = operand.view();
-  int64_t operandRank = operandView.rank();
-
-  // Make a fake BufferView for the start indices.
-  BufferView startIndicesView = startIndices.view();
-  auto outputRank =
-      static_cast<int64_t>(startIndicesView.rank() + offsetDims.size());
-  if (indexVectorDim < startIndicesView.rank()) {
-    --outputRank;
-    startIndicesView.sizes[indexVectorDim] = 1;
-  }
-
-  SmallVector<int64_t> batchDims;
-  for (int64_t i = 0; i < outputRank; ++i) {
-    if (!llvm::is_contained(offsetDims, i)) {
-      batchDims.push_back(i);
-    }
-  }
-
-  // Make a fake BufferView for the slice indices.
-  BufferView sliceIndicesView{0, SmallVector<int64_t>{sliceSizes}, {}};
-
-  SmallVector<int64_t> nonCollapsedSliceDims;
-  for (int64_t i = 0; i < operandRank; ++i) {
-    if (!llvm::is_contained(collapsedSliceDims, i)) {
-      nonCollapsedSliceDims.push_back(i);
-    }
-  }
-
-  SmallVector<int64_t> outputSizes(outputRank);
-  for (auto [outputDim, sliceDim] :
-       llvm::zip(offsetDims, nonCollapsedSliceDims)) {
-    outputSizes[outputDim] = sliceSizes[sliceDim];
-  }
-  for (auto [batchIndex, outputDim] : llvm::enumerate(batchDims)) {
-    if (batchIndex >= indexVectorDim) {
-      ++batchIndex;
-    }
-    outputSizes[outputDim] = startIndicesView.sizes[batchIndex];
-  }
-
-  auto output = operand.typedAlike(outputSizes);
-  for (auto startIndicesIndex : startIndicesView.indices()) {
-    SmallVector<int64_t> operandBaseIndices(operandRank);
-    for (auto [i, dim] : llvm::enumerate(startIndexMap)) {
-      if (indexVectorDim < startIndicesView.rank()) {
-        startIndicesIndex[indexVectorDim] = static_cast<int64_t>(i);
-      }
-      operandBaseIndices[dim] = std::max<int64_t>(
-          0, std::min(startIndices.extractElement(startIndicesIndex).asInt(),
-                      operandView.sizes[dim] - sliceSizes[dim]));
-    }
-
-    for (const auto& sliceIndices : sliceIndicesView.indices()) {
-      SmallVector<int64_t> operandIndices;
-      for (int64_t i = 0; i < operandRank; ++i) {
-        operandIndices.push_back(operandBaseIndices[i] + sliceIndices[i]);
-      }
-
-      SmallVector<int64_t> outputIndices(outputRank);
-      for (auto [outputDim, sliceDim] :
-           llvm::zip(offsetDims, nonCollapsedSliceDims)) {
-        outputIndices[outputDim] = sliceIndices[sliceDim];
-      }
-      for (auto [batchIndex, outputDim] : llvm::enumerate(batchDims)) {
-        outputIndices[outputDim] =
-            startIndicesIndex[batchIndex >= indexVectorDim ? batchIndex + 1
-                                                           : batchIndex];
-      }
-
-      auto value = operand.extractElement(operandIndices);
-      output.insertElement(outputIndices, value);
-    }
-  }
-
-  return {output};
-}
-
-llvm::SmallVector<InterpreterValue> scatter(
-    InterpreterState& state, mhlo::ScatterOp scatter,
-    ArrayRef<InterpreterValue> nInputs, InterpreterValue scatterIndices,
-    ArrayRef<InterpreterValue> nUpdates) {
-  const auto& dims = scatter.getScatterDimensionNumbers();
-  auto indexVectorDim = dims.getIndexVectorDim();
-  auto scatterDimsToOperandDims = dims.getScatterDimsToOperandDims();
-  auto insertedWindowDims = dims.getInsertedWindowDims();
-  auto updateWindowDims = dims.getUpdateWindowDims();
-
-  auto inputView = nInputs.front().view();
-  int64_t operandRank = inputView.rank();
-  int64_t updatesRank = nUpdates.front().view().rank();
-  int64_t indicesRank = scatterIndices.view().rank();
-
-  llvm::SmallVector<int64_t> batchDims;
-  for (int64_t dim = 0; dim < operandRank; ++dim) {
-    if (!llvm::is_contained(insertedWindowDims, dim)) {
-      batchDims.push_back(dim);
-    }
-  }
-
-  llvm::SmallVector<int64_t> updateScatterDims;
-  for (int64_t dim = 0; dim < updatesRank; ++dim) {
-    if (!llvm::is_contained(updateWindowDims, dim)) {
-      updateScatterDims.push_back(dim);
-    }
-  }
-
-  llvm::SmallVector<InterpreterValue> nResults;
-  for (auto& inputs : nInputs) {
-    nResults.push_back(inputs.clone());
-  }
-
-  for (auto [inputs, updates, results] :
-       llvm::zip(nResults, nUpdates, nResults)) {
-    const auto& updatesView = updates.view();
-    for (const auto& updateIndices : updatesView.indices()) {
-      llvm::SmallVector<int64_t> inputIndices(operandRank);
-      llvm::SmallVector<int64_t> maxIndices(operandRank);
-      llvm::SmallVector<int64_t> minIndices(operandRank);
-      llvm::SmallVector<int64_t> scatterIndicesIndex(indicesRank);
-
-      for (auto [index, dim] : llvm::enumerate(updateScatterDims)) {
-        scatterIndicesIndex[index >= indexVectorDim ? index + 1 : index] +=
-            updateIndices[dim];
-      }
-
-      for (auto [updateDim, operandDim] :
-           llvm::zip(updateWindowDims, batchDims)) {
-        inputIndices[operandDim] = updateIndices[updateDim];
-        maxIndices[operandDim] = updatesView.sizes[updateDim] - 1;
-      }
-
-      for (auto [index, dim] : llvm::enumerate(scatterDimsToOperandDims)) {
-        if (indexVectorDim < indicesRank) {
-          scatterIndicesIndex[indexVectorDim] = static_cast<int64_t>(index);
-        }
-
-        int64_t scatterIndex =
-            scatterIndices.extractElement(scatterIndicesIndex).asInt();
-        inputIndices[dim] += scatterIndex;
-        minIndices[dim] += scatterIndex;
-        maxIndices[dim] += scatterIndex;
-      }
-
-      if (!inputView.inBounds(minIndices)) continue;
-      if (!inputView.inBounds(maxIndices)) continue;
-
-      auto currentValue = inputs.extractElement(inputIndices).asUnitTensor();
-      auto update = updates.extractElement(updateIndices).asUnitTensor();
-
-      auto result = interpret(state, scatter.getUpdateComputation(),
-                              {currentValue, update});
-      if (state.hasFailure()) {
-        return nResults;
-      }
-      inputs.insertElement(inputIndices, result.front().extractElement({}));
-    }
-  }
-
-  return nResults;
-}
-
-InterpreterValue select(InterpreterState&, mhlo::SelectOp,
-                        TensorOrMemref<bool> condition,
-                        const InterpreterValue& trueVals,
-                        const InterpreterValue& falseVals) {
-  // TODO(jreiffers): Support implicit broadcasting.
-  auto result = trueVals.clone();
-  for (const auto& indices : condition.view.indices()) {
-    if (!condition.at(indices)) {
-      result.insertElement(indices, falseVals.extractElement(indices));
-    }
-  }
-  return result;
-}
-
-InterpreterValue transpose(InterpreterState&, mhlo::TransposeOp transpose,
-                           const InterpreterValue& tensor) {
-  auto permutation = transpose.getPermutation().getValues<int64_t>();
-  return transposeImpl(tensor, llvm::to_vector(permutation));
-}
-
-InterpreterValue iota(InterpreterState&, mhlo::IotaOp iota) {
-  auto dim = iota.getIotaDimension();
-  auto ty = iota->getResultTypes()[0].cast<ShapedType>();
-  return dispatchScalarType(ty, [&](auto dummy) -> InterpreterValue {
-    auto result = TensorOrMemref<decltype(dummy)>::empty({ty.getShape()[dim]});
-    for (const auto& index : result.view.indices()) {
-      result.at(index) = index[0];
-    }
-    result.view.sizes = SmallVector<int64_t>(ty.getShape());
-    result.view.strides = SmallVector<int64_t>(result.view.sizes.size());
-    result.view.strides[dim] = 1;
-    return {result};
-  });
-}
-
-template <typename R>
-struct Caster {
-  template <typename A>
-  constexpr static bool supportedType() {
-    return std::is_convertible_v<A, R>;
-  }
-
-  template <typename A>
-  static R apply(A v) {
-    return v;
-  }
-};
-
-InterpreterValue convert(InterpreterState&, mhlo::ConvertOp op,
-                         InterpreterValue in) {
-  return dispatchScalarType(op->getResultTypes()[0],
-                            [&](auto dummy) -> InterpreterValue {
-                              return applyCwiseMap<Caster<decltype(dummy)>>(in);
-                            });
-}
-
-llvm::SmallVector<InterpreterValue> mhloWhile(
-    InterpreterState& state, mhlo::WhileOp whileop,
-    SmallVector<InterpreterValue> loopVars) {
-  auto cond = interpret(state, whileop.getRegion(0), loopVars);
-  while (!state.hasFailure() &&
-         std::get<TensorOrMemref<bool>>(cond.front().storage).at({})) {
-    loopVars = interpret(state, whileop.getRegion(1), loopVars);
-    if (state.hasFailure()) break;
-    cond = interpret(state, whileop.getRegion(0), loopVars);
-  }
-  return loopVars;
-}
-
-InterpreterValue copy(InterpreterState&, mhlo::CopyOp op,
-                      const InterpreterValue& tensor) {
-  auto layout = op->getAttr("result_layout");
-  if (!layout) return tensor.clone();
-  return tensor.clone(llvm::to_vector(
-      layout.cast<DenseIntElementsAttr>().getValues<int64_t>()));
-}
-
-llvm::SmallVector<InterpreterValue> fusion(InterpreterState& state,
-                                           mhlo::FusionOp fusion,
-                                           ArrayRef<InterpreterValue> args) {
-  return interpret(state, fusion.getRegion(), args);
-}
-
-llvm::SmallVector<InterpreterValue> reduce(InterpreterState& state,
-                                           mhlo::ReduceOp reduce,
-                                           ArrayRef<InterpreterValue> operands,
-                                           ArrayRef<InterpreterValue> inits) {
-  auto dims = reduce.getDimensions().getValues<int64_t>();
-
-  auto outSizes = operands.front().view().sizes;
-  for (int64_t dim : llvm::reverse(dims)) {
-    outSizes.erase(outSizes.begin() + dim);
-  }
-
-  SmallVector<InterpreterValue> results;
-  for (auto [operand, init] : llvm::zip(operands, inits)) {
-    results.push_back(operand.typedAlike(outSizes));
-    auto initScalar = init.extractElement({});
-    results.back().fill([&](llvm::ArrayRef<int64_t>) { return initScalar; });
-  }
-
-  for (const auto& index : operands[0].view().indices()) {
-    auto dstIndex = index;
-    for (int64_t dim : llvm::reverse(dims)) {
-      dstIndex.erase(dstIndex.begin() + dim);
-    }
-
-    SmallVector<InterpreterValue> args;
-    for (auto& result : results) {
-      args.push_back(result.extractElement(dstIndex).asUnitTensor());
-    }
-    for (auto& operand : operands) {
-      args.push_back(operand.extractElement(index).asUnitTensor());
-    }
-    auto newValues = interpret(state, reduce.getRegion(), args);
-    if (state.hasFailure()) return results;
-
-    for (auto [result, value] : llvm::zip(results, newValues)) {
-      result.insertElement(dstIndex, value.extractElement({}));
-    }
-  }
-  return results;
-}
-
-SmallVector<InterpreterValue> sort(InterpreterState& state, mhlo::SortOp op,
-                                   ArrayRef<InterpreterValue> inputs) {
-  const auto& shape = inputs.front().view().sizes;
-  uint64_t dim =
-      op.getDimension() < shape.size() ? op.getDimension() : shape.size() - 1;
-  SmallVector<int64_t> indices(shape[dim]);
-
-  auto iterView = inputs.front().view();
-  iterView.sizes[dim] = 1;
-  SmallVector<InterpreterValue> results;
-  for (const auto& input : inputs) {
-    results.push_back(input.typedAlike(input.view().sizes));
-  }
-  for (const auto& offset : iterView.indices()) {
-    std::iota(indices.begin(), indices.end(), 0);
-    auto comparator = [&](int64_t a, int64_t b) {
-      if (state.hasFailure()) {
-        return false;
-      }
-      SmallVector<InterpreterValue> args;
-      auto indexA = offset;
-      indexA[dim] = a;
-      auto indexB = offset;
-      indexB[dim] = b;
-      for (const auto& input : inputs) {
-        args.push_back(input.extractElement(indexA).asUnitTensor());
-        args.push_back(input.extractElement(indexB).asUnitTensor());
-      }
-      auto result = interpret(state, op.getComparator(), args);
-      return !state.hasFailure() && result[0].extractElement({}).asInt() != 0;
-    };
-
-    if (op.getIsStable()) {
-      std::stable_sort(indices.begin(), indices.end(), comparator);
-    } else {
-      std::sort(indices.begin(), indices.end(), comparator);
-    }
-
-    auto copy = offset;
-    for (auto [dst, src] : llvm::enumerate(indices)) {
-      for (auto [input, result] : llvm::zip(inputs, results)) {
-        copy[dim] = src;
-        auto elem = input.extractElement(copy);
-        copy[dim] = static_cast<int64_t>(dst);
-        result.insertElement(copy, elem);
-      }
-    }
-  }
-  return results;
-}
-
-SmallVector<InterpreterValue> mhloCase(InterpreterState& state, mhlo::CaseOp op,
-                                       int64_t index) {
-  if (index < 0 || index >= op->getNumResults()) {
-    index = op->getNumRegions() - 1;
-  }
-  return interpret(state, op->getRegion(index), {});
-}
-
-InterpreterValue dotGeneralImpl(InterpreterValue& lhs, InterpreterValue& rhs,
-                                ArrayRef<int64_t> lhsContracting,
-                                ArrayRef<int64_t> rhsContracting,
-                                ArrayRef<int64_t> lhsBatch,
-                                ArrayRef<int64_t> rhsBatch,
-                                mlir::Type elementTy) {
-  auto& lhsv = lhs.view();
-  auto& rhsv = rhs.view();
-  SmallVector<int64_t> dimensions;
-  SmallVector<int64_t> lhsNonBatch;
-  SmallVector<int64_t> rhsNonBatch;
-  auto nbatch = static_cast<int64_t>(lhsBatch.size());
-  for (const int64_t lhsDim : lhsBatch) {
-    dimensions.push_back(lhsv.sizes[lhsDim]);
-  }
-  for (int64_t i = 0; i < lhsv.rank(); i++) {
-    if (!llvm::is_contained(lhsContracting, i) &&
-        !llvm::is_contained(lhsBatch, i)) {
-      dimensions.push_back(lhsv.sizes[i]);
-      lhsNonBatch.push_back(i);
-    }
-  }
-  for (int64_t i = 0; i < rhs.view().rank(); i++) {
-    if (!llvm::is_contained(rhsContracting, i) &&
-        !llvm::is_contained(rhsBatch, i)) {
-      dimensions.push_back(rhsv.sizes[i]);
-      rhsNonBatch.push_back(i);
-    }
-  }
-
-  SmallVector<int64_t> lhsIndex(lhsv.rank());
-  SmallVector<int64_t> rhsIndex(rhsv.rank());
-  SmallVector<int64_t> outputIndex(dimensions.size());
-  auto output = lhs.typedAlike(dimensions);
-
-  std::function<void(int64_t)> applyBatch, applyLhs, applyRhs, applyContract;
-
-  applyBatch = [&](int64_t dim) {
-    if (dim >= nbatch) {
-      applyLhs(0);
-      return;
-    }
-    for (int64_t i = 0; i < dimensions[dim]; ++i) {
-      lhsIndex[lhsBatch[dim]] = i;
-      rhsIndex[rhsBatch[dim]] = i;
-      outputIndex[dim] = i;
-      applyBatch(dim + 1);
-    }
-  };
-
-  applyLhs = [&](int64_t dim) {
-    if (dim >= lhsNonBatch.size()) {
-      applyRhs(0);
-      return;
-    }
-    int64_t outDim = nbatch + dim;
-    for (int64_t i = 0; i < dimensions[outDim]; ++i) {
-      lhsIndex[lhsNonBatch[dim]] = i;
-      outputIndex[outDim] = i;
-      applyLhs(dim + 1);
-    }
-  };
-
-  applyRhs = [&](int64_t dim) {
-    if (dim >= rhsNonBatch.size()) {
-      applyContract(0);
-      return;
-    }
-    auto outDim = static_cast<int64_t>(nbatch + lhsNonBatch.size() + dim);
-    for (int64_t i = 0; i < dimensions[outDim]; ++i) {
-      rhsIndex[rhsNonBatch[dim]] = i;
-      outputIndex[outDim] = i;
-      applyRhs(dim + 1);
-    }
-  };
-
-  applyContract = [&](int64_t dim) {
-    if (dim >= lhsContracting.size()) {
-      dispatchScalarType(elementTy, [&](auto dummy) {
-        using T = TensorOrMemref<decltype(dummy)>;
-        std::get<T>(output.storage).at(outputIndex) +=
-            std::get<T>(lhs.storage).at(lhsIndex) *
-            std::get<T>(rhs.storage).at(rhsIndex);
-      });
-      return;
-    }
-    for (int64_t i = 0; i < lhsv.sizes[lhsContracting[dim]]; ++i) {
-      lhsIndex[lhsContracting[dim]] = i;
-      rhsIndex[rhsContracting[dim]] = i;
-      applyContract(dim + 1);
-    }
-  };
-
-  applyBatch(0);
-  return output;
-}
-
-// TODO(jreiffers): Unify this with DotGeneral.
-InterpreterValue dot(InterpreterState& state, mhlo::DotOp op,
-                     const InterpreterValue& lhs, const InterpreterValue& rhs) {
-  ShapedType ty = cast<ShapedType>(op->getResultTypes()[0]);
-  auto result = lhs.typedAlike(ty.getShape());
-
-  if (lhs.view().rank() == 1 && rhs.view().rank() == 1) {
-    dispatchScalarType(ty, [&](auto dummy) {
-      using T = decltype(dummy);
-      using TT = TensorOrMemref<T>;
-      auto lhsTensor = std::get<TT>(lhs.storage);
-      auto rhsTensor = std::get<TT>(rhs.storage);
-
-      T product = 0;
-      for (int64_t i = 0; i < lhsTensor.view.sizes[0]; ++i) {
-        product += lhsTensor.at(i) * rhsTensor.at(i);
-      }
-      std::get<TT>(result.storage).at({}) += product;
-    });
-  } else if (lhs.view().rank() == 2 && rhs.view().rank() == 1) {
-    dispatchScalarType(ty, [&](auto dummy) {
-      using TT = TensorOrMemref<decltype(dummy)>;
-      auto lhsTensor = std::get<TT>(lhs.storage);
-      auto rhsTensor = std::get<TT>(rhs.storage);
-      auto resultTensor = std::get<TT>(result.storage);
-      for (int64_t i = 0; i < resultTensor.view.sizes[0]; ++i) {
-        for (int64_t j = 0; j < rhsTensor.view.sizes[0]; ++j) {
-          resultTensor.at(i) += lhsTensor.at({i, j}) * rhsTensor.at(j);
-        }
-      }
-    });
-  } else if (lhs.view().rank() == 2 && rhs.view().rank() == 2) {
-    dispatchScalarType(ty, [&](auto dummy) {
-      using TT = TensorOrMemref<decltype(dummy)>;
-      auto lhsTensor = std::get<TT>(lhs.storage);
-      auto rhsTensor = std::get<TT>(rhs.storage);
-      auto resultTensor = std::get<TT>(result.storage);
-      for (int64_t i = 0; i < resultTensor.view.sizes[0]; ++i) {
-        for (int64_t j = 0; j < resultTensor.view.sizes[1]; ++j) {
-          for (int64_t k = 0; k < lhsTensor.view.sizes[1]; ++k) {
-            resultTensor.at({i, j}) +=
-                lhsTensor.at({i, k}) * rhsTensor.at({k, j});
-          }
-        }
-      }
-    });
-  } else {
-    state.addFailure("unsupported dot");
-  }
-
-  return result;
-}
-
-InterpreterValue dotGeneral(InterpreterState&, mhlo::DotGeneralOp op,
-                            InterpreterValue lhs, InterpreterValue rhs) {
-  const auto& dims = op.getDotDimensionNumbers();
-  return dotGeneralImpl(
-      lhs, rhs, dims.getLhsContractingDimensions(),
-      dims.getRhsContractingDimensions(), dims.getLhsBatchingDimensions(),
-      dims.getRhsBatchingDimensions(),
-      op->getResultTypes()[0].cast<ShapedType>().getElementType());
-}
-
-InterpreterValue computeReshapeShape(InterpreterState&,
-                                     mhlo::ComputeReshapeShapeOp,
-                                     const InterpreterValue& numElements,
-                                     const InterpreterValue& dynamicShape) {
-  auto out = dynamicShape.clone();
-  SmallVector<int64_t> dynamicIndex;
-  int64_t staticProduct = 1;
-  for (const auto& index : dynamicShape.view().indices()) {
-    auto value = dynamicShape.extractElement(index).asInt();
-    if (value == -1) {
-      dynamicIndex = index;
-    } else {
-      staticProduct *= value;
-    }
-  }
-
-  if (!dynamicIndex.empty()) {
-    std::visit(
-        [&](auto& it) {
-          if constexpr (is_tensor_or_memref_v<decltype(it)>) {
-            it.at(dynamicIndex) = numElements.asInt() / staticProduct;
-          } else {
-            llvm_unreachable("invalid input");
-          }
-        },
-        out.storage);
-  }
-  return out;
-}
-
-// TODO(jreiffers): Migrate remaining ops to the safer signature.
-REGISTER_MLIR_INTERPRETER_OP("mhlo.dynamic_gather", gather);
-REGISTER_MLIR_INTERPRETER_OP("mhlo.gather", gather);
-REGISTER_MLIR_INTERPRETER_OP("mhlo.return", noOpTerminator);
-REGISTER_MLIR_INTERPRETER_OP("mhlo.tuple", makeTuple);
-REGISTER_MLIR_INTERPRETER_OP(bitcastConvert);
-REGISTER_MLIR_INTERPRETER_OP(broadcastInDim);
-REGISTER_MLIR_INTERPRETER_OP(clamp);
-REGISTER_MLIR_INTERPRETER_OP(compare);
-REGISTER_MLIR_INTERPRETER_OP(computeReshapeShape);
-REGISTER_MLIR_INTERPRETER_OP(concatenate);
-REGISTER_MLIR_INTERPRETER_OP(constant);
-REGISTER_MLIR_INTERPRETER_OP(convert);
-REGISTER_MLIR_INTERPRETER_OP(copy);
-REGISTER_MLIR_INTERPRETER_OP(dot);
-REGISTER_MLIR_INTERPRETER_OP(dotGeneral);
-REGISTER_MLIR_INTERPRETER_OP(dynamicSlice);
-REGISTER_MLIR_INTERPRETER_OP(dynamicUpdateSlice);
-REGISTER_MLIR_INTERPRETER_OP(fusion);
-REGISTER_MLIR_INTERPRETER_OP(getTupleElement);
-REGISTER_MLIR_INTERPRETER_OP(iota);
-REGISTER_MLIR_INTERPRETER_OP(mhloCase);
-REGISTER_MLIR_INTERPRETER_OP(mhloWhile);
-REGISTER_MLIR_INTERPRETER_OP(pad);
-REGISTER_MLIR_INTERPRETER_OP(reduce);
-REGISTER_MLIR_INTERPRETER_OP(reshape);
-REGISTER_MLIR_INTERPRETER_OP(scatter);
-REGISTER_MLIR_INTERPRETER_OP(select);
-REGISTER_MLIR_INTERPRETER_OP(slice);
-REGISTER_MLIR_INTERPRETER_OP(sort);
-REGISTER_MLIR_INTERPRETER_OP(transpose);
-
-}  // namespace
-}  // namespace interpreter
-}  // namespace mlir
diff --git a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/mhlo_binary_cwise.cc b/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/mhlo_binary_cwise.cc
deleted file mode 100644
index 959aa138cab534..00000000000000
--- a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/mhlo_binary_cwise.cc
+++ /dev/null
@@ -1,45 +0,0 @@
-/* Copyright 2023 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "tools/mlir_interpreter/dialects/cwise_math.h"
-#include "tools/mlir_interpreter/framework/interpreter_value_util.h"
-#include "tools/mlir_interpreter/framework/registration.h"
-
-namespace mlir {
-namespace interpreter {
-namespace {
-
-REGISTER_MLIR_INTERPRETER_OP("mhlo.add", applyCwiseBinaryMap<Plus>);
-REGISTER_MLIR_INTERPRETER_OP("mhlo.and", applyCwiseBinaryMap<BitAnd>);
-REGISTER_MLIR_INTERPRETER_OP("mhlo.atan2", applyCwiseBinaryMap<ATan2>);
-REGISTER_MLIR_INTERPRETER_OP("mhlo.complex", applyCwiseBinaryMap<Complex>);
-REGISTER_MLIR_INTERPRETER_OP("mhlo.divide", applyCwiseBinaryMap<Divide>);
-REGISTER_MLIR_INTERPRETER_OP("mhlo.maximum", applyCwiseBinaryMap<Max>);
-REGISTER_MLIR_INTERPRETER_OP("mhlo.minimum", applyCwiseBinaryMap<Min>);
-REGISTER_MLIR_INTERPRETER_OP("mhlo.multiply", applyCwiseBinaryMap<Multiply>);
-REGISTER_MLIR_INTERPRETER_OP("mhlo.or", applyCwiseBinaryMap<BitOr>);
-REGISTER_MLIR_INTERPRETER_OP("mhlo.power", applyCwiseBinaryMap<Power>);
-REGISTER_MLIR_INTERPRETER_OP("mhlo.remainder", applyCwiseBinaryMap<Remainder>);
-REGISTER_MLIR_INTERPRETER_OP("mhlo.shift_left", applyCwiseBinaryMap<ShiftLeft>);
-REGISTER_MLIR_INTERPRETER_OP("mhlo.shift_right_arithmetic",
-                             applyCwiseBinaryMap<ShiftRightArith>);
-REGISTER_MLIR_INTERPRETER_OP("mhlo.shift_right_logical",
-                             applyCwiseBinaryMap<ShiftRightLogical>);
-REGISTER_MLIR_INTERPRETER_OP("mhlo.subtract", applyCwiseBinaryMap<Minus>);
-REGISTER_MLIR_INTERPRETER_OP("mhlo.xor", applyCwiseBinaryMap<BitXor>);
-
-}  // namespace
-}  // namespace interpreter
-}  // namespace mlir
diff --git a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/mhlo_unary_cwise.cc b/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/mhlo_unary_cwise.cc
deleted file mode 100644
index c27ff6f90c2261..00000000000000
--- a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/mhlo_unary_cwise.cc
+++ /dev/null
@@ -1,84 +0,0 @@
-/* Copyright 2023 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include <complex>
-
-#include "tools/mlir_interpreter/dialects/cwise_math.h"
-#include "tools/mlir_interpreter/framework/interpreter_value_util.h"
-#include "tools/mlir_interpreter/framework/registration.h"
-
-namespace mlir {
-namespace interpreter {
-namespace {
-
-struct Logistic : CwiseNonIntegral {
-  template <typename T>
-  static T apply(T a) {
-    if (std::real(a) < 0) {
-      T e = std::exp(a);
-      return e / (e + T{1});
-    }
-    return T{1} / (std::exp(-a) + T{1});
-  }
-};
-
-// Note: this can't be replaced with std::bit_not, which always returns true for
-// bools.
-struct Not : CwiseIntegral {
-  static bool apply(bool a) { return !a; }
-
-  template <typename T>
-  static T apply(T a) {
-    return ~a;
-  }
-};
-
-struct Sign : CwiseSigned {
-  template <typename T>
-  static T apply(T a) {
-    return std::copysign(T{1}, a);
-  }
-};
-
-REGISTER_MLIR_INTERPRETER_OP("mhlo.abs", applyCwiseMap<Abs>);
-REGISTER_MLIR_INTERPRETER_OP("mhlo.cbrt", applyCwiseMap<Cbrt>);
-REGISTER_MLIR_INTERPRETER_OP("mhlo.ceil", applyCwiseMap<Ceil>);
-REGISTER_MLIR_INTERPRETER_OP("mhlo.cosine", applyCwiseMap<Cos>);
-REGISTER_MLIR_INTERPRETER_OP("mhlo.count_leading_zeros", applyCwiseMap<Clz>);
-REGISTER_MLIR_INTERPRETER_OP("mhlo.exponential", applyCwiseMap<Exp>);
-REGISTER_MLIR_INTERPRETER_OP("mhlo.exponential_minus_one",
-                             applyCwiseMap<ExpM1>);
-REGISTER_MLIR_INTERPRETER_OP("mhlo.floor", applyCwiseMap<Floor>);
-REGISTER_MLIR_INTERPRETER_OP("mhlo.imag", applyCwiseMap<Imag>);
-REGISTER_MLIR_INTERPRETER_OP("mhlo.is_finite", applyCwiseMap<IsFinite>);
-REGISTER_MLIR_INTERPRETER_OP("mhlo.log", applyCwiseMap<Log>);
-REGISTER_MLIR_INTERPRETER_OP("mhlo.log_plus_one", applyCwiseMap<Log1P>);
-REGISTER_MLIR_INTERPRETER_OP("mhlo.logistic", applyCwiseMap<Logistic>);
-REGISTER_MLIR_INTERPRETER_OP("mhlo.negate", applyCwiseMap<Neg>);
-REGISTER_MLIR_INTERPRETER_OP("mhlo.not", applyCwiseMap<Not>);
-REGISTER_MLIR_INTERPRETER_OP("mhlo.popcnt", applyCwiseMap<PopCount>);
-REGISTER_MLIR_INTERPRETER_OP("mhlo.real", applyCwiseMap<Real>);
-REGISTER_MLIR_INTERPRETER_OP("mhlo.round_nearest_afz", applyCwiseMap<Round>);
-REGISTER_MLIR_INTERPRETER_OP("mhlo.round_nearest_even",
-                             applyCwiseMap<NearbyInt>);
-REGISTER_MLIR_INTERPRETER_OP("mhlo.rsqrt", applyCwiseMap<RSqrt>);
-REGISTER_MLIR_INTERPRETER_OP("mhlo.sign", applyCwiseMap<Sign>);
-REGISTER_MLIR_INTERPRETER_OP("mhlo.sine", applyCwiseMap<Sin>);
-REGISTER_MLIR_INTERPRETER_OP("mhlo.sqrt", applyCwiseMap<Sqrt>);
-REGISTER_MLIR_INTERPRETER_OP("mhlo.tanh", applyCwiseMap<TanH>);
-
-}  // namespace
-}  // namespace interpreter
-}  // namespace mlir
diff --git a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/scf.cc b/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/scf.cc
deleted file mode 100644
index d74cfea270b5a2..00000000000000
--- a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/scf.cc
+++ /dev/null
@@ -1,216 +0,0 @@
-/* Copyright 2022 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "mlir/Dialect/SCF/IR/SCF.h"
-
-#include <iterator>  // NOLINT
-#include <memory>    // NOLINT
-#include <utility>   // NOLINT
-
-#include "llvm/ADT/SmallVector.h"
-#include "tools/mlir_interpreter/dialects/util.h"
-#include "tools/mlir_interpreter/framework/interpreter.h"
-#include "tools/mlir_interpreter/framework/interpreter_value.h"
-#include "tools/mlir_interpreter/framework/registration.h"
-
-namespace mlir {
-namespace interpreter {
-namespace {
-
-class ParallelSideChannel : public InterpreterSideChannel {
- public:
-  ParallelSideChannel(
-      llvm::SmallVector<InterpreterValue>& results,
-      const llvm::DenseMap<scf::ReduceOp, int64_t>& reduceOpIndices)
-      : results(results), reduceOpIndices(reduceOpIndices) {}
-
-  InterpreterValue& result(scf::ReduceOp op) const {
-    return results[reduceOpIndices.find(op)->second];
-  }
-
- private:
-  SmallVector<InterpreterValue>& results;
-  const llvm::DenseMap<scf::ReduceOp, int64_t>& reduceOpIndices;
-};
-
-llvm::SmallVector<InterpreterValue> scfFor(InterpreterState& state,
-                                           scf::ForOp op, int64_t lb,
-                                           int64_t ub, int64_t step,
-                                           ArrayRef<InterpreterValue> inits) {
-  llvm::SmallVector<InterpreterValue> results;
-  for (int64_t i = 0; i < inits.size(); ++i) {
-    results.push_back(getInitOperand(op.getInitArgs(), i, inits));
-  }
-
-  auto& region = op->getRegion(0);
-  for (; lb < ub; lb += step) {
-    SmallVector<InterpreterValue> inputs;
-    dispatchScalarType(op.getLowerBound().getType(), [&](auto dummy) {
-      inputs.push_back(InterpreterValue{static_cast<decltype(dummy)>(lb)});
-    });
-    llvm::copy(results, std::back_inserter(inputs));
-    results = interpret(state, region, inputs);
-    if (state.hasFailure()) break;
-  }
-  return results;
-}
-
-llvm::SmallVector<InterpreterValue> forAll(
-    InterpreterState& state, scf::ForallOp op,
-    ArrayRef<InterpreterValue> dynamicLowerBounds,
-    ArrayRef<InterpreterValue> dynamicUpperBounds,
-    ArrayRef<InterpreterValue> dynamicSteps, ArrayRef<InterpreterValue> inits) {
-  bool isBufferized = op.getNumResults() == 0;
-
-  // Clone any tensors that are passed in `shared_outs`.
-  SmallVector<InterpreterValue> outs = llvm::to_vector(inits);
-  for (auto [type, out] : llvm::zip(op.getOutputs().getTypes(), outs)) {
-    if (type.isa<TensorType>()) {
-      out = out.clone();
-    }
-  }
-
-  auto lbs = replaceDynamicVals(op.getStaticLowerBound(), dynamicLowerBounds);
-  auto ubs = replaceDynamicVals(op.getStaticUpperBound(), dynamicUpperBounds);
-  auto steps = replaceDynamicVals(op.getStaticStep(), dynamicSteps);
-
-  SmallVector<int64_t> iterSizes;
-  for (auto [lb, ub, step] : llvm::zip(lbs, ubs, steps)) {
-    if (step == 0) {
-      state.addFailure("invalid step");
-      return {};
-    }
-    iterSizes.push_back((ub - lb + (step - 1)) / step);
-  }
-
-  // Make a fake buffer view to abuse its index iterator.
-  BufferView view{0, iterSizes, {}};
-  for (const auto& indices : view.indices()) {
-    SmallVector<InterpreterValue> args;
-    for (auto [i, lb, step] : llvm::zip(indices, lbs, steps)) {
-      args.push_back(InterpreterValue{i * step + lb});
-    }
-    llvm::copy(outs, std::back_inserter(args));
-
-    auto yielded = interpret(state, op->getRegion(0), args);
-    if (state.hasFailure()) break;
-    assert(yielded.empty() && "forall loop shouldn't have yielded anything");
-  }
-
-  if (isBufferized) {
-    return {};
-  }
-  return outs;
-}
-
-void inParallel(InterpreterState& state, scf::InParallelOp op) {
-  interpret(state, op.getRegion(), {});
-}
-
-llvm::SmallVector<InterpreterValue> scfIf(InterpreterState& state, scf::IfOp op,
-                                          bool condition) {
-  if (condition) {
-    return interpret(state, op.getThenRegion(), {});
-  }
-  if (op.getElseRegion().hasOneBlock()) {
-    return interpret(state, op.getElseRegion(), {});
-  }
-  return {};
-}
-
-llvm::SmallVector<InterpreterValue> parallel(InterpreterState& state,
-                                             scf::ParallelOp parallel,
-                                             ArrayRef<int64_t> lbs,
-                                             ArrayRef<int64_t> ubs,
-                                             ArrayRef<int64_t> steps,
-                                             ArrayRef<InterpreterValue> inits) {
-  llvm::SmallVector<InterpreterValue> results;
-  for (int64_t i = 0; i < parallel.getNumReductions(); ++i) {
-    results.push_back(getInitOperand(parallel.getInitVals(), i, inits));
-  }
-
-  BufferView iter;
-  for (auto [lb, ub, step] : llvm::zip(lbs, ubs, steps)) {
-    iter.sizes.push_back((ub - lb + (step - 1)) / step);
-  }
-
-  llvm::DenseMap<scf::ReduceOp, int64_t> reduceOps;
-  for (auto& subOp : parallel.getBody()->getOperations()) {
-    if (auto reduce = llvm::dyn_cast<scf::ReduceOp>(subOp)) {
-      int64_t index = reduceOps.size();
-      reduceOps[reduce] = index;
-    }
-  }
-
-  assert(reduceOps.size() == results.size() &&
-         "expected equal number of reduce ops and results");
-
-  // Make the results available to reduce ops.
-  state.getTopScope()->setSideChannel(
-      std::make_shared<ParallelSideChannel>(results, reduceOps));
-  for (const auto& indices : iter.indices()) {
-    SmallVector<InterpreterValue> iterArgs;
-    for (auto [i, lb, step] : llvm::zip(indices, lbs, steps)) {
-      iterArgs.push_back(InterpreterValue{i * step + lb});
-    }
-
-    // Execute the region. It has no results.
-    interpret(state, parallel.getRegion(), iterArgs);
-  }
-
-  return results;
-}
-
-void reduce(InterpreterState& state, scf::ReduceOp reduce,
-            const InterpreterValue& operand) {
-  auto& accumulator =
-      state.getTopScope()->getSideChannel<ParallelSideChannel>()->result(
-          reduce);
-  // TODO(jreiffers): Is this the correct order?
-  auto results = interpret(state, reduce.getRegion(), {accumulator, operand});
-  if (!state.hasFailure()) {
-    accumulator = results.front();
-  }
-}
-
-llvm::SmallVector<InterpreterValue> scfWhile(
-    InterpreterState& state, scf::WhileOp op,
-    MutableArrayRef<InterpreterValue> inits) {
-  auto loopVars = interpret(state, op.getBefore(), inits);
-  while (!state.hasFailure() && std::get<bool>(loopVars.front().storage)) {
-    loopVars = interpret(state, op.getAfter(),
-                         ArrayRef<InterpreterValue>(loopVars).drop_front());
-    if (state.hasFailure()) break;
-    loopVars = interpret(state, op.getBefore(), loopVars);
-  }
-  if (state.hasFailure()) return {};
-  loopVars.erase(loopVars.begin());
-  return loopVars;
-}
-
-REGISTER_MLIR_INTERPRETER_OP("scf.condition", noOpTerminator);
-REGISTER_MLIR_INTERPRETER_OP("scf.reduce.return", noOpTerminator);
-REGISTER_MLIR_INTERPRETER_OP("scf.yield", noOpTerminator);
-REGISTER_MLIR_INTERPRETER_OP(forAll);
-REGISTER_MLIR_INTERPRETER_OP(inParallel);
-REGISTER_MLIR_INTERPRETER_OP(parallel);
-REGISTER_MLIR_INTERPRETER_OP(reduce);
-REGISTER_MLIR_INTERPRETER_OP(scfFor);
-REGISTER_MLIR_INTERPRETER_OP(scfIf);
-REGISTER_MLIR_INTERPRETER_OP(scfWhile);
-
-}  // namespace
-}  // namespace interpreter
-}  // namespace mlir
diff --git a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tensor.cc b/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tensor.cc
deleted file mode 100644
index 529f59369231ef..00000000000000
--- a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tensor.cc
+++ /dev/null
@@ -1,256 +0,0 @@
-/* Copyright 2022 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "mlir/Dialect/Tensor/IR/Tensor.h"
-
-// clang-format erroneously puts the Tensor.h header above.
-#include <optional>  // NOLINT
-
-#include "llvm/ADT/STLExtras.h"
-#include "tools/mlir_interpreter/dialects/util.h"
-#include "tools/mlir_interpreter/framework/interpreter.h"
-#include "tools/mlir_interpreter/framework/interpreter_value_util.h"
-#include "tools/mlir_interpreter/framework/registration.h"
-
-namespace mlir {
-namespace interpreter {
-namespace {
-
-int64_t dim(InterpreterState& state, tensor::DimOp,
-            const InterpreterValue& tensor, int64_t dim) {
-  return dimImpl(tensor, dim, state);
-}
-
-InterpreterValue empty(InterpreterState&, tensor::EmptyOp op,
-                       ArrayRef<int64_t> dynamicSizes) {
-  auto ty = op->getResultTypes().front().cast<mlir::ShapedType>();
-  auto shape = replaceDynamicVals(ty.getShape(), dynamicSizes);
-  return InterpreterValue::makeTensor(ty.getElementType(), shape);
-}
-
-InterpreterValue extract(InterpreterState& state, tensor::ExtractOp,
-                         InterpreterValue tensor, ArrayRef<int64_t> indices) {
-  if (!tensor.view().inBounds(indices)) {
-    state.addFailure("array index out of bounds");
-    return {};
-  }
-  return tensor.extractElement(indices);
-}
-
-InterpreterValue fromElements(InterpreterState&, tensor::FromElementsOp op,
-                              MutableArrayRef<InterpreterValue> elements) {
-  auto ty = op->getResultTypes().front().cast<mlir::ShapedType>();
-  auto result = InterpreterValue::makeTensor(ty.getElementType(),
-                                             llvm::to_vector(ty.getShape()));
-  for (auto [index, element] : llvm::zip(result.view().indices(), elements)) {
-    result.insertElement(index, element);
-  }
-  return result;
-}
-
-llvm::SmallVector<InterpreterValue> collapseShape(
-    InterpreterState&, tensor::CollapseShapeOp op,
-    const InterpreterValue& tensor) {
-  SmallVector<int64_t> sizes;
-  for (const auto& indices : op.getReassociationIndices()) {
-    int64_t size = 1;
-    for (auto dim : indices) {
-      size *= tensor.view().sizes[dim];
-    }
-    sizes.push_back(size);
-  }
-  return {reshapeTensor(tensor, sizes)};
-}
-
-llvm::SmallVector<InterpreterValue> expandShape(
-    InterpreterState&, tensor::ExpandShapeOp op,
-    const InterpreterValue& tensor) {
-  auto ty = op->getResultTypes().front().template cast<mlir::ShapedType>();
-  auto sizes = llvm::to_vector(ty.getShape());
-  for (const auto& [srcIndex, dstIndices] :
-       llvm::enumerate(op.getReassociationIndices())) {
-    int64_t size = tensor.view().sizes[srcIndex];
-    std::optional<int64_t> dynIndex = std::nullopt;
-    for (auto dim : dstIndices) {
-      if (sizes[dim] < 0) {
-        dynIndex = dim;
-      } else {
-        size /= sizes[dim];
-      }
-    }
-    if (dynIndex) {
-      sizes[*dynIndex] = size;
-    }
-  }
-  return {reshapeTensor(tensor, sizes)};
-}
-
-llvm::SmallVector<InterpreterValue> extractSlice(
-    InterpreterState&, tensor::ExtractSliceOp extract, InterpreterValue tensor,
-    ArrayRef<int64_t> dynamicOffsets, ArrayRef<int64_t> dynamicSizes,
-    ArrayRef<int64_t> dynamicStrides) {
-  auto v = extractOffsetsSizesStrides(dynamicOffsets, dynamicSizes,
-                                      dynamicStrides, extract);
-  int64_t rank = v.offsets.size();
-  auto out = tensor.typedAlike(v.sizes);
-  out.fill([&](llvm::ArrayRef<int64_t> indices) {
-    llvm::SmallVector<int64_t> srcIndices;
-    for (int64_t i = 0; i < rank; ++i) {
-      srcIndices.push_back(indices[i] * v.strides[i] + v.offsets[i]);
-    }
-    return tensor.extractElement(srcIndices);
-  });
-
-  int64_t numDropped = 0;
-  auto& outView = out.view();
-
-  // TODO(jreiffers): Figure out why getDroppedDims fails here when there's
-  // no rank reduction and the output has a dynamic shape.
-  int64_t dim = 0;
-  const auto& resultSizes = extract.getResultType().getShape();
-  const auto& staticSizes = extract.getStaticSizes();
-  while (dim < outView.rank()) {
-    if (staticSizes[numDropped + dim] == 1 &&
-        (dim >= resultSizes.size() || resultSizes[dim] != 1)) {
-      outView.sizes.erase(outView.sizes.begin() + dim);
-      outView.strides.erase(outView.strides.begin() + dim);
-      ++numDropped;
-    } else {
-      ++dim;
-    }
-  }
-
-  return {out};
-}
-
-template <typename Op>
-llvm::SmallVector<InterpreterValue> insertSlice(
-    InterpreterState&, Op insert, InterpreterValue src, InterpreterValue dest,
-    ArrayRef<int64_t> dynamicOffsets, ArrayRef<int64_t> dynamicSizes,
-    ArrayRef<int64_t> dynamicStrides) {
-  // parallel_insert_slice actually writes to its destination.
-  if (insert->getNumResults() == 1) {
-    dest = dest.clone();
-  }
-  auto v = extractOffsetsSizesStrides(dynamicOffsets, dynamicSizes,
-                                      dynamicStrides, insert);
-
-  auto staticSizes = insert.getStaticSizes();
-  llvm::SmallVector<int64_t> insertedDims;
-  auto& srcView = src.view();
-  auto* srcSizeIt = srcView.sizes.begin();
-  for (auto [dim, size] : llvm::enumerate(staticSizes)) {
-    if (srcSizeIt == srcView.sizes.end() || (*srcSizeIt != size && size >= 0)) {
-      assert(size == 1 && "Can only insert unit dims");
-      insertedDims.push_back(dim);
-    } else {
-      ++srcSizeIt;
-    }
-  }
-
-  for (const auto& srcIndices : srcView.indices()) {
-    llvm::SmallVector<int64_t> srcWithInsertedDims = srcIndices;
-    for (int64_t dim : insertedDims) {
-      srcWithInsertedDims.insert(srcWithInsertedDims.begin() + dim, 0);
-    }
-    llvm::SmallVector<int64_t> dstIndices;
-    for (auto [srcIndex, stride, offset] :
-         llvm::zip(srcWithInsertedDims, v.strides, v.offsets)) {
-      dstIndices.push_back(srcIndex * stride + offset);
-    }
-    dest.insertElement(dstIndices, src.extractElement(srcIndices));
-  }
-  if (insert->getNumResults() == 1) {
-    return {dest};
-  }
-  return {};
-}
-
-InterpreterValue generate(InterpreterState& state, tensor::GenerateOp generate,
-                          ArrayRef<int64_t> dynamicSizes) {
-  auto ty = generate->getResultTypes().front().cast<ShapedType>();
-  auto sizes = replaceDynamicVals(ty.getShape(), dynamicSizes);
-
-  auto result = InterpreterValue::makeTensor(ty.getElementType(), sizes);
-  result.fill([&](ArrayRef<int64_t> indices) -> InterpreterValue {
-    auto values = interpret(state, generate.getRegion(),
-                            packInterpreterValues<int64_t>(indices));
-    if (state.hasFailure()) {
-      return {result.extractElement(indices)};
-    }
-    return values.front();
-  });
-  return {result};
-}
-
-InterpreterValue insert(InterpreterState& state, tensor::InsertOp,
-                        const InterpreterValue& value,
-                        const InterpreterValue& tensor,
-                        ArrayRef<int64_t> indices) {
-  auto result = tensor.clone();
-  if (result.view().inBounds(indices)) {
-    result.insertElement(indices, value);
-  } else {
-    state.addFailure("array index out of bounds");
-  }
-  return result;
-}
-
-InterpreterValue pad(InterpreterState& state, tensor::PadOp pad,
-                     InterpreterValue tensor, ArrayRef<int64_t> dynamicLows,
-                     ArrayRef<int64_t> dynamicHighs) {
-  auto lows = replaceDynamicVals(pad.getStaticLow(), dynamicLows);
-  auto highs = replaceDynamicVals(pad.getStaticHigh(), dynamicHighs);
-
-  auto& view = tensor.view();
-  llvm::SmallVector<int64_t> resultSizes;
-  for (auto [size, low, high] : llvm::zip(view.sizes, lows, highs)) {
-    resultSizes.push_back(size + low + high);
-  }
-
-  auto result = tensor.typedAlike(resultSizes);
-  result.fill([&](llvm::ArrayRef<int64_t> outIndex) -> InterpreterValue {
-    llvm::SmallVector<int64_t> inIndex;
-    for (auto [index, low] : llvm::zip(outIndex, lows)) {
-      inIndex.push_back(index - low);
-    }
-    if (view.inBounds(inIndex)) {
-      return tensor.extractElement(inIndex);
-    }
-    return interpret(state, pad.getRegion(), packInterpreterValues(outIndex))
-        .front();
-  });
-  return result;
-}
-
-REGISTER_MLIR_INTERPRETER_OP("tensor.cast",
-                             "builtin.unrealized_conversion_cast");
-REGISTER_MLIR_INTERPRETER_OP("tensor.yield", noOpTerminator);
-REGISTER_MLIR_INTERPRETER_OP(collapseShape);
-REGISTER_MLIR_INTERPRETER_OP(dim);
-REGISTER_MLIR_INTERPRETER_OP(empty);
-REGISTER_MLIR_INTERPRETER_OP(expandShape);
-REGISTER_MLIR_INTERPRETER_OP(extract);
-REGISTER_MLIR_INTERPRETER_OP(extractSlice);
-REGISTER_MLIR_INTERPRETER_OP(fromElements);
-REGISTER_MLIR_INTERPRETER_OP(generate);
-REGISTER_MLIR_INTERPRETER_OP(insert);
-REGISTER_MLIR_INTERPRETER_OP(insertSlice<tensor::InsertSliceOp>);
-REGISTER_MLIR_INTERPRETER_OP(insertSlice<tensor::ParallelInsertSliceOp>);
-REGISTER_MLIR_INTERPRETER_OP(pad);
-
-}  // namespace
-}  // namespace interpreter
-}  // namespace mlir
diff --git a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/BUILD b/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/BUILD
deleted file mode 100644
index 0302467446f485..00000000000000
--- a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/BUILD
+++ /dev/null
@@ -1,24 +0,0 @@
-load("//xla:glob_lit_test.bzl", "glob_lit_tests")
-load("@local_tsl//tsl:tsl.default.bzl", "filegroup")
-
-# copybara:uncomment package(default_applicable_licenses = ["//tensorflow:license"])
-
-glob_lit_tests(
-    name = "all_tests",
-    data = [":test_utilities"],
-    driver = "@llvm-project//mlir:run_lit.sh",
-    test_file_exts = [
-        "mlir",
-    ],
-)
-
-# Bundle together all of the test utilities that are used by tests.
-filegroup(
-    name = "test_utilities",
-    testonly = True,
-    data = [
-        "//xla/mlir_hlo:mlir-interpreter-runner",
-        "@llvm-project//llvm:FileCheck",
-    ],
-    visibility = ["//visibility:public"],
-)
diff --git a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/affine/apply.mlir b/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/affine/apply.mlir
deleted file mode 100644
index 43feca045e927f..00000000000000
--- a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/affine/apply.mlir
+++ /dev/null
@@ -1,63 +0,0 @@
-// RUN: mlir-interpreter-runner %s -run-all | FileCheck %s
-
-func.func @ceildiv() -> index {
-  %c25 = arith.constant 25 : index
-  %ret = affine.apply affine_map<(d0)[] -> ((d0 ceildiv 8) * 8)>(%c25)[]
-  return %ret : index
-}
-
-// CHECK-LABEL: @ceildiv
-// CHECK: Results
-// CHECK-NEXT: 32
-
-func.func @floordiv() -> index {
-  %c25 = arith.constant 25 : index
-  %ret = affine.apply affine_map<(d0)[] -> ((d0 floordiv 8) * 8)>(%c25)[]
-  return %ret : index
-}
-
-// CHECK-LABEL: @floordiv
-// CHECK: Results
-// CHECK-NEXT: 24
-
-func.func @add() -> index {
-  %c100 = arith.constant 100 : index
-  %c42 = arith.constant 42 : index
-  %ret = affine.apply affine_map<(d0, d1)[] -> (d0 + d1)>(%c100, %c42)[]
-  return %ret : index
-}
-
-// CHECK-LABEL: @add
-// CHECK: Results
-// CHECK-NEXT: 142
-
-func.func @mod() -> index {
-  %c99 = arith.constant 99 : index
-  %ret = affine.apply affine_map<(d0)[] -> (d0 mod 10)>(%c99)[]
-  return %ret : index
-}
-
-// CHECK-LABEL: @mod
-// CHECK: Results
-// CHECK-NEXT: 9
-
-func.func @mul() -> index {
-  %c100 = arith.constant 100 : index
-  %ret = affine.apply affine_map<(d0)[] -> (d0 * 42)>(%c100)[]
-  return %ret : index
-}
-
-// CHECK-LABEL: @mul
-// CHECK: Results
-// CHECK-NEXT: 4200
-
-func.func @symbol() -> index {
-  %c1 = arith.constant 1 : index
-  %c20 = arith.constant 20 : index
-  %ret = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%c1)[%c20]
-  return %ret : index
-}
-
-// CHECK-LABEL: @symbol
-// CHECK: Results
-// CHECK-NEXT: 21
diff --git a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/affine/minmax.mlir b/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/affine/minmax.mlir
deleted file mode 100644
index cf38c45f3608ab..00000000000000
--- a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/affine/minmax.mlir
+++ /dev/null
@@ -1,36 +0,0 @@
-// RUN: mlir-interpreter-runner %s -run-all | FileCheck %s
-
-#map = affine_map<(d0)[] -> (1000, d0 + 512, d0*100)>
-
-func.func @min() -> (index, index, index) {
-  %c1 = arith.constant 1 : index
-  %c8 = arith.constant 8 : index
-  %c500 = arith.constant 500 : index
-
-  %0 = affine.min #map (%c1)[]
-  %1 = affine.min #map (%c8)[]
-  %2 = affine.min #map (%c500)[]
-
-  return %0, %1, %2 : index, index, index
-}
-
-// CHECK-LABEL: @min
-// CHECK-NEXT: Results
-// CHECK-NEXT: 100
-// CHECK-NEXT: 520
-// CHECK-NEXT: 1000
-
-func.func @max() -> (index, index) {
-  %c1 = arith.constant 1 : index
-  %c11 = arith.constant 11 : index
-
-  %0 = affine.max #map (%c1)[]
-  %1 = affine.max #map (%c11)[]
-
-  return %0, %1 : index, index
-}
-
-// CHECK-LABEL: @max
-// CHECK-NEXT: Results
-// CHECK-NEXT: 1000
-// CHECK-NEXT: 1100
diff --git a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/arith/bitcast.mlir b/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/arith/bitcast.mlir
deleted file mode 100644
index 5dc30c1fbbba20..00000000000000
--- a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/arith/bitcast.mlir
+++ /dev/null
@@ -1,21 +0,0 @@
-// RUN: mlir-interpreter-runner %s -run-all | FileCheck %s
-
-func.func @bitcast_scalar() -> i32 {
-  %c15 = arith.constant 1.5 : f32
-  %ret = arith.bitcast %c15 : f32 to i32
-  return %ret : i32
-}
-
-// CHECK-LABEL: @bitcast_scalar
-// CHECK-NEXT: Results
-// CHECK-NEXT: 1069547520
-
-func.func @bitcast_vector() -> vector<2xf32> {
-  %c15 = arith.constant dense<[15, 25]> : vector<2xi32>
-  %ret = arith.bitcast %c15 : vector<2xi32> to vector<2xf32>
-  return %ret : vector<2xf32>
-}
-
-// CHECK-LABEL: @bitcast_vector
-// CHECK-NEXT: Results
-// CHECK-NEXT: [2.101948e-44, 3.503246e-44]
diff --git a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/arith/cmpf.mlir b/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/arith/cmpf.mlir
deleted file mode 100644
index e416f353b18781..00000000000000
--- a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/arith/cmpf.mlir
+++ /dev/null
@@ -1,129 +0,0 @@
-// RUN: mlir-interpreter-runner %s -run-all | FileCheck %s
-
-func.func private @compare(%a: f32, %b: f32) -> tensor<16xi1> {
-  %false = arith.cmpf false, %a, %b : f32
-  %oeq = arith.cmpf oeq, %a, %b : f32
-  %ogt = arith.cmpf ogt, %a, %b : f32
-  %oge = arith.cmpf oge, %a, %b : f32
-
-  %olt = arith.cmpf olt, %a, %b : f32
-  %ole = arith.cmpf ole, %a, %b : f32
-  %one = arith.cmpf one, %a, %b : f32
-  %ord = arith.cmpf ord, %a, %b : f32
-
-  %ueq = arith.cmpf ueq, %a, %b : f32
-  %ugt = arith.cmpf ugt, %a, %b : f32
-  %uge = arith.cmpf uge, %a, %b : f32
-  %ult = arith.cmpf ult, %a, %b : f32
-
-  %ule = arith.cmpf ule, %a, %b : f32
-  %une = arith.cmpf une, %a, %b : f32
-  %uno = arith.cmpf uno, %a, %b : f32
-  %true = arith.cmpf true, %a, %b : f32
-
-  %ret = tensor.from_elements
-    %false, %oeq, %ogt, %oge,
-    %olt, %ole, %one, %ord,
-    %ueq, %ugt, %uge, %ult,
-    %ule, %une, %uno, %true : tensor<16xi1>
-  func.return %ret : tensor<16xi1>
-}
-
-func.func @nan_vs_one() -> tensor<16xi1> {
-  %nan = arith.constant 0x7fc00000 : f32
-  %one = arith.constant 1.0 : f32
-
-  %ret = func.call @compare(%nan, %one) : (f32, f32) -> tensor<16xi1>
-  func.return %ret : tensor<16xi1>
-}
-
-// CHECK-LABEL: @nan_vs_one
-// CHECK-NEXT: Results
-// CHECK-NEXT: [false, false, false, false,
-// CHECK-SAME:  false, false, false, false,
-// CHECK-SAME:  true, true, true, true,
-// CHECK-SAME:  true, true, true, true]
-
-func.func @one_vs_nan() -> tensor<16xi1> {
-  %nan = arith.constant 0x7fc00000 : f32
-  %one = arith.constant 1.0 : f32
-
-  %ret = func.call @compare(%one, %nan) : (f32, f32) -> tensor<16xi1>
-  func.return %ret : tensor<16xi1>
-}
-
-// CHECK-LABEL: @one_vs_nan
-// CHECK-NEXT: Results
-// CHECK-NEXT: [false, false, false, false,
-// CHECK-SAME:  false, false, false, false,
-// CHECK-SAME:  true, true, true, true,
-// CHECK-SAME:  true, true, true, true]
-
-func.func @nan_vs_nan() -> tensor<16xi1> {
-  %nan = arith.constant 0x7fc00000 : f32
-
-  %ret = func.call @compare(%nan, %nan) : (f32, f32) -> tensor<16xi1>
-  func.return %ret : tensor<16xi1>
-}
-
-// CHECK-LABEL: @nan_vs_nan
-// CHECK-NEXT: Results
-// CHECK-NEXT: [false, false, false, false,
-// CHECK-SAME:  false, false, false, false,
-// CHECK-SAME:  true, true, true, true,
-// CHECK-SAME:  true, true, true, true]
-
-func.func @one_vs_one() -> tensor<16xi1> {
-  %one = arith.constant 1.0 : f32
-
-  %ret = func.call @compare(%one, %one) : (f32, f32) -> tensor<16xi1>
-  func.return %ret : tensor<16xi1>
-}
-
-// CHECK-LABEL: @one_vs_one
-// CHECK-NEXT: Results
-// CHECK-NEXT: [false, true, false, true,
-// CHECK-SAME:  false, true, false, true,
-// CHECK-SAME:  true, false, true, false,
-// CHECK-SAME:  true, false, false, true]
-
-func.func @one_vs_two() -> tensor<16xi1> {
-  %one = arith.constant 1.0 : f32
-  %two = arith.constant 2.0 : f32
-
-  %ret = func.call @compare(%one, %two) : (f32, f32) -> tensor<16xi1>
-  func.return %ret : tensor<16xi1>
-}
-
-// CHECK-LABEL: @one_vs_two
-// CHECK-NEXT: Results
-// CHECK-NEXT: [false, false, false, false,
-// CHECK-SAME:  true, true, true, true,
-// CHECK-SAME:  false, false, false, true,
-// CHECK-SAME:  true, true, false, true]
-
-func.func @two_vs_one() -> tensor<16xi1> {
-  %one = arith.constant 1.0 : f32
-  %two = arith.constant 2.0 : f32
-
-  %ret = func.call @compare(%two, %one) : (f32, f32) -> tensor<16xi1>
-  func.return %ret : tensor<16xi1>
-}
-
-// CHECK-LABEL: @two_vs_one
-// CHECK-NEXT: Results
-// CHECK-NEXT: [false, false, true, true,
-// CHECK-SAME:  false, false, true, true,
-// CHECK-SAME:  false, true, true, false,
-// CHECK-SAME:  false, true, false, true]
-
-func.func @vector() -> vector<4xi1> {
-  %0 = arith.constant dense<[1.0, 2.0, 3.0, 4.0]> : vector<4xf32>
-  %1 = arith.constant dense<[10.0, 2.0, 30.0, 4.0]> : vector<4xf32>
-  %ret = arith.cmpf oeq, %0, %1 : vector<4xf32>
-  return %ret : vector<4xi1>
-}
-
-// CHECK-LABEL: @vector
-// CHECK-NEXT: Results
-// CHECK-NEXT: vector<4xi1>: [false, true, false, true]
diff --git a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/arith/cmpi.mlir b/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/arith/cmpi.mlir
deleted file mode 100644
index dbca4f718d5ccd..00000000000000
--- a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/arith/cmpi.mlir
+++ /dev/null
@@ -1,147 +0,0 @@
-// RUN: mlir-interpreter-runner %s -run-all | FileCheck %s
-
-func.func @eq() -> (i1, i1) {
-  %c1 = arith.constant 1 : index
-  %c2 = arith.constant 2 : index
-  %e1 = arith.cmpi eq, %c1, %c2 : index
-  %e2 = arith.cmpi eq, %c1, %c1 : index
-  return %e1, %e2 : i1, i1
-}
-
-// CHECK-LABEL: @eq
-// CHECK-NEXT: Results
-// CHECK-NEXT{LITERAL}: false
-// CHECK-NEXT{LITERAL}: true
-
-func.func @ne() -> (i1, i1) {
-  %c1 = arith.constant 1 : index
-  %c2 = arith.constant 2 : index
-  %e1 = arith.cmpi ne, %c1, %c2 : index
-  %e2 = arith.cmpi ne, %c1, %c1 : index
-  return %e1, %e2 : i1, i1
-}
-
-// CHECK-LABEL: @ne
-// CHECK-NEXT: Results
-// CHECK-NEXT{LITERAL}: true
-// CHECK-NEXT{LITERAL}: false
-
-func.func @slt() -> (i1, i1, i1) {
-  %c-1 = arith.constant -1 : index
-  %c1 = arith.constant 1 : index
-  %e1 = arith.cmpi slt, %c-1, %c1 : index
-  %e2 = arith.cmpi slt, %c1, %c-1 : index
-  %e3 = arith.cmpi slt, %c1, %c1 : index
-  return %e1, %e2, %e3 : i1, i1, i1
-}
-
-// CHECK-LABEL: @slt
-// CHECK-NEXT: Results
-// CHECK-NEXT{LITERAL}: true
-// CHECK-NEXT{LITERAL}: false
-// CHECK-NEXT{LITERAL}: false
-
-func.func @sle() -> (i1, i1, i1) {
-  %c-1 = arith.constant -1 : index
-  %c1 = arith.constant 1 : index
-  %e1 = arith.cmpi sle, %c-1, %c1 : index
-  %e2 = arith.cmpi sle, %c1, %c-1 : index
-  %e3 = arith.cmpi sle, %c1, %c1 : index
-  return %e1, %e2, %e3 : i1, i1, i1
-}
-
-// CHECK-LABEL: @sle
-// CHECK-NEXT: Results
-// CHECK-NEXT{LITERAL}: true
-// CHECK-NEXT{LITERAL}: false
-// CHECK-NEXT{LITERAL}: true
-
-func.func @sgt() -> (i1, i1, i1) {
-  %c-1 = arith.constant -1 : index
-  %c1 = arith.constant 1 : index
-  %e1 = arith.cmpi sgt, %c-1, %c1 : index
-  %e2 = arith.cmpi sgt, %c1, %c-1 : index
-  %e3 = arith.cmpi sgt, %c1, %c1 : index
-  return %e1, %e2, %e3 : i1, i1, i1
-}
-
-// CHECK-LABEL: @sgt
-// CHECK-NEXT: Results
-// CHECK-NEXT{LITERAL}: false
-// CHECK-NEXT{LITERAL}: true
-// CHECK-NEXT{LITERAL}: false
-
-func.func @sge() -> (i1, i1, i1) {
-  %c-1 = arith.constant -1 : index
-  %c1 = arith.constant 1 : index
-  %e1 = arith.cmpi sge, %c-1, %c1 : index
-  %e2 = arith.cmpi sge, %c1, %c-1 : index
-  %e3 = arith.cmpi sge, %c1, %c1 : index
-  return %e1, %e2, %e3 : i1, i1, i1
-}
-
-// CHECK-LABEL: @sge
-// CHECK-NEXT: Results
-// CHECK-NEXT{LITERAL}: false
-// CHECK-NEXT{LITERAL}: true
-// CHECK-NEXT{LITERAL}: true
-
-func.func @ult() -> (i1, i1, i1) {
-  %c-1 = arith.constant -1 : index
-  %c1 = arith.constant 1 : index
-  %e1 = arith.cmpi ult, %c-1, %c1 : index
-  %e2 = arith.cmpi ult, %c1, %c-1 : index
-  %e3 = arith.cmpi ult, %c1, %c1 : index
-  return %e1, %e2, %e3 : i1, i1, i1
-}
-
-// CHECK-LABEL: @ult
-// CHECK-NEXT: Results
-// CHECK-NEXT{LITERAL}: false
-// CHECK-NEXT{LITERAL}: true
-// CHECK-NEXT{LITERAL}: false
-
-func.func @ule() -> (i1, i1, i1) {
-  %c-1 = arith.constant -1 : index
-  %c1 = arith.constant 1 : index
-  %e1 = arith.cmpi ule, %c-1, %c1 : index
-  %e2 = arith.cmpi ule, %c1, %c-1 : index
-  %e3 = arith.cmpi ule, %c1, %c1 : index
-  return %e1, %e2, %e3 : i1, i1, i1
-}
-
-// CHECK-LABEL: @ule
-// CHECK-NEXT: Results
-// CHECK-NEXT{LITERAL}: false
-// CHECK-NEXT{LITERAL}: true
-// CHECK-NEXT{LITERAL}: true
-
-func.func @ugt() -> (i1, i1, i1) {
-  %c-1 = arith.constant -1 : index
-  %c1 = arith.constant 1 : index
-  %e1 = arith.cmpi ugt, %c-1, %c1 : index
-  %e2 = arith.cmpi ugt, %c1, %c-1 : index
-  %e3 = arith.cmpi ugt, %c1, %c1 : index
-  return %e1, %e2, %e3 : i1, i1, i1
-}
-
-// CHECK-LABEL: @ugt
-// CHECK-NEXT: Results
-// CHECK-NEXT{LITERAL}: true
-// CHECK-NEXT{LITERAL}: false
-// CHECK-NEXT{LITERAL}: false
-
-func.func @uge() -> (i1, i1, i1) {
-  %c-1 = arith.constant -1 : index
-  %c1 = arith.constant 1 : index
-  %e1 = arith.cmpi uge, %c-1, %c1 : index
-  %e2 = arith.cmpi uge, %c1, %c-1 : index
-  %e3 = arith.cmpi uge, %c1, %c1 : index
-  return %e1, %e2, %e3 : i1, i1, i1
-}
-
-// CHECK-LABEL: @uge
-// CHECK-NEXT: Results
-// CHECK-NEXT{LITERAL}: true
-// CHECK-NEXT{LITERAL}: false
-// CHECK-NEXT{LITERAL}: true
diff --git a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/arith/constant.mlir b/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/arith/constant.mlir
deleted file mode 100644
index b378f008bada34..00000000000000
--- a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/arith/constant.mlir
+++ /dev/null
@@ -1,37 +0,0 @@
-// RUN: mlir-interpreter-runner %s -run-all | FileCheck %s
-
-func.func @tensor() -> tensor<2xi16> {
-  %cst = arith.constant dense<[42, 43]> : tensor<2xi16>
-  return %cst : tensor<2xi16>
-}
-
-// CHECK-LABEL: @tensor
-// CHECK-NEXT: Results
-// CHECK-NEXT: [42, 43]
-
-func.func @tensor_splat() -> tensor<2xi32> {
-  %cst = arith.constant dense<42> : tensor<2xi32>
-  return %cst : tensor<2xi32>
-}
-
-// CHECK-LABEL: @tensor_splat
-// CHECK-NEXT: Results
-// CHECK-NEXT: [42, 42]
-
-func.func @scalar() -> i1 {
-  %cst = arith.constant true
-  return %cst : i1
-}
-
-// CHECK-LABEL: @scalar
-// CHECK-NEXT: Results
-// CHECK-NEXT: true
-
-func.func @vector() -> vector<2x3xi32> {
-  %cst = arith.constant dense<[[1, 2, 3], [4, 5, 6]]> : vector<2x3xi32>
-  return %cst : vector<2x3xi32>
-}
-
-// CHECK-LABEL: @vector
-// CHECK-NEXT: Results
-// CHECK-NEXT{LITERAL}: vector<2x3xi32>: [[1, 2, 3], [4, 5, 6]]
diff --git a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/arith/extf.mlir b/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/arith/extf.mlir
deleted file mode 100644
index f6124ea07c3706..00000000000000
--- a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/arith/extf.mlir
+++ /dev/null
@@ -1,11 +0,0 @@
-// RUN: mlir-interpreter-runner %s -run-all | FileCheck %s
-
-func.func @extf() -> f64 {
-  %c1 = arith.constant 1.0 : f32
-  %ext = arith.extf %c1 : f32 to f64
-  return %ext : f64
-}
-
-// CHECK-LABEL: @extf
-// CHECK-NEXT: Results
-// CHECK-NEXT: f64: 1.000000e+00
diff --git a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/arith/fptosi.mlir b/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/arith/fptosi.mlir
deleted file mode 100644
index 46592e49f2ed33..00000000000000
--- a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/arith/fptosi.mlir
+++ /dev/null
@@ -1,21 +0,0 @@
-// RUN: mlir-interpreter-runner %s -run-all | FileCheck %s
-
-func.func @i16() -> i16 {
-  %c-1 = arith.constant -1.4 : f32
-  %r = arith.fptosi %c-1 : f32 to i16
-  return %r : i16
-}
-
-// CHECK-LABEL: @i16
-// CHECK-NEXT: Results
-// CHECK-NEXT{LITERAL}: -1
-
-func.func @vector() -> vector<2xi32> {
-  %c = arith.constant dense<[-1.1, -0.9]> : vector<2xf32>
-  %r = arith.fptosi %c : vector<2xf32> to vector<2xi32>
-  return %r : vector<2xi32>
-}
-
-// CHECK-LABEL: @vector
-// CHECK-NEXT: Results
-// CHECK-NEXT{LITERAL}: vector<2xi32>: [-1, 0]
diff --git a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/arith/index_cast.mlir b/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/arith/index_cast.mlir
deleted file mode 100644
index d47438e1992062..00000000000000
--- a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/arith/index_cast.mlir
+++ /dev/null
@@ -1,28 +0,0 @@
-// RUN: mlir-interpreter-runner %s -run-all | FileCheck %s
-
-func.func @i32() -> (index) {
-  %c1 = arith.constant 42 : i32
-  %index = arith.index_cast %c1 : i32 to index
-  return %index : index
-}
-
-// CHECK-LABEL: @i32
-// CHECK{LITERAL}: 42
-
-func.func @i64() -> (index) {
-  %c1 = arith.constant 43 : i64
-  %index = arith.index_cast %c1 : i64 to index
-  return %index : index
-}
-
-// CHECK-LABEL: @i64
-// CHECK{LITERAL}: 43
-
-func.func @narrowing() -> (i32) {
-  %c1 = arith.constant 0x100000001 : index
-  %i32 = arith.index_cast %c1 : index to i32
-  return %i32 : i32
-}
-
-// CHECK-LABEL: @narrowing
-// CHECK{LITERAL}: i32: 1
diff --git a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/arith/int_math.mlir b/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/arith/int_math.mlir
deleted file mode 100644
index a1c32c64730355..00000000000000
--- a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/arith/int_math.mlir
+++ /dev/null
@@ -1,111 +0,0 @@
-// RUN: mlir-interpreter-runner %s -run-all | FileCheck %s
-
-func.func @addi() -> i32 {
-  %c1 = arith.constant 1 : i32
-  %c2 = arith.constant 2 : i32
-  %ret = arith.addi %c1, %c2 : i32
-  return %ret : i32
-}
-
-// CHECK-LABEL: @addi
-// CHECK-NEXT: Results
-// CHECK-NEXT: i32: 3
-
-func.func @muli() -> i32 {
-  %c3 = arith.constant 3 : i32
-  %c5 = arith.constant 5 : i32
-  %ret = arith.muli %c3, %c5 : i32
-  return %ret : i32
-}
-
-// CHECK-LABEL: @muli
-// CHECK-NEXT: Results
-// CHECK-NEXT: i32: 15
-
-func.func @divsi() -> i32 {
-  %c10 = arith.constant 10 : i32
-  %c-2 = arith.constant -2 : i32
-  %ret = arith.divsi %c10, %c-2 : i32
-  return %ret : i32
-}
-
-// CHECK-LABEL: @divsi
-// CHECK-NEXT: Results
-// CHECK-NEXT: i32: -5
-
-func.func @subi() -> i32 {
-  %c10 = arith.constant 10 : i32
-  %c3 = arith.constant 3 : i32
-  %ret = arith.subi %c10, %c3 : i32
-  return %ret : i32
-}
-
-// CHECK-LABEL: @subi
-// CHECK-NEXT: Results
-// CHECK-NEXT: i32: 7
-
-func.func @andi() -> i32 {
-  %c63 = arith.constant 63 : i32
-  %c131 = arith.constant 131 : i32
-  %ret = arith.andi %c63, %c131 : i32
-  return %ret : i32
-}
-
-// CHECK-LABEL: @andi
-// CHECK-NEXT: Results
-// CHECK-NEXT: i32: 3
-
-func.func @ori() -> i32 {
-  %c3 = arith.constant 3 : i32
-  %c10 = arith.constant 10 : i32
-  %ret = arith.ori %c3, %c10 : i32
-  return %ret : i32
-}
-
-// CHECK-LABEL: @ori
-// CHECK-NEXT: Results
-// CHECK-NEXT: i32: 11
-
-func.func @shrui() -> i32 {
-  %c20 = arith.constant 20 : i32
-  %c-1 = arith.constant -1 : i32
-  %ret = arith.shrui %c-1, %c20 : i32
-  return %ret : i32
-}
-
-// CHECK-LABEL: @shrui
-// CHECK-NEXT: Results
-// CHECK-NEXT: 4095
-
-func.func @shli() -> i32 {
-  %c2 = arith.constant 2 : i32
-  %c42 = arith.constant 42 : i32
-  %ret = arith.shli %c42, %c2 : i32
-  return %ret : i32
-}
-
-// CHECK-LABEL: @shli
-// CHECK-NEXT: Results
-// CHECK-NEXT: 168
-
-func.func @shrsi() -> i32 {
-  %c3 = arith.constant 3 : i32
-  %c-1023 = arith.constant -1023 : i32
-  %ret = arith.shrsi %c-1023, %c3 : i32
-  return %ret : i32
-}
-
-// CHECK-LABEL: @shrsi
-// CHECK-NEXT: Results
-// CHECK-NEXT: -128
-
-func.func @remsi() -> i32 {
-  %c7 = arith.constant 7 : i32
-  %c-1023 = arith.constant -1023 : i32
-  %ret = arith.remsi %c-1023, %c7 : i32
-  return %ret : i32
-}
-
-// CHECK-LABEL: @remsi
-// CHECK-NEXT: Results
-// CHECK-NEXT: -1
diff --git a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/arith/minmax.mlir b/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/arith/minmax.mlir
deleted file mode 100644
index 6d11378b5d7d31..00000000000000
--- a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/arith/minmax.mlir
+++ /dev/null
@@ -1,25 +0,0 @@
-// RUN: mlir-interpreter-runner %s -run-all | FileCheck %s
-
-func.func @i32() -> (i32, i32) {
-  %c-1 = arith.constant -1 : i32
-  %c1 = arith.constant 1 : i32
-  %r1 = arith.minsi %c-1, %c1 : i32
-  %r2 = arith.maxsi %c-1, %c1 : i32
-  return %r1, %r2 : i32, i32
-}
-
-// CHECK-LABEL: @i32
-// CHECK{LITERAL}: -1
-// CHECK-NEXT{LITERAL}: 1
-
-func.func @i64() -> (i64, i64) {
-  %c-1 = arith.constant -1 : i64
-  %c1 = arith.constant 1000000000000 : i64
-  %r1 = arith.minsi %c-1, %c1 : i64
-  %r2 = arith.maxsi %c-1, %c1 : i64
-  return %r1, %r2 : i64, i64
-}
-
-// CHECK-LABEL: @i64
-// CHECK{LITERAL}: -1
-// CHECK-NEXT{LITERAL}: 1000000000000
diff --git a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/arith/negf.mlir b/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/arith/negf.mlir
deleted file mode 100644
index 8b45b85a0611ba..00000000000000
--- a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/arith/negf.mlir
+++ /dev/null
@@ -1,21 +0,0 @@
-// RUN: mlir-interpreter-runner %s -run-all | FileCheck %s
-
-func.func @negf32() -> f32 {
-  %c = arith.constant -1.5 : f32
-  %ret = arith.negf %c : f32
-  return %ret : f32
-}
-
-// CHECK-LABEL: @negf32
-// CHECK-NEXT: Results
-// CHECK-NEXT: f32: 1.500000e+00
-
-func.func @negf64() -> f64 {
-  %c = arith.constant 3.5 : f64
-  %ret = arith.negf %c : f64
-  return %ret : f64
-}
-
-// CHECK-LABEL: @negf64
-// CHECK-NEXT: Results
-// CHECK-NEXT: f64: -3.500000e+00
diff --git a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/arith/remf.mlir b/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/arith/remf.mlir
deleted file mode 100644
index 3dc3033646cabd..00000000000000
--- a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/arith/remf.mlir
+++ /dev/null
@@ -1,12 +0,0 @@
-// RUN: mlir-interpreter-runner %s -run-all | FileCheck %s
-
-func.func @remf() -> f32 {
-  %a = arith.constant 3.5 : f32
-  %b = arith.constant 2.25 : f32
-  %ret = arith.remf %a, %b : f32
-  return %ret : f32
-}
-
-// CHECK-LABEL: @remf
-// CHECK-NEXT: Results
-// CHECK-NEXT: f32: 1.250000e+00
diff --git a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/arith/select.mlir b/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/arith/select.mlir
deleted file mode 100644
index 103842d4079e92..00000000000000
--- a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/arith/select.mlir
+++ /dev/null
@@ -1,52 +0,0 @@
-// RUN: mlir-interpreter-runner %s -run-all | FileCheck %s
-
-func.func @select() -> (i32, i32) {
-  %c-1 = arith.constant -1 : i32
-  %c1 = arith.constant 1 : i32
-  %true = arith.constant true
-  %false = arith.constant false
-  %r1 = arith.select %true, %c-1, %c1 : i32
-  %r2 = arith.select %false, %c-1, %c1 : i32
-  return %r1, %r2 : i32, i32
-}
-
-// CHECK-LABEL: @select
-// CHECK-NEXT: Results
-// CHECK-NEXT{LITERAL}: -1
-// CHECK-NEXT{LITERAL}: 1
-
-func.func @vector() -> vector<4xi32> {
-  %a = arith.constant dense<[1, 2, 3, 4]> : vector<4xi32>
-  %b = arith.constant dense<[10, 20, 30, 40]> : vector<4xi32>
-  %c = arith.constant dense<[true, false, true, false]> : vector<4xi1>
-  %r = arith.select %c, %a, %b : vector<4xi1>, vector<4xi32>
-  return %r : vector<4xi32>
-}
-
-// CHECK-LABEL: @vector
-// CHECK-NEXT: Results
-// CHECK-NEXT{LITERAL}: vector<4xi32>: [1, 20, 3, 40]
-
-func.func @scalar_vector() -> vector<4xi32> {
-  %a = arith.constant dense<[1, 2, 3, 4]> : vector<4xi32>
-  %b = arith.constant dense<[10, 20, 30, 40]> : vector<4xi32>
-  %c = arith.constant false
-  %r = arith.select %c, %a, %b : vector<4xi32>
-  return %r : vector<4xi32>
-}
-
-// CHECK-LABEL: @scalar_vector
-// CHECK-NEXT: Results
-// CHECK-NEXT{LITERAL}: vector<4xi32>: [10, 20, 30, 40]
-
-func.func @tensor() -> tensor<4xi32> {
-  %a = arith.constant dense<[1, 2, 3, 4]> : tensor<4xi32>
-  %b = arith.constant dense<[10, 20, 30, 40]> : tensor<4xi32>
-  %c = arith.constant dense<[true, false, true, false]> : tensor<4xi1>
-  %r = arith.select %c, %a, %b : tensor<4xi1>, tensor<4xi32>
-  return %r : tensor<4xi32>
-}
-
-// CHECK-LABEL: @tensor
-// CHECK-NEXT: Results
-// CHECK-NEXT{LITERAL}: TensorOrMemref<4xi32>: [1, 20, 3, 40]
diff --git a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/arith/sitofp.mlir b/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/arith/sitofp.mlir
deleted file mode 100644
index b2a18564d0fcd3..00000000000000
--- a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/arith/sitofp.mlir
+++ /dev/null
@@ -1,31 +0,0 @@
-// RUN: mlir-interpreter-runner %s -run-all | FileCheck %s
-
-func.func @i16() -> f32 {
-  %c-1 = arith.constant -1 : i16
-  %r = arith.sitofp %c-1 : i16 to f32
-  return %r : f32
-}
-
-// CHECK-LABEL: @i16
-// CHECK-NEXT: Results
-// CHECK-NEXT{LITERAL}: -1.000000e+00
-
-func.func @i1() -> f64 {
-  %true = arith.constant true
-  %r = arith.sitofp %true : i1 to f64
-  return %r : f64
-}
-
-// CHECK-LABEL: @i1
-// CHECK-NEXT: Results
-// CHECK-NEXT{LITERAL}: 1.000000e+00
-
-func.func @vector() -> vector<1xf32> {
-  %c-1 = arith.constant dense<-1> : vector<1xi8>
-  %r = arith.sitofp %c-1 : vector<1xi8> to vector<1xf32>
-  return %r : vector<1xf32>
-}
-
-// CHECK-LABEL: @vector
-// CHECK-NEXT: Results
-// CHECK-NEXT{LITERAL}: vector<1xf32>: [-1.000000e+00]
diff --git a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/arith/uitofp.mlir b/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/arith/uitofp.mlir
deleted file mode 100644
index 990e0196041858..00000000000000
--- a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/arith/uitofp.mlir
+++ /dev/null
@@ -1,31 +0,0 @@
-// RUN: mlir-interpreter-runner %s -run-all | FileCheck %s
-
-func.func @i16() -> f32 {
-  %c-1 = arith.constant -1 : i16
-  %r = arith.uitofp %c-1 : i16 to f32
-  return %r : f32
-}
-
-// CHECK-LABEL: @i16
-// CHECK-NEXT: Results
-// CHECK-NEXT{LITERAL}: 6.553500e+04
-
-func.func @i1() -> f64 {
-  %true = arith.constant true
-  %r = arith.uitofp %true : i1 to f64
-  return %r : f64
-}
-
-// CHECK-LABEL: @i1
-// CHECK-NEXT: Results
-// CHECK-NEXT{LITERAL}: 1.000000e+00
-
-func.func @vector() -> vector<1xf32> {
-  %c-1 = arith.constant dense<-1> : vector<1xi8>
-  %r = arith.uitofp %c-1 : vector<1xi8> to vector<1xf32>
-  return %r : vector<1xf32>
-}
-
-// CHECK-LABEL: @vector
-// CHECK-NEXT: Results
-// CHECK-NEXT{LITERAL}: vector<1xf32>: [2.550000e+02]
diff --git a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/arith/vector_math.mlir b/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/arith/vector_math.mlir
deleted file mode 100644
index 4ffc4d0ed81961..00000000000000
--- a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/arith/vector_math.mlir
+++ /dev/null
@@ -1,12 +0,0 @@
-// RUN: mlir-interpreter-runner %s -run-all | FileCheck %s
-
-func.func @addi() -> vector<2xi32> {
-  %c1 = arith.constant dense<[1, 2]> : vector<2xi32>
-  %c2 = arith.constant dense<[3, 4]> : vector<2xi32>
-  %ret = arith.addi %c1, %c2 : vector<2xi32>
-  return %ret : vector<2xi32>
-}
-
-// CHECK-LABEL: @addi
-// CHECK-NEXT: Results
-// CHECK-NEXT: vector<2xi32>: [4, 6]
diff --git a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/bufferization/alloc_tensor.mlir b/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/bufferization/alloc_tensor.mlir
deleted file mode 100644
index 8d47f63656a05f..00000000000000
--- a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/bufferization/alloc_tensor.mlir
+++ /dev/null
@@ -1,30 +0,0 @@
-// RUN: mlir-interpreter-runner %s -run-all | FileCheck %s
-
-func.func @static() -> tensor<1x2x3xi32> {
-  %t = bufferization.alloc_tensor() : tensor<1x2x3xi32>
-  return %t : tensor<1x2x3xi32>
-}
-
-// CHECK-LABEL: @static
-// CHECK-NEXT: Results
-// CHECK-NEXT{LITERAL}: [[[0, 0, 0], [0, 0, 0]]]
-
-func.func @dynamic() -> tensor<?x1xi32> {
-  %c4 = arith.constant 4 : index
-  %t = bufferization.alloc_tensor(%c4) : tensor<?x1xi32>
-  return %t : tensor<?x1xi32>
-}
-
-// CHECK-LABEL: @dynamic
-// CHECK-NEXT: Results
-// CHECK-NEXT{LITERAL}: [[0], [0], [0], [0]]
-
-func.func @copy() -> tensor<i32> {
-  %c = arith.constant dense<123> : tensor<i32>
-  %t = bufferization.alloc_tensor() copy(%c) : tensor<i32>
-  return %t : tensor<i32>
-}
-
-// CHECK-LABEL: @copy
-// CHECK-NEXT: Results
-// CHECK-NEXT: 123
diff --git a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/bufferization/clone.mlir b/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/bufferization/clone.mlir
deleted file mode 100644
index 09670eb52c81d6..00000000000000
--- a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/bufferization/clone.mlir
+++ /dev/null
@@ -1,14 +0,0 @@
-// RUN: mlir-interpreter-runner %s -run-all | FileCheck %s
-
-func.func @clone() -> (memref<i32>, memref<i32>) {
-  %a = arith.constant dense<1> : memref<i32>
-  %b = bufferization.clone %a : memref<i32> to memref<i32>
-  %c = arith.constant 2 : i32
-  memref.store %c, %b[] : memref<i32>
-  return %a, %b : memref<i32>, memref<i32>
-}
-
-// CHECK-LABEL: @clone
-// CHECK-NEXT: Results
-// CHECK-NEXT: TensorOrMemref<i32>: 1
-// CHECK-NEXT: TensorOrMemref<i32>: 2
diff --git a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/bufferization/to_memref.mlir b/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/bufferization/to_memref.mlir
deleted file mode 100644
index 2ed938c36c6ece..00000000000000
--- a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/bufferization/to_memref.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-// RUN: mlir-interpreter-runner %s -run-all | FileCheck %s
-
-func.func @memref() -> memref<2xi16> {
-  %cst = arith.constant dense<[42, 43]> : tensor<2xi16>
-  %memref = bufferization.to_memref %cst : memref<2xi16>
-  return %memref : memref<2xi16>
-}
-
-// CHECK-LABEL: @memref
-// CHECK{LITERAL}: [42, 43]
diff --git a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/bufferization/to_tensor.mlir b/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/bufferization/to_tensor.mlir
deleted file mode 100644
index 8bc19b4d895bff..00000000000000
--- a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/bufferization/to_tensor.mlir
+++ /dev/null
@@ -1,11 +0,0 @@
-// RUN: mlir-interpreter-runner %s -run-all | FileCheck %s
-
-func.func @tensor() -> tensor<2xi16> {
-  %cst = arith.constant dense<[43, 44]> : tensor<2xi16>
-  %memref = bufferization.to_memref %cst : memref<2xi16>
-  %tensor = bufferization.to_tensor %memref : memref<2xi16>
-  return %tensor : tensor<2xi16>
-}
-
-// CHECK-LABEL: @tensor
-// CHECK{LITERAL}: [43, 44]
diff --git a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/builtin/unrealized_conversion_cast.mlir b/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/builtin/unrealized_conversion_cast.mlir
deleted file mode 100644
index 55c481677eb4ad..00000000000000
--- a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/builtin/unrealized_conversion_cast.mlir
+++ /dev/null
@@ -1,21 +0,0 @@
-// RUN: mlir-interpreter-runner %s -run-all | FileCheck %s
-
-func.func @no_op_cast() -> i32 {
-  %cst = arith.constant 42 : i32
-  %cast = builtin.unrealized_conversion_cast %cst : i32 to i32
-  return %cast : i32
-}
-
-// CHECK-LABEL: @no_op_cast
-// CHECK-NEXT: Results
-// CHECK{LITERAL}: 42
-
-func.func @cast_to_dynamic() -> tensor<?xi32> {
-  %cst = arith.constant dense<[0, 1, 2]> : tensor<3xi32>
-  %cast = builtin.unrealized_conversion_cast %cst : tensor<3xi32> to tensor<?xi32>
-  return %cast : tensor<?xi32>
-}
-
-// CHECK-LABEL: @cast_to_dynamic
-// CHECK-NEXT: Results
-// CHECK{LITERAL}: [0, 1, 2]
diff --git a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/complex/complex.mlir b/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/complex/complex.mlir
deleted file mode 100644
index f795c0888b7f6a..00000000000000
--- a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/complex/complex.mlir
+++ /dev/null
@@ -1,186 +0,0 @@
-// RUN: mlir-interpreter-runner %s -run-all | FileCheck %s
-
-func.func @abs() -> f32 {
-  %c = complex.constant [1.0 : f32, 2.0 : f32] : complex<f32>
-  %ret = complex.abs %c : complex<f32>
-  return %ret : f32
-}
-
-// CHECK-LABEL: @abs
-// CHECK-NEXT: Results
-// CHECK-NEXT: 2.236068e+00
-
-func.func @add() -> complex<f32> {
-  %a = complex.constant [1.0 : f32, 2.0 : f32] : complex<f32>
-  %b = complex.constant [10.0 : f32, 200.0 : f32] : complex<f32>
-  %ret = complex.add %a, %b : complex<f32>
-  return %ret : complex<f32>
-}
-
-// CHECK-LABEL: @add
-// CHECK-NEXT: Results
-// CHECK-NEXT: 1.100000e+01+2.020000e+02i
-
-func.func @cos() -> complex<f32> {
-  %a = complex.constant [1.0 : f32, 2.0 : f32] : complex<f32>
-  %ret = complex.cos %a : complex<f32>
-  return %ret : complex<f32>
-}
-
-// CHECK-LABEL: @cos
-// CHECK-NEXT: Results
-// CHECK-NEXT: 2.032723e+00-3.051898e+00i
-
-func.func @create() -> complex<f32> {
-  %c1 = arith.constant 1.0 : f32
-  %c2 = arith.constant 2.0 : f32
-  %ret = complex.create %c1, %c2 : complex<f32>
-  return %ret : complex<f32>
-}
-
-// CHECK-LABEL: @create
-// CHECK-NEXT: Results
-// CHECK-NEXT: 1.000000e+00+2.000000e+00i
-
-func.func @div() -> complex<f32> {
-  %a = complex.constant [1.0 : f32, 2.0 : f32] : complex<f32>
-  %b = complex.constant [3.0 : f32, 4.0 : f32] : complex<f32>
-  %ret = complex.div %a, %b : complex<f32>
-  return %ret : complex<f32>
-}
-
-// CHECK-LABEL: @div
-// CHECK-NEXT: Results
-// CHECK-NEXT: 4.400000e-01+8.000000e-02i
-
-func.func @exp() -> complex<f32> {
-  %a = complex.constant [1.0 : f32, 2.0 : f32] : complex<f32>
-  %ret = complex.exp %a : complex<f32>
-  return %ret : complex<f32>
-}
-
-// CHECK-LABEL: @exp
-// CHECK-NEXT: Results
-// CHECK-NEXT: -1.131204e+00+2.471727e+00i
-
-func.func @expm1() -> complex<f32> {
-  %a = complex.constant [1.0e-06 : f32, 1.0e-06 : f32] : complex<f32>
-  %ret = complex.expm1 %a : complex<f32>
-  return %ret : complex<f32>
-}
-
-// CHECK-LABEL: @expm1
-// CHECK-NEXT: Results
-// CHECK-NEXT: 1.000000e-06+1.000001e-06i
-
-func.func @im() -> f32 {
-  %a = complex.constant [1.0 : f32, 2.0 : f32] : complex<f32>
-  %ret = complex.im %a : complex<f32>
-  return %ret : f32
-}
-
-// CHECK-LABEL: @im
-// CHECK-NEXT: Results
-// CHECK-NEXT: 2.000000e+00
-
-func.func @log() -> complex<f32> {
-  %a = complex.constant [1.0 : f32, 2.0 : f32] : complex<f32>
-  %ret = complex.log %a : complex<f32>
-  return %ret : complex<f32>
-}
-
-// CHECK-LABEL: @log
-// CHECK-NEXT: Results
-// CHECK-NEXT: 8.047190e-01+1.107149e+00i
-
-func.func @log1p() -> complex<f32> {
-  %a = complex.constant [1.0e-07 : f32, 1.0e-20 : f32] : complex<f32>
-  %ret = complex.log1p %a : complex<f32>
-  return %ret : complex<f32>
-}
-
-// CHECK-LABEL: @log1p
-// CHECK-NEXT: Results
-// CHECK-NEXT: 1.192093e-07+9.999999e-21i
-
-func.func @mul() -> complex<f32> {
-  %a = complex.constant [1.0 : f32, 2.0 : f32] : complex<f32>
-  %b = complex.constant [3.0 : f32, 4.0 : f32] : complex<f32>
-  %ret = complex.mul %a, %b : complex<f32>
-  return %ret : complex<f32>
-}
-
-// CHECK-LABEL: @mul
-// CHECK-NEXT: Results
-// CHECK-NEXT: -5.000000e+00+1.000000e+01i
-
-func.func @neg() -> complex<f32> {
-  %a = complex.constant [1.0 : f32, 2.0 : f32] : complex<f32>
-  %ret = complex.neg %a : complex<f32>
-  return %ret : complex<f32>
-}
-
-// CHECK-LABEL: @neg
-// CHECK-NEXT: Results
-// CHECK-NEXT: -1.000000e+00-2.000000e+00i
-
-func.func @pow() -> complex<f32> {
-  %a = complex.constant [1.0 : f32, 2.0 : f32] : complex<f32>
-  %b = complex.constant [3.0 : f32, 4.0 : f32] : complex<f32>
-  %ret = complex.pow %a, %b : complex<f32>
-  return %ret : complex<f32>
-}
-
-// CHECK-LABEL: @pow
-// CHECK-NEXT: Results
-// CHECK-NEXT: 1.290096e-01+3.392413e-02i
-
-func.func @re() -> f32 {
-  %a = complex.constant [1.0 : f32, 2.0 : f32] : complex<f32>
-  %ret = complex.re %a : complex<f32>
-  return %ret : f32
-}
-
-// CHECK-LABEL: @re
-// CHECK-NEXT: Results
-// CHECK-NEXT: 1.000000e+00
-
-func.func @rsqrt() -> complex<f32> {
-  %a = complex.constant [1.0 : f32, 2.0 : f32] : complex<f32>
-  %ret = complex.rsqrt %a : complex<f32>
-  return %ret : complex<f32>
-}
-
-// CHECK-LABEL: @rsqrt
-// CHECK-NEXT: Results
-// CHECK-NEXT: 5.688645e-01-3.515776e-01i
-
-func.func @sin() -> complex<f64> {
-  %a = complex.constant [1.0 : f64, 2.0 : f64] : complex<f64>
-  %ret = complex.sin %a : complex<f64>
-  return %ret : complex<f64>
-}
-
-// CHECK-LABEL: @sin
-// CHECK-NEXT: Results
-// CHECK-NEXT: 3.165779e+00+1.959601e+00i
-
-func.func @sqrt() -> complex<f32> {
-  %a = complex.constant [1.0 : f32, 2.0 : f32] : complex<f32>
-  %ret = complex.sqrt %a : complex<f32>
-  return %ret : complex<f32>
-}
-
-// CHECK-LABEL: @sqrt
-// CHECK-NEXT: Results
-// CHECK-NEXT: 1.272020e+00+7.861515e-01i
-
-func.func @tanh() -> complex<f32> {
-  %a = complex.constant [1.0 : f32, 2.0 : f32] : complex<f32>
-  %ret = complex.tanh %a : complex<f32>
-  return %ret : complex<f32>
-}
-
-// CHECK-LABEL: @tanh
-// CHECK-NEXT: Results
-// CHECK-NEXT: 1.166736e+00-2.434582e-01i
diff --git a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/deallocation/deallocation.mlir b/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/deallocation/deallocation.mlir
deleted file mode 100644
index e668c0f41d552b..00000000000000
--- a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/deallocation/deallocation.mlir
+++ /dev/null
@@ -1,51 +0,0 @@
-// RUN: mlir-interpreter-runner %s -run-all | FileCheck %s
-
-func.func @null() -> !deallocation.ownership {
-  %null = deallocation.null
-  return %null : !deallocation.ownership
-}
-
-// CHECK-LABEL: @null
-// CHECK-NEXT: Results
-// CHECK-NEXT: null
-
-func.func @get_buffer() -> (index, index, index) {
-  %null = deallocation.null
-  %a = memref.alloc() : memref<f32>
-  %a_owned = deallocation.own %a : memref<f32>
-  %null_buffer = deallocation.get_buffer %null : !deallocation.ownership
-  %a_buffer = deallocation.get_buffer %a : memref<f32>
-  %a_owned_buffer = deallocation.get_buffer %a_owned : !deallocation.ownership
-  return %null_buffer, %a_buffer, %a_owned_buffer : index, index, index
-}
-
-// CHECK-LABEL: @get_buffer
-// CHECK-NEXT: Results
-// CHECK-NEXT: i64: 0
-// CHECK-NEXT: i64: [[ADDR:[0-9]+$]]
-// CHECK-NEXT: i64: [[ADDR]]
-
-func.func @retain() -> (memref<1xf32>, memref<f32>, !deallocation.ownership) {
-  %a = memref.alloc() : memref<1xf32>
-  %a_owned = deallocation.own %a : memref<1xf32>
-  %b = memref.alloc() : memref<f32>
-  %b_owned = deallocation.own %b : memref<f32>
-
-  %c0 = arith.constant 0 : index
-  %c1 = arith.constant 1.0 : f32
-  memref.store %c1, %a[%c0] : memref<1xf32>
-
-  %d = deallocation.null
-
-  %e = memref.cast %a : memref<1xf32> to memref<?xf32>
-  %f = deallocation.retain(%e) of (%b_owned, %a_owned, %d) :
-    (memref<?xf32>, !deallocation.ownership, !deallocation.ownership, !deallocation.ownership)
-    -> (!deallocation.ownership)
-  return %a, %b, %f : memref<1xf32>, memref<f32>, !deallocation.ownership
-}
-
-// CHECK-LABEL: @retain
-// CHECK-NEXT: Results
-// CHECK-NEXT: <1xf32>: [1.000000e+00]
-// CHECK-NEXT: <<deallocated>>
-// CHECK-NEXT: <1xf32>: [1.000000e+00]
diff --git a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/func/call.mlir b/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/func/call.mlir
deleted file mode 100644
index b23f9d7423b05b..00000000000000
--- a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/func/call.mlir
+++ /dev/null
@@ -1,48 +0,0 @@
-// RUN: mlir-interpreter-runner %s -run-all | FileCheck %s
-
-func.func @cbrtf_caller() -> f32 {
-  %c-27 = arith.constant -27.0 : f32
-  %ret = func.call @cbrtf(%c-27) : (f32) -> f32
-  func.return %ret : f32
-}
-
-// CHECK-LABEL: @cbrtf_caller
-// CHECK-NEXT: Results
-// CHECK-NEXT: -3.000000e+00
-
-func.func @cbrt_caller() -> f64 {
-  %c-8 = arith.constant -8.0 : f64
-  %ret = func.call @cbrt(%c-8) : (f64) -> f64
-  func.return %ret : f64
-}
-
-// CHECK-LABEL: @cbrt_caller
-// CHECK-NEXT: Results
-// CHECK-NEXT: -2.000000e+00
-
-func.func @atan2f_caller() -> f32 {
-  %c1 = arith.constant 1.0 : f32
-  %c2 = arith.constant 2.0 : f32
-  %ret = func.call @atan2f(%c1, %c2) : (f32, f32) -> f32
-  func.return %ret : f32
-}
-
-// CHECK-LABEL: @atan2f_caller
-// CHECK-NEXT: Results
-// CHECK-NEXT: 4.636476e-01
-
-func.func @atan2_caller() -> f64 {
-  %c2 = arith.constant 2.0 : f64
-  %c3 = arith.constant 3.0 : f64
-  %ret = func.call @atan2(%c2, %c3) : (f64, f64) -> f64
-  func.return %ret : f64
-}
-
-// CHECK-LABEL: @atan2_caller
-// CHECK-NEXT: Results
-// CHECK-NEXT: 5.880026e-01
-
-func.func private @cbrtf(%a: f32) -> f32
-func.func private @cbrt(%a: f64) -> f64
-func.func private @atan2f(%a: f32, %b: f32) -> f32
-func.func private @atan2(%a: f64, %b: f64) -> f64
diff --git a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/gml_st/fusion.mlir b/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/gml_st/fusion.mlir
deleted file mode 100644
index 1c12fc71438b79..00000000000000
--- a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/gml_st/fusion.mlir
+++ /dev/null
@@ -1,35 +0,0 @@
-// RUN: mlir-interpreter-runner %s -run-all | FileCheck %s
-
-func.func @gml_st_fusion() -> tensor<4xf32> {
-  %a = arith.constant dense<[1.0, 2.0, 3.0, 4.0]> : tensor<4xf32>
-  %b = tensor.empty() : tensor<4xf32>
-  %0 = gml_st.fusion ins(%a0 = %a : tensor<4xf32>)
-                     inits(%in = %b : tensor<4xf32>) {
-    %res = linalg.map { math.exp }
-      ins(%a0 : tensor<4xf32>)
-      outs(%in : tensor<4xf32>)
-    gml_st.yield %res : tensor<4xf32>
-  } : tensor<4xf32>
-  func.return %0 : tensor<4xf32>
-}
-
-// CHECK-LABEL: @gml_st_fusion
-// CHECK-NEXT: Results
-// CHECK-NEXT{LITERAL}: [2.718282e+00, 7.389056e+00, 2.008554e+01, 5.459815e+01]
-
-func.func @bufferized() -> memref<4xf32> {
-  %a = arith.constant dense<[1.0, 2.0, 3.0, 4.0]> : memref<4xf32>
-  %b = memref.alloc() : memref<4xf32>
-  gml_st.fusion ins(%a0 = %a : memref<4xf32>)
-                inits(%in = %b : memref<4xf32>) {
-    linalg.map { math.exp }
-      ins(%a0 : memref<4xf32>)
-      outs(%in : memref<4xf32>)
-    gml_st.yield %in : memref<4xf32>
-  }
-  func.return %b : memref<4xf32>
-}
-
-// CHECK-LABEL: @bufferized
-// CHECK-NEXT: Results
-// CHECK-NEXT{LITERAL}: [2.718282e+00, 7.389056e+00, 2.008554e+01, 5.459815e+01]
diff --git a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/linalg/broadcast.mlir b/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/linalg/broadcast.mlir
deleted file mode 100644
index 822b21dfc4c5fd..00000000000000
--- a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/linalg/broadcast.mlir
+++ /dev/null
@@ -1,30 +0,0 @@
-// RUN: mlir-interpreter-runner %s -run-all | FileCheck %s
-
-func.func @broadcast() -> (tensor<2x3xi32>, tensor<2x3xi32>) {
-  %v = arith.constant dense<[1,2]> : tensor<2xi32>
-  %init = tensor.empty() : tensor<2x3xi32>
-  %bcast = linalg.broadcast
-      ins(%v: tensor<2xi32>)
-      outs(%init: tensor<2x3xi32>)
-      dimensions = [1]
-  func.return %init, %bcast : tensor<2x3xi32>, tensor<2x3xi32>
-}
-
-// CHECK-LABEL: @broadcast
-// CHECK-NEXT: Results
-// CHECK-NEXT{LITERAL}: [[0, 0, 0], [0, 0, 0]]
-// CHECK-NEXT{LITERAL}: [[1, 1, 1], [2, 2, 2]]
-
-func.func @bufferized() -> memref<2x3xi32> {
-  %v = arith.constant dense<[1,2]> : tensor<2xi32>
-  %alloc = memref.alloc() : memref<2x3xi32>
-  linalg.broadcast
-      ins(%v: tensor<2xi32>)
-      outs(%alloc: memref<2x3xi32>)
-      dimensions = [1]
-  func.return %alloc : memref<2x3xi32>
-}
-
-// CHECK-LABEL: @bufferized
-// CHECK-NEXT: Results
-// CHECK-NEXT{LITERAL}: [[1, 1, 1], [2, 2, 2]]
diff --git a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/linalg/dot.mlir b/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/linalg/dot.mlir
deleted file mode 100644
index 895f51e0b0dd93..00000000000000
--- a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/linalg/dot.mlir
+++ /dev/null
@@ -1,14 +0,0 @@
-// RUN: mlir-interpreter-runner %s -run-all | FileCheck %s
-
-func.func @dot() -> tensor<i32> {
-  %lhs = arith.constant dense<[1, 2]> : tensor<2xi32>
-  %rhs = arith.constant dense<[3, 4]> : tensor<2xi32>
-  %init = tensor.empty() : tensor<i32>
-  %ret = linalg.dot ins(%lhs, %rhs: tensor<2xi32>, tensor<2xi32>)
-                    outs(%init: tensor<i32>) -> tensor<i32>
-  return %ret : tensor<i32>
-}
-
-// CHECK-LABEL: @dot
-// CHECK-NEXT: Results
-// CHECK-NEXT{LITERAL}: 11
diff --git a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/linalg/fill.mlir b/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/linalg/fill.mlir
deleted file mode 100644
index 20a91a7768df6b..00000000000000
--- a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/linalg/fill.mlir
+++ /dev/null
@@ -1,24 +0,0 @@
-// RUN: mlir-interpreter-runner %s -run-all | FileCheck %s
-
-func.func @fill() -> (tensor<2xi32>, tensor<2xi32>) {
-  %c42 = arith.constant 42 : i32
-  %init = tensor.empty() : tensor<2xi32>
-  %fill = linalg.fill ins(%c42 : i32) outs(%init : tensor<2xi32>) -> tensor<2xi32>
-  func.return %init, %fill : tensor<2xi32>, tensor<2xi32>
-}
-
-// CHECK-LABEL: @fill
-// CHECK-NEXT: Results
-// CHECK-NEXT{LITERAL}: [0, 0]
-// CHECK-NEXT{LITERAL}: [42, 42]
-
-func.func @bufferized() -> memref<2xi32> {
-  %c42 = arith.constant 42 : i32
-  %alloc = memref.alloc() : memref<2xi32>
-  linalg.fill ins(%c42 : i32) outs(%alloc : memref<2xi32>)
-  func.return %alloc : memref<2xi32>
-}
-
-// CHECK-LABEL: @bufferized
-// CHECK-NEXT: Results
-// CHECK-NEXT{LITERAL}: [42, 42]
diff --git a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/linalg/generic.mlir b/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/linalg/generic.mlir
deleted file mode 100644
index 38f0dd67113e12..00000000000000
--- a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/linalg/generic.mlir
+++ /dev/null
@@ -1,113 +0,0 @@
-// RUN: mlir-interpreter-runner %s -run-all | FileCheck %s
-
-#matmul_trait = {
-  indexing_maps = [
-    affine_map<(m, n, k) -> (m, k)>,
-    affine_map<(m, n, k) -> (k, n)>,
-    affine_map<(m, n, k) -> (m, n)>
-  ],
-  iterator_types = ["parallel", "parallel", "reduction"]
-}
-
-func.func @matmul() -> (tensor<2x2xi32>, tensor<2x2xi32>) {
-  %lhs = arith.constant dense<[[1, 2, 3], [4, 5, 6]]> : tensor<2x3xi32>
-  %rhs = arith.constant dense<[[1, 2], [3, 4], [5, 6]]> : tensor<3x2xi32>
-  %init = tensor.empty() : tensor<2x2xi32>
-  %ret = linalg.generic #matmul_trait
-    ins(%lhs, %rhs : tensor<2x3xi32>, tensor<3x2xi32>)
-    outs(%init : tensor<2x2xi32>) {
-    ^bb(%a: i32, %b: i32, %c: i32):
-      %d = arith.muli %a, %b: i32
-      %e = arith.addi %c, %d: i32
-      linalg.yield %e : i32
-    } -> tensor<2x2xi32>
-  return %ret, %init : tensor<2x2xi32>, tensor<2x2xi32>
-}
-
-// CHECK-LABEL: @matmul
-// CHECK-NEXT: Results
-// CHECK-NEXT{LITERAL}: [[22, 28], [49, 64]]
-// CHECK-NEXT{LITERAL}: [[0, 0], [0, 0]]
-
-func.func @bufferized() -> memref<2x2xi32> {
-  %lhs = arith.constant dense<[[1, 2, 3], [4, 5, 6]]> : tensor<2x3xi32>
-  %rhs = arith.constant dense<[[2, 1], [3, 4], [5, 6]]> : tensor<3x2xi32>
-  %alloc = memref.alloc() : memref<2x2xi32>
-  linalg.generic #matmul_trait
-    ins(%lhs, %rhs : tensor<2x3xi32>, tensor<3x2xi32>)
-    outs(%alloc : memref<2x2xi32>) {
-    ^bb(%a: i32, %b: i32, %c: i32):
-      %d = arith.muli %a, %b: i32
-      %e = arith.addi %c, %d: i32
-      linalg.yield %e : i32
-    }
-  return %alloc : memref<2x2xi32>
-}
-
-// CHECK-LABEL: @bufferized
-// CHECK-NEXT: Results
-// CHECK-NEXT{LITERAL}: [[23, 27], [53, 60]]
-
-#map = affine_map<(d0) -> (d0)>
-
-func.func @vector() -> tensor<4xvector<2xi32>> {
-  %c0 = arith.constant 0 : index
-  %c1 = arith.constant 1 : index
-  %lhs = arith.constant dense<[1, 2, 3, 4]> : tensor<4xi32>
-  %rhs = arith.constant dense<[5, 6, 7, 8]> : tensor<4xi32>
-  %init = tensor.empty() : tensor<4xvector<2xi32>>
-  %ret = linalg.generic {
-      indexing_maps = [#map, #map, #map],
-      iterator_types = ["parallel"]
-    }
-    ins(%lhs, %rhs : tensor<4xi32>, tensor<4xi32>)
-    outs(%init : tensor<4xvector<2xi32>>) {
-    ^bb(%a: i32, %b: i32, %c: vector<2xi32>):
-      %d = vector.insertelement %a, %c[%c0 : index] : vector<2xi32>
-      %e = vector.insertelement %b, %d[%c1 : index] : vector<2xi32>
-      linalg.yield %e : vector<2xi32>
-    } -> tensor<4xvector<2xi32>>
-  return %ret : tensor<4xvector<2xi32>>
-}
-
-// CHECK-LABEL: @vector
-// CHECK-NEXT: Results
-// CHECK-NEXT{LITERAL}: TensorOrMemref<4xvector<2xi32>>: [[1, 5], [2, 6], [3, 7], [4, 8]]
-
-func.func @matmul_dynamic() -> tensor<2x2xi32> {
-  %lhs = arith.constant dense<[[1, 2, 3], [4, 5, 6]]> : tensor<2x3xi32>
-  %rhs = arith.constant dense<[[1, 2], [3, 4], [5, 6]]> : tensor<3x2xi32>
-  %lhs_cast = tensor.cast %lhs : tensor<2x3xi32> to tensor<2x?xi32>
-  %rhs_cast = tensor.cast %rhs : tensor<3x2xi32> to tensor<?x2xi32>
-  %init = tensor.empty() : tensor<2x2xi32>
-  %ret = linalg.generic #matmul_trait
-    ins(%lhs_cast, %rhs_cast : tensor<2x?xi32>, tensor<?x2xi32>)
-    outs(%init : tensor<2x2xi32>) {
-    ^bb(%a: i32, %b: i32, %c: i32):
-      %d = arith.muli %a, %b: i32
-      %e = arith.addi %c, %d: i32
-      linalg.yield %e : i32
-    } -> tensor<2x2xi32>
-  return %ret : tensor<2x2xi32>
-}
-
-// CHECK-LABEL: @matmul_dynamic
-// CHECK-NEXT: Results
-// CHECK-NEXT{LITERAL}: [[22, 28], [49, 64]]
-
-func.func @dynamic_generic_w_cst() -> tensor<4x?xf64> {
-  %cst_1 =  arith.constant 123.456 : f64
-  %extracted_slice_ = arith.constant dense<[[1.1, 2.2], [3.3, 4.4], [5.5, 6.6], [7.7, 8.8]]> : tensor<4x2xf64>
-  %extracted_slice = tensor.cast %extracted_slice_ : tensor<4x2xf64> to tensor<4x?xf64>
-  %6 = linalg.generic { indexing_maps = [affine_map<(d0, d1) -> ()>,
-      affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel",
-      "parallel"]} ins(%cst_1 : f64) outs(%extracted_slice : tensor<4x?xf64>) {
-  ^bb0(%in: f64, %out: f64):
-    linalg.yield %in : f64
-  } -> tensor<4x?xf64>
-  return %6 : tensor<4x?xf64>
-}
-
-// CHECK-LABEL: @dynamic_generic_w_cst
-// CHECK-NEXT: Results
-// CHECK-NEXT{LITERAL}: TensorOrMemref<4x2xf64>: [[1.234560e+02, 1.234560e+02], [1.234560e+02, 1.234560e+02], [1.234560e+02, 1.234560e+02], [1.234560e+02, 1.234560e+02]]
diff --git a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/linalg/map.mlir b/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/linalg/map.mlir
deleted file mode 100644
index 4fef179d7989d6..00000000000000
--- a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/linalg/map.mlir
+++ /dev/null
@@ -1,74 +0,0 @@
-// RUN: mlir-interpreter-runner %s -run-all | FileCheck %s
-
-func.func @no_inputs() -> tensor<4xf32> {
-  %init = arith.constant dense<[1.0,2.0,3.0,4.0]> : tensor<4xf32>
-  %zero = linalg.map outs(%init:tensor<4xf32>)() {
-    %0 = arith.constant 0.0: f32
-    linalg.yield %0: f32
-  }
-  func.return %zero : tensor<4xf32>
-}
-
-// CHECK-LABEL: @no_inputs
-// CHECK-NEXT: Results
-// CHECK-NEXT{LITERAL}: [0.000000e+00, 0.000000e+00, 0.000000e+00, 0.000000e+00]
-
-func.func @binary() -> tensor<4xi32> {
-  %init = tensor.empty() : tensor<4xi32>
-  %lhs = arith.constant dense<[0, 1, 2, 3]> : tensor<4xi32>
-  %rhs = arith.constant dense<[10, 20, 30, 40]> : tensor<4xi32>
-  %add = linalg.map ins(%lhs, %rhs: tensor<4xi32>, tensor<4xi32>)
-                    outs(%init: tensor<4xi32>)
-    (%lhs_elem: i32, %rhs_elem: i32) {
-      %0 = arith.addi %lhs_elem, %rhs_elem: i32
-      linalg.yield %0: i32
-    }
-  func.return %add : tensor<4xi32>
-}
-
-// CHECK-LABEL: @binary
-// CHECK-NEXT: Results
-// CHECK-NEXT{LITERAL}: [10, 21, 32, 43]
-
-func.func @memref() -> memref<4xi32> {
-  %alloc = memref.alloc() : memref<4xi32>
-  %lhs = arith.constant dense<[0, 1, 2, 3]> : memref<4xi32>
-  %rhs = arith.constant dense<[10, 20, 30, 40]> : memref<4xi32>
-  linalg.map ins(%lhs, %rhs: memref<4xi32>, memref<4xi32>)
-             outs(%alloc: memref<4xi32>)
-    (%lhs_elem: i32, %rhs_elem: i32) {
-      %0 = arith.muli %lhs_elem, %rhs_elem: i32
-      linalg.yield %0: i32
-    }
-  func.return %alloc : memref<4xi32>
-}
-
-// CHECK-LABEL: @memref
-// CHECK-NEXT: Results
-// CHECK-NEXT{LITERAL}: [0, 20, 60, 120]
-
-func.func @index() -> memref<4xindex> {
-  %alloc = memref.alloc() : memref<4xindex>
-  linalg.map outs(%alloc: memref<4xindex>)() {
-    %0 = linalg.index 0 : index
-    linalg.yield %0: index
-  }
-  func.return %alloc : memref<4xindex>
-}
-
-// CHECK-LABEL: @index
-// CHECK-NEXT: Results
-// CHECK-NEXT{LITERAL}: [0, 1, 2, 3]
-
-func.func @vector() -> memref<4xvector<2xindex>> {
-  %c = arith.constant dense<42> : vector<2xindex>
-  %alloc = memref.alloc() : memref<4xvector<2xindex>>
-  linalg.map outs(%alloc: memref<4xvector<2xindex>>)() {
-    linalg.yield %c: vector<2xindex>
-  }
-  func.return %alloc : memref<4xvector<2xindex>>
-}
-
-// CHECK-LABEL: @vector
-// CHECK-NEXT: Results
-// CHECK-NEXT{LITERAL}: [[42, 42], [42, 42], [42, 42], [42, 42]]
diff --git a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/linalg/matmul.mlir b/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/linalg/matmul.mlir
deleted file mode 100644
index 5bed36c4c68545..00000000000000
--- a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/linalg/matmul.mlir
+++ /dev/null
@@ -1,41 +0,0 @@
-// RUN: mlir-interpreter-runner %s -run-all | FileCheck %s
-
-func.func @m1x1() -> (tensor<1x1xi32>, tensor<1x1xi32>) {
-  %a = arith.constant dense<1> : tensor<1x1xi32>
-  %b = arith.constant dense<2> : tensor<1x1xi32>
-  %c = arith.constant dense<3> : tensor<1x1xi32>
-  %ret = linalg.matmul ins(%a, %b : tensor<1x1xi32>, tensor<1x1xi32>)
-                       outs(%c : tensor<1x1xi32>) -> tensor<1x1xi32>
-  return %c, %ret : tensor<1x1xi32>, tensor<1x1xi32>
-}
-
-// CHECK-LABEL: @m1x1
-// CHECK-NEXT: Results
-// CHECK-NEXT{LITERAL}: [[3]]
-// CHECK-NEXT{LITERAL}: [[5]]
-
-func.func @m2x2() -> tensor<2x2xi32> {
-  %a = arith.constant dense<[[1, 2], [3, 4]]> : tensor<2x2xi32>
-  %b = arith.constant dense<[[4, 5], [6, 7]]> : tensor<2x2xi32>
-  %c = tensor.empty() : tensor<2x2xi32>
-  %ret = linalg.matmul ins(%a, %b : tensor<2x2xi32>, tensor<2x2xi32>)
-                       outs(%c : tensor<2x2xi32>) -> tensor<2x2xi32>
-  return %ret : tensor<2x2xi32>
-}
-
-// CHECK-LABEL: @m2x2
-// CHECK-NEXT: Results
-// CHECK-NEXT{LITERAL}: [[16, 19], [36, 43]]
-
-func.func @m1x1_bufferized() -> memref<1x1xi32> {
-  %a = arith.constant dense<1> : tensor<1x1xi32>
-  %b = arith.constant dense<2> : tensor<1x1xi32>
-  %c = arith.constant dense<3> : memref<1x1xi32>
-  linalg.matmul ins(%a, %b : tensor<1x1xi32>, tensor<1x1xi32>)
-                outs(%c : memref<1x1xi32>)
-  return %c : memref<1x1xi32>
-}
-
-// CHECK-LABEL: @m1x1_bufferized
-// CHECK-NEXT: Results
-// CHECK-NEXT{LITERAL}: [[5]]
diff --git a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/linalg/reduce.mlir b/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/linalg/reduce.mlir
deleted file mode 100644
index c21f9f778db316..00000000000000
--- a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/linalg/reduce.mlir
+++ /dev/null
@@ -1,57 +0,0 @@
-// RUN: mlir-interpreter-runner %s -run-all | FileCheck %s
-
-func.func @reduce() -> tensor<2xi32> {
-  %v = arith.constant dense<[[1,2,3,4], [5,6,7,8]]> : tensor<2x4xi32>
-  %init = arith.constant dense<[9, 10]> : tensor<2xi32>
-  %ret = linalg.reduce ins(%v : tensor<2x4xi32>)
-                       outs(%init: tensor<2xi32>)
-                       dimensions = [1]
-                       (%in: i32, %out: i32) {
-                         %sum = arith.addi %in, %out : i32
-                         linalg.yield %sum: i32
-                       }
-  func.return %ret : tensor<2xi32>
-}
-
-// CHECK-LABEL: @reduce
-// CHECK-NEXT: Results
-// CHECK-NEXT: [19, 36]
-
-func.func @variadic() -> (tensor<2xi32>, tensor<2xf32>) {
-  %v = arith.constant dense<[[1,2,3,4], [5,6,7,8]]> : tensor<2x4xi32>
-  %w = arith.constant dense<[[1.0,2.0,3.0,4.0], [5.0,6.0,7.0,8.0]]> : tensor<2x4xf32>
-  %init = arith.constant dense<[9, 10]> : tensor<2xi32>
-  %init2 = arith.constant dense<[9.0, 10.0]> : tensor<2xf32>
-  %ret, %retf = linalg.reduce ins(%v, %w : tensor<2x4xi32>, tensor<2x4xf32>)
-                              outs(%init, %init2: tensor<2xi32>, tensor<2xf32>)
-                              dimensions = [1]
-    (%in: i32, %inf: f32, %out: i32, %outf: f32) {
-      %sum = arith.addi %in, %out : i32
-      %sumf = arith.addf %inf, %outf : f32
-      linalg.yield %sum, %sumf: i32, f32
-    }
-  func.return %ret, %retf : tensor<2xi32>, tensor<2xf32>
-}
-
-// CHECK-LABEL: @variadic
-// CHECK-NEXT: Results
-// CHECK-NEXT: [19, 36]
-// CHECK-NEXT: [1.900000e+01, 3.600000e+01]
-
-func.func @bufferized() -> memref<2xi32> {
-  %v = arith.constant dense<[[1,2,3,4], [5,6,7,8]]> : tensor<2x4xi32>
-  %init = arith.constant dense<[9, 10]> : memref<2xi32>
-  linalg.reduce ins(%v : tensor<2x4xi32>)
-                outs(%init: memref<2xi32>)
-                dimensions = [1]
-                (%in: i32, %out: i32) {
-                  %sum = arith.addi %in, %out : i32
-                  linalg.yield %sum: i32
-                }
-  func.return %init : memref<2xi32>
-}
-
-// CHECK-LABEL: @bufferized
-// CHECK-NEXT: Results
-// CHECK-NEXT: [19, 36]
-
diff --git a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/linalg/transpose.mlir b/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/linalg/transpose.mlir
deleted file mode 100644
index c9fbb0d4632f35..00000000000000
--- a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/linalg/transpose.mlir
+++ /dev/null
@@ -1,27 +0,0 @@
-// RUN: mlir-interpreter-runner %s -run-all | FileCheck %s
-
-func.func @transpose() -> tensor<2x3xi32> {
-  %a = arith.constant dense<[[1, 2], [3, 4], [5, 6]]> : tensor<3x2xi32>
-  %b = tensor.empty() : tensor<2x3xi32>
-  %ret = linalg.transpose ins(%a : tensor<3x2xi32>)
-                          outs(%b : tensor<2x3xi32>)
-                          permutation = [1, 0]
-  return %ret : tensor<2x3xi32>
-}
-
-// CHECK-LABEL: @transpose
-// CHECK-NEXT: Results
-// CHECK-NEXT{LITERAL}: [[1, 3, 5], [2, 4, 6]]
-
-func.func @transpose_bufferized() -> memref<2x3xi32> {
-  %a = arith.constant dense<[[1, 2], [3, 4], [5, 6]]> : tensor<3x2xi32>
-  %b = memref.alloc() : memref<2x3xi32>
-  linalg.transpose ins(%a : tensor<3x2xi32>)
-                   outs(%b : memref<2x3xi32>)
-                   permutation = [1, 0]
-  return %b : memref<2x3xi32>
-}
-
-// CHECK-LABEL: @transpose_bufferized
-// CHECK-NEXT: Results
-// CHECK-NEXT{LITERAL}: [[1, 3, 5], [2, 4, 6]]
diff --git a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/linalg/vecmat.mlir b/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/linalg/vecmat.mlir
deleted file mode 100644
index 2a4280f1581b2c..00000000000000
--- a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/linalg/vecmat.mlir
+++ /dev/null
@@ -1,14 +0,0 @@
-// RUN: mlir-interpreter-runner %s -run-all | FileCheck %s
-
-func.func @vecmat() -> tensor<2xi32> {
-  %lhs = arith.constant dense<[4, 5]> : tensor<2xi32>
-  %rhs = arith.constant dense<[[1, 2], [3, 4]]> : tensor<2x2xi32>
-  %init = tensor.empty() : tensor<2xi32>
-  %ret = linalg.vecmat ins(%lhs, %rhs: tensor<2xi32>, tensor<2x2xi32>)
-                       outs(%init: tensor<2xi32>) -> tensor<2xi32>
-  return %ret : tensor<2xi32>
-}
-
-// CHECK-LABEL: @vecmat
-// CHECK-NEXT: Results
-// CHECK-NEXT{LITERAL}: [19, 28]
diff --git a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/math/math.mlir b/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/math/math.mlir
deleted file mode 100644
index 3e99a449c8cef6..00000000000000
--- a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/math/math.mlir
+++ /dev/null
@@ -1,252 +0,0 @@
-// RUN: mlir-interpreter-runner %s -run-all | FileCheck %s
-
-func.func @absi() -> i32 {
-  %a = arith.constant -1 : i32
-  %ret = math.absi %a : i32
-  return %ret : i32
-}
-
-// CHECK-LABEL: @absi
-// CHECK-NEXT: Results
-// CHECK-NEXT: 1
-
-func.func @absf() -> (f32, f32) {
-  %a = arith.constant 1.0 : f32
-  %b = arith.constant -1.0 : f32
-  %c = math.absf %a : f32
-  %d = math.absf %b : f32
-  return %c, %d : f32, f32
-}
-
-// CHECK-LABEL: @absf
-// CHECK-NEXT: Results
-// CHECK-NEXT: 1.000000e+00
-// CHECK-NEXT: 1.000000e+00
-
-func.func @atan() -> f32 {
-  %c1 = arith.constant 1.0 : f32
-  %ret = math.atan %c1 : f32
-  return %ret : f32
-}
-
-// CHECK-LABEL: @atan
-// CHECK-NEXT: Results
-// CHECK-NEXT: 7.853982e-01
-
-func.func @atan2() -> f32 {
-  %c10 = arith.constant 10.0 : f32
-  %c1 = arith.constant 1.0 : f32
-  %ret = math.atan2 %c10, %c1 : f32
-  return %ret : f32
-}
-
-// CHECK-LABEL: @atan2
-// CHECK-NEXT: Results
-// CHECK-NEXT: 1.471128e+00
-
-func.func @cbrt() -> f32 {
-  %c-27 = arith.constant -27.0 : f32
-  %ret = math.cbrt %c-27 : f32
-  return %ret : f32
-}
-
-// CHECK-LABEL: @cbrt
-// CHECK-NEXT: Results
-// CHECK-NEXT: -3.000000e+00
-
-func.func @ceil() -> f32 {
-  %a = arith.constant 1.234 : f32
-  %ret = math.ceil %a : f32
-  return %ret : f32
-}
-
-// CHECK-LABEL: @ceil
-// CHECK-NEXT: Results
-// CHECK-NEXT: 2.000000e+00
-
-func.func @ctlz() -> i8 {
-  %c1 = arith.constant 1 : i8
-  %ret = math.ctlz %c1 : i8
-  return %ret : i8
-}
-
-// CHECK-LABEL: @ctlz
-// CHECK-NEXT: Results
-// CHECK-NEXT: 7
-
-func.func @ctpop() -> i8 {
-  %c1 = arith.constant 17 : i8
-  %ret = math.ctpop %c1 : i8
-  return %ret : i8
-}
-
-// CHECK-LABEL: @ctpop
-// CHECK-NEXT: Results
-// CHECK-NEXT: i8: 2
-
-func.func @ctz() -> i8 {
-  %c1 = arith.constant 8 : i8
-  %ret = math.cttz %c1 : i8
-  return %ret : i8
-}
-
-// CHECK-LABEL: @ctz
-// CHECK-NEXT: Results
-// CHECK-NEXT: 3
-
-func.func @copysign() -> (f32, f32) {
-  %a = arith.constant 1.5 : f32
-  %b = arith.constant -10.0 : f32
-  %c = math.copysign %a, %b : f32
-  %d = math.copysign %a, %a : f32
-  return %c, %d : f32, f32
-}
-
-// CHECK-LABEL: @copysign
-// CHECK-NEXT: Results
-// CHECK-NEXT: -1.500000e+00
-// CHECK-NEXT: 1.500000e+00
-
-func.func @cos() -> f32 {
-  %a = arith.constant 0.0 : f32
-  %b = math.cos %a : f32
-  return %b : f32
-}
-
-// CHECK-LABEL: @cos
-// CHECK-NEXT: Results
-// CHECK-NEXT: 1.000000e+00
-
-func.func @erf() -> f32 {
-  %a = arith.constant 1.0 : f32
-  %b = math.erf %a : f32
-  return %b : f32
-}
-
-// CHECK-LABEL: @erf
-// CHECK-NEXT: Results
-// CHECK-NEXT: 8.427008e-01
-
-func.func @exp() -> f32 {
-  %a = arith.constant 1.0 : f32
-  %b = math.exp %a : f32
-  return %b : f32
-}
-
-// CHECK-LABEL: @exp
-// CHECK-NEXT: Results
-// CHECK-NEXT: 2.718282e+00
-
-func.func @exp2() -> f32 {
-  %a = arith.constant 3.0 : f32
-  %b = math.exp2 %a : f32
-  return %b : f32
-}
-
-// CHECK-LABEL: @exp2
-// CHECK-NEXT: Results
-// CHECK-NEXT: 8.000000e+00
-
-func.func @expm1() -> f32 {
-  %a = arith.constant 1.0e-06 : f32
-  %b = math.expm1 %a : f32
-  return %b : f32
-}
-
-// CHECK-LABEL: @expm1
-// CHECK-NEXT: Results
-// CHECK-NEXT: 1.000000e-06
-
-func.func @floor() -> (f32, f32) {
-  %a = arith.constant 1.5 : f32
-  %b = arith.constant -10.5 : f32
-  %c = math.floor %a : f32
-  %d = math.floor %b : f32
-  return %c, %d : f32, f32
-}
-
-// CHECK-LABEL: @floor
-// CHECK-NEXT: Results
-// CHECK-NEXT: 1.000000e+00
-// CHECK-NEXT: -1.100000e+01
-
-func.func @ipowi() -> i32 {
-  %a = arith.constant 2 : i32
-  %b = arith.constant 3 : i32
-  %c = math.ipowi %a, %b : i32
-  return %c : i32
-}
-
-// CHECK-LABEL: @ipowi
-// CHECK-NEXT: Results
-// CHECK-NEXT: i32: 8
-
-func.func @log() -> f32 {
-  %a = arith.constant 1.0 : f32
-  %ret = math.log %a : f32
-  return %ret : f32
-}
-
-// CHECK-LABEL: @log
-// CHECK-NEXT: Results
-// CHECK-NEXT: 0.000000e+00
-
-func.func @log10() -> f32 {
-  %a = arith.constant 1.0e12 : f32
-  %ret = math.log10 %a : f32
-  return %ret : f32
-}
-
-// CHECK-LABEL: @log
-// CHECK-NEXT: Results
-// CHECK-NEXT: 1.200000e+01
-
-func.func @log1p() -> f32 {
-  %a = arith.constant 1.0e-10 : f32
-  %ret = math.log1p %a : f32
-  return %ret : f32
-}
-
-// CHECK-LABEL: @log1p
-// CHECK-NEXT: Results
-// CHECK-NEXT: 1.000000e-10
-
-func.func @log2() -> f32 {
-  %a = arith.constant 65536.0 : f32
-  %ret = math.log2 %a : f32
-  return %ret : f32
-}
-
-// CHECK-LABEL: @log2
-// CHECK-NEXT: Results
-// CHECK-NEXT: 1.600000e+01
-
-func.func @powf() -> f32 {
-  %a = arith.constant 2.0 : f32
-  %b = arith.constant 3.0 : f32
-  %c = math.powf %a, %b : f32
-  return %c : f32
-}
-
-// CHECK-LABEL: @powf
-// CHECK-NEXT: Results
-// CHECK-NEXT: 8.000000e+00
-
-//REGISTER_MLIR_INTERPRETER_OP("math.round", applyCwiseMap<Round>);
-//REGISTER_MLIR_INTERPRETER_OP("math.roundeven",  applyCwiseMap<NearbyInt>);
-//REGISTER_MLIR_INTERPRETER_OP("math.rsqrt", applyCwiseMap<RSqrt>);
-
-func.func @sin() -> f32 {
-  %a = arith.constant 0.0 : f32
-  %b = math.sin %a : f32
-  return %b : f32
-}
-
-// CHECK-LABEL: @sin
-// CHECK-NEXT: Results
-// CHECK-NEXT: 0.000000e+00
-
-//REGISTER_MLIR_INTERPRETER_OP("math.sqrt", applyCwiseMap<Sqrt>);
-//REGISTER_MLIR_INTERPRETER_OP("math.tan", applyCwiseMap<Tan>);
-//REGISTER_MLIR_INTERPRETER_OP("math.tanh", applyCwiseMap<TanH>);
-//REGISTER_MLIR_INTERPRETER_OP("math.trunc", applyCwiseMap<Trunc>);
diff --git a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/memref/alloc.mlir b/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/memref/alloc.mlir
deleted file mode 100644
index d6026b5b913a1b..00000000000000
--- a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/memref/alloc.mlir
+++ /dev/null
@@ -1,57 +0,0 @@
-// RUN: mlir-interpreter-runner %s -run-all | FileCheck %s
-
-func.func @alloc() -> memref<2x3xi32> {
-  %ret = memref.alloc() : memref<2x3xi32>
-  return %ret : memref<2x3xi32>
-}
-
-// CHECK-LABEL: @alloc
-// CHECK-NEXT: Results
-// CHECK-NEXT{LITERAL}: <2x3xi32>: [[0, 0, 0], [0, 0, 0]]
-
-func.func @alloc_unit() -> memref<i32> {
-  %ret = memref.alloc() : memref<i32>
-  return %ret : memref<i32>
-}
-
-// CHECK-LABEL: @alloc_unit
-// CHECK-NEXT: Results
-// CHECK-NEXT{LITERAL}: <i32>: 0
-
-func.func @alloc_unit_vector() -> memref<vector<i32>> {
-  %ret = memref.alloc() : memref<vector<i32>>
-  return %ret : memref<vector<i32>>
-}
-
-// CHECK-LABEL: @alloc_unit_vector
-// CHECK-NEXT: Results
-// CHECK-NEXT{LITERAL}: <vector<i32>>: 0
-
-func.func @alloc_vector() -> memref<2xvector<3xi32>> {
-  %ret = memref.alloc() : memref<2xvector<3xi32>>
-  return %ret : memref<2xvector<3xi32>>
-}
-
-// CHECK-LABEL: @alloc_vector
-// CHECK-NEXT: Results
-// CHECK-NEXT{LITERAL}: <2xvector<3xi32>>: [[0, 0, 0], [0, 0, 0]]
-
-func.func @alloc_dynamic() -> memref<?x3xi32> {
-  %c2 = arith.constant 2 : index
-  %ret = memref.alloc(%c2) : memref<?x3xi32>
-  return %ret : memref<?x3xi32>
-}
-
-// CHECK-LABEL: @alloc_dynamic
-// CHECK-NEXT: Results
-// CHECK-NEXT{LITERAL}: <2x3xi32>: [[0, 0, 0], [0, 0, 0]]
-
-func.func @dealloc() -> memref<i32> {
-  %a = memref.alloc() : memref<i32>
-  memref.dealloc %a : memref<i32>
-  return %a : memref<i32>
-}
-
-// CHECK-LABEL: @dealloc
-// CHECK-NEXT: Results
-// CHECK-NEXT: TensorOrMemref<i32>: <<deallocated>>
diff --git a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/memref/collapse_shape.mlir b/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/memref/collapse_shape.mlir
deleted file mode 100644
index 027b59755bed8c..00000000000000
--- a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/memref/collapse_shape.mlir
+++ /dev/null
@@ -1,33 +0,0 @@
-// RUN: mlir-interpreter-runner %s -run-all | FileCheck %s
-
-func.func @collapse_shape()
-    -> (memref<1x2x3xi32>, memref<1x6xi32>, memref<2x3xi32>, memref<6xi32>) {
-  %cst = arith.constant dense<[[[1, 2, 3], [4, 5, 6]]]> : memref<1x2x3xi32>
-  %collapse1 = memref.collapse_shape %cst [[0], [1, 2]]
-      : memref<1x2x3xi32> into memref<1x6xi32>
-  %collapse2 = memref.collapse_shape %cst [[0, 1], [2]]
-      : memref<1x2x3xi32> into memref<2x3xi32>
-  %collapse3 = memref.collapse_shape %cst [[0, 1, 2]]
-      : memref<1x2x3xi32> into memref<6xi32>
-  return %cst, %collapse1, %collapse2, %collapse3
-      : memref<1x2x3xi32>, memref<1x6xi32>, memref<2x3xi32>, memref<6xi32>
-}
-
-// CHECK-LABEL: @collapse_shape
-// CHECK-NEXT: Results
-// CHECK-NEXT{LITERAL}: [[[1, 2, 3], [4, 5, 6]]]
-// CHECK-NEXT{LITERAL}: [[1, 2, 3, 4, 5, 6]]
-// CHECK-NEXT{LITERAL}: [[1, 2, 3], [4, 5, 6]]
-// CHECK-NEXT{LITERAL}: [1, 2, 3, 4, 5, 6]
-
-func.func @zero_dim()
-    -> (memref<6x0xi32>) {
-  %cst = arith.constant dense<> : memref<1x2x3x0xi32>
-  %collapse = memref.collapse_shape %cst [[0, 1, 2], [3]]
-      : memref<1x2x3x0xi32> into memref<6x0xi32>
-  return %collapse : memref<6x0xi32>
-}
-
-// CHECK-LABEL: @zero_dim
-// CHECK-NEXT: Results
-// CHECK-NEXT: TensorOrMemref<6x0xi32>
diff --git a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/memref/copy.mlir b/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/memref/copy.mlir
deleted file mode 100644
index 09815f5e8cf61a..00000000000000
--- a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/memref/copy.mlir
+++ /dev/null
@@ -1,39 +0,0 @@
-// RUN: mlir-interpreter-runner %s -run-all | FileCheck %s
-
-func.func @subview() -> memref<4x4xi32, strided<[4, 1], offset: 0>> {
-  %c1 = arith.constant 1 : index
-  %c2 = arith.constant 2 : index
-
-  %0 = memref.alloc() : memref<4x4xi32, strided<[4, 1], offset: 0>>
-  %1 = memref.subview %0[%c1, %c1][%c2, %c2][%c1, %c1]
-    : memref<4x4xi32, strided<[4, 1], offset: 0>> to
-      memref<?x?xi32, strided<[?, ?], offset: ?>>
-
-  %cst = arith.constant dense<[[1, 2], [3, 4]]> : memref<2x2xi32>
-  memref.copy %cst, %1 : memref<2x2xi32> to memref<?x?xi32, strided<[?, ?], offset: ?>>
-
-  return %0 : memref<4x4xi32, strided<[4, 1], offset: 0>>
-}
-
-// CHECK-LABEL: @subview
-// CHECK-NEXT: Results
-// CHECK-NEXT{LITERAL}: [[0, 0, 0, 0], [0, 1, 2, 0], [0, 3, 4, 0], [0, 0, 0, 0]]
-
-func.func @strided() -> memref<4x4xi32> {
-  %c1 = arith.constant 1 : index
-  %c2 = arith.constant 2 : index
-
-  %0 = memref.alloc() : memref<4x4xi32>
-  %1 = memref.subview %0[%c1, %c1][%c2, %c2][%c2, %c2]
-    : memref<4x4xi32> to memref<?x?xi32, strided<[?, ?], offset: ?>>
-
-  %cst = arith.constant dense<[[1, 2], [3, 4]]> : memref<2x2xi32>
-  memref.copy %cst, %1
-    : memref<2x2xi32> to memref<?x?xi32, strided<[?, ?], offset: ?>>
-
-  return %0 : memref<4x4xi32>
-}
-
-// CHECK-LABEL: @strided
-// CHECK-NEXT: Results
-// CHECK-NEXT{LITERAL}: [[0, 0, 0, 0], [0, 1, 0, 2], [0, 0, 0, 0], [0, 3, 0, 4]]
diff --git a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/memref/dim.mlir b/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/memref/dim.mlir
deleted file mode 100644
index 2740b7df65de6d..00000000000000
--- a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/memref/dim.mlir
+++ /dev/null
@@ -1,12 +0,0 @@
-// RUN: mlir-interpreter-runner %s -run-all | FileCheck %s
-
-func.func @dim() -> index {
-  %alloc = memref.alloc() {alignment = 64 : i64} : memref<10x50xf32>
-  %c1 = arith.constant 1 : index
-  %dim = memref.dim %alloc, %c1 : memref<10x50xf32>
-  return %dim : index
-}
-
-// CHECK-LABEL: @dim
-// CHECK-NEXT: Results
-// CHECK-NEXT: i64: 50
diff --git a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/memref/expand_shape.mlir b/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/memref/expand_shape.mlir
deleted file mode 100644
index b9d05ebad24c93..00000000000000
--- a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/memref/expand_shape.mlir
+++ /dev/null
@@ -1,52 +0,0 @@
-// RUN: mlir-interpreter-runner %s -run-all | FileCheck %s
-
-func.func @expand_shape()
-    -> (memref<1x2x3xi32>, memref<1x2x3xi32>, memref<2x1x3xi32>,
-        memref<1x2x3xi32>) {
-  %cst1 = arith.constant dense<[[1, 2, 3, 4, 5, 6]]> : memref<1x6xi32>
-  %cst2 = arith.constant dense<[[1, 2, 3], [4, 5, 6]]> : memref<2x3xi32>
-  %cst3 = arith.constant dense<[1, 2, 3, 4, 5, 6]> : memref<6xi32>
-  %expand1 = memref.expand_shape %cst1 [[0], [1, 2]]
-      : memref<1x6xi32> into memref<1x2x3xi32>
-  %expand2 = memref.expand_shape %cst2 [[0, 1], [2]]
-      : memref<2x3xi32> into memref<1x2x3xi32>
-  %expand3 = memref.expand_shape %cst2 [[0, 1], [2]]
-      : memref<2x3xi32> into memref<2x1x3xi32>
-  %expand4 = memref.expand_shape %cst3 [[0, 1, 2]]
-      : memref<6xi32> into memref<1x2x3xi32>
-  return %expand1, %expand2, %expand3, %expand4
-      : memref<1x2x3xi32>, memref<1x2x3xi32>, memref<2x1x3xi32>,
-        memref<1x2x3xi32>
-}
-
-// CHECK-LABEL: @expand_shape
-// CHECK-NEXT: Results
-// CHECK-NEXT{LITERAL}: [[[1, 2, 3], [4, 5, 6]]]
-// CHECK-NEXT{LITERAL}: [[[1, 2, 3], [4, 5, 6]]]
-// CHECK-NEXT{LITERAL}: [[[1, 2, 3]], [[4, 5, 6]]]
-// CHECK-NEXT{LITERAL}: [[[1, 2, 3], [4, 5, 6]]]
-
-func.func @zero_rank()
-    -> (memref<1x1xi32>) {
-  %cst = arith.constant dense<1> : memref<i32>
-  %expand = memref.expand_shape %cst []
-      : memref<i32> into memref<1x1xi32>
-  return %expand : memref<1x1xi32>
-}
-
-// CHECK-LABEL: @zero_rank
-// CHECK-NEXT: Results
-// CHECK-NEXT{LITERAL}: [[1]]
-
-func.func @split_dim() -> memref<3x2x2xi32, strided<[4, 2, 1], offset: 0>> {
-  %cst = arith.constant dense<[[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12]]>
-    : memref<3x4xi32, strided<[4, 1], offset: 0>>
-  %ret = memref.expand_shape %cst [[0], [1, 2]]
-    : memref<3x4xi32, strided<[4, 1], offset: 0>> into
-      memref<3x2x2xi32, strided<[4, 2, 1], offset: 0>>
-  return %ret : memref<3x2x2xi32, strided<[4, 2, 1], offset: 0>>
-}
-
-// CHECK-LABEL: @split_dim
-// CHECK-NEXT: Results
-// CHECK-NEXT{LITERAL}: [[[1, 2], [3, 4]], [[5, 6], [7, 8]], [[9, 10], [11, 12]]]
diff --git a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/memref/get_global.mlir b/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/memref/get_global.mlir
deleted file mode 100644
index a8b5de8930cc83..00000000000000
--- a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/memref/get_global.mlir
+++ /dev/null
@@ -1,12 +0,0 @@
-// RUN: mlir-interpreter-runner %s -run-all | FileCheck %s
-
-memref.global "private" constant @cst : memref<2xi16> = dense<[1, 2]>
-
-func.func @get_global() -> memref<2xi16> {
-  %0 = memref.get_global @cst : memref<2xi16>
-  return %0 : memref<2xi16>
-}
-
-// CHECK-LABEL: @get_global
-// CHECK-NEXT: Results
-// CHECK-NEXT{LITERAL}: [1, 2]
diff --git a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/memref/invalid.mlir b/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/memref/invalid.mlir
deleted file mode 100644
index a533a9664c235d..00000000000000
--- a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/memref/invalid.mlir
+++ /dev/null
@@ -1,77 +0,0 @@
-// RUN: (! mlir-interpreter-runner %s -run-all 2>&1) | FileCheck %s
-
-func.func @out_of_bounds_load() -> i32 {
-  %c3 = arith.constant 3 : index
-  %cst = arith.constant dense<[1, 2]> : memref<2xi32>
-  %ret = memref.load %cst[%c3] : memref<2xi32>
-  return %ret : i32
-}
-
-// CHECK-LABEL: @out_of_bounds_load
-// CHECK-NEXT: array index out of bounds
-
-func.func @out_of_bounds_store() {
-  %c3 = arith.constant 3 : index
-  %v = arith.constant 32 : i32
-  %cst = arith.constant dense<[1, 2]> : memref<2xi32>
-  memref.store %v, %cst[%c3] : memref<2xi32>
-  return
-}
-
-// CHECK-LABEL: @out_of_bounds_store
-// CHECK-NEXT: array index out of bounds
-
-func.func @out_of_bounds_subview() -> memref<?xi32, strided<[?], offset: ?>> {
-  %c1 = arith.constant 1 : index
-  %c2 = arith.constant 2 : index
-  %v = arith.constant 32 : i32
-  %cst = arith.constant dense<[1, 2]> : memref<2xi32>
-  %subview = memref.subview %cst[%c1][%c2][%c1]
-    : memref<2xi32> to memref<?xi32, strided<[?], offset: ?>>
-  return %subview : memref<?xi32, strided<[?], offset: ?>>
-}
-
-// CHECK-LABEL: @out_of_bounds_subview
-// CHECK-NEXT: subview out of bounds
-
-func.func @collapse_shape_no_common_stride()
-    -> (memref<1x2x3xi32>, memref<1x6xi32>) {
-  %a = arith.constant dense<[[[[0, 1, 2], [3, 4, 5]],
-                              [[6, 7, 8], [9, 10,11]]]]>
-    : memref<1x2x2x3xi32>
-  %b = memref.subview %a[0, 0, 0, 0][1, 2, 1, 3][1, 1, 1, 1]
-    : memref<1x2x2x3xi32> to memref<1x2x3xi32>
-  %c = memref.collapse_shape %b [[0], [1, 2]]
-    : memref<1x2x3xi32> into memref<1x6xi32>
-  return %b, %c : memref<1x2x3xi32>, memref<1x6xi32>
-}
-
-// CHECK-LABEL: @collapse_shape_no_common_stride
-// CHECK-NEXT: cannot collapse dimensions without a common stride
-
-func.func @double_free() {
-  %a = memref.alloc() : memref<i32>
-  memref.dealloc %a : memref<i32>
-  memref.dealloc %a : memref<i32>
-  return
-}
-
-// CHECK-LABEL: @double_free
-// CHECK-NEXT: Interpreter failure: double-free
-// CHECK-NEXT: Note: allocated by %alloc = memref.alloc() : memref<i32>
-// CHECK-NEXT: Note: previously freed by memref.dealloc %alloc : memref<i32>
-// CHECK-NEXT{2}: Encountered failure while executing memref.dealloc %alloc : memref<i32>
-
-func.func @use_after_free() {
-  %a = memref.alloc() : memref<i32>
-  memref.dealloc %a : memref<i32>
-  %b = arith.constant 1 : i32
-  memref.store %b, %a[] : memref<i32>
-  return
-}
-
-// CHECK-LABEL: @use_after_free
-// CHECK-NEXT: Interpreter failure: use-after-free
-// CHECK-NEXT: Note: allocated by %alloc = memref.alloc() : memref<i32>
-// CHECK-NEXT: Note: previously freed by memref.dealloc %alloc : memref<i32>
-// CHECK-NEXT{2}: Encountered failure while executing memref.store %c1_i32, %alloc[] : memref<i32>
diff --git a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/memref/load.mlir b/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/memref/load.mlir
deleted file mode 100644
index 17d935319622e1..00000000000000
--- a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/memref/load.mlir
+++ /dev/null
@@ -1,12 +0,0 @@
-// RUN: mlir-interpreter-runner %s -run-all | FileCheck %s
-
-func.func @load() -> i32 {
-  %c1 = arith.constant 1 : index
-  %cst = arith.constant dense<[[1, 2], [3, 4]]> : memref<2x2xi32>
-  %ret = memref.load %cst[%c1, %c1] : memref<2x2xi32>
-  return %ret : i32
-}
-
-// CHECK-LABEL: @load
-// CHECK-NEXT: Results
-// CHECK-NEXT: 4
diff --git a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/memref/subview.mlir b/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/memref/subview.mlir
deleted file mode 100644
index 65a9aba38d8e2a..00000000000000
--- a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/memref/subview.mlir
+++ /dev/null
@@ -1,131 +0,0 @@
-// RUN: mlir-interpreter-runner %s -run-all | FileCheck %s
-
-func.func @subview() -> (memref<4x4xi32, strided<[4, 1], offset: 0>>,
-                         memref<?x?xi32, strided<[?, ?], offset: ?>>) {
-  %c0 = arith.constant 0 : index
-  %c1 = arith.constant 1 : index
-  %c2 = arith.constant 2 : index
-
-  %0 = memref.alloc() : memref<4x4xi32, strided<[4, 1], offset: 0>>
-  %1 = memref.subview %0[%c1, %c1][%c2, %c2][%c1, %c1]
-    : memref<4x4xi32, strided<[4, 1], offset: 0>> to
-      memref<?x?xi32, strided<[?, ?], offset: ?>>
-
-  %i1 = arith.constant 1 : i32
-  %i2 = arith.constant 2 : i32
-  %i3 = arith.constant 3 : i32
-  %i4 = arith.constant 4 : i32
-
-  memref.store %i1, %1[%c0, %c0] : memref<?x?xi32, strided<[?, ?], offset: ?>>
-  memref.store %i2, %1[%c0, %c1] : memref<?x?xi32, strided<[?, ?], offset: ?>>
-  memref.store %i3, %1[%c1, %c0] : memref<?x?xi32, strided<[?, ?], offset: ?>>
-  memref.store %i4, %1[%c1, %c1] : memref<?x?xi32, strided<[?, ?], offset: ?>>
-
-  return %0, %1 : memref<4x4xi32, strided<[4, 1], offset: 0>>,
-                  memref<?x?xi32, strided<[?, ?], offset: ?>>
-}
-
-// CHECK-LABEL: @subview
-// CHECK-NEXT: Results
-// CHECK-NEXT{LITERAL}: [[0, 0, 0, 0], [0, 1, 2, 0], [0, 3, 4, 0], [0, 0, 0, 0]]
-// CHECK-NEXT{LITERAL}: [[1, 2], [3, 4]]
-
-func.func @strided() -> memref<?x?xi32, strided<[?, ?], offset: ?>> {
-  %0 = arith.constant dense<[[1, 2, 3, 4, 5], [6, 7, 8, 9, 10]]> : memref<2x5xi32>
-
-  %c0 = arith.constant 0 : index
-  %c1 = arith.constant 1 : index
-  %c2 = arith.constant 2 : index
-
-  %1 = memref.subview %0[%c0, %c1][%c2, %c2][%c1, %c2]
-    : memref<2x5xi32> to
-      memref<?x?xi32, strided<[?, ?], offset: ?>>
-
-  return %1 : memref<?x?xi32, strided<[?, ?], offset: ?>>
-}
-
-// CHECK-LABEL: @strided
-// CHECK-NEXT: Results
-// CHECK-NEXT{LITERAL}: [[2, 4], [7, 9]]
-
-func.func @subview_of_subview() -> (memref<?xi32, strided<[?], offset: ?>>,
-                                     memref<?xi32, strided<[?], offset: ?>>) {
-  %0 = arith.constant dense<[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]> : memref<10xi32>
-
-  %c0 = arith.constant 0 : index
-  %c1 = arith.constant 1 : index
-  %c2 = arith.constant 2 : index
-  %c4 = arith.constant 4 : index
-
-  %1 = memref.subview %0[%c1][%c4][%c2]
-    : memref<10xi32> to memref<?xi32, strided<[?], offset: ?>>
-  %2 = memref.subview %1[%c1][%c2][%c2]
-    : memref<?xi32, strided<[?], offset: ?>> to
-      memref<?xi32, strided<[?], offset: ?>>
-
-  return %1, %2 : memref<?xi32, strided<[?], offset: ?>>,
-                  memref<?xi32, strided<[?], offset: ?>>
-}
-
-// CHECK-LABEL: @subview_of_subview
-// CHECK-NEXT: Results
-// CHECK-NEXT{LITERAL}: [2, 4, 6, 8]
-// CHECK-NEXT{LITERAL}: [4, 8]
-
-func.func @negative_stride() -> memref<?xi32, strided<[?], offset: ?>> {
-  %0 = arith.constant dense<[1, 2, 3, 4]> : memref<4xi32>
-  %c-1 = arith.constant -1 : index
-  %c3 = arith.constant 3 : index
-  %c4 = arith.constant 4 : index
-  %1 = memref.subview %0[%c3][%c4][%c-1]
-    : memref<4xi32> to memref<?xi32, strided<[?], offset: ?>>
-  return %1 : memref<?xi32, strided<[?], offset: ?>>
-}
-
-// CHECK-LABEL: @negative_stride
-// CHECK-NEXT: Results
-// CHECK-NEXT: [4, 3, 2, 1]
-
-func.func @negative_stride_of_subview() -> (memref<?xi32, strided<[?], offset: ?>>, memref<?xi32, strided<[?], offset: ?>>) {
-  %0 = arith.constant dense<[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]> : memref<10xi32>
-  %c-2 = arith.constant -2 : index
-  %c1 = arith.constant 1 : index
-  %c2 = arith.constant 2 : index
-  %c3 = arith.constant 3 : index
-  %c4 = arith.constant 4 : index
-  %1 = memref.subview %0[%c1][%c4][%c2]
-    : memref<10xi32> to memref<?xi32, strided<[?], offset: ?>>
-  %2 = memref.subview %1[%c3][%c2][%c-2]
-    : memref<?xi32, strided<[?], offset: ?>> to
-      memref<?xi32, strided<[?], offset: ?>>
-  return %1, %2 : memref<?xi32, strided<[?], offset: ?>>, memref<?xi32, strided<[?], offset: ?>>
-}
-
-// CHECK-LABEL: @negative_stride_of_subview
-// CHECK-NEXT: Results
-// CHECK-NEXT{LITERAL}: [2, 4, 6, 8]
-// CHECK-NEXT{LITERAL}: [8, 4]
-
-func.func @rank_reduce() -> memref<2xi32> {
-  %a = arith.constant dense<[[1, 2, 3], [4, 5, 6]]> : memref<2x3xi32>
-  %b = memref.subview %a[0, 0][2, 1][1, 1]
-    : memref<2x3xi32> to memref<2xi32>
-  return %b : memref<2xi32>
-}
-
-// CHECK-LABEL: @rank_reduce
-// CHECK-NEXT: Results
-// CHECK-NEXT: [1, 4]
-
-func.func @rank_reduce_middle() -> memref<1x2x3xi32> {
-  %a = arith.constant dense<[[[[0, 1, 2], [3, 4, 5]],
-                              [[6, 7, 8], [9, 10,11]]]]>
-    : memref<1x2x2x3xi32>
-  %b = memref.subview %a[0, 0, 0, 0][1, 2, 1, 3][1, 1, 1, 1]
-    : memref<1x2x2x3xi32> to memref<1x2x3xi32>
-  return %b : memref<1x2x3xi32>
-}
-
-// CHECK-LABEL: @rank_reduce_middle
-// CHECK-NEXT: Results
-// CHECK-NEXT{LITERAL}: [[[0, 1, 2], [6, 7, 8]]]
diff --git a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/mhlo/bitcast_convert.mlir b/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/mhlo/bitcast_convert.mlir
deleted file mode 100644
index f628ebadd21abc..00000000000000
--- a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/mhlo/bitcast_convert.mlir
+++ /dev/null
@@ -1,11 +0,0 @@
-// RUN: mlir-interpreter-runner %s -run-all | FileCheck %s
-
-func.func @bitcast_convert() -> tensor<3xf32> {
-  %c = arith.constant dense<[10000000,20000000,30000000]> : tensor<3xi32>
-  %ret = mhlo.bitcast_convert %c : (tensor<3xi32>) -> tensor<3xf32>
-  return %ret : tensor<3xf32>
-}
-
-// CHECK-LABEL: @bitcast_convert
-// CHECK-NEXT: Results
-// CHECK-NEXT: [1.401298e-38, 3.254205e-38, 7.411627e-38]
diff --git a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/mhlo/broadcast_in_dim.mlir b/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/mhlo/broadcast_in_dim.mlir
deleted file mode 100644
index 6f834dfb2fecea..00000000000000
--- a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/mhlo/broadcast_in_dim.mlir
+++ /dev/null
@@ -1,20 +0,0 @@
-// RUN: mlir-interpreter-runner %s -run-all | FileCheck %s
-
-func.func @broadcast() -> tensor<2x3xui16> {
-  %0 = mhlo.constant dense<[1, 2]> : tensor<2xui16>
-  %1 = "mhlo.broadcast_in_dim"(%0) {
-    broadcast_dimensions = dense<0> : tensor<1xi64>} : (tensor<2xui16>) -> tensor<2x3xui16>
-  return %1 : tensor<2x3xui16>
-}
-
-// CHECK{LITERAL}: [[1, 1, 1], [2, 2, 2]]
-
-func.func @zero_rank() -> tensor<1x2x3xi32> {
-  %in = mhlo.constant dense<1> : tensor<i32>
-  %0 = "mhlo.broadcast_in_dim"(%in) {
-    broadcast_dimensions = dense<[]> : tensor<0xi64>
-  } : (tensor<i32>) -> tensor<1x2x3xi32>
-  func.return %0 : tensor<1x2x3xi32>
-}
-
-// CHECK{LITERAL}: [[[1, 1, 1], [1, 1, 1]]]
diff --git a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/mhlo/case.mlir b/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/mhlo/case.mlir
deleted file mode 100644
index 17d7f8fecd3641..00000000000000
--- a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/mhlo/case.mlir
+++ /dev/null
@@ -1,17 +0,0 @@
-// RUN: mlir-interpreter-runner %s -run-all | FileCheck %s
-
-func.func @case() -> tensor<i32> {
-  %c1 = mhlo.constant dense<1> : tensor<i32>
-  %c2 = mhlo.constant dense<2> : tensor<i32>
-  %c3 = mhlo.constant dense<3> : tensor<i32>
-  %ret = "mhlo.case"(%c1) ({
-    "mhlo.return"(%c2) : (tensor<i32>) -> ()
-  }, {
-    "mhlo.return"(%c3) : (tensor<i32>) -> ()
-  }) : (tensor<i32>) -> tensor<i32>
-  func.return %ret : tensor<i32>
-}
-
-// CHECK-LABEL: @case
-// CHECK-NEXT: Results
-// CHECK-NEXT: <i32>: 3
diff --git a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/mhlo/clamp.mlir b/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/mhlo/clamp.mlir
deleted file mode 100644
index f7a455a78e2293..00000000000000
--- a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/mhlo/clamp.mlir
+++ /dev/null
@@ -1,27 +0,0 @@
-// RUN: mlir-interpreter-runner %s -run-all | FileCheck %s
-
-func.func @clamp() -> tensor<2x2xi32> {
-  %lb = mhlo.constant dense<[[1, 7], [1, 7]]> : tensor<2x2xi32>
-  %arg = mhlo.constant dense<[[4, 5], [6, 9]]> : tensor<2x2xi32>
-  %ub = mhlo.constant dense<[[5, 9], [3, 6]]> : tensor<2x2xi32>
-  %clamp = "mhlo.clamp"(%lb, %arg, %ub)
-      : (tensor<2x2xi32>, tensor<2x2xi32>, tensor<2x2xi32>) -> tensor<2x2xi32>
-  return %clamp : tensor<2x2xi32>
-}
-
-// CHECK-LABEL:         @clamp
-// CHECK-NEXT:          Results
-// CHECK-NEXT{LITERAL}: [[4, 7], [3, 6]]
-
-func.func @clamp_f32() -> tensor<2x2xf32> {
-  %lb = mhlo.constant dense<[[1.1, 7.1], [1.1, 7.1]]> : tensor<2x2xf32>
-  %arg = mhlo.constant dense<[[4.1, 5.1], [6.1, 9.1]]> : tensor<2x2xf32>
-  %ub = mhlo.constant dense<[[5.1, 9.1], [3.1, 6.1]]> : tensor<2x2xf32>
-  %clamp = "mhlo.clamp"(%lb, %arg, %ub)
-      : (tensor<2x2xf32>, tensor<2x2xf32>, tensor<2x2xf32>) -> tensor<2x2xf32>
-  return %clamp : tensor<2x2xf32>
-}
-
-// CHECK-LABEL:         @clamp
-// CHECK-NEXT:          Results
-// CHECK-NEXT{LITERAL}: [[4.100000e+00, 7.100000e+00], [3.100000e+00, 6.100000e+00]]
diff --git a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/mhlo/compare.mlir b/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/mhlo/compare.mlir
deleted file mode 100644
index f9d7eb99cbeac7..00000000000000
--- a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/mhlo/compare.mlir
+++ /dev/null
@@ -1,143 +0,0 @@
-// RUN: mlir-interpreter-runner %s -run-all | FileCheck %s
-
-func.func @eq() -> (tensor<i1>, tensor<i1>) {
-  %c1 = arith.constant dense<1> : tensor<i32>
-  %c2 = arith.constant dense<2> : tensor<i32>
-  %0 = mhlo.compare EQ, %c1, %c2 : (tensor<i32>, tensor<i32>) -> tensor<i1>
-  %1 = mhlo.compare EQ, %c1, %c1 : (tensor<i32>, tensor<i32>) -> tensor<i1>
-  return %0, %1 : tensor<i1>, tensor<i1>
-}
-
-// CHECK-LABEL: @eq
-// CHECK-NEXT: Results
-// CHECK-NEXT: false
-// CHECK-NEXT: true
-
-func.func @ne() -> (tensor<i1>, tensor<i1>) {
-  %c1 = arith.constant dense<1> : tensor<i32>
-  %c2 = arith.constant dense<2> : tensor<i32>
-  %0 = mhlo.compare NE, %c1, %c2 : (tensor<i32>, tensor<i32>) -> tensor<i1>
-  %1 = mhlo.compare NE, %c1, %c1 : (tensor<i32>, tensor<i32>) -> tensor<i1>
-  return %0, %1 : tensor<i1>, tensor<i1>
-}
-
-// CHECK-LABEL: @ne
-// CHECK-NEXT: Results
-// CHECK-NEXT: true
-// CHECK-NEXT: false
-
-func.func @ge() -> (tensor<i1>, tensor<i1>, tensor<i1>) {
-  %c1 = arith.constant dense<1> : tensor<i32>
-  %c2 = arith.constant dense<2> : tensor<i32>
-  %0 = mhlo.compare GE, %c1, %c2 : (tensor<i32>, tensor<i32>) -> tensor<i1>
-  %1 = mhlo.compare GE, %c1, %c1 : (tensor<i32>, tensor<i32>) -> tensor<i1>
-  %2 = mhlo.compare GE, %c2, %c1 : (tensor<i32>, tensor<i32>) -> tensor<i1>
-  return %0, %1, %2 : tensor<i1>, tensor<i1>, tensor<i1>
-}
-
-// CHECK-LABEL: @ge
-// CHECK-NEXT: Results
-// CHECK-NEXT: false
-// CHECK-NEXT: true
-// CHECK-NEXT: true
-
-func.func @gt() -> (tensor<i1>, tensor<i1>, tensor<i1>) {
-  %c1 = arith.constant dense<1> : tensor<i32>
-  %c2 = arith.constant dense<2> : tensor<i32>
-  %0 = mhlo.compare GT, %c1, %c2 : (tensor<i32>, tensor<i32>) -> tensor<i1>
-  %1 = mhlo.compare GT, %c1, %c1 : (tensor<i32>, tensor<i32>) -> tensor<i1>
-  %2 = mhlo.compare GT, %c2, %c1 : (tensor<i32>, tensor<i32>) -> tensor<i1>
-  return %0, %1, %2 : tensor<i1>, tensor<i1>, tensor<i1>
-}
-
-// CHECK-LABEL: @gt
-// CHECK-NEXT: Results
-// CHECK-NEXT: false
-// CHECK-NEXT: false
-// CHECK-NEXT: true
-
-func.func @le() -> (tensor<i1>, tensor<i1>, tensor<i1>) {
-  %c1 = arith.constant dense<1> : tensor<i32>
-  %c2 = arith.constant dense<2> : tensor<i32>
-  %0 = mhlo.compare LE, %c1, %c2 : (tensor<i32>, tensor<i32>) -> tensor<i1>
-  %1 = mhlo.compare LE, %c1, %c1 : (tensor<i32>, tensor<i32>) -> tensor<i1>
-  %2 = mhlo.compare LE, %c2, %c1 : (tensor<i32>, tensor<i32>) -> tensor<i1>
-  return %0, %1, %2 : tensor<i1>, tensor<i1>, tensor<i1>
-}
-
-// CHECK-LABEL: @le
-// CHECK-NEXT: Results
-// CHECK-NEXT: true
-// CHECK-NEXT: true
-// CHECK-NEXT: false
-
-func.func @lt() -> (tensor<i1>, tensor<i1>, tensor<i1>) {
-  %c1 = arith.constant dense<1> : tensor<i32>
-  %c2 = arith.constant dense<2> : tensor<i32>
-  %0 = mhlo.compare LT, %c1, %c2 : (tensor<i32>, tensor<i32>) -> tensor<i1>
-  %1 = mhlo.compare LT, %c1, %c1 : (tensor<i32>, tensor<i32>) -> tensor<i1>
-  %2 = mhlo.compare LT, %c2, %c1 : (tensor<i32>, tensor<i32>) -> tensor<i1>
-  return %0, %1, %2 : tensor<i1>, tensor<i1>, tensor<i1>
-}
-
-// CHECK-LABEL: @lt
-// CHECK-NEXT: Results
-// CHECK-NEXT: true
-// CHECK-NEXT: false
-// CHECK-NEXT: false
-
-func.func @complex_eq() -> (tensor<i1>, tensor<i1>) {
-  %c1 = arith.constant dense<(1.0, 1.0)> : tensor<complex<f32>>
-  %c2 = arith.constant dense<(1.0, 2.0)> : tensor<complex<f32>>
-  %0 = mhlo.compare EQ, %c1, %c2
-    : (tensor<complex<f32>>, tensor<complex<f32>>) -> tensor<i1>
-  %1 = mhlo.compare EQ, %c1, %c1
-    : (tensor<complex<f32>>, tensor<complex<f32>>) -> tensor<i1>
-  return %0, %1 : tensor<i1>, tensor<i1>
-}
-
-// CHECK-LABEL: @complex_eq
-// CHECK-NEXT: Results
-// CHECK-NEXT: false
-// CHECK-NEXT: true
-
-func.func @complex_nan_compare() -> (tensor<i1>, tensor<i1>) {
-  %nan = arith.constant dense<0x7FC00000> : tensor<f32>
-  %c1 = arith.constant dense<1.0> : tensor<f32>
-  %c = mhlo.complex %c1, %nan : tensor<complex<f32>>
-  %0 = mhlo.compare EQ, %c, %c
-    : (tensor<complex<f32>>, tensor<complex<f32>>) -> tensor<i1>
-  %1 = mhlo.compare NE, %c, %c
-    : (tensor<complex<f32>>, tensor<complex<f32>>) -> tensor<i1>
-  return %0, %1 : tensor<i1>, tensor<i1>
-}
-
-// CHECK-LABEL: @complex_nan_compare
-// CHECK-NEXT: Results
-// CHECK-NEXT: false
-// CHECK-NEXT: true
-
-func.func @float_eq() -> (tensor<i1>, tensor<i1>) {
-  %c1 = arith.constant dense<1.0> : tensor<f32>
-  %c2 = arith.constant dense<2.0> : tensor<f32>
-  %0 = mhlo.compare EQ, %c1, %c2 : (tensor<f32>, tensor<f32>) -> tensor<i1>
-  %1 = mhlo.compare EQ, %c1, %c1 : (tensor<f32>, tensor<f32>) -> tensor<i1>
-  return %0, %1 : tensor<i1>, tensor<i1>
-}
-
-// CHECK-LABEL: @float_eq
-// CHECK-NEXT: Results
-// CHECK-NEXT: false
-// CHECK-NEXT: true
-
-func.func @float_nan_compare() -> (tensor<i1>, tensor<i1>) {
-  %nan = arith.constant dense<0x7FC00000> : tensor<f32>
-  %0 = mhlo.compare EQ, %nan, %nan : (tensor<f32>, tensor<f32>) -> tensor<i1>
-  %1 = mhlo.compare NE, %nan, %nan : (tensor<f32>, tensor<f32>) -> tensor<i1>
-  return %0, %1 : tensor<i1>, tensor<i1>
-}
-
-// CHECK-LABEL: @float_nan_compare
-// CHECK-NEXT: Results
-// CHECK-NEXT: false
-// CHECK-NEXT: true
\ No newline at end of file
diff --git a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/mhlo/complex_math.mlir b/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/mhlo/complex_math.mlir
deleted file mode 100644
index c31953be62158e..00000000000000
--- a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/mhlo/complex_math.mlir
+++ /dev/null
@@ -1,100 +0,0 @@
-// RUN: mlir-interpreter-runner %s -run-all | FileCheck %s
-
-func.func @cos() -> tensor<complex<f64>> {
-  %c = mhlo.constant dense<(1.0, 1.0)> : tensor<complex<f64>>
-  %cos = mhlo.cosine %c : tensor<complex<f64>>
-  return %cos : tensor<complex<f64>>
-}
-
-// CHECK-LABEL: @cos
-// CHECK-NEXT: Results
-// CHECK-NEXT: 8.337300e-01-9.888977e-01i
-
-func.func @expm1() -> tensor<complex<f32>> {
-  // import numpy as np  -- not jax.numpy
-  // np.expm1(np.array(1e-6 + 1e-6j, dtype=np.complex64))
-  // Don't run this with jax.numpy, it returns 9.536743e-07+0.j.
-  %c = mhlo.constant dense<(1.0e-06, 1.0e-06)> : tensor<complex<f32>>
-  %expm1 = mhlo.exponential_minus_one %c : tensor<complex<f32>>
-  return %expm1 : tensor<complex<f32>>
-}
-
-// CHECK-LABEL: @expm1
-// CHECK-NEXT: Results
-// CHECK-NEXT: 1.000000e-06+1.000001e-06i
-
-func.func @expm1d() -> tensor<complex<f64>> {
-  %c = mhlo.constant dense<(1.0e-50, 1.0e-50)> : tensor<complex<f64>>
-  %expm1 = mhlo.exponential_minus_one %c : tensor<complex<f64>>
-  return %expm1 : tensor<complex<f64>>
-}
-
-// CHECK-LABEL: @expm1d
-// CHECK-NEXT: Results
-// CHECK-NEXT: 1.000000e-50+1.000000e-50i
-
-func.func @imag() -> tensor<f64> {
-  %c = mhlo.constant dense<(1.0, 2.0)> : tensor<complex<f64>>
-  %imag = mhlo.imag %c : (tensor<complex<f64>>) -> tensor<f64>
-  return %imag : tensor<f64>
-}
-
-// CHECK-LABEL: @imag
-// CHECK-NEXT: Results
-// CHECK-NEXT: 2.000000e+00
-
-func.func @log1pf() -> tensor<complex<f32>> {
-  %c = mhlo.constant dense<(1.0e-07, 1.0e-20)> : tensor<complex<f32>>
-  %cos = mhlo.log_plus_one %c : tensor<complex<f32>>
-  return %cos : tensor<complex<f32>>
-}
-
-// CHECK-LABEL: @log1p
-// CHECK-NEXT: Results
-// The accuracy of this is rather poor, but it matches numpy.
-// CHECK-NEXT: 1.192093e-07+9.999999e-21i
-
-func.func @log1pd() -> tensor<complex<f64>> {
-  %c = mhlo.constant dense<(1.0e-07, 1.0e-20)> : tensor<complex<f64>>
-  %cos = mhlo.log_plus_one %c : tensor<complex<f64>>
-  return %cos : tensor<complex<f64>>
-}
-
-// CHECK-LABEL: @log1pd
-// CHECK-NEXT: Results
-// CHECK-NEXT: 1.000000e-07+9.999999e-21i
-
-func.func @logistic() -> tensor<5xcomplex<f32>> {
-  %c = mhlo.constant dense<[(-1.0e5, 0.0), (-1.0, 0.0),
-      (0.0, 0.0), (1.0, 0.0), (1.0e5, 0.0)]> : tensor<5xcomplex<f32>>
-  %ret = mhlo.logistic %c : tensor<5xcomplex<f32>>
-  return %ret : tensor<5xcomplex<f32>>
-}
-
-// CHECK-LABEL: @logistic
-// CHECK-NEXT: Results
-// CHECK-NEXT: [0.000000e+00+0.000000e+00i,
-// CHECK-SAME:  2.689414e-01+0.000000e+00i,
-// CHECK-SAME:  5.000000e-01+0.000000e+00i,
-// CHECK-SAME:  7.310586e-01+0.000000e+00i,
-// CHECK-SAME:  1.000000e+00+0.000000e+00i]
-
-func.func @real() -> tensor<f64> {
-  %c = mhlo.constant dense<(1.0, 2.0)> : tensor<complex<f64>>
-  %real = mhlo.real %c : (tensor<complex<f64>>) -> tensor<f64>
-  return %real : tensor<f64>
-}
-
-// CHECK-LABEL: @real
-// CHECK-NEXT: Results
-// CHECK-NEXT: 1.000000e+00
-
-func.func @sin() -> tensor<complex<f64>> {
-  %c = mhlo.constant dense<(1.0, 1.0)> : tensor<complex<f64>>
-  %cos = mhlo.sine %c : tensor<complex<f64>>
-  return %cos : tensor<complex<f64>>
-}
-
-// CHECK-LABEL: @sin
-// CHECK-NEXT: Results
-// CHECK-NEXT: 1.298458e+00+6.349639e-01i
diff --git a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/mhlo/compute_reshape_shape.mlir b/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/mhlo/compute_reshape_shape.mlir
deleted file mode 100644
index 0c9cb658ba7379..00000000000000
--- a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/mhlo/compute_reshape_shape.mlir
+++ /dev/null
@@ -1,26 +0,0 @@
-// RUN: mlir-interpreter-runner %s -run-all | FileCheck %s
-
-func.func @compute_reshape_shape() -> tensor<3xi32> {
-  %dynamic_shape = mhlo.constant dense<[2, -1, 3]> : tensor<3xi32>
-  %n = arith.constant 24 : index
-  %shape = mhlo.compute_reshape_shape %n, %dynamic_shape
-    : (index, tensor<3xi32>) -> tensor<3xi32>
-  return %shape : tensor<3xi32>
-}
-
-// CHECK-LABEL: @compute_reshape_shape
-// CHECK-NEXT: Results
-// CHECK-NEXT: [2, 4, 3]
-
-func.func @compute_reshape_shape_static() -> tensor<3xi32> {
-  %dynamic_shape = mhlo.constant dense<[2, 4, 3]> : tensor<3xi32>
-  %n = arith.constant 24 : index
-  %shape = mhlo.compute_reshape_shape %n, %dynamic_shape
-    : (index, tensor<3xi32>) -> tensor<3xi32>
-  return %shape : tensor<3xi32>
-}
-
-// CHECK-LABEL: @compute_reshape_shape_static
-// CHECK-NEXT: Results
-// CHECK-NEXT: [2, 4, 3]
-
diff --git a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/mhlo/concatenate.mlir b/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/mhlo/concatenate.mlir
deleted file mode 100644
index fb88f0c0da1fe3..00000000000000
--- a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/mhlo/concatenate.mlir
+++ /dev/null
@@ -1,37 +0,0 @@
-// RUN: mlir-interpreter-runner %s -run-all | FileCheck %s
-
-func.func @concat_1d()  -> tensor<3xi32> {
-  %a = mhlo.constant dense<[1]> : tensor<1xi32>
-  %b = mhlo.constant dense<[2, 3]> : tensor<2xi32>
-  %0 = "mhlo.concatenate"(%a, %b) { dimension = 0 : i64 }
-     : (tensor<1xi32>, tensor<2xi32>) -> tensor<3xi32>
-  func.return %0 : tensor<3xi32>
-}
-
-// CHECK-LABEL: @concat_1d
-// CHECK-NEXT: Results
-// CHECK-NEXT: [1, 2, 3]
-
-func.func @concat_dim0() -> tensor<4x2xi32> {
-  %a = mhlo.constant dense<1> : tensor<2x2xi32>
-  %b = mhlo.constant dense<2> : tensor<2x2xi32>
-  %0 = "mhlo.concatenate"(%a, %b) { dimension = 0 : i64 }
-     : (tensor<2x2xi32>, tensor<2x2xi32>) -> tensor<4x2xi32>
-  func.return %0 : tensor<4x2xi32>
-}
-
-// CHECK-LABEL: @concat_dim0
-// CHECK-NEXT: Results
-// CHECK-NEXT{LITERAL}: [[1, 1], [1, 1], [2, 2], [2, 2]]
-
-func.func @concat_dim1() -> tensor<2x4xi32> {
-  %a = mhlo.constant dense<1> : tensor<2x2xi32>
-  %b = mhlo.constant dense<2> : tensor<2x2xi32>
-  %0 = "mhlo.concatenate"(%a, %b) { dimension = 1 : i64 }
-     : (tensor<2x2xi32>, tensor<2x2xi32>) -> tensor<2x4xi32>
-  func.return %0 : tensor<2x4xi32>
-}
-
-// CHECK-LABEL: @concat_dim1
-// CHECK-NEXT: Results
-// CHECK-NEXT{LITERAL}: [[1, 1, 2, 2], [1, 1, 2, 2]]
diff --git a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/mhlo/constant.mlir b/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/mhlo/constant.mlir
deleted file mode 100644
index 77547aff85ba20..00000000000000
--- a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/mhlo/constant.mlir
+++ /dev/null
@@ -1,25 +0,0 @@
-// RUN: mlir-interpreter-runner %s -run-all | FileCheck %s
-
-func.func @main() -> (tensor<2x3xi32>, tensor<2x3xf32>, tensor<0x0x3xi16>, tensor<f64>) {
-  %i32 = mhlo.constant dense<[[0, 1, 2], [3, 4, 5]]> : tensor<2x3xi32>
-  %f32 = mhlo.constant dense<[[0.0, 0.1, 0.2], [0.3, 0.4, 0.5]]> : tensor<2x3xf32>
-  %empty = mhlo.constant dense<> : tensor<0x0x3xi16>
-  %scalar = mhlo.constant dense<3.14> : tensor<f64>
-  return %i32, %f32, %empty, %scalar : tensor<2x3xi32>, tensor<2x3xf32>, tensor<0x0x3xi16>, tensor<f64>
-}
-
-// CHECK-LABEL: @main
-// CHECK-NEXT: Results
-// CHECK-NEXT{LITERAL}: [[0, 1, 2], [3, 4, 5]]
-// CHECK-NEXT{LITERAL}: [[0.000000e+00, 1.000000e-01, 2.000000e-01], [3.000000e-01, 4.000000e-01, 5.000000e-01]]
-// CHECK-NEXT{LITERAL}: []
-// CHECK-NEXT{LITERAL}: 3.140000e+00
-
-func.func @ui8() -> tensor<ui8> {
-  %v = mhlo.constant dense<123> : tensor<ui8>
-  return %v : tensor<ui8>
-}
-
-// CHECK-LABEL: @ui8
-// CHECK-NEXT: Results
-// CHECK-NEXT: 123
\ No newline at end of file
diff --git a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/mhlo/convert.mlir b/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/mhlo/convert.mlir
deleted file mode 100644
index fe897b2856a3b6..00000000000000
--- a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/mhlo/convert.mlir
+++ /dev/null
@@ -1,21 +0,0 @@
-// RUN: mlir-interpreter-runner %s -run-all | FileCheck %s
-
-func.func @convert_i1_to_f32() -> tensor<2xf32> {
-  %input = mhlo.constant dense<[true, false]> : tensor<2xi1>
-  %result = "mhlo.convert"(%input) : (tensor<2xi1>) -> tensor<2xf32>
-  func.return %result : tensor<2xf32>
-}
-
-// CHECK-LABEL: @convert_i1_to_f32
-// CHECK-NEXT: Results
-// CHECK-NEXT: [1.000000e+00, 0.000000e+00]
-
-func.func @convert_f32_to_i16() -> tensor<2xi16> {
-  %input = mhlo.constant dense<[1.4, 2.55]> : tensor<2xf32>
-  %result = "mhlo.convert"(%input) : (tensor<2xf32>) -> tensor<2xi16>
-  func.return %result : tensor<2xi16>
-}
-
-// CHECK-LABEL: @convert_f32_to_i16
-// CHECK-NEXT: Results
-// CHECK-NEXT: [1, 2]
diff --git a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/mhlo/dot.mlir b/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/mhlo/dot.mlir
deleted file mode 100644
index 66ed3956c0cbae..00000000000000
--- a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/mhlo/dot.mlir
+++ /dev/null
@@ -1,37 +0,0 @@
-// RUN: mlir-interpreter-runner %s -run-all | FileCheck %s
-
-func.func @dot_2d() -> tensor<2x2xi32> {
-  %lhs = mhlo.constant dense<[[1, 2], [3, 4]]> : tensor<2x2xi32>
-  %rhs = mhlo.constant dense<[[4, 5], [6, 7]]> : tensor<2x2xi32>
-  %dot = "mhlo.dot"(%lhs, %rhs)
-    : (tensor<2x2xi32>, tensor<2x2xi32>) -> tensor<2x2xi32>
-  return %dot : tensor<2x2xi32>
-}
-
-// CHECK-LABEL: @dot_2d
-// CHECK-NEXT: Results
-// CHECK-NEXT{LITERAL}: [[16, 19], [36, 43]]
-
-func.func @dot_2d_1d() -> tensor<2xi32> {
-  %lhs = mhlo.constant dense<[[1, 2], [3, 4]]> : tensor<2x2xi32>
-  %rhs = mhlo.constant dense<[4, 5]> : tensor<2xi32>
-  %dot = "mhlo.dot"(%lhs, %rhs)
-    : (tensor<2x2xi32>, tensor<2xi32>) -> tensor<2xi32>
-  return %dot : tensor<2xi32>
-}
-
-// CHECK-LABEL: @dot_2d_1d
-// CHECK-NEXT: Results
-// CHECK-NEXT{LITERAL}: [14, 32]
-
-func.func @dot_1d_1d() -> tensor<i32> {
-  %lhs = mhlo.constant dense<[1, 2]> : tensor<2xi32>
-  %rhs = mhlo.constant dense<[4, 5]> : tensor<2xi32>
-  %dot = "mhlo.dot"(%lhs, %rhs)
-    : (tensor<2xi32>, tensor<2xi32>) -> tensor<i32>
-  return %dot : tensor<i32>
-}
-
-// CHECK-LABEL: @dot_1d_1d
-// CHECK-NEXT: Results
-// CHECK-NEXT{LITERAL}: 14
\ No newline at end of file
diff --git a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/mhlo/dot_general.mlir b/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/mhlo/dot_general.mlir
deleted file mode 100644
index 1f83ad5ff63d62..00000000000000
--- a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/mhlo/dot_general.mlir
+++ /dev/null
@@ -1,73 +0,0 @@
-// RUN: mlir-interpreter-runner %s -run-all | FileCheck %s
-
-func.func @dot_general_2d() -> tensor<2x2xi32> {
-  %lhs = mhlo.constant dense<[[1, 2], [3, 4]]> : tensor<2x2xi32>
-  %rhs = mhlo.constant dense<[[4, 5], [6, 7]]> : tensor<2x2xi32>
-  %dot = "mhlo.dot_general"(%lhs, %rhs) {
-    dot_dimension_numbers = #mhlo.dot<
-      lhs_contracting_dimensions = [1],
-      rhs_contracting_dimensions = [0],
-      lhs_batching_dimensions = [],
-      rhs_batching_dimensions = []
-    >
-  } : (tensor<2x2xi32>, tensor<2x2xi32>) -> tensor<2x2xi32>
-  return %dot : tensor<2x2xi32>
-}
-
-// CHECK-LABEL: @dot_general_2d
-// CHECK-NEXT: Results
-// CHECK-NEXT{LITERAL}: [[16, 19], [36, 43]]
-
-func.func @dot_general_2d_2() -> tensor<2x2xi32> {
-  %lhs = mhlo.constant dense<[[1, 2], [3, 4]]> : tensor<2x2xi32>
-  %rhs = mhlo.constant dense<[[4, 5], [6, 7]]> : tensor<2x2xi32>
-  %dot = "mhlo.dot_general"(%lhs, %rhs) {
-    dot_dimension_numbers = #mhlo.dot<
-      lhs_contracting_dimensions = [0],
-      rhs_contracting_dimensions = [1],
-      lhs_batching_dimensions = [],
-      rhs_batching_dimensions = []
-    >
-  } : (tensor<2x2xi32>, tensor<2x2xi32>) -> tensor<2x2xi32>
-  return %dot : tensor<2x2xi32>
-}
-
-// CHECK-LABEL: @dot_general_2d_2
-// CHECK-NEXT: Results
-// CHECK-NEXT{LITERAL}: [[19, 27], [28, 40]]
-
-func.func @dot_general_2d_1d() -> tensor<2xi32> {
-  %lhs = mhlo.constant dense<[[1, 2], [3, 4]]> : tensor<2x2xi32>
-  %rhs = mhlo.constant dense<[4, 5]> : tensor<2xi32>
-  %dot = "mhlo.dot_general"(%lhs, %rhs) {
-    dot_dimension_numbers = #mhlo.dot<
-      lhs_contracting_dimensions = [0],
-      rhs_contracting_dimensions = [0],
-      lhs_batching_dimensions = [],
-      rhs_batching_dimensions = []
-    >
-  } : (tensor<2x2xi32>, tensor<2xi32>) -> tensor<2xi32>
-  return %dot : tensor<2xi32>
-}
-
-// CHECK-LABEL: @dot_general_2d_1d
-// CHECK-NEXT: Results
-// CHECK-NEXT{LITERAL}: [19, 28]
-
-func.func @dot_general_batch_only() -> tensor<2x2xi32> {
-  %lhs = mhlo.constant dense<[[1, 2], [3, 4]]> : tensor<2x2xi32>
-  %rhs = mhlo.constant dense<[4, 5]> : tensor<2xi32>
-  %dot = "mhlo.dot_general"(%lhs, %rhs) {
-    dot_dimension_numbers = #mhlo.dot<
-      lhs_contracting_dimensions = [],
-      rhs_contracting_dimensions = [],
-      lhs_batching_dimensions = [1],
-      rhs_batching_dimensions = [0]
-    >
-  } : (tensor<2x2xi32>, tensor<2xi32>) -> tensor<2x2xi32>
-  return %dot : tensor<2x2xi32>
-}
-
-// CHECK-LABEL: @dot_general_batch_only
-// CHECK-NEXT: Results
-// CHECK-NEXT{LITERAL}: [[4, 12], [10, 20]]
\ No newline at end of file
diff --git a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/mhlo/dynamic_slice.mlir b/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/mhlo/dynamic_slice.mlir
deleted file mode 100644
index 8708694d3a8504..00000000000000
--- a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/mhlo/dynamic_slice.mlir
+++ /dev/null
@@ -1,32 +0,0 @@
-// RUN: mlir-interpreter-runner %s -run-all | FileCheck %s
-
-func.func @dynamic_slice() -> tensor<1x2xi32> {
-  %cst = mhlo.constant dense<[[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12]]>
-    : tensor<3x4xi32>
-  %s0 = mhlo.constant dense<1> : tensor<i32>
-  %s1 = mhlo.constant dense<0> : tensor<i32>
-  %0 = "mhlo.dynamic_slice"(%cst, %s0, %s1) {
-    slice_sizes = dense<[1, 2]> : tensor<2xi64>
-  } : (tensor<3x4xi32>, tensor<i32>, tensor<i32>) -> tensor<1x2xi32>
-  func.return %0 : tensor<1x2xi32>
-}
-
-// CHECK-LABEL: @dynamic_slice
-// CHECK-NEXT: Results
-// CHECK-NEXT{LITERAL}: [[5, 6]]
-
-func.func @clamp_starts() -> tensor<2x3xi32> {
-
-  %cst = mhlo.constant dense<[[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12]]>
-    : tensor<3x4xi32>
-  %s0 = mhlo.constant dense<-10> : tensor<i32>
-  %s1 = mhlo.constant dense<10> : tensor<i32>
-  %0 = "mhlo.dynamic_slice"(%cst, %s0, %s1) {
-    slice_sizes = dense<[2, 3]> : tensor<2xi64>
-  } : (tensor<3x4xi32>, tensor<i32>, tensor<i32>) -> tensor<2x3xi32>
-  func.return %0 : tensor<2x3xi32>
-}
-
-// CHECK-LABEL: @clamp_starts
-// CHECK-NEXT: Results
-// CHECK-NEXT{LITERAL}: [[2, 3, 4], [6, 7, 8]]
diff --git a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/mhlo/dynamic_update_slice.mlir b/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/mhlo/dynamic_update_slice.mlir
deleted file mode 100644
index 481389e156e927..00000000000000
--- a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/mhlo/dynamic_update_slice.mlir
+++ /dev/null
@@ -1,34 +0,0 @@
-// RUN: mlir-interpreter-runner %s -run-all | FileCheck %s
-
-func.func @dynamic_update_slice() -> tensor<3x4xi32> {
-  %cst = mhlo.constant dense<[[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12]]>
-    : tensor<3x4xi32>
-  %v = mhlo.constant dense<[[13, 14]]> : tensor<1x2xi32>
-  %s0 = mhlo.constant dense<1> : tensor<i32>
-  %s1 = mhlo.constant dense<0> : tensor<i32>
-  %0 = "mhlo.dynamic_update_slice"(%cst, %v, %s0, %s1)
-    : (tensor<3x4xi32>, tensor<1x2xi32>, tensor<i32>, tensor<i32>)
-      -> tensor<3x4xi32>
-  func.return %0 : tensor<3x4xi32>
-}
-
-// CHECK-LABEL: @dynamic_update_slice
-// CHECK-NEXT: Results
-// CHECK-NEXT{LITERAL}: [[1, 2, 3, 4], [13, 14, 7, 8], [9, 10, 11, 12]]
-
-func.func @clamp_starts() -> tensor<3x4xi32> {
-  %cst = mhlo.constant dense<[[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12]]>
-    : tensor<3x4xi32>
-  %v = mhlo.constant dense<[[13, 14]]> : tensor<1x2xi32>
-  %s0 = mhlo.constant dense<-10> : tensor<i32>
-  %s1 = mhlo.constant dense<10> : tensor<i32>
-  %0 = "mhlo.dynamic_update_slice"(%cst, %v, %s0, %s1) {
-    slice_sizes = dense<[2, 3]> : tensor<2xi64>
-  } : (tensor<3x4xi32>, tensor<1x2xi32>, tensor<i32>, tensor<i32>)
-      -> tensor<3x4xi32>
-  func.return %0 : tensor<3x4xi32>
-}
-
-// CHECK-LABEL: @clamp_starts
-// CHECK-NEXT: Results
-// CHECK-NEXT{LITERAL}: [[1, 2, 13, 14], [5, 6, 7, 8], [9, 10, 11, 12]]
diff --git a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/mhlo/float_math.mlir b/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/mhlo/float_math.mlir
deleted file mode 100644
index c713e4bf46394d..00000000000000
--- a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/mhlo/float_math.mlir
+++ /dev/null
@@ -1,199 +0,0 @@
-// RUN: mlir-interpreter-runner %s -run-all | FileCheck %s
-
-func.func @atan2() -> tensor<1xf32> {
-  %c10 = mhlo.constant dense<10.0> : tensor<1xf32>
-  %c1 = mhlo.constant dense<1.0> : tensor<1xf32>
-  %ret = mhlo.atan2 %c10, %c1 : tensor<1xf32>
-  return %ret : tensor<1xf32>
-}
-
-// CHECK-LABEL: @atan2
-// CHECK-NEXT: Results
-// CHECK-NEXT: [1.471128e+00]
-
-func.func @cbrt() -> tensor<1xf32> {
-  %c-27 = mhlo.constant dense<-27.0> : tensor<1xf32>
-  %ret = mhlo.cbrt %c-27 : tensor<1xf32>
-  return %ret : tensor<1xf32>
-}
-
-// CHECK-LABEL: @cbrt
-// CHECK-NEXT: Results
-// CHECK-NEXT: [-3.000000e+00]
-
-func.func @ceil() -> tensor<1xf32> {
-  %c = mhlo.constant dense<0.123> : tensor<1xf32>
-  %ret = mhlo.ceil %c : tensor<1xf32>
-  return %ret : tensor<1xf32>
-}
-
-// CHECK-LABEL: @ceil
-// CHECK-NEXT: Results
-// CHECK-NEXT: [1.000000e+00]
-
-func.func @complex() -> tensor<complex<f32>> {
-  %c1 = mhlo.constant dense<1.0> : tensor<f32>
-  %c2 = mhlo.constant dense<2.0> : tensor<f32>
-  %ret = mhlo.complex %c1, %c2 : tensor<complex<f32>>
-  return %ret : tensor<complex<f32>>
-}
-
-// CHECK-LABEL: @complex
-// CHECK-NEXT: Results
-// CHECK-NEXT: <complex<f32>>: 1.000000e+00+2.000000e+00i
-
-func.func @exp() -> tensor<1xf32> {
-  %c = mhlo.constant dense<0.0> : tensor<1xf32>
-  %ret = mhlo.exponential %c : tensor<1xf32>
-  return %ret : tensor<1xf32>
-}
-
-// CHECK-LABEL: @exp
-// CHECK-NEXT: Results
-// CHECK-NEXT: [1.000000e+00]
-
-func.func @floor() -> tensor<1xf32> {
-  %c = mhlo.constant dense<3.123> : tensor<1xf32>
-  %ret = mhlo.floor %c : tensor<1xf32>
-  return %ret : tensor<1xf32>
-}
-
-// CHECK-LABEL: @floor
-// CHECK-NEXT: Results
-// CHECK-NEXT: [3.000000e+00]
-
-func.func @is_finite() -> tensor<3xi1> {
-  %c = mhlo.constant dense<[0x7FC00000, 2.0, 0x7F800000]> : tensor<3xf32>
-  %is_finite = mhlo.is_finite %c : (tensor<3xf32>) -> tensor<3xi1>
-  return %is_finite : tensor<3xi1>
-}
-
-// CHECK-LABEL: @is_finite
-// CHECK-NEXT: Results
-// CHECK-NEXT: [false, true, false]
-
-func.func @log() -> tensor<1xf32> {
-  %c = mhlo.constant dense<1.0> : tensor<1xf32>
-  %ret = mhlo.log %c : tensor<1xf32>
-  return %ret : tensor<1xf32>
-}
-
-// CHECK-LABEL: @log
-// CHECK-NEXT: Results
-// CHECK-NEXT: [0.000000e+00]
-
-func.func @logistic() -> tensor<5xf32> {
-  %c = mhlo.constant dense<[-1.0e5, -1.0, 0.0, 1.0, 1.0e5]> : tensor<5xf32>
-  %ret = mhlo.logistic %c : tensor<5xf32>
-  return %ret : tensor<5xf32>
-}
-
-// CHECK-LABEL: @logistic
-// CHECK-NEXT: Results
-// CHECK-NEXT: [0.000000e+00, 2.689414e-01, 5.000000e-01, 7.310586e-01, 1.000000e+00]
-
-func.func @maximum() -> tensor<f32> {
-  %c1 = mhlo.constant dense<1.0> : tensor<f32>
-  %c2 = mhlo.constant dense<2.0> : tensor<f32>
-  %ret = mhlo.maximum %c1, %c2 : tensor<f32>
-  return %ret : tensor<f32>
-}
-
-// CHECK-LABEL: @maximum
-// CHECK-NEXT: Results
-// CHECK-NEXT: 2.000000e+00
-
-func.func @minimum() -> tensor<f32> {
-  %c1 = mhlo.constant dense<1.0> : tensor<f32>
-  %c2 = mhlo.constant dense<2.0> : tensor<f32>
-  %ret = mhlo.minimum %c1, %c2 : tensor<f32>
-  return %ret : tensor<f32>
-}
-
-// CHECK-LABEL: @minimum
-// CHECK-NEXT: Results
-// CHECK-NEXT: 1.000000e+00
-
-func.func @pow() -> tensor<f32> {
-  %c2 = mhlo.constant dense<2.0> : tensor<f32>
-  %c5 = mhlo.constant dense<5.0> : tensor<f32>
-  %pow = mhlo.power %c2, %c5 : tensor<f32>
-  return %pow : tensor<f32>
-}
-
-// CHECK-LABEL: @pow
-// CHECK-NEXT: Results
-// CHECK-NEXT: 3.200000e+01
-
-func.func @rem() -> tensor<8xf32> {
-  %0 = mhlo.constant dense<[-2.5, 2.25, -10.0, 6.0, 3.0, 3.0, -1.0, -8.0]> : tensor<8xf32>
-  %1 = mhlo.constant dense<[10.0, 1.0, 10.0, -6.0, 2.0, -2.0, 7.0, -4.0]> : tensor<8xf32>
-  %2 = mhlo.remainder %0, %1 : tensor<8xf32>
-  func.return %2 : tensor<8xf32>
-}
-
-// CHECK-LABEL: @rem
-// CHECK-NEXT: Results
-// CHECK-NEXT: [-2.500000e+00, 2.500000e-01, -0.000000e+00, 0.000000e+00, 1.000000e+00, 1.000000e+00, -1.000000e+00, -0.000000e+00]
-
-func.func @rem_inf() -> (tensor<1xf32>, tensor<1xf32>) {
-  %0 = mhlo.constant dense<[1.0]> : tensor<1xf32>
-  %1 = mhlo.constant dense<[0x7F800000]> : tensor<1xf32>
-  %2 = mhlo.remainder %0, %1 : tensor<1xf32>
-  func.return %2, %1 : tensor<1xf32>, tensor<1xf32>
-}
-
-// CHECK-LABEL: @rem_inf
-// CHECK-NEXT: Results
-// CHECK-NEXT: [1.000000e+00]
-// CHECK-NEXT: [INF]
-
-func.func @round_nearest_afz() -> tensor<4xf32> {
-  %c = mhlo.constant dense<[-1.5, -0.5, 0.5, 1.5]> : tensor<4xf32>
-  %ret = mhlo.round_nearest_afz %c : tensor<4xf32>
-  return %ret : tensor<4xf32>
-}
-
-// CHECK-LABEL: @round_nearest_afz
-// CHECK-NEXT: Results
-// CHECK-NEXT: [-2.000000e+00, -1.000000e+00, 1.000000e+00, 2.000000e+00]
-
-func.func @round_nearest_even() -> tensor<5xf32> {
-  %c = mhlo.constant dense<[-1.5, -0.5, 0.5, 0.6, 1.5]> : tensor<5xf32>
-  %ret = mhlo.round_nearest_even %c : tensor<5xf32>
-  return %ret : tensor<5xf32>
-}
-
-// CHECK-LABEL: @round_nearest_even
-// CHECK-NEXT: Results
-// CHECK-NEXT: [-2.000000e+00, -0.000000e+00, 0.000000e+00, 1.000000e+00, 2.000000e+00]
-
-func.func @rsqrt() -> tensor<1xf32> {
-  %c4 = mhlo.constant dense<4.0> : tensor<1xf32>
-  %ret = mhlo.rsqrt %c4 : tensor<1xf32>
-  return %ret : tensor<1xf32>
-}
-
-// CHECK-LABEL: @rsqrt
-// CHECK-NEXT: Results
-// CHECK-NEXT: [5.000000e-01]
-
-func.func @sign() -> tensor<3xf32> {
-  %c = mhlo.constant dense<[-1.0, 2.0, 0x7F800000]> : tensor<3xf32>
-  %ret = mhlo.sign %c : tensor<3xf32>
-  return %ret : tensor<3xf32>
-}
-
-// CHECK-LABEL: @sign
-// CHECK-NEXT: Results
-// CHECK-NEXT: [-1.000000e+00, 1.000000e+00, 1.000000e+00]
-
-func.func @tanh() -> tensor<1xf32> {
-  %c = mhlo.constant dense<[1.0]> : tensor<1xf32>
-  %ret = mhlo.tanh %c : tensor<1xf32>
-  return %ret : tensor<1xf32>
-}
-
-// CHECK-LABEL: @tanh
-// CHECK-NEXT: Results
-// CHECK-NEXT: [7.615942e-01]
diff --git a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/mhlo/gather.mlir b/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/mhlo/gather.mlir
deleted file mode 100644
index dd216b5f29a357..00000000000000
--- a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/mhlo/gather.mlir
+++ /dev/null
@@ -1,78 +0,0 @@
-// RUN: mlir-interpreter-runner %s -run-all | FileCheck %s
-
-func.func @bounds_check() -> tensor<4x3xi32> {
-  %operand = mhlo.constant dense<[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]> : tensor<10xi32>
-  %indices = mhlo.constant dense<[[1], [8], [-3]]> : tensor<3x1xi32>
-  %gather = "mhlo.gather"(%operand, %indices) {
-    dimension_numbers = #mhlo.gather<
-      collapsed_slice_dims = [],
-      index_vector_dim = 1,
-      offset_dims = [0],
-      start_index_map = [0]
-    >,
-    slice_sizes = dense<[4]> : tensor<1xi64>
-  } : (tensor<10xi32>, tensor<3x1xi32>) -> tensor<4x3xi32>
-  return %gather : tensor<4x3xi32>
-}
-
-// CHECK-LABEL: @bounds_check
-// CHECK-NEXT: Results
-// CHECK-NEXT{LITERAL}: [[1, 6, 0], [2, 7, 1], [3, 8, 2], [4, 9, 3]]
-
-func.func @gather_2d() -> tensor<4x2xi32> {
-  // operand = np.arange(1, 16).reshape([5, 3, 1])
-  // indices = np.array([[0, 0], [1, 0], [4, 3], [-1, -1]])
-  // lax.gather(operand, indices, lax.GatherDimensionNumbers(offset_dims=(1,),
-  //   collapsed_slice_dims=(1,2), start_index_map=(0,1,)), slice_sizes=[2,1,1]))
-  %operand = arith.constant dense<[
-    [[1], [2], [3]],
-    [[4], [5], [6]],
-    [[7], [8], [9]],
-    [[10], [11], [12]],
-    [[13], [14], [15]]
-  ]> : tensor<5x3x1xi32>
-
-  %indices = arith.constant dense<[
-    [0, 0],
-    [1, 0],
-    [4, 3],
-    [-1, -1]
-  ]> : tensor<4x2xi64>
-
-  %0 = "mhlo.gather"(%operand, %indices) {
-    dimension_numbers = #mhlo.gather<
-      collapsed_slice_dims = [1, 2],
-      index_vector_dim = 1,
-      offset_dims = [1],
-      start_index_map = [0, 1]
-    >,
-    indices_are_sorted = false,
-    slice_sizes = dense<[2, 1, 1]> : tensor<3xi64>
-  } : (tensor<5x3x1xi32>, tensor<4x2xi64>) -> tensor<4x2xi32>
-
-  func.return %0 : tensor<4x2xi32>
-}
-
-// CHECK-LABEL: @gather_2d
-// CHECK-NEXT: Results
-// CHECK-NEXT{LITERAL}: [[1, 4], [4, 7], [12, 15], [1, 4]]
-
-func.func @dynamic_gather() -> tensor<*xi32> {
-  %operand = mhlo.constant dense<[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]> : tensor<10xi32>
-  %indices = mhlo.constant dense<[[1], [8], [-3]]> : tensor<3x1xi32>
-  %slice_sizes = mhlo.constant dense<[4]> : tensor<1xi32>
-  %gather = "mhlo.dynamic_gather"(%operand, %indices, %slice_sizes) {
-    dimension_numbers = #mhlo.gather<
-      collapsed_slice_dims = [],
-      index_vector_dim = 1,
-      offset_dims = [0],
-      start_index_map = [0]
-    >,
-    slice_sizes = dense<[4]> : tensor<1xi64>
-  } : (tensor<10xi32>, tensor<3x1xi32>, tensor<1xi32>) -> tensor<*xi32>
-  return %gather : tensor<*xi32>
-}
-
-// CHECK-LABEL: @dynamic_gather
-// CHECK-NEXT: Results
-// CHECK-NEXT{LITERAL}: [[1, 6, 0], [2, 7, 1], [3, 8, 2], [4, 9, 3]]
diff --git a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/mhlo/int_math.mlir b/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/mhlo/int_math.mlir
deleted file mode 100644
index 999f6f6996e8b6..00000000000000
--- a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/mhlo/int_math.mlir
+++ /dev/null
@@ -1,358 +0,0 @@
-// RUN: mlir-interpreter-runner %s -run-all | FileCheck %s
-
-func.func @abs() -> tensor<1xi32> {
-  %c-3 = mhlo.constant dense<-3> : tensor<1xi32>
-  %ret = mhlo.abs %c-3 : tensor<1xi32>
-  return %ret : tensor<1xi32>
-}
-
-// CHECK-LABEL: @abs
-// CHECK-NEXT: Results
-// CHECK-NEXT: [3]
-
-func.func @add_tensor() -> tensor<2x3xi32> {
-  %lhs = mhlo.constant dense<[[0, 1, 2], [3, 4, 5]]> : tensor<2x3xi32>
-  %rhs = mhlo.constant dense<[[10, 20, 30], [40, 50, 60]]> : tensor<2x3xi32>
-  %result = mhlo.add %lhs, %rhs : tensor<2x3xi32>
-  return %result : tensor<2x3xi32>
-}
-
-// CHECK-LABEL: @add_tensor
-// CHECK-NEXT: Results
-// CHECK-NEXT{LITERAL}: [[10, 21, 32], [43, 54, 65]]
-
-func.func @add_scalar_i8() -> tensor<i8> {
-  %lhs = mhlo.constant dense<40> : tensor<i8>
-  %rhs = mhlo.constant dense<2> : tensor<i8>
-  %result = mhlo.add %lhs, %rhs : tensor<i8>
-  return %result : tensor<i8>
-}
-
-// CHECK-LABEL: @add_scalar_i8
-// CHECK-NEXT: Results
-// CHECK-NEXT: <i8>: 42
-
-func.func @add_scalar_i16() -> tensor<i16> {
-  %lhs = mhlo.constant dense<40> : tensor<i16>
-  %rhs = mhlo.constant dense<2> : tensor<i16>
-  %result = mhlo.add %lhs, %rhs : tensor<i16>
-  return %result : tensor<i16>
-}
-
-// CHECK-LABEL: @add_scalar_i16
-// CHECK-NEXT: Results
-// CHECK-NEXT: <i16>: 42
-
-func.func @add_scalar_i32() -> tensor<i32> {
-  %lhs = mhlo.constant dense<40> : tensor<i32>
-  %rhs = mhlo.constant dense<2> : tensor<i32>
-  %result = mhlo.add %lhs, %rhs : tensor<i32>
-  return %result : tensor<i32>
-}
-
-// CHECK-LABEL: @add_scalar_i32
-// CHECK-NEXT: Results
-// CHECK-NEXT: <i32>: 42
-
-func.func @and() -> tensor<1xi32> {
-  %c63 = mhlo.constant dense<63> : tensor<1xi32>
-  %c131 = mhlo.constant dense<131> : tensor<1xi32>
-  %ret = mhlo.and %c63, %c131 : tensor<1xi32>
-  return %ret : tensor<1xi32>
-}
-
-// CHECK-LABEL: @and
-// CHECK-NEXT: Results
-// CHECK-NEXT: [3]
-
-func.func @and_i16() -> tensor<i16> {
-  %c63 = mhlo.constant dense<63> : tensor<i16>
-  %c131 = mhlo.constant dense<131> : tensor<i16>
-  %ret = mhlo.and %c63, %c131 : tensor<i16>
-  return %ret : tensor<i16>
-}
-
-// CHECK-LABEL: @and_i16
-// CHECK-NEXT: Results
-// CHECK-NEXT: <i16>: 3
-
-func.func @clz_negative()
-    -> (tensor<1xi8>, tensor<1xi16>, tensor<1xi32>, tensor<1xi64>) {
-  %c-1_8 = mhlo.constant dense<-1> : tensor<1xi8>
-  %c-2_16 = mhlo.constant dense<-2> : tensor<1xi16>
-  %c-4_32 = mhlo.constant dense<-4> : tensor<1xi32>
-  %c-8_64 = mhlo.constant dense<-8> : tensor<1xi64>
-  %clz_8 = mhlo.count_leading_zeros %c-1_8 : tensor<1xi8>
-  %clz_16 = mhlo.count_leading_zeros %c-2_16 : tensor<1xi16>
-  %clz_32 = mhlo.count_leading_zeros %c-4_32 : tensor<1xi32>
-  %clz_64 = mhlo.count_leading_zeros %c-8_64 : tensor<1xi64>
-  return %clz_8, %clz_16, %clz_32, %clz_64
-    : tensor<1xi8>, tensor<1xi16>, tensor<1xi32>, tensor<1xi64>
-}
-
-// CHECK-LABEL: @clz_negative
-// CHECK-NEXT: Results
-// CHECK-NEXT: [0]
-// CHECK-NEXT: [0]
-// CHECK-NEXT: [0]
-// CHECK-NEXT: [0]
-
-func.func @clz_signed()
-    -> (tensor<1xi8>, tensor<1xi16>, tensor<1xi32>, tensor<1xi64>) {
-  %c1_8 = mhlo.constant dense<1> : tensor<1xi8>
-  %c2_16 = mhlo.constant dense<2> : tensor<1xi16>
-  %c4_32 = mhlo.constant dense<4> : tensor<1xi32>
-  %c8_64 = mhlo.constant dense<8> : tensor<1xi64>
-  %clz_8 = mhlo.count_leading_zeros %c1_8 : tensor<1xi8>
-  %clz_16 = mhlo.count_leading_zeros %c2_16 : tensor<1xi16>
-  %clz_32 = mhlo.count_leading_zeros %c4_32 : tensor<1xi32>
-  %clz_64 = mhlo.count_leading_zeros %c8_64 : tensor<1xi64>
-  return %clz_8, %clz_16, %clz_32, %clz_64
-    : tensor<1xi8>, tensor<1xi16>, tensor<1xi32>, tensor<1xi64>
-}
-
-// CHECK-LABEL: @clz_signed
-// CHECK-NEXT: Results
-// CHECK-NEXT: [7]
-// CHECK-NEXT: [14]
-// CHECK-NEXT: [29]
-// CHECK-NEXT: [60]
-
-func.func @clz_unsigned()
-    -> (tensor<1xui8>, tensor<1xui16>, tensor<1xui32>, tensor<1xui64>) {
-  %c1_8 = mhlo.constant dense<1> : tensor<1xui8>
-  %c2_16 = mhlo.constant dense<2> : tensor<1xui16>
-  %c4_32 = mhlo.constant dense<4> : tensor<1xui32>
-  %c8_64 = mhlo.constant dense<8> : tensor<1xui64>
-  %clz_8 = mhlo.count_leading_zeros %c1_8 : tensor<1xui8>
-  %clz_16 = mhlo.count_leading_zeros %c2_16 : tensor<1xui16>
-  %clz_32 = mhlo.count_leading_zeros %c4_32 : tensor<1xui32>
-  %clz_64 = mhlo.count_leading_zeros %c8_64 : tensor<1xui64>
-  return %clz_8, %clz_16, %clz_32, %clz_64
-    : tensor<1xui8>, tensor<1xui16>, tensor<1xui32>, tensor<1xui64>
-}
-
-// CHECK-LABEL: @clz_unsigned
-// CHECK-NEXT: Results
-// CHECK-NEXT: [7]
-// CHECK-NEXT: [14]
-// CHECK-NEXT: [29]
-// CHECK-NEXT: [60]
-
-func.func @divide() -> tensor<1xi32> {
-  %c-10 = mhlo.constant dense<-10> : tensor<1xi32>
-  %c-2 = mhlo.constant dense<-2> : tensor<1xi32>
-  %ret = mhlo.divide %c-10, %c-2 : tensor<1xi32>
-  return %ret : tensor<1xi32>
-}
-
-// CHECK-LABEL: @divide
-// CHECK-NEXT: Results
-// CHECK-NEXT: [5]
-
-func.func @subtract() -> tensor<1xi32> {
-  %c10 = mhlo.constant dense<10> : tensor<1xi32>
-  %c3 = mhlo.constant dense<3> : tensor<1xi32>
-  %ret = mhlo.subtract %c10, %c3 : tensor<1xi32>
-  return %ret : tensor<1xi32>
-}
-
-// CHECK-LABEL: @subtract
-// CHECK-NEXT: Results
-// CHECK-NEXT: [7]
-
-func.func @or() -> tensor<1xi32> {
-  %c3 = mhlo.constant dense<3> : tensor<1xi32>
-  %c10 = mhlo.constant dense<10> : tensor<1xi32>
-  %ret = mhlo.or %c3, %c10 : tensor<1xi32>
-  return %ret : tensor<1xi32>
-}
-
-// CHECK-LABEL: @or
-// CHECK-NEXT: Results
-// CHECK-NEXT: [11]
-
-func.func @max_scalar() -> tensor<i32> {
-  %lhs = mhlo.constant dense<40> : tensor<i32>
-  %rhs = mhlo.constant dense<2> : tensor<i32>
-  %result = mhlo.maximum %lhs, %rhs : tensor<i32>
-  return %result : tensor<i32>
-}
-
-// CHECK-LABEL: @max_scalar
-// CHECK-NEXT: Results
-// CHECK-NEXT: 40
-
-func.func @multiply() -> tensor<1xi32> {
-  %c3 = mhlo.constant dense<3> : tensor<1xi32>
-  %c-5 = mhlo.constant dense<-5> : tensor<1xi32>
-  %ret = mhlo.multiply %c3, %c-5 : tensor<1xi32>
-  return %ret : tensor<1xi32>
-}
-
-// CHECK-LABEL: @multiply
-// CHECK-NEXT: Results
-// CHECK-NEXT: [-15]
-
-func.func @multiply_scalar_i16() -> tensor<i16> {
-  %lhs = mhlo.constant dense<40> : tensor<i16>
-  %rhs = mhlo.constant dense<2> : tensor<i16>
-  %result = mhlo.multiply %lhs, %rhs : tensor<i16>
-  return %result : tensor<i16>
-}
-
-// CHECK-LABEL: @multiply_scalar_i16
-// CHECK-NEXT: Results
-// CHECK-NEXT: <i16>: 80
-
-func.func @multiply_scalar_ui16() -> tensor<ui16> {
-  %lhs = mhlo.constant dense<40> : tensor<ui16>
-  %rhs = mhlo.constant dense<2> : tensor<ui16>
-  %result = mhlo.multiply %lhs, %rhs : tensor<ui16>
-  return %result : tensor<ui16>
-}
-
-// CHECK-LABEL: @multiply_scalar_ui16
-// CHECK-NEXT: Results
-// CHECK-NEXT: <ui16>: 80
-
-func.func @not_i1() -> tensor<2xi1> {
-  %cst = mhlo.constant dense<[false, true]> : tensor<2xi1>
-  %not = mhlo.not %cst : tensor<2xi1>
-  return %not : tensor<2xi1>
-}
-
-// CHECK-LABEL: @not_i1
-// CHECK-NEXT: Results
-// CHECK-NEXT: [true, false]
-
-func.func @not_ui16() -> tensor<ui16> {
-  %cst = mhlo.constant dense<1> : tensor<ui16>
-  %not = mhlo.not %cst : tensor<ui16>
-  return %not : tensor<ui16>
-}
-
-// CHECK-LABEL: @not_ui16
-// CHECK-NEXT: Results
-// CHECK-NEXT: 65534
-
-func.func @popcnt_negative()
-    -> (tensor<1xi8>, tensor<1xi16>, tensor<1xi32>, tensor<1xi64>) {
-  %c-1_8 = mhlo.constant dense<-1> : tensor<1xi8>
-  %c-2_16 = mhlo.constant dense<-2> : tensor<1xi16>
-  %c-4_32 = mhlo.constant dense<-4> : tensor<1xi32>
-  %c-8_64 = mhlo.constant dense<-8> : tensor<1xi64>
-  %pop_8 = mhlo.popcnt %c-1_8 : tensor<1xi8>
-  %pop_16 = mhlo.popcnt %c-2_16 : tensor<1xi16>
-  %pop_32 = mhlo.popcnt %c-4_32 : tensor<1xi32>
-  %pop_64 = mhlo.popcnt %c-8_64 : tensor<1xi64>
-  return %pop_8, %pop_16, %pop_32, %pop_64
-    : tensor<1xi8>, tensor<1xi16>, tensor<1xi32>, tensor<1xi64>
-}
-
-// CHECK-LABEL: @popcnt_negative
-// CHECK-NEXT: Results
-// CHECK-NEXT: [8]
-// CHECK-NEXT: [15]
-// CHECK-NEXT: [30]
-// CHECK-NEXT: [61]
-
-func.func @pow() -> tensor<ui16> {
-  %c2 = mhlo.constant dense<2> : tensor<ui16>
-  %c5 = mhlo.constant dense<5> : tensor<ui16>
-  %pow = mhlo.power %c2, %c5 : tensor<ui16>
-  return %pow : tensor<ui16>
-}
-
-// CHECK-LABEL: @pow
-// CHECK-NEXT: Results
-// CHECK-NEXT: <ui16>: 32
-
-func.func @pow_negative() -> tensor<3xi64> {
-  %c = mhlo.constant dense<[0, 1, 2]> : tensor<3xi64>
-  %c2 = mhlo.constant dense<-1> : tensor<3xi64>
-  %pow = mhlo.power %c, %c2 : tensor<3xi64>
-  return %pow : tensor<3xi64>
-}
-
-// CHECK-LABEL: @pow_negative
-// CHECK-NEXT: Results
-// CHECK-NEXT: [0, 1, 0]
-
-func.func @pow_non_double() -> tensor<i64> {
-  %c3 = mhlo.constant dense<3> : tensor<i64>
-  %c35 = mhlo.constant dense<35> : tensor<i64>
-  // The result of this operation cannot be represented by a double.
-  %pow = mhlo.power %c3, %c35 : tensor<i64>
-  return %pow : tensor<i64>
-}
-
-// CHECK-LABEL: @pow_non_double
-// CHECK-NEXT: Results
-// CHECK-NEXT: 50031545098999707
-
-func.func @rem() -> tensor<4xi32> {
-  %0 = mhlo.constant dense<[5, 66, 5, -1]> : tensor<4xi32>
-  %1 = mhlo.constant dense<[3, 5, 1, -2]> : tensor<4xi32>
-  %2 = mhlo.remainder %0, %1 : tensor<4xi32>
-  func.return %2 : tensor<4xi32>
-}
-
-// CHECK-LABEL: @rem
-// CHECK-NEXT: Results
-// CHECK-NEXT: [2, 1, 0, -1]
-
-func.func @shift_left() -> tensor<2xi32> {
-  %0 = mhlo.constant dense<[3, 7]> : tensor<2xi32>
-  %1 = mhlo.constant dense<[4, 31]> : tensor<2xi32>
-  %ret = mhlo.shift_left %0, %1 : tensor<2xi32>
-  func.return %ret : tensor<2xi32>
-}
-
-// CHECK-LABEL: @shift_left
-// CHECK-NEXT: Results
-// CHECK-NEXT: [48, -2147483648]
-
-func.func @shift_right_arith() -> tensor<2xi32> {
-  %0 = mhlo.constant dense<[100, -100]> : tensor<2xi32>
-  %1 = mhlo.constant dense<[2, 2]> : tensor<2xi32>
-  %ret = mhlo.shift_right_arithmetic %0, %1 : tensor<2xi32>
-  func.return %ret : tensor<2xi32>
-}
-
-// CHECK-LABEL: @shift_right_arith
-// CHECK-NEXT: Results
-// CHECK-NEXT: [25, -25]
-
-func.func @shift_right_arith_ui32() -> tensor<ui32> {
-  %0 = mhlo.constant dense<100> : tensor<ui32>
-  %1 = mhlo.constant dense<2> : tensor<ui32>
-  %ret = mhlo.shift_right_arithmetic %0, %1 : tensor<ui32>
-  func.return %ret : tensor<ui32>
-}
-
-// CHECK-LABEL: @shift_right_arith_ui32
-// CHECK-NEXT: Results
-// CHECK-NEXT: 25
-
-func.func @shift_right_logical() -> tensor<2xi32> {
-  %0 = mhlo.constant dense<[100, -100]> : tensor<2xi32>
-  %1 = mhlo.constant dense<[2, 2]> : tensor<2xi32>
-  %ret = mhlo.shift_right_logical %0, %1 : tensor<2xi32>
-  func.return %ret : tensor<2xi32>
-}
-
-// CHECK-LABEL: @shift_right_logical
-// CHECK-NEXT: Results
-// CHECK-NEXT: [25, 1073741799]
-
-func.func @shift_right_logical_ui32() -> tensor<ui32> {
-  %0 = mhlo.constant dense<100> : tensor<ui32>
-  %1 = mhlo.constant dense<2> : tensor<ui32>
-  %ret = mhlo.shift_right_logical %0, %1 : tensor<ui32>
-  func.return %ret : tensor<ui32>
-}
-
-// CHECK-LABEL: @shift_right_logical_ui32
-// CHECK-NEXT: Results
-// CHECK-NEXT: 25
diff --git a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/mhlo/iota.mlir b/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/mhlo/iota.mlir
deleted file mode 100644
index e2d12d998e0bb5..00000000000000
--- a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/mhlo/iota.mlir
+++ /dev/null
@@ -1,30 +0,0 @@
-// RUN: mlir-interpreter-runner %s -run-all | FileCheck %s
-
-func.func @iota_f32() -> tensor<1x2x3x4xf32> {
-  %result = "mhlo.iota"() {
-    iota_dimension = 2 : i64
-  } : () -> tensor<1x2x3x4xf32>
-  func.return %result : tensor<1x2x3x4xf32>
-}
-
-// CHECK-LABEL: @iota_f32
-// CHECK-NEXT: Results
-// CHECK-NEXT{LITERAL}: [[[[0.000000e+00, 0.000000e+00, 0.000000e+00, 0.000000e+00],
-// CHECK{LITERAL}:         [1.000000e+00, 1.000000e+00, 1.000000e+00, 1.000000e+00],
-// CHECK{LITERAL}:         [2.000000e+00, 2.000000e+00, 2.000000e+00, 2.000000e+00]]]]
-
-func.func @iota_i32() -> tensor<1x2x3x4xi32> {
-  %result = "mhlo.iota"() {
-    iota_dimension = 3 : i64
-  } : () -> tensor<1x2x3x4xi32>
-  func.return %result : tensor<1x2x3x4xi32>
-}
-
-// CHECK-LABEL: @iota_i32
-// CHECK-NEXT: Results
-// CHECK-NEXT{LITERAL}: [[[[0, 1, 2, 3],
-// CHECK{LITERAL}          [0, 1, 2, 3],
-// CHECK{LITERAL}          [0, 1, 2, 3]],
-// CHECK{LITERAL}         [[0, 1, 2, 3],
-// CHECK{LITERAL}          [0, 1, 2, 3],
-// CHECK{LITERAL}          [0, 1, 2, 3]]]]
diff --git a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/mhlo/pad.mlir b/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/mhlo/pad.mlir
deleted file mode 100644
index 1f563b2d5ec57f..00000000000000
--- a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/mhlo/pad.mlir
+++ /dev/null
@@ -1,56 +0,0 @@
-// RUN: mlir-interpreter-runner %s -run-all | FileCheck %s
-
-func.func @static_pad() -> tensor<2x4x7xi32> {
-  // lax.pad(np.array([[[1,2,3],[4,5,6]]]), 42,
-  //                  [(0, 1, 0), (1, 1, 0), (2, 0, 1)])
-  %cst = mhlo.constant dense<[[[1,2,3],[4,5,6]]]> : tensor<1x2x3xi32>
-  %pad_value = mhlo.constant dense<42> : tensor<i32>
-  %0 = "mhlo.pad"(%cst, %pad_value) {
-    edge_padding_low = dense<[0, 1, 2]> : tensor<3xi64>,
-    edge_padding_high = dense<[1, 1, 0]> : tensor<3xi64>,
-    interior_padding = dense<[0, 0, 1]> : tensor<3xi64>
-  } : (tensor<1x2x3xi32>, tensor<i32>) -> tensor<2x4x7xi32>
-  func.return %0 : tensor<2x4x7xi32>
-}
-
-// CHECK-LABEL: @static_pad
-// CHECK-NEXT: Results
-// CHECK-NEXT{LITERAL}: [[[42, 42, 42, 42, 42, 42, 42],
-// CHECK-SAME{LITERAL}:   [42, 42, 1, 42, 2, 42, 3]
-// CHECK-SAME{LITERAL}:   [42, 42, 4, 42, 5, 42, 6],
-// CHECK-SAME{LITERAL}:   [42, 42, 42, 42, 42, 42, 42]],
-// CHECK-SAME{LITERAL}:  [[42, 42, 42, 42, 42, 42, 42],
-// CHECK-SAME{LITERAL}:   [42, 42, 42, 42, 42, 42, 42],
-// CHECK-SAME{LITERAL}:   [42, 42, 42, 42, 42, 42, 42],
-// CHECK-SAME{LITERAL}:   [42, 42, 42, 42, 42, 42, 42]]]
-
-func.func @dynamic_pad() -> tensor<?x4xi32> {
-  %c1 = arith.constant 1 : index
-  %empty = tensor.empty(%c1) : tensor<?x2xi32>
-  %pad_value = mhlo.constant dense<42> : tensor<i32>
-  %0 = "mhlo.pad"(%empty, %pad_value) {
-    edge_padding_low = dense<[0, 1]> : tensor<2xi64>,
-    edge_padding_high = dense<[1, 1]> : tensor<2xi64>,
-    interior_padding = dense<[0, 0]> : tensor<2xi64>
-  } : (tensor<?x2xi32>, tensor<i32>) -> tensor<?x4xi32>
-  func.return %0 : tensor<?x4xi32>
-}
-
-// CHECK-LABEL: @dynamic_pad
-// CHECK-NEXT: Results
-// CHECK-NEXT{LITERAL}: [[42, 0, 0, 42], [42, 42, 42, 42]]
-
-func.func @negative_pad() -> tensor<10xi32> {
-  %empty = arith.constant dense<[1,2,3,4,5,6,7]> : tensor<7xi32>
-  %pad_value = mhlo.constant dense<42> : tensor<i32>
-  %0 = "mhlo.pad"(%empty, %pad_value) {
-    edge_padding_low = dense<[-2]> : tensor<1xi64>,
-    edge_padding_high = dense<[-1]> : tensor<1xi64>,
-    interior_padding = dense<[1]> : tensor<1xi64>
-  } : (tensor<7xi32>, tensor<i32>) -> tensor<10xi32>
-  func.return %0 : tensor<10xi32>
-}
-
-// CHECK-LABEL: @negative_pad
-// CHECK-NEXT: Results
-// CHECK-NEXT{LITERAL}: [2, 42, 3, 42, 4, 42, 5, 42, 6, 42]
diff --git a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/mhlo/reduce.mlir b/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/mhlo/reduce.mlir
deleted file mode 100644
index f371769413c2c5..00000000000000
--- a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/mhlo/reduce.mlir
+++ /dev/null
@@ -1,17 +0,0 @@
-// RUN: mlir-interpreter-runner %s -run-all | FileCheck %s
-
-func.func @reduce() -> tensor<3xi32> {
-  %cst = mhlo.constant dense<[[1, 2, 3], [4, 5, 6]]> : tensor<2x3xi32>
-  %init = mhlo.constant dense<1> : tensor<i32>
-  %reduce = mhlo.reduce(%cst init: %init) across dimensions = [0]
-      : (tensor<2x3xi32>, tensor<i32>) -> tensor<3xi32>
-    reducer(%arg0: tensor<i32>, %arg1: tensor<i32>)  {
-      %0 = mhlo.add %arg0, %arg1 : tensor<i32>
-      mhlo.return %0 : tensor<i32>
-    }
-  return %reduce : tensor<3xi32>
-}
-
-// CHECK-LABEL: @reduce
-// CHECK-NEXT: Results
-// CHECK-NEXT: [6, 8, 10]
diff --git a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/mhlo/reshape.mlir b/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/mhlo/reshape.mlir
deleted file mode 100644
index 117489375b00e7..00000000000000
--- a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/mhlo/reshape.mlir
+++ /dev/null
@@ -1,34 +0,0 @@
-// RUN: mlir-interpreter-runner %s -run-all | FileCheck %s
-
-func.func @reshape() -> tensor<2x4x2xi32> {
-  %cst = mhlo.constant dense<
-    [[0, 1, 2, 3], [4, 5, 6, 7],
-     [8, 9, 10, 11], [12, 13, 14, 15]]> : tensor<4x4xi32>
-  %reshape = mhlo.reshape %cst : (tensor<4x4xi32>) -> tensor<2x4x2xi32>
-  func.return %reshape : tensor<2x4x2xi32>
-}
-
-// CHECK-LABEL: @reshape
-// CHECK-NEXT: Results
-// CHECK-NEXT{LITERAL}: <2x4x2xi32>: [[[0, 1], [2, 3], [4, 5], [6, 7]],
-// CHECK-SAME{LITERAL}:               [[8, 9], [10, 11], [12, 13], [14, 15]]]
-
-func.func @reshape_0d_1d() -> tensor<1xi32> {
-  %cst = mhlo.constant dense<42> : tensor<i32>
-  %reshape = mhlo.reshape %cst : (tensor<i32>) -> tensor<1xi32>
-  func.return %reshape : tensor<1xi32>
-}
-
-// CHECK-LABEL: @reshape_0d_1d
-// CHECK-NEXT: Results
-// CHECK-NEXT: [42]
-
-func.func @reshape_1d_0d() -> tensor<i32> {
-  %cst = mhlo.constant dense<42> : tensor<1xi32>
-  %reshape = mhlo.reshape %cst : (tensor<1xi32>) -> tensor<i32>
-  func.return %reshape : tensor<i32>
-}
-
-// CHECK-LABEL: @reshape_1d_0d
-// CHECK-NEXT: Results
-// CHECK-NEXT: 42
diff --git a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/mhlo/scatter.mlir b/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/mhlo/scatter.mlir
deleted file mode 100644
index 85bb6225f9ba54..00000000000000
--- a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/mhlo/scatter.mlir
+++ /dev/null
@@ -1,55 +0,0 @@
-// RUN: mlir-interpreter-runner %s -run-all | FileCheck %s
-
-func.func @bounds_check() -> tensor<10xi32> {
-  // operand = np.zeros([8], dtype=np.int32)
-  // indices = np.array([[1], [8], [-1]])
-  // updates = np.array([[4, 5, 6], [6, 7, 8], [8, 9, 10]])
-  // lax.scatter_add(operand, indices, updates,
-  //   dimension_numbers=lax.ScatterDimensionNumbers(
-  //      update_window_dims=(0,), inserted_window_dims=(),
-  //      scatter_dims_to_operand_dims=(0,)))
-  %operand = mhlo.constant dense<0> : tensor<10xi32>
-  %indices = mhlo.constant dense<[[1], [8], [-1]]> : tensor<3x1xi32>
-  %updates = mhlo.constant dense<[[4, 5, 6], [6, 7, 8], [8, 9, 10]]> : tensor<3x3xi32>
-  %scatter = "mhlo.scatter"(%operand, %indices, %updates) ({
-  ^bb0(%lhs: tensor<i32>, %rhs: tensor<i32>):
-    %add = mhlo.add %lhs, %rhs : tensor<i32>
-    "mhlo.return"(%add) : (tensor<i32>) -> ()
-  }) {
-    scatter_dimension_numbers = #mhlo.scatter<
-      update_window_dims = [0],
-      inserted_window_dims = [],
-      index_vector_dim = 1,
-      scatter_dims_to_operand_dims = [0]
-    >
-  } : (tensor<10xi32>, tensor<3x1xi32>, tensor<3x3xi32>) -> tensor<10xi32>
-  return %scatter : tensor<10xi32>
-}
-
-// CHECK-LABEL: @bounds_check
-// CHECK-NEXT: Results
-// CHECK-NEXT{LITERAL}: [0, 4, 6, 8, 0, 0, 0, 0, 0, 0]
-
-func.func @update_last_element() -> tensor<2xi32> {
-  %operand = mhlo.constant dense<[1, 1]> : tensor<2xi32>
-  %indices = mhlo.constant dense<[[1]]> : tensor<1x1xi32>
-  %updates = mhlo.constant dense<[[0]]> : tensor<1x1xi32>
-
-  %scatter = "mhlo.scatter"(%operand, %indices, %updates) ({
-    ^bb0(%arg3: tensor<i32>, %arg4: tensor<i32>):
-      "mhlo.return"(%arg4) : (tensor<i32>) -> ()
-    }) {
-      indices_are_sorted = false,
-      scatter_dimension_numbers = #mhlo.scatter<
-        update_window_dims = [1],
-        scatter_dims_to_operand_dims = [0],
-        index_vector_dim = 1
-      >,
-      unique_indices = false
-    } : (tensor<2xi32>, tensor<1x1xi32>, tensor<1x1xi32>) -> tensor<2xi32>
-  return %scatter : tensor<2xi32>
-}
-
-// CHECK-LABEL: @update_last_element
-// CHECK-NEXT: Results
-// CHECK-NEXT{LITERAL}: [1, 0]
diff --git a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/mhlo/select.mlir b/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/mhlo/select.mlir
deleted file mode 100644
index 5959b3cf105b89..00000000000000
--- a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/mhlo/select.mlir
+++ /dev/null
@@ -1,14 +0,0 @@
-// RUN: mlir-interpreter-runner %s -run-all | FileCheck %s
-
-func.func @reshape() -> tensor<2xi32> {
-  %cst = mhlo.constant dense<[true, false]> : tensor<2xi1>
-  %a = mhlo.constant dense<[1, 2]> : tensor<2xi32>
-  %b = mhlo.constant dense<[3, 4]> : tensor<2xi32>
-  %ret = "mhlo.select"(%cst, %a, %b) :
-    (tensor<2xi1>, tensor<2xi32>, tensor<2xi32>) -> tensor<2xi32>
-  return %ret : tensor<2xi32>
-}
-
-// CHECK-LABEL: @reshape
-// CHECK-NEXT: Results
-// CHECK-NEXT: [1, 4]
diff --git a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/mhlo/slice.mlir b/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/mhlo/slice.mlir
deleted file mode 100644
index c5bb53f8740cc0..00000000000000
--- a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/mhlo/slice.mlir
+++ /dev/null
@@ -1,16 +0,0 @@
-// RUN: mlir-interpreter-runner %s -run-all | FileCheck %s
-
-func.func @slice() -> tensor<1x2xi32> {
-  %cst = mhlo.constant dense<[[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12]]>
-    : tensor<3x4xi32>
-  %0 = "mhlo.slice"(%cst) {
-    start_indices = dense<[1, 0]> : tensor<2xi64>,
-    limit_indices = dense<[2, 4]> : tensor<2xi64>,
-    strides = dense<[1, 2]> : tensor<2xi64>
-  } : (tensor<3x4xi32>) -> tensor<1x2xi32>
-  func.return %0 : tensor<1x2xi32>
-}
-
-// CHECK-LABEL: @slice
-// CHECK-NEXT: Results
-// CHECK-NEXT{LITERAL}: [[5, 7]]
diff --git a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/mhlo/sort.mlir b/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/mhlo/sort.mlir
deleted file mode 100644
index fa163718e91fbf..00000000000000
--- a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/mhlo/sort.mlir
+++ /dev/null
@@ -1,25 +0,0 @@
-// RUN: mlir-interpreter-runner %s -run-all | FileCheck %s
-
-func.func @sort() -> (tensor<2x5xf32>, tensor<2x5xi32>) {
-  %input0 = arith.constant dense<[
-    [4.0, 2.0, 1.0, 5.0, 3.0],
-    [6.0, 9.0, 8.0, 7.0, 10.0]
-  ]> : tensor<2x5xf32>
-
-  %input1 = arith.constant dense<[
-    [1, 2, 3, 4,  5],
-    [6, 7, 8, 9, 10]
-  ]> : tensor<2x5xi32>
-  %0, %1 = "mhlo.sort"(%input0, %input1) ({
-  ^bb0(%arg0: tensor<f32>, %arg1: tensor<f32>, %arg2: tensor<i32>, %arg3: tensor<i32>):
-    %7 = "mhlo.compare"(%arg0, %arg1) {comparison_direction = #mhlo<comparison_direction LT>} : (tensor<f32>, tensor<f32>) -> tensor<i1>
-    "mhlo.return"(%7) : (tensor<i1>) -> ()
-  }) {dimension = 1 : i64, is_stable = true} : (tensor<2x5xf32>, tensor<2x5xi32>) -> (tensor<2x5xf32>, tensor<2x5xi32>)
-
-  return %0, %1 : tensor<2x5xf32>, tensor<2x5xi32>
-}
-
-// CHECK-LABEL: @sort
-// CHECK-NEXT: Results
-// CHECK-NEXT{LITERAL}: [[1.000000e+00, 2.000000e+00, 3.000000e+00, 4.000000e+00, 5.000000e+00], [6.000000e+00, 7.000000e+00, 8.000000e+00, 9.000000e+00, 1.000000e+01]]
-// CHECK-NEXT{LITERAL}: [[3, 2, 5, 1, 4], [6, 9, 8, 7, 10]]
diff --git a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/mhlo/subtract.mlir b/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/mhlo/subtract.mlir
deleted file mode 100644
index 334e8421863ef2..00000000000000
--- a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/mhlo/subtract.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-// RUN: mlir-interpreter-runner %s | FileCheck %s
-
-func.func @main() -> tensor<2x3xi64> {
-  %lhs = mhlo.constant dense<[[0, 1, 2], [3, 4, 5]]> : tensor<2x3xi64>
-  %rhs = mhlo.constant dense<[[10, 20, 30], [40, 50, 60]]> : tensor<2x3xi64>
-  %result = mhlo.subtract %lhs, %rhs : tensor<2x3xi64>
-  return %result : tensor<2x3xi64>
-}
-
-// CHECK{LITERAL}: [[-10, -19, -28], [-37, -46, -55]]
\ No newline at end of file
diff --git a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/mhlo/transpose.mlir b/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/mhlo/transpose.mlir
deleted file mode 100644
index 09f9822aa8daa8..00000000000000
--- a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/mhlo/transpose.mlir
+++ /dev/null
@@ -1,28 +0,0 @@
-// RUN: mlir-interpreter-runner %s -run-all | FileCheck %s
-
-func.func @transpose() -> tensor<2x1x4x3xi32> {
-  %0 = mhlo.constant dense<[[[
-      [000, 001, 002, 003],
-      [010, 011, 012, 013],
-      [020, 021, 022, 023]
-    ],
-    [
-      [100, 101, 102, 103],
-      [110, 111, 112, 113],
-      [120, 121, 122, 123]
-    ]]]> : tensor<1x2x3x4xi32>
-  %1 = "mhlo.transpose"(%0) {permutation = dense<[1, 0, 3, 2]> : tensor<4xi64>}
-    : (tensor<1x2x3x4xi32>) -> tensor<2x1x4x3xi32>
-  return %1 : tensor<2x1x4x3xi32>
-}
-
-// CHECK-LABEL: @transpose
-// CHECK-NEXT: Results
-// CHECK-NEXT{LITERAL}: [[[[0, 10, 20],
-// CHECK{LITERAL}:         [1, 11, 21],
-// CHECK{LITERAL}:         [2, 12, 22],
-// CHECK{LITERAL}:         [3, 13, 23]]],
-// CHECK{LITERAL}:       [[[100, 110, 120],
-// CHECK{LITERAL}:         [101, 111, 121],
-// CHECK{LITERAL}:         [102, 112, 122],
-// CHECK{LITERAL}:         [103, 113, 123]]]]
diff --git a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/mhlo/tuple.mlir b/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/mhlo/tuple.mlir
deleted file mode 100644
index 02cb16c72e8b5d..00000000000000
--- a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/mhlo/tuple.mlir
+++ /dev/null
@@ -1,30 +0,0 @@
-// RUN: mlir-interpreter-runner %s -run-all | FileCheck %s
-
-func.func @tuple() -> tuple<tensor<2xi1>, tensor<i32>> {
-  %cst = mhlo.constant dense<[true, false]> : tensor<2xi1>
-  %c1 = mhlo.constant dense<1> : tensor<i32>
-  %ret = "mhlo.tuple"(%cst, %c1)
-    : (tensor<2xi1>, tensor<i32>) -> tuple<tensor<2xi1>, tensor<i32>>
-  return %ret : tuple<tensor<2xi1>, tensor<i32>>
-}
-
-// CHECK-LABEL: @tuple
-// CHECK-NEXT: Results
-// CHECK-NEXT: (TensorOrMemref<2xi1>: [true, false], TensorOrMemref<i32>: 1)
-
-func.func @get_tuple_element() -> (tensor<2xi1>, tensor<i32>) {
-  %cst = mhlo.constant dense<[true, false]> : tensor<2xi1>
-  %c42 = mhlo.constant dense<42> : tensor<i32>
-  %tuple = "mhlo.tuple"(%cst, %c42)
-    : (tensor<2xi1>, tensor<i32>) -> tuple<tensor<2xi1>, tensor<i32>>
-  %r0 = "mhlo.get_tuple_element"(%tuple) {index = 0 : i32}
-    : (tuple<tensor<2xi1>, tensor<i32>>) -> tensor<2xi1>
-  %r1 = "mhlo.get_tuple_element"(%tuple) {index = 1 : i32}
-    : (tuple<tensor<2xi1>, tensor<i32>>) -> tensor<i32>
-  return %r0, %r1 : tensor<2xi1>, tensor<i32>
-}
-
-// CHECK-LABEL: @get_tuple_element
-// CHECK-NEXT: Results
-// CHECK-NEXT: [true, false]
-// CHECK-NEXT: 42
diff --git a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/mhlo/while.mlir b/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/mhlo/while.mlir
deleted file mode 100644
index 4525174d7af4be..00000000000000
--- a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/mhlo/while.mlir
+++ /dev/null
@@ -1,25 +0,0 @@
-// RUN: mlir-interpreter-runner %s -run-all | FileCheck %s
-
-func.func @while() -> (tensor<i32>, tensor<i32>) {
-  %c0 = mhlo.constant dense<0> : tensor<i32>
-  %c1 = mhlo.constant dense<1> : tensor<i32>
-  %c10 = mhlo.constant dense<10> : tensor<i32>
-  %3:2 = "mhlo.while"(%c0, %c1) ({
-    ^bb0(%arg0: tensor<i32>, %arg1: tensor<i32>):
-      %4 = "mhlo.compare"(%arg0, %c10) {
-        comparison_direction = #mhlo<comparison_direction LT>
-      } : (tensor<i32>, tensor<i32>) -> tensor<i1>
-      "mhlo.return"(%4) : (tensor<i1>) -> ()
-    },  {
-    ^bb0(%arg0: tensor<i32>, %arg1: tensor<i32>):
-      %5 = mhlo.add %arg0, %c1 : tensor<i32>
-      %6 = mhlo.add %arg1, %arg1 : tensor<i32>
-      "mhlo.return"(%5, %6) : (tensor<i32>, tensor<i32>) -> ()
-    }) : (tensor<i32>, tensor<i32>) -> (tensor<i32>, tensor<i32>)
-  func.return %3#0, %3#1 : tensor<i32>, tensor<i32>
-}
-
-// CHECK-LABEL: @while
-// CHECK-NEXT: Results
-// CHECK-NEXT: TensorOrMemref<i32>: 10
-// CHECK-NEXT: TensorOrMemref<i32>: 1024
diff --git a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/scf/for.mlir b/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/scf/for.mlir
deleted file mode 100644
index bad5951f4b4ca9..00000000000000
--- a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/scf/for.mlir
+++ /dev/null
@@ -1,82 +0,0 @@
-
-// RUN: mlir-interpreter-runner %s -run-all | FileCheck %s
-
-func.func @for() -> memref<4xi64> {
-  %c0 = arith.constant 0 : index
-  %c2 = arith.constant 2 : index
-  %c4 = arith.constant 4 : index
-  %alloc = memref.alloc() : memref<4xi64>
-  scf.for %i = %c0 to %c4 step %c2 {
-    %1 = arith.index_cast %i: index to i64
-    memref.store %1, %alloc[%i]: memref<4xi64>
-  }
-  return %alloc : memref<4xi64>
-}
-
-// CHECK-LABEL: @for
-// CHECK: Results
-// CHECK-NEXT{LITERAL}: [0, 0, 2, 0]
-
-func.func @nested() -> memref<2x2xindex> {
-  %c0 = arith.constant 0 : index
-  %c1 = arith.constant 1 : index
-  %c2 = arith.constant 2 : index
-  %alloc = memref.alloc() : memref<2x2xindex>
-  scf.for %i = %c0 to %c2 step %c1 {
-    scf.for %j = %c0 to %c2 step %c1 {
-      memref.store %c1, %alloc[%i, %j]: memref<2x2xindex>
-    }
-  }
-  return %alloc : memref<2x2xindex>
-}
-
-// CHECK-LABEL: @nested
-// CHECK-NEXT: Results
-// CHECK-NEXT{LITERAL}: [[1, 1], [1, 1]]
-
-func.func @iter_arg() -> index {
-  %c0 = arith.constant 0 : index
-  %c4 = arith.constant 4 : index
-  %c1 = arith.constant 1 : index
-  %sum = scf.for %i = %c0 to %c4 step %c1 iter_args(%x = %c1) -> index {
-    %sum = arith.addi %i, %x : index
-    scf.yield %sum : index
-  }
-  return %sum : index
-}
-
-// CHECK-LABEL: @iter_arg
-// CHECK-NEXT: Results
-// CHECK-NEXT{LITERAL}: i64: 7
-
-func.func @int32() -> memref<4xi32> {
-  %c0 = arith.constant 0 : i32
-  %c1 = arith.constant 1 : i32
-  %c4 = arith.constant 4 : i32
-  %alloc = memref.alloc() : memref<4xi32>
-  scf.for %i = %c0 to %c4 step %c1 : i32 {
-    %index = arith.index_cast %i : i32 to index
-    memref.store %i, %alloc[%index]: memref<4xi32>
-  }
-  return %alloc : memref<4xi32>
-}
-
-// CHECK-LABEL: int32
-// CHECK-NEXT: Results
-// CHECK-NEXT{LITERAL}: <4xi32>: [0, 1, 2, 3]
-
-func.func @int16() -> memref<4xi16> {
-  %c0 = arith.constant 0 : i16
-  %c1 = arith.constant 1 : i16
-  %c4 = arith.constant 4 : i16
-  %alloc = memref.alloc() : memref<4xi16>
-  scf.for %i = %c0 to %c4 step %c1 : i16 {
-    %index = arith.index_cast %i : i16 to index
-    memref.store %i, %alloc[%index]: memref<4xi16>
-  }
-  return %alloc : memref<4xi16>
-}
-
-// CHECK-LABEL: int16
-// CHECK-NEXT: Results
-// CHECK-NEXT{LITERAL}: <4xi16>: [0, 1, 2, 3]
diff --git a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/scf/forall.mlir b/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/scf/forall.mlir
deleted file mode 100644
index 29bec5e463ea92..00000000000000
--- a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/scf/forall.mlir
+++ /dev/null
@@ -1,62 +0,0 @@
-
-// RUN: mlir-interpreter-runner %s -run-all | FileCheck %s
-
-func.func @add() -> (tensor<3xi32>, tensor<3xi32>) {
-  %c3 = arith.constant 3 : index
-  %in = arith.constant dense<[1, 2, 3]> : tensor<3xi32>
-  %out = arith.constant dense<[4, 5, 6]> : tensor<3xi32>
-
-  %result = scf.forall (%i) in (%c3) shared_outs(%o = %out) -> tensor<3xi32> {
-    %addend = tensor.extract_slice %in[%i][1][1] : tensor<3xi32> to tensor<1xi32>
-    %augend = tensor.extract_slice %out[%i][1][1] : tensor<3xi32> to tensor<1xi32>
-    %sum = mhlo.add %augend, %addend : tensor<1xi32>
-    scf.forall.in_parallel {
-      tensor.parallel_insert_slice %sum into %o[%i][1][1]
-        : tensor<1xi32> into tensor<3xi32>
-    }
-  }
-
-  return %out, %result : tensor<3xi32>, tensor<3xi32>
-}
-
-// CHECK-LABEL: @add
-// CHECK-NEXT: Results
-// CHECK-NEXT: [4, 5, 6]
-// CHECK-NEXT: [5, 7, 9]
-
-func.func @bufferized_add() -> memref<3xi32> {
-  %c3 = arith.constant 3 : index
-  %in = arith.constant dense<[1, 2, 3]> : tensor<3xi32>
-  %out = arith.constant dense<[4, 5, 6]> : memref<3xi32>
-
-  scf.forall (%i) in (%c3) {
-    %addend = tensor.extract %in[%i] : tensor<3xi32>
-    %augend = memref.load %out[%i] : memref<3xi32>
-    %sum = arith.addi %augend, %addend : i32
-    memref.store %sum, %out[%i] : memref<3xi32>
-  }
-
-  return %out : memref<3xi32>
-}
-
-// CHECK-LABEL: @bufferized_add
-// CHECK-NEXT: Results
-// CHECK-NEXT: [5, 7, 9]
-
-func.func @step() -> memref<4xi32> {
-  %c2 = arith.constant 2 : index
-  %c4 = arith.constant 4 : index
-
-  %c42 = arith.constant 42 : i32
-  %out = arith.constant dense<[1, 1, 1, 1]> : memref<4xi32>
-
-  scf.forall (%i) = (0) to (%c4) step (%c2) {
-    memref.store %c42, %out[%i] : memref<4xi32>
-  }
-
-  return %out : memref<4xi32>
-}
-
-// CHECK-LABEL: @step
-// CHECK-NEXT: Results
-// CHECK-NEXT: [42, 1, 42, 1]
diff --git a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/scf/if.mlir b/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/scf/if.mlir
deleted file mode 100644
index 52d899d13328a6..00000000000000
--- a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/scf/if.mlir
+++ /dev/null
@@ -1,69 +0,0 @@
-// RUN: mlir-interpreter-runner %s -run-all | FileCheck %s
-
-func.func @true() -> i64 {
-  %c0 = arith.constant 0 : i64
-  %c1 = arith.constant 1 : i64
-  %true = arith.constant true
-  %ret = scf.if %true -> i64 {
-    scf.yield %c0 : i64
-  } else {
-    scf.yield %c1 : i64
-  }
-  return %ret : i64
-}
-
-// CHECK-LABEL: @true
-// CHECK-NEXT: Results
-// CHECK-NEXT: i64: 0
-
-func.func @false() -> i64 {
-  %c2 = arith.constant 2 : i64
-  %c3 = arith.constant 3 : i64
-  %false = arith.constant false
-  %ret = scf.if %false -> i64 {
-    scf.yield %c2 : i64
-  } else {
-    scf.yield %c3 : i64
-  }
-  return %ret : i64
-}
-
-// CHECK-LABEL: @false
-// CHECK-NEXT: Results
-// CHECK-NEXT: i64: 3
-
-func.func @side_effect() -> memref<i64> {
-  %alloc = memref.alloc() : memref<i64>
-  %true = arith.constant true
-  %c124 = arith.constant 124 : i64
-  %c125 = arith.constant 125 : i64
-  scf.if %true {
-    memref.store %c124, %alloc[] : memref<i64>
-    scf.yield
-  } else {
-    memref.store %c125, %alloc[] : memref<i64>
-    scf.yield
-  }
-  return %alloc : memref<i64>
-}
-
-// CHECK-LABEL: @side_effect
-// CHECK-NEXT: Results
-// CHECK-NEXT: <i64>: 124
-
-func.func @side_effect_not_executed() -> memref<i64> {
-  %alloc = memref.alloc() : memref<i64>
-  %false = arith.constant false
-  %c126 = arith.constant 126 : i64
-  memref.store %c126, %alloc[] : memref<i64>
-  %c127 = arith.constant 127 : i64
-  scf.if %false {
-    memref.store %c127, %alloc[] : memref<i64>
-    scf.yield
-  }
-  return %alloc : memref<i64>
-}
-
-// CHECK-LABEL: @side_effect_not_executed
-// CHECK-NEXT: Results
-// CHECK-NEXT: <i64>: 126
diff --git a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/scf/parallel.mlir b/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/scf/parallel.mlir
deleted file mode 100644
index 41bcb08ac484ef..00000000000000
--- a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/scf/parallel.mlir
+++ /dev/null
@@ -1,44 +0,0 @@
-// RUN: mlir-interpreter-runner %s -run-all | FileCheck %s
-
-func.func @parallel() -> memref<4x4xi32> {
-  %ret = memref.alloc() : memref<4x4xi32>
-  %c0 = arith.constant 0 : index
-  %c1 = arith.constant 1 : index
-  %c2 = arith.constant 2 : index
-  %c4 = arith.constant 4 : index
-  %c9 = arith.constant 9 : i32
-  scf.parallel (%i, %j) = (%c0, %c1) to (%c4, %c4) step (%c1, %c2) {
-    memref.store %c9, %ret[%i, %j] : memref<4x4xi32>
-  }
-  return %ret : memref<4x4xi32>
-}
-
-// CHECK-LABEL: @parallel
-// CHECK-NEXT: Results
-// CHECK-NEXT{LITERAL}: [[0, 9, 0, 9], [0, 9, 0, 9], [0, 9, 0, 9], [0, 9, 0, 9]]
-
-func.func @reduce_2() -> (index, index) {
-  %c1 = arith.constant 1 : index
-  %c6 = arith.constant 6 : index
-  %ret:2 = scf.parallel (%i) = (%c1) to (%c6) step (%c1)
-             init (%c1, %c1) -> (index, index) {
-    scf.reduce (%i) : index {
-      ^bb0(%lhs: index, %rhs: index):
-        %ret = arith.muli %lhs, %rhs : index
-        scf.reduce.return %ret : index
-    }
-
-    scf.reduce (%i) : index {
-      ^bb0(%lhs: index, %rhs: index):
-        %ret = arith.addi %lhs, %rhs : index
-        scf.reduce.return %ret : index
-    }
-  }
-  return %ret#0, %ret#1 : index, index
-}
-
-// CHECK-LABEL: @reduce_2
-// CHECK-NEXT: Results
-// CHECK-NEXT{LITERAL}: i64: 120
-// CHECK-NEXT{LITERAL}: i64: 16
-
diff --git a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/scf/while.mlir b/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/scf/while.mlir
deleted file mode 100644
index 1be3814f40365f..00000000000000
--- a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/scf/while.mlir
+++ /dev/null
@@ -1,45 +0,0 @@
-// RUN: mlir-interpreter-runner %s -run-all | FileCheck %s
-
-func.func @while_empty() -> memref<i64> {
-  %c0 = arith.constant 0 : i64
-  %c1 = arith.constant 1 : i64
-  %c4 = arith.constant 4 : i64
-  %alloc = memref.alloc() : memref<i64>
-  memref.store %c0, %alloc[] : memref<i64>
-  scf.while: () -> () {
-    %value = memref.load %alloc[] : memref<i64>
-    %cond = arith.cmpi slt, %value, %c4 : i64
-    scf.condition(%cond)
-  } do {
-    %value = memref.load %alloc[] : memref<i64>
-    %add = arith.addi %value, %c1 : i64
-    memref.store %add, %alloc[] : memref<i64>
-    scf.yield
-  }
-  return %alloc : memref<i64>
-}
-
-// CHECK-LABEL: @while_empty
-// CHECK-NEXT: Results
-// CHECK-NEXT{LITERAL}: TensorOrMemref<i64>: 4
-
-func.func @while_var() -> i64 {
-  %c0 = arith.constant 0 : i64
-  %c1 = arith.constant 1 : i64
-  %c4 = arith.constant 4 : i64
-  %alloc = memref.alloc() : memref<i64>
-  memref.store %c0, %alloc[] : memref<i64>
-  %ret = scf.while(%arg0 = %c0): (i64) -> (i64) {
-    %cond = arith.cmpi slt, %arg0, %c4 : i64
-    scf.condition(%cond) %arg0 : i64
-  } do {
-  ^bb0(%arg1: i64):
-    %add = arith.addi %arg1, %c1 : i64
-    scf.yield %add : i64
-  }
-  return %ret : i64
-}
-
-// CHECK-LABEL: @while_var
-// CHECK-NEXT: Results
-// CHECK-NEXT{LITERAL}: i64: 4
diff --git a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/tensor/collapse_shape.mlir b/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/tensor/collapse_shape.mlir
deleted file mode 100644
index 1201a2a342e73a..00000000000000
--- a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/tensor/collapse_shape.mlir
+++ /dev/null
@@ -1,42 +0,0 @@
-// RUN: mlir-interpreter-runner %s -run-all | FileCheck %s
-
-func.func @collapse_shape()
-    -> (tensor<1x6xi32>, tensor<2x3xi32>, tensor<6xi32>) {
-  %cst = arith.constant dense<[[[1, 2, 3], [4, 5, 6]]]> : tensor<1x2x3xi32>
-  %collapse1 = tensor.collapse_shape %cst [[0], [1, 2]]
-      : tensor<1x2x3xi32> into tensor<1x6xi32>
-  %collapse2 = tensor.collapse_shape %cst [[0, 1], [2]]
-      : tensor<1x2x3xi32> into tensor<2x3xi32>
-  %collapse3 = tensor.collapse_shape %cst [[0, 1, 2]]
-      : tensor<1x2x3xi32> into tensor<6xi32>
-  return %collapse1, %collapse2, %collapse3
-      : tensor<1x6xi32>, tensor<2x3xi32>, tensor<6xi32>
-}
-
-// CHECK-LABEL: @collapse_shape
-// CHECK-NEXT: Results
-// CHECK-NEXT{LITERAL}: [[1, 2, 3, 4, 5, 6]]
-// CHECK-NEXT{LITERAL}: [[1, 2, 3], [4, 5, 6]]
-// CHECK-NEXT{LITERAL}: [1, 2, 3, 4, 5, 6]
-
-func.func @to_unit() -> tensor<i32> {
-  %cst = arith.constant dense<42> : tensor<1x1x1x1xi32>
-  %collapse = tensor.collapse_shape %cst []
-    : tensor<1x1x1x1xi32> into tensor<i32>
-  return %collapse : tensor<i32>
-}
-
-// CHECK-LABEL: @to_unit
-// CHECK-NEXT: Results
-// CHECK-NEXT: 42
-
-func.func @dynamic() -> tensor<?xi32> {
-  %cst = arith.constant dense<42> : tensor<2x3xi32>
-  %cast = tensor.cast %cst : tensor<2x3xi32> to tensor<?x3xi32>
-  %collapse = tensor.collapse_shape %cast [[0, 1]] : tensor<?x3xi32> into tensor<?xi32>
-  return %collapse : tensor<?xi32>
-}
-
-// CHECK-LABEL: @dynamic
-// CHECK-NEXT: Results
-// CHECK-NEXT: [42, 42, 42, 42, 42, 42]
diff --git a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/tensor/dim.mlir b/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/tensor/dim.mlir
deleted file mode 100644
index 6e38150587ed27..00000000000000
--- a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/tensor/dim.mlir
+++ /dev/null
@@ -1,12 +0,0 @@
-// RUN: mlir-interpreter-runner %s -run-all | FileCheck %s
-
-func.func @dim() -> index {
-  %it = tensor.empty() : tensor<2x3x4xi32>
-  %c1 = arith.constant 1 : index
-  %dim = tensor.dim %it, %c1 : tensor<2x3x4xi32>
-  return %dim : index
-}
-
-// CHECK-LABEL: @dim
-// CHECK-NEXT: Results
-// CHECK-NEXT: i64: 3
diff --git a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/tensor/empty.mlir b/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/tensor/empty.mlir
deleted file mode 100644
index 1fdd75afc7e227..00000000000000
--- a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/tensor/empty.mlir
+++ /dev/null
@@ -1,21 +0,0 @@
-// RUN: mlir-interpreter-runner %s -run-all | FileCheck %s
-
-func.func @static() -> tensor<2x3xi32> {
-  %ret = tensor.empty() : tensor<2x3xi32>
-  return %ret : tensor<2x3xi32>
-}
-
-// CHECK-LABEL: @static
-// CHECK-NEXT: Results
-// CHECK-NEXT{LITERAL}: [[0, 0, 0], [0, 0, 0]]
-
-func.func @dynamic() -> tensor<2x?x3x?xi32> {
-  %c5 = arith.constant 5 : index
-  %c7 = arith.constant 7 : index
-  %ret = tensor.empty(%c5, %c7) : tensor<2x?x3x?xi32>
-  return %ret : tensor<2x?x3x?xi32>
-}
-
-// CHECK-LABEL: @dynamic
-// CHECK-NEXT: Results
-// CHECK-NEXT{LITERAL}: TensorOrMemref<2x5x3x7xi32>
diff --git a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/tensor/expand_shape.mlir b/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/tensor/expand_shape.mlir
deleted file mode 100644
index 724e53384bb393..00000000000000
--- a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/tensor/expand_shape.mlir
+++ /dev/null
@@ -1,55 +0,0 @@
-// RUN: mlir-interpreter-runner %s -run-all | FileCheck %s
-
-func.func @expand_shape()
-    -> (tensor<1x6xi32>, tensor<2x3xi32>, tensor<1x2x3xi32>) {
-  %cst = arith.constant dense<[1, 2, 3, 4, 5, 6]> : tensor<6xi32>
-  %expand1 = tensor.expand_shape %cst [[0, 1]]
-      : tensor<6xi32> into tensor<1x6xi32>
-  %expand2 = tensor.expand_shape %cst [[0, 1]]
-      : tensor<6xi32> into tensor<2x3xi32>
-  %expand3 = tensor.expand_shape %cst [[0, 1, 2]]
-      : tensor<6xi32> into tensor<1x2x3xi32>
-  return %expand1, %expand2, %expand3
-      : tensor<1x6xi32>, tensor<2x3xi32>, tensor<1x2x3xi32>
-}
-
-// CHECK-LABEL: @expand_shape
-// CHECK-NEXT: Results
-// CHECK-NEXT{LITERAL}: [[1, 2, 3, 4, 5, 6]]
-// CHECK-NEXT{LITERAL}: [[1, 2, 3], [4, 5, 6]]
-// CHECK-NEXT{LITERAL}: [[[1, 2, 3], [4, 5, 6]]]
-
-func.func @from_unit() -> tensor<1x1xi32> {
-  %cst = arith.constant dense<42> : tensor<i32>
-  %expand = tensor.expand_shape %cst [] : tensor<i32> into tensor<1x1xi32>
-  return %expand : tensor<1x1xi32>
-}
-
-// CHECK-LABEL: @from_unit
-// CHECK-NEXT: Results
-// CHECK-NEXT{LITERAL}: TensorOrMemref<1x1xi32>: [[42]]
-
-func.func @dynamic()
-    -> (tensor<?x2x3xi32>) {
-  %cst = arith.constant dense<[[1, 2, 3, 4, 5, 6]]> : tensor<1x6xi32>
-  %cst_cast = tensor.cast %cst : tensor<1x6xi32> to tensor<?x6xi32>
-  %expand1 = tensor.expand_shape %cst_cast [[0], [1, 2]]
-      : tensor<?x6xi32> into tensor<?x2x3xi32>
-  return %expand1 : tensor<?x2x3xi32>
-}
-
-// CHECK-LABEL: @dynamic
-// CHECK-NEXT: Results
-// CHECK-NEXT{LITERAL}: [[[1, 2, 3], [4, 5, 6]]]
-
-func.func @dynamic2() -> (tensor<?x3xi32>) {
-  %cst = arith.constant dense<[1, 2, 3, 4, 5, 6]> : tensor<6xi32>
-  %cst_cast = tensor.cast %cst : tensor<6xi32> to tensor<?xi32>
-  %expand1 = tensor.expand_shape %cst_cast [[0, 1]]
-      : tensor<?xi32> into tensor<?x3xi32>
-  return %expand1 : tensor<?x3xi32>
-}
-
-// CHECK-LABEL: @dynamic2
-// CHECK-NEXT: Results
-// CHECK-NEXT{LITERAL}: [[1, 2, 3], [4, 5, 6]]
diff --git a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/tensor/extract.mlir b/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/tensor/extract.mlir
deleted file mode 100644
index 9209eea17d9a50..00000000000000
--- a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/tensor/extract.mlir
+++ /dev/null
@@ -1,13 +0,0 @@
-// RUN: mlir-interpreter-runner %s -run-all | FileCheck %s
-
-func.func @extract() -> i32 {
-  %c0 = arith.constant 0 : index
-  %c1 = arith.constant 1 : index
-  %cst = arith.constant dense<[[[1, 2, 3], [4, 5, 6]]]> : tensor<1x2x3xi32>
-  %ret = tensor.extract %cst[%c0, %c1, %c1] : tensor<1x2x3xi32>
-  return %ret : i32
-}
-
-// CHECK-LABEL: @extract
-// CHECK-NEXT: Results
-// CHECK-NEXT: i32: 5
diff --git a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/tensor/extract_slice.mlir b/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/tensor/extract_slice.mlir
deleted file mode 100644
index 81721e12b2abf7..00000000000000
--- a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/tensor/extract_slice.mlir
+++ /dev/null
@@ -1,62 +0,0 @@
-// RUN: mlir-interpreter-runner %s -run-all | FileCheck %s
-
-func.func @extract() -> tensor<1x2x1xi32> {
-  %cst = arith.constant dense<[[[1, 2, 3], [4, 5, 6]]]> : tensor<1x2x3xi32>
-  %ret = tensor.extract_slice %cst[0, 0, 0][1, 2, 1][1, 1, 1] : tensor<1x2x3xi32> to tensor<1x2x1xi32>
-  return %ret : tensor<1x2x1xi32>
-}
-
-// CHECK-LABEL: @extract
-// CHECK-NEXT: Results
-// CHECK-NEXT{LITERAL}: [[[1], [4]]]
-
-func.func @rank_reduction() -> tensor<2xi32> {
-  %cst = arith.constant dense<[[[1, 2, 3], [4, 5, 6]]]> : tensor<1x2x3xi32>
-  %ret = tensor.extract_slice %cst[0, 0, 0][1, 2, 1][1, 1, 1] : tensor<1x2x3xi32> to tensor<2xi32>
-  return %ret : tensor<2xi32>
-}
-
-// CHECK-LABEL: @rank_reduction
-// CHECK-NEXT: Results
-// CHECK-NEXT{LITERAL}: [1, 4]
-
-func.func @rank_reduction_to_dynamic() -> tensor<?xi32> {
-  %cst = arith.constant dense<[[[1, 2, 3], [4, 5, 6]]]> : tensor<1x2x3xi32>
-  %c2 = arith.constant 2 : index
-  %ret = tensor.extract_slice %cst[0, 0, 0][1, %c2, 1][1, 1, 1] : tensor<1x2x3xi32> to tensor<?xi32>
-  return %ret : tensor<?xi32>
-}
-
-// CHECK-LABEL: @rank_reduction_to_dynamic
-// CHECK-NEXT: Results
-// CHECK-NEXT{LITERAL}: [1, 4]
-
-func.func @no_rank_reduction_to_dynamic() -> tensor<1x?xi32> {
-  %cst = arith.constant dense<[[0], [1], [2], [3], [4]]> : tensor<5x1xi32>
-  %c0 = arith.constant 0 : index
-  %c1 = arith.constant 1 : index
-  %c2 = arith.constant 2 : index
-  %ret = tensor.extract_slice %cst[%c2, %c0] [1, %c1] [1, 1]
-    : tensor<5x1xi32> to tensor<1x?xi32>
-  return %ret : tensor<1x?xi32>
-}
-
-// CHECK-LABEL: @no_rank_reduction_to_dynamic
-// CHECK-NEXT: Results
-// CHECK-NEXT{LITERAL}: [[2]]
-
-func.func @extract_from_extract() -> tensor<1x1xi32> {
-  %cst = arith.constant dense<[[0], [1], [2], [3], [4]]> : tensor<5x1xi32>
-  %c0 = arith.constant 0 : index
-  %c1 = arith.constant 1 : index
-  %c2 = arith.constant 2 : index
-  %extracted = tensor.extract_slice %cst[%c2, %c0] [1, %c1] [1, 1]
-    : tensor<5x1xi32> to tensor<1x?xi32>
-  %ret = tensor.extract_slice %extracted[%c0, %c0] [1, 1] [1, 1]
-    : tensor<1x?xi32> to tensor<1x1xi32>
-  return %ret : tensor<1x1xi32>
-}
-
-// CHECK-LABEL: @extract_from_extract
-// CHECK-NEXT: Results
-// CHECK-NEXT{LITERAL}: [[2]]
diff --git a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/tensor/from_elements.mlir b/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/tensor/from_elements.mlir
deleted file mode 100644
index e8bda72ada6dd4..00000000000000
--- a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/tensor/from_elements.mlir
+++ /dev/null
@@ -1,25 +0,0 @@
-// RUN: mlir-interpreter-runner %s -run-all | FileCheck %s
-
-func.func @from_elements() -> tensor<2x3xindex> {
-  %c0 = arith.constant 0 : index
-  %c1 = arith.constant 1 : index
-  %c2 = arith.constant 2 : index
-  %c3 = arith.constant 3 : index
-  %c4 = arith.constant 4 : index
-  %c5 = arith.constant 5 : index
-  %ret = tensor.from_elements %c0, %c1, %c2, %c3, %c4, %c5 :  tensor<2x3xindex>
-  return %ret : tensor<2x3xindex>
-}
-
-// CHECK-LABEL: @from_elements
-// CHECK-NEXT: Results
-// CHECK-NEXT{LITERAL}: [[0, 1, 2], [3, 4, 5]]
-
-func.func @empty() -> tensor<0xindex> {
-  %ret = tensor.from_elements : tensor<0xindex>
-  return %ret : tensor<0xindex>
-}
-
-// CHECK-LABEL: @empty
-// CHECK-NEXT: Results
-// CHECK-NEXT{LITERAL}: []
diff --git a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/tensor/generate.mlir b/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/tensor/generate.mlir
deleted file mode 100644
index 9ae2a4d43de1fa..00000000000000
--- a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/tensor/generate.mlir
+++ /dev/null
@@ -1,29 +0,0 @@
-// RUN: mlir-interpreter-runner %s -run-all | FileCheck %s
-
-func.func @generate() -> tensor<?xindex> {
-  %size = arith.constant 5 : index
-  %iota = tensor.generate %size {
-    ^bb0(%i : index):
-      tensor.yield %i : index
-    } : tensor<?xindex>
-  return %iota : tensor<?xindex>
-}
-
-// CHECK-LABEL: @generate
-// CHECK-NEXT: Results
-// CHECK-NEXT{LITERAL}: [0, 1, 2, 3, 4]
-
-func.func @generate_2d() -> tensor<?x?xindex> {
-  %c5 = arith.constant 5 : index
-  %c2 = arith.constant 2 : index
-  %iota = tensor.generate %c5, %c2 {
-    ^bb0(%i : index, %j : index):
-      %sum = arith.addi %i, %j : index
-      tensor.yield %sum : index
-    } : tensor<?x?xindex>
-  return %iota : tensor<?x?xindex>
-}
-
-// CHECK-LABEL: @generate_2d
-// CHECK-NEXT: Results
-// CHECK-NEXT{LITERAL}: [[0, 1], [1, 2], [2, 3], [3, 4], [4, 5]]
diff --git a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/tensor/insert.mlir b/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/tensor/insert.mlir
deleted file mode 100644
index 47d5677694d89c..00000000000000
--- a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/tensor/insert.mlir
+++ /dev/null
@@ -1,14 +0,0 @@
-// RUN: mlir-interpreter-runner %s -run-all | FileCheck %s
-
-func.func @insert() -> tensor<1x2x3xi32> {
-  %c0 = arith.constant 0 : index
-  %c1 = arith.constant 1 : index
-  %c7 = arith.constant 7 : i32
-  %cst = arith.constant dense<[[[1, 2, 3], [4, 5, 6]]]> : tensor<1x2x3xi32>
-  %ret = tensor.insert %c7 into %cst[%c0, %c1, %c1] : tensor<1x2x3xi32>
-  return %ret : tensor<1x2x3xi32>
-}
-
-// CHECK-LABEL: @insert
-// CHECK-NEXT: Results
-// CHECK-NEXT{LITERAL}: [[[1, 2, 3], [4, 7, 6]]]
diff --git a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/tensor/insert_slice.mlir b/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/tensor/insert_slice.mlir
deleted file mode 100644
index ec5383d5c2852a..00000000000000
--- a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/tensor/insert_slice.mlir
+++ /dev/null
@@ -1,25 +0,0 @@
-// RUN: mlir-interpreter-runner %s -run-all | FileCheck %s
-
-func.func @insert() -> tensor<1x3x3xi32> {
-  %cst = arith.constant dense<[[[1, 2, 3], [4, 5, 6], [7, 8, 9]]]> : tensor<1x3x3xi32>
-  %cst_2 = arith.constant dense<[[[10], [11]]]> : tensor<1x2x1xi32>
-  %ret = tensor.insert_slice %cst_2 into %cst[0, 1, 1][1, 2, 1][1, 1, 1]
-    : tensor<1x2x1xi32> into tensor<1x3x3xi32>
-  return %ret : tensor<1x3x3xi32>
-}
-
-// CHECK-LABEL: @insert
-// CHECK-NEXT: Results
-// CHECK-NEXT{LITERAL}: [[[1, 2, 3], [4, 10, 6], [7, 11, 9]]]
-
-func.func @rank_increase() -> tensor<1x3x3xi32> {
-  %cst = arith.constant dense<[[[1, 2, 3], [4, 5, 6], [7, 8, 9]]]> : tensor<1x3x3xi32>
-  %cst_2 = arith.constant dense<[10, 11]> : tensor<2xi32>
-  %ret = tensor.insert_slice %cst_2 into %cst[0, 1, 1][1, 2, 1][1, 1, 1]
-    : tensor<2xi32> into tensor<1x3x3xi32>
-  return %ret : tensor<1x3x3xi32>
-}
-
-// CHECK-LABEL: @rank_increase
-// CHECK-NEXT: Results
-// CHECK-NEXT{LITERAL}: [[[1, 2, 3], [4, 10, 6], [7, 11, 9]]]
diff --git a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/tensor/pad.mlir b/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/tensor/pad.mlir
deleted file mode 100644
index a2a6470b6e3f17..00000000000000
--- a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/tensor/pad.mlir
+++ /dev/null
@@ -1,38 +0,0 @@
-// RUN: mlir-interpreter-runner %s -run-all | FileCheck %s
-
-func.func @pad() -> tensor<1x?x?xi32> {
-  %it = arith.constant dense<[[[1, 2, 3], [2, 3, 4]]]> : tensor<1x2x3xi32>
-  %offset = arith.constant 2 : index
-  %c0 = arith.constant 0 : index
-  %out = tensor.pad %it low[%c0, %offset, 0] high[0, %c0, %offset]  {
-    ^bb0(%a: index, %b: index, %c: index):
-      %c5 = arith.constant 5 : i32
-      tensor.yield %c5 : i32
-    } : tensor<1x2x3xi32> to tensor<1x?x?xi32>
-  return %out : tensor<1x?x?xi32>
-}
-
-// CHECK-LABEL: @pad
-// CHECK-NEXT: Results
-// CHECK-NEXT{LITERAL}: [[[5, 5, 5, 5, 5],
-// CHECK-SAME{LITERAL}:   [5, 5, 5, 5, 5],
-// CHECK-SAME{LITERAL}:   [1, 2, 3, 5, 5],
-// CHECK-SAME{LITERAL}:   [2, 3, 4, 5, 5]]]
-
-func.func @pad_args() -> tensor<3x3xindex> {
-  %it = arith.constant dense<[[999]]> : tensor<1x1xindex>
-  %out = tensor.pad %it low[1, 1] high[1, 1]  {
-    ^bb0(%a: index, %b: index):
-      %c10 = arith.constant 10 : index
-      %mul = arith.muli %a, %c10 : index
-      %ret = arith.addi %mul, %b : index
-      tensor.yield %ret : index
-    } : tensor<1x1xindex> to tensor<3x3xindex>
-  return %out : tensor<3x3xindex>
-}
-
-// CHECK-LABEL: @pad_args
-// CHECK-NEXT: Results
-// CHECK-NEXT{LITERAL}: [[0, 1, 2],
-// CHECK-SAME{LITERAL}:  [10, 999, 12],
-// CHECK-SAME{LITERAL}:  [20, 21, 22]]
\ No newline at end of file
diff --git a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/thlo/concatenate.mlir b/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/thlo/concatenate.mlir
deleted file mode 100644
index 9800bceca69f35..00000000000000
--- a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/thlo/concatenate.mlir
+++ /dev/null
@@ -1,16 +0,0 @@
-// RUN: mlir-interpreter-runner %s -run-all | FileCheck %s
-
-func.func @concatenate() -> tensor<5xi32> {
-  %a = arith.constant dense<[1,2,3]> : tensor<3xi32>
-  %b = arith.constant dense<[4,5]> : tensor<2xi32>
-  %init = tensor.empty() : tensor<5xi32>
-  %cat = thlo.concatenate
-      ins(%a: tensor<3xi32>, %b: tensor<2xi32>)
-      outs(%init: tensor<5xi32>)
-      dimension = 0
-  return %cat : tensor<5xi32>
-}
-
-// CHECK-LABEL: @concatenate
-// CHECK-NEXT: Results
-// CHECK-NEXT{LITERAL}: [1, 2, 3, 4, 5]
diff --git a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/thlo/reverse.mlir b/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/thlo/reverse.mlir
deleted file mode 100644
index e98480ada1cea6..00000000000000
--- a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/thlo/reverse.mlir
+++ /dev/null
@@ -1,14 +0,0 @@
-// RUN: mlir-interpreter-runner %s -run-all | FileCheck %s
-
-func.func @reverse() -> tensor<3x3xi32> {
-  %tensor = arith.constant dense<[[4, 5, 6], [6, 7, 8], [8, 9, 10]]> : tensor<3x3xi32>
-  %init = tensor.empty() : tensor<3x3xi32>
-  %reverse = thlo.reverse ins(%tensor: tensor<3x3xi32>)
-                          outs(%init: tensor<3x3xi32>)
-                          reverse_dimensions = [1]
-  return %reverse : tensor<3x3xi32>
-}
-
-// CHECK-LABEL: @reverse
-// CHECK-NEXT: Results
-// CHECK-NEXT{LITERAL}: [[6, 5, 4], [8, 7, 6], [10, 9, 8]]
diff --git a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/thlo/scatter.mlir b/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/thlo/scatter.mlir
deleted file mode 100644
index e4b4365671a222..00000000000000
--- a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/thlo/scatter.mlir
+++ /dev/null
@@ -1,49 +0,0 @@
-// RUN: mlir-interpreter-runner %s -run-all | FileCheck %s
-
-func.func @bounds_check() -> tensor<10xi32> {
-  %operand = arith.constant dense<0> : tensor<10xi32>
-  %indices = arith.constant dense<[[1], [8], [-1]]> : tensor<3x1xindex>
-  %updates = arith.constant dense<[[4, 5, 6], [6, 7, 8], [8, 9, 10]]> : tensor<3x3xi32>
-  %scatter = thlo.scatter ins(%indices: tensor<3x1xindex>, %updates: tensor<3x3xi32>)
-                          outs(%operand: tensor<10xi32>) (%in: i32, %out: i32) {
-    %add = arith.addi %in, %out : i32
-    thlo.yield %add : i32
-  }
-  return %scatter : tensor<10xi32>
-}
-
-// CHECK-LABEL: @bounds_check
-// CHECK-NEXT: Results
-// CHECK-NEXT{LITERAL}: [0, 4, 5, 6, 0, 0, 0, 0, 0, 0]
-
-func.func @update_last_element() -> tensor<2xi32> {
-  %operand = arith.constant dense<[1, 1]> : tensor<2xi32>
-  %indices = arith.constant dense<[[1]]> : tensor<1x1xindex>
-  %updates = arith.constant dense<[[0]]> : tensor<1x1xi32>
-
-  %scatter = thlo.scatter ins(%indices: tensor<1x1xindex>, %updates: tensor<1x1xi32>)
-                          outs(%operand: tensor<2xi32>) (%in: i32, %out: i32) {
-    thlo.yield %in : i32
-  }
-  return %scatter : tensor<2xi32>
-}
-
-// CHECK-LABEL: @update_last_element
-// CHECK-NEXT: Results
-// CHECK-NEXT{LITERAL}: [1, 0]
-
-func.func @bufferized() -> memref<10xi32> {
-  %operand = arith.constant dense<0> : memref<10xi32>
-  %indices = arith.constant dense<[[1], [8], [-1]]> : memref<3x1xindex>
-  %updates = arith.constant dense<[[4, 5, 6], [6, 7, 8], [8, 9, 10]]> : memref<3x3xi32>
-  thlo.scatter ins(%indices: memref<3x1xindex>, %updates: memref<3x3xi32>)
-               outs(%operand: memref<10xi32>) (%in: i32, %out: i32) {
-    %add = arith.addi %in, %out : i32
-    thlo.yield %add : i32
-  }
-  return %operand : memref<10xi32>
-}
-
-// CHECK-LABEL: @bufferized
-// CHECK-NEXT: Results
-// CHECK-NEXT{LITERAL}: [0, 4, 5, 6, 0, 0, 0, 0, 0, 0]
diff --git a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/vector/bitcast.mlir b/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/vector/bitcast.mlir
deleted file mode 100644
index 242b025324e934..00000000000000
--- a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/vector/bitcast.mlir
+++ /dev/null
@@ -1,32 +0,0 @@
-// RUN: mlir-interpreter-runner %s -run-all | FileCheck %s
-
-func.func @bitcast1d() -> vector<4xi8> {
-  %c = arith.constant dense<-2> : vector<1xi32>
-  %b = vector.bitcast %c : vector<1xi32> to vector<4xi8>
-  return %b : vector<4xi8>
-}
-
-// CHECK-LABEL: @bitcast1d
-// CHECK-NEXT: Results
-// CHECK-NEXT: vector<4xi8>: [-2, -1, -1, -1]
-
-func.func @bitcast2d() -> vector<2x1xi64> {
-  %c = arith.constant dense<[[0, 1], [2, 3]]> : vector<2x2xi32>
-  %b = vector.bitcast %c : vector<2x2xi32> to vector<2x1xi64>
-  return %b : vector<2x1xi64>
-}
-
-// CHECK-LABEL: @bitcast2d
-// CHECK-NEXT: Results
-// CHECK-NEXT{LITERAL}: vector<2x1xi64>: [[4294967296], [12884901890]]
-
-func.func @transpose_bitcast() -> vector<2x1xi64> {
-  %c = arith.constant dense<[[0, 1], [2, 3]]> : vector<2x2xi32>
-  %d = vector.transpose %c, [1, 0] : vector<2x2xi32> to vector<2x2xi32>
-  %b = vector.bitcast %d : vector<2x2xi32> to vector<2x1xi64>
-  return %b : vector<2x1xi64>
-}
-
-// CHECK-LABEL: @transpose_bitcast
-// CHECK-NEXT: Results
-// CHECK-NEXT{LITERAL}: vector<2x1xi64>: [[8589934592], [12884901889]]
diff --git a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/vector/broadcast.mlir b/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/vector/broadcast.mlir
deleted file mode 100644
index 1e30f7aa9e603f..00000000000000
--- a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/vector/broadcast.mlir
+++ /dev/null
@@ -1,51 +0,0 @@
-// RUN: mlir-interpreter-runner %s -run-all | FileCheck %s
-
-func.func @scalar_to_unit() -> vector<i32> {
-  %c1 = arith.constant 1 : i32
-  %b = vector.broadcast %c1 : i32 to vector<i32>
-  return %b : vector<i32>
-}
-
-// CHECK-LABEL: @scalar_to_unit
-// CHECK-NEXT: Results
-// CHECK-NEXT: vector<i32>: 1
-
-func.func @scalar_to_2d() -> vector<2x3xi32> {
-  %c1 = arith.constant 1 : i32
-  %b = vector.broadcast %c1 : i32 to vector<2x3xi32>
-  return %b : vector<2x3xi32>
-}
-
-// CHECK-LABEL: @scalar_to_2d
-// CHECK-NEXT: Results
-// CHECK-NEXT{LITERAL}: vector<2x3xi32>: [[1, 1, 1], [1, 1, 1]]
-
-func.func @unit_to_unit() -> vector<i32> {
-  %c1 = arith.constant dense<1> : vector<i32>
-  %b = vector.broadcast %c1 : vector<i32> to vector<i32>
-  return %b : vector<i32>
-}
-
-// CHECK-LABEL: @unit_to_unit
-// CHECK-NEXT: Results
-// CHECK-NEXT: vector<i32>: 1
-
-func.func @stretch() -> vector<3xi32> {
-  %c1 = arith.constant dense<1> : vector<1xi32>
-  %b = vector.broadcast %c1 : vector<1xi32> to vector<3xi32>
-  return %b : vector<3xi32>
-}
-
-// CHECK-LABEL: @stretch
-// CHECK-NEXT: Results
-// CHECK-NEXT: vector<3xi32>: [1, 1, 1]
-
-func.func @stretch_and_broadcast() -> vector<2x1x2x2xi32> {
-  %c1 = arith.constant dense<[[1], [2]]> : vector<2x1xi32>
-  %b = vector.broadcast %c1 : vector<2x1xi32> to vector<2x1x2x2xi32>
-  return %b : vector<2x1x2x2xi32>
-}
-
-// CHECK-LABEL: @stretch_and_broadcast
-// CHECK-NEXT: Results
-// CHECK-NEXT{LITERAL}: vector<2x1x2x2xi32>: [[[[1, 1], [2, 2]]], [[[1, 1], [2, 2]]]]
diff --git a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/vector/compressstore.mlir b/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/vector/compressstore.mlir
deleted file mode 100644
index 079633094723c0..00000000000000
--- a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/vector/compressstore.mlir
+++ /dev/null
@@ -1,16 +0,0 @@
-// RUN: mlir-interpreter-runner %s -run-all | FileCheck %s
-
-func.func @compressstore() -> memref<3x4xi32> {
-  %alloc = memref.alloc() : memref<3x4xi32>
-  %c = arith.constant dense<[1,2,3]> : vector<3xi32>
-  %m = arith.constant dense<[true,false,true]> : vector<3xi1>
-  %c1 = arith.constant 1 : index
-  %c2 = arith.constant 2 : index
-  vector.compressstore %alloc[%c1, %c2], %m, %c
-    : memref<3x4xi32>, vector<3xi1>, vector<3xi32>
-  return %alloc : memref<3x4xi32>
-}
-
-// CHECK-LABEL: @compressstore
-// CHECK-NEXT: Results
-// CHECK-NEXT{LITERAL}: [[0, 0, 0, 0], [0, 0, 1, 3], [0, 0, 0, 0]]
diff --git a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/vector/constant_mask.mlir b/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/vector/constant_mask.mlir
deleted file mode 100644
index a3e3e0d03cc959..00000000000000
--- a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/vector/constant_mask.mlir
+++ /dev/null
@@ -1,14 +0,0 @@
-// RUN: mlir-interpreter-runner %s -run-all | FileCheck %s
-
-func.func @constant_mask() -> vector<4x3xi1> {
-  %1 = vector.constant_mask [3, 2] : vector<4x3xi1>
-  return %1 : vector<4x3xi1>
-}
-
-// CHECK-LABEL: @constant_mask
-// CHECK-NEXT: Results
-// CHECK-NEXT{LITERAL}: vector<4x3xi1>:
-// CHECK-SAME{LITERAL}: [[true, true, false],
-// CHECK-SAME{LITERAL}:  [true, true, false],
-// CHECK-SAME{LITERAL}:  [true, true, false],
-// CHECK-SAME{LITERAL}:  [false, false, false]]
diff --git a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/vector/contract.mlir b/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/vector/contract.mlir
deleted file mode 100644
index f7f26fed00d8d0..00000000000000
--- a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/vector/contract.mlir
+++ /dev/null
@@ -1,141 +0,0 @@
-// RUN: mlir-interpreter-runner %s -run-all | FileCheck %s
-
-// Adapted from
-// mlir/test/Integration/Dialect/Vector/CPU/test-contraction.mlir
-
-#dotp_accesses = [
-  affine_map<(i) -> (i)>,
-  affine_map<(i) -> (i)>,
-  affine_map<(i) -> ()>
-]
-#dotp_trait = {
-  indexing_maps = #dotp_accesses,
-  iterator_types = ["reduction"]
-}
-
-#matvec_accesses = [
-  affine_map<(i, j) -> (i, j)>,
-  affine_map<(i, j) -> (j)>,
-  affine_map<(i, j) -> (i)>
-]
-#matvec_trait = {
-  indexing_maps = #matvec_accesses,
-  iterator_types = ["parallel", "reduction"]
-}
-
-#mattransvec_accesses = [
-  affine_map<(i, j) -> (j, i)>,
-  affine_map<(i, j) -> (j)>,
-  affine_map<(i, j) -> (i)>
-]
-#mattransvec_trait = {
-  indexing_maps = #mattransvec_accesses,
-  iterator_types = ["parallel", "reduction"]
-}
-
-#matmat_accesses = [
-  affine_map<(i, j, k) -> (i, k)>,
-  affine_map<(i, j, k) -> (k, j)>,
-  affine_map<(i, j, k) -> (i, j)>
-]
-#matmat_trait = {
-  indexing_maps = #matmat_accesses,
-  iterator_types = ["parallel", "parallel", "reduction"]
-}
-
-#column_major_matmat_accesses = [
-  affine_map<(i, j, k) -> (k, j)>,
-  affine_map<(i, j, k) -> (i, k)>,
-  affine_map<(i, j, k) -> (j, i)>
-]
-#column_major_matmat_trait = {
-  indexing_maps = #column_major_matmat_accesses,
-  iterator_types = ["parallel", "parallel", "reduction"]
-}
-
-func.func @dot_products() -> (i32, i32) {
-  %c0 = arith.constant 0 : i32
-  %c1 = arith.constant 1 : i32
-  %a = arith.constant dense<[1, 2]> : vector<2xi32>
-  %b = arith.constant dense<[3, 4]> : vector<2xi32>
-  // Contraction: dot-product a x b
-  %dp1 = vector.contract #dotp_trait %a, %b, %c0
-    : vector<2xi32>, vector<2xi32> into i32
-  %dp2 = vector.contract #dotp_trait %a, %b, %c1
-    : vector<2xi32>, vector<2xi32> into i32
-  return %dp1, %dp2 : i32, i32
-}
-
-// CHECK-LABEL: @dot_product
-// CHECK-NEXT: Results
-// CHECK-NEXT: i32: 11
-// CHECK-NEXT: i32: 12
-
-func.func @matrix_vector() -> (vector<2xi32>, vector<2xi32>) {
-  %z1 = arith.constant dense<0> : vector<2xi32>
-  %a = arith.constant dense<[1, 2]> : vector<2xi32>
-  %c = arith.constant dense<[5, 6]> : vector<2xi32>
-  %A = arith.constant dense<[[1, 2], [3, 4]]> : vector<2x2xi32>
-  %mv1 = vector.contract #matvec_trait %A, %c, %z1
-    : vector<2x2xi32>, vector<2xi32> into vector<2xi32>
-  %mv2 = vector.contract #matvec_trait %A, %c, %a
-    : vector<2x2xi32>, vector<2xi32> into vector<2xi32>
-  return %mv1, %mv2 : vector<2xi32>, vector<2xi32>
-}
-
-// CHECK-LABEL: @matrix_vector
-// CHECK-NEXT: Results
-// CHECK-NEXT: [17, 39]
-// CHECK-NEXT: [18, 41]
-
-func.func @matrix_trans_vector() -> (vector<2xi32>, vector<2xi32>) {
-  %z1 = arith.constant dense<0> : vector<2xi32>
-  %a = arith.constant dense<[1, 2]> : vector<2xi32>
-  %c = arith.constant dense<[5, 6]> : vector<2xi32>
-  %A = arith.constant dense<[[1, 2], [3, 4]]> : vector<2x2xi32>
-  %mv1 = vector.contract #mattransvec_trait %A, %c, %z1
-    : vector<2x2xi32>, vector<2xi32> into vector<2xi32>
-  %mv2 = vector.contract #mattransvec_trait %A, %c, %a
-    : vector<2x2xi32>, vector<2xi32> into vector<2xi32>
-  return %mv1, %mv2 : vector<2xi32>, vector<2xi32>
-}
-
-// CHECK-LABEL: @matrix_trans_vector
-// CHECK-NEXT: Results
-// CHECK-NEXT: [23, 34]
-// CHECK-NEXT: [24, 36]
-
-func.func @matrix_matrix() -> (vector<2x2xi32>, vector<2x2xi32>) {
-  %z2 = arith.constant dense<0> : vector<2x2xi32>
-  %A = arith.constant dense<[[1, 2], [3, 4]]> : vector<2x2xi32>
-  %B = arith.constant dense<[[5, 6], [7, 8]]> : vector<2x2xi32>
-  %mm1 = vector.contract #matmat_trait %A, %B, %z2
-    : vector<2x2xi32>, vector<2x2xi32> into vector<2x2xi32>
-  %mm2 = vector.contract #matmat_trait %A, %B, %A
-    : vector<2x2xi32>, vector<2x2xi32> into vector<2x2xi32>
-  return %mm1, %mm2 : vector<2x2xi32>, vector<2x2xi32>
-}
-
-// CHECK-LABEL: @matrix_matrix
-// CHECK-NEXT: Results
-// CHECK-NEXT{LITERAL}: [[19, 22], [43, 50]]
-// CHECK-NEXT{LITERAL}: [[20, 24], [46, 54]]
-
-func.func @matrix_matrix_column_major() -> (vector<2x2xi32>, vector<2x2xi32>) {
-  %z2 = arith.constant dense<0> : vector<2x2xi32>
-  %A = arith.constant dense<[[1, 2], [3, 4]]> : vector<2x2xi32>
-  %B = arith.constant dense<[[5, 6], [7, 8]]> : vector<2x2xi32>
-  %llvm_matrix_column_major_mm0 =
-    vector.contract #column_major_matmat_trait %A, %B, %z2
-      : vector<2x2xi32>, vector<2x2xi32> into vector<2x2xi32>
-  %llvm_matrix_column_major_mm1 =
-    vector.contract #column_major_matmat_trait %A, %B, %A
-      : vector<2x2xi32>, vector<2x2xi32> into vector<2x2xi32>
-  return %llvm_matrix_column_major_mm0, %llvm_matrix_column_major_mm1 :
-    vector<2x2xi32>, vector<2x2xi32>
-}
-
-// CHECK-LABEL: @matrix_matrix_column_major
-// CHECK-NEXT: Results
-// CHECK-NEXT{LITERAL}: [[23, 31], [34, 46]]
-// CHECK-NEXT{LITERAL}: [[24, 33], [37, 50]]
diff --git a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/vector/create_mask.mlir b/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/vector/create_mask.mlir
deleted file mode 100644
index 2616eddb273fb2..00000000000000
--- a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/vector/create_mask.mlir
+++ /dev/null
@@ -1,16 +0,0 @@
-// RUN: mlir-interpreter-runner %s -run-all | FileCheck %s
-
-func.func @create_mask() -> vector<4x3xi1> {
-  %c2 = arith.constant 2 : index
-  %c3 = arith.constant 3 : index
-  %1 = vector.create_mask %c3, %c2 : vector<4x3xi1>
-  return %1 : vector<4x3xi1>
-}
-
-// CHECK-LABEL: @create_mask
-// CHECK-NEXT: Results
-// CHECK-NEXT{LITERAL}: vector<4x3xi1>:
-// CHECK-SAME{LITERAL}: [[true, true, false],
-// CHECK-SAME{LITERAL}:  [true, true, false],
-// CHECK-SAME{LITERAL}:  [true, true, false],
-// CHECK-SAME{LITERAL}:  [false, false, false]]
diff --git a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/vector/expandload.mlir b/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/vector/expandload.mlir
deleted file mode 100644
index 35a4bc27358128..00000000000000
--- a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/vector/expandload.mlir
+++ /dev/null
@@ -1,19 +0,0 @@
-// RUN: mlir-interpreter-runner %s -run-all | FileCheck %s
-
-func.func @expandload() -> (vector<4xi32>, vector<4xi32>) {
-  %passthrough = arith.constant dense<[1,2,3,4]> : vector<4xi32>
-  %mask = arith.constant dense<[true, false, true, false]> : vector<4xi1>
-  %memref = arith.constant dense<[[10,11,12,13,14],
-                                  [15,16,17,18,19]]> : memref<2x5xi32>
-  %c1 = arith.constant 1 : index
-  %c2 = arith.constant 2 : index
-  %ret = vector.expandload %memref[%c1, %c2], %mask, %passthrough
-    : memref<2x5xi32>, vector<4xi1>, vector<4xi32> into vector<4xi32>
-
-  return %passthrough, %ret : vector<4xi32>, vector<4xi32>
-}
-
-// CHECK-LABEL: @expandload
-// CHECK-NEXT: Results
-// CHECK-NEXT: vector<4xi32>: [1, 2, 3, 4]
-// CHECK-NEXT: vector<4xi32>: [17, 2, 18, 4]
diff --git a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/vector/extract.mlir b/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/vector/extract.mlir
deleted file mode 100644
index 4c0ec223ea7c9d..00000000000000
--- a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/vector/extract.mlir
+++ /dev/null
@@ -1,52 +0,0 @@
-// RUN: mlir-interpreter-runner %s -run-all | FileCheck %s
-
-func.func @extract_1d_1d() -> vector<2xi32> {
-  %c = arith.constant dense<[1, 2]> : vector<2xi32>
-  %i = vector.extract %c[] : vector<2xi32>
-  return %i : vector<2xi32>
-}
-
-// CHECK-LABEL: @extract_1d_1d
-// CHECK-NEXT: Results
-// CHECK-NEXT: vector<2xi32>: [1, 2]
-
-func.func @extract_1d_0d() -> i32 {
-  %c = arith.constant dense<[1, 2]> : vector<2xi32>
-  %i = vector.extract %c[1] : vector<2xi32>
-  return %i : i32
-}
-
-// CHECK-LABEL: @extract_1d_0d
-// CHECK-NEXT: Results
-// CHECK-NEXT: i32: 2
-
-func.func @extract_2d_0d() -> i32 {
-  %c = arith.constant dense<[[1, 2], [3, 4]]> : vector<2x2xi32>
-  %i = vector.extract %c[0, 1] : vector<2x2xi32>
-  return %i : i32
-}
-
-// CHECK-LABEL: @extract_2d_0d
-// CHECK-NEXT: Results
-// CHECK-NEXT: i32: 2
-
-func.func @extract_2d_1d() -> vector<3xi32> {
-  %c = arith.constant dense<[[1, 2, 3], [4, 5, 6]]> : vector<2x3xi32>
-  %i = vector.extract %c[0] : vector<2x3xi32>
-  return %i : vector<3xi32>
-}
-
-// CHECK-LABEL: @extract_2d_1d
-// CHECK-NEXT: Results
-// CHECK-NEXT: vector<3xi32>: [1, 2, 3]
-
-func.func @extract_2d_2d() -> vector<2x2xi32> {
-  %c = arith.constant dense<[[1, 2], [3, 4]]> : vector<2x2xi32>
-  %i = vector.extract %c[] : vector<2x2xi32>
-  return %i : vector<2x2xi32>
-}
-
-// CHECK-LABEL: @extract_2d_2d
-// CHECK-NEXT: Results
-// CHECK-NEXT{LITERAL}: vector<2x2xi32>: [[1, 2], [3, 4]]
-
diff --git a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/vector/extract_strided_slice.mlir b/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/vector/extract_strided_slice.mlir
deleted file mode 100644
index eae5ae9e5e483d..00000000000000
--- a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/vector/extract_strided_slice.mlir
+++ /dev/null
@@ -1,18 +0,0 @@
-// RUN: mlir-interpreter-runner %s -run-all | FileCheck %s
-
-func.func @extract_strided_slice() -> vector<2x3xi32> {
-  %c = arith.constant dense<[[1,2,3,4],
-                             [5,6,7,8],
-                             [9,10,11,12]]> : vector<3x4xi32>
-  %o = vector.extract_strided_slice %c {
-    offsets = [0, 1],
-    sizes = [2, 3],
-    // TODO(jreiffers): Test non-unit strides when supported by verifier.
-    strides = [1, 1]
-  } : vector<3x4xi32> to vector<2x3xi32>
-  return %o : vector<2x3xi32>
-}
-
-// CHECK-LABEL: @extract_strided_slice
-// CHECK-NEXT: Results
-// CHECK-NEXT{LITERAL}: [[2, 3, 4], [6, 7, 8]]
diff --git a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/vector/extractelement.mlir b/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/vector/extractelement.mlir
deleted file mode 100644
index 2e4ffc708d64ea..00000000000000
--- a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/vector/extractelement.mlir
+++ /dev/null
@@ -1,22 +0,0 @@
-// RUN: mlir-interpreter-runner %s -run-all | FileCheck %s
-
-func.func @extract0d() -> i32 {
-  %c = arith.constant dense<1> : vector<i32>
-  %i = vector.extractelement %c[] : vector<i32>
-  return %i : i32
-}
-
-// CHECK-LABEL: @extract0d
-// CHECK-NEXT: Results
-// CHECK-NEXT: i32: 1
-
-func.func @extract1d() -> i32 {
-  %c = arith.constant dense<[1,2]> : vector<2xi32>
-  %c1 = arith.constant 1 : index
-  %i = vector.extractelement %c[%c1 : index] : vector<2xi32>
-  return %i : i32
-}
-
-// CHECK-LABEL: @extract1d
-// CHECK-NEXT: Results
-// CHECK-NEXT: i32: 2
\ No newline at end of file
diff --git a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/vector/flat_transpose.mlir b/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/vector/flat_transpose.mlir
deleted file mode 100644
index b67a84a4cce49b..00000000000000
--- a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/vector/flat_transpose.mlir
+++ /dev/null
@@ -1,23 +0,0 @@
-// RUN: mlir-interpreter-runner %s -run-all | FileCheck %s
-
-func.func @flattranspose_2x3() -> vector<6xi32> {
-  %c = arith.constant dense<[0, 1, 2, 3, 4, 5]> : vector<6xi32>
-  %ret = vector.flat_transpose %c { columns = 2: i32, rows = 3: i32 }
-    : vector<6xi32> -> vector<6xi32>
-  return %ret : vector<6xi32>
-}
-
-// CHECK-LABEL: @flattranspose_2x3
-// CHECK-NEXT: Results
-// CHECK-NEXT: [0, 3, 1, 4, 2, 5]
-
-func.func @flattranspose_3x2() -> vector<6xi32> {
-  %c = arith.constant dense<[0, 1, 2, 3, 4, 5]> : vector<6xi32>
-  %ret = vector.flat_transpose %c { columns = 3: i32, rows = 2: i32 }
-    : vector<6xi32> -> vector<6xi32>
-  return %ret : vector<6xi32>
-}
-
-// CHECK-LABEL: @flattranspose_3x2
-// CHECK-NEXT: Results
-// CHECK-NEXT: [0, 2, 4, 1, 3, 5]
diff --git a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/vector/fma.mlir b/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/vector/fma.mlir
deleted file mode 100644
index 696799d5c99c6c..00000000000000
--- a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/vector/fma.mlir
+++ /dev/null
@@ -1,13 +0,0 @@
-// RUN: mlir-interpreter-runner %s -run-all | FileCheck %s
-
-func.func @fma() -> vector<2xf32> {
-  %a = arith.constant dense<[1.0,2.0]> : vector<2xf32>
-  %b = arith.constant dense<[3.0,4.0]> : vector<2xf32>
-  %c = arith.constant dense<[5.0,6.0]> : vector<2xf32>
-  %r = vector.fma %a, %b, %c : vector<2xf32>
-  return %r : vector<2xf32>
-}
-
-// CHECK-LABEL: @fma
-// CHECK-NEXT: Results
-// CHECK-NEXT: vector<2xf32>: [8.000000e+00, 1.400000e+01]
diff --git a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/vector/gather.mlir b/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/vector/gather.mlir
deleted file mode 100644
index c92a5589b1ecb4..00000000000000
--- a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/vector/gather.mlir
+++ /dev/null
@@ -1,50 +0,0 @@
-// RUN: mlir-interpreter-runner %s -run-all | FileCheck %s
-
-// Adapted from mlir/test/Integration/Dialect/Vector/CPU/test-gather.mlir
-
-func.func private @gather8(%base: memref<10xi32>, %indices: vector<8xi32>,
-              %mask: vector<8xi1>, %pass_thru: vector<8xi32>) -> vector<8xi32> {
-  %c0 = arith.constant 0: index
-  %g = vector.gather %base[%c0][%indices], %mask, %pass_thru
-    : memref<10xi32>, vector<8xi32>, vector<8xi1>, vector<8xi32> into vector<8xi32>
-  return %g : vector<8xi32>
-}
-
-func.func @gather() ->
-    (vector<8xi32>, vector<8xi32>, vector<8xi32>, vector<8xi32>, vector<8xi32>) {
-  %A = arith.constant dense<[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]> : memref<10xi32>
-  %idx = arith.constant dense<[0, 6, 1, 3, 5, 4, 9, 2]> : vector<8xi32>
-  %pass = arith.constant dense<-7> : vector<8xi32>
-  %none = vector.constant_mask [0] : vector<8xi1>
-  %all = vector.constant_mask [8] : vector<8xi1>
-  %some = vector.constant_mask [4] : vector<8xi1>
-  %true = arith.constant true
-  %more = vector.insert %true, %some[7] : i1 into vector<8xi1>
-
-  %g1 = call @gather8(%A, %idx, %all, %pass)
-    : (memref<10xi32>, vector<8xi32>, vector<8xi1>, vector<8xi32>)
-    -> (vector<8xi32>)
-  %g2 = call @gather8(%A, %idx, %none, %pass)
-    : (memref<10xi32>, vector<8xi32>, vector<8xi1>, vector<8xi32>)
-    -> (vector<8xi32>)
-  %g3 = call @gather8(%A, %idx, %some, %pass)
-    : (memref<10xi32>, vector<8xi32>, vector<8xi1>, vector<8xi32>)
-    -> (vector<8xi32>)
-  %g4 = call @gather8(%A, %idx, %more, %pass)
-    : (memref<10xi32>, vector<8xi32>, vector<8xi1>, vector<8xi32>)
-    -> (vector<8xi32>)
-  %g5 = call @gather8(%A, %idx, %all, %pass)
-    : (memref<10xi32>, vector<8xi32>, vector<8xi1>, vector<8xi32>)
-    -> (vector<8xi32>)
-
-  return %g1, %g2, %g3, %g4, %g5
-    : vector<8xi32>, vector<8xi32>, vector<8xi32>, vector<8xi32>, vector<8xi32>
-}
-
-// CHECK-LABEL: @gather
-// CHECK-NEXT: Results
-// CHECK-NEXT: [0, 6, 1, 3, 5, 4, 9, 2]
-// CHECK-NEXT: [-7, -7, -7, -7, -7, -7, -7, -7]
-// CHECK-NEXT: [0, 6, 1, 3, -7, -7, -7, -7]
-// CHECK-NEXT: [0, 6, 1, 3, -7, -7, -7, 2]
-// CHECK-NEXT: [0, 6, 1, 3, 5, 4, 9, 2]
diff --git a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/vector/insert.mlir b/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/vector/insert.mlir
deleted file mode 100644
index 97019cf98e4775..00000000000000
--- a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/vector/insert.mlir
+++ /dev/null
@@ -1,57 +0,0 @@
-// RUN: mlir-interpreter-runner %s -run-all | FileCheck %s
-
-func.func @insert_1d_1d() -> vector<2xi32> {
-  %c = arith.constant dense<[3, 4]> : vector<2xi32>
-  %d = arith.constant dense<[1, 2]> : vector<2xi32>
-  %i = vector.insert %c, %d[] : vector<2xi32> into vector<2xi32>
-  return %i : vector<2xi32>
-}
-
-// CHECK-LABEL: @insert_1d_1d
-// CHECK-NEXT: Results
-// CHECK-NEXT: vector<2xi32>: [3, 4]
-
-func.func @insert_1d_0d() -> vector<2xi32> {
-  %c = arith.constant 42 : i32
-  %d = arith.constant dense<[1, 2]> : vector<2xi32>
-  %i = vector.insert %c, %d[1] : i32 into vector<2xi32>
-  return %i : vector<2xi32>
-}
-
-// CHECK-LABEL: @insert_1d_0d
-// CHECK-NEXT: Results
-// CHECK-NEXT: vector<2xi32>: [1, 42]
-
-func.func @insert_2d_0d() -> vector<2x2xi32> {
-  %c = arith.constant 42 : i32
-  %d = arith.constant dense<[[1, 2], [3, 4]]> : vector<2x2xi32>
-  %i = vector.insert %c, %d[0, 1] : i32 into vector<2x2xi32>
-  return %i : vector<2x2xi32>
-}
-
-// CHECK-LABEL: @insert_2d_0d
-// CHECK-NEXT: Results
-// CHECK-NEXT{LITERAL}: vector<2x2xi32>: [[1, 42], [3, 4]]
-
-func.func @insert_2d_1d() -> vector<2x3xi32> {
-  %c = arith.constant dense<[42, 43, 44]> : vector<3xi32>
-  %d = arith.constant dense<[[1, 2, 3], [4, 5, 6]]> : vector<2x3xi32>
-  %i = vector.insert %c, %d[0] : vector<3xi32> into vector<2x3xi32>
-  return %i : vector<2x3xi32>
-}
-
-// CHECK-LABEL: @insert_2d_1d
-// CHECK-NEXT: Results
-// CHECK-NEXT{LITERAL}: vector<2x3xi32>: [[42, 43, 44], [4, 5, 6]]
-
-func.func @insert_2d_2d() -> vector<2x2xi32> {
-  %c = arith.constant dense<[[1, 2], [3, 4]]> : vector<2x2xi32>
-  %d = arith.constant dense<0> : vector<2x2xi32>
-  %i = vector.insert %c, %d[] : vector<2x2xi32> into vector<2x2xi32>
-  return %i : vector<2x2xi32>
-}
-
-// CHECK-LABEL: @insert_2d_2d
-// CHECK-NEXT: Results
-// CHECK-NEXT{LITERAL}: vector<2x2xi32>: [[1, 2], [3, 4]]
-
diff --git a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/vector/insert_strided_slice.mlir b/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/vector/insert_strided_slice.mlir
deleted file mode 100644
index 140cc716bfdf0a..00000000000000
--- a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/vector/insert_strided_slice.mlir
+++ /dev/null
@@ -1,17 +0,0 @@
-// RUN: mlir-interpreter-runner %s -run-all | FileCheck %s
-
-func.func @insert_strided_slice() -> (vector<3x4xi32>, vector<3x4xi32>) {
-  %v = arith.constant dense<[[2, 3, 4], [6, 7, 8]]> : vector<2x3xi32>
-  %c = arith.constant dense<0> : vector<3x4xi32>
-  %o = vector.insert_strided_slice %v, %c {
-    offsets = [0, 1],
-    // TODO(jreiffers): Test non-unit strides when supported by verifier.
-    strides = [1, 1]
-  } : vector<2x3xi32> into vector<3x4xi32>
-  return %c, %o : vector<3x4xi32>, vector<3x4xi32>
-}
-
-// CHECK-LABEL: @insert_strided_slice
-// CHECK-NEXT: Results
-// CHECK-NEXT{LITERAL}: [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]]
-// CHECK-NEXT{LITERAL}: [[0, 2, 3, 4], [0, 6, 7, 8], [0, 0, 0, 0]]
diff --git a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/vector/insertelement.mlir b/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/vector/insertelement.mlir
deleted file mode 100644
index 4befb8c9598486..00000000000000
--- a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/vector/insertelement.mlir
+++ /dev/null
@@ -1,24 +0,0 @@
-// RUN: mlir-interpreter-runner %s -run-all | FileCheck %s
-
-func.func @insert0d() -> vector<i32> {
-  %c = arith.constant dense<1> : vector<i32>
-  %v = arith.constant 42 : i32
-  %i = vector.insertelement %v, %c[] : vector<i32>
-  return %i : vector<i32>
-}
-
-// CHECK-LABEL: @insert0d
-// CHECK-NEXT: Results
-// CHECK-NEXT: vector<i32>: 42
-
-func.func @insert1d() -> vector<2xi32> {
-  %c = arith.constant dense<1> : vector<2xi32>
-  %c1 = arith.constant 1 : index
-  %v = arith.constant 42 : i32
-  %i = vector.insertelement %v, %c[%c1 : index] : vector<2xi32>
-  return %i : vector<2xi32>
-}
-
-// CHECK-LABEL: @insert1d
-// CHECK-NEXT: Results
-// CHECK-NEXT: vector<2xi32>: [1, 42]
diff --git a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/vector/invalid.mlir b/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/vector/invalid.mlir
deleted file mode 100644
index d8ffb0c4c098de..00000000000000
--- a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/vector/invalid.mlir
+++ /dev/null
@@ -1,27 +0,0 @@
-// RUN: (! mlir-interpreter-runner %s -run-all 2>&1) | FileCheck %s
-
-func.func @write_4_at_3_inbounds() {
-  %a = memref.alloc() : memref<5xi32>
-  %base = arith.constant 3 : index
-  %f = arith.constant dense<[1, 2, 3, 4]> : vector<4xi32>
-  vector.transfer_write %f, %a[%base]
-    {permutation_map = affine_map<(d0) -> (d0)>, in_bounds = [true]}
-    : vector<4xi32>, memref<5xi32>
-  return
-}
-
-// CHECK-LABEL: @write_4_at_3_inbounds
-// CHECK-NEXT: index out of bounds
-
-func.func @transfer_read_2d_1d_oob()-> vector<2xi32> {
-  %a = arith.constant dense<[[0, 1, 2, 3], [4, 5, 6, 7]]> : memref<2x4xi32>
-  %c0 = arith.constant 0 : index
-  %c2 = arith.constant 2 : index
-  %c-42 = arith.constant -42: i32
-  %f = vector.transfer_read %a[%c2, %c0], %c-42
-      : memref<2x4xi32>, vector<2xi32>
-  return %f : vector<2xi32>
-}
-
-// CHECK-LABEL: @transfer_read_2d_1d_oob
-// CHECK-NEXT: index out of bounds
diff --git a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/vector/load.mlir b/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/vector/load.mlir
deleted file mode 100644
index 3fabd7676d6413..00000000000000
--- a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/vector/load.mlir
+++ /dev/null
@@ -1,27 +0,0 @@
-// RUN: mlir-interpreter-runner %s -run-all | FileCheck %s
-
-func.func @load_vector_memref() -> vector<2x2xi32> {
-  %m = memref.alloc() : memref<4x4xvector<2x2xi32>>
-  %c3 = arith.constant 3 : index
-  %r = vector.load %m[%c3, %c3] : memref<4x4xvector<2x2xi32>>, vector<2x2xi32>
-  return %r : vector<2x2xi32>
-}
-
-// CHECK-LABEL: @load_vector_memref
-// CHECK-NEXT: Results
-// CHECK-NEXT{LITERAL}: vector<2x2xi32>: [[0, 0], [0, 0]]
-
-func.func @load_scalar_memref() -> vector<2x2xi32> {
-  %m = arith.constant dense<[[1, 2, 3, 4],
-                             [5, 6, 7, 8],
-                             [9, 10, 11, 12],
-                             [13, 14, 15, 16]]> : memref<4x4xi32>
-  %c1 = arith.constant 1 : index
-  %c2 = arith.constant 2 : index
-  %r = vector.load %m[%c1, %c2] : memref<4x4xi32>, vector<2x2xi32>
-  return %r : vector<2x2xi32>
-}
-
-// CHECK-LABEL: @load_scalar_memref
-// CHECK-NEXT: Results
-// CHECK-NEXT{LITERAL}: vector<2x2xi32>: [[7, 8], [11, 12]]
diff --git a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/vector/maskedload.mlir b/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/vector/maskedload.mlir
deleted file mode 100644
index bb11f49d261b28..00000000000000
--- a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/vector/maskedload.mlir
+++ /dev/null
@@ -1,19 +0,0 @@
-// RUN: mlir-interpreter-runner %s -run-all | FileCheck %s
-
-func.func @maskedload() -> (vector<4xi32>, vector<4xi32>) {
-  %passthrough = arith.constant dense<[1,2,3,4]> : vector<4xi32>
-  %mask = arith.constant dense<[true, false, true, false]> : vector<4xi1>
-  %memref = arith.constant dense<[[10,11,12,13,14],
-                                  [15,16,17,18,19]]> : memref<2x5xi32>
-  %c1 = arith.constant 1 : index
-  %c2 = arith.constant 2 : index
-  %ret = vector.maskedload %memref[%c1, %c2], %mask, %passthrough
-    : memref<2x5xi32>, vector<4xi1>, vector<4xi32> into vector<4xi32>
-
-  return %passthrough, %ret : vector<4xi32>, vector<4xi32>
-}
-
-// CHECK-LABEL: @maskedload
-// CHECK-NEXT: Results
-// CHECK-NEXT: vector<4xi32>: [1, 2, 3, 4]
-// CHECK-NEXT: vector<4xi32>: [17, 2, 19, 4]
diff --git a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/vector/maskedstore.mlir b/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/vector/maskedstore.mlir
deleted file mode 100644
index 9ceaefc3a9c622..00000000000000
--- a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/vector/maskedstore.mlir
+++ /dev/null
@@ -1,18 +0,0 @@
-// RUN: mlir-interpreter-runner %s -run-all | FileCheck %s
-
-func.func @maskedstore() -> memref<2x5xi32> {
-  %value = arith.constant dense<[1,2,3,4]> : vector<4xi32>
-  %mask = arith.constant dense<[true, false, true, false]> : vector<4xi1>
-  %memref = arith.constant dense<[[10,11,12,13,14],
-                                  [15,16,17,18,19]]> : memref<2x5xi32>
-  %c1 = arith.constant 1 : index
-  %c2 = arith.constant 2 : index
-  vector.maskedstore %memref[%c1, %c2], %mask, %value
-    : memref<2x5xi32>, vector<4xi1>, vector<4xi32>
-
-  return %memref : memref<2x5xi32>
-}
-
-// CHECK-LABEL: @maskedstore
-// CHECK-NEXT: Results
-// CHECK-NEXT{LITERAL}: [[10, 11, 12, 13, 14], [15, 16, 1, 18, 3]]
diff --git a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/vector/multi_reduction.mlir b/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/vector/multi_reduction.mlir
deleted file mode 100644
index 94fc5481653583..00000000000000
--- a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/vector/multi_reduction.mlir
+++ /dev/null
@@ -1,46 +0,0 @@
-// RUN: mlir-interpreter-runner %s -run-all | FileCheck %s
-
-func.func @multi_reduction_1d_0d() -> i32 {
-  %a = arith.constant dense<[10, 4, 7]> : vector<3xi32>
-  %acc = arith.constant 1 : i32
-  %r = vector.multi_reduction <add>, %a, %acc [0] : vector<3xi32> to i32
-  return %r : i32
-}
-
-// CHECK-LABEL: @multi_reduction_1d_0d
-// CHECK-NEXT: Results
-// CHECK-NEXT: 22
-
-#dense234 = dense<[[[0,1,2,3],[4,5,6,7],[8,9,10,11]],
-                   [[12,13,14,15],[16,17,18,19],[20,21,22,23]]]>
-              : vector<2x3x4xi32>
-
-func.func @multi_reduction_3d_2d()
-    -> (vector<2x3xi32>, vector<2x4xi32>, vector<2x3xi32>) {
-  %a = arith.constant #dense234
-  %acc_23 = arith.constant dense<[[0, 1, 2], [3, 4, 5]]> : vector<2x3xi32>
-  %r1 = vector.multi_reduction <add>, %a, %acc_23 [2]
-    : vector<2x3x4xi32> to vector<2x3xi32>
-  %acc_24 = arith.constant dense<[[0, 1, 2, 3], [4, 5, 6, 7]]> : vector<2x4xi32>
-  %r2 = vector.multi_reduction <mul>, %a, %acc_24 [1]
-    : vector<2x3x4xi32> to vector<2x4xi32>
-  return %r1, %r2, %acc_23 : vector<2x3xi32>, vector<2x4xi32>, vector<2x3xi32>
-}
-
-// CHECK-LABEL: @multi_reduction_3d_2d
-// CHECK-NEXT: Results
-// CHECK-NEXT{LITERAL}: [[6, 23, 40], [57, 74, 91]]
-// CHECK-NEXT{LITERAL}: [[0, 45, 240, 693], [15360, 23205, 33264, 45885]]
-// CHECK-NEXT{LITERAL}: [[0, 1, 2], [3, 4, 5]]
-
-func.func @multi_reduction_3d_1d() -> vector<3xi32> {
-  %a = arith.constant #dense234
-  %acc_3 = arith.constant dense<[0, 1, 2]> : vector<3xi32>
-  %r1 = vector.multi_reduction <add>, %a, %acc_3 [2, 0]
-    : vector<2x3x4xi32> to vector<3xi32>
-  return %r1 : vector<3xi32>
-}
-
-// CHECK-LABEL: @multi_reduction_3d_1d
-// CHECK-NEXT: Results
-// CHECK-NEXT: [60, 93, 126]
diff --git a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/vector/outerproduct.mlir b/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/vector/outerproduct.mlir
deleted file mode 100644
index ccd7f747933d22..00000000000000
--- a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/vector/outerproduct.mlir
+++ /dev/null
@@ -1,155 +0,0 @@
-// RUN: mlir-interpreter-runner %s -run-all | FileCheck %s
-
-func.func @outerproduct_1d_1d() -> vector<2x3xi32> {
-  %a = arith.constant dense<[1, 2]> : vector<2xi32>
-  %b = arith.constant dense<[3, 4, 5]> : vector<3xi32>
-  %o = vector.outerproduct %a, %b : vector<2xi32>, vector<3xi32>
-  return %o : vector<2x3xi32>
-}
-
-// CHECK-LABEL: @outerproduct_1d_1d
-// CHECK-NEXT: Results
-// CHECK-NEXT{LITERAL}: [[3, 4, 5], [6, 8, 10]]
-
-func.func @outerproduct_1d_1d_add() -> vector<2x3xi32> {
-  %a = arith.constant dense<[1, 2]> : vector<2xi32>
-  %b = arith.constant dense<[3, 4, 5]> : vector<3xi32>
-  %init = arith.constant dense<[[100, 200, 300], [400, 500, 600]]>
-    : vector<2x3xi32>
-  %o = vector.outerproduct %a, %b, %init : vector<2xi32>, vector<3xi32>
-  return %o : vector<2x3xi32>
-}
-
-// CHECK-LABEL: @outerproduct_1d_1d_add
-// CHECK-NEXT: Results
-// CHECK-NEXT{LITERAL}: [[103, 204, 305], [406, 508, 610]]
-
-func.func @outerproduct_1d_0d_and() -> vector<1xi32> {
-  %a = arith.constant dense<[3]> : vector<1xi32>
-  %b = arith.constant 1 : i32
-  %init = arith.constant dense<[9]> : vector<1xi32>
-  %o = vector.outerproduct %a, %b, %init {kind = #vector.kind<and>}
-    : vector<1xi32>, i32
-  return %o : vector<1xi32>
-}
-
-// CHECK-LABEL: @outerproduct_1d_0d_and
-// CHECK-NEXT: Results
-// CHECK-NEXT{LITERAL}: [1]
-
-func.func @outerproduct_1d_0d_maxui() -> vector<3xi32> {
-  %a = arith.constant dense<[1, 2, 3]> : vector<3xi32>
-  %b = arith.constant 3 : i32
-  %init = arith.constant dense<[100, 0, -100]> : vector<3xi32>
-  %o = vector.outerproduct %a, %b, %init {kind = #vector.kind<maxui>}
-    : vector<3xi32>, i32
-  return %o : vector<3xi32>
-}
-
-// CHECK-LABEL: @outerproduct_1d_0d_maxui
-// CHECK-NEXT: Results
-// CHECK-NEXT{LITERAL}: [100, 6, -100]
-
-func.func @outerproduct_1d_0d_mul() -> vector<3xi32> {
-  %a = arith.constant dense<[1, 2, 3]> : vector<3xi32>
-  %b = arith.constant 3 : i32
-  %init = arith.constant dense<[1, 2, 3]> : vector<3xi32>
-  %o = vector.outerproduct %a, %b, %init {kind = #vector.kind<mul>}
-    : vector<3xi32>, i32
-  return %o : vector<3xi32>
-}
-
-// CHECK-LABEL: @outerproduct_1d_0d_mul
-// CHECK-NEXT: Results
-// CHECK-NEXT{LITERAL}: [3, 12, 27]
-
-func.func @outerproduct_1d_0d_minui() -> vector<3xi32> {
-  %a = arith.constant dense<[1, 2, 3]> : vector<3xi32>
-  %b = arith.constant 3 : i32
-  %init = arith.constant dense<[100, 0, -100]> : vector<3xi32>
-  %o = vector.outerproduct %a, %b, %init {kind = #vector.kind<minui>}
-    : vector<3xi32>, i32
-  return %o : vector<3xi32>
-}
-
-// CHECK-LABEL: @outerproduct_1d_0d_minui
-// CHECK-NEXT: Results
-// CHECK-NEXT{LITERAL}: [3, 0, 9]
-
-func.func @outerproduct_1d_0d_maxsi() -> vector<3xi32> {
-  %a = arith.constant dense<[1, 2, 3]> : vector<3xi32>
-  %b = arith.constant 3 : i32
-  %init = arith.constant dense<[100, 0, -100]> : vector<3xi32>
-  %o = vector.outerproduct %a, %b, %init {kind = #vector.kind<maxsi>}
-    : vector<3xi32>, i32
-  return %o : vector<3xi32>
-}
-
-// CHECK-LABEL: @outerproduct_1d_0d_maxsi
-// CHECK-NEXT: Results
-// CHECK-NEXT{LITERAL}: [100, 6, 9]
-
-func.func @outerproduct_1d_0d_minsi() -> vector<3xi32> {
-  %a = arith.constant dense<[1, 2, 3]> : vector<3xi32>
-  %b = arith.constant 3 : i32
-  %init = arith.constant dense<[100, 0, -100]> : vector<3xi32>
-  %o = vector.outerproduct %a, %b, %init {kind = #vector.kind<minsi>}
-    : vector<3xi32>, i32
-  return %o : vector<3xi32>
-}
-
-// CHECK-LABEL: @outerproduct_1d_0d_minsi
-// CHECK-NEXT: Results
-// CHECK-NEXT{LITERAL}: [3, 0, -100]
-
-func.func @outerproduct_1d_0d_maxf() -> vector<1xf32> {
-  %a = arith.constant dense<[1.0]> : vector<1xf32>
-  %b = arith.constant 3.0 : f32
-  %init = arith.constant dense<[10.0]> : vector<1xf32>
-  %o = vector.outerproduct %a, %b, %init {kind = #vector.kind<maxf>}
-    : vector<1xf32>, f32
-  return %o : vector<1xf32>
-}
-
-// CHECK-LABEL: @outerproduct_1d_0d_maxf
-// CHECK-NEXT: Results
-// CHECK-NEXT{LITERAL}: [1.000000e+01]
-
-func.func @outerproduct_1d_0d_minf() -> vector<1xf32> {
-  %a = arith.constant dense<[1.0]> : vector<1xf32>
-  %b = arith.constant 3.0 : f32
-  %init = arith.constant dense<[10.0]> : vector<1xf32>
-  %o = vector.outerproduct %a, %b, %init {kind = #vector.kind<minf>}
-    : vector<1xf32>, f32
-  return %o : vector<1xf32>
-}
-
-// CHECK-LABEL: @outerproduct_1d_0d_minf
-// CHECK-NEXT: Results
-// CHECK-NEXT{LITERAL}: [3.000000e+00]
-
-func.func @outerproduct_1d_0d_or() -> vector<1xi32> {
-  %a = arith.constant dense<[3]> : vector<1xi32>
-  %b = arith.constant 1 : i32
-  %init = arith.constant dense<[9]> : vector<1xi32>
-  %o = vector.outerproduct %a, %b, %init {kind = #vector.kind<or>}
-    : vector<1xi32>, i32
-  return %o : vector<1xi32>
-}
-
-// CHECK-LABEL: @outerproduct_1d_0d_or
-// CHECK-NEXT: Results
-// CHECK-NEXT{LITERAL}: [11]
-
-func.func @outerproduct_1d_0d_xor() -> vector<1xi32> {
-  %a = arith.constant dense<[3]> : vector<1xi32>
-  %b = arith.constant 1 : i32
-  %init = arith.constant dense<[9]> : vector<1xi32>
-  %o = vector.outerproduct %a, %b, %init {kind = #vector.kind<xor>}
-    : vector<1xi32>, i32
-  return %o : vector<1xi32>
-}
-
-// CHECK-LABEL: @outerproduct_1d_0d_xor
-// CHECK-NEXT: Results
-// CHECK-NEXT{LITERAL}: [10]
diff --git a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/vector/reduction.mlir b/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/vector/reduction.mlir
deleted file mode 100644
index c3998aea47d852..00000000000000
--- a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/vector/reduction.mlir
+++ /dev/null
@@ -1,235 +0,0 @@
-// RUN: mlir-interpreter-runner %s -run-all | FileCheck %s
-
-func.func @reduction_add() -> i32 {
-  %a = arith.constant dense<[10, 4, 7]> : vector<3xi32>
-  %r = vector.reduction <add>, %a : vector<3xi32> into i32
-  return %r : i32
-}
-
-// CHECK-LABEL: @reduction_add
-// CHECK-NEXT: Results
-// CHECK-NEXT: 21
-
-func.func @reduction_minf_degenerate() -> f32 {
-  %a = arith.constant dense<[1.0]> : vector<1xf32>
-  %r = vector.reduction <minf>, %a : vector<1xf32> into f32
-  return %r : f32
-}
-
-// CHECK-LABEL: @reduction_minf_degenerate
-// CHECK-NEXT: Results
-// CHECK-NEXT: 1.000000e+00
-
-func.func @reduction_minf() -> f32 {
-  %a = arith.constant dense<[10.0, 4.0, 7.0]> : vector<3xf32>
-  %r = vector.reduction <minf>, %a : vector<3xf32> into f32
-  return %r : f32
-}
-
-// CHECK-LABEL: @reduction_minf
-// CHECK-NEXT: Results
-// CHECK-NEXT: 4.000000e+00
-
-func.func @reduction_acc() -> f32 {
-  %a = arith.constant dense<[10.0, 4.0, 7.0]> : vector<3xf32>
-  %acc = arith.constant 3.0 : f32
-  %r = vector.reduction <minf>, %a, %acc : vector<3xf32> into f32
-  return %r : f32
-}
-
-// CHECK-LABEL: @reduction_acc
-// CHECK-NEXT: Results
-// CHECK-NEXT: 3.000000e+00
-
-func.func @reduction_acc_first_is_minimum() -> f32 {
-  %a = arith.constant dense<[1.0, 4.0, 7.0]> : vector<3xf32>
-  %acc = arith.constant 3.0 : f32
-  %r = vector.reduction <minf>, %a, %acc : vector<3xf32> into f32
-  return %r : f32
-}
-
-// CHECK-LABEL: @reduction_acc_first_is_minimum
-// CHECK-NEXT: Results
-// CHECK-NEXT: 1.000000e+00
-
-func.func @masked_and() -> i32 {
-  %a = arith.constant dense<[255, 127, 6]> : vector<3xi32>
-  %m = arith.constant dense<[true, true, false]> : vector<3xi1>
-  %r = vector.mask %m {
-    vector.reduction <and>, %a : vector<3xi32> into i32
-  } : vector<3xi1> -> i32
-  return %r : i32
-}
-
-// CHECK-LABEL: @masked_and
-// CHECK-NEXT: Results
-// CHECK-NEXT: i32: 127
-
-func.func @masked_xor() -> i32 {
-  %a = arith.constant dense<[255, 1, 3]> : vector<3xi32>
-  %m = arith.constant dense<[false, true, true]> : vector<3xi1>
-  %r = vector.mask %m {
-    vector.reduction <xor>, %a : vector<3xi32> into i32
-  } : vector<3xi1> -> i32
-  return %r : i32
-}
-
-// CHECK-LABEL: @masked_xor
-// CHECK-NEXT: Results
-// CHECK-NEXT: i32: 2
-
-func.func @masked_or() -> i32 {
-  %a = arith.constant dense<[255, 1, 3]> : vector<3xi32>
-  %m = arith.constant dense<[false, true, true]> : vector<3xi1>
-  %r = vector.mask %m {
-    vector.reduction <or>, %a : vector<3xi32> into i32
-  } : vector<3xi1> -> i32
-  return %r : i32
-}
-
-// CHECK-LABEL: @masked_or
-// CHECK-NEXT: Results
-// CHECK-NEXT: i32: 3
-
-func.func @masked_add_i32() -> i32 {
-  %a = arith.constant dense<[255, 1, 3]> : vector<3xi32>
-  %m = arith.constant dense<[false, true, true]> : vector<3xi1>
-  %r = vector.mask %m {
-    vector.reduction <add>, %a : vector<3xi32> into i32
-  } : vector<3xi1> -> i32
-  return %r : i32
-}
-
-// CHECK-LABEL: @masked_add_i32
-// CHECK-NEXT: Results
-// CHECK-NEXT: i32: 4
-
-func.func @masked_add_f32() -> f32 {
-  %a = arith.constant dense<[255.0, 1.0, 3.0]> : vector<3xf32>
-  %m = arith.constant dense<[false, true, true]> : vector<3xi1>
-  %r = vector.mask %m {
-    vector.reduction <add>, %a : vector<3xf32> into f32
-  } : vector<3xi1> -> f32
-  return %r : f32
-}
-
-// CHECK-LABEL: @masked_add_f32
-// CHECK-NEXT: Results
-// CHECK-NEXT: f32: 4.0
-
-func.func @masked_mul_i32() -> i32 {
-  %a = arith.constant dense<[255, 2, 3]> : vector<3xi32>
-  %m = arith.constant dense<[false, true, true]> : vector<3xi1>
-  %r = vector.mask %m {
-    vector.reduction <mul>, %a : vector<3xi32> into i32
-  } : vector<3xi1> -> i32
-  return %r : i32
-}
-
-// CHECK-LABEL: @masked_mul_i32
-// CHECK-NEXT: Results
-// CHECK-NEXT: i32: 6
-
-func.func @masked_mul_f32() -> f32 {
-  %a = arith.constant dense<[255.0, 2.0, 3.0]> : vector<3xf32>
-  %m = arith.constant dense<[false, true, true]> : vector<3xi1>
-  %r = vector.mask %m {
-    vector.reduction <mul>, %a : vector<3xf32> into f32
-  } : vector<3xi1> -> f32
-  return %r : f32
-}
-
-// CHECK-LABEL: @masked_mul_f32
-// CHECK-NEXT: Results
-// CHECK-NEXT: f32: 6.0
-
-func.func @masked_minsi() -> i32 {
-  %a = arith.constant dense<[255, -2, -5]> : vector<3xi32>
-  %m = arith.constant dense<[true, true, false]> : vector<3xi1>
-  %r = vector.mask %m {
-    vector.reduction <minsi>, %a : vector<3xi32> into i32
-  } : vector<3xi1> -> i32
-  return %r : i32
-}
-
-// CHECK-LABEL: @masked_minsi
-// CHECK-NEXT: Results
-// CHECK-NEXT: i32: -2
-
-func.func @masked_minui() -> i32 {
-  %a = arith.constant dense<[255, -2, -5]> : vector<3xi32>
-  %m = arith.constant dense<[true, true, false]> : vector<3xi1>
-  %r = vector.mask %m {
-    vector.reduction <minui>, %a : vector<3xi32> into i32
-  } : vector<3xi1> -> i32
-  return %r : i32
-}
-
-// CHECK-LABEL: @masked_minui
-// CHECK-NEXT: Results
-// CHECK-NEXT: i32: 255
-
-func.func @masked_maxsi() -> i32 {
-  %a = arith.constant dense<[255, -2, 500]> : vector<3xi32>
-  %m = arith.constant dense<[true, true, false]> : vector<3xi1>
-  %r = vector.mask %m {
-    vector.reduction <maxsi>, %a : vector<3xi32> into i32
-  } : vector<3xi1> -> i32
-  return %r : i32
-}
-
-// CHECK-LABEL: @masked_maxsi
-// CHECK-NEXT: Results
-// CHECK-NEXT: i32: 255
-
-func.func @masked_maxui() -> i32 {
-  %a = arith.constant dense<[255, -5, -2]> : vector<3xi32>
-  %m = arith.constant dense<[true, true, false]> : vector<3xi1>
-  %r = vector.mask %m {
-    vector.reduction <maxui>, %a : vector<3xi32> into i32
-  } : vector<3xi1> -> i32
-  return %r : i32
-}
-
-// CHECK-LABEL: @masked_maxui
-// CHECK-NEXT: Results
-// CHECK-NEXT: i32: -5
-
-func.func @masked_minf() -> f32 {
-  %a = arith.constant dense<[255.0, 2.0, 3.0]> : vector<3xf32>
-  %m = arith.constant dense<[true, false, true]> : vector<3xi1>
-  %r = vector.mask %m {
-    vector.reduction <minf>, %a : vector<3xf32> into f32
-  } : vector<3xi1> -> f32
-  return %r : f32
-}
-
-// CHECK-LABEL: @masked_minf
-// CHECK-NEXT: Results
-// CHECK-NEXT: f32: 3.0
-
-func.func @masked_maxf() -> f32 {
-  %a = arith.constant dense<[255.0, 2.0, 3.0]> : vector<3xf32>
-  %m = arith.constant dense<[false, true, true]> : vector<3xi1>
-  %r = vector.mask %m {
-    vector.reduction <maxf>, %a : vector<3xf32> into f32
-  } : vector<3xi1> -> f32
-  return %r : f32
-}
-
-// CHECK-LABEL: @masked_maxf
-// CHECK-NEXT: Results
-// CHECK-NEXT: f32: 3.0
-
-func.func @masked_maxf_empty() -> f32 {
-  %a = arith.constant dense<[255.0]> : vector<1xf32>
-  %m = arith.constant dense<[false]> : vector<1xi1>
-  %r = vector.mask %m {
-    vector.reduction <maxf>, %a : vector<1xf32> into f32
-  } : vector<1xi1> -> f32
-  return %r : f32
-}
-
-// CHECK-LABEL: @masked_maxf_empty
-// CHECK-NEXT: Results
-// CHECK-NEXT: f32: -INF
diff --git a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/vector/shape_cast.mlir b/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/vector/shape_cast.mlir
deleted file mode 100644
index 333adcbc4b1dcb..00000000000000
--- a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/vector/shape_cast.mlir
+++ /dev/null
@@ -1,23 +0,0 @@
-// RUN: mlir-interpreter-runner %s -run-all | FileCheck %s
-
-func.func @shape_cast() -> vector<2x2xi32> {
-  %a = arith.constant dense<[[1, 2, 3, 4]]> : vector<1x4xi32>
-  %cast = vector.shape_cast %a : vector<1x4xi32> to vector<2x2xi32>
-  return %cast : vector<2x2xi32>
-}
-
-// CHECK-LABEL: @shape_cast
-// CHECK-NEXT: Results
-// CHECK-NEXT{LITERAL}: [[1, 2], [3, 4]]
-
-func.func @cast_of_transpose() -> (vector<3x2xi32>, vector<2x3xi32>) {
-  %a = arith.constant dense<[[1, 2, 3], [4, 5, 6]]> : vector<2x3xi32>
-  %b = vector.transpose %a, [1, 0] : vector<2x3xi32> to vector<3x2xi32>
-  %cast = vector.shape_cast %b : vector<3x2xi32> to vector<2x3xi32>
-  return %b, %cast : vector<3x2xi32>, vector<2x3xi32>
-}
-
-// CHECK-LABEL: @cast_of_transpose
-// CHECK-NEXT: Results
-// CHECK-NEXT{LITERAL}: [[1, 4], [2, 5], [3, 6]]
-// CHECK-NEXT{LITERAL}: [[1, 4, 2], [5, 3, 6]]
diff --git a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/vector/shuffle.mlir b/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/vector/shuffle.mlir
deleted file mode 100644
index 3fb3d881fbee51..00000000000000
--- a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/vector/shuffle.mlir
+++ /dev/null
@@ -1,34 +0,0 @@
-// RUN: mlir-interpreter-runner %s -run-all | FileCheck %s
-
-func.func @shuffle0d() -> vector<4xi32> {
-  %c1 = arith.constant dense<1> : vector<i32>
-  %c2 = arith.constant dense<2> : vector<i32>
-  %shuffle = vector.shuffle %c1, %c2[0, 1, 1, 0] : vector<i32>, vector<i32>
-  return %shuffle : vector<4xi32>
-}
-
-// CHECK-LABEL: @shuffle0d
-// CHECK-NEXT: Results
-// CHECK-NEXT: vector<4xi32>: [1, 2, 2, 1]
-
-func.func @shuffle1d() -> vector<4xi32> {
-  %c1 = arith.constant dense<[1, 2]> : vector<2xi32>
-  %c2 = arith.constant dense<[11, 12, 13, 14]> : vector<4xi32>
-  %shuffle = vector.shuffle %c1, %c2[0, 2, 5, 1] : vector<2xi32>, vector<4xi32>
-  return %shuffle : vector<4xi32>
-}
-
-// CHECK-LABEL: @shuffle1d
-// CHECK-NEXT: Results
-// CHECK-NEXT: vector<4xi32>: [1, 11, 14, 2]
-
-func.func @shuffle2d() -> vector<3x2xi32> {
-  %c1 = arith.constant dense<[[1, 2], [11, 12]]> : vector<2x2xi32>
-  %c2 = arith.constant dense<[[21, 22]]> : vector<1x2xi32>
-  %shuffle = vector.shuffle %c1, %c2[0, 2, 1] : vector<2x2xi32>, vector<1x2xi32>
-  return %shuffle : vector<3x2xi32>
-}
-
-// CHECK-LABEL: @shuffle2d
-// CHECK-NEXT: Results
-// CHECK-NEXT{LITERAL}: vector<3x2xi32>: [[1, 2], [21, 22], [11, 12]]
diff --git a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/vector/splat.mlir b/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/vector/splat.mlir
deleted file mode 100644
index bfbb76b38af58f..00000000000000
--- a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/vector/splat.mlir
+++ /dev/null
@@ -1,11 +0,0 @@
-// RUN: mlir-interpreter-runner %s -run-all | FileCheck %s
-
-func.func @splat() -> vector<2x4xi32> {
-  %c42 = arith.constant 42 : i32
-  %splat = vector.splat %c42 : vector<2x4xi32>
-  return %splat : vector<2x4xi32>
-}
-
-// CHECK-LABEL: @splat
-// CHECK-NEXT: Results
-// CHECK-NEXT{LITERAL}: vector<2x4xi32>: [[42, 42, 42, 42], [42, 42, 42, 42]]
diff --git a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/vector/store.mlir b/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/vector/store.mlir
deleted file mode 100644
index f97f6435d61230..00000000000000
--- a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/vector/store.mlir
+++ /dev/null
@@ -1,39 +0,0 @@
-// RUN: mlir-interpreter-runner %s -run-all | FileCheck %s
-
-func.func @store_vector_memref() -> memref<1x2xvector<2xi32>> {
-  %m = memref.alloc() : memref<1x2xvector<2xi32>>
-  %c0 = arith.constant 0 : index
-  %c1 = arith.constant 1 : index
-  %v = arith.constant dense<[1,2]> : vector<2xi32>
-  vector.store %v, %m[%c0, %c1] : memref<1x2xvector<2xi32>>, vector<2xi32>
-  return %m : memref<1x2xvector<2xi32>>
-}
-
-// CHECK-LABEL: @store_vector_memref
-// CHECK-NEXT: Results
-// CHECK-NEXT{LITERAL}: TensorOrMemref<1x2xvector<2xi32>>: [[[0, 0], [1, 2]]]
-
-func.func @store_scalar_memref() -> memref<2x2xi32> {
-  %m = memref.alloc() : memref<2x2xi32>
-  %v = arith.constant dense<[[1,2]]> : vector<1x2xi32>
-  %c0 = arith.constant 0 : index
-  %c1 = arith.constant 1 : index
-  vector.store %v, %m[%c1, %c0] : memref<2x2xi32>, vector<1x2xi32>
-  return %m : memref<2x2xi32>
-}
-
-// CHECK-LABEL: @store_scalar_memref
-// CHECK-NEXT: Results
-// CHECK-NEXT{LITERAL}: TensorOrMemref<2x2xi32>: [[0, 0], [1, 2]]
-
-func.func @store_oob() -> memref<2x2xi32> {
-  %m = memref.alloc() : memref<2x2xi32>
-  %v = arith.constant dense<[[1,2]]> : vector<1x2xi32>
-  %c1 = arith.constant 1 : index
-  vector.store %v, %m[%c1, %c1] : memref<2x2xi32>, vector<1x2xi32>
-  return %m : memref<2x2xi32>
-}
-
-// CHECK-LABEL: @store_oob
-// CHECK-NEXT: Results
-// CHECK-NEXT{LITERAL}: TensorOrMemref<2x2xi32>: [[0, 0], [0, 1]]
diff --git a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/vector/transfer_read.mlir b/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/vector/transfer_read.mlir
deleted file mode 100644
index 6cafe0b2f4cd06..00000000000000
--- a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/vector/transfer_read.mlir
+++ /dev/null
@@ -1,118 +0,0 @@
-// RUN: mlir-interpreter-runner %s -run-all | FileCheck %s
-
-// Adapted from mlir/test/Integration/Dialect/Vector/CPU/test-transfer-read.mlir.
-
-func.func @transfer_read_1d() -> vector<13xi32> {
-  %a = arith.constant dense<[0, 1, 2, 3, 4]> : memref<5xi32>
-  %c2 = arith.constant 2 : index
-  %c-42 = arith.constant -42 : i32
-  %f = vector.transfer_read %a[%c2], %c-42
-      {permutation_map = affine_map<(d0) -> (d0)>} :
-    memref<5xi32>, vector<13xi32>
-  return %f : vector<13xi32>
-}
-
-// CHECK-LABEL: @transfer_read_1d
-// CHECK-NEXT: Results
-// CHECK-NEXT: vector<13xi32>: [2, 3, 4, -42, -42, -42, -42, -42, -42, -42, -42, -42, -42]
-
-func.func @transfer_read_mask_1d() -> vector<13xi32> {
-  %a = arith.constant dense<[0, 1, 2, 3, 4]> : memref<5xi32>
-  %c2 = arith.constant 2 : index
-  %c-42 = arith.constant -42: i32
-  %m = arith.constant dense<[0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0]> : vector<13xi1>
-  %f = vector.transfer_read %a[%c2], %c-42, %m : memref<5xi32>, vector<13xi32>
-  return %f : vector<13xi32>
-}
-
-// CHECK-LABEL: @transfer_read_mask_1d
-// CHECK-NEXT: Results
-// CHECK-NEXT: vector<13xi32>: [-42, -42, 4, -42, -42, -42, -42, -42, -42, -42, -42, -42, -42]
-
-func.func @transfer_read_vector_mask() -> vector<6xi32> {
-  %a = arith.constant dense<[0, 1, 2, 3, 4]> : memref<5xi32>
-  %c2 = arith.constant 2 : index
-  %c-42 = arith.constant -42: i32
-  %m = arith.constant dense<[0, 0, 1, 1, 1, 1]> : vector<6xi1>
-  %passthrough = arith.constant dense<[-1, -2, -3, -4, -5, -6]> : vector<6xi32>
-  %f = vector.mask %m, %passthrough {
-    vector.transfer_read %a[%c2], %c-42 : memref<5xi32>, vector<6xi32>
-  } : vector<6xi1> -> vector<6xi32>
-  return %f : vector<6xi32>
-}
-
-// CHECK-LABEL: @transfer_read_vector_mask
-// CHECK-NEXT: Results
-// CHECK-NEXT: vector<6xi32>: [-1, -2, 4, -42, -42, -42]
-
-func.func @transfer_read_inbounds_4() -> vector<4xi32> {
-  %a = arith.constant dense<[0, 1, 2, 0, 0]> : memref<5xi32>
-  %c-42 = arith.constant -42: i32
-  %c1 = arith.constant 1 : index
-  %f = vector.transfer_read %a[%c1], %c-42
-      {permutation_map = affine_map<(d0) -> (d0)>, in_bounds = [true]} :
-    memref<5xi32>, vector<4xi32>
-  return %f : vector<4xi32>
-}
-
-// CHECK-LABEL: @transfer_read_inbounds_4
-// CHECK-NEXT: Results
-// CHECK-NEXT: vector<4xi32>: [1, 2, 0, 0]
-
-func.func @transfer_read_mask_inbounds_4() -> vector<4xi32> {
-  %a = arith.constant dense<[0, 1, 2, 0, 0]> : memref<5xi32>
-  %c-42 = arith.constant -42: i32
-  %c1 = arith.constant 1 : index
-  %m = arith.constant dense<[0, 1, 0, 1]> : vector<4xi1>
-  %f = vector.transfer_read %a[%c1], %c-42, %m {in_bounds = [true]}
-      : memref<5xi32>, vector<4xi32>
-  return %f : vector<4xi32>
-}
-
-// CHECK-LABEL: @transfer_read_mask_inbounds_4
-// CHECK-NEXT: Results
-// CHECK-NEXT: vector<4xi32>: [-42, 2, -42, 0]
-
-func.func @transfer_read_2d()-> vector<2x2xi32> {
-  %a = arith.constant dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11]]>
-    : memref<3x4xi32>
-  %c0 = arith.constant 0 : index
-  %c1 = arith.constant 1 : index
-  %c-42 = arith.constant -42: i32
-  %f = vector.transfer_read %a[%c1, %c0], %c-42
-      : memref<3x4xi32>, vector<2x2xi32>
-  return %f : vector<2x2xi32>
-}
-
-// CHECK-LABEL: @transfer_read_2d
-// CHECK-NEXT: Results
-// CHECK-NEXT{LITERAL}: vector<2x2xi32>: [[4, 5], [8, 9]]
-
-func.func @transfer_read_2d_1d() -> vector<2xi32> {
-  %a = arith.constant dense<[[0, 1, 2, 3], [4, 5, 6, 7]]> : memref<2x4xi32>
-  %c0 = arith.constant 0 : index
-  %c1 = arith.constant 1 : index
-  %c-42 = arith.constant -42: i32
-  %f = vector.transfer_read %a[%c1, %c0], %c-42
-      : memref<2x4xi32>, vector<2xi32>
-  return %f : vector<2xi32>
-}
-
-// CHECK-LABEL: @transfer_read_2d_1d
-// CHECK-NEXT: Results
-// CHECK-NEXT{LITERAL}: vector<2xi32>: [4, 5]
-
-func.func @transfer_read_i1() -> vector<2xi1> {
-  %a = arith.constant dense<[true, false]> : memref<2xi1>
-  %c0 = arith.constant 0 : index
-  %false = arith.constant false
-  // Note: this read is technically illegal, but it can occur in practice and
-  // persist for a while until it's lowered to something legal.
-  %ret = vector.transfer_read %a[%c0], %false {in_bounds = [true]}
-    : memref<2xi1>, vector<2xi1>
-  return %ret : vector<2xi1>
-}
-
-// CHECK-LABEL: @transfer_read_i1
-// CHECK-NEXT: Results
-// CHECK-NEXT{LITERAL}: [true, false]
diff --git a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/vector/transfer_write.mlir b/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/vector/transfer_write.mlir
deleted file mode 100644
index 6ca9eb8860395f..00000000000000
--- a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/vector/transfer_write.mlir
+++ /dev/null
@@ -1,91 +0,0 @@
-// RUN: mlir-interpreter-runner %s -run-all | FileCheck %s
-
-func.func @write_4_at_3_inbounds() -> memref<?xi32> {
-  %c8 = arith.constant 8 : index
-  %a = memref.alloc(%c8) : memref<?xi32>
-  %base = arith.constant 3 : index
-  %f = arith.constant dense<[1, 2, 3, 4]> : vector<4xi32>
-  vector.transfer_write %f, %a[%base]
-    {permutation_map = affine_map<(d0) -> (d0)>, in_bounds = [true]}
-    : vector<4xi32>, memref<?xi32>
-  return %a : memref<?xi32>
-}
-
-// CHECK-LABEL: @write_4_at_3_inbounds
-// CHECK-NEXT: Results
-// CHECK-NEXT: [0, 0, 0, 1, 2, 3, 4, 0]
-
-func.func @write_6_at_5() -> memref<8xi32> {
-  %a = memref.alloc() : memref<8xi32>
-  %base = arith.constant 5 : index
-  %f = arith.constant dense<[1, 2, 3, 4, 5, 6]> : vector<6xi32>
-  vector.transfer_write %f, %a[%base]
-    {permutation_map = affine_map<(d0) -> (d0)>}
-    : vector<6xi32>, memref<8xi32>
-  return %a : memref<8xi32>
-}
-
-// CHECK-LABEL: @write_6_at_5
-// CHECK-NEXT: Results
-// CHECK-NEXT: [0, 0, 0, 0, 0, 1, 2, 3]
-
-func.func @write_to_tensor() -> (tensor<3xi32>, tensor<3xi32>) {
-  %a = arith.constant dense<[1,2,3]> : tensor<3xi32>
-  %base = arith.constant 1 : index
-  %f = arith.constant dense<[4, 5]> : vector<2xi32>
-  %b = vector.transfer_write %f, %a[%base]
-    {permutation_map = affine_map<(d0) -> (d0)>}
-    : vector<2xi32>, tensor<3xi32>
-  return %a, %b : tensor<3xi32>, tensor<3xi32>
-}
-
-// CHECK-LABEL: @write_to_tensor
-// CHECK-NEXT: Results
-// CHECK-NEXT: [1, 2, 3]
-// CHECK-NEXT: [1, 4, 5]
-
-func.func @write_masked() -> memref<4xi32> {
-  %a = arith.constant dense<[10, 11, 12, 13]> : memref<4xi32>
-  %base = arith.constant 1 : index
-  %f = arith.constant dense<[1, 2, 3]> : vector<3xi32>
-  %mask = arith.constant dense<[true, false, true]> : vector<3xi1>
-  vector.transfer_write %f, %a[%base], %mask
-    {permutation_map = affine_map<(d0) -> (d0)>}
-    : vector<3xi32>, memref<4xi32>
-  return %a : memref<4xi32>
-}
-
-// CHECK-LABEL: @write_masked
-// CHECK-NEXT: Results
-// CHECK-NEXT: [10, 1, 12, 3]
-
-func.func @write_vector_mask() -> memref<4xi32> {
-  %a = arith.constant dense<[10, 11, 12, 13]> : memref<4xi32>
-  %base = arith.constant 1 : index
-  %f = arith.constant dense<[1, 2, 3]> : vector<3xi32>
-  %mask = arith.constant dense<[true, false, true]> : vector<3xi1>
-  vector.mask %mask {
-    vector.transfer_write %f, %a[%base]
-      {permutation_map = affine_map<(d0) -> (d0)>}
-      : vector<3xi32>, memref<4xi32>
-  } : vector<3xi1>
-  return %a : memref<4xi32>
-}
-
-// CHECK-LABEL: @write_vector_mask
-// CHECK-NEXT: Results
-// CHECK-NEXT: [10, 1, 12, 3]
-
-func.func @write_1d_to_2d() -> memref<2x4xi32> {
-  %a = memref.alloc() : memref<2x4xi32>
-  %c1 = arith.constant 1 : index
-  %c2 = arith.constant 2 : index
-  %f = arith.constant dense<[1, 2]> : vector<2xi32>
-  vector.transfer_write %f, %a[%c1, %c2]
-    : vector<2xi32>, memref<2x4xi32>
-  return %a : memref<2x4xi32>
-}
-
-// CHECK-LABEL: @write_1d_to_2d
-// CHECK-NEXT: Results
-// CHECK-NEXT{LITERAL}: [[0, 0, 0, 0], [0, 0, 1, 2]]
\ No newline at end of file
diff --git a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/vector/transpose.mlir b/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/vector/transpose.mlir
deleted file mode 100644
index 3c39327d67d92a..00000000000000
--- a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/vector/transpose.mlir
+++ /dev/null
@@ -1,28 +0,0 @@
-// RUN: mlir-interpreter-runner %s -run-all | FileCheck %s
-
-func.func @transpose() -> vector<2x1x4x3xi32> {
-  %0 = arith.constant dense<[[[
-      [000, 001, 002, 003],
-      [010, 011, 012, 013],
-      [020, 021, 022, 023]
-    ],
-    [
-      [100, 101, 102, 103],
-      [110, 111, 112, 113],
-      [120, 121, 122, 123]
-    ]]]> : vector<1x2x3x4xi32>
-  %1 = vector.transpose %0, [1, 0, 3, 2]
-    : vector<1x2x3x4xi32> to vector<2x1x4x3xi32>
-  return %1 : vector<2x1x4x3xi32>
-}
-
-// CHECK-LABEL: @transpose
-// CHECK-NEXT: Results
-// CHECK-NEXT{LITERAL}: [[[[0, 10, 20],
-// CHECK{LITERAL}:         [1, 11, 21],
-// CHECK{LITERAL}:         [2, 12, 22],
-// CHECK{LITERAL}:         [3, 13, 23]]],
-// CHECK{LITERAL}:       [[[100, 110, 120],
-// CHECK{LITERAL}:         [101, 111, 121],
-// CHECK{LITERAL}:         [102, 112, 122],
-// CHECK{LITERAL}:         [103, 113, 123]]]]
diff --git a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/vector/type_cast.mlir b/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/vector/type_cast.mlir
deleted file mode 100644
index fecc8e21466152..00000000000000
--- a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/vector/type_cast.mlir
+++ /dev/null
@@ -1,11 +0,0 @@
-// RUN: mlir-interpreter-runner %s -run-all | FileCheck %s
-
-func.func @type_cast() -> memref<vector<2x2xi32>> {
-  %alloc = arith.constant dense<[[1, 2], [3, 4]]> : memref<2x2xi32>
-  %cast = vector.type_cast %alloc: memref<2x2xi32> to memref<vector<2x2xi32>>
-  return %cast : memref<vector<2x2xi32>>
-}
-
-// CHECK-LABEL: @type_cast
-// CHECK-NEXT: Results
-// CHECK-NEXT{LITERAL}: TensorOrMemref<vector<2x2xi32>>: [[1, 2], [3, 4]]
diff --git a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/vector/vscale.mlir b/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/vector/vscale.mlir
deleted file mode 100644
index 6d162357d57936..00000000000000
--- a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/vector/vscale.mlir
+++ /dev/null
@@ -1,12 +0,0 @@
-// RUN: mlir-interpreter-runner %s -run-all | FileCheck %s
-
-func.func @vscale() -> (vector<[4]xi32>, index) {
-  %c = arith.constant dense<0> : vector<[4]xi32>
-  %vscale = vector.vscale
-  return %c, %vscale : vector<[4]xi32>, index
-}
-
-// CHECK-LABEL: @vscale
-// CHECK-NEXT: Results
-// CHECK-NEXT: [0, 0, 0, 0]
-// CHECK-NEXT: 1
\ No newline at end of file
diff --git a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/xla_cpu/memref_element_cast.mlir b/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/xla_cpu/memref_element_cast.mlir
deleted file mode 100644
index d238b450f989d3..00000000000000
--- a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/tests/xla_cpu/memref_element_cast.mlir
+++ /dev/null
@@ -1,11 +0,0 @@
-// RUN: mlir-interpreter-runner %s -run-all | FileCheck %s
-
-func.func @memref_element_cast() -> tensor<2xi8> {
-  %c = arith.constant dense<[true, false]> : tensor<2xi1>
-  %ret = "xla_cpu.memref_element_cast"(%c) : (tensor<2xi1>) -> tensor<2xi8>
-  return %ret : tensor<2xi8>
-}
-
-// CHECK-LABEL: @memref_element_cast
-// CHECK-NEXT: Results
-// CHECK-NEXT: [1, 0]
diff --git a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/thlo.cc b/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/thlo.cc
deleted file mode 100644
index b626f68a27f22f..00000000000000
--- a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/thlo.cc
+++ /dev/null
@@ -1,121 +0,0 @@
-/* Copyright 2023 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "thlo/IR/thlo_ops.h"
-#include "tools/mlir_interpreter/dialects/util.h"
-#include "tools/mlir_interpreter/framework/interpreter.h"
-#include "tools/mlir_interpreter/framework/interpreter_value.h"
-#include "tools/mlir_interpreter/framework/registration.h"
-
-namespace mlir {
-namespace interpreter {
-namespace {
-
-llvm::SmallVector<InterpreterValue> scatter(InterpreterState& state,
-                                            thlo::ScatterOp scatter,
-                                            const InterpreterValue& indices,
-                                            const InterpreterValue& updates,
-                                            InterpreterValue init) {
-  auto out = scatter.getNumResults() == 0 ? init : init.clone();
-
-  const auto& inputView = init.view();
-  const auto& updatesView = updates.view();
-  int64_t operandRank = inputView.rank();
-
-  for (const auto& updateIndices : updatesView.indices()) {
-    llvm::SmallVector<int64_t> inputIndices(operandRank);
-    llvm::SmallVector<int64_t> maxIndices(operandRank);
-    llvm::SmallVector<int64_t> minIndices(operandRank);
-
-    for (int64_t dim = 0; dim < operandRank; ++dim) {
-      int64_t scatterIndex =
-          indices.view().sizes[1] > dim
-              ? indices.extractElement({updateIndices[0], dim}).asInt()
-              : 0;
-      inputIndices[dim] = updateIndices[dim + 1] + scatterIndex;
-      minIndices[dim] = scatterIndex;
-      maxIndices[dim] = updatesView.sizes[dim + 1] - 1 + scatterIndex;
-    }
-
-    if (!inputView.inBounds(minIndices)) continue;
-    if (!inputView.inBounds(maxIndices)) continue;
-
-    auto currentValue = out.extractElement(inputIndices);
-    auto update = updates.extractElement(updateIndices);
-
-    auto result = interpret(state, scatter.getUpdateComputation(),
-                            {update, currentValue});
-    if (state.hasFailure()) {
-      break;
-    }
-    out.insertElement(inputIndices, result.front());
-  }
-
-  if (scatter.getNumResults() == 0) return {};
-  return {out};
-}
-
-llvm::SmallVector<InterpreterValue> concatenate(
-    InterpreterState&, thlo::ConcatenateOp op,
-    ArrayRef<InterpreterValue> values, InterpreterValue init) {
-  if (op.getNumResults() > 0) {
-    init = init.clone();
-  }
-  int64_t dim = op.getDimension().getSExtValue();
-  int64_t offset = 0;
-  for (const auto& value : values) {
-    for (auto index : value.view().indices()) {
-      auto val = value.extractElement(index);
-      index[dim] += offset;
-      init.insertElement(index, val);
-    }
-    offset += value.view().sizes[dim];
-  }
-  if (op.getNumResults() > 0) return {init};
-  return {};
-}
-
-llvm::SmallVector<InterpreterValue> reverse(InterpreterState& state,
-                                            thlo::ReverseOp op,
-                                            const InterpreterValue& in,
-                                            InterpreterValue init) {
-  // No bufferized thlo.reverse currently exists. This if guards against
-  // possible future changes to that.
-  if (op->getNumResults() != 1) {
-    state.addFailure("bufferized thlo.reverse unsupported");
-    return {};
-  }
-
-  init = init.clone();
-  init.fill([&](ArrayRef<int64_t> indices) {
-    auto srcIndex = llvm::to_vector(indices);
-    for (int64_t dim : op.getReverseDimensions()) {
-      srcIndex[dim] = in.view().sizes[dim] - 1 - srcIndex[dim];
-    }
-    return in.extractElement(srcIndex);
-  });
-
-  if (op->getNumResults() > 0) return {init};
-  return {};
-}
-
-REGISTER_MLIR_INTERPRETER_OP("thlo.yield", noOpTerminator);
-REGISTER_MLIR_INTERPRETER_OP(concatenate);
-REGISTER_MLIR_INTERPRETER_OP(scatter);
-REGISTER_MLIR_INTERPRETER_OP(reverse);
-
-}  // namespace
-}  // namespace interpreter
-}  // namespace mlir
diff --git a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/util.cc b/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/util.cc
deleted file mode 100644
index addb4ac1abd4d4..00000000000000
--- a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/util.cc
+++ /dev/null
@@ -1,169 +0,0 @@
-/* Copyright 2022 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "tools/mlir_interpreter/dialects/util.h"
-
-#include <variant>
-
-#include "mlir/IR/AffineExpr.h"
-#include "mlir/Support/MathExtras.h"
-#include "tools/mlir_interpreter/framework/tensor_or_memref.h"
-
-namespace mlir {
-namespace interpreter {
-
-SmallVector<int64_t> replaceDynamicVals(llvm::ArrayRef<int64_t> staticVals,
-                                        ArrayRef<InterpreterValue>& args) {
-  llvm::SmallVector<int64_t> out;
-  for (int64_t val : staticVals) {
-    if (ShapedType::isDynamic(val)) {
-      out.push_back(std::get<int64_t>(args.front().storage));
-      args = args.drop_front(1);
-    } else {
-      out.push_back(val);
-    }
-  }
-  return out;
-}
-
-SmallVector<int64_t> replaceDynamicVals(ArrayRef<int64_t> staticVals,
-                                        ArrayRef<int64_t> dynamicVals) {
-  llvm::SmallVector<int64_t> out;
-  for (int64_t val : staticVals) {
-    if (ShapedType::isDynamic(val)) {
-      out.push_back(dynamicVals.front());
-      dynamicVals = dynamicVals.drop_front(1);
-    } else {
-      out.push_back(val);
-    }
-  }
-  assert(dynamicVals.empty() && "expected no leftover dynamic values");
-  return out;
-}
-
-OffsetsSizesStrides extractOffsetsSizesStrides(
-    ArrayRef<InterpreterValue> args, OffsetSizeAndStrideOpInterface op) {
-  auto offsets = replaceDynamicVals(op.getStaticOffsets(), args);
-  auto sizes = replaceDynamicVals(op.getStaticSizes(), args);
-  auto strides = replaceDynamicVals(op.getStaticStrides(), args);
-  return {offsets, sizes, strides};
-}
-
-OffsetsSizesStrides extractOffsetsSizesStrides(
-    ArrayRef<int64_t> dynamicOffsets, ArrayRef<int64_t> dynamicSizes,
-    ArrayRef<int64_t> dynamicStrides, OffsetSizeAndStrideOpInterface op) {
-  auto offsets = replaceDynamicVals(op.getStaticOffsets(), dynamicOffsets);
-  auto sizes = replaceDynamicVals(op.getStaticSizes(), dynamicSizes);
-  auto strides = replaceDynamicVals(op.getStaticStrides(), dynamicStrides);
-  return {offsets, sizes, strides};
-}
-
-InterpreterValue reshapeTensor(const InterpreterValue& in,
-                               ArrayRef<int64_t> shape) {
-  // This doesn't need a copy in many cases, but it's easier that way.
-  auto out = in.typedAlike(shape);
-  for (const auto& [inIndex, outIndex] :
-       llvm::zip(in.view().indices(), out.view().indices())) {
-    out.insertElement(outIndex, in.extractElement(inIndex));
-  }
-  return out;
-}
-
-InterpreterValue getInitOperand(mlir::Operation* op, int64_t index,
-                                MutableArrayRef<InterpreterValue> args) {
-  return getInitOperand(op->getOperands(), index, args);
-}
-
-InterpreterValue getInitOperand(mlir::ValueRange values, int64_t index,
-                                ArrayRef<InterpreterValue> args) {
-  assert(args.size() == values.size() && "expected matching sizes");
-  return values[index].getType().isa<TensorType>() ? args[index].clone()
-                                                   : args[index];
-}
-
-InterpreterValue transposeImpl(const InterpreterValue& in,
-                               ArrayRef<int64_t> permutation) {
-  auto out = in;
-  auto& view = out.view();
-
-  view.sizes.clear();
-  view.strides.clear();
-  for (int64_t p : permutation) {
-    view.sizes.push_back(in.view().sizes[p]);
-    view.strides.push_back(in.view().strides[p]);
-  }
-
-  return out;
-}
-
-int64_t dimImpl(const InterpreterValue& in, int64_t index,
-                InterpreterState& state) {
-  if (index < 0 || index >= in.view().rank()) {
-    state.addFailure("dimension index out of bounds");
-    return 0;
-  }
-  return in.view().sizes[index];
-}
-
-llvm::SmallVector<InterpreterValue> noOpTerminator(
-    MutableArrayRef<InterpreterValue> args, mlir::Operation*,
-    InterpreterState&) {
-  return llvm::to_vector(args);
-}
-
-int64_t evalAffineExpr(AffineExpr expr, ArrayRef<int64_t> dims,
-                       ArrayRef<int64_t> symbols) {
-  int64_t lhs = 0, rhs = 0;
-  if (auto bin = expr.dyn_cast<AffineBinaryOpExpr>()) {
-    lhs = evalAffineExpr(bin.getLHS(), dims, symbols);
-    rhs = evalAffineExpr(bin.getRHS(), dims, symbols);
-  }
-  switch (expr.getKind()) {
-    case AffineExprKind::Add:
-      return lhs + rhs;
-    case AffineExprKind::Mul:
-      return lhs * rhs;
-    case AffineExprKind::Mod:
-      return mod(lhs, rhs);
-    case AffineExprKind::FloorDiv:
-      return floorDiv(lhs, rhs);
-    case AffineExprKind::CeilDiv:
-      return ceilDiv(lhs, rhs);
-    case AffineExprKind::Constant:
-      return expr.cast<AffineConstantExpr>().getValue();
-    case AffineExprKind::DimId:
-      return dims[expr.cast<AffineDimExpr>().getPosition()];
-    case AffineExprKind::SymbolId:
-      return symbols[expr.cast<AffineSymbolExpr>().getPosition()];
-  }
-}
-
-SmallVector<int64_t> evalAffineMap(AffineMap map, ArrayRef<int64_t> dims,
-                                   ArrayRef<int64_t> symbols) {
-  SmallVector<int64_t> result;
-  for (auto expr : map.getResults()) {
-    result.push_back(evalAffineExpr(expr, dims, symbols));
-  }
-  return result;
-}
-
-llvm::SmallVector<int64_t> evalAffineMap(AffineMap map,
-                                         ArrayRef<int64_t> operands) {
-  return evalAffineMap(map, operands.take_front(map.getNumDims()),
-                       operands.drop_front(map.getNumDims()));
-}
-
-}  // namespace interpreter
-}  // namespace mlir
diff --git a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/util.h b/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/util.h
deleted file mode 100644
index 4e48e9cd794f3a..00000000000000
--- a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/util.h
+++ /dev/null
@@ -1,81 +0,0 @@
-/* Copyright 2023 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef MLIR_HLO_TOOLS_MLIR_INTERPRETER_DIALECTS_UTIL_H_
-#define MLIR_HLO_TOOLS_MLIR_INTERPRETER_DIALECTS_UTIL_H_
-
-#include "llvm/ADT/SmallVector.h"
-#include "mlir/IR/AffineExpr.h"
-#include "mlir/IR/AffineMap.h"
-#include "mlir/Interfaces/ViewLikeInterface.h"
-#include "tools/mlir_interpreter/framework/interpreter.h"
-#include "tools/mlir_interpreter/framework/interpreter_value.h"
-
-namespace mlir {
-namespace interpreter {
-
-struct OffsetsSizesStrides {
-  llvm::SmallVector<int64_t> offsets;
-  llvm::SmallVector<int64_t> sizes;
-  llvm::SmallVector<int64_t> strides;
-};
-
-// Replaces dynamic placeholders in staticVals using elements from the front
-// of args, which are removed.
-SmallVector<int64_t> replaceDynamicVals(ArrayRef<int64_t> staticVals,
-                                        ArrayRef<InterpreterValue>& args);
-// Same as above, but the values are already unpacked. `dynamicVals.size` must
-// match the number of dynamic values in `staticVals`.
-SmallVector<int64_t> replaceDynamicVals(ArrayRef<int64_t> staticVals,
-                                        ArrayRef<int64_t> dynamicVals);
-
-OffsetsSizesStrides extractOffsetsSizesStrides(
-    ArrayRef<InterpreterValue> args, OffsetSizeAndStrideOpInterface op);
-OffsetsSizesStrides extractOffsetsSizesStrides(
-    ArrayRef<int64_t> dynamicOffsets, ArrayRef<int64_t> dynamicSizes,
-    ArrayRef<int64_t> dynamicStrides, OffsetSizeAndStrideOpInterface op);
-
-InterpreterValue reshapeTensor(const InterpreterValue& in,
-                               ArrayRef<int64_t> shape);
-
-// Gets the given operand, cloning its storage if it is a tensor.
-InterpreterValue getInitOperand(mlir::Operation* op, int64_t index,
-                                MutableArrayRef<InterpreterValue> args);
-InterpreterValue getInitOperand(mlir::ValueRange values, int64_t index,
-                                ArrayRef<InterpreterValue> args);
-
-// Common implementations for ops from different dialects but sharing the same
-// semantics.
-InterpreterValue transposeImpl(const InterpreterValue& in,
-                               ArrayRef<int64_t> permutation);
-int64_t dimImpl(const InterpreterValue& in, int64_t index,
-                InterpreterState& state);
-
-// Terminator that just returns its args.
-llvm::SmallVector<InterpreterValue> noOpTerminator(
-    MutableArrayRef<InterpreterValue> args, mlir::Operation*,
-    InterpreterState&);
-
-int64_t evalAffineExpr(AffineExpr expr, ArrayRef<int64_t> dims,
-                       ArrayRef<int64_t> symbols);
-llvm::SmallVector<int64_t> evalAffineMap(AffineMap map, ArrayRef<int64_t> dims,
-                                         ArrayRef<int64_t> symbols);
-llvm::SmallVector<int64_t> evalAffineMap(AffineMap map,
-                                         ArrayRef<int64_t> operands);
-
-}  // namespace interpreter
-}  // namespace mlir
-
-#endif  // MLIR_HLO_TOOLS_MLIR_INTERPRETER_DIALECTS_UTIL_H_
diff --git a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/vector.cc b/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/vector.cc
deleted file mode 100644
index 47cf0267bc8605..00000000000000
--- a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/vector.cc
+++ /dev/null
@@ -1,857 +0,0 @@
-/* Copyright 2023 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include <algorithm>
-#include <cstdint>
-#include <functional>
-#include <memory>
-#include <optional>
-#include <type_traits>
-#include <utility>
-
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "mlir/Dialect/Vector/IR/VectorOps.h"
-#include "mlir/IR/BuiltinAttributes.h"
-#include "mlir/IR/Operation.h"
-#include "tools/mlir_interpreter/dialects/comparators.h"
-#include "tools/mlir_interpreter/dialects/util.h"
-#include "tools/mlir_interpreter/framework/interpreter.h"
-#include "tools/mlir_interpreter/framework/interpreter_value.h"
-#include "tools/mlir_interpreter/framework/registration.h"
-
-namespace mlir {
-namespace interpreter {
-namespace {
-
-class MaskSideChannel : public InterpreterSideChannel {
- public:
-  MaskSideChannel(TensorOrMemref<bool> mask,
-                  std::optional<InterpreterValue> passthrough)
-      : mask(std::move(mask)), passthrough(std::move(passthrough)) {}
-
-  const TensorOrMemref<bool>& getMask() const { return mask; }
-  const std::optional<InterpreterValue>& getPassthrough() const {
-    return passthrough;
-  }
-
- private:
-  const TensorOrMemref<bool> mask;
-  const std::optional<InterpreterValue> passthrough;
-};
-
-template <typename T>
-using combiner_t = T (*)(T, T);
-
-template <typename T>
-combiner_t<T> getCombiner(vector::CombiningKind kind) {
-  if constexpr (std::is_arithmetic_v<T> && !std::is_same_v<T, bool>) {
-    switch (kind) {
-      case vector::CombiningKind::ADD:
-        return +[](T a, T b) -> T { return a + b; };
-      case vector::CombiningKind::MUL:
-        return +[](T a, T b) -> T { return a * b; };
-      case vector::CombiningKind::MINSI:
-      case vector::CombiningKind::MINF:
-        return +[](T a, T b) -> T { return std::min(a, b); };
-      case vector::CombiningKind::MAXSI:
-      case vector::CombiningKind::MAXF:
-        return +[](T a, T b) -> T { return std::max(a, b); };
-      default: {
-      }
-    }
-  }
-
-  if constexpr (std::is_integral_v<T> && !std::is_same_v<T, bool>) {
-    switch (kind) {
-      case vector::CombiningKind::MINUI:
-        return &Iumin::apply<T>;
-      case vector::CombiningKind::MAXUI:
-        return &Iumax::apply<T>;
-      default: {
-      }
-    }
-  }
-
-  if constexpr (std::is_integral_v<T>) {
-    switch (kind) {
-      case vector::CombiningKind::AND:
-        return +[](T a, T b) -> T { return a & b; };
-      case vector::CombiningKind::OR:
-        return +[](T a, T b) -> T { return a | b; };
-      case vector::CombiningKind::XOR:
-        return +[](T a, T b) -> T { return a ^ b; };
-      default: {
-      }
-    }
-  }
-
-  llvm_unreachable("unknown combining kind");
-}
-
-InterpreterValue getNeutralElement(vector::CombiningKind kind, mlir::Type ty) {
-  return dispatchScalarType(ty, [&](auto dummy) -> InterpreterValue {
-    using T = decltype(dummy);
-    switch (kind) {
-      case vector::CombiningKind::AND:
-      case vector::CombiningKind::MINUI:
-        if constexpr (std::is_same_v<T, bool>) {
-          return {true};
-        } else if constexpr (std::is_integral_v<T>) {
-          return {static_cast<T>(static_cast<std::make_unsigned_t<T>>(-1))};
-        }
-        break;
-      case vector::CombiningKind::ADD:
-      case vector::CombiningKind::MAXUI:
-      case vector::CombiningKind::OR:
-      case vector::CombiningKind::XOR:
-        return {T{0}};
-      case vector::CombiningKind::MUL:
-        return {T{1}};
-      case vector::CombiningKind::MINSI:
-        return {std::numeric_limits<T>::max()};
-      case vector::CombiningKind::MINF:
-        return {std::numeric_limits<T>::infinity()};
-      case vector::CombiningKind::MAXSI:
-        return {std::numeric_limits<T>::min()};
-      case vector::CombiningKind::MAXF:
-        return {-std::numeric_limits<T>::infinity()};
-      default: {
-      }
-    }
-    llvm_unreachable("invalid combining kind");
-  });
-}
-
-template <typename IntType>
-SmallVector<IntType> extractVector(ArrayAttr arrayAttr) {
-  return llvm::to_vector<4>(llvm::map_range(
-      arrayAttr.getAsRange<IntegerAttr>(),
-      [](IntegerAttr attr) { return static_cast<IntType>(attr.getInt()); }));
-}
-
-InterpreterValue bitcast(InterpreterState&, vector::BitCastOp op,
-                         const InterpreterValue& vector) {
-  ShapedType ty = cast<ShapedType>(op->getResultTypes()[0]);
-  auto flattened = vector.coerceLayout({});
-  auto buffer = flattened.buffer();
-  auto view = flattened.view();
-  view.sizes = llvm::to_vector(ty.getShape());
-  view.strides = BufferView::getDefaultStrides(view.sizes);
-  return dispatchScalarType(ty, [&](auto dummy) -> InterpreterValue {
-    // TODO(jreiffers): i1 semantics are currently broken (both here and
-    // upstream).
-    using T = decltype(dummy);
-    return {TensorOrMemref<T>{buffer, view}};
-  });
-}
-
-InterpreterValue broadcast(InterpreterState&, vector::BroadcastOp broadcast,
-                           const InterpreterValue& value) {
-  auto result =
-      value.isTensor() ? value : value.asUnitTensor(/*isVector=*/true);
-  auto& resultView = result.view();
-
-  // Insert additional leading stride 0 dims.
-  SmallVector<int64_t> strides(broadcast.getResultVectorType().getRank());
-  llvm::copy(llvm::reverse(resultView.strides), strides.rbegin());
-  // Zero out broadcast dimension strides.
-  for (int64_t i : broadcast.computeBroadcastedUnitDims()) strides[i] = 0;
-
-  resultView.strides = std::move(strides);
-  resultView.sizes =
-      llvm::to_vector(broadcast.getResultVectorType().getShape());
-  return result;
-}
-
-void compressStore(InterpreterState& state, vector::CompressStoreOp,
-                   InterpreterValue dst, ArrayRef<int64_t> indices,
-                   TensorOrMemref<bool> mask, InterpreterValue value) {
-  auto dstBuffer = dst.buffer();
-  const auto& dstView = dst.view();
-  if (dstView.strides.back() != 1) {
-    state.addFailure("trailing dimension must be continguous");
-    return;
-  }
-  auto offset = dstView.getPhysicalIndex(indices);
-  if (!offset) {
-    state.addFailure("index out of bounds");
-    return;
-  }
-
-  auto srcBuffer = value.buffer();
-  const auto& srcView = value.view();
-
-  // TODO(jreiffers): Bounds checks.
-  int64_t n = srcView.sizes[0];
-  int64_t elementSize = value.getByteSizeOfElement();
-  for (int64_t i = 0; i < n; ++i) {
-    if (mask.at(i)) {
-      memcpy(dstBuffer->at(offset, elementSize), srcBuffer->at(i, elementSize),
-             elementSize);
-      ++*offset;
-    }
-  }
-}
-
-InterpreterValue maskImpl(mlir::Operation* op, ArrayRef<int64_t> maskSizes) {
-  auto outSizes = op->getResultTypes()[0].cast<ShapedType>().getShape();
-  auto result = TensorOrMemref<bool>::empty(outSizes);
-  result.view.isVector = true;
-  BufferView iter;
-  iter.sizes = llvm::to_vector(maskSizes);
-  for (const auto& indices : iter.indices()) result.at(indices) = true;
-  return {result};
-}
-
-InterpreterValue constantMask(InterpreterState&, vector::ConstantMaskOp mask) {
-  return maskImpl(mask, extractVector<int64_t>(mask.getMaskDimSizes()));
-}
-
-// TODO(jreiffers): Support maksed contractions.
-InterpreterValue contract(InterpreterState&, vector::ContractionOp contraction,
-                          const InterpreterValue& lhs,
-                          const InterpreterValue& rhs,
-                          const InterpreterValue& acc) {
-  BufferView iter;
-  contraction.getIterationBounds(iter.sizes);
-  auto maps = contraction.getIndexingMapsArray();
-  auto resultTy = contraction->getResultTypes()[0];
-  auto shapedTy = resultTy.dyn_cast<ShapedType>();
-  auto result =
-      dispatchScalarType(resultTy, [&](auto dummy) -> InterpreterValue {
-        using T = decltype(dummy);
-        using TT = TensorOrMemref<T>;
-        const auto& lhsT = std::get<TT>(lhs.storage);
-        const auto& rhsT = std::get<TT>(rhs.storage);
-        auto resultValue = shapedTy ? acc.clone() : acc.asUnitTensor();
-        auto& result = std::get<TT>(resultValue.storage);
-        auto combiner = *getCombiner<T>(contraction.getKind());
-        for (const auto& indices : iter.indices()) {
-          auto lhsIndices = evalAffineMap(maps[0], indices);
-          auto rhsIndices = evalAffineMap(maps[1], indices);
-          auto resultIndices = evalAffineMap(maps[2], indices);
-
-          auto& resultItem = result.at(resultIndices);
-          resultItem =
-              combiner(resultItem, lhsT.at(lhsIndices) * rhsT.at(rhsIndices));
-        }
-        return resultValue;
-      });
-
-  return shapedTy ? result : result.extractElement({});
-}
-
-InterpreterValue createMask(InterpreterState&, vector::CreateMaskOp op,
-                            ArrayRef<int64_t> sizes) {
-  return maskImpl(op, sizes);
-}
-
-InterpreterValue expandLoad(InterpreterState& state, vector::ExpandLoadOp,
-                            InterpreterValue memref,
-                            SmallVector<int64_t> offsets,
-                            TensorOrMemref<bool> mask,
-                            const InterpreterValue& passThrough) {
-  if (memref.view().strides.back() != 1) {
-    state.addFailure("expected last dimension to be contiguous");
-    return {};
-  }
-
-  auto out = passThrough.clone();
-  for (int64_t i = 0, e = out.view().sizes[0]; i < e; ++i) {
-    if (mask.at({i})) {
-      out.insertElement({i}, memref.extractElement(offsets));
-      ++offsets.back();
-    }
-  }
-  return out;
-}
-
-InterpreterValue extract(InterpreterState& state, vector::ExtractOp extract,
-                         const InterpreterValue& vector) {
-  auto result = vector;
-  auto& resultView = result.view();
-  for (int64_t offset : extract.getStaticPosition()) {
-    state.checkSuccess(resultView.slice(0, offset), "index out of bounds");
-  }
-  return resultView.rank() == 0 ? result.extractElement({}) : result;
-}
-
-InterpreterValue extractElement(InterpreterState& state,
-                                vector::ExtractElementOp,
-                                const InterpreterValue& vector,
-                                std::optional<int64_t> index) {
-  if (!index) {
-    return vector.extractElement({});
-  }
-  if (!vector.view().inBounds(*index)) {
-    state.addFailure("array index out of bounds");
-    return {};
-  }
-  return vector.extractElement(*index);
-}
-
-InterpreterValue extractSlice(InterpreterState& state,
-                              vector::ExtractStridedSliceOp extract,
-                              const InterpreterValue& vector) {
-  auto out = vector;
-  state.checkSuccess(
-      out.view().subview(extractVector<int64_t>(extract.getOffsets()),
-                         extractVector<int64_t>(extract.getSizes()),
-                         extractVector<int64_t>(extract.getStrides())),
-      "subview out of bounds");
-  return out;
-}
-
-InterpreterValue flatTranspose(InterpreterState&,
-                               vector::FlatTransposeOp transpose,
-                               const InterpreterValue& vector) {
-  auto out = vector.clone();
-  // We currently only implement -matrix-default-layout=column-major.
-  int64_t rows = transpose.getRows();
-  int64_t cols = transpose.getColumns();
-  for (int64_t i = 0; i < rows * cols; ++i) {
-    int64_t srcIndex = (i % cols) * rows + (i / cols);
-    out.insertElement({i}, vector.extractElement({srcIndex}));
-  }
-  return out;
-}
-
-InterpreterValue fusedMultiplyAdd(InterpreterState&, vector::FMAOp op,
-                                  const InterpreterValue& lhs,
-                                  const InterpreterValue& rhs,
-                                  const InterpreterValue& acc) {
-  auto out = acc.clone();
-  dispatchScalarType(op->getResultTypes()[0], [&](auto dummy) {
-    using TT = TensorOrMemref<decltype(dummy)>;
-    const auto& lhsT = std::get<TT>(lhs.storage);
-    const auto& rhsT = std::get<TT>(rhs.storage);
-    auto& result = std::get<TT>(out.storage);
-
-    for (const auto& indices : lhsT.view.indices()) {
-      result.at(indices) += lhsT.at(indices) * rhsT.at(indices);
-    }
-  });
-  return out;
-}
-
-InterpreterValue gather(InterpreterState& state, vector::GatherOp op,
-                        const InterpreterValue& src, ArrayRef<int64_t> offsets,
-                        const InterpreterValue& indices,
-                        const TensorOrMemref<bool>& mask,
-                        const InterpreterValue& passThrough) {
-  if (op->getOperandTypes()[0].isa<MemRefType>() &&
-      src.view().strides.back() != 1) {
-    state.addFailure("expected trailing dimension to be contiguous");
-    return {};
-  }
-  auto out = passThrough.clone();
-  for (const auto& outIndex : out.view().indices()) {
-    if (!mask.at(outIndex)) continue;
-    auto inIndex = llvm::to_vector(offsets);
-    inIndex[0] += indices.extractElement(outIndex).asInt();
-    out.insertElement(outIndex, src.extractElement(inIndex));
-  }
-  return out;
-}
-
-InterpreterValue insert(InterpreterState& state, vector::InsertOp insert,
-                        const InterpreterValue& src,
-                        const InterpreterValue& dst) {
-  auto result = dst.clone();
-  auto resultSlice = result;
-  auto& resultSliceView = resultSlice.view();
-  for (int64_t offset : insert.getStaticPosition()) {
-    state.checkSuccess(resultSliceView.slice(0, offset), "index out of bounds");
-  }
-  resultSlice.fill([&](auto indices) { return src.extractElement(indices); });
-  return result;
-}
-
-InterpreterValue insertElement(InterpreterState& state, vector::InsertElementOp,
-                               const InterpreterValue& value,
-                               const InterpreterValue& vector,
-                               std::optional<int64_t> index) {
-  auto result = vector.clone();
-  if (!index) {
-    result.insertElement({}, value);
-    return result;
-  }
-  if (!result.view().inBounds(*index)) {
-    state.addFailure("array index out of bounds");
-    return {};
-  }
-  result.insertElement(*index, value);
-  return result;
-}
-
-InterpreterValue insertSlice(InterpreterState& state,
-                             vector::InsertStridedSliceOp insert,
-                             InterpreterValue src,
-                             const InterpreterValue& dst) {
-  auto out = dst.clone();
-  auto outSlice = out;
-  if (!outSlice.view()
-           .subview(extractVector<int64_t>(insert.getOffsets()),
-                    insert.getSourceVectorType().getShape(),
-                    extractVector<int64_t>(insert.getStrides()))
-           .succeeded()) {
-    state.addFailure("subview out of bounds");
-  }
-  for (const auto& index : src.view().indices()) {
-    outSlice.insertElement(index, src.extractElement(index));
-  }
-  return out;
-}
-
-InterpreterValue load(InterpreterState& state, vector::LoadOp load,
-                      const InterpreterValue& memref,
-                      ArrayRef<int64_t> indices) {
-  if (memref.view().numVectorDims > 0) {
-    return {memref.extractElement(indices)};
-  }
-  auto out = memref;
-  if (!out.view()
-           .subview(indices, load.getVectorType().getShape(),
-                    SmallVector<int64_t>(load.getVectorType().getRank(), 1))
-           .succeeded()) {
-    // "Not all targets may support out-of-bounds vector loads"
-    state.addFailure("out of bounds loads not supported");
-    return {};
-  }
-  out = out.clone();
-  out.view().isVector = true;
-  return out;
-}
-
-llvm::SmallVector<InterpreterValue> mask(
-    InterpreterState& state, vector::MaskOp op, TensorOrMemref<bool> mask,
-    std::optional<InterpreterValue> passthrough) {
-  InterpreterScope scope(state);
-  scope.setSideChannel(std::make_shared<MaskSideChannel>(mask, passthrough));
-  return interpret(state, op.getMaskRegion(), {});
-}
-
-InterpreterValue maskedLoad(InterpreterState& state, vector::MaskedLoadOp,
-                            const InterpreterValue& memref,
-                            SmallVector<int64_t> offsets,
-                            TensorOrMemref<bool> mask,
-                            const InterpreterValue& passThrough) {
-  if (memref.view().strides.back() != 1) {
-    state.addFailure("expected last dimension to be contiguous");
-    return {};
-  }
-
-  auto out = passThrough.clone();
-  for (int64_t i = 0, e = mask.view.sizes[0]; i < e; ++i) {
-    if (mask.at(i)) {
-      out.insertElement(i, memref.extractElement(offsets));
-    }
-    ++offsets.back();
-  }
-  return out;
-}
-
-void maskedStore(InterpreterState& state, vector::MaskedStoreOp,
-                 InterpreterValue memref, SmallVector<int64_t> offsets,
-                 TensorOrMemref<bool> mask, const InterpreterValue& vector) {
-  if (memref.view().strides.back() != 1) {
-    state.addFailure("expected last dimension to be contiguous");
-    return;
-  }
-
-  for (int64_t i = 0, e = mask.view.sizes[0]; i < e; ++i) {
-    if (mask.at({i})) {
-      memref.insertElement(offsets, vector.extractElement({i}));
-    }
-    ++offsets.back();
-  }
-}
-
-InterpreterValue reductionImpl(InterpreterState& state,
-                               const InterpreterValue& v,
-                               const InterpreterValue* acc,
-                               vector::CombiningKind kind,
-                               SmallVector<int64_t> dims, Type elementType) {
-  llvm::sort(dims);
-  SmallVector<int64_t> keptDims;
-  SmallVector<int64_t> resultShape;
-  for (auto [dim, size] : llvm::enumerate(v.view().sizes)) {
-    if (!llvm::is_contained(dims, dim)) {
-      keptDims.push_back(dim);
-      resultShape.push_back(size);
-    }
-  }
-  return dispatchScalarType(elementType, [&](auto dummy) -> InterpreterValue {
-    using T = decltype(dummy);
-    using TT = TensorOrMemref<T>;
-
-    auto combiner = *getCombiner<T>(kind);
-
-    TT result = acc ? std::get<TT>((keptDims.empty() ? acc->asUnitTensor()
-                                                     : acc->clone())
-                                       .storage)
-                    : TensorOrMemref<T>::empty(resultShape);
-
-    for (const auto& resultIndex : result.view.indices()) {
-      auto src = std::get<TT>(v.storage);
-      for (auto [dim, index] :
-           llvm::reverse(llvm::zip(keptDims, resultIndex))) {
-        state.checkSuccess(src.view.slice(dim, index), "index out of bounds");
-      }
-
-      T& item = result.at(resultIndex);
-      bool first = acc == nullptr;
-      for (const auto& srcIndex : src.view.indices()) {
-        if (first) {
-          item = src.at(srcIndex);
-          first = false;
-        } else {
-          item = combiner(item, src.at(srcIndex));
-        }
-      }
-    }
-
-    if (keptDims.empty()) {
-      return {result.at({})};
-    }
-    return {result};
-  });
-}
-
-InterpreterValue multiReduction(InterpreterState& state,
-                                vector::MultiDimReductionOp reduction,
-                                const InterpreterValue& source,
-                                const InterpreterValue& acc) {
-  auto elementTy = getElementTypeOrSelf(reduction->getResultTypes()[0]);
-  return {reductionImpl(state, source, &acc, reduction.getKind(),
-                        extractVector<int64_t>(reduction.getReductionDims()),
-                        elementTy)};
-}
-
-InterpreterValue outerProduct(InterpreterState&,
-                              vector::OuterProductOp outerproduct,
-                              const InterpreterValue& lhs,
-                              const InterpreterValue& rhs,
-                              std::optional<InterpreterValue> acc) {
-  ShapedType ty = cast<ShapedType>(outerproduct->getResultTypes()[0]);
-  return dispatchScalarType(ty, [&](auto dummy) -> InterpreterValue {
-    using T = decltype(dummy);
-    using TT = TensorOrMemref<T>;
-    const TT& lhsT = std::get<TT>(lhs.storage);
-
-    auto combiner = *getCombiner<T>(outerproduct.getKind());
-    auto result =
-        acc ? std::get<TT>(acc->storage).clone() : TT::empty(ty.getShape());
-    if (std::holds_alternative<T>(rhs.storage)) {
-      T rhsS = std::get<T>(rhs.storage);
-      for (int64_t i : llvm::seq(int64_t{0}, lhsT.view.sizes[0])) {
-        result.at(i) = combiner(result.at(i), lhsT.at(i) * rhsS);
-      }
-    } else {
-      const TT& rhsT = std::get<TT>(rhs.storage);
-      for (int64_t i : llvm::seq(int64_t{0}, lhsT.view.sizes[0])) {
-        for (int64_t j : llvm::seq(int64_t{0}, rhsT.view.sizes[0])) {
-          result.at({i, j}) =
-              combiner(result.at({i, j}), lhsT.at(i) * rhsT.at(j));
-        }
-      }
-    }
-    return {result};
-  });
-}
-
-InterpreterValue reduction(InterpreterState& state,
-                           vector::ReductionOp reduction, InterpreterValue arg,
-                           std::optional<InterpreterValue> acc) {
-  auto* mask =
-      state.getTopScope()->getSideChannel<MaskSideChannel>(/*optional=*/true);
-  auto ty = reduction->getResultTypes()[0];
-  if (mask) {
-    if (mask->getPassthrough()) {
-      state.addFailure("passthrough should not be set with masked reduction");
-      return {};
-    }
-    arg = arg.clone();
-    if (mask->getMask().view.sizes != arg.view().sizes) {
-      state.addFailure("mask shape should match argument shape");
-      return {};
-    }
-    auto neutral = getNeutralElement(reduction.getKind(), ty);
-    for (const auto& idx : arg.view().indices()) {
-      if (!mask->getMask().at(idx)) {
-        arg.insertElement(idx, neutral);
-      }
-    }
-  }
-
-  return reductionImpl(state, arg, acc ? &acc.value() : nullptr,
-                       reduction.getKind(), {0}, ty);
-}
-
-InterpreterValue shapeCast(InterpreterState&, vector::ShapeCastOp op,
-                           const InterpreterValue& in) {
-  auto out = in.coerceLayout({});
-  auto& outView = out.view();
-  outView.sizes =
-      llvm::to_vector(op->getResultTypes()[0].cast<ShapedType>().getShape());
-  outView.strides = BufferView::getDefaultStrides(outView.sizes);
-  return out;
-}
-
-InterpreterValue shuffle(InterpreterState& state, vector::ShuffleOp shuffle,
-                         const InterpreterValue& v0,
-                         const InterpreterValue& v1) {
-  auto result = v0.typedAlike(shuffle.getResultVectorType().getShape());
-  auto& resultView = result.view();
-  resultView.isVector = true;
-
-  auto mask = extractVector<int64_t>(shuffle.getMask());
-  bool isZeroDim = v0.view().rank() == 0;
-  int64_t size0 = isZeroDim ? 1 : v0.view().sizes[0];
-  for (auto [dstIndex, srcIndex] : llvm::enumerate(mask)) {
-    auto src = srcIndex < size0 ? v0 : v1;
-    if (!isZeroDim) {
-      state.checkSuccess(
-          src.view().slice(0, srcIndex < size0 ? srcIndex : srcIndex - size0),
-          "index out of bounds");
-    }
-    auto dst = result;
-    state.checkSuccess(dst.view().slice(0, dstIndex), "index out of bounds");
-    dst.fill([&](auto indices) { return src.extractElement(indices); });
-  }
-  return result;
-}
-
-InterpreterValue splat(InterpreterState&, vector::SplatOp op,
-                       const InterpreterValue& in) {
-  auto out = in.asUnitTensor(/*isVector=*/true);
-  auto& view = out.view();
-  view.sizes =
-      llvm::to_vector(op->getResultTypes()[0].cast<ShapedType>().getShape());
-  view.strides = SmallVector<int64_t>(view.sizes.size(), 0);
-  return out;
-}
-
-void store(InterpreterState&, vector::StoreOp, const InterpreterValue& src,
-           InterpreterValue dst, ArrayRef<int64_t> offsets) {
-  const auto& outView = dst.view();
-  if (outView.numVectorDims > 0) {
-    dst.insertElement(offsets, src);
-  } else {
-    for (const auto& srcIndex : src.view().indices()) {
-      auto dstIndex = srcIndex;
-      for (int64_t i = 0; i < dstIndex.size(); ++i) {
-        dstIndex[i] += offsets[i];
-      }
-      if (outView.inBounds(dstIndex)) {
-        dst.insertElement(dstIndex, src.extractElement(srcIndex));
-      }
-    }
-  }
-}
-
-std::optional<InterpreterValue> extractMemorySlice(
-    InterpreterState& state, const AffineMap& map,
-    const InterpreterValue& memory, const InterpreterValue& vector,
-    ArrayRef<int64_t> offsets, std::optional<ArrayAttr> inBoundsAttr) {
-  llvm::SmallVector<bool> inBounds(offsets.size());
-  if (inBoundsAttr) {
-    llvm::copy(inBoundsAttr->getAsValueRange<BoolAttr>(),
-               inBounds.end() - inBoundsAttr->size());
-  }
-
-  auto memSlice = memory;
-  auto& memSliceView = memSlice.view();
-  auto& vectorView = vector.view();
-  for (int64_t i = 0; i < memSliceView.rank(); ++i) {
-    bool found = false;
-    for (int64_t j = 0; !found && j < vectorView.rank(); ++j) {
-      if (map.getResult(j).isFunctionOfDim(i)) {
-        int64_t size = memSliceView.sizes[i] - offsets[i];
-        bool isInBounds = size >= vectorView.sizes[j];
-        if (!isInBounds && inBounds[i]) {
-          state.addFailure("index out of bounds");
-          return std::nullopt;
-        }
-        (void)memSliceView.slice(
-            i, offsets[i],
-            std::max(int64_t{0}, std::min(vectorView.sizes[j], size)));
-        found = true;
-      }
-    }
-    if (!found) {
-      bool isInBounds = memSliceView.sizes[i] > offsets[i];
-      if (!isInBounds) {
-        state.addFailure("index out of bounds");
-        return std::nullopt;
-      }
-
-      (void)memSliceView.slice(i, offsets[i], isInBounds ? 1 : 0);
-    }
-  }
-  return memSlice;
-}
-
-InterpreterValue transferRead(InterpreterState& state,
-                              vector::TransferReadOp transfer,
-                              const InterpreterValue& src,
-                              ArrayRef<int64_t> offsets,
-                              const InterpreterValue& padding,
-                              std::optional<TensorOrMemref<bool>> mask) {
-  auto* maskChannel = state.getTopScope()->getSideChannel<MaskSideChannel>(
-      /*optional=*/true);
-  if (maskChannel) {
-    if (mask) {
-      state.addFailure(
-          "vector.mask and transfer_read with mask should not be used "
-          "simultaneously");
-      return {};
-    }
-    mask = maskChannel->getMask();
-  }
-
-  InterpreterValue dst = src.typedAlike(transfer.getVectorType().getShape());
-  if (maskChannel && maskChannel->getPassthrough()) {
-    dst.fill([&](auto indices) {
-      if (mask->at(indices)) {
-        return padding;
-      }
-      return maskChannel->getPassthrough()->extractElement(indices);
-    });
-  } else {
-    dst.fill([&](auto) { return padding; });
-  }
-  dst.view().isVector = true;
-
-  auto srcSlice = extractMemorySlice(state, transfer.getPermutationMap(), src,
-                                     dst, offsets, transfer.getInBounds());
-
-  if (!srcSlice) {
-    return {};
-  }
-  for (const auto& srcIndices : srcSlice->view().indices()) {
-    SmallVector<int64_t> dstIndices =
-        evalAffineMap(transfer.getPermutationMap(), srcIndices);
-
-    // Note: the handling of padding and passthrough values is somewhat
-    // arbitrary here. At the time of writing this, there seems to be little
-    // evidence of actual usage of this feature.
-    if (!mask || mask->at(dstIndices)) {
-      dst.insertElement(dstIndices, srcSlice->extractElement(srcIndices));
-    }
-  }
-
-  return dst;
-}
-
-llvm::SmallVector<InterpreterValue> transferWrite(
-    InterpreterState& state, vector::TransferWriteOp transfer,
-    InterpreterValue src, InterpreterValue dst, ArrayRef<int64_t> offsets,
-    std::optional<TensorOrMemref<bool>> mask) {
-  if (auto* maskChannel = state.getTopScope()->getSideChannel<MaskSideChannel>(
-          /*optional=*/true)) {
-    if (mask) {
-      state.addFailure(
-          "vector.mask and transfer_write with mask should not be used "
-          "simultaneously");
-      return {};
-    }
-    if (maskChannel->getPassthrough()) {
-      state.addFailure(
-          "vector.mask with passthrough should not be used with "
-          "transfer_write");
-      return {};
-    }
-    mask = maskChannel->getMask();
-  }
-
-  const auto& srcView = src.view();
-  int64_t srcRank = srcView.rank();
-  (void)srcRank;
-  assert(transfer.getPermutationMap().getNumResults() == srcRank &&
-         "expected matching number of results");
-
-  dst = transfer.getSource().getType().isa<TensorType>() ? dst.clone() : dst;
-  auto dstSlice = extractMemorySlice(state, transfer.getPermutationMap(), dst,
-                                     src, offsets, transfer.getInBounds());
-  if (!dstSlice) {
-    return {};
-  }
-
-  for (const auto& dstIndices : dstSlice->view().indices()) {
-    SmallVector<int64_t> srcIndices =
-        evalAffineMap(transfer.getPermutationMap(), dstIndices);
-    if (srcView.inBounds(srcIndices) && (!mask || mask->at(srcIndices))) {
-      dstSlice->insertElement(dstIndices, src.extractElement(srcIndices));
-    }
-  }
-
-  if (transfer->getNumResults() == 0) return {};
-  return {dst};
-}
-
-InterpreterValue transpose(InterpreterState&, vector::TransposeOp transpose,
-                           const InterpreterValue& vector) {
-  auto permutation = extractVector<int64_t>(transpose.getTransp());
-  return transposeImpl(vector, permutation);
-}
-
-InterpreterValue typeCast(InterpreterState&, vector::TypeCastOp,
-                          InterpreterValue vector) {
-  vector.view().numVectorDims = vector.view().rank();
-  return vector;
-}
-
-uint64_t vScale(InterpreterState&, vector::VectorScaleOp) { return 1; }
-
-REGISTER_MLIR_INTERPRETER_OP("vector.yield", noOpTerminator);
-REGISTER_MLIR_INTERPRETER_OP(bitcast);
-REGISTER_MLIR_INTERPRETER_OP(broadcast);
-REGISTER_MLIR_INTERPRETER_OP(compressStore);
-REGISTER_MLIR_INTERPRETER_OP(constantMask);
-REGISTER_MLIR_INTERPRETER_OP(contract);
-REGISTER_MLIR_INTERPRETER_OP(createMask);
-REGISTER_MLIR_INTERPRETER_OP(expandLoad);
-REGISTER_MLIR_INTERPRETER_OP(extract);
-REGISTER_MLIR_INTERPRETER_OP(extractElement);
-REGISTER_MLIR_INTERPRETER_OP(extractSlice);
-REGISTER_MLIR_INTERPRETER_OP(fusedMultiplyAdd);
-REGISTER_MLIR_INTERPRETER_OP(flatTranspose);
-REGISTER_MLIR_INTERPRETER_OP(gather);
-REGISTER_MLIR_INTERPRETER_OP(insert);
-REGISTER_MLIR_INTERPRETER_OP(insertElement);
-REGISTER_MLIR_INTERPRETER_OP(insertSlice);
-REGISTER_MLIR_INTERPRETER_OP(load);
-REGISTER_MLIR_INTERPRETER_OP(mask);
-REGISTER_MLIR_INTERPRETER_OP(maskedLoad);
-REGISTER_MLIR_INTERPRETER_OP(maskedStore);
-REGISTER_MLIR_INTERPRETER_OP(multiReduction);
-REGISTER_MLIR_INTERPRETER_OP(outerProduct);
-REGISTER_MLIR_INTERPRETER_OP(reduction);
-REGISTER_MLIR_INTERPRETER_OP(shapeCast);
-REGISTER_MLIR_INTERPRETER_OP(shuffle);
-REGISTER_MLIR_INTERPRETER_OP(splat);
-REGISTER_MLIR_INTERPRETER_OP(store);
-REGISTER_MLIR_INTERPRETER_OP(transferRead);
-REGISTER_MLIR_INTERPRETER_OP(transferWrite);
-REGISTER_MLIR_INTERPRETER_OP(transpose);
-REGISTER_MLIR_INTERPRETER_OP(typeCast);
-REGISTER_MLIR_INTERPRETER_OP(vScale);
-
-}  // namespace
-}  // namespace interpreter
-}  // namespace mlir
diff --git a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/xla.cc b/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/xla.cc
deleted file mode 100644
index e89395edd33540..00000000000000
--- a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/dialects/xla.cc
+++ /dev/null
@@ -1,34 +0,0 @@
-/* Copyright 2023 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "tools/mlir_interpreter/framework/registration.h"
-
-namespace mlir {
-namespace interpreter {
-namespace {
-
-llvm::SmallVector<InterpreterValue> bufferToMem(
-    MutableArrayRef<InterpreterValue> args, mlir::Operation*,
-    InterpreterState&) {
-  return {args[0]};
-}
-
-REGISTER_MLIR_INTERPRETER_OP("xla_cpu.memref_element_cast",
-                             "builtin.unrealized_conversion_cast");
-REGISTER_MLIR_INTERPRETER_OP("xla_framework.buffer_to_mem", bufferToMem);
-
-}  // namespace
-}  // namespace interpreter
-}  // namespace mlir
diff --git a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/framework/interpreter.cc b/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/framework/interpreter.cc
deleted file mode 100644
index 81ed545f83fe57..00000000000000
--- a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/framework/interpreter.cc
+++ /dev/null
@@ -1,142 +0,0 @@
-/* Copyright 2022 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "tools/mlir_interpreter/framework/interpreter.h"
-
-#include <functional>
-#include <memory>
-#include <optional>
-#include <string>
-#include <utility>
-#include <variant>
-
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/StringRef.h"
-#include "mlir/Support/LLVM.h"
-#include "tools/mlir_interpreter/framework/registration.h"
-
-namespace mlir {
-namespace interpreter {
-
-SmallVector<InterpreterValue> interpret(InterpreterState& state,
-                                        Operation& op) {
-  auto fn = detail::getFunction(op.getName().getStringRef());
-  if (!fn) {
-    llvm::errs() << "Unsupported op: " << op.getName().getStringRef() << "\n";
-    op.dump();
-    state.addFailure("unsupported op");
-    return {};
-  }
-  SmallVector<InterpreterValue> operands;
-  for (auto operand : op.getOperands()) {
-    operands.push_back(state.getTopScope()->Get(operand));
-  }
-  state.getOptions().listener->beforeOp(operands, &op);
-  auto results = fn(operands, &op, state);
-  for (auto* scope = state.getTopScope(); scope != nullptr;
-       scope = scope->getParentScope()) {
-    scope->verify();
-  }
-  if (state.hasFailure()) {
-    llvm::errs() << "Encountered failure while executing " << op << "\n";
-  }
-  state.getOptions().listener->afterOp(results);
-  state.step();
-  return results;
-}
-
-SmallVector<InterpreterValue> interpret(InterpreterState& state, Region& region,
-                                        ArrayRef<InterpreterValue> bbargs) {
-  if (state.hasFailure()) return {};
-  assert(region.hasOneBlock() && "expected region to have one block");
-  state.getOptions().listener->enterRegion(bbargs, region);
-  InterpreterScope scope(state);
-
-  auto& block = region.getBlocks().front();
-  for (auto [value, interpreter_value] :
-       llvm::zip(block.getArguments(), bbargs)) {
-    scope.Set(value, interpreter_value);
-  }
-
-  std::optional<SmallVector<InterpreterValue>> blockResults;
-  for (mlir::Operation& op : block) {
-    auto results = interpret(state, op);
-    if (state.hasFailure()) return {};
-    if (op.hasTrait<OpTrait::IsTerminator>()) {
-      assert(!blockResults.has_value() && "Expected at most one terminator");
-      blockResults = results;
-    } else {
-      if (results.size() != op.getNumResults()) {
-        llvm::errs() << "Unexpected number of results while interpreting "
-                     << op.getName().getStringRef() << ". Interpreter bug?\n";
-        llvm_unreachable("unexpected number of results");
-      }
-      for (auto [v, iv] : llvm::zip(op.getResults(), results)) {
-        scope.Set(v, iv);
-      }
-    }
-  }
-  if (!blockResults) {
-    blockResults = SmallVector<InterpreterValue>{};
-  }
-  state.getOptions().listener->leaveRegion(*blockResults);
-  return *std::move(blockResults);
-}
-
-InterpreterState::InterpreterState(const mlir::SymbolTable& symbols,
-                                   InterpreterOptions options)
-    : symbols(symbols), options(options) {
-  if (!options.listener) {
-    static auto& noOpListener = *new InterpreterListener();
-    this->options.listener = &noOpListener;
-  }
-  if (options.maxSteps) {
-    remainingSteps = *options.maxSteps;
-  }
-}
-
-void InterpreterState::addFailure(llvm::StringRef failure) {
-  failed = true;
-  options.errorHandler(failure);
-}
-
-void InterpreterScope::verify() const {
-  for (auto& [_, value] : values) {
-    if (value.isTensor() && value.buffer() &&
-        !value.buffer()->getFailure().empty()) {
-      state.addFailure(value.buffer()->getFailure());
-      break;
-    }
-  }
-}
-
-InterpreterScope::~InterpreterScope() {
-  verify();
-  state.topScope = parentScope;
-}
-
-mlir::FailureOr<SmallVector<InterpreterValue>> runInterpreter(
-    const mlir::SymbolTable& symbols, mlir::func::FuncOp function,
-    ArrayRef<InterpreterValue> args, InterpreterOptions options) {
-  InterpreterState state{symbols, std::move(options)};
-  auto results = interpret(state, function.getBody(), args);
-  if (state.hasFailure()) {
-    return failure();
-  }
-  return results;
-}
-
-}  // namespace interpreter
-}  // namespace mlir
diff --git a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/framework/interpreter.h b/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/framework/interpreter.h
deleted file mode 100644
index 5fe73bf1241b39..00000000000000
--- a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/framework/interpreter.h
+++ /dev/null
@@ -1,195 +0,0 @@
-/* Copyright 2022 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef MLIR_HLO_TOOLS_MLIR_INTERPRETER_FRAMEWORK_INTERPRETER_H_
-#define MLIR_HLO_TOOLS_MLIR_INTERPRETER_FRAMEWORK_INTERPRETER_H_
-
-#include <algorithm>
-#include <complex>
-#include <cstddef>
-#include <functional>
-#include <iterator>
-#include <limits>
-#include <memory>
-#include <optional>
-#include <ostream>
-#include <string>
-#include <type_traits>
-#include <utility>
-#include <variant>
-
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/StringRef.h"
-#include "llvm/Support/Casting.h"
-#include "mlir/Dialect/Func/IR/FuncOps.h"
-#include "mlir/Support/LLVM.h"
-#include "tools/mlir_interpreter/framework/interpreter_value.h"
-
-namespace mlir {
-namespace interpreter {
-
-class InterpreterScope;
-
-class InterpreterListener {
- public:
-  virtual ~InterpreterListener() = default;
-  virtual void beforeOp(ArrayRef<InterpreterValue> args, mlir::Operation*) {}
-  virtual void afterOp(ArrayRef<InterpreterValue> results) {}
-  virtual void enterRegion(ArrayRef<InterpreterValue> args,
-                           mlir::Region& region) {}
-  virtual void leaveRegion(ArrayRef<InterpreterValue> terminatorArgs) {}
-};
-
-struct InterpreterStats {
-  // Memrefs only.
-  int64_t heapSize = 0;
-  int64_t peakHeapSize = 0;
-  int64_t numAllocations = 0;
-  int64_t numDeallocations = 0;
-};
-
-struct InterpreterOptions {
-  InterpreterListener* listener = nullptr;
-  std::optional<int64_t> maxSteps = std::nullopt;
-  // If set, ignore deallocations. Normally, accessing a deallocated memref will
-  // trigger an assertion. This flag disables all alloctions, which can be
-  // useful when debugging IR that includes a use-after-free bug.
-  bool disableDeallocations = false;
-  std::function<void(llvm::StringRef)> errorHandler =
-      [](llvm::StringRef failure) {
-        llvm::errs() << "Interpreter failure: " << failure << "\n";
-      };
-  InterpreterStats* stats = nullptr;
-};
-
-class InterpreterState {
- public:
-  InterpreterState(const mlir::SymbolTable& symbols,
-                   InterpreterOptions options);
-
-  void step() {
-    if (remainingSteps == 0) {
-      addFailure("maximum number of steps exceeded");
-      return;
-    }
-    --remainingSteps;
-  }
-  void addFailure(llvm::StringRef failure);
-  bool hasFailure() const { return failed; }
-  void checkSuccess(LogicalResult result, llvm::StringRef failure) {
-    if (!result.succeeded()) {
-      addFailure(failure);
-    }
-  }
-
-  InterpreterScope* getTopScope() { return topScope; }
-  const mlir::SymbolTable& getSymbols() const { return symbols; }
-  const InterpreterOptions& getOptions() { return options; }
-
- private:
-  const mlir::SymbolTable& symbols;
-  InterpreterScope* topScope = nullptr;
-  bool failed = false;
-  InterpreterOptions options;
-  int64_t remainingSteps = std::numeric_limits<int64_t>::max();
-
-  friend class InterpreterScope;
-  friend class InterpreterScopeStash;
-};
-
-// Used for passing arbitrary data to ops in sub-regions.
-class InterpreterSideChannel {
- public:
-  virtual ~InterpreterSideChannel() = default;
-};
-
-// Holds a mapping from SSA values to InterpreterValues and registered side
-// channels. There's typically one scope per region, but ops can add additional
-// scopes if needed (for example, to register a side channel).
-class InterpreterScope {
- public:
-  InterpreterScope(InterpreterScope&&) = delete;
-  explicit InterpreterScope(InterpreterState& state)
-      : state(state), parentScope(state.topScope) {
-    state.topScope = this;
-  }
-  ~InterpreterScope();
-
-  void Set(Value v, InterpreterValue iv) { values[v] = std::move(iv); }
-
-  const InterpreterValue& Get(Value v) {
-    auto ret = values.find(v);
-    if (ret == values.end()) {
-      if (!parentScope) {
-        v.dump();
-      }
-
-      assert(parentScope && "value not found");
-      return parentScope->Get(v);
-    }
-    return ret->second;
-  }
-
-  void verify() const;
-
-  // Retrieves the side channel of the given type in this scope or one of its
-  // ancestor scopes. If `optional` is set, returns nullptr if not found,
-  // otherwise asserts.
-  template <typename T>
-  T* getSideChannel(bool optional = false) {
-    for (auto& sideChannel : sideChannels) {
-      if (auto it = dynamic_cast<T*>(sideChannel.get())) {
-        return it;
-      }
-    }
-    if (!parentScope && optional) return nullptr;
-    assert(parentScope && "side channel not found");
-    return parentScope->getSideChannel<T>(optional);
-  }
-
-  // Registers the given side channel. Will shadow a side channel of the same
-  // type if registered in an outer scope.
-  // The behavior of registering two side channels of the same type in the same
-  // scope is undefined.
-  void setSideChannel(std::shared_ptr<InterpreterSideChannel> sideChannel) {
-    sideChannels.push_back(std::move(sideChannel));
-  }
-
-  InterpreterScope* getParentScope() const { return parentScope; }
-
- private:
-  DenseMap<Value, InterpreterValue> values;
-  SmallVector<std::shared_ptr<InterpreterSideChannel>> sideChannels;
-
-  InterpreterState& state;
-  InterpreterScope* parentScope;
-
-  friend class InterpreterScopeStash;
-};
-
-// Interprets the given region and returns the terminator's arguments. The
-// region must have a single block.
-SmallVector<InterpreterValue> interpret(InterpreterState& state, Region& region,
-                                        ArrayRef<InterpreterValue> bbargs);
-
-// Interprets the given function.
-mlir::FailureOr<SmallVector<InterpreterValue>> runInterpreter(
-    const mlir::SymbolTable& symbols, mlir::func::FuncOp function,
-    ArrayRef<InterpreterValue> args, InterpreterOptions options = {});
-
-}  // namespace interpreter
-}  // namespace mlir
-
-#endif  // MLIR_HLO_TOOLS_MLIR_INTERPRETER_FRAMEWORK_INTERPRETER_H_
diff --git a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/framework/interpreter_value.cc b/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/framework/interpreter_value.cc
deleted file mode 100644
index ee996f9100ce44..00000000000000
--- a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/framework/interpreter_value.cc
+++ /dev/null
@@ -1,377 +0,0 @@
-/* Copyright 2022 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "tools/mlir_interpreter/framework/interpreter_value.h"
-
-#include <complex>
-#include <functional>
-#include <iterator>
-#include <memory>
-#include <string>
-#include <string_view>
-#include <variant>
-
-namespace mlir {
-namespace interpreter {
-
-namespace {
-struct TypeStr {
-  static std::string_view get(bool) { return "i1"; }
-  static std::string_view get(int64_t) { return "i64"; }
-  static std::string_view get(int32_t) { return "i32"; }
-  static std::string_view get(int16_t) { return "i16"; }
-  static std::string_view get(int8_t) { return "i8"; }
-  static std::string_view get(uint64_t) { return "ui64"; }
-  static std::string_view get(uint32_t) { return "ui32"; }
-  static std::string_view get(uint16_t) { return "ui16"; }
-  static std::string_view get(uint8_t) { return "ui8"; }
-  static std::string_view get(float) { return "f32"; }
-  static std::string_view get(double) { return "f64"; }
-  static std::string_view get(std::complex<float>) { return "complex<f32>"; }
-  static std::string_view get(std::complex<double>) { return "complex<f64>"; }
-};
-
-struct InterpreterValuePrinter {
-  llvm::raw_ostream& os;
-
-  template <typename T>
-  void operator()(const TensorOrMemref<T>& t) {
-    if (!t.buffer) {
-      os << "Memref: null";
-      return;
-    }
-
-    if (t.view.isVector) {
-      os << "vector<";
-    } else {
-      os << "TensorOrMemref<";
-    }
-    ArrayRef<int64_t> sizes = t.view.sizes;
-    for (int64_t size : sizes.drop_back(t.view.numVectorDims.value_or(0))) {
-      os << size << "x";
-    }
-    if (t.view.numVectorDims) {
-      os << "vector<";
-      for (int64_t size : sizes.take_back(*t.view.numVectorDims)) {
-        os << size << "x";
-      }
-      os << TypeStr::get(T{}) << ">>: ";
-    } else {
-      os << TypeStr::get(T{}) << ">: ";
-    }
-    SmallVector<int64_t> indices(t.view.rank() +
-                                 t.view.numVectorDims.value_or(0));
-    std::function<void(int64_t)> print;
-    print = [&](int64_t dim) {
-      if (dim == indices.size()) {
-        printScalar(t.at(indices));
-      } else {
-        os << "[";
-        for (int64_t i = 0; i < t.view.sizes[dim]; ++i) {
-          if (i > 0) os << ", ";
-          indices[dim] = i;
-          print(dim + 1);
-        }
-        os << "]";
-      }
-    };
-    if (t.buffer->deallocated()) {
-      os << "<<deallocated>>";
-    } else {
-      print(0);
-    }
-  }
-
-  void operator()(const Tuple& t) {
-    os << "(";
-    bool first = true;
-    for (const auto& v : t.values) {
-      if (!first) os << ", ";
-      first = false;
-      v->print(os);
-    }
-    os << ")";
-  }
-
-  template <typename T>
-  void operator()(const T& t) {
-    os << TypeStr::get(t) << ": ";
-    printScalar(t);
-  }
-
-  template <typename T>
-  void printScalar(const T& v) {
-    os << v;
-  }
-
-  template <typename T>
-  void printScalar(const std::complex<T>& v) {
-    os << v.real() << (v.imag() >= 0 ? "+" : "") << v.imag() << "i";
-  }
-
-  void printScalar(bool v) { os << (v ? "true" : "false"); }
-
-  void printScalar(int8_t v) { os << (int)v; }
-  void printScalar(uint8_t v) { os << (int)v; }
-};
-}  // namespace
-
-void InterpreterValue::print(llvm::raw_ostream& os) const {
-  std::visit(InterpreterValuePrinter{os}, storage);
-}
-
-std::string InterpreterValue::toString() const {
-  std::string buf;
-  llvm::raw_string_ostream os(buf);
-  print(os);
-  return buf;
-}
-
-InterpreterValue InterpreterValue::extractElement(
-    llvm::ArrayRef<int64_t> indices) const {
-  return std::visit(
-      [&](auto& it) -> InterpreterValue {
-        using T = std::decay_t<decltype(it)>;
-        if constexpr (is_tensor_or_memref_v<T>) {
-          if (it.view.numVectorDims) {
-            return {it.vectorAt(indices)};
-          } else {
-            return {it.at(indices)};
-          }
-        } else if constexpr (std::is_same_v<T, Tuple>) {
-          llvm_unreachable("extracting from tuples is unsupported");
-        } else {
-          return {it};
-        }
-      },
-      storage);
-}
-
-void InterpreterValue::insertElement(llvm::ArrayRef<int64_t> indices,
-                                     const InterpreterValue& value) {
-  std::visit(
-      [&](auto& it) {
-        using T = std::decay_t<decltype(it)>;
-        if constexpr (is_tensor_or_memref_v<T>) {
-          if (it.view.numVectorDims) {
-            auto subview = it.vectorAt(indices);
-            const auto& values = std::get<T>(value.storage);
-            assert(values.view.sizes == subview.view.sizes &&
-                   "mismatched sizes");
-            for (const auto& index : subview.view.indices()) {
-              subview.at(index) = values.at(index);
-            }
-          } else {
-            it.at(indices) = std::get<typename T::element_type>(value.storage);
-          }
-        } else if constexpr (std::is_same_v<T, Tuple>) {
-          llvm_unreachable("inserting into tuples is unsupported");
-        } else {
-          it = std::get<T>(value.storage);
-        }
-      },
-      storage);
-}
-
-void InterpreterValue::fill(
-    const std::function<InterpreterValue(llvm::ArrayRef<int64_t> indices)>& f) {
-  std::visit(
-      [&](auto& it) {
-        using T = std::decay_t<decltype(it)>;
-        if constexpr (is_tensor_or_memref_v<T>) {
-          for (const auto& indices : it.view.indices()) {
-            if (it.view.numVectorDims) {
-              auto subview = it.vectorAt(indices);
-              auto value = std::get<T>(f(indices).storage);
-              for (const auto& index : subview.view.indices()) {
-                subview.at(index) = value.at(index);
-              }
-            } else {
-              it.at(indices) =
-                  std::get<typename T::element_type>(f(indices).storage);
-            }
-          }
-        } else if constexpr (std::is_same_v<T, Tuple>) {
-          llvm_unreachable("filling tuples is unsupported");
-        } else {
-          it = std::get<T>(f({}).storage);
-        }
-      },
-      storage);
-}
-
-InterpreterValue InterpreterValue::clone(ArrayRef<int64_t> layout) const {
-  return std::visit(
-      [&](const auto& it) -> InterpreterValue {
-        using T = std::decay_t<decltype(it)>;
-        if constexpr (is_tensor_or_memref_v<T>) {
-          return {it.clone(layout)};
-        } else if constexpr (std::is_same_v<T, Tuple>) {
-          llvm_unreachable("cloning tuples is unsupported");
-        } else {
-          return {it};
-        }
-      },
-      storage);
-}
-
-InterpreterValue InterpreterValue::coerceLayout(
-    ArrayRef<int64_t> layout) const {
-  const auto& view = this->view();
-  if (view.strides == BufferView::getStridesForLayout(view.sizes, layout)) {
-    return *this;
-  }
-  return clone(layout);
-}
-
-InterpreterValue InterpreterValue::typedAlike(
-    llvm::ArrayRef<int64_t> shape) const {
-  return std::visit(
-      [&](const auto& it) -> InterpreterValue {
-        using T = std::decay_t<decltype(it)>;
-        if constexpr (is_tensor_or_memref_v<T>) {
-          return {T::empty(shape)};
-        } else if constexpr (std::is_same_v<T, Tuple>) {
-          llvm_unreachable("TypedAlike for tuples is unsupported");
-        } else {
-          return {TensorOrMemref<T>::empty(shape)};
-        }
-      },
-      storage);
-}
-
-InterpreterValue InterpreterValue::makeTensor(mlir::Type elementType,
-                                              SmallVector<int64_t> shape) {
-  auto vectorTy = llvm::dyn_cast<VectorType>(elementType);
-  if (vectorTy) {
-    llvm::copy(vectorTy.getShape(), std::back_inserter(shape));
-  }
-  return dispatchScalarType(elementType, [&](auto dummy) -> InterpreterValue {
-    auto tensor = TensorOrMemref<decltype(dummy)>::empty(shape);
-    if (vectorTy) {
-      tensor.view.numVectorDims = vectorTy.getRank();
-    }
-    return {tensor};
-  });
-}
-
-BufferView& InterpreterValue::view() {
-  return std::visit(
-      [](auto& it) -> BufferView& {
-        if constexpr (is_tensor_or_memref_v<decltype(it)>) {
-          return it.view;
-        }
-        llvm_unreachable("view is only supported for tensors");
-      },
-      storage);
-}
-
-const BufferView& InterpreterValue::view() const {
-  return std::visit(
-      [](const auto& it) -> const BufferView& {
-        if constexpr (is_tensor_or_memref_v<decltype(it)>) {
-          return it.view;
-        }
-        llvm_unreachable("view is only supported for tensors");
-      },
-      storage);
-}
-
-bool InterpreterValue::isTensor() const {
-  return std::visit(
-      [](const auto& it) { return is_tensor_or_memref_v<decltype(it)>; },
-      storage);
-}
-
-InterpreterValue InterpreterValue::asUnitTensor(bool isVector) const {
-  auto result = typedAlike({});
-  result.insertElement({}, *this);
-  result.view().isVector = isVector;
-  return result;
-}
-
-bool Tuple::operator==(const Tuple& other) const {
-  if (other.values.size() != values.size()) return false;
-  for (const auto& [lhs, rhs] : llvm::zip(values, other.values)) {
-    if (!(*lhs == *rhs)) return false;
-  }
-  return true;
-}
-
-std::shared_ptr<Buffer> InterpreterValue::buffer() const {
-  return std::visit(
-      [](const auto& it) -> std::shared_ptr<Buffer> {
-        if constexpr (is_tensor_or_memref_v<decltype(it)>) {
-          return it.buffer;
-        } else {
-          llvm_unreachable("buffer() is only supported for tensors");
-        }
-      },
-      storage);
-}
-
-int64_t InterpreterValue::asInt() const {
-  auto visit = [](auto value) -> int64_t {
-    if constexpr (std::is_integral_v<decltype(value)>) {
-      return static_cast<int64_t>(value);
-    } else {
-      llvm_unreachable("only integral types can be converted to ints");
-    }
-  };
-  return std::visit(visit, storage);
-}
-
-uint64_t InterpreterValue::asUInt() const {
-  auto visit = [](auto value) -> uint64_t {
-    if constexpr (std::is_integral_v<decltype(value)>) {
-      if constexpr (std::is_signed_v<decltype(value)>) {
-        return static_cast<uint64_t>(
-            static_cast<std::make_unsigned_t<decltype(value)>>(value));
-      } else {
-        return static_cast<uint64_t>(value);
-      }
-    } else {
-      llvm_unreachable("only integral types can be converted to ints");
-    }
-  };
-  return std::visit(visit, storage);
-}
-
-double InterpreterValue::asDouble() const {
-  auto visit = [](auto value) -> int64_t {
-    if constexpr (std::is_floating_point_v<decltype(value)>) {
-      return static_cast<double>(value);
-    } else {
-      llvm_unreachable("only float types can be converted to ints");
-    }
-  };
-  return std::visit(visit, storage);
-}
-
-int64_t InterpreterValue::getByteSizeOfElement() const {
-  return std::visit(
-      [](const auto& it) -> int64_t {
-        using T = std::decay_t<decltype(it)>;
-        if constexpr (is_tensor_or_memref_v<T>) {
-          return sizeof(typename T::element_type);
-        } else {
-          llvm_unreachable("scalars have no element sizes");
-        }
-      },
-      storage);
-}
-
-}  // namespace interpreter
-}  // namespace mlir
diff --git a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/framework/interpreter_value.h b/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/framework/interpreter_value.h
deleted file mode 100644
index 3af65a0a451715..00000000000000
--- a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/framework/interpreter_value.h
+++ /dev/null
@@ -1,226 +0,0 @@
-/* Copyright 2022 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef MLIR_HLO_TOOLS_MLIR_INTERPRETER_FRAMEWORK_INTERPRETER_VALUE_H_
-#define MLIR_HLO_TOOLS_MLIR_INTERPRETER_FRAMEWORK_INTERPRETER_VALUE_H_
-
-#include <complex>
-#include <cstddef>
-#include <functional>
-#include <iterator>
-#include <memory>
-#include <optional>
-#include <string>
-#include <variant>
-
-#include "llvm/ADT/ArrayRef.h"
-#include "llvm/Support/raw_ostream.h"
-#include "mlir/IR/BuiltinTypes.h"
-#include "mlir/IR/TypeUtilities.h"
-#include "mlir/IR/Types.h"
-#include "mlir/Support/LLVM.h"
-#include "tools/mlir_interpreter/framework/tensor_or_memref.h"
-
-namespace mlir {
-namespace interpreter {
-
-struct InterpreterValue;
-
-struct Tuple {
-  bool operator==(const Tuple& other) const;
-
-  SmallVector<std::shared_ptr<InterpreterValue>> values;
-};
-
-// Holds a scalar, a tensor/memref or a tuple. Tensors/memrefs can also
-// represent vectors.
-struct InterpreterValue {
-  void print(llvm::raw_ostream& os) const;
-  std::string toString() const;
-
-  // Returns the element at the given indices. If the value is a scalar, returns
-  // itself.
-  InterpreterValue extractElement(llvm::ArrayRef<int64_t> indices) const;
-  // Sets the element at the given index. If the value is a scalar, sets its
-  // value.
-  void insertElement(llvm::ArrayRef<int64_t> indices,
-                     const InterpreterValue& value);
-  // Initializes all elements of the underlying tensor.
-  void fill(
-      const std::function<InterpreterValue(llvm::ArrayRef<int64_t> indices)>&
-          f);
-
-  // Converts a scalar to a unit tensor or vector.
-  InterpreterValue asUnitTensor(bool isVector = false) const;
-  // For integral interpreter values, casts them to int64.
-  int64_t asInt() const;
-  // For integral interpreter values, first casts them to the unsigned integer
-  // type of the same size, and then to uint64. For example, the result for
-  // int8_t{-1} is 255.
-  uint64_t asUInt() const;
-  // For floating point scalars, casts them to double.
-  double asDouble() const;
-  // Must be a tensor or memref.
-  int64_t getByteSizeOfElement() const;
-
-  // Creates a new tensor InterpreterValue (backed a new buffer) with the same
-  // elementtype as this, but a different shape. If this is not a tensor, it is
-  // used as the element type.
-  // If `layout` is empty, the clone uses the default layout.
-  InterpreterValue clone(ArrayRef<int64_t> layout = {}) const;
-  // Returns either this tensor InterpreterValue (if its layout matches the
-  // requested layout) or a clone.
-  InterpreterValue coerceLayout(llvm::ArrayRef<int64_t> layout) const;
-  // Returns a tensor interpreter value with a newly allocated buffer of the
-  // given shape, with a default layout and the same element type as this
-  // interpreter value.
-  InterpreterValue typedAlike(llvm::ArrayRef<int64_t> shape) const;
-
-  // Creates a tensor with the given element type and shape. `element_type` may
-  // be a vector type, in which case the shape only specifies the non-vector
-  // dimensions.
-  static InterpreterValue makeTensor(mlir::Type elementType,
-                                     SmallVector<int64_t> shape);
-
-  // Returns the underlying tensor's view. Must be a tensor.
-  BufferView& view();
-  const BufferView& view() const;
-  // Returns the underlying tensor's buffer. Must be a tensor.
-  std::shared_ptr<Buffer> buffer() const;
-
-  bool isTensor() const;
-
-  bool operator==(const InterpreterValue& other) const {
-    if (storage.index() != other.storage.index()) return false;
-    if (isTensor() || std::holds_alternative<Tuple>(storage))
-      return storage == other.storage;
-    // Tensors treat NaNs as equal, so just wrap the values.
-    return asUnitTensor() == other.asUnitTensor();
-  }
-
-  std::variant<
-      Tuple, bool, float, double, uint8_t, int8_t, uint16_t, int16_t, uint32_t,
-      int32_t, uint64_t, int64_t, std::complex<float>, std::complex<double>,
-      TensorOrMemref<bool>, TensorOrMemref<float>, TensorOrMemref<double>,
-      TensorOrMemref<uint8_t>, TensorOrMemref<int8_t>, TensorOrMemref<uint16_t>,
-      TensorOrMemref<int16_t>, TensorOrMemref<uint32_t>,
-      TensorOrMemref<int32_t>, TensorOrMemref<uint64_t>,
-      TensorOrMemref<int64_t>, TensorOrMemref<std::complex<float>>,
-      TensorOrMemref<std::complex<double>>>
-      storage;
-};
-
-template <typename T>
-constexpr static bool is_valid_interpreter_value_v =  // NOLINT
-    std::is_constructible_v<decltype(InterpreterValue::storage), T>;
-
-// Attempts to cast the given value to the requested type, returning nullopt if
-// no cast is possible. This allows casts to the concrete type of the value
-// (e.g. an `InterpreterValue` containing a `Tuple` can be cast to `Tuple`),
-// casts from a unit tensor to their contents, and casts of scalars to any
-// convertible type.
-// NOTE: When casting to an unsigned type, this behaves differently than
-// InterpreterValue::AsUint. That function preserves the content's bit width,
-// so InterpreterValueDynCast<uint64_t>({int8_t{-1}}) will return 2^64-1,
-// whereas asUInt will return 255.
-template <typename T>
-std::optional<T> interpreterValueDynCast(InterpreterValue v) {
-  if constexpr (std::is_same_v<T, InterpreterValue>) {
-    return v;
-  }
-  if constexpr (is_valid_interpreter_value_v<T>) {
-    if (std::holds_alternative<T>(v.storage)) {
-      return std::get<T>(v.storage);
-    }
-  }
-  if (v.isTensor() && !is_tensor_or_memref_v<T>) {
-    if (v.view().getNumElements() != 1) {
-      return std::nullopt;
-    }
-    return interpreterValueDynCast<T>(v.extractElement({}));
-  }
-  return std::visit(
-      [](auto v) -> std::optional<T> {
-        if constexpr (std::is_convertible_v<decltype(v), T>) {
-          return v;
-        } else {
-          return std::nullopt;
-        }
-      },
-      v.storage);
-}
-
-template <typename T>
-T interpreterValueCast(InterpreterValue v) {
-  auto ret = interpreterValueDynCast<T>(v);
-  assert(ret && "cast failed");
-  return *std::move(ret);
-}
-
-// Calls functor with a value of the C++ type corresponding to the given `Type`,
-// (or its element type).
-template <class Fn>
-auto dispatchScalarType(mlir::Type ty, Fn&& functor) {
-  ty = getElementTypeOrSelf(ty);
-  if (ty.isF32()) {
-    return functor(float{});
-  }
-  if (ty.isF64()) {
-    return functor(double{});
-  }
-  if (ty.isUnsignedInteger(64)) {
-    return functor(uint64_t{});
-  }
-  if (ty.isInteger(64) || ty.isIndex()) {
-    return functor(int64_t{});
-  }
-  if (ty.isUnsignedInteger(32)) {
-    return functor(uint32_t{});
-  }
-  if (ty.isInteger(32)) {
-    return functor(int32_t{});
-  }
-  if (ty.isUnsignedInteger(16)) {
-    return functor(uint16_t{});
-  }
-  if (ty.isInteger(16)) {
-    return functor(int16_t{});
-  }
-  if (ty.isUnsignedInteger(8)) {
-    return functor(uint8_t{});
-  }
-  if (ty.isInteger(8)) {
-    return functor(int8_t{});
-  }
-  if (ty.isInteger(1)) {
-    return functor(bool{});
-  }
-  if (auto complex = ty.dyn_cast<ComplexType>()) {
-    if (complex.getElementType().isF32()) {
-      return functor(std::complex<float>{});
-    }
-    if (complex.getElementType().isF64()) {
-      return functor(std::complex<double>{});
-    }
-  }
-
-  llvm::errs() << "DispatchScalarType unimplemented for " << ty << "\n";
-  llvm_unreachable("unimplemented");
-}
-
-}  // namespace interpreter
-}  // namespace mlir
-
-#endif  // MLIR_HLO_TOOLS_MLIR_INTERPRETER_FRAMEWORK_INTERPRETER_VALUE_H_
diff --git a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/framework/interpreter_value_util.h b/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/framework/interpreter_value_util.h
deleted file mode 100644
index c2f02352e82872..00000000000000
--- a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/framework/interpreter_value_util.h
+++ /dev/null
@@ -1,184 +0,0 @@
-/* Copyright 2022 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef MLIR_HLO_TOOLS_MLIR_INTERPRETER_FRAMEWORK_INTERPRETER_VALUE_UTIL_H_
-#define MLIR_HLO_TOOLS_MLIR_INTERPRETER_FRAMEWORK_INTERPRETER_VALUE_UTIL_H_
-
-#include <complex>
-#include <type_traits>
-#include <utility>
-
-#include "tools/mlir_interpreter/framework/interpreter_value.h"
-
-namespace mlir {
-namespace interpreter {
-namespace detail {
-
-template <typename T>
-struct is_complex : std::false_type {};  // NOLINT
-
-template <typename T>
-struct is_complex<std::complex<T>> : std::true_type {};  // NOLINT
-
-template <typename Fn>
-struct InterpreterValueMapVisitor {
-  template <typename T>
-  InterpreterValue operator()(const TensorOrMemref<T>& t) {
-    if constexpr (Fn::template supportedType<T>()) {
-      using out_elem_t = decltype(Fn::apply(T()));
-      auto out = TensorOrMemref<out_elem_t>::emptyLike(t.view);
-      for (const auto& index : out.view.indices(true)) {
-        out.at(index) = Fn::apply(t.at(index));
-      }
-      return {out};
-    } else {
-      llvm::errs() << llvm::getTypeName<Fn>()
-                   << " unsupported type: " << llvm::getTypeName<T>() << "\n";
-      llvm_unreachable("unsupported type");
-    }
-  }
-
-  InterpreterValue operator()(const Tuple& t) {
-    Tuple out;
-    for (const auto& value : t.values) {
-      out.values.push_back(std::make_unique<InterpreterValue>(
-          std::move(std::visit(*this, value->storage))));
-    }
-    return {out};
-  }
-
-  template <typename T>
-  InterpreterValue operator()(const T& t) {
-    if constexpr (Fn::template supportedType<T>()) {
-      return {Fn::apply(t)};
-    } else {
-      llvm::errs() << llvm::getTypeName<Fn>()
-                   << " unsupported type: " << llvm::getTypeName<T>() << "\n";
-      llvm_unreachable("unsupported type");
-    }
-  }
-};
-
-template <typename Fn>
-struct InterpreterValueBiMapVisitor {
-  const InterpreterValue& rhs;
-
-  template <typename T>
-  InterpreterValue operator()(const TensorOrMemref<T>& lhsT) {
-    if constexpr (Fn::template supportedType<T>()) {
-      using OutElemT = decltype(Fn::apply(T(), T()));
-      auto out = TensorOrMemref<OutElemT>::emptyLike(lhsT.view);
-      const auto& rhsT = std::get<TensorOrMemref<T>>(rhs.storage);
-      for (const auto& index : out.view.indices(true)) {
-        out.at(index) = Fn::apply(lhsT.at(index), rhsT.at(index));
-      }
-      return {out};
-    } else {
-      llvm::errs() << llvm::getTypeName<Fn>()
-                   << " unsupported type: " << llvm::getTypeName<T>() << "\n";
-      llvm_unreachable("unsupported type");
-    }
-  }
-
-  InterpreterValue operator()(const Tuple& lhsT) {
-    const auto& rhsT = std::get<Tuple>(rhs.storage);
-    Tuple out;
-    for (const auto& [lhs_v, rhs_v] : llvm::zip(lhsT.values, rhsT.values)) {
-      out.values.push_back(std::make_unique<InterpreterValue>(std::move(
-          std::visit(InterpreterValueBiMapVisitor{*rhs_v}, lhs_v->storage))));
-    }
-    return {std::move(out)};
-  }
-
-  template <typename T>
-  InterpreterValue operator()(const T& t) {
-    if constexpr (Fn::template supportedType<T>()) {
-      return {Fn::apply(t, std::get<T>(rhs.storage))};
-    } else {
-      llvm::errs() << llvm::getTypeName<Fn>()
-                   << " unsupported type: " << llvm::getTypeName<T>() << "\n";
-      llvm_unreachable("unsupported type");
-    }
-  }
-};
-
-}  // namespace detail
-
-template <typename T>
-inline constexpr bool is_complex_v = detail::is_complex<T>::value;  // NOLINT
-
-template <bool allow_bools, bool allow_ints, bool allow_floats,
-          bool allow_complex, bool allow_unsigned = true>
-struct FilterMapTraits {
-  template <typename T>
-  static constexpr bool supportedType() {
-    constexpr bool isBool = std::is_same_v<T, bool>;
-    constexpr bool isInt = std::is_integral_v<T> && !isBool;
-    constexpr bool isUnsigned = std::is_unsigned_v<T>;
-    return (allow_bools && isBool) ||
-           (allow_ints && isInt && (allow_unsigned || !isUnsigned)) ||
-           (allow_floats && std::is_floating_point_v<T>) ||
-           (allow_complex && is_complex_v<T>);
-  }
-};
-
-using CwiseAll = FilterMapTraits<true, true, true, true>;
-using CwiseArith = FilterMapTraits<false, true, true, true>;
-using CwiseComplex = FilterMapTraits<false, false, false, true>;
-using CwiseFloat = FilterMapTraits<false, false, true, false>;
-using CwiseInt = FilterMapTraits<false, true, false, false>;
-using CwiseIntegral = FilterMapTraits<true, true, false, false>;
-using CwiseNonIntegral = FilterMapTraits<false, false, true, true>;
-using CwiseReal = FilterMapTraits<false, true, true, false>;
-using CwiseSignedOrComplex = FilterMapTraits<false, true, true, true, false>;
-using CwiseSigned = FilterMapTraits<false, true, true, false, false>;
-
-template <typename Fn>
-InterpreterValue applyCwiseMap(const InterpreterValue& value) {
-  return std::visit(detail::InterpreterValueMapVisitor<Fn>{}, value.storage);
-}
-
-template <typename Fn>
-InterpreterValue applyCwiseBinaryMap(const InterpreterValue& lhs,
-                                     const InterpreterValue& rhs) {
-  assert(lhs.storage.index() == rhs.storage.index());
-  return std::visit(detail::InterpreterValueBiMapVisitor<Fn>{rhs}, lhs.storage);
-}
-
-// Unboxes (and casts if necessary) the given interpreter values. Asserts if the
-// types are incompatible.
-template <typename T>
-SmallVector<T> unpackInterpreterValues(ArrayRef<InterpreterValue> values) {
-  SmallVector<T> result;
-  for (const auto& value : values) {
-    result.push_back(interpreterValueCast<T>(value));
-  }
-  return result;
-}
-
-// Boxes the given values in InterpreterValues.
-template <typename T>
-SmallVector<InterpreterValue> packInterpreterValues(ArrayRef<T> values) {
-  SmallVector<InterpreterValue> result;
-  for (const auto& value : values) {
-    result.push_back({value});
-  }
-  return result;
-}
-
-}  // namespace interpreter
-}  // namespace mlir
-
-#endif  // MLIR_HLO_TOOLS_MLIR_INTERPRETER_FRAMEWORK_INTERPRETER_VALUE_UTIL_H_
diff --git a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/framework/registration.cc b/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/framework/registration.cc
deleted file mode 100644
index 8481f8a7bbda45..00000000000000
--- a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/framework/registration.cc
+++ /dev/null
@@ -1,120 +0,0 @@
-/* Copyright 2023 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "tools/mlir_interpreter/framework/registration.h"
-
-#include <functional>
-#include <utility>
-
-namespace mlir {
-namespace interpreter {
-namespace detail {
-namespace {
-
-// Aliases and function names are wrapped in functions because function
-// registrations are called from static initializers, whose execution order is
-// undefined.
-DenseMap<llvm::StringRef, llvm::StringRef>& getOpAliases() {
-  static DenseMap<llvm::StringRef, llvm::StringRef>* aliases = nullptr;
-  if (!aliases) {
-    aliases = new DenseMap<llvm::StringRef, llvm::StringRef>();
-  }
-  return *aliases;
-}
-
-DenseMap<llvm::StringRef, InterpreterFunction>& getFunctions() {
-  static DenseMap<llvm::StringRef, InterpreterFunction>* functions = nullptr;
-  if (!functions) {
-    functions = new DenseMap<llvm::StringRef, InterpreterFunction>();
-  }
-  return *functions;
-}
-
-}  // namespace
-
-InterpreterFunction getFunction(llvm::StringRef name) {
-  const auto& fns = getFunctions();
-  auto fn = fns.find(name);
-  if (fn != fns.end()) {
-    return fn->second;
-  }
-  const auto& aliases = getOpAliases();
-  auto alias = aliases.find(name);
-  if (alias != aliases.end()) {
-    return fns.find(alias->second)->second;
-  }
-  return nullptr;
-}
-
-void registerInterpreterOp(llvm::StringRef name,
-                           InterpreterValue (*fn)(const InterpreterValue&)) {
-  registerInterpreterOp(
-      name,
-      [fn](MutableArrayRef<InterpreterValue> operands, mlir::Operation*,
-           InterpreterState&) -> SmallVector<InterpreterValue> {
-        assert(operands.size() == 1 && "unexpected number of operands");
-        return {fn(operands[0])};
-      });
-}
-
-void registerInterpreterOp(llvm::StringRef name,
-                           InterpreterValue (*fn)(const InterpreterValue&,
-                                                  const InterpreterValue&)) {
-  registerInterpreterOp(
-      name,
-      [fn](MutableArrayRef<InterpreterValue> operands, mlir::Operation*,
-           InterpreterState&) -> SmallVector<InterpreterValue> {
-        assert(operands.size() == 2 && "unexpected number of operands");
-        return {fn(operands[0], operands[1])};
-      });
-}
-
-void registerInterpreterOp(
-    llvm::StringRef name,
-    InterpreterValue (*fn)(MutableArrayRef<InterpreterValue>)) {
-  registerInterpreterOp(
-      name,
-      [fn](MutableArrayRef<InterpreterValue> operands, mlir::Operation*,
-           InterpreterState&) -> SmallVector<InterpreterValue> {
-        return {fn(operands)};
-      });
-}
-
-void registerInterpreterOp(llvm::StringRef name,
-                           void (*fn)(MutableArrayRef<InterpreterValue>)) {
-  registerInterpreterOp(
-      name,
-      [fn](MutableArrayRef<InterpreterValue> operands, mlir::Operation*,
-           InterpreterState&) -> SmallVector<InterpreterValue> {
-        fn(operands);
-        return {};
-      });
-}
-
-void registerInterpreterOp(
-    llvm::StringRef name,
-    std::function<llvm::SmallVector<InterpreterValue>(
-        MutableArrayRef<InterpreterValue>, mlir::Operation*, InterpreterState&)>
-        fn) {
-  getFunctions()[name] = std::move(fn);
-}
-
-void registerInterpreterOp(llvm::StringRef name, llvm::StringRef original) {
-  getOpAliases()[name] = original;
-}
-
-}  // namespace detail
-}  // namespace interpreter
-}  // namespace mlir
diff --git a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/framework/registration.h b/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/framework/registration.h
deleted file mode 100644
index abf6435cd6dec3..00000000000000
--- a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/framework/registration.h
+++ /dev/null
@@ -1,225 +0,0 @@
-/* Copyright 2023 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef MLIR_HLO_TOOLS_MLIR_INTERPRETER_FRAMEWORK_REGISTRATION_H_
-#define MLIR_HLO_TOOLS_MLIR_INTERPRETER_FRAMEWORK_REGISTRATION_H_
-
-#include <functional>
-#include <optional>
-#include <type_traits>
-
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/StringRef.h"
-#include "llvm/Support/TypeName.h"
-#include "mlir/Support/LLVM.h"
-#include "absl/strings/str_cat.h"
-#include "tools/mlir_interpreter/framework/interpreter.h"
-#include "tools/mlir_interpreter/framework/interpreter_value.h"
-#include "tools/mlir_interpreter/framework/interpreter_value_util.h"
-
-#define MLIR_INTERPRETER_CONCAT_IMPL(x, y) x##y
-#define MLIR_INTERPRETER_CONCAT(x, y) MLIR_INTERPRETER_CONCAT_IMPL(x, y)
-#define REGISTER_MLIR_INTERPRETER_OP(args...)                     \
-  static int MLIR_INTERPRETER_CONCAT(init_, __COUNTER__) = []() { \
-    ::mlir::interpreter::detail::registerInterpreterOp(args);     \
-    return 1;                                                     \
-  }();
-
-namespace mlir {
-namespace interpreter {
-namespace detail {
-
-// The generic signature for interpreter functions. Typically the type-checked
-// form should be used instead.
-using InterpreterFunction = std::function<SmallVector<InterpreterValue>(
-    MutableArrayRef<InterpreterValue>, mlir::Operation*, InterpreterState&)>;
-
-// Returns the given registered function, or nullptr if not found.
-InterpreterFunction getFunction(llvm::StringRef name);
-
-// Simple unary ops.
-void registerInterpreterOp(llvm::StringRef name,
-                           InterpreterValue (*fn)(const InterpreterValue&));
-
-// Simple binary ops.
-void registerInterpreterOp(llvm::StringRef name,
-                           InterpreterValue (*fn)(const InterpreterValue&,
-                                                  const InterpreterValue&));
-
-template <typename T>
-struct is_optional : std::false_type {};  // NOLINT
-
-template <typename T>
-struct is_optional<std::optional<T>> : std::true_type {};  // NOLINT
-
-// Converts the given arguments to the requested type. Supported target types:
-// - InterpreterValue storage types (e.g. uint8_t, TensorOrMemref<float>).
-//   Scalars will be cast if necessary.
-// - std::optional of InterpreterValue storage types
-// - ArrayRef of InterpreterValue storage types. Will cast if necessary (e.g.
-//   int32_t -> int64_t).
-// - no-op conversions: InterpreterValue, ArrayRef<InterpreterValue>
-template <typename ArgT>
-auto typedInterpreterOpConvertArg(MutableArrayRef<InterpreterValue> args,
-                                  InterpreterState& state) {
-  constexpr bool optional = is_optional<ArgT>::value;
-  constexpr bool single = std::is_same_v<ArgT, InterpreterValue> ||
-                          is_valid_interpreter_value_v<ArgT>;
-  if constexpr (optional) {
-    if (args.empty()) {
-      return ArgT{};
-    }
-  }
-
-  if constexpr (single || optional) {
-    if (args.size() != 1) {
-      state.addFailure("Expected a single argument for the operand");
-      return ArgT{};
-    }
-  }
-
-  auto fail = [&]() {
-    state.addFailure(absl::StrCat("Unable to convert argument (variant index ",
-                                  args[0].storage.index(), ") to ",
-                                  llvm::getTypeName<ArgT>().str()));
-    return ArgT{};
-  };
-
-  if constexpr (single) {
-    if (auto arg = interpreterValueDynCast<ArgT>(args[0])) {
-      return ArgT{*arg};
-    }
-    return fail();
-  } else if constexpr (optional) {
-    using T = std::decay_t<decltype(*std::declval<ArgT>())>;
-    if (auto arg = interpreterValueDynCast<T>(args[0])) {
-      return arg;
-    }
-    return fail();
-  } else {
-    using E = std::decay_t<decltype(*std::declval<ArgT>().begin())>;
-    // Container argument types (e.g. MutableArrayRef<InterpreterValue>,
-    // ArrayRef<int64_t>).
-    if constexpr (std::is_same_v<E, InterpreterValue>) {
-      return ArgT{args};
-    } else {
-      // Note: we don't cast to ArgT here, because that's typically an ArrayRef,
-      // which would lead to returning a reference to a temporary.
-      return unpackInterpreterValues<E>(args);
-    }
-  }
-}
-
-// Converts the given return value. Supported target types:
-// - InterpreterValue (no-op conversion)
-// - InterpreterValue storage types (the value will be boxed)
-// - SmallVector<InterpreterValue>
-template <typename RetT>
-SmallVector<InterpreterValue> typedInterpreterOpConvertRet(RetT ret) {
-  if constexpr (std::is_same_v<RetT, InterpreterValue>) {
-    return {ret};
-  } else if constexpr (is_valid_interpreter_value_v<RetT>) {
-    return {InterpreterValue{ret}};
-  } else if constexpr (std::is_same_v<RetT, SmallVector<InterpreterValue>>) {
-    return ret;
-  } else {
-    using E = std::decay_t<decltype(*std::declval<RetT>().begin())>;
-    return packInterpreterValues(ArrayRef<E>(ret));
-  }
-}
-
-// Adapts the given function to the generic handler signature
-// (SmallVector<InterpreterValue>(MutableArrayRef<InterpreterValue>, Operation*,
-// InterpreterState&)).
-// See the function below for usage.
-template <typename Op, typename Ret, typename... T, size_t... Indices>
-void registerTypedInterpreterOpImpl(Ret (*fn)(InterpreterState&, Op, T... args),
-                                    std::index_sequence<Indices...>) {
-  registerInterpreterOp(
-      Op::getOperationName(),
-      [fn](MutableArrayRef<InterpreterValue> args, mlir::Operation* op,
-           InterpreterState& state) -> SmallVector<InterpreterValue> {
-        auto cast = llvm::dyn_cast<Op>(op);
-        if (!cast) {
-          state.addFailure(absl::StrCat(
-              "failed to cast op '", op->getName().getStringRef().str(),
-              "' to expected type (", llvm::getTypeName<Op>().str(), ")"));
-          return {};
-        }
-        int64_t usedArgs = 0;
-        for (auto i : llvm::seq(0ul, sizeof...(T))) {
-          usedArgs += cast.getODSOperandIndexAndLength(i).second;
-        }
-        if (args.size() != usedArgs) {
-          state.addFailure("Op handler did not use all arguments");
-          return {};
-        }
-
-        auto extractArg = [&](auto index, auto* dummy) {
-          auto [pos, length] = cast.getODSOperandIndexAndLength(index);
-          using ArgT = std::decay_t<decltype(*dummy)>;
-          return typedInterpreterOpConvertArg<ArgT>(args.slice(pos, length),
-                                                    state);
-        };
-
-        if constexpr (std::is_same_v<Ret, void>) {
-          fn(state, cast, extractArg(Indices, (std::decay_t<T>*){nullptr})...);
-          return {};
-        } else {
-          Ret ret = fn(state, cast,
-                       extractArg(Indices, (std::decay_t<T>*){nullptr})...);
-          return typedInterpreterOpConvertRet(ret);
-        }
-      });
-}
-
-// registers the given function. The function should take one argument per
-// Op operand, in the same order as the Op.
-// The argument types should match the operation's operand types:
-// - Variadic<...> becomes ArrayRef<...>
-// - Optional<...> becomes std::optional<...>
-// - Unboxing is optionally supported, e.g. an Optional<Index> operand can be
-//   passed to either a std::optional<int64_t> or a
-//   std::optional<InterpreterValue>.
-// Valid return types are InterpreterValue, SmallVector<InterpreterValue>, void,
-// and any type boxable in an InterpreterValue.
-template <typename Op, typename Ret, typename... T>
-void registerInterpreterOp(Ret (*fn)(InterpreterState&, Op, T...)) {
-  registerTypedInterpreterOpImpl(fn, std::index_sequence_for<T...>{});
-}
-
-// Simple variadic ops (single output).
-void registerInterpreterOp(
-    llvm::StringRef name,
-    InterpreterValue (*fn)(MutableArrayRef<InterpreterValue>));
-
-// Simple variadic ops(no output).
-void registerInterpreterOp(llvm::StringRef name,
-                           void (*fn)(MutableArrayRef<InterpreterValue>));
-
-// Generic ops.
-void registerInterpreterOp(
-    llvm::StringRef name,
-    std::function<llvm::SmallVector<InterpreterValue>(
-        MutableArrayRef<InterpreterValue>, mlir::Operation*, InterpreterState&)>
-        fn);
-
-void registerInterpreterOp(llvm::StringRef name, llvm::StringRef original);
-
-}  // namespace detail
-}  // namespace interpreter
-}  // namespace mlir
-
-#endif  // MLIR_HLO_TOOLS_MLIR_INTERPRETER_FRAMEWORK_REGISTRATION_H_
diff --git a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/framework/tensor_or_memref.cc b/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/framework/tensor_or_memref.cc
deleted file mode 100644
index 0b364a5beeccae..00000000000000
--- a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/framework/tensor_or_memref.cc
+++ /dev/null
@@ -1,157 +0,0 @@
-/* Copyright 2022 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "tools/mlir_interpreter/framework/tensor_or_memref.h"
-
-#include <algorithm>
-#include <iostream>
-#include <numeric>
-#include <optional>
-#include <utility>
-
-#include "mlir/Dialect/Utils/IndexingUtils.h"
-
-namespace mlir {
-namespace interpreter {
-
-std::optional<int64_t> BufferView::getPhysicalIndex(
-    llvm::ArrayRef<int64_t> viewIndices) const {
-  int64_t result = offset;
-  if (!inBounds(viewIndices)) {
-    return std::nullopt;
-  }
-  for (int64_t i = 0; i < viewIndices.size(); ++i) {
-    result += viewIndices[i] * strides[i];
-  }
-  return result;
-}
-
-bool BufferView::inBounds(llvm::ArrayRef<int64_t> viewIndices) const {
-  if (viewIndices.size() > sizes.size()) return false;
-  for (auto [index, size] : llvm::zip(viewIndices, sizes)) {
-    if (index < 0 || index >= size) return false;
-  }
-  return true;
-}
-
-SmallVector<int64_t> BufferView::getDefaultStrides(ArrayRef<int64_t> sizes) {
-  SmallVector<int64_t> result(sizes.size());
-  int64_t stride = 1;
-  for (int64_t i = result.size() - 1; i >= 0; --i) {
-    result[i] = stride;
-    stride *= sizes[i];
-  }
-  return result;
-}
-
-SmallVector<int64_t> BufferView::getStridesForLayout(ArrayRef<int64_t> sizes,
-                                                     ArrayRef<int64_t> layout) {
-  if (layout.empty()) return getDefaultStrides(sizes);
-  auto inverseLayout = invertPermutationVector(layout);
-  SmallVector<int64_t> result(sizes.size());
-  int64_t stride = 1;
-  for (int64_t i = 0; i < layout.size(); ++i) {
-    result[inverseLayout[i]] = stride;
-    stride *= sizes[inverseLayout[i]];
-  }
-  return result;
-}
-
-LogicalResult BufferView::slice(int64_t dimIndex, int64_t dimOffset) {
-  llvm::SmallVector<int64_t> offsets(rank(), 0);
-  offsets[dimIndex] = dimOffset;
-  if (auto newOffset = getPhysicalIndex(offsets)) {
-    offset = *newOffset;
-  } else {
-    return failure();
-  }
-  if (dimIndex >= rank()) --*numVectorDims;
-  strides.erase(strides.begin() + dimIndex);
-  sizes.erase(sizes.begin() + dimIndex);
-  return success();
-}
-
-LogicalResult BufferView::slice(int64_t dimIndex, int64_t dimOffset,
-                                int64_t dimSize, int64_t dimStride) {
-  llvm::SmallVector<int64_t> offsets(rank(), 0);
-  offsets[dimIndex] = dimOffset;
-  if (dimSize == 0) {
-    offset = 0;
-  } else if (auto newOffset = getPhysicalIndex(offsets)) {
-    offset = *newOffset;
-  } else {
-    return failure();
-  }
-  sizes[dimIndex] = dimSize;
-  strides[dimIndex] *= dimStride;
-  return success();
-}
-
-LogicalResult BufferView::subview(ArrayRef<int64_t> subviewOffsets,
-                                  ArrayRef<int64_t> subviewSizes,
-                                  ArrayRef<int64_t> subviewStrides) {
-  if (auto newOffset = getPhysicalIndex(subviewOffsets)) {
-    offset = *newOffset;
-  } else {
-    return failure();
-  }
-
-  for (auto [inSize, subview_offset, subview_size, subview_stride] :
-       llvm::zip(sizes, subviewOffsets, subviewSizes, subviewStrides)) {
-    int64_t limitIndex = subview_offset + (subview_size - 1) * subview_stride;
-    if (subview_offset < 0 || subview_offset >= inSize || limitIndex < 0 ||
-        limitIndex >= inSize) {
-      return failure();
-    }
-  }
-
-  for (auto [in_stride, subview_stride] : llvm::zip(strides, subviewStrides)) {
-    in_stride *= subview_stride;
-  }
-  sizes = llvm::to_vector(subviewSizes);
-  return success();
-}
-
-int64_t BufferView::getNumElements(bool includeVectorDims) const {
-  size_t n = 1;
-  for (auto size : ArrayRef<int64_t>(sizes).drop_back(
-           includeVectorDims ? 0 : numVectorDims.value_or(0)))
-    n *= size;
-  return n;
-}
-
-std::optional<int64_t> BufferView::getCollapsedStride(
-    llvm::ArrayRef<int64_t> dims) const {
-  using StrideAndDim = std::pair<int64_t, int64_t>;
-  llvm::SmallVector<StrideAndDim> stridesAndDims;
-  for (auto dim : dims) {
-    if (sizes[dim] != 1) {
-      stridesAndDims.emplace_back(strides[dim], dim);
-    }
-  }
-
-  if (stridesAndDims.empty()) return 0;
-
-  llvm::sort(stridesAndDims);
-  int64_t nextStride = stridesAndDims.front().first;
-  for (auto [stride, dim] : stridesAndDims) {
-    if (stride != nextStride) return std::nullopt;
-    nextStride *= sizes[dim];
-  }
-  return stridesAndDims.front().first;
-}
-
-}  // namespace interpreter
-}  // namespace mlir
diff --git a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/framework/tensor_or_memref.h b/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/framework/tensor_or_memref.h
deleted file mode 100644
index 02fcb4c41cffd8..00000000000000
--- a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/framework/tensor_or_memref.h
+++ /dev/null
@@ -1,363 +0,0 @@
-/* Copyright 2022 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef MLIR_HLO_TOOLS_MLIR_INTERPRETER_FRAMEWORK_TENSOR_OR_MEMREF_H_
-#define MLIR_HLO_TOOLS_MLIR_INTERPRETER_FRAMEWORK_TENSOR_OR_MEMREF_H_
-
-#include <math.h>
-
-#include <cmath>
-#include <complex>
-#include <cstddef>
-#include <cstdint>
-#include <iterator>
-#include <memory>
-#include <optional>
-#include <string>
-#include <type_traits>
-#include <utility>
-
-#include "llvm/ADT/ArrayRef.h"
-#include "llvm/ADT/StringRef.h"
-#include "llvm/Support/raw_ostream.h"
-#include "mlir/IR/Operation.h"
-#include "mlir/Support/LLVM.h"
-#include "mlir/Support/LogicalResult.h"
-
-namespace mlir {
-namespace interpreter {
-
-template <typename T>
-bool isEqual(T a, T b) {
-  return a == b;
-}
-
-// TODO(jreiffers): Replace ifndef with a command line flag.
-#ifndef MLIR_INTERPRETER_COMPARE_DOUBLES_EXACT
-// Compare double precision float with a small tolerance, because complex
-// computations in the interpreted don't always produce the exact same result.
-template <>
-inline bool isEqual(double a, double b) {
-  if (isinf(a) || isinf(b)) {
-    return a == b;
-  }
-
-  return fabs(a - b) < 1e-14;
-}
-
-template <>
-inline bool isEqual(std::complex<double> a, std::complex<double> b) {
-  return isEqual(a.real(), b.real()) && isEqual(a.imag(), b.imag());
-}
-#endif
-
-// Represents a view into a physical buffer.
-struct BufferView {
-  int64_t offset;
-  llvm::SmallVector<int64_t> sizes;    // [10, 11, 12]
-  llvm::SmallVector<int64_t> strides;  // [132, 12, 1]
-  // Number of vector element dimensions in the tensor. nullopt if this is a
-  // vector itself (isVector is set). {0} if this is a tensor of a unit vector.
-  std::optional<int64_t> numVectorDims = std::nullopt;
-  bool isVector = false;
-
-  int64_t rank() const { return sizes.size() - numVectorDims.value_or(0); }
-
-  // Removes the dimension from the view. If you need to keep it, use the
-  // overload below with dimSize = 1.
-  LogicalResult slice(int64_t dimIndex, int64_t dimOffset);
-  LogicalResult slice(int64_t dimIndex, int64_t dimOffset, int64_t dimSize,
-                      int64_t dimStride = 1);
-  LogicalResult subview(ArrayRef<int64_t> subviewoffsets,
-                        ArrayRef<int64_t> subviewsizes,
-                        ArrayRef<int64_t> subviewstrides);
-  int64_t getNumElements(bool includeVectorDims = false) const;
-
-  class LogicalIndexView {
-   public:
-    class Iterator {
-     public:
-      using iterator_category = std::forward_iterator_tag;
-      using value_type = llvm::SmallVector<int64_t>;
-      using difference_type = std::ptrdiff_t;
-      using pointer = llvm::SmallVector<int64_t>*;
-      using reference = llvm::SmallVector<int64_t>&;
-
-      const llvm::SmallVector<int64_t>& operator*() const {
-        return viewIndices;
-      }
-      const llvm::SmallVector<int64_t>* operator->() const {
-        return &viewIndices;
-      }
-
-      Iterator& operator++() {
-        auto indexIt = viewIndices.rbegin();
-        auto sizeIt = view->sizes.rbegin();
-        if (!includeVectorDims) {
-          std::advance(sizeIt, view->numVectorDims.value_or(0));
-        }
-
-        for (auto e = viewIndices.rend(); indexIt != e; ++indexIt, ++sizeIt) {
-          ++*indexIt;
-          if (*indexIt < *sizeIt) {
-            return *this;
-          }
-          *indexIt = 0;
-        }
-
-        viewIndices.clear();
-        viewIndices.push_back(-1);
-        return *this;
-      }
-
-      Iterator operator++(int) {
-        auto tmp = *this;
-        ++(*this);
-        return tmp;
-      }
-
-      bool operator==(const Iterator& other) const {
-        return viewIndices == other.viewIndices;
-      }
-
-      bool operator!=(const Iterator& other) const { return !(*this == other); }
-
-     private:
-      friend class LogicalIndexView;
-
-      Iterator(const BufferView* view, llvm::SmallVector<int64_t> indices,
-               bool includeVectorDims)
-          : view(view),
-            viewIndices(std::move(indices)),
-            includeVectorDims(includeVectorDims) {}
-
-      const BufferView* view;
-      llvm::SmallVector<int64_t> viewIndices;
-      bool includeVectorDims;
-    };
-
-    Iterator begin() const {
-      if (view->getNumElements() == 0) return end();
-      return {view,
-              llvm::SmallVector<int64_t>(
-                  view->rank() +
-                  (includeVectorDims ? view->numVectorDims.value_or(0) : 0)),
-              includeVectorDims};
-    }
-    Iterator end() const { return {view, {-1}, false}; }
-
-   private:
-    friend class BufferView;
-
-    LogicalIndexView(const BufferView* view, bool includeVectorDims)
-        : view(view), includeVectorDims(includeVectorDims) {}
-
-    const BufferView* view;
-    bool includeVectorDims;
-  };
-
-  // Returns nullopt if the index is out of bounds.
-  std::optional<int64_t> getPhysicalIndex(
-      llvm::ArrayRef<int64_t> viewIndices) const;
-  LogicalIndexView indices(bool includeVectorDims = false) const {
-    return LogicalIndexView{this, includeVectorDims};
-  }
-  // Returns the stride resulting from collapsing the given dimensions, if
-  // possible.
-  std::optional<int64_t> getCollapsedStride(llvm::ArrayRef<int64_t> dims) const;
-
-  bool inBounds(llvm::ArrayRef<int64_t> viewIndices) const;
-  static SmallVector<int64_t> getDefaultStrides(ArrayRef<int64_t> sizes);
-  static SmallVector<int64_t> getStridesForLayout(ArrayRef<int64_t> sizes,
-                                                  ArrayRef<int64_t> layout);
-};
-
-// Backing for a TensorOrMemref.
-class Buffer {
- private:
-  struct Dummy {};
-
- public:
-  template <typename T>
-  static std::shared_ptr<Buffer> allocate(size_t size) {
-    return std::make_shared<Buffer>(Dummy{}, size, sizeof(T));
-  }
-
-  char* at(std::optional<int64_t> idx, int64_t elementSize) {
-    auto byteOffset = getByteOffset(idx, elementSize);
-    if (!byteOffset) {
-      return &storage.data()[0];
-    }
-    return &storage.data()[*byteOffset];
-  }
-
-  const char* at(std::optional<int64_t> idx, int64_t elementSize) const {
-    auto byteOffset = getByteOffset(idx, elementSize);
-    if (!byteOffset) {
-      return &storage.data()[0];
-    }
-    return &storage.data()[*byteOffset];
-  }
-
-  Buffer(Dummy, size_t numElements, size_t elementSize)
-      : storage(numElements * elementSize) {}
-
-  int64_t getByteSize() const { return storage.size(); }
-
-  void deallocate(mlir::Operation* op) {
-    if (isAlloca) {
-      setFailure("deallocated stack buffer");
-    } else if (freedBy != nullptr) {
-      std::string failure;
-      llvm::raw_string_ostream os(failure);
-      os << "double-free\n";
-      os << "  Note: allocated by " << *allocatedBy << "\n";
-      os << "  Note: previously freed by " << *freedBy << "\n";
-      setFailure(failure);
-    } else {
-      freedBy = op;
-    }
-  }
-
-  bool deallocated() const { return freedBy != nullptr; }
-  mlir::Operation* freedByOp() const { return freedBy; }
-  void setAllocatedBy(mlir::Operation* allocatedBy) {
-    this->allocatedBy = allocatedBy;
-  }
-
-  void setFailure(llvm::StringRef failure) const {
-    this->failure = failure.str();
-  }
-  llvm::StringRef getFailure() const { return failure; }
-
-  void setIsAlloca() { isAlloca = true; }
-
- private:
-  std::optional<size_t> getByteOffset(std::optional<int64_t> idx,
-                                      int64_t elementSize) const {
-    if (!idx) {
-      setFailure("out of bounds access");
-      return std::nullopt;
-    }
-
-    if (freedBy != nullptr) {
-      std::string failure;
-      llvm::raw_string_ostream os(failure);
-      os << "use-after-free\n";
-      os << "  Note: allocated by " << *allocatedBy << "\n";
-      os << "  Note: previously freed by " << *freedBy << "\n";
-      setFailure(failure);
-      return std::nullopt;
-    }
-
-    return *idx * elementSize;
-  }
-
-  llvm::SmallVector<char> storage;
-  mlir::Operation* freedBy = nullptr;
-  mlir::Operation* allocatedBy = nullptr;
-  bool isAlloca = false;
-  mutable std::string failure;
-};
-
-template <typename T>
-struct TensorOrMemref {
-  using element_type = T;
-
-  static TensorOrMemref<T> empty(ArrayRef<int64_t> sizes,
-                                 ArrayRef<int64_t> layout = {}) {
-    BufferView dummy{0, SmallVector<int64_t>(sizes), {}};
-    return emptyLike(dummy, layout);
-  }
-
-  static TensorOrMemref<T> emptyLike(const BufferView& view,
-                                     ArrayRef<int64_t> layout = {}) {
-    BufferView newView = view;
-    newView.offset = 0;
-    newView.strides = BufferView::getStridesForLayout(view.sizes, layout);
-    return {Buffer::allocate<T>(view.getNumElements(true)), newView};
-  }
-
-  TensorOrMemref<T> clone(ArrayRef<int64_t> layout = {}) const {
-    auto out = emptyLike(view, layout);
-    for (auto [src_index, dst_index] :
-         llvm::zip(view.indices(true), out.view.indices(true))) {
-      out.at(dst_index) = at(src_index);
-    }
-    return out;
-  }
-
-  const T& at(ArrayRef<int64_t> indices) const {
-    return *reinterpret_cast<const T*>(
-        buffer->at(view.getPhysicalIndex(indices), sizeof(T)));
-  }
-
-  T& at(ArrayRef<int64_t> indices) {
-    return *reinterpret_cast<T*>(
-        buffer->at(view.getPhysicalIndex(indices), sizeof(T)));
-  }
-
-  TensorOrMemref vectorAt(ArrayRef<int64_t> indices) const {
-    auto offset = view.getPhysicalIndex(indices);
-    BufferView subview;
-    subview.strides = {view.strides.begin() + view.rank(), view.strides.end()};
-    subview.sizes = {view.sizes.begin() + view.rank(), view.sizes.end()};
-    if (offset) {
-      subview.offset = *offset;
-    } else {
-      buffer->setFailure("out of bounds access");
-    }
-    subview.isVector = true;
-    subview.numVectorDims = std::nullopt;
-    return {buffer, subview};
-  }
-
-  bool operator==(const TensorOrMemref& other) const {
-    if (buffer->deallocated() || other.buffer->deallocated()) return false;
-    if (other.view.sizes != view.sizes) return false;
-    if (other.view.numVectorDims != view.numVectorDims) return false;
-    for (const auto& indices : view.indices(true)) {
-      // Treat NaNs as equal.
-      if constexpr (std::is_floating_point_v<T>) {
-        bool thisnan = isnan(at(indices));
-        bool othernan = isnan(other.at(indices));
-        if (thisnan || othernan) {
-          if (thisnan && othernan) continue;
-          return false;
-        }
-      }
-      if (!isEqual(at(indices), other.at(indices))) return false;
-    }
-    return true;
-  }
-
-  std::shared_ptr<Buffer> buffer;
-  BufferView view;
-};
-
-template <typename T>
-struct is_tensor_or_memref : std::false_type {};  // NOLINT
-
-template <typename T>
-struct is_tensor_or_memref<TensorOrMemref<T>> : std::true_type {};  // NOLINT
-
-template <typename T>
-inline constexpr bool is_tensor_or_memref_v =  // NOLINT
-    is_tensor_or_memref<std::decay_t<T>>::value;
-
-}  // namespace interpreter
-}  // namespace mlir
-
-#endif  // MLIR_HLO_TOOLS_MLIR_INTERPRETER_FRAMEWORK_TENSOR_OR_MEMREF_H_
diff --git a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/framework/tests/BUILD b/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/framework/tests/BUILD
deleted file mode 100644
index 81b0fa9d9faf0b..00000000000000
--- a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/framework/tests/BUILD
+++ /dev/null
@@ -1,24 +0,0 @@
-load("@local_tsl//tsl/platform:rules_cc.bzl", "cc_test")
-
-# copybara:uncomment package(default_applicable_licenses = ["//tensorflow:license"])
-
-cc_test(
-    name = "tensor_or_memref_test",
-    srcs = ["tensor_or_memref_test.cc"],
-    deps = [
-        "//xla/mlir_hlo:mlir_interpreter_framework",
-        "@com_google_absl//absl/strings",
-        "@com_google_googletest//:gtest_main",
-        "@llvm-project//llvm:Support",
-    ],
-)
-
-cc_test(
-    name = "interpreter_value_test",
-    srcs = ["interpreter_value_test.cc"],
-    deps = [
-        "//xla/mlir_hlo:mlir_interpreter_framework",
-        "@com_google_googletest//:gtest_main",
-        "@llvm-project//llvm:Support",
-    ],
-)
diff --git a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/framework/tests/interpreter_value_test.cc b/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/framework/tests/interpreter_value_test.cc
deleted file mode 100644
index 349a7ca4799652..00000000000000
--- a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/framework/tests/interpreter_value_test.cc
+++ /dev/null
@@ -1,235 +0,0 @@
-/* Copyright 2022 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "tools/mlir_interpreter/framework/interpreter_value.h"
-
-#include <variant>
-
-#include "llvm/ADT/ArrayRef.h"
-#include <gmock/gmock.h>
-#include <gtest/gtest.h>
-
-namespace mlir {
-namespace interpreter {
-namespace {
-
-using ::testing::ElementsAre;
-using ::testing::IsEmpty;
-
-TEST(InterpreterValueTest, FillUnitTensor) {
-  auto t = TensorOrMemref<int64_t>::empty({});
-  t.at({}) = 42;
-  InterpreterValue v{t};
-  v.fill([](llvm::ArrayRef<int64_t>) { return InterpreterValue{int64_t{43}}; });
-  ASSERT_EQ(t.at({}), 43);
-}
-
-TEST(InterpreterValueTest, Fill1DTensor) {
-  auto t = TensorOrMemref<int64_t>::empty({3});
-  InterpreterValue v{t};
-  v.fill([](llvm::ArrayRef<int64_t> indices) {
-    return InterpreterValue{indices[0]};
-  });
-  ASSERT_EQ(t.at(0), 0);
-  ASSERT_EQ(t.at(1), 1);
-  ASSERT_EQ(t.at(2), 2);
-}
-
-TEST(InterpreterValueTest, FillTensorOfVector) {
-  auto t = TensorOrMemref<int64_t>::empty({4, 2});
-  t.view.numVectorDims = 1;
-
-  InterpreterValue v{t};
-  v.fill([](llvm::ArrayRef<int64_t> indices) -> InterpreterValue {
-    EXPECT_EQ(indices.size(), 1);
-    auto r = TensorOrMemref<int64_t>::empty({2});
-    r.view.isVector = true;
-    r.at(0) = indices[0];
-    r.at(1) = indices[0] * 10;
-    return {r};
-  });
-  ASSERT_EQ(
-      v.toString(),
-      "TensorOrMemref<4xvector<2xi64>>: [[0, 0], [1, 10], [2, 20], [3, 30]]");
-}
-
-TEST(InterpreterValueTest, FillZeroSizedTensor) {
-  auto t = TensorOrMemref<int64_t>::empty({0, 1});
-  InterpreterValue v{t};
-  bool wasCalled = false;
-  v.fill([&](llvm::ArrayRef<int64_t> indices) {
-    wasCalled = true;
-    return InterpreterValue{indices[0]};
-  });
-  EXPECT_FALSE(wasCalled);
-}
-
-TEST(InterpreterValueTest, TypedAlike) {
-  InterpreterValue v{TensorOrMemref<int32_t>::empty({})};
-  auto typedAlike = v.typedAlike({1, 2, 3});
-  ASSERT_TRUE(
-      std::holds_alternative<TensorOrMemref<int32_t>>(typedAlike.storage));
-  ASSERT_THAT(typedAlike.view().sizes, ElementsAre(1, 2, 3));
-}
-
-TEST(InterpreterValueTest, AsUnitTensor) {
-  InterpreterValue v{42};
-  InterpreterValue wrapped = v.asUnitTensor();
-  ASSERT_THAT(wrapped.view().sizes, IsEmpty());
-  ASSERT_EQ(std::get<TensorOrMemref<int32_t>>(wrapped.storage).at({}), 42);
-}
-
-TEST(InterpreterValueTest, IsTensor) {
-  ASSERT_FALSE(InterpreterValue{42}.isTensor());
-  ASSERT_TRUE(InterpreterValue{TensorOrMemref<int32_t>::empty({})}.isTensor());
-}
-
-TEST(InterpreterValueTest, AsInt) {
-  ASSERT_EQ(InterpreterValue{int64_t{42}}.asInt(), 42);
-  ASSERT_EQ(InterpreterValue{int32_t{42}}.asInt(), 42);
-  ASSERT_EQ(InterpreterValue{int16_t{42}}.asInt(), 42);
-  ASSERT_EQ(InterpreterValue{int8_t{42}}.asInt(), 42);
-  ASSERT_EQ(InterpreterValue{int8_t{-1}}.asInt(), -1);
-}
-
-TEST(InterpreterValueTest, AsUInt) {
-  ASSERT_EQ(InterpreterValue{int16_t{-1}}.asUInt(), 65535);
-  ASSERT_EQ(InterpreterValue{int8_t{-1}}.asUInt(), 255);
-}
-
-TEST(InterpreterValueTest, CloneTensor) {
-  auto tensor = TensorOrMemref<int64_t>::empty({3});
-  tensor.at(0) = 1;
-  tensor.at(1) = 2;
-  tensor.at(2) = 3;
-
-  InterpreterValue wrapped{tensor};
-  auto clone = wrapped.clone();
-  tensor.at(0) = 4;
-
-  auto& clonedTensor = std::get<TensorOrMemref<int64_t>>(clone.storage);
-  ASSERT_EQ(clonedTensor.at(0), 1);
-  ASSERT_EQ(clonedTensor.at(1), 2);
-  ASSERT_EQ(clonedTensor.at(2), 3);
-}
-
-TEST(InterpreterValueTest, CloneWithLayouts) {
-  auto tensor = TensorOrMemref<int64_t>::empty({3, 5}, {0, 1});
-  tensor.at({2, 4}) = 42;
-
-  InterpreterValue wrapped{tensor};
-  auto clone = wrapped.clone();
-  ASSERT_EQ(clone.view().strides,
-            BufferView::getStridesForLayout({3, 5}, {1, 0}));
-  ASSERT_EQ(clone.extractElement({2, 4}).asInt(), 42);
-}
-
-TEST(InterpreterValueTest, CoerceLayoutNoop) {
-  auto tensor = TensorOrMemref<int64_t>::empty({3, 5}, {0, 1});
-  tensor.at({2, 4}) = 42;
-
-  InterpreterValue wrapped{tensor};
-  auto coerced = wrapped.coerceLayout({0, 1});
-  ASSERT_EQ(tensor.buffer,
-            std::get<TensorOrMemref<int64_t>>(coerced.storage).buffer);
-}
-
-TEST(InterpreterValueTest, CoerceLayout) {
-  auto tensor = TensorOrMemref<int64_t>::empty({3, 5});
-  tensor.at({2, 4}) = 42;
-
-  InterpreterValue wrapped{tensor};
-  auto clone = wrapped.coerceLayout({0, 1});
-  ASSERT_EQ(clone.view().strides,
-            BufferView::getStridesForLayout({3, 5}, {0, 1}));
-  ASSERT_EQ(clone.extractElement({2, 4}).asInt(), 42);
-}
-
-TEST(InterpreterValueTest, CoerceLayoutSquare) {
-  auto tensor = TensorOrMemref<float>::empty({2, 2});
-  tensor.at({0, 0}) = 1;
-  tensor.at({0, 1}) = 2;
-  tensor.at({1, 0}) = 3;
-  tensor.at({1, 1}) = 4;
-
-  InterpreterValue wrapped{tensor};
-  auto clone = wrapped.coerceLayout({0, 1});
-  auto& clonedTensor = std::get<TensorOrMemref<float>>(clone.storage);
-
-  EXPECT_EQ(
-      *reinterpret_cast<float*>(clonedTensor.buffer->at(0, sizeof(float))), 1);
-  EXPECT_EQ(
-      *reinterpret_cast<float*>(clonedTensor.buffer->at(1, sizeof(float))), 3);
-  EXPECT_EQ(
-      *reinterpret_cast<float*>(clonedTensor.buffer->at(2, sizeof(float))), 2);
-  EXPECT_EQ(
-      *reinterpret_cast<float*>(clonedTensor.buffer->at(3, sizeof(float))), 4);
-}
-
-TEST(InterpreterValueTest, CloneScalar) {
-  InterpreterValue value{42};
-  auto clone = value.clone();
-  ASSERT_THAT(std::get<int32_t>(clone.storage), 42);
-}
-
-TEST(InterpreterValueTest, ToString) {
-  InterpreterValue value{TensorOrMemref<int64_t>::empty({3})};
-  ASSERT_EQ(value.toString(), "TensorOrMemref<3xi64>: [0, 0, 0]");
-}
-
-TEST(InterpreterValueTest, ToString2d) {
-  InterpreterValue value{TensorOrMemref<int64_t>::empty({3, 2})};
-  ASSERT_EQ(value.toString(),
-            "TensorOrMemref<3x2xi64>: [[0, 0], [0, 0], [0, 0]]");
-}
-
-TEST(InterpreterValueTest, ToString0d) {
-  InterpreterValue value{TensorOrMemref<int64_t>::empty({})};
-  ASSERT_EQ(value.toString(), "TensorOrMemref<i64>: 0");
-}
-
-TEST(InterpreterValueTest, ToStringComplex) {
-  InterpreterValue value{std::complex<float>{}};
-  ASSERT_EQ(value.toString(), "complex<f32>: 0.000000e+00+0.000000e+00i");
-}
-
-TEST(CastTest, UnpackTensor) {
-  InterpreterValue value{TensorOrMemref<int8_t>::empty({1, 1})};
-  value.insertElement({0, 0}, {int8_t{1}});
-  ASSERT_EQ(interpreterValueCast<int64_t>(value), 1);
-  ASSERT_EQ(interpreterValueCast<uint8_t>(value), 1);
-  ASSERT_EQ(interpreterValueCast<float>(value), 1.0f);
-  ASSERT_EQ(interpreterValueCast<double>(value), 1.0);
-
-  InterpreterValue nonUnit{TensorOrMemref<int8_t>::empty({2, 2})};
-  ASSERT_EQ(interpreterValueDynCast<int64_t>(nonUnit), std::nullopt);
-}
-
-TEST(CastTest, IdentityCast) {
-  InterpreterValue value{TensorOrMemref<float>::empty({1, 1})};
-  ASSERT_EQ(interpreterValueCast<InterpreterValue>(value), value);
-}
-
-TEST(CastTest, CastToUnsigned) {
-  // Note: This is different from `AsUint`, which preserves the size of the
-  // original type (i.e. int8_t{-1} results in 255).
-  InterpreterValue value{int8_t{-1}};
-  ASSERT_EQ(interpreterValueCast<uint8_t>(value), 255);
-  ASSERT_EQ(interpreterValueCast<uint16_t>(value), 65535);
-}
-
-}  // namespace
-}  // namespace interpreter
-}  // namespace mlir
diff --git a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/framework/tests/tensor_or_memref_test.cc b/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/framework/tests/tensor_or_memref_test.cc
deleted file mode 100644
index a7187244a14c3a..00000000000000
--- a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/framework/tests/tensor_or_memref_test.cc
+++ /dev/null
@@ -1,104 +0,0 @@
-/* Copyright 2022 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "tools/mlir_interpreter/framework/tensor_or_memref.h"
-
-#include <algorithm>
-#include <limits>
-#include <optional>
-#include <utility>
-
-#include "llvm/ADT/SmallBitVector.h"
-#include "llvm/ADT/SmallVector.h"
-#include <gmock/gmock.h>
-#include <gtest/gtest.h>
-#include "absl/strings/str_join.h"
-
-namespace mlir {
-namespace interpreter {
-namespace {
-
-using ::testing::ElementsAre;
-
-TEST(TensorOrMemrefTest, DefaultStrides) {
-  EXPECT_THAT(BufferView::getDefaultStrides({1, 2, 3}), ElementsAre(6, 3, 1));
-}
-
-TEST(TensorOrMemrefTest, StridesForLayout) {
-  EXPECT_THAT(BufferView::getStridesForLayout({1, 2, 3}, {2, 1, 0}),
-              ElementsAre(6, 3, 1));
-  EXPECT_THAT(BufferView::getStridesForLayout({1, 2, 3}, {0, 1, 2}),
-              ElementsAre(1, 1, 2));
-  EXPECT_THAT(BufferView::getStridesForLayout({3, 3, 3, 3}, {3, 0, 1, 2}),
-              ElementsAre(27, 1, 3, 9));
-}
-
-std::optional<int64_t> getCollapsedStrideNaive(llvm::ArrayRef<int64_t> dims,
-                                               const BufferView& view) {
-  BufferView f;
-  for (int64_t dim : dims) {
-    f.sizes.push_back(view.sizes[dim]);
-  }
-
-  // Find all physical indices for the dimensions.
-  llvm::SmallBitVector v(view.getNumElements());
-  for (const auto& indices : f.indices()) {
-    SmallVector<int64_t> viewIndices(view.rank());
-    for (auto [dim, index] : llvm::zip(dims, indices)) {
-      viewIndices[dim] = index;
-    }
-    v[*view.getPhysicalIndex(viewIndices)] = true;
-  }
-
-  if (v.count() != f.getNumElements()) return std::nullopt;
-  if (f.getNumElements() <= 1) return 0;
-
-  // Check that they have a common stride.
-  int64_t min = v.find_first();
-  int64_t expectedStride = (v.find_last() - min) / (f.getNumElements() - 1);
-  for (int64_t i = 0; i < f.getNumElements(); ++i) {
-    if (!v[i * expectedStride + min]) return std::nullopt;
-  }
-
-  return expectedStride;
-}
-
-TEST(TensorOrMemrefTest, CollapsedStride) {
-  BufferView view{.sizes = {1, 2, 3, 1, 5},
-                  .strides = BufferView::getDefaultStrides({1, 2, 3, 1, 5})};
-
-  auto checkAll = [&]() {
-    for (int64_t i = 0; i < (1 << view.rank()); ++i) {
-      SmallVector<int64_t> dims;
-      for (int64_t dim = 0; dim < view.rank(); ++dim) {
-        if (i & (1 << dim)) dims.push_back(dim);
-      }
-
-      do {
-        auto v = view.getCollapsedStride(dims);
-        auto n = getCollapsedStrideNaive(dims, view);
-        EXPECT_EQ(n, v) << "checking " << absl::StrJoin(dims, ", ");
-      } while (std::next_permutation(dims.begin(), dims.end()));
-    }
-  };
-
-  checkAll();
-  ASSERT_TRUE(view.slice(3, 0).succeeded());
-  checkAll();
-}
-
-}  // namespace
-}  // namespace interpreter
-}  // namespace mlir
diff --git a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/mlir-interpreter-runner.cc b/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/mlir-interpreter-runner.cc
deleted file mode 100644
index 7c8475d4bb644b..00000000000000
--- a/third_party/xla/xla/mlir_hlo/tools/mlir_interpreter/mlir-interpreter-runner.cc
+++ /dev/null
@@ -1,142 +0,0 @@
-/* Copyright 2022 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include <memory>
-#include <string>
-#include <utility>
-
-#include "deallocation/IR/deallocation_ops.h"
-#include "gml_st/IR/gml_st_ops.h"
-#include "lhlo/IR/lhlo_ops.h"
-#include "lhlo_gpu/IR/lhlo_gpu_ops.h"
-#include "llvm/ADT/StringRef.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/SourceMgr.h"
-#include "mhlo/IR/register.h"
-#include "mhlo/transforms/passes.h"
-#include "mlir/IR/BuiltinOps.h"
-#include "mlir/IR/DialectRegistry.h"
-#include "mlir/IR/MLIRContext.h"
-#include "mlir/IR/Operation.h"
-#include "mlir/IR/OwningOpRef.h"
-#include "mlir/InitAllDialects.h"
-#include "mlir/InitAllPasses.h"
-#include "mlir/Support/FileUtilities.h"
-#include "mlir/Support/LogicalResult.h"
-#include "mlir/Tools/ParseUtilities.h"
-#include "thlo/IR/thlo_ops.h"
-#include "tools/mlir_interpreter/framework/interpreter.h"
-
-struct Options {
-  llvm::cl::opt<std::string> inputFilename{llvm::cl::Positional,
-                                           llvm::cl::desc("<input file>"),
-                                           llvm::cl::init("-")};
-  llvm::cl::opt<bool> runAllFunctions{
-      "run-all", llvm::cl::desc("Run all functions in the module"),
-      llvm::cl::init(false)};
-};
-
-static mlir::OwningOpRef<mlir::Operation *> parseMlirInput(
-    llvm::StringRef inputFilename, mlir::MLIRContext *context) {
-  std::string errorMessage;
-  auto file = mlir::openInputFile(inputFilename, &errorMessage);
-  if (!file) {
-    llvm::errs() << errorMessage << "\n";
-    return {};
-  }
-
-  auto sourceMgr = std::make_shared<llvm::SourceMgr>();
-  sourceMgr->AddNewSourceBuffer(std::move(file), mlir::SMLoc());
-  return mlir::parseSourceFileForTool(sourceMgr, context,
-                                      /*insertImplicitModule=*/true);
-}
-
-static mlir::LogicalResult run(mlir::ModuleOp module,
-                               mlir::func::FuncOp function) {
-  llvm::outs() << "@" << function.getName().str() << "()\n";
-  if (function.getBody().getBlocks().front().getNumArguments() > 0) {
-    llvm::errs() << "Function arguments are not supported.";
-    return mlir::failure();
-  }
-
-  mlir::SymbolTable symbolTable{module};
-  auto results =
-      mlir::interpreter::runInterpreter(symbolTable, function, {}, {});
-  if (!mlir::succeeded(results)) {
-    llvm::errs() << "Interpreter failed\n";
-    return mlir::failure();
-  }
-
-  if (!results->empty()) {
-    llvm::outs() << "Results:\n";
-    for (const auto &result : *results) {
-      llvm::outs() << result.toString() << "\n";
-    }
-  }
-
-  return mlir::success();
-}
-
-int main(int argc, char *argv[]) {
-  // Flush llvm::outs before writing errors.
-  llvm::errs().tie(&llvm::outs());
-
-  Options options;
-  llvm::cl::ParseCommandLineOptions(argc, argv, "MLIR CPU execution driver\n");
-
-  mlir::DialectRegistry registry;
-  mlir::registerAllDialects(registry);
-  mlir::mhlo::registerAllMhloDialects(registry);
-  mlir::mhlo::registerAllMhloPasses();
-  mlir::registerAllPasses();
-  registry.insert<mlir::deallocation::DeallocationDialect,
-                  mlir::lmhlo::LmhloDialect, mlir::lmhlo_gpu::LmhloGpuDialect,
-                  mlir::gml_st::GmlStDialect, mlir::thlo::THLODialect>();
-
-  mlir::MLIRContext context(registry);
-  context.allowUnregisteredDialects();
-  auto parsedInput = parseMlirInput(options.inputFilename, &context);
-  if (!parsedInput) {
-    llvm::errs() << "Failed to parse module.\n";
-    return 1;
-  }
-  auto module = llvm::dyn_cast<mlir::ModuleOp>(**parsedInput);
-  if (!module) {
-    llvm::errs() << "Parsing returned something that's not a module.\n";
-    return 1;
-  }
-
-  if (options.runAllFunctions) {
-    bool allSucceeded = true;
-    module.walk([&](mlir::func::FuncOp function) {
-      if (!function.isPrivate()) {
-        allSucceeded &= run(module, function).succeeded();
-      }
-    });
-    if (!allSucceeded) {
-      return 1;
-    }
-  } else {
-    auto *main = module.lookupSymbol("main");
-    if (!main) {
-      llvm::errs() << "no main function found.\n";
-      return 1;
-    }
-    if (!run(module, llvm::cast<mlir::func::FuncOp>(main)).succeeded()) {
-      return 1;
-    }
-  }
-  return 0;
-}

From 9a704c508833ca68caf272d4f44cbe944d10f8c7 Mon Sep 17 00:00:00 2001
From: Johannes Reifferscheid <jreiffers@google.com>
Date: Tue, 26 Sep 2023 05:16:33 -0700
Subject: [PATCH 264/567] Rename FindFusionParameters -> FindFusionArguments.

Parameters are inside fusions, but this finds instructions outside the fusion, so this name is more accurate and less confusing.

PiperOrigin-RevId: 568505665
---
 .../xla/service/gpu/hlo_fusion_analysis.cc    | 30 +++++++++----------
 .../xla/xla/service/gpu/hlo_fusion_analysis.h |  6 ++--
 .../xla/xla/service/gpu/hlo_traversal.cc      |  2 +-
 .../xla/xla/service/gpu/hlo_traversal.h       |  3 +-
 .../xla/xla/service/gpu/hlo_traversal_test.cc | 18 +++++------
 5 files changed, 29 insertions(+), 30 deletions(-)

diff --git a/third_party/xla/xla/service/gpu/hlo_fusion_analysis.cc b/third_party/xla/xla/service/gpu/hlo_fusion_analysis.cc
index 0b1f6389bd8129..830961ad9894af 100644
--- a/third_party/xla/xla/service/gpu/hlo_fusion_analysis.cc
+++ b/third_party/xla/xla/service/gpu/hlo_fusion_analysis.cc
@@ -266,19 +266,19 @@ StatusOr<HloFusionAnalysis> HloFusionAnalysis::Create(
     heroes.push_back(&FindNonTrivialHero(*root));
   }
 
-  std::vector<const HloInstruction*> fusion_parameter_inputs;
-  FindFusionParameters(hlo_roots, boundary_fn,
-                       [&](const HloInstruction& parameter) {
-                         fusion_parameter_inputs.push_back(&parameter);
-                       });
+  std::vector<const HloInstruction*> fusion_arguments;
+  FindFusionArguments(hlo_roots, boundary_fn,
+                      [&](const HloInstruction& argument) {
+                        fusion_arguments.push_back(&argument);
+                      });
 
   std::optional<TransposeDescription> tiled_transpose_hero =
       FindConsistentTransposeHero(hlo_roots, heroes);
 
-  return HloFusionAnalysis(
-      std::move(backend_config), std::move(hlo_roots), std::move(boundary_fn),
-      std::move(fusion_parameter_inputs), std::move(heroes), device_info,
-      tiled_transpose_hero);
+  return HloFusionAnalysis(std::move(backend_config), std::move(hlo_roots),
+                           std::move(boundary_fn), std::move(fusion_arguments),
+                           std::move(heroes), device_info,
+                           tiled_transpose_hero);
 }
 
 // static
@@ -514,7 +514,7 @@ const Shape& HloFusionAnalysis::GetElementShape() const {
 
 int HloFusionAnalysis::SmallestInputDtypeBits() const {
   int bits = std::numeric_limits<int>::max();
-  for (const HloInstruction* operand : fusion_parameter_inputs_) {
+  for (const HloInstruction* operand : fusion_arguments_) {
     bits = std::min(bits,
                     primitive_util::BitWidth(operand->shape().element_type()));
   }
@@ -698,9 +698,9 @@ bool HloFusionAnalysis::IsUnrollingColumnReductionBeneficial(
         return TraversalResult::kVisitOperands;
       });
 
-  for (auto* param : fusion_parameter_inputs_) {
-    if (!reachable_through_non_elementwise.contains(param) &&
-        ShapeUtil::SameDimensions(input_shape, param->shape())) {
+  for (auto* argument : fusion_arguments_) {
+    if (!reachable_through_non_elementwise.contains(argument) &&
+        ShapeUtil::SameDimensions(input_shape, argument->shape())) {
       ++can_be_vectorized;
     }
   }
@@ -711,8 +711,8 @@ bool HloFusionAnalysis::IsUnrollingColumnReductionBeneficial(
   // unrolled even with such an assumption,  and the accesses to those inputs
   // turn out to be vectorizable, the compiler will still vectorize them.
   int64_t num_elements = ShapeUtil::ElementsIn(input_shape);
-  cannot_be_vectorized += absl::c_count_if(
-      fusion_parameter_inputs_, [&](const HloInstruction* parameter) {
+  cannot_be_vectorized +=
+      absl::c_count_if(fusion_arguments_, [&](const HloInstruction* parameter) {
         return ShapeUtil::ElementsIn(parameter->shape()) > num_elements;
       });
   if (can_be_vectorized < cannot_be_vectorized) {
diff --git a/third_party/xla/xla/service/gpu/hlo_fusion_analysis.h b/third_party/xla/xla/service/gpu/hlo_fusion_analysis.h
index ecaaf35bd225d0..600eae6c801e6b 100644
--- a/third_party/xla/xla/service/gpu/hlo_fusion_analysis.h
+++ b/third_party/xla/xla/service/gpu/hlo_fusion_analysis.h
@@ -88,14 +88,14 @@ class HloFusionAnalysis {
   HloFusionAnalysis(FusionBackendConfig fusion_backend_config,
                     std::vector<const HloInstruction*> fusion_roots,
                     FusionBoundaryFn fusion_boundary_fn,
-                    std::vector<const HloInstruction*> fusion_parameters,
+                    std::vector<const HloInstruction*> fusion_arguments,
                     std::vector<const HloInstruction*> fusion_heroes,
                     const GpuDeviceInfo* device_info,
                     std::optional<TransposeDescription> tiled_transpose)
       : fusion_backend_config_(std::move(fusion_backend_config)),
         fusion_roots_(std::move(fusion_roots)),
         fusion_boundary_fn_(std::move(fusion_boundary_fn)),
-        fusion_parameter_inputs_(std::move(fusion_parameters)),
+        fusion_arguments_(std::move(fusion_arguments)),
         fusion_heroes_(std::move(fusion_heroes)),
         device_info_(device_info),
         tiled_transpose_(tiled_transpose) {}
@@ -123,7 +123,7 @@ class HloFusionAnalysis {
   FusionBoundaryFn fusion_boundary_fn_;
   // The HLO instructions that are inputs into the fusion. These instructions
   // are /outside/ the fusion.
-  std::vector<const HloInstruction*> fusion_parameter_inputs_;
+  std::vector<const HloInstruction*> fusion_arguments_;
   std::vector<const HloInstruction*> fusion_heroes_;
   const GpuDeviceInfo* device_info_;
   std::optional<TransposeDescription> tiled_transpose_;
diff --git a/third_party/xla/xla/service/gpu/hlo_traversal.cc b/third_party/xla/xla/service/gpu/hlo_traversal.cc
index b7b0ca257b639b..5baf613d8c5c8b 100644
--- a/third_party/xla/xla/service/gpu/hlo_traversal.cc
+++ b/third_party/xla/xla/service/gpu/hlo_traversal.cc
@@ -119,7 +119,7 @@ void HloBfsConsumersFirstTraversal(
   }
 }
 
-void FindFusionParameters(
+void FindFusionArguments(
     absl::Span<const HloInstruction* const> roots,
     const FusionBoundaryFn& boundary,
     const std::function<void(const HloInstruction& param)>& visit) {
diff --git a/third_party/xla/xla/service/gpu/hlo_traversal.h b/third_party/xla/xla/service/gpu/hlo_traversal.h
index 9f3e1d12fe5e54..799a427de94fcc 100644
--- a/third_party/xla/xla/service/gpu/hlo_traversal.h
+++ b/third_party/xla/xla/service/gpu/hlo_traversal.h
@@ -70,8 +70,7 @@ const HloInstruction* HloFindIf(
     const std::function<bool(const HloInstruction& node)>& visit);
 
 // Visit the producers of all parameters that are needed by the fusion.
-// TODO(jreiffers): Rename this to FindFusionArguments.
-void FindFusionParameters(
+void FindFusionArguments(
     absl::Span<const HloInstruction* const> roots,
     const FusionBoundaryFn& boundary,
     const std::function<void(const HloInstruction& producer)>& visit);
diff --git a/third_party/xla/xla/service/gpu/hlo_traversal_test.cc b/third_party/xla/xla/service/gpu/hlo_traversal_test.cc
index 6185579fedc886..bcbe89e451fd2e 100644
--- a/third_party/xla/xla/service/gpu/hlo_traversal_test.cc
+++ b/third_party/xla/xla/service/gpu/hlo_traversal_test.cc
@@ -122,10 +122,10 @@ TEST_F(HloTraversalTest, TraversePartialFusion) {
               ElementsAre("reduce.1", "mul", "p0.1", "p1.1", "negate"));
 }
 
-TEST_F(HloTraversalTest, FindParameters) {
+TEST_F(HloTraversalTest, FindArguments) {
   auto module = ParseAndReturnVerifiedModule(kTestModule).value();
   std::vector<std::string> producers;
-  FindFusionParameters(
+  FindFusionArguments(
       {module->GetComputationWithName("fused_computation")->root_instruction()},
       DefaultFusionBoundaryFn, [&](const HloInstruction& producer) {
         producers.emplace_back(producer.name());
@@ -133,12 +133,12 @@ TEST_F(HloTraversalTest, FindParameters) {
   EXPECT_THAT(producers, ElementsAre("p0", "negate"));
 }
 
-TEST_F(HloTraversalTest, FindParametersAfterFusion) {
-  // Verifies that we correctly find the parameters after fusing the negation.
+TEST_F(HloTraversalTest, FindArgumentsAfterFusion) {
+  // Verifies that we correctly find the arguments after fusing the negation.
   auto module = ParseAndReturnVerifiedModule(kTestModule).value();
   std::vector<std::string> producers;
   auto* fused_computation = module->GetComputationWithName("fused_computation");
-  FindFusionParameters(
+  FindFusionArguments(
       {fused_computation->root_instruction()},
       [&](const HloInstruction& producer, const HloInstruction& consumer) {
         return &consumer == fused_computation->parameter_instruction(0) ||
@@ -154,7 +154,7 @@ TEST_F(HloTraversalTest, FuseEverything) {
   auto module = ParseAndReturnVerifiedModule(kTestModule).value();
   std::vector<std::string> producers;
   auto* fused_computation = module->GetComputationWithName("fused_computation");
-  FindFusionParameters(
+  FindFusionArguments(
       {fused_computation->root_instruction()},
       [&](const HloInstruction& producer, const HloInstruction& consumer) {
         return producer.opcode() == HloOpcode::kParameter &&
@@ -254,7 +254,7 @@ TEST_F(HloTraversalTest, FuseFusionConsumer) {
                                   return TraversalResult::kVisitOperands;
                                 });
   std::vector<std::string> params;
-  FindFusionParameters(roots, boundary, [&](const HloInstruction& param) {
+  FindFusionArguments(roots, boundary, [&](const HloInstruction& param) {
     params.emplace_back(param.name());
   });
 
@@ -277,7 +277,7 @@ TEST_F(HloTraversalTest, FuseFusionProducer) {
                                   return TraversalResult::kVisitOperands;
                                 });
   std::vector<std::string> params;
-  FindFusionParameters({consumer}, boundary, [&](const HloInstruction& param) {
+  FindFusionArguments({consumer}, boundary, [&](const HloInstruction& param) {
     params.emplace_back(param.name());
   });
 
@@ -302,7 +302,7 @@ TEST_F(HloTraversalTest, FuseFusionConsumerAndProducer) {
                                   return TraversalResult::kVisitOperands;
                                 });
   std::vector<std::string> params;
-  FindFusionParameters(roots, boundary, [&](const HloInstruction& param) {
+  FindFusionArguments(roots, boundary, [&](const HloInstruction& param) {
     params.emplace_back(param.name());
   });
 

From ff8e8010f77dbe98d5e811693f04e14e7cfe85d3 Mon Sep 17 00:00:00 2001
From: Johannes Reifferscheid <jreiffers@google.com>
Date: Tue, 26 Sep 2023 06:02:35 -0700
Subject: [PATCH 265/567] Remove tracing instrumentation.

It's dead.

PiperOrigin-RevId: 568514448
---
 .../xla/mlir/tools/mlir_replay/public/BUILD   | 34 -----------
 .../mlir/tools/mlir_replay/public/README.md   |  3 -
 .../mlir_replay/public/compiler_trace.proto   | 31 ----------
 .../public/compiler_trace_instrumentation.cc  | 60 -------------------
 .../public/compiler_trace_instrumentation.h   | 48 ---------------
 third_party/xla/xla/service/cpu/BUILD         |  1 -
 .../xla/xla/service/cpu/cpu_compiler.cc       | 16 -----
 7 files changed, 193 deletions(-)
 delete mode 100644 third_party/xla/xla/mlir/tools/mlir_replay/public/BUILD
 delete mode 100644 third_party/xla/xla/mlir/tools/mlir_replay/public/README.md
 delete mode 100644 third_party/xla/xla/mlir/tools/mlir_replay/public/compiler_trace.proto
 delete mode 100644 third_party/xla/xla/mlir/tools/mlir_replay/public/compiler_trace_instrumentation.cc
 delete mode 100644 third_party/xla/xla/mlir/tools/mlir_replay/public/compiler_trace_instrumentation.h

diff --git a/third_party/xla/xla/mlir/tools/mlir_replay/public/BUILD b/third_party/xla/xla/mlir/tools/mlir_replay/public/BUILD
deleted file mode 100644
index efaa3a84e62d1c..00000000000000
--- a/third_party/xla/xla/mlir/tools/mlir_replay/public/BUILD
+++ /dev/null
@@ -1,34 +0,0 @@
-load("@local_tsl//tsl/platform:build_config.bzl", "tf_proto_library")
-load("@local_tsl//tsl/platform:rules_cc.bzl", "cc_library")
-
-package(
-    default_visibility = ["//visibility:public"],
-)
-
-cc_library(
-    name = "compiler_trace_instrumentation",
-    srcs = ["compiler_trace_instrumentation.cc"],
-    hdrs = ["compiler_trace_instrumentation.h"],
-    visibility = ["//visibility:public"],
-    deps = [
-        ":compiler_trace_proto_cc",
-        ":compiler_trace_proto_cc_impl",
-        "//xla/service/llvm_ir:llvm_util",
-        "@com_google_absl//absl/strings:str_format",
-        "@llvm-project//llvm:Support",
-        "@llvm-project//mlir:IR",
-        "@llvm-project//mlir:Pass",
-        "@local_tsl//tsl/platform:env",
-        "@local_tsl//tsl/platform:logging",
-        "@local_tsl//tsl/platform:path",
-        "@local_tsl//tsl/platform:protobuf",
-    ],
-)
-
-tf_proto_library(
-    name = "compiler_trace_proto",
-    srcs = ["compiler_trace.proto"],
-    cc_api_version = 2,
-    make_default_target_header_only = True,
-    visibility = ["//visibility:public"],
-)
diff --git a/third_party/xla/xla/mlir/tools/mlir_replay/public/README.md b/third_party/xla/xla/mlir/tools/mlir_replay/public/README.md
deleted file mode 100644
index 553accf2d0543e..00000000000000
--- a/third_party/xla/xla/mlir/tools/mlir_replay/public/README.md
+++ /dev/null
@@ -1,3 +0,0 @@
-# DEPRECATED: Public API of mlir_replay
-
-Do not use. This is in the process of being removed.
\ No newline at end of file
diff --git a/third_party/xla/xla/mlir/tools/mlir_replay/public/compiler_trace.proto b/third_party/xla/xla/mlir/tools/mlir_replay/public/compiler_trace.proto
deleted file mode 100644
index bf14471620831d..00000000000000
--- a/third_party/xla/xla/mlir/tools/mlir_replay/public/compiler_trace.proto
+++ /dev/null
@@ -1,31 +0,0 @@
-/* Copyright 2022 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-syntax = "proto2";
-
-package mlir.interpreter;
-
-message MlirCompilationTraceEntry {
-  // The name of the pass that was previously executed.
-  optional string after_pass = 1;
-
-  // MLIR module IR of the state after the pass.
-  optional string mlir_module = 2;
-}
-
-message MlirCompilationTrace {
-  // MLIR modules corresponding to each stage of the compilation pipeline.
-  repeated MlirCompilationTraceEntry passes = 1;
-}
\ No newline at end of file
diff --git a/third_party/xla/xla/mlir/tools/mlir_replay/public/compiler_trace_instrumentation.cc b/third_party/xla/xla/mlir/tools/mlir_replay/public/compiler_trace_instrumentation.cc
deleted file mode 100644
index 982a62c2f81374..00000000000000
--- a/third_party/xla/xla/mlir/tools/mlir_replay/public/compiler_trace_instrumentation.cc
+++ /dev/null
@@ -1,60 +0,0 @@
-/* Copyright 2022 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "xla/mlir/tools/mlir_replay/public/compiler_trace_instrumentation.h"
-
-#include <string>
-
-#include "absl/strings/str_format.h"
-#include "mlir/IR/BuiltinOps.h"  // from @llvm-project
-#include "xla/service/llvm_ir/llvm_util.h"
-#include "tsl/platform/env.h"
-#include "tsl/platform/logging.h"
-#include "tsl/platform/path.h"
-
-namespace mlir {
-namespace interpreter {
-
-void MlirCompilerTraceInstrumentation::runAfterPass(Pass* pass, Operation* op) {
-  ModuleOp module = llvm::dyn_cast<ModuleOp>(op);
-  if (!module) {
-    module = op->getParentOfType<mlir::ModuleOp>();
-  }
-  if (!module) {
-    LOG(ERROR) << "Failed to find a ModuleOp: " << pass->getName().str() << ".";
-    return;
-  }
-
-  auto* item = trace_.mutable_passes()->Add();
-  item->set_after_pass(pass->getName().str());
-  *item->mutable_mlir_module() = xla::llvm_ir::DumpToString(module);
-}
-
-MlirCompilerTraceInstrumentation::~MlirCompilerTraceInstrumentation() {
-  if (!trace_.passes().empty()) {
-    std::string filename;
-    absl::StrAppendFormat(&filename, "module_%04d", unique_id_);
-    if (!module_name_.empty()) {
-      absl::StrAppend(&filename, ".", module_name_);
-    }
-    absl::StrAppend(&filename, ".mlir-trace.pb");
-    filename = tsl::io::JoinPath(dirname_, filename);
-    TF_CHECK_OK(tsl::Env::Default()->RecursivelyCreateDir(dirname_));
-    TF_CHECK_OK(tsl::WriteBinaryProto(tsl::Env::Default(), filename, trace_));
-  }
-}
-
-}  // namespace interpreter
-}  // namespace mlir
diff --git a/third_party/xla/xla/mlir/tools/mlir_replay/public/compiler_trace_instrumentation.h b/third_party/xla/xla/mlir/tools/mlir_replay/public/compiler_trace_instrumentation.h
deleted file mode 100644
index a02d7a525392f6..00000000000000
--- a/third_party/xla/xla/mlir/tools/mlir_replay/public/compiler_trace_instrumentation.h
+++ /dev/null
@@ -1,48 +0,0 @@
-/* Copyright 2022 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef XLA_MLIR_TOOLS_MLIR_REPLAY_PUBLIC_COMPILER_TRACE_INSTRUMENTATION_H_
-#define XLA_MLIR_TOOLS_MLIR_REPLAY_PUBLIC_COMPILER_TRACE_INSTRUMENTATION_H_
-
-#include <string>
-
-#include "mlir/Pass/Pass.h"  // from @llvm-project
-#include "mlir/Pass/PassInstrumentation.h"  // from @llvm-project
-#include "xla/mlir/tools/mlir_replay/public/compiler_trace.pb.h"
-
-namespace mlir {
-namespace interpreter {
-
-// Instrumentation that logs the state of the IR after each pass.
-class MlirCompilerTraceInstrumentation : public PassInstrumentation {
- public:
-  explicit MlirCompilerTraceInstrumentation(const std::string& dirname,
-                                            int unique_id,
-                                            const std::string& module_name)
-      : dirname_(dirname), unique_id_(unique_id), module_name_(module_name) {}
-  ~MlirCompilerTraceInstrumentation() override;
-  void runAfterPass(Pass* pass, Operation* op) override;
-
- private:
-  MlirCompilationTrace trace_;
-  std::string dirname_;
-  int unique_id_;
-  std::string module_name_;
-};
-
-}  // namespace interpreter
-}  // namespace mlir
-
-#endif  // XLA_MLIR_TOOLS_MLIR_REPLAY_PUBLIC_COMPILER_TRACE_INSTRUMENTATION_H_
diff --git a/third_party/xla/xla/service/cpu/BUILD b/third_party/xla/xla/service/cpu/BUILD
index f4c9fde056b1c2..95269b75edd022 100644
--- a/third_party/xla/xla/service/cpu/BUILD
+++ b/third_party/xla/xla/service/cpu/BUILD
@@ -243,7 +243,6 @@ cc_library(
         "//xla/mlir/runtime/transforms:compilation_pipeline_cpu",
         "//xla/mlir/runtime/transforms:compiler",
         "//xla/mlir/runtime/transforms:jit_compiler",
-        "//xla/mlir/tools/mlir_replay/public:compiler_trace_instrumentation",
         "//xla/mlir_hlo",
         "//xla/mlir_hlo:all_passes",
         "//xla/mlir_hlo:gml_st_passes",
diff --git a/third_party/xla/xla/service/cpu/cpu_compiler.cc b/third_party/xla/xla/service/cpu/cpu_compiler.cc
index c6c78592825e58..9da505e9d10685 100644
--- a/third_party/xla/xla/service/cpu/cpu_compiler.cc
+++ b/third_party/xla/xla/service/cpu/cpu_compiler.cc
@@ -106,7 +106,6 @@ limitations under the License.
 #include "xla/mlir/runtime/transforms/compilation_pipeline_cpu.h"
 #include "xla/mlir/runtime/transforms/compiler.h"
 #include "xla/mlir/runtime/transforms/jit_compiler.h"
-#include "xla/mlir/tools/mlir_replay/public/compiler_trace_instrumentation.h"
 #include "xla/mlir_hlo/lhlo/IR/lhlo_ops.h"
 #include "xla/mlir_hlo/mhlo/IR/hlo_ops.h"
 #include "xla/mlir_hlo/mhlo/transforms/passes.h"
@@ -427,14 +426,6 @@ runtime::JitExecutable::Options GetXlaRuntimeJitExecutableOptions(
                  << status.message();
     }
     runtime::CreateDefaultXlaCpuRuntimeCompilationPipeline(passes, copts);
-
-    if (DumpingEnabledForHloModule(module) &&
-        module.config().debug_options().xla_dump_hlo_snapshots()) {
-      passes->addInstrumentation(
-          std::make_unique<mlir::interpreter::MlirCompilerTraceInstrumentation>(
-              module.config().debug_options().xla_dump_to(), module.unique_id(),
-              module.name()));
-    }
   };
   opts.compiler.calling_convention = runtime::ResultsToOutsCallingConvention(
       FlattenTuplesAndBufferizeTypeConverter());
@@ -1150,13 +1141,6 @@ Status LowerMLIRModule(HloModule* module, mlir::ModuleOp mlir_module,
         /*printAfterOnlyOnFailure=*/false, llvm::errs(), printing_flags);
   }
 
-  if (DumpingEnabledForHloModule(*module)) {
-    pm.addInstrumentation(
-        std::make_unique<mlir::interpreter::MlirCompilerTraceInstrumentation>(
-            module->config().debug_options().xla_dump_to(), module->unique_id(),
-            module->name()));
-  }
-
   xla::runtime::PassManager xla_pm(&pm);
   HloXlaRuntimePipelineOptions options = GetHloXlaRuntimePipelineOptions(
       target.getTargetTriple(), target.getTargetCPU());

From c45c4c9f39b35d8b64ed0545da66a3042cb67a52 Mon Sep 17 00:00:00 2001
From: Johannes Reifferscheid <jreiffers@google.com>
Date: Tue, 26 Sep 2023 07:34:40 -0700
Subject: [PATCH 266/567] Split initial Triton IR generation from TritonWrapper
 and add a Matmul filecheck test.

I'll add more test cases later.

PiperOrigin-RevId: 568534317
---
 third_party/xla/xla/service/gpu/BUILD         |   6 +-
 .../xla/xla/service/gpu/ir_emitter_triton.cc  | 122 +++++++++-------
 .../xla/xla/service/gpu/ir_emitter_triton.h   |  13 ++
 .../xla/service/gpu/ir_emitter_triton_test.cc | 134 ++++++++++++++++++
 4 files changed, 222 insertions(+), 53 deletions(-)

diff --git a/third_party/xla/xla/service/gpu/BUILD b/third_party/xla/xla/service/gpu/BUILD
index 1c76f62f78b7b1..c8ef34a5592760 100644
--- a/third_party/xla/xla/service/gpu/BUILD
+++ b/third_party/xla/xla/service/gpu/BUILD
@@ -552,11 +552,11 @@ xla_test(
     tags = ["nomac"],
     deps = [
         ":backend_configs_cc",
+        ":gemm_rewriter_triton",
         ":gpu_device_info",
         ":gpu_device_info_for_tests",
         ":ir_emission_utils",
         ":ir_emitter_triton",
-        ":launch_dimensions",
         "//xla:autotuning_proto_cc",
         "//xla:error_spec",
         "//xla:xla_proto_cc",
@@ -566,14 +566,18 @@ xla_test(
         "//xla/service/gpu/tests:gpu_codegen_test",
         "//xla/stream_executor:device_description",
         "//xla/stream_executor/cuda:cublas_plugin",
+        "//xla/tests:filecheck",
         "//xla/tests:hlo_test_base",
         "//xla/tests:verified_hlo_module",
         "//xla/tests:xla_internal_test_main",  # fixdeps: keep
         "@com_google_absl//absl/log:check",
         "@com_google_absl//absl/strings",
         "@com_google_googletest//:gtest",
+        "@llvm-project//llvm:Support",
         "@llvm-project//llvm:ir_headers",
         "@llvm-project//mlir:IR",
+        "@llvm-project//mlir:Pass",
+        "@llvm-project//mlir:Transforms",
         "@local_tsl//tsl/lib/core:status_test_util",
         "@local_tsl//tsl/platform:env",
         "@local_tsl//tsl/platform:errors",
diff --git a/third_party/xla/xla/service/gpu/ir_emitter_triton.cc b/third_party/xla/xla/service/gpu/ir_emitter_triton.cc
index bb257683e203e2..eeba9f284dd45e 100644
--- a/third_party/xla/xla/service/gpu/ir_emitter_triton.cc
+++ b/third_party/xla/xla/service/gpu/ir_emitter_triton.cc
@@ -1604,6 +1604,68 @@ StatusOr<std::unique_ptr<llvm::Module>> TranslateLLVMToLLVMIR(
   return llvmModule;
 }
 
+namespace {
+
+std::string GetLibdevicePath(const HloComputation* hlo_computation) {
+  return nvptx::LibDevicePath(hlo_computation->parent()
+                                  ->config()
+                                  .debug_options()
+                                  .xla_gpu_cuda_data_dir());
+}
+
+}  // namespace
+
+StatusOr<mlir::OwningOpRef<mlir::ModuleOp>> CreateTritonModule(
+    const TritonFusionAnalysis& analysis, absl::string_view fn_name,
+    const HloComputation* hlo_computation, const GpuDeviceInfo& device_info,
+    const AutotuneResult::TritonGemmKey& config, TritonIrEmitter ir_emitter,
+    mlir::MLIRContext& mlir_context) {
+  mlir_context.loadDialect<mt::TritonDialect>();
+  mlir::OpBuilder b(&mlir_context);
+  auto loc = mlir::NameLoc::get(b.getStringAttr(hlo_computation->name()));
+  mlir::OwningOpRef<mlir::ModuleOp> triton_module =
+      llvm_ir::CreateMlirModuleOp(loc);
+  b.setInsertionPointToEnd(triton_module->getBody());
+
+  // Build Triton kernel.
+  SmallVector<Type> fn_arg_types;
+  for (HloInstruction* p : hlo_computation->parameter_instructions()) {
+    fn_arg_types.push_back(mt::PointerType::get(
+        StorageType(b, TritonType(b, p->shape().element_type())),
+        mn::kGlobalMemorySpace));
+  }
+
+  for (const ShapeUtil::IndexedShape& s :
+       ShapeUtil::GetLeafShapes(hlo_computation->root_instruction()->shape())) {
+    fn_arg_types.push_back(mt::PointerType::get(
+        StorageType(b, TritonType(b, s.shape.element_type())),
+        mn::kGlobalMemorySpace));
+  }
+
+  auto fn = b.create<mt::FuncOp>(loc, fn_name,
+                                 b.getFunctionType(fn_arg_types, std::nullopt));
+  for (int i = 0; i < fn.getNumArguments(); ++i) {
+    fn.setArgAttr(i, "tt.divisibility", b.getIntegerAttr(b.getI32Type(), 16));
+  }
+  fn.addEntryBlock();
+  b.setInsertionPointToStart(&fn.front());
+
+  TF_RETURN_IF_ERROR(ir_emitter(b, GetLibdevicePath(hlo_computation), analysis,
+                                hlo_computation, fn, config,
+                                device_info.shared_memory_per_block_optin));
+
+  b.create<mt::ReturnOp>(loc);
+
+  VLOG(6) << llvm_ir::DumpToString(*triton_module);
+  if (DumpingEnabledForHloModule(*hlo_computation->parent())) {
+    DumpToFileInDirOrStdout(*hlo_computation->parent(), "triton_ir", "ttir",
+                            llvm_ir::DumpToString(*triton_module));
+  }
+
+  CHECK(mlir::succeeded(mlir::verify(*triton_module)));
+  return std::move(triton_module);
+}
+
 StatusOr<TritonWrapperResult> TritonWrapper(
     const TritonFusionAnalysis& analysis, absl::string_view fn_name,
     const HloComputation* hlo_computation, absl::string_view fusion_kind,
@@ -1647,59 +1709,14 @@ StatusOr<TritonWrapperResult> TritonWrapper(
     }
   }
 
-  mlir_context.loadDialect<mt::TritonDialect>();
-  mlir::OpBuilder b(&mlir_context);
-  auto loc = mlir::NameLoc::get(b.getStringAttr(hlo_computation->name()));
-  mlir::OwningOpRef<mlir::ModuleOp> triton_module =
-      llvm_ir::CreateMlirModuleOp(loc);
-  b.setInsertionPointToEnd(triton_module->getBody());
+  TF_ASSIGN_OR_RETURN(
+      auto triton_module,
+      CreateTritonModule(analysis, fn_name, hlo_computation, device_info,
+                         config, ir_emitter, mlir_context));
 
   VLOG(3) << hlo_computation->ToString(HloPrintOptions::ShortParsable());
   VLOG(2) << config.ShortDebugString();
 
-  // Build Triton kernel.
-  SmallVector<Type> fn_arg_types;
-  for (HloInstruction* p : hlo_computation->parameter_instructions()) {
-    fn_arg_types.push_back(mt::PointerType::get(
-        StorageType(b, TritonType(b, p->shape().element_type())),
-        mn::kGlobalMemorySpace));
-  }
-
-  for (const ShapeUtil::IndexedShape& s :
-       ShapeUtil::GetLeafShapes(hlo_computation->root_instruction()->shape())) {
-    fn_arg_types.push_back(mt::PointerType::get(
-        StorageType(b, TritonType(b, s.shape.element_type())),
-        mn::kGlobalMemorySpace));
-  }
-
-  auto fn = b.create<mt::FuncOp>(loc, fn_name,
-                                 b.getFunctionType(fn_arg_types, std::nullopt));
-  for (int i = 0; i < fn.getNumArguments(); ++i) {
-    fn.setArgAttr(i, "tt.divisibility", b.getIntegerAttr(b.getI32Type(), 16));
-  }
-  fn.addEntryBlock();
-  b.setInsertionPointToStart(&fn.front());
-
-  const std::string libdevice_path =
-      nvptx::LibDevicePath(hlo_computation->parent()
-                               ->config()
-                               .debug_options()
-                               .xla_gpu_cuda_data_dir());
-
-  TF_RETURN_IF_ERROR(ir_emitter(b, libdevice_path, analysis, hlo_computation,
-                                fn, config,
-                                device_info.shared_memory_per_block_optin));
-
-  b.create<mt::ReturnOp>(loc);
-
-  VLOG(6) << llvm_ir::DumpToString(*triton_module);
-  if (DumpingEnabledForHloModule(*hlo_computation->parent())) {
-    DumpToFileInDirOrStdout(*hlo_computation->parent(), "triton_ir", "ttir",
-                            llvm_ir::DumpToString(*triton_module));
-  }
-
-  CHECK(mlir::succeeded(mlir::verify(*triton_module)));
-
   // Compile Triton kernel to LLVM.
   mlir::PassManager pm(&mlir_context);
 
@@ -1765,9 +1782,10 @@ StatusOr<TritonWrapperResult> TritonWrapper(
     return ResourceExhausted("Shared memory size limit exceeded.");
   }
 
-  TF_ASSIGN_OR_RETURN(std::unique_ptr<llvm::Module> ll_triton_module,
-                      TranslateLLVMToLLVMIR(&llvm_module->getContext(),
-                                            *triton_module, libdevice_path));
+  TF_ASSIGN_OR_RETURN(
+      std::unique_ptr<llvm::Module> ll_triton_module,
+      TranslateLLVMToLLVMIR(&llvm_module->getContext(), *triton_module,
+                            GetLibdevicePath(hlo_computation)));
   LogAndVerify(ll_triton_module.get());
 
   // Integrate LLVM matmul kernel into XLA's LLVM module.
diff --git a/third_party/xla/xla/service/gpu/ir_emitter_triton.h b/third_party/xla/xla/service/gpu/ir_emitter_triton.h
index 8fe834b38e9030..06970d350b24f2 100644
--- a/third_party/xla/xla/service/gpu/ir_emitter_triton.h
+++ b/third_party/xla/xla/service/gpu/ir_emitter_triton.h
@@ -18,8 +18,12 @@ limitations under the License.
 
 #include <functional>
 
+#include "absl/strings/string_view.h"
 #include "llvm/IR/Module.h"
 #include "mlir/IR/Builders.h"  // from @llvm-project
+#include "mlir/IR/BuiltinOps.h"  // from @llvm-project
+#include "mlir/IR/MLIRContext.h"  // from @llvm-project
+#include "mlir/IR/OwningOpRef.h"  // from @llvm-project
 #include "xla/autotuning.pb.h"
 #include "xla/hlo/ir/hlo_computation.h"
 #include "xla/service/gpu/gemm_rewriter_triton.h"
@@ -27,6 +31,7 @@ limitations under the License.
 #include "xla/service/gpu/hlo_traversal.h"
 #include "xla/service/gpu/launch_dimensions.h"
 #include "xla/statusor.h"
+#include "xla/stream_executor/device_description.h"
 #include "triton/Dialect/Triton/IR/Dialect.h"
 
 namespace xla {
@@ -77,6 +82,14 @@ StatusOr<TritonWrapperResult> TritonWrapper(
     const AutotuneResult::TritonGemmKey& config, llvm::Module* llvm_module,
     TritonIrEmitter ir_emitter, mlir::MLIRContext& mlir_context);
 
+// Creates the initial Triton module for the given fusion. Visible for testing
+// only; use TritonWrapper instead.
+StatusOr<mlir::OwningOpRef<mlir::ModuleOp>> CreateTritonModule(
+    const TritonFusionAnalysis& analysis, absl::string_view fn_name,
+    const HloComputation* hlo_computation, const GpuDeviceInfo& device_info,
+    const AutotuneResult::TritonGemmKey& config, TritonIrEmitter ir_emitter,
+    mlir::MLIRContext& mlir_context);
+
 }  // namespace gpu
 }  // namespace xla
 
diff --git a/third_party/xla/xla/service/gpu/ir_emitter_triton_test.cc b/third_party/xla/xla/service/gpu/ir_emitter_triton_test.cc
index 0e20fc806662dc..83f984fa7234bb 100644
--- a/third_party/xla/xla/service/gpu/ir_emitter_triton_test.cc
+++ b/third_party/xla/xla/service/gpu/ir_emitter_triton_test.cc
@@ -25,12 +25,16 @@ limitations under the License.
 #include "absl/log/check.h"
 #include "absl/strings/string_view.h"
 #include "llvm/IR/LLVMContext.h"
+#include "llvm/Support/raw_ostream.h"
 #include "mlir/IR/MLIRContext.h"  // from @llvm-project
+#include "mlir/Pass/PassManager.h"  // from @llvm-project
+#include "mlir/Transforms/Passes.h"  // from @llvm-project
 #include "xla/autotuning.pb.h"
 #include "xla/error_spec.h"
 #include "xla/hlo/ir/hlo_computation.h"
 #include "xla/hlo/ir/hlo_instruction.h"
 #include "xla/service/gpu/backend_configs.pb.h"
+#include "xla/service/gpu/gemm_rewriter_triton.h"
 #include "xla/service/gpu/gpu_device_info.h"
 #include "xla/service/gpu/gpu_device_info_for_tests.h"
 #include "xla/service/gpu/ir_emission_utils.h"
@@ -38,6 +42,7 @@ limitations under the License.
 #include "xla/service/pattern_matcher.h"
 #include "xla/service/pattern_matcher_gmock.h"
 #include "xla/stream_executor/device_description.h"
+#include "xla/tests/filecheck.h"
 #include "xla/tests/hlo_test_base.h"
 #include "xla/tests/verified_hlo_module.h"
 #include "xla/xla.pb.h"
@@ -86,6 +91,135 @@ class TritonGemmNoTF32Test : public TritonGemmTest {
   bool tf32_state_;
 };
 
+class TritonFilecheckTest : public TritonGemmTest {
+ public:
+  StatusOr<bool> CreateTritonIrAndFileCheck(
+      absl::string_view hlo_text, const AutotuneResult::TritonGemmKey& config,
+      TritonIrEmitter emitter, absl::string_view triton_fusion_name,
+      absl::string_view filecheck_pattern);
+};
+
+StatusOr<bool> TritonFilecheckTest::CreateTritonIrAndFileCheck(
+    absl::string_view hlo_text, const AutotuneResult::TritonGemmKey& config,
+    TritonIrEmitter emitter, absl::string_view triton_fusion_name,
+    absl::string_view filecheck_pattern) {
+  TF_ASSIGN_OR_RETURN(std::unique_ptr<VerifiedHloModule> verified_module,
+                      ParseAndReturnVerifiedModule(hlo_text));
+
+  auto* computation =
+      verified_module->GetComputationWithName(triton_fusion_name);
+  TF_ASSIGN_OR_RETURN(auto analysis,
+                      TritonFusionAnalysis::Execute(*computation));
+
+  mlir::MLIRContext context;
+  TF_ASSIGN_OR_RETURN(
+      auto module, CreateTritonModule(analysis, "triton_fn", computation,
+                                      TestGpuDeviceInfo::RTXA6000DeviceInfo(),
+                                      config, emitter, context));
+  mlir::PassManager pm(&context);
+  pm.addPass(mlir::createCanonicalizerPass());
+  TF_RET_CHECK(pm.run(module.get()).succeeded());
+
+  std::string out;
+  llvm::raw_string_ostream os(out);
+  module->print(os);
+  return RunFileCheck(out, filecheck_pattern);
+}
+
+TEST_F(TritonFilecheckTest, TestGemm) {
+  const std::string kHloText = R"(
+HloModule t, is_scheduled=true
+
+triton_gemm_r {
+  parameter_0 = s8[80,115]{1,0} parameter(0)
+  convert.3 = f32[80,115]{1,0} convert(parameter_0)
+  parameter_1 = f32[137,115]{1,0} parameter(1)
+  ROOT r.1 = f32[80,137]{1,0} dot(convert.3, parameter_1),
+    lhs_contracting_dims={1}, rhs_contracting_dims={1}
+}
+
+ENTRY e {
+  p1 = f32[137,115]{1,0} parameter(1)
+  p0 = s8[80,115]{1,0} parameter(0)
+  ROOT triton_gemm_r = f32[80,137]{1,0} fusion(p0, p1), kind=kCustom,
+    calls=triton_gemm_r,
+    backend_config={kind: "__triton_gemm", triton_gemm_config: {"block_m":16,"block_n":64,"block_k":32,"split_k":1,"num_stages":1,"num_warps":2}}
+})";
+  AutotuneResult::TritonGemmKey config;
+  config.set_split_k(1);
+  config.set_block_m(16);
+  config.set_block_k(32);
+  config.set_block_n(64);
+
+  ASSERT_THAT(CreateTritonIrAndFileCheck(kHloText, config, EmitMatMul,
+                                         "triton_gemm_r", R"(
+CHECK:    tt.func @triton_fn(%[[LHS:.*]]: !tt.ptr<i8> {tt.divisibility = 16 : i32}, %[[RHS:.*]]: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %[[OUT:.*]]: !tt.ptr<f32> {tt.divisibility = 16 : i32}) {
+CHECK-DAG:  %[[ZERO_KN:.*]] = arith.constant dense<0.000000e+00> : tensor<32x64xf32>
+CHECK-DAG:  %[[ZERO_MK:.*]] = arith.constant dense<0.000000e+00> : tensor<16x32xf32>
+CHECK-DAG:  %[[ZERO_MN:.*]] = arith.constant dense<0.000000e+00> : tensor<16x64xf32>
+CHECK-DAG:  %[[SIZE_K:.*]] = arith.constant 115 : i32
+CHECK-DAG:  %[[SIZE_M:.*]] = arith.constant 137 : i64
+CHECK-DAG:  %[[C1:.*]] = arith.constant 1 : i64
+CHECK-DAG:  %[[C0:.*]] = arith.constant 0 : i32
+CHECK-DAG:  %[[C80:.*]] = arith.constant 80 : i64
+CHECK-DAG:  %[[TILE_SIZE_K:.*]] = arith.constant 32 : i32
+CHECK-DAG:  %[[TILE_SIZE_N:.*]] = arith.constant 64 : i32
+CHECK-DAG:  %[[TILE_SIZE_M:.*]] = arith.constant 16 : i32
+CHECK-DAG:  %[[NUM_TILES_M:.*]] = arith.constant 5 : i32
+CHECK-DAG:  %[[GROUP_M:.*]] = arith.constant 8 : i32
+CHECK-DAG:  %[[WIDTH:.*]] = arith.constant 24 : i32
+CHECK:      %[[PID_NC:.*]] = tt.get_program_id x
+CHECK:      %[[PID_K:.*]] = tt.get_program_id z
+CHECK:      %[[GROUP_ID:.*]] = arith.divsi %[[PID_NC]], %[[WIDTH]]
+CHECK:      %[[FIRST_PID_M:.*]] = arith.muli %[[GROUP_ID]], %[[GROUP_M]]
+CHECK:      %[[MAX_M:.*]] = arith.subi %[[NUM_TILES_M]], %[[FIRST_PID_M]]
+CHECK:      %[[CMP:.*]] = arith.cmpi slt, %[[MAX_M]], %[[GROUP_M]]
+CHECK:      %[[GROUP_SIZE:.*]] = arith.select %[[CMP]], %[[MAX_M]], %[[GROUP_M]]
+CHECK:      %[[PID_M:.*]] = arith.remsi %[[PID_NC]], %[[GROUP_SIZE]]
+CHECK:      %[[TILE_INDEX_M:.*]] = arith.addi %[[FIRST_PID_M]], %[[PID_M]] : i32
+CHECK:      %[[TILE_OFFSET_M:.*]] = arith.muli %[[TILE_INDEX_M]], %[[TILE_SIZE_M]]
+CHECK:      %[[TMP:.*]] = arith.remsi %[[PID_NC]], %[[WIDTH]] : i32
+CHECK:      %[[TILE_INDEX_N:.*]] = arith.divsi %[[TMP]], %[[GROUP_SIZE]] : i32
+CHECK:      %[[TILE_OFFSET_N:.*]] = arith.muli %[[TILE_INDEX_N]], %[[TILE_SIZE_N]]
+CHECK:      %[[TILE_OFFSET_K:.*]] = arith.muli %[[PID_K]], %[[TILE_SIZE_K]]
+CHECK:      %[[LHS_PTR:.*]] = tt.make_tensor_ptr %[[LHS]]
+CHECK:      %[[LHS_TILE_PTR:.*]] = tt.advance %[[LHS_PTR]], [%[[TILE_OFFSET_M]], %[[TILE_OFFSET_K]]]
+CHECK:      %[[RHS_PTR:.*]] = tt.make_tensor_ptr %[[RHS]]
+CHECK:      %[[RHS_TILE_PTR:.*]] = tt.advance %[[RHS_PTR]], [%[[TILE_OFFSET_K]], %[[TILE_OFFSET_N]]]
+CHECK:        %[[FOR:.*]]:3 = scf.for %[[BLOCK_K:.*]] = %[[C0]] to %[[SIZE_K]] step %[[TILE_SIZE_K]]
+CHECK-SAME:       iter_args(%[[LHS_ITER_PTR:.*]] = %[[LHS_TILE_PTR]], %[[RHS_ITER_PTR:.*]] = %[[RHS_TILE_PTR]], %[[ACC:.*]] = %[[ZERO_MN]])
+CHECK:        %[[LHS_TILE:.*]] = tt.load %[[LHS_ITER_PTR]] {boundaryCheck = array<i32: 1>
+CHECK:        %[[LHS_ITER_PTR_NEXT:.*]] = tt.advance %[[LHS_ITER_PTR]], [%[[C0]], %[[TILE_SIZE_K]]]
+CHECK:        %[[RHS_TILE:.*]] = tt.load %[[RHS_ITER_PTR]] {boundaryCheck = array<i32: 0, 1>
+CHECK:        %[[RHS_ITER_PTR_NEXT:.*]] = tt.advance %[[RHS_ITER_PTR]], [%[[TILE_SIZE_K]], %[[C0]]]
+CHECK:        %[[CONVERTED:.*]] = arith.sitofp %[[LHS_TILE]] : tensor<16x32xi8> to tensor<16x32xf32>
+CHECK:        %[[TILE_K_LIMIT:.*]] = arith.subi %[[SIZE_K]], %[[BLOCK_K]] : i32
+CHECK:        %[[K_OFFSET:.*]] = arith.muli %[[PID_K]], %[[TILE_SIZE_K]] : i32
+CHECK:        %[[K_OFFSET_SPLAT_K:.*]] = tt.splat %[[K_OFFSET]] : (i32) -> tensor<32xi32>
+CHECK:        %[[K_TILE_IOTA:.*]] = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32>
+CHECK:        %[[K_OFFSETS:.*]] = arith.addi %[[K_OFFSET_SPLAT_K]], %[[K_TILE_IOTA]] : tensor<32xi32>
+CHECK:        %[[K_OFFSETS_1K:.*]] = tt.expand_dims %[[K_OFFSETS]] {axis = 0 : i32} : (tensor<32xi32>) -> tensor<1x32xi32>
+CHECK:        %[[TILE_K_LIMIT_1K:.*]] = tt.splat %[[TILE_K_LIMIT]] : (i32) -> tensor<1x32xi32>
+CHECK:        %[[LHS_INBOUNDS_1K:.*]] = arith.cmpi slt, %[[K_OFFSETS_1K]], %[[TILE_K_LIMIT_1K]] : tensor<1x32xi32>
+CHECK:        %[[LHS_INBOUNDS_MK:.*]] = tt.broadcast %[[LHS_INBOUNDS_1K]] : (tensor<1x32xi1>) -> tensor<16x32xi1>
+CHECK:        %[[LHS_MASKED:.*]] = arith.select %[[LHS_INBOUNDS_MK]], %[[CONVERTED]], %[[ZERO_MK]]
+CHECK:        %[[K_OFFSETS_K1:.*]] = tt.expand_dims %[[K_OFFSETS]] {axis = 1 : i32} : (tensor<32xi32>) -> tensor<32x1xi32>
+CHECK:        %[[TILE_K_LIMIT_K1:.*]] = tt.splat %[[TILE_K_LIMIT]] : (i32) -> tensor<32x1xi32>
+CHECK:        %[[RHS_INBOUNDS_K1:.*]] = arith.cmpi slt, %[[K_OFFSETS_K1]], %[[TILE_K_LIMIT_K1]] : tensor<32x1xi32>
+CHECK:        %[[RHS_INBOUNDS_KN:.*]] = tt.broadcast %[[RHS_INBOUNDS_K1]] : (tensor<32x1xi1>) -> tensor<32x64xi1>
+CHECK:        %[[RHS_MASKED:.*]] = arith.select %[[RHS_INBOUNDS_KN]], %[[RHS_TILE]], %[[ZERO_KN]] : tensor<32x64xi1>, tensor<32x64xf32>
+CHECK:        %[[ACC_NEXT:.*]] = tt.dot %[[LHS_MASKED]], %[[RHS_MASKED]], %[[ACC]]
+CHECK:        scf.yield %[[LHS_ITER_PTR_NEXT]], %[[RHS_ITER_PTR_NEXT]], %[[ACC_NEXT]] : !tt.ptr<tensor<16x32xi8>>, !tt.ptr<tensor<32x64xf32>>, tensor<16x64xf32>
+CHECK:      }
+CHECK:      %[[OUT_PTR:.*]] = tt.make_tensor_ptr %[[OUT]], [%[[C80]], %[[SIZE_M]]], [%[[SIZE_M]], %[[C1]]], [%[[C0]], %[[C0]]] {order = array<i32: 1, 0>} : <tensor<16x64xf32>>
+CHECK:      %[[OUT_OFFSET:.*]] = tt.advance %[[OUT_PTR]], [%[[TILE_OFFSET_M]], %[[TILE_OFFSET_N]]] : <tensor<16x64xf32>>
+CHECK:      tt.store %[[OUT_OFFSET]], %[[FOR]]#2 {boundaryCheck = array<i32: 1>, cache = 1 : i32, evict = 1 : i32} : !tt.ptr<tensor<16x64xf32>>, tensor<16x64xf32>
+CHECK:      tt.return
+CHECK:    }
+)"),
+              tsl::testing::IsOkAndHolds(true));
+}
+
 TEST_F(TritonGemmNoTF32Test, DoNotUseTensorCoresForF32) {
   const std::string kHloText = R"(
 HloModule t, is_scheduled=true

From c5e2eff71408e935d82a2c18ebb9c22b5f9db390 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 26 Sep 2023 07:55:09 -0700
Subject: [PATCH 267/567] Update docker image hash in Dockerfile.

PiperOrigin-RevId: 568538855
---
 ....rbe.cuda11.8-cudnn8.6-ubuntu20.04-manylinux2014-multipython | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/tools/ci_build/Dockerfile.rbe.cuda11.8-cudnn8.6-ubuntu20.04-manylinux2014-multipython b/tensorflow/tools/ci_build/Dockerfile.rbe.cuda11.8-cudnn8.6-ubuntu20.04-manylinux2014-multipython
index 460e54add5d558..8fcf2211bcd848 100644
--- a/tensorflow/tools/ci_build/Dockerfile.rbe.cuda11.8-cudnn8.6-ubuntu20.04-manylinux2014-multipython
+++ b/tensorflow/tools/ci_build/Dockerfile.rbe.cuda11.8-cudnn8.6-ubuntu20.04-manylinux2014-multipython
@@ -8,7 +8,7 @@
 #  --tag "gcr.io/tensorflow-testing/nosla-cuda11.8-cudnn8.6-ubuntu20.04-manylinux2014-multipython" .
 # $ docker push gcr.io/tensorflow-testing/nosla-cuda11.8-cudnn8.6-ubuntu20.04-manylinux2014-multipython
 
-FROM gcr.io/tensorflow-sigs/build@sha256:c03809a6b4008b430bf241efce78cdcd92c7bc41d11d0ba57216e97d813ac282
+FROM gcr.io/tensorflow-sigs/build@sha256:edf0324686ffbf488ed0a6b7d3a29ecc6f7b7ad9c04b2e1a527a6a47c0dc1b4b
 
 # Copy and run the install scripts.
 ARG DEBIAN_FRONTEND=noninteractive

From aad923ba91272ffb67f3541a84da2fe21d8abefb Mon Sep 17 00:00:00 2001
From: Sam McCall <sammccall@google.com>
Date: Tue, 26 Sep 2023 08:18:06 -0700
Subject: [PATCH 268/567] Integrate LLVM at llvm/llvm-project@69ba4aed004d

Updates LLVM usage to match
[69ba4aed004d](https://github.com/llvm/llvm-project/commit/69ba4aed004d)

PiperOrigin-RevId: 568544781
---
 .../tests/tf_framework_legalize_to_llvm.mlir        | 10 +++++-----
 .../transforms/tf_framework_legalize_to_llvm.cc     | 10 +++++-----
 third_party/llvm/workspace.bzl                      |  4 ++--
 third_party/triton/cl568240805.patch                | 13 +++++++++++++
 third_party/triton/workspace.bzl                    |  1 +
 .../xla/third_party/triton/cl568240805.patch        | 13 +++++++++++++
 third_party/xla/third_party/triton/workspace.bzl    |  1 +
 .../mlir/runtime/transforms/custom_call_encoding.cc |  4 ++--
 .../mlir/runtime/transforms/tests/rt_to_llvm.mlir   |  2 +-
 .../transforms/convert_deallocation_ops_to_llvm.cc  |  2 +-
 .../convert_deallocation_ops_to_llvm.mlir           |  2 +-
 11 files changed, 45 insertions(+), 17 deletions(-)
 create mode 100644 third_party/triton/cl568240805.patch
 create mode 100644 third_party/xla/third_party/triton/cl568240805.patch

diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/tests/tf_framework_legalize_to_llvm.mlir b/tensorflow/compiler/mlir/tools/kernel_gen/tests/tf_framework_legalize_to_llvm.mlir
index 1bc9a38515a7e0..ed65470eb72728 100644
--- a/tensorflow/compiler/mlir/tools/kernel_gen/tests/tf_framework_legalize_to_llvm.mlir
+++ b/tensorflow/compiler/mlir/tools/kernel_gen/tests/tf_framework_legalize_to_llvm.mlir
@@ -19,7 +19,7 @@ func.func @alloc(%ctx: !tf_framework.op_kernel_context,
 // CHECK: [[NUM_ELEMS:%.*]] = llvm.mul [[NUM_ELEM_0]], [[SIZE_2]] : i64
 
 // Compute the size of an individual element.
-// CHECK: [[NULL:%.*]] = llvm.mlir.null : !llvm.ptr
+// CHECK: [[NULL:%.*]] = llvm.mlir.zero : !llvm.ptr
 // CHECK: [[GEP:%.*]] = llvm.getelementptr [[NULL]]{{\[}}1]
 // CHECK-SAME:            (!llvm.ptr) -> !llvm.ptr, f32
 // CHECK: [[SIZE_OF_FLOAT:%.*]] = llvm.ptrtoint [[GEP]]
@@ -28,7 +28,7 @@ func.func @alloc(%ctx: !tf_framework.op_kernel_context,
 // Compute output index (-1) and candidate indices (0, NULL).
 // CHECK: [[OUTPUT_INDEX:%.*]] = llvm.mlir.constant(-1 : i32) : i32
 // CHECK-NEXT: [[NUM_CANDIDATES:%.*]] = llvm.mlir.constant(0 : i32) : i32
-// CHECK-NEXT: [[CANDIDATES_PTR:%.*]] = llvm.mlir.null : !llvm.ptr
+// CHECK-NEXT: [[CANDIDATES_PTR:%.*]] = llvm.mlir.zero : !llvm.ptr
 
 // Allocate memory.
 // CHECK: [[BYTES_PTR:%.*]] = llvm.call @{{.*}}([[TF_CTX]], [[NUM_ELEMS]],
@@ -115,8 +115,8 @@ func.func @ranked_null_memref() {
 // CHECK-NEXT: %[[C2:.*]] = llvm.mlir.constant(2 : index) : i64
 // CHECK-NEXT: %[[C1_:.*]] = llvm.mlir.constant(1 : index) : i64
 
-// CHECK: llvm.mlir.null
-// CHECK: %[[NULL:.*]] = llvm.mlir.null : !llvm.ptr
+// CHECK: llvm.mlir.zero
+// CHECK: %[[NULL:.*]] = llvm.mlir.zero : !llvm.ptr
 // CHECK-NEXT: %[[DESC_0:.*]] = llvm.mlir.undef :
 // CHECK-SAME:   !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
 // CHECK-NEXT: %[[DESC_1:.*]] = llvm.insertvalue %[[NULL]], %[[DESC_0]][0]
@@ -143,7 +143,7 @@ func.func @is_valid_memref(%buf: memref<?xf32>) -> i1 {
 // CHECK-NEXT: %[[IS_EMPTY_:.*]] =  llvm.or %[[IS_EMPTY]], %[[IS_ZERO]] : i1
 
 // CHECK-NEXT: %[[PTR_F32:.*]] = llvm.extractvalue %[[MEMREF]][0]
-// CHECK-NEXT: %[[NULL_PTR:.*]] = llvm.mlir.null : !llvm.ptr
+// CHECK-NEXT: %[[NULL_PTR:.*]] = llvm.mlir.zero : !llvm.ptr
 // CHECK-NEXT: %[[NOT_NULL:.*]] = llvm.icmp "ne" %[[PTR_F32]], %[[NULL_PTR]]
 
 // CHECK-NEXT: %[[PRED:.*]] = llvm.or %[[NOT_NULL]], %[[IS_EMPTY_]]  : i1
diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/tf_framework_legalize_to_llvm.cc b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/tf_framework_legalize_to_llvm.cc
index 8653521aa09424..fe649e1edeb723 100644
--- a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/tf_framework_legalize_to_llvm.cc
+++ b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/tf_framework_legalize_to_llvm.cc
@@ -71,7 +71,7 @@ class ConvertToLLVMCallOpPattern : public ConvertOpToLLVMPattern<OpTy> {
     if (!attr.has_value() || attr.value().empty()) {
       Value zero = rewriter->create<LLVM::ConstantOp>(
           loc, size_ty, rewriter->getIntegerAttr(size_ty, 0));
-      Value null_ptr = rewriter->create<LLVM::NullOp>(loc, ptr_ty);
+      Value null_ptr = rewriter->create<LLVM::ZeroOp>(loc, ptr_ty);
       return std::make_pair(zero, null_ptr);
     }
 
@@ -451,7 +451,7 @@ class NullContextOpConverter : public ConvertOpToLLVMPattern<NullContextOp> {
   LogicalResult matchAndRewrite(
       NullContextOp op, OpAdaptor /*adaptor*/,
       ConversionPatternRewriter &rewriter) const override {
-    rewriter.replaceOpWithNewOp<LLVM::NullOp>(op, getVoidPtrType());
+    rewriter.replaceOpWithNewOp<LLVM::ZeroOp>(op, getVoidPtrType());
     return success();
   }
 };
@@ -489,7 +489,7 @@ class NullMemRefOpConverter : public ConvertOpToLLVMPattern<NullMemRefOp> {
 
       // Prepare packed args [allocatedPtr, alignedPtr, offset, sizes, strides]
       // to create a memref descriptor.
-      Value null = rewriter.create<LLVM::NullOp>(loc, llvm_ptr_type);
+      Value null = rewriter.create<LLVM::ZeroOp>(loc, llvm_ptr_type);
       SmallVector<Value, 12> packed_values{null, null, zero};
       packed_values.append(sizes);
       packed_values.append(strides);
@@ -524,7 +524,7 @@ class NullMemRefOpConverter : public ConvertOpToLLVMPattern<NullMemRefOp> {
         sizes.front());
 
     // Populate underlying ranked descriptor.
-    Value null = rewriter.create<LLVM::NullOp>(loc, llvm_ptr_type);
+    Value null = rewriter.create<LLVM::ZeroOp>(loc, llvm_ptr_type);
     UnrankedMemRefDescriptor::setAllocatedPtr(
         rewriter, loc, underlying_desc_ptr, llvm_ptr_type, null);
     UnrankedMemRefDescriptor::setAlignedPtr(rewriter, loc, *getTypeConverter(),
@@ -565,7 +565,7 @@ class IsValidMemRefOpConverter
     }
 
     Value ptr = desc.allocatedPtr(rewriter, loc);
-    Value null = rewriter.create<LLVM::NullOp>(loc, getVoidPtrType());
+    Value null = rewriter.create<LLVM::ZeroOp>(loc, getVoidPtrType());
     Value is_not_nullptr = rewriter.create<LLVM::ICmpOp>(
         loc, rewriter.getI1Type(), LLVM::ICmpPredicate::ne, ptr, null);
 
diff --git a/third_party/llvm/workspace.bzl b/third_party/llvm/workspace.bzl
index 59331ea1df5291..7879bbfdad7a0e 100644
--- a/third_party/llvm/workspace.bzl
+++ b/third_party/llvm/workspace.bzl
@@ -4,8 +4,8 @@ load("//third_party:repo.bzl", "tf_http_archive")
 
 def repo(name):
     """Imports LLVM."""
-    LLVM_COMMIT = "f7bf99fb529f9993548bf6fac0b5523c791f2416"
-    LLVM_SHA256 = "567656a6da8e3608a81859dcc57e4279ae468936769177d66a775d6800003434"
+    LLVM_COMMIT = "69ba4aed004dd96dcf254e1ee19bd5d3875c0d09"
+    LLVM_SHA256 = "bcf5d0f7b9a8d49cbb9f296a02d87be47cc64bbbd3aa45b6aa356c5adcd85e01"
 
     tf_http_archive(
         name = name,
diff --git a/third_party/triton/cl568240805.patch b/third_party/triton/cl568240805.patch
new file mode 100644
index 00000000000000..1ccf442840e111
--- /dev/null
+++ b/third_party/triton/cl568240805.patch
@@ -0,0 +1,13 @@
+==== triton/lib/Conversion/TritonGPUToLLVM/Utility.h#5 - /google/src/cloud/aliia/mlir_85175edd4efee6995b39b8a06f3f946003005211_1695659054/triton/lib/Conversion/TritonGPUToLLVM/Utility.h ====
+# action=edit type=text
+--- triton/lib/Conversion/TritonGPUToLLVM/Utility.h	2023-07-07 10:10:42.000000000 -0700
++++ triton/lib/Conversion/TritonGPUToLLVM/Utility.h	2023-09-25 09:28:13.000000000 -0700
+@@ -76,7 +76,7 @@
+ #define address_of(...) rewriter.create<LLVM::AddressOfOp>(loc, __VA_ARGS__)
+ #define barrier() rewriter.create<mlir::gpu::BarrierOp>(loc)
+ #define undef(...) rewriter.create<LLVM::UndefOp>(loc, __VA_ARGS__)
+-#define null(...) rewriter.create<LLVM::NullOp>(loc, __VA_ARGS__)
++#define null(...) rewriter.create<LLVM::ZeroOp>(loc, __VA_ARGS__)
+ #define call(...) rewriter.create<LLVM::CallOp>(loc, __VA_ARGS__)
+ 
+ // Types
diff --git a/third_party/triton/workspace.bzl b/third_party/triton/workspace.bzl
index 29ac349131cde9..c12e2ae5b8f6d9 100644
--- a/third_party/triton/workspace.bzl
+++ b/third_party/triton/workspace.bzl
@@ -25,5 +25,6 @@ def repo():
             "//third_party/triton:cl565616678.patch",
             "//third_party/triton:cl565664892.patch",
             "//third_party/triton:cl566223642.patch",
+            "//third_party/triton:cl568240805.patch",
         ],
     )
diff --git a/third_party/xla/third_party/triton/cl568240805.patch b/third_party/xla/third_party/triton/cl568240805.patch
new file mode 100644
index 00000000000000..1ccf442840e111
--- /dev/null
+++ b/third_party/xla/third_party/triton/cl568240805.patch
@@ -0,0 +1,13 @@
+==== triton/lib/Conversion/TritonGPUToLLVM/Utility.h#5 - /google/src/cloud/aliia/mlir_85175edd4efee6995b39b8a06f3f946003005211_1695659054/triton/lib/Conversion/TritonGPUToLLVM/Utility.h ====
+# action=edit type=text
+--- triton/lib/Conversion/TritonGPUToLLVM/Utility.h	2023-07-07 10:10:42.000000000 -0700
++++ triton/lib/Conversion/TritonGPUToLLVM/Utility.h	2023-09-25 09:28:13.000000000 -0700
+@@ -76,7 +76,7 @@
+ #define address_of(...) rewriter.create<LLVM::AddressOfOp>(loc, __VA_ARGS__)
+ #define barrier() rewriter.create<mlir::gpu::BarrierOp>(loc)
+ #define undef(...) rewriter.create<LLVM::UndefOp>(loc, __VA_ARGS__)
+-#define null(...) rewriter.create<LLVM::NullOp>(loc, __VA_ARGS__)
++#define null(...) rewriter.create<LLVM::ZeroOp>(loc, __VA_ARGS__)
+ #define call(...) rewriter.create<LLVM::CallOp>(loc, __VA_ARGS__)
+ 
+ // Types
diff --git a/third_party/xla/third_party/triton/workspace.bzl b/third_party/xla/third_party/triton/workspace.bzl
index 29ac349131cde9..c12e2ae5b8f6d9 100644
--- a/third_party/xla/third_party/triton/workspace.bzl
+++ b/third_party/xla/third_party/triton/workspace.bzl
@@ -25,5 +25,6 @@ def repo():
             "//third_party/triton:cl565616678.patch",
             "//third_party/triton:cl565664892.patch",
             "//third_party/triton:cl566223642.patch",
+            "//third_party/triton:cl568240805.patch",
         ],
     )
diff --git a/third_party/xla/xla/mlir/runtime/transforms/custom_call_encoding.cc b/third_party/xla/xla/mlir/runtime/transforms/custom_call_encoding.cc
index e8b229829aaa35..0a6ae2ee70588a 100644
--- a/third_party/xla/xla/mlir/runtime/transforms/custom_call_encoding.cc
+++ b/third_party/xla/xla/mlir/runtime/transforms/custom_call_encoding.cc
@@ -391,7 +391,7 @@ static LLVM::GlobalOp EncodeEmptyArrayAttribute(Globals &g,
   auto init = [&](ImplicitLocOpBuilder &ib, Attribute) {
     // Array size and the pointer to data.
     Value num_elements = ib.create<ConstantOp>(b.getI64IntegerAttr(0));
-    Value data = ib.create<LLVM::NullOp>(ptr);
+    Value data = ib.create<LLVM::ZeroOp>(ptr);
 
     // Store size and values into the struct.
     Value encoded = ib.create<LLVM::UndefOp>(type);
@@ -999,7 +999,7 @@ FailureOr<LLVM::GlobalOp> EncodeAttributes(
       if (encoded.value) {
         insert_value(Globals::AddrOf(b, encoded.value), offset + 2);
       } else {
-        insert_value(b.create<LLVM::NullOp>(ptr), offset + 2);
+        insert_value(b.create<LLVM::ZeroOp>(ptr), offset + 2);
       }
     }
 
diff --git a/third_party/xla/xla/mlir/runtime/transforms/tests/rt_to_llvm.mlir b/third_party/xla/xla/mlir/runtime/transforms/tests/rt_to_llvm.mlir
index e6b45a41450ea5..c8202e8378a6de 100644
--- a/third_party/xla/xla/mlir/runtime/transforms/tests/rt_to_llvm.mlir
+++ b/third_party/xla/xla/mlir/runtime/transforms/tests/rt_to_llvm.mlir
@@ -133,7 +133,7 @@ func.func @custom_call(%arg0: !rt.execution_context) {
 // CHECK: global internal constant @__rt_attr_value()
 // CHECK-SAME: !llvm.struct<(i64, ptr)> {
 // CHECK:    arith.constant 0 : i64
-// CHECK:    llvm.mlir.null : !llvm.ptr
+// CHECK:    llvm.mlir.zero : !llvm.ptr
 // CHECK:    llvm.mlir.undef : !llvm.struct<(i64, ptr)>
 // CHECK:    llvm.insertvalue
 // CHECK:    llvm.insertvalue
diff --git a/third_party/xla/xla/mlir_hlo/deallocation/transforms/convert_deallocation_ops_to_llvm.cc b/third_party/xla/xla/mlir_hlo/deallocation/transforms/convert_deallocation_ops_to_llvm.cc
index cda6d2c1d2485e..ca854374e16949 100644
--- a/third_party/xla/xla/mlir_hlo/deallocation/transforms/convert_deallocation_ops_to_llvm.cc
+++ b/third_party/xla/xla/mlir_hlo/deallocation/transforms/convert_deallocation_ops_to_llvm.cc
@@ -41,7 +41,7 @@ struct NullOpLowering : public ConvertOpToLLVMPattern<NullOp> {
   LogicalResult matchAndRewrite(
       NullOp nullOp, OpAdaptor,
       ConversionPatternRewriter& rewriter) const override {
-    rewriter.replaceOpWithNewOp<LLVM::NullOp>(
+    rewriter.replaceOpWithNewOp<LLVM::ZeroOp>(
         nullOp, LLVM::LLVMPointerType::get(rewriter.getContext(), 0));
     return success();
   }
diff --git a/third_party/xla/xla/mlir_hlo/tests/Dialect/deallocation/convert_deallocation_ops_to_llvm.mlir b/third_party/xla/xla/mlir_hlo/tests/Dialect/deallocation/convert_deallocation_ops_to_llvm.mlir
index 868ee0319ad650..121b9f346cd67f 100644
--- a/third_party/xla/xla/mlir_hlo/tests/Dialect/deallocation/convert_deallocation_ops_to_llvm.mlir
+++ b/third_party/xla/xla/mlir_hlo/tests/Dialect/deallocation/convert_deallocation_ops_to_llvm.mlir
@@ -6,7 +6,7 @@ func.func @null() -> !deallocation.ownership {
   %null = deallocation.null
   func.return %null : !deallocation.ownership
 }
-// CHECK: %[[NULL:.*]] = llvm.mlir.null : !llvm.ptr
+// CHECK: %[[NULL:.*]] = llvm.mlir.zero : !llvm.ptr
 // CHECK: %[[RET:.*]] = builtin.unrealized_conversion_cast %[[NULL]]
 // CHECK: return %[[RET]]
 

From 7cf4309dea6c3a6dc6cd7b416dc9e5c20b6a8fe7 Mon Sep 17 00:00:00 2001
From: Austin Anderson <angerson@google.com>
Date: Tue, 26 Sep 2023 09:12:27 -0700
Subject: [PATCH 269/567] Add bisect & any scripts, allow Docker rebuilds,
 integrate bazelrcs

Documentation for these will come soon.

PiperOrigin-RevId: 568558257
---
 .bazelrc                                      |  63 +++++++-
 ci/official/README.md                         |   8 +-
 ci/official/any.sh                            |  59 +++-----
 ci/official/bazelrcs/cpu.bazelrc              | 105 --------------
 ci/official/bazelrcs/cpu_gcc.bazelrc          |  91 ------------
 ci/official/bazelrcs/cuda.bazelrc             | 137 ------------------
 ci/official/bisect.sh                         |  40 +++++
 ci/official/envs/ci_default                   |   4 +
 ci/official/envs/ci_nightly_uploads           |  17 +--
 .../envs/continuous_linux_x86_cpu_py310       |   3 +-
 .../envs/continuous_linux_x86_cpu_py311       |   3 +-
 .../envs/continuous_linux_x86_cpu_py39        |   3 +-
 .../envs/continuous_linux_x86_cuda_py310      |   3 +-
 .../envs/continuous_linux_x86_cuda_py311      |   3 +-
 .../envs/continuous_linux_x86_cuda_py39       |   3 +-
 ci/official/envs/disable_all_uploads          |   7 +
 .../envs/nightly_libtensorflow_linux_x86_cpu  |   4 +-
 .../envs/nightly_libtensorflow_linux_x86_cuda |   4 +-
 ci/official/envs/nightly_linux_x86_cpu_py310  |   5 +-
 ci/official/envs/nightly_linux_x86_cpu_py311  |   5 +-
 ci/official/envs/nightly_linux_x86_cpu_py39   |   5 +-
 ci/official/envs/nightly_linux_x86_cuda_py310 |   5 +-
 ci/official/envs/nightly_linux_x86_cuda_py311 |   5 +-
 ci/official/envs/nightly_linux_x86_cuda_py39  |   5 +-
 ci/official/envs/nightly_linux_x86_tpu_py310  |   5 +-
 ci/official/envs/nightly_linux_x86_tpu_py311  |   3 +-
 ci/official/envs/nightly_linux_x86_tpu_py39   |   3 +-
 ci/official/envs/sample                       |   9 +-
 ci/official/pycpp.sh                          |   2 +-
 ci/official/utilities/docker.sh               |   7 +
 ci/official/wheel.sh                          |   2 +-
 third_party/xla/.bazelrc                      |  63 +++++++-
 third_party/xla/third_party/tsl/.bazelrc      |  63 +++++++-
 33 files changed, 303 insertions(+), 441 deletions(-)
 delete mode 100644 ci/official/bazelrcs/cpu.bazelrc
 delete mode 100644 ci/official/bazelrcs/cpu_gcc.bazelrc
 delete mode 100644 ci/official/bazelrcs/cuda.bazelrc
 create mode 100755 ci/official/bisect.sh
 create mode 100644 ci/official/envs/disable_all_uploads

diff --git a/.bazelrc b/.bazelrc
index 98eff1ea726464..50b3372e43a32f 100644
--- a/.bazelrc
+++ b/.bazelrc
@@ -425,15 +425,19 @@ build:ios     --define=with_xla_support=false
 # Options when using remote execution
 # WARNING: THESE OPTIONS WONT WORK IF YOU DO NOT HAVE PROPER AUTHENTICATION AND PERMISSIONS
 
+# Allow creation of resultstore URLs for any bazel invocation
+build:resultstore --google_default_credentials
+build:resultstore --bes_backend=buildeventservice.googleapis.com
+build:resultstore --bes_instance_name="tensorflow-testing"
+build:resultstore --bes_results_url="https://source.cloud.google.com/results/invocations"
+build:resultstore --bes_timeout=600s
+
 # Flag to enable remote config
 common --experimental_repo_remote_exec
 
 # Make Bazel not try to probe the host system for a C++ toolchain.
+build:rbe_base --config=resultstore
 build:rbe_base --repo_env=BAZEL_DO_NOT_DETECT_CPP_TOOLCHAIN=1
-build:rbe_base --google_default_credentials
-build:rbe_base --bes_backend=buildeventservice.googleapis.com
-build:rbe_base --bes_results_url="https://source.cloud.google.com/results/invocations"
-build:rbe_base --bes_timeout=600s
 build:rbe_base --define=EXECUTOR=remote
 build:rbe_base --jobs=800
 build:rbe_base --remote_executor=grpcs://remotebuildexecution.googleapis.com
@@ -475,7 +479,6 @@ build:rbe_linux_cpu --repo_env=TF_PYTHON_CONFIG_REPO="@sigbuild-r2.14-clang_conf
 build:rbe_linux_cpu --python_path="/usr/bin/python3"
 # These you may need to change for your own GCP project.
 common:rbe_linux_cpu --remote_instance_name=projects/tensorflow-testing/instances/default_instance
-build:rbe_linux_cpu --bes_instance_name="tensorflow-testing"
 
 # TODO(kanglan): Remove it after toolchain update is complete.
 build:rbe_linux_cpu_old --config=rbe_linux
@@ -488,7 +491,6 @@ build:rbe_linux_cpu_old --platforms="@ubuntu20.04-gcc9_manylinux2014-cuda11.2-cu
 build:rbe_linux_cpu_old --python_path="/usr/local/bin/python3.9"
 build:rbe_linux_cpu_old --repo_env=TF_PYTHON_CONFIG_REPO="@ubuntu20.04-gcc9_manylinux2014-cuda11.2-cudnn8.1-tensorrt7.2_config_python3.9"
 common:rbe_linux_cpu_old --remote_instance_name=projects/tensorflow-testing/instances/default_instance
-build:rbe_linux_cpu_old --bes_instance_name="tensorflow-testing"
 
 build:rbe_linux_cuda --config=cuda_clang_official
 build:rbe_linux_cuda --config=rbe_linux_cpu
@@ -531,7 +533,6 @@ build:rbe_win_py39 --python_path=C:\\Python39\\python.exe
 
 # TODO(kanglan): Merge tensorflow_testing_rbe_win into rbe_win
 common:tensorflow_testing_rbe_win --remote_instance_name=projects/tensorflow-testing/instances/windows
-build:tensorflow_testing_rbe_win --bes_instance_name="tensorflow-testing"
 # END TF REMOTE BUILD EXECUTION OPTIONS
 
 # TFLite build configs for generic embedded Linux
@@ -641,3 +642,51 @@ build:android --config=no_tfrt
 build:macos   --config=no_tfrt
 build:windows --config=no_tfrt
 build:no_tfrt --deleted_packages=tensorflow/compiler/mlir/tfrt,tensorflow/compiler/mlir/tfrt/benchmarks,tensorflow/compiler/mlir/tfrt/ir,tensorflow/compiler/mlir/tfrt/ir/mlrt,tensorflow/compiler/mlir/tfrt/jit/python_binding,tensorflow/compiler/mlir/tfrt/jit/transforms,tensorflow/compiler/mlir/tfrt/python_tests,tensorflow/compiler/mlir/tfrt/tests,tensorflow/compiler/mlir/tfrt/tests/mlrt,tensorflow/compiler/mlir/tfrt/tests/ir,tensorflow/compiler/mlir/tfrt/tests/analysis,tensorflow/compiler/mlir/tfrt/tests/jit,tensorflow/compiler/mlir/tfrt/tests/lhlo_to_tfrt,tensorflow/compiler/mlir/tfrt/tests/lhlo_to_jitrt,tensorflow/compiler/mlir/tfrt/tests/tf_to_corert,tensorflow/compiler/mlir/tfrt/tests/tf_to_tfrt_data,tensorflow/compiler/mlir/tfrt/tests/saved_model,tensorflow/compiler/mlir/tfrt/transforms/lhlo_gpu_to_tfrt_gpu,tensorflow/compiler/mlir/tfrt/transforms/mlrt,tensorflow/core/runtime_fallback,tensorflow/core/runtime_fallback/conversion,tensorflow/core/runtime_fallback/kernel,tensorflow/core/runtime_fallback/opdefs,tensorflow/core/runtime_fallback/runtime,tensorflow/core/runtime_fallback/util,tensorflow/core/runtime_fallback/test,tensorflow/core/runtime_fallback/test/gpu,tensorflow/core/runtime_fallback/test/saved_model,tensorflow/core/runtime_fallback/test/testdata,tensorflow/core/tfrt/stubs,tensorflow/core/tfrt/tfrt_session,tensorflow/core/tfrt/mlrt,tensorflow/core/tfrt/mlrt/attribute,tensorflow/core/tfrt/mlrt/kernel,tensorflow/core/tfrt/mlrt/bytecode,tensorflow/core/tfrt/mlrt/interpreter,tensorflow/compiler/mlir/tfrt/translate/mlrt,tensorflow/compiler/mlir/tfrt/translate/mlrt/testdata,tensorflow/core/tfrt/gpu,tensorflow/core/tfrt/run_handler_thread_pool,tensorflow/core/tfrt/runtime,tensorflow/core/tfrt/saved_model,tensorflow/core/tfrt/graph_executor,tensorflow/core/tfrt/saved_model/tests,tensorflow/core/tfrt/tpu,tensorflow/core/tfrt/utils,tensorflow/core/tfrt/utils/debug,tensorflow/core/tfrt/saved_model/python,tensorflow/core/tfrt/graph_executor/python,tensorflow/cor
e/tfrt/saved_model/utils
+
+# BEGIN TF CACHE HELPER OPTIONS
+# Options when using the remote build cache
+# WARNING: THESE OPTIONS WON'T WORK IF YOU DO NOT HAVE PROPER AUTHENTICATION AND PERMISSIONS
+
+# Use --config=tf_public_cache to try and use the TensorFlow public build cache
+# to build TensorFlow. Look at ci/official/envs to find which types of jobs
+# push to the cache.
+build:tf_public_cache --remote_cache="https://storage.googleapis.com/tensorflow-devinfra-bazel-cache/september2022" --remote_upload_local_results=false
+# Cache pushes are limited to TF's CI system.
+build:tf_public_cache_push --config=tf_public_cache --remote_upload_local_results=true --google_default_credentials
+
+# END TF CACHE HELPER OPTIONS
+# BEGIN TF TEST SUITE OPTIONS
+# These are convenience config options that effectively declare TF's CI test suites. Look
+# at the scripts of ci/official/ to see how TF's CI uses them.
+
+# LIBTENSORFLOW TESTS are for building Libtensorflow archives. These are CUDA/CPU-agnostic.
+test:libtensorflow_test -- //tensorflow/tools/lib_package:libtensorflow_test //tensorflow/tools/lib_package:libtensorflow_java_test
+build:libtensorflow_build -- //tensorflow/tools/lib_package:libtensorflow.tar.gz //tensorflow/tools/lib_package:libtensorflow_jni.tar.gz //tensorflow/java:libtensorflow.jar //tensorflow/java:libtensorflow-src.jar //tensorflow/tools/lib_package:libtensorflow_proto.zip
+
+# PYTHON TESTS run a suite of Python tests intended for verifying that the Python wheel
+# will work properly. These are usually run Nightly or upon Release.
+# CPU WHEEL
+test:linux_cpu_wheel_test_filters --test_tag_filters=-no_oss,-oss_excluded,-oss_serial,-gpu,-tpu,-benchmark-test,-v1only,-no_oss_py38,-no_oss_py39,-no_oss_py310
+test:linux_cpu_wheel_test_filters --build_tag_filters=-no_oss,-oss_excluded,-oss_serial,-gpu,-tpu,-benchmark-test,-v1only,-no_oss_py38,-no_oss_py39,-no_oss_py310
+test:linux_cpu_wheel_test_filters --test_lang_filters=py --test_size_filters=small,medium
+test:linux_cpu_wheel_test --config=linux_cpu_wheel_test_filters -- //tensorflow/... -//tensorflow/python/integration_testing/... -//tensorflow/compiler/tf2tensorrt/... -//tensorflow/compiler/xrt/... -//tensorflow/core/tpu/... -//tensorflow/lite/... -//tensorflow/tools/toolchains/...
+# CUDA WHEEL
+test:linux_cuda_wheel_test_filters --test_tag_filters=gpu,requires-gpu,-no_gpu,-no_oss,-oss_excluded,-oss_serial,-no_cuda11,-no_oss_py38,-no_oss_py39,-no_oss_py310
+test:linux_cuda_wheel_test_filters --build_tag_filters=gpu,requires-gpu,-no_gpu,-no_oss,-oss_excluded,-oss_serial,-no_cuda11,-no_oss_py38,-no_oss_py39,-no_oss_py310
+test:linux_cuda_wheel_test_filters --test_lang_filters=py --test_size_filters=small,medium
+test:linux_cuda_wheel_test --config=linux_cuda_wheel_test_filters -- //tensorflow/... -//tensorflow/python/integration_testing/... -//tensorflow/compiler/tf2tensorrt/... -//tensorflow/compiler/xrt/... -//tensorflow/core/tpu/... -//tensorflow/lite/... -//tensorflow/tools/toolchains/...
+
+# PYCPP TESTS run a suite of Python and C++ tests to verify general correctness over
+# the whole TF code base. These are usually run continuously or upon presubmit.
+# CPU PYCPP:
+test:linux_cpu_pycpp_test_filters --test_tag_filters=-no_oss,-oss_excluded,-oss_serial,-gpu,-tpu,-benchmark-test,-v1only
+test:linux_cpu_pycpp_test_filters --build_tag_filters=-no_oss,-oss_excluded,-oss_serial,-gpu,-tpu,-benchmark-test,-v1only
+test:linux_cpu_pycpp_test_filters --test_lang_filters=cc,py --test_size_filters=small,medium
+test:linux_cpu_pycpp_test --config=linux_cpu_pycpp_test_filters -- //tensorflow/... -//tensorflow/python/integration_testing/... -//tensorflow/compiler/tf2tensorrt/... -//tensorflow/compiler/xrt/... -//tensorflow/core/tpu/... -//tensorflow/lite/... -//tensorflow/tools/toolchains/...
+# CUDA PYCPP:
+test:linux_cuda_pycpp_test_filters --test_tag_filters=-no_oss,-oss_excluded,-oss_serial,-benchmark-test,-v1only,gpu,-no_gpu,-no_gpu_presubmit,-no_cuda11
+test:linux_cuda_pycpp_test_filters --build_tag_filters=-no_oss,-oss_excluded,-oss_serial,-benchmark-test,-v1only,gpu,-no_gpu,-no_gpu_presubmit,-no_cuda11
+test:linux_cuda_pycpp_test_filters --test_lang_filters=cc,py --test_size_filters=small,medium
+test:linux_cuda_pycpp_test --config=linux_cuda_pycpp_test_filters -- //tensorflow/... -//tensorflow/python/integration_testing/... -//tensorflow/compiler/tf2tensorrt/... -//tensorflow/compiler/xrt/... -//tensorflow/core/tpu/... -//tensorflow/lite/... -//tensorflow/tools/toolchains/...
+
+# END TF TEST SUITE OPTIONS
diff --git a/ci/official/README.md b/ci/official/README.md
index b8ac86c6612ba1..0bdff0cf5e4e0a 100644
--- a/ci/official/README.md
+++ b/ci/official/README.md
@@ -201,7 +201,7 @@ with TensorFlow's remote cache. TensorFlow's official nightly builds push to a
 publicly accessible cache, which you can combine with a local bazel cache:
 
 ```
-TFCI_BAZEL_COMMON_ARGS=( --disk_cache=$TFCI_OUT_DIR/cache --config=sigbuild_remote_cache )
+TFCI_BAZEL_COMMON_ARGS=( --disk_cache=$TFCI_OUT_DIR/cache --config=tf_public_cache )
 ```
 
 This will place a bazel cache in `$TFCI_OUT_DIR/cache`, which by default
@@ -223,14 +223,14 @@ TFCI_BAZEL_COMMON_ARGS=( --disk_cache=/root/bazelcache )
 
 Keep these additional details in mind when using the cache:
 
--   The official nightly builds use `--config=sigbuild_remote_cache_push` to
+-   The official nightly builds use `--config=tf_public_cache_push` to
     push the results of `wheel.sh` to a remote cache. Our CI must use Bazel's
     `--google_default_credentials` flag to pull upload credentials from the
     virtual machine, but the flag raises an error if no credentials are
     available.
--   It's safe to use `--config=sigbuild_remote_cache_push` (the default config)
+-   It's safe to use `--config=tf_public_cache_push` (the default config)
     as a normal developer because you don't have upload permission, but you
-    should switch it to `sigbuild_remote_cache` if you're a Google developer.
+    should switch it to `tf_public_cache` if you're a Google developer.
     There is no cache for `pycpp.sh`, because the official jobs use Remote Build
     Execution instead.
 
diff --git a/ci/official/any.sh b/ci/official/any.sh
index 3a5913713d7ce2..4176541c33d7ce 100755
--- a/ci/official/any.sh
+++ b/ci/official/any.sh
@@ -13,39 +13,28 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-source "${BASH_SOURCE%/*}/utilities/setup.sh"
-
-# Parse options and build targets into arrays, so that shelllint doesn't yell
-# about readability. We can't pipe into 'read -ra' to create an array because
-# piped commands run in subshells, which can't store variables outside of the
-# subshell environment.
-# Ignore grep failures since we're using it for basic filtering
-set +e
-filtered_build_targets=( $(echo "$BUILD_TARGETS" | tr ' ' '\n' | grep .) )
-nonpip_targets=( $(echo "$TEST_TARGETS" | tr ' ' '\n' | grep -E "^//tensorflow/" ) )
-config=( $(echo "$CONFIG_OPTIONS" ) )
-test_flags=( $(echo "$TEST_FLAGS" ) )
-set -e
-
-if [[ "$TFCI_NVIDIA_SMI_ENABLE" == 1 ]]; then
-  tfrun nvidia-smi
-fi
-
-if [[ "${#filtered_build_targets[@]}" -ne 0 ]]; then
-  tfrun bazel "${TFCI_BAZEL_BAZELRC_ARGS[@]}" "${config[@]}" "${filtered_build_targets[@]}"
-fi
-
-if [[ "${PIP_WHEEL}" -eq "1" ]]; then
-  # Update the version numbers to build a "nightly" package
-  if [[ "$TFCI_NIGHTLY_UPDATE_VERSION_ENABLE" == 1 ]]; then
-    tfrun python3 tensorflow/tools/ci_build/update_version.py --nightly
-  fi
-
-  tfrun bazel "${TFCI_BAZEL_BAZELRC_ARGS[@]}" build "${TFCI_BAZEL_COMMON_ARGS[@]}" tensorflow/tools/pip_package:build_pip_package
-  tfrun ./bazel-bin/tensorflow/tools/pip_package/build_pip_package build "${TFCI_BUILD_PIP_PACKAGE_ARGS[@]}"
-  tfrun ./ci/official/utilities/rename_and_verify_wheels.sh "$TFCI_GIT_DIR"
-fi
-
-if [[ "${#nonpip_targets[@]}" -ne 0 ]]; then
-  tfrun bazel "${TFCI_BAZEL_BAZELRC_ARGS[@]}" test "${config[@]}" "${test_flags[@]}" "${nonpip_targets[@]}"
+# any.sh has two run modes.
+#
+# 1. RUN, TEST, OR BUILD BAZEL TARGET(S) WITHIN A TFCI ENVIRONMENT
+#    To use:
+#       export TFCI=$(realpath ...)
+#       export TF_ANY_TARGETS="quoted list of targets, like on the command line"
+#       export TF_ANY_MODE="test" or "build" or "run" (default: "test")
+#       ./any.sh
+#
+# 2. RUN ANY OTHER SCRIPT AND ENV WITH NO SIDE EFFECTS (NO UPLOADS)
+#    To use:
+#       export TFCI=$(realpath ...)
+#       export TF_ANY_SCRIPT=$(realpath wheel.sh)
+#       ./any.sh
+set -euxo pipefail
+if [[ -n "${TF_ANY_SCRIPT:-}" ]]; then
+  cp "$TFCI" any
+  echo "source ci/official/envs/disable_all_uploads" >> any
+  export TFCI=$(realpath any)
+  "$TF_ANY_SCRIPT"
+else
+  source "${BASH_SOURCE%/*}/utilities/setup.sh"
+  read -ra TARGETS_AS_ARRAY <<<"$TF_ANY_TARGETS"
+  tfrun bazel "${TFCI_BAZEL_BAZELRC_ARGS[@]}" "${TF_ANY_MODE:-test}" "${TFCI_BAZEL_COMMON_ARGS[@]}" "${TARGETS_AS_ARRAY[@]}"
 fi
diff --git a/ci/official/bazelrcs/cpu.bazelrc b/ci/official/bazelrcs/cpu.bazelrc
deleted file mode 100644
index 43c951e3532466..00000000000000
--- a/ci/official/bazelrcs/cpu.bazelrc
+++ /dev/null
@@ -1,105 +0,0 @@
-# Copyright 2023 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-# This bazelrc can build a CPU-supporting TF package.
-
-# Convenient cache configurations
-# Use a cache directory mounted to /tf/cache. Very useful!
-build:sigbuild_local_cache --disk_cache=/tf/cache
-# Use the public-access TF DevInfra cache (read only)
-build:sigbuild_remote_cache --remote_cache="https://storage.googleapis.com/tensorflow-devinfra-bazel-cache/september2022" --remote_upload_local_results=false
-# Write to the TF DevInfra cache (only works for internal TF CI)
-build:sigbuild_remote_cache_push --remote_cache="https://storage.googleapis.com/tensorflow-devinfra-bazel-cache/september2022" --google_default_credentials
-# Change the value of CACHEBUSTER when upgrading the toolchain, or when testing
-# different compilation methods. E.g. for a PR to test a new CUDA version, set
-# the CACHEBUSTER to the PR number.
-build --action_env=CACHEBUSTER=501872366
-
-# Use Python 3.X as installed in container image
-build --action_env PYTHON_BIN_PATH="/usr/bin/python3"
-build --action_env PYTHON_LIB_PATH="/usr/lib/tf_python"
-build --python_path="/usr/bin/python3"
-
-# Build TensorFlow v2
-build --define=tf_api_version=2 --action_env=TF2_BEHAVIOR=1
-
-# Target the AVX instruction set
-build --copt=-mavx --host_copt=-mavx
-
-# Use lld as the linker
-build --linkopt="-fuse-ld=lld"
-build --linkopt="-lm"
-
-# Disable clang extention that rejects type definitions within offsetof. 
-# This was added in clang-16 by https://reviews.llvm.org/D133574.
-# Can be removed once upb is updated, since a type definition is used within
-# offset of in the current version of ubp.
-# See https://github.com/protocolbuffers/upb/blob/9effcbcb27f0a665f9f345030188c0b291e32482/upb/upb.c#L183.
-build --copt=-Wno-gnu-offsetof-extensions
-
-# Use the NVCC toolchain to compile for manylinux2014
-build --crosstool_top="@sigbuild-r2.14-clang_config_cuda//crosstool:toolchain"
-
-# Test-related settings below this point.
-test --build_tests_only --keep_going --test_output=errors --verbose_failures=true
-test --local_test_jobs=HOST_CPUS
-test --test_env=LD_LIBRARY_PATH
-# Give only the list of failed tests at the end of the log
-test --test_summary=short
-
-# "nonpip" tests are regular py_test tests.
-# Pass --config=nonpip to run the same suite of tests. If you want to run just
-# one test for investigation, you don't need --config=nonpip; just run the
-# bazel test invocation as normal.
-test:nonpip_filters --test_tag_filters=-no_oss,-oss_excluded,-oss_serial,-gpu,-tpu,-benchmark-test,-v1only,-no_oss_py38,-no_oss_py39,-no_oss_py310
-test:nonpip_filters --build_tag_filters=-no_oss,-oss_excluded,-oss_serial,-gpu,-tpu,-benchmark-test,-v1only,-no_oss_py38,-no_oss_py39,-no_oss_py310
-test:nonpip_filters --test_lang_filters=py --test_size_filters=small,medium
-test:nonpip --config=nonpip_filters -- //tensorflow/... -//tensorflow/python/integration_testing/... -//tensorflow/compiler/tf2tensorrt/... -//tensorflow/compiler/xrt/... -//tensorflow/core/tpu/... -//tensorflow/lite/... -//tensorflow/tools/toolchains/...
-
-# For building libtensorflow archives
-test:libtensorflow_test -- //tensorflow/tools/lib_package:libtensorflow_test //tensorflow/tools/lib_package:libtensorflow_java_test
-build:libtensorflow_build -- //tensorflow/tools/lib_package:libtensorflow.tar.gz //tensorflow/tools/lib_package:libtensorflow_jni.tar.gz //tensorflow/java:libtensorflow.jar //tensorflow/java:libtensorflow-src.jar //tensorflow/tools/lib_package:libtensorflow_proto.zip
-
-# Allow creation of resultstore URLs for any bazel invocation
-build:resultstore --google_default_credentials
-build:resultstore --bes_backend=buildeventservice.googleapis.com
-build:resultstore --bes_results_url="https://source.cloud.google.com/results/invocations"
-build:resultstore --bes_timeout=600s
-build:resultstore --bes_instance_name="tensorflow-testing"
-
-# For Remote Build Execution.
-build:rbe --config=resultstore
-build:rbe --define=EXECUTOR=remote
-build:rbe --jobs=800
-build:rbe --remote_executor=grpcs://remotebuildexecution.googleapis.com
-build:rbe --remote_timeout=3600
-build:rbe --spawn_strategy=remote,worker,standalone,local
-build:rbe --remote_download_toplevel
-build:rbe --action_env=PATH="/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/go/bin"
-build:rbe --linkopt=-lrt --host_linkopt=-lrt --linkopt=-lm --host_linkopt=-lm  # Unclear why this is here
-build:rbe --host_crosstool_top="@sigbuild-r2.14-clang_config_cuda//crosstool:toolchain"
-build:rbe --crosstool_top="@sigbuild-r2.14-clang_config_cuda//crosstool:toolchain"
-build:rbe --extra_toolchains="@sigbuild-r2.14-clang_config_cuda//crosstool:toolchain-linux-x86_64"
-build:rbe --extra_execution_platforms="@sigbuild-r2.14-clang_config_platform//:platform"
-build:rbe --host_platform="@sigbuild-r2.14-clang_config_platform//:platform"
-build:rbe --platforms="@sigbuild-r2.14-clang_config_platform//:platform"
-# Python config is the same across all containers because the binary is the same
-build:rbe --repo_env=TF_PYTHON_CONFIG_REPO="@sigbuild-r2.14-clang_config_python"
-build:rbe --remote_instance_name=projects/tensorflow-testing/instances/default_instance
-
-# For continuous builds
-test:pycpp_filters --test_tag_filters=-no_oss,-oss_excluded,-oss_serial,-gpu,-tpu,-benchmark-test,-v1only
-test:pycpp_filters --build_tag_filters=-no_oss,-oss_excluded,-oss_serial,-gpu,-tpu,-benchmark-test,-v1only
-test:pycpp_filters --test_lang_filters=cc,py --test_size_filters=small,medium
-test:pycpp --config=pycpp_filters -- //tensorflow/... -//tensorflow/python/integration_testing/... -//tensorflow/compiler/tf2tensorrt/... -//tensorflow/compiler/xrt/... -//tensorflow/core/tpu/... -//tensorflow/lite/... -//tensorflow/tools/toolchains/...
diff --git a/ci/official/bazelrcs/cpu_gcc.bazelrc b/ci/official/bazelrcs/cpu_gcc.bazelrc
deleted file mode 100644
index 8e8d9b42ff1a9e..00000000000000
--- a/ci/official/bazelrcs/cpu_gcc.bazelrc
+++ /dev/null
@@ -1,91 +0,0 @@
-# Copyright 2023 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-# This bazelrc can build a CPU-supporting TF package.
-
-# Convenient cache configurations
-# Use a cache directory mounted to /tf/cache. Very useful!
-build:sigbuild_local_cache --disk_cache=/tf/cache
-# Use the public-access TF DevInfra cache (read only)
-build:sigbuild_remote_cache --remote_cache="https://storage.googleapis.com/tensorflow-devinfra-bazel-cache/september2022" --remote_upload_local_results=false
-# Write to the TF DevInfra cache (only works for internal TF CI)
-build:sigbuild_remote_cache_push --remote_cache="https://storage.googleapis.com/tensorflow-devinfra-bazel-cache/september2022" --google_default_credentials
-# Change the value of CACHEBUSTER when upgrading the toolchain, or when testing
-# different compilation methods. E.g. for a PR to test a new CUDA version, set
-# the CACHEBUSTER to the PR number.
-build --action_env=CACHEBUSTER=501872366
-
-# Use Python 3.X as installed in container image
-build --action_env PYTHON_BIN_PATH="/usr/bin/python3"
-build --action_env PYTHON_LIB_PATH="/usr/lib/tf_python"
-build --python_path="/usr/bin/python3"
-
-# Build TensorFlow v2
-build --define=tf_api_version=2 --action_env=TF2_BEHAVIOR=1
-
-# Target the AVX instruction set
-build --copt=-mavx --host_copt=-mavx
-
-# Use the NVCC toolchain to compile for manylinux2014
-build --crosstool_top="@sigbuild-r2.14_config_cuda//crosstool:toolchain"
-
-# Test-related settings below this point.
-test --build_tests_only --keep_going --test_output=errors --verbose_failures=true
-test --local_test_jobs=HOST_CPUS
-test --test_env=LD_LIBRARY_PATH
-# Give only the list of failed tests at the end of the log
-test --test_summary=short
-
-# "nonpip" tests are regular py_test tests.
-# Pass --config=nonpip to run the same suite of tests. If you want to run just
-# one test for investigation, you don't need --config=nonpip; just run the
-# bazel test invocation as normal.
-test:nonpip_filters --test_tag_filters=-no_oss,-oss_excluded,-oss_serial,-gpu,-tpu,-benchmark-test,-v1only,-no_oss_py38,-no_oss_py39,-no_oss_py310
-test:nonpip_filters --build_tag_filters=-no_oss,-oss_excluded,-oss_serial,-gpu,-tpu,-benchmark-test,-v1only,-no_oss_py38,-no_oss_py39,-no_oss_py310
-test:nonpip_filters --test_lang_filters=py --test_size_filters=small,medium
-test:nonpip --config=nonpip_filters -- //tensorflow/... -//tensorflow/python/integration_testing/... -//tensorflow/compiler/tf2tensorrt/... -//tensorflow/compiler/xrt/... -//tensorflow/core/tpu/... -//tensorflow/lite/... -//tensorflow/tools/toolchains/...
-
-# For building libtensorflow archives
-test:libtensorflow_test -- //tensorflow/tools/lib_package:libtensorflow_test //tensorflow/tools/lib_package:libtensorflow_java_test
-build:libtensorflow_build -- //tensorflow/tools/lib_package:libtensorflow.tar.gz //tensorflow/tools/lib_package:libtensorflow_jni.tar.gz //tensorflow/java:libtensorflow.jar //tensorflow/java:libtensorflow-src.jar //tensorflow/tools/lib_package:libtensorflow_proto.zip
-
-# For Remote Build Execution.
-build:rbe --google_default_credentials
-build:rbe --bes_backend=buildeventservice.googleapis.com
-build:rbe --bes_results_url="https://source.cloud.google.com/results/invocations"
-build:rbe --bes_timeout=600s
-build:rbe --define=EXECUTOR=remote
-build:rbe --jobs=800
-build:rbe --remote_executor=grpcs://remotebuildexecution.googleapis.com
-build:rbe --remote_timeout=3600
-build:rbe --spawn_strategy=remote,worker,standalone,local
-build:rbe --remote_download_toplevel
-build:rbe --action_env=PATH="/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/go/bin"
-build:rbe --linkopt=-lrt --host_linkopt=-lrt --linkopt=-lm --host_linkopt=-lm  # Unclear why this is here
-build:rbe --host_crosstool_top="@sigbuild-r2.14_config_cuda//crosstool:toolchain"
-build:rbe --crosstool_top="@sigbuild-r2.14_config_cuda//crosstool:toolchain"
-build:rbe --extra_toolchains="@sigbuild-r2.14_config_cuda//crosstool:toolchain-linux-x86_64"
-build:rbe --extra_execution_platforms="@sigbuild-r2.14_config_platform//:platform"
-build:rbe --host_platform="@sigbuild-r2.14_config_platform//:platform"
-build:rbe --platforms="@sigbuild-r2.14_config_platform//:platform"
-# Python config is the same across all containers because the binary is the same
-build:rbe --repo_env=TF_PYTHON_CONFIG_REPO="@sigbuild-r2.14_config_python"
-build:rbe --remote_instance_name=projects/tensorflow-testing/instances/default_instance
-build:rbe --project_id="tensorflow-testing"
-
-# For continuous builds
-test:pycpp_filters --test_tag_filters=-no_oss,-oss_excluded,-oss_serial,-gpu,-tpu,-benchmark-test,-v1only
-test:pycpp_filters --build_tag_filters=-no_oss,-oss_excluded,-oss_serial,-gpu,-tpu,-benchmark-test,-v1only
-test:pycpp_filters --test_lang_filters=cc,py --test_size_filters=small,medium
-test:pycpp --config=pycpp_filters -- //tensorflow/... -//tensorflow/python/integration_testing/... -//tensorflow/compiler/tf2tensorrt/... -//tensorflow/compiler/xrt/... -//tensorflow/core/tpu/... -//tensorflow/lite/... -//tensorflow/tools/toolchains/...
diff --git a/ci/official/bazelrcs/cuda.bazelrc b/ci/official/bazelrcs/cuda.bazelrc
deleted file mode 100644
index 1bb260687d35fa..00000000000000
--- a/ci/official/bazelrcs/cuda.bazelrc
+++ /dev/null
@@ -1,137 +0,0 @@
-# Copyright 2023 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-# This bazelrc can build a GPU-supporting TF package.
-
-# Convenient cache configurations
-# Use a cache directory mounted to /tf/cache. Very useful!
-build:sigbuild_local_cache --disk_cache=/tf/cache
-# Use the public-access TF DevInfra cache (read only)
-build:sigbuild_remote_cache --remote_cache="https://storage.googleapis.com/tensorflow-devinfra-bazel-cache/september2022" --remote_upload_local_results=false
-# Write to the TF DevInfra cache (only works for internal TF CI)
-build:sigbuild_remote_cache_push --remote_cache="https://storage.googleapis.com/tensorflow-devinfra-bazel-cache/september2022" --google_default_credentials
-# Change the value of CACHEBUSTER when upgrading the toolchain, or when testing
-# different compilation methods. E.g. for a PR to test a new CUDA version, set
-# the CACHEBUSTER to the PR number.
-build --action_env=CACHEBUSTER=501872366
-
-# Use Python 3.X as installed in container image
-build --action_env PYTHON_BIN_PATH="/usr/bin/python3"
-build --action_env PYTHON_LIB_PATH="/usr/lib/tf_python"
-build --python_path="/usr/bin/python3"
-
-# Build TensorFlow v2
-build --define=tf_api_version=2 --action_env=TF2_BEHAVIOR=1
-
-# Target the AVX instruction set
-build --copt=-mavx --host_copt=-mavx
-
-# Disable clang extention that rejects type definitions within offsetof. 
-# This was added in clang-16 by https://reviews.llvm.org/D133574.
-# Can be removed once upb is updated, since a type definition is used within
-# offset of in the current version of ubp.
-# See https://github.com/protocolbuffers/upb/blob/9effcbcb27f0a665f9f345030188c0b291e32482/upb/upb.c#L183.
-build --copt=-Wno-gnu-offsetof-extensions
-
-# Use lld as the linker
-build --linkopt="-fuse-ld=lld"
-build --linkopt="-lm"
-
-# CUDA: Set up compilation CUDA version and paths
-build --@local_config_cuda//:enable_cuda
-build --@local_config_cuda//:cuda_compiler=clang
-build --repo_env TF_NEED_CUDA=1
-build --config cuda_clang
-build --action_env=TF_CUDA_VERSION="12"
-build --action_env=TF_CUDNN_VERSION="2"
-build --action_env=CUDA_TOOLKIT_PATH="/usr/local/cuda-12.2"
-build --action_env=GCC_HOST_COMPILER_PATH="/dt9/usr/bin/gcc"
-build --action_env=CLANG_CUDA_COMPILER_PATH="/usr/lib/llvm-17/bin/clang"
-build --action_env=TF_CUDA_CLANG="1"
-build --action_env=LD_LIBRARY_PATH="/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64:/usr/local/tensorrt/lib"
-build --crosstool_top="@sigbuild-r2.14-clang_config_cuda//crosstool:toolchain"
-
-# CUDA: Enable TensorRT optimizations
-# https://developer.nvidia.com/tensorrt
-build --repo_env TF_NEED_TENSORRT=1
-
-# CUDA: Select supported compute capabilities (supported graphics cards).
-# This is the same as the official TensorFlow builds.
-# See https://developer.nvidia.com/cuda-gpus#compute
-# TODO(angerson, perfinion): What does sm_ vs compute_ mean?
-# TODO(angerson, perfinion): How can users select a good value for this?
-build --repo_env=TF_CUDA_COMPUTE_CAPABILITIES="sm_50,sm_60,sm_70,sm_75,compute_80"
-
-# Test-related settings below this point.
-test --build_tests_only --keep_going --test_output=errors --verbose_failures=true
-test --test_env=LD_LIBRARY_PATH="/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64"
-# Local test jobs has to be 4 because parallel_gpu_execute is fragile, I think
-test --test_timeout=300,450,1200,3600 --local_test_jobs=4 --run_under=//tensorflow/tools/ci_build/gpu_build:parallel_gpu_execute
-# Give only the list of failed tests at the end of the log
-test --test_summary=short
-
-# "nonpip" tests are regular py_test tests.
-# Pass --config=nonpip to run the same suite of tests. If you want to run just
-# one test for investigation, you don't need --config=nonpip; just run the
-# bazel test invocation as normal.
-test:nonpip_filters --test_tag_filters=gpu,requires-gpu,-no_gpu,-no_oss,-oss_excluded,-oss_serial,-no_cuda11,-no_oss_py38,-no_oss_py39,-no_oss_py310
-test:nonpip_filters --build_tag_filters=gpu,requires-gpu,-no_gpu,-no_oss,-oss_excluded,-oss_serial,-no_cuda11,-no_oss_py38,-no_oss_py39,-no_oss_py310
-test:nonpip_filters --test_lang_filters=py --test_size_filters=small,medium
-test:nonpip --config=nonpip_filters -- //tensorflow/... -//tensorflow/python/integration_testing/... -//tensorflow/compiler/tf2tensorrt/... -//tensorflow/compiler/xrt/... -//tensorflow/core/tpu/... -//tensorflow/lite/... -//tensorflow/tools/toolchains/...
-
-# For building libtensorflow archives
-test:libtensorflow_test -- //tensorflow/tools/lib_package:libtensorflow_test //tensorflow/tools/lib_package:libtensorflow_java_test
-build:libtensorflow_build -- //tensorflow/tools/lib_package:libtensorflow.tar.gz //tensorflow/tools/lib_package:libtensorflow_jni.tar.gz //tensorflow/java:libtensorflow.jar //tensorflow/java:libtensorflow-src.jar //tensorflow/tools/lib_package:libtensorflow_proto.zip
-
-# Allow creation of resultstore URLs for any bazel invocation
-build:resultstore --google_default_credentials
-build:resultstore --bes_backend=buildeventservice.googleapis.com
-build:resultstore --bes_results_url="https://source.cloud.google.com/results/invocations"
-build:resultstore --bes_timeout=600s
-build:resultstore --bes_instance_name="tensorflow-testing"
-
-# For Remote Build Execution.
-build:rbe --config=resultstore
-build:rbe --define=EXECUTOR=remote
-build:rbe --jobs=800
-build:rbe --remote_executor=grpcs://remotebuildexecution.googleapis.com
-build:rbe --remote_timeout=3600
-build:rbe --spawn_strategy=remote,worker,standalone,local
-build:rbe --remote_download_toplevel
-build:rbe --action_env=PATH="/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/go/bin"
-build:rbe --linkopt=-lrt --host_linkopt=-lrt --linkopt=-lm --host_linkopt=-lm  # Unclear why this is here
-build:rbe --host_crosstool_top="@sigbuild-r2.14-clang_config_cuda//crosstool:toolchain"
-build:rbe --crosstool_top="@sigbuild-r2.14-clang_config_cuda//crosstool:toolchain"
-build:rbe --extra_toolchains="@sigbuild-r2.14-clang_config_cuda//crosstool:toolchain-linux-x86_64"
-build:rbe --extra_execution_platforms="@sigbuild-r2.14-clang_config_platform//:platform"
-build:rbe --host_platform="@sigbuild-r2.14-clang_config_platform//:platform"
-build:rbe --platforms="@sigbuild-r2.14-clang_config_platform//:platform"
-# Python config is the same across all containers because the binary is the same
-build:rbe --repo_env=TF_PYTHON_CONFIG_REPO="@sigbuild-r2.14-clang_config_python"
-build:rbe --remote_instance_name=projects/tensorflow-testing/instances/default_instance
-build:rbe --project_id="tensorflow-testing"
-
-# For Remote build execution -- GPU configuration
-build:rbe --repo_env=REMOTE_GPU_TESTING=1
-test:rbe --test_env=LD_LIBRARY_PATH="/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64"
-build:rbe --repo_env=TF_CUDA_CONFIG_REPO="@sigbuild-r2.14-clang_config_cuda"
-build:rbe --repo_env=TF_TENSORRT_CONFIG_REPO="@sigbuild-r2.14-clang_config_tensorrt"
-build:rbe --repo_env=TF_NCCL_CONFIG_REPO="@sigbuild-r2.14-clang_config_nccl"
-build:rbe --repo_env=TF_PYTHON_CONFIG_REPO="@sigbuild-r2.14-clang_config_python"
-
-# For continuous builds
-test:pycpp_filters --test_tag_filters=-no_oss,-oss_excluded,-oss_serial,-benchmark-test,-v1only,gpu,-no_gpu,-no_gpu_presubmit,-no_cuda11
-test:pycpp_filters --build_tag_filters=-no_oss,-oss_excluded,-oss_serial,-benchmark-test,-v1only,gpu,-no_gpu,-no_gpu_presubmit,-no_cuda11
-test:pycpp_filters --test_lang_filters=cc,py --test_size_filters=small,medium
-test:pycpp --config=pycpp_filters -- //tensorflow/... -//tensorflow/python/integration_testing/... -//tensorflow/compiler/tf2tensorrt/... -//tensorflow/compiler/xrt/... -//tensorflow/core/tpu/... -//tensorflow/lite/... -//tensorflow/tools/toolchains/...
diff --git a/ci/official/bisect.sh b/ci/official/bisect.sh
new file mode 100755
index 00000000000000..44856ad47f5d0b
--- /dev/null
+++ b/ci/official/bisect.sh
@@ -0,0 +1,40 @@
+#!/bin/bash
+# Copyright 2023 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+# Run any CI script and env configuration to bisect a failing target in some
+# build configuration. You must set the following variables to control this
+# script:
+#
+#   TF_BISECT_GOOD: Last known good commit (e.g. commit from the last passing job)
+#   TF_BISECT_BAD: First bad commit (e.g. commit from the first failing job)
+#   TF_BISECT_SCRIPT: The build script path, e.g. ci/official/wheel.sh if you
+#     are in the Git root directory, or github/tensorflow/ci/official/wheel.sh if
+#     you are running one of TensorFlow's Kokoro GitHub CI jobs.
+#   TFCI: The env config path, absolute.
+#
+# Note that you can combine bisect.sh with any.sh to bisect a single test:
+#
+#   export TFCI=...
+#   export TF_BISECT_SCRIPT=$(realpath any.sh)
+#   export TF_BISECT_GOOD=a_good_commit_sha
+#   export TF_BISECT_BAD=a_failing_commit_sha
+#   export TF_ANY_TARGETS="quoted list of targets, like on the command line"
+#   export TF_ANY_MODE=test
+set -euxo pipefail
+cp "$TFCI" bisect
+echo "source ci/official/envs/disable_all_uploads" >> bisect
+export TFCI=$(realpath bisect)
+git bisect start "$TF_BISECT_BAD" "$TF_BISECT_GOOD"
+git bisect run "$TF_BISECT_SCRIPT"
diff --git a/ci/official/envs/ci_default b/ci/official/envs/ci_default
index 86896a45169c09..3747b8b5a7b7ed 100644
--- a/ci/official/envs/ci_default
+++ b/ci/official/envs/ci_default
@@ -1,10 +1,14 @@
 TFCI_BAZEL_BAZELRC_ARGS=()
+TFCI_BAZEL_TARGET_SELECTING_CONFIG_PREFIX=
 TFCI_BAZEL_COMMON_ARGS=()
 TFCI_BUILD_PIP_PACKAGE_ARGS=()
 TFCI_DOCKER_ARGS=()
 TFCI_DOCKER_ENABLE=1
 TFCI_DOCKER_IMAGE=
 TFCI_DOCKER_PULL_ENABLE=1
+TFCI_DOCKER_REBUILD_ARGS=()
+TFCI_DOCKER_REBUILD_ENABLE=0
+TFCI_DOCKER_REBUILD_UPLOAD_ENABLE=0
 TFCI_INDEX_HTML_ENABLE=1
 TFCI_LIB_SUFFIX=
 TFCI_NIGHTLY_UPDATE_VERSION_ENABLE=
diff --git a/ci/official/envs/ci_nightly_uploads b/ci/official/envs/ci_nightly_uploads
index b54d139d3cb755..e35a712d034966 100644
--- a/ci/official/envs/ci_nightly_uploads
+++ b/ci/official/envs/ci_nightly_uploads
@@ -1,17 +1,6 @@
-# Disabled until newest scripts are ready
 TFCI_UPLOAD_LIB_ENABLE=0
-TFCI_UPLOAD_LIB_LATEST_ENABLE=0
-TFCI_UPLOAD_LIB_LATEST_URI="gs://libtensorflow-nightly/latest"
-TFCI_UPLOAD_LIB_URI="gs://libtensorflow-nightly/$(date -I)"
-
-# Disabled until newest scripts are ready
+TFCI_UPLOAD_LIB_LATEST_ENABLE=
 TFCI_UPLOAD_WHL_GCS_ENABLE=
 TFCI_UPLOAD_WHL_GCS_URI=
-
-# Disabled until newest scripts are ready
-TFCI_UPLOAD_WHL_PYPI_ENABLE=0
-TFCI_UPLOAD_WHL_PYPI_ARGS=( --config-file "$KOKORO_KEYSTORE_DIR/73361_tensorflow_pypirc_using_global_api_token" --repository testpypi )
-
-# For standard releases, later:
-# TFCI_UPLOAD_LIB_URI="gs://tensorflow-release-packages/$RELEASE_VERSION/$KOKORO_GIT_COMMIT_tensorflow"
-# TFCI_UPLOAD_WHL_GCS_URI="gs://tensorflow-release-packages/$RELEASE_VERSION/$KOKORO_GIT_COMMIT_tensorflow"
+TFCI_UPLOAD_WHL_PYPI_ARGS=
+TFCI_UPLOAD_WHL_PYPI_ENABLE=
diff --git a/ci/official/envs/continuous_linux_x86_cpu_py310 b/ci/official/envs/continuous_linux_x86_cpu_py310
index f6fc02a70065de..c891af629fdfa7 100644
--- a/ci/official/envs/continuous_linux_x86_cpu_py310
+++ b/ci/official/envs/continuous_linux_x86_cpu_py310
@@ -1,4 +1,5 @@
 source ci/official/envs/ci_default
-TFCI_BAZEL_BAZELRC_ARGS=(--bazelrc ./ci/official/bazelrcs/cpu.bazelrc)
+TFCI_BAZEL_TARGET_SELECTING_CONFIG_PREFIX=linux_cpu
 TFCI_BAZEL_COMMON_ARGS=(--config rbe --repo_env=TF_PYTHON_VERSION=3.10)
 TFCI_DOCKER_IMAGE=tensorflow/build:latest-python3.10
+TFCI_DOCKER_REBUILD_ARGS=(--build-arg PYTHON_VERSION=3.10 --target=devel tools/tf_sig_build_dockerfiles)
diff --git a/ci/official/envs/continuous_linux_x86_cpu_py311 b/ci/official/envs/continuous_linux_x86_cpu_py311
index 85e18ee57a0a09..0d82dee52191d3 100644
--- a/ci/official/envs/continuous_linux_x86_cpu_py311
+++ b/ci/official/envs/continuous_linux_x86_cpu_py311
@@ -1,4 +1,5 @@
 source ci/official/envs/ci_default
-TFCI_BAZEL_BAZELRC_ARGS=(--bazelrc ./ci/official/bazelrcs/cpu.bazelrc)
+TFCI_BAZEL_TARGET_SELECTING_CONFIG_PREFIX=linux_cpu
 TFCI_BAZEL_COMMON_ARGS=(--config rbe --repo_env=TF_PYTHON_VERSION=3.11)
 TFCI_DOCKER_IMAGE=tensorflow/build:latest-python3.11
+TFCI_DOCKER_REBUILD_ARGS=(--build-arg PYTHON_VERSION=3.11 --target=devel tools/tf_sig_build_dockerfiles)
diff --git a/ci/official/envs/continuous_linux_x86_cpu_py39 b/ci/official/envs/continuous_linux_x86_cpu_py39
index ee6175054f5b10..ce3ddebfaf58c9 100644
--- a/ci/official/envs/continuous_linux_x86_cpu_py39
+++ b/ci/official/envs/continuous_linux_x86_cpu_py39
@@ -1,4 +1,5 @@
 source ci/official/envs/ci_default
-TFCI_BAZEL_BAZELRC_ARGS=(--bazelrc ./ci/official/bazelrcs/cpu.bazelrc)
+TFCI_BAZEL_TARGET_SELECTING_CONFIG_PREFIX=linux_cpu
 TFCI_BAZEL_COMMON_ARGS=(--config rbe --repo_env=TF_PYTHON_VERSION=3.9)
 TFCI_DOCKER_IMAGE=tensorflow/build:latest-python3.9
+TFCI_DOCKER_REBUILD_ARGS=(--build-arg PYTHON_VERSION=3.9 --target=devel tools/tf_sig_build_dockerfiles)
diff --git a/ci/official/envs/continuous_linux_x86_cuda_py310 b/ci/official/envs/continuous_linux_x86_cuda_py310
index 0754417a12c67d..adc440db2cface 100644
--- a/ci/official/envs/continuous_linux_x86_cuda_py310
+++ b/ci/official/envs/continuous_linux_x86_cuda_py310
@@ -1,6 +1,7 @@
 source ci/official/envs/ci_default
 TFCI_NVIDIA_SMI_ENABLE=1
-TFCI_BAZEL_BAZELRC_ARGS=(--bazelrc ./ci/official/bazelrcs/cuda.bazelrc)
+TFCI_BAZEL_TARGET_SELECTING_CONFIG_PREFIX=linux_cuda
 TFCI_BAZEL_COMMON_ARGS=(--config rbe --repo_env=TF_PYTHON_VERSION=3.10)
 TFCI_DOCKER_ARGS=(--gpus all)
 TFCI_DOCKER_IMAGE=tensorflow/build:latest-python3.10
+TFCI_DOCKER_REBUILD_ARGS=(--build-arg PYTHON_VERSION=3.10 --target=devel tools/tf_sig_build_dockerfiles)
diff --git a/ci/official/envs/continuous_linux_x86_cuda_py311 b/ci/official/envs/continuous_linux_x86_cuda_py311
index 55c0527da53384..e0f57d1e4ea3d4 100644
--- a/ci/official/envs/continuous_linux_x86_cuda_py311
+++ b/ci/official/envs/continuous_linux_x86_cuda_py311
@@ -1,6 +1,7 @@
 source ci/official/envs/ci_default
 TFCI_NVIDIA_SMI_ENABLE=1
-TFCI_BAZEL_BAZELRC_ARGS=(--bazelrc ./ci/official/bazelrcs/cuda.bazelrc)
+TFCI_BAZEL_TARGET_SELECTING_CONFIG_PREFIX=linux_cuda
 TFCI_BAZEL_COMMON_ARGS=(--config rbe --repo_env=TF_PYTHON_VERSION=3.11)
 TFCI_DOCKER_ARGS=(--gpus all)
 TFCI_DOCKER_IMAGE=tensorflow/build:latest-python3.11
+TFCI_DOCKER_REBUILD_ARGS=(--build-arg PYTHON_VERSION=3.11 --target=devel tools/tf_sig_build_dockerfiles)
diff --git a/ci/official/envs/continuous_linux_x86_cuda_py39 b/ci/official/envs/continuous_linux_x86_cuda_py39
index 1d6111e2311267..077c89aac47b41 100644
--- a/ci/official/envs/continuous_linux_x86_cuda_py39
+++ b/ci/official/envs/continuous_linux_x86_cuda_py39
@@ -1,6 +1,7 @@
 source ci/official/envs/ci_default
 TFCI_NVIDIA_SMI_ENABLE=1
-TFCI_BAZEL_BAZELRC_ARGS=(--bazelrc ./ci/official/bazelrcs/cuda.bazelrc)
+TFCI_BAZEL_TARGET_SELECTING_CONFIG_PREFIX=linux_cuda
 TFCI_BAZEL_COMMON_ARGS=(--config rbe --repo_env=TF_PYTHON_VERSION=3.9)
 TFCI_DOCKER_ARGS=(--gpus all)
 TFCI_DOCKER_IMAGE=tensorflow/build:latest-python3.9
+TFCI_DOCKER_REBUILD_ARGS=(--build-arg PYTHON_VERSION=3.9 --target=devel tools/tf_sig_build_dockerfiles)
diff --git a/ci/official/envs/disable_all_uploads b/ci/official/envs/disable_all_uploads
new file mode 100644
index 00000000000000..b09169d677fa8f
--- /dev/null
+++ b/ci/official/envs/disable_all_uploads
@@ -0,0 +1,7 @@
+TFCI_UPLOAD_LIB_ENABLE=0
+TFCI_UPLOAD_LIB_LATEST_ENABLE=
+TFCI_DOCKER_REBUILD_UPLOAD_ENABLE=
+TFCI_UPLOAD_WHL_GCS_ENABLE=
+TFCI_UPLOAD_WHL_GCS_URI=
+TFCI_UPLOAD_WHL_PYPI_ENABLE=
+TFCI_UPLOAD_WHL_PYPI_ARGS=
diff --git a/ci/official/envs/nightly_libtensorflow_linux_x86_cpu b/ci/official/envs/nightly_libtensorflow_linux_x86_cpu
index 6747d101d31275..9da67eefa3b449 100644
--- a/ci/official/envs/nightly_libtensorflow_linux_x86_cpu
+++ b/ci/official/envs/nightly_libtensorflow_linux_x86_cpu
@@ -1,7 +1,7 @@
 source ci/official/envs/ci_default
 source ci/official/envs/ci_nightly_uploads
-TFCI_BAZEL_BAZELRC_ARGS=(--bazelrc ./ci/official/bazelrcs/cpu.bazelrc)
-TFCI_BAZEL_COMMON_ARGS=(--config sigbuild_remote_cache_push --config resultstore --repo_env=TF_PYTHON_VERSION=3.10)
+TFCI_BAZEL_COMMON_ARGS=(--config tf_public_cache_push --config resultstore --repo_env=TF_PYTHON_VERSION=3.10)
 TFCI_DOCKER_IMAGE=tensorflow/build:latest-python3.10
+TFCI_DOCKER_REBUILD_ARGS=(--build-arg PYTHON_VERSION=3.10 --target=devel tools/tf_sig_build_dockerfiles)
 TFCI_LIB_SUFFIX="-cpu-linux-x86_64"
 TFCI_NIGHTLY_UPDATE_VERSION_ENABLE=1
diff --git a/ci/official/envs/nightly_libtensorflow_linux_x86_cuda b/ci/official/envs/nightly_libtensorflow_linux_x86_cuda
index 728d4c961bc814..b011304fc3a14d 100644
--- a/ci/official/envs/nightly_libtensorflow_linux_x86_cuda
+++ b/ci/official/envs/nightly_libtensorflow_linux_x86_cuda
@@ -1,9 +1,9 @@
 source ci/official/envs/ci_default
 source ci/official/envs/ci_nightly_uploads
-TFCI_BAZEL_BAZELRC_ARGS=(--bazelrc ./ci/official/bazelrcs/cuda.bazelrc)
-TFCI_BAZEL_COMMON_ARGS=(--config sigbuild_remote_cache_push --config resultstore --repo_env=TF_PYTHON_VERSION=3.10)
+TFCI_BAZEL_COMMON_ARGS=(--config tf_public_cache_push --config resultstore --repo_env=TF_PYTHON_VERSION=3.10)
 TFCI_DOCKER_ARGS=(--gpus all)
 TFCI_DOCKER_IMAGE=tensorflow/build:latest-python3.10
+TFCI_DOCKER_REBUILD_ARGS=(--build-arg PYTHON_VERSION=3.10 --target=devel tools/tf_sig_build_dockerfiles)
 TFCI_LIB_SUFFIX="-gpu-linux-x86_64"
 TFCI_NIGHTLY_UPDATE_VERSION_ENABLE=1
 TFCI_NVIDIA_SMI_ENABLE=1
diff --git a/ci/official/envs/nightly_linux_x86_cpu_py310 b/ci/official/envs/nightly_linux_x86_cpu_py310
index ad090d602064c5..9c77911a190c50 100644
--- a/ci/official/envs/nightly_linux_x86_cpu_py310
+++ b/ci/official/envs/nightly_linux_x86_cpu_py310
@@ -1,7 +1,8 @@
 source ci/official/envs/ci_default
 source ci/official/envs/ci_nightly_uploads
-TFCI_BAZEL_BAZELRC_ARGS=(--bazelrc ./ci/official/bazelrcs/cpu.bazelrc)
-TFCI_BAZEL_COMMON_ARGS=(--config sigbuild_remote_cache_push --config resultstore --repo_env=TF_PYTHON_VERSION=3.10)
+TFCI_BAZEL_TARGET_SELECTING_CONFIG_PREFIX=linux_cpu
+TFCI_BAZEL_COMMON_ARGS=(--config tf_public_cache_push --config resultstore --repo_env=TF_PYTHON_VERSION=3.10)
 TFCI_BUILD_PIP_PACKAGE_ARGS=(--cpu --nightly_flag)
 TFCI_DOCKER_IMAGE=tensorflow/build:latest-python3.10
+TFCI_DOCKER_REBUILD_ARGS=(--build-arg PYTHON_VERSION=3.10 --target=devel tools/tf_sig_build_dockerfiles)
 TFCI_NIGHTLY_UPDATE_VERSION_ENABLE=1
diff --git a/ci/official/envs/nightly_linux_x86_cpu_py311 b/ci/official/envs/nightly_linux_x86_cpu_py311
index ecf46798583d43..0a5e2db2acb519 100644
--- a/ci/official/envs/nightly_linux_x86_cpu_py311
+++ b/ci/official/envs/nightly_linux_x86_cpu_py311
@@ -1,7 +1,8 @@
 source ci/official/envs/ci_default
 source ci/official/envs/ci_nightly_uploads
-TFCI_BAZEL_BAZELRC_ARGS=(--bazelrc ./ci/official/bazelrcs/cpu.bazelrc)
-TFCI_BAZEL_COMMON_ARGS=(--config sigbuild_remote_cache_push --config resultstore --repo_env=TF_PYTHON_VERSION=3.11)
+TFCI_BAZEL_TARGET_SELECTING_CONFIG_PREFIX=linux_cpu
+TFCI_BAZEL_COMMON_ARGS=(--config tf_public_cache_push --config resultstore --repo_env=TF_PYTHON_VERSION=3.11)
 TFCI_BUILD_PIP_PACKAGE_ARGS=(--cpu --nightly_flag)
 TFCI_DOCKER_IMAGE=tensorflow/build:latest-python3.11
+TFCI_DOCKER_REBUILD_ARGS=(--build-arg PYTHON_VERSION=3.11 --target=devel tools/tf_sig_build_dockerfiles)
 TFCI_NIGHTLY_UPDATE_VERSION_ENABLE=1
diff --git a/ci/official/envs/nightly_linux_x86_cpu_py39 b/ci/official/envs/nightly_linux_x86_cpu_py39
index f439e7cc87e0c3..7679944d3f0dd1 100644
--- a/ci/official/envs/nightly_linux_x86_cpu_py39
+++ b/ci/official/envs/nightly_linux_x86_cpu_py39
@@ -1,7 +1,8 @@
 source ci/official/envs/ci_default
 source ci/official/envs/ci_nightly_uploads
-TFCI_BAZEL_BAZELRC_ARGS=(--bazelrc ./ci/official/bazelrcs/cpu.bazelrc)
-TFCI_BAZEL_COMMON_ARGS=(--config sigbuild_remote_cache_push --config resultstore --repo_env=TF_PYTHON_VERSION=3.9)
+TFCI_BAZEL_TARGET_SELECTING_CONFIG_PREFIX=linux_cpu
+TFCI_BAZEL_COMMON_ARGS=(--config tf_public_cache_push --config resultstore --repo_env=TF_PYTHON_VERSION=3.9)
 TFCI_BUILD_PIP_PACKAGE_ARGS=(--cpu --nightly_flag)
 TFCI_DOCKER_IMAGE=tensorflow/build:latest-python3.9
+TFCI_DOCKER_REBUILD_ARGS=(--build-arg PYTHON_VERSION=3.9 --target=devel tools/tf_sig_build_dockerfiles)
 TFCI_NIGHTLY_UPDATE_VERSION_ENABLE=1
diff --git a/ci/official/envs/nightly_linux_x86_cuda_py310 b/ci/official/envs/nightly_linux_x86_cuda_py310
index 6a3847d45fefc6..836d187c365611 100644
--- a/ci/official/envs/nightly_linux_x86_cuda_py310
+++ b/ci/official/envs/nightly_linux_x86_cuda_py310
@@ -1,8 +1,9 @@
 source ci/official/envs/ci_default
 source ci/official/envs/ci_nightly_uploads
-TFCI_BAZEL_BAZELRC_ARGS=(--bazelrc ./ci/official/bazelrcs/cuda.bazelrc)
-TFCI_BAZEL_COMMON_ARGS=(--config sigbuild_remote_cache_push --config resultstore --repo_env=TF_PYTHON_VERSION=3.10)
+TFCI_BAZEL_TARGET_SELECTING_CONFIG_PREFIX=linux_cuda
+TFCI_BAZEL_COMMON_ARGS=(--config tf_public_cache_push --config resultstore --repo_env=TF_PYTHON_VERSION=3.10)
 TFCI_BUILD_PIP_PACKAGE_ARGS=(--nightly_flag)
 TFCI_DOCKER_ARGS=(--gpus all)
 TFCI_DOCKER_IMAGE=tensorflow/build:latest-python3.10
+TFCI_DOCKER_REBUILD_ARGS=(--build-arg PYTHON_VERSION=3.10 --target=devel tools/tf_sig_build_dockerfiles)
 TFCI_NIGHTLY_UPDATE_VERSION_ENABLE=1
diff --git a/ci/official/envs/nightly_linux_x86_cuda_py311 b/ci/official/envs/nightly_linux_x86_cuda_py311
index 46fe5077699e90..36abb67cb73acc 100644
--- a/ci/official/envs/nightly_linux_x86_cuda_py311
+++ b/ci/official/envs/nightly_linux_x86_cuda_py311
@@ -1,8 +1,9 @@
 source ci/official/envs/ci_default
 source ci/official/envs/ci_nightly_uploads
-TFCI_BAZEL_BAZELRC_ARGS=(--bazelrc ./ci/official/bazelrcs/cuda.bazelrc)
-TFCI_BAZEL_COMMON_ARGS=(--config sigbuild_remote_cache_push --config resultstore --repo_env=TF_PYTHON_VERSION=3.11)
+TFCI_BAZEL_TARGET_SELECTING_CONFIG_PREFIX=linux_cuda
+TFCI_BAZEL_COMMON_ARGS=(--config tf_public_cache_push --config resultstore --repo_env=TF_PYTHON_VERSION=3.11)
 TFCI_BUILD_PIP_PACKAGE_ARGS=(--nightly_flag)
 TFCI_DOCKER_ARGS=(--gpus all)
 TFCI_DOCKER_IMAGE=tensorflow/build:latest-python3.11
+TFCI_DOCKER_REBUILD_ARGS=(--build-arg PYTHON_VERSION=3.11 --target=devel tools/tf_sig_build_dockerfiles)
 TFCI_NIGHTLY_UPDATE_VERSION_ENABLE=1
diff --git a/ci/official/envs/nightly_linux_x86_cuda_py39 b/ci/official/envs/nightly_linux_x86_cuda_py39
index 32d8f3ad12546f..4796e3374fb68a 100644
--- a/ci/official/envs/nightly_linux_x86_cuda_py39
+++ b/ci/official/envs/nightly_linux_x86_cuda_py39
@@ -1,8 +1,9 @@
 source ci/official/envs/ci_default
 source ci/official/envs/ci_nightly_uploads
-TFCI_BAZEL_BAZELRC_ARGS=(--bazelrc ./ci/official/bazelrcs/cuda.bazelrc)
-TFCI_BAZEL_COMMON_ARGS=(--config sigbuild_remote_cache_push --config resultstore --repo_env=TF_PYTHON_VERSION=3.9)
+TFCI_BAZEL_TARGET_SELECTING_CONFIG_PREFIX=linux_cuda
+TFCI_BAZEL_COMMON_ARGS=(--config tf_public_cache_push --config resultstore --repo_env=TF_PYTHON_VERSION=3.9)
 TFCI_BUILD_PIP_PACKAGE_ARGS=(--nightly_flag)
 TFCI_DOCKER_ARGS=(--gpus all)
 TFCI_DOCKER_IMAGE=tensorflow/build:latest-python3.9
+TFCI_DOCKER_REBUILD_ARGS=(--build-arg PYTHON_VERSION=3.9 --target=devel tools/tf_sig_build_dockerfiles)
 TFCI_NIGHTLY_UPDATE_VERSION_ENABLE=1
diff --git a/ci/official/envs/nightly_linux_x86_tpu_py310 b/ci/official/envs/nightly_linux_x86_tpu_py310
index d802791f6c9f8d..95e66d9037f54f 100644
--- a/ci/official/envs/nightly_linux_x86_tpu_py310
+++ b/ci/official/envs/nightly_linux_x86_tpu_py310
@@ -1,7 +1,8 @@
 source ci/official/envs/ci_default
 source ci/official/envs/ci_nightly_uploads
-TFCI_BAZEL_BAZELRC_ARGS=(--bazelrc ./ci/official/bazelrcs/cpu.bazelrc)
-TFCI_BAZEL_COMMON_ARGS=(--config sigbuild_remote_cache_push --config resultstore --repo_env=TF_PYTHON_VERSION=3.10 --config=tpu)
+TFCI_BAZEL_TARGET_SELECTING_CONFIG_PREFIX=linux_tpu
+TFCI_BAZEL_COMMON_ARGS=(--config tf_public_cache_push --config resultstore --repo_env=TF_PYTHON_VERSION=3.10 --config=tpu)
 TFCI_BUILD_PIP_PACKAGE_ARGS=(--tpu --nightly_flag)
 TFCI_DOCKER_IMAGE=tensorflow/build:latest-python3.10
+TFCI_DOCKER_REBUILD_ARGS=(--build-arg PYTHON_VERSION=3.10 --target=devel tools/tf_sig_build_dockerfiles)
 TFCI_NIGHTLY_UPDATE_VERSION_ENABLE=1
diff --git a/ci/official/envs/nightly_linux_x86_tpu_py311 b/ci/official/envs/nightly_linux_x86_tpu_py311
index c797ad33cac6a7..57c479933e61c7 100644
--- a/ci/official/envs/nightly_linux_x86_tpu_py311
+++ b/ci/official/envs/nightly_linux_x86_tpu_py311
@@ -1,7 +1,8 @@
 source ci/official/envs/ci_default
 source ci/official/envs/ci_nightly_uploads
-TFCI_BAZEL_BAZELRC_ARGS=(--bazelrc ./ci/official/bazelrcs/cpu.bazelrc)
-TFCI_BAZEL_COMMON_ARGS=(--config sigbuild_remote_cache_push --config resultstore --repo_env=TF_PYTHON_VERSION=3.11 --config=tpu)
+TFCI_BAZEL_TARGET_SELECTING_CONFIG_PREFIX=linux_tpu
+TFCI_BAZEL_COMMON_ARGS=(--config tf_public_cache_push --config resultstore --repo_env=TF_PYTHON_VERSION=3.11 --config=tpu)
 TFCI_BUILD_PIP_PACKAGE_ARGS=(--tpu --nightly_flag)
 TFCI_DOCKER_IMAGE=tensorflow/build:latest-python3.11
+TFCI_DOCKER_REBUILD_ARGS=(--build-arg PYTHON_VERSION=3.11 --target=devel tools/tf_sig_build_dockerfiles)
 TFCI_NIGHTLY_UPDATE_VERSION_ENABLE=1
diff --git a/ci/official/envs/nightly_linux_x86_tpu_py39 b/ci/official/envs/nightly_linux_x86_tpu_py39
index f738c2cb75c216..8e4c1b25d0ef87 100644
--- a/ci/official/envs/nightly_linux_x86_tpu_py39
+++ b/ci/official/envs/nightly_linux_x86_tpu_py39
@@ -1,7 +1,8 @@
 source ci/official/envs/ci_default
 source ci/official/envs/ci_nightly_uploads
-TFCI_BAZEL_BAZELRC_ARGS=(--bazelrc ./ci/official/bazelrcs/cpu.bazelrc)
-TFCI_BAZEL_COMMON_ARGS=(--config sigbuild_remote_cache_push --config resultstore --repo_env=TF_PYTHON_VERSION=3.9 --config=tpu)
+TFCI_BAZEL_TARGET_SELECTING_CONFIG_PREFIX=linux_tpu
+TFCI_BAZEL_COMMON_ARGS=(--config tf_public_cache_push --config resultstore --repo_env=TF_PYTHON_VERSION=3.9 --config=tpu)
 TFCI_BUILD_PIP_PACKAGE_ARGS=(--tpu --nightly_flag)
 TFCI_DOCKER_IMAGE=tensorflow/build:latest-python3.9
+TFCI_DOCKER_REBUILD_ARGS=(--build-arg PYTHON_VERSION=3.9 --target=devel tools/tf_sig_build_dockerfiles)
 TFCI_NIGHTLY_UPDATE_VERSION_ENABLE=1
diff --git a/ci/official/envs/sample b/ci/official/envs/sample
index fcfc2c7fc1e76a..a64de50dd2f698 100644
--- a/ci/official/envs/sample
+++ b/ci/official/envs/sample
@@ -8,7 +8,7 @@
 # TFCI_GIT_DIR is set automatically by tensorflow/ci/official/utilities/setup.sh
 # "set +u ... set -u" ignores unknown CI variables, like $KOKORO_XYZ, since
 # they'll be overwritten anyway.
-set +u; source "$TFCI_GIT_DIR/ci/official/envs/your_choice_here"; set -u
+set +u; source ci/official/envs/your_choice_here; set -u
 
 # Reset bazel common options. This combines a local disk cache and
 # TensorFlow's remote cache to speed up your builds. The "nightly" branch has
@@ -16,15 +16,12 @@ set +u; source "$TFCI_GIT_DIR/ci/official/envs/your_choice_here"; set -u
 # different Python versions. You can add e.g. "--repo_env=TF_PYTHON_VERSION=3.9"
 # to change the Python version to anything available (including the default) in
 # tensorflow/tools/toolchains/python/python_repo.bzl.
-TFCI_BAZEL_COMMON_ARGS=(--config sigbuild_remote_cache --disk_cache=build_output/cache)
+TFCI_BAZEL_COMMON_ARGS=(--config tf_public_cache --disk_cache=build_output/cache)
 
 # Disable all CI-specific behavior. You never need any of these if you are
 # running a script locally.
 TFCI_INDEX_HTML_ENABLE=
-TFCI_UPLOAD_LIB_ENABLE=
-TFCI_UPLOAD_LIB_LATEST_ENABLE=
-TFCI_UPLOAD_WHL_GCS_ENABLE=
-TFCI_UPLOAD_WHL_PYPI_ENABLE=
+source ci/official/envs/disable_all_uploads
 
 # You can add other custom settings below.
 # 
diff --git a/ci/official/pycpp.sh b/ci/official/pycpp.sh
index 2ccf4f861950dc..6a4bd8821bbefb 100755
--- a/ci/official/pycpp.sh
+++ b/ci/official/pycpp.sh
@@ -15,7 +15,7 @@
 # ==============================================================================
 source "${BASH_SOURCE%/*}/utilities/setup.sh"
 
-tfrun bazel "${TFCI_BAZEL_BAZELRC_ARGS[@]}" test "${TFCI_BAZEL_COMMON_ARGS[@]}" --profile "$TFCI_OUTPUT_DIR/profile.json.gz" --config=pycpp
+tfrun bazel "${TFCI_BAZEL_BAZELRC_ARGS[@]}" test "${TFCI_BAZEL_COMMON_ARGS[@]}" --profile "$TFCI_OUTPUT_DIR/profile.json.gz" --config="${TFCI_BAZEL_TARGET_SELECTING_CONFIG_PREFIX}_pycpp_test"
 
 # Note: the profile can be viewed by visiting chrome://tracing in a Chrome browser.
 # See https://docs.bazel.build/versions/main/skylark/performance.html#performance-profiling
diff --git a/ci/official/utilities/docker.sh b/ci/official/utilities/docker.sh
index 5ca42c9fff6035..4d726a98097d71 100755
--- a/ci/official/utilities/docker.sh
+++ b/ci/official/utilities/docker.sh
@@ -17,6 +17,13 @@ if [[ "$TFCI_DOCKER_PULL_ENABLE" == 1 ]]; then
   docker pull "$TFCI_DOCKER_IMAGE"
 fi
 
+if [[ "$TFCI_DOCKER_REBUILD_ENABLE" == 1 ]]; then
+  DOCKER_BUILDKIT=1 docker build --cache-from "$TFCI_DOCKER_IMAGE" -t "$TFCI_DOCKER_IMAGE" "${TFCI_DOCKER_REBUILD_ARGS[@]}"
+  if [[ "$TFCI_DOCKER_REBUILD_UPLOAD_ENABLE" == 1 ]]; then
+    docker push "$TFCI_DOCKER_IMAGE"
+  fi
+fi
+
 # Keep the existing "tf" container if it's already present.
 # The container is not cleaned up automatically! Remove it with:
 # docker rm tf
diff --git a/ci/official/wheel.sh b/ci/official/wheel.sh
index f2f79d0ae09127..a92e8f2c397a9a 100755
--- a/ci/official/wheel.sh
+++ b/ci/official/wheel.sh
@@ -37,5 +37,5 @@ if [[ "$TFCI_UPLOAD_WHL_GCS_ENABLE" == 1 ]]; then
 fi
 
 if [[ "$TFCI_WHL_BAZEL_TEST_ENABLE" == 1 ]]; then
-  tfrun bazel "${TFCI_BAZEL_BAZELRC_ARGS[@]}" test "${TFCI_BAZEL_COMMON_ARGS[@]}" --config=nonpip
+  tfrun bazel "${TFCI_BAZEL_BAZELRC_ARGS[@]}" test "${TFCI_BAZEL_COMMON_ARGS[@]}" --config="${TFCI_BAZEL_TARGET_SELECTING_CONFIG_PREFIX}_wheel_test"
 fi
diff --git a/third_party/xla/.bazelrc b/third_party/xla/.bazelrc
index 98eff1ea726464..50b3372e43a32f 100644
--- a/third_party/xla/.bazelrc
+++ b/third_party/xla/.bazelrc
@@ -425,15 +425,19 @@ build:ios     --define=with_xla_support=false
 # Options when using remote execution
 # WARNING: THESE OPTIONS WONT WORK IF YOU DO NOT HAVE PROPER AUTHENTICATION AND PERMISSIONS
 
+# Allow creation of resultstore URLs for any bazel invocation
+build:resultstore --google_default_credentials
+build:resultstore --bes_backend=buildeventservice.googleapis.com
+build:resultstore --bes_instance_name="tensorflow-testing"
+build:resultstore --bes_results_url="https://source.cloud.google.com/results/invocations"
+build:resultstore --bes_timeout=600s
+
 # Flag to enable remote config
 common --experimental_repo_remote_exec
 
 # Make Bazel not try to probe the host system for a C++ toolchain.
+build:rbe_base --config=resultstore
 build:rbe_base --repo_env=BAZEL_DO_NOT_DETECT_CPP_TOOLCHAIN=1
-build:rbe_base --google_default_credentials
-build:rbe_base --bes_backend=buildeventservice.googleapis.com
-build:rbe_base --bes_results_url="https://source.cloud.google.com/results/invocations"
-build:rbe_base --bes_timeout=600s
 build:rbe_base --define=EXECUTOR=remote
 build:rbe_base --jobs=800
 build:rbe_base --remote_executor=grpcs://remotebuildexecution.googleapis.com
@@ -475,7 +479,6 @@ build:rbe_linux_cpu --repo_env=TF_PYTHON_CONFIG_REPO="@sigbuild-r2.14-clang_conf
 build:rbe_linux_cpu --python_path="/usr/bin/python3"
 # These you may need to change for your own GCP project.
 common:rbe_linux_cpu --remote_instance_name=projects/tensorflow-testing/instances/default_instance
-build:rbe_linux_cpu --bes_instance_name="tensorflow-testing"
 
 # TODO(kanglan): Remove it after toolchain update is complete.
 build:rbe_linux_cpu_old --config=rbe_linux
@@ -488,7 +491,6 @@ build:rbe_linux_cpu_old --platforms="@ubuntu20.04-gcc9_manylinux2014-cuda11.2-cu
 build:rbe_linux_cpu_old --python_path="/usr/local/bin/python3.9"
 build:rbe_linux_cpu_old --repo_env=TF_PYTHON_CONFIG_REPO="@ubuntu20.04-gcc9_manylinux2014-cuda11.2-cudnn8.1-tensorrt7.2_config_python3.9"
 common:rbe_linux_cpu_old --remote_instance_name=projects/tensorflow-testing/instances/default_instance
-build:rbe_linux_cpu_old --bes_instance_name="tensorflow-testing"
 
 build:rbe_linux_cuda --config=cuda_clang_official
 build:rbe_linux_cuda --config=rbe_linux_cpu
@@ -531,7 +533,6 @@ build:rbe_win_py39 --python_path=C:\\Python39\\python.exe
 
 # TODO(kanglan): Merge tensorflow_testing_rbe_win into rbe_win
 common:tensorflow_testing_rbe_win --remote_instance_name=projects/tensorflow-testing/instances/windows
-build:tensorflow_testing_rbe_win --bes_instance_name="tensorflow-testing"
 # END TF REMOTE BUILD EXECUTION OPTIONS
 
 # TFLite build configs for generic embedded Linux
@@ -641,3 +642,51 @@ build:android --config=no_tfrt
 build:macos   --config=no_tfrt
 build:windows --config=no_tfrt
 build:no_tfrt --deleted_packages=tensorflow/compiler/mlir/tfrt,tensorflow/compiler/mlir/tfrt/benchmarks,tensorflow/compiler/mlir/tfrt/ir,tensorflow/compiler/mlir/tfrt/ir/mlrt,tensorflow/compiler/mlir/tfrt/jit/python_binding,tensorflow/compiler/mlir/tfrt/jit/transforms,tensorflow/compiler/mlir/tfrt/python_tests,tensorflow/compiler/mlir/tfrt/tests,tensorflow/compiler/mlir/tfrt/tests/mlrt,tensorflow/compiler/mlir/tfrt/tests/ir,tensorflow/compiler/mlir/tfrt/tests/analysis,tensorflow/compiler/mlir/tfrt/tests/jit,tensorflow/compiler/mlir/tfrt/tests/lhlo_to_tfrt,tensorflow/compiler/mlir/tfrt/tests/lhlo_to_jitrt,tensorflow/compiler/mlir/tfrt/tests/tf_to_corert,tensorflow/compiler/mlir/tfrt/tests/tf_to_tfrt_data,tensorflow/compiler/mlir/tfrt/tests/saved_model,tensorflow/compiler/mlir/tfrt/transforms/lhlo_gpu_to_tfrt_gpu,tensorflow/compiler/mlir/tfrt/transforms/mlrt,tensorflow/core/runtime_fallback,tensorflow/core/runtime_fallback/conversion,tensorflow/core/runtime_fallback/kernel,tensorflow/core/runtime_fallback/opdefs,tensorflow/core/runtime_fallback/runtime,tensorflow/core/runtime_fallback/util,tensorflow/core/runtime_fallback/test,tensorflow/core/runtime_fallback/test/gpu,tensorflow/core/runtime_fallback/test/saved_model,tensorflow/core/runtime_fallback/test/testdata,tensorflow/core/tfrt/stubs,tensorflow/core/tfrt/tfrt_session,tensorflow/core/tfrt/mlrt,tensorflow/core/tfrt/mlrt/attribute,tensorflow/core/tfrt/mlrt/kernel,tensorflow/core/tfrt/mlrt/bytecode,tensorflow/core/tfrt/mlrt/interpreter,tensorflow/compiler/mlir/tfrt/translate/mlrt,tensorflow/compiler/mlir/tfrt/translate/mlrt/testdata,tensorflow/core/tfrt/gpu,tensorflow/core/tfrt/run_handler_thread_pool,tensorflow/core/tfrt/runtime,tensorflow/core/tfrt/saved_model,tensorflow/core/tfrt/graph_executor,tensorflow/core/tfrt/saved_model/tests,tensorflow/core/tfrt/tpu,tensorflow/core/tfrt/utils,tensorflow/core/tfrt/utils/debug,tensorflow/core/tfrt/saved_model/python,tensorflow/core/tfrt/graph_executor/python,tensorflow/cor
e/tfrt/saved_model/utils
+
+# BEGIN TF CACHE HELPER OPTIONS
+# Options when using remote execution
+# WARNING: THESE OPTIONS WONT WORK IF YOU DO NOT HAVE PROPER AUTHENTICATION AND PERMISSIONS
+
+# Use --config=tf_public_cache to try and use the TensorFlow public build cache
+# to build TensorFlow. Look at ci/official/envs to find which types of jobs
+# push to the cache.
+build:tf_public_cache --remote_cache="https://storage.googleapis.com/tensorflow-devinfra-bazel-cache/september2022" --remote_upload_local_results=false
+# Cache pushes are limited to TF's CI system.
+build:tf_public_cache_push --config=tf_public_cache --remote_upload_local_results=true --google_default_credentials
+
+# END TF CACHE HELPER OPTIONS
+# BEGIN TF TEST SUITE OPTIONS
+# These are convenience config options that effectively declare TF's CI test suites. Look
+# at the scripts of ci/official/ to see how TF's CI uses them.
+
+# LIBTENSORFLOW TESTS are for building Libtensorflow archives. These are CUDA/CPU-agnostic.
+test:libtensorflow_test -- //tensorflow/tools/lib_package:libtensorflow_test //tensorflow/tools/lib_package:libtensorflow_java_test
+build:libtensorflow_build -- //tensorflow/tools/lib_package:libtensorflow.tar.gz //tensorflow/tools/lib_package:libtensorflow_jni.tar.gz //tensorflow/java:libtensorflow.jar //tensorflow/java:libtensorflow-src.jar //tensorflow/tools/lib_package:libtensorflow_proto.zip
+
+# PYTHON TESTS run a suite of Python tests intended for verifying that the Python wheel
+# will work properly. These are usually run Nightly or upon Release.
+# CPU WHEEL
+test:linux_cpu_wheel_test_filters --test_tag_filters=-no_oss,-oss_excluded,-oss_serial,-gpu,-tpu,-benchmark-test,-v1only,-no_oss_py38,-no_oss_py39,-no_oss_py310
+test:linux_cpu_wheel_test_filters --build_tag_filters=-no_oss,-oss_excluded,-oss_serial,-gpu,-tpu,-benchmark-test,-v1only,-no_oss_py38,-no_oss_py39,-no_oss_py310
+test:linux_cpu_wheel_test_filters --test_lang_filters=py --test_size_filters=small,medium
+test:linux_cpu_wheel_test --config=linux_cpu_wheel_test_filters -- //tensorflow/... -//tensorflow/wheel/integration_testing/... -//tensorflow/compiler/tf2tensorrt/... -//tensorflow/compiler/xrt/... -//tensorflow/core/tpu/... -//tensorflow/lite/... -//tensorflow/tools/toolchains/...
+# CUDA WHEEL
+test:linux_cuda_wheel_test_filters --test_tag_filters=gpu,requires-gpu,-no_gpu,-no_oss,-oss_excluded,-oss_serial,-no_cuda11,-no_oss_py38,-no_oss_py39,-no_oss_py310
+test:linux_cuda_wheel_test_filters --build_tag_filters=gpu,requires-gpu,-no_gpu,-no_oss,-oss_excluded,-oss_serial,-no_cuda11,-no_oss_py38,-no_oss_py39,-no_oss_py310
+test:linux_cuda_wheel_test_filters --test_lang_filters=py --test_size_filters=small,medium
+test:linux_cuda_wheel_test --config=linux_cuda_wheel_test_filters -- //tensorflow/... -//tensorflow/wheel/integration_testing/... -//tensorflow/compiler/tf2tensorrt/... -//tensorflow/compiler/xrt/... -//tensorflow/core/tpu/... -//tensorflow/lite/... -//tensorflow/tools/toolchains/...
+
+# PYCPP TESTS run a suite of Python and C++ tests to verify general correctness over
+# the whole TF code base. These are usually run continuously or upon presubmit.
+# CPU PYCPP:
+test:linux_cpu_pycpp_test_filters --test_tag_filters=-no_oss,-oss_excluded,-oss_serial,-gpu,-tpu,-benchmark-test,-v1only
+test:linux_cpu_pycpp_test_filters --build_tag_filters=-no_oss,-oss_excluded,-oss_serial,-gpu,-tpu,-benchmark-test,-v1only
+test:linux_cpu_pycpp_test_filters --test_lang_filters=cc,py --test_size_filters=small,medium
+test:linux_cpu_pycpp_test --config=linux_cpu_pycpp_test_filters -- //tensorflow/... -//tensorflow/python/integration_testing/... -//tensorflow/compiler/tf2tensorrt/... -//tensorflow/compiler/xrt/... -//tensorflow/core/tpu/... -//tensorflow/lite/... -//tensorflow/tools/toolchains/...
+# CUDA PYCPP:
+test:linux_cuda_pycpp_test_filters --test_tag_filters=-no_oss,-oss_excluded,-oss_serial,-benchmark-test,-v1only,gpu,-no_gpu,-no_gpu_presubmit,-no_cuda11
+test:linux_cuda_pycpp_test_filters --build_tag_filters=-no_oss,-oss_excluded,-oss_serial,-benchmark-test,-v1only,gpu,-no_gpu,-no_gpu_presubmit,-no_cuda11
+test:linux_cuda_pycpp_test_filters --test_lang_filters=cc,py --test_size_filters=small,medium
+test:linux_cuda_pycpp_test --config=linux_cuda_pycpp_test_filters -- //tensorflow/... -//tensorflow/python/integration_testing/... -//tensorflow/compiler/tf2tensorrt/... -//tensorflow/compiler/xrt/... -//tensorflow/core/tpu/... -//tensorflow/lite/... -//tensorflow/tools/toolchains/...
+
+# END TF TEST SUITE OPTIONS
diff --git a/third_party/xla/third_party/tsl/.bazelrc b/third_party/xla/third_party/tsl/.bazelrc
index 98eff1ea726464..50b3372e43a32f 100644
--- a/third_party/xla/third_party/tsl/.bazelrc
+++ b/third_party/xla/third_party/tsl/.bazelrc
@@ -425,15 +425,19 @@ build:ios     --define=with_xla_support=false
 # Options when using remote execution
 # WARNING: THESE OPTIONS WONT WORK IF YOU DO NOT HAVE PROPER AUTHENTICATION AND PERMISSIONS
 
+# Allow creation of resultstore URLs for any bazel invocation
+build:resultstore --google_default_credentials
+build:resultstore --bes_backend=buildeventservice.googleapis.com
+build:resultstore --bes_instance_name="tensorflow-testing"
+build:resultstore --bes_results_url="https://source.cloud.google.com/results/invocations"
+build:resultstore --bes_timeout=600s
+
 # Flag to enable remote config
 common --experimental_repo_remote_exec
 
 # Make Bazel not try to probe the host system for a C++ toolchain.
+build:rbe_base --config=resultstore
 build:rbe_base --repo_env=BAZEL_DO_NOT_DETECT_CPP_TOOLCHAIN=1
-build:rbe_base --google_default_credentials
-build:rbe_base --bes_backend=buildeventservice.googleapis.com
-build:rbe_base --bes_results_url="https://source.cloud.google.com/results/invocations"
-build:rbe_base --bes_timeout=600s
 build:rbe_base --define=EXECUTOR=remote
 build:rbe_base --jobs=800
 build:rbe_base --remote_executor=grpcs://remotebuildexecution.googleapis.com
@@ -475,7 +479,6 @@ build:rbe_linux_cpu --repo_env=TF_PYTHON_CONFIG_REPO="@sigbuild-r2.14-clang_conf
 build:rbe_linux_cpu --python_path="/usr/bin/python3"
 # These you may need to change for your own GCP project.
 common:rbe_linux_cpu --remote_instance_name=projects/tensorflow-testing/instances/default_instance
-build:rbe_linux_cpu --bes_instance_name="tensorflow-testing"
 
 # TODO(kanglan): Remove it after toolchain update is complete.
 build:rbe_linux_cpu_old --config=rbe_linux
@@ -488,7 +491,6 @@ build:rbe_linux_cpu_old --platforms="@ubuntu20.04-gcc9_manylinux2014-cuda11.2-cu
 build:rbe_linux_cpu_old --python_path="/usr/local/bin/python3.9"
 build:rbe_linux_cpu_old --repo_env=TF_PYTHON_CONFIG_REPO="@ubuntu20.04-gcc9_manylinux2014-cuda11.2-cudnn8.1-tensorrt7.2_config_python3.9"
 common:rbe_linux_cpu_old --remote_instance_name=projects/tensorflow-testing/instances/default_instance
-build:rbe_linux_cpu_old --bes_instance_name="tensorflow-testing"
 
 build:rbe_linux_cuda --config=cuda_clang_official
 build:rbe_linux_cuda --config=rbe_linux_cpu
@@ -531,7 +533,6 @@ build:rbe_win_py39 --python_path=C:\\Python39\\python.exe
 
 # TODO(kanglan): Merge tensorflow_testing_rbe_win into rbe_win
 common:tensorflow_testing_rbe_win --remote_instance_name=projects/tensorflow-testing/instances/windows
-build:tensorflow_testing_rbe_win --bes_instance_name="tensorflow-testing"
 # END TF REMOTE BUILD EXECUTION OPTIONS
 
 # TFLite build configs for generic embedded Linux
@@ -641,3 +642,51 @@ build:android --config=no_tfrt
 build:macos   --config=no_tfrt
 build:windows --config=no_tfrt
 build:no_tfrt --deleted_packages=tensorflow/compiler/mlir/tfrt,tensorflow/compiler/mlir/tfrt/benchmarks,tensorflow/compiler/mlir/tfrt/ir,tensorflow/compiler/mlir/tfrt/ir/mlrt,tensorflow/compiler/mlir/tfrt/jit/python_binding,tensorflow/compiler/mlir/tfrt/jit/transforms,tensorflow/compiler/mlir/tfrt/python_tests,tensorflow/compiler/mlir/tfrt/tests,tensorflow/compiler/mlir/tfrt/tests/mlrt,tensorflow/compiler/mlir/tfrt/tests/ir,tensorflow/compiler/mlir/tfrt/tests/analysis,tensorflow/compiler/mlir/tfrt/tests/jit,tensorflow/compiler/mlir/tfrt/tests/lhlo_to_tfrt,tensorflow/compiler/mlir/tfrt/tests/lhlo_to_jitrt,tensorflow/compiler/mlir/tfrt/tests/tf_to_corert,tensorflow/compiler/mlir/tfrt/tests/tf_to_tfrt_data,tensorflow/compiler/mlir/tfrt/tests/saved_model,tensorflow/compiler/mlir/tfrt/transforms/lhlo_gpu_to_tfrt_gpu,tensorflow/compiler/mlir/tfrt/transforms/mlrt,tensorflow/core/runtime_fallback,tensorflow/core/runtime_fallback/conversion,tensorflow/core/runtime_fallback/kernel,tensorflow/core/runtime_fallback/opdefs,tensorflow/core/runtime_fallback/runtime,tensorflow/core/runtime_fallback/util,tensorflow/core/runtime_fallback/test,tensorflow/core/runtime_fallback/test/gpu,tensorflow/core/runtime_fallback/test/saved_model,tensorflow/core/runtime_fallback/test/testdata,tensorflow/core/tfrt/stubs,tensorflow/core/tfrt/tfrt_session,tensorflow/core/tfrt/mlrt,tensorflow/core/tfrt/mlrt/attribute,tensorflow/core/tfrt/mlrt/kernel,tensorflow/core/tfrt/mlrt/bytecode,tensorflow/core/tfrt/mlrt/interpreter,tensorflow/compiler/mlir/tfrt/translate/mlrt,tensorflow/compiler/mlir/tfrt/translate/mlrt/testdata,tensorflow/core/tfrt/gpu,tensorflow/core/tfrt/run_handler_thread_pool,tensorflow/core/tfrt/runtime,tensorflow/core/tfrt/saved_model,tensorflow/core/tfrt/graph_executor,tensorflow/core/tfrt/saved_model/tests,tensorflow/core/tfrt/tpu,tensorflow/core/tfrt/utils,tensorflow/core/tfrt/utils/debug,tensorflow/core/tfrt/saved_model/python,tensorflow/core/tfrt/graph_executor/python,tensorflow/cor
e/tfrt/saved_model/utils
+
+# BEGIN TF CACHE HELPER OPTIONS
+# Options when using remote execution
+# WARNING: THESE OPTIONS WONT WORK IF YOU DO NOT HAVE PROPER AUTHENTICATION AND PERMISSIONS
+
+# Use --config=tf_public_cache to try and use the TensorFlow public build cache
+# to build TensorFlow. Look at ci/official/envs to find which types of jobs
+# push to the cache.
+build:tf_public_cache --remote_cache="https://storage.googleapis.com/tensorflow-devinfra-bazel-cache/september2022" --remote_upload_local_results=false
+# Cache pushes are limited to TF's CI system.
+build:tf_public_cache_push --config=tf_public_cache --remote_upload_local_results=true --google_default_credentials
+
+# END TF CACHE HELPER OPTIONS
+# BEGIN TF TEST SUITE OPTIONS
+# These are convenience config options that effectively declare TF's CI test suites. Look
+# at the scripts of ci/official/ to see how TF's CI uses them.
+
+# LIBTENSORFLOW TESTS are for building Libtensorflow archives. These are CUDA/CPU-agnostic.
+test:libtensorflow_test -- //tensorflow/tools/lib_package:libtensorflow_test //tensorflow/tools/lib_package:libtensorflow_java_test
+build:libtensorflow_build -- //tensorflow/tools/lib_package:libtensorflow.tar.gz //tensorflow/tools/lib_package:libtensorflow_jni.tar.gz //tensorflow/java:libtensorflow.jar //tensorflow/java:libtensorflow-src.jar //tensorflow/tools/lib_package:libtensorflow_proto.zip
+
+# PYTHON TESTS run a suite of Python tests intended for verifying that the Python wheel
+# will work properly. These are usually run Nightly or upon Release.
+# CPU WHEEL
+test:linux_cpu_wheel_test_filters --test_tag_filters=-no_oss,-oss_excluded,-oss_serial,-gpu,-tpu,-benchmark-test,-v1only,-no_oss_py38,-no_oss_py39,-no_oss_py310
+test:linux_cpu_wheel_test_filters --build_tag_filters=-no_oss,-oss_excluded,-oss_serial,-gpu,-tpu,-benchmark-test,-v1only,-no_oss_py38,-no_oss_py39,-no_oss_py310
+test:linux_cpu_wheel_test_filters --test_lang_filters=py --test_size_filters=small,medium
+test:linux_cpu_wheel_test --config=linux_cpu_wheel_test_filters -- //tensorflow/... -//tensorflow/wheel/integration_testing/... -//tensorflow/compiler/tf2tensorrt/... -//tensorflow/compiler/xrt/... -//tensorflow/core/tpu/... -//tensorflow/lite/... -//tensorflow/tools/toolchains/...
+# CUDA WHEEL
+test:linux_cuda_wheel_test_filters --test_tag_filters=gpu,requires-gpu,-no_gpu,-no_oss,-oss_excluded,-oss_serial,-no_cuda11,-no_oss_py38,-no_oss_py39,-no_oss_py310
+test:linux_cuda_wheel_test_filters --build_tag_filters=gpu,requires-gpu,-no_gpu,-no_oss,-oss_excluded,-oss_serial,-no_cuda11,-no_oss_py38,-no_oss_py39,-no_oss_py310
+test:linux_cuda_wheel_test_filters --test_lang_filters=py --test_size_filters=small,medium
+test:linux_cuda_wheel_test --config=linux_cuda_wheel_test_filters -- //tensorflow/... -//tensorflow/wheel/integration_testing/... -//tensorflow/compiler/tf2tensorrt/... -//tensorflow/compiler/xrt/... -//tensorflow/core/tpu/... -//tensorflow/lite/... -//tensorflow/tools/toolchains/...
+
+# PYCPP TESTS run a suite of Python and C++ tests to verify general correctness over
+# the whole TF code base. These are usually run continuously or upon presubmit.
+# CPU PYCPP:
+test:linux_cpu_pycpp_test_filters --test_tag_filters=-no_oss,-oss_excluded,-oss_serial,-gpu,-tpu,-benchmark-test,-v1only
+test:linux_cpu_pycpp_test_filters --build_tag_filters=-no_oss,-oss_excluded,-oss_serial,-gpu,-tpu,-benchmark-test,-v1only
+test:linux_cpu_pycpp_test_filters --test_lang_filters=cc,py --test_size_filters=small,medium
+test:linux_cpu_pycpp_test --config=linux_cpu_pycpp_test_filters -- //tensorflow/... -//tensorflow/python/integration_testing/... -//tensorflow/compiler/tf2tensorrt/... -//tensorflow/compiler/xrt/... -//tensorflow/core/tpu/... -//tensorflow/lite/... -//tensorflow/tools/toolchains/...
+# CUDA PYCPP:
+test:linux_cuda_pycpp_test_filters --test_tag_filters=-no_oss,-oss_excluded,-oss_serial,-benchmark-test,-v1only,gpu,-no_gpu,-no_gpu_presubmit,-no_cuda11
+test:linux_cuda_pycpp_test_filters --build_tag_filters=-no_oss,-oss_excluded,-oss_serial,-benchmark-test,-v1only,gpu,-no_gpu,-no_gpu_presubmit,-no_cuda11
+test:linux_cuda_pycpp_test_filters --test_lang_filters=cc,py --test_size_filters=small,medium
+test:linux_cuda_pycpp_test --config=linux_cuda_pycpp_test_filters -- //tensorflow/... -//tensorflow/python/integration_testing/... -//tensorflow/compiler/tf2tensorrt/... -//tensorflow/compiler/xrt/... -//tensorflow/core/tpu/... -//tensorflow/lite/... -//tensorflow/tools/toolchains/...
+
+# END TF TEST SUITE OPTIONS

From 38a9352c0ec79bc200222ea5e6ad4cdd2d7fa72f Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 26 Sep 2023 09:15:30 -0700
Subject: [PATCH 270/567] Update TFRT dependency to use revision
 http://github.com/tensorflow/runtime/commit/ad4ba135f9a5ca24862daf7c93e4a3b6e53c69c8.

PiperOrigin-RevId: 568559126
---
 third_party/tf_runtime/workspace.bzl                          | 4 ++--
 third_party/xla/third_party/tf_runtime/workspace.bzl          | 4 ++--
 .../xla/third_party/tsl/third_party/tf_runtime/workspace.bzl  | 4 ++--
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/third_party/tf_runtime/workspace.bzl b/third_party/tf_runtime/workspace.bzl
index 94cc1beb5b2f9c..f0af407c117416 100644
--- a/third_party/tf_runtime/workspace.bzl
+++ b/third_party/tf_runtime/workspace.bzl
@@ -6,8 +6,8 @@ def repo():
     """Imports TFRT."""
 
     # Attention: tools parse and update these lines.
-    TFRT_COMMIT = "cf730a32058241ae24fd12fe0a9a0258faa0ca89"
-    TFRT_SHA256 = "7fcc709032d16aa5f2cf04e0c4cbcdf9e0dbafb8377e141d2c0074f24ede4066"
+    TFRT_COMMIT = "ad4ba135f9a5ca24862daf7c93e4a3b6e53c69c8"
+    TFRT_SHA256 = "f523256aa10d35e9c2f8c2d7e5674e8496aef7fd2a6547942696462d0d0497f6"
 
     tf_http_archive(
         name = "tf_runtime",
diff --git a/third_party/xla/third_party/tf_runtime/workspace.bzl b/third_party/xla/third_party/tf_runtime/workspace.bzl
index 94cc1beb5b2f9c..f0af407c117416 100644
--- a/third_party/xla/third_party/tf_runtime/workspace.bzl
+++ b/third_party/xla/third_party/tf_runtime/workspace.bzl
@@ -6,8 +6,8 @@ def repo():
     """Imports TFRT."""
 
     # Attention: tools parse and update these lines.
-    TFRT_COMMIT = "cf730a32058241ae24fd12fe0a9a0258faa0ca89"
-    TFRT_SHA256 = "7fcc709032d16aa5f2cf04e0c4cbcdf9e0dbafb8377e141d2c0074f24ede4066"
+    TFRT_COMMIT = "ad4ba135f9a5ca24862daf7c93e4a3b6e53c69c8"
+    TFRT_SHA256 = "f523256aa10d35e9c2f8c2d7e5674e8496aef7fd2a6547942696462d0d0497f6"
 
     tf_http_archive(
         name = "tf_runtime",
diff --git a/third_party/xla/third_party/tsl/third_party/tf_runtime/workspace.bzl b/third_party/xla/third_party/tsl/third_party/tf_runtime/workspace.bzl
index 94cc1beb5b2f9c..f0af407c117416 100644
--- a/third_party/xla/third_party/tsl/third_party/tf_runtime/workspace.bzl
+++ b/third_party/xla/third_party/tsl/third_party/tf_runtime/workspace.bzl
@@ -6,8 +6,8 @@ def repo():
     """Imports TFRT."""
 
     # Attention: tools parse and update these lines.
-    TFRT_COMMIT = "cf730a32058241ae24fd12fe0a9a0258faa0ca89"
-    TFRT_SHA256 = "7fcc709032d16aa5f2cf04e0c4cbcdf9e0dbafb8377e141d2c0074f24ede4066"
+    TFRT_COMMIT = "ad4ba135f9a5ca24862daf7c93e4a3b6e53c69c8"
+    TFRT_SHA256 = "f523256aa10d35e9c2f8c2d7e5674e8496aef7fd2a6547942696462d0d0497f6"
 
     tf_http_archive(
         name = "tf_runtime",

From f16b9c216905ed9443b8c1e5a37a45efe66b76b3 Mon Sep 17 00:00:00 2001
From: Gunhyun Park <gunhyun@google.com>
Date: Tue, 26 Sep 2023 10:14:46 -0700
Subject: [PATCH 271/567] Clean up dependency and disable test from running
 under msan.

PiperOrigin-RevId: 568576434
---
 third_party/xla/xla/tools/BUILD | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/third_party/xla/xla/tools/BUILD b/third_party/xla/xla/tools/BUILD
index a5f6c0a75abed2..8dda4c1ebb9c34 100644
--- a/third_party/xla/xla/tools/BUILD
+++ b/third_party/xla/xla/tools/BUILD
@@ -276,7 +276,6 @@ xla_cc_binary(
 
 cc_library(
     name = "hlo_expand_main",
-    testonly = True,
     srcs = ["hlo_expand_main.cc"],
     visibility = ["//visibility:public"],
     deps = [
@@ -309,8 +308,6 @@ cc_library(
         "//xla/service:rng_bit_generator_expander",
         "//xla/service:rng_expander",
         "//xla/service:triangular_solve_expander",
-        "//xla/stream_executor:dnn",
-        "//xla/stream_executor:platform",
         "@local_tsl//tsl/util:command_line_flags",
     ],
 )
@@ -320,9 +317,12 @@ xla_cc_test(
     srcs = ["tests/hlo_expand_test.cc"],
     data = [
         "tests/cholesky.hlo",
+        ":hlo-expand",
+    ],
+    tags = [
+        "nomsan",  # No msan for precompiled Nvidia binaries.
     ],
     deps = [
-        ":hlo-expand",
         "@com_google_googletest//:gtest",
         "@local_tsl//tsl/platform:path",
         "@local_tsl//tsl/platform:subprocess",

From 6bb02ebc7d9c88e35c0ef79135de7cc029e92863 Mon Sep 17 00:00:00 2001
From: Denali Molitor <dmolitor@google.com>
Date: Tue, 26 Sep 2023 10:21:10 -0700
Subject: [PATCH 272/567] [HloValueSemanticsAnalysis] Handle while loops in
 einsum depth analysis.

PiperOrigin-RevId: 568578511
---
 third_party/xla/xla/service/BUILD             |   4 +
 .../service/hlo_value_semantics_analysis.cc   |  55 +++-
 .../service/hlo_value_semantics_analysis.h    |   2 +
 .../hlo_value_semantics_analysis_test.cc      | 299 ++++++++++--------
 4 files changed, 209 insertions(+), 151 deletions(-)

diff --git a/third_party/xla/xla/service/BUILD b/third_party/xla/xla/service/BUILD
index f20a5fcf8e77b8..2228ee9c084e42 100644
--- a/third_party/xla/xla/service/BUILD
+++ b/third_party/xla/xla/service/BUILD
@@ -4230,6 +4230,10 @@ xla_cc_test(
         "//xla/hlo/ir:hlo",
         "//xla/tests:hlo_test_base",
         "//xla/tests:xla_internal_test_main",
+        "@com_google_absl//absl/log",
+        "@com_google_absl//absl/strings",
+        "@com_google_googletest//:gtest",
+        "@local_tsl//tsl/platform:statusor",
     ],
 )
 
diff --git a/third_party/xla/xla/service/hlo_value_semantics_analysis.cc b/third_party/xla/xla/service/hlo_value_semantics_analysis.cc
index 41fdff9e6d5aaa..b6252902957138 100644
--- a/third_party/xla/xla/service/hlo_value_semantics_analysis.cc
+++ b/third_party/xla/xla/service/hlo_value_semantics_analysis.cc
@@ -111,14 +111,12 @@ Status EinsumDepthAnalysis::RunInternal(
   for (HloInstruction* root : roots) {
     if (root == computation.root_instruction()) {
       if (root_depth.has_value()) {
-        einsum_depth_map_.insert(std::make_pair(root, *root_depth));
+        TF_RETURN_IF_ERROR(SetInstructionDepth(root, *root_depth));
       } else {
-        ShapeTree<int> depth(root->shape(), 0);
-        einsum_depth_map_.insert(std::make_pair(root, std::move(depth)));
+        TF_RETURN_IF_ERROR(SetInstructionDepth(root, 0));
       }
     } else {
-      ShapeTree<int> depth(root->shape(), -1);
-      einsum_depth_map_.insert(std::make_pair(root, std::move(depth)));
+      GetOrCreateDepthTree(root);
     }
   }
   HloPreOrderDFS dfs;
@@ -222,6 +220,14 @@ Status EinsumDepthAnalysis::SetInstructionDepth(HloInstruction* instruction,
   return OkStatus();
 }
 
+Status EinsumDepthAnalysis::SetInstructionDepth(HloInstruction* instruction,
+                                                const ShapeTree<int>& depth) {
+  // Hands `depth` to SetDepth for the tree GetOrCreateDepthTree returns.
+  ShapeTree<int>& tracked_depth = GetOrCreateDepthTree(instruction)->second;
+  SetDepth(tracked_depth, depth);
+  return OkStatus();
+}
+
 Status EinsumDepthAnalysis::DefaultAction(HloInstruction* instruction) {
   if (!instruction->shape().IsToken() && !instruction->shape().IsArray() &&
       !instruction->shape().IsTuple()) {
@@ -239,9 +245,7 @@ Status EinsumDepthAnalysis::DefaultAction(HloInstruction* instruction) {
   if (instruction->operand_count() == 1) {
     HloInstruction* operand = instruction->mutable_operand(0);
     if (Shape::Equal().IgnoreLayout()(instruction->shape(), operand->shape())) {
-      auto operand_depth_iter = GetOrCreateDepthTree(operand);
-      ShapeTree<int>& operand_depth = operand_depth_iter->second;
-      SetDepth(operand_depth, depth_tree);
+      TF_RETURN_IF_ERROR(SetInstructionDepth(operand, depth_tree));
       return OkStatus();
     }
   }
@@ -375,9 +379,31 @@ Status EinsumDepthAnalysis::HandleWhile(HloInstruction* xla_while) {
   TF_RETURN_IF_ERROR(HandleCalledComputation(
       *condition_computation, condition_depth, xla_while->operands()));
   HloComputation* body_computation = xla_while->while_body();
-  Status status = HandleCalledComputation(*body_computation, depth_tree,
-                                          xla_while->operands());
-  return status;
+  TF_RETURN_IF_ERROR(HandleCalledComputation(*body_computation, depth_tree,
+                                             xla_while->operands()));
+  // Elements of while loop outputs may only be used within the while loop.
+  // Set the depth of the while body outputs to have the max of their original
+  // depth and their corresponding operand depth if their original depth was
+  // negative. Then recompute while loop instruction depths.
+  auto body_depth_iter =
+      GetOrCreateDepthTree(body_computation->root_instruction());
+  ShapeTree<int>& body_depth = body_depth_iter->second;
+  // Note: while body computations have a single parameter. See
+  // ShapeVerifier::HandleWhile.
+  HloInstruction* operand = body_computation->parameter_instruction(0);
+  auto operand_depth = GetOrCreateDepthTree(operand)->second;
+  body_depth.ForEachMutableElement(
+      [&body_depth, &operand_depth](const ShapeIndex& shape_index,
+                                    int* depth_ptr) {
+        if (body_depth.IsLeaf(shape_index)) {
+          if (body_depth.element(shape_index) < 0 &&
+              operand_depth.element(shape_index) >= 0) {
+            *depth_ptr = 0;
+          }
+        }
+      });
+  return HandleCalledComputation(*body_computation, body_depth,
+                                 xla_while->operands());
 }
 
 Status EinsumDepthAnalysis::HandleConditional(HloInstruction* conditional) {
@@ -398,8 +424,7 @@ Status EinsumDepthAnalysis::HandleCalledComputation(
     HloInstruction* parameter = called_computation.parameter_instruction(i);
     const ShapeTree<int> parameter_depth =
         GetOrCreateDepthTree(parameter)->second;
-    ShapeTree<int>& operand_depth = GetOrCreateDepthTree(operand)->second;
-    SetDepth(operand_depth, parameter_depth);
+    TF_RETURN_IF_ERROR(SetInstructionDepth(operand, parameter_depth));
   }
   return OkStatus();
 }
@@ -414,9 +439,7 @@ Status EinsumDepthAnalysis::HandleOutfeed(HloInstruction* outfeed) {
   const ShapeTree<int> depth_tree = depth_iter->second;
   int max_depth = GetMaxDepth(depth_tree);
   for (HloInstruction* operand : outfeed->mutable_operands()) {
-    auto operand_depth_iter = GetOrCreateDepthTree(operand);
-    ShapeTree<int>& operand_depth = operand_depth_iter->second;
-    SetDepth(operand_depth, max_depth);
+    TF_RETURN_IF_ERROR(SetInstructionDepth(operand, max_depth));
   }
   return OkStatus();
 }
diff --git a/third_party/xla/xla/service/hlo_value_semantics_analysis.h b/third_party/xla/xla/service/hlo_value_semantics_analysis.h
index fdc1430c171b29..c0491079555a90 100644
--- a/third_party/xla/xla/service/hlo_value_semantics_analysis.h
+++ b/third_party/xla/xla/service/hlo_value_semantics_analysis.h
@@ -92,6 +92,8 @@ class EinsumDepthAnalysis : public DfsHloVisitorWithDefault {
                      const std::optional<ShapeTree<int>>& root_depth);
   EinsumDepthMap::iterator GetOrCreateDepthTree(HloInstruction* instruction);
   Status SetInstructionDepth(HloInstruction* instruction, int depth);
+  Status SetInstructionDepth(HloInstruction* instruction,
+                             const ShapeTree<int>& depth);
   Status HandleDepthIncrementInstruction(HloInstruction* instruction);
   Status HandleCalledComputation(const HloComputation& called_computation,
                                  const ShapeTree<int>& root_depth,
diff --git a/third_party/xla/xla/service/hlo_value_semantics_analysis_test.cc b/third_party/xla/xla/service/hlo_value_semantics_analysis_test.cc
index 651130f4384d59..41c934f48523a9 100644
--- a/third_party/xla/xla/service/hlo_value_semantics_analysis_test.cc
+++ b/third_party/xla/xla/service/hlo_value_semantics_analysis_test.cc
@@ -15,15 +15,146 @@ limitations under the License.
 
 #include "xla/service/hlo_value_semantics_analysis.h"
 
+#include <memory>
 #include <string>
 
+#include <gtest/gtest.h>
+#include "absl/log/log.h"
+#include "absl/strings/string_view.h"
 #include "xla/hlo/ir/hlo_computation.h"
 #include "xla/hlo/ir/hlo_instruction.h"
 #include "xla/tests/hlo_test_base.h"
+#include "tsl/platform/statusor.h"
 
 namespace xla {
 namespace {
 
+const char kMnistHlo[] = R"(
+HloModule MnistTrainingLoopWithInfeed.140, entry_computation_layout={(f32[784,128]{1,0:T(8,128)},f32[128]{0:T(256)},f32[128,32]{1,0:T(8,128)},f32[32]{0:T(256)},f32[32,10]{1,0:T(8,128)},f32[10]{0:T(256)})->(f32[784,128]{1,0:T(8,128)}, f32[128]{0:T(256)}, f32[128,32]{1,0:T(8,128)}, f32[32]{0:T(256)}, f32[32,10]{1,0:T(8,128)}, /*index=5*/f32[10]{0:T(256)})}
+
+relu.9 {
+  x.10 = f32[] parameter(0)
+  constant.11 = f32[] constant(0)
+  ROOT maximum.12 = f32[] maximum(x.10, constant.11)
+}
+
+max_F32.17 {
+  lhs.18 = f32[] parameter(0)
+  rhs.19 = f32[] parameter(1)
+  ROOT maximum.20 = f32[] maximum(lhs.18, rhs.19)
+}
+
+add_F32.1 {
+  lhs.22 = f32[] parameter(0)
+  rhs.23 = f32[] parameter(1)
+  ROOT add.24 = f32[] add(lhs.22, rhs.23)
+}
+
+relu_gradients.29 {
+  activation.30 = f32[] parameter(0)
+  constant.32 = f32[] constant(0)
+  compare.33 = pred[] compare(activation.30, constant.32), direction=GT
+  backprop.31 = f32[] parameter(1)
+  ROOT select.34 = f32[] select(compare.33, backprop.31, constant.32)
+}
+
+body.49 {
+  after-all.51 = token[] after-all()
+  infeed.52 = ((f32[100,784]{1,0}, f32[100,10]{1,0}, pred[]), token[]) infeed(after-all.51)
+  get.53 = (f32[100,784]{1,0}, f32[100,10]{1,0}, pred[]) get-tuple-element(infeed.52), index=0
+  get.54 = f32[100,784]{1,0} get-tuple-element(get.53), index=0
+  prev.50 = (f32[784,128]{1,0}, f32[128]{0}, f32[128,32]{1,0}, f32[32]{0}, f32[32,10]{1,0}, /*index=5*/f32[10]{0}, pred[]) parameter(0)
+  get.57 = f32[784,128]{1,0} get-tuple-element(prev.50), index=0
+  dot.63 = f32[100,128]{1,0} dot(get.54, get.57), lhs_contracting_dims={1}, rhs_contracting_dims={0}
+  get.58 = f32[128]{0} get-tuple-element(prev.50), index=1
+  broadcast.64 = f32[100,128]{1,0} broadcast(get.58), dimensions={1}
+  add.65 = f32[100,128]{1,0} add(dot.63, broadcast.64)
+  map.66 = f32[100,128]{1,0} map(add.65), dimensions={0,1}, to_apply=relu.9
+  get.59 = f32[128,32]{1,0} get-tuple-element(prev.50), index=2
+  dot.67 = f32[100,32]{1,0} dot(map.66, get.59), lhs_contracting_dims={1}, rhs_contracting_dims={0}
+  get.60 = f32[32]{0} get-tuple-element(prev.50), index=3
+  broadcast.68 = f32[100,32]{1,0} broadcast(get.60), dimensions={1}
+  add.69 = f32[100,32]{1,0} add(dot.67, broadcast.68)
+  map.70 = f32[100,32]{1,0} map(add.69), dimensions={0,1}, to_apply=relu.9
+  get.61 = f32[32,10]{1,0} get-tuple-element(prev.50), index=4
+  dot.71 = f32[100,10]{1,0} dot(map.70, get.61), lhs_contracting_dims={1}, rhs_contracting_dims={0}
+  get.62 = f32[10]{0} get-tuple-element(prev.50), index=5
+  broadcast.72 = f32[100,10]{1,0} broadcast(get.62), dimensions={1}
+  add.73 = f32[100,10]{1,0} add(dot.71, broadcast.72)
+  constant.74 = f32[] constant(-inf)
+  reduce.75 = f32[100]{0} reduce(add.73, constant.74), dimensions={1}, to_apply=max_F32.17
+  broadcast.76 = f32[100,10]{1,0} broadcast(reduce.75), dimensions={0}
+  subtract.77 = f32[100,10]{1,0} subtract(add.73, broadcast.76)
+  exponential.78 = f32[100,10]{1,0} exponential(subtract.77)
+  constant.79 = f32[] constant(0)
+  reduce.80 = f32[100]{0} reduce(exponential.78, constant.79), dimensions={1}, to_apply=add_F32.1
+  broadcast.81 = f32[100,10]{1,0} broadcast(reduce.80), dimensions={0}
+  divide.82 = f32[100,10]{1,0} divide(exponential.78, broadcast.81)
+  get.55 = f32[100,10]{1,0} get-tuple-element(get.53), index=1
+  subtract.83 = f32[100,10]{1,0} subtract(divide.82, get.55)
+  transpose.88 = f32[10,32]{0,1} transpose(get.61), dimensions={1,0}
+  dot.89 = f32[100,32]{1,0} dot(subtract.83, transpose.88), lhs_contracting_dims={1}, rhs_contracting_dims={0}
+  map.90 = f32[100,32]{1,0} map(map.70, dot.89), dimensions={0,1}, to_apply=relu_gradients.29
+  transpose.95 = f32[32,128]{0,1} transpose(get.59), dimensions={1,0}
+  dot.96 = f32[100,128]{1,0} dot(map.90, transpose.95), lhs_contracting_dims={1}, rhs_contracting_dims={0}
+  map.97 = f32[100,128]{1,0} map(map.66, dot.96), dimensions={0,1}, to_apply=relu_gradients.29
+  transpose.98 = f32[784,100]{0,1} transpose(get.54), dimensions={1,0}
+  dot.99 = f32[784,128]{1,0} dot(transpose.98, map.97), lhs_contracting_dims={1}, rhs_contracting_dims={0}
+  constant.104 = f32[] constant(0.01)
+  broadcast.105 = f32[784,128]{1,0} broadcast(constant.104), dimensions={}
+  multiply.106 = f32[784,128]{1,0} multiply(dot.99, broadcast.105)
+  subtract.107 = f32[784,128]{1,0} subtract(get.57, multiply.106)
+  reduce.101 = f32[128]{0} reduce(map.97, constant.79), dimensions={0}, to_apply=add_F32.1
+  broadcast.109 = f32[128]{0} broadcast(constant.104), dimensions={}
+  multiply.110 = f32[128]{0} multiply(reduce.101, broadcast.109)
+  subtract.111 = f32[128]{0} subtract(get.58, multiply.110)
+  transpose.91 = f32[128,100]{0,1} transpose(map.66), dimensions={1,0}
+  dot.92 = f32[128,32]{1,0} dot(transpose.91, map.90), lhs_contracting_dims={1}, rhs_contracting_dims={0}
+  broadcast.113 = f32[128,32]{1,0} broadcast(constant.104), dimensions={}
+  multiply.114 = f32[128,32]{1,0} multiply(dot.92, broadcast.113)
+  subtract.115 = f32[128,32]{1,0} subtract(get.59, multiply.114)
+  reduce.94 = f32[32]{0} reduce(map.90, constant.79), dimensions={0}, to_apply=add_F32.1
+  broadcast.117 = f32[32]{0} broadcast(constant.104), dimensions={}
+  multiply.118 = f32[32]{0} multiply(reduce.94, broadcast.117)
+  subtract.119 = f32[32]{0} subtract(get.60, multiply.118)
+  transpose.84 = f32[32,100]{0,1} transpose(map.70), dimensions={1,0}
+  dot.85 = f32[32,10]{1,0} dot(transpose.84, subtract.83), lhs_contracting_dims={1}, rhs_contracting_dims={0}
+  broadcast.121 = f32[32,10]{1,0} broadcast(constant.104), dimensions={}
+  multiply.122 = f32[32,10]{1,0} multiply(dot.85, broadcast.121)
+  subtract.123 = f32[32,10]{1,0} subtract(get.61, multiply.122)
+  reduce.87 = f32[10]{0} reduce(subtract.83, constant.79), dimensions={0}, to_apply=add_F32.1
+  broadcast.125 = f32[10]{0} broadcast(constant.104), dimensions={}
+  multiply.126 = f32[10]{0} multiply(reduce.87, broadcast.125)
+  subtract.127 = f32[10]{0} subtract(get.62, multiply.126)
+  get.56 = pred[] get-tuple-element(get.53), index=2
+  ROOT tuple.128 = (f32[784,128]{1,0}, f32[128]{0}, f32[128,32]{1,0}, f32[32]{0}, f32[32,10]{1,0}, /*index=5*/f32[10]{0}, pred[]) tuple(subtract.107, subtract.111, subtract.115, subtract.119, subtract.123, subtract.127, get.56)
+}
+
+condition.129 {
+  prev.130 = (f32[784,128]{1,0}, f32[128]{0}, f32[128,32]{1,0}, f32[32]{0}, f32[32,10]{1,0}, /*index=5*/f32[10]{0}, pred[]) parameter(0)
+  ROOT get.131 = pred[] get-tuple-element(prev.130), index=6
+}
+
+ENTRY MnistTrainingLoopWithInfeed.140 {
+  layer1_weights.1 = f32[784,128]{1,0} parameter(0)
+  layer1_biases.2 = f32[128]{0} parameter(1)
+  layer2_weights.3 = f32[128,32]{1,0} parameter(2)
+  layer2_biases.4 = f32[32]{0} parameter(3)
+  layer3_weights.5 = f32[32,10]{1,0} parameter(4)
+  layer3_biases.6 = f32[10]{0} parameter(5)
+  constant.7 = pred[] constant(true)
+  tuple.8 = (f32[784,128]{1,0}, f32[128]{0}, f32[128,32]{1,0}, f32[32]{0}, f32[32,10]{1,0}, /*index=5*/f32[10]{0}, pred[]) tuple(layer1_weights.1, layer1_biases.2, layer2_weights.3, layer2_biases.4, layer3_weights.5, layer3_biases.6, constant.7)
+  while.132 = (f32[784,128]{1,0}, f32[128]{0}, f32[128,32]{1,0}, f32[32]{0}, f32[32,10]{1,0}, /*index=5*/f32[10]{0}, pred[]) while(tuple.8), condition=condition.129, body=body.49
+  get.133 = f32[784,128]{1,0} get-tuple-element(while.132), index=0
+  get.134 = f32[128]{0} get-tuple-element(while.132), index=1
+  get.135 = f32[128,32]{1,0} get-tuple-element(while.132), index=2
+  get.136 = f32[32]{0} get-tuple-element(while.132), index=3
+  get.137 = f32[32,10]{1,0} get-tuple-element(while.132), index=4
+  get.138 = f32[10]{0} get-tuple-element(while.132), index=5
+  ROOT tuple.139 = (f32[784,128]{1,0}, f32[128]{0}, f32[128,32]{1,0}, f32[32]{0}, f32[32,10]{1,0}, /*index=5*/f32[10]{0}) tuple(get.133, get.134, get.135, get.136, get.137, get.138)
+}
+)";
+
 class HloValueSemanticsAnalysisTest : public HloTestBase {
  public:
   bool HasLabel(const HloValueSemanticsAnalysis& hlo_value_semantics_analysis,
@@ -393,142 +524,8 @@ ENTRY entry {
 }
 
 TEST_F(HloValueSemanticsAnalysisTest, MnistTrainingLoop) {
-  const std::string module_str = R"(
-HloModule MnistTrainingLoopWithInfeed.140, entry_computation_layout={(f32[784,128]{1,0:T(8,128)},f32[128]{0:T(256)},f32[128,32]{1,0:T(8,128)},f32[32]{0:T(256)},f32[32,10]{1,0:T(8,128)},f32[10]{0:T(256)})->(f32[784,128]{1,0:T(8,128)}, f32[128]{0:T(256)}, f32[128,32]{1,0:T(8,128)}, f32[32]{0:T(256)}, f32[32,10]{1,0:T(8,128)}, /*index=5*/f32[10]{0:T(256)})}
-
-relu.9 {
-  x.10 = f32[] parameter(0)
-  constant.11 = f32[] constant(0)
-  ROOT maximum.12 = f32[] maximum(x.10, constant.11)
-}
-
-max_F32.17 {
-  lhs.18 = f32[] parameter(0)
-  rhs.19 = f32[] parameter(1)
-  ROOT maximum.20 = f32[] maximum(lhs.18, rhs.19)
-}
-
-add_F32.1 {
-  lhs.22 = f32[] parameter(0)
-  rhs.23 = f32[] parameter(1)
-  ROOT add.24 = f32[] add(lhs.22, rhs.23)
-}
-
-relu_gradients.29 {
-  activation.30 = f32[] parameter(0)
-  constant.32 = f32[] constant(0)
-  compare.33 = pred[] compare(activation.30, constant.32), direction=GT
-  backprop.31 = f32[] parameter(1)
-  ROOT select.34 = f32[] select(compare.33, backprop.31, constant.32)
-}
-
-body.49 {
-  after-all.51 = token[] after-all()
-  infeed.52 = ((f32[100,784]{1,0}, f32[100,10]{1,0}, pred[]), token[]) infeed(after-all.51)
-  get.53 = (f32[100,784]{1,0}, f32[100,10]{1,0}, pred[]) get-tuple-element(infeed.52), index=0
-  get.54 = f32[100,784]{1,0} get-tuple-element(get.53), index=0
-  prev.50 = (f32[784,128]{1,0}, f32[128]{0}, f32[128,32]{1,0}, f32[32]{0}, f32[32,10]{1,0}, /*index=5*/f32[10]{0}, pred[]) parameter(0)
-  get.57 = f32[784,128]{1,0} get-tuple-element(prev.50), index=0
-  dot.63 = f32[100,128]{1,0} dot(get.54, get.57), lhs_contracting_dims={1}, rhs_contracting_dims={0}
-  get.58 = f32[128]{0} get-tuple-element(prev.50), index=1
-  broadcast.64 = f32[100,128]{1,0} broadcast(get.58), dimensions={1}
-  add.65 = f32[100,128]{1,0} add(dot.63, broadcast.64)
-  map.66 = f32[100,128]{1,0} map(add.65), dimensions={0,1}, to_apply=relu.9
-  get.59 = f32[128,32]{1,0} get-tuple-element(prev.50), index=2
-  dot.67 = f32[100,32]{1,0} dot(map.66, get.59), lhs_contracting_dims={1}, rhs_contracting_dims={0}
-  get.60 = f32[32]{0} get-tuple-element(prev.50), index=3
-  broadcast.68 = f32[100,32]{1,0} broadcast(get.60), dimensions={1}
-  add.69 = f32[100,32]{1,0} add(dot.67, broadcast.68)
-  map.70 = f32[100,32]{1,0} map(add.69), dimensions={0,1}, to_apply=relu.9
-  get.61 = f32[32,10]{1,0} get-tuple-element(prev.50), index=4
-  dot.71 = f32[100,10]{1,0} dot(map.70, get.61), lhs_contracting_dims={1}, rhs_contracting_dims={0}
-  get.62 = f32[10]{0} get-tuple-element(prev.50), index=5
-  broadcast.72 = f32[100,10]{1,0} broadcast(get.62), dimensions={1}
-  add.73 = f32[100,10]{1,0} add(dot.71, broadcast.72)
-  constant.74 = f32[] constant(-inf)
-  reduce.75 = f32[100]{0} reduce(add.73, constant.74), dimensions={1}, to_apply=max_F32.17
-  broadcast.76 = f32[100,10]{1,0} broadcast(reduce.75), dimensions={0}
-  subtract.77 = f32[100,10]{1,0} subtract(add.73, broadcast.76)
-  exponential.78 = f32[100,10]{1,0} exponential(subtract.77)
-  constant.79 = f32[] constant(0)
-  reduce.80 = f32[100]{0} reduce(exponential.78, constant.79), dimensions={1}, to_apply=add_F32.1
-  broadcast.81 = f32[100,10]{1,0} broadcast(reduce.80), dimensions={0}
-  divide.82 = f32[100,10]{1,0} divide(exponential.78, broadcast.81)
-  get.55 = f32[100,10]{1,0} get-tuple-element(get.53), index=1
-  subtract.83 = f32[100,10]{1,0} subtract(divide.82, get.55)
-  transpose.88 = f32[10,32]{0,1} transpose(get.61), dimensions={1,0}
-  dot.89 = f32[100,32]{1,0} dot(subtract.83, transpose.88), lhs_contracting_dims={1}, rhs_contracting_dims={0}
-  map.90 = f32[100,32]{1,0} map(map.70, dot.89), dimensions={0,1}, to_apply=relu_gradients.29
-  transpose.95 = f32[32,128]{0,1} transpose(get.59), dimensions={1,0}
-  dot.96 = f32[100,128]{1,0} dot(map.90, transpose.95), lhs_contracting_dims={1}, rhs_contracting_dims={0}
-  map.97 = f32[100,128]{1,0} map(map.66, dot.96), dimensions={0,1}, to_apply=relu_gradients.29
-  transpose.98 = f32[784,100]{0,1} transpose(get.54), dimensions={1,0}
-  dot.99 = f32[784,128]{1,0} dot(transpose.98, map.97), lhs_contracting_dims={1}, rhs_contracting_dims={0}
-  constant.104 = f32[] constant(0.01)
-  broadcast.105 = f32[784,128]{1,0} broadcast(constant.104), dimensions={}
-  multiply.106 = f32[784,128]{1,0} multiply(dot.99, broadcast.105)
-  subtract.107 = f32[784,128]{1,0} subtract(get.57, multiply.106)
-  constant.100 = f32[] constant(0)
-  reduce.101 = f32[128]{0} reduce(map.97, constant.100), dimensions={0}, to_apply=add_F32.1
-  constant.108 = f32[] constant(0.01)
-  broadcast.109 = f32[128]{0} broadcast(constant.108), dimensions={}
-  multiply.110 = f32[128]{0} multiply(reduce.101, broadcast.109)
-  subtract.111 = f32[128]{0} subtract(get.58, multiply.110)
-  transpose.91 = f32[128,100]{0,1} transpose(map.66), dimensions={1,0}
-  dot.92 = f32[128,32]{1,0} dot(transpose.91, map.90), lhs_contracting_dims={1}, rhs_contracting_dims={0}
-  constant.112 = f32[] constant(0.01)
-  broadcast.113 = f32[128,32]{1,0} broadcast(constant.112), dimensions={}
-  multiply.114 = f32[128,32]{1,0} multiply(dot.92, broadcast.113)
-  subtract.115 = f32[128,32]{1,0} subtract(get.59, multiply.114)
-  constant.93 = f32[] constant(0)
-  reduce.94 = f32[32]{0} reduce(map.90, constant.93), dimensions={0}, to_apply=add_F32.1
-  constant.116 = f32[] constant(0.01)
-  broadcast.117 = f32[32]{0} broadcast(constant.116), dimensions={}
-  multiply.118 = f32[32]{0} multiply(reduce.94, broadcast.117)
-  subtract.119 = f32[32]{0} subtract(get.60, multiply.118)
-  transpose.84 = f32[32,100]{0,1} transpose(map.70), dimensions={1,0}
-  dot.85 = f32[32,10]{1,0} dot(transpose.84, subtract.83), lhs_contracting_dims={1}, rhs_contracting_dims={0}
-  constant.120 = f32[] constant(0.01)
-  broadcast.121 = f32[32,10]{1,0} broadcast(constant.120), dimensions={}
-  multiply.122 = f32[32,10]{1,0} multiply(dot.85, broadcast.121)
-  subtract.123 = f32[32,10]{1,0} subtract(get.61, multiply.122)
-  constant.86 = f32[] constant(0)
-  reduce.87 = f32[10]{0} reduce(subtract.83, constant.86), dimensions={0}, to_apply=add_F32.1
-  constant.124 = f32[] constant(0.01)
-  broadcast.125 = f32[10]{0} broadcast(constant.124), dimensions={}
-  multiply.126 = f32[10]{0} multiply(reduce.87, broadcast.125)
-  subtract.127 = f32[10]{0} subtract(get.62, multiply.126)
-  get.56 = pred[] get-tuple-element(get.53), index=2
-  ROOT tuple.128 = (f32[784,128]{1,0}, f32[128]{0}, f32[128,32]{1,0}, f32[32]{0}, f32[32,10]{1,0}, /*index=5*/f32[10]{0}, pred[]) tuple(subtract.107, subtract.111, subtract.115, subtract.119, subtract.123, subtract.127, get.56)
-}
-
-condition.129 {
-  prev.130 = (f32[784,128]{1,0}, f32[128]{0}, f32[128,32]{1,0}, f32[32]{0}, f32[32,10]{1,0}, /*index=5*/f32[10]{0}, pred[]) parameter(0)
-  ROOT get.131 = pred[] get-tuple-element(prev.130), index=6
-}
-
-ENTRY MnistTrainingLoopWithInfeed.140 {
-  layer1_weights.1 = f32[784,128]{1,0} parameter(0)
-  layer1_biases.2 = f32[128]{0} parameter(1)
-  layer2_weights.3 = f32[128,32]{1,0} parameter(2)
-  layer2_biases.4 = f32[32]{0} parameter(3)
-  layer3_weights.5 = f32[32,10]{1,0} parameter(4)
-  layer3_biases.6 = f32[10]{0} parameter(5)
-  constant.7 = pred[] constant(true)
-  tuple.8 = (f32[784,128]{1,0}, f32[128]{0}, f32[128,32]{1,0}, f32[32]{0}, f32[32,10]{1,0}, /*index=5*/f32[10]{0}, pred[]) tuple(layer1_weights.1, layer1_biases.2, layer2_weights.3, layer2_biases.4, layer3_weights.5, layer3_biases.6, constant.7)
-  while.132 = (f32[784,128]{1,0}, f32[128]{0}, f32[128,32]{1,0}, f32[32]{0}, f32[32,10]{1,0}, /*index=5*/f32[10]{0}, pred[]) while(tuple.8), condition=condition.129, body=body.49
-  get.133 = f32[784,128]{1,0} get-tuple-element(while.132), index=0
-  get.134 = f32[128]{0} get-tuple-element(while.132), index=1
-  get.135 = f32[128,32]{1,0} get-tuple-element(while.132), index=2
-  get.136 = f32[32]{0} get-tuple-element(while.132), index=3
-  get.137 = f32[32,10]{1,0} get-tuple-element(while.132), index=4
-  get.138 = f32[10]{0} get-tuple-element(while.132), index=5
-  ROOT tuple.139 = (f32[784,128]{1,0}, f32[128]{0}, f32[128,32]{1,0}, f32[32]{0}, f32[32,10]{1,0}, /*index=5*/f32[10]{0}) tuple(get.133, get.134, get.135, get.136, get.137, get.138)
-}
-)";
-
   TF_ASSERT_OK_AND_ASSIGN(auto module,
-                          ParseAndReturnVerifiedModule(module_str,
+                          ParseAndReturnVerifiedModule(kMnistHlo,
                                                        /*replica_count=*/1,
                                                        /*num_partitions=*/1));
   TF_ASSERT_OK_AND_ASSIGN(
@@ -552,5 +549,37 @@ ENTRY MnistTrainingLoopWithInfeed.140 {
       IsWeightGradient(*hlo_value_semantics_analysis, module.get(), "dot.99"));
 }
 
+class EinsumDepthAnalysisTest : public HloTestBase {
+ public:
+  int GetInstructionDepth(const EinsumDepthMap& depth_map,
+                          HloComputation* computation, absl::string_view name) {
+    HloInstruction* instruction = computation->GetInstructionWithName(name);
+    auto depth_iter = depth_map.find(instruction);
+    EXPECT_NE(depth_iter, depth_map.end());
+    return depth_iter->second.element({});
+  }
+};
+
+TEST_F(EinsumDepthAnalysisTest, MnistTrainingLoop) {
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          ParseAndReturnVerifiedModule(kMnistHlo,
+                                                       /*replica_count=*/1,
+                                                       /*num_partitions=*/1));
+  TF_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<EinsumDepthAnalysis> einsum_depth_analysis,
+      EinsumDepthAnalysis::Run(*module->entry_computation()));
+  const EinsumDepthMap& einsum_depth_map =
+      einsum_depth_analysis->GetEinsumDepthMap();
+  HloComputation* computation = module->GetComputationWithName("body.49");
+  EXPECT_EQ(GetInstructionDepth(einsum_depth_map, computation, "dot.63"), 5);
+  EXPECT_EQ(GetInstructionDepth(einsum_depth_map, computation, "dot.67"), 4);
+  EXPECT_EQ(GetInstructionDepth(einsum_depth_map, computation, "dot.71"), 3);
+  EXPECT_EQ(GetInstructionDepth(einsum_depth_map, computation, "dot.89"), 2);
+  EXPECT_EQ(GetInstructionDepth(einsum_depth_map, computation, "dot.96"), 1);
+  EXPECT_EQ(GetInstructionDepth(einsum_depth_map, computation, "dot.92"), 0);
+  EXPECT_EQ(GetInstructionDepth(einsum_depth_map, computation, "dot.99"), 0);
+  EXPECT_EQ(GetInstructionDepth(einsum_depth_map, computation, "dot.85"), 0);
+}
+
 }  // namespace
 }  // namespace xla

From ee61b61f6cce45b30542a859e27a6db3791b9ab2 Mon Sep 17 00:00:00 2001
From: Eugene Zhulenev <ezhulenev@google.com>
Date: Tue, 26 Sep 2023 10:35:26 -0700
Subject: [PATCH 273/567] [stream_executor] NFC: Remove external dependencies
 on stream_executor internal targets

PiperOrigin-RevId: 568583065
---
 tensorflow/core/common_runtime/pluggable_device/BUILD | 6 ++----
 tensorflow/core/platform/BUILD                        | 2 --
 third_party/xla/xla/backends/interpreter/BUILD        | 1 -
 third_party/xla/xla/pjrt/BUILD                        | 4 +---
 third_party/xla/xla/pjrt/gpu/BUILD                    | 2 +-
 third_party/xla/xla/service/gpu/BUILD                 | 1 -
 third_party/xla/xla/service/gpu/runtime/BUILD         | 2 +-
 7 files changed, 5 insertions(+), 13 deletions(-)

diff --git a/tensorflow/core/common_runtime/pluggable_device/BUILD b/tensorflow/core/common_runtime/pluggable_device/BUILD
index 514bad92fc5fec..f1bcd11a9beadb 100644
--- a/tensorflow/core/common_runtime/pluggable_device/BUILD
+++ b/tensorflow/core/common_runtime/pluggable_device/BUILD
@@ -102,9 +102,8 @@ cc_library(
         "@com_google_absl//absl/status",
         "@local_xla//xla/pjrt:pjrt_api",
         "@local_xla//xla/pjrt/c:pjrt_c_api_hdrs",
+        "@local_xla//xla/stream_executor",
         "@local_xla//xla/stream_executor:device_mem_allocator",
-        "@local_xla//xla/stream_executor:event",
-        "@local_xla//xla/stream_executor:kernel",
     ] + if_static([
         "//tensorflow/core/common_runtime:copy_tensor",
     ]),
@@ -126,9 +125,8 @@ cc_library(
         "//tensorflow/core/lib/core:status",
         "//tensorflow/core/platform:stream_executor",
         "@com_google_absl//absl/container:flat_hash_map",
+        "@local_xla//xla/stream_executor",
         "@local_xla//xla/stream_executor:device_mem_allocator",
-        "@local_xla//xla/stream_executor:event",
-        "@local_xla//xla/stream_executor:kernel",
     ] + if_static([
         # Temporary workaround for duplicated symbols issues.
         ":pluggable_device_runtime_impl",
diff --git a/tensorflow/core/platform/BUILD b/tensorflow/core/platform/BUILD
index 229277ad00c0d8..af8ca88c76447e 100644
--- a/tensorflow/core/platform/BUILD
+++ b/tensorflow/core/platform/BUILD
@@ -1101,7 +1101,6 @@ tf_cuda_library(
         "@local_tsl//tsl/platform",
         "@local_xla//xla/stream_executor",
         "@local_xla//xla/stream_executor:dnn",
-        "@local_xla//xla/stream_executor:event",
         "@local_xla//xla/stream_executor:multi_platform_manager",
         "@local_xla//xla/stream_executor:scratch_allocator",
         "@local_xla//xla/stream_executor/cuda:cuda_activation_header",
@@ -1126,7 +1125,6 @@ cc_library(
     deps = [
         "@local_xla//xla/stream_executor",
         "@local_xla//xla/stream_executor:dnn",
-        "@local_xla//xla/stream_executor:event",
         "@local_xla//xla/stream_executor:multi_platform_manager",
         "@local_xla//xla/stream_executor:scratch_allocator",
         "@local_xla//xla/stream_executor/cuda:cuda_platform_id",
diff --git a/third_party/xla/xla/backends/interpreter/BUILD b/third_party/xla/xla/backends/interpreter/BUILD
index bba10b2e1fd7ee..92d022b6dc2d16 100644
--- a/third_party/xla/xla/backends/interpreter/BUILD
+++ b/third_party/xla/xla/backends/interpreter/BUILD
@@ -97,7 +97,6 @@ cc_library(
         "//xla/service:shaped_buffer",
         "//xla/service:transfer_manager",
         "//xla/stream_executor",
-        "//xla/stream_executor:event",
         "@local_tsl//tsl/platform:statusor",
     ],
 )
diff --git a/third_party/xla/xla/pjrt/BUILD b/third_party/xla/xla/pjrt/BUILD
index 8666c726b89af4..b44e02ceba8125 100644
--- a/third_party/xla/xla/pjrt/BUILD
+++ b/third_party/xla/xla/pjrt/BUILD
@@ -87,9 +87,9 @@ cc_library(
         "//xla/runtime:async_runtime",
         "//xla/service:shaped_buffer",
         "//xla/service:transfer_manager",
+        "//xla/stream_executor",
         "//xla/stream_executor:device_memory",
         "//xla/stream_executor:device_memory_allocator",
-        "//xla/stream_executor:event",
         "@com_google_absl//absl/container:flat_hash_set",
         "@com_google_absl//absl/synchronization",
     ],
@@ -124,7 +124,6 @@ cc_library(
         "//xla:util",
         "//xla/client:local_client",
         "//xla/stream_executor",
-        "//xla/stream_executor:event",
         "@com_google_absl//absl/synchronization",
         "@local_tsl//tsl/profiler/lib:traceme",
         "@local_tsl//tsl/protobuf:error_codes_proto_impl_cc",
@@ -430,7 +429,6 @@ cc_library(
         "//xla/service:transfer_manager",
         "//xla/service/gpu:gpu_executable_run_options",
         "//xla/stream_executor",
-        "//xla/stream_executor:event",
         "//xla/stream_executor/host:host_platform_id",
         "@com_google_absl//absl/algorithm:container",
         "@com_google_absl//absl/base",
diff --git a/third_party/xla/xla/pjrt/gpu/BUILD b/third_party/xla/xla/pjrt/gpu/BUILD
index 12c50d4c918a80..3755a466731e9c 100644
--- a/third_party/xla/xla/pjrt/gpu/BUILD
+++ b/third_party/xla/xla/pjrt/gpu/BUILD
@@ -25,8 +25,8 @@ cc_library(
         "//xla/client:client_library",
         "//xla/client:local_client",
         "//xla/service:platform_util",
+        "//xla/stream_executor",
         "//xla/stream_executor:device_mem_allocator",
-        "//xla/stream_executor:kernel",
         "@com_google_absl//absl/types:span",
         "@local_tsl//tsl/framework:bfc_allocator",
         "@local_tsl//tsl/framework:device_id_impl",
diff --git a/third_party/xla/xla/service/gpu/BUILD b/third_party/xla/xla/service/gpu/BUILD
index c8ef34a5592760..075e103eaf2b59 100644
--- a/third_party/xla/xla/service/gpu/BUILD
+++ b/third_party/xla/xla/service/gpu/BUILD
@@ -1036,7 +1036,6 @@ cc_library(
         "//xla/stream_executor:blas",
         "//xla/stream_executor:device_memory",
         "//xla/stream_executor:device_memory_allocator",
-        "//xla/stream_executor:kernel",
         "//xla/stream_executor:scratch_allocator",
         "//xla/stream_executor/cuda:cuda_platform_id",
         "//xla/stream_executor/gpu:asm_compiler",
diff --git a/third_party/xla/xla/service/gpu/runtime/BUILD b/third_party/xla/xla/service/gpu/runtime/BUILD
index fc2a57803bafe0..b3936182b93a17 100644
--- a/third_party/xla/xla/service/gpu/runtime/BUILD
+++ b/third_party/xla/xla/service/gpu/runtime/BUILD
@@ -573,7 +573,7 @@ cc_library(
         "//xla/runtime:custom_call_registry",
         "//xla/runtime:executable",
         "//xla/service:executable",
-        "//xla/stream_executor:event",
+        "//xla/stream_executor",
         "@com_google_absl//absl/status",
         "@com_google_absl//absl/status:statusor",
         "@com_google_absl//absl/strings",

From 14f74dbd85fcc01654d05c79a34a883709ecb460 Mon Sep 17 00:00:00 2001
From: Son Tuan Vu <vuson@google.com>
Date: Tue, 26 Sep 2023 10:58:21 -0700
Subject: [PATCH 274/567] [XLA:GPU][NFC] Pass CompileOptions directly to Run*
 from LLVMCompiler::Compile

The option should be used instead of just the device allocator.

PiperOrigin-RevId: 568590287
---
 third_party/xla/xla/service/llvm_compiler.cc | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/third_party/xla/xla/service/llvm_compiler.cc b/third_party/xla/xla/service/llvm_compiler.cc
index 02b71f5c47c83f..ea20c00592b955 100644
--- a/third_party/xla/xla/service/llvm_compiler.cc
+++ b/third_party/xla/xla/service/llvm_compiler.cc
@@ -42,12 +42,11 @@ StatusOr<std::vector<std::unique_ptr<Executable>>> LLVMCompiler::Compile(
   std::vector<std::unique_ptr<HloModule>> modules =
       module_group->ConsumeModules();
   for (size_t i = 0; i < modules.size(); i++) {
-    TF_ASSIGN_OR_RETURN(modules[i],
-                        RunHloPasses(std::move(modules[i]), stream_execs[i][0],
-                                     options.device_allocator));
-    TF_ASSIGN_OR_RETURN(std::unique_ptr<Executable> executable,
-                        RunBackend(std::move(modules[i]), stream_execs[i][0],
-                                   options.device_allocator));
+    TF_ASSIGN_OR_RETURN(modules[i], RunHloPasses(std::move(modules[i]),
+                                                 stream_execs[i][0], options));
+    TF_ASSIGN_OR_RETURN(
+        std::unique_ptr<Executable> executable,
+        RunBackend(std::move(modules[i]), stream_execs[i][0], options));
     result.push_back(std::move(executable));
   }
 

From ba1bfa60c37c4a142ab170bcf67228af9c5492e2 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 26 Sep 2023 11:00:55 -0700
Subject: [PATCH 275/567] Don't fuse the tf.Split op and add a new attr to
 tf.TPUCompileAndExecute to indicate number of arguments per partition so that
 the kernel knows how to distribute the execution.

PiperOrigin-RevId: 568591194
---
 .../compiler/mlir/tensorflow/ir/tf_ops.td     |  2 +
 .../mlir/tfrt/ir/mlrt/tf_mlrt_tpu_ops.td      |  1 +
 .../compiler/mlir/tfrt/ir/mlrt/tf_ops.td      |  1 +
 .../fuse_tpu_compile_and_execute_ops.mlir     | 34 +++++++-----
 .../mlir/tfrt/tests/mlrt/tpu_conversions.mlir | 20 +++----
 .../fuse_tpu_compile_and_execute_ops.cc       | 53 ++++++++-----------
 .../mlrt/tpu_conversion_patterns.cc           |  4 +-
 7 files changed, 60 insertions(+), 55 deletions(-)

diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.td b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.td
index 29be67c510a3e8..08d0a6b40559c3 100644
--- a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.td
+++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.td
@@ -1706,6 +1706,7 @@ target computation.
 'metadata' is a serialized TPUCompileMetadataProto describing the shapes and
 types of the inputs to the computation, as well as a mapping onto the TPU pod
 topology.
+'num_operands_per_execute' is the number of operands that each partition(execution on one tpu core) takes.
 'producer_name' is a string describing the name of the framework that add support for running this portion of the model on TPUs.
   }];
 
@@ -1715,6 +1716,7 @@ topology.
     OptionalAttr<I32ArrayAttr>:$operands_with_static_shape,
     DefaultValuedStrAttr<StrAttr, "">:$mlir_module,
     StrAttr:$metadata,
+    UI32Attr:$num_operands_per_execute,
     StrAttr:$producer_name
   );
 
diff --git a/tensorflow/compiler/mlir/tfrt/ir/mlrt/tf_mlrt_tpu_ops.td b/tensorflow/compiler/mlir/tfrt/ir/mlrt/tf_mlrt_tpu_ops.td
index a207b83c7e5207..b603b1e8f1e35d 100644
--- a/tensorflow/compiler/mlir/tfrt/ir/mlrt/tf_mlrt_tpu_ops.td
+++ b/tensorflow/compiler/mlir/tfrt/ir/mlrt/tf_mlrt_tpu_ops.td
@@ -65,6 +65,7 @@ def CompileAndExecuteOp : TensorflowMlrtTpu_Op<"compile_and_execute"> {
     StrAttr:$metadata,
     StrAttr:$mlir_module,
     UI32Attr:$num_operands,
+    UI32Attr:$num_operands_per_execute,
     DenseI32ArrayAttr:$operands_with_static_shape,
     StrAttr:$producer_name
   );
diff --git a/tensorflow/compiler/mlir/tfrt/ir/mlrt/tf_ops.td b/tensorflow/compiler/mlir/tfrt/ir/mlrt/tf_ops.td
index bb567f32106215..26045a126b5745 100644
--- a/tensorflow/compiler/mlir/tfrt/ir/mlrt/tf_ops.td
+++ b/tensorflow/compiler/mlir/tfrt/ir/mlrt/tf_ops.td
@@ -134,6 +134,7 @@ def TFTPUCompileAndExecuteOp : TensorflowMlrt_Op<"tf_tpu_compile_and_execute", [
     StrAttr:$metadata,
     StrAttr:$mlir_module,
     UI32Attr:$num_operands,
+    UI32Attr:$num_operands_per_execute,
     DenseI32ArrayAttr:$operands_with_static_shape,
     StrAttr:$producer_name
   );
diff --git a/tensorflow/compiler/mlir/tfrt/tests/fuse_tpu_compile_and_execute_ops.mlir b/tensorflow/compiler/mlir/tfrt/tests/fuse_tpu_compile_and_execute_ops.mlir
index 823cbbc6f24cf7..8f7edbb465be23 100644
--- a/tensorflow/compiler/mlir/tfrt/tests/fuse_tpu_compile_and_execute_ops.mlir
+++ b/tensorflow/compiler/mlir/tfrt/tests/fuse_tpu_compile_and_execute_ops.mlir
@@ -10,8 +10,8 @@ func.func private @test_fuse_tpu_ops(%arg0: tensor<*xi32>, %arg1: tensor<*x!tf_t
   // CHECK-NOT: tf.TPUCompileSucceededAssert
   // CHECK-NOT: tf.TPUExecuteOp
 
-  // CHECK-NEXT: %0 = "tf.ReadVariableOp"(%arg1)
-  // CHECK:      [[key:%.*]], [[exec_result:%.*]] = "tf.TPUCompileMlirAndExecute"(%arg0, %0) {metadata = "metadata", mlir_module = "mlir_module", operandSegmentSizes = array<i32: 2, 0>, operands_with_static_shape = [], producer_name = "default"} : (tensor<*xi32>, tensor<*xi32>) -> (tensor<3x!tf_type.string>, tensor<*xi32>)
+  // CHECK-NEXT: %0 = "tf.ReadVariableOp"(%arg1) {device = "/CPU:0"} : (tensor<*x!tf_type.resource>) -> tensor<*xi32>
+  // CHECK:      [[key:%.*]], [[exec_result:%.*]] = "tf.TPUCompileMlirAndExecute"(%arg0, %0) {metadata = "metadata", mlir_module = "mlir_module", num_operands_per_execute = 2 : ui32, operandSegmentSizes = array<i32: 2, 0>, operands_with_static_shape = [], producer_name = "default"} : (tensor<*xi32>, tensor<*xi32>) -> (tensor<3x!tf_type.string>, tensor<*xi32>)
   // CHECK-NEXT: return [[exec_result]] : tensor<*xi32>
 
   %0 = "tf.ReadVariableOp"(%arg1) {device = "/CPU:0"} : (tensor<*x!tf_type.resource>) -> tensor<*xi32>
@@ -38,7 +38,7 @@ func.func private @test_outside_compilation(%arg0: tensor<*xi32>, %arg1: tensor<
   // CHECK-NOT: tf.TPUExecuteOp
 
   // CHECK-NEXT: %0 = "tf.ReadVariableOp"(%arg1)
-  // CHECK:      [[key:%.*]], [[exec_result:%.*]] = "tf.TPUCompileMlirAndExecute"(%arg0, %0) {metadata = "metadata", mlir_module = "mlir_module", operandSegmentSizes = array<i32: 2, 0>, operands_with_static_shape = [], producer_name = "default"} : (tensor<*xi32>, tensor<*xi32>) -> (tensor<3x!tf_type.string>, tensor<*xi32>)
+  // CHECK:      [[key:%.*]], [[exec_result:%.*]] = "tf.TPUCompileMlirAndExecute"(%arg0, %0) {metadata = "metadata", mlir_module = "mlir_module", num_operands_per_execute = 2 : ui32, operandSegmentSizes = array<i32: 2, 0>, operands_with_static_shape = [], producer_name = "default"} : (tensor<*xi32>, tensor<*xi32>) -> (tensor<3x!tf_type.string>, tensor<*xi32>)
   // CHECK-NEXT: "tf._XlaSendFromHost"(%arg0, %0, [[key]]) {_xla_has_host_transfer = true, device = "/job:localhost/replica:0/task:0/device:CPU:0", device_ordinal = 0 : i64, key = "host_compute_channel_0_retvals"} : (tensor<*xi32>, tensor<*xi32>, tensor<3x!tf_type.string>) -> ()
   // CHECK-NEXT: return [[exec_result]] : tensor<*xi32>
   %0 = "tf.ReadVariableOp"(%arg1) {device = "/CPU:0"} : (tensor<*x!tf_type.resource>) -> tensor<*xi32>
@@ -69,8 +69,8 @@ func.func private @test_fuse_dynamic_dimension_ops(%arg0: tensor<?x?xi32>, %arg1
   // CHECK: [[read_result:%.*]] = "tf.ReadVariableOp"(%arg1)
   // CHECK: [[shape_result_1:%.*]] = "tf.Shape"(%arg0) {device = "/CPU:0"} : (tensor<?x?xi32>) -> tensor<?xi64>
   // CHECK: [[shape_result_2:%.*]] = "tf.Shape"([[read_result]]) {device = "/CPU:0"} : (tensor<*xi32>) -> tensor<?xi64>
-  // CHECK: [[key:%.*]], [[exec_result:%.*]] = "tf.TPUCompileMlirAndExecute"(%arg0, [[shape_result_2]], %0, %0, %arg2, %arg4, %arg3) {metadata = "metadata", mlir_module = "mlir_module", operandSegmentSizes = array<i32: 4, 3>, operands_with_static_shape = [0 : i32, 1 : i32, 3 : i32], producer_name = "default"} : (tensor<?x?xi32>, tensor<?xi64>, tensor<*xi32>, tensor<*xi32>, tensor<2xi64>, tensor<?xi64>, tensor<?xi64>) -> (tensor<3x!tf_type.string>, tensor<*xi32>)
-  // CHECK: [[key_1:%.*]], [[exec_result_1:%.*]] = "tf.TPUCompileMlirAndExecute"(%arg0, %2, %0, %1) {metadata = "metadata", mlir_module = "mlir_module", operandSegmentSizes = array<i32: 4, 0>, operands_with_static_shape = [], producer_name = "default"} : (tensor<?x?xi32>, tensor<?xi64>, tensor<*xi32>, tensor<?xi64>) -> (tensor<3x!tf_type.string>, tensor<*xi32>)
+  // CHECK: [[key:%.*]], [[exec_result:%.*]] = "tf.TPUCompileMlirAndExecute"(%arg0, [[shape_result_2]], %0, %0, %arg2, %arg4, %arg3) {metadata = "metadata", mlir_module = "mlir_module",  num_operands_per_execute = 4 : ui32, operandSegmentSizes = array<i32: 4, 3>, operands_with_static_shape = [0 : i32, 1 : i32, 3 : i32], producer_name = "default"} : (tensor<?x?xi32>, tensor<?xi64>, tensor<*xi32>, tensor<*xi32>, tensor<2xi64>, tensor<?xi64>, tensor<?xi64>) -> (tensor<3x!tf_type.string>, tensor<*xi32>)
+  // CHECK: [[key_1:%.*]], [[exec_result_1:%.*]] = "tf.TPUCompileMlirAndExecute"(%arg0, %2, %0, %1) {metadata = "metadata", mlir_module = "mlir_module",  num_operands_per_execute = 4 : ui32, operandSegmentSizes = array<i32: 4, 0>, operands_with_static_shape = [], producer_name = "default"} : (tensor<?x?xi32>, tensor<?xi64>, tensor<*xi32>, tensor<?xi64>) -> (tensor<3x!tf_type.string>, tensor<*xi32>)
   // CHECK-NEXT: return [[exec_result]] : tensor<*xi32>
   %0 = "tf.ReadVariableOp"(%arg1) {device = "/CPU:0"} : (tensor<*x!tf_type.resource>) -> tensor<*xi32>
   %dyn_arg0 = "tf.SetStaticDimensionBounds" (%arg0, %arg2) :(tensor<?x?xi32>, tensor<2xi64>) -> tensor<?x?xi32>
@@ -111,10 +111,12 @@ func.func private @reorder_execute_arg_defining_ops(%arg0: tensor<1x3xf32> {tf.d
 // -----
 
 module attributes {tf_saved_model.semantics} {
-// CHECK-LABEL: func private @spmd_fuse_mulitple_execute_ops
-// CHECK-NEXT: %0 = "tf.VarHandleOp"()
-// CHECK-NEXT: %1 = "tf.ReadVariableOp"(%0)
-// CHECK-NEXT: %rendezvous_key_base, %results = "tf.TPUCompileMlirAndExecute"(%arg0, %1)
+// CHECK: func private @spmd_fuse_mulitple_execute_ops
+// CHECK: %cst = "tf.Const"() {device = "/CPU:0", value = dense<1> : tensor<i32>} : () -> tensor<i32>
+// CHECK-NEXT: %0:2 = "tf.Split"(%cst, %arg0) {device = "/CPU:0"} : (tensor<i32>, tensor<1x4xf32>) -> (tensor<1x2xf32>, tensor<1x2xf32>)
+// CHECK-NEXT: %1 = "tf.VarHandleOp"()
+// CHECK-NEXT: %2 = "tf.ReadVariableOp"(%1) {device = "/CPU:0"} : (tensor<!tf_type.resource<tensor<2x1xf32>>>) -> tensor<2x1xf32>
+// CHECK-NEXT: %rendezvous_key_base, %results = "tf.TPUCompileMlirAndExecute"(%0#0, %2, %0#1, %2) {metadata = "metadata", mlir_module = "propgram", num_operands_per_execute = 2 : ui32, operandSegmentSizes = array<i32: 4, 0>, operands_with_static_shape = [], producer_name = "UNKNOWN"} : (tensor<1x2xf32>, tensor<2x1xf32>, tensor<1x2xf32>, tensor<2x1xf32>) -> (tensor<3x!tf_type.string>, tensor<1x1xf32>)
 func.func private @spmd_fuse_mulitple_execute_ops(%arg0: tensor<1x4xf32> {tf.device = "/CPU:0"}) -> (tensor<1x1xf32> {tf.device = "/TPU:0"}) {
   %cst = "tf.Const"() {device = "/CPU:0", value = dense<1> : tensor<i32>} : () -> tensor<i32>
   %compilation_status, %program:2 = "tf._TPUCompileMlir"() {device = "/CPU:0", metadata = "metadata", mlir_module = "propgram"} : () -> (tensor<!tf_type.string>, tensor<3x!tf_type.string>, tensor<3x!tf_type.string>)
@@ -133,9 +135,11 @@ func.func private @spmd_fuse_mulitple_execute_ops(%arg0: tensor<1x4xf32> {tf.dev
 
 module attributes {tf_saved_model.semantics} {
 // CHECK-LABEL: func private @spmd_fuse_mulitple_execute_ops_2
+// CHECK-NEXT: %cst = "tf.Const"()
 // CHECK-NEXT: %0 = "tf.VarHandleOp"()
 // CHECK-NEXT: %1 = "tf.ReadVariableOp"(%0)
-// CHECK-NEXT: %rendezvous_key_base, %results = "tf.TPUCompileMlirAndExecute"(%arg0, %1)
+// CHECK-NEXT: %2:2 = "tf.Split"(%cst, %1)
+// CHECK-NEXT: %rendezvous_key_base, %results = "tf.TPUCompileMlirAndExecute"(%arg0, %2#0, %arg0, %2#1)
 func.func private @spmd_fuse_mulitple_execute_ops_2(%arg0: tensor<1x1xf32> {tf.device = "/CPU:0"}) -> (tensor<1x1xf32> {tf.device = "/TPU:0"}) {
   %cst = "tf.Const"() {device = "/CPU:0", value = dense<0> : tensor<i32>} : () -> tensor<i32>
   %compilation_status, %program:2 = "tf._TPUCompileMlir"() {device = "/CPU:0", metadata = "metadata", mlir_module = "propgram"} : () -> (tensor<!tf_type.string>, tensor<3x!tf_type.string>, tensor<3x!tf_type.string>)
@@ -154,9 +158,13 @@ func.func private @spmd_fuse_mulitple_execute_ops_2(%arg0: tensor<1x1xf32> {tf.d
 
 module attributes {tf_saved_model.semantics} {
 // CHECK-LABEL: func private @spmd_fuse_split_nd_ops
-// CHECK-NEXT: %0 = "tf.VarHandleOp"()
-// CHECK-NEXT: %1 = "tf.ReadVariableOp"(%0)
-// CHECK-NEXT: %rendezvous_key_base, %results = "tf.TPUCompileMlirAndExecute"(%arg0, %1)
+// CHECK-NEXT: %cst = "tf.Const"()
+// CHECK-NEXT: %0:2 = "tf.Split"(%cst, %arg0)
+// CHECK-NEXT: %1:2 = "tf.Split"(%cst, %0#0)
+// CHECK-NEXT: %2 = "tf.VarHandleOp"()
+// CHECK-NEXT: %3 = "tf.ReadVariableOp"(%2) {device = "/CPU:0"} : (tensor<!tf_type.resource<tensor<1x1xf32>>>) -> tensor<1x1xf32>
+// CHECK-NEXT: %4:2 = "tf.Split"(%cst, %0#1) {device = "/CPU:0"} : (tensor<i32>, tensor<1x2xf32>) -> (tensor<1x1xf32>, tensor<1x1xf32>)
+// CHECK-NEXT: %rendezvous_key_base, %results = "tf.TPUCompileMlirAndExecute"(%1#0, %3, %1#1, %3, %4#0, %3, %4#1, %3) {metadata = "metadata", mlir_module = "propgram", num_operands_per_execute = 2 : ui32, operandSegmentSizes = array<i32: 8, 0>, operands_with_static_shape = [], producer_name = "UNKNOWN"} : (tensor<1x1xf32>, tensor<1x1xf32>, tensor<1x1xf32>, tensor<1x1xf32>, tensor<1x1xf32>, tensor<1x1xf32>, tensor<1x1xf32>, tensor<1x1xf32>) -> (tensor<3x!tf_type.string>, tensor<1x1xf32>)
 func.func private @spmd_fuse_split_nd_ops(%arg0: tensor<1x4xf32> {tf.device = "/CPU:0"}) -> (tensor<1x1xf32> {tf.device = "/TPU:0"}) {
   %cst = "tf.Const"() {device = "/CPU:0", value = dense<1> : tensor<i32>} : () -> tensor<i32>
   %compilation_status, %program:4 = "tf._TPUCompileMlir"() {device = "/CPU:0", metadata = "metadata", mlir_module = "propgram"} : () -> (tensor<!tf_type.string>, tensor<3x!tf_type.string>, tensor<3x!tf_type.string>, tensor<3x!tf_type.string>, tensor<3x!tf_type.string>)
diff --git a/tensorflow/compiler/mlir/tfrt/tests/mlrt/tpu_conversions.mlir b/tensorflow/compiler/mlir/tfrt/tests/mlrt/tpu_conversions.mlir
index ade6fe206ac86f..ee09a8ac541fbf 100644
--- a/tensorflow/compiler/mlir/tfrt/tests/mlrt/tpu_conversions.mlir
+++ b/tensorflow/compiler/mlir/tfrt/tests/mlrt/tpu_conversions.mlir
@@ -11,7 +11,7 @@ func.func @batch_function(%arg0: tensor<i32>) -> (tensor<i32>) {
   // CHECK-NEXT: [[rendezvous_key_base:%.*]] = tf_mlrt_tpu.compile_and_execute([[batch_result]])
   // CHECK-NEXT: return [[rendezvous_key_base]]
   %0 = "tf.BatchFunction"(%arg0, %arg0) {device = "/device:CPU:0", allowed_batch_sizes = [64], batch_timeout_micros = 1 : i64, batching_queue = "", container = "", f = @callee, max_batch_size = 256 : i64, num_batch_threads = 2 : i64, operandSegmentSizes = array<i32: 1, 1>, shared_name = ""} : (tensor<i32>, tensor<i32>) -> tensor<i32>
-  %1 = "tf.TPUCompileMlirAndExecute"(%0) {metadata = "metadata", mlir_module = "mlir_module", operandSegmentSizes = array<i32: 1, 0>, producer_name = "producer_name"} : (tensor<i32>) -> tensor<i32>
+  %1 = "tf.TPUCompileMlirAndExecute"(%0) {metadata = "metadata", mlir_module = "mlir_module", operandSegmentSizes = array<i32: 1, 0>, num_operands_per_execute = 1 : ui32, producer_name = "producer_name"} : (tensor<i32>) -> tensor<i32>
   func.return %1 : tensor<i32>
 }
 
@@ -24,7 +24,7 @@ func.func @executeop_input(%arg0: tensor<i32>) -> (tensor<i32>, tensor<i32>) {
   // CHECK: [[rendezvous_key_base:%.*]], [[result_future:%.*]] = tf_mlrt_tpu.compile_and_execute([[cast]])
   // CHECK: tf_mlrt.await [[result_future]]
   %0 = "tf.Cast"(%arg0) {__op_key = 0: i32, device = "/device:CPU:0"} : (tensor<i32>) -> tensor<f32>
-  %1, %2 = "tf.TPUCompileMlirAndExecute"(%0) {metadata = "metadata", mlir_module = "mlir_module", operandSegmentSizes = array<i32: 1, 0>, producer_name = "producer_name"} : (tensor<f32>) -> (tensor<i32>, tensor<i32>)
+  %1, %2 = "tf.TPUCompileMlirAndExecute"(%0) {metadata = "metadata", mlir_module = "mlir_module", operandSegmentSizes = array<i32: 1, 0>, num_operands_per_execute = 1 : ui32, producer_name = "producer_name"} : (tensor<f32>) -> (tensor<i32>, tensor<i32>)
   func.return %1, %2 : tensor<i32>, tensor<i32>
 }
 
@@ -36,7 +36,7 @@ func.func @executeop_side_effecting_input(%arg0: tensor<!tf_type.resource<tensor
   // CHECK: [[var:%.*]] = tf_mlrt.executeop.device([[device]]){{.*}}op: \22ResourceGather\22
   // CHECK: [[rendezvous_key_base:%.*]] = tf_mlrt_tpu.compile_and_execute([[var]])
   %0 = "tf.ResourceGather"(%arg0, %indices) {__op_key = 0: i32, device = "/device:CPU:0"} : (tensor<!tf_type.resource<tensor<4xf32>>>, tensor<i32>) -> tensor<f32>
-  %1 = "tf.TPUCompileMlirAndExecute"(%0) {metadata = "metadata", mlir_module = "mlir_module", operandSegmentSizes = array<i32: 1, 0>, producer_name = "producer_name"} : (tensor<f32>) -> tensor<i32>
+  %1 = "tf.TPUCompileMlirAndExecute"(%0) {metadata = "metadata", mlir_module = "mlir_module", operandSegmentSizes = array<i32: 1, 0>, num_operands_per_execute = 1 : ui32, producer_name = "producer_name"} : (tensor<f32>) -> tensor<i32>
   func.return %1 : tensor<i32>
 }
 
@@ -48,7 +48,7 @@ func.func @executeop_input_same_execute_op(%arg0: tensor<i32>, %arg1: tensor<2xf
   // CHECK: [[split:%.*]]:2 = tf_mlrt.executeop.device([[device]])
   // CHECK: tf_mlrt_tpu.compile_and_execute([[split]]#0, [[split]]#1)
   %0, %1 = "tf.Split"(%arg0, %arg1) {__op_key = 0: i32, device = "/device:CPU:0"} : (tensor<i32>, tensor<2xf32>) -> (tensor<f32>, tensor<f32>)
-  %2 = "tf.TPUCompileMlirAndExecute"(%0, %1) {metadata = "metadata", mlir_module = "mlir_module", operandSegmentSizes = array<i32: 2, 0>, producer_name = "producer_name"} : (tensor<f32>, tensor<f32>) -> tensor<i32>
+  %2 = "tf.TPUCompileMlirAndExecute"(%0, %1) {metadata = "metadata", mlir_module = "mlir_module", operandSegmentSizes = array<i32: 2, 0>, num_operands_per_execute = 2 : ui32, producer_name = "producer_name"} : (tensor<f32>, tensor<f32>) -> tensor<i32>
   func.return %2 : tensor<i32>
 }
 
@@ -65,7 +65,7 @@ func.func @executeop_dag(%arg0: tensor<i32>) -> (tensor<i32>) {
   // CHECK-NEXT: tf_mlrt_tpu.compile_and_execute
   %0 = "tf.Cast"(%arg0) {__op_key = 0: i32, device = "/device:CPU:0"} : (tensor<i32>) -> tensor<f32>
   %1 = "tf.Relu"(%0) {__op_key = 1: i32, device = "/device:CPU:0"} : (tensor<f32>) -> (tensor<f32>)
-  %2 = "tf.TPUCompileMlirAndExecute"(%1, %0) {metadata = "metadata", mlir_module = "mlir_module", operandSegmentSizes = array<i32: 2, 0>, producer_name = "producer_name"} : (tensor<f32>, tensor<f32>) -> tensor<i32>
+  %2 = "tf.TPUCompileMlirAndExecute"(%1, %0) {metadata = "metadata", mlir_module = "mlir_module", operandSegmentSizes = array<i32: 2, 0>, num_operands_per_execute = 1 : ui32, producer_name = "producer_name"} : (tensor<f32>, tensor<f32>) -> tensor<i32>
   func.return %2 : tensor<i32>
 }
 
@@ -79,7 +79,7 @@ func.func @test_fuse_dynamic_dimension_ops(%arg0: tensor<*xi32>, %arg1: tensor<*
   // CHECK-SAME: constant_operand_indices = array<i32: 2>
   // CHECK-SAME: num_operands = 4
   // CHECK-SAME: operands_with_static_shape = array<i32: 0, 1, 3>
-  %rendezvous_key_base, %results = "tf.TPUCompileMlirAndExecute"(%arg0, %2, %0, %1, %arg5, %arg6, %arg7) {operands_with_static_shape = [0 : i32, 1 : i32, 3 : i32], metadata = "metadata", mlir_module = "mlir_module", operandSegmentSizes = array<i32: 4, 3>, producer_name = "producer_name"} : (tensor<*xi32>, tensor<?xi64>, tensor<*xi32>, tensor<?xi64>, tensor<?xi64>, tensor<?xi64>, tensor<?xi64>) -> (tensor<3x!tf_type.string>, tensor<*xi32>)
+  %rendezvous_key_base, %results = "tf.TPUCompileMlirAndExecute"(%arg0, %2, %0, %1, %arg5, %arg6, %arg7) {operands_with_static_shape = [0 : i32, 1 : i32, 3 : i32], metadata = "metadata", mlir_module = "mlir_module", operandSegmentSizes = array<i32: 4, 3>, num_operands_per_execute = 3 : ui32, producer_name = "producer_name"} : (tensor<*xi32>, tensor<?xi64>, tensor<*xi32>, tensor<?xi64>, tensor<?xi64>, tensor<?xi64>, tensor<?xi64>) -> (tensor<3x!tf_type.string>, tensor<*xi32>)
   func.return %results : tensor<*xi32>
 }
 
@@ -98,7 +98,7 @@ func.func @executeop_input(%arg0: tensor<i32>) -> (tensor<i32>) {
   // CHECK: tf_mlrt.executeop
   %0 = "tf.Cast"(%arg0) {__op_key = 0: i32, device = "/device:CPU:0"} : (tensor<i32>) -> tensor<f32>
   // CHECK: [[rendezvous_key_base:%.*]], [[result:%.*]] = tf_mlrt_tpu.compile_and_execute
-  %1, %2 = "tf.TPUCompileMlirAndExecute"(%0) {metadata = "metadata", mlir_module = "mlir_module", operandSegmentSizes = array<i32: 1, 0>, producer_name = "producer_name"} : (tensor<f32>) -> (tensor<i32>, tensor<i32>)
+  %1, %2 = "tf.TPUCompileMlirAndExecute"(%0) {metadata = "metadata", mlir_module = "mlir_module", operandSegmentSizes = array<i32: 1, 0>, num_operands_per_execute = 1 : ui32, producer_name = "producer_name"} : (tensor<f32>) -> (tensor<i32>, tensor<i32>)
   %3 = "tf.StringFormat"(%2) {__op_key = 1: i32, device = "/job:localhost/replica:0/task:0/device:CPU:0", placeholder = "{}", strtemplate = "%s", summarize = 3 : i64, template = "Outside compiled {}"} : (tensor<i32>) -> tensor<!tf_type.string>
   "tf.PrintV2"(%3) {__op_key = 2: i32, device = "/job:localhost/replica:0/task:0/device:CPU:0", end = "\0A", output_stream = "stderr"} : (tensor<!tf_type.string>) -> ()
   // CHECK: [[handle:%.*]] = mlrt.async([[result]])
@@ -127,7 +127,7 @@ func.func @preserve_constant_args(%arg0: tensor<i32>, %arg1: tensor<*x!tf_type.r
   %0 = "tf.Cast"(%arg0) {__op_key = 3: i32, device = "/device:CPU:0"} : (tensor<i32>) -> tensor<f32>
   // CHECK: tf_mlrt_tpu.compile_and_execute({{%.*}}, [[cast]]
   // CHECK-SAME: constant_operand_indices = array<i32: 1, 3, 4>
-  %1, %2 = "tf.TPUCompileMlirAndExecute"(%0, %v1, %0, %v2, %v0, %arg0) {metadata = "metadata", mlir_module = "mlir_module", operandSegmentSizes = array<i32: 6, 0>, producer_name = "producer_name"} : (tensor<f32>, tensor<i32>, tensor<f32>, tensor<i32>, tensor<i32>, tensor<i32>) -> (tensor<i32>, tensor<i32>)
+  %1, %2 = "tf.TPUCompileMlirAndExecute"(%0, %v1, %0, %v2, %v0, %arg0) {metadata = "metadata", mlir_module = "mlir_module", operandSegmentSizes = array<i32: 6, 0>, num_operands_per_execute = 6 : ui32, producer_name = "producer_name"} : (tensor<f32>, tensor<i32>, tensor<f32>, tensor<i32>, tensor<i32>, tensor<i32>) -> (tensor<i32>, tensor<i32>)
   func.return %2 : tensor<i32>
 }
 
@@ -141,7 +141,7 @@ func.func @executeop_input_async() -> (tensor<i32>, tensor<i32>) {
   // CHECK: [[rendezvous_key_base:%.*]], [[result_future:%.*]] = tf_mlrt_tpu.compile_and_execute([[recv]])
   // CHECK: tf_mlrt.await [[result_future]]
   %0 = "tf.Recv"() {__op_key = 0: i32, device = "/device:CPU:0", tensor_name = "tensor", send_device = "/device:CPU:0", send_device_incarnation = 0, recv_device = "/device:CPU:0"} : () -> tensor<f32>
-  %1, %2 = "tf.TPUCompileMlirAndExecute"(%0) {metadata = "metadata", mlir_module = "mlir_module", operandSegmentSizes = array<i32: 1, 0>, producer_name = "producer_name"} : (tensor<f32>) -> (tensor<i32>, tensor<i32>)
+  %1, %2 = "tf.TPUCompileMlirAndExecute"(%0) {metadata = "metadata", mlir_module = "mlir_module", operandSegmentSizes = array<i32: 1, 0>, num_operands_per_execute = 1 : ui32, producer_name = "producer_name"} : (tensor<f32>) -> (tensor<i32>, tensor<i32>)
   func.return %1, %2 : tensor<i32>, tensor<i32>
 }
 
@@ -153,7 +153,7 @@ func.func @executeop_input_async() -> (tensor<i32>, tensor<i32>) {
 func.func @main(%input0: tensor<i32>, %input1: tensor<i32>, %input2: tensor<!tf_type.variant<tensor<*xf32>>> ) -> tensor<i32> {
   %0 = "tf.Cast"(%input0) {__op_key = 0: i32, device = "/device:CPU:0"} : (tensor<i32>) -> tensor<f32>
   // CHECK: tf_mlrt_tpu.compile_and_execute
-  %1, %2 = "tf.TPUCompileMlirAndExecute"(%0) {metadata = "metadata", mlir_module = "mlir_module", operandSegmentSizes = array<i32: 1, 0>, producer_name = "producer_name"} : (tensor<f32>) -> (tensor<i32>, tensor<i32>)
+  %1, %2 = "tf.TPUCompileMlirAndExecute"(%0) {metadata = "metadata", mlir_module = "mlir_module", operandSegmentSizes = array<i32: 1, 0>, num_operands_per_execute = 1 : ui32, producer_name = "producer_name"} : (tensor<f32>) -> (tensor<i32>, tensor<i32>)
   %max_iter = "tf.Const"() {__op_key = 1, value = dense<2> : tensor<i32>} : () -> tensor<i32>
   // CHECK: tf_mlrt.map_fn
   %result = "tf_mlrt.tf_map_fn"(%max_iter, %input2, %2) { operandSegmentSizes = array<i32: 1, 1, 1>, body_fn = @NopMapFnBody, num_tensor_list_or_flow_in = 1 : i32} : (tensor<i32>, tensor<!tf_type.variant<tensor<*xf32>>>, tensor<i32>) -> tensor<i32>
diff --git a/tensorflow/compiler/mlir/tfrt/transforms/fuse_tpu_compile_and_execute_ops.cc b/tensorflow/compiler/mlir/tfrt/transforms/fuse_tpu_compile_and_execute_ops.cc
index 0024aa8b658b17..57a52f868fa5b4 100644
--- a/tensorflow/compiler/mlir/tfrt/transforms/fuse_tpu_compile_and_execute_ops.cc
+++ b/tensorflow/compiler/mlir/tfrt/transforms/fuse_tpu_compile_and_execute_ops.cc
@@ -63,18 +63,6 @@ void GroupCompileOpAndExecuteOp(mlir::func::FuncOp func,
   }
 }
 
-mlir::Value FindFullShapedOperand(mlir::TF::SplitOp split_op) {
-  auto operand = split_op.getOperand(1);
-  auto defining_op = operand.getDefiningOp();
-  if (!defining_op) {
-    return operand;
-  }
-  if (auto parent_split_op = llvm::dyn_cast<mlir::TF::SplitOp>(defining_op))
-    return FindFullShapedOperand(parent_split_op);
-
-  return operand;
-}
-
 bool MaybeFindUsedExecuteOp(
     const llvm::SmallVector<mlir::TF::TPUExecuteOp, 4> &exec_op_in_group,
     mlir::TF::TPUExecuteOp &used_exec_op) {
@@ -105,7 +93,6 @@ bool MaybeFindUsedExecuteOp(
 void FuseCompileAndExecuteOps(
     mlir::TF::_TPUCompileMlirOp &compile_op,
     const llvm::SmallVector<mlir::TF::TPUExecuteOp, 4> &exec_op_in_group,
-    mlir::TF::TPUExecuteOp &used_exec_op,
     llvm::SmallDenseMap<
         mlir::TF::TPUExecuteOp,
         llvm::SmallDenseMap<int, mlir::TF::SetStaticDimensionBoundsOp>>
@@ -115,17 +102,21 @@ void FuseCompileAndExecuteOps(
   llvm::SmallVector<mlir::Type, 4> output_types;
   output_types.push_back(mlir::RankedTensorType::get(
       {3}, builder.getType<mlir::TF::StringType>()));
-  output_types.insert(output_types.end(), used_exec_op.getResultTypes().begin(),
-                      used_exec_op.getResultTypes().end());
+  auto first_exec_op = exec_op_in_group[0];
+  output_types.insert(output_types.end(),
+                      first_exec_op.getResultTypes().begin(),
+                      first_exec_op.getResultTypes().end());
 
   llvm::SmallVector<int> static_shaped_operand_indices_attr;
   llvm::SmallVector<mlir::Value> static_shape_tensors;
   llvm::SmallVector<mlir::Value> exec_op_args;
-  exec_op_args.resize(used_exec_op.getArgs().size());
+  exec_op_args.resize(first_exec_op.getArgs().size() * exec_op_in_group.size());
 
+  // SetStaticDimensionBoundsOp not supported for SPMD, so only examine the
+  // first execute op.
   auto &static_shaped_operands =
-      exec_to_static_shaped_operands_map[used_exec_op];
-  for (int i = 0; i < used_exec_op.getArgs().size(); ++i) {
+      exec_to_static_shaped_operands_map[first_exec_op];
+  for (int i = 0; i < first_exec_op.getArgs().size(); ++i) {
     auto iter = static_shaped_operands.find(i);
     if (iter != static_shaped_operands.end()) {
       static_shaped_operand_indices_attr.push_back(iter->first);
@@ -137,22 +128,27 @@ void FuseCompileAndExecuteOps(
           mlir::ValueRange({iter->second.getInput()}));
       iter->second->erase();
     } else {
-      auto split_op = ::llvm::dyn_cast_or_null<mlir::TF::SplitOp>(
-          used_exec_op->getOperand(i).getDefiningOp());
-      exec_op_args[i] = split_op ? FindFullShapedOperand(split_op)
-                                 : used_exec_op->getOperand(i);
+      exec_op_args[i] = first_exec_op->getOperand(i);
+    }
+  }
+  int num_operands_per_execute = first_exec_op.getArgs().size();
+  for (int i = 1; i < exec_op_in_group.size(); ++i) {
+    for (int j = 0; j < num_operands_per_execute; ++j) {
+      exec_op_args[i * num_operands_per_execute + j] =
+          exec_op_in_group[i]->getOperand(j);
     }
   }
 
   auto producer_name =
-      used_exec_op->getAttrOfType<mlir::StringAttr>("_producer_name");
+      first_exec_op->getAttrOfType<mlir::StringAttr>("_producer_name");
   if (!producer_name) producer_name = mlir::StringAttr::get(context, "default");
   auto compile_and_execute_op =
       builder.create<mlir::TF::TPUCompileMlirAndExecuteOp>(
-          used_exec_op.getLoc(), output_types, exec_op_args,
+          first_exec_op.getLoc(), output_types, exec_op_args,
           static_shape_tensors,
           builder.getI32ArrayAttr(static_shaped_operand_indices_attr),
-          compile_op.getMlirModule(), compile_op.getMetadata(), producer_name);
+          compile_op.getMlirModule(), compile_op.getMetadata(),
+          num_operands_per_execute, producer_name);
 
   for (auto exec_op : exec_op_in_group) {
     exec_op.replaceAllUsesWith(compile_and_execute_op.getResults());
@@ -241,13 +237,8 @@ class FuseTpuCompileAndExecutePass
       auto compile_op = kv.first;
       const llvm::SmallVector<mlir::TF::TPUExecuteOp, 4> &exec_op_in_group =
           kv.second;
-      mlir::TF::TPUExecuteOp used_exec_op;
-      if (!MaybeFindUsedExecuteOp(exec_op_in_group, used_exec_op)) {
-        signalPassFailure();
-        return;
-      }
 
-      FuseCompileAndExecuteOps(compile_op, exec_op_in_group, used_exec_op,
+      FuseCompileAndExecuteOps(compile_op, exec_op_in_group,
                                exec_to_static_shaped_operands_map, builder,
                                &getContext());
     }
diff --git a/tensorflow/compiler/mlir/tfrt/transforms/mlrt/tpu_conversion_patterns.cc b/tensorflow/compiler/mlir/tfrt/transforms/mlrt/tpu_conversion_patterns.cc
index 0212c945de61eb..c7f33527005654 100644
--- a/tensorflow/compiler/mlir/tfrt/transforms/mlrt/tpu_conversion_patterns.cc
+++ b/tensorflow/compiler/mlir/tfrt/transforms/mlrt/tpu_conversion_patterns.cc
@@ -97,6 +97,7 @@ class TPUCompileMlirAndExecuteOpPreParallelizationConversion
             rewriter.getDenseI32ArrayAttr(constant_operand_indices),
             op.getMetadataAttr(), op.getMlirModuleAttr(),
             rewriter.getUI32IntegerAttr(tensor_operands_size),
+            op.getNumOperandsPerExecuteAttr(),
             rewriter.getDenseI32ArrayAttr(operands_with_static_shapes),
             producer_name);
 
@@ -131,7 +132,8 @@ class TPUCompileMlirAndExecuteOpConversion
         rewriter.create<tf_mlrt_tpu::CompileAndExecuteOp>(
             op.getLoc(), result_types, operands, op.getConstantOperandIndices(),
             op.getMetadataAttr(), op.getMlirModuleAttr(), op.getNumOperands(),
-            op.getOperandsWithStaticShape(), op.getProducerName());
+            op.getNumOperandsPerExecute(), op.getOperandsWithStaticShape(),
+            op.getProducerName());
 
     rewriter.replaceOp(op, compile_and_execute_op->getResults());
 

From 6bf2c9b23c83d45da51b4340872dfc01d12297ef Mon Sep 17 00:00:00 2001
From: Fiona Lang <flang@google.com>
Date: Tue, 26 Sep 2023 11:13:29 -0700
Subject: [PATCH 276/567] Remove import of debug gradients in gradients_impl.

Due to the debug init file, the debug gradient imports cause a cycle with gradients_impl.py.

Consequently, regenerate pywrap_gradient_exclusions to not include debug gradients, under the assumption that debug gradients would not be used without first importing debug.

PiperOrigin-RevId: 568594978
---
 .../python/eager/gradient_input_output_exclusions.py   |  4 ++++
 tensorflow/python/eager/pywrap_gradient_exclusions.cc  | 10 ++--------
 tensorflow/python/ops/BUILD                            |  2 --
 tensorflow/python/ops/gradients_impl.py                |  2 --
 4 files changed, 6 insertions(+), 12 deletions(-)

diff --git a/tensorflow/python/eager/gradient_input_output_exclusions.py b/tensorflow/python/eager/gradient_input_output_exclusions.py
index 5b9baf3545c71c..76a3da4adc9c47 100644
--- a/tensorflow/python/eager/gradient_input_output_exclusions.py
+++ b/tensorflow/python/eager/gradient_input_output_exclusions.py
@@ -97,6 +97,10 @@
     "CopyToMeshGrad",
     "Relayout",
     "RelayoutLike",
+    # Debug ops that similarly only appear in OSS, and fails the test in OSS.
+    "DebugGradientIdentity",
+    "DebugGradientRefIdentity",
+    "DebugIdentityV2",
 ]
 
 
diff --git a/tensorflow/python/eager/pywrap_gradient_exclusions.cc b/tensorflow/python/eager/pywrap_gradient_exclusions.cc
index d2e1c1296ad310..f259ae9cebb17f 100644
--- a/tensorflow/python/eager/pywrap_gradient_exclusions.cc
+++ b/tensorflow/python/eager/pywrap_gradient_exclusions.cc
@@ -50,7 +50,7 @@ auto OpGradientInfoInit(const T &a) {
 
 absl::optional<tensorflow::gtl::FlatSet<int>> OpGradientUnusedInputIndices(
     const tensorflow::string &op_name) {
-  static std::array<OpIndexInfo, 366> a = {{
+  static std::array<OpIndexInfo, 363> a = {{
       {"Acosh"},
       {"AllToAll", 1, {0}},
       {"ApproximateEqual"},
@@ -96,9 +96,6 @@ absl::optional<tensorflow::gtl::FlatSet<int>> OpGradientUnusedInputIndices(
       {"CropAndResize", 1, {3}},
       {"CrossReplicaSum", 1, {0}},
       {"Cumsum", 1, {0}},
-      {"DebugGradientIdentity"},
-      {"DebugGradientRefIdentity"},
-      {"DebugIdentityV2"},
       {"DecodeBase64"},
       {"DecodePaddedRaw"},
       {"DecodeProtoV2"},
@@ -429,7 +426,7 @@ absl::optional<tensorflow::gtl::FlatSet<int>> OpGradientUnusedInputIndices(
 
 absl::optional<tensorflow::gtl::FlatSet<int>> OpGradientUnusedOutputIndices(
     const tensorflow::string &op_name) {
-  static std::array<OpIndexInfo, 485> a = {{
+  static std::array<OpIndexInfo, 482> a = {{
       {"Abs"},
       {"AccumulateNV2"},
       {"Acos"},
@@ -507,9 +504,6 @@ absl::optional<tensorflow::gtl::FlatSet<int>> OpGradientUnusedOutputIndices(
       {"CrossReplicaSum"},
       {"Cumprod"},
       {"Cumsum"},
-      {"DebugGradientIdentity"},
-      {"DebugGradientRefIdentity"},
-      {"DebugIdentityV2"},
       {"DecodeBase64"},
       {"DecodePaddedRaw"},
       {"DecodeRaw"},
diff --git a/tensorflow/python/ops/BUILD b/tensorflow/python/ops/BUILD
index ca03c9485561b1..945a303d36735f 100644
--- a/tensorflow/python/ops/BUILD
+++ b/tensorflow/python/ops/BUILD
@@ -1634,8 +1634,6 @@ py_strict_library(
         ":tensor_array_ops",
         ":unconnected_gradients",
         ":while_loop",
-        "//tensorflow/python/debug/lib:debug_gradients",
-        "//tensorflow/python/debug/lib:dumping_callback",
         "//tensorflow/python/framework:dtypes",
         "//tensorflow/python/framework:ops",
         "//tensorflow/python/ops/linalg/sparse:sparse_csr_matrix_grad",
diff --git a/tensorflow/python/ops/gradients_impl.py b/tensorflow/python/ops/gradients_impl.py
index 9812b382093768..5a69feae861089 100644
--- a/tensorflow/python/ops/gradients_impl.py
+++ b/tensorflow/python/ops/gradients_impl.py
@@ -14,8 +14,6 @@
 # ==============================================================================
 """Implements the graph generation for computation of gradients."""
 
-from tensorflow.python.debug.lib import debug_gradients  # pylint: disable=unused-import
-from tensorflow.python.debug.lib import dumping_callback  # pylint: disable=unused-import
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_grad  # pylint: disable=unused-import

From 297ec1f82d90599d24033a0bbb677f79565ffeb9 Mon Sep 17 00:00:00 2001
From: Junwhan Ahn <junwhan@google.com>
Date: Tue, 26 Sep 2023 11:17:08 -0700
Subject: [PATCH 277/567] Release GIL while destructing IFRT objects

Holding the GIL while destructing IFRT executables is particularly problematic, since destructing `LoadedExecutable` requires waiting for all in-flight host callbacks to complete, and those callbacks may themselves be blocked waiting for the GIL.

PiperOrigin-RevId: 568596172
---
 third_party/xla/xla/python/py_executable.cc    | 4 ++++
 third_party/xla/xla/python/py_host_callback.cc | 2 ++
 2 files changed, 6 insertions(+)

diff --git a/third_party/xla/xla/python/py_executable.cc b/third_party/xla/xla/python/py_executable.cc
index 1dcdb136e01a5f..712d41fbffe000 100644
--- a/third_party/xla/xla/python/py_executable.cc
+++ b/third_party/xla/xla/python/py_executable.cc
@@ -87,6 +87,10 @@ PyLoadedExecutable::~PyLoadedExecutable() {
   if (next_) {
     next_->prev_ = prev_;
   }
+  {
+    py::gil_scoped_release gil_release;
+    ifrt_loaded_executable_.reset();
+  }
 }
 
 std::vector<ClientAndPtr<PjRtDevice>> PyLoadedExecutable::AddressableDevices()
diff --git a/third_party/xla/xla/python/py_host_callback.cc b/third_party/xla/xla/python/py_host_callback.cc
index e149742659aa11..757b8e2f4d902c 100644
--- a/third_party/xla/xla/python/py_host_callback.cc
+++ b/third_party/xla/xla/python/py_host_callback.cc
@@ -204,6 +204,8 @@ PyHostSendAndRecvLoadedHostCallback::PyHostSendAndRecvLoadedHostCallback(
 PyHostSendAndRecvLoadedHostCallback::~PyHostSendAndRecvLoadedHostCallback() {
   GlobalPyRefManager()->AddGarbage(
       absl::MakeSpan(static_cast<pybind11::object*>(&callable_), 1));
+  GlobalPyRefManager()->AddGarbage(
+      absl::MakeSpan(static_cast<pybind11::object*>(&serializer_), 1));
 }
 
 StatusOr<std::string> PyHostSendAndRecvLoadedHostCallback::Serialize() const {

From e77f4b676cb98ccf0a252d6fd869cd1a978aa434 Mon Sep 17 00:00:00 2001
From: Edward Schwartz <schwartzedward@google.com>
Date: Tue, 26 Sep 2023 11:40:22 -0700
Subject: [PATCH 278/567] Move replicate_constants_pass to
 POST_REWRITE_FOR_EXEC

To support TF1 feeding/fetching (handled in the "rewrite for execution" step), where nodes (including Const nodes) can be replaced by feed nodes, the replicate_constants_pass should run after "rewrite for execution" so that it only applies to nodes that are still Const nodes once the feed nodes are in place.

Also, avoid a segfault if there is no graph.

PiperOrigin-RevId: 568603662
---
 .../core/common_runtime/optimization_registry_test.cc       | 2 ++
 tensorflow/core/common_runtime/replicate_constants_pass.cc  | 6 +++++-
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/tensorflow/core/common_runtime/optimization_registry_test.cc b/tensorflow/core/common_runtime/optimization_registry_test.cc
index 6f41d9f026a978..d57611db5b0b96 100644
--- a/tensorflow/core/common_runtime/optimization_registry_test.cc
+++ b/tensorflow/core/common_runtime/optimization_registry_test.cc
@@ -70,6 +70,8 @@ class OptimizationPassTest : public ::testing::Test {
   void RunPass() {
     GraphOptimizationPassOptions options;
     options.flib_def = flib_def_.get();
+    // Note that options.graph is not set so this test checks that passes
+    // properly handle this being nullptr (esp. Segfault is avoided).
     EXPECT_EQ(OkStatus(),
               OptimizationPassRegistry::Global()->RunGrouping(
                   OptimizationPassRegistry::POST_REWRITE_FOR_EXEC, options));
diff --git a/tensorflow/core/common_runtime/replicate_constants_pass.cc b/tensorflow/core/common_runtime/replicate_constants_pass.cc
index 73f96d66f940bb..9264d67461eaf8 100644
--- a/tensorflow/core/common_runtime/replicate_constants_pass.cc
+++ b/tensorflow/core/common_runtime/replicate_constants_pass.cc
@@ -124,6 +124,10 @@ Status ReplicateConstantsPass::Run(
              "number-of-elements <= "
           << kMaxSize;
 
+  if (options.graph == nullptr) {
+    VLOG(1) << "No graph in replicate_constants_pass.";
+    return OkStatus();
+  }
   Graph* graph = options.graph->get();
   if (VLOG_IS_ON(1)) {
     VLOG(1) << DumpGraphToFile("before_replicate_constants_pass", *graph,
@@ -183,7 +187,7 @@ Status ReplicateConstantsPass::Run(
   return OkStatus();
 }
 
-REGISTER_OPTIMIZATION(OptimizationPassRegistry::POST_PLACEMENT, 3,
+REGISTER_OPTIMIZATION(OptimizationPassRegistry::POST_REWRITE_FOR_EXEC, 3,
                       ReplicateConstantsPass);
 
 }  // namespace tensorflow

From 742e788aaabec6f9769d7c728765ac919cac5ecb Mon Sep 17 00:00:00 2001
From: Mason Chang <masonchang@google.com>
Date: Tue, 26 Sep 2023 11:49:59 -0700
Subject: [PATCH 279/567] Extract TPU Bridge pipeline into its own file

PiperOrigin-RevId: 568606553
---
 tensorflow/compiler/mlir/tensorflow/BUILD     |   1 +
 .../compiler/mlir/tensorflow/transforms/BUILD |   5 +-
 .../mlir/tensorflow/transforms/bridge.cc      | 147 +-------------
 .../compiler/mlir/tf2xla/internal/BUILD       |  70 ++++---
 .../internal/clustering_bridge_passes.cc      | 181 ++++++++++++++++++
 .../internal/clustering_bridge_passes.h       |  34 ++++
 .../internal/clustering_bridge_passes_test.cc |  36 ++++
 tensorflow/compiler/mlir/tfrt/BUILD           |   1 +
 tensorflow/compiler/tf2xla/BUILD              |   2 +
 9 files changed, 311 insertions(+), 166 deletions(-)
 create mode 100644 tensorflow/compiler/mlir/tf2xla/internal/clustering_bridge_passes.cc
 create mode 100644 tensorflow/compiler/mlir/tf2xla/internal/clustering_bridge_passes.h
 create mode 100644 tensorflow/compiler/mlir/tf2xla/internal/clustering_bridge_passes_test.cc

diff --git a/tensorflow/compiler/mlir/tensorflow/BUILD b/tensorflow/compiler/mlir/tensorflow/BUILD
index 927708cea51ff4..4a87632a35770c 100644
--- a/tensorflow/compiler/mlir/tensorflow/BUILD
+++ b/tensorflow/compiler/mlir/tensorflow/BUILD
@@ -1573,6 +1573,7 @@ tf_cc_test(
     deps = [
         ":dump_mlir_util",
         ":tensorflow",
+        "//tensorflow/compiler/mlir/tensorflow/transforms:bridge",
         "//tensorflow/compiler/mlir/tensorflow/transforms:tensorflow_passes",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/BUILD b/tensorflow/compiler/mlir/tensorflow/transforms/BUILD
index 8c889570459f8c..d34d397cb815a5 100644
--- a/tensorflow/compiler/mlir/tensorflow/transforms/BUILD
+++ b/tensorflow/compiler/mlir/tensorflow/transforms/BUILD
@@ -74,6 +74,7 @@ cc_library(
         "mlprogram.h",
     ],
     deps = [
+        ":bridge",
         ":tensorflow_passes",
         ":tf_saved_model_passes",
         "//tensorflow/compiler/mlir/tf2xla:compile_mlir_util",
@@ -438,7 +439,6 @@ cc_library(
         "annotate_parameter_replication.cc",
         "batchmatmul_to_einsum.cc",
         "breakup-islands.cc",
-        "bridge.cc",
         "canonicalize_compile_and_replicate_attributes.cc",
         "check_control_dependencies.cc",
         "cluster_formation.cc",
@@ -554,7 +554,6 @@ cc_library(
         "xla_validate_inputs.cc",
     ],
     hdrs = [
-        "bridge.h",
         "cluster_ops_by_policy.h",
         "collection_ops_util.h",
         "einsum.h",
@@ -690,6 +689,7 @@ cc_library(
         "//tensorflow/compiler/mlir/tensorflow:dump_mlir_util",
         "//tensorflow/compiler/mlir/tensorflow:error_util",
         "//tensorflow/compiler/mlir/tensorflow:tensorflow_types",
+        "//tensorflow/compiler/mlir/tf2xla/internal:clustering_bridge_passes",
         "//tensorflow/compiler/mlir/tf2xla/internal/inference:inference_metrics_pass",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib_proto_parsing",
@@ -792,6 +792,7 @@ cc_library(
         "bridge_pass.cc",
     ],
     deps = [
+        ":bridge",
         ":tensorflow_passes",
         "//tensorflow/compiler/mlir/tensorflow:error_util",
         "@llvm-project//mlir:Pass",
diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/bridge.cc b/tensorflow/compiler/mlir/tensorflow/transforms/bridge.cc
index 7e56a6496b5292..210ff858e84286 100644
--- a/tensorflow/compiler/mlir/tensorflow/transforms/bridge.cc
+++ b/tensorflow/compiler/mlir/tensorflow/transforms/bridge.cc
@@ -29,6 +29,7 @@ limitations under the License.
 #include "tensorflow/compiler/mlir/tensorflow/utils/data_dumper_logger_config.h"
 #include "tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.h"
 #include "tensorflow/compiler/mlir/tensorflow/utils/error_util.h"
+#include "tensorflow/compiler/mlir/tf2xla/internal/clustering_bridge_passes.h"
 #include "tensorflow/compiler/mlir/tf2xla/internal/inference/inference_passes.h"
 #include "tensorflow/core/framework/metrics.h"
 #include "tensorflow/core/platform/error_payloads.h"
@@ -130,153 +131,14 @@ tensorflow::Status RunTFXLABridge(
   return diag_handler.ConsumeStatus();
 }
 
-void CreateTPUBridgePipelineImpl(
-    OpPassManager &pm, llvm::StringRef module_name = llvm::StringRef()) {
-  // The following ops must be preserved regardless of reachability. Ideally,
-  // all graphs should have control dependencies to enforce this but this is
-  // currently not the case (see b/177478741).
-  const llvm::SmallVector<std::string, 4> ops_to_preserve = {
-      "tf.TPUReplicateMetadata", "tf.TPUCompilationResult",
-      "tf.TPUReplicatedOutput"};
-  bool strict_clusters =
-      tensorflow::GetMlirCommonFlags()->tf_mlir_enable_strict_clusters;
-  pm.addNestedPass<func::FuncOp>(
-      tf_executor::CreateTFExecutorGraphPruningPass(ops_to_preserve));
-  // It is assumed at this stage there are no V1 control flow ops as Graph
-  // functionalization is ran before import. Ops can be lifted out of
-  // tf_executor dialect islands/graphs.
-  pm.addNestedPass<func::FuncOp>(
-      CreateExecutorDialectToFunctionalConversionPass());
-  // Guarantee all functions have one use, which enables more exact shape
-  // inference.
-  pm.addPass(mlir::TF::CreateGuaranteeAllFuncsOneUsePass());
-  // Run shape inference so that tf_executor/tf_device ops created later will
-  // likely to inherit more concrete types.
-  pm.addPass(TF::CreateTFShapeInferencePass());
-  pm.addNestedPass<func::FuncOp>(CreateTPUPartitionedOpConversionPass());
-  pm.addNestedPass<func::FuncOp>(
-      CreateTPUReorderReplicateAndPartitionedInputsPass());
-  pm.addNestedPass<func::FuncOp>(TF::CreateDecomposeReduceDatasetPass());
-  // Only one of EmbeddingSequencing and EmbeddingPipelining will actually
-  // run and the logic is in EmbeddingPipeliningPass. If the pipelining pass
-  // runs, embedding attributes are stripped and the sequencing pass will have
-  // no effect. If the pipelining pass doesn't run, embedding attributes are
-  // preserved and the sequencing rewrite will trigger.
-  pm.addPass(TFDevice::CreateEmbeddingPipeliningPass());
-  pm.addPass(TFDevice::CreateEmbeddingSequencingPass());
-  pm.addPass(CreateTPUClusterFormationPass(strict_clusters));
-  // CreateEmbeddingPipeliningPass may have created more functions, but
-  // TPUClusterCleanup and OutsideCompiledToHostLaunch need every function to be
-  // only called from one cluster. Here, we choose to fix the all-funcs-one-use
-  // invariant right before it's needed, not after it's been broken.
-  pm.addPass(mlir::TF::CreateGuaranteeAllFuncsOneUsePass());
-  // Run TPU cluster cleanup attributes so ops with no outside compiled
-  // attribute have no host device attribute.
-  pm.addPass(CreateTPUClusterCleanupAttributesPass());
-  pm.addPass(TFDevice::CreateOutsideCompiledToHostLaunchPass());
-  pm.addNestedPass<func::FuncOp>(TFDevice::CreateDeviceAttributeToLaunchPass());
-  // Running canonicalizer before decomposing resource ops in cluster helps the
-  // latter pass to converge faster as it does not have to spend time folding
-  // away dead ops.
-  pm.addNestedPass<func::FuncOp>(createCanonicalizerPass());
-  // Place DecomposeResourceOpsPass before TFExecutorConstantSinking pass
-  // because DecomposeResourceOpsPass uses pattern rewriter which hoists
-  // changed constants out of tf_device.Launch.
-  pm.addPass(TFDevice::CreateDecomposeResourceOpsInClusterPass());
-  // Encode this in its own scope so that func_pm is not mistakenly used
-  // later on.
-  {
-    OpPassManager &func_pm = pm.nest<func::FuncOp>();
-    func_pm.addPass(CreateTPUHostComputationExpansionPass());
-    func_pm.addPass(CreateTPUUpdateEmbeddingEnqueueOpInputsPass());
-  }
-  // TODO(b/173622615): This should incrementally be moved down as
-  // more passes support this representation and then can be removed once
-  // all passes support it.
-  pm.addPass(TFDevice::CreateHostLaunchToOutsideCompiledPass());
-
-  // TODO(b/173622615): Once OutsideCompilation is represented by launch op and
-  // the remaining passes including Inliner support it, remove this
-  // LaunchToDeviceAttributePass. This LaunchToDeviceAttribute pass needs to
-  // come before TPUClusterCleanupAttributes pass or else the device attribute
-  // will be removed from launch causing an error.
-  pm.addNestedPass<func::FuncOp>(TFDevice::CreateLaunchToDeviceAttributePass());
-
-  // TODO(b/173622615): This can be removed once more passes support outside
-  // compilation represented by op and conversion back to attribute is removed.
-  pm.addPass(TFDevice::CreateOutsideCompiledToHostLaunchPass());
-  // Note that the region-based control-flow produced here still contains
-  // function call ops which get inlined by the subsequent inliner pass.
-  pm.addPass(TF::CreateTFFunctionalControlFlowToRegions());
-  pm.addPass(mlir::createInlinerPass());
-  pm.addNestedPass<func::FuncOp>(
-      TF::CreateDropWhileShapeInvariantInDeviceClusterPass());
-  // Run another shape inference pass because resource decomposition might have
-  // created new partial types. Also, after dropping `shape_invariant` attribute
-  // from While/WhileRegion ops within cluster would lead to more precise
-  // shapes.
-  pm.addPass(TF::CreateTFShapeInferencePass());
-  pm.addNestedPass<func::FuncOp>(createCanonicalizerPass());
-  pm.addPass(CreateTPUClusterCleanupAttributesPass());
-  pm.addPass(TFDevice::CreateResourceOpLiftingPass());
-  // Re-run the canonicalizer pass as some cleanup during resource op lifting
-  // pass opens up some opportunities for canonicalization of cluster ops.
-  // Specifically, we want to eliminate pass through results from the cluster
-  // op.
-  pm.addNestedPass<func::FuncOp>(createCanonicalizerPass());
-
-  pm.addNestedPass<func::FuncOp>(createCSEPass());
-  if (tensorflow::GetMlirCommonFlags()
-          ->tf_mlir_enable_merge_control_flow_pass) {
-    pm.addPass(TFDevice::CreateMergeControlFlowPass());
-  }
-
-  // TODO(b/173622615): This should incrementally be moved down as
-  // more passes support this representation and then can be removed once
-  // all passes support it.
-  pm.addPass(TFDevice::CreateHostLaunchToOutsideCompiledPass());
-
-  pm.addPass(TFDevice::CreateMarkOpsForOutsideCompilationPass());
-  pm.addPass(TFDevice::CreateExtractHeadTailOutsideCompilationPass());
-  pm.addPass(TFDevice::CreateExtractOutsideCompilationPass());
-  pm.addNestedPass<func::FuncOp>(
-      TFDevice::CreateVerifyNoOutsideCompilationMarkersPass());
-
-  pm.addNestedPass<func::FuncOp>(TFDevice::CreateClusterConstantSinkingPass());
-  pm.addPass(TF::CreateResourceDeviceInferencePass());
-  pm.addPass(TFDevice::CreateClusterOutliningPass());
-  pm.addPass(CreateTPUResourceReadForWritePass());
-  pm.addPass(TFDevice::CreateMarkInputOutputAliasesPass());
-  pm.addPass(CreateTPUShardingIdentificationPass());
-  pm.addNestedPass<func::FuncOp>(
-      CreateTPUResourceReadsWritesPartitioningPass());
-  pm.addPass(TFDevice::CreateAnnotateParameterReplicationPass());
-  pm.addNestedPass<mlir::func::FuncOp>(
-      mlir::TF::CreateRewriteTPUEmbeddingOpsPass());
-  pm.addPass(CreateTPUAnnotateDynamicShapeInputsPass());
-  pm.addNestedPass<func::FuncOp>(
-      TF::CreateHoistReplicateInvariantResourceWritesPass());
-
-  pm.addPass(CreateTPURewritePass(module_name));
-  pm.addPass(createSymbolDCEPass());
-  pm.addNestedPass<func::FuncOp>(TFDevice::CreateEmbeddingProgramKeyPass());
-  pm.addNestedPass<func::FuncOp>(
-      TFDevice::CreateReplicateInvariantOpHoistingPass());
-  pm.addPass(CreateTPUMergeVariablesWithExecutePass());
-  pm.addNestedPass<func::FuncOp>(CreateExtractTPUCopyWithDynamicShapeOpPass());
-  pm.addNestedPass<func::FuncOp>(CreateTPUColocateCompositeResourceOps());
-  if (tensorflow::GetMlirCommonFlags()
-          ->tf_mlir_enable_tpu_variable_runtime_reformatting_pass) {
-    pm.addPass(CreateTPUVariableRuntimeReformattingPass());
-  }
-}
 }  // namespace
 
 void CreateTPUBridgePipeline(OpPassManager &pm, llvm::StringRef module_name) {
   pm.addPass(CreateTPUValidateInputsPass());
   pm.addNestedPass<func::FuncOp>(
       TF::CreateCanonicalizeCompileAndReplicateAttributesPass());
-  CreateTPUBridgePipelineImpl(pm, module_name);
+  tensorflow::tf2xla::internal::AddBridgeClusteringPipelinePasses(pm,
+                                                                  module_name);
 }
 
 void CreateTPUBridgePipelineV1(OpPassManager &pm) {
@@ -297,7 +159,8 @@ void CreateTPUBridgePipelineV1(OpPassManager &pm) {
   pm.addPass(tf_executor::CreateTFExecutorTPUV1IslandCoarseningPass());
   pm.addPass(tf_executor::CreateTFExecutorTPUV1IslandOutliningPass());
   OpPassManager &nested_module = pm.nest<ModuleOp>();
-  CreateTPUBridgePipelineImpl(nested_module);
+  tensorflow::tf2xla::internal::AddBridgeClusteringPipelinePasses(
+      nested_module);
 
   pm.addPass(tf_executor::CreateTFExecutorTPUV1IslandInliningPass());
   // There are cases where we don't consume all compilation and replication
diff --git a/tensorflow/compiler/mlir/tf2xla/internal/BUILD b/tensorflow/compiler/mlir/tf2xla/internal/BUILD
index 45942908060caf..145a0510e034ac 100644
--- a/tensorflow/compiler/mlir/tf2xla/internal/BUILD
+++ b/tensorflow/compiler/mlir/tf2xla/internal/BUILD
@@ -4,6 +4,7 @@ load("//tensorflow/core/platform:rules_cc.bzl", "cc_library")
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
     default_visibility = [
+        "//tensorflow/compiler/mlir/tensorflow/transforms:__pkg__",
         "//tensorflow/compiler/mlir/tf2xla/api/v1:__subpackages__",
         "//tensorflow/compiler/mlir/tf2xla/api/v2:__subpackages__",
     ],
@@ -106,49 +107,49 @@ cc_library(
     ],
 )
 
-cc_library(
-    name = "legalize_tf_to_hlo",
-    srcs = ["legalize_tf_to_hlo.cc"],
-    hdrs = ["legalize_tf_to_hlo.h"],
+tf_cc_test(
+    name = "legalize_tf_mlir_test",
+    srcs = ["legalize_tf_mlir_test.cc"],
     deps = [
-        ":compilation_timer",
         ":legalize_tf_mlir",
-        "//tensorflow/compiler/mlir/tf2xla/api/v1:compile_tf_graph",
-        "//tensorflow/compiler/tf2xla:layout_util",
+        ":test_matchers",
+        "//tensorflow/compiler/jit",
+        "//tensorflow/compiler/mlir/tf2xla:compile_mlir_util",
+        "//tensorflow/compiler/tf2xla:xla_compiler",
         "//tensorflow/compiler/tf2xla:xla_helpers",
-        "//tensorflow/compiler/tf2xla:xla_tpu_backend_registration",
         "//tensorflow/core:framework",
-        "//tensorflow/core:lib",
-        "//tensorflow/core/platform:status",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core/lib/monitoring:cell_reader",
         "//tensorflow/core/protobuf/tpu:compile_metadata_proto_cc",
         "//tensorflow/core/tpu/kernels:tpu_compile_op_support",
-        "@llvm-project//llvm:Support",
+        "@com_google_googletest//:gtest",
         "@llvm-project//mlir:Pass",
         "@local_tsl//tsl/platform:statusor",
         "@local_xla//xla:shape_util",
-        "@local_xla//xla/client:compile_only_client",
     ],
 )
 
-tf_cc_test(
-    name = "legalize_tf_mlir_test",
-    srcs = ["legalize_tf_mlir_test.cc"],
+cc_library(
+    name = "legalize_tf_to_hlo",
+    srcs = ["legalize_tf_to_hlo.cc"],
+    hdrs = ["legalize_tf_to_hlo.h"],
     deps = [
+        ":compilation_timer",
         ":legalize_tf_mlir",
-        ":test_matchers",
-        "//tensorflow/compiler/jit",
-        "//tensorflow/compiler/mlir/tf2xla:compile_mlir_util",
-        "//tensorflow/compiler/tf2xla:xla_compiler",
+        "//tensorflow/compiler/mlir/tf2xla/api/v1:compile_tf_graph",
+        "//tensorflow/compiler/tf2xla:layout_util",
         "//tensorflow/compiler/tf2xla:xla_helpers",
+        "//tensorflow/compiler/tf2xla:xla_tpu_backend_registration",
         "//tensorflow/core:framework",
-        "//tensorflow/core:test_main",
-        "//tensorflow/core/lib/monitoring:cell_reader",
+        "//tensorflow/core:lib",
+        "//tensorflow/core/platform:status",
         "//tensorflow/core/protobuf/tpu:compile_metadata_proto_cc",
         "//tensorflow/core/tpu/kernels:tpu_compile_op_support",
-        "@com_google_googletest//:gtest",
+        "@llvm-project//llvm:Support",
         "@llvm-project//mlir:Pass",
         "@local_tsl//tsl/platform:statusor",
         "@local_xla//xla:shape_util",
+        "@local_xla//xla/client:compile_only_client",
     ],
 )
 
@@ -177,3 +178,28 @@ tf_cc_test(
         "@local_xla//xla/stream_executor",
     ],
 )
+
+cc_library(
+    name = "clustering_bridge_passes",
+    srcs = ["clustering_bridge_passes.cc"],
+    hdrs = ["clustering_bridge_passes.h"],
+    deps = [
+        "//tensorflow/compiler/jit:flags_headers",
+        "//tensorflow/compiler/mlir/tensorflow/transforms:tensorflow_passes",
+        "//tensorflow/compiler/mlir/tensorflow/transforms:verify_no_outside_compilation_markers_pass",
+        "@llvm-project//llvm:Support",
+        "@llvm-project//mlir:FuncDialect",
+        "@llvm-project//mlir:Pass",
+        "@llvm-project//mlir:Transforms",
+    ],
+)
+
+tf_cc_test(
+    name = "clustering_bridge_passes_test",
+    srcs = ["clustering_bridge_passes_test.cc"],
+    deps = [
+        ":clustering_bridge_passes",
+        "@com_google_googletest//:gtest_main",
+        "@llvm-project//mlir:Pass",
+    ],
+)
diff --git a/tensorflow/compiler/mlir/tf2xla/internal/clustering_bridge_passes.cc b/tensorflow/compiler/mlir/tf2xla/internal/clustering_bridge_passes.cc
new file mode 100644
index 00000000000000..1d3d218c23d90d
--- /dev/null
+++ b/tensorflow/compiler/mlir/tf2xla/internal/clustering_bridge_passes.cc
@@ -0,0 +1,181 @@
+/* Copyright 2023 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/mlir/tf2xla/internal/clustering_bridge_passes.h"
+
+#include <string>
+
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"  // from @llvm-project
+#include "mlir/Pass/PassManager.h"  // from @llvm-project
+#include "mlir/Transforms/Passes.h"  // from @llvm-project
+#include "tensorflow/compiler/jit/flags.h"
+#include "tensorflow/compiler/mlir/tensorflow/transforms/passes.h"
+
+namespace tensorflow {
+namespace tf2xla {
+namespace internal {
+
+using mlir::OpPassManager;
+using mlir::func::FuncOp;
+
+// Adds Bridge clustering pipeline passes to the given pass_manager. Does not
+// run them.
+void AddBridgeClusteringPipelinePasses(OpPassManager& pm,
+                                       llvm::StringRef module_name) {
+  // The following ops must be preserved regardless of reachability. Ideally,
+  // all graphs should have control dependencies to enforce this but this is
+  // currently not the case (see b/177478741).
+  const llvm::SmallVector<std::string, 4> ops_to_preserve = {
+      "tf.TPUReplicateMetadata", "tf.TPUCompilationResult",
+      "tf.TPUReplicatedOutput"};
+  bool strict_clusters =
+      tensorflow::GetMlirCommonFlags()->tf_mlir_enable_strict_clusters;
+  pm.addNestedPass<FuncOp>(
+      mlir::tf_executor::CreateTFExecutorGraphPruningPass(ops_to_preserve));
+  // It is assumed at this stage there are no V1 control flow ops as Graph
+  // functionalization is run before import. Ops can be lifted out of
+  // tf_executor dialect islands/graphs.
+  pm.addNestedPass<FuncOp>(
+      mlir::CreateExecutorDialectToFunctionalConversionPass());
+  // Guarantee all functions have one use, which enables more exact shape
+  // inference.
+  pm.addPass(mlir::TF::CreateGuaranteeAllFuncsOneUsePass());
+  // Run shape inference so that tf_executor/tf_device ops created later will
+  // likely inherit more concrete types.
+  pm.addPass(mlir::TF::CreateTFShapeInferencePass());
+  pm.addNestedPass<FuncOp>(mlir::TFTPU::CreateTPUPartitionedOpConversionPass());
+  pm.addNestedPass<FuncOp>(
+      mlir::TFTPU::CreateTPUReorderReplicateAndPartitionedInputsPass());
+  pm.addNestedPass<FuncOp>(mlir::TF::CreateDecomposeReduceDatasetPass());
+  // Only one of EmbeddingSequencing and EmbeddingPipelining will actually
+  // run and the logic is in EmbeddingPipeliningPass. If the pipelining pass
+  // runs, embedding attributes are stripped and the sequencing pass will have
+  // no effect. If the pipelining pass doesn't run, embedding attributes are
+  // preserved and the sequencing rewrite will trigger.
+  pm.addPass(mlir::TFDevice::CreateEmbeddingPipeliningPass());
+  pm.addPass(mlir::TFDevice::CreateEmbeddingSequencingPass());
+  pm.addPass(mlir::TFTPU::CreateTPUClusterFormationPass(strict_clusters));
+  // CreateEmbeddingPipeliningPass may have created more functions, but
+  // TPUClusterCleanup and OutsideCompiledToHostLaunch need every function to be
+  // only called from one cluster. Here, we choose to fix the all-funcs-one-use
+  // invariant right before it's needed, not after it's been broken.
+  pm.addPass(mlir::TF::CreateGuaranteeAllFuncsOneUsePass());
+  // Run TPU cluster cleanup attributes so ops with no outside compiled
+  // attribute have no host device attribute.
+  pm.addPass(mlir::TFTPU::CreateTPUClusterCleanupAttributesPass());
+  pm.addPass(mlir::TFDevice::CreateOutsideCompiledToHostLaunchPass());
+  pm.addNestedPass<FuncOp>(mlir::TFDevice::CreateDeviceAttributeToLaunchPass());
+  // Running canonicalizer before decomposing resource ops in cluster helps the
+  // latter pass to converge faster as it does not have to spend time folding
+  // away dead ops.
+  pm.addNestedPass<FuncOp>(mlir::createCanonicalizerPass());
+  // Place DecomposeResourceOpsPass before TFExecutorConstantSinking pass
+  // because DecomposeResourceOpsPass uses pattern rewriter which hoists
+  // changed constants out of tf_device.Launch.
+  pm.addPass(mlir::TFDevice::CreateDecomposeResourceOpsInClusterPass());
+  // Encode this in its own scope so that func_pm is not mistakenly used
+  // later on.
+  {
+    OpPassManager& func_pm = pm.nest<FuncOp>();
+    func_pm.addPass(mlir::TFTPU::CreateTPUHostComputationExpansionPass());
+    func_pm.addPass(mlir::TFTPU::CreateTPUUpdateEmbeddingEnqueueOpInputsPass());
+  }
+  // TODO(b/173622615): This should incrementally be moved down as
+  // more passes support this representation and then can be removed once
+  // all passes support it.
+  pm.addPass(mlir::TFDevice::CreateHostLaunchToOutsideCompiledPass());
+
+  // TODO(b/173622615): Once OutsideCompilation is represented by launch op and
+  // the remaining passes including Inliner support it, remove this
+  // LaunchToDeviceAttributePass. This LaunchToDeviceAttribute pass needs to
+  // come before TPUClusterCleanupAttributes pass or else the device attribute
+  // will be removed from launch causing an error.
+  pm.addNestedPass<FuncOp>(mlir::TFDevice::CreateLaunchToDeviceAttributePass());
+
+  // TODO(b/173622615): This can be removed once more passes support outside
+  // compilation represented by op and conversion back to attribute is removed.
+  pm.addPass(mlir::TFDevice::CreateOutsideCompiledToHostLaunchPass());
+  // Note that the region-based control-flow produced here still contains
+  // function call ops which get inlined by the subsequent inliner pass.
+  pm.addPass(mlir::TF::CreateTFFunctionalControlFlowToRegions());
+  pm.addPass(mlir::createInlinerPass());
+  pm.addNestedPass<FuncOp>(
+      mlir::TF::CreateDropWhileShapeInvariantInDeviceClusterPass());
+  // Run another shape inference pass because resource decomposition might have
+  // created new partial types. Also, dropping the `shape_invariant` attribute
+  // from While/WhileRegion ops within cluster would lead to more precise
+  // shapes.
+  pm.addPass(mlir::TF::CreateTFShapeInferencePass());
+  pm.addNestedPass<FuncOp>(mlir::createCanonicalizerPass());
+  pm.addPass(mlir::TFTPU::CreateTPUClusterCleanupAttributesPass());
+  pm.addPass(mlir::TFDevice::CreateResourceOpLiftingPass());
+  // Re-run the canonicalizer pass as some cleanup during resource op lifting
+  // pass opens up some opportunities for canonicalization of cluster ops.
+  // Specifically, we want to eliminate pass through results from the cluster
+  // op.
+  pm.addNestedPass<FuncOp>(mlir::createCanonicalizerPass());
+
+  pm.addNestedPass<FuncOp>(mlir::createCSEPass());
+  if (tensorflow::GetMlirCommonFlags()
+          ->tf_mlir_enable_merge_control_flow_pass) {
+    pm.addPass(mlir::TFDevice::CreateMergeControlFlowPass());
+  }
+
+  // TODO(b/173622615): This should incrementally be moved down as
+  // more passes support this representation and then can be removed once
+  // all passes support it.
+  pm.addPass(mlir::TFDevice::CreateHostLaunchToOutsideCompiledPass());
+
+  pm.addPass(mlir::TFDevice::CreateMarkOpsForOutsideCompilationPass());
+  pm.addPass(mlir::TFDevice::CreateExtractHeadTailOutsideCompilationPass());
+  pm.addPass(mlir::TFDevice::CreateExtractOutsideCompilationPass());
+  pm.addNestedPass<FuncOp>(
+      mlir::TFDevice::CreateVerifyNoOutsideCompilationMarkersPass());
+
+  pm.addNestedPass<FuncOp>(mlir::TFDevice::CreateClusterConstantSinkingPass());
+  pm.addPass(mlir::TF::CreateResourceDeviceInferencePass());
+  pm.addPass(mlir::TFDevice::CreateClusterOutliningPass());
+  pm.addPass(mlir::TFTPU::CreateTPUResourceReadForWritePass());
+  pm.addPass(mlir::TFDevice::CreateMarkInputOutputAliasesPass());
+  pm.addPass(mlir::TFTPU::CreateTPUShardingIdentificationPass());
+  pm.addNestedPass<FuncOp>(
+      mlir::TFTPU::CreateTPUResourceReadsWritesPartitioningPass());
+  pm.addPass(mlir::TFDevice::CreateAnnotateParameterReplicationPass());
+  pm.addNestedPass<FuncOp>(mlir::TF::CreateRewriteTPUEmbeddingOpsPass());
+  pm.addPass(mlir::TFTPU::CreateTPUAnnotateDynamicShapeInputsPass());
+  pm.addNestedPass<FuncOp>(
+      mlir::TF::CreateHoistReplicateInvariantResourceWritesPass());
+
+  pm.addPass(mlir::TFTPU::CreateTPURewritePass(module_name));
+  pm.addPass(mlir::createSymbolDCEPass());
+  pm.addNestedPass<FuncOp>(mlir::TFDevice::CreateEmbeddingProgramKeyPass());
+  pm.addNestedPass<FuncOp>(
+      mlir::TFDevice::CreateReplicateInvariantOpHoistingPass());
+  pm.addPass(mlir::TFTPU::CreateTPUMergeVariablesWithExecutePass());
+  pm.addNestedPass<FuncOp>(
+      mlir::TFTPU::CreateExtractTPUCopyWithDynamicShapeOpPass());
+  pm.addNestedPass<FuncOp>(
+      mlir::TFTPU::CreateTPUColocateCompositeResourceOps());
+  if (tensorflow::GetMlirCommonFlags()
+          ->tf_mlir_enable_tpu_variable_runtime_reformatting_pass) {
+    pm.addPass(mlir::TFTPU::CreateTPUVariableRuntimeReformattingPass());
+  }
+}
+
+};  // namespace internal
+};  // namespace tf2xla
+};  // namespace tensorflow
diff --git a/tensorflow/compiler/mlir/tf2xla/internal/clustering_bridge_passes.h b/tensorflow/compiler/mlir/tf2xla/internal/clustering_bridge_passes.h
new file mode 100644
index 00000000000000..bd3a04960deed9
--- /dev/null
+++ b/tensorflow/compiler/mlir/tf2xla/internal/clustering_bridge_passes.h
@@ -0,0 +1,34 @@
+/* Copyright 2023 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_MLIR_TF2XLA_INTERNAL_CLUSTERING_BRIDGE_PASSES_H_
+#define TENSORFLOW_COMPILER_MLIR_TF2XLA_INTERNAL_CLUSTERING_BRIDGE_PASSES_H_
+
+#include "llvm/ADT/StringRef.h"
+#include "mlir/Pass/PassManager.h"  // from @llvm-project
+
+namespace tensorflow {
+namespace tf2xla {
+namespace internal {
+
+// Given the pass manager, add Bridge passes to cluster the input.
+void AddBridgeClusteringPipelinePasses(
+    mlir::OpPassManager& pm, llvm::StringRef module_name = llvm::StringRef());
+
+};  // namespace internal
+};  // namespace tf2xla
+};  // namespace tensorflow
+
+#endif  // TENSORFLOW_COMPILER_MLIR_TF2XLA_INTERNAL_CLUSTERING_BRIDGE_PASSES_H_
diff --git a/tensorflow/compiler/mlir/tf2xla/internal/clustering_bridge_passes_test.cc b/tensorflow/compiler/mlir/tf2xla/internal/clustering_bridge_passes_test.cc
new file mode 100644
index 00000000000000..20e3033054ae4e
--- /dev/null
+++ b/tensorflow/compiler/mlir/tf2xla/internal/clustering_bridge_passes_test.cc
@@ -0,0 +1,36 @@
+/* Copyright 2023 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/mlir/tf2xla/internal/clustering_bridge_passes.h"
+
+#include <gtest/gtest.h>
+#include "mlir/Pass/PassManager.h"  // from @llvm-project
+
+namespace tensorflow {
+namespace tf2xla {
+namespace internal {
+
+using mlir::OpPassManager;
+
+TEST(ClusteringBridgePassesTest, AddsBridgePasses) {
+  OpPassManager pass_manager;
+  AddBridgeClusteringPipelinePasses(pass_manager);
+
+  EXPECT_EQ(pass_manager.size(), 54);
+}
+
+};  // namespace internal
+};  // namespace tf2xla
+};  // namespace tensorflow
diff --git a/tensorflow/compiler/mlir/tfrt/BUILD b/tensorflow/compiler/mlir/tfrt/BUILD
index c6ad67caeacfad..aba0aeb26972c7 100644
--- a/tensorflow/compiler/mlir/tfrt/BUILD
+++ b/tensorflow/compiler/mlir/tfrt/BUILD
@@ -361,6 +361,7 @@ cc_library(
         "//tensorflow/compiler/mlir/tensorflow:error_util",
         "//tensorflow/compiler/mlir/tensorflow:import_model",
         "//tensorflow/compiler/mlir/tensorflow:serialize_mlir_module_utils",
+        "//tensorflow/compiler/mlir/tensorflow/transforms:bridge",
         "//tensorflow/compiler/mlir/tensorflow/transforms:tensorflow_passes",
         "//tensorflow/compiler/mlir/tensorflow/transforms:tf_saved_model_asset_sinking_pass",
         "//tensorflow/core:framework",
diff --git a/tensorflow/compiler/tf2xla/BUILD b/tensorflow/compiler/tf2xla/BUILD
index 08a499f25c2083..4d59097a60381c 100644
--- a/tensorflow/compiler/tf2xla/BUILD
+++ b/tensorflow/compiler/tf2xla/BUILD
@@ -178,6 +178,7 @@ cc_library(
         "//tensorflow/compiler/mlir/tensorflow:import_model",
         "//tensorflow/compiler/mlir/tensorflow:import_utils",
         "//tensorflow/compiler/mlir/tensorflow:mlir_roundtrip_flags",
+        "//tensorflow/compiler/mlir/tensorflow/transforms:bridge",
         "//tensorflow/compiler/mlir/tensorflow/transforms:tensorflow_passes",
         "//tensorflow/compiler/mlir/tensorflow/transforms:tf_dialect_passes",
         "//tensorflow/compiler/mlir/tf2xla:compile_mlir_util",
@@ -1156,6 +1157,7 @@ cc_library(
         "//tensorflow/compiler/mlir:mlir_graph_optimization_pass",
         "//tensorflow/compiler/mlir/tensorflow",
         "//tensorflow/compiler/mlir/tensorflow:device_util",
+        "//tensorflow/compiler/mlir/tensorflow/transforms:bridge",
         "//tensorflow/compiler/mlir/tensorflow/transforms:tensorflow_passes",
         "//tensorflow/compiler/mlir/tf2xla:mlir_bridge_rollout_policy",
         "//tensorflow/core:core_cpu",

From 5d485ce42700b5c184214ad7e3e8228094619867 Mon Sep 17 00:00:00 2001
From: Eugene Zhulenev <ezhulenev@google.com>
Date: Tue, 26 Sep 2023 11:50:35 -0700
Subject: [PATCH 280/567] [stream_executor] NFC: Make command_buffer, event and
 kernel implementations private

PiperOrigin-RevId: 568606765
---
 third_party/xla/xla/stream_executor/BUILD     | 136 +++++++-----------
 .../xla/xla/stream_executor/cuda/BUILD        |  10 +-
 third_party/xla/xla/stream_executor/gpu/BUILD |   2 -
 third_party/xla/xla/stream_executor/kernel.cc |   5 +-
 third_party/xla/xla/stream_executor/kernel.h  |   4 +-
 .../xla/xla/stream_executor/rocm/BUILD        |   4 +-
 6 files changed, 58 insertions(+), 103 deletions(-)

diff --git a/third_party/xla/xla/stream_executor/BUILD b/third_party/xla/xla/stream_executor/BUILD
index 3d8b798c4b3389..e92cdaf32f09c2 100644
--- a/third_party/xla/xla/stream_executor/BUILD
+++ b/third_party/xla/xla/stream_executor/BUILD
@@ -247,6 +247,32 @@ filegroup(
 # implementation) or `stream_executor_headers` (only headers, if there is a reason not to link
 # implementation) if they want to use StreamExecutor.
 
+cc_library(
+    name = "command_buffer",
+    srcs = ["command_buffer.cc"],
+    hdrs = ["command_buffer.h"],
+    visibility = ["//visibility:public"],
+    deps = [
+        ":stream_executor_internal",
+        "//xla/stream_executor/platform",
+        "@com_google_absl//absl/status:statusor",
+        "@local_tsl//tsl/platform:statusor",
+    ],
+)
+
+cc_library(
+    name = "event",
+    srcs = ["event.cc"],
+    hdrs = ["event.h"],
+    visibility = ["//visibility:public"],
+    deps = [
+        ":stream_executor_headers",
+        ":stream_executor_internal",
+        "//xla/stream_executor/platform",
+        "@local_tsl//tsl/platform:status",
+    ],
+)
+
 cc_library(
     name = "executor_cache",
     srcs = ["executor_cache.cc"],
@@ -282,6 +308,27 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "kernel",
+    srcs = ["kernel.cc"],
+    hdrs = ["kernel.h"],
+    visibility = ["//visibility:public"],
+    deps = [
+        ":device_memory",
+        ":platform",
+        ":stream_executor_headers",
+        "//xla/stream_executor/platform",
+        "@com_google_absl//absl/log:check",
+        "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/types:span",
+        "@local_tsl//tsl/platform:errors",
+        "@local_tsl//tsl/platform:logging",
+        "@local_tsl//tsl/platform:platform_port",
+        "@local_tsl//tsl/platform:status",
+        "@local_tsl//tsl/platform:statusor",
+    ],
+)
+
 #===--------------------------------------------------------------------------------------------===#
 
 # The stream_executor_headers target does not prescribe an implementation.
@@ -437,6 +484,7 @@ cc_library(
         "@com_google_absl//absl/algorithm:container",
         "@com_google_absl//absl/base:core_headers",
         "@com_google_absl//absl/functional:any_invocable",
+        "@com_google_absl//absl/log:check",
         "@com_google_absl//absl/memory",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/synchronization",
@@ -470,6 +518,7 @@ cc_library(
         ":host_or_device_scalar",
         ":kernel",
         ":kernel_spec",
+        ":launch_dim",
         ":platform",
         ":plugin_registry",
         ":stream_executor_headers",
@@ -497,66 +546,6 @@ cc_library(
     ],
 )
 
-cc_library(
-    name = "kernel",
-    srcs = [
-        "fft.h",
-        "kernel.cc",
-        "plugin.h",
-        "stream.h",
-        "stream_executor_pimpl.h",
-        "temporary_device_memory.h",
-        "temporary_memory_manager.h",
-    ],
-    hdrs = [
-        "blas.h",
-        "device_description.h",
-        "device_options.h",
-        "event.h",
-        "kernel.h",
-        "kernel_spec.h",
-        "launch_dim.h",
-        "multi_platform_manager.h",
-        "platform.h",
-        "plugin_registry.h",
-        "stream_executor.h",
-        "stream_executor_internal.h",
-        "trace_listener.h",
-    ],
-    visibility = ["//visibility:public"],
-    deps = [
-        ":allocator_stats",
-        ":device_description",
-        ":device_description_proto_cc",
-        ":device_memory",
-        ":device_options",
-        ":fft",
-        ":kernel_spec",
-        ":launch_dim",
-        ":platform",
-        ":plugin_registry",
-        ":stream_executor_headers",
-        "//xla/stream_executor/platform",
-        "@com_google_absl//absl/algorithm:container",
-        "@com_google_absl//absl/base:core_headers",
-        "@com_google_absl//absl/functional:any_invocable",
-        "@com_google_absl//absl/log:check",
-        "@com_google_absl//absl/memory",
-        "@com_google_absl//absl/status",
-        "@com_google_absl//absl/strings",
-        "@com_google_absl//absl/synchronization",
-        "@com_google_absl//absl/types:optional",
-        "@com_google_absl//absl/types:span",
-        "@local_tsl//tsl/platform:env",
-        "@local_tsl//tsl/platform:errors",
-        "@local_tsl//tsl/platform:logging",
-        "@local_tsl//tsl/platform:platform_port",
-        "@local_tsl//tsl/platform:status",
-        "@local_tsl//tsl/platform:statusor",
-        "@local_tsl//tsl/protobuf:dnn_proto_cc",
-    ],
-)
-
 cc_library(
     name = "blas",
     srcs = ["blas.cc"],
@@ -572,32 +561,6 @@ cc_library(
     ],
 )
 
-cc_library(
-    name = "command_buffer",
-    srcs = ["command_buffer.cc"],
-    hdrs = ["command_buffer.h"],
-    visibility = ["//visibility:public"],
-    deps = [
-        ":stream_executor_internal",
-        "//xla/stream_executor/platform",
-        "@com_google_absl//absl/status:statusor",
-        "@local_tsl//tsl/platform:statusor",
-    ],
-)
-
-cc_library(
-    name = "event",
-    srcs = ["event.cc"],
-    hdrs = ["event.h"],
-    visibility = ["//visibility:public"],
-    deps = [
-        ":stream_executor_headers",
-        ":stream_executor_internal",
-        "//xla/stream_executor/platform",
-        "@local_tsl//tsl/platform:status",
-    ],
-)
-
 cc_library(
     name = "plugin_registry",
     srcs = ["plugin_registry.cc"],
@@ -746,6 +709,7 @@ cc_library(
         ":executor_cache",
         ":kernel",
         ":kernel_spec",
+        ":launch_dim",
         ":multi_platform_manager",
         ":platform",
         ":stream_executor_headers",
diff --git a/third_party/xla/xla/stream_executor/cuda/BUILD b/third_party/xla/xla/stream_executor/cuda/BUILD
index 01b9bb3c68a497..01f687965afbe1 100644
--- a/third_party/xla/xla/stream_executor/cuda/BUILD
+++ b/third_party/xla/xla/stream_executor/cuda/BUILD
@@ -217,7 +217,7 @@ cc_library(
     visibility = ["//visibility:public"],
     deps = if_cuda_is_configured([
         ":cuda_kernel",
-        "//xla/stream_executor:event",
+        "//xla/stream_executor:stream_executor_headers",
         "//xla/stream_executor/gpu:gpu_executor_header",
         "//xla/stream_executor/platform",
     ]),
@@ -309,7 +309,6 @@ cc_library(
         ":cuda_stream",
         ":cuda_helpers",
         "@local_config_cuda//cuda:cuda_headers",
-        "//xla/stream_executor:event",
         "//xla/stream_executor:fft",
         "//xla/stream_executor:plugin_registry",
         "//xla/stream_executor:scratch_allocator",
@@ -356,7 +355,6 @@ cc_library(
         "@local_tsl//tsl/cuda:cudnn_version",
         "@local_tsl//tsl/platform:tensor_float_32_utils",
         "//xla/stream_executor:dnn",
-        "//xla/stream_executor:event",
         "//xla/stream_executor:plugin_registry",
         "//xla/stream_executor:scratch_allocator",
         "//xla/stream_executor:stream_executor_headers",
@@ -379,7 +377,6 @@ cc_library(
     deps = if_cuda_is_configured([
         ":cuda_driver",
         "@local_config_cuda//cuda:cuda_headers",
-        "//xla/stream_executor:event",
         "//xla/stream_executor:stream_executor_headers",
         "//xla/stream_executor/gpu:gpu_kernel_header",
         "//xla/stream_executor/platform",
@@ -456,10 +453,9 @@ cc_library(
         ":cuda_platform_id",
         ":cuda_stream",
         "@com_google_absl//absl/strings",
-        "//xla/stream_executor:event",
         "//xla/stream_executor:plugin_registry",
-        "//xla/stream_executor:stream_executor_internal",
         "//xla/stream_executor:stream_executor_headers",
+        "//xla/stream_executor:stream_executor_internal",
         "//xla/stream_executor/gpu:asm_compiler",
         "//xla/stream_executor/gpu:gpu_command_buffer",
         "//xla/stream_executor/gpu:gpu_executor_header",
@@ -538,8 +534,6 @@ xla_cc_test(
         ":cuda_gpu_executor",
         "//xla/stream_executor",
         "//xla/stream_executor:device_memory_allocator",
-        "//xla/stream_executor:event",
-        "//xla/stream_executor:kernel",
         "//xla/stream_executor/gpu:gpu_asm_opts",
         "//xla/stream_executor/gpu:redzone_allocator",
         "@local_tsl//tsl/lib/core:status_test_util",
diff --git a/third_party/xla/xla/stream_executor/gpu/BUILD b/third_party/xla/xla/stream_executor/gpu/BUILD
index 5d35ea3380f44b..0da2b1e3b3033b 100644
--- a/third_party/xla/xla/stream_executor/gpu/BUILD
+++ b/third_party/xla/xla/stream_executor/gpu/BUILD
@@ -135,7 +135,6 @@ cc_library(
     visibility = ["//visibility:public"],
     deps = [
         ":gpu_kernel_header",
-        "//xla/stream_executor:event",
         "//xla/stream_executor:platform",
         "//xla/stream_executor:stream_executor_headers",
         "//xla/stream_executor:stream_executor_internal",
@@ -200,7 +199,6 @@ cc_library(
     visibility = ["//visibility:public"],
     deps = [
         ":gpu_driver_header",
-        "//xla/stream_executor:event",
         "//xla/stream_executor:stream_executor_headers",
         "//xla/stream_executor:stream_executor_internal",
         "//xla/stream_executor/platform",
diff --git a/third_party/xla/xla/stream_executor/kernel.cc b/third_party/xla/xla/stream_executor/kernel.cc
index d09f25873c8555..586226612473a5 100644
--- a/third_party/xla/xla/stream_executor/kernel.cc
+++ b/third_party/xla/xla/stream_executor/kernel.cc
@@ -19,13 +19,14 @@ limitations under the License.
 
 #include "xla/stream_executor/kernel.h"
 
+#include <string>
+#include <utility>
+
 #include "absl/strings/string_view.h"
 #include "absl/strings/strip.h"
 #include "xla/stream_executor/platform.h"
-#include "xla/stream_executor/platform/port.h"
 #include "xla/stream_executor/stream_executor.h"
 #include "tsl/platform/demangle.h"
-#include "tsl/platform/logging.h"
 
 namespace stream_executor {
 
diff --git a/third_party/xla/xla/stream_executor/kernel.h b/third_party/xla/xla/stream_executor/kernel.h
index 8991fcaa8404bf..7ad6994d3cf225 100644
--- a/third_party/xla/xla/stream_executor/kernel.h
+++ b/third_party/xla/xla/stream_executor/kernel.h
@@ -70,19 +70,19 @@ limitations under the License.
 #define XLA_STREAM_EXECUTOR_KERNEL_H_
 
 #include <array>
+#include <cstdint>
 #include <cstring>
 #include <memory>
 #include <string>
 #include <tuple>
 #include <type_traits>
-#include <vector>
 
+#include "absl/log/check.h"
 #include "absl/strings/string_view.h"
 #include "absl/types/span.h"
 #include "xla/stream_executor/device_memory.h"
 #include "xla/stream_executor/kernel_cache_config.h"
 #include "xla/stream_executor/platform/port.h"
-#include "tsl/platform/logging.h"
 
 namespace stream_executor {
 
diff --git a/third_party/xla/xla/stream_executor/rocm/BUILD b/third_party/xla/xla/stream_executor/rocm/BUILD
index b9e5f20ef22af0..fe414f1c4d4255 100644
--- a/third_party/xla/xla/stream_executor/rocm/BUILD
+++ b/third_party/xla/xla/stream_executor/rocm/BUILD
@@ -108,11 +108,9 @@ cc_library(
         ":rocm_platform_id",
         "@com_google_absl//absl/functional:any_invocable",
         "@com_google_absl//absl/strings",
-        "//xla/stream_executor:kernel",
-        "//xla/stream_executor:event",
+        "//xla/stream_executor",
         "//xla/stream_executor:plugin_registry",
         "//xla/stream_executor:stream_executor_internal",
-        "//xla/stream_executor:stream_executor_pimpl_header",
         "//xla/stream_executor/gpu:gpu_activation_header",
         "//xla/stream_executor/gpu:gpu_event",
         "//xla/stream_executor/gpu:gpu_kernel_header",

From 8adeb5ffa55dd02df9ce20c35624998ac718b593 Mon Sep 17 00:00:00 2001
From: Shixin Li <shixinli@google.com>
Date: Tue, 26 Sep 2023 11:54:35 -0700
Subject: [PATCH 281/567] Add StreamExecutorUnloadedExecutable
 (de)serialization and `LoadSerializedExecutable` API.

PiperOrigin-RevId: 568607896
---
 third_party/xla/xla/pjrt/BUILD                | 13 +++++
 third_party/xla/xla/pjrt/gpu/BUILD            |  1 +
 .../xla/xla/pjrt/gpu/se_gpu_pjrt_client.cc    | 53 +++++++++++++++++++
 .../xla/xla/pjrt/gpu/se_gpu_pjrt_client.h     |  7 +++
 .../pjrt/gpu/se_gpu_pjrt_compiler_aot_test.cc | 33 ++++++++++++
 .../stream_executor_unloaded_executable.cc    | 21 ++++++--
 .../stream_executor_unloaded_executable.proto |  2 +-
 7 files changed, 124 insertions(+), 6 deletions(-)

diff --git a/third_party/xla/xla/pjrt/BUILD b/third_party/xla/xla/pjrt/BUILD
index b44e02ceba8125..9997ed8fc94e9a 100644
--- a/third_party/xla/xla/pjrt/BUILD
+++ b/third_party/xla/xla/pjrt/BUILD
@@ -376,6 +376,17 @@ cc_library(
     ],
 )
 
+tf_proto_library(
+    name = "stream_executor_unloaded_executable_proto",
+    srcs = ["stream_executor_unloaded_executable.proto"],
+    cc_api_version = 2,
+    protodeps = [
+        ":compile_options_proto",
+        "//xla:xla_proto",
+    ],
+    visibility = ["//visibility:public"],
+)
+
 cc_library(
     name = "stream_executor_unloaded_executable",
     srcs = ["stream_executor_unloaded_executable.cc"],
@@ -383,10 +394,12 @@ cc_library(
     visibility = ["//visibility:public"],
     deps = [
         ":pjrt_executable",
+        ":stream_executor_unloaded_executable_proto_cc",
         "//xla:statusor",
         "//xla/hlo/ir:hlo",
         "//xla/service:compiler",
         "@com_google_absl//absl/status",
+        "@local_tsl//tsl/platform:statusor",
     ],
 )
 
diff --git a/third_party/xla/xla/pjrt/gpu/BUILD b/third_party/xla/xla/pjrt/gpu/BUILD
index 3755a466731e9c..28addefa812c7a 100644
--- a/third_party/xla/xla/pjrt/gpu/BUILD
+++ b/third_party/xla/xla/pjrt/gpu/BUILD
@@ -55,6 +55,7 @@ cc_library(
         "//xla/pjrt:pjrt_executable",
         "//xla/pjrt:pjrt_stream_executor_client",
         "//xla/pjrt:stream_executor_unloaded_executable",
+        "//xla/pjrt:stream_executor_unloaded_executable_proto_cc",
         "//xla/pjrt:tracked_device_buffer",
         "//xla/pjrt:utils",
         "//xla/pjrt/distributed:client",
diff --git a/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_client.cc b/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_client.cc
index 6df22004640e1b..733942386f645c 100644
--- a/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_client.cc
+++ b/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_client.cc
@@ -62,13 +62,21 @@ limitations under the License.
 #ifdef GOOGLE_CUDA
 #include "third_party/gpus/cuda/include/cuda.h"
 #include "third_party/gpus/cuda/include/cuda_runtime_api.h"
+#include "xla/pjrt/compile_options.pb.h"
 #include "xla/pjrt/gpu/nccl_id_store.h"
+#include "xla/pjrt/stream_executor_unloaded_executable.pb.h"
+#include "xla/service/gpu/gpu_compiler.h"
 #include "xla/stream_executor/gpu/gpu_cudamallocasync_allocator.h"
+#include "xla/xla.pb.h"
 #endif  // GOOGLE_CUDA
 
 #ifdef TENSORFLOW_USE_ROCM
 #include "rocm/rocm_config.h"
+#include "xla/pjrt/compile_options.pb.h"  // NOLINT(build/include)
 #include "xla/pjrt/gpu/nccl_id_store.h"  // NOLINT(build/include)
+#include "xla/pjrt/stream_executor_unloaded_executable.pb.h"  // NOLINT(build/include)
+#include "xla/service/gpu/gpu_compiler.h"  // NOLINT(build/include)
+#include "xla/xla.pb.h"  // NOLINT(build/include)
 #endif  // TENSORFLOW_USE_ROCM
 
 #include "xla/client/client_library.h"
@@ -524,6 +532,51 @@ PjRtFuture<absl::Status> StreamExecutorGpuClient::CopyRawSubBufferToHost(
       });
 }
 
+namespace {
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
+// Rebuilds a StreamExecutorUnloadedExecutable from its serialized proto form.
+StatusOr<std::unique_ptr<StreamExecutorUnloadedExecutable>> FromProto(
+    const StreamExecutorUnloadedExecutableProto& proto) {
+  TF_ASSIGN_OR_RETURN(CompileOptions compile_options,
+                      CompileOptions::FromProto(proto.compile_options()));
+  std::vector<std::unique_ptr<xla::AotCompilationResult>> aot_results;
+  for (const auto& serialized_result : proto.executables()) {
+    TF_ASSIGN_OR_RETURN(
+        std::unique_ptr<xla::AotCompilationResult> aot_result,
+        gpu::GpuXlaRuntimeAotCompilationResult::FromString(serialized_result));
+    aot_results.push_back(std::move(aot_result));
+  }
+  return std::make_unique<StreamExecutorUnloadedExecutable>(
+      compile_options, std::move(aot_results), proto.num_replicas(),
+      proto.num_partitions(), proto.name());
+}
+#endif
+}  // namespace
+
+// Parses `serialized` as a StreamExecutorUnloadedExecutableProto and loads it.
+// NOTE: `options` and `load_options` are currently not consulted.
+StatusOr<std::unique_ptr<PjRtLoadedExecutable>>
+StreamExecutorGpuClient::LoadSerializedExecutable(
+    absl::string_view serialized, std::optional<CompileOptions> options,
+    const LoadOptions& load_options) {
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
+  // ParseFromArray takes an `int` length, so larger payloads are rejected.
+  if (serialized.size() > std::numeric_limits<int>::max()) {
+    return Internal("LoadSerializedExecutable: proto too large (>2GB)");
+  }
+  StreamExecutorUnloadedExecutableProto proto;
+  if (!proto.ParseFromArray(serialized.data(), serialized.size())) {
+    return Internal("LoadSerializedExecutable: proto deserialization failed");
+  }
+  TF_ASSIGN_OR_RETURN(auto se_executable, FromProto(proto));
+  // TODO(b/296466237): Unify the `Load` method.
+  return Load(std::move(se_executable));
+#else
+  return absl::InternalError(
+      "LoadSerializedExecutable only works with cuda or rocm.");
+#endif
+}
+
 std::vector<std::unique_ptr<PjRtStreamExecutorDevice>> BuildLocalDevices(
     std::map<int, std::unique_ptr<LocalDeviceState>> local_device_states,
     int node_id) {
diff --git a/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_client.h b/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_client.h
index 7b2f6da962daa7..f7b7e703beb2ce 100644
--- a/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_client.h
+++ b/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_client.h
@@ -24,9 +24,12 @@ limitations under the License.
 #include <utility>
 #include <vector>
 
+#include "absl/strings/string_view.h"
 #include "xla/pjrt/distributed/client.h"
 #include "xla/pjrt/gpu/gpu_helpers.h"
 #include "xla/pjrt/gpu/gpu_topology.h"
+#include "xla/pjrt/pjrt_client.h"
+#include "xla/pjrt/pjrt_executable.h"
 #include "xla/pjrt/pjrt_stream_executor_client.h"
 #include "xla/statusor.h"
 
@@ -200,6 +203,10 @@ class StreamExecutorGpuClient : public xla::PjRtStreamExecutorClient {
   StatusOr<std::unique_ptr<PjRtLoadedExecutable>> Load(
       std::unique_ptr<PjRtExecutable> executable);
 
+  StatusOr<std::unique_ptr<PjRtLoadedExecutable>> LoadSerializedExecutable(
+      absl::string_view serialized, std::optional<CompileOptions> options,
+      const LoadOptions& load_options) override;
+
  private:
   xla::StreamExecutorGpuTopologyDescription topology_;
 };
diff --git a/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_compiler_aot_test.cc b/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_compiler_aot_test.cc
index 91c063300117da..d3063d4b27be5b 100644
--- a/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_compiler_aot_test.cc
+++ b/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_compiler_aot_test.cc
@@ -14,6 +14,8 @@ limitations under the License.
 ==============================================================================*/
 
 #include <memory>
+#include <optional>
+#include <string>
 #include <utility>
 #include <vector>
 
@@ -163,5 +165,36 @@ TEST(StreamExecutorGpuCompilerTest, SuccessAotCompileXlaAndLoad) {
   ValidateResult(result);
 }
 
+TEST(StreamExecutorGpuCompilerTest, SuccessLoadFromSerializedExecutable) {
+  TF_ASSERT_OK_AND_ASSIGN(const gpu::GpuTargetConfig gpu_config,
+                          GetGpuTargetConfig());
+  CompileOptions options = xla::CompileOptions();
+  StreamExecutorGpuCompiler compiler(gpu_config);
+
+  TF_ASSERT_OK_AND_ASSIGN(
+      auto client, GetStreamExecutorGpuClient(true, /*allocator_config=*/{},
+                                              /*node_id=*/0));
+  auto se_client = absl::WrapUnique(
+      tensorflow::down_cast<StreamExecutorGpuClient*>(client.release()));
+
+  TF_ASSERT_OK_AND_ASSIGN(auto computation, GetXlaComputation(kProgram));
+  TF_ASSERT_OK_AND_ASSIGN(auto topology, se_client->GetTopologyDescription());
+  TF_ASSERT_OK_AND_ASSIGN(
+      auto executable,
+      compiler.Compile(options, computation, *topology, nullptr));
+
+  // Serialize the executable and load it.
+  TF_ASSERT_OK_AND_ASSIGN(std::string serialized_executable,
+                          executable->SerializeExecutable());
+  TF_ASSERT_OK_AND_ASSIGN(
+      auto loaded_executable,
+      se_client->LoadSerializedExecutable(serialized_executable, std::nullopt,
+                                          LoadOptions()));
+  // Run the reloaded executable and check its output.
+  TF_ASSERT_OK_AND_ASSIGN(
+      auto result, loaded_executable->Execute(/*argument_handles=*/{{}}, {}));
+  ValidateResult(result);
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/third_party/xla/xla/pjrt/stream_executor_unloaded_executable.cc b/third_party/xla/xla/pjrt/stream_executor_unloaded_executable.cc
index 460737315f3343..e2c15ad314d58b 100644
--- a/third_party/xla/xla/pjrt/stream_executor_unloaded_executable.cc
+++ b/third_party/xla/xla/pjrt/stream_executor_unloaded_executable.cc
@@ -15,17 +15,28 @@ limitations under the License.
 
 #include "xla/pjrt/stream_executor_unloaded_executable.h"
 
+#include <memory>
 #include <string>
 
-#include "absl/status/status.h"
+#include "xla/pjrt/stream_executor_unloaded_executable.pb.h"
+#include "xla/service/compiler.h"
 #include "xla/statusor.h"
+#include "tsl/platform/statusor.h"
 
 namespace xla {
-// TODO(b/296466237): Add serialization.
 StatusOr<std::string> StreamExecutorUnloadedExecutable::SerializeExecutable()
     const {
-  return absl::UnimplementedError(
-      "StreamExecutorUnloadedExecutable::SerializeExecutable() not "
-      "implemented");
+  // Scalar metadata first; the fallible pieces are filled in below.
+  StreamExecutorUnloadedExecutableProto serialized;
+  serialized.set_name(name_);
+  serialized.set_num_replicas(num_replicas_);
+  serialized.set_num_partitions(num_partitions_);
+  TF_ASSIGN_OR_RETURN(*serialized.mutable_compile_options(),
+                      compile_options_.ToProto());
+  for (const auto& aot_result : aot_executables_) {
+    TF_ASSIGN_OR_RETURN(*serialized.add_executables(),
+                        aot_result->SerializeAsString());
+  }
+  return serialized.SerializeAsString();
 }
 }  // namespace xla
diff --git a/third_party/xla/xla/pjrt/stream_executor_unloaded_executable.proto b/third_party/xla/xla/pjrt/stream_executor_unloaded_executable.proto
index 28b3e21204c766..aeade5edc7cb7b 100644
--- a/third_party/xla/xla/pjrt/stream_executor_unloaded_executable.proto
+++ b/third_party/xla/xla/pjrt/stream_executor_unloaded_executable.proto
@@ -19,7 +19,7 @@ package xla;
 
 import "xla/pjrt/compile_options.proto";
 
-message StreamExecutorGpuExecutableProto {
+message StreamExecutorUnloadedExecutableProto {
   CompileOptionsProto compile_options = 1;
   repeated bytes executables = 2;
   int32 num_replicas = 3;

From 15b882e5c31531f72eeeddf5f411516eec3df9e9 Mon Sep 17 00:00:00 2001
From: Hye Soo Yang <hyey@google.com>
Date: Tue, 26 Sep 2023 11:54:50 -0700
Subject: [PATCH 282/567] Add Sparse Core ops to MLIR.

PiperOrigin-RevId: 568607967
---
 .../compiler/mlir/tensorflow/ir/tf_ops.td     | 331 ++++++++++++++++++
 .../transforms/legalization_op_config.cc      |  14 +
 .../transforms/legalization_op_config_test.cc |   2 +-
 3 files changed, 346 insertions(+), 1 deletion(-)

diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.td b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.td
index 08d0a6b40559c3..0c160d5cef8d8d 100644
--- a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.td
+++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.td
@@ -2022,4 +2022,335 @@ then performs a GatherNd operation on it.
   TF_DerivedResultTypeAttr dtype = TF_DerivedResultTypeAttr<0>;
 }
 
+def TF_XlaSparseCoreAdagradOp : TF_Op<"XlaSparseCoreAdagrad", []> {
+  let summary = "Applies an Adagrad update to a SparseCore embedding table.";
+
+  let arguments = (ins
+    TF_Int32Tensor:$indices,
+    TF_Float32Tensor:$gradient,
+    TF_Float32Tensor:$learning_rate,
+    TF_Float32Tensor:$accumulator,
+    TF_Float32Tensor:$embedding_table,
+
+    I64Attr:$feature_width
+  );
+
+  let results = (outs
+    TF_Float32Tensor:$updated_embedding_table,
+    TF_Float32Tensor:$updated_accumulator
+  );
+}
+
+def TF_XlaSparseCoreAdagradMomentumOp : TF_Op<"XlaSparseCoreAdagradMomentum", []> {
+  let summary = "Applies an Adagrad-with-momentum update to a SparseCore embedding table.";
+
+  let arguments = (ins
+    TF_Int32Tensor:$indices,
+    TF_Float32Tensor:$gradient,
+    TF_Float32Tensor:$learning_rate,
+    TF_Float32Tensor:$beta_1,
+    TF_Float32Tensor:$epsilon,
+    TF_Float32Tensor:$accumulator,
+    TF_Float32Tensor:$momentum,
+    TF_Float32Tensor:$embedding_table,
+
+    I64Attr:$feature_width,
+    BoolAttr:$use_nesterov,
+    F32Attr:$beta_2,
+    F32Attr:$exponent
+  );
+
+  let results = (outs
+    TF_Float32Tensor:$updated_embedding_table,
+    TF_Float32Tensor:$updated_accumulator,
+    TF_Float32Tensor:$updated_momentum
+  );
+}
+
+def TF_XlaSparseCoreAdamOp : TF_Op<"XlaSparseCoreAdam", []> {
+  let summary = "Applies an Adam update to a SparseCore embedding table.";
+
+  let arguments = (ins
+    TF_Float32Tensor:$embedding_table,
+    TF_Int32Tensor:$indices,
+    TF_Float32Tensor:$gradient,
+    TF_Float32Tensor:$learning_rate,
+    TF_Float32Tensor:$momentum,
+    TF_Float32Tensor:$velocity,
+    TF_Float32Tensor:$beta_1,
+    TF_Float32Tensor:$beta_2,
+    TF_Float32Tensor:$epsilon,
+
+    I64Attr:$feature_width,
+    BoolAttr:$use_sum_inside_sqrt
+  );
+
+  let results = (outs
+    TF_Float32Tensor:$updated_embedding_table,
+    TF_Float32Tensor:$updated_velocity,
+    TF_Float32Tensor:$updated_momentum
+  );
+}
+
+def TF_XlaSparseCoreFtrlOp : TF_Op<"XlaSparseCoreFtrl", []> {
+  let summary = "Applies an FTRL update to a SparseCore embedding table.";
+
+  let arguments = (ins
+    TF_Float32Tensor:$embedding_table,
+    TF_Float32Tensor:$accumulator,
+    TF_Float32Tensor:$linear,
+    TF_Float32Tensor:$learning_rate,
+    TF_Int32Tensor:$indices,
+    TF_Float32Tensor:$gradient,
+    TF_Float32Tensor:$beta,
+    TF_Float32Tensor:$learning_rate_power,
+    TF_Float32Tensor:$l2_regularization_strength,
+
+    I64Attr:$feature_width,
+    BoolAttr:$multiply_linear_by_learning_rate,
+    F32Attr:$l1_regularization_strength
+  );
+
+  let results = (outs
+    TF_Float32Tensor:$updated_embedding_table,
+    TF_Float32Tensor:$updated_accumulator,
+    TF_Float32Tensor:$updated_linear
+  );
+}
+
+def TF_XlaSparseCoreSgdOp : TF_Op<"XlaSparseCoreSgd", []> {
+  let summary = "Applies an SGD update to a SparseCore embedding table.";
+
+  let arguments = (ins
+    TF_Int32Tensor:$indices,
+    TF_Float32Tensor:$gradient,
+    TF_Float32Tensor:$learning_rate,
+    TF_Float32Tensor:$embedding_table,
+
+    I64Attr:$feature_width
+  );
+
+  let results = (outs
+    TF_Float32Tensor:$updated_embedding_table
+  );
+}
+
+def TF_XlaSparseDenseMatmulWithCsrInputOp : TF_Op<"XlaSparseDenseMatmulWithCsrInput", [Pure]> {
+  let summary = "Multiplies CSR-format sparse input by a dense embedding table.";
+
+  let arguments = (ins
+    TF_Int32Tensor:$row_pointers,
+    TF_Int32Tensor:$sorted_sample_ids,
+    TF_Int32Tensor:$sorted_token_ids,
+    TF_Float32Tensor:$sorted_gains,
+    TF_Float32Tensor:$embedding_table,
+    TF_Int32Tensor:$num_minibatches_per_physical_sparse_core,
+
+    ConfinedAttr<I64Attr, [IntMinValue<0>]>:$input_size,
+    OptionalAttr<F32Attr>:$quantization_config_low,
+    OptionalAttr<F32Attr>:$quantization_config_high,
+    OptionalAttr<I64Attr>:$quantization_config_num_buckets,
+    StrAttr:$table_name
+  );
+
+  let results = (outs
+    TF_Float32Tensor:$activations
+  );
+}
+
+def TF_XlaSparseDenseMatmulGradWithSgdAndCsrInputOp : TF_Op<"XlaSparseDenseMatmulGradWithSgdAndCsrInput", [Pure]> {
+  let summary = "Backward step of the CSR-input sparse-dense matmul that applies an SGD update to the embedding table.";
+
+  let arguments = (ins
+    TF_Int32Tensor:$row_pointers,
+    TF_Int32Tensor:$sorted_sample_ids,
+    TF_Int32Tensor:$sorted_token_ids,
+    TF_Float32Tensor:$sorted_gains,
+    TF_Float32Tensor:$activation_gradients,
+    TF_Float32Tensor:$learning_rate,
+    TF_Float32Tensor:$embedding_table,
+    TF_Int32Tensor:$num_minibatches_per_physical_sparse_core,
+
+    StrAttr:$table_name
+  );
+
+  let results = (outs
+    TF_Float32Tensor:$updated_embedding_table
+  );
+}
+
+def XlaSparseDenseMatmulGradWithAdagradAndCsrInputOp : TF_Op<"XlaSparseDenseMatmulGradWithAdagradAndCsrInput", [Pure]> {
+  let summary = "Backward step of the CSR-input sparse-dense matmul that applies an Adagrad update to the embedding table.";
+
+  let arguments = (ins
+    TF_Int32Tensor:$row_pointers,
+    TF_Int32Tensor:$sorted_sample_ids,
+    TF_Int32Tensor:$sorted_token_ids,
+    TF_Float32Tensor:$sorted_gains,
+    TF_Float32Tensor:$activation_gradients,
+    TF_Float32Tensor:$learning_rate,
+    TF_Float32Tensor:$embedding_table,
+    TF_Float32Tensor:$accumulator,
+    TF_Int32Tensor:$num_minibatches_per_physical_sparse_core,
+
+    StrAttr:$table_name
+  );
+
+  let results = (outs
+    TF_Float32Tensor:$updated_embedding_table,
+    TF_Float32Tensor:$updated_accumulator
+  );
+}
+
+def XlaSparseDenseMatmulGradWithAdagradMomentumAndCsrInputOp : TF_Op<"XlaSparseDenseMatmulGradWithAdagradMomentumAndCsrInput", [Pure]> {
+  let summary = "Backward step of the CSR-input sparse-dense matmul that applies an Adagrad-with-momentum update to the embedding table.";
+
+  let arguments = (ins
+    TF_Int32Tensor:$row_pointers,
+    TF_Int32Tensor:$sorted_sample_ids,
+    TF_Int32Tensor:$sorted_token_ids,
+    TF_Float32Tensor:$sorted_gains,
+    TF_Float32Tensor:$activation_gradients,
+    TF_Float32Tensor:$learning_rate,
+    TF_Float32Tensor:$embedding_table,
+    TF_Float32Tensor:$accumulator,
+    TF_Float32Tensor:$momenta,
+    TF_Int32Tensor:$num_minibatches_per_physical_sparse_core,
+
+    BoolAttr:$use_nesterov,
+    F32Attr:$exponent,
+    F32Attr:$beta1,
+    F32Attr:$beta2,
+    F32Attr:$epsilon,
+    StrAttr:$table_name
+  );
+
+  let results = (outs
+    TF_Float32Tensor:$updated_embedding_table,
+    TF_Float32Tensor:$updated_accumulator,
+    TF_Float32Tensor:$updated_momenta
+  );
+}
+
+def XlaSparseDenseMatmulGradWithAdamAndCsrInputOp : TF_Op<"XlaSparseDenseMatmulGradWithAdamAndCsrInput", [Pure]> {
+  let summary = "Backward step of the CSR-input sparse-dense matmul that applies an Adam update to the embedding table.";
+
+  let arguments = (ins
+    TF_Int32Tensor:$row_pointers,
+    TF_Int32Tensor:$sorted_sample_ids,
+    TF_Int32Tensor:$sorted_token_ids,
+    TF_Float32Tensor:$sorted_gains,
+    TF_Float32Tensor:$activation_gradients,
+    TF_Float32Tensor:$learning_rate,
+    TF_Float32Tensor:$embedding_table,
+    TF_Float32Tensor:$momenta,
+    TF_Float32Tensor:$velocity,
+    TF_Int32Tensor:$num_minibatches_per_physical_sparse_core,
+
+    BoolAttr:$use_sum_inside_sqrt,
+    F32Attr:$beta1,
+    F32Attr:$beta2,
+    F32Attr:$epsilon,
+    StrAttr:$table_name
+  );
+
+  let results = (outs
+    TF_Float32Tensor:$updated_embedding_table,
+    TF_Float32Tensor:$updated_momenta,
+    TF_Float32Tensor:$updated_velocity
+  );
+}
+
+def XlaSparseDenseMatmulGradWithFtrlAndCsrInputOp : TF_Op<"XlaSparseDenseMatmulGradWithFtrlAndCsrInput", [Pure]> {
+  let summary = "";
+
+  let arguments = (ins
+    TF_Int32Tensor:$row_pointers,
+    TF_Int32Tensor:$sorted_sample_ids,
+    TF_Int32Tensor:$sorted_token_ids,
+    TF_Float32Tensor:$sorted_gains,
+    TF_Float32Tensor:$activation_gradients,
+    TF_Float32Tensor:$learning_rate,
+    TF_Float32Tensor:$embedding_table,
+    TF_Float32Tensor:$accumulator,
+    TF_Float32Tensor:$linear,
+    TF_Int32Tensor:$num_minibatches_per_physical_sparse_core,
+
+    BoolAttr:$multiply_linear_by_learning_rate,
+    F32Attr:$beta,
+    F32Attr:$learning_rate_power,
+    F32Attr:$l1_regularization_strength,
+    F32Attr:$l2_regularization_strength,
+    StrAttr:$table_name
+  );
+
+  let results = (outs
+    TF_Float32Tensor:$updated_embedding_table,
+    TF_Float32Tensor:$updated_accumulator,
+    TF_Float32Tensor:$updated_linear
+  );
+}
+
+def TF_GetMinibatchesInCsrWithPhysicalReplicaOp : TF_Op<"GetMinibatchesInCsrWithPhysicalReplica", [Pure]> {
+  let summary = "";
+
+  let arguments = (ins
+    TF_StrTensor:$program_key,
+    TF_Int32Tensor:$row_ids,
+    TF_Int32Tensor:$col_ids,
+    TF_Float32Tensor:$gains,
+    TF_Int64Tensor:$splits,
+    TF_Int32Tensor:$id_counts,
+
+    ConfinedAttr<I64Attr, [IntMinValue<1>]>:$sample_count,
+    ConfinedAttr<I64Attr, [IntMinValue<1>]>:$num_replica,
+    ConfinedAttr<I64Attr, [IntMinValue<1>]>:$max_minibatches_per_sc,
+    ConfinedAttr<I64Attr, [IntMinValue<1>]>:$max_ids_per_chip_per_sample,
+    ConfinedAttr<I64Attr, [IntMinValue<1>]>:$table_vocab_size,
+    ConfinedAttr<I64Attr, [IntMinValue<1>]>:$feature_width,
+    ConfinedAttr<I64Attr, [IntMinValue<1>]>:$num_sc_per_chip,
+
+    StrAttr:$table_name,
+    StrAttr:$mini_batch_in_csr
+  );
+
+  let results = (outs
+    TF_Int32Tensor:$row_pointers,
+    TF_Int32Tensor:$sorted_sample_ids,
+    TF_Int32Tensor:$sorted_token_ids,
+    TF_Float32Tensor:$sorted_gains,
+    TF_Int32Tensor:$row_pointers_unpadded_size,
+    TF_Int32Tensor:$ids_unpadded_size,
+    TF_Int32Tensor:$num_minibatches_per_physical_sparse_core
+  );
+}
+
+def TF_GetMinibatchSplitsWithPhysicalReplicaOp : TF_Op<"GetMinibatchSplitsWithPhysicalReplica", [Pure]> {
+  let summary = "";
+
+  let arguments = (ins
+    TF_StrTensor:$program_key,
+    TF_Int32Tensor:$row_ids,
+    TF_Int32Tensor:$col_ids,
+    TF_Float32Tensor:$gains,
+
+    ConfinedAttr<I64Attr, [IntMinValue<1>]>:$sample_count,
+    ConfinedAttr<I64Attr, [IntMinValue<1>]>:$num_replica,
+    ConfinedAttr<I64Attr, [IntMinValue<1>]>:$table_vocab_size,
+    ConfinedAttr<I64Attr, [IntMinValue<1>]>:$feature_width,
+    ConfinedAttr<I64Attr, [IntMinValue<1>]>:$num_sc_per_chip,
+
+    StrAttr:$table_name,
+    StrAttr:$mini_batch_splits
+  );
+
+  let results = (outs
+    TF_Int32Tensor:$sorted_row_ids,
+    TF_Int32Tensor:$sorted_col_ids,
+    TF_Float32Tensor:$sorted_gains,
+    TF_Int64Tensor:$splits,
+    TF_Int32Tensor:$id_counts
+  );
+}
+
 #endif // TF_OPS
diff --git a/tensorflow/compiler/mlir/tf2xla/transforms/legalization_op_config.cc b/tensorflow/compiler/mlir/tf2xla/transforms/legalization_op_config.cc
index 4989af91d45c76..f4889113d1b6c4 100644
--- a/tensorflow/compiler/mlir/tf2xla/transforms/legalization_op_config.cc
+++ b/tensorflow/compiler/mlir/tf2xla/transforms/legalization_op_config.cc
@@ -205,6 +205,8 @@ bool IsOpTypeAllowedTf2XlaFallback(const TypeID& type_id) {
             TypeID::get<TF::FakeQuantWithMinMaxVarsPerChannelGradientOp>(),
             TypeID::get<TF::FloorDivOp>(),
             TypeID::get<TF::FloorModOp>(),
+            TypeID::get<TF::GetMinibatchesInCsrWithPhysicalReplicaOp>(),
+            TypeID::get<TF::GetMinibatchSplitsWithPhysicalReplicaOp>(),
             TypeID::get<TF::GreaterOp>(),
             TypeID::get<TF::HSVToRGBOp>(),
             TypeID::get<TF::IFFT2DOp>(),
@@ -341,6 +343,18 @@ bool IsOpTypeAllowedTf2XlaFallback(const TypeID& type_id) {
             TypeID::get<TF::XlaPadOp>(),
             TypeID::get<TF::XlaSetBoundOp>(),
             TypeID::get<TF::XlaSetDynamicDimensionSizeOp>(),
+            TypeID::get<TF::XlaSparseCoreAdagradMomentumOp>(),
+            TypeID::get<TF::XlaSparseCoreAdagradOp>(),
+            TypeID::get<TF::XlaSparseCoreAdamOp>(),
+            TypeID::get<TF::XlaSparseCoreFtrlOp>(),
+            TypeID::get<TF::XlaSparseCoreSgdOp>(),
+            TypeID::get<TF::XlaSparseDenseMatmulGradWithAdagradAndCsrInputOp>(),
+            TypeID::get<
+                TF::XlaSparseDenseMatmulGradWithAdagradMomentumAndCsrInputOp>(),
+            TypeID::get<TF::XlaSparseDenseMatmulGradWithAdamAndCsrInputOp>(),
+            TypeID::get<TF::XlaSparseDenseMatmulGradWithFtrlAndCsrInputOp>(),
+            TypeID::get<TF::XlaSparseDenseMatmulGradWithSgdAndCsrInputOp>(),
+            TypeID::get<TF::XlaSparseDenseMatmulWithCsrInputOp>(),
             TypeID::get<TF::XlaSpmdFullToShardShapeOp>(),
             TypeID::get<TF::XlaSpmdShardToFullShapeOp>(),
             TypeID::get<TF::XlaSvdOp>(),
diff --git a/tensorflow/compiler/mlir/tf2xla/transforms/legalization_op_config_test.cc b/tensorflow/compiler/mlir/tf2xla/transforms/legalization_op_config_test.cc
index 00195b8a722ac9..7453cdb042e515 100644
--- a/tensorflow/compiler/mlir/tf2xla/transforms/legalization_op_config_test.cc
+++ b/tensorflow/compiler/mlir/tf2xla/transforms/legalization_op_config_test.cc
@@ -130,7 +130,7 @@ TEST_F(LegalizationOpConfigTest, CountLoweringsSet) {
   // from MLIR to TF2XLA), these numbers should change. Or if TF Dialect adds
   // a new op, we should expect these to change too.
   EXPECT_EQ(mlir_lowering_count, 68);
-  EXPECT_EQ(tf2xla_fallback_count, 300);
+  EXPECT_EQ(tf2xla_fallback_count, 313);
   EXPECT_EQ(non_categorized_count, 419);
 }
 

From 45cc6dd29bab7d8c79864097cc64fd9eb3126def Mon Sep 17 00:00:00 2001
From: Eugene Zhulenev <ezhulenev@google.com>
Date: Tue, 26 Sep 2023 12:56:45 -0700
Subject: [PATCH 283/567] [stream_executor:cuda] Add a minimal test for
 launching CUDA kernels

PiperOrigin-RevId: 568624489
---
 .../xla/xla/stream_executor/cuda/BUILD        |  15 +++
 .../stream_executor/cuda/cuda_kernel_test.cc  | 119 ++++++++++++++++++
 2 files changed, 134 insertions(+)
 create mode 100644 third_party/xla/xla/stream_executor/cuda/cuda_kernel_test.cc

diff --git a/third_party/xla/xla/stream_executor/cuda/BUILD b/third_party/xla/xla/stream_executor/cuda/BUILD
index 01f687965afbe1..fd617c9c9266a8 100644
--- a/third_party/xla/xla/stream_executor/cuda/BUILD
+++ b/third_party/xla/xla/stream_executor/cuda/BUILD
@@ -1,6 +1,7 @@
 # Description:
 #   CUDA-platform specific StreamExecutor support code.
 
+load("//xla/tests:build_defs.bzl", "xla_test")
 load(
     "//xla:xla.bzl",
     "xla_cc_test",
@@ -383,6 +384,20 @@ cc_library(
     ]),
 )
 
+xla_test(
+    name = "cuda_kernel_test",
+    srcs = ["cuda_kernel_test.cc"],
+    backends = ["gpu"],
+    deps = [
+        ":cuda_kernel",
+        "//xla/stream_executor",
+        "//xla/stream_executor:multi_platform_manager",
+        "//xla/stream_executor:platform",
+        "@local_tsl//tsl/platform:test",
+        "@local_tsl//tsl/platform:test_main",
+    ],
+)
+
 # TODO(leary) we likely need to canonicalize/eliminate this.
 cc_library(
     name = "cuda_helpers",
diff --git a/third_party/xla/xla/stream_executor/cuda/cuda_kernel_test.cc b/third_party/xla/xla/stream_executor/cuda/cuda_kernel_test.cc
new file mode 100644
index 00000000000000..9b7f99ed8f34b0
--- /dev/null
+++ b/third_party/xla/xla/stream_executor/cuda/cuda_kernel_test.cc
@@ -0,0 +1,119 @@
+/* Copyright 2023 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <cstdint>
+#include <string_view>
+#include <vector>
+
+#include <gtest/gtest.h>
+#include "xla/stream_executor/launch_dim.h"
+#include "xla/stream_executor/multi_platform_manager.h"
+#include "xla/stream_executor/platform.h"
+#include "xla/stream_executor/stream.h"
+#include "xla/stream_executor/stream_executor.h"
+#include "tsl/platform/test.h"
+
+namespace stream_executor::cuda {
+
+// PTX kernel compiled from:
+//
+//  __global__ void add(int* a, int* b, int* c) {
+//    int index = threadIdx.x + blockIdx.x * blockDim.x;
+//    c[index] = a[index] + b[index];
+//  }
+//
+// Easiest way to get PTX from C++ is to use https://godbolt.org.
+static std::string_view kAddI32Kernel = R"(
+.version 8.2
+.target sm_50
+.address_size 64
+
+.visible .entry add(
+        .param .u64 add_param_0,
+        .param .u64 add_param_1,
+        .param .u64 add_param_2
+)
+{
+        .reg .b32       %r<8>;
+        .reg .b64       %rd<11>;
+        .loc    1 1 0
+
+        ld.param.u64    %rd1, [add_param_0];
+        ld.param.u64    %rd2, [add_param_1];
+        ld.param.u64    %rd3, [add_param_2];
+        .loc    1 3 3
+        cvta.to.global.u64      %rd4, %rd3;
+        cvta.to.global.u64      %rd5, %rd2;
+        cvta.to.global.u64      %rd6, %rd1;
+        mov.u32         %r1, %tid.x;
+        mov.u32         %r2, %ctaid.x;
+        mov.u32         %r3, %ntid.x;
+        mad.lo.s32      %r4, %r2, %r3, %r1;
+        .loc    1 4 3
+        mul.wide.s32    %rd7, %r4, 4;
+        add.s64         %rd8, %rd6, %rd7;
+        ld.global.u32   %r5, [%rd8];
+        add.s64         %rd9, %rd5, %rd7;
+        ld.global.u32   %r6, [%rd9];
+        add.s32         %r7, %r6, %r5;
+        add.s64         %rd10, %rd4, %rd7;
+        st.global.u32   [%rd10], %r7;
+        .loc    1 5 1
+        ret;
+
+})";
+
+// Launches the PTX `add` kernel over 4 int32 elements and checks c = a + b.
+TEST(CudaKernelTest, Add) {
+  using AddI32Kernel = TypedKernel<DeviceMemory<int32_t>, DeviceMemory<int32_t>,
+                                   DeviceMemory<int32_t>>;
+  Platform* platform = MultiPlatformManager::PlatformWithName("CUDA").value();
+  StreamExecutor* executor = platform->ExecutorForDevice(0).value();
+
+  Stream stream(executor);
+  stream.Init();
+  ASSERT_TRUE(stream.ok());
+
+  MultiKernelLoaderSpec spec(/*arity=*/3);
+  spec.AddCudaPtxInMemory(kAddI32Kernel, "add");
+  AddI32Kernel add_kernel(executor);
+  ASSERT_TRUE(executor->GetKernel(spec, &add_kernel).ok());
+
+  int64_t length = 4;
+  int64_t byte_length = sizeof(int32_t) * length;
+
+  // Prepare arguments: a=1, b=2, c=0
+  DeviceMemory<int32_t> a = executor->AllocateArray<int32_t>(length, 0);
+  DeviceMemory<int32_t> b = executor->AllocateArray<int32_t>(length, 0);
+  DeviceMemory<int32_t> c = executor->AllocateArray<int32_t>(length, 0);
+
+  stream.ThenMemset32(&a, 1, byte_length);
+  stream.ThenMemset32(&b, 2, byte_length);
+  stream.ThenMemZero(&c, byte_length);
+
+  // Launch kernel.
+  auto st = stream.ThenLaunch(ThreadDim(), BlockDim(4), add_kernel, a, b, c);
+  ASSERT_TRUE(st.ok());
+
+  // Copy data back to host and wait for the stream before reading `dst`.
+  std::vector<int32_t> dst(4, 42);
+  stream.ThenMemcpy(dst.data(), c, byte_length);
+  ASSERT_TRUE(stream.BlockHostUntilDone().ok());
+
+  std::vector<int32_t> expected = {3, 3, 3, 3};
+  ASSERT_EQ(dst, expected);
+}
+
+}  // namespace stream_executor::cuda

From a11f8e7c00b6c7f4c02070cb73afe5213aeb52bc Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 26 Sep 2023 13:11:49 -0700
Subject: [PATCH 284/567] Do not pad `fp16` grouped convolutions.

In such a case, padding may break the invariant `input_features = feature_group_size * kernel_in_features`.

https://github.com/tensorflow/tensorflow/issues/55633

PiperOrigin-RevId: 568628624
---
 .../service/gpu/cudnn_pad_for_convolutions.cc  |  7 +++++++
 .../gpu/cudnn_pad_for_convolutions_test.cc     | 18 ++++++++++++++++++
 2 files changed, 25 insertions(+)

diff --git a/third_party/xla/xla/service/gpu/cudnn_pad_for_convolutions.cc b/third_party/xla/xla/service/gpu/cudnn_pad_for_convolutions.cc
index d1855e970892fd..71a375e920c6be 100644
--- a/third_party/xla/xla/service/gpu/cudnn_pad_for_convolutions.cc
+++ b/third_party/xla/xla/service/gpu/cudnn_pad_for_convolutions.cc
@@ -191,6 +191,13 @@ static StatusOr<bool> TryResolvePaddedShapesForTensorCore(
     return false;
   }
 
+  // When convolution is grouped, the shapes are in agreement with the group
+  // size. We cannot pad them independently.
+  if (conv->feature_group_count() > 1 || conv->batch_group_count() > 1) {
+    VLOG(2) << "Do not pad grouped convolution.";
+    return false;
+  }
+
   // TODO(timshen): Don't skip forward-activation convs if we find a benchmark
   // where there's a speedup.
   if (kind == CudnnConvKind::kForwardActivation) {
diff --git a/third_party/xla/xla/service/gpu/cudnn_pad_for_convolutions_test.cc b/third_party/xla/xla/service/gpu/cudnn_pad_for_convolutions_test.cc
index 7ab91c7206a28d..d81d98a5987806 100644
--- a/third_party/xla/xla/service/gpu/cudnn_pad_for_convolutions_test.cc
+++ b/third_party/xla/xla/service/gpu/cudnn_pad_for_convolutions_test.cc
@@ -31,6 +31,24 @@ namespace m = xla::match;
 
 class CudnnPadForConvolutionsTest : public HloTestBase {};
 
+TEST_F(CudnnPadForConvolutionsTest, DoNotPadF16ForwardConvWhenGrouped) {
+  auto module = ParseAndReturnVerifiedModule(R"(
+  HloModule TestModule
+
+  ENTRY TestComputation {
+    input = f16[704,48,1,49]{3,2,1,0} parameter(0)
+    filter = f16[44,768,1,50]{3,2,1,0} parameter(1)
+    ROOT result = (f16[1,128,48,768]{3,2,1,0}, u8[0]{0})
+      custom-call(input, filter)
+      , window={size=1x50 pad=0_0x64_64}
+      , dim_labels=fb01_io01->01bf
+      , feature_group_count=16
+      , custom_call_target="__cudnn$convForward"
+  })")
+                    .value();
+  EXPECT_FALSE(CudnnPadForConvolutions({7, 5}).Run(module.get()).value());
+}
+
 TEST_F(CudnnPadForConvolutionsTest, PadF16ForwardConvInputChannels) {
   auto module = ParseAndReturnVerifiedModule(R"(
   HloModule TestModule

From ef6dd0af646037971fe0d7db8f353acaa1f66187 Mon Sep 17 00:00:00 2001
From: Emilio Cota <ecg@google.com>
Date: Tue, 26 Sep 2023 13:51:01 -0700
Subject: [PATCH 285/567] [tsl] platform/BUILD: sort exports_files() and remove
 duplicates

PiperOrigin-RevId: 568639296
---
 .../xla/third_party/tsl/tsl/platform/BUILD    | 19 ++++++++-----------
 1 file changed, 8 insertions(+), 11 deletions(-)

diff --git a/third_party/xla/third_party/tsl/tsl/platform/BUILD b/third_party/xla/third_party/tsl/tsl/platform/BUILD
index 22212210dccb6a..3708a5f4cec794 100644
--- a/third_party/xla/third_party/tsl/tsl/platform/BUILD
+++ b/third_party/xla/third_party/tsl/tsl/platform/BUILD
@@ -674,12 +674,10 @@ filegroup(
 
 exports_files(
     [
-        "context.h",
-        "host_info.h",
-        "human_readable_json.h",
         "bfloat16.h",
-        "cpu_info.h",
+        "context.h",
         "cpu_info.cc",
+        "cpu_info.h",
         "crash_analysis.h",
         "criticality.h",
         "cuda_libdevice_path.h",
@@ -692,26 +690,25 @@ exports_files(
         "file_system.h",
         "file_system_helper.cc",
         "file_system_helper.h",
+        "host_info.h",
+        "human_readable_json.h",
         "init_main.h",
         "logging.h",
         "mem.h",
         "net.h",
         "numa.h",
         "ram_file_system.h",
-        "resource_loader.h",
         "resource.h",
+        "resource_loader.h",
+        "rocm_rocdl_path.h",
         "snappy.h",
         "stacktrace_handler.h",
-        "cuda_libdevice_path.h",
-        "test.h",
-        "bfloat16.h",
-        "logging.h",
-        "rocm_rocdl_path.h",
         "subprocess.h",
+        "test.h",
         "threadpool.cc",
         "threadpool.h",
-        "tracing.h",
         "tracing.cc",
+        "tracing.h",
     ],
     visibility = ["//visibility:public"],
 )

From f7ba15ae0bb6d69d9eb415a2e9e05c244e627f5f Mon Sep 17 00:00:00 2001
From: Matt Callanan <mpcallanan@google.com>
Date: Tue, 26 Sep 2023 13:51:16 -0700
Subject: [PATCH 286/567] #tf-data Turn down `"inject_io_prefetch"` experiment.

PiperOrigin-RevId: 568639355
---
 tensorflow/core/data/dataset_utils.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/core/data/dataset_utils.cc b/tensorflow/core/data/dataset_utils.cc
index b03527344e3d18..b353dc5db3b7f7 100644
--- a/tensorflow/core/data/dataset_utils.cc
+++ b/tensorflow/core/data/dataset_utils.cc
@@ -990,7 +990,7 @@ REGISTER_DATASET_EXPERIMENT("file_locality_v2", RandomJobSamplePercentage<0>,
                             AllTasks);
 REGISTER_DATASET_EXPERIMENT("no_compression", RandomJobSamplePercentage<50>,
                             AllTasks);
-REGISTER_DATASET_EXPERIMENT("inject_io_prefetch", RandomJobSamplePercentage<50>,
+REGISTER_DATASET_EXPERIMENT("inject_io_prefetch", RandomJobSamplePercentage<0>,
                             AllTasks);
 }  // namespace
 }  // namespace data

From 2d2b6147010b130f025df772b580225b4d8a243b Mon Sep 17 00:00:00 2001
From: Katherine Wu <kathywu@google.com>
Date: Tue, 26 Sep 2023 14:08:38 -0700
Subject: [PATCH 287/567] Only set `_arg_keywords` when concrete function is
 created.

(previously, it would set the arg_keywords whenever the function is accessed)

PiperOrigin-RevId: 568644270
---
 .../python/eager/polymorphic_function/tracing_compilation.py   | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/tensorflow/python/eager/polymorphic_function/tracing_compilation.py b/tensorflow/python/eager/polymorphic_function/tracing_compilation.py
index 83063733347ae4..7cf1642bfa7bb1 100644
--- a/tensorflow/python/eager/polymorphic_function/tracing_compilation.py
+++ b/tensorflow/python/eager/polymorphic_function/tracing_compilation.py
@@ -178,7 +178,6 @@ def trace_function(args=None, kwargs=None, tracing_options=None):
     concrete_function = _maybe_define_function(
         args, kwargs, tracing_options
     )
-    _set_arg_keywords(concrete_function)
 
   if not tracing_options.bind_graph_to_function:
     concrete_function._garbage_collector.release()  # pylint: disable=protected-access
@@ -348,7 +347,7 @@ def _create_concrete_function(
       # ConcreteFunction.
       shared_func_graph=False,
   )
-
+  _set_arg_keywords(concrete_function)
   transform.call_concrete_function_callbacks(concrete_function)
 
   return concrete_function

From 25290c3f153bc57fbc608b3abd73569a8e37565e Mon Sep 17 00:00:00 2001
From: Emilio Cota <ecg@google.com>
Date: Tue, 26 Sep 2023 14:26:50 -0700
Subject: [PATCH 288/567] [tsl] platform: use fully qualified namespaces where
 necessary

This paves the way for changes to the logging headers.

PiperOrigin-RevId: 568649175
---
 third_party/xla/third_party/tsl/tsl/platform/errors.h | 4 ++--
 third_party/xla/third_party/tsl/tsl/platform/status.h | 6 +++---
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/third_party/xla/third_party/tsl/tsl/platform/errors.h b/third_party/xla/third_party/tsl/tsl/platform/errors.h
index 6466b2a04060d9..9008dedad8270c 100644
--- a/third_party/xla/third_party/tsl/tsl/platform/errors.h
+++ b/third_party/xla/third_party/tsl/tsl/platform/errors.h
@@ -95,7 +95,7 @@ inline std::unordered_map<std::string, std::string> GetPayloads(
     const ::tsl::Status& status) {
   std::unordered_map<std::string, std::string> payloads;
   status.ForEachPayload(
-      [&payloads](tsl::StringPiece key, const absl::Cord& value) {
+      [&payloads](::tsl::StringPiece key, const absl::Cord& value) {
         payloads[std::string(key)] = std::string(value);
       });
   return payloads;
@@ -114,7 +114,7 @@ inline void InsertPayloads(
 // Copies all payloads from one Status to another. Will overwrite existing
 // payloads in the destination if they exist with the same key.
 inline void CopyPayloads(const ::tsl::Status& from, ::tsl::Status& to) {
-  from.ForEachPayload([&to](tsl::StringPiece key, const absl::Cord& value) {
+  from.ForEachPayload([&to](::tsl::StringPiece key, const absl::Cord& value) {
     to.SetPayload(key, value);
   });
 }
diff --git a/third_party/xla/third_party/tsl/tsl/platform/status.h b/third_party/xla/third_party/tsl/tsl/platform/status.h
index 6affca19b6b5bd..e4d342af9f2d17 100644
--- a/third_party/xla/third_party/tsl/tsl/platform/status.h
+++ b/third_party/xla/third_party/tsl/tsl/platform/status.h
@@ -179,10 +179,10 @@ class StatusGroup {
 
 typedef std::function<void(const Status&)> StatusCallback;
 
-extern tsl::string* TfCheckOpHelperOutOfLine(const ::tsl::Status& v,
-                                             const char* msg);
+extern ::tsl::string* TfCheckOpHelperOutOfLine(const ::tsl::Status& v,
+                                               const char* msg);
 
-inline tsl::string* TfCheckOpHelper(::tsl::Status v, const char* msg) {
+inline ::tsl::string* TfCheckOpHelper(::tsl::Status v, const char* msg) {
   if (v.ok()) return nullptr;
   return TfCheckOpHelperOutOfLine(v, msg);
 }

From 8d745464c2ad6d9a1f0198586e07a12ef82c468f Mon Sep 17 00:00:00 2001
From: Justin Lebar <jlebar@google.com>
Date: Tue, 26 Sep 2023 14:51:10 -0700
Subject: [PATCH 289/567] Make DeviceExecutablePersistor safe under concurrent
 reads+writes.

If you have multiple XLA compilations going on simultaneously in the system
(same or different processes), you can hit a race condition where

 - one thread/process is reading an entry from the compilation cache at the
   same time as
 - another thread/process is writing that entry to the compilation cache.

Under these conditions, the reader may read an empty or otherwise incomplete
entry.

We fix this by writing to a unique temp location that won't be seen by the
reader.  Only once the write is complete do we move into the final resting
place.

If two writers race, there's also no problem.  They will both write to separate
temp files, and then the last one to rename the file "wins".  If a reader
starts reading after the first write and before the second one, that's also no
problem, at least on *nix, because the reader can continue reading the old file
after it's renamed.  (Hopefully Windows does something reasonable here too, but
in any case the behavior is still better than what we had before.)

PiperOrigin-RevId: 568655744
---
 tensorflow/compiler/jit/BUILD                 |  6 ++--
 .../jit/device_executable_persistor.h         | 30 +++++++++++++++++--
 2 files changed, 31 insertions(+), 5 deletions(-)

diff --git a/tensorflow/compiler/jit/BUILD b/tensorflow/compiler/jit/BUILD
index c319d6853ce7cd..3c9748984b1115 100644
--- a/tensorflow/compiler/jit/BUILD
+++ b/tensorflow/compiler/jit/BUILD
@@ -1,13 +1,13 @@
-load("//tensorflow/core/platform:rules_cc.bzl", "cc_library")
 load("//tensorflow:tensorflow.bzl", "if_libtpu", "if_with_tpu_support", "tf_cc_test", "tf_copts", "tf_cuda_cc_test", "tf_cuda_only_cc_test")
-load("@local_xla//xla/stream_executor:build_defs.bzl", "if_cuda_or_rocm")
 load("//tensorflow:tensorflow.default.bzl", "cc_header_only_library", "filegroup", "tf_custom_op_py_strict_library", "tf_jit_compilation_passes_extra_deps")
+load("@local_xla//xla/stream_executor:build_defs.bzl", "if_cuda_or_rocm")
 load("//tensorflow/core/platform:build_config.bzl", "tf_additional_all_protos", "tf_proto_library")
 load(
     "//tensorflow/core/platform:build_config_root.bzl",
     "if_static",
     "tf_cuda_tests_tags",
 )
+load("//tensorflow/core/platform:rules_cc.bzl", "cc_library")
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
@@ -1538,9 +1538,9 @@ cc_library(
         "//tensorflow/core/platform:path",
         "//tensorflow/core/platform:status",
         "//tensorflow/core/platform:statusor",
+        "@com_google_absl//absl/log",
         "@local_tsl//tsl/platform:statusor",
         "@local_xla//xla:util",
-        "@local_xla//xla/client:local_client",
         "@local_xla//xla/pjrt:pjrt_client",
         "@local_xla//xla/service:hlo_proto_cc",
     ],
diff --git a/tensorflow/compiler/jit/device_executable_persistor.h b/tensorflow/compiler/jit/device_executable_persistor.h
index 3911eee8fd02d4..a38be5b5d86ca9 100644
--- a/tensorflow/compiler/jit/device_executable_persistor.h
+++ b/tensorflow/compiler/jit/device_executable_persistor.h
@@ -19,6 +19,7 @@ limitations under the License.
 #include <optional>
 #include <string>
 
+#include "absl/log/log.h"
 #include "tensorflow/compiler/jit/xla_compilation_cache.pb.h"
 #include "tensorflow/compiler/jit/xla_device_compiler_client.h"
 #include "tensorflow/compiler/tf2xla/xla_compiler.h"
@@ -278,8 +279,33 @@ DeviceExecutablePersistor<ExecutableType, ClientType>::SaveSerializedEntry(
     const XlaSerializedCacheEntry& entry) const {
   Env* env = Env::Default();
   TF_RETURN_IF_ERROR(env->RecursivelyCreateDir(persistent_cache_directory_));
-  const std::string file_path = GetFilePath(entry.key());
-  return WriteBinaryProto(env, file_path, entry);
+
+  // The cache on the filesystem can be read while we're writing out the proto.
+  // To prevent reads of partially-written files, we write the proto to a temp
+  // file, then move it into place once we're done writing.  And we warn the
+  // user if these moves are not known to be atomic.
+  bool has_atomic_move = false;
+  env->HasAtomicMove(persistent_cache_directory_, &has_atomic_move)
+      .IgnoreError();
+  if (!has_atomic_move) {
+    LOG_EVERY_POW_2(WARNING)
+        << "Filesystem for XLA persistent cache at "
+        << persistent_cache_directory_
+        << " does not support atomic moves.  Therefore the persistent cache is "
+           "racy if you have multiple XLA compilations occurring "
+           "simultaneously!  You have been warned. :)";
+  }
+
+  // Write to temp location, then when that completes, atomically move into the
+  // final location.
+  std::string temp_path = io::JoinPath(
+      persistent_cache_directory_, XlaSerializedCacheKeyToString(entry.key()));
+  if (!env->CreateUniqueFileName(&temp_path, ".pb.tmp")) {
+    return absl::UnavailableError(absl::StrCat(
+        "Could not create a unique file inside ", persistent_cache_directory_));
+  }
+  TF_RETURN_IF_ERROR(WriteBinaryProto(env, temp_path, entry));
+  return env->RenameFile(temp_path, GetFilePath(entry.key()));
 }
 
 template <typename ExecutableType, typename ClientType>

From 1d649aad42c1293751fd1ac53c138ec41b001f58 Mon Sep 17 00:00:00 2001
From: Majid Dadashi <majiddadashi@google.com>
Date: Tue, 26 Sep 2023 15:25:36 -0700
Subject: [PATCH 290/567] [tflite] Add stablehlo.gather kernel tests

PiperOrigin-RevId: 568664728
---
 .../lite/core/api/flatbuffer_conversions.cc   |  60 +++++++-
 .../lite/core/api/flatbuffer_conversions.h    |   5 +
 tensorflow/lite/core/c/builtin_op_data.h      |  17 +++
 .../lite/kernels/stablehlo_gather_test.cc     | 137 ++++++++++++++++++
 4 files changed, 218 insertions(+), 1 deletion(-)
 create mode 100644 tensorflow/lite/kernels/stablehlo_gather_test.cc

diff --git a/tensorflow/lite/core/api/flatbuffer_conversions.cc b/tensorflow/lite/core/api/flatbuffer_conversions.cc
index e37dfe9389754a..e46ef134b91321 100644
--- a/tensorflow/lite/core/api/flatbuffer_conversions.cc
+++ b/tensorflow/lite/core/api/flatbuffer_conversions.cc
@@ -878,6 +878,9 @@ TfLiteStatus ParseOpDataTfLite(const Operator* op, BuiltinOperator op_type,
       return ParseStablehloRngBitGenerator(op, error_reporter, allocator,
                                            builtin_data);
     }
+    case BuiltinOperator_STABLEHLO_GATHER: {
+      return ParseStablehloGather(op, error_reporter, allocator, builtin_data);
+    }
 
     // TODO: skip param parsing for now since ops below don't have kernels
     case BuiltinOperator_STABLEHLO_SLICE:
@@ -918,7 +921,6 @@ TfLiteStatus ParseOpDataTfLite(const Operator* op, BuiltinOperator op_type,
     case BuiltinOperator_STABLEHLO_REDUCE_WINDOW:
     case BuiltinOperator_STABLEHLO_SORT:
     case BuiltinOperator_STABLEHLO_WHILE:
-    case BuiltinOperator_STABLEHLO_GATHER:
     case BuiltinOperator_STABLEHLO_TRANSPOSE:
 
     // Below are the ops with no builtin_data structure.
@@ -2141,6 +2143,62 @@ TfLiteStatus ParseStablehloRngBitGenerator(const Operator* op,
   return kTfLiteOk;
 }
 
+// Parses the StablehloGatherOptions of `op` into a newly allocated
+// TfLiteStablehloGatherParams that is handed back through `builtin_data`.
+// Each dimension vector is copied into a fixed-size array, so the capacity of
+// that array (not the size of the source vector) is passed as the copy bound.
+TfLiteStatus ParseStablehloGather(const Operator* op,
+                                  ErrorReporter* error_reporter,
+                                  BuiltinDataAllocator* allocator,
+                                  void** builtin_data) {
+  CheckParsePointerParams(op, error_reporter, allocator, builtin_data);
+
+  SafeBuiltinDataAllocator safe_allocator(allocator);
+  std::unique_ptr<TfLiteStablehloGatherParams,
+                  SafeBuiltinDataAllocator::BuiltinDataDeleter>
+      params = safe_allocator.Allocate<TfLiteStablehloGatherParams>();
+  TF_LITE_ENSURE(error_reporter, params != nullptr);
+
+  const StablehloGatherOptions* schema_params =
+      op->builtin_options_2_as_StablehloGatherOptions();
+
+  if (schema_params != nullptr) {
+    // NOTE(review): the helper is expected to reject absent vectors - confirm.
+    TF_LITE_ENSURE_STATUS(FlatBufferIntVectorToArray<int64_t>(
+        sizeof(params->offset_dims), schema_params->offset_dims(),
+        params->offset_dims, error_reporter, "stablehlo_gather"));
+    params->num_offset_dims = schema_params->offset_dims()->size();
+
+    TF_LITE_ENSURE_STATUS(FlatBufferIntVectorToArray<int64_t>(
+        sizeof(params->collapsed_slice_dims),
+        schema_params->collapsed_slice_dims(), params->collapsed_slice_dims,
+        error_reporter, "stablehlo_gather"));
+    params->num_collapsed_slice_dims =
+        schema_params->collapsed_slice_dims()->size();
+
+    TF_LITE_ENSURE_STATUS(FlatBufferIntVectorToArray<int64_t>(
+        sizeof(params->start_index_map), schema_params->start_index_map(),
+        params->start_index_map, error_reporter, "stablehlo_gather"));
+    params->num_start_index_map = schema_params->start_index_map()->size();
+
+    params->index_vector_dim = schema_params->index_vector_dim();
+
+    TF_LITE_ENSURE_STATUS(FlatBufferIntVectorToArray<int64_t>(
+        sizeof(params->slice_sizes), schema_params->slice_sizes(),
+        params->slice_sizes, error_reporter, "stablehlo_gather"));
+    params->num_slice_sizes = schema_params->slice_sizes()->size();
+
+    params->indices_are_sorted = schema_params->indices_are_sorted();
+  } else {
+    // TODO(b/157480169): We should either return kTfLiteError or fill in some
+    // reasonable defaults in the params struct. We are not doing so until we
+    // better understand the ramifications of changing the legacy behavior.
+  }
+
+  *builtin_data = params.release();
+  return kTfLiteOk;
+}
+
 // We have this parse function instead of directly returning kTfLiteOk from the
 // switch-case in ParseOpData because this function is used as part of the
 // selective registration for the OpResolver implementation in micro.
diff --git a/tensorflow/lite/core/api/flatbuffer_conversions.h b/tensorflow/lite/core/api/flatbuffer_conversions.h
index 3f7c27d35d9dcc..9c895b2f89b94a 100644
--- a/tensorflow/lite/core/api/flatbuffer_conversions.h
+++ b/tensorflow/lite/core/api/flatbuffer_conversions.h
@@ -430,6 +430,11 @@ TfLiteStatus ParseStablehloRngBitGenerator(const Operator* op,
                                            BuiltinDataAllocator* allocator,
                                            void** builtin_data);
 
+TfLiteStatus ParseStablehloGather(const Operator* op,
+                                  ErrorReporter* error_reporter,
+                                  BuiltinDataAllocator* allocator,
+                                  void** builtin_data);
+
 }  // namespace tflite
 
 #endif  // TENSORFLOW_LITE_CORE_API_FLATBUFFER_CONVERSIONS_H_
diff --git a/tensorflow/lite/core/c/builtin_op_data.h b/tensorflow/lite/core/c/builtin_op_data.h
index 166c3353547f94..252f2b23b1a69a 100644
--- a/tensorflow/lite/core/c/builtin_op_data.h
+++ b/tensorflow/lite/core/c/builtin_op_data.h
@@ -33,6 +33,7 @@ extern "C" {
 // number of dimensions.
 #define TFLITE_RESHAPE_PARAMS_MAX_DIMENSION_COUNT 8
 #define TFLITE_STABLEHLO_SCATTER_PARAMS_MAX_DIMENSION_COUNT 8
+#define TFLITE_STABLEHLO_GATHER_PARAMS_MAX_DIMENSION_COUNT 8
 
 // TODO(aselle): Consider using "if this then that" for testing.
 
@@ -588,6 +589,22 @@ typedef struct {
   TfLiteRngAlgorithm algorithm;
 } TfLiteStablehloRngBitGeneratorParams;
 
+typedef struct {
+  // See the stablehlo spec for the explanation of the attributes:
+  // https://github.com/openxla/stablehlo/blob/main/docs/spec.md#gather
+  int64_t offset_dims[TFLITE_STABLEHLO_GATHER_PARAMS_MAX_DIMENSION_COUNT];
+  int num_offset_dims;
+  int64_t
+      collapsed_slice_dims[TFLITE_STABLEHLO_GATHER_PARAMS_MAX_DIMENSION_COUNT];
+  int num_collapsed_slice_dims;
+  int64_t start_index_map[TFLITE_STABLEHLO_GATHER_PARAMS_MAX_DIMENSION_COUNT];
+  int num_start_index_map;
+  int64_t index_vector_dim;
+  int64_t slice_sizes[TFLITE_STABLEHLO_GATHER_PARAMS_MAX_DIMENSION_COUNT];
+  int num_slice_sizes;
+  bool indices_are_sorted;
+} TfLiteStablehloGatherParams;
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif  // __cplusplus
diff --git a/tensorflow/lite/kernels/stablehlo_gather_test.cc b/tensorflow/lite/kernels/stablehlo_gather_test.cc
new file mode 100644
index 00000000000000..1c27809301891a
--- /dev/null
+++ b/tensorflow/lite/kernels/stablehlo_gather_test.cc
@@ -0,0 +1,137 @@
+/* Copyright 2023 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <cstdint>
+#include <initializer_list>
+#include <memory>
+#include <vector>
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+#include "tensorflow/lite/c/c_api_types.h"
+#include "tensorflow/lite/core/c/builtin_op_data.h"
+#include "tensorflow/lite/kernels/test_util.h"
+#include "tensorflow/lite/schema/schema_generated.h"
+
+namespace tflite {
+namespace {
+
+using ::testing::ElementsAreArray;
+
+class StablehloGatherOpModel : public SingleOpModel {
+ public:
+  StablehloGatherOpModel(const TensorData& input, const TensorData& indices,
+                         const TfLiteStablehloGatherParams& params) {
+    input_ = AddInput(input);
+    indices_ = AddInput(indices);
+    output_ = AddOutput(TensorData(input.type, {2, 3, 2, 2}));
+    SetBuiltinOp(
+        BuiltinOperator_STABLEHLO_GATHER,
+        BuiltinOptions2_StablehloGatherOptions,
+        CreateStablehloGatherOptions(
+            builder_,
+            builder_.CreateVector(
+                std::vector(params.offset_dims,
+                            params.offset_dims + params.num_offset_dims)),
+            builder_.CreateVector(std::vector(
+                params.collapsed_slice_dims,
+                params.collapsed_slice_dims + params.num_collapsed_slice_dims)),
+            builder_.CreateVector(std::vector(
+                params.start_index_map,
+                params.start_index_map + params.num_start_index_map)),
+            params.index_vector_dim,
+            builder_.CreateVector(
+                std::vector(params.slice_sizes,
+                            params.slice_sizes + params.num_slice_sizes)),
+            params.indices_are_sorted)
+            .Union());
+    BuildInterpreter({GetShape(input_), GetShape(indices_)});
+  }
+
+  template <typename T>
+  void SetInput(std::initializer_list<T> data) {
+    PopulateTensor<T>(input_, data);
+  }
+
+  template <typename T>
+  void SetIndices(std::initializer_list<T> data) {
+    PopulateTensor<T>(indices_, data);
+  }
+
+  template <typename T>
+  std::vector<T> GetOutput() {
+    return ExtractVector<T>(output_);
+  }
+
+ protected:
+  int input_;
+  int indices_;
+  int output_;
+};
+
+TEST(StablehloGatherOpTest, GathersSlices) {
+  TfLiteStablehloGatherParams params = {
+      {2, 3},     // offset_dims
+      2,          // num_offset_dims
+      {0},        // collapsed_slice_dims
+      1,          // num_collapsed_slice_dims
+      {1, 0},     // start_index_map
+      2,          // num_start_index_map
+      2,          // index_vector_dim
+      {1, 2, 2},  // slice_sizes
+      3,          // num_slice_sizes
+      false       // indices_are_sorted
+  };
+  StablehloGatherOpModel model({TensorType_FLOAT32, {3, 4, 2}},
+                               {TensorType_INT64, {2, 3, 2}}, params);
+  model.SetInput<float>({1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12,
+                         13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24});
+  model.SetIndices<int64_t>({0, 0, 1, 0, 2, 1, 0, 1, 1, 1, 0, 2});
+  // Baseline case: every start index is within the 3x4x2 operand's bounds.
+  ASSERT_EQ(model.Invoke(), kTfLiteOk);
+  std::vector<float> expected_values = {1,  2,  3,  4,  3,  4,  5,  6,
+                                        13, 14, 15, 16, 9,  10, 11, 12,
+                                        11, 12, 13, 14, 17, 18, 19, 20};
+  EXPECT_THAT(model.GetOutput<float>(), ElementsAreArray(expected_values));
+}
+
+TEST(StablehloGatherOpTest, ClipsStartingIndices) {
+  TfLiteStablehloGatherParams params = {
+      {2, 3},     // offset_dims
+      2,          // num_offset_dims
+      {0},        // collapsed_slice_dims
+      1,          // num_collapsed_slice_dims
+      {1, 0},     // start_index_map
+      2,          // num_start_index_map
+      2,          // index_vector_dim
+      {1, 2, 2},  // slice_sizes
+      3,          // num_slice_sizes
+      false       // indices_are_sorted
+  };
+  StablehloGatherOpModel model({TensorType_FLOAT32, {3, 4, 2}},
+                               {TensorType_INT64, {2, 3, 2}}, params);
+  model.SetInput<float>({1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12,
+                         13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24});
+  model.SetIndices<int64_t>({0, 0, 1, 0, 2, 1, 0, 1, 1, 1, 0, 9});
+  // Only the last index differs from GathersSlices (9, not 2); same output.
+  ASSERT_EQ(model.Invoke(), kTfLiteOk);
+  std::vector<float> expected_values = {1,  2,  3,  4,  3,  4,  5,  6,
+                                        13, 14, 15, 16, 9,  10, 11, 12,
+                                        11, 12, 13, 14, 17, 18, 19, 20};
+  EXPECT_THAT(model.GetOutput<float>(), ElementsAreArray(expected_values));
+}
+
+}  // namespace
+}  // namespace tflite

From 550729cf2d4ab18901372d4eb28e196521c40efe Mon Sep 17 00:00:00 2001
From: Jian Cai <jiancai@google.com>
Date: Tue, 26 Sep 2023 15:36:00 -0700
Subject: [PATCH 291/567] Keep attributes in TableGen canonicalization rewrites

By default, pattern rewrites do not preserve op attributes. This is problematic for GPU graphs, where the device placement of ops is stored in the "device" attribute. Losing this information caused significant performance loss in some cases. This fixes the issue by copying `device` and other underscored attributes for patterns defined in TF2XLA MLIR phase 1 bridges.

PiperOrigin-RevId: 568667444
---
 tensorflow/compiler/mlir/tensorflow/BUILD     |   1 +
 .../mlir/tensorflow/tests/canonicalize.mlir   | 264 ++++++++--------
 .../compiler/mlir/tensorflow/transforms/BUILD |   1 +
 .../tensorflow/transforms/canonicalize.td     | 290 ++++++++++--------
 .../tensorflow/transforms/rewrite_util.cc     |  21 +-
 .../mlir/tensorflow/transforms/rewrite_util.h |  10 +-
 6 files changed, 321 insertions(+), 266 deletions(-)

diff --git a/tensorflow/compiler/mlir/tensorflow/BUILD b/tensorflow/compiler/mlir/tensorflow/BUILD
index 4a87632a35770c..78d87e81c45525 100644
--- a/tensorflow/compiler/mlir/tensorflow/BUILD
+++ b/tensorflow/compiler/mlir/tensorflow/BUILD
@@ -400,6 +400,7 @@ cc_library(
         "ir/tfrt_ops.h.inc",
     ] + ["ir/tf_" + target["name"] + ".h.inc" for target in tf_ops_category_list],
     deps = [
+        ":attribute_utils",
         ":serialize_mlir_module_utils",
         ":tensorflow_attributes",
         ":tensorflow_op_interfaces",
diff --git a/tensorflow/compiler/mlir/tensorflow/tests/canonicalize.mlir b/tensorflow/compiler/mlir/tensorflow/tests/canonicalize.mlir
index 84a985b0449ac8..d6c33ce4e4d53e 100644
--- a/tensorflow/compiler/mlir/tensorflow/tests/canonicalize.mlir
+++ b/tensorflow/compiler/mlir/tensorflow/tests/canonicalize.mlir
@@ -20,47 +20,46 @@ func.func @tfAssertFalse(%arg0: tensor<1x1x6x2xf32>) {
 // Ensures that axis param and batch_dims attr use their default values of 0.
 func.func @testGatherToV2(%params: tensor<4x3xf32>, %indices: tensor<1x2xi32>) -> tensor<2x3xf32> {
   // CHECK: %[[AXIS:.*]] = "tf.Const"() {value = dense<0> : tensor<i32>} : () -> tensor<i32>
-  // CHECK: "tf.GatherV2"(%arg0, %arg1, %[[AXIS]]) {batch_dims = 0 : i64} : (tensor<4x3xf32>, tensor<1x2xi32>, tensor<i32>) -> tensor<2x3xf32>
-  %0 = "tf.Gather"(%params, %indices) : (tensor<4x3xf32>, tensor<1x2xi32>) -> tensor<2x3xf32>
+  // CHECK: "tf.GatherV2"(%arg0, %arg1, %[[AXIS]]) {batch_dims = 0 : i64, device = "/job:localhost/replica:0/task:0/device:GPU:0"} : (tensor<4x3xf32>, tensor<1x2xi32>, tensor<i32>) -> tensor<2x3xf32>
+  %0 = "tf.Gather"(%params, %indices) {device = "/job:localhost/replica:0/task:0/device:GPU:0"} : (tensor<4x3xf32>, tensor<1x2xi32>) -> tensor<2x3xf32>
   func.return %0: tensor<2x3xf32>
 }
 
 // CHECK-LABEL: testBatchMatMulToV2
 func.func @testBatchMatMulToV2(%arg0: tensor<2x3x5xf32>, %arg1: tensor<2x5x7xf32>) -> tensor<2x3x7xf32> {
-  // CHECK: tf.BatchMatMulV2
-  %0 = "tf.BatchMatMul"(%arg0, %arg1) {adj_x = false, adj_y = false} : (tensor<2x3x5xf32>, tensor<2x5x7xf32>) -> tensor<2x3x7xf32>
+  // CHECK: "tf.BatchMatMulV2"(%arg0, %arg1) {adj_x = false, adj_y = false, device = "/job:localhost/replica:0/task:0/device:GPU:0"}
+  %0 = "tf.BatchMatMul"(%arg0, %arg1) {adj_x = false, adj_y = false, device = "/job:localhost/replica:0/task:0/device:GPU:0"} : (tensor<2x3x5xf32>, tensor<2x5x7xf32>) -> tensor<2x3x7xf32>
   func.return %0: tensor<2x3x7xf32>
 }
 
 // CHECK-LABEL: testDynamicBatchMatMulToV2
 func.func @testDynamicBatchMatMulToV2(%arg0: tensor<2x3x5xf32>, %arg1: tensor<?x5x7xf32>) -> tensor<2x3x7xf32> {
-  // CHECK: tf.BatchMatMul
-  %0 = "tf.BatchMatMul"(%arg0, %arg1) {adj_x = false, adj_y = false} : (tensor<2x3x5xf32>, tensor<?x5x7xf32>) -> tensor<2x3x7xf32>
+  // CHECK: "tf.BatchMatMul"(%arg0, %arg1) {adj_x = false, adj_y = false, device = "/job:localhost/replica:0/task:0/device:GPU:0"}
+  %0 = "tf.BatchMatMul"(%arg0, %arg1) {adj_x = false, adj_y = false, device = "/job:localhost/replica:0/task:0/device:GPU:0"} : (tensor<2x3x5xf32>, tensor<?x5x7xf32>) -> tensor<2x3x7xf32>
   func.return %0: tensor<2x3x7xf32>
 }
 
 // CHECK-LABEL: testBatchMatMulToMatMul
 func.func @testBatchMatMulToMatMul(%arg0: tensor<2x3xf32>, %arg1: tensor<3x2xf32>) -> tensor<2x2xf32> {
-  %0 = "tf.BatchMatMul"(%arg0, %arg1) {adj_x = false, adj_y = false} : (tensor<2x3xf32>, tensor<3x2xf32>) -> tensor<2x2xf32>
+  // CHECK: %0 = "tf.MatMul"(%arg0, %arg1) {device = "/job:localhost/replica:0/task:0/device:GPU:0", transpose_a = false, transpose_b = false} : (tensor<2x3xf32>, tensor<3x2xf32>) -> tensor<2x2xf32>
+  %0 = "tf.BatchMatMul"(%arg0, %arg1) {adj_x = false, adj_y = false, device = "/job:localhost/replica:0/task:0/device:GPU:0"} : (tensor<2x3xf32>, tensor<3x2xf32>) -> tensor<2x2xf32>
+  // CHECK: return %0
   func.return %0: tensor<2x2xf32>
-
-// CHECK: %0 = "tf.MatMul"(%arg0, %arg1) {transpose_a = false, transpose_b = false} : (tensor<2x3xf32>, tensor<3x2xf32>) -> tensor<2x2xf32>
-// CHECK: return %0
 }
 
 // CHECK-LABEL: testBatchMatMulV2ToMatMul
 func.func @testBatchMatMulV2ToMatMul(%arg0: tensor<4x3xf32>, %arg1: tensor<4x5xf32>) -> tensor<3x5xf32> {
-  %0 = "tf.BatchMatMulV2"(%arg0, %arg1) {adj_x = true, adj_y = false} : (tensor<4x3xf32>, tensor<4x5xf32>) -> tensor<3x5xf32>
+  // CHECK: %0 = "tf.MatMul"(%arg0, %arg1) {device = "/job:localhost/replica:0/task:0/device:GPU:0", transpose_a = true, transpose_b = false} : (tensor<4x3xf32>, tensor<4x5xf32>) -> tensor<3x5xf32>
+  %0 = "tf.BatchMatMulV2"(%arg0, %arg1) {adj_x = true, adj_y = false, device = "/job:localhost/replica:0/task:0/device:GPU:0"} : (tensor<4x3xf32>, tensor<4x5xf32>) -> tensor<3x5xf32>
+  // CHECK: return %0
   func.return %0: tensor<3x5xf32>
 
-// CHECK: %0 = "tf.MatMul"(%arg0, %arg1) {transpose_a = true, transpose_b = false} : (tensor<4x3xf32>, tensor<4x5xf32>) -> tensor<3x5xf32>
-// CHECK: return %0
 }
 
 // CHECK-LABEL: testBiasAddV1ToBiasAdd
 func.func @testBiasAddV1ToBiasAdd(%arg0: tensor<*xf32>, %arg1: tensor<128xf32>) -> tensor<*xf32> {
-  // CHECK: "tf.BiasAdd"(%arg0, %arg1) {data_format = "NHWC"} : (tensor<*xf32>, tensor<128xf32>) -> tensor<*xf32>
-  %0 = "tf.BiasAddV1"(%arg0, %arg1) : (tensor<*xf32>, tensor<128xf32>) -> tensor<*xf32>
+  // CHECK: "tf.BiasAdd"(%arg0, %arg1) {data_format = "NHWC", device = "/job:localhost/replica:0/task:0/device:GPU:0"} : (tensor<*xf32>, tensor<128xf32>) -> tensor<*xf32>
+  %0 = "tf.BiasAddV1"(%arg0, %arg1) {device = "/job:localhost/replica:0/task:0/device:GPU:0"} : (tensor<*xf32>, tensor<128xf32>) -> tensor<*xf32>
   func.return %0: tensor<*xf32>
 }
 
@@ -72,43 +71,42 @@ func.func @testLeakyRelu(%arg0 : tensor<16xf32>) -> (tensor<16xf32>) {
 }
 
 // CHECK-LABEL: testSameBitcastType
-func.func @testSameBitcastType(%arg0: tensor<8x16x32x64xf32>) -> tensor<8x16x32x64xf32> {
+// CHECK-SAME: (%arg0: tensor<8x16x32x64xf32> {tf.device = "/job:localhost/replica:0/task:0/device:GPU:0"})
+func.func @testSameBitcastType(%arg0: tensor<8x16x32x64xf32> {tf.device = "/job:localhost/replica:0/task:0/device:GPU:0"}) -> tensor<8x16x32x64xf32> {
   %0 = "tf.Bitcast"(%arg0) : (tensor<8x16x32x64xf32>) -> tensor<8x16x32x64xf32>
+  // CHECK: return %arg0
   func.return %0: tensor<8x16x32x64xf32>
-
-// CHECK: return %arg0
 }
 
 // CHECK-LABEL: testDifferentBitcastType
 func.func @testDifferentBitcastType(%arg0: tensor<8x16x32x64xf32>) -> tensor<8x16x32x64xi32> {
+  // CHECK: %0 = "tf.Bitcast"(%arg0) : (tensor<8x16x32x64xf32>) -> tensor<8x16x32x64xi32>
   %0 = "tf.Bitcast"(%arg0) : (tensor<8x16x32x64xf32>) -> tensor<8x16x32x64xi32>
+  // CHECK: return %0
   func.return %0: tensor<8x16x32x64xi32>
-
-// CHECK: %0 = "tf.Bitcast"(%arg0) : (tensor<8x16x32x64xf32>) -> tensor<8x16x32x64xi32>
-// CHECK: return %0
 }
 
 // CHECK-LABEL: testDoubleBitcast
 func.func @testDoubleBitcast(%arg0: tensor<8x16x32x64xf32>) -> tensor<8x16x32x64xi32> {
   %0 = "tf.Bitcast"(%arg0) : (tensor<8x16x32x64xf32>) -> tensor<8x16x32x64x2xi16>
-  %1 = "tf.Bitcast"(%0) : (tensor<8x16x32x64x2xi16>) -> tensor<8x16x32x64xi32>
+  %1 = "tf.Bitcast"(%0) {device = "/job:localhost/replica:0/task:0/device:GPU:0"} : (tensor<8x16x32x64x2xi16>) -> tensor<8x16x32x64xi32>
   func.return %1: tensor<8x16x32x64xi32>
 
-// CHECK: %0 = "tf.Bitcast"(%arg0) : (tensor<8x16x32x64xf32>) -> tensor<8x16x32x64xi32>
-// CHECK: return %0
+  // CHECK: %0 = "tf.Bitcast"(%arg0) {device = "/job:localhost/replica:0/task:0/device:GPU:0"} : (tensor<8x16x32x64xf32>) -> tensor<8x16x32x64xi32>
+  // CHECK: return %0
 }
 
 // CHECK-LABEL: testDoubleBitcastWithDependentArg
 func.func @testDoubleBitcastWithDependentArg(%arg0: tensor<8x16x32x64xf32>) -> (tensor<8x16x32x64xi32>, tensor<8x16x32x64x2xi16>) {
   %0 = "tf.Bitcast"(%arg0) : (tensor<8x16x32x64xf32>) -> tensor<8x16x32x64x2xi16>
-  %1 = "tf.Bitcast"(%0) : (tensor<8x16x32x64x2xi16>) -> tensor<8x16x32x64xi32>
+  %1 = "tf.Bitcast"(%0) {device = "/job:localhost/replica:0/task:0/device:GPU:0"} : (tensor<8x16x32x64x2xi16>) -> tensor<8x16x32x64xi32>
   %2 = "tf.Identity"(%0) : (tensor<8x16x32x64x2xi16>) -> tensor<8x16x32x64x2xi16>
   func.return %1, %2 :  tensor<8x16x32x64xi32>, tensor<8x16x32x64x2xi16>
 
-// CHECK: %0 = "tf.Bitcast"(%arg0) : (tensor<8x16x32x64xf32>) -> tensor<8x16x32x64x2xi16>
-// CHECK: %1 = "tf.Bitcast"(%arg0) : (tensor<8x16x32x64xf32>) -> tensor<8x16x32x64xi32>
-// CHECK: %2 = "tf.Identity"(%0) : (tensor<8x16x32x64x2xi16>) -> tensor<8x16x32x64x2xi16>
-// CHECK: return %1, %2
+  // CHECK: %0 = "tf.Bitcast"(%arg0) : (tensor<8x16x32x64xf32>) -> tensor<8x16x32x64x2xi16>
+  // CHECK: %1 = "tf.Bitcast"(%arg0) {device = "/job:localhost/replica:0/task:0/device:GPU:0"} : (tensor<8x16x32x64xf32>) -> tensor<8x16x32x64xi32>
+  // CHECK: %2 = "tf.Identity"(%0) : (tensor<8x16x32x64x2xi16>) -> tensor<8x16x32x64x2xi16>
+  // CHECK: return %1, %2
 }
 
 // CHECK-LABEL: testSameCastType
@@ -117,7 +115,7 @@ func.func @testSameCastType(%arg0: tensor<8x16x32x64xf32>) -> (tensor<8x16x32x64
   %1 = "tf.Cast"(%arg0) {Truncate = true} : (tensor<8x16x32x64xf32>) -> tensor<8x16x32x64xf32>
   func.return %0, %1: tensor<8x16x32x64xf32>, tensor<8x16x32x64xf32>
 
-// CHECK: return %arg0, %arg0
+  // CHECK: return %arg0, %arg0
 }
 
 // CHECK-LABEL: testDifferentCastType
@@ -126,9 +124,9 @@ func.func @testDifferentCastType(%arg0: tensor<8x16x32x64xf32>) -> (tensor<8x16x
   %1 = "tf.Cast"(%arg0) {Truncate = true} : (tensor<8x16x32x64xf32>) -> tensor<8x16x32x64xi32>
   func.return %0, %1: tensor<8x16x32x64xi32>, tensor<8x16x32x64xi32>
 
-// CHECK: %0 = "tf.Cast"(%arg0) {Truncate = false} : (tensor<8x16x32x64xf32>) -> tensor<8x16x32x64xi32>
-// CHECK: %1 = "tf.Cast"(%arg0) {Truncate = true} : (tensor<8x16x32x64xf32>) -> tensor<8x16x32x64xi32>
-// CHECK: return %0, %1
+  // CHECK: %0 = "tf.Cast"(%arg0) {Truncate = false} : (tensor<8x16x32x64xf32>) -> tensor<8x16x32x64xi32>
+  // CHECK: %1 = "tf.Cast"(%arg0) {Truncate = true} : (tensor<8x16x32x64xf32>) -> tensor<8x16x32x64xi32>
+  // CHECK: return %0, %1
 }
 
 // CHECK-LABEL: testCompatibleCastType
@@ -137,9 +135,9 @@ func.func @testCompatibleCastType(%arg0: tensor<?xf32>) -> (tensor<10xf32>, tens
   %1 = "tf.Cast"(%arg0) {Truncate = true} : (tensor<?xf32>) -> tensor<10xf32>
   func.return %0, %1: tensor<10xf32>, tensor<10xf32>
 
-// CHECK: %0 = "tf.Cast"(%arg0) {Truncate = false} : (tensor<?xf32>) -> tensor<10xf32>
-// CHECK: %1 = "tf.Cast"(%arg0) {Truncate = true} : (tensor<?xf32>) -> tensor<10xf32>
-// CHECK: return %0, %1
+  // CHECK: %0 = "tf.Cast"(%arg0) {Truncate = false} : (tensor<?xf32>) -> tensor<10xf32>
+  // CHECK: %1 = "tf.Cast"(%arg0) {Truncate = true} : (tensor<?xf32>) -> tensor<10xf32>
+  // CHECK: return %0, %1
 }
 
 // CHECK-LABEL: testSameCastTypeAcrossBasicBlocks
@@ -153,7 +151,7 @@ func.func @testSameCastTypeAcrossBasicBlocks(tensor<8x16x32x64xf32>) -> tensor<8
 ^exit:
   func.return %1: tensor<8x16x32x64xf32>
 
-// CHECK: return %arg0
+  // CHECK: return %arg0
 }
 
 // CHECK-LABEL: testConcatCanonicalization
@@ -161,8 +159,8 @@ func.func @testConcatCanonicalization(%arg0: tensor<2x1xi32>, %arg1: tensor<2x1x
   // CHECK: %[[AXIS:.*]] = "tf.Const"
   %0 = "tf.Const"() { value = dense<1> : tensor<i32> } : () -> tensor<i32>
 
-  // CHECK: "tf.ConcatV2"(%arg0, %arg1, %[[AXIS]])
-  %1 = "tf.Concat"(%0, %arg0, %arg1) : (tensor<i32>, tensor<2x1xi32>, tensor<2x1xi32>) -> tensor<2x2xi32>
+  // CHECK: "tf.ConcatV2"(%arg0, %arg1, %[[AXIS]]) {device = "/job:localhost/replica:0/task:0/device:GPU:0"}
+  %1 = "tf.Concat"(%0, %arg0, %arg1) {device = "/job:localhost/replica:0/task:0/device:GPU:0"} : (tensor<i32>, tensor<2x1xi32>, tensor<2x1xi32>) -> tensor<2x2xi32>
   func.return %1 : tensor<2x2xi32>
 }
 
@@ -331,11 +329,11 @@ func.func @testConcatCwiseBinarySynthMulOp2Inputs(%arg0: tensor<?x1xf32>, %arg1:
 // CHECK-LABEL: testLogOfSoftmax
 func.func @testLogOfSoftmax(%arg0: tensor<8x16xf32>) -> tensor<8x16xf32> {
   %0 = "tf.Softmax"(%arg0) : (tensor<8x16xf32>) -> tensor<8x16xf32>
-  %1 = "tf.Log"(%0) : (tensor<8x16xf32>) -> tensor<8x16xf32>
+  %1 = "tf.Log"(%0) {device = "/job:localhost/replica:0/task:0/device:GPU:0"} : (tensor<8x16xf32>) -> tensor<8x16xf32>
   func.return %1: tensor<8x16xf32>
 
-// CHECK: %0 = "tf.LogSoftmax"(%arg0) : (tensor<8x16xf32>) -> tensor<8x16xf32>
-// CHECK: return %0
+  // CHECK: %0 = "tf.LogSoftmax"(%arg0) {device = "/job:localhost/replica:0/task:0/device:GPU:0"} : (tensor<8x16xf32>) -> tensor<8x16xf32>
+  // CHECK: return %0
 }
 
 // CHECK-LABEL: testLogToLog1p
@@ -344,9 +342,9 @@ func.func @testLogToLog1p(%arg0 : tensor<4x4xf32>) -> tensor<4x4xf32> {
   %1 = "tf.Const"() {value = dense<2.0> : tensor<f32>} : () -> tensor<1xf32>
   %2 = "tf.Const"() {value = dense<[1.0, 1.0, 1.0, 1.0]> : tensor<4xf32>} : () -> tensor<4xf32>
 
-  // CHECK: %[[LOGP:.*]] = "tf.Log1p"(%arg0) : (tensor<4x4xf32>) -> tensor<4x4xf32>
+  // CHECK: %[[LOGP:.*]] = "tf.Log1p"(%arg0) {device = "/job:localhost/replica:0/task:0/device:GPU:0"} : (tensor<4x4xf32>) -> tensor<4x4xf32>
   %3 = "tf.AddV2"(%arg0, %0): (tensor<4x4xf32>, tensor<1xf32>) -> tensor<4x4xf32>
-  %4 = "tf.Log"(%3): (tensor<4x4xf32>) -> tensor<4x4xf32>
+  %4 = "tf.Log"(%3) {device = "/job:localhost/replica:0/task:0/device:GPU:0"}: (tensor<4x4xf32>) -> tensor<4x4xf32>
 
   // CHECK: %[[ADD1:.*]] = "tf.AddV2"
   // CHECK: %[[LOG1:.*]] = "tf.Log"(%[[ADD1]])
@@ -368,10 +366,10 @@ func.func @testLogToLog1p(%arg0 : tensor<4x4xf32>) -> tensor<4x4xf32> {
 // CHECK-LABEL: testSubOfNeg
 func.func @testSubOfNeg(%arg0: tensor<8x16xf32>, %arg1: tensor<8x16xf32>) -> tensor<8x16xf32> {
   %0 = "tf.Neg"(%arg1) : (tensor<8x16xf32>) -> tensor<8x16xf32>
-  %1 = "tf.Sub"(%arg0, %0) : (tensor<8x16xf32>, tensor<8x16xf32>) -> tensor<8x16xf32>
+  %1 = "tf.Sub"(%arg0, %0) {device = "/job:localhost/replica:0/task:0/device:GPU:0"} : (tensor<8x16xf32>, tensor<8x16xf32>) -> tensor<8x16xf32>
   func.return %1: tensor<8x16xf32>
 
-// CHECK: %0 = "tf.AddV2"(%arg0, %arg1) : (tensor<8x16xf32>, tensor<8x16xf32>) -> tensor<8x16xf32>
+// CHECK: %0 = "tf.AddV2"(%arg0, %arg1) {device = "/job:localhost/replica:0/task:0/device:GPU:0"} : (tensor<8x16xf32>, tensor<8x16xf32>) -> tensor<8x16xf32>
 // CHECK: return %0
 }
 
@@ -401,20 +399,20 @@ func.func @testSubOfZeroWithBroadcasting(%arg0: tensor<4x1xf32>) -> tensor<4x4xf
 // CHECK-LABEL: testSquareOfSub
 func.func @testSquareOfSub(%arg0: tensor<8x16xf32>, %arg1: tensor<8x16xf32>) -> tensor<8x16xf32> {
   %0 = "tf.Sub"(%arg0, %arg1) : (tensor<8x16xf32>, tensor<8x16xf32>) -> tensor<8x16xf32>
-  %1 = "tf.Square"(%0) : (tensor<8x16xf32>) -> tensor<8x16xf32>
+  %1 = "tf.Square"(%0) {device = "/job:localhost/replica:0/task:0/device:GPU:0"} : (tensor<8x16xf32>) -> tensor<8x16xf32>
   func.return %1: tensor<8x16xf32>
 
-// CHECK: %0 = "tf.SquaredDifference"(%arg0, %arg1) : (tensor<8x16xf32>, tensor<8x16xf32>) -> tensor<8x16xf32>
-// CHECK: return %0
+  // CHECK: %0 = "tf.SquaredDifference"(%arg0, %arg1) {device = "/job:localhost/replica:0/task:0/device:GPU:0"} : (tensor<8x16xf32>, tensor<8x16xf32>) -> tensor<8x16xf32>
+  // CHECK: return %0
 }
 
 // CHECK-LABEL: testAddToAddV2
 func.func @testAddToAddV2(%arg0: tensor<8x16xf32>, %arg1: tensor<8x16xf32>) -> tensor<8x16xf32> {
-  %0 = "tf.Add"(%arg0, %arg1) : (tensor<8x16xf32>, tensor<8x16xf32>) -> tensor<8x16xf32>
+  %0 = "tf.Add"(%arg0, %arg1) {device = "/job:localhost/replica:0/task:0/device:GPU:0"} : (tensor<8x16xf32>, tensor<8x16xf32>) -> tensor<8x16xf32>
   func.return %0: tensor<8x16xf32>
 
-// CHECK: %0 = "tf.AddV2"(%arg0, %arg1) : (tensor<8x16xf32>, tensor<8x16xf32>) -> tensor<8x16xf32>
-// CHECK: return %0
+  // CHECK: %0 = "tf.AddV2"(%arg0, %arg1) {device = "/job:localhost/replica:0/task:0/device:GPU:0"} : (tensor<8x16xf32>, tensor<8x16xf32>) -> tensor<8x16xf32>
+  // CHECK: return %0
 }
 
 // CHECK-LABEL: testNoAddToAddV2ForStringType
@@ -422,46 +420,46 @@ func.func @testNoAddToAddV2ForStringType(%arg0: tensor<8x16x!tf_type.string>, %a
   %0 = "tf.Add"(%arg0, %arg1) : (tensor<8x16x!tf_type.string>, tensor<8x16x!tf_type.string>) -> tensor<8x16x!tf_type.string>
   func.return %0: tensor<8x16x!tf_type.string>
 
-// CHECK: %0 = "tf.Add"(%arg0, %arg1) : (tensor<8x16x!tf_type.string>, tensor<8x16x!tf_type.string>) -> tensor<8x16x!tf_type.string>
-// CHECK: return %0
+  // CHECK: %0 = "tf.Add"(%arg0, %arg1) : (tensor<8x16x!tf_type.string>, tensor<8x16x!tf_type.string>) -> tensor<8x16x!tf_type.string>
+  // CHECK: return %0
 }
 
 // CHECK-LABEL: testAddOfNegLeft
 func.func @testAddOfNegLeft(%arg0: tensor<8x16xf32>, %arg1: tensor<8x16xf32>) -> tensor<8x16xf32> {
   %0 = "tf.Neg"(%arg0) : (tensor<8x16xf32>) -> tensor<8x16xf32>
-  %1 = "tf.Add"(%0, %arg1) : (tensor<8x16xf32>, tensor<8x16xf32>) -> tensor<8x16xf32>
+  %1 = "tf.Add"(%0, %arg1) {device = "/job:localhost/replica:0/task:0/device:GPU:0"} : (tensor<8x16xf32>, tensor<8x16xf32>) -> tensor<8x16xf32>
   func.return %1: tensor<8x16xf32>
 
-// CHECK: %0 = "tf.Sub"(%arg1, %arg0) : (tensor<8x16xf32>, tensor<8x16xf32>) -> tensor<8x16xf32>
-// CHECK: return %0
+  // CHECK: %0 = "tf.Sub"(%arg1, %arg0) {device = "/job:localhost/replica:0/task:0/device:GPU:0"} : (tensor<8x16xf32>, tensor<8x16xf32>) -> tensor<8x16xf32>
+  // CHECK: return %0
 }
 
 // CHECK-LABEL: testAddOfNegRight
 func.func @testAddOfNegRight(%arg0: tensor<8x16xf32>, %arg1: tensor<8x16xf32>) -> tensor<8x16xf32> {
   %0 = "tf.Neg"(%arg1) : (tensor<8x16xf32>) -> tensor<8x16xf32>
-  %1 = "tf.Add"(%arg0, %0) : (tensor<8x16xf32>, tensor<8x16xf32>) -> tensor<8x16xf32>
+  %1 = "tf.Add"(%arg0, %0) {device = "/job:localhost/replica:0/task:0/device:GPU:0"} : (tensor<8x16xf32>, tensor<8x16xf32>) -> tensor<8x16xf32>
   func.return %1: tensor<8x16xf32>
 
-// CHECK: %0 = "tf.Sub"(%arg0, %arg1) : (tensor<8x16xf32>, tensor<8x16xf32>) -> tensor<8x16xf32>
-// CHECK: return %0
+  // CHECK: %0 = "tf.Sub"(%arg0, %arg1) {device = "/job:localhost/replica:0/task:0/device:GPU:0"} : (tensor<8x16xf32>, tensor<8x16xf32>) -> tensor<8x16xf32>
+  // CHECK: return %0
 }
 
 // CHECK-LABEL: testAddV2OfNegLeft
 func.func @testAddV2OfNegLeft(%arg0: tensor<8x16xf32>, %arg1: tensor<8x16xf32>) -> tensor<8x16xf32> {
   %0 = "tf.Neg"(%arg0) : (tensor<8x16xf32>) -> tensor<8x16xf32>
-  %1 = "tf.AddV2"(%0, %arg1) : (tensor<8x16xf32>, tensor<8x16xf32>) -> tensor<8x16xf32>
+  %1 = "tf.AddV2"(%0, %arg1) {device = "/job:localhost/replica:0/task:0/device:GPU:0"} : (tensor<8x16xf32>, tensor<8x16xf32>) -> tensor<8x16xf32>
   func.return %1: tensor<8x16xf32>
-// CHECK: %0 = "tf.Sub"(%arg1, %arg0) : (tensor<8x16xf32>, tensor<8x16xf32>) -> tensor<8x16xf32>
+// CHECK: %0 = "tf.Sub"(%arg1, %arg0) {device = "/job:localhost/replica:0/task:0/device:GPU:0"} : (tensor<8x16xf32>, tensor<8x16xf32>) -> tensor<8x16xf32>
 // CHECK: return %0
 }
 
 // CHECK-LABEL: testAddV2OfNegRight
 func.func @testAddV2OfNegRight(%arg0: tensor<8x16xf32>, %arg1: tensor<8x16xf32>) -> tensor<8x16xf32> {
   %0 = "tf.Neg"(%arg1) : (tensor<8x16xf32>) -> tensor<8x16xf32>
-  %1 = "tf.AddV2"(%arg0, %0) : (tensor<8x16xf32>, tensor<8x16xf32>) -> tensor<8x16xf32>
+  %1 = "tf.AddV2"(%arg0, %0) {device = "/job:localhost/replica:0/task:0/device:GPU:0"} : (tensor<8x16xf32>, tensor<8x16xf32>) -> tensor<8x16xf32>
   func.return %1: tensor<8x16xf32>
 
-// CHECK: %0 = "tf.Sub"(%arg0, %arg1) : (tensor<8x16xf32>, tensor<8x16xf32>) -> tensor<8x16xf32>
+// CHECK: %0 = "tf.Sub"(%arg0, %arg1) {device = "/job:localhost/replica:0/task:0/device:GPU:0"} : (tensor<8x16xf32>, tensor<8x16xf32>) -> tensor<8x16xf32>
 // CHECK: return %0
 }
 
@@ -578,12 +576,12 @@ func.func @testRedundantReshape(%arg0: tensor<4x4xi32>) -> tensor<2x8xi32> {
   %0 = "tf.Const"() {value = dense<[8, 2]> : tensor<2xi32>} : () -> tensor<2xi32>
   %1 = "tf.Const"() {value = dense<[2, 8]> : tensor<2xi32>} : () -> tensor<2xi32>
   %2 = "tf.Reshape"(%arg0, %0) : (tensor<4x4xi32>, tensor<2xi32>) -> tensor<8x2xi32>
-  %3 = "tf.Reshape"(%2, %1) : (tensor<8x2xi32>, tensor<2xi32>) -> tensor<2x8xi32>
+  %3 = "tf.Reshape"(%2, %1) {device = "/job:localhost/replica:0/task:0/device:GPU:0"} : (tensor<8x2xi32>, tensor<2xi32>) -> tensor<2x8xi32>
   func.return %3: tensor<2x8xi32>
 
   // CHECK: %[[CONST:.*]] = "tf.Const"
   // CHECK-SAME: value = dense<[2, 8]> : tensor<2xi32>
-  // CHECK: %[[RES:.*]] = "tf.Reshape"(%arg0, %[[CONST]])
+  // CHECK: %[[RES:.*]] = "tf.Reshape"(%arg0, %[[CONST]]) {device = "/job:localhost/replica:0/task:0/device:GPU:0"}
   // CHECK: return %[[RES]] : tensor<2x8xi32>
 }
 
@@ -697,7 +695,7 @@ func.func @testStaticAndIdenticalTypeForNotEqualOp(%arg0: tensor<2xi32>, %arg1:
 func.func @testUnknownBroadcastForNotEqualOp(%arg0: tensor<?xi32>, %arg1: tensor<?xi32>) -> tensor<*xi1> {
   // CHECK:      "tf.NotEqual"(%arg0, %arg1)
   // CHECK-SAME:   incompatible_shape_error = false
-  %0 = "tf.NotEqual"(%arg0, %arg1) { incompatible_shape_error = false } : (tensor<?xi32>, tensor<?xi32>) -> tensor<*xi1>
+  %0 = "tf.NotEqual"(%arg0, %arg1) {incompatible_shape_error = false} : (tensor<?xi32>, tensor<?xi32>) -> tensor<*xi1>
   func.return %0: tensor<*xi1>
 }
 
@@ -705,7 +703,7 @@ func.func @testUnknownBroadcastForNotEqualOp(%arg0: tensor<?xi32>, %arg1: tensor
 func.func @testKnownGoodBroadcastForNotEqualOp(%arg0: tensor<1x?xi32>, %arg1: tensor<?x1xi32>) -> tensor<?x?xi1> {
   // CHECK:      "tf.NotEqual"(%arg0, %arg1)
   // CHECK-SAME:   incompatible_shape_error = true
-  %0 = "tf.NotEqual"(%arg0, %arg1) { incompatible_shape_error = false } : (tensor<1x?xi32>, tensor<?x1xi32>) -> tensor<?x?xi1>
+  %0 = "tf.NotEqual"(%arg0, %arg1) {incompatible_shape_error = false} : (tensor<1x?xi32>, tensor<?x1xi32>) -> tensor<?x?xi1>
   func.return %0: tensor<?x?xi1>
 }
 
@@ -713,7 +711,7 @@ func.func @testKnownGoodBroadcastForNotEqualOp(%arg0: tensor<1x?xi32>, %arg1: te
 func.func @testKnownBadBroadcastForNotEqualOp(%arg0: tensor<?xi32>, %arg1: tensor<?xi32>) -> tensor<i1> {
   // CHECK:      "tf.NotEqual"(%arg0, %arg1)
   // CHECK-SAME:   incompatible_shape_error = false
-  %0 = "tf.NotEqual"(%arg0, %arg1) { incompatible_shape_error = false } : (tensor<?xi32>, tensor<?xi32>) -> tensor<i1>
+  %0 = "tf.NotEqual"(%arg0, %arg1) { incompatible_shape_error = false} : (tensor<?xi32>, tensor<?xi32>) -> tensor<i1>
   func.return %0: tensor<i1>
 }
 
@@ -736,69 +734,69 @@ func.func @testUnrankedLHSForNotEqualOp(%arg0: tensor<*xi32>, %arg1: tensor<i32>
 // CHECK-LABEL: func @testScalarForNotEqualOp
 func.func @testScalarForNotEqualOp(%arg0: tensor<i32>, %arg1: tensor<i32>) -> tensor<i1> {
   // CHECK:      "tf.NotEqual"(%arg0, %arg1)
-  // CHECK-SAME:   incompatible_shape_error = true
-  %0 = "tf.NotEqual"(%arg0, %arg1) { incompatible_shape_error = false } : (tensor<i32>, tensor<i32>) -> tensor<i1>
+  // CHECK-SAME: incompatible_shape_error = true
+  %0 = "tf.NotEqual"(%arg0, %arg1) {incompatible_shape_error = false} : (tensor<i32>, tensor<i32>) -> tensor<i1>
   func.return %0: tensor<i1>
 }
 
 // CHECK-LABEL: testLogicalNotOfEqual
 func.func @testLogicalNotOfEqual(%arg0: tensor<8x16xf32>, %arg1: tensor<8x16xf32>) -> tensor<8x16xi1> {
   %0 = "tf.Equal"(%arg0, %arg1) : (tensor<8x16xf32>, tensor<8x16xf32>) -> tensor<8x16xi1>
-  %1 = "tf.LogicalNot"(%0) : (tensor<8x16xi1>) -> tensor<8x16xi1>
+  %1 = "tf.LogicalNot"(%0) {device = "/job:localhost/replica:0/task:0/device:GPU:0"} : (tensor<8x16xi1>) -> tensor<8x16xi1>
   func.return %1: tensor<8x16xi1>
 
-// CHECK: %[[NE:.*]] = "tf.NotEqual"(%arg0, %arg1) {incompatible_shape_error = true}
-// CHECK: return %[[NE]]
+  // CHECK: %[[NE:.*]] = "tf.NotEqual"(%arg0, %arg1) {device = "/job:localhost/replica:0/task:0/device:GPU:0", incompatible_shape_error = true}
+  // CHECK: return %[[NE]]
 }
 
 // CHECK-LABEL: testLogicalNotOfNotEqual
 func.func @testLogicalNotOfNotEqual(%arg0: tensor<8x16xf32>, %arg1: tensor<8x16xf32>) -> tensor<8x16xi1> {
   %0 = "tf.NotEqual"(%arg0, %arg1) : (tensor<8x16xf32>, tensor<8x16xf32>) -> tensor<8x16xi1>
-  %1 = "tf.LogicalNot"(%0) : (tensor<8x16xi1>) -> tensor<8x16xi1>
+  %1 = "tf.LogicalNot"(%0) {device = "/job:localhost/replica:0/task:0/device:GPU:0"} : (tensor<8x16xi1>) -> tensor<8x16xi1>
   func.return %1: tensor<8x16xi1>
 
-// CHECK: %[[NE:.*]] = "tf.Equal"(%arg0, %arg1) {incompatible_shape_error = true}
-// CHECK: return %[[NE]]
+  // CHECK: %[[NE:.*]] = "tf.Equal"(%arg0, %arg1) {device = "/job:localhost/replica:0/task:0/device:GPU:0", incompatible_shape_error = true}
+  // CHECK: return %[[NE]]
 }
 
 // CHECK-LABEL: testLogicalNotOfGreater
 func.func @testLogicalNotOfGreater(%arg0: tensor<8x16xf32>, %arg1: tensor<8x16xf32>) -> tensor<8x16xi1> {
   %0 = "tf.Greater"(%arg0, %arg1) : (tensor<8x16xf32>, tensor<8x16xf32>) -> tensor<8x16xi1>
-  %1 = "tf.LogicalNot"(%0) : (tensor<8x16xi1>) -> tensor<8x16xi1>
+  %1 = "tf.LogicalNot"(%0) {device = "/job:localhost/replica:0/task:0/device:GPU:0"} : (tensor<8x16xi1>) -> tensor<8x16xi1>
   func.return %1: tensor<8x16xi1>
 
-// CHECK: %0 = "tf.LessEqual"(%arg0, %arg1) : (tensor<8x16xf32>, tensor<8x16xf32>) -> tensor<8x16xi1>
-// CHECK: return %0
+  // CHECK: %0 = "tf.LessEqual"(%arg0, %arg1) {device = "/job:localhost/replica:0/task:0/device:GPU:0"} : (tensor<8x16xf32>, tensor<8x16xf32>) -> tensor<8x16xi1>
+  // CHECK: return %0
 }
 
 // CHECK-LABEL: testLogicalNotOfGreaterEqual
 func.func @testLogicalNotOfGreaterEqual(%arg0: tensor<8x16xf32>, %arg1: tensor<8x16xf32>) -> tensor<8x16xi1> {
   %0 = "tf.GreaterEqual"(%arg0, %arg1) : (tensor<8x16xf32>, tensor<8x16xf32>) -> tensor<8x16xi1>
-  %1 = "tf.LogicalNot"(%0) : (tensor<8x16xi1>) -> tensor<8x16xi1>
+  %1 = "tf.LogicalNot"(%0) {device = "/job:localhost/replica:0/task:0/device:GPU:0"} : (tensor<8x16xi1>) -> tensor<8x16xi1>
   func.return %1: tensor<8x16xi1>
 
-// CHECK: %0 = "tf.Less"(%arg0, %arg1) : (tensor<8x16xf32>, tensor<8x16xf32>) -> tensor<8x16xi1>
-// CHECK: return %0
+  // CHECK: %0 = "tf.Less"(%arg0, %arg1) {device = "/job:localhost/replica:0/task:0/device:GPU:0"} : (tensor<8x16xf32>, tensor<8x16xf32>) -> tensor<8x16xi1>
+  // CHECK: return %0
 }
 
 // CHECK-LABEL: testLogicalNotOfLess
 func.func @testLogicalNotOfLess(%arg0: tensor<8x16xf32>, %arg1: tensor<8x16xf32>) -> tensor<8x16xi1> {
   %0 = "tf.Less"(%arg0, %arg1) : (tensor<8x16xf32>, tensor<8x16xf32>) -> tensor<8x16xi1>
-  %1 = "tf.LogicalNot"(%0) : (tensor<8x16xi1>) -> tensor<8x16xi1>
+  %1 = "tf.LogicalNot"(%0) {device = "/job:localhost/replica:0/task:0/device:GPU:0"} : (tensor<8x16xi1>) -> tensor<8x16xi1>
   func.return %1: tensor<8x16xi1>
 
-// CHECK: %0 = "tf.GreaterEqual"(%arg0, %arg1) : (tensor<8x16xf32>, tensor<8x16xf32>) -> tensor<8x16xi1>
-// CHECK: return %0
+  // CHECK: %0 = "tf.GreaterEqual"(%arg0, %arg1) {device = "/job:localhost/replica:0/task:0/device:GPU:0"} : (tensor<8x16xf32>, tensor<8x16xf32>) -> tensor<8x16xi1>
+  // CHECK: return %0
 }
 
 // CHECK-LABEL: testLogicalNotOfLessEqual
 func.func @testLogicalNotOfLessEqual(%arg0: tensor<8x16xf32>, %arg1: tensor<8x16xf32>) -> tensor<8x16xi1> {
   %0 = "tf.LessEqual"(%arg0, %arg1) : (tensor<8x16xf32>, tensor<8x16xf32>) -> tensor<8x16xi1>
-  %1 = "tf.LogicalNot"(%0) : (tensor<8x16xi1>) -> tensor<8x16xi1>
+  %1 = "tf.LogicalNot"(%0) {device = "/job:localhost/replica:0/task:0/device:GPU:0"} : (tensor<8x16xi1>) -> tensor<8x16xi1>
   func.return %1: tensor<8x16xi1>
 
-// CHECK: %0 = "tf.Greater"(%arg0, %arg1) : (tensor<8x16xf32>, tensor<8x16xf32>) -> tensor<8x16xi1>
-// CHECK: return %0
+  // CHECK: %0 = "tf.Greater"(%arg0, %arg1) {device = "/job:localhost/replica:0/task:0/device:GPU:0"} : (tensor<8x16xf32>, tensor<8x16xf32>) -> tensor<8x16xi1>
+  // CHECK: return %0
 }
 
 // CHECK-LABEL: testSizeFolding
@@ -813,11 +811,11 @@ func.func @testSizeFolding(%arg0: tensor<3x5x7xf32>) -> tensor<i32> {
 // CHECK-LABEL: testDivWithSqrtDivisor
 func.func @testDivWithSqrtDivisor(%arg0: tensor<8x16xf32>, %arg1: tensor<8x16xf32>) -> tensor<8x16xf32> {
   %0 = "tf.Sqrt"(%arg1) : (tensor<8x16xf32>) -> tensor<8x16xf32>
-  %1 = "tf.Div"(%arg0, %0) : (tensor<8x16xf32>, tensor<8x16xf32>) -> tensor<8x16xf32>
+  %1 = "tf.Div"(%arg0, %0) {device = "/job:localhost/replica:0/task:0/device:GPU:0"} : (tensor<8x16xf32>, tensor<8x16xf32>) -> tensor<8x16xf32>
   func.return %1: tensor<8x16xf32>
 
-// CHECK: %0 = "tf.Rsqrt"(%arg1) : (tensor<8x16xf32>) -> tensor<8x16xf32>
-// CHECK: %1 = "tf.Mul"(%arg0, %0) : (tensor<8x16xf32>, tensor<8x16xf32>) -> tensor<8x16xf32>
+// CHECK: %0 = "tf.Rsqrt"(%arg1) {device = "/job:localhost/replica:0/task:0/device:GPU:0"} : (tensor<8x16xf32>) -> tensor<8x16xf32>
+// CHECK: %1 = "tf.Mul"(%arg0, %0) {device = "/job:localhost/replica:0/task:0/device:GPU:0"} : (tensor<8x16xf32>, tensor<8x16xf32>) -> tensor<8x16xf32>
 // CHECK: return %1
 }
 
@@ -847,22 +845,22 @@ func.func @testRealDivWithConstDivisor(%arg0: tensor<8x2xf32>) -> tensor<8x2xf32
 // CHECK-LABEL: testTruncateDivWithSqrtDivisor
 func.func @testTruncateDivWithSqrtDivisor(%arg0: tensor<8x16xf32>, %arg1: tensor<8x16xf32>) -> tensor<8x16xf32> {
   %0 = "tf.Sqrt"(%arg1) : (tensor<8x16xf32>) -> tensor<8x16xf32>
-  %1 = "tf.TruncateDiv"(%arg0, %0) : (tensor<8x16xf32>, tensor<8x16xf32>) -> tensor<8x16xf32>
+  %1 = "tf.TruncateDiv"(%arg0, %0) {device = "/job:localhost/replica:0/task:0/device:GPU:0"} : (tensor<8x16xf32>, tensor<8x16xf32>) -> tensor<8x16xf32>
   func.return %1: tensor<8x16xf32>
 
-// CHECK: %0 = "tf.Rsqrt"(%arg1) : (tensor<8x16xf32>) -> tensor<8x16xf32>
-// CHECK: %1 = "tf.Mul"(%arg0, %0) : (tensor<8x16xf32>, tensor<8x16xf32>) -> tensor<8x16xf32>
+// CHECK: %0 = "tf.Rsqrt"(%arg1) {device = "/job:localhost/replica:0/task:0/device:GPU:0"} : (tensor<8x16xf32>) -> tensor<8x16xf32>
+// CHECK: %1 = "tf.Mul"(%arg0, %0) {device = "/job:localhost/replica:0/task:0/device:GPU:0"} : (tensor<8x16xf32>, tensor<8x16xf32>) -> tensor<8x16xf32>
 // CHECK: return %1
 }
 
 // CHECK-LABEL: testXdivyWithSqrtDivisor
 func.func @testXdivyWithSqrtDivisor(%arg0: tensor<8x16xf32>, %arg1: tensor<8x16xf32>) -> tensor<8x16xf32> {
   %0 = "tf.Sqrt"(%arg1) : (tensor<8x16xf32>) -> tensor<8x16xf32>
-  %1 = "tf.Xdivy"(%arg0, %0) : (tensor<8x16xf32>, tensor<8x16xf32>) -> tensor<8x16xf32>
+  %1 = "tf.Xdivy"(%arg0, %0) {device = "/job:localhost/replica:0/task:0/device:GPU:0"} : (tensor<8x16xf32>, tensor<8x16xf32>) -> tensor<8x16xf32>
   func.return %1: tensor<8x16xf32>
 
-// CHECK: %0 = "tf.Rsqrt"(%arg1) : (tensor<8x16xf32>) -> tensor<8x16xf32>
-// CHECK: %1 = "tf.MulNoNan"(%0, %arg0) : (tensor<8x16xf32>, tensor<8x16xf32>) -> tensor<8x16xf32>
+// CHECK: %0 = "tf.Rsqrt"(%arg1) {device = "/job:localhost/replica:0/task:0/device:GPU:0"} : (tensor<8x16xf32>) -> tensor<8x16xf32>
+// CHECK: %1 = "tf.MulNoNan"(%0, %arg0) {device = "/job:localhost/replica:0/task:0/device:GPU:0"} : (tensor<8x16xf32>, tensor<8x16xf32>) -> tensor<8x16xf32>
 // CHECK: return %1
 }
 
@@ -1069,21 +1067,21 @@ func.func @ToBool_2DTensorZeroDim(%arg0: tensor<1x0xf32>) -> tensor<i1> {
 // CHECK-LABEL: testReadVariableOpOfCast
 func.func @testReadVariableOpOfCast(%arg0: tensor<!tf_type.resource<tensor<8x40xf32>>>) -> tensor<8x40xf32> {
   %0 = "tf.Cast"(%arg0) : (tensor<!tf_type.resource<tensor<8x40xf32>>>) -> tensor<*x!tf_type.resource>
-  %1 = "tf.ReadVariableOp"(%0) : (tensor<*x!tf_type.resource>) -> tensor<8x40xf32>
+  %1 = "tf.ReadVariableOp"(%0) {device = "/job:localhost/replica:0/task:0/device:GPU:0"} : (tensor<*x!tf_type.resource>) -> tensor<8x40xf32>
   func.return %1: tensor<8x40xf32>
 
-// CHECK: %0 = "tf.ReadVariableOp"(%arg0) : (tensor<!tf_type.resource<tensor<8x40xf32>>>) -> tensor<8x40xf32>
-// CHECK: return %0
+  // CHECK: %0 = "tf.ReadVariableOp"(%arg0) {device = "/job:localhost/replica:0/task:0/device:GPU:0"} : (tensor<!tf_type.resource<tensor<8x40xf32>>>) -> tensor<8x40xf32>
+  // CHECK: return %0
 }
 
 // CHECK-LABEL: testReadVariableOpOfCastWithTruncate
 func.func @testReadVariableOpOfCastWithTruncate(%arg0: tensor<!tf_type.resource<tensor<8x40xf32>>>) -> tensor<8x40xf32> {
   %0 = "tf.Cast"(%arg0) {Truncate = true} : (tensor<!tf_type.resource<tensor<8x40xf32>>>) -> tensor<*x!tf_type.resource>
-  %1 = "tf.ReadVariableOp"(%0) : (tensor<*x!tf_type.resource>) -> tensor<8x40xf32>
+  %1 = "tf.ReadVariableOp"(%0) {device = "/job:localhost/replica:0/task:0/device:GPU:0"} : (tensor<*x!tf_type.resource>) -> tensor<8x40xf32>
   func.return %1: tensor<8x40xf32>
 
-// CHECK: %0 = "tf.ReadVariableOp"(%arg0) : (tensor<!tf_type.resource<tensor<8x40xf32>>>) -> tensor<8x40xf32>
-// CHECK: return %0
+  // CHECK: %0 = "tf.ReadVariableOp"(%arg0) {device = "/job:localhost/replica:0/task:0/device:GPU:0"} : (tensor<!tf_type.resource<tensor<8x40xf32>>>) -> tensor<8x40xf32>
+  // CHECK: return %0
 }
 
 // CHECK-LABEL: testReadVariableOpOfCastMultiUse
@@ -1102,12 +1100,12 @@ func.func @testReadVariableOpOfCastMultiUse(%arg0: tensor<!tf_type.resource<tens
 // CHECK-LABEL: testMultiReadVariableOpsOfCast
 func.func @testMultiReadVariableOpsOfCast(%arg0: tensor<!tf_type.resource<tensor<f32>>>) -> (tensor<f32>, tensor<f32>) {
   %0 = "tf.Cast"(%arg0) {Truncate = false} : (tensor<!tf_type.resource<tensor<f32>>>) -> tensor<*x!tf_type.resource>
-  %1 = "tf.ReadVariableOp"(%0) : (tensor<*x!tf_type.resource>) -> tensor<f32>
-  %2 = "tf.ReadVariableOp"(%0) : (tensor<*x!tf_type.resource>) -> tensor<f32>
+  %1 = "tf.ReadVariableOp"(%0) {device = "/job:localhost/replica:0/task:0/device:GPU:0"} : (tensor<*x!tf_type.resource>) -> tensor<f32>
+  %2 = "tf.ReadVariableOp"(%0) {device = "/job:localhost/replica:0/task:0/device:GPU:0"} : (tensor<*x!tf_type.resource>) -> tensor<f32>
   func.return %1, %2: tensor<f32>, tensor<f32>
 
- // CHECK: %0 = "tf.ReadVariableOp"(%arg0) : (tensor<!tf_type.resource<tensor<f32>>>) -> tensor<f32>
- // CHECK: %1 = "tf.ReadVariableOp"(%arg0) : (tensor<!tf_type.resource<tensor<f32>>>) -> tensor<f32>
+ // CHECK: %0 = "tf.ReadVariableOp"(%arg0) {device = "/job:localhost/replica:0/task:0/device:GPU:0"} : (tensor<!tf_type.resource<tensor<f32>>>) -> tensor<f32>
+ // CHECK: %1 = "tf.ReadVariableOp"(%arg0) {device = "/job:localhost/replica:0/task:0/device:GPU:0"} : (tensor<!tf_type.resource<tensor<f32>>>) -> tensor<f32>
  // CHECK: return %0, %1
 }
 
@@ -1308,8 +1306,8 @@ func.func @sub(%arg0: tensor<*xf32>, %arg1: tensor<*xf32>) -> tensor<*xf32> {
 // CHECK-SAME: ([[INPUT:%.*]]: tensor<?x?x?x?xf32>, [[CROPS:%.*]]: tensor<?x?xi32>)
 func.func @testBatchToSpaceToBatchToSpaceND(%arg0: tensor<?x?x?x?xf32>, %arg1: tensor<?x?xi32>) -> tensor<*xf32> {
   // CHECK: [[BLOCK_SHAPE:%.*]] = "tf.Const"() {value = dense<8> : tensor<2xi64>}
-  // CHECK: [[BATCH_TO_SHAPE_ND:%.*]] = "tf.BatchToSpaceND"([[INPUT]], [[BLOCK_SHAPE]], [[CROPS]])
-  %0 = "tf.BatchToSpace"(%arg0, %arg1) {block_size = 8 : i64} : (tensor<?x?x?x?xf32>, tensor<?x?xi32>) -> tensor<*xf32>
+  // CHECK: [[BATCH_TO_SHAPE_ND:%.*]] = "tf.BatchToSpaceND"([[INPUT]], [[BLOCK_SHAPE]], [[CROPS]]) {device = "/job:localhost/replica:0/task:0/device:GPU:0"}
+  %0 = "tf.BatchToSpace"(%arg0, %arg1) {block_size = 8 : i64, device = "/job:localhost/replica:0/task:0/device:GPU:0"} : (tensor<?x?x?x?xf32>, tensor<?x?xi32>) -> tensor<*xf32>
   // CHECK: return [[BATCH_TO_SHAPE_ND]]
   func.return %0 : tensor<*xf32>
 }
@@ -1627,29 +1625,29 @@ func.func @testMatrixDiag(%diag: tensor<2x4xf32>) -> tensor<2x4x4xf32> {
   // CHECK-DAG: %[[MINUS1:.*]] = "tf.Const"() {value = dense<-1> : tensor<i32>} : () -> tensor<i32>
   // CHECK-DAG: %[[ZEROI:.*]] = "tf.Const"() {value = dense<0> : tensor<i32>} : () -> tensor<i32>
   // CHECK-DAG: %[[ZEROF:.*]] = "tf.Const"() {value = dense<0.000000e+00> : tensor<f32>} : () -> tensor<f32>
-  // CHECK-DAG: "tf.MatrixDiagV3"(%arg0, %[[ZEROI]], %[[MINUS1]], %[[MINUS1]], %[[ZEROF]]) {align = "RIGHT_LEFT"} : (tensor<2x4xf32>, tensor<i32>, tensor<i32>, tensor<i32>, tensor<f32>) -> tensor<2x4x4xf32>
-  %0 = "tf.MatrixDiag"(%diag) : (tensor<2x4xf32>) -> tensor<2x4x4xf32>
+  // CHECK-DAG: "tf.MatrixDiagV3"(%arg0, %[[ZEROI]], %[[MINUS1]], %[[MINUS1]], %[[ZEROF]]) {align = "RIGHT_LEFT", device = "/job:localhost/replica:0/task:0/device:GPU:0"} : (tensor<2x4xf32>, tensor<i32>, tensor<i32>, tensor<i32>, tensor<f32>) -> tensor<2x4x4xf32>
+  %0 = "tf.MatrixDiag"(%diag) {device = "/job:localhost/replica:0/task:0/device:GPU:0"} : (tensor<2x4xf32>) -> tensor<2x4x4xf32>
   func.return %0 : tensor<2x4x4xf32>
 }
 
 // CHECK-LABEL: @testMatrixSetDiag
 func.func @testMatrixSetDiag(%arg0: tensor<3x3xi64>, %arg1: tensor<3xi64>) -> tensor<3x3xi64> {
-  %0 = "tf.MatrixSetDiag"(%arg0, %arg1) : (tensor<3x3xi64>, tensor<3xi64>) -> tensor<3x3xi64>
+  %0 = "tf.MatrixSetDiag"(%arg0, %arg1) {device = "/job:localhost/replica:0/task:0/device:GPU:0"} : (tensor<3x3xi64>, tensor<3xi64>) -> tensor<3x3xi64>
   func.return %0 : tensor<3x3xi64>
 
   // CHECK: %[[ZERO:.*]] = "tf.Const"() {value = dense<0> : tensor<i32>}
   // CHECK: %[[RES:.*]] = "tf.MatrixSetDiagV3"(%arg0, %arg1, %[[ZERO]])
-  // CHECK-SAME: {align = "RIGHT_LEFT"}
+  // CHECK-SAME: {align = "RIGHT_LEFT", device = "/job:localhost/replica:0/task:0/device:GPU:0"}
   // CHECK-SAME: (tensor<3x3xi64>, tensor<3xi64>, tensor<i32>) -> tensor<3x3xi64>
 }
 
 // CHECK-LABEL: @testMatrixSetDiagV2
 func.func @testMatrixSetDiagV2(%arg0: tensor<3x3xi64>, %arg1: tensor<3xi64>, %arg2: tensor<i32>) -> tensor<3x3xi64> {
-  %0 = "tf.MatrixSetDiagV2"(%arg0, %arg1, %arg2) : (tensor<3x3xi64>, tensor<3xi64>, tensor<i32>) -> tensor<3x3xi64>
+  %0 = "tf.MatrixSetDiagV2"(%arg0, %arg1, %arg2) {device = "/job:localhost/replica:0/task:0/device:GPU:0"} : (tensor<3x3xi64>, tensor<3xi64>, tensor<i32>) -> tensor<3x3xi64>
   func.return %0 : tensor<3x3xi64>
 
   // CHECK: %[[RES:.*]] = "tf.MatrixSetDiagV3"(%arg0, %arg1, %arg2)
-  // CHECK-SAME: {align = "LEFT_LEFT"}
+  // CHECK-SAME: {align = "LEFT_LEFT", device = "/job:localhost/replica:0/task:0/device:GPU:0"}
 }
 
 // CHECK-LABEL: @testVariableToVariableV2
@@ -1657,8 +1655,8 @@ func.func @testVariableToVariableV2() {
   // CHECK-NOT: "tf.Variable"
 
   %0 = "tf.Const"() { value = dense<1> : tensor<i32> } : () -> tensor<i32>
-  // CHECK: "tf.VariableV2"
-  %1 = "tf.Variable"() {container = "", dtype = i32, shared_name = "var", shape = #tf_type.shape<>} : () -> tensor<!tf_type.int32ref>
+  // CHECK: "tf.VariableV2"() {container = "", device = "/job:localhost/replica:0/task:0/device:GPU:0", shape = #tf_type.shape<>, shared_name = "var"}
+  %1 = "tf.Variable"() {container = "", dtype = i32, shared_name = "var", shape = #tf_type.shape<>, device = "/job:localhost/replica:0/task:0/device:GPU:0"} : () -> tensor<!tf_type.int32ref>
   %2 = "tf.Assign"(%1, %0) : (tensor<!tf_type.int32ref>, tensor<i32>) -> (tensor<!tf_type.int32ref>)
 
   func.return
@@ -1957,9 +1955,9 @@ func.func @while_with_id_passthrough(%arg0: tensor<7xf32> {tf._user_specified_na
 
 // CHECK-LABEL: testConvertQuantizeAndDequantizeV2ToQuantizeAndDequantizeV4
 func.func @testConvertQuantizeAndDequantizeV2ToQuantizeAndDequantizeV4(%arg0 : tensor<?x?xf32>, %arg1 : tensor<f32>, %arg2 : tensor<f32>) -> tensor<?x?xf32> {
-  %0 = "tf.QuantizeAndDequantizeV2"(%arg0, %arg1, %arg2) : (tensor<?x?xf32>, tensor<f32>, tensor<f32>) -> tensor<?x?xf32>
+  %0 = "tf.QuantizeAndDequantizeV2"(%arg0, %arg1, %arg2) {device = "/job:localhost/replica:0/task:0/device:GPU:0"} : (tensor<?x?xf32>, tensor<f32>, tensor<f32>) -> tensor<?x?xf32>
   func.return %0 : tensor<?x?xf32>
-  // CHECK: %[[QUANT:.*]] = "tf.QuantizeAndDequantizeV4"(%arg0, %arg1, %arg2) {axis = -1 : i64, narrow_range = false, num_bits = 8 : i64, range_given = false, round_mode = "HALF_TO_EVEN", signed_input = true} : (tensor<?x?xf32>, tensor<f32>, tensor<f32>) -> tensor<?x?xf32>
+  // CHECK: %[[QUANT:.*]] = "tf.QuantizeAndDequantizeV4"(%arg0, %arg1, %arg2) {axis = -1 : i64, device = "/job:localhost/replica:0/task:0/device:GPU:0", narrow_range = false, num_bits = 8 : i64, range_given = false, round_mode = "HALF_TO_EVEN", signed_input = true} : (tensor<?x?xf32>, tensor<f32>, tensor<f32>) -> tensor<?x?xf32>
   // CHECK: return %[[QUANT]] : tensor<?x?xf32>
 }
 
@@ -1967,6 +1965,7 @@ func.func @testConvertQuantizeAndDequantizeV2ToQuantizeAndDequantizeV4(%arg0 : t
 func.func @testHashTableAndInitializeTableToV2(%arg0: tensor<!tf_type.string>) {
   // CHECK: [[handle:%.*]] = "tf.HashTableV2"()
   // CHECK-SAME: container = ""
+  // CHECK-SAME: device = ""
   // CHECK-SAME: key_dtype = !tf_type.string
   // CHECK-SAME: shared_name = "table"
   // CHECK-SAME: value_dtype = i32
@@ -1974,6 +1973,7 @@ func.func @testHashTableAndInitializeTableToV2(%arg0: tensor<!tf_type.string>) {
   %handle = "tf.HashTable"() {container = "", device = "", shared_name = "table", key_dtype = !tf_type.string, value_dtype = i32} : () -> tensor<*x!tf_type.stringref>
 
   // CHECK: "tf.InitializeTableFromTextFileV2"([[handle]]
+  // CHECK-SAME: device = ""
   "tf.InitializeTableFromTextFile"(%handle, %arg0) {device = "", key_index=1, value_index=1, delimiter="\t"} : (tensor<*x!tf_type.stringref>, tensor<!tf_type.string>) -> ()
   func.return
 }
@@ -1982,6 +1982,7 @@ func.func @testHashTableAndInitializeTableToV2(%arg0: tensor<!tf_type.string>) {
 func.func @testHashTableAndLookupTableSizeToV2() -> tensor<i64> {
   // CHECK: [[handle:%.*]] = "tf.HashTableV2"()
   // CHECK-SAME: container = ""
+  // CHECK-SAME: device = ""
   // CHECK-SAME: key_dtype = !tf_type.string
   // CHECK-SAME: shared_name = "table"
   // CHECK-SAME: value_dtype = i32
@@ -1989,7 +1990,8 @@ func.func @testHashTableAndLookupTableSizeToV2() -> tensor<i64> {
   %handle = "tf.HashTable"() {container = "", device = "", shared_name = "table", key_dtype = !tf_type.string, value_dtype = i32} : () -> tensor<*x!tf_type.stringref>
 
   // CHECK: "tf.LookupTableSizeV2"([[handle]]
-  %0 = "tf.LookupTableSize"(%handle) {} : (tensor<*x!tf_type.stringref>) -> tensor<i64>
+  // CHECK-SAME: device = ""
+  %0 = "tf.LookupTableSize"(%handle) {device = ""} : (tensor<*x!tf_type.stringref>) -> tensor<i64>
   func.return %0 : tensor<i64>
 }
 
@@ -1997,6 +1999,7 @@ func.func @testHashTableAndLookupTableSizeToV2() -> tensor<i64> {
 func.func @testHashTableAndLookupTableFindToV2(%arg0: tensor<!tf_type.string>, %arg1: tensor<i32>) -> tensor<i32> {
   // CHECK: [[handle:%.*]] = "tf.HashTableV2"()
   // CHECK-SAME: container = ""
+  // CHECK-SAME: device = ""
   // CHECK-SAME: key_dtype = !tf_type.string
   // CHECK-SAME: shared_name = "table"
   // CHECK-SAME: value_dtype = i32
@@ -2004,7 +2007,8 @@ func.func @testHashTableAndLookupTableFindToV2(%arg0: tensor<!tf_type.string>, %
   %handle = "tf.HashTable"() {container = "", device = "", shared_name = "table", key_dtype = !tf_type.string, value_dtype = i32} : () -> tensor<*x!tf_type.stringref>
 
   // CHECK: "tf.LookupTableFindV2"([[handle]]
-  %0 = "tf.LookupTableFind"(%handle, %arg0, %arg1) {} : (tensor<*x!tf_type.stringref>, tensor<!tf_type.string>, tensor<i32>) -> tensor<i32>
+  // CHECK-SAME: device = ""
+  %0 = "tf.LookupTableFind"(%handle, %arg0, %arg1) {device = ""} : (tensor<*x!tf_type.stringref>, tensor<!tf_type.string>, tensor<i32>) -> tensor<i32>
   func.return %0 : tensor<i32>
 }
 
@@ -2129,10 +2133,10 @@ func.func private @sum2(%arg0: tensor<*xf32>, %arg1: tensor<*xf32>) -> tensor<*x
 
 // CHECK-LABEL: testMaximumOfZeroToReluFloat
 func.func @testMaximumOfZeroToReluFloat(%arg0: tensor<4xf32>) -> tensor<4xf32> {
-  // CHECK: %0 = "tf.Relu"(%arg0) : (tensor<4xf32>) -> tensor<4xf32>
+  // CHECK: %0 = "tf.Relu"(%arg0) {device = "/job:localhost/replica:0/task:0/device:GPU:0"} : (tensor<4xf32>) -> tensor<4xf32>
   // CHECK: return %0
   %cst_0 = arith.constant dense<0.000000e+00> : tensor<f32>
-  %0 = "tf.Maximum"(%arg0, %cst_0) : (tensor<4xf32>, tensor<f32>) -> tensor<4xf32>
+  %0 = "tf.Maximum"(%arg0, %cst_0) {device = "/job:localhost/replica:0/task:0/device:GPU:0"} : (tensor<4xf32>, tensor<f32>) -> tensor<4xf32>
   func.return %0 : tensor<4xf32>
 }
 
@@ -2157,7 +2161,7 @@ func.func @testMaximumOfZeroToReluInt32OnGpu(%arg0: tensor<4xi32>) -> tensor<4xi
 
 // CHECK-LABEL: testMaximumOfZeroToReluInt32OnCpu
 func.func @testMaximumOfZeroToReluInt32OnCpu(%arg0: tensor<4xi32>) -> tensor<4xi32> {
-  // CHECK: %[[RESULT:.*]] = "tf.Relu"(%arg0) : (tensor<4xi32>) -> tensor<4xi32>
+  // CHECK: %[[RESULT:.*]] = "tf.Relu"(%arg0) {device = "/job:localhost/replica:0/task:0/device:CPU:0"} : (tensor<4xi32>) -> tensor<4xi32>
   // CHECK: return %[[RESULT]]
   %cst_0 = arith.constant dense<0> : tensor<i32>
   %0 = "tf.Maximum"(%arg0, %cst_0) {device = "/job:localhost/replica:0/task:0/device:CPU:0", dtype = i32} : (tensor<4xi32>, tensor<i32>) -> tensor<4xi32>
@@ -2166,7 +2170,7 @@ func.func @testMaximumOfZeroToReluInt32OnCpu(%arg0: tensor<4xi32>) -> tensor<4xi
 
 // CHECK-LABEL: testMaximumOfZeroToReluInt64OnGpu
 func.func @testMaximumOfZeroToReluInt64OnGpu(%arg0: tensor<4xi64>) -> tensor<4xi64> {
-  // CHECK: %[[RESULT:.*]] = "tf.Relu"(%arg0) : (tensor<4xi64>) -> tensor<4xi64>
+  // CHECK: %[[RESULT:.*]] = "tf.Relu"(%arg0) {device = "/job:localhost/replica:0/task:0/device:GPU:0"} : (tensor<4xi64>) -> tensor<4xi64>
   // CHECK: return %[[RESULT]]
   %cst_0 = arith.constant dense<0> : tensor<i64>
   %0 = "tf.Maximum"(%arg0, %cst_0) {device = "/job:localhost/replica:0/task:0/device:GPU:0"} : (tensor<4xi64>, tensor<i64>) -> tensor<4xi64>
@@ -2175,21 +2179,21 @@ func.func @testMaximumOfZeroToReluInt64OnGpu(%arg0: tensor<4xi64>) -> tensor<4xi
 
 // CHECK-LABEL: testReluOfMinimum6ToRelu6Float
 func.func @testReluOfMinimum6ToRelu6Float(%arg0: tensor<4xf32>) -> tensor<4xf32> {
-  // CHECK: %0 = "tf.Relu6"(%arg0) : (tensor<4xf32>) -> tensor<4xf32>
+  // CHECK: %0 = "tf.Relu6"(%arg0) {device = "/job:localhost/replica:0/task:0/device:GPU:0"} : (tensor<4xf32>) -> tensor<4xf32>
   // CHECK: return %0
   %cst_6 = arith.constant dense<6.000000e+00> : tensor<f32>
   %0 = "tf.Minimum"(%arg0, %cst_6) : (tensor<4xf32>, tensor<f32>) -> tensor<4xf32>
-  %1 = "tf.Relu"(%0) : (tensor<4xf32>) -> tensor<4xf32>
+  %1 = "tf.Relu"(%0) {device = "/job:localhost/replica:0/task:0/device:GPU:0"} : (tensor<4xf32>) -> tensor<4xf32>
   func.return %1 : tensor<4xf32>
 }
 
 // CHECK-LABEL: testReluOfMinimum6ToRelu6Int
 func.func @testReluOfMinimum6ToRelu6Int(%arg0: tensor<4xi32>) -> tensor<4xi32> {
-  // CHECK: %0 = "tf.Relu6"(%arg0) : (tensor<4xi32>) -> tensor<4xi32>
+  // CHECK: %0 = "tf.Relu6"(%arg0) {device = "/job:localhost/replica:0/task:0/device:GPU:0"} : (tensor<4xi32>) -> tensor<4xi32>
   // CHECK: return %0
   %cst_6 = arith.constant dense<6> : tensor<i32>
   %0 = "tf.Minimum"(%arg0, %cst_6) : (tensor<4xi32>, tensor<i32>) -> tensor<4xi32>
-  %1 = "tf.Relu"(%0) : (tensor<4xi32>) -> tensor<4xi32>
+  %1 = "tf.Relu"(%0) {device = "/job:localhost/replica:0/task:0/device:GPU:0"} : (tensor<4xi32>) -> tensor<4xi32>
   func.return %1 : tensor<4xi32>
 }
 
diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/BUILD b/tensorflow/compiler/mlir/tensorflow/transforms/BUILD
index d34d397cb815a5..117e475e431bdf 100644
--- a/tensorflow/compiler/mlir/tensorflow/transforms/BUILD
+++ b/tensorflow/compiler/mlir/tensorflow/transforms/BUILD
@@ -144,6 +144,7 @@ cc_library(
         "rewrite_util.h",
     ],
     deps = [
+        "//tensorflow/compiler/mlir/tensorflow:attribute_utils",
         "//tensorflow/core:framework",
         "@llvm-project//mlir:IR",
     ],
diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/canonicalize.td b/tensorflow/compiler/mlir/tensorflow/transforms/canonicalize.td
index 49261e9122e5e6..4525bb7ce7db81 100644
--- a/tensorflow/compiler/mlir/tensorflow/transforms/canonicalize.td
+++ b/tensorflow/compiler/mlir/tensorflow/transforms/canonicalize.td
@@ -35,24 +35,27 @@ def HasOnlyReadVariableOpUsers : Constraint<
   CPred<"llvm::all_of($0.getUsers(), [](mlir::OpOperand op) { "
         "return llvm::isa<mlir::TF::ReadVariableOp>(op.getOwner()); })">>;
 
-def SetAttrs: NativeCodeCall<"CopyAttributes($0.getOwner(), $1)">;
+def CopyAttrs: NativeCodeCallVoid<"CopyDeviceAndUnderscoredAttributesAdaptor($0, $1)">;
 
 //===----------------------------------------------------------------------===//
 // Add op patterns.
 //===----------------------------------------------------------------------===//
 
-def AddToAddV2 : Pat<(TF_AddOp TF_NumberTensor:$arg0, TF_NumberTensor:$arg1),
-                     (TF_AddV2Op $arg0, $arg1)>;
+def AddToAddV2 : Pat<
+  (TF_AddOp:$src TF_NumberTensor:$arg0, TF_NumberTensor:$arg1),
+  (TF_AddV2Op:$dest $arg0, $arg1), [], [(CopyAttrs $src, $dest)]>;
 
 //===----------------------------------------------------------------------===//
 // AddV2 op patterns.
 //===----------------------------------------------------------------------===//
 
-def AddV2OfNegLeft : Pat<(TF_AddV2Op (TF_NegOp $arg0), $arg1),
-                         (TF_SubOp $arg1, $arg0)>;
+def AddV2OfNegLeft : Pat<
+  (TF_AddV2Op:$src (TF_NegOp $arg0), $arg1), (TF_SubOp:$dest $arg1, $arg0),
+  [], [(CopyAttrs $src, $dest)]>;
 
-def AddV2OfNegRight : Pat<(TF_AddV2Op $arg0, (TF_NegOp $arg1)),
-                          (TF_SubOp $arg0, $arg1)>;
+def AddV2OfNegRight : Pat<
+  (TF_AddV2Op:$src $arg0, (TF_NegOp $arg1)), (TF_SubOp:$dest $arg0, $arg1), [],
+  [(CopyAttrs $src, $dest)]>;
 
 //===----------------------------------------------------------------------===//
 // Gather op patterns.
@@ -67,10 +70,11 @@ class GetI64Attr<int x>: NativeCodeCall<
 // The deprecated validate_indices attribute is NOT verified during
 // canonicalization because it is neither supported in the old bridge
 // nor in the reference TF implementation.
-def GatherToV2 :
-  Pat<(TF_GatherOp $params, $indices, $ignored_validate_indices),
-      (TF_GatherV2Op $params, $indices,
-        (TF_ConstOp (GetI32Attr<0>)), (GetI64Attr<0>))>;
+def GatherToV2 : Pat<
+  (TF_GatherOp:$src $params, $indices, $ignored_validate_indices),
+  (TF_GatherV2Op:$dest $params, $indices, (TF_ConstOp (GetI32Attr<0>)),
+    (GetI64Attr<0>)),
+  [], [(CopyAttrs $src, $dest)]>;
 
 //===----------------------------------------------------------------------===//
 // BatchMatMul op patterns.
@@ -81,22 +85,27 @@ def GatherToV2 :
 // dynamically shaped operands is not correct as that will execute ops that
 // have non matching batch dimensions but are broadcastable which should fail
 // with V1.
-def BatchMatMulToV2 :
-  Pat<(TF_BatchMatMulOp AnyStaticShapeTensor:$x, AnyStaticShapeTensor:$y,
-                        $adj_x, $adj_y),
-      (TF_BatchMatMulV2Op $x, $y, $adj_x, $adj_y)>;
+def BatchMatMulToV2 : Pat<
+  (TF_BatchMatMulOp:$src AnyStaticShapeTensor:$x, AnyStaticShapeTensor:$y,
+    $adj_x, $adj_y),
+  (TF_BatchMatMulV2Op:$dest $x, $y, $adj_x, $adj_y),
+  [], [(CopyAttrs $src, $dest)]>;
 
-def BatchMatMulToMatMul : Pat<(TF_BatchMatMulOp $x, $y, $adj_x, $adj_y),
-                              (TF_MatMulOp $x, $y, $adj_x, $adj_y),
-                              [(IsRank2Tensor $x), (IsRank2Tensor $y)]>;
+def BatchMatMulToMatMul : Pat<
+  (TF_BatchMatMulOp:$src $x, $y, $adj_x, $adj_y),
+  (TF_MatMulOp:$dest $x, $y, $adj_x, $adj_y),
+  [(IsRank2Tensor $x), (IsRank2Tensor $y)],
+  [(CopyAttrs $src, $dest)]>;
 
 //===----------------------------------------------------------------------===//
 // BatchMatMulV2 op patterns.
 //===----------------------------------------------------------------------===//
 
-def BatchMatMulV2ToMatMul : Pat<(TF_BatchMatMulV2Op $x, $y, $adj_x, $adj_y),
-                              (TF_MatMulOp $x, $y, $adj_x, $adj_y),
-                              [(IsRank2Tensor $x), (IsRank2Tensor $y)]>;
+def BatchMatMulV2ToMatMul : Pat<
+  (TF_BatchMatMulV2Op:$src $x, $y, $adj_x, $adj_y),
+  (TF_MatMulOp:$dest $x, $y, $adj_x, $adj_y),
+  [(IsRank2Tensor $x), (IsRank2Tensor $y)],
+  [(CopyAttrs $src, $dest)]>;
 
 //===----------------------------------------------------------------------===//
 // BatchToSpace op patterns.
@@ -106,51 +115,63 @@ def BatchToSpaceBlockSizeToBlockShape : NativeCodeCall<
   "DenseElementsAttr::get(RankedTensorType::get({2}, $_builder.getI64Type()), "
   "ArrayRef<APInt>{$0.getValue(), $0.getValue()})">;
 
-def BatchToSpaceToBatchToSpaceND :
-    Pat<(TF_BatchToSpaceOp $input, $crops, $block_size),
-        (TF_BatchToSpaceNDOp $input,
-         (TF_ConstOp (BatchToSpaceBlockSizeToBlockShape $block_size)),
-         $crops),
-        [(IsRank4Tensor $input), (IsRank2Tensor $crops)]>;
+def BatchToSpaceToBatchToSpaceND : Pat<
+  (TF_BatchToSpaceOp:$src $input, $crops, $block_size),
+  (TF_BatchToSpaceNDOp:$dest $input,
+    (TF_ConstOp (BatchToSpaceBlockSizeToBlockShape $block_size)), $crops),
+  [(IsRank4Tensor $input), (IsRank2Tensor $crops)], [(CopyAttrs $src, $dest)]>;
 
 //===----------------------------------------------------------------------===//
 // BiasAddV1 op patterns.
 //===----------------------------------------------------------------------===//
 
-def BiasAddV1ToBiasAdd : Pat<(TF_BiasAddV1Op $arg0, $arg1),
-  (TF_BiasAddOp $arg0, $arg1, ConstantAttr<TF_ConvnetDataFormatAttr, "\"NHWC\"">)>;
+def BiasAddV1ToBiasAdd : Pat<
+  (TF_BiasAddV1Op:$src $arg0, $arg1),
+  (TF_BiasAddOp:$dest $arg0, $arg1,
+    ConstantAttr<TF_ConvnetDataFormatAttr, "\"NHWC\"">),
+  [], [(CopyAttrs $src, $dest)]>;
 
 //===----------------------------------------------------------------------===//
 // Bitcast op patterns.
 //===----------------------------------------------------------------------===//
 
-def BitcastSameType : Pat<(TF_BitcastOp:$res $arg), (replaceWithValue $arg),
-                          [(SingleResultAndOperandHaveSameElementType $res,
-                                                                      $arg)]>;
+// Parameter attributes are part of the function attributes, which are not
+// affected by pattern rewrites. Therefore we do not need to explicitly copy
+// attributes over.
+def BitcastSameType : Pat<
+  (TF_BitcastOp:$res $arg),
+  (replaceWithValue $arg),
+  [(SingleResultAndOperandHaveSameElementType $res, $arg)]>;
 
-def BitcastNested : Pat<(TF_BitcastOp (TF_BitcastOp $arg)),
-                        (TF_BitcastOp $arg)>;
+def BitcastNested : Pat<
+  (TF_BitcastOp:$src (TF_BitcastOp $arg)), (TF_BitcastOp:$dest $arg), [],
+  [(CopyAttrs $src, $dest)]>;
 
 //===----------------------------------------------------------------------===//
 // Concat op patterns.
 //===----------------------------------------------------------------------===//
 
-def ConvertToConcatV2 : Pat<(TF_ConcatOp $axis, $inputs),
-                            (TF_ConcatV2Op $inputs, $axis)>;
+def ConvertToConcatV2 : Pat<
+  (TF_ConcatOp:$src $axis, $inputs), (TF_ConcatV2Op:$dest $inputs, $axis),
+  [], [(CopyAttrs $src, $dest)]>;
 
 //===----------------------------------------------------------------------===//
 // Div op patterns.
 //===----------------------------------------------------------------------===//
 
 /// Favor Mul over Div.
-def DivWithSqrtDivisor : Pat<(TF_DivOp $arg0, (TF_SqrtOp $arg1)),
-                             (TF_MulOp $arg0, (TF_RsqrtOp $arg1))>;
+def DivWithSqrtDivisor : Pat<
+  (TF_DivOp:$src $arg0, (TF_SqrtOp $arg1)),
+  (TF_MulOp:$dest1 $arg0, (TF_RsqrtOp:$dest2 $arg1)), [],
+  [(CopyAttrs $src, $dest1), (CopyAttrs $src, $dest2)]>;
 
 //===----------------------------------------------------------------------===//
 // Log op patterns.
 //===----------------------------------------------------------------------===//
 
-def LogOfSoftmax : Pat<(TF_LogOp (TF_SoftmaxOp $arg)), (TF_LogSoftmaxOp $arg)>;
+def LogOfSoftmax : Pat<
+  (TF_LogOp:$src (TF_SoftmaxOp $arg)), (TF_LogSoftmaxOp:$dest $arg), [],
+  [(CopyAttrs $src, $dest)]>;
 
 // Canonicalize: Log(1.0 + x) to Log1p(x)
 //
@@ -159,35 +180,38 @@ def LogOfSoftmax : Pat<(TF_LogOp (TF_SoftmaxOp $arg)), (TF_LogSoftmaxOp $arg)>;
 // constant values is not a scalar, we have to first prove that it is
 // broadcastable to `x`, which requires static shape information.
 def LogToLog1p : Pat<
-    (TF_LogOp (TF_AddV2Op
-                $arg,
-                (TF_ConstOp ConstantAttr<RankedF32ElementsAttr<[]>, "1.0f">))),
-    (TF_Log1pOp $arg)>;
+  (TF_LogOp:$src (TF_AddV2Op $arg,
+    (TF_ConstOp ConstantAttr<RankedF32ElementsAttr<[]>, "1.0f">))),
+  (TF_Log1pOp:$dest $arg), [], [(CopyAttrs $src, $dest)]>;
 
 //===----------------------------------------------------------------------===//
 // LogicalNot op patterns.
 //===----------------------------------------------------------------------===//
 
 def LogicalNotOfEqual : Pat<
-    (TF_LogicalNotOp (TF_EqualOp $arg0, $arg1, $shape_error)),
-    (TF_NotEqualOp $arg0, $arg1, $shape_error)>;
+  (TF_LogicalNotOp:$src (TF_EqualOp $arg0, $arg1, $shape_error)),
+  (TF_NotEqualOp:$dest $arg0, $arg1, $shape_error), [],
+  [(CopyAttrs $src, $dest)]>;
 
 def LogicalNotOfNotEqual : Pat<
-    (TF_LogicalNotOp (TF_NotEqualOp $arg0, $arg1, $shape_error)),
-    (TF_EqualOp $arg0, $arg1, $shape_error)>;
+  (TF_LogicalNotOp:$src (TF_NotEqualOp $arg0, $arg1, $shape_error)),
+  (TF_EqualOp:$dest $arg0, $arg1, $shape_error), [], [(CopyAttrs $src, $dest)]>;
 
-def LogicalNotOfGreater : Pat<(TF_LogicalNotOp (TF_GreaterOp $arg0, $arg1)),
-                              (TF_LessEqualOp $arg0, $arg1)>;
+def LogicalNotOfGreater : Pat<
+  (TF_LogicalNotOp:$src (TF_GreaterOp $arg0, $arg1)),
+  (TF_LessEqualOp:$dest $arg0, $arg1), [], [(CopyAttrs $src, $dest)]>;
 
-def LogicalNotOfGreaterEqual : Pat<(TF_LogicalNotOp (TF_GreaterEqualOp $arg0,
-                                                                       $arg1)),
-                                   (TF_LessOp $arg0, $arg1)>;
+def LogicalNotOfGreaterEqual : Pat<
+  (TF_LogicalNotOp:$src (TF_GreaterEqualOp $arg0, $arg1)),
+  (TF_LessOp:$dest $arg0, $arg1), [], [(CopyAttrs $src, $dest)]>;
 
-def LogicalNotOfLess : Pat<(TF_LogicalNotOp (TF_LessOp $arg0, $arg1)),
-                           (TF_GreaterEqualOp $arg0, $arg1)>;
+def LogicalNotOfLess : Pat<(
+  TF_LogicalNotOp:$src (TF_LessOp $arg0, $arg1)),
+  (TF_GreaterEqualOp:$dest $arg0, $arg1), [], [(CopyAttrs $src, $dest)]>;
 
-def LogicalNotOfLessEqual : Pat<(TF_LogicalNotOp (TF_LessEqualOp $arg0, $arg1)),
-                                (TF_GreaterOp $arg0, $arg1)>;
+def LogicalNotOfLessEqual : Pat<
+  (TF_LogicalNotOp:$src (TF_LessEqualOp $arg0, $arg1)),
+  (TF_GreaterOp:$dest $arg0, $arg1), [], [(CopyAttrs $src, $dest)]>;
 
 //===----------------------------------------------------------------------===//
 // MatrixSetDiag op patterns.
@@ -196,62 +220,68 @@ def LogicalNotOfLessEqual : Pat<(TF_LogicalNotOp (TF_LessEqualOp $arg0, $arg1)),
 class GetStrAttr<string x>: NativeCodeCall<
   "$_builder.getStringAttr(\"" # x # "\")">;
 
-def MatrixSetDiagToV3 : Pat<(TF_MatrixSetDiagOp $input, $diag),
-                            (TF_MatrixSetDiagV3Op $input, $diag,
-                              (TF_ConstOp (GetI32Attr<0>)),
-                              (GetStrAttr<"RIGHT_LEFT">))>;
+def MatrixSetDiagToV3 : Pat<
+  (TF_MatrixSetDiagOp:$src $input, $diag),
+  (TF_MatrixSetDiagV3Op:$dest $input, $diag, (TF_ConstOp (GetI32Attr<0>)),
+    (GetStrAttr<"RIGHT_LEFT">)),
+  [], [(CopyAttrs $src, $dest)]>;
 
 // MatrixSetDiagToV2 op implicitly used LEFT_LEFT alignment.
-def MatrixSetDiagV2ToV3 : Pat<(TF_MatrixSetDiagV2Op $input, $diag, $k),
-                              (TF_MatrixSetDiagV3Op $input, $diag, $k,
-                                (GetStrAttr<"LEFT_LEFT">))>;
+def MatrixSetDiagV2ToV3 : Pat<
+  (TF_MatrixSetDiagV2Op:$src $input, $diag, $k),
+  (TF_MatrixSetDiagV3Op:$dest $input, $diag, $k, (GetStrAttr<"LEFT_LEFT">)),
+  [], [(CopyAttrs $src, $dest)]>;
 
 //===----------------------------------------------------------------------===//
 // MatrixDiag Op.
 //===----------------------------------------------------------------------===//
 
-def MatrixDiagToV3 : Pat<(TF_MatrixDiagOp $diag),
-                         (TF_MatrixDiagV3Op $diag,
-                           (TF_ConstOp (GetI32Attr<0>)),
-                           (TF_ConstOp (GetI32Attr<-1>)),
-                           (TF_ConstOp (GetI32Attr<-1>)),
-                           (TF_ConstOp (GetScalarOfType<0> $diag)),
-                           (GetStrAttr<"RIGHT_LEFT">))>;
+def MatrixDiagToV3 : Pat<
+  (TF_MatrixDiagOp:$src $diag),
+  (TF_MatrixDiagV3Op:$dest $diag, (TF_ConstOp (GetI32Attr<0>)),
+    (TF_ConstOp (GetI32Attr<-1>)), (TF_ConstOp (GetI32Attr<-1>)),
+    (TF_ConstOp (GetScalarOfType<0> $diag)), (GetStrAttr<"RIGHT_LEFT">)),
+  [], [(CopyAttrs $src, $dest)]>;
 
 //===----------------------------------------------------------------------===//
 // QuantizeAndDequantizeV2 op patterns.
 //===----------------------------------------------------------------------===//
 
 def QuantizeAndDequantizeV2ToQuantizeAndDequantizeV4 : Pat<
-  (TF_QuantizeAndDequantizeV2Op $inputs, $min, $max, $signed_input, $num_bits,
-    $range_given, $round_mode, $narrow_range, $axis),
-  (TF_QuantizeAndDequantizeV4Op $inputs, $min, $max, $signed_input, $num_bits,
-    $range_given, $round_mode, $narrow_range, $axis)>;
+  (TF_QuantizeAndDequantizeV2Op:$src $inputs, $min, $max, $signed_input,
+    $num_bits, $range_given, $round_mode, $narrow_range, $axis),
+  (TF_QuantizeAndDequantizeV4Op:$dest $inputs, $min, $max, $signed_input,
+    $num_bits, $range_given, $round_mode, $narrow_range, $axis),
+  [], [(CopyAttrs $src, $dest)]>;
 
 //===----------------------------------------------------------------------===//
 // RealDiv op patterns.
 //===----------------------------------------------------------------------===//
 
-def RealDivWithSqrtDivisor : Pat<(TF_RealDivOp:$src $arg0, (TF_SqrtOp $arg1)),
-                                 (SetAttrs $src, (TF_MulOp $arg0, (SetAttrs $src, (TF_RsqrtOp $arg1))))>;
+def RealDivWithSqrtDivisor : Pat<
+  (TF_RealDivOp:$src $arg0, (TF_SqrtOp $arg1)),
+    (TF_MulOp:$dest1 $arg0, (TF_RsqrtOp:$dest2 $arg1)),
+  [], [(CopyAttrs $src, $dest1), (CopyAttrs $src, $dest2)]>;
 
 // Replace division by a constant with a multiplication by a reciprocal of that
 // constant. Floating point division can be ~10x more expensive than a
 // multiplication.
 def RealDivWithConstDivisor : Pat<
   (TF_RealDivOp:$src $arg0, (TF_ConstOp FloatElementsAttr<32>:$value)),
-  (SetAttrs $src, (TF_MulOp $arg0, (SetAttrs $src, (TF_ReciprocalOp (TF_ConstOp $value)))))>;
+  (TF_MulOp:$dest1 $arg0, (TF_ReciprocalOp:$dest2 (TF_ConstOp $value))),
+  [], [(CopyAttrs $src, $dest1), (CopyAttrs $src, $dest2)]>;
 
 //===----------------------------------------------------------------------===//
 // Reshape op patterns.
 //===----------------------------------------------------------------------===//
 
-def RedundantReshape : Pat<(TF_ReshapeOp (TF_ReshapeOp $arg, $unused), $shape),
-                           (TF_ReshapeOp $arg, $shape)>;
+def RedundantReshape : Pat<
+  (TF_ReshapeOp:$src (TF_ReshapeOp $arg, $unused), $shape),
+  (TF_ReshapeOp:$dest $arg, $shape), [], [(CopyAttrs $src, $dest)]>;
 
-def ReshapeToSelfShape : Pat<(TF_ReshapeOp:$op $x, (TF_ShapeOp $x)),
-                         (replaceWithValue $x),
-                         [(SingleResultAndOperandHaveSameType $x, $op)]>;
+def ReshapeToSelfShape : Pat<
+  (TF_ReshapeOp:$op $x, (TF_ShapeOp $x)), (replaceWithValue $x),
+  [(SingleResultAndOperandHaveSameType $x, $op)]>;
 
 //===----------------------------------------------------------------------===//
 // Square op patterns.
@@ -261,76 +291,82 @@ def ReshapeToSelfShape : Pat<(TF_ReshapeOp:$op $x, (TF_ShapeOp $x)),
 def TF_NumberTensorNoSmallInt: TensorOf<[TF_Bfloat16, TF_Complex128, TF_Complex64,
     TF_Float16, TF_Float32, TF_Int32, TF_Int64]>;
 
-def SquareOfSub : Pat<(TF_SquareOp (TF_SubOp TF_NumberTensorNoSmallInt:$arg0,
-                                    TF_NumberTensorNoSmallInt:$arg1)),
-                      (TF_SquaredDifferenceOp $arg0, $arg1)>;
+def SquareOfSub : Pat<
+  (TF_SquareOp:$src (TF_SubOp TF_NumberTensorNoSmallInt:$arg0,
+    TF_NumberTensorNoSmallInt:$arg1)),
+  (TF_SquaredDifferenceOp:$dest $arg0, $arg1), [], [(CopyAttrs $src, $dest)]>;
 
 //===----------------------------------------------------------------------===//
 // Sub op patterns.
 //===----------------------------------------------------------------------===//
 
-def SubOfNeg : Pat<(TF_SubOp $arg0, (TF_NegOp $arg1)),
-                   (TF_AddV2Op $arg0, $arg1)>;
+def SubOfNeg : Pat<
+  (TF_SubOp:$src $arg0, (TF_NegOp $arg1)), (TF_AddV2Op:$dest $arg0, $arg1), [],
+  [(CopyAttrs $src, $dest)]>;
 
 //===----------------------------------------------------------------------===//
 // TruncateDiv op patterns.
 //===----------------------------------------------------------------------===//
 
-def TruncateDivWithSqrtDivisor : Pat<(TF_TruncateDivOp $arg0,
-                                                       (TF_SqrtOp $arg1)),
-                                     (TF_MulOp $arg0, (TF_RsqrtOp $arg1))>;
+def TruncateDivWithSqrtDivisor : Pat<
+  (TF_TruncateDivOp:$src $arg0, (TF_SqrtOp $arg1)),
+  (TF_MulOp:$dest1 $arg0, (TF_RsqrtOp:$dest2 $arg1)), [],
+  [(CopyAttrs $src, $dest1), (CopyAttrs $src, $dest2)]>;
 
 //===----------------------------------------------------------------------===//
 // Xdivy op patterns.
 //===----------------------------------------------------------------------===//
 
-def XdivyWithSqrtDivisor : Pat<(TF_XdivyOp $arg0, (TF_SqrtOp $arg1)),
-                               (TF_MulNoNanOp (TF_RsqrtOp $arg1), $arg0)>;
+def XdivyWithSqrtDivisor : Pat<
+  (TF_XdivyOp:$src $arg0, (TF_SqrtOp $arg1)),
+  (TF_MulNoNanOp:$dest1 (TF_RsqrtOp:$dest2 $arg1), $arg0), [],
+  [(CopyAttrs $src, $dest1), (CopyAttrs $src, $dest2)]>;
 
 
 //===----------------------------------------------------------------------===//
 // Cast op followed by a ReadVariable op can be folded into the ReadVariable
 //===----------------------------------------------------------------------===//
 
-def ReadVariableOfCast : Pat<(TF_ReadVariableOp (TF_CastOp:$output $x, BoolAttr:$Truncate)), (TF_ReadVariableOp $x), [(HasOnlyReadVariableOpUsers $output)]>;
+def ReadVariableOfCast : Pat<
+  (TF_ReadVariableOp:$src (TF_CastOp:$output $x, BoolAttr:$Truncate)),
+  (TF_ReadVariableOp:$dest $x), [(HasOnlyReadVariableOpUsers $output)],
+  [(CopyAttrs $src, $dest)]>;
 
 //===----------------------------------------------------------------------===//
 // Canonicalize tf.Variable ops to tf.VariableV2 ops
 //===----------------------------------------------------------------------===//
 
-def VariableToVariableV2 : Pat<(TF_VariableOp $shape, $container, $shard_name), (TF_VariableV2Op $shape, $container, $shard_name)>;
+def VariableToVariableV2 : Pat<
+  (TF_VariableOp:$src $shape, $container, $shard_name),
+  (TF_VariableV2Op:$dest $shape, $container, $shard_name),
+  [], [(CopyAttrs $src, $dest)]>;
 
 
 //===----------------------------------------------------------------------===//
 // Canonicalize tf.HashTable and its user ops to V2 versions.
 //===----------------------------------------------------------------------===//
-def HashTableAndInitializeTableToV2 :
- Pat<
-   (TF_InitializeTableFromTextFileOp
-     (TF_HashTableOp $container, $shared_name, $use_node_name_sharing, $key_dtype, $value_dtype),
-     $filename, $key_index, $value_index, $vocab_size, $delimiter, $offset),
-   (TF_InitializeTableFromTextFileV2Op
-     (TF_HashTableV2Op $container, $shared_name, $use_node_name_sharing, $key_dtype, $value_dtype),
-     $filename, $key_index, $value_index, $vocab_size, $delimiter, $offset)
-  >;
-
-def HashTableAndLookupTableSizeToV2 :
- Pat<
-   (TF_LookupTableSizeOp
-     (TF_HashTableOp $container, $shared_name, $use_node_name_sharing, $key_dtype, $value_dtype)),
-   (TF_LookupTableSizeV2Op
-     (TF_HashTableV2Op $container, $shared_name, $use_node_name_sharing, $key_dtype, $value_dtype))
-  >;
-
-def HashTableAndLookupTableFindToV2 :
- Pat<
-   (TF_LookupTableFindOp
-     (TF_HashTableOp $container, $shared_name, $use_node_name_sharing, $key_dtype, $value_dtype),
-     $keys, $default_value),
-   (TF_LookupTableFindV2Op
-     (TF_HashTableV2Op $container, $shared_name, $use_node_name_sharing, $key_dtype, $value_dtype),
-     $keys, $default_value)
- >;
+def HashTableAndInitializeTableToV2 : Pat<
+  (TF_InitializeTableFromTextFileOp:$src (TF_HashTableOp $container,
+    $shared_name, $use_node_name_sharing, $key_dtype, $value_dtype), $filename,
+    $key_index, $value_index, $vocab_size, $delimiter, $offset),
+  (TF_InitializeTableFromTextFileV2Op:$dest1 (TF_HashTableV2Op:$dest2 $container,
+    $shared_name, $use_node_name_sharing, $key_dtype, $value_dtype),
+    $filename, $key_index, $value_index, $vocab_size, $delimiter, $offset),
+  [], [(CopyAttrs $src, $dest1), (CopyAttrs $src, $dest2)]>;
+
+def HashTableAndLookupTableSizeToV2 : Pat<
+  (TF_LookupTableSizeOp:$src (TF_HashTableOp $container, $shared_name,
+    $use_node_name_sharing, $key_dtype, $value_dtype)),
+  (TF_LookupTableSizeV2Op:$dest1 (TF_HashTableV2Op:$dest2 $container, $shared_name,
+    $use_node_name_sharing, $key_dtype, $value_dtype)),
+  [], [(CopyAttrs $src, $dest1), (CopyAttrs $src, $dest2)]>;
+
+def HashTableAndLookupTableFindToV2 : Pat<
+  (TF_LookupTableFindOp:$src (TF_HashTableOp $container, $shared_name,
+    $use_node_name_sharing, $key_dtype, $value_dtype), $keys, $default_value),
+  (TF_LookupTableFindV2Op:$dest1 (TF_HashTableV2Op:$dest2 $container, $shared_name,
+    $use_node_name_sharing, $key_dtype, $value_dtype), $keys, $default_value),
+  [], [(CopyAttrs $src, $dest1), (CopyAttrs $src, $dest2)]>;
 
 //===----------------------------------------------------------------------===//
 // Canonicalize tf.Maximum of zero to tf.Relu
@@ -347,14 +383,14 @@ def IsDeviceCompatible: Constraint<
 // TODO(b/239736969): Remove the device compatibility constraint to make the
 // canonicalization device independent, by adding an int32 GPU kernel for Relu.
 def MaximumOfZeroToRelu : Pat<
-  (TF_MaximumOp:$maximum_op $x, $y), (TF_ReluOp $x),
-  [(IsConstantValueOf<0> $y), (IsDeviceCompatible $maximum_op)]>;
+  (TF_MaximumOp:$maximum_op $x, $y), (TF_ReluOp:$dest $x),
+  [(IsConstantValueOf<0> $y), (IsDeviceCompatible $maximum_op)],
+  [(CopyAttrs $maximum_op, $dest)]>;
 
 //===----------------------------------------------------------------------===//
 // Canonicalize tf.Relu of Minimul six to tf.Relu6
 //===----------------------------------------------------------------------===//
 
-def ReluOfMinimum6ToRelu6 :
-  Pat<(TF_ReluOp (TF_MinimumOp $x, $y)),
-      (TF_Relu6Op $x),
-      [(IsConstantValueOf<6> $y)]>;
+def ReluOfMinimum6ToRelu6 : Pat<
+  (TF_ReluOp:$src (TF_MinimumOp $x, $y)), (TF_Relu6Op:$dest $x),
+  [(IsConstantValueOf<6> $y)], [(CopyAttrs $src, $dest)]>;
diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/rewrite_util.cc b/tensorflow/compiler/mlir/tensorflow/transforms/rewrite_util.cc
index c6e4ed4b1bd4de..fb3c78c914a0e7 100644
--- a/tensorflow/compiler/mlir/tensorflow/transforms/rewrite_util.cc
+++ b/tensorflow/compiler/mlir/tensorflow/transforms/rewrite_util.cc
@@ -18,6 +18,8 @@ limitations under the License.
 #include <optional>
 #include <string>
 
+#include "mlir/IR/Value.h"  // from @llvm-project
+#include "tensorflow/compiler/mlir/tensorflow/utils/attribute_utils.h"
 #include "tensorflow/core/util/device_name_utils.h"
 
 namespace mlir {
@@ -51,12 +53,19 @@ bool IsOnGpuDevice(mlir::Operation *op) {
   return *device == kDeviceGpu;
 }
 
-mlir::Value CopyAttributes(mlir::Operation *src, mlir::Operation *dest) {
-  // This is not expected to happen in practice.
-  if (dest->getNumResults() != 1)
-    llvm_unreachable("expected single result in `dest`");
-  dest->setAttrs(src->getAttrs());
-  return dest->getResult(0);
+void CopyDeviceAndUnderscoredAttributesAdaptor(mlir::OpResult src,
+                                               mlir::OpResult dest) {
+  CopyDeviceAndUnderscoredAttributesAdaptor(src.getOwner(), dest.getOwner());
+}
+
+void CopyDeviceAndUnderscoredAttributesAdaptor(mlir::Operation *src,
+                                               mlir::OpResult dest) {
+  CopyDeviceAndUnderscoredAttributesAdaptor(src, dest.getOwner());
+}
+
+void CopyDeviceAndUnderscoredAttributesAdaptor(mlir::Operation *src,
+                                               mlir::Operation *dest) {
+  CopyDeviceAndUnderscoredAttributes(src, dest);
 }
 }  // namespace TF
 }  // namespace mlir
diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/rewrite_util.h b/tensorflow/compiler/mlir/tensorflow/transforms/rewrite_util.h
index 4d4ba9ff686e5d..c86c4383cc602f 100644
--- a/tensorflow/compiler/mlir/tensorflow/transforms/rewrite_util.h
+++ b/tensorflow/compiler/mlir/tensorflow/transforms/rewrite_util.h
@@ -70,9 +70,13 @@ bool IsConstantValueOf(Value value, T raw_value) {
 // devices or the device is not specified.
 bool IsOnGpuDevice(mlir::Operation *op);
 
-// Copy the attributes of `src` op to the `dest` op , and return the first
-// result of the `dest` op.
-mlir::Value CopyAttributes(mlir::Operation *src, mlir::Operation *dest);
+// Wrappers for CopyDeviceAndUnderscoredAttributes
+void CopyDeviceAndUnderscoredAttributesAdaptor(mlir::OpResult src,
+                                               mlir::OpResult dest);
+void CopyDeviceAndUnderscoredAttributesAdaptor(mlir::Operation *src,
+                                               mlir::OpResult dest);
+void CopyDeviceAndUnderscoredAttributesAdaptor(mlir::Operation *src,
+                                               mlir::Operation *dest);
 }  // namespace TF
 }  // namespace mlir
 

From af1855ea4b0e7d6167940a5738ae63dd99786975 Mon Sep 17 00:00:00 2001
From: Francois Chollet <fchollet@google.com>
Date: Tue, 26 Sep 2023 16:43:14 -0700
Subject: [PATCH 292/567] Resolve issue in which `import tensorflow;import
 keras` would work but `import keras;import tensorflow` would result in TF not
 loading Keras. This is due to the attempt to actually import `keras` in
 `__init__` rather than lazy-load it.

The fix requires a change of approach and the introduction of a Keras LazyLoader class. An added benefit is that it enables a better error message when users try to access deprecated v1 APIs with Keras 3 (instead of the APIs simply not being found).

PiperOrigin-RevId: 568684260
---
 tensorflow/api_template.__init__.py       | 126 ++++++----------------
 tensorflow/api_template_v1.__init__.py    | 119 +++++---------------
 tensorflow/compat_template.__init__.py    |  51 ++++-----
 tensorflow/compat_template_v1.__init__.py |  78 +++++---------
 tensorflow/python/util/lazy_loader.py     | 106 ++++++++++++++++++
 5 files changed, 211 insertions(+), 269 deletions(-)

diff --git a/tensorflow/api_template.__init__.py b/tensorflow/api_template.__init__.py
index 6894ab538894d6..794abc94ec9b6a 100644
--- a/tensorflow/api_template.__init__.py
+++ b/tensorflow/api_template.__init__.py
@@ -38,6 +38,7 @@
 
 from tensorflow.python.tools import module_util as _module_util
 from tensorflow.python.util.lazy_loader import LazyLoader as _LazyLoader
+from tensorflow.python.util.lazy_loader import KerasLazyLoader as _KerasLazyLoader
 
 # Make sure code inside the TensorFlow codebase can use tf2.enabled() at import.
 _os.environ["TF2_BEHAVIOR"] = "1"
@@ -87,52 +88,11 @@
   _current_module.__path__ = [_module_dir] + _current_module.__path__
 setattr(_current_module, "estimator", estimator)
 
-# Keras v2 loading.
-_keras_to_use = None
-_keras_package_name = None
-_keras_version = None
-if _os.environ.get("TF_USE_LEGACY_KERAS", None) in ("true", "True", "1"):
-  # Users can opt out of Keras 3 with this environment variable.
-  try:
-    import tf_keras.api._v2.keras as _keras_to_use
-
-    _keras_package_name = "tf_keras.api._v2.keras"
-    _keras_version = "tf_keras"
-  except ImportError:
-    _logging.warning(
-        "Your environment has TF_USE_LEGACY_KERAS set to True, but you "
-        "do not have the tf_keras package installed. You must install it "
-        "in order to use the legacy tf.keras. Install it via: "
-        "`pip install tf_keras`"
-    )
-else:
-  try:
-    import keras as _keras_module
-
-    if _keras_module.__version__.startswith("3."):
-      # This is the Keras 3.x case.
-      _keras_to_use = _keras_module._tf_keras
-      _keras_package_name = "keras._tf_keras.keras"
-      _keras_version = "keras_3"
-    else:
-      # This is the Keras 2.x case.
-      import keras.api._v2.keras as _keras_to_use
-      _keras_package_name = "keras.api._v2.keras"
-      _keras_version = "keras_2"
-  except (ImportError, AttributeError):
-    pass
-
-if _keras_to_use is not None:
-  _module_dir = _module_util.get_parent_dir_for_name(_keras_package_name)
-  if _module_dir:
-    _current_module.__path__ = [_module_dir] + _current_module.__path__
-  setattr(_current_module,
-          "keras",
-          _LazyLoader("keras", globals(), _keras_package_name))
-else:
-    # TF will not have `tf.keras` in this case. This should not be silent.
-  _logging.warning("Unable to load `tf.keras`. Check that the `keras` package "
-                   "is installed.")
+# Lazy-load Keras v2/3.
+setattr(_current_module, "keras", _KerasLazyLoader(globals()))
+for _module_dir in ("keras._tf_keras.keras", "keras.api._v2.keras"):
+  _module_dir = _module_util.get_parent_dir_for_name(_module_dir)
+  _current_module.__path__ = [_module_dir] + _current_module.__path__
 
 
 # Enable TF2 behaviors
@@ -189,37 +149,30 @@ def _running_from_pip_package():
       _os.getenv("TF_PLUGGABLE_DEVICE_LIBRARY_PATH")
   )
 
-# Add module aliases
-if hasattr(_current_module, "keras"):
-  # It is possible that keras is a lazily loaded module, which might break when
-  # actually trying to import it. Have a Try-Catch to make sure it doesn't break
-  # when it doing some very initial loading, like tf.compat.v2, etc.
-  try:
-    _losses = _LazyLoader("losses", globals(), _keras_package_name + ".losses")
-    _metrics = _LazyLoader(
-        "metrics", globals(), _keras_package_name + ".metrics")
-    _optimizers = _LazyLoader(
-        "optimizers", globals(), _keras_package_name + ".optimizers")
-    _initializers = _LazyLoader(
-        "initializers", globals(), _keras_package_name + ".initializers")
-    setattr(_current_module, "losses", _losses)
-    setattr(_current_module, "metrics", _metrics)
-    setattr(_current_module, "optimizers", _optimizers)
-    setattr(_current_module, "initializers", _initializers)
-  except ImportError:
-    pass
-
-  # Do an eager load for Keras' code so that any function/method that needs to
-  # happen at load time will trigger, eg registration of optimizers in the
-  # SavedModel registry.
-  # See b/196254385 for more details.
-  try:
-    if _keras_version == "keras_2":
-      importlib.import_module("keras.src.optimizers")
-    elif _keras_version == "tf_keras":
-      importlib.import_module("tf_keras.src.optimizers")
-  except (ImportError, AttributeError):
-    pass
+# Add Keras module aliases
+_losses = _KerasLazyLoader(globals(), submodule="losses", name="losses")
+_metrics = _KerasLazyLoader(globals(), submodule="metrics", name="metrics")
+_optimizers = _KerasLazyLoader(
+    globals(), submodule="optimizers", name="optimizers")
+_initializers = _KerasLazyLoader(
+    globals(), submodule="initializers", name="initializers")
+setattr(_current_module, "losses", _losses)
+setattr(_current_module, "metrics", _metrics)
+setattr(_current_module, "optimizers", _optimizers)
+setattr(_current_module, "initializers", _initializers)
+
+
+# Do an eager load for Keras' code so that any function/method that needs to
+# happen at load time will trigger, e.g. registration of optimizers in the
+# SavedModel registry.
+# See b/196254385 for more details.
+try:
+  if _os.environ.get("TF_USE_LEGACY_KERAS", None) in ("true", "True", "1"):
+    importlib.import_module("tf_keras.src.optimizers")
+  else:
+    importlib.import_module("keras.src.optimizers")
+except (ImportError, AttributeError):
+  pass
 
 del importlib
 
@@ -227,25 +180,6 @@ def _running_from_pip_package():
 if _typing.TYPE_CHECKING:
   from tensorflow_estimator.python.estimator.api._v2 import estimator as estimator
 
-  if _keras_version == "keras_2":
-    from keras.api._v2 import keras
-    from keras.api._v2.keras import losses
-    from keras.api._v2.keras import metrics
-    from keras.api._v2.keras import optimizers
-    from keras.api._v2.keras import initializers
-  elif _keras_version == "tf_keras":
-    from tf_keras.api._v2 import keras
-    from tf_keras.api._v2.keras import losses
-    from tf_keras.api._v2.keras import metrics
-    from tf_keras.api._v2.keras import optimizers
-    from tf_keras.api._v2.keras import initializers
-  elif _keras_version == "keras_3":
-    from keras._tf_keras import keras
-    from keras._tf_keras.keras import losses
-    from keras._tf_keras.keras import metrics
-    from keras._tf_keras.keras import optimizers
-    from keras._tf_keras.keras import initializers
-
 # pylint: enable=undefined-variable
 
 # Delete modules that should be hidden from dir().
diff --git a/tensorflow/api_template_v1.__init__.py b/tensorflow/api_template_v1.__init__.py
index 82ef4315f4aef1..6265c294dc733d 100644
--- a/tensorflow/api_template_v1.__init__.py
+++ b/tensorflow/api_template_v1.__init__.py
@@ -27,6 +27,7 @@
 from tensorflow.python.tools import module_util as _module_util
 from tensorflow.python.platform import tf_logging as _logging
 from tensorflow.python.util.lazy_loader import LazyLoader as _LazyLoader
+from tensorflow.python.util.lazy_loader import KerasLazyLoader as _KerasLazyLoader
 
 # API IMPORTS PLACEHOLDER
 
@@ -77,52 +78,11 @@
   _current_module.__path__ = [_module_dir] + _current_module.__path__
 setattr(_current_module, "estimator", estimator)
 
-# Keras v1 loading.
-_keras_to_use = None
-_keras_package_name = None
-_keras_version = None
-if _os.environ.get("TF_USE_LEGACY_KERAS", None) in ("true", "True", "1"):
-  # Users can opt out of Keras 3 with this environment variable.
-  try:
-    import tf_keras.api._v1.keras as _keras_to_use
-    _keras_package_name = "tf_keras.api._v1.keras"
-    _keras_version = "tf_keras"
-  except ImportError:
-    _logging.warning(
-        "Your environment has TF_USE_LEGACY_KERAS set to True, but you "
-        "do not have the tf_keras package installed. You must install it "
-        "in order to use the legacy tf.keras. Install it via: "
-        "`pip install tf_keras`"
-    )
-else:
-  try:
-    import keras as _keras_module
-
-    if _keras_module.__version__.startswith("3."):
-      # This is the Keras 3.x case. It does not have v1 compatibility.
-      _keras_to_use = None
-      _keras_package_name = None
-      _keras_version = "keras_3"
-    else:
-      # This is the Keras 2.x case.
-      import keras.api._v1.keras as _keras_to_use
-      _keras_package_name = "keras.api._v1.keras"
-      _keras_version = "keras_2"
-  except (ImportError, AttributeError):
-    pass
-
-if _keras_to_use is not None:
-  _module_dir = _module_util.get_parent_dir_for_name(_keras_package_name)
-  if _module_dir:
-    _current_module.__path__ = [_module_dir] + _current_module.__path__
-  setattr(_current_module,
-          "keras",
-          _LazyLoader("keras", globals(), _keras_package_name))
-else:
-    # TF will not have `tf.keras` in this case. This should not be silent.
-  _logging.warning("Unable to load `tf.keras`. Check that the `keras` package "
-                   "is installed.")
-
+# Lazy-load Keras v1.
+setattr(_current_module, "keras", _KerasLazyLoader(globals(), mode="v1"))
+for _module_dir in ("keras._tf_keras.keras", "keras.api._v1.keras"):
+  _module_dir = _module_util.get_parent_dir_for_name(_module_dir)
+  _current_module.__path__ = [_module_dir] + _current_module.__path__
 
 _CONTRIB_WARNING = """
 The TensorFlow contrib module will not be included in TensorFlow 2.0.
@@ -149,39 +109,28 @@
 
 # Add module aliases from Keras to TF.
 # Some tf endpoints actually lives under Keras.
-if (hasattr(_current_module, "keras") and
-    _keras_version in ("tf_keras", "keras_2")):
-  # It is possible that keras is a lazily loaded module, which might break when
-  # actually trying to import it. Have a Try-Catch to make sure it doesn't break
-  # when it doing some very initial loading, like tf.compat.v2, etc.
-  try:
-    _layer_package = f"{_keras_package_name}.__internal__.legacy.layers"
-    layers = _LazyLoader("layers", globals(), _layer_package)
-    _module_dir = _module_util.get_parent_dir_for_name(_layer_package)
-    if _module_dir:
-      _current_module.__path__ = [_module_dir] + _current_module.__path__
-    setattr(_current_module, "layers", layers)
-
-    _legacy_rnn_package = f"{_keras_package_name}.__internal__.legacy.rnn_cell"
-    _rnn_cell = _LazyLoader("legacy_rnn", globals(), _legacy_rnn_package)
-    _module_dir = _module_util.get_parent_dir_for_name(_legacy_rnn_package)
-    if _module_dir:
-      _current_module.nn.__path__ = [_module_dir] + _current_module.nn.__path__
-    _current_module.nn.rnn_cell = _rnn_cell
-  except ImportError:
-    pass
-
-  # Do an eager load for Keras' code so that any function/method that needs to
-  # happen at load time will trigger, eg registration of optimizers in the
-  # SavedModel registry.
-  # See b/196254385 for more details.
-  try:
-    if _keras_version == "keras_2":
-      importlib.import_module("keras.src.optimizers")
-    elif _keras_version == "tf_keras":
-      importlib.import_module("tf_keras.src.optimizers")
-  except (ImportError, AttributeError):
-    pass
+_current_module.layers = _KerasLazyLoader(
+    globals(),
+    submodule="__internal__.legacy.layers",
+    name="layers",
+    mode="v1")
+for _module_dir in (
+    "keras.api._v1.keras.__internal__.legacy.layers",
+    "tf_keras.api._v1.keras.__internal__.legacy.layers"):
+  _module_dir = _module_util.get_parent_dir_for_name(_module_dir)
+  _current_module.__path__ = [_module_dir] + _current_module.__path__
+
+_current_module.nn.rnn_cell = _KerasLazyLoader(
+    globals(),
+    submodule="__internal__.legacy.rnn_cell",
+    name="rnn_cell",
+    mode="v1")
+for _module_dir in (
+    "keras.api._v1.keras.__internal__.legacy.rnn_cell",
+    "tf_keras.api._v1.keras.__internal__.legacy.rnn_cell"):
+  _module_dir = _module_util.get_parent_dir_for_name(_module_dir)
+  _current_module.nn.__path__ = [_module_dir] + _current_module.nn.__path__
+
 del importlib
 
 # Load all plugin libraries from site-packages/tensorflow-plugins if we are
@@ -234,18 +183,6 @@ def _running_from_pip_package():
 # Explicitly import lazy-loaded modules to support autocompletion.
 if _typing.TYPE_CHECKING:
   from tensorflow_estimator.python.estimator.api._v1 import estimator as estimator
-  if _keras_version == "keras_2":
-    from keras.api._v1 import keras
-    from keras.api._v1.keras import losses
-    from keras.api._v1.keras import metrics
-    from keras.api._v1.keras import optimizers
-    from keras.api._v1.keras import initializers
-  elif _keras_version == "tf_keras":
-    from tf_keras.api._v1 import keras
-    from tf_keras.api._v1.keras import losses
-    from tf_keras.api._v1.keras import metrics
-    from tf_keras.api._v1.keras import optimizers
-    from tf_keras.api._v1.keras import initializers
 
 # Delete modules that should be hidden from dir().
 # Don't fail if these modules are not available.
diff --git a/tensorflow/compat_template.__init__.py b/tensorflow/compat_template.__init__.py
index 3372fc0c08ffa0..814b38d53584dd 100644
--- a/tensorflow/compat_template.__init__.py
+++ b/tensorflow/compat_template.__init__.py
@@ -14,6 +14,8 @@
 # ==============================================================================
 """Bring in all of the public TensorFlow interface into this module."""
 
+# pylint: disable=g-bad-import-order,g-import-not-at-top,protected-access
+
 import logging as _logging
 import os as _os
 import sys as _sys
@@ -21,8 +23,7 @@
 
 from tensorflow.python.tools import module_util as _module_util
 from tensorflow.python.util.lazy_loader import LazyLoader as _LazyLoader
-
-# pylint: disable=g-bad-import-order
+from tensorflow.python.util.lazy_loader import KerasLazyLoader as _KerasLazyLoader
 
 # API IMPORTS PLACEHOLDER
 
@@ -48,12 +49,11 @@
   _current_module.__path__ = [_module_dir] + _current_module.__path__
 setattr(_current_module, "estimator", estimator)
 
-_keras_module = "keras.api._v2.keras"
-_keras = _LazyLoader("keras", globals(), _keras_module)
-_module_dir = _module_util.get_parent_dir_for_name(_keras_module)
-if _module_dir:
-  _current_module.__path__ = [_module_dir] + _current_module.__path__
-setattr(_current_module, "keras", _keras)
+# Lazy load Keras v2
+setattr(_current_module, "keras", _KerasLazyLoader(globals(), mode="v2"))
+_module_dir = _module_util.get_parent_dir_for_name("keras.api._v2.keras")
+_current_module.__path__ = [_module_dir] + _current_module.__path__
+
 
 # We would like the following to work for fully enabling 2.0 in a 1.0 install:
 #
@@ -65,32 +65,21 @@
 setattr(_current_module, "enable_v2_behavior", enable_v2_behavior)
 
 # Add module aliases
-if hasattr(_current_module, 'keras'):
-  # It is possible that keras is a lazily loaded module, which might break when
-  # actually trying to import it. Have a Try-Catch to make sure it doesn't break
-  # when it doing some very initial loading, like tf.compat.v2, etc.
-  try:
-    _keras_package = "keras.api._v2.keras."
-    _losses = _LazyLoader("losses", globals(), _keras_package + "losses")
-    _metrics = _LazyLoader("metrics", globals(), _keras_package + "metrics")
-    _optimizers = _LazyLoader(
-        "optimizers", globals(), _keras_package + "optimizers")
-    _initializers = _LazyLoader(
-        "initializers", globals(), _keras_package + "initializers")
-    setattr(_current_module, "losses", _losses)
-    setattr(_current_module, "metrics", _metrics)
-    setattr(_current_module, "optimizers", _optimizers)
-    setattr(_current_module, "initializers", _initializers)
-  except ImportError:
-    pass
+_losses = _KerasLazyLoader(
+    globals(), submodule="losses", name="losses", mode="v2")
+_metrics = _KerasLazyLoader(
+    globals(), submodule="metrics", name="metrics", mode="v2")
+_optimizers = _KerasLazyLoader(
+    globals(), submodule="optimizers", name="optimizers", mode="v2")
+_initializers = _KerasLazyLoader(
+    globals(), submodule="initializers", name="initializers", mode="v2")
+setattr(_current_module, "losses", _losses)
+setattr(_current_module, "metrics", _metrics)
+setattr(_current_module, "optimizers", _optimizers)
+setattr(_current_module, "initializers", _initializers)
 
 # Explicitly import lazy-loaded modules to support autocompletion.
 # pylint: disable=g-import-not-at-top
 if _typing.TYPE_CHECKING:
   from tensorflow_estimator.python.estimator.api._v2 import estimator as estimator
-  from keras.api._v2 import keras
-  from keras.api._v2.keras import losses
-  from keras.api._v2.keras import metrics
-  from keras.api._v2.keras import optimizers
-  from keras.api._v2.keras import initializers
 # pylint: enable=g-import-not-at-top
diff --git a/tensorflow/compat_template_v1.__init__.py b/tensorflow/compat_template_v1.__init__.py
index ec1ac65e5ad03a..18282f4569e5dc 100644
--- a/tensorflow/compat_template_v1.__init__.py
+++ b/tensorflow/compat_template_v1.__init__.py
@@ -14,14 +14,14 @@
 # ==============================================================================
 """Bring in all of the public TensorFlow interface into this module."""
 
-import os as _os
+# pylint: disable=g-bad-import-order,g-import-not-at-top,protected-access
+
 import sys as _sys
 import typing as _typing
 
 from tensorflow.python.tools import module_util as _module_util
 from tensorflow.python.util.lazy_loader import LazyLoader as _LazyLoader
-
-# pylint: disable=g-bad-import-order
+from tensorflow.python.util.lazy_loader import KerasLazyLoader as _KerasLazyLoader
 
 # API IMPORTS PLACEHOLDER
 
@@ -38,21 +38,11 @@
   _current_module.__path__ = [_module_dir] + _current_module.__path__
 setattr(_current_module, "estimator", estimator)
 
-_keras_package_name = None
-_keras_version = None
-if _os.environ.get("TF_USE_LEGACY_KERAS", None) in ("true", "True", "1"):
-  # Users can opt out of Keras 3 with this environment variable.
-  _keras_package_name = "tf_keras.api._v1.keras"
-  _keras_version = "tf_keras"
-else:
-  _keras_package_name = "keras.api._v1.keras"
-  _keras_version = "keras_2"
+# Lazy load Keras v1
+setattr(_current_module, "keras", _KerasLazyLoader(globals(), mode="v1"))
+_module_dir = _module_util.get_parent_dir_for_name("keras.api._v1.keras")
+_current_module.__path__ = [_module_dir] + _current_module.__path__
 
-keras = _LazyLoader("keras", globals(), _keras_package_name)
-_module_dir = _module_util.get_parent_dir_for_name(_keras_package_name)
-if _module_dir:
-  _current_module.__path__ = [_module_dir] + _current_module.__path__
-setattr(_current_module, "keras", keras)
 
 from tensorflow.python.platform import flags  # pylint: disable=g-import-not-at-top
 _current_module.app.flags = flags  # pylint: disable=undefined-variable
@@ -60,44 +50,30 @@
 
 # Add module aliases from Keras to TF.
 # Some tf endpoints actually lives under Keras.
-if hasattr(_current_module, "keras"):
-  # It is possible that keras is a lazily loaded module, which might break when
-  # actually trying to import it. Have a Try-Catch to make sure it doesn't break
-  # when it doing some very initial loading, like tf.compat.v2, etc.
-  try:
-    _layer_package = f"{_keras_package_name}.__internal__.legacy.layers"
-    layers = _LazyLoader("layers", globals(), _layer_package)
-    _module_dir = _module_util.get_parent_dir_for_name(_layer_package)
-    if _module_dir:
-      _current_module.__path__ = [_module_dir] + _current_module.__path__
-    setattr(_current_module, "layers", layers)
+_current_module.layers = _KerasLazyLoader(
+    globals(),
+    submodule="__internal__.legacy.layers",
+    name="layers",
+    mode="v1")
+for _module_dir in (
+    "keras.api._v1.keras.__internal__.legacy.layers",
+    "tf_keras.api._v1.keras.__internal__.legacy.layers"):
+  _module_dir = _module_util.get_parent_dir_for_name(_module_dir)
+  _current_module.__path__ = [_module_dir] + _current_module.__path__
 
-    _legacy_rnn_package = f"{_keras_package_name}.__internal__.legacy.rnn_cell"
-    _rnn_cell = _LazyLoader("legacy_rnn", globals(), _legacy_rnn_package)
-    _module_dir = _module_util.get_parent_dir_for_name(_legacy_rnn_package)
-    if _module_dir:
-      _current_module.nn.__path__ = [_module_dir] + _current_module.nn.__path__
-    _current_module.nn.rnn_cell = _rnn_cell
-  except ImportError:
-    pass
+_current_module.nn.rnn_cell = _KerasLazyLoader(
+    globals(),
+    submodule="__internal__.legacy.rnn_cell",
+    name="rnn_cell",
+    mode="v1")
+for _module_dir in (
+    "keras.api._v1.keras.__internal__.legacy.rnn_cell",
+    "tf_keras.api._v1.keras.__internal__.legacy.rnn_cell"):
+  _module_dir = _module_util.get_parent_dir_for_name(_module_dir)
+  _current_module.nn.__path__ = [_module_dir] + _current_module.nn.__path__
 
 # Explicitly import lazy-loaded modules to support autocompletion.
 # pylint: disable=g-import-not-at-top
 if _typing.TYPE_CHECKING:
   from tensorflow_estimator.python.estimator.api._v1 import estimator as estimator
-  try:
-    if _keras_version == "keras_2":
-      from keras.api._v1 import keras
-      from keras.api._v1.keras import losses
-      from keras.api._v1.keras import metrics
-      from keras.api._v1.keras import optimizers
-      from keras.api._v1.keras import initializers
-    elif _keras_version == "tf_keras":
-      from tf_keras.api._v1 import keras
-      from tf_keras.api._v1.keras import losses
-      from tf_keras.api._v1.keras import metrics
-      from tf_keras.api._v1.keras import optimizers
-      from tf_keras.api._v1.keras import initializers
-  except (ImportError, AttributeError):
-    pass
 # pylint: enable=g-import-not-at-top
diff --git a/tensorflow/python/util/lazy_loader.py b/tensorflow/python/util/lazy_loader.py
index 534d2037d10c27..bfd0372bc99c85 100644
--- a/tensorflow/python/util/lazy_loader.py
+++ b/tensorflow/python/util/lazy_loader.py
@@ -16,6 +16,7 @@
 """A LazyLoader class."""
 
 import importlib
+import os
 import types
 from tensorflow.python.platform import tf_logging as logging
 
@@ -74,3 +75,108 @@ def __repr__(self):
   def __dir__(self):
     module = self._load()
     return dir(module)
+
+
+class KerasLazyLoader(LazyLoader):
+  """LazyLoader that handles routing to different Keras version."""
+
+  def __init__(  # pylint: disable=super-init-not-called
+      self, parent_module_globals, mode=None, submodule=None, name="keras"):
+    self._parent_module_globals = parent_module_globals
+    self._mode = mode
+    self._submodule = submodule
+    self._name = name
+    self._initialized = False
+
+  def _initialize(self):
+    """Resolve the Keras version to use and initialize the loader."""
+    self._initialized = True
+    package_name = None
+    keras_version = None
+    if os.environ.get("TF_USE_LEGACY_KERAS", None) in ("true", "True", "1"):
+      try:
+        import tf_keras  # pylint: disable=g-import-not-at-top,unused-import
+
+        keras_version = "tf_keras"
+        if self._mode == "v1":
+          package_name = "tf_keras.api._v1.keras"
+        else:
+          package_name = "tf_keras.api._v2.keras"
+      except ImportError:
+        logging.warning(
+            "Your environment has TF_USE_LEGACY_KERAS set to True, but you "
+            "do not have the tf_keras package installed. You must install it "
+            "in order to use the legacy tf.keras. Install it via: "
+            "`pip install tf_keras`"
+        )
+    else:
+      try:
+        import keras  # pylint: disable=g-import-not-at-top
+
+        if keras.__version__.startswith("3."):
+          # This is the Keras 3.x case.
+          keras_version = "keras_3"
+          package_name = "keras._tf_keras.keras"
+        else:
+          # This is the Keras 2.x case.
+          keras_version = "keras_2"
+          if self._mode == "v1":
+            package_name = "keras.api._v1.keras"
+          else:
+            package_name = "keras.api._v2.keras"
+      except ImportError:
+        raise ImportError(  # pylint: disable=raise-missing-from
+            "Keras cannot be imported. Check that it is installed."
+        )
+
+    self._keras_version = keras_version
+    if keras_version is not None:
+      if self._submodule is not None:
+        package_name += "." + self._submodule
+      super().__init__(self._name, self._parent_module_globals, package_name)
+    else:
+      raise ImportError(  # pylint: disable=raise-missing-from
+          "Keras cannot be imported. Check that it is installed."
+      )
+
+  def __getattr__(self, item):
+    if item in ("_mode", "_initialized", "_name"):
+      return super(types.ModuleType, self).__getattribute__(item)
+    if not self._initialized:
+      self._initialize()
+    if self._keras_version == "keras_3":
+      if (self._mode == "v1" and
+          not self._submodule and
+          item.startswith("compat.v1.")):
+        raise AttributeError(
+            "`tf.compat.v1.keras` is not available with Keras 3. Keras 3 has "
+            "no support for TF 1 APIs. You can install the `tf_keras` package "
+            "as an alternative, and set the environment variable "
+            "`TF_USE_LEGACY_KERAS=True` to configure TensorFlow to route "
+            "`tf.compat.v1.keras` to `tf_keras`."
+        )
+      elif (self._mode == "v2" and
+            not self._submodule and
+            item.startswith("compat.v2.")):
+        raise AttributeError(
+            "`tf.compat.v2.keras` is not available with Keras 3. Just use "
+            "`import keras` instead."
+        )
+      elif (self._submodule and
+            self._submodule.startswith("__internal__.legacy.")):
+        raise AttributeError(
+            f"`{item}` is not available with Keras 3."
+        )
+    module = self._load()
+    return getattr(module, item)
+
+  def __repr__(self):
+    if self._initialized:
+      return (f"<KerasLazyLoader ({self._keras_version}) "
+              f"{self.__name__} as {self._local_name}")
+    return "<KerasLazyLoader>"
+
+  def __dir__(self):
+    if not self._initialized:
+      self._initialize()
+    return super().__dir__()

From d7e7b9d0f80ba7e4d31099325e4271355066c446 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 26 Sep 2023 16:51:27 -0700
Subject: [PATCH 293/567] Open source Sparse Core XLA OP kernels

PiperOrigin-RevId: 568686276
---
 tensorflow/core/tpu/kernels/BUILD             |   32 +
 .../core/tpu/kernels/sparse_core_xla_ops.cc   | 1424 +++++++++++++++++
 .../core/tpu/kernels/sparse_core_xla_ops.h    |   45 +
 .../tpu/tpu_library_init_fns.inc              |    1 +
 .../xla/stream_executor/tpu/tpu_ops_c_api.h   |   18 +
 5 files changed, 1520 insertions(+)
 create mode 100644 tensorflow/core/tpu/kernels/sparse_core_xla_ops.cc
 create mode 100644 tensorflow/core/tpu/kernels/sparse_core_xla_ops.h

diff --git a/tensorflow/core/tpu/kernels/BUILD b/tensorflow/core/tpu/kernels/BUILD
index 4ac173f9c5b655..9a41f6b39827a0 100644
--- a/tensorflow/core/tpu/kernels/BUILD
+++ b/tensorflow/core/tpu/kernels/BUILD
@@ -159,6 +159,38 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "sparse_core_xla_ops",
+    srcs = ["sparse_core_xla_ops.cc"],
+    hdrs = ["sparse_core_xla_ops.h"],
+    deps = [
+        ":sparse_core_ops_utils",
+        "//tensorflow/compiler/tf2xla:xla_compiler",
+        "//tensorflow/compiler/tf2xla:xla_helpers",
+        "//tensorflow/compiler/tf2xla:xla_op_registry",
+        "//tensorflow/compiler/tf2xla/kernels:xla_dummy_ops",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib_proto_parsing",
+        "//tensorflow/core/platform:errors",
+        "//tensorflow/core/platform:statusor",
+        "@com_google_absl//absl/log",
+        "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/types:span",
+        "@local_xla//xla:literal_util",
+        "@local_xla//xla:shape_util",
+        "@local_xla//xla:xla_data_proto_cc",
+        "@local_xla//xla/client:xla_builder",
+        "@local_xla//xla/client:xla_computation",
+        "@local_xla//xla/client/lib:constants",
+        "@local_xla//xla/client/lib:slicing",
+        "@local_xla//xla/stream_executor/tpu:c_api_decl",
+        "@local_xla//xla/stream_executor/tpu:status_helper",
+        "@local_xla//xla/stream_executor/tpu:tpu_api",
+        "@local_xla//xla/stream_executor/tpu:tpu_ops_c_api_hdrs",
+    ],
+    alwayslink = 1,  # Keep REGISTER_XLA_OP registrations even if unreferenced.
+)
+
 cc_library(
     name = "sparse_core_ops_stats_handler",
     hdrs = ["sparse_core_ops_stats_handler.h"],
diff --git a/tensorflow/core/tpu/kernels/sparse_core_xla_ops.cc b/tensorflow/core/tpu/kernels/sparse_core_xla_ops.cc
new file mode 100644
index 00000000000000..2033559b64ca0a
--- /dev/null
+++ b/tensorflow/core/tpu/kernels/sparse_core_xla_ops.cc
@@ -0,0 +1,1424 @@
+/* Copyright 2023 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/core/tpu/kernels/sparse_core_xla_ops.h"
+
+#include <cstdint>
+#include <memory>
+#include <optional>
+#include <string>
+#include <vector>
+
+#include "absl/log/log.h"
+#include "absl/strings/str_cat.h"
+#include "absl/types/span.h"
+#include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
+#include "tensorflow/compiler/tf2xla/xla_op_registry.h"
+#include "xla/client/lib/constants.h"
+#include "xla/client/lib/slicing.h"
+#include "xla/client/xla_builder.h"
+#include "xla/client/xla_computation.h"
+#include "xla/literal_util.h"
+#include "xla/shape.h"
+#include "xla/shape_util.h"
+#include "xla/stream_executor/tpu/c_api_decl.h"
+#include "xla/stream_executor/tpu/status_helper.h"
+#include "xla/stream_executor/tpu/tpu_api.h"
+#include "xla/stream_executor/tpu/tpu_ops_c_api.h"
+#include "xla/xla_data.pb.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/op_requires.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/platform/errors.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/statusor.h"
+#include "tensorflow/core/tpu/kernels/sparse_core_ops_utils.h"
+#include "tsl/platform/macros.h"
+
+namespace tensorflow {
+namespace {
+
+xla::XlaOp PowHack(xla::XlaOp base, xla::XlaOp exp) {
+  // FIXME(b/250994736) replace PowHack with xla::Pow once this bug is fixed.
+  // Approximates xla::Pow(base, exp) as Exp(Log(base) * exp).
+  return xla::Exp(xla::Log(base) * exp);
+}
+
+// Gets the max ids and max unique ids for the given table.
+Status GetMaxIdsAndUniques(const std::string& program_key,
+                           const std::string& table_name,
+                           int64_t num_samples_per_sparse_core,
+                           int64_t feature_width,
+                           int64_t* max_ids_per_partition,
+                           int64_t* max_unique_ids_per_partition) {
+  SparseCore_GetMaxIdsAndUniques_Params params{};
+  params.program_key = program_key.c_str();
+  params.table_name = table_name.c_str();
+  params.num_samples_per_sparse_core = num_samples_per_sparse_core;
+  params.feature_width = feature_width;
+  StatusHelper status;
+  params.status = status.c_status;
+  // Outputs are copied unconditionally; zero unless the callee sets them.
+  stream_executor::tpu::OpsApiFn()->SparseCore_GetMaxIdsAndUniquesFn(&params);
+  *max_ids_per_partition = params.max_ids_per_partition;
+  *max_unique_ids_per_partition = params.max_unique_ids_per_partition;
+  return status.status();
+}
+
+// Gets the SparseCore logical replica count (currently hard-coded to 4).
+// TODO(agagik): get it from the tpu topology.
+StatusOr<int64_t> GetSparseCoresPerChip() { return 4; }
+
+// This TensorFlow op performs the embedding lookup on SparseCore. It takes the
+// embedding table and input sparse tensor represented by the `row_ids`,
+// `col_ids` and `values`(gains). It produces the embedding look up result and
+// the preserved result which will be used in the gradient calculation op.
+class XlaSparseDenseMatmulOp : public XlaOpKernel {
+ public:
+  explicit XlaSparseDenseMatmulOp(OpKernelConstruction* ctx)
+      : XlaOpKernel(ctx) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("input_size", &input_size_));
+    OP_REQUIRES_OK(
+        ctx, ctx->GetAttr("max_ids_per_partition", &max_ids_per_partition_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("max_unique_ids_per_partition",
+                                     &max_unique_ids_per_partition_));
+  }
+
+  ~XlaSparseDenseMatmulOp() override = default;
+
+  void Compile(XlaOpKernelContext* ctx) override {
+    xla::XlaBuilder* builder = ctx->builder();
+
+    const int32 num_physical_replica =
+        stream_executor::tpu::OpsApiFn()->TpuTopology_AvailableCoreCountFn(
+            /*mesh_state=*/nullptr,
+            /*tpu_core_type=*/TpuCoreTypeEnum::kEmbeddingV2);
+
+    OP_REQUIRES(ctx, num_physical_replica > 0,
+                errors::InvalidArgument(
+                    "No SparseCore is available in the tpu system."));
+
+    // TODO(pineapplejuice233): Add error checking logic.
+    xla::XlaOp row_ids = ctx->Input("row_ids");
+    xla::XlaOp col_ids = ctx->Input("col_ids");
+    xla::XlaOp values = ctx->Input("values");
+    // TODO(pineapplejuice233): Right now we are passing this argument as
+    // non_zero_element_num, switch to actual 'offsets' once the decomposer
+    // supports it.
+    xla::XlaOp offsets = ctx->Input("offsets");
+    xla::XlaOp embedding_table = ctx->Input("embedding_table");
+
+    // Construct the shape and a const 0 input for the activations
+    xla::XlaOp zero = xla::ConstantLiteral(
+        builder, xla::LiteralUtil::Zero(ctx->InputXlaType("embedding_table")));
+    OP_REQUIRES_VALUE(xla::Shape activation_shape, ctx,
+                      ctx->InputXlaShape("embedding_table"));
+    activation_shape.set_dimensions(0, input_size_);
+    xla::Shape row_pointers_shape =
+        xla::ShapeUtil::MakeShapeWithType<int32_t>({num_physical_replica});
+
+    // Get the number of ids from row_ids.
+    OP_REQUIRES_VALUE(xla::Shape row_ids_shape, ctx,
+                      ctx->InputXlaShape("row_ids"));
+    int64_t token_count = row_ids_shape.dimensions(0);
+
+    // TODO(pineapplejuice233): Change this to include padding once minibatching is done.
+    xla::Shape sorted_ids_shape =
+        xla::ShapeUtil::MakeShapeWithType<int32_t>({token_count});
+    xla::Shape sorted_gains_shape =
+        xla::ShapeUtil::MakeShapeWithType<float>({token_count});
+
+    xla::XlaOp activation_init =
+        xla::Broadcast(zero, activation_shape.dimensions());
+
+    xla::FrontendAttributes original_frontend_attributes =
+        builder->frontend_attributes();
+
+    xla::FrontendAttributes new_frontend_attributes;
+
+    new_frontend_attributes.mutable_map()->insert(
+        {"_xla_compute_type", "sparse"});
+
+    builder->SetFrontendAttributes(new_frontend_attributes);
+
+    // Pack the input tensors as a tuple. This is an intermediate stage before
+    // switching to SparseTensor type.
+    xla::XlaOp coo_tensor_input =
+        xla::Tuple(builder, {row_ids, col_ids, values});
+
+    new_frontend_attributes.mutable_map()->insert(
+        {"_xla_sharding_strategy", "mod"});
+
+    new_frontend_attributes.mutable_map()->insert(
+        {"_xla_max_ids_per_partition", absl::StrCat(max_ids_per_partition_)});
+
+    new_frontend_attributes.mutable_map()->insert(
+        {"_xla_max_unique_ids_per_partition",
+         absl::StrCat(max_unique_ids_per_partition_)});
+
+    builder->SetFrontendAttributes(new_frontend_attributes);
+
+    xla::XlaOp result = xla::CustomCall(
+        builder, "SparseDenseMatmulOp",
+        {coo_tensor_input, embedding_table, offsets, activation_init},
+        xla::ShapeUtil::MakeTupleShape({activation_shape, row_pointers_shape,
+                                        sorted_ids_shape, sorted_ids_shape,
+                                        sorted_gains_shape}));
+
+    builder->SetFrontendAttributes(original_frontend_attributes);
+
+    // Embedding activation.
+    ctx->SetOutput(0, xla::GetTupleElement(result, 0));
+    // CSR pointer corresponding to logical sharding replicas.
+    ctx->SetOutput(1, xla::GetTupleElement(result, 1));
+    // CSR values of embedding ids.
+    ctx->SetOutput(2, xla::GetTupleElement(result, 2));
+    // CSR values of sample ids.
+    ctx->SetOutput(3, xla::GetTupleElement(result, 3));
+    // CSR values of gains.
+    ctx->SetOutput(4, xla::GetTupleElement(result, 4));
+  }
+
+ private:
+  int input_size_;
+  int max_ids_per_partition_;
+  int max_unique_ids_per_partition_;
+
+  TF_DISALLOW_COPY_AND_ASSIGN(XlaSparseDenseMatmulOp);
+};
+
+REGISTER_XLA_OP(Name("XlaSparseDenseMatmul"), XlaSparseDenseMatmulOp);
+
+// This TensorFlow op performs the embedding lookup on SparseCore. It has a
+// different input and output format compared to XlaSparseDenseMatmulOp.
+// It takes the embedding table and an input CSR tensor represented by
+// `row_pointers`, `sorted_sample_ids`, `sorted_token_ids` and `sorted_gains`.
+// It only produces the embedding lookup result.
+class XlaSparseDenseMatmulWithCsrInputOp : public XlaOpKernel {
+ public:
+  explicit XlaSparseDenseMatmulWithCsrInputOp(OpKernelConstruction* ctx)
+      : XlaOpKernel(ctx) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("table_name", &table_name_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("input_size", &input_size_));
+    OP_REQUIRES_VALUE(num_sparsecores_per_chip_, ctx, GetSparseCoresPerChip());
+    OP_REQUIRES(ctx, input_size_ % num_sparsecores_per_chip_ == 0,
+                errors::InvalidArgument("input_size_ ", input_size_,
+                                        " not divisible by the number "
+                                        "of sparsecores per chip ",
+                                        num_sparsecores_per_chip_));
+
+    // Get and save quantization config params, if they were configured.
+    // num_buckets == 0 indicates no quantization configs were provided.
+    int check_num_buckets;
+    Status status =
+        ctx->GetAttr("quantization_config_num_buckets", &check_num_buckets);
+    if (status.ok() && check_num_buckets > 0) {
+      quantization_config_num_buckets_ = check_num_buckets;
+      float quant_clipping_float;
+      status = ctx->GetAttr("quantization_config_low", &quant_clipping_float);
+      if (status.ok()) {
+        quantization_config_low_ = quant_clipping_float;
+      }
+      status = ctx->GetAttr("quantization_config_high", &quant_clipping_float);
+      if (status.ok()) {
+        quantization_config_high_ = quant_clipping_float;
+      }
+    }
+    // Check for incomplete quantization config: all three must be set or unset.
+    OP_REQUIRES(ctx,
+                quantization_config_low_.has_value() ==
+                        quantization_config_high_.has_value() &&
+                    quantization_config_low_.has_value() ==
+                        quantization_config_num_buckets_.has_value(),
+                errors::InvalidArgument("Quantization config is incomplete."));
+  }
+
+  ~XlaSparseDenseMatmulWithCsrInputOp() override = default;
+
+  void Compile(XlaOpKernelContext* ctx) override {
+    int64_t per_sparse_core_batch_size =
+        input_size_ / num_sparsecores_per_chip_;
+    int64_t max_ids_per_partition = 0;
+    int64_t max_unique_ids_per_partition = 0;
+
+    xla::XlaBuilder* builder = ctx->builder();
+    xla::XlaOp row_pointers = ctx->Input("row_pointers");
+    xla::XlaOp sorted_sample_ids = ctx->Input("sorted_sample_ids");
+    xla::XlaOp sorted_token_ids = ctx->Input("sorted_token_ids");
+    xla::XlaOp sorted_gains = ctx->Input("sorted_gains");
+    xla::XlaOp embedding_table = ctx->Input("embedding_table");
+
+    OP_REQUIRES_VALUE(xla::Shape embedding_table_shape, ctx,
+                      ctx->InputXlaShape("embedding_table"));
+    const int32_t feature_width = embedding_table_shape.dimensions(1);
+
+    OP_REQUIRES_OK(
+        ctx, GetMaxIdsAndUniques("", table_name_, per_sparse_core_batch_size,
+                                 feature_width, &max_ids_per_partition,
+                                 &max_unique_ids_per_partition));
+    VLOG(3) << "XlaSparseDenseMatmulWithCsrInputOp: "
+            << "table_name = " << table_name_
+            << ", max_ids = " << max_ids_per_partition
+            << ", max_uniques = " << max_unique_ids_per_partition;
+    OP_REQUIRES(ctx,
+                TensorShapeUtils::IsScalar(ctx->InputShape(
+                    "num_minibatches_per_physical_sparse_core")),
+                errors::InvalidArgument(
+                    "num_minibatches_per_physical_sparse_core must be scalar"));
+
+    xla::XlaOp num_minibatches_per_physical_sparse_core =
+        ctx->Input("num_minibatches_per_physical_sparse_core");
+
+    // Construct the shape and a const 0 input for the activations.
+    xla::XlaOp zero = xla::ConstantLiteral(
+        builder, xla::LiteralUtil::Zero(ctx->InputXlaType("embedding_table")));
+    OP_REQUIRES_VALUE(xla::Shape activation_shape, ctx,
+                      ctx->InputXlaShape("embedding_table"));
+    activation_shape.set_dimensions(0, input_size_);
+
+    xla::XlaOp activation_init =
+        xla::Broadcast(zero, activation_shape.dimensions());
+
+    xla::FrontendAttributes new_frontend_attributes;
+
+    new_frontend_attributes.mutable_map()->insert(
+        {"_xla_compute_type", "sparse"});
+
+    builder->SetFrontendAttributes(new_frontend_attributes);
+
+    new_frontend_attributes.mutable_map()->insert(
+        {"_xla_sharding_strategy", "mod"});
+
+    new_frontend_attributes.mutable_map()->insert(
+        {"_xla_pad_value", absl::StrCat(kXlaPadValue)});
+
+    new_frontend_attributes.mutable_map()->insert(
+        {"_xla_max_ids_per_partition", absl::StrCat(max_ids_per_partition)});
+
+    new_frontend_attributes.mutable_map()->insert(
+        {"_xla_max_unique_ids_per_partition",
+         absl::StrCat(max_unique_ids_per_partition)});
+
+    if (quantization_config_low_.has_value()) {
+      new_frontend_attributes.mutable_map()->insert(
+          {"_xla_quantization_high_value",
+           absl::StrCat(quantization_config_high_.value())});
+      new_frontend_attributes.mutable_map()->insert(
+          {"_xla_quantization_low_value",
+           absl::StrCat(quantization_config_low_.value())});
+      new_frontend_attributes.mutable_map()->insert(
+          {"_xla_quantization_num_buckets_value",
+           absl::StrCat(quantization_config_num_buckets_.value())});
+    }
+    builder->SetFrontendAttributes(new_frontend_attributes);
+
+    xla::XlaOp result =
+        xla::CustomCall(builder, "SparseDenseMatmulWithMinibatchingOp",
+                        {row_pointers, sorted_token_ids, sorted_sample_ids,
+                         sorted_gains, num_minibatches_per_physical_sparse_core,
+                         embedding_table, activation_init},
+                        activation_shape);
+
+    // Embedding activation.
+    ctx->SetOutput(0, result);
+  }
+
+ private:
+  int input_size_;  // "input_size" attr; dimension 0 of the activations.
+  int64_t num_sparsecores_per_chip_;  // From GetSparseCoresPerChip().
+  std::optional<float> quantization_config_low_;
+  std::optional<float> quantization_config_high_;
+  std::optional<int> quantization_config_num_buckets_;
+  std::string table_name_;  // "table_name" attr.
+
+  TF_DISALLOW_COPY_AND_ASSIGN(XlaSparseDenseMatmulWithCsrInputOp);
+};
+
+REGISTER_XLA_OP(Name("XlaSparseDenseMatmulWithCsrInput"),
+                XlaSparseDenseMatmulWithCsrInputOp);
+
+
+// Base class for the CSR-input gradient kernels; subclasses pick the optimizer.
+class XlaSparseDenseMatmulGradWithCsrInputBase : public XlaOpKernel {
+ public:
+  explicit XlaSparseDenseMatmulGradWithCsrInputBase(OpKernelConstruction* ctx)
+      : XlaOpKernel(ctx) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("table_name", &table_name_));
+  }
+
+  ~XlaSparseDenseMatmulGradWithCsrInputBase() override = default;
+  // Hook: builds the optimizer computation passed to the custom call.
+  virtual xla::XlaComputation build_optimizer_computation(
+      int32_t feature_width) = 0;
+  // Hook: returns the tuple of table operands for the custom call.
+  virtual xla::XlaOp get_tables_input(XlaOpKernelContext* ctx) = 0;
+  // Hook: returns the tuple of hyperparameter operands for the custom call.
+  virtual xla::XlaOp get_hyperparameters_input(XlaOpKernelContext* ctx) = 0;
+  // Hook: returns the custom call's result shape given the table's shape.
+  virtual xla::Shape get_tables_shape(xla::Shape embedding_table_shape) = 0;
+
+  void Compile(XlaOpKernelContext* ctx) override {
+    xla::XlaBuilder* builder = ctx->builder();
+
+    // TODO(pineapplejuice233): Add error checking logic.
+    xla::XlaOp row_pointers = ctx->Input("row_pointers");
+    xla::XlaOp sorted_sample_ids = ctx->Input("sorted_sample_ids");
+    xla::XlaOp sorted_token_ids = ctx->Input("sorted_token_ids");
+    xla::XlaOp sorted_gains = ctx->Input("sorted_gains");
+    xla::XlaOp activation_gradients = ctx->Input("activation_gradients");
+    xla::XlaOp num_minibatches_per_physical_sparse_core =
+        ctx->Input("num_minibatches_per_physical_sparse_core");
+
+    // Get the shape of the gradient.
+    OP_REQUIRES_VALUE(xla::Shape activation_shape, ctx,
+                      ctx->InputXlaShape("activation_gradients"));
+    OP_REQUIRES(
+        ctx,
+        activation_shape.is_static() && activation_shape.dimensions_size() == 2,
+        errors::InvalidArgument(
+            "activations input has non static or non-rank 2 shape: ",
+            activation_shape.ToString()));
+    OP_REQUIRES_VALUE(int64_t num_sparsecores_per_chip, ctx,
+                      GetSparseCoresPerChip());
+    int64_t num_samples_per_chip = activation_shape.dimensions(0);
+    OP_REQUIRES(ctx, num_samples_per_chip % num_sparsecores_per_chip == 0,
+                errors::InvalidArgument(
+                    "num_samples_per_chip ", num_samples_per_chip,
+                    " not divisible by the number of sparsecores per chip ",
+                    num_sparsecores_per_chip));
+    int64_t per_sparse_core_batch_size =
+        num_samples_per_chip / num_sparsecores_per_chip;
+    int64_t max_ids_per_partition = 0;
+    int64_t max_unique_ids_per_partition = 0;
+    OP_REQUIRES_VALUE(xla::Shape embedding_table_shape, ctx,
+                      ctx->InputXlaShape("embedding_table"));
+    // feature_width is read from dimension 1, so the table must be rank 2.
+    OP_REQUIRES(ctx, embedding_table_shape.dimensions_size() == 2,
+                errors::InvalidArgument(
+                    "embedding_table input has non-rank 2 shape: ",
+                    embedding_table_shape.ToString()));
+
+    const int32_t feature_width = embedding_table_shape.dimensions(1);
+    OP_REQUIRES_OK(
+        ctx, GetMaxIdsAndUniques("", table_name_, per_sparse_core_batch_size,
+                                 feature_width, &max_ids_per_partition,
+                                 &max_unique_ids_per_partition));
+    VLOG(3) << "XlaSparseDenseMatmulGradWithCsrInputBase: "
+            << "table_name = " << table_name_
+            << ", max_ids = " << max_ids_per_partition
+            << ", max_uniques = " << max_unique_ids_per_partition;
+
+    xla::XlaComputation optimizer = build_optimizer_computation(feature_width);
+
+    xla::FrontendAttributes original_frontend_attributes =
+        builder->frontend_attributes();
+
+    // Attributes in effect while operand tuples and result elements are built.
+    xla::FrontendAttributes tuple_frontend_attributes;
+    tuple_frontend_attributes.mutable_map()->insert(
+        {"_xla_compute_type", "sparse"});
+
+    builder->SetFrontendAttributes(tuple_frontend_attributes);
+
+    xla::XlaOp tables = get_tables_input(ctx);
+
+    xla::XlaOp hyperparameters = get_hyperparameters_input(ctx);
+
+    xla::Shape tables_shape = get_tables_shape(embedding_table_shape);
+
+    // Attributes in effect while the custom call itself is built.
+    xla::FrontendAttributes custom_call_frontend_attributes;
+    custom_call_frontend_attributes.mutable_map()->insert(
+        {"_xla_compute_type", "sparse"});
+    custom_call_frontend_attributes.mutable_map()->insert(
+        {"_xla_sharding_strategy", "mod"});
+    custom_call_frontend_attributes.mutable_map()->insert(
+        {"_xla_pad_value", absl::StrCat(kXlaPadValue)});
+    custom_call_frontend_attributes.mutable_map()->insert(
+        {"_xla_max_ids_per_partition", absl::StrCat(max_ids_per_partition)});
+    custom_call_frontend_attributes.mutable_map()->insert(
+        {"_xla_max_unique_ids_per_partition",
+         absl::StrCat(max_unique_ids_per_partition)});
+
+    builder->SetFrontendAttributes(custom_call_frontend_attributes);
+
+    xla::XlaOp updated_tables = xla::CustomCallWithComputation(
+        builder, "SparseDenseMatmulGradOptimizerUpdateWithMinibatchingOp",
+        {row_pointers, sorted_token_ids, sorted_sample_ids, sorted_gains,
+         num_minibatches_per_physical_sparse_core, tables, activation_gradients,
+         hyperparameters},
+        optimizer, tables_shape);
+
+    builder->SetFrontendAttributes(tuple_frontend_attributes);
+
+    // Output i is element i of the updated-tables tuple.
+    for (int i = 0; i < tables_shape.tuple_shapes_size(); ++i) {
+      ctx->SetOutput(i, xla::GetTupleElement(updated_tables, i));
+    }
+
+    builder->SetFrontendAttributes(original_frontend_attributes);
+  }
+
+ private:
+  std::string table_name_;
+
+  TF_DISALLOW_COPY_AND_ASSIGN(XlaSparseDenseMatmulGradWithCsrInputBase);
+};
+
+// XLA kernel for the SGD embedding update on CSR-formatted input. Compile()
+// in the base class consumes the CSR inputs (`row_pointers`,
+// `sorted_token_ids`, `sorted_sample_ids`, `sorted_gains`) and
+// `activation_gradients`; this class contributes `embedding_table`,
+// `learning_rate` and the SGD rule. Output 0 is the updated embedding table.
+class XlaSparseDenseMatmulGradWithSgdAndCsrInputOp
+    : public XlaSparseDenseMatmulGradWithCsrInputBase {
+ public:
+  explicit XlaSparseDenseMatmulGradWithSgdAndCsrInputOp(
+      OpKernelConstruction* ctx)
+      : XlaSparseDenseMatmulGradWithCsrInputBase(ctx) {}
+
+  ~XlaSparseDenseMatmulGradWithSgdAndCsrInputOp() override = default;
+
+  // Builds the computation handed to the custom call. Parameters are, in
+  // order: `gradient` (f32[1, feature_width]), `tables` (1-tuple of that
+  // shape) and `hyperparameters` (1-tuple of that shape). The root is a
+  // 1-tuple holding the updated row.
+  xla::XlaComputation build_optimizer_computation(
+      const int32_t feature_width) override {
+    xla::XlaBuilder sub_builder("sgd_optimizer_builder");
+
+    const xla::Shape row_shape =
+        xla::ShapeUtil::MakeShapeWithType<float>({1, feature_width});
+    const xla::Shape row_tuple_shape =
+        xla::ShapeUtil::MakeTupleShape({row_shape});
+
+    xla::XlaOp grad = xla::Parameter(&sub_builder, 0, row_shape, "gradient");
+    xla::XlaOp table_tuple =
+        xla::Parameter(&sub_builder, 1, row_tuple_shape, "tables");
+    xla::XlaOp hyper_tuple =
+        xla::Parameter(&sub_builder, 2, row_tuple_shape, "hyperparameters");
+
+    // row(t) = row(t-1) - learning_rate * gradient(t)
+    xla::XlaOp new_row = xla::GetTupleElement(table_tuple, 0) -
+                         xla::GetTupleElement(hyper_tuple, 0) * grad;
+
+    xla::XlaOp result = xla::Tuple(&sub_builder, {new_row});
+    return sub_builder.Build(result).value();
+  }
+
+  // Wraps the `embedding_table` input in a 1-tuple.
+  xla::XlaOp get_tables_input(XlaOpKernelContext* ctx) override {
+    xla::XlaOp table = ctx->Input("embedding_table");
+    return xla::Tuple(ctx->builder(), {table});
+  }
+
+  // Wraps the `learning_rate` input in a 1-tuple.
+  xla::XlaOp get_hyperparameters_input(XlaOpKernelContext* ctx) override {
+    xla::XlaOp lr = ctx->Input("learning_rate");
+    return xla::Tuple(ctx->builder(), {lr});
+  }
+
+  // SGD keeps no slot variables, so the updated tables are the 1-tuple
+  // (embedding_table,).
+  xla::Shape get_tables_shape(xla::Shape embedding_table_shape) override {
+    return xla::ShapeUtil::MakeTupleShape({embedding_table_shape});
+  }
+
+ private:
+  TF_DISALLOW_COPY_AND_ASSIGN(XlaSparseDenseMatmulGradWithSgdAndCsrInputOp);
+};
+
+REGISTER_XLA_OP(Name("XlaSparseDenseMatmulGradWithSgdAndCsrInput"),
+                XlaSparseDenseMatmulGradWithSgdAndCsrInputOp);
+
+// XLA kernel for the Adagrad embedding update on CSR-formatted input. Compile()
+// in the base class consumes the CSR inputs and `activation_gradients`; this
+// class contributes `embedding_table`, `accumulator`, `learning_rate` and the
+// Adagrad rule. Output 0 is the updated embedding table and output 1 is the
+// updated accumulator.
+class XlaSparseDenseMatmulGradWithAdagradAndCsrInputOp
+    : public XlaSparseDenseMatmulGradWithCsrInputBase {
+ public:
+  explicit XlaSparseDenseMatmulGradWithAdagradAndCsrInputOp(
+      OpKernelConstruction* ctx)
+      : XlaSparseDenseMatmulGradWithCsrInputBase(ctx) {}
+
+  ~XlaSparseDenseMatmulGradWithAdagradAndCsrInputOp() override = default;
+
+  // Builds the computation handed to the custom call. Parameters are, in
+  // order: `gradient` (f32[1, feature_width]), `tables` (a pair of rows of
+  // that shape: embedding row, accumulator row) and `hyperparameters` (1-tuple
+  // of that shape). The root is the pair (updated row, updated accumulator).
+  xla::XlaComputation build_optimizer_computation(
+      const int32_t feature_width) override {
+    xla::XlaBuilder sub_builder("adagrad_optimizer_builder");
+
+    const xla::Shape row_shape =
+        xla::ShapeUtil::MakeShapeWithType<float>({1, feature_width});
+    const xla::Shape tables_shape =
+        xla::ShapeUtil::MakeTupleShape({row_shape, row_shape});
+    const xla::Shape hyper_shape =
+        xla::ShapeUtil::MakeTupleShape({row_shape});
+
+    xla::XlaOp grad = xla::Parameter(&sub_builder, 0, row_shape, "gradient");
+    xla::XlaOp table_tuple =
+        xla::Parameter(&sub_builder, 1, tables_shape, "tables");
+    xla::XlaOp hyper_tuple =
+        xla::Parameter(&sub_builder, 2, hyper_shape, "hyperparameters");
+
+    xla::XlaOp row = xla::GetTupleElement(table_tuple, 0);
+    xla::XlaOp accum = xla::GetTupleElement(table_tuple, 1);
+    xla::XlaOp lr = xla::GetTupleElement(hyper_tuple, 0);
+
+    // accumulator(t) = accumulator(t-1) + gradient(t)^2
+    xla::XlaOp accum_next = accum + grad * grad;
+
+    // row(t) = row(t-1) - learning_rate * gradient(t) / sqrt(accumulator(t))
+    // Note: no epsilon is added to the denominator.
+    xla::XlaOp row_next = row - lr * grad / xla::Sqrt(accum_next);
+
+    xla::XlaOp result = xla::Tuple(&sub_builder, {row_next, accum_next});
+    return sub_builder.Build(result).value();
+  }
+
+  // Packs the `embedding_table` and `accumulator` inputs, in that order.
+  xla::XlaOp get_tables_input(XlaOpKernelContext* ctx) override {
+    xla::XlaOp table = ctx->Input("embedding_table");
+    xla::XlaOp accum = ctx->Input("accumulator");
+    return xla::Tuple(ctx->builder(), {table, accum});
+  }
+
+  // Wraps the `learning_rate` input in a 1-tuple.
+  xla::XlaOp get_hyperparameters_input(XlaOpKernelContext* ctx) override {
+    xla::XlaOp lr = ctx->Input("learning_rate");
+    return xla::Tuple(ctx->builder(), {lr});
+  }
+
+  // Both entries of the updated-tables tuple use the embedding table's shape:
+  // (embedding_table, accumulator).
+  xla::Shape get_tables_shape(xla::Shape embedding_table_shape) override {
+    return xla::ShapeUtil::MakeTupleShape(
+        {embedding_table_shape, embedding_table_shape});
+  }
+
+ private:
+  TF_DISALLOW_COPY_AND_ASSIGN(XlaSparseDenseMatmulGradWithAdagradAndCsrInputOp);
+};
+
+REGISTER_XLA_OP(Name("XlaSparseDenseMatmulGradWithAdagradAndCsrInput"),
+                XlaSparseDenseMatmulGradWithAdagradAndCsrInputOp);
+
+// This TensorFlow op calculates the gradients and performs Adagrad with
+// momentum update on the embedding table on SparseCore. It takes the activation
+// gradients, input sparse tensor represented by the `row_pointers`,
+// `sorted_token_ids`, `sorted_sample_ids` and 'learning_rate'. It produces the
+// updated embedding table, accumulator and momenta. It supports minibatching.
+class XlaSparseDenseMatmulGradWithAdagradMomentumAndCsrInputOp
+    : public XlaSparseDenseMatmulGradWithCsrInputBase {
+ public:
+  explicit XlaSparseDenseMatmulGradWithAdagradMomentumAndCsrInputOp(
+      OpKernelConstruction* ctx)
+      : XlaSparseDenseMatmulGradWithCsrInputBase(ctx) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("use_nesterov", &use_nesterov_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("exponent", &exponent_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("beta1", &beta1_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("beta2", &beta2_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("epsilon", &epsilon_));
+  }
+
+  ~XlaSparseDenseMatmulGradWithAdagradMomentumAndCsrInputOp() override =
+      default;
+
+  xla::XlaComputation build_optimizer_computation(
+      const int32_t feature_width) override {
+    xla::XlaComputation adagrad_momentum_optimizer = [&] {
+      auto adagrad_momentum_optimizer_builder =
+          std::make_unique<xla::XlaBuilder>(
+              "adagrad_momentum_optimizer_builder");
+
+      xla::Shape per_row_shape =
+          xla::ShapeUtil::MakeShapeWithType<float>({1, feature_width});
+
+      xla::XlaOp gradient =
+          xla::Parameter(adagrad_momentum_optimizer_builder.get(), 0,
+                         per_row_shape, "gradient");
+
+      xla::XlaOp tables =
+          xla::Parameter(adagrad_momentum_optimizer_builder.get(), 1,
+                         xla::ShapeUtil::MakeTupleShape(
+                             {per_row_shape, per_row_shape, per_row_shape}),
+                         "tables");
+
+      xla::XlaOp hyperparameters = xla::Parameter(
+          adagrad_momentum_optimizer_builder.get(), 2,
+          xla::ShapeUtil::MakeTupleShape({per_row_shape}), "hyperparameters");
+
+      xla::XlaOp embedding_table = xla::GetTupleElement(tables, 0);
+      xla::XlaOp accumulator = xla::GetTupleElement(tables, 1);
+      xla::XlaOp momenta = xla::GetTupleElement(tables, 2);
+
+      xla::XlaOp learning_rate = xla::GetTupleElement(hyperparameters, 0);
+
+      xla::XlaOp beta1 =
+          xla::ConstantR0(adagrad_momentum_optimizer_builder.get(), beta1_);
+      xla::XlaOp beta2 =
+          xla::ConstantR0(adagrad_momentum_optimizer_builder.get(), beta2_);
+      xla::XlaOp epsilon =
+          xla::ConstantR0(adagrad_momentum_optimizer_builder.get(), epsilon_);
+
+      // `exponent` below holds 1 / exponent_ and is negated at its use site.
+      // Accumulator update, selected in-graph by comparing beta2 with 1:
+      //    beta_2 == 1: accumulator(t) = accumulator(t-1) + gradient(t) ^ 2
+      //    otherwise:   accumulator(t) = beta_2 * accumulator(t-1) +
+      //                                  (1-beta_2) * gradient(t) ^ 2
+      xla::XlaOp exponent = xla::ConstantR0(
+          adagrad_momentum_optimizer_builder.get(), 1.0f / exponent_);
+      xla::XlaOp one =
+          xla::ConstantR0(adagrad_momentum_optimizer_builder.get(), 1.0f);
+
+      xla::XlaOp new_accumulator = xla::Select(
+          xla::Eq(beta2, one), accumulator + gradient * gradient,
+          beta2 * accumulator + (one - beta2) * gradient * gradient);
+
+      // scaled_gradient = (accumulator(t) + epsilon)^(-1/exponent_) * gradient
+      xla::XlaOp scaled_gradients =
+          PowHack(new_accumulator + epsilon, xla::Neg(exponent)) * gradient;
+
+      // momenta(t) = beta1 * momenta(t-1) + scaled_gradient(t)
+      xla::XlaOp new_momenta = beta1 * momenta + scaled_gradients;
+
+      // Table update:
+      // non-nesterov: update = momenta_t
+      // nesterov:     update = beta_1 * momenta_t + scaled_gradient
+      // weights(t) = weights(t-1) - lr * update
+      xla::XlaOp updated_embedding_table;
+      if (use_nesterov_) {
+        updated_embedding_table =
+            embedding_table -
+            learning_rate * (beta1 * new_momenta + scaled_gradients);
+      } else {
+        updated_embedding_table = embedding_table - learning_rate * new_momenta;
+      }
+
+      xla::XlaOp updated_tables =
+          xla::Tuple(adagrad_momentum_optimizer_builder.get(),
+                     {updated_embedding_table, new_accumulator, new_momenta});
+      return adagrad_momentum_optimizer_builder->Build(updated_tables).value();
+    }();
+
+    return adagrad_momentum_optimizer;
+  }
+
+  xla::XlaOp get_tables_input(XlaOpKernelContext* ctx) override {
+    return xla::Tuple(ctx->builder(),
+                      {ctx->Input("embedding_table"), ctx->Input("accumulator"),
+                       ctx->Input("momenta")});
+  }
+
+  xla::XlaOp get_hyperparameters_input(XlaOpKernelContext* ctx) override {
+    return xla::Tuple(ctx->builder(), {ctx->Input("learning_rate")});
+  }
+
+  xla::Shape get_tables_shape(xla::Shape embedding_table_shape) override {
+    return xla::ShapeUtil::MakeTupleShape(
+        {embedding_table_shape, embedding_table_shape, embedding_table_shape});
+  }
+
+ private:
+  bool use_nesterov_;
+  float exponent_;
+  float beta1_;
+  float beta2_;
+  float epsilon_;
+
+  TF_DISALLOW_COPY_AND_ASSIGN(
+      XlaSparseDenseMatmulGradWithAdagradMomentumAndCsrInputOp);
+};
+
+REGISTER_XLA_OP(Name("XlaSparseDenseMatmulGradWithAdagradMomentumAndCsrInput"),
+                XlaSparseDenseMatmulGradWithAdagradMomentumAndCsrInputOp);
+
+// This TensorFlow op calculates the gradients and performs Adam
+// update on the embedding table on SparseCore. It takes the activation
+// gradients, input sparse tensor represented by the `row_pointers`,
+// `sorted_token_ids`, `sorted_sample_ids` and 'learning_rate'. It produces the
+// updated embedding table, momenta and velocity. It supports minibatching.
+class XlaSparseDenseMatmulGradWithAdamAndCsrInputOp
+    : public XlaSparseDenseMatmulGradWithCsrInputBase {
+ public:
+  explicit XlaSparseDenseMatmulGradWithAdamAndCsrInputOp(
+      OpKernelConstruction* ctx)
+      : XlaSparseDenseMatmulGradWithCsrInputBase(ctx) {
+    OP_REQUIRES_OK(ctx,
+                   ctx->GetAttr("use_sum_inside_sqrt", &use_sum_inside_sqrt_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("beta1", &beta1_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("beta2", &beta2_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("epsilon", &epsilon_));
+  }
+
+  ~XlaSparseDenseMatmulGradWithAdamAndCsrInputOp() override = default;
+
+  xla::XlaComputation build_optimizer_computation(
+      const int32_t feature_width) override {
+    xla::XlaComputation adam_optimizer = [&] {
+      auto adam_optimizer_builder =
+          std::make_unique<xla::XlaBuilder>("adam_optimizer_builder");
+
+      xla::Shape per_row_shape =
+          xla::ShapeUtil::MakeShapeWithType<float>({1, feature_width});
+
+      xla::XlaOp gradient = xla::Parameter(adam_optimizer_builder.get(), 0,
+                                           per_row_shape, "gradient");
+
+      xla::XlaOp tables =
+          xla::Parameter(adam_optimizer_builder.get(), 1,
+                         xla::ShapeUtil::MakeTupleShape(
+                             {per_row_shape, per_row_shape, per_row_shape}),
+                         "tables");
+
+      xla::XlaOp hyperparameters = xla::Parameter(
+          adam_optimizer_builder.get(), 2,
+          xla::ShapeUtil::MakeTupleShape({per_row_shape}), "hyperparameters");
+
+      xla::XlaOp embedding_table = xla::GetTupleElement(tables, 0);
+      xla::XlaOp momenta = xla::GetTupleElement(tables, 1);
+      xla::XlaOp velocity = xla::GetTupleElement(tables, 2);
+
+      xla::XlaOp learning_rate = xla::GetTupleElement(hyperparameters, 0);
+
+      xla::XlaOp beta1 = xla::ConstantR0(adam_optimizer_builder.get(), beta1_);
+      xla::XlaOp beta2 = xla::ConstantR0(adam_optimizer_builder.get(), beta2_);
+      xla::XlaOp epsilon =
+          xla::ConstantR0(adam_optimizer_builder.get(), epsilon_);
+
+      // Depending on sum_inside_sqrt, the denominator is either:
+      //     sum_inside_sqrt==true: sqrt(v + eps^2)
+      //     sum_inside_sqrt==false: sqrt(v) + eps
+      // To keep a single update expression below, write the denominator as:
+      //     sqrt(v + e1) + e2
+      // and set e1 and e2 appropriately:
+      xla::XlaOp zero = xla::ConstantR0(adam_optimizer_builder.get(), 0.0f);
+      xla::XlaOp one = xla::ConstantR0(adam_optimizer_builder.get(), 1.0f);
+      xla::XlaOp e1 = use_sum_inside_sqrt_ ? epsilon * epsilon : zero;
+      xla::XlaOp e2 = use_sum_inside_sqrt_ ? zero : epsilon;
+
+      // momentum(t) = beta_1 * momentum(t-1)
+      //                      + (1-beta_1)*gradient(t)
+      xla::XlaOp new_momenta = beta1 * momenta + (one - beta1) * gradient;
+
+      // velocity(t) = beta_2 * velocity(t-1)
+      //                      + (1-beta_2)*gradient(t)*gradient(t)
+      xla::XlaOp new_velocity =
+          beta2 * velocity + (one - beta2) * gradient * gradient;
+
+      xla::XlaOp updated_embedding_table =
+          embedding_table -
+          learning_rate * new_momenta / (xla::Sqrt(new_velocity + e1) + e2);
+
+      xla::XlaOp updated_tables =
+          xla::Tuple(adam_optimizer_builder.get(),
+                     {updated_embedding_table, new_momenta, new_velocity});
+      return adam_optimizer_builder->Build(updated_tables).value();
+    }();
+
+    return adam_optimizer;
+  }
+
+  xla::XlaOp get_tables_input(XlaOpKernelContext* ctx) override {
+    return xla::Tuple(ctx->builder(),
+                      {ctx->Input("embedding_table"), ctx->Input("momenta"),
+                       ctx->Input("velocity")});
+  }
+
+  xla::XlaOp get_hyperparameters_input(XlaOpKernelContext* ctx) override {
+    return xla::Tuple(ctx->builder(), {ctx->Input("learning_rate")});
+  }
+
+  xla::Shape get_tables_shape(xla::Shape embedding_table_shape) override {
+    return xla::ShapeUtil::MakeTupleShape(
+        {embedding_table_shape, embedding_table_shape, embedding_table_shape});
+  }
+
+ private:
+  bool use_sum_inside_sqrt_;
+  float beta1_;
+  float beta2_;
+  float epsilon_;
+
+  TF_DISALLOW_COPY_AND_ASSIGN(XlaSparseDenseMatmulGradWithAdamAndCsrInputOp);
+};
+
+REGISTER_XLA_OP(Name("XlaSparseDenseMatmulGradWithAdamAndCsrInput"),
+                XlaSparseDenseMatmulGradWithAdamAndCsrInputOp);
+
+// This TensorFlow op calculates the gradients and performs FTRL
+// update on the embedding table on SparseCore. It takes the activation
+// gradients, input sparse tensor represented by the `row_pointers`,
+// `sorted_token_ids`, `sorted_sample_ids` and 'learning_rate'. It produces the
+// updated embedding table, accumulator and linear. It supports minibatching.
+class XlaSparseDenseMatmulGradWithFtrlAndCsrInputOp
+    : public XlaSparseDenseMatmulGradWithCsrInputBase {
+ public:
+  explicit XlaSparseDenseMatmulGradWithFtrlAndCsrInputOp(
+      OpKernelConstruction* ctx)
+      : XlaSparseDenseMatmulGradWithCsrInputBase(ctx) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("multiply_linear_by_learning_rate",
+                                     &multiply_linear_by_learning_rate_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("beta", &beta_));
+    OP_REQUIRES_OK(ctx,
+                   ctx->GetAttr("learning_rate_power", &learning_rate_power_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("l1_regularization_strength",
+                                     &l1_regularization_strength_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("l2_regularization_strength",
+                                     &l2_regularization_strength_));
+  }
+
+  ~XlaSparseDenseMatmulGradWithFtrlAndCsrInputOp() override = default;
+
+  xla::XlaComputation build_optimizer_computation(
+      const int32_t feature_width) override {
+    xla::XlaComputation ftrl_optimizer = [&] {
+      auto ftrl_optimizer_builder =
+          std::make_unique<xla::XlaBuilder>("ftrl_optimizer_builder");
+
+      xla::Shape per_row_shape =
+          xla::ShapeUtil::MakeShapeWithType<float>({1, feature_width});
+
+      xla::XlaOp gradient = xla::Parameter(ftrl_optimizer_builder.get(), 0,
+                                           per_row_shape, "gradient");
+
+      xla::XlaOp tables =
+          xla::Parameter(ftrl_optimizer_builder.get(), 1,
+                         xla::ShapeUtil::MakeTupleShape(
+                             {per_row_shape, per_row_shape, per_row_shape}),
+                         "tables");
+
+      xla::XlaOp hyperparameters = xla::Parameter(
+          ftrl_optimizer_builder.get(), 2,
+          xla::ShapeUtil::MakeTupleShape({per_row_shape}), "hyperparameters");
+
+      xla::XlaOp embedding_table = xla::GetTupleElement(tables, 0);
+      xla::XlaOp accumulator = xla::GetTupleElement(tables, 1);
+      xla::XlaOp linear = xla::GetTupleElement(tables, 2);
+
+      xla::XlaOp learning_rate = xla::GetTupleElement(hyperparameters, 0);
+
+      // accumulator(t) = accumulator(t-1) + gradient(t) ^ 2
+      xla::XlaOp new_accumulator = accumulator + gradient * gradient;
+
+      xla::XlaOp learning_rate_power =
+          xla::ConstantR0(ftrl_optimizer_builder.get(), learning_rate_power_);
+      // Assuming PowHack(x, y) computes x^y: power = accum^(-lr_power).
+      xla::XlaOp power_old =
+          PowHack(accumulator, xla::Neg(learning_rate_power));
+      xla::XlaOp power_new =
+          PowHack(new_accumulator, xla::Neg(learning_rate_power));
+      xla::XlaOp delta_p = power_new - power_old;
+
+      xla::XlaOp zero = xla::ConstantR0(ftrl_optimizer_builder.get(), 0.0f);
+
+      xla::XlaOp two = xla::ConstantR0(ftrl_optimizer_builder.get(), 2.0f);
+
+      xla::XlaOp l1_regularization_strength = xla::ConstantR0(
+          ftrl_optimizer_builder.get(), l1_regularization_strength_);
+
+      xla::XlaOp l2_regularization_strength = xla::ConstantR0(
+          ftrl_optimizer_builder.get(), l2_regularization_strength_);
+
+      xla::XlaOp beta = xla::ConstantR0(ftrl_optimizer_builder.get(), beta_);
+
+      // Note:
+      //    min(|linear(t)|, lr*l1)*sgn(linear(t))
+      // can be written as
+      //    clamp( -lr*l1, linear(t), lr*l1)
+      // assuming lr>0 and l1>0.
+      xla::XlaOp new_linear;
+      xla::XlaOp numer;
+      xla::XlaOp denom;
+      if (multiply_linear_by_learning_rate_) {
+        new_linear =
+            linear + learning_rate * gradient - delta_p * embedding_table;
+        // if multiply_linear:
+        //   linear(t) = linear(t-1) + lr*g - delta_p * table(t-1)
+        //   Update numerator:
+        //      N = min(|linear(t)|, lr*l1)*sgn(linear(t)) - linear(t)
+        //   Update denominator:
+        //      D = power(t) + 2*lr*l2 + beta
+        //   table(t) = N / D
+        numer = xla::Select(
+            xla::Eq(l1_regularization_strength, zero), xla::Neg(new_linear),
+            xla::Clamp(xla::Neg(learning_rate * l1_regularization_strength),
+                       new_linear, learning_rate * l1_regularization_strength) -
+                new_linear);
+        denom =
+            power_new + two * learning_rate * l2_regularization_strength + beta;
+      } else {
+        new_linear =
+            linear + gradient - delta_p * embedding_table / learning_rate;
+        // if NOT multiply_linear:
+        //   linear(t) = linear(t-1) + g - (1/lr) * delta_p * table(t-1)
+        //   Update numerator:
+        //     N = min(|linear(t)|, l1)*sgn(linear(t)) - linear(t)
+        //   Update denominator:
+        //     D = (1/lr) * (power(t) + beta) + 2*l2
+        //   table(t) = N / D
+        numer = xla::Select(xla::Eq(l1_regularization_strength, zero),
+                            xla::Neg(new_linear),
+                            xla::Clamp(xla::Neg(l1_regularization_strength),
+                                       new_linear, l1_regularization_strength) -
+                                new_linear);
+        denom = (power_new + beta) / learning_rate +
+                two * l2_regularization_strength;
+      }
+      xla::XlaOp updated_embedding_table = numer / denom;
+
+      xla::XlaOp updated_tables =
+          xla::Tuple(ftrl_optimizer_builder.get(),
+                     {updated_embedding_table, new_accumulator, new_linear});
+      return ftrl_optimizer_builder->Build(updated_tables).value();
+    }();
+
+    return ftrl_optimizer;
+  }
+
+  xla::XlaOp get_tables_input(XlaOpKernelContext* ctx) override {
+    return xla::Tuple(ctx->builder(),
+                      {ctx->Input("embedding_table"), ctx->Input("accumulator"),
+                       ctx->Input("linear")});
+  }
+
+  xla::XlaOp get_hyperparameters_input(XlaOpKernelContext* ctx) override {
+    return xla::Tuple(ctx->builder(), {ctx->Input("learning_rate")});
+  }
+
+  xla::Shape get_tables_shape(xla::Shape embedding_table_shape) override {
+    return xla::ShapeUtil::MakeTupleShape(
+        {embedding_table_shape, embedding_table_shape, embedding_table_shape});
+  }
+
+ private:
+  bool multiply_linear_by_learning_rate_;
+  float beta_;
+  float learning_rate_power_;
+  float l1_regularization_strength_;
+  float l2_regularization_strength_;
+
+  TF_DISALLOW_COPY_AND_ASSIGN(XlaSparseDenseMatmulGradWithFtrlAndCsrInputOp);
+};
+
+REGISTER_XLA_OP(Name("XlaSparseDenseMatmulGradWithFtrlAndCsrInput"),
+                XlaSparseDenseMatmulGradWithFtrlAndCsrInputOp);
+
+// Common base for the XlaSparseCore* optimizer kernels: reads the
+// `feature_width` attribute and provides row gather / row overwrite helpers.
+class XlaSparseCoreOptimizerOpBase : public XlaOpKernel {
+ public:
+  explicit XlaSparseCoreOptimizerOpBase(OpKernelConstruction* ctx)
+      : XlaOpKernel(ctx) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("feature_width", &feature_width_));
+  }
+
+  ~XlaSparseCoreOptimizerOpBase() override = default;
+
+  // Gathers the rows of `table` addressed by `indices`. Every slice spans
+  // {1, feature_width_} of the operand and operand dimension 0 is collapsed.
+  xla::XlaOp GatherVectors(xla::XlaOp table, xla::XlaOp indices) {
+    xla::GatherDimensionNumbers dnums;
+    dnums.add_offset_dims(1);
+    dnums.add_collapsed_slice_dims(0);
+    dnums.add_start_index_map(0);
+    dnums.set_index_vector_dim(1);
+    const std::vector<int64_t> row_slice = {1, feature_width_};
+    return xla::Gather(table, indices, dnums, row_slice);
+  }
+
+  // Writes `updates` over the rows of `table` addressed by `indices` and
+  // returns the resulting table; existing row values are discarded.
+  xla::XlaOp ScatterReplace(xla::XlaOp table, xla::XlaOp indices,
+                            xla::XlaOp updates) {
+    xla::ScatterDimensionNumbers dnums;
+    dnums.add_update_window_dims(1);
+    dnums.add_inserted_window_dims(0);
+    dnums.add_scatter_dims_to_operand_dims(0);
+    dnums.set_index_vector_dim(1);
+
+    // Element combiner: parameter 0 ("orig") is the value already in the
+    // table and is ignored; parameter 1 ("update") becomes the new value.
+    // "orig" still has to be declared so that parameter 1 can exist.
+    xla::XlaBuilder combiner_builder("scatter_builder");
+    const xla::Shape f32_scalar = xla::ShapeUtil::MakeShape(xla::F32, {});
+    xla::Parameter(&combiner_builder, 0, f32_scalar, "orig");
+    xla::XlaOp incoming =
+        xla::Parameter(&combiner_builder, 1, f32_scalar, "update");
+    xla::XlaComputation replace_fn = combiner_builder.Build(incoming).value();
+
+    // Open question from the original code: are the indices sorted or unique?
+    // No sortedness/uniqueness arguments are passed to xla::Scatter here.
+    return xla::Scatter(table, indices, updates, replace_fn, dnums);
+  }
+
+ protected:
+  int feature_width_;
+
+  TF_DISALLOW_COPY_AND_ASSIGN(XlaSparseCoreOptimizerOpBase);
+};
+
+// SGD kernel: updates the embedding-table rows addressed by `indices` with
+//   row <- row - learning_rate * gradient.
+class XlaSparseCoreSgdOp : public XlaSparseCoreOptimizerOpBase {
+ public:
+  explicit XlaSparseCoreSgdOp(OpKernelConstruction* ctx)
+      : XlaSparseCoreOptimizerOpBase(ctx) {}
+
+  ~XlaSparseCoreSgdOp() override = default;
+
+  void Compile(XlaOpKernelContext* ctx) override {
+    UseSparseCoreFrontendAttributes fe_attributes(ctx->builder());
+
+    // TODO(patn): Add error checking
+    xla::XlaOp grad = ctx->Input("gradient");
+    xla::XlaOp row_ids = ctx->Input("indices");
+    xla::XlaOp lr = ctx->Input("learning_rate");
+    xla::XlaOp full_table = ctx->Input("embedding_table");
+
+    // Gather the addressed rows, apply the SGD step to them, then write the
+    // new row values back over the old ones (a replace, not an accumulate).
+    xla::XlaOp old_rows = GatherVectors(full_table, row_ids);
+    xla::XlaOp new_rows = old_rows - lr * grad;
+
+    // Output 0: the embedding table with the addressed rows replaced.
+    ctx->SetOutput(0, ScatterReplace(full_table, row_ids, new_rows));
+  }
+
+  TF_DISALLOW_COPY_AND_ASSIGN(XlaSparseCoreSgdOp);
+};
+
+REGISTER_XLA_OP(Name("XlaSparseCoreSgd"), XlaSparseCoreSgdOp);
+
+// Adagrad kernel: updates the addressed embedding rows and accumulator rows.
+class XlaSparseCoreAdagradOp : public XlaSparseCoreOptimizerOpBase {
+ public:
+  explicit XlaSparseCoreAdagradOp(OpKernelConstruction* ctx)
+      : XlaSparseCoreOptimizerOpBase(ctx) {}
+
+  ~XlaSparseCoreAdagradOp() override = default;
+
+  void Compile(XlaOpKernelContext* ctx) override {
+    UseSparseCoreFrontendAttributes fe_attributes(ctx->builder());
+
+    // TODO(patn): Add error checking
+    xla::XlaOp grad = ctx->Input("gradient");
+    xla::XlaOp row_ids = ctx->Input("indices");
+    xla::XlaOp lr = ctx->Input("learning_rate");
+    xla::XlaOp full_table = ctx->Input("embedding_table");
+    xla::XlaOp full_accum = ctx->Input("accumulator");
+
+    // accumulator(t) = accumulator(t-1) + gradient(t)^2
+    xla::XlaOp old_accum_rows = GatherVectors(full_accum, row_ids);
+    xla::XlaOp new_accum_rows = old_accum_rows + grad * grad;
+
+    // row(t) = row(t-1) - learning_rate * gradient(t) / sqrt(accumulator(t))
+    xla::XlaOp old_rows = GatherVectors(full_table, row_ids);
+    xla::XlaOp new_rows = old_rows - lr * grad / xla::Sqrt(new_accum_rows);
+
+    // Write both sets of rows back; the accumulator scatter is emitted first.
+    xla::XlaOp accum_out = ScatterReplace(full_accum, row_ids, new_accum_rows);
+    xla::XlaOp table_out = ScatterReplace(full_table, row_ids, new_rows);
+
+    ctx->SetOutput(0, table_out);
+    ctx->SetOutput(1, accum_out);
+  }
+
+  TF_DISALLOW_COPY_AND_ASSIGN(XlaSparseCoreAdagradOp);
+};
+
+REGISTER_XLA_OP(Name("XlaSparseCoreAdagrad"), XlaSparseCoreAdagradOp);
+
+// This class uses the AdagradMomentum optimizer to update the embedding table
+// weights.
+class XlaSparseCoreAdagradMomentumOp : public XlaSparseCoreOptimizerOpBase {
+ public:
+  explicit XlaSparseCoreAdagradMomentumOp(OpKernelConstruction* ctx)
+      : XlaSparseCoreOptimizerOpBase(ctx) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("use_nesterov", &use_nesterov_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("beta_2", &beta_2_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("exponent", &exponent_));
+  }
+
+  ~XlaSparseCoreAdagradMomentumOp() override = default;
+
+  void Compile(XlaOpKernelContext* ctx) override {
+    xla::XlaBuilder* builder = ctx->builder();
+    UseSparseCoreFrontendAttributes fe_attributes(builder);
+
+    // TODO(patn): Add error checking
+    xla::XlaOp gradient = ctx->Input("gradient");
+    xla::XlaOp indices = ctx->Input("indices");
+    xla::XlaOp learning_rate = ctx->Input("learning_rate");
+    xla::XlaOp beta_1 = ctx->Input("beta_1");
+    xla::XlaOp epsilon = ctx->Input("epsilon");
+    xla::XlaOp table = ctx->Input("embedding_table");
+    xla::XlaOp accum = ctx->Input("accumulator");
+    xla::XlaOp momen = ctx->Input("momentum");
+
+    xla::XlaOp accum_old = GatherVectors(accum, indices);
+    xla::XlaOp momen_old = GatherVectors(momen, indices);
+
+    // If beta_2 == 1:
+    //    accumulator(t) = accumulator(t-1) + gradient(t) ^ 2
+    // Else:
+    //    accumulator(t) = beta_2 * accumulator(t-1) +
+    //                    (1-beta_2) * gradient(t) ^ 2
+    xla::XlaOp exponent = xla::ConstantR0(builder, 1.0f / exponent_);
+    xla::XlaOp accum_new;
+
+    if (beta_2_ == 1.0f) {
+      accum_new = accum_old + gradient * gradient;
+    } else {
+      xla::XlaOp beta_2 = xla::ConstantR0(builder, beta_2_);
+      xla::XlaOp one_m_beta_2 = xla::ConstantR0(builder, 1.0f - beta_2_);
+      accum_new = beta_2 * accum_old + one_m_beta_2 * gradient * gradient;
+    }
+    // scaled_gradient = (accumulator + epsilon)^(-1/k) * gradient
+    xla::XlaOp scaled_gradients =
+        PowHack(accum_new + epsilon, xla::Neg(exponent)) * gradient;
+
+    // momentum(t) = beta_1 * momentum(t-1) + scaled_gradient(t)
+    xla::XlaOp momen_new = beta_1 * momen_old + scaled_gradients;
+
+    // Table update:
+    // non-nesterov: update = momentum_t
+    // nesterov:     update = beta_1 * momentum_t + scaled_gradient
+    // weights(t) = weights(t-1) - lr * update
+    xla::XlaOp table_old = GatherVectors(table, indices);
+    xla::XlaOp updates;
+    if (use_nesterov_) {
+      updates =
+          table_old - learning_rate * (beta_1 * momen_new + scaled_gradients);
+    } else {
+      updates = table_old - learning_rate * momen_new;
+    }
+
+    xla::XlaOp accum_result = ScatterReplace(accum, indices, accum_new);
+
+    xla::XlaOp momen_result = ScatterReplace(momen, indices, momen_new);
+
+    xla::XlaOp table_result = ScatterReplace(table, indices, updates);
+
+    ctx->SetOutput(0, table_result);
+    ctx->SetOutput(1, accum_result);
+    ctx->SetOutput(2, momen_result);
+  }
+
+  TF_DISALLOW_COPY_AND_ASSIGN(XlaSparseCoreAdagradMomentumOp);
+
+ private:
+  bool use_nesterov_;
+  float beta_2_;
+  float exponent_;
+};
+
+REGISTER_XLA_OP(Name("XlaSparseCoreAdagradMomentum"),
+                XlaSparseCoreAdagradMomentumOp);
+
+// This class uses the Adam optimizer to update the embedding table weights.
+class XlaSparseCoreAdamOp : public XlaSparseCoreOptimizerOpBase {
+ public:
+  explicit XlaSparseCoreAdamOp(OpKernelConstruction* ctx)
+      : XlaSparseCoreOptimizerOpBase(ctx) {
+    OP_REQUIRES_OK(ctx,
+                   ctx->GetAttr("use_sum_inside_sqrt", &use_sum_inside_sqrt_));
+  }
+
+  ~XlaSparseCoreAdamOp() override = default;
+
+  void Compile(XlaOpKernelContext* ctx) override {
+    xla::XlaBuilder* builder = ctx->builder();
+    UseSparseCoreFrontendAttributes fe_attributes(builder);
+
+    // TODO(patn): Add error checking
+    xla::XlaOp gradient = ctx->Input("gradient");
+    xla::XlaOp indices = ctx->Input("indices");
+    xla::XlaOp learning_rate = ctx->Input("learning_rate");
+    xla::XlaOp beta_1 = ctx->Input("beta_1");
+    xla::XlaOp beta_2 = ctx->Input("beta_2");
+    xla::XlaOp epsilon = ctx->Input("epsilon");
+    xla::XlaOp table = ctx->Input("embedding_table");
+    xla::XlaOp momen = ctx->Input("momentum");
+    xla::XlaOp veloc = ctx->Input("velocity");
+    // xla::XlaOp use_non_lazy_adam = ctx->Input("use_non_lazy_adam");
+
+    xla::XlaOp momen_old = GatherVectors(momen, indices);
+    xla::XlaOp veloc_old = GatherVectors(veloc, indices);
+
+    // Depending on sum_inside_sqrt, the denominator is either:
+    //     sum_inside_sqrt==true: sqrt(v + eps^2)
+    //     sum_inside_sqrt==false: sqrt(v) + eps
+    // To simplify the update below, write the sqrt denominator as:
+    //     sqrt(v + e1) + e2
+    // and set e1 and e2 appropriately:
+    xla::XlaOp zero = xla::ConstantR0(builder, 0.0f);
+    xla::XlaOp one = xla::ConstantR0(builder, 1.0f);
+    xla::XlaOp e1 = use_sum_inside_sqrt_ ? epsilon * epsilon : zero;
+    xla::XlaOp e2 = use_sum_inside_sqrt_ ? zero : epsilon;
+
+    // momentum(t) = beta_1 * momentum(t-1)
+    //                      + (1-beta_1)*gradient(t)
+    xla::XlaOp momen_new = beta_1 * momen_old + (one - beta_1) * gradient;
+
+    // velocity(t) = beta_2 * velocity(t-1)
+    //                      + (1-beta_2)*gradient(t)*gradient(t)
+    xla::XlaOp veloc_new =
+        beta_2 * veloc_old + (one - beta_2) * gradient * gradient;
+
+    // table(t) = table(t-1) - lr * (m(t) / (sqrt(v(t) + e1) + e2))
+    xla::XlaOp table_old = GatherVectors(table, indices);
+    xla::XlaOp updates = table_old - learning_rate * momen_new /
+                                         (xla::Sqrt(veloc_new + e1) + e2);
+
+    xla::XlaOp momen_result = ScatterReplace(momen, indices, momen_new);
+
+    xla::XlaOp veloc_result = ScatterReplace(veloc, indices, veloc_new);
+
+    xla::XlaOp table_result = ScatterReplace(table, indices, updates);
+
+    ctx->SetOutput(0, table_result);
+    ctx->SetOutput(1, veloc_result);
+    ctx->SetOutput(2, momen_result);
+  }
+
+  TF_DISALLOW_COPY_AND_ASSIGN(XlaSparseCoreAdamOp);
+
+ private:
+  bool use_sum_inside_sqrt_;
+};
+
+REGISTER_XLA_OP(Name("XlaSparseCoreAdam"), XlaSparseCoreAdamOp);
+
+// This class uses the Ftrl optimizer to update the embedding table weights.
+class XlaSparseCoreFtrlOp : public XlaSparseCoreOptimizerOpBase {
+ public:
+  explicit XlaSparseCoreFtrlOp(OpKernelConstruction* ctx)
+      : XlaSparseCoreOptimizerOpBase(ctx) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("multiply_linear_by_learning_rate",
+                                     &multiply_linear_by_learning_rate_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("l1_regularization_strength",
+                                     &l1_regularization_strength_));
+  }
+
+  ~XlaSparseCoreFtrlOp() override = default;
+
+  void Compile(XlaOpKernelContext* ctx) override {
+    xla::XlaBuilder* builder = ctx->builder();
+    UseSparseCoreFrontendAttributes fe_attributes(builder);
+
+    xla::XlaOp gradient = ctx->Input("gradient");
+    xla::XlaOp indices = ctx->Input("indices");
+    xla::XlaOp learning_rate = ctx->Input("learning_rate");
+    xla::XlaOp learning_rate_power = ctx->Input("learning_rate_power");
+    xla::XlaOp l2_regularization_strength =
+        ctx->Input("l2_regularization_strength");
+    xla::XlaOp beta = ctx->Input("beta");
+    xla::XlaOp table = ctx->Input("embedding_table");
+    xla::XlaOp accum = ctx->Input("accumulator");
+    xla::XlaOp linear = ctx->Input("linear");
+
+    xla::XlaOp accum_old = GatherVectors(accum, indices);
+    xla::XlaOp linear_old = GatherVectors(linear, indices);
+
+    // accumulator(t) = accumulator(t-1) + gradient(t) ^ 2
+    xla::XlaOp accum_new = accum_old + gradient * gradient;
+
+    xla::XlaOp power_old = PowHack(accum_old, -learning_rate_power);
+    xla::XlaOp power_new = PowHack(accum_new, -learning_rate_power);
+    xla::XlaOp delta_p = power_new - power_old;
+
+    xla::XlaOp two = xla::ConstantR0(builder, 2.0f);
+    xla::XlaOp table_old = GatherVectors(table, indices);
+
+    // Note:
+    //    min(|linear(t)|, lr*l1)*sgn(linear(t))
+    // can be written as
+    //    clamp( -lr*l1, linear(t), lr*l1)
+    // assuming lr>0 and l1>0.
+    xla::XlaOp linear_new;
+    xla::XlaOp numer;
+    xla::XlaOp denom;
+    if (multiply_linear_by_learning_rate_) {
+      linear_new = linear_old + learning_rate * gradient - delta_p * table_old;
+      // if multiply_linear:
+      //   linear(t) = linear(t-1) + lr*g - delta_p * table(t-1)
+      //   Update numerator:
+      //      N = min(|linear(t)|, lr*l1)*sgn(linear(t)) - linear(t)
+      //   Update denominator:
+      //      D = power(t) + 2*lr*l2 + beta
+      //   table(t) = N / D
+      if (l1_regularization_strength_ == 0) {
+        numer = -linear_new;
+      } else {
+        xla::XlaOp l1_regularization_strength =
+            xla::ConstantR0(builder, l1_regularization_strength_);
+        numer =
+            xla::Clamp(-learning_rate * l1_regularization_strength, linear_new,
+                       learning_rate * l1_regularization_strength) -
+            linear_new;
+      }
+      denom =
+          power_new + two * learning_rate * l2_regularization_strength + beta;
+    } else {
+      linear_new = linear_old + gradient - delta_p * table_old / learning_rate;
+      // if NOT multiply_linear:
+      //   linear(t) = linear(t-1) + g - (1/lr) * delta_p * table(t-1)
+      //   Update numerator:
+      //     N = min(|linear(t)|, l1)*sgn(linear(t)) - linear(t)
+      //   Update denominator:
+      //     D = (1/lr) * (power(t) + beta) + 2*l2
+      //   table(t) = N / D
+      if (l1_regularization_strength_ == 0) {
+        numer = -linear_new;
+      } else {
+        xla::XlaOp l1_regularization_strength =
+            xla::ConstantR0(builder, l1_regularization_strength_);
+        numer = xla::Clamp(-l1_regularization_strength, linear_new,
+                           l1_regularization_strength) -
+                linear_new;
+      }
+      denom =
+          (power_new + beta) / learning_rate + two * l2_regularization_strength;
+    }
+    xla::XlaOp updates = numer / denom;
+
+    xla::XlaOp accum_result = ScatterReplace(accum, indices, accum_new);
+
+    xla::XlaOp linear_result = ScatterReplace(linear, indices, linear_new);
+
+    xla::XlaOp table_result = ScatterReplace(table, indices, updates);
+
+    ctx->SetOutput(0, table_result);
+    ctx->SetOutput(1, accum_result);
+    ctx->SetOutput(2, linear_result);
+  }
+
+  TF_DISALLOW_COPY_AND_ASSIGN(XlaSparseCoreFtrlOp);
+
+ private:
+  bool multiply_linear_by_learning_rate_;
+  float l1_regularization_strength_;
+};
+
+REGISTER_XLA_OP(Name("XlaSparseCoreFtrl"), XlaSparseCoreFtrlOp);
+
+}  // anonymous namespace
+}  // namespace tensorflow
diff --git a/tensorflow/core/tpu/kernels/sparse_core_xla_ops.h b/tensorflow/core/tpu/kernels/sparse_core_xla_ops.h
new file mode 100644
index 00000000000000..c5eb0b09fb9e18
--- /dev/null
+++ b/tensorflow/core/tpu/kernels/sparse_core_xla_ops.h
@@ -0,0 +1,45 @@
+/* Copyright 2023 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_CORE_TPU_KERNELS_SPARSE_CORE_XLA_OPS_H_
+#define TENSORFLOW_CORE_TPU_KERNELS_SPARSE_CORE_XLA_OPS_H_
+
+#include "xla/client/xla_builder.h"
+#include "xla/xla_data.pb.h"
+#include "tsl/platform/macros.h"
+
+// RAII helper to set the frontend attribute for the target chip to the SC.
+// Automatically restores the frontend attributes on exit.
+class UseSparseCoreFrontendAttributes {
+ public:
+  explicit UseSparseCoreFrontendAttributes(xla::XlaBuilder* builder)
+      : builder_(builder),
+        original_frontend_attributes_(builder->frontend_attributes()) {
+    xla::FrontendAttributes sc_attributes = original_frontend_attributes_;
+    (*sc_attributes.mutable_map())["_xla_compute_type"] = "sparse";
+    builder_->SetFrontendAttributes(sc_attributes);
+  }
+
+  ~UseSparseCoreFrontendAttributes() {
+    builder_->SetFrontendAttributes(original_frontend_attributes_);
+  }
+
+ private:
+  xla::XlaBuilder* builder_;
+  const xla::FrontendAttributes original_frontend_attributes_;
+
+  TF_DISALLOW_COPY_AND_ASSIGN(UseSparseCoreFrontendAttributes);
+};
+
+#endif  // TENSORFLOW_CORE_TPU_KERNELS_SPARSE_CORE_XLA_OPS_H_
diff --git a/third_party/xla/xla/stream_executor/tpu/tpu_library_init_fns.inc b/third_party/xla/xla/stream_executor/tpu/tpu_library_init_fns.inc
index 55630ea64dd784..c1a4ea41572609 100644
--- a/third_party/xla/xla/stream_executor/tpu/tpu_library_init_fns.inc
+++ b/third_party/xla/xla/stream_executor/tpu/tpu_library_init_fns.inc
@@ -111,6 +111,7 @@ tsl::Status SetTpuOpsStructFns(void* library_handle) {  // TENSORFLOW_STATUS_OK
                TpuEmbeddingEngine_SendTPUEmbeddingGradientsComputation);
   TFTPU_SET_FN(ops_api_fn, TpuEmbeddingEngine_DedupDataTupleMaskComputation);
 
+  TFTPU_SET_FN(ops_api_fn, SparseCore_GetMaxIdsAndUniques);
   return tsl::OkStatus();
 }
 
diff --git a/third_party/xla/xla/stream_executor/tpu/tpu_ops_c_api.h b/third_party/xla/xla/stream_executor/tpu/tpu_ops_c_api.h
index 5470b4b061ae90..65bfbabeb15f76 100644
--- a/third_party/xla/xla/stream_executor/tpu/tpu_ops_c_api.h
+++ b/third_party/xla/xla/stream_executor/tpu/tpu_ops_c_api.h
@@ -703,6 +703,22 @@ typedef struct TpuEmbeddingEngine_DedupDataTupleMaskComputation_Params {
 TFTPU_CAPI_EXPORT void TpuEmbeddingEngine_DedupDataTupleMaskComputation(
     TpuEmbeddingEngine_DedupDataTupleMaskComputation_Params* params);
 
+typedef struct SparseCore_GetMaxIdsAndUniques_Params {
+  size_t struct_size;
+  void* priv;
+  const char* program_key;
+  const char* table_name;
+  int64_t num_samples_per_sparse_core;
+  int64_t feature_width;
+  // out
+  TF_Status* status;
+  int64_t max_ids_per_partition;
+  int64_t max_unique_ids_per_partition;
+} SparseCore_GetMaxIdsAndUniques_Params;
+
+TFTPU_CAPI_EXPORT void SparseCore_GetMaxIdsAndUniques(
+    SparseCore_GetMaxIdsAndUniques_Params* params);
+
 struct TfTpu_OpsApiFn {
   TFTPU_ADD_FN_IN_STRUCT(TpuCompile_CompileAndBuild);
   TFTPU_ADD_FN_IN_STRUCT(TpuCompile_XrtCompileAndBuild);
@@ -803,6 +819,8 @@ struct TfTpu_OpsApiFn {
   TFTPU_ADD_FN_IN_STRUCT(
       TpuEmbeddingEngine_SendTPUEmbeddingGradientsComputation);
   TFTPU_ADD_FN_IN_STRUCT(TpuEmbeddingEngine_DedupDataTupleMaskComputation);
+
+  TFTPU_ADD_FN_IN_STRUCT(SparseCore_GetMaxIdsAndUniques);
 };
 
 }  // extern "C"

From a891c78e49fb8aabed2f1a0fb26f9a6d7ab6aa35 Mon Sep 17 00:00:00 2001
From: Austin Anderson <angerson@google.com>
Date: Tue, 26 Sep 2023 17:13:54 -0700
Subject: [PATCH 294/567] Fix missing config flags for regular builds

After my previous change removed the bazelrc/ directory from ci/official, I forgot to explicitly set
the --config options to load the settings which were previously implicit.

PiperOrigin-RevId: 568691490
---
 ci/official/envs/continuous_linux_x86_cpu_py310       | 2 +-
 ci/official/envs/continuous_linux_x86_cpu_py311       | 2 +-
 ci/official/envs/continuous_linux_x86_cpu_py39        | 2 +-
 ci/official/envs/continuous_linux_x86_cuda_py310      | 2 +-
 ci/official/envs/continuous_linux_x86_cuda_py311      | 2 +-
 ci/official/envs/continuous_linux_x86_cuda_py39       | 2 +-
 ci/official/envs/nightly_libtensorflow_linux_x86_cpu  | 2 +-
 ci/official/envs/nightly_libtensorflow_linux_x86_cuda | 2 +-
 ci/official/envs/nightly_linux_x86_cpu_py310          | 2 +-
 ci/official/envs/nightly_linux_x86_cpu_py311          | 2 +-
 ci/official/envs/nightly_linux_x86_cpu_py39           | 2 +-
 ci/official/envs/nightly_linux_x86_cuda_py310         | 2 +-
 ci/official/envs/nightly_linux_x86_cuda_py311         | 2 +-
 ci/official/envs/nightly_linux_x86_cuda_py39          | 2 +-
 14 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/ci/official/envs/continuous_linux_x86_cpu_py310 b/ci/official/envs/continuous_linux_x86_cpu_py310
index c891af629fdfa7..b597e69307cde0 100644
--- a/ci/official/envs/continuous_linux_x86_cpu_py310
+++ b/ci/official/envs/continuous_linux_x86_cpu_py310
@@ -1,5 +1,5 @@
 source ci/official/envs/ci_default
 TFCI_BAZEL_TARGET_SELECTING_CONFIG_PREFIX=linux_cpu
-TFCI_BAZEL_COMMON_ARGS=(--config rbe --repo_env=TF_PYTHON_VERSION=3.10)
+TFCI_BAZEL_COMMON_ARGS=(--config release_cpu_linux --config rbe_linux_cpu --repo_env=TF_PYTHON_VERSION=3.10)
 TFCI_DOCKER_IMAGE=tensorflow/build:latest-python3.10
 TFCI_DOCKER_REBUILD_ARGS=(--build-arg PYTHON_VERSION=3.10 --target=devel tools/tf_sig_build_dockerfiles)
diff --git a/ci/official/envs/continuous_linux_x86_cpu_py311 b/ci/official/envs/continuous_linux_x86_cpu_py311
index 0d82dee52191d3..1ce6c8b68fddc4 100644
--- a/ci/official/envs/continuous_linux_x86_cpu_py311
+++ b/ci/official/envs/continuous_linux_x86_cpu_py311
@@ -1,5 +1,5 @@
 source ci/official/envs/ci_default
 TFCI_BAZEL_TARGET_SELECTING_CONFIG_PREFIX=linux_cpu
-TFCI_BAZEL_COMMON_ARGS=(--config rbe --repo_env=TF_PYTHON_VERSION=3.11)
+TFCI_BAZEL_COMMON_ARGS=(--config release_cpu_linux --config rbe_linux_cpu --repo_env=TF_PYTHON_VERSION=3.11)
 TFCI_DOCKER_IMAGE=tensorflow/build:latest-python3.11
 TFCI_DOCKER_REBUILD_ARGS=(--build-arg PYTHON_VERSION=3.11 --target=devel tools/tf_sig_build_dockerfiles)
diff --git a/ci/official/envs/continuous_linux_x86_cpu_py39 b/ci/official/envs/continuous_linux_x86_cpu_py39
index ce3ddebfaf58c9..de2c98699a0fe2 100644
--- a/ci/official/envs/continuous_linux_x86_cpu_py39
+++ b/ci/official/envs/continuous_linux_x86_cpu_py39
@@ -1,5 +1,5 @@
 source ci/official/envs/ci_default
 TFCI_BAZEL_TARGET_SELECTING_CONFIG_PREFIX=linux_cpu
-TFCI_BAZEL_COMMON_ARGS=(--config rbe --repo_env=TF_PYTHON_VERSION=3.9)
+TFCI_BAZEL_COMMON_ARGS=(--config release_cpu_linux --config rbe_linux_cpu --repo_env=TF_PYTHON_VERSION=3.9)
 TFCI_DOCKER_IMAGE=tensorflow/build:latest-python3.9
 TFCI_DOCKER_REBUILD_ARGS=(--build-arg PYTHON_VERSION=3.11 --target=devel tools/tf_sig_build_dockerfiles)
diff --git a/ci/official/envs/continuous_linux_x86_cuda_py310 b/ci/official/envs/continuous_linux_x86_cuda_py310
index adc440db2cface..69ca107060677c 100644
--- a/ci/official/envs/continuous_linux_x86_cuda_py310
+++ b/ci/official/envs/continuous_linux_x86_cuda_py310
@@ -1,7 +1,7 @@
 source ci/official/envs/ci_default
 TFCI_NVIDIA_SMI_ENABLE=1
 TFCI_BAZEL_TARGET_SELECTING_CONFIG_PREFIX=linux_cuda
-TFCI_BAZEL_COMMON_ARGS=(--config rbe --repo_env=TF_PYTHON_VERSION=3.10)
+TFCI_BAZEL_COMMON_ARGS=(--config release_gpu_linux --config rbe_linux_cuda --repo_env=TF_PYTHON_VERSION=3.10)
 TFCI_DOCKER_ARGS=(--gpus all)
 TFCI_DOCKER_IMAGE=tensorflow/build:latest-python3.10
 TFCI_DOCKER_REBUILD_ARGS=(--build-arg PYTHON_VERSION=3.10 --target=devel tools/tf_sig_build_dockerfiles)
diff --git a/ci/official/envs/continuous_linux_x86_cuda_py311 b/ci/official/envs/continuous_linux_x86_cuda_py311
index e0f57d1e4ea3d4..da54acbcb3845d 100644
--- a/ci/official/envs/continuous_linux_x86_cuda_py311
+++ b/ci/official/envs/continuous_linux_x86_cuda_py311
@@ -1,7 +1,7 @@
 source ci/official/envs/ci_default
 TFCI_NVIDIA_SMI_ENABLE=1
 TFCI_BAZEL_TARGET_SELECTING_CONFIG_PREFIX=linux_cuda
-TFCI_BAZEL_COMMON_ARGS=(--config rbe --repo_env=TF_PYTHON_VERSION=3.11)
+TFCI_BAZEL_COMMON_ARGS=(--config release_gpu_linux --config rbe_linux_cuda --repo_env=TF_PYTHON_VERSION=3.11)
 TFCI_DOCKER_ARGS=(--gpus all)
 TFCI_DOCKER_IMAGE=tensorflow/build:latest-python3.11
 TFCI_DOCKER_REBUILD_ARGS=(--build-arg PYTHON_VERSION=3.11 --target=devel tools/tf_sig_build_dockerfiles)
diff --git a/ci/official/envs/continuous_linux_x86_cuda_py39 b/ci/official/envs/continuous_linux_x86_cuda_py39
index 077c89aac47b41..a53df57a91f50f 100644
--- a/ci/official/envs/continuous_linux_x86_cuda_py39
+++ b/ci/official/envs/continuous_linux_x86_cuda_py39
@@ -1,7 +1,7 @@
 source ci/official/envs/ci_default
 TFCI_NVIDIA_SMI_ENABLE=1
 TFCI_BAZEL_TARGET_SELECTING_CONFIG_PREFIX=linux_cuda
-TFCI_BAZEL_COMMON_ARGS=(--config rbe --repo_env=TF_PYTHON_VERSION=3.9)
+TFCI_BAZEL_COMMON_ARGS=(--config release_gpu_linux --config rbe_linux_cuda --repo_env=TF_PYTHON_VERSION=3.9)
 TFCI_DOCKER_ARGS=(--gpus all)
 TFCI_DOCKER_IMAGE=tensorflow/build:latest-python3.9
 TFCI_DOCKER_REBUILD_ARGS=(--build-arg PYTHON_VERSION=3.9 --target=devel tools/tf_sig_build_dockerfiles)
diff --git a/ci/official/envs/nightly_libtensorflow_linux_x86_cpu b/ci/official/envs/nightly_libtensorflow_linux_x86_cpu
index 9da67eefa3b449..aa04aeffc7c52c 100644
--- a/ci/official/envs/nightly_libtensorflow_linux_x86_cpu
+++ b/ci/official/envs/nightly_libtensorflow_linux_x86_cpu
@@ -1,6 +1,6 @@
 source ci/official/envs/ci_default
 source ci/official/envs/ci_nightly_uploads
-TFCI_BAZEL_COMMON_ARGS=(--config tf_public_cache_push --config resultstore --repo_env=TF_PYTHON_VERSION=3.10)
+TFCI_BAZEL_COMMON_ARGS=(--config release_cpu_linux --config tf_public_cache_push --config resultstore --repo_env=TF_PYTHON_VERSION=3.10)
 TFCI_DOCKER_IMAGE=tensorflow/build:latest-python3.10
 TFCI_DOCKER_REBUILD_ARGS=(--build-arg PYTHON_VERSION=3.10 --target=devel tools/tf_sig_build_dockerfiles)
 TFCI_LIB_SUFFIX="-cpu-linux-x86_64"
diff --git a/ci/official/envs/nightly_libtensorflow_linux_x86_cuda b/ci/official/envs/nightly_libtensorflow_linux_x86_cuda
index b011304fc3a14d..d1683177391a6f 100644
--- a/ci/official/envs/nightly_libtensorflow_linux_x86_cuda
+++ b/ci/official/envs/nightly_libtensorflow_linux_x86_cuda
@@ -1,6 +1,6 @@
 source ci/official/envs/ci_default
 source ci/official/envs/ci_nightly_uploads
-TFCI_BAZEL_COMMON_ARGS=(--config tf_public_cache_push --config resultstore --repo_env=TF_PYTHON_VERSION=3.10)
+TFCI_BAZEL_COMMON_ARGS=(--config release_gpu_linux --config tf_public_cache_push --config resultstore --repo_env=TF_PYTHON_VERSION=3.10)
 TFCI_DOCKER_ARGS=(--gpus all)
 TFCI_DOCKER_IMAGE=tensorflow/build:latest-python3.10
 TFCI_DOCKER_REBUILD_ARGS=(--build-arg PYTHON_VERSION=3.10 --target=devel tools/tf_sig_build_dockerfiles)
diff --git a/ci/official/envs/nightly_linux_x86_cpu_py310 b/ci/official/envs/nightly_linux_x86_cpu_py310
index 9c77911a190c50..a86e04653c7007 100644
--- a/ci/official/envs/nightly_linux_x86_cpu_py310
+++ b/ci/official/envs/nightly_linux_x86_cpu_py310
@@ -1,7 +1,7 @@
 source ci/official/envs/ci_default
 source ci/official/envs/ci_nightly_uploads
 TFCI_BAZEL_TARGET_SELECTING_CONFIG_PREFIX=linux_cpu
-TFCI_BAZEL_COMMON_ARGS=(--config tf_public_cache_push --config resultstore --repo_env=TF_PYTHON_VERSION=3.10)
+TFCI_BAZEL_COMMON_ARGS=(--config release_cpu_linux --config tf_public_cache_push --config resultstore --repo_env=TF_PYTHON_VERSION=3.10)
 TFCI_BUILD_PIP_PACKAGE_ARGS=(--cpu --nightly_flag)
 TFCI_DOCKER_IMAGE=tensorflow/build:latest-python3.10
 TFCI_DOCKER_REBUILD_ARGS=(--build-arg PYTHON_VERSION=3.10 --target=devel tools/tf_sig_build_dockerfiles)
diff --git a/ci/official/envs/nightly_linux_x86_cpu_py311 b/ci/official/envs/nightly_linux_x86_cpu_py311
index 0a5e2db2acb519..568f399e60ea92 100644
--- a/ci/official/envs/nightly_linux_x86_cpu_py311
+++ b/ci/official/envs/nightly_linux_x86_cpu_py311
@@ -1,7 +1,7 @@
 source ci/official/envs/ci_default
 source ci/official/envs/ci_nightly_uploads
 TFCI_BAZEL_TARGET_SELECTING_CONFIG_PREFIX=linux_cpu
-TFCI_BAZEL_COMMON_ARGS=(--config tf_public_cache_push --config resultstore --repo_env=TF_PYTHON_VERSION=3.11)
+TFCI_BAZEL_COMMON_ARGS=(--config release_cpu_linux --config tf_public_cache_push --config resultstore --repo_env=TF_PYTHON_VERSION=3.11)
 TFCI_BUILD_PIP_PACKAGE_ARGS=(--cpu --nightly_flag)
 TFCI_DOCKER_IMAGE=tensorflow/build:latest-python3.11
 TFCI_DOCKER_REBUILD_ARGS=(--build-arg PYTHON_VERSION=3.11 --target=devel tools/tf_sig_build_dockerfiles)
diff --git a/ci/official/envs/nightly_linux_x86_cpu_py39 b/ci/official/envs/nightly_linux_x86_cpu_py39
index 7679944d3f0dd1..e4bd6cc727f85c 100644
--- a/ci/official/envs/nightly_linux_x86_cpu_py39
+++ b/ci/official/envs/nightly_linux_x86_cpu_py39
@@ -1,7 +1,7 @@
 source ci/official/envs/ci_default
 source ci/official/envs/ci_nightly_uploads
 TFCI_BAZEL_TARGET_SELECTING_CONFIG_PREFIX=linux_cpu
-TFCI_BAZEL_COMMON_ARGS=(--config tf_public_cache_push --config resultstore --repo_env=TF_PYTHON_VERSION=3.9)
+TFCI_BAZEL_COMMON_ARGS=(--config release_cpu_linux --config tf_public_cache_push --config resultstore --repo_env=TF_PYTHON_VERSION=3.9)
 TFCI_BUILD_PIP_PACKAGE_ARGS=(--cpu --nightly_flag)
 TFCI_DOCKER_IMAGE=tensorflow/build:latest-python3.9
 TFCI_DOCKER_REBUILD_ARGS=(--build-arg PYTHON_VERSION=3.9 --target=devel tools/tf_sig_build_dockerfiles)
diff --git a/ci/official/envs/nightly_linux_x86_cuda_py310 b/ci/official/envs/nightly_linux_x86_cuda_py310
index 836d187c365611..5f0654310e22ed 100644
--- a/ci/official/envs/nightly_linux_x86_cuda_py310
+++ b/ci/official/envs/nightly_linux_x86_cuda_py310
@@ -1,7 +1,7 @@
 source ci/official/envs/ci_default
 source ci/official/envs/ci_nightly_uploads
 TFCI_BAZEL_TARGET_SELECTING_CONFIG_PREFIX=linux_cuda
-TFCI_BAZEL_COMMON_ARGS=(--config tf_public_cache_push --config resultstore --repo_env=TF_PYTHON_VERSION=3.10)
+TFCI_BAZEL_COMMON_ARGS=(--config release_gpu_linux --config tf_public_cache_push --config resultstore --repo_env=TF_PYTHON_VERSION=3.10)
 TFCI_BUILD_PIP_PACKAGE_ARGS=(--nightly_flag)
 TFCI_DOCKER_ARGS=(--gpus all)
 TFCI_DOCKER_IMAGE=tensorflow/build:latest-python3.10
diff --git a/ci/official/envs/nightly_linux_x86_cuda_py311 b/ci/official/envs/nightly_linux_x86_cuda_py311
index 36abb67cb73acc..884d179ab58016 100644
--- a/ci/official/envs/nightly_linux_x86_cuda_py311
+++ b/ci/official/envs/nightly_linux_x86_cuda_py311
@@ -1,7 +1,7 @@
 source ci/official/envs/ci_default
 source ci/official/envs/ci_nightly_uploads
 TFCI_BAZEL_TARGET_SELECTING_CONFIG_PREFIX=linux_cuda
-TFCI_BAZEL_COMMON_ARGS=(--config tf_public_cache_push --config resultstore --repo_env=TF_PYTHON_VERSION=3.11)
+TFCI_BAZEL_COMMON_ARGS=(--config release_gpu_linux --config tf_public_cache_push --config resultstore --repo_env=TF_PYTHON_VERSION=3.11)
 TFCI_BUILD_PIP_PACKAGE_ARGS=(--nightly_flag)
 TFCI_DOCKER_ARGS=(--gpus all)
 TFCI_DOCKER_IMAGE=tensorflow/build:latest-python3.11
diff --git a/ci/official/envs/nightly_linux_x86_cuda_py39 b/ci/official/envs/nightly_linux_x86_cuda_py39
index 4796e3374fb68a..93d7a8c73bb54c 100644
--- a/ci/official/envs/nightly_linux_x86_cuda_py39
+++ b/ci/official/envs/nightly_linux_x86_cuda_py39
@@ -1,7 +1,7 @@
 source ci/official/envs/ci_default
 source ci/official/envs/ci_nightly_uploads
 TFCI_BAZEL_TARGET_SELECTING_CONFIG_PREFIX=linux_cuda
-TFCI_BAZEL_COMMON_ARGS=(--config tf_public_cache_push --config resultstore --repo_env=TF_PYTHON_VERSION=3.9)
+TFCI_BAZEL_COMMON_ARGS=(--config release_gpu_linux --config tf_public_cache_push --config resultstore --repo_env=TF_PYTHON_VERSION=3.9)
 TFCI_BUILD_PIP_PACKAGE_ARGS=(--nightly_flag)
 TFCI_DOCKER_ARGS=(--gpus all)
 TFCI_DOCKER_IMAGE=tensorflow/build:latest-python3.9

From f4265451b771fee74ad8e61c34d396050dcb5e51 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 26 Sep 2023 17:14:22 -0700
Subject: [PATCH 295/567] Add test case to exercise TF quantizer for Conv2d
 with dilation

PiperOrigin-RevId: 568691561
---
 .../integration_test/quantize_model_test.py   | 164 +++++++++++++++++-
 .../quantize_model_test_base.py               |   6 +-
 2 files changed, 165 insertions(+), 5 deletions(-)

diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/python/integration_test/quantize_model_test.py b/tensorflow/compiler/mlir/quantization/tensorflow/python/integration_test/quantize_model_test.py
index b0c77ccc586c5f..0b8c58690f912b 100644
--- a/tensorflow/compiler/mlir/quantization/tensorflow/python/integration_test/quantize_model_test.py
+++ b/tensorflow/compiler/mlir/quantization/tensorflow/python/integration_test/quantize_model_test.py
@@ -1474,6 +1474,16 @@ def gen_data() -> repr_dataset.RepresentativeDataset:
           'input_shape_dynamic': False,
           'enable_per_channel_quantization': False,
       },
+      {
+          'testcase_name': 'dilation',
+          'activation_fn': None,
+          'has_bias': False,
+          'has_batch_norm': False,
+          'target_opset': quant_opts_pb2.TF,
+          'input_shape_dynamic': False,
+          'enable_per_channel_quantization': False,
+          'dilations': [1, 2, 2, 1],
+      },
       {
           'testcase_name': 'relu',
           'activation_fn': nn_ops.relu,
@@ -1483,6 +1493,16 @@ def gen_data() -> repr_dataset.RepresentativeDataset:
           'input_shape_dynamic': False,
           'enable_per_channel_quantization': False,
       },
+      {
+          'testcase_name': 'dilation_relu',
+          'activation_fn': nn_ops.relu,
+          'has_bias': False,
+          'has_batch_norm': False,
+          'target_opset': quant_opts_pb2.TF,
+          'input_shape_dynamic': False,
+          'enable_per_channel_quantization': False,
+          'dilations': [1, 2, 2, 1],
+      },
       {
           'testcase_name': 'relu6',
           'activation_fn': nn_ops.relu6,
@@ -1492,6 +1512,16 @@ def gen_data() -> repr_dataset.RepresentativeDataset:
           'input_shape_dynamic': False,
           'enable_per_channel_quantization': False,
       },
+      {
+          'testcase_name': 'dilation_relu6',
+          'activation_fn': nn_ops.relu6,
+          'has_bias': False,
+          'has_batch_norm': False,
+          'target_opset': quant_opts_pb2.TF,
+          'input_shape_dynamic': False,
+          'enable_per_channel_quantization': False,
+          'dilations': [1, 2, 2, 1],
+      },
       {
           'testcase_name': 'bn',
           'activation_fn': None,
@@ -1501,6 +1531,16 @@ def gen_data() -> repr_dataset.RepresentativeDataset:
           'input_shape_dynamic': False,
           'enable_per_channel_quantization': False,
       },
+      {
+          'testcase_name': 'dilation_bn',
+          'activation_fn': None,
+          'has_bias': False,
+          'has_batch_norm': True,
+          'target_opset': quant_opts_pb2.TF,
+          'input_shape_dynamic': False,
+          'enable_per_channel_quantization': False,
+          'dilations': [1, 2, 2, 1],
+      },
       {
           'testcase_name': 'with_bias',
           'activation_fn': None,
@@ -1510,6 +1550,16 @@ def gen_data() -> repr_dataset.RepresentativeDataset:
           'input_shape_dynamic': False,
           'enable_per_channel_quantization': False,
       },
+      {
+          'testcase_name': 'dilation_with_bias',
+          'activation_fn': None,
+          'has_bias': True,
+          'has_batch_norm': False,
+          'target_opset': quant_opts_pb2.TF,
+          'input_shape_dynamic': False,
+          'enable_per_channel_quantization': False,
+          'dilations': [1, 2, 2, 1],
+      },
       {
           'testcase_name': 'with_bias_and_relu6',
           'activation_fn': nn_ops.relu6,
@@ -1519,6 +1569,16 @@ def gen_data() -> repr_dataset.RepresentativeDataset:
           'input_shape_dynamic': False,
           'enable_per_channel_quantization': False,
       },
+      {
+          'testcase_name': 'dilation_with_bias_and_relu6',
+          'activation_fn': nn_ops.relu6,
+          'has_bias': True,
+          'has_batch_norm': False,
+          'target_opset': quant_opts_pb2.TF,
+          'input_shape_dynamic': False,
+          'enable_per_channel_quantization': False,
+          'dilations': [1, 2, 2, 1],
+      },
       {
           'testcase_name': 'with_bias_and_bn_and_relu6',
           'activation_fn': nn_ops.relu6,
@@ -1528,6 +1588,16 @@ def gen_data() -> repr_dataset.RepresentativeDataset:
           'input_shape_dynamic': False,
           'enable_per_channel_quantization': False,
       },
+      {
+          'testcase_name': 'dilation_with_bias_and_bn_and_relu6',
+          'activation_fn': nn_ops.relu6,
+          'has_bias': True,
+          'has_batch_norm': True,
+          'target_opset': quant_opts_pb2.TF,
+          'input_shape_dynamic': False,
+          'enable_per_channel_quantization': False,
+          'dilations': [1, 2, 2, 1],
+      },
       {
           'testcase_name': 'with_bias_and_relu6_to_xla',
           'activation_fn': nn_ops.relu6,
@@ -1537,6 +1607,16 @@ def gen_data() -> repr_dataset.RepresentativeDataset:
           'input_shape_dynamic': False,
           'enable_per_channel_quantization': False,
       },
+      {
+          'testcase_name': 'dilation_with_bias_and_relu6_to_xla',
+          'activation_fn': nn_ops.relu6,
+          'has_bias': True,
+          'has_batch_norm': False,
+          'target_opset': quant_opts_pb2.XLA,
+          'input_shape_dynamic': False,
+          'enable_per_channel_quantization': False,
+          'dilations': [1, 2, 2, 1],
+      },
       {
           'testcase_name': 'with_bias_and_bn_and_relu6_to_xla',
           'activation_fn': nn_ops.relu6,
@@ -1546,6 +1626,16 @@ def gen_data() -> repr_dataset.RepresentativeDataset:
           'input_shape_dynamic': False,
           'enable_per_channel_quantization': False,
       },
+      {
+          'testcase_name': 'dilation_with_bias_and_bn_and_relu6_to_xla',
+          'activation_fn': nn_ops.relu6,
+          'has_bias': True,
+          'has_batch_norm': True,
+          'target_opset': quant_opts_pb2.XLA,
+          'input_shape_dynamic': False,
+          'enable_per_channel_quantization': False,
+          'dilations': [1, 2, 2, 1],
+      },
       {
           'testcase_name': 'with_bias_and_relu6_to_xla_dynamic',
           'activation_fn': nn_ops.relu6,
@@ -1555,6 +1645,16 @@ def gen_data() -> repr_dataset.RepresentativeDataset:
           'input_shape_dynamic': True,
           'enable_per_channel_quantization': False,
       },
+      {
+          'testcase_name': 'dilation_with_bias_and_relu6_to_xla_dynamic',
+          'activation_fn': nn_ops.relu6,
+          'has_bias': True,
+          'has_batch_norm': False,
+          'target_opset': quant_opts_pb2.XLA,
+          'input_shape_dynamic': True,
+          'enable_per_channel_quantization': False,
+          'dilations': [1, 2, 2, 1],
+      },
       {
           'testcase_name': 'with_bias_and_bn_and_relu6_to_xla_dynamic',
           'activation_fn': nn_ops.relu6,
@@ -1564,6 +1664,16 @@ def gen_data() -> repr_dataset.RepresentativeDataset:
           'input_shape_dynamic': True,
           'enable_per_channel_quantization': False,
       },
+      {
+          'testcase_name': 'dilation_with_bias_and_bn_and_relu6_to_xla_dynamic',
+          'activation_fn': nn_ops.relu6,
+          'has_bias': True,
+          'has_batch_norm': True,
+          'target_opset': quant_opts_pb2.XLA,
+          'input_shape_dynamic': True,
+          'enable_per_channel_quantization': False,
+          'dilations': [1, 2, 2, 1],
+      },
       {
           'testcase_name': 'with_bias_and_relu6_to_uq',
           'activation_fn': nn_ops.relu6,
@@ -1573,6 +1683,16 @@ def gen_data() -> repr_dataset.RepresentativeDataset:
           'input_shape_dynamic': False,
           'enable_per_channel_quantization': False,
       },
+      {
+          'testcase_name': 'dilation_with_bias_and_relu6_to_uq',
+          'activation_fn': nn_ops.relu6,
+          'has_bias': True,
+          'has_batch_norm': False,
+          'target_opset': quant_opts_pb2.UNIFORM_QUANTIZED,
+          'input_shape_dynamic': False,
+          'enable_per_channel_quantization': False,
+          'dilations': [1, 2, 2, 1],
+      },
       {
           'testcase_name': 'with_bias_and_bn_and_relu6_to_uq',
           'activation_fn': nn_ops.relu6,
@@ -1582,6 +1702,16 @@ def gen_data() -> repr_dataset.RepresentativeDataset:
           'input_shape_dynamic': False,
           'enable_per_channel_quantization': False,
       },
+      {
+          'testcase_name': 'dilation_with_bias_and_bn_and_relu6_to_uq',
+          'activation_fn': nn_ops.relu6,
+          'has_bias': True,
+          'has_batch_norm': True,
+          'target_opset': quant_opts_pb2.UNIFORM_QUANTIZED,
+          'input_shape_dynamic': False,
+          'enable_per_channel_quantization': False,
+          'dilations': [1, 2, 2, 1],
+      },
       {
           'testcase_name': 'with_bias_and_relu6_to_uq_per_channel',
           'activation_fn': nn_ops.relu6,
@@ -1591,6 +1721,16 @@ def gen_data() -> repr_dataset.RepresentativeDataset:
           'input_shape_dynamic': False,
           'enable_per_channel_quantization': True,
       },
+      {
+          'testcase_name': 'dilation_with_bias_and_relu6_to_uq_per_channel',
+          'activation_fn': nn_ops.relu6,
+          'has_bias': True,
+          'has_batch_norm': False,
+          'target_opset': quant_opts_pb2.UNIFORM_QUANTIZED,
+          'input_shape_dynamic': False,
+          'enable_per_channel_quantization': True,
+          'dilations': [1, 2, 2, 1],
+      },
       {
           'testcase_name': 'with_bias_and_bn_and_relu6_to_uq_per_channel',
           'activation_fn': nn_ops.relu6,
@@ -1600,6 +1740,18 @@ def gen_data() -> repr_dataset.RepresentativeDataset:
           'input_shape_dynamic': False,
           'enable_per_channel_quantization': True,
       },
+      {
+          'testcase_name': (
+              'dilation_with_bias_and_bn_and_relu6_to_uq_per_channel'
+          ),
+          'activation_fn': nn_ops.relu6,
+          'has_bias': True,
+          'has_batch_norm': True,
+          'target_opset': quant_opts_pb2.UNIFORM_QUANTIZED,
+          'input_shape_dynamic': False,
+          'enable_per_channel_quantization': True,
+          'dilations': [1, 2, 2, 1],
+      },
   )
   @test_util.run_in_graph_and_eager_modes
   def test_conv_ptq_model(
@@ -1610,12 +1762,20 @@ def test_conv_ptq_model(
       target_opset: quant_opts_pb2.OpSet,
       input_shape_dynamic: bool,
       enable_per_channel_quantization: bool,
+      dilations: Sequence[int] = None,
   ):
     input_shape = [None, None, None, 3] if input_shape_dynamic else [1, 3, 4, 3]
     filter_shape = [2, 3, 3, 2]
+    strides = [1, 1, 1, 1]
 
     model = self._create_conv2d_model(
-        input_shape, filter_shape, has_bias, has_batch_norm, activation_fn
+        input_shape,
+        filter_shape,
+        has_bias,
+        has_batch_norm,
+        activation_fn,
+        strides,
+        dilations,
     )
     saved_model_save.save(model, self._input_saved_model_path)
 
@@ -1674,7 +1834,7 @@ def data_gen() -> repr_dataset.RepresentativeDataset:
       target_outputs = converted_model.signatures['serving_default'](
           input_tensor=ops.convert_to_tensor(input_data)
       )
-      self.assertAllClose(target_outputs, expected_outputs, atol=0.05)
+      self.assertAllClose(target_outputs, expected_outputs, atol=0.06)
 
     if target_opset == quant_opts_pb2.XLA:
       self.assertTrue(self._contains_op(output_graphdef, 'XlaConvV2'))
diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/python/integration_test/quantize_model_test_base.py b/tensorflow/compiler/mlir/quantization/tensorflow/python/integration_test/quantize_model_test_base.py
index 0a16c573d09915..c8a92219052789 100644
--- a/tensorflow/compiler/mlir/quantization/tensorflow/python/integration_test/quantize_model_test_base.py
+++ b/tensorflow/compiler/mlir/quantization/tensorflow/python/integration_test/quantize_model_test_base.py
@@ -1225,9 +1225,9 @@ def conv(self, input_tensor: core.Tensor) -> Mapping[str, core.Tensor]:
         out = nn_ops.conv2d(
             input_tensor,
             self.filters,
-            strides=[1, 1, 2, 1],
-            dilations=[1, 1, 1, 1],
-            padding='SAME',
+            strides=strides,
+            dilations=dilations,
+            padding=padding,
             data_format='NHWC',
         )
         if has_bias:

From 63f405d5b4a7da6eae7fa0138513387201d04277 Mon Sep 17 00:00:00 2001
From: Luke Boyer <lukeboyer@google.com>
Date: Tue, 26 Sep 2023 17:16:26 -0700
Subject: [PATCH 296/567] Validate custom options buffer before parsing into
 flex buffer.

PiperOrigin-RevId: 568692079
---
 .../mlir/lite/transforms/lift_tflite_flex_ops.cc  | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/tensorflow/compiler/mlir/lite/transforms/lift_tflite_flex_ops.cc b/tensorflow/compiler/mlir/lite/transforms/lift_tflite_flex_ops.cc
index 91abd715cdfc58..9563f5d93226fb 100644
--- a/tensorflow/compiler/mlir/lite/transforms/lift_tflite_flex_ops.cc
+++ b/tensorflow/compiler/mlir/lite/transforms/lift_tflite_flex_ops.cc
@@ -12,9 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-
 #include "tensorflow/compiler/mlir/lite/transforms/lift_tflite_flex_ops.h"
 
+#include <cstddef>
+#include <cstdint>
 #include <string>
 #include <utility>
 
@@ -210,11 +211,15 @@ class LiftFlexCustomOp : public OpRewritePattern<TFL::CustomOp> {
       tensorflow::NodeDef& node_def) {
     // The flexbuffer contains a vector where the first elements is the
     // op name and the second is a serialized NodeDef.
+    const uint8_t* const opt_data =
+        reinterpret_cast<const uint8_t*>(custom_options.data());
+    const size_t opt_size = custom_options.size();
+    if (!flexbuffers::VerifyBuffer(opt_data, opt_size)) {
+      return emitError(loc, "invalid custom options");
+    }
+
     const flexbuffers::Vector& v =
-        flexbuffers::GetRoot(
-            reinterpret_cast<const uint8_t*>(custom_options.data()),
-            custom_options.size())
-            .AsVector();
+        flexbuffers::GetRoot(opt_data, opt_size).AsVector();
 
     op_name = v[0].AsString().str();
 

From eb6ecfb7028bb02c6f3484b6681cf76cc740ceef Mon Sep 17 00:00:00 2001
From: Anlun Xu <anlunx@google.com>
Date: Tue, 26 Sep 2023 17:22:26 -0700
Subject: [PATCH 297/567] [stream_executor] Remove undeclared data types in
 CUDA 11

https://github.com/openxla/xla/issues/5744

PiperOrigin-RevId: 568693456
---
 third_party/xla/xla/stream_executor/cuda/cuda_driver.cc | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/third_party/xla/xla/stream_executor/cuda/cuda_driver.cc b/third_party/xla/xla/stream_executor/cuda/cuda_driver.cc
index 4daea1ebc57fb2..dcb6a3c8c4f353 100644
--- a/third_party/xla/xla/stream_executor/cuda/cuda_driver.cc
+++ b/third_party/xla/xla/stream_executor/cuda/cuda_driver.cc
@@ -606,13 +606,16 @@ static std::string_view StreamCaptureModeToString(
     case CU_GRAPH_EXEC_UPDATE_ERROR_NOT_SUPPORTED:
       result->result = GraphExecUpdateResult::kNotSupported;
       break;
+#if CUDA_VERSION >= 12000
     case CU_GRAPH_EXEC_UPDATE_ERROR_UNSUPPORTED_FUNCTION_CHANGE:
       result->result = GraphExecUpdateResult::kUnsupportedFunctionChange;
       break;
-
     case CU_GRAPH_EXEC_UPDATE_ERROR_ATTRIBUTES_CHANGED:
       result->result = GraphExecUpdateResult::kAttributesChanged;
       break;
+#endif  // CUDA_VERSION >= 12000
+    default:
+      return tsl::errors::Internal("Unknown graph update result");
   }
 
   RETURN_IF_CUDA_RES_ERROR(err_code, "Failed to update CUDA graph");
@@ -639,6 +642,7 @@ GpuDriver::GraphNodeGetType(CUgraphNode node) {
       return GraphNodeType::kGraph;
     case CU_GRAPH_NODE_TYPE_EMPTY:
       return GraphNodeType::kEmpty;
+#if CUDA_VERSION >= 12000
     case CU_GRAPH_NODE_TYPE_WAIT_EVENT:
       return GraphNodeType::kWaitEvent;
     case CU_GRAPH_NODE_TYPE_EVENT_RECORD:
@@ -653,6 +657,9 @@ GpuDriver::GraphNodeGetType(CUgraphNode node) {
       return GraphNodeType::kMemFree;
     case CU_GRAPH_NODE_TYPE_BATCH_MEM_OP:
       return GraphNodeType::kBatchMemOp;
+#endif  // CUDA_VERSION >= 12000
+    default:
+      return tsl::errors::Internal("Unknown graph node type");
   }
 
   return tsl::Status(absl::StatusCode::kInternal,

From 0397b257a9dba80a732ec8b0df604df633c7158f Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 26 Sep 2023 17:31:27 -0700
Subject: [PATCH 298/567] This CL fixes a couple small issues: 1. Failing
 without a crash in SetHloShardingPostProcessing when mesh selection is on.
 This case was not handled in a previous CL. 2. Set the chosen mesh shape in
 the AutoSharding class so it can be accessed during testing. This was done
 before, but looks like it got lost in a previous CL.

PiperOrigin-RevId: 568695296
---
 .../auto_sharding/auto_sharding.cc            | 39 ++++++++++++++-----
 .../auto_sharding/auto_sharding_util.cc       |  6 ++-
 .../auto_sharding/cluster_environment.h       | 16 ++++++--
 3 files changed, 48 insertions(+), 13 deletions(-)

diff --git a/third_party/xla/xla/hlo/experimental/auto_sharding/auto_sharding.cc b/third_party/xla/xla/hlo/experimental/auto_sharding/auto_sharding.cc
index e0b07bb3d98348..c19641d58e4019 100644
--- a/third_party/xla/xla/hlo/experimental/auto_sharding/auto_sharding.cc
+++ b/third_party/xla/xla/hlo/experimental/auto_sharding/auto_sharding.cc
@@ -2647,10 +2647,10 @@ void SetHloSharding(const HloInstructionSequence& sequence,
   }
 }
 
-void SetHloShardingPostProcessing(
+Status SetHloShardingPostProcessing(
     const HloInstructionSequence& sequence, const StrategyMap& strategy_map,
     const CostGraph& cost_graph, absl::Span<const NodeStrategyIdx> s_val,
-    const ClusterEnvironment& cluster_env,
+    const ClusterEnvironment& cluster_env, bool crash_at_error,
     absl::flat_hash_map<std::string, std::vector<HloSharding>>*
         preserve_shardings) {
   const std::vector<HloInstruction*>& instructions = sequence.instructions();
@@ -2681,12 +2681,19 @@ void SetHloShardingPostProcessing(
       const auto& lhs_tensor_dim_to_mesh_dim =
           cluster_env.GetTensorDimToMeshDimWrapper(
               lhs->shape(), lhs_sharding,
-              /* consider_reverse_device_meshes */ true);
+              /* consider_reverse_device_meshes */ true,
+              /* crash_at_error */ crash_at_error);
       const auto& rhs_tensor_dim_to_mesh_dim =
           cluster_env.GetTensorDimToMeshDimWrapper(
               rhs->shape(), rhs_sharding,
-              /* consider_reverse_device_meshes */ true);
+              /* consider_reverse_device_meshes */ true,
+              /* crash_at_error */ crash_at_error);
 
+      if (lhs_tensor_dim_to_mesh_dim.size() != lhs->shape().rank() ||
+          rhs_tensor_dim_to_mesh_dim.size() != rhs->shape().rank()) {
+        return absl::InvalidArgumentError(
+            "Cannot generate tensor dim to mesh dim mapping");
+      }
       if (absl::StrContains(stra.name, "allreduce") &&
           lhs_tensor_dim_to_mesh_dim[lhs_con_dims[0]] == -1 &&
           rhs_tensor_dim_to_mesh_dim[rhs_con_dims[0]] == -1) {
@@ -2722,11 +2729,19 @@ void SetHloShardingPostProcessing(
       const auto& lhs_tensor_dim_to_mesh_dim =
           cluster_env.GetTensorDimToMeshDimWrapper(
               lhs->shape(), lhs_sharding,
-              /* consider_reverse_device_meshes */ true);
+              /* consider_reverse_device_meshes */ true,
+              /* crash_at_error */ crash_at_error);
       const auto& rhs_tensor_dim_to_mesh_dim =
           cluster_env.GetTensorDimToMeshDimWrapper(
               rhs->shape(), rhs_sharding,
-              /* consider_reverse_device_meshes */ true);
+              /* consider_reverse_device_meshes */ true,
+              /* crash_at_error */ crash_at_error);
+
+      if (lhs_tensor_dim_to_mesh_dim.size() != lhs->shape().rank() ||
+          rhs_tensor_dim_to_mesh_dim.size() != rhs->shape().rank()) {
+        return absl::InvalidArgumentError(
+            "Cannot generate tensor dim to mesh dim mapping");
+      }
 
       if (absl::StrContains(stra.name, "allreduce") &&
           lhs_tensor_dim_to_mesh_dim[lhs_in_channel_dim] == -1 &&
@@ -2855,6 +2870,7 @@ void SetHloShardingPostProcessing(
       }
     }
   }
+  return absl::OkStatus();
 }
 
 // Print liveness set for debugging.
@@ -4269,8 +4285,13 @@ StatusOr<AutoShardingResult> AutoShardingImplementation::RunAutoSharding(
     SetHloSharding(sequence, strategy_map, cost_graph, s_val,
                    (mesh_idx == partial_mesh_shapes.size() - 1));
     if (mesh_idx == partial_mesh_shapes.size() - 1) {
-      SetHloShardingPostProcessing(sequence, strategy_map, cost_graph, s_val,
-                                   cluster_env, &preserve_shardings);
+      if (!SetHloShardingPostProcessing(
+               sequence, strategy_map, cost_graph, s_val, cluster_env,
+               /* crash_at_error */ !option_.try_multiple_mesh_shapes,
+               &preserve_shardings)
+               .ok()) {
+        return AutoShardingResult::kModuleUnchanged;
+      }
     } else {
       spmd::RecoverShardingsFromPartialMesh(sequence, preserve_shardings);
     }
@@ -4502,7 +4523,7 @@ StatusOr<bool> AutoSharding::Run(
                 << spmd::ToString(mesh_shapes[min_mesh_shape_index])
                 << " which had the minimal solver objective value of "
                 << min_objective_value;
-
+        chosen_mesh_shape_ = mesh_shapes[min_mesh_shape_index];
         absl::flat_hash_map<HloComputation*, HloComputation*>
             computation_replacements;
         for (size_t i = 0; i < module->computation_count(); ++i) {
diff --git a/third_party/xla/xla/hlo/experimental/auto_sharding/auto_sharding_util.cc b/third_party/xla/xla/hlo/experimental/auto_sharding/auto_sharding_util.cc
index 618474b6e53f88..100113745a49c1 100644
--- a/third_party/xla/xla/hlo/experimental/auto_sharding/auto_sharding_util.cc
+++ b/third_party/xla/xla/hlo/experimental/auto_sharding/auto_sharding_util.cc
@@ -1250,7 +1250,11 @@ absl::StatusOr<std::vector<int64_t>> GetTensorDimToMeshDimNoCrash(
     return std::vector<int64_t>(tensor_shape_rank, -1);
   }
   // Check the compatibility of tensor_shape_rank and spec
-  CHECK_EQ(tensor_shape_rank, spec.TiledDataRank());
+  if (tensor_shape_rank != spec.TiledDataRank()) {
+    return absl::InvalidArgumentError(
+        "Tensor shape rank should be equal to the tiled data rank of the input "
+        "spec.");
+  }
 
   auto check_mesh =
       [&](const Array<int64_t>& mesh) -> std::optional<std::vector<int64_t>> {
diff --git a/third_party/xla/xla/hlo/experimental/auto_sharding/cluster_environment.h b/third_party/xla/xla/hlo/experimental/auto_sharding/cluster_environment.h
index a01741dd5bcd61..ef00dc5a6dc29b 100644
--- a/third_party/xla/xla/hlo/experimental/auto_sharding/cluster_environment.h
+++ b/third_party/xla/xla/hlo/experimental/auto_sharding/cluster_environment.h
@@ -102,10 +102,20 @@ class ClusterEnvironment {
   // -1 means replicated on that dimension
   std::vector<int64_t> GetTensorDimToMeshDimWrapper(
       const Shape& shape, const HloSharding& spec,
-      bool consider_reverse_device_meshes = false) const {
+      bool consider_reverse_device_meshes = false,
+      bool crash_at_error = true) const {
     int64_t n_dim = NumTileDimensions(spec);
-    std::vector<int64_t> tensor_dim_to_mesh_dim = GetTensorDimToMeshDim(
-        shape.rank(), spec, device_mesh_, consider_reverse_device_meshes);
+    std::vector<int64_t> tensor_dim_to_mesh_dim;
+    if (crash_at_error) {
+      tensor_dim_to_mesh_dim = GetTensorDimToMeshDim(
+          shape.rank(), spec, device_mesh_, consider_reverse_device_meshes);
+    } else {
+      auto tensor_dim_to_mesh_dim_status = GetTensorDimToMeshDimNoCrash(
+          shape.rank(), spec, device_mesh_, consider_reverse_device_meshes);
+      if (tensor_dim_to_mesh_dim_status.ok()) {
+        tensor_dim_to_mesh_dim = tensor_dim_to_mesh_dim_status.value();
+      }
+    }
     AdjustTensorMeshDimMapping(tensor_dim_to_mesh_dim, n_dim);
     return tensor_dim_to_mesh_dim;
   }

From 8d4d53f8990c6424b4ab4428bbe3aa7611d20a40 Mon Sep 17 00:00:00 2001
From: Edward Schwartz <schwartzedward@google.com>
Date: Tue, 26 Sep 2023 17:41:34 -0700
Subject: [PATCH 299/567] Enable replicate_constants_pass by default (in
 POST_REWRITE_FOR_EXEC)

PiperOrigin-RevId: 568697316
---
 tensorflow/core/config/flag_defs.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/core/config/flag_defs.h b/tensorflow/core/config/flag_defs.h
index 89ea6a9b73bbf1..14ae3cbf24e732 100644
--- a/tensorflow/core/config/flag_defs.h
+++ b/tensorflow/core/config/flag_defs.h
@@ -49,7 +49,7 @@ class Flags {
   TF_DECLARE_FLAG(more_stack_traces, false,
                   "Enable experimental code that preserves and propagates "
                   "graph node stack traces in C++.");
-  TF_DECLARE_FLAG(replicate_small_constants, false,
+  TF_DECLARE_FLAG(replicate_small_constants, true,
                   "Enable a graph optimization pass that replicate each small "
                   "constant to its successors' devices. This can decrease "
                   "message passing.");

From f1cd9a93373d417eb011ca07ad97a87f4965ee31 Mon Sep 17 00:00:00 2001
From: Edward Schwartz <schwartzedward@google.com>
Date: Tue, 26 Sep 2023 18:03:27 -0700
Subject: [PATCH 300/567] Update ctc_loss to use tensor_scatter ops instead of
 deprecated inplace ops

PiperOrigin-RevId: 568701158
---
 tensorflow/python/ops/BUILD      |  3 +++
 tensorflow/python/ops/ctc_ops.py | 30 ++++++++++++++++++++++++------
 2 files changed, 27 insertions(+), 6 deletions(-)

diff --git a/tensorflow/python/ops/BUILD b/tensorflow/python/ops/BUILD
index 945a303d36735f..d9c0bceb0f35b7 100644
--- a/tensorflow/python/ops/BUILD
+++ b/tensorflow/python/ops/BUILD
@@ -1472,10 +1472,12 @@ py_strict_library(
     srcs_version = "PY3",
     deps = [
         ":array_ops",
+        ":array_ops_gen",
         ":array_ops_stack",
         ":ctc_ops_gen",
         ":custom_gradient",
         ":functional_ops",
+        # TODO(b/280454072) Remove inplace_ops and compat when forward compatibility window expires.
         ":inplace_ops",
         ":linalg_ops",
         ":map_fn",
@@ -1483,6 +1485,7 @@ py_strict_library(
         ":nn_grad",
         ":nn_ops",
         ":sparse_ops",
+        "//tensorflow/python/compat",
         "//tensorflow/python/eager:context",
         "//tensorflow/python/eager:def_function",
         "//tensorflow/python/framework:constant_op",
diff --git a/tensorflow/python/ops/ctc_ops.py b/tensorflow/python/ops/ctc_ops.py
index 92915eeecfffb3..1bde62c0adbc20 100644
--- a/tensorflow/python/ops/ctc_ops.py
+++ b/tensorflow/python/ops/ctc_ops.py
@@ -16,6 +16,9 @@
 
 import uuid
 
+# TODO(b/280454072) Remove compat and inplace_ops when forward compatibility
+# window expires.
+from tensorflow.python.compat import compat
 from tensorflow.python.eager import context
 from tensorflow.python.eager import def_function
 
@@ -31,6 +34,7 @@
 from tensorflow.python.ops import array_ops_stack
 from tensorflow.python.ops import custom_gradient
 from tensorflow.python.ops import functional_ops
+from tensorflow.python.ops import gen_array_ops
 from tensorflow.python.ops import gen_ctc_ops
 from tensorflow.python.ops import inplace_ops
 from tensorflow.python.ops import linalg_ops
@@ -1493,10 +1497,17 @@ def body(i, num_elems, *args):
       new_out = []
     else:
       update_i = i + 1 if inclusive and not reverse else i
-      new_out = [
-          inplace_ops.alias_inplace_update(x, update_i, y)
-          for x, y in zip(out, flat_accum)
-      ]
+      # TODO(b/280454072) Cleanup when forward compatibility window expires.
+      if compat.forward_compatible(2023, 10, 26):
+        new_out = [
+            gen_array_ops.tensor_scatter_update(x, [[update_i]], [y])
+            for x, y in zip(out, flat_accum)
+        ]
+      else:
+        new_out = [
+            inplace_ops.alias_inplace_update(x, update_i, y)
+            for x, y in zip(out, flat_accum)
+        ]
     i = i - 1 if reverse else i + 1
     return [i, num_elems] + new_out + flat_accum
 
@@ -1511,8 +1522,15 @@ def body(i, num_elems, *args):
           [[num_outputs], array_ops.shape(initial_accum)], 0)
       out = inplace_ops.empty(out_shape, dtype=initial_accum.dtype, init=True)
       if inclusive:
-        out = inplace_ops.alias_inplace_add(out, init_i + (1 if reverse else 0),
-                                            initial_accum)
+        # TODO(b/280454072) Cleanup when forward compatibility window expires.
+        if compat.forward_compatible(2023, 10, 26):
+          out = gen_array_ops.tensor_scatter_add(
+              out, [[init_i + (1 if reverse else 0)]], [initial_accum]
+          )
+        else:
+          out = inplace_ops.alias_inplace_add(
+              out, init_i + (1 if reverse else 0), initial_accum
+          )
       outputs.append(out)
   loop_in = [init_i, num_elems] + outputs + flat_initial
   hostmem = [

From 86b42089c749fd4eb017a1885a7fa3125e6584bd Mon Sep 17 00:00:00 2001
From: Luke Boyer <lukeboyer@google.com>
Date: Tue, 26 Sep 2023 19:10:33 -0700
Subject: [PATCH 301/567] Legalize TensorListElementShape to tfl custom.

PiperOrigin-RevId: 568712108
---
 .../mlir/lite/tests/legalize-tensorlist.mlir  |  9 ++++++
 .../lite/transforms/legalize_tensorlist.cc    |  4 +++
 .../lite/transforms/legalize_tensorlist.td    |  7 +++++
 .../kernels/variants/py/end_to_end_test.py    | 29 +++++++++++++++++++
 4 files changed, 49 insertions(+)

diff --git a/tensorflow/compiler/mlir/lite/tests/legalize-tensorlist.mlir b/tensorflow/compiler/mlir/lite/tests/legalize-tensorlist.mlir
index 9a8434489387fd..b3ae743538bf61 100644
--- a/tensorflow/compiler/mlir/lite/tests/legalize-tensorlist.mlir
+++ b/tensorflow/compiler/mlir/lite/tests/legalize-tensorlist.mlir
@@ -110,3 +110,12 @@ func.func @listEmptyToListReserve(%arg0: tensor<?xi32>, %arg1: tensor<i32>) -> t
   // CHECK: %0 = "tfl.custom"(%arg0, %cst) {custom_code = "TensorListReserve", custom_option = #tfl<const_bytes : "0x04">} : (tensor<?xi32>, tensor<i32>) -> tensor<!tf_type.variant<tensor<*xi64>>>
   func.return %0 : tensor<!tf_type.variant<tensor<*xi64>>>
 }
+
+// -----
+
+// CHECK-LABEL: listElementShape
+func.func @listElementShape(%arg0: tensor<!tf_type.variant<tensor<*xi32>>>) -> tensor<*xi32> {
+  %0 = "tf.TensorListElementShape"(%arg0) : (tensor<!tf_type.variant<tensor<*xi32>>>) -> tensor<*xi32>
+  // CHECK: %0 = "tfl.custom"(%arg0) {custom_code = "TensorListElementShape", custom_option = #tfl<const_bytes : "0x">} : (tensor<!tf_type.variant<tensor<*xi32>>>) -> tensor<*xi32>
+  func.return %0 : tensor<*xi32>
+}
diff --git a/tensorflow/compiler/mlir/lite/transforms/legalize_tensorlist.cc b/tensorflow/compiler/mlir/lite/transforms/legalize_tensorlist.cc
index 68ca0e53192a43..23d1715886a4c8 100644
--- a/tensorflow/compiler/mlir/lite/transforms/legalize_tensorlist.cc
+++ b/tensorflow/compiler/mlir/lite/transforms/legalize_tensorlist.cc
@@ -128,6 +128,10 @@ bool IsOpSupported(mlir::Operation* op) {
   if (auto empty = llvm::dyn_cast_or_null<TF::EmptyTensorListOp>(op)) {
     element_type = empty.getElementDtype();
   }
+  if (auto element_shape =
+          llvm::dyn_cast_or_null<TF::TensorListElementShapeOp>(op)) {
+    element_type = GetSingularVariantBaseType(op->getOperand(0));
+  }
 
   if (!element_type.has_value()) return false;
   // TODO(b/288302706) add support for all types handled in the
diff --git a/tensorflow/compiler/mlir/lite/transforms/legalize_tensorlist.td b/tensorflow/compiler/mlir/lite/transforms/legalize_tensorlist.td
index 5f8757fea2c844..cac0509e13c350 100644
--- a/tensorflow/compiler/mlir/lite/transforms/legalize_tensorlist.td
+++ b/tensorflow/compiler/mlir/lite/transforms/legalize_tensorlist.td
@@ -22,6 +22,9 @@ include "mlir/Dialect/Arith/IR/ArithOps.td"
 def ConstDenseElementsI32ZeroAttr
   : NativeCodeCall<"$_builder.create<TFL::ConstOp>($_loc, DenseElementsAttr::get(RankedTensorType::get({}, $_builder.getI32Type()), {0}))">;
 
+def Size1InputRange : NativeCodeCall<
+  "SmallVector<Value, 1>{$0}">;
+
 def Size2InputRange : NativeCodeCall<
   "SmallVector<Value, 2>{$0, $1}">;
 
@@ -60,3 +63,7 @@ def LegalizeTensorListGetItem : Pat<(TF_TensorListGetItemOp $input, $index, $ele
 def LegalizeTensorListEmpty : Pat<(TF_EmptyTensorListOp:$tf_op $element_shape, $unused_max_elements),
     (TFL_CustomOp (Size2InputRange $element_shape, (ConstDenseElementsI32ZeroAttr)),
     (CreateStringAttr<"\"TensorListReserve\"">), (CustomOptions $tf_op))>;
+
+def LegalizeTensorListElementShape : Pat<(TF_TensorListElementShapeOp $input),
+    (TFL_CustomOp (Size1InputRange $input),
+    (CreateStringAttr<"\"TensorListElementShape\"">), (EmptyCustomOptions))>;
diff --git a/tensorflow/lite/kernels/variants/py/end_to_end_test.py b/tensorflow/lite/kernels/variants/py/end_to_end_test.py
index dff0d397b3c69d..7ec10b479918aa 100644
--- a/tensorflow/lite/kernels/variants/py/end_to_end_test.py
+++ b/tensorflow/lite/kernels/variants/py/end_to_end_test.py
@@ -273,6 +273,35 @@ def empty_tensorlist_set_stack(x) -> tf.Tensor:
     self.assertEqual(tf_out.shape, output_tensor.shape)
     self.assertTrue((tf_out == output_tensor).numpy().all())
 
+  @parameterized.named_parameters(
+      ("Unranked", None),
+      ("DynDim", [None]),
+      ("DynMultiDim", [None, 2]),
+      ("AllStatic", [2, 2]),
+  )
+  def test_reserve_element_shape(self, element_shape):
+    @tf.function
+    def reserve_element_shape() -> tf.Tensor:
+      l = list_ops.tensor_list_reserve(
+          element_shape=tf.TensorShape(element_shape),
+          element_dtype=tf.int32,
+          num_elements=10,
+      )
+      return list_ops.tensor_list_element_shape(l, tf.int32)
+
+    interpreter = self._get_interpreter_from_c_func(reserve_element_shape)
+    interpreter.allocate_tensors()
+    interpreter.invoke()
+
+    output_tensor = interpreter.get_tensor(
+        interpreter.get_output_details()[0]["index"]
+    )
+
+    tf_out = reserve_element_shape()
+    self.assertEqual(tf_out.dtype, output_tensor.dtype)
+    self.assertEqual(tf_out.shape, output_tensor.shape)
+    self.assertTrue((tf_out == output_tensor).numpy().all())
+
 
 if __name__ == "__main__":
   googletest.main()

From 4de89a208ce11d6fb61d8badd57810afc5ce86be Mon Sep 17 00:00:00 2001
From: Jake Harmon <jakeharmon@google.com>
Date: Tue, 26 Sep 2023 19:52:01 -0700
Subject: [PATCH 302/567] Fix failing cp on Mac

PiperOrigin-RevId: 568718589
---
 tensorflow/tools/pip_package/build_pip_package.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tensorflow/tools/pip_package/build_pip_package.sh b/tensorflow/tools/pip_package/build_pip_package.sh
index f7b4a655a67beb..8f6906cf8f0ade 100755
--- a/tensorflow/tools/pip_package/build_pip_package.sh
+++ b/tensorflow/tools/pip_package/build_pip_package.sh
@@ -280,8 +280,8 @@ function prepare_src() {
     cp -RLn bazel-bin/tensorflow/tools/pip_package/build_pip_package.exe.runfiles/local_tsl/tsl/ ${TMPDIR}/tensorflow
     cp -RLn bazel-bin/tensorflow/tools/pip_package/build_pip_package.exe.runfiles/local_xla/xla/ ${TMPDIR}/tensorflow/compiler
   else
-    cp -RLn bazel-bin/tensorflow/tools/pip_package/build_pip_package.runfiles/local_tsl/tsl/ ${TMPDIR}/tensorflow
-    cp -RLn bazel-bin/tensorflow/tools/pip_package/build_pip_package.runfiles/local_xla/xla/ ${TMPDIR}/tensorflow/compiler
+    cp -RLn bazel-bin/tensorflow/tools/pip_package/build_pip_package.runfiles/local_tsl/tsl ${TMPDIR}/tensorflow
+    cp -RLn bazel-bin/tensorflow/tools/pip_package/build_pip_package.runfiles/local_xla/xla ${TMPDIR}/tensorflow/compiler
   fi
   # Fix the proto stubs
   if is_macos; then

From 0be57e8069699ffcfeabd526c17d2cfe9ce823c0 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 26 Sep 2023 20:31:18 -0700
Subject: [PATCH 303/567] [PJRT C API] Implements `CopyToMemorySpace` for
 `PjRtCApiBuffer`.

PiperOrigin-RevId: 568726154
---
 third_party/xla/xla/pjrt/c/pjrt_c_api.h       | 21 +++++++++-
 .../xla/xla/pjrt/c/pjrt_c_api_wrapper_impl.cc | 14 ++++++-
 .../xla/xla/pjrt/c/pjrt_c_api_wrapper_impl.h  |  3 ++
 third_party/xla/xla/pjrt/pjrt_c_api_client.cc | 38 +++++++++++++++++++
 third_party/xla/xla/pjrt/pjrt_c_api_client.h  |  4 +-
 5 files changed, 74 insertions(+), 6 deletions(-)

diff --git a/third_party/xla/xla/pjrt/c/pjrt_c_api.h b/third_party/xla/xla/pjrt/c/pjrt_c_api.h
index f3064d1fe41b13..5b836c7dac0d30 100644
--- a/third_party/xla/xla/pjrt/c/pjrt_c_api.h
+++ b/third_party/xla/xla/pjrt/c/pjrt_c_api.h
@@ -53,7 +53,7 @@ extern "C" {
 // Changes include:
 // * Adding a new field to the PJRT_Api or argument structs
 // * Renaming a method or argument (doesn't affect ABI)
-#define PJRT_API_MINOR 31
+#define PJRT_API_MINOR 32
 
 // The plugin should set the major_version and minor_version of
 // PJRT_Api.pjrt_api_version to be the `PJRT_API_MAJOR` and `PJRT_API_MINOR` in
@@ -1571,6 +1571,21 @@ PJRT_DEFINE_STRUCT_TRAITS(PJRT_Buffer_CopyToDevice_Args, dst_buffer);
 typedef PJRT_Error* PJRT_Buffer_CopyToDevice(
     PJRT_Buffer_CopyToDevice_Args* args);
 
+struct PJRT_Buffer_CopyToMemory_Args {
+  size_t struct_size;
+  void* priv;
+  PJRT_Buffer* buffer;
+  PJRT_Memory* dst_memory;
+  PJRT_Buffer* dst_buffer;  // out
+};
+PJRT_DEFINE_STRUCT_TRAITS(PJRT_Buffer_CopyToMemory_Args, dst_buffer);
+
+// Copies the buffer to memory `dst_memory`. Caller is responsible for freeing
+// returned `dst_buffer` with PJRT_Buffer_Destroy. Returns an error if the
+// buffer is already on `dst_memory`.
+typedef PJRT_Error* PJRT_Buffer_CopyToMemory(
+    PJRT_Buffer_CopyToMemory_Args* args);
+
 struct PJRT_Buffer_IsOnCpu_Args {
   size_t struct_size;
   void* priv;
@@ -2033,10 +2048,12 @@ typedef struct {
   // corresponding places after each major version bump.
   _PJRT_API_STRUCT_FIELD(PJRT_Executable_OutputElementTypes);
   _PJRT_API_STRUCT_FIELD(PJRT_Executable_OutputDimensions);
+
+  _PJRT_API_STRUCT_FIELD(PJRT_Buffer_CopyToMemory);
 } PJRT_Api;
 
 const size_t PJRT_Api_STRUCT_SIZE =
-    PJRT_STRUCT_SIZE(PJRT_Api, PJRT_Executable_OutputDimensions);
+    PJRT_STRUCT_SIZE(PJRT_Api, PJRT_Buffer_CopyToMemory);
 
 #undef _PJRT_API_STRUCT_FIELD
 
diff --git a/third_party/xla/xla/pjrt/c/pjrt_c_api_wrapper_impl.cc b/third_party/xla/xla/pjrt/c/pjrt_c_api_wrapper_impl.cc
index c6b1d2345ee921..06a16a0aaeaee5 100644
--- a/third_party/xla/xla/pjrt/c/pjrt_c_api_wrapper_impl.cc
+++ b/third_party/xla/xla/pjrt/c/pjrt_c_api_wrapper_impl.cc
@@ -1616,7 +1616,19 @@ PJRT_Error* PJRT_Buffer_CopyToDevice(PJRT_Buffer_CopyToDevice_Args* args) {
       std::unique_ptr<xla::PjRtBuffer> dst_buffer,
       args->buffer->buffer->CopyToDevice(args->dst_device->device));
   args->dst_buffer =
-      new PJRT_Buffer{std::move(dst_buffer), args->buffer->client};
+      new PJRT_Buffer{std::move(dst_buffer), args->dst_device->client};
+  return nullptr;
+}
+
+PJRT_Error* PJRT_Buffer_CopyToMemory(PJRT_Buffer_CopyToMemory_Args* args) {
+  PJRT_RETURN_IF_ERROR(CheckMatchingStructSizes(
+      "PJRT_Buffer_CopyToMemory_Args",
+      PJRT_Buffer_CopyToMemory_Args_STRUCT_SIZE, args->struct_size));
+  PJRT_ASSIGN_OR_RETURN(
+      std::unique_ptr<xla::PjRtBuffer> dst_buffer,
+      args->buffer->buffer->CopyToMemorySpace(args->dst_memory->memory_space));
+  args->dst_buffer =
+      new PJRT_Buffer{std::move(dst_buffer), args->dst_memory->client};
   return nullptr;
 }
 
diff --git a/third_party/xla/xla/pjrt/c/pjrt_c_api_wrapper_impl.h b/third_party/xla/xla/pjrt/c/pjrt_c_api_wrapper_impl.h
index 7003f9c24ebac0..9620aa70454b79 100644
--- a/third_party/xla/xla/pjrt/c/pjrt_c_api_wrapper_impl.h
+++ b/third_party/xla/xla/pjrt/c/pjrt_c_api_wrapper_impl.h
@@ -302,6 +302,7 @@ PJRT_Error* PJRT_Buffer_Memory(PJRT_Buffer_Memory_Args* args);
 PJRT_Error* PJRT_Buffer_Delete(PJRT_Buffer_Delete_Args* args);
 PJRT_Error* PJRT_Buffer_IsDeleted(PJRT_Buffer_IsDeleted_Args* args);
 PJRT_Error* PJRT_Buffer_CopyToDevice(PJRT_Buffer_CopyToDevice_Args* args);
+PJRT_Error* PJRT_Buffer_CopyToMemory(PJRT_Buffer_CopyToMemory_Args* args);
 PJRT_Error* PJRT_Buffer_ToHostBuffer(PJRT_Buffer_ToHostBuffer_Args* args);
 PJRT_Error* PJRT_Buffer_IsOnCpu(PJRT_Buffer_IsOnCpu_Args* args);
 PJRT_Error* PJRT_Buffer_ReadyEvent(PJRT_Buffer_ReadyEvent_Args* args);
@@ -556,6 +557,8 @@ constexpr PJRT_Api CreatePjrtApi(
       pjrt::PJRT_Executable_OutputElementTypes,
       /*PJRT_Executable_OutputDimensions=*/
       pjrt::PJRT_Executable_OutputDimensions,
+      /*PJRT_Buffer_CopyToMemory=*/
+      pjrt::PJRT_Buffer_CopyToMemory,
   };
 }
 
diff --git a/third_party/xla/xla/pjrt/pjrt_c_api_client.cc b/third_party/xla/xla/pjrt/pjrt_c_api_client.cc
index e6940affd26482..50177ffcaa0c9c 100644
--- a/third_party/xla/xla/pjrt/pjrt_c_api_client.cc
+++ b/third_party/xla/xla/pjrt/pjrt_c_api_client.cc
@@ -1824,6 +1824,44 @@ StatusOr<std::unique_ptr<PjRtBuffer>> PjRtCApiBuffer::CopyToDevice(
   }
 }
 
+StatusOr<std::unique_ptr<PjRtBuffer>> PjRtCApiBuffer::CopyToMemorySpace(
+    PjRtMemorySpace* dst_memory) {
+  const PJRT_Api* api = pjrt_c_api();
+  // TODO(yueshengys): Remove this after 12/20/2023.
+  if (api->pjrt_api_version.minor_version < 32) {
+    return Unimplemented(
+        "The plugin's PJRT API version is older than 0.32 and does not "
+        "support CopyToMemorySpace");
+  }
+  if (dst_memory->client() == client_) {
+    PJRT_Buffer_CopyToMemory_Args args;
+    args.struct_size = PJRT_Buffer_CopyToMemory_Args_STRUCT_SIZE;
+    args.priv = nullptr;
+    args.buffer = buffer_.get();
+    args.dst_memory =
+        tensorflow::down_cast<PjRtCApiMemorySpace*>(dst_memory)->c_memory();
+    RETURN_STATUS_IF_PJRT_ERROR(api->PJRT_Buffer_CopyToMemory(&args), api);
+    return std::unique_ptr<PjRtBuffer>(
+        std::make_unique<PjRtCApiBuffer>(client_, args.dst_buffer));
+  } else {
+    // Copy across PjRtClients by copying through host
+    TF_ASSIGN_OR_RETURN(std::shared_ptr<Literal> literal, ToLiteralSync());
+    absl::InlinedVector<int64_t, 4> byte_strides(
+        literal->shape().dimensions_size());
+    TF_RETURN_IF_ERROR(
+        ShapeUtil::ByteStrides(literal->shape(), absl::MakeSpan(byte_strides)));
+    // Avoid use-after-free on `literal` due to unsequenced move and use.
+    Literal* literal_pointer = literal.get();
+    return dst_memory->client()->BufferFromHostBuffer(
+        literal_pointer->untyped_data(),
+        literal_pointer->shape().element_type(),
+        literal_pointer->shape().dimensions(), byte_strides,
+        PjRtClient::HostBufferSemantics::kZeroCopy,
+        [literal{std::move(literal)}]() { /* frees literal */ }, dst_memory,
+        /*device_layout=*/nullptr);
+  }
+}
+
 bool PjRtCApiBuffer::IsOnCpu() const {
   PJRT_Buffer_IsOnCpu_Args args;
   args.struct_size = PJRT_Buffer_IsOnCpu_Args_STRUCT_SIZE;
diff --git a/third_party/xla/xla/pjrt/pjrt_c_api_client.h b/third_party/xla/xla/pjrt/pjrt_c_api_client.h
index ba8eae65d6ba68..4dbccb3faa0484 100644
--- a/third_party/xla/xla/pjrt/pjrt_c_api_client.h
+++ b/third_party/xla/xla/pjrt/pjrt_c_api_client.h
@@ -422,9 +422,7 @@ class PjRtCApiBuffer : public PjRtBuffer {
       PjRtDevice* dst_device) override;
 
   StatusOr<std::unique_ptr<PjRtBuffer>> CopyToMemorySpace(
-      PjRtMemorySpace* dst_memory_space) override {
-    return Unimplemented("PJRT C API does not support CopyToMemorySpace");
-  }
+      PjRtMemorySpace* dst_memory_space) override;
 
   void CopyToRemoteDevice(
       PjRtFuture<StatusOr<std::string>> serialized_descriptor,

From fa2eb221fcf899b0281da2b197f29b2c3dc780fb Mon Sep 17 00:00:00 2001
From: Francois Chollet <fchollet@google.com>
Date: Tue, 26 Sep 2023 21:23:26 -0700
Subject: [PATCH 304/567] Allow register_revived_type to override a previous
 registration.

This code is called at package loading and has been causing endless bugs, preventing TF from being imported altogether whenever another package in the environment has already called register_revived_type.

PiperOrigin-RevId: 568733482
---
 tensorflow/python/saved_model/revived_types.py      |  3 ---
 tensorflow/python/saved_model/revived_types_test.py | 12 ------------
 2 files changed, 15 deletions(-)

diff --git a/tensorflow/python/saved_model/revived_types.py b/tensorflow/python/saved_model/revived_types.py
index 458ab77e822927..cf31318b34487c 100644
--- a/tensorflow/python/saved_model/revived_types.py
+++ b/tensorflow/python/saved_model/revived_types.py
@@ -132,9 +132,6 @@ def register_revived_type(identifier, predicate, versions):
           f"type {identifier}.")
     version_numbers.add(registration.version)
 
-  if identifier in _REVIVED_TYPE_REGISTRY:
-    raise AssertionError(f"Duplicate registrations for type '{identifier}'")
-
   _REVIVED_TYPE_REGISTRY[identifier] = (predicate, versions)
   _TYPE_IDENTIFIERS.append(identifier)
 
diff --git a/tensorflow/python/saved_model/revived_types_test.py b/tensorflow/python/saved_model/revived_types_test.py
index bfaeaf2d18b9a4..3679ff7d6051f0 100644
--- a/tensorflow/python/saved_model/revived_types_test.py
+++ b/tensorflow/python/saved_model/revived_types_test.py
@@ -101,18 +101,6 @@ def test_min_producer_version(self):
                 bad_consumers=[])))
     self.assertEqual(3, deserialized.version)
 
-  def test_register_twice(self):
-    identifier = "foo"
-    predicate = lambda x: isinstance(x, int)
-    versions = [
-        revived_types.VersionedTypeRegistration(
-            object_factory=lambda _: 1,
-            version=1, min_producer_version=1,
-            min_consumer_version=1),
-    ]
-    revived_types.register_revived_type(identifier, predicate, versions)
-    with self.assertRaisesRegex(AssertionError, "Duplicate registrations"):
-      revived_types.register_revived_type(identifier, predicate, versions)
 
 if __name__ == "__main__":
   test.main()

From 150cc936e2dea2ead97fa88ac87975f9e6fbfa4f Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 26 Sep 2023 21:38:10 -0700
Subject: [PATCH 305/567] Internal Code Change

PiperOrigin-RevId: 568735481
---
 tensorflow/python/saved_model/tests/BUILD | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tensorflow/python/saved_model/tests/BUILD b/tensorflow/python/saved_model/tests/BUILD
index 46a61201aff953..2cccfe99bdad95 100644
--- a/tensorflow/python/saved_model/tests/BUILD
+++ b/tensorflow/python/saved_model/tests/BUILD
@@ -5,7 +5,6 @@ load("//tensorflow:tensorflow.default.bzl", "tf_py_strict_test")
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
-    default_visibility = ["//tensorflow:internal"],
     licenses = ["notice"],
 )
 

From 7e330ddc4071ad5e25746ffe53aa211845c4f44d Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 26 Sep 2023 22:47:30 -0700
Subject: [PATCH 306/567] Remove NCCL workaround

With the upgrade to LLVM-17 in our toolchain
this workaround isn't needed anymore.

PiperOrigin-RevId: 568746859
---
 tensorflow/dtensor/python/tests/BUILD | 2 --
 1 file changed, 2 deletions(-)

diff --git a/tensorflow/dtensor/python/tests/BUILD b/tensorflow/dtensor/python/tests/BUILD
index 801bbb267be40d..dcf1460cf84fd9 100644
--- a/tensorflow/dtensor/python/tests/BUILD
+++ b/tensorflow/dtensor/python/tests/BUILD
@@ -389,7 +389,6 @@ dtensor_test(
     env = {
         "DTENSOR_GPU_USE_NCCL_COMMUNICATION": "1",
         "NCCL_P2P_DISABLE": "1",  # FIXME(b/251183104): p2p detection in cuda 10.1+ is broken.
-        "NCCL_PROTO": "Simple",  # FIXME(b/272050398): Delete this after the migration to LLVM-17.
     },
     tags = [
         "no_windows",
@@ -435,7 +434,6 @@ dtensor_test(
     env = {
         "DTENSOR_GPU_USE_NCCL_COMMUNICATION": "1",
         "NCCL_P2P_DISABLE": "1",  # FIXME(b/251183104): p2p detection in cuda 10.1+ is broken.
-        "NCCL_PROTO": "Simple",  # FIXME(b/272050398): Delete this after the migration to LLVM-17.
     },
     tags = [
         "no_windows",

From 18354a89e9fe8c91e2944e0e20cb0235805f36af Mon Sep 17 00:00:00 2001
From: sushreebarsa <84765720+sushreebarsa@users.noreply.github.com>
Date: Wed, 27 Sep 2023 11:45:19 +0530
Subject: [PATCH 307/567] Update run_models.py

---
 tensorflow/python/compiler/tensorrt/model_tests/run_models.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/python/compiler/tensorrt/model_tests/run_models.py b/tensorflow/python/compiler/tensorrt/model_tests/run_models.py
index e1e8556a61aedb..8af5b35af9d03b 100644
--- a/tensorflow/python/compiler/tensorrt/model_tests/run_models.py
+++ b/tensorflow/python/compiler/tensorrt/model_tests/run_models.py
@@ -105,7 +105,7 @@
 flags.DEFINE_enum("output_format", "CSV", ["CSV", "JSON"],
                   "Output format of analysis results.")
 
-DEFAULT_TRT_CONVERT_PARAMS = trt.DEFAULT_TRT_CONVERSION_PARAMS
+DEFAUL_TRT_CONVERT_PARAMS = trt.DEFAULT_TRT_CONVERSION_PARAMS
 
 
 # pylint: disable=bad-whitespace

From cc279072e16a2a51fe41f423716aeccadaeda3e6 Mon Sep 17 00:00:00 2001
From: sushreebarsa <84765720+sushreebarsa@users.noreply.github.com>
Date: Wed, 27 Sep 2023 11:51:38 +0530
Subject: [PATCH 308/567] Update debug_events_writer_test.py

---
 tensorflow/python/debug/lib/debug_events_writer_test.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/python/debug/lib/debug_events_writer_test.py b/tensorflow/python/debug/lib/debug_events_writer_test.py
index e7e6a71bf650b1..1c2f11dff72f83 100644
--- a/tensorflow/python/debug/lib/debug_events_writer_test.py
+++ b/tensorflow/python/debug/lib/debug_events_writer_test.py
@@ -809,7 +809,7 @@ def testDebuggedGraphToJonsWithNameAndInnerOuterGraphIds(self):
       ("EmptyList", []),
       ("None", None),
   )
-  def testGraphOpDigestWithNoOutputsReturnsNumOutputsZero(
+  def testGraphOpDigestWithNoOutpusReturnsNumOutputsZero(
       self, output_tensor_ids):
     op_creation_digest = debug_events_reader.GraphOpCreationDigest(
         1234,

From 7dbcb2cd968a7b53848f9a7b1ed4f465436ddb9f Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 26 Sep 2023 23:25:55 -0700
Subject: [PATCH 309/567] Remove PjRt C API check in `PjRtArray::Reshard` since
 `PjRtCApiBuffer::CopyToMemorySpace` has been supported.

PiperOrigin-RevId: 568754787
---
 third_party/xla/xla/python/pjrt_ifrt/pjrt_array.cc | 11 +----------
 third_party/xla/xla/python/xla_client.py           |  2 +-
 2 files changed, 2 insertions(+), 11 deletions(-)

diff --git a/third_party/xla/xla/python/pjrt_ifrt/pjrt_array.cc b/third_party/xla/xla/python/pjrt_ifrt/pjrt_array.cc
index 205c376246ad38..4bea6712820b7e 100644
--- a/third_party/xla/xla/python/pjrt_ifrt/pjrt_array.cc
+++ b/third_party/xla/xla/python/pjrt_ifrt/pjrt_array.cc
@@ -21,9 +21,7 @@ limitations under the License.
 #include <utility>
 #include <vector>
 
-#include "absl/container/inlined_vector.h"
 #include "absl/log/check.h"
-#include "absl/strings/match.h"
 #include "absl/strings/str_join.h"
 #include "xla/literal.h"
 #include "xla/pjrt/pjrt_client.h"
@@ -346,17 +344,10 @@ StatusOr<tsl::RCReference<Array>> PjRtArray::Reshard(
   // (e.g., ifrt proxy) support memories.
   bool new_sharding_has_memory_kind =
       new_sharding->memory_kind().memory_kind().has_value();
-  // TODO(yueshengys): Remove the check on PjRt C API after `CopyToMemorySpace`
-  // is supported.
-  CHECK_GT(new_sharding->devices().size(), 0);
-  bool using_c_api = absl::StrContains(
-      new_sharding->devices().front()->client()->platform_version(),
-      "PJRT C API");
   for (int i = 0; i < pjrt_buffers_.size(); ++i) {
     bool devices_equal =
         pjrt_buffers_[i]->device() == new_sharding->devices()[i];
-    bool memories_supported =
-        !using_c_api && pjrt_buffers_[i]->memory_space() != nullptr;
+    bool memories_supported = pjrt_buffers_[i]->memory_space() != nullptr;
     bool memory_kind_equal =
         new_sharding_has_memory_kind && memories_supported &&
         pjrt_buffers_[i]->memory_space()->memory_space_kind() ==
diff --git a/third_party/xla/xla/python/xla_client.py b/third_party/xla/xla/python/xla_client.py
index e0d48c09120db1..4885579643cb51 100644
--- a/third_party/xla/xla/python/xla_client.py
+++ b/third_party/xla/xla/python/xla_client.py
@@ -45,7 +45,7 @@
 
 # Just an internal arbitrary increasing number to help with backward-compatible
 # changes. In JAX, reference this via jax._src.lib.xla_extension_version.
-_version = 198
+_version = 199
 
 # Version number for MLIR:Python components.
 mlir_api_version = 54

From 56f579e0438e79fc39f903a90f8a2bc55b914c70 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 27 Sep 2023 00:47:23 -0700
Subject: [PATCH 310/567] Internal Code Change

PiperOrigin-RevId: 568769851
---
 tensorflow/core/tfrt/mlrt/attribute/BUILD | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tensorflow/core/tfrt/mlrt/attribute/BUILD b/tensorflow/core/tfrt/mlrt/attribute/BUILD
index 58765a1ff2c0d9..7190d1e4f742b8 100644
--- a/tensorflow/core/tfrt/mlrt/attribute/BUILD
+++ b/tensorflow/core/tfrt/mlrt/attribute/BUILD
@@ -10,7 +10,6 @@ package(
         # copybara:uncomment "//smartass/brain/ops/tfrt_kernels:__subpackages__",
         "//tensorflow/compiler/mlir/tfrt/transforms/mlrt:__subpackages__",
         "//tensorflow/compiler/mlir/tfrt/translate/mlrt:__subpackages__",
-        "//tensorflow/core/tfrt:__subpackages__",
     ],
 )
 

From 0b86e66216e547c8dd5c825457ecbd54b40c8ffe Mon Sep 17 00:00:00 2001
From: pemeliya <141146080+pemeliya@users.noreply.github.com>
Date: Wed, 27 Sep 2023 01:20:39 -0700
Subject: [PATCH 311/567] PR #5914: [ROCm] fixing build errors for ROCm nightly
 tests

Imported from GitHub PR https://github.com/openxla/xla/pull/5914

This commit fixes current build errors for ROCm platform.

@akuegel @ddunl @ezhulenev : can someone of you check this commit please ?

Here I have addressed the issues mentioned by Eugene from https://github.com/openxla/xla/pull/5867
about stream_executor targets.

Copybara import of the project:

--
307781ca50e80fc870c9b2593458a25ea7c2321a by Pavel Emeliyanenko <pavel.emeliyanenko@amd.com>:

fixing build errors for ROCm nightly tests

Merging this change closes #5914

PiperOrigin-RevId: 568775978
---
 third_party/xla/xla/stream_executor/rocm/BUILD             | 7 +------
 .../xla/xla/stream_executor/rocm/rocm_gpu_executor.cc      | 6 ++++--
 2 files changed, 5 insertions(+), 8 deletions(-)

diff --git a/third_party/xla/xla/stream_executor/rocm/BUILD b/third_party/xla/xla/stream_executor/rocm/BUILD
index fe414f1c4d4255..e28a71f07e712c 100644
--- a/third_party/xla/xla/stream_executor/rocm/BUILD
+++ b/third_party/xla/xla/stream_executor/rocm/BUILD
@@ -110,7 +110,6 @@ cc_library(
         "@com_google_absl//absl/strings",
         "//xla/stream_executor",
         "//xla/stream_executor:plugin_registry",
-        "//xla/stream_executor:stream_executor_internal",
         "//xla/stream_executor/gpu:gpu_activation_header",
         "//xla/stream_executor/gpu:gpu_event",
         "//xla/stream_executor/gpu:gpu_kernel_header",
@@ -158,8 +157,6 @@ cc_library(
         "@com_google_absl//absl/base",
         "@com_google_absl//absl/memory",
         "//xla/stream_executor",  # buildcleaner: keep
-        "//xla/stream_executor:multi_platform_manager",
-        "//xla/stream_executor:stream_executor_pimpl_header",
         "//xla/stream_executor/platform",
     ]),
     alwayslink = True,  # Registers itself with the MultiPlatformManager.
@@ -312,12 +309,10 @@ cc_library(
         ":rocm_gpu_executor",
         ":rocm_platform_id",
         "@eigen_archive//:eigen3",
+        "//xla/stream_executor",
         "//xla/stream_executor:dnn",
-        "//xla/stream_executor:event",
         "//xla/stream_executor:plugin_registry",
         "//xla/stream_executor:scratch_allocator",
-        "//xla/stream_executor:stream_executor_pimpl",
-        "//xla/stream_executor:temporary_device_memory",
         "//xla/stream_executor/gpu:gpu_activation_header",
         "//xla/stream_executor/gpu:gpu_stream_header",
         "//xla/stream_executor/gpu:gpu_timer_header",
diff --git a/third_party/xla/xla/stream_executor/rocm/rocm_gpu_executor.cc b/third_party/xla/xla/stream_executor/rocm/rocm_gpu_executor.cc
index 1443714e623719..861637fff5ea7a 100644
--- a/third_party/xla/xla/stream_executor/rocm/rocm_gpu_executor.cc
+++ b/third_party/xla/xla/stream_executor/rocm/rocm_gpu_executor.cc
@@ -727,8 +727,10 @@ GpuExecutor::GetStreamImplementation() {
 
 tsl::StatusOr<std::unique_ptr<internal::CommandBufferInterface>>
 GpuExecutor::GetCommandBufferImplementation() {
-  return std::unique_ptr<internal::CommandBufferInterface>(
-      new GpuCommandBuffer());
+  VLOG(2) << "Create ROCm command buffer (ROCm graph)";
+  GpuGraphHandle graph = nullptr;
+  TF_RETURN_IF_ERROR(GpuDriver::CreateGraph(&graph));
+  return std::make_unique<GpuCommandBuffer>(this, graph);
 }
 
 void* GpuExecutor::GpuContextHack() { return context_; }

From 067c77203fa942f9bb27e63f0d2cb1954552c408 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 27 Sep 2023 02:03:08 -0700
Subject: [PATCH 312/567] compat: Update forward compatibility horizon to
 2023-09-27

PiperOrigin-RevId: 568783473
---
 tensorflow/python/compat/compat.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/python/compat/compat.py b/tensorflow/python/compat/compat.py
index bc3240a9edd402..356df7c31d74e6 100644
--- a/tensorflow/python/compat/compat.py
+++ b/tensorflow/python/compat/compat.py
@@ -29,7 +29,7 @@
 # This value changes every day with an automatic CL. It can be modified in code
 # via `forward_compatibility_horizon()` or with the environment variable
 # TF_FORWARD_COMPATIBILITY_DELTA_DAYS, which is added to the compatibility date.
-_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2023, 9, 26)
+_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2023, 9, 27)
 _FORWARD_COMPATIBILITY_DELTA_DAYS_VAR_NAME = "TF_FORWARD_COMPATIBILITY_DELTA_DAYS"
 _FORWARD_COMPATIBILITY_DATE_NUMBER = None
 

From 757c6a5750faf344e3daf2c41f1319f17971af42 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 27 Sep 2023 02:06:56 -0700
Subject: [PATCH 313/567] Update GraphDef version to 1632.

PiperOrigin-RevId: 568784455
---
 tensorflow/core/public/version.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/core/public/version.h b/tensorflow/core/public/version.h
index 06e48524defcb2..c65ab6b9620b8e 100644
--- a/tensorflow/core/public/version.h
+++ b/tensorflow/core/public/version.h
@@ -108,7 +108,7 @@ limitations under the License.
 
 #define TF_GRAPH_DEF_VERSION_MIN_PRODUCER 0
 #define TF_GRAPH_DEF_VERSION_MIN_CONSUMER 0
-#define TF_GRAPH_DEF_VERSION 1631  // Updated: 2023/9/26
+#define TF_GRAPH_DEF_VERSION 1632  // Updated: 2023/9/27
 
 // Checkpoint compatibility versions (the versions field in SavedSliceMeta).
 //

From 439e612a74defe4a0d6e76886e43f8890a32a44e Mon Sep 17 00:00:00 2001
From: Adrian Kuegel <akuegel@google.com>
Date: Wed, 27 Sep 2023 03:06:31 -0700
Subject: [PATCH 314/567] Extract Scheduling and PostScheduling pipelines out
 of CompileModuleToLLVMIR (NFC).

This allows these parts to be tested separately from lowering to LLVM IR.
Adjust the .hlo tests to start with scheduled modules.

PiperOrigin-RevId: 568795746
---
 third_party/xla/xla/service/gpu/BUILD         | 19 ++---
 .../service/gpu/compile_module_to_llvm_ir.cc  | 77 ++---------------
 .../service/gpu/compile_module_to_llvm_ir.h   |  8 +-
 .../xla/xla/service/gpu/gpu_compiler.cc       | 78 ++++++++++++++++-
 .../xla/xla/service/gpu/gpu_compiler.h        |  3 +
 .../xla/xla/service/gpu/gpu_hlo_schedule.h    |  2 +-
 third_party/xla/xla/service/gpu/tests/BUILD   |  1 +
 .../xla/xla/service/gpu/tests/add_preds.hlo   |  2 +-
 .../service/gpu/tests/calling_convention.hlo  |  4 +-
 .../xla/xla/service/gpu/tests/concat.hlo      |  2 +-
 .../xla/xla/service/gpu/tests/constant.hlo    | 10 ++-
 .../xla/xla/service/gpu/tests/copy.hlo        | 10 ++-
 .../xla/xla/service/gpu/tests/copy_nested.hlo |  9 +-
 .../tests/dynamic_update_slice_inplace.hlo    | 14 ++--
 .../tests/element_wise_row_vectorization.hlo  | 22 ++---
 .../xla/service/gpu/tests/fused_scatter.hlo   | 27 ++++--
 .../xla/xla/service/gpu/tests/fused_slice.hlo |  2 +-
 .../tests/fused_slice_different_operands.hlo  |  2 +-
 .../xla/xla/service/gpu/tests/fusion.hlo      |  2 +-
 .../xla/service/gpu/tests/hlo_to_llvm_ir.cc   | 25 +++---
 .../xla/service/gpu/tests/kernel_reuse.hlo    | 14 ++--
 .../service/gpu/tests/launch_dimensions.hlo   | 34 +++++---
 .../xla/service/gpu/tests/pad_to_static.hlo   |  2 +-
 .../xla/service/gpu/tests/reduce_unnested.hlo | 61 ++++++++++----
 .../tests/reduction_vectorization_sm_all.hlo  | 84 +++++++++++++++----
 .../gpu/tests/rng_get_and_update_state.hlo    |  2 +-
 .../xla/xla/service/gpu/tests/scatter.hlo     |  8 +-
 .../service/gpu/tests/select_and_scatter.hlo  |  2 +-
 .../service/gpu/tests/single_instruction.hlo  | 36 ++++++--
 .../service/gpu/tests/slice_to_dynamic.hlo    |  2 +-
 .../xla/xla/service/gpu/tests/sorting.hlo     |  4 +-
 .../xla/service/gpu/tests/triton_naming.hlo   |  2 +-
 32 files changed, 361 insertions(+), 209 deletions(-)

diff --git a/third_party/xla/xla/service/gpu/BUILD b/third_party/xla/xla/service/gpu/BUILD
index 075e103eaf2b59..914070c60edba2 100644
--- a/third_party/xla/xla/service/gpu/BUILD
+++ b/third_party/xla/xla/service/gpu/BUILD
@@ -2461,12 +2461,9 @@ cc_library(
     deps = [
         ":buffer_sharing",
         ":executable_proto_cc",
-        ":fusion_wrapper",
         ":gpu_constants",
-        ":gpu_convert_async_collectives_to_sync",
         ":gpu_device_info",
         ":gpu_executable",
-        ":gpu_hlo_schedule",
         ":ir_emitter_context",
         ":ir_emitter_unnested",
         ":metrics",
@@ -2482,15 +2479,11 @@ cc_library(
         "//xla/mlir/backends/gpu2/transforms:passes",
         "//xla/mlir/runtime/transforms:compilation_pipeline_gpu",
         "//xla/mlir_hlo:transforms_gpu_passes",
-        "//xla/service:bitcast_dtypes_expander",
         "//xla/service:buffer_assignment",
+        "//xla/service:buffer_value",
         "//xla/service:dump",
-        "//xla/service:hlo_cost_analysis",
         "//xla/service:hlo_dataflow_analysis",
-        "//xla/service:hlo_pass_pipeline",
         "//xla/service:hlo_proto_cc",
-        "//xla/service:hlo_rematerialization",
-        "//xla/service:optimization_barrier_expander",
         "//xla/service/llvm_ir:llvm_util",
         "//xla/stream_executor",
         "//xla/stream_executor:device_description",
@@ -2601,12 +2594,14 @@ cc_library(
         ":dot_dimension_sorter",
         ":executable_proto_cc",
         ":fusion_merger",
+        ":fusion_wrapper",
         ":gemm_broadcast_folding_rewriter",
-        ":gemm_rewriter_triton",
         ":gemm_rewriter",
+        ":gemm_rewriter_triton",
         ":gpu_async_collective_annotator",
         ":gpu_constants",
         ":gpu_conv_rewriter",
+        ":gpu_convert_async_collectives_to_sync",
         ":gpu_cost_model_stats_collection",
         ":gpu_device_info",
         ":gpu_executable",
@@ -2644,8 +2639,8 @@ cc_library(
         ":tree_reduction_rewriter",
         ":variadic_op_splitter",
         "@com_google_absl//absl/container:flat_hash_set",
+        "@com_google_absl//absl/log",
         "@com_google_absl//absl/log:check",
-        "@com_google_absl//absl/log:log",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/types:variant",
         "@llvm-project//llvm:AsmParser",
@@ -2711,8 +2706,8 @@ cc_library(
         "//xla/service:hlo_dataflow_analysis",
         "//xla/service:hlo_dce",
         "//xla/service:hlo_module_config",
-        "//xla/service:hlo_pass_pipeline",
         "//xla/service:hlo_pass",
+        "//xla/service:hlo_pass_pipeline",
         "//xla/service:hlo_proto_cc",
         "//xla/service:hlo_rematerialization",
         "//xla/service:hlo_verifier",
@@ -2753,8 +2748,8 @@ cc_library(
         "//xla/service/llvm_ir:llvm_util",
         "//xla/service/spmd:collective_permute_motion",
         "//xla/service/spmd:stateful_rng_spmd_partitioner",
-        "//xla/stream_executor:device_description_proto_cc_impl",
         "//xla/stream_executor",
+        "//xla/stream_executor:device_description_proto_cc_impl",
         "//xla/stream_executor/cuda:cuda_platform_id",
         "//xla/translate/hlo_to_mhlo:hlo_utils",
         "//xla/translate/mhlo_to_hlo:location_exporter",
diff --git a/third_party/xla/xla/service/gpu/compile_module_to_llvm_ir.cc b/third_party/xla/xla/service/gpu/compile_module_to_llvm_ir.cc
index 4652e3ac349b02..574bfaf63aa5c6 100644
--- a/third_party/xla/xla/service/gpu/compile_module_to_llvm_ir.cc
+++ b/third_party/xla/xla/service/gpu/compile_module_to_llvm_ir.cc
@@ -48,29 +48,22 @@ limitations under the License.
 #include "xla/mlir/backends/gpu2/transforms/passes.h"
 #include "xla/mlir/runtime/transforms/compilation_pipeline_gpu.h"
 #include "xla/mlir_hlo/transforms/gpu_passes.h"
-#include "xla/service/bitcast_dtypes_expander.h"
 #include "xla/service/buffer_assignment.h"
+#include "xla/service/buffer_value.h"
 #include "xla/service/dump.h"
 #include "xla/service/gpu/buffer_sharing.h"
 #include "xla/service/gpu/conditional_thunk.h"
 #include "xla/service/gpu/for_thunk.h"
-#include "xla/service/gpu/fusion_wrapper.h"
 #include "xla/service/gpu/gpu_constants.h"
-#include "xla/service/gpu/gpu_convert_async_collectives_to_sync.h"
 #include "xla/service/gpu/gpu_device_info.h"
 #include "xla/service/gpu/gpu_executable.h"
-#include "xla/service/gpu/gpu_hlo_schedule.h"
 #include "xla/service/gpu/ir_emitter_context.h"
 #include "xla/service/gpu/ir_emitter_unnested.h"
 #include "xla/service/gpu/metrics.h"
 #include "xla/service/gpu/sequential_thunk.h"
 #include "xla/service/gpu/while_thunk.h"
-#include "xla/service/hlo_cost_analysis.h"
 #include "xla/service/hlo_dataflow_analysis.h"
-#include "xla/service/hlo_pass_pipeline.h"
-#include "xla/service/hlo_rematerialization.h"
 #include "xla/service/llvm_ir/llvm_util.h"
-#include "xla/service/optimization_barrier_expander.h"
 #include "xla/shape.h"
 #include "xla/status.h"
 #include "xla/status_macros.h"
@@ -253,12 +246,13 @@ StatusOr<std::unique_ptr<llvm::Module>> CompileModuleToLlvmIr(
     HloModule* hlo_module, llvm::LLVMContext* llvm_context,
     const std::string& target_triple, const std::string& data_layout,
     const std::string& platform_name, const se::Platform::Id platform_id,
-    GpuDeviceInfo gpu_device_info, int pointer_size) {
+    GpuDeviceInfo gpu_device_info,
+    const BufferValue::SizeFunction& buffer_size_bytes_function) {
   CompileModuleResults results;
   TF_RETURN_IF_ERROR(CompileModuleToLlvmIrImpl(
       hlo_module, llvm_context, target_triple, data_layout, platform_name,
-      platform_id, gpu_device_info, &CanShareBufferHint, pointer_size,
-      &results));
+      platform_id, gpu_device_info, &CanShareBufferHint,
+      buffer_size_bytes_function, &results));
   return std::move(results.llvm_module);
 }
 
@@ -345,69 +339,12 @@ Status CompileModuleToLlvmIrImpl(
     const std::string& platform_name, se::Platform::Id platform_id,
     GpuDeviceInfo gpu_device_info,
     const HloDataflowAnalysis::CanShareBuffer& can_share_buffer_function,
-    int pointer_size, CompileModuleResults* results,
-    se::StreamExecutor* stream_exec) {
+    const BufferValue::SizeFunction& buffer_size_bytes_function,
+    CompileModuleResults* results, se::StreamExecutor* stream_exec) {
   results->llvm_module = std::make_unique<llvm::Module>("", *llvm_context);
   results->llvm_module->setTargetTriple(target_triple);
   results->llvm_module->setDataLayout(data_layout);
 
-  const int64_t scheduler_mem_limit =
-      GetSchedulerMemoryLimit(hlo_module, gpu_device_info, pointer_size);
-  TF_RETURN_IF_ERROR(
-      ScheduleGpuModule(hlo_module, pointer_size, scheduler_mem_limit));
-  {
-    HloPassPipeline pipeline("post-scheduling-passes");
-
-    HloPredicate is_nop =
-        HloPredicateIsOp<HloOpcode::kParameter, HloOpcode::kConstant,
-                         HloOpcode::kBitcast, HloOpcode::kGetTupleElement>;
-    pipeline.AddPass<GpuConvertAsyncCollectivesToSync>(is_nop);
-    pipeline.AddPass<OptimizationBarrierExpander>();
-
-    TF_RETURN_IF_ERROR(pipeline.Run(hlo_module).status());
-  }
-
-  {
-    HloPassPipeline pipeline("remat-pipeline");
-
-    auto shape_size_func = [pointer_size](const Shape& shape) {
-      return GetSizeOfShape(shape, pointer_size);
-    };
-    HloCostAnalysis hlo_cost_analysis(shape_size_func);
-    HloRematerialization::RematerializationModeConfig
-        rematerialization_mode_config(/*recompute=*/true, /*compress=*/true,
-                                      /*host_offload=*/false);
-    HloRematerialization::Options options(
-        hlo_cost_analysis, rematerialization_mode_config,
-        // Assume 75% of the total device memory is available for XLA.
-        /*memory_limit_bytes=*/scheduler_mem_limit,
-        /*block_size_limit=*/1, /*block_rematerialization_factor=*/1,
-        /*min_remat_size=*/0, /*compact_shape_function=*/nullptr,
-        /*host_memory_offload_config=*/std::nullopt);
-    HloRematerialization::RematerializationSizes sizes;
-    pipeline.AddPass<HloRematerialization>(options, sizes);
-
-    TF_ASSIGN_OR_RETURN(bool changed, pipeline.Run(hlo_module));
-    if (changed) {
-      VLOG(1) << "HloRematerialization saved "
-              << sizes.before_bytes - sizes.after_bytes << " bytes";
-    }
-  }
-
-  {
-    HloPassPipeline pipeline("fusion-wrapper");
-    pipeline.AddPass<FusionWrapper>();
-    // Wrap remaining unfused ops that have no LHLO equivalent in single-op
-    // fusions. This needs to happen after rematerialization, because that will
-    // insert additional copies.
-    TF_RETURN_IF_ERROR(pipeline.Run(hlo_module).status());
-  }
-
-  auto buffer_size_bytes_function =
-      [pointer_size](const BufferValue& buffer_value) -> int64_t {
-    return GetSizeOfShape(buffer_value.shape(), pointer_size);
-  };
-
   TF_ASSIGN_OR_RETURN(
       results->buffer_assignment,
       BufferAssigner::Run(
diff --git a/third_party/xla/xla/service/gpu/compile_module_to_llvm_ir.h b/third_party/xla/xla/service/gpu/compile_module_to_llvm_ir.h
index a3e4518760516a..b757ca13bef35b 100644
--- a/third_party/xla/xla/service/gpu/compile_module_to_llvm_ir.h
+++ b/third_party/xla/xla/service/gpu/compile_module_to_llvm_ir.h
@@ -23,6 +23,7 @@ limitations under the License.
 #include <vector>
 
 #include "xla/hlo/ir/hlo_module.h"
+#include "xla/service/buffer_value.h"
 #include "xla/service/gpu/executable.pb.h"
 #include "xla/service/gpu/gpu_device_info.h"
 #include "xla/service/gpu/gpu_executable.h"
@@ -64,7 +65,8 @@ StatusOr<std::unique_ptr<llvm::Module>> CompileModuleToLlvmIr(
     HloModule* hlo_module, llvm::LLVMContext* llvm_context,
     const std::string& target_triple, const std::string& data_layout,
     const std::string& platform_name, se::Platform::Id platform_id,
-    GpuDeviceInfo gpu_device_info, int pointer_size);
+    GpuDeviceInfo gpu_device_info,
+    const BufferValue::SizeFunction& buffer_size_bytes_function);
 
 Status CompileModuleToLlvmIrImpl(
     HloModule* hlo_module, llvm::LLVMContext* llvm_context,
@@ -72,8 +74,8 @@ Status CompileModuleToLlvmIrImpl(
     const std::string& platform_name, se::Platform::Id platform_id,
     GpuDeviceInfo gpu_device_info,
     const HloDataflowAnalysis::CanShareBuffer& can_share_buffer_function,
-    int pointer_size, CompileModuleResults* results,
-    se::StreamExecutor* stream_exec = nullptr);
+    const BufferValue::SizeFunction& buffer_size_bytes_function,
+    CompileModuleResults* results, se::StreamExecutor* stream_exec = nullptr);
 
 }  // namespace gpu
 }  // namespace xla
diff --git a/third_party/xla/xla/service/gpu/gpu_compiler.cc b/third_party/xla/xla/service/gpu/gpu_compiler.cc
index c6b64525105223..83b904ff7639a8 100644
--- a/third_party/xla/xla/service/gpu/gpu_compiler.cc
+++ b/third_party/xla/xla/service/gpu/gpu_compiler.cc
@@ -95,12 +95,14 @@ limitations under the License.
 #include "xla/service/gpu/copy_fusion.h"
 #include "xla/service/gpu/dot_dimension_sorter.h"
 #include "xla/service/gpu/fusion_pipeline.h"
+#include "xla/service/gpu/fusion_wrapper.h"
 #include "xla/service/gpu/gemm_broadcast_folding_rewriter.h"
 #include "xla/service/gpu/gemm_rewriter.h"
 #include "xla/service/gpu/gemm_rewriter_triton.h"
 #include "xla/service/gpu/gpu_async_collective_annotator.h"
 #include "xla/service/gpu/gpu_constants.h"
 #include "xla/service/gpu/gpu_conv_rewriter.h"
+#include "xla/service/gpu/gpu_convert_async_collectives_to_sync.h"
 #include "xla/service/gpu/gpu_cost_model_stats_collection.h"
 #include "xla/service/gpu/gpu_device_info.h"
 #include "xla/service/gpu/gpu_executable.h"
@@ -141,12 +143,14 @@ limitations under the License.
 #include "xla/service/hlo_module_config.h"
 #include "xla/service/hlo_pass_fix.h"
 #include "xla/service/hlo_pass_pipeline.h"
+#include "xla/service/hlo_rematerialization.h"
 #include "xla/service/hlo_verifier.h"
 #include "xla/service/layout_normalization.h"
 #include "xla/service/llvm_ir/llvm_util.h"
 #include "xla/service/logistic_expander.h"
 #include "xla/service/loop_schedule_linearizer.h"
 #include "xla/service/operand_upcaster.h"
+#include "xla/service/optimization_barrier_expander.h"
 #include "xla/service/qr_expander.h"
 #include "xla/service/real_imag_expander.h"
 #include "xla/service/reduce_decomposer.h"
@@ -1478,11 +1482,18 @@ StatusOr<std::unique_ptr<Executable>> GpuCompiler::RunBackend(
     }
   }
 
+  const int64_t scheduler_mem_limit =
+      GetSchedulerMemoryLimit(module.get(), gpu_device_info, pointer_size_);
+  TF_RETURN_IF_ERROR(
+      ScheduleGpuModule(module.get(), pointer_size_, scheduler_mem_limit));
+  TF_RETURN_IF_ERROR(
+      RunPostSchedulingPipelines(module.get(), scheduler_mem_limit));
+
   CompileModuleResults compile_module_results;
   TF_RETURN_IF_ERROR(CompileModuleToLlvmIrImpl(
       module.get(), &llvm_context, target_triple_, data_layout_,
       stream_exec->platform()->Name(), stream_exec->platform()->id(),
-      gpu_device_info, GetCanShareBuffer(), pointer_size_,
+      gpu_device_info, GetCanShareBuffer(), BufferSizeBytesFunction(),
       &compile_module_results, stream_exec));
 
   if (user_pre_optimization_hook_) {
@@ -1595,6 +1606,16 @@ GpuCompiler::CompileAheadOfTime(std::unique_ptr<HloModuleGroup> module_group,
   for (const auto& module : modules) {
     llvm::LLVMContext llvm_context;
 
+    const int64_t scheduler_mem_limit = GetSchedulerMemoryLimit(
+        module.get(),
+        gpu_target_config != nullptr ? gpu_target_config->gpu_device_info
+                                     : GetGpuDeviceInfo(options.executor()),
+        pointer_size_);
+    TF_RETURN_IF_ERROR(
+        ScheduleGpuModule(module.get(), pointer_size_, scheduler_mem_limit));
+    TF_RETURN_IF_ERROR(
+        RunPostSchedulingPipelines(module.get(), scheduler_mem_limit));
+
     // Compile the module
     CompileModuleResults compile_module_results;
 
@@ -1603,15 +1624,15 @@ GpuCompiler::CompileAheadOfTime(std::unique_ptr<HloModuleGroup> module_group,
           module.get(), &llvm_context, target_triple_, data_layout_,
           gpu_target_config->platform_name, options.PlatformId(),
           gpu_target_config->gpu_device_info, GetCanShareBuffer(),
-          pointer_size_, &compile_module_results));
+          BufferSizeBytesFunction(), &compile_module_results));
     } else {
       CHECK(options.executor() != nullptr);
       auto stream_exec = options.executor();
       TF_RETURN_IF_ERROR(CompileModuleToLlvmIrImpl(
           module.get(), &llvm_context, target_triple_, data_layout_,
           stream_exec->platform()->Name(), options.PlatformId(),
-          GetGpuDeviceInfo(stream_exec), GetCanShareBuffer(), pointer_size_,
-          &compile_module_results));
+          GetGpuDeviceInfo(stream_exec), GetCanShareBuffer(),
+          BufferSizeBytesFunction(), &compile_module_results));
     }
     if (user_pre_optimization_hook_) {
       user_pre_optimization_hook_(*compile_module_results.llvm_module);
@@ -1724,5 +1745,54 @@ StatusOr<std::unique_ptr<AotCompilationResult>> GpuCompiler::Export(
   return result;
 }
 
+Status GpuCompiler::RunPostSchedulingPipelines(
+    HloModule* module, int64_t scheduler_mem_limit) const {
+  {
+    HloPassPipeline pipeline("post-scheduling-passes");
+
+    HloPredicate is_nop =
+        HloPredicateIsOp<HloOpcode::kParameter, HloOpcode::kConstant,
+                         HloOpcode::kBitcast, HloOpcode::kGetTupleElement>;
+    pipeline.AddPass<GpuConvertAsyncCollectivesToSync>(is_nop);
+    pipeline.AddPass<OptimizationBarrierExpander>();
+
+    TF_RETURN_IF_ERROR(pipeline.Run(module).status());
+  }
+
+  {
+    HloPassPipeline pipeline("remat-pipeline");
+
+    HloCostAnalysis hlo_cost_analysis(ShapeSizeBytesFunction());
+    HloRematerialization::RematerializationModeConfig
+        rematerialization_mode_config(/*recompute=*/true, /*compress=*/true,
+                                      /*host_offload=*/false);
+    HloRematerialization::Options options(
+        hlo_cost_analysis, rematerialization_mode_config,
+        // Assume 75% of the total device memory is available for XLA.
+        /*memory_limit_bytes=*/scheduler_mem_limit,
+        /*block_size_limit=*/1, /*block_rematerialization_factor=*/1,
+        /*min_remat_size=*/0, /*compact_shape_function=*/nullptr,
+        /*host_memory_offload_config=*/std::nullopt);
+    HloRematerialization::RematerializationSizes sizes;
+    pipeline.AddPass<HloRematerialization>(options, sizes);
+
+    TF_ASSIGN_OR_RETURN(bool changed, pipeline.Run(module));
+    if (changed) {
+      VLOG(1) << "HloRematerialization saved "
+              << sizes.before_bytes - sizes.after_bytes << " bytes";
+    }
+  }
+
+  {
+    HloPassPipeline pipeline("fusion-wrapper");
+    pipeline.AddPass<FusionWrapper>();
+    // Wrap remaining unfused ops that have no LHLO equivalent in single-op
+    // fusions. This needs to happen after rematerialization, because that will
+    // insert additional copies.
+    TF_RETURN_IF_ERROR(pipeline.Run(module).status());
+  }
+  return OkStatus();
+}
+
 }  // namespace gpu
 }  // namespace xla
diff --git a/third_party/xla/xla/service/gpu/gpu_compiler.h b/third_party/xla/xla/service/gpu/gpu_compiler.h
index bef6f651119e62..ad288243d9ee61 100644
--- a/third_party/xla/xla/service/gpu/gpu_compiler.h
+++ b/third_party/xla/xla/service/gpu/gpu_compiler.h
@@ -160,6 +160,9 @@ class GpuCompiler : public LLVMCompiler {
   StatusOr<std::unique_ptr<AotCompilationResult>> Export(
       Executable* executable) const override;
 
+  Status RunPostSchedulingPipelines(HloModule* module,
+                                    int64_t scheduler_mem_limit) const;
+
  protected:
   // During compilation with device, stream_exec != null and autotune_results
   // == null. During deviceless AOT compilation, stream_exec == null and
diff --git a/third_party/xla/xla/service/gpu/gpu_hlo_schedule.h b/third_party/xla/xla/service/gpu/gpu_hlo_schedule.h
index c9ecb5aa5048fc..df2237d266f275 100644
--- a/third_party/xla/xla/service/gpu/gpu_hlo_schedule.h
+++ b/third_party/xla/xla/service/gpu/gpu_hlo_schedule.h
@@ -26,7 +26,7 @@ int64_t GetSizeOfShape(const Shape& shape, int pointer_size);
 
 // Determines the schedule of HLO instructions for a module run on the GPU.
 Status ScheduleGpuModule(HloModule* module, int64_t pointer_size,
-                         int64_t memory_size);
+                         int64_t memory_limit);
 HloInstructionSequence PostProcessSchedule(const HloInstructionSequence& input);
 
 int64_t GetSchedulerMemoryLimit(const HloModule* module,
diff --git a/third_party/xla/xla/service/gpu/tests/BUILD b/third_party/xla/xla/service/gpu/tests/BUILD
index 06d9e3ed3bc01a..bf81778c117a18 100644
--- a/third_party/xla/xla/service/gpu/tests/BUILD
+++ b/third_party/xla/xla/service/gpu/tests/BUILD
@@ -725,6 +725,7 @@ xla_cc_binary(
         "//xla/service/gpu:compile_module_to_llvm_ir",
         "//xla/service/gpu:gpu_compiler",
         "//xla/service/gpu:gpu_device_info_for_tests",
+        "//xla/service/gpu:gpu_hlo_schedule",
         "//xla/service/gpu:target_constants",
         "//xla/service/gpu/llvm_gpu_backend",
         "//xla/stream_executor",
diff --git a/third_party/xla/xla/service/gpu/tests/add_preds.hlo b/third_party/xla/xla/service/gpu/tests/add_preds.hlo
index 48d939a9e02abe..af446aa619ec3c 100644
--- a/third_party/xla/xla/service/gpu/tests/add_preds.hlo
+++ b/third_party/xla/xla/service/gpu/tests/add_preds.hlo
@@ -23,7 +23,7 @@
 // CHECK:         store i8 %[[VAL_16]], ptr %[[VAL_17:.*]], align 1
 // CHECK:         br label %[[VAL_7]]
 
-HloModule xla_computation_f.8
+HloModule xla_computation_f.8, is_scheduled=true
 
 // Since the conversion to MLIR goes through completely different code paths
 // depending on whether an op is fused or not, this separately tests pred
diff --git a/third_party/xla/xla/service/gpu/tests/calling_convention.hlo b/third_party/xla/xla/service/gpu/tests/calling_convention.hlo
index e9ab9502fa7136..7729ba7cdcff07 100644
--- a/third_party/xla/xla/service/gpu/tests/calling_convention.hlo
+++ b/third_party/xla/xla/service/gpu/tests/calling_convention.hlo
@@ -10,12 +10,12 @@
 // CHECK-NOT: @buffer_for_dynamic
 // CHECK-NOT: @buffer_for_static
 
-HloModule SliceToDynamic
+HloModule SliceToDynamic, is_scheduled=true
 
 ENTRY main {
   %param = s32[2,2,2]{2,0,1} parameter(0)
-  %static = s32[] constant(2)
   %dynamic = s32[] constant(1)
+  %static = s32[] constant(2)
   ROOT %custom-call = s32[2,<=2, 2]{2,0,1} custom-call(s32[2,2,2]{2,0,1} %param,
                                                   s32[] %static,
                                                   s32[] %dynamic,
diff --git a/third_party/xla/xla/service/gpu/tests/concat.hlo b/third_party/xla/xla/service/gpu/tests/concat.hlo
index 4afff9293afdd3..9fbcca44675fe6 100644
--- a/third_party/xla/xla/service/gpu/tests/concat.hlo
+++ b/third_party/xla/xla/service/gpu/tests/concat.hlo
@@ -152,7 +152,7 @@
 // CHECK:         store half %[[VAL_119]], ptr %[[VAL_120]], align 2
 // CHECK:         br label %[[VAL_8]]
 
-HloModule module
+HloModule module, is_scheduled=true
 
 %fused_computation (param_0.1: f32[1000], param_1.2: f32[1000], param_2.3: f32[1000], param_3.4: f32[1000], param_4.5: f32[1000], param_5.6: f32[1000], param_6.7: f32[1000], param_7.8: f32[1000], param_8.9: f32[1000], param_9.10: f32[1000], param_10.11: f32[1000]) -> f16[11000] {
   %param_10.11 = f32[1000]{0} parameter(10)
diff --git a/third_party/xla/xla/service/gpu/tests/constant.hlo b/third_party/xla/xla/service/gpu/tests/constant.hlo
index 95aff75f76505d..5dc79d68cf8a00 100644
--- a/third_party/xla/xla/service/gpu/tests/constant.hlo
+++ b/third_party/xla/xla/service/gpu/tests/constant.hlo
@@ -1,11 +1,17 @@
 // RUN: hlo_to_llvm_ir %s | FileCheck %s
 
-HloModule Test
+HloModule Test, is_scheduled=true
+
+fused_computation {
+  param_0 = pred[2,2]{1,0} parameter(0)
+  param_1 = pred[2,2]{1,0} parameter(1)
+  ROOT xor.1 = pred[2,2]{1,0} xor(pred[2,2]{1,0} param_0, pred[2,2]{1,0} param_1)
+}
 
 ENTRY main {
 // CHECK: %{{.*}} = getelementptr inbounds i8, ptr %arg0, i32 %{{.*}}
 // CHECK: %{{.*}} = getelementptr inbounds i8, ptr %arg1, i32 %{{.*}}
   a = pred[2, 2]{1,0} constant({{false, true}, {true, false}})
   b = pred[2, 2]{1,0} constant({{false, true}, {false, true}})
-  ROOT xor = pred[2, 2]{1, 0} xor(a, b)
+  ROOT wrapped_xor = pred[2,2]{1,0} fusion(pred[2,2]{1,0} a, pred[2,2]{1,0} b), kind=kLoop, calls=fused_computation
 }
diff --git a/third_party/xla/xla/service/gpu/tests/copy.hlo b/third_party/xla/xla/service/gpu/tests/copy.hlo
index 2db3e42fc00f94..3b608d880ffbea 100644
--- a/third_party/xla/xla/service/gpu/tests/copy.hlo
+++ b/third_party/xla/xla/service/gpu/tests/copy.hlo
@@ -10,9 +10,15 @@
 
 // CHECK:         call void [[BARRIER]]
 
-HloModule Test
+HloModule Test, is_scheduled=true
+
+
+fused_computation {
+  param_0 = f32[100,200]{1,0} parameter(0)
+  ROOT b.1 = f32[100,200]{0,1} copy(f32[100,200]{1,0} param_0)
+}
 
 ENTRY main {
   a = f32[100, 200]{1,0} parameter(0)
-  ROOT b = f32[100, 200]{0,1} copy(a)
+  ROOT wrapped_b = f32[100,200]{0,1} fusion(f32[100,200]{1,0} a), kind=kInput, calls=fused_computation
 }
diff --git a/third_party/xla/xla/service/gpu/tests/copy_nested.hlo b/third_party/xla/xla/service/gpu/tests/copy_nested.hlo
index f1753617439674..d09d584eeaa0cd 100644
--- a/third_party/xla/xla/service/gpu/tests/copy_nested.hlo
+++ b/third_party/xla/xla/service/gpu/tests/copy_nested.hlo
@@ -72,9 +72,14 @@
 // CHECK:         store float %[[VAL_54]], ptr %[[VAL_55]], align 4
 // CHECK:         br label %[[VAL_2]]
 
-HloModule Test
+HloModule Test, is_scheduled=true
+
+fused_computation {
+  param_0 = f32[100,200,300]{2,1,0} parameter(0)
+  ROOT b.1 = f32[100,200,300]{2,0,1} copy(f32[100,200,300]{2,1,0} param_0)
+}
 
 ENTRY main {
   a = f32[100, 200, 300]{2,1,0} parameter(0)
-  ROOT b = f32[100, 200, 300]{2,0,1} copy(a)
+  ROOT wrapped_b = f32[100,200,300]{2,0,1} fusion(f32[100,200,300]{2,1,0} %a), kind=kLoop, calls=fused_computation
 }
diff --git a/third_party/xla/xla/service/gpu/tests/dynamic_update_slice_inplace.hlo b/third_party/xla/xla/service/gpu/tests/dynamic_update_slice_inplace.hlo
index 69c64e70409c70..0c56797d56c451 100644
--- a/third_party/xla/xla/service/gpu/tests/dynamic_update_slice_inplace.hlo
+++ b/third_party/xla/xla/service/gpu/tests/dynamic_update_slice_inplace.hlo
@@ -49,7 +49,7 @@
 // CHECK:         store half %[[VAL_39]], ptr %[[VAL_40]], align 2
 // CHECK:         br label %[[VAL_29]]
 
-HloModule TestModule
+HloModule TestModule, is_scheduled=true
 
 fusion.1 {
   p.0 = f16[50,96,1024]{2,1,0} parameter(0)
@@ -95,7 +95,7 @@ ENTRY entry {
 // CHECK:         store i32 %[[VAL_115]], ptr [[ADDRSPACE_ANNOTATION]]%[[RET_VALUE_ADDR]], align 4
 // CHECK:         br label %[[SLICE]]-after
 
-HloModule fusion
+HloModule fusion, is_scheduled=true
 
 fused_computation {
   param_0.1 = s32[6]{0} parameter(0)
@@ -140,7 +140,7 @@ ENTRY main {
 // CHECK-DAG:    getelementptr inbounds i16, ptr %[[ARG3]]
 // CHECK-DAG:    getelementptr inbounds [8 x [11 x [12 x i16]]], ptr %[[ARG2]]
 
-HloModule MultipleInplaceDus, input_output_alias={ {0}: (0, {}), {1}: (2, {}) }
+HloModule MultipleInplaceDus, is_scheduled=true, input_output_alias={ {0}: (0, {}), {1}: (2, {}) }
 
 fused_computation {
   p0 = bf16[10,11,12] parameter(0)
@@ -190,7 +190,7 @@ ENTRY main {
 // CHECK-DAG:    getelementptr inbounds i16, ptr %[[ARG3]]
 // CHECK-DAG:    getelementptr inbounds [8 x [11 x [12 x i16]]], ptr %[[ARG2]]
 
-HloModule MultipleInplaceDusWithTransposeBitcastToTheRoot, input_output_alias={ {0}: (0, {}), {1}: (2, {}) }
+HloModule MultipleInplaceDusWithTransposeBitcastToTheRoot, is_scheduled=true, input_output_alias={ {0}: (0, {}), {1}: (2, {}) }
 
 fused_computation {
   p0 = bf16[10,11,12] parameter(0)
@@ -233,7 +233,7 @@ ENTRY main {
 // CHECK-DAG:    getelementptr inbounds i16, ptr %[[ARG2]]
 // CHECK-DAG:    getelementptr inbounds [10 x [11 x [12 x i16]]], ptr %[[ARG0]]
 
-HloModule SingleInplaceDusWithTransposeBitcastToTheRoot, input_output_alias={ {}: (0, {}) }
+HloModule SingleInplaceDusWithTransposeBitcastToTheRoot, is_scheduled=true, input_output_alias={ {}: (0, {}) }
 
 single_inplace_dus_with_transpose_bitcast {
   p0 = bf16[10,11,12] parameter(0)
@@ -272,7 +272,7 @@ ENTRY main {
 // CHECK-DAG:    getelementptr inbounds i16, ptr %[[ARG2]]
 // CHECK-DAG:    getelementptr inbounds [10 x [11 x [12 x i16]]], ptr %[[ARG0]]
 
-HloModule SingleInplaceDusWithReshapeBitcastToTheRoot, input_output_alias={ {}: (0, {}) }
+HloModule SingleInplaceDusWithReshapeBitcastToTheRoot, is_scheduled=true, input_output_alias={ {}: (0, {}) }
 
 single_inplace_dus_with_reshape_bitcast {
   p0 = bf16[10,11,12] parameter(0)
@@ -311,7 +311,7 @@ ENTRY main {
 // CHECK-DAG:    getelementptr inbounds i16, ptr %[[ARG2]]
 // CHECK-DAG:    getelementptr inbounds [10 x [6 x [2 x [11 x i16]]]], ptr %[[ARG0]]
 
-HloModule SingleInplaceDusWithBitcastToTheRootAndFromTheParameter, input_output_alias={ {}: (0, {}) }
+HloModule SingleInplaceDusWithBitcastToTheRootAndFromTheParameter, is_scheduled=true, input_output_alias={ {}: (0, {}) }
 
 single_inplace_dus_with_bitcast_to_the_root_and_from_the_parameter {
   p0 = bf16[10,11,12] parameter(0)
diff --git a/third_party/xla/xla/service/gpu/tests/element_wise_row_vectorization.hlo b/third_party/xla/xla/service/gpu/tests/element_wise_row_vectorization.hlo
index 6dbde6c48b44d1..bd6ab76d2e8a2e 100644
--- a/third_party/xla/xla/service/gpu/tests/element_wise_row_vectorization.hlo
+++ b/third_party/xla/xla/service/gpu/tests/element_wise_row_vectorization.hlo
@@ -2,7 +2,7 @@
 // RUN: hlo_to_llvm_ir --xla_disable_all_hlo_passes=true %s | FileCheck --check-prefix=CHECK-LLVM %s
 // We check that the row loads are vectorized.
 
-HloModule SimpleAddRowBroadcasting
+HloModule SimpleAddRowBroadcasting, is_scheduled=true
 
 %fused_computation.0 (param_0: f32[672], param_1: f32[512,14,14,672]) -> f32[512,14,14,672]{
   %param_0 = f32[672]{0} parameter(0)
@@ -25,7 +25,7 @@ ENTRY main {
 
 // -----
 
-HloModule SimpleAddSmallRowBroadcasting
+HloModule SimpleAddSmallRowBroadcasting, is_scheduled=true
 
 %fused_computation.0 (param_0: f32[48], param_1: f32[512,14,14,48]) -> f32[512,14,14,48]{
   %param_0 = f32[48]{0} parameter(0)
@@ -49,7 +49,7 @@ ENTRY main {
 // -----
 
 // This test an BatchNorm fused kernel found in EfficientNet.
-HloModule EfficientNetSwish
+HloModule EfficientNetSwish, is_scheduled=true
 
 %fused_computation.1 (param_0.89: f32[672], param_1: f32[672], param_2: f32[672], param_3: f32[672], param_4: f16[512,14,14,672], param_5: f32[672], param_6: f16[512,14,14,672], param_7: f32[672]) -> f16[512,14,14,672] {
   %param_2 = f32[672]{0} parameter(2)
@@ -134,7 +134,7 @@ ENTRY main {
 
 // -----
 
-HloModule TransposeOutput
+HloModule TransposeOutput, is_scheduled=true
 
 %fused_computation.2 (param_0: f32[672], param_1: f32[512,14,14,672]) -> f32[512,14,14,672] {
   %param_0 = f32[672]{0} parameter(0)
@@ -157,7 +157,7 @@ ENTRY main {
 
 // -----
 
-HloModule TransposeInput
+HloModule TransposeInput, is_scheduled=true
 
 %fused_computation.3 (param_0: f32[672], param_1: f32[512,14,14,672]) -> f32[512,14,14,672] {
   %param_0 = f32[672]{0} parameter(0)
@@ -180,7 +180,7 @@ ENTRY main {
 
 // -----
 
-HloModule MOF
+HloModule MOF, is_scheduled=true
 
 %fused_computation.4 (param_0: f32[672], param_1: f32[512,14,14,672]) -> (f32[512,14,14,672], f32[512,14,14,672]) {
   %param_0 = f32[672]{0} parameter(0)
@@ -205,7 +205,7 @@ ENTRY main {
 
 // -----
 
-HloModule ScalarBroadcasting
+HloModule ScalarBroadcasting, is_scheduled=true
 
 %fused_computation.5 (param_0: f32[], param_1: f32[512,14,14,672]) -> f32[512,14,14,672] {
   %param_0 = f32[] parameter(0)
@@ -227,7 +227,7 @@ ENTRY main {
 
 // -----
 
-HloModule NotSupportedBroadcasting
+HloModule NotSupportedBroadcasting, is_scheduled=true
 
 %fused_computation.6 (param_0: f32[14,672], param_1: f32[512,14,14,672]) -> f32[512,14,14,672] {
   %param_0 = f32[14,672]{1,0} parameter(0)
@@ -249,7 +249,7 @@ ENTRY main {
 // CHECK: ld.global.nc.f
 
 // -----
-HloModule Module
+HloModule Module, is_scheduled=true
 
 %fused_computation.7 {
   %constant_2 = f32[] constant(0)
@@ -277,7 +277,7 @@ ENTRY %computation {
 // CHECK-LLVM-NOT: row_index
 
 // -----
-HloModule RowToLong
+HloModule RowToLong, is_scheduled=true
 
 %fused_computation.1 {
   %p0 = f32[2025]{0} parameter(0)
@@ -295,7 +295,7 @@ ENTRY main {
 
 // -----
 
-HloModule module
+HloModule module, is_scheduled=true
 
 %fused_computation.1 {
   %p0 = f16[5000,64,64,32] parameter(0)
diff --git a/third_party/xla/xla/service/gpu/tests/fused_scatter.hlo b/third_party/xla/xla/service/gpu/tests/fused_scatter.hlo
index edfd3c4ed00be1..dd5fd28c27c874 100644
--- a/third_party/xla/xla/service/gpu/tests/fused_scatter.hlo
+++ b/third_party/xla/xla/service/gpu/tests/fused_scatter.hlo
@@ -102,21 +102,36 @@
 // CHECK:         store atomic i32 %[[VAL_84]], ptr %[[VAL_79]] unordered, align 4
 // CHECK:         br label %[[VAL_70]]
 
-HloModule TensorFlowScatterV1
+HloModule TensorFlowScatterV1, is_scheduled=true
 
 update_s32 (lhs: s32[], rhs: s32[]) -> s32[] {
   lhs = s32[] parameter(0)
   ROOT rhs = s32[] parameter(1)
 }
 
+fused_computation {
+  param_0 = s32[3,3]{1,0} parameter(0)
+  ROOT operand.1 = s32[3,3]{1,0} add(s32[3,3]{1,0} param_0, s32[3,3]{1,0} param_0)
+}
+
+fused_computation.1 {
+  param_0.1 = s32[2]{0} parameter(0)
+  ROOT indices.1 = s32[2]{0} add(s32[2]{0} param_0.1, s32[2]{0} param_0.1)
+}
+
+fused_computation.2 {
+  param_0.2 = s32[2,3]{1,0} parameter(0)
+  ROOT updates.1 = s32[2,3]{1,0} add(s32[2,3]{1,0} param_0.2, s32[2,3]{1,0} param_0.2)
+}
+
 ENTRY main {
-  p0 = s32[3,3] parameter(0)
-  operand = s32[3,3] add(p0, p0)
   p1 = s32[2] parameter(1)
-  indices = s32[2] add(p1, p1)
+  wrapped_indices = s32[2]{0} fusion(s32[2]{0} p1), kind=kLoop, calls=fused_computation.1
   p2 = s32[2,3] parameter(2)
-  updates = s32[2,3] add(p2, p2)
-  ROOT scatter = s32[3,3] scatter(operand, indices, updates),
+  wrapped_updates = s32[2,3]{1,0} fusion(s32[2,3]{1,0} p2), kind=kLoop, calls=fused_computation.2
+  p0 = s32[3,3] parameter(0)
+  wrapped_operand = s32[3,3]{1,0} fusion(s32[3,3]{1,0} p0), kind=kLoop, calls=fused_computation
+  ROOT scatter = s32[3,3] scatter(wrapped_operand, wrapped_indices, wrapped_updates),
       to_apply=update_s32,
       update_window_dims={1},
       inserted_window_dims={0},
diff --git a/third_party/xla/xla/service/gpu/tests/fused_slice.hlo b/third_party/xla/xla/service/gpu/tests/fused_slice.hlo
index 890a437d6c988e..2a1ba79a77ba73 100644
--- a/third_party/xla/xla/service/gpu/tests/fused_slice.hlo
+++ b/third_party/xla/xla/service/gpu/tests/fused_slice.hlo
@@ -76,7 +76,7 @@
 // CHECK:         store half %[[VAL_36]], ptr %[[VAL_58]], align 2
 // CHECK:         br label %[[VAL_9]]
 
-HloModule input_fusion_with_a_tuple_of_slices
+HloModule input_fusion_with_a_tuple_of_slices, is_scheduled=true
 
 fused_computation {
   arg.1 = f16[1024]{0} parameter(0)
diff --git a/third_party/xla/xla/service/gpu/tests/fused_slice_different_operands.hlo b/third_party/xla/xla/service/gpu/tests/fused_slice_different_operands.hlo
index 61057be3c45152..5d1f11d0ba9cb6 100644
--- a/third_party/xla/xla/service/gpu/tests/fused_slice_different_operands.hlo
+++ b/third_party/xla/xla/service/gpu/tests/fused_slice_different_operands.hlo
@@ -67,7 +67,7 @@
 // CHECK:         store half %[[VAL_48]], ptr %[[VAL_49]], align 2
 // CHECK:         br label %[[VAL_8]]
 
-HloModule input_fusion_with_a_tuple_of_slices
+HloModule input_fusion_with_a_tuple_of_slices, is_scheduled=true
 
 fused_computation {
   arg.1 = f16[1024]{0} parameter(0)
diff --git a/third_party/xla/xla/service/gpu/tests/fusion.hlo b/third_party/xla/xla/service/gpu/tests/fusion.hlo
index e8a06f6136c368..84a6953aa89384 100644
--- a/third_party/xla/xla/service/gpu/tests/fusion.hlo
+++ b/third_party/xla/xla/service/gpu/tests/fusion.hlo
@@ -1,6 +1,6 @@
 // RUN: hlo_to_llvm_ir %s | FileCheck --check-prefixes=CHECK,CHECK-%{PTX} %{IR_SUBST} %s
 
-HloModule TestModule
+HloModule TestModule, is_scheduled=true
 
 // NOTE: Assertions have been autogenerated by utils/generate-test-checks.py
 
diff --git a/third_party/xla/xla/service/gpu/tests/hlo_to_llvm_ir.cc b/third_party/xla/xla/service/gpu/tests/hlo_to_llvm_ir.cc
index a7cb3b59f20544..e1624902375d42 100644
--- a/third_party/xla/xla/service/gpu/tests/hlo_to_llvm_ir.cc
+++ b/third_party/xla/xla/service/gpu/tests/hlo_to_llvm_ir.cc
@@ -20,6 +20,7 @@ limitations under the License.
 #include "xla/hlo/ir/hlo_module.h"
 #include "xla/service/gpu/compile_module_to_llvm_ir.h"
 #include "xla/service/gpu/gpu_device_info_for_tests.h"
+#include "xla/service/gpu/gpu_hlo_schedule.h"
 #include "xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.h"
 #include "xla/service/gpu/target_constants.h"
 #include "xla/status.h"
@@ -36,10 +37,10 @@ limitations under the License.
 #include "tsl/util/command_line_flags.h"
 
 const char* const kUsage = R"(
-This tool reads in an HloModule from a file, compiles it using the NVPTX or AMDGPU
-compiler and prints out the LLVM IR generated by the IR emitter.  The LLVM IR is
-not optimized by the LLVM pass pipeline, so this tool can be used to unit test
-the XLA GPU IR emitters.
+This tool reads in a scheduled HloModule from a file, compiles it using the
+NVPTX or AMDGPU compiler and prints out the LLVM IR generated by the IR emitter.
+The LLVM IR is not optimized by the LLVM pass pipeline, so this tool can be used
+to unit test the XLA GPU IR emitters.
 
 Note that the LLVM IR does not contain the *full* module, but only parts that
 will be code generated into PTX/Hsaco. The NVPTX/Hsaco compiler also generates a
@@ -56,6 +57,7 @@ xla::Status CompileAndPrintLlvmIr(const std::string& hlo_text,
       std::unique_ptr<xla::HloModule> hlo_module,
       xla::LoadModuleFromData(/*data=*/hlo_text, /*format=*/"hlo"));
 
+  CHECK(hlo_module->has_schedule());
   TF_RETURN_IF_ERROR(VerifyHloModule(hlo_module.get(),
                                      /*layout_sensitive=*/false,
                                      /*allow_mixed_precision=*/true));
@@ -85,12 +87,15 @@ xla::Status CompileAndPrintLlvmIr(const std::string& hlo_text,
       stream_executor::rocm::kROCmPlatformId;
 #endif
 
-  TF_ASSIGN_OR_RETURN(
-      std::unique_ptr<llvm::Module> llvm_module,
-      xla::gpu::CompileModuleToLlvmIr(hlo_module.get(), &llvm_context,
-                                      target_triple, data_layout, platform_name,
-                                      platform_id, gpu_device_info,
-                                      /*pointer_size=*/8));
+  auto buffer_size_bytes_function = [](const xla::BufferValue& buffer) {
+    return xla::gpu::GetSizeOfShape(buffer.shape(), /*pointer_size=*/8);
+  };
+
+  TF_ASSIGN_OR_RETURN(std::unique_ptr<llvm::Module> llvm_module,
+                      xla::gpu::CompileModuleToLlvmIr(
+                          hlo_module.get(), &llvm_context, target_triple,
+                          data_layout, platform_name, platform_id,
+                          gpu_device_info, buffer_size_bytes_function));
 
   if (!generate_ptx) {
     llvm_module->print(llvm::outs(), nullptr);
diff --git a/third_party/xla/xla/service/gpu/tests/kernel_reuse.hlo b/third_party/xla/xla/service/gpu/tests/kernel_reuse.hlo
index 97ad2bb07d3f43..c497a09d92c92e 100644
--- a/third_party/xla/xla/service/gpu/tests/kernel_reuse.hlo
+++ b/third_party/xla/xla/service/gpu/tests/kernel_reuse.hlo
@@ -5,7 +5,7 @@
 // CHECK: define [[KERNEL_ANNOTATION]]void
 // CHECK-NOT: define [[KERNEL_ANNOTATION]]void
 
-HloModule KernelReuse
+HloModule KernelReuse, is_scheduled=true
 
 fused_computation {
   param_0.2 = f32[5,5]{1,0} parameter(0)
@@ -48,7 +48,7 @@ ENTRY main {
 // CHECK: define [[KERNEL_ANNOTATION]]void @triton_gemm_dot1(
 // CHECK-NOT: define [[KERNEL_ANNOTATION]]void
 
-HloModule t
+HloModule t, is_scheduled=true
 
 triton_gemm_dot0 {
   parameter_1 = f16[15,19]{1,0} parameter(1)
@@ -69,8 +69,8 @@ ENTRY e {
   p2 = f16[15,19]{1,0} parameter(2)
   p1 = s8[19,17]{1,0} parameter(1)
   p0 = f16[15,19]{1,0} parameter(0)
-  triton_gemm_dot0 = f16[15,17]{1,0} fusion(p1, p0), kind=kCustom, calls=triton_gemm_dot0, backend_config="{kind: \"__triton_gemm\", triton_gemm_config: {\"block_m\":\"64\",\"block_n\":\"32\",\"block_k\":\"64\",\"split_k\":\"1\",\"num_stages\":\"4\",\"num_warps\":\"4\"}}"
   triton_gemm_dot1 = f16[15,17]{1,0} fusion(p3, p2), kind=kCustom, calls=triton_gemm_dot1, backend_config="{kind: \"__triton_gemm\", triton_gemm_config: {\"block_m\":\"64\",\"block_n\":\"32\",\"block_k\":\"64\",\"split_k\":\"1\",\"num_stages\":\"4\",\"num_warps\":\"4\"}}"
+  triton_gemm_dot0 = f16[15,17]{1,0} fusion(p1, p0), kind=kCustom, calls=triton_gemm_dot0, backend_config="{kind: \"__triton_gemm\", triton_gemm_config: {\"block_m\":\"64\",\"block_n\":\"32\",\"block_k\":\"64\",\"split_k\":\"1\",\"num_stages\":\"4\",\"num_warps\":\"4\"}}"
   ROOT tuple = (f16[15,17]{1,0}, f16[15,17]{1,0}) tuple(triton_gemm_dot0, triton_gemm_dot1)
 }
 
@@ -84,7 +84,7 @@ ENTRY e {
 // CHECK: define [[KERNEL_ANNOTATION]]void @fusion_1(ptr noalias align 128 dereferenceable(100) %arg0, ptr noalias align 128 dereferenceable(100) %arg1)
 // CHECK-NOT: define [[KERNEL_ANNOTATION]]void
 
-HloModule KernelReuse
+HloModule KernelReuse, is_scheduled=true
 
 fused_computation {
   param_0.2 = f32[5,5]{1,0} parameter(0)
@@ -128,7 +128,7 @@ ENTRY main {
 // CHECK: define [[KERNEL_ANNOTATION]]void @fusion_1(ptr noalias align 128 dereferenceable(100) %arg0, ptr noalias align 128 dereferenceable(100) %arg1, ptr noalias align 128 dereferenceable(100) %arg2)
 // CHECK-NOT: define void
 
-HloModule KernelReuse
+HloModule KernelReuse, is_scheduled=true
 
 fused_computation {
   a = f32[5,5]{1,0} parameter(0)
@@ -175,7 +175,7 @@ ENTRY main {
 // CHECK: !invariant.load
 // CHECK-NOT: define [[KERNEL_ANNOTATION]]void
 
-HloModule KernelReuse
+HloModule KernelReuse, is_scheduled=true
 
 fused_computation {
   a = f32[5,5]{1,0} parameter(0)
@@ -197,9 +197,9 @@ ENTRY main {
   custom-call = f32[5,5]{1,0} custom-call(a, a), custom_call_target="__cublas$gemm", backend_config="{\"alpha_real\":1,\"beta\":0,\"dot_dimension_numbers\":{\"lhs_contracting_dimensions\":[\"1\"],\"rhs_contracting_dimensions\":[\"0\"],\"lhs_batch_dimensions\":[],\"rhs_batch_dimensions\":[]},\"alpha_imag\":0,\"precision_config\":{\"operand_precision\":[\"DEFAULT\",\"DEFAULT\"]},\"epilogue\":\"DEFAULT\"}"
   fusion.2 = f32[5,5]{1,0} fusion(custom-call), kind=kLoop, calls=fused_computation.2
   custom-call.1 = f32[5,5]{1,0} custom-call(fusion.2, fusion.2), custom_call_target="__cublas$gemm", backend_config="{\"alpha_real\":1,\"beta\":0,\"dot_dimension_numbers\":{\"lhs_contracting_dimensions\":[\"1\"],\"rhs_contracting_dimensions\":[\"0\"],\"lhs_batch_dimensions\":[],\"rhs_batch_dimensions\":[]},\"alpha_imag\":0,\"precision_config\":{\"operand_precision\":[\"DEFAULT\",\"DEFAULT\"]},\"epilogue\":\"DEFAULT\"}"
+  fusion = f32[5,5]{1,0} fusion(custom-call.1), kind=kLoop, calls=fused_computation
   fusion.1 = f32[5,5]{1,0} fusion(custom-call.1), kind=kLoop, calls=fused_computation.1
   custom-call.2 = f32[5,5]{1,0} custom-call(fusion.1, fusion.1), custom_call_target="__cublas$gemm", backend_config="{\"alpha_real\":1,\"beta\":0,\"dot_dimension_numbers\":{\"lhs_contracting_dimensions\":[\"1\"],\"rhs_contracting_dimensions\":[\"0\"],\"lhs_batch_dimensions\":[],\"rhs_batch_dimensions\":[]},\"alpha_imag\":0,\"precision_config\":{\"operand_precision\":[\"DEFAULT\",\"DEFAULT\"]},\"epilogue\":\"DEFAULT\"}"
-  fusion = f32[5,5]{1,0} fusion(custom-call.1), kind=kLoop, calls=fused_computation
   custom-call.3 = f32[5,5]{1,0} custom-call(fusion, fusion), custom_call_target="__cublas$gemm", backend_config="{\"alpha_real\":1,\"beta\":0,\"dot_dimension_numbers\":{\"lhs_contracting_dimensions\":[\"1\"],\"rhs_contracting_dimensions\":[\"0\"],\"lhs_batch_dimensions\":[],\"rhs_batch_dimensions\":[]},\"alpha_imag\":0,\"precision_config\":{\"operand_precision\":[\"DEFAULT\",\"DEFAULT\"]},\"epilogue\":\"DEFAULT\"}"
   // We don't output custom-call, so fusion.2 can change its input.
   ROOT tuple = (f32[5,5]{1,0}, f32[5,5]{1,0}, f32[5,5]{1,0}) tuple(custom-call.1, custom-call.2, custom-call.3)
diff --git a/third_party/xla/xla/service/gpu/tests/launch_dimensions.hlo b/third_party/xla/xla/service/gpu/tests/launch_dimensions.hlo
index c2d36d0ad94042..8e3077a3620f97 100644
--- a/third_party/xla/xla/service/gpu/tests/launch_dimensions.hlo
+++ b/third_party/xla/xla/service/gpu/tests/launch_dimensions.hlo
@@ -9,11 +9,16 @@
 // CHECK:   ![[tid_range]] = !{i32 0, i32 1024}
 
 
-HloModule Test
+HloModule Test, is_scheduled=true
+
+fused_computation {
+  param_0 = f32[100,20]{1,0} parameter(0)
+  ROOT b.1 = f32[100,20]{1,0} round-nearest-even(f32[100,20]{1,0} param_0)
+}
 
 ENTRY main {
   a = f32[100, 20]{1,0} parameter(0)
-  ROOT b = f32[100, 20]{1,0} round-nearest-even(a)
+  ROOT wrapped_b = f32[100,20]{1,0} fusion(f32[100,20]{1,0} a), kind=kLoop, calls=fused_computation
 }
 
 // -----
@@ -27,11 +32,16 @@ ENTRY main {
 // CHECK-GCN:   ![[ctaid_range]] = !{i32 0, i32 1664}
 // CHECK:   ![[tid_range]] = !{i32 0, i32 128}
 
-HloModule Test
+HloModule Test, is_scheduled=true
+
+fused_computation {
+  param_0 = f32[10000,10000]{1,0} parameter(0)
+  ROOT b.1 = f32[10000,10000]{1,0} round-nearest-even(f32[10000,10000]{1,0} param_0)
+}
 
 ENTRY main {
   a = f32[10000, 10000]{1,0} parameter(0)
-  ROOT b = f32[10000, 10000]{1,0} round-nearest-even(a)
+  ROOT wrapped_b = f32[10000,10000]{1,0} fusion(f32[10000,10000]{1,0} a), kind=kLoop, calls=fused_computation
 }
 
 // -----
@@ -49,7 +59,7 @@ ENTRY main {
 // CHECK-GCN:   ![[ctaid_range]] = !{i32 0, i32 1664}
 // CHECK-GCN:   ![[tid_range]] = !{i32 0, i32 128}
 
-HloModule ScalarBroadcast
+HloModule ScalarBroadcast, is_scheduled=true
 
 %fused_computation.3 (param_0: f32[], param_1: f32[10000, 10000]) -> f32[10000, 10000] {
   %param_0 = f32[] parameter(0)
@@ -80,7 +90,7 @@ ENTRY main {
 // CHECK-GCN:   ![[ctaid_range]] = !{i32 0, i32 1664}
 // CHECK-GCN:   ![[tid_range]] = !{i32 0, i32 128}
 
-HloModule SimpleFusion
+HloModule SimpleFusion, is_scheduled=true
 
 %fused_computation (param_0: f32[], param_1: f32[10000, 10000]) -> f32[10000, 10000] {
   %param_0 = f32[10000,10000] parameter(0)
@@ -109,7 +119,7 @@ ENTRY main {
 // CHECK-GCN:   ![[ctaid_range]] = !{i32 0, i32 1664}
 // CHECK-GCN:   ![[tid_range]] = !{i32 0, i32 128}
 
-HloModule LargeConstant
+HloModule LargeConstant, is_scheduled=true
 
 %fused_computation (param_0: f32[], param_1: f32[10000, 10000]) -> f32[10000, 10000] {
   %param_0 = f32[10000,10000] parameter(0)
@@ -137,7 +147,7 @@ ENTRY main {
 // CHECK-GCN:   ![[ctaid_range]] = !{i32 0, i32 97657}
 // CHECK-GCN:   ![[tid_range]] = !{i32 0, i32 256}
 
-HloModule NonElementwise
+HloModule NonElementwise, is_scheduled=true
 
 %fused_computation (param_0: f32[], param_1: f32[10000, 10000]) -> f32[10000, 10000] {
   %param_0 = f32[10000,10000] parameter(0)
@@ -171,7 +181,7 @@ ENTRY main {
 // CHECK-GCN:   ![[ctaid_range]] = !{i32 0, i32 3907}
 // CHECK-GCN:   ![[tid_range]] = !{i32 0, i32 256}
 
-HloModule NoFewWaves
+HloModule NoFewWaves, is_scheduled=true
 
 %fused_computation (param_0: f32[], param_1: f32[2000, 2000]) -> f32[2000, 2000] {
   %param_0 = f32[2000] parameter(0)
@@ -215,7 +225,7 @@ ENTRY main {
 // CHECK-GCN:   ![[ctaid_range]] = !{i32 0, i32 2000}
 // CHECK-GCN:   ![[tid_range]] = !{i32 0, i32 500}
 
-HloModule RowVectorizable
+HloModule RowVectorizable, is_scheduled=true
 
 %fused_computation (param_0: f32[], param_1: f32[2000, 2000]) -> f32[2000, 2000] {
   %param_0 = f32[2000] parameter(0)
@@ -256,7 +266,7 @@ ENTRY main {
 // CHECK-GCN:   ![[ctaid_range]] = !{i32 0, i32 1664}
 // CHECK-GCN:   ![[tid_range]] = !{i32 0, i32 128}
 
-HloModule ScalarBroadcastFourInputs
+HloModule ScalarBroadcastFourInputs, is_scheduled=true
 
 %fused_computation (param_0: f32[], param_1: f32[2000, 2000]) -> f32[2000, 2000] {
   %param_0 = f32[] parameter(0)
@@ -293,7 +303,7 @@ ENTRY main {
 // CHECK-GCN:   ![[ctaid_range]] = !{i32 0, i32 1664}
 // CHECK:   ![[tid_range]] = !{i32 0, i32 128}
 
-HloModule Test
+HloModule Test, is_scheduled=true
 
 %fused_computation (param_0: f16[6,512,4096]) -> f16[6,512,4096] {
   %param_0 = f16[6,512,4096]{2,1,0} parameter(0)
diff --git a/third_party/xla/xla/service/gpu/tests/pad_to_static.hlo b/third_party/xla/xla/service/gpu/tests/pad_to_static.hlo
index 48095007b5ac7f..77273eb9302046 100644
--- a/third_party/xla/xla/service/gpu/tests/pad_to_static.hlo
+++ b/third_party/xla/xla/service/gpu/tests/pad_to_static.hlo
@@ -65,7 +65,7 @@
 // CHECK:         br label %[[VAL_31]]
 
 
-HloModule PadToStatic
+HloModule PadToStatic, is_scheduled=true
 
 ENTRY main {
   %param = s32[2,<=2,2] parameter(0)
diff --git a/third_party/xla/xla/service/gpu/tests/reduce_unnested.hlo b/third_party/xla/xla/service/gpu/tests/reduce_unnested.hlo
index 0d04f86da9cee4..9f70f2749e576d 100644
--- a/third_party/xla/xla/service/gpu/tests/reduce_unnested.hlo
+++ b/third_party/xla/xla/service/gpu/tests/reduce_unnested.hlo
@@ -3,7 +3,7 @@
 // CHECK: target datalayout
 // CHECK:         call float @llvm.nvvm.shfl.sync.down.f32
 
-HloModule Test
+HloModule Test, is_scheduled=true
 
 Add {
   scalar_lhs.0 = f32[] parameter(0)
@@ -15,11 +15,18 @@ Add {
   ROOT t = (f32[], f32[]) tuple(add.0, add.1)
 }
 
+fused_computation {
+  param_0 = f32[5,200,300]{2,1,0} parameter(0)
+  param_1 = f32[5,200,300]{2,1,0} parameter(1)
+  param_2 = f32[] parameter(2)
+  ROOT d.1 = (f32[200]{0}, f32[200]{0}) reduce(f32[5,200,300]{2,1,0} param_0, f32[5,200,300]{2,1,0} %param_1, f32[] param_2, f32[] param_2), dimensions={0,2}, to_apply=Add
+}
+
 ENTRY main {
   a = f32[5, 200, 300]{2,1,0} parameter(0)
   b = f32[5, 200, 300]{2,1,0} parameter(1)
   c = f32[] constant(0)
-  ROOT d = (f32[200]{0}, f32[200]{0}) reduce(a, b, c, c), dimensions={0,2}, to_apply=Add
+  ROOT wrapped_d = (f32[200]{0}, f32[200]{0}) fusion(f32[5,200,300]{2,1,0} a, f32[5,200,300]{2,1,0} b, f32[] c), kind=kInput, calls=fused_computation
 }
 
 // -----
@@ -27,7 +34,7 @@ ENTRY main {
 // CHECK: target datalayout
 // CHECK:         @llvm.nvvm.shfl.sync.down
 
-HloModule LargeReduction
+HloModule LargeReduction, is_scheduled=true
 
 Sum {
     x.1 = c128[] parameter(0)
@@ -35,10 +42,16 @@ Sum {
     ROOT add.1 = c128[] add(x.1, y.1)
 }
 
+fused_computation {
+  param_0 = c128[10000]{0} parameter(0)
+  param_1 = c128[] parameter(1)
+  ROOT out1.1 = c128[] reduce(c128[10000]{0} param_0, c128[] param_1), dimensions={0}, to_apply=Sum
+}
+
 ENTRY reduce.1 {
     parameter = c128[10000] parameter(0)
     init_value = c128[] constant((0, 0))
-    ROOT out1 = c128[] reduce(parameter, init_value), dimensions={0}, to_apply=Sum
+    ROOT wrapped_out1 = c128[] fusion(c128[10000]{0} parameter, c128[] init_value), kind=kInput, calls=fused_computation
 }
 
 // -----
@@ -48,7 +61,7 @@ ENTRY reduce.1 {
 // CHECK: target datalayout
 // CHECK: cmpxchg
 
-HloModule MinReduce
+HloModule MinReduce, is_scheduled=true
 
 Min {
   x.1 = f32[] parameter(0)
@@ -56,10 +69,16 @@ Min {
   ROOT min.1 = f32[] minimum(x.1, y.1)
 }
 
+fused_computation {
+  param_0 = f32[300000]{0} parameter(0)
+  param_1 = f32[] parameter(1)
+  ROOT reduce.1 = f32[] reduce(f32[300000]{0} param_0, f32[] param_1), dimensions={0}, to_apply=Min
+}
+
 ENTRY reduce.1 {
   parameter = f32[300000] parameter(0)
   init_value = f32[] constant(0)
-  ROOT reduce = reduce(parameter, init_value), dimensions={0}, to_apply=Min
+  ROOT wrapped_reduce = f32[] fusion(f32[300000]{0} parameter, f32[] init_value), kind=kInput, calls=fused_computation
 }
 
 // -----
@@ -68,7 +87,7 @@ ENTRY reduce.1 {
 // CHECK-COUNT-12: call void @Sum
 // CHECK-NOT: call void @Sum
 
-HloModule Abc1
+HloModule Abc1, is_scheduled=true
 
 Sum {
     x.1 = f32[] parameter(0)
@@ -76,10 +95,16 @@ Sum {
     ROOT add.1 = f32[] add(x.1, y.1)
 }
 
+fused_computation {
+  param_0 = f32[64,1048576]{1,0} parameter(0)
+  param_1 = f32[] parameter(1)
+  ROOT vectorized_reduce.1 = f32[1048576]{0} reduce(f32[64,1048576]{1,0} param_0, f32[] param_1), dimensions={0}, to_apply=Sum
+}
+
 ENTRY reduce.1 {
     parameter = f32[64,1048576] parameter(0)
     init_value = f32[] constant(0)
-    ROOT vectorized_reduce = f32[1048576] reduce(parameter, init_value), dimensions={0}, to_apply=Sum
+    ROOT wrapped_vectorized_reduce = f32[1048576]{0} fusion(f32[64,1048576]{1,0} parameter, f32[] init_value), kind=kInput, calls=fused_computation
 }
 
 // -----
@@ -88,7 +113,7 @@ ENTRY reduce.1 {
 // CHECK-COUNT-6: call void @Sum
 // CHECK-NOT: call void @Sum
 
-HloModule ColumnReductionNotVectorizedTooManyLargeParams
+HloModule ColumnReductionNotVectorizedTooManyLargeParams, is_scheduled=true
 
 Sum {
     x.1 = f32[] parameter(0)
@@ -130,7 +155,7 @@ ENTRY reduce.1 {
 // CHECK-COUNT-12: call void @Sum
 // CHECK-NOT: call void @Sum
 
-HloModule ColumnReductionVectorizedSomeLargeParams
+HloModule ColumnReductionVectorizedSomeLargeParams, is_scheduled=true
 
 Sum {
     x.1 = f32[] parameter(0)
@@ -167,7 +192,7 @@ ENTRY reduce.1 {
 // CHECK-COUNT-12: call void @Sum
 // CHECK-NOT: call void @Sum
 
-HloModule ColumnReductionVectorizedNonElementwise
+HloModule ColumnReductionVectorizedNonElementwise, is_scheduled=true
 
 Sum {
     x.1 = f32[] parameter(0)
@@ -201,7 +226,7 @@ ENTRY reduce.1 {
 // CHECK-COUNT-6: call void @Sum
 // CHECK-NOT: call void @Sum
 
-HloModule ColumnReductionNotVectorizedNonElementwise
+HloModule ColumnReductionNotVectorizedNonElementwise, is_scheduled=true
 
 Sum {
     x.1 = f32[] parameter(0)
@@ -241,7 +266,7 @@ ENTRY reduce.1 {
 // CHECK-COUNT-2: {{^x_in_tile-true}}
 // CHECK-NOT: {{^x_in_tile-true}}
 
-HloModule RowReductionVectorized
+HloModule RowReductionVectorized, is_scheduled=true
 
 Sum {
     x.1 = f32[] parameter(0)
@@ -268,7 +293,7 @@ ENTRY reduce.1 {
 // CHECK-COUNT-1: {{^x_in_tile-true}}
 // CHECK-NOT: {{^x_in_tile-true}}
 
-HloModule RowReductionNotVectorized
+HloModule RowReductionNotVectorized, is_scheduled=true
 
 Sum {
     x.1 = f32[] parameter(0)
@@ -295,7 +320,7 @@ ENTRY reduce.1 {
 // CHECK-COUNT-1: {{^x_in_tile-true}}
 // CHECK-NOT: {{^x_in_tile-true}}
 
-HloModule RowReductionNotVectorized
+HloModule RowReductionNotVectorized, is_scheduled=true
 
 Sum {
     x.1 = f32[] parameter(0)
@@ -323,7 +348,7 @@ ENTRY reduce.1 {
 // CHECK-COUNT-1: {{^x_in_tile-true}}
 // CHECK-NOT: {{^x_in_tile-true}}
 
-HloModule RowReductionNotVectorized
+HloModule RowReductionNotVectorized, is_scheduled=true
 
 Sum {
     x.1 = f32[] parameter(0)
@@ -354,7 +379,7 @@ ENTRY reduce.1 {
 // CHECK-COUNT-2: {{^x_in_tile-true}}
 // CHECK-NOT: {{^x_in_tile-true}}
 
-HloModule RowReductionVectorized
+HloModule RowReductionVectorized, is_scheduled=true
 
 Sum {
     x.1 = f32[] parameter(0)
@@ -387,7 +412,7 @@ ENTRY reduce.1 {
 // - has a small "smallest input size" (1 for pred)
 // - exceeds the shmem budget because `num_partial_results` is 8
 
-HloModule m
+HloModule m, is_scheduled=true
 
 add {
  a = f64[] parameter(0)
diff --git a/third_party/xla/xla/service/gpu/tests/reduction_vectorization_sm_all.hlo b/third_party/xla/xla/service/gpu/tests/reduction_vectorization_sm_all.hlo
index a1c5a315b15de6..fb3509310802ae 100644
--- a/third_party/xla/xla/service/gpu/tests/reduction_vectorization_sm_all.hlo
+++ b/third_party/xla/xla/service/gpu/tests/reduction_vectorization_sm_all.hlo
@@ -10,7 +10,7 @@
 // CHECK-NOT: ld.global.nc.u64
 // CHECK-NOT: ld.global.u64
 
-HloModule ReduceOddRowSize
+HloModule ReduceOddRowSize, is_scheduled=true
 
 %max_ {
   %x = f32[] parameter(0)
@@ -18,10 +18,16 @@ HloModule ReduceOddRowSize
   ROOT %maximum.7 = f32[] maximum(%x, %y)
 }
 
+%fused_computation {
+  %param_0.1 = f32[5,4071]{1,0} parameter(0)
+  %param_1 = f32[] parameter(1)
+  ROOT %reduce.odd_row.1 = f32[5]{0} reduce(f32[5,4071]{1,0} %param_0.1, f32[] %param_1), dimensions={1}, to_apply=%max_
+}
+
 ENTRY %main {
   %param_0 = f32[5,4071] parameter(0)
   %constant.3 = f32[] constant(0)
-  ROOT %reduce.odd_row = f32[5] reduce(f32[5,4071] %param_0, f32[] %constant.3), dimensions={1}, to_apply=%max_
+  ROOT %wrapped_reduce.odd_row = f32[5]{0} fusion(f32[5,4071]{1,0} %param_0, f32[] %constant.3), kind=kInput, calls=%fused_computation
 }
 
 // -----
@@ -29,7 +35,7 @@ ENTRY %main {
 // CHECK-SM86-LABEL: .entry wrapped_reduce_small_row
 // CHECK-SM86: .reqntid 96, 1, 1
 
-HloModule ReduceSmallRow
+HloModule ReduceSmallRow, is_scheduled=true
 
 addition {
   x = f32[] parameter(0)
@@ -37,10 +43,16 @@ addition {
   ROOT out = add(%x, %y)
 }
 
+%fused_computation {
+  %param_0 = f32[700000,32]{1,0} parameter(0)
+  %param_1 = f32[] parameter(1)
+  ROOT %reduce_small_row.1 = f32[700000]{0} reduce(f32[700000,32]{1,0} %param_0, f32[] %param_1), dimensions={1}, to_apply=%addition
+}
+
 ENTRY main {
   p = f32[700000,32] parameter(0)
   zero = f32[] constant(0)
-  ROOT reduce_small_row = f32[700000] reduce(p, zero), dimensions={1}, to_apply=%addition
+  ROOT %wrapped_reduce_small_row = f32[700000]{0} fusion(f32[700000,32]{1,0} %p, f32[] %zero), kind=kInput, calls=%fused_computation
 }
 
 // -----
@@ -48,7 +60,7 @@ ENTRY main {
 // CHECK-LABEL: .entry wrapped_reduce_sine
 // CHECK-COUNT-7: ld.global.nc.v2.f32
 
-HloModule DisableSin
+HloModule DisableSin, is_scheduled=true
 
 %add_float {
   %x = f32[] parameter(0)
@@ -56,11 +68,22 @@ HloModule DisableSin
   ROOT %add.17 = f32[] add(f32[] %x, f32[] %y)
 }
 
+%fused_computation {
+  %param_0 = f32[5,3584]{1,0} parameter(0)
+  ROOT %sine.1 = f32[5,3584]{1,0} sine(f32[5,3584]{1,0} %param_0)
+}
+
+%fused_computation.1 {
+  %param_0.1 = f32[5,3584]{1,0} parameter(0)
+  %param_1 = f32[] parameter(1)
+  ROOT %reduce.sine.1 = f32[5]{0} reduce(f32[5,3584]{1,0} %param_0.1, f32[] %param_1), dimensions={1}, to_apply=%add_float
+}
+
 ENTRY %main {
   %arg0.1 = f32[5,3584] parameter(0)
-  %sine = f32[5,3584] sine(f32[5,3584] %arg0.1)
+  %wrapped_sine = f32[5,3584]{1,0} fusion(f32[5,3584]{1,0} %arg0.1), kind=kLoop, calls=%fused_computation
   %constant.0 = f32[] constant(0)
-  ROOT %reduce.sine = f32[5] reduce(f32[5,3584] %sine, f32[] %constant.0), dimensions={1}, to_apply=%add_float
+  ROOT %wrapped_reduce.sine = f32[5]{0} fusion(f32[5,3584]{1,0} %wrapped_sine, f32[] %constant.0), kind=kInput, calls=%fused_computation.1
 }
 
 // -----
@@ -79,7 +102,7 @@ ENTRY %main {
 // CHECK-SM70-LABEL: .entry wrapped_reduce_exp
 // CHECK-SM70-COUNT-8: ld.global.nc.v2.f32
 
-HloModule Exp
+HloModule Exp, is_scheduled=true
 
 %add_float {
   %x = f32[] parameter(0)
@@ -87,16 +110,27 @@ HloModule Exp
   ROOT %add.17 = f32[] add(f32[] %x, f32[] %y)
 }
 
+%fused_computation {
+  %param_0 = f32[5,3584]{1,0} parameter(0)
+  ROOT %exp.1 = f32[5,3584]{1,0} exponential(f32[5,3584]{1,0} %param_0)
+}
+
+%fused_computation.1 {
+  %param_0.1 = f32[5,3584]{1,0} parameter(0)
+  %param_1 = f32[] parameter(1)
+  ROOT %reduce.exp.1 = f32[5]{0} reduce(f32[5,3584]{1,0} %param_0.1, f32[] %param_1), dimensions={1}, to_apply=%add_float
+}
+
 ENTRY %main {
   %arg0.1 = f32[5,3584] parameter(0)
-  %exp = f32[5,3584] exponential(f32[5,3584] %arg0.1)
+  %wrapped_exp = f32[5,3584]{1,0} fusion(f32[5,3584]{1,0} %arg0.1), kind=kLoop, calls=%fused_computation
   %constant.0 = f32[] constant(0)
-  ROOT %reduce.exp = f32[5] reduce(f32[5,3584] %exp, f32[] %constant.0), dimensions={1}, to_apply=%add_float
+  ROOT %wrapped_reduce.exp = f32[5]{0} fusion(f32[5,3584]{1,0} %wrapped_exp, f32[] %constant.0), kind=kInput, calls=%fused_computation.1
 }
 
 // -----
 
-HloModule ReduceTileFit
+HloModule ReduceTileFit, is_scheduled=true
 
 // CHECK-SM50-LABEL: .entry wrapped_reduce_tile_fit
 // CHECK-SM50-NOT: ld.global.nc.v2.f32
@@ -114,15 +148,21 @@ HloModule ReduceTileFit
   ROOT %maximum.7 = f32[] maximum(f32[] %x, f32[] %y)
 }
 
+%fused_computation {
+  %param_0.1 = f32[5,3584]{1,0} parameter(0)
+  %param_1 = f32[] parameter(1)
+  ROOT %reduce.tile_fit.1 = f32[5]{0} reduce(f32[5,3584]{1,0} %param_0.1, f32[] %param_1), dimensions={1}, to_apply=%max_
+}
+
 ENTRY %main {
   %param_0 = f32[5,3584] parameter(0)
   %constant.3 = f32[] constant(0)
-  ROOT %reduce.tile_fit = f32[5] reduce(f32[5,3584] %param_0, f32[] %constant.3), dimensions={1}, to_apply=%max_
+  ROOT %wrapped_reduce.tile_fit = f32[5]{0} fusion(f32[5,3584]{1,0} %param_0, f32[] %constant.3), kind=kInput, calls=%fused_computation
 }
 
 // -----
 
-HloModule ReducePower2
+HloModule ReducePower2, is_scheduled=true
 
 // CHECK-SM50-LABEL: .entry wrapped_reduce_pow_2
 // CHECK-SM50-NOT: ld.global.nc.v2.f32
@@ -140,15 +180,21 @@ HloModule ReducePower2
   ROOT %maximum.7 = f32[] maximum(f32[] %x, f32[] %y)
 }
 
+%fused_computation {
+  %param_0.1 = f32[5,4096]{1,0} parameter(0)
+  %param_1 = f32[] parameter(1)
+  ROOT %reduce.pow_2.1 = f32[5]{0} reduce(f32[5,4096]{1,0} %param_0.1, f32[] %param_1), dimensions={1}, to_apply=%max_
+}
+
 ENTRY %main {
   %param_0 = f32[5,4096] parameter(0)
   %constant.3 = f32[] constant(0)
-  ROOT %reduce.pow_2 = f32[5] reduce(f32[5,4096] %param_0, f32[] %constant.3), dimensions={1}, to_apply=%max_
+  ROOT %wrapped_reduce.pow_2 = f32[5]{0} fusion(f32[5,4096]{1,0} %param_0, f32[] %constant.3), kind=kInput, calls=%fused_computation
 }
 
 // -----
 
-HloModule ReduceEvenColumns
+HloModule ReduceEvenColumns, is_scheduled=true
 
 // CHECK-SM60-LABEL: .entry wrapped_reduce_even_col
 // CHECK-SM60-NOT: ld.global.nc.f32
@@ -164,8 +210,14 @@ HloModule ReduceEvenColumns
   ROOT %maximum.7 = f32[] maximum(f32[] %x, f32[] %y)
 }
 
+%fused_computation {
+  %param_0.1 = f32[5,4070]{1,0} parameter(0)
+  %param_1 = f32[] parameter(1)
+  ROOT %reduce.even_col.1 = f32[5]{0} reduce(f32[5,4070]{1,0} %param_0.1, f32[] %param_1), dimensions={1}, to_apply=%max_
+}
+
 ENTRY %main {
   %param_0 = f32[5,4070] parameter(0)
   %constant.3 = f32[] constant(0)
-  ROOT %reduce.even_col = f32[5] reduce(f32[5,4070] %param_0, f32[] %constant.3), dimensions={1}, to_apply=%max_
+  ROOT %wrapped_reduce.even_col = f32[5]{0} fusion(f32[5,4070]{1,0} %param_0, f32[] %constant.3), kind=kInput, calls=%fused_computation
 }
diff --git a/third_party/xla/xla/service/gpu/tests/rng_get_and_update_state.hlo b/third_party/xla/xla/service/gpu/tests/rng_get_and_update_state.hlo
index eb7f65c0f4bd08..297d60f372050e 100644
--- a/third_party/xla/xla/service/gpu/tests/rng_get_and_update_state.hlo
+++ b/third_party/xla/xla/service/gpu/tests/rng_get_and_update_state.hlo
@@ -1,6 +1,6 @@
 // RUN: hlo_to_llvm_ir %s | FileCheck %s
 
-HloModule TestModule
+HloModule TestModule, is_scheduled=true
 
 // CHECK-LABEL: entry:
 // CHECK:         %[[VAL_0:.*]] = load i128, ptr addrspace(1) @rng_state, align {{.*}}
diff --git a/third_party/xla/xla/service/gpu/tests/scatter.hlo b/third_party/xla/xla/service/gpu/tests/scatter.hlo
index 86d42a7df68ee7..0aa6c3ed535770 100644
--- a/third_party/xla/xla/service/gpu/tests/scatter.hlo
+++ b/third_party/xla/xla/service/gpu/tests/scatter.hlo
@@ -151,7 +151,7 @@
 // CHECK:         store atomic i32 %[[VAL_111]], ptr %[[VAL_107]] unordered, align 4
 // CHECK:         br label %[[VAL_99]]
 
-HloModule TensorFlowScatterV1
+HloModule TensorFlowScatterV1, is_scheduled=true
 
 update_s32 (lhs: s32[], rhs: s32[]) -> s32[] {
   lhs = s32[] parameter(0)
@@ -174,7 +174,7 @@ ENTRY main {
 // -----
 
 
-HloModule ScatterIntoScalar
+HloModule ScatterIntoScalar, is_scheduled=true
 
 update_s32 {
   lhs = s32[] parameter(0)
@@ -197,7 +197,7 @@ ENTRY main {
 // -----
 
 
-HloModule TensorFlowScatter_Mul
+HloModule TensorFlowScatter_Mul, is_scheduled=true
 
 mul_s32 (lhs: s32[], rhs: s32[]) -> s32[] {
   lhs = s32[] parameter(0)
@@ -220,7 +220,7 @@ ENTRY main {
 // -----
 
 
-HloModule ScalarUpdate
+HloModule ScalarUpdate, is_scheduled=true
 
 update_s32 (lhs: s32[], rhs: s32[]) -> s32[] {
   lhs = s32[] parameter(0)
diff --git a/third_party/xla/xla/service/gpu/tests/select_and_scatter.hlo b/third_party/xla/xla/service/gpu/tests/select_and_scatter.hlo
index c9e883e0a417e2..1f2be41ad5ab26 100644
--- a/third_party/xla/xla/service/gpu/tests/select_and_scatter.hlo
+++ b/third_party/xla/xla/service/gpu/tests/select_and_scatter.hlo
@@ -101,7 +101,7 @@
 // CHECK:         store i8 %[[VAL_63]], ptr %[[VAL_64:.*]], align 1
 // CHECK:         ret void
 
-HloModule SelectAndScatter
+HloModule SelectAndScatter, is_scheduled=true
 
 %ge_F32 (lhs.5: f32[], rhs.6: f32[]) -> pred[] {
   %lhs.5 = f32[] parameter(0)
diff --git a/third_party/xla/xla/service/gpu/tests/single_instruction.hlo b/third_party/xla/xla/service/gpu/tests/single_instruction.hlo
index 3793fb4a6fb6b9..50af63fb3a866d 100644
--- a/third_party/xla/xla/service/gpu/tests/single_instruction.hlo
+++ b/third_party/xla/xla/service/gpu/tests/single_instruction.hlo
@@ -2,42 +2,62 @@
 
 // CHECK-DAG: sqrt.approx.f32
 
-HloModule Test
+HloModule Test, is_scheduled=true
+
+fused_computation {
+  param_0 = f32[] parameter(0)
+  ROOT b.1 = f32[] sqrt(f32[] param_0)
+}
 
 ENTRY main {
   a = f32[] parameter(0)
-  ROOT b = f32[] sqrt(a)
+  ROOT wrapped_b = f32[] fusion(f32[] a), kind=kLoop, calls=fused_computation
 }
 
 // -----
 
 // CHECK-DAG: sqrt.approx.f32
 
-HloModule Test
+HloModule Test, is_scheduled=true
+
+fused_computation {
+  param_0 = f16[] parameter(0)
+  ROOT b.1 = f16[] sqrt(f16[] param_0)
+}
 
 ENTRY main {
   a = f16[] parameter(0)
-  ROOT b = f16[] sqrt(a)
+  ROOT wrapped_b = f16[] fusion(f16[] a), kind=kLoop, calls=fused_computation
 }
 
 // -----
 
 // CHECK-DAG: rsqrt.approx.f32
 
-HloModule Test
+HloModule Test, is_scheduled=true
+
+fused_computation {
+  param_0 = f32[] parameter(0)
+  ROOT b.1 = f32[] rsqrt(f32[] param_0)
+}
 
 ENTRY main {
   a = f32[] parameter(0)
-  ROOT b = f32[] rsqrt(a)
+  ROOT wrapped_b = f32[] fusion(f32[] a), kind=kLoop, calls=fused_computation
 }
 
 // -----
 
 // CHECK-DAG: ex2.approx.ftz.f32
 
-HloModule Test
+HloModule Test, is_scheduled=true
+
+fused_computation {
+  param_0 = f32[] parameter(0)
+  ROOT b.1 = f32[] exponential(f32[] param_0)
+}
 
 ENTRY main {
   a = f32[] parameter(0)
-  ROOT b = f32[] exponential(a)
+  ROOT wrapped_b = f32[] fusion(f32[] a), kind=kLoop, calls=fused_computation
 }
diff --git a/third_party/xla/xla/service/gpu/tests/slice_to_dynamic.hlo b/third_party/xla/xla/service/gpu/tests/slice_to_dynamic.hlo
index 8c2b1144e951f9..b67edac01be63b 100644
--- a/third_party/xla/xla/service/gpu/tests/slice_to_dynamic.hlo
+++ b/third_party/xla/xla/service/gpu/tests/slice_to_dynamic.hlo
@@ -65,7 +65,7 @@
 // CHECK:         br label %[[VAL_29]]
 
 
-HloModule SliceToDynamic
+HloModule SliceToDynamic, is_scheduled=true
 
 ENTRY main {
   %param = s32[2,2,2]{2,0,1} parameter(0)
diff --git a/third_party/xla/xla/service/gpu/tests/sorting.hlo b/third_party/xla/xla/service/gpu/tests/sorting.hlo
index 1248013d86293e..31e9c3dad40163 100644
--- a/third_party/xla/xla/service/gpu/tests/sorting.hlo
+++ b/third_party/xla/xla/service/gpu/tests/sorting.hlo
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/generate-test-checks.py
 // RUN: hlo_to_llvm_ir %s | FileCheck --check-prefixes=CHECK,CHECK-%{PTX} %{IR_SUBST} %s
 
-HloModule TestModule
+HloModule TestModule, is_scheduled=true
 
 compare {
   p.0.lhs = f32[] parameter(0)
@@ -291,7 +291,7 @@ ENTRY main {
 
 // -----
 
-HloModule TestModule
+HloModule TestModule, is_scheduled=true
 
 compare {
   p.0.lhs = s32[] parameter(0)
diff --git a/third_party/xla/xla/service/gpu/tests/triton_naming.hlo b/third_party/xla/xla/service/gpu/tests/triton_naming.hlo
index f9f5c20f2e6909..53be10f72e3c8a 100644
--- a/third_party/xla/xla/service/gpu/tests/triton_naming.hlo
+++ b/third_party/xla/xla/service/gpu/tests/triton_naming.hlo
@@ -2,7 +2,7 @@
 
 // CHECK: define [[KERNEL_ANNOTATION]]void @triton_gemm_r(
 
-HloModule t, entry_computation_layout={(f16[15,19]{1,0},s8[19,17]{1,0})->f16[15,17]{1,0}}
+HloModule t, is_scheduled=true, entry_computation_layout={(f16[15,19]{1,0},s8[19,17]{1,0})->f16[15,17]{1,0}}
 
 %triton_gemm_r (parameter_0: s8[19,17], parameter_1: f16[15,19]) -> f16[15,17] {
   %parameter_1 = f16[15,19]{1,0} parameter(1)

From dbba95ec7ea4ea35d891f964ae25d572d33c2543 Mon Sep 17 00:00:00 2001
From: Christian Sigg <csigg@google.com>
Date: Wed, 27 Sep 2023 04:31:30 -0700
Subject: [PATCH 315/567] [XLA:GPU] NFC: improve variable names.

PiperOrigin-RevId: 568811156
---
 .../xla/service/gpu/gpu_performance_model.cc  | 21 ++++++++++---------
 1 file changed, 11 insertions(+), 10 deletions(-)

diff --git a/third_party/xla/xla/service/gpu/gpu_performance_model.cc b/third_party/xla/xla/service/gpu/gpu_performance_model.cc
index 1c64fac75e1809..431c09c0812450 100644
--- a/third_party/xla/xla/service/gpu/gpu_performance_model.cc
+++ b/third_party/xla/xla/service/gpu/gpu_performance_model.cc
@@ -176,10 +176,10 @@ EstimateRunTimeData EstimateRunTimeImpl(
   int64_t flops = cost_analysis->flop_count(*instr);
   float bytes_written = cost_analysis->output_bytes_accessed(*instr);
   float bytes_read = cost_analysis->bytes_accessed(*instr) - bytes_written;
-  float elements_out = ShapeUtil::ElementsInRecursive(instr->shape());
+  float num_threads = ShapeUtil::ElementsInRecursive(instr->shape());
 
   const absl::Duration compute_time =
-      ComputeTime(*device_info, /*n_flops=*/flops, /*n_threads=*/elements_out);
+      ComputeTime(*device_info, /*n_flops=*/flops, /*n_threads=*/num_threads);
   const absl::Duration read_time =
       ProducerInputAccessTime(cost_analysis, *device_info, /*producer=*/instr);
   const absl::Duration write_time =
@@ -191,20 +191,20 @@ EstimateRunTimeData EstimateRunTimeImpl(
     LOG(INFO) << "FLOPs: " << flops;
     LOG(INFO) << "Bytes read: " << bytes_read;
     LOG(INFO) << "Bytes written: " << bytes_written;
-    LOG(INFO) << "Elements out: " << elements_out;
+    LOG(INFO) << "Num threads:" << num_threads;
     LOG(INFO) << "Compute time: " << compute_time;
     LOG(INFO) << "Input read time: " << read_time;
     LOG(INFO) << "Output write time: " << write_time;
   }
 
-  return {flops, bytes_written, elements_out, write_time, exec_time};
+  return {flops, bytes_written, num_threads, write_time, exec_time};
 }
 
 }  // namespace
 
 GpuPerformanceModel::RunTimes GpuPerformanceModel::EstimateRunTimes(
     const HloInstruction* producer, const GpuHloCostAnalysis* cost_analysis,
-    std::vector<HloInstruction*> fused_users, bool multi_output) {
+    std::vector<HloInstruction*> fused_consumers, bool multi_output) {
   VLOG(8) << "Producer: " << producer->name();
   if (producer->opcode() == HloOpcode::kFusion) {
     VLOG(10) << producer->fused_instructions_computation()->ToString();
@@ -215,14 +215,15 @@ GpuPerformanceModel::RunTimes GpuPerformanceModel::EstimateRunTimes(
   EstimateRunTimeData producer_data =
       EstimateRunTimeImpl(producer, cost_analysis);
 
-  int64_t fused_consumer_count = fused_users.size();
+  int64_t fused_consumer_count = fused_consumers.size();
   float total_producer_utilization = 0;
 
   absl::Duration exec_time_fused = absl::ZeroDuration();
   absl::Duration producer_output_read_time_unfused = absl::ZeroDuration();
-  for (const HloInstruction* u : fused_users) {
+  for (const HloInstruction* fused_consumer : fused_consumers) {
     const float utilization_by_this_consumer =
-        cost_analysis->operand_utilization(*u, u->operand_index(producer));
+        cost_analysis->operand_utilization(
+            *fused_consumer, fused_consumer->operand_index(producer));
     total_producer_utilization += utilization_by_this_consumer;
 
     // The model ignores consumer computation and output writes. The main goal
@@ -236,7 +237,7 @@ GpuPerformanceModel::RunTimes GpuPerformanceModel::EstimateRunTimes(
     int64_t num_threads =
         producer_data.elements_out * utilization_by_this_consumer;
     if (auto thread_count =
-            EstimateThreadCount(u, *cost_analysis->device_info_)) {
+            EstimateThreadCount(fused_consumer, *cost_analysis->device_info_)) {
       num_threads = std::min(*thread_count, num_threads);
     }
     const absl::Duration compute_time_by_this_consumer = ComputeTime(
@@ -246,7 +247,7 @@ GpuPerformanceModel::RunTimes GpuPerformanceModel::EstimateRunTimes(
 
     const absl::Duration input_access_type_by_this_consumer =
         ProducerInputAccessTime(cost_analysis, *device_info, producer,
-                                /*fused_consumer=*/u);
+                                /*fused_consumer=*/fused_consumer);
 
     exec_time_fused += std::max(compute_time_by_this_consumer,
                                 input_access_type_by_this_consumer);

From 72aa8ab84e24f66762e502ba6f39f12587e2415b Mon Sep 17 00:00:00 2001
From: pemeliya <141146080+pemeliya@users.noreply.github.com>
Date: Wed, 27 Sep 2023 06:32:14 -0700
Subject: [PATCH 316/567] PR #5936: [ROCm] Fixing stream_executor related build
 errors

Imported from GitHub PR https://github.com/openxla/xla/pull/5936

@akuegel: can you please review this change?
These are ongoing changes to fix XLA test build breaks.
Copybara import of the project:

--
bf5fa597ef24340efd6d675807c79d4eacb39b0f by Pavel Emeliyanenko <pavel.emeliyanenko@amd.com>:

fixing stream_executor related build errors

Merging this change closes #5936

PiperOrigin-RevId: 568832863
---
 third_party/xla/xla/stream_executor/gpu/BUILD  | 7 +++----
 third_party/xla/xla/stream_executor/rocm/BUILD | 5 ++---
 2 files changed, 5 insertions(+), 7 deletions(-)

diff --git a/third_party/xla/xla/stream_executor/gpu/BUILD b/third_party/xla/xla/stream_executor/gpu/BUILD
index 0da2b1e3b3033b..7ddab344718dc9 100644
--- a/third_party/xla/xla/stream_executor/gpu/BUILD
+++ b/third_party/xla/xla/stream_executor/gpu/BUILD
@@ -109,7 +109,7 @@ cc_library(
     deps = if_gpu_is_configured([
         ":gpu_driver_header",
         ":gpu_stream_header",
-        "//xla/stream_executor:event",
+        "//xla/stream_executor:stream_executor_headers",
         "@local_tsl//tsl/platform:status",
     ]),
 )
@@ -200,7 +200,6 @@ cc_library(
     deps = [
         ":gpu_driver_header",
         "//xla/stream_executor:stream_executor_headers",
-        "//xla/stream_executor:stream_executor_internal",
         "//xla/stream_executor/platform",
         "@local_tsl//tsl/platform:logging",
     ],
@@ -212,7 +211,7 @@ cc_library(
     visibility = ["//visibility:public"],
     deps = [
         ":gpu_types_header",
-        "//xla/stream_executor:stream_executor_internal",
+        "//xla/stream_executor:stream_executor_headers",
         "@com_google_absl//absl/base:core_headers",
     ],
 )
@@ -237,7 +236,7 @@ cc_library(
     deps = [
         ":gpu_driver_header",
         ":gpu_executor_header",
-        "//xla/stream_executor:stream_executor_internal",
+        "//xla/stream_executor:stream_executor_headers",
     ],
 )
 
diff --git a/third_party/xla/xla/stream_executor/rocm/BUILD b/third_party/xla/xla/stream_executor/rocm/BUILD
index e28a71f07e712c..812dae5f7d4be0 100644
--- a/third_party/xla/xla/stream_executor/rocm/BUILD
+++ b/third_party/xla/xla/stream_executor/rocm/BUILD
@@ -129,7 +129,7 @@ cc_library(
     visibility = ["//visibility:public"],
     deps = if_rocm_is_configured([
         ":rocm_kernel",
-        "//xla/stream_executor:event",
+        "//xla/stream_executor",
         "//xla/stream_executor/gpu:gpu_executor_header",
         "//xla/stream_executor/platform",
     ]),
@@ -217,7 +217,6 @@ cc_library(
         ":rocm_platform_id",
         "@eigen_archive//:eigen3",
         "//xla/stream_executor",
-        "//xla/stream_executor:event",
         "//xla/stream_executor:host_or_device_scalar",
         "//xla/stream_executor:plugin_registry",
         "//xla/stream_executor:scratch_allocator",
@@ -259,7 +258,7 @@ cc_library(
     deps = if_rocm_is_configured([
         ":hipfft_if_static",
         ":rocm_platform_id",
-        "//xla/stream_executor:event",
+        "//xla/stream_executor",
         "//xla/stream_executor:fft",
         "//xla/stream_executor:plugin_registry",
         "//xla/stream_executor:scratch_allocator",

From 44c0a47ad5ad1b8978723e8ca5944568bd46eea8 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 27 Sep 2023 08:29:46 -0700
Subject: [PATCH 317/567] Integrate LLVM at llvm/llvm-project@aecb58005c69

Updates LLVM usage to match
[aecb58005c69](https://github.com/llvm/llvm-project/commit/aecb58005c69)

PiperOrigin-RevId: 568858383
---
 third_party/llvm/generated.patch              | 47 +++++++++++++++++++
 third_party/llvm/workspace.bzl                |  4 +-
 .../tests/Dialect/gml_st/greedy_fusion.mlir   | 22 ++++-----
 .../mlir_hlo/tests/Dialect/thlo/tiling.mlir   | 16 +++----
 4 files changed, 68 insertions(+), 21 deletions(-)

diff --git a/third_party/llvm/generated.patch b/third_party/llvm/generated.patch
index 509398da979e83..aea3ea01bf42bf 100644
--- a/third_party/llvm/generated.patch
+++ b/third_party/llvm/generated.patch
@@ -1 +1,48 @@
 Auto generated patch. Do not edit or delete it, even if empty.
+diff -ruN --strip-trailing-cr a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
++++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+@@ -5889,9 +5889,15 @@
+     for (const unsigned E = G.size();
+          BestIdx < E && (G[BestIdx].second == TargetLowering::C_Other ||
+                          G[BestIdx].second == TargetLowering::C_Immediate);
+-         ++BestIdx)
++         ++BestIdx) {
+       if (lowerImmediateIfPossible(G[BestIdx], Op, DAG, *this))
+         break;
++      // If we're out of constraints, just pick the first one.
++      if (BestIdx + 1 == E) {
++        BestIdx = 0;
++        break;
++      }
++    }
+ 
+     OpInfo.ConstraintCode = G[BestIdx].first;
+     OpInfo.ConstraintType = G[BestIdx].second;
+diff -ruN --strip-trailing-cr a/llvm/test/CodeGen/X86/inline-asm-bad-constraint-n.ll b/llvm/test/CodeGen/X86/inline-asm-bad-constraint-n.ll
+--- a/llvm/test/CodeGen/X86/inline-asm-bad-constraint-n.ll
++++ b/llvm/test/CodeGen/X86/inline-asm-bad-constraint-n.ll
+@@ -8,3 +8,9 @@
+   call void asm sideeffect "foo $0", "n"(ptr %a) nounwind
+   ret void
+ }
++
++; CHECK: error: invalid operand for inline asm constraint 'i'
++define void @bar(i32 %v) {
++  call void asm "", "in"(i32 %v)
++  ret void
++}
+diff -ruN --strip-trailing-cr a/llvm/test/CodeGen/X86/inline-asm-n-constraint.ll b/llvm/test/CodeGen/X86/inline-asm-n-constraint.ll
+--- a/llvm/test/CodeGen/X86/inline-asm-n-constraint.ll
++++ b/llvm/test/CodeGen/X86/inline-asm-n-constraint.ll
+@@ -8,6 +8,10 @@
+ ; CHECK:      #APP
+ ; CHECK-NEXT: foo    $42
+ ; CHECK-NEXT: #NO_APP
++  call void asm "# $0", "in"(i32 1392848979)
++; CHECK-NEXT: #APP
++; CHECK-NEXT: # $1392848979
++; CHECK-NEXT: #NO_APP
+   ret void
+ ; CHECK-NEXT: retq
+ }
diff --git a/third_party/llvm/workspace.bzl b/third_party/llvm/workspace.bzl
index 7879bbfdad7a0e..7d4569adf21255 100644
--- a/third_party/llvm/workspace.bzl
+++ b/third_party/llvm/workspace.bzl
@@ -4,8 +4,8 @@ load("//third_party:repo.bzl", "tf_http_archive")
 
 def repo(name):
     """Imports LLVM."""
-    LLVM_COMMIT = "69ba4aed004dd96dcf254e1ee19bd5d3875c0d09"
-    LLVM_SHA256 = "bcf5d0f7b9a8d49cbb9f296a02d87be47cc64bbbd3aa45b6aa356c5adcd85e01"
+    LLVM_COMMIT = "aecb58005c692ed756d748437896498b124e9d45"
+    LLVM_SHA256 = "9d5f9be58aa8dcde7b8b4b454ac2b5c92531b5ecf61254438e70ac772efac607"
 
     tf_http_archive(
         name = name,
diff --git a/third_party/xla/xla/mlir_hlo/tests/Dialect/gml_st/greedy_fusion.mlir b/third_party/xla/xla/mlir_hlo/tests/Dialect/gml_st/greedy_fusion.mlir
index 781836a3120508..f54c4ed7174932 100644
--- a/third_party/xla/xla/mlir_hlo/tests/Dialect/gml_st/greedy_fusion.mlir
+++ b/third_party/xla/xla/mlir_hlo/tests/Dialect/gml_st/greedy_fusion.mlir
@@ -21,7 +21,7 @@ transform.sequence failures(propagate) {
   ^bb0(%arg1: !transform.any_op):
     %0 = transform.structured.match ops{["linalg.map"]} in %arg1
       : (!transform.any_op) -> !transform.any_op
-    %forall_op, %tiled_op = transform.structured.tile_to_forall_op %0 num_threads [10, 20] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+    %forall_op, %tiled_op = transform.structured.tile_using_forall %0 num_threads [10, 20] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
 }
 
 // CHECK:      %[[INIT:.*]] = tensor.empty()
@@ -65,7 +65,7 @@ transform.sequence failures(propagate) {
   ^bb0(%arg1: !transform.any_op):
     %0 = transform.structured.match ops{["linalg.map"]} in %arg1
       : (!transform.any_op) -> !transform.any_op
-    %loop, %1 = transform.structured.tile_to_forall_op %0 tile_sizes [0, 2] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+    %loop, %1 = transform.structured.tile_using_forall %0 tile_sizes [0, 2] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
 }
 
 // CHECK: tensor.empty
@@ -96,7 +96,7 @@ transform.sequence failures(propagate) {
   ^bb0(%arg1: !transform.any_op):
     %0 = transform.structured.match ops{["linalg.map"]} in %arg1
       : (!transform.any_op) -> !transform.any_op
-    %loop, %1 = transform.structured.tile_to_forall_op %0 tile_sizes [2] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+    %loop, %1 = transform.structured.tile_using_forall %0 tile_sizes [2] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
 }
 
 // CHECK:      %[[INIT:.*]] = tensor.empty()
@@ -179,7 +179,7 @@ transform.sequence failures(propagate) {
     %0 = transform.structured.match ops{["linalg.map"]}
                                     attributes{op_label="root"} in %arg1
       : (!transform.any_op) -> !transform.any_op
-    %loop, %1 = transform.structured.tile_to_forall_op %0 tile_sizes [1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+    %loop, %1 = transform.structured.tile_using_forall %0 tile_sizes [1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
 }
 
 // -----
@@ -210,7 +210,7 @@ transform.sequence failures(propagate) {
     %0 = transform.structured.match ops{["linalg.map"]}
                                     attributes{op_label="root"} in %arg1
       : (!transform.any_op) -> !transform.any_op
-    %loop, %1 = transform.structured.tile_to_forall_op %0 tile_sizes [1, 8] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+    %loop, %1 = transform.structured.tile_using_forall %0 tile_sizes [1, 8] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
 }
 
 // CHECK-LABEL: func @fuse_reshape_middle_unit_dim_map
@@ -254,7 +254,7 @@ transform.sequence failures(propagate) {
     %0 = transform.structured.match ops{["linalg.map"]}
                                     attributes{op_label="root"} in %arg1
       : (!transform.any_op) -> !transform.any_op
-    %loop, %1 = transform.structured.tile_to_forall_op %0 tile_sizes [1, 8] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+    %loop, %1 = transform.structured.tile_using_forall %0 tile_sizes [1, 8] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
 }
 
 // CHECK-LABEL: func @fuse_reshape_trailing_unit_dim_map
@@ -298,7 +298,7 @@ transform.sequence failures(propagate) {
     %0 = transform.structured.match ops{["linalg.map"]}
                                     attributes{op_label="root"} in %arg1
       : (!transform.any_op) -> !transform.any_op
-    %loop, %1 = transform.structured.tile_to_forall_op %0 tile_sizes [1, 8] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+    %loop, %1 = transform.structured.tile_using_forall %0 tile_sizes [1, 8] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
 }
 
 // CHECK-LABEL: func @fuse_reshape_leading_unit_dim_map
@@ -342,7 +342,7 @@ transform.sequence failures(propagate) {
     %0 = transform.structured.match ops{["linalg.map"]}
                                     attributes{op_label="root"} in %arg1
       : (!transform.any_op) -> !transform.any_op
-    %loop, %1 = transform.structured.tile_to_forall_op %0 tile_sizes [1, 8] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+    %loop, %1 = transform.structured.tile_using_forall %0 tile_sizes [1, 8] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
 }
 
 // CHECK-LABEL: func @fuse_reshape_multiple_unit_dims_map
@@ -386,7 +386,7 @@ transform.sequence failures(propagate) {
     %0 = transform.structured.match ops{["linalg.map"]}
                                     attributes{op_label="root"} in %arg1
       : (!transform.any_op) -> !transform.any_op
-    %loop, %1 = transform.structured.tile_to_forall_op %0 tile_sizes [1, 8] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+    %loop, %1 = transform.structured.tile_using_forall %0 tile_sizes [1, 8] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
 }
 
 // CHECK-LABEL: func @fuse_reshape_reassoc_only_unit_dims_map
@@ -430,7 +430,7 @@ transform.sequence failures(propagate) {
     %0 = transform.structured.match ops{["linalg.map"]}
                                     attributes{op_label="root"} in %arg1
       : (!transform.any_op) -> !transform.any_op
-    %loop, %1 = transform.structured.tile_to_forall_op %0 tile_sizes [1, 8] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+    %loop, %1 = transform.structured.tile_using_forall %0 tile_sizes [1, 8] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
 }
 
 // CHECK-LABEL: func @do_not_fuse_collapse_shape
@@ -475,7 +475,7 @@ transform.sequence failures(propagate) {
     %0 = transform.structured.match ops{["linalg.map"]}
                                     attributes{op_label="root"} in %arg1
       : (!transform.any_op) -> !transform.any_op
-    %loop, %1 = transform.structured.tile_to_forall_op %0 tile_sizes [1, 8] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+    %loop, %1 = transform.structured.tile_using_forall %0 tile_sizes [1, 8] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
 }
 
 // CHECK-LABEL: func @do_not_fuse_expand_shape
diff --git a/third_party/xla/xla/mlir_hlo/tests/Dialect/thlo/tiling.mlir b/third_party/xla/xla/mlir_hlo/tests/Dialect/thlo/tiling.mlir
index f9c838d764bc38..6709cd91100068 100644
--- a/third_party/xla/xla/mlir_hlo/tests/Dialect/thlo/tiling.mlir
+++ b/third_party/xla/xla/mlir_hlo/tests/Dialect/thlo/tiling.mlir
@@ -12,7 +12,7 @@ transform.sequence failures(propagate) {
   ^bb0(%arg1: !pdl.operation):
     %0 = transform.structured.match ops{["thlo.dynamic_broadcast_in_dim"]} in %arg1
       : (!pdl.operation) -> !pdl.operation
-    %1, %loops:2 = transform.structured.tile %0 [256, 512]
+    %1, %loops:2 = transform.structured.tile_using_for %0 [256, 512]
       : (!pdl.operation) -> (!pdl.operation, !pdl.operation, !pdl.operation)
 }
 
@@ -75,7 +75,7 @@ transform.sequence failures(propagate) {
   ^bb0(%arg1: !pdl.operation):
     %0 = transform.structured.match ops{["thlo.scatter"]} in %arg1
       : (!pdl.operation) -> !pdl.operation
-    %1, %loop = transform.structured.tile %0 [1]
+    %1, %loop = transform.structured.tile_using_for %0 [1]
       : (!pdl.operation) -> (!pdl.operation, !pdl.operation)
 }
 
@@ -125,7 +125,7 @@ transform.sequence failures(propagate) {
   ^bb0(%arg1: !pdl.operation):
     %0 = transform.structured.match ops{["thlo.gather"]} in %arg1
       : (!pdl.operation) -> !pdl.operation
-    %1, %loop = transform.structured.tile %0 [1]
+    %1, %loop = transform.structured.tile_using_for %0 [1]
       : (!pdl.operation) -> (!pdl.operation, !pdl.operation)
 }
 
@@ -167,7 +167,7 @@ transform.sequence failures(propagate) {
   ^bb0(%arg1: !pdl.operation):
     %0 = transform.structured.match ops{["thlo.concatenate"]} in %arg1
       : (!pdl.operation) -> !pdl.operation
-    %1, %loops:2 = transform.structured.tile %0 [256, 512]
+    %1, %loops:2 = transform.structured.tile_using_for %0 [256, 512]
       : (!pdl.operation) -> (!pdl.operation, !pdl.operation, !pdl.operation)
 }
 
@@ -243,7 +243,7 @@ transform.sequence failures(propagate) {
   ^bb0(%arg1: !pdl.operation):
     %0 = transform.structured.match ops{["thlo.sort"]} in %arg1
       : (!pdl.operation) -> !pdl.operation
-    %1, %loops:2 = transform.structured.tile %0 [256, 512]
+    %1, %loops:2 = transform.structured.tile_using_for %0 [256, 512]
       : (!pdl.operation) -> (!pdl.operation, !pdl.operation, !pdl.operation)
 }
 
@@ -323,7 +323,7 @@ transform.sequence failures(propagate) {
   ^bb0(%arg1: !pdl.operation):
     %0 = transform.structured.match ops{["thlo.sort"]} in %arg1
       : (!pdl.operation) -> !pdl.operation
-    %1, %loops:2 = transform.structured.tile %0 [256, 512]
+    %1, %loops:2 = transform.structured.tile_using_for %0 [256, 512]
       : (!pdl.operation) -> (!pdl.operation, !pdl.operation, !pdl.operation)
 }
 
@@ -344,7 +344,7 @@ transform.sequence failures(propagate) {
   ^bb0(%arg1: !pdl.operation):
     %0 = transform.structured.match ops{["thlo.reverse"]} in %arg1
       : (!pdl.operation) -> !pdl.operation
-    %1, %loop = transform.structured.tile %0 [10]
+    %1, %loop = transform.structured.tile_using_for %0 [10]
       : (!pdl.operation) -> (!pdl.operation, !pdl.operation)
 }
 
@@ -380,7 +380,7 @@ transform.sequence failures(propagate) {
   ^bb0(%arg1: !pdl.operation):
     %0 = transform.structured.match ops{["thlo.reverse"]} in %arg1
       : (!pdl.operation) -> !pdl.operation
-    %1, %loops:2 = transform.structured.tile %0 [256, 512]
+    %1, %loops:2 = transform.structured.tile_using_for %0 [256, 512]
       : (!pdl.operation) -> (!pdl.operation, !pdl.operation, !pdl.operation)
 }
 

From c57315017a07a1750193b8dbe510efdc75dc9a28 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 27 Sep 2023 09:03:10 -0700
Subject: [PATCH 318/567] Update TFRT dependency to use revision
 http://github.com/tensorflow/runtime/commit/9d58a7aefd3b0b184bfc30be383bdbe17c5233ae.

PiperOrigin-RevId: 568866604
---
 third_party/tf_runtime/workspace.bzl                          | 4 ++--
 third_party/xla/third_party/tf_runtime/workspace.bzl          | 4 ++--
 .../xla/third_party/tsl/third_party/tf_runtime/workspace.bzl  | 4 ++--
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/third_party/tf_runtime/workspace.bzl b/third_party/tf_runtime/workspace.bzl
index f0af407c117416..d6c8f2e2431d7b 100644
--- a/third_party/tf_runtime/workspace.bzl
+++ b/third_party/tf_runtime/workspace.bzl
@@ -6,8 +6,8 @@ def repo():
     """Imports TFRT."""
 
     # Attention: tools parse and update these lines.
-    TFRT_COMMIT = "ad4ba135f9a5ca24862daf7c93e4a3b6e53c69c8"
-    TFRT_SHA256 = "f523256aa10d35e9c2f8c2d7e5674e8496aef7fd2a6547942696462d0d0497f6"
+    TFRT_COMMIT = "9d58a7aefd3b0b184bfc30be383bdbe17c5233ae"
+    TFRT_SHA256 = "f0bd547a2a36e4866af3101c9f3c7da39e66fe77ead83bf8b2fdc1c9b5674ce0"
 
     tf_http_archive(
         name = "tf_runtime",
diff --git a/third_party/xla/third_party/tf_runtime/workspace.bzl b/third_party/xla/third_party/tf_runtime/workspace.bzl
index f0af407c117416..d6c8f2e2431d7b 100644
--- a/third_party/xla/third_party/tf_runtime/workspace.bzl
+++ b/third_party/xla/third_party/tf_runtime/workspace.bzl
@@ -6,8 +6,8 @@ def repo():
     """Imports TFRT."""
 
     # Attention: tools parse and update these lines.
-    TFRT_COMMIT = "ad4ba135f9a5ca24862daf7c93e4a3b6e53c69c8"
-    TFRT_SHA256 = "f523256aa10d35e9c2f8c2d7e5674e8496aef7fd2a6547942696462d0d0497f6"
+    TFRT_COMMIT = "9d58a7aefd3b0b184bfc30be383bdbe17c5233ae"
+    TFRT_SHA256 = "f0bd547a2a36e4866af3101c9f3c7da39e66fe77ead83bf8b2fdc1c9b5674ce0"
 
     tf_http_archive(
         name = "tf_runtime",
diff --git a/third_party/xla/third_party/tsl/third_party/tf_runtime/workspace.bzl b/third_party/xla/third_party/tsl/third_party/tf_runtime/workspace.bzl
index f0af407c117416..d6c8f2e2431d7b 100644
--- a/third_party/xla/third_party/tsl/third_party/tf_runtime/workspace.bzl
+++ b/third_party/xla/third_party/tsl/third_party/tf_runtime/workspace.bzl
@@ -6,8 +6,8 @@ def repo():
     """Imports TFRT."""
 
     # Attention: tools parse and update these lines.
-    TFRT_COMMIT = "ad4ba135f9a5ca24862daf7c93e4a3b6e53c69c8"
-    TFRT_SHA256 = "f523256aa10d35e9c2f8c2d7e5674e8496aef7fd2a6547942696462d0d0497f6"
+    TFRT_COMMIT = "9d58a7aefd3b0b184bfc30be383bdbe17c5233ae"
+    TFRT_SHA256 = "f0bd547a2a36e4866af3101c9f3c7da39e66fe77ead83bf8b2fdc1c9b5674ce0"
 
     tf_http_archive(
         name = "tf_runtime",

From a4b033e9147d4b5bc2fd1691c8c94772bb4557ac Mon Sep 17 00:00:00 2001
From: Shixin Li <shixinli@google.com>
Date: Wed, 27 Sep 2023 09:29:29 -0700
Subject: [PATCH 319/567] Rename `LoadSerializedExecutable` to `LoadSerialized`
 to avoid overriding the method.

PiperOrigin-RevId: 568873573
---
 third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_client.cc       | 9 ++++-----
 third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_client.h        | 6 ++++--
 .../xla/xla/pjrt/gpu/se_gpu_pjrt_compiler_aot_test.cc    | 4 ++--
 3 files changed, 10 insertions(+), 9 deletions(-)

diff --git a/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_client.cc b/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_client.cc
index 733942386f645c..7ffc7e263e6a2f 100644
--- a/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_client.cc
+++ b/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_client.cc
@@ -554,9 +554,9 @@ StatusOr<std::unique_ptr<StreamExecutorUnloadedExecutable>> FromProto(
 }  // namespace
 
 StatusOr<std::unique_ptr<PjRtLoadedExecutable>>
-StreamExecutorGpuClient::LoadSerializedExecutable(
-    absl::string_view serialized, std::optional<CompileOptions> options,
-    const LoadOptions& load_options) {
+StreamExecutorGpuClient::LoadSerialized(absl::string_view serialized,
+                                        std::optional<CompileOptions> options,
+                                        const LoadOptions& load_options) {
 #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
   StreamExecutorUnloadedExecutableProto proto;
   if (serialized.size() > std::numeric_limits<int>::max()) {
@@ -573,8 +573,7 @@ StreamExecutorGpuClient::LoadSerializedExecutable(
   // TODO(b/296466237): Unify the `Load` method.
   return Load(std::move(se_executable));
 #endif
-  return absl::InternalError(
-      "LoadSerializedExecutable only works with cuda or rocm.");
+  return absl::InternalError("LoadSerialized only works with cuda or rocm.");
 }
 
 std::vector<std::unique_ptr<PjRtStreamExecutorDevice>> BuildLocalDevices(
diff --git a/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_client.h b/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_client.h
index f7b7e703beb2ce..bb4044f667be7d 100644
--- a/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_client.h
+++ b/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_client.h
@@ -203,9 +203,11 @@ class StreamExecutorGpuClient : public xla::PjRtStreamExecutorClient {
   StatusOr<std::unique_ptr<PjRtLoadedExecutable>> Load(
       std::unique_ptr<PjRtExecutable> executable);
 
-  StatusOr<std::unique_ptr<PjRtLoadedExecutable>> LoadSerializedExecutable(
+  // TODO(b/296466237): Unify `LoadSerializedExecutable` after fixing existing
+  // tests.
+  StatusOr<std::unique_ptr<PjRtLoadedExecutable>> LoadSerialized(
       absl::string_view serialized, std::optional<CompileOptions> options,
-      const LoadOptions& load_options) override;
+      const LoadOptions& load_options);
 
  private:
   xla::StreamExecutorGpuTopologyDescription topology_;
diff --git a/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_compiler_aot_test.cc b/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_compiler_aot_test.cc
index d3063d4b27be5b..42f28daf377447 100644
--- a/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_compiler_aot_test.cc
+++ b/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_compiler_aot_test.cc
@@ -188,8 +188,8 @@ TEST(StreamExecutorGpuCompilerTest, SuccessLoadFromSerializedExecutable) {
                           executable->SerializeExecutable());
   TF_ASSERT_OK_AND_ASSIGN(
       auto loaded_executable,
-      se_client->LoadSerializedExecutable(serialized_executable, std::nullopt,
-                                          LoadOptions()));
+      se_client->LoadSerialized(serialized_executable, std::nullopt,
+                                LoadOptions()));
 
   TF_ASSERT_OK_AND_ASSIGN(
       auto result, loaded_executable->Execute(/*argument_handles=*/{{}}, {}));

From 7f3a8b5f5b8e56b5893895d1908d2a3726c60b76 Mon Sep 17 00:00:00 2001
From: Fergus Henderson <fergus@google.com>
Date: Wed, 27 Sep 2023 09:34:05 -0700
Subject: [PATCH 320/567] Fix build breakage introduced in CL/564547199.

That CL renamed the GetOpenGlBufferReadView method
as GetOpenGlBuffer but did not make the corresponding change
to the unit test, leading to a build error.

PiperOrigin-RevId: 568874896
---
 tensorflow/lite/delegates/gpu/async_buffers_test.cc | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tensorflow/lite/delegates/gpu/async_buffers_test.cc b/tensorflow/lite/delegates/gpu/async_buffers_test.cc
index 9b0212dcc6fdb1..2f51e408661358 100644
--- a/tensorflow/lite/delegates/gpu/async_buffers_test.cc
+++ b/tensorflow/lite/delegates/gpu/async_buffers_test.cc
@@ -51,13 +51,13 @@ TEST(AsyncBufferTest, DuplicateTest) {
   EXPECT_OK(gl::EglEnvironment::NewEglEnvironment(&env));
   AsyncBuffer async_buffer1 = AsyncBuffer(*tie, ahwb);
   GLuint buffer1, buffer2;
-  EXPECT_OK(async_buffer1.GetOpenGlBufferReadView(buffer1));
+  EXPECT_OK(async_buffer1.GetOpenGlBuffer(buffer1));
   EXPECT_GE(buffer1, 0);
-  EXPECT_OK(async_buffer1.GetOpenGlBufferReadView(buffer2));
+  EXPECT_OK(async_buffer1.GetOpenGlBuffer(buffer2));
   // Check that each instance of AsyncBuffer class has only one id
   EXPECT_EQ(buffer1, buffer2);
   AsyncBuffer async_buffer2 = AsyncBuffer(*tie, ahwb);
-  EXPECT_OK(async_buffer2.GetOpenGlBufferReadView(buffer2));
+  EXPECT_OK(async_buffer2.GetOpenGlBuffer(buffer2));
   // Check that each different instance will produce unique id
   EXPECT_NE(buffer1, buffer2);
 }

From 3e337d7165f2236487a669622e4520d847fead11 Mon Sep 17 00:00:00 2001
From: Johannes Reifferscheid <jreiffers@google.com>
Date: Wed, 27 Sep 2023 09:45:21 -0700
Subject: [PATCH 321/567] Add a test case for fusion of convert into bitcast.

Bug found by shyshkov@google.com and kramerb@google.com

PiperOrigin-RevId: 568877930
---
 .../xla/service/gpu/priority_fusion_test.cc   | 29 +++++++++++++++++++
 1 file changed, 29 insertions(+)

diff --git a/third_party/xla/xla/service/gpu/priority_fusion_test.cc b/third_party/xla/xla/service/gpu/priority_fusion_test.cc
index d72319fbac8982..0ca74743644a81 100644
--- a/third_party/xla/xla/service/gpu/priority_fusion_test.cc
+++ b/third_party/xla/xla/service/gpu/priority_fusion_test.cc
@@ -109,5 +109,34 @@ TEST_F(PriorityFusionTest, FusionFusionWithDuplication) {
                                         m::Fusion(m::Parameter(0)))));
 }
 
+TEST_F(PriorityFusionTest, FuseWideningConvertIntoConsumers) {
+  // Because of the bitcast consumer, the convert is currently fused only with
+  // the log, resulting in three fusions:
+  //   1. log + convert
+  //   2. multiply
+  //   3. bitcast
+  // This is a bug.
+  absl::string_view kHlo = R"(
+    HloModule test_module
+
+    ENTRY main {
+      p = f16[512]{0} parameter(0)
+      l = f16[512]{0} log(p)
+      c = f32[512]{0} convert(l)
+      s = f32[512]{0} multiply(c, c)
+      bc = s32[512]{0} bitcast(c)
+      ROOT t = (f32[512], s32[512]) tuple(s, bc)
+    })";
+
+  RunAndFilecheckHloRewrite(kHlo, std::move(priority_fusion_), R"(
+CHECK:      ENTRY
+CHECK-NEXT: %[[PARAM:.*]] = f16[512]{0} parameter(0)
+CHECK-NEXT: %[[FUSION:.*]] = f32[512]{0} fusion(%[[PARAM]])
+CHECK-NEXT: %[[MUL:.*]] = f32[512]{0} multiply(%[[FUSION]], %[[FUSION]])
+CHECK-NEXT: %[[BITCAST:.*]] = s32[512]{0} bitcast(%[[FUSION]])
+CHECK-NEXT: ROOT %{{.*}} = (f32[512]{0}, s32[512]{0}) tuple(%[[MUL]], %[[BITCAST]])
+  )");
+}
+
 }  // namespace gpu
 }  // namespace xla

From 457b3ddb9648cc2362ded1ac18cea67b96887c79 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 27 Sep 2023 10:18:46 -0700
Subject: [PATCH 322/567] Support partially-known shapes in
 `SmartBroadcastGradientArgs`.

This version is able to infer static reduction axes in cases like `[?, ?, ?] x []`, `[1, ?] x [?, 1]`, etc.

Since the logic is now fully in Python (i.e. may not trigger expensive constant folding) and relatively cheap, the cache used in the previous version is no longer needed.

PiperOrigin-RevId: 568888492
---
 tensorflow/python/framework/ops.py            |   4 -
 .../kernel_tests/math_ops/cwise_ops_test.py   |   3 +-
 tensorflow/python/ops/BUILD                   |   5 +-
 tensorflow/python/ops/math_grad.py            | 156 +++++++++---------
 tensorflow/python/ops/math_grad_test.py       |  29 ++++
 5 files changed, 111 insertions(+), 86 deletions(-)

diff --git a/tensorflow/python/framework/ops.py b/tensorflow/python/framework/ops.py
index 5926125e3fdc25..622208c996fa71 100644
--- a/tensorflow/python/framework/ops.py
+++ b/tensorflow/python/framework/ops.py
@@ -2052,10 +2052,6 @@ def __init__(self):
 
     # Cache for OpDef protobufs retrieved via the C API.
     self._op_def_cache = {}
-    # Cache for constant results of `broadcast_gradient_args()`. The keys are
-    # tuples of fully-defined shapes: (x_shape_tuple, y_shape_tuple), and the
-    # values are tuples of reduction indices: (rx, ry).
-    self._bcast_grad_args_cache = {}
     # Cache for constant results of `reduced_shape()`. The keys are pairs of
     # tuples: (input_shape_tuple, reduction_indices_tuple), and the values
     # are pairs of tuples: (output_shape_kept_dims, tile_scaling).
diff --git a/tensorflow/python/kernel_tests/math_ops/cwise_ops_test.py b/tensorflow/python/kernel_tests/math_ops/cwise_ops_test.py
index 5a439dc2061d71..d8d7d28af280bc 100644
--- a/tensorflow/python/kernel_tests/math_ops/cwise_ops_test.py
+++ b/tensorflow/python/kernel_tests/math_ops/cwise_ops_test.py
@@ -1223,7 +1223,8 @@ def _compareGradient(self, x):
               math_ops.square(math_ops.imag(cplx)))
       epsilon = 1e-3
       jacob_t, jacob_n = gradient_checker.compute_gradient(
-          inx, list(x.shape), loss, [1], x_init_value=x, delta=epsilon)
+          inx, list(x.shape), loss, list(loss.shape), x_init_value=x,
+          delta=epsilon)
     self.assertAllClose(jacob_t, jacob_n, rtol=epsilon, atol=epsilon)
 
   def _compareBroadcastGradient(self, x):
diff --git a/tensorflow/python/ops/BUILD b/tensorflow/python/ops/BUILD
index d9c0bceb0f35b7..7bc37182165395 100644
--- a/tensorflow/python/ops/BUILD
+++ b/tensorflow/python/ops/BUILD
@@ -1,7 +1,7 @@
 load("//tensorflow:strict.default.bzl", "py_strict_library", "py_strict_test")
 load("//tensorflow:tensorflow.default.bzl", "cuda_py_strict_test", "tf_py_strict_test")
-load("//tensorflow/python:build_defs.bzl", "tf_gen_op_strict_wrapper_private_py")
 load("//tensorflow/core/platform:build_config_root.bzl", "tf_additional_xla_deps_py")
+load("//tensorflow/python:build_defs.bzl", "tf_gen_op_strict_wrapper_private_py")
 
 visibility = [
     "//engedu/ml/tf_from_scratch:__pkg__",
@@ -3562,15 +3562,18 @@ cuda_py_strict_test(
         ":gradient_checker",
         ":gradient_checker_v2",
         ":gradients",
+        ":math_grad",
         ":math_ops",
         "//tensorflow/python/eager:backprop",
         "//tensorflow/python/eager:context",
         "//tensorflow/python/framework:constant_op",
         "//tensorflow/python/framework:dtypes",
         "//tensorflow/python/framework:ops",
+        "//tensorflow/python/framework:tensor_shape",
         "//tensorflow/python/framework:test_lib",
         "//tensorflow/python/platform:client_testlib",
         "//third_party/py/numpy",
+        "@absl_py//absl/testing:parameterized",
     ],
 )
 
diff --git a/tensorflow/python/ops/math_grad.py b/tensorflow/python/ops/math_grad.py
index d702d7fad03c19..3a120565a1603b 100644
--- a/tensorflow/python/ops/math_grad.py
+++ b/tensorflow/python/ops/math_grad.py
@@ -57,75 +57,71 @@ def _EuclideanNormGrad(op: ops.Operation, grad):
   return math_ops.truediv(op.inputs[0], output / grad), None
 
 
-def SmartBroadcastGradientArgs(x, y, grad):
-  """Optimized version of `broadcast_gradient_args` that caches results.
-
-  This implementation avoids creating `broadcast_gradient_args` ops in the case
-  that the input shapes are fully defined, and provides hints to the calling
-  code that can be used to avoid creating reduction and reshaping ops.
+def SmartBroadcastGradientArgs(x, y, grad=None):
+  """Version of `BroadcastGradientArgs` optimized for partially-known shapes.
 
   Args:
-    x: The left input tensor to a broadcasting binary op.
-    y: The right input tensor to a broadcasting binary op.
-    grad: The incoming gradient tensor for a broadcasting binary op.
+    x: The first argument of a broadcasting binary op.
+    y: The second argument of a broadcasting binary op.
+    grad: Deprecated.
 
   Returns:
-    A pair of tuples, containing:
-      * A 3-tuple of broadcast information for x, containing:
-        * The shape of x (as a tuple or Tensor).
-        * The reduction indices for x (as a tuple or Tensor).
-        * A boolean, which if True, indicates that x's shape differs from grad's
-          shape (and so x's gradient must be reduced and/or reshaped).
-      * A 3-tuple of broadcast information for y, containing the respective
-        details for y.
+    A pair of triples, one per argument with
+      * Shape of the argument (tensor);
+      * Reduction axes for the argument (list or tensor);
+      * Boolean indicating whether the reduction must be applied.
   """
-  # NOTE: It may be productive to apply these optimizations in the eager case
-  # as well.
-  if context.executing_eagerly() or not (
-      isinstance(x, tensor.Tensor) and isinstance(y, tensor.Tensor)
-      and isinstance(grad, tensor.Tensor)):
-    sx = array_ops.shape(x)
-    sy = array_ops.shape(y)
-    rx, ry = gen_array_ops.broadcast_gradient_args(sx, sy)
-    return (sx, rx, True), (sy, ry, True)
-
-  # pylint: disable=protected-access
-  x_shape_tuple = x._shape_tuple()
-  y_shape_tuple = y._shape_tuple()
-  grad_shape_tuple = grad._shape_tuple()
-  # pylint: enable=protected-access
-
-  if (x_shape_tuple is None or None in x_shape_tuple or
-      y_shape_tuple is None or None in y_shape_tuple):
-    sx = array_ops.shape_internal(x, optimize=False)
-    sy = array_ops.shape_internal(y, optimize=False)
-    rx, ry = gen_array_ops.broadcast_gradient_args(sx, sy)
-    return (sx, rx, True), (sy, ry, True)
-
-  x_needs_reduction = x_shape_tuple != grad_shape_tuple
-  y_needs_reduction = y_shape_tuple != grad_shape_tuple
-
-  # Get the default graph rather than relying on `x.graph`, `y.graph`, or
-  # `grad.graph`, because these may be eager tensors.
-  g = ops.get_default_graph()
+  del grad
+  x_shape = array_ops.shape(x)
+  y_shape = array_ops.shape(y)
+
+  if (not context.executing_eagerly() and
+      isinstance(x, tensor.Tensor) and
+      isinstance(y, tensor.Tensor)):
+    x_axes, y_axes = _InferGradientReductionAxes(x.shape, y.shape)
+  else:
+    x_axes, y_axes = None, None
 
-  try:
-    rx, ry = g._bcast_grad_args_cache[(x_shape_tuple, y_shape_tuple)]  # pylint: disable=protected-access
-    return (x_shape_tuple, rx, x_needs_reduction), (
-        y_shape_tuple, ry, y_needs_reduction)
-  except KeyError:
-    rx, ry = array_ops.broadcast_gradient_args(x_shape_tuple, y_shape_tuple)
-    # TODO(mrry): If this becomes a bottleneck, add a multi-output version of
-    # `TF_TryEvaluateConstant()`.
-    rx_value = tuple(tensor_util.try_evaluate_constant(rx))
-    assert rx_value is not None
-    ry_value = tuple(tensor_util.try_evaluate_constant(ry))
-    assert ry_value is not None
-    g._bcast_grad_args_cache[(x_shape_tuple, y_shape_tuple)] = (  # pylint: disable=protected-access
-        rx_value, ry_value)
-
-    return (x_shape_tuple, rx_value, x_needs_reduction), (
-        y_shape_tuple, ry_value, y_needs_reduction)
+  if x_axes is None or y_axes is None:
+    # NOTE: In graph mode, this is never exercised for statically known shapes.
+    x_axes, y_axes = gen_array_ops.broadcast_gradient_args(x_shape, y_shape)
+    x_must_reduce = True
+    y_must_reduce = True
+  else:
+    x_must_reduce = x_axes or x.shape.rank < y.shape.rank
+    y_must_reduce = y_axes or y.shape.rank < x.shape.rank
+
+  return (x_shape, x_axes, x_must_reduce), (y_shape, y_axes, y_must_reduce)
+
+
+def _InferGradientReductionAxes(x_shape, y_shape):
+  """Infers the sets of axes that might have been broadcasted."""
+  x_rank = x_shape.rank
+  y_rank = y_shape.rank
+  if x_rank is None or y_rank is None:
+    return None, None
+
+  # Convert shapes for V1 compatibility, can be omitted in V2.
+  x_shape = x_shape.as_list()
+  y_shape = y_shape.as_list()
+
+  b_rank = max(x_rank, y_rank)
+  x_axes = []
+  y_axes = []
+  for axis in range(b_rank):
+    x_dim = 1 if axis < b_rank - x_rank else x_shape[axis - (b_rank - x_rank)]
+    y_dim = 1 if axis < b_rank - y_rank else y_shape[axis - (b_rank - y_rank)]
+    if x_dim == 1 and y_dim != 1:
+      # It's safe to assume that x_dim was broadcasted.
+      x_axes.append(axis)
+    elif y_dim == 1 and x_dim != 1:
+      # It's safe to assume that y_dim was broadcasted.
+      y_axes.append(axis)
+    elif x_dim is None or y_dim is None:
+      # Broadcasting decision is dynamic (data-dependent).
+      return None, None
+
+  return x_axes, y_axes
 
 
 def _ReduceGradientArg(grad, shape_axes_must_reduce):
@@ -140,10 +136,10 @@ def _ReduceGradientArg(grad, shape_axes_must_reduce):
   return grad
 
 
-def _ReduceGradientArgs(x, y, grad, gx, gy):
+def _ReduceGradientArgs(x, y, gx, gy):
   """Reduces gradients of both arguments of a broadcasting binary op."""
   if gx is not None or gy is not None:
-    bx, by = SmartBroadcastGradientArgs(x, y, grad)
+    bx, by = SmartBroadcastGradientArgs(x, y)
     gx = _ReduceGradientArg(gx, bx)
     gy = _ReduceGradientArg(gy, by)
   return gx, gy
@@ -1338,7 +1334,7 @@ def _Atan2Grad(op: ops.Operation, grad):
     gy = x * grad_inv
     gx = -y * grad_inv
     # pylint: disable=arguments-out-of-order
-    return _ReduceGradientArgs(y, x, grad, gy, gx)
+    return _ReduceGradientArgs(y, x, gy, gx)
     # pylint: enable=arguments-out-of-order
 
 
@@ -1380,7 +1376,7 @@ def _AddGrad(op: ops.Operation, grad):
 
   gx = None if 0 in skip_input_indices else grad
   gy = None if 1 in skip_input_indices else grad
-  return _ReduceGradientArgs(x, y, grad, gx, gy)
+  return _ReduceGradientArgs(x, y, gx, gy)
 
 
 @ops.RegisterGradient("Sub")
@@ -1403,7 +1399,7 @@ def _SubGrad(op: ops.Operation, grad):
 
   gx = None if 0 in skip_input_indices else grad
   gy = None if 1 in skip_input_indices else -grad
-  return _ReduceGradientArgs(x, y, grad, gx, gy)
+  return _ReduceGradientArgs(x, y, gx, gy)
 
 
 @ops.RegisterGradient("Mul")
@@ -1437,7 +1433,7 @@ def _MulGrad(op: ops.Operation, grad):
   else:
     gy = gen_math_ops.mul(math_ops.conj(x), grad)
 
-  return _ReduceGradientArgs(x, y, grad, gx, gy)
+  return _ReduceGradientArgs(x, y, gx, gy)
 
 
 @ops.RegisterGradient("MulNoNan")
@@ -1453,7 +1449,7 @@ def _MulNoNanGrad(op: ops.Operation, grad):
   assert x.dtype.base_dtype == y.dtype.base_dtype, (x.dtype, " vs. ", y.dtype)
   gx = gen_math_ops.mul_no_nan(grad, y)
   gy = gen_math_ops.mul_no_nan(x, grad)
-  return _ReduceGradientArgs(x, y, grad, gx, gy)
+  return _ReduceGradientArgs(x, y, gx, gy)
 
 
 @ops.RegisterGradient("Div")
@@ -1465,7 +1461,7 @@ def _DivGrad(op: ops.Operation, grad):
   cy = math_ops.conj(y)
   gx = math_ops.divide(grad, cy)
   gy = grad * math_ops.divide(math_ops.divide(-cx, cy), cy)
-  return _ReduceGradientArgs(x, y, grad, gx, gy)
+  return _ReduceGradientArgs(x, y, gx, gy)
 
 
 @ops.RegisterGradient("FloorDiv")
@@ -1482,7 +1478,7 @@ def _FloorModGrad(op: ops.Operation, grad):
   floor_xy = math_ops.floor_div(x, y)
   gx = grad
   gy = grad * math_ops.negative(floor_xy)
-  return _ReduceGradientArgs(x, y, grad, gx, gy)
+  return _ReduceGradientArgs(x, y, gx, gy)
 
 
 @ops.RegisterGradient("TruncateDiv")
@@ -1499,7 +1495,7 @@ def _RealDivGrad(op: ops.Operation, grad):
   cy = math_ops.conj(op.inputs[1])
   gx = math_ops.realdiv(grad, cy)
   gy = grad * math_ops.realdiv(math_ops.realdiv(-cx, cy), cy)
-  return _ReduceGradientArgs(x, y, grad, gx, gy)
+  return _ReduceGradientArgs(x, y, gx, gy)
 
 
 @ops.RegisterGradient("DivNoNan")
@@ -1509,7 +1505,7 @@ def _DivNoNanGrad(op: ops.Operation, grad):
   y = math_ops.conj(op.inputs[1])
   gx = math_ops.div_no_nan(grad, y)
   gy = grad * math_ops.div_no_nan(math_ops.div_no_nan(-x, y), y)
-  return _ReduceGradientArgs(x, y, grad, gx, gy)
+  return _ReduceGradientArgs(x, y, gx, gy)
 
 
 @ops.RegisterGradient("Pow")
@@ -1546,7 +1542,7 @@ def _PowGrad(op: ops.Operation, grad):
     log_x = array_ops.where(mask, math_ops.log(safe_x), array_ops.zeros_like(x))
     gy = grad * math_ops.conj(op.outputs[0]) * log_x
 
-  return _ReduceGradientArgs(x, y, grad, gx, gy)
+  return _ReduceGradientArgs(x, y, gx, gy)
 
 
 def _MaximumMinimumGradInputOnly(op: ops.Operation, grad, selector_op):
@@ -1582,7 +1578,7 @@ def _MaximumMinimumGrad(op: ops.Operation, grad, selector_op):
     gy = None
   else:
     gy = array_ops.where_v2(xmask, zeros, grad)
-  return _ReduceGradientArgs(x, y, grad, gx, gy)
+  return _ReduceGradientArgs(x, y, gx, gy)
 
 
 @ops.RegisterGradient("Maximum")
@@ -1620,7 +1616,7 @@ def _SquaredDifferenceGrad(op: ops.Operation, grad):
 
   gx = None if 0 in skip_input_indices else x_grad
   gy = None if 1 in skip_input_indices else -x_grad
-  return _ReduceGradientArgs(x, y, grad, gx, gy)
+  return _ReduceGradientArgs(x, y, gx, gy)
 
 
 # Logical operations have no gradients.
@@ -1657,8 +1653,8 @@ def _SelectGradV2(op: ops.Operation, grad):
   zeros = array_ops.zeros([], dtype=grad.dtype.base_dtype)
   gx = array_ops.where_v2(c, grad, zeros)
   gy = array_ops.where_v2(c, zeros, grad)
-  gx, _ = _ReduceGradientArgs(x, z, grad, gx, None)
-  gy, _ = _ReduceGradientArgs(y, z, grad, gy, None)
+  gx, _ = _ReduceGradientArgs(x, z, gx, None)
+  gy, _ = _ReduceGradientArgs(y, z, gy, None)
   return None, gx, gy
 
 
@@ -1883,7 +1879,7 @@ def _ComplexGrad(op: ops.Operation, grad):
   y = op.inputs[1]
   gx = math_ops.real(grad)
   gy = math_ops.imag(grad)
-  return _ReduceGradientArgs(x, y, grad, gx, gy)
+  return _ReduceGradientArgs(x, y, gx, gy)
 
 
 @ops.RegisterGradient("Real")
diff --git a/tensorflow/python/ops/math_grad_test.py b/tensorflow/python/ops/math_grad_test.py
index 25cf6995b108a8..da089fa5e9e932 100644
--- a/tensorflow/python/ops/math_grad_test.py
+++ b/tensorflow/python/ops/math_grad_test.py
@@ -14,6 +14,7 @@
 # ==============================================================================
 """Tests for Python ops defined in math_grad.py."""
 
+from absl.testing import parameterized
 import numpy as np
 
 from tensorflow.python.eager import backprop
@@ -21,15 +22,43 @@
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gradient_checker
 from tensorflow.python.ops import gradient_checker_v2
 from tensorflow.python.ops import gradients
+from tensorflow.python.ops import math_grad
 from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import test
 
 
+class InferGradientReductionAxes(test.TestCase, parameterized.TestCase):
+
+  @parameterized.parameters(
+      (None, None, None, None),
+      (None, [], None, None),
+      ([], [], [], []),
+      ([], [None], [0], []),
+      ([None], [None], None, None),
+      ([None, 1], [None], [1], [0]),
+      ([None, 1], [1, None], [1], [0]),
+      ([None, 1], [2, None], None, None),
+      ([2], [1], [], [0]),
+      ([2], [2], [], []),
+      ([3, 1, 5, 1, 7], [2, 1, 4, 7], [1, 3], [0, 2]),
+  )
+  def testShapes(self, x_shape, y_shape, expected_x_axes, expected_y_axes):
+    x_axes1, y_axes1 = math_grad._InferGradientReductionAxes(
+        tensor_shape.TensorShape(x_shape), tensor_shape.TensorShape(y_shape))
+    y_axes2, x_axes2 = math_grad._InferGradientReductionAxes(
+        tensor_shape.TensorShape(y_shape), tensor_shape.TensorShape(x_shape))
+    self.assertEqual(x_axes1, x_axes2)
+    self.assertEqual(y_axes1, y_axes2)
+    self.assertEqual(expected_x_axes, x_axes1)
+    self.assertEqual(expected_y_axes, y_axes1)
+
+
 class SquaredDifferenceOpTest(test.TestCase):
 
   def _testGrad(self, left_shape, right_shape):

From 1b31f7a14d0cf6379e4a4bd6ae6e50b2c5020cf4 Mon Sep 17 00:00:00 2001
From: Fiona Lang <flang@google.com>
Date: Wed, 27 Sep 2023 10:19:30 -0700
Subject: [PATCH 323/567] Move distribute docstring to distribute/__init__.py.

PiperOrigin-RevId: 568888737
---
 tensorflow/python/distribute/__init__.py      | 176 +++++++++++++++++-
 .../python/distribute/distribute_lib.py       |   2 -
 .../python/tools/api/generator/doc_srcs.py    |   2 +-
 3 files changed, 176 insertions(+), 4 deletions(-)

diff --git a/tensorflow/python/distribute/__init__.py b/tensorflow/python/distribute/__init__.py
index 9b23758841983a..86dc16cfd36492 100644
--- a/tensorflow/python/distribute/__init__.py
+++ b/tensorflow/python/distribute/__init__.py
@@ -12,6 +12,180 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""tf.distribute is a TensorFlow API to distribute training across multiple GPUs, multiple machines or TPUs.
+"""Library for running a computation across multiple devices.
 
+The intent of this library is that you can write an algorithm in a stylized way
+and it will be usable with a variety of different `tf.distribute.Strategy`
+implementations. Each descendant will implement a different strategy for
+distributing the algorithm across multiple devices/machines.  Furthermore, these
+changes can be hidden inside the specific layers and other library classes that
+need special treatment to run in a distributed setting, so that most users'
+model definition code can run unchanged. The `tf.distribute.Strategy` API works
+the same way with eager and graph execution.
+
+*Guides*
+
+* [TensorFlow v2.x](https://www.tensorflow.org/guide/distributed_training)
+* [TensorFlow
+v1.x](https://github.com/tensorflow/docs/blob/master/site/en/r1/guide/distribute_strategy.ipynb)
+
+*Tutorials*
+
+* [Distributed Training
+Tutorials](https://www.tensorflow.org/tutorials/distribute/)
+
+  The tutorials cover how to use `tf.distribute.Strategy` to do distributed
+  training with native Keras APIs, custom training loops,
+  and Estimator APIs. They also cover how to save/load a model when using
+  `tf.distribute.Strategy`.
+
+*Glossary*
+
+* _Data parallelism_ is where we run multiple copies of the model
+  on different slices of the input data. This is in contrast to
+  _model parallelism_ where we divide up a single copy of a model
+  across multiple devices.
+  Note: we only support data parallelism for now, but
+  hope to add support for model parallelism in the future.
+* A _device_ is a CPU or accelerator (e.g. GPUs, TPUs) on some machine that
+  TensorFlow can run operations on (see e.g. `tf.device`). You may have multiple
+  devices on a single machine, or be connected to devices on multiple
+  machines. Devices used to run computations are called _worker devices_.
+  Devices used to store variables are _parameter devices_. For some strategies,
+  such as `tf.distribute.MirroredStrategy`, the worker and parameter devices
+  will be the same (see mirrored variables below). For others they will be
+  different. For example, `tf.distribute.experimental.CentralStorageStrategy`
+  puts the variables on a single device (which may be a worker device or may be
+  the CPU), and `tf.distribute.experimental.ParameterServerStrategy` puts the
+  variables on separate machines called _parameter servers_ (see below).
+* A _replica_ is one copy of the model, running on one slice of the
+  input data. Right now each replica is executed on its own
+  worker device, but once we add support for model parallelism
+  a replica may span multiple worker devices.
+* A _host_ is the CPU device on a machine with worker devices, typically
+  used for running input pipelines.
+* A _worker_ is defined to be the physical machine(s) containing the physical
+  devices (e.g. GPUs, TPUs) on which the replicated computation is executed. A
+  worker may contain one or more replicas, but contains at least one
+  replica. Typically one worker will correspond to one machine, but in the case
+  of very large models with model parallelism, one worker may span multiple
+  machines. We typically run one input pipeline per worker, feeding all the
+  replicas on that worker.
+* _Synchronous_, or more commonly _sync_, training is where the updates from
+  each replica are aggregated together before updating the model variables. This
+  is in contrast to _asynchronous_, or _async_ training, where each replica
+  updates the model variables independently. You may also have replicas
+  partitioned into groups which are in sync within each group but async between
+  groups.
+* _Parameter servers_: These are machines that hold a single copy of
+  parameters/variables, used by some strategies (right now just
+  `tf.distribute.experimental.ParameterServerStrategy`). All replicas that want
+  to operate on a variable retrieve it at the beginning of a step and send an
+  update to be applied at the end of the step. These can in principle support
+  either sync or async training, but right now we only have support for async
+  training with parameter servers. Compare to
+  `tf.distribute.experimental.CentralStorageStrategy`, which puts all variables
+  on a single device on the same machine (and does sync training), and
+  `tf.distribute.MirroredStrategy`, which mirrors variables to multiple devices
+  (see below).
+
+* _Replica context_ vs. _Cross-replica context_ vs. _Update context_
+
+  A _replica context_ applies
+  when you execute the computation function that was called with `strategy.run`.
+  Conceptually, you're in replica context when executing the computation
+  function that is being replicated.
+
+  An _update context_ is entered in a `tf.distribute.StrategyExtended.update`
+  call.
+
+  A _cross-replica context_ is entered when you enter a `strategy.scope`. This
+  is useful for calling `tf.distribute.Strategy` methods which operate across
+  the replicas (like `reduce_to()`). By default you start in a _replica context_
+  (the "default single _replica context_") and then some methods can switch you
+  back and forth.
+
+* _Distributed value_: Distributed value is represented by the base class
+  `tf.distribute.DistributedValues`. `tf.distribute.DistributedValues` is useful
+  to represent values on multiple devices, and it contains a map from replica id
+  to values. Two representative types of `tf.distribute.DistributedValues`
+  are `tf.types.experimental.PerReplica` and `tf.types.experimental.Mirrored`
+  values.
+
+  `PerReplica` values exist on the worker devices, with a different value for
+  each replica. They are produced by iterating through a distributed dataset
+  returned by `tf.distribute.Strategy.experimental_distribute_dataset` and
+  `tf.distribute.Strategy.distribute_datasets_from_function`. They are also the
+  typical result returned by `tf.distribute.Strategy.run`.
+
+  `Mirrored` values are like `PerReplica` values, except we know that the value
+  on all replicas is the same. `Mirrored` values are kept synchronized by the
+  distribution strategy in use, while `PerReplica` values are left
+  unsynchronized. `Mirrored` values typically represent model weights. We can
+  safely read a `Mirrored` value in a cross-replica context by using the value
+  on any replica, while `PerReplica` values can only be read within a replica
+  context.
+
+* _Unwrapping_ and _merging_: Consider calling a function `fn` on multiple
+  replicas, like `strategy.run(fn, args=[w])` with an
+  argument `w` that is a `tf.distribute.DistributedValues`. This means `w` will
+  have a map taking replica id `0` to `w0`, replica id `1` to `w1`, etc.
+  `strategy.run()` unwraps `w` before calling `fn`, so it calls `fn(w0)` on
+  device `d0`, `fn(w1)` on device `d1`, etc.  It then merges the return
+  values from `fn()`, which leads to one common object if the returned values
+  are the same object from every replica, or a `DistributedValues` object
+  otherwise.
+
+* _Reductions_ and _all-reduce_: A _reduction_ is a method of aggregating
+  multiple values into one value, like "sum" or "mean". If a strategy is doing
+  sync training, we will perform a reduction on the gradients to a parameter
+  from all replicas before applying the update. _All-reduce_ is an algorithm for
+  performing a reduction on values from multiple devices and making the result
+  available on all of those devices.
+
+* _Mirrored variables_: These are variables that are created on multiple
+  devices, where we keep the variables in sync by applying the same
+  updates to every copy. Mirrored variables are created with
+  `tf.Variable(...synchronization=tf.VariableSynchronization.ON_WRITE...)`.
+  Normally they are only used in synchronous training.
+
+* _SyncOnRead variables_
+
+  _SyncOnRead variables_ are created by
+  `tf.Variable(...synchronization=tf.VariableSynchronization.ON_READ...)`, and
+  they are created on multiple devices. In replica context, each
+  component variable on the local replica can perform reads and writes without
+  synchronization with each other. When the
+  _SyncOnRead variable_ is read in cross-replica context, the values from
+  component variables are aggregated and returned.
+
+  _SyncOnRead variables_ bring a lot of custom configuration difficulty to the
+  underlying logic, so we do not encourage users to instantiate and use
+  _SyncOnRead variable_ on their own. We have mainly used _SyncOnRead
+  variables_ for use cases such as batch norm and metrics. For performance
+  reasons, we often don't need to keep these statistics in sync every step and
+  they can be accumulated on each replica independently. The only time we want
+  to sync them is reporting or checkpointing, which typically happens in
+  cross-replica context. _SyncOnRead variables_ are also often used by advanced
+  users who want to control when variable values are aggregated. For example,
+  users sometimes want to maintain gradients independently on each replica for a
+  couple of steps without aggregation.
+
+* _Distribute-aware layers_
+
+  Layers are generally called in a replica context, except when defining a
+  Keras functional model. `tf.distribute.in_cross_replica_context` will let you
+  determine which case you are in. The `tf.distribute.get_replica_context`
+  function will return the default replica context outside a strategy scope,
+  `None` within a strategy scope (i.e. in a cross-replica context), and a
+  `tf.distribute.ReplicaContext` object inside a strategy scope and within a
+  `tf.distribute.Strategy.run` function. The `ReplicaContext` object has an
+  `all_reduce` method for aggregating across all replicas.
+
+
+Note that we provide a default version of `tf.distribute.Strategy` that is
+used when no other strategy is in scope, that provides the same API with
+reasonable default behavior.
+
+API docstring: tensorflow.distribute
 """
diff --git a/tensorflow/python/distribute/distribute_lib.py b/tensorflow/python/distribute/distribute_lib.py
index 2a9e68c7580cc9..55e7fb57e1f3e8 100644
--- a/tensorflow/python/distribute/distribute_lib.py
+++ b/tensorflow/python/distribute/distribute_lib.py
@@ -187,8 +187,6 @@
 Note that we provide a default version of `tf.distribute.Strategy` that is
 used when no other strategy is in scope, that provides the same API with
 reasonable default behavior.
-
-API docstring: tensorflow.distribute
 """
 # pylint: enable=line-too-long
 
diff --git a/tensorflow/python/tools/api/generator/doc_srcs.py b/tensorflow/python/tools/api/generator/doc_srcs.py
index 43c468e98d6fe1..5aaed2beb2323e 100644
--- a/tensorflow/python/tools/api/generator/doc_srcs.py
+++ b/tensorflow/python/tools/api/generator/doc_srcs.py
@@ -43,7 +43,7 @@ def __init__(self, docstring=None, docstring_module_name=None):
     'compat':
         DocSource(docstring_module_name='util.compat'),
     'distribute':
-        DocSource(docstring_module_name='distribute.distribute_lib'),
+        DocSource(docstring_module_name='distribute'),
     'distributions': DocSource(
         docstring='Core module for TensorFlow distribution objects and helpers.'
     ),

From 5db0660075a0ebb499e74b9dafa4868ea3cddd8c Mon Sep 17 00:00:00 2001
From: Mason Chang <masonchang@google.com>
Date: Wed, 27 Sep 2023 10:19:33 -0700
Subject: [PATCH 324/567] Add MLIR tests for tf-verify-for-export pass

PiperOrigin-RevId: 568888762
---
 .../tensorflow/tests/verify_for_export.mlir   | 24 +++++++++++++++++++
 1 file changed, 24 insertions(+)
 create mode 100644 tensorflow/compiler/mlir/tensorflow/tests/verify_for_export.mlir

diff --git a/tensorflow/compiler/mlir/tensorflow/tests/verify_for_export.mlir b/tensorflow/compiler/mlir/tensorflow/tests/verify_for_export.mlir
new file mode 100644
index 00000000000000..3c59b27fda6f60
--- /dev/null
+++ b/tensorflow/compiler/mlir/tensorflow/tests/verify_for_export.mlir
@@ -0,0 +1,24 @@
+// RUN: tf-opt %s -split-input-file -tf-verify-for-export -verify-diagnostics | FileCheck %s
+
+module {
+  func.func @failsNoIslands() {
+    // expected-error @+1 {{functions must be of a single Graph with single op Islands: first op in function is not a tf_executor.graph}}
+    func.return
+  }
+}
+
+// -----
+
+module {
+  // CHECK-LABEL: func @passesSingleIslandOp
+  func.func @passesSingleIslandOp() {
+    // CHECK: _class = ["loc:@class"]
+    tf_executor.graph {
+      %c, %control0 = tf_executor.island wraps "tf.Const"() {value = dense<0> : tensor<i32>} : () -> tensor<i32>
+      %a, %control1 = tf_executor.island wraps "tf.A"() {_class = ["loc:@class"]} : () -> (tensor<2xf32>)
+      %s:2, %control2 = tf_executor.island wraps "tf.Split"(%c, %a) {num_split = 2 : i32} : (tensor<i32>, tensor<2xf32>) -> (tensor<1xf32>, tensor<1xf32>)
+      tf_executor.fetch
+    }
+    func.return
+  }
+}
\ No newline at end of file

From e4053ca38f72f11504f95c13f69ff9ed4cb296e5 Mon Sep 17 00:00:00 2001
From: Fiona Lang <flang@google.com>
Date: Wed, 27 Sep 2023 10:19:42 -0700
Subject: [PATCH 325/567] No public description

PiperOrigin-RevId: 568888806
---
 tensorflow/python/framework/BUILD | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tensorflow/python/framework/BUILD b/tensorflow/python/framework/BUILD
index a5ca3b6ea44acc..194a9707672622 100644
--- a/tensorflow/python/framework/BUILD
+++ b/tensorflow/python/framework/BUILD
@@ -409,6 +409,7 @@ py_strict_library(
     srcs = ["constant_op.py"],
     srcs_version = "PY3",
     visibility = visibility + [
+        "//smartass:__subpackages__",
         "//tensorflow:internal",
         "//third_party/py/tf_slim:__subpackages__",
     ],

From b32fa41f93f2f45ab0482d262ce45934d9102517 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 27 Sep 2023 10:43:42 -0700
Subject: [PATCH 326/567] Increases saltiplier to eliminate the
 once-in-a-blue-moon issue with unit test flakiness.

PiperOrigin-RevId: 568896545
---
 .../xla/hlo/experimental/auto_sharding/auto_sharding_solver.h   | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/third_party/xla/xla/hlo/experimental/auto_sharding/auto_sharding_solver.h b/third_party/xla/xla/hlo/experimental/auto_sharding/auto_sharding_solver.h
index fa295826fd1d28..5c65a087874f91 100644
--- a/third_party/xla/xla/hlo/experimental/auto_sharding/auto_sharding_solver.h
+++ b/third_party/xla/xla/hlo/experimental/auto_sharding/auto_sharding_solver.h
@@ -57,7 +57,7 @@ struct AutoShardingSolverRequest {
   std::optional<double> max_departures;
   bool crash_at_infinity_costs_check = false;
   bool compute_iis = true;
-  double saltiplier = 0.0001;  // Modifies each objective term by at most 0.01%
+  double saltiplier = 0.001;  // Modifies each objective term by at most 0.1%
 };
 
 struct AutoShardingSolverResult {

From 8e9347ba8a8a9d37c26d386178faae18f8dc6b51 Mon Sep 17 00:00:00 2001
From: Bixia Zheng <bixia@google.com>
Date: Wed, 27 Sep 2023 10:53:01 -0700
Subject: [PATCH 327/567] [xla] Modify P2PSchedulePreparation to work with the
 CollectivePermuteDecomposer and handle pipelined Send-Recv chains.

The P2PSchedulePreparation pass now amends control dependence to express the
intended ordering of an unpipelined Send-Recv chain, as the
CollectivePermuteDecomposer no longer generates such control dependence.

Add control dependence to ensure that in a while-body with a pipelined Send-Recv
chain, Send is ordered before Recv.

Add control dependence to ensure that if an instruction has nested Send-Recv and
also uses the result of a while-loop with pipelined Send-Recv, the instruction
is ordered after the pipelined send-done.

Add tests.

PiperOrigin-RevId: 568899565
---
 third_party/xla/xla/service/BUILD             |  15 +-
 .../xla/service/p2p_schedule_preparation.cc   | 490 +++++++++++++++---
 .../xla/service/p2p_schedule_preparation.h    |  89 +++-
 .../service/p2p_schedule_preparation_test.cc  | 486 +++++++++++++++--
 4 files changed, 969 insertions(+), 111 deletions(-)

diff --git a/third_party/xla/xla/service/BUILD b/third_party/xla/xla/service/BUILD
index 2228ee9c084e42..d31af5b43fc45a 100644
--- a/third_party/xla/xla/service/BUILD
+++ b/third_party/xla/xla/service/BUILD
@@ -1237,10 +1237,16 @@ cc_library(
     visibility = ["//visibility:public"],
     deps = [
         ":hlo_pass",
+        "//xla:status",
         "//xla:statusor",
+        "//xla:util",
         "//xla/hlo/ir:hlo",
         "//xla/hlo/ir:hlo_reachability",
+        "@com_google_absl//absl/container:flat_hash_map",
+        "@com_google_absl//absl/container:flat_hash_set",
         "@com_google_absl//absl/log",
+        "@com_google_absl//absl/strings",
+        "@local_tsl//tsl/platform:errors",
     ],
 )
 
@@ -1250,13 +1256,14 @@ xla_cc_test(
     deps = [
         ":hlo_parser",
         ":p2p_schedule_preparation",
-        "//xla:test",
+        "//xla:util",
         "//xla/hlo/ir:hlo",
-        "//xla/hlo/utils:hlo_matchers",
         "//xla/tests:hlo_test_base",
-        "//xla/tests:test_utils",
+        "@com_google_absl//absl/algorithm:container",
+        "@com_google_absl//absl/log",
+        "@com_google_absl//absl/strings:str_format",
         "@com_google_googletest//:gtest",
-        "@local_tsl//tsl/lib/core:status_test_util",
+        "@local_tsl//tsl/platform:statusor",
         "@local_tsl//tsl/platform:test_main",
     ],
 )
diff --git a/third_party/xla/xla/service/p2p_schedule_preparation.cc b/third_party/xla/xla/service/p2p_schedule_preparation.cc
index b6ab4362cc5824..cf86a21d88ef7f 100644
--- a/third_party/xla/xla/service/p2p_schedule_preparation.cc
+++ b/third_party/xla/xla/service/p2p_schedule_preparation.cc
@@ -15,73 +15,253 @@ limitations under the License.
 
 #include "xla/service/p2p_schedule_preparation.h"
 
+#include <cstdint>
 #include <memory>
+#include <set>
 #include <vector>
 
+#include "absl/container/flat_hash_map.h"
+#include "absl/container/flat_hash_set.h"
 #include "absl/log/log.h"
+#include "absl/strings/string_view.h"
 #include "xla/hlo/ir/hlo_casting_utils.h"
 #include "xla/hlo/ir/hlo_computation.h"
 #include "xla/hlo/ir/hlo_instruction.h"
 #include "xla/hlo/ir/hlo_instructions.h"
 #include "xla/hlo/ir/hlo_opcode.h"
 #include "xla/hlo/ir/hlo_reachability.h"
+#include "xla/status.h"
 #include "xla/statusor.h"
+#include "xla/util.h"
+#include "tsl/platform/errors.h"
 
 namespace xla {
 
 namespace {
 
 // Returns a boolean to indicate whether the operation is a non-host P2P
-// operation. We exclude non-host P2P operations for two reasons: (1) this pass
-// currently can only amend control dependence for non-host P2P operations. (2)
-// we need to exclude host P2P operations when looking for a nested chain
+// operation. We exclude host P2P operations for two reasons: (1) this
+// pass currently only amends control dependence for non-host P2P operations.
+// (2) we need to exclude host P2P operations when looking for a nested chain
 // of non-host P2P operations.
 bool IsP2POp(const HloInstruction* op) {
   auto p2p = DynCastOrNull<HloSendRecvInstruction>(op);
   return p2p != nullptr && !p2p->is_host_transfer();
 }
 
-// Returns the predecessor of op with instruction type T1 or nullptr if such a
-// control predecessor doesn't exist. The routine gives an error if there are
-// more than one such control predecessor.
-template <typename T1>
-const T1* GetChainedOp(const HloInstruction* op) {
-  const T1* chained_op = nullptr;
-  for (const HloInstruction* predecessor : op->control_predecessors()) {
-    auto tmp = DynCastOrNull<T1>(predecessor);
-    if (!tmp || !IsP2POp(tmp)) {
-      continue;
+bool IsP2PDoneOp(const HloInstruction* op) {
+  return IsP2POp(op) && (op->opcode() == HloOpcode::kRecvDone ||
+                         op->opcode() == HloOpcode::kSendDone);
+}
+
+enum P2PGroupKind { kUnpipelined = 0, kPipelined = 1, kUnrecognized = 2 };
+
+// A P2P group node represents the P2P instructions that are in the same
+// computation and have the same channel ID. This includes one Send/SendDone
+// and one Recv/RecvDone. If the P2P instructions for the given channel ID are
+// pipelined, the group node for the computation containing the while-loop also
+// records the while-loop instruction.
+//
+struct P2PGroupNode {
+  bool RecordParentComputation(HloComputation* parent) {
+    if (parent_computation == nullptr) {
+      parent_computation = parent;
+      return true;
     }
-    CHECK_EQ(chained_op, nullptr);
-    chained_op = tmp;
+    return parent_computation == parent;
   }
-  return chained_op;
-}
 
-// Given a send_done, returns the recv_done if it is in a chain of
-//   Recv => Send => RecvDone => SendDone
-// Returns nullptr if such a chain doesn't exist.
-const HloRecvDoneInstruction* GetChainedRecvDone(
-    const HloSendDoneInstruction* send_done) {
-  const HloRecvDoneInstruction* recv_done =
-      GetChainedOp<HloRecvDoneInstruction>(send_done);
-  if (!recv_done) {
-    return nullptr;
+  bool RecordDoneOp(HloSendRecvInstruction* p2p) {
+    if (!RecordParentComputation(p2p->parent())) {
+      return false;
+    }
+
+    if (p2p->opcode() == HloOpcode::kRecvDone) {
+      if (recv_done == nullptr) {
+        recv_done = Cast<HloRecvDoneInstruction>(p2p);
+        return true;
+      }
+    } else if (p2p->opcode() == HloOpcode::kSendDone) {
+      if (send_done == nullptr) {
+        send_done = Cast<HloSendDoneInstruction>(p2p);
+        return true;
+      }
+    }
+    return false;
+  }
+
+  bool RecordWhileOp(HloInstruction* while_op) {
+    if (while_loop != nullptr) {
+      return false;
+    }
+    if (!RecordParentComputation(while_op->parent())) {
+      return false;
+    }
+    while_loop = while_op;
+    return true;
+  }
+
+  bool Incomplete() const {
+    return recv_done == nullptr || send_done == nullptr;
+  }
+
+  bool IncompletePipelinedParent() const {
+    return Incomplete() || while_loop == nullptr;
+  }
+
+  HloRecvDoneInstruction* recv_done = nullptr;
+  HloSendDoneInstruction* send_done = nullptr;
+  // The computation that contains the Send and Recv instructions.
+  HloComputation* parent_computation = nullptr;
+  // The while-loop instruction that calls the while-body with the pipelined
+  // P2P Send and Recv instructions.
+  HloInstruction* while_loop = nullptr;
+};
+
+static constexpr int kUnpipelinedNodeIdx = 0;
+static constexpr int kPipelinedChildNodeIdx = 0;
+static constexpr int kPipelinedParentNodeIdx = 1;
+
+// Represent a P2P instruction group for a given channel.
+//
+// A kUnpipelined P2P group contains only one P2PGroupNode while a kPipelined
+// P2P group contains a P2PGroupNode for the while-body and a P2PGroupNode
+// for the computation with the while-loop instruction calling the while-body.
+struct P2PGroup {
+  Status RecordDoneOpForUnpipelinedGroup(HloSendRecvInstruction* p2p) {
+    if (kind == kUnrecognized) {
+      // Leave unrecognized P2P groups alone.
+      return OkStatus();
+    }
+    if (kind != kUnpipelined) {
+      return InternalError("Expected unpipelined group");
+    }
+    P2PGroupNode& node = nodes[kUnpipelinedNodeIdx];
+    if (!node.RecordDoneOp(p2p)) {
+      kind = kUnrecognized;
+    }
+    return OkStatus();
+  }
+
+  Status RecordDoneOpForPipelinedGroup(HloSendRecvInstruction* p2p) {
+    if (kind == kUnrecognized) {
+      // Leave unrecognized P2P groups alone.
+      return OkStatus();
+    }
+    if (kind == kUnpipelined) {
+      if (nodes[kPipelinedParentNodeIdx].parent_computation != nullptr) {
+        return InternalError("Expected unpipelined group");
+      }
+      kind = kPipelined;
+    }
+    P2PGroupNode& node = nodes[kPipelinedParentNodeIdx];
+    if (!node.RecordDoneOp(p2p)) {
+      kind = kUnrecognized;
+    }
+    return OkStatus();
+  }
+
+  Status RecordWhileOpToPipelinedGroup(HloInstruction* while_op) {
+    if (kind == kUnrecognized) {
+      // Leave unrecognized P2P groups alone.
+      return OkStatus();
+    }
+    if (kind == kUnpipelined) {
+      return InternalError("Expected pipelined group");
+    }
+    P2PGroupNode& node = nodes[kPipelinedParentNodeIdx];
+    if (!node.RecordWhileOp(while_op)) {
+      kind = kUnrecognized;
+    }
+    return OkStatus();
   }
 
-  auto send = DynCast<HloSendInstruction>(send_done->operand(0));
-  CHECK_NE(send, nullptr);
-  CHECK_EQ(send->is_host_transfer(), false);
+  P2PGroupKind kind = kUnpipelined;
+  P2PGroupNode nodes[2];
+};
 
-  const HloRecvInstruction* recv = GetChainedOp<HloRecvInstruction>(send);
-  if (!recv) {
-    return nullptr;
+//  Maps the channel ID to the corresponding P2P operation group.
+using P2PGroupMap = absl::flat_hash_map<int64_t, P2PGroup>;
+
+// Maps the computation to the channel IDs used by the computation for P2P
+// operations. We use std::set instead of hash set for deterministic iterators.
+using P2PInComputation =
+    absl::flat_hash_map<const HloComputation*, std::set<int64_t>>;
+
+// If the while-body contains a P2P chain that uses the same channel as another
+// P2P chain in the caller computation, assume these two P2P chains belong to
+// the same pipelined P2P sequence. Adds the WhileOp to the pipelined group
+// representation in this case.
+Status MayAddWhileOpToPipelinedGroup(HloInstruction* while_op,
+                                     P2PInComputation& p2p_in_computation,
+                                     P2PGroupMap& p2p_group_map) {
+  HloComputation* body = while_op->called_computations()[0];
+  auto p2p = p2p_in_computation.find(body);
+  if (p2p == p2p_in_computation.end()) {
+    return OkStatus();
   }
-  if (recv_done->operand(0) != recv) {
-    return nullptr;
+  int pipelined_group = 0;
+  for (auto channel : p2p->second) {
+    auto p2p_group = p2p_group_map.find(channel);
+    if (p2p_group == p2p_group_map.end() ||
+        p2p_group->second.kind != kPipelined) {
+      continue;
+    }
+    pipelined_group++;
+    if (pipelined_group > 1) {
+      return InternalError(
+          "Expecting only one pipelined P2P group for each while-loop");
+    }
+    TF_RETURN_IF_ERROR(
+        p2p_group->second.RecordWhileOpToPipelinedGroup(while_op));
   }
+  return OkStatus();
+}
+
+// For an unpipelined Send-Recv chain, add control dependence to enforce this
+// ordering:
+//   recv => send => recv-done => send-done.
+Status ChainUnpipelinedP2P(P2PGroupNode& node) {
+  HloSendRecvInstruction* recv_done = node.recv_done;
+  HloRecvInstruction* recv =
+      DynCast<HloRecvInstruction>(recv_done->mutable_operand(0));
+  HloSendRecvInstruction* send_done = node.send_done;
+  HloSendInstruction* send =
+      DynCast<HloSendInstruction>(send_done->mutable_operand(0));
+  // We want the Recv to be scheduled before the Send.
+  TF_RETURN_IF_ERROR(recv->AddControlDependencyTo(send));
+  VLOG(10) << "Add control predecessor " << send->ToString();
+  // We want the Send to be scheduled before RecvDone to prevent the scheduler
+  // from interleaving two Send-Recv sequences.
+  TF_RETURN_IF_ERROR(send->AddControlDependencyTo(recv_done));
+  VLOG(10) << "Add control predecessor " << recv_done->ToString();
+
+  // We want the RecvDone to be scheduled before the SendDone.
+  TF_RETURN_IF_ERROR(recv_done->AddControlDependencyTo(send_done));
+  VLOG(10) << "Add control predecessor " << send_done->ToString();
+  return OkStatus();
+}
 
-  return recv_done;
+// For the pipelined Send-Recv chain in a while-body, we need to make sure
+// that the Send is scheduled before Recv as the Send releases the unrolled
+// Recv before entering the transformed loop. We let the scheduler decide
+// where to schedule send-done and recv-done. For example, if the while-body
+// has other unpipelined Send-Recv chains, it may produce this ordering:
+//   send => send-done => other Send-Recv chains => recv => recv-done
+// If the while-body doesn't have other Send-Recv chains, it may produce this
+// ordering:
+//   send => recv => recv-done => send-done
+Status ChainPipelinedP2PChild(P2PGroupNode& node) {
+  HloSendRecvInstruction* recv_done = node.recv_done;
+  HloRecvInstruction* recv =
+      DynCast<HloRecvInstruction>(recv_done->mutable_operand(0));
+  HloSendRecvInstruction* send_done = node.send_done;
+  HloSendInstruction* send =
+      DynCast<HloSendInstruction>(send_done->mutable_operand(0));
+  // We want the Send to be scheduled before the Recv.
+  TF_RETURN_IF_ERROR(send->AddControlDependencyTo(recv));
+  VLOG(10) << "Add control predecessor " << recv->ToString();
+  return OkStatus();
 }
 
 // Inspects the instructions in the computation to find out whether the
@@ -113,10 +293,10 @@ bool FindP2PInComputation(
 }
 
 // Returns a boolean to indicate whether there are any operation in the range
-// [start, end] that contains non-host P2P transfer that are reachable from the
-// given instruction.
+// [start, end] that contains non-host P2P transfer that are reachable from
+// the given instruction.
 bool OperationChainHasP2P(
-    absl::flat_hash_map<const HloComputation*, bool>& p2p_in_computation_cache,
+    P2PInComputation& p2p_in_computation,
     const std::vector<HloInstruction*>::const_iterator& start,
     const std::vector<HloInstruction*>::const_iterator& end,
     const HloReachabilityMap* reachability, const HloInstruction* instr) {
@@ -129,7 +309,8 @@ bool OperationChainHasP2P(
     }
 
     for (const HloComputation* called_comp : op->called_computations()) {
-      if (FindP2PInComputation(p2p_in_computation_cache, called_comp)) {
+      auto p2p_in_comp = p2p_in_computation.find(called_comp);
+      if (p2p_in_comp != p2p_in_computation.end()) {
         return true;
       }
     }
@@ -137,42 +318,235 @@ bool OperationChainHasP2P(
   return false;
 }
 
+// Collects P2P send-done and recv-done instructions from the computation and
+// groups them by channel IDs.
+Status CollectP2PGroups(const HloComputation* computation,
+                        P2PInComputation& p2p_in_computation,
+                        P2PGroupMap& p2p_group_map) {
+  for (auto hlo : computation->MakeInstructionPostOrder()) {
+    if (hlo->opcode() == HloOpcode::kWhile) {
+      TF_RETURN_IF_ERROR(MayAddWhileOpToPipelinedGroup(hlo, p2p_in_computation,
+                                                       p2p_group_map));
+      continue;
+    }
+    if (!IsP2PDoneOp(hlo)) {
+      continue;
+    }
+    HloSendRecvInstruction* p2p = Cast<HloSendRecvInstruction>(hlo);
+    int64_t channel = p2p->channel_id().value();
+    auto p2p_group = p2p_group_map.find(channel);
+    if (p2p_group == p2p_group_map.end()) {
+      // First time to see this P2P channel, assume it is for a kUnpipelined
+      // P2P group and may turn it into a kPipelined group or kUnrecognized
+      // group.
+      P2PGroup group;
+      TF_RETURN_IF_ERROR(group.RecordDoneOpForUnpipelinedGroup(p2p));
+      p2p_group_map[channel] = group;
+    } else {
+      P2PGroup& group = p2p_group->second;
+      if (group.nodes[kUnpipelinedNodeIdx].parent_computation == computation) {
+        TF_RETURN_IF_ERROR(group.RecordDoneOpForUnpipelinedGroup(p2p));
+      } else {
+        // We are at the parent computation for a pipelined P2P group.
+        TF_RETURN_IF_ERROR(group.RecordDoneOpForPipelinedGroup(p2p));
+      }
+    }
+    // We can't rely on the operation on p2p_group_map above to find out
+    // whether it is the first time to handle this channel for the current
+    // computation, as we may drop information in the presence of kUnrecognized
+    // groups.
+    auto p2p_in_comp = p2p_in_computation.find(computation);
+    if (p2p_in_comp == p2p_in_computation.end()) {
+      // First time to see a P2P channel for the computation.
+      p2p_in_computation[computation] = {channel};
+    } else {
+      // Add the channel only if it is not recorded yet.
+      p2p_in_comp->second.insert(channel);
+    }
+  }
+
+  // Now finalize each group, in particular, if a kPipelined or kUnpipelined
+  // group is missing some instructions, change the group to kUnrecognized.
+  for (auto& [channel, p2p_group] : p2p_group_map) {
+    if (p2p_group.kind == kUnpipelined) {
+      if (p2p_group.nodes[kUnpipelinedNodeIdx].Incomplete()) {
+        p2p_group.kind = kUnrecognized;
+      }
+    } else if (p2p_group.kind == kPipelined) {
+      if (p2p_group.nodes[kPipelinedChildNodeIdx].Incomplete() ||
+          p2p_group.nodes[kPipelinedParentNodeIdx]
+              .IncompletePipelinedParent()) {
+        p2p_group.kind = kUnrecognized;
+      }
+    }
+  }
+  // Erase kUnrecognized groups.
+  absl::erase_if(p2p_group_map, [](const auto& p2p_group) {
+    return p2p_group.second.kind == kUnrecognized;
+  });
+
+  return OkStatus();
+}
+
 }  // namespace
 
 StatusOr<bool> P2PSchedulePreparation::Run(
     HloModule* module,
     const absl::flat_hash_set<absl::string_view>& execution_threads) {
   bool changed = false;
-  absl::flat_hash_map<const HloComputation*, bool> p2p_in_computation_cache;
-  for (HloComputation* computation : module->computations(execution_threads)) {
+  P2PGroupMap p2p_group_map;
+  P2PInComputation p2p_in_computation;
+
+  std::vector<HloComputation*> all_computations =
+      module->MakeComputationPostOrder(execution_threads);
+
+  // Collect P2P group information. We visit computations in the order of
+  // callees to callers, so that when we process a while-loop instruction, we
+  // already have information for the while-body.
+  for (auto iter = all_computations.begin(); iter != all_computations.end();
+       ++iter) {
+    TF_RETURN_IF_ERROR(
+        CollectP2PGroups(*iter, p2p_in_computation, p2p_group_map));
+  }
+
+  // We visit computations in the order of callers to callees, so that the
+  // computation containing the while-loop is processed before the while-body.
+  for (auto iter = all_computations.rbegin(); iter != all_computations.rend();
+       ++iter) {
+    HloComputation* computation = *iter;
+    // Record the P2P chain result and the corresponding send-done
+    // instruction. The unpipelined P2P chain result is the recv-done
+    // instruction while the pipelined P2P chain result is the while-loop.
+    absl::flat_hash_map<HloInstruction*, HloSendRecvInstruction*>
+        p2p_result_to_send_done;
+    auto p2p_in_comp = p2p_in_computation.find(computation);
+    if (p2p_in_comp == p2p_in_computation.end()) {
+      // No P2P operations in the computation, do nothing.
+      continue;
+    }
+    std::set<int64_t>& p2p_channels = p2p_in_comp->second;
+    // If the current computation is a while-body and has a pipelined P2P chain,
+    // record such a P2P group.
+    P2PGroup* pipelined_group = nullptr;
+    for (int64_t channel : p2p_channels) {
+      auto it = p2p_group_map.find(channel);
+      if (it == p2p_group_map.end()) {
+        // The instructions that use the channel don't form a P2P group of
+        // interest, do nothing.
+        continue;
+      }
+
+      changed = true;
+
+      P2PGroup& p2p_group = it->second;
+      P2PGroupKind kind = p2p_group.kind;
+      if (kind == P2PGroupKind::kUnpipelined) {
+        TF_RETURN_IF_ERROR(
+            ChainUnpipelinedP2P(p2p_group.nodes[kUnpipelinedNodeIdx]));
+        p2p_result_to_send_done[p2p_group.nodes[kUnpipelinedNodeIdx]
+                                    .recv_done] =
+            p2p_group.nodes[kUnpipelinedNodeIdx].send_done;
+        continue;
+      }
+
+      // For Pipelined group.
+      if (computation !=
+          p2p_group.nodes[kPipelinedParentNodeIdx].parent_computation) {
+        // We are at the computation for the while-body of the pipelined group.
+        TF_RETURN_IF_ERROR(
+            ChainPipelinedP2PChild(p2p_group.nodes[kPipelinedChildNodeIdx]));
+        p2p_result_to_send_done[p2p_group.nodes[kPipelinedChildNodeIdx]
+                                    .recv_done] =
+            p2p_group.nodes[kPipelinedChildNodeIdx].send_done;
+        if (pipelined_group != nullptr) {
+          return InternalError("Expected <=1 pipelined group in a while-body");
+        }
+        pipelined_group = &p2p_group;
+      } else {
+        // We are at the computation that contains the pipelined while-loop. No
+        // need to add control dependence as the natural data dependence
+        // expresses this ordering:
+        // recv => recv-done => while-loop => send => send-done
+        p2p_result_to_send_done[p2p_group.nodes[kPipelinedParentNodeIdx]
+                                    .while_loop] =
+            p2p_group.nodes[kPipelinedParentNodeIdx].send_done;
+      }
+    }
+
+    if (pipelined_group != nullptr) {
+      int64_t pipelined_channel = pipelined_group->nodes[kPipelinedChildNodeIdx]
+                                      .recv_done->channel_id()
+                                      .value();
+      HloInstruction* pipelined_send_done =
+          pipelined_group->nodes[kPipelinedChildNodeIdx].send_done;
+      HloInstruction* pipelined_recv =
+          pipelined_group->nodes[kPipelinedChildNodeIdx]
+              .recv_done->mutable_operand(0);
+      for (int64_t channel : p2p_channels) {
+        if (channel == pipelined_channel) {
+          continue;
+        }
+        auto it = p2p_group_map.find(channel);
+        if (it == p2p_group_map.end()) {
+          // The instructions that use the channel don't form a P2P group of
+          // interest, do nothing.
+          continue;
+        }
+
+        P2PGroup& other_group = it->second;
+        if (other_group.kind != P2PGroupKind::kUnpipelined) {
+          return InternalError("Expected an unpipelined group");
+        }
+
+        // Add control dependence to make sure the unpipelined Recv is ordered
+        // after the pipelined Send-done and the unpipelined Send-done is
+        // ordered before the pipelined Recv.
+        HloInstruction* other_recv =
+            other_group.nodes[kUnpipelinedNodeIdx].recv_done->mutable_operand(
+                0);
+        HloInstruction* other_send_done =
+            other_group.nodes[kUnpipelinedNodeIdx].send_done;
+        TF_RETURN_IF_ERROR(
+            pipelined_send_done->AddControlDependencyTo(other_recv));
+        TF_RETURN_IF_ERROR(
+            other_send_done->AddControlDependencyTo(pipelined_recv));
+      }
+    }
+
+    // If an instruction has nested computation with P2P, we need to enforce
+    // its order related to the P2P chains in the current computation.
     std::unique_ptr<HloReachabilityMap> reachability;
     std::vector<HloInstruction*> all_instructions =
         computation->MakeInstructionPostOrder();
     for (auto it = all_instructions.begin(); it != all_instructions.end();
          ++it) {
       HloInstruction* hlo = *it;
-      if (hlo->opcode() != HloOpcode::kSendDone) {
-        continue;
-      }
-      auto send_done = Cast<HloSendDoneInstruction>(hlo);
-      if (send_done->is_host_transfer()) {
-        continue;
-      }
-      const HloRecvDoneInstruction* recv_done = GetChainedRecvDone(send_done);
-      if (!recv_done) {
+      auto p2p_result = p2p_result_to_send_done.find(hlo);
+      if (p2p_result == p2p_result_to_send_done.end()) {
         continue;
       }
       if (reachability == nullptr) {
         reachability = HloReachabilityMap::Build(computation);
       }
-      for (HloInstruction* recv_data : recv_done->users()) {
-        if (OperationChainHasP2P(p2p_in_computation_cache, it,
-                                 all_instructions.end(), reachability.get(),
-                                 recv_data)) {
-          // We need to schedule send_done before recv_data to avoid deadlock.
-          TF_RETURN_IF_ERROR(send_done->AddControlDependencyTo(recv_data));
-          VLOG(10) << "Add control predecessor to " << recv_data->ToString();
-          changed = true;
+      HloSendRecvInstruction* send_done = p2p_result->second;
+      const HloInstruction* send_data = nullptr;
+      if (p2p_result->first->opcode() == HloOpcode::kWhile) {
+        // For pipelined P2P, need to exclude send_data from the while-loop
+        // users to avoid creating circular dependence.
+        send_data = send_done->operand(0)->operand(0);
+        VLOG(10) << "Excluding send_data from while-loop users "
+                 << send_data->ToString();
+      }
+
+      for (HloInstruction* user : hlo->users()) {
+        if (user != send_data &&
+            OperationChainHasP2P(p2p_in_computation, it, all_instructions.end(),
+                                 reachability.get(), user)) {
+          // We need to schedule send_done before user to avoid scheduler
+          // deadlock.
+          TF_RETURN_IF_ERROR(send_done->AddControlDependencyTo(user));
+          VLOG(10) << "Add control predecessor from " << user->ToString()
+                   << " to " << send_done->ToString();
         }
       }
     }
diff --git a/third_party/xla/xla/service/p2p_schedule_preparation.h b/third_party/xla/xla/service/p2p_schedule_preparation.h
index 1939ba76f8d268..d0823224eb9d06 100644
--- a/third_party/xla/xla/service/p2p_schedule_preparation.h
+++ b/third_party/xla/xla/service/p2p_schedule_preparation.h
@@ -24,11 +24,85 @@ namespace xla {
 // P2PSchedulePreparation is a pass to linearize point-to-point operation chain
 // to prepare for any HLO scheduler. In particular, this pass currently does the
 // following:
+// (1) For an unpipelined P2P Send-Recv chain, add control dependence to
+//     express this ordering:
+//       recv => send => recv-done => send-done
 //
-// Adds control prececessors/successors to ensure that a P2P Send-Recv sequence
-// on a non-host device will be scheduled before other operations that use the
-// Recv result and may also invoke P2P operations indirectly. Here is an example
-// to illustrate the problem we address:
+// (2) For a pipelined P2P Send-Recv chain, add control dependence to the
+//     while-body to express this ordering:
+//       send => recv
+//     No control dependence is added to a pipelined Send-Recv in the
+// computation with the while-loop as the data dependence already expresses
+// this ordering:
+//       recv => recv-done => while-loop => send => send-done.
+//
+// (3) For a pipelined P2P Send-Recv chain, if the while-body has other P2P
+// chains, we need to add control dependence to ensure that the pipelined
+// Send-done is ordered before other P2P chains while the pipelined
+// Recv-done is ordered after other P2P chains. In particular, we make the
+// pipelined Send-done the control predecessor of other Recv and the pipelined
+// Recv the control successor of other Send-done. Here is an example to
+// illustrate the problem we address:
+//
+// Assume a while-body with the following HLO collective-permute operations:
+//    collective-permute-start-1 = (u32[2], u32[2])
+//      collective-permute-start(data), channel_id=1...
+//    collective-permute-done-1 = u32[2], channel_id=1
+//    use of collective-permute-done-1 result
+//    collective-permute-start-2 = (u32[2], u32[2])
+//      collective-permute-start(data), channel_id=2...
+//    collective-permute-done-2 = u32[2], channel_id=2
+//    use of collective-permute-done-2 result
+//
+// Now assume we transform the collective-permute operations into two P2P
+// Send-Recv chains, the block of code will become something like this:
+//    after-all-1 = token[] after-all()
+//    recv-1 = (u32[2], token[]) recv(after-all-1), channel_id=1 ...
+//    send-1 = (u32[2], token[]) send(data, after-all-1), channel_id=1 ...
+//    recv-done-1 = (u32[2], token[]) recv-done(recv-1), channel_id=1 ...
+//    send-done-1 = token[] send-done(send-1), channel_id=1 ...
+//    use of recv-done-1 result
+//    after-all-2 = token[] after-all()
+//    recv-2 = (u32[2], token[]) recv(after-all-2), channel_id=2 ...
+//    send-2 = (u32[2], token[]) send(data, after-all-2), channel_id=2 ...
+//    recv-done-2 = (u32[2], token[]) recv-done(recv-2), channel_id=2 ...
+//    send-done-2 = token[] send-done(send-2), channel_id=2 ...
+//    use of recv-done-2 result
+//
+// If the while-loop is not pipelined, this pass adds control dependence to
+// make sure the first Send-Recv chain finishes before the second Send-Recv
+// starts.
+//
+// If the while-loop is pipelined for the first Send-Recv chain, then the
+// first Recv and the last Send of the chain are moved to the computation
+// that calls the while-loop, and the block of code in the while-body will
+// become something like this:
+
+//    after-all-1 = token[] after-all()
+//    send-1 = (u32[2], token[]) send(data, after-all-1), channel_id=1 ...
+//    send-done-1 = token[] send-done(send-1), channel_id=1 ...
+//    use of recv-done-1 result from the previous iteration or the computation
+//      that calls the while-loop (for the first iteration)
+//
+//    after-all-2 = token[] after-all()
+//    recv-2 = (u32[2], token[]) recv(after-all-2), channel_id=2 ...
+//    send-2 = (u32[2], token[]) send(data, after-all-2), channel_id=2 ...
+//    recv-done-2 = (u32[2], token[]) recv-done(recv-2), channel_id=2 ...
+//    send-done-2 = token[] send-done(send-2), channel_id=2 ...
+//    use of recv-done-2 result
+//
+//    recv-1 = (u32[2], token[]) recv(after-all-1), channel_id=1 ...
+//    recv-done-1 = (u32[2], token[]) recv-done(recv-1), channel_id=1 ...
+//
+// In this case, we make send-done-1 the control predecessor of recv-2 and
+// send-done-2 the control predecessor of recv-1 to ensure that the second
+// Send-Recv chain is executed after the Send for the first chain finishes and
+// before the Recv for the first chain starts.
+//
+// (4) Adds control predecessors/successors to ensure that a P2P Send-Recv
+// sequence on a non-host device will be scheduled before other operations that
+// use the Recv result and may also invoke P2P operations indirectly. Here is an
+// example to illustrate the problem we address:
 //
 // Assume a computation with the following HLO instructions, where while-body
 // invokes collective-permute operations:
@@ -74,6 +148,13 @@ namespace xla {
 // recv-data and while-init to be scheduled before send-done. However, doing so
 // would complicate the implementation. We leave this to future improvement if
 // we will find out it can actually help performance in real practice.
+//
+// (5) Similar to case (4), if the result of the while-loop with pipelined P2P
+// chain is used by an instruction with nested P2P chains, we need to schedule
+// send-done of the pipelined while-loop before the instruction to avoid
+// deadlock. We express this by making the send-done a control predecessor of
+// the get-tuple-element instruction that retrieves the while-loop result.
+//
 class P2PSchedulePreparation : public HloModulePass {
  public:
   absl::string_view name() const override {
diff --git a/third_party/xla/xla/service/p2p_schedule_preparation_test.cc b/third_party/xla/xla/service/p2p_schedule_preparation_test.cc
index b23d21489d8d0e..5623b12fd5ae44 100644
--- a/third_party/xla/xla/service/p2p_schedule_preparation_test.cc
+++ b/third_party/xla/xla/service/p2p_schedule_preparation_test.cc
@@ -19,32 +19,150 @@ limitations under the License.
 #include <string>
 
 #include <gtest/gtest.h>
+#include "absl/algorithm/container.h"
+#include "absl/log/log.h"
+#include "absl/strings/str_format.h"
 #include "xla/hlo/ir/hlo_computation.h"
 #include "xla/hlo/ir/hlo_instruction.h"
 #include "xla/hlo/ir/hlo_module.h"
 #include "xla/service/hlo_parser.h"
 #include "xla/tests/hlo_test_base.h"
+#include "xla/util.h"
+#include "tsl/platform/statusor.h"
 
 namespace xla {
 namespace {
 
-using P2PSchedulePreparationTest = HloTestBase;
+class P2PSchedulePreparationTest : public HloTestBase {
+ public:
+  // Verifies that no control dependences are added to the P2P chain.
+  void VerifyP2PNotTransformed(HloModule* module, std::string suffix = "") {
+    HloInstruction* recv = FindInstruction(module, "recv" + suffix);
+    HloInstruction* recv_done = FindInstruction(module, "recv-done" + suffix);
+    HloInstruction* send_done = FindInstruction(module, "send-done" + suffix);
+    EXPECT_EQ(recv->control_predecessors().size(), 0);
+    EXPECT_EQ(recv_done->control_predecessors().size(), 0);
+    EXPECT_EQ(send_done->control_predecessors().size(), 0);
+  }
+
+  // Verifies that the control dependence enforces this ordering for an
+  // unpipelined Send-Recv chain:
+  //   recv => send => recv-done => send-done.
+  void VerifyUnpipelinedP2P(HloModule* module, std::string suffix = "") {
+    HloInstruction* send = FindInstruction(module, "send" + suffix);
+    HloInstruction* recv = FindInstruction(module, "recv" + suffix);
+    HloInstruction* recv_done = FindInstruction(module, "recv-done" + suffix);
+    HloInstruction* send_done = FindInstruction(module, "send-done" + suffix);
+    EXPECT_EQ(send->control_predecessors()[0], recv);
+    EXPECT_EQ(recv_done->control_predecessors()[0], send);
+    EXPECT_EQ(send_done->control_predecessors()[0], recv_done);
+  }
+
+  // Verifies that the control dependence enforces this ordering for a pipelined
+  // Send-Recv chain in the while-body:
+  // send => recv.
+  void VerifyPipelinedP2PChild(HloModule* module, std::string suffix = "") {
+    HloInstruction* send = FindInstruction(module, "send" + suffix);
+    HloInstruction* recv = FindInstruction(module, "recv" + suffix);
+    HloInstruction* recv_done = FindInstruction(module, "recv-done" + suffix);
+    HloInstruction* send_done = FindInstruction(module, "send-done" + suffix);
+    // If the while-body has other P2P, the pipelined Recv should also have the
+    // Send-done of the other P2P as control predecessors.
+    EXPECT_EQ(1, absl::c_count(recv->control_predecessors(), send));
+    EXPECT_EQ(recv_done->control_predecessors().size(), 0);
+    EXPECT_EQ(send_done->control_predecessors().size(), 0);
+  }
+
+  // Verifies that no control dependences are added to a pipelined Send-Recv
+  // in the computation with the while-loop as the data dependence already
+  // expresses this ordering:
+  //   recv => recv-done => while-loop => send => send-done.
+  void VerifyPipelinedP2PParent(HloModule* module, std::string suffix = "") {
+    VerifyP2PNotTransformed(module, suffix);
+  }
+};
 
 constexpr char kEmpty[] = "";
 constexpr char kHostTransfer[] = ", is_host_transfer=true";
-constexpr char kChainRecvDoneToSendDone[] =
-    ", control-predecessors={recv-done.1}";
 
-// Returns an HLO module string for testing, which is generated from a
-// templated string with placeholders for specifying the following values:
+// Returns an HLO module string for testing unnested P2P chain. The string is
+// generated from a templated string with placeholders for specifying the
+// following values:
+//  Whether the Send/Recv operations are host transfer
+//  Whether the Send/Recv operations form a complete P2P chain
+//
+std::string GetUnnestedP2PModuleString(bool is_host = false,
+                                       bool incomplete = false) {
+  constexpr char kSend[] = R"(
+    send = (f32[1, 1024, 1024], u32[], token[]) send(init, after-all),
+      channel_id=2, frontend_attributes={
+      _xla_send_recv_source_target_pairs="{{0,1}, {1,2}}"
+    } %s
+    send-done = token[] send-done(send), channel_id=2 %s
+)";
+  constexpr char kSimpleModule[] = R"(
+  HloModule test
+  ENTRY main {
+    c0 = u32[] constant(0)
+    f0 = f32[] constant(0.0)
+    init = f32[1, 1024, 1024] broadcast(f0), dimensions={}
+
+    after-all = token[] after-all()
+    recv = (f32[1, 1024, 1024], u32[], token[]) recv(after-all), channel_id=2,
+      frontend_attributes={
+      _xla_send_recv_source_target_pairs="{{0,1}, {1,2}}"
+    } %s
+    recv-done = (f32[1, 1024, 1024], token[]) recv-done(recv), channel_id=2 %s
+    %s
+    ROOT recv-data = f32[1, 1024, 1024] get-tuple-element(recv-done), index=0
+  }
+)";
+
+  const char* is_host_str = is_host ? kHostTransfer : kEmpty;
+  if (incomplete) {
+    return absl::StrFormat(kSimpleModule, is_host_str, is_host_str, kEmpty);
+  }
+  std::string send_str = absl::StrFormat(kSend, is_host_str, is_host_str);
+  return absl::StrFormat(kSimpleModule, is_host_str, is_host_str, send_str);
+}
+
+TEST_F(P2PSchedulePreparationTest, UnnestedP2PChainHostNotTransformed) {
+  std::string kModuleStr = GetUnnestedP2PModuleString(/*is_host=*/true);
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          ParseAndReturnUnverifiedModule((kModuleStr)));
+  P2PSchedulePreparation preparation;
+  TF_ASSERT_OK_AND_ASSIGN(bool changed, preparation.Run(module.get()));
+  EXPECT_FALSE(changed);
+}
+
+TEST_F(P2PSchedulePreparationTest, UnnestedP2PChainIncompleteNotTransformed) {
+  std::string kModuleStr =
+      GetUnnestedP2PModuleString(/*is_host=*/false, /*incomplete*/ true);
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          ParseAndReturnUnverifiedModule((kModuleStr)));
+  P2PSchedulePreparation preparation;
+  TF_ASSERT_OK_AND_ASSIGN(bool changed, preparation.Run(module.get()));
+  EXPECT_FALSE(changed);
+}
+
+TEST_F(P2PSchedulePreparationTest, UnnestedP2PChainTransformed) {
+  std::string kModuleStr = GetUnnestedP2PModuleString();
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          ParseAndReturnUnverifiedModule((kModuleStr)));
+  P2PSchedulePreparation preparation;
+  TF_ASSERT_OK_AND_ASSIGN(bool changed, preparation.Run(module.get()));
+  EXPECT_TRUE(changed);
+  VerifyUnpipelinedP2P(module.get());
+}
+
+// Returns an HLO module string for testing nested unpipelined P2P chains. The
+// string is generated from a templated string with placeholders for specifying
+// the following values:
 //  Whether the Send/Recv operations in the while-body are host transfer
 //  Whether the Send/Recv operations in the main computation are host transfer
-//  Wether the SendDone/RecvDone operations in the main computation are chain
 //
-std::string GetHloModuleString(bool whileP2PIsHost = false,
-                               bool mainP2PIsHost = false,
-                               bool chainRecvDoneToSendDone = true) {
-  // A template string for the input HLO module.
+std::string GetNestedP2PModuleString(bool while_p2p_is_host = false,
+                                     bool main_p2p_is_host = false) {
   constexpr char kModuleTemplate[] = R"(
   HloModule test
   while-cond {
@@ -62,18 +180,18 @@ std::string GetHloModuleString(bool whileP2PIsHost = false,
     after-all = token[] after-all()
     recv = (f32[1, 1024, 1024], u32[], token[]) recv(after-all), channel_id=1,
       frontend_attributes={
-      _xla_send_recv_source_target_pairs="{{0, 1}}"
+      _xla_send_recv_source_target_pairs="{{0, 1}, {1, 2}"
     } %s
     send = (f32[1, 1024, 1024], u32[], token[]) send(send-data, after-all),
-      channel_id=1, control-predecessors={recv}, frontend_attributes={
-      _xla_send_recv_source_target_pairs="{{0, 1}}"
+      channel_id=1, frontend_attributes={
+      _xla_send_recv_source_target_pairs="{{0, 1}, {1, 2}}"
     } %s
-    recv-done = (f32[1, 1024, 1024], token[]) recv-done(recv), channel_id=1, control-predecessors={send} %s
+    recv-done = (f32[1, 1024, 1024], token[]) recv-done(recv), channel_id=1 %s
     recv-data = f32[1, 1024, 1024] get-tuple-element(recv-done), index=0
-    send-done = token[] send-done(send), control-predecessors={recv-done}, channel_id=1 %s %s
+    send-done = token[] send-done(send), channel_id=1 %s
     c1 = u32[] constant(1)
     new-count = u32[] add(count, c1)
-    ROOT result = (u32[], f32[1, 1024, 1024]) tuple(new-count, recv-data)
+    ROOT body-result = (u32[], f32[1, 1024, 1024]) tuple(new-count, recv-data)
   }
 
   ENTRY main {
@@ -84,13 +202,13 @@ std::string GetHloModuleString(bool whileP2PIsHost = false,
     after-all.1 = token[] after-all()
     recv.1 = (f32[1, 1024, 1024], u32[], token[]) recv(after-all.1), channel_id=2,
       frontend_attributes={
-      _xla_send_recv_source_target_pairs="{{0, 1}}"
+      _xla_send_recv_source_target_pairs="{{0, 1}, {1, 2}}"
     } %s
     send.1 = (f32[1, 1024, 1024], u32[], token[]) send(init, after-all.1),
-      channel_id=2, control-predecessors={recv.1}, frontend_attributes={
-      _xla_send_recv_source_target_pairs="{{0, 1}}"
+      channel_id=2, frontend_attributes={
+      _xla_send_recv_source_target_pairs="{{0, 1}, {1, 2}}"
     } %s
-    recv-done.1 = (f32[1, 1024, 1024], token[]) recv-done(recv.1), channel_id=2, control-predecessors={send.1} %s
+    recv-done.1 = (f32[1, 1024, 1024], token[]) recv-done(recv.1), channel_id=2 %s
     send-done.1 = token[] send-done(send.1), channel_id=2 %s
     recv-data.1 = f32[1, 1024, 1024] get-tuple-element(recv-done.1), index=0
 
@@ -102,59 +220,337 @@ std::string GetHloModuleString(bool whileP2PIsHost = false,
     ROOT entry-result = f32[1, 1024, 1024] add(while-result-data, recv-data.1)
   }
   )";
-  const char* while_p2p = whileP2PIsHost ? kHostTransfer : kEmpty;
-  const char* main_p2p = mainP2PIsHost ? kHostTransfer : kEmpty;
-  const char* chain =
-      chainRecvDoneToSendDone ? kChainRecvDoneToSendDone : kEmpty;
+  const char* while_p2p = while_p2p_is_host ? kHostTransfer : kEmpty;
+  const char* main_p2p = main_p2p_is_host ? kHostTransfer : kEmpty;
   return absl::StrFormat(kModuleTemplate, while_p2p, while_p2p, while_p2p,
-                         while_p2p, main_p2p, main_p2p, main_p2p, main_p2p,
-                         chain);
+                         while_p2p, main_p2p, main_p2p, main_p2p, main_p2p);
 }
 
-TEST_F(P2PSchedulePreparationTest, WhileP2PIsHostNotTransformed) {
-  std::string kModuleStr = GetHloModuleString(/*whileP2PIsHost=*/true);
-  VLOG(0) << kModuleStr;
+TEST_F(P2PSchedulePreparationTest, WhileP2PIsHostNotMainTransformed) {
+  std::string kModuleStr = GetNestedP2PModuleString(/*while_p2p_is_host=*/true);
+
   TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
                           ParseAndReturnUnverifiedModule((kModuleStr)));
   P2PSchedulePreparation preparation;
   TF_ASSERT_OK_AND_ASSIGN(bool changed, preparation.Run(module.get()));
-  EXPECT_FALSE(changed);
+  EXPECT_TRUE(changed);
+
+  VLOG(10) << module->ToString();
+  VerifyP2PNotTransformed(module.get());
+  VerifyUnpipelinedP2P(module.get(), ".1");
 }
 
-TEST_F(P2PSchedulePreparationTest, MainP2PIsHostNotTransformed) {
-  std::string kModuleStr = GetHloModuleString(/*whileP2PIsHost=*/false,
-                                              /*mainP2PIsHost=*/true);
+TEST_F(P2PSchedulePreparationTest, MainP2PIsHostNotWhileTransformed) {
+  std::string kModuleStr = GetNestedP2PModuleString(/*while_p2p_is_host=*/false,
+                                                    /*main_p2p_is_host=*/true);
   TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
                           ParseAndReturnUnverifiedModule((kModuleStr)));
   P2PSchedulePreparation preparation;
   TF_ASSERT_OK_AND_ASSIGN(bool changed, preparation.Run(module.get()));
-  EXPECT_FALSE(changed);
+  EXPECT_TRUE(changed);
+
+  VLOG(10) << module->ToString();
+  VerifyUnpipelinedP2P(module.get());
+  VerifyP2PNotTransformed(module.get(), ".1");
 }
 
-TEST_F(P2PSchedulePreparationTest, MainP2PNotChainedNotTransformed) {
-  std::string kModuleStr =
-      GetHloModuleString(/*whileP2PIsHost=*/false,
-                         /*mainP2PIsHost=*/false,
-                         /*chainRecvDoneToSendDone=*/false);
+TEST_F(P2PSchedulePreparationTest, NestedP2PChainTransformed) {
+  std::string kModuleStr = GetNestedP2PModuleString();
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          ParseAndReturnUnverifiedModule((kModuleStr)));
+  P2PSchedulePreparation preparation;
+  TF_ASSERT_OK_AND_ASSIGN(bool changed, preparation.Run(module.get()));
+  EXPECT_TRUE(changed);
+
+  VLOG(10) << module->ToString();
+  VerifyUnpipelinedP2P(module.get());
+  VerifyUnpipelinedP2P(module.get(), ".1");
+
+  HloInstruction* send_done = FindInstruction(module.get(), "send-done.1");
+  HloInstruction* recv_data = FindInstruction(module.get(), "recv-data.1");
+  EXPECT_EQ(recv_data->control_predecessors()[0], send_done);
+}
+
+// Returns an HLO module string for testing pipelined P2P chains. The string
+// is generated from a templated string with placeholders for specifying the
+// following values:
+//  Whether the main computation contains another nested P2P chain besides the
+//    pipelined P2P chain.
+//  Whether the pipelined while-body contains another P2P chain besides the
+//    pipelined P2P chain.
+//
+std::string GetPipelinedP2PModuleString(bool nested_p2p_in_main = false,
+                                        bool other_p2p_in_while = false) {
+  // This is to support the while-loop with nested P2P chains called from the
+  // main computation.
+  constexpr char kWhileForMain[] = R"(
+  while-cond-2 {
+    param = (u32[], f32[1, 1024, 1024]) parameter(0)
+    count = get-tuple-element(param), index=0
+    ub = u32[] constant(25)
+    ROOT cond-result-2 = pred[] compare(count, ub), direction=LT
+  }
 
+  while-body-2 {
+    param = (u32[], f32[1, 1024, 1024]) parameter(0)
+    count = get-tuple-element(param), index=0
+    send-data = get-tuple-element(param), index=1
+
+    after-all.3 = token[] after-all()
+    recv.3 = (f32[1, 1024, 1024], u32[], token[]) recv(after-all.3), channel_id=3,
+      frontend_attributes={
+      _xla_send_recv_source_target_pairs="{{0, 1}, {1, 2}"
+    }
+    send.3 = (f32[1, 1024, 1024], u32[], token[]) send(send-data, after-all.3),
+      channel_id=3, frontend_attributes={
+      _xla_send_recv_source_target_pairs="{{0, 1}, {1, 2}}"
+    }
+    recv-done.3 = (f32[1, 1024, 1024], token[]) recv-done(recv.3), channel_id=3
+    recv-data = f32[1, 1024, 1024] get-tuple-element(recv-done.3), index=0
+    send-done.3 = token[] send-done(send.3), channel_id=3
+    c1 = u32[] constant(1)
+    new-count = u32[] add(count, c1)
+    ROOT body-result-2 = (u32[], f32[1, 1024, 1024]) tuple(new-count, recv-data)
+  }
+)";
+
+  // This is the result for the main computation, if it doesn't have another
+  // while-loop with nested P2P chains.
+  constexpr char kUnnestedResult[] = R"(
+  ROOT entry-result = f32[1, 1024, 1024] get-tuple-element(while-result), index=1
+)";
+
+  // This is the result for the main computation, if it has another while-loop
+  // with nested P2P chains.
+  constexpr char kNestedResult[] = R"(
+  while-result-1 = f32[1, 1024, 1024] get-tuple-element(while-result), index=1
+  while-init-2 =  (u32[], f32[1, 1024, 1024]) tuple(c0, while-result-1)
+  while-result-2 = (u32[], f32[1, 1024, 1024]) while(while-init-2),
+      body=while-body-2, condition=while-cond-2,
+      backend_config={"known_trip_count":{"n":"25"}}
+  ROOT entry-result = f32[1, 1024, 1024] get-tuple-element(while-result-2), index=1
+)";
+
+  constexpr char kPipelinedWhileBodyWithoutOtherP2P[] = R"(
+  while-body {
+    param = (u32[], f32[1, 1024, 1024], f32[1, 1024, 1024]) parameter(0)
+    count = get-tuple-element(param), index=0
+    send-data = get-tuple-element(param), index=1
+    recv-data = get-tuple-element(param), index=2
+
+    after-all.1 = token[] after-all()
+    send.1 = (f32[1, 1024, 1024], u32[], token[]) send(send-data, after-all.1),
+      channel_id=1, frontend_attributes={
+      _xla_send_recv_source_target_pairs="{{0,1}, {1,2}, {2,3}, {3,4}}"
+    }
+    send-done.1 = token[] send-done(send.1), channel_id=1
+    recv.1 = (f32[1, 1024, 1024], u32[], token[]) recv(after-all.1), channel_id=1,
+      frontend_attributes={
+       _xla_send_recv_source_target_pairs="{{0,1}, {1,2}, {2,3}, {3,4}}"
+    }
+
+    c1 = u32[] constant(1)
+    new-count = u32[] add(count, c1)
+    replica = u32[] replica-id()
+    c10 = u32[] constant(10)
+    sum = u32[] add(replica, c10)
+    sum2 = u32[] add(sum, count)
+    conv = f32[] convert(sum2)
+    p = f32[1, 1024, 1024] broadcast(conv), dimensions={}
+    b = f32[1, 1024, 1024] add(p, recv-data)
+    c = f32[1, 1024, 1024] multiply(b, b)
+    d = f32[1, 1024, 1024] tan(c)
+    s = f32[1, 1024, 1024] dot(c, d), lhs_batch_dims={0},
+      lhs_contracting_dims={1}, rhs_batch_dims={0}, rhs_contracting_dims={1}
+    new-data = f32[1, 1024, 1024] add(c, s)
+
+    recv-done.1 = (f32[1, 1024, 1024], token[]) recv-done(recv.1), channel_id=1
+    new-recv-data = f32[1, 1024, 1024] get-tuple-element(recv-done.1), index=0
+
+    ROOT body-result = (u32[], f32[1, 1024, 1024], f32[1, 1024, 1024]) tuple(new-count, new-data, new-recv-data)
+  }
+)";
+
+  constexpr char kPipelinedWhileBodyWithOtherP2P[] = R"(
+  while-body {
+    param = (u32[], f32[1, 1024, 1024], f32[1, 1024, 1024]) parameter(0)
+    count = get-tuple-element(param), index=0
+    send-data = get-tuple-element(param), index=1
+    recv-data = get-tuple-element(param), index=2
+
+    after-all.1 = token[] after-all()
+    send.1 = (f32[1, 1024, 1024], u32[], token[]) send(send-data, after-all.1),
+      channel_id=1, frontend_attributes={
+      _xla_send_recv_source_target_pairs="{{0,1}, {1,2}, {2,3}, {3,4}}"
+    }
+    send-done.1 = token[] send-done(send.1), channel_id=1
+    recv.1 = (f32[1, 1024, 1024], u32[], token[]) recv(after-all.1), channel_id=1,
+      frontend_attributes={
+       _xla_send_recv_source_target_pairs="{{0,1}, {1,2}, {2,3}, {3,4}}"
+    }
+
+    c1 = u32[] constant(1)
+    new-count = u32[] add(count, c1)
+    replica = u32[] replica-id()
+    c10 = u32[] constant(10)
+    sum = u32[] add(replica, c10)
+    sum2 = u32[] add(sum, count)
+    conv = f32[] convert(sum2)
+    p = f32[1, 1024, 1024] broadcast(conv), dimensions={}
+    b = f32[1, 1024, 1024] add(p, recv-data)
+    c = f32[1, 1024, 1024] multiply(b, b)
+    d = f32[1, 1024, 1024] tan(c)
+    s = f32[1, 1024, 1024] dot(c, d), lhs_batch_dims={0},
+      lhs_contracting_dims={1}, rhs_batch_dims={0}, rhs_contracting_dims={1}
+    new-data-0 = f32[1, 1024, 1024] add(c, s)
+
+    recv-done.1 = (f32[1, 1024, 1024], token[]) recv-done(recv.1), channel_id=1
+    new-recv-data = f32[1, 1024, 1024] get-tuple-element(recv-done.1), index=0
+
+    after-all.4 = token[] after-all()
+    send.4 = (f32[1, 1024, 1024], u32[], token[]) send(send-data, after-all.4),
+      channel_id=4, frontend_attributes={
+      _xla_send_recv_source_target_pairs="{{0,1}, {1,2}, {2,3}, {3,4}}"
+    }
+    send-done.4 = token[] send-done(send.4), channel_id=4
+    recv.4 = (f32[1, 1024, 1024], u32[], token[]) recv(after-all.4), channel_id=4,
+      frontend_attributes={
+       _xla_send_recv_source_target_pairs="{{0,1}, {1,2}, {2,3}, {3,4}}"
+    }
+    recv-done.4 = (f32[1, 1024, 1024], token[]) recv-done(recv.4), channel_id=4
+    recv-data-4 = f32[1, 1024, 1024] get-tuple-element(recv-done.4), index=0
+    new-data = f32[1, 1024, 1024] add(new-data-0, recv-data-4)
+
+    ROOT body-result = (u32[], f32[1, 1024, 1024], f32[1, 1024, 1024]) tuple(new-count, new-data, new-recv-data)
+  }
+)";
+
+  constexpr char kModuleTemplate[] = R"(
+  HloModule test
+
+  while-cond {
+    param = (u32[], f32[1, 1024, 1024], f32[1, 1024, 1024]) parameter(0)
+    count = get-tuple-element(param), index=0
+    ub = u32[] constant(25)
+    ROOT cond-result = pred[] compare(count, ub), direction=LT
+  }
+
+  // The pipelined while-body goes here.
+  %s
+
+  // The code that support the while-loop with nested P2P chains goes here.
+  %s
+
+  ENTRY test-computation {
+    c0 = u32[] constant(0)
+    f0 = f32[] constant(0.0)
+    init = f32[1, 1024, 1024] broadcast(f0), dimensions={}
+
+    after-all.2 = token[] after-all()
+    recv.2 = (f32[1, 1024, 1024], u32[], token[]) recv(after-all.2), channel_id=1,
+      frontend_attributes={
+       _xla_send_recv_source_target_pairs="{{0,1}, {1,2}, {2,3}, {3,4}}"
+    }
+    recv-done.2 = (f32[1, 1024, 1024], token[]) recv-done(recv.2), channel_id=1
+    recv-data = f32[1, 1024, 1024] get-tuple-element(recv-done.2), index=0
+
+    while-init =  (u32[], f32[1, 1024, 1024], f32[1, 1024, 1024]) tuple(c0, init, recv-data)
+    while-result = (u32[], f32[1, 1024, 1024], f32[1, 1024, 1024]) while(while-init),
+      body=while-body, condition=while-cond,
+      backend_config={"known_trip_count":{"n":"25"}}
+
+    send-data = f32[1, 1024, 1024] get-tuple-element(while-result), index=2
+    send.2 = (f32[1, 1024, 1024], u32[], token[]) send(send-data, after-all.2),
+      channel_id=1, frontend_attributes={
+      _xla_send_recv_source_target_pairs="{{0,1}, {1,2}, {2,3}, {3,4}}"
+    }
+    send-done.2 = token[] send-done(send.2), channel_id=1
+
+    // The code for the computation result goes here.
+    %s
+  }
+)";
+
+  const char* while_str = nested_p2p_in_main ? kWhileForMain : kEmpty;
+  const char* pipelined_while_body_str =
+      other_p2p_in_while ? kPipelinedWhileBodyWithOtherP2P
+                         : kPipelinedWhileBodyWithoutOtherP2P;
+  const char* result_str = nested_p2p_in_main ? kNestedResult : kUnnestedResult;
+  return absl::StrFormat(kModuleTemplate, while_str, pipelined_while_body_str,
+                         result_str);
+}
+
+TEST_F(P2PSchedulePreparationTest, UnnestedPipelinedP2PChainTransformed) {
+  std::string kModuleStr = GetPipelinedP2PModuleString();
   TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
                           ParseAndReturnUnverifiedModule((kModuleStr)));
   P2PSchedulePreparation preparation;
   TF_ASSERT_OK_AND_ASSIGN(bool changed, preparation.Run(module.get()));
-  EXPECT_FALSE(changed);
+  EXPECT_TRUE(changed);
+
+  VLOG(10) << module->ToString();
+  // Verify the pipelined P2P chain in the while-body.
+  VerifyPipelinedP2PChild(module.get(), ".1");
+  // Verify the pipelined P2P chain in the main computation.
+  VerifyPipelinedP2PParent(module.get(), ".2");
 }
 
-TEST_F(P2PSchedulePreparationTest, ChainedWithNestedP2PTransformed) {
-  std::string kModuleStr = GetHloModuleString();
+TEST_F(P2PSchedulePreparationTest, NestedPipelinedP2PChainTransformed) {
+  std::string kModuleStr =
+      GetPipelinedP2PModuleString(/*nested_p2p_in_main=*/true);
   TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
                           ParseAndReturnUnverifiedModule((kModuleStr)));
   P2PSchedulePreparation preparation;
   TF_ASSERT_OK_AND_ASSIGN(bool changed, preparation.Run(module.get()));
   EXPECT_TRUE(changed);
 
-  HloInstruction* send_done = FindInstruction(module.get(), "send-done.1");
-  HloInstruction* recv_data = FindInstruction(module.get(), "recv-data.1");
-  EXPECT_EQ(recv_data->control_predecessors()[0], send_done);
+  VLOG(10) << module->ToString();
+  // Verify the pipelined P2P chain in the while-body.
+  VerifyPipelinedP2PChild(module.get(), ".1");
+  // Verify the pipelined P2P chain in the main computation.
+  VerifyPipelinedP2PParent(module.get(), ".2");
+  // Verify the unpipelined P2P chain in the other while-body.
+  VerifyUnpipelinedP2P(module.get(), ".3");
+
+  // Verify that the while-loop with nested P2P is scheduled after the last
+  // send-done of the pipelined P2P chain. This is expressed by making
+  // send-done.2 a control predecessor of while-result-1, which is the init
+  // value for the while-loop.
+  HloInstruction* send_done = FindInstruction(module.get(), "send-done.2");
+  HloInstruction* while_input = FindInstruction(module.get(), "while-result-1");
+  EXPECT_EQ(while_input->control_predecessors()[0], send_done);
+}
+
+TEST_F(P2PSchedulePreparationTest,
+       UnnestedPipelinedP2PChainWithOtherP2PTransformed) {
+  std::string kModuleStr = GetPipelinedP2PModuleString(
+      /*nested_p2p_in_main=*/false, /*other_p2p_in_while=*/true);
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          ParseAndReturnUnverifiedModule((kModuleStr)));
+  P2PSchedulePreparation preparation;
+  TF_ASSERT_OK_AND_ASSIGN(bool changed, preparation.Run(module.get()));
+  EXPECT_TRUE(changed);
+
+  VLOG(10) << module->ToString();
+  // Verify the pipelined P2P chain in the while-body.
+  VerifyPipelinedP2PChild(module.get(), ".1");
+  // Verify the pipelined P2P chain in the main computation.
+  VerifyPipelinedP2PParent(module.get(), ".2");
+  // Verify the other unpipelined P2P chain in the while-body.
+  VerifyUnpipelinedP2P(module.get(), ".4");
+
+  // Verify that in the pipelined while-body, the pipelined Send is ordered
+  // before other P2P while the pipelined Recv is ordered after other P2P.
+  HloInstruction* pipelined_send_done =
+      FindInstruction(module.get(), "send-done.1");
+  HloInstruction* pipelined_recv = FindInstruction(module.get(), "recv.1");
+  HloInstruction* other_recv = FindInstruction(module.get(), "recv.4");
+  HloInstruction* other_send_done =
+      FindInstruction(module.get(), "send-done.4");
+  EXPECT_EQ(1, absl::c_count(other_recv->control_predecessors(),
+                             pipelined_send_done));
+  EXPECT_EQ(1, absl::c_count(pipelined_recv->control_predecessors(),
+                             other_send_done));
 }
 
 }  // namespace

From 5977b36696548d78544fdd54a2e13d90c35a1c95 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 27 Sep 2023 10:55:13 -0700
Subject: [PATCH 328/567] Implement mocking nccl call support for collective
 operations

PiperOrigin-RevId: 568900214
---
 .../xla/xla/pjrt/gpu/se_gpu_pjrt_client.cc    |  47 +++++-
 .../xla/xla/pjrt/gpu/se_gpu_pjrt_client.h     |   3 +-
 .../service/gpu/gpu_executable_run_options.h  |  13 +-
 third_party/xla/xla/service/gpu/runtime/BUILD |  26 +++-
 .../xla/service/gpu/runtime/collectives.cc    | 139 +++++++++++++-----
 .../service/gpu/runtime/sleep_kernel.cu.cc    |  36 +++++
 .../xla/service/gpu/runtime/sleep_kernel.h    |  21 +++
 .../xla/xla/tools/multihost_hlo_runner/BUILD  |  11 +-
 .../functional_hlo_runner.cc                  |  10 ++
 .../functional_hlo_runner.h                   |   4 +
 .../multihost_hlo_runner/hlo_runner_main.cc   |  28 +++-
 11 files changed, 288 insertions(+), 50 deletions(-)
 create mode 100644 third_party/xla/xla/service/gpu/runtime/sleep_kernel.cu.cc
 create mode 100644 third_party/xla/xla/service/gpu/runtime/sleep_kernel.h

diff --git a/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_client.cc b/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_client.cc
index 7ffc7e263e6a2f..b1916e471c8b6a 100644
--- a/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_client.cc
+++ b/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_client.cc
@@ -32,8 +32,10 @@ limitations under the License.
 #include "absl/status/status.h"
 #include "absl/status/statusor.h"
 #include "absl/strings/ascii.h"
+#include "absl/strings/numbers.h"
 #include "absl/strings/str_cat.h"
 #include "absl/strings/str_join.h"
+#include "absl/strings/str_split.h"
 #include "absl/synchronization/blocking_counter.h"
 #include "absl/time/time.h"
 #include "xla/client/local_client.h"
@@ -982,7 +984,7 @@ StatusOr<std::unique_ptr<PjRtClient>> GetStreamExecutorGpuClient(
     std::optional<std::string> platform_name,
     bool should_stage_host_to_device_transfers,
     PjRtClient::KeyValueGetCallback kv_get,
-    PjRtClient::KeyValuePutCallback kv_put) {
+    PjRtClient::KeyValuePutCallback kv_put, bool enable_mock_nccl) {
   TF_ASSIGN_OR_RETURN(LocalClient * xla_client,
                       GetGpuXlaClient(platform_name, allowed_devices));
   std::map<int, std::unique_ptr<LocalDeviceState>> local_device_states;
@@ -997,7 +999,50 @@ StatusOr<std::unique_ptr<PjRtClient>> GetStreamExecutorGpuClient(
 
   std::vector<std::unique_ptr<PjRtStreamExecutorDevice>> devices;
   auto gpu_run_options = std::make_unique<gpu::GpuExecutableRunOptions>();
+  if (enable_mock_nccl) {
+    gpu_run_options->set_enable_mock_nccl_collectives();
+  }
   if (num_nodes > 1) {
+    absl::flat_hash_map<std::string, std::string> device_maps;
+    absl::Mutex mu;
+    if (enable_mock_nccl) {
+      kv_get = [&device_maps, &mu, &num_nodes](
+                   const std::string& k,
+                   absl::Duration timeout) -> xla::StatusOr<std::string> {
+        std::string result;
+        {
+          absl::MutexLock lock(&mu);
+          if (device_maps.contains(k)) {
+            result = device_maps[k];
+          } else {
+            int device_id;
+            std::vector<std::string> tokens = absl::StrSplit(k, ':');
+            if (tokens.size() != 2 ||
+                !absl::SimpleAtoi(tokens[1], &device_id)) {
+              device_id = num_nodes - 1;
+            }
+            // Return fake local topology with device_id info back.
+            xla::LocalTopologyProto local;
+            local.set_boot_id("fake_boot_id");
+            local.set_node_id(device_id);
+            xla::DeviceProto* device = local.add_devices();
+            device->set_global_device_id(device_id);
+            device->set_name("fake_device");
+            device->set_vendor("fake_vendor");
+            result = local.SerializeAsString();
+          }
+        }
+        return result;
+      };
+      kv_put = [&device_maps, &mu](const std::string& k,
+                                   const std::string& v) -> xla::Status {
+        {
+          absl::MutexLock lock(&mu);
+          device_maps[k] = v;
+        }
+        return xla::OkStatus();
+      };
+    }
     TF_RET_CHECK(kv_get != nullptr);
     TF_RET_CHECK(kv_put != nullptr);
     TF_RETURN_IF_ERROR(BuildDistributedDevices(
diff --git a/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_client.h b/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_client.h
index bb4044f667be7d..be6b4cb75a9777 100644
--- a/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_client.h
+++ b/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_client.h
@@ -227,7 +227,8 @@ StatusOr<std::unique_ptr<PjRtClient>> GetStreamExecutorGpuClient(
     std::optional<std::string> platform_name = std::nullopt,
     bool should_stage_host_to_device_transfers = true,
     PjRtClient::KeyValueGetCallback kv_get = nullptr,
-    PjRtClient::KeyValuePutCallback kv_put = nullptr);
+    PjRtClient::KeyValuePutCallback kv_put = nullptr,
+    bool enable_mock_nccl = false);
 
 }  // namespace xla
 
diff --git a/third_party/xla/xla/service/gpu/gpu_executable_run_options.h b/third_party/xla/xla/service/gpu/gpu_executable_run_options.h
index 05d1a7116c1252..ffb8785bf7cce9 100644
--- a/third_party/xla/xla/service/gpu/gpu_executable_run_options.h
+++ b/third_party/xla/xla/service/gpu/gpu_executable_run_options.h
@@ -90,14 +90,25 @@ class GpuExecutableRunOptions {
     return requires_exclusive_lock_on_gpu_;
   }
 
-  // Require writers lock on the GRPU.
+  // Require writers lock on the GPU.
   GpuExecutableRunOptions& set_requires_exclusive_lock_on_gpu() {
     requires_exclusive_lock_on_gpu_ = true;
     return *this;
   }
 
+  bool enable_mock_nccl_collectives() const {
+    return enable_mock_nccl_collectives_;
+  }
+
+  // Enables mocking of NCCL collective operations on the GPU.
+  GpuExecutableRunOptions& set_enable_mock_nccl_collectives() {
+    enable_mock_nccl_collectives_ = true;
+    return *this;
+  }
+
  private:
   bool requires_exclusive_lock_on_gpu_ = false;
+  bool enable_mock_nccl_collectives_ = false;
   std::optional<std::map<int, GlobalDeviceId>> gpu_global_device_ids_;
   NcclUniqueIdCallback nccl_unique_id_callback_;
 };
diff --git a/third_party/xla/xla/service/gpu/runtime/BUILD b/third_party/xla/xla/service/gpu/runtime/BUILD
index b3936182b93a17..2c48d0087d358d 100644
--- a/third_party/xla/xla/service/gpu/runtime/BUILD
+++ b/third_party/xla/xla/service/gpu/runtime/BUILD
@@ -40,6 +40,21 @@ cc_library(
     ],
 )
 
+cuda_library(
+    name = "sleep_kernel_cuda",
+    srcs = if_cuda_is_configured(
+        [
+            "sleep_kernel.cu.cc",
+        ],
+    ),
+    hdrs = if_cuda_is_configured(["sleep_kernel.h"]),
+    compatible_with = [],
+    visibility = ["//visibility:public"],
+    deps = [
+        "@local_config_cuda//cuda:cuda_headers",
+    ],
+)
+
 cc_library(
     name = "collectives",
     srcs = ["collectives.cc"],
@@ -57,9 +72,18 @@ cc_library(
         "//xla/service/gpu:nccl_collective_thunks",
         "//xla/service/gpu:thunk",
         "//xla/stream_executor",
+        "@com_google_absl//absl/base",
         "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/container:flat_hash_set",
-    ],
+        "@com_google_absl//absl/log",
+        "@com_google_absl//absl/status",
+        "@com_google_absl//absl/strings",
+    ] + if_cuda_is_configured([
+        ":sleep_kernel_cuda",
+        "@local_config_cuda//cuda:cuda_headers",
+        "//xla/stream_executor/gpu:gpu_types_header",
+        "//xla/stream_executor/gpu:gpu_stream_header",
+    ]),
 )
 
 cc_library(
diff --git a/third_party/xla/xla/service/gpu/runtime/collectives.cc b/third_party/xla/xla/service/gpu/runtime/collectives.cc
index 2b28efbe470b24..34cd90b4228701 100644
--- a/third_party/xla/xla/service/gpu/runtime/collectives.cc
+++ b/third_party/xla/xla/service/gpu/runtime/collectives.cc
@@ -15,11 +15,16 @@ limitations under the License.
 
 #include "xla/service/gpu/runtime/collectives.h"
 
+#include <cstdint>
+#include <memory>
 #include <string>
 #include <string_view>
 #include <utility>
 #include <vector>
 
+#include "absl/log/log.h"
+#include "absl/status/status.h"
+#include "absl/strings/str_cat.h"
 #include "xla/runtime/custom_call.h"
 #include "xla/runtime/executable.h"
 #include "xla/service/computation_placer.h"
@@ -35,6 +40,17 @@ limitations under the License.
 #include "xla/service/gpu/runtime/support.h"
 #include "xla/service/gpu/thunk.h"
 #include "xla/service/service_executable_run_options.h"
+#include "xla/stream_executor/kernel.h"
+#include "xla/stream_executor/stream.h"
+
+#if XLA_ENABLE_XCCL
+#include "third_party/gpus/cuda/include/cuda_runtime_api.h"
+#include "third_party/gpus/cuda/include/driver_types.h"
+#include "third_party/gpus/cuda/include/vector_types.h"
+#include "xla/service/gpu/runtime/sleep_kernel.h"
+#include "xla/stream_executor/gpu/gpu_stream.h"
+#include "xla/stream_executor/gpu/gpu_types.h"
+#endif  // XLA_ENABLE_XCCL
 
 namespace xla {
 namespace gpu {
@@ -163,6 +179,24 @@ absl::Status AsyncDoneImpl(const ServiceExecutableRunOptions* run_options,
 #endif  // XLA_ENABLE_XCCL
 }
 
+#if XLA_ENABLE_XCCL
+absl::Status NcclMockImplCommon(se::Stream* stream) {
+  se::gpu::GpuStreamHandle gpu_stream = se::gpu::AsGpuStreamValue(stream);
+  uint32_t sleep_duration_ns = 1000;
+  void* kernel = GetSleepKernel();
+  void* kernel_args[] = {&sleep_duration_ns};
+  dim3 gridDim = {1, 1, 1};
+  dim3 blockDim = {512, 1, 1};
+  cudaError_t launch_status =
+      cudaLaunchKernel(kernel, gridDim, blockDim, kernel_args, 0, gpu_stream);
+  if (launch_status != cudaSuccess) {
+    return absl::InternalError(absl::StrCat("Failed to launch kernel: ",
+                                            cudaGetErrorString(launch_status)));
+  }
+  return absl::OkStatus();
+}
+#endif  // XLA_ENABLE_XCCL
+
 //===----------------------------------------------------------------------===//
 // CollectivePermute.
 //===----------------------------------------------------------------------===//
@@ -242,15 +276,20 @@ absl::Status CollectivePermuteImpl(
     absl::Span<const int64_t> target_peers) {
 #if XLA_ENABLE_XCCL
   VLOG(3) << "Running CollectivePermute " << (is_async ? "(Async)" : "(Sync)");
-  return RunSyncOrAsync(run_options, collectives, async_collectives, uid,
-                        is_async, [&](se::Stream* stream) {
-                          return P2PImplCommon(
-                              run_options, debug_options, stream, args,
-                              group_mode, op_id, replica_group_offsets,
-                              replica_group_values, source_peers, target_peers,
-                              RunCollectivePermute, GetDeviceBufferPairs,
-                              GetStreamId(is_async));
-                        });
+  return RunSyncOrAsync(
+      run_options, collectives, async_collectives, uid, is_async,
+      [&](se::Stream* stream) {
+        const gpu::GpuExecutableRunOptions* gpu_opts =
+            run_options->run_options().gpu_executable_run_options();
+        if (gpu_opts && gpu_opts->enable_mock_nccl_collectives()) {
+          return NcclMockImplCommon(stream);
+        }
+        return P2PImplCommon(run_options, debug_options, stream, args,
+                             group_mode, op_id, replica_group_offsets,
+                             replica_group_values, source_peers, target_peers,
+                             RunCollectivePermute, GetDeviceBufferPairs,
+                             GetStreamId(is_async));
+      });
 #else   // XLA_ENABLE_XCCL
   return absl::InternalError("NCCL disabled");
 #endif  // XLA_ENABLE_XCCL
@@ -410,13 +449,18 @@ absl::Status AllGatherImpl(const ServiceExecutableRunOptions* run_options,
                            absl::Span<const int64_t> replica_group_values) {
 #if XLA_ENABLE_XCCL
   VLOG(3) << "Running AllGather " << (is_async ? "(Async)" : "(Sync)");
-  return RunSyncOrAsync(run_options, collectives, async_collectives, uid,
-                        is_async, [&](se::Stream* stream) {
-                          return AllGatherImplCommon(
-                              run_options, debug_options, stream, args,
-                              group_mode, op_id, replica_group_offsets,
-                              replica_group_values, is_async);
-                        });
+  return RunSyncOrAsync(
+      run_options, collectives, async_collectives, uid, is_async,
+      [&](se::Stream* stream) {
+        const gpu::GpuExecutableRunOptions* gpu_opts =
+            run_options->run_options().gpu_executable_run_options();
+        if (gpu_opts && gpu_opts->enable_mock_nccl_collectives()) {
+          return NcclMockImplCommon(stream);
+        }
+        return AllGatherImplCommon(run_options, debug_options, stream, args,
+                                   group_mode, op_id, replica_group_offsets,
+                                   replica_group_values, is_async);
+      });
 #else   // XLA_ENABLE_XCCL
   return absl::InternalError("NCCL diasbled");
 #endif  // XLA_ENABLE_XCCL
@@ -477,14 +521,19 @@ absl::Status AllReduceImpl(const ServiceExecutableRunOptions* run_options,
                            absl::Span<const int64_t> replica_group_values) {
 #if XLA_ENABLE_XCCL
   VLOG(3) << "Running AllReduce " << (is_async ? "(Async)" : "(Sync)");
-  return RunSyncOrAsync(run_options, collectives, async_collectives, uid,
-                        is_async, [&](se::Stream* stream) {
-                          return AllReduceImplCommon(
-                              run_options, debug_options, stream, args,
-                              group_mode, op_id, reduction_kind,
-                              replica_group_offsets, replica_group_values,
-                              is_async);
-                        });
+  return RunSyncOrAsync(
+      run_options, collectives, async_collectives, uid, is_async,
+      [&](se::Stream* stream) {
+        const gpu::GpuExecutableRunOptions* gpu_opts =
+            run_options->run_options().gpu_executable_run_options();
+        if (gpu_opts && gpu_opts->enable_mock_nccl_collectives()) {
+          return NcclMockImplCommon(stream);
+        }
+        return AllReduceImplCommon(run_options, debug_options, stream, args,
+                                   group_mode, op_id, reduction_kind,
+                                   replica_group_offsets, replica_group_values,
+                                   is_async);
+      });
 #else   // XLA_ENABLE_XCCL
   // NCCL disabled.
   return absl::InternalError("NCCL disabled");
@@ -549,14 +598,19 @@ absl::Status AllToAllImpl(const ServiceExecutableRunOptions* run_options,
                           absl::Span<const int64_t> replica_group_values) {
 #if XLA_ENABLE_XCCL
   VLOG(3) << "Running AllToAll " << (is_async ? "(Async)" : "(Sync)");
-  return RunSyncOrAsync(run_options, collectives, async_collectives, uid,
-                        is_async, [&](se::Stream* stream) {
-                          return AllToAllImplCommon(
-                              run_options, debug_options, stream, args,
-                              group_mode, has_split_dimension, op_id,
-                              replica_group_offsets, replica_group_values,
-                              is_async);
-                        });
+  return RunSyncOrAsync(
+      run_options, collectives, async_collectives, uid, is_async,
+      [&](se::Stream* stream) {
+        const gpu::GpuExecutableRunOptions* gpu_opts =
+            run_options->run_options().gpu_executable_run_options();
+        if (gpu_opts && gpu_opts->enable_mock_nccl_collectives()) {
+          return NcclMockImplCommon(stream);
+        }
+        return AllToAllImplCommon(run_options, debug_options, stream, args,
+                                  group_mode, has_split_dimension, op_id,
+                                  replica_group_offsets, replica_group_values,
+                                  is_async);
+      });
 #else   // XLA_ENABLE_XCCL
   return absl::InternalError("NCCL disabled");
 #endif  // XLA_ENABLE_XCCL
@@ -618,14 +672,19 @@ absl::Status ReduceScatterImpl(const ServiceExecutableRunOptions* run_options,
                                absl::Span<const int64_t> replica_group_values) {
 #if XLA_ENABLE_XCCL
   VLOG(3) << "Running ReduceScatter " << (is_async ? "(Async)" : "(Sync)");
-  return RunSyncOrAsync(run_options, collectives, async_collectives, uid,
-                        is_async, [&](se::Stream* stream) {
-                          return ReduceScatterImplCommon(
-                              run_options, debug_options, stream, args,
-                              group_mode, op_id, reduction_kind,
-                              replica_group_offsets, replica_group_values,
-                              is_async);
-                        });
+  return RunSyncOrAsync(
+      run_options, collectives, async_collectives, uid, is_async,
+      [&](se::Stream* stream) {
+        const gpu::GpuExecutableRunOptions* gpu_opts =
+            run_options->run_options().gpu_executable_run_options();
+        if (gpu_opts && gpu_opts->enable_mock_nccl_collectives()) {
+          return NcclMockImplCommon(stream);
+        }
+        return ReduceScatterImplCommon(run_options, debug_options, stream, args,
+                                       group_mode, op_id, reduction_kind,
+                                       replica_group_offsets,
+                                       replica_group_values, is_async);
+      });
 #else   // XLA_ENABLE_XCCL
   return absl::InternalError("NCCL disabled");
 #endif  // XLA_ENABLE_XCCL
diff --git a/third_party/xla/xla/service/gpu/runtime/sleep_kernel.cu.cc b/third_party/xla/xla/service/gpu/runtime/sleep_kernel.cu.cc
new file mode 100644
index 00000000000000..44b528deb5b352
--- /dev/null
+++ b/third_party/xla/xla/service/gpu/runtime/sleep_kernel.cu.cc
@@ -0,0 +1,36 @@
+/* Copyright 2023 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "xla/service/gpu/runtime/sleep_kernel.h"
+
+namespace xla::gpu {
+namespace {
+
+__global__ void mock_nccl_call(unsigned sleep_ns) {
+#if __CUDA_ARCH__ >= 700  // __nanosleep requires compute capability 7.0
+  // Passing too high a number to __nanosleep makes it sleep for much less time
+  // than the passed-in number. So only pass 1,000,000 and keep calling
+  // __nanosleep in a loop.
+  int n = sleep_ns / 1000000;
+  unsigned rem = sleep_ns % 1000000;
+  for (int i = 0; i < n; i++) __nanosleep(1000000U);
+  __nanosleep(rem);
+#endif
+}
+
+}  // namespace
+
+void* GetSleepKernel() { return reinterpret_cast<void*>(&mock_nccl_call); }
+
+}  // namespace xla::gpu
diff --git a/third_party/xla/xla/service/gpu/runtime/sleep_kernel.h b/third_party/xla/xla/service/gpu/runtime/sleep_kernel.h
new file mode 100644
index 00000000000000..0bcaed7c56cfb3
--- /dev/null
+++ b/third_party/xla/xla/service/gpu/runtime/sleep_kernel.h
@@ -0,0 +1,21 @@
+/* Copyright 2023 The TensorFlow Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef XLA_SERVICE_GPU_RUNTIME_SLEEP_KERNEL_H_
+#define XLA_SERVICE_GPU_RUNTIME_SLEEP_KERNEL_H_
+
+namespace xla::gpu {
+
+void* GetSleepKernel();
+
+}  // namespace xla::gpu
+
+#endif  // XLA_SERVICE_GPU_RUNTIME_SLEEP_KERNEL_H_
diff --git a/third_party/xla/xla/tools/multihost_hlo_runner/BUILD b/third_party/xla/xla/tools/multihost_hlo_runner/BUILD
index 01bff3c03b4f2f..61f9a7c23aeb33 100644
--- a/third_party/xla/xla/tools/multihost_hlo_runner/BUILD
+++ b/third_party/xla/xla/tools/multihost_hlo_runner/BUILD
@@ -1,10 +1,10 @@
-load("@local_tsl//tsl/platform:rules_cc.bzl", "cc_library")
 load("//xla/tests:build_defs.bzl", "xla_test")
-load("@local_tsl//tsl:tsl.bzl", "if_cuda_or_rocm")
+load("@bazel_skylib//rules:build_test.bzl", "build_test")
 load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda")
-load("@local_config_rocm//rocm:build_defs.bzl", "if_rocm")
 load("//xla:xla.bzl", "xla_cc_binary")
-load("@bazel_skylib//rules:build_test.bzl", "build_test")
+load("@local_config_rocm//rocm:build_defs.bzl", "if_rocm")
+load("@local_tsl//tsl:tsl.bzl", "if_cuda_or_rocm")
+load("@local_tsl//tsl/platform:rules_cc.bzl", "cc_library")
 
 package(
     default_visibility = ["//visibility:public"],
@@ -35,6 +35,9 @@ xla_cc_binary(
         ":hlo_runner_flags",
         "//xla:debug_options_flags",
         "//xla:status",
+        "//xla:statusor",
+        "//xla/pjrt:pjrt_client",
+        "@com_google_absl//absl/log:check",
         "@com_google_absl//absl/strings",
         "@local_tsl//tsl/platform:logging",
         "@local_tsl//tsl/platform:platform_port",
diff --git a/third_party/xla/xla/tools/multihost_hlo_runner/functional_hlo_runner.cc b/third_party/xla/xla/tools/multihost_hlo_runner/functional_hlo_runner.cc
index 2a4735296343e0..02873b1d4daee1 100644
--- a/third_party/xla/xla/tools/multihost_hlo_runner/functional_hlo_runner.cc
+++ b/third_party/xla/xla/tools/multihost_hlo_runner/functional_hlo_runner.cc
@@ -241,6 +241,16 @@ StatusOr<std::unique_ptr<PjRtClient>> FunctionalHloRunner::CreateGpuClient() {
       /*asynchronous=*/true, GpuAllocatorConfig(), /*node_id=*/0);
 }
 
+StatusOr<std::unique_ptr<PjRtClient>> FunctionalHloRunner::CreateMockGpuClient(
+    int num_nodes) {
+  return GetStreamExecutorGpuClient(
+      /*asynchronous=*/true, GpuAllocatorConfig(), /*node_id=*/0,
+      /*num_nodes=*/num_nodes, /*allowed_devices=*/std::nullopt,
+      /*platform_name=*/std::nullopt,
+      /*should_stage_host_to_device_transfers=*/true,
+      /*kv_get=*/nullptr, /*kv_put=*/nullptr, /*enable_mock_nccl=*/true);
+}
+
 StatusOr<ExecutionOptions> FunctionalHloRunner::LoadExecutionOptions(
     absl::string_view path) {
   ExecutionOptions execution_options;
diff --git a/third_party/xla/xla/tools/multihost_hlo_runner/functional_hlo_runner.h b/third_party/xla/xla/tools/multihost_hlo_runner/functional_hlo_runner.h
index 37ab653e39dc68..4acb66091bbbd4 100644
--- a/third_party/xla/xla/tools/multihost_hlo_runner/functional_hlo_runner.h
+++ b/third_party/xla/xla/tools/multihost_hlo_runner/functional_hlo_runner.h
@@ -207,6 +207,10 @@ class FunctionalHloRunner {
   // Create a PjRtClient which can run HLOs on GPU.
   static StatusOr<std::unique_ptr<PjRtClient>> CreateGpuClient();
 
+  // Create a PjRtClient which mocks a multi-host GPU run.
+  static StatusOr<std::unique_ptr<PjRtClient>> CreateMockGpuClient(
+      int num_nodes = 1);
+
   // Loads an ExecutionOptions proto (which can be used in RawCompileOptions).
   static StatusOr<ExecutionOptions> LoadExecutionOptions(
       absl::string_view path);
diff --git a/third_party/xla/xla/tools/multihost_hlo_runner/hlo_runner_main.cc b/third_party/xla/xla/tools/multihost_hlo_runner/hlo_runner_main.cc
index a3f188d3bc7240..18515ad05918f3 100644
--- a/third_party/xla/xla/tools/multihost_hlo_runner/hlo_runner_main.cc
+++ b/third_party/xla/xla/tools/multihost_hlo_runner/hlo_runner_main.cc
@@ -20,9 +20,12 @@ limitations under the License.
 #include <string_view>
 #include <vector>
 
+#include "absl/log/check.h"
 #include "absl/strings/str_cat.h"
 #include "xla/debug_options_flags.h"
+#include "xla/pjrt/pjrt_client.h"
 #include "xla/status.h"
+#include "xla/statusor.h"
 #include "xla/tools/multihost_hlo_runner/functional_hlo_runner.h"
 #include "xla/tools/multihost_hlo_runner/hlo_runner_flags.h"
 #include "tsl/platform/init_main.h"
@@ -53,6 +56,15 @@ Single-GPU HLO:
     --num_partitions=2 \
     --hlo_file=path/to/hlo_module
 
+2 hosts-Mock GPU sharded HLO:
+  bazel run hlo_runner_main -- \
+    --use_spmd_partitioning=true \
+    --num_replicas=1 \
+    --num_partitions=2 \
+    --num_nodes=2 \
+    --enable_mock_nccl=true \
+    --hlo_file=path/to/hlo_module
+
 Tip: If the input generation takes too long or uses too much host memory,
 consider using --hlo_argument_mode=uninitialized.
 )";
@@ -64,8 +76,10 @@ int main(int argc, char** argv) {
   xla::InputFormat input_format;
   std::string hlo_file = "";
   bool should_run = true;
+  bool enable_mock_nccl = false;
   std::string dump_output_literal_to = "";
   int task_id = 0;
+  int num_nodes = 1;
   std::string device_type_str = "gpu";
   xla::FunctionalHloRunner::PreprocessingOptions preproc_options;
   xla::FunctionalHloRunner::RawCompileOptions raw_compile_options;
@@ -83,6 +97,10 @@ int main(int argc, char** argv) {
                 "Example: /a/b/literal.txt."),
       tsl::Flag("task_id", &task_id, "Borg task id."),
       tsl::Flag("device_type", &device_type_str, "Device type: gpu"),
+      tsl::Flag("num_nodes", &num_nodes, "Number of nodes (hosts)"),
+      tsl::Flag(
+          "enable_mock_nccl", &enable_mock_nccl,
+          "Should we simulate multi-hosts run with mock nccl collectives?"),
   };
 
   xla::MultiHostHloRunnerFlags hlo_runner_flags;
@@ -113,8 +131,14 @@ int main(int argc, char** argv) {
   }
 
   // The main logic:
-  xla::StatusOr<std::unique_ptr<xla::PjRtClient>> client =
-      xla::FunctionalHloRunner::CreateGpuClient();
+  xla::StatusOr<std::unique_ptr<xla::PjRtClient>> client;
+  if (enable_mock_nccl) {
+    CHECK_GT(num_nodes, 1);
+    client = xla::FunctionalHloRunner::CreateMockGpuClient(num_nodes);
+  } else {
+    CHECK_EQ(num_nodes, 1);
+    client = xla::FunctionalHloRunner::CreateGpuClient();
+  }
   TF_QCHECK_OK(client.status());
 
   if (should_run) {

From 4b4d32acc916311ca7300cfdff4cadf5055bbc2b Mon Sep 17 00:00:00 2001
From: Zixuan Jiang <zixuanjiang@google.com>
Date: Wed, 27 Sep 2023 11:00:46 -0700
Subject: [PATCH 329/567] [XLA] Add option to pattern match true only scalars
 in offset computation. - This option will force reduce-scatter pattern
 matching to only recognize   true scalar computation for DUS offset
 computation.

PiperOrigin-RevId: 568901880
---
 .../xla/xla/service/reduce_scatter_utils.cc   | 29 +++++++------------
 .../xla/xla/service/reduce_scatter_utils.h    |  3 +-
 2 files changed, 12 insertions(+), 20 deletions(-)

diff --git a/third_party/xla/xla/service/reduce_scatter_utils.cc b/third_party/xla/xla/service/reduce_scatter_utils.cc
index 7248d702470d8c..87938e868bb395 100644
--- a/third_party/xla/xla/service/reduce_scatter_utils.cc
+++ b/third_party/xla/xla/service/reduce_scatter_utils.cc
@@ -141,8 +141,7 @@ bool IsPerIdOffsets(absl::Span<const HloInstruction*> offsets,
 // Returns if `offset` == shard_size * id.
 bool IsPerIdOffset(const HloInstruction* offset, int64_t shard_size,
                    const MapIdToTableOffset& map_id, int64_t group_size,
-                   const HloAllReduceInstruction* ar,
-                   bool true_scalar_for_offset_computation) {
+                   const HloAllReduceInstruction* ar) {
   const bool iota_group =
       ar->replica_groups().empty() ||
       (ar->IsCrossModuleAllReduce() && !ar->use_global_device_ids());
@@ -153,10 +152,6 @@ bool IsPerIdOffset(const HloInstruction* offset, int64_t shard_size,
       VLOG(2) << "Offset is not a scalar " << offset->ToString();
       return false;
     }
-    if (true_scalar_for_offset_computation && offset->shape().rank() != 0) {
-      VLOG(2) << "Offset is not a true scalar " << offset->ToString();
-      return false;
-    }
     int64_t const_operand = -1;
     if (offset->operand(0)->IsConstant()) {
       const_operand = 0;
@@ -173,8 +168,7 @@ bool IsPerIdOffset(const HloInstruction* offset, int64_t shard_size,
       return false;
     }
     return IsPerIdOffset(offset->operand(1 - const_operand),
-                         shard_size / *multiplier, map_id, group_size, ar,
-                         true_scalar_for_offset_computation);
+                         shard_size / *multiplier, map_id, group_size, ar);
   }
   if (shard_size == 1 && iota_group) {
     bool id_mapping_is_identity = true;
@@ -192,16 +186,16 @@ bool IsPerIdOffset(const HloInstruction* offset, int64_t shard_size,
   if (offset->opcode() == HloOpcode::kBitcast ||
       offset->opcode() == HloOpcode::kReshape ||
       offset->opcode() == HloOpcode::kCopy) {
-    return IsPerIdOffset(offset->operand(0), shard_size, map_id, group_size, ar,
-                         true_scalar_for_offset_computation);
+    return IsPerIdOffset(offset->operand(0), shard_size, map_id, group_size,
+                         ar);
   }
 
   if (offset->opcode() == HloOpcode::kConvert &&
       offset->operand(0)->shape().IsInteger() &&
       primitive_util::BitWidth(offset->operand(0)->shape().element_type()) <=
           primitive_util::BitWidth(offset->shape().element_type())) {
-    return IsPerIdOffset(offset->operand(0), shard_size, map_id, group_size, ar,
-                         true_scalar_for_offset_computation);
+    return IsPerIdOffset(offset->operand(0), shard_size, map_id, group_size,
+                         ar);
   }
 
   if (offset->opcode() == HloOpcode::kClamp) {
@@ -213,8 +207,8 @@ bool IsPerIdOffset(const HloInstruction* offset, int64_t shard_size,
               << offset->ToString();
       return false;
     }
-    return IsPerIdOffset(offset->operand(1), shard_size, map_id, group_size, ar,
-                         true_scalar_for_offset_computation);
+    return IsPerIdOffset(offset->operand(1), shard_size, map_id, group_size,
+                         ar);
   }
 
   const int64_t num_groups = iota_group ? 1 : ar->replica_groups().size();
@@ -272,8 +266,7 @@ std::optional<ReduceScatterSpec> MatchReduceScatter(
     const HloAllReduceInstruction* ar, int64_t num_partitions,
     int64_t num_replicas, bool allow_multiple_split_dims,
     bool allow_intervening_reshape, int64_t min_rank,
-    HloPredicate match_partition_id, HloPredicate match_replica_id,
-    bool true_scalar_for_offset_computation) {
+    HloPredicate match_partition_id, HloPredicate match_replica_id) {
   if (!ar->shape().IsArray() || ar->constrain_layout() ||
       (ar->IsCrossModuleAllReduce() &&
        !ar->GetModule()->config().use_spmd_partitioning())) {
@@ -477,8 +470,8 @@ std::optional<ReduceScatterSpec> MatchReduceScatter(
   } else {
     if (!IsPerIdOffset(user->operand(spec.split_dim + 1),
                        user->dynamic_slice_sizes()[spec.split_dim], map_id,
-                       group_size, ar, true_scalar_for_offset_computation)) {
-      VLOG(2) << "IsPerIdOffset() failed " << ar->ToString();
+                       group_size, ar)) {
+      VLOG(2) << "IsPerIdOffsets() failed " << ar->ToString();
       return std::nullopt;
     }
   }
diff --git a/third_party/xla/xla/service/reduce_scatter_utils.h b/third_party/xla/xla/service/reduce_scatter_utils.h
index c45f4e369aff6f..2ff610950e9f0b 100644
--- a/third_party/xla/xla/service/reduce_scatter_utils.h
+++ b/third_party/xla/xla/service/reduce_scatter_utils.h
@@ -38,8 +38,7 @@ std::optional<ReduceScatterSpec> MatchReduceScatter(
     int64_t num_replicas, bool allow_multiple_split_dims = false,
     bool allow_intervening_reshape = false, int64_t min_rank = 1,
     HloPredicate match_partition_id = HloPredicateIsOp<HloOpcode::kPartitionId>,
-    HloPredicate match_replica_id = HloPredicateIsOp<HloOpcode::kReplicaId>,
-    bool true_scalar_for_offset_computation = false);
+    HloPredicate match_replica_id = HloPredicateIsOp<HloOpcode::kReplicaId>);
 
 }  // namespace xla
 

From 900e7e3814e44ac3afcd5d96df1ee12ee5f5f968 Mon Sep 17 00:00:00 2001
From: Eugene Zhulenev <ezhulenev@google.com>
Date: Wed, 27 Sep 2023 11:17:00 -0700
Subject: [PATCH 330/567] [stream_executor] Replace stream_executor_headers
 with stream_executor in Tensorflow pip package

This is NFC because stream_executor transitively depends on exactly the same set of headers.

PiperOrigin-RevId: 568907149
---
 tensorflow/tools/pip_package/BUILD | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/tools/pip_package/BUILD b/tensorflow/tools/pip_package/BUILD
index 4d7b9194ab80e6..d5b33e25a5a55a 100644
--- a/tensorflow/tools/pip_package/BUILD
+++ b/tensorflow/tools/pip_package/BUILD
@@ -67,7 +67,7 @@ transitive_hdrs(
         "//tensorflow/python/framework:python_op_gen",
         "//tensorflow/python/client:tf_session_helper",
         "//third_party/eigen3",
-        "@local_xla//xla/stream_executor:stream_executor_headers",
+        "@local_xla//xla/stream_executor",
     ] + if_cuda([
         "@local_config_cuda//cuda:cuda_headers",
     ]),

From a481ebbf2594d8cd5b103842e06b4f62cd2ae44b Mon Sep 17 00:00:00 2001
From: "Ryan M. Lefever" <lefever@google.com>
Date: Wed, 27 Sep 2023 11:45:50 -0700
Subject: [PATCH 331/567] Change 6/6 for making MSA repacking slice aware.

Change MSA's best fit repacker to be slicing aware.

Note, the original repacker's GetTemporalBufferIntervalCompare() method was never called. Instead, GlobalDecreasingSizeBestFitHeap::GetTemporalBufferIntervalCompare() was called. Thus:
- I deleted the repacker's original GetTemporalBufferIntervalCompare().
- For non-slices, we could have continued to use GlobalDecreasingSizeBestFitHeap::GetTemporalBufferIntervalCompare(). However, that method would not have used the full duration of slices because the repacker only stores part of a sliced allocation block in buffer_intervals_. (See the comments in memory_space_assignment_best_fit_repacker.cc for details.) Thus, to maintain parity, we created DefaultBufferIntervalCompare(). DefaultBufferIntervalCompare() does what GlobalDecreasingSizeBestFitHeap::GetTemporalBufferIntervalCompare() does, except it gets the correct durations for sliced allocation blocks.

PiperOrigin-RevId: 568915967
---
 .../xla/service/memory_space_assignment/BUILD |  12 +
 ...mory_space_assignment_best_fit_repacker.cc | 574 +++++++++++++++++-
 ...emory_space_assignment_best_fit_repacker.h |  32 +-
 ...space_assignment_best_fit_repacker_test.cc | 181 +++++-
 .../memory_space_assignment_repacking.h       |  24 +-
 5 files changed, 783 insertions(+), 40 deletions(-)

diff --git a/third_party/xla/xla/service/memory_space_assignment/BUILD b/third_party/xla/xla/service/memory_space_assignment/BUILD
index 48ff95b0a13f4e..cd54c1fb8df285 100644
--- a/third_party/xla/xla/service/memory_space_assignment/BUILD
+++ b/third_party/xla/xla/service/memory_space_assignment/BUILD
@@ -110,6 +110,7 @@ cc_library(
     deps = [
         "//xla:statusor",
         "//xla:types",
+        "@com_google_absl//absl/algorithm:container",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/types:span",
     ],
@@ -122,7 +123,15 @@ cc_library(
     visibility = ["//visibility:public"],
     deps = [
         ":memory_space_assignment_repacking",
+        "//xla:comparison_util",
+        "//xla:statusor",
         "//xla/service:heap_simulator",
+        "@com_google_absl//absl/algorithm:container",
+        "@com_google_absl//absl/container:flat_hash_map",
+        "@com_google_absl//absl/container:flat_hash_set",
+        "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/types:span",
+        "@local_tsl//tsl/platform:logging",
     ],
 )
 
@@ -154,7 +163,10 @@ xla_cc_test(
     srcs = ["memory_space_assignment_best_fit_repacker_test.cc"],
     deps = [
         ":memory_space_assignment_best_fit_repacker",
+        "//xla:comparison_util",
         "//xla/tests:xla_internal_test_main",
+        "@com_google_absl//absl/container:flat_hash_map",
+        "@com_google_absl//absl/types:span",
         "@local_tsl//tsl/platform:test",
     ],
 )
diff --git a/third_party/xla/xla/service/memory_space_assignment/memory_space_assignment_best_fit_repacker.cc b/third_party/xla/xla/service/memory_space_assignment/memory_space_assignment_best_fit_repacker.cc
index 6eaf744f5a9e84..35d2428c7c7572 100644
--- a/third_party/xla/xla/service/memory_space_assignment/memory_space_assignment_best_fit_repacker.cc
+++ b/third_party/xla/xla/service/memory_space_assignment/memory_space_assignment_best_fit_repacker.cc
@@ -13,85 +13,597 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+/*
+BestFitRepacker Algorithm
+
+Preliminary terminology/ideas
+  * Consider a sliced AllocationBlock B with S slices.
+    - Let s_{B,i} be the size of slice at the ith smallest offset.
+    - Let s_B be the sum(s_{B,i}, for all i).
+    - t_{B,i} be the ith earliest start time for a slice in B. (Note, an
+      AllocationBlock has not determined which slice offset maps to which slice
+      start time.)
+    - Let e_B be the allocation end time for all slices.
+
+    We defined the following:
+    - B's full buffer interval
+      * Defined by the buffer interval spanning time [t_{B,0}, e_B], with size
+        sum(s_{B, i}, for all i).
+      * In the illustration below, B's full buffer interval occupies W+X+Y+Z
+      * In more colloquial language, the smallest rectangle that can bound
+        a sliced allocation.
+    - B's minimum buffer interval
+      * Defined by the buffer interval spanning the time [t_{B,S-1}, e_B],
+        with size s_B.
+      * In the illustration below, B's minimum buffer interval occupies W.
+      * In more colloquial language, the sliced allocation once all slices are
+        allocated.
+    - Potential placement of B
+      * A potential placement of B occupies W+X. Note other placements are
+        possible, e.g., starting the slice at offset o at time t_{B,0} or
+        t_{B,2}.
+
+    Illustration of B with S=3:
+                      space
+                        ^
+                  o+s_B |  +-------------------------+------------+
+                        |  |            Y            |            |
+      o+s_{B,0}+s_{B,1} |  +-------------------------+            |
+                        |  |                         |      W     |
+              o+s_{B,0} |  +------------+     X      |            |
+                        |  |      Z     |            |            |
+                      o |  +------------+------------+------------+
+                        |
+                        +--|------------|------------|------------|-> time
+                         t_{B,0}      t_{B,1}      t_{B,2}       e_B
+
+  * For a non-sliced AllocationBlock the full buffer interval == the minimum
+    buffer interval == potential placement (given starting offset o).
+
+Step 1: Data structure construction (ImportAllocationBlocks())
+  * allocation_blocks_: Contains the input AllocationBlocks to repack. If we
+        find a valid repacking, we update these AllocationBlocks with repacking
+        placement data.
+  * buffer_intervals_: Data structure owned by GlobalDecreasingSizeBestFitHeap.
+        We use it to map each AllocationBlock to its minimum buffer
+        interval.
+  * full_buffer_interval_map_: Maps each AllocationBlock to its full buffer
+        interval.
+  * sliced_buffer_interval_map_: Maps each AllocationBlock to a
+        SlicedBufferInterval that wraps the corresponding full buffer interval
+        stored in full_buffer_interval_map_.
+
+Step 2: Sort buffers (GetSortedBufferIntervals())
+  * This step prioritizes the order in which we will try to place each buffer
+    in the repacking.
+  * GetSortedBufferIntervals() sorts the buffer intervals in buffer_intervals_.
+    The DefaultBufferIntervalCompare() maps each buffer in buffer_intervals_ to
+    its full buffer interval, and uses the full buffer interval's
+    properties/dimensions for sorting.
+
+Step 3: Find and commit buffer chunks (FindAndCommitChunks())
+  * We iterate through AllocationBlocks (in the sorted order from Step 2),
+    finding a location for them.
+  * When we try to find a placement for an AllocationBlock B, we also find
+    locations for its colocations. Colocations are done in tandem with B because
+    B cannot be committed to a location if the same offset does not work for its
+    colocations.
+  * Colocations are handled as follows:
+    - If a colocation is not sliced, MakeFreeChunks() will make sure we do not
+      consider any placement for B that does not also accommodate the colocation.
+    - If a colocation is sliced, MakeFreeChunks() will be inadequate for
+      enforcing a placement for the colocation. The problem is that during
+      placement, we have not yet determined the mapping between offsets and
+      slice times for a sliced colocation. So, we have 2 options.
+      Option 1) Overestimate the locations of free space available for B,
+                knowing that some may not work when we try to place sliced
+                colocations.
+      Option 2) Underestimate the locations of free space available for B,
+                knowing this will cause us not to choose some valid placement
+                options for B.
+      We have chosen option 1. To compensate for the free space overestimation,
+      every time we find a place for B, we must explicitly check that each of
+      its sliced colocations will also fit.
+
+Step 4: Check if the repacking fits in the heap size
+  * If the repacking does not fit in the heap size, the repacking fails.
+
+Step 5: Update AllocationBlocks with the repacking placements
+  * We update the offset and repacked_slicing fields of an AllocationBlock.
+    Callers extract that data to get the repacking locations.
+*/
+
 #include "xla/service/memory_space_assignment/memory_space_assignment_best_fit_repacker.h"
 
 #include <algorithm>
+#include <cstdint>
 #include <functional>
+#include <optional>
+#include <string>
 #include <tuple>
+#include <utility>
+#include <vector>
 
+#include "absl/algorithm/container.h"
+#include "absl/container/flat_hash_map.h"
+#include "absl/container/flat_hash_set.h"
+#include "absl/strings/str_cat.h"
+#include "absl/types/span.h"
+#include "xla/comparison_util.h"
 #include "xla/service/heap_simulator.h"
+#include "xla/statusor.h"
+#include "tsl/platform/logging.h"
 
 namespace xla {
-
 namespace {
 
 using AllocationBlock = MemorySpaceAssignmentRepacker::AllocationBlock;
 using Type = GlobalDecreasingSizeBestFitHeap<AllocationBlock>::Type;
+using SlicedAllocationData =
+    MemorySpaceAssignmentRepacker::SlicedAllocationData;
+using Slice = MemorySpaceAssignmentRepacker::Slice;
+
+bool IsSliced(const AllocationBlock* block) {
+  return block->original_slice_data.has_value();
+}
 
-// This class inherits GlobalDecreasingSizeBestFitHeap and converts
-// AllocationBlock objects into BufferIntervals that the heap algorithm
-// understands.
+template <typename T>
+std::vector<const AllocationBlock*> SortAllocationBlocks(const T& container) {
+  std::vector<const AllocationBlock*> result;
+  result.insert(result.end(), container.begin(), container.end());
+  absl::c_sort(result,
+               [](const AllocationBlock* lhs, const AllocationBlock* rhs) {
+                 return std::make_tuple(lhs->start_time, lhs->end_time,
+                                        lhs->initial_offset, lhs->size) <
+                        std::make_tuple(rhs->start_time, rhs->end_time,
+                                        rhs->initial_offset, rhs->size);
+               });
+
+  return result;
+}
+
+// A slice-aware best-fit repacker.
 class BestFitRepacker
     : public GlobalDecreasingSizeBestFitHeap<AllocationBlock> {
  public:
-  BestFitRepacker(int64_t max_size, int64_t alignment, Type type)
-      : GlobalDecreasingSizeBestFitHeap<AllocationBlock>(alignment, type),
+  BestFitRepacker(
+      const MemorySpaceAssignmentBestFitRepacker::BestFitRepackOptions& options,
+      int64_t max_size, int64_t alignment)
+      : GlobalDecreasingSizeBestFitHeap<AllocationBlock>(
+            alignment, kCustom,
+            (options.buffer_interval_compare ? options.buffer_interval_compare
+                                             : DefaultBufferIntervalCompare())),
+        validate_(options.validate),
         max_size_(max_size) {}
 
+  // Initialize our basic data structures: allocation_blocks_,
+  // buffer_intervals_, full_buffer_interval_map_, and
+  // sliced_buffer_interval_map_.
   void ImportAllocationBlocks(absl::Span<AllocationBlock*> allocations) {
     allocation_blocks_ = allocations;
-    for (AllocationBlock* allocation_block : allocations) {
+    // We loop through allocation_blocks_ once to build
+    // full_buffer_interval_map_, with colocations fully specified.
+    for (AllocationBlock* allocation_block : allocation_blocks_) {
       // Check if any of the colocations are already added to buffer_intervals_.
       bool need_allocation = true;
       auto aliased_it = absl::c_find_if(
           allocation_block->colocations, [&](AllocationBlock* search) {
-            return buffer_intervals_.contains(search);
+            return full_buffer_interval_map_.contains(search);
           });
       if (aliased_it != allocation_block->colocations.end()) {
-        buffer_intervals_[*aliased_it].colocations.push_back(allocation_block);
+        full_buffer_interval_map_[*aliased_it].colocations.push_back(
+            allocation_block);
         need_allocation = false;
       }
-      buffer_intervals_[allocation_block] = {allocation_block,
-                                             allocation_block->size,
-                                             allocation_block->start_time,
-                                             allocation_block->end_time,
-                                             {},
-                                             need_allocation};
+      full_buffer_interval_map_.insert(std::make_pair(
+          allocation_block, BufferInterval{allocation_block,
+                                           allocation_block->size,
+                                           allocation_block->start_time,
+                                           allocation_block->end_time,
+                                           {},
+                                           need_allocation}));
+    }
+
+    // Now that full_buffer_interval_map_ has full colocation specifications,
+    // we loop through allocation_blocks_ again to build
+    // sliced_buffer_interval_map_ and buffer_intervals_. Also note, at this
+    // point we will no longer add or remove items from
+    // full_buffer_interval_map_. This is important because
+    // sliced_buffer_interval_map_ will retain pointers to BufferIntervals in
+    // full_buffer_interval_map_.
+    for (AllocationBlock* allocation_block : allocation_blocks_) {
+      BufferInterval& full_buffer_interval =
+          full_buffer_interval_map_[allocation_block];
+      SlicedBufferInterval& sliced_buffer_interval =
+          sliced_buffer_interval_map_
+              .insert(std::make_pair(
+                  allocation_block, SlicedBufferInterval::CreateMutableInterval(
+                                        full_buffer_interval)))
+              .first->second;
+      if (IsSliced(allocation_block)) {
+        const SlicedAllocationData& original_slice_data =
+            allocation_block->original_slice_data.value();
+        CHECK(!original_slice_data.slices_sorted_by_offset.empty());
+
+        sliced_buffer_interval.Slice(original_slice_data.SizesSortedByOffset());
+        sliced_buffer_interval.UpdateSliceStartTimes(
+            original_slice_data.SortedStartTimes());
+      }
+
+      // We use buffer_intervals_ to store the minimum buffer interval for
+      // allocation_block. See the algorithm description (at the head of this
+      // file) for more details.
+      buffer_intervals_[allocation_block] =
+          sliced_buffer_interval.IntervalForMakeFreeChunks(
+              sliced_buffer_interval.num_slices() - 1);
     }
+
+    CHECK_EQ(allocation_blocks_.size(), buffer_intervals_.size());
+    CHECK_EQ(allocation_blocks_.size(), full_buffer_interval_map_.size());
+    CHECK_EQ(allocation_blocks_.size(), sliced_buffer_interval_map_.size());
+
+    VLOG(1) << [&]() -> std::string {
+      int sliced_blocks = 0;
+      int colocation_sets = 0;
+      int colocation_sets_with_multiple_sliced_blocks = 0;
+      absl::flat_hash_set<const AllocationBlock*> seen_blocks;
+
+      for (const auto& allocation_and_buffer_interval : buffer_intervals_) {
+        const AllocationBlock* block = allocation_and_buffer_interval.first;
+        const BufferInterval& min_buffer_interval =
+            allocation_and_buffer_interval.second;
+
+        if (IsSliced(block)) {
+          ++sliced_blocks;
+        }
+        if (seen_blocks.contains(block)) {
+          continue;
+        }
+        seen_blocks.insert(block);
+        // We process a block and all its colocations at once below. So every
+        // time a block is unseen, it indicates a new colocation set.
+        ++colocation_sets;
+
+        int num_sliced_colocations = (IsSliced(block) ? 1 : 0);
+        // GetTransitiveColocations must be called on BufferIntervals from
+        // buffer_intervals_.
+        for (const AllocationBlock* colocation :
+             GetTransitiveColocations(min_buffer_interval)) {
+          seen_blocks.insert(colocation);
+          if (IsSliced(colocation)) {
+            ++num_sliced_colocations;
+          }
+        }
+        if (num_sliced_colocations > 1) {
+          ++colocation_sets_with_multiple_sliced_blocks;
+        }
+      }
+
+      return absl::StrCat(
+          "Imported repacking stats: num_blocks=", allocation_blocks_.size(),
+          "; num_sliced_blocks=", sliced_blocks,
+          "; num_colocation_sets=", colocation_sets,
+          "; num_colocation_sets_with_multiple_sliced_blocks=",
+          colocation_sets_with_multiple_sliced_blocks);
+    }();
   }
 
-  // Sorting by initial offset gives better buffer order stability between
-  // related programs improving the success rate for cross-program prefetching.
-  BufferIntervalCompare GetTemporalBufferIntervalCompare() const override {
+  BufferIntervalCompare DefaultBufferIntervalCompare() const {
     return LessThanByKey([this](const BufferInterval& x) {
-      int64_t x_end = x.end;
+      const BufferInterval& full_buffer_interval =
+          full_buffer_interval_map_.at(x.buffer);
+      int64_t full_buffer_interval_end = full_buffer_interval.end;
+
+      // GetTransitiveColocations must be called on BufferIntervals from
+      // buffer_intervals_.
       for (auto colocation : GetTransitiveColocations(x)) {
-        x_end = std::max(x_end, buffer_intervals_.at(colocation).end);
+        full_buffer_interval_end =
+            std::max(full_buffer_interval_end,
+                     full_buffer_interval_map_.at(colocation).end);
       }
-      // Sort by duration (descending), size (descending), initial offset
-      // (ascending), buffer (ascending).
-      return std::make_tuple(x.start - x_end, -x.size, x.buffer->initial_offset,
-                             std::cref(*x.buffer));
+
+      // Sort by duration (descending), size (descending), buffer (ascending).
+      return std::make_tuple(
+          full_buffer_interval.start - full_buffer_interval_end,
+          -full_buffer_interval.size, std::cref(*full_buffer_interval.buffer));
     });
   }
 
+  // CommitChunks() does the following:
+  // 1) Commits chunks to interval_tree_.
+  // 2) Updates the entries in new_offsets_ and new_repacked_slicing_ for
+  //    allocation_block, with the information from chunks.
+  // 3) Updates result_.heap_size with the information from chunks.
+  //
+  // REQUIRED:
+  // - chunks is sorted in slice time order
+  void CommitChunks(const AllocationBlock* allocation_block,
+                    const std::vector<Chunk>& chunks) {
+    VLOG(2) << "Committing repack chunks for " << allocation_block->ToString();
+
+    int64_t new_offset = -1;
+    std::optional<SlicedAllocationData> repacked_slice_data = std::nullopt;
+
+    if (IsSliced(allocation_block)) {
+      const SlicedAllocationData& original_slice_data =
+          allocation_block->original_slice_data.value();
+
+      // We should have a chunk per slice time.
+      CHECK_EQ(chunks.size(),
+               original_slice_data.slices_sorted_by_offset.size());
+      repacked_slice_data = SlicedAllocationData();
+      repacked_slice_data->slices_sorted_by_offset.reserve(chunks.size());
+
+      // Chunks and start times are sorted in start time order.
+      std::vector<int64_t> sorted_start_times =
+          original_slice_data.SortedStartTimes();
+      for (int i = 0; i < chunks.size(); ++i) {
+        const Chunk& chunk = chunks[i];
+        int64_t start_time = sorted_start_times[i];
+        result_.heap_size = result_.UpdatedHeapSize(chunk);
+        VLOG(2) << "Adding sliced chunk " << chunk.ToString() << " at ["
+                << start_time << ", " << allocation_block->end_time << "]";
+        interval_tree_.Add(start_time, allocation_block->end_time, chunk);
+        new_offset = (new_offset == -1 ? chunk.offset
+                                       : std::min(new_offset, chunk.offset));
+        repacked_slice_data->slices_sorted_by_offset.push_back(
+            Slice({chunk.size, chunk.offset, start_time}));
+      }
+      absl::c_sort(repacked_slice_data->slices_sorted_by_offset,
+                   [](const Slice& lhs, const Slice& rhs) {
+                     return lhs.offset < rhs.offset;
+                   });
+    } else {
+      CHECK_EQ(chunks.size(), 1);
+      new_offset = chunks.front().offset;
+      result_.heap_size = result_.UpdatedHeapSize(chunks.front());
+      VLOG(2) << "Adding unsliced chunk " << chunks.front().ToString()
+              << " at [" << allocation_block->start_time << ", "
+              << allocation_block->end_time << ")";
+      interval_tree_.Add(allocation_block->start_time,
+                         allocation_block->end_time, chunks.front());
+    }
+
+    CHECK_NE(new_offset, -1);
+    CHECK(!new_offsets_.contains(allocation_block));
+    new_offsets_[allocation_block] = new_offset;
+    if (repacked_slice_data.has_value()) {
+      CHECK(IsSliced(allocation_block));
+      CHECK(!new_repacked_slicing_.contains(allocation_block));
+      new_repacked_slicing_[allocation_block] = *repacked_slice_data;
+    }
+  }
+
+  // A helper structure of information we keep in FindAndCommitChunks() for
+  // each sliced colocation.
+  struct SlicedColocationData {
+    SlicedBufferInterval* sliced_buffer_interval;
+    SlicedAllocationFinder sliced_allocation_finder;
+    std::vector<Chunk> chunks;
+  };
+
+  // Finds and commits chunks for the AllocationBlock associated with
+  // min_buffer_interval, and all of that block's colocations.
+  void FindAndCommitChunks(BufferInterval* min_buffer_interval) {
+    const AllocationBlock* allocation_block = min_buffer_interval->buffer;
+    SlicedBufferInterval& sliced_buffer_interval =
+        sliced_buffer_interval_map_.at(allocation_block);
+
+    int64_t max_colocation_size = GetMaxColocationSize(*min_buffer_interval);
+
+    // Additional data we track for sliced colocations. In particular, we create
+    // a SlicedAllocationFinder for each sliced colocation, so we can check if
+    // a sliced colocation can fit at a particular offset, to accommodate the
+    // overestimation of free space in MakeFreeChunks(), as described at the top
+    // of this file.
+    absl::flat_hash_map<const AllocationBlock*, SlicedColocationData>
+        sliced_buffer_map;
+    for (auto colocation :
+         SortAllocationBlocks(GetTransitiveColocations(*min_buffer_interval))) {
+      if (IsSliced(colocation)) {
+        SlicedBufferInterval& colocation_sliced_buffer_interval =
+            sliced_buffer_interval_map_.at(colocation);
+        SlicedAllocationFinder sliced_colocation_finder =
+            CreateSlicedAllocationFinder(colocation_sliced_buffer_interval,
+                                         max_colocation_size,
+                                         /*preferred_offset=*/-1);
+        sliced_buffer_map.insert(std::make_pair(
+            colocation,
+            SlicedColocationData{&colocation_sliced_buffer_interval,
+                                 std::move(sliced_colocation_finder),
+                                 /*chunks=*/{}}));
+      }
+    }
+
+    // Using the SlicedAllocationFinders for each sliced colocation, this
+    // function returns true if we can place all sliced colocations at a given
+    // offset.
+    auto is_offset_allowed = [this, &sliced_buffer_map](int64_t offset) {
+      for (auto& block_and_colocation_data : sliced_buffer_map) {
+        SlicedColocationData& sliced_colocation_data =
+            block_and_colocation_data.second;
+        auto colocation_chunks =
+            sliced_colocation_data.sliced_allocation_finder.FindForOffset(
+                offset);
+        colocation_chunks = PostProcessFindChunkCandidatesResult(
+            *sliced_colocation_data.sliced_buffer_interval,
+            std::move(colocation_chunks));
+        if (colocation_chunks.empty()) {
+          return false;
+        }
+        sliced_colocation_data.chunks = std::move(colocation_chunks);
+      }
+
+      return true;
+    };
+
+    // Find chunks for allocation_block and its colocations.
+    SlicedAllocationFinder finder = CreateSlicedAllocationFinder(
+        sliced_buffer_interval, max_colocation_size, /*preferred_offset=*/-1,
+        is_offset_allowed);
+    std::vector<Chunk> chunks = PostProcessFindChunkCandidatesResult(
+        sliced_buffer_interval, finder.Find());
+    int64_t min_offset =
+        absl::c_min_element(chunks, [](const Chunk& lhs, const Chunk& rhs) {
+          return lhs.offset < rhs.offset;
+        })->offset;
+
+    // Commit chunks for allocation_block.
+    CommitChunks(allocation_block, chunks);
+
+    // Commit chunks for colocations.
+    for (auto colocation : GetTransitiveColocations(*min_buffer_interval)) {
+      if (IsSliced(colocation)) {
+        CommitChunks(colocation, sliced_buffer_map.at(colocation).chunks);
+      } else {
+        const BufferInterval& colocation_full_buffer_interval =
+            full_buffer_interval_map_[colocation];
+        CommitChunks(colocation,
+                     {Chunk::FromOffsetSize(
+                         min_offset, colocation_full_buffer_interval.size)});
+      }
+    }
+  }
+
+  // We do not use result_.chunk_map, and we have our own method that combines
+  // finding chunks with committing them; thus, we expect this method to never
+  // be called.
+  void AddToChunkMap(const AllocationBlock* buffer, Chunk chunk) override {
+    LOG(FATAL) << "We should never get here.";
+  }
+
+  Result Finish() override {
+    std::vector<BufferInterval> sorted_buffer_intervals =
+        GetSortedBufferIntervals();
+
+    for (auto& buffer_interval : sorted_buffer_intervals) {
+      if (!buffer_interval.need_allocation) {
+        continue;
+      }
+
+      FindAndCommitChunks(&buffer_interval);
+    }
+
+    Result result;
+    result.heap_size = result_.heap_size;
+    result.heap_results.emplace_back(result_);
+    return result;
+  }
+
+  // A data structure for storing a chunk and its live time for use in
+  // debugging.
+  struct TimedChunk {
+    std::string id;
+    const AllocationBlock* block;
+    int64_t start_inclusive;
+    int64_t end_inclusive;
+    Chunk chunk;
+
+    bool Overlaps(const TimedChunk& timed_chunk) {
+      if (timed_chunk.start_inclusive > end_inclusive ||
+          timed_chunk.end_inclusive < start_inclusive) {
+        return false;
+      }
+      return chunk.OverlapsWith(timed_chunk.chunk);
+    }
+  };
+
+  void DebuggingValidate() {
+    std::vector<TimedChunk> timed_chunks;
+    for (const AllocationBlock* block : allocation_blocks_) {
+      if (IsSliced(block)) {
+        for (int i = 0;
+             i < block->repacked_slice_data->slices_sorted_by_offset.size();
+             ++i) {
+          const Slice& slice =
+              block->repacked_slice_data->slices_sorted_by_offset[i];
+          timed_chunks.push_back(
+              TimedChunk{absl::StrCat(((int64_t)block), "_slice_", i), block,
+                         slice.start_time, block->end_time,
+                         Chunk::FromOffsetSize(slice.offset, slice.size)});
+        }
+      } else {
+        timed_chunks.push_back(
+            TimedChunk{absl::StrCat(((int64_t)block)), block, block->start_time,
+                       block->end_time,
+                       Chunk::FromOffsetSize(block->offset, block->size)});
+      }
+    }
+
+    bool overlap_found = false;
+    for (int i = 0; i < timed_chunks.size(); ++i) {
+      for (int j = i + 1; j < timed_chunks.size(); ++j) {
+        if (timed_chunks[i].Overlaps(timed_chunks[j])) {
+          overlap_found = true;
+          LOG(ERROR) << "Allocation block overlap\n"
+                     << "     " << timed_chunks[i].block->ToString()
+                     << "\n     " << timed_chunks[j].block->ToString();
+        }
+      }
+    }
+
+    if (overlap_found) {
+      LOG(FATAL) << "Allocation overlap found";
+    }
+  }
+
   bool Repack() {
     Finish();
     bool success = result_.heap_size <= max_size_;
-    if (success) {
+    if (!success) {
+      LOG(INFO) << "Repacking unsuccessful with heap size "
+                << result_.heap_size;
+      return false;
+    }
+
+    // Update AllocationBlocks.
+    for (AllocationBlock* block : allocation_blocks_) {
+      CHECK(new_offsets_.contains(block));
+      block->offset = new_offsets_[block];
+      if (!IsSliced(block)) {
+        continue;
+      }
+
+      CHECK(new_repacked_slicing_.contains(block));
+      block->repacked_slice_data = std::move(new_repacked_slicing_[block]);
+    }
+
+    if (validate_) {
+      DebuggingValidate();
+    }
+
+    if (VLOG_IS_ON(1)) {
       for (AllocationBlock* block : allocation_blocks_) {
-        auto chunk_it = result_.chunk_map.find(block);
-        if (chunk_it != result_.chunk_map.end()) {
-          block->offset = chunk_it->second.offset;
-        }
+        VLOG(1) << "AllocationBlock after repacking: " << block->ToString();
       }
     }
-    return success;
+
+    LOG(INFO) << "Repacking successful with heap size " << result_.heap_size;
+
+    return true;
   }
 
  private:
+  // If true, we run a potentially expensive validation to make sure there are
+  // no overlaps in the repacked chunks. Note, there should never be an overlap.
+  bool validate_ = false;
+
+  // Maximum heap size.
   int64_t max_size_;
+
+  // Input AllocationBlocks to repack.
   absl::Span<AllocationBlock*> allocation_blocks_;
+
+  absl::flat_hash_map<const AllocationBlock*, BufferInterval>
+      full_buffer_interval_map_;
+  absl::flat_hash_map<const AllocationBlock*, SlicedBufferInterval>
+      sliced_buffer_interval_map_;
+
+  // Data structures updated with repacking placement information as we compute
+  // it.
+  absl::flat_hash_map<const AllocationBlock*, int64_t> new_offsets_;
+  absl::flat_hash_map<const AllocationBlock*, SlicedAllocationData>
+      new_repacked_slicing_;
 };
 
 }  // namespace
@@ -99,7 +611,7 @@ class BestFitRepacker
 StatusOr<bool> MemorySpaceAssignmentBestFitRepacker::Repack(
     absl::Span<AllocationBlock*> allocations) {
   BestFitRepacker best_fit_repacker =
-      BestFitRepacker(max_size_, alignment_, type_);
+      BestFitRepacker(options_, max_size_, alignment_);
   best_fit_repacker.ImportAllocationBlocks(allocations);
   return best_fit_repacker.Repack();
 }
diff --git a/third_party/xla/xla/service/memory_space_assignment/memory_space_assignment_best_fit_repacker.h b/third_party/xla/xla/service/memory_space_assignment/memory_space_assignment_best_fit_repacker.h
index 220c19350b5925..39be380eae599b 100644
--- a/third_party/xla/xla/service/memory_space_assignment/memory_space_assignment_best_fit_repacker.h
+++ b/third_party/xla/xla/service/memory_space_assignment/memory_space_assignment_best_fit_repacker.h
@@ -16,8 +16,12 @@ limitations under the License.
 #ifndef XLA_SERVICE_MEMORY_SPACE_ASSIGNMENT_MEMORY_SPACE_ASSIGNMENT_BEST_FIT_REPACKER_H_
 #define XLA_SERVICE_MEMORY_SPACE_ASSIGNMENT_MEMORY_SPACE_ASSIGNMENT_BEST_FIT_REPACKER_H_
 
+#include <cstdint>
+
+#include "absl/types/span.h"
 #include "xla/service/heap_simulator.h"
 #include "xla/service/memory_space_assignment/memory_space_assignment_repacking.h"
+#include "xla/statusor.h"
 
 namespace xla {
 
@@ -26,17 +30,31 @@ namespace xla {
 class MemorySpaceAssignmentBestFitRepacker
     : public MemorySpaceAssignmentRepacker {
  public:
-  using Type = GlobalDecreasingSizeBestFitHeap<AllocationBlock>::Type;
-
-  explicit MemorySpaceAssignmentBestFitRepacker(
-      int64_t max_size, int64_t alignment,
-      Type type = GlobalDecreasingSizeBestFitHeap<AllocationBlock>::kTemporal)
-      : MemorySpaceAssignmentRepacker(max_size, alignment), type_(type) {}
+  using BufferInterval =
+      GlobalDecreasingSizeBestFitHeap<AllocationBlock>::BufferInterval;
+  using BufferIntervalCompare =
+      GlobalDecreasingSizeBestFitHeap<AllocationBlock>::BufferIntervalCompare;
+
+  struct BestFitRepackOptions {
+    // Running the validator is potentially expensive.
+    bool validate = false;
+
+    // Specify the comparison function used for determining the order in which
+    // buffers will be allocated, during repacking.
+    BufferIntervalCompare buffer_interval_compare = nullptr;
+  };
+
+  MemorySpaceAssignmentBestFitRepacker(int64_t max_size, int64_t alignment)
+      : MemorySpaceAssignmentRepacker(max_size, alignment),
+        options_(BestFitRepackOptions()) {}
+  MemorySpaceAssignmentBestFitRepacker(int64_t max_size, int64_t alignment,
+                                       BestFitRepackOptions options)
+      : MemorySpaceAssignmentRepacker(max_size, alignment), options_(options) {}
 
   StatusOr<bool> Repack(absl::Span<AllocationBlock*> allocations) override;
 
  private:
-  Type type_;
+  BestFitRepackOptions options_;
 };
 
 }  // namespace xla
diff --git a/third_party/xla/xla/service/memory_space_assignment/memory_space_assignment_best_fit_repacker_test.cc b/third_party/xla/xla/service/memory_space_assignment/memory_space_assignment_best_fit_repacker_test.cc
index 52b87ffebb3c28..70a05e78abcaca 100644
--- a/third_party/xla/xla/service/memory_space_assignment/memory_space_assignment_best_fit_repacker_test.cc
+++ b/third_party/xla/xla/service/memory_space_assignment/memory_space_assignment_best_fit_repacker_test.cc
@@ -15,6 +15,11 @@ limitations under the License.
 
 #include "xla/service/memory_space_assignment/memory_space_assignment_best_fit_repacker.h"
 
+#include <cstdint>
+
+#include "absl/container/flat_hash_map.h"
+#include "absl/types/span.h"
+#include "xla/comparison_util.h"
 #include "tsl/platform/test.h"
 
 namespace xla {
@@ -22,8 +27,11 @@ namespace xla {
 class MemorySpaceAssignmentBestFitRepackerTest : public ::testing::Test {
  protected:
   using AllocationBlock = MemorySpaceAssignmentRepacker::AllocationBlock;
+  using SlicedAllocationData =
+      MemorySpaceAssignmentRepacker::SlicedAllocationData;
+  using Slice = MemorySpaceAssignmentRepacker::Slice;
 
-  MemorySpaceAssignmentBestFitRepackerTest() : repacker_(100, 1) {}
+  MemorySpaceAssignmentBestFitRepackerTest() : repacker_(100, 1, options_) {}
 
   AllocationBlock* MakeAllocationBlock(int64_t start_time, int64_t end_time,
                                        int64_t size,
@@ -42,6 +50,8 @@ class MemorySpaceAssignmentBestFitRepackerTest : public ::testing::Test {
   }
 
   std::list<AllocationBlock> allocation_blocks_;
+  MemorySpaceAssignmentBestFitRepacker::BestFitRepackOptions options_{
+      /*validate=*/true, /*buffer_interval_compare=*/nullptr};
   MemorySpaceAssignmentBestFitRepacker repacker_;
 };
 
@@ -105,4 +115,173 @@ TEST_F(MemorySpaceAssignmentBestFitRepackerTest, ColocationDifferentSizes) {
   EXPECT_EQ(allocation_blocks[3]->offset, 5);
 }
 
+TEST_F(MemorySpaceAssignmentBestFitRepackerTest, RepackedSlicesFit) {
+  // Expected repacking:
+  //
+  // space
+  //   ^
+  //  8 |
+  //  7 |                  +-----+
+  //  6 |                  |  E  |
+  //  5 |          +-------+-++  |
+  //  4 |          |    B    |+--++--++--+
+  //  3 |          |         ||  || F||  |
+  //  2 +----------+---++----++  |+--++  |
+  //  1 |      A       ||    C   ||   D  |
+  //  0 +--------------++--------++------+
+  //    +----|----|----|----|----|----|----|--> time
+  //    0    5    10   15   20   25   30   35
+
+  std::vector<AllocationBlock*> allocation_blocks;
+  // Block A
+  allocation_blocks.push_back(MakeAllocationBlock(0, 15, 2));
+  // Block B
+  allocation_blocks.push_back(MakeAllocationBlock(11, 21, 3));
+  // Block C
+  allocation_blocks.push_back(MakeAllocationBlock(16, 25, 4));
+  allocation_blocks.back()->original_slice_data =
+      SlicedAllocationData({{Slice{2, -1, 16}, Slice{2, -1, 22}}});
+  // Block D
+  allocation_blocks.push_back(MakeAllocationBlock(26, 33, 4));
+  allocation_blocks.back()->original_slice_data =
+      SlicedAllocationData({{Slice{2, -1, 26}, Slice{2, -1, 30}}});
+  // Block E
+  allocation_blocks.push_back(MakeAllocationBlock(19, 25, 3));
+  allocation_blocks.back()->original_slice_data =
+      SlicedAllocationData({{Slice{1, -1, 19}, Slice{2, -1, 22}}});
+  // Block F
+  allocation_blocks.push_back(MakeAllocationBlock(26, 29, 2));
+
+  // Specify the repacking sort order as the order in which blocks were added to
+  // allocation_blocks.
+  // - By placing C after B, we test that the repacker can place a sliced block
+  //   around another block
+  // - By placing F after D, we test that the repacker can fill in the extra
+  //   space left behind by slicing.
+  absl::flat_hash_map<AllocationBlock*, int> sort_keys;
+  for (int i = 0; i < allocation_blocks.size(); ++i) {
+    sort_keys[allocation_blocks[i]] = i;
+  }
+  options_.buffer_interval_compare = LessThanByKey(
+      [sort_keys](
+          const MemorySpaceAssignmentBestFitRepacker::BufferInterval& x) {
+        return sort_keys.at(x.buffer);
+      });
+  repacker_ = MemorySpaceAssignmentBestFitRepacker(100, 1, options_);
+
+  EXPECT_TRUE(*repacker_.Repack(absl::MakeSpan(allocation_blocks)));
+
+  // Check results
+  // Block A
+  EXPECT_EQ(allocation_blocks[0]->offset, 0);
+  EXPECT_FALSE(allocation_blocks[0]->repacked_slice_data.has_value());
+  // Block B
+  EXPECT_EQ(allocation_blocks[1]->offset, 2);
+  EXPECT_FALSE(allocation_blocks[1]->repacked_slice_data.has_value());
+  // Block C
+  EXPECT_EQ(allocation_blocks[2]->offset, 0);
+  ASSERT_TRUE(allocation_blocks[2]->repacked_slice_data.has_value());
+  EXPECT_EQ(*allocation_blocks[2]->repacked_slice_data,
+            (SlicedAllocationData({{Slice{2, 0, 16}, Slice{2, 2, 22}}})));
+  // Block D
+  EXPECT_EQ(allocation_blocks[3]->offset, 0);
+  ASSERT_TRUE(allocation_blocks[3]->repacked_slice_data.has_value());
+  EXPECT_EQ(*allocation_blocks[3]->repacked_slice_data,
+            (SlicedAllocationData({{Slice{2, 0, 26}, Slice{2, 2, 30}}})));
+  // Block E
+  EXPECT_EQ(allocation_blocks[4]->offset, 4);
+  ASSERT_TRUE(allocation_blocks[4]->repacked_slice_data.has_value());
+  EXPECT_EQ(*allocation_blocks[4]->repacked_slice_data,
+            (SlicedAllocationData({{Slice{1, 4, 22}, Slice{2, 5, 19}}})));
+  // Block F
+  EXPECT_EQ(allocation_blocks[5]->offset, 2);
+  EXPECT_FALSE(allocation_blocks[5]->repacked_slice_data.has_value());
+}
+
+TEST_F(MemorySpaceAssignmentBestFitRepackerTest, SlicedColocationsFit) {
+  // Expected repacking:
+  //
+  // space
+  //   ^
+  //  9 |              +-+
+  //  8 |              | |
+  //  7 |              |F|+-+
+  //  6 |    +-----++-+| || |
+  //  5 |    |  C  || || || |
+  //  4 +----+--++-++ |+-++ |
+  //  3 |    B  || E  || D  |
+  //  2 +-------++--+-++----+
+  //  1 |     A     |
+  //  0 +-----------+
+  //    +----|----|----|----|----|----|----|--> time
+  //    0    5    10   15   20   25   30   35
+
+  // D is colocated with E; thus, they will be allocated together.
+  std::vector<AllocationBlock*> allocation_blocks;
+  // Block A
+  allocation_blocks.push_back(MakeAllocationBlock(0, 12, 2));
+  // Block B
+  allocation_blocks.push_back(MakeAllocationBlock(0, 8, 2));
+  // Block C
+  allocation_blocks.push_back(MakeAllocationBlock(5, 11, 2));
+  // Block D
+  allocation_blocks.push_back(MakeAllocationBlock(15, 20, 5));
+  // Note, below we put the later start time first in the original slice data.
+  // This shouldn't make a difference to the repacker because it will map
+  // sizes to times as it sees fit.
+  allocation_blocks.back()->original_slice_data =
+      SlicedAllocationData({{Slice{2, -1, 18}, Slice{3, -1, 15}}});
+  // Block E
+  allocation_blocks.push_back(MakeAllocationBlock(9, 14, 4));
+  allocation_blocks.back()->original_slice_data =
+      SlicedAllocationData({{Slice{2, -1, 9}, Slice{2, -1, 12}}});
+  // Colocate E with D.
+  allocation_blocks.back()->colocations.push_back(allocation_blocks[3]);
+  // Block F
+  allocation_blocks.push_back(MakeAllocationBlock(15, 17, 5));
+
+  // Specify the repacking sort order as the order in which blocks were added to
+  // allocation_blocks.
+  // - By placing E after C, we test that the repacker can place a sliced block
+  //   around another block
+  // - By placing F after D, we test that the repacker can fill in the extra
+  //   space left behind by slicing.
+  absl::flat_hash_map<AllocationBlock*, int> sort_keys;
+  for (int i = 0; i < allocation_blocks.size(); ++i) {
+    sort_keys[allocation_blocks[i]] = i;
+  }
+  options_.buffer_interval_compare = LessThanByKey(
+      [sort_keys](
+          const MemorySpaceAssignmentBestFitRepacker::BufferInterval& x) {
+        return sort_keys.at(x.buffer);
+      });
+  repacker_ = MemorySpaceAssignmentBestFitRepacker(100, 1, options_);
+
+  EXPECT_TRUE(*repacker_.Repack(absl::MakeSpan(allocation_blocks)));
+
+  // Check results
+  // Block A
+  EXPECT_EQ(allocation_blocks[0]->offset, 0);
+  EXPECT_FALSE(allocation_blocks[0]->repacked_slice_data.has_value());
+  // Block B
+  EXPECT_EQ(allocation_blocks[1]->offset, 2);
+  EXPECT_FALSE(allocation_blocks[1]->repacked_slice_data.has_value());
+  // Block C
+  EXPECT_EQ(allocation_blocks[2]->offset, 4);
+  ASSERT_FALSE(allocation_blocks[2]->repacked_slice_data.has_value());
+  // Block D
+  EXPECT_EQ(allocation_blocks[3]->offset, 2);
+  ASSERT_TRUE(allocation_blocks[3]->repacked_slice_data.has_value());
+  EXPECT_EQ(*allocation_blocks[3]->repacked_slice_data,
+            (SlicedAllocationData({{Slice{2, 2, 15}, Slice{3, 4, 18}}})));
+  // Block E
+  EXPECT_EQ(allocation_blocks[4]->offset, 2);
+  ASSERT_TRUE(allocation_blocks[4]->repacked_slice_data.has_value());
+  EXPECT_EQ(*allocation_blocks[4]->repacked_slice_data,
+            (SlicedAllocationData({{Slice{2, 2, 9}, Slice{2, 4, 12}}})));
+  // Block F
+  EXPECT_EQ(allocation_blocks[5]->offset, 4);
+  EXPECT_FALSE(allocation_blocks[5]->repacked_slice_data.has_value());
+}
+
 }  // namespace xla
diff --git a/third_party/xla/xla/service/memory_space_assignment/memory_space_assignment_repacking.h b/third_party/xla/xla/service/memory_space_assignment/memory_space_assignment_repacking.h
index 9d0be634ddda01..e26d5ecf5643cc 100644
--- a/third_party/xla/xla/service/memory_space_assignment/memory_space_assignment_repacking.h
+++ b/third_party/xla/xla/service/memory_space_assignment/memory_space_assignment_repacking.h
@@ -22,6 +22,7 @@ limitations under the License.
 #include <tuple>
 #include <vector>
 
+#include "absl/algorithm/container.h"
 #include "absl/strings/str_cat.h"
 #include "absl/strings/str_join.h"
 #include "absl/types/span.h"
@@ -61,6 +62,27 @@ class MemorySpaceAssignmentRepacker {
   struct SlicedAllocationData {
     std::vector<Slice> slices_sorted_by_offset;
 
+    std::vector<int64_t> SizesSortedByOffset() const {
+      std::vector<int64_t> sizes_sorted_by_offset;
+      sizes_sorted_by_offset.reserve(slices_sorted_by_offset.size());
+      absl::c_for_each(slices_sorted_by_offset,
+                       [&sizes_sorted_by_offset](const Slice& slice) {
+                         sizes_sorted_by_offset.push_back(slice.size);
+                       });
+      return sizes_sorted_by_offset;
+    }
+
+    std::vector<int64_t> SortedStartTimes() const {
+      std::vector<int64_t> sorted_start_times;
+      sorted_start_times.reserve(slices_sorted_by_offset.size());
+      absl::c_for_each(slices_sorted_by_offset,
+                       [&sorted_start_times](const Slice& slice) {
+                         sorted_start_times.push_back(slice.start_time);
+                       });
+      absl::c_sort(sorted_start_times);
+      return sorted_start_times;
+    }
+
     std::string ToString() const {
       return absl::StrCat(
           "{ slices_sorted_by_offset: [ ",
@@ -114,7 +136,7 @@ class MemorySpaceAssignmentRepacker {
         repacked_slicing_str = absl::StrCat("; repacked_slice_data: ",
                                             repacked_slice_data->ToString());
       }
-      return absl::StrCat("[", start_time, ", ", end_time, "]: size: ", size,
+      return absl::StrCat("[", start_time, ", ", end_time, "]; size: ", size,
                           "; offset: ", offset,
                           "; initial offset: ", initial_offset,
                           "; # colocations: ", colocations.size(),

From 6f453dccff100bdbfa7d0324d46e397f6f120994 Mon Sep 17 00:00:00 2001
From: wenchenvincent <32376000+wenchenvincent@users.noreply.github.com>
Date: Wed, 27 Sep 2023 11:51:08 -0700
Subject: [PATCH 332/567] PR #5852: [ROCm] HipBLASLt row major layout support.

Imported from GitHub PR https://github.com/openxla/xla/pull/5852

hipBlasLt currently only supports column major. This PR enables using hipBLASLt for matrices of row major layout by swapping the operands and manipulating the transpose attributes.
Copybara import of the project:

--
8a48f13b8b69538a58d3c3cd08beb76b91c1ce54 by Rahul Batra <rahbatra@amd.com>:

[ROCm]: Add row-major hipblasLT support and enable hipblasLt runtime

--
72826a60aee73312f74a23e6921c2326b0281537 by Wen Chen <Wen.Chen@amd.com>:

[ROCM] Added workaround to get hipBlasLt to work correctly for HIPBLASLT_EPILOGUE_BIAS.

--
f9f24c259512c6b1675556812934138cf21b1027 by Wen Chen <Wen.Chen@amd.com>:

[ROCM] Enabled gemm_rewrite_test on ROCm but disabled many cublasLt tests within it for now.

Merging this change closes #5852

PiperOrigin-RevId: 568917445
---
 .../xla/xla/service/gpu/gemm_rewriter.cc      |  2 --
 .../xla/xla/service/gpu/matmul_utils.cc       | 17 ++++++++--
 .../xla/xla/service/gpu/runtime/executable.h  |  3 +-
 third_party/xla/xla/service/gpu/tests/BUILD   | 10 +++---
 .../service/gpu/tests/gemm_rewrite_test.cc    |  8 +++++
 .../xla/stream_executor/rocm/hip_blas_lt.cc   | 34 ++++++++++++++-----
 6 files changed, 55 insertions(+), 19 deletions(-)

diff --git a/third_party/xla/xla/service/gpu/gemm_rewriter.cc b/third_party/xla/xla/service/gpu/gemm_rewriter.cc
index 5a6abe9592fb6c..a6e899edb06e98 100644
--- a/third_party/xla/xla/service/gpu/gemm_rewriter.cc
+++ b/third_party/xla/xla/service/gpu/gemm_rewriter.cc
@@ -1846,8 +1846,6 @@ class GemmRewriterVisitor : public DfsHloRewriteVisitor {
                         MatrixIsColumnMajor(instr, gemm_backend_config));
 
     if (std::holds_alternative<se::RocmComputeCapability>(gpu_version_)) {
-      if (!output_is_column_major) return false;
-
       auto rocm_compute_capability_ =
           std::get<se::RocmComputeCapability>(gpu_version_);
 
diff --git a/third_party/xla/xla/service/gpu/matmul_utils.cc b/third_party/xla/xla/service/gpu/matmul_utils.cc
index f141e5a3d89af9..2d9328987db37a 100644
--- a/third_party/xla/xla/service/gpu/matmul_utils.cc
+++ b/third_party/xla/xla/service/gpu/matmul_utils.cc
@@ -875,7 +875,7 @@ StatusOr<se::gpu::BlasLt::Epilogue> AsBlasLtEpilogue(
   rhs_layout.batch_size = batch_size;
 
   bool must_swap_operands =
-      MakeOutputColumnMajor(lhs_layout, rhs_layout, c_layout, output_layout);
+      MakeOutputColumnMajor(lhs_layout, rhs_layout, output_layout, c_layout);
 
   // Do not transopse either input. Note the cuBLASLt documentation somewhat
   // incorrectly claims "A must be transposed and B non-transposed" when A and B
@@ -884,8 +884,8 @@ StatusOr<se::gpu::BlasLt::Epilogue> AsBlasLtEpilogue(
   // *not* be transposed, and if B is row-major, B must be transposed. We never
   // transpose A or B, and expect the caller to ensure A is row-major and B is
   // column when A and B are FP8.
-  const se::blas::Transpose trans_a = se::blas::Transpose::kNoTranspose;
-  const se::blas::Transpose trans_b = se::blas::Transpose::kNoTranspose;
+  se::blas::Transpose trans_a = se::blas::Transpose::kNoTranspose;
+  se::blas::Transpose trans_b = se::blas::Transpose::kNoTranspose;
   if (primitive_util::IsF8Type(lhs_layout.dtype) &&
       lhs_layout.order == MatrixLayout::Order::kColumnMajor) {
     return InternalError("The F8 LHS must be column-major");
@@ -902,6 +902,17 @@ StatusOr<se::gpu::BlasLt::Epilogue> AsBlasLtEpilogue(
       GetBlasComputationType(lhs_layout.dtype, output_layout.dtype,
                              config.compute_precision));
 
+#if TENSORFLOW_USE_ROCM
+  if (lhs_layout.order == MatrixLayout::Order::kRowMajor) {
+    trans_a = se::blas::Transpose::kTranspose;
+    lhs_layout.Transpose();
+  }
+  if (rhs_layout.order == MatrixLayout::Order::kRowMajor) {
+    trans_b = se::blas::Transpose::kTranspose;
+    rhs_layout.Transpose();
+  }
+#endif
+
   TF_ASSIGN_OR_RETURN(
       se::gpu::BlasLt::MatmulDesc op_desc,
       se::gpu::BlasLt::MatmulDesc::Create(
diff --git a/third_party/xla/xla/service/gpu/runtime/executable.h b/third_party/xla/xla/service/gpu/runtime/executable.h
index 81ec31658893c1..73f357596d16e9 100644
--- a/third_party/xla/xla/service/gpu/runtime/executable.h
+++ b/third_party/xla/xla/service/gpu/runtime/executable.h
@@ -171,7 +171,8 @@ class GpuRuntimeExecutable {
   // Keep a cache of fft plans for all FFT operations in the program.
   FftPlans fft_plans_;
 
-#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM  // Keep matmul execution plans.
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
+  // Keep matmul execution plans (only if cuBLASLt is available).
   MatmulPlans cublas_lt_matmul_plans_;
   // Keep captured and instantiated GPU graphs instances.
   GraphInstances graph_instances_;
diff --git a/third_party/xla/xla/service/gpu/tests/BUILD b/third_party/xla/xla/service/gpu/tests/BUILD
index bf81778c117a18..5cd1ba4a5c87ee 100644
--- a/third_party/xla/xla/service/gpu/tests/BUILD
+++ b/third_party/xla/xla/service/gpu/tests/BUILD
@@ -124,11 +124,9 @@ xla_cc_test(
 
 xla_cc_test(
     name = "gemm_rewrite_test",
-    srcs = if_cuda_is_configured(["gemm_rewrite_test.cc"]),
-    local_defines = if_cuda_is_configured(["GOOGLE_CUDA=1"]),
-    tags = tf_cuda_tests_tags() + [
-        "no_rocm",
-    ],
+    srcs = if_cuda_is_configured(["gemm_rewrite_test.cc"]) + if_rocm_is_configured(["gemm_rewrite_test.cc"]),
+    local_defines = if_cuda_is_configured(["GOOGLE_CUDA=1"]) + if_rocm_is_configured(["TENSORFLOW_USE_ROCM=1"]),
+    tags = tf_cuda_tests_tags(),
     deps = [
         ":gpu_codegen_test",
         "//xla:statusor",
@@ -147,6 +145,8 @@ xla_cc_test(
         "@local_tsl//tsl/platform:test_main",
     ] + if_cuda_is_configured([
         "@local_config_cuda//cuda:cuda_headers",
+    ]) + if_rocm_is_configured([
+        "@local_config_rocm//rocm:rocm_headers",
     ]),
 )
 
diff --git a/third_party/xla/xla/service/gpu/tests/gemm_rewrite_test.cc b/third_party/xla/xla/service/gpu/tests/gemm_rewrite_test.cc
index b8d55c6bbf0b71..19292305a67cf2 100644
--- a/third_party/xla/xla/service/gpu/tests/gemm_rewrite_test.cc
+++ b/third_party/xla/xla/service/gpu/tests/gemm_rewrite_test.cc
@@ -76,6 +76,7 @@ ENTRY AddDotsFunc {
   }
 }
 
+#if GOOGLE_CUDA
 TEST_F(GemmRewriteTest, TestBatchedAutotuning) {
   if (GetCudaComputeCapability().IsAtLeast(se::CudaComputeCapability::AMPERE)) {
     GTEST_SKIP()
@@ -97,6 +98,7 @@ ENTRY %test {
 ; CHECK: selected_algorithm
       )");
 }
+#endif
 
 TEST_F(GemmRewriteTest, SimpleRewriteDeterministic) {
   const char* hlo_text = R"(
@@ -184,6 +186,7 @@ ENTRY broadcast {
   EXPECT_TRUE(RunAndCompare(hlo_text, ErrorSpec{1e-5, 1e-5}));
 }
 
+#if GOOGLE_CUDA
 // A test fixture class for tests which should have similar results with legacy
 // cublas and cublasLt
 class ParameterizedGemmRewriteTest
@@ -1262,6 +1265,7 @@ ENTRY main {
 
 INSTANTIATE_TEST_SUITE_P(CublasTestsBothLegacyAndLt,
                          ParameterizedGemmRewriteTest, ::testing::Bool());
+#endif
 
 // A test fixture class for tests which are specific to legacy cublas
 class LegacyCublasGemmRewriteTest : public GemmRewriteTest {
@@ -1693,6 +1697,7 @@ ENTRY test {
 )");
 }
 
+#if GOOGLE_CUDA
 // Test gemm matrix bias add fusion with mix type
 TEST_F(LegacyCublasGemmRewriteTest, MatrixBiasMixType) {
   std::vector<std::tuple<absl::string_view, absl::string_view>>
@@ -1763,6 +1768,7 @@ ENTRY test {
                                          m::Negate(m::Parameter(2)))));
   }
 }
+#endif
 
 // Test batch gemm matrix bias add fusion with mix type that is not supported
 TEST_F(LegacyCublasGemmRewriteTest, MatrixBiasMixTypeNotSupported) {
@@ -1896,6 +1902,7 @@ ENTRY test {
           m::CustomCall(m::Parameter(0), m::Parameter(1), m::Constant()))));
 }
 
+#if GOOGLE_CUDA
 // A test fixture class for tests which are specific to cublasLt
 class CublasLtGemmRewriteTest : public GemmRewriteTest {
  public:
@@ -7062,6 +7069,7 @@ TEST_P(ParameterizedFp8GemmRewriteTest, FnuzTypeF8) {
 
 INSTANTIATE_TEST_SUITE_P(Fp8CublasTestsBothLegacyAndLt,
                          ParameterizedFp8GemmRewriteTest, ::testing::Bool());
+#endif
 
 TEST_F(GemmRewriteTest, NoFuseBiasBroadcast) {
   const char* hlo = R"(
diff --git a/third_party/xla/xla/stream_executor/rocm/hip_blas_lt.cc b/third_party/xla/xla/stream_executor/rocm/hip_blas_lt.cc
index 61ebbaf21b8c5b..4d609b15758f18 100644
--- a/third_party/xla/xla/stream_executor/rocm/hip_blas_lt.cc
+++ b/third_party/xla/xla/stream_executor/rocm/hip_blas_lt.cc
@@ -36,12 +36,15 @@ limitations under the License.
 #define SET_ATTR(setter, handle, attr, value) \
   ToStatus(setter(handle, attr, &value, sizeof(decltype(value))), #setter)
 
-#define GET_ATTR(getter, handle, attr, ValueT)                            \
-  [&]() -> tsl::StatusOr<ValueT> {                                        \
-    ValueT value;                                                         \
-    TF_RETURN_IF_ERROR(ToStatus(                                          \
-        getter(handle, attr, &value, sizeof(ValueT), nullptr), #getter)); \
-    return std::move(value);                                              \
+// hipblasLtMatmulDescGetAttribute does not allow nullptr for the last
+// argument (size_t* sizeWritten)
+#define GET_ATTR(getter, handle, attr, ValueT)                          \
+  [&]() -> tsl::StatusOr<ValueT> {                                      \
+    ValueT value;                                                       \
+    size_t size;                                                        \
+    TF_RETURN_IF_ERROR(ToStatus(                                        \
+        getter(handle, attr, &value, sizeof(ValueT), &size), #getter)); \
+    return std::move(value);                                            \
   }()
 
 namespace stream_executor {
@@ -57,7 +60,7 @@ tsl::Status SetAttr(hipblasLtMatrixLayout_t handle,
 template <typename T>
 tsl::StatusOr<T> GetAttr(hipblasLtMatrixLayout_t handle,
                          hipblasLtMatrixLayoutAttribute_t attr) {
-  return GET_ATTR(hipblasLtMatrixLayoutGetAttribute, handle, attr, T);
+  return GET_ATTR(wrap::hipblasLtMatrixLayoutGetAttribute, handle, attr, T);
 }
 
 template <typename T>
@@ -69,7 +72,7 @@ tsl::Status SetAttr(hipblasLtMatmulDesc_t handle,
 template <typename T>
 tsl::StatusOr<T> GetAttr(hipblasLtMatmulDesc_t handle,
                          hipblasLtMatmulDescAttributes_t attr) {
-  return GET_ATTR(hipblasLtMatmulDescGetAttribute, handle, attr, T);
+  return GET_ATTR(wrap::hipblasLtMatmulDescGetAttribute, handle, attr, T);
 }
 
 template <typename T>
@@ -208,6 +211,21 @@ tsl::StatusOr<std::vector<BlasLt::MatmulAlgorithm>> BlasLt::GetMatmulAlgorithms(
 
     gpu::ScopedActivateExecutorContext sac{parent_};
 
+    // Right now, hipBlasLt requires the bias pointer to be set (even to a
+    // dummy one) before finding the algorithms when the epilogue is
+    // HIPBLASLT_EPILOGUE_BIAS. This can be removed later once this
+    // restriction is gone.
+    static int dummy_pointer = 0;
+    TF_ASSIGN_OR_RETURN(
+        hipblasLtEpilogue_t epilogue,
+        GetAttr<hipblasLtEpilogue_t>(plan.op_desc.get(),
+                                     HIPBLASLT_MATMUL_DESC_EPILOGUE));
+    if (epilogue == HIPBLASLT_EPILOGUE_BIAS) {
+      TF_RETURN_IF_ERROR(SetAttr(plan.op_desc.get(),
+                                 HIPBLASLT_MATMUL_DESC_BIAS_POINTER,
+                                 &dummy_pointer));
+    }
+
     int found_algorithm_count = 0;
     auto error = wrap::hipblasLtMatmulAlgoGetHeuristic(
         blas_lt_.get(), plan.op_desc.get(), plan.a_desc.get(),

From a141fa94a727396217e744fa53a8845c2fc729d9 Mon Sep 17 00:00:00 2001
From: Peter Hawkins <phawkins@google.com>
Date: Wed, 27 Sep 2023 11:52:27 -0700
Subject: [PATCH 333/567] [TSL:CUDA] Improve cuDNN stubs: handle symbols that
 get versions and error strings more gracefully if the library load fails.

PiperOrigin-RevId: 568917829
---
 .../xla/third_party/tsl/tsl/cuda/BUILD.bazel  |  1 +
 .../third_party/tsl/tsl/cuda/cudnn_stub.cc    | 26 ++++++++++++++++++-
 2 files changed, 26 insertions(+), 1 deletion(-)

diff --git a/third_party/xla/third_party/tsl/tsl/cuda/BUILD.bazel b/third_party/xla/third_party/tsl/tsl/cuda/BUILD.bazel
index 47291d50c4e2cd..8231d3cc0c54cc 100644
--- a/third_party/xla/third_party/tsl/tsl/cuda/BUILD.bazel
+++ b/third_party/xla/third_party/tsl/tsl/cuda/BUILD.bazel
@@ -149,6 +149,7 @@ cc_library(
     visibility = ["//visibility:public"],
     deps = if_cuda_is_configured([
         ":cudnn_version",
+        "@com_google_absl//absl/container:flat_hash_map",
         "@local_config_cuda//cuda:cudnn_header",
         "//tsl/platform:dso_loader",
         "//tsl/platform:env",
diff --git a/third_party/xla/third_party/tsl/tsl/cuda/cudnn_stub.cc b/third_party/xla/third_party/tsl/tsl/cuda/cudnn_stub.cc
index 705dae12f903f3..f3cab179eb0b71 100644
--- a/third_party/xla/third_party/tsl/tsl/cuda/cudnn_stub.cc
+++ b/third_party/xla/third_party/tsl/tsl/cuda/cudnn_stub.cc
@@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
+
+#include "absl/container/flat_hash_map.h"
 #include "third_party/gpus/cudnn/cudnn.h"
 #include "tsl/platform/dso_loader.h"
 #include "tsl/platform/env.h"
@@ -53,10 +55,26 @@ constexpr size_t kNumSymbols = sizeof(kSymbols) / sizeof(const char*);
 
 extern "C" {
 
+static size_t GetVersionStub() { return 0; }
+
+static const char* GetErrorStringStub() {
+  return "cuDNN could not be found or could not be loaded.";
+}
+
 static cudnnStatus_t GetSymbolNotFoundError() {
   return CUDNN_STATUS_INTERNAL_ERROR;
 }
 
+static absl::flat_hash_map<std::string_view, void*> const& SymbolOverrides() {
+  static auto* syms = new absl::flat_hash_map<std::string_view, void*>{
+      {"cudnnGetVersion", reinterpret_cast<void*>(&GetVersionStub)},
+      {"cudnnGetMaxDeviceVersion", reinterpret_cast<void*>(&GetVersionStub)},
+      {"cudnnGetCudartVersion", reinterpret_cast<void*>(&GetVersionStub)},
+      {"cudnnGetErrorString", reinterpret_cast<void*>(&GetErrorStringStub)},
+  };
+  return *syms;
+}
+
 extern void* _cudnn_tramp_table[];
 
 void _cudnn_tramp_resolve(int i) {
@@ -64,7 +82,13 @@ void _cudnn_tramp_resolve(int i) {
   CHECK_LT(i, kNumSymbols);
   void* p = LoadSymbol(kSymbols[i]);
   if (!p) {
-    p = reinterpret_cast<void*>(&GetSymbolNotFoundError);
+    const auto& overrides = SymbolOverrides();
+    auto it = overrides.find(kSymbols[i]);
+    if (it == overrides.end()) {
+      p = reinterpret_cast<void*>(&GetSymbolNotFoundError);
+    } else {
+      p = it->second;
+    }
   }
   _cudnn_tramp_table[i] = p;
 }

From cb7c35369cc53ec82763dbbe65899d76a2598375 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 27 Sep 2023 12:06:51 -0700
Subject: [PATCH 334/567] [PJRT C API] Clarify that PJRT_Buffer_CopyToDevice
 and PJRT_Buffer_CopyToMemory are within the same client.

PiperOrigin-RevId: 568921998
---
 third_party/xla/xla/pjrt/c/pjrt_c_api.h | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/third_party/xla/xla/pjrt/c/pjrt_c_api.h b/third_party/xla/xla/pjrt/c/pjrt_c_api.h
index 5b836c7dac0d30..489acc800f1892 100644
--- a/third_party/xla/xla/pjrt/c/pjrt_c_api.h
+++ b/third_party/xla/xla/pjrt/c/pjrt_c_api.h
@@ -1565,9 +1565,9 @@ struct PJRT_Buffer_CopyToDevice_Args {
 };
 PJRT_DEFINE_STRUCT_TRAITS(PJRT_Buffer_CopyToDevice_Args, dst_buffer);
 
-// Copies the buffer to device `dst_device`. Caller is responsible for freeing
-// returned `dst_buffer` with PJRT_Buffer_Destroy. Returns an error if the
-// buffer is already on `dst_device`.
+// Copies the buffer to device `dst_device` within the same client. Caller is
+// responsible for freeing returned `dst_buffer` with PJRT_Buffer_Destroy.
+// Returns an error if the buffer is already on `dst_device`.
 typedef PJRT_Error* PJRT_Buffer_CopyToDevice(
     PJRT_Buffer_CopyToDevice_Args* args);
 
@@ -1580,9 +1580,9 @@ struct PJRT_Buffer_CopyToMemory_Args {
 };
 PJRT_DEFINE_STRUCT_TRAITS(PJRT_Buffer_CopyToMemory_Args, dst_buffer);
 
-// Copies the buffer to memory `dst_memory`. Caller is responsible for freeing
-// returned `dst_buffer` with PJRT_Buffer_Destroy. Returns an error if the
-// buffer is already on `dst_memory`.
+// Copies the buffer to memory `dst_memory` within the same client. Caller is
+// responsible for freeing returned `dst_buffer` with PJRT_Buffer_Destroy.
+// Returns an error if the buffer is already on `dst_memory`.
 typedef PJRT_Error* PJRT_Buffer_CopyToMemory(
     PJRT_Buffer_CopyToMemory_Args* args);
 

From 7c45b3e9692e1c92587f21ccce3274102b757e3b Mon Sep 17 00:00:00 2001
From: "Ryan M. Lefever" <lefever@google.com>
Date: Wed, 27 Sep 2023 12:37:47 -0700
Subject: [PATCH 335/567] There was a bug in which the MSA best fit repacker
 was overriding GetTemporalBufferIntervalCompare(), but that override was
 never being executed because its call site was in a constructor. A previous
 change removed the override.

This change makes GetTemporalBufferIntervalCompare() non-virtual, so that the mistake is not repeated.

This is a no-op since there are no overrides in the codebase.

PiperOrigin-RevId: 568930191
---
 third_party/xla/xla/service/heap_simulator.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/third_party/xla/xla/service/heap_simulator.h b/third_party/xla/xla/service/heap_simulator.h
index 94cb22fd5e272b..882300f979b19c 100644
--- a/third_party/xla/xla/service/heap_simulator.h
+++ b/third_party/xla/xla/service/heap_simulator.h
@@ -802,7 +802,7 @@ class GlobalDecreasingSizeBestFitHeap : public HeapAlgorithm<BufferType> {
   // end of the last co-located buffer.  There could be "holes" in the live
   // ranges of each co-located buffers, but in this heuristics we think they are
   // contiguous.
-  virtual BufferIntervalCompare GetTemporalBufferIntervalCompare() const;
+  BufferIntervalCompare GetTemporalBufferIntervalCompare() const;
 
   absl::flat_hash_map<const BufferType*, BufferInterval> buffer_intervals_;
   HeapResult result_;

From 543f1fb6c5e8de4e8b5ec328a6e30f0a6d9b38cb Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 27 Sep 2023 13:00:52 -0700
Subject: [PATCH 336/567] BUILD visibility change only

PiperOrigin-RevId: 568936117
---
 third_party/xla/xla/BUILD | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/third_party/xla/xla/BUILD b/third_party/xla/xla/BUILD
index 1b728600ea6a37..66dfb7e4663851 100644
--- a/third_party/xla/xla/BUILD
+++ b/third_party/xla/xla/BUILD
@@ -17,7 +17,7 @@ package_group(
     name = "friends",
     includes = ["//xla:internal"],
     packages = [
-        # copybara:uncomment "//learning/infra/mira/...",
+        # copybara:uncomment "//learning/...",
         "//third_party/australis/...",
         "//third_party/iree/...",
         "//third_party/mira/...",

From 6df7096432a0db764d4f7e0d4d692af0842dc363 Mon Sep 17 00:00:00 2001
From: Austin Anderson <angerson@google.com>
Date: Wed, 27 Sep 2023 13:07:21 -0700
Subject: [PATCH 337/567] Fix double-specified libtensorflow config target

PiperOrigin-RevId: 568937970
---
 .bazelrc                                 | 4 ++--
 ci/official/libtensorflow.sh             | 4 ++--
 third_party/xla/.bazelrc                 | 4 ++--
 third_party/xla/third_party/tsl/.bazelrc | 4 ++--
 4 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/.bazelrc b/.bazelrc
index 696c200eb79c7c..a55118d273a570 100644
--- a/.bazelrc
+++ b/.bazelrc
@@ -667,8 +667,8 @@ build:tf_public_cache_push --config=tf_public_cache --remote_upload_local_result
 # at the scripts of ci/official/ to see how TF's CI uses them.
 
 # LIBTENSORFLOW TESTS are for building Libtensorflow archives. These are CUDA/CPU-agnostic.
-test:libtensorflow_test -- //tensorflow/tools/lib_package:libtensorflow_test //tensorflow/tools/lib_package:libtensorflow_java_test
-build:libtensorflow_build -- //tensorflow/tools/lib_package:libtensorflow.tar.gz //tensorflow/tools/lib_package:libtensorflow_jni.tar.gz //tensorflow/java:libtensorflow.jar //tensorflow/java:libtensorflow-src.jar //tensorflow/tools/lib_package:libtensorflow_proto.zip
+test:linux_libtensorflow_test -- //tensorflow/tools/lib_package:libtensorflow_test //tensorflow/tools/lib_package:libtensorflow_java_test
+build:linux_libtensorflow_build -- //tensorflow/tools/lib_package:libtensorflow.tar.gz //tensorflow/tools/lib_package:libtensorflow_jni.tar.gz //tensorflow/java:libtensorflow.jar //tensorflow/java:libtensorflow-src.jar //tensorflow/tools/lib_package:libtensorflow_proto.zip
 
 # PYTHON TESTS run a suite of Python tests intended for verifying that the Python wheel
 # will work properly. These are usually run Nightly or upon Release.
diff --git a/ci/official/libtensorflow.sh b/ci/official/libtensorflow.sh
index e225e4d52526c4..402de63ebc97a2 100755
--- a/ci/official/libtensorflow.sh
+++ b/ci/official/libtensorflow.sh
@@ -25,8 +25,8 @@ if [[ "$TFCI_NIGHTLY_UPDATE_VERSION_ENABLE" == 1 ]]; then
   tfrun python3 tensorflow/tools/ci_build/update_version.py --nightly
 fi
 
-tfrun bazel "${TFCI_BAZEL_BAZELRC_ARGS[@]}" test "${TFCI_BAZEL_COMMON_ARGS[@]}" --config=libtensorflow_test
-tfrun bazel "${TFCI_BAZEL_BAZELRC_ARGS[@]}" build "${TFCI_BAZEL_COMMON_ARGS[@]}" --config=libtensorflow_build
+tfrun bazel "${TFCI_BAZEL_BAZELRC_ARGS[@]}" test "${TFCI_BAZEL_COMMON_ARGS[@]}" --config=linux_libtensorflow_test
+tfrun bazel "${TFCI_BAZEL_BAZELRC_ARGS[@]}" build "${TFCI_BAZEL_COMMON_ARGS[@]}" --config=linux_libtensorflow_build
 
 tfrun ./ci/official/utilities/repack_libtensorflow.sh "$TFCI_OUTPUT_DIR" "$TFCI_LIB_SUFFIX"
 
diff --git a/third_party/xla/.bazelrc b/third_party/xla/.bazelrc
index 696c200eb79c7c..a55118d273a570 100644
--- a/third_party/xla/.bazelrc
+++ b/third_party/xla/.bazelrc
@@ -667,8 +667,8 @@ build:tf_public_cache_push --config=tf_public_cache --remote_upload_local_result
 # at the scripts of ci/official/ to see how TF's CI uses them.
 
 # LIBTENSORFLOW TESTS are for building Libtensorflow archives. These are CUDA/CPU-agnostic.
-test:libtensorflow_test -- //tensorflow/tools/lib_package:libtensorflow_test //tensorflow/tools/lib_package:libtensorflow_java_test
-build:libtensorflow_build -- //tensorflow/tools/lib_package:libtensorflow.tar.gz //tensorflow/tools/lib_package:libtensorflow_jni.tar.gz //tensorflow/java:libtensorflow.jar //tensorflow/java:libtensorflow-src.jar //tensorflow/tools/lib_package:libtensorflow_proto.zip
+test:linux_libtensorflow_test -- //tensorflow/tools/lib_package:libtensorflow_test //tensorflow/tools/lib_package:libtensorflow_java_test
+build:linux_libtensorflow_build -- //tensorflow/tools/lib_package:libtensorflow.tar.gz //tensorflow/tools/lib_package:libtensorflow_jni.tar.gz //tensorflow/java:libtensorflow.jar //tensorflow/java:libtensorflow-src.jar //tensorflow/tools/lib_package:libtensorflow_proto.zip
 
 # PYTHON TESTS run a suite of Python tests intended for verifying that the Python wheel
 # will work properly. These are usually run Nightly or upon Release.
diff --git a/third_party/xla/third_party/tsl/.bazelrc b/third_party/xla/third_party/tsl/.bazelrc
index 696c200eb79c7c..a55118d273a570 100644
--- a/third_party/xla/third_party/tsl/.bazelrc
+++ b/third_party/xla/third_party/tsl/.bazelrc
@@ -667,8 +667,8 @@ build:tf_public_cache_push --config=tf_public_cache --remote_upload_local_result
 # at the scripts of ci/official/ to see how TF's CI uses them.
 
 # LIBTENSORFLOW TESTS are for building Libtensorflow archives. These are CUDA/CPU-agnostic.
-test:libtensorflow_test -- //tensorflow/tools/lib_package:libtensorflow_test //tensorflow/tools/lib_package:libtensorflow_java_test
-build:libtensorflow_build -- //tensorflow/tools/lib_package:libtensorflow.tar.gz //tensorflow/tools/lib_package:libtensorflow_jni.tar.gz //tensorflow/java:libtensorflow.jar //tensorflow/java:libtensorflow-src.jar //tensorflow/tools/lib_package:libtensorflow_proto.zip
+test:linux_libtensorflow_test -- //tensorflow/tools/lib_package:libtensorflow_test //tensorflow/tools/lib_package:libtensorflow_java_test
+build:linux_libtensorflow_build -- //tensorflow/tools/lib_package:libtensorflow.tar.gz //tensorflow/tools/lib_package:libtensorflow_jni.tar.gz //tensorflow/java:libtensorflow.jar //tensorflow/java:libtensorflow-src.jar //tensorflow/tools/lib_package:libtensorflow_proto.zip
 
 # PYTHON TESTS run a suite of Python tests intended for verifying that the Python wheel
 # will work properly. These are usually run Nightly or upon Release.

From 5101489aa60c58adcb0fa796ec11618f86f8b1bb Mon Sep 17 00:00:00 2001
From: Mason Chang <masonchang@google.com>
Date: Wed, 27 Sep 2023 14:01:52 -0700
Subject: [PATCH 338/567] create boilerplate graph export file in api/v1, which
 will be used to expose the graph export for MLIR.

PiperOrigin-RevId: 568953653
---
 tensorflow/compiler/mlir/tf2xla/api/v1/BUILD  | 22 ++++++++++
 .../tf2xla/api/v1/tf_dialect_to_executor.cc   | 34 ++++++++++++++
 .../tf2xla/api/v1/tf_dialect_to_executor.h    | 44 +++++++++++++++++++
 .../api/v1/tf_dialect_to_executor_test.cc     | 37 ++++++++++++++++
 4 files changed, 137 insertions(+)
 create mode 100644 tensorflow/compiler/mlir/tf2xla/api/v1/tf_dialect_to_executor.cc
 create mode 100644 tensorflow/compiler/mlir/tf2xla/api/v1/tf_dialect_to_executor.h
 create mode 100644 tensorflow/compiler/mlir/tf2xla/api/v1/tf_dialect_to_executor_test.cc

diff --git a/tensorflow/compiler/mlir/tf2xla/api/v1/BUILD b/tensorflow/compiler/mlir/tf2xla/api/v1/BUILD
index 08b5e4d485a6e4..f9a5cac4e5ed29 100644
--- a/tensorflow/compiler/mlir/tf2xla/api/v1/BUILD
+++ b/tensorflow/compiler/mlir/tf2xla/api/v1/BUILD
@@ -166,3 +166,25 @@ tf_cc_test(
         "@local_tsl//tsl/lib/core:status_test_util",
     ],
 )
+
+cc_library(
+    name = "tf_dialect_to_executor",
+    srcs = ["tf_dialect_to_executor.cc"],
+    hdrs = ["tf_dialect_to_executor.h"],
+    deps = [
+        "//tensorflow/core/platform:status",
+        "@llvm-project//mlir:IR",
+        "@local_tsl//tsl/platform:status",
+    ],
+)
+
+tf_cc_test(
+    name = "tf_dialect_to_executor_test",
+    srcs = ["tf_dialect_to_executor_test.cc"],
+    deps = [
+        ":tf_dialect_to_executor",
+        "@com_google_googletest//:gtest_main",
+        "@llvm-project//mlir:IR",
+        "@local_tsl//tsl/lib/core:status_test_util",
+    ],
+)
diff --git a/tensorflow/compiler/mlir/tf2xla/api/v1/tf_dialect_to_executor.cc b/tensorflow/compiler/mlir/tf2xla/api/v1/tf_dialect_to_executor.cc
new file mode 100644
index 00000000000000..dc6a190bd35d28
--- /dev/null
+++ b/tensorflow/compiler/mlir/tf2xla/api/v1/tf_dialect_to_executor.cc
@@ -0,0 +1,34 @@
+/* Copyright 2023 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/mlir/tf2xla/api/v1/tf_dialect_to_executor.h"
+
+#include "mlir/IR/BuiltinOps.h"  // from @llvm-project
+#include "tensorflow/core/platform/status.h"
+#include "tsl/platform/status.h"
+
+namespace tensorflow {
+namespace tf2xla {
+namespace v1 {
+
+using mlir::ModuleOp;
+
+tensorflow::Status ExportFromTensorflowDialectToExecutor(ModuleOp module) {
+  return tsl::OkStatus();
+}
+
+}  // namespace v1
+}  // namespace tf2xla
+}  // namespace tensorflow
diff --git a/tensorflow/compiler/mlir/tf2xla/api/v1/tf_dialect_to_executor.h b/tensorflow/compiler/mlir/tf2xla/api/v1/tf_dialect_to_executor.h
new file mode 100644
index 00000000000000..06ddc799bf75ad
--- /dev/null
+++ b/tensorflow/compiler/mlir/tf2xla/api/v1/tf_dialect_to_executor.h
@@ -0,0 +1,44 @@
+/* Copyright 2023 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_MLIR_TF2XLA_API_V1_TF_DIALECT_TO_EXECUTOR_H_
+#define TENSORFLOW_COMPILER_MLIR_TF2XLA_API_V1_TF_DIALECT_TO_EXECUTOR_H_
+
+#include "mlir/IR/BuiltinOps.h"  // from @llvm-project
+#include "tensorflow/core/platform/status.h"
+
+namespace tensorflow {
+namespace tf2xla {
+namespace v1 {
+
+// Given the input Module op that's in the Tensorflow Dialect, convert the MLIR
+// module in place to the Tensorflow Executor Dialect. Returns an OK Status if
+// success, otherwise failure with an error message.
+// The Tensorflow Executor Dialect is required to export an MLIR module to a
+// Tensorflow GraphDef. This API will add control dependencies and verify that
+// the conversion was successful. This version adds extra control dependencies
+// for replication and parallel execution ops, which may slow performance.
+// Prefer to use the v2 of this API.
+//
+// Input: A MLIR Module in the Tensorflow Dialect with no
+// `tf_device.cluster_func` ops.
+// Output: A MLIR module in the Tensorflow Executor Dialect.
+tensorflow::Status ExportFromTensorflowDialectToExecutor(mlir::ModuleOp module);
+
+}  // namespace v1
+}  // namespace tf2xla
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_COMPILER_MLIR_TF2XLA_API_V1_TF_DIALECT_TO_EXECUTOR_H_
diff --git a/tensorflow/compiler/mlir/tf2xla/api/v1/tf_dialect_to_executor_test.cc b/tensorflow/compiler/mlir/tf2xla/api/v1/tf_dialect_to_executor_test.cc
new file mode 100644
index 00000000000000..eb87a7ae80444f
--- /dev/null
+++ b/tensorflow/compiler/mlir/tf2xla/api/v1/tf_dialect_to_executor_test.cc
@@ -0,0 +1,37 @@
+/* Copyright 2023 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/mlir/tf2xla/api/v1/tf_dialect_to_executor.h"
+
+#include <gtest/gtest.h>
+#include "mlir/IR/BuiltinOps.h"  // from @llvm-project
+#include "tsl/lib/core/status_test_util.h"
+
+namespace tensorflow {
+namespace tf2xla {
+namespace v1 {
+namespace {
+
+using mlir::ModuleOp;
+
+TEST(TensorflowDialectToExecutor, ConvertsToExecutor) {
+  ModuleOp module;
+  TF_ASSERT_OK(ExportFromTensorflowDialectToExecutor(module));
+}
+
+}  // namespace
+}  // namespace v1
+}  // namespace tf2xla
+}  // namespace tensorflow

From 30d28de817f7477dde22974ffff5af268279c26c Mon Sep 17 00:00:00 2001
From: Francois Chollet <fchollet@google.com>
Date: Wed, 27 Sep 2023 14:09:06 -0700
Subject: [PATCH 339/567] Fix issue in which, in an environment with both
 tf_keras and keras 2 installed, and with TF_USE_LEGACY_KERAS=1 set, doing
 `from tensorflow.keras import x` would pull x from keras 2 rather than
 tf_keras.

PiperOrigin-RevId: 568955899
---
 tensorflow/api_template.__init__.py       | 14 ++++++---
 tensorflow/api_template_v1.__init__.py    | 36 +++++++++++++++--------
 tensorflow/compat_template_v1.__init__.py | 32 +++++++++++++-------
 3 files changed, 54 insertions(+), 28 deletions(-)

diff --git a/tensorflow/api_template.__init__.py b/tensorflow/api_template.__init__.py
index 794abc94ec9b6a..321738084016a7 100644
--- a/tensorflow/api_template.__init__.py
+++ b/tensorflow/api_template.__init__.py
@@ -89,10 +89,16 @@
 setattr(_current_module, "estimator", estimator)
 
 # Lazy-load Keras v2/3.
+_tf_uses_legacy_keras = (
+    _os.environ.get("TF_USE_LEGACY_KERAS", None) in ("true", "True", "1"))
 setattr(_current_module, "keras", _KerasLazyLoader(globals()))
-for _module_dir in ("keras._tf_keras.keras", "keras.api._v2.keras"):
-  _module_dir = _module_util.get_parent_dir_for_name(_module_dir)
-  _current_module.__path__ = [_module_dir] + _current_module.__path__
+_module_dir = _module_util.get_parent_dir_for_name("keras._tf_keras.keras")
+_current_module.__path__ = [_module_dir] + _current_module.__path__
+if _tf_uses_legacy_keras:
+  _module_dir = _module_util.get_parent_dir_for_name("tf_keras.api._v2.keras")
+else:
+  _module_dir = _module_util.get_parent_dir_for_name("keras.api._v2.keras")
+_current_module.__path__ = [_module_dir] + _current_module.__path__
 
 
 # Enable TF2 behaviors
@@ -167,7 +173,7 @@ def _running_from_pip_package():
 # SavedModel registry.
 # See b/196254385 for more details.
 try:
-  if _os.environ.get("TF_USE_LEGACY_KERAS", None) in ("true", "True", "1"):
+  if _tf_uses_legacy_keras:
     importlib.import_module("tf_keras.src.optimizers")
   else:
     importlib.import_module("keras.src.optimizers")
diff --git a/tensorflow/api_template_v1.__init__.py b/tensorflow/api_template_v1.__init__.py
index 6265c294dc733d..d15f442593f06d 100644
--- a/tensorflow/api_template_v1.__init__.py
+++ b/tensorflow/api_template_v1.__init__.py
@@ -79,10 +79,16 @@
 setattr(_current_module, "estimator", estimator)
 
 # Lazy-load Keras v1.
+_tf_uses_legacy_keras = (
+    _os.environ.get("TF_USE_LEGACY_KERAS", None) in ("true", "True", "1"))
 setattr(_current_module, "keras", _KerasLazyLoader(globals(), mode="v1"))
-for _module_dir in ("keras._tf_keras.keras", "keras.api._v1.keras"):
-  _module_dir = _module_util.get_parent_dir_for_name(_module_dir)
-  _current_module.__path__ = [_module_dir] + _current_module.__path__
+_module_dir = _module_util.get_parent_dir_for_name("keras._tf_keras.keras")
+_current_module.__path__ = [_module_dir] + _current_module.__path__
+if _tf_uses_legacy_keras:
+  _module_dir = _module_util.get_parent_dir_for_name("tf_keras.api._v1.keras")
+else:
+  _module_dir = _module_util.get_parent_dir_for_name("keras.api._v1.keras")
+_current_module.__path__ = [_module_dir] + _current_module.__path__
 
 _CONTRIB_WARNING = """
 The TensorFlow contrib module will not be included in TensorFlow 2.0.
@@ -114,22 +120,26 @@
     submodule="__internal__.legacy.layers",
     name="layers",
     mode="v1")
-for _module_dir in (
-    "keras.api._v1.keras.__internal__.legacy.layers",
-    "tf_keras.api._v1.keras.__internal__.legacy.layers"):
-  _module_dir = _module_util.get_parent_dir_for_name(_module_dir)
-  _current_module.__path__ = [_module_dir] + _current_module.__path__
+if _tf_uses_legacy_keras:
+  _module_dir = _module_util.get_parent_dir_for_name(
+      "tf_keras.api._v1.keras.__internal__.legacy.layers")
+else:
+  _module_dir = _module_util.get_parent_dir_for_name(
+      "keras.api._v1.keras.__internal__.legacy.layers")
+_current_module.__path__ = [_module_dir] + _current_module.__path__
 
 _current_module.nn.rnn_cell = _KerasLazyLoader(
     globals(),
     submodule="__internal__.legacy.rnn_cell",
     name="rnn_cell",
     mode="v1")
-for _module_dir in (
-    "keras.api._v1.keras.__internal__.legacy.rnn_cell",
-    "tf_keras.api._v1.keras.__internal__.legacy.rnn_cell"):
-  _module_dir = _module_util.get_parent_dir_for_name(_module_dir)
-  _current_module.nn.__path__ = [_module_dir] + _current_module.nn.__path__
+if _tf_uses_legacy_keras:
+  _module_dir = _module_util.get_parent_dir_for_name(
+      "tf_keras.api._v1.keras.__internal__.legacy.rnn_cell")
+else:
+  _module_dir = _module_util.get_parent_dir_for_name(
+      "keras.api._v1.keras.__internal__.legacy.rnn_cell")
+_current_module.nn.__path__ = [_module_dir] + _current_module.nn.__path__
 
 del importlib
 
diff --git a/tensorflow/compat_template_v1.__init__.py b/tensorflow/compat_template_v1.__init__.py
index 18282f4569e5dc..e71d88c21b040b 100644
--- a/tensorflow/compat_template_v1.__init__.py
+++ b/tensorflow/compat_template_v1.__init__.py
@@ -16,6 +16,7 @@
 
 # pylint: disable=g-bad-import-order,g-import-not-at-top,protected-access
 
+import os as _os
 import sys as _sys
 import typing as _typing
 
@@ -39,8 +40,13 @@
 setattr(_current_module, "estimator", estimator)
 
 # Lazy load Keras v1
+_tf_uses_legacy_keras = (
+    _os.environ.get("TF_USE_LEGACY_KERAS", None) in ("true", "True", "1"))
 setattr(_current_module, "keras", _KerasLazyLoader(globals(), mode="v1"))
-_module_dir = _module_util.get_parent_dir_for_name("keras.api._v1.keras")
+if _tf_uses_legacy_keras:
+  _module_dir = _module_util.get_parent_dir_for_name("tf_keras.api._v1.keras")
+else:
+  _module_dir = _module_util.get_parent_dir_for_name("keras.api._v1.keras")
 _current_module.__path__ = [_module_dir] + _current_module.__path__
 
 
@@ -55,22 +61,26 @@
     submodule="__internal__.legacy.layers",
     name="layers",
     mode="v1")
-for _module_dir in (
-    "keras.api._v1.keras.__internal__.legacy.layers",
-    "tf_keras.api._v1.keras.__internal__.legacy.layers"):
-  _module_dir = _module_util.get_parent_dir_for_name(_module_dir)
-  _current_module.__path__ = [_module_dir] + _current_module.__path__
+if _tf_uses_legacy_keras:
+  _module_dir = _module_util.get_parent_dir_for_name(
+      "tf_keras.api._v1.keras.__internal__.legacy.layers")
+else:
+  _module_dir = _module_util.get_parent_dir_for_name(
+      "keras.api._v1.keras.__internal__.legacy.layers")
+_current_module.__path__ = [_module_dir] + _current_module.__path__
 
 _current_module.nn.rnn_cell = _KerasLazyLoader(
     globals(),
     submodule="__internal__.legacy.rnn_cell",
     name="rnn_cell",
     mode="v1")
-for _module_dir in (
-    "keras.api._v1.keras.__internal__.legacy.rnn_cell",
-    "tf_keras.api._v1.keras.__internal__.legacy.rnn_cell"):
-  _module_dir = _module_util.get_parent_dir_for_name(_module_dir)
-  _current_module.nn.__path__ = [_module_dir] + _current_module.nn.__path__
+if _tf_uses_legacy_keras:
+  _module_dir = _module_util.get_parent_dir_for_name(
+      "tf_keras.api._v1.keras.__internal__.legacy.rnn_cell")
+else:
+  _module_dir = _module_util.get_parent_dir_for_name(
+      "keras.api._v1.keras.__internal__.legacy.rnn_cell")
+_current_module.nn.__path__ = [_module_dir] + _current_module.nn.__path__
 
 # Explicitly import lazy-loaded modules to support autocompletion.
 # pylint: disable=g-import-not-at-top

From a54e168668782a85a040791d1d678993842e8df4 Mon Sep 17 00:00:00 2001
From: Fergus Henderson <fergus@google.com>
Date: Wed, 27 Sep 2023 14:15:16 -0700
Subject: [PATCH 340/567] Add wildcards after JNI_OnLoad and JNI_OnUnload in
 linker script.

This avoids errors of the form

ld.lld: error: version script assignment of 'VERS_1.0' to symbol 'JNI_OnLoad' failed: symbol not defined
ld.lld: error: version script assignment of 'VERS_1.0' to symbol 'JNI_OnUnload' failed: symbol not defined

when the code being linked does not define JNI_OnLoad or JNI_OnUnload.

PiperOrigin-RevId: 568957558
---
 tensorflow/lite/java/src/main/native/version_script.lds | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tensorflow/lite/java/src/main/native/version_script.lds b/tensorflow/lite/java/src/main/native/version_script.lds
index 38c93dda730550..e8aadfd355191a 100644
--- a/tensorflow/lite/java/src/main/native/version_script.lds
+++ b/tensorflow/lite/java/src/main/native/version_script.lds
@@ -2,8 +2,8 @@ VERS_1.0 {
   # Export JNI symbols.
   global:
     Java_*;
-    JNI_OnLoad;
-    JNI_OnUnload;
+    JNI_OnLoad*;
+    JNI_OnUnload*;
 
   # Hide everything else.
   local:

From 84dbea0f5d61f772f7d4930b94b90fc737f85336 Mon Sep 17 00:00:00 2001
From: Fiona Lang <flang@google.com>
Date: Wed, 27 Sep 2023 14:22:47 -0700
Subject: [PATCH 341/567] Fix nn gradient registrations.

Ensure nn_fused_batch_grad and ctc_ops grads are always available by adding unused imports to nn_impl.py.

PiperOrigin-RevId: 568959466
---
 tensorflow/python/ops/BUILD      | 2 ++
 tensorflow/python/ops/nn_impl.py | 2 ++
 2 files changed, 4 insertions(+)

diff --git a/tensorflow/python/ops/BUILD b/tensorflow/python/ops/BUILD
index 7bc37182165395..dc013d882788ad 100644
--- a/tensorflow/python/ops/BUILD
+++ b/tensorflow/python/ops/BUILD
@@ -4342,10 +4342,12 @@ py_strict_library(
         ":array_ops_stack",
         ":candidate_sampling_ops",
         ":cond",
+        ":ctc_ops",
         ":custom_gradient",
         ":embedding_ops",
         ":linalg_ops",
         ":math_ops",
+        ":nn_fused_batch_norm_grad",
         ":nn_ops",
         ":nn_ops_gen",
         ":sparse_ops_gen",
diff --git a/tensorflow/python/ops/nn_impl.py b/tensorflow/python/ops/nn_impl.py
index 36630da7649fb6..230b0ea3b6368e 100644
--- a/tensorflow/python/ops/nn_impl.py
+++ b/tensorflow/python/ops/nn_impl.py
@@ -23,12 +23,14 @@
 from tensorflow.python.ops import array_ops_stack
 from tensorflow.python.ops import candidate_sampling_ops
 from tensorflow.python.ops import cond as tf_cond
+from tensorflow.python.ops import ctc_ops  # pylint: disable=unused-import
 from tensorflow.python.ops import custom_gradient
 from tensorflow.python.ops import embedding_ops
 from tensorflow.python.ops import gen_nn_ops
 from tensorflow.python.ops import gen_sparse_ops
 from tensorflow.python.ops import linalg_ops
 from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import nn_fused_batch_norm_grad  # pylint: disable=unused-import
 from tensorflow.python.ops import nn_ops
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import device_context

From c01b5622108d504d6bacd0022d01607037180acf Mon Sep 17 00:00:00 2001
From: Eugene Zhulenev <ezhulenev@google.com>
Date: Wed, 27 Sep 2023 14:24:58 -0700
Subject: [PATCH 342/567] [stream_executor] NFC: Rename cuda_gpu_executor to
 cuda_executor for consistency

No other cuda platform header or cc file has `gpu` in it.

PiperOrigin-RevId: 568960040
---
 third_party/xla/xla/service/gpu/BUILD         |  2 +-
 .../xla/xla/stream_executor/cuda/BUILD        | 26 +++++++++----------
 .../stream_executor/cuda/cuda_activation.h    |  2 +-
 .../xla/xla/stream_executor/cuda/cuda_blas.cc |  2 +-
 .../xla/xla/stream_executor/cuda/cuda_dnn.cc  |  2 +-
 .../xla/stream_executor/cuda/cuda_event.cc    |  2 +-
 ...{cuda_gpu_executor.cc => cuda_executor.cc} |  4 +--
 .../{cuda_gpu_executor.h => cuda_executor.h}  |  6 ++---
 .../xla/xla/stream_executor/cuda/cuda_fft.cc  |  2 +-
 .../xla/stream_executor/cuda/cuda_platform.cc |  2 +-
 third_party/xla/xla/stream_executor/gpu/BUILD |  2 +-
 11 files changed, 26 insertions(+), 26 deletions(-)
 rename third_party/xla/xla/stream_executor/cuda/{cuda_gpu_executor.cc => cuda_executor.cc} (99%)
 rename third_party/xla/xla/stream_executor/cuda/{cuda_gpu_executor.h => cuda_executor.h} (88%)

diff --git a/third_party/xla/xla/service/gpu/BUILD b/third_party/xla/xla/service/gpu/BUILD
index 914070c60edba2..277946771f3516 100644
--- a/third_party/xla/xla/service/gpu/BUILD
+++ b/third_party/xla/xla/service/gpu/BUILD
@@ -290,7 +290,7 @@ xla_cc_test(
         "@local_tsl//tsl/platform:test",
     ] + if_cuda_is_configured([
         "@local_config_cuda//cuda:cuda_headers",
-        "//xla/stream_executor/cuda:cuda_gpu_executor_header",
+        "//xla/stream_executor/cuda:cuda_executor_header",
     ]) + if_rocm_is_configured([
         "@local_config_rocm//rocm:rocm_headers",
         "//xla/stream_executor/rocm:rocm_gpu_executor_header",
diff --git a/third_party/xla/xla/stream_executor/cuda/BUILD b/third_party/xla/xla/stream_executor/cuda/BUILD
index fd617c9c9266a8..fc13f11efd413f 100644
--- a/third_party/xla/xla/stream_executor/cuda/BUILD
+++ b/third_party/xla/xla/stream_executor/cuda/BUILD
@@ -56,7 +56,7 @@ cc_library(
     deps = if_cuda_is_configured(
         [
             ":cuda_driver",
-            ":cuda_gpu_executor",
+            ":cuda_executor",
             ":cuda_platform_id",
             ":cuda_activation",
             "//xla/stream_executor",  # buildcleaner: keep
@@ -186,7 +186,7 @@ xla_cc_test(
 )
 
 # The activation library is tightly coupled to the executor library.
-# TODO(leary) split up cuda_gpu_executor.cc so that this can stand alone.
+# TODO(leary) split up cuda_executor.cc so that this can stand alone.
 cc_library(
     name = "cuda_activation_header",
     hdrs = ["cuda_activation.h"],
@@ -213,8 +213,8 @@ cc_library(
 )
 
 cc_library(
-    name = "cuda_gpu_executor_header",
-    textual_hdrs = if_cuda_is_configured(["cuda_gpu_executor.h"]),
+    name = "cuda_executor_header",
+    textual_hdrs = if_cuda_is_configured(["cuda_executor.h"]),
     visibility = ["//visibility:public"],
     deps = if_cuda_is_configured([
         ":cuda_kernel",
@@ -254,7 +254,7 @@ cc_library(
     deps = if_cuda_is_configured([
         ":cuda_activation",
         ":cuda_blas_utils",
-        ":cuda_gpu_executor",
+        ":cuda_executor",
         ":cuda_helpers",
         ":cuda_platform_id",
         ":cuda_stream",
@@ -305,7 +305,7 @@ cc_library(
     visibility = ["//visibility:public"],
     deps = if_cuda_is_configured([
         ":cuda_activation_header",
-        ":cuda_gpu_executor_header",
+        ":cuda_executor_header",
         ":cuda_platform_id",
         ":cuda_stream",
         ":cuda_helpers",
@@ -342,7 +342,7 @@ cc_library(
         ":cuda_activation",
         ":cuda_diagnostics",
         ":cuda_driver",
-        ":cuda_gpu_executor",
+        ":cuda_executor",
         ":cuda_platform_id",
         ":cuda_stream",
         "@com_google_absl//absl/base:core_headers",
@@ -415,7 +415,7 @@ cc_library(
     visibility = ["//visibility:public"],
     deps = if_cuda_is_configured([
         ":cuda_driver",
-        ":cuda_gpu_executor_header",
+        ":cuda_executor_header",
         ":cuda_stream",
         "//xla/stream_executor:stream_executor_headers",
         "//xla/stream_executor/gpu:gpu_event",
@@ -430,7 +430,7 @@ cc_library(
     visibility = ["//visibility:public"],
     deps = if_cuda_is_configured([
         ":cuda_driver",
-        ":cuda_gpu_executor_header",
+        ":cuda_executor_header",
         "//xla/stream_executor:stream_executor_headers",
         "//xla/stream_executor/gpu:gpu_stream",
         "//xla/stream_executor/platform",
@@ -454,9 +454,9 @@ cc_library(
 )
 
 cc_library(
-    name = "cuda_gpu_executor",
-    srcs = if_cuda_is_configured(["cuda_gpu_executor.cc"]),
-    hdrs = if_cuda_is_configured(["cuda_gpu_executor.h"]),
+    name = "cuda_executor",
+    srcs = if_cuda_is_configured(["cuda_executor.cc"]),
+    hdrs = if_cuda_is_configured(["cuda_executor.h"]),
     visibility = ["//visibility:public"],
     deps = if_cuda_is_configured([
         ":cuda_activation",
@@ -546,7 +546,7 @@ xla_cc_test(
     ],
     deps = [
         ":cuda_activation",
-        ":cuda_gpu_executor",
+        ":cuda_executor",
         "//xla/stream_executor",
         "//xla/stream_executor:device_memory_allocator",
         "//xla/stream_executor/gpu:gpu_asm_opts",
diff --git a/third_party/xla/xla/stream_executor/cuda/cuda_activation.h b/third_party/xla/xla/stream_executor/cuda/cuda_activation.h
index e7bc7f8680537e..bcdc8f5e178efe 100644
--- a/third_party/xla/xla/stream_executor/cuda/cuda_activation.h
+++ b/third_party/xla/xla/stream_executor/cuda/cuda_activation.h
@@ -17,7 +17,7 @@ limitations under the License.
 // It reaches into the CUDA implementation to activate an underlying CUDA
 // context.
 //
-// Having this file separate from cuda/cuda_gpu_executor.h means that dependent
+// Having this file separate from cuda/cuda_executor.h means that dependent
 // code does not also have to depend on cuda.h.
 
 #ifndef XLA_STREAM_EXECUTOR_CUDA_CUDA_ACTIVATION_H_
diff --git a/third_party/xla/xla/stream_executor/cuda/cuda_blas.cc b/third_party/xla/xla/stream_executor/cuda/cuda_blas.cc
index d9d5451eb912a7..831798624e46d8 100644
--- a/third_party/xla/xla/stream_executor/cuda/cuda_blas.cc
+++ b/third_party/xla/xla/stream_executor/cuda/cuda_blas.cc
@@ -25,7 +25,7 @@ limitations under the License.
 #include "third_party/gpus/cuda/include/cuda.h"
 #include "xla/stream_executor/cuda/cuda_activation.h"
 #include "xla/stream_executor/cuda/cuda_blas_utils.h"
-#include "xla/stream_executor/cuda/cuda_gpu_executor.h"
+#include "xla/stream_executor/cuda/cuda_executor.h"
 #include "xla/stream_executor/cuda/cuda_helpers.h"
 #include "xla/stream_executor/cuda/cuda_platform_id.h"
 #include "xla/stream_executor/cuda/cuda_stream.h"
diff --git a/third_party/xla/xla/stream_executor/cuda/cuda_dnn.cc b/third_party/xla/xla/stream_executor/cuda/cuda_dnn.cc
index 4ebad8290a80ee..8e371d60fb064b 100644
--- a/third_party/xla/xla/stream_executor/cuda/cuda_dnn.cc
+++ b/third_party/xla/xla/stream_executor/cuda/cuda_dnn.cc
@@ -39,7 +39,7 @@ limitations under the License.
 #include "xla/stream_executor/cuda/cuda_activation.h"
 #include "xla/stream_executor/cuda/cuda_diagnostics.h"
 #include "xla/stream_executor/cuda/cuda_driver.h"
-#include "xla/stream_executor/cuda/cuda_gpu_executor.h"
+#include "xla/stream_executor/cuda/cuda_executor.h"
 #include "xla/stream_executor/cuda/cuda_platform_id.h"
 #include "xla/stream_executor/cuda/cuda_stream.h"
 #include "xla/stream_executor/dnn.h"
diff --git a/third_party/xla/xla/stream_executor/cuda/cuda_event.cc b/third_party/xla/xla/stream_executor/cuda/cuda_event.cc
index 3909c9fba3fec2..8ca395e4c951ec 100644
--- a/third_party/xla/xla/stream_executor/cuda/cuda_event.cc
+++ b/third_party/xla/xla/stream_executor/cuda/cuda_event.cc
@@ -15,7 +15,7 @@ limitations under the License.
 
 #include "xla/stream_executor/cuda/cuda_event.h"
 
-#include "xla/stream_executor/cuda/cuda_gpu_executor.h"
+#include "xla/stream_executor/cuda/cuda_executor.h"
 #include "xla/stream_executor/cuda/cuda_stream.h"
 #include "tsl/platform/statusor.h"
 
diff --git a/third_party/xla/xla/stream_executor/cuda/cuda_gpu_executor.cc b/third_party/xla/xla/stream_executor/cuda/cuda_executor.cc
similarity index 99%
rename from third_party/xla/xla/stream_executor/cuda/cuda_gpu_executor.cc
rename to third_party/xla/xla/stream_executor/cuda/cuda_executor.cc
index 212cc6643a006f..970962372d2e4e 100644
--- a/third_party/xla/xla/stream_executor/cuda/cuda_gpu_executor.cc
+++ b/third_party/xla/xla/stream_executor/cuda/cuda_executor.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "xla/stream_executor/cuda/cuda_gpu_executor.h"
+#include "xla/stream_executor/cuda/cuda_executor.h"
 
 #include <cstdint>
 #include <memory>
@@ -1071,4 +1071,4 @@ GpuExecutor::CreateDeviceDescription(int device_ordinal) {
 
 }  // namespace stream_executor
 
-REGISTER_MODULE_INITIALIZER(cuda_gpu_executor, {});
+REGISTER_MODULE_INITIALIZER(cuda_executor, {});
diff --git a/third_party/xla/xla/stream_executor/cuda/cuda_gpu_executor.h b/third_party/xla/xla/stream_executor/cuda/cuda_executor.h
similarity index 88%
rename from third_party/xla/xla/stream_executor/cuda/cuda_gpu_executor.h
rename to third_party/xla/xla/stream_executor/cuda/cuda_executor.h
index e5270317b3e90b..8d7ee5d9cd3a0b 100644
--- a/third_party/xla/xla/stream_executor/cuda/cuda_gpu_executor.h
+++ b/third_party/xla/xla/stream_executor/cuda/cuda_executor.h
@@ -19,8 +19,8 @@ limitations under the License.
 // The notions from the StreamExecutor basically correspond to the CUDA streams
 // programming model provided by the libcuda.so driver APIs, so we don't have
 // to do much more than wrap the calls to the libraries appropriately.
-#ifndef XLA_STREAM_EXECUTOR_CUDA_CUDA_GPU_EXECUTOR_H_
-#define XLA_STREAM_EXECUTOR_CUDA_CUDA_GPU_EXECUTOR_H_
+#ifndef XLA_STREAM_EXECUTOR_CUDA_CUDA_EXECUTOR_H_
+#define XLA_STREAM_EXECUTOR_CUDA_CUDA_EXECUTOR_H_
 
 #include "xla/stream_executor/gpu/gpu_executor.h"
 
@@ -32,4 +32,4 @@ using CUDAExecutor = gpu::GpuExecutor;
 }  // namespace cuda
 }  // namespace stream_executor
 
-#endif  // XLA_STREAM_EXECUTOR_CUDA_CUDA_GPU_EXECUTOR_H_
+#endif  // XLA_STREAM_EXECUTOR_CUDA_CUDA_EXECUTOR_H_
diff --git a/third_party/xla/xla/stream_executor/cuda/cuda_fft.cc b/third_party/xla/xla/stream_executor/cuda/cuda_fft.cc
index f7eec72b14e3a5..4aa37fd6713908 100644
--- a/third_party/xla/xla/stream_executor/cuda/cuda_fft.cc
+++ b/third_party/xla/xla/stream_executor/cuda/cuda_fft.cc
@@ -21,7 +21,7 @@ limitations under the License.
 
 #include "absl/strings/str_cat.h"
 #include "xla/stream_executor/cuda/cuda_activation.h"
-#include "xla/stream_executor/cuda/cuda_gpu_executor.h"
+#include "xla/stream_executor/cuda/cuda_executor.h"
 #include "xla/stream_executor/cuda/cuda_helpers.h"
 #include "xla/stream_executor/cuda/cuda_platform_id.h"
 #include "xla/stream_executor/cuda/cuda_stream.h"
diff --git a/third_party/xla/xla/stream_executor/cuda/cuda_platform.cc b/third_party/xla/xla/stream_executor/cuda/cuda_platform.cc
index 73506391b24288..9e81e0eb8c9eb2 100644
--- a/third_party/xla/xla/stream_executor/cuda/cuda_platform.cc
+++ b/third_party/xla/xla/stream_executor/cuda/cuda_platform.cc
@@ -21,7 +21,7 @@ limitations under the License.
 #include "absl/strings/str_cat.h"
 #include "absl/strings/str_format.h"
 #include "xla/stream_executor/cuda/cuda_driver.h"
-#include "xla/stream_executor/cuda/cuda_gpu_executor.h"
+#include "xla/stream_executor/cuda/cuda_executor.h"
 #include "xla/stream_executor/cuda/cuda_platform_id.h"
 #include "xla/stream_executor/platform/initialize.h"
 #include "tsl/platform/errors.h"
diff --git a/third_party/xla/xla/stream_executor/gpu/BUILD b/third_party/xla/xla/stream_executor/gpu/BUILD
index 7ddab344718dc9..52d4762e20b55a 100644
--- a/third_party/xla/xla/stream_executor/gpu/BUILD
+++ b/third_party/xla/xla/stream_executor/gpu/BUILD
@@ -403,7 +403,7 @@ tsl_gpu_library(
     hdrs = ["gpu_cudamallocasync_allocator.h"],
     cuda_deps = [
         "//xla/stream_executor/cuda:cuda_activation",
-        "//xla/stream_executor/cuda:cuda_gpu_executor",
+        "//xla/stream_executor/cuda:cuda_executor",
     ],
     visibility = ["//visibility:public"],
     deps = [

From fc5cdebd8e9b04d22a6075b8902bc3d17918d204 Mon Sep 17 00:00:00 2001
From: Subhankar Shah <subhankarshah@google.com>
Date: Wed, 27 Sep 2023 14:27:24 -0700
Subject: [PATCH 343/567] [MemorySpaceAssignment] Remove
 memory_space_assignment from file names in memory_space_assignment folder.
 Move all MemorySpaceAssignment code to memory_space_assignment namespace.

PiperOrigin-RevId: 568960729
---
 third_party/xla/xla/service/BUILD             |  2 +-
 third_party/xla/xla/service/heap_simulator.cc |  8 ++--
 third_party/xla/xla/service/heap_simulator.h  |  4 +-
 .../xla/service/memory_space_assignment/BUILD | 40 +++++++++----------
 ...t_fit_repacker.cc => best_fit_repacker.cc} | 16 +++++---
 ...est_fit_repacker.h => best_fit_repacker.h} | 10 +++--
 ...cker_test.cc => best_fit_repacker_test.cc} | 31 ++++++++------
 .../memory_space_assignment.cc                |  6 +--
 .../memory_space_assignment.h                 |  2 +-
 .../memory_space_assignment_test.cc           |  3 +-
 ...ace_assignment_repacking.h => repacking.h} |  8 ++--
 ...gnment_tuning_utils.cc => tuning_utils.cc} |  3 +-
 ...signment_tuning_utils.h => tuning_utils.h} |  6 +--
 ...ory_space_assignment_utils.cc => utils.cc} |  4 +-
 ...emory_space_assignment_utils.h => utils.h} |  8 ++--
 15 files changed, 85 insertions(+), 66 deletions(-)
 rename third_party/xla/xla/service/memory_space_assignment/{memory_space_assignment_best_fit_repacker.cc => best_fit_repacker.cc} (97%)
 rename third_party/xla/xla/service/memory_space_assignment/{memory_space_assignment_best_fit_repacker.h => best_fit_repacker.h} (85%)
 rename third_party/xla/xla/service/memory_space_assignment/{memory_space_assignment_best_fit_repacker_test.cc => best_fit_repacker_test.cc} (90%)
 rename third_party/xla/xla/service/memory_space_assignment/{memory_space_assignment_repacking.h => repacking.h} (96%)
 rename third_party/xla/xla/service/memory_space_assignment/{memory_space_assignment_tuning_utils.cc => tuning_utils.cc} (92%)
 rename third_party/xla/xla/service/memory_space_assignment/{memory_space_assignment_tuning_utils.h => tuning_utils.h} (83%)
 rename third_party/xla/xla/service/memory_space_assignment/{memory_space_assignment_utils.cc => utils.cc} (96%)
 rename third_party/xla/xla/service/memory_space_assignment/{memory_space_assignment_utils.h => utils.h} (83%)

diff --git a/third_party/xla/xla/service/BUILD b/third_party/xla/xla/service/BUILD
index d31af5b43fc45a..4aa33f467ba7a0 100644
--- a/third_party/xla/xla/service/BUILD
+++ b/third_party/xla/xla/service/BUILD
@@ -1745,7 +1745,7 @@ cc_library(
         "//xla:util",
         "//xla/hlo/ir:hlo",
         "//xla/hlo/utils:hlo_live_range",
-        "//xla/service/memory_space_assignment:memory_space_assignment_repacking",
+        "//xla/service/memory_space_assignment:repacking",
         "@com_google_absl//absl/algorithm:container",
         "@com_google_absl//absl/container:btree",
         "@com_google_absl//absl/container:flat_hash_map",
diff --git a/third_party/xla/xla/service/heap_simulator.cc b/third_party/xla/xla/service/heap_simulator.cc
index d5e099d587397d..516c1e5b12a439 100644
--- a/third_party/xla/xla/service/heap_simulator.cc
+++ b/third_party/xla/xla/service/heap_simulator.cc
@@ -43,7 +43,7 @@ limitations under the License.
 #include "xla/hlo/ir/hlo_schedule.h"
 #include "xla/hlo/utils/hlo_live_range.h"
 #include "xla/map_util.h"
-#include "xla/service/memory_space_assignment/memory_space_assignment_repacking.h"
+#include "xla/service/memory_space_assignment/repacking.h"
 #include "xla/status.h"
 #include "xla/util.h"
 
@@ -757,7 +757,7 @@ bool BufferIntervalTree::Remove(int64_t start, int64_t end,
     //      /
     //    left
     if (root_ == to_delete) {
-      // Deleting root is simply reseting root;
+      // Deleting root is simply resetting root;
       root_ = to_delete->left;
       return true;
     }
@@ -1493,7 +1493,7 @@ Status GlobalDecreasingSizeBestFitHeap<BufferType>::SlicedAllocationFinder::
 
 namespace {
 
-// An iterator for iterating through permuations of slice times.
+// An iterator for iterating through permutations of slice times.
 class SliceTimePermutationIterator {
  public:
   explicit SliceTimePermutationIterator(int64_t latest_slice_time)
@@ -1890,7 +1890,7 @@ ChooseBestHeapAlgorithm<BufferType>::Finish() {
 
 template class GlobalDecreasingSizeBestFitHeap<HloValue>;
 template class GlobalDecreasingSizeBestFitHeap<
-    MemorySpaceAssignmentRepacker::AllocationBlock>;
+    memory_space_assignment::MemorySpaceAssignmentRepacker::AllocationBlock>;
 template class ChooseBestHeapAlgorithm<HloValue>;
 
 }  // namespace xla
diff --git a/third_party/xla/xla/service/heap_simulator.h b/third_party/xla/xla/service/heap_simulator.h
index 882300f979b19c..14f80a135c8051 100644
--- a/third_party/xla/xla/service/heap_simulator.h
+++ b/third_party/xla/xla/service/heap_simulator.h
@@ -46,7 +46,7 @@ limitations under the License.
 #include "xla/service/hlo_buffer.h"
 #include "xla/service/hlo_dataflow_analysis.h"
 #include "xla/service/hlo_ordering.h"
-#include "xla/service/memory_space_assignment/memory_space_assignment_repacking.h"
+#include "xla/service/memory_space_assignment/repacking.h"
 #include "xla/service/tuple_points_to_analysis.h"
 #include "xla/statusor.h"
 
@@ -901,7 +901,7 @@ class ChooseBestHeapAlgorithm : public HeapAlgorithm<BufferType> {
 
 extern template class GlobalDecreasingSizeBestFitHeap<HloValue>;
 extern template class GlobalDecreasingSizeBestFitHeap<
-    MemorySpaceAssignmentRepacker::AllocationBlock>;
+    memory_space_assignment::MemorySpaceAssignmentRepacker::AllocationBlock>;
 extern template class ChooseBestHeapAlgorithm<HloValue>;
 
 }  // namespace xla
diff --git a/third_party/xla/xla/service/memory_space_assignment/BUILD b/third_party/xla/xla/service/memory_space_assignment/BUILD
index cd54c1fb8df285..595124884d0f3a 100644
--- a/third_party/xla/xla/service/memory_space_assignment/BUILD
+++ b/third_party/xla/xla/service/memory_space_assignment/BUILD
@@ -38,9 +38,9 @@ cc_library(
     visibility = ["//visibility:public"],
     deps = [
         ":memory_space_assignment_proto_cc",
-        ":memory_space_assignment_repacking",
-        ":memory_space_assignment_tuning_utils",
-        ":memory_space_assignment_utils",
+        ":repacking",
+        ":tuning_utils",
+        ":utils",
         "//xla:debug_options_flags",
         "//xla:shape_util",
         "//xla:status",
@@ -73,7 +73,7 @@ xla_cc_test(
     deps = [
         ":memory_space_assignment",
         ":memory_space_assignment_proto_cc",
-        ":memory_space_assignment_repacking",
+        ":repacking",
         "//xla:shape_util",
         "//xla:status",
         "//xla:util",
@@ -104,8 +104,8 @@ xla_cc_test(
 )
 
 cc_library(
-    name = "memory_space_assignment_repacking",
-    hdrs = ["memory_space_assignment_repacking.h"],
+    name = "repacking",
+    hdrs = ["repacking.h"],
     visibility = ["//visibility:public"],
     deps = [
         "//xla:statusor",
@@ -117,12 +117,12 @@ cc_library(
 )
 
 cc_library(
-    name = "memory_space_assignment_best_fit_repacker",
-    srcs = ["memory_space_assignment_best_fit_repacker.cc"],
-    hdrs = ["memory_space_assignment_best_fit_repacker.h"],
+    name = "best_fit_repacker",
+    srcs = ["best_fit_repacker.cc"],
+    hdrs = ["best_fit_repacker.h"],
     visibility = ["//visibility:public"],
     deps = [
-        ":memory_space_assignment_repacking",
+        ":repacking",
         "//xla:comparison_util",
         "//xla:statusor",
         "//xla/service:heap_simulator",
@@ -136,9 +136,9 @@ cc_library(
 )
 
 cc_library(
-    name = "memory_space_assignment_utils",
-    srcs = ["memory_space_assignment_utils.cc"],
-    hdrs = ["memory_space_assignment_utils.h"],
+    name = "utils",
+    srcs = ["utils.cc"],
+    hdrs = ["utils.h"],
     visibility = ["//visibility:public"],
     deps = [
         "//xla/hlo/ir:hlo",
@@ -147,22 +147,22 @@ cc_library(
 )
 
 cc_library(
-    name = "memory_space_assignment_tuning_utils",
-    srcs = ["memory_space_assignment_tuning_utils.cc"],
-    hdrs = ["memory_space_assignment_tuning_utils.h"],
+    name = "tuning_utils",
+    srcs = ["tuning_utils.cc"],
+    hdrs = ["tuning_utils.h"],
     visibility = ["//visibility:public"],
     deps = [
-        ":memory_space_assignment_utils",
         "//xla/hlo/ir:hlo",
         "//xla/service:heap_simulator",
     ],
 )
 
 xla_cc_test(
-    name = "memory_space_assignment_best_fit_repacker_test",
-    srcs = ["memory_space_assignment_best_fit_repacker_test.cc"],
+    name = "best_fit_repacker_test",
+    srcs = ["best_fit_repacker_test.cc"],
     deps = [
-        ":memory_space_assignment_best_fit_repacker",
+        ":best_fit_repacker",
+        ":repacking",
         "//xla:comparison_util",
         "//xla/tests:xla_internal_test_main",
         "@com_google_absl//absl/container:flat_hash_map",
diff --git a/third_party/xla/xla/service/memory_space_assignment/memory_space_assignment_best_fit_repacker.cc b/third_party/xla/xla/service/memory_space_assignment/best_fit_repacker.cc
similarity index 97%
rename from third_party/xla/xla/service/memory_space_assignment/memory_space_assignment_best_fit_repacker.cc
rename to third_party/xla/xla/service/memory_space_assignment/best_fit_repacker.cc
index 35d2428c7c7572..1215116670d538 100644
--- a/third_party/xla/xla/service/memory_space_assignment/memory_space_assignment_best_fit_repacker.cc
+++ b/third_party/xla/xla/service/memory_space_assignment/best_fit_repacker.cc
@@ -113,7 +113,7 @@ Step 5: Update AllocationBlocks with the repacking placements
     Callers extract that data to get the repacking locations.
 */
 
-#include "xla/service/memory_space_assignment/memory_space_assignment_best_fit_repacker.h"
+#include "xla/service/memory_space_assignment/best_fit_repacker.h"
 
 #include <algorithm>
 #include <cstdint>
@@ -131,17 +131,19 @@ Step 5: Update AllocationBlocks with the repacking placements
 #include "absl/types/span.h"
 #include "xla/comparison_util.h"
 #include "xla/service/heap_simulator.h"
+#include "xla/service/memory_space_assignment/repacking.h"
 #include "xla/statusor.h"
 #include "tsl/platform/logging.h"
 
 namespace xla {
 namespace {
 
-using AllocationBlock = MemorySpaceAssignmentRepacker::AllocationBlock;
+using AllocationBlock =
+    memory_space_assignment::MemorySpaceAssignmentRepacker::AllocationBlock;
 using Type = GlobalDecreasingSizeBestFitHeap<AllocationBlock>::Type;
-using SlicedAllocationData =
+using SlicedAllocationData = memory_space_assignment::
     MemorySpaceAssignmentRepacker::SlicedAllocationData;
-using Slice = MemorySpaceAssignmentRepacker::Slice;
+using Slice = memory_space_assignment::MemorySpaceAssignmentRepacker::Slice;
 
 bool IsSliced(const AllocationBlock* block) {
   return block->original_slice_data.has_value();
@@ -167,7 +169,8 @@ class BestFitRepacker
     : public GlobalDecreasingSizeBestFitHeap<AllocationBlock> {
  public:
   BestFitRepacker(
-      const MemorySpaceAssignmentBestFitRepacker::BestFitRepackOptions& options,
+      const memory_space_assignment::MemorySpaceAssignmentBestFitRepacker::
+          BestFitRepackOptions& options,
       int64_t max_size, int64_t alignment)
       : GlobalDecreasingSizeBestFitHeap<AllocationBlock>(
             alignment, kCustom,
@@ -608,6 +611,8 @@ class BestFitRepacker
 
 }  // namespace
 
+namespace memory_space_assignment {
+
 StatusOr<bool> MemorySpaceAssignmentBestFitRepacker::Repack(
     absl::Span<AllocationBlock*> allocations) {
   BestFitRepacker best_fit_repacker =
@@ -616,4 +621,5 @@ StatusOr<bool> MemorySpaceAssignmentBestFitRepacker::Repack(
   return best_fit_repacker.Repack();
 }
 
+}  // namespace memory_space_assignment
 }  // namespace xla
diff --git a/third_party/xla/xla/service/memory_space_assignment/memory_space_assignment_best_fit_repacker.h b/third_party/xla/xla/service/memory_space_assignment/best_fit_repacker.h
similarity index 85%
rename from third_party/xla/xla/service/memory_space_assignment/memory_space_assignment_best_fit_repacker.h
rename to third_party/xla/xla/service/memory_space_assignment/best_fit_repacker.h
index 39be380eae599b..eb69edb1ce93c9 100644
--- a/third_party/xla/xla/service/memory_space_assignment/memory_space_assignment_best_fit_repacker.h
+++ b/third_party/xla/xla/service/memory_space_assignment/best_fit_repacker.h
@@ -13,17 +13,18 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef XLA_SERVICE_MEMORY_SPACE_ASSIGNMENT_MEMORY_SPACE_ASSIGNMENT_BEST_FIT_REPACKER_H_
-#define XLA_SERVICE_MEMORY_SPACE_ASSIGNMENT_MEMORY_SPACE_ASSIGNMENT_BEST_FIT_REPACKER_H_
+#ifndef XLA_SERVICE_MEMORY_SPACE_ASSIGNMENT_BEST_FIT_REPACKER_H_
+#define XLA_SERVICE_MEMORY_SPACE_ASSIGNMENT_BEST_FIT_REPACKER_H_
 
 #include <cstdint>
 
 #include "absl/types/span.h"
 #include "xla/service/heap_simulator.h"
-#include "xla/service/memory_space_assignment/memory_space_assignment_repacking.h"
+#include "xla/service/memory_space_assignment/repacking.h"
 #include "xla/statusor.h"
 
 namespace xla {
+namespace memory_space_assignment {
 
 // This is a repacker algorithm that wraps around best fit heap algorithm in
 // heap simulator.
@@ -57,6 +58,7 @@ class MemorySpaceAssignmentBestFitRepacker
   BestFitRepackOptions options_;
 };
 
+}  // namespace memory_space_assignment
 }  // namespace xla
 
-#endif  // XLA_SERVICE_MEMORY_SPACE_ASSIGNMENT_MEMORY_SPACE_ASSIGNMENT_BEST_FIT_REPACKER_H_
+#endif  // XLA_SERVICE_MEMORY_SPACE_ASSIGNMENT_BEST_FIT_REPACKER_H_
diff --git a/third_party/xla/xla/service/memory_space_assignment/memory_space_assignment_best_fit_repacker_test.cc b/third_party/xla/xla/service/memory_space_assignment/best_fit_repacker_test.cc
similarity index 90%
rename from third_party/xla/xla/service/memory_space_assignment/memory_space_assignment_best_fit_repacker_test.cc
rename to third_party/xla/xla/service/memory_space_assignment/best_fit_repacker_test.cc
index 70a05e78abcaca..4ff2e55e6fc1fe 100644
--- a/third_party/xla/xla/service/memory_space_assignment/memory_space_assignment_best_fit_repacker_test.cc
+++ b/third_party/xla/xla/service/memory_space_assignment/best_fit_repacker_test.cc
@@ -13,23 +13,25 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "xla/service/memory_space_assignment/memory_space_assignment_best_fit_repacker.h"
+#include "xla/service/memory_space_assignment/best_fit_repacker.h"
 
 #include <cstdint>
 
 #include "absl/container/flat_hash_map.h"
 #include "absl/types/span.h"
 #include "xla/comparison_util.h"
+#include "xla/service/memory_space_assignment/repacking.h"
 #include "tsl/platform/test.h"
 
 namespace xla {
 
 class MemorySpaceAssignmentBestFitRepackerTest : public ::testing::Test {
  protected:
-  using AllocationBlock = MemorySpaceAssignmentRepacker::AllocationBlock;
-  using SlicedAllocationData =
+  using AllocationBlock =
+      memory_space_assignment::MemorySpaceAssignmentRepacker::AllocationBlock;
+  using SlicedAllocationData = memory_space_assignment::
       MemorySpaceAssignmentRepacker::SlicedAllocationData;
-  using Slice = MemorySpaceAssignmentRepacker::Slice;
+  using Slice = memory_space_assignment::MemorySpaceAssignmentRepacker::Slice;
 
   MemorySpaceAssignmentBestFitRepackerTest() : repacker_(100, 1, options_) {}
 
@@ -50,9 +52,10 @@ class MemorySpaceAssignmentBestFitRepackerTest : public ::testing::Test {
   }
 
   std::list<AllocationBlock> allocation_blocks_;
-  MemorySpaceAssignmentBestFitRepacker::BestFitRepackOptions options_{
-      /*validate=*/true, /*buffer_interval_compare=*/nullptr};
-  MemorySpaceAssignmentBestFitRepacker repacker_;
+  memory_space_assignment::MemorySpaceAssignmentBestFitRepacker::
+      BestFitRepackOptions options_{/*validate=*/true,
+                                    /*buffer_interval_compare=*/nullptr};
+  memory_space_assignment::MemorySpaceAssignmentBestFitRepacker repacker_;
 };
 
 TEST_F(MemorySpaceAssignmentBestFitRepackerTest, Simple) {
@@ -163,11 +166,12 @@ TEST_F(MemorySpaceAssignmentBestFitRepackerTest, RepackedSlicesFit) {
     sort_keys[allocation_blocks[i]] = i;
   }
   options_.buffer_interval_compare = LessThanByKey(
-      [sort_keys](
-          const MemorySpaceAssignmentBestFitRepacker::BufferInterval& x) {
+      [sort_keys](const memory_space_assignment::
+                      MemorySpaceAssignmentBestFitRepacker::BufferInterval& x) {
         return sort_keys.at(x.buffer);
       });
-  repacker_ = MemorySpaceAssignmentBestFitRepacker(100, 1, options_);
+  repacker_ = memory_space_assignment::MemorySpaceAssignmentBestFitRepacker(
+      100, 1, options_);
 
   EXPECT_TRUE(*repacker_.Repack(absl::MakeSpan(allocation_blocks)));
 
@@ -251,11 +255,12 @@ TEST_F(MemorySpaceAssignmentBestFitRepackerTest, SlicedColocationsFit) {
     sort_keys[allocation_blocks[i]] = i;
   }
   options_.buffer_interval_compare = LessThanByKey(
-      [sort_keys](
-          const MemorySpaceAssignmentBestFitRepacker::BufferInterval& x) {
+      [sort_keys](const memory_space_assignment::
+                      MemorySpaceAssignmentBestFitRepacker::BufferInterval& x) {
         return sort_keys.at(x.buffer);
       });
-  repacker_ = MemorySpaceAssignmentBestFitRepacker(100, 1, options_);
+  repacker_ = memory_space_assignment::MemorySpaceAssignmentBestFitRepacker(
+      100, 1, options_);
 
   EXPECT_TRUE(*repacker_.Repack(absl::MakeSpan(allocation_blocks)));
 
diff --git a/third_party/xla/xla/service/memory_space_assignment/memory_space_assignment.cc b/third_party/xla/xla/service/memory_space_assignment/memory_space_assignment.cc
index 317162b005a125..bf051d3a546cb3 100644
--- a/third_party/xla/xla/service/memory_space_assignment/memory_space_assignment.cc
+++ b/third_party/xla/xla/service/memory_space_assignment/memory_space_assignment.cc
@@ -46,9 +46,9 @@ limitations under the License.
 #include "xla/hlo/utils/hlo_live_range.h"
 #include "xla/service/heap_simulator.h"
 #include "xla/service/hlo_value.h"
-#include "xla/service/memory_space_assignment/memory_space_assignment_repacking.h"
-#include "xla/service/memory_space_assignment/memory_space_assignment_tuning_utils.h"
-#include "xla/service/memory_space_assignment/memory_space_assignment_utils.h"
+#include "xla/service/memory_space_assignment/repacking.h"
+#include "xla/service/memory_space_assignment/tuning_utils.h"
+#include "xla/service/memory_space_assignment/utils.h"
 #include "xla/service/tuple_util.h"
 #include "xla/shape.h"
 #include "xla/shape_util.h"
diff --git a/third_party/xla/xla/service/memory_space_assignment/memory_space_assignment.h b/third_party/xla/xla/service/memory_space_assignment/memory_space_assignment.h
index e988d598332b6c..384c5bc46d985b 100644
--- a/third_party/xla/xla/service/memory_space_assignment/memory_space_assignment.h
+++ b/third_party/xla/xla/service/memory_space_assignment/memory_space_assignment.h
@@ -43,7 +43,7 @@ limitations under the License.
 #include "xla/service/hlo_cost_analysis.h"
 #include "xla/service/hlo_value.h"
 #include "xla/service/memory_space_assignment/memory_space_assignment.pb.h"
-#include "xla/service/memory_space_assignment/memory_space_assignment_repacking.h"
+#include "xla/service/memory_space_assignment/repacking.h"
 #include "xla/shape.h"
 #include "xla/statusor.h"
 
diff --git a/third_party/xla/xla/service/memory_space_assignment/memory_space_assignment_test.cc b/third_party/xla/xla/service/memory_space_assignment/memory_space_assignment_test.cc
index 4032ec52bc7f36..381470ce021317 100644
--- a/third_party/xla/xla/service/memory_space_assignment/memory_space_assignment_test.cc
+++ b/third_party/xla/xla/service/memory_space_assignment/memory_space_assignment_test.cc
@@ -54,7 +54,7 @@ limitations under the License.
 #include "xla/service/hlo_value.h"
 #include "xla/service/instruction_hoister.h"
 #include "xla/service/memory_space_assignment/memory_space_assignment.pb.h"
-#include "xla/service/memory_space_assignment/memory_space_assignment_repacking.h"
+#include "xla/service/memory_space_assignment/repacking.h"
 #include "xla/shape.h"
 #include "xla/shape_util.h"
 #include "xla/status.h"
@@ -81,6 +81,7 @@ using memory_space_assignment::MemoryBoundLoopOptimizer;
 using memory_space_assignment::MemoryBoundLoopOptimizerOptions;
 using memory_space_assignment::MemorySpaceAssignment;
 using memory_space_assignment::MemorySpaceAssignmentCostAnalysis;
+using memory_space_assignment::MemorySpaceAssignmentRepacker;
 using memory_space_assignment::Options;
 using memory_space_assignment::PrefetchIntervalPicker;
 using memory_space_assignment::PresetAssignments;
diff --git a/third_party/xla/xla/service/memory_space_assignment/memory_space_assignment_repacking.h b/third_party/xla/xla/service/memory_space_assignment/repacking.h
similarity index 96%
rename from third_party/xla/xla/service/memory_space_assignment/memory_space_assignment_repacking.h
rename to third_party/xla/xla/service/memory_space_assignment/repacking.h
index e26d5ecf5643cc..0379fc79483230 100644
--- a/third_party/xla/xla/service/memory_space_assignment/memory_space_assignment_repacking.h
+++ b/third_party/xla/xla/service/memory_space_assignment/repacking.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef XLA_SERVICE_MEMORY_SPACE_ASSIGNMENT_MEMORY_SPACE_ASSIGNMENT_REPACKING_H_
-#define XLA_SERVICE_MEMORY_SPACE_ASSIGNMENT_MEMORY_SPACE_ASSIGNMENT_REPACKING_H_
+#ifndef XLA_SERVICE_MEMORY_SPACE_ASSIGNMENT_REPACKING_H_
+#define XLA_SERVICE_MEMORY_SPACE_ASSIGNMENT_REPACKING_H_
 
 #include <cstdint>
 #include <optional>
@@ -30,6 +30,7 @@ limitations under the License.
 #include "xla/types.h"
 
 namespace xla {
+namespace memory_space_assignment {
 
 // An interface to define allocation repacking algorithms.
 class MemorySpaceAssignmentRepacker {
@@ -158,6 +159,7 @@ class MemorySpaceAssignmentRepacker {
   int64_t alignment_;
 };
 
+}  // namespace memory_space_assignment
 }  // namespace xla
 
-#endif  // XLA_SERVICE_MEMORY_SPACE_ASSIGNMENT_MEMORY_SPACE_ASSIGNMENT_REPACKING_H_
+#endif  // XLA_SERVICE_MEMORY_SPACE_ASSIGNMENT_REPACKING_H_
diff --git a/third_party/xla/xla/service/memory_space_assignment/memory_space_assignment_tuning_utils.cc b/third_party/xla/xla/service/memory_space_assignment/tuning_utils.cc
similarity index 92%
rename from third_party/xla/xla/service/memory_space_assignment/memory_space_assignment_tuning_utils.cc
rename to third_party/xla/xla/service/memory_space_assignment/tuning_utils.cc
index 2e48eaeb0f670f..3abbcc68f3a059 100644
--- a/third_party/xla/xla/service/memory_space_assignment/memory_space_assignment_tuning_utils.cc
+++ b/third_party/xla/xla/service/memory_space_assignment/tuning_utils.cc
@@ -13,9 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "xla/service/memory_space_assignment/memory_space_assignment_tuning_utils.h"
+#include "xla/service/memory_space_assignment/tuning_utils.h"
 
-#include "xla/service/memory_space_assignment/memory_space_assignment_utils.h"
 namespace xla {
 
 namespace memory_space_assignment {
diff --git a/third_party/xla/xla/service/memory_space_assignment/memory_space_assignment_tuning_utils.h b/third_party/xla/xla/service/memory_space_assignment/tuning_utils.h
similarity index 83%
rename from third_party/xla/xla/service/memory_space_assignment/memory_space_assignment_tuning_utils.h
rename to third_party/xla/xla/service/memory_space_assignment/tuning_utils.h
index ce6230982bd5b0..749b4445e4be9a 100644
--- a/third_party/xla/xla/service/memory_space_assignment/memory_space_assignment_tuning_utils.h
+++ b/third_party/xla/xla/service/memory_space_assignment/tuning_utils.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef XLA_SERVICE_MEMORY_SPACE_ASSIGNMENT_MEMORY_SPACE_ASSIGNMENT_TUNING_UTILS_H_
-#define XLA_SERVICE_MEMORY_SPACE_ASSIGNMENT_MEMORY_SPACE_ASSIGNMENT_TUNING_UTILS_H_
+#ifndef XLA_SERVICE_MEMORY_SPACE_ASSIGNMENT_TUNING_UTILS_H_
+#define XLA_SERVICE_MEMORY_SPACE_ASSIGNMENT_TUNING_UTILS_H_
 
 #include "xla/hlo/ir/hlo_module.h"
 #include "xla/service/heap_simulator.h"
@@ -35,4 +35,4 @@ void CustomizeSortedBufferInterval(
 }  // namespace memory_space_assignment
 }  // namespace xla
 
-#endif  // XLA_SERVICE_MEMORY_SPACE_ASSIGNMENT_MEMORY_SPACE_ASSIGNMENT_TUNING_UTILS_H_
+#endif  // XLA_SERVICE_MEMORY_SPACE_ASSIGNMENT_TUNING_UTILS_H_
diff --git a/third_party/xla/xla/service/memory_space_assignment/memory_space_assignment_utils.cc b/third_party/xla/xla/service/memory_space_assignment/utils.cc
similarity index 96%
rename from third_party/xla/xla/service/memory_space_assignment/memory_space_assignment_utils.cc
rename to third_party/xla/xla/service/memory_space_assignment/utils.cc
index 03f1621dfbaea9..2b1eff9a074865 100644
--- a/third_party/xla/xla/service/memory_space_assignment/memory_space_assignment_utils.cc
+++ b/third_party/xla/xla/service/memory_space_assignment/utils.cc
@@ -13,12 +13,13 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "xla/service/memory_space_assignment/memory_space_assignment_utils.h"
+#include "xla/service/memory_space_assignment/utils.h"
 
 #include "xla/hlo/ir/hlo_casting_utils.h"
 #include "xla/hlo/ir/hlo_instructions.h"
 
 namespace xla {
+namespace memory_space_assignment {
 
 bool MemorySpaceAssignmentUtils::IsValueAllowedInAlternateMemory(
     const HloValue* value) {
@@ -90,4 +91,5 @@ bool MemorySpaceAssignmentUtils::IsIntervalAllowedInAlternateMemory(
          absl::c_all_of(interval.colocations, IsValueAllowedInAlternateMemory);
 }
 
+}  // namespace memory_space_assignment
 }  // namespace xla
diff --git a/third_party/xla/xla/service/memory_space_assignment/memory_space_assignment_utils.h b/third_party/xla/xla/service/memory_space_assignment/utils.h
similarity index 83%
rename from third_party/xla/xla/service/memory_space_assignment/memory_space_assignment_utils.h
rename to third_party/xla/xla/service/memory_space_assignment/utils.h
index e4cf6f45b51517..c2d76208a28e18 100644
--- a/third_party/xla/xla/service/memory_space_assignment/memory_space_assignment_utils.h
+++ b/third_party/xla/xla/service/memory_space_assignment/utils.h
@@ -13,12 +13,13 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef XLA_SERVICE_MEMORY_SPACE_ASSIGNMENT_MEMORY_SPACE_ASSIGNMENT_UTILS_H_
-#define XLA_SERVICE_MEMORY_SPACE_ASSIGNMENT_MEMORY_SPACE_ASSIGNMENT_UTILS_H_
+#ifndef XLA_SERVICE_MEMORY_SPACE_ASSIGNMENT_UTILS_H_
+#define XLA_SERVICE_MEMORY_SPACE_ASSIGNMENT_UTILS_H_
 
 #include "xla/service/heap_simulator.h"
 
 namespace xla {
+namespace memory_space_assignment {
 
 // Encapsulates common utility methods for memory space assignment.
 class MemorySpaceAssignmentUtils {
@@ -33,6 +34,7 @@ class MemorySpaceAssignmentUtils {
   static bool IsValueAllowedInAlternateMemory(const HloValue* value);
 };
 
+}  // namespace memory_space_assignment
 }  // namespace xla
 
-#endif  // XLA_SERVICE_MEMORY_SPACE_ASSIGNMENT_MEMORY_SPACE_ASSIGNMENT_UTILS_H_
+#endif  // XLA_SERVICE_MEMORY_SPACE_ASSIGNMENT_UTILS_H_

From 88a60479b5a873049ec40cacd0c5ad402f54e6ad Mon Sep 17 00:00:00 2001
From: Yishuang Pang <ypang@google.com>
Date: Wed, 27 Sep 2023 14:33:01 -0700
Subject: [PATCH 344/567] Legalize some MHLO broadcasted compare ops to TF
 GreaterEqual/Greater/LessEqual/Less op directly. tf.BroadcastTo op folder
 folds constants by default, which would increase the size of models converted
 from StableHLO because StableHLO requires explicit broadcasting. This change
 helps reduce model size by removing unnecessary broadcasts at legalization
 stage.

PiperOrigin-RevId: 568962588
---
 .../lite/stablehlo/tests/legalize_hlo.mlir    | 64 ++++++++++++++++---
 .../transforms/legalize_hlo_patterns.td       | 21 +++++-
 2 files changed, 75 insertions(+), 10 deletions(-)

diff --git a/tensorflow/compiler/mlir/lite/stablehlo/tests/legalize_hlo.mlir b/tensorflow/compiler/mlir/lite/stablehlo/tests/legalize_hlo.mlir
index 2df637ef62f263..669172daa30f0a 100644
--- a/tensorflow/compiler/mlir/lite/stablehlo/tests/legalize_hlo.mlir
+++ b/tensorflow/compiler/mlir/lite/stablehlo/tests/legalize_hlo.mlir
@@ -737,12 +737,24 @@ func.func @greater(%arg0: tensor<2xi32>, %arg1: tensor<2xi32>) -> tensor<2xi1> {
 }
 
 // CHECK-LABEL:   func @broadcast_greater(
-// CHECK-SAME:                            %[[VAL_0:.*]]: tensor<1xi32>,
+// CHECK-SAME:                            %[[VAL_0:.*]]: tensor<1x1xi32>,
 // CHECK-SAME:                            %[[VAL_1:.*]]: tensor<1x2xi32>) -> tensor<1x2xi1> {
+// CHECK:           %[[VAL_2:.*]] = "tf.Greater"(%[[VAL_0]], %[[VAL_1]]) : (tensor<1x1xi32>, tensor<1x2xi32>) -> tensor<1x2xi1>
+// CHECK:           return %[[VAL_2]] : tensor<1x2xi1>
+// CHECK:         }
+func.func @broadcast_greater(%arg0: tensor<1x1xi32>, %arg1: tensor<1x2xi32>) -> tensor<1x2xi1> {
+  %0 = "mhlo.broadcast_in_dim"(%arg0) {broadcast_dimensions = dense<[0, 1]> : tensor<2xi64>} : (tensor<1x1xi32>) -> tensor<1x2xi32>
+  %1 = "mhlo.compare"(%0, %arg1) {comparison_direction = #mhlo<comparison_direction GT>} : (tensor<1x2xi32>, tensor<1x2xi32>) -> tensor<1x2xi1>
+  func.return %1 : tensor<1x2xi1>
+}
+
+// CHECK-LABEL:   func @broadcast_greater_chlo(
+// CHECK-SAME:                                 %[[VAL_0:.*]]: tensor<1xi32>,
+// CHECK-SAME:                                 %[[VAL_1:.*]]: tensor<1x2xi32>) -> tensor<1x2xi1> {
 // CHECK:           %[[VAL_2:.*]] = "tf.Greater"(%[[VAL_0]], %[[VAL_1]]) : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi1>
 // CHECK:           return %[[VAL_2]] : tensor<1x2xi1>
 // CHECK:         }
-func.func @broadcast_greater(%arg0: tensor<1xi32>, %arg1: tensor<1x2xi32>) -> tensor<1x2xi1> {
+func.func @broadcast_greater_chlo(%arg0: tensor<1xi32>, %arg1: tensor<1x2xi32>) -> tensor<1x2xi1> {
   %0 = "chlo.broadcast_compare"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>, comparison_direction = #chlo<comparison_direction GT>} : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi1>
   func.return %0 : tensor<1x2xi1>
 }
@@ -766,12 +778,24 @@ func.func @greater_equal(%arg0: tensor<2xi32>, %arg1: tensor<2xi32>) -> tensor<2
 }
 
 // CHECK-LABEL:   func @broadcast_greater_equal(
-// CHECK-SAME:                                  %[[VAL_0:.*]]: tensor<1xi32>,
+// CHECK-SAME:                                  %[[VAL_0:.*]]: tensor<1x1xi32>,
 // CHECK-SAME:                                  %[[VAL_1:.*]]: tensor<1x2xi32>) -> tensor<1x2xi1> {
+// CHECK:           %[[VAL_2:.*]] = "tf.GreaterEqual"(%[[VAL_0]], %[[VAL_1]]) : (tensor<1x1xi32>, tensor<1x2xi32>) -> tensor<1x2xi1>
+// CHECK:           return %[[VAL_2]] : tensor<1x2xi1>
+// CHECK:         }
+func.func @broadcast_greater_equal(%arg0: tensor<1x1xi32>, %arg1: tensor<1x2xi32>) -> tensor<1x2xi1> {
+  %0 = "mhlo.broadcast_in_dim"(%arg0) {broadcast_dimensions = dense<[0, 1]> : tensor<2xi64>} : (tensor<1x1xi32>) -> tensor<1x2xi32>
+  %1 = "mhlo.compare"(%0, %arg1) {comparison_direction = #mhlo<comparison_direction GE>} : (tensor<1x2xi32>, tensor<1x2xi32>) -> tensor<1x2xi1>
+  func.return %1 : tensor<1x2xi1>
+}
+
+// CHECK-LABEL:   func @broadcast_greater_equal_chlo(
+// CHECK-SAME:                                       %[[VAL_0:.*]]: tensor<1xi32>,
+// CHECK-SAME:                                       %[[VAL_1:.*]]: tensor<1x2xi32>) -> tensor<1x2xi1> {
 // CHECK:           %[[VAL_2:.*]] = "tf.GreaterEqual"(%[[VAL_0]], %[[VAL_1]]) : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi1>
 // CHECK:           return %[[VAL_2]] : tensor<1x2xi1>
 // CHECK:         }
-func.func @broadcast_greater_equal(%arg0: tensor<1xi32>, %arg1: tensor<1x2xi32>) -> tensor<1x2xi1> {
+func.func @broadcast_greater_equal_chlo(%arg0: tensor<1xi32>, %arg1: tensor<1x2xi32>) -> tensor<1x2xi1> {
   %0 = "chlo.broadcast_compare"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>, comparison_direction = #chlo<comparison_direction GE>} : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi1>
   func.return %0 : tensor<1x2xi1>
 }
@@ -788,12 +812,24 @@ func.func @less(%arg0: tensor<2xi32>, %arg1: tensor<2xi32>) -> tensor<2xi1> {
 }
 
 // CHECK-LABEL:   func @broadcast_less(
-// CHECK-SAME:                         %[[VAL_0:.*]]: tensor<1xi32>,
+// CHECK-SAME:                         %[[VAL_0:.*]]: tensor<1x1xi32>,
 // CHECK-SAME:                         %[[VAL_1:.*]]: tensor<1x2xi32>) -> tensor<1x2xi1> {
+// CHECK:           %[[VAL_2:.*]] = "tf.Less"(%[[VAL_0]], %[[VAL_1]]) : (tensor<1x1xi32>, tensor<1x2xi32>) -> tensor<1x2xi1>
+// CHECK:           return %[[VAL_2]] : tensor<1x2xi1>
+// CHECK:         }
+func.func @broadcast_less(%arg0: tensor<1x1xi32>, %arg1: tensor<1x2xi32>) -> tensor<1x2xi1> {
+  %0 = "mhlo.broadcast_in_dim"(%arg0) {broadcast_dimensions = dense<[0, 1]> : tensor<2xi64>} : (tensor<1x1xi32>) -> tensor<1x2xi32>
+  %1 = "mhlo.compare"(%0, %arg1) {comparison_direction = #mhlo<comparison_direction LT>} : (tensor<1x2xi32>, tensor<1x2xi32>) -> tensor<1x2xi1>
+  func.return %1 : tensor<1x2xi1>
+}
+
+// CHECK-LABEL:   func @broadcast_less_chlo(
+// CHECK-SAME:                              %[[VAL_0:.*]]: tensor<1xi32>,
+// CHECK-SAME:                              %[[VAL_1:.*]]: tensor<1x2xi32>) -> tensor<1x2xi1> {
 // CHECK:           %[[VAL_2:.*]] = "tf.Less"(%[[VAL_0]], %[[VAL_1]]) : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi1>
 // CHECK:           return %[[VAL_2]] : tensor<1x2xi1>
 // CHECK:         }
-func.func @broadcast_less(%arg0: tensor<1xi32>, %arg1: tensor<1x2xi32>) -> tensor<1x2xi1> {
+func.func @broadcast_less_chlo(%arg0: tensor<1xi32>, %arg1: tensor<1x2xi32>) -> tensor<1x2xi1> {
   %0 = "chlo.broadcast_compare"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>, comparison_direction = #chlo<comparison_direction LT>} : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi1>
   func.return %0 : tensor<1x2xi1>
 }
@@ -810,12 +846,24 @@ func.func @less_equal(%arg0: tensor<2xi32>, %arg1: tensor<2xi32>) -> tensor<2xi1
 }
 
 // CHECK-LABEL:   func @broadcast_less_equal(
-// CHECK-SAME:                               %[[VAL_0:.*]]: tensor<1xi32>,
+// CHECK-SAME:                               %[[VAL_0:.*]]: tensor<1x1xi32>,
 // CHECK-SAME:                               %[[VAL_1:.*]]: tensor<1x2xi32>) -> tensor<1x2xi1> {
+// CHECK:           %[[VAL_2:.*]] = "tf.LessEqual"(%[[VAL_0]], %[[VAL_1]]) : (tensor<1x1xi32>, tensor<1x2xi32>) -> tensor<1x2xi1>
+// CHECK:           return %[[VAL_2]] : tensor<1x2xi1>
+// CHECK:         }
+func.func @broadcast_less_equal(%arg0: tensor<1x1xi32>, %arg1: tensor<1x2xi32>) -> tensor<1x2xi1> {
+  %0 = "mhlo.broadcast_in_dim"(%arg0) {broadcast_dimensions = dense<[0, 1]> : tensor<2xi64>} : (tensor<1x1xi32>) -> tensor<1x2xi32>
+  %1 = "mhlo.compare"(%0, %arg1) {comparison_direction = #mhlo<comparison_direction LE>} : (tensor<1x2xi32>, tensor<1x2xi32>) -> tensor<1x2xi1>
+  func.return %1 : tensor<1x2xi1>
+}
+
+// CHECK-LABEL:   func @broadcast_less_equal_chlo(
+// CHECK-SAME:                                    %[[VAL_0:.*]]: tensor<1xi32>,
+// CHECK-SAME:                                    %[[VAL_1:.*]]: tensor<1x2xi32>) -> tensor<1x2xi1> {
 // CHECK:           %[[VAL_2:.*]] = "tf.LessEqual"(%[[VAL_0]], %[[VAL_1]]) : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi1>
 // CHECK:           return %[[VAL_2]] : tensor<1x2xi1>
 // CHECK:         }
-func.func @broadcast_less_equal(%arg0: tensor<1xi32>, %arg1: tensor<1x2xi32>) -> tensor<1x2xi1> {
+func.func @broadcast_less_equal_chlo(%arg0: tensor<1xi32>, %arg1: tensor<1x2xi32>) -> tensor<1x2xi1> {
   %0 = "chlo.broadcast_compare"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>, comparison_direction = #chlo<comparison_direction LE>} : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi1>
   func.return %0 : tensor<1x2xi1>
 }
diff --git a/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_patterns.td b/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_patterns.td
index 307f3515c68aff..6c29556a9799df 100644
--- a/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_patterns.td
+++ b/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_patterns.td
@@ -226,8 +226,9 @@ def : Pat<(MHLO_ClampOp $min, $arg, $max),
           (TF_MaximumOp (TF_MinimumOp $arg, $max), $min)>;
 def : Pat<(MHLO_SelectOp $cond, $t, $e), (TF_SelectOp $cond, $t, $e)>;
 def : Pat<(MHLO_SelectOp (MHLO_BroadcastInDimOp:$output
-                                  $bcast_cond,
-                                  $broadcast_dimensions), $t, $e),
+                           $bcast_cond,
+                           $broadcast_dimensions),
+                          $t, $e),
           (TF_SelectV2Op $bcast_cond, $t, $e),
           [(IsTFStyleBroadcast $broadcast_dimensions, $output)]>;
 def : Pat<(MHLO_SelectOp $cond, $t,
@@ -317,6 +318,22 @@ foreach p = [[TF_GreaterEqualOp, MHLO_ComparisonDirectionValue<"GE">],
              [TF_LessOp, MHLO_ComparisonDirectionValue<"LT">]] in {
   def : Pat<(MHLO_CompareOp $l, $r, p[1], IsMhloTFCompareType:$type),
             (p[0] $l, $r)>;
+  def : Pat<(MHLO_CompareOp (MHLO_BroadcastInDimOp:$output
+                              $bcast_operand,
+                              $broadcast_dimensions),
+                            $r,
+                            p[1],
+                            IsMhloTFCompareType:$type),
+            (p[0] $bcast_operand, $r),
+            [(IsTFStyleBroadcast $broadcast_dimensions, $output)]>;
+  def : Pat<(MHLO_CompareOp $l,
+                            (MHLO_BroadcastInDimOp:$output
+                              $bcast_operand,
+                              $broadcast_dimensions),
+                            p[1],
+                            IsMhloTFCompareType:$type),
+            (p[0] $l, $bcast_operand),
+            [(IsTFStyleBroadcast $broadcast_dimensions, $output)]>;
 }
 
 def ConvertDotOp : NativeCodeCall<"ConvertDotOp($_builder, "

From 6033f90d201e705a08fdf7bbdeabf974730c457a Mon Sep 17 00:00:00 2001
From: Eugene Zhulenev <ezhulenev@google.com>
Date: Wed, 27 Sep 2023 14:35:01 -0700
Subject: [PATCH 345/567] [stream_executor] Add initial test for CUDA command
 buffer

https://github.com/openxla/xla/issues/5857

PiperOrigin-RevId: 568963141
---
 third_party/xla/xla/stream_executor/BUILD     |  4 +
 .../xla/xla/stream_executor/command_buffer.cc | 26 ++++++-
 .../xla/xla/stream_executor/command_buffer.h  | 51 ++++++++++++-
 .../xla/xla/stream_executor/cuda/BUILD        | 25 +++++-
 .../cuda/cuda_command_buffer_test.cc          | 67 ++++++++++++++++
 .../stream_executor/cuda/cuda_kernel_test.cc  | 59 ++------------
 .../stream_executor/cuda/cuda_test_kernels.h  | 76 +++++++++++++++++++
 third_party/xla/xla/stream_executor/gpu/BUILD |  1 +
 .../stream_executor/gpu/gpu_command_buffer.cc | 20 ++++-
 .../stream_executor/gpu/gpu_command_buffer.h  |  2 +
 .../stream_executor_internal.h                | 10 +++
 11 files changed, 278 insertions(+), 63 deletions(-)
 create mode 100644 third_party/xla/xla/stream_executor/cuda/cuda_command_buffer_test.cc
 create mode 100644 third_party/xla/xla/stream_executor/cuda/cuda_test_kernels.h

diff --git a/third_party/xla/xla/stream_executor/BUILD b/third_party/xla/xla/stream_executor/BUILD
index e92cdaf32f09c2..c8afc42b51a74a 100644
--- a/third_party/xla/xla/stream_executor/BUILD
+++ b/third_party/xla/xla/stream_executor/BUILD
@@ -217,6 +217,7 @@ cc_library(
         "//xla/stream_executor/platform",
         "@com_google_absl//absl/functional:any_invocable",
         "@com_google_absl//absl/status",
+        "@local_tsl//tsl/platform:errors",
         "@local_tsl//tsl/platform:status",
         "@local_tsl//tsl/platform:statusor",
     ],
@@ -253,9 +254,12 @@ cc_library(
     hdrs = ["command_buffer.h"],
     visibility = ["//visibility:public"],
     deps = [
+        ":stream_executor_headers",
         ":stream_executor_internal",
         "//xla/stream_executor/platform",
         "@com_google_absl//absl/status:statusor",
+        "@local_tsl//tsl/platform:errors",
+        "@local_tsl//tsl/platform:status",
         "@local_tsl//tsl/platform:statusor",
     ],
 )
diff --git a/third_party/xla/xla/stream_executor/command_buffer.cc b/third_party/xla/xla/stream_executor/command_buffer.cc
index 861eb801b7457b..9438d8c798b51e 100644
--- a/third_party/xla/xla/stream_executor/command_buffer.cc
+++ b/third_party/xla/xla/stream_executor/command_buffer.cc
@@ -15,19 +15,24 @@ limitations under the License.
 
 #include "xla/stream_executor/command_buffer.h"
 
+#include <cstdint>
 #include <memory>
 #include <utility>
 
+#include "xla/stream_executor/kernel.h"
+#include "xla/stream_executor/launch_dim.h"
+#include "xla/stream_executor/stream_executor.h"
 #include "xla/stream_executor/stream_executor_internal.h"
+#include "tsl/platform/status.h"
 #include "tsl/platform/statusor.h"
 
 namespace stream_executor {
 
 /*static*/ tsl::StatusOr<CommandBuffer> CommandBuffer::Create(
     StreamExecutor* executor) {
-  // TODO(ezhulenev): Construct command buffer from platform-specific command
-  // buffer implementation. It requires cleaning up build files first.
-  std::unique_ptr<internal::CommandBufferInterface> command_buffer = nullptr;
+  TF_ASSIGN_OR_RETURN(
+      std::unique_ptr<internal::CommandBufferInterface> command_buffer,
+      executor->implementation()->GetCommandBufferImplementation());
   return tsl::StatusOr<CommandBuffer>(std::move(command_buffer));
 }
 
@@ -35,4 +40,19 @@ CommandBuffer::CommandBuffer(
     std::unique_ptr<internal::CommandBufferInterface> implementation)
     : implementation_(std::move(implementation)) {}
 
+tsl::Status CommandBuffer::Launch(const ThreadDim& threads,
+                                  const BlockDim& blocks,
+                                  const KernelBase& kernel,
+                                  const KernelArgsArrayBase& args) {
+  return implementation_->Launch(threads, blocks, kernel, args);
+}
+
+tsl::Status CommandBuffer::MemcpyDeviceToDevice(DeviceMemoryBase* dst,
+                                                const DeviceMemoryBase& src,
+                                                uint64_t size) {
+  return implementation_->MemcpyDeviceToDevice(dst, src, size);
+}
+
+tsl::Status CommandBuffer::Finalize() { return implementation_->Finalize(); }
+
 }  // namespace stream_executor
diff --git a/third_party/xla/xla/stream_executor/command_buffer.h b/third_party/xla/xla/stream_executor/command_buffer.h
index a22ddb501732fd..dff5e3705ffc3c 100644
--- a/third_party/xla/xla/stream_executor/command_buffer.h
+++ b/third_party/xla/xla/stream_executor/command_buffer.h
@@ -16,9 +16,15 @@ limitations under the License.
 #ifndef XLA_STREAM_EXECUTOR_COMMAND_BUFFER_H_
 #define XLA_STREAM_EXECUTOR_COMMAND_BUFFER_H_
 
+#include <cstdint>
 #include <memory>
+#include <tuple>
 
+#include "xla/stream_executor/kernel.h"
+#include "xla/stream_executor/launch_dim.h"
 #include "xla/stream_executor/platform/port.h"
+#include "tsl/platform/errors.h"
+#include "tsl/platform/status.h"
 #include "tsl/platform/statusor.h"
 
 namespace stream_executor {
@@ -29,6 +35,10 @@ namespace internal {
 class CommandBufferInterface;
 }
 
+//===----------------------------------------------------------------------===//
+// CommandBuffer
+//===----------------------------------------------------------------------===//
+
 // Command buffer represent a "bundle of work items" for StreamExecutor device
 // that can be submitted with one API call, e.g. command buffer might have
 // multiple device kernels and synchronization barriers between them. Command
@@ -40,11 +50,29 @@ class CommandBuffer {
   explicit CommandBuffer(
       std::unique_ptr<internal::CommandBufferInterface> implementation);
 
+  CommandBuffer(CommandBuffer&&) = default;
+  CommandBuffer& operator=(CommandBuffer&&) = default;
+
   static tsl::StatusOr<CommandBuffer> Create(StreamExecutor* executor);
 
-  internal::CommandBufferInterface* operator->() {
-    return implementation_.get();
-  }
+  // Adds a kernel launch command to the command buffer.
+  tsl::Status Launch(const ThreadDim& threads, const BlockDim& blocks,
+                     const KernelBase& kernel, const KernelArgsArrayBase& args);
+
+  // Adds a device-to-device memory copy to the command buffer.
+  tsl::Status MemcpyDeviceToDevice(DeviceMemoryBase* dst,
+                                   const DeviceMemoryBase& src, uint64_t size);
+
+  // Finalizes command buffer and makes it executable. Once command buffer is
+  // finalized no commands can be added to it.
+  tsl::Status Finalize();
+
+  // Type-safe wrapper for launching typed kernels. Notice that the order of
+  // arguments is different to disambiguate from the regular launch API.
+  template <typename... Params, typename... Args>
+  tsl::Status Launch(const TypedKernel<Params...>& kernel,
+                     const ThreadDim& threads, const BlockDim& blocks,
+                     Args... args);
 
  private:
   std::unique_ptr<internal::CommandBufferInterface> implementation_;
@@ -52,6 +80,23 @@ class CommandBuffer {
   SE_DISALLOW_COPY_AND_ASSIGN(CommandBuffer);
 };
 
+//===----------------------------------------------------------------------===//
+// CommandBuffer templates implementation below
+//===----------------------------------------------------------------------===//
+
+template <typename... Params, typename... Args>
+inline tsl::Status CommandBuffer::Launch(const TypedKernel<Params...>& kernel,
+                                         const ThreadDim& threads,
+                                         const BlockDim& blocks, Args... args) {
+  KernelInvocationChecker<std::tuple<Params...>,
+                          std::tuple<Args...>>::CheckAllStaticAssert();
+
+  KernelArgsArray<sizeof...(args)> kernel_args;
+  kernel.PackParams(&kernel_args, args...);
+  TF_RETURN_IF_ERROR(Launch(threads, blocks, kernel, kernel_args));
+  return tsl::OkStatus();
+}
+
 }  // namespace stream_executor
 
 #endif  // XLA_STREAM_EXECUTOR_COMMAND_BUFFER_H_
diff --git a/third_party/xla/xla/stream_executor/cuda/BUILD b/third_party/xla/xla/stream_executor/cuda/BUILD
index fc13f11efd413f..aa0567dc76856d 100644
--- a/third_party/xla/xla/stream_executor/cuda/BUILD
+++ b/third_party/xla/xla/stream_executor/cuda/BUILD
@@ -384,12 +384,35 @@ cc_library(
     ]),
 )
 
+cc_library(
+    name = "cuda_test_kernels",
+    testonly = 1,
+    hdrs = ["cuda_test_kernels.h"],
+    visibility = ["//visibility:public"],
+)
+
 xla_test(
     name = "cuda_kernel_test",
     srcs = ["cuda_kernel_test.cc"],
     backends = ["gpu"],
     deps = [
-        ":cuda_kernel",
+        ":cuda_platform",
+        ":cuda_test_kernels",
+        "//xla/stream_executor",
+        "//xla/stream_executor:multi_platform_manager",
+        "//xla/stream_executor:platform",
+        "@local_tsl//tsl/platform:test",
+        "@local_tsl//tsl/platform:test_main",
+    ],
+)
+
+xla_test(
+    name = "cuda_command_buffer_test",
+    srcs = ["cuda_command_buffer_test.cc"],
+    backends = ["gpu"],
+    deps = [
+        ":cuda_platform",
+        ":cuda_test_kernels",
         "//xla/stream_executor",
         "//xla/stream_executor:multi_platform_manager",
         "//xla/stream_executor:platform",
diff --git a/third_party/xla/xla/stream_executor/cuda/cuda_command_buffer_test.cc b/third_party/xla/xla/stream_executor/cuda/cuda_command_buffer_test.cc
new file mode 100644
index 00000000000000..0f0b6dd05b3406
--- /dev/null
+++ b/third_party/xla/xla/stream_executor/cuda/cuda_command_buffer_test.cc
@@ -0,0 +1,67 @@
+/* Copyright 2023 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <cstdint>
+
+#include <gtest/gtest.h>
+#include "xla/stream_executor/command_buffer.h"
+#include "xla/stream_executor/cuda/cuda_test_kernels.h"
+#include "xla/stream_executor/launch_dim.h"
+#include "xla/stream_executor/multi_platform_manager.h"
+#include "xla/stream_executor/platform.h"
+#include "xla/stream_executor/stream.h"
+#include "xla/stream_executor/stream_executor.h"
+#include "tsl/platform/test.h"
+
+namespace stream_executor::cuda {
+
+TEST(CudaCommandBufferTest, LaunchSingleKernel) {
+  using AddI32Kernel = TypedKernel<DeviceMemory<int32_t>, DeviceMemory<int32_t>,
+                                   DeviceMemory<int32_t>>;
+
+  Platform* platform = MultiPlatformManager::PlatformWithName("CUDA").value();
+  StreamExecutor* executor = platform->ExecutorForDevice(0).value();
+
+  Stream stream(executor);
+  stream.Init();
+  ASSERT_TRUE(stream.ok());
+
+  MultiKernelLoaderSpec spec(/*arity=*/3);
+  spec.AddCudaPtxInMemory(internal::kAddI32Kernel, "add");
+
+  AddI32Kernel add(executor);
+  ASSERT_TRUE(executor->GetKernel(spec, &add).ok());
+
+  int64_t length = 4;
+  int64_t byte_length = sizeof(int32_t) * length;
+
+  // Prepare arguments: a=1, b=2, c=0
+  DeviceMemory<int32_t> a = executor->AllocateArray<int32_t>(length, 0);
+  DeviceMemory<int32_t> b = executor->AllocateArray<int32_t>(length, 0);
+  DeviceMemory<int32_t> c = executor->AllocateArray<int32_t>(length, 0);
+
+  stream.ThenMemset32(&a, 1, byte_length);
+  stream.ThenMemset32(&b, 2, byte_length);
+  stream.ThenMemZero(&c, byte_length);
+
+  // Create a command buffer with a single kernel launch.
+  auto cmd_buffer = CommandBuffer::Create(executor).value();
+  ASSERT_TRUE(cmd_buffer.Launch(add, ThreadDim(), BlockDim(4), a, b, c).ok());
+  ASSERT_TRUE(cmd_buffer.Finalize().ok());
+
+  // TODO(ezhulenev): Execute command buffer and check results.
+}
+
+}  // namespace stream_executor::cuda
diff --git a/third_party/xla/xla/stream_executor/cuda/cuda_kernel_test.cc b/third_party/xla/xla/stream_executor/cuda/cuda_kernel_test.cc
index 9b7f99ed8f34b0..4808389cde9d5b 100644
--- a/third_party/xla/xla/stream_executor/cuda/cuda_kernel_test.cc
+++ b/third_party/xla/xla/stream_executor/cuda/cuda_kernel_test.cc
@@ -14,10 +14,10 @@ limitations under the License.
 ==============================================================================*/
 
 #include <cstdint>
-#include <string_view>
 #include <vector>
 
 #include <gtest/gtest.h>
+#include "xla/stream_executor/cuda/cuda_test_kernels.h"
 #include "xla/stream_executor/launch_dim.h"
 #include "xla/stream_executor/multi_platform_manager.h"
 #include "xla/stream_executor/platform.h"
@@ -27,54 +27,6 @@ limitations under the License.
 
 namespace stream_executor::cuda {
 
-// PTX kernel compiled from:
-//
-//  __global__ void add(int* a, int* b, int* c) {
-//    int index = threadIdx.x + blockIdx.x * blockDim.x;
-//    c[index] = a[index] + b[index];
-//  }
-//
-// Easiest way to get PTX from C++ is to use https://godbolt.org.
-static std::string_view kAddI32Kernel = R"(
-.version 8.2
-.target sm_50
-.address_size 64
-
-.visible .entry add(
-        .param .u64 add_param_0,
-        .param .u64 add_param_1,
-        .param .u64 add_param_2
-)
-{
-        .reg .b32       %r<8>;
-        .reg .b64       %rd<11>;
-        .loc    1 1 0
-
-        ld.param.u64    %rd1, [add_param_0];
-        ld.param.u64    %rd2, [add_param_1];
-        ld.param.u64    %rd3, [add_param_2];
-        .loc    1 3 3
-        cvta.to.global.u64      %rd4, %rd3;
-        cvta.to.global.u64      %rd5, %rd2;
-        cvta.to.global.u64      %rd6, %rd1;
-        mov.u32         %r1, %tid.x;
-        mov.u32         %r2, %ctaid.x;
-        mov.u32         %r3, %ntid.x;
-        mad.lo.s32      %r4, %r2, %r3, %r1;
-        .loc    1 4 3
-        mul.wide.s32    %rd7, %r4, 4;
-        add.s64         %rd8, %rd6, %rd7;
-        ld.global.u32   %r5, [%rd8];
-        add.s64         %rd9, %rd5, %rd7;
-        ld.global.u32   %r6, [%rd9];
-        add.s32         %r7, %r6, %r5;
-        add.s64         %rd10, %rd4, %rd7;
-        st.global.u32   [%rd10], %r7;
-        .loc    1 5 1
-        ret;
-
-})";
-
 TEST(CudaKernelTest, Add) {
   using AddI32Kernel = TypedKernel<DeviceMemory<int32_t>, DeviceMemory<int32_t>,
                                    DeviceMemory<int32_t>>;
@@ -87,10 +39,10 @@ TEST(CudaKernelTest, Add) {
   ASSERT_TRUE(stream.ok());
 
   MultiKernelLoaderSpec spec(/*arity=*/3);
-  spec.AddCudaPtxInMemory(kAddI32Kernel, "add");
+  spec.AddCudaPtxInMemory(internal::kAddI32Kernel, "add");
 
-  AddI32Kernel add_kernel(executor);
-  ASSERT_TRUE(executor->GetKernel(spec, &add_kernel).ok());
+  AddI32Kernel add(executor);
+  ASSERT_TRUE(executor->GetKernel(spec, &add).ok());
 
   int64_t length = 4;
   int64_t byte_length = sizeof(int32_t) * length;
@@ -105,8 +57,7 @@ TEST(CudaKernelTest, Add) {
   stream.ThenMemZero(&c, byte_length);
 
   // Launch kernel.
-  auto st = stream.ThenLaunch(ThreadDim(), BlockDim(4), add_kernel, a, b, c);
-  ASSERT_TRUE(st.ok());
+  ASSERT_TRUE(stream.ThenLaunch(ThreadDim(), BlockDim(4), add, a, b, c).ok());
 
   // Copy data back to host.
   std::vector<int32_t> dst(4, 42);
diff --git a/third_party/xla/xla/stream_executor/cuda/cuda_test_kernels.h b/third_party/xla/xla/stream_executor/cuda/cuda_test_kernels.h
new file mode 100644
index 00000000000000..25102a45038ba6
--- /dev/null
+++ b/third_party/xla/xla/stream_executor/cuda/cuda_test_kernels.h
@@ -0,0 +1,76 @@
+/* Copyright 2023 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef XLA_STREAM_EXECUTOR_CUDA_CUDA_TEST_KERNELS_H_
+#define XLA_STREAM_EXECUTOR_CUDA_CUDA_TEST_KERNELS_H_
+
+#include <string_view>
+
+namespace stream_executor::cuda::internal {
+
+// This is a collection of CUDA kernels compiled to PTX for use in various CUDA
+// tests in this package.
+
+// PTX kernel compiled from:
+//
+//  __global__ void add(int* a, int* b, int* c) {
+//    int index = threadIdx.x + blockIdx.x * blockDim.x;
+//    c[index] = a[index] + b[index];
+//  }
+//
+// Easiest way to get PTX from C++ is to use https://godbolt.org.
+inline constexpr std::string_view kAddI32Kernel = R"(
+.version 8.2
+.target sm_50
+.address_size 64
+
+.visible .entry add(
+        .param .u64 add_param_0,
+        .param .u64 add_param_1,
+        .param .u64 add_param_2
+)
+{
+        .reg .b32       %r<8>;
+        .reg .b64       %rd<11>;
+        .loc    1 1 0
+
+        ld.param.u64    %rd1, [add_param_0];
+        ld.param.u64    %rd2, [add_param_1];
+        ld.param.u64    %rd3, [add_param_2];
+        .loc    1 3 3
+        cvta.to.global.u64      %rd4, %rd3;
+        cvta.to.global.u64      %rd5, %rd2;
+        cvta.to.global.u64      %rd6, %rd1;
+        mov.u32         %r1, %tid.x;
+        mov.u32         %r2, %ctaid.x;
+        mov.u32         %r3, %ntid.x;
+        mad.lo.s32      %r4, %r2, %r3, %r1;
+        .loc    1 4 3
+        mul.wide.s32    %rd7, %r4, 4;
+        add.s64         %rd8, %rd6, %rd7;
+        ld.global.u32   %r5, [%rd8];
+        add.s64         %rd9, %rd5, %rd7;
+        ld.global.u32   %r6, [%rd9];
+        add.s32         %r7, %r6, %r5;
+        add.s64         %rd10, %rd4, %rd7;
+        st.global.u32   [%rd10], %r7;
+        .loc    1 5 1
+        ret;
+
+})";
+
+}  // namespace stream_executor::cuda::internal
+
+#endif  // XLA_STREAM_EXECUTOR_CUDA_CUDA_TEST_KERNELS_H_
diff --git a/third_party/xla/xla/stream_executor/gpu/BUILD b/third_party/xla/xla/stream_executor/gpu/BUILD
index 52d4762e20b55a..545857f61094b8 100644
--- a/third_party/xla/xla/stream_executor/gpu/BUILD
+++ b/third_party/xla/xla/stream_executor/gpu/BUILD
@@ -96,6 +96,7 @@ cc_library(
         "//xla/stream_executor:stream_executor_headers",
         "@com_google_absl//absl/log",
         "@com_google_absl//absl/log:check",
+        "@local_tsl//tsl/platform:env",
         "@local_tsl//tsl/platform:errors",
         "@local_tsl//tsl/platform:status",
         "@local_tsl//tsl/platform:statusor",
diff --git a/third_party/xla/xla/stream_executor/gpu/gpu_command_buffer.cc b/third_party/xla/xla/stream_executor/gpu/gpu_command_buffer.cc
index 03c4d170bd11ed..3867d72f9f3777 100644
--- a/third_party/xla/xla/stream_executor/gpu/gpu_command_buffer.cc
+++ b/third_party/xla/xla/stream_executor/gpu/gpu_command_buffer.cc
@@ -26,6 +26,7 @@ limitations under the License.
 #include "xla/stream_executor/gpu/gpu_types.h"
 #include "xla/stream_executor/kernel.h"
 #include "xla/stream_executor/launch_dim.h"
+#include "tsl/platform/env.h"
 #include "tsl/platform/errors.h"
 #include "tsl/platform/status.h"
 
@@ -65,8 +66,9 @@ GpuCommandBuffer::GpuCommandBuffer(GpuExecutor* parent, GpuGraphHandle graph)
 
 GpuCommandBuffer::~GpuCommandBuffer() {
   if (exec_ != nullptr) {
-    VLOG(5) << "Destroy GPU command buffer graph exec. "
-            << "Remaining alive instances: " << NotifyExecDestroyed();
+    VLOG(5) << "Destroy GPU command buffer executable graph "
+            << "(remaining alive executable graphs: " << NotifyExecDestroyed()
+            << ")";
     auto st = GpuDriver::DestroyGraphExec(exec_);
     CHECK(st.ok()) << "Failed to destroy GPU graph exec: " << st.message();
   }
@@ -109,4 +111,18 @@ tsl::Status GpuCommandBuffer::MemcpyDeviceToDevice(DeviceMemoryBase* dst,
   return tsl::OkStatus();
 }
 
+tsl::Status GpuCommandBuffer::Finalize() {
+  uint64_t start_nanos = tsl::Env::Default()->NowNanos();
+
+  GpuDriver::GraphInstantiateFlags flags;
+  TF_RETURN_IF_ERROR(GpuDriver::GraphInstantiate(&exec_, graph_, flags));
+  uint64_t end_nanos = tsl::Env::Default()->NowNanos();
+
+  VLOG(5) << "Instantiated executable graph #" << NotifyExecCreated() << " in "
+          << (end_nanos - start_nanos) / 1000
+          << " μs (alive executable graphs: " << AliveExecs() << ")";
+
+  return tsl::OkStatus();
+}
+
 }  // namespace stream_executor::gpu
diff --git a/third_party/xla/xla/stream_executor/gpu/gpu_command_buffer.h b/third_party/xla/xla/stream_executor/gpu/gpu_command_buffer.h
index b744a02e90be1f..95c87e25f41a8e 100644
--- a/third_party/xla/xla/stream_executor/gpu/gpu_command_buffer.h
+++ b/third_party/xla/xla/stream_executor/gpu/gpu_command_buffer.h
@@ -43,6 +43,8 @@ class GpuCommandBuffer : public internal::CommandBufferInterface {
                                    const DeviceMemoryBase& src,
                                    uint64_t size) override;
 
+  tsl::Status Finalize() override;
+
   // We track the total number of allocated and alive executable graphs in the
   // process to track the command buffers resource usage. Executable graph
   // allocates resources on a GPU devices (rule of thumb is ~8kb per node), so
diff --git a/third_party/xla/xla/stream_executor/stream_executor_internal.h b/third_party/xla/xla/stream_executor/stream_executor_internal.h
index b57d683a82301d..8d1867dab70724 100644
--- a/third_party/xla/xla/stream_executor/stream_executor_internal.h
+++ b/third_party/xla/xla/stream_executor/stream_executor_internal.h
@@ -45,6 +45,7 @@ limitations under the License.
 #include "xla/stream_executor/platform/port.h"
 #include "xla/stream_executor/plugin_registry.h"
 #include "xla/stream_executor/trace_listener.h"
+#include "tsl/platform/errors.h"
 #include "tsl/platform/status.h"
 #include "tsl/platform/statusor.h"
 
@@ -129,6 +130,11 @@ class KernelInterface {
 // can execute concurrently, and it's up to the caller to insert barriers to
 // guarantee correctness. Consider adding finer grained synchronization
 // mechanism between different commands.
+//
+// TODO(ezhulenev): Currently command buffers do not support updates, and once
+// finalized can be executed as recorded. We need to support cheap command
+// buffer updates that in GPU backend will be mapped to CUDA/HIP graph node
+// updates.
 class CommandBufferInterface {
  public:
   CommandBufferInterface() = default;
@@ -144,6 +150,10 @@ class CommandBufferInterface {
                                            const DeviceMemoryBase& src,
                                            uint64_t size) = 0;
 
+  // Finalizes command buffer and makes it executable. Once command buffer is
+  // finalized no commands can be added to it.
+  virtual tsl::Status Finalize() = 0;
+
  private:
   SE_DISALLOW_COPY_AND_ASSIGN(CommandBufferInterface);
 };

From 48dc0584aa026822dcac00073b76c495e4954b16 Mon Sep 17 00:00:00 2001
From: Arian Arfaian <aarfaian@google.com>
Date: Wed, 27 Sep 2023 14:51:51 -0700
Subject: [PATCH 346/567] Link tflite_jni with -Wl,--undefined-version

The version script has some symbols that are undefined for some configurations,
which will lead to a linker error with ld.lld's new --no-undefined-version
default.

```
ld: error: version script assignment of 'VERS_1.0' to symbol 'JNI_OnLoad' failed: symbol not defined
ld: error: version script assignment of 'VERS_1.0' to symbol 'JNI_OnUnload' failed: symbol not defined
```
PiperOrigin-RevId: 568967749
---
 tensorflow/lite/build_def.bzl | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/tensorflow/lite/build_def.bzl b/tensorflow/lite/build_def.bzl
index 398eebe7526d05..6a9f0a812feea4 100644
--- a/tensorflow/lite/build_def.bzl
+++ b/tensorflow/lite/build_def.bzl
@@ -213,9 +213,7 @@ def tflite_jni_binary(
         clean_dep("//tensorflow:windows"): [],
         "//conditions:default": [
             "-Wl,--version-script,$(location {})".format(linkscript),
-            # copybara:uncomment_begin(google-only)
-            # "-Wl,--undefined-version",
-            # copybara:uncomment_end
+            "-Wl,--undefined-version",
             "-Wl,-soname," + name,
         ],
     })

From 9704448afcf256a9908cd34405a826483e3aa81f Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 27 Sep 2023 14:53:09 -0700
Subject: [PATCH 347/567] [XLA:GPU] Allows fusion of binary element wise ops
 when exactly one operand is a (shared or unshared) splat constant.
 Previously, only splat constant operands with a single user would be
 accepted.

PiperOrigin-RevId: 568968119
---
 .../ir_emitter_triton_parametrized_test.cc    | 52 +++++++++++++++++++
 .../service/gpu/softmax_rewriter_triton.cc    | 14 +----
 .../gpu/softmax_rewriter_triton_test.cc       | 46 +++++++++-------
 3 files changed, 79 insertions(+), 33 deletions(-)

diff --git a/third_party/xla/xla/service/gpu/ir_emitter_triton_parametrized_test.cc b/third_party/xla/xla/service/gpu/ir_emitter_triton_parametrized_test.cc
index 9be7fdbede4e83..929b34ee656daf 100644
--- a/third_party/xla/xla/service/gpu/ir_emitter_triton_parametrized_test.cc
+++ b/third_party/xla/xla/service/gpu/ir_emitter_triton_parametrized_test.cc
@@ -2295,6 +2295,58 @@ ENTRY main {
                             ErrorSpec(/*aabs=*/tolerance, /*arel=*/tolerance)));
 }
 
+TEST_P(
+    TritonSoftmaxTest,
+    CanFuseAndEmitBinaryElementwiseOperationWhereOneOperandIsASharedSplatProducerIntoDiamond) {  // NOLINT(whitespace/line_length)
+  PrimitiveType data_type = GetParam();
+
+  if (data_type == BF16 && !GetCudaComputeCapability().IsAtLeast(
+                               se::CudaComputeCapability::AMPERE)) {
+    GTEST_SKIP() << R"(No BF16 before Ampere. Pre-Ampere BF16 behavior is tested
+    in CanFuseAndEmitFirstSoftmaxDiamond, and in SoftmaxRewriterTritonTest.)";
+  }
+
+  const std::string hlo_text_template = R"(
+HloModule nonfusible_diamond
+max_computation {
+  arg_0.1 = $0[] parameter(0)
+  arg_1.1 = $0[] parameter(1)
+  ROOT max = $0[] maximum(arg_0.1, arg_1.1)
+}
+ENTRY main {
+  param_0 = $0[127,125]{1,0} parameter(0)
+  constant_2 = $0[] constant(0.333333343)
+  broadcast_splat = $0[127,125]{1,0} broadcast(constant_2), dimensions={}
+  param_1 = $0[127,125]{1,0} parameter(1)
+  multiply_splat = $0[127,125]{1,0} multiply(broadcast_splat, param_1)
+  multiply = $0[127,125]{1,0} multiply(param_0, broadcast_splat)
+  constant_neg_inf = $0[] constant(-inf)
+  reduce = $0[127]{0} reduce(multiply, constant_neg_inf), dimensions={1}, to_apply=max_computation
+  broadcast = $0[127,125]{1,0} broadcast(reduce), dimensions={0}
+  ROOT subtract = $0[127,125]{1,0} subtract(param_0, broadcast)
+}
+)";
+  const std::string hlo_text = absl::Substitute(
+      hlo_text_template, primitive_util::LowercasePrimitiveTypeName(data_type));
+
+  const std::string hlo_ref_template = R"(
+; CHECK:    ENTRY
+; CHECK:      %[[P0:.*]] = $0[127,125]{1,0} parameter(0)
+; CHECK:      ROOT
+; CHECK-SAME: fusion(%[[P0]])
+; CHECK-SAME:   kind=kCustom
+; CHECK-SAME:   __triton_softmax
+)";
+  const std::string hlo_ref = absl::Substitute(
+      hlo_ref_template, primitive_util::LowercasePrimitiveTypeName(data_type));
+
+  MatchOptimizedHlo(hlo_text, hlo_ref);
+
+  float tolerance = 0.0;
+  EXPECT_TRUE(RunAndCompare(hlo_text,
+                            ErrorSpec(/*aabs=*/tolerance, /*arel=*/tolerance)));
+}
+
 INSTANTIATE_TEST_SUITE_P(TritonSoftmaxTestSuite, TritonSoftmaxTest,
                          ::testing::Values(F32, F16, BF16));
 
diff --git a/third_party/xla/xla/service/gpu/softmax_rewriter_triton.cc b/third_party/xla/xla/service/gpu/softmax_rewriter_triton.cc
index ce3dcacebc23e2..81206454385da3 100644
--- a/third_party/xla/xla/service/gpu/softmax_rewriter_triton.cc
+++ b/third_party/xla/xla/service/gpu/softmax_rewriter_triton.cc
@@ -166,7 +166,7 @@ bool IsTriviallyFusible(HloInstruction* instr, const GpuVersion& gpu_version,
   }
 
   // Elementwise binary ops are trivially fusible if the operands are the same,
-  // or if exactly one of the operands is a splat constant with a single user.
+  // or if exactly one of the operands is a splat constant.
   if (instr->IsElementwiseBinary()) {
     const HloInstruction* operand_0 = instr->operand(0);
     const HloInstruction* operand_1 = instr->operand(1);
@@ -177,18 +177,6 @@ bool IsTriviallyFusible(HloInstruction* instr, const GpuVersion& gpu_version,
       return IsTritonSupportedInstruction(instr, gpu_version);
     }
 
-    // If either operand is a splat constant with multiple users, we should not
-    // fuse.
-    bool operand_0_is_shared_splat_constant =
-        IsBroadcastOfScalarConstant(*operand_0) && !HasOneUse(operand_0);
-    bool operand_1_is_shared_splat_constant =
-        IsBroadcastOfScalarConstant(*operand_1) && !HasOneUse(operand_1);
-
-    if (operand_0_is_shared_splat_constant ||
-        operand_1_is_shared_splat_constant) {
-      return false;
-    }
-
     // For simplicity we only fuse elementwise binary ops with splat operands
     // if they contain one non-splat operand.
     if (IsBroadcastOfScalarConstant(*operand_0) ^
diff --git a/third_party/xla/xla/service/gpu/softmax_rewriter_triton_test.cc b/third_party/xla/xla/service/gpu/softmax_rewriter_triton_test.cc
index c4c506064f983d..31598ff2028092 100644
--- a/third_party/xla/xla/service/gpu/softmax_rewriter_triton_test.cc
+++ b/third_party/xla/xla/service/gpu/softmax_rewriter_triton_test.cc
@@ -1595,7 +1595,6 @@ ENTRY main {
 
 TEST_P(
     SoftmaxRewriterTritonTest,
-
     CanFuseBinaryElementwiseWhereTheFirstOperandIsASplatConstantWithinDiamond) {
   PrimitiveType data_type = GetParam();
   const std::string hlo_string_template = R"(
@@ -1663,33 +1662,40 @@ ENTRY main {
               GmockMatch(m::Fusion(m::Parameter())));
 }
 
-TEST_F(
-    SoftmaxRewriterTritonTest,
-    DoesNotFuseBinaryElementwiseOperationWhereOneOperandIsASharedSplatProducer) {  // NOLINT(whitespace/line_length)
-  const std::string hlo_string = R"(
+TEST_P(SoftmaxRewriterTritonTest,
+       CanFuseBinaryElementwiseOperationWhereOneOperandIsASharedSplatProducer) {
+  PrimitiveType data_type = GetParam();
+  const std::string hlo_string_template = R"(
 HloModule nonfusible_diamond
-add_computation {
-  arg_0.1 = f32[] parameter(0)
-  arg_1.1 = f32[] parameter(1)
-  ROOT add = f32[] add(arg_0.1, arg_1.1)
+max_computation {
+  arg_0 = $0[] parameter(0)
+  arg_1 = $0[] parameter(1)
+  ROOT max = $0[] maximum(arg_0, arg_1)
 }
 ENTRY main {
-  param_0 = f32[127,125]{1,0} parameter(0)
-  constant.2 = f32[] constant(0.333333343)
-  broadcast_splat = f32[127,125]{1,0} broadcast(constant.2), dimensions={}
-  param_1 = f32[127,125]{1,0} parameter(1)
-  multiply_splat = f32[127,125]{1,0} multiply(broadcast_splat, param_1)
-  multiply = f32[127,125]{1,0} multiply(param_0, broadcast_splat)
-  constant_neg_inf = f32[] constant(-inf)
-  reduce = f32[127]{0} reduce(multiply, constant_neg_inf), dimensions={1}, to_apply=add_computation
-  broadcast = f32[127,125]{1,0} broadcast(reduce), dimensions={0}
-  ROOT subtract = f32[127,125]{1,0} subtract(param_0, broadcast)
+  param_0 = $0[127,125]{1,0} parameter(0)
+  constant_2 = $0[] constant(0.333333343)
+  broadcast_splat = $0[127,125]{1,0} broadcast(constant_2), dimensions={}
+  param_1 = $0[127,125]{1,0} parameter(1)
+  multiply_splat = $0[127,125]{1,0} multiply(broadcast_splat, param_1)
+  multiply = $0[127,125]{1,0} multiply(param_0, broadcast_splat)
+  constant_neg_inf = $0[] constant(-inf)
+  reduce = $0[127]{0} reduce(multiply, constant_neg_inf), dimensions={1}, to_apply=max_computation
+  broadcast = $0[127,125]{1,0} broadcast(reduce), dimensions={0}
+  ROOT subtract = $0[127,125]{1,0} subtract(param_0, broadcast)
 }
 )";
+  const std::string hlo_string =
+      absl::Substitute(hlo_string_template,
+                       primitive_util::LowercasePrimitiveTypeName(data_type));
 
   auto module = ParseAndReturnVerifiedModule(hlo_string).value();
-  EXPECT_FALSE(
+  EXPECT_TRUE(
       SoftmaxRewriterTritonMatchAndRewrite(gpu_version_, module.get()).value());
+  EXPECT_TRUE(verifier().Run(module.get()).status().ok());
+  VLOG(2) << module->ToString();
+  EXPECT_THAT(module->entry_computation()->root_instruction(),
+              GmockMatch(m::Fusion(m::Parameter())));
 }
 
 TEST_F(

From 58cf4efca092f58c4c1b47e3c99bd306d7054990 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 27 Sep 2023 14:54:43 -0700
Subject: [PATCH 348/567] Implement the
 PjRtStreamExecutorBuffer::DonateWithControlDependency API for HostCallback in
 Pathways

PiperOrigin-RevId: 568968580
---
 third_party/xla/xla/pjrt/BUILD                | 13 +++-
 .../xla/pjrt/pjrt_stream_executor_client.cc   | 66 +++++++++++++++++++
 .../xla/pjrt/pjrt_stream_executor_client.h    |  3 +
 .../pjrt/pjrt_stream_executor_client_test.cc  | 47 +++++++++++++
 4 files changed, 128 insertions(+), 1 deletion(-)

diff --git a/third_party/xla/xla/pjrt/BUILD b/third_party/xla/xla/pjrt/BUILD
index 9997ed8fc94e9a..24e97a8c479e0e 100644
--- a/third_party/xla/xla/pjrt/BUILD
+++ b/third_party/xla/xla/pjrt/BUILD
@@ -448,7 +448,8 @@ cc_library(
         "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/container:flat_hash_set",
         "@com_google_absl//absl/container:inlined_vector",
-        "@com_google_absl//absl/memory",
+        "@com_google_absl//absl/status",
+        "@com_google_absl//absl/status:statusor",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/strings:str_format",
         "@com_google_absl//absl/synchronization",
@@ -459,6 +460,7 @@ cc_library(
         "@local_tsl//tsl/platform:env",
         "@local_tsl//tsl/platform:errors",
         "@local_tsl//tsl/platform:fingerprint",
+        "@local_tsl//tsl/platform:logging",
         "@local_tsl//tsl/platform:platform_port",
         "@local_tsl//tsl/platform:status",
         "@local_tsl//tsl/platform:statusor",
@@ -472,7 +474,11 @@ xla_cc_test(
     srcs = ["pjrt_stream_executor_client_test.cc"],
     deps = [
         ":pjrt_client",
+        ":pjrt_future",
         ":pjrt_stream_executor_client",
+        "//xla:literal",
+        "//xla:literal_comparison",
+        "//xla:literal_util",
         "//xla:shape_util",
         "//xla:test",
         "//xla:xla_data_proto_cc",
@@ -481,7 +487,12 @@ xla_cc_test(
         "//xla/service:cpu_plugin",
         "//xla/service:platform_util",
         "@com_google_absl//absl/functional:any_invocable",
+        "@com_google_absl//absl/synchronization",
         "@com_google_googletest//:gtest_main",
+        "@local_tsl//tsl/lib/core:status_test_util",
+        "@local_tsl//tsl/platform:status",
+        "@local_tsl//tsl/platform:statusor",
+        "@tf_runtime//:async_value",
     ],
 )
 
diff --git a/third_party/xla/xla/pjrt/pjrt_stream_executor_client.cc b/third_party/xla/xla/pjrt/pjrt_stream_executor_client.cc
index f9fa35debcb7b0..c7e097fe25764d 100644
--- a/third_party/xla/xla/pjrt/pjrt_stream_executor_client.cc
+++ b/third_party/xla/xla/pjrt/pjrt_stream_executor_client.cc
@@ -83,6 +83,8 @@ limitations under the License.
 #include "absl/container/flat_hash_map.h"
 #include "absl/container/flat_hash_set.h"
 #include "absl/container/inlined_vector.h"
+#include "absl/status/status.h"
+#include "absl/status/statusor.h"
 #include "absl/strings/match.h"
 #include "absl/strings/str_format.h"
 #include "absl/synchronization/mutex.h"
@@ -123,6 +125,7 @@ limitations under the License.
 #include "tsl/platform/env.h"
 #include "tsl/platform/errors.h"
 #include "tsl/platform/fingerprint.h"
+#include "tsl/platform/logging.h"
 #include "tsl/platform/mem.h"
 #include "tsl/platform/status.h"
 #include "tsl/platform/statusor.h"
@@ -693,6 +696,69 @@ PjRtStreamExecutorBuffer::ReleaseDeviceMemoryOwnership(
   return ref;
 }
 
+absl::StatusOr<std::unique_ptr<PjRtBuffer>>
+PjRtStreamExecutorBuffer::DonateWithControlDependency(
+    PjRtFuture<absl::Status> dependency) {
+  VLOG(1) << "PjRtStreamExecutorBuffer::DonateWithControlDependency";
+  std::unique_ptr<PjRtBuffer> new_buffer;
+
+  auto tracked_buffer =
+      GetBufferWithHold(PjRtStreamExecutorBuffer::ScopedHold::kDonation);
+
+  if (!tracked_buffer.ok()) {
+    return InvalidArgument(
+        "Invalid buffer passed to DonateWithControlDependency: %s",
+        tracked_buffer.status().ToString());
+  }
+
+  // Copy all the data in the existing tracked_buffer.
+  absl::InlinedVector<se::DeviceMemoryBase, 4> buffers(
+      tracked_buffer->device_memory().begin(),
+      tracked_buffer->device_memory().end());
+  auto original_definition_events = tracked_buffer->definition_events();
+  absl::InlinedVector<std::shared_ptr<BufferSequencingEvent>, 4>
+      definition_events;
+
+  auto definition_event_for_status =
+      std::make_shared<BufferSequencingEvent>(client()->thread_pool());
+  // definition_event_for_status must be the first one so that it blocks other
+  // actions like D2H transfer from execution before the buffer is ready.
+  definition_events.push_back(definition_event_for_status);
+  definition_events.insert(definition_events.end(),
+                           original_definition_events.begin(),
+                           original_definition_events.end());
+
+  auto new_device_buffer = std::make_shared<TrackedDeviceBuffer>(
+      tracked_buffer->allocator(), device()->local_hardware_id(),
+      std::move(buffers), std::move(definition_events),
+      /*on_delete_callback=*/std::function<void()>());
+
+  // Make the new buffer which is identical to the old, except for the new
+  // definition event.
+  new_buffer =
+      std::unique_ptr<PjRtBuffer>(std::make_unique<PjRtStreamExecutorBuffer>(
+          on_device_shape(), std::move(new_device_buffer), client(), device()));
+
+  PjRtStreamExecutorDevice* device = this->device();
+  LocalDeviceState* local_device = device->local_device_state();
+  dependency.OnReady(
+      [definition_event_for_status = std::move(definition_event_for_status),
+       local_device](absl::Status status) mutable {
+        // Record the definition event once the dependency becomes ready.
+        // NOTE(review): `status` is unused here; errors are not forwarded.
+        auto stream = local_device->BorrowStreamFromPool();
+        auto event =
+            local_device->event_pool().ThenAllocateAndRecordEvent(stream.get());
+        TF_CHECK_OK(event.status());
+        definition_event_for_status->SetSequencingEvent(
+            std::move(event).value(), stream.get());
+        local_device->ReturnStreamToPool(std::move(stream));
+      });
+
+  tracked_buffer.ConfirmDonation();
+  return new_buffer;
+}
+
 StatusOr<std::unique_ptr<PjRtBuffer>>
 PjRtStreamExecutorClient::BufferFromHostBuffer(
     const void* data, PrimitiveType type, absl::Span<int64_t const> dims,
diff --git a/third_party/xla/xla/pjrt/pjrt_stream_executor_client.h b/third_party/xla/xla/pjrt/pjrt_stream_executor_client.h
index a8225d46f4d030..14abf2e4851a41 100644
--- a/third_party/xla/xla/pjrt/pjrt_stream_executor_client.h
+++ b/third_party/xla/xla/pjrt/pjrt_stream_executor_client.h
@@ -707,6 +707,9 @@ class PjRtStreamExecutorBuffer : public PjRtBuffer {
   StatusOr<std::shared_ptr<TrackedDeviceBuffer>> Release(
       bool wait_for_operations_to_complete);
 
+  absl::StatusOr<std::unique_ptr<PjRtBuffer>> DonateWithControlDependency(
+      PjRtFuture<absl::Status> dependency) override;
+
  private:
   friend class PjRtClient;
 
diff --git a/third_party/xla/xla/pjrt/pjrt_stream_executor_client_test.cc b/third_party/xla/xla/pjrt/pjrt_stream_executor_client_test.cc
index d61154b0c01aa6..b3bcf49f6af0fe 100644
--- a/third_party/xla/xla/pjrt/pjrt_stream_executor_client_test.cc
+++ b/third_party/xla/xla/pjrt/pjrt_stream_executor_client_test.cc
@@ -22,13 +22,22 @@ limitations under the License.
 
 #include <gmock/gmock.h>
 #include "absl/functional/any_invocable.h"
+#include "absl/synchronization/mutex.h"
 #include "xla/client/client_library.h"
 #include "xla/client/xla_builder.h"
+#include "xla/literal.h"
+#include "xla/literal_comparison.h"
+#include "xla/literal_util.h"
 #include "xla/pjrt/pjrt_client.h"
+#include "xla/pjrt/pjrt_future.h"
 #include "xla/service/platform_util.h"
 #include "xla/shape_util.h"
 #include "xla/test.h"
 #include "xla/xla_data.pb.h"
+#include "tsl/lib/core/status_test_util.h"
+#include "tsl/platform/status.h"
+#include "tsl/platform/statusor.h"
+#include "tfrt/concurrency/async_value_ref.h"  // from @tf_runtime
 
 namespace xla {
 namespace {
@@ -114,5 +123,43 @@ TEST(PjRtStreamExecutorClientTest, DonateSameBufferTwice) {
               ::testing::HasSubstr("f(donate(a), donate(a))"));
 }
 
+TEST(PjRtStreamExecutorClientTest, DonateWithControlDependency) {
+  TF_ASSERT_OK_AND_ASSIGN(auto client, GetClient());
+  auto literal = LiteralUtil::CreateR2({{1, 2, 3}, {4, 5, 6}});
+  TF_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<PjRtBuffer> buffer,
+      client->BufferFromHostLiteral(literal, client->addressable_devices()[0]));
+
+  auto avr = tsl::MakeUnconstructedAsyncValueRef<absl::Status>();
+  PjRtFuture<absl::Status> future(avr);
+  auto blocked_buffer =
+      std::move(*(buffer->DonateWithControlDependency(future)));
+  EXPECT_TRUE(buffer->IsDeleted());
+
+  buffer.reset();
+  absl::Mutex mu;
+  auto result_literal = std::make_shared<Literal>(
+      ShapeUtil::DeviceShapeToHostShape(blocked_buffer->on_device_shape()));
+  bool got_literal = false;
+  blocked_buffer->ToLiteral(result_literal.get(), [&](absl::Status s) {
+    absl::MutexLock l(&mu);
+    TF_ASSERT_OK(s);
+    got_literal = true;
+  });
+  blocked_buffer.reset();
+
+  EXPECT_FALSE(got_literal);
+
+  avr.emplace(tsl::OkStatus());
+  EXPECT_TRUE(future.IsReady());
+
+  {
+    absl::MutexLock l(&mu);
+    mu.Await(absl::Condition(&got_literal));
+  }
+
+  TF_ASSERT_OK(literal_comparison::Equal(literal, *result_literal));
+}
+
 }  // namespace
 }  // namespace xla

From 19ce6551b9fd0f65e2467d3ba435243f9224d61e Mon Sep 17 00:00:00 2001
From: Shixin Li <shixinli@google.com>
Date: Wed, 27 Sep 2023 14:55:13 -0700
Subject: [PATCH 349/567] Remove "no_tap" tag for aot test.

PiperOrigin-RevId: 568968719
---
 third_party/xla/xla/pjrt/gpu/BUILD | 1 -
 1 file changed, 1 deletion(-)

diff --git a/third_party/xla/xla/pjrt/gpu/BUILD b/third_party/xla/xla/pjrt/gpu/BUILD
index 28addefa812c7a..eb1ddc2fad0b38 100644
--- a/third_party/xla/xla/pjrt/gpu/BUILD
+++ b/third_party/xla/xla/pjrt/gpu/BUILD
@@ -285,7 +285,6 @@ xla_cc_test(
         "config-cuda-only",
         "gpu",
         "no_oss",
-        "no_tap",
         "requires-gpu-nvidia",
     ],
     deps = [

From 11d397a9868a9b9531ee3ee112784508f846b6d7 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 27 Sep 2023 15:32:31 -0700
Subject: [PATCH 350/567] 1.Update the StatisticsGen logic to handle the custom
 file pattern defined by user. 2.Update get_split_tfxio to handle splits with
 custom file pattern properties.

PiperOrigin-RevId: 568978781
---
 tensorflow/python/util/protobuf/BUILD | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/tensorflow/python/util/protobuf/BUILD b/tensorflow/python/util/protobuf/BUILD
index 783e6a64707329..85c44a9eedccfa 100644
--- a/tensorflow/python/util/protobuf/BUILD
+++ b/tensorflow/python/util/protobuf/BUILD
@@ -1,7 +1,8 @@
 # Tensorflow protobuf utility package
 
-# Placeholder: load py_proto_library
 load("//tensorflow:strict.default.bzl", "py_strict_library")
+
+# Placeholder: load py_proto_library
 load("//tensorflow:tensorflow.default.bzl", "get_compatible_with_portable", "tf_py_strict_test")
 load("//tensorflow/core/platform:build_config.bzl", "tf_proto_library")  # @unused
 
@@ -77,6 +78,7 @@ py_strict_library(
     visibility = visibility + [
         "//tensorflow:__pkg__",
         "//third_party/py/tensorflow_core:__subpackages__",
+        "//third_party/py/tensorflow_data_validation:__subpackages__",
         "//third_party/py/tf_agents:__subpackages__",
         "//third_party/py/tfx:__subpackages__",
     ],

From f95a4a1910d5bcc379b96093b6a83ef7be216880 Mon Sep 17 00:00:00 2001
From: Mason Chang <masonchang@google.com>
Date: Wed, 27 Sep 2023 16:04:08 -0700
Subject: [PATCH 351/567] Cleanup error logging when falling back to combined
 bridge

PiperOrigin-RevId: 568987102
---
 .../mlir/tf2xla/api/v2/legalize_tf.cc         | 24 ++++++++-----------
 1 file changed, 10 insertions(+), 14 deletions(-)

diff --git a/tensorflow/compiler/mlir/tf2xla/api/v2/legalize_tf.cc b/tensorflow/compiler/mlir/tf2xla/api/v2/legalize_tf.cc
index ae7bfa8076d917..2a2d0608f0ad55 100644
--- a/tensorflow/compiler/mlir/tf2xla/api/v2/legalize_tf.cc
+++ b/tensorflow/compiler/mlir/tf2xla/api/v2/legalize_tf.cc
@@ -98,21 +98,18 @@ tsl::StatusOr<tensorflow::XlaCompilationResult> LegalizeMlirToHlo(
     IncrementTfMlirBridgeSecondPhaseCounter(
         MlirBridgeSecondPhaseMetric::kMlirWithFallbackModeSuccess);
     return *compilation_result;
-  }
-
-  bool filtered_graph = false;
-  if (mlir_bridge_status.status() == CompileToHloGraphAnalysisFailedError()) {
+  } else if (mlir_bridge_status.status() ==
+             CompileToHloGraphAnalysisFailedError()) {
     VLOG(1) << "Filtered out MLIR computation to XLA HLO using MLIR tf2xla "
-               "bridge. Falling back to old (non-MLIR) bridge.";
-    filtered_graph = true;
+               "bridge. Falling back to Combined Bridge.";
   } else {
+    VLOG(1) << "Failed to compile MLIR computation to XLA HLO using "
+               "MLIR Bridge. Falling back to Combined Bridge.";
+    tsl::error_logging::Log(kBridgeComponent, "TFXLA_API_V2_PHASE2_MLIR_BRIDGE",
+                            mlir_bridge_status.status().ToString())
+        .IgnoreError();
     IncrementTfMlirBridgeSecondPhaseCounter(
         MlirBridgeSecondPhaseMetric::kMlirWithFallbackModeFailure);
-
-    VLOG(1) << "Failed to compile MLIR computation to XLA HLO using MLIR "
-               "tf2xla bridge. Falling back to old (non-MLIR) bridge. MLIR "
-               "bridge compilation status: "
-            << mlir_bridge_status.status();
   }
 
   auto combined_bridge_status = internal::LegalizeTfToHlo(
@@ -127,9 +124,8 @@ tsl::StatusOr<tensorflow::XlaCompilationResult> LegalizeMlirToHlo(
     return *compilation_result;
   }
 
-  VLOG(1)
-      << "Failed to compile MLIR computation to XLA HLO using "
-         "Combined MLIR and XlaBuilder Bridge. Falling back to Graph Bridge.";
+  VLOG(1) << "Failed to compile MLIR computation to XLA HLO using "
+             "Combined MLIR and XlaBuilder Bridge. Could not generate HLO.";
   tsl::error_logging::Log(kBridgeComponent, "TFXLA_API_V2_COMBINED_BRIDGE",
                           combined_bridge_status.status().ToString())
       .IgnoreError();

From 62b992d77c932b663daf96d2f353f7729a731d4b Mon Sep 17 00:00:00 2001
From: Haibo Huang <hhb@google.com>
Date: Wed, 27 Sep 2023 16:36:26 -0700
Subject: [PATCH 352/567] Not an error if failed to open
 libtensorflow_framework

PiperOrigin-RevId: 568995336
---
 tensorflow/core/tpu/tpu_api_dlsym_initializer.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tensorflow/core/tpu/tpu_api_dlsym_initializer.cc b/tensorflow/core/tpu/tpu_api_dlsym_initializer.cc
index b4a871336b9cb2..a6d587b27abbe0 100644
--- a/tensorflow/core/tpu/tpu_api_dlsym_initializer.cc
+++ b/tensorflow/core/tpu/tpu_api_dlsym_initializer.cc
@@ -91,8 +91,8 @@ absl::Status FindAndLoadTpuLibrary() {
     LOG(INFO) << "Opening library: " << so_name;
     void* tf_lib = dlopen(so_name, RTLD_NOW | RTLD_GLOBAL);
     if (tf_lib == nullptr) {
-      return absl::InternalError(
-          absl::StrCat("Failed to open libtensorflow ", dlerror()));
+      LOG(WARNING) << "Failed to open library " << dlerror()
+                   << ". This may be expected if Tensorflow API is not used";
     }
   }
 

From b210366946ce38e3392a515c29b6b71a98df1fd8 Mon Sep 17 00:00:00 2001
From: Fiona Lang <flang@google.com>
Date: Wed, 27 Sep 2023 17:06:38 -0700
Subject: [PATCH 353/567] Delete remaining imports from python/__init__.py.

PiperOrigin-RevId: 569002242
---
 tensorflow/python/__init__.py | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/tensorflow/python/__init__.py b/tensorflow/python/__init__.py
index 00ad7e3138798d..082cf4e7b175e4 100644
--- a/tensorflow/python/__init__.py
+++ b/tensorflow/python/__init__.py
@@ -21,11 +21,6 @@
 import tensorflow as tf
 """
 
-import ctypes
-import importlib
-import sys
-import traceback
-
 # We aim to keep this file minimal and ideally remove completely.
 # If you are adding a new file with @tf_export decorators,
 # import it in modules_with_exports.py instead.

From 66aa52232dcf00105c6bab09633568eb6b022c8e Mon Sep 17 00:00:00 2001
From: Fergus Henderson <fergus@google.com>
Date: Wed, 27 Sep 2023 18:05:12 -0700
Subject: [PATCH 354/567] Add APIs for retrieving the TF Lite Extension APIs
 version number.

PiperOrigin-RevId: 569014286
---
 RELEASE.md                           |  3 +++
 tensorflow/lite/core/c/c_api.cc      |  4 ++++
 tensorflow/lite/core/c/c_api.h       | 20 ++++++++++++++++++++
 tensorflow/lite/core/c/c_api_test.cc | 15 +++++++++++++++
 tensorflow/lite/version.h            |  5 +++++
 5 files changed, 47 insertions(+)

diff --git a/RELEASE.md b/RELEASE.md
index 5e6cc62a627a62..d4b23dfa88c25d 100644
--- a/RELEASE.md
+++ b/RELEASE.md
@@ -75,6 +75,9 @@
        *    `TfLiteSignatureRunnerInvoke`
        *    `TfLiteSignatureRunnerResizeInputTensor`
 
+    * New C API function `TfLiteExtensionApisVersion` added to
+      `tensorflow/lite/c/c_api.h`.
+
 * Android NDK r25 is supported.
 
 ### Bug Fixes and Other Changes
diff --git a/tensorflow/lite/core/c/c_api.cc b/tensorflow/lite/core/c/c_api.cc
index 343a2866645894..e5150cc69c35bb 100644
--- a/tensorflow/lite/core/c/c_api.cc
+++ b/tensorflow/lite/core/c/c_api.cc
@@ -55,6 +55,10 @@ const char* TfLiteVersion() { return TFLITE_VERSION_STRING; }
 
 int TfLiteSchemaVersion() { return TFLITE_SCHEMA_VERSION; }
 
+const char* TfLiteExtensionApisVersion() {
+  return TFLITE_EXTENSION_APIS_VERSION_STRING;
+}
+
 TfLiteModel* TfLiteModelCreate(const void* model_data, size_t model_size) {
   auto model = tflite::FlatBufferModel::VerifyAndBuildFromBuffer(
       static_cast<const char*>(model_data), model_size);
diff --git a/tensorflow/lite/core/c/c_api.h b/tensorflow/lite/core/c/c_api.h
index 07ca048fe08797..e299970e1691ec 100644
--- a/tensorflow/lite/core/c/c_api.h
+++ b/tensorflow/lite/core/c/c_api.h
@@ -130,6 +130,26 @@ typedef struct TfLiteSignatureRunner TfLiteSignatureRunner;
 /// e.g. "2.12.0" or "2.13.0-rc2".
 TFL_CAPI_EXPORT extern const char* TfLiteVersion(void);
 
+// --------------------------------------------------------------------------
+/// The TensorFlow Lite Extension APIs version.
+///
+/// Returns a pointer to a statically allocated string that is the version
+/// number of the TF Lite Extension APIs supported by the (potentially
+/// dynamically loaded) TF Lite Runtime library.  The TF Lite "Extension APIs"
+/// are the APIs for extending TF Lite with custom ops and delegates.
+/// More specifically, this version number covers the (non-experimental)
+/// functionality documented in the following header files:
+///
+///   * lite/c/c_api_opaque.h
+///   * lite/c/common.h
+///   * lite/c/builtin_op_data.h
+///   * lite/builtin_ops.h
+///
+/// This version number uses semantic versioning, and the return value should
+/// be in semver 2 format <http://semver.org>, starting with MAJOR.MINOR.PATCH,
+/// e.g. "2.14.0" or "2.15.0-rc2".
+TFL_CAPI_EXPORT extern const char* TfLiteExtensionApisVersion(void);
+
 /// The supported TensorFlow Lite model file Schema version.
 ///
 /// Returns the (major) version number of the Schema used for model
diff --git a/tensorflow/lite/core/c/c_api_test.cc b/tensorflow/lite/core/c/c_api_test.cc
index 20da6709ce5f09..abb0083e12578c 100644
--- a/tensorflow/lite/core/c/c_api_test.cc
+++ b/tensorflow/lite/core/c/c_api_test.cc
@@ -58,6 +58,21 @@ TEST(CApiSimple, Version) {
   EXPECT_STREQ(TfLiteVersion(), version);
 }
 
+TEST(CApiSimple, ExtensionApisVersion) {
+  const char* version = TfLiteExtensionApisVersion();
+  ASSERT_NE(version, nullptr);
+  EXPECT_STRNE(version, "");
+  int major = -1, minor = -1, patch = -1;
+  int ret = sscanf(version, "%d.%d.%d", &major, &minor, &patch);
+  // The version number should contain all three components.
+  EXPECT_GE(ret, 3);
+  EXPECT_GE(major, 0);
+  EXPECT_GE(minor, 0);
+  EXPECT_GE(patch, 0);
+  // Calling the function again should give the same result.
+  EXPECT_STREQ(TfLiteExtensionApisVersion(), version);
+}
+
 TEST(CApiSimple, SchemaVersion) {
   // The following checks will need updating if we change the schema version.
   EXPECT_EQ(TfLiteSchemaVersion(), 3);
diff --git a/tensorflow/lite/version.h b/tensorflow/lite/version.h
index f667447bc470a1..3cd3de26edfd22 100644
--- a/tensorflow/lite/version.h
+++ b/tensorflow/lite/version.h
@@ -26,4 +26,9 @@ limitations under the License.
 // This value is currently shared with that of TensorFlow.
 #define TFLITE_VERSION_STRING TF_VERSION_STRING
 
+// TensorFlow Lite Extension APIs version.
+// This is the semantic version number for the custom op and delegate APIs.
+// This value is currently shared with that of TensorFlow Lite.
+#define TFLITE_EXTENSION_APIS_VERSION_STRING TFLITE_VERSION_STRING
+
 #endif  // TENSORFLOW_LITE_VERSION_H_

From 812e85802a90c4158598bed297fe946da642cd12 Mon Sep 17 00:00:00 2001
From: Fergus Henderson <fergus@google.com>
Date: Wed, 27 Sep 2023 18:08:09 -0700
Subject: [PATCH 355/567] Remove some unnecessary deps.

PiperOrigin-RevId: 569015036
---
 tensorflow/lite/c/BUILD | 13 ++-----------
 1 file changed, 2 insertions(+), 11 deletions(-)

diff --git a/tensorflow/lite/c/BUILD b/tensorflow/lite/c/BUILD
index e2c4b896b12825..335dcfa671f5e8 100644
--- a/tensorflow/lite/c/BUILD
+++ b/tensorflow/lite/c/BUILD
@@ -1,3 +1,4 @@
+load("//tensorflow:tensorflow.default.bzl", "get_compatible_with_portable")
 load(
     "//tensorflow/lite:build_def.bzl",
     "tflite_cc_library_with_c_headers_test",
@@ -8,7 +9,6 @@ load(
     "tflite_linkopts_no_undefined",
     "tflite_self_contained_libs_test_suite",
 )
-load("//tensorflow:tensorflow.default.bzl", "get_compatible_with_portable")
 load("//tensorflow/lite:special_rules.bzl", "c_api_opaque_internal_visibility_allowlist", "tflite_portable_test_suite")
 load("//tensorflow/lite/core/shims:cc_library_with_tflite.bzl", "cc_library_with_tflite_with_c_headers_test")
 
@@ -113,16 +113,7 @@ cc_library_with_tflite_with_c_headers_test(
     tflite_deps = [
         ":c_api",
     ],
-    deps = [
-        ":c_api_opaque_internal",
-        ":common_internal",
-        "//tensorflow/lite/core:subgraph",
-        "//tensorflow/lite/core/c:c_api",
-        "//tensorflow/lite/core/c:c_api_experimental",
-        "//tensorflow/lite/core/c:c_api_types",
-        "//tensorflow/lite/core/c:common",
-        "//tensorflow/lite/kernels:kernel_util",
-    ],
+    deps = ["//tensorflow/lite/core/c:c_api_experimental"],
 )
 
 # Same as ":c_api_experimental", but without linking in the default CreateOpResolver implementation.

From ea1e54bb56fc701d9930e6855574b7151a48ee5b Mon Sep 17 00:00:00 2001
From: Fiona Lang <flang@google.com>
Date: Wed, 27 Sep 2023 18:46:53 -0700
Subject: [PATCH 356/567] Fix docstring test for tf.summary.

PiperOrigin-RevId: 569022676
---
 tensorflow/python/tools/api/generator/doc_srcs.py | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/tensorflow/python/tools/api/generator/doc_srcs.py b/tensorflow/python/tools/api/generator/doc_srcs.py
index 5aaed2beb2323e..acbd65afc0ca15 100644
--- a/tensorflow/python/tools/api/generator/doc_srcs.py
+++ b/tensorflow/python/tools/api/generator/doc_srcs.py
@@ -89,8 +89,14 @@ def __init__(self, docstring=None, docstring_module_name=None):
         DocSource(docstring_module_name='ops.sparse_ops'),
     'strings':
         DocSource(docstring_module_name='ops.string_ops'),
-    'summary':
-        DocSource(docstring_module_name='summary.summary'),
+    'summary': DocSource(
+        docstring=(
+            'Operations for writing summary data, for use in analysis and'
+            ' visualization. See the [Summaries and'
+            ' TensorBoard](https://www.tensorflow.org/guide/summaries_and_tensorboard)'
+            ' guide.'
+        )
+    ),
     'sysconfig':
         DocSource(docstring='System configuration library.'),
     'test':

From d2d93477676a1c7735b01b51d4700eccc838b258 Mon Sep 17 00:00:00 2001
From: Anlun Xu <anlunx@google.com>
Date: Wed, 27 Sep 2023 19:55:10 -0700
Subject: [PATCH 357/567] [xla:gpu] Enable cuBLAS gemms in GPU graphs

PiperOrigin-RevId: 569034056
---
 third_party/xla/xla/debug_options_flags.cc | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/third_party/xla/xla/debug_options_flags.cc b/third_party/xla/xla/debug_options_flags.cc
index 97d2c5dfab2c74..7d778c051fd569 100644
--- a/third_party/xla/xla/debug_options_flags.cc
+++ b/third_party/xla/xla/debug_options_flags.cc
@@ -92,9 +92,8 @@ DebugOptions DefaultDebugOptionsIgnoringFlags() {
   // flag.
   opts.set_xla_gpu_enable_cublaslt(false);
 
-  // TODO(b/258036887): Enable gpu_graph_level=2. Currently blocked by CUDA 12
-  // integration.
-  opts.set_xla_gpu_graph_level(1);
+  // TODO(b/258036887): Enable gpu_graph_level=3.
+  opts.set_xla_gpu_graph_level(2);
   opts.set_xla_gpu_graph_num_runs_to_instantiate(-1);
   opts.set_xla_gpu_enable_persistent_temp_buffers(false);
   opts.set_xla_gpu_graph_min_graph_size(5);

From ef40d017f8dd0a8602ab0565c14a689d41ea4e3c Mon Sep 17 00:00:00 2001
From: David Majnemer <majnemer@google.com>
Date: Wed, 27 Sep 2023 20:41:35 -0700
Subject: [PATCH 358/567] [TSL] Use down_cast when down casting unique_ptr

No functional changes.

PiperOrigin-RevId: 569045182
---
 tensorflow/compiler/aot/compile.cc                     |  2 +-
 third_party/xla/xla/BUILD                              |  2 ++
 third_party/xla/xla/service/gpu/BUILD                  |  1 +
 .../xla/xla/service/gpu/compile_module_to_llvm_ir.cc   | 10 ++++++----
 third_party/xla/xla/service/gpu/gemm_rewriter.cc       |  2 +-
 .../xla/xla/tests/local_client_aot_test_helper.cc      |  2 +-
 third_party/xla/xla/util.h                             |  9 ++++++---
 7 files changed, 18 insertions(+), 10 deletions(-)

diff --git a/tensorflow/compiler/aot/compile.cc b/tensorflow/compiler/aot/compile.cc
index 610abbd0d0f25c..6bcea32f897e85 100644
--- a/tensorflow/compiler/aot/compile.cc
+++ b/tensorflow/compiler/aot/compile.cc
@@ -94,7 +94,7 @@ Status CompileXla(xla::CompileOnlyClient* client,
                            aot_or.status().message());
   }
   compile_result->aot =
-      xla::unique_ptr_static_cast<xla::cpu::CpuAotCompilationResult>(
+      xla::unique_ptr_down_cast<xla::cpu::CpuAotCompilationResult>(
           std::move(aot_or.value().back()));
   compile_result->entry_point = aot_opts.entry_point_name();
   compile_result->pointer_size =
diff --git a/third_party/xla/xla/BUILD b/third_party/xla/xla/BUILD
index 66dfb7e4663851..b51515b3f02e93 100644
--- a/third_party/xla/xla/BUILD
+++ b/third_party/xla/xla/BUILD
@@ -313,12 +313,14 @@ cc_library(
         "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/container:flat_hash_set",
         "@com_google_absl//absl/container:inlined_vector",
+        "@com_google_absl//absl/memory",
         "@com_google_absl//absl/numeric:bits",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/strings:str_format",
         "@com_google_absl//absl/types:span",
         "@local_tsl//tsl/lib/gtl:iterator_range",
         "@local_tsl//tsl/lib/math:math_util",
+        "@local_tsl//tsl/platform:casts",
         "@local_tsl//tsl/platform:env",
         "@local_tsl//tsl/platform:errors",
         "@local_tsl//tsl/platform:logging",
diff --git a/third_party/xla/xla/service/gpu/BUILD b/third_party/xla/xla/service/gpu/BUILD
index 277946771f3516..ac4bcd421e514b 100644
--- a/third_party/xla/xla/service/gpu/BUILD
+++ b/third_party/xla/xla/service/gpu/BUILD
@@ -2501,6 +2501,7 @@ cc_library(
         "@llvm-project//mlir:IR",
         "@llvm-project//mlir:Pass",
         "@llvm-project//mlir:Support",
+        "@local_tsl//tsl/platform:casts",
         "@local_tsl//tsl/platform:env",
         "@local_tsl//tsl/platform:errors",
         "@local_tsl//tsl/platform:statusor",
diff --git a/third_party/xla/xla/service/gpu/compile_module_to_llvm_ir.cc b/third_party/xla/xla/service/gpu/compile_module_to_llvm_ir.cc
index 574bfaf63aa5c6..80779f69fc489d 100644
--- a/third_party/xla/xla/service/gpu/compile_module_to_llvm_ir.cc
+++ b/third_party/xla/xla/service/gpu/compile_module_to_llvm_ir.cc
@@ -76,6 +76,7 @@ limitations under the License.
 #include "xla/translate/mhlo_to_lhlo_with_xla/mhlo_to_lhlo_with_xla.h"
 #include "xla/util.h"
 #include "xla/xla_data.pb.h"
+#include "tsl/platform/casts.h"
 #include "tsl/platform/env.h"
 #include "tsl/platform/errors.h"
 #include "tsl/platform/statusor.h"
@@ -157,19 +158,20 @@ void ForAllThunks(const std::function<void(Thunk*)>& fn,
                   ThunkSequence* thunk_sequence) {
   for (std::unique_ptr<Thunk>& thunk : *thunk_sequence) {
     if (thunk->kind() == Thunk::kConditional) {
-      auto* cond_thunk = static_cast<ConditionalThunk*>(thunk.get());
+      auto* cond_thunk = tensorflow::down_cast<ConditionalThunk*>(thunk.get());
       for (const std::unique_ptr<SequentialThunk>& branch_thunks :
            cond_thunk->branch_thunks()) {
         ForAllThunks(fn, &branch_thunks->thunks());
       }
     } else if (thunk->kind() == Thunk::kFor) {
-      auto* for_thunk = static_cast<ForThunk*>(thunk.get());
+      auto* for_thunk = tensorflow::down_cast<ForThunk*>(thunk.get());
       ForAllThunks(fn, &for_thunk->body_thunk_sequence()->thunks());
     } else if (thunk->kind() == Thunk::kSequential) {
-      auto* sequential_thunk = static_cast<SequentialThunk*>(thunk.get());
+      auto* sequential_thunk =
+          tensorflow::down_cast<SequentialThunk*>(thunk.get());
       ForAllThunks(fn, &sequential_thunk->thunks());
     } else if (thunk->kind() == Thunk::kWhile) {
-      auto* while_thunk = static_cast<WhileThunk*>(thunk.get());
+      auto* while_thunk = tensorflow::down_cast<WhileThunk*>(thunk.get());
       ForAllThunks(fn, &while_thunk->condition_thunk_sequence()->thunks());
       ForAllThunks(fn, &while_thunk->body_thunk_sequence()->thunks());
     } else {
diff --git a/third_party/xla/xla/service/gpu/gemm_rewriter.cc b/third_party/xla/xla/service/gpu/gemm_rewriter.cc
index a6e899edb06e98..b8084049a5aa07 100644
--- a/third_party/xla/xla/service/gpu/gemm_rewriter.cc
+++ b/third_party/xla/xla/service/gpu/gemm_rewriter.cc
@@ -387,7 +387,7 @@ auto BcastConstScalarNear(double value) {
         // Not a very robust floating-point comparison, but good enough for our
         // purposes.
         std::optional<double> actual =
-            static_cast<const HloConstantInstruction *>(instr)
+            tensorflow::down_cast<const HloConstantInstruction *>(instr)
                 ->literal()
                 .GetAsDouble({});
         if (!actual.has_value()) return false;
diff --git a/third_party/xla/xla/tests/local_client_aot_test_helper.cc b/third_party/xla/xla/tests/local_client_aot_test_helper.cc
index 8e3e7284eeac57..2fcd11a9cd55f0 100644
--- a/third_party/xla/xla/tests/local_client_aot_test_helper.cc
+++ b/third_party/xla/xla/tests/local_client_aot_test_helper.cc
@@ -93,7 +93,7 @@ int main(int argc, char** argv) {
       xla::cpu::CpuAotCompilationOptions::RelocationModel::Static);
 
   auto results = client->CompileAheadOfTime({instance}, options).value();
-  auto result = xla::unique_ptr_static_cast<xla::cpu::CpuAotCompilationResult>(
+  auto result = xla::unique_ptr_down_cast<xla::cpu::CpuAotCompilationResult>(
       std::move(results.front()));
   // It's lame to hard-code the buffer assignments, but we need
   // local_client_aot_test.cc to be able to easily invoke the function.
diff --git a/third_party/xla/xla/util.h b/third_party/xla/xla/util.h
index 84509715e34686..731db9967ef7d3 100644
--- a/third_party/xla/xla/util.h
+++ b/third_party/xla/xla/util.h
@@ -23,6 +23,7 @@ limitations under the License.
 #include <array>
 #include <functional>
 #include <limits>
+#include <memory>
 #include <string>
 #include <type_traits>
 #include <utility>
@@ -31,6 +32,7 @@ limitations under the License.
 #include "absl/algorithm/container.h"
 #include "absl/base/thread_annotations.h"
 #include "absl/container/inlined_vector.h"
+#include "absl/memory/memory.h"
 #include "absl/numeric/bits.h"
 #include "absl/strings/str_cat.h"
 #include "absl/strings/str_format.h"
@@ -40,6 +42,7 @@ limitations under the License.
 #include "xla/status_macros.h"
 #include "xla/xla_data.pb.h"
 #include "tsl/lib/math/math_util.h"
+#include "tsl/platform/casts.h"
 #include "tsl/platform/errors.h"  // IWYU pragma: keep
 
 namespace xla {
@@ -612,10 +615,10 @@ T NanWithSignAndPayload(bool sign, uint64_t nan_payload) {
   return absl::bit_cast<T>(rep);
 }
 
-// Utility for performing a static_cast<> on a std::unique_ptr<>.
+// Utility for performing a down_cast<> on a std::unique_ptr<>.
 template <typename Derived, typename Base>
-std::unique_ptr<Derived> unique_ptr_static_cast(std::unique_ptr<Base> ptr) {
-  return std::unique_ptr<Derived>(static_cast<Derived*>(ptr.release()));
+std::unique_ptr<Derived> unique_ptr_down_cast(std::unique_ptr<Base> ptr) {
+  return absl::WrapUnique(tensorflow::down_cast<Derived*>(ptr.release()));
 }
 
 int64_t Product(absl::Span<const int64_t> xs);

From f92a70a41e76db5d0829120f46d5b001f89decdf Mon Sep 17 00:00:00 2001
From: Junwhan Ahn <junwhan@google.com>
Date: Wed, 27 Sep 2023 22:14:45 -0700
Subject: [PATCH 359/567] Destruct objects owned by
 `WeakrefLRUCache::CacheEntry` out of band using `GlobalPyRefManager()`

This makes fewer assumptions about whether the thread that destructs `CacheEntry` holds the GIL, which is difficult to reason about due to `xla::LRUCache`'s use of `std::shared_ptr<CacheEntry>`.

The following changes have been made in JAX to accommodate the behavior differences from direct destruction to GC:

* Since `PyLoadedExecutable`s cached in `WeakrefLRUCache` are now destructed out of band, `PyClient::LiveExecutables()` calls `GlobalPyRefManager()->CollectGarbage()` to make the returned information accurate and up to date.
* `test_jit_reference_dropping` has been updated to call `gc.collect()` before verifying the live executable counts since the destruction of executables owned by weak ref maps is now done out of band as part of `GlobalPyRefManager`'s GC.

PiperOrigin-RevId: 569062402
---
 third_party/xla/xla/python/BUILD                | 2 ++
 third_party/xla/xla/python/py_client.cc         | 1 +
 third_party/xla/xla/python/weakref_lru_cache.cc | 6 ++++++
 3 files changed, 9 insertions(+)

diff --git a/third_party/xla/xla/python/BUILD b/third_party/xla/xla/python/BUILD
index 7d9b296bd27a9e..d177a0edbfea0a 100644
--- a/third_party/xla/xla/python/BUILD
+++ b/third_party/xla/xla/python/BUILD
@@ -946,9 +946,11 @@ cc_library(
     features = ["-use_header_modules"],
     visibility = ["//visibility:public"],
     deps = [
+        ":python_ref_manager",
         # placeholder for index annotation deps
         "@com_google_absl//absl/cleanup",
         "@com_google_absl//absl/synchronization",
+        "@com_google_absl//absl/types:span",
         "//xla/pjrt:lru_cache",
         "@pybind11",
     ],
diff --git a/third_party/xla/xla/python/py_client.cc b/third_party/xla/xla/python/py_client.cc
index ce6e4dcba1c2e7..234086a525ed4b 100644
--- a/third_party/xla/xla/python/py_client.cc
+++ b/third_party/xla/xla/python/py_client.cc
@@ -102,6 +102,7 @@ std::vector<py::object> PyClient::LiveBuffers() {
 
 std::vector<std::shared_ptr<PyLoadedExecutable>> PyClient::LiveExecutables() {
   CHECK(PyGILState_Check());
+  GlobalPyRefManager()->CollectGarbage();
   std::vector<std::shared_ptr<PyLoadedExecutable>> executables;
   for (PyLoadedExecutable* exec = executables_; exec; exec = exec->next_) {
     if (!exec->is_deleted()) {
diff --git a/third_party/xla/xla/python/weakref_lru_cache.cc b/third_party/xla/xla/python/weakref_lru_cache.cc
index 7d0447b2395035..2f84767f02053a 100644
--- a/third_party/xla/xla/python/weakref_lru_cache.cc
+++ b/third_party/xla/xla/python/weakref_lru_cache.cc
@@ -23,8 +23,10 @@ limitations under the License.
 
 #include "absl/cleanup/cleanup.h"
 #include "absl/synchronization/notification.h"
+#include "absl/types/span.h"
 #include "pybind11/pybind11.h"  // from @pybind11
 #include "xla/pjrt/lru_cache.h"
+#include "xla/python/python_ref_manager.h"
 
 namespace jax {
 namespace {
@@ -94,6 +96,10 @@ class WeakrefLRUCache : public std::enable_shared_from_this<WeakrefLRUCache> {
     pybind11::object result;
     absl::Notification completed;
     std::thread::id thread_id = std::this_thread::get_id();
+
+    ~CacheEntry() {
+      xla::GlobalPyRefManager()->AddGarbage(absl::MakeSpan(&result, 1));
+    }
   };
 
   struct CacheInfo {

From 354634b03f7e9fa0816e304ad6397f119ae00417 Mon Sep 17 00:00:00 2001
From: Eugene Zhulenev <ezhulenev@google.com>
Date: Wed, 27 Sep 2023 22:55:09 -0700
Subject: [PATCH 360/567] [stream_executor] NFC: Remove includes of a private
 stream_executor_pimpl header

`stream_executor_pimpl` is an implementation detail and should not be included outside of the `stream_executor` package. Instead, include `stream_executor.h`, which exports the StreamExecutor public API.

PiperOrigin-RevId: 569070331
---
 tensorflow/core/tfrt/saved_model/saved_model_util.cc   |  2 +-
 tensorflow/core/tpu/kernels/transfer_ops.cc            |  2 +-
 tensorflow/core/tpu/kernels/transfer_ops.h             |  2 +-
 .../xla/xla/backends/interpreter/executable_base.cc    |  2 +-
 third_party/xla/xla/service/cpu/cpu_compiler.cc        |  2 +-
 third_party/xla/xla/service/gpu/autotuner_util.h       |  2 +-
 third_party/xla/xla/service/gpu/buffer_comparator.cc   | 10 +++++-----
 .../xla/xla/service/gpu/compile_module_to_llvm_ir.cc   |  1 -
 .../xla/xla/service/gpu/compile_module_to_llvm_ir.h    |  1 -
 .../xla/xla/service/gpu/conv_algorithm_picker.cc       |  2 +-
 third_party/xla/xla/service/gpu/gpu_compiler.cc        |  1 -
 third_party/xla/xla/service/gpu/gpu_compiler.h         |  1 -
 third_party/xla/xla/service/gpu/gpu_executable.cc      |  2 +-
 third_party/xla/xla/service/gpu/runtime/graph_launch.h |  1 -
 .../xla/xla/service/gpu/tests/gpu_fused_mha_test.cc    |  2 +-
 third_party/xla/xla/stream_executor/cuda/cuda_dnn.cc   |  2 +-
 .../xla/xla/stream_executor/cuda/cuda_executor.cc      |  2 +-
 .../xla/xla/stream_executor/cuda/cuda_platform.h       |  2 +-
 third_party/xla/xla/stream_executor/event.cc           |  2 +-
 third_party/xla/xla/stream_executor/gpu/asm_compiler.h |  2 +-
 third_party/xla/xla/stream_executor/gpu/gpu_executor.h |  2 +-
 .../xla/xla/stream_executor/gpu/redzone_allocator.cc   |  2 +-
 .../xla/xla/stream_executor/host/host_platform.h       |  2 +-
 third_party/xla/xla/stream_executor/rocm/rocm_dnn.cc   |  2 +-
 .../xla/xla/stream_executor/rocm/rocm_gpu_executor.cc  |  2 +-
 .../xla/xla/stream_executor/rocm/rocm_platform.h       |  2 +-
 third_party/xla/xla/stream_executor/stream.cc          |  2 +-
 .../xla/stream_executor/temporary_memory_manager.cc    |  2 +-
 .../xla/xla/stream_executor/tpu/tpu_platform.cc        |  2 +-
 29 files changed, 28 insertions(+), 33 deletions(-)

diff --git a/tensorflow/core/tfrt/saved_model/saved_model_util.cc b/tensorflow/core/tfrt/saved_model/saved_model_util.cc
index dc3671c57cab11..f03f37a1966704 100644
--- a/tensorflow/core/tfrt/saved_model/saved_model_util.cc
+++ b/tensorflow/core/tfrt/saved_model/saved_model_util.cc
@@ -49,7 +49,7 @@ limitations under the License.
 #include "tensorflow/compiler/mlir/tfrt/translate/tfrt_compile_options.h"
 #include "xla/stream_executor/gpu/gpu_init.h"
 #include "xla/stream_executor/platform.h"
-#include "xla/stream_executor/stream_executor_pimpl.h"
+#include "xla/stream_executor/stream_executor.h"
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/lib/monitoring/gauge.h"
 #include "tensorflow/core/protobuf/meta_graph.pb.h"
diff --git a/tensorflow/core/tpu/kernels/transfer_ops.cc b/tensorflow/core/tpu/kernels/transfer_ops.cc
index 33245ef06b24e1..b7990aa4a0999d 100644
--- a/tensorflow/core/tpu/kernels/transfer_ops.cc
+++ b/tensorflow/core/tpu/kernels/transfer_ops.cc
@@ -23,7 +23,7 @@ limitations under the License.
 #include "absl/strings/str_cat.h"
 #include "tensorflow/compiler/jit/xla_device.h"
 #include "xla/literal.h"
-#include "xla/stream_executor/stream_executor_pimpl.h"
+#include "xla/stream_executor/stream_executor.h"
 #include "xla/stream_executor/tpu/noncopyable_buffer.h"
 #include "xla/stream_executor/tpu/tpu_node_context.h"
 #include "xla/stream_executor/tpu/tpu_platform_interface.h"
diff --git a/tensorflow/core/tpu/kernels/transfer_ops.h b/tensorflow/core/tpu/kernels/transfer_ops.h
index b12d923c6b0172..765a8124c18222 100644
--- a/tensorflow/core/tpu/kernels/transfer_ops.h
+++ b/tensorflow/core/tpu/kernels/transfer_ops.h
@@ -21,7 +21,7 @@ limitations under the License.
 #include <string>
 
 #include "xla/literal.h"
-#include "xla/stream_executor/stream_executor_pimpl.h"
+#include "xla/stream_executor/stream_executor.h"
 #include "xla/stream_executor/tpu/noncopyable_buffer.h"
 #include "xla/stream_executor/tpu/tpu_platform_interface.h"
 #include "xla/stream_executor/tpu/tpu_transfer_manager_interface.h"
diff --git a/third_party/xla/xla/backends/interpreter/executable_base.cc b/third_party/xla/xla/backends/interpreter/executable_base.cc
index f286624938b896..bcac33eff56b0c 100644
--- a/third_party/xla/xla/backends/interpreter/executable_base.cc
+++ b/third_party/xla/xla/backends/interpreter/executable_base.cc
@@ -27,7 +27,7 @@ limitations under the License.
 #include "xla/shape_util.h"
 #include "xla/stream_executor/platform.h"
 #include "xla/stream_executor/stream.h"
-#include "xla/stream_executor/stream_executor_pimpl.h"
+#include "xla/stream_executor/stream_executor.h"
 #include "tsl/platform/statusor.h"
 
 namespace xla {
diff --git a/third_party/xla/xla/service/cpu/cpu_compiler.cc b/third_party/xla/xla/service/cpu/cpu_compiler.cc
index 9da505e9d10685..f74baacb59c697 100644
--- a/third_party/xla/xla/service/cpu/cpu_compiler.cc
+++ b/third_party/xla/xla/service/cpu/cpu_compiler.cc
@@ -222,7 +222,7 @@ limitations under the License.
 #include "xla/statusor.h"
 #include "xla/stream_executor/host/host_platform_id.h"
 #include "xla/stream_executor/platform.h"
-#include "xla/stream_executor/stream_executor_pimpl.h"
+#include "xla/stream_executor/stream_executor.h"
 #include "xla/translate/hlo_to_mhlo/hlo_to_mlir_hlo.h"
 #include "xla/util.h"
 #include "xla/xla.pb.h"
diff --git a/third_party/xla/xla/service/gpu/autotuner_util.h b/third_party/xla/xla/service/gpu/autotuner_util.h
index 0633fbd2a2aa40..d25f881274389d 100644
--- a/third_party/xla/xla/service/gpu/autotuner_util.h
+++ b/third_party/xla/xla/service/gpu/autotuner_util.h
@@ -30,7 +30,7 @@ limitations under the License.
 #include "xla/hlo/ir/hlo_instructions.h"
 #include "xla/hlo/ir/hlo_module.h"
 #include "xla/stream_executor/gpu/redzone_allocator.h"
-#include "xla/stream_executor/stream_executor_pimpl.h"
+#include "xla/stream_executor/stream_executor.h"
 #include "xla/types.h"
 #include "xla/xla.pb.h"
 
diff --git a/third_party/xla/xla/service/gpu/buffer_comparator.cc b/third_party/xla/xla/service/gpu/buffer_comparator.cc
index 5d3cbc21f06750..4cb9cee9ec19c5 100644
--- a/third_party/xla/xla/service/gpu/buffer_comparator.cc
+++ b/third_party/xla/xla/service/gpu/buffer_comparator.cc
@@ -29,7 +29,7 @@ limitations under the License.
 #include "xla/stream_executor/device_memory.h"
 #include "xla/stream_executor/gpu/asm_compiler.h"
 #include "xla/stream_executor/kernel.h"
-#include "xla/stream_executor/stream_executor_pimpl.h"
+#include "xla/stream_executor/stream_executor.h"
 #include "xla/util.h"
 #include "tsl/platform/statusor.h"
 
@@ -743,11 +743,11 @@ static const char* buffer_compare_ptx = R"(
 
 $L__BB4_3:
 	{
-	.reg .b32 %temp; 
+	.reg .b32 %temp;
 	mov.b64 	{%r7, %temp}, %fd2;
 	}
 	{
-	.reg .b32 %temp; 
+	.reg .b32 %temp;
 	mov.b64 	{%temp, %r1}, %fd2;
 	}
 	and.b32  	%r8, %r1, 2147483647;
@@ -757,11 +757,11 @@ static const char* buffer_compare_ptx = R"(
 	@%p6 bra 	$L__BB4_6;
 
 	{
-	.reg .b32 %temp; 
+	.reg .b32 %temp;
 	mov.b64 	{%r9, %temp}, %fd1;
 	}
 	{
-	.reg .b32 %temp; 
+	.reg .b32 %temp;
 	mov.b64 	{%temp, %r2}, %fd1;
 	}
 	and.b32  	%r10, %r2, 2147483647;
diff --git a/third_party/xla/xla/service/gpu/compile_module_to_llvm_ir.cc b/third_party/xla/xla/service/gpu/compile_module_to_llvm_ir.cc
index 80779f69fc489d..e68fb0215d502f 100644
--- a/third_party/xla/xla/service/gpu/compile_module_to_llvm_ir.cc
+++ b/third_party/xla/xla/service/gpu/compile_module_to_llvm_ir.cc
@@ -70,7 +70,6 @@ limitations under the License.
 #include "xla/stream_executor/device_description.h"
 #include "xla/stream_executor/rocm/rocm_platform_id.h"
 #include "xla/stream_executor/stream_executor.h"
-#include "xla/stream_executor/stream_executor_pimpl.h"
 #include "xla/translate/hlo_to_mhlo/hlo_utils.h"
 #include "xla/translate/mhlo_to_hlo/location_exporter.h"
 #include "xla/translate/mhlo_to_lhlo_with_xla/mhlo_to_lhlo_with_xla.h"
diff --git a/third_party/xla/xla/service/gpu/compile_module_to_llvm_ir.h b/third_party/xla/xla/service/gpu/compile_module_to_llvm_ir.h
index b757ca13bef35b..8090af9d9aae67 100644
--- a/third_party/xla/xla/service/gpu/compile_module_to_llvm_ir.h
+++ b/third_party/xla/xla/service/gpu/compile_module_to_llvm_ir.h
@@ -32,7 +32,6 @@ limitations under the License.
 #include "xla/statusor.h"
 #include "xla/stream_executor/device_description.h"
 #include "xla/stream_executor/stream_executor.h"
-#include "xla/stream_executor/stream_executor_pimpl.h"
 #include "xla/util.h"
 
 namespace xla {
diff --git a/third_party/xla/xla/service/gpu/conv_algorithm_picker.cc b/third_party/xla/xla/service/gpu/conv_algorithm_picker.cc
index cf8282d7d0084f..627bee57f2d222 100644
--- a/third_party/xla/xla/service/gpu/conv_algorithm_picker.cc
+++ b/third_party/xla/xla/service/gpu/conv_algorithm_picker.cc
@@ -43,7 +43,7 @@ limitations under the License.
 #include "xla/stream_executor/rocm/rocm_platform_id.h"
 #include "xla/stream_executor/scratch_allocator.h"
 #include "xla/stream_executor/stream.h"
-#include "xla/stream_executor/stream_executor_pimpl.h"
+#include "xla/stream_executor/stream_executor.h"
 #include "xla/util.h"
 #include "xla/xla_data.pb.h"
 #include "tsl/platform/logger.h"
diff --git a/third_party/xla/xla/service/gpu/gpu_compiler.cc b/third_party/xla/xla/service/gpu/gpu_compiler.cc
index 83b904ff7639a8..de7585f516d455 100644
--- a/third_party/xla/xla/service/gpu/gpu_compiler.cc
+++ b/third_party/xla/xla/service/gpu/gpu_compiler.cc
@@ -186,7 +186,6 @@ limitations under the License.
 #include "xla/stream_executor/device_description.pb.h"
 #include "xla/stream_executor/dnn.h"
 #include "xla/stream_executor/stream_executor.h"
-#include "xla/stream_executor/stream_executor_pimpl.h"
 #include "xla/util.h"
 #include "xla/xla.pb.h"
 #include "xla/xla_data.pb.h"
diff --git a/third_party/xla/xla/service/gpu/gpu_compiler.h b/third_party/xla/xla/service/gpu/gpu_compiler.h
index ad288243d9ee61..f806e3f492093a 100644
--- a/third_party/xla/xla/service/gpu/gpu_compiler.h
+++ b/third_party/xla/xla/service/gpu/gpu_compiler.h
@@ -39,7 +39,6 @@ limitations under the License.
 #include "xla/statusor.h"
 #include "xla/stream_executor/device_description.pb.h"
 #include "xla/stream_executor/stream_executor.h"
-#include "xla/stream_executor/stream_executor_pimpl.h"
 #include "xla/util.h"
 
 namespace xla {
diff --git a/third_party/xla/xla/service/gpu/gpu_executable.cc b/third_party/xla/xla/service/gpu/gpu_executable.cc
index bb931b0f728d67..5796c662432a34 100644
--- a/third_party/xla/xla/service/gpu/gpu_executable.cc
+++ b/third_party/xla/xla/service/gpu/gpu_executable.cc
@@ -60,7 +60,7 @@ limitations under the License.
 #include "xla/stream_executor/platform.h"
 #include "xla/stream_executor/rocm/rocm_platform_id.h"
 #include "xla/stream_executor/stream.h"
-#include "xla/stream_executor/stream_executor_pimpl.h"
+#include "xla/stream_executor/stream_executor.h"
 #include "xla/util.h"
 #include "tsl/platform/errors.h"
 #include "tsl/profiler/lib/scoped_annotation.h"
diff --git a/third_party/xla/xla/service/gpu/runtime/graph_launch.h b/third_party/xla/xla/service/gpu/runtime/graph_launch.h
index 5f931fb53f4278..2c14a0238595df 100644
--- a/third_party/xla/xla/service/gpu/runtime/graph_launch.h
+++ b/third_party/xla/xla/service/gpu/runtime/graph_launch.h
@@ -27,7 +27,6 @@ limitations under the License.
 #include "xla/runtime/executable.h"
 #include "xla/service/service_executable_run_options.h"
 #include "xla/stream_executor/stream_executor.h"
-#include "xla/stream_executor/stream_executor_pimpl.h"
 
 #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 #include "xla/stream_executor/gpu/gpu_graph.h"
diff --git a/third_party/xla/xla/service/gpu/tests/gpu_fused_mha_test.cc b/third_party/xla/xla/service/gpu/tests/gpu_fused_mha_test.cc
index 35f05a6d258b46..10b03e874ad9a7 100644
--- a/third_party/xla/xla/service/gpu/tests/gpu_fused_mha_test.cc
+++ b/third_party/xla/xla/service/gpu/tests/gpu_fused_mha_test.cc
@@ -39,7 +39,7 @@ limitations under the License.
 #include "xla/statusor.h"
 #include "xla/stream_executor/device_description.h"
 #include "xla/stream_executor/dnn.h"
-#include "xla/stream_executor/stream_executor_pimpl.h"
+#include "xla/stream_executor/stream_executor.h"
 #include "xla/test_helpers.h"
 #include "xla/tests/client_library_test_base.h"
 #include "xla/tests/hlo_test_base.h"
diff --git a/third_party/xla/xla/stream_executor/cuda/cuda_dnn.cc b/third_party/xla/xla/stream_executor/cuda/cuda_dnn.cc
index 8e371d60fb064b..6500658e26f735 100644
--- a/third_party/xla/xla/stream_executor/cuda/cuda_dnn.cc
+++ b/third_party/xla/xla/stream_executor/cuda/cuda_dnn.cc
@@ -49,8 +49,8 @@ limitations under the License.
 #include "xla/stream_executor/plugin_registry.h"
 #include "xla/stream_executor/scratch_allocator.h"
 #include "xla/stream_executor/stream.h"
+#include "xla/stream_executor/stream_executor.h"
 #include "xla/stream_executor/stream_executor_internal.h"
-#include "xla/stream_executor/stream_executor_pimpl.h"
 #include "tsl/cuda/cudnn_version.h"
 #include "tsl/platform/errors.h"
 #include "tsl/platform/logging.h"
diff --git a/third_party/xla/xla/stream_executor/cuda/cuda_executor.cc b/third_party/xla/xla/stream_executor/cuda/cuda_executor.cc
index 970962372d2e4e..cb351c4f3366cd 100644
--- a/third_party/xla/xla/stream_executor/cuda/cuda_executor.cc
+++ b/third_party/xla/xla/stream_executor/cuda/cuda_executor.cc
@@ -50,8 +50,8 @@ limitations under the License.
 #include "xla/stream_executor/platform/initialize.h"
 #include "xla/stream_executor/plugin_registry.h"
 #include "xla/stream_executor/stream.h"
+#include "xla/stream_executor/stream_executor.h"
 #include "xla/stream_executor/stream_executor_internal.h"
-#include "xla/stream_executor/stream_executor_pimpl.h"
 #include "tsl/platform/env.h"
 #include "tsl/platform/errors.h"
 #include "tsl/platform/statusor.h"
diff --git a/third_party/xla/xla/stream_executor/cuda/cuda_platform.h b/third_party/xla/xla/stream_executor/cuda/cuda_platform.h
index c343a5bd8bf696..fdf21fa9fa227e 100644
--- a/third_party/xla/xla/stream_executor/cuda/cuda_platform.h
+++ b/third_party/xla/xla/stream_executor/cuda/cuda_platform.h
@@ -24,8 +24,8 @@ limitations under the License.
 #include "xla/stream_executor/multi_platform_manager.h"
 #include "xla/stream_executor/platform.h"
 #include "xla/stream_executor/platform/port.h"
+#include "xla/stream_executor/stream_executor.h"
 #include "xla/stream_executor/stream_executor_internal.h"
-#include "xla/stream_executor/stream_executor_pimpl.h"
 #include "xla/stream_executor/trace_listener.h"
 #include "tsl/platform/statusor.h"
 
diff --git a/third_party/xla/xla/stream_executor/event.cc b/third_party/xla/xla/stream_executor/event.cc
index cd76f2240f279f..7dcb28255d85e5 100644
--- a/third_party/xla/xla/stream_executor/event.cc
+++ b/third_party/xla/xla/stream_executor/event.cc
@@ -15,8 +15,8 @@ limitations under the License.
 
 #include "xla/stream_executor/event.h"
 
+#include "xla/stream_executor/stream_executor.h"
 #include "xla/stream_executor/stream_executor_internal.h"
-#include "xla/stream_executor/stream_executor_pimpl.h"
 
 namespace stream_executor {
 
diff --git a/third_party/xla/xla/stream_executor/gpu/asm_compiler.h b/third_party/xla/xla/stream_executor/gpu/asm_compiler.h
index 2e0d397b5ed3c5..52776d1e3040e9 100644
--- a/third_party/xla/xla/stream_executor/gpu/asm_compiler.h
+++ b/third_party/xla/xla/stream_executor/gpu/asm_compiler.h
@@ -29,7 +29,7 @@ limitations under the License.
 #include "xla/stream_executor/gpu/gpu_asm_opts.h"
 #include "xla/stream_executor/kernel.h"
 #include "xla/stream_executor/platform/port.h"
-#include "xla/stream_executor/stream_executor_pimpl.h"
+#include "xla/stream_executor/stream_executor.h"
 #include "tsl/platform/statusor.h"
 #if GOOGLE_CUDA
 #include "xla/stream_executor/cuda/cuda_driver.h"
diff --git a/third_party/xla/xla/stream_executor/gpu/gpu_executor.h b/third_party/xla/xla/stream_executor/gpu/gpu_executor.h
index f5ed9093451926..b5279bf657e432 100644
--- a/third_party/xla/xla/stream_executor/gpu/gpu_executor.h
+++ b/third_party/xla/xla/stream_executor/gpu/gpu_executor.h
@@ -38,8 +38,8 @@ limitations under the License.
 #include "xla/stream_executor/gpu/gpu_kernel.h"
 #include "xla/stream_executor/platform.h"
 #include "xla/stream_executor/platform/port.h"
+#include "xla/stream_executor/stream_executor.h"
 #include "xla/stream_executor/stream_executor_internal.h"
-#include "xla/stream_executor/stream_executor_pimpl.h"
 #include "tsl/platform/fingerprint.h"
 #include "tsl/platform/status.h"
 #include "tsl/platform/statusor.h"
diff --git a/third_party/xla/xla/stream_executor/gpu/redzone_allocator.cc b/third_party/xla/xla/stream_executor/gpu/redzone_allocator.cc
index 69025dff3dd600..c6415812a52ef2 100644
--- a/third_party/xla/xla/stream_executor/gpu/redzone_allocator.cc
+++ b/third_party/xla/xla/stream_executor/gpu/redzone_allocator.cc
@@ -30,7 +30,7 @@ limitations under the License.
 #include "xla/stream_executor/kernel.h"
 #include "xla/stream_executor/kernel_spec.h"
 #include "xla/stream_executor/stream.h"
-#include "xla/stream_executor/stream_executor_pimpl.h"
+#include "xla/stream_executor/stream_executor.h"
 #include "tsl/framework/allocator.h"
 #include "tsl/platform/errors.h"
 #include "tsl/platform/status.h"
diff --git a/third_party/xla/xla/stream_executor/host/host_platform.h b/third_party/xla/xla/stream_executor/host/host_platform.h
index 6cb4c97397c501..e08af9feab0c23 100644
--- a/third_party/xla/xla/stream_executor/host/host_platform.h
+++ b/third_party/xla/xla/stream_executor/host/host_platform.h
@@ -27,7 +27,7 @@ limitations under the License.
 #include "xla/stream_executor/multi_platform_manager.h"
 #include "xla/stream_executor/platform.h"
 #include "xla/stream_executor/platform/port.h"
-#include "xla/stream_executor/stream_executor_pimpl.h"
+#include "xla/stream_executor/stream_executor.h"
 #include "xla/stream_executor/trace_listener.h"
 
 namespace stream_executor {
diff --git a/third_party/xla/xla/stream_executor/rocm/rocm_dnn.cc b/third_party/xla/xla/stream_executor/rocm/rocm_dnn.cc
index b987cdc6b55988..d533b781f71f8d 100644
--- a/third_party/xla/xla/stream_executor/rocm/rocm_dnn.cc
+++ b/third_party/xla/xla/stream_executor/rocm/rocm_dnn.cc
@@ -41,7 +41,7 @@ limitations under the License.
 #include "xla/stream_executor/rocm/rocm_platform_id.h"
 #include "xla/stream_executor/scratch_allocator.h"
 #include "xla/stream_executor/stream.h"
-#include "xla/stream_executor/stream_executor_pimpl.h"
+#include "xla/stream_executor/stream_executor.h"
 #include "tsl/platform/env.h"
 #include "tsl/platform/errors.h"
 #include "tsl/platform/hash.h"
diff --git a/third_party/xla/xla/stream_executor/rocm/rocm_gpu_executor.cc b/third_party/xla/xla/stream_executor/rocm/rocm_gpu_executor.cc
index 861637fff5ea7a..389b4ba6c390e0 100644
--- a/third_party/xla/xla/stream_executor/rocm/rocm_gpu_executor.cc
+++ b/third_party/xla/xla/stream_executor/rocm/rocm_gpu_executor.cc
@@ -42,8 +42,8 @@ limitations under the License.
 #include "xla/stream_executor/rocm/rocm_diagnostics.h"
 #include "xla/stream_executor/rocm/rocm_platform_id.h"
 #include "xla/stream_executor/stream.h"
+#include "xla/stream_executor/stream_executor.h"
 #include "xla/stream_executor/stream_executor_internal.h"
-#include "xla/stream_executor/stream_executor_pimpl.h"
 #include "tsl/platform/env.h"
 #include "tsl/platform/errors.h"
 #include "tsl/platform/logging.h"
diff --git a/third_party/xla/xla/stream_executor/rocm/rocm_platform.h b/third_party/xla/xla/stream_executor/rocm/rocm_platform.h
index ac746c466c0844..28fd9e7914acae 100644
--- a/third_party/xla/xla/stream_executor/rocm/rocm_platform.h
+++ b/third_party/xla/xla/stream_executor/rocm/rocm_platform.h
@@ -24,8 +24,8 @@ limitations under the License.
 #include "xla/stream_executor/multi_platform_manager.h"
 #include "xla/stream_executor/platform.h"
 #include "xla/stream_executor/platform/port.h"
+#include "xla/stream_executor/stream_executor.h"
 #include "xla/stream_executor/stream_executor_internal.h"
-#include "xla/stream_executor/stream_executor_pimpl.h"
 #include "xla/stream_executor/trace_listener.h"
 
 namespace stream_executor {
diff --git a/third_party/xla/xla/stream_executor/stream.cc b/third_party/xla/xla/stream_executor/stream.cc
index d8cfa7bb793cd4..04de3eaeb77643 100644
--- a/third_party/xla/xla/stream_executor/stream.cc
+++ b/third_party/xla/xla/stream_executor/stream.cc
@@ -29,8 +29,8 @@ limitations under the License.
 #include "xla/stream_executor/numeric_options.h"
 #include "xla/stream_executor/platform.h"
 #include "xla/stream_executor/platform/port.h"
+#include "xla/stream_executor/stream_executor.h"
 #include "xla/stream_executor/stream_executor_internal.h"
-#include "xla/stream_executor/stream_executor_pimpl.h"
 #include "tsl/platform/logging.h"
 #include "tsl/platform/stacktrace.h"
 
diff --git a/third_party/xla/xla/stream_executor/temporary_memory_manager.cc b/third_party/xla/xla/stream_executor/temporary_memory_manager.cc
index 690f64225611a9..f5b2955a3d2652 100644
--- a/third_party/xla/xla/stream_executor/temporary_memory_manager.cc
+++ b/third_party/xla/xla/stream_executor/temporary_memory_manager.cc
@@ -20,7 +20,7 @@ limitations under the License.
 #include "absl/strings/str_cat.h"
 #include "absl/strings/str_format.h"
 #include "xla/stream_executor/stream.h"
-#include "xla/stream_executor/stream_executor_pimpl.h"
+#include "xla/stream_executor/stream_executor.h"
 #include "tsl/platform/logging.h"
 
 namespace stream_executor {
diff --git a/third_party/xla/xla/stream_executor/tpu/tpu_platform.cc b/third_party/xla/xla/stream_executor/tpu/tpu_platform.cc
index f79fb5462744ba..91274499259ec4 100644
--- a/third_party/xla/xla/stream_executor/tpu/tpu_platform.cc
+++ b/third_party/xla/xla/stream_executor/tpu/tpu_platform.cc
@@ -26,8 +26,8 @@ limitations under the License.
 #include "absl/synchronization/mutex.h"
 #include "xla/stream_executor/multi_platform_manager.h"
 #include "xla/stream_executor/platform.h"
+#include "xla/stream_executor/stream_executor.h"
 #include "xla/stream_executor/stream_executor_internal.h"
-#include "xla/stream_executor/stream_executor_pimpl.h"
 #include "xla/stream_executor/tpu/c_api_decl.h"
 #include "xla/stream_executor/tpu/status_helper.h"
 #include "xla/stream_executor/tpu/tpu_api.h"

From f2957eb767ee8ffdd142b7a7ea8a271bc46e98a5 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 27 Sep 2023 23:39:31 -0700
Subject: [PATCH 361/567] Reenable file upload in bazel remote config

This replaces the "transfer script via CLI argument" hack with the file upload support for remote configurations that landed in Bazel 3.1.0.

PiperOrigin-RevId: 569079584
---
 tensorflow/opensource_only.files              |  1 -
 third_party/gpus/compress_find_cuda_config.py | 36 ---------------
 third_party/gpus/compress_find_rocm_config.py | 36 ---------------
 third_party/gpus/cuda_configure.bzl           | 44 ++++++------------
 third_party/gpus/find_cuda_config.py          |  6 ---
 .../gpus/find_cuda_config.py.gz.base64        |  1 -
 .../gpus/find_rocm_config.py.gz.base64        |  1 -
 third_party/gpus/rocm_configure.bzl           | 45 ++++++-------------
 third_party/nccl/nccl_configure.bzl           | 18 ++++----
 third_party/tensorrt/tensorrt_configure.bzl   | 11 +++--
 third_party/xla/opensource_only.files         |  1 -
 .../gpus/compress_find_cuda_config.py         | 36 ---------------
 .../gpus/compress_find_rocm_config.py         | 36 ---------------
 .../xla/third_party/gpus/cuda_configure.bzl   | 44 ++++++------------
 .../xla/third_party/gpus/find_cuda_config.py  |  6 ---
 .../gpus/find_cuda_config.py.gz.base64        |  1 -
 .../gpus/find_rocm_config.py.gz.base64        |  1 -
 .../xla/third_party/gpus/rocm_configure.bzl   | 45 ++++++-------------
 .../xla/third_party/nccl/nccl_configure.bzl   | 18 ++++----
 .../tensorrt/tensorrt_configure.bzl           | 11 +++--
 .../xla/third_party/tsl/opensource_only.files |  1 -
 .../gpus/compress_find_cuda_config.py         | 36 ---------------
 .../gpus/compress_find_rocm_config.py         | 36 ---------------
 .../tsl/third_party/gpus/cuda_configure.bzl   | 44 ++++++------------
 .../tsl/third_party/gpus/find_cuda_config.py  |  6 ---
 .../gpus/find_cuda_config.py.gz.base64        |  1 -
 .../gpus/find_rocm_config.py.gz.base64        |  1 -
 .../tsl/third_party/gpus/rocm_configure.bzl   | 45 ++++++-------------
 .../tsl/third_party/nccl/nccl_configure.bzl   | 18 ++++----
 .../tensorrt/tensorrt_configure.bzl           | 11 +++--
 30 files changed, 129 insertions(+), 468 deletions(-)
 delete mode 100644 third_party/gpus/compress_find_cuda_config.py
 delete mode 100644 third_party/gpus/compress_find_rocm_config.py
 delete mode 100644 third_party/gpus/find_cuda_config.py.gz.base64
 delete mode 100644 third_party/gpus/find_rocm_config.py.gz.base64
 delete mode 100644 third_party/xla/third_party/gpus/compress_find_cuda_config.py
 delete mode 100644 third_party/xla/third_party/gpus/compress_find_rocm_config.py
 delete mode 100644 third_party/xla/third_party/gpus/find_cuda_config.py.gz.base64
 delete mode 100644 third_party/xla/third_party/gpus/find_rocm_config.py.gz.base64
 delete mode 100644 third_party/xla/third_party/tsl/third_party/gpus/compress_find_cuda_config.py
 delete mode 100644 third_party/xla/third_party/tsl/third_party/gpus/compress_find_rocm_config.py
 delete mode 100644 third_party/xla/third_party/tsl/third_party/gpus/find_cuda_config.py.gz.base64
 delete mode 100644 third_party/xla/third_party/tsl/third_party/gpus/find_rocm_config.py.gz.base64

diff --git a/tensorflow/opensource_only.files b/tensorflow/opensource_only.files
index 907e661369c704..891ac1be7fcef3 100644
--- a/tensorflow/opensource_only.files
+++ b/tensorflow/opensource_only.files
@@ -250,7 +250,6 @@ tf_staging/third_party/gpus/cuda/build_defs.bzl.tpl:
 tf_staging/third_party/gpus/cuda/cuda_config.h.tpl:
 tf_staging/third_party/gpus/cuda/cuda_config.py.tpl:
 tf_staging/third_party/gpus/cuda_configure.bzl:
-tf_staging/third_party/gpus/find_cuda_config.py.gz.base64:
 tf_staging/third_party/gpus/find_cuda_config:.py
 tf_staging/third_party/gpus/rocm/BUILD.tpl:
 tf_staging/third_party/gpus/rocm/BUILD:
diff --git a/third_party/gpus/compress_find_cuda_config.py b/third_party/gpus/compress_find_cuda_config.py
deleted file mode 100644
index 4d1e417c96d576..00000000000000
--- a/third_party/gpus/compress_find_cuda_config.py
+++ /dev/null
@@ -1,36 +0,0 @@
-# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Compresses the contents of 'find_cuda_config.py'.
-
-The compressed file is what is actually being used. It works around remote
-config not being able to upload files yet.
-"""
-import base64
-import zlib
-
-
-def main():
-  with open('find_cuda_config.py', 'rb') as f:
-    data = f.read()
-
-  compressed = zlib.compress(data)
-  b64encoded = base64.b64encode(compressed)
-
-  with open('find_cuda_config.py.gz.base64', 'wb') as f:
-    f.write(b64encoded)
-
-
-if __name__ == '__main__':
-  main()
diff --git a/third_party/gpus/compress_find_rocm_config.py b/third_party/gpus/compress_find_rocm_config.py
deleted file mode 100644
index 90615d4b1ea24f..00000000000000
--- a/third_party/gpus/compress_find_rocm_config.py
+++ /dev/null
@@ -1,36 +0,0 @@
-# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Compresses the contents of 'find_rocm_config.py'.
-
-The compressed file is what is actually being used. It works around remote
-config not being able to upload files yet.
-"""
-import base64
-import zlib
-
-
-def main():
-  with open('find_rocm_config.py', 'rb') as f:
-    data = f.read()
-
-  compressed = zlib.compress(data)
-  b64encoded = base64.b64encode(compressed)
-
-  with open('find_rocm_config.py.gz.base64', 'wb') as f:
-    f.write(b64encoded)
-
-
-if __name__ == '__main__':
-  main()
diff --git a/third_party/gpus/cuda_configure.bzl b/third_party/gpus/cuda_configure.bzl
index b14e6835063704..4b09e4c7599f69 100644
--- a/third_party/gpus/cuda_configure.bzl
+++ b/third_party/gpus/cuda_configure.bzl
@@ -641,42 +641,19 @@ def _cudart_static_linkopt(cpu_value):
     """Returns additional platform-specific linkopts for cudart."""
     return "" if cpu_value == "Darwin" else "\"-lrt\","
 
-def _exec_find_cuda_config(repository_ctx, script_path, cuda_libraries):
-    python_bin = get_python_bin(repository_ctx)
-
-    # If used with remote execution then repository_ctx.execute() can't
-    # access files from the source tree. A trick is to read the contents
-    # of the file in Starlark and embed them as part of the command. In
-    # this case the trick is not sufficient as the find_cuda_config.py
-    # script has more than 8192 characters. 8192 is the command length
-    # limit of cmd.exe on Windows. Thus we additionally need to compress
-    # the contents locally and decompress them as part of the execute().
-    compressed_contents = repository_ctx.read(script_path)
-    decompress_and_execute_cmd = (
-        "from zlib import decompress;" +
-        "from base64 import b64decode;" +
-        "from os import system;" +
-        "script = decompress(b64decode('%s'));" % compressed_contents +
-        "f = open('script.py', 'wb');" +
-        "f.write(script);" +
-        "f.close();" +
-        "system('\"%s\" script.py %s');" % (python_bin, " ".join(cuda_libraries))
-    )
-
-    return execute(repository_ctx, [python_bin, "-c", decompress_and_execute_cmd])
-
 # TODO(csigg): Only call once instead of from here, tensorrt_configure.bzl,
 # and nccl_configure.bzl.
-def find_cuda_config(repository_ctx, script_path, cuda_libraries):
+def find_cuda_config(repository_ctx, cuda_libraries):
     """Returns CUDA config dictionary from running find_cuda_config.py"""
-    exec_result = _exec_find_cuda_config(repository_ctx, script_path, cuda_libraries)
+    python_bin = get_python_bin(repository_ctx)
+    exec_result = execute(repository_ctx, [python_bin, repository_ctx.attr._find_cuda_config] + cuda_libraries)
     if exec_result.return_code:
         auto_configure_fail("Failed to run find_cuda_config.py: %s" % err_out(exec_result))
 
     # Parse the dict from stdout.
     return dict([tuple(x.split(": ")) for x in exec_result.stdout.splitlines()])
 
-def _get_cuda_config(repository_ctx, find_cuda_config_script):
+def _get_cuda_config(repository_ctx):
     """Detects and returns information about the CUDA installation on the system.
 
       Args:
@@ -692,7 +669,7 @@ def _get_cuda_config(repository_ctx, find_cuda_config_script):
           compute_capabilities: A list of the system's CUDA compute capabilities.
           cpu_value: The name of the host operating system.
       """
-    config = find_cuda_config(repository_ctx, find_cuda_config_script, ["cuda", "cudnn"])
+    config = find_cuda_config(repository_ctx, ["cuda", "cudnn"])
     cpu_value = get_cpu_value(repository_ctx)
     toolkit_path = config["cuda_toolkit_path"]
 
@@ -1008,9 +985,8 @@ def _create_local_cuda_repository(repository_ctx):
         "cuda:cuda_config.py",
     ]}
     tpl_paths["cuda:BUILD"] = _tpl_path(repository_ctx, "cuda:BUILD.windows" if is_windows(repository_ctx) else "cuda:BUILD")
-    find_cuda_config_script = repository_ctx.path(Label("@org_tensorflow//third_party/gpus:find_cuda_config.py.gz.base64"))
 
-    cuda_config = _get_cuda_config(repository_ctx, find_cuda_config_script)
+    cuda_config = _get_cuda_config(repository_ctx)
 
     cuda_include_path = cuda_config.config["cuda_include_dir"]
     cublas_include_path = cuda_config.config["cublas_include_dir"]
@@ -1484,12 +1460,20 @@ remote_cuda_configure = repository_rule(
     remotable = True,
     attrs = {
         "environ": attr.string_dict(),
+        "_find_cuda_config": attr.label(
+            default = Label("@org_tensorflow//third_party/gpus:find_cuda_config.py"),
+        ),
     },
 )
 
 cuda_configure = repository_rule(
     implementation = _cuda_autoconf_impl,
     environ = _ENVIRONS + [_TF_CUDA_CONFIG_REPO],
+    attrs = {
+        "_find_cuda_config": attr.label(
+            default = Label("@org_tensorflow//third_party/gpus:find_cuda_config.py"),
+        ),
+    },
 )
 """Detects and configures the local CUDA toolchain.
 
diff --git a/third_party/gpus/find_cuda_config.py b/third_party/gpus/find_cuda_config.py
index b7ae4a8d71f8ec..4a7781cb2b5a2c 100644
--- a/third_party/gpus/find_cuda_config.py
+++ b/third_party/gpus/find_cuda_config.py
@@ -53,12 +53,6 @@
 tf_<library>_library_dir: ...
 """
 
-# You can use the following command to regenerate the base64 version of this
-# script:
-# cat third_party/tensorflow/third_party/gpus/find_cuda_config.oss.py |
-#   pigz -z | base64 -w0 >
-#   third_party/tensorflow/third_party/gpus/find_cuda_config.py.gz.base64.oss
-
 import io
 import os
 import glob
diff --git a/third_party/gpus/find_cuda_config.py.gz.base64 b/third_party/gpus/find_cuda_config.py.gz.base64
deleted file mode 100644
index 37525142385f6d..00000000000000
--- a/third_party/gpus/find_cuda_config.py.gz.base64
+++ /dev/null
@@ -1 +0,0 @@
-eF7dPGtz4zaS3/UrcJxzmfLIlJ1NpfZ061w59szFuz57ytZMbsvWamESkpmhSB5Jydbs7n+/7gZAgiApv+JkKq7KRCTRje5GP/F6w46SdJ2F89uCfbO3/x9sfCvYWMR5kr2Pkjt2uCxukyz32GEUsQtslrMLkYtsJQKv96b3hp2GPjQXAVvGgchYAfCHKffhf+rLgH0SWR4mMfvG22MuNnDUJ6f/n4BhnSzZgq9ZnBRsmQtAEeZsFkaCiXtfpAULY+YnizQKeewLdhcWt9SNQgJksL8qFMlNwaE1h/YpPM3MdowXRDD+3RZFOhoO7+7uPE7Eekk2H0ayYT48PTl6d3b5bhcIJpCPcSTynGXi/5ZhBqzerBlPgR6f3wCVEb9jScb4PBPwrUiQ3rssLMJ4PmB5MivueCYASxDmRRbeLIuasDR1wLPZAMTFY+YcXrKTS4f9cHh5cjkAHD+djH88/zhmPx1eXByejU/eXbLzC3Z0fnZ8Mj45P4On9+zw7K/sLydnxwMmQFTQjbhPM6QfiAxRjDR07FKIGgGzRBKUp8IPZ6EPfMXzJZ8LNk9WIouBHZaKbBHmOJg5kBcAlihchAUv6E2DKezm4Bf96zmO8yELY1DDo4/Hh9D9TcazNRLDbgXH/gMYIr9IslAQjWwltQ9UKgECUbDE5TovxMLr9VDhcz8LQc9ywTPQhZxE0YUeFTOvYxnAiKPUirwHLxeoAoEoUFQxiTjMNBGEKJX0I7yfxLNwvsxIgAiXF0GyLDyiKuXFbS71ibATMEKVeliyBgqmxw1V8DZLlvNbJuJVmCXxQsRFb8WzELUVTPlkBqbGVjwKA4uAUAlpIJmTUtHkEnEiy2jgM1EsM1ICBq9AXH4SCCXNCNQYbU8KD4cBYGchEA/4Kyo5kj1fInVA1OUyTZMMNb8CQ7OhYXDD2I+WAbzylz+cHl72B/Dj+OxswM6Ojk4HJBjptC7G9SEt+GdEVNJ0w0HTTQ2p6AGjRq7H76fY5/TD4fjHy54hQqZFiJSDP1rw3VykHEQHwPMouaFOPGb0HiXJZ6lNUnnyHlKqlUpqErmrW54FuyjCAJSQCM2XNyaZsyxZIHlAPbFAuuH1YChr9KKE0Y2WXIFs2PllZdOBmPFlVGA7cLXBqNdjYKzx8n4EXnG4zLNhlPg8GvrLgA/ohZSu1EWiYjsKpNaw3XTbA/ifYGiTu3zESjpYm9gG4H96rPo7Gl1ff8iSecYX7D2K4vr67NPJ8ckh++8PHyEsLdIlulA2TpLoc1hcXyP66+udXu89CPSG+5/BrwY0FEBdeBNGYbFGh7sQpg5FeUJRhUdgkTE0XEkR9hriTaSzbCEdRolCybpuaB6IjtSqp4Z0t5RzKxYlbTn6rAyL0LbSLXYYBCEaI48aygrQuy/8AxRkUfSnVefTu4tLCB56XPDV+Pz89C8nYxpNgJFmV8LgQw2qHQYs1Ojn7KzekXx1cnY5Pjw91UBozyVx+GARR69MGOkCpj8eX2gU2hEQijHE8fOLi7GBpnxV67r3SfvSdgXwwdPdCJ1SgEmDIdxvY0jdvvfW2+ji0C0L0AfeK9VAOWiIxIu0WGPrZWz46YRxnzIcHq91W+W/IAqA/mN/XAUu7TbAdA0qRr1iNv2T+vb9VCEZMSDK+1L/Jn3QFBRqxDzPq39UP6qvEGh7KrNC3mVihn1GkBeiXZILjAPJ+VzEAj0htUG9/e7bMuYRtWEOyKRbHMEvnxf4Mgum4EGL9bCgQZsB6qH5ep4u8yGGjik6pKn0O16S5x6kd/+kZC4N51/Y7hf2T93t7t0e+54+PbuDdO3Nv3gSH/bW60HeBNGJhYn+leT6F7p+/TuNeIGjop8hgKlf4M/TLPEhDSvfrHMUcLqOIL6OMPlDTTuY78rvu+DHd3mxWyRpr8jWI9Bs8lL5LbjFiCkkd7ehf9tTefIJvXuH8bhsjjklAuRenvK7WMMRy+Je+MtChzSJqiJIxJ309Hp+xCGhPCJ
xUYfuO6IBxruPfaccpdaDcMOmYQ7aBSHGpS8ycygl5ckUyu1Dooh1AbRzDMA7GVseAaqikAm84H7yGNBjnkE/JeSCF5gFaltyuV8seTQtbVmnX/oNdQDmcnQr/M8oR0GBhAKRtoGFECrlU9Ay7empgIjVlYjkS23ddjcsVuUFuCHM59E/hZSGKiR1Okvc+NfAZTXGFjnkBQbIU0KK/ttn9t++V383zpbCBPC+sQGs5/cQwB+A8P6wEaL5Z/dBROmRyOb5qIS25Tai7E4LrZE7gpPMvBK4LuI6KNYHPiaOOrGTYRky/YJHEWXjGtGFTLVH7CelWNha4q7US6psvTpIYuFJzYR/IYGxhhziyBk0kcwq+5CygzdQHkIpmgKL3DfSz7LiQMQWvgNbAbGYTd1+r0X9Dhqvaq2JlgY2cNg5Fv9uwwC15fJiGgmeF08zXSXLA3YFfs9d9SlhX1FibpEAtXPhOp7TnxhMtQE2ubNADQ7Z95U0NCNzUehwrVnBDHzAYr4Q2t8oxUBnoEILdA/gUHVSWXFklqyeVAOkMMKyFL6HiZekQmN2MgfSlBiKEAjuB86ymO3+0el7GaBAAOVHmVQ1GkCPfrqZc53vvIH/ZNdsK7/O37rXwdu+w7aI4AF12Sdw0EMC0zamREHvvDnUram7byiBU3plH4Zf5CGPp8BssPQLdxZmeQEJv4CoHdhCASuiogXTlJswVjMU4FoJimoaCagEo/q7SiC1ADDv5ySM3Rlgl6M6Q4FJUHwk+Ur4iTlmUaBSiCkVTG4bVY26o15QleaKdZwVOysyUYs0GPUFI0IB3HX0a6eP+aYzzIH9YfkS4FRqeWDkJJ6PsWsqv7hXNcygGrupM+lTVC+wiJKjj3UXKBYo9Q47+J653k7fkSOHkQTRi4Ls2dQ52YM0h5paqRTH1DDVm1IzbO0FAotk1+G5H4ZOX6qUyn4+xiF+PKYmZRJkIqwiQ1MLkWaPB4GrFQCGCXXXrWtm39DNnOYrXKz6XYmg3zeVQdXaMrOU+kA/Td+jxtl8L2ky3wDpzg6OnIhUe3AkNCv6EFjt8S2AARrZayO5qnSrDGAgClUMecCPW4upTlnsOwPrw/Pq+hX4jWvwGDVkNXnRF8N9XjnWhMXuVo4uxwQC5bUaOeqVSbbVJo6dCUir3aL1CCv33G7nzfmcTERy7qGc1cPJIWNuSE4HWd5IkeiUxDpqJqzxQvFmv93ZJfexO4+X1UdxX2Q8Hx59/DA+GW5EKNtU3yoxDjuRlA5Rl5S/gIhqU2YPyghaf/dt7bH20C4U/HBvgtUY3GmVgd1Rh3SqZqVswIinVNZPafrUxVpTCmpQykE/I9PKF5IQq7boGBwZqq6c61ir8/ZWvo2GQHEBxUc/wtJjVQj6E+nPzA6firQOPTEcpFkeahkdJcsoIB9GM8I48bElJ0HhlzHpuR6BP4iTGf2vcguuIYympAzGSjulShehniRjk8GW1KMblfKllDPgEhbA4wSBh/+4teRCxlaz476VFOE3lCYPc/F8lakJQhlSDVjbqUzVuqpbbbcyCZKrByQhsFNddoi8msBfV5lMS6wxCAR9MymgMIUmq+OdVcnbsM5WvuMFa4QA7cQUW+suvXpbQw5ufWNlWP/bkMVfjfYnlAsISClaydpIB8SgPHEeoOah7lFSO4Yn7FB12w9v0A7VkXiaweSDbi5sHgYMg+oTVItLMypuedGocqW6aemU6lasU2H6CUjh0d+AJV5hrjZgxTKNxKQxauj5rsznyS/oCLRmhHGtS23xL/MXLbW4IWaSRb9sGj53iqsa09I7DayOu72Vgt+gUjpCgG2o6qs2hKDsUgEwd9R86qDUOSNAai0TrNpYqeyt3eU9Q0lVDvdoNa2Z7Aazs9LMOuEPuLLNbNUkZEx+Wyrd0APoFOE6ZiiUolR6iDMj3dMZjrn+pOs5VePU6ppSYjhlZVYrzlbgbQXk+nWnwyHb39vbG5RUbNGzfN9HBohdQ7K
lWCu6OzXHQWDv1nlI+o8akWfgaMqy32vUfRY77A2uB+FUiWD8JlkJr5SCGRxKttuyBOI7K5yBVZQZ+hCvfL9NG4xpA+cIgJmcOtCbHpJIOgOBy53XwdtrD/4ZsE+u/k3/9J3Sn+rJhM75Cz1tsauIcSb9limHlrmsMrPsnm1oTh50T2LVtRZekIQoy4EMAR88cS+cRo5EWYVs4GgoNf9niLgcrg3+o6U6ugnj1pJFv58MKjIHVjFtj3JfMrVa1PWnmwxsi/VQIFahbxSdOZSDdg2LFVm8CoOQ71J1X8gJgzZwg42WHoAjp3yz4+3veTe+oxxBWoSmJ+hmoeGJHQIGT9AvET3FmKC90+JgS8PESUu5HguY7EmpUiP6Wv7G+m1rezVEfTm5/7+nh7prGaeUbMuIRsVZWtxzuYeqlJ5H4OPz43PXz8P5vD+ipT60xpsEQM3JTVzMAJl5miPVR1ll6pxAExsnGe7kuK2nPZYwMDuABLg/eBSwLRqCHpYI+jJhbFJ3tTdh/3bQ9mFf5Y0yzTErXOckhgiaQ4aJK3O0nGNKdYR17ipnNEfV4feb/dF4KSfyD63tpk06o7qJ1tqoaRnkHNrZSmFHwX4d2BBbF7Cp7xZ0NWKaQmMMTW9g9dMYsQorWthDHFnm3LfAH+LJNmKLKXNoNFu23iDEv6zk6ibi+WPSKyu2jvRcbWNdy5riBI+278jmj8nOzPzMnNV9YNnJaIlRmMIY2L3rVBuCpv9z+OfzC4cSu+rdyRm+68p1zKYfDsdHP1aRVud4upY2nSRjz0/gaDx4Gj4njXtxDteewKFbVbuslrlyygv+c1JtHU3iaC3XhBUDXemeOU+wN+nVJine4Cp0JuQ+wRJ1GM+giFlxyMlwM8iNgPFVK9KoWgP28zJXE3a0BwjXE80haF9wsAfpabHVGCNLPsYCNm1TRT3kaj+w/uZCvtvvkJY9giihp0RuREdpsIm3w1GbTcBj1OU1sNptdm4dbo0gN/u1hkez/VOeREDSq3qofW/vN/VQl+en6GAsH2W+fdBLGY1/JT9VjstiAWb9NTmrkrbnOKFX9Bq2xDqpfakTkAjJDdRxdziCeqNNrkC1fJYzULAvdAcZj4PftTO4ODw7tl1B9e5BR1A2/ZXcAI7HV2b9pCJfme0rMXWQ+FKDR3Rk7ibeDmM3m2wydWr3LEMnyBea+WxW/K6t/P37sW3k5Stp4+arX8mWQehfmSmjGnxlliyF1E7gS+0YsJEZG1g7rNhoscmIsdmzbBgBX2jCQRy/zrJNtQnssfZnWx8e7jEsjx4rq8NHsrjTd5/encpdsd1Gh/5G/caZOZod11PqzzPFp5mP65Co0XCZ/FkaCKjpE5G90JQ7138qoh5hxk9a9oljuepT9dA1LVm1kNNj1fOg3upZBoOAL610U54Bc7/nsHf54fDi8l2j0jXePlzpVo1/rUqXxuUri4xKWb6y4FiKqpPMF9e2hFDWtjXcXbVtrdHG2la2fF5tK2Ff5ABi349eJ2BWtvu8gEmnWEt7lU86XNJTzRA3meFzjRBl83QDfJVoRsP0GsEMEYNETfytOm02wEUp43FQa/MMTSa4F2mxPMuaPap6+w00+VN16rpSaPPlpvhTa1nT+Tfs74qIv+PUP2fqzDF0ntz8LHw8CpSwO7pUAxoWLCzk3iy6+0GtaSzzMJ4rdPDdXSzDIkxxg2G4EDm0ipI7tW5QSYROeNRig9qixKPIreSsDaLc8YGdfJKf5bUM6nKNBf8sGnvwmL5Sg06peK9m5merk3gmsk9l/vrbGzxgKFX6kVb/JKNfhcgxqKHdS6vt242641nZ8hleoIR9ridApZzi+bGpiFcu/Kc27KhTRwdXE3vnon1nCdkFnUBru/Kg2k+rcdP5rfJIUO3MUPX6Sree6BEbOMaRAEVd7cycmHNfcvgoNrjcpFI7fAtGTljaOcFRbt42MWi7TKL1cgg6oFIkgGVbHhTCoyJ//G763bfVQZJ
tdkcnG9JMrMJkmUdruV5Ku2Ka1wj1mLrChR3TSWBW3CZ4SQk0UJKirrYH8laSu1Be0IQ3nqBTs0a/Xzvnu2m8Wk5v/s0dXv1tyCY7/bfE2fXd24qt4X/9O9hN2/BuPM55Vd8KZxzaekhv+9UJmWzBo/CLtOMqVBmaUDaRO3EH8mowLaY84vmtvKpJ3Q2gVFp5DG1qmeBR1UWv68SCglIgacR94TrX13SczNRwbKB4aGytbapyEPp02UtGt5VtunNK6Ytkobrf5oBd8Wy+8iBugaeXh1XxBe2PXOce/r7aH00mvcYmUet4n2NdB4OcEWO180bW+Dm164cejCWPOhfZ6JP2c9Z24hufYbT0SIp7oC2X4ziRrl0dRf3Hv+S40jYeOjpZStA4N9nFY+N2G6d21Eg5QjpBukwD8K1uc1d11YE1saBKZ2tsJLqr+laviWpJS/6a2IoQbZB0vMG1j6SbmIxg2md/Yu7+3oDtlxnMG/ZDYwOI2npxx3N154TcdlGe1sINGMY+Qn13gUVrJQSTkwc00rxsqNRJW+BVztqy5cqkwt630TUaaiX1F5UyyHGvlHKji4ZwGmvWreKpdiU8TUD1PR91apqr2l1ionWo1xRSrYOGiKw1vlYB6dXap4nHXAM3qbBXALsEg5P7rykXE39DLPUVk1apqGWvjUJpWSg0OrYWUToNSU7bvKoh1btoGpI9QdZuSOWk5xMNqTalXKemOYXWJiYVmOK4IzLFcRWa7IzZaSa2LdHJXh5oD/3VDW0PqoSx8GQQaK8SlNzRvEsLczQX0s1bI0FvYc2aK2rhzLxG7gHGzPnBijZrwqjkStdwbZyV9V03d621RguHLbVxC5f2TXcPcGrPIdXpbSmViWs0z88DfccNYQ0LsSiTZJDKZ6AskPf0OFTU0jUktbfYhWNMmFCq83mCQrKy/pVZoasrqmRuveBhLHstbw4h4sR6gIfzlsI4797MwzXVJRF0yanrbOUjuUGfuRUmec5XXjJibPjHlEfN9mOiLS9H9fDiX+HmReYKPLO4fR1v98s2kKAWeDKo18Magwqf6ZSuIJtOkaHp1EGEkrfe/wMY3XD1
\ No newline at end of file
diff --git a/third_party/gpus/find_rocm_config.py.gz.base64 b/third_party/gpus/find_rocm_config.py.gz.base64
deleted file mode 100644
index fee67ec553d9da..00000000000000
--- a/third_party/gpus/find_rocm_config.py.gz.base64
+++ /dev/null
@@ -1 +0,0 @@
-eJztW21v2zgS/q5fQSgoam9dJSlugUMOOcCbZlHftUlhZ7tYtIVB27TNrSz6SCppUPS/7wxJyRIt2YqjvX6JgTa2NPNwOC8Ph7Z4RC7E+l7yxVKTVyevTsjNkpEblighf43FHemneimkikg/jskQxRQZMsXkLZtFwVFwRN7yKYizGUmTGZNEg35/Tafwx93pkQ9MKi4S8io6IR0UCN2tsPsvQLgXKVnRe5IITVLFAIIrMucxI+zrlK014QmZitU65jSZMnLH9dIM40DADPKHgxATTUGagvwaPs2LcoRqYzC+llqvz46P7+7uImqMjYRcHMdWUB2/HVxcXo0uX4LBRuW3JGZKEcn+l3IJU53cE7oGe6Z0AlbG9I4ISehCMrinBdp7J7nmyaJHlJjrOyoZoMy40pJPUl1yVmYdzLkoAO6iCQn7IzIYheSX/mgw6gHG74ObN9e/3ZDf+8Nh/+pmcDki10NycX31enAzuL6CT7+S/tUf5L+Dq9c9wsBVMAz7upZoPxjJ0Y0mdGTEWMmAubAGqTWb8jmfwrySRUoXjCzELZMJTIesmVxxhcFUYN4MUGK+4ppqc2VrUjjMeauvIAzD95InmIbXFysYfiKpvEdjyJJRHH8GIZpqITkzNpJbm32QUgIMRMeaWd4rzVZREGDCq6nkkGeKUQm5oIwr6uAxMVUZpQcRR69pFcDFFabAjGl0VWJczGVmhAFaW/tRfyqSOV+k0jgQ9ZSeiVRHxqo1xUQXGThmiIsNptlSinSxxCRhyS2XIlmxRJNbKrlJyg7Y/278vn/zphsFgzkUF9yL+cwbkju39Ox0rB8yA405TEoTasl0Kk3YCVwCB03FjJX9p+kXZueVxeC+YDEUDd7K7aq0OyrixUJ8scGwvrfxzGJiA2GqfUnl7CXaM4MYaqj7QKWTYh7MpViRCVXOqY4YNrbl9kYEfLUxEdwDrBTkgsZNUJbHYq2PpZiuQhRJkf4o2KIh7nOaxjifOGUBZmsQQM0JCeET2TuhsnfAC+4dZFIQBNOYQp1emBBdopc7l4YCIVTds4CA9QrFYBQyXjA9dsON0ZQxTq1jxGysimYWlYwwJJWmcVxQAltfZ1lrPZ2F3IVtRZySTR3UjHCCMJ6PSM5r7QNxPidh7uMQQyhU5LIBLakG3Mh8LGh/BvmjOnljomQ0NkNvCXU3rtq6V3QYLj5Ccc3GrozHSbqaMNlZ0T+F7BHwGP4Btemy6P/TE3iRn4gRIy/wM35CafhkxIvD2CTPxuigFT2S0BXLwjN08QA2X4PBQOkABOomYODFiyJHudBg9cTIQ3Cfi0isWYYcyhBWiATKBpj9PEz1/OU/w671/wptAx9KFpm3HRke2YHIM0VedD7NXnRD8sxY1zP4XaMHkTXyFoXYMgAcczFaAGWtO6ddd9N5CbimY+S6AbqOcqjSYgU8z0YOv3035WZXtU/J8wgmB8gd4yPywsGWXyExfQUugMgz4AXTWXz7DuvmpyTMIEw61ENwoEM7F8sIyIJgN1sw9C4UDY3DbhZLMHZmM95ybUXineFUUdjcKmeV6jgRHBvyTnHgxVzGst45+ZibGvJkGqczZgodWFDad5lGtIQwQ4mYAv45elWtVyN9mi1YQhq9z+b/oi1gypVImLmOuTZHF1dbnSdF4aJfrH8KnuXnPMsTSKrsNvsKLZLqbCF0z/JpecZtieaCEyCGL1nWYkyLohnedjoWUiQ0XsqWdjMg4tg+A9wA+Zrll5dY1Q7qmgogji/OK1mhqNBzNPrhcjiC5m/8rv+f62Fo3WZZ5gCMwVWOsXYs8FAMIOaLNw7DFXkFU+JcKy4j61QVRQV7ZyuPLTNQ/ObcHFYghGcHkDngfQ+8VcKOVir2JV/LNNF8xRqUfEHYs+DAugdA/Le/4FGwmdJT3e+s+zeD92RoY/g3lP8RbMJhmSms5dDg2sb7K246oc3MlkT0PBiTVV6P3C0F2G27T4sFSJ2CiCUJaEOwGXlBSne
w9LtODXoNyfit3SJmDYwlFdsavpzSeJpCLwjWMQkcoLjZnUPzL0zTbHHuKKy6UHe4XAoDNkkVNiwKu/A1hfnDvhK6f0gH3KeqhzLg1tweToBbTihz12mpiTO4GKitoi8SUG2RAw3VE0Blh1okoK0xSzS04tjeNaAgJ/i4vqPEQRbxuMQm7rVhIo9UchCnvAvriZkaMdPzd4Nr8F2ZlEzfnDNT3jUXwbvdh5YdDPT+8uqxleejHNZ8eCiPbT9qqqO6ASnVXJEBKlHCs0KwHt2OlMb2dx+TmKpmGxAj2d4eBOGOcXMkExpnF17ubU42mhapIcITH+zsVJzz2u1SHkgVEKlf3vZHLWxTyjAH71RKMC1sVioLqHa/UihMb8tSgdMuYZRH9xlDQiU1Ywwj2c7mxaEde6h+lZd4wih4ADWKJ0/0sIceZOE3kt30sJMPytnQuBSH/avXWSmWi7AM6GqnkKJe7VSkJLTaNbm6t88uD+UXynyum9UJCra3sAKa+9NoNUXxsvbTEnpgjYDvfugK6qWS0Xz4AuqjoObD108PxWg+bvmsqpLa1XNTex4BbKO0vnZuxva//WvGCE6wJUawaO7PfkZw4mXtJ0Y4hBGs734oI1gTPtiL7w7jgzLGYWxQwnj/SC6oqY9qLihVnffV29/NBaWx/e5ASzoFhzVqEJxsez2CBdy829khOOEt9Sc2eGB/YN32ozfZN8P+xeWwhW22D1TYaB9tsjZ/3kow+5v4kt4yQl01Z75w7XuRVk4e0yzUVExtv1CqRq9lqMRqvWsoWeA3Dgp/AWn4q6GTba99sICbd42aCKe0BfPUShzYSlj3/ehuwlrx6IaiDHNwT1GCaaGtqCmd2s6iVJZec1GJ1Xp/UbLAJw0R3zbqMDayLZKGAdx8PZ9fasYembb3Jf8+lNNdKPuUn74E3OKgjRcfTDxtko6x4PGkU4I5nHSKMG2QTnXp1ZNOsax90qnCap90ihb4+5rGrLORbW9f4yo+f9foG1CPJvYpP7Uq+/Y5BzJGm6wBsRpdv/3Qyj7HBzr4B0UPqIWfFB/EHX5xetuc/wN3+BY47th6Etp/kp2SGZ/iaQI80CHmthKNIQnDIyhuPjyZi31HDSpONQR5tnslVUVeewrg+Sg/LLI5HmKerMn3vwY8f7xmewwbK6bwfAhE6Pvm40f7wKoWIv7CtZEOP2cPwpZOJWQaUbqeUc064wYPm3drtJo8tVqnu+9Rszq9vQ+m7FDc/fv0DsWd3867LCkFwiuVz+Tf5+QfJ6cnJy5Nqp25d5ga8/Z9Ybgjfvu+Pmg+uZ/3TG7f6r9jfvtVC0RiIBx9rCisj6Y0tbw/y1fdL+y+l53GSIgSUrNZZ5tqIqCylep08+XSHGvrhM/UGXmm8PRKZ4Nk7HfHTQv1j6e63PKo7lVkj8NFeLiTdcJPyeVweD08g1L+lBTOkigtOwDYzdWAGDQeewkCiMV4jMdVxmNyfk7C8RjnOB4bNrbTDf4CEAHDYg==
\ No newline at end of file
diff --git a/third_party/gpus/rocm_configure.bzl b/third_party/gpus/rocm_configure.bzl
index f7f05391dfaaa5..491498bd8f667c 100644
--- a/third_party/gpus/rocm_configure.bzl
+++ b/third_party/gpus/rocm_configure.bzl
@@ -365,40 +365,17 @@ def _find_libs(repository_ctx, rocm_config, hipfft_or_rocfft, miopen_path, rccl_
     libs_paths.append(("hipblaslt", _rocm_lib_paths(repository_ctx, "hipblaslt", rocm_config.rocm_toolkit_path), True))
     return _select_rocm_lib_paths(repository_ctx, libs_paths, bash_bin)
 
-def _exec_find_rocm_config(repository_ctx, script_path):
-    python_bin = get_python_bin(repository_ctx)
-
-    # If used with remote execution then repository_ctx.execute() can't
-    # access files from the source tree. A trick is to read the contents
-    # of the file in Starlark and embed them as part of the command. In
-    # this case the trick is not sufficient as the find_cuda_config.py
-    # script has more than 8192 characters. 8192 is the command length
-    # limit of cmd.exe on Windows. Thus we additionally need to compress
-    # the contents locally and decompress them as part of the execute().
-    compressed_contents = repository_ctx.read(script_path)
-    decompress_and_execute_cmd = (
-        "from zlib import decompress;" +
-        "from base64 import b64decode;" +
-        "from os import system;" +
-        "script = decompress(b64decode('%s'));" % compressed_contents +
-        "f = open('script.py', 'wb');" +
-        "f.write(script);" +
-        "f.close();" +
-        "system('\"%s\" script.py');" % (python_bin)
-    )
-
-    return execute(repository_ctx, [python_bin, "-c", decompress_and_execute_cmd])
-
-def find_rocm_config(repository_ctx, script_path):
+def find_rocm_config(repository_ctx):
     """Returns ROCm config dictionary from running find_rocm_config.py"""
-    exec_result = _exec_find_rocm_config(repository_ctx, script_path)
+    python_bin = get_python_bin(repository_ctx)
+    exec_result = execute(repository_ctx, [python_bin, repository_ctx.attr._find_rocm_config])
     if exec_result.return_code:
         auto_configure_fail("Failed to run find_rocm_config.py: %s" % err_out(exec_result))
 
     # Parse the dict from stdout.
     return dict([tuple(x.split(": ")) for x in exec_result.stdout.splitlines()])
 
-def _get_rocm_config(repository_ctx, bash_bin, find_rocm_config_script):
+def _get_rocm_config(repository_ctx, bash_bin):
     """Detects and returns information about the ROCm installation on the system.
 
     Args:
@@ -413,7 +390,7 @@ def _get_rocm_config(repository_ctx, bash_bin, find_rocm_config_script):
         miopen_version_number: The version of MIOpen on the system.
         hipruntime_version_number: The version of HIP Runtime on the system.
     """
-    config = find_rocm_config(repository_ctx, find_rocm_config_script)
+    config = find_rocm_config(repository_ctx)
     rocm_toolkit_path = config["rocm_toolkit_path"]
     rocm_version_number = config["rocm_version_number"]
     miopen_version_number = config["miopen_version_number"]
@@ -565,10 +542,8 @@ def _create_local_rocm_repository(repository_ctx):
         "rocm:rocm_config.h",
     ]}
 
-    find_rocm_config_script = repository_ctx.path(Label("@org_tensorflow//third_party/gpus:find_rocm_config.py.gz.base64"))
-
     bash_bin = get_bash_bin(repository_ctx)
-    rocm_config = _get_rocm_config(repository_ctx, bash_bin, find_rocm_config_script)
+    rocm_config = _get_rocm_config(repository_ctx, bash_bin)
 
     # For ROCm 4.1 and above use hipfft, older ROCm versions use rocfft
     rocm_version_number = int(rocm_config.rocm_version_number)
@@ -849,12 +824,20 @@ remote_rocm_configure = repository_rule(
     remotable = True,
     attrs = {
         "environ": attr.string_dict(),
+        "_find_rocm_config": attr.label(
+            default = Label("@org_tensorflow//third_party/gpus:find_rocm_config.py"),
+        ),
     },
 )
 
 rocm_configure = repository_rule(
     implementation = _rocm_autoconf_impl,
     environ = _ENVIRONS + [_TF_ROCM_CONFIG_REPO],
+    attrs = {
+        "_find_rocm_config": attr.label(
+            default = Label("@org_tensorflow//third_party/gpus:find_rocm_config.py"),
+        ),
+    },
 )
 """Detects and configures the local ROCm toolchain.
 
diff --git a/third_party/nccl/nccl_configure.bzl b/third_party/nccl/nccl_configure.bzl
index acdeac3b9fd647..33026b7c3a3211 100644
--- a/third_party/nccl/nccl_configure.bzl
+++ b/third_party/nccl/nccl_configure.bzl
@@ -90,17 +90,11 @@ def _label(file):
     return Label("//third_party/nccl:{}".format(file))
 
 def _create_local_nccl_repository(repository_ctx):
-    # Resolve all labels before doing any real work. Resolving causes the
-    # function to be restarted with all previous state being lost. This
-    # can easily lead to a O(n^2) runtime in the number of labels.
-    # See https://github.com/tensorflow/tensorflow/commit/62bd3534525a036f07d9851b3199d68212904778
-    find_cuda_config_path = repository_ctx.path(Label("@org_tensorflow//third_party/gpus:find_cuda_config.py.gz.base64"))
-
     nccl_version = get_host_environ(repository_ctx, _TF_NCCL_VERSION, "")
     if nccl_version:
         nccl_version = nccl_version.split(".")[0]
 
-    cuda_config = find_cuda_config(repository_ctx, find_cuda_config_path, ["cuda"])
+    cuda_config = find_cuda_config(repository_ctx, ["cuda"])
     cuda_version = cuda_config["cuda_version"].split(".")
 
     if nccl_version == "":
@@ -120,7 +114,7 @@ def _create_local_nccl_repository(repository_ctx):
         )
     else:
         # Create target for locally installed NCCL.
-        config = find_cuda_config(repository_ctx, find_cuda_config_path, ["nccl"])
+        config = find_cuda_config(repository_ctx, ["nccl"])
         config_wrap = {
             "%{nccl_version}": config["nccl_version"],
             "%{nccl_header_dir}": config["nccl_include_dir"],
@@ -170,12 +164,20 @@ remote_nccl_configure = repository_rule(
     remotable = True,
     attrs = {
         "environ": attr.string_dict(),
+        "_find_cuda_config": attr.label(
+            default = Label("@org_tensorflow//third_party/gpus:find_cuda_config.py"),
+        ),
     },
 )
 
 nccl_configure = repository_rule(
     implementation = _nccl_autoconf_impl,
     environ = _ENVIRONS,
+    attrs = {
+        "_find_cuda_config": attr.label(
+            default = Label("@org_tensorflow//third_party/gpus:find_cuda_config.py"),
+        ),
+    },
 )
 """Detects and configures the NCCL configuration.
 
diff --git a/third_party/tensorrt/tensorrt_configure.bzl b/third_party/tensorrt/tensorrt_configure.bzl
index c1af2c30ebdcaf..3d127795638bf5 100644
--- a/third_party/tensorrt/tensorrt_configure.bzl
+++ b/third_party/tensorrt/tensorrt_configure.bzl
@@ -148,11 +148,6 @@ def _get_tensorrt_full_version(repository_ctx):
     return get_host_environ(repository_ctx, _TF_TENSORRT_VERSION, None)
 
 def _create_local_tensorrt_repository(repository_ctx):
-    # Resolve all labels before doing any real work. Resolving causes the
-    # function to be restarted with all previous state being lost. This
-    # can easily lead to a O(n^2) runtime in the number of labels.
-    # See https://github.com/tensorflow/tensorflow/commit/62bd3534525a036f07d9851b3199d68212904778
-    find_cuda_config_path = repository_ctx.path(Label("@org_tensorflow//third_party/gpus:find_cuda_config.py.gz.base64"))
     tpl_paths = {
         "build_defs.bzl": _tpl_path(repository_ctx, "build_defs.bzl"),
         "BUILD": _tpl_path(repository_ctx, "BUILD"),
@@ -161,7 +156,7 @@ def _create_local_tensorrt_repository(repository_ctx):
         "plugin.BUILD": _tpl_path(repository_ctx, "plugin.BUILD"),
     }
 
-    config = find_cuda_config(repository_ctx, find_cuda_config_path, ["cuda", "tensorrt"])
+    config = find_cuda_config(repository_ctx, ["cuda", "tensorrt"])
     cuda_version = config["cuda_version"]
     cuda_library_path = config["cuda_library_dir"] + "/"
     trt_version = config["tensorrt_version"]
@@ -318,12 +313,16 @@ remote_tensorrt_configure = repository_rule(
     remotable = True,
     attrs = {
         "environ": attr.string_dict(),
+        "_find_cuda_config": attr.label(default = "@org_tensorflow//third_party/gpus:find_cuda_config.py"),
     },
 )
 
 tensorrt_configure = repository_rule(
     implementation = _tensorrt_configure_impl,
     environ = _ENVIRONS + [_TF_TENSORRT_CONFIG_REPO],
+    attrs = {
+        "_find_cuda_config": attr.label(default = "@org_tensorflow//third_party/gpus:find_cuda_config.py"),
+    },
 )
 """Detects and configures the local CUDA toolchain.
 
diff --git a/third_party/xla/opensource_only.files b/third_party/xla/opensource_only.files
index 0425cd3bc0d34a..ffdc30b8aba435 100644
--- a/third_party/xla/opensource_only.files
+++ b/third_party/xla/opensource_only.files
@@ -48,7 +48,6 @@ third_party/gpus/cuda/build_defs.bzl.tpl:
 third_party/gpus/cuda/cuda_config.h.tpl:
 third_party/gpus/cuda/cuda_config.py.tpl:
 third_party/gpus/cuda_configure.bzl:
-third_party/gpus/find_cuda_config.py.gz.base64:
 third_party/gpus/find_cuda_config:.py
 third_party/gpus/rocm/BUILD.tpl:
 third_party/gpus/rocm/BUILD:
diff --git a/third_party/xla/third_party/gpus/compress_find_cuda_config.py b/third_party/xla/third_party/gpus/compress_find_cuda_config.py
deleted file mode 100644
index 4d1e417c96d576..00000000000000
--- a/third_party/xla/third_party/gpus/compress_find_cuda_config.py
+++ /dev/null
@@ -1,36 +0,0 @@
-# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Compresses the contents of 'find_cuda_config.py'.
-
-The compressed file is what is actually being used. It works around remote
-config not being able to upload files yet.
-"""
-import base64
-import zlib
-
-
-def main():
-  with open('find_cuda_config.py', 'rb') as f:
-    data = f.read()
-
-  compressed = zlib.compress(data)
-  b64encoded = base64.b64encode(compressed)
-
-  with open('find_cuda_config.py.gz.base64', 'wb') as f:
-    f.write(b64encoded)
-
-
-if __name__ == '__main__':
-  main()
diff --git a/third_party/xla/third_party/gpus/compress_find_rocm_config.py b/third_party/xla/third_party/gpus/compress_find_rocm_config.py
deleted file mode 100644
index 90615d4b1ea24f..00000000000000
--- a/third_party/xla/third_party/gpus/compress_find_rocm_config.py
+++ /dev/null
@@ -1,36 +0,0 @@
-# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Compresses the contents of 'find_rocm_config.py'.
-
-The compressed file is what is actually being used. It works around remote
-config not being able to upload files yet.
-"""
-import base64
-import zlib
-
-
-def main():
-  with open('find_rocm_config.py', 'rb') as f:
-    data = f.read()
-
-  compressed = zlib.compress(data)
-  b64encoded = base64.b64encode(compressed)
-
-  with open('find_rocm_config.py.gz.base64', 'wb') as f:
-    f.write(b64encoded)
-
-
-if __name__ == '__main__':
-  main()
diff --git a/third_party/xla/third_party/gpus/cuda_configure.bzl b/third_party/xla/third_party/gpus/cuda_configure.bzl
index 8d5a444f0b7af4..81ab53c05f6f63 100644
--- a/third_party/xla/third_party/gpus/cuda_configure.bzl
+++ b/third_party/xla/third_party/gpus/cuda_configure.bzl
@@ -641,42 +641,19 @@ def _cudart_static_linkopt(cpu_value):
     """Returns additional platform-specific linkopts for cudart."""
     return "" if cpu_value == "Darwin" else "\"-lrt\","
 
-def _exec_find_cuda_config(repository_ctx, script_path, cuda_libraries):
-    python_bin = get_python_bin(repository_ctx)
-
-    # If used with remote execution then repository_ctx.execute() can't
-    # access files from the source tree. A trick is to read the contents
-    # of the file in Starlark and embed them as part of the command. In
-    # this case the trick is not sufficient as the find_cuda_config.py
-    # script has more than 8192 characters. 8192 is the command length
-    # limit of cmd.exe on Windows. Thus we additionally need to compress
-    # the contents locally and decompress them as part of the execute().
-    compressed_contents = repository_ctx.read(script_path)
-    decompress_and_execute_cmd = (
-        "from zlib import decompress;" +
-        "from base64 import b64decode;" +
-        "from os import system;" +
-        "script = decompress(b64decode('%s'));" % compressed_contents +
-        "f = open('script.py', 'wb');" +
-        "f.write(script);" +
-        "f.close();" +
-        "system('\"%s\" script.py %s');" % (python_bin, " ".join(cuda_libraries))
-    )
-
-    return execute(repository_ctx, [python_bin, "-c", decompress_and_execute_cmd])
-
 # TODO(csigg): Only call once instead of from here, tensorrt_configure.bzl,
 # and nccl_configure.bzl.
-def find_cuda_config(repository_ctx, script_path, cuda_libraries):
+def find_cuda_config(repository_ctx, cuda_libraries):
     """Returns CUDA config dictionary from running find_cuda_config.py"""
-    exec_result = _exec_find_cuda_config(repository_ctx, script_path, cuda_libraries)
+    python_bin = get_python_bin(repository_ctx)
+    exec_result = execute(repository_ctx, [python_bin, repository_ctx.attr._find_cuda_config] + cuda_libraries)
     if exec_result.return_code:
         auto_configure_fail("Failed to run find_cuda_config.py: %s" % err_out(exec_result))
 
     # Parse the dict from stdout.
     return dict([tuple(x.split(": ")) for x in exec_result.stdout.splitlines()])
 
-def _get_cuda_config(repository_ctx, find_cuda_config_script):
+def _get_cuda_config(repository_ctx):
     """Detects and returns information about the CUDA installation on the system.
 
       Args:
@@ -692,7 +669,7 @@ def _get_cuda_config(repository_ctx, find_cuda_config_script):
           compute_capabilities: A list of the system's CUDA compute capabilities.
           cpu_value: The name of the host operating system.
       """
-    config = find_cuda_config(repository_ctx, find_cuda_config_script, ["cuda", "cudnn"])
+    config = find_cuda_config(repository_ctx, ["cuda", "cudnn"])
     cpu_value = get_cpu_value(repository_ctx)
     toolkit_path = config["cuda_toolkit_path"]
 
@@ -1008,9 +985,8 @@ def _create_local_cuda_repository(repository_ctx):
         "cuda:cuda_config.py",
     ]}
     tpl_paths["cuda:BUILD"] = _tpl_path(repository_ctx, "cuda:BUILD.windows" if is_windows(repository_ctx) else "cuda:BUILD")
-    find_cuda_config_script = repository_ctx.path(Label("@local_xla//third_party/gpus:find_cuda_config.py.gz.base64"))
 
-    cuda_config = _get_cuda_config(repository_ctx, find_cuda_config_script)
+    cuda_config = _get_cuda_config(repository_ctx)
 
     cuda_include_path = cuda_config.config["cuda_include_dir"]
     cublas_include_path = cuda_config.config["cublas_include_dir"]
@@ -1484,12 +1460,20 @@ remote_cuda_configure = repository_rule(
     remotable = True,
     attrs = {
         "environ": attr.string_dict(),
+        "_find_cuda_config": attr.label(
+            default = Label("@local_xla//third_party/gpus:find_cuda_config.py"),
+        ),
     },
 )
 
 cuda_configure = repository_rule(
     implementation = _cuda_autoconf_impl,
     environ = _ENVIRONS + [_TF_CUDA_CONFIG_REPO],
+    attrs = {
+        "_find_cuda_config": attr.label(
+            default = Label("@local_xla//third_party/gpus:find_cuda_config.py"),
+        ),
+    },
 )
 """Detects and configures the local CUDA toolchain.
 
diff --git a/third_party/xla/third_party/gpus/find_cuda_config.py b/third_party/xla/third_party/gpus/find_cuda_config.py
index b7ae4a8d71f8ec..4a7781cb2b5a2c 100644
--- a/third_party/xla/third_party/gpus/find_cuda_config.py
+++ b/third_party/xla/third_party/gpus/find_cuda_config.py
@@ -53,12 +53,6 @@
 tf_<library>_library_dir: ...
 """
 
-# You can use the following command to regenerate the base64 version of this
-# script:
-# cat third_party/tensorflow/third_party/gpus/find_cuda_config.oss.py |
-#   pigz -z | base64 -w0 >
-#   third_party/tensorflow/third_party/gpus/find_cuda_config.py.gz.base64.oss
-
 import io
 import os
 import glob
diff --git a/third_party/xla/third_party/gpus/find_cuda_config.py.gz.base64 b/third_party/xla/third_party/gpus/find_cuda_config.py.gz.base64
deleted file mode 100644
index 37525142385f6d..00000000000000
--- a/third_party/xla/third_party/gpus/find_cuda_config.py.gz.base64
+++ /dev/null
@@ -1 +0,0 @@
-eF7dPGtz4zaS3/UrcJxzmfLIlJ1NpfZ061w59szFuz57ytZMbsvWamESkpmhSB5Jydbs7n+/7gZAgiApv+JkKq7KRCTRje5GP/F6w46SdJ2F89uCfbO3/x9sfCvYWMR5kr2Pkjt2uCxukyz32GEUsQtslrMLkYtsJQKv96b3hp2GPjQXAVvGgchYAfCHKffhf+rLgH0SWR4mMfvG22MuNnDUJ6f/n4BhnSzZgq9ZnBRsmQtAEeZsFkaCiXtfpAULY+YnizQKeewLdhcWt9SNQgJksL8qFMlNwaE1h/YpPM3MdowXRDD+3RZFOhoO7+7uPE7Eekk2H0ayYT48PTl6d3b5bhcIJpCPcSTynGXi/5ZhBqzerBlPgR6f3wCVEb9jScb4PBPwrUiQ3rssLMJ4PmB5MivueCYASxDmRRbeLIuasDR1wLPZAMTFY+YcXrKTS4f9cHh5cjkAHD+djH88/zhmPx1eXByejU/eXbLzC3Z0fnZ8Mj45P4On9+zw7K/sLydnxwMmQFTQjbhPM6QfiAxRjDR07FKIGgGzRBKUp8IPZ6EPfMXzJZ8LNk9WIouBHZaKbBHmOJg5kBcAlihchAUv6E2DKezm4Bf96zmO8yELY1DDo4/Hh9D9TcazNRLDbgXH/gMYIr9IslAQjWwltQ9UKgECUbDE5TovxMLr9VDhcz8LQc9ywTPQhZxE0YUeFTOvYxnAiKPUirwHLxeoAoEoUFQxiTjMNBGEKJX0I7yfxLNwvsxIgAiXF0GyLDyiKuXFbS71ibATMEKVeliyBgqmxw1V8DZLlvNbJuJVmCXxQsRFb8WzELUVTPlkBqbGVjwKA4uAUAlpIJmTUtHkEnEiy2jgM1EsM1ICBq9AXH4SCCXNCNQYbU8KD4cBYGchEA/4Kyo5kj1fInVA1OUyTZMMNb8CQ7OhYXDD2I+WAbzylz+cHl72B/Dj+OxswM6Ojk4HJBjptC7G9SEt+GdEVNJ0w0HTTQ2p6AGjRq7H76fY5/TD4fjHy54hQqZFiJSDP1rw3VykHEQHwPMouaFOPGb0HiXJZ6lNUnnyHlKqlUpqErmrW54FuyjCAJSQCM2XNyaZsyxZIHlAPbFAuuH1YChr9KKE0Y2WXIFs2PllZdOBmPFlVGA7cLXBqNdjYKzx8n4EXnG4zLNhlPg8GvrLgA/ohZSu1EWiYjsKpNaw3XTbA/ifYGiTu3zESjpYm9gG4H96rPo7Gl1ff8iSecYX7D2K4vr67NPJ8ckh++8PHyEsLdIlulA2TpLoc1hcXyP66+udXu89CPSG+5/BrwY0FEBdeBNGYbFGh7sQpg5FeUJRhUdgkTE0XEkR9hriTaSzbCEdRolCybpuaB6IjtSqp4Z0t5RzKxYlbTn6rAyL0LbSLXYYBCEaI48aygrQuy/8AxRkUfSnVefTu4tLCB56XPDV+Pz89C8nYxpNgJFmV8LgQw2qHQYs1Ojn7KzekXx1cnY5Pjw91UBozyVx+GARR69MGOkCpj8eX2gU2hEQijHE8fOLi7GBpnxV67r3SfvSdgXwwdPdCJ1SgEmDIdxvY0jdvvfW2+ji0C0L0AfeK9VAOWiIxIu0WGPrZWz46YRxnzIcHq91W+W/IAqA/mN/XAUu7TbAdA0qRr1iNv2T+vb9VCEZMSDK+1L/Jn3QFBRqxDzPq39UP6qvEGh7KrNC3mVihn1GkBeiXZILjAPJ+VzEAj0htUG9/e7bMuYRtWEOyKRbHMEvnxf4Mgum4EGL9bCgQZsB6qH5ep4u8yGGjik6pKn0O16S5x6kd/+kZC4N51/Y7hf2T93t7t0e+54+PbuDdO3Nv3gSH/bW60HeBNGJhYn+leT6F7p+/TuNeIGjop8hgKlf4M/TLPEhDSvfrHMUcLqOIL6OMPlDTTuY78rvu+DHd3mxWyRpr8jWI9Bs8lL5LbjFiCkkd7ehf9tTefIJvXuH8bhsjjklAuRenvK7WMMRy+Je+MtChzSJqiJIxJ309Hp+xCGhPCJ
xUYfuO6IBxruPfaccpdaDcMOmYQ7aBSHGpS8ycygl5ckUyu1Dooh1AbRzDMA7GVseAaqikAm84H7yGNBjnkE/JeSCF5gFaltyuV8seTQtbVmnX/oNdQDmcnQr/M8oR0GBhAKRtoGFECrlU9Ay7empgIjVlYjkS23ddjcsVuUFuCHM59E/hZSGKiR1Okvc+NfAZTXGFjnkBQbIU0KK/ttn9t++V383zpbCBPC+sQGs5/cQwB+A8P6wEaL5Z/dBROmRyOb5qIS25Tai7E4LrZE7gpPMvBK4LuI6KNYHPiaOOrGTYRky/YJHEWXjGtGFTLVH7CelWNha4q7US6psvTpIYuFJzYR/IYGxhhziyBk0kcwq+5CygzdQHkIpmgKL3DfSz7LiQMQWvgNbAbGYTd1+r0X9Dhqvaq2JlgY2cNg5Fv9uwwC15fJiGgmeF08zXSXLA3YFfs9d9SlhX1FibpEAtXPhOp7TnxhMtQE2ubNADQ7Z95U0NCNzUehwrVnBDHzAYr4Q2t8oxUBnoEILdA/gUHVSWXFklqyeVAOkMMKyFL6HiZekQmN2MgfSlBiKEAjuB86ymO3+0el7GaBAAOVHmVQ1GkCPfrqZc53vvIH/ZNdsK7/O37rXwdu+w7aI4AF12Sdw0EMC0zamREHvvDnUram7byiBU3plH4Zf5CGPp8BssPQLdxZmeQEJv4CoHdhCASuiogXTlJswVjMU4FoJimoaCagEo/q7SiC1ADDv5ySM3Rlgl6M6Q4FJUHwk+Ur4iTlmUaBSiCkVTG4bVY26o15QleaKdZwVOysyUYs0GPUFI0IB3HX0a6eP+aYzzIH9YfkS4FRqeWDkJJ6PsWsqv7hXNcygGrupM+lTVC+wiJKjj3UXKBYo9Q47+J653k7fkSOHkQTRi4Ls2dQ52YM0h5paqRTH1DDVm1IzbO0FAotk1+G5H4ZOX6qUyn4+xiF+PKYmZRJkIqwiQ1MLkWaPB4GrFQCGCXXXrWtm39DNnOYrXKz6XYmg3zeVQdXaMrOU+kA/Td+jxtl8L2ky3wDpzg6OnIhUe3AkNCv6EFjt8S2AARrZayO5qnSrDGAgClUMecCPW4upTlnsOwPrw/Pq+hX4jWvwGDVkNXnRF8N9XjnWhMXuVo4uxwQC5bUaOeqVSbbVJo6dCUir3aL1CCv33G7nzfmcTERy7qGc1cPJIWNuSE4HWd5IkeiUxDpqJqzxQvFmv93ZJfexO4+X1UdxX2Q8Hx59/DA+GW5EKNtU3yoxDjuRlA5Rl5S/gIhqU2YPyghaf/dt7bH20C4U/HBvgtUY3GmVgd1Rh3SqZqVswIinVNZPafrUxVpTCmpQykE/I9PKF5IQq7boGBwZqq6c61ir8/ZWvo2GQHEBxUc/wtJjVQj6E+nPzA6firQOPTEcpFkeahkdJcsoIB9GM8I48bElJ0HhlzHpuR6BP4iTGf2vcguuIYympAzGSjulShehniRjk8GW1KMblfKllDPgEhbA4wSBh/+4teRCxlaz476VFOE3lCYPc/F8lakJQhlSDVjbqUzVuqpbbbcyCZKrByQhsFNddoi8msBfV5lMS6wxCAR9MymgMIUmq+OdVcnbsM5WvuMFa4QA7cQUW+suvXpbQw5ufWNlWP/bkMVfjfYnlAsISClaydpIB8SgPHEeoOah7lFSO4Yn7FB12w9v0A7VkXiaweSDbi5sHgYMg+oTVItLMypuedGocqW6aemU6lasU2H6CUjh0d+AJV5hrjZgxTKNxKQxauj5rsznyS/oCLRmhHGtS23xL/MXLbW4IWaSRb9sGj53iqsa09I7DayOu72Vgt+gUjpCgG2o6qs2hKDsUgEwd9R86qDUOSNAai0TrNpYqeyt3eU9Q0lVDvdoNa2Z7Aazs9LMOuEPuLLNbNUkZEx+Wyrd0APoFOE6ZiiUolR6iDMj3dMZjrn+pOs5VePU6ppSYjhlZVYrzlbgbQXk+nWnwyHb39vbG5RUbNGzfN9HBohdQ7K
lWCu6OzXHQWDv1nlI+o8akWfgaMqy32vUfRY77A2uB+FUiWD8JlkJr5SCGRxKttuyBOI7K5yBVZQZ+hCvfL9NG4xpA+cIgJmcOtCbHpJIOgOBy53XwdtrD/4ZsE+u/k3/9J3Sn+rJhM75Cz1tsauIcSb9limHlrmsMrPsnm1oTh50T2LVtRZekIQoy4EMAR88cS+cRo5EWYVs4GgoNf9niLgcrg3+o6U6ugnj1pJFv58MKjIHVjFtj3JfMrVa1PWnmwxsi/VQIFahbxSdOZSDdg2LFVm8CoOQ71J1X8gJgzZwg42WHoAjp3yz4+3veTe+oxxBWoSmJ+hmoeGJHQIGT9AvET3FmKC90+JgS8PESUu5HguY7EmpUiP6Wv7G+m1rezVEfTm5/7+nh7prGaeUbMuIRsVZWtxzuYeqlJ5H4OPz43PXz8P5vD+ipT60xpsEQM3JTVzMAJl5miPVR1ll6pxAExsnGe7kuK2nPZYwMDuABLg/eBSwLRqCHpYI+jJhbFJ3tTdh/3bQ9mFf5Y0yzTErXOckhgiaQ4aJK3O0nGNKdYR17ipnNEfV4feb/dF4KSfyD63tpk06o7qJ1tqoaRnkHNrZSmFHwX4d2BBbF7Cp7xZ0NWKaQmMMTW9g9dMYsQorWthDHFnm3LfAH+LJNmKLKXNoNFu23iDEv6zk6ibi+WPSKyu2jvRcbWNdy5riBI+278jmj8nOzPzMnNV9YNnJaIlRmMIY2L3rVBuCpv9z+OfzC4cSu+rdyRm+68p1zKYfDsdHP1aRVud4upY2nSRjz0/gaDx4Gj4njXtxDteewKFbVbuslrlyygv+c1JtHU3iaC3XhBUDXemeOU+wN+nVJine4Cp0JuQ+wRJ1GM+giFlxyMlwM8iNgPFVK9KoWgP28zJXE3a0BwjXE80haF9wsAfpabHVGCNLPsYCNm1TRT3kaj+w/uZCvtvvkJY9giihp0RuREdpsIm3w1GbTcBj1OU1sNptdm4dbo0gN/u1hkez/VOeREDSq3qofW/vN/VQl+en6GAsH2W+fdBLGY1/JT9VjstiAWb9NTmrkrbnOKFX9Bq2xDqpfakTkAjJDdRxdziCeqNNrkC1fJYzULAvdAcZj4PftTO4ODw7tl1B9e5BR1A2/ZXcAI7HV2b9pCJfme0rMXWQ+FKDR3Rk7ibeDmM3m2wydWr3LEMnyBea+WxW/K6t/P37sW3k5Stp4+arX8mWQehfmSmjGnxlliyF1E7gS+0YsJEZG1g7rNhoscmIsdmzbBgBX2jCQRy/zrJNtQnssfZnWx8e7jEsjx4rq8NHsrjTd5/encpdsd1Gh/5G/caZOZod11PqzzPFp5mP65Co0XCZ/FkaCKjpE5G90JQ7138qoh5hxk9a9oljuepT9dA1LVm1kNNj1fOg3upZBoOAL610U54Bc7/nsHf54fDi8l2j0jXePlzpVo1/rUqXxuUri4xKWb6y4FiKqpPMF9e2hFDWtjXcXbVtrdHG2la2fF5tK2Ff5ABi349eJ2BWtvu8gEmnWEt7lU86XNJTzRA3meFzjRBl83QDfJVoRsP0GsEMEYNETfytOm02wEUp43FQa/MMTSa4F2mxPMuaPap6+w00+VN16rpSaPPlpvhTa1nT+Tfs74qIv+PUP2fqzDF0ntz8LHw8CpSwO7pUAxoWLCzk3iy6+0GtaSzzMJ4rdPDdXSzDIkxxg2G4EDm0ipI7tW5QSYROeNRig9qixKPIreSsDaLc8YGdfJKf5bUM6nKNBf8sGnvwmL5Sg06peK9m5merk3gmsk9l/vrbGzxgKFX6kVb/JKNfhcgxqKHdS6vt242641nZ8hleoIR9ridApZzi+bGpiFcu/Kc27KhTRwdXE3vnon1nCdkFnUBru/Kg2k+rcdP5rfJIUO3MUPX6Sree6BEbOMaRAEVd7cycmHNfcvgoNrjcpFI7fAtGTljaOcFRbt42MWi7TKL1cgg6oFIkgGVbHhTCoyJ//G763bfVQZJ
tdkcnG9JMrMJkmUdruV5Ku2Ka1wj1mLrChR3TSWBW3CZ4SQk0UJKirrYH8laSu1Be0IQ3nqBTs0a/Xzvnu2m8Wk5v/s0dXv1tyCY7/bfE2fXd24qt4X/9O9hN2/BuPM55Vd8KZxzaekhv+9UJmWzBo/CLtOMqVBmaUDaRO3EH8mowLaY84vmtvKpJ3Q2gVFp5DG1qmeBR1UWv68SCglIgacR94TrX13SczNRwbKB4aGytbapyEPp02UtGt5VtunNK6Ytkobrf5oBd8Wy+8iBugaeXh1XxBe2PXOce/r7aH00mvcYmUet4n2NdB4OcEWO180bW+Dm164cejCWPOhfZ6JP2c9Z24hufYbT0SIp7oC2X4ziRrl0dRf3Hv+S40jYeOjpZStA4N9nFY+N2G6d21Eg5QjpBukwD8K1uc1d11YE1saBKZ2tsJLqr+laviWpJS/6a2IoQbZB0vMG1j6SbmIxg2md/Yu7+3oDtlxnMG/ZDYwOI2npxx3N154TcdlGe1sINGMY+Qn13gUVrJQSTkwc00rxsqNRJW+BVztqy5cqkwt630TUaaiX1F5UyyHGvlHKji4ZwGmvWreKpdiU8TUD1PR91apqr2l1ionWo1xRSrYOGiKw1vlYB6dXap4nHXAM3qbBXALsEg5P7rykXE39DLPUVk1apqGWvjUJpWSg0OrYWUToNSU7bvKoh1btoGpI9QdZuSOWk5xMNqTalXKemOYXWJiYVmOK4IzLFcRWa7IzZaSa2LdHJXh5oD/3VDW0PqoSx8GQQaK8SlNzRvEsLczQX0s1bI0FvYc2aK2rhzLxG7gHGzPnBijZrwqjkStdwbZyV9V03d621RguHLbVxC5f2TXcPcGrPIdXpbSmViWs0z88DfccNYQ0LsSiTZJDKZ6AskPf0OFTU0jUktbfYhWNMmFCq83mCQrKy/pVZoasrqmRuveBhLHstbw4h4sR6gIfzlsI4797MwzXVJRF0yanrbOUjuUGfuRUmec5XXjJibPjHlEfN9mOiLS9H9fDiX+HmReYKPLO4fR1v98s2kKAWeDKo18Magwqf6ZSuIJtOkaHp1EGEkrfe/wMY3XD1
\ No newline at end of file
diff --git a/third_party/xla/third_party/gpus/find_rocm_config.py.gz.base64 b/third_party/xla/third_party/gpus/find_rocm_config.py.gz.base64
deleted file mode 100644
index fee67ec553d9da..00000000000000
--- a/third_party/xla/third_party/gpus/find_rocm_config.py.gz.base64
+++ /dev/null
@@ -1 +0,0 @@
-eJztW21v2zgS/q5fQSgoam9dJSlugUMOOcCbZlHftUlhZ7tYtIVB27TNrSz6SCppUPS/7wxJyRIt2YqjvX6JgTa2NPNwOC8Ph7Z4RC7E+l7yxVKTVyevTsjNkpEblighf43FHemneimkikg/jskQxRQZMsXkLZtFwVFwRN7yKYizGUmTGZNEg35/Tafwx93pkQ9MKi4S8io6IR0UCN2tsPsvQLgXKVnRe5IITVLFAIIrMucxI+zrlK014QmZitU65jSZMnLH9dIM40DADPKHgxATTUGagvwaPs2LcoRqYzC+llqvz46P7+7uImqMjYRcHMdWUB2/HVxcXo0uX4LBRuW3JGZKEcn+l3IJU53cE7oGe6Z0AlbG9I4ISehCMrinBdp7J7nmyaJHlJjrOyoZoMy40pJPUl1yVmYdzLkoAO6iCQn7IzIYheSX/mgw6gHG74ObN9e/3ZDf+8Nh/+pmcDki10NycX31enAzuL6CT7+S/tUf5L+Dq9c9wsBVMAz7upZoPxjJ0Y0mdGTEWMmAubAGqTWb8jmfwrySRUoXjCzELZMJTIesmVxxhcFUYN4MUGK+4ppqc2VrUjjMeauvIAzD95InmIbXFysYfiKpvEdjyJJRHH8GIZpqITkzNpJbm32QUgIMRMeaWd4rzVZREGDCq6nkkGeKUQm5oIwr6uAxMVUZpQcRR69pFcDFFabAjGl0VWJczGVmhAFaW/tRfyqSOV+k0jgQ9ZSeiVRHxqo1xUQXGThmiIsNptlSinSxxCRhyS2XIlmxRJNbKrlJyg7Y/278vn/zphsFgzkUF9yL+cwbkju39Ox0rB8yA405TEoTasl0Kk3YCVwCB03FjJX9p+kXZueVxeC+YDEUDd7K7aq0OyrixUJ8scGwvrfxzGJiA2GqfUnl7CXaM4MYaqj7QKWTYh7MpViRCVXOqY4YNrbl9kYEfLUxEdwDrBTkgsZNUJbHYq2PpZiuQhRJkf4o2KIh7nOaxjifOGUBZmsQQM0JCeET2TuhsnfAC+4dZFIQBNOYQp1emBBdopc7l4YCIVTds4CA9QrFYBQyXjA9dsON0ZQxTq1jxGysimYWlYwwJJWmcVxQAltfZ1lrPZ2F3IVtRZySTR3UjHCCMJ6PSM5r7QNxPidh7uMQQyhU5LIBLakG3Mh8LGh/BvmjOnljomQ0NkNvCXU3rtq6V3QYLj5Ccc3GrozHSbqaMNlZ0T+F7BHwGP4Btemy6P/TE3iRn4gRIy/wM35CafhkxIvD2CTPxuigFT2S0BXLwjN08QA2X4PBQOkABOomYODFiyJHudBg9cTIQ3Cfi0isWYYcyhBWiATKBpj9PEz1/OU/w671/wptAx9KFpm3HRke2YHIM0VedD7NXnRD8sxY1zP4XaMHkTXyFoXYMgAcczFaAGWtO6ddd9N5CbimY+S6AbqOcqjSYgU8z0YOv3035WZXtU/J8wgmB8gd4yPywsGWXyExfQUugMgz4AXTWXz7DuvmpyTMIEw61ENwoEM7F8sIyIJgN1sw9C4UDY3DbhZLMHZmM95ybUXineFUUdjcKmeV6jgRHBvyTnHgxVzGst45+ZibGvJkGqczZgodWFDad5lGtIQwQ4mYAv45elWtVyN9mi1YQhq9z+b/oi1gypVImLmOuTZHF1dbnSdF4aJfrH8KnuXnPMsTSKrsNvsKLZLqbCF0z/JpecZtieaCEyCGL1nWYkyLohnedjoWUiQ0XsqWdjMg4tg+A9wA+Zrll5dY1Q7qmgogji/OK1mhqNBzNPrhcjiC5m/8rv+f62Fo3WZZ5gCMwVWOsXYs8FAMIOaLNw7DFXkFU+JcKy4j61QVRQV7ZyuPLTNQ/ObcHFYghGcHkDngfQ+8VcKOVir2JV/LNNF8xRqUfEHYs+DAugdA/Le/4FGwmdJT3e+s+zeD92RoY/g3lP8RbMJhmSms5dDg2sb7K246oc3MlkT0PBiTVV6P3C0F2G27T4sFSJ2CiCUJaEOwGXlBSne
w9LtODXoNyfit3SJmDYwlFdsavpzSeJpCLwjWMQkcoLjZnUPzL0zTbHHuKKy6UHe4XAoDNkkVNiwKu/A1hfnDvhK6f0gH3KeqhzLg1tweToBbTihz12mpiTO4GKitoi8SUG2RAw3VE0Blh1okoK0xSzS04tjeNaAgJ/i4vqPEQRbxuMQm7rVhIo9UchCnvAvriZkaMdPzd4Nr8F2ZlEzfnDNT3jUXwbvdh5YdDPT+8uqxleejHNZ8eCiPbT9qqqO6ASnVXJEBKlHCs0KwHt2OlMb2dx+TmKpmGxAj2d4eBOGOcXMkExpnF17ubU42mhapIcITH+zsVJzz2u1SHkgVEKlf3vZHLWxTyjAH71RKMC1sVioLqHa/UihMb8tSgdMuYZRH9xlDQiU1Ywwj2c7mxaEde6h+lZd4wih4ADWKJ0/0sIceZOE3kt30sJMPytnQuBSH/avXWSmWi7AM6GqnkKJe7VSkJLTaNbm6t88uD+UXynyum9UJCra3sAKa+9NoNUXxsvbTEnpgjYDvfugK6qWS0Xz4AuqjoObD108PxWg+bvmsqpLa1XNTex4BbKO0vnZuxva//WvGCE6wJUawaO7PfkZw4mXtJ0Y4hBGs734oI1gTPtiL7w7jgzLGYWxQwnj/SC6oqY9qLihVnffV29/NBaWx/e5ASzoFhzVqEJxsez2CBdy829khOOEt9Sc2eGB/YN32ozfZN8P+xeWwhW22D1TYaB9tsjZ/3kow+5v4kt4yQl01Z75w7XuRVk4e0yzUVExtv1CqRq9lqMRqvWsoWeA3Dgp/AWn4q6GTba99sICbd42aCKe0BfPUShzYSlj3/ehuwlrx6IaiDHNwT1GCaaGtqCmd2s6iVJZec1GJ1Xp/UbLAJw0R3zbqMDayLZKGAdx8PZ9fasYembb3Jf8+lNNdKPuUn74E3OKgjRcfTDxtko6x4PGkU4I5nHSKMG2QTnXp1ZNOsax90qnCap90ihb4+5rGrLORbW9f4yo+f9foG1CPJvYpP7Uq+/Y5BzJGm6wBsRpdv/3Qyj7HBzr4B0UPqIWfFB/EHX5xetuc/wN3+BY47th6Etp/kp2SGZ/iaQI80CHmthKNIQnDIyhuPjyZi31HDSpONQR5tnslVUVeewrg+Sg/LLI5HmKerMn3vwY8f7xmewwbK6bwfAhE6Pvm40f7wKoWIv7CtZEOP2cPwpZOJWQaUbqeUc064wYPm3drtJo8tVqnu+9Rszq9vQ+m7FDc/fv0DsWd3867LCkFwiuVz+Tf5+QfJ6cnJy5Nqp25d5ga8/Z9Ybgjfvu+Pmg+uZ/3TG7f6r9jfvtVC0RiIBx9rCisj6Y0tbw/y1fdL+y+l53GSIgSUrNZZ5tqIqCylep08+XSHGvrhM/UGXmm8PRKZ4Nk7HfHTQv1j6e63PKo7lVkj8NFeLiTdcJPyeVweD08g1L+lBTOkigtOwDYzdWAGDQeewkCiMV4jMdVxmNyfk7C8RjnOB4bNrbTDf4CEAHDYg==
\ No newline at end of file
diff --git a/third_party/xla/third_party/gpus/rocm_configure.bzl b/third_party/xla/third_party/gpus/rocm_configure.bzl
index 394f5741ce0ded..19b36bc72d03c1 100644
--- a/third_party/xla/third_party/gpus/rocm_configure.bzl
+++ b/third_party/xla/third_party/gpus/rocm_configure.bzl
@@ -365,40 +365,17 @@ def _find_libs(repository_ctx, rocm_config, hipfft_or_rocfft, miopen_path, rccl_
     libs_paths.append(("hipblaslt", _rocm_lib_paths(repository_ctx, "hipblaslt", rocm_config.rocm_toolkit_path), True))
     return _select_rocm_lib_paths(repository_ctx, libs_paths, bash_bin)
 
-def _exec_find_rocm_config(repository_ctx, script_path):
-    python_bin = get_python_bin(repository_ctx)
-
-    # If used with remote execution then repository_ctx.execute() can't
-    # access files from the source tree. A trick is to read the contents
-    # of the file in Starlark and embed them as part of the command. In
-    # this case the trick is not sufficient as the find_cuda_config.py
-    # script has more than 8192 characters. 8192 is the command length
-    # limit of cmd.exe on Windows. Thus we additionally need to compress
-    # the contents locally and decompress them as part of the execute().
-    compressed_contents = repository_ctx.read(script_path)
-    decompress_and_execute_cmd = (
-        "from zlib import decompress;" +
-        "from base64 import b64decode;" +
-        "from os import system;" +
-        "script = decompress(b64decode('%s'));" % compressed_contents +
-        "f = open('script.py', 'wb');" +
-        "f.write(script);" +
-        "f.close();" +
-        "system('\"%s\" script.py');" % (python_bin)
-    )
-
-    return execute(repository_ctx, [python_bin, "-c", decompress_and_execute_cmd])
-
-def find_rocm_config(repository_ctx, script_path):
+def find_rocm_config(repository_ctx):
     """Returns ROCm config dictionary from running find_rocm_config.py"""
-    exec_result = _exec_find_rocm_config(repository_ctx, script_path)
+    python_bin = get_python_bin(repository_ctx)
+    exec_result = execute(repository_ctx, [python_bin, repository_ctx.attr._find_rocm_config])
     if exec_result.return_code:
         auto_configure_fail("Failed to run find_rocm_config.py: %s" % err_out(exec_result))
 
     # Parse the dict from stdout.
     return dict([tuple(x.split(": ")) for x in exec_result.stdout.splitlines()])
 
-def _get_rocm_config(repository_ctx, bash_bin, find_rocm_config_script):
+def _get_rocm_config(repository_ctx, bash_bin):
     """Detects and returns information about the ROCm installation on the system.
 
     Args:
@@ -413,7 +390,7 @@ def _get_rocm_config(repository_ctx, bash_bin, find_rocm_config_script):
         miopen_version_number: The version of MIOpen on the system.
         hipruntime_version_number: The version of HIP Runtime on the system.
     """
-    config = find_rocm_config(repository_ctx, find_rocm_config_script)
+    config = find_rocm_config(repository_ctx)
     rocm_toolkit_path = config["rocm_toolkit_path"]
     rocm_version_number = config["rocm_version_number"]
     miopen_version_number = config["miopen_version_number"]
@@ -565,10 +542,8 @@ def _create_local_rocm_repository(repository_ctx):
         "rocm:rocm_config.h",
     ]}
 
-    find_rocm_config_script = repository_ctx.path(Label("@local_xla//third_party/gpus:find_rocm_config.py.gz.base64"))
-
     bash_bin = get_bash_bin(repository_ctx)
-    rocm_config = _get_rocm_config(repository_ctx, bash_bin, find_rocm_config_script)
+    rocm_config = _get_rocm_config(repository_ctx, bash_bin)
 
     # For ROCm 4.1 and above use hipfft, older ROCm versions use rocfft
     rocm_version_number = int(rocm_config.rocm_version_number)
@@ -849,12 +824,20 @@ remote_rocm_configure = repository_rule(
     remotable = True,
     attrs = {
         "environ": attr.string_dict(),
+        "_find_rocm_config": attr.label(
+            default = Label("@local_xla//third_party/gpus:find_rocm_config.py"),
+        ),
     },
 )
 
 rocm_configure = repository_rule(
     implementation = _rocm_autoconf_impl,
     environ = _ENVIRONS + [_TF_ROCM_CONFIG_REPO],
+    attrs = {
+        "_find_rocm_config": attr.label(
+            default = Label("@local_xla//third_party/gpus:find_rocm_config.py"),
+        ),
+    },
 )
 """Detects and configures the local ROCm toolchain.
 
diff --git a/third_party/xla/third_party/nccl/nccl_configure.bzl b/third_party/xla/third_party/nccl/nccl_configure.bzl
index 7a9a9ac6112461..2e0563aa17aeb7 100644
--- a/third_party/xla/third_party/nccl/nccl_configure.bzl
+++ b/third_party/xla/third_party/nccl/nccl_configure.bzl
@@ -90,17 +90,11 @@ def _label(file):
     return Label("//third_party/nccl:{}".format(file))
 
 def _create_local_nccl_repository(repository_ctx):
-    # Resolve all labels before doing any real work. Resolving causes the
-    # function to be restarted with all previous state being lost. This
-    # can easily lead to a O(n^2) runtime in the number of labels.
-    # See https://github.com/tensorflow/tensorflow/commit/62bd3534525a036f07d9851b3199d68212904778
-    find_cuda_config_path = repository_ctx.path(Label("@local_xla//third_party/gpus:find_cuda_config.py.gz.base64"))
-
     nccl_version = get_host_environ(repository_ctx, _TF_NCCL_VERSION, "")
     if nccl_version:
         nccl_version = nccl_version.split(".")[0]
 
-    cuda_config = find_cuda_config(repository_ctx, find_cuda_config_path, ["cuda"])
+    cuda_config = find_cuda_config(repository_ctx, ["cuda"])
     cuda_version = cuda_config["cuda_version"].split(".")
 
     if nccl_version == "":
@@ -120,7 +114,7 @@ def _create_local_nccl_repository(repository_ctx):
         )
     else:
         # Create target for locally installed NCCL.
-        config = find_cuda_config(repository_ctx, find_cuda_config_path, ["nccl"])
+        config = find_cuda_config(repository_ctx, ["nccl"])
         config_wrap = {
             "%{nccl_version}": config["nccl_version"],
             "%{nccl_header_dir}": config["nccl_include_dir"],
@@ -170,12 +164,20 @@ remote_nccl_configure = repository_rule(
     remotable = True,
     attrs = {
         "environ": attr.string_dict(),
+        "_find_cuda_config": attr.label(
+            default = Label("@local_xla//third_party/gpus:find_cuda_config.py"),
+        ),
     },
 )
 
 nccl_configure = repository_rule(
     implementation = _nccl_autoconf_impl,
     environ = _ENVIRONS,
+    attrs = {
+        "_find_cuda_config": attr.label(
+            default = Label("@local_xla//third_party/gpus:find_cuda_config.py"),
+        ),
+    },
 )
 """Detects and configures the NCCL configuration.
 
diff --git a/third_party/xla/third_party/tensorrt/tensorrt_configure.bzl b/third_party/xla/third_party/tensorrt/tensorrt_configure.bzl
index c8118e0e6e861f..3706b291848399 100644
--- a/third_party/xla/third_party/tensorrt/tensorrt_configure.bzl
+++ b/third_party/xla/third_party/tensorrt/tensorrt_configure.bzl
@@ -148,11 +148,6 @@ def _get_tensorrt_full_version(repository_ctx):
     return get_host_environ(repository_ctx, _TF_TENSORRT_VERSION, None)
 
 def _create_local_tensorrt_repository(repository_ctx):
-    # Resolve all labels before doing any real work. Resolving causes the
-    # function to be restarted with all previous state being lost. This
-    # can easily lead to a O(n^2) runtime in the number of labels.
-    # See https://github.com/tensorflow/tensorflow/commit/62bd3534525a036f07d9851b3199d68212904778
-    find_cuda_config_path = repository_ctx.path(Label("@local_xla//third_party/gpus:find_cuda_config.py.gz.base64"))
     tpl_paths = {
         "build_defs.bzl": _tpl_path(repository_ctx, "build_defs.bzl"),
         "BUILD": _tpl_path(repository_ctx, "BUILD"),
@@ -161,7 +156,7 @@ def _create_local_tensorrt_repository(repository_ctx):
         "plugin.BUILD": _tpl_path(repository_ctx, "plugin.BUILD"),
     }
 
-    config = find_cuda_config(repository_ctx, find_cuda_config_path, ["cuda", "tensorrt"])
+    config = find_cuda_config(repository_ctx, ["cuda", "tensorrt"])
     cuda_version = config["cuda_version"]
     cuda_library_path = config["cuda_library_dir"] + "/"
     trt_version = config["tensorrt_version"]
@@ -318,12 +313,16 @@ remote_tensorrt_configure = repository_rule(
     remotable = True,
     attrs = {
         "environ": attr.string_dict(),
+        "_find_cuda_config": attr.label(default = "@local_xla//third_party/gpus:find_cuda_config.py"),
     },
 )
 
 tensorrt_configure = repository_rule(
     implementation = _tensorrt_configure_impl,
     environ = _ENVIRONS + [_TF_TENSORRT_CONFIG_REPO],
+    attrs = {
+        "_find_cuda_config": attr.label(default = "@local_xla//third_party/gpus:find_cuda_config.py"),
+    },
 )
 """Detects and configures the local CUDA toolchain.
 
diff --git a/third_party/xla/third_party/tsl/opensource_only.files b/third_party/xla/third_party/tsl/opensource_only.files
index 4a5a1e2a7c9f79..f1484fce0756fe 100644
--- a/third_party/xla/third_party/tsl/opensource_only.files
+++ b/third_party/xla/third_party/tsl/opensource_only.files
@@ -45,7 +45,6 @@ third_party/gpus/cuda/build_defs.bzl.tpl:
 third_party/gpus/cuda/cuda_config.h.tpl:
 third_party/gpus/cuda/cuda_config.py.tpl:
 third_party/gpus/cuda_configure.bzl:
-third_party/gpus/find_cuda_config.py.gz.base64:
 third_party/gpus/find_cuda_config:.py
 third_party/gpus/rocm/BUILD.tpl:
 third_party/gpus/rocm/BUILD:
diff --git a/third_party/xla/third_party/tsl/third_party/gpus/compress_find_cuda_config.py b/third_party/xla/third_party/tsl/third_party/gpus/compress_find_cuda_config.py
deleted file mode 100644
index 4d1e417c96d576..00000000000000
--- a/third_party/xla/third_party/tsl/third_party/gpus/compress_find_cuda_config.py
+++ /dev/null
@@ -1,36 +0,0 @@
-# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Compresses the contents of 'find_cuda_config.py'.
-
-The compressed file is what is actually being used. It works around remote
-config not being able to upload files yet.
-"""
-import base64
-import zlib
-
-
-def main():
-  with open('find_cuda_config.py', 'rb') as f:
-    data = f.read()
-
-  compressed = zlib.compress(data)
-  b64encoded = base64.b64encode(compressed)
-
-  with open('find_cuda_config.py.gz.base64', 'wb') as f:
-    f.write(b64encoded)
-
-
-if __name__ == '__main__':
-  main()
diff --git a/third_party/xla/third_party/tsl/third_party/gpus/compress_find_rocm_config.py b/third_party/xla/third_party/tsl/third_party/gpus/compress_find_rocm_config.py
deleted file mode 100644
index 90615d4b1ea24f..00000000000000
--- a/third_party/xla/third_party/tsl/third_party/gpus/compress_find_rocm_config.py
+++ /dev/null
@@ -1,36 +0,0 @@
-# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Compresses the contents of 'find_rocm_config.py'.
-
-The compressed file is what is actually being used. It works around remote
-config not being able to upload files yet.
-"""
-import base64
-import zlib
-
-
-def main():
-  with open('find_rocm_config.py', 'rb') as f:
-    data = f.read()
-
-  compressed = zlib.compress(data)
-  b64encoded = base64.b64encode(compressed)
-
-  with open('find_rocm_config.py.gz.base64', 'wb') as f:
-    f.write(b64encoded)
-
-
-if __name__ == '__main__':
-  main()
diff --git a/third_party/xla/third_party/tsl/third_party/gpus/cuda_configure.bzl b/third_party/xla/third_party/tsl/third_party/gpus/cuda_configure.bzl
index 4239915405ab5d..16042405380e66 100644
--- a/third_party/xla/third_party/tsl/third_party/gpus/cuda_configure.bzl
+++ b/third_party/xla/third_party/tsl/third_party/gpus/cuda_configure.bzl
@@ -641,42 +641,19 @@ def _cudart_static_linkopt(cpu_value):
     """Returns additional platform-specific linkopts for cudart."""
     return "" if cpu_value == "Darwin" else "\"-lrt\","
 
-def _exec_find_cuda_config(repository_ctx, script_path, cuda_libraries):
-    python_bin = get_python_bin(repository_ctx)
-
-    # If used with remote execution then repository_ctx.execute() can't
-    # access files from the source tree. A trick is to read the contents
-    # of the file in Starlark and embed them as part of the command. In
-    # this case the trick is not sufficient as the find_cuda_config.py
-    # script has more than 8192 characters. 8192 is the command length
-    # limit of cmd.exe on Windows. Thus we additionally need to compress
-    # the contents locally and decompress them as part of the execute().
-    compressed_contents = repository_ctx.read(script_path)
-    decompress_and_execute_cmd = (
-        "from zlib import decompress;" +
-        "from base64 import b64decode;" +
-        "from os import system;" +
-        "script = decompress(b64decode('%s'));" % compressed_contents +
-        "f = open('script.py', 'wb');" +
-        "f.write(script);" +
-        "f.close();" +
-        "system('\"%s\" script.py %s');" % (python_bin, " ".join(cuda_libraries))
-    )
-
-    return execute(repository_ctx, [python_bin, "-c", decompress_and_execute_cmd])
-
 # TODO(csigg): Only call once instead of from here, tensorrt_configure.bzl,
 # and nccl_configure.bzl.
-def find_cuda_config(repository_ctx, script_path, cuda_libraries):
+def find_cuda_config(repository_ctx, cuda_libraries):
     """Returns CUDA config dictionary from running find_cuda_config.py"""
-    exec_result = _exec_find_cuda_config(repository_ctx, script_path, cuda_libraries)
+    python_bin = get_python_bin(repository_ctx)
+    exec_result = execute(repository_ctx, [python_bin, repository_ctx.attr._find_cuda_config] + cuda_libraries)
     if exec_result.return_code:
         auto_configure_fail("Failed to run find_cuda_config.py: %s" % err_out(exec_result))
 
     # Parse the dict from stdout.
     return dict([tuple(x.split(": ")) for x in exec_result.stdout.splitlines()])
 
-def _get_cuda_config(repository_ctx, find_cuda_config_script):
+def _get_cuda_config(repository_ctx):
     """Detects and returns information about the CUDA installation on the system.
 
       Args:
@@ -692,7 +669,7 @@ def _get_cuda_config(repository_ctx, find_cuda_config_script):
           compute_capabilities: A list of the system's CUDA compute capabilities.
           cpu_value: The name of the host operating system.
       """
-    config = find_cuda_config(repository_ctx, find_cuda_config_script, ["cuda", "cudnn"])
+    config = find_cuda_config(repository_ctx, ["cuda", "cudnn"])
     cpu_value = get_cpu_value(repository_ctx)
     toolkit_path = config["cuda_toolkit_path"]
 
@@ -1008,9 +985,8 @@ def _create_local_cuda_repository(repository_ctx):
         "cuda:cuda_config.py",
     ]}
     tpl_paths["cuda:BUILD"] = _tpl_path(repository_ctx, "cuda:BUILD.windows" if is_windows(repository_ctx) else "cuda:BUILD")
-    find_cuda_config_script = repository_ctx.path(Label("@local_tsl//third_party/gpus:find_cuda_config.py.gz.base64"))
 
-    cuda_config = _get_cuda_config(repository_ctx, find_cuda_config_script)
+    cuda_config = _get_cuda_config(repository_ctx)
 
     cuda_include_path = cuda_config.config["cuda_include_dir"]
     cublas_include_path = cuda_config.config["cublas_include_dir"]
@@ -1484,12 +1460,20 @@ remote_cuda_configure = repository_rule(
     remotable = True,
     attrs = {
         "environ": attr.string_dict(),
+        "_find_cuda_config": attr.label(
+            default = Label("@local_tsl//third_party/gpus:find_cuda_config.py"),
+        ),
     },
 )
 
 cuda_configure = repository_rule(
     implementation = _cuda_autoconf_impl,
     environ = _ENVIRONS + [_TF_CUDA_CONFIG_REPO],
+    attrs = {
+        "_find_cuda_config": attr.label(
+            default = Label("@local_tsl//third_party/gpus:find_cuda_config.py"),
+        ),
+    },
 )
 """Detects and configures the local CUDA toolchain.
 
diff --git a/third_party/xla/third_party/tsl/third_party/gpus/find_cuda_config.py b/third_party/xla/third_party/tsl/third_party/gpus/find_cuda_config.py
index b7ae4a8d71f8ec..4a7781cb2b5a2c 100644
--- a/third_party/xla/third_party/tsl/third_party/gpus/find_cuda_config.py
+++ b/third_party/xla/third_party/tsl/third_party/gpus/find_cuda_config.py
@@ -53,12 +53,6 @@
 tf_<library>_library_dir: ...
 """
 
-# You can use the following command to regenerate the base64 version of this
-# script:
-# cat third_party/tensorflow/third_party/gpus/find_cuda_config.oss.py |
-#   pigz -z | base64 -w0 >
-#   third_party/tensorflow/third_party/gpus/find_cuda_config.py.gz.base64.oss
-
 import io
 import os
 import glob
diff --git a/third_party/xla/third_party/tsl/third_party/gpus/find_cuda_config.py.gz.base64 b/third_party/xla/third_party/tsl/third_party/gpus/find_cuda_config.py.gz.base64
deleted file mode 100644
index 37525142385f6d..00000000000000
--- a/third_party/xla/third_party/tsl/third_party/gpus/find_cuda_config.py.gz.base64
+++ /dev/null
@@ -1 +0,0 @@
-eF7dPGtz4zaS3/UrcJxzmfLIlJ1NpfZ061w59szFuz57ytZMbsvWamESkpmhSB5Jydbs7n+/7gZAgiApv+JkKq7KRCTRje5GP/F6w46SdJ2F89uCfbO3/x9sfCvYWMR5kr2Pkjt2uCxukyz32GEUsQtslrMLkYtsJQKv96b3hp2GPjQXAVvGgchYAfCHKffhf+rLgH0SWR4mMfvG22MuNnDUJ6f/n4BhnSzZgq9ZnBRsmQtAEeZsFkaCiXtfpAULY+YnizQKeewLdhcWt9SNQgJksL8qFMlNwaE1h/YpPM3MdowXRDD+3RZFOhoO7+7uPE7Eekk2H0ayYT48PTl6d3b5bhcIJpCPcSTynGXi/5ZhBqzerBlPgR6f3wCVEb9jScb4PBPwrUiQ3rssLMJ4PmB5MivueCYASxDmRRbeLIuasDR1wLPZAMTFY+YcXrKTS4f9cHh5cjkAHD+djH88/zhmPx1eXByejU/eXbLzC3Z0fnZ8Mj45P4On9+zw7K/sLydnxwMmQFTQjbhPM6QfiAxRjDR07FKIGgGzRBKUp8IPZ6EPfMXzJZ8LNk9WIouBHZaKbBHmOJg5kBcAlihchAUv6E2DKezm4Bf96zmO8yELY1DDo4/Hh9D9TcazNRLDbgXH/gMYIr9IslAQjWwltQ9UKgECUbDE5TovxMLr9VDhcz8LQc9ywTPQhZxE0YUeFTOvYxnAiKPUirwHLxeoAoEoUFQxiTjMNBGEKJX0I7yfxLNwvsxIgAiXF0GyLDyiKuXFbS71ibATMEKVeliyBgqmxw1V8DZLlvNbJuJVmCXxQsRFb8WzELUVTPlkBqbGVjwKA4uAUAlpIJmTUtHkEnEiy2jgM1EsM1ICBq9AXH4SCCXNCNQYbU8KD4cBYGchEA/4Kyo5kj1fInVA1OUyTZMMNb8CQ7OhYXDD2I+WAbzylz+cHl72B/Dj+OxswM6Ojk4HJBjptC7G9SEt+GdEVNJ0w0HTTQ2p6AGjRq7H76fY5/TD4fjHy54hQqZFiJSDP1rw3VykHEQHwPMouaFOPGb0HiXJZ6lNUnnyHlKqlUpqErmrW54FuyjCAJSQCM2XNyaZsyxZIHlAPbFAuuH1YChr9KKE0Y2WXIFs2PllZdOBmPFlVGA7cLXBqNdjYKzx8n4EXnG4zLNhlPg8GvrLgA/ohZSu1EWiYjsKpNaw3XTbA/ifYGiTu3zESjpYm9gG4H96rPo7Gl1ff8iSecYX7D2K4vr67NPJ8ckh++8PHyEsLdIlulA2TpLoc1hcXyP66+udXu89CPSG+5/BrwY0FEBdeBNGYbFGh7sQpg5FeUJRhUdgkTE0XEkR9hriTaSzbCEdRolCybpuaB6IjtSqp4Z0t5RzKxYlbTn6rAyL0LbSLXYYBCEaI48aygrQuy/8AxRkUfSnVefTu4tLCB56XPDV+Pz89C8nYxpNgJFmV8LgQw2qHQYs1Ojn7KzekXx1cnY5Pjw91UBozyVx+GARR69MGOkCpj8eX2gU2hEQijHE8fOLi7GBpnxV67r3SfvSdgXwwdPdCJ1SgEmDIdxvY0jdvvfW2+ji0C0L0AfeK9VAOWiIxIu0WGPrZWz46YRxnzIcHq91W+W/IAqA/mN/XAUu7TbAdA0qRr1iNv2T+vb9VCEZMSDK+1L/Jn3QFBRqxDzPq39UP6qvEGh7KrNC3mVihn1GkBeiXZILjAPJ+VzEAj0htUG9/e7bMuYRtWEOyKRbHMEvnxf4Mgum4EGL9bCgQZsB6qH5ep4u8yGGjik6pKn0O16S5x6kd/+kZC4N51/Y7hf2T93t7t0e+54+PbuDdO3Nv3gSH/bW60HeBNGJhYn+leT6F7p+/TuNeIGjop8hgKlf4M/TLPEhDSvfrHMUcLqOIL6OMPlDTTuY78rvu+DHd3mxWyRpr8jWI9Bs8lL5LbjFiCkkd7ehf9tTefIJvXuH8bhsjjklAuRenvK7WMMRy+Je+MtChzSJqiJIxJ309Hp+xCGhPCJ
xUYfuO6IBxruPfaccpdaDcMOmYQ7aBSHGpS8ycygl5ckUyu1Dooh1AbRzDMA7GVseAaqikAm84H7yGNBjnkE/JeSCF5gFaltyuV8seTQtbVmnX/oNdQDmcnQr/M8oR0GBhAKRtoGFECrlU9Ay7empgIjVlYjkS23ddjcsVuUFuCHM59E/hZSGKiR1Okvc+NfAZTXGFjnkBQbIU0KK/ttn9t++V383zpbCBPC+sQGs5/cQwB+A8P6wEaL5Z/dBROmRyOb5qIS25Tai7E4LrZE7gpPMvBK4LuI6KNYHPiaOOrGTYRky/YJHEWXjGtGFTLVH7CelWNha4q7US6psvTpIYuFJzYR/IYGxhhziyBk0kcwq+5CygzdQHkIpmgKL3DfSz7LiQMQWvgNbAbGYTd1+r0X9Dhqvaq2JlgY2cNg5Fv9uwwC15fJiGgmeF08zXSXLA3YFfs9d9SlhX1FibpEAtXPhOp7TnxhMtQE2ubNADQ7Z95U0NCNzUehwrVnBDHzAYr4Q2t8oxUBnoEILdA/gUHVSWXFklqyeVAOkMMKyFL6HiZekQmN2MgfSlBiKEAjuB86ymO3+0el7GaBAAOVHmVQ1GkCPfrqZc53vvIH/ZNdsK7/O37rXwdu+w7aI4AF12Sdw0EMC0zamREHvvDnUram7byiBU3plH4Zf5CGPp8BssPQLdxZmeQEJv4CoHdhCASuiogXTlJswVjMU4FoJimoaCagEo/q7SiC1ADDv5ySM3Rlgl6M6Q4FJUHwk+Ur4iTlmUaBSiCkVTG4bVY26o15QleaKdZwVOysyUYs0GPUFI0IB3HX0a6eP+aYzzIH9YfkS4FRqeWDkJJ6PsWsqv7hXNcygGrupM+lTVC+wiJKjj3UXKBYo9Q47+J653k7fkSOHkQTRi4Ls2dQ52YM0h5paqRTH1DDVm1IzbO0FAotk1+G5H4ZOX6qUyn4+xiF+PKYmZRJkIqwiQ1MLkWaPB4GrFQCGCXXXrWtm39DNnOYrXKz6XYmg3zeVQdXaMrOU+kA/Td+jxtl8L2ky3wDpzg6OnIhUe3AkNCv6EFjt8S2AARrZayO5qnSrDGAgClUMecCPW4upTlnsOwPrw/Pq+hX4jWvwGDVkNXnRF8N9XjnWhMXuVo4uxwQC5bUaOeqVSbbVJo6dCUir3aL1CCv33G7nzfmcTERy7qGc1cPJIWNuSE4HWd5IkeiUxDpqJqzxQvFmv93ZJfexO4+X1UdxX2Q8Hx59/DA+GW5EKNtU3yoxDjuRlA5Rl5S/gIhqU2YPyghaf/dt7bH20C4U/HBvgtUY3GmVgd1Rh3SqZqVswIinVNZPafrUxVpTCmpQykE/I9PKF5IQq7boGBwZqq6c61ir8/ZWvo2GQHEBxUc/wtJjVQj6E+nPzA6firQOPTEcpFkeahkdJcsoIB9GM8I48bElJ0HhlzHpuR6BP4iTGf2vcguuIYympAzGSjulShehniRjk8GW1KMblfKllDPgEhbA4wSBh/+4teRCxlaz476VFOE3lCYPc/F8lakJQhlSDVjbqUzVuqpbbbcyCZKrByQhsFNddoi8msBfV5lMS6wxCAR9MymgMIUmq+OdVcnbsM5WvuMFa4QA7cQUW+suvXpbQw5ufWNlWP/bkMVfjfYnlAsISClaydpIB8SgPHEeoOah7lFSO4Yn7FB12w9v0A7VkXiaweSDbi5sHgYMg+oTVItLMypuedGocqW6aemU6lasU2H6CUjh0d+AJV5hrjZgxTKNxKQxauj5rsznyS/oCLRmhHGtS23xL/MXLbW4IWaSRb9sGj53iqsa09I7DayOu72Vgt+gUjpCgG2o6qs2hKDsUgEwd9R86qDUOSNAai0TrNpYqeyt3eU9Q0lVDvdoNa2Z7Aazs9LMOuEPuLLNbNUkZEx+Wyrd0APoFOE6ZiiUolR6iDMj3dMZjrn+pOs5VePU6ppSYjhlZVYrzlbgbQXk+nWnwyHb39vbG5RUbNGzfN9HBohdQ7K
lWCu6OzXHQWDv1nlI+o8akWfgaMqy32vUfRY77A2uB+FUiWD8JlkJr5SCGRxKttuyBOI7K5yBVZQZ+hCvfL9NG4xpA+cIgJmcOtCbHpJIOgOBy53XwdtrD/4ZsE+u/k3/9J3Sn+rJhM75Cz1tsauIcSb9limHlrmsMrPsnm1oTh50T2LVtRZekIQoy4EMAR88cS+cRo5EWYVs4GgoNf9niLgcrg3+o6U6ugnj1pJFv58MKjIHVjFtj3JfMrVa1PWnmwxsi/VQIFahbxSdOZSDdg2LFVm8CoOQ71J1X8gJgzZwg42WHoAjp3yz4+3veTe+oxxBWoSmJ+hmoeGJHQIGT9AvET3FmKC90+JgS8PESUu5HguY7EmpUiP6Wv7G+m1rezVEfTm5/7+nh7prGaeUbMuIRsVZWtxzuYeqlJ5H4OPz43PXz8P5vD+ipT60xpsEQM3JTVzMAJl5miPVR1ll6pxAExsnGe7kuK2nPZYwMDuABLg/eBSwLRqCHpYI+jJhbFJ3tTdh/3bQ9mFf5Y0yzTErXOckhgiaQ4aJK3O0nGNKdYR17ipnNEfV4feb/dF4KSfyD63tpk06o7qJ1tqoaRnkHNrZSmFHwX4d2BBbF7Cp7xZ0NWKaQmMMTW9g9dMYsQorWthDHFnm3LfAH+LJNmKLKXNoNFu23iDEv6zk6ibi+WPSKyu2jvRcbWNdy5riBI+278jmj8nOzPzMnNV9YNnJaIlRmMIY2L3rVBuCpv9z+OfzC4cSu+rdyRm+68p1zKYfDsdHP1aRVud4upY2nSRjz0/gaDx4Gj4njXtxDteewKFbVbuslrlyygv+c1JtHU3iaC3XhBUDXemeOU+wN+nVJine4Cp0JuQ+wRJ1GM+giFlxyMlwM8iNgPFVK9KoWgP28zJXE3a0BwjXE80haF9wsAfpabHVGCNLPsYCNm1TRT3kaj+w/uZCvtvvkJY9giihp0RuREdpsIm3w1GbTcBj1OU1sNptdm4dbo0gN/u1hkez/VOeREDSq3qofW/vN/VQl+en6GAsH2W+fdBLGY1/JT9VjstiAWb9NTmrkrbnOKFX9Bq2xDqpfakTkAjJDdRxdziCeqNNrkC1fJYzULAvdAcZj4PftTO4ODw7tl1B9e5BR1A2/ZXcAI7HV2b9pCJfme0rMXWQ+FKDR3Rk7ibeDmM3m2wydWr3LEMnyBea+WxW/K6t/P37sW3k5Stp4+arX8mWQehfmSmjGnxlliyF1E7gS+0YsJEZG1g7rNhoscmIsdmzbBgBX2jCQRy/zrJNtQnssfZnWx8e7jEsjx4rq8NHsrjTd5/encpdsd1Gh/5G/caZOZod11PqzzPFp5mP65Co0XCZ/FkaCKjpE5G90JQ7138qoh5hxk9a9oljuepT9dA1LVm1kNNj1fOg3upZBoOAL610U54Bc7/nsHf54fDi8l2j0jXePlzpVo1/rUqXxuUri4xKWb6y4FiKqpPMF9e2hFDWtjXcXbVtrdHG2la2fF5tK2Ff5ABi349eJ2BWtvu8gEmnWEt7lU86XNJTzRA3meFzjRBl83QDfJVoRsP0GsEMEYNETfytOm02wEUp43FQa/MMTSa4F2mxPMuaPap6+w00+VN16rpSaPPlpvhTa1nT+Tfs74qIv+PUP2fqzDF0ntz8LHw8CpSwO7pUAxoWLCzk3iy6+0GtaSzzMJ4rdPDdXSzDIkxxg2G4EDm0ipI7tW5QSYROeNRig9qixKPIreSsDaLc8YGdfJKf5bUM6nKNBf8sGnvwmL5Sg06peK9m5merk3gmsk9l/vrbGzxgKFX6kVb/JKNfhcgxqKHdS6vt242641nZ8hleoIR9ridApZzi+bGpiFcu/Kc27KhTRwdXE3vnon1nCdkFnUBru/Kg2k+rcdP5rfJIUO3MUPX6Sree6BEbOMaRAEVd7cycmHNfcvgoNrjcpFI7fAtGTljaOcFRbt42MWi7TKL1cgg6oFIkgGVbHhTCoyJ//G763bfVQZJ
tdkcnG9JMrMJkmUdruV5Ku2Ka1wj1mLrChR3TSWBW3CZ4SQk0UJKirrYH8laSu1Be0IQ3nqBTs0a/Xzvnu2m8Wk5v/s0dXv1tyCY7/bfE2fXd24qt4X/9O9hN2/BuPM55Vd8KZxzaekhv+9UJmWzBo/CLtOMqVBmaUDaRO3EH8mowLaY84vmtvKpJ3Q2gVFp5DG1qmeBR1UWv68SCglIgacR94TrX13SczNRwbKB4aGytbapyEPp02UtGt5VtunNK6Ytkobrf5oBd8Wy+8iBugaeXh1XxBe2PXOce/r7aH00mvcYmUet4n2NdB4OcEWO180bW+Dm164cejCWPOhfZ6JP2c9Z24hufYbT0SIp7oC2X4ziRrl0dRf3Hv+S40jYeOjpZStA4N9nFY+N2G6d21Eg5QjpBukwD8K1uc1d11YE1saBKZ2tsJLqr+laviWpJS/6a2IoQbZB0vMG1j6SbmIxg2md/Yu7+3oDtlxnMG/ZDYwOI2npxx3N154TcdlGe1sINGMY+Qn13gUVrJQSTkwc00rxsqNRJW+BVztqy5cqkwt630TUaaiX1F5UyyHGvlHKji4ZwGmvWreKpdiU8TUD1PR91apqr2l1ionWo1xRSrYOGiKw1vlYB6dXap4nHXAM3qbBXALsEg5P7rykXE39DLPUVk1apqGWvjUJpWSg0OrYWUToNSU7bvKoh1btoGpI9QdZuSOWk5xMNqTalXKemOYXWJiYVmOK4IzLFcRWa7IzZaSa2LdHJXh5oD/3VDW0PqoSx8GQQaK8SlNzRvEsLczQX0s1bI0FvYc2aK2rhzLxG7gHGzPnBijZrwqjkStdwbZyV9V03d621RguHLbVxC5f2TXcPcGrPIdXpbSmViWs0z88DfccNYQ0LsSiTZJDKZ6AskPf0OFTU0jUktbfYhWNMmFCq83mCQrKy/pVZoasrqmRuveBhLHstbw4h4sR6gIfzlsI4797MwzXVJRF0yanrbOUjuUGfuRUmec5XXjJibPjHlEfN9mOiLS9H9fDiX+HmReYKPLO4fR1v98s2kKAWeDKo18Magwqf6ZSuIJtOkaHp1EGEkrfe/wMY3XD1
\ No newline at end of file
diff --git a/third_party/xla/third_party/tsl/third_party/gpus/find_rocm_config.py.gz.base64 b/third_party/xla/third_party/tsl/third_party/gpus/find_rocm_config.py.gz.base64
deleted file mode 100644
index fee67ec553d9da..00000000000000
--- a/third_party/xla/third_party/tsl/third_party/gpus/find_rocm_config.py.gz.base64
+++ /dev/null
@@ -1 +0,0 @@
-eJztW21v2zgS/q5fQSgoam9dJSlugUMOOcCbZlHftUlhZ7tYtIVB27TNrSz6SCppUPS/7wxJyRIt2YqjvX6JgTa2NPNwOC8Ph7Z4RC7E+l7yxVKTVyevTsjNkpEblighf43FHemneimkikg/jskQxRQZMsXkLZtFwVFwRN7yKYizGUmTGZNEg35/Tafwx93pkQ9MKi4S8io6IR0UCN2tsPsvQLgXKVnRe5IITVLFAIIrMucxI+zrlK014QmZitU65jSZMnLH9dIM40DADPKHgxATTUGagvwaPs2LcoRqYzC+llqvz46P7+7uImqMjYRcHMdWUB2/HVxcXo0uX4LBRuW3JGZKEcn+l3IJU53cE7oGe6Z0AlbG9I4ISehCMrinBdp7J7nmyaJHlJjrOyoZoMy40pJPUl1yVmYdzLkoAO6iCQn7IzIYheSX/mgw6gHG74ObN9e/3ZDf+8Nh/+pmcDki10NycX31enAzuL6CT7+S/tUf5L+Dq9c9wsBVMAz7upZoPxjJ0Y0mdGTEWMmAubAGqTWb8jmfwrySRUoXjCzELZMJTIesmVxxhcFUYN4MUGK+4ppqc2VrUjjMeauvIAzD95InmIbXFysYfiKpvEdjyJJRHH8GIZpqITkzNpJbm32QUgIMRMeaWd4rzVZREGDCq6nkkGeKUQm5oIwr6uAxMVUZpQcRR69pFcDFFabAjGl0VWJczGVmhAFaW/tRfyqSOV+k0jgQ9ZSeiVRHxqo1xUQXGThmiIsNptlSinSxxCRhyS2XIlmxRJNbKrlJyg7Y/278vn/zphsFgzkUF9yL+cwbkju39Ox0rB8yA405TEoTasl0Kk3YCVwCB03FjJX9p+kXZueVxeC+YDEUDd7K7aq0OyrixUJ8scGwvrfxzGJiA2GqfUnl7CXaM4MYaqj7QKWTYh7MpViRCVXOqY4YNrbl9kYEfLUxEdwDrBTkgsZNUJbHYq2PpZiuQhRJkf4o2KIh7nOaxjifOGUBZmsQQM0JCeET2TuhsnfAC+4dZFIQBNOYQp1emBBdopc7l4YCIVTds4CA9QrFYBQyXjA9dsON0ZQxTq1jxGysimYWlYwwJJWmcVxQAltfZ1lrPZ2F3IVtRZySTR3UjHCCMJ6PSM5r7QNxPidh7uMQQyhU5LIBLakG3Mh8LGh/BvmjOnljomQ0NkNvCXU3rtq6V3QYLj5Ccc3GrozHSbqaMNlZ0T+F7BHwGP4Btemy6P/TE3iRn4gRIy/wM35CafhkxIvD2CTPxuigFT2S0BXLwjN08QA2X4PBQOkABOomYODFiyJHudBg9cTIQ3Cfi0isWYYcyhBWiATKBpj9PEz1/OU/w671/wptAx9KFpm3HRke2YHIM0VedD7NXnRD8sxY1zP4XaMHkTXyFoXYMgAcczFaAGWtO6ddd9N5CbimY+S6AbqOcqjSYgU8z0YOv3035WZXtU/J8wgmB8gd4yPywsGWXyExfQUugMgz4AXTWXz7DuvmpyTMIEw61ENwoEM7F8sIyIJgN1sw9C4UDY3DbhZLMHZmM95ybUXineFUUdjcKmeV6jgRHBvyTnHgxVzGst45+ZibGvJkGqczZgodWFDad5lGtIQwQ4mYAv45elWtVyN9mi1YQhq9z+b/oi1gypVImLmOuTZHF1dbnSdF4aJfrH8KnuXnPMsTSKrsNvsKLZLqbCF0z/JpecZtieaCEyCGL1nWYkyLohnedjoWUiQ0XsqWdjMg4tg+A9wA+Zrll5dY1Q7qmgogji/OK1mhqNBzNPrhcjiC5m/8rv+f62Fo3WZZ5gCMwVWOsXYs8FAMIOaLNw7DFXkFU+JcKy4j61QVRQV7ZyuPLTNQ/ObcHFYghGcHkDngfQ+8VcKOVir2JV/LNNF8xRqUfEHYs+DAugdA/Le/4FGwmdJT3e+s+zeD92RoY/g3lP8RbMJhmSms5dDg2sb7K246oc3MlkT0PBiTVV6P3C0F2G27T4sFSJ2CiCUJaEOwGXlBSne
w9LtODXoNyfit3SJmDYwlFdsavpzSeJpCLwjWMQkcoLjZnUPzL0zTbHHuKKy6UHe4XAoDNkkVNiwKu/A1hfnDvhK6f0gH3KeqhzLg1tweToBbTihz12mpiTO4GKitoi8SUG2RAw3VE0Blh1okoK0xSzS04tjeNaAgJ/i4vqPEQRbxuMQm7rVhIo9UchCnvAvriZkaMdPzd4Nr8F2ZlEzfnDNT3jUXwbvdh5YdDPT+8uqxleejHNZ8eCiPbT9qqqO6ASnVXJEBKlHCs0KwHt2OlMb2dx+TmKpmGxAj2d4eBOGOcXMkExpnF17ubU42mhapIcITH+zsVJzz2u1SHkgVEKlf3vZHLWxTyjAH71RKMC1sVioLqHa/UihMb8tSgdMuYZRH9xlDQiU1Ywwj2c7mxaEde6h+lZd4wih4ADWKJ0/0sIceZOE3kt30sJMPytnQuBSH/avXWSmWi7AM6GqnkKJe7VSkJLTaNbm6t88uD+UXynyum9UJCra3sAKa+9NoNUXxsvbTEnpgjYDvfugK6qWS0Xz4AuqjoObD108PxWg+bvmsqpLa1XNTex4BbKO0vnZuxva//WvGCE6wJUawaO7PfkZw4mXtJ0Y4hBGs734oI1gTPtiL7w7jgzLGYWxQwnj/SC6oqY9qLihVnffV29/NBaWx/e5ASzoFhzVqEJxsez2CBdy829khOOEt9Sc2eGB/YN32ozfZN8P+xeWwhW22D1TYaB9tsjZ/3kow+5v4kt4yQl01Z75w7XuRVk4e0yzUVExtv1CqRq9lqMRqvWsoWeA3Dgp/AWn4q6GTba99sICbd42aCKe0BfPUShzYSlj3/ehuwlrx6IaiDHNwT1GCaaGtqCmd2s6iVJZec1GJ1Xp/UbLAJw0R3zbqMDayLZKGAdx8PZ9fasYembb3Jf8+lNNdKPuUn74E3OKgjRcfTDxtko6x4PGkU4I5nHSKMG2QTnXp1ZNOsax90qnCap90ihb4+5rGrLORbW9f4yo+f9foG1CPJvYpP7Uq+/Y5BzJGm6wBsRpdv/3Qyj7HBzr4B0UPqIWfFB/EHX5xetuc/wN3+BY47th6Etp/kp2SGZ/iaQI80CHmthKNIQnDIyhuPjyZi31HDSpONQR5tnslVUVeewrg+Sg/LLI5HmKerMn3vwY8f7xmewwbK6bwfAhE6Pvm40f7wKoWIv7CtZEOP2cPwpZOJWQaUbqeUc064wYPm3drtJo8tVqnu+9Rszq9vQ+m7FDc/fv0DsWd3867LCkFwiuVz+Tf5+QfJ6cnJy5Nqp25d5ga8/Z9Ybgjfvu+Pmg+uZ/3TG7f6r9jfvtVC0RiIBx9rCisj6Y0tbw/y1fdL+y+l53GSIgSUrNZZ5tqIqCylep08+XSHGvrhM/UGXmm8PRKZ4Nk7HfHTQv1j6e63PKo7lVkj8NFeLiTdcJPyeVweD08g1L+lBTOkigtOwDYzdWAGDQeewkCiMV4jMdVxmNyfk7C8RjnOB4bNrbTDf4CEAHDYg==
\ No newline at end of file
diff --git a/third_party/xla/third_party/tsl/third_party/gpus/rocm_configure.bzl b/third_party/xla/third_party/tsl/third_party/gpus/rocm_configure.bzl
index b999b023b73edd..807e75ebb51c4b 100644
--- a/third_party/xla/third_party/tsl/third_party/gpus/rocm_configure.bzl
+++ b/third_party/xla/third_party/tsl/third_party/gpus/rocm_configure.bzl
@@ -365,40 +365,17 @@ def _find_libs(repository_ctx, rocm_config, hipfft_or_rocfft, miopen_path, rccl_
     libs_paths.append(("hipblaslt", _rocm_lib_paths(repository_ctx, "hipblaslt", rocm_config.rocm_toolkit_path), True))
     return _select_rocm_lib_paths(repository_ctx, libs_paths, bash_bin)
 
-def _exec_find_rocm_config(repository_ctx, script_path):
-    python_bin = get_python_bin(repository_ctx)
-
-    # If used with remote execution then repository_ctx.execute() can't
-    # access files from the source tree. A trick is to read the contents
-    # of the file in Starlark and embed them as part of the command. In
-    # this case the trick is not sufficient as the find_cuda_config.py
-    # script has more than 8192 characters. 8192 is the command length
-    # limit of cmd.exe on Windows. Thus we additionally need to compress
-    # the contents locally and decompress them as part of the execute().
-    compressed_contents = repository_ctx.read(script_path)
-    decompress_and_execute_cmd = (
-        "from zlib import decompress;" +
-        "from base64 import b64decode;" +
-        "from os import system;" +
-        "script = decompress(b64decode('%s'));" % compressed_contents +
-        "f = open('script.py', 'wb');" +
-        "f.write(script);" +
-        "f.close();" +
-        "system('\"%s\" script.py');" % (python_bin)
-    )
-
-    return execute(repository_ctx, [python_bin, "-c", decompress_and_execute_cmd])
-
-def find_rocm_config(repository_ctx, script_path):
+def find_rocm_config(repository_ctx):
     """Returns ROCm config dictionary from running find_rocm_config.py"""
-    exec_result = _exec_find_rocm_config(repository_ctx, script_path)
+    python_bin = get_python_bin(repository_ctx)
+    exec_result = execute(repository_ctx, [python_bin, repository_ctx.attr._find_rocm_config])
     if exec_result.return_code:
         auto_configure_fail("Failed to run find_rocm_config.py: %s" % err_out(exec_result))
 
     # Parse the dict from stdout.
     return dict([tuple(x.split(": ")) for x in exec_result.stdout.splitlines()])
 
-def _get_rocm_config(repository_ctx, bash_bin, find_rocm_config_script):
+def _get_rocm_config(repository_ctx, bash_bin):
     """Detects and returns information about the ROCm installation on the system.
 
     Args:
@@ -413,7 +390,7 @@ def _get_rocm_config(repository_ctx, bash_bin, find_rocm_config_script):
         miopen_version_number: The version of MIOpen on the system.
         hipruntime_version_number: The version of HIP Runtime on the system.
     """
-    config = find_rocm_config(repository_ctx, find_rocm_config_script)
+    config = find_rocm_config(repository_ctx)
     rocm_toolkit_path = config["rocm_toolkit_path"]
     rocm_version_number = config["rocm_version_number"]
     miopen_version_number = config["miopen_version_number"]
@@ -565,10 +542,8 @@ def _create_local_rocm_repository(repository_ctx):
         "rocm:rocm_config.h",
     ]}
 
-    find_rocm_config_script = repository_ctx.path(Label("@local_tsl//third_party/gpus:find_rocm_config.py.gz.base64"))
-
     bash_bin = get_bash_bin(repository_ctx)
-    rocm_config = _get_rocm_config(repository_ctx, bash_bin, find_rocm_config_script)
+    rocm_config = _get_rocm_config(repository_ctx, bash_bin)
 
     # For ROCm 4.1 and above use hipfft, older ROCm versions use rocfft
     rocm_version_number = int(rocm_config.rocm_version_number)
@@ -849,12 +824,20 @@ remote_rocm_configure = repository_rule(
     remotable = True,
     attrs = {
         "environ": attr.string_dict(),
+        "_find_rocm_config": attr.label(
+            default = Label("@local_tsl//third_party/gpus:find_rocm_config.py"),
+        ),
     },
 )
 
 rocm_configure = repository_rule(
     implementation = _rocm_autoconf_impl,
     environ = _ENVIRONS + [_TF_ROCM_CONFIG_REPO],
+    attrs = {
+        "_find_rocm_config": attr.label(
+            default = Label("@local_tsl//third_party/gpus:find_rocm_config.py"),
+        ),
+    },
 )
 """Detects and configures the local ROCm toolchain.
 
diff --git a/third_party/xla/third_party/tsl/third_party/nccl/nccl_configure.bzl b/third_party/xla/third_party/tsl/third_party/nccl/nccl_configure.bzl
index 5ba212f002e85e..844defb52d74ca 100644
--- a/third_party/xla/third_party/tsl/third_party/nccl/nccl_configure.bzl
+++ b/third_party/xla/third_party/tsl/third_party/nccl/nccl_configure.bzl
@@ -90,17 +90,11 @@ def _label(file):
     return Label("//third_party/nccl:{}".format(file))
 
 def _create_local_nccl_repository(repository_ctx):
-    # Resolve all labels before doing any real work. Resolving causes the
-    # function to be restarted with all previous state being lost. This
-    # can easily lead to a O(n^2) runtime in the number of labels.
-    # See https://github.com/tensorflow/tensorflow/commit/62bd3534525a036f07d9851b3199d68212904778
-    find_cuda_config_path = repository_ctx.path(Label("@local_tsl//third_party/gpus:find_cuda_config.py.gz.base64"))
-
     nccl_version = get_host_environ(repository_ctx, _TF_NCCL_VERSION, "")
     if nccl_version:
         nccl_version = nccl_version.split(".")[0]
 
-    cuda_config = find_cuda_config(repository_ctx, find_cuda_config_path, ["cuda"])
+    cuda_config = find_cuda_config(repository_ctx, ["cuda"])
     cuda_version = cuda_config["cuda_version"].split(".")
 
     if nccl_version == "":
@@ -120,7 +114,7 @@ def _create_local_nccl_repository(repository_ctx):
         )
     else:
         # Create target for locally installed NCCL.
-        config = find_cuda_config(repository_ctx, find_cuda_config_path, ["nccl"])
+        config = find_cuda_config(repository_ctx, ["nccl"])
         config_wrap = {
             "%{nccl_version}": config["nccl_version"],
             "%{nccl_header_dir}": config["nccl_include_dir"],
@@ -170,12 +164,20 @@ remote_nccl_configure = repository_rule(
     remotable = True,
     attrs = {
         "environ": attr.string_dict(),
+        "_find_cuda_config": attr.label(
+            default = Label("@local_tsl//third_party/gpus:find_cuda_config.py"),
+        ),
     },
 )
 
 nccl_configure = repository_rule(
     implementation = _nccl_autoconf_impl,
     environ = _ENVIRONS,
+    attrs = {
+        "_find_cuda_config": attr.label(
+            default = Label("@local_tsl//third_party/gpus:find_cuda_config.py"),
+        ),
+    },
 )
 """Detects and configures the NCCL configuration.
 
diff --git a/third_party/xla/third_party/tsl/third_party/tensorrt/tensorrt_configure.bzl b/third_party/xla/third_party/tsl/third_party/tensorrt/tensorrt_configure.bzl
index cce05b824321a5..91b214fd990866 100644
--- a/third_party/xla/third_party/tsl/third_party/tensorrt/tensorrt_configure.bzl
+++ b/third_party/xla/third_party/tsl/third_party/tensorrt/tensorrt_configure.bzl
@@ -148,11 +148,6 @@ def _get_tensorrt_full_version(repository_ctx):
     return get_host_environ(repository_ctx, _TF_TENSORRT_VERSION, None)
 
 def _create_local_tensorrt_repository(repository_ctx):
-    # Resolve all labels before doing any real work. Resolving causes the
-    # function to be restarted with all previous state being lost. This
-    # can easily lead to a O(n^2) runtime in the number of labels.
-    # See https://github.com/tensorflow/tensorflow/commit/62bd3534525a036f07d9851b3199d68212904778
-    find_cuda_config_path = repository_ctx.path(Label("@local_tsl//third_party/gpus:find_cuda_config.py.gz.base64"))
     tpl_paths = {
         "build_defs.bzl": _tpl_path(repository_ctx, "build_defs.bzl"),
         "BUILD": _tpl_path(repository_ctx, "BUILD"),
@@ -161,7 +156,7 @@ def _create_local_tensorrt_repository(repository_ctx):
         "plugin.BUILD": _tpl_path(repository_ctx, "plugin.BUILD"),
     }
 
-    config = find_cuda_config(repository_ctx, find_cuda_config_path, ["cuda", "tensorrt"])
+    config = find_cuda_config(repository_ctx, ["cuda", "tensorrt"])
     cuda_version = config["cuda_version"]
     cuda_library_path = config["cuda_library_dir"] + "/"
     trt_version = config["tensorrt_version"]
@@ -318,12 +313,16 @@ remote_tensorrt_configure = repository_rule(
     remotable = True,
     attrs = {
         "environ": attr.string_dict(),
+        "_find_cuda_config": attr.label(default = "@local_tsl//third_party/gpus:find_cuda_config.py"),
     },
 )
 
 tensorrt_configure = repository_rule(
     implementation = _tensorrt_configure_impl,
     environ = _ENVIRONS + [_TF_TENSORRT_CONFIG_REPO],
+    attrs = {
+        "_find_cuda_config": attr.label(default = "@local_tsl//third_party/gpus:find_cuda_config.py"),
+    },
 )
 """Detects and configures the local CUDA toolchain.
 

From 435df2af17d6c826743a72637ae6b4b909f09448 Mon Sep 17 00:00:00 2001
From: Eugene Zhulenev <ezhulenev@google.com>
Date: Wed, 27 Sep 2023 23:45:32 -0700
Subject: [PATCH 362/567] [stream_executor] Add support for submitting command
 buffers for execution

https://github.com/openxla/xla/issues/5857

PiperOrigin-RevId: 569080650
---
 third_party/xla/xla/stream_executor/command_buffer.h   |  4 ++++
 .../stream_executor/cuda/cuda_command_buffer_test.cc   | 10 +++++++++-
 .../xla/xla/stream_executor/cuda/cuda_executor.cc      | 10 ++++++++++
 .../xla/xla/stream_executor/gpu/gpu_command_buffer.h   |  8 ++++++++
 third_party/xla/xla/stream_executor/gpu/gpu_executor.h |  4 ++++
 .../xla/xla/stream_executor/stream_executor_internal.h |  6 ++++++
 .../xla/xla/stream_executor/stream_executor_pimpl.cc   |  6 ++++++
 .../xla/xla/stream_executor/stream_executor_pimpl.h    |  3 +++
 8 files changed, 50 insertions(+), 1 deletion(-)

diff --git a/third_party/xla/xla/stream_executor/command_buffer.h b/third_party/xla/xla/stream_executor/command_buffer.h
index dff5e3705ffc3c..e5d5daed064e07 100644
--- a/third_party/xla/xla/stream_executor/command_buffer.h
+++ b/third_party/xla/xla/stream_executor/command_buffer.h
@@ -74,6 +74,10 @@ class CommandBuffer {
                      const ThreadDim& threads, const BlockDim& blocks,
                      Args... args);
 
+  const internal::CommandBufferInterface* implementation() const {
+    return implementation_.get();
+  }
+
  private:
   std::unique_ptr<internal::CommandBufferInterface> implementation_;
 
diff --git a/third_party/xla/xla/stream_executor/cuda/cuda_command_buffer_test.cc b/third_party/xla/xla/stream_executor/cuda/cuda_command_buffer_test.cc
index 0f0b6dd05b3406..4e1304df8d8c17 100644
--- a/third_party/xla/xla/stream_executor/cuda/cuda_command_buffer_test.cc
+++ b/third_party/xla/xla/stream_executor/cuda/cuda_command_buffer_test.cc
@@ -14,6 +14,7 @@ limitations under the License.
 ==============================================================================*/
 
 #include <cstdint>
+#include <vector>
 
 #include <gtest/gtest.h>
 #include "xla/stream_executor/command_buffer.h"
@@ -61,7 +62,14 @@ TEST(CudaCommandBufferTest, LaunchSingleKernel) {
   ASSERT_TRUE(cmd_buffer.Launch(add, ThreadDim(), BlockDim(4), a, b, c).ok());
   ASSERT_TRUE(cmd_buffer.Finalize().ok());
 
-  // TODO(ezhulenev): Execute command buffer and check results.
+  ASSERT_TRUE(executor->Submit(&stream, cmd_buffer).ok());
+
+  // Copy data back to host.
+  std::vector<int32_t> dst(4, 42);
+  stream.ThenMemcpy(dst.data(), c, byte_length);
+
+  std::vector<int32_t> expected = {3, 3, 3, 3};
+  ASSERT_EQ(dst, expected);
 }
 
 }  // namespace stream_executor::cuda
diff --git a/third_party/xla/xla/stream_executor/cuda/cuda_executor.cc b/third_party/xla/xla/stream_executor/cuda/cuda_executor.cc
index cb351c4f3366cd..a3e88e65f2f524 100644
--- a/third_party/xla/xla/stream_executor/cuda/cuda_executor.cc
+++ b/third_party/xla/xla/stream_executor/cuda/cuda_executor.cc
@@ -36,6 +36,7 @@ limitations under the License.
 #include "absl/strings/str_format.h"
 #include "absl/strings/str_split.h"
 #include "absl/strings/string_view.h"
+#include "xla/stream_executor/command_buffer.h"
 #include "xla/stream_executor/cuda/cuda_diagnostics.h"
 #include "xla/stream_executor/cuda/cuda_driver.h"
 #include "xla/stream_executor/cuda/cuda_platform_id.h"
@@ -54,6 +55,7 @@ limitations under the License.
 #include "xla/stream_executor/stream_executor_internal.h"
 #include "tsl/platform/env.h"
 #include "tsl/platform/errors.h"
+#include "tsl/platform/status.h"
 #include "tsl/platform/statusor.h"
 
 // LOG(ERROR) uses a const named ERROR, so a macro with the same name is
@@ -429,6 +431,14 @@ tsl::Status GpuExecutor::Launch(Stream* stream, const ThreadDim& thread_dims,
                                  kernel_params, nullptr /* = extra */);
 }
 
+tsl::Status GpuExecutor::Submit(Stream* stream,
+                                const CommandBuffer& command_buffer) {
+  auto exec = GpuCommandBuffer::Cast(&command_buffer)->executable();
+  VLOG(3) << "Launch command buffer execuable graph " << exec
+          << " on a stream: " << stream->DebugStreamPointers();
+  return GpuDriver::GraphLaunch(exec, AsGpuStreamValue(stream));
+}
+
 // This is a non-essential operation; if there's a failure, proceed without
 // logging an error. It's nearly certain that in case of failures, we'd never
 // get here in the first place; these are very low-impact routines.
diff --git a/third_party/xla/xla/stream_executor/gpu/gpu_command_buffer.h b/third_party/xla/xla/stream_executor/gpu/gpu_command_buffer.h
index 95c87e25f41a8e..9e87adb4fa50d8 100644
--- a/third_party/xla/xla/stream_executor/gpu/gpu_command_buffer.h
+++ b/third_party/xla/xla/stream_executor/gpu/gpu_command_buffer.h
@@ -19,6 +19,7 @@ limitations under the License.
 #include <cstdint>
 #include <type_traits>
 
+#include "xla/stream_executor/command_buffer.h"
 #include "xla/stream_executor/gpu/gpu_executor.h"
 #include "xla/stream_executor/gpu/gpu_types.h"
 #include "xla/stream_executor/kernel.h"
@@ -45,6 +46,8 @@ class GpuCommandBuffer : public internal::CommandBufferInterface {
 
   tsl::Status Finalize() override;
 
+  GpuGraphExecHandle executable() const { return exec_; }
+
   // We track the total number of allocated and alive executable graphs in the
   // process to track the command buffers resource usage. Executable graph
   // allocates resources on a GPU devices (rule of thumb is ~8kb per node), so
@@ -58,6 +61,11 @@ class GpuCommandBuffer : public internal::CommandBufferInterface {
   static int64_t AllocatedExecs();
   static int64_t AliveExecs();
 
+  static const GpuCommandBuffer* Cast(const CommandBuffer* command_buffer) {
+    return static_cast<const GpuCommandBuffer*>(
+        command_buffer->implementation());
+  }
+
  private:
   static_assert(std::is_pointer_v<GpuGraphHandle>,
                 "GpuGraphHandle must be a pointer");
diff --git a/third_party/xla/xla/stream_executor/gpu/gpu_executor.h b/third_party/xla/xla/stream_executor/gpu/gpu_executor.h
index b5279bf657e432..88574f63db3c2c 100644
--- a/third_party/xla/xla/stream_executor/gpu/gpu_executor.h
+++ b/third_party/xla/xla/stream_executor/gpu/gpu_executor.h
@@ -34,6 +34,7 @@ limitations under the License.
 #include "absl/container/flat_hash_map.h"
 #include "absl/functional/any_invocable.h"
 #include "absl/strings/string_view.h"
+#include "xla/stream_executor/command_buffer.h"
 #include "xla/stream_executor/event.h"
 #include "xla/stream_executor/gpu/gpu_kernel.h"
 #include "xla/stream_executor/platform.h"
@@ -117,6 +118,9 @@ class GpuExecutor : public internal::StreamExecutorInterface {
                      const BlockDim& block_dims, const KernelBase& k,
                      const KernelArgsArrayBase& args) override;
 
+  tsl::Status Submit(Stream* stream,
+                     const CommandBuffer& command_buffer) override;
+
   // (supported on CUDA only)
   int CalculateOccupancy(const DeviceDescription& device_description,
                          uint64_t registers_per_thread,
diff --git a/third_party/xla/xla/stream_executor/stream_executor_internal.h b/third_party/xla/xla/stream_executor/stream_executor_internal.h
index 8d1867dab70724..af79f35ddaa395 100644
--- a/third_party/xla/xla/stream_executor/stream_executor_internal.h
+++ b/third_party/xla/xla/stream_executor/stream_executor_internal.h
@@ -32,6 +32,7 @@ limitations under the License.
 #include "absl/functional/any_invocable.h"
 #include "absl/status/status.h"
 #include "xla/stream_executor/allocator_stats.h"
+#include "xla/stream_executor/command_buffer.h"
 #include "xla/stream_executor/device_description.h"
 #include "xla/stream_executor/device_memory.h"
 #include "xla/stream_executor/device_options.h"
@@ -250,6 +251,11 @@ class StreamExecutorInterface {
     return absl::UnimplementedError("Not Implemented");
   }
 
+  virtual tsl::Status Submit(Stream* stream,
+                             const CommandBuffer& command_buffer) {
+    return absl::UnimplementedError("Not Implemented");
+  }
+
   // Releases any state associated with the kernel.
   virtual void UnloadKernel(const KernelBase* kernel) {}
   virtual DeviceMemoryBase Allocate(uint64_t size, int64_t memory_space) = 0;
diff --git a/third_party/xla/xla/stream_executor/stream_executor_pimpl.cc b/third_party/xla/xla/stream_executor/stream_executor_pimpl.cc
index 0e4a7a6e9f0106..ffe9056b2affdf 100644
--- a/third_party/xla/xla/stream_executor/stream_executor_pimpl.cc
+++ b/third_party/xla/xla/stream_executor/stream_executor_pimpl.cc
@@ -31,6 +31,7 @@ limitations under the License.
 #include "absl/strings/str_format.h"
 #include "absl/synchronization/notification.h"
 #include "xla/stream_executor/blas.h"
+#include "xla/stream_executor/command_buffer.h"
 #include "xla/stream_executor/fft.h"
 #include "xla/stream_executor/platform/port.h"
 #include "xla/stream_executor/stream.h"
@@ -439,6 +440,11 @@ tsl::Status StreamExecutor::Launch(Stream* stream, const ThreadDim& thread_dims,
   return implementation_->Launch(stream, thread_dims, block_dims, kernel, args);
 }
 
+tsl::Status StreamExecutor::Submit(Stream* stream,
+                                   const CommandBuffer& command_buffer) {
+  return implementation_->Submit(stream, command_buffer);
+}
+
 tsl::Status StreamExecutor::BlockHostUntilDone(Stream* stream) {
   tsl::Status result;
   SCOPED_TRACE(TraceListener::BlockHostUntilDone, &result, stream);
diff --git a/third_party/xla/xla/stream_executor/stream_executor_pimpl.h b/third_party/xla/xla/stream_executor/stream_executor_pimpl.h
index 1a208a303e22ae..03c8f1aaefc642 100644
--- a/third_party/xla/xla/stream_executor/stream_executor_pimpl.h
+++ b/third_party/xla/xla/stream_executor/stream_executor_pimpl.h
@@ -392,6 +392,9 @@ class StreamExecutor {
                      const BlockDim& block_dims, const KernelBase& kernel,
                      const KernelArgsArrayBase& args);
 
+  // Submits command buffer for execution to the underlying platform driver.
+  tsl::Status Submit(Stream* stream, const CommandBuffer& command_buffer);
+
   // Gets-or-creates (creates with memoization) a FftSupport datatype that can
   // be used to execute FFT routines on the current platform.
   //

From 76b5a6c9ee28d6e944c4736a6c82ce54df16b857 Mon Sep 17 00:00:00 2001
From: sushreebarsa <84765720+sushreebarsa@users.noreply.github.com>
Date: Thu, 28 Sep 2023 12:36:47 +0530
Subject: [PATCH 363/567] Fixed typos in TF document

Corrected several typos in documentation. Please have a look. Thank you!
---
 .../python/compiler/tensorrt/model_tests/model_handler.py       | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/python/compiler/tensorrt/model_tests/model_handler.py b/tensorflow/python/compiler/tensorrt/model_tests/model_handler.py
index 644092a9fd699a..ef5f3f99d7dacb 100644
--- a/tensorflow/python/compiler/tensorrt/model_tests/model_handler.py
+++ b/tensorflow/python/compiler/tensorrt/model_tests/model_handler.py
@@ -80,7 +80,7 @@ def _generate_random_tensor_ops(shape: Sequence[int], dtype: tf_dtypes.DType,
           shape=shape,
           dtype=random_dtype,
           # Limits maximum value as 255 to simulate pixel values, avoid
-          # generating large numbers and casuing overflows.
+          # generating large numbers and causing overflows.
           maxval=min(dtype_max, random_dtype.max, 255)),
       dtype=dtype,
       name=name)

From 6c629becc76bb09f348d316b6c7507af3cc5c157 Mon Sep 17 00:00:00 2001
From: sushreebarsa <84765720+sushreebarsa@users.noreply.github.com>
Date: Thu, 28 Sep 2023 12:39:48 +0530
Subject: [PATCH 364/567] Update result_analyzer.py

---
 .../python/compiler/tensorrt/model_tests/result_analyzer.py     | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/python/compiler/tensorrt/model_tests/result_analyzer.py b/tensorflow/python/compiler/tensorrt/model_tests/result_analyzer.py
index 0b222e8cbc83b3..07fa7a8e86c89e 100644
--- a/tensorflow/python/compiler/tensorrt/model_tests/result_analyzer.py
+++ b/tensorflow/python/compiler/tensorrt/model_tests/result_analyzer.py
@@ -104,7 +104,7 @@ def to_json(self, path: str):
 
 def extract_test_info(
     test_results: model_handler.TestResultCollection) -> DataFrame:
-  """Extracts the test infomation."""
+  """Extracts the test information."""
   column_names = list(
       itertools.chain(model_handler.ModelConfig._fields,
                       ["enable_gpu", "trt_model"],

From 2585893a4732485afa1ec253c8c0d0e576b16419 Mon Sep 17 00:00:00 2001
From: sushreebarsa <84765720+sushreebarsa@users.noreply.github.com>
Date: Thu, 28 Sep 2023 12:40:44 +0530
Subject: [PATCH 365/567] Update result_analyzer.py

---
 .../python/compiler/tensorrt/model_tests/result_analyzer.py     | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/python/compiler/tensorrt/model_tests/result_analyzer.py b/tensorflow/python/compiler/tensorrt/model_tests/result_analyzer.py
index 07fa7a8e86c89e..bac8d338ba2e0b 100644
--- a/tensorflow/python/compiler/tensorrt/model_tests/result_analyzer.py
+++ b/tensorflow/python/compiler/tensorrt/model_tests/result_analyzer.py
@@ -207,7 +207,7 @@ def check_column(df: DataFrame, row: int, name: str,
     row: The row in the DataFrame
     name: The name of the column to be checked.
     fn: The function that takes a value of at the specified column and returns
-      if the value statisfies the check.
+      if the value satisfies the check.
 
   Returns:
     Whether all the values of the specified column satisfies the provided check.

From c60f0baa24ce624fe0dfd9964f4c9372dadb4e9c Mon Sep 17 00:00:00 2001
From: sushreebarsa <84765720+sushreebarsa@users.noreply.github.com>
Date: Thu, 28 Sep 2023 12:41:44 +0530
Subject: [PATCH 366/567] Update shape_output_test.py

---
 tensorflow/python/compiler/tensorrt/test/shape_output_test.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/python/compiler/tensorrt/test/shape_output_test.py b/tensorflow/python/compiler/tensorrt/test/shape_output_test.py
index 8a00d4eb060568..63bc61eab24f5f 100644
--- a/tensorflow/python/compiler/tensorrt/test/shape_output_test.py
+++ b/tensorflow/python/compiler/tensorrt/test/shape_output_test.py
@@ -67,7 +67,7 @@ def ExpectedEnginesToBuild(self, run_params):
       return ["TRTEngineOp_000"]
 
   def ShouldRunTest(self, run_params):
-    # We cannot calibrate without bulding the engine, we turn of INT8 test.
+    # We cannot calibrate without building the engine, we turn of INT8 test.
     return (run_params.dynamic_shape and
             run_params.precision_mode != "INT8", "no calibration dynamic shape")
 

From 2aff7cfae2a73e07f461abfa608e060eef3881e5 Mon Sep 17 00:00:00 2001
From: sushreebarsa <84765720+sushreebarsa@users.noreply.github.com>
Date: Thu, 28 Sep 2023 12:43:32 +0530
Subject: [PATCH 367/567] Update shape_output_test.py

---
 tensorflow/python/compiler/tensorrt/test/shape_output_test.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/python/compiler/tensorrt/test/shape_output_test.py b/tensorflow/python/compiler/tensorrt/test/shape_output_test.py
index 63bc61eab24f5f..1485f5918b0d61 100644
--- a/tensorflow/python/compiler/tensorrt/test/shape_output_test.py
+++ b/tensorflow/python/compiler/tensorrt/test/shape_output_test.py
@@ -237,7 +237,7 @@ def ShouldRunTest(self, run_params):
 class InputProfile(trt_test.TfTrtIntegrationTestBase):
   """The shape profiles has to fit values of shape tensors, but for regular
 
-  tensors the values do not matter. Here we test shape profile managment with
+  tensors the values do not matter. Here we test shape profile management with
   an INT32 input tensor that is not a shape tensor. The extra inputs with
   dim=10 would trigger an error if we mistakenly treat it as shape tensors.
   """

From a913e6810f07969ea9c3f0120ae7b0e3f61d5d17 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 28 Sep 2023 01:04:32 -0700
Subject: [PATCH 368/567] Internal Code Change

PiperOrigin-RevId: 569095712
---
 tensorflow/python/tpu/BUILD | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tensorflow/python/tpu/BUILD b/tensorflow/python/tpu/BUILD
index ce140cc8481c61..f8aac8e7cd4467 100644
--- a/tensorflow/python/tpu/BUILD
+++ b/tensorflow/python/tpu/BUILD
@@ -608,7 +608,6 @@ pytype_strict_library(
     name = "tpu_strategy_util",
     srcs = ["tpu_strategy_util.py"],
     visibility = [
-        "//learning/brain:__subpackages__",
         "//tensorflow:__subpackages__",
         "//third_party/py/tensorflow_numerics/extensions:__pkg__",
     ],

From a6cce9268431d0176da1f85e30c9c031312a851d Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 28 Sep 2023 02:02:09 -0700
Subject: [PATCH 369/567] Update GraphDef version to 1633.

PiperOrigin-RevId: 569106700
---
 tensorflow/core/public/version.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/core/public/version.h b/tensorflow/core/public/version.h
index c65ab6b9620b8e..414b97873f5f23 100644
--- a/tensorflow/core/public/version.h
+++ b/tensorflow/core/public/version.h
@@ -108,7 +108,7 @@ limitations under the License.
 
 #define TF_GRAPH_DEF_VERSION_MIN_PRODUCER 0
 #define TF_GRAPH_DEF_VERSION_MIN_CONSUMER 0
-#define TF_GRAPH_DEF_VERSION 1632  // Updated: 2023/9/27
+#define TF_GRAPH_DEF_VERSION 1633  // Updated: 2023/9/28
 
 // Checkpoint compatibility versions (the versions field in SavedSliceMeta).
 //

From 40390d99e49213914c2fdbf07a34dfadd292c66b Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 28 Sep 2023 02:02:13 -0700
Subject: [PATCH 370/567] compat: Update forward compatibility horizon to
 2023-09-28

PiperOrigin-RevId: 569106716
---
 tensorflow/python/compat/compat.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/python/compat/compat.py b/tensorflow/python/compat/compat.py
index 356df7c31d74e6..052bd01a1e1e4a 100644
--- a/tensorflow/python/compat/compat.py
+++ b/tensorflow/python/compat/compat.py
@@ -29,7 +29,7 @@
 # This value changes every day with an automatic CL. It can be modified in code
 # via `forward_compatibility_horizon()` or with the environment variable
 # TF_FORWARD_COMPATIBILITY_DELTA_DAYS, which is added to the compatibility date.
-_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2023, 9, 27)
+_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2023, 9, 28)
 _FORWARD_COMPATIBILITY_DELTA_DAYS_VAR_NAME = "TF_FORWARD_COMPATIBILITY_DELTA_DAYS"
 _FORWARD_COMPATIBILITY_DATE_NUMBER = None
 

From 4cc57f1c3e7036060e056679097e7a8a093b418b Mon Sep 17 00:00:00 2001
From: Ilia Sergachev <sergachev@google.com>
Date: Thu, 28 Sep 2023 02:38:20 -0700
Subject: [PATCH 371/567] [XLA:GPU] Trigger Triton GEMM fusions also on kCopy
 input operations.

PiperOrigin-RevId: 569114224
---
 .../xla/service/gpu/gemm_rewriter_triton.cc   | 30 +++++++------------
 .../xla/service/gpu/ir_emitter_triton_test.cc | 22 --------------
 2 files changed, 11 insertions(+), 41 deletions(-)

diff --git a/third_party/xla/xla/service/gpu/gemm_rewriter_triton.cc b/third_party/xla/xla/service/gpu/gemm_rewriter_triton.cc
index 88be780bf25308..59cecc4dba6e77 100644
--- a/third_party/xla/xla/service/gpu/gemm_rewriter_triton.cc
+++ b/third_party/xla/xla/service/gpu/gemm_rewriter_triton.cc
@@ -73,16 +73,6 @@ limitations under the License.
 namespace xla {
 namespace gpu {
 
-int GetFusionLevel(const HloInstruction& hlo, const GpuVersion gpu_version) {
-  int level =
-      hlo.GetModule()->config().debug_options().xla_gpu_triton_fusion_level();
-  if (!std::get<se::CudaComputeCapability>(gpu_version)
-           .IsAtLeast(se::CudaComputeCapability::AMPERE)) {
-    level = std::min(level, 1);
-  }
-  return level;
-}
-
 bool HasDivisibleSuffixAllowingSplit(const absl::Span<int64_t const> span,
                                      const int64_t divisor) {
   CHECK_GE(divisor, 1);
@@ -1087,6 +1077,12 @@ DimOrderUpdatesOrError FusionContext::AnalyzeForFusion(
     absl::flat_hash_map<const HloInstruction*, HloInstruction*>&
         old_to_new_mapping,
     const GpuVersion gpu_version) const {
+  int fusion_level =
+      hlo.GetModule()->config().debug_options().xla_gpu_triton_fusion_level();
+  if (!std::get<se::CudaComputeCapability>(gpu_version)
+           .IsAtLeast(se::CudaComputeCapability::AMPERE)) {
+    fusion_level = std::min(fusion_level, 1);
+  }
   if (hlo.opcode() == HloOpcode::kTuple ||
       hlo.opcode() == HloOpcode::kGetTupleElement) {
     return "Unsupported instruction.";
@@ -1107,7 +1103,7 @@ DimOrderUpdatesOrError FusionContext::AnalyzeForFusion(
     return "Unsupported output data type.";
   }
   if (as_input) {
-    if (GetFusionLevel(hlo, gpu_version) < 2) {
+    if (fusion_level < 2) {
       if (hlo.opcode() == HloOpcode::kConvert) {
         if (FusionDecision decision =
                 RequireTritonFusibleConvert(&hlo, gpu_version);
@@ -1123,7 +1119,7 @@ DimOrderUpdatesOrError FusionContext::AnalyzeForFusion(
       }
     }
   } else {
-    if (GetFusionLevel(hlo, gpu_version) < 2) {
+    if (fusion_level < 2) {
       return "Skipping fusing outputs at low fusion levels.";
     }
     for (const HloInstruction* operand : hlo.operands()) {
@@ -1373,14 +1369,10 @@ StatusOr<FusionDecision> FuseDot(HloInstruction& dot,
   if (dot.GetModule()->config().debug_options().xla_gpu_triton_gemm_any()) {
     return FusionDecision{};
   }
-
-  absl::flat_hash_set<HloOpcode> triggers{
-      HloOpcode::kConvert, HloOpcode::kSlice, HloOpcode::kTranspose};
-  if (GetFusionLevel(dot, gpu_version) >= 2) {
-    triggers.insert(HloOpcode::kCopy);
-  }
   for (const auto& iter : old_to_new_mapping) {
-    if (triggers.contains(iter.second->opcode())) {
+    if (iter.second->opcode() == HloOpcode::kConvert ||
+        iter.second->opcode() == HloOpcode::kSlice ||
+        iter.second->opcode() == HloOpcode::kTranspose) {
       return FusionDecision{};
     }
   }
diff --git a/third_party/xla/xla/service/gpu/ir_emitter_triton_test.cc b/third_party/xla/xla/service/gpu/ir_emitter_triton_test.cc
index 83f984fa7234bb..c6aef558bd092a 100644
--- a/third_party/xla/xla/service/gpu/ir_emitter_triton_test.cc
+++ b/third_party/xla/xla/service/gpu/ir_emitter_triton_test.cc
@@ -1225,28 +1225,6 @@ ENTRY e {
   EXPECT_TRUE(RunAndCompare(kHloText, ErrorSpec{/*aabs=*/2e-3, /*arel=*/2e-3}));
 }
 
-TEST_F(TritonGemmLevel2Test, FuseTransposeWithoutMixedTypes) {
-  const std::string kHloText = R"(
-ENTRY e {
-  p1 = f16[150,32,60]{2,1,0} parameter(1)
-  p0 = f16[75,2,26,60]{3,2,1,0} parameter(0)
-  t = f16[75,2,60,26]{3,2,1,0} transpose(p0), dimensions={0,1,3,2}
-  r = f16[150,60,26]{2,1,0} reshape(t)
-  ROOT tmp_4 = f16[150,32,26]{2,1,0} dot(p1, r),
-    lhs_batch_dims={0}, lhs_contracting_dims={2},
-    rhs_batch_dims={0}, rhs_contracting_dims={1}
-})";
-
-  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
-                          GetOptimizedModule(kHloText));
-  EXPECT_THAT(
-      module->entry_computation()->root_instruction(),
-      GmockMatch(m::Fusion(m::Parameter(), m::Parameter())
-                     .WithFusionKind(HloInstruction::FusionKind::kCustom)));
-
-  EXPECT_TRUE(RunAndCompare(kHloText, ErrorSpec{/*aabs=*/1e-3, /*arel=*/1e-3}));
-}
-
 TEST_F(TritonGemmTest, SineOutputIsNotFused) {
   const std::string kHloText = R"(
 HloModule m

From ffbe5e18646b26f2d373d8e78bfec5740deafd55 Mon Sep 17 00:00:00 2001
From: Aliia Khasanova <aliia@google.com>
Date: Thu, 28 Sep 2023 03:22:44 -0700
Subject: [PATCH 372/567] Change layouts of matmul([bf16 x bf16] -> bf16)
 operands to ensure that the contracting dimensions for both sides are the
 most minor. Do this behind the flag
 --xla_gpu_ensure_minor_dot_contraction_dims.

PiperOrigin-RevId: 569123880
---
 third_party/xla/xla/debug_options_flags.cc    |   8 ++
 third_party/xla/xla/service/gpu/BUILD         |   2 +
 .../xla/service/gpu/gpu_layout_assignment.cc  |  51 +++++---
 .../xla/service/gpu/gpu_layout_assignment.h   |   9 +-
 .../xla/service/gpu/ir_emitter_triton_test.cc | 115 ++++++++++++++++++
 third_party/xla/xla/xla.proto                 |   6 +-
 6 files changed, 166 insertions(+), 25 deletions(-)

diff --git a/third_party/xla/xla/debug_options_flags.cc b/third_party/xla/xla/debug_options_flags.cc
index 7d778c051fd569..9074c2de88217c 100644
--- a/third_party/xla/xla/debug_options_flags.cc
+++ b/third_party/xla/xla/debug_options_flags.cc
@@ -194,6 +194,7 @@ DebugOptions DefaultDebugOptionsIgnoringFlags() {
   opts.set_xla_gpu_enable_nccl_clique_optimization(false);
   opts.set_xla_gpu_cublas_fallback(true);
   opts.set_xla_gpu_enable_while_loop_double_buffering(false);
+  opts.set_xla_gpu_ensure_minor_dot_contraction_dims(false);
 
   return opts;
 }
@@ -1280,6 +1281,13 @@ void MakeDebugOptionsFlags(std::vector<tsl::Flag>* flag_list,
           &DebugOptions::set_xla_gpu_enable_while_loop_double_buffering),
       debug_options->xla_gpu_enable_while_loop_double_buffering(),
       "Enable double buffering for while loop"));
+  flag_list->push_back(tsl::Flag(
+      "xla_gpu_ensure_minor_dot_contraction_dims",
+      bool_setter_for(
+          &DebugOptions::set_xla_gpu_ensure_minor_dot_contraction_dims),
+      debug_options->xla_gpu_ensure_minor_dot_contraction_dims(),
+      "Ensure that the contracting dimensions for matmul operands are the most "
+      "minor by changing layouts accordingly"));
 }  // NOLINT(readability/fn_size)
 
 // Allocates flag_values and flag_objects; this function must not be called more
diff --git a/third_party/xla/xla/service/gpu/BUILD b/third_party/xla/xla/service/gpu/BUILD
index ac4bcd421e514b..9465087a6743ae 100644
--- a/third_party/xla/xla/service/gpu/BUILD
+++ b/third_party/xla/xla/service/gpu/BUILD
@@ -559,6 +559,7 @@ xla_test(
         ":ir_emitter_triton",
         "//xla:autotuning_proto_cc",
         "//xla:error_spec",
+        "//xla:xla_data_proto_cc",
         "//xla:xla_proto_cc",
         "//xla/hlo/ir:hlo",
         "//xla/service:pattern_matcher",
@@ -3108,6 +3109,7 @@ cc_library(
         "//xla:status_macros",
         "//xla:window_util",
         "//xla:xla_data_proto_cc",
+        "//xla:xla_proto_cc",
         "//xla/hlo/ir:hlo",
         "//xla/service:computation_layout",
         "//xla/service:layout_assignment",
diff --git a/third_party/xla/xla/service/gpu/gpu_layout_assignment.cc b/third_party/xla/xla/service/gpu/gpu_layout_assignment.cc
index 44c485e2631d37..ad2d7ffa62f008 100644
--- a/third_party/xla/xla/service/gpu/gpu_layout_assignment.cc
+++ b/third_party/xla/xla/service/gpu/gpu_layout_assignment.cc
@@ -15,6 +15,8 @@ limitations under the License.
 
 #include "xla/service/gpu/gpu_layout_assignment.h"
 
+#include <cstddef>
+#include <initializer_list>
 #include <memory>
 #include <tuple>
 #include <utility>
@@ -33,6 +35,7 @@ limitations under the License.
 #include "xla/service/gpu/stream_executor_util.h"
 #include "xla/status_macros.h"
 #include "xla/window_util.h"
+#include "xla/xla.pb.h"
 #include "xla/xla_data.pb.h"
 #include "tsl/platform/errors.h"
 
@@ -310,8 +313,13 @@ Status GpuLayoutAssignment::AddBackendConstraints(
                           GetNonContractingDims(rhs_shape, rhs_batch_dims,
                                                 rhs_contracting_dims));
 
-      // For unbatched S8xS8->S32 matrix multiplication enforce a TN layout,
-      // which will allow the NVidia GPUs to use TensorCores.
+      const DebugOptions& debug_options =
+          instruction->GetModule()->config().debug_options();
+
+      bool is_bf16_to_bf16 =
+          (output_shape.element_type() == PrimitiveType::BF16 &&
+           lhs_shape.element_type() == PrimitiveType::BF16 &&
+           rhs_shape.element_type() == PrimitiveType::BF16);
       bool is_s8_to_s32 = (output_shape.element_type() == PrimitiveType::S32 &&
                            lhs_shape.element_type() == PrimitiveType::S8 &&
                            rhs_shape.element_type() == PrimitiveType::S8 &&
@@ -319,13 +327,17 @@ Status GpuLayoutAssignment::AddBackendConstraints(
                            lhs_shape.dimensions_size() == 2 &&
                            rhs_shape.dimensions_size() == 2);
 
-      if (is_s8_to_s32) {
-        TF_RETURN_IF_ERROR(SetOperandBatchRowsColsLayout(
-            instruction, 0, lhs_batch_dims, lhs_non_contracting_dims,
-            lhs_contracting_dims));
-        TF_RETURN_IF_ERROR(SetOperandBatchRowsColsLayout(
-            instruction, 1, rhs_batch_dims, rhs_non_contracting_dims,
-            rhs_contracting_dims));
+      if (is_s8_to_s32 ||
+          (is_bf16_to_bf16 &&
+           debug_options.xla_gpu_ensure_minor_dot_contraction_dims())) {
+        TF_RETURN_IF_ERROR(SetOperandMajorToMinorLayout(
+            instruction, /*operand=*/0,
+            /*dim_groups=*/
+            {lhs_batch_dims, lhs_non_contracting_dims, lhs_contracting_dims}));
+        TF_RETURN_IF_ERROR(SetOperandMajorToMinorLayout(
+            instruction, /*operand=*/1,
+            /*dim_groups=*/
+            {rhs_batch_dims, rhs_non_contracting_dims, rhs_contracting_dims}));
         TF_RETURN_IF_ERROR(SetDotLayout(instruction, constraints));
       } else {
         if (!lhs_batch_dims.empty() || lhs_contracting_dims.size() > 1 ||
@@ -457,20 +469,21 @@ Status GpuLayoutAssignment::SetDotOperandLayout(
     return SetOperandLayout(shape, instruction, operand);
 
   // Otherwise, fallback to forcing (batch, rows, cols) layout.
-  return SetOperandBatchRowsColsLayout(instruction, operand, batch_dims,
-                                       row_dims, col_dims);
+  return SetOperandMajorToMinorLayout(
+      instruction, operand,
+      /*dim_groups=*/{batch_dims, row_dims, col_dims});
 }
 
-Status GpuLayoutAssignment::SetOperandBatchRowsColsLayout(
+Status GpuLayoutAssignment::SetOperandMajorToMinorLayout(
     const HloInstruction* instruction, int64_t operand,
-    absl::Span<const int64_t> batch_dims, absl::Span<const int64_t> row_dims,
-    absl::Span<const int64_t> col_dims) {
+    std::initializer_list<absl::Span<const int64_t>> dim_groups) {
+  size_t size = 0;
+  for (auto group : dim_groups) size += group.size();
   std::vector<int64_t> major_to_minor;
-  major_to_minor.reserve(batch_dims.size() + row_dims.size() + col_dims.size());
-  major_to_minor.insert(major_to_minor.end(), batch_dims.begin(),
-                        batch_dims.end());
-  major_to_minor.insert(major_to_minor.end(), row_dims.begin(), row_dims.end());
-  major_to_minor.insert(major_to_minor.end(), col_dims.begin(), col_dims.end());
+  major_to_minor.reserve(size);
+  for (const auto& group : dim_groups) {
+    major_to_minor.insert(major_to_minor.end(), group.begin(), group.end());
+  }
 
   Shape shape = instruction->operand(operand)->shape();
   *shape.mutable_layout() =
diff --git a/third_party/xla/xla/service/gpu/gpu_layout_assignment.h b/third_party/xla/xla/service/gpu/gpu_layout_assignment.h
index 57eeaa8c7a8a29..54893a982c2a7b 100644
--- a/third_party/xla/xla/service/gpu/gpu_layout_assignment.h
+++ b/third_party/xla/xla/service/gpu/gpu_layout_assignment.h
@@ -44,11 +44,10 @@ class GpuLayoutAssignment : public LayoutAssignment {
   Status AddBackendConstraintsToDnnConvCustomCall(
       HloCustomCallInstruction* instr, LayoutConstraints* constraints);
 
-  Status SetOperandBatchRowsColsLayout(const HloInstruction* instruction,
-                                       int64_t operand,
-                                       absl::Span<const int64_t> batch_dims,
-                                       absl::Span<const int64_t> row_dims,
-                                       absl::Span<const int64_t> col_dims);
+  // dim_groups are ordered from major to minor dimensions.
+  Status SetOperandMajorToMinorLayout(
+      const HloInstruction* instruction, int64_t operand,
+      std::initializer_list<absl::Span<const int64_t>> dim_groups);
 
   Status SetDotOperandLayout(const HloInstruction* instruction, int64_t operand,
                              absl::Span<const int64_t> batch_dims,
diff --git a/third_party/xla/xla/service/gpu/ir_emitter_triton_test.cc b/third_party/xla/xla/service/gpu/ir_emitter_triton_test.cc
index c6aef558bd092a..44aaa50afc72da 100644
--- a/third_party/xla/xla/service/gpu/ir_emitter_triton_test.cc
+++ b/third_party/xla/xla/service/gpu/ir_emitter_triton_test.cc
@@ -2656,6 +2656,121 @@ ENTRY e {
                                       /*run_hlo_passes=*/false));
 }
 
+class TritonGemmContractionDims : public TritonGemmTest {
+ public:
+  DebugOptions GetDebugOptionsForTest() override {
+    DebugOptions debug_options = TritonGemmTest::GetDebugOptionsForTest();
+    debug_options.set_xla_gpu_ensure_minor_dot_contraction_dims(true);
+    debug_options.set_xla_gpu_triton_gemm_any(true);
+
+    return debug_options;
+  }
+};
+
+TEST_F(TritonGemmContractionDims, TritonDotForceContractionDims_1_0) {
+  if (!GetCudaComputeCapability().IsAtLeast(
+          se::CudaComputeCapability::AMPERE)) {
+    GTEST_SKIP() << "No BF16 before Ampere.";
+  }
+  const std::string kHloText = R"(
+HloModule m
+
+ENTRY e {
+  parameter.0 = bf16[16,40]{1,0} parameter(0)
+  parameter.1 = bf16[40,32]{1,0} parameter(1)
+  ROOT dot.31472 = bf16[16,32]{1,0} dot(parameter.0, parameter.1), lhs_contracting_dims={1}, rhs_contracting_dims={0}
+})";
+
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          GetOptimizedModule(kHloText));
+
+  EXPECT_THAT(module->entry_computation()
+                  ->root_instruction()
+                  ->fused_instructions_computation()
+                  ->root_instruction(),
+              GmockMatch(m::Dot(m::Op().WithShape(BF16, {16, 40}, {1, 0}),
+                                m::Op().WithShape(BF16, {40, 32}, {0, 1}))
+                             .WithShape(BF16, {16, 32}, {1, 0})));
+}
+
+TEST_F(TritonGemmContractionDims, TritonDotForceContractionDims_1_2_1_2) {
+  if (!GetCudaComputeCapability().IsAtLeast(
+          se::CudaComputeCapability::AMPERE)) {
+    GTEST_SKIP() << "No BF16 before Ampere.";
+  }
+  const std::string kHloText = R"(
+HloModule m
+
+ENTRY e {
+  parameter_0 = bf16[32,4,36]{2,1,0} parameter(0)
+  parameter_1 = bf16[40,4,36]{2,1,0} parameter(1)
+  ROOT dot.16450 = bf16[4,32,40]{2,1,0} dot(parameter_0, parameter_1), lhs_batch_dims={1}, lhs_contracting_dims={2}, rhs_batch_dims={1}, rhs_contracting_dims={2}
+})";
+
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          GetOptimizedModule(kHloText));
+
+  EXPECT_THAT(module->entry_computation()
+                  ->root_instruction()
+                  ->fused_instructions_computation()
+                  ->root_instruction(),
+              GmockMatch(m::Dot(m::Op().WithShape(BF16, {32, 4, 36}, {2, 0, 1}),
+                                m::Op().WithShape(BF16, {40, 4, 36}, {2, 0, 1}))
+                             .WithShape(BF16, {4, 32, 40}, {2, 1, 0})));
+}
+
+TEST_F(TritonGemmContractionDims, TritonDotForceContractionDims_1_2_0_1) {
+  if (!GetCudaComputeCapability().IsAtLeast(
+          se::CudaComputeCapability::AMPERE)) {
+    GTEST_SKIP() << "No BF16 before Ampere.";
+  }
+  const std::string kHloText = R"(
+HloModule m
+
+ENTRY e {
+  parameter_1 = bf16[16,16,48]{2,1,0} parameter(1)
+  parameter_2 = bf16[16,48,32]{2,1,0} parameter(0)
+  ROOT dot.16125 = bf16[16,16,32]{2,1,0} dot(parameter_1, parameter_2), lhs_batch_dims={1}, lhs_contracting_dims={2}, rhs_batch_dims={0}, rhs_contracting_dims={1}
+})";
+
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          GetOptimizedModule(kHloText));
+
+  EXPECT_THAT(
+      module->entry_computation()
+          ->root_instruction()
+          ->fused_instructions_computation()
+          ->root_instruction(),
+      GmockMatch(m::Dot(m::Op().WithShape(BF16, {16, 16, 48}, {2, 0, 1}),
+                        m::Op().WithShape(BF16, {16, 48, 32}, {1, 2, 0}))
+                     .WithShape(BF16, {16, 16, 32}, {2, 1, 0})));
+}
+
+TEST_F(TritonGemmContractionDims, TritonDotForceContractionDims_1_1) {
+  if (!GetCudaComputeCapability().IsAtLeast(
+          se::CudaComputeCapability::AMPERE)) {
+    GTEST_SKIP() << "No BF16 before Ampere.";
+  }
+  const std::string kHloText = R"(
+HloModule m
+
+ENTRY e {
+  parameter_0 = bf16[16,32]{1,0} parameter(0)
+  parameter_1 = bf16[40,32]{0,1} parameter(1)
+  ROOT dot.15148 = bf16[16,40]{1,0} dot(parameter_0, parameter_1), lhs_contracting_dims={1}, rhs_contracting_dims={1}
+})";
+
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          GetOptimizedModule(kHloText));
+  EXPECT_THAT(module->entry_computation()
+                  ->root_instruction()
+                  ->fused_instructions_computation()
+                  ->root_instruction(),
+              GmockMatch(m::Dot(m::Op().WithShape(BF16, {16, 32}, {1, 0}),
+                                m::Op().WithShape(BF16, {40, 32}, {1, 0}))
+                             .WithShape(BF16, {16, 40}, {1, 0})));
+}
+
 }  // namespace
 }  // namespace gpu
 }  // namespace xla
diff --git a/third_party/xla/xla/xla.proto b/third_party/xla/xla/xla.proto
index 19a9fd73b5814c..7cc9f7d48776b4 100644
--- a/third_party/xla/xla/xla.proto
+++ b/third_party/xla/xla/xla.proto
@@ -623,7 +623,11 @@ message DebugOptions {
   // Enable double buffering for loops.
   bool xla_gpu_enable_while_loop_double_buffering = 248;
 
-  // Next id: 249
+  // Change the layout of the second triton dot operand to be column major.
+  // Only works for (bf16 x bf16) -> bf16.
+  bool xla_gpu_ensure_minor_dot_contraction_dims = 249;
+
+  // Next id: 250
 
   // Extra options to pass to the compilation backend (e.g. LLVM); specific
   // interpretation of these values is left to the backend.

From 405c15974e16366b8314689b4d22f720ed4d7424 Mon Sep 17 00:00:00 2001
From: Son Tuan Vu <vuson@google.com>
Date: Thu, 28 Sep 2023 06:18:35 -0700
Subject: [PATCH 373/567] [XLA:GPU] Do not run kernels in autotuning which
 spill registers

Skipping kernels that spill registers during autotuning saves some compilation time and, apparently, does not affect performance.

PiperOrigin-RevId: 569159250
---
 third_party/xla/xla/debug_options_flags.cc    |   8 ++
 third_party/xla/xla/service/gpu/BUILD         |   5 +
 .../xla/service/gpu/autotuner_compile_util.cc |   4 +-
 .../xla/xla/service/gpu/nvptx_compiler.cc     |  29 ++++-
 .../xla/xla/service/gpu/nvptx_compiler.h      |   4 +-
 .../xla/service/gpu/triton_autotuner_test.cc  | 117 ++++++++++++++++++
 third_party/xla/xla/stream_executor/gpu/BUILD |   1 +
 .../xla/stream_executor/gpu/asm_compiler.cc   |   9 +-
 .../xla/stream_executor/gpu/asm_compiler.h    |   6 +-
 third_party/xla/xla/xla.proto                 |   5 +-
 10 files changed, 174 insertions(+), 14 deletions(-)

diff --git a/third_party/xla/xla/debug_options_flags.cc b/third_party/xla/xla/debug_options_flags.cc
index 9074c2de88217c..ad67367e338306 100644
--- a/third_party/xla/xla/debug_options_flags.cc
+++ b/third_party/xla/xla/debug_options_flags.cc
@@ -195,6 +195,7 @@ DebugOptions DefaultDebugOptionsIgnoringFlags() {
   opts.set_xla_gpu_cublas_fallback(true);
   opts.set_xla_gpu_enable_while_loop_double_buffering(false);
   opts.set_xla_gpu_ensure_minor_dot_contraction_dims(false);
+  opts.set_xla_gpu_filter_kernels_spilling_registers_on_autotuning(true);
 
   return opts;
 }
@@ -1288,6 +1289,13 @@ void MakeDebugOptionsFlags(std::vector<tsl::Flag>* flag_list,
       debug_options->xla_gpu_ensure_minor_dot_contraction_dims(),
       "Ensure that the contracting dimensions for matmul operands are the most "
       "minor by changing layouts accordingly"));
+  flag_list->push_back(tsl::Flag(
+      "xla_gpu_filter_kernels_spilling_registers_on_autotuning",
+      bool_setter_for(
+          &DebugOptions::
+              set_xla_gpu_filter_kernels_spilling_registers_on_autotuning),
+      debug_options->xla_gpu_filter_kernels_spilling_registers_on_autotuning(),
+      "Filter out kernels that spill registers during autotuning"));
 }  // NOLINT(readability/fn_size)
 
 // Allocates flag_values and flag_objects; this function must not be called more
diff --git a/third_party/xla/xla/service/gpu/BUILD b/third_party/xla/xla/service/gpu/BUILD
index 9465087a6743ae..ccb2d4a471ce35 100644
--- a/third_party/xla/xla/service/gpu/BUILD
+++ b/third_party/xla/xla/service/gpu/BUILD
@@ -727,6 +727,8 @@ xla_test(
         "//xla:xla_data_proto_cc",
         "//xla:xla_proto_cc",
         "//xla/hlo/ir:hlo",
+        "//xla/service:executable",
+        "//xla/service:hlo_module_config",
         "//xla/service:hlo_pass_pipeline",
         "//xla/service:pattern_matcher",
         "//xla/service:pattern_matcher_gmock",
@@ -738,11 +740,13 @@ xla_test(
         "@com_google_absl//absl/log",
         "@com_google_absl//absl/log:check",
         "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/strings:str_format",
         "@com_google_googletest//:gtest",
         "@local_tsl//tsl/lib/core:status_test_util",
         "@local_tsl//tsl/platform:env",
         "@local_tsl//tsl/platform:errors",
         "@local_tsl//tsl/platform:platform_port",
+        "@local_tsl//tsl/platform:status_matchers",
         "@local_tsl//tsl/platform:statusor",
     ],
 )
@@ -2861,6 +2865,7 @@ cc_library(
         "@com_google_absl//absl/base",
         "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/container:node_hash_map",
+        "@com_google_absl//absl/status",
         "@com_google_absl//absl/strings:str_format",
         "@llvm-project//llvm:IRReader",
         "@llvm-project//llvm:Support",
diff --git a/third_party/xla/xla/service/gpu/autotuner_compile_util.cc b/third_party/xla/xla/service/gpu/autotuner_compile_util.cc
index a5a8baf4dfd7e2..ed91d4952c9a11 100644
--- a/third_party/xla/xla/service/gpu/autotuner_compile_util.cc
+++ b/third_party/xla/xla/service/gpu/autotuner_compile_util.cc
@@ -136,8 +136,10 @@ StatusOr<std::unique_ptr<Executable>> AutotunerCompileUtil::Compile(
       Compiler::CompileOptions{&allocator_, /*thread_pool=*/nullptr,
                                /*layout_canonicalization_callback=*/{},
                                /*is_autotuning_compilation=*/true});
-  if (out.status().code() == absl::StatusCode::kResourceExhausted) {
+  if (out.status().code() == absl::StatusCode::kResourceExhausted ||
+      out.status().code() == absl::StatusCode::kCancelled) {
     // Being out of shared memory budget is an expected failure.
+    // Cancelling upon register spilling is also an expected failure.
     return std::unique_ptr<Executable>();
   }
   return out;
diff --git a/third_party/xla/xla/service/gpu/nvptx_compiler.cc b/third_party/xla/xla/service/gpu/nvptx_compiler.cc
index 5ee7f942e30fed..787f6d69451d4c 100644
--- a/third_party/xla/xla/service/gpu/nvptx_compiler.cc
+++ b/third_party/xla/xla/service/gpu/nvptx_compiler.cc
@@ -25,6 +25,7 @@ limitations under the License.
 #include <vector>
 
 #include "absl/base/call_once.h"
+#include "absl/status/status.h"
 #include "absl/strings/str_format.h"
 #include "llvm/IRReader/IRReader.h"
 #include "llvm/Support/SourceMgr.h"
@@ -508,16 +509,19 @@ NVPTXCompiler::CompileTargetBinary(const HloModuleConfig& module_config,
     RecordLlvmPassesAndLlvmToPtxDuration(end_usecs - start_usecs);
   }
 
-  std::vector<uint8_t> cubin = CompileGpuAsmOrGetCachedResult(
+  StatusOr<std::vector<uint8_t>> maybe_cubin = CompileGpuAsmOrGetCachedResult(
       ptx, std::get<se::CudaComputeCapability>(gpu_version), module_config,
       (debug_module != nullptr ? debug_module->name() : "(unknown)"),
       relocatable, options);
 
-  return std::pair<std::string, std::vector<uint8_t>>(std::move(ptx),
-                                                      std::move(cubin));
+  if (maybe_cubin.status().code() == absl::StatusCode::kCancelled) {
+    return maybe_cubin.status();
+  }
+  return std::pair<std::string, std::vector<uint8_t>>(
+      std::move(ptx), std::move(maybe_cubin.value()));
 }
 
-std::vector<uint8_t> NVPTXCompiler::CompileGpuAsmOrGetCachedResult(
+StatusOr<std::vector<uint8_t>> NVPTXCompiler::CompileGpuAsmOrGetCachedResult(
     const std::string& ptx, se::CudaComputeCapability cc,
     const HloModuleConfig& hlo_module_config, absl::string_view module_name,
     bool relocatable, const CompileOptions& options) {
@@ -561,8 +565,13 @@ std::vector<uint8_t> NVPTXCompiler::CompileGpuAsmOrGetCachedResult(
         }
         uint64_t start_usecs = tsl::Env::Default()->NowMicros();
 
-        StatusOr<std::vector<uint8_t>> maybe_cubin = se::CompileGpuAsm(
-            cc.major, cc.minor, cache_ptx->c_str(), ptxas_config);
+        bool cancel_if_reg_spill =
+            hlo_module_config.debug_options()
+                .xla_gpu_filter_kernels_spilling_registers_on_autotuning() &&
+            options.is_autotuning_compilation;
+        StatusOr<std::vector<uint8_t>> maybe_cubin =
+            se::CompileGpuAsm(cc.major, cc.minor, cache_ptx->c_str(),
+                              ptxas_config, cancel_if_reg_spill);
 
         if (maybe_cubin.ok()) {
           uint64_t end_usecs = tsl::Env::Default()->NowMicros();
@@ -599,6 +608,14 @@ std::vector<uint8_t> NVPTXCompiler::CompileGpuAsmOrGetCachedResult(
                 "driver version. Custom ptxas location can be specified "
                 "using $PATH.",
                 hlo_module_config.debug_options().xla_gpu_cuda_data_dir());
+          } else if (maybe_cubin.status().code() ==
+                     absl::StatusCode::kCancelled) {
+            // Register spilling has occurred during autotuning, this config
+            // should not be tried further.
+            CHECK(options.is_autotuning_compilation);
+            cache_value->compilation_done = true;
+            cache_value->compilation_done_cv.SignalAll();
+            return maybe_cubin;
           } else if (maybe_cubin.status().code() !=
                      absl::StatusCode::kUnimplemented) {
             // If unimplemented is returned, we fallback to the driver.
diff --git a/third_party/xla/xla/service/gpu/nvptx_compiler.h b/third_party/xla/xla/service/gpu/nvptx_compiler.h
index debc4539dfe161..c1de926b2b1120 100644
--- a/third_party/xla/xla/service/gpu/nvptx_compiler.h
+++ b/third_party/xla/xla/service/gpu/nvptx_compiler.h
@@ -101,8 +101,8 @@ class NVPTXCompiler : public GpuCompiler {
       const std::string& preferred_cuda_dir);
 
   // Tries to compile the given ptx string to cubin.  Returns a vector with the
-  // compiled cubin.  If compilation was unsuccessful, returns an empty vector.
-  std::vector<uint8_t> CompileGpuAsmOrGetCachedResult(
+  // compiled cubin if compilation succeeded.
+  StatusOr<std::vector<uint8_t>> CompileGpuAsmOrGetCachedResult(
       const std::string& ptx, se::CudaComputeCapability cc,
       const HloModuleConfig& hlo_module_config, absl::string_view module_name,
       bool relocatable, const CompileOptions& options);
diff --git a/third_party/xla/xla/service/gpu/triton_autotuner_test.cc b/third_party/xla/xla/service/gpu/triton_autotuner_test.cc
index c61522ff9fb2ac..e6221f1c9bd0d1 100644
--- a/third_party/xla/xla/service/gpu/triton_autotuner_test.cc
+++ b/third_party/xla/xla/service/gpu/triton_autotuner_test.cc
@@ -24,6 +24,7 @@ limitations under the License.
 #include <gtest/gtest.h>
 #include "absl/log/check.h"
 #include "absl/log/log.h"
+#include "absl/strings/str_format.h"
 #include "absl/strings/string_view.h"
 #include "xla/autotuning.pb.h"
 #include "xla/error_spec.h"
@@ -32,9 +33,11 @@ limitations under the License.
 #include "xla/hlo/ir/hlo_instructions.h"
 #include "xla/hlo/ir/hlo_module.h"
 #include "xla/hlo/ir/hlo_opcode.h"
+#include "xla/service/executable.h"
 #include "xla/service/gpu/autotuner_util.h"
 #include "xla/service/gpu/backend_configs.pb.h"
 #include "xla/service/gpu/gemm_rewriter_triton.h"
+#include "xla/service/hlo_module_config.h"
 #include "xla/service/hlo_pass_pipeline.h"
 #include "xla/service/pattern_matcher.h"
 #include "xla/service/pattern_matcher_gmock.h"
@@ -49,6 +52,7 @@ limitations under the License.
 #include "tsl/platform/cpu_info.h"
 #include "tsl/platform/env.h"
 #include "tsl/platform/errors.h"
+#include "tsl/platform/status_matchers.h"
 #include "tsl/platform/statusor.h"
 #include "tsl/platform/threadpool.h"
 
@@ -450,6 +454,119 @@ ENTRY e {
   EXPECT_TRUE(RunAndCompare(kHloText, ErrorSpec{/*aabs=*/1e-3, /*arel=*/1e-3}));
 }
 
+TEST_F(TritonAutotunerTest, DoNotRunAutotuningKernelSpillingRegisters) {
+  const std::string kHloText = R"(
+HloModule m
+
+%triton_gemm_dot {
+  %p1 = s8[4,12288]{1,0} parameter(1)
+  %p0 = s8[12288,1536]{1,0} parameter(0)
+  %convert.p0 = f16[12288,1536]{1,0} convert(s8[12288,1536]{1,0} %p0)
+  %convert.p1 = f16[4,12288]{1,0} convert(s8[4,12288]{1,0} %p1)
+  %dot = f16[4,1536]{1,0} dot(f16[4,12288]{1,0} %convert.p1, f16[12288,1536]{1,0} %convert.p0), lhs_contracting_dims={1}, rhs_contracting_dims={0}
+  ROOT %convert = s8[4,1536]{1,0} convert(f16[4,1536]{1,0} %dot)
+}
+
+ENTRY %e {
+  %get-tuple-element.7020 = s8[12288,1536]{1,0} parameter(0)
+  %convert = s8[4,12288]{1,0} parameter(1)
+  ROOT %triton = s8[4,1536]{1,0} fusion(s8[12288,1536]{1,0} %get-tuple-element.7020, s8[4,12288]{1,0} %convert), kind=kCustom, calls=%triton_gemm_dot,
+    backend_config={"kind":"__triton_gemm","triton_gemm_config":{"block_m":"256","block_n":"256","block_k":"16","split_k":"1","num_stages":"1","num_warps":"16"}}
+})";
+
+  if (!GetCudaComputeCapability().IsAtLeast(
+          se::CudaComputeCapability::AMPERE)) {
+    GTEST_SKIP() << "Not enough shared memory to run big tiles before Ampere.";
+  }
+  auto module = ParseAndReturnVerifiedModule(kHloText).value();
+  EXPECT_THAT(
+      backend().compiler()->RunBackend(std::move(module),
+                                       backend().default_stream_executor(),
+                                       {/*device_allocator=*/nullptr,
+                                        /*thread_pool=*/nullptr,
+                                        /*layout_canonicalization_callback=*/{},
+                                        /*is_autotuning_compilation=*/true}),
+      tsl::testing::StatusIs(
+          tsl::error::CANCELLED,
+          absl::StrFormat(
+              "Compilation result discarded due to register spilling")));
+}
+
+TEST_F(TritonAutotunerTest, DoNotFilterOutAutotuningKernelSpillingRegisters) {
+  const std::string kHloText = R"(
+HloModule m
+
+%triton_gemm_dot {
+  %p1 = s8[4,12288]{1,0} parameter(1)
+  %p0 = s8[12288,1536]{1,0} parameter(0)
+  %convert.p0 = f16[12288,1536]{1,0} convert(s8[12288,1536]{1,0} %p0)
+  %convert.p1 = f16[4,12288]{1,0} convert(s8[4,12288]{1,0} %p1)
+  %dot = f16[4,1536]{1,0} dot(f16[4,12288]{1,0} %convert.p1, f16[12288,1536]{1,0} %convert.p0), lhs_contracting_dims={1}, rhs_contracting_dims={0}
+  ROOT %convert = s8[4,1536]{1,0} convert(f16[4,1536]{1,0} %dot)
+}
+
+ENTRY %e {
+  %get-tuple-element.7020 = s8[12288,1536]{1,0} parameter(0)
+  %convert = s8[4,12288]{1,0} parameter(1)
+  ROOT %triton = s8[4,1536]{1,0} fusion(s8[12288,1536]{1,0} %get-tuple-element.7020, s8[4,12288]{1,0} %convert), kind=kCustom, calls=%triton_gemm_dot,
+    backend_config={"kind":"__triton_gemm","triton_gemm_config":{"block_m":"256","block_n":"256","block_k":"16","split_k":"1","num_stages":"1","num_warps":"16"}}
+})";
+
+  if (!GetCudaComputeCapability().IsAtLeast(
+          se::CudaComputeCapability::AMPERE)) {
+    GTEST_SKIP() << "Not enough shared memory to run big tiles before Ampere.";
+  }
+  auto module = ParseAndReturnVerifiedModule(kHloText).value();
+  HloModuleConfig config = module->config();
+  DebugOptions debug_options = config.debug_options();
+  debug_options.set_xla_gpu_filter_kernels_spilling_registers_on_autotuning(
+      false);
+  config.set_debug_options(debug_options);
+  module->set_config(config);
+
+  std::unique_ptr<Executable> executable =
+      backend()
+          .compiler()
+          ->RunBackend(std::move(module), backend().default_stream_executor(),
+                       {/*device_allocator=*/nullptr,
+                        /*thread_pool=*/nullptr,
+                        /*layout_canonicalization_callback=*/{},
+                        /*is_autotuning_compilation=*/true})
+          .value();
+  EXPECT_NE(executable, nullptr);
+}
+
+TEST_F(TritonAutotunerTest, RunAutotuningKernelNotSpillingRegisters) {
+  const std::string kHloText = R"(
+HloModule m
+
+%triton_gemm_dot {
+  %p1 = f16[4,12288]{1,0} parameter(1)
+  %p0 = s8[12288,1536]{1,0} parameter(0)
+  %convert.10406 = f16[12288,1536]{1,0} convert(s8[12288,1536]{1,0} %p0)
+  ROOT %dot = f16[4,1536]{1,0} dot(f16[4,12288]{1,0} %p1, f16[12288,1536]{1,0} %convert.10406), lhs_contracting_dims={1}, rhs_contracting_dims={0}
+}
+
+ENTRY %e {
+  %p0 = s8[12288,1536]{1,0} parameter(0)
+  %p1 = f16[4,12288]{1,0} parameter(1)
+  ROOT %triton_dot = f16[4,1536]{1,0} fusion(s8[12288,1536]{1,0} %p0, f16[4,12288]{1,0} %p1), kind=kCustom, calls=%triton_gemm_dot,
+    backend_config={"kind":"__triton_gemm","triton_gemm_config":{"block_m":"16","block_n":"32","block_k":"16","split_k":"1","num_stages":"1","num_warps":"2"}}
+})";
+
+  auto module = ParseAndReturnVerifiedModule(kHloText).value();
+  std::unique_ptr<Executable> executable =
+      backend()
+          .compiler()
+          ->RunBackend(std::move(module), backend().default_stream_executor(),
+                       {/*device_allocator=*/nullptr,
+                        /*thread_pool=*/nullptr,
+                        /*layout_canonicalization_callback=*/{},
+                        /*is_autotuning_compilation=*/true})
+          .value();
+  EXPECT_NE(executable, nullptr);
+}
+
 // TODO(b/281489442): Write a testcase called
 // `SkipConfigsProducingDeviantResults` or similar.
 
diff --git a/third_party/xla/xla/stream_executor/gpu/BUILD b/third_party/xla/xla/stream_executor/gpu/BUILD
index 545857f61094b8..a67b7bf8ed1230 100644
--- a/third_party/xla/xla/stream_executor/gpu/BUILD
+++ b/third_party/xla/xla/stream_executor/gpu/BUILD
@@ -336,6 +336,7 @@ cc_library(
         "@local_tsl//tsl/platform:cuda_libdevice_path",
         "//xla/stream_executor:stream_executor_headers",
         "//xla/stream_executor/platform",
+        "//xla:util",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/strings:str_format",
         "@com_google_absl//absl/synchronization",
diff --git a/third_party/xla/xla/stream_executor/gpu/asm_compiler.cc b/third_party/xla/xla/stream_executor/gpu/asm_compiler.cc
index d9149f9fbddf71..5760b22f5cacbf 100644
--- a/third_party/xla/xla/stream_executor/gpu/asm_compiler.cc
+++ b/third_party/xla/xla/stream_executor/gpu/asm_compiler.cc
@@ -32,6 +32,7 @@ limitations under the License.
 #include "absl/strings/string_view.h"
 #include "absl/synchronization/mutex.h"
 #include "xla/stream_executor/gpu/gpu_driver.h"
+#include "xla/util.h"
 #include "tsl/platform/cuda_libdevice_path.h"
 #include "tsl/platform/env.h"
 #include "tsl/platform/errors.h"
@@ -252,7 +253,8 @@ tsl::StatusOr<std::array<int64_t, 3>> GetAsmCompilerVersion(
 
 tsl::StatusOr<std::vector<uint8_t>> CompileGpuAsm(int cc_major, int cc_minor,
                                                   const char* ptx_contents,
-                                                  GpuAsmOpts options) {
+                                                  GpuAsmOpts options,
+                                                  bool cancel_if_reg_spill) {
   std::string ptxas_path =
       FindCudaExecutable("ptxas", options.preferred_cuda_dir);
 
@@ -329,6 +331,11 @@ tsl::StatusOr<std::vector<uint8_t>> CompileGpuAsm(int cc_major, int cc_minor,
   if (!stderr_output.empty()) {
     if (absl::StrContains(stderr_output, "warning")) {
       LOG(INFO) << stderr_output;
+      if (cancel_if_reg_spill &&
+          absl::StrContains(stderr_output, "Registers are spilled")) {
+        return xla::Cancelled(
+            "Compilation result discarded due to register spilling");
+      }
     } else {
       VLOG(2) << stderr_output;
     }
diff --git a/third_party/xla/xla/stream_executor/gpu/asm_compiler.h b/third_party/xla/xla/stream_executor/gpu/asm_compiler.h
index 52776d1e3040e9..0e15e326ca23e5 100644
--- a/third_party/xla/xla/stream_executor/gpu/asm_compiler.h
+++ b/third_party/xla/xla/stream_executor/gpu/asm_compiler.h
@@ -56,9 +56,9 @@ tsl::StatusOr<std::vector<uint8_t>> CompileGpuAsm(int device_ordinal,
 //
 // 'options' is used to query for the CUDA location in case it is
 // customized in a passed flag, and for controlling ptxas optimizations.
-tsl::StatusOr<std::vector<uint8_t>> CompileGpuAsm(int cc_major, int cc_minor,
-                                                  const char* ptx_contents,
-                                                  GpuAsmOpts options);
+tsl::StatusOr<std::vector<uint8_t>> CompileGpuAsm(
+    int cc_major, int cc_minor, const char* ptx_contents, GpuAsmOpts options,
+    bool cancel_if_reg_spill = false);
 
 // Same as CompileGpuAsm, but caches the result, and returns unowned view of
 // the compiled binary.
diff --git a/third_party/xla/xla/xla.proto b/third_party/xla/xla/xla.proto
index 7cc9f7d48776b4..69b00d3914bbe2 100644
--- a/third_party/xla/xla/xla.proto
+++ b/third_party/xla/xla/xla.proto
@@ -627,7 +627,10 @@ message DebugOptions {
   // Only works for (bf16 x bf16) -> bf16.
   bool xla_gpu_ensure_minor_dot_contraction_dims = 249;
 
-  // Next id: 250
+  // Filter out kernels that spill registers during autotuning.
+  bool xla_gpu_filter_kernels_spilling_registers_on_autotuning = 250;
+
+  // Next id: 251
 
   // Extra options to pass to the compilation backend (e.g. LLVM); specific
   // interpretation of these values is left to the backend.

From 72c4b7675c51b946485c5fef7c78e07271c629b7 Mon Sep 17 00:00:00 2001
From: Ilia Sergachev <sergachev@google.com>
Date: Thu, 28 Sep 2023 06:20:08 -0700
Subject: [PATCH 374/567] [XLA:GPU][NFC] Disable cuBLAS fallback in Triton
 emitter tests.

PiperOrigin-RevId: 569159562
---
 .../xla/xla/service/gpu/ir_emitter_triton_parametrized_test.cc   | 1 +
 1 file changed, 1 insertion(+)

diff --git a/third_party/xla/xla/service/gpu/ir_emitter_triton_parametrized_test.cc b/third_party/xla/xla/service/gpu/ir_emitter_triton_parametrized_test.cc
index 929b34ee656daf..d33f5052ec483f 100644
--- a/third_party/xla/xla/service/gpu/ir_emitter_triton_parametrized_test.cc
+++ b/third_party/xla/xla/service/gpu/ir_emitter_triton_parametrized_test.cc
@@ -158,6 +158,7 @@ class NoTF32Test : public GpuCodegenTest {
   DebugOptions GetDebugOptionsForTest() override {
     DebugOptions debug_options = GpuCodegenTest::GetDebugOptionsForTest();
     debug_options.set_xla_gpu_triton_gemm_any(true);
+    debug_options.set_xla_gpu_cublas_fallback(false);
     return debug_options;
   }
 

From 1697b89f58d06cd9293ed444976e162f49747735 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 28 Sep 2023 06:59:24 -0700
Subject: [PATCH 375/567] Support `[0,1,i,o]` transposed convolution kernels

PiperOrigin-RevId: 569168220
---
 .../lite/stablehlo/tests/legalize_hlo.mlir    | 29 ++++++++++++
 .../lite/stablehlo/transforms/legalize_hlo.cc | 46 ++++++++++++++++---
 2 files changed, 68 insertions(+), 7 deletions(-)

diff --git a/tensorflow/compiler/mlir/lite/stablehlo/tests/legalize_hlo.mlir b/tensorflow/compiler/mlir/lite/stablehlo/tests/legalize_hlo.mlir
index 669172daa30f0a..e440d9824bc7f8 100644
--- a/tensorflow/compiler/mlir/lite/stablehlo/tests/legalize_hlo.mlir
+++ b/tensorflow/compiler/mlir/lite/stablehlo/tests/legalize_hlo.mlir
@@ -2725,6 +2725,35 @@ func.func @convert_conv2d_back_prop_input(%arg0: tensor<8x4x4x32xf32>, %arg1: te
   func.return %0 : tensor<8x8x8x64xf32>
 }
 
+// CHECK-LABEL:   func @convert_conv2d_back_prop_input_transpose_filter(
+// CHECK-SAME:                         %[[VAL_0:.*]]: tensor<8x4x4x32xf32>,
+// CHECK-SAME:                         %[[VAL_1:.*]]: tensor<3x3x32x64xf32>) -> tensor<8x8x8x64xf32> {
+// CHECK-DAG:       %[[VAL_2:.*]] = "tf.Const"() {value = dense<[8, 8, 8, 64]> : tensor<4xi32>} : () -> tensor<4xi32>
+// CHECK-DAG:       %[[VAL_3:.*]] = "tf.Const"() {value = dense<[0, 1]> : tensor<2xi64>} : () -> tensor<2xi64>
+// CHECK-DAG:       %[[VAL_4:.*]] = "tf.Const"() {value = dense<[0, 1, 3, 2]> : tensor<4xi64>} : () -> tensor<4xi64>
+// CHECK-DAG:       %[[VAL_5:.*]] = "tf.Transpose"(%[[VAL_1]], %[[VAL_4]]) : (tensor<3x3x32x64xf32>, tensor<4xi64>) -> tensor<3x3x64x32xf32>
+// CHECK:           %[[VAL_6:.*]] = "tf.ReverseV2"(%[[VAL_5]], %[[VAL_3]]) : (tensor<3x3x64x32xf32>, tensor<2xi64>) -> tensor<3x3x64x32xf32>
+// CHECK:           %[[VAL_7:.*]] = "tf.Conv2DBackpropInput"(%[[VAL_2]], %[[VAL_6]], %[[VAL_0]]) {data_format = "NHWC", dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "SAME", strides = [1, 2, 2, 1], use_cudnn_on_gpu = true} : (tensor<4xi32>, tensor<3x3x64x32xf32>, tensor<8x4x4x32xf32>) -> tensor<8x8x8x64xf32>
+// CHECK:           return %[[VAL_7]] : tensor<8x8x8x64xf32>
+// CHECK:         }
+func.func @convert_conv2d_back_prop_input_transpose_filter(%arg0: tensor<8x4x4x32xf32>, %arg1: tensor<3x3x32x64xf32>) -> tensor<8x8x8x64xf32> {
+  %0 = "mhlo.convolution"(%arg0, %arg1) {batch_group_count = 1 : i64,
+    dimension_numbers = #mhlo.conv<raw
+      input_batch_dimension = 0,
+      input_feature_dimension = 3,
+      input_spatial_dimensions = [1, 2],
+      kernel_input_feature_dimension = 2,
+      kernel_output_feature_dimension = 3,
+      kernel_spatial_dimensions = [0, 1],
+      output_batch_dimension = 0,
+      output_feature_dimension = 3,
+      output_spatial_dimensions = [1, 2]
+    >, feature_group_count = 1 : i64, lhs_dilation = dense<2> : tensor<2xi64>, padding = dense<[[2, 1], [2, 1]]> : tensor<2x2xi64>,
+    precision_config = [#mhlo<precision DEFAULT>, #mhlo<precision DEFAULT>], rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>
+  } : (tensor<8x4x4x32xf32>, tensor<3x3x32x64xf32>) -> tensor<8x8x8x64xf32>
+  func.return %0 : tensor<8x8x8x64xf32>
+}
+
 // CHECK-LABEL:   func @convert_conv2d_valid_padding(
 // CHECK-SAME:                                       %[[VAL_0:.*]]: tensor<1x8x8x207xf32>,
 // CHECK-SAME:                                       %[[VAL_1:.*]]: tensor<3x3x207x16xf32>) -> tensor<1x6x6x16xf32> {
diff --git a/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo.cc b/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo.cc
index 0c1e157af27d3d..28df4946880648 100644
--- a/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo.cc
+++ b/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo.cc
@@ -772,8 +772,29 @@ class ConvertNonTrivialConvOp
                                   rewriter.getI32Type()),
             input_shape));
     // Mirror the filter in the spatial dimensions.
-    auto filter = rewriter.create<mhlo::ReverseOp>(
-        conv_op.getLoc(), conv_op.getRhs(),
+    mlir::Value reverse_filter_in = conv_op.getRhs();
+    // If the kernel is with format [0,1,i,o] we transpose it to [0,1,o,i]
+    // as the TF->TFL pass anticipates this and the kernel format information
+    // will be lost once we legalize to TF
+    if (isKernelFormatHWIO(dnums)) {
+      SmallVector<int64_t, 4> permutation;
+      for (int64_t dim : dnums.getInputSpatialDimensions()) {
+        permutation.push_back(dim - 1);
+      }
+      permutation.push_back(dnums.getKernelOutputFeatureDimension());
+      permutation.push_back(dnums.getKernelInputFeatureDimension());
+
+      auto filter_transposed = rewriter.create<mhlo::TransposeOp>(
+          conv_op.getLoc(), conv_op.getRhs(),
+          DenseIntElementsAttr::get(
+              RankedTensorType::get({static_cast<int64_t>(permutation.size())},
+                                    rewriter.getI64Type()),
+              permutation));
+      reverse_filter_in = filter_transposed;
+    }
+    mhlo::ReverseOp filter;
+    filter = rewriter.create<mhlo::ReverseOp>(
+        conv_op.getLoc(), reverse_filter_in,
         rewriter.getI64TensorAttr(dnums.getKernelSpatialDimensions()));
     rewriter.replaceOpWithNewOp<TF::Conv2DBackpropInputOp>(
         conv_op, conv_op.getType(), input_sizes, filter, conv_op.getLhs(),
@@ -803,6 +824,18 @@ class ConvertNonTrivialConvOp
     return true;
   }
 
+  bool isKernelFormatHWIO(mhlo::ConvDimensionNumbersAttr dnums) const {
+    int64_t num_spatial_dims = dnums.getKernelSpatialDimensions().size();
+    return dnums.getKernelInputFeatureDimension() == num_spatial_dims &&
+           dnums.getKernelOutputFeatureDimension() == num_spatial_dims + 1;
+  }
+
+  bool isKernelFormatHWOI(mhlo::ConvDimensionNumbersAttr dnums) const {
+    int64_t num_spatial_dims = dnums.getKernelSpatialDimensions().size();
+    return dnums.getKernelInputFeatureDimension() == num_spatial_dims + 1 &&
+           dnums.getKernelOutputFeatureDimension() == num_spatial_dims;
+  }
+
   LogicalResult IsSupportedConvOp(mhlo::ConvolutionOp conv_op,
                                   ConversionPatternRewriter& rewriter) const {
     if (!conv_op.getLhs().getType().cast<ShapedType>().hasStaticShape() ||
@@ -868,15 +901,14 @@ class ConvertNonTrivialConvOp
     }
 
     // Checks kernel dimensions.
-    if (dnums.getKernelInputFeatureDimension() != num_spatial_dims + 1 ||
-        dnums.getKernelOutputFeatureDimension() != num_spatial_dims)
-      return rewriter.notifyMatchFailure(conv_op,
-                                         "requires kernel format [0, 1, o, i]");
+    if (!isKernelFormatHWIO(dnums) && !isKernelFormatHWOI(dnums))
+      return rewriter.notifyMatchFailure(
+          conv_op, "requires kernel format [0, 1, o, i] or [0, 1, i, o]");
     auto kernel_spatial_dimensions = dnums.getKernelSpatialDimensions();
     for (auto p : llvm::enumerate(kernel_spatial_dimensions)) {
       if (p.value() != p.index())
         return rewriter.notifyMatchFailure(
-            conv_op, "requires kernel format [0, 1, o, i]");
+            conv_op, "requires kernel format [0, 1, o, i] or [0, 1, i, o]");
     }
 
     return success();

From 20b4d65b911a9752128a73a7cd866c22f804ecca Mon Sep 17 00:00:00 2001
From: George Karpenkov <cheshire@google.com>
Date: Thu, 28 Sep 2023 08:36:42 -0700
Subject: [PATCH 376/567] [XLA:GPU] [NFC] Remove redundant indirection layer

PiperOrigin-RevId: 569191826
---
 .../xla/mlir/backends/gpu/transforms/BUILD    |  6 +--
 .../xla/mlir/backends/gpu/transforms/passes.h |  4 +-
 third_party/xla/xla/service/gpu/BUILD         | 30 ++++----------
 .../xla/xla/service/gpu/amdgpu_compiler.cc    |  9 ++--
 .../xla/xla/service/gpu/amdgpu_compiler.h     |  8 ++--
 .../service/gpu/compile_module_to_llvm_ir.cc  | 14 +++----
 .../xla/xla/service/gpu/gemm_rewriter.cc      |  8 ++--
 .../xla/xla/service/gpu/gemm_rewriter.h       |  6 +--
 .../xla/service/gpu/gemm_rewriter_triton.cc   | 29 ++++++-------
 .../xla/service/gpu/gemm_rewriter_triton.h    | 13 +++---
 .../service/gpu/gemm_rewriter_triton_test.cc  |  3 +-
 .../xla/xla/service/gpu/gpu_compiler.cc       | 18 +++++---
 .../xla/xla/service/gpu/gpu_compiler.h        | 16 ++++----
 .../xla/xla/service/gpu/gpu_device_info.cc    |  8 +---
 .../xla/xla/service/gpu/gpu_device_info.h     |  4 +-
 .../xla/service/gpu/gpu_device_info_test.cc   | 41 +++++++++----------
 .../xla/xla/service/gpu/gpu_executable.cc     |  9 ++--
 .../xla/xla/service/gpu/gpu_executable.h      | 11 ++---
 third_party/xla/xla/service/gpu/gpu_types.h   | 37 -----------------
 .../xla/service/gpu/llvm_gpu_backend/BUILD    |  2 +-
 .../gpu/llvm_gpu_backend/gpu_backend_lib.cc   | 29 ++++++-------
 .../gpu/llvm_gpu_backend/gpu_backend_lib.h    |  6 +--
 .../gpu/loop_double_buffer_transformer.h      |  1 -
 .../xla/xla/service/gpu/nvptx_compiler.cc     |  9 ++--
 .../xla/xla/service/gpu/nvptx_compiler.h      |  8 ++--
 .../service/gpu/softmax_rewriter_triton.cc    | 27 ++++++------
 .../xla/service/gpu/softmax_rewriter_triton.h |  6 +--
 .../gpu/softmax_rewriter_triton_test.cc       | 10 ++---
 .../service/gpu/tree_reduction_rewriter.cc    |  4 +-
 .../xla/service/gpu/tree_reduction_rewriter.h |  6 +--
 .../xla/stream_executor/device_description.cc | 17 +++++++-
 .../xla/stream_executor/device_description.h  | 16 ++++----
 .../xla/xla/tests/llvm_compiler_test.cc       | 10 ++---
 33 files changed, 192 insertions(+), 233 deletions(-)
 delete mode 100644 third_party/xla/xla/service/gpu/gpu_types.h

diff --git a/third_party/xla/xla/mlir/backends/gpu/transforms/BUILD b/third_party/xla/xla/mlir/backends/gpu/transforms/BUILD
index c949db8a20a494..758b54e4ef28f4 100644
--- a/third_party/xla/xla/mlir/backends/gpu/transforms/BUILD
+++ b/third_party/xla/xla/mlir/backends/gpu/transforms/BUILD
@@ -1,6 +1,6 @@
-load("@local_tsl//tsl/platform:rules_cc.bzl", "cc_library")
-load("@local_tsl//tsl:tsl.default.bzl", "get_compatible_with_portable")
 load("@llvm-project//mlir:tblgen.bzl", "gentbl_cc_library")
+load("@local_tsl//tsl:tsl.default.bzl", "get_compatible_with_portable")
+load("@local_tsl//tsl/platform:rules_cc.bzl", "cc_library")
 
 package(
     default_visibility = ["//visibility:public"],
@@ -72,10 +72,10 @@ cc_library(
         "//xla/mlir_hlo:lhlo_gpu",
         "//xla/service/gpu:backend_configs_cc",
         "//xla/service/gpu:gpu_executable",
-        "//xla/service/gpu:gpu_types",
         "//xla/service/gpu:launch_dimensions",
         "//xla/service/gpu:nccl_collective_thunks",
         "//xla/stream_executor:blas",
+        "//xla/stream_executor:device_description",
         "//xla/translate/mhlo_to_hlo:location_exporter",
         "@com_google_absl//absl/log",
         "@com_google_absl//absl/status:statusor",
diff --git a/third_party/xla/xla/mlir/backends/gpu/transforms/passes.h b/third_party/xla/xla/mlir/backends/gpu/transforms/passes.h
index 87334be7f532fe..4c3fc987ea74b5 100644
--- a/third_party/xla/xla/mlir/backends/gpu/transforms/passes.h
+++ b/third_party/xla/xla/mlir/backends/gpu/transforms/passes.h
@@ -22,7 +22,7 @@ limitations under the License.
 #include "mlir/Dialect/Func/IR/FuncOps.h"  // from @llvm-project
 #include "mlir/IR/BuiltinOps.h"  // from @llvm-project
 #include "mlir/Pass/Pass.h"  // from @llvm-project
-#include "xla/service/gpu/gpu_types.h"
+#include "xla/stream_executor/device_description.h"
 
 namespace xla {
 namespace gpu {
@@ -47,7 +47,7 @@ struct GpuPipelineOpts {
   int32_t gpu_graph_level = 0;
   int32_t min_graph_size = 0;
   bool enable_concurrent_region = false;
-  GpuVersion compute_capability;
+  stream_executor::GpuComputeCapability compute_capability;
 };
 
 // Populate passes that lower MLIR modules from a combination of LMHLO and
diff --git a/third_party/xla/xla/service/gpu/BUILD b/third_party/xla/xla/service/gpu/BUILD
index ccb2d4a471ce35..e57c3c3e035b46 100644
--- a/third_party/xla/xla/service/gpu/BUILD
+++ b/third_party/xla/xla/service/gpu/BUILD
@@ -98,16 +98,6 @@ cc_library(
     visibility = ["//visibility:public"],
 )
 
-cc_library(
-    name = "gpu_types",
-    hdrs = ["gpu_types.h"],
-    compatible_with = get_compatible_with_portable(),
-    visibility = ["//visibility:public"],
-    deps = [
-        "//xla/stream_executor:device_description",
-    ],
-)
-
 cc_library(
     name = "launch_dimensions",
     srcs = [
@@ -254,9 +244,10 @@ cc_library(
     compatible_with = get_compatible_with_portable(),
     visibility = ["//visibility:public"],
     deps = [
-        ":gpu_types",
         "//xla/stream_executor",
+        "//xla/stream_executor:device_description",
         "//xla/stream_executor:device_description_proto_cc",
+        "//xla/stream_executor/gpu:gpu_types_header",
     ],
 )
 
@@ -280,7 +271,6 @@ xla_cc_test(
     deps = [
         ":gpu_device_info",
         ":gpu_device_info_for_tests",
-        ":gpu_types",
         "//xla/service:pattern_matcher_gmock",
         "//xla/stream_executor",
         "//xla/stream_executor:device_description",
@@ -998,7 +988,6 @@ cc_library(
         ":gpu_conv_runner",
         ":gpu_executable_run_options",
         ":gpu_fused_mha_runner",
-        ":gpu_types",
         ":io_feed_manager",
         ":ir_emission_utils",
         ":kernel_arguments",
@@ -1039,6 +1028,7 @@ cc_library(
         "//xla/service/gpu/runtime2:executable",
         "//xla/stream_executor",
         "//xla/stream_executor:blas",
+        "//xla/stream_executor:device_description",
         "//xla/stream_executor:device_memory",
         "//xla/stream_executor:device_memory_allocator",
         "//xla/stream_executor:scratch_allocator",
@@ -1254,7 +1244,6 @@ cc_library(
     deps = [
         ":backend_configs_cc",
         ":cublas_cudnn",
-        ":gpu_types",
         ":ir_emission_utils",
         ":matmul_utils",
         "//xla:shape_util",
@@ -1267,6 +1256,7 @@ cc_library(
         "//xla/service:hlo_pass",
         "//xla/service:pattern_matcher",
         "//xla/stream_executor:blas",
+        "//xla/stream_executor:device_description",
         "@com_google_absl//absl/algorithm:container",
         "@com_google_absl//absl/log",
         "@com_google_absl//absl/strings",
@@ -1286,7 +1276,6 @@ cc_library(
     deps = [
         ":backend_configs_cc",
         ":cublas_padding_requirements",
-        ":gpu_types",
         ":ir_emission_utils",
         ":matmul_utils",
         "//xla:autotuning_proto_cc",
@@ -1327,7 +1316,6 @@ cc_library(
     deps = [
         ":backend_configs_cc",
         ":gemm_rewriter_triton",
-        ":gpu_types",
         ":ir_emission_utils",
         "//xla:shape_util",
         "//xla:status",
@@ -1338,6 +1326,7 @@ cc_library(
         "//xla/hlo/ir:hlo",
         "//xla/hlo/utils:hlo_query",
         "//xla/service:hlo_pass",
+        "//xla/stream_executor:device_description",
         "@com_google_absl//absl/algorithm:container",
         "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/container:flat_hash_set",
@@ -1356,7 +1345,6 @@ xla_cc_test(
     deps = [
         ":cublas_padding_requirements",
         ":gemm_rewriter_triton",
-        ":gpu_types",
         "//xla:autotuning_proto_cc",
         "//xla:shape_util",
         "//xla:xla_data_proto_cc",
@@ -2060,13 +2048,13 @@ xla_cc_test(
     name = "softmax_rewriter_triton_test",
     srcs = ["softmax_rewriter_triton_test.cc"],
     deps = [
-        ":gpu_types",
         ":softmax_rewriter_triton",
         "//xla:shape_util",
         "//xla:statusor",
         "//xla/hlo/ir:hlo",
         "//xla/service:pattern_matcher",
         "//xla/service:pattern_matcher_gmock",
+        "//xla/stream_executor:device_description",
         "//xla/tests:hlo_test_base",
         "//xla/tests:xla_internal_test_main",  # build_cleaner: keep
         "@com_google_absl//absl/base:core_headers",
@@ -2750,7 +2738,7 @@ cc_library(
         "//xla/service:while_loop_simplifier",
         "//xla/service:while_loop_trip_count_annotator",
         "//xla/service:zero_sized_hlo_elimination",
-        "//xla/service/gpu:gpu_types",
+        "//xla/stream_executor:device_description",
         "//xla/service/llvm_ir:llvm_util",
         "//xla/service/spmd:collective_permute_motion",
         "//xla/service/spmd:stateful_rng_spmd_partitioner",
@@ -4007,7 +3995,6 @@ cc_library(
     hdrs = ["tree_reduction_rewriter.h"],
     visibility = ["//visibility:public"],
     deps = [
-        ":gpu_types",
         ":reduction_utils",
         "//xla:shape_util",
         "//xla:statusor",
@@ -4016,6 +4003,7 @@ cc_library(
         "//xla/hlo/ir:hlo",
         "//xla/service:collective_ops_utils",
         "//xla/service:hlo_pass",
+        "//xla/stream_executor:device_description",
         "@com_google_absl//absl/algorithm:container",
         "@com_google_absl//absl/numeric:bits",
         "@com_google_absl//absl/strings",
@@ -4619,7 +4607,6 @@ cc_library(
     hdrs = ["loop_double_buffer_transformer.h"],
     visibility = ["//visibility:public"],
     deps = [
-        ":gpu_types",
         "//xla:status",
         "//xla:statusor",
         "//xla:util",
@@ -4628,6 +4615,7 @@ cc_library(
         "//xla/hlo/utils:hlo_query",
         "//xla/service:collective_ops_utils",
         "//xla/service:hlo_pass",
+        "//xla/stream_executor/gpu:gpu_types_header",
         "@com_google_absl//absl/algorithm:container",
         "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/container:flat_hash_set",
diff --git a/third_party/xla/xla/service/gpu/amdgpu_compiler.cc b/third_party/xla/xla/service/gpu/amdgpu_compiler.cc
index 74bd6c6c01d699..6c8845d6a89a28 100644
--- a/third_party/xla/xla/service/gpu/amdgpu_compiler.cc
+++ b/third_party/xla/xla/service/gpu/amdgpu_compiler.cc
@@ -78,7 +78,7 @@ std::string GetROCDLDir(const HloModuleConfig& config) {
 }  // namespace
 
 Status AMDGPUCompiler::OptimizeHloConvolutionCanonicalization(
-    HloModule* hlo_module, GpuVersion gpu_version,
+    HloModule* hlo_module, se::GpuComputeCapability gpu_version,
     se::dnn::VersionInfo dnn_version,
     se::DeviceMemoryAllocator* device_allocator) {
   // Convert convolutions into CustomCalls to MIOpen, then canonicalize them
@@ -194,14 +194,11 @@ AMDGPUCompiler::AMDGPUCompiler()
     : GpuCompiler(stream_executor::rocm::kROCmPlatformId,
                   amdgpu::TargetTriple(), amdgpu::DataLayout()) {}
 
-GpuVersion AMDGPUCompiler::GetGpuVersion(se::StreamExecutor* stream_exec) {
-  return stream_exec->GetDeviceDescription().rocm_compute_capability();
-}
-
 StatusOr<std::pair<std::string, std::vector<uint8_t>>>
 AMDGPUCompiler::CompileTargetBinary(const HloModuleConfig& module_config,
                                     llvm::Module* llvm_module,
-                                    GpuVersion gpu_version, bool relocatable,
+                                    se::GpuComputeCapability gpu_version,
+                                    bool relocatable,
                                     const HloModule* debug_module,
                                     const CompileOptions& options) {
   if (rocdl_dir_.empty()) {
diff --git a/third_party/xla/xla/service/gpu/amdgpu_compiler.h b/third_party/xla/xla/service/gpu/amdgpu_compiler.h
index c05d256d2e32ea..731d66c571098a 100644
--- a/third_party/xla/xla/service/gpu/amdgpu_compiler.h
+++ b/third_party/xla/xla/service/gpu/amdgpu_compiler.h
@@ -34,7 +34,7 @@ class AMDGPUCompiler : public GpuCompiler {
   AMDGPUCompiler();
 
   Status OptimizeHloConvolutionCanonicalization(
-      HloModule* hlo_module, GpuVersion gpu_version,
+      HloModule* hlo_module, se::GpuComputeCapability gpu_version,
       se::dnn::VersionInfo dnn_version,
       se::DeviceMemoryAllocator* device_allocator) override;
 
@@ -58,12 +58,10 @@ class AMDGPUCompiler : public GpuCompiler {
   Status SerializeAutotuneResultsToFile(
       const DebugOptions& debug_options) override;
 
-  GpuVersion GetGpuVersion(se::StreamExecutor* stream_exec) override;
-
   StatusOr<std::pair<std::string, std::vector<uint8_t>>> CompileTargetBinary(
       const HloModuleConfig& module_config, llvm::Module* llvm_module,
-      GpuVersion gpu_version, bool relocatable, const HloModule* debug_module,
-      const CompileOptions& options) override;
+      se::GpuComputeCapability gpu_version, bool relocatable,
+      const HloModule* debug_module, const CompileOptions& options) override;
 
  private:
   // The parent directory of ROCm-Device-Libs IR libraries.
diff --git a/third_party/xla/xla/service/gpu/compile_module_to_llvm_ir.cc b/third_party/xla/xla/service/gpu/compile_module_to_llvm_ir.cc
index e68fb0215d502f..66ef3a2cd95470 100644
--- a/third_party/xla/xla/service/gpu/compile_module_to_llvm_ir.cc
+++ b/third_party/xla/xla/service/gpu/compile_module_to_llvm_ir.cc
@@ -107,12 +107,11 @@ static bool HasFp8(const HloModule& hlo_module) {
 }
 
 // Lowers MLIR module to the XLA Gpu runtime custom calls.
-static Status LowerToXlaGpuRuntime(mlir::ModuleOp module,
-                                   llvm::StringRef entry_function_name,
-                                   llvm::ArrayRef<int64_t> buffer_sizes,
-                                   ThunkSequence* thunk_sequence,
-                                   const DebugOptions& debug_options,
-                                   GpuVersion compute_capability) {
+static Status LowerToXlaGpuRuntime(
+    mlir::ModuleOp module, llvm::StringRef entry_function_name,
+    llvm::ArrayRef<int64_t> buffer_sizes, ThunkSequence* thunk_sequence,
+    const DebugOptions& debug_options,
+    se::GpuComputeCapability compute_capability) {
   if (!module) {
     return InternalError("No MLIR module to lower.");
   }
@@ -194,7 +193,8 @@ StatusOr<GpuExecutable::OwnedGpuRuntimeProgram> LowerToJitRt(
     mlir::ModuleOp mlir_module, llvm::StringRef entry_function_name,
     llvm::ArrayRef<int64_t> buffer_sizes, const HloModuleConfig& module_config,
     std::unique_ptr<ThunkSequence> thunk_sequence,
-    const HloModule* hlo_module_for_dump, GpuVersion compute_capability) {
+    const HloModule* hlo_module_for_dump,
+    se::GpuComputeCapability compute_capability) {
   // Forward collective (NCCL) attributes for use by the lowering pipeline.
   ForwardCollectiveAttrs(mlir_module, entry_function_name, module_config);
 
diff --git a/third_party/xla/xla/service/gpu/gemm_rewriter.cc b/third_party/xla/xla/service/gpu/gemm_rewriter.cc
index b8084049a5aa07..cc3dc4ed1fc721 100644
--- a/third_party/xla/xla/service/gpu/gemm_rewriter.cc
+++ b/third_party/xla/xla/service/gpu/gemm_rewriter.cc
@@ -464,7 +464,7 @@ auto OptionalBitcast(HloInstruction **optional_bitcast, Pattern pattern) {
 // when the output of the GEMM is requested in FP8 format.
 class GemmRewriterVisitor : public DfsHloRewriteVisitor {
  public:
-  explicit GemmRewriterVisitor(GpuVersion gpu_version)
+  explicit GemmRewriterVisitor(se::GpuComputeCapability gpu_version)
       : gpu_version_(gpu_version) {}
 
   Status HandleDot(HloInstruction *instr) override {
@@ -1528,7 +1528,7 @@ class GemmRewriterVisitor : public DfsHloRewriteVisitor {
   }
 
  private:
-  GpuVersion gpu_version_;
+  se::GpuComputeCapability gpu_version_;
 
   // Choose cublas or cublasLt for the target of the custom call that instr will
   // be rewritten into.
@@ -1937,7 +1937,7 @@ class GemmRewriterVisitor : public DfsHloRewriteVisitor {
 };
 
 StatusOr<bool> RunOnComputation(HloComputation *computation,
-                                GpuVersion gpu_version) {
+                                se::GpuComputeCapability gpu_version) {
   GemmRewriterVisitor visitor(gpu_version);
   TF_RETURN_IF_ERROR(computation->Accept(&visitor));
   return visitor.changed();
@@ -1945,7 +1945,7 @@ StatusOr<bool> RunOnComputation(HloComputation *computation,
 
 }  // anonymous namespace
 
-GemmRewriter::GemmRewriter(GpuVersion gpu_version)
+GemmRewriter::GemmRewriter(se::GpuComputeCapability gpu_version)
     : gpu_version_(gpu_version) {}
 
 StatusOr<bool> GemmRewriter::Run(
diff --git a/third_party/xla/xla/service/gpu/gemm_rewriter.h b/third_party/xla/xla/service/gpu/gemm_rewriter.h
index 070df7de91f7b7..65efd940450e46 100644
--- a/third_party/xla/xla/service/gpu/gemm_rewriter.h
+++ b/third_party/xla/xla/service/gpu/gemm_rewriter.h
@@ -19,8 +19,8 @@ limitations under the License.
 
 #include "xla/hlo/ir/hlo_instructions.h"
 #include "xla/hlo/ir/hlo_module.h"
-#include "xla/service/gpu/gpu_types.h"
 #include "xla/service/hlo_pass_interface.h"
+#include "xla/stream_executor/device_description.h"
 
 namespace xla {
 namespace gpu {
@@ -44,7 +44,7 @@ namespace gpu {
 // stored in the backend config.
 class GemmRewriter : public HloModulePass {
  public:
-  explicit GemmRewriter(GpuVersion gpu_version);
+  explicit GemmRewriter(se::GpuComputeCapability gpu_version);
   absl::string_view name() const override { return "cublas-gemm-rewriter"; }
 
   using HloPassInterface::Run;
@@ -53,7 +53,7 @@ class GemmRewriter : public HloModulePass {
       const absl::flat_hash_set<absl::string_view>& execution_threads) override;
 
  private:
-  GpuVersion gpu_version_;
+  se::GpuComputeCapability gpu_version_;
 };
 
 }  // namespace gpu
diff --git a/third_party/xla/xla/service/gpu/gemm_rewriter_triton.cc b/third_party/xla/xla/service/gpu/gemm_rewriter_triton.cc
index 59cecc4dba6e77..5ec96be47f72f6 100644
--- a/third_party/xla/xla/service/gpu/gemm_rewriter_triton.cc
+++ b/third_party/xla/xla/service/gpu/gemm_rewriter_triton.cc
@@ -53,7 +53,6 @@ limitations under the License.
 #include "xla/permutation_util.h"
 #include "xla/service/gpu/backend_configs.pb.h"
 #include "xla/service/gpu/cublas_padding_requirements.h"
-#include "xla/service/gpu/gpu_types.h"
 #include "xla/service/gpu/ir_emission_utils.h"
 #include "xla/service/gpu/matmul_utils.h"
 #include "xla/service/hlo_creation_utils.h"
@@ -166,8 +165,8 @@ bool IsDistributiveOverAddition(const HloInstruction& hlo) {
   return false;
 }
 
-FusionDecision RequireTritonFusibleConvert(const HloInstruction* input,
-                                           GpuVersion gpu_version) {
+FusionDecision RequireTritonFusibleConvert(
+    const HloInstruction* input, se::GpuComputeCapability gpu_version) {
   // TODO(b/266862494): Can pick up almost any
   // convert, but if it's reducing the data volume it should rather be fused
   // to the output of the producer kernel. However not all operations support
@@ -438,7 +437,7 @@ class FusionContext {
       const HloInstruction& hlo, bool as_input,
       absl::flat_hash_map<const HloInstruction*, HloInstruction*>&
           old_to_new_mapping,
-      GpuVersion gpu_version) const;
+      se::GpuComputeCapability gpu_version) const;
   // Add dimension orders from `updates` to `dim_orders_` and update the
   // splittable dimension ratio if all of them are compatible.
   bool MergeUpdates(const DimOrderUpdates& updates);
@@ -446,7 +445,7 @@ class FusionContext {
   // If an input is not fusible stop there and make a parameter of the new
   // fusion, otherwise put it onto stack and check its own inputs first.
   void TryToFuseWithInputsRecursively(
-      HloInstruction& root, GpuVersion gpu_version,
+      HloInstruction& root, se::GpuComputeCapability gpu_version,
       absl::flat_hash_map<const HloInstruction*, HloInstruction*>&
           old_to_new_mapping,
       std::vector<HloInstruction*>& fusion_inputs,
@@ -1076,7 +1075,7 @@ DimOrderUpdatesOrError FusionContext::AnalyzeForFusion(
     const HloInstruction& hlo, bool as_input,
     absl::flat_hash_map<const HloInstruction*, HloInstruction*>&
         old_to_new_mapping,
-    const GpuVersion gpu_version) const {
+    const se::GpuComputeCapability gpu_version) const {
   int fusion_level =
       hlo.GetModule()->config().debug_options().xla_gpu_triton_fusion_level();
   if (!std::get<se::CudaComputeCapability>(gpu_version)
@@ -1224,7 +1223,7 @@ bool FusionContext::MergeUpdates(const DimOrderUpdates& updates) {
 }
 
 void FusionContext::TryToFuseWithInputsRecursively(
-    HloInstruction& root, const GpuVersion gpu_version,
+    HloInstruction& root, const se::GpuComputeCapability gpu_version,
     absl::flat_hash_map<const HloInstruction*, HloInstruction*>&
         old_to_new_mapping,
     std::vector<HloInstruction*>& fusion_inputs,
@@ -1293,7 +1292,7 @@ void FusionContext::TryToFuseWithInputsRecursively(
 // call to this fusion. fusion_output_ptr (if not nullptr) gets assigned the
 // original instruction that has to be replaced by the call to the fusion.
 StatusOr<FusionDecision> FuseDot(HloInstruction& dot,
-                                 const GpuVersion gpu_version,
+                                 const se::GpuComputeCapability gpu_version,
                                  HloComputation::Builder& builder,
                                  std::vector<HloInstruction*>& fusion_inputs,
                                  HloInstruction** fusion_output_ptr) {
@@ -1383,7 +1382,7 @@ StatusOr<FusionDecision> FuseDot(HloInstruction& dot,
 // operations that can target the triton GEMM emitter.
 class GemmRewriterTritonVisitor : public DfsHloRewriteVisitor {
  public:
-  explicit GemmRewriterTritonVisitor(const GpuVersion gpu_version)
+  explicit GemmRewriterTritonVisitor(const se::GpuComputeCapability gpu_version)
       : gpu_version_(gpu_version) {}
   // Checks that a dot() should be targeting the triton GEMM emitter;
   // if so - fuses all its compatible inputs and outputs as a new computation
@@ -1437,11 +1436,11 @@ class GemmRewriterTritonVisitor : public DfsHloRewriteVisitor {
   }
 
  private:
-  GpuVersion gpu_version_;
+  se::GpuComputeCapability gpu_version_;
 };
 
 StatusOr<bool> RunOnComputation(HloComputation* computation,
-                                GpuVersion gpu_version) {
+                                se::GpuComputeCapability gpu_version) {
   GemmRewriterTritonVisitor visitor(gpu_version);
   TF_RETURN_IF_ERROR(computation->Accept(&visitor));
   return visitor.changed();
@@ -1752,7 +1751,8 @@ Status FusionContext::PropagateDimensionOrdersToParameters(
 }  // anonymous namespace
 
 // Data types that are supported by the Triton emitters.
-bool IsTritonSupportedDataType(PrimitiveType type, GpuVersion gpu_version) {
+bool IsTritonSupportedDataType(PrimitiveType type,
+                               se::GpuComputeCapability gpu_version) {
   auto cuda_compute_capability =
       std::get<se::CudaComputeCapability>(gpu_version);
   switch (type) {
@@ -1965,7 +1965,7 @@ const DimIterationSpec* TritonFusionAnalysis::IterSpec(
 }
 
 FusionDecision CanTritonHandleGEMM(const HloInstruction& dot,
-                                   const GpuVersion gpu_version) {
+                                   const se::GpuComputeCapability gpu_version) {
   if (dot.opcode() != HloOpcode::kDot ||
       absl::c_any_of(dot.precision_config().operand_precision(),
                      [](int x) { return x != PrecisionConfig::DEFAULT; })) {
@@ -2019,7 +2019,8 @@ FusionDecision CanTritonHandleGEMM(const HloInstruction& dot,
   return FusionDecision{};
 }
 
-bool ShouldTritonHandleGEMM(HloInstruction& dot, const GpuVersion gpu_version) {
+bool ShouldTritonHandleGEMM(HloInstruction& dot,
+                            const se::GpuComputeCapability gpu_version) {
   std::vector<HloInstruction*> fusion_inputs;
   HloComputation::Builder builder("disposable");
   return FuseDot(dot, gpu_version, builder, fusion_inputs,
diff --git a/third_party/xla/xla/service/gpu/gemm_rewriter_triton.h b/third_party/xla/xla/service/gpu/gemm_rewriter_triton.h
index f777339dda384d..d3b10097c9e428 100644
--- a/third_party/xla/xla/service/gpu/gemm_rewriter_triton.h
+++ b/third_party/xla/xla/service/gpu/gemm_rewriter_triton.h
@@ -28,11 +28,11 @@ limitations under the License.
 #include "xla/hlo/ir/hlo_instruction.h"
 #include "xla/hlo/ir/hlo_module.h"
 #include "xla/hlo/ir/hlo_opcode.h"
-#include "xla/service/gpu/gpu_types.h"
 #include "xla/service/hlo_pass_interface.h"
 #include "xla/service/instruction_fusion.h"
 #include "xla/status.h"
 #include "xla/statusor.h"
+#include "xla/stream_executor/device_description.h"
 #include "xla/xla_data.pb.h"
 
 namespace xla {
@@ -53,7 +53,7 @@ std::vector<HloOpcode> TritonSupportedBinaryElementwise(PrimitiveType);
 std::vector<HloOpcode> TritonSupportedTernaryElementwise(PrimitiveType);
 
 // Data types that are supported by the Triton emitters.
-bool IsTritonSupportedDataType(PrimitiveType, GpuVersion);
+bool IsTritonSupportedDataType(PrimitiveType, se::GpuComputeCapability);
 
 // Checks elementwise operation against all supported by Triton GEMM codegen.
 bool IsTritonSupportedElementwise(HloOpcode, PrimitiveType);
@@ -66,10 +66,11 @@ Status MakeDotSplitKBatch(HloInstruction* dot_fusion,
 
 // Filters GEMMs which can be handled using Triton.
 FusionDecision CanTritonHandleGEMM(const HloInstruction&,
-                                   GpuVersion gpu_version);
+                                   se::GpuComputeCapability gpu_version);
 
 // Filters GEMMs which are better to handle using Triton.
-bool ShouldTritonHandleGEMM(HloInstruction&, GpuVersion gpu_version);
+bool ShouldTritonHandleGEMM(HloInstruction&,
+                            se::GpuComputeCapability gpu_version);
 
 class TensorIterationSpec {
  public:
@@ -162,7 +163,7 @@ class TritonFusionAnalysis {
 // that target Triton-based matmul emitter.
 class GemmRewriterTriton : public HloModulePass {
  public:
-  explicit GemmRewriterTriton(GpuVersion gpu_version)
+  explicit GemmRewriterTriton(se::GpuComputeCapability gpu_version)
       : gpu_version_(gpu_version) {}
   absl::string_view name() const override { return "triton-gemm-rewriter"; }
 
@@ -172,7 +173,7 @@ class GemmRewriterTriton : public HloModulePass {
       const absl::flat_hash_set<absl::string_view>& execution_threads) override;
 
  private:
-  GpuVersion gpu_version_;
+  se::GpuComputeCapability gpu_version_;
 };
 
 }  // namespace gpu
diff --git a/third_party/xla/xla/service/gpu/gemm_rewriter_triton_test.cc b/third_party/xla/xla/service/gpu/gemm_rewriter_triton_test.cc
index c72884cf3e912e..64c252a8997b1b 100644
--- a/third_party/xla/xla/service/gpu/gemm_rewriter_triton_test.cc
+++ b/third_party/xla/xla/service/gpu/gemm_rewriter_triton_test.cc
@@ -29,7 +29,6 @@ limitations under the License.
 #include "xla/hlo/ir/hlo_opcode.h"
 #include "xla/layout.h"
 #include "xla/service/gpu/cublas_padding_requirements.h"
-#include "xla/service/gpu/gpu_types.h"
 #include "xla/service/pattern_matcher.h"
 #include "xla/service/pattern_matcher_gmock.h"
 #include "xla/shape_util.h"
@@ -73,7 +72,7 @@ class GemmRewriterTritonTest : public HloTestBase {
       : HloTestBase(/*verifier_layout_sensitive=*/true,
                     /*allow_mixed_precision_in_hlo_verifier=*/false) {}
 
-  GpuVersion gpu_version_{
+  se::GpuComputeCapability gpu_version_{
       se::CudaComputeCapability{se::CudaComputeCapability::AMPERE, 0}};
 };
 
diff --git a/third_party/xla/xla/service/gpu/gpu_compiler.cc b/third_party/xla/xla/service/gpu/gpu_compiler.cc
index de7585f516d455..4688c09be1fa5f 100644
--- a/third_party/xla/xla/service/gpu/gpu_compiler.cc
+++ b/third_party/xla/xla/service/gpu/gpu_compiler.cc
@@ -114,7 +114,6 @@ limitations under the License.
 #include "xla/service/gpu/gpu_sanitize_constant_names.h"
 #include "xla/service/gpu/gpu_scatter_expander.h"
 #include "xla/service/gpu/gpu_shape_verifier.h"
-#include "xla/service/gpu/gpu_types.h"
 #include "xla/service/gpu/hlo_fusion_stats.h"
 #include "xla/service/gpu/horizontal_loop_fusion.h"
 #include "xla/service/gpu/ir_emission_utils.h"
@@ -694,7 +693,8 @@ Status GpuCompiler::OptimizeHloModule(HloModule* hlo_module,
 
   // Run target-specific HLO optimization passes for convolution
   // canonicalization.
-  GpuVersion gpu_version = gpu_target_config.gpu_device_info.compute_capability;
+  se::GpuComputeCapability gpu_version =
+      gpu_target_config.gpu_device_info.compute_capability;
   se::dnn::VersionInfo dnn_version = gpu_target_config.dnn_version_info;
   if (stream_exec != nullptr) {
     gpu_version = GetGpuVersion(stream_exec);
@@ -896,10 +896,8 @@ Status GpuCompiler::OptimizeHloPostLayoutAssignment(
     tsl::thread::ThreadPool* thread_pool) {
   // Constants:
   const DebugOptions& debug_options = hlo_module->config().debug_options();
-  const GpuVersion gpu_version =
+  const se::GpuComputeCapability gpu_version =
       gpu_target_config.gpu_device_info.compute_capability;
-  const se::CudaComputeCapability* const cuda_cc =
-      std::get_if<se::CudaComputeCapability>(&gpu_version);
   const AlgebraicSimplifierOptions simplifier_options = [&] {
     AlgebraicSimplifierOptions opts;
     opts.set_supports_non_canonical_dots(false);
@@ -956,6 +954,9 @@ Status GpuCompiler::OptimizeHloPostLayoutAssignment(
     pipeline.AddPass<HloPassFix<MoveCopyToUsers>>();
 
     // Rewrite GEMMs into custom calls.
+    se::GpuComputeCapability gpu_version =
+        gpu_target_config.gpu_device_info.compute_capability;
+    const auto* cuda_cc = std::get_if<se::CudaComputeCapability>(&gpu_version);
     if (debug_options.xla_gpu_enable_triton_gemm() && cuda_cc != nullptr &&
         cuda_cc->IsAtLeast(se::CudaComputeCapability::VOLTA)) {
       pipeline.AddPass<GemmRewriterTriton>(gpu_version);
@@ -1196,7 +1197,7 @@ static void NullDiagnosticHandler(const llvm::DiagnosticInfo& diag_info,
 StatusOr<std::pair<std::string, std::vector<uint8_t>>>
 GpuCompiler::CompileToTargetBinary(const HloModuleConfig& module_config,
                                    std::unique_ptr<llvm::Module> llvm_module,
-                                   GpuVersion gpu_version,
+                                   se::GpuComputeCapability gpu_version,
                                    se::StreamExecutor* stream_exec,
                                    const CompileOptions& options,
                                    const HloModule* debug_module) {
@@ -1793,5 +1794,10 @@ Status GpuCompiler::RunPostSchedulingPipelines(
   return OkStatus();
 }
 
+se::GpuComputeCapability GpuCompiler::GetGpuVersion(
+    se::StreamExecutor* stream_exec) {
+  return stream_exec->GetDeviceDescription().gpu_compute_capability();
+}
+
 }  // namespace gpu
 }  // namespace xla
diff --git a/third_party/xla/xla/service/gpu/gpu_compiler.h b/third_party/xla/xla/service/gpu/gpu_compiler.h
index f806e3f492093a..765df534908ffe 100644
--- a/third_party/xla/xla/service/gpu/gpu_compiler.h
+++ b/third_party/xla/xla/service/gpu/gpu_compiler.h
@@ -122,7 +122,8 @@ class GpuCompiler : public LLVMCompiler {
   StatusOr<std::unique_ptr<BufferAssignment>> AssignBuffers(
       HloModule* hlo_module, se::StreamExecutor* stream_exec) override;
 
-  virtual GpuVersion GetGpuVersion(se::StreamExecutor* stream_exec) = 0;
+  se::GpuComputeCapability GetGpuVersion(se::StreamExecutor* stream_exec);
+
   GpuTargetConfig GetGpuTargetConfig(se::StreamExecutor* stream_exec) {
     GpuTargetConfig gpu_target_config;
     gpu_target_config.gpu_device_info = GetGpuDeviceInfo(stream_exec);
@@ -141,9 +142,9 @@ class GpuCompiler : public LLVMCompiler {
 
   StatusOr<std::pair<std::string, std::vector<uint8_t>>> CompileToTargetBinary(
       const HloModuleConfig& module_config,
-      std::unique_ptr<llvm::Module> llvm_module, GpuVersion gpu_version,
-      se::StreamExecutor* stream_exec, const CompileOptions& options,
-      const HloModule* debug_module);
+      std::unique_ptr<llvm::Module> llvm_module,
+      se::GpuComputeCapability gpu_version, se::StreamExecutor* stream_exec,
+      const CompileOptions& options, const HloModule* debug_module);
 
   se::Platform::Id PlatformId() const override { return platform_id_; }
 
@@ -229,7 +230,7 @@ class GpuCompiler : public LLVMCompiler {
                            const AutotuneResults* autotune_results);
 
   virtual Status OptimizeHloConvolutionCanonicalization(
-      HloModule* hlo_module, GpuVersion gpu_version,
+      HloModule* hlo_module, se::GpuComputeCapability gpu_version,
       se::dnn::VersionInfo dnn_version,
       se::DeviceMemoryAllocator* device_allocator) = 0;
 
@@ -241,8 +242,9 @@ class GpuCompiler : public LLVMCompiler {
   // that accommodates both HLO and MLIR.
   virtual StatusOr<std::pair<std::string, std::vector<uint8_t>>>
   CompileTargetBinary(const HloModuleConfig& module_config,
-                      llvm::Module* llvm_module, GpuVersion gpu_version,
-                      bool relocatable, const HloModule* debug_module,
+                      llvm::Module* llvm_module,
+                      se::GpuComputeCapability gpu_version, bool relocatable,
+                      const HloModule* debug_module,
                       const CompileOptions& options) = 0;
 
   Status PrepareHloModuleForIrEmitting(HloModule* hlo_module);
diff --git a/third_party/xla/xla/service/gpu/gpu_device_info.cc b/third_party/xla/xla/service/gpu/gpu_device_info.cc
index 3672a17dc08efc..160d86b7bb43ce 100644
--- a/third_party/xla/xla/service/gpu/gpu_device_info.cc
+++ b/third_party/xla/xla/service/gpu/gpu_device_info.cc
@@ -23,13 +23,7 @@ namespace {
 GpuDeviceInfo GetGpuDeviceInfo(
     const stream_executor::DeviceDescription& device) {
   GpuDeviceInfo device_info;
-  if (auto cc = device.cuda_compute_capability(); cc.major > 0) {
-    device_info.compute_capability = cc;
-  } else {
-    // Strip the gcn_arch_name down to just the gfx... part.
-    device_info.compute_capability = stream_executor::RocmComputeCapability(
-        device.rocm_compute_capability().gfx_version());
-  }
+  device_info.compute_capability = device.gpu_compute_capability();
   device_info.threads_per_block_limit = device.threads_per_block_limit();
   device_info.threads_per_warp = device.threads_per_warp();
   device_info.shared_memory_per_block = device.shared_memory_per_block();
diff --git a/third_party/xla/xla/service/gpu/gpu_device_info.h b/third_party/xla/xla/service/gpu/gpu_device_info.h
index c499cebeb788fc..7f6abc73845b85 100644
--- a/third_party/xla/xla/service/gpu/gpu_device_info.h
+++ b/third_party/xla/xla/service/gpu/gpu_device_info.h
@@ -19,7 +19,7 @@ limitations under the License.
 #include <string>
 #include <variant>
 
-#include "xla/service/gpu/gpu_types.h"
+#include "xla/stream_executor/device_description.h"
 #include "xla/stream_executor/device_description.pb.h"
 #include "xla/stream_executor/stream_executor.h"
 
@@ -30,7 +30,7 @@ namespace gpu {
 // se::DeviceDescription, but separating these out lets us write code that does
 // not depend on stream executor.
 struct GpuDeviceInfo {
-  GpuVersion compute_capability;
+  stream_executor::GpuComputeCapability compute_capability;
   int threads_per_block_limit;
   int threads_per_warp;
   int shared_memory_per_block;
diff --git a/third_party/xla/xla/service/gpu/gpu_device_info_test.cc b/third_party/xla/xla/service/gpu/gpu_device_info_test.cc
index da282f0638c34e..8d1ec3e65ef239 100644
--- a/third_party/xla/xla/service/gpu/gpu_device_info_test.cc
+++ b/third_party/xla/xla/service/gpu/gpu_device_info_test.cc
@@ -19,7 +19,6 @@ limitations under the License.
 
 #include "absl/strings/string_view.h"
 #include "xla/service/gpu/gpu_device_info_for_tests.h"
-#include "xla/service/gpu/gpu_types.h"
 #include "xla/stream_executor/device_description.h"
 #include "xla/stream_executor/gpu/gpu_executor.h"
 #include "tsl/platform/test.h"
@@ -65,7 +64,7 @@ TEST(DeviceInfoTest, DeviceInfoIsCorrect) {
     EXPECT_THAT(
         dev_info,
         ::testing::FieldsAre(
-            GpuVersion(se::CudaComputeCapability(6, 1)),
+            se::GpuComputeCapability(se::CudaComputeCapability(6, 1)),
             /*threads_per_block_limit=*/1024,
             /*threads_per_warp=*/32, /*shared_memory_per_block=*/48 * 1024,
             /*shared_memory_per_block_optin=*/48 * 1024,
@@ -79,23 +78,23 @@ TEST(DeviceInfoTest, DeviceInfoIsCorrect) {
             /*clock_rate_ghz=*/::testing::Ge(1.4),
             /*device_memory_size=*/4'234'346'496));
   } else if (name == "Tesla P100-SXM2-16GB") {
-    EXPECT_THAT(
-        dev_info,
-        ::testing::FieldsAre(GpuVersion(se::CudaComputeCapability(6, 0)),
-                             /*threads_per_block_limit=*/1024,
-                             /*threads_per_warp=*/32,
-                             /*shared_memory_per_block=*/48 * 1024,
-                             /*shared_memory_per_block_optin=*/48 * 1024,
-                             /*shared_memory_per_core=*/64 * 1024,
-                             /*threads_per_core_limit=*/2048, /*core_count=*/56,
-                             /*fpus_per_core=*/64,
-                             /*block_dim_limit_x=*/2'147'483'647,
-                             /*block_dim_limit_y=*/65535,
-                             /*block_dim_limit_z=*/65535,
-                             /*memory_bandwidth=*/732'160'000'000,
-                             /*l2_cache_size=*/4 * 1024 * 1024,
-                             /*clock_rate_ghz=*/::testing::Ge(1.4),
-                             /*device_memory_size=*/17'066'622'976));
+    EXPECT_THAT(dev_info,
+                ::testing::FieldsAre(
+                    se::GpuComputeCapability(se::CudaComputeCapability(6, 0)),
+                    /*threads_per_block_limit=*/1024,
+                    /*threads_per_warp=*/32,
+                    /*shared_memory_per_block=*/48 * 1024,
+                    /*shared_memory_per_block_optin=*/48 * 1024,
+                    /*shared_memory_per_core=*/64 * 1024,
+                    /*threads_per_core_limit=*/2048, /*core_count=*/56,
+                    /*fpus_per_core=*/64,
+                    /*block_dim_limit_x=*/2'147'483'647,
+                    /*block_dim_limit_y=*/65535,
+                    /*block_dim_limit_z=*/65535,
+                    /*memory_bandwidth=*/732'160'000'000,
+                    /*l2_cache_size=*/4 * 1024 * 1024,
+                    /*clock_rate_ghz=*/::testing::Ge(1.4),
+                    /*device_memory_size=*/17'066'622'976));
   }
 #if TF_ROCM_VERSION >= 50500
   else if (name == "AMD Instinct MI210") {  // NOLINT
@@ -116,7 +115,7 @@ TEST(DeviceInfoTest, DeviceInfoIsCorrect) {
     EXPECT_THAT(
         dev_info,
         ::testing::FieldsAre(
-            GpuVersion(se::RocmComputeCapability("gfx908")),
+            se::GpuComputeCapability(se::RocmComputeCapability("gfx908")),
             /*threads_per_block_limit=*/1024,
             /*threads_per_warp=*/64, /*shared_memory_per_block=*/64 * 1024,
             /*shared_memory_per_block_optin=*/0,
@@ -133,7 +132,7 @@ TEST(DeviceInfoTest, DeviceInfoIsCorrect) {
     EXPECT_THAT(
         dev_info,
         ::testing::FieldsAre(
-            GpuVersion(se::RocmComputeCapability("gfx906")),
+            se::GpuComputeCapability(se::RocmComputeCapability("gfx906")),
             /*threads_per_block_limit=*/1024,
             /*threads_per_warp=*/64, /*shared_memory_per_block=*/64 * 1024,
             /*shared_memory_per_block_optin=*/0,
diff --git a/third_party/xla/xla/service/gpu/gpu_executable.cc b/third_party/xla/xla/service/gpu/gpu_executable.cc
index 5796c662432a34..022263d02b8f4e 100644
--- a/third_party/xla/xla/service/gpu/gpu_executable.cc
+++ b/third_party/xla/xla/service/gpu/gpu_executable.cc
@@ -40,7 +40,6 @@ limitations under the License.
 #include "xla/service/buffer_assignment.h"
 #include "xla/service/gpu/buffer_allocations.h"
 #include "xla/service/gpu/gpu_constants.h"
-#include "xla/service/gpu/gpu_types.h"
 #include "xla/service/gpu/non_atomically_upgradeable_rw_lock.h"
 #include "xla/service/gpu/runtime/executable.h"
 #include "xla/service/gpu/runtime2/executable.h"
@@ -56,6 +55,7 @@ limitations under the License.
 #include "xla/status_macros.h"
 #include "xla/statusor.h"
 #include "xla/stream_executor/cuda/cuda_platform_id.h"
+#include "xla/stream_executor/device_description.h"
 #include "xla/stream_executor/device_memory.h"
 #include "xla/stream_executor/platform.h"
 #include "xla/stream_executor/rocm/rocm_platform_id.h"
@@ -196,7 +196,7 @@ Status GpuExecutable::CheckCompatibilityWithServiceExecutableRunOptions(
         << "AMDGPU GCN ISA version mismatch; expected {" << gpu_exec_arch
         << ", but was " << stream_arch;
   } else if (platform_id == stream_executor::cuda::kCudaPlatformId) {
-    GpuVersion cc = main_stream->GetCudaComputeCapability();
+    se::GpuComputeCapability cc = main_stream->GetCudaComputeCapability();
     TF_RET_CHECK(std::get<se::CudaComputeCapability>(cc) ==
                  std::get<se::CudaComputeCapability>(gpu_version_))
         << "Compute capability mismatch; expected {"
@@ -985,7 +985,8 @@ GetOutputInfo(const HloModule& hlo_module, const BufferAssignment& assignment) {
 GpuExecutable::GpuExecutable(
     std::shared_ptr<HloModule> hlo_module, std::string asm_text,
     std::vector<uint8_t> binary, std::vector<ConstantInfo> constants,
-    GpuVersion gpu_version, xla::EntryFunctionAttributes entry_func_attrs,
+    se::GpuComputeCapability gpu_version,
+    xla::EntryFunctionAttributes entry_func_attrs,
     absl::string_view module_name, Shape xla_output_shape,
     std::vector<BufferAllocation> allocations,
     absl::flat_hash_map<ShapeIndex, OutputInfo> output_info,
@@ -1080,7 +1081,7 @@ StatusOr<std::unique_ptr<Executable>> GpuExecutable::LoadFromObjFile(
     absl::string_view mlir_module,
     xla::EntryFunctionAttributes entry_func_attrs, DebugOptions debug_options,
     absl::string_view asm_text, absl::string_view binary,
-    std::vector<ConstantInfo> constants, GpuVersion gpu_version,
+    std::vector<ConstantInfo> constants, se::GpuComputeCapability gpu_version,
     se::StreamExecutor* executor) {
   VLOG(1) << "Load serialized Gpu executable from object file: module="
           << hlo_module->name();
diff --git a/third_party/xla/xla/service/gpu/gpu_executable.h b/third_party/xla/xla/service/gpu/gpu_executable.h
index c32db102365143..a8af42ccbc26b6 100644
--- a/third_party/xla/xla/service/gpu/gpu_executable.h
+++ b/third_party/xla/xla/service/gpu/gpu_executable.h
@@ -36,7 +36,6 @@ limitations under the License.
 #include "xla/service/buffer_assignment.h"
 #include "xla/service/executable.h"
 #include "xla/service/gpu/buffer_allocations.h"
-#include "xla/service/gpu/gpu_types.h"
 #include "xla/service/gpu/non_atomically_upgradeable_rw_lock.h"
 #include "xla/service/gpu/runtime/executable.h"
 #include "xla/service/gpu/runtime2/executable.h"
@@ -44,6 +43,7 @@ limitations under the License.
 #include "xla/service/hlo_execution_profile.h"
 #include "xla/service/shaped_buffer.h"
 #include "xla/statusor.h"
+#include "xla/stream_executor/device_description.h"
 #include "xla/stream_executor/device_memory_allocator.h"
 #include "xla/stream_executor/stream_executor.h"
 
@@ -88,7 +88,7 @@ class GpuExecutable : public Executable {
   struct Params {
     std::string asm_text;
     std::vector<uint8_t> binary;
-    GpuVersion gpu_version;
+    se::GpuComputeCapability gpu_version;
     // The GpuExecutable will either execute Thunks, XLA runtime executable
     // (native function) or experimental XLA runtime executable (IREE VM
     // function) depending on which is supplied.
@@ -133,7 +133,7 @@ class GpuExecutable : public Executable {
       absl::string_view mlir_module,
       xla::EntryFunctionAttributes entry_func_attrs, DebugOptions debug_options,
       absl::string_view asm_text, absl::string_view binary,
-      std::vector<ConstantInfo> constants, GpuVersion gpu_version,
+      std::vector<ConstantInfo> constants, se::GpuComputeCapability gpu_version,
       stream_executor::StreamExecutor* executor);
 
   // Constructor to use when loading a GpuExecutable from an object file (native
@@ -141,7 +141,8 @@ class GpuExecutable : public Executable {
   // used in XLA Runtime execution mode.
   GpuExecutable(std::shared_ptr<HloModule> hlo_module, std::string asm_text,
                 std::vector<uint8_t> binary,
-                std::vector<ConstantInfo> constants, GpuVersion gpu_version,
+                std::vector<ConstantInfo> constants,
+                se::GpuComputeCapability gpu_version,
                 xla::EntryFunctionAttributes entry_func_attrs,
                 absl::string_view module_name, Shape xla_output_shape,
                 std::vector<BufferAllocation> allocations,
@@ -283,7 +284,7 @@ class GpuExecutable : public Executable {
   const std::vector<uint8_t> binary_;
 #endif
   // The GPU version for compute compatibility check.
-  GpuVersion gpu_version_;
+  se::GpuComputeCapability gpu_version_;
 
   // The thunks to be invoked by this GpuExecutable. They are generated by the
   // IrEmitter (null if XLA:GPU runtime is enabled).
diff --git a/third_party/xla/xla/service/gpu/gpu_types.h b/third_party/xla/xla/service/gpu/gpu_types.h
deleted file mode 100644
index 54084fcea7c281..00000000000000
--- a/third_party/xla/xla/service/gpu/gpu_types.h
+++ /dev/null
@@ -1,37 +0,0 @@
-/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef XLA_SERVICE_GPU_GPU_TYPES_H_
-#define XLA_SERVICE_GPU_GPU_TYPES_H_
-
-#include <variant>
-
-#include "xla/stream_executor/device_description.h"
-
-namespace xla {
-namespace gpu {
-
-// GpuVersion is used to abstract Gpu hardware version.
-//
-// On Cuda platform, it comprises of se::CudaComputeCapability.
-// On ROCm platform, it comprises of se::RocmComputeCapability.
-//
-// TODO(csigg): rename to GpuComputeCapability.
-using GpuVersion = std::variant<stream_executor::CudaComputeCapability,
-                                stream_executor::RocmComputeCapability>;
-}  // namespace gpu
-}  // namespace xla
-
-#endif  // XLA_SERVICE_GPU_GPU_TYPES_H_
diff --git a/third_party/xla/xla/service/gpu/llvm_gpu_backend/BUILD b/third_party/xla/xla/service/gpu/llvm_gpu_backend/BUILD
index 84afcda09c35aa..0a530d4e88d3c2 100644
--- a/third_party/xla/xla/service/gpu/llvm_gpu_backend/BUILD
+++ b/third_party/xla/xla/service/gpu/llvm_gpu_backend/BUILD
@@ -33,10 +33,10 @@ cc_library(
         "//xla:types",
         "//xla:util",
         "//xla:xla_proto_cc",
-        "//xla/service/gpu:gpu_types",
         "//xla/service/gpu:metrics",
         "//xla/service/llvm_ir:llvm_command_line_options",
         "//xla/service/llvm_ir:llvm_type_conversion_util",
+        "//xla/stream_executor:device_description",
         "@com_google_absl//absl/base",
         "@com_google_absl//absl/memory",
         "@com_google_absl//absl/strings",
diff --git a/third_party/xla/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.cc b/third_party/xla/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.cc
index abe9d5bd315536..96970b06671aa8 100644
--- a/third_party/xla/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.cc
+++ b/third_party/xla/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.cc
@@ -259,7 +259,8 @@ Status LinkWithBitcodeVector(
   return OkStatus();
 }
 
-Status NVPTXTargetModuleLinker(llvm::Module* module, GpuVersion gpu_version,
+Status NVPTXTargetModuleLinker(llvm::Module* module,
+                               se::GpuComputeCapability gpu_version,
                                const DebugOptions& debug_options,
                                const std::string& device_bitcode_path) {
   // Link the input module with libdevice, to pull in implementations of some
@@ -295,8 +296,9 @@ std::unique_ptr<llvm::TargetMachine> NVPTXGetTargetMachine(
                           debug_options, ptx_ver);
 }
 
-using TargetModuleLinker = std::function<Status(
-    llvm::Module*, GpuVersion, const DebugOptions&, const std::string&)>;
+using TargetModuleLinker =
+    std::function<Status(llvm::Module*, se::GpuComputeCapability,
+                         const DebugOptions&, const std::string&)>;
 
 void DumpModule(const std::string output_filename, const llvm::Module* module) {
   std::error_code ec;
@@ -347,13 +349,11 @@ auto DumpCallbackForModule(std::string module_identifier,
   };
 }
 
-Status LinkAndOptimizeModule(llvm::Module* module, GpuVersion gpu_version,
-                             const DebugOptions& debug_options,
-                             const std::string& device_bitcode_path,
-                             TargetModuleLinker module_linker,
-                             llvm::Triple default_target_triple,
-                             llvm::TargetMachine* target_machine,
-                             int inline_threshold) {
+Status LinkAndOptimizeModule(
+    llvm::Module* module, se::GpuComputeCapability gpu_version,
+    const DebugOptions& debug_options, const std::string& device_bitcode_path,
+    TargetModuleLinker module_linker, llvm::Triple default_target_triple,
+    llvm::TargetMachine* target_machine, int inline_threshold) {
   TF_RETURN_IF_ERROR(
       module_linker(module, gpu_version, debug_options, device_bitcode_path));
 
@@ -550,7 +550,7 @@ Status LinkLibdeviceIfNecessary(llvm::Module* module,
 }
 
 StatusOr<std::string> CompileToPtx(
-    llvm::Module* module, GpuVersion gpu_version,
+    llvm::Module* module, se::GpuComputeCapability gpu_version,
     const DebugOptions& debug_options,
     std::function<void(llvm::TargetMachine*)> configure_target) {
   static absl::once_flag backend_init_flag;
@@ -816,7 +816,8 @@ Status LinkROCDLIfNecessary(llvm::Module* module, std::string gcn_arch_name,
                                GetROCDLPaths(gcn_arch_name, rocdl_dir_path));
 }
 
-Status AMDGPUTargetModuleLinker(llvm::Module* module, GpuVersion gpu_version,
+Status AMDGPUTargetModuleLinker(llvm::Module* module,
+                                se::GpuComputeCapability gpu_version,
                                 const DebugOptions& debug_options,
                                 const std::string& device_bitcode_dir_path) {
   // Link the input module with ROCDL.
@@ -889,7 +890,7 @@ std::pair<std::string, std::string> GetFeatureStrFromGCNArchName(
 }
 
 std::unique_ptr<llvm::TargetMachine> AMDGPUGetTargetMachine(
-    llvm::Triple target_triple, GpuVersion gpu_version,
+    llvm::Triple target_triple, se::GpuComputeCapability gpu_version,
     const DebugOptions& debug_options) {
   auto compute_capability =
       std::get_if<se::RocmComputeCapability>(&gpu_version);
@@ -923,7 +924,7 @@ void AMDGPUBackendInit(const DebugOptions& debug_options) {
 
 namespace amdgpu {
 StatusOr<std::vector<uint8_t>> CompileToHsaco(
-    llvm::Module* module, GpuVersion gpu_version,
+    llvm::Module* module, se::GpuComputeCapability gpu_version,
     const DebugOptions& debug_options, const std::string& rocdl_dir_path,
     const std::string& module_config_cache_key) {
   static absl::once_flag backend_init_flag;
diff --git a/third_party/xla/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.h b/third_party/xla/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.h
index e00595e2c7ac63..9ff362c2df3391 100644
--- a/third_party/xla/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.h
+++ b/third_party/xla/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.h
@@ -23,8 +23,8 @@ limitations under the License.
 #include "absl/strings/string_view.h"
 #include "llvm/IR/Module.h"
 #include "llvm/Target/TargetMachine.h"
-#include "xla/service/gpu/gpu_types.h"
 #include "xla/statusor.h"
+#include "xla/stream_executor/device_description.h"
 #include "xla/types.h"
 #include "xla/xla.pb.h"
 
@@ -51,7 +51,7 @@ Status LinkLibdeviceIfNecessary(llvm::Module* module,
 // thread safety, but note that LLVM's multithreaded support is very
 // preliminary; multithreaded use is not recommended at this time.
 StatusOr<std::string> CompileToPtx(
-    llvm::Module* module, GpuVersion gpu_version,
+    llvm::Module* module, se::GpuComputeCapability gpu_version,
     const DebugOptions& debug_options,
     std::function<void(llvm::TargetMachine*)> configure_target = nullptr);
 }  // namespace nvptx
@@ -61,7 +61,7 @@ namespace amdgpu {
 // rocdl_dir_path is the parent directory of ROCm-Device-Libs bitcode libraries.
 // The contents of the module may be changed.
 StatusOr<std::vector<uint8_t>> CompileToHsaco(
-    llvm::Module* module, GpuVersion gpu_version,
+    llvm::Module* module, se::GpuComputeCapability gpu_version,
     const DebugOptions& debug_options, const std::string& rocdl_dir_path,
     const std::string& module_config_cache_key);
 }  // namespace amdgpu
diff --git a/third_party/xla/xla/service/gpu/loop_double_buffer_transformer.h b/third_party/xla/xla/service/gpu/loop_double_buffer_transformer.h
index 3e10b445afbc49..b9fb70a1853f41 100644
--- a/third_party/xla/xla/service/gpu/loop_double_buffer_transformer.h
+++ b/third_party/xla/xla/service/gpu/loop_double_buffer_transformer.h
@@ -19,7 +19,6 @@ limitations under the License.
 
 #include "absl/strings/string_view.h"
 #include "xla/hlo/ir/hlo_module.h"
-#include "xla/service/gpu/gpu_types.h"
 #include "xla/service/hlo_pass_interface.h"
 #include "xla/statusor.h"
 
diff --git a/third_party/xla/xla/service/gpu/nvptx_compiler.cc b/third_party/xla/xla/service/gpu/nvptx_compiler.cc
index 787f6d69451d4c..433422cda60bf2 100644
--- a/third_party/xla/xla/service/gpu/nvptx_compiler.cc
+++ b/third_party/xla/xla/service/gpu/nvptx_compiler.cc
@@ -122,7 +122,7 @@ class ConvBfloat16Support : public FloatSupport {
 }  // namespace
 
 Status NVPTXCompiler::OptimizeHloConvolutionCanonicalization(
-    HloModule* hlo_module, GpuVersion gpu_version,
+    HloModule* hlo_module, se::GpuComputeCapability gpu_version,
     se::dnn::VersionInfo dnn_version,
     se::DeviceMemoryAllocator* device_allocator) {
   auto cuda_compute_capability =
@@ -469,14 +469,11 @@ HloDataflowAnalysis::CanShareBuffer NVPTXCompiler::GetCanShareBuffer() {
   return &CanShareBufferHint;
 }
 
-GpuVersion NVPTXCompiler::GetGpuVersion(se::StreamExecutor* stream_exec) {
-  return stream_exec->GetDeviceDescription().cuda_compute_capability();
-}
-
 StatusOr<std::pair<std::string, std::vector<uint8_t>>>
 NVPTXCompiler::CompileTargetBinary(const HloModuleConfig& module_config,
                                    llvm::Module* llvm_module,
-                                   GpuVersion gpu_version, bool relocatable,
+                                   se::GpuComputeCapability gpu_version,
+                                   bool relocatable,
                                    const HloModule* debug_module,
                                    const CompileOptions& options) {
   std::unique_ptr<llvm::Module> loaded_module =
diff --git a/third_party/xla/xla/service/gpu/nvptx_compiler.h b/third_party/xla/xla/service/gpu/nvptx_compiler.h
index c1de926b2b1120..6598e0f232408d 100644
--- a/third_party/xla/xla/service/gpu/nvptx_compiler.h
+++ b/third_party/xla/xla/service/gpu/nvptx_compiler.h
@@ -40,7 +40,7 @@ class NVPTXCompiler : public GpuCompiler {
   NVPTXCompiler();
 
   Status OptimizeHloConvolutionCanonicalization(
-      HloModule* hlo_module, GpuVersion gpu_version,
+      HloModule* hlo_module, se::GpuComputeCapability gpu_version,
       se::dnn::VersionInfo dnn_version,
       se::DeviceMemoryAllocator* device_allocator) override;
 
@@ -71,12 +71,10 @@ class NVPTXCompiler : public GpuCompiler {
 
   HloDataflowAnalysis::CanShareBuffer GetCanShareBuffer() override;
 
-  GpuVersion GetGpuVersion(se::StreamExecutor* stream_exec) override;
-
   StatusOr<std::pair<std::string, std::vector<uint8_t>>> CompileTargetBinary(
       const HloModuleConfig& module_config, llvm::Module* llvm_module,
-      GpuVersion gpu_version, bool relocatable, const HloModule* debug_module,
-      const CompileOptions& options) override;
+      se::GpuComputeCapability gpu_version, bool relocatable,
+      const HloModule* debug_module, const CompileOptions& options) override;
 
  private:
   StatusOr<bool> CanUseLinkModules(
diff --git a/third_party/xla/xla/service/gpu/softmax_rewriter_triton.cc b/third_party/xla/xla/service/gpu/softmax_rewriter_triton.cc
index 81206454385da3..68a0fb269555fa 100644
--- a/third_party/xla/xla/service/gpu/softmax_rewriter_triton.cc
+++ b/third_party/xla/xla/service/gpu/softmax_rewriter_triton.cc
@@ -32,13 +32,13 @@ limitations under the License.
 #include "xla/layout_util.h"
 #include "xla/service/gpu/backend_configs.pb.h"
 #include "xla/service/gpu/gemm_rewriter_triton.h"
-#include "xla/service/gpu/gpu_types.h"
 #include "xla/service/gpu/ir_emission_utils.h"
 #include "xla/shape.h"
 #include "xla/shape_util.h"
 #include "xla/status.h"
 #include "xla/status_macros.h"
 #include "xla/statusor.h"
+#include "xla/stream_executor/device_description.h"
 #include "xla/util.h"
 #include "xla/xla_data.pb.h"
 #include "tsl/platform/errors.h"
@@ -54,7 +54,7 @@ bool HasDefaultLayout(const Shape& shape) {
 }
 
 bool IsTritonSupportedInstruction(const HloInstruction* instr,
-                                  const GpuVersion& gpu_version) {
+                                  const se::GpuComputeCapability& gpu_version) {
   if (!instr->shape().IsArray()) {
     return false;
   }
@@ -90,10 +90,10 @@ bool IsTritonSupportedInstruction(const HloInstruction* instr,
 // set to it. The definition of "trivial" operations is as given in
 // 'IsTriviallyFusible'.
 bool TrivialEdge(HloInstruction** producer, HloInstruction* consumer,
-                 HloOpcode opcode, const GpuVersion& gpu_version);
+                 HloOpcode opcode, const se::GpuComputeCapability& gpu_version);
 
 bool BitcastIsTilingNoop(HloInstruction* bitcast,
-                         const GpuVersion& gpu_version) {
+                         const se::GpuComputeCapability& gpu_version) {
   CHECK_EQ(bitcast->opcode(), HloOpcode::kBitcast);
 
   if (ShapeUtil::IsEffectiveScalar(bitcast->shape())) {
@@ -144,7 +144,8 @@ HloInstruction* ChooseOperandForFusionProcessing(HloInstruction* instr) {
   return instr->mutable_operand(0);
 }
 
-bool IsTriviallyFusible(HloInstruction* instr, const GpuVersion& gpu_version,
+bool IsTriviallyFusible(HloInstruction* instr,
+                        const se::GpuComputeCapability& gpu_version,
                         int num_allowed_users = 1) {
   // Checks whether an op is trivially fusible. An op is said to be trivially
   // fusible if it does not increase the amount of memory read/written by the
@@ -189,7 +190,8 @@ bool IsTriviallyFusible(HloInstruction* instr, const GpuVersion& gpu_version,
 }
 
 bool TrivialEdge(HloInstruction** producer, HloInstruction* consumer,
-                 HloOpcode opcode, const GpuVersion& gpu_version) {
+                 HloOpcode opcode,
+                 const se::GpuComputeCapability& gpu_version) {
   while (consumer->opcode() != opcode) {
     if (IsTriviallyFusible(consumer, gpu_version)) {
       consumer = ChooseOperandForFusionProcessing(consumer);
@@ -202,9 +204,9 @@ bool TrivialEdge(HloInstruction** producer, HloInstruction* consumer,
   return true;
 }
 
-bool IsTriviallyConnectedProducerOf(HloInstruction* producer,
-                                    HloInstruction* consumer,
-                                    const GpuVersion& gpu_version) {
+bool IsTriviallyConnectedProducerOf(
+    HloInstruction* producer, HloInstruction* consumer,
+    const se::GpuComputeCapability& gpu_version) {
   if (producer == consumer) {
     return true;
   }
@@ -227,7 +229,7 @@ bool IsTriviallyConnectedProducerOf(HloInstruction* producer,
 }
 
 bool IsTritonSupportedComputation(const HloComputation* computation,
-                                  const GpuVersion& gpu_version) {
+                                  const se::GpuComputeCapability& gpu_version) {
   for (const HloInstruction* instr : computation->instructions()) {
     if (!IsTritonSupportedInstruction(instr, gpu_version)) {
       return false;
@@ -237,7 +239,7 @@ bool IsTritonSupportedComputation(const HloComputation* computation,
 }
 
 std::optional<HloInstruction*> MatchesTritonCompatibleClosedReductionDiamond(
-    HloInstruction* instr, const GpuVersion& gpu_version) {
+    HloInstruction* instr, const se::GpuComputeCapability& gpu_version) {
   // Return the producer of the following pattern:
   //
   // producer
@@ -318,7 +320,8 @@ std::optional<HloInstruction*> MatchesTritonCompatibleClosedReductionDiamond(
 //      that instruction is used more than once, and/or is not trivially
 //      fusible.
 HloInstruction* FindFirstNonFusibleDiamondProducer(
-    HloInstruction* diamond_producer, const GpuVersion& gpu_version) {
+    HloInstruction* diamond_producer,
+    const se::GpuComputeCapability& gpu_version) {
   if (IsTriviallyFusible(diamond_producer, gpu_version,
                          /*num_allowed_users=*/2)) {
     diamond_producer = ChooseOperandForFusionProcessing(diamond_producer);
diff --git a/third_party/xla/xla/service/gpu/softmax_rewriter_triton.h b/third_party/xla/xla/service/gpu/softmax_rewriter_triton.h
index ca13005149aa77..fd131f8336d754 100644
--- a/third_party/xla/xla/service/gpu/softmax_rewriter_triton.h
+++ b/third_party/xla/xla/service/gpu/softmax_rewriter_triton.h
@@ -22,10 +22,10 @@ limitations under the License.
 #include "absl/strings/string_view.h"
 #include "xla/hlo/ir/hlo_instruction.h"
 #include "xla/hlo/ir/hlo_module.h"
-#include "xla/service/gpu/gpu_types.h"
 #include "xla/service/hlo_pass_interface.h"
 #include "xla/status.h"
 #include "xla/statusor.h"
+#include "xla/stream_executor/device_description.h"
 
 namespace xla {
 namespace gpu {
@@ -39,7 +39,7 @@ struct DiamondChainDescriptor {
 // with the Triton-based Softmax emitter.
 class SoftmaxRewriterTriton : public HloModulePass {
  public:
-  explicit SoftmaxRewriterTriton(GpuVersion gpu_version)
+  explicit SoftmaxRewriterTriton(se::GpuComputeCapability gpu_version)
       : gpu_version_(gpu_version) {}
   absl::string_view name() const override { return "triton-softmax-rewriter"; }
 
@@ -61,7 +61,7 @@ class SoftmaxRewriterTriton : public HloModulePass {
   Status FuseDiamondChain(const DiamondChainDescriptor& diamond_chain);
 
  private:
-  GpuVersion gpu_version_;
+  se::GpuComputeCapability gpu_version_;
 };
 
 }  // namespace gpu
diff --git a/third_party/xla/xla/service/gpu/softmax_rewriter_triton_test.cc b/third_party/xla/xla/service/gpu/softmax_rewriter_triton_test.cc
index 31598ff2028092..67e341e146095e 100644
--- a/third_party/xla/xla/service/gpu/softmax_rewriter_triton_test.cc
+++ b/third_party/xla/xla/service/gpu/softmax_rewriter_triton_test.cc
@@ -23,10 +23,10 @@ limitations under the License.
 #include "absl/strings/substitute.h"
 #include "xla/hlo/ir/hlo_module.h"
 #include "xla/primitive_util.h"
-#include "xla/service/gpu/gpu_types.h"
 #include "xla/service/pattern_matcher.h"
 #include "xla/service/pattern_matcher_gmock.h"
 #include "xla/statusor.h"
+#include "xla/stream_executor/device_description.h"
 #include "xla/tests/hlo_test_base.h"
 #include "tsl/platform/errors.h"
 
@@ -39,8 +39,8 @@ namespace m = ::xla::match;
 // Wrapper around SoftmaxRewriterTriton(gpu_version).Run(module) that finds
 // and fuses as many diamond chains as possible without invoking any kind of
 // cost analysis.
-StatusOr<bool> SoftmaxRewriterTritonMatchAndRewrite(GpuVersion gpu_version,
-                                                    HloModule* module) {
+StatusOr<bool> SoftmaxRewriterTritonMatchAndRewrite(
+    se::GpuComputeCapability gpu_version, HloModule* module) {
   CHECK_NE(module, nullptr);
   SoftmaxRewriterTriton softmax_rewriter_triton(gpu_version);
   std::vector<DiamondChainDescriptor> diamond_chains =
@@ -61,12 +61,12 @@ class SoftmaxRewriterTritonTest
       public ::testing::WithParamInterface<PrimitiveType> {
  public:
   void SetUp() override {
-    gpu_version_ = GpuVersion{
+    gpu_version_ = se::GpuComputeCapability{
         se::CudaComputeCapability{se::CudaComputeCapability::AMPERE, 0}};
   }
 
  protected:
-  GpuVersion gpu_version_;
+  se::GpuComputeCapability gpu_version_;
 };
 
 TEST_P(SoftmaxRewriterTritonTest, CanFuseExactSoftmax) {
diff --git a/third_party/xla/xla/service/gpu/tree_reduction_rewriter.cc b/third_party/xla/xla/service/gpu/tree_reduction_rewriter.cc
index 4ca5102b1c208b..bd37ce540540fd 100644
--- a/third_party/xla/xla/service/gpu/tree_reduction_rewriter.cc
+++ b/third_party/xla/xla/service/gpu/tree_reduction_rewriter.cc
@@ -41,7 +41,7 @@ namespace gpu {
 
 class ReductionRewriterVisitor : public DfsHloRewriteVisitor {
  public:
-  explicit ReductionRewriterVisitor(GpuVersion gpu_version)
+  explicit ReductionRewriterVisitor(se::GpuComputeCapability gpu_version)
       : gpu_version_(gpu_version) {}
 
   Status HandleReduce(HloInstruction *hlo) override {
@@ -275,7 +275,7 @@ class ReductionRewriterVisitor : public DfsHloRewriteVisitor {
     return ReplaceWithNewInstruction(hlo, std::move(out));
   }
 
-  GpuVersion gpu_version_;
+  se::GpuComputeCapability gpu_version_;
 };
 
 StatusOr<bool> GpuTreeReductionRewriter::Run(
diff --git a/third_party/xla/xla/service/gpu/tree_reduction_rewriter.h b/third_party/xla/xla/service/gpu/tree_reduction_rewriter.h
index 82656701728d36..5c3e8d9bf73116 100644
--- a/third_party/xla/xla/service/gpu/tree_reduction_rewriter.h
+++ b/third_party/xla/xla/service/gpu/tree_reduction_rewriter.h
@@ -19,9 +19,9 @@ limitations under the License.
 
 #include "absl/strings/string_view.h"
 #include "xla/hlo/ir/hlo_module.h"
-#include "xla/service/gpu/gpu_types.h"
 #include "xla/service/hlo_pass_interface.h"
 #include "xla/statusor.h"
+#include "xla/stream_executor/device_description.h"
 
 namespace xla {
 namespace gpu {
@@ -76,7 +76,7 @@ namespace gpu {
 //
 class GpuTreeReductionRewriter : public HloModulePass {
  public:
-  explicit GpuTreeReductionRewriter(GpuVersion gpu_version)
+  explicit GpuTreeReductionRewriter(se::GpuComputeCapability gpu_version)
       : gpu_version_(gpu_version) {}
 
   ~GpuTreeReductionRewriter() override = default;
@@ -90,7 +90,7 @@ class GpuTreeReductionRewriter : public HloModulePass {
       const absl::flat_hash_set<absl::string_view>& execution_threads) override;
 
  private:
-  GpuVersion gpu_version_;
+  se::GpuComputeCapability gpu_version_;
 };
 
 }  // end namespace gpu
diff --git a/third_party/xla/xla/stream_executor/device_description.cc b/third_party/xla/xla/stream_executor/device_description.cc
index 619a252432e9f4..366c91d57d6cff 100644
--- a/third_party/xla/xla/stream_executor/device_description.cc
+++ b/third_party/xla/xla/stream_executor/device_description.cc
@@ -60,12 +60,25 @@ DeviceDescriptionBuilder::DeviceDescriptionBuilder()
 
 }  // namespace internal
 
+GpuComputeCapability DeviceDescription::gpu_compute_capability() const {
+  return gpu_compute_capability_;
+}
+
 CudaComputeCapability DeviceDescription::cuda_compute_capability() const {
-  return cuda_compute_capability_;
+  if (auto *ptr =
+          std::get_if<CudaComputeCapability>(&gpu_compute_capability_)) {
+    return *ptr;
+  }
+  // Fallback for backwards compatibility.
+  return CudaComputeCapability{-1, -1};
 }
 
 RocmComputeCapability DeviceDescription::rocm_compute_capability() const {
-  return rocm_compute_capability_;
+  if (auto *ptr =
+          std::get_if<RocmComputeCapability>(&gpu_compute_capability_)) {
+    return *ptr;
+  }
+  return RocmComputeCapability{};
 }
 
 bool ThreadDimOk(const DeviceDescription &device_description,
diff --git a/third_party/xla/xla/stream_executor/device_description.h b/third_party/xla/xla/stream_executor/device_description.h
index cbbde07417ba70..b3d2af5b123b66 100644
--- a/third_party/xla/xla/stream_executor/device_description.h
+++ b/third_party/xla/xla/stream_executor/device_description.h
@@ -24,6 +24,7 @@ limitations under the License.
 #include <memory>
 #include <string>
 #include <utility>
+#include <variant>
 #include <vector>
 
 #include "absl/algorithm/container.h"
@@ -195,6 +196,9 @@ class RocmComputeCapability {
   };
 };
 
+using GpuComputeCapability =
+    std::variant<CudaComputeCapability, RocmComputeCapability>;
+
 // Data that describes the execution target of the StreamExecutor, in terms of
 // important logical parameters. These include dimensionality limits and
 // physical parameters of interest, such as number of cores present on the
@@ -319,6 +323,8 @@ class DeviceDescription {
   // be "gfx000" (which is an invalid gfx arch).
   RocmComputeCapability rocm_compute_capability() const;
 
+  GpuComputeCapability gpu_compute_capability() const;
+
   // Returns the maximum amount of shared memory present on a single core
   // (i.e. Streaming Multiprocessor on NVIDIA GPUs; Compute Unit for OpenCL
   // devices). Note that some devices, such as NVIDIA's have a configurable
@@ -378,11 +384,7 @@ class DeviceDescription {
 
   float clock_rate_ghz_;
 
-  // CUDA "CC" major value, -1 if not available.
-  CudaComputeCapability cuda_compute_capability_{-1, -1};
-
-  // ROCm gfx arch,  "gfx000" if not available.
-  RocmComputeCapability rocm_compute_capability_;
+  GpuComputeCapability gpu_compute_capability_;
 
   int numa_node_;
   int core_count_;
@@ -477,12 +479,12 @@ class DeviceDescriptionBuilder {
   }
 
   void set_cuda_compute_capability(int major, int minor) {
-    device_description_->cuda_compute_capability_ =
+    device_description_->gpu_compute_capability_ =
         CudaComputeCapability{major, minor};
   }
 
   void set_rocm_compute_capability(std::string gcn_arch_name) {
-    device_description_->rocm_compute_capability_ =
+    device_description_->gpu_compute_capability_ =
         RocmComputeCapability(gcn_arch_name);
   }
 
diff --git a/third_party/xla/xla/tests/llvm_compiler_test.cc b/third_party/xla/xla/tests/llvm_compiler_test.cc
index 951fc04d9b9f7a..f04328350c8415 100644
--- a/third_party/xla/xla/tests/llvm_compiler_test.cc
+++ b/third_party/xla/xla/tests/llvm_compiler_test.cc
@@ -47,7 +47,7 @@ class GpuDummyCompiler : public GpuCompiler {
   GpuDummyCompiler() : GpuCompiler(kDummyTestId, kDummyTriple, kDummyLayout) {}
 
   Status OptimizeHloConvolutionCanonicalization(
-      HloModule* hlo_module, GpuVersion gpu_version,
+      HloModule* hlo_module, se::GpuComputeCapability gpu_version,
       se::dnn::VersionInfo dnn_version,
       se::DeviceMemoryAllocator* device_allocator) {
     return OkStatus();
@@ -61,14 +61,10 @@ class GpuDummyCompiler : public GpuCompiler {
     return OkStatus();
   }
 
-  GpuVersion GetGpuVersion(se::StreamExecutor*) override {
-    return se::CudaComputeCapability{0, 0};
-  }
-
   StatusOr<std::pair<std::string, std::vector<uint8_t>>> CompileTargetBinary(
       const HloModuleConfig& module_config, llvm::Module* llvm_module,
-      GpuVersion gpu_version, bool relocatable, const HloModule* debug_module,
-      const CompileOptions& options) override {
+      se::GpuComputeCapability gpu_version, bool relocatable,
+      const HloModule* debug_module, const CompileOptions& options) override {
     std::vector<uint8_t> compiled_results;
     return std::pair<std::string, std::vector<uint8_t>>(
         "", std::move(compiled_results));

From aff7c7f18eb40f0ae33d6616f54b36f789c64a7d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tam=C3=A1s=20Danyluk?= <tdanyluk@google.com>
Date: Thu, 28 Sep 2023 08:48:44 -0700
Subject: [PATCH 377/567] [XLA:GPU][NFC] Change logging in TritonAutotuner

PiperOrigin-RevId: 569195141
---
 .../xla/xla/service/gpu/triton_autotuner.cc   | 24 ++++++++++++-------
 1 file changed, 16 insertions(+), 8 deletions(-)

diff --git a/third_party/xla/xla/service/gpu/triton_autotuner.cc b/third_party/xla/xla/service/gpu/triton_autotuner.cc
index d0dc5fc2fcbe64..11f03d7136c001 100644
--- a/third_party/xla/xla/service/gpu/triton_autotuner.cc
+++ b/third_party/xla/xla/service/gpu/triton_autotuner.cc
@@ -82,6 +82,13 @@ limitations under the License.
 #include "tsl/platform/threadpool.h"
 #include "tsl/util/proto/proto_utils.h"
 
+// Log levels used in this file:
+// VLOG(1): Overview
+// VLOG(2): Autotuning progress
+// VLOG(3): Autotuning progress - more frequent
+// VLOG(4): Print all fusions
+// VLOG(5): Profiling information for every tiling
+
 namespace xla {
 namespace gpu {
 
@@ -119,7 +126,7 @@ class TritonAutotunerVisitor : public DfsHloRewriteVisitor {
       return OkStatus();
     }
 
-    VLOG(2) << "Processing " << hlo->ToString();
+    VLOG(4) << "Processing " << hlo->ToString();
     if (!backend_config.has_triton_gemm_config()) {
       TF_ASSIGN_OR_RETURN(
           AutotuneResult autotune_result,
@@ -132,7 +139,7 @@ class TritonAutotunerVisitor : public DfsHloRewriteVisitor {
                 }
                 return InternalError("Expect autotune result cache hit.");
               }));
-      VLOG(2) << "Result: " << autotune_result.ShortDebugString();
+      VLOG(4) << "Result: " << autotune_result.ShortDebugString();
 
       if (autotune_result.has_triton()) {
         *backend_config.mutable_triton_gemm_config() = autotune_result.triton();
@@ -328,7 +335,7 @@ std::vector<AutotuneResult::TritonGemmKey> GetFixedMatmulAutotuneConfigs(
   return configs;
 }
 
-int GetLogEveryN() { return VLOG_IS_ON(2) ? 100 : 1000; }
+int GetLogEveryN() { return VLOG_IS_ON(3) ? 100 : 1000; }
 
 StatusOr<std::unique_ptr<HloModule>> TritonGemmAutotuneExtractor(
     const AutotuneResult::TritonGemmKey& key,
@@ -616,7 +623,6 @@ StatusOr<AutotuneResult> Execute(const AutotuneConfig& config,
     }
     cublas_duration = output.duration;
   }
-  VLOG(3) << "Running with cuBLAS took: " << cublas_duration;
 
   const int log_every_n = GetLogEveryN();
   int64_t executable_count =
@@ -626,7 +632,7 @@ StatusOr<AutotuneResult> Execute(const AutotuneConfig& config,
   VLOG(2) << "Running " << executable_count << " configs for " << fusion->name()
           << ".";
   for (const ExecutableCandidate& candidate : executable_set.candidates) {
-    VLOG(3) << "Trying triton tiling: " << candidate.config.ShortDebugString();
+    VLOG(5) << "Trying triton tiling: " << candidate.config.ShortDebugString();
 
     AutotuneResult res;
     *res.mutable_triton() = candidate.config;
@@ -641,11 +647,11 @@ StatusOr<AutotuneResult> Execute(const AutotuneConfig& config,
     }
 
     if (!profiling_output) {
-      VLOG(3) << "Skipping this tiling.";
+      VLOG(5) << "Skipping this tiling.";
       continue;
     }
 
-    VLOG(3) << "Running the kernel took: " << profiling_output->duration;
+    VLOG(5) << "Running the kernel took: " << profiling_output->duration;
     if (profiling_output->duration >= absl::Seconds(1)) {
       LOG(WARNING) << "Slow kernel for " << fusion->name()
                    << " took: " << profiling_output->duration
@@ -694,8 +700,10 @@ StatusOr<AutotuneResult> Execute(const AutotuneConfig& config,
   if (debug_opts.xla_gpu_cublas_fallback()) {
     const absl::Duration best_triton_duration =
         tsl::proto_utils::FromDurationProto(best_triton.run_time());
+    VLOG(2) << fusion->name() << ": time with cuBLAS: " << cublas_duration
+            << ", best time with Triton: " << best_triton_duration;
     if (cublas_duration < best_triton_duration) {
-      VLOG(1) << "Falling back to cuBLAS for " << fusion->name();
+      VLOG(2) << "Falling back to cuBLAS for " << fusion->name();
 
       AutotuneResult cublas;
       *cublas.mutable_run_time() =

From 614a467679fdbdb74e65a7b89f52745d5b2b0456 Mon Sep 17 00:00:00 2001
From: Reed Wanderman-Milne <reedwm@google.com>
Date: Thu, 28 Sep 2023 09:03:19 -0700
Subject: [PATCH 378/567] Do not use PJRT to transfer int4 tensors.

XLA PR https://github.com/openxla/xla/pull/5926 will cause PJRT to pack int4 tensors when transferring them to device, but TensorFlow represents int4 tensors as unpacked. This change makes PJRT not be used to transfer int4 tensors. In the future, TensorFlow will likely pack int4 tensors, in which case PJRT can be used again.

PiperOrigin-RevId: 569199408
---
 tensorflow/core/common_runtime/eager/tensor_handle.cc | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/tensorflow/core/common_runtime/eager/tensor_handle.cc b/tensorflow/core/common_runtime/eager/tensor_handle.cc
index d449dc75ac683f..b6ad32083fa088 100644
--- a/tensorflow/core/common_runtime/eager/tensor_handle.cc
+++ b/tensorflow/core/common_runtime/eager/tensor_handle.cc
@@ -1018,7 +1018,14 @@ Status TensorHandle::CopyToDevice(const EagerContext& ctx,
   }
   tensorflow::DeviceContext* dst_device_context = nullptr;
   if (!dst_cpu) {
-    if (dstd_info->use_pjrt_tensor_buffer) {
+    // PJRT will soon pack int4 tensors when transferring them to device (once
+    // XLA int4 support is implemented), but TF currently represents int4 as
+    // unpacked, so do not use PJRT for int4 tensors.
+    // TODO(b/226482736): Either pack int4, or support unpacked int4 tensors in
+    // PJRT. Also update beginning of comment from future to present tense once
+    // PJRT packs int4 tensors.
+    if (dstd_info->use_pjrt_tensor_buffer && DataType() != DT_INT4 &&
+        DataType() != DT_UINT4) {
       dst_device_context = dstd_info->pjrt_context;
     } else {
       dst_device_context = dstd_info->default_context;

From 5d4c1b0c67e2cfdcdea416e2314fa676aa072c07 Mon Sep 17 00:00:00 2001
From: Eugene Zhulenev <ezhulenev@google.com>
Date: Thu, 28 Sep 2023 09:33:55 -0700
Subject: [PATCH 379/567] [stream_executor] NFC: Make temporary memory
 allocators private and export them via stream_executor target

The temporary memory manager is an implementation detail of StreamExecutor, and all SE clients should get access to its headers via the `xla/stream_executor` dependency.

PiperOrigin-RevId: 569207046
---
 tensorflow/core/platform/BUILD                |   2 -
 third_party/xla/xla/service/gpu/BUILD         |   5 -
 third_party/xla/xla/service/gpu/runtime/BUILD |   2 +-
 third_party/xla/xla/stream_executor/BUILD     | 146 +++++++++---------
 .../xla/xla/stream_executor/cuda/BUILD        |   7 +-
 third_party/xla/xla/stream_executor/gpu/BUILD |   1 -
 .../xla/xla/stream_executor/rocm/BUILD        |   5 +-
 7 files changed, 78 insertions(+), 90 deletions(-)

diff --git a/tensorflow/core/platform/BUILD b/tensorflow/core/platform/BUILD
index af8ca88c76447e..e5ef3f04dd242f 100644
--- a/tensorflow/core/platform/BUILD
+++ b/tensorflow/core/platform/BUILD
@@ -1102,7 +1102,6 @@ tf_cuda_library(
         "@local_xla//xla/stream_executor",
         "@local_xla//xla/stream_executor:dnn",
         "@local_xla//xla/stream_executor:multi_platform_manager",
-        "@local_xla//xla/stream_executor:scratch_allocator",
         "@local_xla//xla/stream_executor/cuda:cuda_activation_header",
         "@local_xla//xla/stream_executor/cuda:cuda_platform_id",
         "@local_xla//xla/stream_executor/host:host_platform_id",
@@ -1126,7 +1125,6 @@ cc_library(
         "@local_xla//xla/stream_executor",
         "@local_xla//xla/stream_executor:dnn",
         "@local_xla//xla/stream_executor:multi_platform_manager",
-        "@local_xla//xla/stream_executor:scratch_allocator",
         "@local_xla//xla/stream_executor/cuda:cuda_platform_id",
         "@local_xla//xla/stream_executor/host:host_platform",
         "@local_xla//xla/stream_executor/host:host_platform_id",
diff --git a/third_party/xla/xla/service/gpu/BUILD b/third_party/xla/xla/service/gpu/BUILD
index e57c3c3e035b46..7d0ab81fdbb0ed 100644
--- a/third_party/xla/xla/service/gpu/BUILD
+++ b/third_party/xla/xla/service/gpu/BUILD
@@ -1031,7 +1031,6 @@ cc_library(
         "//xla/stream_executor:device_description",
         "//xla/stream_executor:device_memory",
         "//xla/stream_executor:device_memory_allocator",
-        "//xla/stream_executor:scratch_allocator",
         "//xla/stream_executor/cuda:cuda_platform_id",
         "//xla/stream_executor/gpu:asm_compiler",
         "//xla/stream_executor/gpu:gpu_activation",
@@ -1226,7 +1225,6 @@ cc_library(
         "//xla/hlo/ir:hlo",
         "//xla/service:buffer_assignment",
         "//xla/stream_executor",
-        "//xla/stream_executor:scratch_allocator",
         "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/strings:str_format",
@@ -1565,12 +1563,10 @@ cc_library(
         "//xla/stream_executor/cuda:cublas_plugin",
         "@local_tsl//tsl/platform:tensor_float_32_hdr_lib",
         "//xla/stream_executor:host_or_device_scalar",
-        "//xla/stream_executor:scratch_allocator",
     ]) + if_rocm_is_configured([
         "//xla/stream_executor/rocm:hipblas_lt_header",
         "//xla/stream_executor/rocm:hipblaslt_plugin",
         "//xla/stream_executor:host_or_device_scalar",
-        "//xla/stream_executor:scratch_allocator",
         "//xla/stream_executor/platform:dso_loader",
     ]) + if_static([
         "@local_tsl//tsl/platform:tensor_float_32_utils",
@@ -1705,7 +1701,6 @@ cc_library(
         "//xla/stream_executor/cuda:cuda_platform_id",
         "//xla/stream_executor/gpu:redzone_allocator",
         "//xla/stream_executor/rocm:rocm_platform_id",
-        "//xla/stream_executor:scratch_allocator",
         "//xla:util",
         "//xla:xla_data_proto_cc",
         "@local_tsl//tsl/platform:logger",
diff --git a/third_party/xla/xla/service/gpu/runtime/BUILD b/third_party/xla/xla/service/gpu/runtime/BUILD
index 2c48d0087d358d..b087ce7f33d285 100644
--- a/third_party/xla/xla/service/gpu/runtime/BUILD
+++ b/third_party/xla/xla/service/gpu/runtime/BUILD
@@ -526,7 +526,7 @@ cc_library(
         "//xla/runtime:state",
         "//xla/service:executable",
         "//xla/service/gpu:matmul_utils",
-        "//xla/stream_executor:scratch_allocator",
+        "//xla/stream_executor",
         "//xla/stream_executor/cuda:cublas_lt_header",
         "@local_tsl//tsl/platform:status",
     ] + if_rocm_is_configured([
diff --git a/third_party/xla/xla/stream_executor/BUILD b/third_party/xla/xla/stream_executor/BUILD
index c8afc42b51a74a..2fd02c7453e3d0 100644
--- a/third_party/xla/xla/stream_executor/BUILD
+++ b/third_party/xla/xla/stream_executor/BUILD
@@ -66,6 +66,7 @@ filegroup(
         "platform.h",
         "plugin.h",
         "plugin_registry.h",
+        "scratch_allocator.h",
         "stream.h",
         "stream_executor.h",
         "stream_executor_internal.h",
@@ -133,6 +134,23 @@ cc_library(
     deps = ["//xla/stream_executor/platform"],
 )
 
+cc_library(
+    name = "device_memory_allocator",
+    hdrs = ["device_memory_allocator.h"],
+    visibility = ["//visibility:public"],
+    deps = [
+        ":device_memory",
+        ":platform",
+        "@com_google_absl//absl/base:core_headers",
+        "@com_google_absl//absl/synchronization",
+        "@com_google_absl//absl/types:span",
+        "@local_tsl//tsl/platform:errors",
+        "@local_tsl//tsl/platform:status",
+        "@local_tsl//tsl/platform:statusor",
+        "@local_tsl//tsl/platform:types",
+    ],
+)
+
 cc_library(
     name = "device_options",
     hdrs = ["device_options.h"],
@@ -333,6 +351,55 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "scratch_allocator",
+    srcs = ["scratch_allocator.cc"],
+    hdrs = ["scratch_allocator.h"],
+    visibility = ["//visibility:public"],
+    deps = [
+        ":stream_executor_headers",
+        ":temporary_device_memory",
+        "//xla/stream_executor/platform",
+        "@com_google_absl//absl/container:inlined_vector",
+        "@com_google_absl//absl/log:check",
+        "@local_tsl//tsl/platform:statusor",
+    ],
+)
+
+cc_library(
+    name = "temporary_device_memory",
+    srcs = ["temporary_device_memory.cc"],
+    hdrs = ["temporary_device_memory.h"],
+    visibility = ["//visibility:public"],
+    deps = [
+        ":stream_executor_headers",
+        "//xla/stream_executor/platform",
+        "@com_google_absl//absl/base:core_headers",
+        "@com_google_absl//absl/synchronization",
+        "@local_tsl//tsl/platform:status",
+        "@local_tsl//tsl/platform:statusor",
+    ],
+)
+
+cc_library(
+    name = "temporary_memory_manager",
+    srcs = ["temporary_memory_manager.cc"],
+    hdrs = ["temporary_memory_manager.h"],
+    visibility = ["//visibility:public"],
+    deps = [
+        ":stream_executor_headers",
+        ":temporary_device_memory",
+        "//xla/stream_executor/platform",
+        "@com_google_absl//absl/base:core_headers",
+        "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/strings:str_format",
+        "@com_google_absl//absl/synchronization",
+        "@local_tsl//tsl/platform:logging",
+        "@local_tsl//tsl/platform:status",
+        "@local_tsl//tsl/platform:statusor",
+    ],
+)
+
 #===--------------------------------------------------------------------------------------------===#
 
 # The stream_executor_headers target does not prescribe an implementation.
@@ -354,6 +421,7 @@ cc_library(
         "kernel.h",
         "kernel_cache_config.h",
         "kernel_spec.h",
+        "scratch_allocator.h",
         "data_type.h",
         "launch_dim.h",
         "numeric_options.h",
@@ -525,6 +593,7 @@ cc_library(
         ":launch_dim",
         ":platform",
         ":plugin_registry",
+        ":scratch_allocator",
         ":stream_executor_headers",
         ":stream_executor_internal",
         ":temporary_device_memory",
@@ -614,46 +683,6 @@ cc_library(
     ] + if_static(["@com_google_protobuf//:protobuf"]),
 )
 
-cc_library(
-    name = "temporary_memory_manager",
-    srcs = ["temporary_memory_manager.cc"],
-    hdrs = ["temporary_memory_manager.h"],
-    visibility = ["//visibility:public"],
-    deps = [
-        ":device_memory",
-        ":stream_executor_headers",
-        ":temporary_device_memory",
-        "//xla/stream_executor/platform",
-        "@com_google_absl//absl/base:core_headers",
-        "@com_google_absl//absl/strings",
-        "@com_google_absl//absl/strings:str_format",
-        "@com_google_absl//absl/synchronization",
-        "@local_tsl//tsl/platform:logging",
-        "@local_tsl//tsl/platform:status",
-        "@local_tsl//tsl/platform:statusor",
-    ],
-)
-
-cc_library(
-    name = "temporary_device_memory",
-    srcs = [
-        "event.h",
-        "temporary_device_memory.cc",
-        "temporary_memory_manager.h",
-    ],
-    hdrs = ["temporary_device_memory.h"],
-    visibility = ["//visibility:public"],
-    deps = [
-        ":device_memory",
-        ":stream_executor_headers",
-        "//xla/stream_executor/platform",
-        "@com_google_absl//absl/base:core_headers",
-        "@com_google_absl//absl/synchronization",
-        "@local_tsl//tsl/platform:status",
-        "@local_tsl//tsl/platform:statusor",
-    ],
-)
-
 cc_library(
     name = "stream_executor",
     textual_hdrs = [
@@ -675,6 +704,7 @@ cc_library(
         "launch_dim.h",
         "module_spec.h",
         "multi_platform_manager.h",
+        "scratch_allocator.h",
         "platform.h",
         "plugin.h",
         "plugin_registry.h",
@@ -716,46 +746,16 @@ cc_library(
         ":launch_dim",
         ":multi_platform_manager",
         ":platform",
+        ":scratch_allocator",
         ":stream_executor_headers",
         ":stream_executor_pimpl",
+        ":temporary_device_memory",
+        ":temporary_memory_manager",
         "//xla/stream_executor:dnn_proto_cc_impl",
         "@local_tsl//tsl/protobuf:dnn_proto_cc_impl",
     ],
 )
 
-cc_library(
-    name = "device_memory_allocator",
-    hdrs = ["device_memory_allocator.h"],
-    visibility = ["//visibility:public"],
-    deps = [
-        ":device_memory",
-        ":platform",
-        ":stream_executor",
-        "@com_google_absl//absl/base:core_headers",
-        "@com_google_absl//absl/synchronization",
-        "@com_google_absl//absl/types:span",
-        "@local_tsl//tsl/platform:errors",
-        "@local_tsl//tsl/platform:status",
-        "@local_tsl//tsl/platform:statusor",
-        "@local_tsl//tsl/platform:types",
-    ],
-)
-
-cc_library(
-    name = "scratch_allocator",
-    srcs = ["scratch_allocator.cc"],
-    hdrs = ["scratch_allocator.h"],
-    visibility = ["//visibility:public"],
-    deps = [
-        ":device_memory",
-        ":stream_executor_headers",
-        "//xla/stream_executor/platform",
-        "@com_google_absl//absl/container:inlined_vector",
-        "@com_google_absl//absl/log:check",
-        "@local_tsl//tsl/platform:statusor",
-    ],
-)
-
 xla_cc_test(
     name = "stream_test",
     size = "small",
diff --git a/third_party/xla/xla/stream_executor/cuda/BUILD b/third_party/xla/xla/stream_executor/cuda/BUILD
index aa0567dc76856d..2aa3e04098ab28 100644
--- a/third_party/xla/xla/stream_executor/cuda/BUILD
+++ b/third_party/xla/xla/stream_executor/cuda/BUILD
@@ -265,9 +265,9 @@ cc_library(
         "@eigen_archive//:eigen3",
         "@local_config_cuda//cuda:cuda_headers",
         "//xla:status_macros",
+        "//xla/stream_executor",
         "//xla/stream_executor:device_memory",
         "//xla/stream_executor:host_or_device_scalar",
-        "//xla/stream_executor:scratch_allocator",
         "//xla/stream_executor:stream_executor_headers",
         "//xla/stream_executor/gpu:gpu_activation_header",
         "//xla/stream_executor/gpu:gpu_executor_header",
@@ -310,9 +310,9 @@ cc_library(
         ":cuda_stream",
         ":cuda_helpers",
         "@local_config_cuda//cuda:cuda_headers",
+        "//xla/stream_executor",
         "//xla/stream_executor:fft",
         "//xla/stream_executor:plugin_registry",
-        "//xla/stream_executor:scratch_allocator",
         "//xla/stream_executor/gpu:gpu_helpers_header",
         "//xla/stream_executor/platform",
         "//xla/stream_executor/platform:dso_loader",
@@ -355,11 +355,10 @@ cc_library(
         "@local_tsl//tsl/cuda:cudnn",
         "@local_tsl//tsl/cuda:cudnn_version",
         "@local_tsl//tsl/platform:tensor_float_32_utils",
+        "//xla/stream_executor",
         "//xla/stream_executor:dnn",
         "//xla/stream_executor:plugin_registry",
-        "//xla/stream_executor:scratch_allocator",
         "//xla/stream_executor:stream_executor_headers",
-        "//xla/stream_executor:temporary_device_memory",
         "//xla/stream_executor/platform",
         "@local_tsl//tsl/platform:errors",
         "@local_tsl//tsl/platform:tensor_float_32_hdr_lib",
diff --git a/third_party/xla/xla/stream_executor/gpu/BUILD b/third_party/xla/xla/stream_executor/gpu/BUILD
index a67b7bf8ed1230..452a1e46a90b5a 100644
--- a/third_party/xla/xla/stream_executor/gpu/BUILD
+++ b/third_party/xla/xla/stream_executor/gpu/BUILD
@@ -375,7 +375,6 @@ cc_library(
         "@local_tsl//tsl/framework:allocator",
         "//xla/stream_executor:device_memory",
         "//xla/stream_executor:device_memory_allocator",
-        "//xla/stream_executor:scratch_allocator",
         "//xla/stream_executor:stream_executor_headers",
         "@local_tsl//tsl/platform:status",
     ]),
diff --git a/third_party/xla/xla/stream_executor/rocm/BUILD b/third_party/xla/xla/stream_executor/rocm/BUILD
index 812dae5f7d4be0..361216524667a7 100644
--- a/third_party/xla/xla/stream_executor/rocm/BUILD
+++ b/third_party/xla/xla/stream_executor/rocm/BUILD
@@ -219,7 +219,6 @@ cc_library(
         "//xla/stream_executor",
         "//xla/stream_executor:host_or_device_scalar",
         "//xla/stream_executor:plugin_registry",
-        "//xla/stream_executor:scratch_allocator",
         "//xla/stream_executor/gpu:gpu_activation",
         "//xla/stream_executor/gpu:gpu_helpers_header",
         "//xla/stream_executor/gpu:gpu_stream_header",
@@ -261,7 +260,6 @@ cc_library(
         "//xla/stream_executor",
         "//xla/stream_executor:fft",
         "//xla/stream_executor:plugin_registry",
-        "//xla/stream_executor:scratch_allocator",
         "//xla/stream_executor/gpu:gpu_activation",
         "//xla/stream_executor/gpu:gpu_helpers_header",
         "//xla/stream_executor/gpu:gpu_executor_header",
@@ -311,7 +309,6 @@ cc_library(
         "//xla/stream_executor",
         "//xla/stream_executor:dnn",
         "//xla/stream_executor:plugin_registry",
-        "//xla/stream_executor:scratch_allocator",
         "//xla/stream_executor/gpu:gpu_activation_header",
         "//xla/stream_executor/gpu:gpu_stream_header",
         "//xla/stream_executor/gpu:gpu_timer_header",
@@ -473,7 +470,7 @@ cc_library(
         "//xla:status",
         "//xla/stream_executor/platform",
         "//xla/stream_executor/platform:dso_loader",
-        "//xla/stream_executor:scratch_allocator",
+        "//xla/stream_executor",
         "//xla/stream_executor/gpu:gpu_helpers_header",
         "//xla:util",
     ]) + if_static([

From 3f50ac64300be574470ec8bc529adba6671d54f4 Mon Sep 17 00:00:00 2001
From: Quentin Khan <qkhan@google.com>
Date: Thu, 28 Sep 2023 09:45:53 -0700
Subject: [PATCH 380/567] Implement ReduceWindow in TFLite.

PiperOrigin-RevId: 569210176
---
 tensorflow/lite/builtin_ops.h                 |   1 +
 .../lite/core/api/flatbuffer_conversions.cc   |  33 +-
 tensorflow/lite/core/c/builtin_op_data.h      |  14 +
 .../lite/core/kernels/builtin_op_kernels.h    |   1 +
 tensorflow/lite/core/kernels/register.cc      |   1 +
 tensorflow/lite/kernels/BUILD                 |  19 +
 tensorflow/lite/kernels/builtin_ops_list.inc  |   1 +
 tensorflow/lite/kernels/reduce_window.cc      | 406 ++++++++++++++++++
 tensorflow/lite/kernels/reduce_window_test.cc | 253 +++++++++++
 tensorflow/lite/kernels/register_ref.cc       |   2 +
 tensorflow/lite/schema/schema.fbs             |  16 +
 tensorflow/lite/schema/schema_generated.h     | 224 ++++++++--
 .../serialization/option_writer_generator.cc  |   1 +
 .../lite/tools/versioning/runtime_version.cc  |   3 +-
 14 files changed, 949 insertions(+), 26 deletions(-)
 create mode 100644 tensorflow/lite/kernels/reduce_window.cc
 create mode 100644 tensorflow/lite/kernels/reduce_window_test.cc

diff --git a/tensorflow/lite/builtin_ops.h b/tensorflow/lite/builtin_ops.h
index d685ff173869e7..a8d68368b06656 100644
--- a/tensorflow/lite/builtin_ops.h
+++ b/tensorflow/lite/builtin_ops.h
@@ -232,6 +232,7 @@ typedef enum {
   kTfLiteBuiltinStablehloTranspose = 202,
   kTfLiteBuiltinDilate = 203,
   kTfLiteBuiltinStablehloRngBitGenerator = 204,
+  kTfLiteBuiltinReduceWindow = 205,
 } TfLiteBuiltinOperator;
 
 #ifdef __cplusplus
diff --git a/tensorflow/lite/core/api/flatbuffer_conversions.cc b/tensorflow/lite/core/api/flatbuffer_conversions.cc
index e46ef134b91321..a7089182cef4d2 100644
--- a/tensorflow/lite/core/api/flatbuffer_conversions.cc
+++ b/tensorflow/lite/core/api/flatbuffer_conversions.cc
@@ -881,7 +881,38 @@ TfLiteStatus ParseOpDataTfLite(const Operator* op, BuiltinOperator op_type,
     case BuiltinOperator_STABLEHLO_GATHER: {
       return ParseStablehloGather(op, error_reporter, allocator, builtin_data);
     }
-
+    case BuiltinOperator_REDUCE_WINDOW: {
+      auto params = safe_allocator.Allocate<TfLiteReduceWindowParams>();
+      TF_LITE_ENSURE(error_reporter, params != nullptr);
+      if (const auto* reduce_params =
+              op->builtin_options_2_as_ReduceWindowOptions()) {
+        switch (reduce_params->reduce_function()) {
+          case ReduceWindowFunction_ADD:
+            params->reduce_function = TfLiteReduceWindowFunctionAdd;
+            break;
+          case ReduceWindowFunction_MUL:
+            params->reduce_function = TfLiteReduceWindowFunctionMul;
+            break;
+          case ReduceWindowFunction_MINIMUM:
+            params->reduce_function = TfLiteReduceWindowFunctionMin;
+            break;
+          case ReduceWindowFunction_MAXIMUM:
+            params->reduce_function = TfLiteReduceWindowFunctionMax;
+            break;
+          case ReduceWindowFunction_ALL:
+            params->reduce_function = TfLiteReduceWindowFunctionAll;
+            break;
+          case ReduceWindowFunction_ANY:
+            params->reduce_function = TfLiteReduceWindowFunctionAny;
+            break;
+          case ReduceWindowFunction_UNSUPPORTED:
+          default:
+            return kTfLiteError;
+        }
+      }
+      *builtin_data = params.release();
+      return kTfLiteOk;
+    }
     // TODO: skip param parsing for now since ops below don't have kernels
     case BuiltinOperator_STABLEHLO_SLICE:
     case BuiltinOperator_STABLEHLO_BROADCAST_IN_DIM:
diff --git a/tensorflow/lite/core/c/builtin_op_data.h b/tensorflow/lite/core/c/builtin_op_data.h
index 252f2b23b1a69a..8464a26bacb8b8 100644
--- a/tensorflow/lite/core/c/builtin_op_data.h
+++ b/tensorflow/lite/core/c/builtin_op_data.h
@@ -605,6 +605,20 @@ typedef struct {
   bool indices_are_sorted;
 } TfLiteStablehloGatherParams;
 
+enum TfLiteReduceWindowFunction {
+  TfLiteReduceWindowFunctionUnsupported,
+  TfLiteReduceWindowFunctionAdd,
+  TfLiteReduceWindowFunctionMul,
+  TfLiteReduceWindowFunctionMin,
+  TfLiteReduceWindowFunctionMax,
+  TfLiteReduceWindowFunctionAll,
+  TfLiteReduceWindowFunctionAny
+};
+
+typedef struct {
+  enum TfLiteReduceWindowFunction reduce_function;
+} TfLiteReduceWindowParams;
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif  // __cplusplus
diff --git a/tensorflow/lite/core/kernels/builtin_op_kernels.h b/tensorflow/lite/core/kernels/builtin_op_kernels.h
index 5ae2a246ae0960..2163a679773c32 100644
--- a/tensorflow/lite/core/kernels/builtin_op_kernels.h
+++ b/tensorflow/lite/core/kernels/builtin_op_kernels.h
@@ -322,6 +322,7 @@ Register_STABLEHLO_TRANSPOSE();  // WARNING: not implemented, using this
 
 TfLiteRegistration* Register_DILATE();
 
+TfLiteRegistration* Register_REDUCE_WINDOW();
 }  // namespace builtin
 }  // namespace ops
 }  // namespace tflite
diff --git a/tensorflow/lite/core/kernels/register.cc b/tensorflow/lite/core/kernels/register.cc
index 0a0fd8e44e4e07..ea1a19f14cf1fa 100644
--- a/tensorflow/lite/core/kernels/register.cc
+++ b/tensorflow/lite/core/kernels/register.cc
@@ -372,6 +372,7 @@ BuiltinOpResolver::BuiltinOpResolver() {
   AddBuiltin(BuiltinOperator_DILATE, Register_DILATE());
   AddBuiltin(BuiltinOperator_STABLEHLO_RNG_BIT_GENERATOR,
              Register_STABLEHLO_RNG_BIT_GENERATOR());
+  AddBuiltin(BuiltinOperator_REDUCE_WINDOW, Register_REDUCE_WINDOW());
   AddCustom("NumericVerify", tflite::ops::custom::Register_NUMERIC_VERIFY());
   // TODO(andrewharp, ahentz): Move these somewhere more appropriate so that
   // custom ops aren't always included by default.
diff --git a/tensorflow/lite/kernels/BUILD b/tensorflow/lite/kernels/BUILD
index b641ecdeadae14..ed39d82fb06f54 100644
--- a/tensorflow/lite/kernels/BUILD
+++ b/tensorflow/lite/kernels/BUILD
@@ -699,6 +699,7 @@ BUILTIN_KERNEL_SRCS = [
     "range.cc",
     "rank.cc",
     "reduce.cc",
+    "reduce_window.cc",
     "reshape.cc",
     "resize_bilinear.cc",
     "resize_nearest_neighbor.cc",
@@ -818,6 +819,7 @@ cc_library(
         ":variable_op_kernels",
         "@fft2d",
         "@ruy//ruy/profiler:instrumentation",
+        "//tensorflow/lite/c:c_api_types",
         "//tensorflow/lite:array",
         "//tensorflow/lite:builtin_ops",
         "@local_tsl//tsl/lib/random:philox_random",
@@ -1253,6 +1255,23 @@ cc_test(
     ],
 )
 
+cc_test(
+    name = "reduce_window_test",
+    size = "small",
+    srcs = ["reduce_window_test.cc"],
+    tags = ["tflite_nnapi"],
+    deps = [
+        ":test_main",
+        ":test_util",
+        "//tensorflow/lite/c:c_api_types",
+        "//tensorflow/lite/core/c:common",
+        "//tensorflow/lite/schema:schema_fbs",
+        "@com_google_absl//absl/algorithm:container",
+        "@com_google_absl//absl/types:span",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
 cc_test(
     name = "space_to_batch_nd_test",
     size = "small",
diff --git a/tensorflow/lite/kernels/builtin_ops_list.inc b/tensorflow/lite/kernels/builtin_ops_list.inc
index dee63abbac725b..793649a48f9cd6 100644
--- a/tensorflow/lite/kernels/builtin_ops_list.inc
+++ b/tensorflow/lite/kernels/builtin_ops_list.inc
@@ -217,3 +217,4 @@ TFLITE_OP(Register_STABLEHLO_GATHER)
 TFLITE_OP(Register_STABLEHLO_TRANSPOSE)
 TFLITE_OP(Register_DILATE)
 TFLITE_OP(Register_STABLEHLO_RNG_BIT_GENERATOR)
+TFLITE_OP(Register_REDUCE_WINDOW)
diff --git a/tensorflow/lite/kernels/reduce_window.cc b/tensorflow/lite/kernels/reduce_window.cc
new file mode 100644
index 00000000000000..2c315cdd03693b
--- /dev/null
+++ b/tensorflow/lite/kernels/reduce_window.cc
@@ -0,0 +1,406 @@
+/* Copyright 2023 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <algorithm>
+#include <array>
+#include <cstdint>
+#include <functional>
+#include <type_traits>
+
+#include "tensorflow/lite/array.h"
+#include "tensorflow/lite/core/c/builtin_op_data.h"
+#include "tensorflow/lite/core/c/common.h"
+#include "tensorflow/lite/kernels/kernel_util.h"
+
+namespace tflite {
+namespace ops {
+namespace builtin {
+namespace reduce_window {
+namespace {
+
+constexpr int32_t kMaxReduceWindowDims = 6;
+
+template <int Val>
+using IntCst = std::integral_constant<int, Val>;
+
+// Reduces the elements of a tensor viewed through a strided window.
+//
+// This applies a reduction to a tensor by skipping over elements that are not
+// in the window defined by the given shape and strides. The window is reduced
+// to one element.
+//
+// The shape is the shape of the window. The strides are based on the actual
+// tensor and the distance between window elements, counted in elements. Sparse
+// windows are possible.
+//
+// For instance: the following window has a [2, 2] shape and [8, 3] strides.
+//
+// ┌──┐     ┌──┐
+// │ 1│ 2  3│ 4│
+// └──┘     └──┘
+//   5  6  7  8    is reduced to 1 + 4 + 9 + 12 = 26
+// ┌──┐     ┌──┐
+// │ 9│10 11│12│
+// └──┘     └──┘
+//  13 14 15 16
+//
+// This is a recursive implementation of the strided reduction. At the time of
+// writing, both GCC and Clang are able to optimise away the recursion when done
+// with templates. The same code without templates is optimised by GCC but not
+// by Clang.
+//
+// The template uses integral_constant for the rank and depth values because old
+// versions of GCC [do not support partial specializations on non-type template
+// parameters](https://stackoverflow.com/a/7776468).
+template <class Op, class Type, class RankValue, class DepthValue = IntCst<0>,
+          class = void>
+struct StridedReduceImpl;
+
+// Body case of the recursive implementation of the strided reduction.
+template <class Op, class Type, int Rank, int Depth>
+struct StridedReduceImpl<Op, Type, IntCst<Rank>, IntCst<Depth>,
+                         std::enable_if_t<Depth + 1 != Rank>> {
+  static void Run(const Type* input, const int64_t* const shape,
+                  const int64_t* const strides, Type& accu) {
+    const int64_t stride = strides[Depth];
+    const int64_t size = shape[Depth];
+    for (int64_t i = 0; i < size; ++i) {
+      StridedReduceImpl<Op, Type, IntCst<Rank>, IntCst<Depth + 1>>::Run(
+          input, shape, strides, accu);
+      input += stride;
+    }
+  }
+};
+
+// Tail case of the recursive implementation of the strided reduction.
+template <class Op, class Type, int Rank>
+struct StridedReduceImpl<Op, Type, IntCst<Rank>, IntCst<Rank - 1>> {
+  static void Run(const Type* input, const int64_t* const shape,
+                  const int64_t* const strides, Type& accu) {
+    const int64_t stride = strides[Rank - 1];
+    const int64_t size = shape[Rank - 1];
+    const Op op;
+    for (int64_t i = 0; i < size; ++i) {
+      accu = op(accu, *input);
+      input += stride;
+    }
+  }
+};
+
+// Recursively computes strided reductions using a sliding window over the given
+// tensor.
+//
+// The window is defined using a shape and a dilation. The shape defines the
+// elements that the window will let the reduction *see*. The dilation defines
+// the step between window elements.
+//
+// For instance: the following window has a [2, 2] shape and [2, 3] dilations.
+//
+//    3
+// ┌────┐
+// ┌─┐   ┌─┐
+// │X│X X│X│┐
+// └─┘   └─┘│2
+//  X X X X ┘
+// ┌─┐   ┌─┐
+// │X│X X│X│
+// └─┘   └─┘
+//
+// The template uses integral_constant for the rank and depth values because old
+// versions of GCC [do not support partial specializations on non-type template
+// parameters](https://stackoverflow.com/a/7776468).
+template <class Op, class Type, class RankValue, class DepthValue = IntCst<0>,
+          class = void>
+struct ReduceWindowImpl;
+
+// Body case of the recursive implementation of the window sliding.
+template <class Op, class Type, int Rank, int Depth>
+struct ReduceWindowImpl<Op, Type, IntCst<Rank>, IntCst<Depth>,
+                        std::enable_if_t<Depth + 1 != Rank>> {
+  static void Run(const Type* input, Type* output,
+                  const int64_t* const output_shape,
+                  const int64_t* const output_strides,
+                  const int64_t* const window_offset_strides,
+                  const int64_t* const window_shape,
+                  const int64_t* const window_reduce_strides, const Type init) {
+    for (int32_t dim = 0; dim < output_shape[Depth]; ++dim) {
+      ReduceWindowImpl<Op, Type, IntCst<Rank>, IntCst<Depth + 1>>::Run(
+          input, output, output_shape, output_strides, window_offset_strides,
+          window_shape, window_reduce_strides, init);
+      input += window_offset_strides[Depth];
+      output += output_strides[Depth];
+    }
+  }
+};
+
+// Tail case of the recursive implementation of the window sliding.
+template <class Op, class Type, int Rank>
+struct ReduceWindowImpl<Op, Type, IntCst<Rank>, IntCst<Rank - 1>> {
+  static void Run(const Type* input, Type* output,
+                  const int64_t* const output_shape,
+                  const int64_t* const output_strides,
+                  const int64_t* const window_offset_strides,
+                  const int64_t* const window_shape,
+                  const int64_t* const window_reduce_strides, const Type init) {
+    for (int32_t dim = 0; dim < output_shape[Rank - 1]; ++dim) {
+      *output = init;
+      StridedReduceImpl<Op, Type, IntCst<Rank>>::Run(
+          input, window_shape, window_reduce_strides, *output);
+      input += window_offset_strides[Rank - 1];
+      output += output_strides[Rank - 1];
+    }
+  }
+};
+
+std::array<int64_t, kMaxReduceWindowDims> ComputeStrides(
+    const int64_t* const shape, const int64_t rank) {
+  std::array<int64_t, kMaxReduceWindowDims> strides;
+  strides[rank - 1] = 1;
+  for (int64_t i = rank - 2; i >= 0; --i) {
+    strides[i] = shape[i + 1] * strides[i + 1];
+  }
+  return strides;
+}
+
+// Element-wise multiplication of the two operands of given size.
+std::array<int64_t, kMaxReduceWindowDims> Multiply(const int64_t* const vec1,
+                                                   const int64_t* const vec2,
+                                                   const int64_t size) {
+  std::array<int64_t, kMaxReduceWindowDims> result;
+  for (int64_t i = 0; i < size; ++i) {
+    result[i] = vec2[i] * vec1[i];
+  }
+  return result;
+}
+
+// Computes the output shape of the ReduceWindow operator: per dimension, the
+// number of positions where the dilated window fits entirely in the input.
+std::array<int64_t, kMaxReduceWindowDims> ComputeOutputShape(
+    const int64_t* const shape, const int64_t* const window_shape,
+    const int64_t* const window_strides, const int64_t* const window_dilations,
+    const int64_t rank) {
+  std::array<int64_t, kMaxReduceWindowDims> window_range;
+  for (int64_t i = 0; i < rank; ++i) {
+    // Extent covered by the window once the dilation gaps are accounted for.
+    const int64_t dilated_window_size =
+        (window_shape[i] - 1) * window_dilations[i] + 1;
+    // Room left for sliding; negative when the window does not fit at all.
+    const int64_t slack = shape[i] - dilated_window_size;
+    // Round down (not up): rounding up adds a window that overruns the input
+    // whenever `slack` is not a multiple of the stride.
+    window_range[i] = slack < 0 ? 0 : slack / window_strides[i] + 1;
+  }
+  return window_range;
+}
+
+template <class Op, class Type, int Rank>
+void ReduceWindow(const Type* const input, Type* output,
+                  const int64_t* const shape, const int64_t* const window_shape,
+                  const int64_t* const window_strides,
+                  const int64_t* const window_dilations, const Type init) {
+  const std::array<int64_t, kMaxReduceWindowDims> strides =
+      ComputeStrides(shape, Rank);
+  const std::array<int64_t, kMaxReduceWindowDims> window_reduce_strides =
+      Multiply(strides.data(), window_dilations, Rank);
+  const std::array<int64_t, kMaxReduceWindowDims> window_offset_strides =
+      Multiply(strides.data(), window_strides, Rank);
+  const std::array<int64_t, kMaxReduceWindowDims> output_shape =
+      ComputeOutputShape(shape, window_shape, window_strides, window_dilations,
+                         Rank);
+  const std::array<int64_t, kMaxReduceWindowDims> output_strides =
+      ComputeStrides(output_shape.data(), Rank);
+  ReduceWindowImpl<Op, Type, IntCst<Rank>>::Run(
+      input, output, output_shape.data(), output_strides.data(),
+      window_offset_strides.data(), window_shape, window_reduce_strides.data(),
+      init);
+}
+
+std::array<int64_t, kMaxReduceWindowDims> AsInt64(const int32_t* data,
+                                                  const int size) {
+  std::array<int64_t, kMaxReduceWindowDims> res;
+  std::copy_n(data, size, res.data());
+  return res;
+}
+
+// Holds the tensors and operation context for convenience.
+struct ReduceWindowContext {
+  enum InputTensorId {
+    kInput,
+    kInitValue,
+    kWindowShape,
+    kWindowStrides,
+    kWindowDilations,
+    kNumInputTensors
+  };
+  enum OutputTensorId { kOutput, kNumOutputTensors };
+
+  ReduceWindowContext(TfLiteContext* context, TfLiteNode* node)
+      : context(context),
+        node(node),
+        input_tensor(GetInput(context, node, kInput)),
+        init_value_tensor(GetInput(context, node, kInitValue)),
+        window_shape_tensor(GetInput(context, node, kWindowShape)),
+        window_strides_tensor(GetInput(context, node, kWindowStrides)),
+        window_dilations_tensor(GetInput(context, node, kWindowDilations)),
+        output_tensor(GetOutput(context, node, kOutput)) {}
+
+  TfLiteContext* context;
+  TfLiteNode* node;
+  const TfLiteTensor* input_tensor;
+  const TfLiteTensor* init_value_tensor;
+  const TfLiteTensor* window_shape_tensor;
+  const TfLiteTensor* window_strides_tensor;
+  const TfLiteTensor* window_dilations_tensor;
+  TfLiteTensor* output_tensor;
+};
+
+TfLiteStatus SetupOutputTensor(const ReduceWindowContext& ctx) {
+  const int rank = ctx.input_tensor->dims->size;
+  const std::array<int64_t, kMaxReduceWindowDims> input_shape =
+      AsInt64(ctx.input_tensor->dims->data, rank);
+  auto output_shape_data =
+      ComputeOutputShape(input_shape.data(), ctx.window_shape_tensor->data.i64,
+                         ctx.window_strides_tensor->data.i64,
+                         ctx.window_dilations_tensor->data.i64, rank);
+  IntArrayUniquePtr output_shape =
+      BuildTfLiteArray<int32_t>(rank, output_shape_data.data());
+  return ctx.context->ResizeTensor(ctx.context, ctx.output_tensor,
+                                   output_shape.release());
+}
+
+template <class Op, class Type>
+TfLiteStatus DispatchReduceWindowRank(ReduceWindowContext& ctx) {
+  const int rank = ctx.input_tensor->dims->size;
+  const std::array<int64_t, kMaxReduceWindowDims> input_shape =
+      AsInt64(ctx.input_tensor->dims->data, rank);
+#define REDUCE_WINDOW_RANK_CASE(RANK)                               \
+  case RANK:                                                        \
+    ReduceWindow<Op, Type, RANK>(                                   \
+        reinterpret_cast<const Type*>(ctx.input_tensor->data.raw),  \
+        reinterpret_cast<Type*>(ctx.output_tensor->data.raw),       \
+        input_shape.data(), ctx.window_shape_tensor->data.i64,      \
+        ctx.window_strides_tensor->data.i64,                        \
+        ctx.window_dilations_tensor->data.i64,                      \
+        *reinterpret_cast<Type*>(ctx.init_value_tensor->data.raw)); \
+    break;
+  switch (ctx.input_tensor->dims->size) {
+    REDUCE_WINDOW_RANK_CASE(1);
+    REDUCE_WINDOW_RANK_CASE(2);
+    REDUCE_WINDOW_RANK_CASE(3);
+    REDUCE_WINDOW_RANK_CASE(4);
+    REDUCE_WINDOW_RANK_CASE(5);
+    REDUCE_WINDOW_RANK_CASE(6);
+    default:
+      return kTfLiteError;
+  }
+#undef REDUCE_WINDOW_RANK_CASE
+  return kTfLiteOk;
+}
+
+template <class Op>
+TfLiteStatus DispatchReduceWindowType(ReduceWindowContext& ctx) {
+#define REDUCE_WINDOW_TYPE_CASE(CPP_TYPE, TENSOR_TYPE) \
+  case TENSOR_TYPE:                                    \
+    return DispatchReduceWindowRank<Op, CPP_TYPE>(ctx);
+  switch (ctx.input_tensor->type) {
+    REDUCE_WINDOW_TYPE_CASE(int8_t, kTfLiteInt8);
+    REDUCE_WINDOW_TYPE_CASE(int16_t, kTfLiteInt16);
+    REDUCE_WINDOW_TYPE_CASE(int32_t, kTfLiteInt32);
+    REDUCE_WINDOW_TYPE_CASE(int64_t, kTfLiteInt64);
+    REDUCE_WINDOW_TYPE_CASE(uint8_t, kTfLiteUInt8);
+    REDUCE_WINDOW_TYPE_CASE(uint16_t, kTfLiteUInt16);
+    REDUCE_WINDOW_TYPE_CASE(uint32_t, kTfLiteUInt32);
+    REDUCE_WINDOW_TYPE_CASE(uint64_t, kTfLiteUInt64);
+    REDUCE_WINDOW_TYPE_CASE(float, kTfLiteFloat32);
+    static_assert(sizeof(float) == 4,
+                  "float type is expected to be 32 bit long");
+    REDUCE_WINDOW_TYPE_CASE(double, kTfLiteFloat64);
+    static_assert(sizeof(double) == 8,
+                  "double type is expected to be 64 bit long");
+    default:
+      return kTfLiteError;
+  }
+#undef REDUCE_WINDOW_TYPE_CASE
+}
+
+template <class Op>
+TfLiteStatus DispatchReduceWindowOp(ReduceWindowContext& ctx) {
+  return DispatchReduceWindowType<Op>(ctx);
+}
+
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+  TF_LITE_ENSURE_EQ(context, NumInputs(node),
+                    ReduceWindowContext::kNumInputTensors);
+  TF_LITE_ENSURE_EQ(context, NumOutputs(node),
+                    ReduceWindowContext::kNumOutputTensors);
+  ReduceWindowContext ctx(context, node);
+  TF_LITE_ENSURE(context, IsConstantTensor(ctx.window_shape_tensor));
+  TF_LITE_ENSURE(context, IsConstantTensor(ctx.window_strides_tensor));
+  TF_LITE_ENSURE(context, IsConstantTensor(ctx.window_dilations_tensor));
+  TF_LITE_ENSURE(context, ctx.input_tensor->dims != nullptr);
+  TF_LITE_ENSURE(context, ctx.input_tensor->dims->size > 0);
+  TF_LITE_ENSURE(context, ctx.input_tensor->dims->size <= kMaxReduceWindowDims);
+  return SetupOutputTensor(ctx);
+}
+
+struct Max {
+  template <class T>
+  constexpr T operator()(const T& a, const T& b) const {
+    return a >= b ? a : b;
+  }
+};
+
+struct Min {
+  template <class T>
+  constexpr T operator()(const T& a, const T& b) const {
+    return a <= b ? a : b;
+  }
+};
+
+TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
+  const auto* opts = static_cast<TfLiteReduceWindowParams*>(node->builtin_data);
+  ReduceWindowContext ctx(context, node);
+  switch (opts->reduce_function) {
+    case TfLiteReduceWindowFunctionAdd:
+      return DispatchReduceWindowOp<std::plus<>>(ctx);
+    case TfLiteReduceWindowFunctionMul:
+      return DispatchReduceWindowOp<std::multiplies<>>(ctx);
+    case TfLiteReduceWindowFunctionAll:
+      return DispatchReduceWindowOp<std::logical_and<>>(ctx);
+    case TfLiteReduceWindowFunctionAny:
+      return DispatchReduceWindowOp<std::logical_or<>>(ctx);
+    case TfLiteReduceWindowFunctionMin:
+      return DispatchReduceWindowOp<Min>(ctx);
+    case TfLiteReduceWindowFunctionMax:
+      return DispatchReduceWindowOp<Max>(ctx);
+    case TfLiteReduceWindowFunctionUnsupported:
+      break;  // Reported through the shared error return below.
+  }
+  return kTfLiteError;  // Also reached for values outside the enum.
+}
+
+}  // namespace
+}  // namespace reduce_window
+
+TfLiteRegistration* Register_REDUCE_WINDOW() {
+  static TfLiteRegistration r = {/*.init=*/nullptr, /*.free=*/nullptr,
+                                 /*.prepare=*/reduce_window::Prepare,
+                                 /*.invoke=*/reduce_window::Eval};
+  return &r;
+}
+
+}  // namespace builtin
+}  // namespace ops
+}  // namespace tflite
diff --git a/tensorflow/lite/kernels/reduce_window_test.cc b/tensorflow/lite/kernels/reduce_window_test.cc
new file mode 100644
index 00000000000000..37f8dd82878704
--- /dev/null
+++ b/tensorflow/lite/kernels/reduce_window_test.cc
@@ -0,0 +1,253 @@
+/* Copyright 2023 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <cstdint>
+#include <functional>
+#include <vector>
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+#include "absl/algorithm/container.h"
+#include "absl/types/span.h"
+#include "tensorflow/lite/c/c_api_types.h"
+#include "tensorflow/lite/core/c/common.h"
+#include "tensorflow/lite/kernels/test_util.h"
+#include "tensorflow/lite/schema/schema_generated.h"
+
+namespace tflite {
+namespace {
+
+using testing::ElementsAre;
+
+template <class T>
+struct TensorTypeFor;
+
+#define TENSOR_TYPE_ASSOC(CPP_TYPE, TENSORTYPE_VALUE)     \
+  template <>                                             \
+  struct TensorTypeFor<CPP_TYPE> {                        \
+    static constexpr TensorType value = TENSORTYPE_VALUE; \
+  };
+
+TENSOR_TYPE_ASSOC(int8_t, TensorType_INT8);
+TENSOR_TYPE_ASSOC(int16_t, TensorType_INT16);
+TENSOR_TYPE_ASSOC(int32_t, TensorType_INT32);
+TENSOR_TYPE_ASSOC(int64_t, TensorType_INT64);
+
+TENSOR_TYPE_ASSOC(uint8_t, TensorType_UINT8);
+TENSOR_TYPE_ASSOC(uint16_t, TensorType_UINT16);
+TENSOR_TYPE_ASSOC(uint32_t, TensorType_UINT32);
+TENSOR_TYPE_ASSOC(uint64_t, TensorType_UINT64);
+
+TENSOR_TYPE_ASSOC(float, TensorType_FLOAT32);
+static_assert(sizeof(float) == 4, "float type is expected to be 32 bit long");
+TENSOR_TYPE_ASSOC(double, TensorType_FLOAT64);
+static_assert(sizeof(double) == 8, "double type is expected to be 64 bit long");
+
+template <class Container>
+int32_t ssize(const Container& c) {
+  return static_cast<int32_t>(c.size());
+}
+
+template <class T>
+class DilateOpModel : public SingleOpModel {
+  static constexpr TensorType kTensorType = TensorTypeFor<T>::value;
+
+ public:
+  void SetInput(absl::Span<const int32_t> shape,
+                absl::Span<const T> data = {}) {
+    input_shape_.assign(shape.begin(), shape.end());
+    if (data.empty()) {
+      input_data_.resize(absl::c_accumulate(shape, 1, std::multiplies<int>()));
+      absl::c_iota(input_data_, 1);
+    } else {
+      input_data_.assign(data.begin(), data.end());
+    }
+  }
+
+  void SetWindowShape(absl::Span<const int64_t> shape) {
+    window_shape_data_.assign(shape.begin(), shape.end());
+  }
+
+  // Note: the strides are counted in elements on the tensor grid not in the
+  // underlying buffer.
+  //
+  // For instance, {2,2} on the following matrix starting at element 1 will
+  // reach elements 3 (+2 horizontally), 7 (+2 vertically) and 9 (+2 vertically,
+  // +2 horizontally):
+  //
+  // 1 2 3
+  // 4 5 6
+  // 7 8 9
+  void SetWindowStrides(absl::Span<const int64_t> strides) {
+    window_strides_data_.assign(strides.begin(), strides.end());
+  }
+
+  void SetWindowDilations(absl::Span<const int64_t> dilations) {
+    window_dilations_data_.assign(dilations.begin(), dilations.end());
+  }
+
+  void SetInitValue(const T& val) { init_value_data_ = val; }
+
+  void Build() {
+    input_ = AddInput({kTensorType, input_shape_});
+    init_value_ = AddConstInput(kTensorType, {init_value_data_}, {1});
+    window_shape_ = AddConstInput(TensorType_INT64, window_shape_data_,
+                                  {ssize(window_shape_data_)});
+    window_strides_ = AddConstInput(TensorType_INT64, window_strides_data_,
+                                    {ssize(window_strides_data_)});
+    window_dilations_ = AddConstInput(TensorType_INT64, window_dilations_data_,
+                                      {ssize(window_dilations_data_)});
+    output_ = AddOutput(kTensorType);
+    SetBuiltinOp(
+        BuiltinOperator_REDUCE_WINDOW, BuiltinOptions2_ReduceWindowOptions,
+        CreateReduceWindowOptions(builder_, ReduceWindowFunction_ADD).Union());
+    BuildInterpreter({input_shape_});
+    PopulateTensor(input_, input_data_);
+  }
+
+  TfLiteStatus BuildAndInvoke() {
+    Build();
+    return Invoke();
+  }
+
+  absl::Span<const T> GetOutputData() {
+    return absl::Span<const T>(interpreter_->typed_tensor<T>(output_),
+                               GetTensorSize(output_));
+  }
+
+  absl::Span<const int> GetOutputShape() {
+    const TfLiteIntArray& shape = *(interpreter_->tensor(output_)->dims);
+    return absl::Span<const int>(shape.data, shape.size);
+  }
+
+  const std::vector<T>& GetInput() const { return input_data_; }
+  const std::vector<int32_t>& GetInputShape() const { return input_shape_; }
+  const std::vector<int64_t>& GetWindowShape() const {
+    return window_shape_data_;
+  }
+  const std::vector<int64_t>& GetWindowStrides() const {
+    return window_strides_data_;
+  }
+  const std::vector<int64_t>& GetWindowDilations() const {
+    return window_dilations_data_;
+  }
+  const T& GetInitValue() const { return init_value_data_; }
+
+ protected:
+  int input_ = -1;
+  int window_shape_ = -1;
+  int window_strides_ = -1;
+  int window_dilations_ = -1;
+  int init_value_ = -1;
+  int output_ = -1;
+  std::vector<T> input_data_;
+  T init_value_data_;
+  std::vector<int32_t> input_shape_;
+  std::vector<int64_t> window_shape_data_;
+  std::vector<int64_t> window_strides_data_;
+  std::vector<int64_t> window_dilations_data_;
+};
+
+template <class StorageType>
+class ReduceWindowTest : public testing::Test {
+ protected:
+  DilateOpModel<StorageType> model_;
+};
+
+using TestList = testing::Types<int8_t, int16_t, int32_t, int64_t, uint8_t,
+                                uint16_t, uint32_t, uint64_t, float, double>;
+
+TYPED_TEST_SUITE(ReduceWindowTest, TestList);
+
+TYPED_TEST(ReduceWindowTest, FullWindow) {
+  auto& model = this->model_;
+  model.SetInput(/*shape=*/{3, 3});
+  model.SetWindowShape({3, 3});
+  model.SetWindowStrides({1, 1});
+  model.SetWindowDilations({1, 1});
+  model.SetInitValue(0);
+
+  EXPECT_EQ(this->model_.BuildAndInvoke(), kTfLiteOk);
+  EXPECT_THAT(this->model_.GetOutputShape(), ElementsAre(1, 1));
+  EXPECT_THAT(this->model_.GetOutputData(), ElementsAre(45));
+}
+
+TYPED_TEST(ReduceWindowTest, NoDilation) {
+  auto& model = this->model_;
+  model.SetInput(/*shape=*/{3, 3});
+  model.SetWindowShape({2, 2});
+  model.SetWindowStrides({1, 1});
+  model.SetWindowDilations({1, 1});
+  model.SetInitValue(0);
+
+  EXPECT_EQ(this->model_.BuildAndInvoke(), kTfLiteOk);
+  EXPECT_THAT(this->model_.GetOutputShape(), ElementsAre(2, 2));
+  EXPECT_THAT(this->model_.GetOutputData(), ElementsAre(12, 16, 24, 28));
+}
+
+TYPED_TEST(ReduceWindowTest, FullWindowWithDilation) {
+  auto& model = this->model_;
+  model.SetInput(/*shape=*/{3, 3});
+  model.SetWindowShape({2, 2});
+  model.SetWindowStrides({1, 1});
+  model.SetWindowDilations({2, 2});
+  model.SetInitValue(0);
+
+  EXPECT_EQ(this->model_.BuildAndInvoke(), kTfLiteOk);
+  EXPECT_THAT(this->model_.GetOutputShape(), ElementsAre(1, 1));
+  EXPECT_THAT(this->model_.GetOutputData(), ElementsAre(20));
+}
+
+TYPED_TEST(ReduceWindowTest, WithDilation) {
+  auto& model = this->model_;
+  model.SetInput(/*shape=*/{4, 4});
+  model.SetWindowShape({2, 2});
+  model.SetWindowStrides({1, 1});
+  model.SetWindowDilations({2, 2});
+  model.SetInitValue(0);
+
+  EXPECT_EQ(this->model_.BuildAndInvoke(), kTfLiteOk);
+  EXPECT_THAT(this->model_.GetOutputShape(), ElementsAre(2, 2));
+  EXPECT_THAT(this->model_.GetOutputData(), ElementsAre(24, 28, 40, 44));
+}
+
+TYPED_TEST(ReduceWindowTest, WithStrides) {
+  auto& model = this->model_;
+  model.SetInput(/*shape=*/{4, 4});
+  model.SetWindowShape({2, 2});
+  model.SetWindowStrides({2, 2});
+  model.SetWindowDilations({1, 1});
+  model.SetInitValue(0);
+
+  EXPECT_EQ(this->model_.BuildAndInvoke(), kTfLiteOk);
+  EXPECT_THAT(this->model_.GetOutputShape(), ElementsAre(2, 2));
+  EXPECT_THAT(this->model_.GetOutputData(), ElementsAre(14, 22, 46, 54));
+}
+
+TYPED_TEST(ReduceWindowTest, WithDilationAndStrides) {
+  auto& model = this->model_;
+  model.SetInput(/*shape=*/{5, 5});
+  model.SetWindowShape({2, 2});
+  model.SetWindowStrides({2, 2});
+  model.SetWindowDilations({2, 2});
+  model.SetInitValue(2);
+
+  EXPECT_EQ(this->model_.BuildAndInvoke(), kTfLiteOk);
+  EXPECT_THAT(this->model_.GetOutputShape(), ElementsAre(2, 2));
+  EXPECT_THAT(this->model_.GetOutputData(), ElementsAre(30, 38, 70, 78));
+}
+
+}  // namespace
+}  // namespace tflite
diff --git a/tensorflow/lite/kernels/register_ref.cc b/tensorflow/lite/kernels/register_ref.cc
index 169d5ca1c3046a..25d64c8249a198 100644
--- a/tensorflow/lite/kernels/register_ref.cc
+++ b/tensorflow/lite/kernels/register_ref.cc
@@ -187,6 +187,7 @@ TfLiteRegistration* Register_RIGHT_SHIFT();
 TfLiteRegistration* Register_STABLEHLO_SCATTER();
 TfLiteRegistration* Register_DILATE();
 TfLiteRegistration* Register_STABLEHLO_RNG_BIT_GENERATOR();
+TfLiteRegistration* Register_REDUCE_WINDOW();
 
 namespace {
 
@@ -543,6 +544,7 @@ BuiltinRefOpResolver::BuiltinRefOpResolver() {
   AddBuiltin(BuiltinOperator_DILATE, Register_DILATE());
   AddBuiltin(BuiltinOperator_STABLEHLO_RNG_BIT_GENERATOR,
              Register_STABLEHLO_RNG_BIT_GENERATOR());
+  AddBuiltin(BuiltinOperator_REDUCE_WINDOW, Register_REDUCE_WINDOW());
   AddCustom("NumericVerify",
             tflite::ops::custom::Register_NUMERIC_VERIFY_REF());
   // TODO(andrewharp, ahentz): Move these somewhere more appropriate so that
diff --git a/tensorflow/lite/schema/schema.fbs b/tensorflow/lite/schema/schema.fbs
index e52f30ca16f7c3..993c069e4612c8 100644
--- a/tensorflow/lite/schema/schema.fbs
+++ b/tensorflow/lite/schema/schema.fbs
@@ -469,6 +469,7 @@ enum BuiltinOperator : int32 {
   STABLEHLO_TRANSPOSE = 202, // WARNING: No runtime support
   DILATE = 203,
   STABLEHLO_RNG_BIT_GENERATOR = 204,
+  REDUCE_WINDOW = 205,
 }
 // LINT.ThenChange(nnapi_linter/linter.proto)
 
@@ -625,6 +626,7 @@ union BuiltinOptions2{
   StablehloTransposeOptions,
   DilateOptions,
   StablehloRngBitGeneratorOptions,
+  ReduceWindowOptions,
 }
 
 table StablehloGatherOptions{
@@ -1446,6 +1448,20 @@ table RightShiftOptions {
 table DilateOptions {
 }
 
+enum ReduceWindowFunction : int {
+  UNSUPPORTED,
+  ADD,
+  MUL,
+  MINIMUM,
+  MAXIMUM,
+  ALL,
+  ANY,
+}
+
+table ReduceWindowOptions{
+  reduce_function: ReduceWindowFunction;
+}
+
 // An OperatorCode can be an enum value (BuiltinOperator) if the operator is a
 // builtin, or a string if the operator is custom.
 table OperatorCode {
diff --git a/tensorflow/lite/schema/schema_generated.h b/tensorflow/lite/schema/schema_generated.h
index b86d017f6d7c97..b416555e837c3f 100755
--- a/tensorflow/lite/schema/schema_generated.h
+++ b/tensorflow/lite/schema/schema_generated.h
@@ -645,6 +645,10 @@ struct DilateOptions;
 struct DilateOptionsBuilder;
 struct DilateOptionsT;
 
+struct ReduceWindowOptions;
+struct ReduceWindowOptionsBuilder;
+struct ReduceWindowOptionsT;
+
 struct OperatorCode;
 struct OperatorCodeBuilder;
 struct OperatorCodeT;
@@ -1207,11 +1211,12 @@ enum BuiltinOperator : int32_t {
   BuiltinOperator_STABLEHLO_TRANSPOSE = 202,
   BuiltinOperator_DILATE = 203,
   BuiltinOperator_STABLEHLO_RNG_BIT_GENERATOR = 204,
+  BuiltinOperator_REDUCE_WINDOW = 205,
   BuiltinOperator_MIN = BuiltinOperator_ADD,
-  BuiltinOperator_MAX = BuiltinOperator_STABLEHLO_RNG_BIT_GENERATOR
+  BuiltinOperator_MAX = BuiltinOperator_REDUCE_WINDOW
 };
 
-inline const BuiltinOperator (&EnumValuesBuiltinOperator())[205] {
+inline const BuiltinOperator (&EnumValuesBuiltinOperator())[206] {
   static const BuiltinOperator values[] = {
     BuiltinOperator_ADD,
     BuiltinOperator_AVERAGE_POOL_2D,
@@ -1417,13 +1422,14 @@ inline const BuiltinOperator (&EnumValuesBuiltinOperator())[205] {
     BuiltinOperator_STABLEHLO_GATHER,
     BuiltinOperator_STABLEHLO_TRANSPOSE,
     BuiltinOperator_DILATE,
-    BuiltinOperator_STABLEHLO_RNG_BIT_GENERATOR
+    BuiltinOperator_STABLEHLO_RNG_BIT_GENERATOR,
+    BuiltinOperator_REDUCE_WINDOW
   };
   return values;
 }
 
 inline const char * const *EnumNamesBuiltinOperator() {
-  static const char * const names[206] = {
+  static const char * const names[207] = {
     "ADD",
     "AVERAGE_POOL_2D",
     "CONCATENATION",
@@ -1629,13 +1635,14 @@ inline const char * const *EnumNamesBuiltinOperator() {
     "STABLEHLO_TRANSPOSE",
     "DILATE",
     "STABLEHLO_RNG_BIT_GENERATOR",
+    "REDUCE_WINDOW",
     nullptr
   };
   return names;
 }
 
 inline const char *EnumNameBuiltinOperator(BuiltinOperator e) {
-  if (::flatbuffers::IsOutRange(e, BuiltinOperator_ADD, BuiltinOperator_STABLEHLO_RNG_BIT_GENERATOR)) return "";
+  if (::flatbuffers::IsOutRange(e, BuiltinOperator_ADD, BuiltinOperator_REDUCE_WINDOW)) return "";
   const size_t index = static_cast<size_t>(e);
   return EnumNamesBuiltinOperator()[index];
 }
@@ -4125,11 +4132,12 @@ enum BuiltinOptions2 : uint8_t {
   BuiltinOptions2_StablehloTransposeOptions = 17,
   BuiltinOptions2_DilateOptions = 18,
   BuiltinOptions2_StablehloRngBitGeneratorOptions = 19,
+  BuiltinOptions2_ReduceWindowOptions = 20,
   BuiltinOptions2_MIN = BuiltinOptions2_NONE,
-  BuiltinOptions2_MAX = BuiltinOptions2_StablehloRngBitGeneratorOptions
+  BuiltinOptions2_MAX = BuiltinOptions2_ReduceWindowOptions
 };
 
-inline const BuiltinOptions2 (&EnumValuesBuiltinOptions2())[20] {
+inline const BuiltinOptions2 (&EnumValuesBuiltinOptions2())[21] {
   static const BuiltinOptions2 values[] = {
     BuiltinOptions2_NONE,
     BuiltinOptions2_StablehloConcatenateOptions,
@@ -4150,13 +4158,14 @@ inline const BuiltinOptions2 (&EnumValuesBuiltinOptions2())[20] {
     BuiltinOptions2_StablehloGatherOptions,
     BuiltinOptions2_StablehloTransposeOptions,
     BuiltinOptions2_DilateOptions,
-    BuiltinOptions2_StablehloRngBitGeneratorOptions
+    BuiltinOptions2_StablehloRngBitGeneratorOptions,
+    BuiltinOptions2_ReduceWindowOptions
   };
   return values;
 }
 
 inline const char * const *EnumNamesBuiltinOptions2() {
-  static const char * const names[21] = {
+  static const char * const names[22] = {
     "NONE",
     "StablehloConcatenateOptions",
     "StablehloBroadcastInDimOptions",
@@ -4177,13 +4186,14 @@ inline const char * const *EnumNamesBuiltinOptions2() {
     "StablehloTransposeOptions",
     "DilateOptions",
     "StablehloRngBitGeneratorOptions",
+    "ReduceWindowOptions",
     nullptr
   };
   return names;
 }
 
 inline const char *EnumNameBuiltinOptions2(BuiltinOptions2 e) {
-  if (::flatbuffers::IsOutRange(e, BuiltinOptions2_NONE, BuiltinOptions2_StablehloRngBitGeneratorOptions)) return "";
+  if (::flatbuffers::IsOutRange(e, BuiltinOptions2_NONE, BuiltinOptions2_ReduceWindowOptions)) return "";
   const size_t index = static_cast<size_t>(e);
   return EnumNamesBuiltinOptions2()[index];
 }
@@ -4268,6 +4278,10 @@ template<> struct BuiltinOptions2Traits<tflite::StablehloRngBitGeneratorOptions>
   static const BuiltinOptions2 enum_value = BuiltinOptions2_StablehloRngBitGeneratorOptions;
 };
 
+template<> struct BuiltinOptions2Traits<tflite::ReduceWindowOptions> {
+  static const BuiltinOptions2 enum_value = BuiltinOptions2_ReduceWindowOptions;
+};
+
 template<typename T> struct BuiltinOptions2UnionTraits {
   static const BuiltinOptions2 enum_value = BuiltinOptions2_NONE;
 };
@@ -4348,6 +4362,10 @@ template<> struct BuiltinOptions2UnionTraits<tflite::StablehloRngBitGeneratorOpt
   static const BuiltinOptions2 enum_value = BuiltinOptions2_StablehloRngBitGeneratorOptions;
 };
 
+template<> struct BuiltinOptions2UnionTraits<tflite::ReduceWindowOptionsT> {
+  static const BuiltinOptions2 enum_value = BuiltinOptions2_ReduceWindowOptions;
+};
+
 struct BuiltinOptions2Union {
   BuiltinOptions2 type;
   void *value;
@@ -4530,6 +4548,14 @@ struct BuiltinOptions2Union {
     return type == BuiltinOptions2_StablehloRngBitGeneratorOptions ?
       reinterpret_cast<const tflite::StablehloRngBitGeneratorOptionsT *>(value) : nullptr;
   }
+  tflite::ReduceWindowOptionsT *AsReduceWindowOptions() {
+    return type == BuiltinOptions2_ReduceWindowOptions ?
+      reinterpret_cast<tflite::ReduceWindowOptionsT *>(value) : nullptr;
+  }
+  const tflite::ReduceWindowOptionsT *AsReduceWindowOptions() const {
+    return type == BuiltinOptions2_ReduceWindowOptions ?
+      reinterpret_cast<const tflite::ReduceWindowOptionsT *>(value) : nullptr;
+  }
 };
 
 bool VerifyBuiltinOptions2(::flatbuffers::Verifier &verifier, const void *obj, BuiltinOptions2 type);
@@ -4910,6 +4936,51 @@ inline const char *EnumNameMirrorPadMode(MirrorPadMode e) {
   return EnumNamesMirrorPadMode()[index];
 }
 
+enum ReduceWindowFunction : int32_t {
+  ReduceWindowFunction_UNSUPPORTED = 0,
+  ReduceWindowFunction_ADD = 1,
+  ReduceWindowFunction_MUL = 2,
+  ReduceWindowFunction_MINIMUM = 3,
+  ReduceWindowFunction_MAXIMUM = 4,
+  ReduceWindowFunction_ALL = 5,
+  ReduceWindowFunction_ANY = 6,
+  ReduceWindowFunction_MIN = ReduceWindowFunction_UNSUPPORTED,
+  ReduceWindowFunction_MAX = ReduceWindowFunction_ANY
+};
+
+inline const ReduceWindowFunction (&EnumValuesReduceWindowFunction())[7] {
+  static const ReduceWindowFunction values[] = {
+    ReduceWindowFunction_UNSUPPORTED,
+    ReduceWindowFunction_ADD,
+    ReduceWindowFunction_MUL,
+    ReduceWindowFunction_MINIMUM,
+    ReduceWindowFunction_MAXIMUM,
+    ReduceWindowFunction_ALL,
+    ReduceWindowFunction_ANY
+  };
+  return values;
+}
+
+inline const char * const *EnumNamesReduceWindowFunction() {
+  static const char * const names[8] = {
+    "UNSUPPORTED",
+    "ADD",
+    "MUL",
+    "MINIMUM",
+    "MAXIMUM",
+    "ALL",
+    "ANY",
+    nullptr
+  };
+  return names;
+}
+
+inline const char *EnumNameReduceWindowFunction(ReduceWindowFunction e) {
+  if (::flatbuffers::IsOutRange(e, ReduceWindowFunction_UNSUPPORTED, ReduceWindowFunction_ANY)) return "";
+  const size_t index = static_cast<size_t>(e);
+  return EnumNamesReduceWindowFunction()[index];
+}
+
 enum CustomOptionsFormat : int8_t {
   CustomOptionsFormat_FLEXBUFFERS = 0,
   CustomOptionsFormat_MIN = CustomOptionsFormat_FLEXBUFFERS,
@@ -7645,7 +7716,7 @@ struct Conv2DOptions FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table {
     VT_FUSED_ACTIVATION_FUNCTION = 10,
     VT_DILATION_W_FACTOR = 12,
     VT_DILATION_H_FACTOR = 14,
-    VT_quantized_bias_type = 16
+    VT_QUANTIZED_BIAS_TYPE = 16
   };
   tflite::Padding padding() const {
     return static_cast<tflite::Padding>(GetField<int8_t>(VT_PADDING, 0));
@@ -7666,7 +7737,7 @@ struct Conv2DOptions FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table {
     return GetField<int32_t>(VT_DILATION_H_FACTOR, 1);
   }
   tflite::TensorType quantized_bias_type() const {
-    return static_cast<tflite::TensorType>(GetField<int8_t>(VT_quantized_bias_type, 0));
+    return static_cast<tflite::TensorType>(GetField<int8_t>(VT_QUANTIZED_BIAS_TYPE, 0));
   }
   bool Verify(::flatbuffers::Verifier &verifier) const {
     return VerifyTableStart(verifier) &&
@@ -7676,7 +7747,7 @@ struct Conv2DOptions FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table {
            VerifyField<int8_t>(verifier, VT_FUSED_ACTIVATION_FUNCTION, 1) &&
            VerifyField<int32_t>(verifier, VT_DILATION_W_FACTOR, 4) &&
            VerifyField<int32_t>(verifier, VT_DILATION_H_FACTOR, 4) &&
-           VerifyField<int8_t>(verifier, VT_quantized_bias_type, 1) &&
+           VerifyField<int8_t>(verifier, VT_QUANTIZED_BIAS_TYPE, 1) &&
            verifier.EndTable();
   }
   Conv2DOptionsT *UnPack(const ::flatbuffers::resolver_function_t *_resolver = nullptr) const;
@@ -7707,7 +7778,7 @@ struct Conv2DOptionsBuilder {
     fbb_.AddElement<int32_t>(Conv2DOptions::VT_DILATION_H_FACTOR, dilation_h_factor, 1);
   }
   void add_quantized_bias_type(tflite::TensorType quantized_bias_type) {
-    fbb_.AddElement<int8_t>(Conv2DOptions::VT_quantized_bias_type, static_cast<int8_t>(quantized_bias_type), 0);
+    fbb_.AddElement<int8_t>(Conv2DOptions::VT_QUANTIZED_BIAS_TYPE, static_cast<int8_t>(quantized_bias_type), 0);
   }
   explicit Conv2DOptionsBuilder(::flatbuffers::FlatBufferBuilder &_fbb)
         : fbb_(_fbb) {
@@ -8551,7 +8622,7 @@ struct FullyConnectedOptions FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Ta
     VT_WEIGHTS_FORMAT = 6,
     VT_KEEP_NUM_DIMS = 8,
     VT_ASYMMETRIC_QUANTIZE_INPUTS = 10,
-    VT_quantized_bias_type = 12
+    VT_QUANTIZED_BIAS_TYPE = 12
   };
   tflite::ActivationFunctionType fused_activation_function() const {
     return static_cast<tflite::ActivationFunctionType>(GetField<int8_t>(VT_FUSED_ACTIVATION_FUNCTION, 0));
@@ -8566,7 +8637,7 @@ struct FullyConnectedOptions FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Ta
     return GetField<uint8_t>(VT_ASYMMETRIC_QUANTIZE_INPUTS, 0) != 0;
   }
   tflite::TensorType quantized_bias_type() const {
-    return static_cast<tflite::TensorType>(GetField<int8_t>(VT_quantized_bias_type, 0));
+    return static_cast<tflite::TensorType>(GetField<int8_t>(VT_QUANTIZED_BIAS_TYPE, 0));
   }
   bool Verify(::flatbuffers::Verifier &verifier) const {
     return VerifyTableStart(verifier) &&
@@ -8574,7 +8645,7 @@ struct FullyConnectedOptions FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Ta
            VerifyField<int8_t>(verifier, VT_WEIGHTS_FORMAT, 1) &&
            VerifyField<uint8_t>(verifier, VT_KEEP_NUM_DIMS, 1) &&
            VerifyField<uint8_t>(verifier, VT_ASYMMETRIC_QUANTIZE_INPUTS, 1) &&
-           VerifyField<int8_t>(verifier, VT_quantized_bias_type, 1) &&
+           VerifyField<int8_t>(verifier, VT_QUANTIZED_BIAS_TYPE, 1) &&
            verifier.EndTable();
   }
   FullyConnectedOptionsT *UnPack(const ::flatbuffers::resolver_function_t *_resolver = nullptr) const;
@@ -8599,7 +8670,7 @@ struct FullyConnectedOptionsBuilder {
     fbb_.AddElement<uint8_t>(FullyConnectedOptions::VT_ASYMMETRIC_QUANTIZE_INPUTS, static_cast<uint8_t>(asymmetric_quantize_inputs), 0);
   }
   void add_quantized_bias_type(tflite::TensorType quantized_bias_type) {
-    fbb_.AddElement<int8_t>(FullyConnectedOptions::VT_quantized_bias_type, static_cast<int8_t>(quantized_bias_type), 0);
+    fbb_.AddElement<int8_t>(FullyConnectedOptions::VT_QUANTIZED_BIAS_TYPE, static_cast<int8_t>(quantized_bias_type), 0);
   }
   explicit FullyConnectedOptionsBuilder(::flatbuffers::FlatBufferBuilder &_fbb)
         : fbb_(_fbb) {
@@ -11205,7 +11276,7 @@ struct TransposeConvOptions FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Tab
     VT_STRIDE_W = 6,
     VT_STRIDE_H = 8,
     VT_FUSED_ACTIVATION_FUNCTION = 10,
-    VT_quantized_bias_type = 12
+    VT_QUANTIZED_BIAS_TYPE = 12
   };
   tflite::Padding padding() const {
     return static_cast<tflite::Padding>(GetField<int8_t>(VT_PADDING, 0));
@@ -11216,12 +11287,11 @@ struct TransposeConvOptions FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Tab
   int32_t stride_h() const {
     return GetField<int32_t>(VT_STRIDE_H, 0);
   }
-
   tflite::ActivationFunctionType fused_activation_function() const {
     return static_cast<tflite::ActivationFunctionType>(GetField<int8_t>(VT_FUSED_ACTIVATION_FUNCTION, 0));
   }
   tflite::TensorType quantized_bias_type() const {
-    return static_cast<tflite::TensorType>(GetField<int8_t>(VT_quantized_bias_type, 0));
+    return static_cast<tflite::TensorType>(GetField<int8_t>(VT_QUANTIZED_BIAS_TYPE, 0));
   }
   bool Verify(::flatbuffers::Verifier &verifier) const {
     return VerifyTableStart(verifier) &&
@@ -11229,7 +11299,7 @@ struct TransposeConvOptions FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Tab
            VerifyField<int32_t>(verifier, VT_STRIDE_W, 4) &&
            VerifyField<int32_t>(verifier, VT_STRIDE_H, 4) &&
            VerifyField<int8_t>(verifier, VT_FUSED_ACTIVATION_FUNCTION, 1) &&
-           VerifyField<int8_t>(verifier, VT_quantized_bias_type, 1) &&
+           VerifyField<int8_t>(verifier, VT_QUANTIZED_BIAS_TYPE, 1) &&
            verifier.EndTable();
   }
   TransposeConvOptionsT *UnPack(const ::flatbuffers::resolver_function_t *_resolver = nullptr) const;
@@ -11254,7 +11324,7 @@ struct TransposeConvOptionsBuilder {
     fbb_.AddElement<int8_t>(TransposeConvOptions::VT_FUSED_ACTIVATION_FUNCTION, static_cast<int8_t>(fused_activation_function), 0);
   }
   void add_quantized_bias_type(tflite::TensorType quantized_bias_type) {
-    fbb_.AddElement<int8_t>(TransposeConvOptions::VT_quantized_bias_type, static_cast<int8_t>(quantized_bias_type), 0);
+    fbb_.AddElement<int8_t>(TransposeConvOptions::VT_QUANTIZED_BIAS_TYPE, static_cast<int8_t>(quantized_bias_type), 0);
   }
   explicit TransposeConvOptionsBuilder(::flatbuffers::FlatBufferBuilder &_fbb)
         : fbb_(_fbb) {
@@ -11277,8 +11347,8 @@ inline ::flatbuffers::Offset<TransposeConvOptions> CreateTransposeConvOptions(
   TransposeConvOptionsBuilder builder_(_fbb);
   builder_.add_stride_h(stride_h);
   builder_.add_stride_w(stride_w);
-  builder_.add_fused_activation_function(fused_activation_function);
   builder_.add_quantized_bias_type(quantized_bias_type);
+  builder_.add_fused_activation_function(fused_activation_function);
   builder_.add_padding(padding);
   return builder_.Finish();
 }
@@ -14386,6 +14456,58 @@ inline ::flatbuffers::Offset<DilateOptions> CreateDilateOptions(
 
 ::flatbuffers::Offset<DilateOptions> CreateDilateOptions(::flatbuffers::FlatBufferBuilder &_fbb, const DilateOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr);
 
+struct ReduceWindowOptionsT : public ::flatbuffers::NativeTable {
+  typedef ReduceWindowOptions TableType;
+  tflite::ReduceWindowFunction reduce_function = tflite::ReduceWindowFunction_UNSUPPORTED;
+};
+
+struct ReduceWindowOptions FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table {
+  typedef ReduceWindowOptionsT NativeTableType;
+  typedef ReduceWindowOptionsBuilder Builder;
+  enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE {
+    VT_REDUCE_FUNCTION = 4
+  };
+  tflite::ReduceWindowFunction reduce_function() const {
+    return static_cast<tflite::ReduceWindowFunction>(GetField<int32_t>(VT_REDUCE_FUNCTION, 0));
+  }
+  bool Verify(::flatbuffers::Verifier &verifier) const {
+    return VerifyTableStart(verifier) &&
+           VerifyField<int32_t>(verifier, VT_REDUCE_FUNCTION, 4) &&
+           verifier.EndTable();
+  }
+  ReduceWindowOptionsT *UnPack(const ::flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  void UnPackTo(ReduceWindowOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  static ::flatbuffers::Offset<ReduceWindowOptions> Pack(::flatbuffers::FlatBufferBuilder &_fbb, const ReduceWindowOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr);
+};
+
+struct ReduceWindowOptionsBuilder {
+  typedef ReduceWindowOptions Table;
+  ::flatbuffers::FlatBufferBuilder &fbb_;
+  ::flatbuffers::uoffset_t start_;
+  void add_reduce_function(tflite::ReduceWindowFunction reduce_function) {
+    fbb_.AddElement<int32_t>(ReduceWindowOptions::VT_REDUCE_FUNCTION, static_cast<int32_t>(reduce_function), 0);
+  }
+  explicit ReduceWindowOptionsBuilder(::flatbuffers::FlatBufferBuilder &_fbb)
+        : fbb_(_fbb) {
+    start_ = fbb_.StartTable();
+  }
+  ::flatbuffers::Offset<ReduceWindowOptions> Finish() {
+    const auto end = fbb_.EndTable(start_);
+    auto o = ::flatbuffers::Offset<ReduceWindowOptions>(end);
+    return o;
+  }
+};
+
+inline ::flatbuffers::Offset<ReduceWindowOptions> CreateReduceWindowOptions(
+    ::flatbuffers::FlatBufferBuilder &_fbb,
+    tflite::ReduceWindowFunction reduce_function = tflite::ReduceWindowFunction_UNSUPPORTED) {
+  ReduceWindowOptionsBuilder builder_(_fbb);
+  builder_.add_reduce_function(reduce_function);
+  return builder_.Finish();
+}
+
+::flatbuffers::Offset<ReduceWindowOptions> CreateReduceWindowOptions(::flatbuffers::FlatBufferBuilder &_fbb, const ReduceWindowOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr);
+
 struct OperatorCodeT : public ::flatbuffers::NativeTable {
   typedef OperatorCode TableType;
   int8_t deprecated_builtin_code = 0;
@@ -14996,6 +15118,9 @@ struct Operator FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table {
   const tflite::StablehloRngBitGeneratorOptions *builtin_options_2_as_StablehloRngBitGeneratorOptions() const {
     return builtin_options_2_type() == tflite::BuiltinOptions2_StablehloRngBitGeneratorOptions ? static_cast<const tflite::StablehloRngBitGeneratorOptions *>(builtin_options_2()) : nullptr;
   }
+  const tflite::ReduceWindowOptions *builtin_options_2_as_ReduceWindowOptions() const {
+    return builtin_options_2_type() == tflite::BuiltinOptions2_ReduceWindowOptions ? static_cast<const tflite::ReduceWindowOptions *>(builtin_options_2()) : nullptr;
+  }
   bool Verify(::flatbuffers::Verifier &verifier) const {
     return VerifyTableStart(verifier) &&
            VerifyField<uint32_t>(verifier, VT_OPCODE_INDEX, 4) &&
@@ -15605,6 +15730,10 @@ template<> inline const tflite::StablehloRngBitGeneratorOptions *Operator::built
   return builtin_options_2_as_StablehloRngBitGeneratorOptions();
 }
 
+template<> inline const tflite::ReduceWindowOptions *Operator::builtin_options_2_as<tflite::ReduceWindowOptions>() const {
+  return builtin_options_2_as_ReduceWindowOptions();
+}
+
 struct OperatorBuilder {
   typedef Operator Table;
   ::flatbuffers::FlatBufferBuilder &fbb_;
@@ -20716,6 +20845,32 @@ inline ::flatbuffers::Offset<DilateOptions> CreateDilateOptions(::flatbuffers::F
       _fbb);
 }
 
+inline ReduceWindowOptionsT *ReduceWindowOptions::UnPack(const ::flatbuffers::resolver_function_t *_resolver) const {
+  auto _o = std::unique_ptr<ReduceWindowOptionsT>(new ReduceWindowOptionsT());
+  UnPackTo(_o.get(), _resolver);
+  return _o.release();
+}
+
+inline void ReduceWindowOptions::UnPackTo(ReduceWindowOptionsT *_o, const ::flatbuffers::resolver_function_t *_resolver) const {
+  (void)_o;
+  (void)_resolver;
+  { auto _e = reduce_function(); _o->reduce_function = _e; }
+}
+
+inline ::flatbuffers::Offset<ReduceWindowOptions> ReduceWindowOptions::Pack(::flatbuffers::FlatBufferBuilder &_fbb, const ReduceWindowOptionsT* _o, const ::flatbuffers::rehasher_function_t *_rehasher) {
+  return CreateReduceWindowOptions(_fbb, _o, _rehasher);
+}
+
+inline ::flatbuffers::Offset<ReduceWindowOptions> CreateReduceWindowOptions(::flatbuffers::FlatBufferBuilder &_fbb, const ReduceWindowOptionsT *_o, const ::flatbuffers::rehasher_function_t *_rehasher) {
+  (void)_rehasher;
+  (void)_o;
+  struct _VectorArgs { ::flatbuffers::FlatBufferBuilder *__fbb; const ReduceWindowOptionsT* __o; const ::flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va;
+  auto _reduce_function = _o->reduce_function;
+  return tflite::CreateReduceWindowOptions(
+      _fbb,
+      _reduce_function);
+}
+
 inline OperatorCodeT *OperatorCode::UnPack(const ::flatbuffers::resolver_function_t *_resolver) const {
   auto _o = std::unique_ptr<OperatorCodeT>(new OperatorCodeT());
   UnPackTo(_o.get(), _resolver);
@@ -24049,6 +24204,10 @@ inline bool VerifyBuiltinOptions2(::flatbuffers::Verifier &verifier, const void
       auto ptr = reinterpret_cast<const tflite::StablehloRngBitGeneratorOptions *>(obj);
       return verifier.VerifyTable(ptr);
     }
+    case BuiltinOptions2_ReduceWindowOptions: {
+      auto ptr = reinterpret_cast<const tflite::ReduceWindowOptions *>(obj);
+      return verifier.VerifyTable(ptr);
+    }
     default: return true;
   }
 }
@@ -24144,6 +24303,10 @@ inline void *BuiltinOptions2Union::UnPack(const void *obj, BuiltinOptions2 type,
       auto ptr = reinterpret_cast<const tflite::StablehloRngBitGeneratorOptions *>(obj);
       return ptr->UnPack(resolver);
     }
+    case BuiltinOptions2_ReduceWindowOptions: {
+      auto ptr = reinterpret_cast<const tflite::ReduceWindowOptions *>(obj);
+      return ptr->UnPack(resolver);
+    }
     default: return nullptr;
   }
 }
@@ -24227,6 +24390,10 @@ inline ::flatbuffers::Offset<void> BuiltinOptions2Union::Pack(::flatbuffers::Fla
       auto ptr = reinterpret_cast<const tflite::StablehloRngBitGeneratorOptionsT *>(value);
       return CreateStablehloRngBitGeneratorOptions(_fbb, ptr, _rehasher).Union();
     }
+    case BuiltinOptions2_ReduceWindowOptions: {
+      auto ptr = reinterpret_cast<const tflite::ReduceWindowOptionsT *>(value);
+      return CreateReduceWindowOptions(_fbb, ptr, _rehasher).Union();
+    }
     default: return 0;
   }
 }
@@ -24309,6 +24476,10 @@ inline BuiltinOptions2Union::BuiltinOptions2Union(const BuiltinOptions2Union &u)
       value = new tflite::StablehloRngBitGeneratorOptionsT(*reinterpret_cast<tflite::StablehloRngBitGeneratorOptionsT *>(u.value));
       break;
     }
+    case BuiltinOptions2_ReduceWindowOptions: {
+      value = new tflite::ReduceWindowOptionsT(*reinterpret_cast<tflite::ReduceWindowOptionsT *>(u.value));
+      break;
+    }
     default:
       break;
   }
@@ -24411,6 +24582,11 @@ inline void BuiltinOptions2Union::Reset() {
       delete ptr;
       break;
     }
+    case BuiltinOptions2_ReduceWindowOptions: {
+      auto ptr = reinterpret_cast<tflite::ReduceWindowOptionsT *>(value);
+      delete ptr;
+      break;
+    }
     default: break;
   }
   value = nullptr;
diff --git a/tensorflow/lite/tools/serialization/option_writer_generator.cc b/tensorflow/lite/tools/serialization/option_writer_generator.cc
index 5baf9e5ea121d1..2d1f8452547d03 100644
--- a/tensorflow/lite/tools/serialization/option_writer_generator.cc
+++ b/tensorflow/lite/tools/serialization/option_writer_generator.cc
@@ -106,6 +106,7 @@ static const char* param_structs[] = {"TfLiteAddParams",
                                       "TfLiteUnsortedSegmentMinParams",
                                       "TfLiteBitwiseXorParams",
                                       "TfLiteRightShiftParams",
+                                      "TfLiteReduceWindowParams",
                                       nullptr};
 }  // namespace
 
diff --git a/tensorflow/lite/tools/versioning/runtime_version.cc b/tensorflow/lite/tools/versioning/runtime_version.cc
index 77c6f44d6e8d50..67157cd2d21ad7 100644
--- a/tensorflow/lite/tools/versioning/runtime_version.cc
+++ b/tensorflow/lite/tools/versioning/runtime_version.cc
@@ -431,7 +431,8 @@ std::string FindMinimumRuntimeVersionForOp(tflite::BuiltinOperator op_code,
            {{BuiltinOperator_RIGHT_SHIFT, 1}, "2.13.0"},
            {{BuiltinOperator_STABLEHLO_SCATTER, 1}, "2.15.0"},
            {{BuiltinOperator_DILATE, 1}, "2.15.0"},
-           {{BuiltinOperator_STABLEHLO_RNG_BIT_GENERATOR, 1}, "2.15.0"}});
+           {{BuiltinOperator_STABLEHLO_RNG_BIT_GENERATOR, 1}, "2.15.0"},
+           {{BuiltinOperator_REDUCE_WINDOW, 1}, "2.15.0"}});
 
   std::pair<BuiltinOperator, int> version_key = {op_code, op_version};
   auto it = op_version_map->find(version_key);

From 96368f2e618af97a1d40a70447315610259031cd Mon Sep 17 00:00:00 2001
From: cjflan <89868659+cjflan@users.noreply.github.com>
Date: Thu, 28 Sep 2023 12:06:47 -0500
Subject: [PATCH 381/567] Update release branch to 2.15

---
 .../ci_build/osx/arm64/tensorflow_as_build_release.Jenkinsfile  | 2 +-
 .../ci_build/osx/arm64/tensorflow_as_test_release.Jenkinsfile   | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/tensorflow/tools/ci_build/osx/arm64/tensorflow_as_build_release.Jenkinsfile b/tensorflow/tools/ci_build/osx/arm64/tensorflow_as_build_release.Jenkinsfile
index 45b47207aacfbb..05d096d6df7a28 100644
--- a/tensorflow/tools/ci_build/osx/arm64/tensorflow_as_build_release.Jenkinsfile
+++ b/tensorflow/tools/ci_build/osx/arm64/tensorflow_as_build_release.Jenkinsfile
@@ -17,7 +17,7 @@ limitations under the License.
 pipeline {
     agent none
     environment {
-        RELEASE_BRANCH = 'r2.14'
+        RELEASE_BRANCH = 'r2.15'
     }
     stages {
         stage("Build Tensorflow") {
diff --git a/tensorflow/tools/ci_build/osx/arm64/tensorflow_as_test_release.Jenkinsfile b/tensorflow/tools/ci_build/osx/arm64/tensorflow_as_test_release.Jenkinsfile
index f4c6f23fc74b53..4b10cd8911695d 100644
--- a/tensorflow/tools/ci_build/osx/arm64/tensorflow_as_test_release.Jenkinsfile
+++ b/tensorflow/tools/ci_build/osx/arm64/tensorflow_as_test_release.Jenkinsfile
@@ -17,7 +17,7 @@ limitations under the License.
 pipeline {
     agent none
     environment {
-        RELEASE_BRANCH = 'r2.14'
+        RELEASE_BRANCH = 'r2.15'
     }
     stages {
         stage("Build Tensorflow") {

From c97bf4d1d92b5fbd60f52fe783be378aa72c869a Mon Sep 17 00:00:00 2001
From: Matt Callanan <mpcallanan@google.com>
Date: Thu, 28 Sep 2023 10:10:47 -0700
Subject: [PATCH 382/567] #tf-data-service Increase version number for internal
 change.

PiperOrigin-RevId: 569217384
---
 tensorflow/core/data/service/common.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/core/data/service/common.h b/tensorflow/core/data/service/common.h
index c0caa0c81e4a11..873c8361325840 100644
--- a/tensorflow/core/data/service/common.h
+++ b/tensorflow/core/data/service/common.h
@@ -32,7 +32,7 @@ namespace data {
 
 // Increment this when making backwards-incompatible changes to communication
 // between tf.data clients and servers.
-constexpr int kDataServiceVersion = 8;
+constexpr int kDataServiceVersion = 9;
 
 // If the user starts a colocated tf.data worker on each TF host, the worker
 // will be applied a "COLOCATED" tag. This is used to avoid reading from tf.data

From 98ef794c0ae6ea7e52ef843175db1871fa761966 Mon Sep 17 00:00:00 2001
From: Hye Soo Yang <hyey@google.com>
Date: Thu, 28 Sep 2023 10:34:39 -0700
Subject: [PATCH 383/567] Open source `_pywrap_tpu_embedding` for SparseCore.

PiperOrigin-RevId: 569224412
---
 tensorflow/compiler/jit/BUILD                 |   1 +
 tensorflow/compiler/jit/flags.cc              |  30 +++++-
 tensorflow/compiler/jit/flags.h               |  17 +++
 tensorflow/core/tpu/kernels/BUILD             |  12 ++-
 .../core/tpu/kernels/sparse_core_ops_utils.cc | 101 ++++++++++++++++++
 .../core/tpu/kernels/sparse_core_ops_utils.h  |  17 +++
 .../kernels/sparse_core_xla_flags_defaults.h  |  29 +++++
 tensorflow/python/tpu/BUILD                   |  22 +++-
 .../python/tpu/_pywrap_tpu_embedding.pyi      |  18 ++++
 tensorflow/python/tpu/pywrap_tpu_embedding.cc |  35 ++++++
 10 files changed, 279 insertions(+), 3 deletions(-)
 create mode 100644 tensorflow/core/tpu/kernels/sparse_core_xla_flags_defaults.h
 create mode 100644 tensorflow/python/tpu/_pywrap_tpu_embedding.pyi
 create mode 100644 tensorflow/python/tpu/pywrap_tpu_embedding.cc

diff --git a/tensorflow/compiler/jit/BUILD b/tensorflow/compiler/jit/BUILD
index 3c9748984b1115..5c75b8b6e0c0a7 100644
--- a/tensorflow/compiler/jit/BUILD
+++ b/tensorflow/compiler/jit/BUILD
@@ -409,6 +409,7 @@ cc_library(
         "//tensorflow/core:framework_types_hdr",
         "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core/tpu/kernels:sparse_core_xla_flags_defaults",
         "@com_google_absl//absl/base",
         "@com_google_absl//absl/container:flat_hash_set",
         "@com_google_absl//absl/strings",
diff --git a/tensorflow/compiler/jit/flags.cc b/tensorflow/compiler/jit/flags.cc
index 983eb64e2b4e60..46d185f79a0438 100644
--- a/tensorflow/compiler/jit/flags.cc
+++ b/tensorflow/compiler/jit/flags.cc
@@ -27,6 +27,7 @@ limitations under the License.
 #include "tensorflow/compiler/mlir/tensorflow/utils/dump_graph.h"
 #include "xla/parse_flags_from_env.h"
 #include "tensorflow/core/platform/macros.h"
+#include "tensorflow/core/tpu/kernels/sparse_core_xla_flags_defaults.h"
 #include "tensorflow/core/util/command_line_flags.h"
 
 namespace tensorflow {
@@ -35,6 +36,7 @@ namespace {
 BuildXlaOpsPassFlags* build_ops_flags;
 MarkForCompilationPassFlags* mark_for_compilation_flags;
 XlaDeviceFlags* device_flags;
+XlaSparseCoreFlags* sparse_core_flags;
 XlaOpsCommonFlags* ops_flags;
 XlaCallModuleFlags* call_module_flags;
 MlirCommonFlags* mlir_flags;
@@ -172,7 +174,20 @@ void AppendMarkForCompilationPassFlagsInternal(std::vector<Flag>* flag_list) {
       Flag("tf_xla_persistent_cache_prefix",
            &mark_for_compilation_flags->tf_xla_persistent_cache_prefix,
            "Specifies the persistance cache prefix. Default is "
-           "\"xla_compile_cache\"")};
+           "\"xla_compile_cache\""),
+      Flag("tf_xla_sparse_core_disable_table_stacking",
+           &sparse_core_flags->tf_xla_sparse_core_disable_table_stacking,
+           "Disable table stacking for all the tables passed to the SparseCore"
+           "mid level API."),
+      Flag("tf_xla_sparse_core_stacking_mem_limit_bytes",
+           &sparse_core_flags->tf_xla_sparse_core_stacking_mem_limit_bytes,
+           "If non-zero, limits the size of the activations for a given table"
+           "to be below these many bytes."),
+      Flag("tf_xla_sparse_core_stacking_table_shard_limit_bytes",
+           &sparse_core_flags
+                ->tf_xla_sparse_core_stacking_table_shard_limit_bytes,
+           "If non-zero, limits the size of any table shard to be below these"
+           "many bytes.")};
   flag_list->insert(flag_list->end(), new_flags.begin(), new_flags.end());
 }
 
@@ -232,6 +247,14 @@ void AllocateAndParseFlags() {
   device_flags->tf_xla_compile_on_demand = false;
   device_flags->tf_xla_enable_xla_devices = false;
 
+  sparse_core_flags = new XlaSparseCoreFlags;
+  sparse_core_flags->tf_xla_sparse_core_disable_table_stacking =
+      kDefaultDisableTableStacking;
+  sparse_core_flags->tf_xla_sparse_core_stacking_mem_limit_bytes =
+      kDefaultXlaSparseCoreStackingMemLimit;
+  sparse_core_flags->tf_xla_sparse_core_stacking_table_shard_limit_bytes =
+      kDefaultXlaSparseCoreStackingTableShardLimit;
+
   ops_flags = new XlaOpsCommonFlags;
   ops_flags->tf_xla_always_defer_compilation = false;
   ops_flags->tf_xla_async_compilation = false;
@@ -423,6 +446,11 @@ MarkForCompilationPassFlags* GetMarkForCompilationPassFlags() {
   return mark_for_compilation_flags;
 }
 
+XlaSparseCoreFlags* GetXlaSparseCoreFlags() {
+  absl::call_once(flags_init, &AllocateAndParseFlags);  // One-time flag setup.
+  return sparse_core_flags;  // Allocated by AllocateAndParseFlags.
+}
+
 XlaDeviceFlags* GetXlaDeviceFlags() {
   absl::call_once(flags_init, &AllocateAndParseFlags);
   return device_flags;
diff --git a/tensorflow/compiler/jit/flags.h b/tensorflow/compiler/jit/flags.h
index e178ab16f35b85..ccda739133aaa7 100644
--- a/tensorflow/compiler/jit/flags.h
+++ b/tensorflow/compiler/jit/flags.h
@@ -16,6 +16,7 @@ limitations under the License.
 #ifndef TENSORFLOW_COMPILER_JIT_FLAGS_H_
 #define TENSORFLOW_COMPILER_JIT_FLAGS_H_
 
+#include <cstdint>
 #include <optional>
 #include <string>
 #include <vector>
@@ -112,6 +113,21 @@ struct MarkForCompilationPassFlags {
   string tf_xla_persistent_cache_prefix;
 };
 
+// Flags associated with XLA Sparse Core.
+struct XlaSparseCoreFlags {
+  // Disable table stacking for all the tables passed to the SparseCore
+  // mid level API.
+  bool tf_xla_sparse_core_disable_table_stacking;
+
+  // If non-zero, limits the size of the activations for a given table to
+  // be below this many bytes. Zero means no limit.
+  int64_t tf_xla_sparse_core_stacking_mem_limit_bytes;
+
+  // If non-zero, limits the size of any table shard to be below this
+  // many bytes. Zero means no limit.
+  int64_t tf_xla_sparse_core_stacking_table_shard_limit_bytes;
+};
+
 // Flags associated with the XLA bridge's xla_device module.
 struct XlaDeviceFlags {
   // Switch the CPU device into "on-demand" mode, where instead of
@@ -299,6 +315,7 @@ struct JitRtFlags {
 // always return the same pointer.
 MarkForCompilationPassFlags* GetMarkForCompilationPassFlags();
 BuildXlaOpsPassFlags* GetBuildXlaOpsPassFlags();
+XlaSparseCoreFlags* GetXlaSparseCoreFlags();
 XlaDeviceFlags* GetXlaDeviceFlags();
 XlaOpsCommonFlags* GetXlaOpsCommonFlags();
 XlaCallModuleFlags* GetXlaCallModuleFlags();
diff --git a/tensorflow/core/tpu/kernels/BUILD b/tensorflow/core/tpu/kernels/BUILD
index 9a41f6b39827a0..1fd1b1e7f7f709 100644
--- a/tensorflow/core/tpu/kernels/BUILD
+++ b/tensorflow/core/tpu/kernels/BUILD
@@ -1381,18 +1381,28 @@ cc_library(
     name = "sparse_core_ops_utils",
     srcs = ["sparse_core_ops_utils.cc"],
     hdrs = ["sparse_core_ops_utils.h"],
+    visibility = ["//visibility:public"],
     deps = [
+        ":sparse_core_xla_flags_defaults",
+        "//tensorflow/compiler/jit:flags_headers",
+        "//tensorflow/compiler/jit/kernels:xla_ops",
         "//tensorflow/core:lib",
-        "//tensorflow/core/platform:errors",
         "//tensorflow/core/platform:status",
         "@com_google_absl//absl/algorithm:container",
         "@com_google_absl//absl/base:core_headers",
+        "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/numeric:bits",
         "@com_google_absl//absl/status",
         "@com_google_absl//absl/strings",
     ],
 )
 
+cc_library(
+    name = "sparse_core_xla_flags_defaults",
+    hdrs = ["sparse_core_xla_flags_defaults.h"],
+    visibility = ["//visibility:public"],
+)
+
 tf_cc_test(
     name = "sparse_core_ops_utils_test",
     srcs = ["sparse_core_ops_utils_test.cc"],
diff --git a/tensorflow/core/tpu/kernels/sparse_core_ops_utils.cc b/tensorflow/core/tpu/kernels/sparse_core_ops_utils.cc
index a9d01fbc1602ad..2e3daafa93b80c 100644
--- a/tensorflow/core/tpu/kernels/sparse_core_ops_utils.cc
+++ b/tensorflow/core/tpu/kernels/sparse_core_ops_utils.cc
@@ -14,16 +14,21 @@ limitations under the License.
 ==============================================================================*/
 #include "tensorflow/core/tpu/kernels/sparse_core_ops_utils.h"
 
+#include <algorithm>
 #include <cmath>
+#include <cstdint>
 #include <functional>
 #include <string>
+#include <tuple>
 #include <vector>
 
 #include "absl/algorithm/container.h"
 #include "absl/base/attributes.h"
+#include "absl/container/flat_hash_map.h"
 #include "absl/numeric/bits.h"
 #include "absl/status/status.h"
 #include "absl/strings/string_view.h"
+#include "tensorflow/compiler/jit/flags.h"
 #include "tensorflow/core/platform/status.h"
 #include "tensorflow/core/platform/types.h"
 
@@ -93,8 +98,104 @@ std::function<float(float)> GetCombinerScaleTransformFunction(
   }
 }
 
+std::vector<std::vector<std::string>> GetTableStacks(
+    const std::vector<int64_t>& table_height,
+    const std::vector<int64_t>& table_width,
+    const std::vector<int64_t>& table_num_samples,
+    const std::vector<int64_t>& table_group,
+    const std::vector<std::string>& table_names, int64_t num_tpu_chips) {
+  if (GetDisableTableStacking()) {
+    // Stacking disabled: every table becomes its own single-element stack.
+    std::vector<std::vector<std::string>> stacks(table_names.size());
+    for (size_t i = 0; i < stacks.size(); ++i) stacks[i] = {table_names[i]};
+    return stacks;
+  }
+
+  // The five input vectors are indexed in parallel (assumed equal length).
+  std::vector<std::tuple<int64_t, int64_t, int64_t, int64_t, std::string>>
+      table_data(table_height.size());
+  for (size_t i = 0; i < table_height.size(); ++i)
+    table_data[i] =
+        std::make_tuple(table_height[i], table_width[i], table_num_samples[i],
+                        table_group[i], table_names[i]);
+
+  // Sort tables by name so that we have a deterministic stacking.
+  absl::c_sort(table_data, [](const auto& lh, const auto& rh) {
+    return std::get<4>(lh) < std::get<4>(rh);
+  });
+
+  absl::flat_hash_map<int64_t, std::vector<std::vector<std::string>>>
+      stacks_by_group;
+  absl::flat_hash_map<int64_t, std::vector<int64_t>> stacks_height_by_group;
+  absl::flat_hash_map<int64_t, std::vector<int64_t>> stacks_samples_by_group;
+
+  const int64_t mem_limit = GetXlaSparseCoreStackingMemLimit();
+  const int64_t table_shard_limit = GetXlaSparseCoreStackingTableShardLimit();
+
+  for (const auto& [height, width, total_samples, group, name] : table_data) {
+    // Want per SparseCore samples.
+    const int64_t num_samples = total_samples / 4;
+
+    // Safe: each map is indexed once per iteration, so no rehash can intervene.
+    auto& stacks = stacks_by_group[group];
+    auto& stack_heights = stacks_height_by_group[group];
+    auto& stack_samples = stacks_samples_by_group[group];
+
+    // Find a stack to fit in. We need to stay under the limit on activation
+    // sizes (if set) and the limit on table shard sizes (if set).
+    size_t stack_id = 0;
+    for (; stack_id < stacks.size(); ++stack_id)
+      if (((mem_limit == 0) ||
+           (sizeof(float) * width * (num_samples + stack_samples[stack_id]) <
+            mem_limit)) &&
+          ((table_shard_limit == 0) ||
+           (sizeof(float) * (height + stack_heights[stack_id]) * width /
+                num_tpu_chips <
+            table_shard_limit)))
+        break;
+
+    // Create a new stack if we didn't find a stack to join.
+    if (stack_id == stacks.size()) {
+      stacks.emplace_back();
+      stack_heights.push_back(0);
+      stack_samples.push_back(0);
+    }
+
+    // Add the table to the stack and track its height and sample count.
+    stacks[stack_id].push_back(name);
+    stack_heights[stack_id] += height;
+    stack_samples[stack_id] += num_samples;
+  }
+
+  // Merge stacks in ascending group order (hash-map order is unspecified).
+  std::vector<int64_t> groups;
+  for (const auto& [group, stacks] : stacks_by_group) groups.push_back(group);
+  absl::c_sort(groups);
+  std::vector<std::vector<std::string>> table_stacks;
+  for (const int64_t group : groups) {
+    const auto& stacks = stacks_by_group[group];
+    table_stacks.insert(table_stacks.end(), stacks.begin(), stacks.end());
+  }
+  return table_stacks;
+}
+
 ABSL_ATTRIBUTE_WEAK int GetMinibatchMaxDivisionLevel() {
   return kMinibatchMaxDivisionLevel;
 }
 
+ABSL_ATTRIBUTE_WEAK bool GetDisableTableStacking() {
+  // Current value of the tf_xla_sparse_core_disable_table_stacking flag.
+  return GetXlaSparseCoreFlags()->tf_xla_sparse_core_disable_table_stacking;
+}
+
+ABSL_ATTRIBUTE_WEAK int64_t GetXlaSparseCoreStackingMemLimit() {
+  // Current value of the tf_xla_sparse_core_stacking_mem_limit_bytes flag.
+  return GetXlaSparseCoreFlags()->tf_xla_sparse_core_stacking_mem_limit_bytes;
+}
+
+ABSL_ATTRIBUTE_WEAK int64_t GetXlaSparseCoreStackingTableShardLimit() {
+  return GetXlaSparseCoreFlags()
+      ->tf_xla_sparse_core_stacking_table_shard_limit_bytes;
+}
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/tpu/kernels/sparse_core_ops_utils.h b/tensorflow/core/tpu/kernels/sparse_core_ops_utils.h
index 0e09cc8306862b..718676e4d81700 100644
--- a/tensorflow/core/tpu/kernels/sparse_core_ops_utils.h
+++ b/tensorflow/core/tpu/kernels/sparse_core_ops_utils.h
@@ -20,6 +20,7 @@ limitations under the License.
 #include <vector>
 
 #include "absl/status/status.h"
+#include "tensorflow/compiler/jit/flags.h"
 #include "tensorflow/core/platform/status.h"
 #include "tensorflow/core/platform/types.h"
 
@@ -44,8 +45,24 @@ std::function<float(float)> GetCombinerScaleContributionFunction(
 std::function<float(float)> GetCombinerScaleTransformFunction(
     absl::string_view combiner);
 
+// Stacks tables that share the same 'group' index. We assume that all tables
+// with a given group index have the same width. Returns a list of stacks; each
+// stack is a list of table names in alphabetical order.
+std::vector<std::vector<std::string>> GetTableStacks(
+    const std::vector<int64_t>& table_height,
+    const std::vector<int64_t>& table_width,
+    const std::vector<int64_t>& table_num_samples,
+    const std::vector<int64_t>& table_group,
+    const std::vector<std::string>& table_names, int64_t num_tpu_chips);
+
 int GetMinibatchMaxDivisionLevel();
 
+bool GetDisableTableStacking();
+
+int64_t GetXlaSparseCoreStackingMemLimit();
+
+int64_t GetXlaSparseCoreStackingTableShardLimit();
+
 }  // namespace tensorflow
 
 #endif  // TENSORFLOW_CORE_TPU_KERNELS_SPARSE_CORE_OPS_UTILS_H_
diff --git a/tensorflow/core/tpu/kernels/sparse_core_xla_flags_defaults.h b/tensorflow/core/tpu/kernels/sparse_core_xla_flags_defaults.h
new file mode 100644
index 00000000000000..5b212fa729fe5b
--- /dev/null
+++ b/tensorflow/core/tpu/kernels/sparse_core_xla_flags_defaults.h
@@ -0,0 +1,29 @@
+/* Copyright 2023 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_TPU_KERNELS_SPARSE_CORE_XLA_FLAGS_DEFAULTS_H_
+#define TENSORFLOW_CORE_TPU_KERNELS_SPARSE_CORE_XLA_FLAGS_DEFAULTS_H_
+
+#include <stdint.h>
+
+namespace tensorflow {
+
+constexpr bool kDefaultDisableTableStacking = false;
+constexpr int64_t kDefaultXlaSparseCoreStackingMemLimit = 2097152;  // 2 MiB
+constexpr int64_t kDefaultXlaSparseCoreStackingTableShardLimit = 2147483648;  // 2 GiB
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_TPU_KERNELS_SPARSE_CORE_XLA_FLAGS_DEFAULTS_H_
diff --git a/tensorflow/python/tpu/BUILD b/tensorflow/python/tpu/BUILD
index f8aac8e7cd4467..74fac03c20325e 100644
--- a/tensorflow/python/tpu/BUILD
+++ b/tensorflow/python/tpu/BUILD
@@ -3,7 +3,7 @@ load("//tensorflow:pytype.default.bzl", "pytype_strict_library")
 load("//tensorflow:strict.default.bzl", "py_strict_library", "py_strict_test")
 
 # Placeholder: load py_proto_library
-load("//tensorflow:tensorflow.default.bzl", "tf_py_strict_test")
+load("//tensorflow:tensorflow.default.bzl", "tf_py_strict_test", "tf_python_pybind_extension")
 load("//tensorflow/core/platform:build_config.bzl", "tf_proto_library")
 load("//tensorflow/python/tpu:tpu.bzl", "internal_create_sanitizer_settings", "tpu_py_strict_test")
 load("@bazel_skylib//:bzl_library.bzl", "bzl_library")
@@ -976,4 +976,24 @@ tf_proto_library(
 # )
 # copybara:uncomment_end
 
+tf_python_pybind_extension(
+    name = "_pywrap_tpu_embedding",
+    srcs = ["pywrap_tpu_embedding.cc"],
+    enable_stub_generation = True,
+    features = ["-layering_check"],
+    pytype_srcs = [
+        "_pywrap_tpu_embedding.pyi",
+    ],
+    deps = [
+        "//tensorflow/core/tpu/kernels:sparse_core_ops_utils",
+        "//tensorflow/python/lib/core:pybind11_lib",
+        "//tensorflow/python/lib/core:pybind11_status",
+        "//tensorflow/python/lib/core:pybind11_status_headers",
+        "//third_party/python_runtime:headers",  # buildcleaner: keep
+        "@pybind11",
+        "@pybind11_abseil//pybind11_abseil:absl_casters",
+        "@pybind11_protobuf//pybind11_protobuf:native_proto_caster",
+    ],
+)
+
 internal_create_sanitizer_settings()
diff --git a/tensorflow/python/tpu/_pywrap_tpu_embedding.pyi b/tensorflow/python/tpu/_pywrap_tpu_embedding.pyi
new file mode 100644
index 00000000000000..d016aaf3c09370
--- /dev/null
+++ b/tensorflow/python/tpu/_pywrap_tpu_embedding.pyi
@@ -0,0 +1,18 @@
+# Copyright 2023 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+from typing import List
+
+def stack_tables(arg0: List[int], arg1: List[int], arg2: List[int], arg3: List[int], arg4: List[str], arg5: int) -> List[List[str]]: ...
diff --git a/tensorflow/python/tpu/pywrap_tpu_embedding.cc b/tensorflow/python/tpu/pywrap_tpu_embedding.cc
new file mode 100644
index 00000000000000..8c76b1af9981a0
--- /dev/null
+++ b/tensorflow/python/tpu/pywrap_tpu_embedding.cc
@@ -0,0 +1,35 @@
+/* Copyright 2023 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <string>
+#include <vector>
+
+#include "pybind11/pybind11.h"  // from @pybind11
+#include "pybind11/stl.h"  // from @pybind11
+#include "tensorflow/core/tpu/kernels/sparse_core_ops_utils.h"
+#include "tensorflow/python/lib/core/pybind11_lib.h"
+#include "tensorflow/python/lib/core/pybind11_status.h"
+
+// Python extension module exposing the SparseCore table stacking helper.
+PYBIND11_MODULE(_pywrap_tpu_embedding, m) {
+  // stack_tables(table_heights, table_widths, table_num_samples,
+  //              table_groups, table_names, num_tpu_chips)
+  //
+  // Bound straight to tensorflow::GetTableStacks, whose parameter list is
+  // exactly the one above: all six arguments are positional and are passed
+  // through unchanged, and the result is returned as-is. pybind11/stl.h
+  // supplies the conversions between Python lists and the std::vector
+  // parameters and return value.
+  m.def("stack_tables", &tensorflow::GetTableStacks);
+}

From 522caf71145fed24fb2a3cfa75c263ff4c6e64e5 Mon Sep 17 00:00:00 2001
From: Yishuang Pang <ypang@google.com>
Date: Thu, 28 Sep 2023 10:34:49 -0700
Subject: [PATCH 384/567] Add deserialization for `stablehlo.rng_bit_generator`
 op and an e2e gumbel test which emits the op.

PiperOrigin-RevId: 569224472
---
 .../compiler/mlir/lite/flatbuffer_export.cc   | 37 +++++++++++++++++++
 .../compiler/mlir/lite/flatbuffer_operator.cc | 17 +++++++++
 .../lite/tests/flatbuffer2mlir/stablehlo.mlir | 33 ++++++++++++++++-
 3 files changed, 86 insertions(+), 1 deletion(-)

diff --git a/tensorflow/compiler/mlir/lite/flatbuffer_export.cc b/tensorflow/compiler/mlir/lite/flatbuffer_export.cc
index 113877ded63427..1a55e517791f84 100644
--- a/tensorflow/compiler/mlir/lite/flatbuffer_export.cc
+++ b/tensorflow/compiler/mlir/lite/flatbuffer_export.cc
@@ -731,6 +731,11 @@ class Translator {
       mlir::stablehlo::ScatterOp shlo_op, const std::vector<int32_t>& operands,
       const std::vector<int32_t>& results);
 
+  std::optional<BufferOffset<tflite::Operator>> BuildStablehloRngBitGeneratorOp(
+      mlir::stablehlo::RngBitGeneratorOp rng_op,
+      const std::vector<int32_t>& operands,
+      const std::vector<int32_t>& results);
+
   // create a subgraph given a unnamed mlir region, return the corresponding
   // subgraph index
   int32_t UnnamedRegionToSubgraph(mlir::Region* region,
@@ -1478,6 +1483,34 @@ Translator::BuildStablehloScatterOp(mlir::stablehlo::ScatterOp scatter_op,
       tflite::BuiltinOptions2_StablehloScatterOptions, options.Union());
 }
 
+std::optional<BufferOffset<tflite::Operator>>
+Translator::BuildStablehloRngBitGeneratorOp(
+    mlir::stablehlo::RngBitGeneratorOp rng_op,
+    const std::vector<int32_t>& operands, const std::vector<int32_t>& results) {
+  const std::string mlir_name =
+      rng_op.getOperation()->getName().getStringRef().str();
+  const uint32_t opcode_index = GetOpcodeIndex(
+      mlir_name, tflite::BuiltinOperator_STABLEHLO_RNG_BIT_GENERATOR);
+
+  // Map the StableHLO algorithm to the TFLite enum; anything else is DEFAULT.
+  const auto shlo_algorithm = rng_op.getRngAlgorithm();
+  tflite::RngAlgorithm algorithm = tflite::RngAlgorithm_DEFAULT;
+  if (shlo_algorithm == mlir::stablehlo::RngAlgorithm::THREE_FRY) {
+    algorithm = tflite::RngAlgorithm_THREEFRY;
+  } else if (shlo_algorithm == mlir::stablehlo::RngAlgorithm::PHILOX) {
+    algorithm = tflite::RngAlgorithm_PHILOX;
+  }
+
+  auto serialized_options =
+      tflite::CreateStablehloRngBitGeneratorOptions(builder_, algorithm);
+  return tflite::CreateOperator(
+      builder_, opcode_index, builder_.CreateVector(operands),
+      builder_.CreateVector(results), tflite::BuiltinOptions_NONE, 0, 0,
+      tflite::CustomOptionsFormat_FLEXBUFFERS, 0, 0, 0, 0,
+      tflite::BuiltinOptions2_StablehloRngBitGeneratorOptions,
+      serialized_options.Union());
+}
+
 std::optional<BufferOffset<tflite::Operator>> Translator::BuildOperator(
     Operation* inst, std::vector<int32_t> operands,
     const std::vector<int32_t>& results,
@@ -1553,6 +1586,10 @@ std::optional<BufferOffset<tflite::Operator>> Translator::BuildOperator(
     if (auto shlo_op = llvm::dyn_cast<mlir::stablehlo::ScatterOp>(inst)) {
       return BuildStablehloScatterOp(shlo_op, operands, results);
     }
+    if (auto shlo_op =
+            llvm::dyn_cast<mlir::stablehlo::RngBitGeneratorOp>(inst)) {
+      return BuildStablehloRngBitGeneratorOp(shlo_op, operands, results);
+    }
     // for ops don't have kernels, only serialize when conversion is set to true
     if (convert_stablehlo_) {
       if (auto shlo_op = llvm::dyn_cast<mlir::stablehlo::LogisticOp>(inst)) {
diff --git a/tensorflow/compiler/mlir/lite/flatbuffer_operator.cc b/tensorflow/compiler/mlir/lite/flatbuffer_operator.cc
index f2b194c3331ab8..290b1075f28dba 100644
--- a/tensorflow/compiler/mlir/lite/flatbuffer_operator.cc
+++ b/tensorflow/compiler/mlir/lite/flatbuffer_operator.cc
@@ -635,6 +635,23 @@ void mlir::BuiltinOptions2ToAttributes(
 
     return;
   }
+  if (const auto* op = op_union.AsStablehloRngBitGeneratorOptions()) {
+    auto algorithm = mlir::stablehlo::RngAlgorithm::DEFAULT;  // Fallback.
+    switch (op->algorithm) {
+      case tflite::RngAlgorithm_THREEFRY:
+        algorithm = mlir::stablehlo::RngAlgorithm::THREE_FRY;
+        break;
+      case tflite::RngAlgorithm_PHILOX:
+        algorithm = mlir::stablehlo::RngAlgorithm::PHILOX;
+        break;
+      case tflite::RngAlgorithm_DEFAULT:
+        break;  // Already DEFAULT; so is any value outside the known set.
+    }
+    auto attr =
+        mlir::stablehlo::RngAlgorithmAttr::get(builder.getContext(), algorithm);
+    attributes.emplace_back(builder.getNamedAttr("rng_algorithm", attr));
+    return;
+  }
 }
 
 // Pull in FlatBuffer writers for TFLite generated using TableGen
diff --git a/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/stablehlo.mlir b/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/stablehlo.mlir
index 579e5be35e386d..b1b08d5fac291d 100644
--- a/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/stablehlo.mlir
+++ b/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/stablehlo.mlir
@@ -529,6 +529,37 @@ func.func @transpose(%arg0: tensor<2x3x2xi32>) -> tensor<2x3x2xi32> {
 }
 
 // CHECK:func.func private @transpose(%arg0: tensor<2x3x2xi32>) -> tensor<2x3x2xi32> {
-// CHECK-NEXT:  %0 = stablehlo.transpose %arg0, dims = [2, 1, 0] : (tensor<2x3x2xi32>) -> tensor<2x3x2xi32> 
+// CHECK-NEXT:  %0 = stablehlo.transpose %arg0, dims = [2, 1, 0] : (tensor<2x3x2xi32>) -> tensor<2x3x2xi32>
 // CHECK-NEXT:  return %0 : tensor<2x3x2xi32>
+// CHECK-NEXT:}
+
+func.func @rng_bit_generator(%arg0: tensor<2xui64>) -> (tensor<2xui64>, tensor<10x12xui32>) {
+  %output_state, %output = "stablehlo.rng_bit_generator"(%arg0) {rng_algorithm = #stablehlo<rng_algorithm DEFAULT>} : (tensor<2xui64>) -> (tensor<2xui64>, tensor<10x12xui32>)
+  func.return %output_state, %output : tensor<2xui64>, tensor<10x12xui32>
+}
+
+// CHECK:func.func private @rng_bit_generator(%arg0: tensor<2xui64>) -> (tensor<2xui64>, tensor<10x12xui32>) {
+// CHECK-NEXT:  %output_state, %output = stablehlo.rng_bit_generator %arg0, algorithm = DEFAULT : (tensor<2xui64>) -> (tensor<2xui64>, tensor<10x12xui32>)
+// CHECK-NEXT:  return %output_state, %output : tensor<2xui64>, tensor<10x12xui32>
+// CHECK-NEXT:}
+
+func.func @rng_bit_generator_threefry(%arg0: tensor<2xui64>) -> (tensor<2xui64>, tensor<10x12xui64>) {
+  %output_state, %output = "stablehlo.rng_bit_generator"(%arg0) {rng_algorithm = #stablehlo<rng_algorithm THREE_FRY>} : (tensor<2xui64>) -> (tensor<2xui64>, tensor<10x12xui64>)
+  func.return %output_state, %output : tensor<2xui64>, tensor<10x12xui64>
+}
+
+// CHECK:func.func private @rng_bit_generator_threefry(%arg0: tensor<2xui64>) -> (tensor<2xui64>, tensor<10x12xui64>) {
+// CHECK-NEXT:  %output_state, %output = stablehlo.rng_bit_generator %arg0, algorithm = THREE_FRY : (tensor<2xui64>) -> (tensor<2xui64>, tensor<10x12xui64>)
+// CHECK-NEXT:  return %output_state, %output : tensor<2xui64>, tensor<10x12xui64>
+// CHECK-NEXT:}
+
+
+func.func @rng_bit_generator_philox(%arg0: tensor<2xui64>) -> (tensor<2xui64>, tensor<10x12xi32>) {
+  %output_state, %output = "stablehlo.rng_bit_generator"(%arg0) {rng_algorithm = #stablehlo<rng_algorithm PHILOX>} : (tensor<2xui64>) -> (tensor<2xui64>, tensor<10x12xi32>)
+  func.return %output_state, %output : tensor<2xui64>, tensor<10x12xi32>
+}
+
+// CHECK:func.func private @rng_bit_generator_philox(%arg0: tensor<2xui64>) -> (tensor<2xui64>, tensor<10x12xi32>) {
+// CHECK-NEXT:  %output_state, %output = stablehlo.rng_bit_generator %arg0, algorithm = PHILOX : (tensor<2xui64>) -> (tensor<2xui64>, tensor<10x12xi32>)
+// CHECK-NEXT:  return %output_state, %output : tensor<2xui64>, tensor<10x12xi32>
 // CHECK-NEXT:}
\ No newline at end of file

From 8c6a0b5c1d201c17f5cc26125d058ef6a194d92a Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 28 Sep 2023 10:48:50 -0700
Subject: [PATCH 385/567] [IFRT] Add on-demand canonicalization for new
 sharding's memory kind in `PjRtArray::Reshard` in case it has not been done
 before.

PiperOrigin-RevId: 569228579
---
 third_party/xla/xla/python/pjrt_ifrt/pjrt_array.cc | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/third_party/xla/xla/python/pjrt_ifrt/pjrt_array.cc b/third_party/xla/xla/python/pjrt_ifrt/pjrt_array.cc
index 4bea6712820b7e..452262b22c7105 100644
--- a/third_party/xla/xla/python/pjrt_ifrt/pjrt_array.cc
+++ b/third_party/xla/xla/python/pjrt_ifrt/pjrt_array.cc
@@ -340,10 +340,12 @@ StatusOr<tsl::RCReference<Array>> PjRtArray::Reshard(
   // permits device changes and nothing else.
   PjRtBuffers buffers;
   buffers.reserve(pjrt_buffers_.size());
-  // TODO(yueshengys): Add a on-demand canonicalization when all users
-  // (e.g., ifrt proxy) support memories.
+  CHECK_GT(new_sharding->devices().size(), 0);
+  // Canonicalize memory kind in case it hasn't been done before.
+  MemoryKind canonicalized_sharding_memory_kind = CanonicalizeMemoryKind(
+      new_sharding->memory_kind(), new_sharding->devices().front());
   bool new_sharding_has_memory_kind =
-      new_sharding->memory_kind().memory_kind().has_value();
+      canonicalized_sharding_memory_kind.memory_kind().has_value();
   for (int i = 0; i < pjrt_buffers_.size(); ++i) {
     bool devices_equal =
         pjrt_buffers_[i]->device() == new_sharding->devices()[i];
@@ -351,7 +353,7 @@ StatusOr<tsl::RCReference<Array>> PjRtArray::Reshard(
     bool memory_kind_equal =
         new_sharding_has_memory_kind && memories_supported &&
         pjrt_buffers_[i]->memory_space()->memory_space_kind() ==
-            new_sharding->memory_kind().memory_kind();
+            canonicalized_sharding_memory_kind.memory_kind();
 
     // No need for data transfer.
     if (devices_equal && (!new_sharding_has_memory_kind ||
@@ -387,7 +389,7 @@ StatusOr<tsl::RCReference<Array>> PjRtArray::Reshard(
         TF_ASSIGN_OR_RETURN(
             auto memory_space,
             GetMemorySpaceFromMemoryKind(new_sharding->devices()[i],
-                                         new_sharding->memory_kind()));
+                                         canonicalized_sharding_memory_kind));
         TF_ASSIGN_OR_RETURN(std::unique_ptr<PjRtBuffer> copied_buffer,
                             pjrt_buffers_[i]->CopyToMemorySpace(memory_space));
         if (semantics == ArrayCopySemantics::kDonateInput) {

From fe0941c441095625558e99e53a72676081b8d11f Mon Sep 17 00:00:00 2001
From: Shixin Li <shixinli@google.com>
Date: Thu, 28 Sep 2023 11:04:42 -0700
Subject: [PATCH 386/567] Enable cross compilation for GPU PJRT
 compiler/client.

PiperOrigin-RevId: 569233566
---
 third_party/xla/xla/pjrt/gpu/BUILD            |  23 ++--
 .../xla/xla/pjrt/gpu/se_gpu_pjrt_client.cc    |  33 ++---
 .../xla/xla/pjrt/gpu/se_gpu_pjrt_client.h     |  17 +--
 .../xla/xla/pjrt/gpu/se_gpu_pjrt_compiler.cc  |  48 +++----
 .../xla/xla/pjrt/gpu/se_gpu_pjrt_compiler.h   |   4 +-
 .../pjrt/gpu/se_gpu_pjrt_compiler_aot_test.cc | 117 ++++++++----------
 .../xla/pjrt/gpu/se_gpu_pjrt_compiler_test.cc |  11 --
 7 files changed, 112 insertions(+), 141 deletions(-)

diff --git a/third_party/xla/xla/pjrt/gpu/BUILD b/third_party/xla/xla/pjrt/gpu/BUILD
index eb1ddc2fad0b38..3df92863bc8416 100644
--- a/third_party/xla/xla/pjrt/gpu/BUILD
+++ b/third_party/xla/xla/pjrt/gpu/BUILD
@@ -4,7 +4,6 @@ load("//xla/stream_executor:build_defs.bzl", "if_gpu_is_configured")
 load("@local_config_rocm//rocm:build_defs.bzl", "if_rocm")
 load("@local_tsl//tsl:tsl.bzl", "if_nccl")
 load("@local_tsl//tsl/platform:build_config.bzl", "tf_proto_library")
-load("@local_tsl//tsl/platform:build_config_root.bzl", "tf_cuda_tests_tags")
 load("@local_tsl//tsl/platform:rules_cc.bzl", "cc_library")
 load("@local_tsl//tsl/platform/default:cuda_build_defs.bzl", "if_cuda_is_configured")
 
@@ -237,6 +236,7 @@ cc_library(
         "//xla/service/gpu:gpu_target_config",
         "//xla/stream_executor/cuda:cuda_platform_id",
         "@com_google_absl//absl/status",
+        "@local_tsl//tsl/platform:casts",
         "@local_tsl//tsl/platform:errors",
     ] + if_cuda([
         ":nccl_id_store_cuda",
@@ -257,7 +257,12 @@ cc_library(
 xla_cc_test(
     name = "se_gpu_pjrt_compiler_test",
     srcs = if_gpu_is_configured(["se_gpu_pjrt_compiler_test.cc"]),
-    tags = tf_cuda_tests_tags(),
+    tags = [
+        "config-cuda-only",
+        "gpu",
+        "no_oss",
+        "requires-gpu-nvidia",
+    ],
     deps = [
         ":se_gpu_pjrt_client",
         ":se_gpu_pjrt_compiler",
@@ -280,7 +285,6 @@ xla_cc_test(
 xla_cc_test(
     name = "se_gpu_pjrt_compiler_aot_test",
     srcs = if_gpu_is_configured(["se_gpu_pjrt_compiler_aot_test.cc"]),
-    local_defines = if_cuda_is_configured(["GOOGLE_CUDA=1"]),
     tags = [
         "config-cuda-only",
         "gpu",
@@ -302,6 +306,7 @@ xla_cc_test(
         "//xla/service:gpu_plugin",
         "//xla/service:hlo_parser",
         "//xla/service/gpu:gpu_target_config",
+        "//xla/service/gpu:nvptx_compiler_impl",
         "//xla/stream_executor/cuda:cublas_plugin",
         "//xla/tests:literal_test_util",
         "@com_google_absl//absl/memory",
@@ -312,18 +317,8 @@ xla_cc_test(
         "@llvm-project//mlir:FuncDialect",
         "@llvm-project//mlir:IR",
         "@llvm-project//mlir:Parser",
-        "@local_tsl//tsl/lib/core:status_test_util",
         "@local_tsl//tsl/platform:casts",
-        "@local_tsl//tsl/platform:status_matchers",
         "@local_tsl//tsl/platform:statusor",
         "@local_tsl//tsl/platform:test_main",
-    ] + if_cuda([
-        ":nccl_id_store_cuda",
-        "@local_config_cuda//cuda:cuda_headers",
-        "//xla/stream_executor/cuda:cuda_activation_header",
-        "//xla/stream_executor/gpu:gpu_cudamallocasync_allocator",
-    ]) + if_rocm([
-        ":nccl_id_store_rocm",
-        "@local_config_rocm//rocm:rocm_headers",
-    ]),
+    ],
 )
diff --git a/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_client.cc b/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_client.cc
index b1916e471c8b6a..7799f0eb379514 100644
--- a/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_client.cc
+++ b/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_client.cc
@@ -556,9 +556,9 @@ StatusOr<std::unique_ptr<StreamExecutorUnloadedExecutable>> FromProto(
 }  // namespace
 
 StatusOr<std::unique_ptr<PjRtLoadedExecutable>>
-StreamExecutorGpuClient::LoadSerialized(absl::string_view serialized,
-                                        std::optional<CompileOptions> options,
-                                        const LoadOptions& load_options) {
+StreamExecutorGpuClient::LoadSerializedExecutable(
+    absl::string_view serialized, std::optional<CompileOptions> options,
+    const LoadOptions& load_options) {
 #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
   StreamExecutorUnloadedExecutableProto proto;
   if (serialized.size() > std::numeric_limits<int>::max()) {
@@ -572,8 +572,7 @@ StreamExecutorGpuClient::LoadSerialized(absl::string_view serialized,
         "failed");
   }
   TF_ASSIGN_OR_RETURN(auto se_executable, FromProto(proto));
-  // TODO(b/296466237): Unify the `Load` method.
-  return Load(std::move(se_executable));
+  return Load(std::move(se_executable), LoadOptions());
 #endif
   return absl::InternalError("LoadSerialized only works with cuda or rocm.");
 }
@@ -594,7 +593,8 @@ std::vector<std::unique_ptr<PjRtStreamExecutorDevice>> BuildLocalDevices(
 }
 
 StatusOr<std::unique_ptr<PjRtLoadedExecutable>> StreamExecutorGpuClient::Load(
-    std::unique_ptr<PjRtExecutable> executable) {
+    std::unique_ptr<PjRtExecutable> executable,
+    const LoadOptions& load_options) {
   auto se_executable =
       absl::WrapUnique(tensorflow::down_cast<StreamExecutorUnloadedExecutable*>(
           executable.release()));
@@ -604,22 +604,23 @@ StatusOr<std::unique_ptr<PjRtLoadedExecutable>> StreamExecutorGpuClient::Load(
   TF_ASSIGN_OR_RETURN(ExecutableExtras extras,
                       GetExecutableExtras(&compile_options));
 
-  TF_ASSIGN_OR_RETURN(
-      auto se_executor,
-      client()->backend().stream_executor(
-          compile_options.executable_build_options.device_ordinal()));
+  se::StreamExecutor* se_executor =
+      client()->backend().default_stream_executor();
+  if (se_executor == nullptr) {
+    return absl::FailedPreconditionError("Default stream executor is null.");
+  }
 
   // Load Executable from AOT compilation result.
   std::vector<std::unique_ptr<LocalExecutable>> local_executables;
   local_executables.reserve(se_executable->aot_executables().size());
   for (std::unique_ptr<xla::AotCompilationResult>& aot_executable :
        se_executable->aot_executables()) {
-    TF_ASSIGN_OR_RETURN(std::unique_ptr<Executable> executable,
-                        aot_executable->LoadExecutable(
-                            client()->backend().compiler(), se_executor));
-    local_executables.push_back(std::make_unique<LocalExecutable>(
-        std::move(executable), client()->local_service()->mutable_backend(),
-        compile_options.executable_build_options));
+    TF_ASSIGN_OR_RETURN(std::string serialized,
+                        aot_executable->SerializeAsString());
+    TF_ASSIGN_OR_RETURN(
+        std::unique_ptr<LocalExecutable> local_executable,
+        client()->Load(serialized, compile_options.executable_build_options));
+    local_executables.push_back(std::move(local_executable));
   }
   bool parameter_is_tupled_arguments =
       compile_options.parameter_is_tupled_arguments;
diff --git a/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_client.h b/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_client.h
index be6b4cb75a9777..ab40c64b87b85e 100644
--- a/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_client.h
+++ b/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_client.h
@@ -190,24 +190,13 @@ class StreamExecutorGpuClient : public xla::PjRtStreamExecutorClient {
     return &topology_;
   }
 
-  // TODO(b/285385306): Enable loading a non-loaded PjRtExecutable.
   StatusOr<std::unique_ptr<PjRtLoadedExecutable>> Load(
       std::unique_ptr<PjRtExecutable> executable,
-      const LoadOptions& load_options) override {
-    return absl::WrapUnique<PjRtLoadedExecutable>(
-        tensorflow::down_cast<PjRtLoadedExecutable*>(executable.release()));
-  }
-
-  // TODO(b/296466237): Unify `Load` method after (de)serialization and tests on
-  // existing use cases are done.
-  StatusOr<std::unique_ptr<PjRtLoadedExecutable>> Load(
-      std::unique_ptr<PjRtExecutable> executable);
+      const LoadOptions& load_options) override;
 
-  // TODO(b/296466237): Unify `LoadSerializedExecutable` after fixing existing
-  // tests.
-  StatusOr<std::unique_ptr<PjRtLoadedExecutable>> LoadSerialized(
+  StatusOr<std::unique_ptr<PjRtLoadedExecutable>> LoadSerializedExecutable(
       absl::string_view serialized, std::optional<CompileOptions> options,
-      const LoadOptions& load_options);
+      const LoadOptions& load_options) override;
 
  private:
   xla::StreamExecutorGpuTopologyDescription topology_;
diff --git a/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_compiler.cc b/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_compiler.cc
index 2535b5d5308ada..d707919ab40b79 100644
--- a/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_compiler.cc
+++ b/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_compiler.cc
@@ -25,6 +25,7 @@ limitations under the License.
 #include "xla/pjrt/pjrt_compiler.h"
 #include "xla/pjrt/pjrt_executable.h"
 #include "xla/status_macros.h"
+#include "tsl/platform/casts.h"
 #include "tsl/platform/errors.h"
 
 #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
@@ -146,23 +147,31 @@ absl::StatusOr<std::unique_ptr<PjRtExecutable>> AotCompile(
 #endif
 }  // namespace
 
-// TODO(b/285385306): Enable compilation on provided `topology`.
 absl::StatusOr<std::unique_ptr<PjRtExecutable>>
 StreamExecutorGpuCompiler::Compile(CompileOptions options,
                                    const XlaComputation& computation,
                                    const PjRtTopologyDescription& topology,
                                    PjRtClient* client) {
-  if (client == nullptr && gpu_target_config_ != std::nullopt) {
 #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
+  if (client == nullptr && gpu_target_config_ != std::nullopt) {
     return AotCompile(options, computation, *gpu_target_config_);
-#endif
-    return absl::InternalError(
-        "GPU AOT compilation requires the target to be built with CUDA or "
-        "ROCm.");
   }
-  // TODO(b/296466237): Remove client dependency.
   TF_RETURN_IF_ERROR(IsValidTopologyAndClientForCompile(topology, client));
-  return client->Compile(computation, options);
+
+  PjRtStreamExecutorClient* se_client =
+      tensorflow::down_cast<PjRtStreamExecutorClient*>(client);
+#if GOOGLE_CUDA
+  auto gpu_compiler = gpu::NVPTXCompiler();
+#elif TENSORFLOW_USE_ROCM
+  auto gpu_compiler = gpu::AMDGPUCompiler();
+#endif
+  gpu::GpuTargetConfig gpu_target_config = gpu_compiler.GetGpuTargetConfig(
+      se_client->client()->backend().default_stream_executor());
+  return AotCompile(options, computation, gpu_target_config);
+#endif
+  return absl::InternalError(
+      "GPU AOT compilation requires the target to be built with CUDA or "
+      "ROCm.");
 }
 
 absl::StatusOr<std::unique_ptr<PjRtExecutable>>
@@ -170,22 +179,17 @@ StreamExecutorGpuCompiler::Compile(CompileOptions options,
                                    mlir::ModuleOp module,
                                    const PjRtTopologyDescription& topology,
                                    PjRtClient* client) {
-  if (client == nullptr && gpu_target_config_ != std::nullopt) {
 #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
-    XlaComputation xla_computation;
-    TF_RETURN_IF_ERROR(MlirToXlaComputation(
-        module, xla_computation,
-        /*use_tuple_args=*/options.parameter_is_tupled_arguments,
-        /*return_tuple=*/false));
-    return AotCompile(options, xla_computation, *gpu_target_config_);
+  XlaComputation xla_computation;
+  TF_RETURN_IF_ERROR(MlirToXlaComputation(
+      module, xla_computation,
+      /*use_tuple_args=*/options.parameter_is_tupled_arguments,
+      /*return_tuple=*/false));
+  return Compile(options, xla_computation, topology, client);
 #endif
-    return absl::InternalError(
-        "GPU AOT compilation requires the target to be built with CUDA or "
-        "ROCm.");
-  }
-  // TODO(b/296466237): Remove client dependency.
-  TF_RETURN_IF_ERROR(IsValidTopologyAndClientForCompile(topology, client));
-  return client->Compile(module, options);
+  return absl::InternalError(
+      "GPU AOT compilation requires the target to be built with CUDA or "
+      "ROCm.");
 }
 
 REGISTER_MODULE_INITIALIZER(pjrt_register_se_gpu_compiler, {
diff --git a/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_compiler.h b/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_compiler.h
index fe3a67eb5849ef..981748429fa48e 100644
--- a/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_compiler.h
+++ b/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_compiler.h
@@ -32,8 +32,8 @@ namespace xla {
 class StreamExecutorGpuCompiler : public PjRtCompiler {
  public:
   // If `gpu_target_config` is nullopt, the compiler has to compile with device,
-  // i.e. calling of `Compile` should depend on the passed-in client's
-  // compilation functionality.
+  // i.e. `Compile` must obtain the target configuration from the passed-in
+  // client's runtime device information.
   explicit StreamExecutorGpuCompiler(const std::optional<gpu::GpuTargetConfig>
                                          gpu_target_config = std::nullopt)
       : gpu_target_config_(gpu_target_config) {}
diff --git a/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_compiler_aot_test.cc b/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_compiler_aot_test.cc
index 42f28daf377447..07960a19dc4917 100644
--- a/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_compiler_aot_test.cc
+++ b/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_compiler_aot_test.cc
@@ -37,8 +37,10 @@ limitations under the License.
 #include "xla/pjrt/gpu/se_gpu_pjrt_client.h"
 #include "xla/pjrt/gpu/se_gpu_pjrt_compiler.h"
 #include "xla/pjrt/pjrt_client.h"
+#include "xla/pjrt/pjrt_compiler.h"
 #include "xla/pjrt/pjrt_executable.h"
 #include "xla/service/gpu/gpu_target_config.h"
+#include "xla/service/gpu/nvptx_compiler.h"
 #include "xla/service/hlo_parser.h"
 #include "xla/tests/literal_test_util.h"
 #include "tsl/platform/casts.h"
@@ -61,31 +63,6 @@ constexpr absl::string_view mlir_str = R"mlir(
     }
   })mlir";
 
-constexpr absl::string_view kGpuTargetConfig =
-    R"pb(
-  gpu_device_info {
-    threads_per_block_limit: 1024
-    threads_per_warp: 32
-    shared_memory_per_block: 49152
-    shared_memory_per_core: 65536
-    threads_per_core_limit: 2048
-    core_count: 56
-    fpus_per_core: 64
-    block_dim_limit_x: 2147483647
-    block_dim_limit_y: 65535
-    block_dim_limit_z: 65535
-    memory_bandwidth: 732160000000
-    l2_cache_size: 4194304
-    clock_rate_ghz: 1.4805
-    device_memory_size: 17066622976
-    shared_memory_per_block_optin: 49152
-    cuda_compute_capability { major: 6 }
-  }
-  platform_name: "CUDA"
-  dnn_version_info {}
-  device_description_str: "sm_6.0 with 17071734784B RAM, 56 cores, 1480500KHz clock, 715000KHz mem clock, 4194304B L2$"
-    )pb";
-
 absl::StatusOr<xla::XlaComputation> GetXlaComputation(
     absl::string_view program) {
   TF_ASSIGN_OR_RETURN(auto hlo_module,
@@ -105,36 +82,29 @@ void ValidateResult(
       LiteralTestUtil::Equal(LiteralUtil::CreateR0(2), *result_literal));
 }
 
-absl::StatusOr<gpu::GpuTargetConfig> GetGpuTargetConfig() {
-  stream_executor::GpuTargetConfigProto gpu_target_config_proto;
-  if (!proto2::TextFormat::ParseFromString(kGpuTargetConfig,
-                                           &gpu_target_config_proto)) {
-    return absl::InvalidArgumentError("Failed to parse GpuTargetConfigProto");
-  }
-  return gpu::GpuTargetConfig(gpu_target_config_proto);
-}
-
 TEST(StreamExecutorGpuCompilerTest, SuccessAotCompileMlirAndLoad) {
-  ASSERT_OK_AND_ASSIGN(const gpu::GpuTargetConfig gpu_config,
-                       GetGpuTargetConfig());
-  CompileOptions options = xla::CompileOptions();
-  StreamExecutorGpuCompiler compiler(gpu_config);
-
   TF_ASSERT_OK_AND_ASSIGN(
       auto client, GetStreamExecutorGpuClient(true, /*allocator_config=*/{},
                                               /*node_id=*/0));
   auto se_client = absl::WrapUnique(
       tensorflow::down_cast<StreamExecutorGpuClient*>(client.release()));
+  auto gpu_compiler = gpu::NVPTXCompiler();
+  gpu::GpuTargetConfig gpu_target_config = gpu_compiler.GetGpuTargetConfig(
+      se_client->client()->backend().default_stream_executor());
+  StreamExecutorGpuCompiler compiler(gpu_target_config);
+
   mlir::MLIRContext context;
   context.loadDialect<mlir::mhlo::MhloDialect, mlir::func::FuncDialect>();
   auto mlir_module =
       mlir::parseSourceString<mlir::ModuleOp>(mlir_str, &context);
   TF_ASSERT_OK_AND_ASSIGN(auto topology, se_client->GetTopologyDescription());
   TF_ASSERT_OK_AND_ASSIGN(
-      auto executable, compiler.Compile(xla::CompileOptions(),
-                                        mlir_module.get(), *topology, nullptr));
-  TF_ASSERT_OK_AND_ASSIGN(auto loaded_executable,
-                          se_client->Load(std::move(executable)));
+      auto executable,
+      compiler.Compile(xla::CompileOptions(), mlir_module.get(), *topology,
+                       /*client=*/nullptr));
+  TF_ASSERT_OK_AND_ASSIGN(
+      auto loaded_executable,
+      se_client->Load(std::move(executable), LoadOptions()));
 
   TF_ASSERT_OK_AND_ASSIGN(
       auto result, loaded_executable->Execute(/*argument_handles=*/{{}}, {}));
@@ -142,59 +112,82 @@ TEST(StreamExecutorGpuCompilerTest, SuccessAotCompileMlirAndLoad) {
 }
 
 TEST(StreamExecutorGpuCompilerTest, SuccessAotCompileXlaAndLoad) {
-  ASSERT_OK_AND_ASSIGN(const gpu::GpuTargetConfig gpu_config,
-                       GetGpuTargetConfig());
-  CompileOptions options = xla::CompileOptions();
-  StreamExecutorGpuCompiler compiler(gpu_config);
-
   TF_ASSERT_OK_AND_ASSIGN(
       auto client, GetStreamExecutorGpuClient(true, /*allocator_config=*/{},
                                               /*node_id=*/0));
   auto se_client = absl::WrapUnique(
       tensorflow::down_cast<StreamExecutorGpuClient*>(client.release()));
+  auto gpu_compiler = gpu::NVPTXCompiler();
+  gpu::GpuTargetConfig gpu_target_config = gpu_compiler.GetGpuTargetConfig(
+      se_client->client()->backend().default_stream_executor());
+  StreamExecutorGpuCompiler compiler(gpu_target_config);
 
   TF_ASSERT_OK_AND_ASSIGN(auto computation, GetXlaComputation(kProgram));
   TF_ASSERT_OK_AND_ASSIGN(auto topology, se_client->GetTopologyDescription());
+  TF_ASSERT_OK_AND_ASSIGN(auto executable,
+                          compiler.Compile(xla::CompileOptions(), computation,
+                                           *topology, /*client=*/nullptr));
   TF_ASSERT_OK_AND_ASSIGN(
-      auto executable,
-      compiler.Compile(xla::CompileOptions(), computation, *topology, nullptr));
-  TF_ASSERT_OK_AND_ASSIGN(auto loaded_executable,
-                          se_client->Load(std::move(executable)));
+      auto loaded_executable,
+      se_client->Load(std::move(executable), LoadOptions()));
   TF_ASSERT_OK_AND_ASSIGN(
       auto result, loaded_executable->Execute(/*argument_handles=*/{{}}, {}));
   ValidateResult(result);
 }
 
 TEST(StreamExecutorGpuCompilerTest, SuccessLoadFromSerializedExecutable) {
-  ASSERT_OK_AND_ASSIGN(const gpu::GpuTargetConfig gpu_config,
-                       GetGpuTargetConfig());
-  CompileOptions options = xla::CompileOptions();
-  StreamExecutorGpuCompiler compiler(gpu_config);
-
   TF_ASSERT_OK_AND_ASSIGN(
       auto client, GetStreamExecutorGpuClient(true, /*allocator_config=*/{},
                                               /*node_id=*/0));
   auto se_client = absl::WrapUnique(
       tensorflow::down_cast<StreamExecutorGpuClient*>(client.release()));
+  auto gpu_compiler = gpu::NVPTXCompiler();
+  gpu::GpuTargetConfig gpu_target_config = gpu_compiler.GetGpuTargetConfig(
+      se_client->client()->backend().default_stream_executor());
+  StreamExecutorGpuCompiler compiler(gpu_target_config);
 
   TF_ASSERT_OK_AND_ASSIGN(auto computation, GetXlaComputation(kProgram));
   TF_ASSERT_OK_AND_ASSIGN(auto topology, se_client->GetTopologyDescription());
-  TF_ASSERT_OK_AND_ASSIGN(
-      auto executable,
-      compiler.Compile(xla::CompileOptions(), computation, *topology, nullptr));
+  TF_ASSERT_OK_AND_ASSIGN(auto executable,
+                          compiler.Compile(xla::CompileOptions(), computation,
+                                           *topology, /*client=*/nullptr));
 
   // Serialize the executable and load it.
   TF_ASSERT_OK_AND_ASSIGN(std::string serialized_executable,
                           executable->SerializeExecutable());
   TF_ASSERT_OK_AND_ASSIGN(
       auto loaded_executable,
-      se_client->LoadSerialized(serialized_executable, std::nullopt,
-                                LoadOptions()));
+      se_client->LoadSerializedExecutable(serialized_executable, std::nullopt,
+                                          LoadOptions()));
 
   TF_ASSERT_OK_AND_ASSIGN(
       auto result, loaded_executable->Execute(/*argument_handles=*/{{}}, {}));
   ValidateResult(result);
 }
 
+TEST(StreamExecutorGpuCompilerTest, SuccessCrossCompileAndLoad) {
+  CompileOptions options = xla::CompileOptions();
+  TF_ASSERT_OK_AND_ASSIGN(XlaComputation computation,
+                          GetXlaComputation(kProgram));
+
+  TF_ASSERT_OK_AND_ASSIGN(
+      auto client, GetStreamExecutorGpuClient(true, /*allocator_config=*/{},
+                                              /*node_id=*/0));
+  TF_ASSERT_OK_AND_ASSIGN(auto topology, client->GetTopologyDescription());
+
+  TF_ASSERT_OK_AND_ASSIGN(
+      auto executable,
+      xla::PjRtCompile(options, computation, *topology, client.get()));
+  TF_ASSERT_OK_AND_ASSIGN(std::string serialized,
+                          executable->SerializeExecutable());
+
+  TF_ASSERT_OK_AND_ASSIGN(auto loaded_executable,
+                          client->LoadSerializedExecutable(
+                              serialized, std::nullopt, LoadOptions()));
+  TF_ASSERT_OK_AND_ASSIGN(
+      auto result, loaded_executable->Execute(/*argument_handles=*/{{}}, {}));
+  ValidateResult(result);
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_compiler_test.cc b/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_compiler_test.cc
index d4bc1beb3f74e4..925180aa7c16db 100644
--- a/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_compiler_test.cc
+++ b/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_compiler_test.cc
@@ -56,17 +56,6 @@ absl::StatusOr<xla::XlaComputation> GetXlaComputation(
   return XlaComputation(hlo_module->ToProto());
 }
 
-TEST(StreamExecutorGpuCompilerTest, NoClientXla) {
-  StreamExecutorGpuCompiler compiler;
-  StreamExecutorGpuTopologyDescription topology(GpuId(), GpuName(),
-                                                "Fake_device", {0, 1});
-
-  TF_ASSERT_OK_AND_ASSIGN(auto computation, GetXlaComputation(kProgram));
-  EXPECT_THAT(compiler.Compile(xla::CompileOptions(), computation, topology,
-                               /*client=*/nullptr),
-              StatusIs(absl::StatusCode::kUnimplemented));
-}
-
 TEST(StreamExecutorGpuCompilerTest, TopologyNotSameXla) {
   StreamExecutorGpuCompiler compiler;
   StreamExecutorGpuTopologyDescription topology(GpuId(), GpuName(),

From 0947680e554c3b89aba29102d9a934ec38a2f417 Mon Sep 17 00:00:00 2001
From: Austin Anderson <angerson@google.com>
Date: Thu, 28 Sep 2023 11:16:18 -0700
Subject: [PATCH 387/567] Fix missed --config setting

PiperOrigin-RevId: 569237506
---
 .bazelrc                                 | 4 ++--
 third_party/xla/.bazelrc                 | 4 ++--
 third_party/xla/third_party/tsl/.bazelrc | 4 ++--
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/.bazelrc b/.bazelrc
index a55118d273a570..9691d264a24a8f 100644
--- a/.bazelrc
+++ b/.bazelrc
@@ -681,7 +681,7 @@ test:linux_cpu_wheel_test --config=linux_cpu_wheel_test_filters -- //tensorflow/
 test:linux_cuda_wheel_test_filters --test_tag_filters=gpu,requires-gpu,-no_gpu,-no_oss,-oss_excluded,-oss_serial,-no_cuda11,-no_oss_py38,-no_oss_py39,-no_oss_py310
 test:linux_cuda_wheel_test_filters --build_tag_filters=gpu,requires-gpu,-no_gpu,-no_oss,-oss_excluded,-oss_serial,-no_cuda11,-no_oss_py38,-no_oss_py39,-no_oss_py310
 test:linux_cuda_wheel_test_filters --test_lang_filters=py --test_size_filters=small,medium
-test:linux_cuda_wheel_test --config=nonpip_filters -- //tensorflow/... -//tensorflow/wheel/integration_testing/... -//tensorflow/compiler/tf2tensorrt/... -//tensorflow/compiler/xrt/... -//tensorflow/core/tpu/... -//tensorflow/lite/... -//tensorflow/tools/toolchains/...
+test:linux_cuda_wheel_test --config=linux_cuda_wheel_test_filters -- //tensorflow/... -//tensorflow/wheel/integration_testing/... -//tensorflow/compiler/tf2tensorrt/... -//tensorflow/compiler/xrt/... -//tensorflow/core/tpu/... -//tensorflow/lite/... -//tensorflow/tools/toolchains/...
 
 # PYCPP TESTS run a suite of Python and C++ tests to verify general correctness over
 # the whole TF code base. These are usually run continuously or upon presubmit.
@@ -694,6 +694,6 @@ test:linux_cpu_pycpp_test --config=linux_cpu_pycpp_filters -- //tensorflow/... -
 test:linux_cuda_pycpp_test_filters --test_tag_filters=-no_oss,-oss_excluded,-oss_serial,-benchmark-test,-v1only,gpu,-no_gpu,-no_gpu_presubmit,-no_cuda11
 test:linux_cuda_pycpp_test_filters --build_tag_filters=-no_oss,-oss_excluded,-oss_serial,-benchmark-test,-v1only,gpu,-no_gpu,-no_gpu_presubmit,-no_cuda11
 test:linux_cuda_pycpp_test_filters --test_lang_filters=cc,py --test_size_filters=small,medium
-test:linux_cuda_pycpp_test --config=linux_cuda_pycpp_filters -- //tensorflow/... -//tensorflow/python/integration_testing/... -//tensorflow/compiler/tf2tensorrt/... -//tensorflow/compiler/xrt/... -//tensorflow/core/tpu/... -//tensorflow/lite/... -//tensorflow/tools/toolchains/...
+test:linux_cuda_pycpp_test --config=linux_cuda_pycpp_test_filters -- //tensorflow/... -//tensorflow/python/integration_testing/... -//tensorflow/compiler/tf2tensorrt/... -//tensorflow/compiler/xrt/... -//tensorflow/core/tpu/... -//tensorflow/lite/... -//tensorflow/tools/toolchains/...
 
 # END TF TEST SUITE OPTIONS
diff --git a/third_party/xla/.bazelrc b/third_party/xla/.bazelrc
index a55118d273a570..9691d264a24a8f 100644
--- a/third_party/xla/.bazelrc
+++ b/third_party/xla/.bazelrc
@@ -681,7 +681,7 @@ test:linux_cpu_wheel_test --config=linux_cpu_wheel_test_filters -- //tensorflow/
 test:linux_cuda_wheel_test_filters --test_tag_filters=gpu,requires-gpu,-no_gpu,-no_oss,-oss_excluded,-oss_serial,-no_cuda11,-no_oss_py38,-no_oss_py39,-no_oss_py310
 test:linux_cuda_wheel_test_filters --build_tag_filters=gpu,requires-gpu,-no_gpu,-no_oss,-oss_excluded,-oss_serial,-no_cuda11,-no_oss_py38,-no_oss_py39,-no_oss_py310
 test:linux_cuda_wheel_test_filters --test_lang_filters=py --test_size_filters=small,medium
-test:linux_cuda_wheel_test --config=nonpip_filters -- //tensorflow/... -//tensorflow/wheel/integration_testing/... -//tensorflow/compiler/tf2tensorrt/... -//tensorflow/compiler/xrt/... -//tensorflow/core/tpu/... -//tensorflow/lite/... -//tensorflow/tools/toolchains/...
+test:linux_cuda_wheel_test --config=linux_cuda_wheel_test_filters -- //tensorflow/... -//tensorflow/wheel/integration_testing/... -//tensorflow/compiler/tf2tensorrt/... -//tensorflow/compiler/xrt/... -//tensorflow/core/tpu/... -//tensorflow/lite/... -//tensorflow/tools/toolchains/...
 
 # PYCPP TESTS run a suite of Python and C++ tests to verify general correctness over
 # the whole TF code base. These are usually run continuously or upon presubmit.
@@ -694,6 +694,6 @@ test:linux_cpu_pycpp_test --config=linux_cpu_pycpp_filters -- //tensorflow/... -
 test:linux_cuda_pycpp_test_filters --test_tag_filters=-no_oss,-oss_excluded,-oss_serial,-benchmark-test,-v1only,gpu,-no_gpu,-no_gpu_presubmit,-no_cuda11
 test:linux_cuda_pycpp_test_filters --build_tag_filters=-no_oss,-oss_excluded,-oss_serial,-benchmark-test,-v1only,gpu,-no_gpu,-no_gpu_presubmit,-no_cuda11
 test:linux_cuda_pycpp_test_filters --test_lang_filters=cc,py --test_size_filters=small,medium
-test:linux_cuda_pycpp_test --config=linux_cuda_pycpp_filters -- //tensorflow/... -//tensorflow/python/integration_testing/... -//tensorflow/compiler/tf2tensorrt/... -//tensorflow/compiler/xrt/... -//tensorflow/core/tpu/... -//tensorflow/lite/... -//tensorflow/tools/toolchains/...
+test:linux_cuda_pycpp_test --config=linux_cuda_pycpp_test_filters -- //tensorflow/... -//tensorflow/python/integration_testing/... -//tensorflow/compiler/tf2tensorrt/... -//tensorflow/compiler/xrt/... -//tensorflow/core/tpu/... -//tensorflow/lite/... -//tensorflow/tools/toolchains/...
 
 # END TF TEST SUITE OPTIONS
diff --git a/third_party/xla/third_party/tsl/.bazelrc b/third_party/xla/third_party/tsl/.bazelrc
index a55118d273a570..9691d264a24a8f 100644
--- a/third_party/xla/third_party/tsl/.bazelrc
+++ b/third_party/xla/third_party/tsl/.bazelrc
@@ -681,7 +681,7 @@ test:linux_cpu_wheel_test --config=linux_cpu_wheel_test_filters -- //tensorflow/
 test:linux_cuda_wheel_test_filters --test_tag_filters=gpu,requires-gpu,-no_gpu,-no_oss,-oss_excluded,-oss_serial,-no_cuda11,-no_oss_py38,-no_oss_py39,-no_oss_py310
 test:linux_cuda_wheel_test_filters --build_tag_filters=gpu,requires-gpu,-no_gpu,-no_oss,-oss_excluded,-oss_serial,-no_cuda11,-no_oss_py38,-no_oss_py39,-no_oss_py310
 test:linux_cuda_wheel_test_filters --test_lang_filters=py --test_size_filters=small,medium
-test:linux_cuda_wheel_test --config=nonpip_filters -- //tensorflow/... -//tensorflow/wheel/integration_testing/... -//tensorflow/compiler/tf2tensorrt/... -//tensorflow/compiler/xrt/... -//tensorflow/core/tpu/... -//tensorflow/lite/... -//tensorflow/tools/toolchains/...
+test:linux_cuda_wheel_test --config=linux_cuda_wheel_test_filters -- //tensorflow/... -//tensorflow/wheel/integration_testing/... -//tensorflow/compiler/tf2tensorrt/... -//tensorflow/compiler/xrt/... -//tensorflow/core/tpu/... -//tensorflow/lite/... -//tensorflow/tools/toolchains/...
 
 # PYCPP TESTS run a suite of Python and C++ tests to verify general correctness over
 # the whole TF code base. These are usually run continuously or upon presubmit.
@@ -694,6 +694,6 @@ test:linux_cpu_pycpp_test --config=linux_cpu_pycpp_filters -- //tensorflow/... -
 test:linux_cuda_pycpp_test_filters --test_tag_filters=-no_oss,-oss_excluded,-oss_serial,-benchmark-test,-v1only,gpu,-no_gpu,-no_gpu_presubmit,-no_cuda11
 test:linux_cuda_pycpp_test_filters --build_tag_filters=-no_oss,-oss_excluded,-oss_serial,-benchmark-test,-v1only,gpu,-no_gpu,-no_gpu_presubmit,-no_cuda11
 test:linux_cuda_pycpp_test_filters --test_lang_filters=cc,py --test_size_filters=small,medium
-test:linux_cuda_pycpp_test --config=linux_cuda_pycpp_filters -- //tensorflow/... -//tensorflow/python/integration_testing/... -//tensorflow/compiler/tf2tensorrt/... -//tensorflow/compiler/xrt/... -//tensorflow/core/tpu/... -//tensorflow/lite/... -//tensorflow/tools/toolchains/...
+test:linux_cuda_pycpp_test --config=linux_cuda_pycpp_test_filters -- //tensorflow/... -//tensorflow/python/integration_testing/... -//tensorflow/compiler/tf2tensorrt/... -//tensorflow/compiler/xrt/... -//tensorflow/core/tpu/... -//tensorflow/lite/... -//tensorflow/tools/toolchains/...
 
 # END TF TEST SUITE OPTIONS

From cb557ce896ddc662cb404213dcd778eeb432e258 Mon Sep 17 00:00:00 2001
From: Quentin Khan <qkhan@google.com>
Date: Thu, 28 Sep 2023 11:26:21 -0700
Subject: [PATCH 388/567] Add functions to clean up tensor allocation types in
 TFLite.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

`TfLiteAllocationType` values carry several orthogonal meanings at the same time. For instance, `kTfLiteDynamic` means “we don't know the shape until runtime” and “is malloc allocated”. This prevents writing clear code using `TfLiteTensor::allocation_type` to discriminate between tensor behaviours.

This CL introduces new functions that disambiguate the different properties of a tensor depending on its allocation type.

PiperOrigin-RevId: 569240578
---
 tensorflow/lite/core/c/BUILD                |  23 ++
 tensorflow/lite/core/c/c_api_opaque.cc      |  23 ++
 tensorflow/lite/core/c/c_api_opaque.h       |  20 +
 tensorflow/lite/core/c/c_api_opaque_test.cc | 386 ++++++++++++++++++++
 tensorflow/lite/core/c/common.cc            | 124 +++++++
 tensorflow/lite/core/c/common.h             |  54 +++
 tensorflow/lite/core/c/common_test.cc       | 266 ++++++++++++++
 7 files changed, 896 insertions(+)
 create mode 100644 tensorflow/lite/core/c/c_api_opaque_test.cc

diff --git a/tensorflow/lite/core/c/BUILD b/tensorflow/lite/core/c/BUILD
index 3766c1831afc35..c1218edace4b0d 100644
--- a/tensorflow/lite/core/c/BUILD
+++ b/tensorflow/lite/core/c/BUILD
@@ -251,6 +251,29 @@ cc_test(
     ],
 )
 
+cc_test(
+    name = "c_api_opaque_test",
+    size = "small",
+    srcs = [
+        "c_api_opaque.cc",
+        "c_api_opaque.h",
+        "c_api_opaque_test.cc",
+    ],
+    deps = [
+        ":c_api",
+        ":c_api_types",
+        ":common",
+        ":registration_external",
+        "//tensorflow/lite:string_util",
+        "//tensorflow/lite:util",
+        "//tensorflow/lite/c:c_api_opaque_internal",
+        "//tensorflow/lite/c:common",
+        "//tensorflow/lite/core:subgraph",
+        "//tensorflow/lite/kernels:kernel_util",
+        "@com_google_googletest//:gtest_main",
+    ],
+)
+
 tflite_cc_library_with_c_headers_test(
     name = "common",
     srcs = ["common.cc"],
diff --git a/tensorflow/lite/core/c/c_api_opaque.cc b/tensorflow/lite/core/c/c_api_opaque.cc
index ba803a1c0dab46..13cf85cfb967bb 100644
--- a/tensorflow/lite/core/c/c_api_opaque.cc
+++ b/tensorflow/lite/core/c/c_api_opaque.cc
@@ -155,6 +155,29 @@ TfLiteAllocationType TfLiteOpaqueTensorGetAllocationType(
   return Convert(opaque_tensor)->allocation_type;
 }
 
+TfLiteAllocationStrategy TfLiteOpaqueTensorGetAllocationStrategy(
+    const TfLiteOpaqueTensor* t) {
+  return TfLiteTensorGetAllocationStrategy(Convert(t));
+}
+
+TfLiteRunStability TfLiteOpaqueTensorGetBufferAddressStability(
+    const TfLiteOpaqueTensor* t) {
+  return TfLiteTensorGetBufferAddressStability(Convert(t));
+}
+
+TfLiteRunStability TfLiteOpaqueTensorGetDataStability(
+    const TfLiteOpaqueTensor* t) {
+  return TfLiteTensorGetDataStability(Convert(t));
+}
+
+TfLiteRunStep TfLiteOpaqueTensorGetDataKnownStep(const TfLiteOpaqueTensor* t) {
+  return TfLiteTensorGetDataKnownStep(Convert(t));
+}
+
+TfLiteRunStep TfLiteOpaqueTensorGetShapeKnownStep(const TfLiteOpaqueTensor* t) {
+  return TfLiteTensorGetShapeKnownStep(Convert(t));
+}
+
 const char* TfLiteOpaqueTensorName(const TfLiteOpaqueTensor* opaque_tensor) {
   return TfLiteTensorName(reinterpret_cast<const TfLiteTensor*>(opaque_tensor));
 }
diff --git a/tensorflow/lite/core/c/c_api_opaque.h b/tensorflow/lite/core/c/c_api_opaque.h
index e73e9e8a40ffda..b06709cd61d872 100644
--- a/tensorflow/lite/core/c/c_api_opaque.h
+++ b/tensorflow/lite/core/c/c_api_opaque.h
@@ -92,6 +92,26 @@ TFL_CAPI_EXPORT extern void* TfLiteOpaqueTensorData(
 TFL_CAPI_EXPORT extern TfLiteAllocationType TfLiteOpaqueTensorGetAllocationType(
     const TfLiteOpaqueTensor* opaque_tensor);
 
+/// Returns a tensor data allocation strategy.
+TFL_CAPI_EXPORT extern TfLiteAllocationStrategy
+TfLiteOpaqueTensorGetAllocationStrategy(const TfLiteOpaqueTensor* t);
+
+/// Returns how stable a tensor data buffer address is across runs.
+TFL_CAPI_EXPORT extern TfLiteRunStability
+TfLiteOpaqueTensorGetBufferAddressStability(const TfLiteOpaqueTensor* t);
+
+/// Returns how stable a tensor's data values are across runs.
+TFL_CAPI_EXPORT extern TfLiteRunStability TfLiteOpaqueTensorGetDataStability(
+    const TfLiteOpaqueTensor* t);
+
+/// Returns the operation step when the data of a tensor is populated.
+TFL_CAPI_EXPORT extern TfLiteRunStep TfLiteOpaqueTensorGetDataKnownStep(
+    const TfLiteOpaqueTensor* t);
+
+/// Returns the operation step when the shape of a tensor is computed.
+TFL_CAPI_EXPORT extern TfLiteRunStep TfLiteOpaqueTensorGetShapeKnownStep(
+    const TfLiteOpaqueTensor* t);
+
 /// Returns the (null-terminated) name of the tensor.
 TFL_CAPI_EXPORT extern const char* TfLiteOpaqueTensorName(
     const TfLiteOpaqueTensor* opaque_tensor);
diff --git a/tensorflow/lite/core/c/c_api_opaque_test.cc b/tensorflow/lite/core/c/c_api_opaque_test.cc
new file mode 100644
index 00000000000000..ab2c00b2604a4f
--- /dev/null
+++ b/tensorflow/lite/core/c/c_api_opaque_test.cc
@@ -0,0 +1,386 @@
+/* Copyright 2023 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/core/c/c_api_opaque.h"
+
+#include <gtest/gtest.h>
+#include "tensorflow/lite/c/common.h"
+#include "tensorflow/lite/core/c/c_api.h"
+
+namespace tflite {
+namespace {
+
+TEST(TestTfLiteOpaqueTensorGetAllocationStrategy,
+     WithMemNoneBehavesAsTfLiteTensorGetAllocationStrategy) {
+  TfLiteTensor t;
+  t.allocation_type = kTfLiteMemNone;
+  EXPECT_EQ(TfLiteOpaqueTensorGetAllocationStrategy(
+                reinterpret_cast<TfLiteOpaqueTensor*>(&t)),
+            TfLiteTensorGetAllocationStrategy(&t));
+}
+
+TEST(TestTfLiteOpaqueTensorGetAllocationStrategy,
+     WithMmapRoBehavesAsTfLiteTensorGetAllocationStrategy) {
+  TfLiteTensor t;
+  t.allocation_type = kTfLiteMmapRo;
+  EXPECT_EQ(TfLiteOpaqueTensorGetAllocationStrategy(
+                reinterpret_cast<TfLiteOpaqueTensor*>(&t)),
+            TfLiteTensorGetAllocationStrategy(&t));
+}
+
+TEST(TestTfLiteOpaqueTensorGetAllocationStrategy,
+     WithArenaRwBehavesAsTfLiteTensorGetAllocationStrategy) {
+  TfLiteTensor t;
+  t.allocation_type = kTfLiteArenaRw;
+  EXPECT_EQ(TfLiteOpaqueTensorGetAllocationStrategy(
+                reinterpret_cast<TfLiteOpaqueTensor*>(&t)),
+            TfLiteTensorGetAllocationStrategy(&t));
+}
+
+TEST(TestTfLiteOpaqueTensorGetAllocationStrategy,
+     WithArenaRwPersistentBehavesAsTfLiteTensorGetAllocationStrategy) {
+  TfLiteTensor t;
+  t.allocation_type = kTfLiteArenaRwPersistent;
+  EXPECT_EQ(TfLiteOpaqueTensorGetAllocationStrategy(
+                reinterpret_cast<TfLiteOpaqueTensor*>(&t)),
+            TfLiteTensorGetAllocationStrategy(&t));
+}
+
+TEST(TestTfLiteOpaqueTensorGetAllocationStrategy,
+     WithDynamicBehavesAsTfLiteTensorGetAllocationStrategy) {
+  TfLiteTensor t;
+  t.allocation_type = kTfLiteDynamic;
+  EXPECT_EQ(TfLiteOpaqueTensorGetAllocationStrategy(
+                reinterpret_cast<TfLiteOpaqueTensor*>(&t)),
+            TfLiteTensorGetAllocationStrategy(&t));
+}
+
+TEST(TestTfLiteOpaqueTensorGetAllocationStrategy,
+     WithPersistentRoBehavesAsTfLiteTensorGetAllocationStrategy) {
+  TfLiteTensor t;
+  t.allocation_type = kTfLitePersistentRo;
+  EXPECT_EQ(TfLiteOpaqueTensorGetAllocationStrategy(
+                reinterpret_cast<TfLiteOpaqueTensor*>(&t)),
+            TfLiteTensorGetAllocationStrategy(&t));
+}
+
+TEST(TestTfLiteOpaqueTensorGetAllocationStrategy,
+     WithCustomBehavesAsTfLiteTensorGetAllocationStrategy) {
+  TfLiteTensor t;
+  t.allocation_type = kTfLiteCustom;
+  EXPECT_EQ(TfLiteOpaqueTensorGetAllocationStrategy(
+                reinterpret_cast<TfLiteOpaqueTensor*>(&t)),
+            TfLiteTensorGetAllocationStrategy(&t));
+}
+
+TEST(TestTfLiteOpaqueTensorGetAllocationStrategy,
+     WithVariantObjectBehavesAsTfLiteTensorGetAllocationStrategy) {
+  TfLiteTensor t;
+  t.allocation_type = kTfLiteVariantObject;
+  EXPECT_EQ(TfLiteOpaqueTensorGetAllocationStrategy(
+                reinterpret_cast<TfLiteOpaqueTensor*>(&t)),
+            TfLiteTensorGetAllocationStrategy(&t));
+}
+
+TEST(TestTfLiteOpaqueTensorGetBufferAddressStability,
+     WithMemNoneBehavesAsTfLiteTensorGetBufferAddressStability) {
+  TfLiteTensor t;
+  t.allocation_type = kTfLiteMemNone;
+  EXPECT_EQ(TfLiteOpaqueTensorGetBufferAddressStability(
+                reinterpret_cast<TfLiteOpaqueTensor*>(&t)),
+            TfLiteTensorGetBufferAddressStability(&t));
+}
+
+TEST(TestTfLiteOpaqueTensorGetBufferAddressStability,
+     WithMmapRoBehavesAsTfLiteTensorGetBufferAddressStability) {
+  TfLiteTensor t;
+  t.allocation_type = kTfLiteMmapRo;
+  EXPECT_EQ(TfLiteOpaqueTensorGetBufferAddressStability(
+                reinterpret_cast<TfLiteOpaqueTensor*>(&t)),
+            TfLiteTensorGetBufferAddressStability(&t));
+}
+
+TEST(TestTfLiteOpaqueTensorGetBufferAddressStability,
+     WithArenaRwBehavesAsTfLiteTensorGetBufferAddressStability) {
+  TfLiteTensor t;
+  t.allocation_type = kTfLiteArenaRw;
+  EXPECT_EQ(TfLiteOpaqueTensorGetBufferAddressStability(
+                reinterpret_cast<TfLiteOpaqueTensor*>(&t)),
+            TfLiteTensorGetBufferAddressStability(&t));
+}
+
+TEST(TestTfLiteOpaqueTensorGetBufferAddressStability,
+     WithArenaRwPersistentBehavesAsTfLiteTensorGetBufferAddressStability) {
+  TfLiteTensor t;
+  t.allocation_type = kTfLiteArenaRwPersistent;
+  EXPECT_EQ(TfLiteOpaqueTensorGetBufferAddressStability(
+                reinterpret_cast<TfLiteOpaqueTensor*>(&t)),
+            TfLiteTensorGetBufferAddressStability(&t));
+}
+
+TEST(TestTfLiteOpaqueTensorGetBufferAddressStability,
+     WithDynamicBehavesAsTfLiteTensorGetBufferAddressStability) {
+  TfLiteTensor t;
+  t.allocation_type = kTfLiteDynamic;
+  EXPECT_EQ(TfLiteOpaqueTensorGetBufferAddressStability(
+                reinterpret_cast<TfLiteOpaqueTensor*>(&t)),
+            TfLiteTensorGetBufferAddressStability(&t));
+}
+
+TEST(TestTfLiteOpaqueTensorGetBufferAddressStability,
+     WithPersistentRoBehavesAsTfLiteTensorGetBufferAddressStability) {
+  TfLiteTensor t;
+  t.allocation_type = kTfLitePersistentRo;
+  EXPECT_EQ(TfLiteOpaqueTensorGetBufferAddressStability(
+                reinterpret_cast<TfLiteOpaqueTensor*>(&t)),
+            TfLiteTensorGetBufferAddressStability(&t));
+}
+
+TEST(TestTfLiteOpaqueTensorGetBufferAddressStability,
+     WithCustomBehavesAsTfLiteTensorGetBufferAddressStability) {
+  TfLiteTensor t;
+  t.allocation_type = kTfLiteCustom;
+  EXPECT_EQ(TfLiteOpaqueTensorGetBufferAddressStability(
+                reinterpret_cast<TfLiteOpaqueTensor*>(&t)),
+            TfLiteTensorGetBufferAddressStability(&t));
+}
+
+TEST(TestTfLiteOpaqueTensorGetBufferAddressStability,
+     WithVariantObjectBehavesAsTfLiteTensorGetBufferAddressStability) {
+  TfLiteTensor t;
+  t.allocation_type = kTfLiteVariantObject;
+  EXPECT_EQ(TfLiteOpaqueTensorGetBufferAddressStability(
+                reinterpret_cast<TfLiteOpaqueTensor*>(&t)),
+            TfLiteTensorGetBufferAddressStability(&t));
+}
+
+TEST(TestTfLiteOpaqueTensorGetDataStability,
+     WithMemNoneBehavesAsTfLiteTensorGetDataStability) {
+  TfLiteTensor t;
+  t.allocation_type = kTfLiteMemNone;
+  EXPECT_EQ(TfLiteOpaqueTensorGetDataStability(
+                reinterpret_cast<TfLiteOpaqueTensor*>(&t)),
+            TfLiteTensorGetDataStability(&t));
+}
+
+TEST(TestTfLiteOpaqueTensorGetDataStability,
+     WithMmapRoBehavesAsTfLiteTensorGetDataStability) {
+  TfLiteTensor t;
+  t.allocation_type = kTfLiteMmapRo;
+  EXPECT_EQ(TfLiteOpaqueTensorGetDataStability(
+                reinterpret_cast<TfLiteOpaqueTensor*>(&t)),
+            TfLiteTensorGetDataStability(&t));
+}
+
+TEST(TestTfLiteOpaqueTensorGetDataStability,
+     WithArenaRwBehavesAsTfLiteTensorGetDataStability) {
+  TfLiteTensor t;
+  t.allocation_type = kTfLiteArenaRw;
+  EXPECT_EQ(TfLiteOpaqueTensorGetDataStability(
+                reinterpret_cast<TfLiteOpaqueTensor*>(&t)),
+            TfLiteTensorGetDataStability(&t));
+}
+
+TEST(TestTfLiteOpaqueTensorGetDataStability,
+     WithArenaRwPersistentBehavesAsTfLiteTensorGetDataStability) {
+  TfLiteTensor t;
+  t.allocation_type = kTfLiteArenaRwPersistent;
+  EXPECT_EQ(TfLiteOpaqueTensorGetDataStability(
+                reinterpret_cast<TfLiteOpaqueTensor*>(&t)),
+            TfLiteTensorGetDataStability(&t));
+}
+
+TEST(TestTfLiteOpaqueTensorGetDataStability,
+     WithDynamicBehavesAsTfLiteTensorGetDataStability) {
+  TfLiteTensor t;
+  t.allocation_type = kTfLiteDynamic;
+  EXPECT_EQ(TfLiteOpaqueTensorGetDataStability(
+                reinterpret_cast<TfLiteOpaqueTensor*>(&t)),
+            TfLiteTensorGetDataStability(&t));
+}
+
+TEST(TestTfLiteOpaqueTensorGetDataStability,
+     WithPersistentRoBehavesAsTfLiteTensorGetDataStability) {
+  TfLiteTensor t;
+  t.allocation_type = kTfLitePersistentRo;
+  EXPECT_EQ(TfLiteOpaqueTensorGetDataStability(
+                reinterpret_cast<TfLiteOpaqueTensor*>(&t)),
+            TfLiteTensorGetDataStability(&t));
+}
+
+TEST(TestTfLiteOpaqueTensorGetDataStability,
+     WithCustomBehavesAsTfLiteTensorGetDataStability) {
+  TfLiteTensor t;
+  t.allocation_type = kTfLiteCustom;
+  EXPECT_EQ(TfLiteOpaqueTensorGetDataStability(
+                reinterpret_cast<TfLiteOpaqueTensor*>(&t)),
+            TfLiteTensorGetDataStability(&t));
+}
+
+TEST(TestTfLiteOpaqueTensorGetDataStability,
+     WithVariantObjectBehavesAsTfLiteTensorGetDataStability) {
+  TfLiteTensor t;
+  t.allocation_type = kTfLiteVariantObject;
+  EXPECT_EQ(TfLiteOpaqueTensorGetDataStability(
+                reinterpret_cast<TfLiteOpaqueTensor*>(&t)),
+            TfLiteTensorGetDataStability(&t));
+}
+
+TEST(TestTfLiteOpaqueTensorGetDataKnownStep,
+     WithMemNoneBehavesAsTfLiteTensorGetDataKnownStep) {
+  TfLiteTensor t;
+  t.allocation_type = kTfLiteMemNone;
+  EXPECT_EQ(TfLiteOpaqueTensorGetDataKnownStep(
+                reinterpret_cast<TfLiteOpaqueTensor*>(&t)),
+            TfLiteTensorGetDataKnownStep(&t));
+}
+
+TEST(TestTfLiteOpaqueTensorGetDataKnownStep,
+     WithMmapRoBehavesAsTfLiteTensorGetDataKnownStep) {
+  TfLiteTensor t;
+  t.allocation_type = kTfLiteMmapRo;
+  EXPECT_EQ(TfLiteOpaqueTensorGetDataKnownStep(
+                reinterpret_cast<TfLiteOpaqueTensor*>(&t)),
+            TfLiteTensorGetDataKnownStep(&t));
+}
+
+TEST(TestTfLiteOpaqueTensorGetDataKnownStep,
+     WithArenaRwBehavesAsTfLiteTensorGetDataKnownStep) {
+  TfLiteTensor t;
+  t.allocation_type = kTfLiteArenaRw;
+  EXPECT_EQ(TfLiteOpaqueTensorGetDataKnownStep(
+                reinterpret_cast<TfLiteOpaqueTensor*>(&t)),
+            TfLiteTensorGetDataKnownStep(&t));
+}
+
+TEST(TestTfLiteOpaqueTensorGetDataKnownStep,
+     WithArenaRwPersistentBehavesAsTfLiteTensorGetDataKnownStep) {
+  TfLiteTensor t;
+  t.allocation_type = kTfLiteArenaRwPersistent;
+  EXPECT_EQ(TfLiteOpaqueTensorGetDataKnownStep(
+                reinterpret_cast<TfLiteOpaqueTensor*>(&t)),
+            TfLiteTensorGetDataKnownStep(&t));
+}
+
+TEST(TestTfLiteOpaqueTensorGetDataKnownStep,
+     WithDynamicBehavesAsTfLiteTensorGetDataKnownStep) {
+  TfLiteTensor t;
+  t.allocation_type = kTfLiteDynamic;
+  EXPECT_EQ(TfLiteOpaqueTensorGetDataKnownStep(
+                reinterpret_cast<TfLiteOpaqueTensor*>(&t)),
+            TfLiteTensorGetDataKnownStep(&t));
+}
+
+TEST(TestTfLiteOpaqueTensorGetDataKnownStep,
+     WithPersistentRoBehavesAsTfLiteTensorGetDataKnownStep) {
+  TfLiteTensor t;
+  t.allocation_type = kTfLitePersistentRo;
+  EXPECT_EQ(TfLiteOpaqueTensorGetDataKnownStep(
+                reinterpret_cast<TfLiteOpaqueTensor*>(&t)),
+            TfLiteTensorGetDataKnownStep(&t));
+}
+
+TEST(TestTfLiteOpaqueTensorGetDataKnownStep,
+     WithCustomBehavesAsTfLiteTensorGetDataKnownStep) {
+  TfLiteTensor t;
+  t.allocation_type = kTfLiteCustom;
+  EXPECT_EQ(TfLiteOpaqueTensorGetDataKnownStep(
+                reinterpret_cast<TfLiteOpaqueTensor*>(&t)),
+            TfLiteTensorGetDataKnownStep(&t));
+}
+
+TEST(TestTfLiteOpaqueTensorGetDataKnownStep,
+     WithVariantObjectBehavesAsTfLiteTensorGetDataKnownStep) {
+  TfLiteTensor t;
+  t.allocation_type = kTfLiteVariantObject;
+  EXPECT_EQ(TfLiteOpaqueTensorGetDataKnownStep(
+                reinterpret_cast<TfLiteOpaqueTensor*>(&t)),
+            TfLiteTensorGetDataKnownStep(&t));
+}
+
+TEST(TestTfLiteOpaqueTensorGetShapeKnownStep,
+     WithMemNoneBehavesAsTfLiteTensorGetShapeKnownStep) {
+  TfLiteTensor t;
+  t.allocation_type = kTfLiteMemNone;
+  EXPECT_EQ(TfLiteOpaqueTensorGetShapeKnownStep(
+                reinterpret_cast<TfLiteOpaqueTensor*>(&t)),
+            TfLiteTensorGetShapeKnownStep(&t));
+}
+
+TEST(TestTfLiteOpaqueTensorGetShapeKnownStep,
+     WithMmapRoBehavesAsTfLiteTensorGetShapeKnownStep) {
+  TfLiteTensor t;
+  t.allocation_type = kTfLiteMmapRo;
+  EXPECT_EQ(TfLiteOpaqueTensorGetShapeKnownStep(
+                reinterpret_cast<TfLiteOpaqueTensor*>(&t)),
+            TfLiteTensorGetShapeKnownStep(&t));
+}
+
+TEST(TestTfLiteOpaqueTensorGetShapeKnownStep,
+     WithArenaRwBehavesAsTfLiteTensorGetShapeKnownStep) {
+  TfLiteTensor t;
+  t.allocation_type = kTfLiteArenaRw;
+  EXPECT_EQ(TfLiteOpaqueTensorGetShapeKnownStep(
+                reinterpret_cast<TfLiteOpaqueTensor*>(&t)),
+            TfLiteTensorGetShapeKnownStep(&t));
+}
+
+TEST(TestTfLiteOpaqueTensorGetShapeKnownStep,
+     WithArenaRwPersistentBehavesAsTfLiteTensorGetShapeKnownStep) {
+  TfLiteTensor t;
+  t.allocation_type = kTfLiteArenaRwPersistent;
+  EXPECT_EQ(TfLiteOpaqueTensorGetShapeKnownStep(
+                reinterpret_cast<TfLiteOpaqueTensor*>(&t)),
+            TfLiteTensorGetShapeKnownStep(&t));
+}
+
+TEST(TestTfLiteOpaqueTensorGetShapeKnownStep,
+     WithDynamicBehavesAsTfLiteTensorGetShapeKnownStep) {
+  TfLiteTensor t;
+  t.allocation_type = kTfLiteDynamic;
+  EXPECT_EQ(TfLiteOpaqueTensorGetShapeKnownStep(
+                reinterpret_cast<TfLiteOpaqueTensor*>(&t)),
+            TfLiteTensorGetShapeKnownStep(&t));
+}
+
+TEST(TestTfLiteOpaqueTensorGetShapeKnownStep,
+     WithPersistentRoBehavesAsTfLiteTensorGetShapeKnownStep) {
+  TfLiteTensor t;
+  t.allocation_type = kTfLitePersistentRo;
+  EXPECT_EQ(TfLiteOpaqueTensorGetShapeKnownStep(
+                reinterpret_cast<TfLiteOpaqueTensor*>(&t)),
+            TfLiteTensorGetShapeKnownStep(&t));
+}
+
+TEST(TestTfLiteOpaqueTensorGetShapeKnownStep,
+     WithCustomBehavesAsTfLiteTensorGetShapeKnownStep) {
+  TfLiteTensor t;
+  t.allocation_type = kTfLiteCustom;
+  EXPECT_EQ(TfLiteOpaqueTensorGetShapeKnownStep(
+                reinterpret_cast<TfLiteOpaqueTensor*>(&t)),
+            TfLiteTensorGetShapeKnownStep(&t));
+}
+
+TEST(TestTfLiteOpaqueTensorGetShapeKnownStep,
+     WithVariantObjectBehavesAsTfLiteTensorGetShapeKnownStep) {
+  TfLiteTensor t;
+  t.allocation_type = kTfLiteVariantObject;
+  EXPECT_EQ(TfLiteOpaqueTensorGetShapeKnownStep(
+                reinterpret_cast<TfLiteOpaqueTensor*>(&t)),
+            TfLiteTensorGetShapeKnownStep(&t));
+}
+
+}  // namespace
+}  // namespace tflite
diff --git a/tensorflow/lite/core/c/common.cc b/tensorflow/lite/core/c/common.cc
index 367f1752261766..a2046fc8548719 100644
--- a/tensorflow/lite/core/c/common.cc
+++ b/tensorflow/lite/core/c/common.cc
@@ -420,4 +420,128 @@ void* TfLiteOpaqueDelegateGetData(const TfLiteOpaqueDelegate* delegate) {
   return tflite_delegate->opaque_delegate_builder->data;
 }
 
+// Returns a tensor data allocation strategy.
+TfLiteAllocationStrategy TfLiteTensorGetAllocationStrategy(
+    const TfLiteTensor* const t) {
+  switch (t->allocation_type) {
+    case kTfLiteMemNone:
+      return kTfLiteAllocationStrategyNone;
+    case kTfLiteMmapRo:
+      return kTfLiteAllocationStrategyMMap;
+    case kTfLiteArenaRw:
+      return kTfLiteAllocationStrategyArena;
+    case kTfLiteArenaRwPersistent:
+      return kTfLiteAllocationStrategyArena;
+    case kTfLiteDynamic:
+      return kTfLiteAllocationStrategyMalloc;
+    case kTfLitePersistentRo:
+      return kTfLiteAllocationStrategyUnknown;
+    case kTfLiteCustom:
+      return kTfLiteAllocationStrategyUnknown;
+    case kTfLiteVariantObject:
+      return kTfLiteAllocationStrategyNew;
+  }
+  return kTfLiteAllocationStrategyUnknown;
+}
+
+// Returns how stable a tensor data buffer address is across runs.
+TfLiteRunStability TfLiteTensorGetBufferAddressStability(
+    const TfLiteTensor* const t) {
+  switch (t->allocation_type) {
+    case kTfLiteMemNone:
+      return kTfLiteRunStabilityAcrossRuns;
+    case kTfLiteMmapRo:
+      return kTfLiteRunStabilityAcrossRuns;
+    case kTfLiteArenaRw:
+      return kTfLiteRunStabilityUnstable;
+    case kTfLiteArenaRwPersistent:
+      return kTfLiteRunStabilityUnstable;
+    case kTfLiteDynamic:
+      return kTfLiteRunStabilitySingleRun;
+    case kTfLitePersistentRo:
+      return kTfLiteRunStabilitySingleRun;
+    case kTfLiteCustom:
+      return kTfLiteRunStabilityUnknown;
+    case kTfLiteVariantObject:
+      return kTfLiteRunStabilityAcrossRuns;
+  }
+  return kTfLiteRunStabilityUnknown;
+}
+
+// Returns how stable a tensor's data values are across runs.
+TfLiteRunStability TfLiteTensorGetDataStability(const TfLiteTensor* const t) {
+  switch (t->allocation_type) {
+    case kTfLiteMemNone:
+      return kTfLiteRunStabilityAcrossRuns;
+    case kTfLiteMmapRo:
+      return kTfLiteRunStabilityAcrossRuns;
+    case kTfLiteArenaRw:
+      return kTfLiteRunStabilitySingleRun;
+    case kTfLiteArenaRwPersistent:
+      return kTfLiteRunStabilityAcrossRuns;
+    case kTfLiteDynamic:
+      return kTfLiteRunStabilitySingleRun;
+    case kTfLitePersistentRo:
+      return kTfLiteRunStabilitySingleRun;
+    case kTfLiteCustom:
+      return kTfLiteRunStabilityUnknown;
+    case kTfLiteVariantObject:
+      return kTfLiteRunStabilitySingleRun;
+  }
+  return kTfLiteRunStabilityUnknown;
+}
+
+// Returns the operation step when the data of a tensor is populated.
+//
+// Some operations can precompute their results before the evaluation step. This
+// makes the data available earlier for subsequent operations.
+TfLiteRunStep TfLiteTensorGetDataKnownStep(const TfLiteTensor* t) {
+  switch (t->allocation_type) {
+    case kTfLiteMemNone:
+      return kTfLiteRunStepInit;
+    case kTfLiteMmapRo:
+      return kTfLiteRunStepInit;
+    case kTfLiteArenaRw:
+      return kTfLiteRunStepEval;
+    case kTfLiteArenaRwPersistent:
+      return kTfLiteRunStepEval;
+    case kTfLiteDynamic:
+      return kTfLiteRunStepEval;
+    case kTfLitePersistentRo:
+      return kTfLiteRunStepPrepare;
+    case kTfLiteCustom:
+      return kTfLiteRunStepUnknown;
+    case kTfLiteVariantObject:
+      return kTfLiteRunStepEval;
+  }
+  return kTfLiteRunStepUnknown;
+}
+
+// Returns the operation step when the shape of a tensor is computed.
+//
+// Some operations can precompute the shape of their results before the
+// evaluation step. This makes the shape available earlier for subsequent
+// operations.
+TfLiteRunStep TfLiteTensorGetShapeKnownStep(const TfLiteTensor* t) {
+  switch (t->allocation_type) {
+    case kTfLiteMemNone:
+      return kTfLiteRunStepInit;
+    case kTfLiteMmapRo:
+      return kTfLiteRunStepInit;
+    case kTfLiteArenaRw:
+      return kTfLiteRunStepPrepare;
+    case kTfLiteArenaRwPersistent:
+      return kTfLiteRunStepPrepare;
+    case kTfLiteDynamic:
+      return kTfLiteRunStepEval;
+    case kTfLitePersistentRo:
+      return kTfLiteRunStepPrepare;
+    case kTfLiteCustom:
+      return kTfLiteRunStepUnknown;
+    case kTfLiteVariantObject:
+      return kTfLiteRunStepEval;
+  }
+  return kTfLiteRunStepUnknown;
+}
+
 }  // extern "C"
diff --git a/tensorflow/lite/core/c/common.h b/tensorflow/lite/core/c/common.h
index 19a74a7c90d600..d9abf9eaf2ee6f 100644
--- a/tensorflow/lite/core/c/common.h
+++ b/tensorflow/lite/core/c/common.h
@@ -363,6 +363,37 @@ typedef enum TfLiteAllocationType {
   kTfLiteVariantObject,
 } TfLiteAllocationType;
 
+// Memory allocation strategies.
+//
+// TfLiteAllocationType values have been overloaded to mean more than their
+// original intent. This enum should only be used to document the allocation
+// strategy used by a tensor for its data.
+typedef enum TfLiteAllocationStrategy {
+  kTfLiteAllocationStrategyUnknown,
+  kTfLiteAllocationStrategyNone,    // No data is allocated.
+  kTfLiteAllocationStrategyMMap,    // Data is mmaped.
+  kTfLiteAllocationStrategyArena,   // Handled by the arena.
+  kTfLiteAllocationStrategyMalloc,  // Uses `malloc`/`free`.
+  kTfLiteAllocationStrategyNew      // Uses `new[]`/`delete[]`.
+} TfLiteAllocationStrategy;
+
+// Describes how stable a tensor attribute is with regard to interpreter
+// runs.
+typedef enum TfLiteRunStability {
+  kTfLiteRunStabilityUnknown,
+  kTfLiteRunStabilityUnstable,   // May change at any time.
+  kTfLiteRunStabilitySingleRun,  // Will stay the same for one run.
+  kTfLiteRunStabilityAcrossRuns  // Will stay the same across all runs.
+} TfLiteRunStability;
+
+// Describes the steps of a TFLite operation life cycle.
+typedef enum TfLiteRunStep {
+  kTfLiteRunStepUnknown,
+  kTfLiteRunStepInit,
+  kTfLiteRunStepPrepare,
+  kTfLiteRunStepEval
+} TfLiteRunStep;
+
 // The delegates should use zero or positive integers to represent handles.
 // -1 is reserved from unallocated status.
 typedef int TfLiteBufferHandle;
@@ -1312,6 +1343,29 @@ void TfLiteOpaqueDelegateDelete(TfLiteOpaqueDelegate* delegate);
 //  'opaque_delegate_builder' field is null.
 void* TfLiteOpaqueDelegateGetData(const TfLiteOpaqueDelegate* delegate);
 
+// Returns a tensor data allocation strategy.
+TfLiteAllocationStrategy TfLiteTensorGetAllocationStrategy(
+    const TfLiteTensor* t);
+
+// Returns how stable a tensor data buffer address is across runs.
+TfLiteRunStability TfLiteTensorGetBufferAddressStability(const TfLiteTensor* t);
+
+// Returns how stable a tensor's data values are across runs.
+TfLiteRunStability TfLiteTensorGetDataStability(const TfLiteTensor* t);
+
+// Returns the operation step when the data of a tensor is populated.
+//
+// Some operations can precompute their results before the evaluation step. This
+// makes the data available earlier for subsequent operations.
+TfLiteRunStep TfLiteTensorGetDataKnownStep(const TfLiteTensor* t);
+
+// Returns the operation step when the shape of a tensor is computed.
+//
+// Some operations can precompute the shape of their results before the
+// evaluation step. This makes the shape available earlier for subsequent
+// operations.
+TfLiteRunStep TfLiteTensorGetShapeKnownStep(const TfLiteTensor* t);
+
 #ifdef __cplusplus
 }  // extern "C"
 
diff --git a/tensorflow/lite/core/c/common_test.cc b/tensorflow/lite/core/c/common_test.cc
index 5259b5a6589add..2c3e4c8b7afd62 100644
--- a/tensorflow/lite/core/c/common_test.cc
+++ b/tensorflow/lite/core/c/common_test.cc
@@ -429,6 +429,272 @@ TEST(TestTfLiteOpaqueDelegate, GetData_NoDataSetViaOpaqueDelegateBuilder) {
   TfLiteOpaqueDelegateDelete(opaque_delegate);
 }
 
+TEST(TestTfLiteTensorGetAllocationStrategy, MemNoneIsAllocatedWithNone) {
+  TfLiteTensor t;
+  t.allocation_type = kTfLiteMemNone;
+  EXPECT_EQ(TfLiteTensorGetAllocationStrategy(&t),
+            kTfLiteAllocationStrategyNone);
+}
+
+TEST(TestTfLiteTensorGetAllocationStrategy, MmapRoIsAllocatedWithMMap) {
+  TfLiteTensor t;
+  t.allocation_type = kTfLiteMmapRo;
+  EXPECT_EQ(TfLiteTensorGetAllocationStrategy(&t),
+            kTfLiteAllocationStrategyMMap);
+}
+
+TEST(TestTfLiteTensorGetAllocationStrategy, ArenaRwIsAllocatedWithArena) {
+  TfLiteTensor t;
+  t.allocation_type = kTfLiteArenaRw;
+  EXPECT_EQ(TfLiteTensorGetAllocationStrategy(&t),
+            kTfLiteAllocationStrategyArena);
+}
+
+TEST(TestTfLiteTensorGetAllocationStrategy,
+     ArenaRwPersistentIsAllocatedWithArena) {
+  TfLiteTensor t;
+  t.allocation_type = kTfLiteArenaRwPersistent;
+  EXPECT_EQ(TfLiteTensorGetAllocationStrategy(&t),
+            kTfLiteAllocationStrategyArena);
+}
+
+TEST(TestTfLiteTensorGetAllocationStrategy, DynamicIsAllocatedWithMalloc) {
+  TfLiteTensor t;
+  t.allocation_type = kTfLiteDynamic;
+  EXPECT_EQ(TfLiteTensorGetAllocationStrategy(&t),
+            kTfLiteAllocationStrategyMalloc);
+}
+
+TEST(TestTfLiteTensorGetAllocationStrategy,
+     PersistentRoIsAllocatedWithUnknown) {
+  TfLiteTensor t;
+  t.allocation_type = kTfLitePersistentRo;
+  EXPECT_EQ(TfLiteTensorGetAllocationStrategy(&t),
+            kTfLiteAllocationStrategyUnknown);
+}
+
+TEST(TestTfLiteTensorGetAllocationStrategy, CustomIsAllocatedWithUnknown) {
+  TfLiteTensor t;
+  t.allocation_type = kTfLiteCustom;
+  EXPECT_EQ(TfLiteTensorGetAllocationStrategy(&t),
+            kTfLiteAllocationStrategyUnknown);
+}
+
+TEST(TestTfLiteTensorGetAllocationStrategy, VariantObjectIsAllocatedWithNew) {
+  TfLiteTensor t;
+  t.allocation_type = kTfLiteVariantObject;
+  EXPECT_EQ(TfLiteTensorGetAllocationStrategy(&t),
+            kTfLiteAllocationStrategyNew);
+}
+
+TEST(TestTfLiteTensorGetBufferAddressStability,
+     MemNoneBufferIsStableAcrossRuns) {
+  TfLiteTensor t;
+  t.allocation_type = kTfLiteMemNone;
+  EXPECT_EQ(TfLiteTensorGetBufferAddressStability(&t),
+            kTfLiteRunStabilityAcrossRuns);
+}
+
+TEST(TestTfLiteTensorGetBufferAddressStability,
+     MmapRoBufferIsStableAcrossRuns) {
+  TfLiteTensor t;
+  t.allocation_type = kTfLiteMmapRo;
+  EXPECT_EQ(TfLiteTensorGetBufferAddressStability(&t),
+            kTfLiteRunStabilityAcrossRuns);
+}
+
+TEST(TestTfLiteTensorGetBufferAddressStability, ArenaRwBufferIsStableUnstable) {
+  TfLiteTensor t;
+  t.allocation_type = kTfLiteArenaRw;
+  EXPECT_EQ(TfLiteTensorGetBufferAddressStability(&t),
+            kTfLiteRunStabilityUnstable);
+}
+
+TEST(TestTfLiteTensorGetBufferAddressStability,
+     ArenaRwPersistentBufferIsStableUnstable) {
+  TfLiteTensor t;
+  t.allocation_type = kTfLiteArenaRwPersistent;
+  EXPECT_EQ(TfLiteTensorGetBufferAddressStability(&t),
+            kTfLiteRunStabilityUnstable);
+}
+
+TEST(TestTfLiteTensorGetBufferAddressStability,
+     DynamicBufferIsStableSingleRun) {
+  TfLiteTensor t;
+  t.allocation_type = kTfLiteDynamic;
+  EXPECT_EQ(TfLiteTensorGetBufferAddressStability(&t),
+            kTfLiteRunStabilitySingleRun);
+}
+
+TEST(TestTfLiteTensorGetBufferAddressStability,
+     PersistentRoBufferIsStableSingleRun) {
+  TfLiteTensor t;
+  t.allocation_type = kTfLitePersistentRo;
+  EXPECT_EQ(TfLiteTensorGetBufferAddressStability(&t),
+            kTfLiteRunStabilitySingleRun);
+}
+
+TEST(TestTfLiteTensorGetBufferAddressStability, CustomBufferIsStableUnknown) {
+  TfLiteTensor t;
+  t.allocation_type = kTfLiteCustom;
+  EXPECT_EQ(TfLiteTensorGetBufferAddressStability(&t),
+            kTfLiteRunStabilityUnknown);
+}
+
+TEST(TestTfLiteTensorGetBufferAddressStability,
+     VariantObjectBufferIsStableAcrossRuns) {
+  TfLiteTensor t;
+  t.allocation_type = kTfLiteVariantObject;
+  EXPECT_EQ(TfLiteTensorGetBufferAddressStability(&t),
+            kTfLiteRunStabilityAcrossRuns);
+}
+
+TEST(TestTfLiteTensorGetDataStability, MemNoneDataIsStableAcrossRuns) {
+  TfLiteTensor t;
+  t.allocation_type = kTfLiteMemNone;
+  EXPECT_EQ(TfLiteTensorGetDataStability(&t), kTfLiteRunStabilityAcrossRuns);
+}
+
+TEST(TestTfLiteTensorGetDataStability, MmapRoDataIsStableAcrossRuns) {
+  TfLiteTensor t;
+  t.allocation_type = kTfLiteMmapRo;
+  EXPECT_EQ(TfLiteTensorGetDataStability(&t), kTfLiteRunStabilityAcrossRuns);
+}
+
+TEST(TestTfLiteTensorGetDataStability, ArenaRwDataIsStableSingleRun) {
+  TfLiteTensor t;
+  t.allocation_type = kTfLiteArenaRw;
+  EXPECT_EQ(TfLiteTensorGetDataStability(&t), kTfLiteRunStabilitySingleRun);
+}
+
+TEST(TestTfLiteTensorGetDataStability,
+     ArenaRwPersistentDataIsStableAcrossRuns) {
+  TfLiteTensor t;
+  t.allocation_type = kTfLiteArenaRwPersistent;
+  EXPECT_EQ(TfLiteTensorGetDataStability(&t), kTfLiteRunStabilityAcrossRuns);
+}
+
+TEST(TestTfLiteTensorGetDataStability, DynamicDataIsStableSingleRun) {
+  TfLiteTensor t;
+  t.allocation_type = kTfLiteDynamic;
+  EXPECT_EQ(TfLiteTensorGetDataStability(&t), kTfLiteRunStabilitySingleRun);
+}
+
+TEST(TestTfLiteTensorGetDataStability, PersistentRoDataIsStableSingleRun) {
+  TfLiteTensor t;
+  t.allocation_type = kTfLitePersistentRo;
+  EXPECT_EQ(TfLiteTensorGetDataStability(&t), kTfLiteRunStabilitySingleRun);
+}
+
+TEST(TestTfLiteTensorGetDataStability, CustomDataIsStableUnknown) {
+  TfLiteTensor t;
+  t.allocation_type = kTfLiteCustom;
+  EXPECT_EQ(TfLiteTensorGetDataStability(&t), kTfLiteRunStabilityUnknown);
+}
+
+TEST(TestTfLiteTensorGetDataStability, VariantObjectDataIsStableSingleRun) {
+  TfLiteTensor t;
+  t.allocation_type = kTfLiteVariantObject;
+  EXPECT_EQ(TfLiteTensorGetDataStability(&t), kTfLiteRunStabilitySingleRun);
+}
+
+TEST(TestTfLiteTensorGetDataKnownStep, MemNoneDataIsKnownAtInit) {
+  TfLiteTensor t;
+  t.allocation_type = kTfLiteMemNone;
+  EXPECT_EQ(TfLiteTensorGetDataKnownStep(&t), kTfLiteRunStepInit);
+}
+
+TEST(TestTfLiteTensorGetDataKnownStep, MmapRoDataIsKnownAtInit) {
+  TfLiteTensor t;
+  t.allocation_type = kTfLiteMmapRo;
+  EXPECT_EQ(TfLiteTensorGetDataKnownStep(&t), kTfLiteRunStepInit);
+}
+
+TEST(TestTfLiteTensorGetDataKnownStep, ArenaRwDataIsKnownAtEval) {
+  TfLiteTensor t;
+  t.allocation_type = kTfLiteArenaRw;
+  EXPECT_EQ(TfLiteTensorGetDataKnownStep(&t), kTfLiteRunStepEval);
+}
+
+TEST(TestTfLiteTensorGetDataKnownStep, ArenaRwPersistentDataIsKnownAtEval) {
+  TfLiteTensor t;
+  t.allocation_type = kTfLiteArenaRwPersistent;
+  EXPECT_EQ(TfLiteTensorGetDataKnownStep(&t), kTfLiteRunStepEval);
+}
+
+TEST(TestTfLiteTensorGetDataKnownStep, DynamicDataIsKnownAtEval) {
+  TfLiteTensor t;
+  t.allocation_type = kTfLiteDynamic;
+  EXPECT_EQ(TfLiteTensorGetDataKnownStep(&t), kTfLiteRunStepEval);
+}
+
+TEST(TestTfLiteTensorGetDataKnownStep, PersistentRoDataIsKnownAtPrepare) {
+  TfLiteTensor t;
+  t.allocation_type = kTfLitePersistentRo;
+  EXPECT_EQ(TfLiteTensorGetDataKnownStep(&t), kTfLiteRunStepPrepare);
+}
+
+TEST(TestTfLiteTensorGetDataKnownStep, CustomDataIsKnownAtUnknown) {
+  TfLiteTensor t;
+  t.allocation_type = kTfLiteCustom;
+  EXPECT_EQ(TfLiteTensorGetDataKnownStep(&t), kTfLiteRunStepUnknown);
+}
+
+TEST(TestTfLiteTensorGetDataKnownStep, VariantObjectDataIsKnownAtEval) {
+  TfLiteTensor t;
+  t.allocation_type = kTfLiteVariantObject;
+  EXPECT_EQ(TfLiteTensorGetDataKnownStep(&t), kTfLiteRunStepEval);
+}
+
+TEST(TestTfLiteTensorGetShapeKnownStep, MemNoneShapeIsKnownAtInit) {
+  TfLiteTensor t;
+  t.allocation_type = kTfLiteMemNone;
+  EXPECT_EQ(TfLiteTensorGetShapeKnownStep(&t), kTfLiteRunStepInit);
+}
+
+TEST(TestTfLiteTensorGetShapeKnownStep, MmapRoShapeIsKnownAtInit) {
+  TfLiteTensor t;
+  t.allocation_type = kTfLiteMmapRo;
+  EXPECT_EQ(TfLiteTensorGetShapeKnownStep(&t), kTfLiteRunStepInit);
+}
+
+TEST(TestTfLiteTensorGetShapeKnownStep, ArenaRwShapeIsKnownAtPrepare) {
+  TfLiteTensor t;
+  t.allocation_type = kTfLiteArenaRw;
+  EXPECT_EQ(TfLiteTensorGetShapeKnownStep(&t), kTfLiteRunStepPrepare);
+}
+
+TEST(TestTfLiteTensorGetShapeKnownStep,
+     ArenaRwPersistentShapeIsKnownAtPrepare) {
+  TfLiteTensor t;
+  t.allocation_type = kTfLiteArenaRwPersistent;
+  EXPECT_EQ(TfLiteTensorGetShapeKnownStep(&t), kTfLiteRunStepPrepare);
+}
+
+TEST(TestTfLiteTensorGetShapeKnownStep, DynamicShapeIsKnownAtEval) {
+  TfLiteTensor t;
+  t.allocation_type = kTfLiteDynamic;
+  EXPECT_EQ(TfLiteTensorGetShapeKnownStep(&t), kTfLiteRunStepEval);
+}
+
+TEST(TestTfLiteTensorGetShapeKnownStep, PersistentRoShapeIsKnownAtPrepare) {
+  TfLiteTensor t;
+  t.allocation_type = kTfLitePersistentRo;
+  EXPECT_EQ(TfLiteTensorGetShapeKnownStep(&t), kTfLiteRunStepPrepare);
+}
+
+TEST(TestTfLiteTensorGetShapeKnownStep, CustomShapeIsKnownAtUnknown) {
+  TfLiteTensor t;
+  t.allocation_type = kTfLiteCustom;
+  EXPECT_EQ(TfLiteTensorGetShapeKnownStep(&t), kTfLiteRunStepUnknown);
+}
+
+TEST(TestTfLiteTensorGetShapeKnownStep, VariantObjectShapeIsKnownAtEval) {
+  TfLiteTensor t;
+  t.allocation_type = kTfLiteVariantObject;
+  EXPECT_EQ(TfLiteTensorGetShapeKnownStep(&t), kTfLiteRunStepEval);
+}
+
 struct Foo {
   int data;
   bool copied;

From 1739a1bba88ee6f7774b73c47e0ac28bb8dcd072 Mon Sep 17 00:00:00 2001
From: Yu-Cheng Ling <ycling@google.com>
Date: Thu, 28 Sep 2023 11:42:27 -0700
Subject: [PATCH 389/567] Remove outdated TODOs in Toco.

PiperOrigin-RevId: 569245248
---
 tensorflow/lite/toco/graph_transformations/identify_prelu.cc | 2 --
 tensorflow/lite/toco/graph_transformations/lstm_utils.h      | 1 -
 tensorflow/lite/toco/tflite/export.cc                        | 1 -
 tensorflow/lite/toco/tflite/export.h                         | 3 ---
 tensorflow/lite/toco/tflite/export_test.cc                   | 2 --
 tensorflow/lite/toco/tflite/operator.cc                      | 5 -----
 tensorflow/lite/toco/tflite/operator.h                       | 2 --
 tensorflow/lite/toco/tflite/types.cc                         | 1 -
 tensorflow/lite/toco/toco_cmdline_flags.cc                   | 2 --
 tensorflow/lite/toco/toco_flags.proto                        | 2 --
 10 files changed, 21 deletions(-)

diff --git a/tensorflow/lite/toco/graph_transformations/identify_prelu.cc b/tensorflow/lite/toco/graph_transformations/identify_prelu.cc
index e883754dfafbd9..39e2210a801f77 100644
--- a/tensorflow/lite/toco/graph_transformations/identify_prelu.cc
+++ b/tensorflow/lite/toco/graph_transformations/identify_prelu.cc
@@ -62,8 +62,6 @@ ::tensorflow::Status IdentifyPRelu::Run(Model* model, std::size_t op_index,
     return ::tensorflow::OkStatus();
   }
 
-  // TODO(ycling): Both Add and Mul are commutative. Support the case where
-  // the position of operands are exchanged.
   const auto* mul_op = GetOpWithOutput(*model, add_op->inputs[1]);
   if (mul_op == nullptr || mul_op->type != OperatorType::kMul ||
       mul_op->inputs.size() != 2 ||
diff --git a/tensorflow/lite/toco/graph_transformations/lstm_utils.h b/tensorflow/lite/toco/graph_transformations/lstm_utils.h
index 102fe7d6cfc699..b5b8dcab568af1 100644
--- a/tensorflow/lite/toco/graph_transformations/lstm_utils.h
+++ b/tensorflow/lite/toco/graph_transformations/lstm_utils.h
@@ -54,7 +54,6 @@ enum ExtendedLstmCellInputs {
 };
 
 enum ExtendedLstmCellOutputs {
-  // TODO(ycling): Make the 2 output state tensors optional.
   kOutputStateTensor = 0,
   kCellStateTensor = 1,
   kOutputTensor = 2,
diff --git a/tensorflow/lite/toco/tflite/export.cc b/tensorflow/lite/toco/tflite/export.cc
index 1bde1df6cf9e04..9c29f700ba4594 100644
--- a/tensorflow/lite/toco/tflite/export.cc
+++ b/tensorflow/lite/toco/tflite/export.cc
@@ -66,7 +66,6 @@ bool IsControlFlowOp(const std::string& tensorflow_op) {
       tensorflow_op == "NextIteration" || tensorflow_op == "RefNextIteration") {
     return true;
   }
-  // TODO(ycling): Also check how to handle Variable ops and Assign ops.
   return false;
 }
 
diff --git a/tensorflow/lite/toco/tflite/export.h b/tensorflow/lite/toco/tflite/export.h
index 12d7c0b8c9368e..38ea2a925e1ebd 100644
--- a/tensorflow/lite/toco/tflite/export.h
+++ b/tensorflow/lite/toco/tflite/export.h
@@ -49,7 +49,6 @@ tensorflow::Status Export(
     const std::map<OperatorType, std::unique_ptr<BaseOperator>>& ops_by_type);
 
 // This is for backward-compatibility.
-// TODO(ycling): Remove the deprecated entry functions.
 inline void Export(const Model& model, bool allow_custom_ops,
                    bool quantize_weights, std::string* output_file_contents) {
   ExportParams params;
@@ -61,7 +60,6 @@ inline void Export(const Model& model, bool allow_custom_ops,
 }
 
 // This is for backward-compatibility.
-// TODO(ycling): Remove the deprecated entry functions.
 inline void Export(
     const Model& model, bool allow_custom_ops, bool quantize_weights,
     std::string* output_file_contents,
@@ -75,7 +73,6 @@ inline void Export(
 }
 
 // This is for backward-compatibility.
-// TODO(ycling): Remove the deprecated entry functions.
 inline void Export(const Model& model, std::string* output_file_contents) {
   ExportParams params;
   params.allow_custom_ops = true;
diff --git a/tensorflow/lite/toco/tflite/export_test.cc b/tensorflow/lite/toco/tflite/export_test.cc
index bf88b8e3babe40..7659ed7c1e237e 100644
--- a/tensorflow/lite/toco/tflite/export_test.cc
+++ b/tensorflow/lite/toco/tflite/export_test.cc
@@ -803,8 +803,6 @@ TEST(OperatorKeyTest, TestFlexWithUnsupportedOp) {
 TEST(OperatorKeyTest, TestFlexWithPartiallySupportedOps) {
   // Test Toco-supported/TFLite-unsupported operators.
   Model model;
-  // TODO(ycling): The test will be broken if TensorFlowAssert is implemented in
-  // TFLite. Find a more robust way to test the fallback logic.
   auto op = std::make_unique<TensorFlowAssertOperator>();
 
   const auto ops_by_type = BuildOperatorByTypeMap();
diff --git a/tensorflow/lite/toco/tflite/operator.cc b/tensorflow/lite/toco/tflite/operator.cc
index 32640722d49252..6e583a6ee48796 100644
--- a/tensorflow/lite/toco/tflite/operator.cc
+++ b/tensorflow/lite/toco/tflite/operator.cc
@@ -24,7 +24,6 @@ limitations under the License.
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/framework/op_def.pb.h"
 
-// TODO(ycling): Consider refactoring to extract the LSTM definition out of
 // graph_transformation module.
 #include "tensorflow/lite/builtin_op_data.h"
 #include "tensorflow/lite/delegates/flex/allowlisted_flex_ops.h"
@@ -1619,8 +1618,6 @@ class TensorFlowUnsupported : public BaseOperator {
       const BuiltinOptions* builtin_options,
       const CustomOptions* custom_options) const override {
     // Deserializing Flex ops doesn't work now.
-    // TODO(ycling): Revisit and decide if we should fix the flow for importing
-    // TFLite models with Flex ops.
     auto op = std::make_unique<TensorFlowUnsupportedOperator>();
     if (custom_options) {
       auto flexbuffer_map =
@@ -1781,8 +1778,6 @@ class TensorFlowUnsupported : public BaseOperator {
   }
 
   int GetVersion(const OperatorSignature& op_signature) const override {
-    // TODO(ycling): Design and implement a way to plumb the version of
-    // custom ops.
     return 1;
   }
 
diff --git a/tensorflow/lite/toco/tflite/operator.h b/tensorflow/lite/toco/tflite/operator.h
index b2aae08fc0761b..cc1b38e5cf82af 100644
--- a/tensorflow/lite/toco/tflite/operator.h
+++ b/tensorflow/lite/toco/tflite/operator.h
@@ -30,8 +30,6 @@ namespace tflite {
 class BaseOperator;
 
 // Return a map contained all know TF Lite Operators, keyed by their names.
-// TODO(ycling): The pattern to propagate parameters (e.g. enable_select_tf_ops)
-// is ugly here. Consider refactoring.
 std::map<std::string, std::unique_ptr<BaseOperator>> BuildOperatorByNameMap(
     bool enable_select_tf_ops = false);
 
diff --git a/tensorflow/lite/toco/tflite/types.cc b/tensorflow/lite/toco/tflite/types.cc
index e680502ef1323d..659b42bf0537b9 100644
--- a/tensorflow/lite/toco/tflite/types.cc
+++ b/tensorflow/lite/toco/tflite/types.cc
@@ -111,7 +111,6 @@ ::tflite::TensorType DataType::Serialize(ArrayDataType array_data_type) {
       return ::tflite::TensorType_COMPLEX64;
     default:
       // FLOAT32 is filled for unknown data types.
-      // TODO(ycling): Implement type inference in TF Lite interpreter.
       return ::tflite::TensorType_FLOAT32;
   }
 }
diff --git a/tensorflow/lite/toco/toco_cmdline_flags.cc b/tensorflow/lite/toco/toco_cmdline_flags.cc
index 079d55c912b4da..505b9ec6301ba0 100644
--- a/tensorflow/lite/toco/toco_cmdline_flags.cc
+++ b/tensorflow/lite/toco/toco_cmdline_flags.cc
@@ -304,8 +304,6 @@ void ReadTocoFlagsFromCommandLineFlags(const ParsedTocoFlags& parsed_toco_flags,
 
   if (parsed_toco_flags.force_select_tf_ops.value() &&
       !parsed_toco_flags.enable_select_tf_ops.value()) {
-    // TODO(ycling): Consider to enforce `enable_select_tf_ops` when
-    // `force_select_tf_ops` is true.
     LOG(WARNING) << "--force_select_tf_ops should always be used with "
                     "--enable_select_tf_ops.";
   }
diff --git a/tensorflow/lite/toco/toco_flags.proto b/tensorflow/lite/toco/toco_flags.proto
index f9a6ad8349cd58..1006a94f11b969 100644
--- a/tensorflow/lite/toco/toco_flags.proto
+++ b/tensorflow/lite/toco/toco_flags.proto
@@ -195,8 +195,6 @@ message TocoFlags {
 
   // This flag only works when converting to TensorFlow Lite format.
   // When enabled, unsupported ops will be converted to select TensorFlow ops.
-  // TODO(ycling): Consider to rename the following 2 flags and don't call it
-  // "Flex".
   // `enable_select_tf_ops` should always be used with `allow_custom_ops`.
   // WARNING: Experimental interface, subject to change
   optional bool enable_select_tf_ops = 27 [default = false];

From 8e8967a83ff910253e5f2880385c010a89ab09f8 Mon Sep 17 00:00:00 2001
From: Ilia Sergachev <sergachev@google.com>
Date: Thu, 28 Sep 2023 12:06:12 -0700
Subject: [PATCH 390/567] [XLA:GPU] Let Triton GEMM fusions reevaluate
 operations adding more parameters.

Other operations in the fusion queue may reduce the number of parameters per fusion which allows adding more parameters again.

PiperOrigin-RevId: 569252026
---
 .../xla/service/gpu/gemm_rewriter_triton.cc   | 11 ++++--
 .../service/gpu/gemm_rewriter_triton_test.cc  | 34 +++++++++++++++++++
 2 files changed, 43 insertions(+), 2 deletions(-)

diff --git a/third_party/xla/xla/service/gpu/gemm_rewriter_triton.cc b/third_party/xla/xla/service/gpu/gemm_rewriter_triton.cc
index 5ec96be47f72f6..df712e823bb39d 100644
--- a/third_party/xla/xla/service/gpu/gemm_rewriter_triton.cc
+++ b/third_party/xla/xla/service/gpu/gemm_rewriter_triton.cc
@@ -1238,14 +1238,21 @@ void FusionContext::TryToFuseWithInputsRecursively(
   absl::flat_hash_set<const HloInstruction*> enqueued;
   std::queue<HloInstruction*> to_visit;
   to_visit.push(&root);
-  while (!to_visit.empty()) {
+  int num_requeued = 0;
+  while (to_visit.size() > num_requeued) {
     HloInstruction* hlo = to_visit.front();
     to_visit.pop();
-    // Limit the total number of fusion parameters.
+    // At the parameter limit, defer instructions that would add parameters.
     if (inputs.size() >= TritonFusionAnalysis::kMaxParameterPerScope &&
         NumAddedParameters(*hlo) > 0) {
+      // Re-queue: the number of parameters may go down when other instructions
+      // are processed.
+      to_visit.push(hlo);
+      // Prevent infinite loops: stop once every queued item was re-queued.
+      ++num_requeued;
       continue;
     }
+    num_requeued = 0;  // Not re-queued: restart the no-progress count.
     const DimOrderUpdatesOrError result = AnalyzeForFusion(
         *hlo, /*as_input=*/true, old_to_new_mapping, gpu_version);
     if (!std::holds_alternative<DimOrderUpdates>(result) ||
diff --git a/third_party/xla/xla/service/gpu/gemm_rewriter_triton_test.cc b/third_party/xla/xla/service/gpu/gemm_rewriter_triton_test.cc
index 64c252a8997b1b..05ff127675788c 100644
--- a/third_party/xla/xla/service/gpu/gemm_rewriter_triton_test.cc
+++ b/third_party/xla/xla/service/gpu/gemm_rewriter_triton_test.cc
@@ -1724,6 +1724,40 @@ ENTRY e {
             TritonFusionAnalysis::kMaxParameterPerScope * 2);
 }
 
+TEST_F(GemmRewriterTritonLevel2Test,
+       OperationsAddingMoreParametersGetMultipleTries) {
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<VerifiedHloModule> module,
+                          ParseAndReturnVerifiedModule(R"(
+e {
+  p0 = f32[2,2] parameter(0)
+  c0 = f32[] constant(12345)
+  b0 = f32[2,2] broadcast(c0), dimensions={}
+  m0 = f32[2,2] multiply(p0, b0)
+  c1 = f32[] constant(34567)
+  b1 = f32[2,2] broadcast(c1), dimensions={}
+  a0 = f32[2,2] add(m0, b1)
+  b3 = f32[2,2,2] broadcast(a0), dimensions={0,1}
+  p2 = f32[2,2,2] parameter(2)
+  m2 = f32[2,2,2] multiply(p2, b3)
+  p1 = f32[2]{0} parameter(1)
+  c2 = f32[] constant(5678)
+  b2 = f32[2] broadcast(c2), dimensions={}
+  a1 = f32[2]{0} add(p1, b2)
+  b4 = f32[2,2,2] broadcast(a1), dimensions={2}
+  m1 = f32[2,2,2] multiply(m2, b4)
+  b = f32[4,2] bitcast(m1)
+  p3 = f16[2,2] parameter(3)
+  p3c = f32[2,2] convert(p3)
+  ROOT r = f32[4,2] dot(b, p3c),
+    lhs_contracting_dims={1}, rhs_contracting_dims={0}
+})"));
+
+  EXPECT_TRUE(GemmRewriterTriton(gpu_version_).Run(module.get()).value());
+  EXPECT_THAT(module->entry_computation()->root_instruction(),
+              GmockMatch((m::Fusion(m::Parameter(), m::Parameter(),
+                                    m::Parameter(), m::Parameter()))));
+}
+
 TEST_F(GemmRewriterTritonLevel2Test, FusionLevelIsLimitedOnVolta) {
   TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<VerifiedHloModule> module,
                           ParseAndReturnVerifiedModule(R"(

From 9db5cf6d77fbfb6bacefcd2d861d060b868f63d0 Mon Sep 17 00:00:00 2001
From: Dmitri Gribenko <dmitrig@google.com>
Date: Thu, 28 Sep 2023 12:41:51 -0700
Subject: [PATCH 391/567] Replace the deprecated TensorFlow's `gtl::optional`
 alias with its target `std::optional`

PiperOrigin-RevId: 569261141
---
 tensorflow/core/common_runtime/eager/attr_builder.h     | 1 -
 tensorflow/core/data/captured_function.cc               | 4 ++--
 tensorflow/core/kernels/deserialize_sparse_string_op.cc | 4 ++--
 tensorflow/core/kernels/map_stage_op.cc                 | 4 ++--
 tensorflow/core/kernels/serialize_sparse_op.cc          | 1 -
 5 files changed, 6 insertions(+), 8 deletions(-)

diff --git a/tensorflow/core/common_runtime/eager/attr_builder.h b/tensorflow/core/common_runtime/eager/attr_builder.h
index 2f05463facf900..6fc817039cc214 100644
--- a/tensorflow/core/common_runtime/eager/attr_builder.h
+++ b/tensorflow/core/common_runtime/eager/attr_builder.h
@@ -31,7 +31,6 @@ limitations under the License.
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/gtl/inlined_vector.h"
-#include "tensorflow/core/lib/gtl/optional.h"
 #include "tensorflow/core/platform/fingerprint.h"
 #include "tensorflow/core/util/tensor_slice_reader_cache.h"
 
diff --git a/tensorflow/core/data/captured_function.cc b/tensorflow/core/data/captured_function.cc
index 7387559ce3161e..8336551138806e 100644
--- a/tensorflow/core/data/captured_function.cc
+++ b/tensorflow/core/data/captured_function.cc
@@ -17,6 +17,7 @@ limitations under the License.
 #include <functional>
 #include <map>
 #include <memory>
+#include <optional>
 #include <string>
 #include <unordered_map>
 #include <utility>
@@ -34,7 +35,6 @@ limitations under the License.
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/stats_aggregator.h"
 #include "tensorflow/core/lib/core/errors.h"
-#include "tensorflow/core/lib/gtl/optional.h"
 #include "tensorflow/core/lib/random/random.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/errors.h"
@@ -300,7 +300,7 @@ class CallFrameBase : public CallFrameInterface {
 
  private:
   DataTypeSlice ret_types_;
-  std::vector<gtl::optional<Tensor>> retvals_;
+  std::vector<std::optional<Tensor>> retvals_;
   TF_DISALLOW_COPY_AND_ASSIGN(CallFrameBase);
 };
 
diff --git a/tensorflow/core/kernels/deserialize_sparse_string_op.cc b/tensorflow/core/kernels/deserialize_sparse_string_op.cc
index 6d4298d052c77e..b02308b827b8f1 100644
--- a/tensorflow/core/kernels/deserialize_sparse_string_op.cc
+++ b/tensorflow/core/kernels/deserialize_sparse_string_op.cc
@@ -17,6 +17,7 @@ limitations under the License.
 
 #include <algorithm>
 #include <numeric>
+#include <optional>
 #include <utility>
 #include <vector>
 
@@ -30,7 +31,6 @@ limitations under the License.
 #include "tensorflow/core/framework/variant_encode_decode.h"
 #include "tensorflow/core/kernels/reshape_util.h"
 #include "tensorflow/core/lib/gtl/inlined_vector.h"
-#include "tensorflow/core/lib/gtl/optional.h"
 #include "tensorflow/core/util/sparse/sparse_tensor.h"
 
 namespace tensorflow {
@@ -173,7 +173,7 @@ class DeserializeSparseOp : public OpKernel {
       tensors.push_back(std::move(tensor));
     }
 
-    gtl::optional<SparseTensor> maybe_output;
+    std::optional<SparseTensor> maybe_output;
 #define HANDLE_TYPE(T)                               \
   case DataTypeToEnum<T>::value: {                   \
     maybe_output = SparseTensor::Concat<T>(tensors); \
diff --git a/tensorflow/core/kernels/map_stage_op.cc b/tensorflow/core/kernels/map_stage_op.cc
index 806f559f86e5b2..d60d3435ff5294 100644
--- a/tensorflow/core/kernels/map_stage_op.cc
+++ b/tensorflow/core/kernels/map_stage_op.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #include <map>
 #include <mutex>
 #include <numeric>
+#include <optional>
 #include <unordered_map>
 #include <vector>
 
@@ -25,7 +26,6 @@ limitations under the License.
 #include "tensorflow/core/framework/resource_mgr.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
-#include "tensorflow/core/lib/gtl/optional.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/mutex.h"
@@ -84,7 +84,7 @@ class StagingMap : public ResourceBase {
  public:
   // Public typedefs
   using Tuple = std::vector<Tensor>;
-  using OptionalTensor = gtl::optional<Tensor>;
+  using OptionalTensor = std::optional<Tensor>;
   using OptionalTuple = std::vector<OptionalTensor>;
 
   using MapType = typename MapTraits<Ordered, OptionalTuple>::MapType;
diff --git a/tensorflow/core/kernels/serialize_sparse_op.cc b/tensorflow/core/kernels/serialize_sparse_op.cc
index a6712e93e5bbdf..3c077a6cf895c8 100644
--- a/tensorflow/core/kernels/serialize_sparse_op.cc
+++ b/tensorflow/core/kernels/serialize_sparse_op.cc
@@ -34,7 +34,6 @@ limitations under the License.
 #include "tensorflow/core/framework/variant_encode_decode.h"
 #include "tensorflow/core/kernels/reshape_util.h"
 #include "tensorflow/core/lib/gtl/inlined_vector.h"
-#include "tensorflow/core/lib/gtl/optional.h"
 #include "tensorflow/core/util/sparse/group_iterator.h"
 #include "tensorflow/core/util/sparse/sparse_tensor.h"
 

From b8d797390486f4bb14dd72887df4f380f08d479b Mon Sep 17 00:00:00 2001
From: Kanvi Khanna <kanvi.khanna@intel.com>
Date: Thu, 28 Sep 2023 12:55:43 -0700
Subject: [PATCH 392/567] Check for bf16 support

---
 .../core/common_runtime/mkl_layout_pass.cc    | 12 +++
 .../grappler/optimizers/mkl_remapper_test.cc  | 35 +++++++-
 .../core/grappler/optimizers/remapper.cc      | 16 +++-
 .../core/grappler/optimizers/remapper_test.cc | 88 ++++++++++++-------
 4 files changed, 118 insertions(+), 33 deletions(-)

diff --git a/tensorflow/core/common_runtime/mkl_layout_pass.cc b/tensorflow/core/common_runtime/mkl_layout_pass.cc
index b02e487756c937..ca7ca96aa1164b 100644
--- a/tensorflow/core/common_runtime/mkl_layout_pass.cc
+++ b/tensorflow/core/common_runtime/mkl_layout_pass.cc
@@ -1070,6 +1070,18 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
       reason = "User has assigned a device that is not CPU.";
     }
 
+    const string& type_attr = "T";
+    if (HasNodeAttr(n->def(), type_attr)) {
+      const auto& attr = n->def().attr().at(type_attr);
+      DataType dtype = attr.type();
+      if (dtype == DT_BFLOAT16 &&
+          !mkl_op_registry::IsBF16SupportedByOneDNNOnThisCPU()) {
+        mkl_op_registry::BF16UnsupportedWarning();
+        result = false;
+        reason = "Intel oneDNN with bfloat16 support is not enabled.";
+      }
+    }
+
     if (result == false) {
       VLOG(1) << "MklLayoutRewritePass: Skipping rewriting of the node "
               << n->type_string() << ", reason: " << reason;
diff --git a/tensorflow/core/grappler/optimizers/mkl_remapper_test.cc b/tensorflow/core/grappler/optimizers/mkl_remapper_test.cc
index 894ea55fb45b87..ed60e93bfd6a16 100644
--- a/tensorflow/core/grappler/optimizers/mkl_remapper_test.cc
+++ b/tensorflow/core/grappler/optimizers/mkl_remapper_test.cc
@@ -17,6 +17,7 @@ limitations under the License.
 #include "tensorflow/cc/ops/nn_ops_internal.h"
 #include "tensorflow/cc/ops/standard_ops.h"
 #include "tensorflow/core/framework/tensor_testutil.h"
+#include "tensorflow/core/graph/mkl_graph_util.h"
 #include "tensorflow/core/grappler/devices.h"
 #include "tensorflow/core/grappler/grappler_item.h"
 #include "tensorflow/core/grappler/optimizers/remapper.h"
@@ -758,12 +759,20 @@ TEST_F(FusedMatMulBiasAddAndGeluTest, Float32GeluExact) {
   RunTest<DT_FLOAT, false>();
 }
 TEST_F(FusedMatMulBiasAddAndGeluTest, BFloat16GeluExact) {
+  if (!mkl_op_registry::IsBF16SupportedByOneDNNOnThisCPU())
+    GTEST_SKIP()
+        << "Intel oneDNN with bfloat16 support is not enabled, skipping "
+           "FusedMatMulBiasAddAndGelu with bfloat16.";
   RunTest<DT_BFLOAT16, false>();
 }
 TEST_F(FusedMatMulBiasAddAndGeluTest, Float32GeluExact2) {
   RunTest<DT_FLOAT, true>();
 }
 TEST_F(FusedMatMulBiasAddAndGeluTest, BFloat16GeluExact2) {
+  if (!mkl_op_registry::IsBF16SupportedByOneDNNOnThisCPU())
+    GTEST_SKIP()
+        << "Intel oneDNN with bfloat16 support is not enabled, skipping "
+           "FusedMatMulBiasAddAndGelu with bfloat16.";
   RunTest<DT_BFLOAT16, true>();
 }
 
@@ -771,6 +780,11 @@ class MklFusedBatchMatMul : public MklRemapperTest {
  public:
   template <typename T>
   void VerifyFused(bool adjx, bool adjy) {
+    if (DataTypeToEnum<T>::v() == DT_BFLOAT16 &&
+        !mkl_op_registry::IsBF16SupportedByOneDNNOnThisCPU())
+      GTEST_SKIP()
+          << "Intel oneDNN with bfloat16 support is not enabled, skipping "
+             "MklFusedBatchMatMul with bfloat16.";
     using ::tensorflow::ops::Placeholder;
     using normal_generator = Eigen::internal::NormalRandomGenerator<T>;
 
@@ -862,6 +876,11 @@ class MklFusedBatchMatMul : public MklRemapperTest {
 
   template <typename T>
   void VerifyPreceedingScalarMul(bool adjx, bool adjy) {
+    if (DataTypeToEnum<T>::v() == DT_BFLOAT16 &&
+        !mkl_op_registry::IsBF16SupportedByOneDNNOnThisCPU())
+      GTEST_SKIP()
+          << "Intel oneDNN with bfloat16 support is not enabled, skipping "
+             "MklFusedBatchMatMul with bfloat16.";
     using ::tensorflow::ops::Placeholder;
     using normal_generator = Eigen::internal::NormalRandomGenerator<T>;
 
@@ -1055,7 +1074,13 @@ class MklRemapperSwishTest : public GrapplerTest {
 };
 
 TEST_F(MklRemapperSwishTest, F32) { RunTest<DT_FLOAT>(); }
-TEST_F(MklRemapperSwishTest, BF16) { RunTest<DT_BFLOAT16>(); }
+TEST_F(MklRemapperSwishTest, BF16) {
+  if (!mkl_op_registry::IsBF16SupportedByOneDNNOnThisCPU())
+    GTEST_SKIP()
+        << "Intel oneDNN with bfloat16 support is not enabled, skipping "
+           "MklRemapperSwish with bfloat16.";
+  RunTest<DT_BFLOAT16>();
+}
 
 class MklRemapperConv2dBiasAddSwishTest : public GrapplerTest {
  protected:
@@ -1135,7 +1160,13 @@ class MklRemapperConv2dBiasAddSwishTest : public GrapplerTest {
 };
 
 TEST_F(MklRemapperConv2dBiasAddSwishTest, F32) { RunTest<DT_FLOAT>(); }
-TEST_F(MklRemapperConv2dBiasAddSwishTest, BF16) { RunTest<DT_BFLOAT16>(); }
+TEST_F(MklRemapperConv2dBiasAddSwishTest, BF16) {
+  if (!mkl_op_registry::IsBF16SupportedByOneDNNOnThisCPU())
+    GTEST_SKIP()
+        << "Intel oneDNN with bfloat16 support is not enabled, skipping "
+           "MklRemapperConv2dBiasAddSwish with bfloat16.";
+  RunTest<DT_BFLOAT16>();
+}
 
 class MklRemapperConv2dFusedBatchNormSwishTest : public GrapplerTest {
  protected:
diff --git a/tensorflow/core/grappler/optimizers/remapper.cc b/tensorflow/core/grappler/optimizers/remapper.cc
index 949368cda95e6f..128e86d423f2cd 100644
--- a/tensorflow/core/grappler/optimizers/remapper.cc
+++ b/tensorflow/core/grappler/optimizers/remapper.cc
@@ -27,6 +27,7 @@ limitations under the License.
 #include "absl/container/flat_hash_set.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/versions.pb.h"
+#include "tensorflow/core/graph/mkl_graph_util.h"
 #include "tensorflow/core/grappler/costs/graph_properties.h"
 #include "tensorflow/core/grappler/graph_view.h"
 #include "tensorflow/core/grappler/grappler_item.h"
@@ -340,7 +341,9 @@ bool IsCpuCompatibleDataType(const NodeDef* contraction,
     return (IsConv2D(*contraction) || IsDepthwiseConv2dNative(*contraction) ||
             IsConv3D(*contraction) || IsAnyBatchMatMul(*contraction) ||
             is_supported_matmul) &&
-           (dtype == DT_FLOAT || dtype == DT_BFLOAT16);
+           (dtype == DT_FLOAT ||
+            (dtype == DT_BFLOAT16 &&
+             mkl_op_registry::IsBF16SupportedByOneDNNOnThisCPU()));
   }
   if (IsConv2D(*contraction)) {
     return dtype == DT_FLOAT || dtype == DT_DOUBLE;
@@ -4650,6 +4653,17 @@ Status Remapper::Optimize(Cluster* cluster, const GrapplerItem& item,
     }
 
     if (IsMKLEnabled()) {
+      // Check if BF16 datatype is supported by oneDNN on this CPU,
+      // skipping fusions if it is not supported.
+      const auto* node_view = ctx.graph_view.GetNode(i);
+      const auto* node_def = node_view->node();
+      const string& type_attr = "T";
+      DataType dtype = GetDataTypeFromAttr(*node_def, type_attr);
+      if (dtype == DT_BFLOAT16 &&
+          !mkl_op_registry::IsBF16SupportedByOneDNNOnThisCPU()) {
+        continue;
+      }
+
       // Remap Conv2D+BiasAdd+Add+relu into the _FusedConv2D.
       // or Remap Conv3D+BiasAdd+Add+relu into _FusedConv3D
       if (FindContractionWithBiasAndAddActivation(
diff --git a/tensorflow/core/grappler/optimizers/remapper_test.cc b/tensorflow/core/grappler/optimizers/remapper_test.cc
index d0edf1ace3e783..c0ac573cf58423 100644
--- a/tensorflow/core/grappler/optimizers/remapper_test.cc
+++ b/tensorflow/core/grappler/optimizers/remapper_test.cc
@@ -20,6 +20,7 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor_testutil.h"
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/framework/types.pb.h"
+#include "tensorflow/core/graph/mkl_graph_util.h"
 #include "tensorflow/core/grappler/devices.h"
 #include "tensorflow/core/grappler/grappler_item.h"
 #include "tensorflow/core/grappler/utils/grappler_test.h"
@@ -721,15 +722,17 @@ class RemapperFuseConvWithBias : public RemapperTest {
 TEST_F(RemapperFuseConvWithBias, Conv2D_F32) { RunTest<2, DT_FLOAT>(); }
 TEST_F(RemapperFuseConvWithBias, Conv3D_F32) { RunTest<3, DT_FLOAT>(); }
 TEST_F(RemapperFuseConvWithBias, Conv2D_BF16) {
-  if (!IsMKLEnabled())
-    GTEST_SKIP() << "Intel MKL with bfloat16 support is not enabled, skipping "
-                    "FuseConv2DWithBias with bfloat16.";
+  if (!IsMKLEnabled() || !mkl_op_registry::IsBF16SupportedByOneDNNOnThisCPU())
+    GTEST_SKIP()
+        << "Intel oneDNN with bfloat16 support is not enabled, skipping "
+           "FuseConv2DWithBias with bfloat16.";
   RunTest<2, DT_BFLOAT16>();
 }
 TEST_F(RemapperFuseConvWithBias, Conv3D_BF16) {
-  if (!IsMKLEnabled())
-    GTEST_SKIP() << "Intel MKL with bfloat16 support is not enabled, skipping "
-                    "FuseConv3DWithBias with bfloat16.";
+  if (!IsMKLEnabled() || !mkl_op_registry::IsBF16SupportedByOneDNNOnThisCPU())
+    GTEST_SKIP()
+        << "Intel oneDNN with bfloat16 support is not enabled, skipping "
+           "FuseConv3DWithBias with bfloat16.";
   RunTest<3, DT_BFLOAT16>();
 }
 
@@ -876,15 +879,17 @@ TEST_F(RemapperFuseConvWithBiasAndActivation, Conv3D_F32) {
   RunTest<3, DT_FLOAT>();
 }
 TEST_F(RemapperFuseConvWithBiasAndActivation, Conv2D_BF16) {
-  if (!IsMKLEnabled())
-    GTEST_SKIP() << "Intel MKL with bfloat16 support is not enabled, skipping "
-                    "FuseConv2DWithBiasAndActivation with bfloat16.";
+  if (!IsMKLEnabled() || !mkl_op_registry::IsBF16SupportedByOneDNNOnThisCPU())
+    GTEST_SKIP()
+        << "Intel oneDNN with bfloat16 support is not enabled, skipping "
+           "FuseConv2DWithBiasAndActivation with bfloat16.";
   RunTest<2, DT_BFLOAT16>();
 }
 TEST_F(RemapperFuseConvWithBiasAndActivation, Conv3D_BF16) {
-  if (!IsMKLEnabled())
-    GTEST_SKIP() << "Intel MKL with bfloat16 support is not enabled, skipping "
-                    "FuseConv3DWithBiasAndActivation with bfloat16.";
+  if (!IsMKLEnabled() || !mkl_op_registry::IsBF16SupportedByOneDNNOnThisCPU())
+    GTEST_SKIP()
+        << "Intel oneDNN with bfloat16 support is not enabled, skipping "
+           "FuseConv3DWithBiasAndActivation with bfloat16.";
   RunTest<3, DT_BFLOAT16>();
 }
 
@@ -1000,15 +1005,17 @@ TEST_F(RemapperFuseConvWithSqueezeAndBias, Conv3D_FP32) {
   RunTest<3, DT_FLOAT>();
 }
 TEST_F(RemapperFuseConvWithSqueezeAndBias, Conv2D_BF16) {
-  if (!IsMKLEnabled())
-    GTEST_SKIP() << "Intel MKL with bfloat16 support is not enabled, skipping "
-                    "FuseConvWithSqueezeAndBias with bfloat16.";
+  if (!IsMKLEnabled() || !mkl_op_registry::IsBF16SupportedByOneDNNOnThisCPU())
+    GTEST_SKIP()
+        << "Intel oneDNN with bfloat16 support is not enabled, skipping "
+           "FuseConvWithSqueezeAndBias with bfloat16.";
   RunTest<2, DT_BFLOAT16>();
 }
 TEST_F(RemapperFuseConvWithSqueezeAndBias, Conv3D_BF16) {
-  if (!IsMKLEnabled())
-    GTEST_SKIP() << "Intel MKL with bfloat16 support is not enabled, skipping "
-                    "FuseConvWithSqueezeAndBias with bfloat16.";
+  if (!IsMKLEnabled() || !mkl_op_registry::IsBF16SupportedByOneDNNOnThisCPU())
+    GTEST_SKIP()
+        << "Intel oneDNN with bfloat16 support is not enabled, skipping "
+           "FuseConvWithSqueezeAndBias with bfloat16.";
   RunTest<3, DT_BFLOAT16>();
 }
 
@@ -1400,7 +1407,8 @@ TEST_F(RemapperFuseSoftplusTanhMul, FP32) {
   RunTest<DT_FLOAT>();
 }
 TEST_F(RemapperFuseSoftplusTanhMul, BF16) {
-  if (!IsMKLEnabled()) GTEST_SKIP() << "Test only applicable to MKL.";
+  if (!IsMKLEnabled() || !mkl_op_registry::IsBF16SupportedByOneDNNOnThisCPU())
+    GTEST_SKIP() << "Test only applicable to oneDNN with bfloat16 support.";
   RunTest<DT_BFLOAT16>();
 }
 #endif
@@ -1695,8 +1703,10 @@ TEST_F(RemapperFuseMatMulWithBiasTest, F32) { RunTest<DT_FLOAT>(); }
 
 TEST_F(RemapperFuseMatMulWithBiasTest, Bf16) {
 #if !defined(ENABLE_MKL)
-  GTEST_SKIP() << "Intel MKL with bfloat16 support is not enabled, skipping "
-                  "FuseMatMulWithBias with bfloat16.";
+  if (!mkl_op_registry::IsBF16SupportedByOneDNNOnThisCPU())
+    GTEST_SKIP()
+        << "Intel oneDNN with bfloat16 support is not enabled, skipping "
+           "FuseMatMulWithBias with bfloat16.";
 #endif
   RunTest<DT_BFLOAT16>();  // NOLINT
 }
@@ -1905,8 +1915,10 @@ TEST_F(RemapperFuseMatMulWithBiasAndActivationTest, F32) {
 
 TEST_F(RemapperFuseMatMulWithBiasAndActivationTest, Bf16) {
 #if !defined(ENABLE_MKL)
-  GTEST_SKIP() << "Intel MKL with bfloat16 support is not enabled, skipping "
-                  "FuseMatMulWithBiasAndActivation with bfloat16.";
+  if (!mkl_op_registry::IsBF16SupportedByOneDNNOnThisCPU())
+    GTEST_SKIP()
+        << "Intel oneDNN with bfloat16 support is not enabled, skipping "
+           "FuseMatMulWithBiasAndActivation with bfloat16.";
 #endif
   RunTest<DT_BFLOAT16>();  // NOLINT
 }
@@ -2503,9 +2515,10 @@ TEST_F(RemapperFusePadConv3D, Conv3D_FP32) {
   RunTest<DT_FLOAT>();
 }
 TEST_F(RemapperFusePadConv3D, Conv3D_BF16) {
-  if (!IsMKLEnabled())
-    GTEST_SKIP() << "Intel MKL with bfloat16 support is not enabled, skipping "
-                    "RemapperFusePadConv3D with bfloat16.";
+  if (!IsMKLEnabled() || !mkl_op_registry::IsBF16SupportedByOneDNNOnThisCPU())
+    GTEST_SKIP()
+        << "Intel oneDNN with bfloat16 support is not enabled, skipping "
+           "RemapperFusePadConv3D with bfloat16.";
   RunTest<DT_BFLOAT16>();
 }
 
@@ -2627,9 +2640,10 @@ TEST_F(RemapperFusePadWithFusedConv3D, FusedConv3D_FP32) {
   RunTest<DT_FLOAT>();
 }
 TEST_F(RemapperFusePadWithFusedConv3D, FusedConv3D_BF16) {
-  if (!IsMKLEnabled())
-    GTEST_SKIP() << "Intel MKL with bfloat16 support is not enabled, skipping "
-                    "RemapperFusePadWithFusedConv3D with bfloat16.";
+  if (!IsMKLEnabled() || !mkl_op_registry::IsBF16SupportedByOneDNNOnThisCPU())
+    GTEST_SKIP()
+        << "Intel oneDNN with bfloat16 support is not enabled, skipping "
+           "RemapperFusePadWithFusedConv3D with bfloat16.";
   RunTest<DT_BFLOAT16>();
 }
 #endif
@@ -2696,7 +2710,13 @@ class RemapperLeakyReluTest : public GrapplerTest {
 };
 
 TEST_F(RemapperLeakyReluTest, F32) { RunTest<DT_FLOAT>(); }
-TEST_F(RemapperLeakyReluTest, BF16) { RunTest<DT_BFLOAT16>(); }
+TEST_F(RemapperLeakyReluTest, BF16) {
+  if (!IsMKLEnabled() || !mkl_op_registry::IsBF16SupportedByOneDNNOnThisCPU())
+    GTEST_SKIP()
+        << "Intel oneDNN with bfloat16 support is not enabled, skipping "
+           "RemapperLeakyRelu with bfloat16.";
+  RunTest<DT_BFLOAT16>();
+}
 
 class RemapperFuseFusedConvWithFusedActivation : public RemapperTest {
  public:
@@ -2841,12 +2861,20 @@ TEST_F(RemapperFuseFusedConvWithFusedActivation, Conv2D_F32) {
   RunTest<2, DT_FLOAT>();
 }
 TEST_F(RemapperFuseFusedConvWithFusedActivation, Conv2D_BF16) {
+  if (!IsMKLEnabled() || !mkl_op_registry::IsBF16SupportedByOneDNNOnThisCPU())
+    GTEST_SKIP()
+        << "Intel oneDNN with bfloat16 support is not enabled, skipping "
+           "RemapperFuseFusedConvWithFusedActivation with bfloat16.";
   RunTest<2, DT_BFLOAT16>();
 }
 TEST_F(RemapperFuseFusedConvWithFusedActivation, Conv3D_F32) {
   RunTest<3, DT_FLOAT>();
 }
 TEST_F(RemapperFuseFusedConvWithFusedActivation, Conv3D_BF16) {
+  if (!IsMKLEnabled() || !mkl_op_registry::IsBF16SupportedByOneDNNOnThisCPU())
+    GTEST_SKIP()
+        << "Intel oneDNN with bfloat16 support is not enabled, skipping "
+           "RemapperFuseFusedConvWithFusedActivation with bfloat16.";
   RunTest<3, DT_BFLOAT16>();
 }
 

From 5d16b1d6cf4a34cccccb33bcead11e048deda365 Mon Sep 17 00:00:00 2001
From: Sam McCall <sammccall@google.com>
Date: Thu, 28 Sep 2023 13:37:46 -0700
Subject: [PATCH 393/567] Integrate LLVM at llvm/llvm-project@23ef8bf9c0f3

Updates LLVM usage to match
[23ef8bf9c0f3](https://github.com/llvm/llvm-project/commit/23ef8bf9c0f3)

PiperOrigin-RevId: 569276827
---
 third_party/llvm/generated.patch              | 47 -------------------
 third_party/llvm/workspace.bzl                |  4 +-
 third_party/triton/cl568793052.patch          | 16 +++++++
 third_party/triton/workspace.bzl              |  1 +
 .../xla/third_party/triton/cl568793052.patch  | 16 +++++++
 .../xla/third_party/triton/workspace.bzl      |  1 +
 .../cpu_tiling/transform_dot_for_cpu.cc       |  5 +-
 .../cpu_tiling/transform_reduce_for_cpu.cc    |  4 +-
 .../gml_st/transforms/fusion/fusion.cc        |  2 +-
 9 files changed, 42 insertions(+), 54 deletions(-)
 create mode 100644 third_party/triton/cl568793052.patch
 create mode 100644 third_party/xla/third_party/triton/cl568793052.patch

diff --git a/third_party/llvm/generated.patch b/third_party/llvm/generated.patch
index aea3ea01bf42bf..509398da979e83 100644
--- a/third_party/llvm/generated.patch
+++ b/third_party/llvm/generated.patch
@@ -1,48 +1 @@
 Auto generated patch. Do not edit or delete it, even if empty.
-diff -ruN --strip-trailing-cr a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
---- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
-+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
-@@ -5889,9 +5889,15 @@
-     for (const unsigned E = G.size();
-          BestIdx < E && (G[BestIdx].second == TargetLowering::C_Other ||
-                          G[BestIdx].second == TargetLowering::C_Immediate);
--         ++BestIdx)
-+         ++BestIdx) {
-       if (lowerImmediateIfPossible(G[BestIdx], Op, DAG, *this))
-         break;
-+      // If we're out of constraints, just pick the first one.
-+      if (BestIdx + 1 == E) {
-+        BestIdx = 0;
-+        break;
-+      }
-+    }
- 
-     OpInfo.ConstraintCode = G[BestIdx].first;
-     OpInfo.ConstraintType = G[BestIdx].second;
-diff -ruN --strip-trailing-cr a/llvm/test/CodeGen/X86/inline-asm-bad-constraint-n.ll b/llvm/test/CodeGen/X86/inline-asm-bad-constraint-n.ll
---- a/llvm/test/CodeGen/X86/inline-asm-bad-constraint-n.ll
-+++ b/llvm/test/CodeGen/X86/inline-asm-bad-constraint-n.ll
-@@ -8,3 +8,9 @@
-   call void asm sideeffect "foo $0", "n"(ptr %a) nounwind
-   ret void
- }
-+
-+; CHECK: error: invalid operand for inline asm constraint 'i'
-+define void @bar(i32 %v) {
-+  call void asm "", "in"(i32 %v)
-+  ret void
-+}
-diff -ruN --strip-trailing-cr a/llvm/test/CodeGen/X86/inline-asm-n-constraint.ll b/llvm/test/CodeGen/X86/inline-asm-n-constraint.ll
---- a/llvm/test/CodeGen/X86/inline-asm-n-constraint.ll
-+++ b/llvm/test/CodeGen/X86/inline-asm-n-constraint.ll
-@@ -8,6 +8,10 @@
- ; CHECK:      #APP
- ; CHECK-NEXT: foo    $42
- ; CHECK-NEXT: #NO_APP
-+  call void asm "# $0", "in"(i32 1392848979)
-+; CHECK-NEXT: #APP
-+; CHECK-NEXT: # $1392848979
-+; CHECK-NEXT: #NO_APP
-   ret void
- ; CHECK-NEXT: retq
- }
diff --git a/third_party/llvm/workspace.bzl b/third_party/llvm/workspace.bzl
index 7d4569adf21255..1b4bdad80ced49 100644
--- a/third_party/llvm/workspace.bzl
+++ b/third_party/llvm/workspace.bzl
@@ -4,8 +4,8 @@ load("//third_party:repo.bzl", "tf_http_archive")
 
 def repo(name):
     """Imports LLVM."""
-    LLVM_COMMIT = "aecb58005c692ed756d748437896498b124e9d45"
-    LLVM_SHA256 = "9d5f9be58aa8dcde7b8b4b454ac2b5c92531b5ecf61254438e70ac772efac607"
+    LLVM_COMMIT = "23ef8bf9c0f338ee073c6c1b553c42e46d2f22ad"
+    LLVM_SHA256 = "416d1cc3b9e7d9272c5677a94dbc6298ad6d20200171f62134af1eee5fa4c24f"
 
     tf_http_archive(
         name = name,
diff --git a/third_party/triton/cl568793052.patch b/third_party/triton/cl568793052.patch
new file mode 100644
index 00000000000000..3c243361744d69
--- /dev/null
+++ b/third_party/triton/cl568793052.patch
@@ -0,0 +1,16 @@
+==== triton/test/Analysis/test-alias.mlir#6 - /google/src/cloud/akuegel/mlir_bd3f7376f1586aad43edfbc21f53cd6143237556_1695808376/triton/test/Analysis/test-alias.mlir ====
+# action=edit type=text
+--- triton/test/Analysis/test-alias.mlir	2023-07-24 05:26:38.000000000 -0700
++++ triton/test/Analysis/test-alias.mlir	2023-09-27 03:42:17.000000000 -0700
+@@ -233,9 +233,9 @@
+   %5 = arith.cmpi slt, %1, %arg1 : index
+   cf.cond_br %5, ^bb2, ^bb3
+ ^bb2:  // pred: ^bb1
+-  %6 = tt.cat %cst, %cst_0 {axis = 0 : i64} : (tensor<128x32xf16, #A_SHARED>, tensor<128x32xf16, #A_SHARED>) -> tensor<256x32xf16, #blocked>
++  %6 = tt.cat %cst, %cst_0 {axis = 0 : i64} : (tensor<128x32xf16, #A_SHARED>, tensor<128x32xf16, #A_SHARED>) -> tensor<256x32xf16, #AL>
+   gpu.barrier
+-  %7 = tt.cat %2, %3 {axis = 0 : i64} : (tensor<128x32xf16, #A_SHARED>, tensor<128x32xf16, #A_SHARED>) -> tensor<256x32xf16, #blocked>
++  %7 = tt.cat %2, %3 {axis = 0 : i64} : (tensor<128x32xf16, #A_SHARED>, tensor<128x32xf16, #A_SHARED>) -> tensor<256x32xf16, #AL>
+   %8 = arith.addi %1, %arg2 : index
+   cf.br ^bb1(%8, %4, %2, %3 : index, tensor<128x32xf16, #A_SHARED>, tensor<128x32xf16, #A_SHARED>, tensor<128x32xf16, #A_SHARED>)
+ ^bb3:  // pred: ^bb1
diff --git a/third_party/triton/workspace.bzl b/third_party/triton/workspace.bzl
index c12e2ae5b8f6d9..acf032d41acbb3 100644
--- a/third_party/triton/workspace.bzl
+++ b/third_party/triton/workspace.bzl
@@ -26,5 +26,6 @@ def repo():
             "//third_party/triton:cl565664892.patch",
             "//third_party/triton:cl566223642.patch",
             "//third_party/triton:cl568240805.patch",
+            "//third_party/triton:cl568793052.patch",
         ],
     )
diff --git a/third_party/xla/third_party/triton/cl568793052.patch b/third_party/xla/third_party/triton/cl568793052.patch
new file mode 100644
index 00000000000000..3c243361744d69
--- /dev/null
+++ b/third_party/xla/third_party/triton/cl568793052.patch
@@ -0,0 +1,16 @@
+==== triton/test/Analysis/test-alias.mlir#6 - /google/src/cloud/akuegel/mlir_bd3f7376f1586aad43edfbc21f53cd6143237556_1695808376/triton/test/Analysis/test-alias.mlir ====
+# action=edit type=text
+--- triton/test/Analysis/test-alias.mlir	2023-07-24 05:26:38.000000000 -0700
++++ triton/test/Analysis/test-alias.mlir	2023-09-27 03:42:17.000000000 -0700
+@@ -233,9 +233,9 @@
+   %5 = arith.cmpi slt, %1, %arg1 : index
+   cf.cond_br %5, ^bb2, ^bb3
+ ^bb2:  // pred: ^bb1
+-  %6 = tt.cat %cst, %cst_0 {axis = 0 : i64} : (tensor<128x32xf16, #A_SHARED>, tensor<128x32xf16, #A_SHARED>) -> tensor<256x32xf16, #blocked>
++  %6 = tt.cat %cst, %cst_0 {axis = 0 : i64} : (tensor<128x32xf16, #A_SHARED>, tensor<128x32xf16, #A_SHARED>) -> tensor<256x32xf16, #AL>
+   gpu.barrier
+-  %7 = tt.cat %2, %3 {axis = 0 : i64} : (tensor<128x32xf16, #A_SHARED>, tensor<128x32xf16, #A_SHARED>) -> tensor<256x32xf16, #blocked>
++  %7 = tt.cat %2, %3 {axis = 0 : i64} : (tensor<128x32xf16, #A_SHARED>, tensor<128x32xf16, #A_SHARED>) -> tensor<256x32xf16, #AL>
+   %8 = arith.addi %1, %arg2 : index
+   cf.br ^bb1(%8, %4, %2, %3 : index, tensor<128x32xf16, #A_SHARED>, tensor<128x32xf16, #A_SHARED>, tensor<128x32xf16, #A_SHARED>)
+ ^bb3:  // pred: ^bb1
diff --git a/third_party/xla/third_party/triton/workspace.bzl b/third_party/xla/third_party/triton/workspace.bzl
index c12e2ae5b8f6d9..acf032d41acbb3 100644
--- a/third_party/xla/third_party/triton/workspace.bzl
+++ b/third_party/xla/third_party/triton/workspace.bzl
@@ -26,5 +26,6 @@ def repo():
             "//third_party/triton:cl565664892.patch",
             "//third_party/triton:cl566223642.patch",
             "//third_party/triton:cl568240805.patch",
+            "//third_party/triton:cl568793052.patch",
         ],
     )
diff --git a/third_party/xla/xla/mlir_hlo/gml_st/transforms/cpu_tiling/transform_dot_for_cpu.cc b/third_party/xla/xla/mlir_hlo/gml_st/transforms/cpu_tiling/transform_dot_for_cpu.cc
index 35b92bf4b06ff6..316c9e5cc92ec2 100644
--- a/third_party/xla/xla/mlir_hlo/gml_st/transforms/cpu_tiling/transform_dot_for_cpu.cc
+++ b/third_party/xla/xla/mlir_hlo/gml_st/transforms/cpu_tiling/transform_dot_for_cpu.cc
@@ -40,6 +40,7 @@ limitations under the License.
 #include "mlir/Dialect/Tensor/IR/TensorTilingInterfaceImpl.h"
 #include "mlir/IR/Dominance.h"
 #include "mlir/Pass/Pass.h"  // IWYU pragma: keep
+#include "mlir/Support/LLVM.h"
 #include "mlir/Transforms/GreedyPatternRewriteDriver.h"
 #include "thlo/IR/thlo_ops.h"
 
@@ -291,8 +292,8 @@ LogicalResult tileAndPeelReductionDim(PatternRewriter &rewriter,
           getSCFTilingOptions(rewriter.getContext(), reductionDimTileSizes));
   if (failed(reductionDimTilingResult)) return failure();
 
-  SCFForPeelingResult reductionDimPeelingResult =
-      peelSCFForOp(rewriter, reductionDimTilingResult->loops.front());
+  SCFForPeelingResult reductionDimPeelingResult = peelSCFForOp(
+      rewriter, cast<scf::ForOp>(reductionDimTilingResult->loops.front()));
   if (reductionDimPeelingResult.mainLoop) {
     setLabel(reductionDimPeelingResult.mainLoop, kPerfectlyTiledLoopLabel);
   }
diff --git a/third_party/xla/xla/mlir_hlo/gml_st/transforms/cpu_tiling/transform_reduce_for_cpu.cc b/third_party/xla/xla/mlir_hlo/gml_st/transforms/cpu_tiling/transform_reduce_for_cpu.cc
index 766c33a4573fa1..0c1c878a5f0847 100644
--- a/third_party/xla/xla/mlir_hlo/gml_st/transforms/cpu_tiling/transform_reduce_for_cpu.cc
+++ b/third_party/xla/xla/mlir_hlo/gml_st/transforms/cpu_tiling/transform_reduce_for_cpu.cc
@@ -500,8 +500,8 @@ struct Reduce2DTransformPattern : public OpRewritePattern<linalg::ReduceOp> {
             producerFilterFn);
     if (failed(reductionDimTilingResult)) return failure();
 
-    SCFForPeelingResult reductionDimPeelingResult =
-        peelSCFForOp(rewriter, reductionDimTilingResult->loops.front());
+    SCFForPeelingResult reductionDimPeelingResult = peelSCFForOp(
+        rewriter, cast<scf::ForOp>(reductionDimTilingResult->loops.front()));
     if (reductionDimPeelingResult.mainLoop) {
       setLabel(reductionDimPeelingResult.mainLoop, kPerfectlyTiledLoopLabel);
     }
diff --git a/third_party/xla/xla/mlir_hlo/gml_st/transforms/fusion/fusion.cc b/third_party/xla/xla/mlir_hlo/gml_st/transforms/fusion/fusion.cc
index 57e6b3a5cc42d0..04cf7f87181bac 100644
--- a/third_party/xla/xla/mlir_hlo/gml_st/transforms/fusion/fusion.cc
+++ b/third_party/xla/xla/mlir_hlo/gml_st/transforms/fusion/fusion.cc
@@ -621,7 +621,7 @@ FailureOr<scf::SCFTilingResult> tileUsingSCFForOpAndFuseGreedily(
 
   // If tiling created an `scf.for` loop nest, we fuse.
   if (!tilingResult->loops.empty()) {
-    scf::ForOp innerLoop = tilingResult->loops.back();
+    scf::ForOp innerLoop = cast<scf::ForOp>(tilingResult->loops.back());
     fuseGreedily(rewriter, innerLoop.getBody(), fuseFilterFn);
   }
   return tilingResult;

From 50b94e93fc6fe8e6efbcb640ec7674f11b2d1323 Mon Sep 17 00:00:00 2001
From: Bixia Zheng <bixia@google.com>
Date: Thu, 28 Sep 2023 13:56:33 -0700
Subject: [PATCH 394/567] [xla] Fix a problem in cloning wrapped async
 instructions.

Previously, we cloned the wrapped async computation twice, once for each async
instruction in a pair of async instructions. We now make sure that both async
instructions use the same wrapped async computation.

PiperOrigin-RevId: 569282322
---
 .../xla/xla/hlo/ir/hlo_instructions.cc        | 14 +++-
 .../xla/xla/service/hlo_computation_test.cc   | 74 +++++++++++++++++++
 2 files changed, 86 insertions(+), 2 deletions(-)

diff --git a/third_party/xla/xla/hlo/ir/hlo_instructions.cc b/third_party/xla/xla/hlo/ir/hlo_instructions.cc
index f6c7ef48f57f94..a7c7a18b743057 100644
--- a/third_party/xla/xla/hlo/ir/hlo_instructions.cc
+++ b/third_party/xla/xla/hlo/ir/hlo_instructions.cc
@@ -368,8 +368,18 @@ std::unique_ptr<HloInstruction> HloAsyncInstruction::CloneWithNewOperandsImpl(
         context->FindComputation(async_wrapped_computation());
   }
   if (new_wrapped_computation == nullptr) {
-    new_wrapped_computation = module->AddEmbeddedComputation(
-        async_wrapped_computation()->Clone("clone", context));
+    // When kAsyncDone or kAsyncUpdate shares the async wrapped computation
+    // with its operand, reuse the new operand's computation to avoid cloning
+    // the computation again.
+    if ((opcode() == HloOpcode::kAsyncDone ||
+         opcode() == HloOpcode::kAsyncUpdate) &&
+        operand(0)->async_wrapped_computation() ==
+            async_wrapped_computation()) {
+      new_wrapped_computation = new_operands[0]->async_wrapped_computation();
+    } else {
+      new_wrapped_computation = module->AddEmbeddedComputation(
+          async_wrapped_computation()->Clone("clone", context));
+    }
   }
   return std::make_unique<HloAsyncInstruction>(
       opcode(), shape, new_operands, new_wrapped_computation, async_group_id_,
diff --git a/third_party/xla/xla/service/hlo_computation_test.cc b/third_party/xla/xla/service/hlo_computation_test.cc
index f25f74f8acec0f..2e576161664e97 100644
--- a/third_party/xla/xla/service/hlo_computation_test.cc
+++ b/third_party/xla/xla/service/hlo_computation_test.cc
@@ -829,5 +829,79 @@ TEST_F(HloComputationTest, ComparisonWithCustomComparator) {
   EXPECT_TRUE(comp_a->Equal(*comp_b, false, compare_func));
 }
 
+TEST_F(HloComputationTest, CloneWrappedAsyncInstructionSameWrappedFunc) {
+  const char* const hlo_string = R"(
+  HloModule Module
+    add (lhs: u32[], rhs: u32[]) -> u32[] {
+       lhs = u32[] parameter(0)
+       rhs = u32[] parameter(1)
+       ROOT add = u32[] add(u32[] lhs, u32[] rhs)
+    }
+
+    async_wrapped (async_param.1: u32[8]) -> u32[4] {
+       async_param.1 = u32[8]{0} parameter(0)
+       ROOT reduce-scatter.1 = u32[4]{0} reduce-scatter(u32[8]{0} async_param.1),
+         replica_groups={}, dimensions={0}, to_apply=add
+    }
+
+    ENTRY main (data: u32[8]) -> u32[4] {
+      data = u32[8]{0} parameter(0)
+      reduce-scatter-start = ((u32[8]{0}), u32[4]{0}) async-start(u32[8]{0} data),
+        calls=async_wrapped, backend_config={"is_sync":false}
+      ROOT reduce-scatter-done = u32[4]{0} async-done(((u32[8]{0}), u32[4]{0}) reduce-scatter-start),
+        calls=async_wrapped
+})";
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          ParseAndReturnVerifiedModule(hlo_string));
+  HloInstruction* start = FindInstruction(module.get(), "reduce-scatter-start");
+  HloInstruction* done = FindInstruction(module.get(), "reduce-scatter-done");
+  EXPECT_EQ(start->called_computations()[0], done->called_computations()[0]);
+  std::unique_ptr<HloInstruction> cloned_start = start->Clone();
+  std::unique_ptr<HloInstruction> cloned_done =
+      done->CloneWithNewOperands(done->shape(), {cloned_start.get()});
+  EXPECT_EQ(cloned_start.get()->called_computations()[0],
+            cloned_done.get()->called_computations()[0]);
+}
+
+TEST_F(HloComputationTest, CloneWrappedAsyncInstructionDiffWrappedFunc) {
+  const char* const hlo_string = R"(
+  HloModule Module
+    add (lhs: u32[], rhs: u32[]) -> u32[] {
+       lhs = u32[] parameter(0)
+       rhs = u32[] parameter(1)
+       ROOT add = u32[] add(u32[] lhs, u32[] rhs)
+    }
+
+    async_wrapped_1 (async_param: u32[8]) -> u32[4] {
+       async_param = u32[8]{0} parameter(0)
+       ROOT %reduce-scatter = u32[4]{0} reduce-scatter(u32[8]{0} async_param),
+         replica_groups={}, dimensions={0}, to_apply=add
+    }
+
+    async_wrapped_2 (async_param.1: u32[8]) -> u32[4] {
+       async_param.1 = u32[8]{0} parameter(0)
+       ROOT reduce-scatter.1 = u32[4]{0} reduce-scatter(u32[8]{0} async_param.1),
+         replica_groups={}, dimensions={0}, to_apply=add
+    }
+
+    ENTRY main (data: u32[8]) -> u32[4] {
+      data = u32[8]{0} parameter(0)
+      reduce-scatter-start = ((u32[8]{0}), u32[4]{0}) async-start(u32[8]{0} data),
+        calls=async_wrapped_1, backend_config={"is_sync":false}
+      ROOT reduce-scatter-done = u32[4]{0} async-done(((u32[8]{0}), u32[4]{0}) reduce-scatter-start),
+        calls=async_wrapped_2
+})";
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          ParseAndReturnVerifiedModule(hlo_string));
+  HloInstruction* start = FindInstruction(module.get(), "reduce-scatter-start");
+  HloInstruction* done = FindInstruction(module.get(), "reduce-scatter-done");
+  EXPECT_NE(start->called_computations()[0], done->called_computations()[0]);
+  std::unique_ptr<HloInstruction> cloned_start = start->Clone();
+  std::unique_ptr<HloInstruction> cloned_done =
+      done->CloneWithNewOperands(done->shape(), {cloned_start.get()});
+  EXPECT_NE(cloned_start.get()->called_computations()[0],
+            cloned_done.get()->called_computations()[0]);
+}
+
 }  // namespace
 }  // namespace xla

From 5acba3f5acb78e480dc30cc5c03143f470e1303a Mon Sep 17 00:00:00 2001
From: Luke Boyer <lukeboyer@google.com>
Date: Thu, 28 Sep 2023 14:04:42 -0700
Subject: [PATCH 395/567] Refine result types for TensorListPopBack during
 shape inference.

PiperOrigin-RevId: 569285288
---
 .../tensorflow/tests/shape_inference.mlir     |  8 ++++
 .../tensorflow/transforms/shape_inference.cc  | 46 ++++++++++++++++++-
 2 files changed, 53 insertions(+), 1 deletion(-)

diff --git a/tensorflow/compiler/mlir/tensorflow/tests/shape_inference.mlir b/tensorflow/compiler/mlir/tensorflow/tests/shape_inference.mlir
index c65bd89421132c..f84260faf8338b 100644
--- a/tensorflow/compiler/mlir/tensorflow/tests/shape_inference.mlir
+++ b/tensorflow/compiler/mlir/tensorflow/tests/shape_inference.mlir
@@ -767,6 +767,14 @@ module attributes {tf.versions = {bad_consumers = [], min_consumer = 0 : i32, pr
     "tf._SomeOtherOp"(%shape_32, %shape_64) : (tensor<?xi32>, tensor<?xi64>) -> ()
     func.return
   }
+  
+  // CHECK-LABEL: refine_pop_back_results_from_operands
+  func.func @refine_pop_back_results_from_operands(%arg0: tensor<!tf_type.variant<tensor<2xi32>>>, %arg1: tensor<1xi32>) -> (tensor<!tf_type.variant>, tensor<*xi32>)  {
+    %0, %1 = "tf.TensorListPopBack"(%arg0, %arg1) : (tensor<!tf_type.variant<tensor<2xi32>>>, tensor<1xi32>) -> (tensor<!tf_type.variant>, tensor<*xi32>)
+    // CHECK: %output_handle, %tensor = "tf.TensorListPopBack"(%arg0, %arg1) : (tensor<!tf_type.variant<tensor<2xi32>>>, tensor<1xi32>) -> (tensor<!tf_type.variant<tensor<2xi32>>>, tensor<2xi32>)
+    // CHECK: return %output_handle, %tensor : tensor<!tf_type.variant<tensor<2xi32>>>, tensor<2xi32>
+    func.return %0, %1 : tensor<!tf_type.variant>, tensor<*xi32>
+  }
 
   // CHECK-LABEL: do_not_unrefine_fully_defined_subtypes
   func.func @do_not_unrefine_fully_defined_subtypes() {
diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/shape_inference.cc b/tensorflow/compiler/mlir/tensorflow/transforms/shape_inference.cc
index 8960325595b55f..1bc881e7ebcac6 100644
--- a/tensorflow/compiler/mlir/tensorflow/transforms/shape_inference.cc
+++ b/tensorflow/compiler/mlir/tensorflow/transforms/shape_inference.cc
@@ -952,6 +952,10 @@ class ShapeInference {
   // mutations have identical static shape.
   bool InferShapeForTensorListInitOps(Operation* op);
 
+  // Conservatively infers shape of output tensorlist and item based on
+  // input tensorlist's element shape.
+  bool InferShapeForTensorListPopBackOp(TensorListPopBackOp op);
+
   // Infers the shape of VarHandleOp based on the uses of the VarHandleOp to
   // update the subtypes of the resource type.
   bool InferShapeForVarHandleOp(VarHandleOp op);
@@ -1553,7 +1557,7 @@ bool ShapeInference::InferShapeForReduceDataset(ReduceDatasetOp op,
 }
 
 bool ShapeInference::InferShapeForTensorListInitOps(Operation* op) {
-  DCOMMENT_OP(op, "Inferring shape for TensorList ");
+  DCOMMENT_OP(op, "Inferring shape for TensorListInitOps.");
   Value handle = op->getResult(0);
   Value initial_element_shape = GetElementShapeOperand(op);
   RankedTensorType element_type;
@@ -1578,6 +1582,42 @@ bool ShapeInference::InferShapeForTensorListInitOps(Operation* op) {
   return changed;
 }
 
+bool ShapeInference::InferShapeForTensorListPopBackOp(TensorListPopBackOp op) {
+  // The first Operand is assumed to be a TensorType around a variant with a
+  // single subtype (e.g. tensor<!tf_type.variant<tensor<2xi32>>>). We will
+  // copy this type to the first result, and copy the singular variant subtype
+  // to the second result (tensor<2xi32>).
+  DCOMMENT_OP(op, "Inferring shape for TensorListPopBackOp.");
+
+  auto src_list_handle_t =
+      op.getOperand(0).getType().dyn_cast_or_null<TensorType>();
+  if (!src_list_handle_t) return false;
+
+  // Copy of operand tensorlist type.
+  TensorType dst_list_handle_t =
+      src_list_handle_t.clone(src_list_handle_t.getElementType());
+  auto variant_element_t =
+      dst_list_handle_t.getElementType().dyn_cast_or_null<VariantType>();
+  if (!variant_element_t || variant_element_t.getSubtypes().size() != 1)
+    return false;
+
+  // Underlying TensorType from variant that represents a shape signature
+  // compatible with all elements in the tensorlist.
+  TensorType list_handle_element_t = variant_element_t.getSubtypes()[0];
+
+  if (!RefineResultType(op, op->getResult(0), dst_list_handle_t)) {
+    DCOMMENT("InferShapeForTensorListPopBackOp could not propogate result 0"
+             << op);
+    return false;
+  }
+  if (!RefineResultType(op, op->getResult(1), list_handle_element_t)) {
+    DCOMMENT("InferShapeForTensorListPopBackOp could not propogate result 1"
+             << op);
+    return false;
+  }
+  return true;
+}
+
 bool ShapeInference::InferShapeForVarHandleOp(VarHandleOp op) {
   DCOMMENT_OP(op, "Inferring shape for VarHandleOp");
 
@@ -2480,6 +2520,10 @@ bool ShapeInference::InferShapeForSingleOperation(Operation* op,
   // the InferenceContext below to get more precise shapes.
   if (IsTensorListInitOp(op) && InferShapeForTensorListInitOps(op)) return true;
 
+  if (auto pop_back = dyn_cast<TF::TensorListPopBackOp>(op)) {
+    return InferShapeForTensorListPopBackOp(pop_back);
+  }
+
   if (auto var_handle_op = dyn_cast<VarHandleOp>(op)) {
     return InferShapeForVarHandleOp(var_handle_op);
   }

From 4ea0eb06620d77ee5e70117a15ae560bc12edc8a Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 28 Sep 2023 14:13:14 -0700
Subject: [PATCH 396/567] Update TFRT dependency to use revision
 http://github.com/tensorflow/runtime/commit/703dbd9190461909be83200d6a1f1c495f1177d5.

PiperOrigin-RevId: 569287679
---
 third_party/tf_runtime/workspace.bzl                          | 4 ++--
 third_party/xla/third_party/tf_runtime/workspace.bzl          | 4 ++--
 .../xla/third_party/tsl/third_party/tf_runtime/workspace.bzl  | 4 ++--
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/third_party/tf_runtime/workspace.bzl b/third_party/tf_runtime/workspace.bzl
index d6c8f2e2431d7b..ac11c18a145229 100644
--- a/third_party/tf_runtime/workspace.bzl
+++ b/third_party/tf_runtime/workspace.bzl
@@ -6,8 +6,8 @@ def repo():
     """Imports TFRT."""
 
     # Attention: tools parse and update these lines.
-    TFRT_COMMIT = "9d58a7aefd3b0b184bfc30be383bdbe17c5233ae"
-    TFRT_SHA256 = "f0bd547a2a36e4866af3101c9f3c7da39e66fe77ead83bf8b2fdc1c9b5674ce0"
+    TFRT_COMMIT = "703dbd9190461909be83200d6a1f1c495f1177d5"
+    TFRT_SHA256 = "7c7edf3357844ec377f5c23fc5242d888f691616cd5a5a8c2e8d95d17bb93e0b"
 
     tf_http_archive(
         name = "tf_runtime",
diff --git a/third_party/xla/third_party/tf_runtime/workspace.bzl b/third_party/xla/third_party/tf_runtime/workspace.bzl
index d6c8f2e2431d7b..ac11c18a145229 100644
--- a/third_party/xla/third_party/tf_runtime/workspace.bzl
+++ b/third_party/xla/third_party/tf_runtime/workspace.bzl
@@ -6,8 +6,8 @@ def repo():
     """Imports TFRT."""
 
     # Attention: tools parse and update these lines.
-    TFRT_COMMIT = "9d58a7aefd3b0b184bfc30be383bdbe17c5233ae"
-    TFRT_SHA256 = "f0bd547a2a36e4866af3101c9f3c7da39e66fe77ead83bf8b2fdc1c9b5674ce0"
+    TFRT_COMMIT = "703dbd9190461909be83200d6a1f1c495f1177d5"
+    TFRT_SHA256 = "7c7edf3357844ec377f5c23fc5242d888f691616cd5a5a8c2e8d95d17bb93e0b"
 
     tf_http_archive(
         name = "tf_runtime",
diff --git a/third_party/xla/third_party/tsl/third_party/tf_runtime/workspace.bzl b/third_party/xla/third_party/tsl/third_party/tf_runtime/workspace.bzl
index d6c8f2e2431d7b..ac11c18a145229 100644
--- a/third_party/xla/third_party/tsl/third_party/tf_runtime/workspace.bzl
+++ b/third_party/xla/third_party/tsl/third_party/tf_runtime/workspace.bzl
@@ -6,8 +6,8 @@ def repo():
     """Imports TFRT."""
 
     # Attention: tools parse and update these lines.
-    TFRT_COMMIT = "9d58a7aefd3b0b184bfc30be383bdbe17c5233ae"
-    TFRT_SHA256 = "f0bd547a2a36e4866af3101c9f3c7da39e66fe77ead83bf8b2fdc1c9b5674ce0"
+    TFRT_COMMIT = "703dbd9190461909be83200d6a1f1c495f1177d5"
+    TFRT_SHA256 = "7c7edf3357844ec377f5c23fc5242d888f691616cd5a5a8c2e8d95d17bb93e0b"
 
     tf_http_archive(
         name = "tf_runtime",

From f26b3dfc360f37f9948aff41dcade560e6314fdb Mon Sep 17 00:00:00 2001
From: Junwhan Ahn <junwhan@google.com>
Date: Thu, 28 Sep 2023 14:31:40 -0700
Subject: [PATCH 397/567] Roll back
 https://github.com/tensorflow/tensorflow/commit/f92a70a41e76db5d0829120f46d5b001f89decdf

PiperOrigin-RevId: 569292433
---
 third_party/xla/xla/python/BUILD                | 2 --
 third_party/xla/xla/python/py_client.cc         | 1 -
 third_party/xla/xla/python/weakref_lru_cache.cc | 6 ------
 3 files changed, 9 deletions(-)

diff --git a/third_party/xla/xla/python/BUILD b/third_party/xla/xla/python/BUILD
index d177a0edbfea0a..7d9b296bd27a9e 100644
--- a/third_party/xla/xla/python/BUILD
+++ b/third_party/xla/xla/python/BUILD
@@ -946,11 +946,9 @@ cc_library(
     features = ["-use_header_modules"],
     visibility = ["//visibility:public"],
     deps = [
-        ":python_ref_manager",
         # placeholder for index annotation deps
         "@com_google_absl//absl/cleanup",
         "@com_google_absl//absl/synchronization",
-        "@com_google_absl//absl/types:span",
         "//xla/pjrt:lru_cache",
         "@pybind11",
     ],
diff --git a/third_party/xla/xla/python/py_client.cc b/third_party/xla/xla/python/py_client.cc
index 234086a525ed4b..ce6e4dcba1c2e7 100644
--- a/third_party/xla/xla/python/py_client.cc
+++ b/third_party/xla/xla/python/py_client.cc
@@ -102,7 +102,6 @@ std::vector<py::object> PyClient::LiveBuffers() {
 
 std::vector<std::shared_ptr<PyLoadedExecutable>> PyClient::LiveExecutables() {
   CHECK(PyGILState_Check());
-  GlobalPyRefManager()->CollectGarbage();
   std::vector<std::shared_ptr<PyLoadedExecutable>> executables;
   for (PyLoadedExecutable* exec = executables_; exec; exec = exec->next_) {
     if (!exec->is_deleted()) {
diff --git a/third_party/xla/xla/python/weakref_lru_cache.cc b/third_party/xla/xla/python/weakref_lru_cache.cc
index 2f84767f02053a..7d0447b2395035 100644
--- a/third_party/xla/xla/python/weakref_lru_cache.cc
+++ b/third_party/xla/xla/python/weakref_lru_cache.cc
@@ -23,10 +23,8 @@ limitations under the License.
 
 #include "absl/cleanup/cleanup.h"
 #include "absl/synchronization/notification.h"
-#include "absl/types/span.h"
 #include "pybind11/pybind11.h"  // from @pybind11
 #include "xla/pjrt/lru_cache.h"
-#include "xla/python/python_ref_manager.h"
 
 namespace jax {
 namespace {
@@ -96,10 +94,6 @@ class WeakrefLRUCache : public std::enable_shared_from_this<WeakrefLRUCache> {
     pybind11::object result;
     absl::Notification completed;
     std::thread::id thread_id = std::this_thread::get_id();
-
-    ~CacheEntry() {
-      xla::GlobalPyRefManager()->AddGarbage(absl::MakeSpan(&result, 1));
-    }
   };
 
   struct CacheInfo {

From 8314e4cf4f030db93138bc44868e95ded4ce2128 Mon Sep 17 00:00:00 2001
From: Antonio Sanchez <cantonios@google.com>
Date: Thu, 28 Sep 2023 14:41:41 -0700
Subject: [PATCH 398/567] Fix copying of xla aot runtime sources.

With the recent vendoring of XLA/TSL, the previously required XLA/TSL
runtime sources now have a path prefix `../local_{xla|tsl}/`, so when they
are copied into the pip package `xla_aot_runtime_src/` folder they end up
being placed outside the directory.

We fix this by removing the `../local_{xla|tsl}/` prefix in the output
file name.  These files will therefore exist as
```
xla_aot_runtime_src/xla/*
xla_aot_runtime_src/tsl/*
```

PiperOrigin-RevId: 569295112
---
 tensorflow/tools/pip_package/build_pip_package.sh | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/tensorflow/tools/pip_package/build_pip_package.sh b/tensorflow/tools/pip_package/build_pip_package.sh
index 8f6906cf8f0ade..423a79bff780a8 100755
--- a/tensorflow/tools/pip_package/build_pip_package.sh
+++ b/tensorflow/tools/pip_package/build_pip_package.sh
@@ -83,10 +83,15 @@ function copy_xla_aot_runtime_sources() {
     if [ ! -z "$candidate_file" ]; then
       file=$candidate_file
     fi
-    dn=$(dirname $file)
+
+    # For XLA/TSL, we need to remove the prefix "../local_{xla|tsl}/".
+    dst_file=$file
+    dst_file=${dst_file#"../local_xla/"}
+    dst_file=${dst_file#"../local_tsl/"}
+
     if test -f "$file"; then
-      mkdir -p "${dst_dir}/${dn}"
-      cp $file "${dst_dir}/${file}"
+      mkdir -p "${dst_dir}/$(dirname $dst_file)"
+      cp $file "${dst_dir}/${dst_file}"
     else
       echo "Missing xla source file: ${file}" 1>&2
     fi

From 1c22354116b9dc9943e6026bf449d67ac2bfd696 Mon Sep 17 00:00:00 2001
From: Raviteja Gorijala <gorijala@google.com>
Date: Thu, 28 Sep 2023 14:49:29 -0700
Subject: [PATCH 399/567] Update RELEASE.md with final 2.14.0 release notes

PiperOrigin-RevId: 569297075
---
 RELEASE.md | 136 ++++++++++++++++-------------------------------------
 1 file changed, 41 insertions(+), 95 deletions(-)

diff --git a/RELEASE.md b/RELEASE.md
index d4b23dfa88c25d..b2c706b5899c36 100644
--- a/RELEASE.md
+++ b/RELEASE.md
@@ -158,80 +158,38 @@ This release contains contributions from many people at Google, as well as:
 
 # Release 2.14.0
 
-<INSERT SMALL BLURB ABOUT RELEASE FOCUS AREA AND POTENTIAL TOOLCHAIN CHANGES>
+## TensorFlow
 
-# Breaking Changes
+### Breaking Changes
 
-* <DOCUMENT BREAKING CHANGES HERE>
-* <THIS SECTION SHOULD CONTAIN API, ABI AND BEHAVIORAL BREAKING CHANGES>
+*   Support for Python 3.8 has been removed starting with TF 2.14. The TensorFlow 2.13.1 patch release will still have Python 3.8 support.
 
 *  `tf.Tensor`
-    * The class hierarchy for `tf.Tensor` has changed, and there are now
-      explicit `EagerTensor` and `SymbolicTensor` classes for eager and
-      tf.function respectively. Users who relied on the exact type of Tensor
-      (e.g. `type(t) == tf.Tensor`) will need to update their code to use
-      `isinstance(t, tf.Tensor)`. The `tf.is_symbolic_tensor` helper added in
-      2.13 may be used when it is necessary to determine if a value is
-      specifically a symbolic tensor.
+    * The class hierarchy for `tf.Tensor` has changed, and there are now explicit `EagerTensor` and `SymbolicTensor` classes for eager and tf.function respectively. Users who relied on the exact type of Tensor (e.g. `type(t) == tf.Tensor`) will need to update their code to use `isinstance(t, tf.Tensor)`. The `tf.is_symbolic_tensor` helper added in 2.13 may be used when it is necessary to determine if a value is specifically a symbolic tensor.
 
 *   `tf.compat.v1.Session`
-    * `tf.compat.v1.Session.partial_run` and
-      `tf.compat.v1.Session.partial_run_setup` will be deprecated in the
-      next release.
+    * `tf.compat.v1.Session.partial_run` and `tf.compat.v1.Session.partial_run_setup` will be deprecated in the next release.
 
-*   `tf.estimator`
-    * `tf.estimator` API will be removed in the next release. TF Estimator
-       Python package will no longer be released.
-
-# Known Caveats
+### Known Caveats
 
-* <CAVEATS REGARDING THE RELEASE (BUT NOT BREAKING CHANGES).>
-* <ADDING/BUMPING DEPENDENCIES SHOULD GO HERE>
-* <KNOWN LACK OF SUPPORT ON SOME PLATFORM, SHOULD GO HERE>
 * `tf.lite`
-    * when converter flag "_experimenal_use_buffer_offset" is enabled,
-    additional metadata is automatically excluded from the generated model.
-    The behaviour is the same as "exclude_conversion_metadata" is set
-    * If the model is larger than 2GB, then we also require
-    "exclude_conversion_metadata" flag to be set
+    * when converter flag "_experimenal_use_buffer_offset" is enabled, additional metadata is automatically excluded from the generated model. The behaviour is the same as "exclude_conversion_metadata" is set
+    * If the model is larger than 2GB, then we also require "exclude_conversion_metadata" flag to be set
 
-# Major Features and Improvements
-
-*   <INSERT MAJOR FEATURE HERE, USING MARKDOWN SYNTAX>
-*   <IF RELEASE CONTAINS MULTIPLE FEATURES FROM SAME AREA, GROUP THEM TOGETHER>
+### Major Features and Improvements
 
-*   The `tensorflow` pip package has a new, optional installation method for
-    Linux that installs necessary Nvidia CUDA libraries through pip. As long as
-    the Nvidia driver is already installed on the system, you may now run `pip
-    install tensorflow[and-cuda]` to install TensorFlow's Nvidia CUDA library
-    dependencies in the Python environment. Aside from the Nvidia driver, no
-    other pre-existing Nvidia CUDA packages are necessary.
+*   The `tensorflow` pip package has a new, optional installation method for Linux that installs necessary Nvidia CUDA libraries through pip. As long as the Nvidia driver is already installed on the system, you may now run `pip install tensorflow[and-cuda]` to install TensorFlow's Nvidia CUDA library dependencies in the Python environment. Aside from the Nvidia driver, no other pre-existing Nvidia CUDA packages are necessary.
 
-* `tf.keras`
-    * `Model.compile` now support `steps_per_execution='auto'` as a parameter,
-    allowing automatic tuning of steps per execution during `Model.fit`,
-    `Model.predict`, and `Model.evaluate` for a significant performance boost.
-    * New `StepsPerExecutionTuner` class allows for `steps_per_execution` tuning
-    for custom training loops enabling performance boosts in custom workflows.
-    * Model now has two more settable parameters, `steps_per_execution` to set
-    this value to a manual heuristic, and `autotune_steps_per_execution` to
-    enable and disable tuning.
-
-*   Enable JIT-compiled i64-indexed kernels on GPU for large tensors with more
-    than 2**32 elements.
-    *   Unary GPU kernels: Abs, Atanh, Acos, Acosh, Asin, Asinh, Atan, Cos,
-        Cosh, Sin, Sinh, Tan, Tanh.
-    *   Binary GPU kernels: AddV2, Sub, Div, DivNoNan, Mul, MulNoNan, FloorDiv,
-        Equal, NotEqual, Greater, GreaterEqual, LessEqual, Less.
+*   Enable JIT-compiled i64-indexed kernels on GPU for large tensors with more than 2**32 elements.
+    *   Unary GPU kernels: Abs, Atanh, Acos, Acosh, Asin, Asinh, Atan, Cos, Cosh, Sin, Sinh, Tan, Tanh.
+    *   Binary GPU kernels: AddV2, Sub, Div, DivNoNan, Mul, MulNoNan, FloorDiv, Equal, NotEqual, Greater, GreaterEqual, LessEqual, Less.
 
 * `tf.lite`
-    * Add experimental supports conversion of models that may be larger than 2GB
-     before buffer deduplication
+    * Add experimental support for conversion of models that may be larger than 2GB before buffer deduplication
 
-# Bug Fixes and Other Changes
+### Bug Fixes and Other Changes
 
-* `tf.py_function` and `tf.numpy_function` can now be used as function
-   decorators for clearer code:
+* `tf.py_function` and `tf.numpy_function` can now be used as function decorators for clearer code:
    ```
    @tf.py_function(Tout=tf.float32)
    def my_fun(x):
@@ -240,59 +198,47 @@ This release contains contributions from many people at Google, as well as:
    ```
 
 * `tf.lite`
-    * `Strided_Slice` now supports `UINT32`.
-    * Add int8 and int16x8 support for LOG operator
-
-* <SIMILAR TO ABOVE SECTION, BUT FOR OTHER IMPORTANT CHANGES / BUG FIXES>
-* <IF A CHANGE CLOSES A GITHUB ISSUE, IT SHOULD BE DOCUMENTED HERE>
-* <NOTES SHOULD BE GROUPED PER AREA>
+    * Strided_Slice now supports `UINT32`.
 
 * `tf.config.experimental.enable_tensor_float_32_execution`
-    * Disabling TensorFloat-32 execution now causes TPUs to use float32
-      precision for float32 matmuls and other ops. TPUs have always used
-      bfloat16 precision for certain ops, like matmul, when such ops had float32
-      inputs. Now, disabling TensorFloat-32 by calling
-      `tf.config.experimental.enable_tensor_float_32_execution(False)` will
-      cause TPUs to use float32 precision for such ops instead of bfloat16.
+    * Disabling TensorFloat-32 execution now causes TPUs to use float32 precision for float32 matmuls and other ops. TPUs have always used bfloat16 precision for certain ops, like matmul, when such ops had float32 inputs. Now, disabling TensorFloat-32 by calling `tf.config.experimental.enable_tensor_float_32_execution(False)` will cause TPUs to use float32 precision for such ops instead of bfloat16.
 
 *  `tf.experimental.dtensor`
-    * API changes for Relayout. Added a new API, `dtensor.relayout_like`, for
-      relayouting a tensor according to the layout of another tensor.
-    * Added `dtensor.get_default_mesh`, for retrieving the current default
-      mesh under the dtensor context.
-    * \*fft\* ops now support dtensors with any layout. Fixed bug in 'fft2d/
-      fft3d', 'ifft2d/ifft3d', 'rfft2d/rfft3d', and 'irfft2d/irfft3d' for
-      sharded input.
+    * API changes for Relayout. Added a new API, `dtensor.relayout_like`, for relayouting a tensor according to the layout of another tensor.
+    * Added `dtensor.get_default_mesh`, for retrieving the current default mesh under the dtensor context.
+    * \*fft\* ops now support dtensors with any layout. Fixed bug in 'fft2d/fft3d', 'ifft2d/ifft3d', 'rfft2d/rfft3d', and 'irfft2d/irfft3d' for sharded input. Refer to this [blog post](https://blog.tensorflow.org/2023/08/distributed-fast-fourier-transform-in-tensorflow.html) for details.
 
 *  `tf.experimental.strict_mode`
-    * Added a new API, `strict_mode`, which converts all deprecation warnings
-      into runtime errors with instructions on switching to a recommended 
-      substitute.
+    * Added a new API, `strict_mode`, which converts all deprecation warnings into runtime errors with instructions on switching to a recommended substitute.
 
 *   TensorFlow Debugger (tfdbg) CLI: ncurses-based CLI for tfdbg v1 was removed.
 
-*   TensorFlow now supports C++ RTTI on mobile and Android. To enable this
-    feature, pass the flag `--define=tf_force_rtti=true` to Bazel when building
-    TensorFlow. This may be needed when linking TensorFlow into RTTI-enabled
-    programs since mixing RTTI and non-RTTI code can cause ABI issues.
+*   TensorFlow now supports C++ RTTI on mobile and Android. To enable this feature, pass the flag `--define=tf_force_rtti=true` to Bazel when building TensorFlow. This may be needed when linking TensorFlow into RTTI-enabled programs since mixing RTTI and non-RTTI code can cause ABI issues.
+
+* `tf.ones`, `tf.zeros`, `tf.fill`, `tf.ones_like`, `tf.zeros_like` now take an additional Layout argument that controls the output layout of their results.
+
+* `tf.nest` and `tf.data` now support user defined classes implementing `__tf_flatten__` and `__tf_unflatten__` methods. See [nest_util code examples](https://github.com/tensorflow/tensorflow/blob/04869b4e63bfc03cb13627b3e1b879fdd0f69e34/tensorflow/python/util/nest_util.py#L97)
+for an example.
+
+*  TensorFlow IO support is now available for Apple Silicon packages.
 
-* `tf.ones`, `tf.zeros`, `tf.fill`, `tf.ones_like`, `tf.zeros_like` now take an
-    additional Layout argument that controls the output layout of their results.
+*  Refactor CpuExecutable to propagate LLVM errors.
 
-*  Limited support of unified n-d FFT Ops: `tf.signal.fftn`,
-   `tf.signal.ifftn`, `tf.signal.rfftn`, `tf.signal.irfftn`.
-   Note that they only support up to 3d and gradients are unsupported.
+## Keras
+
+Keras is a framework built on top of TensorFlow. See more details on the Keras [website](https://keras.io/).
+
+### Major Features and Improvements
 
-* `tf.nest` and `tf.data` now support user defined classes implementing
-  `__tf_flatten__` and `__tf_unflatten__` methods. See [
-  nest_util code examples](https://github.com/tensorflow/tensorflow/blob/04869b4e63bfc03cb13627b3e1b879fdd0f69e34/tensorflow/python/util/nest_util.py#L97)
-  for an example.
+* `tf.keras`
+    * `Model.compile` now supports `steps_per_execution='auto'` as a parameter, allowing automatic tuning of steps per execution during `Model.fit`,
+    `Model.predict`, and `Model.evaluate` for a significant performance boost.
 
-# Thanks to our Contributors
+## Thanks to our Contributors
 
 This release contains contributions from many people at Google, as well as:
 
-<INSERT>, <NAME>, <HERE>, <USING>, <GITHUB>, <HANDLE>
+Aakar Dwivedi, Adrian Popescu, ag.ramesh, Akhil Goel, Albert Zeyer, Alex Rosen, Alexey Vishnyakov, Andrew Goodbody, angerson, Ashiq Imran, Ayan Moitra, Ben Barsdell, Bhavani Subramanian, Boian Petkantchin, BrianWieder, Chris Mc, cloudhan, Connor Flanagan, Daniel Lang, Daniel Yudelevich, Darya Parygina, David Korczynski, David Svantesson, dingyuqing05, Dragan Mladjenovic, dskkato, Eli Kobrin, Erick Ochoa, Erik Schultheis, Frédéric Bastien, gaikwadrahul8, Gauri1 Deshpande, guozhong.zhuang, H. Vetinari, Isaac Cilia Attard, Jake Hall, Jason Furmanek, Jerry Ge, Jinzhe Zeng, JJ, johnnkp, Jonathan Albrecht, jongkweh, justkw, Kanvi Khanna, kikoxia, Koan-Sin Tan, Kun-Lu, ltsai1, Lu Teng, luliyucoordinate, Mahmoud Abuzaina, mdfaijul, Milos Puzovic, Nathan Luehr, Om Thakkar, pateldeev, Peng Sun, Philipp Hack, pjpratik, Poliorcetics, rahulbatra85, rangjiaheng, Renato Arantes, Robert Kalmar, roho, Rylan Justice, Sachin Muradi, samypr100, Saoirse Stewart, Shanbin Ke, Shivam Mishra, shuw, Song Ziming, Stephan Hartmann, Sulav, sushreebarsa, T Coxon, Tai Ly, talyz, Thibaut Goetghebuer-Planchon, Thomas Preud'Homme, tilakrayal, Tirumalesh, Tj Xu, Tom Allsop, Trevor Morris, Varghese, Jojimon, Wen Chen, Yaohui Liu, Yimei Sun, Zhoulong Jiang, Zhoulong, Jiang
 
 # Release 2.13.0
 

From d2981d776d2ee0ba155c1d88fa1197395f7e2990 Mon Sep 17 00:00:00 2001
From: Shixin Li <shixinli@google.com>
Date: Thu, 28 Sep 2023 14:59:54 -0700
Subject: [PATCH 400/567] Enable cross compilation for GPU PJRT
 compiler/client.

PiperOrigin-RevId: 569299837
---
 third_party/xla/xla/pjrt/gpu/BUILD            |  23 ++--
 .../xla/xla/pjrt/gpu/se_gpu_pjrt_client.cc    |  33 +++--
 .../xla/xla/pjrt/gpu/se_gpu_pjrt_client.h     |  17 ++-
 .../xla/xla/pjrt/gpu/se_gpu_pjrt_compiler.cc  |  48 ++++---
 .../xla/xla/pjrt/gpu/se_gpu_pjrt_compiler.h   |   4 +-
 .../pjrt/gpu/se_gpu_pjrt_compiler_aot_test.cc | 117 ++++++++++--------
 .../xla/pjrt/gpu/se_gpu_pjrt_compiler_test.cc |  11 ++
 7 files changed, 141 insertions(+), 112 deletions(-)

diff --git a/third_party/xla/xla/pjrt/gpu/BUILD b/third_party/xla/xla/pjrt/gpu/BUILD
index 3df92863bc8416..eb1ddc2fad0b38 100644
--- a/third_party/xla/xla/pjrt/gpu/BUILD
+++ b/third_party/xla/xla/pjrt/gpu/BUILD
@@ -4,6 +4,7 @@ load("//xla/stream_executor:build_defs.bzl", "if_gpu_is_configured")
 load("@local_config_rocm//rocm:build_defs.bzl", "if_rocm")
 load("@local_tsl//tsl:tsl.bzl", "if_nccl")
 load("@local_tsl//tsl/platform:build_config.bzl", "tf_proto_library")
+load("@local_tsl//tsl/platform:build_config_root.bzl", "tf_cuda_tests_tags")
 load("@local_tsl//tsl/platform:rules_cc.bzl", "cc_library")
 load("@local_tsl//tsl/platform/default:cuda_build_defs.bzl", "if_cuda_is_configured")
 
@@ -236,7 +237,6 @@ cc_library(
         "//xla/service/gpu:gpu_target_config",
         "//xla/stream_executor/cuda:cuda_platform_id",
         "@com_google_absl//absl/status",
-        "@local_tsl//tsl/platform:casts",
         "@local_tsl//tsl/platform:errors",
     ] + if_cuda([
         ":nccl_id_store_cuda",
@@ -257,12 +257,7 @@ cc_library(
 xla_cc_test(
     name = "se_gpu_pjrt_compiler_test",
     srcs = if_gpu_is_configured(["se_gpu_pjrt_compiler_test.cc"]),
-    tags = [
-        "config-cuda-only",
-        "gpu",
-        "no_oss",
-        "requires-gpu-nvidia",
-    ],
+    tags = tf_cuda_tests_tags(),
     deps = [
         ":se_gpu_pjrt_client",
         ":se_gpu_pjrt_compiler",
@@ -285,6 +280,7 @@ xla_cc_test(
 xla_cc_test(
     name = "se_gpu_pjrt_compiler_aot_test",
     srcs = if_gpu_is_configured(["se_gpu_pjrt_compiler_aot_test.cc"]),
+    local_defines = if_cuda_is_configured(["GOOGLE_CUDA=1"]),
     tags = [
         "config-cuda-only",
         "gpu",
@@ -306,7 +302,6 @@ xla_cc_test(
         "//xla/service:gpu_plugin",
         "//xla/service:hlo_parser",
         "//xla/service/gpu:gpu_target_config",
-        "//xla/service/gpu:nvptx_compiler_impl",
         "//xla/stream_executor/cuda:cublas_plugin",
         "//xla/tests:literal_test_util",
         "@com_google_absl//absl/memory",
@@ -317,8 +312,18 @@ xla_cc_test(
         "@llvm-project//mlir:FuncDialect",
         "@llvm-project//mlir:IR",
         "@llvm-project//mlir:Parser",
+        "@local_tsl//tsl/lib/core:status_test_util",
         "@local_tsl//tsl/platform:casts",
+        "@local_tsl//tsl/platform:status_matchers",
         "@local_tsl//tsl/platform:statusor",
         "@local_tsl//tsl/platform:test_main",
-    ],
+    ] + if_cuda([
+        ":nccl_id_store_cuda",
+        "@local_config_cuda//cuda:cuda_headers",
+        "//xla/stream_executor/cuda:cuda_activation_header",
+        "//xla/stream_executor/gpu:gpu_cudamallocasync_allocator",
+    ]) + if_rocm([
+        ":nccl_id_store_rocm",
+        "@local_config_rocm//rocm:rocm_headers",
+    ]),
 )
diff --git a/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_client.cc b/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_client.cc
index 7799f0eb379514..b1916e471c8b6a 100644
--- a/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_client.cc
+++ b/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_client.cc
@@ -556,9 +556,9 @@ StatusOr<std::unique_ptr<StreamExecutorUnloadedExecutable>> FromProto(
 }  // namespace
 
 StatusOr<std::unique_ptr<PjRtLoadedExecutable>>
-StreamExecutorGpuClient::LoadSerializedExecutable(
-    absl::string_view serialized, std::optional<CompileOptions> options,
-    const LoadOptions& load_options) {
+StreamExecutorGpuClient::LoadSerialized(absl::string_view serialized,
+                                        std::optional<CompileOptions> options,
+                                        const LoadOptions& load_options) {
 #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
   StreamExecutorUnloadedExecutableProto proto;
   if (serialized.size() > std::numeric_limits<int>::max()) {
@@ -572,7 +572,8 @@ StreamExecutorGpuClient::LoadSerializedExecutable(
         "failed");
   }
   TF_ASSIGN_OR_RETURN(auto se_executable, FromProto(proto));
-  return Load(std::move(se_executable), LoadOptions());
+  // TODO(b/296466237): Unify the `Load` method.
+  return Load(std::move(se_executable));
 #endif
   return absl::InternalError("LoadSerialized only works with cuda or rocm.");
 }
@@ -593,8 +594,7 @@ std::vector<std::unique_ptr<PjRtStreamExecutorDevice>> BuildLocalDevices(
 }
 
 StatusOr<std::unique_ptr<PjRtLoadedExecutable>> StreamExecutorGpuClient::Load(
-    std::unique_ptr<PjRtExecutable> executable,
-    const LoadOptions& load_options) {
+    std::unique_ptr<PjRtExecutable> executable) {
   auto se_executable =
       absl::WrapUnique(tensorflow::down_cast<StreamExecutorUnloadedExecutable*>(
           executable.release()));
@@ -604,23 +604,22 @@ StatusOr<std::unique_ptr<PjRtLoadedExecutable>> StreamExecutorGpuClient::Load(
   TF_ASSIGN_OR_RETURN(ExecutableExtras extras,
                       GetExecutableExtras(&compile_options));
 
-  se::StreamExecutor* se_executor =
-      client()->backend().default_stream_executor();
-  if (se_executor == nullptr) {
-    return absl::FailedPreconditionError("Default stream executor is null.");
-  }
+  TF_ASSIGN_OR_RETURN(
+      auto se_executor,
+      client()->backend().stream_executor(
+          compile_options.executable_build_options.device_ordinal()));
 
   // Load Executable from AOT compilation result.
   std::vector<std::unique_ptr<LocalExecutable>> local_executables;
   local_executables.reserve(se_executable->aot_executables().size());
   for (std::unique_ptr<xla::AotCompilationResult>& aot_executable :
        se_executable->aot_executables()) {
-    TF_ASSIGN_OR_RETURN(std::string serialized,
-                        aot_executable->SerializeAsString());
-    TF_ASSIGN_OR_RETURN(
-        std::unique_ptr<LocalExecutable> local_executable,
-        client()->Load(serialized, compile_options.executable_build_options));
-    local_executables.push_back(std::move(local_executable));
+    TF_ASSIGN_OR_RETURN(std::unique_ptr<Executable> executable,
+                        aot_executable->LoadExecutable(
+                            client()->backend().compiler(), se_executor));
+    local_executables.push_back(std::make_unique<LocalExecutable>(
+        std::move(executable), client()->local_service()->mutable_backend(),
+        compile_options.executable_build_options));
   }
   bool parameter_is_tupled_arguments =
       compile_options.parameter_is_tupled_arguments;
diff --git a/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_client.h b/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_client.h
index ab40c64b87b85e..be6b4cb75a9777 100644
--- a/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_client.h
+++ b/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_client.h
@@ -190,13 +190,24 @@ class StreamExecutorGpuClient : public xla::PjRtStreamExecutorClient {
     return &topology_;
   }
 
+  // TODO(b/285385306): Enable loading a non-loaded PjRtExecutable.
   StatusOr<std::unique_ptr<PjRtLoadedExecutable>> Load(
       std::unique_ptr<PjRtExecutable> executable,
-      const LoadOptions& load_options) override;
+      const LoadOptions& load_options) override {
+    return absl::WrapUnique<PjRtLoadedExecutable>(
+        tensorflow::down_cast<PjRtLoadedExecutable*>(executable.release()));
+  }
+
+  // TODO(b/296466237): Unify `Load` method after (de)serialization and tests on
+  // existing use cases are done.
+  StatusOr<std::unique_ptr<PjRtLoadedExecutable>> Load(
+      std::unique_ptr<PjRtExecutable> executable);
 
-  StatusOr<std::unique_ptr<PjRtLoadedExecutable>> LoadSerializedExecutable(
+  // TODO(b/296466237): Unify `LoadSerializedExecutable` after fixing existing
+  // tests.
+  StatusOr<std::unique_ptr<PjRtLoadedExecutable>> LoadSerialized(
       absl::string_view serialized, std::optional<CompileOptions> options,
-      const LoadOptions& load_options) override;
+      const LoadOptions& load_options);
 
  private:
   xla::StreamExecutorGpuTopologyDescription topology_;
diff --git a/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_compiler.cc b/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_compiler.cc
index d707919ab40b79..2535b5d5308ada 100644
--- a/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_compiler.cc
+++ b/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_compiler.cc
@@ -25,7 +25,6 @@ limitations under the License.
 #include "xla/pjrt/pjrt_compiler.h"
 #include "xla/pjrt/pjrt_executable.h"
 #include "xla/status_macros.h"
-#include "tsl/platform/casts.h"
 #include "tsl/platform/errors.h"
 
 #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
@@ -147,31 +146,23 @@ absl::StatusOr<std::unique_ptr<PjRtExecutable>> AotCompile(
 #endif
 }  // namespace
 
+// TODO(b/285385306): Enable compilation on provided `topology`.
 absl::StatusOr<std::unique_ptr<PjRtExecutable>>
 StreamExecutorGpuCompiler::Compile(CompileOptions options,
                                    const XlaComputation& computation,
                                    const PjRtTopologyDescription& topology,
                                    PjRtClient* client) {
-#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
   if (client == nullptr && gpu_target_config_ != std::nullopt) {
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
     return AotCompile(options, computation, *gpu_target_config_);
+#endif
+    return absl::InternalError(
+        "GPU AOT compilation requires the target to be built with CUDA or "
+        "ROCm.");
   }
+  // TODO(b/296466237): Remove client dependency.
   TF_RETURN_IF_ERROR(IsValidTopologyAndClientForCompile(topology, client));
-
-  PjRtStreamExecutorClient* se_client =
-      tensorflow::down_cast<PjRtStreamExecutorClient*>(client);
-#if GOOGLE_CUDA
-  auto gpu_compiler = gpu::NVPTXCompiler();
-#elif TENSORFLOW_USE_ROCM
-  auto gpu_compiler = gpu::AMDGPUCompiler();
-#endif
-  gpu::GpuTargetConfig gpu_target_config = gpu_compiler.GetGpuTargetConfig(
-      se_client->client()->backend().default_stream_executor());
-  return AotCompile(options, computation, gpu_target_config);
-#endif
-  return absl::InternalError(
-      "GPU AOT compilation requires the target to be built with CUDA or "
-      "ROCm.");
+  return client->Compile(computation, options);
 }
 
 absl::StatusOr<std::unique_ptr<PjRtExecutable>>
@@ -179,17 +170,22 @@ StreamExecutorGpuCompiler::Compile(CompileOptions options,
                                    mlir::ModuleOp module,
                                    const PjRtTopologyDescription& topology,
                                    PjRtClient* client) {
+  if (client == nullptr && gpu_target_config_ != std::nullopt) {
 #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
-  XlaComputation xla_computation;
-  TF_RETURN_IF_ERROR(MlirToXlaComputation(
-      module, xla_computation,
-      /*use_tuple_args=*/options.parameter_is_tupled_arguments,
-      /*return_tuple=*/false));
-  return Compile(options, xla_computation, topology, client);
+    XlaComputation xla_computation;
+    TF_RETURN_IF_ERROR(MlirToXlaComputation(
+        module, xla_computation,
+        /*use_tuple_args=*/options.parameter_is_tupled_arguments,
+        /*return_tuple=*/false));
+    return AotCompile(options, xla_computation, *gpu_target_config_);
 #endif
-  return absl::InternalError(
-      "GPU AOT compilation requires the target to be built with CUDA or "
-      "ROCm.");
+    return absl::InternalError(
+        "GPU AOT compilation requires the target to be built with CUDA or "
+        "ROCm.");
+  }
+  // TODO(b/296466237): Remove client dependency.
+  TF_RETURN_IF_ERROR(IsValidTopologyAndClientForCompile(topology, client));
+  return client->Compile(module, options);
 }
 
 REGISTER_MODULE_INITIALIZER(pjrt_register_se_gpu_compiler, {
diff --git a/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_compiler.h b/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_compiler.h
index 981748429fa48e..fe3a67eb5849ef 100644
--- a/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_compiler.h
+++ b/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_compiler.h
@@ -32,8 +32,8 @@ namespace xla {
 class StreamExecutorGpuCompiler : public PjRtCompiler {
  public:
   // If `gpu_target_config` is nullopt, the compiler has to compile with device,
-  // i.e. calling of `Compile` should depend on the passed-in client's runtime
-  // device information.
+  // i.e. calling of `Compile` should depend on the passed-in client's
+  // compilation functionality.
   explicit StreamExecutorGpuCompiler(const std::optional<gpu::GpuTargetConfig>
                                          gpu_target_config = std::nullopt)
       : gpu_target_config_(gpu_target_config) {}
diff --git a/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_compiler_aot_test.cc b/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_compiler_aot_test.cc
index 07960a19dc4917..42f28daf377447 100644
--- a/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_compiler_aot_test.cc
+++ b/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_compiler_aot_test.cc
@@ -37,10 +37,8 @@ limitations under the License.
 #include "xla/pjrt/gpu/se_gpu_pjrt_client.h"
 #include "xla/pjrt/gpu/se_gpu_pjrt_compiler.h"
 #include "xla/pjrt/pjrt_client.h"
-#include "xla/pjrt/pjrt_compiler.h"
 #include "xla/pjrt/pjrt_executable.h"
 #include "xla/service/gpu/gpu_target_config.h"
-#include "xla/service/gpu/nvptx_compiler.h"
 #include "xla/service/hlo_parser.h"
 #include "xla/tests/literal_test_util.h"
 #include "tsl/platform/casts.h"
@@ -63,6 +61,31 @@ constexpr absl::string_view mlir_str = R"mlir(
     }
   })mlir";
 
+constexpr absl::string_view kGpuTargetConfig =
+    R"pb(
+  gpu_device_info {
+    threads_per_block_limit: 1024
+    threads_per_warp: 32
+    shared_memory_per_block: 49152
+    shared_memory_per_core: 65536
+    threads_per_core_limit: 2048
+    core_count: 56
+    fpus_per_core: 64
+    block_dim_limit_x: 2147483647
+    block_dim_limit_y: 65535
+    block_dim_limit_z: 65535
+    memory_bandwidth: 732160000000
+    l2_cache_size: 4194304
+    clock_rate_ghz: 1.4805
+    device_memory_size: 17066622976
+    shared_memory_per_block_optin: 49152
+    cuda_compute_capability { major: 6 }
+  }
+  platform_name: "CUDA"
+  dnn_version_info {}
+  device_description_str: "sm_6.0 with 17071734784B RAM, 56 cores, 1480500KHz clock, 715000KHz mem clock, 4194304B L2$"
+    )pb";
+
 absl::StatusOr<xla::XlaComputation> GetXlaComputation(
     absl::string_view program) {
   TF_ASSIGN_OR_RETURN(auto hlo_module,
@@ -82,29 +105,36 @@ void ValidateResult(
       LiteralTestUtil::Equal(LiteralUtil::CreateR0(2), *result_literal));
 }
 
+absl::StatusOr<gpu::GpuTargetConfig> GetGpuTargetConfig() {
+  stream_executor::GpuTargetConfigProto gpu_target_config_proto;
+  if (!proto2::TextFormat::ParseFromString(kGpuTargetConfig,
+                                           &gpu_target_config_proto)) {
+    return absl::InvalidArgumentError("Failed to parse GpuTargetConfigProto");
+  }
+  return gpu::GpuTargetConfig(gpu_target_config_proto);
+}
+
 TEST(StreamExecutorGpuCompilerTest, SuccessAotCompileMlirAndLoad) {
+  ASSERT_OK_AND_ASSIGN(const gpu::GpuTargetConfig gpu_config,
+                       GetGpuTargetConfig());
+  CompileOptions options = xla::CompileOptions();
+  StreamExecutorGpuCompiler compiler(gpu_config);
+
   TF_ASSERT_OK_AND_ASSIGN(
       auto client, GetStreamExecutorGpuClient(true, /*allocator_config=*/{},
                                               /*node_id=*/0));
   auto se_client = absl::WrapUnique(
       tensorflow::down_cast<StreamExecutorGpuClient*>(client.release()));
-  auto gpu_compiler = gpu::NVPTXCompiler();
-  gpu::GpuTargetConfig gpu_target_config = gpu_compiler.GetGpuTargetConfig(
-      se_client->client()->backend().default_stream_executor());
-  StreamExecutorGpuCompiler compiler(gpu_target_config);
-
   mlir::MLIRContext context;
   context.loadDialect<mlir::mhlo::MhloDialect, mlir::func::FuncDialect>();
   auto mlir_module =
       mlir::parseSourceString<mlir::ModuleOp>(mlir_str, &context);
   TF_ASSERT_OK_AND_ASSIGN(auto topology, se_client->GetTopologyDescription());
   TF_ASSERT_OK_AND_ASSIGN(
-      auto executable,
-      compiler.Compile(xla::CompileOptions(), mlir_module.get(), *topology,
-                       /*client=*/nullptr));
-  TF_ASSERT_OK_AND_ASSIGN(
-      auto loaded_executable,
-      se_client->Load(std::move(executable), LoadOptions()));
+      auto executable, compiler.Compile(xla::CompileOptions(),
+                                        mlir_module.get(), *topology, nullptr));
+  TF_ASSERT_OK_AND_ASSIGN(auto loaded_executable,
+                          se_client->Load(std::move(executable)));
 
   TF_ASSERT_OK_AND_ASSIGN(
       auto result, loaded_executable->Execute(/*argument_handles=*/{{}}, {}));
@@ -112,82 +142,59 @@ TEST(StreamExecutorGpuCompilerTest, SuccessAotCompileMlirAndLoad) {
 }
 
 TEST(StreamExecutorGpuCompilerTest, SuccessAotCompileXlaAndLoad) {
+  ASSERT_OK_AND_ASSIGN(const gpu::GpuTargetConfig gpu_config,
+                       GetGpuTargetConfig());
+  CompileOptions options = xla::CompileOptions();
+  StreamExecutorGpuCompiler compiler(gpu_config);
+
   TF_ASSERT_OK_AND_ASSIGN(
       auto client, GetStreamExecutorGpuClient(true, /*allocator_config=*/{},
                                               /*node_id=*/0));
   auto se_client = absl::WrapUnique(
       tensorflow::down_cast<StreamExecutorGpuClient*>(client.release()));
-  auto gpu_compiler = gpu::NVPTXCompiler();
-  gpu::GpuTargetConfig gpu_target_config = gpu_compiler.GetGpuTargetConfig(
-      se_client->client()->backend().default_stream_executor());
-  StreamExecutorGpuCompiler compiler(gpu_target_config);
 
   TF_ASSERT_OK_AND_ASSIGN(auto computation, GetXlaComputation(kProgram));
   TF_ASSERT_OK_AND_ASSIGN(auto topology, se_client->GetTopologyDescription());
-  TF_ASSERT_OK_AND_ASSIGN(auto executable,
-                          compiler.Compile(xla::CompileOptions(), computation,
-                                           *topology, /*client=*/nullptr));
   TF_ASSERT_OK_AND_ASSIGN(
-      auto loaded_executable,
-      se_client->Load(std::move(executable), LoadOptions()));
+      auto executable,
+      compiler.Compile(xla::CompileOptions(), computation, *topology, nullptr));
+  TF_ASSERT_OK_AND_ASSIGN(auto loaded_executable,
+                          se_client->Load(std::move(executable)));
   TF_ASSERT_OK_AND_ASSIGN(
       auto result, loaded_executable->Execute(/*argument_handles=*/{{}}, {}));
   ValidateResult(result);
 }
 
 TEST(StreamExecutorGpuCompilerTest, SuccessLoadFromSerializedExecutable) {
+  ASSERT_OK_AND_ASSIGN(const gpu::GpuTargetConfig gpu_config,
+                       GetGpuTargetConfig());
+  CompileOptions options = xla::CompileOptions();
+  StreamExecutorGpuCompiler compiler(gpu_config);
+
   TF_ASSERT_OK_AND_ASSIGN(
       auto client, GetStreamExecutorGpuClient(true, /*allocator_config=*/{},
                                               /*node_id=*/0));
   auto se_client = absl::WrapUnique(
       tensorflow::down_cast<StreamExecutorGpuClient*>(client.release()));
-  auto gpu_compiler = gpu::NVPTXCompiler();
-  gpu::GpuTargetConfig gpu_target_config = gpu_compiler.GetGpuTargetConfig(
-      se_client->client()->backend().default_stream_executor());
-  StreamExecutorGpuCompiler compiler(gpu_target_config);
 
   TF_ASSERT_OK_AND_ASSIGN(auto computation, GetXlaComputation(kProgram));
   TF_ASSERT_OK_AND_ASSIGN(auto topology, se_client->GetTopologyDescription());
-  TF_ASSERT_OK_AND_ASSIGN(auto executable,
-                          compiler.Compile(xla::CompileOptions(), computation,
-                                           *topology, /*client=*/nullptr));
+  TF_ASSERT_OK_AND_ASSIGN(
+      auto executable,
+      compiler.Compile(xla::CompileOptions(), computation, *topology, nullptr));
 
   // Serialize the executable and load it.
   TF_ASSERT_OK_AND_ASSIGN(std::string serialized_executable,
                           executable->SerializeExecutable());
   TF_ASSERT_OK_AND_ASSIGN(
       auto loaded_executable,
-      se_client->LoadSerializedExecutable(serialized_executable, std::nullopt,
-                                          LoadOptions()));
+      se_client->LoadSerialized(serialized_executable, std::nullopt,
+                                LoadOptions()));
 
   TF_ASSERT_OK_AND_ASSIGN(
       auto result, loaded_executable->Execute(/*argument_handles=*/{{}}, {}));
   ValidateResult(result);
 }
 
-TEST(StreamExecutorGpuCompilerTest, SuccessCrossCompileAndLoad) {
-  CompileOptions options = xla::CompileOptions();
-  TF_ASSERT_OK_AND_ASSIGN(XlaComputation computation,
-                          GetXlaComputation(kProgram));
-
-  TF_ASSERT_OK_AND_ASSIGN(
-      auto client, GetStreamExecutorGpuClient(true, /*allocator_config=*/{},
-                                              /*node_id=*/0));
-  TF_ASSERT_OK_AND_ASSIGN(auto topology, client->GetTopologyDescription());
-
-  TF_ASSERT_OK_AND_ASSIGN(
-      auto executable,
-      xla::PjRtCompile(options, computation, *topology, client.get()));
-  TF_ASSERT_OK_AND_ASSIGN(std::string serialized,
-                          executable->SerializeExecutable());
-
-  TF_ASSERT_OK_AND_ASSIGN(auto loaded_executable,
-                          client->LoadSerializedExecutable(
-                              serialized, std::nullopt, LoadOptions()));
-  TF_ASSERT_OK_AND_ASSIGN(
-      auto result, loaded_executable->Execute(/*argument_handles=*/{{}}, {}));
-  ValidateResult(result);
-}
-
 }  // namespace
 }  // namespace xla
diff --git a/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_compiler_test.cc b/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_compiler_test.cc
index 925180aa7c16db..d4bc1beb3f74e4 100644
--- a/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_compiler_test.cc
+++ b/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_compiler_test.cc
@@ -56,6 +56,17 @@ absl::StatusOr<xla::XlaComputation> GetXlaComputation(
   return XlaComputation(hlo_module->ToProto());
 }
 
+TEST(StreamExecutorGpuCompilerTest, NoClientXla) {
+  StreamExecutorGpuCompiler compiler;
+  StreamExecutorGpuTopologyDescription topology(GpuId(), GpuName(),
+                                                "Fake_device", {0, 1});
+
+  TF_ASSERT_OK_AND_ASSIGN(auto computation, GetXlaComputation(kProgram));
+  EXPECT_THAT(compiler.Compile(xla::CompileOptions(), computation, topology,
+                               /*client=*/nullptr),
+              StatusIs(absl::StatusCode::kUnimplemented));
+}
+
 TEST(StreamExecutorGpuCompilerTest, TopologyNotSameXla) {
   StreamExecutorGpuCompiler compiler;
   StreamExecutorGpuTopologyDescription topology(GpuId(), GpuName(),

From 047bc82d7c654376822bfa0cfe0976c84266e059 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 28 Sep 2023 15:04:35 -0700
Subject: [PATCH 401/567] [mlir][sparse] Change to new syntax in mlir tests

Example:
#sparse_tensor.encoding<{ lvlTypes = [ "compressed" ] }> becomes
#sparse_tensor.encoding<{ map = (d0) -> (d0 : compressed) }>

FileCheck CHECK lines that spelled out the full encoding are changed to a more general pattern for sparse encodings: #sparse_tensor.encoding<{{{.*}}}>.

PiperOrigin-RevId: 569301180
---
 .../convert-mhlo-quant-to-int-no-chlo.mlir    |  6 +--
 .../bridge/convert-mhlo-quant-to-int.mlir     | 10 ++---
 .../tests/hlo_xla_runtime_pipeline.mlir       |  6 +--
 .../tf2xla/tests/hlo_xla_sparsification.mlir  |  2 +-
 .../chlo/sparse_chlo_legalize_to_linalg.mlir  |  5 +--
 .../Dialect/mhlo/hlo-legalize-to-linalg.mlir  |  6 +--
 .../mhlo/hlo-legalize-to-stablehlo.mlir       |  6 +--
 .../mhlo/mhlo_infer_shape_type_methods.mlir   | 10 ++---
 .../Dialect/mhlo/mhlo_ops_prettyprint.mlir    |  6 +--
 .../xla/mlir_hlo/tests/Dialect/mhlo/ops.mlir  |  2 +-
 .../Dialect/mhlo/sparse_gendot_lower.mlir     |  8 ++--
 .../tests/Dialect/mhlo/sparse_lower.mlir      | 44 +++++++++----------
 .../tests/Dialect/mhlo/sparse_ops.mlir        |  6 +--
 .../tests/Dialect/mhlo/sparse_rewriting.mlir  | 20 ++++-----
 .../tests/Dialect/mhlo/sparse_transpose.mlir  |  2 +-
 .../mhlo/stablehlo-legalize-to-hlo.mlir       |  6 +--
 .../translate/hlo_to_mhlo/tests/import.hlotxt |  4 +-
 .../tests/export-with-layouts.mlir            | 10 ++---
 18 files changed, 78 insertions(+), 81 deletions(-)

diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/tests/bridge/convert-mhlo-quant-to-int-no-chlo.mlir b/tensorflow/compiler/mlir/quantization/stablehlo/tests/bridge/convert-mhlo-quant-to-int-no-chlo.mlir
index 49a49d04b86b5b..8c5e8395b98035 100644
--- a/tensorflow/compiler/mlir/quantization/stablehlo/tests/bridge/convert-mhlo-quant-to-int-no-chlo.mlir
+++ b/tensorflow/compiler/mlir/quantization/stablehlo/tests/bridge/convert-mhlo-quant-to-int-no-chlo.mlir
@@ -31,10 +31,10 @@ func.func @uniform_quantize_and_dequantize_type_exensions(%arg0: tensor<?x?xf32,
 // -----
 
 // CHECK-LABEL: func @uniform_quantize_and_dequantize_sparse_tensor_encoding
-func.func @uniform_quantize_and_dequantize_sparse_tensor_encoding(%arg0: tensor<?xf32, #sparse_tensor.encoding<{ lvlTypes = [ "compressed" ] }>>) -> () {
+func.func @uniform_quantize_and_dequantize_sparse_tensor_encoding(%arg0: tensor<?xf32, #sparse_tensor.encoding<{ map = (d0) -> (d0 : compressed) }>>) -> () {
   // CHECK-NOT: chlo
-  %0 = mhlo.uniform_quantize %arg0 : (tensor<?xf32, #sparse_tensor.encoding<{ lvlTypes = [ "compressed" ] }>>) -> tensor<?x!quant.uniform<i8:f32, 1.000000e+00:3>, #sparse_tensor.encoding<{ lvlTypes = [ "compressed" ] }>>
-  %1 = mhlo.uniform_dequantize %0 : (tensor<?x!quant.uniform<i8:f32, 1.000000e+00:3>, #sparse_tensor.encoding<{ lvlTypes = [ "compressed" ] }>>) -> tensor<?xf32, #sparse_tensor.encoding<{ lvlTypes = [ "compressed" ] }>>
+  %0 = mhlo.uniform_quantize %arg0 : (tensor<?xf32, #sparse_tensor.encoding<{ map = (d0) -> (d0 : compressed) }>>) -> tensor<?x!quant.uniform<i8:f32, 1.000000e+00:3>, #sparse_tensor.encoding<{ map = (d0) -> (d0 : compressed) }>>
+  %1 = mhlo.uniform_dequantize %0 : (tensor<?x!quant.uniform<i8:f32, 1.000000e+00:3>, #sparse_tensor.encoding<{ map = (d0) -> (d0 : compressed) }>>) -> tensor<?xf32, #sparse_tensor.encoding<{ map = (d0) -> (d0 : compressed) }>>
   return
 }
 
diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/tests/bridge/convert-mhlo-quant-to-int.mlir b/tensorflow/compiler/mlir/quantization/stablehlo/tests/bridge/convert-mhlo-quant-to-int.mlir
index 81882209250883..d1acaa6c9d9a63 100644
--- a/tensorflow/compiler/mlir/quantization/stablehlo/tests/bridge/convert-mhlo-quant-to-int.mlir
+++ b/tensorflow/compiler/mlir/quantization/stablehlo/tests/bridge/convert-mhlo-quant-to-int.mlir
@@ -96,11 +96,11 @@ func.func @uniform_quantize_and_dequantize_type_exensions(%arg0: tensor<?x?xf32,
 // -----
 
 // CHECK-LABEL: func @uniform_quantize_and_dequantize_sparse_tensor_encoding
-func.func @uniform_quantize_and_dequantize_sparse_tensor_encoding(%arg0: tensor<?xf32, #sparse_tensor.encoding<{ lvlTypes = [ "compressed" ] }>>) -> () {
-  // CHECK: %[[QUANTIZED:.*]] = mhlo.convert %[[VAL0:.*]] : (tensor<?xf32, #sparse_tensor.encoding<{ lvlTypes = [ "compressed" ] }>>) -> tensor<?xi8, #sparse_tensor.encoding<{ lvlTypes = [ "compressed" ] }>>
-  %0 = mhlo.uniform_quantize %arg0 : (tensor<?xf32, #sparse_tensor.encoding<{ lvlTypes = [ "compressed" ] }>>) -> tensor<?x!quant.uniform<i8:f32, 1.000000e+00:3>, #sparse_tensor.encoding<{ lvlTypes = [ "compressed" ] }>>
-  // CHECK: %[[DEQUANTIZED:.*]] = chlo.broadcast_multiply %[[VAL1:.*]], %[[CONST_SCALE:.*]] : (tensor<?xf32, #sparse_tensor.encoding<{ lvlTypes = [ "compressed" ] }>>, tensor<f32>) -> tensor<?xf32, #sparse_tensor.encoding<{ lvlTypes = [ "compressed" ] }>>
-  %1 = mhlo.uniform_dequantize %0 : (tensor<?x!quant.uniform<i8:f32, 1.000000e+00:3>, #sparse_tensor.encoding<{ lvlTypes = [ "compressed" ] }>>) -> tensor<?xf32, #sparse_tensor.encoding<{ lvlTypes = [ "compressed" ] }>>
+func.func @uniform_quantize_and_dequantize_sparse_tensor_encoding(%arg0: tensor<?xf32, #sparse_tensor.encoding<{ map = (d0) -> (d0 : compressed) }>>) -> () {
+  // CHECK: %[[QUANTIZED:.*]] = mhlo.convert %[[VAL0:.*]] : (tensor<?xf32, #sparse_tensor.encoding<{{{.*}}}>>) -> tensor<?xi8, #sparse_tensor.encoding<{{{.*}}}>>
+  %0 = mhlo.uniform_quantize %arg0 : (tensor<?xf32, #sparse_tensor.encoding<{ map = (d0) -> (d0 : compressed) }>>) -> tensor<?x!quant.uniform<i8:f32, 1.000000e+00:3>, #sparse_tensor.encoding<{ map = (d0) -> (d0 : compressed) }>>
+  // CHECK: %[[DEQUANTIZED:.*]] = chlo.broadcast_multiply %[[VAL1:.*]], %[[CONST_SCALE:.*]] : (tensor<?xf32, #sparse_tensor.encoding<{{{.*}}}>>, tensor<f32>) -> tensor<?xf32, #sparse_tensor.encoding<{{{.*}}}>>
+  %1 = mhlo.uniform_dequantize %0 : (tensor<?x!quant.uniform<i8:f32, 1.000000e+00:3>, #sparse_tensor.encoding<{ map = (d0) -> (d0 : compressed) }>>) -> tensor<?xf32, #sparse_tensor.encoding<{ map = (d0) -> (d0 : compressed) }>>
   return
 }
 
diff --git a/tensorflow/compiler/mlir/tf2xla/tests/hlo_xla_runtime_pipeline.mlir b/tensorflow/compiler/mlir/tf2xla/tests/hlo_xla_runtime_pipeline.mlir
index 11e19b3b1a6813..49521180c69897 100644
--- a/tensorflow/compiler/mlir/tf2xla/tests/hlo_xla_runtime_pipeline.mlir
+++ b/tensorflow/compiler/mlir/tf2xla/tests/hlo_xla_runtime_pipeline.mlir
@@ -9,7 +9,7 @@ func.func @simple_add(%arg0: tensor<f64>) -> tensor<f64> {
 
 // -----
 
-#CSR = #sparse_tensor.encoding<{lvlTypes = [ "dense", "compressed" ]}>
+#CSR = #sparse_tensor.encoding<{map = (d0, d1) -> (d0 : dense, d1 : compressed)}>
 
 // CHECK-LABEL: func.func @csr_gendot(
 // CHECK-SAME:    %[[PTR:.*0]]: memref<?xindex>,
@@ -53,8 +53,8 @@ func.func @csr_gendot(%arg0: tensor<32x64xf64, #CSR>,
 
 // -----
 
-#CSR  = #sparse_tensor.encoding<{ lvlTypes = ["dense", "compressed"] }>
-#DCSR = #sparse_tensor.encoding<{ lvlTypes = ["compressed", "compressed"] }>
+#CSR  = #sparse_tensor.encoding<{ map = (d0, d1) -> (d0 : dense, d1 : compressed) }>
+#DCSR = #sparse_tensor.encoding<{ map = (d0, d1) -> (d0 : compressed, d1 : compressed) }>
 
 // CHECK-LABEL: func.func @convert_nop(
 // CHECK-SAME:    %[[PTR:.*0]]: memref<?xindex>,
diff --git a/tensorflow/compiler/mlir/tf2xla/tests/hlo_xla_sparsification.mlir b/tensorflow/compiler/mlir/tf2xla/tests/hlo_xla_sparsification.mlir
index 920c2ab744cf77..2fcf98eff0600e 100644
--- a/tensorflow/compiler/mlir/tf2xla/tests/hlo_xla_sparsification.mlir
+++ b/tensorflow/compiler/mlir/tf2xla/tests/hlo_xla_sparsification.mlir
@@ -1,6 +1,6 @@
 // RUN: tf-opt -hlo-legalize-to-linalg -hlo-xla-runtime-sparsification %s | FileCheck %s
 
-#SparseVector = #sparse_tensor.encoding<{ lvlTypes = ["compressed"] }>
+#SparseVector = #sparse_tensor.encoding<{ map = (d0) -> (d0 : compressed) }>
 
 // CHECK-LABEL: func.func @mult_sparse_dense(
 // CHECK-SAME:    %[[PTR:.*0]]: memref<?xindex>,
diff --git a/third_party/xla/xla/mlir_hlo/tests/Dialect/chlo/sparse_chlo_legalize_to_linalg.mlir b/third_party/xla/xla/mlir_hlo/tests/Dialect/chlo/sparse_chlo_legalize_to_linalg.mlir
index af49c6f259a0b7..fd218480f38a4d 100644
--- a/third_party/xla/xla/mlir_hlo/tests/Dialect/chlo/sparse_chlo_legalize_to_linalg.mlir
+++ b/third_party/xla/xla/mlir_hlo/tests/Dialect/chlo/sparse_chlo_legalize_to_linalg.mlir
@@ -1,7 +1,7 @@
 // RUN: mlir-hlo-opt --legalize-sparse-ops="legalize-to-custom-calls=false" %s | FileCheck %s
 
 #CSR = #sparse_tensor.encoding<{
-  lvlTypes = ["dense", "compressed"]
+  map = (d0, d1) -> (d0 : dense, d1 : compressed)
 }>
 
 // CHECK-LABEL: @asinh_scalar(
@@ -20,8 +20,7 @@ func.func @asinh_scalar(%arg : tensor<f32>) -> tensor<f32> {
 // CHECK-SAME:      tensor<10x20xf32, #{{.*}}>
 // CHECK:         %[[VAL:.*]] = linalg.generic
 // CHECK-SAME:        ins(%[[ARG]] : tensor<10x20xf32,
-// CHECK-SAME:        #sparse_tensor.encoding<{ lvlTypes = [ "dense", "compressed" ] }>>
-// CHECK-SAME:        outs(%[[OUT]]
+// CHECK-SAME:        #sparse_tensor.encoding<{{{.*}}}>>) outs(%[[OUT]]
 // CHECK:           sparse_tensor.unary %{{.*}} : f32 to f32
 // CHECK:           present = {
 // CHECK:             tensor.from_elements
diff --git a/third_party/xla/xla/mlir_hlo/tests/Dialect/mhlo/hlo-legalize-to-linalg.mlir b/third_party/xla/xla/mlir_hlo/tests/Dialect/mhlo/hlo-legalize-to-linalg.mlir
index c2aa8dcc254fbc..4e4fd592d1862c 100644
--- a/third_party/xla/xla/mlir_hlo/tests/Dialect/mhlo/hlo-legalize-to-linalg.mlir
+++ b/third_party/xla/xla/mlir_hlo/tests/Dialect/mhlo/hlo-legalize-to-linalg.mlir
@@ -6135,15 +6135,15 @@ func.func @clamp_complex(%min: tensor<8xcomplex<f32>>,
 // CHECK-PRIMITIVE-LABEL: func @reshape_sparse_encoding
 
 #ST_3D = #sparse_tensor.encoding<{
-  lvlTypes = ["compressed", "compressed", "compressed"]
+  map = (d0, d1, d2) -> (d0 : compressed, d1 : compressed, d2 : compressed)
 }>
 
 #ST_4D = #sparse_tensor.encoding<{
-  lvlTypes = ["compressed", "compressed", "compressed", "compressed"]
+  map = (d0, d1, d2, d3) -> (d0 : compressed, d1 : compressed, d2 : compressed, d3 : compressed)
 }>
 
 func.func @reshape_sparse_encoding(%arg0: tensor<1x49x16xf32, #ST_3D>) -> tensor<1x784x1x1xf32, #ST_4D> {
   %0 = "mhlo.reshape"(%arg0) : (tensor<1x49x16xf32, #ST_3D>) -> tensor<1x784x1x1xf32, #ST_4D>
   func.return %0 : tensor<1x784x1x1xf32, #ST_4D>
 }
-// CHECK: tensor.reshape %{{.*}} : (tensor<1x49x16xf32, #sparse_tensor.encoding<{ lvlTypes = [ "compressed", "compressed", "compressed" ] }>>, tensor<4xi64>) -> tensor<1x784x1x1xf32, #sparse_tensor.encoding<{ lvlTypes = [ "compressed", "compressed", "compressed", "compressed" ] }>>
+// CHECK: tensor.reshape %{{.*}} : (tensor<1x49x16xf32, #sparse_tensor.encoding<{{{.*}}}>>, tensor<4xi64>) -> tensor<1x784x1x1xf32, #sparse_tensor.encoding<{{{.*}}}>>
diff --git a/third_party/xla/xla/mlir_hlo/tests/Dialect/mhlo/hlo-legalize-to-stablehlo.mlir b/third_party/xla/xla/mlir_hlo/tests/Dialect/mhlo/hlo-legalize-to-stablehlo.mlir
index 91fabf95f4decd..6fbd7ee730ae4f 100644
--- a/third_party/xla/xla/mlir_hlo/tests/Dialect/mhlo/hlo-legalize-to-stablehlo.mlir
+++ b/third_party/xla/xla/mlir_hlo/tests/Dialect/mhlo/hlo-legalize-to-stablehlo.mlir
@@ -1818,9 +1818,9 @@ func.func @type_quantization(%arg0: tensor<!quant.uniform<i8:f32, 34.0:16>>, %ar
 }
 
 // CHECK-LABEL: "type_sparsity"
-func.func @type_sparsity(%arg0: tensor<16xf32, #sparse_tensor.encoding<{ lvlTypes = [ "compressed" ] }>>) -> tensor<16xf32> {
-  // CHECK: "stablehlo.abs"(%arg0) : (tensor<16xf32, #sparse_tensor.encoding<{ lvlTypes = [ "compressed" ] }>>) -> tensor<16xf32>
-  %0 = "mhlo.abs"(%arg0) : (tensor<16xf32, #sparse_tensor.encoding<{ lvlTypes = [ "compressed" ] }>>) -> tensor<16xf32>
+func.func @type_sparsity(%arg0: tensor<16xf32, #sparse_tensor.encoding<{ map = (d0) -> (d0 : compressed) }>>) -> tensor<16xf32> {
+  // CHECK: "stablehlo.abs"(%arg0) : (tensor<16xf32, #sparse_tensor.encoding<{{{.*}}}>>) -> tensor<16xf32>
+  %0 = "mhlo.abs"(%arg0) : (tensor<16xf32, #sparse_tensor.encoding<{ map = (d0) -> (d0 : compressed) }>>) -> tensor<16xf32>
   func.return %0 : tensor<16xf32>
 }
 
diff --git a/third_party/xla/xla/mlir_hlo/tests/Dialect/mhlo/mhlo_infer_shape_type_methods.mlir b/third_party/xla/xla/mlir_hlo/tests/Dialect/mhlo/mhlo_infer_shape_type_methods.mlir
index 9bdb69a5e2d36b..3ebe9d38969e6a 100644
--- a/third_party/xla/xla/mlir_hlo/tests/Dialect/mhlo/mhlo_infer_shape_type_methods.mlir
+++ b/third_party/xla/xla/mlir_hlo/tests/Dialect/mhlo/mhlo_infer_shape_type_methods.mlir
@@ -895,7 +895,7 @@ func.func @scatter_bounds(%input_tensor: tensor<200x?x?xf32, #mhlo.type_extensio
 //===----------------------------------------------------------------------===//
 
 #CSR = #sparse_tensor.encoding<{
-  lvlTypes = ["dense", "compressed"]
+  map = (d0, d1) -> (d0 : dense, d1 : compressed)
 }>
 
 // CHECK-LABEL: @tanh_sparsity
@@ -909,7 +909,7 @@ func.func @tanh_sparsity(%arg0: tensor<10x10xf32, #CSR>) -> tensor<10x10xindex>
 // -----
 
 #CSR = #sparse_tensor.encoding<{
-  lvlTypes = ["dense", "compressed"]
+  map = (d0, d1) -> (d0 : dense, d1 : compressed)
 }>
 
 // CHECK-LABEL: @abs_sparsity
@@ -923,7 +923,7 @@ func.func @abs_sparsity(%arg0: tensor<10x10xf32, #CSR>) -> tensor<10x10xindex> {
 // -----
 
 #CSR = #sparse_tensor.encoding<{
-  lvlTypes = ["dense", "compressed"]
+  map = (d0, d1) -> (d0 : dense, d1 : compressed)
 }>
 
 // CHECK-LABEL: @real_sparsity
@@ -937,7 +937,7 @@ func.func @real_sparsity(%arg0: tensor<10x10xcomplex<f32>, #CSR>) -> tensor<10x1
 // -----
 
 #CSR = #sparse_tensor.encoding<{
-  lvlTypes = ["dense", "compressed"]
+  map = (d0, d1) -> (d0 : dense, d1 : compressed)
 }>
 
 // CHECK-LABEL: @imag_sparsity
@@ -951,7 +951,7 @@ func.func @imag_sparsity(%arg0: tensor<10x10xcomplex<f32>, #CSR>) -> tensor<10x1
 // -----
 
 #CSR = #sparse_tensor.encoding<{
-  lvlTypes = ["dense", "compressed"]
+  map = (d0, d1) -> (d0 : dense, d1 : compressed)
 }>
 
 // CHECK-LABEL: @complex_sparsity
diff --git a/third_party/xla/xla/mlir_hlo/tests/Dialect/mhlo/mhlo_ops_prettyprint.mlir b/third_party/xla/xla/mlir_hlo/tests/Dialect/mhlo/mhlo_ops_prettyprint.mlir
index feb479578a42fd..665aa50545d204 100644
--- a/third_party/xla/xla/mlir_hlo/tests/Dialect/mhlo/mhlo_ops_prettyprint.mlir
+++ b/third_party/xla/xla/mlir_hlo/tests/Dialect/mhlo/mhlo_ops_prettyprint.mlir
@@ -231,11 +231,11 @@ func.func @extensions(%arg0 : tensor<?x?xf32, #mhlo.type_extensions<bounds = [3,
 // -----
 
 #CSR = #sparse_tensor.encoding<{
-  lvlTypes = ["dense", "compressed"]
+  map = (d0, d1) -> (d0 : dense, d1 : compressed)
 }>
 
 #DCSR = #sparse_tensor.encoding<{
-  lvlTypes = ["compressed", "compressed"]
+  map = (d0, d1) -> (d0 : compressed, d1 : compressed)
 }>
 
 // CHECK-LABEL: func @encodings
@@ -245,7 +245,7 @@ func.func @encodings(%arg0: tensor<10x20xf32, #CSR>,
   // CHECK-NEXT: %1 = mhlo.add %arg1, %arg1 : tensor<10x20xf32, #sparse_tensor.encoding<{{.*}}>>
   // CHECK-NEXT: %2 = mhlo.abs %arg0 : (tensor<10x20xf32, #sparse_tensor.encoding<{{.*}}>>) -> tensor<10x20xf32>
   // CHECK-NEXT: %3 = mhlo.abs %arg0 : tensor<10x20xf32, #sparse_tensor.encoding<{{.*}}>>
-  // CHECK-NEXT: %4 = mhlo.complex %arg0, %arg0 : (tensor<10x20xf32, #sparse_tensor.encoding<{ lvlTypes = [ "dense", "compressed" ] }>>, tensor<10x20xf32, #sparse_tensor.encoding<{ lvlTypes = [ "dense", "compressed" ] }>>) -> tensor<10x20xcomplex<f32>>
+  // CHECK-NEXT: %4 = mhlo.complex %arg0, %arg0 : (tensor<10x20xf32, #sparse_tensor.encoding<{{{.*}}}>>, tensor<10x20xf32, #sparse_tensor.encoding<{{{.*}}}>>) -> tensor<10x20xcomplex<f32>>
   %0 = "mhlo.add"(%arg0, %arg1) : (tensor<10x20xf32, #CSR>,
                                    tensor<10x20xf32, #DCSR>) -> tensor<10x20xf32>
   %1 = "mhlo.add"(%arg1, %arg1) : (tensor<10x20xf32, #DCSR>,
diff --git a/third_party/xla/xla/mlir_hlo/tests/Dialect/mhlo/ops.mlir b/third_party/xla/xla/mlir_hlo/tests/Dialect/mhlo/ops.mlir
index 1030ce4c5dd0a9..4ba4e4085afae6 100644
--- a/third_party/xla/xla/mlir_hlo/tests/Dialect/mhlo/ops.mlir
+++ b/third_party/xla/xla/mlir_hlo/tests/Dialect/mhlo/ops.mlir
@@ -6142,7 +6142,7 @@ func.func @scatter_variadic(%arg0: tensor<3xi32>, %arg1: tensor<1x1xi32>,
 
 
 #SV = #sparse_tensor.encoding<{
-  lvlTypes = ["compressed"]
+  map = (d0) -> (d0 : compressed)
 }>
 
 func.func @is_compatible_sparse_mix_non_sparse(%arg0: tensor<1xf32>, %arg1: tensor<1xf32, #SV>) {
diff --git a/third_party/xla/xla/mlir_hlo/tests/Dialect/mhlo/sparse_gendot_lower.mlir b/third_party/xla/xla/mlir_hlo/tests/Dialect/mhlo/sparse_gendot_lower.mlir
index 3ac5a8d53d2e74..8f5e71b7bf5f28 100644
--- a/third_party/xla/xla/mlir_hlo/tests/Dialect/mhlo/sparse_gendot_lower.mlir
+++ b/third_party/xla/xla/mlir_hlo/tests/Dialect/mhlo/sparse_gendot_lower.mlir
@@ -2,9 +2,9 @@
 // RUN: --verify-diagnostics \
 // RUN: --mhlo-test-lower-general-dot --canonicalize | FileCheck %s
 
-#SV  = #sparse_tensor.encoding<{ lvlTypes = [ "compressed" ] }>
-#CSR = #sparse_tensor.encoding<{ lvlTypes = [ "compressed", "compressed" ] }>
-#COO = #sparse_tensor.encoding<{ lvlTypes = [ "compressed_nu", "singleton_nu", "singleton" ] }>
+#SV  = #sparse_tensor.encoding<{ map = (d0) -> (d0 : compressed) }>
+#CSR = #sparse_tensor.encoding<{ map = (d0, d1) -> (d0 : compressed, d1 : compressed) }>
+#COO = #sparse_tensor.encoding<{ map = (d0, d1, d2) -> (d0 : compressed(nonunique), d1 : singleton(nonunique), d2 : singleton) }>
 
 //
 // Vector-vector gendot.
@@ -78,7 +78,7 @@ func.func @sparse_matmat_1s(%arg0: tensor<16x32xf64, #CSR>,
 // CHECK-LABEL: func.func @sparse_matmat_as(
 // CHECK-SAME:    %[[ARG0:.*]]: tensor<16x32xf64, #sparse_tensor.encoding<{{{.*}}}>>,
 // CHECK-SAME:    %[[ARG1:.*]]: tensor<32x64xf64, #sparse_tensor.encoding<{{{.*}}}>>) -> tensor<16x64xf64, #sparse_tensor.encoding<{{{.*}}}>> {
-// CHECK:         %[[DOT:.*]] = "mhlo.dot"(%[[ARG0]], %[[ARG1]]) {precision_config = [#mhlo<precision DEFAULT>, #mhlo<precision DEFAULT>]} : (tensor<16x32xf64, #sparse_tensor.encoding<{ lvlTypes = [ "compressed", "compressed" ] }>>, tensor<32x64xf64, #sparse_tensor.encoding<{ lvlTypes = [ "compressed", "compressed" ] }>>) -> tensor<16x64xf64, #sparse_tensor.encoding<{ lvlTypes = [ "compressed", "compressed" ] }>>
+// CHECK:         %[[DOT:.*]] = "mhlo.dot"(%[[ARG0]], %[[ARG1]]) {precision_config = [#mhlo<precision DEFAULT>, #mhlo<precision DEFAULT>]} : (tensor<16x32xf64, #sparse_tensor.encoding<{{{.*}}}>>, tensor<32x64xf64, #sparse_tensor.encoding<{{{.*}}}>>) -> tensor<16x64xf64, #sparse_tensor.encoding<{{{.*}}}>>
 // CHECK:         return %[[DOT]] : tensor<16x64xf64, #sparse_tensor.encoding<{{{.*}}}>>
 // CHECK:       }
 //
diff --git a/third_party/xla/xla/mlir_hlo/tests/Dialect/mhlo/sparse_lower.mlir b/third_party/xla/xla/mlir_hlo/tests/Dialect/mhlo/sparse_lower.mlir
index 7318d78e2c6172..1567f01c55a0f1 100644
--- a/third_party/xla/xla/mlir_hlo/tests/Dialect/mhlo/sparse_lower.mlir
+++ b/third_party/xla/xla/mlir_hlo/tests/Dialect/mhlo/sparse_lower.mlir
@@ -7,25 +7,25 @@
 // properly dealt with while lowering mhlo ops to linalg ops.
 
 #SV = #sparse_tensor.encoding<{
-  lvlTypes = ["compressed"]
+  map = (d0) -> (d0 : compressed)
 }>
 
 #CSR = #sparse_tensor.encoding<{
-  lvlTypes = ["dense", "compressed"]
+  map = (d0, d1) -> (d0 : dense, d1 : compressed)
 }>
 
 #DCSR = #sparse_tensor.encoding<{
-  lvlTypes = ["compressed", "compressed"]
+  map = (d0, d1) -> (d0 : compressed, d1 : compressed)
 }>
 
 #ST = #sparse_tensor.encoding<{
-  lvlTypes = ["compressed", "compressed", "compressed"]
+  map = (d0, d1, d2) -> (d0 : compressed, d1 : compressed, d2 : compressed)
 }>
 
 // CHECK-LABEL: func @sparse_abs_eltwise(
 // CHECK-SAME:    %[[ARG0:.*]]: tensor<10x20xf32, #{{.*}}>) -> tensor<10x20xf32, #{{.*}}> {
 // CHECK:         %[[OUT:.*]] = bufferization.alloc_tensor() : tensor<10x20xf32, #{{.*}}>
-// CHECK:         %[[VAL:.*]] = linalg.generic {{{.*}} ins(%[[ARG0]] : tensor<10x20xf32, #sparse_tensor.encoding<{ lvlTypes = [ "dense", "compressed" ] }>>) outs(%[[OUT]] : tensor<10x20xf32, #sparse_tensor.encoding<{ lvlTypes = [ "compressed", "compressed" ] }>>)
+// CHECK:         %[[VAL:.*]] = linalg.generic {{{.*}} ins(%[[ARG0]] : tensor<10x20xf32, #sparse_tensor.encoding<{{{.*}}}>>) outs(%[[OUT]] : tensor<10x20xf32, #sparse_tensor.encoding<{{{.*}}}>>)
 // CHECK:         ^bb0(%[[A:.*]]: f32, %[[B:.*]]: f32):
 // CHECK:           %[[ABS:.*]] = math.absf %[[A]] : f32
 // CHECK:           linalg.yield %[[ABS]] : f32
@@ -43,7 +43,7 @@ func.func @sparse_abs_eltwise(%arg0: tensor<10x20xf32, #CSR>)
 // CHECK-SAME:    %[[ARG0:.*]]: tensor<10x20xf32, #{{.*}}>,
 // CHECK-SAME:    %[[ARG1:.*]]: tensor<10x20xf32, #{{.*}}>) -> tensor<10x20xf32, #{{.*}}> {
 // CHECK:         %[[OUT:.*]] = bufferization.alloc_tensor() : tensor<10x20xf32, #{{.*}}>
-// CHECK:         %[[VAL:.*]] = linalg.generic {{{.*}}} ins(%[[ARG0]], %[[ARG1]] : tensor<10x20xf32, #sparse_tensor.encoding<{ lvlTypes = [ "dense", "compressed" ] }>>, tensor<10x20xf32, #sparse_tensor.encoding<{ lvlTypes = [ "compressed", "compressed" ] }>>) outs(%[[OUT]] : tensor<10x20xf32, #sparse_tensor.encoding<{ lvlTypes = [ "dense", "compressed" ] }>>) {
+// CHECK:         %[[VAL:.*]] = linalg.generic {{{.*}}} ins(%[[ARG0]], %[[ARG1]] : tensor<10x20xf32, #sparse_tensor.encoding<{{{.*}}}>>, tensor<10x20xf32, #sparse_tensor.encoding<{{{.*}}}>>) outs(%[[OUT]] : tensor<10x20xf32, #sparse_tensor.encoding<{{{.*}}}>>) {
 // CHECK:           ^bb0(%[[A:.*]]: f32, %[[B:.*]]: f32, %[[C:.*]]: f32):
 // CHECK:             %[[ADD:.*]] = arith.addf %[[A]], %[[B]] : f32
 // CHECK:             linalg.yield %[[ADD]] : f32
@@ -63,7 +63,7 @@ func.func @sparse_add_eltwise(%arg0: tensor<10x20xf32, #CSR>,
 // CHECK-SAME:    %[[ARG0:.*]]: tensor<10x20xf32, #{{.*}}>,
 // CHECK-SAME:    %[[ARG1:.*]]: tensor<10x20xf32, #{{.*}}>) -> tensor<10x20xf32, #{{.*}}> {
 // CHECK:         %[[OUT:.*]] = bufferization.alloc_tensor() : tensor<10x20xf32, #{{.*}}>
-// CHECK:         %[[VAL:.*]] = linalg.generic {{{.*}}} ins(%[[ARG0]], %[[ARG1]] : tensor<10x20xf32, #sparse_tensor.encoding<{ lvlTypes = [ "dense", "compressed" ] }>>, tensor<10x20xf32, #sparse_tensor.encoding<{ lvlTypes = [ "compressed", "compressed" ] }>>) outs(%[[OUT]] : tensor<10x20xf32, #sparse_tensor.encoding<{ lvlTypes = [ "dense", "compressed" ] }>>) {
+// CHECK:         %[[VAL:.*]] = linalg.generic {{{.*}}} ins(%[[ARG0]], %[[ARG1]] : tensor<10x20xf32, #sparse_tensor.encoding<{{{.*}}}>>, tensor<10x20xf32, #sparse_tensor.encoding<{{{.*}}}>>) outs(%[[OUT]] : tensor<10x20xf32, #sparse_tensor.encoding<{{{.*}}}>>) {
 // CHECK:           ^bb0(%[[A:.*]]: f32, %[[B:.*]]: f32, %[[C:.*]]: f32):
 // CHECK:             %[[ADD:.*]] = arith.mulf %[[A]], %[[B]] : f32
 // CHECK:             linalg.yield %[[ADD]] : f32
@@ -81,19 +81,19 @@ func.func @sparse_mul_eltwise(%arg0: tensor<10x20xf32, #CSR>,
 
 // CHECK-LABEL: func @sparse_math(
 // CHECK-SAME:    %[[ARG0:.*]]: tensor<10x20x30xf64, #{{.*}}>) -> tensor<10x20x30xf64, #{{.*}}> {
-// CHECK:         %[[T0:.*]] = linalg.generic {{{.*}}} ins(%[[ARG0]] : tensor<10x20x30xf64, #sparse_tensor.encoding<{ lvlTypes = [ "compressed", "compressed", "compressed" ] }>>) outs
+// CHECK:         %[[T0:.*]] = linalg.generic {{{.*}}} ins(%[[ARG0]] : tensor<10x20x30xf64, #sparse_tensor.encoding<{{{.*}}}>>) outs
 // CHECK:            math.absf
 // CHECK:         }
-// CHECK:         %[[T1:.*]] = linalg.generic {{{.*}}} ins(%[[T0]] : tensor<10x20x30xf64, #sparse_tensor.encoding<{ lvlTypes = [ "compressed", "compressed", "compressed" ] }>>) outs
+// CHECK:         %[[T1:.*]] = linalg.generic {{{.*}}} ins(%[[T0]] : tensor<10x20x30xf64, #sparse_tensor.encoding<{{{.*}}}>>) outs
 // CHECK:            math.expm1
 // CHECK:         }
-// CHECK:         %[[T2:.*]] = linalg.generic {{{.*}}} ins(%[[T1]] : tensor<10x20x30xf64, #sparse_tensor.encoding<{ lvlTypes = [ "compressed", "compressed", "compressed" ] }>>) outs
+// CHECK:         %[[T2:.*]] = linalg.generic {{{.*}}} ins(%[[T1]] : tensor<10x20x30xf64, #sparse_tensor.encoding<{{{.*}}}>>) outs
 // CHECK:           math.log1p
 // CHECK:         }
-// CHECK:         %[[T3:.*]] = linalg.generic {{{.*}}} ins(%[[T2]] : tensor<10x20x30xf64, #sparse_tensor.encoding<{ lvlTypes = [ "compressed", "compressed", "compressed" ] }>>) outs
+// CHECK:         %[[T3:.*]] = linalg.generic {{{.*}}} ins(%[[T2]] : tensor<10x20x30xf64, #sparse_tensor.encoding<{{{.*}}}>>) outs
 // CHECK:           arith.negf
 // CHECK:         }
-// CHECK:         %[[T4:.*]] = linalg.generic {{{.*}}} ins(%[[T3]] : tensor<10x20x30xf64, #sparse_tensor.encoding<{ lvlTypes = [ "compressed", "compressed", "compressed" ] }>>) outs
+// CHECK:         %[[T4:.*]] = linalg.generic {{{.*}}} ins(%[[T3]] : tensor<10x20x30xf64, #sparse_tensor.encoding<{{{.*}}}>>) outs
 // CHECK:           sparse_tensor.unary %{{.*}} : f64 to f64
 // CHECK:           present = {
 // CHECK:             math.copysign
@@ -102,19 +102,19 @@ func.func @sparse_mul_eltwise(%arg0: tensor<10x20xf32, #CSR>,
 // CHECK:           absent = {
 // CHECK:           }
 // CHECK:         }
-// CHECK:         %[[T5:.*]] = linalg.generic {{{.*}}} ins(%[[T4]] : tensor<10x20x30xf64, #sparse_tensor.encoding<{ lvlTypes = [ "compressed", "compressed", "compressed" ] }>>) outs
+// CHECK:         %[[T5:.*]] = linalg.generic {{{.*}}} ins(%[[T4]] : tensor<10x20x30xf64, #sparse_tensor.encoding<{{{.*}}}>>) outs
 // CHECK:           math.sin
 // CHECK:         }
-// CHECK:         %[[T6:.*]] = linalg.generic {{{.*}}} ins(%[[T5]] : tensor<10x20x30xf64, #sparse_tensor.encoding<{ lvlTypes = [ "compressed", "compressed", "compressed" ] }>>) outs
+// CHECK:         %[[T6:.*]] = linalg.generic {{{.*}}} ins(%[[T5]] : tensor<10x20x30xf64, #sparse_tensor.encoding<{{{.*}}}>>) outs
 // CHECK:           math.sqrt
 // CHECK:         }
-// CHECK:         %[[T7:.*]] = linalg.generic {{{.*}}} ins(%[[T6]] : tensor<10x20x30xf64, #sparse_tensor.encoding<{ lvlTypes = [ "compressed", "compressed", "compressed" ] }>>) outs
+// CHECK:         %[[T7:.*]] = linalg.generic {{{.*}}} ins(%[[T6]] : tensor<10x20x30xf64, #sparse_tensor.encoding<{{{.*}}}>>) outs
 // CHECK:           math.tanh
 // CHECK:         }
-// CHECK:         %[[T8:.*]] = linalg.generic {{{.*}}} ins(%[[T7]] : tensor<10x20x30xf64, #sparse_tensor.encoding<{ lvlTypes = [ "compressed", "compressed", "compressed" ] }>>) outs
+// CHECK:         %[[T8:.*]] = linalg.generic {{{.*}}} ins(%[[T7]] : tensor<10x20x30xf64, #sparse_tensor.encoding<{{{.*}}}>>) outs
 // CHECK:           math.ceil
 // CHECK:         }
-// CHECK:         %[[T9:.*]] = linalg.generic {{{.*}}} ins(%[[T8]] : tensor<10x20x30xf64, #sparse_tensor.encoding<{ lvlTypes = [ "compressed", "compressed", "compressed" ] }>>) outs
+// CHECK:         %[[T9:.*]] = linalg.generic {{{.*}}} ins(%[[T8]] : tensor<10x20x30xf64, #sparse_tensor.encoding<{{{.*}}}>>) outs
 // CHECK:           math.floor
 // CHECK:         }
 // CHECK:         return %[[T9]] : tensor<10x20x30xf64, #{{.*}}>
@@ -175,7 +175,7 @@ func.func @sparse_int_abs(%arg0: tensor<100xi64, #SV>) -> tensor<100xi64> {
 
 // CHECK-LABEL: func @sparse_reduce(
 // CHECK-SAME:    %[[ARG0:.*]]: tensor<10xi64, #{{.*}}>) -> tensor<i64> {
-// CHECK:         %[[T0:.*]] = linalg.generic {{{.*}}} ins(%[[ARG0]] : tensor<10xi64, #sparse_tensor.encoding<{ lvlTypes = [ "compressed" ] }>>)
+// CHECK:         %[[T0:.*]] = linalg.generic {{{.*}}} ins(%[[ARG0]] : tensor<10xi64, #sparse_tensor.encoding<{{{.*}}}>>)
 // CHECK:           arith.addi
 // CHECK:         }
 // CHECK:         return %[[T0]] : tensor<i64>
@@ -193,7 +193,7 @@ func.func @sparse_reduce(%arg0: tensor<10xi64, #SV>) -> tensor<i64> {
 // CHECK-LABEL: func @sparse_dot(
 // CHECK-SAME:    %[[ARG0:.*]]: tensor<?xf32, #{{.*}}>,
 // CHECK-SAME:    %[[ARG1:.*]]: tensor<?xf32, #{{.*}}>) -> tensor<f32> {
-// CHECK:         %[[T0:.*]] = linalg.generic {{{.*}}} ins(%[[ARG0]], %[[ARG1]] : tensor<?xf32, #sparse_tensor.encoding<{ lvlTypes = [ "compressed" ] }>>, tensor<?xf32, #sparse_tensor.encoding<{ lvlTypes = [ "compressed" ] }>>)
+// CHECK:         %[[T0:.*]] = linalg.generic {{{.*}}} ins(%[[ARG0]], %[[ARG1]] : tensor<?xf32, #sparse_tensor.encoding<{{{.*}}}>>, tensor<?xf32, #sparse_tensor.encoding<{{{.*}}}>>)
 // CHECK:           arith.mulf
 // CHECK:           arith.addf
 // CHECK:         }
@@ -212,11 +212,11 @@ func.func @sparse_dot(%arg0: tensor<?xf32, #SV>,
 
 // CHECK-LABEL: func @sparse_transpose(
 // CHECK-SAME:    %[[ARG0:.*]]: tensor<100x200xf64, #{{.*}}>) -> tensor<200x100xf64, #{{.*}}> {
-// CHECK:         %[[T0:.*]] = bufferization.alloc_tensor() : tensor<200x100xf64, #sparse_tensor.encoding<{ lvlTypes = [ "compressed", "compressed" ] }>>
-// CHECK:         %[[T1:.*]] = linalg.generic {{.*}} ins(%[[ARG0]] : tensor<100x200xf64, #sparse_tensor.encoding<{ lvlTypes = [ "dense", "compressed" ] }>>) outs(%[[T0]] : tensor<200x100xf64, #sparse_tensor.encoding<{ lvlTypes = [ "compressed", "compressed" ] }>>) {
+// CHECK:         %[[T0:.*]] = bufferization.alloc_tensor() : tensor<200x100xf64, #sparse_tensor.encoding<{{{.*}}}>>
+// CHECK:         %[[T1:.*]] = linalg.generic {{.*}} ins(%[[ARG0]] : tensor<100x200xf64, #sparse_tensor.encoding<{{{.*}}}>>) outs(%[[T0]] : tensor<200x100xf64, #sparse_tensor.encoding<{{{.*}}}>>) {
 // CHECK:           linalg.yield
 // CHECK:         }
-// CHECK:         return %[[T1]] : tensor<200x100xf64, #sparse_tensor.encoding<{ lvlTypes = [ "compressed", "compressed" ] }>>
+// CHECK:         return %[[T1]] : tensor<200x100xf64, #sparse_tensor.encoding<{{{.*}}}>>
 // CHECK:       }
 func.func @sparse_transpose(%arg0: tensor<100x200xf64, #CSR>)
                                 -> tensor<200x100xf64, #DCSR> {
diff --git a/third_party/xla/xla/mlir_hlo/tests/Dialect/mhlo/sparse_ops.mlir b/third_party/xla/xla/mlir_hlo/tests/Dialect/mhlo/sparse_ops.mlir
index 89c3783a724b44..e5ea921f6df2a7 100644
--- a/third_party/xla/xla/mlir_hlo/tests/Dialect/mhlo/sparse_ops.mlir
+++ b/third_party/xla/xla/mlir_hlo/tests/Dialect/mhlo/sparse_ops.mlir
@@ -6,15 +6,15 @@
 // output types), dense or sparse ops are semantically equivalent.
 
 #SV = #sparse_tensor.encoding<{
-  lvlTypes = ["compressed"]
+  map = (d0) -> (d0 : compressed)
 }>
 
 #CSR = #sparse_tensor.encoding<{
-  lvlTypes = ["dense", "compressed"]
+  map = (d0, d1) -> (d0 : dense, d1 : compressed)
 }>
 
 #DCSR = #sparse_tensor.encoding<{
-  lvlTypes = ["compressed", "compressed"]
+  map = (d0, d1) -> (d0 : compressed, d1 : compressed)
 }>
 
 //
diff --git a/third_party/xla/xla/mlir_hlo/tests/Dialect/mhlo/sparse_rewriting.mlir b/third_party/xla/xla/mlir_hlo/tests/Dialect/mhlo/sparse_rewriting.mlir
index 3e7b17a6a3e671..dcb0f778b545b4 100644
--- a/third_party/xla/xla/mlir_hlo/tests/Dialect/mhlo/sparse_rewriting.mlir
+++ b/third_party/xla/xla/mlir_hlo/tests/Dialect/mhlo/sparse_rewriting.mlir
@@ -4,14 +4,14 @@
 
 // Verifies that mhlo sparse tensor type rewriting occurs.
 
-#SV= #sparse_tensor.encoding<{ lvlTypes = ["compressed"] }>
+#SV= #sparse_tensor.encoding<{ map = (d0) -> (d0 : compressed) }>
 
 #CSR = #sparse_tensor.encoding<{
-  lvlTypes = ["dense", "compressed"]
+  map = (d0, d1) -> (d0 : dense, d1 : compressed)
 }>
 
 #DCSR = #sparse_tensor.encoding<{
-  lvlTypes = ["compressed", "compressed"]
+  map = (d0, d1) -> (d0 : compressed, d1 : compressed)
 }>
 
 // CHECK-LABEL: func @rewrite_unary(
@@ -39,7 +39,7 @@ func.func @rewrite_binary(%arg0: tensor<100xf64>,
 // CHECK-LABEL: func @rewrite_binary_override(
 // CHECK-SAME:    %[[ARG0:.*]]: tensor<10x10xf64, #sparse_tensor.encoding<{{{.*}}}>>,
 // CHECK-SAME:    %[[ARG1:.*]]: tensor<10x10xf64, #sparse_tensor.encoding<{{{.*}}}>>) -> tensor<10x10xf64, #sparse_tensor.encoding<{{{.*}}}>> {
-// CHECK:         %[[VAL:.*]] = mhlo.multiply %[[ARG0]], %[[ARG1]] : (tensor<10x10xf64, #sparse_tensor.encoding<{ lvlTypes = [ "dense", "compressed" ] }>>, tensor<10x10xf64, #sparse_tensor.encoding<{ lvlTypes = [ "dense", "compressed" ] }>>) -> tensor<10x10xf64, #sparse_tensor.encoding<{ lvlTypes = [ "compressed", "compressed" ] }>>
+// CHECK:         %[[VAL:.*]] = mhlo.multiply %[[ARG0]], %[[ARG1]] : (tensor<10x10xf64, #sparse_tensor.encoding<{{{.*}}}>>, tensor<10x10xf64, #sparse_tensor.encoding<{{{.*}}}>>) -> tensor<10x10xf64, #sparse_tensor.encoding<{{{.*}}}>>
 // CHECK-NEXT:    return %[[VAL:.*]] : tensor<10x10xf64, #sparse_tensor.encoding<{{{.*}}}>>
 func.func @rewrite_binary_override(%arg0: tensor<10x10xf64, #CSR>,
                                    %arg1: tensor<10x10xf64, #CSR>) -> tensor<10x10xf64, #DCSR> {
@@ -50,8 +50,8 @@ func.func @rewrite_binary_override(%arg0: tensor<10x10xf64, #CSR>,
 
 // CHECK-LABEL: func @rewrite_convert(
 // CHECK-SAME:    %[[ARG0:.*]]: tensor<10x10xf64>) -> tensor<10x10xf64, #sparse_tensor.encoding<{{{.*}}}>> {
-// CHECK:         %[[VAL:.*]] = sparse_tensor.convert %[[ARG0]] : tensor<10x10xf64> to tensor<10x10xf64, #sparse_tensor.encoding<{ lvlTypes = [ "dense", "compressed" ] }>>
-// CHECK-NEXT:    return %[[VAL:.*]] : tensor<10x10xf64, #sparse_tensor.encoding<{ lvlTypes = [ "dense", "compressed" ] }>>
+// CHECK:         %[[VAL:.*]] = sparse_tensor.convert %[[ARG0]] : tensor<10x10xf64> to tensor<10x10xf64, #sparse_tensor.encoding<{{{.*}}}>>
+// CHECK-NEXT:    return %[[VAL:.*]] : tensor<10x10xf64, #sparse_tensor.encoding<{{{.*}}}>>
 func.func @rewrite_convert(%arg0: tensor<10x10xf64>) -> tensor<10x10xf64, #CSR> {
   %0 = sparse_tensor.convert %arg0 : tensor<10x10xf64> to tensor<10x10xf64, #DCSR>
   %1 = sparse_tensor.convert %0 : tensor<10x10xf64, #DCSR> to tensor<10x10xf64, #CSR>
@@ -60,9 +60,9 @@ func.func @rewrite_convert(%arg0: tensor<10x10xf64>) -> tensor<10x10xf64, #CSR>
 }
 
 // CHECK-LABEL: func @rewrite_convert_nop(
-// CHECK-SAME:    %[[ARG0:.*]]: tensor<10x10xf64, #sparse_tensor.encoding<{{{.*}}}>>) -> tensor<10x10xf64, #sparse_tensor.encoding<{ lvlTypes = [ "dense", "compressed" ] }>>
+// CHECK-SAME:    %[[ARG0:.*]]: tensor<10x10xf64, #sparse_tensor.encoding<{{{.*}}}>>) -> tensor<10x10xf64, #sparse_tensor.encoding<{{{.*}}}>>
 // CHECK-NEXT:    %[[RES:.*]] = sparse_tensor.convert %[[ARG0]]
-// CHECK-NEXT:    return %[[RES]] : tensor<10x10xf64, #sparse_tensor.encoding<{ lvlTypes = [ "dense", "compressed" ] }>>
+// CHECK-NEXT:    return %[[RES]] : tensor<10x10xf64, #sparse_tensor.encoding<{{{.*}}}>>
 func.func @rewrite_convert_nop(%arg0: tensor<10x10xf64, #CSR>) -> tensor<10x10xf64, #CSR> {
   %0 = sparse_tensor.convert %arg0 : tensor<10x10xf64, #CSR> to tensor<10x10xf64, #DCSR>
   %1 = sparse_tensor.convert %0 : tensor<10x10xf64, #DCSR> to tensor<10x10xf64, #CSR>
@@ -84,7 +84,7 @@ func.func @rewrite_transpose(%arg0: tensor<100x200xf64, #CSR>) -> tensor<200x100
 // CHECK-SAME:    %[[ARG0:.*0]]: tensor<5x5xf64, #sparse_tensor.encoding<{{{.*}}}>>,
 // CHECK-SAME:    %[[ARG1:.*1]]: tensor<5x5xf64, #sparse_tensor.encoding<{{{.*}}}>>) -> tensor<5x5xf64, #sparse_tensor.encoding<{{{.*}}}>> {
 // CHECK:         %[[VAL:.*]] = "mhlo.dot"(%[[ARG0]], %[[ARG1]])
-// CHECK:         return %[[VAL]] : tensor<5x5xf64, #sparse_tensor.encoding<{ lvlTypes = [ "dense", "compressed" ] }>>
+// CHECK:         return %[[VAL]] : tensor<5x5xf64, #sparse_tensor.encoding<{{{.*}}}>>
 func.func @rewrite_dot(%arg0: tensor<5x5xf64, #CSR>,
                        %arg1: tensor<5x5xf64, #CSR>) -> tensor<5x5xf64, #CSR> {
   %0 = "mhlo.dot"(%arg0, %arg1)
@@ -100,7 +100,7 @@ func.func @rewrite_dot(%arg0: tensor<5x5xf64, #CSR>,
 // CHECK-SAME:    %[[ARG0:.*0]]: tensor<5x5xf64, #sparse_tensor.encoding<{{{.*}}}>>,
 // CHECK-SAME:    %[[ARG1:.*1]]: tensor<5x5xf64, #sparse_tensor.encoding<{{{.*}}}>>) -> tensor<5x5xf64, #sparse_tensor.encoding<{{{.*}}}>> {
 // CHECK:         %[[VAL:.*]] = "mhlo.dot_general"(%[[ARG0]], %[[ARG1]])
-// CHECK:         return %[[VAL]] : tensor<5x5xf64, #sparse_tensor.encoding<{ lvlTypes = [ "dense", "compressed" ] }>>
+// CHECK:         return %[[VAL]] : tensor<5x5xf64, #sparse_tensor.encoding<{{{.*}}}>>
 func.func @rewrite_general_dot(%arg0: tensor<5x5xf64, #CSR>,
                                %arg1: tensor<5x5xf64, #CSR>) -> tensor<5x5xf64, #CSR> {
    %0 = "mhlo.dot_general"(%arg0, %arg1)
diff --git a/third_party/xla/xla/mlir_hlo/tests/Dialect/mhlo/sparse_transpose.mlir b/third_party/xla/xla/mlir_hlo/tests/Dialect/mhlo/sparse_transpose.mlir
index 1ef4d5700fcc68..4d10863b90baee 100755
--- a/third_party/xla/xla/mlir_hlo/tests/Dialect/mhlo/sparse_transpose.mlir
+++ b/third_party/xla/xla/mlir_hlo/tests/Dialect/mhlo/sparse_transpose.mlir
@@ -3,7 +3,7 @@
 // RUN:   --canonicalize | FileCheck %s
 
 #DCSR = #sparse_tensor.encoding<{
-  lvlTypes = [ "compressed", "compressed" ]
+  map = (d0, d1) -> (d0 : compressed, d1 : compressed)
 }>
 
 //
diff --git a/third_party/xla/xla/mlir_hlo/tests/Dialect/mhlo/stablehlo-legalize-to-hlo.mlir b/third_party/xla/xla/mlir_hlo/tests/Dialect/mhlo/stablehlo-legalize-to-hlo.mlir
index 30cf935d960a57..753d36620ac176 100644
--- a/third_party/xla/xla/mlir_hlo/tests/Dialect/mhlo/stablehlo-legalize-to-hlo.mlir
+++ b/third_party/xla/xla/mlir_hlo/tests/Dialect/mhlo/stablehlo-legalize-to-hlo.mlir
@@ -1838,9 +1838,9 @@ func.func @type_quantization(%arg0: tensor<!quant.uniform<i8:f32, 34.0:16>>, %ar
 }
 
 // CHECK-LABEL: "type_sparsity"
-func.func @type_sparsity(%arg0: tensor<16xf32, #sparse_tensor.encoding<{ lvlTypes = [ "compressed" ] }>>) -> tensor<16xf32> {
-  // CHECK: "mhlo.abs"(%arg0) : (tensor<16xf32, #sparse_tensor.encoding<{ lvlTypes = [ "compressed" ] }>>) -> tensor<16xf32>
-  %0 = "stablehlo.abs"(%arg0) : (tensor<16xf32, #sparse_tensor.encoding<{ lvlTypes = [ "compressed" ] }>>) -> tensor<16xf32>
+func.func @type_sparsity(%arg0: tensor<16xf32, #sparse_tensor.encoding<{ map = (d0) -> (d0 : compressed) }>>) -> tensor<16xf32> {
+  // CHECK: "mhlo.abs"(%arg0) : (tensor<16xf32, #sparse_tensor.encoding<{{{.*}}}>>) -> tensor<16xf32>
+  %0 = "stablehlo.abs"(%arg0) : (tensor<16xf32, #sparse_tensor.encoding<{ map = (d0) -> (d0 : compressed) }>>) -> tensor<16xf32>
   func.return %0 : tensor<16xf32>
 }
 
diff --git a/third_party/xla/xla/translate/hlo_to_mhlo/tests/import.hlotxt b/third_party/xla/xla/translate/hlo_to_mhlo/tests/import.hlotxt
index 8e36e17743985f..7d3dc8eabdc2bb 100644
--- a/third_party/xla/xla/translate/hlo_to_mhlo/tests/import.hlotxt
+++ b/third_party/xla/xla/translate/hlo_to_mhlo/tests/import.hlotxt
@@ -1757,13 +1757,13 @@ add {
 }
 
 // CHECK-LABEL : func private @sparse
-// CHECK: tensor<10x10xf32, #sparse_tensor.encoding<{ lvlTypes = [ "dense", "compressed" ], posWidth = 32, crdWidth = 32 }>>) -> tensor<10x10xf32, #sparse_tensor.encoding<{ lvlTypes = [ "dense", "compressed" ], posWidth = 32, crdWidth = 32 }>>
+// CHECK: tensor<10x10xf32, #sparse_tensor.encoding<{{{.*}}}>>) -> tensor<10x10xf32, #sparse_tensor.encoding<{{{.*}}}>>
 %sparse {
   ROOT root = f32[10,10]{1,0:D(D,C)} parameter(0)
 }
 
 // CHECK-LABEL : func private @sparse_nu_no
-// CHECK: tensor<3x4x5xf32, #sparse_tensor.encoding<{ lvlTypes = [ "compressed_nu", "singleton_nu_no", "singleton_no" ], posWidth = 32, crdWidth = 32 }>>) -> tensor<3x4x5xf32, #sparse_tensor.encoding<{ lvlTypes = [ "compressed_nu", "singleton_nu_no", "singleton_no" ], posWidth = 32, crdWidth = 32 }>>
+// CHECK: tensor<3x4x5xf32, #sparse_tensor.encoding<{{{.*}}}>>) -> tensor<3x4x5xf32, #sparse_tensor.encoding<{{{.*}}}>>
 %sparse_nu_no {
   ROOT root = f32[3,4,5]{2,1,0:D(C+,S+~,S~)} parameter(0)
 }
diff --git a/third_party/xla/xla/translate/mhlo_to_hlo/tests/export-with-layouts.mlir b/third_party/xla/xla/translate/mhlo_to_hlo/tests/export-with-layouts.mlir
index cf9b5d938e3d78..680341f0d899ac 100644
--- a/third_party/xla/xla/translate/mhlo_to_hlo/tests/export-with-layouts.mlir
+++ b/third_party/xla/xla/translate/mhlo_to_hlo/tests/export-with-layouts.mlir
@@ -2,8 +2,7 @@
 // RUN: xla-translate -split-input-file -mlir-hlo-to-hlo-text  -with-layouts -print-layouts --via-builder=true %s | FileCheck %s
 
 #CSR = #sparse_tensor.encoding<{
-  lvlTypes = ["dense", "compressed"],
-  dimToLvl = affine_map<(i, j) -> (i, j)>,
+  map = (d0, d1) -> (d0 : dense, d1 : compressed),
   posWidth = 32,
   crdWidth = 32
 }>
@@ -17,8 +16,7 @@ func.func @main(%arg: tensor<3x4xf32, #CSR>) -> tensor<3x4xf32, #CSR> {
 // -----
 
 #COO = #sparse_tensor.encoding<{
-  lvlTypes = ["compressed_nu", "singleton"],
-  dimToLvl = affine_map<(i, j) -> (i, j)>,
+  map = (d0, d1) -> (d0 : compressed(nonunique), d1 : singleton),
   posWidth = 32,
   crdWidth = 32
 }>
@@ -32,7 +30,7 @@ func.func @main(%arg: tensor<3x4xf32, #COO>) -> tensor<3x4xf32, #COO> {
 // -----
 
 #CSR = #sparse_tensor.encoding<{
-  lvlTypes = ["dense", "compressed"],
+  map = (d0, d1) -> (d0 : dense, d1 : compressed),
   posWidth = 32,
   crdWidth = 32
 }>
@@ -46,7 +44,7 @@ func.func @main(%arg: tensor<3x4xf32, #CSR>) -> tensor<3x4xf32, #CSR> {
 // -----
 
 #UnorderedCOOTensor = #sparse_tensor.encoding<{
-  lvlTypes = ["compressed_nu", "singleton_nu_no", "singleton_no"],
+  map = (d0, d1, d2) -> (d0 : compressed(nonunique), d1 : singleton(nonunique, nonordered), d2 : singleton(nonordered)),
   posWidth = 32,
   crdWidth = 32
 }>

From e0be634722c26f8ec3562e0b0945da7e4879ec0a Mon Sep 17 00:00:00 2001
From: Antonio Sanchez <cantonios@google.com>
Date: Thu, 28 Sep 2023 15:24:09 -0700
Subject: [PATCH 402/567] Allow running xla aot test locally outside docker.

The existing test makes assumptions about the python version and cmake paths
that do not generally hold on local workstations.

Added a cmake variable `TENSORFLOW_PACKAGE_PATH` that allows setting the full
directory of the python tensorflow package, and explicitly set the default
cmake archive directory to avoid writing to root (previously it was empty,
causing attempted writes directly to `/tf_runtime`).

PiperOrigin-RevId: 569305879
---
 .../xla_build/pip_test/CMakeLists.txt            |  9 +++++++--
 .../xla_build/pip_test/run_xla_aot_test.sh       | 16 +++++++++++++---
 2 files changed, 20 insertions(+), 5 deletions(-)

diff --git a/tensorflow/tools/pip_package/xla_build/pip_test/CMakeLists.txt b/tensorflow/tools/pip_package/xla_build/pip_test/CMakeLists.txt
index 1bd6152c18b189..361ee239347d61 100644
--- a/tensorflow/tools/pip_package/xla_build/pip_test/CMakeLists.txt
+++ b/tensorflow/tools/pip_package/xla_build/pip_test/CMakeLists.txt
@@ -16,6 +16,11 @@ FetchContent_MakeAvailable(googletest)
 enable_testing()
 add_definitions(-DTF_PIP_INTEGRATION_TEST)
 
+set(TENSORFLOW_PACKAGE_PATH "/usr/local/lib/python3.9/dist-packages/tensorflow"
+    CACHE STRING "Full path of tensorflow python package")
+
+set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib)
+
 add_executable(aot_compiled_test
         aot_compiled_test.cc
         aot_compiled_vars_and_arithmetic.o
@@ -29,10 +34,10 @@ add_executable(aot_compiled_test
 # The paths assume the setup performed by run_integration_test.sh.
 target_include_directories(aot_compiled_test PUBLIC
         "/tmp/generated_models"
-        "/usr/local/lib/python3.9/dist-packages/tensorflow/include"
+        "${TENSORFLOW_PACKAGE_PATH}/include"
         "/tmp/tf_third_party")
 
-add_subdirectory("/usr/local/lib/python3.9/dist-packages/tensorflow/xla_aot_runtime_src"
+add_subdirectory("${TENSORFLOW_PACKAGE_PATH}/xla_aot_runtime_src"
         ${CMAKE_ARCHIVE_OUTPUT_DIRECTORY}/tf_runtime)
 
 target_link_libraries(aot_compiled_test GTest::gtest_main tf_xla_runtime)
diff --git a/tensorflow/tools/pip_package/xla_build/pip_test/run_xla_aot_test.sh b/tensorflow/tools/pip_package/xla_build/pip_test/run_xla_aot_test.sh
index ce10ccd3e109eb..6734201b317a76 100644
--- a/tensorflow/tools/pip_package/xla_build/pip_test/run_xla_aot_test.sh
+++ b/tensorflow/tools/pip_package/xla_build/pip_test/run_xla_aot_test.sh
@@ -25,7 +25,17 @@
 #
 # under kokoro, this is run by learning/brain/testing/kokoro/rel/docker/aot_compile.sh
 #=============================================================================
-set -euo pipefail -o history
+set -eo pipefail -o history
+
+# If the TENSORFLOW_PACKAGE_PATH variable is set, add an argument to the cmake
+# command-line to explicitly set the package path.
+# This must be done before disallowing unset variables.
+CMAKE_TENSORFLOW_PACKAGE_ARG=
+if [ ! -z "$TENSORFLOW_PACKAGE_PATH" ]; then
+  CMAKE_TENSORFLOW_PACKAGE_ARG="-DTENSORFLOW_PACKAGE_PATH=${TENSORFLOW_PACKAGE_PATH}"
+fi
+
+set -u
 
 echo "Building x_matmul_y models"
 python3 \
@@ -84,7 +94,7 @@ saved_model_cli aot_compile_cpu \
   --output_prefix "${GEN_PREFIX}/aot_compiled_x_plus_y" \
   --cpp_class XPlusY --signature_def_key serving_default --tag_set serve
 
-echo "Creaating project and copying object files"
+echo "Creating project and copying object files"
 mkdir -p "${PROJECT}"
 cp -f \
   "${GEN_PREFIX}/aot_compiled_vars_and_arithmetic.o" \
@@ -102,7 +112,7 @@ cp tensorflow/tools/pip_package/xla_build/pip_test/CMakeLists.txt \
 
 echo "Building"
 mkdir "${PROJECT}/build"
-cmake -GNinja -S "${PROJECT}" -B "${PROJECT}/build" -DCMAKE_BUILD_TYPE=Release
+cmake -GNinja -S "${PROJECT}" -B "${PROJECT}/build" -DCMAKE_BUILD_TYPE=Release $CMAKE_TENSORFLOW_PACKAGE_ARG
 ninja -C "${PROJECT}/build"
 
 echo "Running test"

From 96a12741adab2a0326ec187cc37367b15174bd74 Mon Sep 17 00:00:00 2001
From: Junwhan Ahn <junwhan@google.com>
Date: Thu, 28 Sep 2023 16:00:06 -0700
Subject: [PATCH 403/567] Roll back
 https://github.com/tensorflow/tensorflow/commit/297ec1f82d90599d24033a0bbb677f79565ffeb9

PiperOrigin-RevId: 569314745
---
 third_party/xla/xla/python/py_executable.cc    | 4 ----
 third_party/xla/xla/python/py_host_callback.cc | 2 --
 2 files changed, 6 deletions(-)

diff --git a/third_party/xla/xla/python/py_executable.cc b/third_party/xla/xla/python/py_executable.cc
index 712d41fbffe000..1dcdb136e01a5f 100644
--- a/third_party/xla/xla/python/py_executable.cc
+++ b/third_party/xla/xla/python/py_executable.cc
@@ -87,10 +87,6 @@ PyLoadedExecutable::~PyLoadedExecutable() {
   if (next_) {
     next_->prev_ = prev_;
   }
-  {
-    py::gil_scoped_release gil_release;
-    ifrt_loaded_executable_.reset();
-  }
 }
 
 std::vector<ClientAndPtr<PjRtDevice>> PyLoadedExecutable::AddressableDevices()
diff --git a/third_party/xla/xla/python/py_host_callback.cc b/third_party/xla/xla/python/py_host_callback.cc
index 757b8e2f4d902c..e149742659aa11 100644
--- a/third_party/xla/xla/python/py_host_callback.cc
+++ b/third_party/xla/xla/python/py_host_callback.cc
@@ -204,8 +204,6 @@ PyHostSendAndRecvLoadedHostCallback::PyHostSendAndRecvLoadedHostCallback(
 PyHostSendAndRecvLoadedHostCallback::~PyHostSendAndRecvLoadedHostCallback() {
   GlobalPyRefManager()->AddGarbage(
       absl::MakeSpan(static_cast<pybind11::object*>(&callable_), 1));
-  GlobalPyRefManager()->AddGarbage(
-      absl::MakeSpan(static_cast<pybind11::object*>(&serializer_), 1));
 }
 
 StatusOr<std::string> PyHostSendAndRecvLoadedHostCallback::Serialize() const {

From 335cbac3c2e50494764ef9441c02ce1e164c6f36 Mon Sep 17 00:00:00 2001
From: Feng Wang <wffw@google.com>
Date: Thu, 28 Sep 2023 16:01:20 -0700
Subject: [PATCH 404/567] Add serving model run

PiperOrigin-RevId: 569315072
---
 .../xla/third_party/tsl/tsl/profiler/utils/xplane_schema.cc     | 2 ++
 .../xla/third_party/tsl/tsl/profiler/utils/xplane_schema.h      | 2 ++
 2 files changed, 4 insertions(+)

diff --git a/third_party/xla/third_party/tsl/tsl/profiler/utils/xplane_schema.cc b/third_party/xla/third_party/tsl/tsl/profiler/utils/xplane_schema.cc
index 1978a45c54a51b..ca70bb29aa4b21 100644
--- a/third_party/xla/third_party/tsl/tsl/profiler/utils/xplane_schema.cc
+++ b/third_party/xla/third_party/tsl/tsl/profiler/utils/xplane_schema.cc
@@ -141,6 +141,8 @@ const HostEventTypeMap& GetHostEventTypeMap() {
       {"ASBSQueue::Schedule", kASBSQueueSchedule},
       // TFRT related.
       {"TfrtModelRun", kTfrtModelRun},
+      // Serving related.
+      {"ServingModelRun", kServingModelRun},
       // GPU related.
       {"KernelLaunch", kKernelLaunch},
       {"KernelExecute", kKernelExecute},
diff --git a/third_party/xla/third_party/tsl/tsl/profiler/utils/xplane_schema.h b/third_party/xla/third_party/tsl/tsl/profiler/utils/xplane_schema.h
index 70345137bce2ba..a1da54f73deaaf 100644
--- a/third_party/xla/third_party/tsl/tsl/profiler/utils/xplane_schema.h
+++ b/third_party/xla/third_party/tsl/tsl/profiler/utils/xplane_schema.h
@@ -140,6 +140,8 @@ enum HostEventType {
   kASBSQueueSchedule,
   // TFRT related.
   kTfrtModelRun,
+  // Serving related.
+  kServingModelRun,
   // GPU related.
   kKernelLaunch,
   kKernelExecute,

From 0128337b3388fbf2f0025dd013a305d8f09d84cd Mon Sep 17 00:00:00 2001
From: Jian Cai <jiancai@google.com>
Date: Thu, 28 Sep 2023 16:25:37 -0700
Subject: [PATCH 405/567] Keep attributes in C++ canonicalization rewrites

By default, pattern rewrites do not preserve op attributes. This is problematic for GPU graphs, where the device placement of ops is stored in the "device" attribute. Losing this information caused significant performance loss in some cases. This fixes the patterns defined in tensorflow/compiler/mlir/tensorflow/ir/tf_ops_*.cc.

PiperOrigin-RevId: 569321098
---
 .../compiler/mlir/tensorflow/ir/tf_ops_a_m.cc |  31 ++--
 .../compiler/mlir/tensorflow/ir/tf_ops_n_z.cc |  49 +++--
 .../mlir/tensorflow/tests/canonicalize.mlir   | 170 ++++++++++--------
 3 files changed, 137 insertions(+), 113 deletions(-)

diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_a_m.cc b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_a_m.cc
index 3668c41da3dfc2..4f21118dc24b71 100644
--- a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_a_m.cc
+++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_a_m.cc
@@ -1042,9 +1042,9 @@ class CaseOrIfRegionEliminatePassThrough
     if (result_to_extern_value.empty()) return failure();
 
     // Create new case/if region op.
-    auto new_op = rewriter.create<CaseOrIfRegionOp>(
-        op.getLoc(), new_result_types, op.getOperand(), op->getAttrs(),
-        op.getNumRegions());
+    auto new_op = CreateTfOp<CaseOrIfRegionOp>(rewriter, op, new_result_types,
+                                               op.getOperand(), op->getAttrs(),
+                                               op.getNumRegions());
 
     int next_index = 0;
     for (auto result : op.getResults()) {
@@ -1203,8 +1203,8 @@ LogicalResult HoistCwiseUnaryOutOfConcat::matchAndRewrite(
   SmallVector<Value, 8> unary_ops_args(unary_operands);
 
   // Concatenate unary ops operands.
-  auto concat_unary_operands = rewriter.create<ConcatV2Op>(
-      loc, op.getType(), unary_ops_args, op.getAxis());
+  auto concat_unary_operands = CreateTfOp<ConcatV2Op>(
+      rewriter, op, op.getType(), unary_ops_args, op.getAxis());
 
   // Replace original concat with an unary op.
   OperationState new_unary_op_state(loc, first_arg_op->getName().getStringRef(),
@@ -1212,6 +1212,7 @@ LogicalResult HoistCwiseUnaryOutOfConcat::matchAndRewrite(
                                     op.getResult().getType(),
                                     ArrayRef<NamedAttribute>());
   Operation* new_unary_op = rewriter.create(new_unary_op_state);
+  CopyDeviceAndUnderscoredAttributes(op, new_unary_op);
 
   rewriter.replaceOp(op, new_unary_op->getResults());
 
@@ -1371,8 +1372,8 @@ LogicalResult HoistCwiseBinaryOutOfConcat::matchAndRewrite(
     // Use `PackOp` for scalar concatenation because `ConcatV2Op` doesn't
     // support scalar concatenation.
     if (is_scalar) {
-      auto pack = rewriter.create<PackOp>(loc, result_type, args,
-                                          rewriter.getI64IntegerAttr(axis));
+      auto pack = CreateTfOp<PackOp>(rewriter, op, result_type, args,
+                                     rewriter.getI64IntegerAttr(axis));
       return pack.getResult();
     }
 
@@ -1389,7 +1390,7 @@ LogicalResult HoistCwiseBinaryOutOfConcat::matchAndRewrite(
     auto axis_const = rewriter.create<TF::ConstOp>(loc, attr);
 
     auto concat =
-        rewriter.create<ConcatV2Op>(loc, result_type, args, axis_const);
+        CreateTfOp<ConcatV2Op>(rewriter, op, result_type, args, axis_const);
     return concat.getResult();
   };
 
@@ -1406,6 +1407,7 @@ LogicalResult HoistCwiseBinaryOutOfConcat::matchAndRewrite(
       loc, first_arg_op->getName().getStringRef(), {lhs_concat, rhs_concat},
       op.getResult().getType(), ArrayRef<NamedAttribute>());
   Operation* new_binary_op = rewriter.create(new_binary_op_state);
+  CopyDeviceAndUnderscoredAttributes(op, new_binary_op);
 
   rewriter.replaceOp(op, new_binary_op->getResults());
 
@@ -2299,9 +2301,8 @@ class DivNoNanOrMulNoNanConstantY : public OpRewritePattern<OpT> {
         } else {
           // When `y` is a non-zero splat constant, replace tf.DivNoNan with
           // tf.Div and tf.MulNoNan with tf.Mul.
-          rewriter.replaceOpWithNewOp<RetT>(op, op->getResult(0).getType(),
-                                            op->getOperand(0),
-                                            op->getOperand(1));
+          ReplaceTfOpWithNewOp<RetT>(rewriter, op, op->getResult(0).getType(),
+                                     op->getOperand(0), op->getOperand(1));
         }
         return success();
       }
@@ -2312,8 +2313,8 @@ class DivNoNanOrMulNoNanConstantY : public OpRewritePattern<OpT> {
       } else {
         // When all the elements in `y` are non-splat and non-zero, replace
         // tf.DivNoNan with tf.Div and tf.MulNoNan with tf.Mul.
-        rewriter.replaceOpWithNewOp<RetT>(op, op->getResult(0).getType(),
-                                          op->getOperand(0), op->getOperand(1));
+        ReplaceTfOpWithNewOp<RetT>(rewriter, op, op->getResult(0).getType(),
+                                   op->getOperand(0), op->getOperand(1));
         return success();
       }
     }
@@ -2647,8 +2648,8 @@ static LogicalResult flipComatibleShapeError(Ty op, PatternRewriter& rewriter) {
   }
 
   // Shapes are known to be compatible.
-  rewriter.template replaceOpWithNewOp<Ty>(op, op.getX(), op.getY(),
-                                           rewriter.getBoolAttr(true));
+  ReplaceTfOpWithNewOp<Ty>(rewriter, op, op.getX(), op.getY(),
+                           rewriter.getBoolAttr(true));
   return success();
 }
 }  // namespace
diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_n_z.cc b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_n_z.cc
index e304bb2a24d9b5..50f2bceae7b282 100644
--- a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_n_z.cc
+++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_n_z.cc
@@ -2480,9 +2480,9 @@ class ConvertTensorListGetItemOpOfTensorListFromTensorOpToGather
       return failure();
     }
 
-    rewriter.replaceOpWithNewOp<GatherOp>(
-        op, op.getType(), tensor_list_from_tensor_op.getTensor(),
-        op.getIndex());
+    ReplaceTfOpWithNewOp<GatherOp>(rewriter, op, op.getType(),
+                                   tensor_list_from_tensor_op.getTensor(),
+                                   op.getIndex());
     return success();
   }
 };
@@ -2708,8 +2708,8 @@ class ToBoolOfRankedTensor : public OpRewritePattern<ToBoolOp> {
       if (!zero_attr) return failure();
 
       auto zero_const = rewriter.create<TF::ConstOp>(op.getLoc(), zero_attr);
-      rewriter.replaceOpWithNewOp<TF::NotEqualOp>(
-          op, result_type, op.getOperand(), zero_const, false);
+      ReplaceTfOpWithNewOp<TF::NotEqualOp>(rewriter, op, result_type,
+                                           op.getOperand(), zero_const, false);
     } else {
       // If the input is a non-scalar ranked tensor, ToBool can be expanded
       // to numElements != 0. numElements will be 0 iff one of the dimensions is
@@ -2960,8 +2960,8 @@ class NMSV3ToNMSV4Op : public OpRewritePattern<NonMaxSuppressionV3Op> {
         tensorflow::GetTypeFromTFTensorShape({}, input_ty.getElementType());
     new_result_types.push_back(valid_output_type);
 
-    auto nmsv4 = rewriter.create<TF::NonMaxSuppressionV4Op>(
-        nms_op.getLoc(), new_result_types, nms_op.getBoxes(),
+    auto nmsv4 = CreateTfOp<TF::NonMaxSuppressionV4Op>(
+        rewriter, nms_op, new_result_types, nms_op.getBoxes(),
         nms_op.getScores(), nms_op.getMaxOutputSize(), nms_op.getIouThreshold(),
         nms_op.getScoreThreshold());
     // Cannot replace the NMSv3 Op with NMSv4 since the outputs between the
@@ -2992,13 +2992,10 @@ class ConvertFusedBatchNorm : public OpRewritePattern<TF::FusedBatchNormOp> {
     // reserve_space_3
     new_result_types.push_back(
         UnrankedTensorType::get(FloatType::getF32(rewriter.getContext())));
-
-    OperationState new_state(tf_fused_batch_norm_op.getLoc(),
-                             TF::FusedBatchNormV3Op::getOperationName(),
-                             tf_fused_batch_norm_op.getOperands(),
-                             new_result_types,
-                             tf_fused_batch_norm_op->getAttrs());
-    Operation *tf_fused_batch_norm_op_v3 = rewriter.create(new_state);
+    auto tf_fused_batch_norm_op_v3 = CreateTfOp<TF::FusedBatchNormV3Op>(
+        rewriter, tf_fused_batch_norm_op, new_result_types,
+        tf_fused_batch_norm_op.getOperands(),
+        tf_fused_batch_norm_op->getAttrs());
 
     rewriter.replaceOp(tf_fused_batch_norm_op,
                        tf_fused_batch_norm_op_v3->getResults().drop_back());
@@ -3149,10 +3146,12 @@ LogicalResult HoistCwiseUnaryOutOfUnpack::matchAndRewrite(
                                     op.getOperand(), op.getOperand().getType(),
                                     ArrayRef<NamedAttribute>());
   Operation *new_unary_op = rewriter.create(new_unary_op_state);
+  CopyDeviceAndUnderscoredAttributes(op, new_unary_op);
 
   // Unpack results after applying unary operation.
-  auto unpack_unary_op = rewriter.create<UnpackOp>(
-      loc, op.getResultTypes(), new_unary_op->getResult(0), op.getAxis());
+  auto unpack_unary_op =
+      CreateTfOp<UnpackOp>(rewriter, op, op.getResultTypes(),
+                           new_unary_op->getResult(0), op.getAxis());
 
   // Bypass all users of the original unpack operation and use `unpack_unary_op`
   // results instead.
@@ -3787,8 +3786,8 @@ class XlaConvToV2 : public OpRewritePattern<TF::XlaConvOp> {
   LogicalResult matchAndRewrite(TF::XlaConvOp op,
                                 PatternRewriter &rewriter) const override {
     SmallVector<Type> result_types{op.getResult().getType()};
-    rewriter.replaceOpWithNewOp<TF::XlaConvV2Op>(
-        op, op.getResult().getType(), op.getLhs(), op.getRhs(),
+    ReplaceTfOpWithNewOp<TF::XlaConvV2Op>(
+        rewriter, op, op.getResult().getType(), op.getLhs(), op.getRhs(),
         op.getWindowStrides(), op.getPadding(), op.getLhsDilation(),
         op.getRhsDilation(), op.getFeatureGroupCount(),
         op.getDimensionNumbers(), op.getPrecisionConfig(), 1);
@@ -3891,9 +3890,9 @@ class XlaReduceToXlaVariadicReduceV2
     SmallVector<Value> inputs{op.getInput()};
     SmallVector<Value> init_values{op.getInitValue()};
     SmallVector<Type> result_types{op.getResult().getType()};
-    rewriter.replaceOpWithNewOp<TF::XlaVariadicReduceV2Op>(
-        op, result_types, inputs, init_values, op.getDimensionsToReduce(),
-        op.getReducer());
+    ReplaceTfOpWithNewOp<TF::XlaVariadicReduceV2Op>(
+        rewriter, op, result_types, inputs, init_values,
+        op.getDimensionsToReduce(), op.getReducer());
     return ::mlir::success();
   };
 };
@@ -4067,12 +4066,10 @@ class XlaVariadicReduceToV2 : public OpRewritePattern<TF::XlaVariadicReduceOp> {
 
   LogicalResult matchAndRewrite(TF::XlaVariadicReduceOp op,
                                 PatternRewriter &rewriter) const override {
-    mlir::TF::XlaVariadicReduceV2Op xla_variadic_reduce_v2_op =
-        rewriter.create<::mlir::TF::XlaVariadicReduceV2Op>(
-            op.getLoc(), op.getResults().getTypes(), op.getInput(),
-            op.getInitValue(), op.getDimensionsToReduce(), op.getReducer());
+    ReplaceTfOpWithNewOp<::mlir::TF::XlaVariadicReduceV2Op>(
+        rewriter, op, op.getResults().getTypes(), op.getInput(),
+        op.getInitValue(), op.getDimensionsToReduce(), op.getReducer());
 
-    rewriter.replaceOp(op, xla_variadic_reduce_v2_op.getResults());
     return ::mlir::success();
   };
 };
diff --git a/tensorflow/compiler/mlir/tensorflow/tests/canonicalize.mlir b/tensorflow/compiler/mlir/tensorflow/tests/canonicalize.mlir
index d6c33ce4e4d53e..2adf88449d3853 100644
--- a/tensorflow/compiler/mlir/tensorflow/tests/canonicalize.mlir
+++ b/tensorflow/compiler/mlir/tensorflow/tests/canonicalize.mlir
@@ -167,12 +167,12 @@ func.func @testConcatCanonicalization(%arg0: tensor<2x1xi32>, %arg1: tensor<2x1x
 // CHECK-LABEL: testConcatCwiseUnary
 func.func @testConcatCwiseUnary(%arg0: tensor<?x1xf32>, %arg1: tensor<?x1xf32>, %arg2: tensor<i32>) -> tensor<?x2xf32> {
 
-  // CHECK: %[[CONCAT:.*]] = "tf.ConcatV2"(%arg0, %arg1, %arg2)
-  // CHECK: %[[LOG1P:.*]] = "tf.Log1p"(%[[CONCAT]])
+  // CHECK: %[[CONCAT:.*]] = "tf.ConcatV2"(%arg0, %arg1, %arg2) {device = "/job:localhost/replica:0/task:0/device:GPU:0"}
+  // CHECK: %[[LOG1P:.*]] = "tf.Log1p"(%[[CONCAT]]) {device = "/job:localhost/replica:0/task:0/device:GPU:0"}
   // CHECK: return %[[LOG1P]]
   %0 = "tf.Log1p"(%arg0) : (tensor<?x1xf32>) -> tensor<?x1xf32>
   %1 = "tf.Log1p"(%arg1) : (tensor<?x1xf32>) -> tensor<?x1xf32>
-  %2 = "tf.ConcatV2"(%0, %1, %arg2) : (tensor<?x1xf32>, tensor<?x1xf32>, tensor<i32>) -> tensor<?x2xf32>
+  %2 = "tf.ConcatV2"(%0, %1, %arg2) {device = "/job:localhost/replica:0/task:0/device:GPU:0"} : (tensor<?x1xf32>, tensor<?x1xf32>, tensor<i32>) -> tensor<?x2xf32>
 
   func.return %2 : tensor<?x2xf32>
 }
@@ -183,13 +183,13 @@ func.func @testConcatCwiseBinaryOnInnerDim(%arg0: tensor<?x1xf32>,
 
   // CHECK-DAG: %[[LHS_AXIS:.*]] = "tf.Const"() {value = dense<1> : tensor<i32>}
 
-  // CHECK: %[[ADD_LHS_CONCAT:.*]] = "tf.Pack"(%arg2, %arg3) {axis = 0 : i64}
-  // CHECK: %[[MUL_LHS_CONCAT:.*]] = "tf.ConcatV2"(%arg0, %arg1, %[[LHS_AXIS]])
-  // CHECK: %[[MUL_RHS_CONCAT:.*]] = "tf.Pack"(%arg2, %arg3) {axis = 0 : i64}
+  // CHECK: %[[ADD_LHS_CONCAT:.*]] = "tf.Pack"(%arg2, %arg3) {axis = 0 : i64, device = "/job:localhost/replica:0/task:0/device:GPU:0"}
+  // CHECK: %[[MUL_LHS_CONCAT:.*]] = "tf.ConcatV2"(%arg0, %arg1, %[[LHS_AXIS]]) {device = "/job:localhost/replica:0/task:0/device:GPU:0"}
+  // CHECK: %[[MUL_RHS_CONCAT:.*]] = "tf.Pack"(%arg2, %arg3) {axis = 0 : i64, device = "/job:localhost/replica:0/task:0/device:GPU:0"}
 
-  // CHECK: %[[MUL:.*]] = "tf.Mul"(%[[MUL_LHS_CONCAT]], %[[MUL_RHS_CONCAT]])
+  // CHECK: %[[MUL:.*]] = "tf.Mul"(%[[MUL_LHS_CONCAT]], %[[MUL_RHS_CONCAT]]) {device = "/job:localhost/replica:0/task:0/device:GPU:0"}
   // CHECK-SAME: (tensor<?x2xf32>, tensor<2xf32>) -> tensor<?x2xf32>
-  // CHECK: %[[ADD:.*]] = "tf.AddV2"(%[[ADD_LHS_CONCAT]], %[[MUL]])
+  // CHECK: %[[ADD:.*]] = "tf.AddV2"(%[[ADD_LHS_CONCAT]], %[[MUL]]) {device = "/job:localhost/replica:0/task:0/device:GPU:0"}
   // CHECK-SAME: (tensor<2xf32>, tensor<?x2xf32>) -> tensor<?x2xf32>
   // CHECK: return %[[ADD]]
 
@@ -200,7 +200,7 @@ func.func @testConcatCwiseBinaryOnInnerDim(%arg0: tensor<?x1xf32>,
   // Add of a scalar const and a tensor.
   %3 = "tf.AddV2"(%arg2, %1) : (tensor<f32>, tensor<?x1xf32>) -> tensor<?x1xf32>
   %4 = "tf.AddV2"(%arg3, %2) : (tensor<f32>, tensor<?x1xf32>) -> tensor<?x1xf32>
-  %5 = "tf.ConcatV2"(%3, %4, %0) : (tensor<?x1xf32>, tensor<?x1xf32>, tensor<i32>) -> tensor<?x2xf32>
+  %5 = "tf.ConcatV2"(%3, %4, %0) {device = "/job:localhost/replica:0/task:0/device:GPU:0"} : (tensor<?x1xf32>, tensor<?x1xf32>, tensor<i32>) -> tensor<?x2xf32>
 
   func.return %5 : tensor<?x2xf32>
 }
@@ -211,13 +211,15 @@ func.func @testConcatCwiseBinaryPreserveAxisType(%arg0: tensor<?x1xf32>,
 
   // CHECK-DAG: %[[LHS_AXIS:.*]] = "tf.Const"() {value = dense<1> : tensor<i64>}
 
-  // CHECK: %[[ADD_LHS_CONCAT:.*]] = "tf.Pack"(%arg2, %arg3) {axis = 0 : i64}
-  // CHECK: %[[MUL_LHS_CONCAT:.*]] = "tf.ConcatV2"(%arg0, %arg1, %[[LHS_AXIS]])
-  // CHECK: %[[MUL_RHS_CONCAT:.*]] = "tf.Pack"(%arg2, %arg3) {axis = 0 : i64}
+  // CHECK: %[[ADD_LHS_CONCAT:.*]] = "tf.Pack"(%arg2, %arg3) {axis = 0 : i64, device = "/job:localhost/replica:0/task:0/device:GPU:0"}
+  // CHECK: %[[MUL_LHS_CONCAT:.*]] = "tf.ConcatV2"(%arg0, %arg1, %[[LHS_AXIS]]) {device = "/job:localhost/replica:0/task:0/device:GPU:0"}
+  // CHECK: %[[MUL_RHS_CONCAT:.*]] = "tf.Pack"(%arg2, %arg3) {axis = 0 : i64, device = "/job:localhost/replica:0/task:0/device:GPU:0"}
 
   // CHECK: %[[MUL:.*]] = "tf.Mul"(%[[MUL_LHS_CONCAT]], %[[MUL_RHS_CONCAT]])
+  // CHECK-SAME: {device = "/job:localhost/replica:0/task:0/device:GPU:0"}
   // CHECK-SAME: (tensor<?x2xf32>, tensor<2xf32>) -> tensor<?x2xf32>
   // CHECK: %[[ADD:.*]] = "tf.AddV2"(%[[ADD_LHS_CONCAT]], %[[MUL]])
+  // CHECK-SAME: {device = "/job:localhost/replica:0/task:0/device:GPU:0"}
   // CHECK-SAME: (tensor<2xf32>, tensor<?x2xf32>) -> tensor<?x2xf32>
   // CHECK: return %[[ADD]]
 
@@ -228,7 +230,7 @@ func.func @testConcatCwiseBinaryPreserveAxisType(%arg0: tensor<?x1xf32>,
   // Add of a scalar const and a tensor.
   %3 = "tf.AddV2"(%arg2, %1) : (tensor<f32>, tensor<?x1xf32>) -> tensor<?x1xf32>
   %4 = "tf.AddV2"(%arg3, %2) : (tensor<f32>, tensor<?x1xf32>) -> tensor<?x1xf32>
-  %5 = "tf.ConcatV2"(%3, %4, %0) : (tensor<?x1xf32>, tensor<?x1xf32>, tensor<i64>) -> tensor<?x2xf32>
+  %5 = "tf.ConcatV2"(%3, %4, %0) {device = "/job:localhost/replica:0/task:0/device:GPU:0"} : (tensor<?x1xf32>, tensor<?x1xf32>, tensor<i64>) -> tensor<?x2xf32>
 
   func.return %5 : tensor<?x2xf32>
 }
@@ -285,15 +287,16 @@ func.func @testConcatCwiseBinaryNegativeAxis(%arg0: tensor<f32>,
 // Synthesize binary ops when 1 of the 3 concat inputs is a non-binary op.
 // CHECK-LABEL: testConcatCwiseBinarySynthMulOp3Inputs
 func.func @testConcatCwiseBinarySynthMulOp3Inputs(%arg0: tensor<?x1xf32>, %arg1: tensor<?x1xf32>, %arg2: tensor<?x1xf32>) -> tensor<?x3xf32> {
-  // CHECK: %[[CONST:.*]] = "tf.Const"() {value = dense<[2.000000e+00, 3.000000e+00, 1.000000e+00]>
-  // CHECK: %[[CONCAT:.*]] = "tf.ConcatV2"(%arg0, %arg1, %arg2,
-  // CHECK: "tf.Mul"(%[[CONCAT]], %[[CONST]])
+  // CHECK: %[[CONST:.*]] = "tf.Const"() {value = dense<1> : tensor<i32>} : () -> tensor<i32>
+  // CHECK-NEXT: %[[CONST0:.*]] = "tf.Const"() {value = dense<[2.000000e+00, 3.000000e+00, 1.000000e+00]>
+  // CHECK: %[[CONCAT:.*]] = "tf.ConcatV2"(%arg0, %arg1, %arg2, %[[CONST]]) {device = "/job:localhost/replica:0/task:0/device:GPU:0"}
+  // CHECK: "tf.Mul"(%[[CONCAT]], %[[CONST0]]) {device = "/job:localhost/replica:0/task:0/device:GPU:0"}
   %axis = "tf.Const"() { value = dense<1> : tensor<i32> } : () -> tensor<i32>
   %mul0_const = "tf.Const"() { value = dense<2.0> : tensor<f32> } : () -> tensor<f32>
   %mul0 = "tf.Mul"(%arg0, %mul0_const) : (tensor<?x1xf32>, tensor<f32>) -> tensor<?x1xf32>
   %mul1_const = "tf.Const"() { value = dense<3.0> : tensor<f32> } : () -> tensor<f32>
   %mul1 = "tf.Mul"(%arg1, %mul1_const) : (tensor<?x1xf32>, tensor<f32>) -> tensor<?x1xf32>
-  %ret = "tf.ConcatV2"(%mul0, %mul1, %arg2, %axis) : (tensor<?x1xf32>, tensor<?x1xf32>, tensor<?x1xf32>, tensor<i32>) -> tensor<?x3xf32>
+  %ret = "tf.ConcatV2"(%mul0, %mul1, %arg2, %axis) {device = "/job:localhost/replica:0/task:0/device:GPU:0"} : (tensor<?x1xf32>, tensor<?x1xf32>, tensor<?x1xf32>, tensor<i32>) -> tensor<?x3xf32>
 
   func.return %ret : tensor<?x3xf32>
 }
@@ -678,16 +681,18 @@ func.func @testTileMultiplesAllOnes(%arg0: tensor<2x3xf32>) -> tensor<2x3xf32> {
 // CHECK-LABEL: func @testStaticAndIdenticalTypeForEqualOp
 func.func @testStaticAndIdenticalTypeForEqualOp(%arg0: tensor<2xi32>, %arg1: tensor<2xi32>) -> tensor<2xi1> {
   // CHECK:      "tf.Equal"(%arg0, %arg1)
+  // CHECK-SAME:   device = "/job:localhost/replica:0/task:0/device:GPU:0"
   // CHECK-SAME:   incompatible_shape_error = true
-  %0 = "tf.Equal"(%arg0, %arg1) { incompatible_shape_error = false } : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi1>
+  %0 = "tf.Equal"(%arg0, %arg1) {incompatible_shape_error = false, device = "/job:localhost/replica:0/task:0/device:GPU:0"} : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi1>
   func.return %0: tensor<2xi1>
 }
 
 // CHECK-LABEL: func @testStaticAndIdenticalTypeForNotEqualOp
 func.func @testStaticAndIdenticalTypeForNotEqualOp(%arg0: tensor<2xi32>, %arg1: tensor<2xi32>) -> tensor<2xi1> {
   // CHECK:      "tf.NotEqual"(%arg0, %arg1)
+  // CHECK-SAME:   device = "/job:localhost/replica:0/task:0/device:GPU:0"
   // CHECK-SAME:   incompatible_shape_error = true
-  %0 = "tf.NotEqual"(%arg0, %arg1) { incompatible_shape_error = false } : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi1>
+  %0 = "tf.NotEqual"(%arg0, %arg1) {incompatible_shape_error = false, device = "/job:localhost/replica:0/task:0/device:GPU:0"} : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi1>
   func.return %0: tensor<2xi1>
 }
 
@@ -702,8 +707,9 @@ func.func @testUnknownBroadcastForNotEqualOp(%arg0: tensor<?xi32>, %arg1: tensor
 // CHECK-LABEL: func @testKnownGoodBroadcastForNotEqualOp
 func.func @testKnownGoodBroadcastForNotEqualOp(%arg0: tensor<1x?xi32>, %arg1: tensor<?x1xi32>) -> tensor<?x?xi1> {
   // CHECK:      "tf.NotEqual"(%arg0, %arg1)
+  // CHECK-SAME:   device = "/job:localhost/replica:0/task:0/device:GPU:0"
   // CHECK-SAME:   incompatible_shape_error = true
-  %0 = "tf.NotEqual"(%arg0, %arg1) {incompatible_shape_error = false} : (tensor<1x?xi32>, tensor<?x1xi32>) -> tensor<?x?xi1>
+  %0 = "tf.NotEqual"(%arg0, %arg1) {incompatible_shape_error = false, device = "/job:localhost/replica:0/task:0/device:GPU:0"} : (tensor<1x?xi32>, tensor<?x1xi32>) -> tensor<?x?xi1>
   func.return %0: tensor<?x?xi1>
 }
 
@@ -734,8 +740,9 @@ func.func @testUnrankedLHSForNotEqualOp(%arg0: tensor<*xi32>, %arg1: tensor<i32>
 // CHECK-LABEL: func @testScalarForNotEqualOp
 func.func @testScalarForNotEqualOp(%arg0: tensor<i32>, %arg1: tensor<i32>) -> tensor<i1> {
   // CHECK:      "tf.NotEqual"(%arg0, %arg1)
+  // CHECK-SAME: device = "/job:localhost/replica:0/task:0/device:GPU:0"
   // CHECK-SAME: incompatible_shape_error = true
-  %0 = "tf.NotEqual"(%arg0, %arg1) {incompatible_shape_error = false} : (tensor<i32>, tensor<i32>) -> tensor<i1>
+  %0 = "tf.NotEqual"(%arg0, %arg1) {incompatible_shape_error = false, device = "/job:localhost/replica:0/task:0/device:GPU:0"} : (tensor<i32>, tensor<i32>) -> tensor<i1>
   func.return %0: tensor<i1>
 }
 
@@ -1008,27 +1015,27 @@ func.func @ToBool_0DScalarI1(%arg0: tensor<i1>) -> tensor<i1> {
 // CHECK-LABEL: func @ToBool_0DScalarInt
 func.func @ToBool_0DScalarInt(%arg0: tensor<i32>) -> tensor<i1> {
   // CHECK: [[Zero:%.*]] = "tf.Const"() {value = dense<0> : tensor<i32>}
-  // CHECK: [[NE:%.*]] = "tf.NotEqual"(%arg0, [[Zero]])
+  // CHECK: [[NE:%.*]] = "tf.NotEqual"(%arg0, [[Zero]]) {device = "/job:localhost/replica:0/task:0/device:GPU:0", incompatible_shape_error = true}
   // CHECK: return [[NE]]
-  %0 = "tf.ToBool"(%arg0) : (tensor<i32>) -> tensor<i1>
+  %0 = "tf.ToBool"(%arg0) {device = "/job:localhost/replica:0/task:0/device:GPU:0"} : (tensor<i32>) -> tensor<i1>
   func.return %0 : tensor<i1>
 }
 
 // CHECK-LABEL: func @ToBool_0DScalarFloat
 func.func @ToBool_0DScalarFloat(%arg0: tensor<f32>) -> tensor<i1> {
   // CHECK: [[Zero:%.*]] = "tf.Const"() {value = dense<0.000000e+00> : tensor<f32>} : () -> tensor<f32>
-  // CHECK: [[NE:%.*]] = "tf.NotEqual"(%arg0, [[Zero]])
+  // CHECK: [[NE:%.*]] = "tf.NotEqual"(%arg0, [[Zero]]) {device = "/job:localhost/replica:0/task:0/device:GPU:0", incompatible_shape_error = true}
   // CHECK: return [[NE]]
-  %0 = "tf.ToBool"(%arg0) : (tensor<f32>) -> tensor<i1>
+  %0 = "tf.ToBool"(%arg0) {device = "/job:localhost/replica:0/task:0/device:GPU:0"} : (tensor<f32>) -> tensor<i1>
   func.return %0 : tensor<i1>
 }
 
 // CHECK-LABEL: func @ToBool_0DScalarString
 func.func @ToBool_0DScalarString(%arg0: tensor<!tf_type.string>) -> tensor<i1> {
   // CHECK: [[EmptyStr:%.*]] = "tf.Const"() {value = dense<""> : tensor<!tf_type.string>} : () -> tensor<!tf_type.string>
-  // CHECK: [[NE:%.*]] = "tf.NotEqual"(%arg0, [[EmptyStr]]) {incompatible_shape_error = true} : (tensor<!tf_type.string>, tensor<!tf_type.string>) -> tensor<i1>
+  // CHECK: [[NE:%.*]] = "tf.NotEqual"(%arg0, [[EmptyStr]]) {device = "/job:localhost/replica:0/task:0/device:GPU:0", incompatible_shape_error = true} : (tensor<!tf_type.string>, tensor<!tf_type.string>) -> tensor<i1>
   // CHECK: return [[NE]] : tensor<i1>
-  %0 = "tf.ToBool"(%arg0) : (tensor<!tf_type.string>) -> tensor<i1>
+  %0 = "tf.ToBool"(%arg0) {device = "/job:localhost/replica:0/task:0/device:GPU:0"} : (tensor<!tf_type.string>) -> tensor<i1>
   func.return %0 : tensor<i1>
 }
 
@@ -1162,8 +1169,9 @@ func.func @foldIf(%arg0: tensor<f32>, %arg1: tensor<f32>, %arg2: tensor<i1>) ->
   %2 = "tf.If"(%0, %arg0, %arg1) {then_branch = @add, else_branch = @sub, output_shapes = [#tf_type.shape<>], device = "noodle", is_stateless = true} : (tensor<i1>, tensor<f32>, tensor<f32>) -> tensor<f32>
   // CHECK: %1 = "tf.StatefulPartitionedCall"(%0, %arg1)
   // CHECK-SAME: _underscore_attr = "something"
+  // CHECK-SAME: device = "noodle"
   // CHECK-SAME: f = @add
-  %3 = "tf.If"(%1, %2, %arg1) {then_branch = @add, else_branch = @sub, output_shapes = [#tf_type.shape<>], _underscore_attr = "something", is_stateless = false} : (tensor<i1>, tensor<f32>, tensor<f32>) -> tensor<f32>
+  %3 = "tf.If"(%1, %2, %arg1) {then_branch = @add, else_branch = @sub, output_shapes = [#tf_type.shape<>], device = "noodle", _underscore_attr = "something", is_stateless = false} : (tensor<i1>, tensor<f32>, tensor<f32>) -> tensor<f32>
 
   // CHECK: %2 = "tf.If"
   %4 = "tf.If"(%arg2, %3, %arg1) {then_branch = @add, else_branch = @sub, is_stateless = false} : (tensor<i1>, tensor<f32>, tensor<f32>) -> tensor<f32>
@@ -1178,8 +1186,9 @@ func.func @foldIfRegion(%arg0: tensor<f32>, %arg1: tensor<f32>, %arg2: tensor<i1
   %true = "tf.Const"() {value = dense<true> : tensor<i1>} : () -> tensor<i1>
 
   // CHECK: [[Val0:%.*]] = "tf.Mul"(%arg0, %arg1)
+  // CHECK-SAME: device = "/job:localhost/replica:0/task:0/device:GPU:0"
   %0 = "tf.IfRegion"(%true) ({
-      %true_value = "tf.Mul"(%arg0, %arg1) : (tensor<f32>, tensor<f32>) -> tensor<f32>
+      %true_value = "tf.Mul"(%arg0, %arg1) {device = "/job:localhost/replica:0/task:0/device:GPU:0"} : (tensor<f32>, tensor<f32>) -> tensor<f32>
       "tf.Yield"(%true_value) : (tensor<f32>) -> ()
     }, {
       %false_value = "tf.Sub"(%arg0, %arg1) : (tensor<f32>, tensor<f32>) -> tensor<f32>
@@ -1187,11 +1196,12 @@ func.func @foldIfRegion(%arg0: tensor<f32>, %arg1: tensor<f32>, %arg2: tensor<i1
     }) { is_stateless = true}: (tensor<i1>) -> tensor<f32>
 
   // CHECK: [[Val1:%.*]] = "tf.Sub"(%arg0, %arg1)
+  // CHECK-SAME: device = "/job:localhost/replica:0/task:0/device:GPU:0"
   %1 = "tf.IfRegion"(%false) ({
       %true_value = "tf.Mul"(%arg0, %arg1) : (tensor<f32>, tensor<f32>) -> tensor<f32>
       "tf.Yield"(%true_value) : (tensor<f32>) -> ()
     }, {
-      %false_value = "tf.Sub"(%arg0, %arg1) : (tensor<f32>, tensor<f32>) -> tensor<f32>
+      %false_value = "tf.Sub"(%arg0, %arg1) {device = "/job:localhost/replica:0/task:0/device:GPU:0"} : (tensor<f32>, tensor<f32>) -> tensor<f32>
       "tf.Yield"(%false_value) : (tensor<f32>) -> ()
     }) { is_stateless = true}: (tensor<i1>) -> tensor<f32>
 
@@ -1205,10 +1215,11 @@ func.func @foldIfRegionMismatchedTypes(%arg0: tensor<?xf32>, %arg1: tensor<?xf32
   %true = "tf.Const"() {value = dense<true> : tensor<i1>} : () -> tensor<i1>
 
   // CHECK: [[Val0:%.*]] = "tf.Mul"(%arg0, %arg1)
+  // CHECK-SAME: device = "/job:localhost/replica:0/task:0/device:GPU:0"
   // CHECK-NEXT: [[Cast:%.*]] = "tf.Cast"([[Val0]])
   // CHECK-NEXT: return [[Cast]]
   %0 = "tf.IfRegion"(%true) ({
-      %true_value = "tf.Mul"(%arg0, %arg1) : (tensor<?xf32>, tensor<?xf32>) -> tensor<?xf32>
+      %true_value = "tf.Mul"(%arg0, %arg1) {device = "/job:localhost/replica:0/task:0/device:GPU:0"} : (tensor<?xf32>, tensor<?xf32>) -> tensor<?xf32>
       "tf.Yield"(%true_value) : (tensor<?xf32>) -> ()
     }, {
       %false_value = "tf.Sub"(%arg0, %arg1) : (tensor<?xf32>, tensor<?xf32>) -> tensor<?xf32>
@@ -1228,14 +1239,14 @@ func.func @eliminatePassThroughIfRegion(%arg0: tensor<f32>, %arg1: tensor<f32>,
   // CHECK:  },  {
   // CHECK:    %[[SUB:.*]] = "tf.Sub"(%[[ARG0]], %[[ARG1]])
   // CHECK:    "tf.Yield"(%[[SUB]]) : (tensor<f32>)
-  // CHECK:  }) {is_stateless = true} : (tensor<i1>) -> tensor<f32>
+  // CHECK:  }) {device = "/job:localhost/replica:0/task:0/device:GPU:0", is_stateless = true} : (tensor<i1>) -> tensor<f32>
   %0:4 = "tf.IfRegion"(%pred) ({
       %true_value = "tf.Mul"(%arg0, %arg1) : (tensor<f32>, tensor<f32>) -> tensor<f32>
       "tf.Yield"(%arg1, %arg2, %true_value, %arg2) : (tensor<f32>, tensor<!tf_type.resource>, tensor<f32>, tensor<!tf_type.resource>) -> ()
     }, {
       %false_value = "tf.Sub"(%arg0, %arg1) : (tensor<f32>, tensor<f32>) -> tensor<f32>
       "tf.Yield"(%arg1, %arg2, %false_value, %arg2) : (tensor<f32>, tensor<!tf_type.resource>, tensor<f32>, tensor<!tf_type.resource>) -> ()
-    }) { is_stateless = true}: (tensor<i1>) -> (tensor<f32>, tensor<!tf_type.resource>, tensor<f32>, tensor<!tf_type.resource>)
+    }) { is_stateless = true, device = "/job:localhost/replica:0/task:0/device:GPU:0"}: (tensor<i1>) -> (tensor<f32>, tensor<!tf_type.resource>, tensor<f32>, tensor<!tf_type.resource>)
   // CHECK: "tf._SomeOp"(%[[ARG2]], %[[ARG1]]) : (tensor<!tf_type.resource>, tensor<f32>) -> ()
   "tf._SomeOp"(%0#1, %0#0) : (tensor<!tf_type.resource>, tensor<f32>) -> ()
   // CHECK: "tf._SomeOp"(%[[ARG2]], %[[IF_OUTPUT]]) : (tensor<!tf_type.resource>, tensor<f32>) -> ()
@@ -1258,7 +1269,7 @@ func.func @eliminatePassThroughCaseRegion(%arg0: tensor<f32>, %arg1: tensor<f32>
   // CHECK:  },  {
   // CHECK:    %[[ADD:.*]] = "tf.AddV2"(%[[ARG0]], %[[ARG1]])
   // CHECK:    "tf.Yield"(%[[ADD]]) : (tensor<f32>)
-  // CHECK:  }) {is_stateless = true} : (tensor<i32>) -> tensor<f32>
+  // CHECK:  }) {device = "/job:localhost/replica:0/task:0/device:GPU:0", is_stateless = true} : (tensor<i32>) -> tensor<f32>
   %0:3 = "tf.CaseRegion"(%index) ({
       %mul = "tf.Mul"(%arg0, %arg1) : (tensor<f32>, tensor<f32>) -> tensor<f32>
       "tf.Yield"(%arg1, %mul, %arg2) : (tensor<f32>, tensor<f32>, tensor<!tf_type.resource>) -> ()
@@ -1268,7 +1279,7 @@ func.func @eliminatePassThroughCaseRegion(%arg0: tensor<f32>, %arg1: tensor<f32>
     }, {
       %add = "tf.AddV2"(%arg0, %arg1) : (tensor<f32>, tensor<f32>) -> tensor<f32>
       "tf.Yield"(%arg1, %add, %arg2) : (tensor<f32>, tensor<f32>, tensor<!tf_type.resource>) -> ()
-    }) { is_stateless = true}: (tensor<i32>) -> (tensor<f32>, tensor<f32>, tensor<!tf_type.resource>)
+    }) { is_stateless = true, device = "/job:localhost/replica:0/task:0/device:GPU:0"}: (tensor<i32>) -> (tensor<f32>, tensor<f32>, tensor<!tf_type.resource>)
   // CHECK: "tf._SomeOp"(%[[ARG2]], %[[ARG1]]) : (tensor<!tf_type.resource>, tensor<f32>) -> ()
   "tf._SomeOp"(%0#2, %0#0) : (tensor<!tf_type.resource>, tensor<f32>) -> ()
   // CHECK: return %[[CASE_OUTPUT]] : tensor<f32>
@@ -1287,8 +1298,9 @@ func.func @foldCase(%arg0: tensor<f32>, %arg1: tensor<f32>) -> (tensor<f32>) {
   %4 = "tf.Case"(%2, %arg0, %arg1) {branches = [@sub, @add], output_shapes = [#tf_type.shape<>], device = "noodle", is_stateless = false} : (tensor<i32>, tensor<f32>, tensor<f32>) -> tensor<f32>
   // CHECK: PartitionedCall
   // CHECK-SAME: _cluster_launch = "not_ready"
+  // CHECK-SAME: device = "noodle"
   // CHECK-SAME: f = @sub
-  %5 = "tf.Case"(%3, %4, %arg1) {branches = [@sub, @add], output_shapes = [#tf_type.shape<>], _cluster_launch = "not_ready", is_stateless = false} : (tensor<i32>, tensor<f32>, tensor<f32>) -> tensor<f32>
+  %5 = "tf.Case"(%3, %4, %arg1) {branches = [@sub, @add], output_shapes = [#tf_type.shape<>], device= "noodle", _cluster_launch = "not_ready", is_stateless = false} : (tensor<i32>, tensor<f32>, tensor<f32>) -> tensor<f32>
   func.return %5 : tensor<f32>
 }
 
@@ -1357,7 +1369,8 @@ func.func @testWhileRegionSimplePassThrough(%arg0 : tensor<*xf32>, %arg1 : tenso
       %sub = "tf.Sub"(%barg1, %one) : (tensor<i32>, tensor<i32>) -> tensor<i32>
       "tf.Yield"(%barg0, %sub) : (tensor<*xf32>, tensor<i32>) -> ()
     }
-  ) { is_stateless = false } : (tensor<*xf32>, tensor<i32>) -> (tensor<*xf32>, tensor<i32>)
+  ) {device = "/job:localhost/replica:0/task:0/device:GPU:0", is_stateless = false} : (tensor<*xf32>, tensor<i32>) -> (tensor<*xf32>, tensor<i32>)
+  // CHECK: device = "/job:localhost/replica:0/task:0/device:GPU:0"
   // CHECK: return %arg0 : tensor<*xf32>
   func.return %0#0 : tensor<*xf32>
 }
@@ -1381,7 +1394,8 @@ func.func @testWhileRegionReturnExternValues(%arg0 : tensor<*xf32>, %arg1 : tens
       %sub = "tf.Sub"(%barg1, %one) : (tensor<i32>, tensor<i32>) -> tensor<i32>
       "tf.Yield"(%arg0, %sub) : (tensor<*xf32>, tensor<i32>) -> ()
     }
-  ) { is_stateless = false } : (tensor<*xf32>, tensor<i32>) -> (tensor<*xf32>, tensor<i32>)
+  ) {is_stateless = false, device = "/job:localhost/replica:0/task:0/device:GPU:0"} : (tensor<*xf32>, tensor<i32>) -> (tensor<*xf32>, tensor<i32>)
+  // CHECK: device = "/job:localhost/replica:0/task:0/device:GPU:0"
   // CHECK: return %arg0 : tensor<*xf32>
   func.return %0#0 : tensor<*xf32>
 }
@@ -1406,7 +1420,8 @@ func.func @testWhileRegionMultiplePassThrough(%arg0 : tensor<*xf32>, %arg1 : ten
       %sub = "tf.Sub"(%barg3, %one) : (tensor<i32>, tensor<i32>) -> tensor<i32>
       "tf.Yield"(%barg0, %barg1, %barg2, %sub) : (tensor<*xf32>, tensor<*xf32>, tensor<*xf32>, tensor<i32>) -> ()
     }
-  ) { is_stateless = false } : (tensor<*xf32>, tensor<*xf32>, tensor<*xf32>, tensor<i32>) -> (tensor<*xf32>, tensor<*xf32>, tensor<*xf32>, tensor<i32>)
+  ) {is_stateless = false, device = "/job:localhost/replica:0/task:0/device:GPU:0"} : (tensor<*xf32>, tensor<*xf32>, tensor<*xf32>, tensor<i32>) -> (tensor<*xf32>, tensor<*xf32>, tensor<*xf32>, tensor<i32>)
+  // CHECK: device = "/job:localhost/replica:0/task:0/device:GPU:0"
 
   // CHECK: %[[SUB0:.*]] = "tf.Sub"(%arg0, %arg1)
   // CHECK: %[[SUB1:.*]] = "tf.Sub"(%arg2, %[[SUB0]])
@@ -1437,7 +1452,8 @@ func.func @testWhileRegionMultiplePassThroughNonContiguous(%arg0 : tensor<*xf32>
       %sub = "tf.Sub"(%barg3, %one) : (tensor<i32>, tensor<i32>) -> tensor<i32>
       "tf.Yield"(%barg0, %arg1neg, %barg2, %sub) : (tensor<*xf32>, tensor<*xf32>, tensor<*xf32>, tensor<i32>) -> ()
     }
-  ) { is_stateless = false } : (tensor<*xf32>, tensor<*xf32>, tensor<*xf32>, tensor<i32>) -> (tensor<*xf32>, tensor<*xf32>, tensor<*xf32>, tensor<i32>)
+  ) {is_stateless = false, device = "/job:localhost/replica:0/task:0/device:GPU:0"} : (tensor<*xf32>, tensor<*xf32>, tensor<*xf32>, tensor<i32>) -> (tensor<*xf32>, tensor<*xf32>, tensor<*xf32>, tensor<i32>)
+  // CHECK: device = "/job:localhost/replica:0/task:0/device:GPU:0"
 
   // Verify that use of while loop results corresponding to result #0 and 2 of
   // the while are replaces with corresponding WhileRegion operands
@@ -1471,7 +1487,9 @@ func.func @testWhileRegionPassThroughTypeMismatch(%arg0 : tensor<*xf32>, %arg1 :
       %sub = "tf.Sub"(%barg1, %one) : (tensor<i32>, tensor<i32>) -> tensor<i32>
       "tf.Yield"(%barg0, %sub) : (tensor<?x?xf32>, tensor<i32>) -> ()
     }
-  ) { is_stateless = false } : (tensor<*xf32>, tensor<i32>) -> (tensor<*xf32>, tensor<i32>)
+  ) {is_stateless = false, device = "/job:localhost/replica:0/task:0/device:GPU:0"} : (tensor<*xf32>, tensor<i32>) -> (tensor<*xf32>, tensor<i32>)
+  // CHECK: device = "/job:localhost/replica:0/task:0/device:GPU:0"
+
   // Verify that the result stays uchanged
   // CHECK: return %arg0 : tensor<*xf32>
   func.return %0#0 : tensor<*xf32>
@@ -1503,7 +1521,8 @@ func.func @testWhileRegionUnusedValue(%arg0 : tensor<*xf32>, %arg1 : tensor<i32>
       %dummy1 = arith.constant dense<3.0> : tensor<f32>
       "tf.Yield"(%add, %sub, %dummy0, %dummy1) : (tensor<*xf32>, tensor<i32>, tensor<i32>, tensor<f32>) -> ()
     }
-  ) { is_stateless = false } : (tensor<*xf32>, tensor<i32>,  tensor<i32>, tensor<f32>) -> (tensor<*xf32>, tensor<i32>,  tensor<i32>, tensor<f32>)
+  ) {is_stateless = false, device = "/job:localhost/replica:0/task:0/device:GPU:0"} : (tensor<*xf32>, tensor<i32>,  tensor<i32>, tensor<f32>) -> (tensor<*xf32>, tensor<i32>,  tensor<i32>, tensor<f32>)
+  // CHECK: device = "/job:localhost/replica:0/task:0/device:GPU:0"
 
   // Verify that return still uses while result # 0
   // CHECK: return %[[WHILE_OUT]]#0 : tensor<*xf32>
@@ -1531,7 +1550,8 @@ func.func @testWhileRegionExplicitCast(%arg0 : tensor<i32>, %arg1 : tensor<*xi32
       %sub1 = "tf.Sub"(%barg1, %one) : (tensor<i32>, tensor<i32>) -> tensor<i32>
       "tf.Yield"(%sub0, %sub1) : (tensor<i32>, tensor<i32>) -> ()
     }
-  ) { is_stateless = false } : (tensor<i32>, tensor<*xi32>) -> (tensor<i32>, tensor<i32>)
+  ) {is_stateless = false, device = "/job:localhost/replica:0/task:0/device:GPU:0"} : (tensor<i32>, tensor<*xi32>) -> (tensor<i32>, tensor<i32>)
+  // CHECK: device = "/job:localhost/replica:0/task:0/device:GPU:0"
   return %0#0 : tensor<i32>
 }
 
@@ -1556,7 +1576,8 @@ func.func @testWhileRegionPassThroughExplicitCast(%arg0 : tensor<i32>, %arg1 : t
       %sub = "tf.Sub"(%barg0, %one) : (tensor<i32>, tensor<i32>) -> tensor<i32>
       "tf.Yield"(%sub, %barg1) : (tensor<i32>, tensor<i32>) -> ()
     }
-  ) { is_stateless = false } : (tensor<i32>, tensor<*xi32>) -> (tensor<i32>, tensor<i32>)
+  ) {is_stateless = false, device = "/job:localhost/replica:0/task:0/device:GPU:0"} : (tensor<i32>, tensor<*xi32>) -> (tensor<i32>, tensor<i32>)
+  // CHECK: device = "/job:localhost/replica:0/task:0/device:GPU:0"
   // CHECK: return [[CAST1]]
   func.return %0#1 : tensor<i32>
 }
@@ -1578,7 +1599,8 @@ func.func @testWhileRegionPassThroughWithForwarded(%arg0 : tensor<*xf32>, %arg1
       %sub = "tf.Sub"(%barg1, %one) : (tensor<i32>, tensor<i32>) -> tensor<i32>
       "tf.Yield"(%barg0, %sub) : (tensor<*xf32>, tensor<i32>) -> ()
     }
-  ) { is_stateless = false } : (tensor<*xf32>, tensor<i32>) -> (tensor<*xf32>, tensor<i32>)
+  ) {is_stateless = false, device = "/job:localhost/replica:0/task:0/device:GPU:0"} : (tensor<*xf32>, tensor<i32>) -> (tensor<*xf32>, tensor<i32>)
+  // CHECK: device = "/job:localhost/replica:0/task:0/device:GPU:0"
   // CHECK: return %arg0 : tensor<*xf32>
   func.return %0#0 : tensor<*xf32>
 }
@@ -1590,9 +1612,10 @@ func.func private @testIfElse(tensor<*xf32>) -> tensor<*xf32>
 func.func @testIfDropOutputShapes(tensor<i1>, tensor<2xf32>) -> tensor<2xf32> {
 ^bb0(%arg0: tensor<i1>, %arg1: tensor<2xf32>):
   // CHECK: "tf.If"
+  // CHECK-SAME: device = "/job:localhost/replica:0/task:0/device:GPU:0"
   // CHECK-NOT: output_shapes
   %1 = "tf.If"(%arg0, %arg1) {
-    then_branch = @testIfThen, else_branch = @testIfElse, is_stateless = false, output_shapes = [#tf_type.shape<>]
+    then_branch = @testIfThen, else_branch = @testIfElse, is_stateless = false, output_shapes = [#tf_type.shape<>], device = "/job:localhost/replica:0/task:0/device:GPU:0"
   } : (tensor<i1>, tensor<2xf32>) -> tensor<2xf32>
 
   func.return %1 : tensor<2xf32>
@@ -1602,14 +1625,16 @@ func.func @testIfDropOutputShapes(tensor<i1>, tensor<2xf32>) -> tensor<2xf32> {
 func.func @testNMSV3ToNMSV4(%arg0: tensor<3x4xf32>, %arg1: tensor<3xf32>, %arg2: tensor<f32>, %arg3: tensor<f32>) -> tensor<2xi32> {
   %max_size = arith.constant dense<2> : tensor<i32>
   // CHECK: "tf.NonMaxSuppressionV4"
-  %0 = "tf.NonMaxSuppressionV3"(%arg0, %arg1, %max_size, %arg2, %arg3): (tensor<3x4xf32>, tensor<3xf32>, tensor<i32>, tensor<f32>, tensor<f32>) -> (tensor<2xi32>)
+  // CHECK-SAME: device = "/job:localhost/replica:0/task:0/device:GPU:0"
+  %0 = "tf.NonMaxSuppressionV3"(%arg0, %arg1, %max_size, %arg2, %arg3) {device = "/job:localhost/replica:0/task:0/device:GPU:0"} : (tensor<3x4xf32>, tensor<3xf32>, tensor<i32>, tensor<f32>, tensor<f32>) -> (tensor<2xi32>)
   func.return %0 : tensor<2xi32>
 }
 
 // CHECK-LABEL: testFusedBatchNormToBatchNormV3
 func.func @testFusedBatchNormToBatchNormV3(%arg0: tensor<8x8x8x8xf32>, %arg1: tensor<8xf32>, %arg2: tensor<8xf32>, %arg3: tensor<8xf32>, %arg4: tensor<8xf32>) -> (tensor<8x8x8x8xf32>) {
   // CHECK: "tf.FusedBatchNormV3"
-  %0:5 = "tf.FusedBatchNorm"(%arg0, %arg1, %arg2, %arg3, %arg4): (tensor<8x8x8x8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>) -> (tensor<8x8x8x8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32> )
+  // CHECK-SAME: device = "/job:localhost/replica:0/task:0/device:GPU:0"
+  %0:5 = "tf.FusedBatchNorm"(%arg0, %arg1, %arg2, %arg3, %arg4) {device = "/job:localhost/replica:0/task:0/device:GPU:0"} : (tensor<8x8x8x8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>) -> (tensor<8x8x8x8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32> )
   func.return %0#0  : tensor<8x8x8x8xf32>
 }
 
@@ -1665,8 +1690,8 @@ func.func @testVariableToVariableV2() {
 // CHECK-LABEL: testUnpackAndCwiseUnary
 func.func @testUnpackAndCwiseUnary(%arg0: tensor<?x2xf32>) -> (tensor<?xf32>, tensor<?xf32>) {
 
-  // CHECK: %[[NEG:.*]] = "tf.Neg"(%arg0)
-  // CHECK: %[[UNPACK:.*]]:2 = "tf.Unpack"(%[[NEG]])
+  // CHECK: %[[NEG:.*]] = "tf.Neg"(%arg0) {device = ""}
+  // CHECK: %[[UNPACK:.*]]:2 = "tf.Unpack"(%[[NEG]]) {axis = 1 : i64, device = ""}
   %unpacked:2 = "tf.Unpack"(%arg0) {axis = 1 : i64, device = ""}
                 : (tensor<?x2xf32>) -> (tensor<?xf32>, tensor<?xf32>)
   %0 = "tf.Neg"(%unpacked#0): (tensor<?xf32>) -> tensor<?xf32>
@@ -1897,19 +1922,19 @@ func.func @testFoldEnsureShapeOp(%arg0: tensor<10x20xf32>) -> (tensor<10x20xf32>
 
 // CHECK-LABEL: testConvertPackToReshapeAxis0
 func.func @testConvertPackToReshapeAxis0(%arg0: tensor<2x3xf32>) -> tensor<1x2x3xf32> {
-  %0 = "tf.Pack"(%arg0) {axis = 0 : i64, _xla_outside_compilation = "1"} : (tensor<2x3xf32>) -> tensor<1x2x3xf32>
+  %0 = "tf.Pack"(%arg0) {axis = 0 : i64, _xla_outside_compilation = "1", device = "/job:localhost/replica:0/task:0/device:GPU:0"} : (tensor<2x3xf32>) -> tensor<1x2x3xf32>
   func.return %0 : tensor<1x2x3xf32>
   // CHECK: %[[SHAPE:.*]] = "tf.Const"() {value = dense<[1, 2, 3]> : tensor<3xi64>} : () -> tensor<3xi64>
-  // CHECK: %[[RESHAPE:.*]] = "tf.Reshape"(%arg0, %[[SHAPE]]) {_xla_outside_compilation = "1"} : (tensor<2x3xf32>, tensor<3xi64>) -> tensor<1x2x3xf32>
+  // CHECK: %[[RESHAPE:.*]] = "tf.Reshape"(%arg0, %[[SHAPE]]) {_xla_outside_compilation = "1", device = "/job:localhost/replica:0/task:0/device:GPU:0"} : (tensor<2x3xf32>, tensor<3xi64>) -> tensor<1x2x3xf32>
   // CHECK: return %[[RESHAPE]] : tensor<1x2x3xf32>
 }
 
 // CHECK-LABEL: testConvertPackToReshapeAxis1
 func.func @testConvertPackToReshapeAxis1(%arg0: tensor<2x3xf32>) -> tensor<2x1x3xf32> {
-  %0 = "tf.Pack"(%arg0) {axis = 1 : i64} : (tensor<2x3xf32>) -> tensor<2x1x3xf32>
+  %0 = "tf.Pack"(%arg0) {axis = 1 : i64, device = "/job:localhost/replica:0/task:0/device:GPU:0"} : (tensor<2x3xf32>) -> tensor<2x1x3xf32>
   func.return %0 : tensor<2x1x3xf32>
   // CHECK: %[[SHAPE:.*]] = "tf.Const"() {value = dense<[2, 1, 3]> : tensor<3xi64>} : () -> tensor<3xi64>
-  // CHECK: %[[RESHAPE:.*]] = "tf.Reshape"(%arg0, %[[SHAPE]]) : (tensor<2x3xf32>, tensor<3xi64>) -> tensor<2x1x3xf32>
+  // CHECK: %[[RESHAPE:.*]] = "tf.Reshape"(%arg0, %[[SHAPE]]) {device = "/job:localhost/replica:0/task:0/device:GPU:0"} : (tensor<2x3xf32>, tensor<3xi64>) -> tensor<2x1x3xf32>
   // CHECK: return %[[RESHAPE]] : tensor<2x1x3xf32>
 }
 
@@ -1949,6 +1974,7 @@ func.func @while_with_id_passthrough(%arg0: tensor<7xf32> {tf._user_specified_na
       %15 = "tf.Identity"(%arg2) {device = ""} : (tensor<i32>) -> tensor<i32>
       "tf.Yield"(%14, %15, %12, %8) : (tensor<i32>, tensor<i32>, tensor<?xf32>, tensor<1xi32>) -> ()
   }) {_num_original_outputs = 4 : i64, _read_only_resource_inputs = [], _xla_propagate_compile_time_consts = true, device = "", is_stateless = false, parallel_iterations = 10 : i64} : (tensor<i32>, tensor<i32>, tensor<7xf32>, tensor<1xi32>) -> (tensor<i32>, tensor<i32>, tensor<?xf32>, tensor<1xi32>)
+  // CHECK: device = ""
   %7 = "tf.Identity"(%6#2) {device = ""} : (tensor<?xf32>) -> tensor<?xf32>
   func.return %7 : tensor<?xf32>
 }
@@ -2018,16 +2044,16 @@ func.func @testDivNoNanAndMulNoNanWithConstantY(%arg0: tensor<2xf32>) -> (tensor
   // CHECK: %[[CON1:.*]] = "tf.Const"() {value = dense<[1.000000e+00, 2.000000e+00]> : tensor<2xf32>} : () -> tensor<2xf32>
   // CHECK-NEXT: %[[CON2:.*]] = "tf.Const"() {value = dense<[1.000000e+01, 0.000000e+00]> : tensor<2xf32>} : () -> tensor<2xf32>
   // CHECK-NEXT: %[[CON3:.*]] = "tf.Const"() {value = dense<0.000000e+00> : tensor<2xf32>} : () -> tensor<2xf32>
-  // CHECK-NEXT: %[[RES1:.*]] = "tf.Div"(%[[ARG0]], %[[CON1]]) : (tensor<2xf32>, tensor<2xf32>) -> tensor<2xf32>
+  // CHECK-NEXT: %[[RES1:.*]] = "tf.Div"(%[[ARG0]], %[[CON1]]) {device = "/job:localhost/replica:0/task:0/device:GPU:0"} : (tensor<2xf32>, tensor<2xf32>) -> tensor<2xf32>
   // CHECK-NEXT: %[[RES2:.*]] = "tf.MulNoNan"(%[[ARG0]], %[[CON2]]) : (tensor<2xf32>, tensor<2xf32>) -> tensor<2xf32>
   // CHECK-NEXT: return %[[RES1]], %[[RES2]], %[[CON3]] : tensor<2xf32>, tensor<2xf32>, tensor<2xf32>
   %con1 = "tf.Const"() { value = dense<[1.0, 2.0]> : tensor<2xf32> } : () -> tensor<2xf32>
   %con2 = "tf.Const"() { value = dense<[10.0, 0.0]> : tensor<2xf32> } : () -> tensor<2xf32>
   %con3 = "tf.Const"() { value = dense<[0.0, 0.0]> : tensor<2xf32> } : () -> tensor<2xf32>
 
-  %res1 = "tf.DivNoNan"(%arg0, %con1) : (tensor<2xf32>, tensor<2xf32>) -> tensor<2xf32>
-  %res2 = "tf.MulNoNan"(%arg0, %con2) : (tensor<2xf32>, tensor<2xf32>) -> tensor<2xf32>
-  %res3 = "tf.DivNoNan"(%arg0, %con3) : (tensor<2xf32>, tensor<2xf32>) -> tensor<2xf32>
+  %res1 = "tf.DivNoNan"(%arg0, %con1) {device = "/job:localhost/replica:0/task:0/device:GPU:0"} : (tensor<2xf32>, tensor<2xf32>) -> tensor<2xf32>
+  %res2 = "tf.MulNoNan"(%arg0, %con2) {} : (tensor<2xf32>, tensor<2xf32>) -> tensor<2xf32>
+  %res3 = "tf.DivNoNan"(%arg0, %con3) {}: (tensor<2xf32>, tensor<2xf32>) -> tensor<2xf32>
   func.return %res1, %res2, %res3 : tensor<2xf32>, tensor<2xf32>, tensor<2xf32>
 }
 
@@ -2037,7 +2063,7 @@ func.func @testComplexDivNoNanAndMulNoNanWithConstantY(%arg0: tensor<2xcomplex<f
   // CHECK-NEXT: %[[COMP2:.*]] = "tf.Const"() {value = dense<[(0.000000e+00,0.000000e+00), (2.000000e+00,0.000000e+00)]> : tensor<2xcomplex<f32>>} : () -> tensor<2xcomplex<f32>>
   // CHECK-NEXT: %[[COMP1:.*]] = "tf.Const"() {value = dense<[(1.000000e+00,3.000000e+00), (2.000000e+00,4.000000e+00)]> : tensor<2xcomplex<f32>>} : () -> tensor<2xcomplex<f32>>
   // CHECK-NEXT: %[[COMP3:.*]] = "tf.Const"() {value = dense<(0.000000e+00,0.000000e+00)> : tensor<2xcomplex<f32>>} : () -> tensor<2xcomplex<f32>>
-  // CHECK-NEXT: %[[RES1:.*]] = "tf.Mul"(%[[ARG0]], %[[COMP1]]) : (tensor<2xcomplex<f32>>, tensor<2xcomplex<f32>>) -> tensor<2xcomplex<f32>>
+  // CHECK-NEXT: %[[RES1:.*]] = "tf.Mul"(%[[ARG0]], %[[COMP1]]) {device = "/job:localhost/replica:0/task:0/device:GPU:0"} : (tensor<2xcomplex<f32>>, tensor<2xcomplex<f32>>) -> tensor<2xcomplex<f32>>
   // CHECK-NEXT: %[[RES2:.*]] = "tf.DivNoNan"(%[[ARG0]], %[[COMP2]]) : (tensor<2xcomplex<f32>>, tensor<2xcomplex<f32>>) -> tensor<2xcomplex<f32>>
   // CHECK-NEXT: return %[[RES1]], %[[RES2]], %[[COMP3]] : tensor<2xcomplex<f32>>, tensor<2xcomplex<f32>>, tensor<2xcomplex<f32>>
   %con11 = "tf.Const"() { value = dense<[1.0, 2.0]> : tensor<2xf32> } : () -> tensor<2xf32>
@@ -2051,7 +2077,7 @@ func.func @testComplexDivNoNanAndMulNoNanWithConstantY(%arg0: tensor<2xcomplex<f
   %comp2 = "tf.Complex"(%con21, %con22) : (tensor<2xf32>, tensor<2xf32>) -> tensor<2xcomplex<f32>>
   %comp3 = "tf.Complex"(%con31, %con32) : (tensor<2xf32>, tensor<2xf32>) -> tensor<2xcomplex<f32>>
 
-  %res1 = "tf.MulNoNan"(%arg0, %comp1) : (tensor<2xcomplex<f32>>, tensor<2xcomplex<f32>>) -> tensor<2xcomplex<f32>>
+  %res1 = "tf.MulNoNan"(%arg0, %comp1) {device = "/job:localhost/replica:0/task:0/device:GPU:0"} : (tensor<2xcomplex<f32>>, tensor<2xcomplex<f32>>) -> tensor<2xcomplex<f32>>
   %res2 = "tf.DivNoNan"(%arg0, %comp2) : (tensor<2xcomplex<f32>>, tensor<2xcomplex<f32>>) -> tensor<2xcomplex<f32>>
   %res3 = "tf.MulNoNan"(%arg0, %comp3) : (tensor<2xcomplex<f32>>, tensor<2xcomplex<f32>>) -> tensor<2xcomplex<f32>>
   func.return %res1, %res2, %res3 : tensor<2xcomplex<f32>>, tensor<2xcomplex<f32>>, tensor<2xcomplex<f32>>
@@ -2102,16 +2128,16 @@ func.func @testXlaConvToV2(%lhs: tensor<8x4x16x16x16xf32>, %rhs: tensor<4x3x3x16
   %rhs_dilation = "tf.Const"() {value = dense<1> : tensor<3xi32>} : () -> tensor<3xi32>
   %padding = "tf.Const"() {value = dense<0> : tensor<3x2xi32>} : () -> tensor<3x2xi32>
   %strides = "tf.Const"() {value = dense<[3, 1, 1]> : tensor<3xi32>} : () -> tensor<3xi32>
-  // CHECK: "tf.XlaConvV2"(%arg0, %arg1, %cst_3, %cst_2, %cst_0, %cst_1, %cst) {batch_group_count = 1 : i64, dimension_numbers = "\18\03 \042\03\00\01\02@\04P\04Z\03\01\02\03b\03\01\02\03", precision_config = ""} : (tensor<8x4x16x16x16xf32>, tensor<4x3x3x16x16xf32>, tensor<3xi32>, tensor<3x2xi32>, tensor<3xi32>, tensor<3xi32>, tensor<i32>) -> tensor<8x4x14x14x16xf32>
-  %0 = "tf.XlaConv"(%lhs, %rhs, %strides, %padding, %lhs_dilation, %rhs_dilation, %feature_group_count) {dimension_numbers = "\18\03 \042\03\00\01\02@\04P\04Z\03\01\02\03b\03\01\02\03", precision_config = ""} : (tensor<8x4x16x16x16xf32>, tensor<4x3x3x16x16xf32>, tensor<3xi32>, tensor<3x2xi32>, tensor<3xi32>, tensor<3xi32>, tensor<i32>) -> tensor<8x4x14x14x16xf32>
+  // CHECK: "tf.XlaConvV2"(%arg0, %arg1, %cst_3, %cst_2, %cst_0, %cst_1, %cst) {batch_group_count = 1 : i64, device = "/job:localhost/replica:0/task:0/device:GPU:0", dimension_numbers = "\18\03 \042\03\00\01\02@\04P\04Z\03\01\02\03b\03\01\02\03", precision_config = ""} : (tensor<8x4x16x16x16xf32>, tensor<4x3x3x16x16xf32>, tensor<3xi32>, tensor<3x2xi32>, tensor<3xi32>, tensor<3xi32>, tensor<i32>) -> tensor<8x4x14x14x16xf32>
+  %0 = "tf.XlaConv"(%lhs, %rhs, %strides, %padding, %lhs_dilation, %rhs_dilation, %feature_group_count) {dimension_numbers = "\18\03 \042\03\00\01\02@\04P\04Z\03\01\02\03b\03\01\02\03", precision_config = "", device = "/job:localhost/replica:0/task:0/device:GPU:0"} : (tensor<8x4x16x16x16xf32>, tensor<4x3x3x16x16xf32>, tensor<3xi32>, tensor<3x2xi32>, tensor<3xi32>, tensor<3xi32>, tensor<i32>) -> tensor<8x4x14x14x16xf32>
   func.return %0 : tensor<8x4x14x14x16xf32>
 }
 
 
 // CHECK-LABEL: testXlaReduceToXlaVariadicReduceV2
 func.func @testXlaReduceToXlaVariadicReduceV2(%arg0: tensor<*xbf16>, %arg1: tensor<*xbf16>) -> tensor<*xbf16> {
-  // CHECK: "tf.XlaVariadicReduceV2"(%arg0, %arg1) {dimensions_to_reduce = [], operandSegmentSizes = array<i32: 1, 1>, reducer = @sum1} : (tensor<*xbf16>, tensor<*xbf16>) -> tensor<*xbf16>
-  %0 = "tf.XlaReduce"(%arg0, %arg1) {dimensions_to_reduce = [], reducer = @sum1} : (tensor<*xbf16>, tensor<*xbf16>) -> tensor<*xbf16>
+  // CHECK: "tf.XlaVariadicReduceV2"(%arg0, %arg1) {device = "/job:localhost/replica:0/task:0/device:GPU:0", dimensions_to_reduce = [], operandSegmentSizes = array<i32: 1, 1>, reducer = @sum1} : (tensor<*xbf16>, tensor<*xbf16>) -> tensor<*xbf16>
+  %0 = "tf.XlaReduce"(%arg0, %arg1) {dimensions_to_reduce = [], reducer = @sum1, device = "/job:localhost/replica:0/task:0/device:GPU:0"} : (tensor<*xbf16>, tensor<*xbf16>) -> tensor<*xbf16>
   func.return %0 : tensor<*xbf16>
 }
 
@@ -2122,8 +2148,8 @@ func.func private @sum1(%arg0: tensor<*xbf16>, %arg1: tensor<*xbf16>) -> tensor<
 
 // CHECK-LABEL: testXlaVariadicReduceToV2
 func.func @testXlaVariadicReduceToV2(%arg0: tensor<3x4xf32>, %arg1: tensor<f32>) -> tensor<?x?xf32> {
-  // CHECK:  "tf.XlaVariadicReduceV2"(%arg0, %arg1) {dimensions_to_reduce = [], operandSegmentSizes = array<i32: 1, 1>, reducer = @sum2} : (tensor<3x4xf32>, tensor<f32>) -> tensor<?x?xf32>
-  %0 = "tf.XlaVariadicReduce"(%arg0, %arg1) {dimensions_to_reduce = [], reducer = @sum2} : (tensor<3x4xf32>, tensor<f32>) -> tensor<?x?xf32>
+  // CHECK:  "tf.XlaVariadicReduceV2"(%arg0, %arg1) {device = "/job:localhost/replica:0/task:0/device:GPU:0", dimensions_to_reduce = [], operandSegmentSizes = array<i32: 1, 1>, reducer = @sum2} : (tensor<3x4xf32>, tensor<f32>) -> tensor<?x?xf32>
+  %0 = "tf.XlaVariadicReduce"(%arg0, %arg1) {dimensions_to_reduce = [], reducer = @sum2, device = "/job:localhost/replica:0/task:0/device:GPU:0"} : (tensor<3x4xf32>, tensor<f32>) -> tensor<?x?xf32>
   func.return %0 : tensor<?x?xf32>
 }
 func.func private @sum2(%arg0: tensor<*xf32>, %arg1: tensor<*xf32>) -> tensor<*xf32> {
@@ -2200,23 +2226,23 @@ func.func @testReluOfMinimum6ToRelu6Int(%arg0: tensor<4xi32>) -> tensor<4xi32> {
 // CHECK-LABEL: testTensorListGetItem
 func.func @testTensorListGetItem(%arg0: tensor<1600x1x32xf32>, %arg1: tensor<2xi32>, %arg2: tensor<i32>) -> tensor<1x32xf32> {
   %0 = "tf.TensorListFromTensor"(%arg0, %arg1) : (tensor<1600x1x32xf32>, tensor<2xi32>) -> tensor<!tf_type.variant<tensor<1x32xf32>>>
-  %1 = "tf.TensorListGetItem"(%0, %arg2, %arg1) : (tensor<!tf_type.variant<tensor<1x32xf32>>>, tensor<i32>, tensor<2xi32>) -> tensor<1x32xf32>
+  %1 = "tf.TensorListGetItem"(%0, %arg2, %arg1) {device = "/job:localhost/replica:0/task:0/device:GPU:0"} : (tensor<!tf_type.variant<tensor<1x32xf32>>>, tensor<i32>, tensor<2xi32>) -> tensor<1x32xf32>
   func.return %1 : tensor<1x32xf32>
 
   // CHECK: %[[CST:.*]] = "tf.Const"() {value = dense<0> : tensor<i32>} : () -> tensor<i32>
-  // CHECK: %[[RES:.*]] = "tf.GatherV2"(%arg0, %arg2, %cst) {batch_dims = 0 : i64} : (tensor<1600x1x32xf32>, tensor<i32>, tensor<i32>) -> tensor<1x32xf32>
+  // CHECK: %[[RES:.*]] = "tf.GatherV2"(%arg0, %arg2, %cst) {batch_dims = 0 : i64, device = "/job:localhost/replica:0/task:0/device:GPU:0"} : (tensor<1600x1x32xf32>, tensor<i32>, tensor<i32>) -> tensor<1x32xf32>
 }
 
 // CHECK-LABEL: testTensorListGetItemMultipleUsers
 func.func @testTensorListGetItemMultipleUsers(%arg0: tensor<1600x1x32xf32>, %arg1: tensor<2xi32>, %arg2: tensor<i32>, %arg3: tensor<i32>) -> (tensor<1x32xf32>, tensor<1x32xf32>) {
   %0 = "tf.TensorListFromTensor"(%arg0, %arg1) : (tensor<1600x1x32xf32>, tensor<2xi32>) -> tensor<!tf_type.variant<tensor<1x32xf32>>>
-  %1 = "tf.TensorListGetItem"(%0, %arg2, %arg1) : (tensor<!tf_type.variant<tensor<1x32xf32>>>, tensor<i32>, tensor<2xi32>) -> tensor<1x32xf32>
-  %2 = "tf.TensorListGetItem"(%0, %arg3, %arg1) : (tensor<!tf_type.variant<tensor<1x32xf32>>>, tensor<i32>, tensor<2xi32>) -> tensor<1x32xf32>
+  %1 = "tf.TensorListGetItem"(%0, %arg2, %arg1) {device = "/job:localhost/replica:0/task:0/device:GPU:0"} : (tensor<!tf_type.variant<tensor<1x32xf32>>>, tensor<i32>, tensor<2xi32>) -> tensor<1x32xf32>
+  %2 = "tf.TensorListGetItem"(%0, %arg3, %arg1) {device = "/job:localhost/replica:0/task:0/device:GPU:0"} : (tensor<!tf_type.variant<tensor<1x32xf32>>>, tensor<i32>, tensor<2xi32>) -> tensor<1x32xf32>
   func.return %1, %2 : tensor<1x32xf32>, tensor<1x32xf32>
 
   // CHECK: %[[CST:.*]] = "tf.Const"() {value = dense<0> : tensor<i32>} : () -> tensor<i32>
-  // CHECK: %[[RES0:.*]] = "tf.GatherV2"(%arg0, %arg2, %cst) {batch_dims = 0 : i64} : (tensor<1600x1x32xf32>, tensor<i32>, tensor<i32>) -> tensor<1x32xf32>
-  // CHECK: %[[RES1:.*]] = "tf.GatherV2"(%arg0, %arg3, %cst) {batch_dims = 0 : i64} : (tensor<1600x1x32xf32>, tensor<i32>, tensor<i32>) -> tensor<1x32xf32>
+  // CHECK: %[[RES0:.*]] = "tf.GatherV2"(%arg0, %arg2, %cst) {batch_dims = 0 : i64, device = "/job:localhost/replica:0/task:0/device:GPU:0"} : (tensor<1600x1x32xf32>, tensor<i32>, tensor<i32>) -> tensor<1x32xf32>
+  // CHECK: %[[RES1:.*]] = "tf.GatherV2"(%arg0, %arg3, %cst) {batch_dims = 0 : i64, device = "/job:localhost/replica:0/task:0/device:GPU:0"} : (tensor<1600x1x32xf32>, tensor<i32>, tensor<i32>) -> tensor<1x32xf32>
 }
 
 // CHECK-LABEL: testUnaryIdempotent

From 8597a45659de0f0776535adc42769f91ce1000a6 Mon Sep 17 00:00:00 2001
From: Eugene Zhulenev <ezhulenev@google.com>
Date: Thu, 28 Sep 2023 17:12:53 -0700
Subject: [PATCH 406/567] [stream_executor] Add support for tracing into
 command buffers (graph capture)

We should never mix traced command buffers (captured CUDA graphs) with explicitly constructed command buffers (graphs).

Traced command buffers should be recorded into "parent" command buffers as nested submissions.

The public API is structured so that this mistake cannot be made accidentally, and the internal APIs always check their preconditions.

https://github.com/openxla/xla/issues/5755

PiperOrigin-RevId: 569332029
---
 third_party/xla/xla/stream_executor/BUILD     |  2 +
 .../xla/xla/stream_executor/command_buffer.cc | 28 ++++++++++
 .../xla/xla/stream_executor/command_buffer.h  | 42 +++++++++++++--
 .../cuda/cuda_command_buffer_test.cc          | 49 ++++++++++++++++--
 third_party/xla/xla/stream_executor/gpu/BUILD |  3 ++
 .../stream_executor/gpu/gpu_command_buffer.cc | 51 ++++++++++++++++++-
 .../stream_executor/gpu/gpu_command_buffer.h  |  8 +++
 .../stream_executor_internal.h                |  5 ++
 8 files changed, 179 insertions(+), 9 deletions(-)

diff --git a/third_party/xla/xla/stream_executor/BUILD b/third_party/xla/xla/stream_executor/BUILD
index 2fd02c7453e3d0..599a73489333a2 100644
--- a/third_party/xla/xla/stream_executor/BUILD
+++ b/third_party/xla/xla/stream_executor/BUILD
@@ -275,6 +275,8 @@ cc_library(
         ":stream_executor_headers",
         ":stream_executor_internal",
         "//xla/stream_executor/platform",
+        "@com_google_absl//absl/functional:any_invocable",
+        "@com_google_absl//absl/status",
         "@com_google_absl//absl/status:statusor",
         "@local_tsl//tsl/platform:errors",
         "@local_tsl//tsl/platform:status",
diff --git a/third_party/xla/xla/stream_executor/command_buffer.cc b/third_party/xla/xla/stream_executor/command_buffer.cc
index 9438d8c798b51e..1a77ebb11c2faa 100644
--- a/third_party/xla/xla/stream_executor/command_buffer.cc
+++ b/third_party/xla/xla/stream_executor/command_buffer.cc
@@ -19,10 +19,13 @@ limitations under the License.
 #include <memory>
 #include <utility>
 
+#include "absl/functional/any_invocable.h"
+#include "absl/status/status.h"
 #include "xla/stream_executor/kernel.h"
 #include "xla/stream_executor/launch_dim.h"
 #include "xla/stream_executor/stream_executor.h"
 #include "xla/stream_executor/stream_executor_internal.h"
+#include "tsl/platform/errors.h"
 #include "tsl/platform/status.h"
 #include "tsl/platform/statusor.h"
 
@@ -36,6 +39,31 @@ namespace stream_executor {
   return tsl::StatusOr<CommandBuffer>(std::move(command_buffer));
 }
 
+/*static*/ tsl::StatusOr<CommandBuffer> CommandBuffer::Trace(
+    StreamExecutor* executor,
+    absl::AnyInvocable<tsl::Status(Stream*)> function) {
+  Stream stream(executor);
+
+  // TODO(ezhulenev): Keep a dedicated stream for command buffer tracing in the
+  // StreamExecutor itself, and maybe add a StreamPool argument to the traced
+  // function arguments to be able to trace multiple streams simultaneously.
+  stream.Init();
+  if (!stream.ok())
+    return absl::InternalError(
+        "Failed to initialize stream for command buffer tracing");
+
+  // Prepare an empty command buffer instance.
+  TF_ASSIGN_OR_RETURN(CommandBuffer command_buffer,
+                      CommandBuffer::Create(executor));
+
+  // Trace and finalize the command buffer.
+  TF_RETURN_IF_ERROR(command_buffer.implementation()->Trace(
+      &stream, [&]() { return function(&stream); }));
+  TF_RETURN_IF_ERROR(command_buffer.implementation()->Finalize());
+
+  return command_buffer;
+}
+
 CommandBuffer::CommandBuffer(
     std::unique_ptr<internal::CommandBufferInterface> implementation)
     : implementation_(std::move(implementation)) {}
diff --git a/third_party/xla/xla/stream_executor/command_buffer.h b/third_party/xla/xla/stream_executor/command_buffer.h
index e5d5daed064e07..31361638d52d4a 100644
--- a/third_party/xla/xla/stream_executor/command_buffer.h
+++ b/third_party/xla/xla/stream_executor/command_buffer.h
@@ -20,6 +20,7 @@ limitations under the License.
 #include <memory>
 #include <tuple>
 
+#include "absl/functional/any_invocable.h"
 #include "xla/stream_executor/kernel.h"
 #include "xla/stream_executor/launch_dim.h"
 #include "xla/stream_executor/platform/port.h"
@@ -29,6 +30,7 @@ limitations under the License.
 
 namespace stream_executor {
 
+class Stream;
 class StreamExecutor;
 
 namespace internal {
@@ -45,16 +47,36 @@ class CommandBufferInterface;
 // buffers allow to amortize the cost of launching "work" on device by building
 // it on the host ahead of time without expensive interaction with underlying
 // device.
+//
+// TODO(ezhulenev): Add a concept of a "nested" command buffer which can't be
+// submitted on its own, but has to be recorded into the parent command buffer.
+// For the CUDA backend, nested command buffers will never be instantiated into
+// executable graphs, but instead will only have a regular graph instance. We
+// should almost always trace into nested command buffers.
 class CommandBuffer {
  public:
-  explicit CommandBuffer(
-      std::unique_ptr<internal::CommandBufferInterface> implementation);
-
-  CommandBuffer(CommandBuffer&&) = default;
-  CommandBuffer& operator=(CommandBuffer&&) = default;
+  //===--------------------------------------------------------------------===//
+  // Command buffer constructors
+  //===--------------------------------------------------------------------===//
 
+  // Creates a new empty command buffer on the given executor.
   static tsl::StatusOr<CommandBuffer> Create(StreamExecutor* executor);
 
+  // Creates a new command buffer on the given executor by tracing `function`
+  // invocation. All StreamExecutor operations on a Stream argument will be
+  // recorded into the command buffer. The returned command buffer is finalized
+  // and can't be updated.
+  //
+  // Command buffer tracing should be used only when it is impossible to use
+  // explicit construction APIs, e.g. when calling external libraries.
+  static tsl::StatusOr<CommandBuffer> Trace(
+      StreamExecutor* executor,
+      absl::AnyInvocable<tsl::Status(Stream*)> function);
+
+  //===--------------------------------------------------------------------===//
+  // Command buffer API
+  //===--------------------------------------------------------------------===//
+
   // Adds a kernel launch command to the command buffer.
   tsl::Status Launch(const ThreadDim& threads, const BlockDim& blocks,
                      const KernelBase& kernel, const KernelArgsArrayBase& args);
@@ -74,10 +96,20 @@ class CommandBuffer {
                      const ThreadDim& threads, const BlockDim& blocks,
                      Args... args);
 
+  internal::CommandBufferInterface* implementation() {
+    return implementation_.get();
+  }
+
   const internal::CommandBufferInterface* implementation() const {
     return implementation_.get();
   }
 
+  explicit CommandBuffer(
+      std::unique_ptr<internal::CommandBufferInterface> implementation);
+
+  CommandBuffer(CommandBuffer&&) = default;
+  CommandBuffer& operator=(CommandBuffer&&) = default;
+
  private:
   std::unique_ptr<internal::CommandBufferInterface> implementation_;
 
diff --git a/third_party/xla/xla/stream_executor/cuda/cuda_command_buffer_test.cc b/third_party/xla/xla/stream_executor/cuda/cuda_command_buffer_test.cc
index 4e1304df8d8c17..254e8f7a0c4a53 100644
--- a/third_party/xla/xla/stream_executor/cuda/cuda_command_buffer_test.cc
+++ b/third_party/xla/xla/stream_executor/cuda/cuda_command_buffer_test.cc
@@ -24,14 +24,15 @@ limitations under the License.
 #include "xla/stream_executor/platform.h"
 #include "xla/stream_executor/stream.h"
 #include "xla/stream_executor/stream_executor.h"
+#include "tsl/platform/status.h"
 #include "tsl/platform/test.h"
 
 namespace stream_executor::cuda {
 
-TEST(CudaCommandBufferTest, LaunchSingleKernel) {
-  using AddI32Kernel = TypedKernel<DeviceMemory<int32_t>, DeviceMemory<int32_t>,
-                                   DeviceMemory<int32_t>>;
+using AddI32Kernel = TypedKernel<DeviceMemory<int32_t>, DeviceMemory<int32_t>,
+                                 DeviceMemory<int32_t>>;
 
+TEST(CudaCommandBufferTest, LaunchSingleKernel) {
   Platform* platform = MultiPlatformManager::PlatformWithName("CUDA").value();
   StreamExecutor* executor = platform->ExecutorForDevice(0).value();
 
@@ -72,4 +73,46 @@ TEST(CudaCommandBufferTest, LaunchSingleKernel) {
   ASSERT_EQ(dst, expected);
 }
 
+TEST(CudaCommandBufferTest, TraceSingleKernel) {
+  Platform* platform = MultiPlatformManager::PlatformWithName("CUDA").value();
+  StreamExecutor* executor = platform->ExecutorForDevice(0).value();
+
+  Stream stream(executor);
+  stream.Init();
+  ASSERT_TRUE(stream.ok());
+
+  MultiKernelLoaderSpec spec(/*arity=*/3);
+  spec.AddCudaPtxInMemory(internal::kAddI32Kernel, "add");
+
+  AddI32Kernel add(executor);
+  ASSERT_TRUE(executor->GetKernel(spec, &add).ok());
+
+  int64_t length = 4;
+  int64_t byte_length = sizeof(int32_t) * length;
+
+  // Prepare arguments: a=1, b=2, c=0
+  DeviceMemory<int32_t> a = executor->AllocateArray<int32_t>(length, 0);
+  DeviceMemory<int32_t> b = executor->AllocateArray<int32_t>(length, 0);
+  DeviceMemory<int32_t> c = executor->AllocateArray<int32_t>(length, 0);
+
+  stream.ThenMemset32(&a, 1, byte_length);
+  stream.ThenMemset32(&b, 2, byte_length);
+  stream.ThenMemZero(&c, byte_length);
+
+  // Create a command buffer by tracing kernel launch operations.
+  auto cmd_buffer = CommandBuffer::Trace(executor, [&](Stream* stream) {
+    return stream->ThenLaunch(ThreadDim(), BlockDim(4), add, a, b, c);
+  });
+
+  ASSERT_TRUE(cmd_buffer.ok());
+  ASSERT_TRUE(executor->Submit(&stream, *cmd_buffer).ok());
+
+  // Copy data back to host.
+  std::vector<int32_t> dst(4, 42);
+  stream.ThenMemcpy(dst.data(), c, byte_length);
+
+  std::vector<int32_t> expected = {3, 3, 3, 3};
+  ASSERT_EQ(dst, expected);
+}
+
 }  // namespace stream_executor::cuda
diff --git a/third_party/xla/xla/stream_executor/gpu/BUILD b/third_party/xla/xla/stream_executor/gpu/BUILD
index 452a1e46a90b5a..45a34a753289f5 100644
--- a/third_party/xla/xla/stream_executor/gpu/BUILD
+++ b/third_party/xla/xla/stream_executor/gpu/BUILD
@@ -94,8 +94,11 @@ cc_library(
         ":gpu_stream",
         ":gpu_types_header",
         "//xla/stream_executor:stream_executor_headers",
+        "@com_google_absl//absl/functional:any_invocable",
         "@com_google_absl//absl/log",
         "@com_google_absl//absl/log:check",
+        "@com_google_absl//absl/status",
+        "@com_google_absl//absl/strings",
         "@local_tsl//tsl/platform:env",
         "@local_tsl//tsl/platform:errors",
         "@local_tsl//tsl/platform:status",
diff --git a/third_party/xla/xla/stream_executor/gpu/gpu_command_buffer.cc b/third_party/xla/xla/stream_executor/gpu/gpu_command_buffer.cc
index 3867d72f9f3777..11080a80fcd0ff 100644
--- a/third_party/xla/xla/stream_executor/gpu/gpu_command_buffer.cc
+++ b/third_party/xla/xla/stream_executor/gpu/gpu_command_buffer.cc
@@ -18,11 +18,15 @@ limitations under the License.
 #include <atomic>
 #include <cstdint>
 
+#include "absl/functional/any_invocable.h"
 #include "absl/log/check.h"
 #include "absl/log/log.h"
+#include "absl/status/status.h"
+#include "absl/strings/str_cat.h"
 #include "xla/stream_executor/gpu/gpu_driver.h"
 #include "xla/stream_executor/gpu/gpu_executor.h"
 #include "xla/stream_executor/gpu/gpu_kernel.h"
+#include "xla/stream_executor/gpu/gpu_stream.h"
 #include "xla/stream_executor/gpu/gpu_types.h"
 #include "xla/stream_executor/kernel.h"
 #include "xla/stream_executor/launch_dim.h"
@@ -82,10 +86,51 @@ static GpuDevicePtr AsDevicePtr(const DeviceMemoryBase& mem) {
   return reinterpret_cast<GpuDevicePtr>(const_cast<void*>(mem.opaque()));
 }
 
+tsl::Status GpuCommandBuffer::Trace(
+    Stream* stream, absl::AnyInvocable<tsl::Status()> function) {
+  // TODO(ezhulenev): Check that graph is empty, because we should not be mixing
+  // graph tracing with explicit graph construction.
+  TF_RETURN_IF_ERROR(CheckNotFinalized());
+
+  VLOG(5) << "Trace into GPU command buffer graph " << graph_
+          << " on a stream: " << stream->DebugStreamPointers();
+
+  auto gpu_stream = AsGpuStreamValue(stream);
+
+  // Switch stream into the capture mode.
+  uint64_t start_nanos = tsl::Env::Default()->NowNanos();
+  TF_RETURN_IF_ERROR(GpuDriver::StreamBeginCapture(
+      gpu_stream, GpuDriver::StreamCaptureMode::kThreadLocal));
+
+  auto traced = function();
+
+  // Always stop capturing the stream before checking `traced` result.
+  TF_RETURN_IF_ERROR(GpuDriver::StreamEndCapture(gpu_stream, &graph_));
+  uint64_t end_nanos = tsl::Env::Default()->NowNanos();
+
+  if (!traced.ok())
+    return absl::InternalError(
+        absl::StrCat("Failed to capture gpu graph: ", traced.message()));
+
+  VLOG(5) << "Traced into the GPU command buffer graph " << graph_ << " (took "
+          << (end_nanos - start_nanos) / 1000 << " μs)";
+
+  return tsl::OkStatus();
+}
+
+tsl::Status GpuCommandBuffer::CheckNotFinalized() {
+  if (exec_ == nullptr) return tsl::OkStatus();
+
+  return absl::InternalError(
+      "Command buffer can't be updated after it was finalized");
+}
+
 tsl::Status GpuCommandBuffer::Launch(const ThreadDim& threads,
                                      const BlockDim& blocks,
                                      const KernelBase& kernel,
                                      const KernelArgsArrayBase& args) {
+  TF_RETURN_IF_ERROR(CheckNotFinalized());
+
   const GpuKernel* gpu_kernel = AsGpuKernel(&kernel);
   GpuFunctionHandle gpu_func = gpu_kernel->AsGpuFunctionHandle();
 
@@ -103,6 +148,8 @@ tsl::Status GpuCommandBuffer::Launch(const ThreadDim& threads,
 tsl::Status GpuCommandBuffer::MemcpyDeviceToDevice(DeviceMemoryBase* dst,
                                                    const DeviceMemoryBase& src,
                                                    uint64_t size) {
+  TF_RETURN_IF_ERROR(CheckNotFinalized());
+
   GpuGraphNodeHandle node;
   TF_RETURN_IF_ERROR(GpuDriver::GraphAddMemcpyD2DNode(
       parent_->gpu_context(), &node, graph_, {}, AsDevicePtr(*dst),
@@ -112,9 +159,11 @@ tsl::Status GpuCommandBuffer::MemcpyDeviceToDevice(DeviceMemoryBase* dst,
 }
 
 tsl::Status GpuCommandBuffer::Finalize() {
-  uint64_t start_nanos = tsl::Env::Default()->NowNanos();
+  TF_RETURN_IF_ERROR(CheckNotFinalized());
 
   GpuDriver::GraphInstantiateFlags flags;
+
+  uint64_t start_nanos = tsl::Env::Default()->NowNanos();
   TF_RETURN_IF_ERROR(GpuDriver::GraphInstantiate(&exec_, graph_, flags));
   uint64_t end_nanos = tsl::Env::Default()->NowNanos();
 
diff --git a/third_party/xla/xla/stream_executor/gpu/gpu_command_buffer.h b/third_party/xla/xla/stream_executor/gpu/gpu_command_buffer.h
index 9e87adb4fa50d8..8680f0966348bf 100644
--- a/third_party/xla/xla/stream_executor/gpu/gpu_command_buffer.h
+++ b/third_party/xla/xla/stream_executor/gpu/gpu_command_buffer.h
@@ -19,6 +19,7 @@ limitations under the License.
 #include <cstdint>
 #include <type_traits>
 
+#include "absl/functional/any_invocable.h"
 #include "xla/stream_executor/command_buffer.h"
 #include "xla/stream_executor/gpu/gpu_executor.h"
 #include "xla/stream_executor/gpu/gpu_types.h"
@@ -36,6 +37,9 @@ class GpuCommandBuffer : public internal::CommandBufferInterface {
   GpuCommandBuffer(GpuExecutor* parent, GpuGraphHandle graph);
   ~GpuCommandBuffer() override;
 
+  tsl::Status Trace(Stream* stream,
+                    absl::AnyInvocable<tsl::Status()> function) override;
+
   tsl::Status Launch(const ThreadDim& threads, const BlockDim& blocks,
                      const KernelBase& kernel,
                      const KernelArgsArrayBase& args) override;
@@ -67,6 +71,10 @@ class GpuCommandBuffer : public internal::CommandBufferInterface {
   }
 
  private:
+  // Returns OK status if command buffer is not finalized and it is still
+  // possible to add new commands to it, otherwise returns internal error.
+  tsl::Status CheckNotFinalized();
+
   static_assert(std::is_pointer_v<GpuGraphHandle>,
                 "GpuGraphHandle must be a pointer");
   static_assert(std::is_pointer_v<GpuGraphExecHandle>,
diff --git a/third_party/xla/xla/stream_executor/stream_executor_internal.h b/third_party/xla/xla/stream_executor/stream_executor_internal.h
index af79f35ddaa395..1464348b3688fe 100644
--- a/third_party/xla/xla/stream_executor/stream_executor_internal.h
+++ b/third_party/xla/xla/stream_executor/stream_executor_internal.h
@@ -141,6 +141,11 @@ class CommandBufferInterface {
   CommandBufferInterface() = default;
   virtual ~CommandBufferInterface() = default;
 
+  // Traces `function` invocation by recording all operations on the `stream`
+  // into the command buffer. Command buffer must be empty.
+  virtual tsl::Status Trace(Stream* stream,
+                            absl::AnyInvocable<tsl::Status()> function) = 0;
+
   // Adds a kernel launch command to the command buffer.
   virtual tsl::Status Launch(const ThreadDim& threads, const BlockDim& blocks,
                              const KernelBase& kernel,

From 1314592ab06ada38cc339cdd62094e245a2bcdae Mon Sep 17 00:00:00 2001
From: Michael Hudgins <michaelhudgins@google.com>
Date: Thu, 28 Sep 2023 17:21:43 -0700
Subject: [PATCH 407/567] Update ARM64 docker containers to have a target for
 JAX builds and add retry logic for flaky networks

PiperOrigin-RevId: 569333946
---
 ci/official/containers/linux_arm64/Dockerfile | 23 ++++++---
 ci/official/containers/linux_arm64/build.sh   | 49 ++++++++++++-------
 .../builder.devtoolset/build_devtoolset.sh    |  8 +--
 .../linux_arm64/builder.packages.txt          |  1 +
 .../containers/linux_arm64/devel.packages.txt |  1 +
 .../linux_arm64/jax.requirements.txt          | 27 ++++++++++
 .../containers/linux_arm64/setup.packages.sh  |  2 +-
 .../containers/linux_arm64/setup.python.sh    |  3 +-
 8 files changed, 83 insertions(+), 31 deletions(-)
 create mode 100644 ci/official/containers/linux_arm64/jax.requirements.txt

diff --git a/ci/official/containers/linux_arm64/Dockerfile b/ci/official/containers/linux_arm64/Dockerfile
index 55bcb973c6a0f0..f72a8b983e1b3d 100644
--- a/ci/official/containers/linux_arm64/Dockerfile
+++ b/ci/official/containers/linux_arm64/Dockerfile
@@ -6,7 +6,7 @@ FROM ubuntu:20.04 as builder
 COPY setup.packages.sh setup.packages.sh
 COPY builder.packages.txt builder.packages.txt
 RUN /setup.packages.sh /builder.packages.txt
-
+RUN update-ca-certificates
 # Install devtoolset-9 in /dt10 with glibc 2.17 and libstdc++ 4.8, for building
 # manylinux2014-compatible packages.
 COPY builder.devtoolset/fixlinks_aarch64.sh /fixlinks.sh
@@ -14,6 +14,7 @@ COPY builder.devtoolset/rpm-patch.sh /rpm-patch.sh
 COPY builder.devtoolset/build_devtoolset.sh /build_devtoolset.sh
 COPY builder.devtoolset/gcc9-fixups.patch /gcc9-fixups.patch
 COPY builder.devtoolset/stringop_trunc.patch /stringop_trunc.patch
+
 RUN /build_devtoolset.sh devtoolset-10 /dt10
 
 # Build later version of patchelf that is not so buggy
@@ -40,9 +41,9 @@ RUN /setup.sources.sh && /setup.packages.sh /devel.packages.txt
 # - buildifier: clean bazel build deps
 # - buildozer: clean bazel build deps
 RUN git clone --branch v1.7.0 https://github.com/bats-core/bats-core.git && bats-core/install.sh /usr/local && rm -rf bats-core
-RUN wget https://github.com/bazelbuild/bazelisk/releases/download/v1.12.0/bazelisk-linux-arm64 -O /usr/local/bin/bazel && chmod +x /usr/local/bin/bazel
-RUN wget https://github.com/bazelbuild/buildtools/releases/download/4.2.5/buildifier-linux-arm64 -O /usr/local/bin/buildifier && chmod +x /usr/local/bin/buildifier
-RUN wget https://github.com/bazelbuild/buildtools/releases/download/4.2.5/buildozer-linux-arm64 -O /usr/local/bin/buildozer && chmod +x /usr/local/bin/buildozer
+RUN wget --retry-connrefused --waitretry=1 --read-timeout=20 --timeout=15 --tries=5 https://github.com/bazelbuild/bazelisk/releases/download/v1.12.0/bazelisk-linux-arm64 -O /usr/local/bin/bazel && chmod +x /usr/local/bin/bazel
+RUN wget --retry-connrefused --waitretry=1 --read-timeout=20 --timeout=15 --tries=5 https://github.com/bazelbuild/buildtools/releases/download/4.2.5/buildifier-linux-arm64 -O /usr/local/bin/buildifier && chmod +x /usr/local/bin/buildifier
+RUN wget --retry-connrefused --waitretry=1 --read-timeout=20 --timeout=15 --tries=5 https://github.com/bazelbuild/buildtools/releases/download/4.2.5/buildozer-linux-arm64 -O /usr/local/bin/buildozer && chmod +x /usr/local/bin/buildozer
 
 RUN groupadd -g 1001 buildslave && useradd -m -u 1001 -g buildslave buildslave
 RUN mkdir -p /tf/venv
@@ -60,11 +61,19 @@ COPY devel.usertools /usertools
 COPY devel.bashrc /root/.bashrc
 COPY ld.so.conf /dt10/etc/
 
-# Setup Python environment.
+# Setup JAX Python environment.
+FROM devel as jax
+COPY jax.requirements.txt /devel.requirements.txt
 COPY setup.python.sh /setup.python.sh
-COPY devel.requirements.txt /devel.requirements.txt
-RUN /setup.python.sh python3.8 devel.requirements.txt
 RUN /setup.python.sh python3.9 devel.requirements.txt
 RUN /setup.python.sh python3.10 devel.requirements.txt
 RUN /setup.python.sh python3.11 devel.requirements.txt
+RUN /setup.python.sh python3.12 devel.requirements.txt
 
+FROM devel as tf
+# Setup TF Python environment.
+COPY devel.requirements.txt /devel.requirements.txt
+COPY setup.python.sh /setup.python.sh
+RUN /setup.python.sh python3.9 devel.requirements.txt
+RUN /setup.python.sh python3.10 devel.requirements.txt
+RUN /setup.python.sh python3.11 devel.requirements.txt
\ No newline at end of file
diff --git a/ci/official/containers/linux_arm64/build.sh b/ci/official/containers/linux_arm64/build.sh
index dc30a354f20f0d..0010fdff8503b4 100755
--- a/ci/official/containers/linux_arm64/build.sh
+++ b/ci/official/containers/linux_arm64/build.sh
@@ -14,16 +14,16 @@
 # limitations under the License.
 # ==============================================================================
 
-set -e
+set -exo pipefail
 
-is_continuous_or_release() {
-  [[ "$KOKORO_JOB_TYPE" == "CONTINUOUS_INTEGRATION" ]] || [[ "${KOKORO_JOB_TYPE}" == "RELEASE" ]]
+function is_continuous_or_release() {
+  [[ "$KOKORO_JOB_TYPE" == "CONTINUOUS_INTEGRATION" ]] || [[ "$KOKORO_JOB_TYPE" == "RELEASE" ]]
 }
 
 # Move into the directory of the script
 cd "$(dirname "$0")"
 
-if is_continuous_or_release; then
+if is_continuous_or_release || [[ -z "$KOKORO_BUILD_ID" ]]; then
   # A continuous job is the only one to publish to latest
   TAG="latest-multi-python"
 else
@@ -35,18 +35,31 @@ else
   fi
 fi
 
-# IMAGE="gcr.io/tensorflow-sigs/build-arm64:$TAG-$PYVER"
-IMAGE="gcr.io/tensorflow-sigs/build-arm64:$TAG"
-docker pull "$IMAGE" || true
-
-gcloud auth configure-docker
-
-# TODO(michaelhudgins): align with sig build and make it so not every python is
-# being included in a single image
-# --build-arg "PYTHON_VERSION=$PYVER" \
-DOCKER_BUILDKIT=1 docker build \
-  --cache-from "$IMAGE" \
-  --target=devel \
-  -t "$IMAGE"  .
+# Build for both JAX and TF usage.  We do these in one place because they share
+# almost all of the same cache layers
+export DOCKER_BUILDKIT=1
+for target in jax tf; do
+  IMAGE="gcr.io/tensorflow-sigs/build-arm64:$target-$TAG"
+  docker pull "$IMAGE" || true
+  # Due to some flakiness of resources pulled in the build, allow the docker
+  # command to reattempt build a few times in the case of failure (b/302558736)
+  set +e
+  for i in $(seq 1 5)
+  do
+    docker build \
+    --build-arg REQUIREMENTS_FILE=jax.requirements.txt \
+    --target=$target \
+    --cache-from "$IMAGE" \
+    -t "$IMAGE"  . && break
+  done
+  final=$?
+  if [ $final -ne 0 ]; then
+    exit $final
+  fi
+  set -e
 
-docker push "$IMAGE"
+  if [[ -n "$KOKORO_BUILD_ID" ]]; then
+    gcloud auth configure-docker
+    docker push "$IMAGE"
+  fi
+done
diff --git a/ci/official/containers/linux_arm64/builder.devtoolset/build_devtoolset.sh b/ci/official/containers/linux_arm64/builder.devtoolset/build_devtoolset.sh
index b90cb54a2ebe0d..2bba3fe55ac8d0 100755
--- a/ci/official/containers/linux_arm64/builder.devtoolset/build_devtoolset.sh
+++ b/ci/official/containers/linux_arm64/builder.devtoolset/build_devtoolset.sh
@@ -52,7 +52,7 @@ ln -s "/usr/include/aarch64-linux-gnu/asm" "${TARGET}/usr/include/asm"
 mkdir -p glibc-src
 mkdir -p glibc-build
 cd glibc-src
-wget "https://vault.centos.org/centos/7/os/Source/SPackages/glibc-2.17-317.el7.src.rpm"
+wget --retry-connrefused --waitretry=1 --read-timeout=20 --timeout=15 --tries=5 "https://vault.centos.org/centos/7/os/Source/SPackages/glibc-2.17-317.el7.src.rpm"
 rpm2cpio "glibc-2.17-317.el7.src.rpm" |cpio -idmv
 tar -xvzf "glibc-2.17-c758a686.tar.gz" --strip 1
 tar -xvzf "glibc-2.17-c758a686-releng.tar.gz" --strip 1
@@ -77,7 +77,7 @@ sed -i '54i#define TCP_USER_TIMEOUT 18' "/${TARGET}/usr/include/netinet/tcp.h"
 # Download specific version of libstdc++ shared library based on the value of
 # the `VERSION` parameter
   # Download binary libstdc++ 4.8 shared library release
-wget "http://old-releases.ubuntu.com/ubuntu/pool/main/g/gcc-4.8/libstdc++6_4.8.1-10ubuntu8_arm64.deb" && \
+wget --retry-connrefused --waitretry=1 --read-timeout=20 --timeout=15 --tries=5 "http://old-releases.ubuntu.com/ubuntu/pool/main/g/gcc-4.8/libstdc++6_4.8.1-10ubuntu8_arm64.deb" && \
     unar "libstdc++6_4.8.1-10ubuntu8_arm64.deb" && \
     tar -C "${TARGET}" -xvzf "libstdc++6_4.8.1-10ubuntu8_arm64/data.tar.gz" "./usr/lib/aarch64-linux-gnu/libstdc++.so.6.0.18"  && \
     rm -rf "libstdc++6_4.8.1-10ubuntu8_arm64.deb" "libstdc++6_4.8.1-10ubuntu8_arm64"
@@ -88,12 +88,12 @@ cd "${TARGET}-src"
 # Build a devtoolset cross-compiler based on our glibc 2.12/glibc 2.17 sysroot setup.
 case "${VERSION}" in
 devtoolset-9)
-  wget "https://vault.centos.org/centos/7/sclo/Source/rh/devtoolset-9-gcc-9.3.1-2.2.el7.src.rpm"
+  wget --retry-connrefused --waitretry=1 --read-timeout=20 --timeout=15 --tries=5 "https://vault.centos.org/centos/7/sclo/Source/rh/devtoolset-9-gcc-9.3.1-2.2.el7.src.rpm"
   rpm2cpio "devtoolset-9-gcc-9.3.1-2.2.el7.src.rpm" |cpio -idmv
   tar -xvf "gcc-9.3.1-20200408.tar.xz" --strip 1
   ;;
 devtoolset-10)
-  wget "https://vault.centos.org/centos/7/sclo/Source/rh/devtoolset-10-gcc-10.2.1-11.2.el7.src.rpm"
+  wget --retry-connrefused --waitretry=1 --read-timeout=20 --timeout=15 --tries=5 "https://vault.centos.org/centos/7/sclo/Source/rh/devtoolset-10-gcc-10.2.1-11.2.el7.src.rpm"
   rpm2cpio "devtoolset-10-gcc-10.2.1-11.2.el7.src.rpm" |cpio -idmv
   tar -xvf "gcc-10.2.1-20210130.tar.xz" --strip 1
   ;;
diff --git a/ci/official/containers/linux_arm64/builder.packages.txt b/ci/official/containers/linux_arm64/builder.packages.txt
index 3a0d2cd10442b6..dae24d10faac31 100644
--- a/ci/official/containers/linux_arm64/builder.packages.txt
+++ b/ci/official/containers/linux_arm64/builder.packages.txt
@@ -13,6 +13,7 @@ libxml2-dev
 libxmlsec1-dev
 llvm
 make
+openssl
 tk-dev
 wget
 xz-utils
diff --git a/ci/official/containers/linux_arm64/devel.packages.txt b/ci/official/containers/linux_arm64/devel.packages.txt
index 5d208cc825aefa..a8a9cb442c8b0b 100644
--- a/ci/official/containers/linux_arm64/devel.packages.txt
+++ b/ci/official/containers/linux_arm64/devel.packages.txt
@@ -28,6 +28,7 @@ mlocate
 moreutils
 openjdk-11-jdk
 openjdk-11-jre-headless
+openssl
 patchelf
 pkg-config
 python3-dev
diff --git a/ci/official/containers/linux_arm64/jax.requirements.txt b/ci/official/containers/linux_arm64/jax.requirements.txt
new file mode 100644
index 00000000000000..878d229d0f237e
--- /dev/null
+++ b/ci/official/containers/linux_arm64/jax.requirements.txt
@@ -0,0 +1,27 @@
+# JAX requirements, passed into container by defining the ARG 
+# REQUIREMENTS_FILE=jax.requirements.txt
+
+
+setuptools
+wheel
+cloudpickle
+colorama>=0.4.4
+matplotlib
+pillow>=9.1.0
+rich
+absl-py
+portpicker
+six
+opt-einsum
+auditwheel
+typing_extensions
+importlib_metadata>=4.6
+numpy==1.26.0;python_version=="3.12"
+numpy==1.23.4;python_version=="3.11"
+numpy==1.22.4;python_version<"3.11"
+scipy==1.11.2;python_version=="3.12"
+scipy==1.9.2;python_version=="3.11"
+scipy==1.7.3;python_version<"3.11"
+ml_dtypes>=0.2.0
+
+
diff --git a/ci/official/containers/linux_arm64/setup.packages.sh b/ci/official/containers/linux_arm64/setup.packages.sh
index f808cf7d22a7ce..347b853e349385 100755
--- a/ci/official/containers/linux_arm64/setup.packages.sh
+++ b/ci/official/containers/linux_arm64/setup.packages.sh
@@ -25,4 +25,4 @@ export DEBIAN_FRONTEND=noninteractive
 apt-get update
 # Remove commented lines and blank lines
 apt-get install -y --no-install-recommends $(sed -e '/^\s*#.*$/d' -e '/^\s*$/d' "$1" | sort -u)
-rm -rf /var/lib/apt/lists/*
+rm -rf /var/lib/apt/lists/*
\ No newline at end of file
diff --git a/ci/official/containers/linux_arm64/setup.python.sh b/ci/official/containers/linux_arm64/setup.python.sh
index 3401ce03f56089..767cf7d2df7247 100755
--- a/ci/official/containers/linux_arm64/setup.python.sh
+++ b/ci/official/containers/linux_arm64/setup.python.sh
@@ -51,7 +51,8 @@ if [[ ! -f "/usr/local/include/$VERSION" ]]; then
 fi
 
 # Install pip
-curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py
+
+wget --retry-connrefused --waitretry=1 --read-timeout=20 --timeout=15 --tries=5 https://bootstrap.pypa.io/get-pip.py
 /usr/bin/$VERSION get-pip.py
 /usr/bin/$VERSION -m pip install --no-cache-dir --upgrade pip
 

From 0dd7a59c937f61f968a4083de1332f2cb537feba Mon Sep 17 00:00:00 2001
From: Anlun Xu <anlunx@google.com>
Date: Thu, 28 Sep 2023 17:45:25 -0700
Subject: [PATCH 408/567] [xla:gpu] Add CommandBufferScheduling pass

The pass outlines command buffers that contain a single fusion instruction.

https://github.com/openxla/xla/issues/5756

PiperOrigin-RevId: 569338364
---
 third_party/xla/xla/service/gpu/BUILD         | 26 ++++++
 .../service/gpu/command_buffer_scheduling.cc  | 80 +++++++++++++++++++
 .../service/gpu/command_buffer_scheduling.h   | 75 +++++++++++++++++
 .../gpu/command_buffer_scheduling_test.cc     | 58 ++++++++++++++
 4 files changed, 239 insertions(+)
 create mode 100644 third_party/xla/xla/service/gpu/command_buffer_scheduling.cc
 create mode 100644 third_party/xla/xla/service/gpu/command_buffer_scheduling.h
 create mode 100644 third_party/xla/xla/service/gpu/command_buffer_scheduling_test.cc

diff --git a/third_party/xla/xla/service/gpu/BUILD b/third_party/xla/xla/service/gpu/BUILD
index 7d0ab81fdbb0ed..016680c5fec8ce 100644
--- a/third_party/xla/xla/service/gpu/BUILD
+++ b/third_party/xla/xla/service/gpu/BUILD
@@ -2496,6 +2496,32 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "command_buffer_scheduling",
+    srcs = ["command_buffer_scheduling.cc"],
+    hdrs = ["command_buffer_scheduling.h"],
+    visibility = ["//visibility:public"],
+    deps = [
+        "//xla:statusor",
+        "//xla/hlo/ir:hlo",
+        "//xla/service:hlo_pass",
+        "@com_google_absl//absl/container:flat_hash_set",
+        "@com_google_absl//absl/strings",
+        "@local_tsl//tsl/platform:errors",
+        "@local_tsl//tsl/platform:statusor",
+    ],
+)
+
+xla_cc_test(
+    name = "command_buffer_scheduling_test",
+    srcs = ["command_buffer_scheduling_test.cc"],
+    deps = [
+        ":command_buffer_scheduling",
+        "//xla/tests:hlo_test_base",
+        "@com_google_googletest//:gtest_main",
+    ],
+)
+
 cc_library(
     name = "fusion_pipeline",
     srcs = ["fusion_pipeline.cc"],
diff --git a/third_party/xla/xla/service/gpu/command_buffer_scheduling.cc b/third_party/xla/xla/service/gpu/command_buffer_scheduling.cc
new file mode 100644
index 00000000000000..25fe4efe6afd5b
--- /dev/null
+++ b/third_party/xla/xla/service/gpu/command_buffer_scheduling.cc
@@ -0,0 +1,80 @@
+/* Copyright 2023 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "xla/service/gpu/command_buffer_scheduling.h"
+
+#include <cstdint>
+#include <vector>
+
+#include "absl/container/flat_hash_set.h"
+#include "absl/strings/string_view.h"
+#include "xla/hlo/ir/hlo_computation.h"
+#include "xla/hlo/ir/hlo_instruction.h"
+#include "xla/hlo/ir/hlo_instructions.h"
+#include "xla/hlo/ir/hlo_opcode.h"
+#include "xla/statusor.h"
+#include "tsl/platform/errors.h"
+#include "tsl/platform/statusor.h"
+
+namespace xla::gpu {
+
+StatusOr<bool> CommandBufferScheduling::Run(
+    HloModule* module,
+    const absl::flat_hash_set<absl::string_view>& execution_threads) {
+  HloComputation* entry = module->entry_computation();
+  std::vector<HloInstruction*> instructions = entry->MakeInstructionPostOrder();
+
+  // TODO(anlunx): Add support for multiple fusion instructions.
+  // TODO(anlunx): Add support for conditionals and while loops.
+  for (HloInstruction* instruction : instructions) {
+    if (instruction->opcode() != HloOpcode::kFusion) {
+      continue;
+    }
+
+    auto* fusion = static_cast<HloFusionInstruction*>(instruction);
+    auto builder = HloComputation::Builder("command_buffer");
+
+    // Create parameters to the command buffer computation.
+    std::vector<HloInstruction*> parameters;
+    for (int64_t i = 0; i < fusion->operand_count(); i++) {
+      const HloInstruction* operand = fusion->operand(i);
+      TF_ASSIGN_OR_RETURN(HloInstruction * parameter,
+                          builder.AddParameter(HloInstruction::CreateParameter(
+                              i, operand->shape(), "param")));
+      parameters.push_back(parameter);
+    }
+
+    // Create the fusion instruction inside the command buffer.
+    builder.AddInstruction(HloInstruction::CreateFusion(
+        fusion->shape(), fusion->fusion_kind(), parameters,
+        fusion->fused_instructions_computation()));
+
+    HloComputation* command_buffer =
+        module->AddComputationAndUnifyNamesAndIds(builder.Build(),
+                                                  /*is_entry=*/false);
+
+    // Replace the fusion instruction with a call to the command buffer.
+    HloInstruction* call_command_buffer =
+        entry->AddInstruction(HloInstruction::CreateCall(
+            fusion->shape(), fusion->operands(), command_buffer));
+    TF_RETURN_IF_ERROR(call_command_buffer->CopyAllControlDepsFrom(fusion));
+    TF_RETURN_IF_ERROR(fusion->DropAllControlDeps());
+    TF_RETURN_IF_ERROR(entry->ReplaceInstruction(fusion, call_command_buffer));
+  }
+
+  return true;
+}
+
+}  // namespace xla::gpu
diff --git a/third_party/xla/xla/service/gpu/command_buffer_scheduling.h b/third_party/xla/xla/service/gpu/command_buffer_scheduling.h
new file mode 100644
index 00000000000000..69fd3b44114d7b
--- /dev/null
+++ b/third_party/xla/xla/service/gpu/command_buffer_scheduling.h
@@ -0,0 +1,75 @@
+/* Copyright 2023 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef XLA_SERVICE_GPU_COMMAND_BUFFER_SCHEDULING_H_
+#define XLA_SERVICE_GPU_COMMAND_BUFFER_SCHEDULING_H_
+
+#include "absl/container/flat_hash_set.h"
+#include "absl/strings/string_view.h"
+#include "xla/hlo/ir/hlo_module.h"
+#include "xla/service/hlo_pass_interface.h"
+#include "xla/statusor.h"
+
+namespace xla::gpu {
+
+// Lift fusion instructions to command buffers.
+//
+// Before the pass:
+//   %fused_computation (param_0: s32[], param_1: s32[]) -> s32[] {
+//     ...
+//   }
+//
+//   ENTRY %main (a: s32[], b: s32[]) -> s32[] {
+//     %a = s32[] parameter(0)
+//     %b = s32[] parameter(1)
+//     ROOT %fusion = s32[] fusion(s32[] %a, s32[] %b), kind=kLoop,
+//       calls=%fused_computation
+//   }
+//
+// After the pass:
+//   %fused_computation (param_0: s32[], param_1: s32[]) -> s32[] {
+//     ...
+//   }
+//
+//   %command_buffer (param_0: s32[], param_1: s32[]) -> s32[] {
+//     %param_0 = s32[] parameter(0)
+//     %param_1 = s32[] parameter(1)
+//     ROOT %fusion = s32[] fusion(s32[] %param_0, s32[] %param_1), kind=kLoop,
+//       calls=%fused_computation
+//   }
+//
+//   ENTRY %main (a: s32[], b: s32[]) -> s32[] {
+//     %a = s32[] parameter(0)
+//     %b = s32[] parameter(1)
+//     ROOT %call = s32[] call(s32[] %a, s32[] %b), to_apply=%command_buffer
+//  }
+//
+// We currently do not have a command_buffer HLO operation, so we'll start with
+// a kCall op code with an attached HLO computation. We'll consider graduating
+// custom call to a first class operation later.
+class CommandBufferScheduling : public HloModulePass {
+ public:
+  absl::string_view name() const override {
+    return "command-buffer-scheduling";
+  }
+
+  using HloPassInterface::Run;
+  StatusOr<bool> Run(
+      HloModule* module,
+      const absl::flat_hash_set<absl::string_view>& execution_threads) override;
+};
+
+}  // namespace xla::gpu
+
+#endif  // XLA_SERVICE_GPU_COMMAND_BUFFER_SCHEDULING_H_
diff --git a/third_party/xla/xla/service/gpu/command_buffer_scheduling_test.cc b/third_party/xla/xla/service/gpu/command_buffer_scheduling_test.cc
new file mode 100644
index 00000000000000..83b681dbbca135
--- /dev/null
+++ b/third_party/xla/xla/service/gpu/command_buffer_scheduling_test.cc
@@ -0,0 +1,58 @@
+/* Copyright 2023 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "xla/service/gpu/command_buffer_scheduling.h"
+
+#include <gtest/gtest.h>
+#include "xla/tests/hlo_test_base.h"
+
+namespace xla::gpu {
+
+namespace {
+
+class CommandBufferSchedulingTest : public HloTestBase {};
+
+TEST_F(CommandBufferSchedulingTest, SingleFusion) {
+  const char* hlo = R"(
+      HloModule TestModule
+
+      %fused_computation (param_0: s32[], param_1: s32[]) -> s32[] {
+        %p0 = s32[] parameter(0)
+        %p1 = s32[] parameter(1)
+        ROOT %add = s32[] add(s32[] %p0, s32[] %p1)
+      }
+
+      ENTRY %main (a: s32[], b: s32[]) -> s32[] {
+        %a = s32[] parameter(0)
+        %b = s32[] parameter(1)
+        ROOT %fusion = s32[] fusion(s32[] %a, s32[] %b), kind=kLoop, calls=%fused_computation
+      })";
+
+  RunAndFilecheckHloRewrite(hlo, CommandBufferScheduling(), R"(
+// CHECK: %command_buffer (param: s32[], param.1: s32[]) -> s32[] {
+// CHECK:   %param = s32[] parameter(0)
+// CHECK:   %param.1 = s32[] parameter(1)
+// CHECK:   ROOT %fusion.1 = s32[] fusion(%param, %param.1), kind=kLoop, calls=%fused_computation
+// CHECK: }
+//
+// CHECK: ENTRY %main (a: s32[], b: s32[]) -> s32[] {
+// CHECK:   %a = s32[] parameter(0)
+// CHECK:   %b = s32[] parameter(1)
+// CHECK:   ROOT %call = s32[] call(%a, %b), to_apply=%command_buffer
+// CHECK: })");
+}
+
+}  // namespace
+
+}  // namespace xla::gpu

From d85de9d8479c54349824fe9e9bff9eb8d71ec271 Mon Sep 17 00:00:00 2001
From: Eugene Zhulenev <ezhulenev@google.com>
Date: Thu, 28 Sep 2023 17:49:31 -0700
Subject: [PATCH 409/567] [stream_executor] Make stream_executor_headers
 internal and document what's going on in a BUILD file

Clearly document what StreamExecutor clients should depend on, and how BUILD file is structured.

PiperOrigin-RevId: 569339256
---
 third_party/xla/xla/stream_executor/BUILD | 204 +++++++++-------------
 1 file changed, 86 insertions(+), 118 deletions(-)

diff --git a/third_party/xla/xla/stream_executor/BUILD b/third_party/xla/xla/stream_executor/BUILD
index 599a73489333a2..fd229b9c47d8ea 100644
--- a/third_party/xla/xla/stream_executor/BUILD
+++ b/third_party/xla/xla/stream_executor/BUILD
@@ -34,18 +34,31 @@ package_group(
 # StreamExecutor public API
 #===--------------------------------------------------------------------------------------------===#
 
+# If you want to use StreamExecutor you have to depend on the "core" `xla/stream_executor` target
+# defined below which exports StreamExecutor public API headers and also provides implementation.
+
+# StreamExecutor also has a small number of public libraries that do not depend on StreamExecutor
+# itself (e.g. `device_description`), this is mostly a result of StreamExecutor being a hardware
+# abstraction layer for XLA (and Tensorflow).
+
+# StreamExecutor itself is a small abstraction layer on top of platform-specific API
+# implementations (e.g. see `stream_executor/cuda` folder for CUDA-specific details), and should
+# not contribute a lot to binary size or compilation time.
+
 # We bundle headers into filegroups for internal use only (we re-export the same set of headers
 # from multiple targets), and all external clients should depend on one of the public `cc_library`
 # targets that have dependencies required for compiling headers (e.g. absl dependencies). These
 # filegroup roughly correspond to "StreamExecutor components" that are available to the clients.
 
 # These are the headers that constitute StreamExecutor public API. Clients should not depend on
-# this filegroup directly, but instead depend on a `stream_executor_headers` target if they need
-# a header-only dependency (this is a very rare exception when we are building dynamic librarires
-# for open source projects, e.g. Tensorflow, internally at Google we almost always link statically),
-# or usually on a `sream_executor` target that will also link implementation.
+# this filegroup directly, but instead depend on a `stream_executor` target that implements these
+# headers. We also have a header-only `stream_executor_headers` target, but currently this is
+# an implementation detail of StreamExecutor and has internal visibility.
+#
+# TODO(ezhulenev): Remove from public API headers that are exported via standalone public libraries,
+# e.g. `platform` and `multi_platform_manager` should be added with an explicit dependency.
 filegroup(
-    name = "stream_executor_public_headers",
+    name = "stream_executor_api_headers",
     srcs = [
         "allocator_stats.h",
         "command_buffer.h",
@@ -62,6 +75,7 @@ filegroup(
         "kernel_spec.h",
         "launch_dim.h",
         "module_spec.h",
+        "multi_platform_manager.h",
         "numeric_options.h",
         "platform.h",
         "plugin.h",
@@ -89,6 +103,53 @@ filegroup(
     visibility = ["//visibility:public"],
 )
 
+# This is a list of dependencies required for building `stream_executor` target (and required for
+# making `stream_executor_headers` self-contained, which means that you can include any of the
+# public API headers and don't worry about adding dependencies).
+STREAM_EXECUTOR_DEPENDENCIES = [
+    ":allocator_stats",
+    ":device_description_proto_cc",
+    ":dnn_proto_cc",
+    ":host_or_device_scalar",
+    ":multi_platform_manager",
+    "@com_google_absl//absl/algorithm:container",
+    "@com_google_absl//absl/base:core_headers",
+    "@com_google_absl//absl/container:inlined_vector",
+    "@com_google_absl//absl/container:node_hash_map",
+    "@com_google_absl//absl/functional:any_invocable",
+    "@com_google_absl//absl/log:check",
+    "@com_google_absl//absl/memory",
+    "@com_google_absl//absl/status",
+    "@com_google_absl//absl/strings",
+    "@com_google_absl//absl/synchronization",
+    "@com_google_absl//absl/types:optional",
+    "@com_google_absl//absl/types:span",
+    "//xla/stream_executor/platform",
+    "@local_tsl//tsl/framework:device_id",
+    "@local_tsl//tsl/framework:device_type",
+    "@local_tsl//tsl/platform:env",
+    "@local_tsl//tsl/platform:errors",
+    "@local_tsl//tsl/platform:float8",
+    "@local_tsl//tsl/platform:logging",
+    "@local_tsl//tsl/platform:status",
+    "@local_tsl//tsl/platform:statusor",
+    "@local_tsl//tsl/platform:types",
+    "@local_tsl//tsl/protobuf:dnn_proto_cc",
+]
+
+cc_library(
+    name = "stream_executor",
+    hdrs = [
+        ":stream_executor_api_headers",
+        ":stream_executor_plugin_headers",
+    ],
+    visibility = ["//visibility:public"],
+    deps = STREAM_EXECUTOR_DEPENDENCIES + if_static([
+        ":stream_executor_impl",
+        "@com_google_protobuf//:protobuf",  # indirectly-used by dnn.h
+    ]),
+)
+
 #===--------------------------------------------------------------------------------------------===#
 # StreamExecutor public libraries
 #===--------------------------------------------------------------------------------------------===#
@@ -261,6 +322,26 @@ filegroup(
 # StreamExecutor implementation
 #===--------------------------------------------------------------------------------------------===#
 
+# TODO(ezhulenev): We need a clear separation between StreamExecutor "core" (event, stream, etc.),
+# and plugins (FFT, Blas, etc.). We should not be mixing headers and implementation of core
+# libraries with plugins. Today `stream_executor` exports all plugin headers, and we should remove
+# this, however it requires more work to break dependency from "core" to plugin implementation.
+
+# This header-only library is an internal implementation detail of StreamExecutor (to break
+# dependency cycles between internal libraries). All StreamExecutor clients should depend on a
+# regular `stream_executor` target that also provides an implementation for all headers.
+cc_library(
+    name = "stream_executor_headers",
+    hdrs = [
+        ":stream_executor_api_headers",
+        ":stream_executor_plugin_headers",
+    ],
+    visibility = ["//visibility:public"],
+    deps = STREAM_EXECUTOR_DEPENDENCIES + if_static([
+        "@com_google_protobuf//:protobuf",  # indirectly-used by dnn.h
+    ]),
+)
+
 # Targets that implement StreamExecutor APIs are private, and should not be used outside of
 # `stream_executor` package. Clients should depend on `stream_executor` (headers and
 # implementation) or `stream_executor_headers` (only headers, if there is a reason not to link
@@ -404,71 +485,6 @@ cc_library(
 
 #===--------------------------------------------------------------------------------------------===#
 
-# The stream_executor_headers target does not prescribe an implementation.
-cc_library(
-    name = "stream_executor_headers",
-    textual_hdrs = [
-        "allocator_stats.h",
-        "blas.h",
-        "command_buffer.h",
-        "device_description.h",
-        "device_memory.h",
-        "device_memory_allocator.h",
-        "device_options.h",
-        "device_id_utils.h",
-        "dnn.h",
-        "event.h",
-        "executor_cache.h",
-        "fft.h",
-        "kernel.h",
-        "kernel_cache_config.h",
-        "kernel_spec.h",
-        "scratch_allocator.h",
-        "data_type.h",
-        "launch_dim.h",
-        "numeric_options.h",
-        "module_spec.h",
-        "multi_platform_manager.h",
-        "platform.h",
-        "plugin.h",
-        "plugin_registry.h",
-        "stream.h",
-        "stream_executor.h",
-        "stream_executor_internal.h",
-        "stream_executor_pimpl.h",
-        "temporary_device_memory.h",
-        "temporary_memory_manager.h",
-        "trace_listener.h",
-    ],
-    visibility = ["//visibility:public"],
-    deps = [
-        ":allocator_stats",
-        ":device_description_proto_cc",
-        ":dnn_proto_cc",
-        ":host_or_device_scalar",
-        "//xla/stream_executor/platform",
-        "@com_google_absl//absl/algorithm:container",
-        "@com_google_absl//absl/base:core_headers",
-        "@com_google_absl//absl/container:node_hash_map",
-        "@com_google_absl//absl/functional:any_invocable",
-        "@com_google_absl//absl/log:check",
-        "@com_google_absl//absl/memory",
-        "@com_google_absl//absl/strings",
-        "@com_google_absl//absl/synchronization",
-        "@com_google_absl//absl/types:optional",
-        "@com_google_absl//absl/types:span",
-        "@local_tsl//tsl/framework:device_id",
-        "@local_tsl//tsl/framework:device_type",
-        "@local_tsl//tsl/platform:env",
-        "@local_tsl//tsl/platform:errors",
-        "@local_tsl//tsl/platform:float8",
-        "@local_tsl//tsl/platform:logging",
-        "@local_tsl//tsl/platform:status",
-        "@local_tsl//tsl/platform:statusor",
-        "@local_tsl//tsl/platform:types",
-    ],
-)
-
 transitive_hdrs(
     name = "stream_executor_install_hdrs",
     deps = [":stream_executor_headers"],
@@ -527,7 +543,6 @@ cc_library(
     visibility = ["//visibility:public"],
     deps = [
         ":platform",
-        ":stream_executor_headers",
         "//xla/stream_executor/platform",
         "@com_google_absl//absl/base:core_headers",
         "@com_google_absl//absl/container:flat_hash_map",
@@ -685,53 +700,6 @@ cc_library(
     ] + if_static(["@com_google_protobuf//:protobuf"]),
 )
 
-cc_library(
-    name = "stream_executor",
-    textual_hdrs = [
-        "allocator_stats.h",
-        "blas.h",
-        "data_type.h",
-        "device_description.h",
-        "device_memory.h",
-        "device_memory_allocator.h",
-        "device_options.h",
-        "device_id_utils.h",
-        "dnn.h",
-        "event.h",
-        "executor_cache.h",
-        "fft.h",
-        "kernel.h",
-        "kernel_cache_config.h",
-        "kernel_spec.h",
-        "launch_dim.h",
-        "module_spec.h",
-        "multi_platform_manager.h",
-        "scratch_allocator.h",
-        "platform.h",
-        "plugin.h",
-        "plugin_registry.h",
-        "stream.h",
-        "numeric_options.h",
-        "stream_executor.h",
-        "stream_executor_internal.h",
-        "stream_executor_pimpl.h",
-        "temporary_device_memory.h",
-        "temporary_memory_manager.h",
-        "trace_listener.h",
-    ],
-    visibility = ["//visibility:public"],
-    deps = [
-        ":stream_executor_headers",
-        "@com_google_absl//absl/log:check",
-        "@local_tsl//tsl/framework:device_id",
-        "@local_tsl//tsl/framework:device_type",
-        "@local_tsl//tsl/platform:errors",
-        "@local_tsl//tsl/platform:logging",
-        "@local_tsl//tsl/platform:status",
-        "@local_tsl//tsl/platform:types",
-    ] + if_static([":stream_executor_impl"]),
-)
-
 cc_library(
     name = "stream_executor_impl",
     visibility = ["//visibility:public"],

From 24cc7e24b8091abd0b1bc527665e94a1b087d633 Mon Sep 17 00:00:00 2001
From: Pat Notz <patn@google.com>
Date: Thu, 28 Sep 2023 20:17:30 -0700
Subject: [PATCH 410/567] GetMaxIdsAndUniques refactoring and logging

PiperOrigin-RevId: 569365982
---
 .../tpu/kernels/sparse_core_preprocess_ops.cc    | 16 +++++++---------
 .../core/tpu/kernels/sparse_core_xla_ops.cc      | 10 ++++------
 2 files changed, 11 insertions(+), 15 deletions(-)

diff --git a/tensorflow/core/tpu/kernels/sparse_core_preprocess_ops.cc b/tensorflow/core/tpu/kernels/sparse_core_preprocess_ops.cc
index da656554186be5..1dd490a6799229 100644
--- a/tensorflow/core/tpu/kernels/sparse_core_preprocess_ops.cc
+++ b/tensorflow/core/tpu/kernels/sparse_core_preprocess_ops.cc
@@ -168,8 +168,6 @@ class ConvertToCooTensorOp : public OpKernel {
   ConvertToCooTensorOp& operator=(const ConvertToCooTensorOp&) = delete;
 
   void Compute(OpKernelContext* ctx) override {
-    VLOG(1) << "Compute ConvertToCooTensorOp";
-
     const Tensor* indices_or_row_splits;
     OP_REQUIRES_OK(ctx,
                    ctx->input("indices_or_row_splits", &indices_or_row_splits));
@@ -288,7 +286,6 @@ class ConvertToCooTensorOp : public OpKernel {
       std::copy(col_ids.begin(), col_ids.end(), col_ids_tensor_ptr);
       std::copy(gains.begin(), gains.end(), gains_tensor_ptr);
     }
-    VLOG(1) << "Compute ConvertToCooTensorOp done";
   }
 
  private:
@@ -327,8 +324,6 @@ GetMinibatchesInCsrWithPhysicalReplicaOp::
 }
 
 void GetMinibatchesInCsrWithPhysicalReplicaOp::Compute(OpKernelContext* ctx) {
-  VLOG(1) << "Compute GetMinibatchesInCsrWithPhysicalReplicaOp";
-
   const Tensor* row_ids;
   OP_REQUIRES_OK(ctx, ctx->input("row_ids", &row_ids));
   const Tensor* col_ids;
@@ -346,6 +341,9 @@ void GetMinibatchesInCsrWithPhysicalReplicaOp::Compute(OpKernelContext* ctx) {
   const Tensor* program_key_t;
   OP_REQUIRES_OK(ctx, ctx->input("program_key", &program_key_t));
   tstring program_key = program_key_t->vec<tstring>()(0);
+  OP_REQUIRES(ctx, !program_key.empty(),
+              absl::FailedPreconditionError("Expected non-empty program key"));
+
   int64_t per_sparse_core_batch_size = sample_count_ / num_sc_per_chip_;
 
   int64_t max_ids_per_partition = -1;
@@ -396,8 +394,8 @@ void GetMinibatchesInCsrWithPhysicalReplicaOp::Compute(OpKernelContext* ctx) {
           ". But the max minibatches per sparse core is set to be ",
           max_minibatches_per_sc_, " which is smaller.")));
   VLOG(2) << "GetMinibatchesInCsrWithPhysicalReplicaOp: "
-          << "program_key ='" << program_key << "'"
-          << ", table_name = " << table_name_
+          << "program_key = '" << program_key << "'"
+          << ", table_name = '" << table_name_ << "'"
           << ", max_ids = " << max_ids_per_partition
           << ", max_uniques = " << max_unique_ids_per_partition
           << ", num_minibatch_per_sc = " << num_minibatch_per_sc;
@@ -519,8 +517,6 @@ void GetMinibatchesInCsrWithPhysicalReplicaOp::Compute(OpKernelContext* ctx) {
   row_pointers_unpadded_size_tensor->flat<int32>()(0) =
       row_pointers_unpadded_size;
   ids_unpadded_size_tensor->flat<int32>()(0) = ids_unpadded_size;
-
-  VLOG(1) << "Compute GetMinibatchesInCsrWithPhysicalReplicaOp done";
 }
 
 #ifdef LIBTPU_ON_GCE
@@ -557,6 +553,8 @@ void GetMinibatchSplitsWithPhysicalReplicaOp::Compute(OpKernelContext* ctx) {
   const Tensor* program_key_t;
   OP_REQUIRES_OK(ctx, ctx->input("program_key", &program_key_t));
   tstring program_key = program_key_t->vec<tstring>()(0);
+  OP_REQUIRES(ctx, !program_key.empty(),
+              absl::FailedPreconditionError("Expected non-empty program key"));
 
   int32 per_sc_sample_count = sample_count_ / num_sc_per_chip_;
 
diff --git a/tensorflow/core/tpu/kernels/sparse_core_xla_ops.cc b/tensorflow/core/tpu/kernels/sparse_core_xla_ops.cc
index 2033559b64ca0a..abe0871ac318ae 100644
--- a/tensorflow/core/tpu/kernels/sparse_core_xla_ops.cc
+++ b/tensorflow/core/tpu/kernels/sparse_core_xla_ops.cc
@@ -25,7 +25,6 @@ limitations under the License.
 #include "absl/types/span.h"
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
-#include "xla/client/lib/constants.h"
 #include "xla/client/lib/slicing.h"
 #include "xla/client/xla_builder.h"
 #include "xla/client/xla_computation.h"
@@ -269,8 +268,8 @@ class XlaSparseDenseMatmulWithCsrInputOp : public XlaOpKernel {
                                  feature_width, &max_ids_per_partition,
                                  &max_unique_ids_per_partition));
     VLOG(3) << "XlaSparseDenseMatmulWithCsrInputOp: "
-            << "table_name = " << table_name_
-            << ", max_ids = " << max_ids_per_partition
+            << "table_name = '" << table_name_
+            << "', max_ids = " << max_ids_per_partition
             << ", max_uniques = " << max_unique_ids_per_partition;
     OP_REQUIRES(ctx,
                 TensorShapeUtils::IsScalar(ctx->InputShape(
@@ -411,11 +410,10 @@ class XlaSparseDenseMatmulGradWithCsrInputBase : public XlaOpKernel {
                                  feature_width, &max_ids_per_partition,
                                  &max_unique_ids_per_partition));
     VLOG(3) << "XlaSparseDenseMatmulWithCsrInputOp: "
-            << "table_name = " << table_name_
-            << ", max_ids = " << max_ids_per_partition
+            << "table_name = '" << table_name_
+            << "', max_ids = " << max_ids_per_partition
             << ", max_uniques = " << max_unique_ids_per_partition;
 
-
     xla::XlaComputation optimizer = build_optimizer_computation(feature_width);
 
     xla::FrontendAttributes original_frontend_attributes =

From 05ab3513abbcd20263e728f432a789082edbcf71 Mon Sep 17 00:00:00 2001
From: Adrian Kuegel <akuegel@google.com>
Date: Thu, 28 Sep 2023 23:26:58 -0700
Subject: [PATCH 411/567] Also run PostSchedulingCopyInsertion in
 CompileModuleToLLVMIrImpl.

We want to run this always after scheduling, but so far we only run it in the
AssignBuffers() function. CompileModuleToLLVMIrImpl is on the regular
compilation path.

PiperOrigin-RevId: 569395560
---
 third_party/xla/xla/service/gpu/BUILD         |  3 +
 .../xla/xla/service/gpu/gpu_compiler.cc       |  2 +
 .../xla/xla/service/gpu/gpu_compiler.h        |  2 +-
 .../xla/xla/service/gpu/nvptx_compiler.cc     |  2 +-
 .../xla/xla/service/gpu/nvptx_compiler.h      |  2 +-
 .../xla/service/gpu/nvptx_compiler_test.cc    | 87 +++++++++++++++++++
 6 files changed, 95 insertions(+), 3 deletions(-)

diff --git a/third_party/xla/xla/service/gpu/BUILD b/third_party/xla/xla/service/gpu/BUILD
index 016680c5fec8ce..b412f4bced6ade 100644
--- a/third_party/xla/xla/service/gpu/BUILD
+++ b/third_party/xla/xla/service/gpu/BUILD
@@ -2932,11 +2932,13 @@ xla_cc_test(
         "requires-gpu-sm70",
     ],
     deps = [
+        ":gpu_compiler",
         ":nvptx_compiler_impl",
         "//xla:statusor",
         "//xla:util",
         "//xla:xla_proto_cc",
         "//xla/hlo/ir:hlo",
+        "//xla/hlo/utils:hlo_query",
         "//xla/service:backend",
         "//xla/service:buffer_assignment",
         "//xla/service:gpu_plugin",
@@ -2944,6 +2946,7 @@ xla_cc_test(
         "//xla/tests:xla_internal_test_main",
         "@com_google_absl//absl/strings",
         "@com_google_googletest//:gtest",
+        "@local_tsl//tsl/lib/core:status_test_util",
         "@local_tsl//tsl/platform:statusor",
     ],
 )
diff --git a/third_party/xla/xla/service/gpu/gpu_compiler.cc b/third_party/xla/xla/service/gpu/gpu_compiler.cc
index 4688c09be1fa5f..d0e99e475eb69f 100644
--- a/third_party/xla/xla/service/gpu/gpu_compiler.cc
+++ b/third_party/xla/xla/service/gpu/gpu_compiler.cc
@@ -1747,6 +1747,8 @@ StatusOr<std::unique_ptr<AotCompilationResult>> GpuCompiler::Export(
 
 Status GpuCompiler::RunPostSchedulingPipelines(
     HloModule* module, int64_t scheduler_mem_limit) const {
+  TF_RETURN_IF_ERROR(
+      RunPostSchedulingCopyInsertion(module, GetCanShareBuffer()));
   {
     HloPassPipeline pipeline("post-scheduling-passes");
 
diff --git a/third_party/xla/xla/service/gpu/gpu_compiler.h b/third_party/xla/xla/service/gpu/gpu_compiler.h
index 765df534908ffe..8deac1552da5ae 100644
--- a/third_party/xla/xla/service/gpu/gpu_compiler.h
+++ b/third_party/xla/xla/service/gpu/gpu_compiler.h
@@ -234,7 +234,7 @@ class GpuCompiler : public LLVMCompiler {
       se::dnn::VersionInfo dnn_version,
       se::DeviceMemoryAllocator* device_allocator) = 0;
 
-  virtual HloDataflowAnalysis::CanShareBuffer GetCanShareBuffer() {
+  virtual HloDataflowAnalysis::CanShareBuffer GetCanShareBuffer() const {
     return &FusionCanShareBufferHint;
   }
 
diff --git a/third_party/xla/xla/service/gpu/nvptx_compiler.cc b/third_party/xla/xla/service/gpu/nvptx_compiler.cc
index 433422cda60bf2..5ea8134ff5dc20 100644
--- a/third_party/xla/xla/service/gpu/nvptx_compiler.cc
+++ b/third_party/xla/xla/service/gpu/nvptx_compiler.cc
@@ -465,7 +465,7 @@ NVPTXCompiler::NVPTXCompiler()
     : GpuCompiler(stream_executor::cuda::kCudaPlatformId, nvptx::TargetTriple(),
                   nvptx::DataLayout()) {}
 
-HloDataflowAnalysis::CanShareBuffer NVPTXCompiler::GetCanShareBuffer() {
+HloDataflowAnalysis::CanShareBuffer NVPTXCompiler::GetCanShareBuffer() const {
   return &CanShareBufferHint;
 }
 
diff --git a/third_party/xla/xla/service/gpu/nvptx_compiler.h b/third_party/xla/xla/service/gpu/nvptx_compiler.h
index 6598e0f232408d..c31f1cc21609e9 100644
--- a/third_party/xla/xla/service/gpu/nvptx_compiler.h
+++ b/third_party/xla/xla/service/gpu/nvptx_compiler.h
@@ -69,7 +69,7 @@ class NVPTXCompiler : public GpuCompiler {
   Status SerializeAutotuneResultsToFile(
       const DebugOptions& debug_options) override;
 
-  HloDataflowAnalysis::CanShareBuffer GetCanShareBuffer() override;
+  HloDataflowAnalysis::CanShareBuffer GetCanShareBuffer() const override;
 
   StatusOr<std::pair<std::string, std::vector<uint8_t>>> CompileTargetBinary(
       const HloModuleConfig& module_config, llvm::Module* llvm_module,
diff --git a/third_party/xla/xla/service/gpu/nvptx_compiler_test.cc b/third_party/xla/xla/service/gpu/nvptx_compiler_test.cc
index 9e61390635a471..f32502deb60759 100644
--- a/third_party/xla/xla/service/gpu/nvptx_compiler_test.cc
+++ b/third_party/xla/xla/service/gpu/nvptx_compiler_test.cc
@@ -20,16 +20,38 @@ limitations under the License.
 #include <gtest/gtest.h>
 #include "absl/strings/string_view.h"
 #include "xla/hlo/ir/hlo_instruction.h"
+#include "xla/hlo/utils/hlo_query.h"
 #include "xla/service/backend.h"
 #include "xla/service/buffer_assignment.h"
+#include "xla/service/gpu/gpu_compiler.h"
 #include "xla/statusor.h"
 #include "xla/tests/hlo_test_base.h"
 #include "xla/util.h"
 #include "xla/xla.pb.h"
+#include "tsl/lib/core/status_test_util.h"
 #include "tsl/platform/statusor.h"
 
 namespace xla {
 namespace gpu {
+namespace {
+
+int64_t CountCopies(const HloComputation& computation) {
+  int64_t count = 0;
+  for (const auto& instruction : computation.instructions()) {
+    if (instruction->opcode() == HloOpcode::kCopy) {
+      count++;
+    }
+  }
+  return count;
+}
+
+int64_t CountCopies(const HloModule& module) {
+  int64_t count = 0;
+  for (const auto& computation : module.computations()) {
+    count += CountCopies(*computation);
+  }
+  return count;
+}
 
 class NVPTXCompilerTest : public HloTestBase {
  public:
@@ -125,5 +147,70 @@ ENTRY e {
   )");
 }
 
+TEST_F(NVPTXCompilerTest, RemovesUnnecessaryCopyInPostSchedulingPipelines) {
+  const absl::string_view hlo_text = R"(
+HloModule all_gather_overlapping, is_scheduled=true
+
+condition {
+  input_tuple = (f32[1,128], f32[2,128], pred[]) parameter(0)
+  ROOT cond = pred[] get-tuple-element(input_tuple), index=2
+}
+
+body {
+  c0 = f32[] constant(0)
+  splat_c0 = f32[1,128] broadcast(c0), dimensions={}
+  input_tuple = (f32[1,128], f32[2,128], pred[]) parameter(0)
+  param_0 = f32[1,128] get-tuple-element(input_tuple), index=0
+  add = f32[1,128] add(splat_c0, param_0)
+  param_1 = f32[2,128] get-tuple-element(input_tuple), index=1
+
+  c1_s32 = s32[] constant(1)
+  c0_s32 = s32[] constant(0)
+  dynamic-slice = f32[1,128] dynamic-slice(param_1, c1_s32, c0_s32), dynamic_slice_sizes={1,128}
+
+  // If a schedule was chosen where the all-gather and the dynamic-slice are not
+  // intertwined, we can get rid of the copy.
+  all-gather-start = (f32[1,128], f32[2,128]) all-gather-start(add), channel_id=1337, replica_groups={{0,1}}, dimensions={0}, use_global_device_ids=true
+  all-gather-done = f32[2,128] all-gather-done(all-gather-start)
+  copy = f32[2,128] copy(all-gather-done)
+
+  cond = pred[] get-tuple-element(input_tuple), index=2
+  ROOT output_tuple = (f32[1,128], f32[2,128], pred[]) tuple(dynamic-slice, copy, cond)
+}
+
+ENTRY main {
+  param_0 = f32[1,128] parameter(0)
+  param_1 = f32[2,128] parameter(1)
+  param_2 = pred[] parameter(2)
+  copy_param_0 = f32[1,128] copy(param_0)
+  copy_param_1 = f32[2,128] copy(param_1)
+  tuple = (f32[1,128], f32[2,128], pred[]) tuple(copy_param_0, copy_param_1, param_2)
+  while = (f32[1,128], f32[2,128], pred[]) while(tuple), condition=condition, body=body
+  get-tuple-element = f32[1,128]{1,0} get-tuple-element((f32[1,128]{1,0}, f32[2,128]{1,0}, pred[]) while), index=0
+  get-tuple-element.1 = f32[2,128]{1,0} get-tuple-element((f32[1,128]{1,0}, f32[2,128]{1,0}, pred[]) while), index=1
+  get-tuple-element.2 = pred[] get-tuple-element((f32[1,128]{1,0}, f32[2,128]{1,0}, pred[]) while), index=2
+  copy.3 = pred[] copy(pred[] get-tuple-element.2)
+  ROOT tuple.2 = (f32[1,128]{1,0}, f32[2,128]{1,0}, pred[]) tuple(f32[1,128]{1,0} get-tuple-element, f32[2,128]{1,0} get-tuple-element.1, pred[] copy.3)
+}
+)";
+  auto module = ParseAndReturnVerifiedModule(hlo_text).value();
+  EXPECT_EQ(CountCopies(*module), 4);
+
+  const HloInstruction* while_op = hlo_query::GetFirstInstructionWithOpcode(
+      *module->entry_computation(), HloOpcode::kWhile);
+  EXPECT_EQ(while_op->while_body()->root_instruction()->operand(1)->opcode(),
+            HloOpcode::kCopy);
+
+  NVPTXCompiler compiler;
+  TF_EXPECT_OK(compiler.RunPostSchedulingPipelines(module.get(), 100000));
+  EXPECT_EQ(CountCopies(*module), 3);
+  while_op = hlo_query::GetFirstInstructionWithOpcode(
+      *module->entry_computation(), HloOpcode::kWhile);
+  // Make sure that the copy of AllGatherDone has been removed.
+  EXPECT_EQ(while_op->while_body()->root_instruction()->operand(1)->opcode(),
+            HloOpcode::kAllGatherDone);
+}
+
+}  // namespace
 }  // namespace gpu
 }  // namespace xla

From 81d69f4a69d7ad7921eea635597d4421f78ba656 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 29 Sep 2023 02:02:15 -0700
Subject: [PATCH 412/567] compat: Update forward compatibility horizon to
 2023-09-29

PiperOrigin-RevId: 569427815
---
 tensorflow/python/compat/compat.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/python/compat/compat.py b/tensorflow/python/compat/compat.py
index 052bd01a1e1e4a..ba6dcd40265a78 100644
--- a/tensorflow/python/compat/compat.py
+++ b/tensorflow/python/compat/compat.py
@@ -29,7 +29,7 @@
 # This value changes every day with an automatic CL. It can be modified in code
 # via `forward_compatibility_horizon()` or with the environment variable
 # TF_FORWARD_COMPATIBILITY_DELTA_DAYS, which is added to the compatibility date.
-_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2023, 9, 28)
+_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2023, 9, 29)
 _FORWARD_COMPATIBILITY_DELTA_DAYS_VAR_NAME = "TF_FORWARD_COMPATIBILITY_DELTA_DAYS"
 _FORWARD_COMPATIBILITY_DATE_NUMBER = None
 

From 14e4c41bf1eaa595f24e88ab94c54b60304117a4 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 29 Sep 2023 02:03:12 -0700
Subject: [PATCH 413/567] Update GraphDef version to 1634.

PiperOrigin-RevId: 569428007
---
 tensorflow/core/public/version.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/core/public/version.h b/tensorflow/core/public/version.h
index 414b97873f5f23..08f4929b65a299 100644
--- a/tensorflow/core/public/version.h
+++ b/tensorflow/core/public/version.h
@@ -108,7 +108,7 @@ limitations under the License.
 
 #define TF_GRAPH_DEF_VERSION_MIN_PRODUCER 0
 #define TF_GRAPH_DEF_VERSION_MIN_CONSUMER 0
-#define TF_GRAPH_DEF_VERSION 1633  // Updated: 2023/9/28
+#define TF_GRAPH_DEF_VERSION 1634  // Updated: 2023/9/29
 
 // Checkpoint compatibility versions (the versions field in SavedSliceMeta).
 //

From 8886b41b2c50cf1caebcf96e7c0a750c2e8ea3e2 Mon Sep 17 00:00:00 2001
From: kushanam <kahmadian@nvidia.com>
Date: Fri, 29 Sep 2023 03:42:29 -0700
Subject: [PATCH 414/567] PR #5300: A new pass to optimize the
 AllGather->Binary_Op order sequence

Imported from GitHub PR https://github.com/openxla/xla/pull/5300

This is a new GPU SPMD optimization pass for the following pattern:
binary-op(all-gather(a), all-gather(b))
to
all-gather(binary-op(a, b))

Copybara import of the project:

--
198c4b2b8b8c155b50a5643e960366bdb51aece0 by kushanam <kahmadian@nvidia.com>:

adding a new pass to optimize reduce_scatter->all_gather->binary_op sequence

--
8f8cc822229f1c6a54c969188240d5b2a421e9ee by kushanam <kahmadian@nvidia.com>:

applying review refactors

--
1beffbb5ecc007aa729b2c20e39ec95a00a73fd8 by kushanam <kahmadian@nvidia.com>:

removing reduce-scatter from the all-gather optimization

--
993f3d66e1a75b774e39aa3a55134f841d132df5 by kushanam <kahmadian@nvidia.com>:

remove traversal all-gather search and rely on immediate parent

--
c7a3bea5846220dc49ae0087039e5ad77fd308c7 by kushanam <kahmadian@nvidia.com>:

remove extra gpu word from the directive

--
2b9afd129a7759ba4e59df461b9d8d06033f7649 by kushanam <kahmadian@nvidia.com>:

fixing for disabled SPMD partitioning

--
ee412cbbbb65f322d337dd522c17414a3c6afbd6 by kushanam <kahmadian@nvidia.com>:

deferring node removal and fixing the corresponding tests

Merging this change closes #5300

PiperOrigin-RevId: 569445810
---
 .../xla/xla/service/collective_ops_utils.cc   |  18 ++
 .../xla/xla/service/collective_ops_utils.h    |   4 +
 third_party/xla/xla/service/gpu/BUILD         |  19 ++
 .../service/gpu/gpu_all_gather_optimizer.cc   | 103 +++++++++
 .../service/gpu/gpu_all_gather_optimizer.h    |  42 ++++
 .../xla/xla/service/gpu/gpu_compiler.cc       |   2 +
 third_party/xla/xla/service/gpu/tests/BUILD   |  16 ++
 .../tests/gpu_all_gather_optimizer_test.cc    | 213 ++++++++++++++++++
 8 files changed, 417 insertions(+)
 create mode 100644 third_party/xla/xla/service/gpu/gpu_all_gather_optimizer.cc
 create mode 100644 third_party/xla/xla/service/gpu/gpu_all_gather_optimizer.h
 create mode 100644 third_party/xla/xla/service/gpu/tests/gpu_all_gather_optimizer_test.cc

diff --git a/third_party/xla/xla/service/collective_ops_utils.cc b/third_party/xla/xla/service/collective_ops_utils.cc
index 0c09d5532349d1..3402f9a73b9f4d 100644
--- a/third_party/xla/xla/service/collective_ops_utils.cc
+++ b/third_party/xla/xla/service/collective_ops_utils.cc
@@ -467,6 +467,24 @@ bool ReplicaGroupsOrthogonal(absl::Span<const ReplicaGroup> first,
   return true;
 }
 
+bool ReplicaGroupsEqual(absl::Span<const ReplicaGroup> first,
+                        absl::Span<const ReplicaGroup> second) {
+  if (first.size() != second.size()) {
+    return false;
+  }
+  for (int64_t i = 0; i < first.size(); ++i) {
+    if (first[i].replica_ids_size() != second[i].replica_ids_size()) {
+      return false;
+    }
+    for (int j = 0; j < first[i].replica_ids_size(); ++j) {
+      if (first[i].replica_ids(j) != second[i].replica_ids(j)) {
+        return false;
+      }
+    }
+  }
+  return true;
+}
+
 bool IsCollective(const HloInstruction* instruction) {
   switch (instruction->opcode()) {
     case HloOpcode::kAllReduce:
diff --git a/third_party/xla/xla/service/collective_ops_utils.h b/third_party/xla/xla/service/collective_ops_utils.h
index da57e3f67b2eb6..28a538833ef00c 100644
--- a/third_party/xla/xla/service/collective_ops_utils.h
+++ b/third_party/xla/xla/service/collective_ops_utils.h
@@ -147,6 +147,10 @@ StatusOr<std::vector<GlobalDeviceId>> GetParticipatingDevices(
 bool ReplicaGroupsOrthogonal(absl::Span<const ReplicaGroup> first,
                              absl::Span<const ReplicaGroup> second);
 
+// Returns true if the two lists of replica groups are identical (same order).
+bool ReplicaGroupsEqual(absl::Span<const ReplicaGroup> first,
+                        absl::Span<const ReplicaGroup> second);
+
 // A custom call target that can be used to create a nop that can legally
 // replace a collective op.
 inline constexpr absl::string_view kNopCustomCallTarget = "AllocateBuffer";
diff --git a/third_party/xla/xla/service/gpu/BUILD b/third_party/xla/xla/service/gpu/BUILD
index b412f4bced6ade..49ada8a5603e3a 100644
--- a/third_party/xla/xla/service/gpu/BUILD
+++ b/third_party/xla/xla/service/gpu/BUILD
@@ -2424,6 +2424,23 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "gpu_all_gather_optimizer",
+    srcs = ["gpu_all_gather_optimizer.cc"],
+    hdrs = ["gpu_all_gather_optimizer.h"],
+    visibility = ["//visibility:public"],
+    deps = [
+        "//xla:statusor",
+        "//xla/hlo/ir:hlo",
+        "//xla/service:collective_ops_utils",
+        "//xla/service:hlo_pass",
+        "@com_google_absl//absl/container:flat_hash_set",
+        "@com_google_absl//absl/strings",
+        "@local_tsl//tsl/platform:errors",
+        "@local_tsl//tsl/platform:logging",
+    ],
+)
+
 cc_library(
     name = "gpu_float_support",
     srcs = ["gpu_float_support.cc"],
@@ -2625,6 +2642,7 @@ cc_library(
         ":gpu_hlo_schedule",
         ":gpu_layout_assignment",
         ":gpu_reduce_scatter_creator",
+        ":gpu_all_gather_optimizer",
         ":gpu_sanitize_constant_names",
         ":gpu_scatter_expander",
         ":gpu_shape_verifier",
@@ -4080,6 +4098,7 @@ test_suite(
         "//xla/service/gpu/tests:gemm_broadcast_folding_rewrite_test",
         "//xla/service/gpu/tests:gemm_rewrite_test",
         "//xla/service/gpu/tests:gpu_alignment_test",
+        "//xla/service/gpu/tests:gpu_all_gather_optimizer_test",
         "//xla/service/gpu/tests:gpu_atomic_test",
         "//xla/service/gpu/tests:gpu_compilation_parallelism_test",
         "//xla/service/gpu/tests:gpu_convolution_regression_test",
diff --git a/third_party/xla/xla/service/gpu/gpu_all_gather_optimizer.cc b/third_party/xla/xla/service/gpu/gpu_all_gather_optimizer.cc
new file mode 100644
index 00000000000000..29f52ffc7da620
--- /dev/null
+++ b/third_party/xla/xla/service/gpu/gpu_all_gather_optimizer.cc
@@ -0,0 +1,103 @@
+/* Copyright 2023 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "xla/service/gpu/gpu_all_gather_optimizer.h"
+
+#include <cstdint>
+#include <utility>
+
+#include "absl/container/flat_hash_set.h"
+#include "absl/strings/string_view.h"
+#include "xla/hlo/ir/hlo_casting_utils.h"
+#include "xla/hlo/ir/hlo_instruction.h"
+#include "xla/hlo/ir/hlo_instructions.h"
+#include "xla/hlo/ir/hlo_module.h"
+#include "xla/hlo/ir/hlo_opcode.h"
+#include "xla/service/collective_ops_utils.h"
+#include "xla/statusor.h"
+#include "tsl/platform/errors.h"
+#include "tsl/platform/logging.h"
+
+namespace xla {
+namespace gpu {
+
+StatusOr<bool> AllGatherOptimizer::Run(
+    HloModule* module,
+    const absl::flat_hash_set<absl::string_view>& execution_threads) {
+  bool changed = false;
+  for (HloComputation* computation :
+       module->MakeNonfusionComputations(execution_threads)) {
+    for (HloInstruction* instruction :
+         computation->MakeInstructionPostOrder()) {
+      if (!HloOpcodeIsBinaryCommutative(instruction->opcode())) {
+        continue;
+      }
+
+      HloInstruction* left_op = instruction->mutable_operand(0);
+      HloInstruction* right_op = instruction->mutable_operand(1);
+
+      if (right_op->opcode() != HloOpcode::kAllGather ||
+          left_op->opcode() != HloOpcode::kAllGather) {
+        VLOG(2) << "Binary op's operands are not all-gather deduced types.";
+        continue;
+      }
+
+      auto* left_all_gather = Cast<HloAllGatherInstruction>(left_op);
+      auto* right_all_gather = Cast<HloAllGatherInstruction>(right_op);
+
+      if (right_all_gather->constrain_layout() !=
+              left_all_gather->constrain_layout() ||
+          right_all_gather->use_global_device_ids() !=
+              left_all_gather->use_global_device_ids() ||
+          !ReplicaGroupsEqual(right_all_gather->replica_groups(),
+                              left_all_gather->replica_groups())) {
+        VLOG(2) << "The right and left all-gather ops are not compatible "
+                   "to merge. ";
+        continue;
+      }
+
+      if (right_all_gather->user_count() != 1 ||
+          left_all_gather->user_count() != 1) {
+        VLOG(2) << "all-gather user_count > 1 ";
+        continue;
+      }
+      auto index_in_full_shape =
+          computation->AddInstruction(HloInstruction::CreateBinary(
+              right_all_gather->operand(0)->shape(), instruction->opcode(),
+              left_all_gather->mutable_operand(0),
+              right_all_gather->mutable_operand(0)));
+
+      int64_t all_gather_dimension =
+          Cast<HloAllGatherInstruction>(right_all_gather)
+              ->all_gather_dimension();
+
+      auto combined = HloInstruction::CreateAllGather(
+          left_all_gather->shape(), {index_in_full_shape}, all_gather_dimension,
+          left_all_gather->replica_groups(),
+          /*constrain_layout=*/false, left_all_gather->channel_id(),
+          Cast<HloAllGatherInstruction>(left_all_gather)
+              ->use_global_device_ids());
+
+      TF_RETURN_IF_ERROR(computation->ReplaceWithNewInstruction(
+          instruction, std::move(combined)));
+      changed = true;
+    }
+  }
+
+  return changed;
+}
+
+}  // namespace gpu
+}  // namespace xla
diff --git a/third_party/xla/xla/service/gpu/gpu_all_gather_optimizer.h b/third_party/xla/xla/service/gpu/gpu_all_gather_optimizer.h
new file mode 100644
index 00000000000000..717c6088f06e4b
--- /dev/null
+++ b/third_party/xla/xla/service/gpu/gpu_all_gather_optimizer.h
@@ -0,0 +1,42 @@
+/* Copyright 2023 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef XLA_SERVICE_GPU_GPU_ALL_GATHER_OPTIMIZER_H_
+#define XLA_SERVICE_GPU_GPU_ALL_GATHER_OPTIMIZER_H_
+
+#include "xla/service/hlo_pass_interface.h"
+
+namespace xla {
+namespace gpu {
+
+// Transforms binary_op(all-gather(a), all-gather(b)) into
+// all-gather(binary_op(a, b)) when binary_op is commutative and both
+// all-gathers are compatible (e.g. same replica groups) with a single user.
+
+class AllGatherOptimizer : public HloModulePass {
+ public:
+  AllGatherOptimizer() = default;
+  absl::string_view name() const override { return "all-gather-optimizer"; }
+
+  using HloPassInterface::Run;
+  StatusOr<bool> Run(
+      HloModule* module,
+      const absl::flat_hash_set<absl::string_view>& execution_threads) override;
+};
+
+}  // namespace gpu
+}  // namespace xla
+
+#endif  // XLA_SERVICE_GPU_GPU_ALL_GATHER_OPTIMIZER_H_
diff --git a/third_party/xla/xla/service/gpu/gpu_compiler.cc b/third_party/xla/xla/service/gpu/gpu_compiler.cc
index d0e99e475eb69f..21374a6e7f29bf 100644
--- a/third_party/xla/xla/service/gpu/gpu_compiler.cc
+++ b/third_party/xla/xla/service/gpu/gpu_compiler.cc
@@ -99,6 +99,7 @@ limitations under the License.
 #include "xla/service/gpu/gemm_broadcast_folding_rewriter.h"
 #include "xla/service/gpu/gemm_rewriter.h"
 #include "xla/service/gpu/gemm_rewriter_triton.h"
+#include "xla/service/gpu/gpu_all_gather_optimizer.h"
 #include "xla/service/gpu/gpu_async_collective_annotator.h"
 #include "xla/service/gpu/gpu_constants.h"
 #include "xla/service/gpu/gpu_conv_rewriter.h"
@@ -615,6 +616,7 @@ Status GpuCompiler::OptimizeHloModule(HloModule* hlo_module,
     HloPassPipeline collectives_pipeline("collective-optimizations");
     collectives_pipeline.AddPass<AllReduceFolder>();
     collectives_pipeline.AddPass<ReduceScatterCreator>();
+    collectives_pipeline.AddPass<AllGatherOptimizer>();
     collectives_pipeline.AddPass<AllReduceReassociate>(
         debug_options.xla_gpu_enable_reassociation_for_converted_ar());
     collectives_pipeline.AddPass<ReduceScatterReassociate>();
diff --git a/third_party/xla/xla/service/gpu/tests/BUILD b/third_party/xla/xla/service/gpu/tests/BUILD
index 5cd1ba4a5c87ee..cd8e689d5b2428 100644
--- a/third_party/xla/xla/service/gpu/tests/BUILD
+++ b/third_party/xla/xla/service/gpu/tests/BUILD
@@ -108,6 +108,22 @@ xla_cc_test(
     ],
 )
 
+xla_cc_test(
+    name = "gpu_all_gather_optimizer_test",
+    srcs = ["gpu_all_gather_optimizer_test.cc"],
+    deps = [
+        "//xla:statusor",
+        "//xla:util",
+        "//xla/hlo/ir:hlo",
+        "//xla/service:hlo_module_config",
+        "//xla/service/gpu:gpu_all_gather_optimizer",
+        "//xla/tests:hlo_test_base",
+        "//xla/tests:xla_internal_test_main",
+        "@com_google_absl//absl/strings",
+        "@local_tsl//tsl/platform:test",
+    ],
+)
+
 xla_cc_test(
     name = "gpu_spmd_e2e_compile_test",
     size = "small",
diff --git a/third_party/xla/xla/service/gpu/tests/gpu_all_gather_optimizer_test.cc b/third_party/xla/xla/service/gpu/tests/gpu_all_gather_optimizer_test.cc
new file mode 100644
index 00000000000000..0b097c9a2323d7
--- /dev/null
+++ b/third_party/xla/xla/service/gpu/tests/gpu_all_gather_optimizer_test.cc
@@ -0,0 +1,213 @@
+/* Copyright 2023 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "xla/service/gpu/gpu_all_gather_optimizer.h"
+
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+#include <utility>
+
+#include "absl/strings/string_view.h"
+#include "xla/hlo/ir/hlo_module.h"
+#include "xla/hlo/ir/hlo_opcode.h"
+#include "xla/service/hlo_module_config.h"
+#include "xla/statusor.h"
+#include "xla/tests/hlo_test_base.h"
+#include "xla/util.h"
+#include "tsl/platform/test.h"
+
+namespace xla {
+namespace gpu {
+namespace {
+
+class GpuAllGatherOptimizerTest : public HloTestBase {
+ public:
+  // Parses `hlo_module` under a config with the given replica and partition
+  // counts, runs AllGatherOptimizer and checks its reported change bit.
+  StatusOr<std::unique_ptr<HloModule>> RunPass(absl::string_view hlo_module,
+                                               int64_t num_replicas,
+                                               int64_t num_partitions,
+                                               bool expect_change) {
+    HloModuleConfig module_config = GetModuleConfigForTest(
+        /*replica_count=*/num_replicas,
+        /*num_partitions=*/num_partitions);
+    module_config.set_use_spmd_partitioning(num_partitions > 1);
+    TF_ASSIGN_OR_RETURN(
+        auto module, ParseAndReturnVerifiedModule(hlo_module, module_config));
+
+    TF_ASSIGN_OR_RETURN(bool changed, AllGatherOptimizer().Run(module.get()));
+    EXPECT_EQ(changed, expect_change);
+    return StatusOr<std::unique_ptr<HloModule>>(std::move(module));
+  }
+
+  // Counts entry-computation instructions accepted by HloPredicateIsOp<oc>.
+  template <HloOpcode oc>
+  size_t CollectiveCount(std::unique_ptr<HloModule> &module) {
+    return absl::c_count_if(module->entry_computation()->instructions(),
+                            HloPredicateIsOp<oc>);
+  }
+};
+
+TEST_F(GpuAllGatherOptimizerTest, BranchesOptimized) {
+  absl::string_view hlo_string = R"(
+HloModule ReduceScatter
+
+add {
+  x = bf16[] parameter(0)
+  y = bf16[] parameter(1)
+  ROOT add = bf16[] add(x, y)
+}
+
+ENTRY main {
+param.1 = bf16[8,128,1024]{2,1,0} parameter(0)
+param.2 = bf16[8,128,1024]{2,1,0} parameter(1)
+reduce-scatter.1 = bf16[8,64,1024]{2,1,0} reduce-scatter(param.1), channel_id=8, replica_groups={{0,1},{2,3},{4,5},{6,7}}, use_global_device_ids=true, dimensions={1}, to_apply=add
+all-gather.1 = bf16[8,128,1024]{2,1,0} all-gather(reduce-scatter.1), channel_id=5, replica_groups={{0,1},{2,3},{4,5},{6,7}}, dimensions={1}, use_global_device_ids=true
+reduce-scatter.2 = bf16[8,64,1024]{2,1,0} reduce-scatter(param.2), channel_id=9, replica_groups={{0,1},{2,3},{4,5},{6,7}}, use_global_device_ids=true, dimensions={1}, to_apply=add
+all-gather.2 = bf16[8,128,1024]{2,1,0} all-gather(reduce-scatter.2), channel_id=5, replica_groups={{0,1},{2,3},{4,5},{6,7}}, dimensions={1}, use_global_device_ids=true
+add.1 = bf16[8,128,1024]{2,1,0} add(all-gather.1, all-gather.2)
+}
+)";
+
+  TF_ASSERT_OK_AND_ASSIGN(auto module, RunPass(hlo_string,
+                                               /*num_replicas=*/8,
+                                               /*num_partitions=*/1,
+                                               /*expect_change=*/true));
+  // graph should contain 1 all-gather but since the node removal piece
+  // is deferred, they still exist at this stage
+  EXPECT_EQ(CollectiveCount<HloOpcode::kAllGather>(module), 3);
+  EXPECT_EQ(CollectiveCount<HloOpcode::kReduceScatter>(module), 2);
+}
+
+TEST_F(GpuAllGatherOptimizerTest, DisbledSPMDPartitioningJAXBug) {
+  absl::string_view hlo_string = R"(
+HloModule pjit_f, entry_computation_layout={(f32[4,8]{1,0}, f32[4,8]{1,0})->f32[8,8]{1,0}}
+
+ENTRY %main.6_spmd (param: f32[4,8], param.1: f32[4,8]) -> f32[8,8] {
+  %param = f32[4,8]{1,0} parameter(0), sharding={devices=[2,1]<=[2]}
+  %all-gather = f32[8,8]{1,0} all-gather(f32[4,8]{1,0} %param), channel_id=1, replica_groups={{0,1}}, dimensions={0}, use_global_device_ids=true, metadata={op_name="pjit(f)/jit(main)/add" source_file="third_party/py/jax/tests/pjit_test.py" source_line=207}
+  %param.1 = f32[4,8]{1,0} parameter(1), sharding={devices=[2,1]<=[2]}
+  %all-gather.1 = f32[8,8]{1,0} all-gather(f32[4,8]{1,0} %param.1), channel_id=2, replica_groups={{0,1}}, dimensions={0}, use_global_device_ids=true, metadata={op_name="pjit(f)/jit(main)/add" source_file="third_party/py/jax/tests/pjit_test.py" source_line=207}
+  ROOT %add.0 = f32[8,8]{1,0} add(f32[8,8]{1,0} %all-gather, f32[8,8]{1,0} %all-gather.1), metadata={op_name="pjit(f)/jit(main)/add" source_file="third_party/py/jax/tests/pjit_test.py" source_line=207}
+}
+)";
+
+  TF_ASSERT_OK_AND_ASSIGN(auto module, RunPass(hlo_string,
+                                               /*num_replicas=*/1,
+                                               /*num_partitions=*/2,
+                                               /*expect_change=*/true));
+  EXPECT_EQ(CollectiveCount<HloOpcode::kAllGather>(module), 1);
+}
+
+TEST_F(GpuAllGatherOptimizerTest, MoreThanSingleUserForAllGather) {
+  absl::string_view hlo_string = R"(
+HloModule ReduceScatter
+
+add {
+  x = bf16[] parameter(0)
+  y = bf16[] parameter(1)
+  ROOT add = bf16[] add(x, y)
+}
+
+ENTRY main {
+param.1 = bf16[8,128,1024]{2,1,0} parameter(0)
+param.2 = bf16[8,128,1024]{2,1,0} parameter(1)
+param.3 = bf16[8,128,1024]{2,1,0} parameter(2)
+reduce-scatter.1 = bf16[8,64,1024]{2,1,0} reduce-scatter(param.1), channel_id=8, replica_groups={{0,1},{2,3},{4,5},{6,7}}, use_global_device_ids=true, dimensions={1}, to_apply=add
+all-gather.1 = bf16[8,128,1024]{2,1,0} all-gather(reduce-scatter.1), channel_id=5, replica_groups={{0,1},{2,3},{4,5},{6,7}}, dimensions={1}, use_global_device_ids=true
+reduce-scatter.2 = bf16[8,64,1024]{2,1,0} reduce-scatter(param.2), channel_id=9, replica_groups={{0,1},{2,3},{4,5},{6,7}}, use_global_device_ids=true, dimensions={1}, to_apply=add
+all-gather.2 = bf16[8,128,1024]{2,1,0} all-gather(reduce-scatter.2), channel_id=5, replica_groups={{0,1},{2,3},{4,5},{6,7}}, dimensions={1}, use_global_device_ids=true
+reduce-scatter.3 = bf16[8,64,1024]{2,1,0} reduce-scatter(param.3), channel_id=9, replica_groups={{0,1},{2,3},{4,5},{6,7}}, use_global_device_ids=true, dimensions={1}, to_apply=add
+all-gather.3 = bf16[8,128,1024]{2,1,0} all-gather(reduce-scatter.3), channel_id=5, replica_groups={{0,1},{2,3},{4,5},{6,7}}, dimensions={1}, use_global_device_ids=true
+add.1 = bf16[8,128,1024]{2,1,0} add(all-gather.1, all-gather.3)
+add.2 = bf16[8,128,1024]{2,1,0} add(all-gather.1, all-gather.2)
+}
+)";
+
+  TF_ASSERT_OK_AND_ASSIGN(auto module, RunPass(hlo_string,
+                                               /*num_replicas=*/8,
+                                               /*num_partitions=*/1,
+                                               /*expect_change=*/false));
+  // see the comment for BranchesOptimized test
+  EXPECT_EQ(CollectiveCount<HloOpcode::kAllGather>(module), 3);
+  EXPECT_EQ(CollectiveCount<HloOpcode::kReduceScatter>(module), 3);
+}
+
+TEST_F(GpuAllGatherOptimizerTest, AllGatherWithOpInBetweenOnRightBranch) {
+  absl::string_view hlo_string = R"(
+HloModule ReduceScatter
+
+add {
+  x = bf16[] parameter(0)
+  y = bf16[] parameter(1)
+  ROOT add = bf16[] add(x, y)
+}
+
+ENTRY main {
+param.1 = bf16[8,128,1024]{2,1,0} parameter(0)
+param.2 = bf16[8,128,1024]{2,1,0} parameter(1)
+param.3 = bf16[8,128,1024]{2,1,0} parameter(2)
+reduce-scatter.1 = bf16[8,64,1024]{2,1,0} reduce-scatter(param.1), channel_id=8, replica_groups={{0,1},{2,3},{4,5},{6,7}}, use_global_device_ids=true, dimensions={1}, to_apply=add
+reduce-scatter.2 = bf16[8,64,1024]{2,1,0} reduce-scatter(param.2), channel_id=9, replica_groups={{0,1},{2,3},{4,5},{6,7}}, use_global_device_ids=true, dimensions={1}, to_apply=add
+add.1 = bf16[8,64,1024]{2,1,0} add(reduce-scatter.1, reduce-scatter.2)
+all-gather.1 = bf16[8,128,1024]{2,1,0} all-gather(add.1), channel_id=5, replica_groups={{0,1},{2,3},{4,5},{6,7}}, dimensions={1}, use_global_device_ids=true
+reduce-scatter.3 = bf16[8,64,1024]{2,1,0} reduce-scatter(param.3), channel_id=9, replica_groups={{0,1},{2,3},{4,5},{6,7}}, use_global_device_ids=true, dimensions={1}, to_apply=add
+all-gather.3 = bf16[8,128,1024]{2,1,0} all-gather(reduce-scatter.3), channel_id=5, replica_groups={{0,1},{2,3},{4,5},{6,7}}, dimensions={1}, use_global_device_ids=true
+add.2 = bf16[8,128,1024]{2,1,0} add(all-gather.1, all-gather.3)
+}
+)";
+
+  TF_ASSERT_OK_AND_ASSIGN(auto module, RunPass(hlo_string,
+                                               /*num_replicas=*/8,
+                                               /*num_partitions=*/1,
+                                               /*expect_change=*/true));
+  EXPECT_EQ(CollectiveCount<HloOpcode::kAllGather>(module), 3);
+  EXPECT_EQ(CollectiveCount<HloOpcode::kReduceScatter>(module), 3);
+}
+
+TEST_F(GpuAllGatherOptimizerTest, AllGatherOneSided) {
+  absl::string_view hlo_string = R"(
+HloModule ReduceScatter
+
+add {
+  x = bf16[] parameter(0)
+  y = bf16[] parameter(1)
+  ROOT add = bf16[] add(x, y)
+}
+
+ENTRY main {
+param.1 = bf16[8,128,1024]{2,1,0} parameter(0)
+param.2 = bf16[8,128,1024]{2,1,0} parameter(1)
+param.3 = bf16[8,128,1024]{2,1,0} parameter(2)
+
+add.1 = bf16[8,128,1024]{2,1,0} add(param.1, param.2)
+reduce-scatter = bf16[8,64,1024]{2,1,0} reduce-scatter(param.3), channel_id=9, replica_groups={{0,1},{2,3},{4,5},{6,7}}, use_global_device_ids=true, dimensions={1}, to_apply=add
+all-gather = bf16[8,128,1024]{2,1,0} all-gather(reduce-scatter), channel_id=5, replica_groups={{0,1},{2,3},{4,5},{6,7}}, dimensions={1}, use_global_device_ids=true
+add.2 = bf16[8,128,1024]{2,1,0} add(all-gather, add.1)
+}
+)";
+
+  TF_ASSERT_OK_AND_ASSIGN(auto module, RunPass(hlo_string,
+                                               /*num_replicas=*/8,
+                                               /*num_partitions=*/1,
+                                               /*expect_change=*/false));
+  EXPECT_EQ(CollectiveCount<HloOpcode::kAllGather>(module), 1);
+  EXPECT_EQ(CollectiveCount<HloOpcode::kReduceScatter>(module), 1);
+}
+
+}  // namespace
+}  // namespace gpu
+}  // namespace xla

From 9ec9136eb1612d8bdc365719ddf1d9698db04053 Mon Sep 17 00:00:00 2001
From: Boian Petkantchin <boian@nod-labs.com>
Date: Fri, 29 Sep 2023 04:06:22 -0700
Subject: [PATCH 415/567] PR #5491: Add frontend attributes to HLO module

Imported from GitHub PR https://github.com/openxla/xla/pull/5491

This is a PR that addresses #5424.
Copybara import of the project:

--
6cf274695c9bdbee263c8e136abd5c2a6279593e by Boian Petkantchin <boian@nod-labs.com>:

Add frontend attributes to HLO module

--
72424758e56e2480fe256f7de5305bd18b6a0ac0 by Boian Petkantchin <boian@nod-labs.com>:

Fix license year in new files (squash)

Merging this change closes #5491

PiperOrigin-RevId: 569450110
---
 third_party/xla/xla/hlo/ir/BUILD              |  2 +
 .../xla/xla/hlo/ir/hlo_frontend_attributes.cc | 44 +++++++++++++++++++
 .../xla/xla/hlo/ir/hlo_frontend_attributes.h  | 28 ++++++++++++
 third_party/xla/xla/hlo/ir/hlo_instruction.cc | 16 +------
 third_party/xla/xla/hlo/ir/hlo_instruction.h  |  2 -
 third_party/xla/xla/hlo/ir/hlo_module.cc      | 18 +++++++-
 third_party/xla/xla/hlo/ir/hlo_module.h       | 17 +++++++
 third_party/xla/xla/service/hlo.proto         |  3 ++
 .../xla/xla/service/hlo_module_test.cc        | 16 +++++++
 third_party/xla/xla/service/hlo_parser.cc     |  6 +++
 .../xla/xla/service/hlo_parser_test.cc        | 20 +++++++++
 .../hlo_to_mhlo/hlo_module_importer.cc        | 19 ++++++++
 .../hlo_to_mhlo/hlo_module_importer.h         |  3 ++
 .../hlo_to_mhlo/tests/module_attributes.hlo   |  7 +++
 .../translate/mhlo_to_hlo/mlir_hlo_to_hlo.cc  |  5 +++
 .../mhlo_to_hlo/tests/module_attributes.mlir  | 14 ++++++
 16 files changed, 201 insertions(+), 19 deletions(-)
 create mode 100644 third_party/xla/xla/hlo/ir/hlo_frontend_attributes.cc
 create mode 100644 third_party/xla/xla/hlo/ir/hlo_frontend_attributes.h

diff --git a/third_party/xla/xla/hlo/ir/BUILD b/third_party/xla/xla/hlo/ir/BUILD
index 8bda94ab187feb..1e43109946bf89 100644
--- a/third_party/xla/xla/hlo/ir/BUILD
+++ b/third_party/xla/xla/hlo/ir/BUILD
@@ -21,6 +21,7 @@ cc_library(
         "dfs_hlo_visitor.cc",
         "dynamic_parameter_binding.cc",
         "hlo_computation.cc",
+        "hlo_frontend_attributes.cc",
         "hlo_input_output_alias_config.cc",
         "hlo_instruction.cc",
         "hlo_instructions.cc",
@@ -40,6 +41,7 @@ cc_library(
         "hlo_clone_context.h",
         "hlo_computation.h",
         "hlo_domain_metadata.h",
+        "hlo_frontend_attributes.h",
         "hlo_input_output_alias_config.h",
         "hlo_instruction.h",
         "hlo_instructions.h",
diff --git a/third_party/xla/xla/hlo/ir/hlo_frontend_attributes.cc b/third_party/xla/xla/hlo/ir/hlo_frontend_attributes.cc
new file mode 100644
index 00000000000000..f5ae28dcf15c50
--- /dev/null
+++ b/third_party/xla/xla/hlo/ir/hlo_frontend_attributes.cc
@@ -0,0 +1,44 @@
+/* Copyright 2023 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "xla/hlo/ir/hlo_frontend_attributes.h"
+
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "absl/algorithm/container.h"
+#include "absl/strings/str_cat.h"
+#include "absl/strings/str_format.h"
+#include "absl/strings/str_join.h"
+
+namespace xla {
+
+std::string FrontendAttributesToString(
+    const FrontendAttributes& frontend_attributes) {
+  // Sort a copy of the map entries, then render them as a brace-wrapped,
+  // comma-separated list of attribute="value" pairs,
+  // e.g., frontend_attributes={name="value_a",type="int32_t"}.
+  const auto& attribute_map = frontend_attributes.map();
+  std::vector<std::pair<std::string, std::string>> entries(
+      attribute_map.begin(), attribute_map.end());
+  absl::c_sort(entries);
+  const auto append_entry = [](std::string* out, const auto& entry) {
+    absl::StrAppend(out, entry.first, "=\"", entry.second, "\"");
+  };
+  return absl::StrFormat("{%s}", absl::StrJoin(entries, ",", append_entry));
+}
+
+}  // namespace xla
diff --git a/third_party/xla/xla/hlo/ir/hlo_frontend_attributes.h b/third_party/xla/xla/hlo/ir/hlo_frontend_attributes.h
new file mode 100644
index 00000000000000..7e38d15806dced
--- /dev/null
+++ b/third_party/xla/xla/hlo/ir/hlo_frontend_attributes.h
@@ -0,0 +1,28 @@
+/* Copyright 2023 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef XLA_HLO_IR_HLO_FRONTEND_ATTRIBUTES_H_
+#define XLA_HLO_IR_HLO_FRONTEND_ATTRIBUTES_H_
+
+#include <string>
+
+#include "xla/xla_data.pb.h"
+
+namespace xla {
+std::string FrontendAttributesToString(
+    const FrontendAttributes& frontend_attributes);
+}  // namespace xla
+
+#endif  // XLA_HLO_IR_HLO_FRONTEND_ATTRIBUTES_H_
diff --git a/third_party/xla/xla/hlo/ir/hlo_instruction.cc b/third_party/xla/xla/hlo/ir/hlo_instruction.cc
index 2da288ff163b88..c9ae190747e81f 100644
--- a/third_party/xla/xla/hlo/ir/hlo_instruction.cc
+++ b/third_party/xla/xla/hlo/ir/hlo_instruction.cc
@@ -51,6 +51,7 @@ limitations under the License.
 #include "xla/hlo/ir/hlo_clone_context.h"
 #include "xla/hlo/ir/hlo_computation.h"
 #include "xla/hlo/ir/hlo_domain_metadata.h"
+#include "xla/hlo/ir/hlo_frontend_attributes.h"
 #include "xla/hlo/ir/hlo_instructions.h"
 #include "xla/hlo/ir/hlo_module.h"
 #include "xla/hlo/ir/hlo_op_metadata.h"
@@ -4312,21 +4313,6 @@ StatusOr<HloInstruction::FusionKind> StringToFusionKind(
   return InvalidArgument("Unknown fusion kind: %s", kind_name);
 }
 
-std::string FrontendAttributesToString(
-    const FrontendAttributes& frontend_attributes) {
-  std::vector<std::pair<std::string, std::string>> sorted_attributes(
-      frontend_attributes.map().begin(), frontend_attributes.map().end());
-  absl::c_sort(sorted_attributes);
-  // Frontend attribute is a comma-separated list of attribute="value" pairs,
-  // e.g., frontend_attributes={name="value_a",type="int32_t"}.
-  const auto formatter = [](std::string* out,
-                            const std::pair<std::string, std::string>& item) {
-    absl::StrAppend(out, item.first, "=\"", item.second, "\"");
-  };
-  return absl::StrFormat("{%s}",
-                         absl::StrJoin(sorted_attributes, ",", formatter));
-}
-
 std::string StatisticsVizToString(const StatisticsViz& statistics_viz) {
   // Statistics is either empty, or always starts with the index of the
   // statistic that is rendered on the graph, followed by the statistics that
diff --git a/third_party/xla/xla/hlo/ir/hlo_instruction.h b/third_party/xla/xla/hlo/ir/hlo_instruction.h
index ecd47132020cd0..7a8938f72d0659 100644
--- a/third_party/xla/xla/hlo/ir/hlo_instruction.h
+++ b/third_party/xla/xla/hlo/ir/hlo_instruction.h
@@ -2528,8 +2528,6 @@ StatusOr<HloInstruction::FusionKind> StringToFusionKind(
 // Custom (de)stringification functions for protos that live inside
 // HloInstruction.
 std::string PaddingConfigToString(const PaddingConfig& padding);
-std::string FrontendAttributesToString(
-    const FrontendAttributes& frontend_attributes);
 std::string StatisticsVizToString(const StatisticsViz& statistics_viz);
 std::string RandomAlgorithmToString(const RandomAlgorithm& algorithm);
 std::string RandomDistributionToString(const RandomDistribution& distribution);
diff --git a/third_party/xla/xla/hlo/ir/hlo_module.cc b/third_party/xla/xla/hlo/ir/hlo_module.cc
index 6f4be16069026f..66743fe496de3d 100644
--- a/third_party/xla/xla/hlo/ir/hlo_module.cc
+++ b/third_party/xla/xla/hlo/ir/hlo_module.cc
@@ -34,6 +34,7 @@ limitations under the License.
 #include "absl/strings/escaping.h"
 #include "absl/strings/str_cat.h"
 #include "xla/hlo/ir/hlo_computation.h"
+#include "xla/hlo/ir/hlo_frontend_attributes.h"
 #include "xla/hlo/ir/hlo_input_output_alias_config.h"
 #include "xla/hlo/ir/hlo_instruction.h"
 #include "xla/hlo/ir/hlo_schedule.h"
@@ -368,6 +369,10 @@ void HloModule::Print(Printer* printer, const HloPrintOptions& options) const {
                });
     printer->Append("}");
   }
+  if (!frontend_attributes_.map().empty()) {
+    AppendCat(printer, ", frontend_attributes=",
+              FrontendAttributesToString(frontend_attributes_));
+  }
   printer->Append("\n\n");
   const auto& computations = options.canonicalize_computations()
                                  ? MakeComputationSorted()
@@ -440,6 +445,8 @@ HloModuleProto HloModule::ToProto() const {
     *proto.mutable_spmd_output_sharding() = spmd_output_sharding().ToProto();
   }
 
+  *proto.mutable_frontend_attributes() = frontend_attributes_;
+
   if (has_spmd_parameters_shardings()) {
     for (const auto& parameter_sharding : spmd_parameters_shardings()) {
       *proto.add_spmd_parameters_shardings() = parameter_sharding.ToProto();
@@ -605,6 +612,10 @@ StatusOr<std::unique_ptr<HloModule>> HloModule::CreateFromProto(
 
   module->set_is_dynamic(proto.is_dynamic());
 
+  if (proto.has_frontend_attributes()) {
+    module->set_frontend_attributes(proto.frontend_attributes());
+  }
+
   if (proto.has_spmd_output_sharding()) {
     TF_ASSIGN_OR_RETURN(HloSharding hlo_sharding,
                         HloSharding::FromProto(proto.spmd_output_sharding()));
@@ -1021,11 +1032,14 @@ std::unique_ptr<HloModule> HloModule::Clone(const HloModuleConfig& config,
       std::make_unique<CompilationEnvironments>(*comp_envs_));
 
   HloCloneContext context(module.get(), suffix);
-  auto cloned_computation = entry_computation_->Clone(suffix, &context);
-  module->AddEntryComputation(std::move(cloned_computation));
+  if (entry_computation_) {
+    auto cloned_computation = entry_computation_->Clone(suffix, &context);
+    module->AddEntryComputation(std::move(cloned_computation));
+  }
   module->input_output_alias_config() = input_output_alias_config();
   module->buffer_donor_config() = buffer_donor_config();
   module->set_is_dynamic(is_dynamic());
+  module->set_frontend_attributes(frontend_attributes());
   if (has_schedule() && schedule().Verify().ok()) {
     HloSchedule clone_schedule(module.get());
     for (HloComputation* computation : computations()) {
diff --git a/third_party/xla/xla/hlo/ir/hlo_module.h b/third_party/xla/xla/hlo/ir/hlo_module.h
index 7ebc303b1b83e8..c448cfa76a261d 100644
--- a/third_party/xla/xla/hlo/ir/hlo_module.h
+++ b/third_party/xla/xla/hlo/ir/hlo_module.h
@@ -176,6 +176,19 @@ class HloModule {
     return config_.entry_computation_layout();
   }
 
+  void set_frontend_attributes(FrontendAttributes frontend_attributes) {
+    frontend_attributes_ = std::move(frontend_attributes);
+  }
+
+  void add_frontend_attributes(FrontendAttributes frontend_attributes) {
+    frontend_attributes_.mutable_map()->insert(
+        frontend_attributes.map().begin(), frontend_attributes.map().end());
+  }
+
+  const FrontendAttributes& frontend_attributes() const {
+    return frontend_attributes_;
+  }
+
   void set_use_auto_spmd_partitioning(bool use) {
     use_auto_spmd_partitioning_ = use;
   }
@@ -660,6 +673,10 @@ class HloModule {
   // are expected from the module.
   HloBufferDonorConfig buffer_donor_config_;
 
+  // Attributes passed from the frontend to give hints to the backend about
+  // how to compile this HLO.
+  FrontendAttributes frontend_attributes_;
+
   // The HLO shardings of the entry computation's parameters for
   // SPMD-partitioned programs.
   std::optional<std::vector<HloSharding>> spmd_parameters_shardings_;
diff --git a/third_party/xla/xla/service/hlo.proto b/third_party/xla/xla/service/hlo.proto
index 8ba08fa53be7ff..22d46f62ff1a76 100644
--- a/third_party/xla/xla/service/hlo.proto
+++ b/third_party/xla/xla/service/hlo.proto
@@ -584,6 +584,9 @@ message HloModuleProto {
   // Stack frames index.
   StackFrameIndexProto stack_frame_index = 17;
 
+  // Frontend attributes to pass to the XLA backend.
+  xla.FrontendAttributes frontend_attributes = 19;
+
   reserved 9;
   reserved "dynamic_parameter_binding";
 }
diff --git a/third_party/xla/xla/service/hlo_module_test.cc b/third_party/xla/xla/service/hlo_module_test.cc
index 9293a5c366d954..34c4d20a0f4e79 100644
--- a/third_party/xla/xla/service/hlo_module_test.cc
+++ b/third_party/xla/xla/service/hlo_module_test.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include "xla/hlo/ir/hlo_module.h"
 
+#include <algorithm>
 #include <cstdint>
 #include <memory>
 #include <optional>
@@ -149,6 +150,21 @@ TEST_F(HloModuleTest, CloneTest) {
   }
 }
 
+TEST_F(HloModuleTest, CloneFrontendAttributes) {
+  auto module = CreateNewVerifiedModule();
+  FrontendAttributes frontend_attributes;
+  frontend_attributes.mutable_map()->emplace("attribute1", "attribute1_value");
+  module->set_frontend_attributes(frontend_attributes);
+  std::unique_ptr<HloModule> clone = module->Clone();
+  // Keyed comparison: robust to a shorter or differently ordered clone map.
+  const auto& cloned_map = clone->frontend_attributes().map();
+  ASSERT_EQ(cloned_map.size(), frontend_attributes.map().size());
+  for (const auto& [key, value] : frontend_attributes.map()) {
+    ASSERT_EQ(cloned_map.count(key), 1) << "missing attribute: " << key;
+    EXPECT_EQ(cloned_map.at(key), value);
+  }
+}
+
 TEST_F(HloModuleTest, CloneHasFusion) {
   auto module = CreateNewVerifiedModule();
 
diff --git a/third_party/xla/xla/service/hlo_parser.cc b/third_party/xla/xla/service/hlo_parser.cc
index faae4f7aa0000c..a916fd555af2a6 100644
--- a/third_party/xla/xla/service/hlo_parser.cc
+++ b/third_party/xla/xla/service/hlo_parser.cc
@@ -995,6 +995,7 @@ bool HloParserImpl::ParseHloModule(HloModule* module,
   std::optional<bool> alias_passthrough_params;
   absl::flat_hash_map<std::string, AttrConfig> attrs;
   std::optional<ComputationLayout> entry_computation_layout;
+  std::optional<FrontendAttributes> frontend_attributes;
   BoolList allow_spmd_sharding_propagation_to_output;
 
   attrs["is_scheduled"] = {/*required=*/false, AttrTy::kBool, &is_scheduled};
@@ -1007,6 +1008,8 @@ bool HloParserImpl::ParseHloModule(HloModule* module,
   attrs["entry_computation_layout"] = {/*required=*/false,
                                        AttrTy::kComputationLayout,
                                        &entry_computation_layout};
+  attrs["frontend_attributes"] = {
+      /*required=*/false, AttrTy::kFrontendAttributes, &frontend_attributes};
   attrs["allow_spmd_sharding_propagation_to_output"] = {
       /*required=*/false, AttrTy::kBracedBoolListOrBool,
       &allow_spmd_sharding_propagation_to_output};
@@ -1053,6 +1056,9 @@ bool HloParserImpl::ParseHloModule(HloModule* module,
     *config.mutable_entry_computation_layout() = *entry_computation_layout;
     default_config = false;
   }
+  if (frontend_attributes) {
+    module->set_frontend_attributes(frontend_attributes.value());
+  }
   if (!allow_spmd_sharding_propagation_to_output.empty()) {
     config.set_allow_spmd_sharding_propagation_to_output(
         allow_spmd_sharding_propagation_to_output);
diff --git a/third_party/xla/xla/service/hlo_parser_test.cc b/third_party/xla/xla/service/hlo_parser_test.cc
index c26cb386d8ae25..48bdeaecbd4e85 100644
--- a/third_party/xla/xla/service/hlo_parser_test.cc
+++ b/third_party/xla/xla/service/hlo_parser_test.cc
@@ -27,6 +27,7 @@ limitations under the License.
 #include "absl/strings/str_cat.h"
 #include "absl/strings/string_view.h"
 #include "xla/hlo/ir/hlo_casting_utils.h"
+#include "xla/hlo/ir/hlo_frontend_attributes.h"
 #include "xla/hlo/ir/hlo_instruction.h"
 #include "xla/hlo/ir/hlo_instructions.h"
 #include "xla/hlo/ir/hlo_sharding.h"
@@ -4473,6 +4474,25 @@ ENTRY TestComputation {
   EXPECT_TRUE(result.value()->config().alias_passthrough_params());
 }
 
+TEST_F(HloParserTest, CheckFrontendAttributes) {
+  const char* const hlo_string = R"(
+HloModule TestModule, frontend_attributes={attr_name="attr_value"}
+
+ENTRY TestComputation {
+    p0 = f16[2048,1024] parameter(0)
+    p1 = f16[2048,1024] parameter(1)
+    ROOT root = (f16[2048,1024], f16[2048,1024]) tuple(p0, p1)
+}
+)";
+  auto result = ParseAndReturnVerifiedModule(hlo_string);
+  TF_ASSERT_OK(result.status());
+  const auto& attributes = result.value()->frontend_attributes().map();
+  // Stop here on a size mismatch: begin() of an empty map must not be read.
+  ASSERT_EQ(attributes.size(), 1);
+  EXPECT_EQ(attributes.begin()->first, "attr_name");
+  EXPECT_EQ(attributes.begin()->second, "attr_value");
+}
+
 TEST_F(HloParserTest, CheckAllowSpmdShardingPropagationToOutput) {
   const char* const hlo_string = R"(
 HloModule TestModule, allow_spmd_sharding_propagation_to_output=true
diff --git a/third_party/xla/xla/translate/hlo_to_mhlo/hlo_module_importer.cc b/third_party/xla/xla/translate/hlo_to_mhlo/hlo_module_importer.cc
index 9f05992c8ae70c..1494efd9ebc538 100644
--- a/third_party/xla/xla/translate/hlo_to_mhlo/hlo_module_importer.cc
+++ b/third_party/xla/xla/translate/hlo_to_mhlo/hlo_module_importer.cc
@@ -44,6 +44,9 @@ HloModuleImporter::HloModuleImporter(mlir::ModuleOp module,
 }
 
 namespace {
+
+constexpr char kFrontendAttributesAttr[] = "mhlo.frontend_attributes";
+
 mlir::ArrayAttr ConvertCrossProgramPrefetches(
     const absl::Span<const xla::HloModule::CrossProgramPrefetchInfo> prefetches,
     mlir::Builder* builder) {
@@ -71,6 +74,7 @@ Status HloModuleImporter::Import(const xla::HloModule& hlo_module) {
   module->setAttr(
       "mhlo.is_dynamic",
       mlir::BoolAttr::get(builder_.getContext(), hlo_module.is_dynamic()));
+  ImportFrontendAttributes(hlo_module, module);
   module->setAttr("mhlo.use_auto_spmd_partitioning",
                   mlir::BoolAttr::get(builder_.getContext(),
                                       hlo_module.use_auto_spmd_partitioning()));
@@ -120,4 +124,19 @@ Status HloModuleImporter::Import(const xla::HloModuleProto& module_proto) {
   return Import(*module);
 }
 
+void HloModuleImporter::ImportFrontendAttributes(
+    const xla::HloModule& hlo_module, mlir::ModuleOp module) {
+  // A module without frontend attributes gets no dictionary attribute at all.
+  const auto& attribute_map = hlo_module.frontend_attributes().map();
+  if (attribute_map.empty()) return;
+  // Mirror each key/value pair as a string-valued named attribute.
+  llvm::SmallVector<mlir::NamedAttribute, 4> named_attributes;
+  for (const auto& [name, value] : attribute_map) {
+    named_attributes.push_back(
+        builder_.getNamedAttr(name, builder_.getStringAttr(value)));
+  }
+  module->setAttr(kFrontendAttributesAttr,
+                  builder_.getDictionaryAttr(named_attributes));
+}
+
 }  // namespace xla
diff --git a/third_party/xla/xla/translate/hlo_to_mhlo/hlo_module_importer.h b/third_party/xla/xla/translate/hlo_to_mhlo/hlo_module_importer.h
index b96c1a8a5a7372..ac42e28f3f2123 100644
--- a/third_party/xla/xla/translate/hlo_to_mhlo/hlo_module_importer.h
+++ b/third_party/xla/xla/translate/hlo_to_mhlo/hlo_module_importer.h
@@ -48,6 +48,9 @@ class HloModuleImporter {
   Status Import(const xla::HloModuleProto& module);
 
  private:
+  void ImportFrontendAttributes(const xla::HloModule& hlo_module,
+                                mlir::ModuleOp module);
+
   bool import_all_computation_;
   mlir::SymbolTable symbol_table_;
   mlir::Builder builder_;
diff --git a/third_party/xla/xla/translate/hlo_to_mhlo/tests/module_attributes.hlo b/third_party/xla/xla/translate/hlo_to_mhlo/tests/module_attributes.hlo
index 300625adf6856c..1acf05ac3a3f04 100644
--- a/third_party/xla/xla/translate/hlo_to_mhlo/tests/module_attributes.hlo
+++ b/third_party/xla/xla/translate/hlo_to_mhlo/tests/module_attributes.hlo
@@ -207,6 +207,13 @@ hlo_module       {
     parameter: 1
     index: 0
   }
+# CHECK-SAME: mhlo.frontend_attributes = {attr_name = "attr_value"}
+  frontend_attributes {
+    map {
+      key: "attr_name"
+      value: "attr_value"
+    }
+  }
 # CHECK-SAME: mhlo.is_dynamic = true
   is_dynamic: true
 # CHECK-SAME: mhlo.use_auto_spmd_partitioning = true
diff --git a/third_party/xla/xla/translate/mhlo_to_hlo/mlir_hlo_to_hlo.cc b/third_party/xla/xla/translate/mhlo_to_hlo/mlir_hlo_to_hlo.cc
index 1e535ea62f3b3b..ed3525fb6c79e1 100644
--- a/third_party/xla/xla/translate/mhlo_to_hlo/mlir_hlo_to_hlo.cc
+++ b/third_party/xla/xla/translate/mhlo_to_hlo/mlir_hlo_to_hlo.cc
@@ -3526,6 +3526,11 @@ xla::Status ConvertMlirHloToHlo(mlir::ModuleOp module, xla::HloProto* hlo_proto,
           module->getAttrOfType<mlir::BoolAttr>("mhlo.is_dynamic")) {
     hlo_module.set_is_dynamic(is_dynamic.getValue());
   }
+  if (auto frontend_attributes =
+          module->getAttrOfType<DictionaryAttr>(kFrontendAttributesAttr)) {
+    ConstructFrontendAttributesFromAttribute(
+        frontend_attributes, *hlo_module.mutable_frontend_attributes());
+  }
   if (auto use_auto_spmd_partitioning = module->getAttrOfType<mlir::BoolAttr>(
           "mhlo.use_auto_spmd_partitioning")) {
     hlo_module.set_use_auto_spmd_partitioning(
diff --git a/third_party/xla/xla/translate/mhlo_to_hlo/tests/module_attributes.mlir b/third_party/xla/xla/translate/mhlo_to_hlo/tests/module_attributes.mlir
index 814e794453cdb1..049456bb09e6f7 100644
--- a/third_party/xla/xla/translate/mhlo_to_hlo/tests/module_attributes.mlir
+++ b/third_party/xla/xla/translate/mhlo_to_hlo/tests/module_attributes.mlir
@@ -86,3 +86,17 @@ module attributes { mhlo.spmd_output_sharding = "\08\03\1A\02\01\02\22\02\00\01"
 // CHECK:   tile_assignment_devices: 0
 // CHECK:   tile_assignment_devices: 1
 // CHECK: }
+
+// -----
+
+//         CHECK-LABEL: ModuleWithFrontendAttributes
+module @ModuleWithFrontendAttributes attributes {
+//      CHECK{LITERAL}: frontend_attributes {
+//      CHECK{LITERAL}: key: "attr_name"
+//      CHECK{LITERAL}: value: "attr_value"
+  mhlo.frontend_attributes = { attr_name="attr_value" }
+} {
+  func.func @main(%arg0: tensor<1xf32>) -> tensor<1xf32> {
+    func.return %arg0 : tensor<1xf32>
+  }
+}

From 13dca754ed367979511b29b71b6ee6e70641267a Mon Sep 17 00:00:00 2001
From: Sam McCall <sammccall@google.com>
Date: Fri, 29 Sep 2023 05:33:12 -0700
Subject: [PATCH 416/567] Integrate LLVM at llvm/llvm-project@512739ebbb25

Updates LLVM usage to match
[512739ebbb25](https://github.com/llvm/llvm-project/commit/512739ebbb25)

PiperOrigin-RevId: 569465927
---
 third_party/llvm/workspace.bzl                |  4 +-
 .../cpu/transforms/sparse_rewrite_passes.cc   | 42 ++++++++++---------
 .../sparse_ops_to_custom_calls.cc             |  4 +-
 3 files changed, 27 insertions(+), 23 deletions(-)

diff --git a/third_party/llvm/workspace.bzl b/third_party/llvm/workspace.bzl
index 1b4bdad80ced49..1a35cc3c8dd812 100644
--- a/third_party/llvm/workspace.bzl
+++ b/third_party/llvm/workspace.bzl
@@ -4,8 +4,8 @@ load("//third_party:repo.bzl", "tf_http_archive")
 
 def repo(name):
     """Imports LLVM."""
-    LLVM_COMMIT = "23ef8bf9c0f338ee073c6c1b553c42e46d2f22ad"
-    LLVM_SHA256 = "416d1cc3b9e7d9272c5677a94dbc6298ad6d20200171f62134af1eee5fa4c24f"
+    LLVM_COMMIT = "512739ebbb25a75a6298b783f97aea2fb9f6a8e8"
+    LLVM_SHA256 = "ebb856efcb8e4e2b6a45b313766df92ffcf430ab37cbc4ee5571afe8ee9e992d"
 
     tf_http_archive(
         name = name,
diff --git a/third_party/xla/xla/mlir/backends/cpu/transforms/sparse_rewrite_passes.cc b/third_party/xla/xla/mlir/backends/cpu/transforms/sparse_rewrite_passes.cc
index 3402f867a3915f..871292d2a6eede 100644
--- a/third_party/xla/xla/mlir/backends/cpu/transforms/sparse_rewrite_passes.cc
+++ b/third_party/xla/xla/mlir/backends/cpu/transforms/sparse_rewrite_passes.cc
@@ -79,11 +79,11 @@ Value getEmptyTensor(OpBuilder& b, Location loc, RankedTensorType type) {
       .getResult(0);
 }
 
-struct SparseBatchedPackCallRewriter {
+struct SparseBatchedAssembleCallRewriter {
   LogicalResult operator()(mhlo::CustomCallOp op, PatternRewriter& rewriter) {
     assert(op.getResults().size() == 1 && "Must be packing into one tensor");
     Value ret_sp_tensor = op.getResults()[0];
-    rewriter.replaceOpWithNewOp<sparse_tensor::PackOp>(
+    rewriter.replaceOpWithNewOp<sparse_tensor::AssembleOp>(
         op, ret_sp_tensor.getType(), op.getInputs()[0],  // sparse tensor values
         op.getInputs().drop_front());                    // sparse tensor levels
     return success();
@@ -396,31 +396,33 @@ struct SparseUnaryChloCallRewriter {
   }
 };
 
-struct SparseUnpackCallRewriter {
+struct SparseDisassembleCallRewriter {
   LogicalResult operator()(mhlo::CustomCallOp op, PatternRewriter& rewriter) {
     // TODO(peiming): Canonicalizes these two cases. The old bridge that uses
     // jax.BCOO/BCSR does not require buffer lengths.
-    unsigned unpack_bufs_num = op.getInputs().size() - 1;
-    assert(op.getResults().size() == unpack_bufs_num ||
-           op.getResults().size() == unpack_bufs_num * 2);
-    SmallVector<Type> unpack_ret_tp(
-        op.getResults().take_front(unpack_bufs_num).getTypes());
+    unsigned disassemble_bufs_num = op.getInputs().size() - 1;
+    assert(op.getResults().size() == disassemble_bufs_num ||
+           op.getResults().size() == disassemble_bufs_num * 2);
+    SmallVector<Type> disassemble_ret_tp(
+        op.getResults().take_front(disassemble_bufs_num).getTypes());
     // Extra lengths for each buffer returned.
-    unpack_ret_tp.append(unpack_bufs_num, rewriter.getIndexType());
+    disassemble_ret_tp.append(disassemble_bufs_num, rewriter.getIndexType());
     Value tensor = op.getInputs()[0];
     Value out_vals = op.getInputs()[1];
     ValueRange out_lvls = op.getInputs().drop_front(2);
-    // Constructs the UnpackOp.
-    auto unpack_op = rewriter.create<sparse_tensor::UnpackOp>(
-        op.getLoc(), unpack_ret_tp, tensor, out_vals, out_lvls);
-    assert(unpack_op.getResults().size() == unpack_bufs_num * 2);
-    ValueRange bufs = unpack_op.getResults().take_front(unpack_bufs_num);
-    ValueRange lens = unpack_op.getResults().take_back(unpack_bufs_num);
+    // Constructs the DisassembleOp.
+    auto disassemble_op = rewriter.create<sparse_tensor::DisassembleOp>(
+        op.getLoc(), disassemble_ret_tp, tensor, out_vals, out_lvls);
+    assert(disassemble_op.getResults().size() == disassemble_bufs_num * 2);
+    ValueRange bufs =
+        disassemble_op.getResults().take_front(disassemble_bufs_num);
+    ValueRange lens =
+        disassemble_op.getResults().take_back(disassemble_bufs_num);
 
     // Wraps the scalar value into a "scalar tensor", i.e., tensor<i64>
     SmallVector<Value> rets(bufs.begin(), bufs.end());
-    if (op.getResults().size() == unpack_bufs_num * 2) {
-      ValueRange ret_lens = op.getResults().take_back(unpack_bufs_num);
+    if (op.getResults().size() == disassemble_bufs_num * 2) {
+      ValueRange ret_lens = op.getResults().take_back(disassemble_bufs_num);
       for (auto [len, tensor_len] : llvm::zip(lens, ret_lens)) {
         auto ret_t_len = rewriter.create<tensor::EmptyOp>(
             op.getLoc(), tensor_len.getType(), ValueRange{});
@@ -629,8 +631,10 @@ class SparseCustomCallRewriter : public OpRewritePattern<mhlo::CustomCallOp> {
       std::make_pair("sparse_tensor_sinh",
                      SparseUnaryChloCallRewriter<chlo::SinhOp>()),
       std::make_pair("sparse_tensor_slice", SparseSliceCallRewriter()),
-      std::make_pair("sparse_tensor_pack", SparseBatchedPackCallRewriter()),
-      std::make_pair("sparse_tensor_unpack", SparseUnpackCallRewriter()),
+      std::make_pair("sparse_tensor_assemble",
+                     SparseBatchedAssembleCallRewriter()),
+      std::make_pair("sparse_tensor_disassemble",
+                     SparseDisassembleCallRewriter()),
       std::make_pair("sparse_tensor_sub",
                      SparseBinaryCallRewriter<mhlo::SubtractOp>()),
       std::make_pair("sparse_tensor_tan",
diff --git a/third_party/xla/xla/mlir_hlo/mhlo/transforms/legalize_sparse_ops/sparse_ops_to_custom_calls.cc b/third_party/xla/xla/mlir_hlo/mhlo/transforms/legalize_sparse_ops/sparse_ops_to_custom_calls.cc
index fa17f5145fa87f..6641c0451e8528 100644
--- a/third_party/xla/xla/mlir_hlo/mhlo/transforms/legalize_sparse_ops/sparse_ops_to_custom_calls.cc
+++ b/third_party/xla/xla/mlir_hlo/mhlo/transforms/legalize_sparse_ops/sparse_ops_to_custom_calls.cc
@@ -56,8 +56,8 @@ class SparseOpToCustomCallConverter : public OpConversionPattern<OpTy> {
 void populateLegalizeSparseOpsToCustomCallPatterns(
     MLIRContext* context, TypeConverter& typeConverter,
     RewritePatternSet* patterns) {
-  patterns->add<SparseOpToCustomCallConverter<sparse_tensor::PackOp>,
-                SparseOpToCustomCallConverter<sparse_tensor::UnpackOp>,
+  patterns->add<SparseOpToCustomCallConverter<sparse_tensor::AssembleOp>,
+                SparseOpToCustomCallConverter<sparse_tensor::DisassembleOp>,
                 SparseOpToCustomCallConverter<sparse_tensor::ConvertOp>>(
       typeConverter, context);
 }

From 0af9d56fe45e7b5dfe29a02190590ee0f14d292d Mon Sep 17 00:00:00 2001
From: Ilia Sergachev <sergachev@google.com>
Date: Fri, 29 Sep 2023 06:06:02 -0700
Subject: [PATCH 417/567] [XLA:GPU] Replace the use of global TSL flag with
 precision config in Triton GEMM.

PiperOrigin-RevId: 569471716
---
 third_party/xla/xla/service/gpu/BUILD         |   7 +-
 .../xla/xla/service/gpu/ir_emitter_triton.cc  |  13 +-
 .../ir_emitter_triton_parametrized_test.cc    | 125 ++++++------------
 .../xla/service/gpu/ir_emitter_triton_test.cc |  32 +----
 4 files changed, 61 insertions(+), 116 deletions(-)

diff --git a/third_party/xla/xla/service/gpu/BUILD b/third_party/xla/xla/service/gpu/BUILD
index 49ada8a5603e3a..f95c89f7a0e363 100644
--- a/third_party/xla/xla/service/gpu/BUILD
+++ b/third_party/xla/xla/service/gpu/BUILD
@@ -485,6 +485,7 @@ cc_library(
         "//xla/service/llvm_ir:llvm_util",
         "//xla/stream_executor:device_description",
         "//xla/translate/hlo_to_mhlo:hlo_module_importer",
+        "@com_google_absl//absl/algorithm:container",
         "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/container:flat_hash_set",
         "@com_google_absl//absl/log",
@@ -549,7 +550,6 @@ xla_test(
         ":ir_emitter_triton",
         "//xla:autotuning_proto_cc",
         "//xla:error_spec",
-        "//xla:xla_data_proto_cc",
         "//xla:xla_proto_cc",
         "//xla/hlo/ir:hlo",
         "//xla/service:pattern_matcher",
@@ -576,7 +576,6 @@ xla_test(
         "@local_tsl//tsl/platform:status",
         "@local_tsl//tsl/platform:status_matchers",
         "@local_tsl//tsl/platform:statusor",
-        "@local_tsl//tsl/platform:tensor_float_32_hdr_lib",
         "@local_tsl//tsl/platform:test",
     ],
 )
@@ -624,8 +623,6 @@ xla_test(
         "//xla:xla_data_proto_cc",
         "//xla:xla_proto_cc",
         "//xla/hlo/ir:hlo",
-        "//xla/service:pattern_matcher",
-        "//xla/service:pattern_matcher_gmock",
         "//xla/service/gpu/tests:gpu_codegen_test",
         "//xla/stream_executor:device_description",
         "//xla/stream_executor/cuda:cublas_plugin",
@@ -633,8 +630,6 @@ xla_test(
         "@com_google_absl//absl/base:core_headers",
         "@com_google_absl//absl/strings",
         "@com_google_googletest//:gtest",
-        "@local_tsl//tsl/platform:statusor",
-        "@local_tsl//tsl/platform:tensor_float_32_hdr_lib",
     ],
 )
 
diff --git a/third_party/xla/xla/service/gpu/ir_emitter_triton.cc b/third_party/xla/xla/service/gpu/ir_emitter_triton.cc
index eeba9f284dd45e..e3d2a62d3e2144 100644
--- a/third_party/xla/xla/service/gpu/ir_emitter_triton.cc
+++ b/third_party/xla/xla/service/gpu/ir_emitter_triton.cc
@@ -26,6 +26,7 @@ limitations under the License.
 #include <utility>
 #include <vector>
 
+#include "absl/algorithm/container.h"
 #include "absl/container/flat_hash_map.h"
 #include "absl/container/flat_hash_set.h"
 #include "absl/log/check.h"
@@ -109,7 +110,6 @@ limitations under the License.
 #include "tsl/platform/path.h"
 #include "tsl/platform/status.h"
 #include "tsl/platform/statusor.h"
-#include "tsl/platform/tensor_float_32_utils.h"
 #include "triton/Conversion/TritonGPUToLLVM/TritonGPUToLLVMPass.h"
 #include "triton/Conversion/TritonToTritonGPU/TritonToTritonGPUPass.h"
 #include "triton/Dialect/Triton/IR/Dialect.h"
@@ -1397,10 +1397,15 @@ Status EmitMatMul(mlir::OpBuilder builder, absl::string_view libdevice_path,
       dot_input_rhs = apply_mask(1, dot_input_rhs);
     }
 
+    const bool allow_tf32 =
+        absl::c_none_of(dot_instr->precision_config().operand_precision(),
+                        [](const int precision) {
+                          return precision != PrecisionConfig::DEFAULT;
+                        });
+
     // Execute matrix multiplication of input tiles and pass the accumulator.
-    Value accumulator_next = b.create<mt::DotOp>(
-        dot_input_lhs, dot_input_rhs, iter_args.back(),
-        /*allowTF32=*/tsl::tensor_float_32_execution_enabled());
+    Value accumulator_next = b.create<mt::DotOp>(dot_input_lhs, dot_input_rhs,
+                                                 iter_args.back(), allow_tf32);
     iter_args_next.push_back(accumulator_next);
 
     b.create<mlir::scf::YieldOp>(iter_args_next);
diff --git a/third_party/xla/xla/service/gpu/ir_emitter_triton_parametrized_test.cc b/third_party/xla/xla/service/gpu/ir_emitter_triton_parametrized_test.cc
index d33f5052ec483f..24c686949281ac 100644
--- a/third_party/xla/xla/service/gpu/ir_emitter_triton_parametrized_test.cc
+++ b/third_party/xla/xla/service/gpu/ir_emitter_triton_parametrized_test.cc
@@ -15,12 +15,10 @@ limitations under the License.
 
 #include <algorithm>
 #include <array>
-#include <memory>
 #include <string>
 #include <tuple>
 #include <vector>
 
-#include <gmock/gmock.h>
 #include <gtest/gtest.h>
 #include "absl/base/optimization.h"
 #include "absl/strings/str_cat.h"
@@ -28,26 +26,18 @@ limitations under the License.
 #include "absl/strings/substitute.h"
 #include "xla/comparison_util.h"
 #include "xla/error_spec.h"
-#include "xla/hlo/ir/hlo_instruction.h"
-#include "xla/hlo/ir/hlo_module.h"
 #include "xla/hlo/ir/hlo_opcode.h"
 #include "xla/primitive_util.h"
 #include "xla/service/gpu/gemm_rewriter_triton.h"
 #include "xla/service/gpu/tests/gpu_codegen_test.h"
-#include "xla/service/pattern_matcher.h"
-#include "xla/service/pattern_matcher_gmock.h"
 #include "xla/stream_executor/device_description.h"
 #include "xla/xla.pb.h"
 #include "xla/xla_data.pb.h"
-#include "tsl/platform/statusor.h"
-#include "tsl/platform/tensor_float_32_utils.h"
 
 namespace xla {
 namespace gpu {
 namespace {
 
-namespace m = ::xla::match;
-
 struct MixTypeParams {
   PrimitiveType lhs_ty;
   PrimitiveType rhs_ty;
@@ -143,18 +133,8 @@ INSTANTIATE_TEST_SUITE_P(RewriteTestSuite, MixedTypeTest,
                          }),
                          GemmTestParamsParamsToString);
 
-class NoTF32Test : public GpuCodegenTest {
+class TritonTest : public GpuCodegenTest {
  public:
-  void SetUp() override {
-    tf32_state_ = tsl::tensor_float_32_execution_enabled();
-    // Elementwise operations do not use tf32 anyway but disabling it on the
-    // dot allows verification in higher precision for f32.
-    tsl::enable_tensor_float_32_execution(false);
-  }
-  void TearDown() override {
-    tsl::enable_tensor_float_32_execution(tf32_state_);
-  }
-
   DebugOptions GetDebugOptionsForTest() override {
     DebugOptions debug_options = GpuCodegenTest::GetDebugOptionsForTest();
     debug_options.set_xla_gpu_triton_gemm_any(true);
@@ -168,12 +148,9 @@ class NoTF32Test : public GpuCodegenTest {
         ->GetDeviceDescription()
         .cuda_compute_capability();
   }
-
- private:
-  bool tf32_state_;
 };
 
-class ElementwiseTest : public NoTF32Test,
+class ElementwiseTest : public TritonTest,
                         public ::testing::WithParamInterface<
                             std::tuple<PrimitiveType, HloOpcode, float>> {};
 
@@ -197,16 +174,15 @@ TEST_P(UnaryElementwiseTest, ElementwiseFusionExecutesCorrectly) {
   float tolerance;
   std::tie(data_type, opcode, tolerance) = GetParam();
 
-  const std::string hHloTestTemplate = R"(
-HloModule m, is_scheduled=true
-
+  const std::string kHloTestTemplate = R"(
 triton_gemm___computation {
   parameter_0 = f32[15,33]{1,0} parameter(0)
   parameter_1 = $0[33,68]{1,0} parameter(1)
   f1.1 = $0[33,68]{1,0} $1(parameter_1)
   c.1 = f32[33,68]{1,0} convert(f1.1)
   ROOT _.1 = f32[15,68]{1,0} dot(parameter_0, c.1),
-    lhs_contracting_dims={1}, rhs_contracting_dims={0}
+    lhs_contracting_dims={1}, rhs_contracting_dims={0},
+    operand_precision={HIGH, HIGH}
 }
 
 ENTRY e {
@@ -220,12 +196,10 @@ ENTRY e {
                                           "num_stages":"1","num_warps":"4"}}
 })";
   const std::string hlo_test = absl::Substitute(
-      hHloTestTemplate, primitive_util::LowercasePrimitiveTypeName(data_type),
+      kHloTestTemplate, primitive_util::LowercasePrimitiveTypeName(data_type),
       HloOpcodeString(opcode));
 
-  const std::string hHloRefTemplate = R"(
-HloModule m, is_scheduled=true
-
+  const std::string kHloRefTemplate = R"(
 fused_computation {
   param_0.1 = $0[33,68]{1,0} parameter(0)
   f.1 = $0[33,68]{1,0} $1(param_0.1)
@@ -242,10 +216,10 @@ ENTRY e {
       {"lhs_contracting_dimensions":["1"],"rhs_contracting_dimensions":["0"],
       "lhs_batch_dimensions":[],"rhs_batch_dimensions":[]},
       "alpha_imag":0,"precision_config":
-      {"operand_precision":["DEFAULT","DEFAULT"]},"epilogue":"DEFAULT"}
+      {"operand_precision":["HIGHEST","HIGHEST"]},"epilogue":"DEFAULT"}
 })";
   const std::string hlo_ref = absl::Substitute(
-      hHloRefTemplate, primitive_util::LowercasePrimitiveTypeName(data_type),
+      kHloRefTemplate, primitive_util::LowercasePrimitiveTypeName(data_type),
       HloOpcodeString(opcode));
 
   EXPECT_TRUE(RunAndCompareTwoModules(
@@ -308,9 +282,7 @@ TEST_P(BinaryElementwiseTest, ElementwiseFusionExecutesCorrectly) {
   float tolerance;
   std::tie(data_type, opcode, tolerance) = GetParam();
 
-  const std::string hHloTestTemplate = R"(
-HloModule m, is_scheduled=true
-
+  const std::string kHloTestTemplate = R"(
 triton_gemm___computation {
   parameter_0 = f32[92,11]{1,0} parameter(0)
   parameter_1 = $0[11,63]{1,0} parameter(1)
@@ -318,7 +290,8 @@ triton_gemm___computation {
   f1.1 = $0[11,63]{1,0} $1(parameter_1, parameter_2)
   c.1 = f32[11,63]{1,0} convert(f1.1)
   ROOT _.1 = f32[92,63]{1,0} dot(parameter_0, c.1),
-    lhs_contracting_dims={1}, rhs_contracting_dims={0}
+    lhs_contracting_dims={1}, rhs_contracting_dims={0},
+    operand_precision={HIGH, HIGH}
 }
 
 ENTRY e {
@@ -333,12 +306,10 @@ ENTRY e {
                                           "num_stages":"2","num_warps":"2"}}
 })";
   const std::string hlo_test = absl::Substitute(
-      hHloTestTemplate, primitive_util::LowercasePrimitiveTypeName(data_type),
+      kHloTestTemplate, primitive_util::LowercasePrimitiveTypeName(data_type),
       HloOpcodeString(opcode));
 
-  const std::string hHloRefTemplate = R"(
-HloModule m, is_scheduled=true
-
+  const std::string kHloRefTemplate = R"(
 fused_computation {
   p0 = $0[11,63]{1,0} parameter(0)
   p1 = $0[11,63]{1,0} parameter(1)
@@ -357,10 +328,10 @@ ENTRY e {
       {"lhs_contracting_dimensions":["1"],"rhs_contracting_dimensions":["0"],
       "lhs_batch_dimensions":[],"rhs_batch_dimensions":[]},
       "alpha_imag":0,"precision_config":
-      {"operand_precision":["DEFAULT","DEFAULT"]},"epilogue":"DEFAULT"}
+      {"operand_precision":["HIGHEST","HIGHEST"]},"epilogue":"DEFAULT"}
 })";
   const std::string hlo_ref = absl::Substitute(
-      hHloRefTemplate, primitive_util::LowercasePrimitiveTypeName(data_type),
+      kHloRefTemplate, primitive_util::LowercasePrimitiveTypeName(data_type),
       HloOpcodeString(opcode));
 
   EXPECT_TRUE(RunAndCompareTwoModules(
@@ -418,7 +389,7 @@ INSTANTIATE_TEST_SUITE_P(
                        ::testing::Values(1e-6)),
     ElementwiseTestParamsToString);
 
-class CompareTest : public NoTF32Test,
+class CompareTest : public TritonTest,
                     public ::testing::WithParamInterface<
                         std::tuple<PrimitiveType, Comparison::Direction>> {};
 
@@ -437,9 +408,7 @@ TEST_P(CompareTest, CompareFusionExecutesCorrectly) {
   Comparison::Direction direction;
   std::tie(data_type, direction) = GetParam();
 
-  const std::string hHloTestTemplate = R"(
-HloModule m, is_scheduled=true
-
+  const std::string kHloTestTemplate = R"(
 triton_gemm___computation {
   parameter_0 = f32[92,11]{1,0} parameter(0)
   parameter_1 = $0[11,63]{1,0} parameter(1)
@@ -447,7 +416,8 @@ triton_gemm___computation {
   f1.1 = pred[11,63]{1,0} compare(parameter_1, parameter_2), direction=$1
   c.1 = f32[11,63]{1,0} convert(f1.1)
   ROOT _.1 = f32[92,63]{1,0} dot(parameter_0, c.1),
-    lhs_contracting_dims={1}, rhs_contracting_dims={0}
+    lhs_contracting_dims={1}, rhs_contracting_dims={0},
+    operand_precision={HIGH, HIGH}
 }
 
 ENTRY e {
@@ -462,12 +432,10 @@ ENTRY e {
                                           "num_stages":"3","num_warps":"2"}}
 })";
   const std::string hlo_test = absl::Substitute(
-      hHloTestTemplate, primitive_util::LowercasePrimitiveTypeName(data_type),
+      kHloTestTemplate, primitive_util::LowercasePrimitiveTypeName(data_type),
       ComparisonDirectionToString(direction));
 
-  const std::string hHloRefTemplate = R"(
-HloModule m, is_scheduled=true
-
+  const std::string kHloRefTemplate = R"(
 fused_computation {
   p0 = $0[11,63]{1,0} parameter(0)
   p1 = $0[11,63]{1,0} parameter(1)
@@ -486,10 +454,10 @@ ENTRY e {
       {"lhs_contracting_dimensions":["1"],"rhs_contracting_dimensions":["0"],
       "lhs_batch_dimensions":[],"rhs_batch_dimensions":[]},
       "alpha_imag":0,"precision_config":
-      {"operand_precision":["DEFAULT","DEFAULT"]},"epilogue":"DEFAULT"}
+      {"operand_precision":["HIGHEST","HIGHEST"]},"epilogue":"DEFAULT"}
 })";
   const std::string hlo_ref = absl::Substitute(
-      hHloRefTemplate, primitive_util::LowercasePrimitiveTypeName(data_type),
+      kHloRefTemplate, primitive_util::LowercasePrimitiveTypeName(data_type),
       ComparisonDirectionToString(direction));
 
   float tolerance;
@@ -527,7 +495,7 @@ INSTANTIATE_TEST_SUITE_P(
                                          cd::kLe, cd::kLt)),
     CompareTestParamsToString);
 
-class SelectTest : public NoTF32Test,
+class SelectTest : public TritonTest,
                    public ::testing::WithParamInterface<
                        std::tuple<PrimitiveType, PrimitiveType>> {};
 
@@ -542,9 +510,7 @@ TEST_P(SelectTest, SelectFusionExecutesCorrectly) {
     }
   }
 
-  const std::string hHloTestTemplate = R"(
-HloModule m, is_scheduled=true
-
+  const std::string kHloTestTemplate = R"(
 triton_gemm___computation {
   parameter_0 = $1[92,11]{1,0} parameter(0)
   parameter_1 = $0[11,63]{1,0} parameter(1)
@@ -553,7 +519,8 @@ triton_gemm___computation {
   f1.1 = $0[11,63]{1,0} select(parameter_3, parameter_1, parameter_2)
   c.1 = $1[11,63]{1,0} convert(f1.1)
   ROOT _.1 = $1[92,63]{1,0} dot(parameter_0, c.1),
-    lhs_contracting_dims={1}, rhs_contracting_dims={0}
+    lhs_contracting_dims={1}, rhs_contracting_dims={0},
+    operand_precision={HIGH, HIGH}
 }
 
 ENTRY e {
@@ -569,12 +536,10 @@ ENTRY e {
                                           "num_stages":"3","num_warps":"2"}}
 })";
   const std::string hlo_test = absl::Substitute(
-      hHloTestTemplate, primitive_util::LowercasePrimitiveTypeName(data_type1),
+      kHloTestTemplate, primitive_util::LowercasePrimitiveTypeName(data_type1),
       primitive_util::LowercasePrimitiveTypeName(data_type2));
 
-  const std::string hHloRefTemplate = R"(
-HloModule m, is_scheduled=true
-
+  const std::string kHloRefTemplate = R"(
 fused_computation {
   p0 = $0[11,63]{1,0} parameter(0)
   p1 = $0[11,63]{1,0} parameter(1)
@@ -596,10 +561,10 @@ ENTRY e {
       {"lhs_contracting_dimensions":["1"],"rhs_contracting_dimensions":["0"],
       "lhs_batch_dimensions":[],"rhs_batch_dimensions":[]},
       "alpha_imag":0,"precision_config":
-      {"operand_precision":["DEFAULT","DEFAULT"]},"epilogue":"DEFAULT"}
+      {"operand_precision":["HIGHEST","HIGHEST"]},"epilogue":"DEFAULT"}
 })";
   const std::string hlo_ref = absl::Substitute(
-      hHloRefTemplate, primitive_util::LowercasePrimitiveTypeName(data_type1),
+      kHloRefTemplate, primitive_util::LowercasePrimitiveTypeName(data_type1),
       primitive_util::LowercasePrimitiveTypeName(data_type2));
 
   float tolerance;
@@ -652,7 +617,7 @@ INSTANTIATE_TEST_SUITE_P(
                        ::testing::Values(F16, BF16, F32)),
     TwoPrimitiveTypesToString);
 
-class ConstantTest : public NoTF32Test,
+class ConstantTest : public TritonTest,
                      public ::testing::WithParamInterface<PrimitiveType> {};
 
 TEST_P(ConstantTest, ConstantFusionExecutesCorrectly) {
@@ -663,9 +628,7 @@ TEST_P(ConstantTest, ConstantFusionExecutesCorrectly) {
         primitive_util::LowercasePrimitiveTypeName(data_type));
   }
 
-  const std::string hHloTestTemplate = R"(
-HloModule m, is_scheduled=true
-
+  const std::string kHloTestTemplate = R"(
 triton_gemm___computation {
   parameter_0 = f32[92,11]{1,0} parameter(0)
   parameter_1 = f32[11,63]{1,0} parameter(1)
@@ -674,7 +637,8 @@ triton_gemm___computation {
   cv = f32[11,63] convert(b)
   m = f32[11,63] multiply(cv, parameter_1)
   ROOT _.1 = f32[92,63]{1,0} dot(parameter_0, m),
-    lhs_contracting_dims={1}, rhs_contracting_dims={0}
+    lhs_contracting_dims={1}, rhs_contracting_dims={0},
+    operand_precision={HIGH, HIGH}
 }
 
 ENTRY e {
@@ -688,11 +652,9 @@ ENTRY e {
                                           "num_stages":"3","num_warps":"2"}}
 })";
   const std::string hlo_test = absl::Substitute(
-      hHloTestTemplate, primitive_util::LowercasePrimitiveTypeName(data_type));
-
-  const std::string hHloRefTemplate = R"(
-HloModule m, is_scheduled=true
+      kHloTestTemplate, primitive_util::LowercasePrimitiveTypeName(data_type));
 
+  const std::string kHloRefTemplate = R"(
 fused_computation {
   p0 = f32[11,63]{1,0} parameter(0)
   c = $0[] constant(123)
@@ -712,10 +674,10 @@ ENTRY e {
       {"lhs_contracting_dimensions":["1"],"rhs_contracting_dimensions":["0"],
       "lhs_batch_dimensions":[],"rhs_batch_dimensions":[]},
       "alpha_imag":0,"precision_config":
-      {"operand_precision":["DEFAULT","DEFAULT"]},"epilogue":"DEFAULT"}
+      {"operand_precision":["HIGHEST","HIGHEST"]},"epilogue":"DEFAULT"}
 })";
   const std::string hlo_ref = absl::Substitute(
-      hHloRefTemplate, primitive_util::LowercasePrimitiveTypeName(data_type));
+      kHloRefTemplate, primitive_util::LowercasePrimitiveTypeName(data_type));
 
   float tolerance;
   switch (data_type) {
@@ -750,7 +712,7 @@ INSTANTIATE_TEST_SUITE_P(
       return primitive_util::LowercasePrimitiveTypeName(type.param);
     });
 
-class ConvertTest : public NoTF32Test,
+class ConvertTest : public TritonTest,
                     public ::testing::WithParamInterface<
                         std::tuple<PrimitiveType, PrimitiveType>> {};
 
@@ -773,10 +735,11 @@ t {
   p0cc = f32[2,2] convert(p0c)
   p1 = f32[2,2] parameter(1)
   ROOT r = f32[2,2] dot(p0cc, p1),
-    lhs_contracting_dims={1}, rhs_contracting_dims={0}
+    lhs_contracting_dims={1}, rhs_contracting_dims={0},
+    operand_precision={HIGH, HIGH}
 }
 
-ENTRY e{
+ENTRY e {
   p0 = $0[2,2] parameter(0)
   p1 = f32[2,2] parameter(1)
   ROOT r = f32[2,2] fusion(p0, p1), kind=kCustom, calls=t,
diff --git a/third_party/xla/xla/service/gpu/ir_emitter_triton_test.cc b/third_party/xla/xla/service/gpu/ir_emitter_triton_test.cc
index 44aaa50afc72da..6c6daa7042d1a0 100644
--- a/third_party/xla/xla/service/gpu/ir_emitter_triton_test.cc
+++ b/third_party/xla/xla/service/gpu/ir_emitter_triton_test.cc
@@ -53,7 +53,6 @@ limitations under the License.
 #include "tsl/platform/status.h"
 #include "tsl/platform/status_matchers.h"
 #include "tsl/platform/statusor.h"
-#include "tsl/platform/tensor_float_32_utils.h"
 #include "tsl/platform/test.h"
 
 namespace xla {
@@ -77,20 +76,6 @@ class TritonGemmTest : public GpuCodegenTest {
   }
 };
 
-class TritonGemmNoTF32Test : public TritonGemmTest {
- public:
-  void SetUp() override {
-    tf32_state_ = tsl::tensor_float_32_execution_enabled();
-    tsl::enable_tensor_float_32_execution(false);
-  }
-  void TearDown() override {
-    tsl::enable_tensor_float_32_execution(tf32_state_);
-  }
-
- private:
-  bool tf32_state_;
-};
-
 class TritonFilecheckTest : public TritonGemmTest {
  public:
   StatusOr<bool> CreateTritonIrAndFileCheck(
@@ -220,16 +205,15 @@ CHECK:    }
               tsl::testing::IsOkAndHolds(true));
 }
 
-TEST_F(TritonGemmNoTF32Test, DoNotUseTensorCoresForF32) {
+TEST_F(TritonGemmTest, DoNotUseTensorCoresWithNonDefaultPrecision) {
   const std::string kHloText = R"(
-HloModule t, is_scheduled=true
-
 triton_gemm_r {
   parameter_0 = s8[80,15]{1,0} parameter(0)
   convert.3 = f32[80,15]{1,0} convert(parameter_0)
   parameter_1 = f32[16,15]{1,0} parameter(1)
   ROOT r.1 = f32[80,16]{1,0} dot(convert.3, parameter_1),
-    lhs_contracting_dims={1}, rhs_contracting_dims={1}
+    lhs_contracting_dims={1}, rhs_contracting_dims={1},
+    operand_precision={HIGH, HIGH}
 }
 
 ENTRY e {
@@ -237,9 +221,9 @@ ENTRY e {
   p0 = s8[80,15]{1,0} parameter(0)
   ROOT triton_gemm_r = f32[80,16]{1,0} fusion(p0, p1), kind=kCustom,
     calls=triton_gemm_r,
-    backend_config={kind: "__triton_gemm", triton_gemm_config: {"block_m":32,"block_n":32,"block_k":32,"split_k":1,"num_stages":1,"num_warps":2}}
+    backend_config={kind: "__triton_gemm", triton_gemm_config:
+      {"block_m":32,"block_n":32,"block_k":32,"split_k":1,"num_stages":1,"num_warps":2}}
 })";
-  CHECK(!tsl::tensor_float_32_execution_enabled());
   TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<VerifiedHloModule> verified_module,
                           ParseAndReturnVerifiedModule(kHloText));
 
@@ -280,8 +264,6 @@ ENTRY e {
 
 TEST_F(TritonGemmTest, UseTensorCoresForF32OnAmpere) {
   const std::string kHloText = R"(
-HloModule t, is_scheduled=true
-
 triton_gemm_r {
   parameter_0 = s8[80,15]{1,0} parameter(0)
   convert.3 = f32[80,15]{1,0} convert(parameter_0)
@@ -295,9 +277,9 @@ ENTRY e {
   p0 = s8[80,15]{1,0} parameter(0)
   ROOT triton_gemm_r = f32[80,16]{1,0} fusion(p0, p1), kind=kCustom,
     calls=triton_gemm_r,
-    backend_config={kind: "__triton_gemm", triton_gemm_config: {"block_m":32,"block_n":32,"block_k":32,"split_k":1,"num_stages":1,"num_warps":2}}
+    backend_config={kind: "__triton_gemm", triton_gemm_config:
+      {"block_m":32,"block_n":32,"block_k":32,"split_k":1,"num_stages":1,"num_warps":2}}
 })";
-  CHECK(tsl::tensor_float_32_execution_enabled());
   TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<VerifiedHloModule> verified_module,
                           ParseAndReturnVerifiedModule(kHloText));
 

From 86cba519e59c3ca7b5b6e5be4e08eb12e17c58f8 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 29 Sep 2023 06:11:28 -0700
Subject: [PATCH 418/567] Update TFRT dependency to use revision
 http://github.com/tensorflow/runtime/commit/88e2ee6c6aaed74fc4201ad62e0b6dad0c40d56f.

PiperOrigin-RevId: 569472682
---
 third_party/tf_runtime/workspace.bzl                          | 4 ++--
 third_party/xla/third_party/tf_runtime/workspace.bzl          | 4 ++--
 .../xla/third_party/tsl/third_party/tf_runtime/workspace.bzl  | 4 ++--
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/third_party/tf_runtime/workspace.bzl b/third_party/tf_runtime/workspace.bzl
index ac11c18a145229..3fc4047aa7903e 100644
--- a/third_party/tf_runtime/workspace.bzl
+++ b/third_party/tf_runtime/workspace.bzl
@@ -6,8 +6,8 @@ def repo():
     """Imports TFRT."""
 
     # Attention: tools parse and update these lines.
-    TFRT_COMMIT = "703dbd9190461909be83200d6a1f1c495f1177d5"
-    TFRT_SHA256 = "7c7edf3357844ec377f5c23fc5242d888f691616cd5a5a8c2e8d95d17bb93e0b"
+    TFRT_COMMIT = "88e2ee6c6aaed74fc4201ad62e0b6dad0c40d56f"
+    TFRT_SHA256 = "c9e0cbaf6a39144f8bb1bc3a4789165d4af032ff5f2999724ad27c449de9b969"
 
     tf_http_archive(
         name = "tf_runtime",
diff --git a/third_party/xla/third_party/tf_runtime/workspace.bzl b/third_party/xla/third_party/tf_runtime/workspace.bzl
index ac11c18a145229..3fc4047aa7903e 100644
--- a/third_party/xla/third_party/tf_runtime/workspace.bzl
+++ b/third_party/xla/third_party/tf_runtime/workspace.bzl
@@ -6,8 +6,8 @@ def repo():
     """Imports TFRT."""
 
     # Attention: tools parse and update these lines.
-    TFRT_COMMIT = "703dbd9190461909be83200d6a1f1c495f1177d5"
-    TFRT_SHA256 = "7c7edf3357844ec377f5c23fc5242d888f691616cd5a5a8c2e8d95d17bb93e0b"
+    TFRT_COMMIT = "88e2ee6c6aaed74fc4201ad62e0b6dad0c40d56f"
+    TFRT_SHA256 = "c9e0cbaf6a39144f8bb1bc3a4789165d4af032ff5f2999724ad27c449de9b969"
 
     tf_http_archive(
         name = "tf_runtime",
diff --git a/third_party/xla/third_party/tsl/third_party/tf_runtime/workspace.bzl b/third_party/xla/third_party/tsl/third_party/tf_runtime/workspace.bzl
index ac11c18a145229..3fc4047aa7903e 100644
--- a/third_party/xla/third_party/tsl/third_party/tf_runtime/workspace.bzl
+++ b/third_party/xla/third_party/tsl/third_party/tf_runtime/workspace.bzl
@@ -6,8 +6,8 @@ def repo():
     """Imports TFRT."""
 
     # Attention: tools parse and update these lines.
-    TFRT_COMMIT = "703dbd9190461909be83200d6a1f1c495f1177d5"
-    TFRT_SHA256 = "7c7edf3357844ec377f5c23fc5242d888f691616cd5a5a8c2e8d95d17bb93e0b"
+    TFRT_COMMIT = "88e2ee6c6aaed74fc4201ad62e0b6dad0c40d56f"
+    TFRT_SHA256 = "c9e0cbaf6a39144f8bb1bc3a4789165d4af032ff5f2999724ad27c449de9b969"
 
     tf_http_archive(
         name = "tf_runtime",

From 3d1787658f2665a7b4f76c9cc7eb8ede55fa4535 Mon Sep 17 00:00:00 2001
From: Oleg Shyshkov <shyshkov@google.com>
Date: Fri, 29 Sep 2023 06:29:10 -0700
Subject: [PATCH 419/567] [XLA:GPU] Add VLOG for ShouldFuse.

PiperOrigin-RevId: 569475824
---
 .../xla/xla/service/gpu/priority_fusion.cc    | 25 ++++++++++++-------
 1 file changed, 16 insertions(+), 9 deletions(-)

diff --git a/third_party/xla/xla/service/gpu/priority_fusion.cc b/third_party/xla/xla/service/gpu/priority_fusion.cc
index 1245f5a83458b2..e236d79906e5ba 100644
--- a/third_party/xla/xla/service/gpu/priority_fusion.cc
+++ b/third_party/xla/xla/service/gpu/priority_fusion.cc
@@ -65,15 +65,14 @@ bool ElementIsF32OrF16(const Shape& shape) {
 // nodes and their operands.
 class GpuPriorityFusionQueue : public FusionQueue {
   using Priority = int64_t;
-  using CanFuseCallback = std::function<bool(
+  using CanFuseCallback = std::function<FusionDecision(
       HloInstruction* /*producer*/, int64_t /*consumer operand_index*/)>;
 
  public:
   GpuPriorityFusionQueue(
       HloComputation* computation,
       const GpuHloCostAnalysis::Options& cost_analysis_options,
-      const GpuDeviceInfo* device_info,
-      const std::function<bool(HloInstruction*, int64_t)>& can_fuse)
+      const GpuDeviceInfo* device_info, const CanFuseCallback& can_fuse)
       : computation_(computation),
         cost_analysis_(cost_analysis_options, device_info),
         can_fuse_(can_fuse) {
@@ -232,7 +231,8 @@ class GpuPriorityFusionQueue : public FusionQueue {
   // users.
   Priority CalculateProducerPriority(HloInstruction* producer) {
     // Don't fuse if we can't fuse in all users.
-    if (!CanFuseWithAllUsers(producer)) {
+    if (auto fusion_decision = CanFuseWithAllUsers(producer);
+        !fusion_decision) {
       return std::numeric_limits<Priority>::min();
     }
 
@@ -243,10 +243,17 @@ class GpuPriorityFusionQueue : public FusionQueue {
                                     run_times.time_fused);
   }
 
-  bool CanFuseWithAllUsers(HloInstruction* producer) const {
-    return absl::c_all_of(producer->users(), [&](HloInstruction* user) {
-      return can_fuse_(user, user->operand_index(producer));
-    });
+  FusionDecision CanFuseWithAllUsers(HloInstruction* producer) const {
+    FusionDecision result;
+    for (const auto& user : producer->users()) {
+      if (auto fusion_decision = can_fuse_(user, user->operand_index(producer));
+          !fusion_decision) {
+        VLOG(10) << "Cannot fuse " << producer->name() << " with "
+                 << user->name() << ", because: " << fusion_decision.Explain();
+        return fusion_decision;
+      }
+    }
+    return {};
   }
 
   // Store computation for cost analysis.
@@ -391,7 +398,7 @@ std::unique_ptr<FusionQueue> GpuPriorityFusion::GetFusionQueue(
   return std::unique_ptr<FusionQueue>(new GpuPriorityFusionQueue(
       computation, cost_analysis_options_, &device_info_,
       [this](HloInstruction* consumer, int64_t operand_index) {
-        return ShouldFuse(consumer, operand_index).CanFuse();
+        return ShouldFuse(consumer, operand_index);
       }));
 }
 

From 39a8b772355f7ee57e8076dc1908dbcdf52a4c4b Mon Sep 17 00:00:00 2001
From: Oleg Shyshkov <shyshkov@google.com>
Date: Fri, 29 Sep 2023 06:33:16 -0700
Subject: [PATCH 420/567] [XLA:GPU] Unravel the bool expression in
 IsUniversallyLoopFusible, IsLoopFusibleAsConsumer and IsLoopFusibleAsProducer

PiperOrigin-RevId: 569476546
---
 .../xla/xla/service/gpu/gpu_fusible.cc        | 93 ++++++++++++-------
 1 file changed, 59 insertions(+), 34 deletions(-)

diff --git a/third_party/xla/xla/service/gpu/gpu_fusible.cc b/third_party/xla/xla/service/gpu/gpu_fusible.cc
index 14f9ea46dd008a..1c4cde6e0ba223 100644
--- a/third_party/xla/xla/service/gpu/gpu_fusible.cc
+++ b/third_party/xla/xla/service/gpu/gpu_fusible.cc
@@ -318,50 +318,75 @@ bool IsInputFusible(const HloInstruction& instr) {
           IsInputFusibleTranspose(instr));
 }
 
+// Returns true if `instr` can be fused as a producer or as a consumer into a
+// kLoop fusion.
 bool IsUniversallyLoopFusible(const HloInstruction& instr,
                               const HloInstruction& hero) {
-  // Don't fuse get-tuple-element on GPU: We can, but it's slower than not
-  // fusing.  We never generate kernels for unfused GTEs.  Instead, if an
-  // unfused GTE is an input to a kernel (including a fusion kernel), we
-  // compute the address of the GTE at the top of the kernel.  Often we know the
-  // address of the GTE result statically, so we can do this without chasing any
-  // pointers.
-  return ((instr.IsElementwise() && instr.operand_count() > 0 &&
-           instr.opcode() != HloOpcode::kCopy) ||
-          (instr.opcode() == HloOpcode::kCopy &&
-           !GetDescriptionForTiledTransposeEmitter(instr, hero).has_value()) ||
-          instr.opcode() == HloOpcode::kBitcast ||
-          instr.opcode() == HloOpcode::kBroadcast ||
-          instr.opcode() == HloOpcode::kConcatenate ||
-          instr.opcode() == HloOpcode::kDynamicSlice ||
-          instr.opcode() == HloOpcode::kDynamicUpdateSlice ||
-          (instr.opcode() == HloOpcode::kFusion &&
-           instr.fusion_kind() == HloInstruction::FusionKind::kLoop) ||
-          instr.opcode() == HloOpcode::kGather ||
-          instr.opcode() == HloOpcode::kPad ||
-          instr.opcode() == HloOpcode::kReduceWindow ||
-          instr.opcode() == HloOpcode::kReshape ||
-          instr.opcode() == HloOpcode::kReverse ||
-          instr.opcode() == HloOpcode::kSlice ||
-          instr.opcode() == HloOpcode::kTranspose);
+  // NOTE: this check is done before the switch below, because a fusion instr
+  // can also be elementwise, even if it's not a kLoop.
+  if (instr.IsElementwise() && instr.operand_count() > 0 &&
+      instr.opcode() != HloOpcode::kCopy) {
+    return true;
+  }
+
+  switch (instr.opcode()) {
+    case HloOpcode::kCopy:
+      return !GetDescriptionForTiledTransposeEmitter(instr, hero).has_value();
+
+    case HloOpcode::kFusion:
+      return instr.fusion_kind() == HloInstruction::FusionKind::kLoop;
+
+    case HloOpcode::kBitcast:
+    case HloOpcode::kBroadcast:
+    case HloOpcode::kConcatenate:
+    case HloOpcode::kDynamicSlice:
+    case HloOpcode::kDynamicUpdateSlice:
+    case HloOpcode::kGather:
+    case HloOpcode::kPad:
+    case HloOpcode::kReduceWindow:
+    case HloOpcode::kReshape:
+    case HloOpcode::kReverse:
+    case HloOpcode::kSlice:
+    case HloOpcode::kTranspose:
+      return true;
+    default:
+      return false;
+  }
 }
 
+// Returns true if `instr` can be fused as a consumer into a kLoop fusion.
 bool IsLoopFusibleAsConsumer(const HloInstruction& instr,
                              const HloInstruction& hero) {
-  return instr.IsFusible() && instr.opcode() != HloOpcode::kBitcast &&
-         (IsUniversallyLoopFusible(instr, hero) ||
-          // Any reduction can be fused as a consumer.
-          instr.opcode() == HloOpcode::kReduce);
+  // Instr should be fusible.
+  if (!instr.IsFusible()) return false;
+
+  // An optimization for instruction fusion. Bitcast as a consumer means that it
+  // will be a root of a fusion. This just adds indexing overhead without any
+  // benefit.
+  if (instr.opcode() == HloOpcode::kBitcast) return false;
+
+  // Any reduction can be fused as a consumer.
+  if (instr.opcode() == HloOpcode::kReduce) return true;
+
+  return IsUniversallyLoopFusible(instr, hero);
 }
 
+// Returns true if `instr` can be fused as a producer into a kLoop fusion.
 bool IsLoopFusibleAsProducer(const HloInstruction& instr,
                              const HloInstruction& hero) {
-  return instr.IsFusible() &&
-         (IsUniversallyLoopFusible(instr, hero) ||
-          (instr.opcode() == HloOpcode::kIota ||
-           instr.opcode() == HloOpcode::kConstant ||
-           // Non-variadic reductions can be fused as producers.
-           (instr.opcode() == HloOpcode::kReduce && !instr.shape().IsTuple())));
+  // Instr should be fusible.
+  if (!instr.IsFusible()) return false;
+
+  switch (instr.opcode()) {
+    case HloOpcode::kIota:
+    case HloOpcode::kConstant:
+      return true;
+    case HloOpcode::kReduce:
+      // Non-variadic reductions can be fused as producers.
+      return !instr.shape().IsTuple();
+    default:
+      return IsUniversallyLoopFusible(instr, hero);
+  }
 }
 
 static bool AllSatisfy(const HloInstruction& instr,

From a1ccb74fbc61f4f926beae9548ea861d22106a0e Mon Sep 17 00:00:00 2001
From: Quentin Khan <qkhan@google.com>
Date: Fri, 29 Sep 2023 07:25:38 -0700
Subject: [PATCH 421/567] Simplify implementation of ReduceWindow by removing
 template recursions.

PiperOrigin-RevId: 569486188
---
 tensorflow/lite/kernels/reduce_window.cc      | 204 ++++++------------
 tensorflow/lite/kernels/reduce_window_test.cc |   4 +-
 2 files changed, 73 insertions(+), 135 deletions(-)

diff --git a/tensorflow/lite/kernels/reduce_window.cc b/tensorflow/lite/kernels/reduce_window.cc
index 2c315cdd03693b..f21dbbc8051c4b 100644
--- a/tensorflow/lite/kernels/reduce_window.cc
+++ b/tensorflow/lite/kernels/reduce_window.cc
@@ -20,6 +20,7 @@ limitations under the License.
 #include <type_traits>
 
 #include "tensorflow/lite/array.h"
+#include "tensorflow/lite/c/c_api_types.h"
 #include "tensorflow/lite/core/c/builtin_op_data.h"
 #include "tensorflow/lite/core/c/common.h"
 #include "tensorflow/lite/kernels/kernel_util.h"
@@ -56,48 +57,26 @@ using IntCst = std::integral_constant<int, Val>;
 // └──┘     └──┘
 //  13 14 15 16
 //
-// This is a recursive implementation of the strided reduction. At the time of
-// writing, both GCC and Clang are able to optimise away the recursion when done
-// with templates. The same code without templates is optimised by GCC but not
-// by Clang.
-//
-// The template uses integral_constant for the rank and depth values because old
-// versions of GCC [do not support partial specializations on non-type template
-// parameters](https://stackoverflow.com/a/7776468).
-template <class Op, class Type, class RankValue, class DepthValue = IntCst<0>,
-          class = void>
-struct StridedReduceImpl;
-
-// Body case of the recursive implementation of the strided reduction.
-template <class Op, class Type, int Rank, int Depth>
-struct StridedReduceImpl<Op, Type, IntCst<Rank>, IntCst<Depth>,
-                         std::enable_if_t<Depth + 1 != Rank>> {
-  static void Run(const Type* input, const int64_t* const shape,
-                  const int64_t* const strides, Type& accu) {
-    const int64_t stride = strides[Depth];
-    const int64_t size = shape[Depth];
+// This is a recursive implementation of the strided reduction.
+template <class Op, class Type>
+void StridedReduce(const Type* input, const int64_t* const shape,
+                   const int64_t* const strides, Type& accu, const int rank,
+                   const int depth) {
+  const int64_t stride = strides[depth];
+  const int64_t size = shape[depth];
+  if (depth + 1 == rank) {
+    const Op op;
     for (int64_t i = 0; i < size; ++i) {
-      StridedReduceImpl<Op, Type, IntCst<Rank>, IntCst<Depth + 1>>::Run(
-          input, shape, strides, accu);
+      accu = op(accu, *input);
       input += stride;
     }
-  }
-};
-
-// Tail case of the recursive implementation of the strided reduction.
-template <class Op, class Type, int Rank>
-struct StridedReduceImpl<Op, Type, IntCst<Rank>, IntCst<Rank - 1>> {
-  static void Run(const Type* input, const int64_t* const shape,
-                  const int64_t* const strides, Type& accu) {
-    const int64_t stride = strides[Rank - 1];
-    const int64_t size = shape[Rank - 1];
-    const Op op;
+  } else {
     for (int64_t i = 0; i < size; ++i) {
-      accu = op(accu, *input);
+      StridedReduce<Op, Type>(input, shape, strides, accu, rank, depth + 1);
       input += stride;
     }
   }
-};
+}
 
 // Recursively computes strided reductions using a sliding window over the given
 // tensor.
@@ -117,52 +96,32 @@ struct StridedReduceImpl<Op, Type, IntCst<Rank>, IntCst<Rank - 1>> {
 // ┌─┐   ┌─┐
 // │X│X X│X│
 // └─┘   └─┘
-//
-// The template uses integral_constant for the rank and depth values because old
-// versions of GCC [do not support partial specializations on non-type template
-// parameters](https://stackoverflow.com/a/7776468).
-template <class Op, class Type, class RankValue, class DepthValue = IntCst<0>,
-          class = void>
-struct ReduceWindowImpl;
-
-// Body case of the recursive implementation of the window sliding.
-template <class Op, class Type, int Rank, int Depth>
-struct ReduceWindowImpl<Op, Type, IntCst<Rank>, IntCst<Depth>,
-                        std::enable_if_t<Depth + 1 != Rank>> {
-  static void Run(const Type* input, Type* output,
-                  const int64_t* const output_shape,
-                  const int64_t* const output_strides,
-                  const int64_t* const window_offset_strides,
-                  const int64_t* const window_shape,
-                  const int64_t* const window_reduce_strides, const Type init) {
-    for (int32_t dim = 0; dim < output_shape[Depth]; ++dim) {
-      ReduceWindowImpl<Op, Type, IntCst<Rank>, IntCst<Depth + 1>>::Run(
-          input, output, output_shape, output_strides, window_offset_strides,
-          window_shape, window_reduce_strides, init);
-      input += window_offset_strides[Depth];
-      output += output_strides[Depth];
-    }
-  }
-};
-
-// Tail case of the recursive implementation of the window sliding.
-template <class Op, class Type, int Rank>
-struct ReduceWindowImpl<Op, Type, IntCst<Rank>, IntCst<Rank - 1>> {
-  static void Run(const Type* input, Type* output,
-                  const int64_t* const output_shape,
-                  const int64_t* const output_strides,
-                  const int64_t* const window_offset_strides,
-                  const int64_t* const window_shape,
-                  const int64_t* const window_reduce_strides, const Type init) {
-    for (int32_t dim = 0; dim < output_shape[Rank - 1]; ++dim) {
+template <class Op, class Type>
+void ReduceWindowImpl(const Type* input, Type* output,
+                      const int64_t* const output_shape,
+                      const int64_t* const output_strides,
+                      const int64_t* const window_offset_strides,
+                      const int64_t* const window_shape,
+                      const int64_t* const window_reduce_strides,
+                      const Type init, const int rank, const int depth) {
+  if (depth + 1 == rank) {
+    for (int32_t dim = 0; dim < output_shape[depth]; ++dim) {
       *output = init;
-      StridedReduceImpl<Op, Type, IntCst<Rank>>::Run(
-          input, window_shape, window_reduce_strides, *output);
-      input += window_offset_strides[Rank - 1];
-      output += output_strides[Rank - 1];
+      StridedReduce<Op, Type>(input, window_shape, window_reduce_strides,
+                              *output, rank, 0);
+      input += window_offset_strides[depth];
+      output += output_strides[depth];
+    }
+  } else {
+    for (int32_t dim = 0; dim < output_shape[depth]; ++dim) {
+      ReduceWindowImpl<Op, Type>(input, output, output_shape, output_strides,
+                                 window_offset_strides, window_shape,
+                                 window_reduce_strides, init, rank, depth + 1);
+      input += window_offset_strides[depth];
+      output += output_strides[depth];
     }
   }
-};
+}
 
 std::array<int64_t, kMaxReduceWindowDims> ComputeStrides(
     const int64_t* const shape, const int64_t rank) {
@@ -205,26 +164,27 @@ std::array<int64_t, kMaxReduceWindowDims> ComputeOutputShape(
   return window_range;
 }
 
-template <class Op, class Type, int Rank>
+template <class Op, class Type>
 void ReduceWindow(const Type* const input, Type* output,
                   const int64_t* const shape, const int64_t* const window_shape,
                   const int64_t* const window_strides,
-                  const int64_t* const window_dilations, const Type init) {
+                  const int64_t* const window_dilations, const Type init,
+                  const int rank) {
   const std::array<int64_t, kMaxReduceWindowDims> strides =
-      ComputeStrides(shape, Rank);
+      ComputeStrides(shape, rank);
   const std::array<int64_t, kMaxReduceWindowDims> window_reduce_strides =
-      Multiply(strides.data(), window_dilations, Rank);
+      Multiply(strides.data(), window_dilations, rank);
   const std::array<int64_t, kMaxReduceWindowDims> window_offset_strides =
-      Multiply(strides.data(), window_strides, Rank);
+      Multiply(strides.data(), window_strides, rank);
   const std::array<int64_t, kMaxReduceWindowDims> output_shape =
       ComputeOutputShape(shape, window_shape, window_strides, window_dilations,
-                         Rank);
+                         rank);
   const std::array<int64_t, kMaxReduceWindowDims> output_strides =
-      ComputeStrides(output_shape.data(), Rank);
-  ReduceWindowImpl<Op, Type, IntCst<Rank>>::Run(
-      input, output, output_shape.data(), output_strides.data(),
-      window_offset_strides.data(), window_shape, window_reduce_strides.data(),
-      init);
+      ComputeStrides(output_shape.data(), rank);
+  ReduceWindowImpl<Op, Type>(input, output, output_shape.data(),
+                             output_strides.data(),
+                             window_offset_strides.data(), window_shape,
+                             window_reduce_strides.data(), init, rank, 0);
 }
 
 std::array<int64_t, kMaxReduceWindowDims> AsInt64(const int32_t* data,
@@ -280,49 +240,31 @@ TfLiteStatus SetupOutputTensor(const ReduceWindowContext& ctx) {
                                    output_shape.release());
 }
 
-template <class Op, class Type>
-TfLiteStatus DispatchReduceWindowRank(ReduceWindowContext& ctx) {
+template <class Op>
+TfLiteStatus DispatchReduceWindowType(ReduceWindowContext& ctx) {
   const int rank = ctx.input_tensor->dims->size;
   const std::array<int64_t, kMaxReduceWindowDims> input_shape =
       AsInt64(ctx.input_tensor->dims->data, rank);
-#define REDUCE_WINDOW_RANK_CASE(RANK)                               \
-  case RANK:                                                        \
-    ReduceWindow<Op, Type, RANK>(                                   \
-        reinterpret_cast<const Type*>(ctx.input_tensor->data.raw),  \
-        reinterpret_cast<Type*>(ctx.output_tensor->data.raw),       \
-        input_shape.data(), ctx.window_shape_tensor->data.i64,      \
-        ctx.window_strides_tensor->data.i64,                        \
-        ctx.window_dilations_tensor->data.i64,                      \
-        *reinterpret_cast<Type*>(ctx.init_value_tensor->data.raw)); \
+#define REDUCE_WINDOW_TYPE_CASE(CPP_TYPE, TENSOR_TYPE)                        \
+  case TENSOR_TYPE:                                                           \
+    ReduceWindow<Op, CPP_TYPE>(                                               \
+        reinterpret_cast<const CPP_TYPE*>(ctx.input_tensor->data.raw),        \
+        reinterpret_cast<CPP_TYPE*>(ctx.output_tensor->data.raw),             \
+        input_shape.data(), ctx.window_shape_tensor->data.i64,                \
+        ctx.window_strides_tensor->data.i64,                                  \
+        ctx.window_dilations_tensor->data.i64,                                \
+        *reinterpret_cast<CPP_TYPE*>(ctx.init_value_tensor->data.raw), rank); \
     break;
-  switch (ctx.input_tensor->dims->size) {
-    REDUCE_WINDOW_RANK_CASE(1);
-    REDUCE_WINDOW_RANK_CASE(2);
-    REDUCE_WINDOW_RANK_CASE(3);
-    REDUCE_WINDOW_RANK_CASE(4);
-    REDUCE_WINDOW_RANK_CASE(5);
-    REDUCE_WINDOW_RANK_CASE(6);
-    default:
-      return kTfLiteError;
-  }
-#undef REDUCE_WINDOW_RANK_CASE
-  return kTfLiteOk;
-}
-
-template <class Op>
-TfLiteStatus DispatchReduceWindowType(ReduceWindowContext& ctx) {
-#define REDUCE_WINDOW_TYPE_CASE(CPP_TYPE, TENSOR_TYPE) \
-  case TENSOR_TYPE:                                    \
-    return DispatchReduceWindowRank<Op, CPP_TYPE>(ctx);
   switch (ctx.input_tensor->type) {
+    REDUCE_WINDOW_TYPE_CASE(int8_t, kTfLiteBool);
     REDUCE_WINDOW_TYPE_CASE(int8_t, kTfLiteInt8);
     REDUCE_WINDOW_TYPE_CASE(int16_t, kTfLiteInt16);
     REDUCE_WINDOW_TYPE_CASE(int32_t, kTfLiteInt32);
     REDUCE_WINDOW_TYPE_CASE(int64_t, kTfLiteInt64);
     REDUCE_WINDOW_TYPE_CASE(uint8_t, kTfLiteUInt8);
-    REDUCE_WINDOW_TYPE_CASE(uint16_t, kTfLiteUInt16);
-    REDUCE_WINDOW_TYPE_CASE(uint32_t, kTfLiteUInt32);
-    REDUCE_WINDOW_TYPE_CASE(uint64_t, kTfLiteUInt64);
+    // REDUCE_WINDOW_TYPE_CASE(uint16_t, kTfLiteUInt16);
+    // REDUCE_WINDOW_TYPE_CASE(uint32_t, kTfLiteUInt32);
+    // REDUCE_WINDOW_TYPE_CASE(uint64_t, kTfLiteUInt64);
     REDUCE_WINDOW_TYPE_CASE(float, kTfLiteFloat32);
     static_assert(sizeof(float) == 4,
                   "float type is expected to be 32 bit long");
@@ -333,11 +275,7 @@ TfLiteStatus DispatchReduceWindowType(ReduceWindowContext& ctx) {
       return kTfLiteError;
   }
 #undef REDUCE_WINDOW_TYPE_CASE
-}
-
-template <class Op>
-TfLiteStatus DispatchReduceWindowOp(ReduceWindowContext& ctx) {
-  return DispatchReduceWindowType<Op>(ctx);
+  return kTfLiteOk;
 }
 
 TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
@@ -377,17 +315,17 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
     case TfLiteReduceWindowFunctionUnsupported:
       return kTfLiteError;
     case TfLiteReduceWindowFunctionAdd:
-      return DispatchReduceWindowOp<std::plus<>>(ctx);
+      return DispatchReduceWindowType<std::plus<>>(ctx);
     case TfLiteReduceWindowFunctionMul:
-      return DispatchReduceWindowOp<std::multiplies<>>(ctx);
+      return DispatchReduceWindowType<std::multiplies<>>(ctx);
     case TfLiteReduceWindowFunctionAll:
-      return DispatchReduceWindowOp<std::logical_and<>>(ctx);
+      return DispatchReduceWindowType<std::logical_and<>>(ctx);
     case TfLiteReduceWindowFunctionAny:
-      return DispatchReduceWindowOp<std::logical_or<>>(ctx);
+      return DispatchReduceWindowType<std::logical_or<>>(ctx);
     case TfLiteReduceWindowFunctionMin:
-      return DispatchReduceWindowOp<Min>(ctx);
+      return DispatchReduceWindowType<Min>(ctx);
     case TfLiteReduceWindowFunctionMax:
-      return DispatchReduceWindowOp<Max>(ctx);
+      return DispatchReduceWindowType<Max>(ctx);
   }
 }
 
diff --git a/tensorflow/lite/kernels/reduce_window_test.cc b/tensorflow/lite/kernels/reduce_window_test.cc
index 37f8dd82878704..d1a8932fc1c422 100644
--- a/tensorflow/lite/kernels/reduce_window_test.cc
+++ b/tensorflow/lite/kernels/reduce_window_test.cc
@@ -166,8 +166,8 @@ class ReduceWindowTest : public testing::Test {
   DilateOpModel<StorageType> model_;
 };
 
-using TestList = testing::Types<int8_t, int16_t, int32_t, int64_t, uint8_t,
-                                uint16_t, uint32_t, uint64_t, float, double>;
+using TestList =
+    testing::Types<int8_t, int16_t, int32_t, int64_t, uint8_t, float, double>;
 
 TYPED_TEST_SUITE(ReduceWindowTest, TestList);
 

From 61b4313557b43bf6bd98d7c584e9d029e063fd2f Mon Sep 17 00:00:00 2001
From: Mason Chang <masonchang@google.com>
Date: Fri, 29 Sep 2023 08:18:51 -0700
Subject: [PATCH 422/567] Copy/Paste AddGraphExportLoweringPassesV2 from
 bridge.cc to the tf_Dialect_to_executor API.

PiperOrigin-RevId: 569496764
---
 tensorflow/compiler/mlir/tf2xla/api/v2/BUILD  |  24 ++++
 .../tf2xla/api/v2/testdata/empty_func.mlir    |   5 +
 .../api/v2/testdata/invalid_executor.mlir     |  17 +++
 .../tf2xla/api/v2/tf_dialect_to_executor.cc   | 114 +++++++++++++++++-
 .../tf2xla/api/v2/tf_dialect_to_executor.h    |   4 +-
 .../api/v2/tf_dialect_to_executor_test.cc     |  57 ++++++++-
 6 files changed, 215 insertions(+), 6 deletions(-)
 create mode 100644 tensorflow/compiler/mlir/tf2xla/api/v2/testdata/empty_func.mlir
 create mode 100644 tensorflow/compiler/mlir/tf2xla/api/v2/testdata/invalid_executor.mlir

diff --git a/tensorflow/compiler/mlir/tf2xla/api/v2/BUILD b/tensorflow/compiler/mlir/tf2xla/api/v2/BUILD
index 67a5fba6ccccf0..ce6b1dbd849bab 100644
--- a/tensorflow/compiler/mlir/tf2xla/api/v2/BUILD
+++ b/tensorflow/compiler/mlir/tf2xla/api/v2/BUILD
@@ -122,8 +122,22 @@ cc_library(
     srcs = ["tf_dialect_to_executor.cc"],
     hdrs = ["tf_dialect_to_executor.h"],
     deps = [
+        "//tensorflow/compiler/jit:flags_headers",
+        "//tensorflow/compiler/mlir/tensorflow:bridge_logger",
+        "//tensorflow/compiler/mlir/tensorflow:dump_mlir_util",
+        "//tensorflow/compiler/mlir/tensorflow:error_util",
+        "//tensorflow/compiler/mlir/tensorflow/transforms:tensorflow_passes",
+        "//tensorflow/compiler/mlir/tensorflow/transforms:verify_no_outside_compilation_markers_pass",
+        "//tensorflow/core:framework",
         "//tensorflow/core/platform:status",
+        "@com_google_absl//absl/status",
+        "@llvm-project//llvm:Support",
+        "@llvm-project//mlir:FuncDialect",
         "@llvm-project//mlir:IR",
+        "@llvm-project//mlir:Pass",
+        "@llvm-project//mlir:Support",
+        "@llvm-project//mlir:Transforms",
+        "@local_tsl//tsl/platform:error_logging",
         "@local_tsl//tsl/platform:status",
     ],
 )
@@ -131,10 +145,20 @@ cc_library(
 tf_cc_test(
     name = "tf_dialect_to_executor_test",
     srcs = ["tf_dialect_to_executor_test.cc"],
+    data = [
+        "testdata/empty_func.mlir",
+        "testdata/invalid_executor.mlir",
+    ],
     deps = [
         ":tf_dialect_to_executor",
+        "//tensorflow/compiler/mlir:register_common_dialects",
+        "//tensorflow/core/platform:resource_loader",
+        "@com_google_absl//absl/status",
+        "@com_google_absl//absl/strings",
         "@com_google_googletest//:gtest_main",
         "@llvm-project//mlir:IR",
+        "@llvm-project//mlir:Parser",
         "@local_tsl//tsl/lib/core:status_test_util",
+        "@local_tsl//tsl/platform:status",
     ],
 )
diff --git a/tensorflow/compiler/mlir/tf2xla/api/v2/testdata/empty_func.mlir b/tensorflow/compiler/mlir/tf2xla/api/v2/testdata/empty_func.mlir
new file mode 100644
index 00000000000000..e0fd01c86d182f
--- /dev/null
+++ b/tensorflow/compiler/mlir/tf2xla/api/v2/testdata/empty_func.mlir
@@ -0,0 +1,5 @@
+module attributes {tf.versions = {bad_consumers = [], min_consumer = 0 : i32, producer = 268 : i32}} {
+  func.func @main() -> () {
+    func.return
+  }
+}
\ No newline at end of file
diff --git a/tensorflow/compiler/mlir/tf2xla/api/v2/testdata/invalid_executor.mlir b/tensorflow/compiler/mlir/tf2xla/api/v2/testdata/invalid_executor.mlir
new file mode 100644
index 00000000000000..fd493ecafb0436
--- /dev/null
+++ b/tensorflow/compiler/mlir/tf2xla/api/v2/testdata/invalid_executor.mlir
@@ -0,0 +1,17 @@
+module attributes {tf.versions = {bad_consumers = [], min_consumer = 0 : i32, producer = 268 : i32}} {
+  func.func @main() {
+    tf_executor.graph {
+      %control = tf_executor.island {
+        tf_executor.yield
+      }
+      tf_executor.fetch %control : !tf_executor.control
+    }
+    tf_executor.graph {
+      %control = tf_executor.island {
+        tf_executor.yield
+      }
+      tf_executor.fetch %control : !tf_executor.control
+    }
+    return
+  }
+}
\ No newline at end of file
diff --git a/tensorflow/compiler/mlir/tf2xla/api/v2/tf_dialect_to_executor.cc b/tensorflow/compiler/mlir/tf2xla/api/v2/tf_dialect_to_executor.cc
index df04bfed17fd43..e2d34400aaaf41 100644
--- a/tensorflow/compiler/mlir/tf2xla/api/v2/tf_dialect_to_executor.cc
+++ b/tensorflow/compiler/mlir/tf2xla/api/v2/tf_dialect_to_executor.cc
@@ -15,18 +15,128 @@ limitations under the License.
 
 #include "tensorflow/compiler/mlir/tf2xla/api/v2/tf_dialect_to_executor.h"
 
+#include <memory>
+#include <string>
+
+#include "absl/status/status.h"
+#include "llvm/ADT/StringRef.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"  // from @llvm-project
 #include "mlir/IR/BuiltinOps.h"  // from @llvm-project
+#include "mlir/Pass/PassManager.h"  // from @llvm-project
+#include "mlir/Support/LogicalResult.h"  // from @llvm-project
+#include "mlir/Transforms/Passes.h"  // from @llvm-project
+#include "tensorflow/compiler/jit/flags.h"
+#include "tensorflow/compiler/mlir/tensorflow/transforms/passes.h"
+#include "tensorflow/compiler/mlir/tensorflow/utils/data_dumper_logger_config.h"
+#include "tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.h"
 #include "tensorflow/core/platform/status.h"
+#include "tensorflow/core/util/debug_data_dumper.h"
 #include "tsl/platform/status.h"
 
 namespace tensorflow {
 namespace tf2xla {
 namespace v2 {
 
+using mlir::LogicalResult;
 using mlir::ModuleOp;
+using mlir::OpPassManager;
+using mlir::PassManager;
+using mlir::func::FuncOp;
+
+namespace {
+
+// Add logger to bridge passmanager.
+// Enable timing statistics per pass for the bridge passmanager.
+void EnableDetailedLogging(PassManager *pm,
+                           llvm::StringRef module_name = llvm::StringRef()) {
+  // Print the whole module after each pass, which requires disabling
+  // multi-threading as well.
+  pm->getContext()->disableMultithreading();
+  pm->enableIRPrinting(std::make_unique<::tensorflow::DataDumperLoggerConfig>(
+      [module_name](const std::string &pass_tag_name, mlir::Operation *op) {
+        return DEBUG_DATA_DUMPER()->GetDumpFilename(
+            module_name.str(), kDebugGroupBridgePhase1, pass_tag_name);
+      },
+      "",
+      /*print_module_scope=*/true));
+  pm->enableTiming();
+}
+
+void AddGraphExportLoweringPasses(OpPassManager &pm) {
+  pm.addPass(mlir::TF::CreateTFRegionControlFlowToFunctional());
+
+  // First, we need to convert from functional, to executor dialect.
+  pm.addNestedPass<FuncOp>(
+      mlir::CreateFunctionalToExecutorDialectConversionPass());
+
+  // Do a single pass to split the graph's single island op into an island per
+  // op as expected by the following passes.
+  pm.addNestedPass<FuncOp>(mlir::TF::CreateSplitIntoIslandPerOpPass());
+
+  pm.addNestedPass<FuncOp>(mlir::TFDevice::CreateReplicateToIslandPass(
+      /*legacy_graph_export=*/false));
+  pm.addNestedPass<FuncOp>(
+      mlir::TFDevice::CreateReplicaIDToDeviceOrdinalPass());
+  pm.addNestedPass<FuncOp>(mlir::TFDevice::CreateParallelExecuteToIslandsPass(
+      /*legacy_graph_export=*/false));
+  pm.addNestedPass<FuncOp>(mlir::TFDevice::CreateLaunchToDeviceAttributePass(
+      /*legacy_graph_export=*/false));
+
+  // Do a single pass to encode necessary control deps in the IR according to
+  // the results of side effect analysis.
+  pm.addPass(
+      mlir::tf_executor::CreateTFExecutorUpdateControlDependenciesPass());
+
+  pm.addNestedPass<FuncOp>(mlir::TFTPU::CreateTPUDevicePropagationPass());
+  pm.addNestedPass<FuncOp>(mlir::TFTPU::CreateTPUColocateSplitsPass());
+  pm.addPass(mlir::createSymbolDCEPass());
+  if (tensorflow::GetMlirCommonFlags()
+          ->tf_mlir_enable_convert_control_to_data_outputs_pass) {
+    pm.addPass(
+        mlir::tf_executor::CreateTFExecutorConvertControlToDataOutputsPass());
+  }
+  pm.addPass(mlir::TF::CreateVerifySuitableForExportPass());
+}
+
+}  // namespace
+
+tensorflow::Status ExportFromTensorflowDialectToExecutor(
+    ModuleOp module, llvm::StringRef module_name) {
+  PassManager tf_to_executor(module.getContext());
+  ::tensorflow::applyTensorflowAndCLOptions(tf_to_executor);
+  AddGraphExportLoweringPasses(tf_to_executor);
+
+  if (VLOG_IS_ON(1) ||
+      DEBUG_DATA_DUMPER()->ShouldDump(module_name.str(), kDebugGroupMain)) {
+    ::tensorflow::DumpMlirOpToFile(
+        DEBUG_DATA_DUMPER()->GetDumpFilename(
+            module_name.str(), kDebugGroupMain,
+            "tfxla_bridge_tfdialect_to_executor_before"),
+        module, llvm::StringRef(), &tf_to_executor);
+
+    if (VLOG_IS_ON(2) || DEBUG_DATA_DUMPER()->ShouldDump(
+                             module_name.str(), kDebugGroupBridgePhase1)) {
+      EnableDetailedLogging(&tf_to_executor, module_name);
+    }
+  }
+
+  LogicalResult result = tf_to_executor.run(module);
+
+  if (VLOG_IS_ON(1) ||
+      DEBUG_DATA_DUMPER()->ShouldDump(module_name.str(), kDebugGroupMain)) {
+    ::tensorflow::DumpMlirOpToFile(
+        DEBUG_DATA_DUMPER()->GetDumpFilename(
+            module_name.str(), kDebugGroupMain,
+            "tfxla_bridge_tfdialect_to_executor_after"),
+        module, llvm::StringRef(), &tf_to_executor);
+  }
+
+  if (result.succeeded()) {
+    return tsl::OkStatus();
+  }
 
-tensorflow::Status ExportFromTensorflowDialectToExecutor(ModuleOp module) {
-  return tsl::OkStatus();
+  return absl::InternalError(
+      "Failed to export from TF Dialect to TF Executor Dialect.");
 }
 
 }  // namespace v2
diff --git a/tensorflow/compiler/mlir/tf2xla/api/v2/tf_dialect_to_executor.h b/tensorflow/compiler/mlir/tf2xla/api/v2/tf_dialect_to_executor.h
index 16b9ad252b7c2f..661d2a3e221e09 100644
--- a/tensorflow/compiler/mlir/tf2xla/api/v2/tf_dialect_to_executor.h
+++ b/tensorflow/compiler/mlir/tf2xla/api/v2/tf_dialect_to_executor.h
@@ -16,6 +16,7 @@ limitations under the License.
 #ifndef TENSORFLOW_COMPILER_MLIR_TF2XLA_API_V2_TF_DIALECT_TO_EXECUTOR_H_
 #define TENSORFLOW_COMPILER_MLIR_TF2XLA_API_V2_TF_DIALECT_TO_EXECUTOR_H_
 
+#include "llvm/ADT/StringRef.h"
 #include "mlir/IR/BuiltinOps.h"  // from @llvm-project
 #include "tensorflow/core/platform/status.h"
 
@@ -33,7 +34,8 @@ namespace v2 {
 // Input: A MLIR Module in the Tensorflow Dialect with no
 // `tf_device.cluster_func` ops.
 // Output: A MLIR module in the Tensorflow Executor Dialect.
-tensorflow::Status ExportFromTensorflowDialectToExecutor(mlir::ModuleOp module);
+tensorflow::Status ExportFromTensorflowDialectToExecutor(
+    mlir::ModuleOp module, llvm::StringRef module_name = llvm::StringRef());
 
 }  // namespace v2
 }  // namespace tf2xla
diff --git a/tensorflow/compiler/mlir/tf2xla/api/v2/tf_dialect_to_executor_test.cc b/tensorflow/compiler/mlir/tf2xla/api/v2/tf_dialect_to_executor_test.cc
index e43e82098066a8..20583847b3f5c4 100644
--- a/tensorflow/compiler/mlir/tf2xla/api/v2/tf_dialect_to_executor_test.cc
+++ b/tensorflow/compiler/mlir/tf2xla/api/v2/tf_dialect_to_executor_test.cc
@@ -15,20 +15,71 @@ limitations under the License.
 
 #include "tensorflow/compiler/mlir/tf2xla/api/v2/tf_dialect_to_executor.h"
 
+#include <string>
+
 #include <gtest/gtest.h>
+#include "absl/status/status.h"
+#include "absl/strings/str_cat.h"
 #include "mlir/IR/BuiltinOps.h"  // from @llvm-project
+#include "mlir/IR/DialectRegistry.h"  // from @llvm-project
+#include "mlir/IR/MLIRContext.h"  // from @llvm-project
+#include "mlir/IR/OwningOpRef.h"  // from @llvm-project
+#include "mlir/Parser/Parser.h"  // from @llvm-project
+#include "tensorflow/compiler/mlir/register_common_dialects.h"
+#include "tensorflow/core/platform/resource_loader.h"
 #include "tsl/lib/core/status_test_util.h"
+#include "tsl/platform/status.h"
 
 namespace tensorflow {
 namespace tf2xla {
 namespace v2 {
 namespace {
 
+using mlir::DialectRegistry;
+using mlir::MLIRContext;
 using mlir::ModuleOp;
+using mlir::OwningOpRef;
+
+std::string TestDataPath() {
+  return tensorflow::GetDataDependencyFilepath(
+      "tensorflow/compiler/mlir/tf2xla/api/v2/testdata/");
+}
+
+class TensorflowDialectToExecutorTest : public ::testing::Test {
+ public:
+  TensorflowDialectToExecutorTest() {
+    mlir::RegisterCommonToolingDialects(registry_);
+    context_.appendDialectRegistry(registry_);
+    context_.loadAllAvailableDialects();
+  }
+
+  tsl::Status CreateMlirModule(std::string mlir_module_filename) {
+    std::string mlir_module_path = TestDataPath() + mlir_module_filename;
+    mlir_module_ =
+        mlir::parseSourceFile<mlir::ModuleOp>(mlir_module_path, &context_);
+    if (!mlir_module_) {
+      return tsl::Status(
+          absl::StatusCode::kNotFound,
+          absl::StrCat("Could not find MLIR module at ", mlir_module_path));
+    }
+    return tsl::OkStatus();
+  }
+
+  DialectRegistry registry_;
+  MLIRContext context_;
+  OwningOpRef<mlir::ModuleOp> mlir_module_;
+};
+
+TEST_F(TensorflowDialectToExecutorTest, ConvertsToExecutor) {
+  TF_ASSERT_OK(CreateMlirModule("empty_func.mlir"));
+
+  TF_EXPECT_OK(ExportFromTensorflowDialectToExecutor(*mlir_module_));
+}
+
+TEST_F(TensorflowDialectToExecutorTest, ErrorsWhenCannotConvert) {
+  TF_ASSERT_OK(CreateMlirModule("invalid_executor.mlir"));
 
-TEST(TensorflowDialectToExecutor, ConvertsToExecutor) {
-  ModuleOp module;
-  TF_ASSERT_OK(ExportFromTensorflowDialectToExecutor(module));
+  EXPECT_FALSE(ExportFromTensorflowDialectToExecutor(*mlir_module_).ok());
 }
 
 }  // namespace

From ae244a6866bf5efc02ab5806e1037df9a52a5523 Mon Sep 17 00:00:00 2001
From: George Karpenkov <cheshire@google.com>
Date: Fri, 29 Sep 2023 08:57:58 -0700
Subject: [PATCH 423/567] [SE] [NFC] Remove unused APIs from dnn.h

PiperOrigin-RevId: 569504937
---
 third_party/xla/xla/stream_executor/dnn.h     | 308 -----------
 .../xla/xla/stream_executor/rocm/rocm_dnn.cc  | 485 ------------------
 .../xla/xla/stream_executor/rocm/rocm_dnn.h   | 115 -----
 third_party/xla/xla/stream_executor/stream.cc |  19 -
 third_party/xla/xla/stream_executor/stream.h  |  11 -
 5 files changed, 938 deletions(-)

diff --git a/third_party/xla/xla/stream_executor/dnn.h b/third_party/xla/xla/stream_executor/dnn.h
index 1cf598e7f37141..52ccb08117604d 100644
--- a/third_party/xla/xla/stream_executor/dnn.h
+++ b/third_party/xla/xla/stream_executor/dnn.h
@@ -1965,71 +1965,6 @@ class DnnSupport {
       absl::Span<const DeviceMemory<float>* const> input_data,
       DeviceMemory<float>* output_data) = 0;
 
-  // Concatenates several layers into one, by concatenating each in the
-  // x-dimension or y-dimension, based on a user-specified flag.
-  // For x-concatenation, layers are aligned at matching y and depth
-  // coordinates, and for y-concatenation, they are aligned at matching x and
-  // depth coordinates. The inputs must all have the same depth and batch size.
-  // For x-concatenation, the inputs must have the same height (y-size), and the
-  // output will have the same depth and height as the inputs and its width (x-
-  // size) will be the sum of the input widths.  For y-concatenation, the inputs
-  // must have the same width, and the output will have the same depth and width
-  // as the inputs, and its height will be the sum of the input heights.
-  //
-  // Arguments:
-  //  stream: borrowed pointer to the stream that the 'space concatenate'
-  //    operation should be enqueued onto.
-  //  input_dimensions: the dimensions of each input.
-  //  input_data: un-owned device memory region which contains the input data
-  //    for each input layer.
-  //  output_data: un-owned device memory region in which to place the space
-  //    concatenate result.
-  //  concat_direction:  either dnn:SpaceConcatenateMode::XDirection or
-  //    dnn::SpaceConcatenateMode::YDirection.
-  virtual bool DoSpaceConcatenate(
-      Stream* stream, absl::Span<const dnn::BatchDescriptor> input_dimensions,
-      absl::Span<const DeviceMemory<float>* const> input_data,
-      DeviceMemory<float>* output_data,
-      dnn::SpaceConcatenateMode concat_direction) {
-    return false;
-  }
-
-  // Change the layout of the data by shrinking one dimension (or set of
-  // dimensions) and growing another dimension (or set of dimensions), while
-  // keeping the total number of data elements constant, and maintaining the
-  // current data ordering.
-  //
-  // Currently, the only supported operation is depth into space by a power of
-  // 2. E.g. (y, x, z) -> (y*2, x*2, z/4)
-  //
-  // Note that Reshape may not be a no-op, depending on the platform and which
-  // dimensions are being changed.
-  //
-  // Example: forgetting about batch for the moment, let's take a tensor that's
-  // 2x1x8 (y by x by z) and reshape to a tensor that's 4x2x2. The memory layout
-  // is row-major order: y,x,z. I.e. z changes the fastest, then x, then y. The
-  // elements of the tensor range from 0 to 15. The x,y,z indices are below each
-  // element.
-  //
-  //  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15
-  // y0 y0 y0 y0 y0 y0 y0 y0 y1 y1 y1 y1 y1 y1 y1 y1
-  // x0 x0 x0 x0 x0 x0 x0 x0 x0 x0 x0 x0 x0 x0 x0 x0
-  // z0 z1 z2 z3 z4 z5 z6 z7 z0 z1 z2 z3 z4 z5 z6 z7
-  //
-  // reshape to 4x2x2
-  //
-  //  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15
-  // y0 y0 y0 y0 y1 y1 y1 y1 y2 y2 y2 y2 y3 y3 y3 y3
-  // x0 x0 x1 x1 x0 x0 x1 x1 x0 x0 x1 x1 x0 x0 x1 x1
-  // z0 z1 z0 z1 z0 z1 z0 z1 z0 z1 z0 z1 z0 z1 z0 z1
-  virtual bool DoReshape(Stream* stream,
-                         const dnn::BatchDescriptor& input_dimensions,
-                         const DeviceMemory<float>& input_data,
-                         const dnn::BatchDescriptor& output_dimensions,
-                         DeviceMemory<float>* output_data) {
-    return false;
-  }
-
   // Depth to space takes an X by Y image with depth D*M^2 and changes it to an
   // MX x MY image with depth D. Each input location (x,y) with depth D*M^2 in
   // the input image is changed to an MxM contiguous area in the output image,
@@ -2059,35 +1994,6 @@ class DnnSupport {
     return false;
   }
 
-  // Space to depth is the inverse of depth to space. Space to depth takes each
-  // non-overlapping M by M patch (in the X and Y dimensions) with depth D of
-  // the input, and transforms it to a 1 by 1 patch with depth D*M^2. If the
-  // input has size (MX, MY, D), the output has size (X, Y, D*M^2). The number
-  // of data elements is not changed.
-  //
-  // Example.
-  // M=2, Din =2, Xin=4, Yin=4,  Dout=8
-  // DepthHeightWidth layout
-  // Values within a 'cell' are at different depths and same x & y.
-  // Input:
-  // ae bf im jn
-  // cg dh ko lp
-  // qu rv y2 z3
-  // sw tx 04 15
-  // Output:
-  // abcdefgh  ijklmnop
-  // qrstuvwx  yz012345
-  //
-  // sqrt_depth_increase: 'M' in the comment above
-  virtual bool DoSpaceToDepth(Stream* stream,
-                              const dnn::BatchDescriptor& input_dimensions,
-                              const DeviceMemory<float>& input_data,
-                              const DepthToSpaceLayout& space_to_depth_layout,
-                              const int& sqrt_depth_increase,
-                              DeviceMemory<float>* output_data) {
-    return false;
-  }
-
   // Computes the specified operation (e.g. addition or multiplication)
   // between corresponding elements in the inputs and stores the result in the
   // output element.
@@ -2644,220 +2550,6 @@ class DnnSupport {
     return false;
   }
 
-  // Enqueues a fused convolution+bias+activation operation onto the stream.
-  //
-  // Arguments (all borrowed):
-  //
-  //  stream: borrowed pointer to the stream that the 'fusion' operation should
-  //  be enqueued onto.
-  //
-  //  conv_input_descriptor: dimensions of the convolution input layer.
-  //  conv_input_data: device memory which contains the convolution input.
-  //
-  //  filter_descriptor: dimensions of the convolution filter.
-  //  filter_data: device memory which contains the convolution filter weights.
-  //
-  //  convolution_descriptor: stride of the convolution filter.
-  //
-  //  bias_descriptor: dimensions of the bias layer
-  //  biases: device memory region containing biases to add to the convolution
-  //  output
-  //
-  //  activation_mode: Type of activation to perform.
-  //
-  //  output_descriptor: dimensions of the output layer.
-  //  output_data: device memory region in which to place the fusion result.
-  //
-  //  output_profile_result: the output profile result for this call.
-  //         The profiling is only enabled when this is not nullptr.
-  //
-  virtual bool DoFusedConvolutionBiasActivation(
-      Stream* stream, const dnn::BatchDescriptor& conv_input_descriptor,
-      const DeviceMemory<float>& conv_input_data,
-      const dnn::FilterDescriptor& filter_descriptor,
-      const DeviceMemory<float>& filter_data,
-      const dnn::ConvolutionDescriptor& convolution_descriptor,
-      const dnn::BatchDescriptor& bias_descriptor,
-      const DeviceMemory<float>& bias_data, dnn::ActivationMode activation_mode,
-      const dnn::BatchDescriptor& output_descriptor,
-      DeviceMemory<float>* output_data,
-      dnn::ProfileResult* output_profile_result) {
-    return false;
-  }
-
-  // Enqueues a fused batchnorm+activation (inference) operation onto the
-  // stream.
-  //
-  // Arguments (all borrowed):
-  //
-  //  stream: borrowed pointer to the stream that the 'fusion' operation should
-  //  be enqueued onto.
-  //
-  //  x_descriptor: dimensions of the batchnorm input layer.
-  //  x_data: device memory which contains the batchnorm input.
-  //
-  //  scale_offset_mean_variance_descriptor:
-  //      dimensions of the scale/offset/mean/variance tensor.
-  //  scale_data: device memory which contains the scale input.
-  //  offset_data: device memory which contains the offset input.
-  //  mean_data: device memory which contains the mean input.
-  //  variance_data: device memory which contains the variance input.
-  //  epsilon : the epsilon value to use in batchnorm calculation
-  //
-  //  activation_mode: Type of activation to perform.
-  //
-  //  y_data: device memory region in which to place the fusion result.
-  //
-  //  output_profile_result: the output profile result for this call.
-  //         The profiling is only enabled when this is not nullptr.
-  //
-  virtual bool DoFusedBatchNormActivationInference(
-      Stream* stream, const dnn::BatchDescriptor& x_descriptor,
-      const DeviceMemory<float>& x_data,
-      const dnn::BatchDescriptor& scale_offset_mean_variance_descriptor,
-      const DeviceMemory<float>& scale_data,
-      const DeviceMemory<float>& offset_data,
-      const DeviceMemory<float>& mean_data,
-      const DeviceMemory<float>& variance_data, double epsilon,
-      dnn::ActivationMode activation_mode, DeviceMemory<float>* y_data,
-      dnn::ProfileResult* output_profile_result) {
-    return false;
-  }
-
-  virtual bool DoFusedBatchNormActivationInference(
-      Stream* stream, const dnn::BatchDescriptor& x_descriptor,
-      const DeviceMemory<Eigen::half>& x_data,
-      const dnn::BatchDescriptor& scale_offset_mean_variance_descriptor,
-      const DeviceMemory<float>& scale_data,
-      const DeviceMemory<float>& offset_data,
-      const DeviceMemory<float>& mean_data,
-      const DeviceMemory<float>& variance_data, double epsilon,
-      dnn::ActivationMode activation_mode, DeviceMemory<Eigen::half>* y_data,
-      dnn::ProfileResult* output_profile_result) {
-    return false;
-  }
-
-  // Enqueues a fused batchnorm+activation (training-fwd) operation onto the
-  // stream.
-  //
-  // Arguments (all borrowed):
-  //
-  //  stream: borrowed pointer to the stream that the 'fusion' operation should
-  //  be enqueued onto.
-  //
-  //  x_descriptor: dimensions of the batchnorm input layer.
-  //  x_data: device memory which contains the batchnorm input.
-  //
-  //  scale_offset_mean_variance_descriptor:
-  //      dimensions of the scale/offset/mean/variance tensor.
-  //  scale_data: device memory which contains the scale input.
-  //  offset_data: device memory which contains the offset input.
-  //  epsilon : the epsilon value to use in batchnorm calculation
-  //
-  //  activation_mode: Type of activation to perform.
-  //
-  //  y_data: device memory region in which to place the fusion result.
-  //  batch_mean_data: device memory in which to place the batch mean output.
-  //  batch_var_data: device memory in which to place the batch variance output.
-  //  saved_mean_data: device memory in which to save the mean for bwd pass.
-  //  saved_var_data: device memory in which to save the variance for bwd pass.
-  //
-  //  output_profile_result: the output profile result for this call.
-  //         The profiling is only enabled when this is not nullptr.
-  //
-  virtual bool DoFusedBatchNormActivationForward(
-      Stream* stream, const dnn::BatchDescriptor& x_descriptor,
-      const DeviceMemory<float>& x_data,
-      const dnn::BatchDescriptor& scale_offset_mean_variance_descriptor,
-      const DeviceMemory<float>& scale_data,
-      const DeviceMemory<float>& offset_data, double epsilon,
-      dnn::ActivationMode activation_mode, DeviceMemory<float>* y_data,
-      DeviceMemory<float>* batch_mean_data, DeviceMemory<float>* batch_var_data,
-      DeviceMemory<float>* saved_mean_data, DeviceMemory<float>* saved_var_data,
-      dnn::ProfileResult* output_profile_result) {
-    return false;
-  }
-
-  virtual bool DoFusedBatchNormActivationForward(
-      Stream* stream, const dnn::BatchDescriptor& x_descriptor,
-      const DeviceMemory<Eigen::half>& x_data,
-      const dnn::BatchDescriptor& scale_offset_mean_variance_descriptor,
-      const DeviceMemory<float>& scale_data,
-      const DeviceMemory<float>& offset_data, double epsilon,
-      dnn::ActivationMode activation_mode, DeviceMemory<Eigen::half>* y_data,
-      DeviceMemory<float>* batch_mean_data, DeviceMemory<float>* batch_var_data,
-      DeviceMemory<float>* saved_mean_data, DeviceMemory<float>* saved_var_data,
-      dnn::ProfileResult* output_profile_result) {
-    return false;
-  }
-
-  // Enqueues a fused batchnorm+activation (training-bwd) operation onto the
-  // stream.
-  //
-  // Arguments (all borrowed):
-  //
-  //  stream: borrowed pointer to the stream that the 'fusion' operation should
-  //  be enqueued onto.
-  //
-  //  y_act_backprop_descriptor: dimensions of the backprop input from the
-  //  previous layer. y_act_backprop_data: device memory which contains the
-  //  backprop input.
-  //
-  //  y_act_data: device memory which contains the actv-fwd output data.
-  //
-  //  activation_mode: actv-fwd type.
-  //
-  //  scale_offset_mean_variance_descriptor:
-  //      dimensions of the scale/offset/mean/variance tensor.
-  //  scale_data: device memory which contains the scale input.
-  //  offset_data: device memory which contains the offset input.
-  //  saved_mean_data: device memory which contains the saved mean from fwd
-  //  pass. saved_var_data: device memory which contains the saved variance from
-  //  fwd pass.
-  //
-  //  x_bn_backprop_data: device memory region in which to place the backprop
-  //  data from this layer scale_backprop_data: device memory in which to place
-  //  the scale backprop output. offset_backprop_data: device memory in which to
-  //  place the offset backprop output.
-  //
-  //  output_profile_result: the output profile result for this call.
-  //         The profiling is only enabled when this is not nullptr.
-  //
-  virtual bool DoFusedBatchNormActivationBackward(
-      Stream* stream, const dnn::BatchDescriptor& y_act_backprop_descriptor,
-      const DeviceMemory<float>& y_act_backprop_data,
-      const DeviceMemory<float>& y_act_data,
-      dnn::ActivationMode activation_mode, const DeviceMemory<float>& x_bn_data,
-      const dnn::BatchDescriptor& scale_offset_mean_variance_descriptor,
-      const DeviceMemory<float>& scale_data,
-      const DeviceMemory<float>& offset_data,
-      const DeviceMemory<float>& saved_mean_data,
-      const DeviceMemory<float>& saved_var_data,
-      DeviceMemory<float>* x_bn_backprop_data,
-      DeviceMemory<float>* scale_backprop_data,
-      DeviceMemory<float>* offset_backprop_data,
-      dnn::ProfileResult* output_profile_result) {
-    return false;
-  }
-
-  virtual bool DoFusedBatchNormActivationBackward(
-      Stream* stream, const dnn::BatchDescriptor& y_act_backprop_descriptor,
-      const DeviceMemory<Eigen::half>& y_act_backprop_data,
-      const DeviceMemory<Eigen::half>& y_act_data,
-      dnn::ActivationMode activation_mode,
-      const DeviceMemory<Eigen::half>& x_bn_data,
-      const dnn::BatchDescriptor& scale_offset_mean_variance_descriptor,
-      const DeviceMemory<float>& scale_data,
-      const DeviceMemory<float>& offset_data,
-      const DeviceMemory<float>& saved_mean_data,
-      const DeviceMemory<float>& saved_var_data,
-      DeviceMemory<Eigen::half>* x_bn_backprop_data,
-      DeviceMemory<float>* scale_backprop_data,
-      DeviceMemory<float>* offset_backprop_data,
-      dnn::ProfileResult* output_profile_result) {
-    return false;
-  }
-
   // Notifies that a stream is being destroyed and should be invalidated from
   // any internal caching.  This exists to allow the CUDA implementation to
   // avoid redundant cudnnSetStream calls without risking problems when a stream
diff --git a/third_party/xla/xla/stream_executor/rocm/rocm_dnn.cc b/third_party/xla/xla/stream_executor/rocm/rocm_dnn.cc
index d533b781f71f8d..77efe5126c257c 100644
--- a/third_party/xla/xla/stream_executor/rocm/rocm_dnn.cc
+++ b/third_party/xla/xla/stream_executor/rocm/rocm_dnn.cc
@@ -4793,491 +4793,6 @@ bool MIOpenSupport::DeriveOutputBatchDescriptor(
   return true;
 }
 
-template <typename T>
-bool MIOpenSupport::DoFusedConvolutionBiasActivationImpl(
-    Stream* stream,
-    int miopen_type,  // Actually miopenDataType_t.
-    const dnn::BatchDescriptor& conv_input_descriptor,
-    const DeviceMemory<T>& conv_input_data,
-    const dnn::FilterDescriptor& filter_descriptor,
-    const DeviceMemory<T>& filter_data,
-    const dnn::ConvolutionDescriptor& convolution_descriptor,
-    const dnn::BatchDescriptor& bias_descriptor,
-    const DeviceMemory<T>& bias_data, dnn::ActivationMode activation_mode,
-    const dnn::BatchDescriptor& output_descriptor, DeviceMemory<T>* output_data,
-    dnn::ProfileResult* output_profile_result) {
-  auto miopen = miopen_->GetHandle(parent_, stream);
-
-  ScopedTensorDescriptor conv_input_nd{
-      conv_input_descriptor, static_cast<miopenDataType_t>(miopen_type)};
-
-  ScopedTensorDescriptor bias_nd{bias_descriptor,
-                                 static_cast<miopenDataType_t>(miopen_type)};
-
-  ScopedTensorDescriptor output_nd{output_descriptor,
-                                   static_cast<miopenDataType_t>(miopen_type)};
-
-  ScopedConvolutionDescriptor conv{convolution_descriptor,
-                                   static_cast<miopenDataType_t>(miopen_type)};
-
-  ScopedFilterDescriptor filter{filter_descriptor,
-                                static_cast<miopenDataType_t>(miopen_type)};
-
-  ScopedActivationDescriptor activation_desc{activation_mode};
-
-  ScopedFusionPlanConvolutionBiasActivation fusion_plan{
-      miopen.handle(), conv_input_nd.handle(), filter.handle(),
-      conv.handle(),   bias_nd.handle(),       activation_desc};
-
-  bool retval = false;
-
-  if (fusion_plan.CompilationSucceeded()) {
-    std::optional<GpuTimer> timer;
-    const bool is_profiling = output_profile_result != nullptr;
-
-    if (is_profiling) {
-      auto timer_or_status = GpuTimer::Create(AsGpuStream(stream));
-      if (!timer_or_status.ok()) {
-        LOG(ERROR) << "Failed to create timer";
-        return false;
-      }
-      timer.emplace(std::move(*timer_or_status));
-    }
-
-    miopenStatus_t status = miopenStatusSuccess;
-
-    if (status == miopenStatusSuccess) {
-      fusion_plan.SetConvolutionArgs(filter_data.opaque());
-    }
-
-    if (status == miopenStatusSuccess) {
-      status = fusion_plan.SetBiasArgs(bias_data.opaque());
-    }
-
-    if (status == miopenStatusSuccess) {
-      status = fusion_plan.SetActivationForwardArgs(activation_desc);
-    }
-
-    if (status == miopenStatusSuccess) {
-      status =
-          fusion_plan.Execute(conv_input_nd.handle(), conv_input_data.opaque(),
-                              output_nd.handle(), output_data->opaque());
-    }
-
-    if (is_profiling) {
-      if (status == miopenStatusSuccess) {
-        tsl::StatusOr<absl::Duration> elapsed = timer->GetElapsedDuration();
-        if (!elapsed.ok()) {
-          LOG(ERROR) << "Failed to get elapsed duration";
-          return false;
-        }
-        output_profile_result->set_elapsed_time_in_ms(
-            absl::ToDoubleMilliseconds(*elapsed));
-      }
-    }
-
-    if (status != miopenStatusSuccess) {
-      // Silently return when we are profiling.
-      if (!is_profiling) {
-        LOG(FATAL) << "failed to enqueue fused-convolution on stream: "
-                   << ToString(status);
-      }
-    }
-
-    retval = true;
-  }
-
-  return retval;
-}
-
-bool MIOpenSupport::DoFusedConvolutionBiasActivation(
-    Stream* stream, const dnn::BatchDescriptor& conv_input_descriptor,
-    const DeviceMemory<float>& conv_input_data,
-    const dnn::FilterDescriptor& filter_descriptor,
-    const DeviceMemory<float>& filter_data,
-    const dnn::ConvolutionDescriptor& convolution_descriptor,
-    const dnn::BatchDescriptor& bias_descriptor,
-    const DeviceMemory<float>& bias_data, dnn::ActivationMode activation_mode,
-    const dnn::BatchDescriptor& output_descriptor,
-    DeviceMemory<float>* output_data,
-    dnn::ProfileResult* output_profile_result) {
-  return DoFusedConvolutionBiasActivationImpl<float>(
-      stream, miopenFloat, conv_input_descriptor, conv_input_data,
-      filter_descriptor, filter_data, convolution_descriptor, bias_descriptor,
-      bias_data, activation_mode, output_descriptor, output_data,
-      output_profile_result);
-}
-
-template <typename T, typename U>
-bool MIOpenSupport::DoFusedBatchNormActivationInferenceImpl(
-    Stream* stream,
-    int miopen_type,  // Actually miopenDataType_t.
-    const dnn::BatchDescriptor& x_descriptor, const DeviceMemory<T>& x_data,
-    const dnn::BatchDescriptor& scale_offset_mean_variance_descriptor,
-    const DeviceMemory<U>& scale_data, const DeviceMemory<U>& offset_data,
-    const DeviceMemory<U>& mean_data, const DeviceMemory<U>& variance_data,
-    double epsilon, dnn::ActivationMode activation_mode,
-    DeviceMemory<T>* y_data, dnn::ProfileResult* output_profile_result) {
-  auto miopen = miopen_->GetHandle(parent_, stream);
-
-  ScopedTensorDescriptor x_nd{x_descriptor,
-                              static_cast<miopenDataType_t>(miopen_type)};
-
-  ScopedTensorDescriptor scale_offset_mean_variance_nd{
-      scale_offset_mean_variance_descriptor,
-      static_cast<miopenDataType_t>(miopen_type)};
-
-  ScopedActivationDescriptor activation_desc{activation_mode};
-
-  ScopedFusionPlanBatchNormActivationInference fusion_plan{
-      miopen.handle(), x_nd.handle(), scale_offset_mean_variance_nd.handle(),
-      activation_desc};
-
-  bool retval = false;
-
-  if (fusion_plan.CompilationSucceeded()) {
-    std::optional<GpuTimer> timer;
-    const bool is_profiling = output_profile_result != nullptr;
-
-    if (is_profiling) {
-      auto timer_or_status = GpuTimer::Create(AsGpuStream(stream));
-      if (!timer_or_status.ok()) {
-        LOG(ERROR) << "Failed to create timer";
-        return false;
-      }
-      timer.emplace(std::move(*timer_or_status));
-    }
-
-    miopenStatus_t status = miopenStatusSuccess;
-
-    if (status == miopenStatusSuccess) {
-      fusion_plan.SetBatchNormInferenceArgs(
-          scale_data.opaque(), offset_data.opaque(), mean_data.opaque(),
-          variance_data.opaque(), epsilon);
-    }
-
-    if (status == miopenStatusSuccess) {
-      status = fusion_plan.SetActivationForwardArgs(activation_desc);
-    }
-
-    if (status == miopenStatusSuccess) {
-      status = fusion_plan.Execute(x_nd.handle(), x_data.opaque(),
-                                   x_nd.handle(), y_data->opaque());
-    }
-
-    if (is_profiling) {
-      if (status == miopenStatusSuccess) {
-        tsl::StatusOr<absl::Duration> elapsed = timer->GetElapsedDuration();
-        if (!elapsed.ok()) {
-          LOG(ERROR) << "Failed to get elapsed duration";
-          return false;
-        }
-        output_profile_result->set_elapsed_time_in_ms(
-            absl::ToDoubleMilliseconds(*elapsed));
-      }
-    }
-
-    if (status != miopenStatusSuccess) {
-      // Silently return when we are profiling.
-      if (!is_profiling) {
-        LOG(FATAL) << "failed to enqueue fused-convolution on stream: "
-                   << ToString(status);
-      }
-    }
-
-    retval = true;
-  }
-
-  return retval;
-}
-
-bool MIOpenSupport::DoFusedBatchNormActivationInference(
-    Stream* stream, const dnn::BatchDescriptor& x_descriptor,
-    const DeviceMemory<float>& x_data,
-    const dnn::BatchDescriptor& scale_offset_mean_variance_descriptor,
-    const DeviceMemory<float>& scale_data,
-    const DeviceMemory<float>& offset_data,
-    const DeviceMemory<float>& mean_data,
-    const DeviceMemory<float>& variance_data, double epsilon,
-    dnn::ActivationMode activation_mode, DeviceMemory<float>* y_data,
-    dnn::ProfileResult* output_profile_result) {
-  return DoFusedBatchNormActivationInferenceImpl<float, float>(
-      stream, miopenFloat, x_descriptor, x_data,
-      scale_offset_mean_variance_descriptor, scale_data, offset_data, mean_data,
-      variance_data, epsilon, activation_mode, y_data, output_profile_result);
-}
-
-bool MIOpenSupport::DoFusedBatchNormActivationInference(
-    Stream* stream, const dnn::BatchDescriptor& x_descriptor,
-    const DeviceMemory<Eigen::half>& x_data,
-    const dnn::BatchDescriptor& scale_offset_mean_variance_descriptor,
-    const DeviceMemory<float>& scale_data,
-    const DeviceMemory<float>& offset_data,
-    const DeviceMemory<float>& mean_data,
-    const DeviceMemory<float>& variance_data, double epsilon,
-    dnn::ActivationMode activation_mode, DeviceMemory<Eigen::half>* y_data,
-    dnn::ProfileResult* output_profile_result) {
-  return DoFusedBatchNormActivationInferenceImpl<Eigen::half, float>(
-      stream, miopenHalf, x_descriptor, x_data,
-      scale_offset_mean_variance_descriptor, scale_data, offset_data, mean_data,
-      variance_data, epsilon, activation_mode, y_data, output_profile_result);
-}
-
-template <typename T, typename U>
-bool MIOpenSupport::DoFusedBatchNormActivationForwardImpl(
-    Stream* stream,
-    int miopen_type,  // Actually miopenDataType_t.
-    const dnn::BatchDescriptor& x_descriptor, const DeviceMemory<T>& x_data,
-    const dnn::BatchDescriptor& scale_offset_mean_variance_descriptor,
-    const DeviceMemory<U>& scale_data, const DeviceMemory<U>& offset_data,
-    double epsilon, dnn::ActivationMode activation_mode,
-    DeviceMemory<T>* y_data, DeviceMemory<U>* batch_mean_data,
-    DeviceMemory<U>* batch_var_data, DeviceMemory<U>* saved_mean_data,
-    DeviceMemory<U>* saved_var_data,
-    dnn::ProfileResult* output_profile_result) {
-  auto miopen = miopen_->GetHandle(parent_, stream);
-
-  ScopedTensorDescriptor x_nd{x_descriptor,
-                              static_cast<miopenDataType_t>(miopen_type)};
-
-  ScopedTensorDescriptor scale_offset_mean_variance_nd{
-      scale_offset_mean_variance_descriptor,
-      static_cast<miopenDataType_t>(miopen_type)};
-
-  ScopedActivationDescriptor activation_desc{activation_mode};
-
-  ScopedFusionPlanBatchNormActivationForward fusion_plan{
-      miopen.handle(), x_nd.handle(), scale_offset_mean_variance_nd.handle(),
-      activation_desc};
-
-  bool retval = false;
-
-  if (fusion_plan.CompilationSucceeded()) {
-    std::optional<GpuTimer> timer;
-    const bool is_profiling = output_profile_result != nullptr;
-
-    if (is_profiling) {
-      auto timer_or_status = GpuTimer::Create(AsGpuStream(stream));
-      if (!timer_or_status.ok()) {
-        LOG(ERROR) << "Failed to create timer";
-        return false;
-      }
-      timer.emplace(std::move(*timer_or_status));
-    }
-
-    miopenStatus_t status = miopenStatusSuccess;
-
-    if (status == miopenStatusSuccess) {
-      fusion_plan.SetBatchNormForwardArgs(
-          scale_data.opaque(), offset_data.opaque(), batch_mean_data->opaque(),
-          batch_var_data->opaque(), saved_mean_data->opaque(),
-          saved_var_data->opaque(), epsilon);
-    }
-
-    if (status == miopenStatusSuccess) {
-      status = fusion_plan.SetActivationForwardArgs(activation_desc);
-    }
-
-    if (status == miopenStatusSuccess) {
-      status = fusion_plan.Execute(x_nd.handle(), x_data.opaque(),
-                                   x_nd.handle(), y_data->opaque());
-    }
-
-    if (is_profiling) {
-      if (status == miopenStatusSuccess) {
-        tsl::StatusOr<absl::Duration> elapsed = timer->GetElapsedDuration();
-        if (!elapsed.ok()) {
-          LOG(ERROR) << "Failed to get elapsed duration";
-          return false;
-        }
-        output_profile_result->set_elapsed_time_in_ms(
-            absl::ToDoubleMilliseconds(*elapsed));
-      }
-    }
-
-    if (status != miopenStatusSuccess) {
-      // Silently return when we are profiling.
-      if (!is_profiling) {
-        LOG(FATAL) << "failed to enqueue fused-convolution on stream: "
-                   << ToString(status);
-      }
-    }
-
-    retval = true;
-  }
-
-  return retval;
-}
-
-bool MIOpenSupport::DoFusedBatchNormActivationForward(
-    Stream* stream, const dnn::BatchDescriptor& x_descriptor,
-    const DeviceMemory<float>& x_data,
-    const dnn::BatchDescriptor& scale_offset_mean_variance_descriptor,
-    const DeviceMemory<float>& scale_data,
-    const DeviceMemory<float>& offset_data, double epsilon,
-    dnn::ActivationMode activation_mode, DeviceMemory<float>* y_data,
-    DeviceMemory<float>* batch_mean_data, DeviceMemory<float>* batch_var_data,
-    DeviceMemory<float>* saved_mean_data, DeviceMemory<float>* saved_var_data,
-    dnn::ProfileResult* output_profile_result) {
-  return DoFusedBatchNormActivationForwardImpl<float, float>(
-      stream, miopenFloat, x_descriptor, x_data,
-      scale_offset_mean_variance_descriptor, scale_data, offset_data, epsilon,
-      activation_mode, y_data, batch_mean_data, batch_var_data, saved_mean_data,
-      saved_var_data, output_profile_result);
-}
-
-bool MIOpenSupport::DoFusedBatchNormActivationForward(
-    Stream* stream, const dnn::BatchDescriptor& x_descriptor,
-    const DeviceMemory<Eigen::half>& x_data,
-    const dnn::BatchDescriptor& scale_offset_mean_variance_descriptor,
-    const DeviceMemory<float>& scale_data,
-    const DeviceMemory<float>& offset_data, double epsilon,
-    dnn::ActivationMode activation_mode, DeviceMemory<Eigen::half>* y_data,
-    DeviceMemory<float>* batch_mean_data, DeviceMemory<float>* batch_var_data,
-    DeviceMemory<float>* saved_mean_data, DeviceMemory<float>* saved_var_data,
-    dnn::ProfileResult* output_profile_result) {
-  return DoFusedBatchNormActivationForwardImpl<Eigen::half, float>(
-      stream, miopenHalf, x_descriptor, x_data,
-      scale_offset_mean_variance_descriptor, scale_data, offset_data, epsilon,
-      activation_mode, y_data, batch_mean_data, batch_var_data, saved_mean_data,
-      saved_var_data, output_profile_result);
-}
-
-template <typename T, typename U>
-bool MIOpenSupport::DoFusedBatchNormActivationBackwardImpl(
-    Stream* stream,
-    int miopen_type,  // Actually miopenDataType_t.
-    const dnn::BatchDescriptor& y_act_backprop_descriptor,
-    const DeviceMemory<T>& y_act_backprop_data,
-    const DeviceMemory<T>& y_act_data, dnn::ActivationMode activation_mode,
-    const DeviceMemory<T>& x_bn_data,
-    const dnn::BatchDescriptor& scale_offset_mean_variance_descriptor,
-    const DeviceMemory<U>& scale_data, const DeviceMemory<U>& offset_data,
-    const DeviceMemory<U>& saved_mean_data,
-    const DeviceMemory<U>& saved_var_data, DeviceMemory<T>* x_bn_backprop_data,
-    DeviceMemory<U>* scale_backprop_data, DeviceMemory<U>* offset_backprop_data,
-    dnn::ProfileResult* output_profile_result) {
-  auto miopen = miopen_->GetHandle(parent_, stream);
-
-  ScopedTensorDescriptor y_act_backprop_nd{
-      y_act_backprop_descriptor, static_cast<miopenDataType_t>(miopen_type)};
-
-  ScopedTensorDescriptor scale_offset_mean_variance_nd{
-      scale_offset_mean_variance_descriptor,
-      static_cast<miopenDataType_t>(miopen_type)};
-
-  ScopedActivationDescriptor activation_desc{activation_mode};
-
-  ScopedFusionPlanBatchNormActivationBackward fusion_plan{
-      miopen.handle(), y_act_backprop_nd.handle(),
-      scale_offset_mean_variance_nd.handle(), activation_desc};
-
-  bool retval = false;
-
-  if (fusion_plan.CompilationSucceeded()) {
-    std::optional<GpuTimer> timer;
-    const bool is_profiling = output_profile_result != nullptr;
-
-    if (is_profiling) {
-      auto timer_or_status = GpuTimer::Create(AsGpuStream(stream));
-      if (!timer_or_status.ok()) {
-        LOG(ERROR) << "Failed to create timer";
-        return false;
-      }
-      timer.emplace(std::move(*timer_or_status));
-    }
-
-    miopenStatus_t status = miopenStatusSuccess;
-
-    if (status == miopenStatusSuccess) {
-      fusion_plan.SetBatchNormBackwardArgs(
-          x_bn_data.opaque(), scale_data.opaque(), offset_data.opaque(),
-          saved_mean_data.opaque(), saved_var_data.opaque(),
-          scale_backprop_data->opaque(), offset_backprop_data->opaque());
-    }
-
-    if (status == miopenStatusSuccess) {
-      status = fusion_plan.SetActivationBackwardArgs(activation_desc,
-                                                     y_act_data.opaque());
-    }
-
-    if (status == miopenStatusSuccess) {
-      status = fusion_plan.Execute(
-          y_act_backprop_nd.handle(), y_act_backprop_data.opaque(),
-          y_act_backprop_nd.handle(), x_bn_backprop_data->opaque());
-    }
-
-    if (is_profiling) {
-      if (status == miopenStatusSuccess) {
-        tsl::StatusOr<absl::Duration> elapsed = timer->GetElapsedDuration();
-        if (!elapsed.ok()) {
-          LOG(ERROR) << "Failed to get elapsed duration";
-          return false;
-        }
-        output_profile_result->set_elapsed_time_in_ms(
-            absl::ToDoubleMilliseconds(*elapsed));
-      }
-    }
-
-    if (status != miopenStatusSuccess) {
-      // Silently return when we are profiling.
-      if (!is_profiling) {
-        LOG(FATAL) << "failed to enqueue fused-convolution on stream: "
-                   << ToString(status);
-      }
-    }
-
-    retval = true;
-  }
-
-  return retval;
-}
-
-bool MIOpenSupport::DoFusedBatchNormActivationBackward(
-    Stream* stream, const dnn::BatchDescriptor& y_act_backprop_descriptor,
-    const DeviceMemory<float>& y_act_backprop_data,
-    const DeviceMemory<float>& y_act_data, dnn::ActivationMode activation_mode,
-    const DeviceMemory<float>& x_bn_data,
-    const dnn::BatchDescriptor& scale_offset_mean_variance_descriptor,
-    const DeviceMemory<float>& scale_data,
-    const DeviceMemory<float>& offset_data,
-    const DeviceMemory<float>& saved_mean_data,
-    const DeviceMemory<float>& saved_var_data,
-    DeviceMemory<float>* x_bn_backprop_data,
-    DeviceMemory<float>* scale_backprop_data,
-    DeviceMemory<float>* offset_backprop_data,
-    dnn::ProfileResult* output_profile_result) {
-  return DoFusedBatchNormActivationBackwardImpl<float, float>(
-      stream, miopenFloat, y_act_backprop_descriptor, y_act_backprop_data,
-      y_act_data, activation_mode, x_bn_data,
-      scale_offset_mean_variance_descriptor, scale_data, offset_data,
-      saved_mean_data, saved_var_data, x_bn_backprop_data, scale_backprop_data,
-      offset_backprop_data, output_profile_result);
-}
-
-bool MIOpenSupport::DoFusedBatchNormActivationBackward(
-    Stream* stream, const dnn::BatchDescriptor& y_act_backprop_descriptor,
-    const DeviceMemory<Eigen::half>& y_act_backprop_data,
-    const DeviceMemory<Eigen::half>& y_act_data,
-    dnn::ActivationMode activation_mode,
-    const DeviceMemory<Eigen::half>& x_bn_data,
-    const dnn::BatchDescriptor& scale_offset_mean_variance_descriptor,
-    const DeviceMemory<float>& scale_data,
-    const DeviceMemory<float>& offset_data,
-    const DeviceMemory<float>& saved_mean_data,
-    const DeviceMemory<float>& saved_var_data,
-    DeviceMemory<Eigen::half>* x_bn_backprop_data,
-    DeviceMemory<float>* scale_backprop_data,
-    DeviceMemory<float>* offset_backprop_data,
-    dnn::ProfileResult* output_profile_result) {
-  return DoFusedBatchNormActivationBackwardImpl<Eigen::half, float>(
-      stream, miopenHalf, y_act_backprop_descriptor, y_act_backprop_data,
-      y_act_data, activation_mode, x_bn_data,
-      scale_offset_mean_variance_descriptor, scale_data, offset_data,
-      saved_mean_data, saved_var_data, x_bn_backprop_data, scale_backprop_data,
-      offset_backprop_data, output_profile_result);
-}
-
 }  // namespace gpu
 
 void initialize_miopen() {
diff --git a/third_party/xla/xla/stream_executor/rocm/rocm_dnn.h b/third_party/xla/xla/stream_executor/rocm/rocm_dnn.h
index 119ee2b67ce435..f20727e7227078 100644
--- a/third_party/xla/xla/stream_executor/rocm/rocm_dnn.h
+++ b/third_party/xla/xla/stream_executor/rocm/rocm_dnn.h
@@ -526,81 +526,6 @@ class MIOpenSupport : public dnn::DnnSupport {
       DeviceMemory<float>* output_data,
       dnn::ProfileResult* output_profile_result) override;
 
-  bool DoFusedBatchNormActivationInference(
-      Stream* stream, const dnn::BatchDescriptor& x_descriptor,
-      const DeviceMemory<float>& x_data,
-      const dnn::BatchDescriptor& scale_mean_variance_descriptor,
-      const DeviceMemory<float>& scale_data,
-      const DeviceMemory<float>& offset_data,
-      const DeviceMemory<float>& mean_data,
-      const DeviceMemory<float>& variance_data, double epsilon,
-      dnn::ActivationMode activation_mode, DeviceMemory<float>* y_data,
-      dnn::ProfileResult* output_profile_result) override;
-
-  bool DoFusedBatchNormActivationInference(
-      Stream* stream, const dnn::BatchDescriptor& x_descriptor,
-      const DeviceMemory<Eigen::half>& x_data,
-      const dnn::BatchDescriptor& scale_mean_variance_descriptor,
-      const DeviceMemory<float>& scale_data,
-      const DeviceMemory<float>& offset_data,
-      const DeviceMemory<float>& mean_data,
-      const DeviceMemory<float>& variance_data, double epsilon,
-      dnn::ActivationMode activation_mode, DeviceMemory<Eigen::half>* y_data,
-      dnn::ProfileResult* output_profile_result) override;
-
-  bool DoFusedBatchNormActivationForward(
-      Stream* stream, const dnn::BatchDescriptor& x_descriptor,
-      const DeviceMemory<float>& x_data,
-      const dnn::BatchDescriptor& scale_offset_mean_variance_descriptor,
-      const DeviceMemory<float>& scale_data,
-      const DeviceMemory<float>& offset_data, double epsilon,
-      dnn::ActivationMode activation_mode, DeviceMemory<float>* y_data,
-      DeviceMemory<float>* batch_mean_data, DeviceMemory<float>* batch_var_data,
-      DeviceMemory<float>* saved_mean_data, DeviceMemory<float>* saved_var_data,
-      dnn::ProfileResult* output_profile_result) override;
-
-  bool DoFusedBatchNormActivationForward(
-      Stream* stream, const dnn::BatchDescriptor& x_descriptor,
-      const DeviceMemory<Eigen::half>& x_data,
-      const dnn::BatchDescriptor& scale_offset_mean_variance_descriptor,
-      const DeviceMemory<float>& scale_data,
-      const DeviceMemory<float>& offset_data, double epsilon,
-      dnn::ActivationMode activation_mode, DeviceMemory<Eigen::half>* y_data,
-      DeviceMemory<float>* batch_mean_data, DeviceMemory<float>* batch_var_data,
-      DeviceMemory<float>* saved_mean_data, DeviceMemory<float>* saved_var_data,
-      dnn::ProfileResult* output_profile_result) override;
-
-  bool DoFusedBatchNormActivationBackward(
-      Stream* stream, const dnn::BatchDescriptor& y_act_backprop_descriptor,
-      const DeviceMemory<float>& y_act_backprop_data,
-      const DeviceMemory<float>& y_act_data,
-      dnn::ActivationMode activation_mode, const DeviceMemory<float>& x_bn_data,
-      const dnn::BatchDescriptor& scale_offset_mean_variance_descriptor,
-      const DeviceMemory<float>& scale_data,
-      const DeviceMemory<float>& offset_data,
-      const DeviceMemory<float>& saved_mean_data,
-      const DeviceMemory<float>& saved_var_data,
-      DeviceMemory<float>* x_bn_backprop_data,
-      DeviceMemory<float>* scale_backprop_data,
-      DeviceMemory<float>* offset_backprop_data,
-      dnn::ProfileResult* output_profile_result) override;
-
-  bool DoFusedBatchNormActivationBackward(
-      Stream* stream, const dnn::BatchDescriptor& y_act_backprop_descriptor,
-      const DeviceMemory<Eigen::half>& y_act_backprop_data,
-      const DeviceMemory<Eigen::half>& y_act_data,
-      dnn::ActivationMode activation_mode,
-      const DeviceMemory<Eigen::half>& x_bn_data,
-      const dnn::BatchDescriptor& scale_offset_mean_variance_descriptor,
-      const DeviceMemory<float>& scale_data,
-      const DeviceMemory<float>& offset_data,
-      const DeviceMemory<float>& saved_mean_data,
-      const DeviceMemory<float>& saved_var_data,
-      DeviceMemory<Eigen::half>* x_bn_backprop_data,
-      DeviceMemory<float>* scale_backprop_data,
-      DeviceMemory<float>* offset_backprop_data,
-      dnn::ProfileResult* output_profile_result) override;
-
   GpuExecutor* GetParentExecutor() { return parent_; }
 
   tsl::Status DoCtcLoss(Stream* stream, dnn::DataType element_type,
@@ -713,46 +638,6 @@ class MIOpenSupport : public dnn::DnnSupport {
       const dnn::BatchDescriptor& output_descriptor,
       DeviceMemory<T>* output_data, dnn::ProfileResult* output_profile_result);
 
-  template <typename T, typename U>
-  bool DoFusedBatchNormActivationInferenceImpl(
-      Stream* stream,
-      int miopen_type,  // Actually miopenDataType_t.
-      const dnn::BatchDescriptor& x_descriptor, const DeviceMemory<T>& x_data,
-      const dnn::BatchDescriptor& scale_offset_mean_variance_descriptor,
-      const DeviceMemory<U>& scale_data, const DeviceMemory<U>& offset_data,
-      const DeviceMemory<U>& mean_data, const DeviceMemory<U>& variance_data,
-      double epsilon, dnn::ActivationMode activation_mode,
-      DeviceMemory<T>* y_data, dnn::ProfileResult* output_profile_result);
-
-  template <typename T, typename U>
-  bool DoFusedBatchNormActivationForwardImpl(
-      Stream* stream,
-      int miopen_type,  // Actually miopenDataType_t.
-      const dnn::BatchDescriptor& x_descriptor, const DeviceMemory<T>& x_data,
-      const dnn::BatchDescriptor& scale_offset_mean_variance_descriptor,
-      const DeviceMemory<U>& scale_data, const DeviceMemory<U>& offset_data,
-      double epsilon, dnn::ActivationMode activation_mode,
-      DeviceMemory<T>* y_data, DeviceMemory<U>* batch_mean_data,
-      DeviceMemory<U>* batch_var_data, DeviceMemory<U>* saved_mean_data,
-      DeviceMemory<U>* saved_var_data,
-      dnn::ProfileResult* output_profile_result);
-
-  template <typename T, typename U>
-  bool DoFusedBatchNormActivationBackwardImpl(
-      Stream* stream,
-      int miopen_type,  // Actually miopenDataType_t.
-      const dnn::BatchDescriptor& y_act_backprop_descriptor,
-      const DeviceMemory<T>& y_act_backprop_data,
-      const DeviceMemory<T>& y_act_data, dnn::ActivationMode activation_mode,
-      const DeviceMemory<T>& x_bn_data,
-      const dnn::BatchDescriptor& scale_offset_mean_variance_descriptor,
-      const DeviceMemory<U>& scale_data, const DeviceMemory<U>& offset_data,
-      const DeviceMemory<U>& saved_mean_data,
-      const DeviceMemory<U>& saved_var_data,
-      DeviceMemory<T>* x_bn_backprop_data, DeviceMemory<U>* scale_backprop_data,
-      DeviceMemory<U>* offset_backprop_data,
-      dnn::ProfileResult* output_profile_result);
-
   tsl::Status DoPrepareForConvolution(
       dnn::ConvolutionKind kind, dnn::DataType element_type, Stream* stream,
       const dnn::BatchDescriptor& input_descriptor, DeviceMemoryBase input_data,
diff --git a/third_party/xla/xla/stream_executor/stream.cc b/third_party/xla/xla/stream_executor/stream.cc
index 04de3eaeb77643..c3846e4e495e90 100644
--- a/third_party/xla/xla/stream_executor/stream.cc
+++ b/third_party/xla/xla/stream_executor/stream.cc
@@ -747,25 +747,6 @@ Stream &Stream::ThenDepthToSpace(
   return *this;
 }
 
-Stream &Stream::ThenSpaceToDepth(
-    const dnn::BatchDescriptor &input_dimensions,
-    const DeviceMemory<float> &input_data,
-    const dnn::DepthToSpaceLayout &space_to_depth_layout,
-    const int sqrt_depth_increase, DeviceMemory<float> *output_data) {
-  VLOG_CALL(PARAM(input_dimensions), PARAM(input_data),
-            PARAM(space_to_depth_layout), PARAM(sqrt_depth_increase),
-            PARAM(output_data));
-
-  if (dnn::DnnSupport *dnn = parent_->AsDnn()) {
-    CheckError(dnn->DoSpaceToDepth(this, input_dimensions, input_data,
-                                   space_to_depth_layout, sqrt_depth_increase,
-                                   output_data));
-  } else {
-    SetErrorAndLogNoDnnSupport();
-  }
-  return *this;
-}
-
 Stream &Stream::ThenElementwiseOperate(
     dnn::ElementwiseOperation operation,
     absl::Span<const dnn::BatchDescriptor> input_dimensions,
diff --git a/third_party/xla/xla/stream_executor/stream.h b/third_party/xla/xla/stream_executor/stream.h
index 28cdbfa9dc5cbc..219e91d4e5b0e2 100644
--- a/third_party/xla/xla/stream_executor/stream.h
+++ b/third_party/xla/xla/stream_executor/stream.h
@@ -626,17 +626,6 @@ class Stream {
                            const int sqrt_depth_reduction,
                            DeviceMemory<float> *output_data);
 
-  // Space to depth is the inverse of depth to space. Space to depth takes each
-  // non-overlapping M by M patch (in the X and Y dimensions) with depth D of
-  // the input, and transforms it to a 1 by 1 patch with depth D*M². If the
-  // input has size (MX, MY, D), the output has size (X, Y, D*M²). The number of
-  // data elements is not changed.
-  Stream &ThenSpaceToDepth(const dnn::BatchDescriptor &input_dimensions,
-                           const DeviceMemory<float> &input_data,
-                           const dnn::DepthToSpaceLayout &space_to_depth_layout,
-                           const int sqrt_depth_increase,
-                           DeviceMemory<float> *output_data);
-
   Stream &ThenElementwiseOperate(
       dnn::ElementwiseOperation operation,
       absl::Span<const dnn::BatchDescriptor> input_dimensions,

From 8df5223f556b7a0cf7126348bed0d872bcd79883 Mon Sep 17 00:00:00 2001
From: Sandeep Dasgupta <sdasgup@google.com>
Date: Fri, 29 Sep 2023 09:08:26 -0700
Subject: [PATCH 424/567] Integrate StableHLO at openxla/stablehlo@2eec6db

PiperOrigin-RevId: 569507553
---
 third_party/stablehlo/temporary.patch         | 81 -------------------
 third_party/stablehlo/workspace.bzl           |  4 +-
 .../xla/third_party/stablehlo/temporary.patch | 81 -------------------
 .../xla/third_party/stablehlo/workspace.bzl   |  4 +-
 4 files changed, 4 insertions(+), 166 deletions(-)

diff --git a/third_party/stablehlo/temporary.patch b/third_party/stablehlo/temporary.patch
index 4c4228163a6f04..88a5bdaf0d603a 100644
--- a/third_party/stablehlo/temporary.patch
+++ b/third_party/stablehlo/temporary.patch
@@ -980,87 +980,6 @@ diff --ruN a/stablehlo/stablehlo/dialect/ExperimentalOps.h b/stablehlo/stablehlo
 +}  // namespace mlir
 +
 +#endif  // STABLEHLO_DIALECT_EXPERIMENTAL_OPS_H
-diff --ruN a/stablehlo/stablehlo/reference/Process.cpp b/stablehlo/stablehlo/reference/Process.cpp
---- stablehlo/stablehlo/reference/Process.cpp
-+++ stablehlo/stablehlo/reference/Process.cpp
-@@ -44,7 +44,7 @@
- 
- void Process::outfeed(ArrayRef<Tensor> inputs) { grid_->outfeed(inputs); }
- 
--const std::shared_ptr<RendezvousResult> Process::rendezvous(
-+std::shared_ptr<RendezvousResult const> Process::rendezvous(
-     ProcessGroup processGroup, ChannelId channelId, const Tensor &operand) {
-   return grid_->rendezvous(processGroup, channelId, getId(), operand);
- }
-diff --ruN a/stablehlo/stablehlo/reference/Process.h b/stablehlo/stablehlo/reference/Process.h
---- stablehlo/stablehlo/reference/Process.h
-+++ stablehlo/stablehlo/reference/Process.h
-@@ -54,7 +54,7 @@
-   void outfeed(ArrayRef<Tensor> inputs);
- 
-   /// See `ProcessGrid::rendezvous`.
--  const std::shared_ptr<RendezvousResult> rendezvous(ProcessGroup processGroup,
-+  std::shared_ptr<RendezvousResult const> rendezvous(ProcessGroup processGroup,
-                                                      ChannelId channelId,
-                                                      const Tensor &operand);
- 
-diff --ruN a/stablehlo/stablehlo/reference/ProcessGrid.cpp b/stablehlo/stablehlo/reference/ProcessGrid.cpp
---- stablehlo/stablehlo/reference/ProcessGrid.cpp
-+++ stablehlo/stablehlo/reference/ProcessGrid.cpp
-@@ -66,13 +66,13 @@
-   result_[processId] = tensor;
- }
- 
--Tensor RendezvousResult::lookup(ProcessId processId) {
-+Tensor RendezvousResult::lookup(ProcessId processId) const {
-   auto it = result_.find(processId);
-   if (it != result_.end()) return it->second;
-   return {};
- }
- 
--SmallVector<Tensor> RendezvousResult::getSortedTensors() {
-+SmallVector<Tensor> RendezvousResult::getSortedTensors() const {
-   return llvm::to_vector(
-       llvm::map_range(result_, [](const auto &pair) { return pair.second; }));
- }
-@@ -162,7 +162,7 @@
- 
- void ProcessGrid::outfeed(ArrayRef<Tensor> inputs) { outfeed_.push(inputs); }
- 
--const std::shared_ptr<RendezvousResult> ProcessGrid::rendezvous(
-+std::shared_ptr<RendezvousResult const> ProcessGrid::rendezvous(
-     ProcessGroup processGroup, ChannelId channelId, ProcessId processId,
-     const Tensor &operand) {
-   std::pair<ProcessGroup, ChannelId> channelKey(processGroup, channelId);
-diff --ruN a/stablehlo/stablehlo/reference/ProcessGrid.h b/stablehlo/stablehlo/reference/ProcessGrid.h
---- stablehlo/stablehlo/reference/ProcessGrid.h
-+++ stablehlo/stablehlo/reference/ProcessGrid.h
-@@ -75,14 +75,14 @@
-   /// Iterates through the (ProcessId, Tensor) map entires and returns a vector
-   /// of Tensors sorted by ProcessId--(replicaId, partitionId) pair--in
-   /// lexicographical order.
--  SmallVector<Tensor> getSortedTensors();
-+  SmallVector<Tensor> getSortedTensors() const;
- 
-   /// Inserts `tensor` into the map using the key `processId`.
-   void insert(ProcessId processId, Tensor tensor);
- 
-   /// Iterates through the map and returns the value associated with the key
-   /// `processId`. If key is not found, return an empty `Tensor`.
--  Tensor lookup(ProcessId processId);
-+  Tensor lookup(ProcessId processId) const;
- 
-  private:
-   /// Internal map representation of the result of `ProcessGrid::rendezvous`.
-@@ -134,7 +134,7 @@
-   /// tensors are accumulated in `RendezvousResult` whose shard pointer is
-   /// returned to all callers once the barrier has been reached by all StableHLO
-   /// processes.
--  const std::shared_ptr<RendezvousResult> rendezvous(ProcessGroup processGroup,
-+  std::shared_ptr<RendezvousResult const> rendezvous(ProcessGroup processGroup,
-                                                      ChannelId channelId,
-                                                      ProcessId processId,
-                                                      const Tensor &operand);
 diff --ruN a/stablehlo/stablehlo/tests/stablehlo_canonicalize_dynamism.mlir b/stablehlo/stablehlo/tests/stablehlo_canonicalize_dynamism.mlir
 --- stablehlo/stablehlo/tests/stablehlo_canonicalize_dynamism.mlir
 +++ stablehlo/stablehlo/tests/stablehlo_canonicalize_dynamism.mlir
diff --git a/third_party/stablehlo/workspace.bzl b/third_party/stablehlo/workspace.bzl
index 31d099aa3e2bac..ebf1c7041e4100 100644
--- a/third_party/stablehlo/workspace.bzl
+++ b/third_party/stablehlo/workspace.bzl
@@ -4,8 +4,8 @@ load("//third_party:repo.bzl", "tf_http_archive", "tf_mirror_urls")
 
 def repo():
     # LINT.IfChange
-    STABLEHLO_COMMIT = "887e0692ae727555c6b0e00402897742a409de54"
-    STABLEHLO_SHA256 = "ffa1d97f4ad96313c94289f2d4343bcec2268a08c9cad841de52e830cc35f9b0"
+    STABLEHLO_COMMIT = "2eec6dbd23dc5bd9d51f98c5768edf5b1a470ff7"
+    STABLEHLO_SHA256 = "a5e31ed87bb70005fa421abc355e692dc9d82c83c681a3d8d89ab15c607fc1f9"
     # LINT.ThenChange(Google-internal path)
 
     tf_http_archive(
diff --git a/third_party/xla/third_party/stablehlo/temporary.patch b/third_party/xla/third_party/stablehlo/temporary.patch
index 4c4228163a6f04..88a5bdaf0d603a 100644
--- a/third_party/xla/third_party/stablehlo/temporary.patch
+++ b/third_party/xla/third_party/stablehlo/temporary.patch
@@ -980,87 +980,6 @@ diff --ruN a/stablehlo/stablehlo/dialect/ExperimentalOps.h b/stablehlo/stablehlo
 +}  // namespace mlir
 +
 +#endif  // STABLEHLO_DIALECT_EXPERIMENTAL_OPS_H
-diff --ruN a/stablehlo/stablehlo/reference/Process.cpp b/stablehlo/stablehlo/reference/Process.cpp
---- stablehlo/stablehlo/reference/Process.cpp
-+++ stablehlo/stablehlo/reference/Process.cpp
-@@ -44,7 +44,7 @@
- 
- void Process::outfeed(ArrayRef<Tensor> inputs) { grid_->outfeed(inputs); }
- 
--const std::shared_ptr<RendezvousResult> Process::rendezvous(
-+std::shared_ptr<RendezvousResult const> Process::rendezvous(
-     ProcessGroup processGroup, ChannelId channelId, const Tensor &operand) {
-   return grid_->rendezvous(processGroup, channelId, getId(), operand);
- }
-diff --ruN a/stablehlo/stablehlo/reference/Process.h b/stablehlo/stablehlo/reference/Process.h
---- stablehlo/stablehlo/reference/Process.h
-+++ stablehlo/stablehlo/reference/Process.h
-@@ -54,7 +54,7 @@
-   void outfeed(ArrayRef<Tensor> inputs);
- 
-   /// See `ProcessGrid::rendezvous`.
--  const std::shared_ptr<RendezvousResult> rendezvous(ProcessGroup processGroup,
-+  std::shared_ptr<RendezvousResult const> rendezvous(ProcessGroup processGroup,
-                                                      ChannelId channelId,
-                                                      const Tensor &operand);
- 
-diff --ruN a/stablehlo/stablehlo/reference/ProcessGrid.cpp b/stablehlo/stablehlo/reference/ProcessGrid.cpp
---- stablehlo/stablehlo/reference/ProcessGrid.cpp
-+++ stablehlo/stablehlo/reference/ProcessGrid.cpp
-@@ -66,13 +66,13 @@
-   result_[processId] = tensor;
- }
- 
--Tensor RendezvousResult::lookup(ProcessId processId) {
-+Tensor RendezvousResult::lookup(ProcessId processId) const {
-   auto it = result_.find(processId);
-   if (it != result_.end()) return it->second;
-   return {};
- }
- 
--SmallVector<Tensor> RendezvousResult::getSortedTensors() {
-+SmallVector<Tensor> RendezvousResult::getSortedTensors() const {
-   return llvm::to_vector(
-       llvm::map_range(result_, [](const auto &pair) { return pair.second; }));
- }
-@@ -162,7 +162,7 @@
- 
- void ProcessGrid::outfeed(ArrayRef<Tensor> inputs) { outfeed_.push(inputs); }
- 
--const std::shared_ptr<RendezvousResult> ProcessGrid::rendezvous(
-+std::shared_ptr<RendezvousResult const> ProcessGrid::rendezvous(
-     ProcessGroup processGroup, ChannelId channelId, ProcessId processId,
-     const Tensor &operand) {
-   std::pair<ProcessGroup, ChannelId> channelKey(processGroup, channelId);
-diff --ruN a/stablehlo/stablehlo/reference/ProcessGrid.h b/stablehlo/stablehlo/reference/ProcessGrid.h
---- stablehlo/stablehlo/reference/ProcessGrid.h
-+++ stablehlo/stablehlo/reference/ProcessGrid.h
-@@ -75,14 +75,14 @@
-   /// Iterates through the (ProcessId, Tensor) map entires and returns a vector
-   /// of Tensors sorted by ProcessId--(replicaId, partitionId) pair--in
-   /// lexicographical order.
--  SmallVector<Tensor> getSortedTensors();
-+  SmallVector<Tensor> getSortedTensors() const;
- 
-   /// Inserts `tensor` into the map using the key `processId`.
-   void insert(ProcessId processId, Tensor tensor);
- 
-   /// Iterates through the map and returns the value associated with the key
-   /// `processId`. If key is not found, return an empty `Tensor`.
--  Tensor lookup(ProcessId processId);
-+  Tensor lookup(ProcessId processId) const;
- 
-  private:
-   /// Internal map representation of the result of `ProcessGrid::rendezvous`.
-@@ -134,7 +134,7 @@
-   /// tensors are accumulated in `RendezvousResult` whose shard pointer is
-   /// returned to all callers once the barrier has been reached by all StableHLO
-   /// processes.
--  const std::shared_ptr<RendezvousResult> rendezvous(ProcessGroup processGroup,
-+  std::shared_ptr<RendezvousResult const> rendezvous(ProcessGroup processGroup,
-                                                      ChannelId channelId,
-                                                      ProcessId processId,
-                                                      const Tensor &operand);
 diff --ruN a/stablehlo/stablehlo/tests/stablehlo_canonicalize_dynamism.mlir b/stablehlo/stablehlo/tests/stablehlo_canonicalize_dynamism.mlir
 --- stablehlo/stablehlo/tests/stablehlo_canonicalize_dynamism.mlir
 +++ stablehlo/stablehlo/tests/stablehlo_canonicalize_dynamism.mlir
diff --git a/third_party/xla/third_party/stablehlo/workspace.bzl b/third_party/xla/third_party/stablehlo/workspace.bzl
index 31d099aa3e2bac..ebf1c7041e4100 100644
--- a/third_party/xla/third_party/stablehlo/workspace.bzl
+++ b/third_party/xla/third_party/stablehlo/workspace.bzl
@@ -4,8 +4,8 @@ load("//third_party:repo.bzl", "tf_http_archive", "tf_mirror_urls")
 
 def repo():
     # LINT.IfChange
-    STABLEHLO_COMMIT = "887e0692ae727555c6b0e00402897742a409de54"
-    STABLEHLO_SHA256 = "ffa1d97f4ad96313c94289f2d4343bcec2268a08c9cad841de52e830cc35f9b0"
+    STABLEHLO_COMMIT = "2eec6dbd23dc5bd9d51f98c5768edf5b1a470ff7"
+    STABLEHLO_SHA256 = "a5e31ed87bb70005fa421abc355e692dc9d82c83c681a3d8d89ab15c607fc1f9"
     # LINT.ThenChange(Google-internal path)
 
     tf_http_archive(

From 334d12a1a4d2cda5f3b4b3cdca1a9214051e238b Mon Sep 17 00:00:00 2001
From: Peter Hawkins <phawkins@google.com>
Date: Fri, 29 Sep 2023 09:34:37 -0700
Subject: [PATCH 425/567] [PJRT] Split the GpuId() platform constants into
 CudaId()/RocmId().

Similarly for the GpuName() constant.

While most of the time we treat CUDA and ROCm GPUs identically, we sometimes want to distinguish between CUDA and ROCm (e.g., for DLPack exports) and it's helpful if this is encoded in the platform ID.

PiperOrigin-RevId: 569513495
---
 .../core/common_runtime/gpu/gpu_device.cc     |  7 ++++-
 .../xla/xla/pjrt/c/pjrt_c_api_gpu_test.cc     |  8 ++++-
 third_party/xla/xla/pjrt/gpu/BUILD            |  2 ++
 .../xla/xla/pjrt/gpu/se_gpu_pjrt_client.cc    |  8 ++++-
 .../xla/xla/pjrt/gpu/se_gpu_pjrt_client.h     |  3 +-
 .../xla/pjrt/gpu/se_gpu_pjrt_client_test.cc   |  3 +-
 .../xla/xla/pjrt/gpu/se_gpu_pjrt_compiler.cc  |  5 ++--
 .../xla/pjrt/gpu/se_gpu_pjrt_compiler_test.cc |  9 +++---
 third_party/xla/xla/pjrt/pjrt_compiler.h      | 20 +++++++++----
 third_party/xla/xla/python/BUILD              |  2 +-
 third_party/xla/xla/python/dlpack.cc          | 29 +++++--------------
 third_party/xla/xla/python/py_array.cc        | 10 ++++++-
 third_party/xla/xla/python/py_buffer.cc       |  4 +--
 third_party/xla/xla/python/py_client.cc       | 11 +++++++
 third_party/xla/xla/python/py_client.h        |  5 ++--
 .../xla/xla/python/py_host_callback.cc        |  4 ++-
 third_party/xla/xla/python/xla.cc             | 27 +++++++++++++++--
 third_party/xla/xla/python/xla_client_test.py | 21 +++++++++-----
 18 files changed, 122 insertions(+), 56 deletions(-)

diff --git a/tensorflow/core/common_runtime/gpu/gpu_device.cc b/tensorflow/core/common_runtime/gpu/gpu_device.cc
index ea9f5cbc694e20..c97d1329cda024 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_device.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_device.cc
@@ -1831,9 +1831,14 @@ Status BaseGPUDeviceFactory::CreateDevices(
     // Creates PJRT GPU client and places it into a TF global resource manager.
     auto gpu_run_options =
         std::make_unique<xla::gpu::GpuExecutableRunOptions>();
+#if TENSORFLOW_USE_ROCM
+    auto platform_name = xla::RocmName();
+#else   // TENSORFLOW_USE_ROCM
+    auto platform_name = xla::CudaName();
+#endif  // TENSORFLOW_USE_ROCM
     std::unique_ptr<xla::PjRtClient> pjrt_client =
         std::make_unique<xla::StreamExecutorGpuClient>(
-            xla::GpuName(), xla_client, std::move(pjrt_devices),
+            platform_name, xla_client, std::move(pjrt_devices),
             /*process_index=*/numa_node,
             /*allocator=*/std::move(allocator_adapter),
             /*host_memory_allocator=*/std::move(pjrt_gpu_host_allocator),
diff --git a/third_party/xla/xla/pjrt/c/pjrt_c_api_gpu_test.cc b/third_party/xla/xla/pjrt/c/pjrt_c_api_gpu_test.cc
index 319ee8cec112db..6fd6115ca79905 100644
--- a/third_party/xla/xla/pjrt/c/pjrt_c_api_gpu_test.cc
+++ b/third_party/xla/xla/pjrt/c/pjrt_c_api_gpu_test.cc
@@ -50,9 +50,15 @@ limitations under the License.
 namespace pjrt {
 namespace {
 
+#ifdef TENSORFLOW_USE_ROCM
 const bool kUnused = (RegisterPjRtCApiTestFactory([]() { return GetPjrtApi(); },
-                                                  /*platform_name=*/"gpu"),
+                                                  /*platform_name=*/"rocm"),
                       true);
+#else   // TENSORFLOW_USE_ROCM
+const bool kUnused = (RegisterPjRtCApiTestFactory([]() { return GetPjrtApi(); },
+                                                  /*platform_name=*/"cuda"),
+                      true);
+#endif  // TENSORFLOW_USE_ROCM
 
 class PjrtCApiGpuTest : public PjrtCApiTestBase {
  public:
diff --git a/third_party/xla/xla/pjrt/gpu/BUILD b/third_party/xla/xla/pjrt/gpu/BUILD
index eb1ddc2fad0b38..389554049783f9 100644
--- a/third_party/xla/xla/pjrt/gpu/BUILD
+++ b/third_party/xla/xla/pjrt/gpu/BUILD
@@ -84,6 +84,7 @@ cc_library(
         "@local_tsl//tsl/lib/strings:proto_serialization",
         "@local_tsl//tsl/platform:env",
         "@local_tsl//tsl/platform:errors",
+        "@local_tsl//tsl/platform:fingerprint",
         "@local_tsl//tsl/profiler/lib:connected_traceme",
         "@local_tsl//tsl/util:env_var",
         "@tf_runtime//:hostcontext",
@@ -264,6 +265,7 @@ xla_cc_test(
         "//xla/client:xla_computation",
         "//xla/mlir_hlo",
         "//xla/pjrt:pjrt_client",
+        "//xla/pjrt:pjrt_compiler",
         "//xla/service:gpu_plugin",
         "//xla/service:hlo_parser",
         "//xla/stream_executor/cuda:cublas_plugin",
diff --git a/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_client.cc b/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_client.cc
index b1916e471c8b6a..9e314cf2eefd06 100644
--- a/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_client.cc
+++ b/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_client.cc
@@ -1052,8 +1052,14 @@ StatusOr<std::unique_ptr<PjRtClient>> GetStreamExecutorGpuClient(
     devices = BuildLocalDevices(std::move(local_device_states), node_id);
   }
 
+#if TENSORFLOW_USE_ROCM
+  auto pjrt_platform_name = xla::RocmName();
+#else   // TENSORFLOW_USE_ROCM
+  auto pjrt_platform_name = xla::CudaName();
+#endif  // TENSORFLOW_USE_ROCM
+
   return std::unique_ptr<PjRtClient>(std::make_unique<StreamExecutorGpuClient>(
-      GpuName(), xla_client, std::move(devices),
+      pjrt_platform_name, xla_client, std::move(devices),
       /*node_id=*/node_id, std::move(allocator),
       std::move(host_memory_allocator), should_stage_host_to_device_transfers,
       /*gpu_run_options=*/std::move(gpu_run_options)));
diff --git a/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_client.h b/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_client.h
index be6b4cb75a9777..c989b341547072 100644
--- a/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_client.h
+++ b/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_client.h
@@ -32,6 +32,7 @@ limitations under the License.
 #include "xla/pjrt/pjrt_executable.h"
 #include "xla/pjrt/pjrt_stream_executor_client.h"
 #include "xla/statusor.h"
+#include "tsl/platform/fingerprint.h"
 
 namespace stream_executor {
 
@@ -169,7 +170,7 @@ class StreamExecutorGpuClient : public xla::PjRtStreamExecutorClient {
             std::move(allocator), std::move(host_memory_allocator),
             should_stage_host_to_device_transfers, std::move(gpu_run_options)),
         topology_(xla::StreamExecutorGpuTopologyDescription::Create(
-            xla::GpuId(), std::move(platform_name),
+            tsl::Fingerprint64(platform_name), platform_name,
             devices_.back()->device_kind(), devices_)) {}
 
   xla::StatusOr<xla::DeviceAssignment> GetDefaultDeviceAssignment(
diff --git a/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_client_test.cc b/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_client_test.cc
index da9c50ddc4b70f..93efaaccf3fc44 100644
--- a/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_client_test.cc
+++ b/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_client_test.cc
@@ -591,7 +591,8 @@ TEST(StreamExecutorGpuClientTest, DistributeInit) {
               /*node_id=*/i, num_nodes, /*allowed_devices=*/std::nullopt,
               /*platform_name=*/std::nullopt,
               /*should_stage_host_to_device_transfers=*/true, kv_get, kv_put));
-      EXPECT_EQ(client->platform_name(), "gpu");
+      EXPECT_TRUE(client->platform_name() == "cuda" ||
+                  client->platform_name() == "rocm");
       EXPECT_EQ(client->addressable_device_count(), 2);
       EXPECT_EQ(client->device_count(), 4);
     });
diff --git a/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_compiler.cc b/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_compiler.cc
index 2535b5d5308ada..6a4dbadf3405d5 100644
--- a/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_compiler.cc
+++ b/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_compiler.cc
@@ -17,6 +17,7 @@ limitations under the License.
 
 #include <memory>
 #include <optional>
+#include <utility>
 
 #include "absl/status/status.h"
 #include "xla/client/xla_computation.h"
@@ -51,7 +52,7 @@ namespace xla {
 namespace {
 
 bool IsGpuClient(const PjRtClient& client) {
-  return client.platform_id() == GpuId();
+  return client.platform_id() == CudaId() || client.platform_id() == RocmId();
 }
 
 bool IsSameTopology(const PjRtTopologyDescription& topology1,
@@ -191,6 +192,6 @@ StreamExecutorGpuCompiler::Compile(CompileOptions options,
 REGISTER_MODULE_INITIALIZER(pjrt_register_se_gpu_compiler, {
   std::unique_ptr<PjRtCompiler> compiler =
       std::make_unique<StreamExecutorGpuCompiler>();
-  PjRtRegisterCompiler(GpuName(), std::move(compiler));
+  PjRtRegisterCompiler(CudaName(), std::move(compiler));
 });
 }  // namespace xla
diff --git a/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_compiler_test.cc b/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_compiler_test.cc
index d4bc1beb3f74e4..94c197cdeda9cb 100644
--- a/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_compiler_test.cc
+++ b/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_compiler_test.cc
@@ -25,6 +25,7 @@ limitations under the License.
 #include "xla/mlir_hlo/mhlo/IR/hlo_ops.h"
 #include "xla/pjrt/gpu/se_gpu_pjrt_client.h"
 #include "xla/pjrt/pjrt_client.h"
+#include "xla/pjrt/pjrt_compiler.h"
 #include "xla/service/hlo_parser.h"
 #include "xla/tests/literal_test_util.h"
 #include "tsl/platform/status_matchers.h"
@@ -58,7 +59,7 @@ absl::StatusOr<xla::XlaComputation> GetXlaComputation(
 
 TEST(StreamExecutorGpuCompilerTest, NoClientXla) {
   StreamExecutorGpuCompiler compiler;
-  StreamExecutorGpuTopologyDescription topology(GpuId(), GpuName(),
+  StreamExecutorGpuTopologyDescription topology(CudaId(), CudaName(),
                                                 "Fake_device", {0, 1});
 
   TF_ASSERT_OK_AND_ASSIGN(auto computation, GetXlaComputation(kProgram));
@@ -69,7 +70,7 @@ TEST(StreamExecutorGpuCompilerTest, NoClientXla) {
 
 TEST(StreamExecutorGpuCompilerTest, TopologyNotSameXla) {
   StreamExecutorGpuCompiler compiler;
-  StreamExecutorGpuTopologyDescription topology(GpuId(), GpuName(),
+  StreamExecutorGpuTopologyDescription topology(CudaId(), CudaName(),
                                                 "Fake_device", {0, 1});
 
   TF_ASSERT_OK_AND_ASSIGN(
@@ -117,7 +118,7 @@ TEST(StreamExecutorGpuCompilerTest, NoClientMlir) {
   auto mlir_module =
       mlir::parseSourceString<mlir::ModuleOp>(mlir_str, &context);
 
-  StreamExecutorGpuTopologyDescription topology(GpuId(), GpuName(),
+  StreamExecutorGpuTopologyDescription topology(CudaId(), CudaName(),
                                                 "Fake_device", {0, 1});
 
   EXPECT_THAT(
@@ -135,7 +136,7 @@ TEST(StreamExecutorGpuCompilerTest, TopologyNotSameMlir) {
   auto mlir_module =
       mlir::parseSourceString<mlir::ModuleOp>(mlir_str, &context);
 
-  StreamExecutorGpuTopologyDescription topology(GpuId(), GpuName(),
+  StreamExecutorGpuTopologyDescription topology(CudaId(), CudaName(),
                                                 "Fake_device", {0, 1});
 
   TF_ASSERT_OK_AND_ASSIGN(
diff --git a/third_party/xla/xla/pjrt/pjrt_compiler.h b/third_party/xla/xla/pjrt/pjrt_compiler.h
index 886c687cdd9956..f310bf96d2ae38 100644
--- a/third_party/xla/xla/pjrt/pjrt_compiler.h
+++ b/third_party/xla/xla/pjrt/pjrt_compiler.h
@@ -36,9 +36,13 @@ inline const char* CpuName() {
   static constexpr char kCpuName[] = "cpu";
   return kCpuName;
 }
-inline const char* GpuName() {
-  static constexpr char kGpuName[] = "gpu";
-  return kGpuName;
+inline const char* CudaName() {
+  static constexpr char kCudaName[] = "cuda";
+  return kCudaName;
+}
+inline const char* RocmName() {
+  static constexpr char kRocmName[] = "rocm";
+  return kRocmName;
 }
 inline const char* TpuName() {
   static constexpr char kTpuName[] = "tpu";
@@ -48,9 +52,13 @@ inline PjRtPlatformId CpuId() {
   static const PjRtPlatformId kCpuId = tsl::Fingerprint64(CpuName());
   return kCpuId;
 }
-inline PjRtPlatformId GpuId() {
-  static const PjRtPlatformId kGpuId = tsl::Fingerprint64(GpuName());
-  return kGpuId;
+inline PjRtPlatformId CudaId() {
+  static const PjRtPlatformId kCudaId = tsl::Fingerprint64(CudaName());
+  return kCudaId;
+}
+inline PjRtPlatformId RocmId() {
+  static const PjRtPlatformId kRocmId = tsl::Fingerprint64(RocmName());
+  return kRocmId;
 }
 inline PjRtPlatformId TpuId() {
   static const PjRtPlatformId kTpuId = tsl::Fingerprint64(TpuName());
diff --git a/third_party/xla/xla/python/BUILD b/third_party/xla/xla/python/BUILD
index 7d9b296bd27a9e..8b714b77af91ba 100644
--- a/third_party/xla/xla/python/BUILD
+++ b/third_party/xla/xla/python/BUILD
@@ -488,7 +488,7 @@ cc_library(
         "//xla:types",
         "//xla:util",
         "//xla/pjrt:pjrt_client",
-        "//xla/pjrt/gpu:se_gpu_pjrt_client",
+        "//xla/pjrt:pjrt_compiler",
         "@com_google_absl//absl/algorithm:container",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/types:span",
diff --git a/third_party/xla/xla/python/dlpack.cc b/third_party/xla/xla/python/dlpack.cc
index fa40030130a4f8..097f72411db33b 100644
--- a/third_party/xla/xla/python/dlpack.cc
+++ b/third_party/xla/xla/python/dlpack.cc
@@ -26,8 +26,8 @@ limitations under the License.
 #include "absl/types/span.h"
 #include "include/dlpack/dlpack.h"  // from @dlpack
 #include "pybind11/pytypes.h"  // from @pybind11
-#include "xla/pjrt/gpu/se_gpu_pjrt_client.h"
 #include "xla/pjrt/pjrt_client.h"
+#include "xla/pjrt/pjrt_compiler.h"
 #include "xla/python/py_array.h"
 #include "xla/python/python_ref_manager.h"
 #include "xla/python/traceback.h"
@@ -213,15 +213,10 @@ StatusOr<std::vector<int64_t>> StridesToLayout(
 StatusOr<DLDeviceType> DLDeviceTypeForDevice(const PjRtDevice& device) {
   if (device.client()->platform_id() == CpuId()) {
     return kDLCPU;
-  } else if (device.client()->platform_id() == GpuId()) {
-    const StreamExecutorGpuDevice& gdevice =
-        dynamic_cast<const StreamExecutorGpuDevice&>(device);
-
-    if (absl::StrContains(gdevice.device_vendor(), "Advanced Micro Devices")) {
-      return kDLROCM;
-    } else {
-      return kDLCUDA;
-    }
+  } else if (device.client()->platform_id() == CudaId()) {
+    return kDLCUDA;
+  } else if (device.client()->platform_id() == RocmId()) {
+    return kDLROCM;
   }
   return InvalidArgument("Device %s cannot be used as a DLPack device.",
                          device.DebugString());
@@ -250,14 +245,14 @@ StatusOr<PjRtDevice*> DeviceForDLDevice(const PjRtClient* cpu_client,
         return InvalidArgument(
             "DLPack tensor is on GPU, but no GPU backend was provided.");
       }
-      TF_RET_CHECK(gpu_client->platform_id() == GpuId());
+      TF_RET_CHECK(gpu_client->platform_id() == CudaId());
       return gpu_client->LookupAddressableDevice(context.device_id);
     case kDLROCM:
       if (gpu_client == nullptr) {
         return InvalidArgument(
             "DLPack tensor is on GPU, but no GPU backend was provided.");
       }
-      TF_RET_CHECK(gpu_client->platform_id() == GpuId());
+      TF_RET_CHECK(gpu_client->platform_id() == RocmId());
       return gpu_client->LookupAddressableDevice(context.device_id);
     default:
       return InvalidArgument("Unknown/unsupported DLPack device type %d",
@@ -358,18 +353,8 @@ StatusOr<pybind11::object> DLPackManagedTensorToBuffer(
   // TODO(hyeontaek): This is a potential target for an IFRT client to multiplex
   // multiple PjRt clients. Devices from these PjRt clients could be expressed
   // as a unified set of IFRT devices.
-  // Backward compatibility: if only one client is passed, it may be from any
-  // platform. Drop this support after dropping support for jax <= 0.2.14.
-  if (cpu_client && cpu_client->pjrt_client()->platform_id() == GpuId()) {
-    gpu_client = std::move(cpu_client);
-    cpu_client = nullptr;
-  }
   auto* cpu_pjrt_client = cpu_client ? cpu_client->pjrt_client() : nullptr;
   auto* gpu_pjrt_client = gpu_client ? gpu_client->pjrt_client() : nullptr;
-  if (cpu_client && cpu_pjrt_client->platform_id() != CpuId()) {
-    return InvalidArgument("DLPack does not support platform %s",
-                           cpu_pjrt_client->platform_name());
-  }
 
   if (absl::string_view(tensor.name()) != kDlTensorCapsuleName) {
     return InvalidArgument(
diff --git a/third_party/xla/xla/python/py_array.cc b/third_party/xla/xla/python/py_array.cc
index c2470723434903..f4b4afa99f42db 100644
--- a/third_party/xla/xla/python/py_array.cc
+++ b/third_party/xla/xla/python/py_array.cc
@@ -26,6 +26,7 @@ limitations under the License.
 #include <utility>
 #include <vector>
 
+#include "absl/strings/string_view.h"
 #include "llvm/Support/Casting.h"
 #include "pybind11/pytypes.h"  // from @pybind11
 #include "pybind11_abseil/absl_casters.h"  // from @pybind11_abseil
@@ -1110,7 +1111,14 @@ Status PyArray::RegisterTypes(py::module& m) {
       },
       py::is_method(type));
   type.attr("platform") = py::cpp_function(
-      [](PyArray self) { return self.ifrt_array()->client()->platform_name(); },
+      [](PyArray self) {
+        if (self.ifrt_array()->client()->platform_name() == "cuda" ||
+            self.ifrt_array()->client()->platform_name() == "rocm") {
+          return absl::string_view("gpu");
+        } else {
+          return self.ifrt_array()->client()->platform_name();
+        }
+      },
       py::is_method(type));
   type.attr("is_ready") = py::cpp_function(
       [](PyArray self) { return xla::ValueOrThrow(self.IsReady()); },
diff --git a/third_party/xla/xla/python/py_buffer.cc b/third_party/xla/xla/python/py_buffer.cc
index 3ac01dee3ed19f..481337e76daaf8 100644
--- a/third_party/xla/xla/python/py_buffer.cc
+++ b/third_party/xla/xla/python/py_buffer.cc
@@ -28,6 +28,7 @@ limitations under the License.
 #include "pybind11/pybind11.h"  // from @pybind11
 #include "pybind11/pytypes.h"  // from @pybind11
 #include "xla/pjrt/pjrt_client.h"
+#include "xla/pjrt/pjrt_compiler.h"
 #include "xla/python/ifrt/array.h"
 #include "xla/python/ifrt/device.h"
 #include "xla/python/pjrt_ifrt/pjrt_array.h"
@@ -260,8 +261,7 @@ pybind11::dtype IfrtHelpers::python_dtype(ifrt::Array* ifrt_array) {
 StatusOr<pybind11::dict> IfrtHelpers::CudaArrayInterface(
     ifrt::Array* ifrt_array, std::optional<Shape>& scratch) {
   auto* pjrt_buffer = IfrtHelpers::pjrt_buffer(ifrt_array);
-  // TODO(zhangqiaorjc): Differentiate between NVidia and other GPUs.
-  if (pjrt_buffer->client()->platform_id() != GpuId()) {
+  if (pjrt_buffer->client()->platform_id() != CudaId()) {
     return InvalidArgument(
         "__cuda_array_interface__ is only defined for NVidia GPU buffers.");
   }
diff --git a/third_party/xla/xla/python/py_client.cc b/third_party/xla/xla/python/py_client.cc
index ce6e4dcba1c2e7..0fa1b7071e5c01 100644
--- a/third_party/xla/xla/python/py_client.cc
+++ b/third_party/xla/xla/python/py_client.cc
@@ -25,6 +25,7 @@ limitations under the License.
 #include "absl/container/flat_hash_map.h"
 #include "xla/pjrt/mlir_to_hlo.h"
 #include "xla/pjrt/pjrt_client.h"
+#include "xla/pjrt/pjrt_compiler.h"
 #include "xla/pjrt/pjrt_stream_executor_client.h"
 #include "xla/python/callback.h"
 #include "xla/python/exceptions.h"
@@ -58,6 +59,16 @@ namespace py = pybind11;
 PyClient::PyClient(std::shared_ptr<ifrt::Client> ifrt_client)
     : ifrt_client_(std::move(ifrt_client)) {
   CHECK(ifrt_client_);
+  // TODO(phawkins): this is a temporary backwards compatibility shim. We
+  // changed the name PJRT reports for GPU platforms to "cuda" or "rocm", but
+  // we haven't yet updated JAX clients that expect "gpu". Migrate users and
+  // remove this code.
+  if (ifrt_client_->platform_name() == "cuda" ||
+      ifrt_client_->platform_name() == "rocm") {
+    platform_name_ = "gpu";
+  } else {
+    platform_name_ = ifrt_client_->platform_name();
+  }
 }
 
 PyClient::~PyClient() {
diff --git a/third_party/xla/xla/python/py_client.h b/third_party/xla/xla/python/py_client.h
index d3805ffe45da1f..31397f89a7b60f 100644
--- a/third_party/xla/xla/python/py_client.h
+++ b/third_party/xla/xla/python/py_client.h
@@ -136,9 +136,7 @@ class PyClient : public std::enable_shared_from_this<PyClient> {
     return shared_ptr_pjrt_client();
   }
 
-  absl::string_view platform_name() const {
-    return ifrt_client_->platform_name();
-  }
+  absl::string_view platform_name() const { return platform_name_; }
   absl::string_view platform_version() const {
     return ifrt_client_->platform_version();
   }
@@ -248,6 +246,7 @@ class PyClient : public std::enable_shared_from_this<PyClient> {
   friend struct PyArray_Storage;
 
   std::shared_ptr<ifrt::Client> ifrt_client_;
+  std::string platform_name_;
 
   // Pointers to intrusive doubly-linked lists of arrays and executables, used
   // to iterate over all known objects when heap profiling. The list structure
diff --git a/third_party/xla/xla/python/py_host_callback.cc b/third_party/xla/xla/python/py_host_callback.cc
index e149742659aa11..625bcbabf5d2ea 100644
--- a/third_party/xla/xla/python/py_host_callback.cc
+++ b/third_party/xla/xla/python/py_host_callback.cc
@@ -27,6 +27,7 @@ limitations under the License.
 #include "pybind11/pybind11.h"  // from @pybind11
 #include "xla/layout_util.h"
 #include "xla/pjrt/host_callback.h"
+#include "xla/pjrt/pjrt_compiler.h"
 #include "xla/python/callback.h"
 #include "xla/python/ifrt/client.h"
 #include "xla/python/ifrt/host_callback.h"
@@ -112,7 +113,8 @@ PyCpuLoadedHostCallback::Create(ifrt::Client* ifrt_client,
                                 absl::Span<const Shape> operand_shapes,
                                 absl::Span<const Shape> result_shapes) {
   ifrt::PlatformId platform_id = ifrt_client->platform_id();
-  if (platform_id != GpuId() && platform_id != CpuId()) {
+  if (platform_id != CpuId() && platform_id != CudaId() &&
+      platform_id != RocmId()) {
     return Unimplemented("CpuCallback supports CPU and GPU only");
   }
 
diff --git a/third_party/xla/xla/python/xla.cc b/third_party/xla/xla/python/xla.cc
index 46053a1009a0ae..c1bef710507c0b 100644
--- a/third_party/xla/xla/python/xla.cc
+++ b/third_party/xla/xla/python/xla.cc
@@ -40,6 +40,7 @@ limitations under the License.
 
 #include "absl/strings/ascii.h"
 #include "absl/strings/str_format.h"
+#include "absl/strings/string_view.h"
 #include "absl/types/span.h"
 #include "pybind11/attr.h"  // from @pybind11
 #include "pybind11/cast.h"  // from @pybind11
@@ -202,7 +203,18 @@ static void Init(py::module_& m) {
                              "Deprecated; please use process_index")
       .def_property_readonly("platform",
                              [](const ClientAndPtr<PjRtDevice>& device) {
-                               return device.client()->platform_name();
+                               // TODO(phawkins): this is a temporary backwards
+                               // compatibility shim. We changed the name PJRT
+                               // reports for GPU platforms to "cuda" or "rocm",
+                               // but we haven't yet updated JAX clients that
+                               // expect "gpu". Migrate users and remove this
+                               // code.
+                               if (device.client()->platform_name() == "cuda" ||
+                                   device.client()->platform_name() == "rocm") {
+                                 return absl::string_view("gpu");
+                               } else {
+                                 return device.client()->platform_name();
+                               }
                              })
       .def_property_readonly("device_kind", &PjRtDevice::device_kind)
       .def_property_readonly("client",
@@ -371,7 +383,18 @@ static void Init(py::module_& m) {
       .def_property_readonly(
           "platform",
           [](const ClientAndPtr<PjRtMemorySpace>& memory_space) {
-            return memory_space.client()->platform_name();
+            // TODO(phawkins): this is a temporary backwards
+            // compatibility shim. We changed the name PJRT
+            // reports for GPU platforms to "cuda" or "rocm",
+            // but we haven't yet updated JAX clients that
+            // expect "gpu". Migrate users and remove this
+            // code.
+            if (memory_space.client()->platform_name() == "cuda" ||
+                memory_space.client()->platform_name() == "rocm") {
+              return absl::string_view("gpu");
+            } else {
+              return memory_space.client()->platform_name();
+            }
           })
       .def_property_readonly("kind", &PjRtMemorySpace::memory_space_kind)
       .def("__str__", &PjRtMemorySpace::DebugString)
diff --git a/third_party/xla/xla/python/xla_client_test.py b/third_party/xla/xla/python/xla_client_test.py
index 92018d7f18c3fc..22d6ac43580ff2 100644
--- a/third_party/xla/xla/python/xla_client_test.py
+++ b/third_party/xla/xla/python/xla_client_test.py
@@ -2155,7 +2155,7 @@ def testMemoryStats(self):
         stats = device.memory_stats()
         if (
             self.backend.platform != "tpu" or not tfrt_tpu
-        ) and self.backend.platform != "gpu":
+        ) and self.backend.platform not in ("gpu", "cuda", "rocm"):
           self.assertIsNone(stats)
         else:
           self.assertIsNotNone(stats)
@@ -2285,13 +2285,16 @@ class DLPackTest(parameterized.TestCase):
     def setUp(self):
       super(DLPackTest, self).setUp()
       self.backend = xla_backend()
-      if self.backend.platform not in ("cpu", "gpu"):
+      if self.backend.platform not in ("cpu", "gpu", "cuda", "rocm"):
         self.skipTest("DLPack requires CPU or GPU")
       self.cpu_backend = (
           self.backend
           if self.backend.platform == "cpu" else xla_client.make_cpu_client())
       self.gpu_backend = (
-          self.backend if self.backend.platform == "gpu" else None)
+          self.backend
+          if self.backend.platform in ("gpu", "cuda", "rocm")
+          else None
+      )
 
     def tearDown(self):
       super().tearDown()
@@ -2337,7 +2340,9 @@ def testTensorsCanBeConsumedOnceOnly(self):
           buffer, take_ownership=True)
 
       def ConsumeDLPackTensor():
-        _ = xla_client._xla.dlpack_managed_tensor_to_buffer(dlt, self.backend)
+        _ = xla_client._xla.dlpack_managed_tensor_to_buffer(
+            dlt, self.cpu_backend, self.gpu_backend
+        )
 
       ConsumeDLPackTensor()
       self.assertRaisesRegex(
@@ -2364,8 +2369,10 @@ def testNonOwnedDlpackCanBeViewedTwice(self):
       d2 = xla_client._xla.buffer_to_dlpack_managed_tensor(
           buffer, take_ownership=False)
 
-      y = xla_client._xla.dlpack_managed_tensor_to_buffer(d1, self.backend)
-      z = xla_client._xla.dlpack_managed_tensor_to_buffer(d2, self.backend)
+      y = xla_client._xla.dlpack_managed_tensor_to_buffer(
+          d1, self.cpu_backend, self.gpu_backend)
+      z = xla_client._xla.dlpack_managed_tensor_to_buffer(
+          d2, self.cpu_backend, self.gpu_backend)
       del d1, d2
       np.testing.assert_array_equal(x, np.asarray(buffer))
       np.testing.assert_array_equal(x, np.asarray(y))
@@ -2514,7 +2521,7 @@ def testPlatformVersion(self):
       logging.info("platform_version:\n%s", version)
       if self.backend.platform == "cpu":
         self.assertEqual(version, "<unknown>")
-      elif self.backend.platform == "gpu":
+      elif self.backend.platform in ("gpu", "cuda", "rocm"):
         # Following is false if not built with --config=cuda
         if version != "<unknown>":
           self.assertTrue(

From b0a9536d587e341b088b9a52d007c8c0633037b2 Mon Sep 17 00:00:00 2001
From: George Karpenkov <cheshire@google.com>
Date: Fri, 29 Sep 2023 10:12:14 -0700
Subject: [PATCH 426/567] [XLA:GPU] [NFC] Remove gpu_device_info as a redundant
 indirection layer

PiperOrigin-RevId: 569523071
---
 tensorflow/core/BUILD                         |   1 +
 third_party/xla/xla/service/gpu/BUILD         |  87 +++-------
 .../xla/xla/service/gpu/buffer_comparator.cc  |  18 +-
 .../service/gpu/compile_module_to_llvm_ir.cc  |  12 +-
 .../service/gpu/compile_module_to_llvm_ir.h   |   5 +-
 .../xla/xla/service/gpu/fusion_merger.cc      |   4 +-
 .../xla/xla/service/gpu/fusion_merger.h       |   6 +-
 .../xla/xla/service/gpu/fusion_pipeline.cc    |   7 +-
 .../xla/xla/service/gpu/fusion_pipeline.h     |   7 +-
 .../xla/xla/service/gpu/gemm_rewriter.cc      |   1 +
 .../service/gpu/gpu_aot_compilation_test.cc   |   5 +-
 .../xla/xla/service/gpu/gpu_compiler.cc       |  29 ++--
 .../xla/xla/service/gpu/gpu_compiler.h        |   8 +-
 .../gpu/gpu_cost_model_stats_collection.h     |   6 +-
 .../xla/xla/service/gpu/gpu_device_info.cc    |  59 -------
 .../xla/xla/service/gpu/gpu_device_info.h     | 111 -------------
 .../service/gpu/gpu_device_info_for_tests.cc  |  79 ++++-----
 .../service/gpu/gpu_device_info_for_tests.h   |   8 +-
 .../xla/service/gpu/gpu_device_info_test.cc   | 157 ------------------
 .../xla/xla/service/gpu/gpu_fusible.cc        |   6 +-
 third_party/xla/xla/service/gpu/gpu_fusible.h |   4 +-
 .../xla/service/gpu/gpu_hlo_cost_analysis.cc  |   7 +-
 .../xla/service/gpu/gpu_hlo_cost_analysis.h   |   9 +-
 .../xla/xla/service/gpu/gpu_hlo_schedule.cc   |   4 +-
 .../xla/xla/service/gpu/gpu_hlo_schedule.h    |   4 +-
 .../xla/service/gpu/gpu_hlo_schedule_test.cc  |   8 +-
 .../xla/service/gpu/gpu_performance_model.cc  |  28 ++--
 .../service/gpu/gpu_performance_model_test.cc |   4 +-
 .../xla/xla/service/gpu/gpu_target_config.cc  |  10 +-
 .../xla/xla/service/gpu/gpu_target_config.h   |   5 +-
 .../xla/service/gpu/hlo_fusion_analysis.cc    |  20 +--
 .../xla/xla/service/gpu/hlo_fusion_analysis.h |  12 +-
 .../xla/xla/service/gpu/hlo_op_profiler.cc    |   6 +-
 .../xla/xla/service/gpu/hlo_op_profiler.h     |   4 +-
 .../xla/service/gpu/hlo_op_profiler_run.cc    |  16 +-
 .../service/gpu/horizontal_input_fusion.cc    |   4 +-
 .../xla/service/gpu/horizontal_input_fusion.h |   7 +-
 .../gpu/horizontal_loop_fusion_test.cc        |   3 +-
 .../xla/xla/service/gpu/instruction_fusion.h  |   7 +-
 .../xla/xla/service/gpu/ir_emitter_context.h  |  14 +-
 .../xla/xla/service/gpu/ir_emitter_triton.cc  |  11 +-
 .../xla/xla/service/gpu/ir_emitter_triton.h   |   7 +-
 .../xla/service/gpu/ir_emitter_triton_test.cc |  18 +-
 .../xla/service/gpu/ir_emitter_unnested.cc    |  13 +-
 .../xla/xla/service/gpu/launch_dimensions.cc  |  43 ++---
 .../xla/xla/service/gpu/launch_dimensions.h   |  10 +-
 .../xla/service/gpu/multi_output_fusion.cc    |   4 +-
 .../xla/xla/service/gpu/multi_output_fusion.h |   6 +-
 .../xla/xla/service/gpu/nvptx_compiler.cc     |   2 +-
 .../xla/xla/service/gpu/priority_fusion.cc    |   4 +-
 .../xla/xla/service/gpu/priority_fusion.h     |   4 +-
 third_party/xla/xla/service/gpu/tests/BUILD   |   1 -
 .../gpu/tests/gpu_fusion_pipeline_test.cc     |   3 +-
 .../xla/service/gpu/tests/hlo_to_llvm_ir.cc   |   9 +-
 .../xla/xla/service/gpu/triton_autotuner.cc   |  16 +-
 .../xla/stream_executor/device_description.cc |  57 ++++++-
 .../xla/stream_executor/device_description.h  |  91 +++++-----
 third_party/xla/xla/xla.bzl                   |   2 +
 58 files changed, 391 insertions(+), 702 deletions(-)
 delete mode 100644 third_party/xla/xla/service/gpu/gpu_device_info.cc
 delete mode 100644 third_party/xla/xla/service/gpu/gpu_device_info.h
 delete mode 100644 third_party/xla/xla/service/gpu/gpu_device_info_test.cc

diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD
index 09bc9919bb9b4e..1a18c391a3b3e1 100644
--- a/tensorflow/core/BUILD
+++ b/tensorflow/core/BUILD
@@ -1473,6 +1473,7 @@ cc_library(
         "@local_xla//xla/service/memory_space_assignment:memory_space_assignment_proto_cc_impl",
         "@local_xla//xla/service/gpu:backend_configs_cc_impl",
         "@local_xla//xla/service/gpu:hlo_op_profile_proto_cc_impl",
+        "@local_xla//xla/stream_executor:device_description_proto_cc_impl",
     ] + tf_protos_grappler_impl() + tf_monitoring_framework_deps(),
     # Alwayslink causes a cc_binary to "always link" in the
     # srcs for a given cc_library, even if they are unreferenced, see:
diff --git a/third_party/xla/xla/service/gpu/BUILD b/third_party/xla/xla/service/gpu/BUILD
index f95c89f7a0e363..56f6c2d937bd70 100644
--- a/third_party/xla/xla/service/gpu/BUILD
+++ b/third_party/xla/xla/service/gpu/BUILD
@@ -109,10 +109,10 @@ cc_library(
     compatible_with = get_compatible_with_portable(),
     visibility = ["//visibility:public"],
     deps = [
-        ":gpu_device_info",
         "//xla:shape_util",
         "//xla:statusor",
         "//xla:util",
+        "//xla/stream_executor:device_description",
         "@com_google_absl//absl/log",
         "@com_google_absl//absl/log:check",
     ],
@@ -237,20 +237,6 @@ xla_cc_test(
     ],
 )
 
-cc_library(
-    name = "gpu_device_info",
-    srcs = ["gpu_device_info.cc"],
-    hdrs = ["gpu_device_info.h"],
-    compatible_with = get_compatible_with_portable(),
-    visibility = ["//visibility:public"],
-    deps = [
-        "//xla/stream_executor",
-        "//xla/stream_executor:device_description",
-        "//xla/stream_executor:device_description_proto_cc",
-        "//xla/stream_executor/gpu:gpu_types_header",
-    ],
-)
-
 cc_library(
     name = "gpu_device_info_for_tests",
     testonly = 1,
@@ -259,34 +245,10 @@ cc_library(
     compatible_with = get_compatible_with_portable(),
     visibility = ["//visibility:public"],
     deps = [
-        ":gpu_device_info",
         "//xla/stream_executor:device_description",
     ],
 )
 
-xla_cc_test(
-    name = "gpu_device_info_test",
-    srcs = ["gpu_device_info_test.cc"],
-    tags = tf_cuda_tests_tags(),
-    deps = [
-        ":gpu_device_info",
-        ":gpu_device_info_for_tests",
-        "//xla/service:pattern_matcher_gmock",
-        "//xla/stream_executor",
-        "//xla/stream_executor:device_description",
-        "//xla/stream_executor/gpu:gpu_executor_header",
-        "//xla/tests:xla_internal_test_main",
-        "@com_google_absl//absl/strings",
-        "@local_tsl//tsl/platform:test",
-    ] + if_cuda_is_configured([
-        "@local_config_cuda//cuda:cuda_headers",
-        "//xla/stream_executor/cuda:cuda_executor_header",
-    ]) + if_rocm_is_configured([
-        "@local_config_rocm//rocm:rocm_headers",
-        "//xla/stream_executor/rocm:rocm_gpu_executor_header",
-    ]),
-)
-
 cc_library(
     name = "ir_emitter_context",
     srcs = ["ir_emitter_context.cc"],
@@ -294,11 +256,11 @@ cc_library(
     visibility = ["//visibility:public"],
     deps = [
         ":gpu_constants",
-        ":gpu_device_info",
         ":gpu_executable",
         "//xla/service:buffer_assignment",
         "//xla/service:name_uniquer",
         "//xla/stream_executor",
+        "//xla/stream_executor:device_description",
         "@llvm-project//llvm:ir_headers",
         "@llvm-project//mlir:IR",
     ],
@@ -320,7 +282,6 @@ cc_library(
         ":gpu_asm_opts_util",
         ":gpu_constants",
         ":gpu_conv_runner",
-        ":gpu_device_info",
         ":gpu_executable",
         ":gpu_fused_mha_runner",
         ":gpu_fusible",
@@ -364,6 +325,7 @@ cc_library(
         "//xla/service/llvm_ir:kernel_support_library",
         "//xla/service/llvm_ir:llvm_util",
         "//xla/service/llvm_ir:sort_util",
+        "//xla/stream_executor:device_description",
         "//xla/translate/hlo_to_mhlo:hlo_utils",
         "//xla/translate/mhlo_to_hlo:attribute_exporter",
         "//xla/translate/mhlo_to_hlo:location_exporter",
@@ -462,7 +424,6 @@ cc_library(
     visibility = ["//visibility:public"],
     deps = [
         ":gemm_rewriter_triton",
-        ":gpu_device_info",
         ":hlo_traversal",
         ":ir_emission_utils",
         ":launch_dimensions",
@@ -544,7 +505,6 @@ xla_test(
     deps = [
         ":backend_configs_cc",
         ":gemm_rewriter_triton",
-        ":gpu_device_info",
         ":gpu_device_info_for_tests",
         ":ir_emission_utils",
         ":ir_emitter_triton",
@@ -645,7 +605,6 @@ cc_library(
         ":buffer_comparator",
         ":gemm_rewriter_triton",
         ":gemm_rewriter",
-        ":gpu_device_info",
         ":gpu_float_support",
         ":gpu_fusible",
         ":instruction_fusion",
@@ -1902,7 +1861,6 @@ cc_library(
     hdrs = ["instruction_fusion.h"],
     visibility = ["//visibility:public"],
     deps = [
-        ":gpu_device_info",
         ":gpu_fusible",
         "//xla:shape_util",
         "//xla:statusor",
@@ -1912,6 +1870,7 @@ cc_library(
         "//xla/service:fusion_queue",
         "//xla/service:hlo_pass",
         "//xla/service:instruction_fusion",
+        "//xla/stream_executor:device_description",
         "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/container:flat_hash_set",
         "@com_google_absl//absl/meta:type_traits",
@@ -1945,7 +1904,6 @@ cc_library(
     hdrs = ["priority_fusion.h"],
     visibility = ["//visibility:public"],
     deps = [
-        ":gpu_device_info",
         ":gpu_fusible",
         ":gpu_hlo_cost_analysis",
         ":gpu_performance_model",
@@ -1958,6 +1916,7 @@ cc_library(
         "//xla/service:hlo_cost_analysis",
         "//xla/service:hlo_pass",
         "//xla/service:instruction_fusion",
+        "//xla/stream_executor:device_description",
         "@com_google_absl//absl/algorithm:container",
         "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/container:flat_hash_set",
@@ -1996,7 +1955,6 @@ cc_library(
     hdrs = ["multi_output_fusion.h"],
     visibility = ["//visibility:public"],
     deps = [
-        ":gpu_device_info",
         ":gpu_fusible",
         ":gpu_hlo_cost_analysis",
         ":gpu_performance_model",
@@ -2008,6 +1966,7 @@ cc_library(
         "//xla/service:hlo_graph_dumper",
         "//xla/service:hlo_pass",
         "//xla/service:instruction_fusion",
+        "//xla/stream_executor:device_description",
         "@com_google_absl//absl/algorithm:container",
         "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/container:flat_hash_set",
@@ -2091,7 +2050,6 @@ cc_library(
     hdrs = ["fusion_merger.h"],
     visibility = ["//visibility:public"],
     deps = [
-        ":gpu_device_info",
         ":gpu_fusible",
         ":gpu_hlo_cost_analysis",
         ":gpu_performance_model",
@@ -2100,6 +2058,7 @@ cc_library(
         "//xla/hlo/ir:hlo",
         "//xla/service:hlo_graph_dumper",
         "//xla/service:hlo_pass",
+        "//xla/stream_executor:device_description",
         "@com_google_absl//absl/strings",
         "@local_tsl//tsl/platform:errors",
     ],
@@ -2462,7 +2421,7 @@ cc_library(
         ":buffer_sharing",
         ":executable_proto_cc",
         ":gpu_constants",
-        ":gpu_device_info",
+        ":gpu_convert_async_collectives_to_sync",
         ":gpu_executable",
         ":ir_emitter_context",
         ":ir_emitter_unnested",
@@ -2541,7 +2500,6 @@ cc_library(
     visibility = ["//visibility:public"],
     deps = [
         ":fusion_merger",
-        ":gpu_device_info",
         ":gpu_hlo_cost_analysis",
         ":gpu_shape_verifier",
         ":horizontal_input_fusion",
@@ -2592,8 +2550,9 @@ cc_library(
     compatible_with = get_compatible_with_portable(),
     visibility = ["//visibility:public"],
     deps = [
-        ":gpu_device_info",
-        "//xla/stream_executor:device_description_proto_cc_impl",
+        "//xla/stream_executor",
+        "//xla/stream_executor:device_description",
+        "//xla/stream_executor:device_description_proto_cc",
         "//xla/stream_executor:dnn",
     ],
 )
@@ -2630,7 +2589,6 @@ cc_library(
         ":gpu_conv_rewriter",
         ":gpu_convert_async_collectives_to_sync",
         ":gpu_cost_model_stats_collection",
-        ":gpu_device_info",
         ":gpu_executable",
         ":gpu_float_support",
         ":gpu_hlo_cost_analysis",
@@ -2772,12 +2730,12 @@ cc_library(
         "//xla/service:while_loop_simplifier",
         "//xla/service:while_loop_trip_count_annotator",
         "//xla/service:zero_sized_hlo_elimination",
-        "//xla/stream_executor:device_description",
         "//xla/service/llvm_ir:llvm_util",
         "//xla/service/spmd:collective_permute_motion",
         "//xla/service/spmd:stateful_rng_spmd_partitioner",
         "//xla/stream_executor",
-        "//xla/stream_executor:device_description_proto_cc_impl",
+        "//xla/stream_executor:device_description",
+        "//xla/stream_executor:device_description_proto_cc",
         "//xla/stream_executor/cuda:cuda_platform_id",
         "//xla/translate/hlo_to_mhlo:hlo_utils",
         "//xla/translate/mhlo_to_hlo:location_exporter",
@@ -3182,7 +3140,6 @@ cc_library(
     deps = [
         ":backend_configs_cc",
         ":cublas_cudnn",
-        ":gpu_device_info",
         "//xla/hlo/ir:hlo",
         "//xla/hlo/utils:hlo_query",
         "//xla/service:hlo_memory_scheduler",
@@ -3190,6 +3147,7 @@ cc_library(
         "//xla/service:latency_hiding_scheduler",
         "//xla/service:p2p_schedule_preparation",
         "//xla/service:profile_guided_latency_estimator",
+        "//xla/stream_executor:device_description",
         "@com_google_absl//absl/status",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/strings:str_format",
@@ -3206,11 +3164,11 @@ xla_cc_test(
     ],
     tags = tf_cuda_tests_tags(),
     deps = [
-        ":gpu_device_info",
         ":gpu_hlo_schedule",
         "//xla/hlo/ir:hlo",
         "//xla/hlo/utils:hlo_query",
         "//xla/service:gpu_plugin",
+        "//xla/stream_executor:device_description",
         "//xla/tests:hlo_test_base",
         "//xla/tests:test_utils",
         "//xla/tests:xla_internal_test_main",
@@ -3292,7 +3250,6 @@ cc_library(
     deps = [
         ":backend_configs_cc",
         ":cublas_cudnn",
-        ":gpu_device_info",
         ":hlo_op_profile_proto_cc",
         "//xla:util",
         "//xla/hlo/ir:hlo",
@@ -3325,7 +3282,6 @@ cc_library(
     visibility = ["//visibility:public"],
     deps = [
         ":backend_configs_cc",
-        ":gpu_device_info",
         ":gpu_fusible",
         ":hlo_traversal",
         ":ir_emission_utils",
@@ -3368,13 +3324,13 @@ xla_cc_test(
     srcs = ["gpu_performance_model_test.cc"],
     deps = [
         ":backend_configs_cc",
-        ":gpu_device_info",
         ":gpu_device_info_for_tests",
         ":gpu_hlo_cost_analysis",
         ":gpu_performance_model",
         "//xla:shape_util",
         "//xla/hlo/ir:hlo",
         "//xla/service:hlo_module_config",
+        "//xla/stream_executor:device_description",
         "//xla/tests:hlo_test_base",
         "//xla/tests:xla_internal_test_main",
         "@com_google_absl//absl/strings",
@@ -3401,7 +3357,6 @@ cc_library(
     local_defines = if_cuda(["GOOGLE_CUDA"]),
     visibility = ["//visibility:public"],
     deps = [
-        ":gpu_device_info",
         ":hlo_op_profile_proto_cc",
         "//xla:debug_options_flags",
         "//xla:literal",
@@ -3417,6 +3372,7 @@ cc_library(
         "//xla/service:hlo_module_config",
         "//xla/service:hlo_runner",
         "//xla/service:interpreter_plugin",
+        "//xla/stream_executor:device_description",
         "//xla/tests:test_utils",
         "@com_google_absl//absl/time",
         "@local_tsl//tsl/platform:errors",
@@ -3439,7 +3395,6 @@ xla_cc_test(
         "requires-gpu-nvidia",
     ],
     deps = [
-        ":gpu_device_info",
         ":hlo_op_profile_proto_cc",
         ":hlo_op_profiler_lib",
         "//xla:debug_options_flags",
@@ -3447,7 +3402,7 @@ xla_cc_test(
         "//xla/hlo/ir:hlo",
         "//xla/service:hlo_runner",
         "//xla/service:platform_util",
-        "//xla/stream_executor",
+        "//xla/stream_executor:device_description",
         "@com_google_absl//absl/log",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/strings:str_format",
@@ -3530,13 +3485,13 @@ cc_library(
     hdrs = ["gpu_fusible.h"],
     visibility = ["//visibility:public"],
     deps = [
-        ":gpu_device_info",
         ":ir_emission_utils",
         ":reduction_utils",
         "//xla:shape_util",
         "//xla:util",
         "//xla/hlo/ir:hlo",
         "//xla/service:instruction_fusion",
+        "//xla/stream_executor:device_description",
         "@com_google_absl//absl/algorithm:container",
     ],
 )
@@ -3919,11 +3874,11 @@ cc_library(
     hdrs = ["horizontal_input_fusion.h"],
     visibility = ["//visibility:public"],
     deps = [
-        ":gpu_device_info",
         ":gpu_fusible",
         "//xla/hlo/ir:hlo",
         "//xla/service:hlo_creation_utils",
         "//xla/service:hlo_pass",
+        "//xla/stream_executor:device_description",
         "@com_google_absl//absl/container:flat_hash_set",
         "@com_google_absl//absl/types:span",
     ],
@@ -4605,7 +4560,6 @@ cc_library(
     hdrs = ["gpu_cost_model_stats_collection.h"],
     visibility = ["//visibility:public"],
     deps = [
-        ":gpu_device_info",
         ":gpu_hlo_cost_analysis",
         ":gpu_performance_model",
         "//xla:statusor",
@@ -4613,6 +4567,7 @@ cc_library(
         "//xla/service:hlo_cost_analysis",
         "//xla/service:hlo_graph_dumper",
         "//xla/service:hlo_pass",
+        "//xla/stream_executor:device_description",
         "@com_google_absl//absl/container:flat_hash_set",
         "@com_google_absl//absl/log",
         "@com_google_absl//absl/status:statusor",
diff --git a/third_party/xla/xla/service/gpu/buffer_comparator.cc b/third_party/xla/xla/service/gpu/buffer_comparator.cc
index 4cb9cee9ec19c5..40180e249b2ea9 100644
--- a/third_party/xla/xla/service/gpu/buffer_comparator.cc
+++ b/third_party/xla/xla/service/gpu/buffer_comparator.cc
@@ -1067,22 +1067,8 @@ static StatusOr<bool> DeviceCompare(se::Stream* stream,
                                    se::DeviceMemory<uint64_t>>(
           kernel_name, buffer_compare_ptx, compiled_ptx)));
 
-  GpuDeviceInfo gpu_device_info;
-  gpu_device_info.threads_per_block_limit =
-      executor->GetDeviceDescription().threads_per_block_limit();
-  gpu_device_info.threads_per_warp =
-      executor->GetDeviceDescription().threads_per_warp();
-  gpu_device_info.shared_memory_per_block =
-      executor->GetDeviceDescription().shared_memory_per_block();
-  gpu_device_info.threads_per_core_limit =
-      executor->GetDeviceDescription().threads_per_core_limit();
-  gpu_device_info.core_count = executor->GetDeviceDescription().core_count();
-  gpu_device_info.block_dim_limit_x =
-      executor->GetDeviceDescription().block_dim_limit().x;
-  gpu_device_info.block_dim_limit_y =
-      executor->GetDeviceDescription().block_dim_limit().y;
-  gpu_device_info.block_dim_limit_z =
-      executor->GetDeviceDescription().block_dim_limit().z;
+  const se::DeviceDescription& gpu_device_info =
+      executor->GetDeviceDescription();
 
   TF_ASSIGN_OR_RETURN(LaunchDimensions dim,
                       CalculateLaunchDimensions(buffer_shape, gpu_device_info));
diff --git a/third_party/xla/xla/service/gpu/compile_module_to_llvm_ir.cc b/third_party/xla/xla/service/gpu/compile_module_to_llvm_ir.cc
index 66ef3a2cd95470..188cfe52af6879 100644
--- a/third_party/xla/xla/service/gpu/compile_module_to_llvm_ir.cc
+++ b/third_party/xla/xla/service/gpu/compile_module_to_llvm_ir.cc
@@ -55,7 +55,6 @@ limitations under the License.
 #include "xla/service/gpu/conditional_thunk.h"
 #include "xla/service/gpu/for_thunk.h"
 #include "xla/service/gpu/gpu_constants.h"
-#include "xla/service/gpu/gpu_device_info.h"
 #include "xla/service/gpu/gpu_executable.h"
 #include "xla/service/gpu/ir_emitter_context.h"
 #include "xla/service/gpu/ir_emitter_unnested.h"
@@ -247,7 +246,7 @@ StatusOr<std::unique_ptr<llvm::Module>> CompileModuleToLlvmIr(
     HloModule* hlo_module, llvm::LLVMContext* llvm_context,
     const std::string& target_triple, const std::string& data_layout,
     const std::string& platform_name, const se::Platform::Id platform_id,
-    GpuDeviceInfo gpu_device_info,
+    const se::DeviceDescription& gpu_device_info,
     const BufferValue::SizeFunction& buffer_size_bytes_function) {
   CompileModuleResults results;
   TF_RETURN_IF_ERROR(CompileModuleToLlvmIrImpl(
@@ -338,7 +337,7 @@ Status CompileModuleToLlvmIrImpl(
     HloModule* hlo_module, llvm::LLVMContext* llvm_context,
     const std::string& target_triple, const std::string& data_layout,
     const std::string& platform_name, se::Platform::Id platform_id,
-    GpuDeviceInfo gpu_device_info,
+    const se::DeviceDescription& gpu_device_info,
     const HloDataflowAnalysis::CanShareBuffer& can_share_buffer_function,
     const BufferValue::SizeFunction& buffer_size_bytes_function,
     CompileModuleResults* results, se::StreamExecutor* stream_exec) {
@@ -370,8 +369,9 @@ Status CompileModuleToLlvmIrImpl(
   };
   DumpHloModuleIfEnabled(
       *hlo_module, *results->buffer_assignment,
-      absl::StrCat(std::visit(GetCcStr(), gpu_device_info.compute_capability),
-                   "_gpu_", kAfterOptimizationsDumpName));
+      absl::StrCat(
+          std::visit(GetCcStr(), gpu_device_info.gpu_compute_capability()),
+          "_gpu_", kAfterOptimizationsDumpName));
 
   VLOG(1) << "After optimization module fingerprint for " << hlo_module->name()
           << ": " << hlo_module->GetFingerprint128();
@@ -457,7 +457,7 @@ Status CompileModuleToLlvmIrImpl(
         LowerToJitRt(*mlir_module, entry_function.getName(), buffer_sizes,
                      hlo_module->config(), ir_emitter->ConsumeThunkSequence(),
                      /*hlo_module_for_dump=*/hlo_module,
-                     gpu_device_info.compute_capability));
+                     gpu_device_info.gpu_compute_capability()));
     return OkStatus();
   }
 
diff --git a/third_party/xla/xla/service/gpu/compile_module_to_llvm_ir.h b/third_party/xla/xla/service/gpu/compile_module_to_llvm_ir.h
index 8090af9d9aae67..64d73b81700f96 100644
--- a/third_party/xla/xla/service/gpu/compile_module_to_llvm_ir.h
+++ b/third_party/xla/xla/service/gpu/compile_module_to_llvm_ir.h
@@ -25,7 +25,6 @@ limitations under the License.
 #include "xla/hlo/ir/hlo_module.h"
 #include "xla/service/buffer_value.h"
 #include "xla/service/gpu/executable.pb.h"
-#include "xla/service/gpu/gpu_device_info.h"
 #include "xla/service/gpu/gpu_executable.h"
 #include "xla/service/hlo.pb.h"
 #include "xla/service/hlo_dataflow_analysis.h"
@@ -64,14 +63,14 @@ StatusOr<std::unique_ptr<llvm::Module>> CompileModuleToLlvmIr(
     HloModule* hlo_module, llvm::LLVMContext* llvm_context,
     const std::string& target_triple, const std::string& data_layout,
     const std::string& platform_name, se::Platform::Id platform_id,
-    GpuDeviceInfo gpu_device_info,
+    const se::DeviceDescription& gpu_device_info,
     const BufferValue::SizeFunction& buffer_size_bytes_function);
 
 Status CompileModuleToLlvmIrImpl(
     HloModule* hlo_module, llvm::LLVMContext* llvm_context,
     const std::string& target_triple, const std::string& data_layout,
     const std::string& platform_name, se::Platform::Id platform_id,
-    GpuDeviceInfo gpu_device_info,
+    const se::DeviceDescription& gpu_device_info,
     const HloDataflowAnalysis::CanShareBuffer& can_share_buffer_function,
     const BufferValue::SizeFunction& buffer_size_bytes_function,
     CompileModuleResults* results, se::StreamExecutor* stream_exec = nullptr);
diff --git a/third_party/xla/xla/service/gpu/fusion_merger.cc b/third_party/xla/xla/service/gpu/fusion_merger.cc
index 055bfaec70ec94..0a1123b2bc5286 100644
--- a/third_party/xla/xla/service/gpu/fusion_merger.cc
+++ b/third_party/xla/xla/service/gpu/fusion_merger.cc
@@ -40,7 +40,7 @@ namespace gpu {
 class FusionInstructionMerger {
  public:
   explicit FusionInstructionMerger(
-      HloComputation* computation, const GpuDeviceInfo& gpu_device_info,
+      HloComputation* computation, const se::DeviceDescription& gpu_device_info,
       HloCostAnalysis::ShapeSizeFunction shape_size_function)
       : computation_(computation),
         shape_size_function_(shape_size_function),
@@ -64,7 +64,7 @@ class FusionInstructionMerger {
   // HLO cost analysis of the computation so that it may be not needed at all.
   std::optional<GpuHloCostAnalysis> cost_analysis_;
   FusionInfoCache fusion_info_cache_;
-  const GpuDeviceInfo& gpu_device_info_;
+  const se::DeviceDescription& gpu_device_info_;
   bool changed_ = false;
   bool dump_fusion_visualization_ = false;
 
diff --git a/third_party/xla/xla/service/gpu/fusion_merger.h b/third_party/xla/xla/service/gpu/fusion_merger.h
index 689a67fc2e3484..cefb945ba8a8f9 100644
--- a/third_party/xla/xla/service/gpu/fusion_merger.h
+++ b/third_party/xla/xla/service/gpu/fusion_merger.h
@@ -17,9 +17,9 @@ limitations under the License.
 #define XLA_SERVICE_GPU_FUSION_MERGER_H_
 
 #include "xla/hlo/ir/hlo_module.h"
-#include "xla/service/gpu/gpu_device_info.h"
 #include "xla/service/gpu/gpu_hlo_cost_analysis.h"
 #include "xla/service/hlo_pass_interface.h"
+#include "xla/stream_executor/device_description.h"
 
 namespace xla {
 namespace gpu {
@@ -61,7 +61,7 @@ namespace gpu {
 
 class FusionMerger : public HloModulePass {
  public:
-  explicit FusionMerger(const GpuDeviceInfo& d,
+  explicit FusionMerger(const se::DeviceDescription& d,
                         HloCostAnalysis::ShapeSizeFunction f)
       : gpu_device_info_(d), shape_size_function_(f) {}
   absl::string_view name() const override { return "fusion_merger"; }
@@ -72,7 +72,7 @@ class FusionMerger : public HloModulePass {
       const absl::flat_hash_set<absl::string_view>& execution_threads) override;
 
  private:
-  const GpuDeviceInfo gpu_device_info_;
+  se::DeviceDescription gpu_device_info_;
   HloCostAnalysis::ShapeSizeFunction shape_size_function_;
 };
 
diff --git a/third_party/xla/xla/service/gpu/fusion_pipeline.cc b/third_party/xla/xla/service/gpu/fusion_pipeline.cc
index b5f4bc91880ddf..584d6776895011 100644
--- a/third_party/xla/xla/service/gpu/fusion_pipeline.cc
+++ b/third_party/xla/xla/service/gpu/fusion_pipeline.cc
@@ -19,7 +19,6 @@ limitations under the License.
 #include <utility>
 
 #include "xla/service/gpu/fusion_merger.h"
-#include "xla/service/gpu/gpu_device_info.h"
 #include "xla/service/gpu/gpu_hlo_cost_analysis.h"
 #include "xla/service/gpu/gpu_shape_verifier.h"
 #include "xla/service/gpu/horizontal_input_fusion.h"
@@ -35,6 +34,7 @@ limitations under the License.
 #include "xla/service/hlo_pass_pipeline.h"
 #include "xla/service/hlo_verifier.h"
 #include "xla/service/layout_assignment.h"
+#include "xla/stream_executor/device_description.h"
 #include "xla/xla.pb.h"
 
 namespace xla {
@@ -43,7 +43,7 @@ namespace gpu {
 HloPassPipeline FusionPipeline(
     const DebugOptions& debug_options,
     HloCostAnalysis::ShapeSizeFunction shape_size_bytes_function,
-    const GpuDeviceInfo& gpu_device_info) {
+    const se::DeviceDescription& gpu_device_info) {
   HloPassFix<HloPassPipeline> fusion("fusion");
   // We try to split variadic ops with many parameters into several such ops
   // to avoid exceeding the parameter space.
@@ -81,7 +81,8 @@ HloPassPipeline FusionPipeline(
   return std::move(fusion);
 }
 
-HloPassPipeline HorizontalFusionPipeline(const GpuDeviceInfo& gpu_device_info) {
+HloPassPipeline HorizontalFusionPipeline(
+    const se::DeviceDescription& gpu_device_info) {
   HloPassFix<HloPassPipeline> horizontal_fusion("horizontal fusion");
   horizontal_fusion.AddPass<GpuHorizontalLoopFusion>();
   horizontal_fusion.AddPass<GpuHorizontalInputFusion>(gpu_device_info);
diff --git a/third_party/xla/xla/service/gpu/fusion_pipeline.h b/third_party/xla/xla/service/gpu/fusion_pipeline.h
index 5359e499d4784f..cecb1a98091270 100644
--- a/third_party/xla/xla/service/gpu/fusion_pipeline.h
+++ b/third_party/xla/xla/service/gpu/fusion_pipeline.h
@@ -16,9 +16,9 @@ limitations under the License.
 #ifndef XLA_SERVICE_GPU_FUSION_PIPELINE_H_
 #define XLA_SERVICE_GPU_FUSION_PIPELINE_H_
 
-#include "xla/service/gpu/gpu_device_info.h"
 #include "xla/service/hlo_cost_analysis.h"
 #include "xla/service/hlo_pass_pipeline.h"
+#include "xla/stream_executor/device_description.h"
 #include "xla/xla.pb.h"
 
 namespace xla {
@@ -28,10 +28,11 @@ namespace gpu {
 HloPassPipeline FusionPipeline(
     const DebugOptions& debug_options,
     HloCostAnalysis::ShapeSizeFunction shape_size_bytes_function,
-    const GpuDeviceInfo& gpu_device_info);
+    const se::DeviceDescription& gpu_device_info);
 
 // Function wrapper around the horizontal XLA GPU fusion pipeline.
-HloPassPipeline HorizontalFusionPipeline(const GpuDeviceInfo& gpu_device_info);
+HloPassPipeline HorizontalFusionPipeline(
+    const se::DeviceDescription& gpu_device_info);
 
 }  // namespace gpu
 }  // namespace xla
diff --git a/third_party/xla/xla/service/gpu/gemm_rewriter.cc b/third_party/xla/xla/service/gpu/gemm_rewriter.cc
index cc3dc4ed1fc721..b604ff8b827c0e 100644
--- a/third_party/xla/xla/service/gpu/gemm_rewriter.cc
+++ b/third_party/xla/xla/service/gpu/gemm_rewriter.cc
@@ -45,6 +45,7 @@ limitations under the License.
 #include "xla/status_macros.h"
 #include "xla/statusor.h"
 #include "xla/stream_executor/blas.h"
+#include "xla/stream_executor/device_description.h"
 #include "xla/xla_data.pb.h"
 #include "tsl/platform/errors.h"
 #include "tsl/platform/statusor.h"
diff --git a/third_party/xla/xla/service/gpu/gpu_aot_compilation_test.cc b/third_party/xla/xla/service/gpu/gpu_aot_compilation_test.cc
index c5ae7fb0df0c18..e7bf0a6eb39874 100644
--- a/third_party/xla/xla/service/gpu/gpu_aot_compilation_test.cc
+++ b/third_party/xla/xla/service/gpu/gpu_aot_compilation_test.cc
@@ -83,10 +83,7 @@ ENTRY main {
   auto module_group = std::make_unique<HloModuleGroup>(std::move(module));
 
   // Stream executor is not passed as an option.
-  GpuTargetConfig gpu_target_config;
-  gpu_target_config.gpu_device_info = GetGpuDeviceInfo(stream_exec);
-  gpu_target_config.platform_name = stream_exec->platform()->Name();
-
+  GpuTargetConfig gpu_target_config(stream_exec);
   AotCompilationOptions aot_options(compiler.PlatformId());
   aot_options.set_target_config(gpu_target_config);
 
diff --git a/third_party/xla/xla/service/gpu/gpu_compiler.cc b/third_party/xla/xla/service/gpu/gpu_compiler.cc
index 21374a6e7f29bf..8214b862c7a470 100644
--- a/third_party/xla/xla/service/gpu/gpu_compiler.cc
+++ b/third_party/xla/xla/service/gpu/gpu_compiler.cc
@@ -105,7 +105,6 @@ limitations under the License.
 #include "xla/service/gpu/gpu_conv_rewriter.h"
 #include "xla/service/gpu/gpu_convert_async_collectives_to_sync.h"
 #include "xla/service/gpu/gpu_cost_model_stats_collection.h"
-#include "xla/service/gpu/gpu_device_info.h"
 #include "xla/service/gpu/gpu_executable.h"
 #include "xla/service/gpu/gpu_float_support.h"
 #include "xla/service/gpu/gpu_hlo_cost_analysis.h"
@@ -414,7 +413,7 @@ Status GpuCompiler::OptimizeHloModule(HloModule* hlo_module,
       } else {
         // Use a simple mesh shape if not specified.
         option.device_mesh_shape = {
-            gpu_target_config.gpu_device_info.core_count, 1};
+            gpu_target_config.gpu_device_info.core_count(), 1};
       }
       if (!hlo_module->config().auto_spmd_partitioning_mesh_ids().empty()) {
         option.device_mesh_ids =
@@ -457,7 +456,7 @@ Status GpuCompiler::OptimizeHloModule(HloModule* hlo_module,
 
     HloPredicate upcaster_filter = [&](const HloInstruction* instr) {
       const auto* cuda_cc = std::get_if<se::CudaComputeCapability>(
-          &gpu_target_config.gpu_device_info.compute_capability);
+          &gpu_target_config.gpu_device_info.gpu_compute_capability());
       if (cuda_cc != nullptr &&
           !cuda_cc->IsAtLeast(se::CudaComputeCapability::VOLTA)) {
         return true;
@@ -696,7 +695,7 @@ Status GpuCompiler::OptimizeHloModule(HloModule* hlo_module,
   // Run target-specific HLO optimization passes for convolution
   // canonicalization.
   se::GpuComputeCapability gpu_version =
-      gpu_target_config.gpu_device_info.compute_capability;
+      gpu_target_config.gpu_device_info.gpu_compute_capability();
   se::dnn::VersionInfo dnn_version = gpu_target_config.dnn_version_info;
   if (stream_exec != nullptr) {
     gpu_version = GetGpuVersion(stream_exec);
@@ -735,7 +734,8 @@ Status GpuCompiler::OptimizeHloModule(HloModule* hlo_module,
       hlo_module, stream_exec, options, gpu_target_config, autotune_results,
       thread_pool));
 
-  const GpuDeviceInfo& gpu_device_info = gpu_target_config.gpu_device_info;
+  const se::DeviceDescription& gpu_device_info =
+      gpu_target_config.gpu_device_info;
 
   TF_RETURN_IF_ERROR(
       FusionPipeline(debug_options, ShapeSizeBytesFunction(), gpu_device_info)
@@ -899,7 +899,7 @@ Status GpuCompiler::OptimizeHloPostLayoutAssignment(
   // Constants:
   const DebugOptions& debug_options = hlo_module->config().debug_options();
   const se::GpuComputeCapability gpu_version =
-      gpu_target_config.gpu_device_info.compute_capability;
+      gpu_target_config.gpu_device_info.gpu_compute_capability();
   const AlgebraicSimplifierOptions simplifier_options = [&] {
     AlgebraicSimplifierOptions opts;
     opts.set_supports_non_canonical_dots(false);
@@ -957,7 +957,7 @@ Status GpuCompiler::OptimizeHloPostLayoutAssignment(
 
     // Rewrite GEMMs into custom calls.
     se::GpuComputeCapability gpu_version =
-        gpu_target_config.gpu_device_info.compute_capability;
+        gpu_target_config.gpu_device_info.gpu_compute_capability();
     const auto* cuda_cc = std::get_if<se::CudaComputeCapability>(&gpu_version);
     if (debug_options.xla_gpu_enable_triton_gemm() && cuda_cc != nullptr &&
         cuda_cc->IsAtLeast(se::CudaComputeCapability::VOLTA)) {
@@ -1155,7 +1155,8 @@ Status RunPostSchedulingCopyInsertion(
 
 StatusOr<std::unique_ptr<BufferAssignment>> GpuCompiler::AssignBuffers(
     HloModule* hlo_module, se::StreamExecutor* stream_exec) {
-  const GpuDeviceInfo gpu_device_info = GetGpuDeviceInfo(stream_exec);
+  const se::DeviceDescription& gpu_device_info =
+      stream_exec->GetDeviceDescription();
   const int64_t scheduler_mem_limit =
       GetSchedulerMemoryLimit(hlo_module, gpu_device_info, pointer_size_);
   TF_RETURN_IF_ERROR(
@@ -1466,7 +1467,8 @@ StatusOr<std::unique_ptr<Executable>> GpuCompiler::RunBackend(
 
   llvm::LLVMContext llvm_context;
 
-  const GpuDeviceInfo gpu_device_info = GetGpuDeviceInfo(stream_exec);
+  const se::DeviceDescription& gpu_device_info =
+      stream_exec->GetDeviceDescription();
 
   if (module->config().hlo_profiling_enabled() || VLOG_IS_ON(1)) {
     HloCostAnalysis::Options cost_analysis_options{ShapeSizeBytesFunction()};
@@ -1610,8 +1612,9 @@ GpuCompiler::CompileAheadOfTime(std::unique_ptr<HloModuleGroup> module_group,
 
     const int64_t scheduler_mem_limit = GetSchedulerMemoryLimit(
         module.get(),
-        gpu_target_config != nullptr ? gpu_target_config->gpu_device_info
-                                     : GetGpuDeviceInfo(options.executor()),
+        gpu_target_config != nullptr
+            ? gpu_target_config->gpu_device_info
+            : options.executor()->GetDeviceDescription(),
         pointer_size_);
     TF_RETURN_IF_ERROR(
         ScheduleGpuModule(module.get(), pointer_size_, scheduler_mem_limit));
@@ -1633,7 +1636,7 @@ GpuCompiler::CompileAheadOfTime(std::unique_ptr<HloModuleGroup> module_group,
       TF_RETURN_IF_ERROR(CompileModuleToLlvmIrImpl(
           module.get(), &llvm_context, target_triple_, data_layout_,
           stream_exec->platform()->Name(), options.PlatformId(),
-          GetGpuDeviceInfo(stream_exec), GetCanShareBuffer(),
+          stream_exec->GetDeviceDescription(), GetCanShareBuffer(),
           BufferSizeBytesFunction(), &compile_module_results));
     }
     if (user_pre_optimization_hook_) {
@@ -1647,7 +1650,7 @@ GpuCompiler::CompileAheadOfTime(std::unique_ptr<HloModuleGroup> module_group,
           backend_result,
           CompileToTargetBinary(
               module->config(), std::move(compile_module_results.llvm_module),
-              gpu_target_config->gpu_device_info.compute_capability,
+              gpu_target_config->gpu_device_info.gpu_compute_capability(),
               options.executor(), {options.device_allocator()}, module.get()));
     } else {
       TF_ASSIGN_OR_RETURN(
diff --git a/third_party/xla/xla/service/gpu/gpu_compiler.h b/third_party/xla/xla/service/gpu/gpu_compiler.h
index 8deac1552da5ae..756ab4a95a135e 100644
--- a/third_party/xla/xla/service/gpu/gpu_compiler.h
+++ b/third_party/xla/xla/service/gpu/gpu_compiler.h
@@ -29,7 +29,6 @@ limitations under the License.
 #include "xla/service/gpu/autotuner_util.h"
 #include "xla/service/gpu/buffer_sharing.h"
 #include "xla/service/gpu/executable.pb.h"
-#include "xla/service/gpu/gpu_device_info.h"
 #include "xla/service/gpu/gpu_executable.h"
 #include "xla/service/gpu/gpu_target_config.h"
 #include "xla/service/hlo.pb.h"
@@ -37,6 +36,7 @@ limitations under the License.
 #include "xla/service/hlo_pass_pipeline.h"
 #include "xla/service/llvm_compiler.h"
 #include "xla/statusor.h"
+#include "xla/stream_executor/device_description.h"
 #include "xla/stream_executor/device_description.pb.h"
 #include "xla/stream_executor/stream_executor.h"
 #include "xla/util.h"
@@ -125,11 +125,7 @@ class GpuCompiler : public LLVMCompiler {
   se::GpuComputeCapability GetGpuVersion(se::StreamExecutor* stream_exec);
 
   GpuTargetConfig GetGpuTargetConfig(se::StreamExecutor* stream_exec) {
-    GpuTargetConfig gpu_target_config;
-    gpu_target_config.gpu_device_info = GetGpuDeviceInfo(stream_exec);
-    gpu_target_config.platform_name = stream_exec->platform()->Name();
-
-    return gpu_target_config;
+    return GpuTargetConfig(stream_exec);
   }
 
   StatusOr<std::unique_ptr<Executable>> RunBackend(
diff --git a/third_party/xla/xla/service/gpu/gpu_cost_model_stats_collection.h b/third_party/xla/xla/service/gpu/gpu_cost_model_stats_collection.h
index 51ff0cf116c47a..678327c899cae1 100644
--- a/third_party/xla/xla/service/gpu/gpu_cost_model_stats_collection.h
+++ b/third_party/xla/xla/service/gpu/gpu_cost_model_stats_collection.h
@@ -21,11 +21,11 @@ limitations under the License.
 #include "absl/strings/string_view.h"
 #include "xla/hlo/ir/hlo_instruction.h"
 #include "xla/hlo/ir/hlo_module.h"
-#include "xla/service/gpu/gpu_device_info.h"
 #include "xla/service/gpu/gpu_hlo_cost_analysis.h"
 #include "xla/service/hlo_cost_analysis.h"
 #include "xla/service/hlo_pass_interface.h"
 #include "xla/statusor.h"
+#include "xla/stream_executor/device_description.h"
 
 namespace xla {
 namespace gpu {
@@ -35,7 +35,7 @@ namespace gpu {
 class GpuCostModelStatsCollection : public HloModulePass {
  public:
   explicit GpuCostModelStatsCollection(
-      const GpuDeviceInfo& d,
+      const se::DeviceDescription& d,
       const GpuHloCostAnalysis::Options& cost_analysis_options)
       : device_info_(d), cost_analysis_(cost_analysis_options, &device_info_) {}
 
@@ -49,7 +49,7 @@ class GpuCostModelStatsCollection : public HloModulePass {
       const absl::flat_hash_set<absl::string_view>& execution_threads) override;
 
  private:
-  const GpuDeviceInfo device_info_;
+  se::DeviceDescription device_info_;
   GpuHloCostAnalysis cost_analysis_;
 };
 
diff --git a/third_party/xla/xla/service/gpu/gpu_device_info.cc b/third_party/xla/xla/service/gpu/gpu_device_info.cc
deleted file mode 100644
index 160d86b7bb43ce..00000000000000
--- a/third_party/xla/xla/service/gpu/gpu_device_info.cc
+++ /dev/null
@@ -1,59 +0,0 @@
-/* Copyright 2022 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "xla/service/gpu/gpu_device_info.h"
-
-#include "xla/stream_executor/device_description.h"
-
-namespace xla {
-namespace gpu {
-namespace {
-GpuDeviceInfo GetGpuDeviceInfo(
-    const stream_executor::DeviceDescription& device) {
-  GpuDeviceInfo device_info;
-  device_info.compute_capability = device.gpu_compute_capability();
-  device_info.threads_per_block_limit = device.threads_per_block_limit();
-  device_info.threads_per_warp = device.threads_per_warp();
-  device_info.shared_memory_per_block = device.shared_memory_per_block();
-  device_info.shared_memory_per_block_optin =
-      device.shared_memory_per_block_optin();
-  device_info.shared_memory_per_core = device.shared_memory_per_core();
-  device_info.threads_per_core_limit = device.threads_per_core_limit();
-  device_info.core_count = device.core_count();
-  device_info.fpus_per_core = device.fpus_per_core();
-  device_info.block_dim_limit_x = device.block_dim_limit().x;
-  device_info.block_dim_limit_y = device.block_dim_limit().y;
-  device_info.block_dim_limit_z = device.block_dim_limit().z;
-  device_info.memory_bandwidth = device.memory_bandwidth();
-  device_info.l2_cache_size = device.l2_cache_size();
-  device_info.clock_rate_ghz = device.clock_rate_ghz();
-  device_info.device_memory_size = device.device_memory_size();
-  return device_info;
-}
-
-}  // namespace
-
-GpuDeviceInfo GetGpuDeviceInfo(
-    const stream_executor::StreamExecutor* stream_exec) {
-  return GetGpuDeviceInfo(stream_exec->GetDeviceDescription());
-}
-
-GpuDeviceInfo GetGpuDeviceInfo(const stream_executor::Platform* platform) {
-  auto device = platform->DescriptionForDevice(0);
-  return GetGpuDeviceInfo(**device);
-}
-
-}  // namespace gpu
-}  // namespace xla
diff --git a/third_party/xla/xla/service/gpu/gpu_device_info.h b/third_party/xla/xla/service/gpu/gpu_device_info.h
deleted file mode 100644
index 7f6abc73845b85..00000000000000
--- a/third_party/xla/xla/service/gpu/gpu_device_info.h
+++ /dev/null
@@ -1,111 +0,0 @@
-/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef XLA_SERVICE_GPU_GPU_DEVICE_INFO_H_
-#define XLA_SERVICE_GPU_GPU_DEVICE_INFO_H_
-
-#include <string>
-#include <variant>
-
-#include "xla/stream_executor/device_description.h"
-#include "xla/stream_executor/device_description.pb.h"
-#include "xla/stream_executor/stream_executor.h"
-
-namespace xla {
-namespace gpu {
-
-// The information contained in these structures is also contained in
-// se::DeviceDescription, but separating these out lets us write code that does
-// not depend on stream executor.
-struct GpuDeviceInfo {
-  stream_executor::GpuComputeCapability compute_capability;
-  int threads_per_block_limit;
-  int threads_per_warp;
-  int shared_memory_per_block;
-  int shared_memory_per_block_optin;
-  int shared_memory_per_core;
-  int threads_per_core_limit;
-  int core_count;
-  int64_t fpus_per_core;
-  int block_dim_limit_x;
-  int block_dim_limit_y;
-  int block_dim_limit_z;
-  int64_t memory_bandwidth;
-  int64_t l2_cache_size;
-  float clock_rate_ghz;
-  int64_t device_memory_size;
-
-  stream_executor::GpuDeviceInfoProto ToProto() const {
-    stream_executor::GpuDeviceInfoProto proto;
-    if (auto* ptr = std::get_if<stream_executor::CudaComputeCapability>(
-            &compute_capability))
-      *proto.mutable_cuda_compute_capability() = ptr->ToProto();
-    if (auto* ptr = std::get_if<stream_executor::RocmComputeCapability>(
-            &compute_capability))
-      *proto.mutable_rocm_compute_capability() = ptr->ToProto();
-
-    proto.set_threads_per_block_limit(threads_per_block_limit);
-    proto.set_threads_per_warp(threads_per_warp);
-    proto.set_shared_memory_per_block(shared_memory_per_block);
-    proto.set_shared_memory_per_block_optin(shared_memory_per_block_optin);
-    proto.set_shared_memory_per_core(shared_memory_per_core);
-    proto.set_threads_per_core_limit(threads_per_core_limit);
-    proto.set_core_count(core_count);
-    proto.set_fpus_per_core(fpus_per_core);
-    proto.set_block_dim_limit_x(block_dim_limit_x);
-    proto.set_block_dim_limit_y(block_dim_limit_y);
-    proto.set_block_dim_limit_z(block_dim_limit_z);
-    proto.set_memory_bandwidth(memory_bandwidth);
-    proto.set_l2_cache_size(l2_cache_size);
-    proto.set_clock_rate_ghz(clock_rate_ghz);
-    proto.set_device_memory_size(device_memory_size);
-    return proto;
-  }
-
-  GpuDeviceInfo() = default;
-  explicit GpuDeviceInfo(const stream_executor::GpuDeviceInfoProto& proto) {
-    if (proto.has_cuda_compute_capability()) {
-      compute_capability = stream_executor::CudaComputeCapability(
-          proto.cuda_compute_capability());
-    } else {
-      compute_capability = stream_executor::RocmComputeCapability(
-          proto.rocm_compute_capability());
-    }
-    threads_per_block_limit = proto.threads_per_block_limit();
-    threads_per_warp = proto.threads_per_warp();
-    shared_memory_per_block = proto.shared_memory_per_block();
-    shared_memory_per_block_optin = proto.shared_memory_per_block_optin();
-    shared_memory_per_core = proto.shared_memory_per_core();
-    threads_per_core_limit = proto.threads_per_core_limit();
-    core_count = proto.core_count();
-    fpus_per_core = proto.fpus_per_core();
-    block_dim_limit_x = proto.block_dim_limit_x();
-    block_dim_limit_y = proto.block_dim_limit_y();
-    block_dim_limit_z = proto.block_dim_limit_z();
-    memory_bandwidth = proto.memory_bandwidth();
-    l2_cache_size = proto.l2_cache_size();
-    clock_rate_ghz = proto.clock_rate_ghz();
-    device_memory_size = proto.device_memory_size();
-  }
-};
-
-GpuDeviceInfo GetGpuDeviceInfo(
-    const stream_executor::StreamExecutor* stream_exec);
-GpuDeviceInfo GetGpuDeviceInfo(const stream_executor::Platform* platform);
-
-}  // namespace gpu
-}  // namespace xla
-
-#endif  // XLA_SERVICE_GPU_GPU_DEVICE_INFO_H_
diff --git a/third_party/xla/xla/service/gpu/gpu_device_info_for_tests.cc b/third_party/xla/xla/service/gpu/gpu_device_info_for_tests.cc
index 565f2dfd8a917c..5f88847b3352a4 100644
--- a/third_party/xla/xla/service/gpu/gpu_device_info_for_tests.cc
+++ b/third_party/xla/xla/service/gpu/gpu_device_info_for_tests.cc
@@ -15,52 +15,53 @@ limitations under the License.
 
 #include "xla/service/gpu/gpu_device_info_for_tests.h"
 
-#include "xla/service/gpu/gpu_device_info.h"
 #include "xla/stream_executor/device_description.h"
 
 namespace xla {
 namespace gpu {
 
-GpuDeviceInfo TestGpuDeviceInfo::RTXA6000DeviceInfo() {
-  GpuDeviceInfo info;
-  info.compute_capability = stream_executor::CudaComputeCapability(8, 9);
-  info.threads_per_block_limit = 1024;
-  info.threads_per_warp = 32;
-  info.shared_memory_per_block = 48 * 1024;
-  info.shared_memory_per_block_optin = 99 * 1024;
-  info.shared_memory_per_core = 100 * 1024;
-  info.threads_per_core_limit = 1536;
-  info.core_count = 84;
-  info.fpus_per_core = 128;
-  info.block_dim_limit_x = 2'147'483'647;
-  info.block_dim_limit_y = 65535;
-  info.block_dim_limit_z = 65535;
-  info.memory_bandwidth = 768'096'000'000;
-  info.l2_cache_size = 6 * 1024 * 1024;
-  info.clock_rate_ghz = 1.410;
-  info.device_memory_size = 51'050'250'240;
-  return info;
+stream_executor::DeviceDescription TestGpuDeviceInfo::RTXA6000DeviceInfo(
+    stream_executor::GpuComputeCapability cc) {
+  stream_executor::internal::DeviceDescriptionBuilder b;
+  b.set_gpu_compute_capability(cc);
+  b.set_threads_per_block_limit(1024);
+  b.set_threads_per_warp(32);
+  b.set_shared_memory_per_block(48 * 1024);
+  b.set_shared_memory_per_block_optin(99 * 1024);
+  b.set_shared_memory_per_core(100 * 1024);
+  b.set_threads_per_core_limit(1536);
+  b.set_core_count(84);
+  b.set_fpus_per_core(128);
+  b.set_block_dim_limit_x(2'147'483'647);
+  b.set_block_dim_limit_y(65535);
+  b.set_block_dim_limit_z(65535);
+  b.set_memory_bandwidth(768'096'000'000);
+  b.set_l2_cache_size(6 * 1024 * 1024);
+  b.set_clock_rate_ghz(1.410);
+  b.set_device_memory_size(51'050'250'240);
+  return b.BuildObject();
 }
 
-GpuDeviceInfo TestGpuDeviceInfo::AMDMI210DeviceInfo() {
-  GpuDeviceInfo info;
-  info.compute_capability = stream_executor::RocmComputeCapability("gfx90a");
-  info.threads_per_block_limit = 1024;
-  info.threads_per_warp = 64;
-  info.shared_memory_per_block = 64 * 1024;
-  info.shared_memory_per_block_optin = 0;
-  info.shared_memory_per_core = 64 * 1024;
-  info.threads_per_core_limit = 2048;
-  info.core_count = 104;
-  info.fpus_per_core = 0;
-  info.block_dim_limit_x = 2'147'483'647;
-  info.block_dim_limit_y = 2'147'483'647;
-  info.block_dim_limit_z = 2'147'483'647;
-  info.memory_bandwidth = 1'638'400'000'000;
-  info.l2_cache_size = 8 * 1024 * 1024;
-  info.clock_rate_ghz = 1.7;
-  info.device_memory_size = 67'628'957'696;
-  return info;
+stream_executor::DeviceDescription TestGpuDeviceInfo::AMDMI210DeviceInfo() {
+  stream_executor::internal::DeviceDescriptionBuilder b;
+  b.set_gpu_compute_capability(
+      stream_executor::RocmComputeCapability("gfx90a"));
+  b.set_threads_per_block_limit(1024);
+  b.set_threads_per_warp(64);
+  b.set_shared_memory_per_block(64 * 1024);
+  b.set_shared_memory_per_block_optin(0);
+  b.set_shared_memory_per_core(64 * 1024);
+  b.set_threads_per_core_limit(2048);
+  b.set_core_count(104);
+  b.set_fpus_per_core(0);
+  b.set_block_dim_limit_x(2'147'483'647);
+  b.set_block_dim_limit_y(2'147'483'647);
+  b.set_block_dim_limit_z(2'147'483'647);
+  b.set_memory_bandwidth(1'638'400'000'000);
+  b.set_l2_cache_size(8 * 1024 * 1024);
+  b.set_clock_rate_ghz(1.7);
+  b.set_device_memory_size(67'628'957'696);
+  return b.BuildObject();
 }
 
 }  // namespace gpu
diff --git a/third_party/xla/xla/service/gpu/gpu_device_info_for_tests.h b/third_party/xla/xla/service/gpu/gpu_device_info_for_tests.h
index 79f93a4617ccc3..2dab6d5a90c07c 100644
--- a/third_party/xla/xla/service/gpu/gpu_device_info_for_tests.h
+++ b/third_party/xla/xla/service/gpu/gpu_device_info_for_tests.h
@@ -16,15 +16,17 @@ limitations under the License.
 #ifndef XLA_SERVICE_GPU_GPU_DEVICE_INFO_FOR_TESTS_H_
 #define XLA_SERVICE_GPU_GPU_DEVICE_INFO_FOR_TESTS_H_
 
-#include "xla/service/gpu/gpu_device_info.h"
+#include "xla/stream_executor/device_description.h"
 
 namespace xla {
 namespace gpu {
 
 class TestGpuDeviceInfo {
  public:
-  static GpuDeviceInfo RTXA6000DeviceInfo();
-  static GpuDeviceInfo AMDMI210DeviceInfo();
+  static stream_executor::DeviceDescription RTXA6000DeviceInfo(
+      stream_executor::GpuComputeCapability cc =
+          stream_executor::CudaComputeCapability(8, 9));
+  static stream_executor::DeviceDescription AMDMI210DeviceInfo();
 };
 
 }  // namespace gpu
diff --git a/third_party/xla/xla/service/gpu/gpu_device_info_test.cc b/third_party/xla/xla/service/gpu/gpu_device_info_test.cc
deleted file mode 100644
index 8d1ec3e65ef239..00000000000000
--- a/third_party/xla/xla/service/gpu/gpu_device_info_test.cc
+++ /dev/null
@@ -1,157 +0,0 @@
-/* Copyright 2022 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "xla/service/gpu/gpu_device_info.h"
-
-#include <string>
-
-#include "absl/strings/string_view.h"
-#include "xla/service/gpu/gpu_device_info_for_tests.h"
-#include "xla/stream_executor/device_description.h"
-#include "xla/stream_executor/gpu/gpu_executor.h"
-#include "tsl/platform/test.h"
-
-#if TENSORFLOW_USE_ROCM
-#include "rocm/rocm_config.h"
-#endif
-
-namespace xla {
-namespace gpu {
-namespace {
-
-namespace se = stream_executor;
-
-TEST(DeviceInfoTest, DeviceInfoIsCorrect) {
-  std::string test_platform = "cuda";
-#if TENSORFLOW_USE_ROCM
-  test_platform = "rocm";
-#endif
-
-  se::Platform* platform =
-      se::MultiPlatformManager::PlatformWithName(test_platform).value();
-  se::StreamExecutor* executor = platform->ExecutorForDevice(0).value();
-  const GpuDeviceInfo dev_info = GetGpuDeviceInfo(executor);
-  absl::string_view name(executor->GetDeviceDescription().name());
-  if (name == "NVIDIA RTX A6000") {
-    GpuDeviceInfo test_info = TestGpuDeviceInfo::RTXA6000DeviceInfo();
-    EXPECT_THAT(
-        dev_info,
-        ::testing::FieldsAre(
-            test_info.compute_capability, test_info.threads_per_block_limit,
-            test_info.threads_per_warp, test_info.shared_memory_per_block,
-            test_info.shared_memory_per_block_optin,
-            test_info.shared_memory_per_core, test_info.threads_per_core_limit,
-            test_info.core_count, test_info.fpus_per_core,
-            test_info.block_dim_limit_x, test_info.block_dim_limit_y,
-            test_info.block_dim_limit_z, test_info.memory_bandwidth,
-            test_info.l2_cache_size,
-            // Clock rate can vary between base and boost values.
-            ::testing::Ge(test_info.clock_rate_ghz),
-            dev_info.device_memory_size));
-  } else if (name == "Quadro P1000") {
-    EXPECT_THAT(
-        dev_info,
-        ::testing::FieldsAre(
-            se::GpuComputeCapability(se::CudaComputeCapability(6, 1)),
-            /*threads_per_block_limit=*/1024,
-            /*threads_per_warp=*/32, /*shared_memory_per_block=*/48 * 1024,
-            /*shared_memory_per_block_optin=*/48 * 1024,
-            /*shared_memory_per_core=*/96 * 1024,
-            /*threads_per_core_limit=*/2048, /*core_count=*/5,
-            /*fpus_per_core=*/128,
-            /*block_dim_limit_x=*/2'147'483'647,
-            /*block_dim_limit_y=*/65535,
-            /*block_dim_limit_z=*/65535,
-            /*memory_bandwidth=*/80'160'000'000, /*l2_cache_size=*/1024 * 1024,
-            /*clock_rate_ghz=*/::testing::Ge(1.4),
-            /*device_memory_size=*/4'234'346'496));
-  } else if (name == "Tesla P100-SXM2-16GB") {
-    EXPECT_THAT(dev_info,
-                ::testing::FieldsAre(
-                    se::GpuComputeCapability(se::CudaComputeCapability(6, 0)),
-                    /*threads_per_block_limit=*/1024,
-                    /*threads_per_warp=*/32,
-                    /*shared_memory_per_block=*/48 * 1024,
-                    /*shared_memory_per_block_optin=*/48 * 1024,
-                    /*shared_memory_per_core=*/64 * 1024,
-                    /*threads_per_core_limit=*/2048, /*core_count=*/56,
-                    /*fpus_per_core=*/64,
-                    /*block_dim_limit_x=*/2'147'483'647,
-                    /*block_dim_limit_y=*/65535,
-                    /*block_dim_limit_z=*/65535,
-                    /*memory_bandwidth=*/732'160'000'000,
-                    /*l2_cache_size=*/4 * 1024 * 1024,
-                    /*clock_rate_ghz=*/::testing::Ge(1.4),
-                    /*device_memory_size=*/17'066'622'976));
-  }
-#if TF_ROCM_VERSION >= 50500
-  else if (name == "AMD Instinct MI210") {  // NOLINT
-    GpuDeviceInfo test_info = TestGpuDeviceInfo::AMDMI210DeviceInfo();
-    EXPECT_THAT(
-        dev_info,
-        ::testing::FieldsAre(
-            test_info.compute_capability, test_info.threads_per_block_limit,
-            test_info.threads_per_warp, test_info.shared_memory_per_block,
-            test_info.shared_memory_per_block_optin,
-            test_info.shared_memory_per_core, test_info.threads_per_core_limit,
-            test_info.core_count, test_info.fpus_per_core,
-            test_info.block_dim_limit_x, test_info.block_dim_limit_y,
-            test_info.block_dim_limit_z, test_info.memory_bandwidth,
-            test_info.l2_cache_size, ::testing::Ge(test_info.clock_rate_ghz),
-            dev_info.device_memory_size));
-  } else if (name == "AMD Instinct MI100") {
-    EXPECT_THAT(
-        dev_info,
-        ::testing::FieldsAre(
-            se::GpuComputeCapability(se::RocmComputeCapability("gfx908")),
-            /*threads_per_block_limit=*/1024,
-            /*threads_per_warp=*/64, /*shared_memory_per_block=*/64 * 1024,
-            /*shared_memory_per_block_optin=*/0,
-            /*shared_memory_per_core=*/64 * 1024,
-            /*threads_per_core_limit=*/2560, /*core_count=*/120,
-            /*fpus_per_core=*/0, /*block_dim_limit_x=*/2'147'483'647,
-            /*block_dim_limit_y=*/2'147'483'647,
-            /*block_dim_limit_z=*/2'147'483'647,
-            /*memory_bandwidth=*/1228800000000,
-            /*l2_cache_size=*/8 * 1024 * 1024,
-            /*clock_rate_ghz=*/::testing::Ge(1.5),
-            /*device_memory_size=*/33'806'090'240));
-  } else if (name == "AMD Instinct MI50/MI60") {
-    EXPECT_THAT(
-        dev_info,
-        ::testing::FieldsAre(
-            se::GpuComputeCapability(se::RocmComputeCapability("gfx906")),
-            /*threads_per_block_limit=*/1024,
-            /*threads_per_warp=*/64, /*shared_memory_per_block=*/64 * 1024,
-            /*shared_memory_per_block_optin=*/0,
-            /*shared_memory_per_core=*/64 * 1024,
-            /*threads_per_core_limit=*/2560, /*core_count=*/60,
-            /*fpus_per_core=*/0, /*block_dim_limit_x=*/2'147'483'647,
-            /*block_dim_limit_y=*/2'147'483'647,
-            /*block_dim_limit_z=*/2'147'483'647,
-            /*memory_bandwidth=*/256000000000,
-            /*l2_cache_size=*/8 * 1024 * 1024,
-            /*clock_rate_ghz=*/::testing::Ge(1.7),
-            /*device_memory_size=*/17'163'091'968));
-  }
-#endif    // TF_ROCM_VERSION >= 50500
-  else {  // NOLINT
-    VLOG(1) << "Not tested for " << name;
-  }
-}
-
-}  // namespace
-}  // namespace gpu
-}  // namespace xla
diff --git a/third_party/xla/xla/service/gpu/gpu_fusible.cc b/third_party/xla/xla/service/gpu/gpu_fusible.cc
index 1c4cde6e0ba223..51daedc4681692 100644
--- a/third_party/xla/xla/service/gpu/gpu_fusible.cc
+++ b/third_party/xla/xla/service/gpu/gpu_fusible.cc
@@ -645,14 +645,14 @@ static int64_t NumUnnestedReductions(const HloInstruction& instr,
 // to true to enable more fusion.
 FusionDecision FusionFitsInBudget(const HloInstruction& instr1,
                                   const HloInstruction& instr2,
-                                  const GpuDeviceInfo& device_info,
+                                  const se::DeviceDescription& device_info,
                                   bool is_consumer_producer_fusion,
                                   FusionInfoCache* cache /*=nullptr*/) {
   if (SharedMemoryUsage(instr1, cache) + SharedMemoryUsage(instr2, cache) >
-      device_info.shared_memory_per_block) {
+      device_info.shared_memory_per_block()) {
     return FusionDecision{}
            << "shared memory usage would be over the budget of "
-           << device_info.shared_memory_per_block << "B";
+           << device_info.shared_memory_per_block() << "B";
   }
 
   if (NumUnnestedReductions(instr1, cache) +
diff --git a/third_party/xla/xla/service/gpu/gpu_fusible.h b/third_party/xla/xla/service/gpu/gpu_fusible.h
index c29faa748f277d..ec02a6c2d04ad4 100644
--- a/third_party/xla/xla/service/gpu/gpu_fusible.h
+++ b/third_party/xla/xla/service/gpu/gpu_fusible.h
@@ -20,9 +20,9 @@ limitations under the License.
 
 #include "xla/hlo/ir/hlo_computation.h"
 #include "xla/hlo/ir/hlo_instruction.h"
-#include "xla/service/gpu/gpu_device_info.h"
 #include "xla/service/gpu/reduction_utils.h"
 #include "xla/service/instruction_fusion.h"
+#include "xla/stream_executor/device_description.h"
 
 // TODO(b/112957171): Extract logic to determine fusibility of HLO ops from
 // GpuInstructionFusion, FusionMerger, and GpuMultiOutputFusion.
@@ -102,7 +102,7 @@ bool IsInputFusibleScatter(const HloInstruction& instr);
 // the producer, set consumer_producer_fusion to true to enable more fusion.
 FusionDecision FusionFitsInBudget(const HloInstruction& instr1,
                                   const HloInstruction& instr2,
-                                  const GpuDeviceInfo& device_info,
+                                  const se::DeviceDescription& device_info,
                                   bool is_consumer_producer_fusion = false,
                                   FusionInfoCache* cache = nullptr);
 
diff --git a/third_party/xla/xla/service/gpu/gpu_hlo_cost_analysis.cc b/third_party/xla/xla/service/gpu/gpu_hlo_cost_analysis.cc
index cdc8cf6e9a4ff6..cf979c35cab7bf 100644
--- a/third_party/xla/xla/service/gpu/gpu_hlo_cost_analysis.cc
+++ b/third_party/xla/xla/service/gpu/gpu_hlo_cost_analysis.cc
@@ -29,7 +29,6 @@ limitations under the License.
 #include "xla/service/elemental_ir_emitter.h"
 #include "xla/service/gpu/backend_configs.pb.h"
 #include "xla/service/gpu/cublas_cudnn.h"
-#include "xla/service/gpu/gpu_device_info.h"
 #include "xla/service/gpu/hlo_op_profile.pb.h"
 #include "xla/service/gpu/hlo_op_profiles.h"
 #include "xla/stream_executor/device_description.h"
@@ -300,15 +299,15 @@ const ProfilesNestedMap* LoadOpProfiles() {
   return ret;
 }
 
-int64_t FlopsPerElement(const GpuDeviceInfo* device_info,
+int64_t FlopsPerElement(const se::DeviceDescription* device_info,
                         const PrimitiveType type, const HloOpcode opcode) {
   std::string compute_capability = "<unknown>";
   if (device_info != nullptr) {
     if (auto* ptr = std::get_if<stream_executor::CudaComputeCapability>(
-            &device_info->compute_capability))
+            &device_info->gpu_compute_capability()))
       compute_capability = absl::StrCat("sm_", ptr->major, ptr->minor);
     if (auto* ptr = std::get_if<stream_executor::RocmComputeCapability>(
-            &device_info->compute_capability))
+            &device_info->gpu_compute_capability()))
       compute_capability = ptr->gfx_version();
   }
 
diff --git a/third_party/xla/xla/service/gpu/gpu_hlo_cost_analysis.h b/third_party/xla/xla/service/gpu/gpu_hlo_cost_analysis.h
index 3ef1e3e2fb27d1..c8d1ab86aff9ac 100644
--- a/third_party/xla/xla/service/gpu/gpu_hlo_cost_analysis.h
+++ b/third_party/xla/xla/service/gpu/gpu_hlo_cost_analysis.h
@@ -19,8 +19,8 @@ limitations under the License.
 #include <memory>
 
 #include "absl/strings/string_view.h"
-#include "xla/service/gpu/gpu_device_info.h"
 #include "xla/service/hlo_cost_analysis.h"
+#include "xla/stream_executor/device_description.h"
 
 namespace xla {
 namespace gpu {
@@ -33,8 +33,9 @@ class GpuHloCostAnalysis : public HloCostAnalysis {
   static constexpr int64_t kMaxIRSize = 10000;
 
  public:
-  explicit GpuHloCostAnalysis(const Options& options,
-                              const GpuDeviceInfo* device_info = nullptr)
+  explicit GpuHloCostAnalysis(
+      const Options& options,
+      const se::DeviceDescription* device_info = nullptr)
       : HloCostAnalysis(options), device_info_(device_info) {}
 
   Status Preprocess(const HloInstruction* hlo) override;
@@ -65,7 +66,7 @@ class GpuHloCostAnalysis : public HloCostAnalysis {
   float CommonElementwiseUtilization(const HloInstruction* a,
                                      const HloInstruction* b) const;
 
-  const GpuDeviceInfo* device_info_;
+  const se::DeviceDescription* device_info_;
 
  protected:
   std::unique_ptr<HloCostAnalysis> CreateNestedCostAnalysis() override;
diff --git a/third_party/xla/xla/service/gpu/gpu_hlo_schedule.cc b/third_party/xla/xla/service/gpu/gpu_hlo_schedule.cc
index 43392ad384ccdf..c3fc3b39b94340 100644
--- a/third_party/xla/xla/service/gpu/gpu_hlo_schedule.cc
+++ b/third_party/xla/xla/service/gpu/gpu_hlo_schedule.cc
@@ -678,7 +678,7 @@ HloInstructionSequence PostProcessSchedule(
 // Compute the device memory limit to be used by passes like scheduler and
 // HLO rematerialization.
 int64_t GetSchedulerMemoryLimit(const HloModule* module,
-                                const GpuDeviceInfo& gpu_device_info,
+                                const se::DeviceDescription& gpu_device_info,
                                 int pointer_size) {
   // There is a "base" value which is either specified in HloModuleConfig (this
   // value should take into account the fact that we need to leave some memory
@@ -691,7 +691,7 @@ int64_t GetSchedulerMemoryLimit(const HloModule* module,
   const int64_t base_limit =
       module->config().device_memory_size() != 0
           ? module->config().device_memory_size()
-          : gpu_device_info.device_memory_size * 80 / 100;
+          : gpu_device_info.device_memory_size() * 80 / 100;
 
   // Find the total size of inputs and outputs.
   int64_t total_io_size = 0;
diff --git a/third_party/xla/xla/service/gpu/gpu_hlo_schedule.h b/third_party/xla/xla/service/gpu/gpu_hlo_schedule.h
index df2237d266f275..ef6e86b434d5eb 100644
--- a/third_party/xla/xla/service/gpu/gpu_hlo_schedule.h
+++ b/third_party/xla/xla/service/gpu/gpu_hlo_schedule.h
@@ -17,7 +17,7 @@ limitations under the License.
 #define XLA_SERVICE_GPU_GPU_HLO_SCHEDULE_H_
 
 #include "xla/hlo/ir/hlo_module.h"
-#include "xla/service/gpu/gpu_device_info.h"
+#include "xla/stream_executor/device_description.h"
 
 namespace xla {
 namespace gpu {
@@ -30,7 +30,7 @@ Status ScheduleGpuModule(HloModule* module, int64_t pointer_size,
 HloInstructionSequence PostProcessSchedule(const HloInstructionSequence& input);
 
 int64_t GetSchedulerMemoryLimit(const HloModule* module,
-                                const GpuDeviceInfo& gpu_device_info,
+                                const se::DeviceDescription& gpu_device_info,
                                 int pointer_size);
 
 constexpr absl::string_view kFingerprintBeforeLHS = "fingerprint_before_lhs";
diff --git a/third_party/xla/xla/service/gpu/gpu_hlo_schedule_test.cc b/third_party/xla/xla/service/gpu/gpu_hlo_schedule_test.cc
index 9700b43d685975..985095286df5d2 100644
--- a/third_party/xla/xla/service/gpu/gpu_hlo_schedule_test.cc
+++ b/third_party/xla/xla/service/gpu/gpu_hlo_schedule_test.cc
@@ -27,7 +27,7 @@ limitations under the License.
 #include "xla/hlo/ir/hlo_opcode.h"
 #include "xla/hlo/ir/hlo_schedule.h"
 #include "xla/hlo/utils/hlo_query.h"
-#include "xla/service/gpu/gpu_device_info.h"
+#include "xla/stream_executor/device_description.h"
 #include "xla/tests/hlo_test_base.h"
 #include "xla/tests/test_utils.h"
 #include "tsl/profiler/protobuf/profiled_instructions.pb.h"
@@ -44,11 +44,11 @@ class GpuHloScheduleTest : public HloTestBase {
 
   SequentialHloOrdering BuildHloOrdering(HloModule* module) {
     Backend& test_backend = backend();
-    const GpuDeviceInfo gpu_device_info =
-        GetGpuDeviceInfo(test_backend.default_stream_executor());
+    const se::DeviceDescription& gpu_device_info =
+        test_backend.default_stream_executor()->GetDeviceDescription();
     TF_CHECK_OK(ScheduleGpuModule(
         module, /*pointer_size=*/8,
-        /*memory_size=*/gpu_device_info.device_memory_size * 8 / 10));
+        /*memory_size=*/gpu_device_info.device_memory_size() * 8 / 10));
     return SequentialHloOrdering{module->schedule()};
   }
 
diff --git a/third_party/xla/xla/service/gpu/gpu_performance_model.cc b/third_party/xla/xla/service/gpu/gpu_performance_model.cc
index 431c09c0812450..6f81fee249e40f 100644
--- a/third_party/xla/xla/service/gpu/gpu_performance_model.cc
+++ b/third_party/xla/xla/service/gpu/gpu_performance_model.cc
@@ -58,12 +58,12 @@ bool FusionUsesParameterElementwiseFromRoot(
 // Estimate read time of n_bytes_total bytes from global memory on a
 // given GPU. Account for L1 / L2 cache speedup if the input's nominal size
 // n_bytes_net is small.
-absl::Duration ReadTime(const GpuDeviceInfo& gpu_device_info,
+absl::Duration ReadTime(const se::DeviceDescription& gpu_device_info,
                         int64_t n_bytes_net, int64_t n_bytes_total) {
-  float bw = gpu_device_info.memory_bandwidth;
-  if (n_bytes_net < gpu_device_info.l2_cache_size) {
+  float bw = gpu_device_info.memory_bandwidth();
+  if (n_bytes_net < gpu_device_info.l2_cache_size()) {
     bw *= kL2CacheSpeedup;
-    if (n_bytes_net < kL1CacheSizePerSM * gpu_device_info.core_count) {
+    if (n_bytes_net < kL1CacheSizePerSM * gpu_device_info.core_count()) {
       bw *= kL1CacheSpeedup;
     }
   }
@@ -75,7 +75,8 @@ absl::Duration ReadTime(const GpuDeviceInfo& gpu_device_info,
 // inputs as if it is fused into the consumer.
 absl::Duration ProducerInputAccessTime(
     const GpuHloCostAnalysis* cost_analysis,
-    const GpuDeviceInfo& gpu_device_info, const HloInstruction* producer,
+    const se::DeviceDescription& gpu_device_info,
+    const HloInstruction* producer,
     const HloInstruction* fused_consumer = nullptr) {
   absl::Duration ret = absl::ZeroDuration();
   float producer_output_utilization = 1.f;
@@ -142,7 +143,7 @@ absl::Duration ProducerInputAccessTime(
 // Use HloFusionAnalysis for computing the actual number of threads that the
 // IR emitter will use. Return std::nullopt if this data is not available.
 std::optional<int64_t> EstimateThreadCount(
-    const HloInstruction* instr, const GpuDeviceInfo& gpu_device_info) {
+    const HloInstruction* instr, const se::DeviceDescription& gpu_device_info) {
   auto fusion = DynCast<const HloFusionInstruction>(instr);
   if (fusion == nullptr) return std::nullopt;
   auto analysis = HloFusionAnalysis::Create(fusion, &gpu_device_info);
@@ -152,11 +153,12 @@ std::optional<int64_t> EstimateThreadCount(
   return launch_dimensions->launch_bound();
 }
 
-absl::Duration ComputeTime(const GpuDeviceInfo& gpu_device_info,
+absl::Duration ComputeTime(const se::DeviceDescription& gpu_device_info,
                            int64_t n_flops, int64_t n_threads) {
-  int fpu_count = gpu_device_info.core_count * gpu_device_info.fpus_per_core;
+  int fpu_count =
+      gpu_device_info.core_count() * gpu_device_info.fpus_per_core();
   float n_threads_active = fmin(n_threads, fpu_count);
-  float flop_per_second_per_fpu = 2 * 1e9 * gpu_device_info.clock_rate_ghz;
+  float flop_per_second_per_fpu = 2 * 1e9 * gpu_device_info.clock_rate_ghz();
   float flop_per_second_effective = flop_per_second_per_fpu * n_threads_active;
   return absl::Seconds(n_flops / flop_per_second_effective);
 }
@@ -171,7 +173,7 @@ struct EstimateRunTimeData {
 
 EstimateRunTimeData EstimateRunTimeImpl(
     const HloInstruction* instr, const GpuHloCostAnalysis* cost_analysis) {
-  const GpuDeviceInfo* device_info = cost_analysis->device_info_;
+  const se::DeviceDescription* device_info = cost_analysis->device_info_;
 
   int64_t flops = cost_analysis->flop_count(*instr);
   float bytes_written = cost_analysis->output_bytes_accessed(*instr);
@@ -183,7 +185,7 @@ EstimateRunTimeData EstimateRunTimeImpl(
   const absl::Duration read_time =
       ProducerInputAccessTime(cost_analysis, *device_info, /*producer=*/instr);
   const absl::Duration write_time =
-      absl::Seconds(bytes_written / device_info->memory_bandwidth);
+      absl::Seconds(bytes_written / device_info->memory_bandwidth());
   const absl::Duration exec_time =
       std::max(compute_time, read_time + write_time);
 
@@ -210,7 +212,7 @@ GpuPerformanceModel::RunTimes GpuPerformanceModel::EstimateRunTimes(
     VLOG(10) << producer->fused_instructions_computation()->ToString();
   }
 
-  const GpuDeviceInfo* device_info = cost_analysis->device_info_;
+  const se::DeviceDescription* device_info = cost_analysis->device_info_;
 
   EstimateRunTimeData producer_data =
       EstimateRunTimeImpl(producer, cost_analysis);
@@ -290,7 +292,7 @@ void GpuPerformanceModel::RecordEstimatedRunTime(
 
   EstimateRunTimeData data = EstimateRunTimeImpl(instruction, cost_analysis);
   double cycles = absl::ToDoubleNanoseconds(data.exec_time) *
-                  cost_analysis->device_info_->clock_rate_ghz;
+                  cost_analysis->device_info_->clock_rate_ghz();
 
   auto backend_config = instruction->backend_config<FusionBackendConfig>();
   TF_CHECK_OK(backend_config.status()) << instruction->ToString();
diff --git a/third_party/xla/xla/service/gpu/gpu_performance_model_test.cc b/third_party/xla/xla/service/gpu/gpu_performance_model_test.cc
index b302284ee1d217..5bfa8a976d5fbb 100644
--- a/third_party/xla/xla/service/gpu/gpu_performance_model_test.cc
+++ b/third_party/xla/xla/service/gpu/gpu_performance_model_test.cc
@@ -26,12 +26,12 @@ limitations under the License.
 #include "xla/hlo/ir/hlo_module.h"
 #include "xla/hlo/ir/hlo_opcode.h"
 #include "xla/service/gpu/backend_configs.pb.h"
-#include "xla/service/gpu/gpu_device_info.h"
 #include "xla/service/gpu/gpu_device_info_for_tests.h"
 #include "xla/service/gpu/gpu_hlo_cost_analysis.h"
 #include "xla/service/hlo_module_config.h"
 #include "xla/shape.h"
 #include "xla/shape_util.h"
+#include "xla/stream_executor/device_description.h"
 #include "xla/tests/hlo_test_base.h"
 
 namespace xla {
@@ -46,7 +46,7 @@ class GpuPerformanceModelTest : public HloTestBase {
     };
   }
 
-  GpuDeviceInfo dev_info_{TestGpuDeviceInfo::RTXA6000DeviceInfo()};
+  se::DeviceDescription dev_info_{TestGpuDeviceInfo::RTXA6000DeviceInfo()};
 
  public:
   GpuHloCostAnalysis::Options options_{ShapeSizeBytesFunction(),
diff --git a/third_party/xla/xla/service/gpu/gpu_target_config.cc b/third_party/xla/xla/service/gpu/gpu_target_config.cc
index 44a5ceff8534f8..0573c47e1df24e 100644
--- a/third_party/xla/xla/service/gpu/gpu_target_config.cc
+++ b/third_party/xla/xla/service/gpu/gpu_target_config.cc
@@ -15,19 +15,25 @@ limitations under the License.
 
 #include "xla/service/gpu/gpu_target_config.h"
 
+#include "xla/stream_executor/stream_executor.h"
+
 namespace xla {
 namespace gpu {
 
+GpuTargetConfig::GpuTargetConfig(stream_executor::StreamExecutor* s)
+    : gpu_device_info(s->GetDeviceDescription().ToGpuProto()),
+      platform_name(s->platform()->Name()) {}
+
 GpuTargetConfig::GpuTargetConfig(
     const stream_executor::GpuTargetConfigProto& proto)
-    : gpu_device_info(proto.gpu_device_info()),
+    : gpu_device_info({proto.gpu_device_info()}),
       platform_name(proto.platform_name()),
       dnn_version_info(proto.dnn_version_info()),
       device_description_str(proto.device_description_str()) {}
 
 stream_executor::GpuTargetConfigProto GpuTargetConfig::ToProto() const {
   stream_executor::GpuTargetConfigProto proto;
-  *proto.mutable_gpu_device_info() = gpu_device_info.ToProto();
+  *proto.mutable_gpu_device_info() = gpu_device_info.ToGpuProto();
   proto.set_platform_name(platform_name);
   *proto.mutable_dnn_version_info() = dnn_version_info.ToProto();
   proto.set_device_description_str(device_description_str);
diff --git a/third_party/xla/xla/service/gpu/gpu_target_config.h b/third_party/xla/xla/service/gpu/gpu_target_config.h
index 4c5e40f58fda83..1845bc755116cf 100644
--- a/third_party/xla/xla/service/gpu/gpu_target_config.h
+++ b/third_party/xla/xla/service/gpu/gpu_target_config.h
@@ -16,7 +16,7 @@ limitations under the License.
 #ifndef XLA_SERVICE_GPU_GPU_TARGET_CONFIG_H_
 #define XLA_SERVICE_GPU_GPU_TARGET_CONFIG_H_
 
-#include "xla/service/gpu/gpu_device_info.h"
+#include "xla/stream_executor/device_description.h"
 #include "xla/stream_executor/device_description.pb.h"
 #include "xla/stream_executor/dnn.h"
 
@@ -26,10 +26,11 @@ namespace gpu {
 struct GpuTargetConfig {
   GpuTargetConfig() = default;
   explicit GpuTargetConfig(const stream_executor::GpuTargetConfigProto& proto);
+  explicit GpuTargetConfig(stream_executor::StreamExecutor* s);
 
   stream_executor::GpuTargetConfigProto ToProto() const;
 
-  GpuDeviceInfo gpu_device_info;
+  stream_executor::DeviceDescription gpu_device_info;
   std::string platform_name;
   stream_executor::dnn::VersionInfo dnn_version_info;
   std::string device_description_str;
diff --git a/third_party/xla/xla/service/gpu/hlo_fusion_analysis.cc b/third_party/xla/xla/service/gpu/hlo_fusion_analysis.cc
index 830961ad9894af..36006855dd580d 100644
--- a/third_party/xla/xla/service/gpu/hlo_fusion_analysis.cc
+++ b/third_party/xla/xla/service/gpu/hlo_fusion_analysis.cc
@@ -35,7 +35,6 @@ limitations under the License.
 #include "xla/hlo/utils/hlo_query.h"
 #include "xla/primitive_util.h"
 #include "xla/service/gpu/backend_configs.pb.h"
-#include "xla/service/gpu/gpu_device_info.h"
 #include "xla/service/gpu/gpu_fusible.h"
 #include "xla/service/gpu/hlo_traversal.h"
 #include "xla/service/gpu/ir_emission_utils.h"
@@ -259,7 +258,7 @@ std::optional<TransposeDescription> FindConsistentTransposeHero(
 StatusOr<HloFusionAnalysis> HloFusionAnalysis::Create(
     FusionBackendConfig backend_config,
     std::vector<const HloInstruction*> hlo_roots, FusionBoundaryFn boundary_fn,
-    const GpuDeviceInfo* device_info) {
+    const se::DeviceDescription* device_info) {
   std::vector<const HloInstruction*> heroes;
   heroes.reserve(hlo_roots.size());
   for (auto* root : hlo_roots) {
@@ -283,7 +282,8 @@ StatusOr<HloFusionAnalysis> HloFusionAnalysis::Create(
 
 // static
 StatusOr<HloFusionAnalysis> HloFusionAnalysis::Create(
-    const HloFusionInstruction* fusion, const GpuDeviceInfo* device_info) {
+    const HloFusionInstruction* fusion,
+    const se::DeviceDescription* device_info) {
   CHECK(device_info != nullptr);
   TF_ASSIGN_OR_RETURN(auto backend_config,
                       fusion->backend_config<FusionBackendConfig>());
@@ -455,7 +455,7 @@ const LaunchDimensionsConfig* HloFusionAnalysis::GetLoopFusionConfig() {
   // Call 'small' fusions that use less threads than the GPU has.
   int64_t num_elements = ShapeUtil::ElementsIn(GetElementShape());
   int64_t n_threads_max =
-      device_info_->threads_per_core_limit * device_info_->core_count;
+      device_info_->threads_per_core_limit() * device_info_->core_count();
   if (num_elements >= n_threads_max &&
       !MayPreventVectorization(fusion_roots_, fusion_boundary_fn_)) {
     unroll_factor = ComputeMaxUnrollFactor(num_elements);
@@ -543,8 +543,8 @@ int64_t HloFusionAnalysis::MaxBeneficialColumnReductionUnrollBasedOnBlockSize()
   int64_t num_threads = num_blocks * 1024;
   // Number of SMs we can saturate with this work.
   int num_cores =
-      CeilOfRatio<int64_t>(num_threads, device_info_->threads_per_core_limit);
-  return static_cast<int>(CeilOfRatio(num_cores, device_info_->core_count));
+      CeilOfRatio<int64_t>(num_threads, device_info_->threads_per_core_limit());
+  return static_cast<int>(CeilOfRatio(num_cores, device_info_->core_count()));
 }
 
 // Divides `num_reduces` reduces into groups. Different groups will be executed
@@ -743,8 +743,8 @@ bool HloFusionAnalysis::CanVectorizeReduction(
     return false;
   }
 
-  const auto* cuda_cc =
-      std::get_if<se::CudaComputeCapability>(&device_info_->compute_capability);
+  const auto* cuda_cc = std::get_if<se::CudaComputeCapability>(
+      &device_info_->gpu_compute_capability());
   if (cuda_cc == nullptr) return false;
   if (cuda_cc->IsAtLeast(se::CudaComputeCapability::VOLTA)) return true;
   if (cuda_cc->IsAtLeast(se::CudaComputeCapability::PASCAL_)) {
@@ -762,7 +762,7 @@ int HloFusionAnalysis::CalculateVirtualThreadScalingFactorForReduction(
   if (reduction_dimensions.is_row_reduction && dimx <= 128) {
     int rows_per_warp = RowReductionGetRowsPerWarp(dimx);
     const auto* cuda_cc = std::get_if<se::CudaComputeCapability>(
-        &device_info_->compute_capability);
+        &device_info_->gpu_compute_capability());
     if (cuda_cc != nullptr &&
         cuda_cc->IsAtLeast(se::CudaComputeCapability::AMPERE)) {
       return rows_per_warp * 3;
@@ -811,7 +811,7 @@ ReductionCodegenInfo HloFusionAnalysis::ComputeReductionCodegenInfo(
   auto instr_index_groups = GroupDisjointReductions();
   int64_t shmem_usage = ReductionProjectedShmemUsageBytes(reduction_dimensions,
                                                           instr_index_groups);
-  const int64_t shmem_budget = device_info_->shared_memory_per_block;
+  const int64_t shmem_budget = device_info_->shared_memory_per_block();
   bool reduction_is_race_free = ReductionIsRaceFree(
       hero_reduction->GetModule()->config(), reduction_dimensions);
   bool vectorize =
diff --git a/third_party/xla/xla/service/gpu/hlo_fusion_analysis.h b/third_party/xla/xla/service/gpu/hlo_fusion_analysis.h
index 600eae6c801e6b..131d9c09042470 100644
--- a/third_party/xla/xla/service/gpu/hlo_fusion_analysis.h
+++ b/third_party/xla/xla/service/gpu/hlo_fusion_analysis.h
@@ -24,7 +24,6 @@ limitations under the License.
 #include "xla/hlo/ir/hlo_instruction.h"
 #include "xla/hlo/ir/hlo_instructions.h"
 #include "xla/service/gpu/backend_configs.pb.h"
-#include "xla/service/gpu/gpu_device_info.h"
 #include "xla/service/gpu/hlo_traversal.h"
 #include "xla/service/gpu/ir_emission_utils.h"
 #include "xla/service/gpu/kernel_mapping_scheme.h"
@@ -51,9 +50,10 @@ class HloFusionAnalysis {
   static StatusOr<HloFusionAnalysis> Create(
       FusionBackendConfig backend_config,
       std::vector<const HloInstruction*> hlo_roots,
-      FusionBoundaryFn boundary_fn, const GpuDeviceInfo* device_info);
-  static StatusOr<HloFusionAnalysis> Create(const HloFusionInstruction* fusion,
-                                            const GpuDeviceInfo* device_info);
+      FusionBoundaryFn boundary_fn, const se::DeviceDescription* device_info);
+  static StatusOr<HloFusionAnalysis> Create(
+      const HloFusionInstruction* fusion,
+      const se::DeviceDescription* device_info);
 
   const std::vector<const HloInstruction*>& fusion_roots() const {
     return fusion_roots_;
@@ -90,7 +90,7 @@ class HloFusionAnalysis {
                     FusionBoundaryFn fusion_boundary_fn,
                     std::vector<const HloInstruction*> fusion_arguments,
                     std::vector<const HloInstruction*> fusion_heroes,
-                    const GpuDeviceInfo* device_info,
+                    const se::DeviceDescription* device_info,
                     std::optional<TransposeDescription> tiled_transpose)
       : fusion_backend_config_(std::move(fusion_backend_config)),
         fusion_roots_(std::move(fusion_roots)),
@@ -125,7 +125,7 @@ class HloFusionAnalysis {
   // are /outside/ the fusion.
   std::vector<const HloInstruction*> fusion_arguments_;
   std::vector<const HloInstruction*> fusion_heroes_;
-  const GpuDeviceInfo* device_info_;
+  const se::DeviceDescription* device_info_;
   std::optional<TransposeDescription> tiled_transpose_;
 
   std::optional<ReductionCodegenInfo> reduction_codegen_info_;
diff --git a/third_party/xla/xla/service/gpu/hlo_op_profiler.cc b/third_party/xla/xla/service/gpu/hlo_op_profiler.cc
index b88186f347eaf1..51dfdcc20e5865 100644
--- a/third_party/xla/xla/service/gpu/hlo_op_profiler.cc
+++ b/third_party/xla/xla/service/gpu/hlo_op_profiler.cc
@@ -32,13 +32,13 @@ limitations under the License.
 #include "xla/literal.h"
 #include "xla/primitive_util.h"
 #include "xla/service/executable.h"
-#include "xla/service/gpu/gpu_device_info.h"
 #include "xla/service/gpu/hlo_op_profile.pb.h"
 #include "xla/service/hlo_module_config.h"
 #include "xla/service/hlo_runner.h"
 #include "xla/shape.h"
 #include "xla/shape_util.h"
 #include "xla/statusor.h"
+#include "xla/stream_executor/device_description.h"
 #include "xla/tests/test_utils.h"
 #include "xla/util.h"
 #include "xla/xla_data.pb.h"
@@ -192,7 +192,7 @@ StatusOr<absl::Duration> HloOpProfiler::MeasureOpChainDuration(
 
 HloOpProfiler::HloOpProfiler(HloRunner& runner)
     : runner_(runner),
-      dev_info_(GetGpuDeviceInfo(runner.backend().stream_executors()[0])),
+      dev_info_(runner.backend().stream_executors()[0]->GetDeviceDescription()),
       // Twice the runtime of a copy (chain_length = 0) kernel.
       min_duration_(2 * MeasureOpChainDuration(HloOpcode::kNegate, F32, 0)
                             .value_or(absl::ZeroDuration())) {
@@ -234,7 +234,7 @@ StatusOr<HloInstructionProfile> HloOpProfiler::MeasureClockCyclesPerOp(
       (double_duration - duration) * 2.0 / chain_length;
 
   const float clocks_per_nanosecond =
-      dev_info_.clock_rate_ghz * 2;  // 2 for FMA
+      dev_info_.clock_rate_ghz() * 2;  // 2 for FMA
   const int64_t n_clocks =
       absl::ToInt64Nanoseconds(time_per_op) * clocks_per_nanosecond;
   VLOG(3) << time_per_op << " = " << n_clocks << " clock cycles";
diff --git a/third_party/xla/xla/service/gpu/hlo_op_profiler.h b/third_party/xla/xla/service/gpu/hlo_op_profiler.h
index f95b56cf87e175..0b558acf515a81 100644
--- a/third_party/xla/xla/service/gpu/hlo_op_profiler.h
+++ b/third_party/xla/xla/service/gpu/hlo_op_profiler.h
@@ -22,9 +22,9 @@ limitations under the License.
 #include "absl/time/time.h"
 #include "xla/hlo/ir/hlo_module.h"
 #include "xla/hlo/ir/hlo_opcode.h"
-#include "xla/service/gpu/gpu_device_info.h"
 #include "xla/service/gpu/hlo_op_profile.pb.h"
 #include "xla/service/hlo_runner.h"
+#include "xla/stream_executor/device_description.h"
 #include "xla/xla_data.pb.h"
 
 namespace xla {
@@ -45,7 +45,7 @@ class HloOpProfiler {
 
  private:
   HloRunner& runner_;
-  const GpuDeviceInfo dev_info_;
+  const se::DeviceDescription& dev_info_;
   absl::Duration min_duration_;
 };
 
diff --git a/third_party/xla/xla/service/gpu/hlo_op_profiler_run.cc b/third_party/xla/xla/service/gpu/hlo_op_profiler_run.cc
index 00cb80a133c1b4..a1f1d50f87bb1d 100644
--- a/third_party/xla/xla/service/gpu/hlo_op_profiler_run.cc
+++ b/third_party/xla/xla/service/gpu/hlo_op_profiler_run.cc
@@ -22,7 +22,6 @@ limitations under the License.
 #include "absl/strings/string_view.h"
 #include "xla/debug_options_flags.h"
 #include "xla/hlo/ir/hlo_opcode.h"
-#include "xla/service/gpu/gpu_device_info.h"
 #include "xla/service/gpu/hlo_op_profile.pb.h"
 #include "xla/service/gpu/hlo_op_profiler.h"
 #include "xla/service/hlo_runner.h"
@@ -78,15 +77,9 @@ int RunProfiler(int argc, char** argv) {
 
   HloRunner runner(PlatformUtil::GetPlatform("cuda").value());
   HloOpProfiler profiler(runner);
-  const auto device_info =
-      gpu::GetGpuDeviceInfo(runner.backend().stream_executors()[0]);
-  const auto compute_capability =
-      std::get<stream_executor::CudaComputeCapability>(
-          device_info.compute_capability);
-  std::string compute_capability_str =
-      absl::StrCat("sm_", compute_capability.major, compute_capability.minor);
-  VLOG(0) << compute_capability_str << " @ " << device_info.clock_rate_ghz
-          << " GHz";
+  const se::DeviceDescription& dev_info =
+      runner.backend().stream_executors()[0]->GetDeviceDescription();
+  VLOG(0) << dev_info.name() << " @ " << dev_info.clock_rate_ghz() << " GHz";
 
   const std::vector<PrimitiveType> dtypes = {
       S8, S16, S32, S64, U8, U16, U32, U64, F16, F32, F64, C64, C128,
@@ -129,8 +122,7 @@ int RunProfiler(int argc, char** argv) {
   VLOG(1) << "\n" << instr_profiles.DebugString();
 
   DeviceHloInstructionProfiles device_profiles;
-  device_profiles.mutable_entries()->insert(
-      {compute_capability_str, instr_profiles});
+  device_profiles.mutable_entries()->insert({dev_info.name(), instr_profiles});
   if (!output_file.empty()) {
     WriteOutput(device_profiles, output_file);
   }
diff --git a/third_party/xla/xla/service/gpu/horizontal_input_fusion.cc b/third_party/xla/xla/service/gpu/horizontal_input_fusion.cc
index 3cf913806b453d..58170caf37e9a7 100644
--- a/third_party/xla/xla/service/gpu/horizontal_input_fusion.cc
+++ b/third_party/xla/xla/service/gpu/horizontal_input_fusion.cc
@@ -43,7 +43,7 @@ Shape GetInputShapeForMultiOutputFusion(const HloInstruction& instr) {
 class HorizontalInputFusionImpl {
  public:
   explicit HorizontalInputFusionImpl(HloComputation* computation,
-                                     const GpuDeviceInfo& d)
+                                     const se::DeviceDescription& d)
       : computation_(computation), device_info_(d) {}
 
   ~HorizontalInputFusionImpl() {}
@@ -52,7 +52,7 @@ class HorizontalInputFusionImpl {
 
  private:
   HloComputation* computation_;
-  const GpuDeviceInfo device_info_;
+  const se::DeviceDescription& device_info_;
 };  // HorizontalInputFusionImpl
 
 // Compares one-by-one the dimensions of `shape_a` and `shape_b` from left to
diff --git a/third_party/xla/xla/service/gpu/horizontal_input_fusion.h b/third_party/xla/xla/service/gpu/horizontal_input_fusion.h
index 5b991f358e75e1..0dc45c90adb5f7 100644
--- a/third_party/xla/xla/service/gpu/horizontal_input_fusion.h
+++ b/third_party/xla/xla/service/gpu/horizontal_input_fusion.h
@@ -18,8 +18,8 @@ limitations under the License.
 
 #include "xla/hlo/ir/hlo_computation.h"
 #include "xla/hlo/ir/hlo_module.h"
-#include "xla/service/gpu/gpu_device_info.h"
 #include "xla/service/hlo_pass_interface.h"
+#include "xla/stream_executor/device_description.h"
 
 namespace xla {
 namespace gpu {
@@ -38,7 +38,8 @@ namespace gpu {
 // ROOT tuple of the entry computation.
 class GpuHorizontalInputFusion : public HloModulePass {
  public:
-  explicit GpuHorizontalInputFusion(const GpuDeviceInfo& d) : device_info_(d) {}
+  explicit GpuHorizontalInputFusion(const se::DeviceDescription& d)
+      : device_info_(d) {}
 
   absl::string_view name() const override {
     return "gpu_horizontal_input_fusion";
@@ -52,7 +53,7 @@ class GpuHorizontalInputFusion : public HloModulePass {
  private:
   StatusOr<bool> RunOnComputation(HloComputation*);
 
-  const GpuDeviceInfo device_info_;
+  const se::DeviceDescription& device_info_;
 };
 
 }  // namespace gpu
diff --git a/third_party/xla/xla/service/gpu/horizontal_loop_fusion_test.cc b/third_party/xla/xla/service/gpu/horizontal_loop_fusion_test.cc
index cf46cd36ccb0e6..30879583969493 100644
--- a/third_party/xla/xla/service/gpu/horizontal_loop_fusion_test.cc
+++ b/third_party/xla/xla/service/gpu/horizontal_loop_fusion_test.cc
@@ -293,7 +293,8 @@ TEST_F(HorizontalLoopFusionTest, HorizontalLoopFusionAfterVerticalFusion) {
                     .value();
 
   HloPassPipeline fusion("fusion");
-  const GpuDeviceInfo device_info = TestGpuDeviceInfo::RTXA6000DeviceInfo();
+  const se::DeviceDescription device_info =
+      TestGpuDeviceInfo::RTXA6000DeviceInfo();
   fusion.AddPass<xla::gpu::GpuInstructionFusion>(/*may_duplicate=*/false,
                                                  device_info);
   fusion.AddPass<xla::gpu::GpuInstructionFusion>(/*may_duplicate=*/true,
diff --git a/third_party/xla/xla/service/gpu/instruction_fusion.h b/third_party/xla/xla/service/gpu/instruction_fusion.h
index 353a29fe317f5f..bf036cbdd45017 100644
--- a/third_party/xla/xla/service/gpu/instruction_fusion.h
+++ b/third_party/xla/xla/service/gpu/instruction_fusion.h
@@ -29,17 +29,18 @@ limitations under the License.
 #include "xla/hlo/ir/hlo_module.h"
 #include "xla/service/fusion_node_indexing_evaluation.h"
 #include "xla/service/fusion_queue.h"
-#include "xla/service/gpu/gpu_device_info.h"
 #include "xla/service/hlo_pass_interface.h"
 #include "xla/service/instruction_fusion.h"
 #include "xla/statusor.h"
+#include "xla/stream_executor/device_description.h"
 
 namespace xla {
 namespace gpu {
 
 class GpuInstructionFusion : public InstructionFusion {
  public:
-  explicit GpuInstructionFusion(bool may_duplicate, const GpuDeviceInfo& d)
+  explicit GpuInstructionFusion(bool may_duplicate,
+                                const se::DeviceDescription& d)
       : InstructionFusion(GpuInstructionFusion::IsExpensive, may_duplicate),
         device_info_(d) {}
 
@@ -76,7 +77,7 @@ class GpuInstructionFusion : public InstructionFusion {
   absl::flat_hash_map<const HloInstruction*, FusionNodeIndexingEvaluation>
       fusion_node_evaluations_;
 
-  const GpuDeviceInfo device_info_;
+  se::DeviceDescription device_info_;
 };
 
 }  // namespace gpu
diff --git a/third_party/xla/xla/service/gpu/ir_emitter_context.h b/third_party/xla/xla/service/gpu/ir_emitter_context.h
index 9bffa2f3290d39..a981f0d2a2244b 100644
--- a/third_party/xla/xla/service/gpu/ir_emitter_context.h
+++ b/third_party/xla/xla/service/gpu/ir_emitter_context.h
@@ -25,7 +25,6 @@ limitations under the License.
 #include "llvm/IR/Module.h"
 #include "mlir/IR/MLIRContext.h"  // from @llvm-project
 #include "xla/service/buffer_assignment.h"
-#include "xla/service/gpu/gpu_device_info.h"
 #include "xla/service/gpu/gpu_executable.h"
 #include "xla/service/name_uniquer.h"
 #include "xla/stream_executor/device_description.h"
@@ -40,7 +39,8 @@ class IrEmitterContext {
  public:
   IrEmitterContext(const HloModule* hlo_module,
                    const BufferAssignment* buffer_assignment,
-                   std::string platform_name, GpuDeviceInfo gpu_device_info,
+                   std::string platform_name,
+                   const se::DeviceDescription& gpu_device_info,
                    mlir::MLIRContext* mlir_context, llvm::Module* llvm_module)
       : hlo_module_(hlo_module),
         buffer_assignment_(buffer_assignment),
@@ -58,15 +58,17 @@ class IrEmitterContext {
     return *buffer_assignment_;
   }
   absl::string_view platform_name() const { return platform_name_; }
-  GpuDeviceInfo gpu_device_info() const { return gpu_device_info_; }
+  const se::DeviceDescription& gpu_device_info() const {
+    return gpu_device_info_;
+  }
   se::CudaComputeCapability cuda_compute_capability() const {
     auto* cc = std::get_if<se::CudaComputeCapability>(
-        &gpu_device_info_.compute_capability);
+        &gpu_device_info_.gpu_compute_capability());
     return cc != nullptr ? *cc : se::CudaComputeCapability();
   }
   se::RocmComputeCapability rocm_compute_capability() const {
     auto* cc = std::get_if<se::RocmComputeCapability>(
-        &gpu_device_info_.compute_capability);
+        &gpu_device_info_.gpu_compute_capability());
     return cc != nullptr ? *cc : se::RocmComputeCapability();
   }
   mlir::MLIRContext* mlir_context() { return mlir_context_; }
@@ -102,7 +104,7 @@ class IrEmitterContext {
   const BufferAssignment* buffer_assignment_;
   absl::Span<const BufferAllocation> allocations_;
   std::string platform_name_;
-  GpuDeviceInfo gpu_device_info_;
+  const se::DeviceDescription& gpu_device_info_;
   mlir::MLIRContext* mlir_context_;
   llvm::Module* llvm_module_;
   NameUniquer name_uniquer_;
diff --git a/third_party/xla/xla/service/gpu/ir_emitter_triton.cc b/third_party/xla/xla/service/gpu/ir_emitter_triton.cc
index e3d2a62d3e2144..6ce0adc2366d4d 100644
--- a/third_party/xla/xla/service/gpu/ir_emitter_triton.cc
+++ b/third_party/xla/xla/service/gpu/ir_emitter_triton.cc
@@ -91,7 +91,6 @@ limitations under the License.
 #include "xla/primitive_util.h"
 #include "xla/service/dump.h"
 #include "xla/service/gpu/gemm_rewriter_triton.h"
-#include "xla/service/gpu/gpu_device_info.h"
 #include "xla/service/gpu/hlo_traversal.h"
 #include "xla/service/gpu/ir_emission_utils.h"
 #include "xla/service/gpu/launch_dimensions.h"
@@ -1622,7 +1621,8 @@ std::string GetLibdevicePath(const HloComputation* hlo_computation) {
 
 StatusOr<mlir::OwningOpRef<mlir::ModuleOp>> CreateTritonModule(
     const TritonFusionAnalysis& analysis, absl::string_view fn_name,
-    const HloComputation* hlo_computation, const GpuDeviceInfo& device_info,
+    const HloComputation* hlo_computation,
+    const se::DeviceDescription& device_info,
     const AutotuneResult::TritonGemmKey& config, TritonIrEmitter ir_emitter,
     mlir::MLIRContext& mlir_context) {
   mlir_context.loadDialect<mt::TritonDialect>();
@@ -1657,7 +1657,7 @@ StatusOr<mlir::OwningOpRef<mlir::ModuleOp>> CreateTritonModule(
 
   TF_RETURN_IF_ERROR(ir_emitter(b, GetLibdevicePath(hlo_computation), analysis,
                                 hlo_computation, fn, config,
-                                device_info.shared_memory_per_block_optin));
+                                device_info.shared_memory_per_block_optin()));
 
   b.create<mt::ReturnOp>(loc);
 
@@ -1674,7 +1674,8 @@ StatusOr<mlir::OwningOpRef<mlir::ModuleOp>> CreateTritonModule(
 StatusOr<TritonWrapperResult> TritonWrapper(
     const TritonFusionAnalysis& analysis, absl::string_view fn_name,
     const HloComputation* hlo_computation, absl::string_view fusion_kind,
-    const se::CudaComputeCapability& cc, const GpuDeviceInfo& device_info,
+    const se::CudaComputeCapability& cc,
+    const se::DeviceDescription& device_info,
     const AutotuneResult::TritonGemmKey& config, llvm::Module* llvm_module,
     TritonIrEmitter ir_emitter, mlir::MLIRContext& mlir_context) {
   if (fusion_kind == kTritonGemmFusionKind) {
@@ -1783,7 +1784,7 @@ StatusOr<TritonWrapperResult> TritonWrapper(
           ->getAttrOfType<mlir::IntegerAttr>("triton_gpu.shared")
           .getInt();
   VLOG(2) << "Shared memory usage: " << shared_mem_bytes << " B";
-  if (shared_mem_bytes > device_info.shared_memory_per_block_optin) {
+  if (shared_mem_bytes > device_info.shared_memory_per_block_optin()) {
     return ResourceExhausted("Shared memory size limit exceeded.");
   }
 
diff --git a/third_party/xla/xla/service/gpu/ir_emitter_triton.h b/third_party/xla/xla/service/gpu/ir_emitter_triton.h
index 06970d350b24f2..9edf27853e22fd 100644
--- a/third_party/xla/xla/service/gpu/ir_emitter_triton.h
+++ b/third_party/xla/xla/service/gpu/ir_emitter_triton.h
@@ -27,7 +27,6 @@ limitations under the License.
 #include "xla/autotuning.pb.h"
 #include "xla/hlo/ir/hlo_computation.h"
 #include "xla/service/gpu/gemm_rewriter_triton.h"
-#include "xla/service/gpu/gpu_device_info.h"
 #include "xla/service/gpu/hlo_traversal.h"
 #include "xla/service/gpu/launch_dimensions.h"
 #include "xla/statusor.h"
@@ -78,7 +77,8 @@ using TritonIrEmitter = std::function<Status(
 StatusOr<TritonWrapperResult> TritonWrapper(
     const TritonFusionAnalysis& analysis, absl::string_view fn_name,
     const HloComputation* hlo_computation, absl::string_view fusion_kind,
-    const se::CudaComputeCapability& cc, const GpuDeviceInfo& device_info,
+    const se::CudaComputeCapability& cc,
+    const se::DeviceDescription& device_info,
     const AutotuneResult::TritonGemmKey& config, llvm::Module* llvm_module,
     TritonIrEmitter ir_emitter, mlir::MLIRContext& mlir_context);
 
@@ -86,7 +86,8 @@ StatusOr<TritonWrapperResult> TritonWrapper(
 // use TritonWrapper instead.
 StatusOr<mlir::OwningOpRef<mlir::ModuleOp>> CreateTritonModule(
     const TritonFusionAnalysis& analysis, absl::string_view fn_name,
-    const HloComputation* hlo_computation, const GpuDeviceInfo& device_info,
+    const HloComputation* hlo_computation,
+    const se::DeviceDescription& device_info,
     const AutotuneResult::TritonGemmKey& config, TritonIrEmitter ir_emitter,
     mlir::MLIRContext& mlir_context);
 
diff --git a/third_party/xla/xla/service/gpu/ir_emitter_triton_test.cc b/third_party/xla/xla/service/gpu/ir_emitter_triton_test.cc
index 6c6daa7042d1a0..e8c5c0f16663a8 100644
--- a/third_party/xla/xla/service/gpu/ir_emitter_triton_test.cc
+++ b/third_party/xla/xla/service/gpu/ir_emitter_triton_test.cc
@@ -35,7 +35,6 @@ limitations under the License.
 #include "xla/hlo/ir/hlo_instruction.h"
 #include "xla/service/gpu/backend_configs.pb.h"
 #include "xla/service/gpu/gemm_rewriter_triton.h"
-#include "xla/service/gpu/gpu_device_info.h"
 #include "xla/service/gpu/gpu_device_info_for_tests.h"
 #include "xla/service/gpu/ir_emission_utils.h"
 #include "xla/service/gpu/tests/gpu_codegen_test.h"
@@ -320,7 +319,8 @@ ENTRY entry {
       hlo_module->entry_computation()
           ->root_instruction()
           ->fused_instructions_computation();
-  const GpuDeviceInfo dev_info = TestGpuDeviceInfo::RTXA6000DeviceInfo();
+  const se::DeviceDescription dev_info =
+      TestGpuDeviceInfo::RTXA6000DeviceInfo();
   llvm::LLVMContext llvm_ctx;
   llvm::Module llvm_module("module", llvm_ctx);
   mlir::MLIRContext mlir_context;
@@ -353,7 +353,7 @@ ENTRY entry {
                                               /*minor=*/0},
                     dev_info, config, &llvm_module, &EmitMatMul, mlir_context));
   // Use optin shared memory which is > shared_memory_per_block.
-  EXPECT_GT(result.shmem_bytes, dev_info.shared_memory_per_block);
+  EXPECT_GT(result.shmem_bytes, dev_info.shared_memory_per_block());
 }
 
 TEST_F(TritonGemmTest, WorksWhenKIsDivisibleByBlockKButNotByBlockKTimesSplitK) {
@@ -776,7 +776,8 @@ ENTRY entry {
       hlo_module->entry_computation()
           ->root_instruction()
           ->fused_instructions_computation();
-  const GpuDeviceInfo dev_info = TestGpuDeviceInfo::RTXA6000DeviceInfo();
+  const se::DeviceDescription dev_info =
+      TestGpuDeviceInfo::RTXA6000DeviceInfo();
   llvm::LLVMContext llvm_ctx;
   llvm::Module llvm_module("module", llvm_ctx);
   mlir::MLIRContext mlir_context;
@@ -1671,10 +1672,11 @@ TEST_F(CompareTest, UsingOptinSharedMemoryOnAmpereProducesSameResult) {
           se::CudaComputeCapability::AMPERE)) {
     GTEST_SKIP() << "This test is for Ampere+ GPUs.";
   }
-  const GpuDeviceInfo dev_info =
-      GetGpuDeviceInfo(backend().default_stream_executor());
+  const se::DeviceDescription dev_info =
+      backend().default_stream_executor()->GetDeviceDescription();
   constexpr int kBytesOfSharedMemoryTested = 64 * 1024;
-  EXPECT_GE(dev_info.shared_memory_per_block_optin, kBytesOfSharedMemoryTested);
+  EXPECT_GE(dev_info.shared_memory_per_block_optin(),
+            kBytesOfSharedMemoryTested);
 
   const std::string kHloTextOptinShmem = R"(
 HloModule t
@@ -1720,7 +1722,7 @@ ENTRY e {
   // has the optin one should be able to execute the test.
   EXPECT_EQ(result.shmem_bytes, kBytesOfSharedMemoryTested);
   // Make sure the written config indeed has to use optin shared memory.
-  EXPECT_GT(result.shmem_bytes, dev_info.shared_memory_per_block);
+  EXPECT_GT(result.shmem_bytes, dev_info.shared_memory_per_block());
 
   const std::string kHloTextLowShmem = R"(
 HloModule t
diff --git a/third_party/xla/xla/service/gpu/ir_emitter_unnested.cc b/third_party/xla/xla/service/gpu/ir_emitter_unnested.cc
index 02c9eaa5ef5599..ff9c31d0ab45ce 100644
--- a/third_party/xla/xla/service/gpu/ir_emitter_unnested.cc
+++ b/third_party/xla/xla/service/gpu/ir_emitter_unnested.cc
@@ -97,7 +97,6 @@ limitations under the License.
 #include "xla/service/gpu/gemm_thunk.h"
 #include "xla/service/gpu/gpu_asm_opts_util.h"
 #include "xla/service/gpu/gpu_conv_runner.h"
-#include "xla/service/gpu/gpu_device_info.h"
 #include "xla/service/gpu/gpu_executable.h"
 #include "xla/service/gpu/gpu_fused_mha_runner.h"
 #include "xla/service/gpu/hlo_fusion_analysis.h"
@@ -131,6 +130,7 @@ limitations under the License.
 #include "xla/shape_util.h"
 #include "xla/status.h"
 #include "xla/status_macros.h"
+#include "xla/stream_executor/device_description.h"
 #include "xla/translate/hlo_to_mhlo/hlo_utils.h"
 #include "xla/translate/mhlo_to_hlo/attribute_exporter.h"
 #include "xla/translate/mhlo_to_hlo/location_exporter.h"
@@ -1777,7 +1777,8 @@ Status IrEmitterUnnested::EmitFusion(
   auto* fused_computation = fusion->fused_instructions_computation();
 
   // Create HloFusionAnalysis instance.
-  GpuDeviceInfo device_info = ir_emitter_context_->gpu_device_info();
+  const se::DeviceDescription& device_info =
+      ir_emitter_context_->gpu_device_info();
   TF_ASSIGN_OR_RETURN(auto fusion_analysis,
                       HloFusionAnalysis::Create(fusion, &device_info));
 
@@ -2497,17 +2498,17 @@ Status IrEmitterUnnested::EmitSort(
   }
   bool no_tiling =
       kThreadsPerBlock >
-          ir_emitter_context_->gpu_device_info().threads_per_block_limit ||
+          ir_emitter_context_->gpu_device_info().threads_per_block_limit() ||
       total_shared_memory_needed >
-          ir_emitter_context_->gpu_device_info().shared_memory_per_block;
+          ir_emitter_context_->gpu_device_info().shared_memory_per_block();
   VLOG(2) << absl::StreamFormat(
       "%s %s use tiling. No tiling if any of the following is true: "
       "kThreadsPerBlock=%d > threads_per_block_limit=%d, "
       "total_shared_memory_needed=%d > shared_memory_per_block=%d",
       op_name, (no_tiling ? "won't" : "will"), kThreadsPerBlock,
-      ir_emitter_context_->gpu_device_info().threads_per_block_limit,
+      ir_emitter_context_->gpu_device_info().threads_per_block_limit(),
       total_shared_memory_needed,
-      ir_emitter_context_->gpu_device_info().shared_memory_per_block);
+      ir_emitter_context_->gpu_device_info().shared_memory_per_block());
 
   uint64_t num_blocks = CeilOfRatio(num_iterations, kThreadsPerBlock);
   LaunchDimensions tiled_launch_dimensions(num_blocks, kThreadsPerBlock);
diff --git a/third_party/xla/xla/service/gpu/launch_dimensions.cc b/third_party/xla/xla/service/gpu/launch_dimensions.cc
index df27155fbe1002..47762ba19e1791 100644
--- a/third_party/xla/xla/service/gpu/launch_dimensions.cc
+++ b/third_party/xla/xla/service/gpu/launch_dimensions.cc
@@ -21,10 +21,10 @@ limitations under the License.
 
 #include "absl/log/check.h"
 #include "absl/log/log.h"
-#include "xla/service/gpu/gpu_device_info.h"
 #include "xla/shape.h"
 #include "xla/shape_util.h"
 #include "xla/statusor.h"
+#include "xla/stream_executor/device_description.h"
 #include "xla/util.h"
 
 namespace xla {
@@ -40,8 +40,9 @@ std::ostream& operator<<(std::ostream& out,
   return out;
 }
 
-static int64_t ThreadsPerBlockLimit(const GpuDeviceInfo& gpu_device_info) {
-  int64_t threads_per_block = gpu_device_info.threads_per_block_limit;
+static int64_t ThreadsPerBlockLimit(
+    const se::DeviceDescription& gpu_device_info) {
+  int64_t threads_per_block = gpu_device_info.threads_per_block_limit();
   if (threads_per_block <= 0) {
     static std::atomic<int64_t> log_count{0};
     if (log_count.fetch_add(1) < 8) {
@@ -50,7 +51,7 @@ static int64_t ThreadsPerBlockLimit(const GpuDeviceInfo& gpu_device_info) {
                       "StreamExecutor's PopulateDeviceDescription should be "
                       "updated for this device.";
     }
-    threads_per_block = gpu_device_info.threads_per_warp;
+    threads_per_block = gpu_device_info.threads_per_warp();
     if (threads_per_block == 0) {
       // Fall back to *something* if we can't even get num threads per warp.
       threads_per_block = 32;
@@ -59,9 +60,9 @@ static int64_t ThreadsPerBlockLimit(const GpuDeviceInfo& gpu_device_info) {
   return threads_per_block;
 }
 
-int64_t ThreadsPerBlockRowVectorized(const Shape& shape,
-                                     const GpuDeviceInfo& gpu_device_info,
-                                     LaunchDimensionsConfig dim_config) {
+int64_t ThreadsPerBlockRowVectorized(
+    const Shape& shape, const se::DeviceDescription& gpu_device_info,
+    LaunchDimensionsConfig dim_config) {
   if (shape.dimensions().empty()) {
     return -1;
   }
@@ -75,7 +76,7 @@ int64_t ThreadsPerBlockRowVectorized(const Shape& shape,
       (shape.dimensions().back() % 256) != 0 &&
       // We do not support row that do not fit in one block.
       threads_per_block_row_vectorized <=
-          gpu_device_info.threads_per_block_limit) {
+          gpu_device_info.threads_per_block_limit()) {
     return threads_per_block_row_vectorized;
   }
   return -1;
@@ -90,13 +91,13 @@ struct BlockSizes {
 };
 
 BlockSizes GetBlockSizes(LaunchDimensionsConfig dim_config,
-                         const GpuDeviceInfo& gpu_device_info,
+                         const se::DeviceDescription& gpu_device_info,
                          const Shape& shape, int64_t num_elements) {
   if (!dim_config.row_vectorized && !dim_config.few_waves) {
     BlockSizes result;
     const int kWarpSchedulers = 4;
     result.threads_per_block_x = std::min<int64_t>(
-        gpu_device_info.threads_per_warp * kWarpSchedulers, num_elements);
+        gpu_device_info.threads_per_warp() * kWarpSchedulers, num_elements);
     result.threads_per_block_y = 1;
     result.block_count = CeilOfRatio(
         num_elements, result.threads_per_block_x * result.threads_per_block_y);
@@ -138,8 +139,8 @@ BlockSizes GetBlockSizes(LaunchDimensionsConfig dim_config,
       // looking at the arithmetic intensity of the kernels can specialize the
       // multiple per kernel.
       int64_t max_block_count =
-          32 * gpu_device_info.core_count *
-          (gpu_device_info.threads_per_core_limit /
+          32 * gpu_device_info.core_count() *
+          (gpu_device_info.threads_per_core_limit() /
            (result.threads_per_block_x * result.threads_per_block_y));
       int64_t capped_block_count = result.block_count;
       while (capped_block_count > max_block_count) {
@@ -154,8 +155,8 @@ BlockSizes GetBlockSizes(LaunchDimensionsConfig dim_config,
       int64_t capped_threads_per_block_x =
           std::min<int64_t>(result.threads_per_block_x, 128);
       int64_t capped_block_count =
-          gpu_device_info.core_count *
-          (gpu_device_info.threads_per_core_limit /
+          gpu_device_info.core_count() *
+          (gpu_device_info.threads_per_core_limit() /
            (capped_threads_per_block_x * result.threads_per_block_y));
       if (capped_block_count < result.block_count) {
         result.threads_per_block_x = capped_threads_per_block_x;
@@ -173,7 +174,7 @@ BlockSizes GetBlockSizes(LaunchDimensionsConfig dim_config,
 }  // namespace
 
 StatusOr<LaunchDimensions> CalculateLaunchDimensions(
-    const Shape& shape, const GpuDeviceInfo& gpu_device_info,
+    const Shape& shape, const se::DeviceDescription& gpu_device_info,
     LaunchDimensionsConfig dim_config) {
   int64_t num_elements = ShapeUtil::ElementsIn(shape);
   if (num_elements <= 1) {
@@ -183,12 +184,12 @@ StatusOr<LaunchDimensions> CalculateLaunchDimensions(
   num_elements = num_elements / dim_config.unroll_factor;
   BlockSizes sizes =
       GetBlockSizes(dim_config, gpu_device_info, shape, num_elements);
-  if (gpu_device_info.block_dim_limit_x > 0 &&
-      sizes.block_count >= gpu_device_info.block_dim_limit_x) {
-    return tsl::errors::Unimplemented("Kernel launch needs more blocks (",
-                                      sizes.block_count,
-                                      ") than allowed by hardware (",
-                                      gpu_device_info.block_dim_limit_x, ").");
+  if (gpu_device_info.block_dim_limit().x > 0 &&
+      sizes.block_count >= gpu_device_info.block_dim_limit().x) {
+    return absl::UnimplementedError(
+        absl::StrCat("Kernel launch needs more blocks (", sizes.block_count,
+                     ") than allowed by hardware (",
+                     gpu_device_info.block_dim_limit().x, ")."));
   }
 
   return LaunchDimensions(
diff --git a/third_party/xla/xla/service/gpu/launch_dimensions.h b/third_party/xla/xla/service/gpu/launch_dimensions.h
index 92e05346ec41f3..5bde8059e3bb5e 100644
--- a/third_party/xla/xla/service/gpu/launch_dimensions.h
+++ b/third_party/xla/xla/service/gpu/launch_dimensions.h
@@ -19,8 +19,8 @@ limitations under the License.
 #include <ostream>
 #include <string>
 
-#include "xla/service/gpu/gpu_device_info.h"
 #include "xla/shape.h"
+#include "xla/stream_executor/device_description.h"
 
 namespace xla {
 namespace gpu {
@@ -119,13 +119,13 @@ struct LaunchDimensionsConfig {
 
 // Returns -1 if the shape doesn't allow the row vectorization code path.
 // If supported, return the number of threads to use in that case.
-int64_t ThreadsPerBlockRowVectorized(const Shape& shape,
-                                     const GpuDeviceInfo& gpu_device_info,
-                                     LaunchDimensionsConfig dim_config);
+int64_t ThreadsPerBlockRowVectorized(
+    const Shape& shape, const se::DeviceDescription& gpu_device_info,
+    LaunchDimensionsConfig dim_config);
 
 // Calculates the launch dimensions used to invoke `hlo`.
 StatusOr<LaunchDimensions> CalculateLaunchDimensions(
-    const Shape& shape, const GpuDeviceInfo& gpu_device_info,
+    const Shape& shape, const se::DeviceDescription& gpu_device_info,
     LaunchDimensionsConfig dim_config = {});
 
 }  // namespace gpu
diff --git a/third_party/xla/xla/service/gpu/multi_output_fusion.cc b/third_party/xla/xla/service/gpu/multi_output_fusion.cc
index 384ad39c39e82d..59218d50f38511 100644
--- a/third_party/xla/xla/service/gpu/multi_output_fusion.cc
+++ b/third_party/xla/xla/service/gpu/multi_output_fusion.cc
@@ -30,13 +30,13 @@ limitations under the License.
 #include "xla/hlo/ir/hlo_instructions.h"
 #include "xla/hlo/ir/hlo_opcode.h"
 #include "xla/hlo/ir/hlo_reachability.h"
-#include "xla/service/gpu/gpu_device_info.h"
 #include "xla/service/gpu/gpu_fusible.h"
 #include "xla/service/gpu/gpu_hlo_cost_analysis.h"
 #include "xla/service/gpu/gpu_performance_model.h"
 #include "xla/service/hlo_graph_dumper.h"
 #include "xla/service/instruction_fusion.h"
 #include "xla/shape_util.h"
+#include "xla/stream_executor/device_description.h"
 
 namespace xla {
 namespace gpu {
@@ -103,7 +103,7 @@ FusionDecision ParameterSlicesAreNonOverlapping(const HloInstruction& instr1,
 
 FusionDecision LegalToFuse(const HloInstruction& instr1,
                            const HloInstruction& instr2,
-                           const GpuDeviceInfo& device_info,
+                           const se::DeviceDescription& device_info,
                            FusionInfoCache* fusion_info_cache) {
   CHECK(instr1.opcode() == HloOpcode::kFusion);
 
diff --git a/third_party/xla/xla/service/gpu/multi_output_fusion.h b/third_party/xla/xla/service/gpu/multi_output_fusion.h
index 37fb5a28654e34..103694857407ac 100644
--- a/third_party/xla/xla/service/gpu/multi_output_fusion.h
+++ b/third_party/xla/xla/service/gpu/multi_output_fusion.h
@@ -24,11 +24,11 @@ limitations under the License.
 #include "absl/strings/string_view.h"
 #include "xla/hlo/ir/hlo_module.h"
 #include "xla/hlo/ir/hlo_reachability.h"
-#include "xla/service/gpu/gpu_device_info.h"
 #include "xla/service/gpu/gpu_fusible.h"
 #include "xla/service/gpu/gpu_hlo_cost_analysis.h"
 #include "xla/service/hlo_pass_interface.h"
 #include "xla/statusor.h"
+#include "xla/stream_executor/device_description.h"
 
 namespace xla {
 namespace gpu {
@@ -94,7 +94,7 @@ namespace gpu {
 class GpuMultiOutputFusion : public HloModulePass {
  public:
   explicit GpuMultiOutputFusion(
-      const GpuDeviceInfo& device_info,
+      const se::DeviceDescription& device_info,
       HloCostAnalysis::ShapeSizeFunction shape_size_function)
       : device_info_(device_info), shape_size_function_(shape_size_function) {}
 
@@ -123,7 +123,7 @@ class GpuMultiOutputFusion : public HloModulePass {
   // The reachability map of current computation.
   std::unique_ptr<HloReachabilityMap> reachability_;
 
-  const GpuDeviceInfo device_info_;
+  se::DeviceDescription device_info_;
   HloCostAnalysis::ShapeSizeFunction shape_size_function_;
 };
 
diff --git a/third_party/xla/xla/service/gpu/nvptx_compiler.cc b/third_party/xla/xla/service/gpu/nvptx_compiler.cc
index 5ea8134ff5dc20..7ebefc88acd332 100644
--- a/third_party/xla/xla/service/gpu/nvptx_compiler.cc
+++ b/third_party/xla/xla/service/gpu/nvptx_compiler.cc
@@ -204,7 +204,7 @@ Status NVPTXCompiler::OptimizeHloPostLayoutAssignment(
   // This needs to run before GemmRewriter, which is part of
   // OptimizeHloPostLayoutAssignment().
   auto cuda_compute_capability = std::get<se::CudaComputeCapability>(
-      gpu_target_config.gpu_device_info.compute_capability);
+      gpu_target_config.gpu_device_info.gpu_compute_capability());
 
   if (hlo_module->config().debug_options().xla_gpu_enable_cudnn_fmha()) {
     HloPassPipeline mha_fusion_pipeline(
diff --git a/third_party/xla/xla/service/gpu/priority_fusion.cc b/third_party/xla/xla/service/gpu/priority_fusion.cc
index e236d79906e5ba..77af51e447fe2f 100644
--- a/third_party/xla/xla/service/gpu/priority_fusion.cc
+++ b/third_party/xla/xla/service/gpu/priority_fusion.cc
@@ -37,12 +37,12 @@ limitations under the License.
 #include "xla/hlo/ir/hlo_opcode.h"
 #include "xla/service/fusion_node_indexing_evaluation.h"
 #include "xla/service/fusion_queue.h"
-#include "xla/service/gpu/gpu_device_info.h"
 #include "xla/service/gpu/gpu_fusible.h"
 #include "xla/service/gpu/gpu_hlo_cost_analysis.h"
 #include "xla/service/gpu/gpu_performance_model.h"
 #include "xla/service/instruction_fusion.h"
 #include "xla/shape.h"
+#include "xla/stream_executor/device_description.h"
 #include "xla/xla_data.pb.h"
 #include "tsl/platform/logging.h"
 #include "tsl/platform/status.h"
@@ -72,7 +72,7 @@ class GpuPriorityFusionQueue : public FusionQueue {
   GpuPriorityFusionQueue(
       HloComputation* computation,
       const GpuHloCostAnalysis::Options& cost_analysis_options,
-      const GpuDeviceInfo* device_info, const CanFuseCallback& can_fuse)
+      const se::DeviceDescription* device_info, const CanFuseCallback& can_fuse)
       : computation_(computation),
         cost_analysis_(cost_analysis_options, device_info),
         can_fuse_(can_fuse) {
diff --git a/third_party/xla/xla/service/gpu/priority_fusion.h b/third_party/xla/xla/service/gpu/priority_fusion.h
index 6d3ce74133f9a8..d8f07acf8f89f0 100644
--- a/third_party/xla/xla/service/gpu/priority_fusion.h
+++ b/third_party/xla/xla/service/gpu/priority_fusion.h
@@ -39,7 +39,7 @@ namespace gpu {
 class GpuPriorityFusion : public InstructionFusion {
  public:
   explicit GpuPriorityFusion(
-      const GpuDeviceInfo& d,
+      const se::DeviceDescription& d,
       const GpuHloCostAnalysis::Options& cost_analysis_options)
       : InstructionFusion(GpuPriorityFusion::IsExpensive),
         device_info_(d),
@@ -65,7 +65,7 @@ class GpuPriorityFusion : public InstructionFusion {
   HloInstruction* FuseInstruction(HloInstruction* fusion_instruction,
                                   HloInstruction* producer) override;
 
-  const GpuDeviceInfo device_info_;
+  se::DeviceDescription device_info_;
 
   // Cost model options that defines priorities in the queue.
   GpuHloCostAnalysis::Options cost_analysis_options_;
diff --git a/third_party/xla/xla/service/gpu/tests/BUILD b/third_party/xla/xla/service/gpu/tests/BUILD
index cd8e689d5b2428..4691c4a7270c12 100644
--- a/third_party/xla/xla/service/gpu/tests/BUILD
+++ b/third_party/xla/xla/service/gpu/tests/BUILD
@@ -746,7 +746,6 @@ xla_cc_binary(
         "//xla/service/gpu/llvm_gpu_backend",
         "//xla/stream_executor",
         "//xla/stream_executor:device_description",
-        "//xla/stream_executor:device_description_proto_cc_impl",
         "//xla/stream_executor:dnn",
         "//xla/stream_executor/host:host_platform",
         "//xla/tests:test_utils",
diff --git a/third_party/xla/xla/service/gpu/tests/gpu_fusion_pipeline_test.cc b/third_party/xla/xla/service/gpu/tests/gpu_fusion_pipeline_test.cc
index 263e55a5969ba1..2c96ac7e316783 100644
--- a/third_party/xla/xla/service/gpu/tests/gpu_fusion_pipeline_test.cc
+++ b/third_party/xla/xla/service/gpu/tests/gpu_fusion_pipeline_test.cc
@@ -41,7 +41,8 @@ class GpuFusionPipelineTest : public GpuCodegenTest {
   void CheckGpuFusionPipeline(absl::string_view hlo,
                               std::optional<absl::string_view> expected) {
     HloPassPipeline pipeline("gpu-fusion");
-    const GpuDeviceInfo device_info = TestGpuDeviceInfo::RTXA6000DeviceInfo();
+    const se::DeviceDescription device_info =
+        TestGpuDeviceInfo::RTXA6000DeviceInfo();
     pipeline.AddPass<GpuInstructionFusion>(/*may_duplicate=*/false,
                                            device_info);
     pipeline.AddPass<GpuInstructionFusion>(/*may_duplicate=*/true, device_info);
diff --git a/third_party/xla/xla/service/gpu/tests/hlo_to_llvm_ir.cc b/third_party/xla/xla/service/gpu/tests/hlo_to_llvm_ir.cc
index e1624902375d42..7aaefd9cc020f7 100644
--- a/third_party/xla/xla/service/gpu/tests/hlo_to_llvm_ir.cc
+++ b/third_party/xla/xla/service/gpu/tests/hlo_to_llvm_ir.cc
@@ -64,21 +64,20 @@ xla::Status CompileAndPrintLlvmIr(const std::string& hlo_text,
 #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
   llvm::LLVMContext llvm_context;
 
-  tensorflow::se::CudaComputeCapability cuda_compute_capability;
+  stream_executor::CudaComputeCapability cuda_compute_capability;
   cuda_compute_capability.major = sm / 10;
   cuda_compute_capability.minor = sm % 10;
 
 #if GOOGLE_CUDA
-  xla::gpu::GpuDeviceInfo gpu_device_info =
-      xla::gpu::TestGpuDeviceInfo::RTXA6000DeviceInfo();
-  gpu_device_info.compute_capability = cuda_compute_capability;
+  stream_executor::DeviceDescription gpu_device_info =
+      xla::gpu::TestGpuDeviceInfo::RTXA6000DeviceInfo(cuda_compute_capability);
   std::string target_triple = xla::gpu::nvptx::TargetTriple();
   std::string data_layout = xla::gpu::nvptx::DataLayout();
   std::string platform_name = "CUDA";
   stream_executor::Platform::Id platform_id =
       stream_executor::cuda::kCudaPlatformId;
 #elif TENSORFLOW_USE_ROCM
-  xla::gpu::GpuDeviceInfo gpu_device_info =
+  stream_executor::DeviceDescription gpu_device_info =
       xla::gpu::TestGpuDeviceInfo::AMDMI210DeviceInfo();
   std::string target_triple = xla::gpu::amdgpu::TargetTriple();
   std::string data_layout = xla::gpu::amdgpu::DataLayout();
diff --git a/third_party/xla/xla/service/gpu/triton_autotuner.cc b/third_party/xla/xla/service/gpu/triton_autotuner.cc
index 11f03d7136c001..1679a53b8347e8 100644
--- a/third_party/xla/xla/service/gpu/triton_autotuner.cc
+++ b/third_party/xla/xla/service/gpu/triton_autotuner.cc
@@ -56,7 +56,6 @@ limitations under the License.
 #include "xla/service/gpu/buffer_comparator.h"
 #include "xla/service/gpu/gemm_rewriter.h"
 #include "xla/service/gpu/gemm_rewriter_triton.h"
-#include "xla/service/gpu/gpu_device_info.h"
 #include "xla/service/gpu/gpu_float_support.h"
 #include "xla/service/gpu/gpu_fusible.h"
 #include "xla/service/gpu/instruction_fusion.h"
@@ -339,8 +338,8 @@ int GetLogEveryN() { return VLOG_IS_ON(3) ? 100 : 1000; }
 
 StatusOr<std::unique_ptr<HloModule>> TritonGemmAutotuneExtractor(
     const AutotuneResult::TritonGemmKey& key,
-    const GpuDeviceInfo& gpu_device_info, const HloFusionInstruction* fusion,
-    DebugOptions debug_opts) {
+    const se::DeviceDescription& gpu_device_info,
+    const HloFusionInstruction* fusion, DebugOptions debug_opts) {
   std::unique_ptr<HloModule> new_module =
       AutotunerUtil::ExtractInstructionIntoNewModule(*fusion);
   // Reduce memory usage during compilation by disabling GPU runtime.
@@ -390,8 +389,8 @@ StatusOr<std::unique_ptr<HloModule>> CublasGemmAutotuneExtractor(
   new_module->config().set_debug_options(debug_opts);
 
   GemmRewriter rewriter(config.GetCudaComputeCapability());
-  GpuInstructionFusion fusion_pass(/*may_duplicate=*/false,
-                                   GetGpuDeviceInfo(config.GetExecutor()));
+  GpuInstructionFusion fusion_pass(
+      /*may_duplicate=*/false, config.GetExecutor()->GetDeviceDescription());
   TF_RETURN_IF_ERROR(rewriter.Run(new_module.get()).status());
   TF_RETURN_IF_ERROR(fusion_pass.Run(new_module.get()).status());
   // TODO(tdanyluk): Consider running GemmAlgorithmPicker here for better cuBLAS
@@ -416,7 +415,8 @@ CompileMany(const AutotuneConfig& config, AutotunerCompileUtil& util,
     return executable_sets;
   }
 
-  GpuDeviceInfo gpu_device_info = GetGpuDeviceInfo(config.GetExecutor());
+  const se::DeviceDescription& gpu_device_info =
+      config.GetExecutor()->GetDeviceDescription();
 
   const int log_every_n = GetLogEveryN();
   int64_t config_count = 0;
@@ -726,8 +726,8 @@ Status DumpAutotunedFusions(const AutotuneConfig& config,
                       util.ExtractModule([&](const DebugOptions& debug_opts) {
                         return TritonGemmAutotuneExtractor(
                             result.triton(),
-                            GetGpuDeviceInfo(config.GetExecutor()), fusion,
-                            debug_opts);
+                            config.GetExecutor()->GetDeviceDescription(),
+                            fusion, debug_opts);
                       }));
   module->set_name(std::string(fusion->name()));
   // Using the original module for its debug info and name in the first
diff --git a/third_party/xla/xla/stream_executor/device_description.cc b/third_party/xla/xla/stream_executor/device_description.cc
index 366c91d57d6cff..08a1d9adbf2e85 100644
--- a/third_party/xla/xla/stream_executor/device_description.cc
+++ b/third_party/xla/xla/stream_executor/device_description.cc
@@ -53,14 +53,67 @@ DeviceDescription::DeviceDescription()
       core_count_(-1),
       ecc_enabled_(false) {}
 
-namespace internal {
-
-DeviceDescriptionBuilder::DeviceDescriptionBuilder()
-    : device_description_(new DeviceDescription) {}
+// Rebuilds a description from its serialized GPU form. Delegating to the
+// default constructor first gives every member the proto does not carry
+// (e.g. ecc_enabled_) a defined value instead of leaving it indeterminate;
+// the proto-backed fields are then overwritten below.
+DeviceDescription::DeviceDescription(const GpuDeviceInfoProto &proto)
+    : DeviceDescription() {
+  // A proto without a CUDA capability is read as describing a ROCm device.
+  if (proto.has_cuda_compute_capability()) {
+    gpu_compute_capability_ =
+        stream_executor::CudaComputeCapability(proto.cuda_compute_capability());
+  } else {
+    gpu_compute_capability_ =
+        stream_executor::RocmComputeCapability(proto.rocm_compute_capability());
+  }
+  threads_per_block_limit_ = proto.threads_per_block_limit();
+  threads_per_warp_ = proto.threads_per_warp();
+  shared_memory_per_block_ = proto.shared_memory_per_block();
+  shared_memory_per_block_optin_ = proto.shared_memory_per_block_optin();
+  shared_memory_per_core_ = proto.shared_memory_per_core();
+  threads_per_core_limit_ = proto.threads_per_core_limit();
+  core_count_ = proto.core_count();
+  fpus_per_core_ = proto.fpus_per_core();
+  block_dim_limit_ =
+      BlockDim(proto.block_dim_limit_x(), proto.block_dim_limit_y(),
+               proto.block_dim_limit_z());
+  memory_bandwidth_ = proto.memory_bandwidth();
+  l2_cache_size_ = proto.l2_cache_size();
+  clock_rate_ghz_ = proto.clock_rate_ghz();
+  device_memory_size_ = proto.device_memory_size();
+}
 
-}  // namespace internal
+// Serializes the GPU-relevant subset of this description. Writes exactly the
+// fields that the GpuDeviceInfoProto constructor reads back.
+GpuDeviceInfoProto DeviceDescription::ToGpuProto() const {
+  stream_executor::GpuDeviceInfoProto proto;
+  if (auto *ptr = std::get_if<stream_executor::CudaComputeCapability>(
+          &gpu_compute_capability_))
+    *proto.mutable_cuda_compute_capability() = ptr->ToProto();
+  if (auto *ptr = std::get_if<stream_executor::RocmComputeCapability>(
+          &gpu_compute_capability_))
+    *proto.mutable_rocm_compute_capability() = ptr->ToProto();
+
+  proto.set_threads_per_block_limit(threads_per_block_limit_);
+  proto.set_threads_per_warp(threads_per_warp_);
+  proto.set_shared_memory_per_block(shared_memory_per_block_);
+  proto.set_shared_memory_per_block_optin(shared_memory_per_block_optin_);
+  proto.set_shared_memory_per_core(shared_memory_per_core_);
+  proto.set_threads_per_core_limit(threads_per_core_limit_);
+  proto.set_core_count(core_count_);
+  proto.set_fpus_per_core(fpus_per_core_);
+  proto.set_block_dim_limit_x(block_dim_limit().x);
+  proto.set_block_dim_limit_y(block_dim_limit().y);
+  proto.set_block_dim_limit_z(block_dim_limit().z);
+  proto.set_memory_bandwidth(memory_bandwidth_);
+  proto.set_l2_cache_size(l2_cache_size_);
+  proto.set_clock_rate_ghz(clock_rate_ghz_);
+  proto.set_device_memory_size(device_memory_size_);
+  return proto;
+}
 
-GpuComputeCapability DeviceDescription::gpu_compute_capability() const {
+const GpuComputeCapability &DeviceDescription::gpu_compute_capability() const {
   return gpu_compute_capability_;
 }
 
diff --git a/third_party/xla/xla/stream_executor/device_description.h b/third_party/xla/xla/stream_executor/device_description.h
index b3d2af5b123b66..0be3f4f0d8330e 100644
--- a/third_party/xla/xla/stream_executor/device_description.h
+++ b/third_party/xla/xla/stream_executor/device_description.h
@@ -323,7 +323,7 @@ class DeviceDescription {
   // be "gfx000" (which is an invalid gfx arch).
   RocmComputeCapability rocm_compute_capability() const;
 
-  GpuComputeCapability gpu_compute_capability() const;
+  const GpuComputeCapability &gpu_compute_capability() const;
 
   // Returns the maximum amount of shared memory present on a single core
   // (i.e. Streaming Multiprocessor on NVIDIA GPUs; Compute Unit for OpenCL
@@ -341,6 +341,9 @@ class DeviceDescription {
     return shared_memory_per_block_optin_;
   }
 
+  GpuDeviceInfoProto ToGpuProto() const;
+  explicit DeviceDescription(const GpuDeviceInfoProto &proto);
+
   // For string values that are not available via the underlying platform, this
   // value will be provided.
   static const char *kUndefinedString;
@@ -390,8 +393,6 @@ class DeviceDescription {
   int core_count_;
   int fpus_per_core_;
   bool ecc_enabled_;
-
-  SE_DISALLOW_COPY_AND_ASSIGN(DeviceDescription);
 };
 
 namespace internal {
@@ -400,102 +401,114 @@ namespace internal {
 // number of fields that would be easily confused in constructor form.
 class DeviceDescriptionBuilder {
  public:
-  DeviceDescriptionBuilder();
+  DeviceDescriptionBuilder() = default;
 
   // For descriptions of the following fields, see comments on the corresponding
   // DeviceDescription::* accessors above.
 
+  void set_gpu_compute_capability(GpuComputeCapability c) {
+    device_description_.gpu_compute_capability_ = c;
+  }
+
+  void set_block_dim_limit_x(int64_t limit) {
+    device_description_.block_dim_limit_.x = limit;
+  }
+
+  void set_block_dim_limit_y(int64_t limit) {
+    device_description_.block_dim_limit_.y = limit;
+  }
+
+  void set_block_dim_limit_z(int64_t limit) {
+    device_description_.block_dim_limit_.z = limit;
+  }
+
   void set_device_vendor(const std::string &value) {
-    device_description_->device_vendor_ = value;
+    device_description_.device_vendor_ = value;
   }
   void set_platform_version(const std::string &value) {
-    device_description_->platform_version_ = value;
+    device_description_.platform_version_ = value;
   }
   void set_driver_version(const std::string &value) {
-    device_description_->driver_version_ = value;
+    device_description_.driver_version_ = value;
   }
   void set_runtime_version(const std::string &value) {
-    device_description_->runtime_version_ = value;
+    device_description_.runtime_version_ = value;
   }
   void set_pci_bus_id(const std::string &value) {
-    device_description_->pci_bus_id_ = value;
-  }
-  void set_name(const std::string &value) {
-    device_description_->name_ = value;
+    device_description_.pci_bus_id_ = value;
   }
+  void set_name(const std::string &value) { device_description_.name_ = value; }
   void set_model_str(const std::string &value) {
-    device_description_->model_str_ = value;
+    device_description_.model_str_ = value;
   }
 
   void set_thread_dim_limit(const ThreadDim &value) {
-    device_description_->thread_dim_limit_ = value;
+    device_description_.thread_dim_limit_ = value;
   }
   void set_block_dim_limit(const BlockDim &value) {
-    device_description_->block_dim_limit_ = value;
+    device_description_.block_dim_limit_ = value;
   }
 
   void set_threads_per_core_limit(int64_t value) {
-    device_description_->threads_per_core_limit_ = value;
+    device_description_.threads_per_core_limit_ = value;
   }
   void set_threads_per_block_limit(int64_t value) {
-    device_description_->threads_per_block_limit_ = value;
+    device_description_.threads_per_block_limit_ = value;
   }
   void set_threads_per_warp(int64_t value) {
-    device_description_->threads_per_warp_ = value;
+    device_description_.threads_per_warp_ = value;
   }
 
   void set_registers_per_core_limit(int64_t value) {
-    device_description_->registers_per_core_limit_ = value;
+    device_description_.registers_per_core_limit_ = value;
   }
   void set_registers_per_block_limit(int64_t value) {
-    device_description_->registers_per_block_limit_ = value;
+    device_description_.registers_per_block_limit_ = value;
   }
 
   void set_device_address_bits(int64_t value) {
-    device_description_->device_address_bits_ = value;
+    device_description_.device_address_bits_ = value;
   }
   void set_device_memory_size(int64_t value) {
-    device_description_->device_memory_size_ = value;
+    device_description_.device_memory_size_ = value;
   }
   void set_l2_cache_size(int64_t value) {
-    device_description_->l2_cache_size_ = value;
+    device_description_.l2_cache_size_ = value;
   }
   void set_memory_bandwidth(int64_t value) {
-    device_description_->memory_bandwidth_ = value;
+    device_description_.memory_bandwidth_ = value;
   }
 
   void set_shared_memory_per_core(int64_t value) {
-    device_description_->shared_memory_per_core_ = value;
+    device_description_.shared_memory_per_core_ = value;
   }
   void set_shared_memory_per_block(int64_t value) {
-    device_description_->shared_memory_per_block_ = value;
+    device_description_.shared_memory_per_block_ = value;
   }
   void set_shared_memory_per_block_optin(int64_t value) {
-    device_description_->shared_memory_per_block_optin_ = value;
+    device_description_.shared_memory_per_block_optin_ = value;
   }
 
   void set_clock_rate_ghz(float value) {
-    device_description_->clock_rate_ghz_ = value;
+    device_description_.clock_rate_ghz_ = value;
   }
 
   void set_cuda_compute_capability(int major, int minor) {
-    device_description_->gpu_compute_capability_ =
+    device_description_.gpu_compute_capability_ =
         CudaComputeCapability{major, minor};
   }
 
   void set_rocm_compute_capability(std::string gcn_arch_name) {
-    device_description_->gpu_compute_capability_ =
+    device_description_.gpu_compute_capability_ =
         RocmComputeCapability(gcn_arch_name);
   }
 
-  void set_numa_node(int value) { device_description_->numa_node_ = value; }
-  void set_core_count(int value) { device_description_->core_count_ = value; }
+  void set_numa_node(int value) { device_description_.numa_node_ = value; }
+  void set_core_count(int value) { device_description_.core_count_ = value; }
   void set_fpus_per_core(int value) {
-    device_description_->fpus_per_core_ = value;
-  }
-  void set_ecc_enabled(bool value) {
-    device_description_->ecc_enabled_ = value;
+    device_description_.fpus_per_core_ = value;
   }
+  void set_ecc_enabled(bool value) { device_description_.ecc_enabled_ = value; }
 
   // Returns a built DeviceDescription with ownership transferred to the
   // caller. There are currently no restrictions on which fields must be set in
@@ -503,11 +516,13 @@ class DeviceDescriptionBuilder {
   //
   // Once the description is built, this builder object should be discarded.
   std::unique_ptr<DeviceDescription> Build() {
-    return std::move(device_description_);
+    return std::make_unique<DeviceDescription>(device_description_);
   }
 
+  DeviceDescription BuildObject() { return device_description_; }
+
  private:
-  std::unique_ptr<DeviceDescription> device_description_;
+  DeviceDescription device_description_;
 
   SE_DISALLOW_COPY_AND_ASSIGN(DeviceDescriptionBuilder);
 };
diff --git a/third_party/xla/xla/xla.bzl b/third_party/xla/xla/xla.bzl
index ee021b9b78c951..deac05f9e5077b 100644
--- a/third_party/xla/xla/xla.bzl
+++ b/third_party/xla/xla/xla.bzl
@@ -62,6 +62,7 @@ def xla_cc_binary(deps = None, copts = tsl_copts(), **kwargs):
         "//xla/service/memory_space_assignment:memory_space_assignment_proto_cc_impl",
         "//xla/service/gpu:backend_configs_cc_impl",
         "//xla/service/gpu:hlo_op_profile_proto_cc_impl",
+        "//xla/stream_executor:device_description_proto_cc_impl",
         "//xla/stream_executor:dnn_proto_cc_impl",
         "//xla/stream_executor:stream_executor_impl",
         "@local_tsl//tsl/platform:env_impl",
@@ -95,6 +96,7 @@ def xla_cc_test(
                        clean_dep("//xla/service/memory_space_assignment:memory_space_assignment_proto_cc_impl"),
                        clean_dep("//xla/service/gpu:backend_configs_cc_impl"),
                        clean_dep("//xla/service/gpu:hlo_op_profile_proto_cc_impl"),
+                       clean_dep("//xla/stream_executor:device_description_proto_cc_impl"),
                        clean_dep("//xla/stream_executor:dnn_proto_cc_impl"),
                        clean_dep("//xla/stream_executor:device_id_utils"),
                        clean_dep("//xla/stream_executor:stream_executor_impl"),

From e2a703188ce58e0448bc5ddae2c4f3fd325dbba1 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 29 Sep 2023 10:35:19 -0700
Subject: [PATCH 427/567] Refactors the strategy & cost creation within
 EnumerateAll2DPartition[Reshape] into their own dedicated functions.

PiperOrigin-RevId: 569529051
---
 .../auto_sharding/auto_sharding.cc            | 189 +++++++++++-------
 1 file changed, 116 insertions(+), 73 deletions(-)

diff --git a/third_party/xla/xla/hlo/experimental/auto_sharding/auto_sharding.cc b/third_party/xla/xla/hlo/experimental/auto_sharding/auto_sharding.cc
index c19641d58e4019..8ffb8d452b6dd4 100644
--- a/third_party/xla/xla/hlo/experimental/auto_sharding/auto_sharding.cc
+++ b/third_party/xla/xla/hlo/experimental/auto_sharding/auto_sharding.cc
@@ -751,6 +751,14 @@ void EnumerateAll1DPartition(const HloInstruction* ins, const Shape& shape,
   }
 }
 
+void BuildStrategyAndCostForOp(const HloInstruction* ins, const Shape& shape,
+                               const Array<int64_t>& device_mesh,
+                               const ClusterEnvironment& cluster_env,
+                               const StrategyMap& strategy_map,
+                               std::unique_ptr<StrategyVector>& strategies,
+                               const CallGraph& call_graph,
+                               absl::Span<const int64_t> tensor_dims);
+
 // Enumerate 2D partition
 void EnumerateAll2DPartition(const HloInstruction* ins, const Shape& shape,
                              const Array<int64_t>& device_mesh,
@@ -782,59 +790,74 @@ void EnumerateAll2DPartition(const HloInstruction* ins, const Shape& shape,
         continue;
       }
 
-      std::string name = absl::StrFormat("S{%d,%d} @ {0,1}", i, j);
-      HloSharding output_spec = Tile(shape, {i, j}, {0, 1}, device_mesh);
-      double compute_cost = 0, communication_cost = 0;
-      double memory_cost = GetBytes(shape) / output_spec.NumTiles();
-      std::vector<std::optional<HloSharding>> input_shardings;
-      std::vector<std::vector<double>> resharding_costs;
-      if (ins->opcode() == HloOpcode::kConditional) {
-        // TODO(pratikf): Compute input_shardings for kConditional ops
-        resharding_costs =
-            CreateZeroReshardingCostsForAllOperands(ins, strategy_map);
-      } else if (ins->operand_count() > 0 &&
-                 ins->operand(0)->shape().IsTuple()) {
-        CHECK_EQ(ins->operand_count(), 1)
-            << "Do not support instructions with more than one tuple "
-               "operand. If this CHECK fails, we will need to fix "
-               "b/233412625.";
-        std::tie(resharding_costs, input_shardings) =
-            ReshardingCostsForTupleOperand(
-                ins->operand(0), strategy_map.at(ins->operand(0)).get());
-        LOG(INFO) << absl::StrJoin(resharding_costs.back(), ",");
-      } else {
-        std::tie(resharding_costs, input_shardings) =
-            GenerateReshardingCostsAndShardingsForAllOperands(
-                ins, output_spec, strategy_map, cluster_env, call_graph);
-      }
-      // TODO(pratikf) Communication costs for sort HLO ops. This is currently a
-      // placeholder approximation and should be improved.
-      if (ins->opcode() == HloOpcode::kSort) {
-        auto sort_ins = xla::DynCast<HloSortInstruction>(ins);
-        CHECK(sort_ins);
+      BuildStrategyAndCostForOp(ins, shape, device_mesh, cluster_env,
+                                strategy_map, strategies, call_graph, {i, j});
+    }
+  }
+}
 
-        if (sort_ins->sort_dimension() == i) {
-          communication_cost = ComputeSortCommunicationCost(
-              sort_ins->sort_dimension(), i, 0, shape, cluster_env);
-        } else if (sort_ins->sort_dimension() == j) {
-          communication_cost = ComputeSortCommunicationCost(
-              sort_ins->sort_dimension(), j, 1, shape, cluster_env);
-        }
-      } else if (IsTopKCustomCall(ins)) {
-        auto topk_dim = ins->operand(0)->shape().rank() - 1;
-        if (topk_dim == i) {
-          communication_cost =
-              ComputeSortCommunicationCost(topk_dim, i, 0, shape, cluster_env);
-        } else if (topk_dim == j) {
-          communication_cost =
-              ComputeSortCommunicationCost(topk_dim, j, 1, shape, cluster_env);
-        }
+// Builds the strategy + cost for sharding tensor_dims[k] along mesh dim k.
+void BuildStrategyAndCostForOp(const HloInstruction* ins, const Shape& shape,
+                               const Array<int64_t>& device_mesh,
+                               const ClusterEnvironment& cluster_env,
+                               const StrategyMap& strategy_map,
+                               std::unique_ptr<StrategyVector>& strategies,
+                               const CallGraph& call_graph,
+                               absl::Span<const int64_t> tensor_dims) {
+  std::vector<int64_t> mesh_dims(tensor_dims.size());
+  std::iota(mesh_dims.begin(), mesh_dims.end(), 0);
+  std::string name =
+      absl::StrFormat("S{%s} @ {%s}", absl::StrJoin(tensor_dims, ","),
+                      absl::StrJoin(mesh_dims, ","));
+  HloSharding output_spec = Tile(shape, tensor_dims, mesh_dims, device_mesh);
+  double compute_cost = 0, communication_cost = 0;
+  double memory_cost = GetBytes(shape) / output_spec.NumTiles();
+  std::vector<std::optional<HloSharding>> input_shardings;
+  std::vector<std::vector<double>> resharding_costs;
+  if (ins->opcode() == HloOpcode::kConditional) {
+    // TODO(pratikf): Compute input_shardings for kConditional ops
+    resharding_costs =
+        CreateZeroReshardingCostsForAllOperands(ins, strategy_map);
+  } else if (ins->operand_count() > 0 && ins->operand(0)->shape().IsTuple()) {
+    CHECK_EQ(ins->operand_count(), 1)
+        << "Do not support instructions with more than one tuple "
+           "operand. If this CHECK fails, we will need to fix "
+           "b/233412625.";
+    std::tie(resharding_costs, input_shardings) =
+        ReshardingCostsForTupleOperand(ins->operand(0),
+                                       strategy_map.at(ins->operand(0)).get());
+    LOG(INFO) << absl::StrJoin(resharding_costs.back(), ",");
+  } else {
+    std::tie(resharding_costs, input_shardings) =
+        GenerateReshardingCostsAndShardingsForAllOperands(
+            ins, output_spec, strategy_map, cluster_env, call_graph);
+  }
+  // TODO(pratikf) Communication costs for sort HLO ops. This is currently a
+  // placeholder approximation and should be improved.
+  if (ins->opcode() == HloOpcode::kSort) {
+    auto sort_ins = xla::DynCast<HloSortInstruction>(ins);
+    CHECK(sort_ins);
+    for (int64_t dim = 0; dim < tensor_dims.size(); ++dim) {
+      if (sort_ins->sort_dimension() == tensor_dims[dim]) {
+        communication_cost = ComputeSortCommunicationCost(
+            sort_ins->sort_dimension(), tensor_dims[dim], dim, shape,
+            cluster_env);
+        break;
+      }
+    }
+  } else if (IsTopKCustomCall(ins)) {
+    auto topk_dim = ins->operand(0)->shape().rank() - 1;
+    for (int64_t dim = 0; dim < tensor_dims.size(); ++dim) {
+      if (topk_dim == tensor_dims[dim]) {
+        communication_cost = ComputeSortCommunicationCost(
+            topk_dim, tensor_dims[dim], dim, shape, cluster_env);
+        break;
       }
-      strategies->leaf_vector.push_back(ShardingStrategy(
-          {name, output_spec, compute_cost, communication_cost, memory_cost,
-           std::move(resharding_costs), input_shardings}));
     }
   }
+  strategies->leaf_vector.push_back(ShardingStrategy(
+      {name, output_spec, compute_cost, communication_cost, memory_cost,
+       std::move(resharding_costs), input_shardings}));
 }
 
 // Enumerate all 1d partition strategies for reshape.
@@ -888,6 +911,13 @@ void EnumerateAll1DPartitionReshape(const HloInstruction* ins,
   }
 }
 
+void BuildStrategyAndCostForReshape(const HloInstruction* ins,
+                                    const Array<int64_t>& device_mesh,
+                                    const ClusterEnvironment& cluster_env,
+                                    const StrategyMap& strategy_map,
+                                    std::unique_ptr<StrategyVector>& strategies,
+                                    absl::Span<const int64_t> tensor_dims);
+
 // Enumerate 2D partition for reshape. Batch dim is always partitioned.
 void Enumerate2DPartitionReshape(const HloInstruction* ins,
                                  const Array<int64_t>& device_mesh,
@@ -902,7 +932,6 @@ void Enumerate2DPartitionReshape(const HloInstruction* ins,
     batch_dim = iter->second;
   }
 
-  const HloInstruction* operand = ins->operand(0);
 
   // Split batch dim + another dim
   for (int64_t i = 0; i < ins->shape().rank(); ++i) {
@@ -920,33 +949,47 @@ void Enumerate2DPartitionReshape(const HloInstruction* ins,
         continue;
       }
 
-      HloSharding output_spec = Tile(ins->shape(), {i, j}, {0, 1}, device_mesh);
-      std::optional<HloSharding> input_spec =
-          hlo_sharding_util::ReshapeSharding(ins->shape(), operand->shape(),
-                                             output_spec);
-      if (!input_spec.has_value()) {  // invalid reshape
-        continue;
-      }
-
-      std::string name = absl::StrFormat("S%d%d @ {%d,%d}", i, j, 0, 1);
-      double compute_cost = 0, communication_cost = 0;
-      double memory_cost = GetBytes(ins->shape()) / output_spec.NumTiles();
-
-      std::vector<std::vector<double>> resharding_costs{
-          ReshardingCostVector(strategy_map.at(operand).get(), operand->shape(),
-                               *input_spec, cluster_env)};
-      strategies->leaf_vector.push_back(
-          ShardingStrategy({name,
-                            output_spec,
-                            compute_cost,
-                            communication_cost,
-                            memory_cost,
-                            std::move(resharding_costs),
-                            {*input_spec}}));
+      BuildStrategyAndCostForReshape(ins, device_mesh, cluster_env,
+                                     strategy_map, strategies, {i, j});
     }
   }
 }
 
+// Builds the reshape strategy + cost that pairs tensor_dims[k] with mesh dim
+// k and appends it to `strategies`; appends nothing for an invalid reshape.
+void BuildStrategyAndCostForReshape(const HloInstruction* ins,
+                                    const Array<int64_t>& device_mesh,
+                                    const ClusterEnvironment& cluster_env,
+                                    const StrategyMap& strategy_map,
+                                    std::unique_ptr<StrategyVector>& strategies,
+                                    absl::Span<const int64_t> tensor_dims) {
+  // Mesh dims are taken in order: 0, 1, ..., tensor_dims.size() - 1.
+  std::vector<int64_t> mesh_dims(tensor_dims.size());
+  std::iota(mesh_dims.begin(), mesh_dims.end(), 0);
+
+  const HloInstruction* src = ins->operand(0);
+  HloSharding output_spec =
+      Tile(ins->shape(), tensor_dims, mesh_dims, device_mesh);
+  std::optional<HloSharding> input_spec = hlo_sharding_util::ReshapeSharding(
+      ins->shape(), src->shape(), output_spec);
+  if (!input_spec) {  // invalid reshape
+    return;
+  }
+
+  // Resharding costs cover the single operand only.
+  std::vector<std::vector<double>> resharding_costs{ReshardingCostVector(
+      strategy_map.at(src).get(), src->shape(), *input_spec, cluster_env)};
+  double compute_cost = 0, communication_cost = 0;
+  double memory_cost = GetBytes(ins->shape()) / output_spec.NumTiles();
+  std::string name =
+      absl::StrFormat("S%s @ {%s}", absl::StrJoin(tensor_dims, ""),
+                      absl::StrJoin(mesh_dims, ","));
+
+  strategies->leaf_vector.push_back(ShardingStrategy(
+      {name, output_spec, compute_cost, communication_cost, memory_cost,
+       std::move(resharding_costs), {*input_spec}}));
+}
+
 // Return the maximum number of tiles among all strategies of an instruction.
 int64_t MaxNumTiles(const StrategyMap& strategy_map,
                     const HloInstruction* ins) {

From cee7495bd2d351a4a1c217380ab6cd8f71a8e43a Mon Sep 17 00:00:00 2001
From: Tongfei Guo <tongfei@google.com>
Date: Fri, 29 Sep 2023 10:46:25 -0700
Subject: [PATCH 428/567] Add gather test coverage for b/301618442.

PiperOrigin-RevId: 569532078
---
 third_party/xla/xla/tests/BUILD               | 13 ++---
 .../xla/xla/tests/gather_operation_test.cc    | 50 +++++++++++++++++++
 2 files changed, 57 insertions(+), 6 deletions(-)

diff --git a/third_party/xla/xla/tests/BUILD b/third_party/xla/xla/tests/BUILD
index 3a67d99fb39e3f..fb4fabaa7e5f4d 100644
--- a/third_party/xla/xla/tests/BUILD
+++ b/third_party/xla/xla/tests/BUILD
@@ -2,18 +2,18 @@
 #   Base testing infrastructure for XLA.
 
 load("//xla/tests:build_defs.bzl", "generate_backend_suites", "generate_backend_test_macros", "xla_test", "xla_test_library")
-load(
-    "@local_tsl//tsl/platform:build_config_root.bzl",
-    "tf_cuda_tests_tags",
-)
-load("@local_tsl//tsl:tsl.default.bzl", "filegroup")
 load("//xla:xla.bzl", "xla_cc_binary", "xla_cc_test")
-load("@local_tsl//tsl/platform:rules_cc.bzl", "cc_library")
 load(
     "//xla/stream_executor:build_defs.bzl",
     "if_gpu_is_configured",
 )
 load("@local_tsl//tsl:tsl.bzl", "tsl_copts")
+load("@local_tsl//tsl:tsl.default.bzl", "filegroup")
+load(
+    "@local_tsl//tsl/platform:build_config_root.bzl",
+    "tf_cuda_tests_tags",
+)
+load("@local_tsl//tsl/platform:rules_cc.bzl", "cc_library")
 
 package(
     default_visibility = ["//visibility:public"],
@@ -954,6 +954,7 @@ xla_test(
         ":hlo_test_base",
         ":test_macros_header",
         ":xla_internal_test_main",
+        "//xla:array",
         "//xla:execution_options_util",
         "//xla:literal_util",
         "//xla:status_macros",
diff --git a/third_party/xla/xla/tests/gather_operation_test.cc b/third_party/xla/xla/tests/gather_operation_test.cc
index fc8be385be4673..44d43141b8fe92 100644
--- a/third_party/xla/xla/tests/gather_operation_test.cc
+++ b/third_party/xla/xla/tests/gather_operation_test.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+#include "xla/array.h"
 #include "xla/client/xla_builder.h"
 #include "xla/execution_options_util.h"
 #include "xla/literal_util.h"
@@ -790,5 +791,54 @@ XLA_TEST_F(GatherClientLibraryTest,
   LiteralTestUtil::ExpectR2Equal<int32_t>({{1, 2, 3}, {7, 8, 9}},
                                           result_literal);
 }
+
+XLA_TEST_F(GatherOperationTest, b_301618442_case1) {
+  const std::string hlo_text = R"(
+HloModule b_301618442
+
+ENTRY main {
+  operand = s32[4,256] parameter(0)
+  indices = s32[64] parameter(1)
+  ROOT gather = s32[64,256] gather(operand, indices),
+      offset_dims={1},
+      collapsed_slice_dims={0},
+      start_index_map={0},
+      index_vector_dim=1,
+      slice_sizes={1,256}
+}
+)";
+  Array<int32_t> operand({4, 256});
+  operand.FillIota(0);
+  Literal operand_literal = LiteralUtil::CreateFromArray(operand);
+  Array<int32_t> indices({64});
+  indices.FillRandomUniform(0, 3);  // Row indices into the 4-row operand.
+  Literal indices_literal = LiteralUtil::CreateFromArray(indices);
+  RunTest(hlo_text, &operand_literal, &indices_literal);
+}
+
+XLA_TEST_F(GatherOperationTest, b_301618442_case2) {
+  const std::string hlo_text = R"(
+HloModule b_301618442
+
+ENTRY main {
+  operand = s32[4,4096] parameter(0)
+  indices = s32[64] parameter(1)
+  ROOT gather = s32[64,4096] gather(operand, indices),
+      offset_dims={1},
+      collapsed_slice_dims={0},
+      start_index_map={0},
+      index_vector_dim=1,
+      slice_sizes={1,4096}
+}
+)";
+  Array<int32_t> operand({4, 4096});
+  operand.FillIota(0);
+  Literal operand_literal = LiteralUtil::CreateFromArray(operand);
+  Array<int32_t> indices({64});
+  indices.FillRandomUniform(0, 3);  // Row indices into the 4-row operand.
+  Literal indices_literal = LiteralUtil::CreateFromArray(indices);
+  RunTest(hlo_text, &operand_literal, &indices_literal);
+}
+
 }  // namespace
 }  // namespace xla

From fed78494b256794b95f03cf459fbb5d8f7428eb1 Mon Sep 17 00:00:00 2001
From: Anlun Xu <anlunx@google.com>
Date: Fri, 29 Sep 2023 11:09:53 -0700
Subject: [PATCH 429/567] [xla:gpu] Deallocate and disable gpu graphs that
 cause graph update error

PiperOrigin-RevId: 569538929
---
 third_party/xla/xla/runtime/state.h           | 15 +++++++
 third_party/xla/xla/service/gpu/runtime/BUILD |  2 +
 .../xla/xla/service/gpu/runtime/executable.cc |  5 ++-
 .../xla/xla/service/gpu/runtime/executable.h  |  1 +
 .../xla/service/gpu/runtime/graph_launch.cc   | 43 +++++++++++++++----
 .../xla/service/gpu/runtime/graph_launch.h    |  6 +++
 6 files changed, 62 insertions(+), 10 deletions(-)

diff --git a/third_party/xla/xla/runtime/state.h b/third_party/xla/xla/runtime/state.h
index ee17422ee0d22c..c505dd96ffea72 100644
--- a/third_party/xla/xla/runtime/state.h
+++ b/third_party/xla/xla/runtime/state.h
@@ -74,6 +74,8 @@ class StateVector {
 
     absl::StatusOr<T*> Get(size_t id);
 
+    absl::Status Erase(size_t id);
+
     // Returns a state constructed from this snapshot for a given id.
     State<T> state(size_t id) { return State<T>(id, this); }
 
@@ -197,6 +199,19 @@ absl::StatusOr<T*> StateVector<T>::Snapshot::Get(size_t id) {
   return absl::InternalError("Value not found in state vector");
 }
 
+// Destroys the value stored at `id`; InternalError if no value is stored.
+template <typename T>
+absl::Status StateVector<T>::Snapshot::Erase(size_t id) {
+  absl::MutexLock lock(&owning_state_.mu_);
+  std::vector<std::unique_ptr<T>>& slots = owning_state_.vector_;
+  const bool has_value = id < slots.size() && slots[id] != nullptr;
+  if (!has_value) {
+    return absl::InternalError("Value not found in state vector");
+  }
+  slots[id].reset();
+  return absl::OkStatus();
+}
+
 }  // namespace runtime
 }  // namespace xla
 
diff --git a/third_party/xla/xla/service/gpu/runtime/BUILD b/third_party/xla/xla/service/gpu/runtime/BUILD
index b087ce7f33d285..4cdd2dcbbd8464 100644
--- a/third_party/xla/xla/service/gpu/runtime/BUILD
+++ b/third_party/xla/xla/service/gpu/runtime/BUILD
@@ -418,6 +418,7 @@ cc_library(
         ":gemm",
         ":kernel_launch",
         ":support",
+        "//xla:statusor",
         "//xla/runtime:custom_call",
         "//xla/runtime:custom_call_registry",
         "//xla/runtime:executable",
@@ -427,6 +428,7 @@ cc_library(
         "//xla/stream_executor/gpu:gpu_graph",
         "@com_google_absl//absl/container:node_hash_map",
         "@com_google_absl//absl/log",
+        "@com_google_absl//absl/log:check",
         "@com_google_absl//absl/status",
         "@com_google_absl//absl/synchronization",
         "@local_tsl//tsl/profiler/lib:profiler_lock",
diff --git a/third_party/xla/xla/service/gpu/runtime/executable.cc b/third_party/xla/xla/service/gpu/runtime/executable.cc
index 7eb80a386b0dab..8fe28a284f275f 100644
--- a/third_party/xla/xla/service/gpu/runtime/executable.cc
+++ b/third_party/xla/xla/service/gpu/runtime/executable.cc
@@ -420,6 +420,8 @@ Status GpuRuntimeExecutable::Execute(
       executor_graphs->snapshot();
   CapturedFunctionExecutionCount::Snapshot execution_count =
       captured_function_counts_(executor)->snapshot();
+  OrdinalToFallback::Snapshot ordinal_to_fallback =
+      ordinal_to_fallback_.snapshot();
 #endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 
   // Kernels in concurrent regions should be launched on borrowed stream, so
@@ -459,7 +461,7 @@ Status GpuRuntimeExecutable::Execute(
       // only.
       &fused_attention_runners, &fused_attention_backward_runners,
 #endif  // GOOGLE_CUDA
-      &graph_instances, &execution_count,
+      &graph_instances, &execution_count, &ordinal_to_fallback,
 #endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
       &concurrent_region_status,
       // Null pointer will be interpreted as an absence of async collectives
@@ -496,6 +498,7 @@ Status GpuRuntimeExecutable::Execute(
 
     if (auto instantiated = graph_instances_.InstantiateAllGraphs(
             run_options, executable, user_data, device_ptr,
+            &ordinal_to_fallback,
             debug_options_.xla_gpu_graph_eviction_timeout_seconds());
         !instantiated.ok()) {
       return InternalError("Failed to instantiate GPU graphs: %s",
diff --git a/third_party/xla/xla/service/gpu/runtime/executable.h b/third_party/xla/xla/service/gpu/runtime/executable.h
index 73f357596d16e9..05967e17afa3c6 100644
--- a/third_party/xla/xla/service/gpu/runtime/executable.h
+++ b/third_party/xla/xla/service/gpu/runtime/executable.h
@@ -177,6 +177,7 @@ class GpuRuntimeExecutable {
   // Keep captured and instantiated GPU graphs instances.
   GraphInstances graph_instances_;
   CapturedFunctionExecutionCounts captured_function_counts_;
+  OrdinalToFallback ordinal_to_fallback_;
 #endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 
   // Keep an executable state for all registered runtime modules.
diff --git a/third_party/xla/xla/service/gpu/runtime/graph_launch.cc b/third_party/xla/xla/service/gpu/runtime/graph_launch.cc
index 58eb78cb82b370..fa42e1330050c4 100644
--- a/third_party/xla/xla/service/gpu/runtime/graph_launch.cc
+++ b/third_party/xla/xla/service/gpu/runtime/graph_launch.cc
@@ -24,8 +24,10 @@ limitations under the License.
 #include <optional>
 #include <string>
 #include <utility>
+#include <variant>
 #include <vector>
 
+#include "absl/log/check.h"
 #include "absl/log/log.h"
 #include "absl/status/status.h"
 #include "absl/synchronization/mutex.h"
@@ -38,6 +40,7 @@ limitations under the License.
 #include "xla/service/gpu/runtime/kernel_launch.h"
 #include "xla/service/gpu/runtime/support.h"
 #include "xla/service/service_executable_run_options.h"
+#include "xla/statusor.h"
 #include "tsl/profiler/lib/profiler_lock.h"
 #include "tsl/profiler/lib/traceme.h"
 #include "tsl/profiler/lib/traceme_encode.h"
@@ -267,7 +270,8 @@ bool GraphInstances::InstantiatedAllGraphs(
 Status GraphInstances::InstantiateAllGraphs(
     const ServiceExecutableRunOptions* run_options,
     const Executable& executable, const CustomCall::UserData& user_data,
-    void* ptr, std::optional<uint64_t> eviction_timeout_seconds) {
+    void* ptr, OrdinalToFallback::Snapshot* ordinal_to_fallback,
+    std::optional<uint64_t> eviction_timeout_seconds) {
   // We have only "main" function in the executable.
   if (executable.num_functions() == 1) return OkStatus();
 
@@ -299,6 +303,9 @@ Status GraphInstances::InstantiateAllGraphs(
                           "xla.gpu.graph.capture"))
       continue;
 
+    StatusOr<std::monostate*> fallback = ordinal_to_fallback->Get(ordinal);
+    if (fallback.ok()) continue;
+
     VLOG(3) << "Instantiate Gpu graph defined by capture function @"
             << executable.function_name(ordinal) << " (ordinal = " << ordinal
             << ")";
@@ -550,6 +557,7 @@ static absl::Status LaunchGraph(
     StreamExecutorConvRunners::Snapshot* convs,
     StreamExecutorGraphInstances::Snapshot* instances,
     CapturedFunctionExecutionCount::Snapshot* counts,
+    OrdinalToFallback::Snapshot* ordinal_to_fallback,
     GemmConfigs::Snapshot* gemm_config, runtime::Executable* executable,
     NonAtomicallyUpgradeableRWLock* gpu_lock,
     ConcurrentRegionStatus* region_status, CustomCall::RemainingArgs fwd_args,
@@ -583,7 +591,10 @@ static absl::Status LaunchGraph(
   // work around disable graph execution and run everything in op-by-op mode.
   bool is_profiling = tsl::profiler::ProfilerLock::HasActiveSession();
 
-  if (count < num_runs_to_instantiate || is_profiling) {
+  StatusOr<std::monostate*> fallback =
+      ordinal_to_fallback->Get(capture.ordinal);
+
+  if (count < num_runs_to_instantiate || is_profiling || fallback.ok()) {
     VLOG(3) << "Run gpu graph in op-by-op mode: ordinal = " << capture.ordinal;
     return RunGraphOpByOp(run_options, function_ref, fwd_args, user_data());
   }
@@ -645,21 +656,34 @@ static absl::Status LaunchGraph(
   TF_ASSIGN_OR_RETURN(
       auto g, CaptureGraph(run_options, function_ref, args, user_data()));
 
-  // At this point we have to grab a writer lock, because we might potentially
-  // have concurrent execution of the cached graph instance.
-  absl::WriterMutexLock lock(instance->mutex.get());
+  se::gpu::OwnedGpuGraphExec::UpdateResult update_result;
+  {
+    // At this point we have to grab a writer lock, because we might potentially
+    // have concurrent execution of the cached graph instance.
+    absl::WriterMutexLock lock(instance->mutex.get());
 
-  // Update captured graph executable.
-  TF_ASSIGN_OR_RETURN(se::gpu::OwnedGpuGraphExec::UpdateResult update_result,
-                      instance->exec.Update(std::move(g)));
+    // Update captured graph executable.
+    TF_ASSIGN_OR_RETURN(update_result, instance->exec.Update(std::move(g)));
+  }
 
   switch (update_result) {
-    case se::gpu::OwnedGpuGraphExec::UpdateResult::kFallback:
+    case se::gpu::OwnedGpuGraphExec::UpdateResult::kFallback: {
       LOG(WARNING) << "Fallback to op-by-op mode because memset node breaks "
                       "graph update";
+      // Deallocate instance.
+      TF_RETURN_IF_ERROR(instances->Erase(capture.ordinal));
+      // Set ordinal_to_fallback to prevent future instantiation of this graph.
+      TF_ASSIGN_OR_RETURN(
+          std::monostate * fallback,
+          ordinal_to_fallback->GetOrCreate(
+              capture.ordinal,
+              []() -> StatusOr<std::monostate> { return std::monostate{}; }));
+      DCHECK(fallback);
       return RunGraphOpByOp(run_options, function_ref, fwd_args, user_data());
+    }
     case se::gpu::OwnedGpuGraphExec::UpdateResult::kSuccess:
       // Update captured pointer hash.
+      absl::WriterMutexLock lock(instance->mutex.get());
       instance->ptr_hash = ptrs_hash;
 
       TraceMe trace([&] {
@@ -691,6 +715,7 @@ XLA_RUNTIME_DEFINE_CUSTOM_CALL(
         .UserData<StreamExecutorConvRunners::Snapshot*>()
         .UserData<StreamExecutorGraphInstances::Snapshot*>()
         .UserData<CapturedFunctionExecutionCount::Snapshot*>()
+        .UserData<OrdinalToFallback::Snapshot*>()
         .UserData<GemmConfigs::Snapshot*>()
         .UserData<Executable*>()
         .UserData<NonAtomicallyUpgradeableRWLock*>()
diff --git a/third_party/xla/xla/service/gpu/runtime/graph_launch.h b/third_party/xla/xla/service/gpu/runtime/graph_launch.h
index 2c14a0238595df..e4841541bc781e 100644
--- a/third_party/xla/xla/service/gpu/runtime/graph_launch.h
+++ b/third_party/xla/xla/service/gpu/runtime/graph_launch.h
@@ -21,6 +21,7 @@ limitations under the License.
 #include <optional>
 #include <string>
 #include <utility>
+#include <variant>
 
 #include "absl/container/node_hash_map.h"
 #include "xla/runtime/custom_call_registry.h"
@@ -47,6 +48,10 @@ class StreamExecutorGraphInstances;  // Forward declare
 class CapturedFunctionExecutionCount
     : public runtime::StateVector<std::unique_ptr<std::atomic<uint64_t>>> {};
 
+// The i-th value exists if the capture function with ordinal i caused a graph
+// update failure; such functions are not re-instantiated and run op-by-op.
+class OrdinalToFallback : public runtime::StateVector<std::monostate> {};
+
 #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 
 // A state vector that owns all instantiated GPU graphs. Graph capture function
@@ -106,6 +111,7 @@ class GraphInstances {
       const ServiceExecutableRunOptions* run_options,
       const runtime::Executable& executable,
       const runtime::CustomCall::UserData& user_data, void* ptr,
+      OrdinalToFallback::Snapshot* ordinal_to_fallback,
       std::optional<uint64_t> eviction_timeout_seconds = std::nullopt);
 
   // Returns true if all Gpu graphs were already instantiated.

From dab293abf1b258358660173965bc955c3884e5c0 Mon Sep 17 00:00:00 2001
From: Shixin Li <shixinli@google.com>
Date: Fri, 29 Sep 2023 11:18:52 -0700
Subject: [PATCH 430/567] Enable cross compilation for GPU PJRT
 compiler/client.

PiperOrigin-RevId: 569541694
---
 third_party/xla/xla/pjrt/gpu/BUILD            |  24 ++--
 .../xla/xla/pjrt/gpu/se_gpu_pjrt_client.cc    |  32 ++---
 .../xla/xla/pjrt/gpu/se_gpu_pjrt_client.h     |  17 +--
 .../xla/xla/pjrt/gpu/se_gpu_pjrt_compiler.cc  |  64 ++++-----
 .../xla/xla/pjrt/gpu/se_gpu_pjrt_compiler.h   |   4 +-
 .../pjrt/gpu/se_gpu_pjrt_compiler_aot_test.cc | 117 ++++++++--------
 .../xla/pjrt/gpu/se_gpu_pjrt_compiler_test.cc |  11 --
 third_party/xla/xla/service/BUILD             |  26 ++++
 third_party/xla/xla/service/local_service.cc  |  88 +-----------
 third_party/xla/xla/service/local_service.h   |   7 -
 .../xla/xla/service/local_service_utils.cc    | 125 ++++++++++++++++++
 .../xla/xla/service/local_service_utils.h     |  38 ++++++
 third_party/xla/xla/service/service.cc        |   2 +-
 third_party/xla/xla/service/service.h         |  12 +-
 14 files changed, 319 insertions(+), 248 deletions(-)
 create mode 100644 third_party/xla/xla/service/local_service_utils.cc
 create mode 100644 third_party/xla/xla/service/local_service_utils.h

diff --git a/third_party/xla/xla/pjrt/gpu/BUILD b/third_party/xla/xla/pjrt/gpu/BUILD
index 389554049783f9..67d4ae617ccd65 100644
--- a/third_party/xla/xla/pjrt/gpu/BUILD
+++ b/third_party/xla/xla/pjrt/gpu/BUILD
@@ -4,7 +4,6 @@ load("//xla/stream_executor:build_defs.bzl", "if_gpu_is_configured")
 load("@local_config_rocm//rocm:build_defs.bzl", "if_rocm")
 load("@local_tsl//tsl:tsl.bzl", "if_nccl")
 load("@local_tsl//tsl/platform:build_config.bzl", "tf_proto_library")
-load("@local_tsl//tsl/platform:build_config_root.bzl", "tf_cuda_tests_tags")
 load("@local_tsl//tsl/platform:rules_cc.bzl", "cc_library")
 load("@local_tsl//tsl/platform/default:cuda_build_defs.bzl", "if_cuda_is_configured")
 
@@ -234,10 +233,12 @@ cc_library(
         "//xla/service:hlo_proto_cc",
         "//xla/service:hlo_proto_util",
         "//xla/service:local_service",
+        "//xla/service:local_service_utils",
         "//xla/service/gpu:executable_proto_cc",
         "//xla/service/gpu:gpu_target_config",
         "//xla/stream_executor/cuda:cuda_platform_id",
         "@com_google_absl//absl/status",
+        "@local_tsl//tsl/platform:casts",
         "@local_tsl//tsl/platform:errors",
     ] + if_cuda([
         ":nccl_id_store_cuda",
@@ -258,7 +259,12 @@ cc_library(
 xla_cc_test(
     name = "se_gpu_pjrt_compiler_test",
     srcs = if_gpu_is_configured(["se_gpu_pjrt_compiler_test.cc"]),
-    tags = tf_cuda_tests_tags(),
+    tags = [
+        "config-cuda-only",
+        "gpu",
+        "no_oss",
+        "requires-gpu-nvidia",
+    ],
     deps = [
         ":se_gpu_pjrt_client",
         ":se_gpu_pjrt_compiler",
@@ -282,7 +288,6 @@ xla_cc_test(
 xla_cc_test(
     name = "se_gpu_pjrt_compiler_aot_test",
     srcs = if_gpu_is_configured(["se_gpu_pjrt_compiler_aot_test.cc"]),
-    local_defines = if_cuda_is_configured(["GOOGLE_CUDA=1"]),
     tags = [
         "config-cuda-only",
         "gpu",
@@ -304,6 +309,7 @@ xla_cc_test(
         "//xla/service:gpu_plugin",
         "//xla/service:hlo_parser",
         "//xla/service/gpu:gpu_target_config",
+        "//xla/service/gpu:nvptx_compiler_impl",
         "//xla/stream_executor/cuda:cublas_plugin",
         "//xla/tests:literal_test_util",
         "@com_google_absl//absl/memory",
@@ -314,18 +320,8 @@ xla_cc_test(
         "@llvm-project//mlir:FuncDialect",
         "@llvm-project//mlir:IR",
         "@llvm-project//mlir:Parser",
-        "@local_tsl//tsl/lib/core:status_test_util",
         "@local_tsl//tsl/platform:casts",
-        "@local_tsl//tsl/platform:status_matchers",
         "@local_tsl//tsl/platform:statusor",
         "@local_tsl//tsl/platform:test_main",
-    ] + if_cuda([
-        ":nccl_id_store_cuda",
-        "@local_config_cuda//cuda:cuda_headers",
-        "//xla/stream_executor/cuda:cuda_activation_header",
-        "//xla/stream_executor/gpu:gpu_cudamallocasync_allocator",
-    ]) + if_rocm([
-        ":nccl_id_store_rocm",
-        "@local_config_rocm//rocm:rocm_headers",
-    ]),
+    ],
 )
diff --git a/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_client.cc b/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_client.cc
index 9e314cf2eefd06..9352b830152df5 100644
--- a/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_client.cc
+++ b/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_client.cc
@@ -556,9 +556,9 @@ StatusOr<std::unique_ptr<StreamExecutorUnloadedExecutable>> FromProto(
 }  // namespace
 
 StatusOr<std::unique_ptr<PjRtLoadedExecutable>>
-StreamExecutorGpuClient::LoadSerialized(absl::string_view serialized,
-                                        std::optional<CompileOptions> options,
-                                        const LoadOptions& load_options) {
+StreamExecutorGpuClient::LoadSerializedExecutable(
+    absl::string_view serialized, std::optional<CompileOptions> options,
+    const LoadOptions& load_options) {
 #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
   StreamExecutorUnloadedExecutableProto proto;
   if (serialized.size() > std::numeric_limits<int>::max()) {
@@ -572,8 +572,7 @@ StreamExecutorGpuClient::LoadSerialized(absl::string_view serialized,
         "failed");
   }
   TF_ASSIGN_OR_RETURN(auto se_executable, FromProto(proto));
-  // TODO(b/296466237): Unify the `Load` method.
-  return Load(std::move(se_executable));
+  return Load(std::move(se_executable), LoadOptions());
 #endif
   return absl::InternalError("LoadSerialized only works with cuda or rocm.");
 }
@@ -594,38 +593,35 @@ std::vector<std::unique_ptr<PjRtStreamExecutorDevice>> BuildLocalDevices(
 }
 
 StatusOr<std::unique_ptr<PjRtLoadedExecutable>> StreamExecutorGpuClient::Load(
-    std::unique_ptr<PjRtExecutable> executable) {
+    std::unique_ptr<PjRtExecutable> executable,
+    const LoadOptions& load_options) {
   auto se_executable =
       absl::WrapUnique(tensorflow::down_cast<StreamExecutorUnloadedExecutable*>(
           executable.release()));
 
   CompileOptions compile_options = se_executable->compile_options();
+  CompileOptions input_options = compile_options;
   TF_RETURN_IF_ERROR(compile_options.ApplyAllOptionOverrides());
   TF_ASSIGN_OR_RETURN(ExecutableExtras extras,
                       GetExecutableExtras(&compile_options));
 
-  TF_ASSIGN_OR_RETURN(
-      auto se_executor,
-      client()->backend().stream_executor(
-          compile_options.executable_build_options.device_ordinal()));
-
   // Load Executable from AOT compilation result.
   std::vector<std::unique_ptr<LocalExecutable>> local_executables;
   local_executables.reserve(se_executable->aot_executables().size());
   for (std::unique_ptr<xla::AotCompilationResult>& aot_executable :
        se_executable->aot_executables()) {
-    TF_ASSIGN_OR_RETURN(std::unique_ptr<Executable> executable,
-                        aot_executable->LoadExecutable(
-                            client()->backend().compiler(), se_executor));
-    local_executables.push_back(std::make_unique<LocalExecutable>(
-        std::move(executable), client()->local_service()->mutable_backend(),
-        compile_options.executable_build_options));
+    TF_ASSIGN_OR_RETURN(std::string serialized,
+                        aot_executable->SerializeAsString());
+    TF_ASSIGN_OR_RETURN(
+        std::unique_ptr<LocalExecutable> local_executable,
+        client()->Load(serialized, compile_options.executable_build_options));
+    local_executables.push_back(std::move(local_executable));
   }
   bool parameter_is_tupled_arguments =
       compile_options.parameter_is_tupled_arguments;
   auto ret = std::make_unique<PjRtStreamExecutorExecutable>(
       std::move(local_executables), parameter_is_tupled_arguments,
-      std::move(extras.device_assignment), std::move(compile_options),
+      std::move(extras.device_assignment), std::move(input_options),
       std::move(extras.addressable_device_logical_ids),
       std::move(extras.addressable_devices), this);
   TF_RETURN_IF_ERROR(ret->SetUpDonation(parameter_is_tupled_arguments));
diff --git a/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_client.h b/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_client.h
index c989b341547072..d2fdeea8c11078 100644
--- a/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_client.h
+++ b/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_client.h
@@ -191,24 +191,13 @@ class StreamExecutorGpuClient : public xla::PjRtStreamExecutorClient {
     return &topology_;
   }
 
-  // TODO(b/285385306): Enable loading a non-loaded PjRtExecutable.
   StatusOr<std::unique_ptr<PjRtLoadedExecutable>> Load(
       std::unique_ptr<PjRtExecutable> executable,
-      const LoadOptions& load_options) override {
-    return absl::WrapUnique<PjRtLoadedExecutable>(
-        tensorflow::down_cast<PjRtLoadedExecutable*>(executable.release()));
-  }
-
-  // TODO(b/296466237): Unify `Load` method after (de)serialization and tests on
-  // existing use cases are done.
-  StatusOr<std::unique_ptr<PjRtLoadedExecutable>> Load(
-      std::unique_ptr<PjRtExecutable> executable);
+      const LoadOptions& load_options) override;
 
-  // TODO(b/296466237): Unify `LoadSerializedExecutable` after fixing existing
-  // tests.
-  StatusOr<std::unique_ptr<PjRtLoadedExecutable>> LoadSerialized(
+  StatusOr<std::unique_ptr<PjRtLoadedExecutable>> LoadSerializedExecutable(
       absl::string_view serialized, std::optional<CompileOptions> options,
-      const LoadOptions& load_options);
+      const LoadOptions& load_options) override;
 
  private:
   xla::StreamExecutorGpuTopologyDescription topology_;
diff --git a/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_compiler.cc b/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_compiler.cc
index 6a4dbadf3405d5..e3457f350c8b7f 100644
--- a/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_compiler.cc
+++ b/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_compiler.cc
@@ -26,6 +26,7 @@ limitations under the License.
 #include "xla/pjrt/pjrt_compiler.h"
 #include "xla/pjrt/pjrt_executable.h"
 #include "xla/status_macros.h"
+#include "tsl/platform/casts.h"
 #include "tsl/platform/errors.h"
 
 #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
@@ -39,6 +40,7 @@ limitations under the License.
 #include "xla/service/hlo_module_util.h"
 #include "xla/service/hlo_proto_util.h"
 #include "xla/service/local_service.h"
+#include "xla/service/local_service_utils.h"
 #include "xla/stream_executor/cuda/cuda_platform_id.h"
 #endif
 
@@ -99,17 +101,14 @@ absl::StatusOr<std::unique_ptr<PjRtExecutable>> AotCompile(
       options.argument_layouts, &options.executable_build_options,
       &argument_layout_pointers));
 
-  // TODO(b/300657649): Call `UpdateBuildOptions` like in LocalClient::Compile.
-  // TODO(b/300657649): Get HloModuleConfig from `GetHloModuleConfig` like in
-  // LocalService::CompileExecutables.
+  TF_ASSIGN_OR_RETURN(std::unique_ptr<HloModuleConfig> hlo_config,
+                      GetHloModuleConfig(computation, argument_layout_pointers,
+                                         options.executable_build_options));
   HloModuleProto hlo_module_proto = computation.proto();
-  TF_ASSIGN_OR_RETURN(ProgramShape shape, computation.GetProgramShape());
-  DebugOptions debug_options = DefaultDebugOptionsIgnoringFlags();
-  HloModuleConfig config(shape);
-  config.set_debug_options(debug_options);
+  TF_ASSIGN_OR_RETURN(
+      std::unique_ptr<HloModule> hlo_module,
+      HloModule::CreateFromProto(hlo_module_proto, *hlo_config));
 
-  TF_ASSIGN_OR_RETURN(std::unique_ptr<HloModule> hlo_module,
-                      HloModule::CreateFromProto(hlo_module_proto, config));
 #if GOOGLE_CUDA
   auto gpu_compiler = gpu::NVPTXCompiler();
 #elif TENSORFLOW_USE_ROCM
@@ -147,23 +146,31 @@ absl::StatusOr<std::unique_ptr<PjRtExecutable>> AotCompile(
 #endif
 }  // namespace
 
-// TODO(b/285385306): Enable compilation on provided `topology`.
 absl::StatusOr<std::unique_ptr<PjRtExecutable>>
 StreamExecutorGpuCompiler::Compile(CompileOptions options,
                                    const XlaComputation& computation,
                                    const PjRtTopologyDescription& topology,
                                    PjRtClient* client) {
-  if (client == nullptr && gpu_target_config_ != std::nullopt) {
 #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
+  if (client == nullptr && gpu_target_config_ != std::nullopt) {
     return AotCompile(options, computation, *gpu_target_config_);
-#endif
-    return absl::InternalError(
-        "GPU AOT compilation requires the target to be built with CUDA or "
-        "ROCm.");
   }
-  // TODO(b/296466237): Remove client dependency.
   TF_RETURN_IF_ERROR(IsValidTopologyAndClientForCompile(topology, client));
-  return client->Compile(computation, options);
+
+  PjRtStreamExecutorClient* se_client =
+      tensorflow::down_cast<PjRtStreamExecutorClient*>(client);
+#if GOOGLE_CUDA
+  auto gpu_compiler = gpu::NVPTXCompiler();
+#elif TENSORFLOW_USE_ROCM
+  auto gpu_compiler = gpu::AMDGPUCompiler();
+#endif
+  gpu::GpuTargetConfig gpu_target_config = gpu_compiler.GetGpuTargetConfig(
+      se_client->client()->backend().default_stream_executor());
+  return AotCompile(options, computation, gpu_target_config);
+#endif
+  return absl::InternalError(
+      "GPU AOT compilation requires the target to be built with CUDA or "
+      "ROCm.");
 }
 
 absl::StatusOr<std::unique_ptr<PjRtExecutable>>
@@ -171,22 +178,17 @@ StreamExecutorGpuCompiler::Compile(CompileOptions options,
                                    mlir::ModuleOp module,
                                    const PjRtTopologyDescription& topology,
                                    PjRtClient* client) {
-  if (client == nullptr && gpu_target_config_ != std::nullopt) {
 #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
-    XlaComputation xla_computation;
-    TF_RETURN_IF_ERROR(MlirToXlaComputation(
-        module, xla_computation,
-        /*use_tuple_args=*/options.parameter_is_tupled_arguments,
-        /*return_tuple=*/false));
-    return AotCompile(options, xla_computation, *gpu_target_config_);
+  XlaComputation xla_computation;
+  TF_RETURN_IF_ERROR(MlirToXlaComputation(
+      module, xla_computation,
+      /*use_tuple_args=*/options.parameter_is_tupled_arguments,
+      /*return_tuple=*/false));
+  return Compile(options, xla_computation, topology, client);
 #endif
-    return absl::InternalError(
-        "GPU AOT compilation requires the target to be built with CUDA or "
-        "ROCm.");
-  }
-  // TODO(b/296466237): Remove client dependency.
-  TF_RETURN_IF_ERROR(IsValidTopologyAndClientForCompile(topology, client));
-  return client->Compile(module, options);
+  return absl::InternalError(
+      "GPU AOT compilation requires the target to be built with CUDA or "
+      "ROCm.");
 }
 
 REGISTER_MODULE_INITIALIZER(pjrt_register_se_gpu_compiler, {
diff --git a/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_compiler.h b/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_compiler.h
index fe3a67eb5849ef..981748429fa48e 100644
--- a/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_compiler.h
+++ b/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_compiler.h
@@ -32,8 +32,8 @@ namespace xla {
 class StreamExecutorGpuCompiler : public PjRtCompiler {
  public:
   // If `gpu_target_config` is nullopt, the compiler has to compile with device,
-  // i.e. calling of `Compile` should depend on the passed-in client's
-  // compilation functionality.
+  // i.e. calling of `Compile` should depend on the passed-in client's runtime
+  // device information.
   explicit StreamExecutorGpuCompiler(const std::optional<gpu::GpuTargetConfig>
                                          gpu_target_config = std::nullopt)
       : gpu_target_config_(gpu_target_config) {}
diff --git a/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_compiler_aot_test.cc b/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_compiler_aot_test.cc
index 42f28daf377447..07960a19dc4917 100644
--- a/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_compiler_aot_test.cc
+++ b/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_compiler_aot_test.cc
@@ -37,8 +37,10 @@ limitations under the License.
 #include "xla/pjrt/gpu/se_gpu_pjrt_client.h"
 #include "xla/pjrt/gpu/se_gpu_pjrt_compiler.h"
 #include "xla/pjrt/pjrt_client.h"
+#include "xla/pjrt/pjrt_compiler.h"
 #include "xla/pjrt/pjrt_executable.h"
 #include "xla/service/gpu/gpu_target_config.h"
+#include "xla/service/gpu/nvptx_compiler.h"
 #include "xla/service/hlo_parser.h"
 #include "xla/tests/literal_test_util.h"
 #include "tsl/platform/casts.h"
@@ -61,31 +63,6 @@ constexpr absl::string_view mlir_str = R"mlir(
     }
   })mlir";
 
-constexpr absl::string_view kGpuTargetConfig =
-    R"pb(
-  gpu_device_info {
-    threads_per_block_limit: 1024
-    threads_per_warp: 32
-    shared_memory_per_block: 49152
-    shared_memory_per_core: 65536
-    threads_per_core_limit: 2048
-    core_count: 56
-    fpus_per_core: 64
-    block_dim_limit_x: 2147483647
-    block_dim_limit_y: 65535
-    block_dim_limit_z: 65535
-    memory_bandwidth: 732160000000
-    l2_cache_size: 4194304
-    clock_rate_ghz: 1.4805
-    device_memory_size: 17066622976
-    shared_memory_per_block_optin: 49152
-    cuda_compute_capability { major: 6 }
-  }
-  platform_name: "CUDA"
-  dnn_version_info {}
-  device_description_str: "sm_6.0 with 17071734784B RAM, 56 cores, 1480500KHz clock, 715000KHz mem clock, 4194304B L2$"
-    )pb";
-
 absl::StatusOr<xla::XlaComputation> GetXlaComputation(
     absl::string_view program) {
   TF_ASSIGN_OR_RETURN(auto hlo_module,
@@ -105,36 +82,29 @@ void ValidateResult(
       LiteralTestUtil::Equal(LiteralUtil::CreateR0(2), *result_literal));
 }
 
-absl::StatusOr<gpu::GpuTargetConfig> GetGpuTargetConfig() {
-  stream_executor::GpuTargetConfigProto gpu_target_config_proto;
-  if (!proto2::TextFormat::ParseFromString(kGpuTargetConfig,
-                                           &gpu_target_config_proto)) {
-    return absl::InvalidArgumentError("Failed to parse GpuTargetConfigProto");
-  }
-  return gpu::GpuTargetConfig(gpu_target_config_proto);
-}
-
 TEST(StreamExecutorGpuCompilerTest, SuccessAotCompileMlirAndLoad) {
-  ASSERT_OK_AND_ASSIGN(const gpu::GpuTargetConfig gpu_config,
-                       GetGpuTargetConfig());
-  CompileOptions options = xla::CompileOptions();
-  StreamExecutorGpuCompiler compiler(gpu_config);
-
   TF_ASSERT_OK_AND_ASSIGN(
       auto client, GetStreamExecutorGpuClient(true, /*allocator_config=*/{},
                                               /*node_id=*/0));
   auto se_client = absl::WrapUnique(
       tensorflow::down_cast<StreamExecutorGpuClient*>(client.release()));
+  auto gpu_compiler = gpu::NVPTXCompiler();
+  gpu::GpuTargetConfig gpu_target_config = gpu_compiler.GetGpuTargetConfig(
+      se_client->client()->backend().default_stream_executor());
+  StreamExecutorGpuCompiler compiler(gpu_target_config);
+
   mlir::MLIRContext context;
   context.loadDialect<mlir::mhlo::MhloDialect, mlir::func::FuncDialect>();
   auto mlir_module =
       mlir::parseSourceString<mlir::ModuleOp>(mlir_str, &context);
   TF_ASSERT_OK_AND_ASSIGN(auto topology, se_client->GetTopologyDescription());
   TF_ASSERT_OK_AND_ASSIGN(
-      auto executable, compiler.Compile(xla::CompileOptions(),
-                                        mlir_module.get(), *topology, nullptr));
-  TF_ASSERT_OK_AND_ASSIGN(auto loaded_executable,
-                          se_client->Load(std::move(executable)));
+      auto executable,
+      compiler.Compile(xla::CompileOptions(), mlir_module.get(), *topology,
+                       /*client=*/nullptr));
+  TF_ASSERT_OK_AND_ASSIGN(
+      auto loaded_executable,
+      se_client->Load(std::move(executable), LoadOptions()));
 
   TF_ASSERT_OK_AND_ASSIGN(
       auto result, loaded_executable->Execute(/*argument_handles=*/{{}}, {}));
@@ -142,59 +112,82 @@ TEST(StreamExecutorGpuCompilerTest, SuccessAotCompileMlirAndLoad) {
 }
 
 TEST(StreamExecutorGpuCompilerTest, SuccessAotCompileXlaAndLoad) {
-  ASSERT_OK_AND_ASSIGN(const gpu::GpuTargetConfig gpu_config,
-                       GetGpuTargetConfig());
-  CompileOptions options = xla::CompileOptions();
-  StreamExecutorGpuCompiler compiler(gpu_config);
-
   TF_ASSERT_OK_AND_ASSIGN(
       auto client, GetStreamExecutorGpuClient(true, /*allocator_config=*/{},
                                               /*node_id=*/0));
   auto se_client = absl::WrapUnique(
       tensorflow::down_cast<StreamExecutorGpuClient*>(client.release()));
+  auto gpu_compiler = gpu::NVPTXCompiler();
+  gpu::GpuTargetConfig gpu_target_config = gpu_compiler.GetGpuTargetConfig(
+      se_client->client()->backend().default_stream_executor());
+  StreamExecutorGpuCompiler compiler(gpu_target_config);
 
   TF_ASSERT_OK_AND_ASSIGN(auto computation, GetXlaComputation(kProgram));
   TF_ASSERT_OK_AND_ASSIGN(auto topology, se_client->GetTopologyDescription());
+  TF_ASSERT_OK_AND_ASSIGN(auto executable,
+                          compiler.Compile(xla::CompileOptions(), computation,
+                                           *topology, /*client=*/nullptr));
   TF_ASSERT_OK_AND_ASSIGN(
-      auto executable,
-      compiler.Compile(xla::CompileOptions(), computation, *topology, nullptr));
-  TF_ASSERT_OK_AND_ASSIGN(auto loaded_executable,
-                          se_client->Load(std::move(executable)));
+      auto loaded_executable,
+      se_client->Load(std::move(executable), LoadOptions()));
   TF_ASSERT_OK_AND_ASSIGN(
       auto result, loaded_executable->Execute(/*argument_handles=*/{{}}, {}));
   ValidateResult(result);
 }
 
 TEST(StreamExecutorGpuCompilerTest, SuccessLoadFromSerializedExecutable) {
-  ASSERT_OK_AND_ASSIGN(const gpu::GpuTargetConfig gpu_config,
-                       GetGpuTargetConfig());
-  CompileOptions options = xla::CompileOptions();
-  StreamExecutorGpuCompiler compiler(gpu_config);
-
   TF_ASSERT_OK_AND_ASSIGN(
       auto client, GetStreamExecutorGpuClient(true, /*allocator_config=*/{},
                                               /*node_id=*/0));
   auto se_client = absl::WrapUnique(
       tensorflow::down_cast<StreamExecutorGpuClient*>(client.release()));
+  auto gpu_compiler = gpu::NVPTXCompiler();
+  gpu::GpuTargetConfig gpu_target_config = gpu_compiler.GetGpuTargetConfig(
+      se_client->client()->backend().default_stream_executor());
+  StreamExecutorGpuCompiler compiler(gpu_target_config);
 
   TF_ASSERT_OK_AND_ASSIGN(auto computation, GetXlaComputation(kProgram));
   TF_ASSERT_OK_AND_ASSIGN(auto topology, se_client->GetTopologyDescription());
-  TF_ASSERT_OK_AND_ASSIGN(
-      auto executable,
-      compiler.Compile(xla::CompileOptions(), computation, *topology, nullptr));
+  TF_ASSERT_OK_AND_ASSIGN(auto executable,
+                          compiler.Compile(xla::CompileOptions(), computation,
+                                           *topology, /*client=*/nullptr));
 
   // Serialize the executable and load it.
   TF_ASSERT_OK_AND_ASSIGN(std::string serialized_executable,
                           executable->SerializeExecutable());
   TF_ASSERT_OK_AND_ASSIGN(
       auto loaded_executable,
-      se_client->LoadSerialized(serialized_executable, std::nullopt,
-                                LoadOptions()));
+      se_client->LoadSerializedExecutable(serialized_executable, std::nullopt,
+                                          LoadOptions()));
 
   TF_ASSERT_OK_AND_ASSIGN(
       auto result, loaded_executable->Execute(/*argument_handles=*/{{}}, {}));
   ValidateResult(result);
 }
 
+TEST(StreamExecutorGpuCompilerTest, SuccessCrossCompileAndLoad) {
+  CompileOptions options = xla::CompileOptions();
+  TF_ASSERT_OK_AND_ASSIGN(XlaComputation computation,
+                          GetXlaComputation(kProgram));
+
+  TF_ASSERT_OK_AND_ASSIGN(
+      auto client, GetStreamExecutorGpuClient(true, /*allocator_config=*/{},
+                                              /*node_id=*/0));
+  TF_ASSERT_OK_AND_ASSIGN(auto topology, client->GetTopologyDescription());
+
+  TF_ASSERT_OK_AND_ASSIGN(
+      auto executable,
+      xla::PjRtCompile(options, computation, *topology, client.get()));
+  TF_ASSERT_OK_AND_ASSIGN(std::string serialized,
+                          executable->SerializeExecutable());
+
+  TF_ASSERT_OK_AND_ASSIGN(auto loaded_executable,
+                          client->LoadSerializedExecutable(
+                              serialized, std::nullopt, LoadOptions()));
+  TF_ASSERT_OK_AND_ASSIGN(
+      auto result, loaded_executable->Execute(/*argument_handles=*/{{}}, {}));
+  ValidateResult(result);
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_compiler_test.cc b/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_compiler_test.cc
index 94c197cdeda9cb..e09b2a5014dd6a 100644
--- a/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_compiler_test.cc
+++ b/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_compiler_test.cc
@@ -57,17 +57,6 @@ absl::StatusOr<xla::XlaComputation> GetXlaComputation(
   return XlaComputation(hlo_module->ToProto());
 }
 
-TEST(StreamExecutorGpuCompilerTest, NoClientXla) {
-  StreamExecutorGpuCompiler compiler;
-  StreamExecutorGpuTopologyDescription topology(CudaId(), CudaName(),
-                                                "Fake_device", {0, 1});
-
-  TF_ASSERT_OK_AND_ASSIGN(auto computation, GetXlaComputation(kProgram));
-  EXPECT_THAT(compiler.Compile(xla::CompileOptions(), computation, topology,
-                               /*client=*/nullptr),
-              StatusIs(absl::StatusCode::kUnimplemented));
-}
-
 TEST(StreamExecutorGpuCompilerTest, TopologyNotSameXla) {
   StreamExecutorGpuCompiler compiler;
   StreamExecutorGpuTopologyDescription topology(CudaId(), CudaName(),
diff --git a/third_party/xla/xla/service/BUILD b/third_party/xla/xla/service/BUILD
index 4aa33f467ba7a0..856737f086c7b4 100644
--- a/third_party/xla/xla/service/BUILD
+++ b/third_party/xla/xla/service/BUILD
@@ -1167,6 +1167,7 @@ cc_library(
         ":hlo_execution_profile",
         ":hlo_module_config",
         ":hlo_module_util",
+        ":local_service_utils",
         ":platform_util",
         ":service",
         ":shaped_buffer",
@@ -1187,6 +1188,31 @@ cc_library(
         "@com_google_absl//absl/strings:str_format",
         "@com_google_absl//absl/types:span",
         "@local_tsl//tsl/platform:logging",
+        "@local_tsl//tsl/platform:statusor",
+    ],
+)
+
+cc_library(
+    name = "local_service_utils",
+    srcs = ["local_service_utils.cc"],
+    hdrs = ["local_service_utils.h"],
+    visibility = ["//visibility:public"],
+    deps = [
+        ":backend",
+        ":hlo_module_config",
+        ":hlo_module_util",
+        ":service",
+        "//xla:shape_util",
+        "//xla:status_macros",
+        "//xla:statusor",
+        "//xla:util",
+        "//xla/client:executable_build_options",
+        "//xla/client:xla_computation",
+        "//xla/hlo/ir:hlo",
+        "@com_google_absl//absl/log:check",
+        "@com_google_absl//absl/strings:str_format",
+        "@com_google_absl//absl/types:span",
+        "@local_tsl//tsl/platform:errors",
     ],
 )
 
diff --git a/third_party/xla/xla/service/local_service.cc b/third_party/xla/xla/service/local_service.cc
index 65abe2e6a5e857..cbd81a3915b7cd 100644
--- a/third_party/xla/xla/service/local_service.cc
+++ b/third_party/xla/xla/service/local_service.cc
@@ -33,6 +33,7 @@ limitations under the License.
 #include "xla/service/hlo_execution_profile.h"
 #include "xla/service/hlo_module_config.h"
 #include "xla/service/hlo_module_util.h"
+#include "xla/service/local_service_utils.h"
 #include "xla/service/platform_util.h"
 #include "xla/shape_layout.h"
 #include "xla/shape_util.h"
@@ -41,6 +42,7 @@ limitations under the License.
 #include "xla/types.h"
 #include "xla/util.h"
 #include "tsl/platform/logging.h"
+#include "tsl/platform/statusor.h"
 
 namespace xla {
 
@@ -68,86 +70,6 @@ LocalService::LocalService(const ServiceOptions& options,
                            std::unique_ptr<Backend> execute_backend)
     : Service(options, std::move(execute_backend)) {}
 
-namespace {
-
-// Retrieves the parameter metadata for the given computation and parameter
-// number.
-//
-// If the parameter number is invalid for this computation, nullopt is
-// returned. When the return value has_value(), nullptr will never be
-// the held value.
-std::optional<const OpMetadata*> ParameterMetadata(
-    const XlaComputation& computation, int parameter_number) {
-  for (const HloComputationProto& comp : computation.proto().computations()) {
-    if (comp.id() == computation.proto().entry_computation_id()) {
-      for (const HloInstructionProto& instr : comp.instructions()) {
-        if (instr.opcode() == HloOpcodeString(HloOpcode::kParameter) &&
-            instr.parameter_number() == parameter_number) {
-          if (!instr.has_metadata()) {
-            return std::nullopt;
-          }
-          return &instr.metadata();
-        }
-      }
-    }
-  }
-  return std::nullopt;
-}
-
-}  // namespace
-
-StatusOr<std::unique_ptr<HloModuleConfig>> LocalService::GetHloModuleConfig(
-    const XlaComputation& computation,
-    const absl::Span<const Shape* const> argument_layouts,
-    const ExecutableBuildOptions& build_options) {
-  const HloModuleProto& proto = computation.proto();
-  TF_RET_CHECK(proto.has_host_program_shape());
-  ProgramShape program_shape(proto.host_program_shape());
-
-  // Validate incoming layouts.
-  if (argument_layouts.size() != program_shape.parameters_size()) {
-    return InvalidArgument(
-        "Invalid number of arguments for computation: expected %d, got %u.",
-        program_shape.parameters_size(), argument_layouts.size());
-  }
-
-  for (int i = 0; i < argument_layouts.size(); ++i) {
-    const Shape& argument_shape = *argument_layouts[i];
-    TF_RETURN_IF_ERROR(
-        ShapeUtil::ValidateShapeWithOptionalLayout(argument_shape));
-    if (!ShapeUtil::Compatible(argument_shape, program_shape.parameters(i))) {
-      std::optional<const OpMetadata*> metadata =
-          ParameterMetadata(computation, /*parameter_number=*/i);
-      auto metadata_string = [&metadata]() -> std::string {
-        if (!metadata.has_value()) {
-          return "";
-        }
-        CHECK(metadata.value() != nullptr);
-        const OpMetadata& m = *metadata.value();
-        if (!m.source_file().empty()) {
-          return absl::StrFormat(" (%s:%d)", m.source_file(), m.source_line());
-        }
-        return "";
-      };
-      return InvalidArgument(
-          "Invalid argument shape for argument %d%s, expected %s, got %s.", i,
-          metadata_string(),
-          ShapeUtil::HumanString(program_shape.parameters(i)),
-          ShapeUtil::HumanString(argument_shape));
-    }
-  }
-  if (build_options.result_layout() != nullptr) {
-    TF_RETURN_IF_ERROR(ValidateResultShape(*build_options.result_layout(),
-                                           program_shape.result()));
-  }
-
-  ExecutionOptions execution_options =
-      CreateExecutionOptions(build_options, &program_shape);
-
-  return CreateModuleConfig(program_shape, argument_layouts,
-                            &execution_options);
-}
-
 StatusOr<std::vector<std::unique_ptr<Executable>>>
 LocalService::CompileExecutables(
     const XlaComputation& computation,
@@ -155,7 +77,8 @@ LocalService::CompileExecutables(
     const ExecutableBuildOptions& build_options) {
   TF_ASSIGN_OR_RETURN(
       std::unique_ptr<HloModuleConfig> module_config,
-      GetHloModuleConfig(computation, argument_layouts, build_options));
+      GetHloModuleConfig(computation, argument_layouts, build_options,
+                         &options_, execute_backend_.get()));
 
   VLOG(3) << "Computation Layout: "
           << module_config->entry_computation_layout().ToString();
@@ -205,7 +128,8 @@ LocalService::CompileAotResults(
     const ExecutableBuildOptions& build_options) {
   TF_ASSIGN_OR_RETURN(
       std::unique_ptr<HloModuleConfig> module_config,
-      GetHloModuleConfig(computation, argument_layouts, build_options));
+      GetHloModuleConfig(computation, argument_layouts, build_options,
+                         &options_, execute_backend_.get()));
 
   TF_ASSIGN_OR_RETURN(
       se::StreamExecutor * executor,
diff --git a/third_party/xla/xla/service/local_service.h b/third_party/xla/xla/service/local_service.h
index b549d05fee0bdf..4bea89ef1f5876 100644
--- a/third_party/xla/xla/service/local_service.h
+++ b/third_party/xla/xla/service/local_service.h
@@ -83,13 +83,6 @@ class LocalService : public Service {
                         std::unique_ptr<Backend> backend);
   LocalService(const LocalService&) = delete;
   void operator=(const LocalService&) = delete;
-
-  // Validates the computation argument layouts, and returns the corresponding
-  // HloModuleConfig.
-  StatusOr<std::unique_ptr<HloModuleConfig>> GetHloModuleConfig(
-      const XlaComputation& computation,
-      const absl::Span<const Shape* const> argument_layouts,
-      const ExecutableBuildOptions& build_options);
 };
 
 }  // namespace xla
diff --git a/third_party/xla/xla/service/local_service_utils.cc b/third_party/xla/xla/service/local_service_utils.cc
new file mode 100644
index 00000000000000..e0b2d81da3ace5
--- /dev/null
+++ b/third_party/xla/xla/service/local_service_utils.cc
@@ -0,0 +1,125 @@
+/* Copyright 2023 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "xla/service/local_service_utils.h"
+
+#include <memory>
+#include <optional>
+#include <string>
+
+#include "absl/log/check.h"
+#include "absl/strings/str_format.h"
+#include "absl/types/span.h"
+#include "xla/client/executable_build_options.h"
+#include "xla/client/xla_computation.h"
+#include "xla/hlo/ir/hlo_opcode.h"
+#include "xla/service/backend.h"
+#include "xla/service/hlo_module_config.h"
+#include "xla/service/hlo_module_util.h"
+#include "xla/service/service.h"
+#include "xla/shape.h"
+#include "xla/shape_util.h"
+#include "xla/status_macros.h"
+#include "xla/statusor.h"
+#include "xla/util.h"
+#include "tsl/platform/errors.h"
+
+namespace xla {
+namespace {
+// Retrieves the parameter metadata for the given computation and parameter
+// number.
+//
+// Returns nullopt if the entry computation has no parameter with this number,
+// or if that parameter carries no metadata. When the return value
+// has_value(), the held pointer is never nullptr.
+std::optional<const OpMetadata*> ParameterMetadata(
+    const XlaComputation& computation, int parameter_number) {
+  for (const HloComputationProto& comp : computation.proto().computations()) {
+    if (comp.id() == computation.proto().entry_computation_id()) {
+      for (const HloInstructionProto& instr : comp.instructions()) {
+        if (instr.opcode() == HloOpcodeString(HloOpcode::kParameter) &&
+            instr.parameter_number() == parameter_number) {
+          if (!instr.has_metadata()) {
+            return std::nullopt;
+          }
+          return &instr.metadata();
+        }
+      }
+    }
+  }
+  return std::nullopt;
+}
+}  // namespace
+
+StatusOr<std::unique_ptr<HloModuleConfig>> GetHloModuleConfig(
+    const XlaComputation& computation,
+    absl::Span<const Shape* const> argument_layouts,
+    const ExecutableBuildOptions& build_options, ServiceOptions* options,
+    Backend* backend) {
+  const HloModuleProto& proto = computation.proto();
+  TF_RET_CHECK(proto.has_host_program_shape());
+  ProgramShape program_shape(proto.host_program_shape());
+
+  // Validate incoming layouts.
+  if (argument_layouts.size() != program_shape.parameters_size()) {
+    return InvalidArgument(
+        "Invalid number of arguments for computation: expected %d, got %u.",
+        program_shape.parameters_size(), argument_layouts.size());
+  }
+
+  for (int i = 0; i < argument_layouts.size(); ++i) {
+    const Shape& argument_shape = *argument_layouts[i];
+    TF_RETURN_IF_ERROR(
+        ShapeUtil::ValidateShapeWithOptionalLayout(argument_shape));
+    if (!ShapeUtil::Compatible(argument_shape, program_shape.parameters(i))) {
+      std::optional<const OpMetadata*> metadata =
+          ParameterMetadata(computation, /*parameter_number=*/i);
+      auto metadata_string = [&metadata]() -> std::string {
+        if (!metadata.has_value()) {
+          return "";
+        }
+        CHECK(metadata.value() != nullptr);
+        const OpMetadata& m = *metadata.value();
+        if (!m.source_file().empty()) {
+          return absl::StrFormat(" (%s:%d)", m.source_file(), m.source_line());
+        }
+        return "";
+      };
+      return InvalidArgument(
+          "Invalid argument shape for argument %d%s, expected %s, got %s.", i,
+          metadata_string(),
+          ShapeUtil::HumanString(program_shape.parameters(i)),
+          ShapeUtil::HumanString(argument_shape));
+    }
+  }
+  if (build_options.result_layout() != nullptr) {
+    TF_RETURN_IF_ERROR(Service::ValidateResultShape(
+        *build_options.result_layout(), program_shape.result()));
+  }
+
+  ExecutionOptions execution_options =
+      CreateExecutionOptions(build_options, &program_shape);
+
+  int default_num_replicas =
+      options == nullptr ? 1 : options->number_of_replicas();
+  std::optional<int> num_threads;
+  if (backend != nullptr && backend->eigen_intra_op_thread_pool() != nullptr) {
+    num_threads = backend->eigen_intra_op_thread_pool()->NumThreads();
+  }
+  return xla::CreateModuleConfig(program_shape, argument_layouts,
+                                 &execution_options, default_num_replicas,
+                                 num_threads);
+}
+}  // namespace xla
diff --git a/third_party/xla/xla/service/local_service_utils.h b/third_party/xla/xla/service/local_service_utils.h
new file mode 100644
index 00000000000000..e71f4605aec2e8
--- /dev/null
+++ b/third_party/xla/xla/service/local_service_utils.h
@@ -0,0 +1,38 @@
+/* Copyright 2023 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef XLA_SERVICE_LOCAL_SERVICE_UTILS_H_
+#define XLA_SERVICE_LOCAL_SERVICE_UTILS_H_
+
+#include <memory>
+
+#include "absl/types/span.h"
+#include "xla/client/executable_build_options.h"
+#include "xla/client/xla_computation.h"
+#include "xla/hlo/ir/hlo_module.h"
+#include "xla/service/backend.h"
+#include "xla/service/service.h"
+
+namespace xla {
+// Validates the computation argument layouts, and returns the corresponding
+// HloModuleConfig.
+StatusOr<std::unique_ptr<HloModuleConfig>> GetHloModuleConfig(
+    const XlaComputation& computation,
+    absl::Span<const Shape* const> argument_layouts,
+    const ExecutableBuildOptions& build_options,
+    ServiceOptions* options = nullptr, Backend* backend = nullptr);
+}  // namespace xla
+
+#endif  // XLA_SERVICE_LOCAL_SERVICE_UTILS_H_
diff --git a/third_party/xla/xla/service/service.cc b/third_party/xla/xla/service/service.cc
index 5358270fa35992..8a25b21fa9de13 100644
--- a/third_party/xla/xla/service/service.cc
+++ b/third_party/xla/xla/service/service.cc
@@ -215,7 +215,7 @@ Status Service::DeconstructTuple(const DeconstructTupleRequest* arg,
 }
 
 Status Service::ValidateResultShape(const Shape& client_shape,
-                                    const Shape& result_shape) const {
+                                    const Shape& result_shape) {
   TF_RETURN_IF_ERROR(ShapeUtil::ValidateShapeWithOptionalLayout(client_shape));
   if (!ShapeUtil::Compatible(client_shape, result_shape)) {
     return InvalidArgument(
diff --git a/third_party/xla/xla/service/service.h b/third_party/xla/xla/service/service.h
index 75aca964f14d33..81abbba520b566 100644
--- a/third_party/xla/xla/service/service.h
+++ b/third_party/xla/xla/service/service.h
@@ -190,6 +190,12 @@ class Service : public ServiceInterface {
       const ExecutionOptions* execution_options,
       const AotCompilationOptions* aot_options = nullptr);
 
+  // Convenience function which checks whether the given client_shape
+  // (presumably passed by the client to set the result layout) is valid for the
+  // given computation result shape.
+  static Status ValidateResultShape(const Shape& client_shape,
+                                    const Shape& result_shape);
+
  private:
   // A private overload for Service itself, used by other methods within this
   // class.
@@ -273,12 +279,6 @@ class Service : public ServiceInterface {
       Backend* backend, absl::Span<const DeviceHandle> device_handles,
       absl::Span<const std::string> result_tags, ExecutionProfile* profile);
 
-  // Convenience function which checks whether the given client_shape
-  // (presumably passed by the client to set the result layout) is valid for the
-  // given computation result shape.
-  Status ValidateResultShape(const Shape& client_shape,
-                             const Shape& result_shape) const;
-
   // Returns the stream executors assigned to the replicas represented by the
   // given device handle. Each device_handle is a virtual replicated device that
   // represents a set of physical devices for the replicas.

From cedad1c00b75d563a8c6a0792309287ad26c07ce Mon Sep 17 00:00:00 2001
From: Fiona Lang <flang@google.com>
Date: Fri, 29 Sep 2023 11:40:00 -0700
Subject: [PATCH 431/567] Move tpu import from python/__init__.py to
 python/modules_with_exports.py.

PiperOrigin-RevId: 569547232
---
 tensorflow/compiler/jit/ops/BUILD                         | 4 ++++
 tensorflow/python/BUILD                                   | 1 +
 tensorflow/python/__init__.py                             | 1 -
 tensorflow/python/modules_with_exports.py                 | 3 +++
 tensorflow/python/ops/BUILD                               | 8 ++++++++
 tensorflow/python/ops/gradients_impl.py                   | 7 +++++++
 tensorflow/python/ops/state_ops.py                        | 1 +
 .../saved_model/registration/tf_registration_test.py      | 3 +++
 tensorflow/python/tools/api/generator/doc_srcs.py         | 8 ++++++--
 tensorflow/python/tpu/ops/tpu_ops.py                      | 3 +++
 tensorflow/python/tpu/tpu.py                              | 2 --
 11 files changed, 36 insertions(+), 5 deletions(-)

diff --git a/tensorflow/compiler/jit/ops/BUILD b/tensorflow/compiler/jit/ops/BUILD
index ab93b126fa3371..08798aa9f307a8 100644
--- a/tensorflow/compiler/jit/ops/BUILD
+++ b/tensorflow/compiler/jit/ops/BUILD
@@ -36,5 +36,9 @@ py_strict_library(
     name = "xla_ops_grad",
     srcs = ["xla_ops_grad.py"],
     srcs_version = "PY3",
+    visibility = [
+        "//tensorflow/compiler/tf2xla:internal",
+        "//tensorflow/python/ops:__pkg__",
+    ],
     deps = ["//tensorflow/python/framework:ops"],
 )
diff --git a/tensorflow/python/BUILD b/tensorflow/python/BUILD
index 7ad991d1d5b87b..69f6e5aac1d250 100644
--- a/tensorflow/python/BUILD
+++ b/tensorflow/python/BUILD
@@ -436,6 +436,7 @@ py_strict_library(
         "//tensorflow/python/profiler:trace",
         "//tensorflow/python/saved_model",
         "//tensorflow/python/summary:summary_py",
+        "//tensorflow/python/tpu:tpu_noestimator",
         "//tensorflow/python/training",
         "//tensorflow/python/training:quantize_training",
         "//tensorflow/python/user_ops:ops",
diff --git a/tensorflow/python/__init__.py b/tensorflow/python/__init__.py
index 082cf4e7b175e4..88caa3ee581420 100644
--- a/tensorflow/python/__init__.py
+++ b/tensorflow/python/__init__.py
@@ -29,7 +29,6 @@
 
 # from tensorflow.python import keras
 # from tensorflow.python.layers import layers
-from tensorflow.python.tpu import api
 
 # Special dunders that we choose to export:
 _exported_dunders = set([
diff --git a/tensorflow/python/modules_with_exports.py b/tensorflow/python/modules_with_exports.py
index 77bb4573e761c0..4b89ea887e80e0 100644
--- a/tensorflow/python/modules_with_exports.py
+++ b/tensorflow/python/modules_with_exports.py
@@ -172,6 +172,9 @@
 # Summary
 from tensorflow.python.summary import summary
 
+# TPU
+from tensorflow.python.tpu import api
+
 # Training
 from tensorflow.python.training import training as train
 from tensorflow.python.training import quantize_training as _quantize_training
diff --git a/tensorflow/python/ops/BUILD b/tensorflow/python/ops/BUILD
index dc013d882788ad..5de6f151ebd042 100644
--- a/tensorflow/python/ops/BUILD
+++ b/tensorflow/python/ops/BUILD
@@ -1622,6 +1622,7 @@ py_strict_library(
         ":array_ops",
         ":check_ops",
         ":control_flow_grad",
+        ":cudnn_rnn_grad",
         ":gradients_util",
         ":image_grad",
         ":linalg_grad",
@@ -1630,13 +1631,19 @@ py_strict_library(
         ":manip_grad",
         ":math_grad",
         ":math_ops",
+        ":nccl_ops",
         ":optional_grad",
+        ":proto_ops",
         ":random_grad",
         ":rnn_grad",
         ":sdca_ops",
+        ":sets",
+        ":sparse_grad",
+        ":tensor_array_grad",
         ":tensor_array_ops",
         ":unconnected_gradients",
         ":while_loop",
+        "//tensorflow/compiler/jit/ops:xla_ops_grad",
         "//tensorflow/python/framework:dtypes",
         "//tensorflow/python/framework:ops",
         "//tensorflow/python/ops/linalg/sparse:sparse_csr_matrix_grad",
@@ -2921,6 +2928,7 @@ py_strict_library(
         ":array_ops",
         ":math_ops_gen",
         ":resource_variable_ops_gen",
+        ":state_grad",
         ":state_ops_gen",
         "//tensorflow/python/framework:ops",
         "//tensorflow/python/framework:tensor_shape",
diff --git a/tensorflow/python/ops/gradients_impl.py b/tensorflow/python/ops/gradients_impl.py
index 5a69feae861089..b7d2e774235474 100644
--- a/tensorflow/python/ops/gradients_impl.py
+++ b/tensorflow/python/ops/gradients_impl.py
@@ -14,12 +14,14 @@
 # ==============================================================================
 """Implements the graph generation for computation of gradients."""
 
+from tensorflow.compiler.jit.ops import xla_ops_grad  # pylint: disable=unused-import
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_grad  # pylint: disable=unused-import
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import check_ops  # pylint: disable=unused-import
 from tensorflow.python.ops import control_flow_grad  # pylint: disable=unused-import
+from tensorflow.python.ops import cudnn_rnn_grad  # pylint: disable=unused-import
 from tensorflow.python.ops import gradients_util
 from tensorflow.python.ops import image_grad  # pylint: disable=unused-import
 from tensorflow.python.ops import linalg_grad  # pylint: disable=unused-import
@@ -28,10 +30,15 @@
 from tensorflow.python.ops import manip_grad  # pylint: disable=unused-import
 from tensorflow.python.ops import math_grad  # pylint: disable=unused-import
 from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import nccl_ops  # pylint: disable=unused-import
 from tensorflow.python.ops import optional_grad  # pylint: disable=unused-import
+from tensorflow.python.ops import proto_ops  # pylint: disable=unused-import
 from tensorflow.python.ops import random_grad  # pylint: disable=unused-import
 from tensorflow.python.ops import rnn_grad  # pylint: disable=unused-import
 from tensorflow.python.ops import sdca_ops  # pylint: disable=unused-import
+from tensorflow.python.ops import sets  # pylint: disable=unused-import
+from tensorflow.python.ops import sparse_grad  # pylint: disable=unused-import
+from tensorflow.python.ops import tensor_array_grad  # pylint: disable=unused-import
 from tensorflow.python.ops import tensor_array_ops
 from tensorflow.python.ops import while_loop
 from tensorflow.python.ops.linalg.sparse import sparse_csr_matrix_grad  # pylint: disable=unused-import
diff --git a/tensorflow/python/ops/state_ops.py b/tensorflow/python/ops/state_ops.py
index d4cfb23145ca86..b80487cd78c169 100644
--- a/tensorflow/python/ops/state_ops.py
+++ b/tensorflow/python/ops/state_ops.py
@@ -24,6 +24,7 @@
 from tensorflow.python.ops import gen_math_ops
 from tensorflow.python.ops import gen_resource_variable_ops
 from tensorflow.python.ops import gen_state_ops
+from tensorflow.python.ops import state_grad  # pylint: disable=unused-import
 # go/tf-wildcard-import
 # pylint: disable=wildcard-import
 from tensorflow.python.ops.gen_state_ops import *
diff --git a/tensorflow/python/saved_model/registration/tf_registration_test.py b/tensorflow/python/saved_model/registration/tf_registration_test.py
index 68619f089ed9d8..7a162e8b3f26e5 100644
--- a/tensorflow/python/saved_model/registration/tf_registration_test.py
+++ b/tensorflow/python/saved_model/registration/tf_registration_test.py
@@ -31,6 +31,9 @@
 CHECKPOINT_SAVER_ALLOWLIST = os.path.join(resource_loader.get_data_files_path(),
                                           "tf_checkpoint_saver_allowlist.txt")
 
+# Access TPUEmbedding to load the file and run its registration.
+_ = tf.tpu.experimental.embedding.TPUEmbedding
+
 
 @registration.register_tf_serializable()
 class NotInAllowlistExample(tf.Variable):
diff --git a/tensorflow/python/tools/api/generator/doc_srcs.py b/tensorflow/python/tools/api/generator/doc_srcs.py
index acbd65afc0ca15..56ab8a1b221fdd 100644
--- a/tensorflow/python/tools/api/generator/doc_srcs.py
+++ b/tensorflow/python/tools/api/generator/doc_srcs.py
@@ -61,8 +61,12 @@ def __init__(self, docstring=None, docstring_module_name=None):
         DocSource(docstring_module_name='ops.linalg_ops'),
     'logging':
         DocSource(docstring_module_name='ops.logging_ops'),
-    'losses':
-        DocSource(docstring_module_name='ops.losses.losses'),
+    'losses': DocSource(
+        docstring=(
+            'Loss operations for use in neural networks. Note: All the losses'
+            ' are added to the `GraphKeys.LOSSES` collection by default.'
+        )
+    ),
     'manip':
         DocSource(docstring_module_name='ops.manip_ops'),
     'math':
diff --git a/tensorflow/python/tpu/ops/tpu_ops.py b/tensorflow/python/tpu/ops/tpu_ops.py
index 629f5269e1a927..d54cb5a68afbb8 100644
--- a/tensorflow/python/tpu/ops/tpu_ops.py
+++ b/tensorflow/python/tpu/ops/tpu_ops.py
@@ -26,6 +26,9 @@
 from tensorflow.python.util.tf_export import tf_export
 
 
+ops.NotDifferentiable("TPUReplicatedInput")
+
+
 def _create_default_group_assignment():
   num_shards = tpu_function.get_tpu_context().number_of_shards
   if num_shards is None:
diff --git a/tensorflow/python/tpu/tpu.py b/tensorflow/python/tpu/tpu.py
index 2daee6c2a96cbc..cc830a68dc427d 100644
--- a/tensorflow/python/tpu/tpu.py
+++ b/tensorflow/python/tpu/tpu.py
@@ -59,8 +59,6 @@
 from tensorflow.python.util.tf_export import tf_export
 
 
-ops.NotDifferentiable("TPUReplicatedInput")
-
 # Ops which can be safely pruned from XLA compile if they have no consumers.
 #  These ops should also have no inputs.
 _UNCONNECTED_OPS_TO_PRUNE = set(["Placeholder", "VarHandleOp"])

From ad63ca43a2dcc28ee937a2eb8d20da6ddd77087b Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 29 Sep 2023 11:46:32 -0700
Subject: [PATCH 432/567] Rename the parameters of the IsDivisible function in
 auto_sharding_util to be more intuitive.

PiperOrigin-RevId: 569548788
---
 .../xla/hlo/experimental/auto_sharding/auto_sharding_util.cc  | 4 ++--
 .../xla/hlo/experimental/auto_sharding/auto_sharding_util.h   | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/third_party/xla/xla/hlo/experimental/auto_sharding/auto_sharding_util.cc b/third_party/xla/xla/hlo/experimental/auto_sharding/auto_sharding_util.cc
index 100113745a49c1..05cc51657dcc96 100644
--- a/third_party/xla/xla/hlo/experimental/auto_sharding/auto_sharding_util.cc
+++ b/third_party/xla/xla/hlo/experimental/auto_sharding/auto_sharding_util.cc
@@ -1625,8 +1625,8 @@ std::vector<int64_t> GetDimensionMapping(
   return mapping;
 }
 
-bool IsDivisible(int64_t denominator, int64_t numerator) {
-  return (denominator % numerator == 0);
+bool IsDivisible(int64_t numerator, int64_t denominator) {
+  return (numerator % denominator == 0);
 }
 
 std::vector<std::vector<int64_t>> GetReplicaGroupsAlongOneDimension(
diff --git a/third_party/xla/xla/hlo/experimental/auto_sharding/auto_sharding_util.h b/third_party/xla/xla/hlo/experimental/auto_sharding/auto_sharding_util.h
index 73380abf3df2c8..38926d9fa35afa 100644
--- a/third_party/xla/xla/hlo/experimental/auto_sharding/auto_sharding_util.h
+++ b/third_party/xla/xla/hlo/experimental/auto_sharding/auto_sharding_util.h
@@ -525,8 +525,8 @@ inline std::vector<const HloInstruction*> GetGradientComputationInstructions(
 std::vector<int64_t> GetDimensionMapping(
     absl::Span<const int64_t> reduced_dimensions, int64_t op_count);
 
-// Checks whether denominator is divisible by numerator.
-bool IsDivisible(int64_t denominator, int64_t numerator);
+// Checks whether numerator is divisible by denominator.
+bool IsDivisible(int64_t numerator, int64_t denominator);
 
 // Generate all replica groups along one device_mesh dimension. Device_mesh can
 // be any number of dimensions. |communication_dim| has to be one of

From 8f12131a42a2573cd4824e41a4058f31e18b13b8 Mon Sep 17 00:00:00 2001
From: Wilsin Gosti <wilsin@google.com>
Date: Fri, 29 Sep 2023 11:53:09 -0700
Subject: [PATCH 433/567] #tf-data Reduce the starting
 `max_outstanding_requests` to 16 before any element is produced to avoid OOM
 when jobs start with many tasks.

PiperOrigin-RevId: 569550401
---
 .../experimental/data_service_dataset_op.cc   | 20 ++++++++++++++-----
 1 file changed, 15 insertions(+), 5 deletions(-)

diff --git a/tensorflow/core/kernels/data/experimental/data_service_dataset_op.cc b/tensorflow/core/kernels/data/experimental/data_service_dataset_op.cc
index 15072ce1c60114..c67835391ea65b 100644
--- a/tensorflow/core/kernels/data/experimental/data_service_dataset_op.cc
+++ b/tensorflow/core/kernels/data/experimental/data_service_dataset_op.cc
@@ -16,6 +16,7 @@ limitations under the License.
 
 #include <algorithm>
 #include <atomic>
+#include <cstdint>
 #include <functional>
 #include <limits>
 #include <memory>
@@ -102,6 +103,10 @@ constexpr const char kDistributedEpoch[] = "distributed_epoch";
 
 // Default interval between task list refreshes.
 constexpr absl::Duration kDefaultTaskRefreshInterval = absl::Seconds(1);
+
+// Default starting `max_outstanding_requests` when it is autotuned.
+constexpr int64_t kStartingMaxOutstandingRequests = 16;
+
 }  // namespace
 
 // Dataset for reading data from the tf.data service.
@@ -437,11 +442,16 @@ class DataServiceDatasetOp::Dataset : public DatasetBase {
         if (element_size_cache_ == 0.0) {
           element_size_cache_ = node_->AverageBufferedElementSize();
           if (element_size_cache_ == 0) {
-            VLOG(3) << "The average element size of `DataService` is 0. "
-                       "Request to change `max_outstanding_requests` from "
-                    << max_outstanding_requests << " to "
-                    << requested_outstanding_requests << " is granted.";
-            return requested_outstanding_requests;
+            int64_t new_outstanding_requests = std::max(
+                max_outstanding_requests, kStartingMaxOutstandingRequests);
+            VLOG(3) << "The average element size of `DataService` is 0. The "
+                       "`max_outstanding_requests` value "
+                    << max_outstanding_requests
+                    << (max_outstanding_requests == new_outstanding_requests
+                            ? " is kept at "
+                            : " is changed to the default value of ")
+                    << new_outstanding_requests << ".";
+            return new_outstanding_requests;
           }
         }
         const int64_t delta_outstanding_requests =

From fa4873704576a8522f46e3bde4a79296a9afa00c Mon Sep 17 00:00:00 2001
From: Matt Callanan <mpcallanan@google.com>
Date: Fri, 29 Sep 2023 12:52:20 -0700
Subject: [PATCH 434/567] #tf-data-service Optimize filepath for snapshot
 writes.

PiperOrigin-RevId: 569564835
---
 tensorflow/core/data/service/snapshot/BUILD                  | 1 +
 .../core/data/service/snapshot/snapshot_stream_writer.cc     | 5 +++--
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/tensorflow/core/data/service/snapshot/BUILD b/tensorflow/core/data/service/snapshot/BUILD
index e8466a0df768ae..f8344967fbf7ff 100644
--- a/tensorflow/core/data/service/snapshot/BUILD
+++ b/tensorflow/core/data/service/snapshot/BUILD
@@ -245,6 +245,7 @@ cc_library(
         "//tensorflow/core:framework",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core/data:snapshot_utils",
+        "//tensorflow/core/data:utils",
         "//tensorflow/core/data/service:common",
         "//tensorflow/core/data/service:common_proto_cc",
         "//tensorflow/core/data/service:task_runner",
diff --git a/tensorflow/core/data/service/snapshot/snapshot_stream_writer.cc b/tensorflow/core/data/service/snapshot/snapshot_stream_writer.cc
index f26f3b39969bf6..673cf2667d6f44 100644
--- a/tensorflow/core/data/service/snapshot/snapshot_stream_writer.cc
+++ b/tensorflow/core/data/service/snapshot/snapshot_stream_writer.cc
@@ -34,6 +34,7 @@ limitations under the License.
 #include "tensorflow/core/data/service/snapshot/utils.h"
 #include "tensorflow/core/data/service/worker.pb.h"
 #include "tensorflow/core/data/snapshot_utils.h"
+#include "tensorflow/core/data/utils.h"
 #include "tensorflow/core/framework/metrics.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor.pb.h"
@@ -147,8 +148,8 @@ Status SnapshotStreamWriter::WriteChunk() {
   std::string uncommitted_chunk_file_path =
       tsl::io::JoinPath(params_.UncommittedChunksDirectory(),
                         absl::StrCat("chunk_", chunk_index_));
-  snapshot_util::TFRecordWriter writer(uncommitted_chunk_file_path,
-                                       params_.compression);
+  snapshot_util::TFRecordWriter writer(
+      TranslateFileName(uncommitted_chunk_file_path), params_.compression);
   TF_RETURN_IF_ERROR(writer.Initialize(params_.env));
   while (ShouldWriteRecord()) {
     TF_RETURN_IF_ERROR(WriteRecord(writer));

From 15e8ebf798d02f3ea489f467bb94b3ec3befedcd Mon Sep 17 00:00:00 2001
From: Fiona Lang <flang@google.com>
Date: Fri, 29 Sep 2023 13:01:23 -0700
Subject: [PATCH 435/567] Add a presubmit check to ensure no imports are added
 to python/__init__.py.

PiperOrigin-RevId: 569567547
---
 tensorflow/python/__init__.py             | 20 +++++---------------
 tensorflow/python/tools/BUILD             |  6 ++++++
 tensorflow/python/tools/tf_import_time.py | 17 +++++++++++++++++
 3 files changed, 28 insertions(+), 15 deletions(-)
 create mode 100644 tensorflow/python/tools/tf_import_time.py

diff --git a/tensorflow/python/__init__.py b/tensorflow/python/__init__.py
index 88caa3ee581420..9c52b712be1598 100644
--- a/tensorflow/python/__init__.py
+++ b/tensorflow/python/__init__.py
@@ -12,24 +12,14 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Import core names of TensorFlow.
+"""TensorFlow Python init file."""
 
-Programs that want to build TensorFlow Ops and Graphs without having to import
-the constructors and utilities individually can import this file:
+# Do not add code to //third_party/tensorflow/python/__init__.py.
+# This file is imported whenever TensorFlow is imported.
+# Additional imports in this file could cause the internal
+# import time of TensorFlow to increase by multiple seconds.
 
 
-import tensorflow as tf
-"""
-
-# We aim to keep this file minimal and ideally remove completely.
-# If you are adding a new file with @tf_export decorators,
-# import it in modules_with_exports.py instead.
-
-# pylint: disable=g-bad-import-order,g-import-not-at-top
-
-# from tensorflow.python import keras
-# from tensorflow.python.layers import layers
-
 # Special dunders that we choose to export:
 _exported_dunders = set([
     '__version__',
diff --git a/tensorflow/python/tools/BUILD b/tensorflow/python/tools/BUILD
index a867a19fbed0e6..e4a02f9a93a3aa 100644
--- a/tensorflow/python/tools/BUILD
+++ b/tensorflow/python/tools/BUILD
@@ -121,6 +121,12 @@ py_strict_library(
     ],
 )
 
+py_strict_binary(
+    name = "tf_import_time",
+    srcs = ["tf_import_time.py"],
+    deps = ["//tensorflow:tensorflow_py"],
+)
+
 py_strict_test(
     name = "freeze_graph_test",
     size = "small",
diff --git a/tensorflow/python/tools/tf_import_time.py b/tensorflow/python/tools/tf_import_time.py
new file mode 100644
index 00000000000000..51299f9ae6db77
--- /dev/null
+++ b/tensorflow/python/tools/tf_import_time.py
@@ -0,0 +1,17 @@
+# Copyright 2023 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Script to measure the internal tensorflow import time."""
+
+import tensorflow as tf  # pylint: disable=unused-import

From 78da7d4ba33fdfb709bc5c6f2ae7e2d0d2b70fac Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 29 Sep 2023 13:10:32 -0700
Subject: [PATCH 436/567] Rollback of PR #58255 PR #58255: Fix ReLUN1To1 fused
 activation for OpenCL and OpenGL

Imported from GitHub PR https://github.com/tensorflow/tensorflow/pull/58255

Fix GPU (OpenCL and OpenGL) compile errors on Android.
Fix ReLUN1To1 fused activation for OpenCL and OpenGL.

Copybara import of the project:

--
aa2a1ced77c5e47fb7e4cb2f80b768bfc7c63e6b by Mike Corrigan <corrigan@motorola.com>:

Fix GPU compile errors (OpenCL and OpenGL)

Fix number of parameters in call to CLArguments::Init().
Add missing li...

PiperOrigin-RevId: 569570381
---
 tensorflow/lite/delegates/gpu/cl/BUILD        |   1 -
 .../delegates/gpu/cl/cl_arguments_test.cc     |   5 +-
 .../delegates/gpu/cl/kernels/relu_test.cc     |  20 ---
 .../lite/delegates/gpu/cl/testing/BUILD       |   7 -
 .../delegates/gpu/common/model_builder.cc     |  23 ++-
 .../gpu/common/model_builder_helper.cc        |   9 +-
 .../lite/delegates/gpu/common/operations.h    |  21 +--
 .../lite/delegates/gpu/common/tasks/relu.cc   |  15 +-
 .../gpu/common/tasks/relu_test_util.cc        | 136 +-----------------
 .../gpu/common/tasks/relu_test_util.h         |   4 -
 tensorflow/lite/delegates/gpu/gl/BUILD        |   4 -
 .../lite/delegates/gpu/gl/kernels/BUILD       |  17 +--
 .../lite/delegates/gpu/gl/kernels/relu.cc     |  11 +-
 .../delegates/gpu/gl/kernels/relu_test.cc     |  64 +--------
 14 files changed, 44 insertions(+), 293 deletions(-)

diff --git a/tensorflow/lite/delegates/gpu/cl/BUILD b/tensorflow/lite/delegates/gpu/cl/BUILD
index e9423c5b5dafcb..d710059af90886 100644
--- a/tensorflow/lite/delegates/gpu/cl/BUILD
+++ b/tensorflow/lite/delegates/gpu/cl/BUILD
@@ -126,7 +126,6 @@ cc_test(
     deps = [
         ":buffer",
         ":cl_arguments",
-        ":cl_test",
         ":gpu_object",
         "//tensorflow/lite/delegates/gpu/common:gpu_info",
         "@com_google_absl//absl/strings",
diff --git a/tensorflow/lite/delegates/gpu/cl/cl_arguments_test.cc b/tensorflow/lite/delegates/gpu/cl/cl_arguments_test.cc
index c9da770940968f..2a46c202286dd5 100644
--- a/tensorflow/lite/delegates/gpu/cl/cl_arguments_test.cc
+++ b/tensorflow/lite/delegates/gpu/cl/cl_arguments_test.cc
@@ -21,7 +21,6 @@ limitations under the License.
 #include <gtest/gtest.h>
 #include "absl/strings/match.h"
 #include "tensorflow/lite/delegates/gpu/cl/buffer.h"
-#include "tensorflow/lite/delegates/gpu/cl/cl_test.h"
 #include "tensorflow/lite/delegates/gpu/cl/gpu_object.h"
 #include "tensorflow/lite/delegates/gpu/common/gpu_info.h"
 
@@ -46,7 +45,7 @@ __kernel void main_function($0) {
 
   CLArguments cl_args;
   GpuInfo gpu_info;
-  ASSERT_OK(cl_args.Init(gpu_info, nullptr, &args, &sample_code));
+  ASSERT_OK(cl_args.Init(gpu_info, {}, nullptr, &args, &sample_code));
   EXPECT_TRUE(absl::StrContains(sample_code, "value = weights_buffer[id];"));
   EXPECT_TRUE(
       absl::StrContains(sample_code, "__global float4* weights_buffer"));
@@ -68,7 +67,7 @@ TEST(CLArgumentsTest, TestNoSelector) {
 )";
   CLArguments cl_args;
   GpuInfo gpu_info;
-  EXPECT_FALSE(cl_args.Init(gpu_info, nullptr, &args, &sample_code).ok());
+  EXPECT_FALSE(cl_args.Init(gpu_info, {}, nullptr, &args, &sample_code).ok());
 }
 
 }  // namespace cl
diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/relu_test.cc b/tensorflow/lite/delegates/gpu/cl/kernels/relu_test.cc
index fd76f2cbc48bf8..10a5c565c2437f 100644
--- a/tensorflow/lite/delegates/gpu/cl/kernels/relu_test.cc
+++ b/tensorflow/lite/delegates/gpu/cl/kernels/relu_test.cc
@@ -46,26 +46,6 @@ TEST_F(OpenCLOperationTest, ReLUAlphaClip) {
   ASSERT_TRUE(status.ok()) << status.message();
 }
 
-TEST_F(OpenCLOperationTest, ReLULN1NoClipNoAlpha) {
-  auto status = ReLUN1NoClipNoAlphaTest(&exec_env_);
-  ASSERT_TRUE(status.ok()) << status;
-}
-
-TEST_F(OpenCLOperationTest, ReLUN1Clip) {
-  auto status = ReLUN1ClipTest(&exec_env_);
-  ASSERT_TRUE(status.ok()) << status;
-}
-
-TEST_F(OpenCLOperationTest, ReLULN1Alpha) {
-  auto status = ReLUN1AlphaTest(&exec_env_);
-  ASSERT_TRUE(status.ok()) << status;
-}
-
-TEST_F(OpenCLOperationTest, ReLUN1AlphaClip) {
-  auto status = ReLUN1AlphaClipTest(&exec_env_);
-  ASSERT_TRUE(status.ok()) << status;
-}
-
 }  // namespace cl
 }  // namespace gpu
 }  // namespace tflite
diff --git a/tensorflow/lite/delegates/gpu/cl/testing/BUILD b/tensorflow/lite/delegates/gpu/cl/testing/BUILD
index 75e36c0c9ca877..f5c26496875348 100644
--- a/tensorflow/lite/delegates/gpu/cl/testing/BUILD
+++ b/tensorflow/lite/delegates/gpu/cl/testing/BUILD
@@ -42,13 +42,6 @@ cc_test(
 cc_binary(
     name = "internal_api_samples",
     srcs = ["internal_api_samples.cc"],
-    linkopts = select({
-        "//tensorflow:android": [
-            "-lEGL",
-            "-lGLESv3",
-        ],
-        "//conditions:default": [],
-    }),
     tags = [
         "nobuilder",
         "notap",
diff --git a/tensorflow/lite/delegates/gpu/common/model_builder.cc b/tensorflow/lite/delegates/gpu/common/model_builder.cc
index 548cbcba1afc80..a8aec7c4a505dc 100644
--- a/tensorflow/lite/delegates/gpu/common/model_builder.cc
+++ b/tensorflow/lite/delegates/gpu/common/model_builder.cc
@@ -433,8 +433,8 @@ class ClampOperationsParser : public TFLiteOperationParser {
     // We replace clamp(...) with sequence of elementwise ops:
     // substaction -> usual relu with alpha = 0.0 -> addition.
     // node_sub = v0 = v - a // add op (add -a)
-    // node_relu = v1 = clamp(v0, 0.0, activation_max); // relu op alpha = 0.0,
-    // activation_max = b - a;
+    // node_relu = v1 = clamp(v0, 0.0, clip); // relu op alpha = 0.0,
+    // clip = b - a;
     // node_add = v2 = v1 + a // add op (add a)
     Node* node_sub = graph->NewNode();
     Node* node_relu = graph->NewNode();
@@ -447,7 +447,7 @@ class ClampOperationsParser : public TFLiteOperationParser {
 
     ReLUAttributes relu_attr;
     relu_attr.alpha = 0.0f;
-    relu_attr.activation_max = clamp_b_ - clamp_a_;
+    relu_attr.clip = clamp_b_ - clamp_a_;
     node_relu->operation.type = ToString(OperationType::RELU);
     node_relu->operation.attributes = relu_attr;
 
@@ -1943,8 +1943,7 @@ class QuantizeOperationParser : public TFLiteOperationParser {
 
 class ReLUOperationParser : public TFLiteOperationParser {
  public:
-  explicit ReLUOperationParser(int activation_min, int activation_max)
-      : activation_min_(activation_min), activation_max_(activation_max) {}
+  explicit ReLUOperationParser(int clip) : clip_(clip) {}
 
   absl::Status IsSupported(const TfLiteContext* context,
                            const TfLiteNode* tflite_node,
@@ -1964,15 +1963,13 @@ class ReLUOperationParser : public TFLiteOperationParser {
     const TfLiteLeakyReluParams* tf_options;
     auto status = RetrieveBuiltinData(tflite_node, &tf_options);
     attr.alpha = status.ok() ? tf_options->alpha : 0;
-    attr.activation_min = activation_min_;
-    attr.activation_max = activation_max_;
+    attr.clip = clip_;
     node->operation.attributes = attr;
     return reader->AddOutputs(node);
   }
 
  private:
-  const int activation_min_;
-  const int activation_max_;
+  const int clip_;
 };
 
 class ResamplerOperationParser : public TFLiteOperationParser {
@@ -3210,13 +3207,13 @@ std::unique_ptr<TFLiteOperationParser> NewOperationParser(
       }
       break;
     case kTfLiteBuiltinRelu:
-      return std::make_unique<ReLUOperationParser>(0, 0);
+      return std::make_unique<ReLUOperationParser>(0);
     case kTfLiteBuiltinRelu6:
-      return std::make_unique<ReLUOperationParser>(0, 6);
+      return std::make_unique<ReLUOperationParser>(6);
     case kTfLiteBuiltinReluN1To1:
-      return std::make_unique<ReLUOperationParser>(-1.0, 1.0);
+      return std::make_unique<ClampOperationsParser>(-1.0, 1.0);
     case kTfLiteBuiltinLeakyRelu:
-      return std::make_unique<ReLUOperationParser>(0, 0);
+      return std::make_unique<ReLUOperationParser>(0);
     case kTfLiteBuiltinPrelu:
       return std::make_unique<PReLUOperationParser>();
     case kTfLiteBuiltinReshape:
diff --git a/tensorflow/lite/delegates/gpu/common/model_builder_helper.cc b/tensorflow/lite/delegates/gpu/common/model_builder_helper.cc
index b498916bfed447..c825d2be691d5f 100644
--- a/tensorflow/lite/delegates/gpu/common/model_builder_helper.cc
+++ b/tensorflow/lite/delegates/gpu/common/model_builder_helper.cc
@@ -397,12 +397,9 @@ absl::Status MaybeFuseActivation(TfLiteFusedActivation fused_activation,
     case kTfLiteActReluN1To1:
     case kTfLiteActRelu6: {
       ReLUAttributes attr;
-      attr.activation_max =
-          fused_activation == kTfLiteActRelu
-              ? 0.0f
-              : (fused_activation == kTfLiteActReluN1To1 ? 1.0f : 6.0f);
-      attr.activation_min =
-          fused_activation == kTfLiteActReluN1To1 ? -1.0f : 0.0f;
+      attr.clip = fused_activation == kTfLiteActRelu
+                      ? 0.0f
+                      : (fused_activation == kTfLiteActReluN1To1 ? 1.0f : 6.0f);
       Node* activation_node;
       RETURN_IF_ERROR(
           NewPassthroughNode(graph, node, outputs[0], &activation_node));
diff --git a/tensorflow/lite/delegates/gpu/common/operations.h b/tensorflow/lite/delegates/gpu/common/operations.h
index ac384bbe801ea2..37d8dfa0fcdc6f 100644
--- a/tensorflow/lite/delegates/gpu/common/operations.h
+++ b/tensorflow/lite/delegates/gpu/common/operations.h
@@ -391,25 +391,18 @@ Padding3D CalculateSamePadding(const BHWDC& input,
                                const DepthwiseConvolution3DAttributes& attr);
 
 // f(x):= {
-//   if alpha != 0: x -> min(activation_max, x)
-//   else
-//     if x < activation_min : x -> min(activation_min, alpha * x)
-//     if x >= activation_min : x -> min(activation_max, x)
+//   if x < 0  : x -> alpha * x
+//   if x >= 0 : x -> min(clip, x)
 // }
 //
 // Examples:
-//   - ReLU: activation_min = 0, activation_max = 0, alpha = 0
-//   - ReLU6: activation_min = 0, activation_max = 6, alpha = 0
-//   - Leaky ReLU: activation_min = 0, activation_max = 0, alpha = a
-//   - ReLUN1To1: activation_min = -1, activation_max = 1, alpha = 0
+//   - ReLU: clip = 0, alpha = 0
+//   - ReLU6: clip = 6, alpha = 0
+//   - Leaky ReLU: clip = 0, alpha = a
 struct ReLUAttributes {
-  // activation_min must be < activation_max
-  float activation_min = 0;
+  // clip <= 0 mean it is not set.
+  float clip = 0;
 
-  // activation_max <= 0 mean it is not set.
-  float activation_max = 0;
-
-  // alpha must be <= 1
   float alpha = 0;
 };
 
diff --git a/tensorflow/lite/delegates/gpu/common/tasks/relu.cc b/tensorflow/lite/delegates/gpu/common/tasks/relu.cc
index 861ba496cc2b72..9afc2c7508eabb 100644
--- a/tensorflow/lite/delegates/gpu/common/tasks/relu.cc
+++ b/tensorflow/lite/delegates/gpu/common/tasks/relu.cc
@@ -35,21 +35,16 @@ ElementwiseDescriptor CreateReLU(const ReLUAttributes& attr,
       result.args.AddHalf("alpha", half(attr.alpha));
     }
   } else {
-    min_func = "INIT_FLT4(args.activation_min)";
-    if (precision == CalculationsPrecision::F32) {
-      result.args.AddFloat("activation_min", attr.activation_min);
-    } else {
-      result.args.AddHalf("activation_min", half(attr.activation_min));
-    }
+    min_func = "INIT_FLT4(0.0f)";
   }
-  if (attr.activation_max != 0.0f) {
+  if (attr.clip != 0.0f) {
     if (precision == CalculationsPrecision::F32) {
-      result.args.AddFloat("activation_max", attr.activation_max);
+      result.args.AddFloat("clip", attr.clip);
     } else {
-      result.args.AddHalf("activation_max", half(attr.activation_max));
+      result.args.AddHalf("clip", half(attr.clip));
     }
     result.code = absl::StrCat("out_value = clamp(in_value, " + min_func +
-                               ", INIT_FLT4(args.activation_max));");
+                               ", INIT_FLT4(args.clip));");
   } else {
     result.code = absl::StrCat("out_value = max(in_value, ", min_func, ");");
   }
diff --git a/tensorflow/lite/delegates/gpu/common/tasks/relu_test_util.cc b/tensorflow/lite/delegates/gpu/common/tasks/relu_test_util.cc
index 09a8c2110c2935..7c87bbb7823c0b 100644
--- a/tensorflow/lite/delegates/gpu/common/tasks/relu_test_util.cc
+++ b/tensorflow/lite/delegates/gpu/common/tasks/relu_test_util.cc
@@ -33,7 +33,7 @@ absl::Status ReLUNoClipNoAlphaTest(TestExecutionEnvironment* env) {
 
   ReLUAttributes attr;
   attr.alpha = 0.0f;
-  attr.activation_max = 0.0f;
+  attr.clip = 0.0f;
 
   for (auto precision : env->GetSupportedPrecisions()) {
     auto data_type = DeduceDataTypeFromPrecision(precision);
@@ -62,7 +62,7 @@ absl::Status ReLUClipTest(TestExecutionEnvironment* env) {
 
   ReLUAttributes attr;
   attr.alpha = 0.0f;
-  attr.activation_max = 0.9f;
+  attr.clip = 0.9f;
 
   for (auto precision : env->GetSupportedPrecisions()) {
     auto data_type = DeduceDataTypeFromPrecision(precision);
@@ -91,7 +91,7 @@ absl::Status ReLUAlphaTest(TestExecutionEnvironment* env) {
 
   ReLUAttributes attr;
   attr.alpha = 0.5f;
-  attr.activation_max = 0.0f;
+  attr.clip = 0.0f;
 
   for (auto precision : env->GetSupportedPrecisions()) {
     auto data_type = DeduceDataTypeFromPrecision(precision);
@@ -120,7 +120,7 @@ absl::Status ReLUAlphaClipTest(TestExecutionEnvironment* env) {
 
   ReLUAttributes attr;
   attr.alpha = 0.5f;
-  attr.activation_max = 0.5f;
+  attr.clip = 0.5f;
 
   for (auto precision : env->GetSupportedPrecisions()) {
     auto data_type = DeduceDataTypeFromPrecision(precision);
@@ -142,133 +142,5 @@ absl::Status ReLUAlphaClipTest(TestExecutionEnvironment* env) {
   return absl::OkStatus();
 }
 
-absl::Status ReLUN1NoClipNoAlphaTest(TestExecutionEnvironment* env) {
-  TensorFloat32 src_tensor;
-  src_tensor.shape = BHWC(1, 2, 1, 4);
-  src_tensor.data = {-12.0f, -1.0f, -0.5f, 0.0f, 0.8f, -0.6f, 1.0f, 3.2f};
-
-  ReLUAttributes attr;
-  attr.alpha = 0.0f;
-  attr.activation_min = -1.0f;
-  attr.activation_max = 0.0f;
-
-  for (auto precision : env->GetSupportedPrecisions()) {
-    auto data_type = DeduceDataTypeFromPrecision(precision);
-    for (auto storage : env->GetSupportedStorages(data_type)) {
-      const float eps = precision == CalculationsPrecision::F32 ? 1e-6f : 1e-3f;
-      OperationDef op_def;
-      op_def.precision = precision;
-      auto data_type = DeduceDataTypeFromPrecision(precision);
-      op_def.src_tensors.push_back({data_type, storage, Layout::HWC});
-      op_def.dst_tensors.push_back({data_type, storage, Layout::HWC});
-      TensorFloat32 dst_tensor;
-      GPUOperation operation = CreateReLU(op_def, attr);
-      RETURN_IF_ERROR(env->ExecuteGPUOperation(
-          src_tensor, std::make_unique<GPUOperation>(std::move(operation)),
-          BHWC(1, 2, 1, 4), &dst_tensor));
-      RETURN_IF_ERROR(
-          PointWiseNear({-1.0f, -1.0f, -0.5f, 0.0f, 0.8f, -0.6f, 1.0f, 3.2f},
-                        dst_tensor.data, eps));
-    }
-  }
-  return absl::OkStatus();
-}
-
-absl::Status ReLUN1ClipTest(TestExecutionEnvironment* env) {
-  TensorFloat32 src_tensor;
-  src_tensor.shape = BHWC(1, 2, 1, 4);
-  src_tensor.data = {-12.0f, -1.0f, -0.5f, 0.0f, 0.8f, -0.6f, 1.0f, 3.2f};
-
-  ReLUAttributes attr;
-  attr.alpha = 0.0f;
-  attr.activation_min = -1.0f;
-  attr.activation_max = 1.0f;
-
-  for (auto precision : env->GetSupportedPrecisions()) {
-    auto data_type = DeduceDataTypeFromPrecision(precision);
-    for (auto storage : env->GetSupportedStorages(data_type)) {
-      const float eps = precision == CalculationsPrecision::F32 ? 1e-6f : 1e-3f;
-      OperationDef op_def;
-      op_def.precision = precision;
-      auto data_type = DeduceDataTypeFromPrecision(precision);
-      op_def.src_tensors.push_back({data_type, storage, Layout::HWC});
-      op_def.dst_tensors.push_back({data_type, storage, Layout::HWC});
-      TensorFloat32 dst_tensor;
-      GPUOperation operation = CreateReLU(op_def, attr);
-      RETURN_IF_ERROR(env->ExecuteGPUOperation(
-          src_tensor, std::make_unique<GPUOperation>(std::move(operation)),
-          BHWC(1, 2, 1, 4), &dst_tensor));
-      RETURN_IF_ERROR(
-          PointWiseNear({-1.0f, -1.0f, -0.5f, 0.0f, 0.8f, -0.6f, 1.0f, 1.0f},
-                        dst_tensor.data, eps));
-    }
-  }
-  return absl::OkStatus();
-}
-
-absl::Status ReLUN1AlphaTest(TestExecutionEnvironment* env) {
-  TensorFloat32 src_tensor;
-  src_tensor.shape = BHWC(1, 2, 1, 4);
-  src_tensor.data = {-12.0f, -1.0f, -0.5f, 0.0f, 0.8f, -0.6f, 1.0f, 3.2f};
-
-  ReLUAttributes attr;
-  attr.alpha = 1.0f;
-  attr.activation_min = -1.0f;  // activation_min ignored if alpha != 0
-  attr.activation_max = 0.0f;
-
-  for (auto precision : env->GetSupportedPrecisions()) {
-    auto data_type = DeduceDataTypeFromPrecision(precision);
-    for (auto storage : env->GetSupportedStorages(data_type)) {
-      const float eps = precision == CalculationsPrecision::F32 ? 1e-6f : 1e-3f;
-      OperationDef op_def;
-      op_def.precision = precision;
-      auto data_type = DeduceDataTypeFromPrecision(precision);
-      op_def.src_tensors.push_back({data_type, storage, Layout::HWC});
-      op_def.dst_tensors.push_back({data_type, storage, Layout::HWC});
-      TensorFloat32 dst_tensor;
-      GPUOperation operation = CreateReLU(op_def, attr);
-      RETURN_IF_ERROR(env->ExecuteGPUOperation(
-          src_tensor, std::make_unique<GPUOperation>(std::move(operation)),
-          BHWC(1, 2, 1, 4), &dst_tensor));
-      RETURN_IF_ERROR(
-          PointWiseNear({-12.0f, -1.0f, -0.5f, 0.0f, 0.8f, -0.6f, 1.0f, 3.2f},
-                        dst_tensor.data, eps));
-    }
-  }
-  return absl::OkStatus();
-}
-
-absl::Status ReLUN1AlphaClipTest(TestExecutionEnvironment* env) {
-  TensorFloat32 src_tensor;
-  src_tensor.shape = BHWC(1, 2, 1, 4);
-  src_tensor.data = {-12.0f, -1.0f, -0.5f, 0.0f, 0.8f, -0.6f, 1.0f, 3.2f};
-
-  ReLUAttributes attr;
-  attr.alpha = 1.0f;
-  attr.activation_min = -1.0f;  // activation_min ignored if alpha != 0
-  attr.activation_max = 3.0f;
-
-  for (auto precision : env->GetSupportedPrecisions()) {
-    auto data_type = DeduceDataTypeFromPrecision(precision);
-    for (auto storage : env->GetSupportedStorages(data_type)) {
-      const float eps = precision == CalculationsPrecision::F32 ? 1e-6f : 1e-3f;
-      OperationDef op_def;
-      op_def.precision = precision;
-      auto data_type = DeduceDataTypeFromPrecision(precision);
-      op_def.src_tensors.push_back({data_type, storage, Layout::HWC});
-      op_def.dst_tensors.push_back({data_type, storage, Layout::HWC});
-      TensorFloat32 dst_tensor;
-      GPUOperation operation = CreateReLU(op_def, attr);
-      RETURN_IF_ERROR(env->ExecuteGPUOperation(
-          src_tensor, std::make_unique<GPUOperation>(std::move(operation)),
-          BHWC(1, 2, 1, 4), &dst_tensor));
-      RETURN_IF_ERROR(
-          PointWiseNear({-12.0f, -1.0f, -0.5f, 0.0f, 0.8f, -0.6f, 1.0f, 3.0f},
-                        dst_tensor.data, eps));
-    }
-  }
-  return absl::OkStatus();
-}
-
 }  // namespace gpu
 }  // namespace tflite
diff --git a/tensorflow/lite/delegates/gpu/common/tasks/relu_test_util.h b/tensorflow/lite/delegates/gpu/common/tasks/relu_test_util.h
index 8c0a7023048962..92ed2eea5cbcd8 100644
--- a/tensorflow/lite/delegates/gpu/common/tasks/relu_test_util.h
+++ b/tensorflow/lite/delegates/gpu/common/tasks/relu_test_util.h
@@ -26,10 +26,6 @@ absl::Status ReLUNoClipNoAlphaTest(TestExecutionEnvironment* env);
 absl::Status ReLUClipTest(TestExecutionEnvironment* env);
 absl::Status ReLUAlphaTest(TestExecutionEnvironment* env);
 absl::Status ReLUAlphaClipTest(TestExecutionEnvironment* env);
-absl::Status ReLUN1NoClipNoAlphaTest(TestExecutionEnvironment* env);
-absl::Status ReLUN1ClipTest(TestExecutionEnvironment* env);
-absl::Status ReLUN1AlphaTest(TestExecutionEnvironment* env);
-absl::Status ReLUN1AlphaClipTest(TestExecutionEnvironment* env);
 
 }  // namespace gpu
 }  // namespace tflite
diff --git a/tensorflow/lite/delegates/gpu/gl/BUILD b/tensorflow/lite/delegates/gpu/gl/BUILD
index b3cf20e666ccc6..4768126863d078 100644
--- a/tensorflow/lite/delegates/gpu/gl/BUILD
+++ b/tensorflow/lite/delegates/gpu/gl/BUILD
@@ -242,7 +242,6 @@ cc_test(
     linkopts = [
         "-lEGL",
         "-lGLESv2",
-        "-lm",
     ],
     tags = tf_gpu_tests_tags() + [
         "local",
@@ -476,9 +475,6 @@ cc_library(
 cc_test(
     name = "serialization_test",
     srcs = ["serialization_test.cc"],
-    linkopts = [
-        "-lm",
-    ],
     tags = [
         "local",
         "nobuilder",
diff --git a/tensorflow/lite/delegates/gpu/gl/kernels/BUILD b/tensorflow/lite/delegates/gpu/gl/kernels/BUILD
index 003c2b21360d9c..1c920b92b8dab7 100644
--- a/tensorflow/lite/delegates/gpu/gl/kernels/BUILD
+++ b/tensorflow/lite/delegates/gpu/gl/kernels/BUILD
@@ -45,7 +45,6 @@ cc_test(
     ],
     deps = [
         ":converter",
-        ":test_util",
         "//tensorflow/lite/delegates/gpu/common:convert",
         "//tensorflow/lite/delegates/gpu/common:shape",
         "//tensorflow/lite/delegates/gpu/common:status",
@@ -722,18 +721,10 @@ cc_library(
     testonly = 1,
     srcs = ["test_util.cc"],
     hdrs = ["test_util.h"],
-    linkopts = select({
-        "//tensorflow:android": [
-            "-lEGL",
-            "-lGLESv3",
-            "-ldl",
-            "-lm",
-        ],
-        "//conditions:default": [
-            "-lEGL",
-            "-lGLESv3",
-        ],
-    }),
+    linkopts = [
+        "-lEGL",
+        "-lGLESv3",
+    ],
     deps = [
         "//tensorflow/lite/delegates/gpu/common:model",
         "//tensorflow/lite/delegates/gpu/common:operations",
diff --git a/tensorflow/lite/delegates/gpu/gl/kernels/relu.cc b/tensorflow/lite/delegates/gpu/gl/kernels/relu.cc
index 5bfd0517df68eb..6d05ea894d4cfe 100644
--- a/tensorflow/lite/delegates/gpu/gl/kernels/relu.cc
+++ b/tensorflow/lite/delegates/gpu/gl/kernels/relu.cc
@@ -39,22 +39,21 @@ class ReLU : public NodeShader {
   absl::Status GenerateCode(const GenerationContext& ctx,
                             GeneratedCode* generated_code) const final {
     const auto& attr = std::any_cast<const ReLUAttributes&>(ctx.op_attr);
-    // clamp(value, min(0, alpha * value), activation_max)
+    // clamp(value, min(0, alpha * value), clip)
     std::vector<Variable> params;
     std::string min;
     if (attr.alpha == 0) {
-      min = "vec4($activation_min$)";
-      params.push_back({"activation_min", attr.activation_min});
+      min = "vec4(0.0)";
     } else {
       min = "min($alpha$ * value_0, 0.0)";
       params.push_back({"alpha", attr.alpha});
     }
     std::string code;
-    if (attr.activation_max == 0) {
+    if (attr.clip == 0) {
       code = "value_0 = max(value_0, " + min + ");";
     } else {
-      code = "value_0 = clamp(value_0, " + min + ", vec4($activation_max$));";
-      params.push_back({"activation_max", attr.activation_max});
+      code = "value_0 = clamp(value_0, " + min + ", vec4($clip$));";
+      params.push_back({"clip", attr.clip});
     }
     *generated_code = {
         /*parameters=*/std::move(params),
diff --git a/tensorflow/lite/delegates/gpu/gl/kernels/relu_test.cc b/tensorflow/lite/delegates/gpu/gl/kernels/relu_test.cc
index 035555656b2a3a..0c2d2536fd654e 100644
--- a/tensorflow/lite/delegates/gpu/gl/kernels/relu_test.cc
+++ b/tensorflow/lite/delegates/gpu/gl/kernels/relu_test.cc
@@ -45,7 +45,7 @@ class ReluTest : public ::testing::Test {
 TEST_F(ReluTest, Smoke) {
   OperationType op_type = OperationType::RELU;
   ReLUAttributes attr;
-  attr.activation_max = 0;
+  attr.clip = 0;
   attr.alpha = 0;
   SingleOpModel model({ToString(op_type), attr}, {GetTensorRef(0)},
                       {GetTensorRef(1)});
@@ -58,7 +58,7 @@ TEST_F(ReluTest, Smoke) {
 TEST_F(ReluTest, ClipOnly) {
   OperationType op_type = OperationType::RELU;
   ReLUAttributes attr;
-  attr.activation_max = 6;
+  attr.clip = 6;
   attr.alpha = 0;
   SingleOpModel model({ToString(op_type), attr}, {GetTensorRef(0)},
                       {GetTensorRef(1)});
@@ -71,7 +71,7 @@ TEST_F(ReluTest, ClipOnly) {
 TEST_F(ReluTest, AlphaOnly) {
   OperationType op_type = OperationType::RELU;
   ReLUAttributes attr;
-  attr.activation_max = 0;
+  attr.clip = 0;
   attr.alpha = 0.5;
   SingleOpModel model({ToString(op_type), attr}, {GetTensorRef(0)},
                       {GetTensorRef(1)});
@@ -84,63 +84,7 @@ TEST_F(ReluTest, AlphaOnly) {
 TEST_F(ReluTest, ClipAndAlpha) {
   OperationType op_type = OperationType::RELU;
   ReLUAttributes attr;
-  attr.activation_max = 6;
-  attr.alpha = 0.5;
-  SingleOpModel model({ToString(op_type), attr}, {GetTensorRef(0)},
-                      {GetTensorRef(1)});
-  ASSERT_TRUE(model.PopulateTensor(0, {-6.0, 0.0, 2.0, 8.0}));
-  ASSERT_OK(model.Invoke(*NewReLUNodeShader()));
-  EXPECT_THAT(model.GetOutput(0),
-              Pointwise(FloatNear(1e-6), {-3.0, 0.0, 2.0, 6.0}));
-}
-
-TEST_F(ReluTest, ReLUN1Smoke) {
-  OperationType op_type = OperationType::RELU;
-  ReLUAttributes attr;
-  attr.activation_min = -1;
-  attr.activation_max = 0;
-  attr.alpha = 0;
-  SingleOpModel model({ToString(op_type), attr}, {GetTensorRef(0)},
-                      {GetTensorRef(1)});
-  ASSERT_TRUE(model.PopulateTensor(0, {-12.0f, -0.5f, 0.8f, 3.2f}));
-  ASSERT_OK(model.Invoke(*NewReLUNodeShader()));
-  EXPECT_THAT(model.GetOutput(0),
-              Pointwise(FloatNear(1e-6), {-1.0f, -0.5f, 0.8f, 3.2f}));
-}
-
-TEST_F(ReluTest, ReLUN1ClipOnly) {
-  OperationType op_type = OperationType::RELU;
-  ReLUAttributes attr;
-  attr.activation_min = -1;
-  attr.activation_max = 1;
-  attr.alpha = 0;
-  SingleOpModel model({ToString(op_type), attr}, {GetTensorRef(0)},
-                      {GetTensorRef(1)});
-  ASSERT_TRUE(model.PopulateTensor(0, {-12.0f, -0.5f, 0.8f, 3.2f}));
-  ASSERT_OK(model.Invoke(*NewReLUNodeShader()));
-  EXPECT_THAT(model.GetOutput(0),
-              Pointwise(FloatNear(1e-6), {-1.0f, -0.5f, 0.8f, 1.0f}));
-}
-
-TEST_F(ReluTest, ReLUN1AlphaOnly) {
-  OperationType op_type = OperationType::RELU;
-  ReLUAttributes attr;
-  attr.activation_min = -1;  // activation_min ignored if alpha != 0
-  attr.activation_max = 0;
-  attr.alpha = 0.5;
-  SingleOpModel model({ToString(op_type), attr}, {GetTensorRef(0)},
-                      {GetTensorRef(1)});
-  ASSERT_TRUE(model.PopulateTensor(0, {-6.0, 0.0, 2.0, 8.0}));
-  ASSERT_OK(model.Invoke(*NewReLUNodeShader()));
-  EXPECT_THAT(model.GetOutput(0),
-              Pointwise(FloatNear(1e-6), {-3.0, 0.0, 2.0, 8.0}));
-}
-
-TEST_F(ReluTest, ReLUN1ClipAndAlpha) {
-  OperationType op_type = OperationType::RELU;
-  ReLUAttributes attr;
-  attr.activation_min = -1;  // activation_min ignored if alpha != 0
-  attr.activation_max = 6;
+  attr.clip = 6;
   attr.alpha = 0.5;
   SingleOpModel model({ToString(op_type), attr}, {GetTensorRef(0)},
                       {GetTensorRef(1)});

From c0ec7cea40eba9b381c12cbe502ddfc539fabb8d Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 29 Sep 2023 13:11:28 -0700
Subject: [PATCH 437/567] Rewrites EnumerateAll2DPartition[Reshape] using a
 recursive implementation that supports an arbitrary number of device mesh
 dimensions.

PiperOrigin-RevId: 569570620
---
 .../auto_sharding/auto_sharding.cc            | 144 ++++++++++--------
 1 file changed, 80 insertions(+), 64 deletions(-)

diff --git a/third_party/xla/xla/hlo/experimental/auto_sharding/auto_sharding.cc b/third_party/xla/xla/hlo/experimental/auto_sharding/auto_sharding.cc
index 8ffb8d452b6dd4..d2b4dfcbb3f615 100644
--- a/third_party/xla/xla/hlo/experimental/auto_sharding/auto_sharding.cc
+++ b/third_party/xla/xla/hlo/experimental/auto_sharding/auto_sharding.cc
@@ -759,40 +759,47 @@ void BuildStrategyAndCostForOp(const HloInstruction* ins, const Shape& shape,
                                const CallGraph& call_graph,
                                absl::Span<const int64_t> tensor_dims);
 
-// Enumerate 2D partition
-void EnumerateAll2DPartition(const HloInstruction* ins, const Shape& shape,
-                             const Array<int64_t>& device_mesh,
-                             const ClusterEnvironment& cluster_env,
-                             const StrategyMap& strategy_map,
-                             std::unique_ptr<StrategyVector>& strategies,
-                             const InstructionBatchDimMap& batch_dim_map,
-                             bool only_allow_divisible,
-                             const CallGraph& call_graph) {
+// Enumerate all partitions recursively
+void EnumerateAllPartition(const HloInstruction* ins, const Shape& shape,
+                           const Array<int64_t>& device_mesh,
+                           const ClusterEnvironment& cluster_env,
+                           const StrategyMap& strategy_map,
+                           std::unique_ptr<StrategyVector>& strategies,
+                           const InstructionBatchDimMap& batch_dim_map,
+                           bool only_allow_divisible,
+                           const CallGraph& call_graph,
+                           int64_t partition_dimensions,
+                           const std::vector<int64_t>& tensor_dims = {}) {
+  const auto tensor_dims_size = tensor_dims.size();
+  if (tensor_dims_size == partition_dimensions) {
+    BuildStrategyAndCostForOp(ins, shape, device_mesh, cluster_env,
+                              strategy_map, strategies, call_graph,
+                              tensor_dims);
+    return;
+  }
   auto iter = batch_dim_map.find(GetBatchDimMapKey(ins));
   int64_t batch_dim = -1;
   if (iter != batch_dim_map.end()) {
     batch_dim = iter->second;
   }
-  // Fully tile the buffer to 2-d mesh
+  // Fully tile the buffer to the mesh
   for (int64_t i = 0; i < shape.rank(); ++i) {
-    for (int64_t j = 0; j < shape.rank(); ++j) {
-      if ((batch_dim != -1 && !(batch_dim == i || batch_dim == j)) || i == j) {
-        continue;
-      }
-      if (shape.dimensions(i) < device_mesh.dim(0) ||
-          shape.dimensions(j) < device_mesh.dim(1)) {
-        continue;
-      }
-
-      if (only_allow_divisible &&
-          (!IsDivisible(shape.dimensions(i), device_mesh.dim(0)) ||
-           !IsDivisible(shape.dimensions(j), device_mesh.dim(1)))) {
-        continue;
-      }
-
-      BuildStrategyAndCostForOp(ins, shape, device_mesh, cluster_env,
-                                strategy_map, strategies, call_graph, {i, j});
+    auto tensor_it = std::find(tensor_dims.begin(), tensor_dims.end(), i);
+    if ((batch_dim != -1 && batch_dim != i) || tensor_it != tensor_dims.end()) {
+      continue;
     }
+    if (shape.dimensions(i) < device_mesh.dim(tensor_dims_size)) {
+      continue;
+    }
+    if (only_allow_divisible &&
+        !IsDivisible(shape.dimensions(i), device_mesh.dim(tensor_dims_size))) {
+      continue;
+    }
+    std::vector<int64_t> next_tensor_dims = tensor_dims;
+    next_tensor_dims.push_back(i);
+    EnumerateAllPartition(ins, shape, device_mesh, cluster_env, strategy_map,
+                          strategies, batch_dim_map, only_allow_divisible,
+                          call_graph, partition_dimensions, next_tensor_dims);
   }
 }
 
@@ -918,14 +925,22 @@ void BuildStrategyAndCostForReshape(const HloInstruction* ins,
                                     std::unique_ptr<StrategyVector>& strategies,
                                     absl::Span<const int64_t> tensor_dims);
 
-// Enumerate 2D partition for reshape. Batch dim is always partitioned.
-void Enumerate2DPartitionReshape(const HloInstruction* ins,
-                                 const Array<int64_t>& device_mesh,
-                                 const ClusterEnvironment& cluster_env,
-                                 const StrategyMap& strategy_map,
-                                 const InstructionBatchDimMap& batch_dim_map,
-                                 std::unique_ptr<StrategyVector>& strategies,
-                                 bool only_allow_divisible) {
+// Enumerate all partitions for reshape. Batch dim is always partitioned.
+void EnumeratePartitionReshape(const HloInstruction* ins,
+                               const Array<int64_t>& device_mesh,
+                               const ClusterEnvironment& cluster_env,
+                               const StrategyMap& strategy_map,
+                               const InstructionBatchDimMap& batch_dim_map,
+                               std::unique_ptr<StrategyVector>& strategies,
+                               bool only_allow_divisible,
+                               int64_t partition_dimensions,
+                               const std::vector<int64_t>& tensor_dims = {}) {
+  const auto tensor_dims_size = tensor_dims.size();
+  if (tensor_dims_size == partition_dimensions) {
+    BuildStrategyAndCostForReshape(ins, device_mesh, cluster_env, strategy_map,
+                                   strategies, tensor_dims);
+    return;
+  }
   auto iter = batch_dim_map.find(GetBatchDimMapKey(ins));
   int64_t batch_dim = -1;
   if (iter != batch_dim_map.end()) {
@@ -935,23 +950,24 @@ void Enumerate2DPartitionReshape(const HloInstruction* ins,
 
   // Split batch dim + another dim
   for (int64_t i = 0; i < ins->shape().rank(); ++i) {
-    for (int64_t j = 0; j < ins->shape().rank(); ++j) {
-      if ((batch_dim != -1 && !(batch_dim == i || batch_dim == j)) || i == j) {
-        continue;
-      }
-      if (ins->shape().dimensions(i) < device_mesh.dim(0) ||
-          ins->shape().dimensions(j) < device_mesh.dim(1)) {
-        continue;
-      }
-      if (only_allow_divisible &&
-          (!IsDivisible(ins->shape().dimensions(i), device_mesh.dim(0)) ||
-           !IsDivisible(ins->shape().dimensions(j), device_mesh.dim(1)))) {
-        continue;
-      }
-
-      BuildStrategyAndCostForReshape(ins, device_mesh, cluster_env,
-                                     strategy_map, strategies, {i, j});
+    auto tensor_it = std::find(tensor_dims.begin(), tensor_dims.end(), i);
+    if ((batch_dim != -1 && batch_dim != i) || tensor_it != tensor_dims.end()) {
+      continue;
+    }
+    if (ins->shape().dimensions(i) < device_mesh.dim(tensor_dims_size)) {
+      continue;
+    }
+    if (only_allow_divisible &&
+        !IsDivisible(ins->shape().dimensions(i),
+                     device_mesh.dim(tensor_dims_size))) {
+      continue;
     }
+
+    std::vector<int64_t> next_tensor_dims = tensor_dims;
+    next_tensor_dims.push_back(i);
+    EnumeratePartitionReshape(ins, device_mesh, cluster_env, strategy_map,
+                              batch_dim_map, strategies, only_allow_divisible,
+                              partition_dimensions, next_tensor_dims);
   }
 }
 
@@ -1140,9 +1156,9 @@ StatusOr<std::unique_ptr<StrategyVector>> CreateAllStrategiesVector(
       //                for operators with batch dimension. We didn't include
       //                this logic here since this pass might be used for
       //                more general cases.
-      EnumerateAll2DPartition(ins, shape, cluster_env.device_mesh_, cluster_env,
-                              strategy_map, strategies, batch_dim_map,
-                              only_allow_divisible, call_graph);
+      EnumerateAllPartition(ins, shape, cluster_env.device_mesh_, cluster_env,
+                            strategy_map, strategies, batch_dim_map,
+                            only_allow_divisible, call_graph, /*partitions*/ 2);
     }
 
     if (solver_option.allow_mixed_mesh_shape && cluster_env.IsDeviceMesh2D()) {
@@ -1669,10 +1685,10 @@ BuildStrategyAndCost(const HloInstructionSequence& sequence,
                                   cluster_env, strategy_map, strategies,
                                   only_allow_divisible, "", call_graph);
         } else {
-          EnumerateAll2DPartition(ins, ins->shape(), cluster_env.device_mesh_,
-                                  cluster_env, strategy_map, strategies,
-                                  batch_dim_map, only_allow_divisible,
-                                  call_graph);
+          EnumerateAllPartition(ins, ins->shape(), cluster_env.device_mesh_,
+                                cluster_env, strategy_map, strategies,
+                                batch_dim_map, only_allow_divisible, call_graph,
+                                /*partitions*/ 2);
           if (solver_option.allow_mixed_mesh_shape) {
             EnumerateAll1DPartition(ins, ins->shape(),
                                     cluster_env.device_mesh_1d_, cluster_env,
@@ -1751,9 +1767,9 @@ BuildStrategyAndCost(const HloInstructionSequence& sequence,
           }
           if (cluster_env.IsDeviceMesh2D()) {
             // Split 2 dim, one is always the batch dim
-            Enumerate2DPartitionReshape(ins, device_mesh, cluster_env,
-                                        strategy_map, batch_dim_map, strategies,
-                                        only_allow_divisible);
+            EnumeratePartitionReshape(ins, device_mesh, cluster_env,
+                                      strategy_map, batch_dim_map, strategies,
+                                      only_allow_divisible, /*partitions*/ 2);
           }
 
           // Replicate
@@ -2078,9 +2094,9 @@ BuildStrategyAndCost(const HloInstructionSequence& sequence,
         }
         if (cluster_env.IsDeviceMesh2D()) {
           // Split 2 dims
-          EnumerateAll2DPartition(ins, ins->shape(), device_mesh, cluster_env,
-                                  strategy_map, strategies, batch_dim_map,
-                                  only_allow_divisible, call_graph);
+          EnumerateAllPartition(ins, ins->shape(), device_mesh, cluster_env,
+                                strategy_map, strategies, batch_dim_map,
+                                only_allow_divisible, call_graph, /*parts*/ 2);
         }
         if (cluster_env.IsDeviceMesh2D() &&
             solver_option.allow_mixed_mesh_shape) {

From dd4ee66b5c075ae1c2f9b53b568aeed8c6f2c290 Mon Sep 17 00:00:00 2001
From: David Dunleavy <ddunleavy@google.com>
Date: Fri, 29 Sep 2023 13:12:50 -0700
Subject: [PATCH 438/567] Remove redundant sponge link and non-functional
 fusion link from generate_index_html.sh

PiperOrigin-RevId: 569570997
---
 ci/official/utilities/generate_index_html.sh                   | 2 --
 third_party/xla/.kokoro/generate_index_html.sh                 | 2 --
 third_party/xla/third_party/tsl/.kokoro/generate_index_html.sh | 2 --
 3 files changed, 6 deletions(-)

diff --git a/ci/official/utilities/generate_index_html.sh b/ci/official/utilities/generate_index_html.sh
index 16fbe56e65ed50..03e7ec6f861167 100755
--- a/ci/official/utilities/generate_index_html.sh
+++ b/ci/official/utilities/generate_index_html.sh
@@ -37,9 +37,7 @@ cat > "$1" <<EOF
 </ul>
 <h2>Googlers-Only Links</h2>
 <ul>
-<li><a href="http://sponge/$KOKORO_BUILD_ID">Sponge</a></li>
 <li><a href="http://sponge2/$KOKORO_BUILD_ID">Sponge2</a></li>
-<li><a href="http://fusion/$KOKORO_BUILD_ID">Test Fusion</a></li>
 <li><a href="http://sponge/target:$KOKORO_JOB_NAME">Sponge - recent jobs</a></li>
 <li><a href="http://fusion2/ci/kokoro/prod:$(echo "$KOKORO_JOB_NAME" | sed 's!/!%2F!g')">Test Fusion - recent jobs</a></li>
 <li><a href="http://cs/f:devtools/kokoro/config/prod/$KOKORO_JOB_NAME">Codesearch - job definition</a></li>
diff --git a/third_party/xla/.kokoro/generate_index_html.sh b/third_party/xla/.kokoro/generate_index_html.sh
index 7cc206cb61b073..3e58ab2825058e 100755
--- a/third_party/xla/.kokoro/generate_index_html.sh
+++ b/third_party/xla/.kokoro/generate_index_html.sh
@@ -36,9 +36,7 @@ tee "$1" <<EOF
 </ul>
 <h2>Googlers-Only Links</h2>
 <ul>
-<li><a href="http://sponge/$KOKORO_BUILD_ID">Sponge</a></li>
 <li><a href="http://sponge2/$KOKORO_BUILD_ID">Sponge2</a></li>
-<li><a href="http://fusion/$KOKORO_BUILD_ID">Test Fusion</a></li>
 <li><a href="http://sponge/target:$KOKORO_JOB_NAME">Sponge - recent jobs</a></li>
 </ul>
 <h2>Non-Googler Links</h2>
diff --git a/third_party/xla/third_party/tsl/.kokoro/generate_index_html.sh b/third_party/xla/third_party/tsl/.kokoro/generate_index_html.sh
index 203930e35c99a3..8870bb2818e52d 100755
--- a/third_party/xla/third_party/tsl/.kokoro/generate_index_html.sh
+++ b/third_party/xla/third_party/tsl/.kokoro/generate_index_html.sh
@@ -36,9 +36,7 @@ tee "$1" <<EOF
 </ul>
 <h2>Googlers-Only Links</h2>
 <ul>
-<li><a href="http://sponge/$KOKORO_BUILD_ID">Sponge</a></li>
 <li><a href="http://sponge2/$KOKORO_BUILD_ID">Sponge2</a></li>
-<li><a href="http://fusion/$KOKORO_BUILD_ID">Test Fusion</a></li>
 <li><a href="http://sponge/target:$KOKORO_JOB_NAME">Sponge - recent jobs</a></li>
 </ul>
 <h2>Non-Googler Links</h2>

From 3b0a7418fad00d63857cc563ad56f67aa57747b7 Mon Sep 17 00:00:00 2001
From: Quentin Khan <qkhan@google.com>
Date: Fri, 29 Sep 2023 13:30:36 -0700
Subject: [PATCH 439/567] Fix conversion of ops that use options declared in
 BuiltinOption2.

PiperOrigin-RevId: 569575379
---
 .../compiler/mlir/lite/converter_gen.cc       | 74 ++++++++++++++++---
 .../compiler/mlir/lite/flatbuffer_operator.cc |  2 +-
 tensorflow/compiler/mlir/lite/ir/tfl_ops.td   |  2 +
 3 files changed, 66 insertions(+), 12 deletions(-)

diff --git a/tensorflow/compiler/mlir/lite/converter_gen.cc b/tensorflow/compiler/mlir/lite/converter_gen.cc
index f2a667371700ec..09e2e6d7f1cff5 100644
--- a/tensorflow/compiler/mlir/lite/converter_gen.cc
+++ b/tensorflow/compiler/mlir/lite/converter_gen.cc
@@ -87,6 +87,16 @@ static inline bool IsLstmOp(const StringRef op_name) {
   return op_name.take_back(6) == "LSTMOp";
 }
 
+static int HasOptions(const Record &def) {
+  if (def.getValueAsBit("hasOptions")) {
+    return 1;
+  }
+  if (def.getValueAsBit("hasOptions2")) {
+    return 2;
+  }
+  return 0;
+}
+
 static void EmitOptionBuilders(const RecordKeeper &record_keeper,
                                const std::vector<Record *> &defs,
                                raw_ostream *ostream) {
@@ -94,8 +104,11 @@ static void EmitOptionBuilders(const RecordKeeper &record_keeper,
 
   const auto attr_type = record_keeper.getClass("Attr");
   for (const auto *def : defs) {
+    const int has_options = HasOptions(*def);
     // TFLite ops without options are skipped over.
-    if (!def->getValueAsBit("hasOptions")) continue;
+    if (!has_options) {
+      continue;
+    }
 
     StringRef op_name = def->getName().drop_front(4);  // Strip 'TFL_' prefix
     std::string option_name = GetOperatorOptionName(*def);
@@ -200,7 +213,8 @@ static void EmitOperatorBuilders(const std::vector<Record *> &defs,
     // Build the FlatBuffer operator
     os << "  return tflite::CreateOperator(\n"
           "      *fbb, opcode_index, inputs, outputs,\n";
-    if (def->getValueAsBit("hasOptions")) {
+    const int has_options = HasOptions(*def);
+    if (has_options == 1) {
       auto option_name = GetOperatorOptionName(*def);
       std::string tflite_option_name =
           option_name == "BasicLSTMOptions" ? "LSTMOptions" : option_name;
@@ -213,8 +227,19 @@ static void EmitOperatorBuilders(const std::vector<Record *> &defs,
     // used by custom or flex ops and those ops are handled manually.
     os << "      /*custom_options=*/0, "
        << "tflite::CustomOptionsFormat_FLEXBUFFERS,\n"
-       << "      /*mutating_variable_inputs=*/0"
-       << (has_intermediates ? ", intermediates" : "") << ");\n}\n\n";
+       << "      /*mutating_variable_inputs=*/0,"
+       << (has_intermediates ? "intermediates" : "/*intermediates=*/0");
+
+    if (has_options == 2) {
+      os << ",\n"
+         << "      /*large_custom_options_offset=*/0,\n"
+         << "      /*large_custom_options_size=*/0";
+      os << ",\n";
+      const std::string option_name = GetOperatorOptionName(*def);
+      os << "      tflite::BuiltinOptions2_" << option_name << ", "
+         << "Create" << option_name << "(tflOp, fbb).Union()";
+    }
+    os << ");\n}\n\n";
   }
 }
 
@@ -342,24 +367,44 @@ static void EmitBuildOperator(const std::vector<Record *> &defs,
 
 // Emit a function that converts a BuiltinOptionsUnion to a vector of attributes
 // Signature:
-// void mlir::BuiltinOptionsToAttributes(
-//     tflite::BuiltinOptionsUnion op_union,
+// void mlir::BuiltinOptions{id}ToAttributes(
+//     tflite::BuiltinOptions{id}Union op_union,
 //     mlir::Builder builder,
 //     llvm::SmallVectorImpl<mlir::NamedAttribute> &attributes);
+//
+// where id is an empty string if builtin_options_id is 1, or builtin_options_id
+// otherwise.
 static void EmitBuiltinOptionsToAttributes(const RecordKeeper &record_keeper,
                                            const std::vector<Record *> &defs,
-                                           raw_ostream *ostream) {
+                                           raw_ostream *ostream,
+                                           const int builtin_options_id) {
   raw_ostream &os = *ostream;
 
+  const std::string builtin_options_suffix = [&] {
+    switch (builtin_options_id) {
+      case 1:
+        return "";
+      case 2:
+        return "2";
+    }
+    return "UnknownId";
+  }();
+
   // Signature
-  os << "void mlir::BuiltinOptionsToAttributes("
-        "tflite::BuiltinOptionsUnion op_union, "
+  os << "void mlir::BuiltinOptions" << builtin_options_suffix
+     << "ToAttributes("
+        "tflite::BuiltinOptions"
+     << builtin_options_suffix
+     << "Union op_union, "
         "mlir::Builder builder, "
         "llvm::SmallVectorImpl<mlir::NamedAttribute> &attributes) {\n";
 
   const auto attr_type = record_keeper.getClass("Attr");
   for (const auto *def : defs) {
-    if (!def->getValueAsBit("hasOptions")) continue;
+    const int has_options = HasOptions(*def);
+    if (has_options != builtin_options_id) {
+      continue;
+    }
     auto option_name = GetOperatorOptionName(*def);
     // Basic LSTM and LSTM ops share the same option to attribute converter.
     if (option_name == "BasicLSTMOptions") {
@@ -392,9 +437,14 @@ static void EmitBuiltinOptionsToAttributes(const RecordKeeper &record_keeper,
     os << "    return;\n";
     os << "  }\n";
   }
+  if (builtin_options_id == 2) {
+    os << "  BuiltinOptions2ToAttributesManual(op_union, builder, "
+          "attributes);\n";
+  }
   // Fallthrough case is no attributes
   os << "}";
 }
+
 // The function below has a non-constant reference as that is required by LLVM's
 // TableGenMain.
 // NOLINTNEXTLINE
@@ -427,7 +477,9 @@ static bool OperatorWritersMain(raw_ostream &os, RecordKeeper &records) {
   os << "\n\n";
   EmitBuildOperator(defs, &os);
   os << "\n\n";
-  EmitBuiltinOptionsToAttributes(records, defs, &os);
+  EmitBuiltinOptionsToAttributes(records, defs, &os, /*builtin_options_id=*/1);
+  os << "\n\n";
+  EmitBuiltinOptionsToAttributes(records, defs, &os, /*builtin_options_id=*/2);
   os << "\n\n";
   EmitOperandNumbers(records, defs, &os);
 
diff --git a/tensorflow/compiler/mlir/lite/flatbuffer_operator.cc b/tensorflow/compiler/mlir/lite/flatbuffer_operator.cc
index 290b1075f28dba..8f1d7243252812 100644
--- a/tensorflow/compiler/mlir/lite/flatbuffer_operator.cc
+++ b/tensorflow/compiler/mlir/lite/flatbuffer_operator.cc
@@ -386,7 +386,7 @@ Status mlir::CustomOptionsToAttributes(
 
 // TODO(zichuanwei@): Populate Builtin_options_2 manual for now, should automate
 // these in the future
-void mlir::BuiltinOptions2ToAttributes(
+void BuiltinOptions2ToAttributesManual(
     tflite::BuiltinOptions2Union op_union, mlir::Builder builder,
     llvm::SmallVectorImpl<mlir::NamedAttribute>& attributes) {
   if (const auto* op = op_union.AsStablehloConcatenateOptions()) {
diff --git a/tensorflow/compiler/mlir/lite/ir/tfl_ops.td b/tensorflow/compiler/mlir/lite/ir/tfl_ops.td
index 06d8167e1063a4..b5e8fb257fc4a1 100644
--- a/tensorflow/compiler/mlir/lite/ir/tfl_ops.td
+++ b/tensorflow/compiler/mlir/lite/ir/tfl_ops.td
@@ -461,6 +461,8 @@ class TFL_Op<string mnemonic, list<Trait> traits = []> :
 
   // Whether the TFLite operator has options in the schema representation.
   bit hasOptions = 0b0;
+  // Whether the TFLite operator has options2 in the schema representation.
+  bit hasOptions2 = 0b0;
 
   // Use to specify a custom options type for TFLite operators where
   // the option's name does not match the TFLite operator's name.

From 9fe8a78bfe981a2308b7e5fc81f5c74041c7dae7 Mon Sep 17 00:00:00 2001
From: Kanvi Khanna <kanvi.khanna@intel.com>
Date: Fri, 29 Sep 2023 13:41:39 -0700
Subject: [PATCH 440/567] fix test

---
 tensorflow/core/grappler/optimizers/remapper_test.cc | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/tensorflow/core/grappler/optimizers/remapper_test.cc b/tensorflow/core/grappler/optimizers/remapper_test.cc
index c0ac573cf58423..9a01d9825cc6f4 100644
--- a/tensorflow/core/grappler/optimizers/remapper_test.cc
+++ b/tensorflow/core/grappler/optimizers/remapper_test.cc
@@ -1702,12 +1702,10 @@ TEST_F(RemapperFuseMatMulWithBiasTest, F16) {
 TEST_F(RemapperFuseMatMulWithBiasTest, F32) { RunTest<DT_FLOAT>(); }
 
 TEST_F(RemapperFuseMatMulWithBiasTest, Bf16) {
-#if !defined(ENABLE_MKL)
-  if (!mkl_op_registry::IsBF16SupportedByOneDNNOnThisCPU())
+  if (!IsMKLEnabled() || !mkl_op_registry::IsBF16SupportedByOneDNNOnThisCPU())
     GTEST_SKIP()
         << "Intel oneDNN with bfloat16 support is not enabled, skipping "
            "FuseMatMulWithBias with bfloat16.";
-#endif
   RunTest<DT_BFLOAT16>();  // NOLINT
 }
 
@@ -1914,12 +1912,10 @@ TEST_F(RemapperFuseMatMulWithBiasAndActivationTest, F32) {
 }
 
 TEST_F(RemapperFuseMatMulWithBiasAndActivationTest, Bf16) {
-#if !defined(ENABLE_MKL)
-  if (!mkl_op_registry::IsBF16SupportedByOneDNNOnThisCPU())
+  if (!IsMKLEnabled() || !mkl_op_registry::IsBF16SupportedByOneDNNOnThisCPU())
     GTEST_SKIP()
         << "Intel oneDNN with bfloat16 support is not enabled, skipping "
            "FuseMatMulWithBiasAndActivation with bfloat16.";
-#endif
   RunTest<DT_BFLOAT16>();  // NOLINT
 }
 

From af3421baf53759a776c375228e5e8e72a5103eeb Mon Sep 17 00:00:00 2001
From: Michael Hudgins <michaelhudgins@google.com>
Date: Fri, 29 Sep 2023 14:21:38 -0700
Subject: [PATCH 441/567] Reenable flaky test flag for 2.15 branch cut and
 release.  This will be reverted at HEAD post branch cut.

PiperOrigin-RevId: 569587811
---
 .bazelrc                                           | 2 ++
 tensorflow/tools/ci_build/osx/arm64/.macos.bazelrc | 3 +++
 third_party/xla/.bazelrc                           | 2 ++
 third_party/xla/third_party/tsl/.bazelrc           | 2 ++
 4 files changed, 9 insertions(+)

diff --git a/.bazelrc b/.bazelrc
index 9691d264a24a8f..7cddd444e396fc 100644
--- a/.bazelrc
+++ b/.bazelrc
@@ -561,6 +561,8 @@ try-import %workspace%/.bazelrc.user
 # Here are bazelrc configs for release builds
 # Build TensorFlow v2.
 test:release_base --test_size_filters=small,medium
+# TODO(b/294367488) disable after 2.15 brancut
+test:release_base --flaky_test_attempts=3
 
 # Target the AVX instruction set
 build:release_cpu_linux --config=avx_linux
diff --git a/tensorflow/tools/ci_build/osx/arm64/.macos.bazelrc b/tensorflow/tools/ci_build/osx/arm64/.macos.bazelrc
index c388f5322abf07..89dd80803ba9b3 100644
--- a/tensorflow/tools/ci_build/osx/arm64/.macos.bazelrc
+++ b/tensorflow/tools/ci_build/osx/arm64/.macos.bazelrc
@@ -23,6 +23,9 @@ build --action_env MACOSX_DEPLOYMENT_TARGET=12.0
 # Test-related settings below this point.
 test --verbose_failures=true --local_test_jobs=HOST_CPUS --test_output=errors
 
+# TODO(b/294367488) disable after 2.15 brancut
+test --flaky_test_attempts=3
+
 # Increase the test timeout as tests often take longer on mac.
 test --test_timeout=300,450,1200,3600
 test --test_size_filters=small,medium
diff --git a/third_party/xla/.bazelrc b/third_party/xla/.bazelrc
index 9691d264a24a8f..7cddd444e396fc 100644
--- a/third_party/xla/.bazelrc
+++ b/third_party/xla/.bazelrc
@@ -561,6 +561,8 @@ try-import %workspace%/.bazelrc.user
 # Here are bazelrc configs for release builds
 # Build TensorFlow v2.
 test:release_base --test_size_filters=small,medium
+# TODO(b/294367488) disable after 2.15 brancut
+test:release_base --flaky_test_attempts=3
 
 # Target the AVX instruction set
 build:release_cpu_linux --config=avx_linux
diff --git a/third_party/xla/third_party/tsl/.bazelrc b/third_party/xla/third_party/tsl/.bazelrc
index 9691d264a24a8f..7cddd444e396fc 100644
--- a/third_party/xla/third_party/tsl/.bazelrc
+++ b/third_party/xla/third_party/tsl/.bazelrc
@@ -561,6 +561,8 @@ try-import %workspace%/.bazelrc.user
 # Here are bazelrc configs for release builds
 # Build TensorFlow v2.
 test:release_base --test_size_filters=small,medium
+# TODO(b/294367488) disable after 2.15 brancut
+test:release_base --flaky_test_attempts=3
 
 # Target the AVX instruction set
 build:release_cpu_linux --config=avx_linux

From bb181e5019d03eef43741e68e63045dc450ed70d Mon Sep 17 00:00:00 2001
From: Yang Chen <yangchen@google.com>
Date: Fri, 29 Sep 2023 14:43:00 -0700
Subject: [PATCH 442/567] #tf-data-service Increase default client timeout to 5
 minutes.

Otherwise, if the dispatcher or client restarts, this can lead
to false client cleanup.

PiperOrigin-RevId: 569592616
---
 tensorflow/core/data/service/dispatcher_impl.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/core/data/service/dispatcher_impl.cc b/tensorflow/core/data/service/dispatcher_impl.cc
index 89fcb140272c79..f1db0aaa06a54a 100644
--- a/tensorflow/core/data/service/dispatcher_impl.cc
+++ b/tensorflow/core/data/service/dispatcher_impl.cc
@@ -95,7 +95,7 @@ constexpr int kDefaultWorkerMaxConcurrentSnapshots = 2;
 
 constexpr absl::Duration kDefaultIterationGcCheckInterval = absl::Minutes(10);
 constexpr absl::Duration kDefaultIterationGcTimeout = absl::Minutes(5);
-constexpr absl::Duration kDefaultClientTimeout = absl::Minutes(2);
+constexpr absl::Duration kDefaultClientTimeout = absl::Minutes(5);
 constexpr absl::Duration kDefaultWorkerTimeout = absl::Minutes(10);
 
 constexpr std::array<const char*, 8> kNodeNameSharingOps = {

From c47518f0239a74dbed900a20d8cd6b89f4b2ef70 Mon Sep 17 00:00:00 2001
From: Vladislav <vkozlov@nvidia.com>
Date: Fri, 29 Sep 2023 15:01:05 -0700
Subject: [PATCH 443/567] =?UTF-8?q?PR=20#5855:=20[NVIDIA:GPU]=20Add=20XLA?=
 =?UTF-8?q?=20compilations=20flags=20per=20this=20discussion=20in=20this?=
 =?UTF-8?q?=20thread=20https:/=E2=80=A6?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Imported from GitHub PR https://github.com/openxla/xla/pull/5855

Add XLA compilation flags per the discussion in this thread: https://mail.google.com/chat/\\#chat/space/AAAAQuUms_4/6XkLzS2sEUA

There is currently a lot of ongoing work on XLA sharding. To keep the tests maintainable, we are considering disabling all async collectives. To do so, we need to be able to control the following flags:

xla_gpu_enable_async_all_reduce
xla_gpu_enable_async_all_gather
xla_gpu_enable_async_collective_permute
xla_gpu_enable_async_all_to_all
xla_gpu_enable_async_reduce_scatter

By setting them to true/false, we can turn the related collectives on or off.
Copybara import of the project:

--
a01eb833e9b2185094dff86a1869c7dc82905425 by Vladislav Kozlov <vkozlov@nvidia.com>:

Add XLA compilations flags per this discussion in this thread https://mail.google.com/chat/\\#chat/space/AAAAQuUms_4/6XkLzS2sEUA:
As of now there are tons of works in XLA sharding. To have the tests maintainable, we were thinking to disable all async collectives. To do so we need to be able to control the following flags:

xla_gpu_enable_async_all_reduce
xla_gpu_enable_async_all_gather
xla_gpu_enable_async_collective_permute
xla_gpu_enable_async_all_to_all
xla_gpu_enable_async_reduce_scatter

Setting them to true/false we can on or off related collectives.

Merging this change closes #5855

PiperOrigin-RevId: 569597114
---
 third_party/xla/xla/python/xla_compiler.cc      | 17 ++++++++++++++++-
 .../xla/xla/python/xla_extension/__init__.pyi   |  5 +++++
 2 files changed, 21 insertions(+), 1 deletion(-)

diff --git a/third_party/xla/xla/python/xla_compiler.cc b/third_party/xla/xla/python/xla_compiler.cc
index 4c18c7ba72b4a5..4b752c02698344 100644
--- a/third_party/xla/xla/python/xla_compiler.cc
+++ b/third_party/xla/xla/python/xla_compiler.cc
@@ -924,7 +924,22 @@ void BuildXlaCompilerSubmodule(py::module& m) {
                     &DebugOptions::xla_dump_hlo_pipeline_re,
                     [](DebugOptions* self, std::string value) {
                       self->set_xla_dump_hlo_pipeline_re(value);
-                    });
+                    })
+      .def_property("xla_gpu_enable_async_all_reduce",
+                    &DebugOptions::xla_gpu_enable_async_all_reduce,
+                    &DebugOptions::set_xla_gpu_enable_async_all_reduce)
+      .def_property("xla_gpu_enable_async_all_gather",
+                    &DebugOptions::xla_gpu_enable_async_all_gather,
+                    &DebugOptions::set_xla_gpu_enable_async_all_gather)
+      .def_property("xla_gpu_enable_async_collective_permute",
+                    &DebugOptions::xla_gpu_enable_async_collective_permute,
+                    &DebugOptions::set_xla_gpu_enable_async_collective_permute)
+      .def_property("xla_gpu_enable_async_all_to_all",
+                    &DebugOptions::xla_gpu_enable_async_all_to_all,
+                    &DebugOptions::set_xla_gpu_enable_async_all_to_all)
+      .def_property("xla_gpu_enable_async_reduce_scatter",
+                    &DebugOptions::xla_gpu_enable_async_reduce_scatter,
+                    &DebugOptions::set_xla_gpu_enable_async_reduce_scatter);
 
   py::class_<ExecutableBuildOptions>(m, "ExecutableBuildOptions")
       .def(py::init<>())
diff --git a/third_party/xla/xla/python/xla_extension/__init__.pyi b/third_party/xla/xla/python/xla_extension/__init__.pyi
index d30dc0cb857a29..31615b9af2cac5 100644
--- a/third_party/xla/xla/python/xla_extension/__init__.pyi
+++ b/third_party/xla/xla/python/xla_extension/__init__.pyi
@@ -272,6 +272,11 @@ class DebugOptions:
   xla_dump_hlo_as_long_text: bool
   xla_dump_disable_metadata: bool
   xla_dump_hlo_pipeline_re: str
+  xla_gpu_enable_async_all_reduce: bool
+  xla_gpu_enable_async_all_gather: bool
+  xla_gpu_enable_async_collective_permute: bool
+  xla_gpu_enable_async_all_to_all: bool
+  xla_gpu_enable_async_reduce_scatter: bool
 
 class CompiledMemoryStats:
   generated_code_size_in_bytes: int

From 76123afecc091e201cf53bc2606a3f5a5280c1d8 Mon Sep 17 00:00:00 2001
From: Dmitri Gribenko <dmitrig@google.com>
Date: Fri, 29 Sep 2023 15:26:47 -0700
Subject: [PATCH 444/567] Remove the long-deprecated type gtl::optional

PiperOrigin-RevId: 569602891
---
 tensorflow/core/BUILD                         |  1 -
 tensorflow/core/lib/gtl/BUILD                 |  9 -----
 tensorflow/core/lib/gtl/optional.h            | 33 -------------------
 tensorflow/lite/special_rules.bzl             |  1 -
 .../tsl/tsl/platform/default/build_config.bzl |  1 -
 5 files changed, 45 deletions(-)
 delete mode 100644 tensorflow/core/lib/gtl/optional.h

diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD
index 1a18c391a3b3e1..14f712fa288b51 100644
--- a/tensorflow/core/BUILD
+++ b/tensorflow/core/BUILD
@@ -1338,7 +1338,6 @@ cc_library(
         "//tensorflow/core/lib/gtl:iterator_range",
         "//tensorflow/core/lib/gtl:manual_constructor",
         "//tensorflow/core/lib/gtl:map_util",
-        "//tensorflow/core/lib/gtl:optional",
         "//tensorflow/core/lib/gtl:priority_queue_util",
         "//tensorflow/core/lib/gtl:top_n",
         "//tensorflow/core/lib/hash",
diff --git a/tensorflow/core/lib/gtl/BUILD b/tensorflow/core/lib/gtl/BUILD
index e7bf7ecd4f7672..d03c47b594f637 100644
--- a/tensorflow/core/lib/gtl/BUILD
+++ b/tensorflow/core/lib/gtl/BUILD
@@ -143,12 +143,6 @@ cc_library(
     ],
 )
 
-cc_library(
-    name = "optional",
-    hdrs = ["optional.h"],
-    deps = ["@com_google_absl//absl/types:optional"],
-)
-
 cc_library(
     name = "priority_queue_util",
     hdrs = ["priority_queue_util.h"],
@@ -171,7 +165,6 @@ filegroup(
         "flatset.h",
         "inlined_vector.h",
         "iterator_range.h",
-        "optional.h",
         "priority_queue_util.h",
         "@local_tsl//tsl/lib/gtl:legacy_lib_gtl_headers",
     ],
@@ -236,7 +229,6 @@ filegroup(
         "iterator_range.h",
         "manual_constructor.h",
         "map_util.h",
-        "optional.h",
         "priority_queue_util.h",
         "//tensorflow/core/lib/gtl/subtle:map_traits",
         "@local_tsl//tsl/lib/gtl:mobile_srcs_only_runtime",
@@ -259,7 +251,6 @@ filegroup(
         "iterator_range.h",
         "manual_constructor.h",
         "map_util.h",
-        "optional.h",
         "priority_queue_util.h",
         "top_n.h",
         "//tensorflow/core/lib/gtl/subtle:map_traits",
diff --git a/tensorflow/core/lib/gtl/optional.h b/tensorflow/core/lib/gtl/optional.h
deleted file mode 100644
index 238aa18e1e8540..00000000000000
--- a/tensorflow/core/lib/gtl/optional.h
+++ /dev/null
@@ -1,33 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef TENSORFLOW_CORE_LIB_GTL_OPTIONAL_H_
-#define TENSORFLOW_CORE_LIB_GTL_OPTIONAL_H_
-
-#include "absl/types/optional.h"
-
-namespace tensorflow {
-namespace gtl {
-
-// Deprecated: please use absl::optional directly.
-using absl::make_optional;
-using absl::nullopt;
-template <typename T>
-using optional = absl::optional<T>;
-
-}  // namespace gtl
-}  // namespace tensorflow
-
-#endif  // TENSORFLOW_CORE_LIB_GTL_OPTIONAL_H_
diff --git a/tensorflow/lite/special_rules.bzl b/tensorflow/lite/special_rules.bzl
index 3d3864a8166b14..83d1a65bfedc3d 100644
--- a/tensorflow/lite/special_rules.bzl
+++ b/tensorflow/lite/special_rules.bzl
@@ -142,7 +142,6 @@ def flex_portable_tensorflow_deps():
         "@com_google_absl//absl/log:check",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/strings:str_format",
-        "@com_google_absl//absl/types:optional",
         "@gemmlowp",
         "@icu//:common",
         "//third_party/icu/data:conversion_data",
diff --git a/third_party/xla/third_party/tsl/tsl/platform/default/build_config.bzl b/third_party/xla/third_party/tsl/tsl/platform/default/build_config.bzl
index f582b07c112a88..3fe7b2069a1bdf 100644
--- a/third_party/xla/third_party/tsl/tsl/platform/default/build_config.bzl
+++ b/third_party/xla/third_party/tsl/tsl/platform/default/build_config.bzl
@@ -715,7 +715,6 @@ def tf_additional_lib_deps():
         "@com_google_absl//absl/base:base",
         "@com_google_absl//absl/container:inlined_vector",
         "@com_google_absl//absl/types:span",
-        "@com_google_absl//absl/types:optional",
     ] + if_static(
         [clean_dep("@nsync//:nsync_cpp")],
         [clean_dep("@nsync//:nsync_headers")],

From 41c80fa0144db695c39a3e255f2263ee4eea0d04 Mon Sep 17 00:00:00 2001
From: Haibo Huang <hhb@google.com>
Date: Fri, 29 Sep 2023 16:21:36 -0700
Subject: [PATCH 445/567] Use dladdr1() to get the full path of current module

PiperOrigin-RevId: 569614887
---
 tensorflow/core/tpu/tpu_api_dlsym_initializer.cc | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/tensorflow/core/tpu/tpu_api_dlsym_initializer.cc b/tensorflow/core/tpu/tpu_api_dlsym_initializer.cc
index a6d587b27abbe0..020a01a5157ec4 100644
--- a/tensorflow/core/tpu/tpu_api_dlsym_initializer.cc
+++ b/tensorflow/core/tpu/tpu_api_dlsym_initializer.cc
@@ -76,10 +76,12 @@ absl::Status InitializeTpuLibrary(void* library_handle) {
 // Gets the path of current module. It is usually tensorflow_framework.so.
 const char* GetCurrentModulePath() {
   Dl_info DlInfo;
-  if (!dladdr((void*)GetCurrentModulePath, &DlInfo)) {
+  struct link_map* linkmap = nullptr;
+  if (!dladdr1((void*)GetCurrentModulePath, &DlInfo,
+               reinterpret_cast<void**>(&linkmap), RTLD_DL_LINKMAP)) {
     return nullptr;
   }
-  return DlInfo.dli_fname;
+  return linkmap->l_name;
 }
 
 absl::Status FindAndLoadTpuLibrary() {

From 94bde4690ca299ec754a3056830f5413a30cc006 Mon Sep 17 00:00:00 2001
From: Fiona Lang <flang@google.com>
Date: Fri, 29 Sep 2023 16:38:14 -0700
Subject: [PATCH 446/567] Delete dtensor LazyLoader instances now that
 python/__init__.py has been emptied.

PiperOrigin-RevId: 569618035
---
 tensorflow/python/ops/BUILD             |  2 --
 tensorflow/python/ops/array_ops.py      | 11 +----------
 tensorflow/python/ops/summary_ops_v2.py | 15 ++-------------
 3 files changed, 3 insertions(+), 25 deletions(-)

diff --git a/tensorflow/python/ops/BUILD b/tensorflow/python/ops/BUILD
index 5de6f151ebd042..4253d8cf1ce863 100644
--- a/tensorflow/python/ops/BUILD
+++ b/tensorflow/python/ops/BUILD
@@ -798,7 +798,6 @@ py_strict_library(
         "//tensorflow/python/util:_pywrap_utils",
         "//tensorflow/python/util:deprecation",
         "//tensorflow/python/util:dispatch",
-        "//tensorflow/python/util:lazy_loader",
         "//tensorflow/python/util:nest",
         "//tensorflow/python/util:tf_decorator",
         "//tensorflow/python/util:tf_export",
@@ -2987,7 +2986,6 @@ py_strict_library(
         "//tensorflow/python/trackable:resource",
         "//tensorflow/python/training:training_util",
         "//tensorflow/python/util:deprecation",
-        "//tensorflow/python/util:lazy_loader",
         "//tensorflow/python/util:tf_decorator",
         "//tensorflow/python/util:tf_export",
     ],
diff --git a/tensorflow/python/ops/array_ops.py b/tensorflow/python/ops/array_ops.py
index b0074775d3877c..ab2eedaaadb589 100644
--- a/tensorflow/python/ops/array_ops.py
+++ b/tensorflow/python/ops/array_ops.py
@@ -19,6 +19,7 @@
 import numpy as np
 
 from tensorflow.core.config import flags
+from tensorflow.dtensor.python import api as d_api
 from tensorflow.python.eager import context
 from tensorflow.python.eager import record
 from tensorflow.python.framework import common_shapes
@@ -49,19 +50,9 @@
 from tensorflow.python.util import dispatch
 from tensorflow.python.util import nest
 from tensorflow.python.util import tf_decorator
-from tensorflow.python.util.lazy_loader import LazyLoader
 from tensorflow.python.util.tf_export import tf_export
 # pylint: enable=wildcard-import
 
-# TODO(b/282205877): Eliminate this LazyLoader.
-# DTensor doesn't depend on array_ops.py, but it imports tensorflow.python,
-# which imports array_ops.py, creating a cyclic import without a cyclic BUILD
-# dependency. The import cycle creates errors in some unit tests, but not
-# always.
-d_api = LazyLoader(
-    "api", globals(), "tensorflow.dtensor.python.api"
-)
-
 # Used for slicing to specify a new 1 size dimension
 newaxis = None
 tf_export("newaxis").export_constant(__name__, "newaxis")
diff --git a/tensorflow/python/ops/summary_ops_v2.py b/tensorflow/python/ops/summary_ops_v2.py
index ffeaaa045e913e..fcfa8a8b18c260 100644
--- a/tensorflow/python/ops/summary_ops_v2.py
+++ b/tensorflow/python/ops/summary_ops_v2.py
@@ -25,6 +25,8 @@
 from tensorflow.core.framework import graph_pb2
 from tensorflow.core.framework import summary_pb2
 from tensorflow.core.protobuf import config_pb2
+from tensorflow.dtensor.python import api as dtensor_api
+from tensorflow.dtensor.python import layout as layout_lib
 from tensorflow.python.eager import context
 from tensorflow.python.eager import profiler as _profiler
 from tensorflow.python.framework import constant_op
@@ -45,22 +47,9 @@
 from tensorflow.python.training import training_util
 from tensorflow.python.util import deprecation
 from tensorflow.python.util import tf_contextlib
-from tensorflow.python.util.lazy_loader import LazyLoader
 from tensorflow.python.util.tf_export import tf_export
 
 
-# TODO(b/282205877): Eliminate this LazyLoader.
-# DTensor doesn't depend on summary_ops_v2.py, but it imports tensorflow.python,
-# which imports summary_ops_v2.py, creating a cyclic import without a cyclic
-# BUILD dependency. The import cycle creates errors in some unit tests, but not
-# always.
-dtensor_api = LazyLoader(
-    "api", globals(), "tensorflow.dtensor.python.api"
-)
-layout_lib = LazyLoader(
-    "layout", globals(), "tensorflow.dtensor.python.layout"
-)
-
 # Name for graph collection of summary writer init ops, which is only exposed
 # as a legacy API for tf.contrib.summary in TF 1.x.
 _SUMMARY_WRITER_INIT_COLLECTION_NAME = "_SUMMARY_WRITER_V2"

From b476f9d220c8e6b922da3d39b226248df07620ca Mon Sep 17 00:00:00 2001
From: Jieying Luo <jieying@google.com>
Date: Fri, 29 Sep 2023 17:06:07 -0700
Subject: [PATCH 447/567] Move some common methods to PjrtCApiTestBase.

PiperOrigin-RevId: 569623763
---
 third_party/xla/xla/pjrt/c/BUILD              |  7 +-
 third_party/xla/xla/pjrt/c/pjrt_c_api_test.cc | 78 ---------------
 .../xla/xla/pjrt/c/pjrt_c_api_test_base.cc    | 94 ++++++++++++++++++-
 .../xla/xla/pjrt/c/pjrt_c_api_test_base.h     | 22 +++++
 4 files changed, 121 insertions(+), 80 deletions(-)

diff --git a/third_party/xla/xla/pjrt/c/BUILD b/third_party/xla/xla/pjrt/c/BUILD
index 83c8c354d38231..3838890340f678 100644
--- a/third_party/xla/xla/pjrt/c/BUILD
+++ b/third_party/xla/xla/pjrt/c/BUILD
@@ -203,8 +203,13 @@ cc_library(
     visibility = ["//visibility:public"],
     deps = [
         ":pjrt_c_api_hdrs",
-        ":pjrt_c_api_wrapper_impl",
+        ":pjrt_c_api_helpers",
+        "//xla:shape_util",
         "//xla/pjrt:pjrt_client",
+        "//xla/pjrt:pjrt_future",
+        "@com_google_absl//absl/log:check",
+        "@com_google_absl//absl/status",
+        "@com_google_absl//absl/types:span",
         "@com_google_googletest//:gtest_main",
     ],
 )
diff --git a/third_party/xla/xla/pjrt/c/pjrt_c_api_test.cc b/third_party/xla/xla/pjrt/c/pjrt_c_api_test.cc
index c5e9ff2dd798ee..146d4dff7ccfc7 100644
--- a/third_party/xla/xla/pjrt/c/pjrt_c_api_test.cc
+++ b/third_party/xla/xla/pjrt/c/pjrt_c_api_test.cc
@@ -182,23 +182,6 @@ class PjrtCApiTest : public PjrtCApiTestBase {
 
   int GetNumDevices() const { return GetClientDevices().size(); }
 
-  absl::Span<PJRT_Device*> GetClientAddressableDevices() const {
-    PJRT_Client_AddressableDevices_Args addr_args;
-    addr_args.struct_size = PJRT_Client_AddressableDevices_Args_STRUCT_SIZE;
-    addr_args.priv = nullptr;
-    addr_args.client = client_;
-    PJRT_Error* error = api_->PJRT_Client_AddressableDevices(&addr_args);
-    CHECK(error == nullptr);
-    return absl::MakeSpan(addr_args.addressable_devices,
-                          addr_args.num_addressable_devices);
-  }
-
-  std::unique_ptr<PJRT_Error, ::pjrt::PJRT_ErrorDeleter> ToUniquePtr(
-      PJRT_Error* error) {
-    return std::unique_ptr<PJRT_Error, ::pjrt::PJRT_ErrorDeleter>{
-        error, ::pjrt::MakeErrorDeleter(api_)};
-  }
-
   std::string BuildSingleDeviceCompileOptionStr() {
     xla::ExecutableBuildOptions build_options;
     build_options.set_device_ordinal(0);
@@ -212,67 +195,6 @@ class PjrtCApiTest : public PjrtCApiTestBase {
     return options_proto->SerializeAsString();
   }
 
-  PJRT_Client_BufferFromHostBuffer_Args CreateBufferFromHostBufferArgs(
-      const std::vector<float>& data, const xla::Shape& shape,
-      const xla::PjRtClient::HostBufferSemantics host_buffer_semantics,
-      PJRT_Device* device = nullptr) {
-    PJRT_Client_BufferFromHostBuffer_Args args;
-    args.struct_size = PJRT_Client_BufferFromHostBuffer_Args_STRUCT_SIZE;
-    args.priv = nullptr;
-
-    args.data = data.data();
-    args.type = ::pjrt::ConvertToPjRtBufferType(shape.element_type());
-    args.dims = shape.dimensions().data();
-    args.num_dims = shape.dimensions().size();
-    args.byte_strides = nullptr;
-    args.num_byte_strides = 0;
-    args.device_layout = nullptr;
-    args.host_buffer_semantics =
-        ::pjrt::ConvertToPjRtHostBufferSemantics(host_buffer_semantics);
-    args.client = client_;
-    if (device == nullptr) {
-      device = GetClientAddressableDevices()[0];
-    }
-    args.device = device;
-    args.memory = nullptr;
-    return args;
-  }
-
-  std::pair<std::unique_ptr<PJRT_Buffer, ::pjrt::PJRT_BufferDeleter>,
-            xla::PjRtFuture<absl::Status>>
-  create_buffer(PJRT_Device* device = nullptr) {
-    xla::Shape shape = xla::ShapeUtil::MakeShapeWithType<float>({4});
-    std::vector<float> float_data(4);
-    std::iota(float_data.begin(), float_data.end(), 41.0f);
-
-    PJRT_Client_BufferFromHostBuffer_Args args = CreateBufferFromHostBufferArgs(
-        float_data, shape,
-        xla::PjRtClient::HostBufferSemantics::kImmutableOnlyDuringCall, device);
-
-    auto transfer_error =
-        ToUniquePtr(api_->PJRT_Client_BufferFromHostBuffer(&args));
-    EXPECT_EQ(transfer_error, nullptr);
-
-    std::unique_ptr<PJRT_Buffer, ::pjrt::PJRT_BufferDeleter> buffer(
-        args.buffer, ::pjrt::MakeBufferDeleter(api_));
-
-    std::unique_ptr<PJRT_Event, ::pjrt::PJRT_EventDeleter>
-        done_with_host_buffer_event(args.done_with_host_buffer,
-                                    ::pjrt::MakeEventDeleter(api_));
-
-    PJRT_Buffer_ReadyEvent_Args get_event_args;
-    get_event_args.struct_size = PJRT_Buffer_ReadyEvent_Args_STRUCT_SIZE;
-    get_event_args.priv = nullptr;
-    get_event_args.buffer = buffer.get();
-    auto ready_event_error =
-        ToUniquePtr(api_->PJRT_Buffer_ReadyEvent(&get_event_args));
-    EXPECT_EQ(ready_event_error, nullptr);
-    xla::PjRtFuture<absl::Status> buffer_ready_event =
-        ::pjrt::ConvertCEventToCppFuture(get_event_args.event, api_);
-
-    return std::make_pair(std::move(buffer), buffer_ready_event);
-  }
-
   // Returns a scalar result of execution.
   // supply as e.g. `src_buffer = args.output_lists[0][0];`
   // after calling `api_->PJRT_LoadedExecutable_Execute(&args);`
diff --git a/third_party/xla/xla/pjrt/c/pjrt_c_api_test_base.cc b/third_party/xla/xla/pjrt/c/pjrt_c_api_test_base.cc
index 73bfad8c989e0f..f7245c9973fb17 100644
--- a/third_party/xla/xla/pjrt/c/pjrt_c_api_test_base.cc
+++ b/third_party/xla/xla/pjrt/c/pjrt_c_api_test_base.cc
@@ -15,8 +15,21 @@ limitations under the License.
 
 #include "xla/pjrt/c/pjrt_c_api_test_base.h"
 
+#include <memory>
+#include <numeric>
+#include <utility>
+#include <vector>
+
+#include <gtest/gtest.h>
+#include "absl/log/check.h"
+#include "absl/status/status.h"
+#include "absl/types/span.h"
 #include "xla/pjrt/c/pjrt_c_api.h"
-#include "xla/pjrt/c/pjrt_c_api_wrapper_impl.h"
+#include "xla/pjrt/c/pjrt_c_api_helpers.h"
+#include "xla/pjrt/pjrt_client.h"
+#include "xla/pjrt/pjrt_future.h"
+#include "xla/shape.h"
+#include "xla/shape_util.h"
 
 namespace pjrt {
 namespace {
@@ -55,4 +68,83 @@ void PjrtCApiTestBase::destroy_client(PJRT_Client* client) {
   CHECK_EQ(error, nullptr);
 }
 
+absl::Span<PJRT_Device*> PjrtCApiTestBase::GetClientAddressableDevices() const {
+  PJRT_Client_AddressableDevices_Args addr_args;
+  addr_args.struct_size = PJRT_Client_AddressableDevices_Args_STRUCT_SIZE;
+  addr_args.priv = nullptr;
+  addr_args.client = client_;
+  PJRT_Error* error = api_->PJRT_Client_AddressableDevices(&addr_args);
+  CHECK(error == nullptr);
+  return absl::MakeSpan(addr_args.addressable_devices,
+                        addr_args.num_addressable_devices);
+}
+
+PJRT_Client_BufferFromHostBuffer_Args
+PjrtCApiTestBase::CreateBufferFromHostBufferArgs(
+    const std::vector<float>& data, const xla::Shape& shape,
+    const xla::PjRtClient::HostBufferSemantics host_buffer_semantics,
+    PJRT_Device* device) {
+  PJRT_Client_BufferFromHostBuffer_Args args;
+  args.struct_size = PJRT_Client_BufferFromHostBuffer_Args_STRUCT_SIZE;
+  args.priv = nullptr;
+
+  args.data = data.data();
+  args.type = ::pjrt::ConvertToPjRtBufferType(shape.element_type());
+  args.dims = shape.dimensions().data();
+  args.num_dims = shape.dimensions().size();
+  args.byte_strides = nullptr;
+  args.num_byte_strides = 0;
+  args.device_layout = nullptr;
+  args.host_buffer_semantics =
+      ::pjrt::ConvertToPjRtHostBufferSemantics(host_buffer_semantics);
+  args.client = client_;
+  if (device == nullptr) {
+    device = GetClientAddressableDevices()[0];
+  }
+  args.device = device;
+  args.memory = nullptr;
+  return args;
+}
+
+std::pair<std::unique_ptr<PJRT_Buffer, ::pjrt::PJRT_BufferDeleter>,
+          xla::PjRtFuture<absl::Status>>
+PjrtCApiTestBase::create_buffer(PJRT_Device* device) {
+  xla::Shape shape = xla::ShapeUtil::MakeShapeWithType<float>({4});
+  std::vector<float> float_data(4);
+  std::iota(float_data.begin(), float_data.end(), 41.0f);
+
+  PJRT_Client_BufferFromHostBuffer_Args args = CreateBufferFromHostBufferArgs(
+      float_data, shape,
+      xla::PjRtClient::HostBufferSemantics::kImmutableOnlyDuringCall, device);
+
+  auto transfer_error =
+      ToUniquePtr(api_->PJRT_Client_BufferFromHostBuffer(&args));
+  EXPECT_EQ(transfer_error, nullptr);
+
+  std::unique_ptr<PJRT_Buffer, ::pjrt::PJRT_BufferDeleter> buffer(
+      args.buffer, ::pjrt::MakeBufferDeleter(api_));
+
+  std::unique_ptr<PJRT_Event, ::pjrt::PJRT_EventDeleter>
+      done_with_host_buffer_event(args.done_with_host_buffer,
+                                  ::pjrt::MakeEventDeleter(api_));
+
+  PJRT_Buffer_ReadyEvent_Args get_event_args;
+  get_event_args.struct_size = PJRT_Buffer_ReadyEvent_Args_STRUCT_SIZE;
+  get_event_args.priv = nullptr;
+  get_event_args.buffer = buffer.get();
+  auto ready_event_error =
+      ToUniquePtr(api_->PJRT_Buffer_ReadyEvent(&get_event_args));
+  EXPECT_EQ(ready_event_error, nullptr);
+  xla::PjRtFuture<absl::Status> buffer_ready_event =
+      ::pjrt::ConvertCEventToCppFuture(get_event_args.event, api_);
+
+  return std::make_pair(std::move(buffer), buffer_ready_event);
+}
+
+std::unique_ptr<PJRT_Error, ::pjrt::PJRT_ErrorDeleter>
+PjrtCApiTestBase::ToUniquePtr(PJRT_Error* error) {
+  return std::unique_ptr<PJRT_Error, ::pjrt::PJRT_ErrorDeleter>{
+      error, ::pjrt::MakeErrorDeleter(api_)};
+}
+
 }  // namespace pjrt
diff --git a/third_party/xla/xla/pjrt/c/pjrt_c_api_test_base.h b/third_party/xla/xla/pjrt/c/pjrt_c_api_test_base.h
index 6792c7350fdd89..cb967756bc5510 100644
--- a/third_party/xla/xla/pjrt/c/pjrt_c_api_test_base.h
+++ b/third_party/xla/xla/pjrt/c/pjrt_c_api_test_base.h
@@ -13,8 +13,16 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+#include <memory>
+#include <utility>
+#include <vector>
+
 #include <gtest/gtest.h>
+#include "absl/types/span.h"
 #include "xla/pjrt/c/pjrt_c_api.h"
+#include "xla/pjrt/c/pjrt_c_api_helpers.h"
+#include "xla/pjrt/pjrt_client.h"
+#include "xla/shape.h"
 
 #ifndef XLA_PJRT_C_PJRT_C_API_TEST_BASE_H_
 #define XLA_PJRT_C_PJRT_C_API_TEST_BASE_H_
@@ -31,6 +39,20 @@ class PjrtCApiTestBase : public ::testing::Test {
   PJRT_Client* client_;
   void destroy_client(PJRT_Client* client);
 
+  absl::Span<PJRT_Device*> GetClientAddressableDevices() const;
+
+  PJRT_Client_BufferFromHostBuffer_Args CreateBufferFromHostBufferArgs(
+      const std::vector<float>& data, const xla::Shape& shape,
+      xla::PjRtClient::HostBufferSemantics host_buffer_semantics,
+      PJRT_Device* device = nullptr);
+
+  std::pair<std::unique_ptr<PJRT_Buffer, ::pjrt::PJRT_BufferDeleter>,
+            xla::PjRtFuture<absl::Status>>
+  create_buffer(PJRT_Device* device = nullptr);
+
+  std::unique_ptr<PJRT_Error, ::pjrt::PJRT_ErrorDeleter> ToUniquePtr(
+      PJRT_Error* error);
+
  private:
   PjrtCApiTestBase(const PjrtCApiTestBase&) = delete;
   void operator=(const PjrtCApiTestBase&) = delete;

From 6334682d8fcd1413eab2464e35696be48638a7a7 Mon Sep 17 00:00:00 2001
From: Zichuan Wei <zichuanwei@google.com>
Date: Fri, 29 Sep 2023 21:54:04 -0700
Subject: [PATCH 448/567] lite: add passes to insert transpose to & from [b, 0,
 1, f] for transpose conv

PiperOrigin-RevId: 569665655
---
 .../lite/stablehlo/tests/legalize_hlo.mlir    |  65 +++++-
 .../lite/stablehlo/transforms/legalize_hlo.cc | 213 ++++++++----------
 .../legalize_hlo_conversions/util.cc          |  60 +++++
 .../legalize_hlo_conversions/util.h           |  26 +++
 4 files changed, 234 insertions(+), 130 deletions(-)

diff --git a/tensorflow/compiler/mlir/lite/stablehlo/tests/legalize_hlo.mlir b/tensorflow/compiler/mlir/lite/stablehlo/tests/legalize_hlo.mlir
index e440d9824bc7f8..195f539319f3e8 100644
--- a/tensorflow/compiler/mlir/lite/stablehlo/tests/legalize_hlo.mlir
+++ b/tensorflow/compiler/mlir/lite/stablehlo/tests/legalize_hlo.mlir
@@ -2354,6 +2354,51 @@ func.func @convert_group_conv2d(%arg0: tensor<1x14x14x2240xf32>, %arg1: tensor<3
   func.return %0 : tensor<1x7x7x2240xf32>
 }
 
+// CHECK-LABEL:    func.func @convert_transpose_conv_with_transpose(
+// CHECK-SAME:                         %[[VAL_0:.*]]: tensor<1x256x64x64xf32>,
+// CHECK-SAME:                         %[[VAL_1:.*]]: tensor<2x2x64x256xf32>) -> tensor<1x64x128x128xf32> {
+// CHECK:            %[[VAL_2:.*]] = "tf.Const"() {value = dense<[0, 2, 3, 1]> : tensor<4xi64>} : () -> tensor<4xi64>
+// CHECK:            %[[VAL_3:.*]] = "tf.Transpose"(%[[VAL_0:.*]], %[[VAL_2:.*]]) : (tensor<1x256x64x64xf32>, tensor<4xi64>) -> tensor<1x64x64x256xf32>
+// CHECK:            %[[VAL_4:.*]] = "tf.Const"() {value = dense<[0, 1]> : tensor<2xi64>} : () -> tensor<2xi64>
+// CHECK:            %[[VAL_5:.*]] = "tf.ReverseV2"(%[[VAL_1:.*]], %[[VAL_4:.*]]) : (tensor<2x2x64x256xf32>, tensor<2xi64>) -> tensor<2x2x64x256xf32>
+// CHECK:            %[[VAL_6:.*]] = "tf.Const"() {value = dense<[1, 128, 128, 64]> : tensor<4xi32>} : () -> tensor<4xi32>
+// CHECK:            %[[VAL_7:.*]] = "tf.Conv2DBackpropInput"(%[[VAL_6:.*]], %[[VAL_5:.*]], %[[VAL_3:.*]]) {data_format = "NHWC", dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "SAME", strides = [1, 2, 2, 1], use_cudnn_on_gpu = true} : (tensor<4xi32>, tensor<2x2x64x256xf32>, tensor<1x64x64x256xf32>) -> tensor<1x128x128x64xf32>
+// CHECK:            %[[VAL_8:.*]] = "tf.Const"() {value = dense<[0, 3, 1, 2]> : tensor<4xi64>} : () -> tensor<4xi64>
+// CHECK:            %[[VAL_9:.*]] = "tf.Transpose"(%[[VAL_7:.*]], %[[VAL_8:.*]]) : (tensor<1x128x128x64xf32>, tensor<4xi64>) -> tensor<1x64x128x128xf32>
+// CHECK:            return %[[VAL_9:.*]] : tensor<1x64x128x128xf32>
+// CHECK:           }
+
+func.func @convert_transpose_conv_with_transpose(%arg0: tensor<1x256x64x64xf32>, %arg1: tensor<2x2x64x256xf32>) -> tensor<1x64x128x128xf32> {
+  %0 = "mhlo.convolution"(%arg0, %arg1) {batch_group_count = 1 : i64,
+  dimension_numbers = #mhlo.conv<[b, f, 0, 1]x[0, 1, o, i]->[b, f, 0, 1]>,
+  feature_group_count = 1 : i64, lhs_dilation = dense<2> : tensor<2xi64>, padding = dense<1> : tensor<2x2xi64>, precision_config = [#mhlo<precision DEFAULT>, #mhlo<precision DEFAULT>], rhs_dilation = dense<1> : tensor<2xi64>, window_reversal = dense<false> : tensor<2xi1>, window_strides = dense<1> : tensor<2xi64>} :
+  (tensor<1x256x64x64xf32>, tensor<2x2x64x256xf32>) -> tensor<1x64x128x128xf32>
+  func.return %0 : tensor<1x64x128x128xf32>
+}
+
+// CHECK-LABEL:    func.func @convert_transpose_conv_with_transpose2(
+// CHECK-SAME:                         %[[VAL_0:.*]]: tensor<64x64x1x256xf32>,
+// CHECK-SAME:                         %[[VAL_1:.*]]: tensor<2x2x64x256xf32>) -> tensor<128x128x1x64xf32> {
+// CHECK:            %[[VAL_2:.*]] = "tf.Const"() {value = dense<[2, 0, 1, 3]> : tensor<4xi64>} : () -> tensor<4xi64>
+// CHECK:            %[[VAL_3:.*]] = "tf.Transpose"(%[[VAL_0:.*]], %[[VAL_2:.*]]) : (tensor<64x64x1x256xf32>, tensor<4xi64>) -> tensor<1x64x64x256xf32>
+// CHECK:            %[[VAL_4:.*]] = "tf.Const"() {value = dense<[0, 1]> : tensor<2xi64>} : () -> tensor<2xi64>
+// CHECK:            %[[VAL_5:.*]] = "tf.ReverseV2"(%[[VAL_1:.*]], %[[VAL_4:.*]]) : (tensor<2x2x64x256xf32>, tensor<2xi64>) -> tensor<2x2x64x256xf32>
+// CHECK:            %[[VAL_6:.*]] = "tf.Const"() {value = dense<[1, 128, 128, 64]> : tensor<4xi32>} : () -> tensor<4xi32>
+// CHECK:            %[[VAL_7:.*]] = "tf.Conv2DBackpropInput"(%[[VAL_6:.*]], %[[VAL_5:.*]], %[[VAL_3:.*]]) {data_format = "NHWC", dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "SAME", strides = [1, 2, 2, 1], use_cudnn_on_gpu = true} : (tensor<4xi32>, tensor<2x2x64x256xf32>, tensor<1x64x64x256xf32>) -> tensor<1x128x128x64xf32>
+// CHECK:            %[[VAL_8:.*]] = "tf.Const"() {value = dense<[1, 2, 0, 3]> : tensor<4xi64>} : () -> tensor<4xi64>
+// CHECK:            %[[VAL_9:.*]] = "tf.Transpose"(%[[VAL_7:.*]], %[[VAL_8:.*]]) : (tensor<1x128x128x64xf32>, tensor<4xi64>) -> tensor<128x128x1x64xf32>
+// CHECK:            return %[[VAL_9:.*]] : tensor<128x128x1x64xf32>
+// CHECK:           }
+
+func.func @convert_transpose_conv_with_transpose2(%arg0: tensor<64x64x1x256xf32>, %arg1: tensor<2x2x64x256xf32>) -> tensor<128x128x1x64xf32> {
+  %0 = "mhlo.convolution"(%arg0, %arg1) {batch_group_count = 1 : i64,
+  dimension_numbers = #mhlo.conv<[0, 1, b, f]x[0, 1, o, i]->[0, 1, b, f]>,
+  feature_group_count = 1 : i64, lhs_dilation = dense<2> : tensor<2xi64>, padding = dense<1> : tensor<2x2xi64>, precision_config = [#mhlo<precision DEFAULT>, #mhlo<precision DEFAULT>], rhs_dilation = dense<1> : tensor<2xi64>, window_reversal = dense<false> : tensor<2xi1>, window_strides = dense<1> : tensor<2xi64>} :
+  (tensor<64x64x1x256xf32>, tensor<2x2x64x256xf32>) -> tensor<128x128x1x64xf32>
+  func.return %0 : tensor<128x128x1x64xf32>
+}
+
+
 // CHECK-LABEL:   func @convert_conv2d_dynamic_batch(
 // CHECK-SAME:                         %[[VAL_0:.*]]: tensor<?x8x8x207xf32>,
 // CHECK-SAME:                         %[[VAL_1:.*]]: tensor<3x3x207x16xf32>) -> tensor<?x8x8x16xf32> {
@@ -2701,10 +2746,10 @@ func.func @convert_conv2d_resize_perferred(%arg0: tensor<1x56x1248x16xf32>, %arg
 // CHECK-LABEL:   func @convert_conv2d_back_prop_input(
 // CHECK-SAME:                         %[[VAL_0:.*]]: tensor<8x4x4x32xf32>,
 // CHECK-SAME:                         %[[VAL_1:.*]]: tensor<3x3x64x32xf32>) -> tensor<8x8x8x64xf32> {
-// CHECK-DAG:       %[[VAL_2:.*]] = "tf.Const"() {value = dense<[8, 8, 8, 64]> : tensor<4xi32>} : () -> tensor<4xi32>
-// CHECK-DAG:       %[[VAL_3:.*]] = "tf.Const"() {value = dense<[0, 1]> : tensor<2xi64>} : () -> tensor<2xi64>
-// CHECK:           %[[VAL_4:.*]] = "tf.ReverseV2"(%[[VAL_1]], %[[VAL_3]]) : (tensor<3x3x64x32xf32>, tensor<2xi64>) -> tensor<3x3x64x32xf32>
-// CHECK:           %[[VAL_5:.*]] = "tf.Conv2DBackpropInput"(%[[VAL_2]], %[[VAL_4]], %[[VAL_0]]) {data_format = "NHWC", dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "SAME", strides = [1, 2, 2, 1], use_cudnn_on_gpu = true} : (tensor<4xi32>, tensor<3x3x64x32xf32>, tensor<8x4x4x32xf32>) -> tensor<8x8x8x64xf32>
+// CHECK:           %[[VAL_2:.*]] = "tf.Const"() {value = dense<[0, 1]> : tensor<2xi64>} : () -> tensor<2xi64>
+// CHECK:           %[[VAL_3:.*]] = "tf.ReverseV2"(%[[VAL_1]], %[[VAL_2]]) : (tensor<3x3x64x32xf32>, tensor<2xi64>) -> tensor<3x3x64x32xf32>
+// CHECK:           %[[VAL_4:.*]] = "tf.Const"() {value = dense<[8, 8, 8, 64]> : tensor<4xi32>} : () -> tensor<4xi32>
+// CHECK:           %[[VAL_5:.*]] = "tf.Conv2DBackpropInput"(%[[VAL_4]], %[[VAL_3]], %[[VAL_0]]) {data_format = "NHWC", dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "SAME", strides = [1, 2, 2, 1], use_cudnn_on_gpu = true} : (tensor<4xi32>, tensor<3x3x64x32xf32>, tensor<8x4x4x32xf32>) -> tensor<8x8x8x64xf32>
 // CHECK:           return %[[VAL_5]] : tensor<8x8x8x64xf32>
 // CHECK:         }
 func.func @convert_conv2d_back_prop_input(%arg0: tensor<8x4x4x32xf32>, %arg1: tensor<3x3x64x32xf32>) -> tensor<8x8x8x64xf32> {
@@ -2728,12 +2773,12 @@ func.func @convert_conv2d_back_prop_input(%arg0: tensor<8x4x4x32xf32>, %arg1: te
 // CHECK-LABEL:   func @convert_conv2d_back_prop_input_transpose_filter(
 // CHECK-SAME:                         %[[VAL_0:.*]]: tensor<8x4x4x32xf32>,
 // CHECK-SAME:                         %[[VAL_1:.*]]: tensor<3x3x32x64xf32>) -> tensor<8x8x8x64xf32> {
-// CHECK-DAG:       %[[VAL_2:.*]] = "tf.Const"() {value = dense<[8, 8, 8, 64]> : tensor<4xi32>} : () -> tensor<4xi32>
-// CHECK-DAG:       %[[VAL_3:.*]] = "tf.Const"() {value = dense<[0, 1]> : tensor<2xi64>} : () -> tensor<2xi64>
-// CHECK-DAG:       %[[VAL_4:.*]] = "tf.Const"() {value = dense<[0, 1, 3, 2]> : tensor<4xi64>} : () -> tensor<4xi64>
-// CHECK-DAG:       %[[VAL_5:.*]] = "tf.Transpose"(%[[VAL_1]], %[[VAL_4]]) : (tensor<3x3x32x64xf32>, tensor<4xi64>) -> tensor<3x3x64x32xf32>
-// CHECK:           %[[VAL_6:.*]] = "tf.ReverseV2"(%[[VAL_5]], %[[VAL_3]]) : (tensor<3x3x64x32xf32>, tensor<2xi64>) -> tensor<3x3x64x32xf32>
-// CHECK:           %[[VAL_7:.*]] = "tf.Conv2DBackpropInput"(%[[VAL_2]], %[[VAL_6]], %[[VAL_0]]) {data_format = "NHWC", dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "SAME", strides = [1, 2, 2, 1], use_cudnn_on_gpu = true} : (tensor<4xi32>, tensor<3x3x64x32xf32>, tensor<8x4x4x32xf32>) -> tensor<8x8x8x64xf32>
+// CHECK-DAG:       %[[VAL_2:.*]] = "tf.Const"() {value = dense<[0, 1]> : tensor<2xi64>} : () -> tensor<2xi64>
+// CHECK-DAG:       %[[VAL_3:.*]] = "tf.Const"() {value = dense<[0, 1, 3, 2]> : tensor<4xi64>} : () -> tensor<4xi64>
+// CHECK-DAG:       %[[VAL_4:.*]] = "tf.Transpose"(%[[VAL_1]], %[[VAL_3]]) : (tensor<3x3x32x64xf32>, tensor<4xi64>) -> tensor<3x3x64x32xf32>
+// CHECK:           %[[VAL_5:.*]] = "tf.ReverseV2"(%[[VAL_4]], %[[VAL_2]]) : (tensor<3x3x64x32xf32>, tensor<2xi64>) -> tensor<3x3x64x32xf32>
+// CHECK:           %[[VAL_6:.*]] = "tf.Const"() {value = dense<[8, 8, 8, 64]> : tensor<4xi32>} : () -> tensor<4xi32>
+// CHECK:           %[[VAL_7:.*]] = "tf.Conv2DBackpropInput"(%[[VAL_6]], %[[VAL_5]], %[[VAL_0]]) {data_format = "NHWC", dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "SAME", strides = [1, 2, 2, 1], use_cudnn_on_gpu = true} : (tensor<4xi32>, tensor<3x3x64x32xf32>, tensor<8x4x4x32xf32>) -> tensor<8x8x8x64xf32>
 // CHECK:           return %[[VAL_7]] : tensor<8x8x8x64xf32>
 // CHECK:         }
 func.func @convert_conv2d_back_prop_input_transpose_filter(%arg0: tensor<8x4x4x32xf32>, %arg1: tensor<3x3x32x64xf32>) -> tensor<8x8x8x64xf32> {
diff --git a/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo.cc b/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo.cc
index 28df4946880648..a0a58997f1ac6e 100644
--- a/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo.cc
+++ b/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo.cc
@@ -495,69 +495,6 @@ class Convert2DConvOp : public OpConversionPattern<mhlo::ConvolutionOp>,
     return true;
   }
 
-  // Returns true if the op needs reformat.
-  bool NeedsReformatTypeAndPermutation(int batch_dim, int feature_dim,
-                                       int spatial_dim_start,
-                                       int default_batch_dim,
-                                       int default_feature_dim,
-                                       int default_spatial_dim_start) const {
-    return batch_dim != default_batch_dim ||
-           feature_dim != default_feature_dim ||
-           spatial_dim_start != default_spatial_dim_start;
-  }
-
-  // Gets reformat type and permutation attribute. Call this function only if
-  // NeedsReformatTypeAndPermutation returns true.
-  std::pair<RankedTensorType, DenseIntElementsAttr>
-  GetReformatTypeAndPermutation(int batch_dim, int feature_dim,
-                                int spatial_dim_start, int default_batch_dim,
-                                int default_feature_dim,
-                                int default_spatial_dim_start,
-                                int num_spatial_dims, RankedTensorType type,
-                                ConversionPatternRewriter& rewriter) const {
-    auto shape = type.getShape();
-    llvm::SmallVector<int64_t, 4> permutation_array(num_spatial_dims + 2);
-    permutation_array[default_batch_dim] = batch_dim;
-    permutation_array[default_feature_dim] = feature_dim;
-    llvm::SmallVector<int64_t, 4> transposed_shape(num_spatial_dims + 2);
-    transposed_shape[default_batch_dim] = shape[batch_dim];
-    transposed_shape[default_feature_dim] = shape[feature_dim];
-    for (int i : llvm::seq<int>(0, num_spatial_dims)) {
-      permutation_array[default_spatial_dim_start + i] = spatial_dim_start + i;
-      transposed_shape[default_spatial_dim_start + i] =
-          shape[spatial_dim_start + i];
-    }
-    auto new_type =
-        RankedTensorType::get(transposed_shape, type.getElementType());
-    auto permutation = DenseIntElementsAttr::get(
-        RankedTensorType::get({type.getRank()}, rewriter.getI64Type()),
-        permutation_array);
-    return {new_type, permutation};
-  }
-
-  Value FormatToNHWC(Value value, int batch_dim, int feature_dim,
-                     ArrayRef<int64_t> spatial_dimensions,
-                     int default_batch_dim, int default_feature_dim,
-                     int default_spatial_dim_start, int num_spatial_dims,
-                     ConversionPatternRewriter& rewriter) const {
-    auto type = value.getType().cast<RankedTensorType>();
-    DenseIntElementsAttr permutation;
-    const int spatial_dim_start = spatial_dimensions.front();
-    if (!NeedsReformatTypeAndPermutation(
-            batch_dim, feature_dim, spatial_dim_start, default_batch_dim,
-            default_feature_dim, default_spatial_dim_start)) {
-      // Transpose is not needed because the current format is "NHWC".
-      return value;
-    }
-    std::pair<RankedTensorType&, DenseIntElementsAttr&>(type, permutation) =
-        GetReformatTypeAndPermutation(batch_dim, feature_dim, spatial_dim_start,
-                                      default_batch_dim, default_feature_dim,
-                                      default_spatial_dim_start,
-                                      num_spatial_dims, type, rewriter);
-    return rewriter.create<mhlo::TransposeOp>(value.getLoc(), type, value,
-                                              permutation);
-  }
-
   // Slices the input `value` if there are negative padding values in
   // `explicit_padding`.
   Value SliceNegativePadding(Value value, ArrayRef<int64_t> explicit_padding,
@@ -608,12 +545,13 @@ class Convert2DConvOp : public OpConversionPattern<mhlo::ConvolutionOp>,
                     ConversionPatternRewriter& rewriter) const {
     mhlo::ConvDimensionNumbersAttr dnums = conv_op.getDimensionNumbers();
     // Transposes lhs and rhs if their formats are not NHWC.
-    Value lhs = FormatToNHWC(
+    Value lhs = InsertTranspose(
         conv_op.getLhs(), dnums.getInputBatchDimension(),
         dnums.getInputFeatureDimension(), dnums.getInputSpatialDimensions(),
         /*default_batch_dim=*/0, /*default_feature_dim=*/num_spatial_dims + 1,
         /*default_spatial_dim_start=*/1, num_spatial_dims, rewriter);
-    Value rhs = FormatToNHWC(
+
+    Value rhs = InsertTranspose(
         conv_op.getRhs(), dnums.getKernelInputFeatureDimension(),
         dnums.getKernelOutputFeatureDimension(),
         dnums.getKernelSpatialDimensions(),
@@ -701,7 +639,8 @@ class ConvertNonTrivialConvOp
     : public OpConversionPattern<mhlo::ConvolutionOp> {
  public:
   using OpConversionPattern::OpConversionPattern;
-
+  // TODO(b/302150407): In the future, legalize directly to TFLite instead of
+  // going through TF.
   LogicalResult matchAndRewrite(
       mhlo::ConvolutionOp conv_op, OpAdaptor adaptor,
       ConversionPatternRewriter& rewriter) const final {
@@ -744,33 +683,27 @@ class ConvertNonTrivialConvOp
     }
 
     mhlo::ConvDimensionNumbersAttr dnums = conv_op.getDimensionNumbers();
+    auto input_spatial_dims = dnums.getInputSpatialDimensions();
+    int num_spatial_dims = input_spatial_dims.size();
+
     std::string padding;
     if (conv_op.getPadding().value().isSplat() &&
         conv_op.getPadding()->getSplatValue<int64_t>() == 0) {
       padding = "VALID";
     } else {
-      auto spatial_dims = dnums.getInputSpatialDimensions();
-      int num_spatial_dims =
-          std::accumulate(spatial_dims.begin(), spatial_dims.end(), 1LL,
-                          std::multiplies<int64_t>{});
       if (!IsSamePadding(conv_op, num_spatial_dims, strides)) {
         return rewriter.notifyMatchFailure(
             conv_op, "requires padding to be SAME or VALID");
       }
       padding = "SAME";
     }
+    auto conv_input = InsertTranspose(
+        conv_op.getLhs(), dnums.getInputBatchDimension(),
+        dnums.getInputFeatureDimension(), dnums.getInputSpatialDimensions(),
+        /*default_batch_dim=*/0,
+        /*default_feature_dim=*/num_spatial_dims + 1,
+        /*default_spatial_dim_start=*/1, num_spatial_dims, rewriter);
 
-    // Converts int64_t to int32_t.
-    llvm::SmallVector<int, 4> input_shape;
-    for (int64_t dim : conv_op.getType().cast<RankedTensorType>().getShape()) {
-      input_shape.push_back(dim);
-    }
-    auto input_sizes = rewriter.create<TF::ConstOp>(
-        conv_op.getLoc(),
-        DenseIntElementsAttr::get(
-            RankedTensorType::get({static_cast<int64_t>(input_shape.size())},
-                                  rewriter.getI32Type()),
-            input_shape));
     // Mirror the filter in the spatial dimensions.
     mlir::Value reverse_filter_in = conv_op.getRhs();
     // If the kernel is with format [0,1,i,o] we transpose it to [0,1,o,i]
@@ -796,14 +729,75 @@ class ConvertNonTrivialConvOp
     filter = rewriter.create<mhlo::ReverseOp>(
         conv_op.getLoc(), reverse_filter_in,
         rewriter.getI64TensorAttr(dnums.getKernelSpatialDimensions()));
-    rewriter.replaceOpWithNewOp<TF::Conv2DBackpropInputOp>(
-        conv_op, conv_op.getType(), input_sizes, filter, conv_op.getLhs(),
-        rewriter.getI64ArrayAttr(strides),
-        /*use_cudnn_on_gpu=*/rewriter.getBoolAttr(true),
-        /*padding=*/rewriter.getStringAttr(padding),
-        /*explicit_paddings=*/rewriter.getI64ArrayAttr({}),
-        /*data_format=*/rewriter.getStringAttr("NHWC"),
-        /*dilations=*/rewriter.getI64ArrayAttr(dilation));
+
+    // If the output layout is not [b, 0, 1, f], transpose the result into it.
+    if (dnums.getOutputBatchDimension() != 0 ||
+        dnums.getOutputFeatureDimension() != num_spatial_dims + 1) {
+      std::vector<int64_t> transpose_order;
+      transpose_order.resize(num_spatial_dims + 2);
+      // The new conv's output is always in [b, 0, 1, f] format.
+      transpose_order[dnums.getOutputBatchDimension()] = 0;
+      transpose_order[dnums.getOutputFeatureDimension()] = num_spatial_dims + 1;
+      for (size_t i = 0; i < num_spatial_dims; ++i) {
+        transpose_order[dnums.getOutputSpatialDimensions().data()[i]] = i + 1;
+      }
+      auto output_shape =
+          conv_op.getResult().getType().cast<RankedTensorType>().getShape();
+      SmallVector<int64_t, 4> transposed_output_shape = {
+          output_shape[dnums.getOutputBatchDimension()],
+          output_shape[dnums.getOutputSpatialDimensions().data()[0]],
+          output_shape[dnums.getOutputSpatialDimensions().data()[1]],
+          output_shape[dnums.getOutputFeatureDimension()]};
+      // Converts int64_t to int32_t.
+      SmallVector<int32_t, 4> transposed_output_shape_i32;
+      for (int64_t dim : transposed_output_shape) {
+        transposed_output_shape_i32.push_back(dim);
+      }
+      auto output_type = RankedTensorType::get(
+          transposed_output_shape,
+          conv_op.getRhs().getType().cast<ShapedType>().getElementType());
+      auto output_sizes = rewriter.create<TF::ConstOp>(
+          conv_op.getLoc(),
+          DenseIntElementsAttr::get(
+              RankedTensorType::get(
+                  {static_cast<int64_t>(transposed_output_shape_i32.size())},
+                  rewriter.getI32Type()),
+              transposed_output_shape_i32));
+      auto new_conv = rewriter.create<TF::Conv2DBackpropInputOp>(
+          conv_op.getLoc(), output_type, output_sizes, filter, conv_input,
+          rewriter.getI64ArrayAttr(strides),
+          /*use_cudnn_on_gpu=*/rewriter.getBoolAttr(true),
+          /*padding=*/rewriter.getStringAttr(padding),
+          /*explicit_paddings=*/rewriter.getI64ArrayAttr({}),
+          /*data_format=*/rewriter.getStringAttr("NHWC"),
+          /*dilations=*/rewriter.getI64ArrayAttr(dilation));
+      auto output_transpose = rewriter.create<mhlo::TransposeOp>(
+          conv_op.getLoc(), new_conv.getResult(),
+          rewriter.getI64TensorAttr(transpose_order));
+      conv_op->replaceAllUsesWith(output_transpose);
+      rewriter.eraseOp(conv_op);
+    } else {
+      SmallVector<int32_t, 4> output_shape_i32;
+      for (int64_t dim :
+           conv_op.getResult().getType().cast<RankedTensorType>().getShape()) {
+        output_shape_i32.push_back(dim);
+      }
+      auto output_sizes = rewriter.create<TF::ConstOp>(
+          conv_op.getLoc(),
+          DenseIntElementsAttr::get(
+              RankedTensorType::get(
+                  {static_cast<int64_t>(output_shape_i32.size())},
+                  rewriter.getI32Type()),
+              output_shape_i32));
+      rewriter.replaceOpWithNewOp<TF::Conv2DBackpropInputOp>(
+          conv_op, conv_op.getType(), output_sizes, filter, conv_input,
+          rewriter.getI64ArrayAttr(strides),
+          /*use_cudnn_on_gpu=*/rewriter.getBoolAttr(true),
+          /*padding=*/rewriter.getStringAttr(padding),
+          /*explicit_paddings=*/rewriter.getI64ArrayAttr({}),
+          /*data_format=*/rewriter.getStringAttr("NHWC"),
+          /*dilations=*/rewriter.getI64ArrayAttr(dilation));
+    }
     return success();
   };
 
@@ -811,12 +805,17 @@ class ConvertNonTrivialConvOp
   bool IsSamePadding(mhlo::ConvolutionOp conv_op, int num_spatial_dims,
                      ArrayRef<int64_t> strides) const {
     for (auto i : llvm::seq<int>(0, num_spatial_dims)) {
-      int dim = i + 1;
-      int stride = strides[dim];
-      int input_size = conv_op.getType().cast<ShapedType>().getDimSize(dim);
+      int stride_dim = i + 1;
+      int stride = strides[stride_dim];
+      int output_dim =
+          conv_op.getDimensionNumbers().getOutputSpatialDimensions().data()[i];
+      int input_dim =
+          conv_op.getDimensionNumbers().getInputSpatialDimensions().data()[i];
       int output_size =
-          conv_op.getLhs().getType().cast<ShapedType>().getDimSize(dim);
-      if (output_size != (input_size + stride - 1) / stride) {
+          conv_op.getType().cast<ShapedType>().getDimSize(output_dim);
+      int input_size =
+          conv_op.getLhs().getType().cast<ShapedType>().getDimSize(input_dim);
+      if (input_size != (output_size + stride - 1) / stride) {
         return false;
       }
     }
@@ -874,32 +873,6 @@ class ConvertNonTrivialConvOp
       return rewriter.notifyMatchFailure(conv_op,
                                          "doesn't support more than 2D");
 
-    // TODO(chhe): To support more data formats other than "NHWC".
-    // Checks format [b, 0, 1, f]x[0, 1, o, i]->[b, 0, 1, f].
-    if (dnums.getInputBatchDimension() != 0 ||
-        dnums.getInputFeatureDimension() != num_spatial_dims + 1)
-      return rewriter.notifyMatchFailure(conv_op,
-                                         "requires input format [b, 0, 1, f]");
-    auto input_spatial_dimensions = dnums.getInputSpatialDimensions();
-    for (auto p : llvm::enumerate(input_spatial_dimensions)) {
-      if (p.value() != p.index() + 1)
-        return rewriter.notifyMatchFailure(
-            conv_op, "requires input format [b, 0, 1, f]");
-    }
-
-    // Checks output dimensions.
-    if (dnums.getOutputBatchDimension() != 0 ||
-        conv_op.getDimensionNumbers().getOutputFeatureDimension() !=
-            num_spatial_dims + 1)
-      return rewriter.notifyMatchFailure(conv_op,
-                                         "requires output format [b, 0, 1, f]");
-    auto output_spatial_dimensions = dnums.getOutputSpatialDimensions();
-    for (auto p : llvm::enumerate(output_spatial_dimensions)) {
-      if (p.value() != p.index() + 1)
-        return rewriter.notifyMatchFailure(
-            conv_op, "requires output format [b, 0, 1, f]");
-    }
-
     // Checks kernel dimensions.
     if (!isKernelFormatHWIO(dnums) && !isKernelFormatHWOI(dnums))
       return rewriter.notifyMatchFailure(
diff --git a/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/util.cc b/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/util.cc
index 3588b68bc1b182..a05ca60a2462a5 100644
--- a/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/util.cc
+++ b/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/util.cc
@@ -18,8 +18,10 @@ limitations under the License.
 #include <cassert>
 #include <cstddef>
 #include <cstdint>
+#include <utility>
 
 #include "absl/algorithm/container.h"
+#include "llvm/ADT/Sequence.h"
 #include "llvm/ADT/SmallVector.h"
 #include "mlir/IR/Block.h"  // from @llvm-project
 #include "mlir/IR/BuiltinAttributes.h"  // from @llvm-project
@@ -365,6 +367,64 @@ void AddStridedSliceOpIfRequiredImpl(
 
 }  // namespace
 
+bool NeedsReformatTypeAndPermutation(int batch_dim, int feature_dim,
+                                     int spatial_dim_start,
+                                     int default_batch_dim,
+                                     int default_feature_dim,
+                                     int default_spatial_dim_start) {
+  return batch_dim != default_batch_dim || feature_dim != default_feature_dim ||
+         spatial_dim_start != default_spatial_dim_start;
+}
+
+std::pair<RankedTensorType, DenseIntElementsAttr> GetReformatTypeAndPermutation(
+    int batch_dim, int feature_dim, int spatial_dim_start,
+    int default_batch_dim, int default_feature_dim,
+    int default_spatial_dim_start, int num_spatial_dims, RankedTensorType type,
+    ConversionPatternRewriter& rewriter) {
+  auto shape = type.getShape();
+  llvm::SmallVector<int64_t, 4> permutation_array(num_spatial_dims + 2);
+  permutation_array[default_batch_dim] = batch_dim;
+  permutation_array[default_feature_dim] = feature_dim;
+  llvm::SmallVector<int64_t, 4> transposed_shape(num_spatial_dims + 2);
+  transposed_shape[default_batch_dim] = shape[batch_dim];
+  transposed_shape[default_feature_dim] = shape[feature_dim];
+  for (int i : llvm::seq<int>(0, num_spatial_dims)) {
+    permutation_array[default_spatial_dim_start + i] = spatial_dim_start + i;
+    transposed_shape[default_spatial_dim_start + i] =
+        shape[spatial_dim_start + i];
+  }
+  auto new_type =
+      RankedTensorType::get(transposed_shape, type.getElementType());
+  auto permutation = DenseIntElementsAttr::get(
+      RankedTensorType::get({type.getRank()}, rewriter.getI64Type()),
+      permutation_array);
+  return {new_type, permutation};
+}
+
+Value InsertTranspose(Value value, int batch_dim, int feature_dim,
+                      ArrayRef<int64_t> spatial_dimensions,
+                      int default_batch_dim, int default_feature_dim,
+                      int default_spatial_dim_start, int num_spatial_dims,
+                      ConversionPatternRewriter& rewriter) {
+  auto type = value.getType().cast<RankedTensorType>();
+  DenseIntElementsAttr permutation;
+  const int spatial_dim_start = spatial_dimensions.front();
+  if (!NeedsReformatTypeAndPermutation(
+          batch_dim, feature_dim, spatial_dim_start, default_batch_dim,
+          default_feature_dim, default_spatial_dim_start)) {
+    // Transpose is not needed because the current format already matches the
+    // default format.
+    return value;
+  }
+  std::pair<RankedTensorType&, DenseIntElementsAttr&>(type, permutation) =
+      GetReformatTypeAndPermutation(batch_dim, feature_dim, spatial_dim_start,
+                                    default_batch_dim, default_feature_dim,
+                                    default_spatial_dim_start, num_spatial_dims,
+                                    type, rewriter);
+  return rewriter.create<mhlo::TransposeOp>(value.getLoc(), type, value,
+                                            permutation);
+}
+
 void AddStridedSliceOpIfRequired(ConversionState& state,
                                  const DenseElementsAttr& edge_padding_low,
                                  const DenseElementsAttr& edge_padding_high,
diff --git a/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/util.h b/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/util.h
index 8d63bfa1435324..d74c6fd7073670 100644
--- a/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/util.h
+++ b/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/util.h
@@ -76,6 +76,32 @@ PermutationAndShape GetInversePermutationAndShape(
     llvm::ArrayRef<int64_t> permutation_array, ShapedType input_type,
     ConversionPatternRewriter& rewriter);
 
+// Returns true if the op needs reformat.
+bool NeedsReformatTypeAndPermutation(int batch_dim, int feature_dim,
+                                     int spatial_dim_start,
+                                     int default_batch_dim,
+                                     int default_feature_dim,
+                                     int default_spatial_dim_start);
+
+// Gets reformat type and permutation attribute. Call this function only if
+// NeedsReformatTypeAndPermutation returns true. If
+// NeedsReformatTypeAndPermutation returns false, this function returns the pair
+// of input type and no-op permutation.
+
+std::pair<RankedTensorType, DenseIntElementsAttr> GetReformatTypeAndPermutation(
+    int batch_dim, int feature_dim, int spatial_dim_start,
+    int default_batch_dim, int default_feature_dim,
+    int default_spatial_dim_start, int num_spatial_dims, RankedTensorType type,
+    ConversionPatternRewriter& rewriter);
+
+// Inserts a transpose so that `value` is converted to the format specified by
+// the default dims. Returns `value` unchanged if it is already in that format.
+Value InsertTranspose(Value value, int batch_dim, int feature_dim,
+                      ArrayRef<int64_t> spatial_dimensions,
+                      int default_batch_dim, int default_feature_dim,
+                      int default_spatial_dim_start, int num_spatial_dims,
+                      ConversionPatternRewriter& rewriter);
+
 // If index_vector_dim == indices.rank() then insert the implicit extra
 // dimension into indices to normalize everything to index_vector_dim ==
 // indices.rank() - 1.

From 28828b566c3e77bcda72b1865d12f6ae0fbf29e3 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Sat, 30 Sep 2023 02:02:05 -0700
Subject: [PATCH 449/567] compat: Update forward compatibility horizon to
 2023-09-30

PiperOrigin-RevId: 569693143
---
 tensorflow/python/compat/compat.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/python/compat/compat.py b/tensorflow/python/compat/compat.py
index ba6dcd40265a78..af935567cea39e 100644
--- a/tensorflow/python/compat/compat.py
+++ b/tensorflow/python/compat/compat.py
@@ -29,7 +29,7 @@
 # This value changes every day with an automatic CL. It can be modified in code
 # via `forward_compatibility_horizon()` or with the environment variable
 # TF_FORWARD_COMPATIBILITY_DELTA_DAYS, which is added to the compatibility date.
-_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2023, 9, 29)
+_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2023, 9, 30)
 _FORWARD_COMPATIBILITY_DELTA_DAYS_VAR_NAME = "TF_FORWARD_COMPATIBILITY_DELTA_DAYS"
 _FORWARD_COMPATIBILITY_DATE_NUMBER = None
 

From e337d91c7d3396528b1bf3fefa707f284958b5cf Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Sat, 30 Sep 2023 02:02:08 -0700
Subject: [PATCH 450/567] Update GraphDef version to 1635.

PiperOrigin-RevId: 569693153
---
 tensorflow/core/public/version.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/core/public/version.h b/tensorflow/core/public/version.h
index 08f4929b65a299..b77af451af193f 100644
--- a/tensorflow/core/public/version.h
+++ b/tensorflow/core/public/version.h
@@ -108,7 +108,7 @@ limitations under the License.
 
 #define TF_GRAPH_DEF_VERSION_MIN_PRODUCER 0
 #define TF_GRAPH_DEF_VERSION_MIN_CONSUMER 0
-#define TF_GRAPH_DEF_VERSION 1634  // Updated: 2023/9/29
+#define TF_GRAPH_DEF_VERSION 1635  // Updated: 2023/9/30
 
 // Checkpoint compatibility versions (the versions field in SavedSliceMeta).
 //

From 6872b8b1d4c0f52e90a38cd90dee49316b6b87f1 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Sat, 30 Sep 2023 15:31:54 -0700
Subject: [PATCH 451/567] Consolidates the dot handler's mesh dimension
 enumeration logic into a single method.

PiperOrigin-RevId: 569771316
---
 .../xla/hlo/experimental/auto_sharding/BUILD  |   1 +
 .../auto_sharding_dot_handler.cc              | 144 ++++++------------
 2 files changed, 46 insertions(+), 99 deletions(-)

diff --git a/third_party/xla/xla/hlo/experimental/auto_sharding/BUILD b/third_party/xla/xla/hlo/experimental/auto_sharding/BUILD
index 35b62d046cde03..2570c83b3e9fe6 100644
--- a/third_party/xla/xla/hlo/experimental/auto_sharding/BUILD
+++ b/third_party/xla/xla/hlo/experimental/auto_sharding/BUILD
@@ -41,6 +41,7 @@ cc_library(
         "//xla:array",
         "//xla:shape_tree",
         "//xla:shape_util",
+        "//xla:status",
         "//xla:statusor",
         "//xla:util",
         "//xla:xla_data_proto_cc",
diff --git a/third_party/xla/xla/hlo/experimental/auto_sharding/auto_sharding_dot_handler.cc b/third_party/xla/xla/hlo/experimental/auto_sharding/auto_sharding_dot_handler.cc
index 3dd22b34261a99..7090a33dc24fda 100644
--- a/third_party/xla/xla/hlo/experimental/auto_sharding/auto_sharding_dot_handler.cc
+++ b/third_party/xla/xla/hlo/experimental/auto_sharding/auto_sharding_dot_handler.cc
@@ -13,18 +13,27 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+#include <cstddef>
+#include <cstdint>
+#include <functional>
 #include <memory>
 #include <string>
 #include <tuple>
 #include <vector>
 
 #include "absl/algorithm/container.h"
+#include "absl/log/check.h"
 #include "absl/strings/str_format.h"
+#include "absl/types/span.h"
+#include "xla/array.h"
 #include "xla/hlo/experimental/auto_sharding/auto_sharding.h"
 #include "xla/hlo/experimental/auto_sharding/auto_sharding_solver_option.h"
 #include "xla/hlo/experimental/auto_sharding/auto_sharding_strategy.h"
 #include "xla/hlo/experimental/auto_sharding/auto_sharding_util.h"
 #include "xla/hlo/experimental/auto_sharding/cluster_environment.h"
+#include "xla/hlo/ir/hlo_instruction.h"
+#include "xla/hlo/ir/hlo_sharding.h"
+#include "xla/status.h"
 #include "tsl/platform/errors.h"
 
 namespace xla {
@@ -57,6 +66,17 @@ void AppendNewStrategy(const HloInstruction* ins, const std::string& name,
   }));
 }
 
+// Calls 'split_func' on every ordered pair (i, j) of distinct mesh dimensions.
+void SplitWithMeshShape(std::function<void(int, int)> split_func,
+                        absl::Span<const int64_t> mesh_shape) {
+  for (int64_t i = 0; i < mesh_shape.size(); ++i) {
+    for (int64_t j = (i + 1); j < mesh_shape.size(); ++j) {
+      split_func(i, j);
+      split_func(j, i);
+    }
+  }
+}
+
 class DotHandler {
  public:
   DotHandler(std::unique_ptr<StrategyVector>& strategies,
@@ -617,76 +637,43 @@ class DotHandler {
     }
   }
 
-  Status RegisterStrategies() {
-    auto mesh_shape = device_mesh_.dimensions();
+  void Split(std::function<void(int, int)> split_func) const {
+    SplitWithMeshShape(split_func, device_mesh_.dimensions());
+  }
 
+  Status RegisterStrategies() {
     // SS = SR x RS
     // Split lhs space dim and rhs space dim.
-    for (int64_t i = 0; i < mesh_shape.size(); ++i) {
-      for (int64_t j = (i + 1); j < mesh_shape.size(); ++j) {
-        SplitLhsSpaceRhsSpace(i, j);
-        SplitLhsSpaceRhsSpace(j, i);
-      }
-    }
+    Split([this](int i, int j) { this->SplitLhsSpaceRhsSpace(i, j); });
 
     // SSR = SSR x RR
     // Split lhs space dims only if it has more than 1 space dims.
     if (lhs_space_dims_.size() > 1) {
-      for (int64_t i = 0; i < mesh_shape.size(); ++i) {
-        for (int64_t j = (i + 1); j < mesh_shape.size(); ++j) {
-          SplitLhsSpaceOnly(i, j);
-          SplitLhsSpaceOnly(j, i);
-        }
-      }
+      Split([this](int i, int j) { this->SplitLhsSpaceOnly(i, j); });
     }
     // RSS = RR x RSS
     // Split rhs space dims only if it has more than 1 space dims.
     if (rhs_space_dims_.size() > 1) {
-      for (int64_t i = 0; i < mesh_shape.size(); ++i) {
-        for (int64_t j = (i + 1); j < mesh_shape.size(); ++j) {
-          SplitRhsSpaceOnly(i, j);
-          SplitRhsSpaceOnly(j, i);
-        }
-      }
+      Split([this](int i, int j) { this->SplitRhsSpaceOnly(i, j); });
     }
 
     // SR = SS x SR
     // Split lhs space dim and both contracting dims.
-    for (int64_t i = 0; i < mesh_shape.size(); ++i) {
-      for (int64_t j = (i + 1); j < mesh_shape.size(); ++j) {
-        SplitLhsSpaceBothContract(i, j);
-        SplitLhsSpaceBothContract(j, i);
-      }
-    }
+    Split([this](int i, int j) { this->SplitLhsSpaceBothContract(i, j); });
 
     // RS = RS x SS
     // Split rhs space dim and both contracting dims.
-    for (int64_t i = 0; i < mesh_shape.size(); ++i) {
-      for (int64_t j = (i + 1); j < mesh_shape.size(); ++j) {
-        SplitRhsSpaceBothContract(i, j);
-        SplitRhsSpaceBothContract(j, i);
-      }
-    }
+    Split([this](int i, int j) { this->SplitRhsSpaceBothContract(i, j); });
 
     // RR = SS x SS
     // Split two contracting dims on lhs and rhs.
-    for (int64_t i = 0; i < mesh_shape.size(); ++i) {
-      for (int64_t j = (i + 1); j < mesh_shape.size(); ++j) {
-        SplitBothContractTwoDims(i, j);
-        SplitBothContractTwoDims(j, i);
-      }
-    }
+    Split([this](int i, int j) { this->SplitBothContractTwoDims(i, j); });
 
     // RR = RS x SR
     // This is a special case where we allow spliting only one dim in the
     // multi-dimensional mesh case. This allows some recomputation
     // (e.g., the dense layer in the LM_head of BERT).
-    for (int64_t i = 0; i < mesh_shape.size(); ++i) {
-      for (int64_t j = (i + 1); j < mesh_shape.size(); ++j) {
-        RecomputeSplitBothContract(i, j);
-        RecomputeSplitBothContract(j, i);
-      }
-    }
+    Split([this](int i, int j) { this->RecomputeSplitBothContract(i, j); });
 
     // Add 1d data parallel in multi-dimensional mesh
     if (solver_option_.allow_mixed_mesh_shape) {
@@ -707,30 +694,15 @@ class DotHandler {
 
     // SbSi = SbSi x SbR
     // Split batch dim and lhs space dim
-    for (int64_t i = 0; i < mesh_shape.size(); ++i) {
-      for (int64_t j = (i + 1); j < mesh_shape.size(); ++j) {
-        SplitBatchDimLhsSpace(i, j);
-        SplitBatchDimLhsSpace(j, i);
-      }
-    }
+    Split([this](int i, int j) { this->SplitBatchDimLhsSpace(i, j); });
 
     // SbSj = SbR x SbSj
     // Split batch dim and rhs space dim
-    for (int64_t i = 0; i < mesh_shape.size(); ++i) {
-      for (int64_t j = (i + 1); j < mesh_shape.size(); ++j) {
-        SplitBatchDimRhsSpace(i, j);
-        SplitBatchDimRhsSpace(j, i);
-      }
-    }
+    Split([this](int i, int j) { this->SplitBatchDimRhsSpace(i, j); });
 
     // SbSj = SbR x SbSj
     // Split batch dim and contracting dim
-    for (int64_t i = 0; i < mesh_shape.size(); ++i) {
-      for (int64_t j = (i + 1); j < mesh_shape.size(); ++j) {
-        SplitBatchDimBothContract(i, j);
-        SplitBatchDimBothContract(j, i);
-      }
-    }
+    Split([this](int i, int j) { this->SplitBatchDimBothContract(i, j); });
 
     if (solver_option_.batch_matmul_always_split_batch &&
         lhs_batch_dims_.size() == 2 &&
@@ -743,12 +715,7 @@ class DotHandler {
 
     // Sb = Sb x Sb
     // Split batch dims.
-    for (int64_t i = 0; i < mesh_shape.size(); ++i) {
-      for (int64_t j = (i + 1); j < mesh_shape.size(); ++j) {
-        SplitTwoBatchDims(i, j);
-        SplitTwoBatchDims(j, i);
-      }
-    }
+    Split([this](int i, int j) { this->SplitTwoBatchDims(i, j); });
 
     if (solver_option_.allow_mixed_mesh_shape) {
       Add1DBatchSplit();
@@ -951,8 +918,11 @@ class ConvHandler {
                       cluster_env_, strategy_map_, strategies_);
   }
 
+  void Split(std::function<void(int, int)> split_func) const {
+    SplitWithMeshShape(split_func, device_mesh_.dimensions());
+  }
+
   Status RegisterStrategies() {
-    auto mesh_shape = device_mesh_.dimensions();
     // For 1D sharding
     if ((ins_->feature_group_count() ==
              lhs_->shape().dimensions(lhs_in_channel_dim_) &&
@@ -961,12 +931,7 @@ class ConvHandler {
       // for depthwise conv
       // SS = SS x S
       // Split batch dim and channel dim
-      for (int64_t i = 0; i < mesh_shape.size(); ++i) {
-        for (int64_t j = (i + 1); j < mesh_shape.size(); ++j) {
-          SplitDepthwise(i, j, true);
-          SplitDepthwise(j, i, true);
-        }
-      }
+      Split([this](int i, int j) { this->SplitDepthwise(i, j, true); });
     } else if ((ins_->batch_group_count() ==
                     lhs_->shape().dimensions(lhs_batch_dim_) &&
                 ins_->batch_group_count() ==
@@ -974,40 +939,21 @@ class ConvHandler {
       // for depthwise conv filter_backward
       // SS = SS x S
       // Split batch dim and channel dim
-      for (int64_t i = 0; i < mesh_shape.size(); ++i) {
-        for (int64_t j = (i + 1); j < mesh_shape.size(); ++j) {
-          SplitDepthwise(i, j, false);
-          SplitDepthwise(j, i, false);
-        }
-      }
+      Split([this](int i, int j) { this->SplitDepthwise(i, j, false); });
     }
 
     // SS = SR x RS
     // Split lhs batch dim and rhs out_channel dim.
-    for (int64_t i = 0; i < mesh_shape.size(); ++i) {
-      for (int64_t j = (i + 1); j < mesh_shape.size(); ++j) {
-        SplitLhsBatchRhsOutchannel(i, j);
-        SplitLhsBatchRhsOutchannel(j, i);
-      }
-    }
+    Split([this](int i, int j) { this->SplitLhsBatchRhsOutchannel(i, j); });
 
     // SR = SS x SR
     // Split lhs batch dim and both in_channel dims.
-    for (int64_t i = 0; i < mesh_shape.size(); ++i) {
-      for (int64_t j = (i + 1); j < mesh_shape.size(); ++j) {
-        SplitLhsBatchBothInchannel(i, j);
-        SplitLhsBatchBothInchannel(j, i);
-      }
-    }
+    Split([this](int i, int j) { this->SplitLhsBatchBothInchannel(i, j); });
 
     // RS = RS x SS
     // Split rhs out_channel dim and both in_channel dims.
-    for (int64_t i = 0; i < mesh_shape.size(); ++i) {
-      for (int64_t j = (i + 1); j < mesh_shape.size(); ++j) {
-        SplitRhsOutchannelBothInchannel(i, j);
-        SplitRhsOutchannelBothInchannel(j, i);
-      }
-    }
+    Split(
+        [this](int i, int j) { this->SplitRhsOutchannelBothInchannel(i, j); });
 
     // Add 1d data parallel in multi-dimensional mesh
     if (solver_option_.allow_mixed_mesh_shape) {

From 022d32e6d9a0c666113d078460af91e260ce3c9f Mon Sep 17 00:00:00 2001
From: David Dunleavy <ddunleavy@google.com>
Date: Sat, 30 Sep 2023 18:30:26 -0700
Subject: [PATCH 452/567] Move `:gpu_client` into `if_cuda_or_rocm` rather than
 duplicating it within `if_cuda` and `if_rocm`

This prevents a build error when bazel matches both cuda and rocm

PiperOrigin-RevId: 569787230
---
 third_party/xla/xla/pjrt/gpu/BUILD | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/third_party/xla/xla/pjrt/gpu/BUILD b/third_party/xla/xla/pjrt/gpu/BUILD
index 67d4ae617ccd65..e6a511e7142a45 100644
--- a/third_party/xla/xla/pjrt/gpu/BUILD
+++ b/third_party/xla/xla/pjrt/gpu/BUILD
@@ -1,6 +1,6 @@
 load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda")
 load("//xla:xla.bzl", "xla_cc_test")
-load("//xla/stream_executor:build_defs.bzl", "if_gpu_is_configured")
+load("//xla/stream_executor:build_defs.bzl", "if_cuda_or_rocm", "if_gpu_is_configured")
 load("@local_config_rocm//rocm:build_defs.bzl", "if_rocm")
 load("@local_tsl//tsl:tsl.bzl", "if_nccl")
 load("@local_tsl//tsl/platform:build_config.bzl", "tf_proto_library")
@@ -87,15 +87,15 @@ cc_library(
         "@local_tsl//tsl/profiler/lib:connected_traceme",
         "@local_tsl//tsl/util:env_var",
         "@tf_runtime//:hostcontext",
-    ] + if_cuda([
+    ] + if_cuda_or_rocm([
+        "//xla/service/gpu:gpu_compiler",
+    ]) + if_cuda([
         ":nccl_id_store_cuda",
         "@local_config_cuda//cuda:cuda_headers",
         "//xla/stream_executor/gpu:gpu_cudamallocasync_allocator",
-        "//xla/service/gpu:gpu_compiler",
     ]) + if_rocm([
         ":nccl_id_store_rocm",
         "@local_config_rocm//rocm:rocm_headers",
-        "//xla/service/gpu:gpu_compiler",
     ]),
 )
 

From 0c79e428577c49f91cba0d0fefd34be8fea35c85 Mon Sep 17 00:00:00 2001
From: Krzysztof Kotowicz <koto@google.com>
Date: Sat, 30 Sep 2023 23:02:10 -0700
Subject: [PATCH 453/567] Added new threat model.

PiperOrigin-RevId: 569813726
---
 SECURITY.md | 421 ++++++++++++++++++++--------------------------------
 1 file changed, 165 insertions(+), 256 deletions(-)

diff --git a/SECURITY.md b/SECURITY.md
index 5ef9c31914ba4c..87d45a8671754d 100644
--- a/SECURITY.md
+++ b/SECURITY.md
@@ -1,8 +1,8 @@
 # Using TensorFlow Securely
 
-This document discusses the TensorFlow security model. It describes how to
-safely deal with untrusted programs (models or model parameters), and input
-data. We also provide guidelines on what constitutes a vulnerability in
+This document discusses the TensorFlow security model. It describes the security
+risks to consider when using models, checkpoints or input data for training or
+serving. We also provide guidelines on what constitutes a vulnerability in
 TensorFlow and how to report them.
 
 This document applies to other repositories in the TensorFlow organization,
@@ -16,272 +16,181 @@ use a term commonly used by machine learning practitioners) are expressed as
 programs that TensorFlow executes. TensorFlow programs are encoded as
 computation
 [**graphs**](https://developers.google.com/machine-learning/glossary/#graph).
-The model's parameters are often stored separately in **checkpoints**.
-
-At runtime, TensorFlow executes the computation graph using the parameters
-provided. Note that the behavior of the computation graph may change depending
-on the parameters provided. **TensorFlow itself is not a sandbox**. When
-executing the computation graph, TensorFlow may read and write files, send and
-receive data over the network, and even spawn additional processes. All these
-tasks are performed with the permission of the TensorFlow process. Allowing for
-this flexibility makes for a powerful machine learning platform, but it has
-security implications.
-
-The computation graph may also accept **inputs**. Those inputs are the
-data you supply to TensorFlow to train a model, or to use a model to run
-inference on the data.
-
-**TensorFlow models are programs, and need to be treated as such from a security
-perspective.**
-
-## Execution models of TensorFlow code
-
-The TensorFlow library has a wide API which can be used in multiple scenarios.
-The security requirements are also different depending on the usage.
-
-The API usage with the least security concerns is doing iterative exploration
-via the Python interpreter or small Python scripts. Here, only some parts of the
-API are exercised and eager execution is the default, meaning that each
-operation executes immediately. This mode is useful for testing, including
-fuzzing. For direct access to the C++ kernels, users of TensorFlow can directly
-call `tf.raw_ops.xxx` APIs. This gives control over all the parameters that
-would be sent to the kernel. Passing invalid combinations of parameters can
-allow insecure behavior (see definition of a vulnerability in a section below).
-However, these won’t always translate to actual vulnerabilities in TensorFlow.
-This would be similar to directly dereferencing a null pointer in a C++ program:
-not a vulnerability by itself but a coding error.
-
-The next 2 modes of using the TensorFlow API have the most security
-implications. These relate to the actual building and use of machine learning
-models. Both during training and inference, the TensorFlow runtime will build
-and execute computation graphs from (usually Python) code written by a
-practitioner (using compilation techniques to turn eager code into graph mode).
-In both of these scenarios, a vulnerability can be exploited to cause
-significant damage, hence the goal of the security team is to eliminate these
-vulnerabilities or otherwise reduce their impact. This is essential, given that
-both training and inference can run on accelerators (e.g. GPU, TPU) or in a
-distributed manner.
-
-Finally, the last mode of executing TensorFlow library code is as part of
-additional tooling. For example, TensorFlow provides a `saved_model_cli` tool
-which can be used to scan a `SavedModel` (the serialization format used by
-TensorFlow for models) and describe it. These tools are usually run by a single
-developer, on a single host, so the impact of a vulnerability in them is
-somewhat reduced.
-
-## Running untrusted models
-
-As a general rule: **Always** execute untrusted models inside a sandbox (e.g.,
-[nsjail](https://github.com/google/nsjail)).
-
-There are several ways in which a model could become untrusted. Obviously, if an
-untrusted party supplies TensorFlow kernels, arbitrary code may be executed.
-The same is true if the untrusted party provides Python code, such as the Python
-code that generates TensorFlow graphs.
-
-Even if the untrusted party only supplies the serialized computation graph (in
-form of a `GraphDef`, `SavedModel`, or equivalent on-disk format), the set of
-computation primitives available to TensorFlow is powerful enough that you
-should assume that the TensorFlow process effectively executes arbitrary code.
-One common solution is to allow only a few safe Ops. While this is possible in
-theory, we still recommend you sandbox the execution.
-
-It depends on the computation graph whether a user provided checkpoint is safe.
-It is easily possible to create computation graphs in which malicious
-checkpoints can trigger unsafe behavior. For example, consider a graph that
-contains a `tf.cond` operation depending on the value of a `tf.Variable`. One
-branch of the `tf.cond` is harmless, but the other is unsafe. Since the
-`tf.Variable` is stored in the checkpoint, whoever provides the checkpoint now
-has the ability to trigger unsafe behavior, even though the graph is not under
-their control.
-
-In other words, graphs can contain vulnerabilities of their own. To allow users
-to provide checkpoints to a model you run on their behalf (e.g., in order to
-compare model quality for a fixed model architecture), you must carefully audit
-your model, and we recommend you run the TensorFlow process in a sandbox.
-
-Similar considerations should apply if the model uses **custom ops** (C++ code
-written outside of the TensorFlow tree and loaded as plugins).
-
-## Accepting untrusted inputs
-
-It is possible to write models that are secure in the sense that they can safely
-process untrusted inputs assuming there are no bugs. There are, however, two
-main reasons to not rely on this: First, it is easy to write models which must
-not be exposed to untrusted inputs, and second, there are bugs in any software
-system of sufficient complexity. Letting users control inputs could allow them
-to trigger bugs either in TensorFlow or in dependencies.
-
-In general, it is good practice to isolate parts of any system which is exposed
-to untrusted (e.g., user-provided) inputs in a sandbox.
-
-A useful analogy to how any TensorFlow graph is executed is any interpreted
-programming language, such as Python. While it is possible to write secure
-Python code which can be exposed to user supplied inputs (by, e.g., carefully
-quoting and sanitizing input strings, size-checking input blobs, etc.), it is
-very easy to write Python programs which are insecure. Even secure Python code
-could be rendered insecure by a bug in the Python interpreter, or in a bug in a
-Python library used (e.g.,
-[this one](https://www.cvedetails.com/cve/CVE-2017-12852/)).
-
-## Running a TensorFlow server
+Since models are practically programs that TensorFlow executes, using untrusted
+models or graphs is equivalent to running untrusted code.
+
+If you need to run untrusted models, execute them inside a
+[**sandbox**](https://developers.google.com/code-sandboxing). Memory corruptions
+in TensorFlow ops can be recognized as security issues only if they are
+reachable and exploitable through production-grade, benign models.
+
+### Compilation
+
+Compiling models via the recommended entry points described in
+[XLA](https://www.tensorflow.org/xla) and
+[JAX](https://jax.readthedocs.io/en/latest/jax-101/02-jitting.html)
+documentation should be safe, while some of the testing and debugging tools that
+come with the compiler are not designed to be used with untrusted data and
+should be used with caution when working with untrusted models.
+
+### Saved graphs and checkpoints
+
+When loading untrusted serialized computation graphs (in form of a `GraphDef`,
+`SavedModel`, or equivalent on-disk format), the set of computation primitives
+available to TensorFlow is powerful enough that you should assume that the
+TensorFlow process effectively executes arbitrary code.
+
+The risk of loading untrusted checkpoints depends on the code or graph that you
+are working with. When loading untrusted checkpoints, the values of the traced
+variables from your model are also going to be untrusted. That means that if
+your code interacts with the filesystem, network, etc. and uses checkpointed
+variables as part of those interactions (ex: using a string variable to build a
+filesystem path), a maliciously created checkpoint might be able to change the
+targets of those operations, which could result in arbitrary
+read/write/executions.
+
+### Running a TensorFlow server
 
 TensorFlow is a platform for distributed computing, and as such there is a
-TensorFlow server (`tf.train.Server`). **The TensorFlow server is meant for
-internal communication only. It is not built for use in an untrusted network.**
+TensorFlow server (`tf.train.Server`). The TensorFlow server is intended for
+internal communication only. It is not built for use in untrusted environments
+or networks.
 
 For performance reasons, the default TensorFlow server does not include any
 authorization protocol and sends messages unencrypted. It accepts connections
 from anywhere, and executes the graphs it is sent without performing any checks.
-Therefore, if you run a `tf.train.Server` in your network, anybody with
-access to the network can execute what you should consider arbitrary code with
-the privileges of the process running the `tf.train.Server`.
-
-When running distributed TensorFlow, you must isolate the network in which the
-cluster lives. Cloud providers provide instructions for setting up isolated
-networks, which are sometimes branded as "virtual private cloud." Refer to the
-instructions for
-[GCP](https://cloud.google.com/compute/docs/networks-and-firewalls) and
-[AWS](https://aws.amazon.com/vpc/)) for details.
-
-Note that `tf.train.Server` is different from the server created by
-`tensorflow/serving` (the default binary for which is called `ModelServer`).
-By default, `ModelServer` also has no built-in mechanism for authentication.
-Connecting it to an untrusted network allows anyone on this network to run the
-graphs known to the `ModelServer`. This means that an attacker may run
-graphs using untrusted inputs as described above, but they would not be able to
-execute arbitrary graphs. It is possible to safely expose a `ModelServer`
-directly to an untrusted network, **but only if the graphs it is configured to
-use have been carefully audited to be safe**.
-
-Similar to best practices for other servers, we recommend running any
-`ModelServer` with appropriate privileges (i.e., using a separate user with
-reduced permissions). In the spirit of defense in depth, we recommend
-authenticating requests to any TensorFlow server connected to an untrusted
-network, as well as sandboxing the server to minimize the adverse effects of
-any breach.
-
-## Multitenancy environments
+Therefore, if you run a `tf.train.Server` in your network, anybody with access
+to the network can execute arbitrary code with the privileges of the user
+running the `tf.train.Server`.
+
+## Untrusted inputs during training and prediction
+
+TensorFlow supports a wide range of input data formats. For example, it can
+process images, audio, videos, and text. There are several modules specialized
+in taking those formats, modifying them, and/or converting them to intermediate
+formats that can be processed by TensorFlow.
+
+These modifications and conversions are handled by a variety of libraries that
+have different security properties and provide different levels of confidence
+when dealing with untrusted data. Based on the security history of these
+libraries we consider that it is safe to work with untrusted inputs for PNG,
+BMP, GIF, WAV, RAW, RAW\_PADDED, CSV and PROTO formats. All other input formats,
+including tensorflow-io, should be sandboxed if used to process untrusted data.
+
+For example, if an attacker were to upload a malicious video file, they could
+potentially exploit a vulnerability in the TensorFlow code that handles videos,
+which could allow them to execute arbitrary code on the system running
+TensorFlow.
+
+It is important to keep TensorFlow up to date with the latest security patches
+and follow the sandboxing guideline above to protect against these types of
+vulnerabilities.
+
+## Security properties of execution modes
+
+TensorFlow has several execution modes, with Eager mode being the default in v2.
+Eager mode lets users write imperative-style statements that can be easily
+inspected and debugged and it is intended to be used during the development
+phase.
+
+As part of the differences that make Eager mode easier to debug, the [shape
+inference
+functions](https://www.tensorflow.org/guide/create_op#define_the_op_interface)
+are skipped, and any checks implemented inside the shape inference code are not
+executed.
+
+The security impact of skipping those checks should be low, since the attack
+scenario would require a malicious user to be able to control the model, which,
+as stated above, is already equivalent to code execution. In any case, the
+recommendation is not to serve models using Eager mode since it also has
+performance limitations.
+
+## Multi-Tenant environments
 
 It is possible to run multiple TensorFlow models in parallel. For example,
 `ModelServer` collates all computation graphs exposed to it (from multiple
-`SavedModel`) and executes them in parallel on available executors. A denial of
-service caused by one model could bring down the entire server, but we don't
-consider this as a high impact vulnerability, given that there exists solutions
-to prevent this from happening (e.g., rate limits, ACLs, monitors to restart
-broken servers).
-
-However, it is a critical vulnerability if a model could be manipulated such
-that it would output parameters of another model (or itself!) or data that
-belongs to another model.
-
-Models that also run on accelerators could be abused to do hardware damage or to
-leak data that exists on the accelerators from previous executions, if not
-cleared.
-
-## Vulnerabilities in TensorFlow
-
-TensorFlow is a large and complex system. It also depends on a large set of
-third party libraries (e.g., `numpy`, `libjpeg-turbo`, PNG parsers, `protobuf`).
-It is possible that TensorFlow or its dependencies may contain vulnerabilities
-that would allow triggering unexpected or dangerous behavior with specially
-crafted inputs.
-
-Given TensorFlow's flexibility, it is possible to specify computation graphs
-which exhibit unexpected or unwanted behavior. The fact that TensorFlow models
-can perform arbitrary computations means that they may read and write files,
-communicate via the network, produce deadlocks and infinite loops, or run out of
-memory. It is only when these behaviors are outside the specifications of the
-operations involved that such behavior is a vulnerability.
-
-A `FileWriter` writing a file is not unexpected behavior and therefore is not a
-vulnerability in TensorFlow. A `MatMul` allowing arbitrary binary code execution
-**is** a vulnerability.
-
-This is more subtle from a system perspective. For example, it is easy to cause
-a TensorFlow process to try to allocate more memory than available by specifying
-a computation graph containing an ill-considered `tf.tile` operation. TensorFlow
-should exit cleanly in this case (it would raise an exception in Python, or
-return an error `Status` in C++). However, if the surrounding system is not
-expecting the possibility, such behavior could be used in a denial of service
-attack (or worse). Because TensorFlow behaves correctly, this is not a
-vulnerability in TensorFlow (although it would be a vulnerability of this
-hypothetical system).
-
-As a general rule, it is incorrect behavior for TensorFlow to access memory it
-does not own, or to terminate in an unclean way. Bugs in TensorFlow that lead to
-such behaviors constitute a vulnerability.
-
-One of the most critical parts of any system is input handling. If malicious
-input can trigger side effects or incorrect behavior, this is a bug, and likely
-a vulnerability.
-
-**Note**: Assertion failures used to be considered a vulnerability in
-TensorFlow. If an assertion failure  only leads to program termination and no
-other exploits, we will no longer consider assertion failures (e.g.,
-`CHECK`-fails) as vulnerabilities. However, if the assertion failure occurs only
-in debug mode (e.g., `DCHECK`) and in production-optimized mode the issue turns
-into other code weakness(e.g., heap overflow, etc.), then we will consider
-this to be a vulnerability. We recommend reporters to try to maximize the impact
-of the vulnerability report (see also [the Google VRP
-rules](https://bughunters.google.com/about/rules/6625378258649088/google-and-alphabet-vulnerability-reward-program-vrp-rules)
-and [the Google OSS VRP
-rules](https://bughunters.google.com/about/rules/6521337925468160/google-open-source-software-vulnerability-reward-program-rules)).
-
-**Note**: Although the iterative exploration of TF API via fuzzing
-`tf.raw_ops.xxx` symbols is the best way to uncover code weakness, please bear
-in mind that this is not a typical use case that has security implications. It
-is better to try to translate the vulnerability to something that can be
-exploited during training or inference of a model (i.e., build a model that when
-given a specific input would produce unwanted behavior). Alternatively, if the
-TensorFlow API is only used in ancillary tooling, consider the environment where
-the tool would run. For example, if `saved_model_cli` tool would crash on
-parsing a `SavedModel` that is not considered a vulnerability but a bug (since
-the user can use other ways to inspect the model if needed). However, it would
-be a vulnerability if passing a `SavedModel` to `saved_model_cli` would result
-in opening a new network connection, corrupting CPU state, or other forms of
-unwanted behavior.
+`SavedModel`) and executes them in parallel on available executors. Running
+TensorFlow in a multitenant design mixes the risks described above with the
+inherent ones from multitenant configurations. The primary areas of concern are
+tenant isolation, resource allocation, model sharing and hardware attacks.
 
-## Reporting vulnerabilities
+### Tenant isolation
 
-Please use [Google Bug Hunters reporting form](https://g.co/vulnz) to report security
-related issues.
+Since any tenants or users providing models, graphs or checkpoints can execute
+code in the context of the TensorFlow service, it is important to design
+isolation mechanisms that prevent unwanted access to other tenants' data.
 
-Please use a descriptive title for your report.
+Network isolation between different models is also important not only to prevent
+unauthorized access to data or models, but also to prevent malicious users or
+tenants from sending graphs to execute under another tenant’s identity.
 
-In addition, please include the following information along with your report:
+The isolation mechanisms are the responsibility of the users to design and
+implement, and therefore security issues deriving from their absence are not
+considered a vulnerability in TensorFlow.
+
+### Resource allocation
+
+A denial of service caused by one model could bring down the entire server, but
+we don't consider this as a vulnerability, given that models can exhaust
+resources in many different ways and solutions exist to prevent this from
+happening (e.g., rate limits, ACLs, monitors to restart broken servers).
+
+### Model sharing
+
+If the multitenant design allows sharing models, make sure that tenants and
+users are aware of the security risks detailed here and that they are going to
+be practically running code provided by other users. Currently there are no good
+ways to detect malicious models/graphs/checkpoints, so the recommended way to
+mitigate the risk in this scenario is to sandbox the model execution.
+
+### Hardware attacks
+
+Physical GPUs or TPUs can also be the target of attacks. [Published
+research](https://scholar.google.com/scholar?q=gpu+side+channel) shows that it
+might be possible to use side channel attacks on the GPU to leak data from other
+running models or processes in the same system. GPUs can also have
+implementation bugs that might allow attackers to leave malicious code running
+and leak or tamper with applications from other users. Please report
+vulnerabilities to the vendor of the affected hardware accelerator.
+
+## Reporting vulnerabilities
 
-*   Your name and affiliation (if any).
-*   A description of the technical details of the vulnerabilities. It is very
-    important to let us know how we can reproduce your findings.
-*   A minimal example of the vulnerability.
-*   An explanation of who can exploit this vulnerability, and what they gain
-    when doing so -- write an attack scenario. This will help us evaluate your
-    report quickly, especially if the issue is complex.
-*   Whether this vulnerability is public or known to third parties. If it is,
+### Vulnerabilities in TensorFlow
+
+This document covers different use cases for TensorFlow together with comments
+on whether these uses are recommended or considered safe, or where we recommend
+some form of isolation when dealing with untrusted data. As a result, this
+document also outlines what issues we consider as TensorFlow security
+vulnerabilities.
+
+We recognize issues as vulnerabilities only when they occur in scenarios that we
+outline as safe; issues that have a security impact only when TensorFlow is used
+in a discouraged way (e.g. running untrusted models or checkpoints, data parsing
+outside of the safe formats, etc.) are not treated as vulnerabilities.
+
+### Reporting process
+
+Please use [Google Bug Hunters reporting form](https://g.co/vulnz) to report
+security vulnerabilities. Please include the following information along with
+your report:
+
+  - A descriptive title.
+  - Your name and affiliation (if any).
+  - A description of the technical details of the vulnerabilities.
+  - A minimal example of the vulnerability. It is very important to let us know
+    how we can reproduce your findings. For memory corruption triggerable in
+    TensorFlow models, please demonstrate an exploit against one of Alphabet's
+    models in <https://tfhub.dev/>
+  - An explanation of who can exploit this vulnerability, and what they gain
+    when doing so. Write an attack scenario that demonstrates how your issue
+    violates the use cases and security assumptions defined in the threat model.
+    This will help us evaluate your report quickly, especially if the issue is
+    complex.
+  - Whether this vulnerability is public or known to third parties. If it is,
     please provide details.
 
-After the initial reply to your report, the security team will endeavor to keep
-you informed of the progress being made towards a fix and announcement.
-TensorFlow uses the following disclosure process:
-
-* When a report is received, we confirm the issue and determine its severity.
-  **Please try to maximize impact in the report**, going beyond just obtaining
-  unwanted behavior in a fuzzer.
-* If we know of specific third-party services or software based on TensorFlow
-  that require mitigation before publication, those projects will be notified.
-* An advisory is prepared (but not published) which details the problem and
-  steps for mitigation.
-* The vulnerability is fixed and potential workarounds are identified.
-* We will publish a security advisory for all fixed vulnerabilities.
-
-For each vulnerability, we try to ingress it as soon as possible, given the size
-of the team and the number of reports. Vulnerabilities will, in general, be
-batched to be fixed at the same time as a quarterly release.
-
-Security advisories from 2018 to March 2023 are listed
-[here](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/security/README.md).
-From TF 2.13 onwards, we have sunset this list and only use GitHub's Security
-Advisory format, to simplify the post-vulnerability-fix process.  In both
-locations, we credit reporters for identifying security issues, although we keep
-your name confidential if you request it.
+We will try to fix the problems as soon as possible. Vulnerabilities will, in
+general, be batched to be fixed at the same time as a quarterly release. We
+credit reporters for identifying security issues, although we keep your name
+confidential if you request it. Please see the Google Bug Hunters program
+website for more info.

From 397aabb53fa4cba39aed5a32c38c1492db208bb3 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Sun, 1 Oct 2023 02:02:10 -0700
Subject: [PATCH 454/567] compat: Update forward compatibility horizon to
 2023-10-01

PiperOrigin-RevId: 569832476
---
 tensorflow/python/compat/compat.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/python/compat/compat.py b/tensorflow/python/compat/compat.py
index af935567cea39e..bc72d15bda9590 100644
--- a/tensorflow/python/compat/compat.py
+++ b/tensorflow/python/compat/compat.py
@@ -29,7 +29,7 @@
 # This value changes every day with an automatic CL. It can be modified in code
 # via `forward_compatibility_horizon()` or with the environment variable
 # TF_FORWARD_COMPATIBILITY_DELTA_DAYS, which is added to the compatibility date.
-_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2023, 9, 30)
+_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2023, 10, 1)
 _FORWARD_COMPATIBILITY_DELTA_DAYS_VAR_NAME = "TF_FORWARD_COMPATIBILITY_DELTA_DAYS"
 _FORWARD_COMPATIBILITY_DATE_NUMBER = None
 

From 825dcfb0253f214b94904cc9a3e4e79570bb4d29 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Sun, 1 Oct 2023 02:02:11 -0700
Subject: [PATCH 455/567] Update GraphDef version to 1636.

PiperOrigin-RevId: 569832478
---
 tensorflow/core/public/version.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/core/public/version.h b/tensorflow/core/public/version.h
index b77af451af193f..3f24fdcf29c8e9 100644
--- a/tensorflow/core/public/version.h
+++ b/tensorflow/core/public/version.h
@@ -108,7 +108,7 @@ limitations under the License.
 
 #define TF_GRAPH_DEF_VERSION_MIN_PRODUCER 0
 #define TF_GRAPH_DEF_VERSION_MIN_CONSUMER 0
-#define TF_GRAPH_DEF_VERSION 1635  // Updated: 2023/9/30
+#define TF_GRAPH_DEF_VERSION 1636  // Updated: 2023/10/1
 
 // Checkpoint compatibility versions (the versions field in SavedSliceMeta).
 //

From ad39c958186235fa84e6574ec74ab0ae333d9bd7 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Sun, 1 Oct 2023 07:32:42 -0700
Subject: [PATCH 456/567] Consolidates the dimension checking logic into a
 single method.

PiperOrigin-RevId: 569865640
---
 .../auto_sharding/auto_sharding.cc            |   2 +-
 .../auto_sharding_dot_handler.cc              | 222 ++++--------------
 .../auto_sharding/auto_sharding_util.cc       |   4 +-
 .../auto_sharding/auto_sharding_util.h        |  14 +-
 4 files changed, 60 insertions(+), 182 deletions(-)

diff --git a/third_party/xla/xla/hlo/experimental/auto_sharding/auto_sharding.cc b/third_party/xla/xla/hlo/experimental/auto_sharding/auto_sharding.cc
index d2b4dfcbb3f615..a35840689a8592 100644
--- a/third_party/xla/xla/hlo/experimental/auto_sharding/auto_sharding.cc
+++ b/third_party/xla/xla/hlo/experimental/auto_sharding/auto_sharding.cc
@@ -3731,7 +3731,7 @@ void AnnotateShardingWithSimpleHeuristic(
       const DotDimensionNumbers& dot_dnums = inst->dot_dimension_numbers();
       // const auto& lhs_con_dims = dot_dnums.lhs_contracting_dimensions();
       // const auto& rhs_con_dims = dot_dnums.rhs_contracting_dimensions();
-      std::vector<int64_t> lhs_space_dims, rhs_space_dims;
+      tsl::protobuf::RepeatedField<int64_t> lhs_space_dims, rhs_space_dims;
       std::tie(lhs_space_dims, rhs_space_dims) =
           GetSpaceDims(lhs->shape(), rhs->shape(), dot_dnums);
     }
diff --git a/third_party/xla/xla/hlo/experimental/auto_sharding/auto_sharding_dot_handler.cc b/third_party/xla/xla/hlo/experimental/auto_sharding/auto_sharding_dot_handler.cc
index 7090a33dc24fda..522848a3416798 100644
--- a/third_party/xla/xla/hlo/experimental/auto_sharding/auto_sharding_dot_handler.cc
+++ b/third_party/xla/xla/hlo/experimental/auto_sharding/auto_sharding_dot_handler.cc
@@ -39,6 +39,8 @@ limitations under the License.
 namespace xla {
 namespace spmd {
 
+using DimMap = StableHashMap</*instr. dim idx*/ int, /* mesh dim idx*/ int>;
+
 void AppendNewStrategy(const HloInstruction* ins, const std::string& name,
                        const HloSharding& output_spec,
                        absl::Span<const HloSharding> input_specs,
@@ -108,22 +110,27 @@ class DotHandler {
     CHECK_EQ(lhs_batch_dims_.size(), rhs_batch_dims_.size());
   }
 
+  bool CheckDims(const HloInstruction* ins,
+                 const tsl::protobuf::RepeatedField<int64_t>& instr_dims,
+                 const DimMap& dim_map) const {
+    for (const auto& [instr_dim_idx, mesh_dim_idx] : dim_map) {
+      auto instr_dim = instr_dims.at(instr_dim_idx);
+      auto shape_dim = ins->shape().dimensions().at(instr_dim);
+      auto mesh_dim = device_mesh_.dim(mesh_dim_idx);
+      if (shape_dim < mesh_dim) return false;
+      if (solver_option_.only_allow_divisible_intermediate &&
+          !IsDivisible(shape_dim, mesh_dim))
+        return false;
+    }
+    return true;
+  }
+
   void SplitLhsSpaceRhsSpace(int mesh_dim0, int mesh_dim1) {
     for (int64_t i = 0; i < lhs_space_dims_.size(); ++i) {
       for (int64_t j = 0; j < rhs_space_dims_.size(); ++j) {
-        if (lhs_->shape().dimensions().at(lhs_space_dims_.at(i)) <
-                device_mesh_.dim(mesh_dim0) ||
-            rhs_->shape().dimensions().at(rhs_space_dims_.at(j)) <
-                device_mesh_.dim(mesh_dim1)) {
+        if (!CheckDims(lhs_, lhs_space_dims_, {{i, mesh_dim0}}) ||
+            !CheckDims(rhs_, rhs_space_dims_, {{j, mesh_dim1}}))
           continue;
-        }
-        if (solver_option_.only_allow_divisible_intermediate &&
-            (!IsDivisible(lhs_->shape().dimensions().at(lhs_space_dims_.at(i)),
-                          device_mesh_.dim(mesh_dim0)) ||
-             !IsDivisible(rhs_->shape().dimensions().at(rhs_space_dims_.at(j)),
-                          device_mesh_.dim(mesh_dim1)))) {
-          continue;
-        }
         std::string name =
             absl::StrFormat("SS = SR x RS @ {%d,%d}", mesh_dim0, mesh_dim1);
         HloSharding output_spec =
@@ -146,19 +153,8 @@ class DotHandler {
   void SplitLhsSpaceOnly(int mesh_dim0, int mesh_dim1) {
     for (int64_t i = 0; i < lhs_space_dims_.size(); ++i) {
       for (int64_t j = i + 1; j < lhs_space_dims_.size(); ++j) {
-        if (lhs_->shape().dimensions().at(lhs_space_dims_.at(i)) <
-                device_mesh_.dim(mesh_dim0) ||
-            lhs_->shape().dimensions().at(lhs_space_dims_.at(j)) <
-                device_mesh_.dim(mesh_dim1)) {
+        if (!CheckDims(lhs_, lhs_space_dims_, {{i, mesh_dim0}, {j, mesh_dim1}}))
           continue;
-        }
-        if (solver_option_.only_allow_divisible_intermediate &&
-            (!IsDivisible(lhs_->shape().dimensions().at(lhs_space_dims_.at(i)),
-                          device_mesh_.dim(mesh_dim0)) ||
-             !IsDivisible(lhs_->shape().dimensions().at(lhs_space_dims_.at(j)),
-                          device_mesh_.dim(mesh_dim1)))) {
-          continue;
-        }
         std::string name =
             absl::StrFormat("SSR = SSR x RR @ {%d,%d}", mesh_dim0, mesh_dim1);
         HloSharding output_spec =
@@ -178,19 +174,8 @@ class DotHandler {
   void SplitRhsSpaceOnly(int mesh_dim0, int mesh_dim1) {
     for (int64_t i = 0; i < rhs_space_dims_.size(); ++i) {
       for (int64_t j = i + 1; j < rhs_space_dims_.size(); ++j) {
-        if (rhs_->shape().dimensions().at(rhs_space_dims_.at(i)) <
-                device_mesh_.dim(mesh_dim0) ||
-            rhs_->shape().dimensions().at(rhs_space_dims_.at(j)) <
-                device_mesh_.dim(mesh_dim1)) {
+        if (!CheckDims(rhs_, rhs_space_dims_, {{i, mesh_dim0}, {j, mesh_dim1}}))
           continue;
-        }
-        if (solver_option_.only_allow_divisible_intermediate &&
-            (!IsDivisible(rhs_->shape().dimensions().at(rhs_space_dims_.at(i)),
-                          device_mesh_.dim(mesh_dim0)) ||
-             !IsDivisible(rhs_->shape().dimensions().at(rhs_space_dims_.at(j)),
-                          device_mesh_.dim(mesh_dim1)))) {
-          continue;
-        }
         std::string name =
             absl::StrFormat("RSS = RR x RSS @ {%d,%d}", mesh_dim0, mesh_dim1);
         HloSharding output_spec = Tile(
@@ -217,20 +202,9 @@ class DotHandler {
                           mesh_dim1, mesh_dim1);
       for (int64_t i = 0; i < lhs_space_dims_.size(); ++i) {
         for (int64_t j = 0; j < lhs_con_dims_.size(); ++j) {
-          if (lhs_->shape().dimensions().at(lhs_space_dims_.at(i)) <
-                  device_mesh_.dim(mesh_dim0) ||
-              lhs_->shape().dimensions().at(lhs_con_dims_.at(j)) <
-                  device_mesh_.dim(mesh_dim1)) {
+          if (!CheckDims(lhs_, lhs_space_dims_, {{i, mesh_dim0}}) ||
+              !CheckDims(lhs_, lhs_con_dims_, {{j, mesh_dim1}}))
             continue;
-          }
-          if (solver_option_.only_allow_divisible_intermediate &&
-              (!IsDivisible(
-                   lhs_->shape().dimensions().at(lhs_space_dims_.at(i)),
-                   device_mesh_.dim(mesh_dim0)) ||
-               !IsDivisible(lhs_->shape().dimensions().at(lhs_con_dims_.at(j)),
-                            device_mesh_.dim(mesh_dim1)))) {
-            continue;
-          }
 
           HloSharding output_spec = Tile(ins_->shape(), {space_base_dim_ + i},
                                          {mesh_dim0}, device_mesh_);
@@ -258,20 +232,9 @@ class DotHandler {
                           mesh_dim1, mesh_dim0);
       for (int64_t i = 0; i < rhs_space_dims_.size(); ++i) {
         for (int64_t j = 0; j < lhs_con_dims_.size(); ++j) {
-          if (rhs_->shape().dimensions().at(rhs_space_dims_.at(i)) <
-                  device_mesh_.dim(mesh_dim1) ||
-              lhs_->shape().dimensions().at(lhs_con_dims_.at(j)) <
-                  device_mesh_.dim(mesh_dim0)) {
+          if (!CheckDims(rhs_, rhs_space_dims_, {{i, mesh_dim1}}) ||
+              !CheckDims(lhs_, lhs_con_dims_, {{j, mesh_dim0}}))
             continue;
-          }
-          if (solver_option_.only_allow_divisible_intermediate &&
-              (!IsDivisible(
-                   rhs_->shape().dimensions().at(rhs_space_dims_.at(i)),
-                   device_mesh_.dim(mesh_dim1)) ||
-               !IsDivisible(lhs_->shape().dimensions().at(lhs_con_dims_.at(j)),
-                            device_mesh_.dim(mesh_dim0)))) {
-            continue;
-          }
           HloSharding output_spec =
               Tile(ins_->shape(),
                    {space_base_dim_ +
@@ -299,15 +262,7 @@ class DotHandler {
                          [](int64_t size) { return size > 1; }) == 1) {
       for (int64_t i = 0; i < lhs_batch_dims_.size(); ++i) {
         for (int64_t j = 0; j < device_mesh_.num_dimensions(); ++j) {
-          if (lhs_->shape().dimensions().at(lhs_batch_dims_.at(i)) <
-              device_mesh_.dim(j)) {
-            continue;
-          }
-          if (solver_option_.only_allow_divisible_intermediate &&
-              !IsDivisible(lhs_->shape().dimensions().at(lhs_batch_dims_.at(i)),
-                           device_mesh_.dim(j))) {
-            continue;
-          }
+          if (!CheckDims(lhs_, lhs_batch_dims_, {{i, j}})) continue;
           std::string name = absl::StrFormat("Sb_%d = Sb x Sb @ {%d}", i, j);
           HloSharding output_spec = Tile(ins_->shape(), {i}, {j}, device_mesh_);
           HloSharding lhs_spec =
@@ -325,19 +280,8 @@ class DotHandler {
   void SplitTwoBatchDims(int mesh_dim0, int mesh_dim1) {
     if (lhs_batch_dims_.size() == 2 && device_mesh_.dim(mesh_dim0) > 1 &&
         device_mesh_.dim(mesh_dim1) > 1) {
-      if (lhs_->shape().dimensions().at(lhs_batch_dims_.at(0)) <
-              device_mesh_.dim(mesh_dim0) ||
-          lhs_->shape().dimensions().at(lhs_batch_dims_.at(1)) <
-              device_mesh_.dim(mesh_dim1)) {
-        return;
-      }
-      if (solver_option_.only_allow_divisible_intermediate &&
-          (!IsDivisible(lhs_->shape().dimensions().at(lhs_batch_dims_.at(0)),
-                        device_mesh_.dim(mesh_dim0)) ||
-           !IsDivisible(lhs_->shape().dimensions().at(lhs_batch_dims_.at(1)),
-                        device_mesh_.dim(mesh_dim1)))) {
+      if (!CheckDims(lhs_, lhs_batch_dims_, {{0, mesh_dim0}, {1, mesh_dim1}}))
         return;
-      }
       std::string name =
           absl::StrFormat("Sb = Sb x Sb @ {%d,%d}", mesh_dim0, mesh_dim1);
       HloSharding output_spec =
@@ -360,21 +304,9 @@ class DotHandler {
           absl::StrFormat("SbSi = SbSi x SbR @ {%d,%d}", mesh_dim0, mesh_dim1);
       for (int64_t i = 0; i < lhs_space_dims_.size(); ++i) {
         for (int64_t j = 0; j < lhs_batch_dims_.size(); ++j) {
-          if (lhs_->shape().dimensions().at(lhs_space_dims_.at(i)) <
-                  device_mesh_.dim(mesh_dim0) ||
-              lhs_->shape().dimensions().at(lhs_batch_dims_.at(j)) <
-                  device_mesh_.dim(mesh_dim1)) {
-            continue;
-          }
-          if (solver_option_.only_allow_divisible_intermediate &&
-              (!IsDivisible(
-                   lhs_->shape().dimensions().at(lhs_space_dims_.at(i)),
-                   device_mesh_.dim(mesh_dim0)) ||
-               !IsDivisible(
-                   lhs_->shape().dimensions().at(lhs_batch_dims_.at(j)),
-                   device_mesh_.dim(mesh_dim1)))) {
+          if (!CheckDims(lhs_, lhs_space_dims_, {{i, mesh_dim0}}) ||
+              !CheckDims(lhs_, lhs_batch_dims_, {{j, mesh_dim1}}))
             continue;
-          }
           HloSharding output_spec =
               Tile(ins_->shape(), {j, space_base_dim_ + i},
                    {mesh_dim0, mesh_dim1}, device_mesh_);
@@ -398,21 +330,9 @@ class DotHandler {
           absl::StrFormat("SbSj = SbR x SbSj @ {%d,%d}", mesh_dim0, mesh_dim1);
       for (int64_t i = 0; i < rhs_space_dims_.size(); ++i) {
         for (int64_t j = 0; j < lhs_batch_dims_.size(); ++j) {
-          if (rhs_->shape().dimensions().at(rhs_space_dims_.at(i)) <
-                  device_mesh_.dim(mesh_dim1) ||
-              lhs_->shape().dimensions().at(lhs_batch_dims_.at(j)) <
-                  device_mesh_.dim(mesh_dim0)) {
+          if (!CheckDims(rhs_, rhs_space_dims_, {{i, mesh_dim1}}) ||
+              !CheckDims(lhs_, lhs_batch_dims_, {{j, mesh_dim0}}))
             continue;
-          }
-          if (solver_option_.only_allow_divisible_intermediate &&
-              (!IsDivisible(
-                   rhs_->shape().dimensions().at(rhs_space_dims_.at(i)),
-                   device_mesh_.dim(mesh_dim1)) ||
-               !IsDivisible(
-                   lhs_->shape().dimensions().at(lhs_batch_dims_.at(j)),
-                   device_mesh_.dim(mesh_dim0)))) {
-            continue;
-          }
           HloSharding output_spec =
               Tile(ins_->shape(),
                    {j, space_base_dim_ +
@@ -439,20 +359,9 @@ class DotHandler {
                           mesh_dim0, mesh_dim1, mesh_dim1);
       for (int64_t i = 0; i < lhs_con_dims_.size(); ++i) {
         for (int64_t j = 0; j < lhs_batch_dims_.size(); ++j) {
-          if (lhs_->shape().dimensions().at(lhs_con_dims_.at(i)) <
-                  device_mesh_.dim(mesh_dim1) ||
-              lhs_->shape().dimensions().at(lhs_batch_dims_.at(j)) <
-                  device_mesh_.dim(mesh_dim0)) {
-            continue;
-          }
-          if (solver_option_.only_allow_divisible_intermediate &&
-              (!IsDivisible(lhs_->shape().dimensions().at(lhs_con_dims_.at(i)),
-                            device_mesh_.dim(mesh_dim1)) ||
-               !IsDivisible(
-                   lhs_->shape().dimensions().at(lhs_batch_dims_.at(j)),
-                   device_mesh_.dim(mesh_dim0)))) {
+          if (!CheckDims(lhs_, lhs_con_dims_, {{i, mesh_dim1}}) ||
+              !CheckDims(lhs_, lhs_batch_dims_, {{j, mesh_dim0}}))
             continue;
-          }
           HloSharding output_spec =
               Tile(ins_->shape(), {j}, {mesh_dim0}, device_mesh_);
           HloSharding lhs_spec =
@@ -482,27 +391,10 @@ class DotHandler {
                           mesh_dim0, mesh_dim1, mesh_dim0, mesh_dim1);
       for (int64_t i = 0; i < lhs_con_dims_.size(); ++i) {
         for (int64_t j = i + 1; j < lhs_con_dims_.size(); ++j) {
-          if (lhs_->shape().dimensions().at(lhs_con_dims_.at(i)) <
-                  device_mesh_.dim(mesh_dim0) ||
-              lhs_->shape().dimensions().at(lhs_con_dims_.at(j)) <
-                  device_mesh_.dim(mesh_dim1) ||
-              rhs_->shape().dimensions().at(rhs_con_dims_.at(i)) <
-                  device_mesh_.dim(mesh_dim0) ||
-              rhs_->shape().dimensions().at(rhs_con_dims_.at(j)) <
-                  device_mesh_.dim(mesh_dim1)) {
-            continue;
-          }
-          if (solver_option_.only_allow_divisible_intermediate &&
-              (!IsDivisible(lhs_->shape().dimensions().at(lhs_con_dims_.at(i)),
-                            device_mesh_.dim(mesh_dim0)) ||
-               !IsDivisible(lhs_->shape().dimensions().at(lhs_con_dims_.at(j)),
-                            device_mesh_.dim(mesh_dim1)) ||
-               !IsDivisible(rhs_->shape().dimensions().at(rhs_con_dims_.at(i)),
-                            device_mesh_.dim(mesh_dim0)) ||
-               !IsDivisible(rhs_->shape().dimensions().at(rhs_con_dims_.at(j)),
-                            device_mesh_.dim(mesh_dim1)))) {
+          if (!CheckDims(lhs_, lhs_con_dims_,
+                         {{i, mesh_dim0}, {j, mesh_dim1}}) ||
+              !CheckDims(rhs_, rhs_con_dims_, {{i, mesh_dim0}, {j, mesh_dim1}}))
             continue;
-          }
           HloSharding output_spec = HloSharding::Replicate();
           HloSharding lhs_spec =
               Tile(lhs_->shape(), {lhs_con_dims_[i], lhs_con_dims_[j]},
@@ -526,15 +418,7 @@ class DotHandler {
       std::string name = absl::StrFormat("RR = RS x SR @ {%d} (allreduce @ %d)",
                                          mesh_dim0, mesh_dim0);
       for (int64_t i = 0; i < lhs_con_dims_.size(); ++i) {
-        if (lhs_->shape().dimensions().at(lhs_con_dims_.at(i)) <
-            device_mesh_.dim(mesh_dim0)) {
-          continue;
-        }
-        if (solver_option_.only_allow_divisible_intermediate &&
-            !IsDivisible(lhs_->shape().dimensions().at(lhs_con_dims_.at(i)),
-                         device_mesh_.dim(mesh_dim0))) {
-          continue;
-        }
+        if (!CheckDims(lhs_, lhs_con_dims_, {{i, mesh_dim0}})) continue;
         HloSharding output_spec = HloSharding::Replicate();
         HloSharding lhs_spec =
             Tile(lhs_->shape(), {lhs_con_dims_[i]}, {mesh_dim0}, device_mesh_);
@@ -614,25 +498,19 @@ class DotHandler {
                          [](int64_t size) { return size > 1; }) > 1) {
       int mesh_dim = 0;
       for (int64_t i = 0; i < lhs_batch_dims_.size(); ++i) {
-          if (rhs_->shape().dimensions().at(lhs_batch_dims_.at(i)) <
-              device_mesh_.dim(mesh_dim)) {
-          continue;
-          }
-          if (solver_option_.only_allow_divisible_intermediate &&
-              !IsDivisible(rhs_->shape().dimensions().at(lhs_batch_dims_.at(i)),
-                           device_mesh_.dim(mesh_dim))) {
+          if (!CheckDims(lhs_, lhs_batch_dims_, {{i, mesh_dim}}) ||
+              !CheckDims(rhs_, rhs_batch_dims_, {{i, mesh_dim}}))
           continue;
-          }
-        std::string name =
-            absl::StrFormat("Sb_%d = Sb x Sb @ {%d} 1d", i, mesh_dim);
-        HloSharding output_spec =
-            Tile(ins_->shape(), {i}, {mesh_dim}, device_mesh_1d_);
-        HloSharding lhs_spec = Tile(lhs_->shape(), {lhs_batch_dims_[i]},
-                                    {mesh_dim}, device_mesh_1d_);
-        HloSharding rhs_spec = Tile(rhs_->shape(), {rhs_batch_dims_[i]},
-                                    {mesh_dim}, device_mesh_1d_);
-        AppendNewStrategy(ins_, name, output_spec, {lhs_spec, rhs_spec}, 0, 0,
-                          cluster_env_, strategy_map_, strategies_);
+          std::string name =
+              absl::StrFormat("Sb_%d = Sb x Sb @ {%d} 1d", i, mesh_dim);
+          HloSharding output_spec =
+              Tile(ins_->shape(), {i}, {mesh_dim}, device_mesh_1d_);
+          HloSharding lhs_spec = Tile(lhs_->shape(), {lhs_batch_dims_[i]},
+                                      {mesh_dim}, device_mesh_1d_);
+          HloSharding rhs_spec = Tile(rhs_->shape(), {rhs_batch_dims_[i]},
+                                      {mesh_dim}, device_mesh_1d_);
+          AppendNewStrategy(ins_, name, output_spec, {lhs_spec, rhs_spec}, 0, 0,
+                            cluster_env_, strategy_map_, strategies_);
       }
     }
   }
@@ -748,7 +626,7 @@ class DotHandler {
   // Dimension information
   const DotDimensionNumbers& dot_dnums_;
   int64_t space_base_dim_;
-  std::vector<int64_t> lhs_space_dims_, rhs_space_dims_;
+  tsl::protobuf::RepeatedField<int64_t> lhs_space_dims_, rhs_space_dims_;
   const tsl::protobuf::RepeatedField<int64_t>& lhs_con_dims_;
   const tsl::protobuf::RepeatedField<int64_t>& rhs_con_dims_;
   const tsl::protobuf::RepeatedField<int64_t>& lhs_batch_dims_;
diff --git a/third_party/xla/xla/hlo/experimental/auto_sharding/auto_sharding_util.cc b/third_party/xla/xla/hlo/experimental/auto_sharding/auto_sharding_util.cc
index 05cc51657dcc96..25692d4564e8cb 100644
--- a/third_party/xla/xla/hlo/experimental/auto_sharding/auto_sharding_util.cc
+++ b/third_party/xla/xla/hlo/experimental/auto_sharding/auto_sharding_util.cc
@@ -498,7 +498,7 @@ void BatchDimMapForward(const std::vector<HloInstruction*>& instructions,
             ins->dot_dimension_numbers().lhs_batch_dimensions();
         const auto& rhs_batch_dims =
             ins->dot_dimension_numbers().rhs_batch_dimensions();
-        std::vector<int64_t> lhs_space_dims, rhs_space_dims;
+        tsl::protobuf::RepeatedField<int64_t> lhs_space_dims, rhs_space_dims;
         std::tie(lhs_space_dims, rhs_space_dims) =
             GetSpaceDims(lhs->shape(), rhs->shape(), dot_dnums);
         // This part assumes that the dot has been through the dot decomposer,
@@ -759,7 +759,7 @@ void BatchDimMapBackward(const std::vector<HloInstruction*>& instructions,
             ins->dot_dimension_numbers().lhs_batch_dimensions();
         const auto& rhs_batch_dims =
             ins->dot_dimension_numbers().rhs_batch_dimensions();
-        std::vector<int64_t> lhs_space_dims, rhs_space_dims;
+        tsl::protobuf::RepeatedField<int64_t> lhs_space_dims, rhs_space_dims;
         std::tie(lhs_space_dims, rhs_space_dims) =
             GetSpaceDims(lhs->shape(), rhs->shape(), dot_dnums);
 
diff --git a/third_party/xla/xla/hlo/experimental/auto_sharding/auto_sharding_util.h b/third_party/xla/xla/hlo/experimental/auto_sharding/auto_sharding_util.h
index 38926d9fa35afa..c6aac8d53abcea 100644
--- a/third_party/xla/xla/hlo/experimental/auto_sharding/auto_sharding_util.h
+++ b/third_party/xla/xla/hlo/experimental/auto_sharding/auto_sharding_util.h
@@ -185,18 +185,18 @@ inline bool DimensionsEqual(const Shape& a, const Shape& b) {
  * HloInstruction Utility
  */
 // Get the space dimensions of a dot instruction.
-inline std::pair<std::vector<int64_t>, std::vector<int64_t>> GetSpaceDims(
-    const Shape& lhs_shape, const Shape& rhs_shape,
-    const DotDimensionNumbers& dnums) {
-  std::vector<int64_t> lhs_space_dims;
-  std::vector<int64_t> rhs_space_dims;
+inline std::pair<tsl::protobuf::RepeatedField<int64_t>,
+                 tsl::protobuf::RepeatedField<int64_t>>
+GetSpaceDims(const Shape& lhs_shape, const Shape& rhs_shape,
+             const DotDimensionNumbers& dnums) {
+  tsl::protobuf::RepeatedField<int64_t> lhs_space_dims, rhs_space_dims;
 
   for (int64_t i = 0; i < lhs_shape.rank(); ++i) {
     if (absl::c_linear_search(dnums.lhs_batch_dimensions(), i) ||
         absl::c_linear_search(dnums.lhs_contracting_dimensions(), i)) {
       continue;
     }
-    lhs_space_dims.push_back(i);
+    lhs_space_dims.Add(i);
   }
 
   for (int64_t i = 0; i < rhs_shape.rank(); ++i) {
@@ -204,7 +204,7 @@ inline std::pair<std::vector<int64_t>, std::vector<int64_t>> GetSpaceDims(
         absl::c_linear_search(dnums.rhs_contracting_dimensions(), i)) {
       continue;
     }
-    rhs_space_dims.push_back(i);
+    rhs_space_dims.Add(i);
   }
   return std::make_pair(std::move(lhs_space_dims), std::move(rhs_space_dims));
 }

From 59ba591a05928e2da3a4535beb4c2d3d76d4ef0d Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Sun, 1 Oct 2023 20:43:56 -0700
Subject: [PATCH 457/567] Pass along the shape param from
 tf.compat.v1.get_variable to its underlying variable creator. At this point,
 we have already checked that either shape is compatible w/ initial_value, or
 that it is not specified (None). Therefore it should always be safe to pass
 it. Having it can help subsequent nested variable creators save some cycles
 tracing the initializer function when all they need to know is the shape.

PiperOrigin-RevId: 569942484
---
 .../feature_column/feature_column_test.py     | 28 +++++++++++++------
 .../feature_column/feature_column_v2_test.py  | 12 +++++---
 tensorflow/python/ops/variable_scope.py       | 21 ++++++++++----
 3 files changed, 44 insertions(+), 17 deletions(-)

diff --git a/tensorflow/python/feature_column/feature_column_test.py b/tensorflow/python/feature_column/feature_column_test.py
index 3cde6c5657edc1..f8fbd8db7e3b15 100644
--- a/tensorflow/python/feature_column/feature_column_test.py
+++ b/tensorflow/python/feature_column/feature_column_test.py
@@ -4924,16 +4924,19 @@ def test_get_dense_tensor(self, use_safe_embedding_lookup,
       )
 
       def _initializer(shape, dtype, partition_info=None):
+        self.assertEqual(dtypes.float32, dtype)
         if partition_variables:
+          assert partition_info is not None
           self.assertEqual([vocabulary_size, embedding_dimension],
                            partition_info.full_shape)
           self.assertAllEqual((2, embedding_dimension), shape)
+          return array_ops.slice(
+              embedding_values, partition_info.var_offset, shape
+          )
         else:
           self.assertAllEqual((vocabulary_size, embedding_dimension), shape)
           self.assertIsNone(partition_info)
-
-        self.assertEqual(dtypes.float32, dtype)
-        return embedding_values
+          return embedding_values
 
       # Expected lookup result, using combiner='mean'.
       expected_lookups = (
@@ -4976,7 +4979,12 @@ def _initializer(shape, dtype, partition_info=None):
       for v in global_vars:
         self.assertIsInstance(v, variables_lib.Variable)
       with _initialized_session():
-        self.assertAllEqual(embedding_values, global_vars[0])
+        if partition_variables:
+          self.assertAllEqual(
+              embedding_values, array_ops.concat(global_vars, axis=0)
+          )
+        else:
+          self.assertAllEqual(embedding_values, global_vars[0])
         self.assertAllEqual(expected_lookups, self.evaluate(embedding_lookup))
 
       if use_safe_embedding_lookup:
@@ -5783,16 +5791,19 @@ def test_get_dense_tensor(self, use_safe_embedding_lookup,
       )
 
       def _initializer(shape, dtype, partition_info=None):
+        self.assertEqual(dtypes.float32, dtype)
         if partition_variables:
+          assert partition_info is not None
           self.assertEqual([vocabulary_size, embedding_dimension],
                            partition_info.full_shape)
           self.assertAllEqual((2, embedding_dimension), shape)
+          return array_ops.slice(
+              embedding_values, partition_info.var_offset, shape
+          )
         else:
           self.assertAllEqual((vocabulary_size, embedding_dimension), shape)
           self.assertIsNone(partition_info)
-
-        self.assertEqual(dtypes.float32, dtype)
-        return embedding_values
+          return embedding_values
 
       # Expected lookup result, using combiner='mean'.
       expected_lookups_a = (
@@ -5842,10 +5853,11 @@ def _initializer(shape, dtype, partition_info=None):
         self.assertCountEqual(('vars/embedding_weights/part_0:0',
                                'vars/embedding_weights/part_1:0'),
                               tuple([v.name for v in global_vars]))
+        embedding_var = array_ops.concat(global_vars, axis=0)
       else:
         self.assertCountEqual(('vars/embedding_weights:0',),
                               tuple([v.name for v in global_vars]))
-      embedding_var = global_vars[0]
+        embedding_var = global_vars[0]
 
       self.evaluate(variables_lib.global_variables_initializer())
       self.evaluate(lookup_ops.tables_initializer())
diff --git a/tensorflow/python/feature_column/feature_column_v2_test.py b/tensorflow/python/feature_column/feature_column_v2_test.py
index 21dcbb4452d6c7..cb98bf60c03184 100644
--- a/tensorflow/python/feature_column/feature_column_v2_test.py
+++ b/tensorflow/python/feature_column/feature_column_v2_test.py
@@ -5762,16 +5762,19 @@ def test_get_dense_tensor(self, use_safe_embedding_lookup,
       )
 
       def _initializer(shape, dtype, partition_info=None):
+        self.assertEqual(dtypes.float32, dtype)
         if partition_variables:
+          assert partition_info is not None
           self.assertEqual([vocabulary_size, embedding_dimension],
                            partition_info.full_shape)
           self.assertAllEqual((2, embedding_dimension), shape)
+          return array_ops.slice(
+              embedding_values, partition_info.var_offset, shape
+          )
         else:
           self.assertAllEqual((vocabulary_size, embedding_dimension), shape)
           self.assertIsNone(partition_info)
-
-        self.assertEqual(dtypes.float32, dtype)
-        return embedding_values
+          return embedding_values
 
       # Expected lookup result, using combiner='mean'.
       expected_lookups_a = (
@@ -5821,10 +5824,11 @@ def _initializer(shape, dtype, partition_info=None):
         self.assertCountEqual(('vars/aaa_bbb_shared_embedding/part_0:0',
                                'vars/aaa_bbb_shared_embedding/part_1:0'),
                               tuple([v.name for v in global_vars]))
+        embedding_var = array_ops.concat(global_vars, axis=0)
       else:
         self.assertCountEqual(('vars/aaa_bbb_shared_embedding:0',),
                               tuple([v.name for v in global_vars]))
-      embedding_var = global_vars[0]
+        embedding_var = global_vars[0]
 
       self.evaluate(variables_lib.global_variables_initializer())
       self.evaluate(lookup_ops.tables_initializer())
diff --git a/tensorflow/python/ops/variable_scope.py b/tensorflow/python/ops/variable_scope.py
index 33dd0438fa2f2f..ad066f268a621f 100644
--- a/tensorflow/python/ops/variable_scope.py
+++ b/tensorflow/python/ops/variable_scope.py
@@ -807,7 +807,8 @@ def _get_partitioned_variable(self,
             use_resource=use_resource,
             constraint=constraint,
             synchronization=synchronization,
-            aggregation=aggregation)
+            aggregation=aggregation,
+        )
 
       # pylint: disable=protected-access
       var._set_save_slice_info(
@@ -880,7 +881,8 @@ def _get_single_variable(self,
       raise ValueError("If initializer is a constant, do not specify shape.")
 
     dtype = dtypes.as_dtype(dtype)
-    shape = tensor_shape.as_shape(shape)
+    if shape is not None:
+      shape = tensor_shape.as_shape(shape)
 
     if name in self._vars:
       # Here we handle the case when returning an existing variable.
@@ -901,7 +903,9 @@ def _get_single_variable(self,
         raise ValueError("%s Originally defined at:\n\n%s" %
                          (err_msg, "".join(traceback.format_list(tb))))
       found_var = self._vars[name]
-      if not shape.is_compatible_with(found_var.get_shape()):
+      if shape is not None and not shape.is_compatible_with(
+          found_var.get_shape()
+      ):
         raise ValueError("Trying to share variable %s, but specified shape %s"
                          " and found shape %s." %
                          (name, shape, found_var.get_shape()))
@@ -921,6 +925,11 @@ def _get_single_variable(self,
 
     # Create the tensor to initialize the variable with default value.
     if initializer is None:
+      if shape is None:
+        raise ValueError(
+            f"Variable {name} did not get an initializer, so its `shape`"
+            " argument must be specified."
+        )
       initializer, initializing_from_value = self._get_default_initializer(
           name=name, shape=shape, dtype=dtype)
     # Enter an init scope when creating the initializer.
@@ -932,7 +941,7 @@ def _get_single_variable(self,
         # Instantiate initializer if provided initializer is a type object.
         if tf_inspect.isclass(initializer):
           initializer = initializer()
-        if shape.is_fully_defined():
+        if shape is not None and shape.is_fully_defined():
           if "partition_info" in tf_inspect.getargspec(initializer).args:
             init_val = functools.partial(initializer,
                                          shape.as_list(),
@@ -967,7 +976,9 @@ def _get_single_variable(self,
         constraint=constraint,
         use_resource=use_resource,
         synchronization=synchronization,
-        aggregation=aggregation)
+        aggregation=aggregation,
+        shape=shape,
+    )
     if context.executing_eagerly() and self._store_eager_variables:
       if collections:
         ops.add_to_collections(collections, v)

From 397615b1550c5d73ea9cf0f963de3dfae090b73b Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 2 Oct 2023 02:02:05 -0700
Subject: [PATCH 458/567] Update GraphDef version to 1637.

PiperOrigin-RevId: 569987303
---
 tensorflow/core/public/version.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/core/public/version.h b/tensorflow/core/public/version.h
index 3f24fdcf29c8e9..23ccff90b8d429 100644
--- a/tensorflow/core/public/version.h
+++ b/tensorflow/core/public/version.h
@@ -108,7 +108,7 @@ limitations under the License.
 
 #define TF_GRAPH_DEF_VERSION_MIN_PRODUCER 0
 #define TF_GRAPH_DEF_VERSION_MIN_CONSUMER 0
-#define TF_GRAPH_DEF_VERSION 1636  // Updated: 2023/10/1
+#define TF_GRAPH_DEF_VERSION 1637  // Updated: 2023/10/2
 
 // Checkpoint compatibility versions (the versions field in SavedSliceMeta).
 //

From 038a5416cab5e6daf1729cb78d2d4ca3f54db7d7 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 2 Oct 2023 02:02:10 -0700
Subject: [PATCH 459/567] compat: Update forward compatibility horizon to
 2023-10-02

PiperOrigin-RevId: 569987324
---
 tensorflow/python/compat/compat.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/python/compat/compat.py b/tensorflow/python/compat/compat.py
index bc72d15bda9590..65383715827778 100644
--- a/tensorflow/python/compat/compat.py
+++ b/tensorflow/python/compat/compat.py
@@ -29,7 +29,7 @@
 # This value changes every day with an automatic CL. It can be modified in code
 # via `forward_compatibility_horizon()` or with the environment variable
 # TF_FORWARD_COMPATIBILITY_DELTA_DAYS, which is added to the compatibility date.
-_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2023, 10, 1)
+_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2023, 10, 2)
 _FORWARD_COMPATIBILITY_DELTA_DAYS_VAR_NAME = "TF_FORWARD_COMPATIBILITY_DELTA_DAYS"
 _FORWARD_COMPATIBILITY_DATE_NUMBER = None
 

From 63ab401f8d6f70221de50178b39d754c1801a464 Mon Sep 17 00:00:00 2001
From: Fergus Henderson <fergus@google.com>
Date: Mon, 2 Oct 2023 04:17:15 -0700
Subject: [PATCH 460/567] Use template parameter rather than non-template
 parameter for KernelType.

PiperOrigin-RevId: 570020629
---
 tensorflow/lite/kernels/reduce.cc | 74 +++++++++++++++----------------
 1 file changed, 35 insertions(+), 39 deletions(-)

diff --git a/tensorflow/lite/kernels/reduce.cc b/tensorflow/lite/kernels/reduce.cc
index cd859255d4cf42..8ead4274826d0c 100644
--- a/tensorflow/lite/kernels/reduce.cc
+++ b/tensorflow/lite/kernels/reduce.cc
@@ -429,10 +429,9 @@ void ResolveAxis(const int* axis_data, int axis_count,
   }
 }
 
-template <typename T, typename U>
+template <typename T, typename U, KernelType kernel_type>
 TfLiteStatus Mean(TfLiteContext* context, const OpContext* op_context,
-                  int* temp_index, int* resolved_axis, U* temp_sum,
-                  KernelType kernel_type) {
+                  int* temp_index, int* resolved_axis, U* temp_sum) {
   int num_axis = static_cast<int>(NumElements(op_context->axis));
   auto args = std::tuple(
       GetTensorData<T>(op_context->input), &op_context->input->dims->data[0],
@@ -448,13 +447,12 @@ TfLiteStatus Mean(TfLiteContext* context, const OpContext* op_context,
   return kTfLiteOk;
 }
 
-template <typename T>
+template <typename T, KernelType kernel_type>
 TfLiteStatus QuantizedMeanOrSum(TfLiteContext* context,
                                 const OpContext& op_context,
                                 const OpData* op_data, TfLiteTensor* temp_index,
                                 TfLiteTensor* resolved_axis,
-                                TfLiteTensor* temp_sum, KernelType kernel_type,
-                                bool compute_sum) {
+                                TfLiteTensor* temp_sum, bool compute_sum) {
   int num_axis = static_cast<int>(NumElements(op_context.axis));
   if (kernel_type == kGenericOptimized) {
     TF_LITE_ENSURE(
@@ -635,20 +633,19 @@ TfLiteStatus EvalMean(TfLiteContext* context, TfLiteNode* node) {
 
   switch (op_context.input->type) {
     case kTfLiteFloat32:
-      Mean<float, float>(context, &op_context, GetTensorData<int>(temp_index),
-                         GetTensorData<int>(resolved_axis),
-                         GetTensorData<float>(temp_sum), kernel_type);
+      Mean<float, float, kernel_type>(
+          context, &op_context, GetTensorData<int>(temp_index),
+          GetTensorData<int>(resolved_axis), GetTensorData<float>(temp_sum));
       break;
     case kTfLiteInt32:
-      Mean<int, int64_t>(context, &op_context, GetTensorData<int>(temp_index),
-                         GetTensorData<int>(resolved_axis),
-                         GetTensorData<int64_t>(temp_sum), kernel_type);
+      Mean<int, int64_t, kernel_type>(
+          context, &op_context, GetTensorData<int>(temp_index),
+          GetTensorData<int>(resolved_axis), GetTensorData<int64_t>(temp_sum));
       break;
     case kTfLiteInt64:
-      Mean<int64_t, int64_t>(context, &op_context,
-                             GetTensorData<int>(temp_index),
-                             GetTensorData<int>(resolved_axis),
-                             GetTensorData<int64_t>(temp_sum), kernel_type);
+      Mean<int64_t, int64_t, kernel_type>(
+          context, &op_context, GetTensorData<int>(temp_index),
+          GetTensorData<int>(resolved_axis), GetTensorData<int64_t>(temp_sum));
       break;
     case kTfLiteInt8: {
       TF_LITE_ENSURE_OK(context, EvalQuantizedMean<int8_t>(
@@ -758,10 +755,9 @@ void ReduceAllDims(const T* input_data, const int* input_dims,
 }
 
 // The underlying logic for Reduce Sum/Prod/Max/Min/Any
-template <typename T>
+template <typename T, KernelType kernel_type>
 TfLiteStatus EvalType(TfLiteContext* context, TfLiteNode* node,
-                      OpContext* op_context, KernelType kernel_type,
-                      ReduceType reduce_type) {
+                      OpContext* op_context, ReduceType reduce_type) {
   int64_t num_axis = NumElements(op_context->axis);
   TfLiteTensor* temp_index;
   TF_LITE_ENSURE_OK(context,
@@ -877,32 +873,32 @@ TfLiteStatus EvalGeneric(TfLiteContext* context, TfLiteNode* node) {
   OpContext op_context(context, node);
   switch (op_context.input->type) {
     case kTfLiteFloat32:
-      return EvalType<float>(context, node, &op_context, kernel_type,
-                             reduce_type);
+      return EvalType<float, kernel_type>(context, node, &op_context,
+                                          reduce_type);
       break;
     case kTfLiteInt32:
-      return EvalType<int>(context, node, &op_context, kernel_type,
-                           reduce_type);
+      return EvalType<int, kernel_type>(context, node, &op_context,
+                                        reduce_type);
       break;
     case kTfLiteInt64:
-      return EvalType<int64_t>(context, node, &op_context, kernel_type,
-                               reduce_type);
+      return EvalType<int64_t, kernel_type>(context, node, &op_context,
+                                            reduce_type);
       break;
     case kTfLiteUInt8:
-      return EvalType<uint8_t>(context, node, &op_context, kernel_type,
-                               reduce_type);
+      return EvalType<uint8_t, kernel_type>(context, node, &op_context,
+                                            reduce_type);
       break;
     case kTfLiteInt8:
-      return EvalType<int8_t>(context, node, &op_context, kernel_type,
-                              reduce_type);
+      return EvalType<int8_t, kernel_type>(context, node, &op_context,
+                                           reduce_type);
       break;
     case kTfLiteInt16:
-      return EvalType<int16_t>(context, node, &op_context, kernel_type,
-                               reduce_type);
+      return EvalType<int16_t, kernel_type>(context, node, &op_context,
+                                            reduce_type);
       break;
     case kTfLiteBool:
-      return EvalType<bool>(context, node, &op_context, kernel_type,
-                            reduce_type);
+      return EvalType<bool, kernel_type>(context, node, &op_context,
+                                         reduce_type);
       break;
     default:
       return kTfLiteError;
@@ -937,14 +933,14 @@ TfLiteStatus EvalSum(TfLiteContext* context, TfLiteNode* node) {
     }
 
     if (input->type == kTfLiteUInt8) {
-      return QuantizedMeanOrSum<uint8_t>(context, op_context, op_data,
-                                         temp_index, resolved_axis, temp_sum,
-                                         kernel_type, /*compute_sum=*/true);
+      return QuantizedMeanOrSum<uint8_t, kernel_type>(
+          context, op_context, op_data, temp_index, resolved_axis, temp_sum,
+          /*compute_sum=*/true);
     }
     if (input->type == kTfLiteInt8) {
-      return QuantizedMeanOrSum<int8_t>(context, op_context, op_data,
-                                        temp_index, resolved_axis, temp_sum,
-                                        kernel_type, /*compute_sum=*/true);
+      return QuantizedMeanOrSum<int8_t, kernel_type>(
+          context, op_context, op_data, temp_index, resolved_axis, temp_sum,
+          /*compute_sum=*/true);
     }
   } else {
     return EvalGeneric<kernel_type, kSum>(context, node);

From 9736f56f7d6416b849ca1ce8c6ab1a4a8667df5c Mon Sep 17 00:00:00 2001
From: Ilia Sergachev <sergachev@google.com>
Date: Mon, 2 Oct 2023 07:06:17 -0700
Subject: [PATCH 461/567] [XLA:GPU] Make loaded scalars 1-element tensors in
 the Triton emitter.

PiperOrigin-RevId: 570055575
---
 .../xla/xla/service/gpu/ir_emitter_triton.cc  |  8 +++--
 .../xla/service/gpu/ir_emitter_triton_test.cc | 31 +++++++++++++++++++
 2 files changed, 36 insertions(+), 3 deletions(-)

diff --git a/third_party/xla/xla/service/gpu/ir_emitter_triton.cc b/third_party/xla/xla/service/gpu/ir_emitter_triton.cc
index 6ce0adc2366d4d..94b4c68e0fd928 100644
--- a/third_party/xla/xla/service/gpu/ir_emitter_triton.cc
+++ b/third_party/xla/xla/service/gpu/ir_emitter_triton.cc
@@ -433,9 +433,11 @@ Value EmitParameterLoad(ImplicitLocOpBuilder& b, Value pointer,
                                 mt::EvictionPolicy::NORMAL,
                                 /*isVolatile=*/false);
   }
-  return b.create<mt::LoadOp>(pointer, mt::CacheModifier::NONE,
-                              mt::EvictionPolicy::NORMAL,
-                              /*isVolatile=*/false);
+  return Splat(b,
+               b.create<mt::LoadOp>(pointer, mt::CacheModifier::NONE,
+                                    mt::EvictionPolicy::NORMAL,
+                                    /*isVolatile=*/false),
+               {});
 }
 
 Value EmitConstant(ImplicitLocOpBuilder& b, const HloInstruction& constant) {
diff --git a/third_party/xla/xla/service/gpu/ir_emitter_triton_test.cc b/third_party/xla/xla/service/gpu/ir_emitter_triton_test.cc
index e8c5c0f16663a8..5bf82c04c40c20 100644
--- a/third_party/xla/xla/service/gpu/ir_emitter_triton_test.cc
+++ b/third_party/xla/xla/service/gpu/ir_emitter_triton_test.cc
@@ -869,6 +869,37 @@ ENTRY e {
 )");
 }
 
+TEST_F(TritonGemmTest, SingleElementTileIsHandled) {
+  MatchOptimizedHlo(R"(
+t {
+  p0 = f32[2,7,3]{2,1,0} parameter(0)
+  p1 = s32[2,1]{1,0} parameter(1)
+  c = s32[] constant(1)
+  br0 = s32[2,1]{1,0} broadcast(c), dimensions={}
+  cmp = pred[2,1]{1,0} compare(p1, br0), direction=LT
+  bc0 = pred[2]{0} bitcast(cmp)
+  br1 = pred[2,1,3,3]{3,2,0,1} broadcast(bc0), dimensions={0}
+  cvt = f32[2,1,3,3]{3,2,0,1} convert(br1)
+  bc1 = f32[2,3,3]{2,1,0} bitcast(cvt)
+  ROOT d = f32[2,7,3]{2,1,0} dot(p0, bc1),
+    lhs_batch_dims={0}, lhs_contracting_dims={2},
+    rhs_batch_dims={0}, rhs_contracting_dims={1}
+}
+
+ENTRY e {
+  p0 = f32[2,7,3]{2,1,0} parameter(0)
+  p1 = s32[2,1]{1,0} parameter(1)
+  ROOT r = f32[2,7,3]{2,1,0} fusion(p0, p1), kind=kCustom,
+    calls=t, backend_config={"kind":"__triton_gemm"}
+})",
+                    // This partially optimized HLO will go through the
+                    // autotuner which will run the fusion through the emitter
+                    // multiple times and assign block sizes on success.
+                    R"(
+; CHECK: block_m
+)");
+}
+
 class TritonGemmTestAny : public TritonGemmTest {
  public:
   DebugOptions GetDebugOptionsForTest() override {

From 7ae63482b9cdf51b773a0b52eb2aa6e097aed612 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 2 Oct 2023 07:29:14 -0700
Subject: [PATCH 462/567] Generalizes the Dot Handler's mesh dims to be of
 arbitrary size (though still limited to 2 for now)

PiperOrigin-RevId: 570060379
---
 .../auto_sharding_dot_handler.cc              | 337 ++++++++++--------
 1 file changed, 180 insertions(+), 157 deletions(-)

diff --git a/third_party/xla/xla/hlo/experimental/auto_sharding/auto_sharding_dot_handler.cc b/third_party/xla/xla/hlo/experimental/auto_sharding/auto_sharding_dot_handler.cc
index 522848a3416798..651a91b03bf5c4 100644
--- a/third_party/xla/xla/hlo/experimental/auto_sharding/auto_sharding_dot_handler.cc
+++ b/third_party/xla/xla/hlo/experimental/auto_sharding/auto_sharding_dot_handler.cc
@@ -24,6 +24,7 @@ limitations under the License.
 #include "absl/algorithm/container.h"
 #include "absl/log/check.h"
 #include "absl/strings/str_format.h"
+#include "absl/strings/str_join.h"
 #include "absl/types/span.h"
 #include "xla/array.h"
 #include "xla/hlo/experimental/auto_sharding/auto_sharding.h"
@@ -40,6 +41,7 @@ namespace xla {
 namespace spmd {
 
 using DimMap = StableHashMap</*instr. dim idx*/ int, /* mesh dim idx*/ int>;
+using MeshDims = absl::Span<const int64_t>;
 
 void AppendNewStrategy(const HloInstruction* ins, const std::string& name,
                        const HloSharding& output_spec,
@@ -69,12 +71,12 @@ void AppendNewStrategy(const HloInstruction* ins, const std::string& name,
 }
 
 // Calls the given 'split_func' on all possible mesh dim combinations.
-void SplitWithMeshShape(std::function<void(int, int)> split_func,
+void SplitWithMeshShape(std::function<void(MeshDims)> split_func,
                         absl::Span<const int64_t> mesh_shape) {
   for (int64_t i = 0; i < mesh_shape.size(); ++i) {
     for (int64_t j = (i + 1); j < mesh_shape.size(); ++j) {
-      split_func(i, j);
-      split_func(j, i);
+      split_func({i, j});
+      split_func({j, i});
     }
   }
 }
@@ -125,24 +127,25 @@ class DotHandler {
     return true;
   }
 
-  void SplitLhsSpaceRhsSpace(int mesh_dim0, int mesh_dim1) {
+  void SplitLhsSpaceRhsSpace(MeshDims mesh_dims) {
+    DCHECK_EQ(mesh_dims.size(), 2);
     for (int64_t i = 0; i < lhs_space_dims_.size(); ++i) {
       for (int64_t j = 0; j < rhs_space_dims_.size(); ++j) {
-        if (!CheckDims(lhs_, lhs_space_dims_, {{i, mesh_dim0}}) ||
-            !CheckDims(rhs_, rhs_space_dims_, {{j, mesh_dim1}}))
+        if (!CheckDims(lhs_, lhs_space_dims_, {{i, mesh_dims[0]}}) ||
+            !CheckDims(rhs_, rhs_space_dims_, {{j, mesh_dims[1]}}))
           continue;
-        std::string name =
-            absl::StrFormat("SS = SR x RS @ {%d,%d}", mesh_dim0, mesh_dim1);
+        std::string name = absl::StrFormat("SS = SR x RS @ {%s}",
+                                           absl::StrJoin(mesh_dims, ","));
         HloSharding output_spec =
             Tile(ins_->shape(),
                  {space_base_dim_ + i,
                   space_base_dim_ +
                       static_cast<int64_t>(lhs_space_dims_.size()) + j},
-                 {mesh_dim0, mesh_dim1}, device_mesh_);
+                 mesh_dims, device_mesh_);
         HloSharding lhs_spec = Tile(lhs_->shape(), {lhs_space_dims_[i]},
-                                    {mesh_dim0}, device_mesh_);
+                                    {mesh_dims[0]}, device_mesh_);
         HloSharding rhs_spec = Tile(rhs_->shape(), {rhs_space_dims_[j]},
-                                    {mesh_dim1}, device_mesh_);
+                                    {mesh_dims[1]}, device_mesh_);
 
         AppendNewStrategy(ins_, name, output_spec, {lhs_spec, rhs_spec}, 0, 0,
                           cluster_env_, strategy_map_, strategies_);
@@ -150,19 +153,21 @@ class DotHandler {
     }
   }
 
-  void SplitLhsSpaceOnly(int mesh_dim0, int mesh_dim1) {
+  void SplitLhsSpaceOnly(MeshDims mesh_dims) {
+    DCHECK_EQ(mesh_dims.size(), 2);
     for (int64_t i = 0; i < lhs_space_dims_.size(); ++i) {
       for (int64_t j = i + 1; j < lhs_space_dims_.size(); ++j) {
-        if (!CheckDims(lhs_, lhs_space_dims_, {{i, mesh_dim0}, {j, mesh_dim1}}))
+        if (!CheckDims(lhs_, lhs_space_dims_,
+                       {{i, mesh_dims[0]}, {j, mesh_dims[1]}}))
           continue;
-        std::string name =
-            absl::StrFormat("SSR = SSR x RR @ {%d,%d}", mesh_dim0, mesh_dim1);
+        std::string name = absl::StrFormat("SSR = SSR x RR @ {%s}",
+                                           absl::StrJoin(mesh_dims, ","));
         HloSharding output_spec =
             Tile(ins_->shape(), {space_base_dim_ + i, space_base_dim_ + j},
-                 {mesh_dim0, mesh_dim1}, device_mesh_);
+                 mesh_dims, device_mesh_);
         HloSharding lhs_spec =
             Tile(lhs_->shape(), {lhs_space_dims_[i], lhs_space_dims_[j]},
-                 {mesh_dim0, mesh_dim1}, device_mesh_);
+                 mesh_dims, device_mesh_);
         HloSharding rhs_spec = HloSharding::Replicate();
 
         AppendNewStrategy(ins_, name, output_spec, {lhs_spec, rhs_spec}, 0, 0,
@@ -171,23 +176,25 @@ class DotHandler {
     }
   }
 
-  void SplitRhsSpaceOnly(int mesh_dim0, int mesh_dim1) {
+  void SplitRhsSpaceOnly(MeshDims mesh_dims) {
+    DCHECK_EQ(mesh_dims.size(), 2);
     for (int64_t i = 0; i < rhs_space_dims_.size(); ++i) {
       for (int64_t j = i + 1; j < rhs_space_dims_.size(); ++j) {
-        if (!CheckDims(rhs_, rhs_space_dims_, {{i, mesh_dim0}, {j, mesh_dim1}}))
+        if (!CheckDims(rhs_, rhs_space_dims_,
+                       {{i, mesh_dims[0]}, {j, mesh_dims[1]}}))
           continue;
-        std::string name =
-            absl::StrFormat("RSS = RR x RSS @ {%d,%d}", mesh_dim0, mesh_dim1);
+        std::string name = absl::StrFormat("RSS = RR x RSS @ {%s}",
+                                           absl::StrJoin(mesh_dims, ","));
         HloSharding output_spec = Tile(
             ins_->shape(),
             {space_base_dim_ + static_cast<int64_t>(lhs_space_dims_.size()) + i,
              space_base_dim_ + static_cast<int64_t>(lhs_space_dims_.size()) +
                  j},
-            {mesh_dim0, mesh_dim1}, device_mesh_);
+            mesh_dims, device_mesh_);
         HloSharding lhs_spec = HloSharding::Replicate();
         HloSharding rhs_spec =
             Tile(rhs_->shape(), {rhs_space_dims_[i], rhs_space_dims_[j]},
-                 {mesh_dim0, mesh_dim1}, device_mesh_);
+                 mesh_dims, device_mesh_);
 
         AppendNewStrategy(ins_, name, output_spec, {lhs_spec, rhs_spec}, 0, 0,
                           cluster_env_, strategy_map_, strategies_);
@@ -195,28 +202,30 @@ class DotHandler {
     }
   }
 
-  void SplitLhsSpaceBothContract(int mesh_dim0, int mesh_dim1) {
-    if (device_mesh_.dim(mesh_dim0) > 1 && device_mesh_.dim(mesh_dim1) > 1) {
+  void SplitLhsSpaceBothContract(MeshDims mesh_dims) {
+    DCHECK_EQ(mesh_dims.size(), 2);
+    if (device_mesh_.dim(mesh_dims[0]) > 1 &&
+        device_mesh_.dim(mesh_dims[1]) > 1) {
       std::string name =
-          absl::StrFormat("SR = SS x SR @ {%d,%d} (allreduce @ %d)", mesh_dim0,
-                          mesh_dim1, mesh_dim1);
+          absl::StrFormat("SR = SS x SR @ {%s} (allreduce @ %d)",
+                          absl::StrJoin(mesh_dims, ","), mesh_dims[1]);
       for (int64_t i = 0; i < lhs_space_dims_.size(); ++i) {
         for (int64_t j = 0; j < lhs_con_dims_.size(); ++j) {
-          if (!CheckDims(lhs_, lhs_space_dims_, {{i, mesh_dim0}}) ||
-              !CheckDims(lhs_, lhs_con_dims_, {{j, mesh_dim1}}))
+          if (!CheckDims(lhs_, lhs_space_dims_, {{i, mesh_dims[0]}}) ||
+              !CheckDims(lhs_, lhs_con_dims_, {{j, mesh_dims[1]}}))
             continue;
 
           HloSharding output_spec = Tile(ins_->shape(), {space_base_dim_ + i},
-                                         {mesh_dim0}, device_mesh_);
+                                         {mesh_dims[0]}, device_mesh_);
           HloSharding lhs_spec =
               Tile(lhs_->shape(), {lhs_space_dims_[i], lhs_con_dims_[j]},
-                   {mesh_dim0, mesh_dim1}, device_mesh_);
+                   mesh_dims, device_mesh_);
           HloSharding rhs_spec = Tile(rhs_->shape(), {rhs_con_dims_[j]},
-                                      {mesh_dim1}, device_mesh_);
+                                      {mesh_dims[1]}, device_mesh_);
 
           double memory_cost = GetBytes(ins_->shape()) / output_spec.NumTiles();
           double communication_cost =
-              cluster_env_.AllReduceCost(memory_cost, mesh_dim1);
+              cluster_env_.AllReduceCost(memory_cost, mesh_dims[1]);
           AppendNewStrategy(ins_, name, output_spec, {lhs_spec, rhs_spec}, 0,
                             communication_cost, cluster_env_, strategy_map_,
                             strategies_);
@@ -225,29 +234,30 @@ class DotHandler {
     }
   }
 
-  void SplitRhsSpaceBothContract(int mesh_dim0, int mesh_dim1) {
-    if (device_mesh_.dim(mesh_dim0) > 1) {
+  void SplitRhsSpaceBothContract(MeshDims mesh_dims) {
+    DCHECK_EQ(mesh_dims.size(), 2);
+    if (device_mesh_.dim(mesh_dims[0]) > 1) {
       std::string name =
-          absl::StrFormat("RS = RS x SS @ {%d,%d} (allreduce @ %d)", mesh_dim0,
-                          mesh_dim1, mesh_dim0);
+          absl::StrFormat("RS = RS x SS @ {%s} (allreduce @ %d)",
+                          absl::StrJoin(mesh_dims, ","), mesh_dims[0]);
       for (int64_t i = 0; i < rhs_space_dims_.size(); ++i) {
         for (int64_t j = 0; j < lhs_con_dims_.size(); ++j) {
-          if (!CheckDims(rhs_, rhs_space_dims_, {{i, mesh_dim1}}) ||
-              !CheckDims(lhs_, lhs_con_dims_, {{j, mesh_dim0}}))
+          if (!CheckDims(rhs_, rhs_space_dims_, {{i, mesh_dims[1]}}) ||
+              !CheckDims(lhs_, lhs_con_dims_, {{j, mesh_dims[0]}}))
             continue;
           HloSharding output_spec =
               Tile(ins_->shape(),
                    {space_base_dim_ +
                     static_cast<int64_t>(lhs_space_dims_.size()) + i},
-                   {mesh_dim1}, device_mesh_);
+                   {mesh_dims[1]}, device_mesh_);
           HloSharding lhs_spec = Tile(lhs_->shape(), {lhs_con_dims_[j]},
-                                      {mesh_dim0}, device_mesh_);
+                                      {mesh_dims[0]}, device_mesh_);
           HloSharding rhs_spec =
               Tile(rhs_->shape(), {rhs_con_dims_[j], rhs_space_dims_[i]},
-                   {mesh_dim0, mesh_dim1}, device_mesh_);
+                   mesh_dims, device_mesh_);
           double memory_cost = GetBytes(ins_->shape()) / output_spec.NumTiles();
           double communication_cost =
-              cluster_env_.AllReduceCost(memory_cost, mesh_dim0);
+              cluster_env_.AllReduceCost(memory_cost, mesh_dims[0]);
 
           AppendNewStrategy(ins_, name, output_spec, {lhs_spec, rhs_spec}, 0,
                             communication_cost, cluster_env_, strategy_map_,
@@ -277,44 +287,46 @@ class DotHandler {
     }
   }
 
-  void SplitTwoBatchDims(int mesh_dim0, int mesh_dim1) {
-    if (lhs_batch_dims_.size() == 2 && device_mesh_.dim(mesh_dim0) > 1 &&
-        device_mesh_.dim(mesh_dim1) > 1) {
-      if (!CheckDims(lhs_, lhs_batch_dims_, {{0, mesh_dim0}, {1, mesh_dim1}}))
+  void SplitTwoBatchDims(MeshDims mesh_dims) {
+    DCHECK_EQ(mesh_dims.size(), 2);
+    if (lhs_batch_dims_.size() == 2 && device_mesh_.dim(mesh_dims[0]) > 1 &&
+        device_mesh_.dim(mesh_dims[1]) > 1) {
+      if (!CheckDims(lhs_, lhs_batch_dims_,
+                     {{0, mesh_dims[0]}, {1, mesh_dims[1]}}))
         return;
       std::string name =
-          absl::StrFormat("Sb = Sb x Sb @ {%d,%d}", mesh_dim0, mesh_dim1);
+          absl::StrFormat("Sb = Sb x Sb @ {%s}", absl::StrJoin(mesh_dims, ","));
       HloSharding output_spec =
-          Tile(ins_->shape(), {0, 1}, {mesh_dim0, mesh_dim1}, device_mesh_);
+          Tile(ins_->shape(), {0, 1}, mesh_dims, device_mesh_);
       HloSharding lhs_spec =
           Tile(lhs_->shape(), {lhs_batch_dims_[0], lhs_batch_dims_[1]},
-               {mesh_dim0, mesh_dim1}, device_mesh_);
+               mesh_dims, device_mesh_);
       HloSharding rhs_spec =
           Tile(rhs_->shape(), {rhs_batch_dims_[0], rhs_batch_dims_[1]},
-               {mesh_dim0, mesh_dim1}, device_mesh_);
+               mesh_dims, device_mesh_);
       AppendNewStrategy(ins_, name, output_spec, {lhs_spec, rhs_spec}, 0, 0,
                         cluster_env_, strategy_map_, strategies_);
     }
   }
 
-  void SplitBatchDimLhsSpace(int mesh_dim0, int mesh_dim1) {
-    if (!lhs_batch_dims_.empty() && device_mesh_.dim(mesh_dim0) > 1 &&
-        device_mesh_.dim(mesh_dim1) > 1) {
-      std::string name =
-          absl::StrFormat("SbSi = SbSi x SbR @ {%d,%d}", mesh_dim0, mesh_dim1);
+  void SplitBatchDimLhsSpace(MeshDims mesh_dims) {
+    DCHECK_EQ(mesh_dims.size(), 2);
+    if (!lhs_batch_dims_.empty() && device_mesh_.dim(mesh_dims[0]) > 1 &&
+        device_mesh_.dim(mesh_dims[1]) > 1) {
+      std::string name = absl::StrFormat("SbSi = SbSi x SbR @ {%s}",
+                                         absl::StrJoin(mesh_dims, ","));
       for (int64_t i = 0; i < lhs_space_dims_.size(); ++i) {
         for (int64_t j = 0; j < lhs_batch_dims_.size(); ++j) {
-          if (!CheckDims(lhs_, lhs_space_dims_, {{i, mesh_dim0}}) ||
-              !CheckDims(lhs_, lhs_batch_dims_, {{j, mesh_dim1}}))
+          if (!CheckDims(lhs_, lhs_space_dims_, {{i, mesh_dims[0]}}) ||
+              !CheckDims(lhs_, lhs_batch_dims_, {{j, mesh_dims[1]}}))
             continue;
-          HloSharding output_spec =
-              Tile(ins_->shape(), {j, space_base_dim_ + i},
-                   {mesh_dim0, mesh_dim1}, device_mesh_);
+          HloSharding output_spec = Tile(
+              ins_->shape(), {j, space_base_dim_ + i}, mesh_dims, device_mesh_);
           HloSharding lhs_spec =
               Tile(lhs_->shape(), {lhs_batch_dims_[j], lhs_space_dims_[i]},
-                   {mesh_dim0, mesh_dim1}, device_mesh_);
+                   mesh_dims, device_mesh_);
           HloSharding rhs_spec = Tile(rhs_->shape(), {rhs_batch_dims_[j]},
-                                      {mesh_dim0}, device_mesh_);
+                                      {mesh_dims[0]}, device_mesh_);
 
           AppendNewStrategy(ins_, name, output_spec, {lhs_spec, rhs_spec}, 0, 0,
                             cluster_env_, strategy_map_, strategies_);
@@ -323,26 +335,27 @@ class DotHandler {
     }
   }
 
-  void SplitBatchDimRhsSpace(int mesh_dim0, int mesh_dim1) {
-    if (!lhs_batch_dims_.empty() && device_mesh_.dim(mesh_dim0) > 1 &&
-        device_mesh_.dim(mesh_dim1) > 1) {
-      std::string name =
-          absl::StrFormat("SbSj = SbR x SbSj @ {%d,%d}", mesh_dim0, mesh_dim1);
+  void SplitBatchDimRhsSpace(MeshDims mesh_dims) {
+    DCHECK_EQ(mesh_dims.size(), 2);
+    if (!lhs_batch_dims_.empty() && device_mesh_.dim(mesh_dims[0]) > 1 &&
+        device_mesh_.dim(mesh_dims[1]) > 1) {
+      std::string name = absl::StrFormat("SbSj = SbR x SbSj @ {%s}",
+                                         absl::StrJoin(mesh_dims, ","));
       for (int64_t i = 0; i < rhs_space_dims_.size(); ++i) {
         for (int64_t j = 0; j < lhs_batch_dims_.size(); ++j) {
-          if (!CheckDims(rhs_, rhs_space_dims_, {{i, mesh_dim1}}) ||
-              !CheckDims(lhs_, lhs_batch_dims_, {{j, mesh_dim0}}))
+          if (!CheckDims(rhs_, rhs_space_dims_, {{i, mesh_dims[1]}}) ||
+              !CheckDims(lhs_, lhs_batch_dims_, {{j, mesh_dims[0]}}))
             continue;
           HloSharding output_spec =
               Tile(ins_->shape(),
                    {j, space_base_dim_ +
                            static_cast<int64_t>(lhs_space_dims_.size()) + i},
-                   {mesh_dim0, mesh_dim1}, device_mesh_);
+                   mesh_dims, device_mesh_);
           HloSharding lhs_spec = Tile(lhs_->shape(), {lhs_batch_dims_[j]},
-                                      {mesh_dim0}, device_mesh_);
+                                      {mesh_dims[0]}, device_mesh_);
           HloSharding rhs_spec =
               Tile(rhs_->shape(), {rhs_batch_dims_[j], rhs_space_dims_[i]},
-                   {mesh_dim0, mesh_dim1}, device_mesh_);
+                   mesh_dims, device_mesh_);
 
           AppendNewStrategy(ins_, name, output_spec, {lhs_spec, rhs_spec}, 0, 0,
                             cluster_env_, strategy_map_, strategies_);
@@ -351,28 +364,29 @@ class DotHandler {
     }
   }
 
-  void SplitBatchDimBothContract(int mesh_dim0, int mesh_dim1) {
-    if (!lhs_batch_dims_.empty() && device_mesh_.dim(mesh_dim0) > 1 &&
-        device_mesh_.dim(mesh_dim1) > 1) {
+  void SplitBatchDimBothContract(MeshDims mesh_dims) {
+    DCHECK_EQ(mesh_dims.size(), 2);
+    if (!lhs_batch_dims_.empty() && device_mesh_.dim(mesh_dims[0]) > 1 &&
+        device_mesh_.dim(mesh_dims[1]) > 1) {
       std::string name =
-          absl::StrFormat("SbR = SbSk x SbSk @ {%d,%d} (allreduce @ %d}",
-                          mesh_dim0, mesh_dim1, mesh_dim1);
+          absl::StrFormat("SbR = SbSk x SbSk @ {%s} (allreduce @ %d}",
+                          absl::StrJoin(mesh_dims, ","), mesh_dims[1]);
       for (int64_t i = 0; i < lhs_con_dims_.size(); ++i) {
         for (int64_t j = 0; j < lhs_batch_dims_.size(); ++j) {
-          if (!CheckDims(lhs_, lhs_con_dims_, {{i, mesh_dim1}}) ||
-              !CheckDims(lhs_, lhs_batch_dims_, {{j, mesh_dim0}}))
+          if (!CheckDims(lhs_, lhs_con_dims_, {{i, mesh_dims[1]}}) ||
+              !CheckDims(lhs_, lhs_batch_dims_, {{j, mesh_dims[0]}}))
             continue;
           HloSharding output_spec =
-              Tile(ins_->shape(), {j}, {mesh_dim0}, device_mesh_);
+              Tile(ins_->shape(), {j}, {mesh_dims[0]}, device_mesh_);
           HloSharding lhs_spec =
               Tile(lhs_->shape(), {lhs_batch_dims_[j], lhs_con_dims_[i]},
-                   {mesh_dim0, mesh_dim1}, device_mesh_);
+                   mesh_dims, device_mesh_);
           HloSharding rhs_spec =
               Tile(rhs_->shape(), {rhs_batch_dims_[j], rhs_con_dims_[i]},
-                   {mesh_dim0, mesh_dim1}, device_mesh_);
+                   mesh_dims, device_mesh_);
           double memory_cost = GetBytes(ins_->shape()) / output_spec.NumTiles();
           double communication_cost =
-              cluster_env_.AllReduceCost(memory_cost, mesh_dim1);
+              cluster_env_.AllReduceCost(memory_cost, mesh_dims[1]);
 
           AppendNewStrategy(ins_, name, output_spec, {lhs_spec, rhs_spec}, 0,
                             communication_cost, cluster_env_, strategy_map_,
@@ -382,29 +396,32 @@ class DotHandler {
     }
   }
 
-  void SplitBothContractTwoDims(int mesh_dim0, int mesh_dim1) {
+  void SplitBothContractTwoDims(MeshDims mesh_dims) {
+    DCHECK_EQ(mesh_dims.size(), 2);
     // Applies when there are more than one contracting dimension.
     if (lhs_con_dims_.size() >= 2 && rhs_con_dims_.size() >= 2 &&
-        device_mesh_.dim(mesh_dim0) > 1 && device_mesh_.dim(mesh_dim1) > 1) {
-      std::string name =
-          absl::StrFormat("RR = SS x SS @ {%d,%d} (allreduce @ {%d, %d}}",
-                          mesh_dim0, mesh_dim1, mesh_dim0, mesh_dim1);
+        device_mesh_.dim(mesh_dims[0]) > 1 &&
+        device_mesh_.dim(mesh_dims[1]) > 1) {
+      std::string name = absl::StrFormat(
+          "RR = SS x SS @ {%s} (allreduce @ {%s}}",
+          absl::StrJoin(mesh_dims, ","), absl::StrJoin(mesh_dims, ", "));
       for (int64_t i = 0; i < lhs_con_dims_.size(); ++i) {
         for (int64_t j = i + 1; j < lhs_con_dims_.size(); ++j) {
           if (!CheckDims(lhs_, lhs_con_dims_,
-                         {{i, mesh_dim0}, {j, mesh_dim1}}) ||
-              !CheckDims(rhs_, rhs_con_dims_, {{i, mesh_dim0}, {j, mesh_dim1}}))
+                         {{i, mesh_dims[0]}, {j, mesh_dims[1]}}) ||
+              !CheckDims(rhs_, rhs_con_dims_,
+                         {{i, mesh_dims[0]}, {j, mesh_dims[1]}}))
             continue;
           HloSharding output_spec = HloSharding::Replicate();
           HloSharding lhs_spec =
               Tile(lhs_->shape(), {lhs_con_dims_[i], lhs_con_dims_[j]},
-                   {mesh_dim0, mesh_dim1}, device_mesh_);
+                   mesh_dims, device_mesh_);
           HloSharding rhs_spec =
               Tile(rhs_->shape(), {rhs_con_dims_[i], rhs_con_dims_[j]},
-                   {mesh_dim0, mesh_dim1}, device_mesh_);
+                   mesh_dims, device_mesh_);
           double memory_cost = GetBytes(ins_->shape()) / output_spec.NumTiles();
-          double communication_cost =
-              cluster_env_.AllReduceCost(memory_cost, mesh_dim0, mesh_dim1);
+          double communication_cost = cluster_env_.AllReduceCost(
+              memory_cost, mesh_dims[0], mesh_dims[1]);
           AppendNewStrategy(ins_, name, output_spec, {lhs_spec, rhs_spec}, 0,
                             communication_cost, cluster_env_, strategy_map_,
                             strategies_);
@@ -413,22 +430,24 @@ class DotHandler {
     }
   }
 
-  void RecomputeSplitBothContract(int mesh_dim0, int mesh_dim1) {
-    if (device_mesh_.dim(mesh_dim0) > 1 && device_mesh_.dim(mesh_dim1) > 1) {
+  void RecomputeSplitBothContract(MeshDims mesh_dims) {
+    DCHECK_EQ(mesh_dims.size(), 2);
+    if (device_mesh_.dim(mesh_dims[0]) > 1 &&
+        device_mesh_.dim(mesh_dims[1]) > 1) {
       std::string name = absl::StrFormat("RR = RS x SR @ {%d} (allreduce @ %d)",
-                                         mesh_dim0, mesh_dim0);
+                                         mesh_dims[0], mesh_dims[0]);
       for (int64_t i = 0; i < lhs_con_dims_.size(); ++i) {
-        if (!CheckDims(lhs_, lhs_con_dims_, {{i, mesh_dim0}})) continue;
+        if (!CheckDims(lhs_, lhs_con_dims_, {{i, mesh_dims[0]}})) continue;
         HloSharding output_spec = HloSharding::Replicate();
-        HloSharding lhs_spec =
-            Tile(lhs_->shape(), {lhs_con_dims_[i]}, {mesh_dim0}, device_mesh_);
-        HloSharding rhs_spec =
-            Tile(rhs_->shape(), {rhs_con_dims_[i]}, {mesh_dim0}, device_mesh_);
+        HloSharding lhs_spec = Tile(lhs_->shape(), {lhs_con_dims_[i]},
+                                    {mesh_dims[0]}, device_mesh_);
+        HloSharding rhs_spec = Tile(rhs_->shape(), {rhs_con_dims_[i]},
+                                    {mesh_dims[0]}, device_mesh_);
         double memory_cost = GetBytes(ins_->shape()) / output_spec.NumTiles();
         double compute_cost =
             cluster_env_.DotCost(lhs_->shape(), rhs_->shape(), dot_dnums_);
         double communication_cost =
-            cluster_env_.AllReduceCost(memory_cost, mesh_dim0);
+            cluster_env_.AllReduceCost(memory_cost, mesh_dims[0]);
 
         AppendNewStrategy(ins_, name, output_spec, {lhs_spec, rhs_spec},
                           compute_cost, communication_cost, cluster_env_,
@@ -515,43 +534,43 @@ class DotHandler {
     }
   }
 
-  void Split(std::function<void(int, int)> split_func) const {
+  void Split(std::function<void(MeshDims)> split_func) const {
     SplitWithMeshShape(split_func, device_mesh_.dimensions());
   }
 
   Status RegisterStrategies() {
     // SS = SR x RS
     // Split lhs space dim and rhs space dim.
-    Split([this](int i, int j) { this->SplitLhsSpaceRhsSpace(i, j); });
+    Split([this](MeshDims md) { this->SplitLhsSpaceRhsSpace(md); });
 
     // SSR = SSR x RR
     // Split lhs space dims only if it has more than 1 space dims.
     if (lhs_space_dims_.size() > 1) {
-      Split([this](int i, int j) { this->SplitLhsSpaceOnly(i, j); });
+      Split([this](MeshDims md) { this->SplitLhsSpaceOnly(md); });
     }
     // RSS = RR x RSS
     // Split rhs space dims only if it has more than 1 space dims.
     if (rhs_space_dims_.size() > 1) {
-      Split([this](int i, int j) { this->SplitRhsSpaceOnly(i, j); });
+      Split([this](MeshDims md) { this->SplitRhsSpaceOnly(md); });
     }
 
     // SR = SS x SR
     // Split lhs space dim and both contracting dims.
-    Split([this](int i, int j) { this->SplitLhsSpaceBothContract(i, j); });
+    Split([this](MeshDims md) { this->SplitLhsSpaceBothContract(md); });
 
     // RS = RS x SS
     // Split rhs space dim and both contracting dims.
-    Split([this](int i, int j) { this->SplitRhsSpaceBothContract(i, j); });
+    Split([this](MeshDims md) { this->SplitRhsSpaceBothContract(md); });
 
     // RR = SS x SS
     // Split two contracting dims on lhs and rhs.
-    Split([this](int i, int j) { this->SplitBothContractTwoDims(i, j); });
+    Split([this](MeshDims md) { this->SplitBothContractTwoDims(md); });
 
     // RR = RS x SR
     // This is a special case where we allow spliting only one dim in the
     // multi-dimensional mesh case. This allows some recomputation
     // (e.g., the dense layer in the LM_head of BERT).
-    Split([this](int i, int j) { this->RecomputeSplitBothContract(i, j); });
+    Split([this](MeshDims md) { this->RecomputeSplitBothContract(md); });
 
     // Add 1d data parallel in multi-dimensional mesh
     if (solver_option_.allow_mixed_mesh_shape) {
@@ -572,15 +591,15 @@ class DotHandler {
 
     // SbSi = SbSi x SbR
     // Split batch dim and lhs space dim
-    Split([this](int i, int j) { this->SplitBatchDimLhsSpace(i, j); });
+    Split([this](MeshDims md) { this->SplitBatchDimLhsSpace(md); });
 
     // SbSj = SbR x SbSj
     // Split batch dim and rhs space dim
-    Split([this](int i, int j) { this->SplitBatchDimRhsSpace(i, j); });
+    Split([this](MeshDims md) { this->SplitBatchDimRhsSpace(md); });
 
     // SbSj = SbR x SbSj
     // Split batch dim and contracting dim
-    Split([this](int i, int j) { this->SplitBatchDimBothContract(i, j); });
+    Split([this](MeshDims md) { this->SplitBatchDimBothContract(md); });
 
     if (solver_option_.batch_matmul_always_split_batch &&
         lhs_batch_dims_.size() == 2 &&
@@ -593,7 +612,7 @@ class DotHandler {
 
     // Sb = Sb x Sb
     // Split batch dims.
-    Split([this](int i, int j) { this->SplitTwoBatchDims(i, j); });
+    Split([this](MeshDims md) { this->SplitTwoBatchDims(md); });
 
     if (solver_option_.allow_mixed_mesh_shape) {
       Add1DBatchSplit();
@@ -675,37 +694,40 @@ class ConvHandler {
     out_out_channel_dim_ = conv_dnums_.output_feature_dimension();
   }
 
-  void SplitLhsBatchRhsOutchannel(int mesh_dim0, int mesh_dim1) {
+  void SplitLhsBatchRhsOutchannel(MeshDims mesh_dims) {
+    DCHECK_EQ(mesh_dims.size(), 2);
     std::string name =
-        absl::StrFormat("SS = SR x RS @ {%d,%d}", mesh_dim0, mesh_dim1);
+        absl::StrFormat("SS = SR x RS @ {%s}", absl::StrJoin(mesh_dims, ","));
     HloSharding output_spec =
-        Tile(ins_->shape(), {out_batch_dim_, out_out_channel_dim_},
-             {mesh_dim0, mesh_dim1}, device_mesh_);
+        Tile(ins_->shape(), {out_batch_dim_, out_out_channel_dim_}, mesh_dims,
+             device_mesh_);
     HloSharding lhs_spec =
-        Tile(lhs_->shape(), {lhs_batch_dim_}, {mesh_dim0}, device_mesh_);
-    HloSharding rhs_spec =
-        Tile(rhs_->shape(), {rhs_out_channel_dim_}, {mesh_dim1}, device_mesh_);
+        Tile(lhs_->shape(), {lhs_batch_dim_}, {mesh_dims[0]}, device_mesh_);
+    HloSharding rhs_spec = Tile(rhs_->shape(), {rhs_out_channel_dim_},
+                                {mesh_dims[1]}, device_mesh_);
 
     AppendNewStrategy(ins_, name, output_spec, {lhs_spec, rhs_spec}, 0, 0,
                       cluster_env_, strategy_map_, strategies_);
   }
 
-  void SplitLhsBatchBothInchannel(int mesh_dim0, int mesh_dim1) {
-    if (device_mesh_.dim(mesh_dim0) > 1 && device_mesh_.dim(mesh_dim1) > 1) {
+  void SplitLhsBatchBothInchannel(MeshDims mesh_dims) {
+    DCHECK_EQ(mesh_dims.size(), 2);
+    if (device_mesh_.dim(mesh_dims[0]) > 1 &&
+        device_mesh_.dim(mesh_dims[1]) > 1) {
       std::string name =
-          absl::StrFormat("SR = SS x SR @ {%d,%d} (allreduce @ %d)", mesh_dim0,
-                          mesh_dim1, mesh_dim1);
+          absl::StrFormat("SR = SS x SR @ {%s} (allreduce @ %d)",
+                          absl::StrJoin(mesh_dims, ","), mesh_dims[1]);
       HloSharding output_spec =
-          Tile(ins_->shape(), {out_batch_dim_}, {mesh_dim0}, device_mesh_);
+          Tile(ins_->shape(), {out_batch_dim_}, {mesh_dims[0]}, device_mesh_);
       HloSharding lhs_spec =
-          Tile(lhs_->shape(), {lhs_batch_dim_, lhs_in_channel_dim_},
-               {mesh_dim0, mesh_dim1}, device_mesh_);
-      HloSharding rhs_spec =
-          Tile(rhs_->shape(), {rhs_in_channel_dim_}, {mesh_dim1}, device_mesh_);
+          Tile(lhs_->shape(), {lhs_batch_dim_, lhs_in_channel_dim_}, mesh_dims,
+               device_mesh_);
+      HloSharding rhs_spec = Tile(rhs_->shape(), {rhs_in_channel_dim_},
+                                  {mesh_dims[1]}, device_mesh_);
 
       double memory_cost = GetBytes(ins_->shape()) / output_spec.NumTiles();
       double communication_cost =
-          cluster_env_.AllReduceCost(memory_cost, mesh_dim1);
+          cluster_env_.AllReduceCost(memory_cost, mesh_dims[1]);
 
       AppendNewStrategy(ins_, name, output_spec, {lhs_spec, rhs_spec}, 0,
                         communication_cost, cluster_env_, strategy_map_,
@@ -713,22 +735,23 @@ class ConvHandler {
     }
   }
 
-  void SplitRhsOutchannelBothInchannel(int mesh_dim0, int mesh_dim1) {
-    if (device_mesh_.dim(mesh_dim0) > 1) {
+  void SplitRhsOutchannelBothInchannel(MeshDims mesh_dims) {
+    DCHECK_EQ(mesh_dims.size(), 2);
+    if (device_mesh_.dim(mesh_dims[0]) > 1) {
       std::string name =
-          absl::StrFormat("RS = RS x SS @ {%d,%d} (allreduce @ %d)", mesh_dim0,
-                          mesh_dim1, mesh_dim0);
+          absl::StrFormat("RS = RS x SS @ {%s} (allreduce @ %d)",
+                          absl::StrJoin(mesh_dims, ","), mesh_dims[0]);
       HloSharding output_spec = Tile(ins_->shape(), {out_out_channel_dim_},
-                                     {mesh_dim1}, device_mesh_);
-      HloSharding lhs_spec =
-          Tile(lhs_->shape(), {lhs_in_channel_dim_}, {mesh_dim0}, device_mesh_);
+                                     {mesh_dims[1]}, device_mesh_);
+      HloSharding lhs_spec = Tile(lhs_->shape(), {lhs_in_channel_dim_},
+                                  {mesh_dims[0]}, device_mesh_);
       HloSharding rhs_spec =
           Tile(rhs_->shape(), {rhs_in_channel_dim_, rhs_out_channel_dim_},
-               {mesh_dim0, mesh_dim1}, device_mesh_);
+               mesh_dims, device_mesh_);
 
       double memory_cost = GetBytes(ins_->shape()) / output_spec.NumTiles();
       double communication_cost =
-          cluster_env_.AllReduceCost(memory_cost, mesh_dim0);
+          cluster_env_.AllReduceCost(memory_cost, mesh_dims[0]);
 
       AppendNewStrategy(ins_, name, output_spec, {lhs_spec, rhs_spec}, 0,
                         communication_cost, cluster_env_, strategy_map_,
@@ -777,26 +800,27 @@ class ConvHandler {
     }
   }
 
-  void SplitDepthwise(int mesh_dim0, int mesh_dim1, bool forward) {
+  void SplitDepthwise(MeshDims mesh_dims, bool forward) {
+    DCHECK_EQ(mesh_dims.size(), 2);
     std::string name =
-        absl::StrFormat("SS = SS x RS @ {%d,%d}", mesh_dim0, mesh_dim1);
+        absl::StrFormat("SS = SS x RS @ {%s}", absl::StrJoin(mesh_dims, ","));
     HloSharding output_spec =
-        Tile(ins_->shape(), {out_batch_dim_, out_out_channel_dim_},
-             {mesh_dim0, mesh_dim1}, device_mesh_);
+        Tile(ins_->shape(), {out_batch_dim_, out_out_channel_dim_}, mesh_dims,
+             device_mesh_);
     HloSharding lhs_spec =
         forward ? Tile(lhs_->shape(), {lhs_batch_dim_, lhs_in_channel_dim_},
-                       {mesh_dim0, mesh_dim1}, device_mesh_)
+                       mesh_dims, device_mesh_)
                 : Tile(lhs_->shape(), {lhs_batch_dim_, lhs_in_channel_dim_},
-                       {mesh_dim1, mesh_dim0}, device_mesh_);
+                       {mesh_dims[1], mesh_dims[0]}, device_mesh_);
 
-    HloSharding rhs_spec =
-        Tile(rhs_->shape(), {rhs_out_channel_dim_}, {mesh_dim1}, device_mesh_);
+    HloSharding rhs_spec = Tile(rhs_->shape(), {rhs_out_channel_dim_},
+                                {mesh_dims[1]}, device_mesh_);
 
     AppendNewStrategy(ins_, name, output_spec, {lhs_spec, rhs_spec}, 0, 0,
                       cluster_env_, strategy_map_, strategies_);
   }
 
-  void Split(std::function<void(int, int)> split_func) const {
+  void Split(std::function<void(MeshDims)> split_func) const {
     SplitWithMeshShape(split_func, device_mesh_.dimensions());
   }
 
@@ -809,7 +833,7 @@ class ConvHandler {
       // for depthwise conv
       // SS = SS x S
       // Split batch dim and channel dim
-      Split([this](int i, int j) { this->SplitDepthwise(i, j, true); });
+      Split([this](MeshDims md) { this->SplitDepthwise(md, true); });
     } else if ((ins_->batch_group_count() ==
                     lhs_->shape().dimensions(lhs_batch_dim_) &&
                 ins_->batch_group_count() ==
@@ -817,21 +841,20 @@ class ConvHandler {
       // for depthwise conv filter_backward
       // SS = SS x S
       // Split batch dim and channel dim
-      Split([this](int i, int j) { this->SplitDepthwise(i, j, false); });
+      Split([this](MeshDims md) { this->SplitDepthwise(md, false); });
     }
 
     // SS = SR x RS
     // Split lhs batch dim and rhs out_channel dim.
-    Split([this](int i, int j) { this->SplitLhsBatchRhsOutchannel(i, j); });
+    Split([this](MeshDims md) { this->SplitLhsBatchRhsOutchannel(md); });
 
     // SR = SS x SR
     // Split lhs batch dim and both in_channel dims.
-    Split([this](int i, int j) { this->SplitLhsBatchBothInchannel(i, j); });
+    Split([this](MeshDims md) { this->SplitLhsBatchBothInchannel(md); });
 
     // RS = RS x SS
     // Split rhs out_channel dim and both in_channel dims.
-    Split(
-        [this](int i, int j) { this->SplitRhsOutchannelBothInchannel(i, j); });
+    Split([this](MeshDims md) { this->SplitRhsOutchannelBothInchannel(md); });
 
     // Add 1d data parallel in multi-dimensional mesh
     if (solver_option_.allow_mixed_mesh_shape) {

From 65fd4e5b5d4c127ea46d90aaf38ebe32e57e80a8 Mon Sep 17 00:00:00 2001
From: Eugene Zhulenev <ezhulenev@google.com>
Date: Mon, 2 Oct 2023 08:20:37 -0700
Subject: [PATCH 463/567] [stream_executor] NFC: Forward all users of dnn_proto
 to tsl/protobuf library

Remove indirection via stream_executor:dnn_proto target.

PiperOrigin-RevId: 570071946
---
 tensorflow/core/protobuf/BUILD                      |  2 +-
 tensorflow/core/protobuf/conv_autotuning.proto      |  2 +-
 tensorflow/core/util/autotune_maps/BUILD            |  6 +++---
 .../core/util/autotune_maps/autotune_map.proto      |  2 +-
 .../core/util/autotune_maps/autotune_serialize.cc   |  2 +-
 .../core/util/autotune_maps/conv_parameters.proto   |  2 +-
 third_party/xla/xla/service/BUILD                   |  2 +-
 third_party/xla/xla/service/gpu/BUILD               |  6 +++---
 .../xla/xla/service/gpu/backend_configs.proto       |  2 +-
 third_party/xla/xla/stream_executor/BUILD           | 13 -------------
 third_party/xla/xla/stream_executor/dnn.h           |  2 +-
 third_party/xla/xla/stream_executor/dnn.proto       |  8 --------
 third_party/xla/xla/xla.bzl                         |  4 ----
 13 files changed, 14 insertions(+), 39 deletions(-)
 delete mode 100644 third_party/xla/xla/stream_executor/dnn.proto

diff --git a/tensorflow/core/protobuf/BUILD b/tensorflow/core/protobuf/BUILD
index 752fd7c522fc9a..bbad66e909a746 100644
--- a/tensorflow/core/protobuf/BUILD
+++ b/tensorflow/core/protobuf/BUILD
@@ -72,7 +72,7 @@ tf_proto_library(
     cc_api_version = 2,
     make_default_target_header_only = True,
     protodeps = [
-        "@local_xla//xla/stream_executor:dnn_proto",
+        "@local_tsl//tsl/protobuf:dnn_proto",
     ],
 )
 
diff --git a/tensorflow/core/protobuf/conv_autotuning.proto b/tensorflow/core/protobuf/conv_autotuning.proto
index 8a7b3a604c0997..21f1c2adbf5613 100644
--- a/tensorflow/core/protobuf/conv_autotuning.proto
+++ b/tensorflow/core/protobuf/conv_autotuning.proto
@@ -4,7 +4,7 @@ syntax = "proto3";
 
 package tensorflow;
 
-import "xla/stream_executor/dnn.proto";
+import "tsl/protobuf/dnn.proto";
 
 option go_package = "github.com/tensorflow/tensorflow/tensorflow/go/core/protobuf/for_core_protos_go_proto";
 
diff --git a/tensorflow/core/util/autotune_maps/BUILD b/tensorflow/core/util/autotune_maps/BUILD
index 4be1a30bf6d410..ade63bd0c26fee 100644
--- a/tensorflow/core/util/autotune_maps/BUILD
+++ b/tensorflow/core/util/autotune_maps/BUILD
@@ -45,7 +45,7 @@ tf_proto_library(
     cc_api_version = 2,
     protodeps = [
         "//tensorflow/core/framework:types_proto",
-        "@local_xla//xla/stream_executor:dnn_proto",
+        "@local_tsl//tsl/protobuf:dnn_proto",
     ],
 )
 
@@ -103,7 +103,7 @@ tf_proto_library(
     cc_api_version = 2,
     protodeps = [
         "//tensorflow/core/util/autotune_maps:conv_parameters_proto",
-        "@local_xla//xla/stream_executor:dnn_proto",
+        "@local_tsl//tsl/protobuf:dnn_proto",
     ],
     visibility = ["//waymo/ml/deploy/system/autotuning:__subpackages__"],
 )
@@ -135,9 +135,9 @@ tf_cuda_library(
         "//tensorflow/core:framework",
         "//tensorflow/core/platform:status",
         "//tensorflow/core/platform:stream_executor",
+        "@local_tsl//tsl/protobuf:dnn_proto_cc",
         "@local_xla//xla:status_macros",
         "@local_xla//xla/stream_executor",
-        "@local_xla//xla/stream_executor:dnn_proto_cc",
         "@local_xla//xla/stream_executor:lazy_op_runner",
         "@local_xla//xla/stream_executor/gpu:gpu_init",
     ],
diff --git a/tensorflow/core/util/autotune_maps/autotune_map.proto b/tensorflow/core/util/autotune_maps/autotune_map.proto
index bc425648e4bbf2..c655b3c1a5927d 100644
--- a/tensorflow/core/util/autotune_maps/autotune_map.proto
+++ b/tensorflow/core/util/autotune_maps/autotune_map.proto
@@ -21,8 +21,8 @@ syntax = "proto3";
 
 package tensorflow;
 
-import "xla/stream_executor/dnn.proto";
 import "tensorflow/core/util/autotune_maps/conv_parameters.proto";
+import "tsl/protobuf/dnn.proto";
 
 message ConvMapProto {
   message Entry {
diff --git a/tensorflow/core/util/autotune_maps/autotune_serialize.cc b/tensorflow/core/util/autotune_maps/autotune_serialize.cc
index 13fec6bf16d7da..f943aecdfeedd4 100644
--- a/tensorflow/core/util/autotune_maps/autotune_serialize.cc
+++ b/tensorflow/core/util/autotune_maps/autotune_serialize.cc
@@ -23,7 +23,6 @@ limitations under the License.
 
 #include "xla/status_macros.h"
 #include "xla/stream_executor/dnn.h"
-#include "xla/stream_executor/dnn.pb.h"
 #include "xla/stream_executor/gpu/gpu_init.h"
 #include "tensorflow/core/platform/str_util.h"
 #include "tensorflow/core/util/activation_mode.h"
@@ -32,6 +31,7 @@ limitations under the License.
 #include "tensorflow/core/util/autotune_maps/conv_parameters.h"
 #include "tensorflow/core/util/autotune_maps/conv_parameters.pb.h"
 #include "tsl/lib/strings/proto_serialization.h"
+#include "tsl/protobuf/dnn.pb.h"
 
 namespace tensorflow {
 
diff --git a/tensorflow/core/util/autotune_maps/conv_parameters.proto b/tensorflow/core/util/autotune_maps/conv_parameters.proto
index f1ec5e46fd7492..03a9cfd005d6f6 100644
--- a/tensorflow/core/util/autotune_maps/conv_parameters.proto
+++ b/tensorflow/core/util/autotune_maps/conv_parameters.proto
@@ -22,8 +22,8 @@ syntax = "proto3";
 
 package tensorflow;
 
-import "xla/stream_executor/dnn.proto";
 import "tensorflow/core/framework/types.proto";
+import "tsl/protobuf/dnn.proto";
 
 // LINT.IfChange
 
diff --git a/third_party/xla/xla/service/BUILD b/third_party/xla/xla/service/BUILD
index 856737f086c7b4..f66de01f79d546 100644
--- a/third_party/xla/xla/service/BUILD
+++ b/third_party/xla/xla/service/BUILD
@@ -1366,7 +1366,7 @@ cc_library(
     compatible_with = get_compatible_with_portable(),
     visibility = ["//visibility:public"],
     deps = [
-        "//xla/stream_executor:stream_executor_impl",
+        "//xla/stream_executor",
     ] + if_gpu_is_configured([
         ":service",
         "//xla/service/gpu:gpu_compiler",
diff --git a/third_party/xla/xla/service/gpu/BUILD b/third_party/xla/xla/service/gpu/BUILD
index 56f6c2d937bd70..9180f83333d636 100644
--- a/third_party/xla/xla/service/gpu/BUILD
+++ b/third_party/xla/xla/service/gpu/BUILD
@@ -59,8 +59,8 @@ tf_proto_library(
     make_default_target_header_only = True,
     protodeps = [
         "//xla:xla_data_proto",
-        "//xla/stream_executor:dnn_proto",
         "//xla:autotuning_proto",
+        "@local_tsl//tsl/protobuf:dnn_proto",
     ],
     visibility = ["//visibility:public"],
 )
@@ -3597,7 +3597,6 @@ cc_library(
         "//xla/service:hlo_pass",
         "//xla/service:pattern_matcher",
         "//xla/stream_executor",
-        "//xla/stream_executor:dnn_proto_cc",
         "@com_google_absl//absl/log",
         "@com_google_absl//absl/log:check",
         "@com_google_absl//absl/strings",
@@ -3605,6 +3604,7 @@ cc_library(
         "@com_google_absl//absl/types:span",
         "@local_tsl//tsl/platform:errors",
         "@local_tsl//tsl/platform:statusor",
+        "@local_tsl//tsl/protobuf:dnn_proto_cc",
     ],
 )
 
@@ -3627,7 +3627,6 @@ cc_library(
         "//xla/service:hlo_creation_utils",
         "//xla/service:hlo_pass",
         "//xla/service:pattern_matcher",
-        "//xla/stream_executor:dnn_proto_cc",
         "@com_google_absl//absl/container:flat_hash_set",
         "@com_google_absl//absl/log",
         "@com_google_absl//absl/log:check",
@@ -3635,6 +3634,7 @@ cc_library(
         "@com_google_absl//absl/types:span",
         "@local_tsl//tsl/platform:errors",
         "@local_tsl//tsl/platform:statusor",
+        "@local_tsl//tsl/protobuf:dnn_proto_cc",
     ],
 )
 
diff --git a/third_party/xla/xla/service/gpu/backend_configs.proto b/third_party/xla/xla/service/gpu/backend_configs.proto
index 9eae0f1b87fbb3..a113dc70227919 100644
--- a/third_party/xla/xla/service/gpu/backend_configs.proto
+++ b/third_party/xla/xla/service/gpu/backend_configs.proto
@@ -3,8 +3,8 @@ syntax = "proto3";
 package xla.gpu;
 
 import "xla/autotuning.proto";
-import "xla/stream_executor/dnn.proto";
 import "xla/xla_data.proto";
+import "tsl/protobuf/dnn.proto";
 
 // Backend configs for XLA:GPU.
 //
diff --git a/third_party/xla/xla/stream_executor/BUILD b/third_party/xla/xla/stream_executor/BUILD
index fd229b9c47d8ea..6fdf8d77521f94 100644
--- a/third_party/xla/xla/stream_executor/BUILD
+++ b/third_party/xla/xla/stream_executor/BUILD
@@ -109,7 +109,6 @@ filegroup(
 STREAM_EXECUTOR_DEPENDENCIES = [
     ":allocator_stats",
     ":device_description_proto_cc",
-    ":dnn_proto_cc",
     ":host_or_device_scalar",
     ":multi_platform_manager",
     "@com_google_absl//absl/algorithm:container",
@@ -502,15 +501,6 @@ cc_library(
     ],
 )
 
-tf_proto_library(
-    name = "dnn_proto",
-    srcs = ["dnn.proto"],
-    make_default_target_header_only = True,
-    protodeps = ["@local_tsl//tsl/protobuf:dnn_proto"],
-    visibility = ["//visibility:public"],
-    exports = ["@local_tsl//tsl/protobuf:dnn_proto"],
-)
-
 cc_library(
     name = "fft",
     hdrs = ["fft.h"],
@@ -681,7 +671,6 @@ cc_library(
     deps = [
         ":device_description_proto_cc",
         ":device_memory",
-        ":dnn_proto_cc",
         ":numeric_options",
         ":stream_executor_headers",
         "//xla/stream_executor/platform",
@@ -708,7 +697,6 @@ cc_library(
         ":device_description",
         ":device_memory",
         ":dnn",
-        ":dnn_proto_cc",
         ":event",
         ":executor_cache",
         ":kernel",
@@ -721,7 +709,6 @@ cc_library(
         ":stream_executor_pimpl",
         ":temporary_device_memory",
         ":temporary_memory_manager",
-        "//xla/stream_executor:dnn_proto_cc_impl",
         "@local_tsl//tsl/protobuf:dnn_proto_cc_impl",
     ],
 )
diff --git a/third_party/xla/xla/stream_executor/dnn.h b/third_party/xla/xla/stream_executor/dnn.h
index 52ccb08117604d..d56f3677f102aa 100644
--- a/third_party/xla/xla/stream_executor/dnn.h
+++ b/third_party/xla/xla/stream_executor/dnn.h
@@ -41,12 +41,12 @@ limitations under the License.
 #include "xla/stream_executor/device_description.h"
 #include "xla/stream_executor/device_description.pb.h"
 #include "xla/stream_executor/device_memory.h"
-#include "xla/stream_executor/dnn.pb.h"
 #include "xla/stream_executor/numeric_options.h"
 #include "xla/stream_executor/platform/port.h"
 #include "tsl/platform/logging.h"
 #include "tsl/platform/status.h"
 #include "tsl/platform/statusor.h"
+#include "tsl/protobuf/dnn.pb.h"
 
 namespace Eigen {
 struct half;
diff --git a/third_party/xla/xla/stream_executor/dnn.proto b/third_party/xla/xla/stream_executor/dnn.proto
deleted file mode 100644
index 6e27451346e893..00000000000000
--- a/third_party/xla/xla/stream_executor/dnn.proto
+++ /dev/null
@@ -1,8 +0,0 @@
-// LINT: LEGACY_NAMES
-syntax = "proto3";
-
-package stream_executor.dnn.dummy;
-
-import public "tsl/protobuf/dnn.proto";
-
-option go_package = "github.com/tensorflow/tensorflow/tensorflow/go/stream_executor";
diff --git a/third_party/xla/xla/xla.bzl b/third_party/xla/xla/xla.bzl
index deac05f9e5077b..f1dfca8ebf28af 100644
--- a/third_party/xla/xla/xla.bzl
+++ b/third_party/xla/xla/xla.bzl
@@ -63,7 +63,6 @@ def xla_cc_binary(deps = None, copts = tsl_copts(), **kwargs):
         "//xla/service/gpu:backend_configs_cc_impl",
         "//xla/service/gpu:hlo_op_profile_proto_cc_impl",
         "//xla/stream_executor:device_description_proto_cc_impl",
-        "//xla/stream_executor:dnn_proto_cc_impl",
         "//xla/stream_executor:stream_executor_impl",
         "@local_tsl//tsl/platform:env_impl",
         "@local_tsl//tsl/platform:tensor_float_32_utils",
@@ -72,7 +71,6 @@ def xla_cc_binary(deps = None, copts = tsl_copts(), **kwargs):
         "@local_tsl//tsl/profiler/backends/cpu:traceme_recorder_impl",
         "//xla:autotuning_proto_cc_impl",
         "@local_tsl//tsl/protobuf:protos_all_cc_impl",
-        "@local_tsl//tsl/protobuf:dnn_proto_cc_impl",
         "@local_tsl//tsl/framework:allocator",
         "@local_tsl//tsl/framework:allocator_registry_impl",
         "@local_tsl//tsl/util:determinism",
@@ -97,7 +95,6 @@ def xla_cc_test(
                        clean_dep("//xla/service/gpu:backend_configs_cc_impl"),
                        clean_dep("//xla/service/gpu:hlo_op_profile_proto_cc_impl"),
                        clean_dep("//xla/stream_executor:device_description_proto_cc_impl"),
-                       clean_dep("//xla/stream_executor:dnn_proto_cc_impl"),
                        clean_dep("//xla/stream_executor:device_id_utils"),
                        clean_dep("//xla/stream_executor:stream_executor_impl"),
                        clean_dep("//xla/stream_executor/gpu:gpu_cudamallocasync_allocator"),
@@ -107,7 +104,6 @@ def xla_cc_test(
                        clean_dep("@local_tsl//tsl/profiler/backends/cpu:traceme_recorder_impl"),
                        clean_dep("@local_tsl//tsl/profiler/protobuf:xplane_proto_cc_impl"),
                        clean_dep("//xla:autotuning_proto_cc_impl"),
-                       clean_dep("@local_tsl//tsl/protobuf:dnn_proto_cc_impl"),
                        clean_dep("@local_tsl//tsl/protobuf:protos_all_cc_impl"),
                        clean_dep("@local_tsl//tsl/platform:env_impl"),
                        clean_dep("@local_tsl//tsl/framework:allocator"),

From 97c6f22c91db5f5cca04462eafc0fd69590c50b2 Mon Sep 17 00:00:00 2001
From: Marissa Ikonomidis <marissaw@google.com>
Date: Mon, 2 Oct 2023 08:35:30 -0700
Subject: [PATCH 464/567] Refactor batches_ into a function to make future
 changes easier

batches_ will be replaced with high and low priority versions. It
will be easier to update all the call sites if they all call the
same function to get batches_.

PiperOrigin-RevId: 570075341
---
 .../batching_util/shared_batch_scheduler.h    | 74 ++++++++++++-------
 1 file changed, 49 insertions(+), 25 deletions(-)

diff --git a/tensorflow/core/kernels/batching_util/shared_batch_scheduler.h b/tensorflow/core/kernels/batching_util/shared_batch_scheduler.h
index df7866c5f3c473..bcaedddc192c3b 100644
--- a/tensorflow/core/kernels/batching_util/shared_batch_scheduler.h
+++ b/tensorflow/core/kernels/batching_util/shared_batch_scheduler.h
@@ -445,6 +445,14 @@ class Queue {
   // Returns the number of enqueued batches.
   int64 num_enqueued_batches() const TF_EXCLUSIVE_LOCKS_REQUIRED(mu_);
 
+  // Gets the appropriate batches.
+  std::deque<std::unique_ptr<Batch<TaskType>>>& GetBatches()
+      TF_EXCLUSIVE_LOCKS_REQUIRED(mu_);
+
+  // Gets the appropriate batches (const version).
+  const std::deque<std::unique_ptr<Batch<TaskType>>>& GetBatches() const
+      TF_EXCLUSIVE_LOCKS_REQUIRED(mu_);
+
   const typename SharedBatchScheduler<TaskType>::QueueOptions options_;
 
   // The environment to use.
@@ -804,7 +812,7 @@ Queue<TaskType>::Queue(
     task_handle_batches_.emplace_back(
         new Batch<BatchInputTaskHandle<TaskType>>);
   } else {
-    batches_.emplace_back(new Batch<TaskType>);
+    GetBatches().emplace_back(new Batch<TaskType>);
   }
 }
 
@@ -817,7 +825,7 @@ Queue<TaskType>::~Queue() {
   if (options_.enable_lazy_split) {
     task_handle_batches_.back()->Close();
   } else {
-    batches_.back()->Close();
+    GetBatches().back()->Close();
   }
 }
 
@@ -882,7 +890,7 @@ Status Queue<TaskType>::ScheduleWithLazySplit(std::unique_ptr<TaskType>* task) {
     }
 
     if (!schedulable_batch_) {
-      if (batches_.size() > 1 || IsOpenBatchSchedulable()) {
+      if (GetBatches().size() > 1 || IsOpenBatchSchedulable()) {
         schedulable_batch_ = true;
         notify_of_schedulable_batch = true;
       }
@@ -923,8 +931,10 @@ Status Queue<TaskType>::ScheduleWithoutOrEagerSplit(
     // use up all queue capacity.
     TF_RETURN_IF_ERROR(ValidateBatchTaskQueueCapacity((*task).get()));
 
+    std::deque<std::unique_ptr<Batch<TaskType>>>& batches = GetBatches();
+
     const int64_t open_batch_remaining_slot =
-        max_execution_batch_size() - batches_.back()->size();
+        max_execution_batch_size() - batches.back()->size();
 
     const int64_t input_task_size = (*task)->size();
 
@@ -939,11 +949,11 @@ Status Queue<TaskType>::ScheduleWithoutOrEagerSplit(
     }
 
     for (int i = 0; i < output_tasks.size(); ++i) {
-      if (batches_.back()->size() + output_tasks[i]->size() >
+      if (batches.back()->size() + output_tasks[i]->size() >
           max_execution_batch_size()) {
         StartNewBatch();
       }
-      if (batches_.back()->empty()) {
+      if (batches.back()->empty()) {
         open_batch_start_time_micros_ = env_->NowMicros();
       }
       profiler::TraceMeProducer trace_me(
@@ -952,12 +962,12 @@ Status Queue<TaskType>::ScheduleWithoutOrEagerSplit(
                                            {{"size", output_tasks[i]->size()}});
           },
           profiler::ContextType::kSharedBatchScheduler,
-          batches_.back()->traceme_context_id());
-      batches_.back()->AddTask(std::move(output_tasks[i]));
+          batches.back()->traceme_context_id());
+      batches.back()->AddTask(std::move(output_tasks[i]));
     }
 
     if (!schedulable_batch_) {
-      if (batches_.size() > 1 || IsOpenBatchSchedulable()) {
+      if (batches.size() > 1 || IsOpenBatchSchedulable()) {
         schedulable_batch_ = true;
         notify_of_schedulable_batch = true;
       }
@@ -982,7 +992,7 @@ size_t Queue<TaskType>::NumEnqueuedTasks() const {
     return num_enqueued_tasks;
   }
 
-  for (const auto& batch : batches_) {
+  for (const auto& batch : GetBatches()) {
     num_enqueued_tasks += batch->num_tasks();
   }
   return num_enqueued_tasks;
@@ -1036,13 +1046,13 @@ Status Queue<TaskType>::ValidateBatchTaskQueueCapacity(TaskType* task) const {
   // allows such models to continue to work.
   //
   // We need to revisit/remove this check after we fix model configs.
-  if (batches_.back()->size() + task->size() >
-      options_.input_batch_size_limit) {
-    if (batches_.size() >= options_.max_enqueued_batches) {
+  const std::deque<std::unique_ptr<Batch<TaskType>>>& batches = GetBatches();
+  if (batches.back()->size() + task->size() > options_.input_batch_size_limit) {
+    if (batches.size() >= options_.max_enqueued_batches) {
       return errors::Unavailable(
           "The batch scheduling queue to which this task was submitted is "
           "full; currently ",
-          batches_.size(), " batches enqueued and max_enqueued_batches is ",
+          batches.size(), " batches enqueued and max_enqueued_batches is ",
           options_.max_enqueued_batches);
     }
   }
@@ -1059,16 +1069,17 @@ Queue<TaskType>::ScheduleBatchWithEagerSplit() {
   {
     mutex_lock l(mu_);
 
+    std::deque<std::unique_ptr<Batch<TaskType>>>& batches = GetBatches();
     // Consider closing the open batch at this time, to schedule it.
-    if (batches_.size() == 1 && IsOpenBatchSchedulable()) {
+    if (batches.size() == 1 && IsOpenBatchSchedulable()) {
       StartNewBatch();
     }
 
-    if (batches_.size() >= 2) {
+    if (batches.size() >= 2) {
       // There is at least one closed batch that is ready to be scheduled.
       ++num_batches_being_processed_;
-      batch_to_schedule = std::move(batches_.front());
-      batches_.pop_front();
+      batch_to_schedule = std::move(batches.front());
+      batches.pop_front();
     } else {
       schedulable_batch_ = false;
     }
@@ -1159,8 +1170,9 @@ bool Queue<TaskType>::IsEmptyInternal() const {
            task_handle_batches_.size() == 1 &&
            task_handle_batches_.back()->empty();
   }
-  return num_batches_being_processed_ == 0 && batches_.size() == 1 &&
-         batches_.back()->empty();
+  const std::deque<std::unique_ptr<Batch<TaskType>>>& batches = GetBatches();
+  return num_batches_being_processed_ == 0 && batches.size() == 1 &&
+         batches.back()->empty();
 }
 
 template <typename TaskType>
@@ -1171,8 +1183,9 @@ void Queue<TaskType>::StartNewBatch() {
         ++traceme_context_id_counter_));
     return;
   }
-  batches_.back()->Close();
-  batches_.emplace_back(new Batch<TaskType>(++traceme_context_id_counter_));
+  std::deque<std::unique_ptr<Batch<TaskType>>>& batches = GetBatches();
+  batches.back()->Close();
+  batches.emplace_back(new Batch<TaskType>(++traceme_context_id_counter_));
 }
 
 template <typename TaskType>
@@ -1188,7 +1201,7 @@ Status Queue<TaskType>::SplitInputBatchIntoSubtasks(
 
 template <typename TaskType>
 bool Queue<TaskType>::IsOpenBatchSchedulableAfterEagerSplit() const {
-  Batch<TaskType>* open_batch = batches_.back().get();
+  Batch<TaskType>* open_batch = GetBatches().back().get();
   if (open_batch->empty()) {
     return false;
   }
@@ -1218,7 +1231,7 @@ size_t Queue<TaskType>::tail_batch_task_size() const {
     return task_handle_batches_.back()->size();
   }
 
-  return batches_.back()->size();
+  return GetBatches().back()->size();
 }
 
 template <typename TaskType>
@@ -1226,7 +1239,18 @@ int64 Queue<TaskType>::num_enqueued_batches() const {
   if (options_.enable_lazy_split) {
     return task_handle_batches_.size();
   }
-  return batches_.size();
+  return GetBatches().size();
+}
+
+template <typename TaskType>
+std::deque<std::unique_ptr<Batch<TaskType>>>& Queue<TaskType>::GetBatches() {
+  return batches_;
+}
+
+template <typename TaskType>
+const std::deque<std::unique_ptr<Batch<TaskType>>>&
+Queue<TaskType>::GetBatches() const {
+  return batches_;
 }
 
 template <typename TaskType>

From 702de55dbbc726d49d52bbceac42307aef64bff1 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 2 Oct 2023 08:57:58 -0700
Subject: [PATCH 465/567] Use non-member equality operators for TensorShape and
 add a non-member operator== for PartialTensorShape.

PiperOrigin-RevId: 570081029
---
 tensorflow/core/framework/tensor_shape.h | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/tensorflow/core/framework/tensor_shape.h b/tensorflow/core/framework/tensor_shape.h
index 30fd25355f8315..8b1373bf0e945c 100644
--- a/tensorflow/core/framework/tensor_shape.h
+++ b/tensorflow/core/framework/tensor_shape.h
@@ -389,8 +389,6 @@ class TensorShape : public TensorShapeBase<TensorShape> {
   /// Returns true if `*this` and `b` have the same sizes. Ignores
   /// dimension names.
   bool IsSameSize(const TensorShape& b) const;
-  bool operator==(const TensorShape& b) const { return IsSameSize(b); }
-  bool operator!=(const TensorShape& b) const { return !IsSameSize(b); }
 
   /// Fill `*dsizes` from `*this`.
   /// Notice: Using IndexType=int32 in combination with To32Bit() can
@@ -443,6 +441,13 @@ class TensorShape : public TensorShapeBase<TensorShape> {
   friend class Tensor;
 };
 
+inline bool operator==(const TensorShape& a, const TensorShape& b) {
+  return a.IsSameSize(b);
+}
+inline bool operator!=(const TensorShape& a, const TensorShape& b) {
+  return !(a == b);
+}
+
 /// Outputs `TensorShapeBase` to `std::ostream`.
 inline std::ostream& operator<<(std::ostream& os, const TensorShape& ts) {
   return os << ts.DebugString();
@@ -611,6 +616,11 @@ class PartialTensorShape : public TensorShapeBase<PartialTensorShape> {
   }
 };
 
+inline bool operator==(const PartialTensorShape& a,
+                       const PartialTensorShape& b) {
+  return a.IsIdenticalTo(b);
+}
+
 /// \brief Static helper routines for `PartialTensorShape`. Includes a few
 /// common predicates on a partially known tensor shape.
 class PartialTensorShapeUtils {

From 95edcf9e835ba67939fe582846211ae1d669df80 Mon Sep 17 00:00:00 2001
From: Michael Hudgins <michaelhudgins@google.com>
Date: Mon, 2 Oct 2023 16:12:38 +0000
Subject: [PATCH 466/567] Add modifications for ARM 64 ci

---
 .bazelrc                                      | 62 +++++++++++++------
 .../envs/continuous_linux_arm64_cpu_py310     |  5 ++
 2 files changed, 47 insertions(+), 20 deletions(-)
 create mode 100644 ci/official/envs/continuous_linux_arm64_cpu_py310

diff --git a/.bazelrc b/.bazelrc
index 9691d264a24a8f..a0d3795d18cab2 100644
--- a/.bazelrc
+++ b/.bazelrc
@@ -321,6 +321,10 @@ build:linux --copt="-Werror=switch"
 # Required for building with clang
 build:linux --copt="-Wno-error=unused-but-set-variable"
 
+# Linux ARM64 specific options
+build:linux_arm64 --copt="-mtune=generic" --copt="-march=armv8-a" --copt="-O3"
+
+
 # On Windows, `__cplusplus` is wrongly defined without this switch
 # See https://devblogs.microsoft.com/cppblog/msvc-now-correctly-reports-__cplusplus/
 build:windows --copt=/Zc:__cplusplus
@@ -563,43 +567,46 @@ try-import %workspace%/.bazelrc.user
 test:release_base --test_size_filters=small,medium
 
 # Target the AVX instruction set
-build:release_cpu_linux --config=avx_linux
-# Use the Clang toolchain to compile
-build:release_cpu_linux --crosstool_top="@sigbuild-r2.14-clang_config_cuda//crosstool:toolchain"
+build:release_linux_base --config=avx_linux
+
 # Disable clang extention that rejects type definitions within offsetof.
 # This was added in clang-16 by https://reviews.llvm.org/D133574.
 # Can be removed once upb is updated, since a type definition is used within
 # offset of in the current version of ubp.
 # See https://github.com/protocolbuffers/upb/blob/9effcbcb27f0a665f9f345030188c0b291e32482/upb/upb.c#L183.
-build:release_cpu_linux --copt=-Wno-gnu-offsetof-extensions
-build:release_cpu_linux --copt=-Wno-error=array-parameter
-build:release_cpu_linux --copt=-Wno-error=unused-command-line-argument
+build:release_linux_base --copt=-Wno-gnu-offsetof-extensions
+build:release_linux_base --copt=-Wno-error=array-parameter
+build:release_linux_base --copt=-Wno-error=unused-command-line-argument
 # Set lld as the linker.
-build:release_cpu_linux --linkopt="-fuse-ld=lld"
-build:release_cpu_linux --linkopt="-lm"
+build:release_linux_base --linkopt="-fuse-ld=lld"
+build:release_linux_base --linkopt="-lm"
 
 # We have some invalid linker scripts in the build,
 # so we need to disable this check
-build:release_cpu_linux --linkopt=-Wl,--undefined-version
+build:release_linux_base --linkopt=-Wl,--undefined-version
 
 # Container environment settings below this point.
 # Use Python 3.X as installed in container image
-build:release_cpu_linux --action_env PYTHON_BIN_PATH="/usr/bin/python3"
-build:release_cpu_linux --action_env PYTHON_LIB_PATH="/usr/lib/tf_python"
-build:release_cpu_linux --python_path="/usr/bin/python3"
+build:release_linux_base --action_env PYTHON_BIN_PATH="/usr/bin/python3"
+build:release_linux_base --action_env PYTHON_LIB_PATH="/usr/lib/tf_python"
+build:release_linux_base --python_path="/usr/bin/python3"
 # Set Clang as compiler. Use the actual path to clang installed in container.
-build:release_cpu_linux --repo_env=CC="/usr/lib/llvm-17/bin/clang"
-build:release_cpu_linux --repo_env=BAZEL_COMPILER="/usr/lib/llvm-17/bin/clang"
+build:release_linux_base --repo_env=CC="/usr/lib/llvm-17/bin/clang"
+build:release_linux_base --repo_env=BAZEL_COMPILER="/usr/lib/llvm-17/bin/clang"
 # Store performance profiling log in the mounted artifact directory.
 # The profile can be viewed by visiting chrome://tracing in a Chrome browser.
 # See https://docs.bazel.build/versions/main/skylark/performance.html#performance-profiling
-build:release_cpu_linux --profile=/tf/pkg/profile.json.gz
+build:release_linux_base --profile=/tf/pkg/profile.json.gz
 # Test-related settings below this point.
-test:release_cpu_linux --build_tests_only --keep_going --test_output=errors --verbose_failures=true
-test:release_cpu_linux --local_test_jobs=HOST_CPUS
-test:release_cpu_linux --test_env=LD_LIBRARY_PATH
+test:release_linux_base --build_tests_only --keep_going --test_output=errors --verbose_failures=true
+test:release_linux_base --local_test_jobs=HOST_CPUS
+test:release_linux_base --test_env=LD_LIBRARY_PATH
 # Give only the list of failed tests at the end of the log
-test:release_cpu_linux --test_summary=short
+test:release_linux_base --test_summary=short
+
+# Use the Clang toolchain to compile
+build:release_cpu_linux --config=release_linux_base
+build:release_cpu_linux --crosstool_top="@sigbuild-r2.14-clang_config_cuda//crosstool:toolchain"
 
 build:release_gpu_linux --config=release_cpu_linux
 # Set up compilation CUDA version and paths and use the CUDA Clang toolchain.
@@ -609,6 +616,12 @@ test:release_gpu_linux --test_env=LD_LIBRARY_PATH="/usr/local/cuda/lib64:/usr/lo
 # Local test jobs has to be 4 because parallel_gpu_execute is fragile, I think
 test:release_gpu_linux --test_timeout=300,450,1200,3600 --local_test_jobs=4 --run_under=//tensorflow/tools/ci_build/gpu_build:parallel_gpu_execute
 
+build:release_arm64_linux --config=release_linux_base
+build:release_arm64_linux --config=linux_arm64
+build:release_arm64_linux --crosstool_top="@ml2014_clang_aarch64_config_aarch64//crosstool:toolchain"
+build:release_arm64_linux --config=mkl_aarch64_threadpool
+build:release_arm64_linux --copt=-flax-vector-conversions
+
 # The old gcc linux build options are preserved in the unsupported_*_linux
 # configs. If your project fails to build with Clang, you can use these
 # unsupported flags to replace the release flags in your build command.
@@ -682,6 +695,9 @@ test:linux_cuda_wheel_test_filters --test_tag_filters=gpu,requires-gpu,-no_gpu,-
 test:linux_cuda_wheel_test_filters --build_tag_filters=gpu,requires-gpu,-no_gpu,-no_oss,-oss_excluded,-oss_serial,-no_cuda11,-no_oss_py38,-no_oss_py39,-no_oss_py310
 test:linux_cuda_wheel_test_filters --test_lang_filters=py --test_size_filters=small,medium
 test:linux_cuda_wheel_test --config=linux_cuda_wheel_test_filters -- //tensorflow/... -//tensorflow/wheel/integration_testing/... -//tensorflow/compiler/tf2tensorrt/... -//tensorflow/compiler/xrt/... -//tensorflow/core/tpu/... -//tensorflow/lite/... -//tensorflow/tools/toolchains/...
+# ARM64 WHEEL
+test:linux_arm64_wheel_test_filters --config=linux_cpu_wheel_test_filters
+test:linux_arm64_wheel_test_filters --test_tag_filters=-no_aarch64
 
 # PYCPP TESTS run a suite of Python and C++ tests to verify general correctness over
 # the whole TF code base. These are usually run continuously or upon presubmit.
@@ -695,5 +711,11 @@ test:linux_cuda_pycpp_test_filters --test_tag_filters=-no_oss,-oss_excluded,-oss
 test:linux_cuda_pycpp_test_filters --build_tag_filters=-no_oss,-oss_excluded,-oss_serial,-benchmark-test,-v1only,gpu,-no_gpu,-no_gpu_presubmit,-no_cuda11
 test:linux_cuda_pycpp_test_filters --test_lang_filters=cc,py --test_size_filters=small,medium
 test:linux_cuda_pycpp_test --config=linux_cuda_pycpp_test_filters -- //tensorflow/... -//tensorflow/python/integration_testing/... -//tensorflow/compiler/tf2tensorrt/... -//tensorflow/compiler/xrt/... -//tensorflow/core/tpu/... -//tensorflow/lite/... -//tensorflow/tools/toolchains/...
-
+# ARM64 PYCPP
+test:linux_arm64_pycpp_test_filters --test_tag_filters=-no_oss,-no_aarch64,-oss_excluded,-oss_serial,-gpu,-tpu,-benchmark-test,-v1only
+test:linux_arm64_pycpp_test_filters --build_tag_filters=-no_oss,-no_aarch64,-oss_excluded,-oss_serial,-gpu,-tpu,-benchmark-test,-v1only
+# TODO(michaelhudgins): Why did prior ARM tests separate out py and cc?
+test:linux_arm64_pycpp_test_filters --test_lang_filters=cc --test_size_filters=small,medium --flaky_test_attempts=3
+# TODO(michaelhudgins): Why do we need to specifically omit go and java here? 
+test:linux_arm64_pycpp_test --config=linux_arm64_pycpp_test_filters -- //tensorflow/... -//tensorflow/python/integration_testing/... -//tensorflow/compiler/tf2tensorrt/... -//tensorflow/compiler/xrt/... -//tensorflow/core/tpu/... -//tensorflow/lite/... -//tensorflow/tools/toolchains/... -//tensorflow/go/... -//tensorflow/java/... -//tensorflow/core/grappler/optimizers:auto_mixed_precision_test_cpu -//tensorflow/core/grappler/optimizers:remapper_test_cpu -//tensorflow/core/kernels/image:resize_bicubic_op_test
 # END TF TEST SUITE OPTIONS
diff --git a/ci/official/envs/continuous_linux_arm64_cpu_py310 b/ci/official/envs/continuous_linux_arm64_cpu_py310
new file mode 100644
index 00000000000000..c44e75d8b75e5f
--- /dev/null
+++ b/ci/official/envs/continuous_linux_arm64_cpu_py310
@@ -0,0 +1,5 @@
+source ci/official/envs/ci_default
+TFCI_BAZEL_TARGET_SELECTING_CONFIG_PREFIX=linux_arm64
+TFCI_BAZEL_COMMON_ARGS=(--config release_arm64_linux --config tf_public_cache_push --repo_env=TF_PYTHON_VERSION=3.10)
+TFCI_DOCKER_IMAGE=gcr.io/tensorflow-sigs/build-arm64:tf-latest-multi-python
+TFCI_DOCKER_REBUILD_ARGS=(--target=tf ci/official/containers/linux_arm64)

From 759fda54f65f59354f68fc324387f363779b572a Mon Sep 17 00:00:00 2001
From: Michael Hudgins <michaelhudgins@google.com>
Date: Mon, 2 Oct 2023 16:17:29 +0000
Subject: [PATCH 467/567] Update bazelrc

---
 .bazelrc | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/.bazelrc b/.bazelrc
index a0d3795d18cab2..3b4051a781b80d 100644
--- a/.bazelrc
+++ b/.bazelrc
@@ -696,8 +696,10 @@ test:linux_cuda_wheel_test_filters --build_tag_filters=gpu,requires-gpu,-no_gpu,
 test:linux_cuda_wheel_test_filters --test_lang_filters=py --test_size_filters=small,medium
 test:linux_cuda_wheel_test --config=linux_cuda_wheel_test_filters -- //tensorflow/... -//tensorflow/wheel/integration_testing/... -//tensorflow/compiler/tf2tensorrt/... -//tensorflow/compiler/xrt/... -//tensorflow/core/tpu/... -//tensorflow/lite/... -//tensorflow/tools/toolchains/...
 # ARM64 WHEEL
-test:linux_arm64_wheel_test_filters --config=linux_cpu_wheel_test_filters
-test:linux_arm64_wheel_test_filters --test_tag_filters=-no_aarch64
+test:linux_arm64_wheel_test_filters --test_tag_filters=-no_oss,-no_aarch64,-oss_excluded,-oss_serial,-gpu,-tpu,-benchmark-test,-v1only,-no_oss_py38,-no_oss_py39,-no_oss_py310
+test:linux_arm64_wheel_test_filters --build_tag_filters=-no_oss,-no_aarch64,-oss_excluded,-oss_serial,-gpu,-tpu,-benchmark-test,-v1only,-no_oss_py38,-no_oss_py39,-no_oss_py310
+test:linux_arm64_wheel_test_filters --test_lang_filters=py --test_size_filters=small,medium
+test:linux_arm64_wheel_test --config=linux_arm64_wheel_test_filters -- //tensorflow/... -//tensorflow/wheel/integration_testing/... -//tensorflow/compiler/tf2tensorrt/... -//tensorflow/compiler/xrt/... -//tensorflow/core/tpu/... -//tensorflow/lite/... -//tensorflow/tools/toolchains/...  -//tensorflow/go/... -//tensorflow/java/... -//tensorflow/core/grappler/optimizers:auto_mixed_precision_test_cpu -//tensorflow/core/grappler/optimizers:remapper_test_cpu -//tensorflow/core/kernels/image:resize_bicubic_op_test
 
 # PYCPP TESTS run a suite of Python and C++ tests to verify general correctness over
 # the whole TF code base. These are usually run continuously or upon presubmit.

From 3cc8d2d34f92b8aa0736ae8edef37e85088cd30b Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 2 Oct 2023 09:13:11 -0700
Subject: [PATCH 468/567] Internal changes only.

PiperOrigin-RevId: 570085339
---
 tensorflow/lite/delegates/gpu/BUILD | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/tensorflow/lite/delegates/gpu/BUILD b/tensorflow/lite/delegates/gpu/BUILD
index d824c538afe9d6..6e767beb635cfb 100644
--- a/tensorflow/lite/delegates/gpu/BUILD
+++ b/tensorflow/lite/delegates/gpu/BUILD
@@ -33,6 +33,19 @@ config_setting(
     },
 )
 
+# copybara:uncomment_begin(google-only)
+# config_setting(
+#     name = "tflite_gpu_angle",
+#     flag_values = {
+#         "//tools/cpp:cc_target_os": "linux-google",
+#         "//third_party/angle:use_angle": "True",
+#     },
+#     values = {
+#         "cpu": "k8",
+#     },
+# )
+# copybara:uncomment_end
+
 cc_library(
     name = "gl_delegate",
     srcs = ["gl_delegate.cc"],

From 18831bfbf7801bb9b8dfac2b6a6a5833e9cf24c4 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 2 Oct 2023 09:29:09 -0700
Subject: [PATCH 469/567] Introduces a HandlerBase class containing
 functionality -- like Split() -- common to both DotHandler and ConvHandler.

PiperOrigin-RevId: 570089566
---
 .../auto_sharding_dot_handler.cc              | 264 ++++++++----------
 1 file changed, 112 insertions(+), 152 deletions(-)

diff --git a/third_party/xla/xla/hlo/experimental/auto_sharding/auto_sharding_dot_handler.cc b/third_party/xla/xla/hlo/experimental/auto_sharding/auto_sharding_dot_handler.cc
index 651a91b03bf5c4..4fe6753f449f21 100644
--- a/third_party/xla/xla/hlo/experimental/auto_sharding/auto_sharding_dot_handler.cc
+++ b/third_party/xla/xla/hlo/experimental/auto_sharding/auto_sharding_dot_handler.cc
@@ -43,51 +43,14 @@ namespace spmd {
 using DimMap = StableHashMap</*instr. dim idx*/ int, /* mesh dim idx*/ int>;
 using MeshDims = absl::Span<const int64_t>;
 
-void AppendNewStrategy(const HloInstruction* ins, const std::string& name,
-                       const HloSharding& output_spec,
-                       absl::Span<const HloSharding> input_specs,
-                       double compute_cost, double communication_cost,
-                       const ClusterEnvironment& cluster_env,
-                       const StrategyMap& strategy_map,
-                       std::unique_ptr<StrategyVector>& strategies) {
-  std::vector<std::vector<double>> resharding_costs;
-
-  for (int i = 0; i < ins->operand_count(); ++i) {
-    const HloInstruction* operand = ins->operand(i);
-    resharding_costs.push_back(
-        ReshardingCostVector(strategy_map.at(operand).get(), operand->shape(),
-                             input_specs[i], cluster_env));
-  }
-
-  strategies->leaf_vector.push_back(ShardingStrategy({
-      name,
-      output_spec,
-      compute_cost,
-      communication_cost,
-      GetBytes(ins->shape()) / output_spec.NumTiles(),
-      resharding_costs,
-      {input_specs.begin(), input_specs.end()},
-  }));
-}
-
-// Calls the given 'split_func' on all possible mesh dim combinations.
-void SplitWithMeshShape(std::function<void(MeshDims)> split_func,
-                        absl::Span<const int64_t> mesh_shape) {
-  for (int64_t i = 0; i < mesh_shape.size(); ++i) {
-    for (int64_t j = (i + 1); j < mesh_shape.size(); ++j) {
-      split_func({i, j});
-      split_func({j, i});
-    }
-  }
-}
-
-class DotHandler {
- public:
-  DotHandler(std::unique_ptr<StrategyVector>& strategies,
-             StrategyMap& strategy_map, const HloInstruction* ins,
-             const ClusterEnvironment& cluster_env,
-             const InstructionBatchDimMap& batch_map,
-             const AutoShardingSolverOption& solver_option)
+// Contains base functionality common to both DotHandler and ConvHandler.
+class HandlerBase {
+ protected:
+  HandlerBase(std::unique_ptr<StrategyVector>& strategies,
+              StrategyMap& strategy_map, const HloInstruction* ins,
+              const ClusterEnvironment& cluster_env,
+              const InstructionBatchDimMap& batch_map,
+              const AutoShardingSolverOption& solver_option)
       : strategies_(strategies),
         strategy_map_(strategy_map),
         ins_(ins),
@@ -97,19 +60,41 @@ class DotHandler {
         device_mesh_(cluster_env.device_mesh_),
         device_mesh_1d_(cluster_env.device_mesh_1d_),
         lhs_(ins->operand(0)),
-        rhs_(ins->operand(1)),
-        dot_dnums_(ins->dot_dimension_numbers()),
-        space_base_dim_(dot_dnums_.lhs_batch_dimensions_size()),
-        lhs_con_dims_(
-            ins->dot_dimension_numbers().lhs_contracting_dimensions()),
-        rhs_con_dims_(
-            ins->dot_dimension_numbers().rhs_contracting_dimensions()),
-        lhs_batch_dims_(ins->dot_dimension_numbers().lhs_batch_dimensions()),
-        rhs_batch_dims_(ins->dot_dimension_numbers().rhs_batch_dimensions()) {
-    std::tie(lhs_space_dims_, rhs_space_dims_) =
-        GetSpaceDims(lhs_->shape(), rhs_->shape(), dot_dnums_);
-    CHECK_EQ(lhs_con_dims_.size(), rhs_con_dims_.size());
-    CHECK_EQ(lhs_batch_dims_.size(), rhs_batch_dims_.size());
+        rhs_(ins->operand(1)) {}
+
+  void AppendNewStrategy(const std::string& name,
+                         const HloSharding& output_spec,
+                         absl::Span<const HloSharding> input_specs,
+                         double compute_cost, double communication_cost) {
+    std::vector<std::vector<double>> resharding_costs;
+
+    for (int i = 0; i < ins_->operand_count(); ++i) {
+      const HloInstruction* operand = ins_->operand(i);
+      resharding_costs.push_back(
+          ReshardingCostVector(strategy_map_.at(operand).get(),
+                               operand->shape(), input_specs[i], cluster_env_));
+    }
+
+    strategies_->leaf_vector.push_back(ShardingStrategy({
+        name,
+        output_spec,
+        compute_cost,
+        communication_cost,
+        GetBytes(ins_->shape()) / output_spec.NumTiles(),
+        resharding_costs,
+        {input_specs.begin(), input_specs.end()},
+    }));
+  }
+
+  // Calls the given 'split_func' on all possible mesh dim combinations.
+  void Split(std::function<void(MeshDims)> split_func) {
+    auto mesh_shape = device_mesh_.dimensions();
+    for (int64_t i = 0; i < mesh_shape.size(); ++i) {
+      for (int64_t j = (i + 1); j < mesh_shape.size(); ++j) {
+        split_func({i, j});
+        split_func({j, i});
+      }
+    }
   }
 
   bool CheckDims(const HloInstruction* ins,
@@ -127,6 +112,42 @@ class DotHandler {
     return true;
   }
 
+  std::unique_ptr<StrategyVector>& strategies_;
+  StrategyMap& strategy_map_;
+  const HloInstruction* ins_;
+  const ClusterEnvironment& cluster_env_;
+  const InstructionBatchDimMap& batch_map_;
+  const AutoShardingSolverOption& solver_option_;
+
+  const Array<int64_t>& device_mesh_;
+  const Array<int64_t>& device_mesh_1d_;
+  const HloInstruction* lhs_;
+  const HloInstruction* rhs_;
+};
+
+class DotHandler : public HandlerBase {
+ public:
+  DotHandler(std::unique_ptr<StrategyVector>& strategies,
+             StrategyMap& strategy_map, const HloInstruction* ins,
+             const ClusterEnvironment& cluster_env,
+             const InstructionBatchDimMap& batch_map,
+             const AutoShardingSolverOption& solver_option)
+      : HandlerBase(strategies, strategy_map, ins, cluster_env, batch_map,
+                    solver_option),
+        dot_dnums_(ins->dot_dimension_numbers()),
+        space_base_dim_(dot_dnums_.lhs_batch_dimensions_size()),
+        lhs_con_dims_(
+            ins->dot_dimension_numbers().lhs_contracting_dimensions()),
+        rhs_con_dims_(
+            ins->dot_dimension_numbers().rhs_contracting_dimensions()),
+        lhs_batch_dims_(ins->dot_dimension_numbers().lhs_batch_dimensions()),
+        rhs_batch_dims_(ins->dot_dimension_numbers().rhs_batch_dimensions()) {
+    std::tie(lhs_space_dims_, rhs_space_dims_) =
+        GetSpaceDims(lhs_->shape(), rhs_->shape(), dot_dnums_);
+    CHECK_EQ(lhs_con_dims_.size(), rhs_con_dims_.size());
+    CHECK_EQ(lhs_batch_dims_.size(), rhs_batch_dims_.size());
+  }
+
   void SplitLhsSpaceRhsSpace(MeshDims mesh_dims) {
     DCHECK_EQ(mesh_dims.size(), 2);
     for (int64_t i = 0; i < lhs_space_dims_.size(); ++i) {
@@ -147,8 +168,7 @@ class DotHandler {
         HloSharding rhs_spec = Tile(rhs_->shape(), {rhs_space_dims_[j]},
                                     {mesh_dims[1]}, device_mesh_);
 
-        AppendNewStrategy(ins_, name, output_spec, {lhs_spec, rhs_spec}, 0, 0,
-                          cluster_env_, strategy_map_, strategies_);
+        AppendNewStrategy(name, output_spec, {lhs_spec, rhs_spec}, 0, 0);
       }
     }
   }
@@ -170,8 +190,7 @@ class DotHandler {
                  mesh_dims, device_mesh_);
         HloSharding rhs_spec = HloSharding::Replicate();
 
-        AppendNewStrategy(ins_, name, output_spec, {lhs_spec, rhs_spec}, 0, 0,
-                          cluster_env_, strategy_map_, strategies_);
+        AppendNewStrategy(name, output_spec, {lhs_spec, rhs_spec}, 0, 0);
       }
     }
   }
@@ -196,8 +215,7 @@ class DotHandler {
             Tile(rhs_->shape(), {rhs_space_dims_[i], rhs_space_dims_[j]},
                  mesh_dims, device_mesh_);
 
-        AppendNewStrategy(ins_, name, output_spec, {lhs_spec, rhs_spec}, 0, 0,
-                          cluster_env_, strategy_map_, strategies_);
+        AppendNewStrategy(name, output_spec, {lhs_spec, rhs_spec}, 0, 0);
       }
     }
   }
@@ -226,9 +244,8 @@ class DotHandler {
           double memory_cost = GetBytes(ins_->shape()) / output_spec.NumTiles();
           double communication_cost =
               cluster_env_.AllReduceCost(memory_cost, mesh_dims[1]);
-          AppendNewStrategy(ins_, name, output_spec, {lhs_spec, rhs_spec}, 0,
-                            communication_cost, cluster_env_, strategy_map_,
-                            strategies_);
+          AppendNewStrategy(name, output_spec, {lhs_spec, rhs_spec}, 0,
+                            communication_cost);
         }
       }
     }
@@ -259,9 +276,8 @@ class DotHandler {
           double communication_cost =
               cluster_env_.AllReduceCost(memory_cost, mesh_dims[0]);
 
-          AppendNewStrategy(ins_, name, output_spec, {lhs_spec, rhs_spec}, 0,
-                            communication_cost, cluster_env_, strategy_map_,
-                            strategies_);
+          AppendNewStrategy(name, output_spec, {lhs_spec, rhs_spec}, 0,
+                            communication_cost);
         }
       }
     }
@@ -280,8 +296,7 @@ class DotHandler {
           HloSharding rhs_spec =
               Tile(rhs_->shape(), {rhs_batch_dims_[i]}, {j}, device_mesh_);
 
-          AppendNewStrategy(ins_, name, output_spec, {lhs_spec, rhs_spec}, 0, 0,
-                            cluster_env_, strategy_map_, strategies_);
+          AppendNewStrategy(name, output_spec, {lhs_spec, rhs_spec}, 0, 0);
         }
       }
     }
@@ -304,8 +319,7 @@ class DotHandler {
       HloSharding rhs_spec =
           Tile(rhs_->shape(), {rhs_batch_dims_[0], rhs_batch_dims_[1]},
                mesh_dims, device_mesh_);
-      AppendNewStrategy(ins_, name, output_spec, {lhs_spec, rhs_spec}, 0, 0,
-                        cluster_env_, strategy_map_, strategies_);
+      AppendNewStrategy(name, output_spec, {lhs_spec, rhs_spec}, 0, 0);
     }
   }
 
@@ -328,8 +342,7 @@ class DotHandler {
           HloSharding rhs_spec = Tile(rhs_->shape(), {rhs_batch_dims_[j]},
                                       {mesh_dims[0]}, device_mesh_);
 
-          AppendNewStrategy(ins_, name, output_spec, {lhs_spec, rhs_spec}, 0, 0,
-                            cluster_env_, strategy_map_, strategies_);
+          AppendNewStrategy(name, output_spec, {lhs_spec, rhs_spec}, 0, 0);
         }
       }
     }
@@ -357,8 +370,7 @@ class DotHandler {
               Tile(rhs_->shape(), {rhs_batch_dims_[j], rhs_space_dims_[i]},
                    mesh_dims, device_mesh_);
 
-          AppendNewStrategy(ins_, name, output_spec, {lhs_spec, rhs_spec}, 0, 0,
-                            cluster_env_, strategy_map_, strategies_);
+          AppendNewStrategy(name, output_spec, {lhs_spec, rhs_spec}, 0, 0);
         }
       }
     }
@@ -388,9 +400,8 @@ class DotHandler {
           double communication_cost =
               cluster_env_.AllReduceCost(memory_cost, mesh_dims[1]);
 
-          AppendNewStrategy(ins_, name, output_spec, {lhs_spec, rhs_spec}, 0,
-                            communication_cost, cluster_env_, strategy_map_,
-                            strategies_);
+          AppendNewStrategy(name, output_spec, {lhs_spec, rhs_spec}, 0,
+                            communication_cost);
         }
       }
     }
@@ -422,9 +433,8 @@ class DotHandler {
           double memory_cost = GetBytes(ins_->shape()) / output_spec.NumTiles();
           double communication_cost = cluster_env_.AllReduceCost(
               memory_cost, mesh_dims[0], mesh_dims[1]);
-          AppendNewStrategy(ins_, name, output_spec, {lhs_spec, rhs_spec}, 0,
-                            communication_cost, cluster_env_, strategy_map_,
-                            strategies_);
+          AppendNewStrategy(name, output_spec, {lhs_spec, rhs_spec}, 0,
+                            communication_cost);
         }
       }
     }
@@ -449,9 +459,8 @@ class DotHandler {
         double communication_cost =
             cluster_env_.AllReduceCost(memory_cost, mesh_dims[0]);
 
-        AppendNewStrategy(ins_, name, output_spec, {lhs_spec, rhs_spec},
-                          compute_cost, communication_cost, cluster_env_,
-                          strategy_map_, strategies_);
+        AppendNewStrategy(name, output_spec, {lhs_spec, rhs_spec}, compute_cost,
+                          communication_cost);
       }
     }
   }
@@ -479,8 +488,7 @@ class DotHandler {
           HloSharding lhs_spec = Tile(lhs_->shape(), {lhs_space_dims_[i]},
                                       {mesh_dim}, device_mesh_1d_);
           HloSharding rhs_spec = HloSharding::Replicate();
-          AppendNewStrategy(ins_, name, output_spec, {lhs_spec, rhs_spec}, 0, 0,
-                            cluster_env_, strategy_map_, strategies_);
+          AppendNewStrategy(name, output_spec, {lhs_spec, rhs_spec}, 0, 0);
       }
 
       // R = Sk x Sk @ (allreduce @ 0)
@@ -504,9 +512,8 @@ class DotHandler {
           double communication_cost =
               cluster_env_.AllReduceCost(memory_cost, mesh_dim);
 
-          AppendNewStrategy(ins_, name, output_spec, {lhs_spec, rhs_spec}, 0,
-                            communication_cost, cluster_env_, strategy_map_,
-                            strategies_);
+          AppendNewStrategy(name, output_spec, {lhs_spec, rhs_spec}, 0,
+                            communication_cost);
         }
     }
   }
@@ -528,16 +535,11 @@ class DotHandler {
                                       {mesh_dim}, device_mesh_1d_);
           HloSharding rhs_spec = Tile(rhs_->shape(), {rhs_batch_dims_[i]},
                                       {mesh_dim}, device_mesh_1d_);
-          AppendNewStrategy(ins_, name, output_spec, {lhs_spec, rhs_spec}, 0, 0,
-                            cluster_env_, strategy_map_, strategies_);
+          AppendNewStrategy(name, output_spec, {lhs_spec, rhs_spec}, 0, 0);
       }
     }
   }
 
-  void Split(std::function<void(MeshDims)> split_func) const {
-    SplitWithMeshShape(split_func, device_mesh_.dimensions());
-  }
-
   Status RegisterStrategies() {
     // SS = SR x RS
     // Split lhs space dim and rhs space dim.
@@ -630,18 +632,6 @@ class DotHandler {
     return OkStatus();
   }
 
-  std::unique_ptr<StrategyVector>& strategies_;
-  StrategyMap& strategy_map_;
-  const HloInstruction* ins_;
-  const ClusterEnvironment& cluster_env_;
-  const InstructionBatchDimMap& batch_map_;
-  const AutoShardingSolverOption& solver_option_;
-
-  const Array<int64_t>& device_mesh_;
-  const Array<int64_t>& device_mesh_1d_;
-  const HloInstruction* lhs_;
-  const HloInstruction* rhs_;
-
   // Dimension information
   const DotDimensionNumbers& dot_dnums_;
   int64_t space_base_dim_;
@@ -668,23 +658,15 @@ Status HandleDot(std::unique_ptr<StrategyVector>& strategies,
   return OkStatus();
 }
 
-class ConvHandler {
+class ConvHandler : public HandlerBase {
  public:
   ConvHandler(std::unique_ptr<StrategyVector>& strategies,
               StrategyMap& strategy_map, const HloInstruction* ins,
               const ClusterEnvironment& cluster_env,
               const InstructionBatchDimMap& batch_map,
               const AutoShardingSolverOption& solver_option)
-      : strategies_(strategies),
-        strategy_map_(strategy_map),
-        ins_(ins),
-        cluster_env_(cluster_env),
-        batch_map_(batch_map),
-        solver_option_(solver_option),
-        device_mesh_(cluster_env.device_mesh_),
-        device_mesh_1d_(cluster_env.device_mesh_1d_),
-        lhs_(ins->operand(0)),
-        rhs_(ins->operand(1)),
+      : HandlerBase(strategies, strategy_map, ins, cluster_env, batch_map,
+                    solver_option),
         conv_dnums_(ins->convolution_dimension_numbers()) {
     lhs_batch_dim_ = conv_dnums_.input_batch_dimension();
     lhs_in_channel_dim_ = conv_dnums_.input_feature_dimension();
@@ -706,8 +688,7 @@ class ConvHandler {
     HloSharding rhs_spec = Tile(rhs_->shape(), {rhs_out_channel_dim_},
                                 {mesh_dims[1]}, device_mesh_);
 
-    AppendNewStrategy(ins_, name, output_spec, {lhs_spec, rhs_spec}, 0, 0,
-                      cluster_env_, strategy_map_, strategies_);
+    AppendNewStrategy(name, output_spec, {lhs_spec, rhs_spec}, 0, 0);
   }
 
   void SplitLhsBatchBothInchannel(MeshDims mesh_dims) {
@@ -729,9 +710,8 @@ class ConvHandler {
       double communication_cost =
           cluster_env_.AllReduceCost(memory_cost, mesh_dims[1]);
 
-      AppendNewStrategy(ins_, name, output_spec, {lhs_spec, rhs_spec}, 0,
-                        communication_cost, cluster_env_, strategy_map_,
-                        strategies_);
+      AppendNewStrategy(name, output_spec, {lhs_spec, rhs_spec}, 0,
+                        communication_cost);
     }
   }
 
@@ -753,9 +733,8 @@ class ConvHandler {
       double communication_cost =
           cluster_env_.AllReduceCost(memory_cost, mesh_dims[0]);
 
-      AppendNewStrategy(ins_, name, output_spec, {lhs_spec, rhs_spec}, 0,
-                        communication_cost, cluster_env_, strategy_map_,
-                        strategies_);
+      AppendNewStrategy(name, output_spec, {lhs_spec, rhs_spec}, 0,
+                        communication_cost);
     }
   }
 
@@ -775,8 +754,7 @@ class ConvHandler {
             Tile(lhs_->shape(), {lhs_batch_dim_}, {mesh_dim}, device_mesh_1d_);
         HloSharding rhs_spec = HloSharding::Replicate();
 
-        AppendNewStrategy(ins_, name, output_spec, {lhs_spec, rhs_spec}, 0, 0,
-                          cluster_env_, strategy_map_, strategies_);
+        AppendNewStrategy(name, output_spec, {lhs_spec, rhs_spec}, 0, 0);
       }
 
       // R = Sk x Sk @ (allreduce @ 0)
@@ -793,9 +771,8 @@ class ConvHandler {
         double communication_cost = cluster_env_.AllReduceCost(memory_cost, 0) +
                                     cluster_env_.AllReduceCost(memory_cost, 1);
 
-        AppendNewStrategy(ins_, name, output_spec, {lhs_spec, rhs_spec}, 0,
-                          communication_cost, cluster_env_, strategy_map_,
-                          strategies_);
+        AppendNewStrategy(name, output_spec, {lhs_spec, rhs_spec}, 0,
+                          communication_cost);
       }
     }
   }
@@ -816,12 +793,7 @@ class ConvHandler {
     HloSharding rhs_spec = Tile(rhs_->shape(), {rhs_out_channel_dim_},
                                 {mesh_dims[1]}, device_mesh_);
 
-    AppendNewStrategy(ins_, name, output_spec, {lhs_spec, rhs_spec}, 0, 0,
-                      cluster_env_, strategy_map_, strategies_);
-  }
-
-  void Split(std::function<void(MeshDims)> split_func) const {
-    SplitWithMeshShape(split_func, device_mesh_.dimensions());
+    AppendNewStrategy(name, output_spec, {lhs_spec, rhs_spec}, 0, 0);
   }
 
   Status RegisterStrategies() {
@@ -873,18 +845,6 @@ class ConvHandler {
     return OkStatus();
   }
 
-  std::unique_ptr<StrategyVector>& strategies_;
-  StrategyMap& strategy_map_;
-  const HloInstruction* ins_;
-  const ClusterEnvironment& cluster_env_;
-  const InstructionBatchDimMap& batch_map_;
-  const AutoShardingSolverOption& solver_option_;
-
-  const Array<int64_t>& device_mesh_;
-  const Array<int64_t>& device_mesh_1d_;
-  const HloInstruction* lhs_;
-  const HloInstruction* rhs_;
-
   // Dimension information
   const ConvolutionDimensionNumbers& conv_dnums_;
   int64_t lhs_batch_dim_, lhs_in_channel_dim_;

From 2b26764c0d13baca0c2bd938ebe7c9f7f0c5ea71 Mon Sep 17 00:00:00 2001
From: Kanglan Tang <kanglan@google.com>
Date: Mon, 2 Oct 2023 09:54:51 -0700
Subject: [PATCH 470/567] Tag TF-TPU wheel with manylinux_2_27_x86_64

PiperOrigin-RevId: 570096613
---
 .../devel.usertools/rename_and_verify_wheels.sh           | 6 +++++-
 .../devel.usertools/wheel_verification.bats               | 8 ++++++--
 2 files changed, 11 insertions(+), 3 deletions(-)

diff --git a/tensorflow/tools/tf_sig_build_dockerfiles/devel.usertools/rename_and_verify_wheels.sh b/tensorflow/tools/tf_sig_build_dockerfiles/devel.usertools/rename_and_verify_wheels.sh
index 1ba11e07f53a10..dd7c1524ba9bec 100755
--- a/tensorflow/tools/tf_sig_build_dockerfiles/devel.usertools/rename_and_verify_wheels.sh
+++ b/tensorflow/tools/tf_sig_build_dockerfiles/devel.usertools/rename_and_verify_wheels.sh
@@ -21,7 +21,11 @@ set -euxo pipefail
 
 for wheel in /tf/pkg/*.whl; do
   echo "Checking and renaming $wheel..."
-  time python3 -m auditwheel repair --plat manylinux2014_x86_64 "$wheel" --wheel-dir /tf/pkg 2>&1 | tee check.txt
+  if [[ "$wheel" =~ .*_tpu.* ]]; then
+    time python3 -m auditwheel repair --plat manylinux_2_27_x86_64 "$wheel" --wheel-dir /tf/pkg 2>&1 | tee check.txt
+  else
+    time python3 -m auditwheel repair --plat manylinux2014_x86_64 "$wheel" --wheel-dir /tf/pkg 2>&1 | tee check.txt
+  fi
 
   # We don't need the original wheel if it was renamed
   new_wheel=$(grep --extended-regexp --only-matching '/tf/pkg/\S+.whl' check.txt)
diff --git a/tensorflow/tools/tf_sig_build_dockerfiles/devel.usertools/wheel_verification.bats b/tensorflow/tools/tf_sig_build_dockerfiles/devel.usertools/wheel_verification.bats
index 81c7166c73feea..60cde495b6a8b9 100644
--- a/tensorflow/tools/tf_sig_build_dockerfiles/devel.usertools/wheel_verification.bats
+++ b/tensorflow/tools/tf_sig_build_dockerfiles/devel.usertools/wheel_verification.bats
@@ -26,9 +26,13 @@ teardown_file() {
     rm -rf /tf/venv
 }
 
-@test "Wheel is manylinux2014 (manylinux_2_17) compliant" {
+@test "Wheel is manylinux2014 (manylinux_2_17) compliant (TPU wheel is manylinux_2_27 compliant)" {
     python3 -m auditwheel show "$TF_WHEEL" > audit.txt
-    grep --quiet 'This constrains the platform tag to "manylinux_2_17_x86_64"' audit.txt
+    if [[ "$TF_WHEEL" =~ .*_tpu.* ]]; then
+        grep --quiet 'This constrains the platform tag to "manylinux_2_27_x86_64"' audit.txt
+    else
+        grep --quiet 'This constrains the platform tag to "manylinux_2_17_x86_64"' audit.txt
+    fi
 }
 
 @test "Wheel conforms to upstream size limitations" {

From 1034d8438b0695d4452e92bbe25e9018c4ee46cf Mon Sep 17 00:00:00 2001
From: Marissa Ikonomidis <marissaw@google.com>
Date: Mon, 2 Oct 2023 10:04:54 -0700
Subject: [PATCH 471/567] Deprecate batches_ and replace with
 high_priority_batches_

In a future cl, low_priority_batches_ will be used.
Batches will default to high priority unless otherwise specified.

PiperOrigin-RevId: 570099579
---
 .../batching_util/shared_batch_scheduler.h    | 40 +++++++++----------
 1 file changed, 20 insertions(+), 20 deletions(-)

diff --git a/tensorflow/core/kernels/batching_util/shared_batch_scheduler.h b/tensorflow/core/kernels/batching_util/shared_batch_scheduler.h
index bcaedddc192c3b..d3805d2a7f3862 100644
--- a/tensorflow/core/kernels/batching_util/shared_batch_scheduler.h
+++ b/tensorflow/core/kernels/batching_util/shared_batch_scheduler.h
@@ -22,20 +22,17 @@ limitations under the License.
 #include <functional>
 #include <list>
 #include <memory>
-#include <string>
 #include <utility>
 #include <vector>
 
 #include "absl/time/clock.h"
 #include "absl/types/variant.h"
-#include "absl/utility/utility.h"
 #include "tensorflow/core/kernels/batching_util/batch_input_task.h"
 #include "tensorflow/core/kernels/batching_util/batch_scheduler.h"
 #include "tensorflow/core/kernels/batching_util/periodic_function.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/strings/strcat.h"
-#include "tensorflow/core/platform/byte_order.h"
 #include "tensorflow/core/platform/cpu_info.h"
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/errors.h"
@@ -207,11 +204,12 @@ class SharedBatchScheduler
     //
     // TODO(b/194294263):
     // Make `enable_lazy_split` a template parameter of queue, and adapts
-    // `batches_` and `task_handle_batches_` into one deque of
+    // `high_priority_batches_` and `task_handle_batches_` into one deque of
     // tensorflow::serving::Batch.
     bool enable_lazy_split = false;
 
-    // The maximum size of each enqueued batch (i.e., in `batches_`).
+    // The maximum size of each enqueued batch (i.e., in
+    // `high_priority_batches_`).
     //
     // The scheduler may form batches of any size between 1 and this number
     // (inclusive). If there is a need to quantize the batch sizes, i.e. only
@@ -420,12 +418,12 @@ class Queue {
       std::vector<std::unique_ptr<TaskType>>* output_tasks)
       TF_EXCLUSIVE_LOCKS_REQUIRED(mu_);
 
-  // Determines whether the open batch residing at the back of 'batches_' is
-  // currently schedulable.
+  // Determines whether the open batch residing at the back of
+  // 'high_priority_batches_' is currently schedulable.
   bool IsOpenBatchSchedulable() const TF_EXCLUSIVE_LOCKS_REQUIRED(mu_);
 
   // A variant of `IsOpenBatchSchedulable`; used when batches are formed at
-  // task enqueue time, and open batch is `batches_.back()`.
+  // task enqueue time, and open batch is `high_priority_batches_.back()`.
   bool IsOpenBatchSchedulableAfterEagerSplit() const
       TF_EXCLUSIVE_LOCKS_REQUIRED(mu_);
 
@@ -478,13 +476,6 @@ class Queue {
   // for the duration of this object's life.
   std::atomic<bool> closed_ TF_GUARDED_BY(mu_){false};
 
-  // The enqueued batches.
-  // Each element corresponds to a task to be dequeued and processed by
-  // `Queue<TaskType>::ProcessBatch`.
-  //
-  // Used iff `QueueOptions.enable_lazy_split` is false.
-  std::deque<std::unique_ptr<Batch<TaskType>>> batches_ TF_GUARDED_BY(mu_);
-
   // The enqueued batches.
   //
   // Each element corresponds to the `task` enqueued in `Queue::Schedule`; the
@@ -494,11 +485,19 @@ class Queue {
   std::deque<std::unique_ptr<Batch<BatchInputTaskHandle<TaskType>>>>
       task_handle_batches_ TF_GUARDED_BY(mu_);
 
-  // The enqueued batches for low priority input
+  // The enqueued batches for low priority input.
+  // Each element corresponds to a task to be dequeued and processed by
+  // `Queue<TaskType>::ProcessBatch`.
+  //
+  // Used iff `QueueOptions.enable_lazy_split` is false.
   std::deque<std::unique_ptr<Batch<TaskType>>> low_priority_batches_
       TF_GUARDED_BY(mu_);
 
-  // The enqueued batches for high priority input
+  // The enqueued batches for high priority input.
+  // Each element corresponds to a task to be dequeued and processed by
+  // `Queue<TaskType>::ProcessBatch`.
+  //
+  // Used iff `QueueOptions.enable_lazy_split` is false.
   std::deque<std::unique_ptr<Batch<TaskType>>> high_priority_batches_
       TF_GUARDED_BY(mu_);
 
@@ -506,7 +505,8 @@ class Queue {
   uint64 traceme_context_id_counter_ TF_GUARDED_BY(mu_) = 0;
 
   // The time at which the first task was added to the open (back-most) batch
-  // in 'batches_'. Valid iff that batch contains at least one task.
+  // in 'high_priority_batches_'. Valid iff that batch contains at least one
+  // task.
   uint64 open_batch_start_time_micros_ TF_GUARDED_BY(mu_);
 
   // Whether this queue contains a batch that is eligible to be scheduled.
@@ -1244,13 +1244,13 @@ int64 Queue<TaskType>::num_enqueued_batches() const {
 
 template <typename TaskType>
 std::deque<std::unique_ptr<Batch<TaskType>>>& Queue<TaskType>::GetBatches() {
-  return batches_;
+  return high_priority_batches_;
 }
 
 template <typename TaskType>
 const std::deque<std::unique_ptr<Batch<TaskType>>>&
 Queue<TaskType>::GetBatches() const {
-  return batches_;
+  return high_priority_batches_;
 }
 
 template <typename TaskType>

From 6bd9b9bb83a61a74cb8d356390d83dd83bd1572c Mon Sep 17 00:00:00 2001
From: Samuel Agyakwa <sagyakwa@google.com>
Date: Mon, 2 Oct 2023 10:07:56 -0700
Subject: [PATCH 472/567] [XLA: Python] Update PJRT plugin configuration
 mapping with bool type

PiperOrigin-RevId: 570100527
---
 third_party/xla/xla/python/xla_client.py  | 2 +-
 third_party/xla/xla/python/xla_client.pyi | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/third_party/xla/xla/python/xla_client.py b/third_party/xla/xla/python/xla_client.py
index 4885579643cb51..c8eafe9202ea55 100644
--- a/third_party/xla/xla/python/xla_client.py
+++ b/third_party/xla/xla/python/xla_client.py
@@ -57,7 +57,7 @@
 
 logger = logging.getLogger(__name__)
 
-_NameValueMapping = Mapping[str, Union[str, int, List[int], float]]
+_NameValueMapping = Mapping[str, Union[str, int, List[int], float, bool]]
 
 
 def make_cpu_client() -> ...:
diff --git a/third_party/xla/xla/python/xla_client.pyi b/third_party/xla/xla/python/xla_client.pyi
index 7a01e4817a5364..47786cc8026d3e 100644
--- a/third_party/xla/xla/python/xla_client.pyi
+++ b/third_party/xla/xla/python/xla_client.pyi
@@ -62,7 +62,7 @@ float8_e5m2: Type[numpy.generic]
 float8_e5m2fnuz: Type[numpy.generic]
 XLA_ELEMENT_TYPE_TO_DTYPE: Dict[PrimitiveType, numpy.dtype]
 
-_NameValueMapping = Mapping[str, Union[str, int, List[int], float]]
+_NameValueMapping = Mapping[str, Union[str, int, List[int], float, bool]]
 
 
 def dtype_to_etype(dtype: numpy.dtype) -> PrimitiveType:

From b68b3b2c3824e440e3c27463ac170d3627496f0d Mon Sep 17 00:00:00 2001
From: Larry Lansing <llansing@google.com>
Date: Mon, 2 Oct 2023 10:22:17 -0700
Subject: [PATCH 473/567] Make enum_value packed in PackedTestValue. Enums are
 packable just like other scalar types.

PiperOrigin-RevId: 570104566
---
 tensorflow/python/kernel_tests/proto/test_example.proto | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/python/kernel_tests/proto/test_example.proto b/tensorflow/python/kernel_tests/proto/test_example.proto
index bec0fba28baee5..f7e0d435a8a92a 100644
--- a/tensorflow/python/kernel_tests/proto/test_example.proto
+++ b/tensorflow/python/kernel_tests/proto/test_example.proto
@@ -100,7 +100,7 @@ message PackedTestValue {
   repeated sint32 sint32_value = 17 [packed = true];
   repeated sint64 sint64_value = 18 [packed = true];
   repeated PrimitiveValue message_value = 19;
-  repeated Color enum_value = 35;
+  repeated Color enum_value = 35 [packed = true];
 
   optional double double_value_with_default = 20 [default = 1.0];
   optional float float_value_with_default = 21 [default = 2.0];

From a008c44880352097996205b3cc7236c9b5e639e4 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 2 Oct 2023 10:28:47 -0700
Subject: [PATCH 474/567] Minor simplification to CheckDims (where the tensor
 dim indexing is performed by the caller).

PiperOrigin-RevId: 570106474
---
 .../auto_sharding_dot_handler.cc              | 69 +++++++++----------
 1 file changed, 33 insertions(+), 36 deletions(-)

diff --git a/third_party/xla/xla/hlo/experimental/auto_sharding/auto_sharding_dot_handler.cc b/third_party/xla/xla/hlo/experimental/auto_sharding/auto_sharding_dot_handler.cc
index 4fe6753f449f21..90e545892aef94 100644
--- a/third_party/xla/xla/hlo/experimental/auto_sharding/auto_sharding_dot_handler.cc
+++ b/third_party/xla/xla/hlo/experimental/auto_sharding/auto_sharding_dot_handler.cc
@@ -40,7 +40,7 @@ limitations under the License.
 namespace xla {
 namespace spmd {
 
-using DimMap = StableHashMap</*instr. dim idx*/ int, /* mesh dim idx*/ int>;
+using DimMap = StableHashMap</*tensor dim*/ int, /* mesh dim*/ int>;
 using MeshDims = absl::Span<const int64_t>;
 
 // Contains base functionality common to both DotHandler and ConvHandler.
@@ -97,16 +97,13 @@ class HandlerBase {
     }
   }
 
-  bool CheckDims(const HloInstruction* ins,
-                 const tsl::protobuf::RepeatedField<int64_t>& instr_dims,
-                 const DimMap& dim_map) const {
-    for (const auto& [instr_dim_idx, mesh_dim_idx] : dim_map) {
-      auto instr_dim = instr_dims.at(instr_dim_idx);
-      auto shape_dim = ins->shape().dimensions().at(instr_dim);
-      auto mesh_dim = device_mesh_.dim(mesh_dim_idx);
-      if (shape_dim < mesh_dim) return false;
+  bool CheckDims(const HloInstruction* ins, const DimMap& dim_map) const {
+    for (const auto& [tensor_dim, mesh_dim] : dim_map) {
+      auto shape_dim = ins->shape().dimensions().at(tensor_dim);
+      auto device_mesh_dim = device_mesh_.dim(mesh_dim);
+      if (shape_dim < device_mesh_dim) return false;
       if (solver_option_.only_allow_divisible_intermediate &&
-          !IsDivisible(shape_dim, mesh_dim))
+          !IsDivisible(shape_dim, device_mesh_dim))
         return false;
     }
     return true;
@@ -152,8 +149,8 @@ class DotHandler : public HandlerBase {
     DCHECK_EQ(mesh_dims.size(), 2);
     for (int64_t i = 0; i < lhs_space_dims_.size(); ++i) {
       for (int64_t j = 0; j < rhs_space_dims_.size(); ++j) {
-        if (!CheckDims(lhs_, lhs_space_dims_, {{i, mesh_dims[0]}}) ||
-            !CheckDims(rhs_, rhs_space_dims_, {{j, mesh_dims[1]}}))
+        if (!CheckDims(lhs_, {{lhs_space_dims_[i], mesh_dims[0]}}) ||
+            !CheckDims(rhs_, {{rhs_space_dims_[j], mesh_dims[1]}}))
           continue;
         std::string name = absl::StrFormat("SS = SR x RS @ {%s}",
                                            absl::StrJoin(mesh_dims, ","));
@@ -177,8 +174,8 @@ class DotHandler : public HandlerBase {
     DCHECK_EQ(mesh_dims.size(), 2);
     for (int64_t i = 0; i < lhs_space_dims_.size(); ++i) {
       for (int64_t j = i + 1; j < lhs_space_dims_.size(); ++j) {
-        if (!CheckDims(lhs_, lhs_space_dims_,
-                       {{i, mesh_dims[0]}, {j, mesh_dims[1]}}))
+        if (!CheckDims(lhs_, {{lhs_space_dims_[i], mesh_dims[0]},
+                              {lhs_space_dims_[j], mesh_dims[1]}}))
           continue;
         std::string name = absl::StrFormat("SSR = SSR x RR @ {%s}",
                                            absl::StrJoin(mesh_dims, ","));
@@ -199,8 +196,8 @@ class DotHandler : public HandlerBase {
     DCHECK_EQ(mesh_dims.size(), 2);
     for (int64_t i = 0; i < rhs_space_dims_.size(); ++i) {
       for (int64_t j = i + 1; j < rhs_space_dims_.size(); ++j) {
-        if (!CheckDims(rhs_, rhs_space_dims_,
-                       {{i, mesh_dims[0]}, {j, mesh_dims[1]}}))
+        if (!CheckDims(rhs_, {{rhs_space_dims_[i], mesh_dims[0]},
+                              {rhs_space_dims_[j], mesh_dims[1]}}))
           continue;
         std::string name = absl::StrFormat("RSS = RR x RSS @ {%s}",
                                            absl::StrJoin(mesh_dims, ","));
@@ -229,8 +226,8 @@ class DotHandler : public HandlerBase {
                           absl::StrJoin(mesh_dims, ","), mesh_dims[1]);
       for (int64_t i = 0; i < lhs_space_dims_.size(); ++i) {
         for (int64_t j = 0; j < lhs_con_dims_.size(); ++j) {
-          if (!CheckDims(lhs_, lhs_space_dims_, {{i, mesh_dims[0]}}) ||
-              !CheckDims(lhs_, lhs_con_dims_, {{j, mesh_dims[1]}}))
+          if (!CheckDims(lhs_, {{lhs_space_dims_[i], mesh_dims[0]},
+                                {lhs_con_dims_[j], mesh_dims[1]}}))
             continue;
 
           HloSharding output_spec = Tile(ins_->shape(), {space_base_dim_ + i},
@@ -259,8 +256,8 @@ class DotHandler : public HandlerBase {
                           absl::StrJoin(mesh_dims, ","), mesh_dims[0]);
       for (int64_t i = 0; i < rhs_space_dims_.size(); ++i) {
         for (int64_t j = 0; j < lhs_con_dims_.size(); ++j) {
-          if (!CheckDims(rhs_, rhs_space_dims_, {{i, mesh_dims[1]}}) ||
-              !CheckDims(lhs_, lhs_con_dims_, {{j, mesh_dims[0]}}))
+          if (!CheckDims(rhs_, {{rhs_space_dims_[i], mesh_dims[1]}}) ||
+              !CheckDims(lhs_, {{lhs_con_dims_[j], mesh_dims[0]}}))
             continue;
           HloSharding output_spec =
               Tile(ins_->shape(),
@@ -288,7 +285,7 @@ class DotHandler : public HandlerBase {
                          [](int64_t size) { return size > 1; }) == 1) {
       for (int64_t i = 0; i < lhs_batch_dims_.size(); ++i) {
         for (int64_t j = 0; j < device_mesh_.num_dimensions(); ++j) {
-          if (!CheckDims(lhs_, lhs_batch_dims_, {{i, j}})) continue;
+          if (!CheckDims(lhs_, {{lhs_batch_dims_[i], j}})) continue;
           std::string name = absl::StrFormat("Sb_%d = Sb x Sb @ {%d}", i, j);
           HloSharding output_spec = Tile(ins_->shape(), {i}, {j}, device_mesh_);
           HloSharding lhs_spec =
@@ -306,8 +303,8 @@ class DotHandler : public HandlerBase {
     DCHECK_EQ(mesh_dims.size(), 2);
     if (lhs_batch_dims_.size() == 2 && device_mesh_.dim(mesh_dims[0]) > 1 &&
         device_mesh_.dim(mesh_dims[1]) > 1) {
-      if (!CheckDims(lhs_, lhs_batch_dims_,
-                     {{0, mesh_dims[0]}, {1, mesh_dims[1]}}))
+      if (!CheckDims(lhs_, {{lhs_batch_dims_[0], mesh_dims[0]},
+                            {lhs_batch_dims_[1], mesh_dims[1]}}))
         return;
       std::string name =
           absl::StrFormat("Sb = Sb x Sb @ {%s}", absl::StrJoin(mesh_dims, ","));
@@ -331,8 +328,8 @@ class DotHandler : public HandlerBase {
                                          absl::StrJoin(mesh_dims, ","));
       for (int64_t i = 0; i < lhs_space_dims_.size(); ++i) {
         for (int64_t j = 0; j < lhs_batch_dims_.size(); ++j) {
-          if (!CheckDims(lhs_, lhs_space_dims_, {{i, mesh_dims[0]}}) ||
-              !CheckDims(lhs_, lhs_batch_dims_, {{j, mesh_dims[1]}}))
+          if (!CheckDims(lhs_, {{lhs_space_dims_[i], mesh_dims[0]},
+                                {lhs_batch_dims_[j], mesh_dims[1]}}))
             continue;
           HloSharding output_spec = Tile(
               ins_->shape(), {j, space_base_dim_ + i}, mesh_dims, device_mesh_);
@@ -356,8 +353,8 @@ class DotHandler : public HandlerBase {
                                          absl::StrJoin(mesh_dims, ","));
       for (int64_t i = 0; i < rhs_space_dims_.size(); ++i) {
         for (int64_t j = 0; j < lhs_batch_dims_.size(); ++j) {
-          if (!CheckDims(rhs_, rhs_space_dims_, {{i, mesh_dims[1]}}) ||
-              !CheckDims(lhs_, lhs_batch_dims_, {{j, mesh_dims[0]}}))
+          if (!CheckDims(rhs_, {{rhs_space_dims_[i], mesh_dims[1]}}) ||
+              !CheckDims(lhs_, {{lhs_batch_dims_[j], mesh_dims[0]}}))
             continue;
           HloSharding output_spec =
               Tile(ins_->shape(),
@@ -385,8 +382,8 @@ class DotHandler : public HandlerBase {
                           absl::StrJoin(mesh_dims, ","), mesh_dims[1]);
       for (int64_t i = 0; i < lhs_con_dims_.size(); ++i) {
         for (int64_t j = 0; j < lhs_batch_dims_.size(); ++j) {
-          if (!CheckDims(lhs_, lhs_con_dims_, {{i, mesh_dims[1]}}) ||
-              !CheckDims(lhs_, lhs_batch_dims_, {{j, mesh_dims[0]}}))
+          if (!CheckDims(lhs_, {{lhs_con_dims_[i], mesh_dims[1]},
+                                {lhs_batch_dims_[j], mesh_dims[0]}}))
             continue;
           HloSharding output_spec =
               Tile(ins_->shape(), {j}, {mesh_dims[0]}, device_mesh_);
@@ -418,10 +415,10 @@ class DotHandler : public HandlerBase {
           absl::StrJoin(mesh_dims, ","), absl::StrJoin(mesh_dims, ", "));
       for (int64_t i = 0; i < lhs_con_dims_.size(); ++i) {
         for (int64_t j = i + 1; j < lhs_con_dims_.size(); ++j) {
-          if (!CheckDims(lhs_, lhs_con_dims_,
-                         {{i, mesh_dims[0]}, {j, mesh_dims[1]}}) ||
-              !CheckDims(rhs_, rhs_con_dims_,
-                         {{i, mesh_dims[0]}, {j, mesh_dims[1]}}))
+          if (!CheckDims(lhs_, {{lhs_con_dims_[i], mesh_dims[0]},
+                                {lhs_con_dims_[j], mesh_dims[1]}}) ||
+              !CheckDims(rhs_, {{rhs_con_dims_[i], mesh_dims[0]},
+                                {rhs_con_dims_[j], mesh_dims[1]}}))
             continue;
           HloSharding output_spec = HloSharding::Replicate();
           HloSharding lhs_spec =
@@ -447,7 +444,7 @@ class DotHandler : public HandlerBase {
       std::string name = absl::StrFormat("RR = RS x SR @ {%d} (allreduce @ %d)",
                                          mesh_dims[0], mesh_dims[0]);
       for (int64_t i = 0; i < lhs_con_dims_.size(); ++i) {
-        if (!CheckDims(lhs_, lhs_con_dims_, {{i, mesh_dims[0]}})) continue;
+        if (!CheckDims(lhs_, {{lhs_con_dims_[i], mesh_dims[0]}})) continue;
         HloSharding output_spec = HloSharding::Replicate();
         HloSharding lhs_spec = Tile(lhs_->shape(), {lhs_con_dims_[i]},
                                     {mesh_dims[0]}, device_mesh_);
@@ -524,8 +521,8 @@ class DotHandler : public HandlerBase {
                          [](int64_t size) { return size > 1; }) > 1) {
       int mesh_dim = 0;
       for (int64_t i = 0; i < lhs_batch_dims_.size(); ++i) {
-          if (!CheckDims(lhs_, lhs_batch_dims_, {{i, mesh_dim}}) ||
-              !CheckDims(rhs_, rhs_batch_dims_, {{i, mesh_dim}}))
+          if (!CheckDims(lhs_, {{lhs_batch_dims_[i], mesh_dim}}) ||
+              !CheckDims(rhs_, {{rhs_batch_dims_[i], mesh_dim}}))
           continue;
           std::string name =
               absl::StrFormat("Sb_%d = Sb x Sb @ {%d} 1d", i, mesh_dim);

From f66187934aa12f37393ae497f0ec01a47a2c5273 Mon Sep 17 00:00:00 2001
From: Haibo Huang <hhb@google.com>
Date: Mon, 2 Oct 2023 11:42:41 -0700
Subject: [PATCH 475/567] Include link.h for struct link_map

PiperOrigin-RevId: 570129061
---
 tensorflow/core/tpu/tpu_api_dlsym_initializer.cc | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tensorflow/core/tpu/tpu_api_dlsym_initializer.cc b/tensorflow/core/tpu/tpu_api_dlsym_initializer.cc
index 020a01a5157ec4..3cf658e28e474d 100644
--- a/tensorflow/core/tpu/tpu_api_dlsym_initializer.cc
+++ b/tensorflow/core/tpu/tpu_api_dlsym_initializer.cc
@@ -16,6 +16,7 @@ limitations under the License.
 #include <dirent.h>
 #include <dlfcn.h>
 #include <fcntl.h>
+#include <link.h>
 #include <sys/mman.h>
 #include <sys/stat.h>
 #include <sys/types.h>

From fa5c608e56cdb0ae3ec38bc872fa4725a26290aa Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 2 Oct 2023 11:44:46 -0700
Subject: [PATCH 476/567] Metric XLA binary size loaded onto the GPU device.

PiperOrigin-RevId: 570129604
---
 third_party/xla/xla/service/gpu/BUILD           |  1 +
 third_party/xla/xla/service/gpu/gpu_compiler.cc |  1 +
 third_party/xla/xla/service/gpu/metrics.cc      | 11 +++++++++++
 third_party/xla/xla/service/gpu/metrics.h       |  3 +++
 4 files changed, 16 insertions(+)

diff --git a/third_party/xla/xla/service/gpu/BUILD b/third_party/xla/xla/service/gpu/BUILD
index 9180f83333d636..a80f5514cb23af 100644
--- a/third_party/xla/xla/service/gpu/BUILD
+++ b/third_party/xla/xla/service/gpu/BUILD
@@ -4195,6 +4195,7 @@ cc_library(
     visibility = ["//visibility:public"],
     deps = [
         "@local_tsl//tsl/lib/monitoring:counter",
+        "@local_tsl//tsl/lib/monitoring:gauge",
         "@local_tsl//tsl/lib/monitoring:sampler",
     ],
 )
diff --git a/third_party/xla/xla/service/gpu/gpu_compiler.cc b/third_party/xla/xla/service/gpu/gpu_compiler.cc
index 8214b862c7a470..ef883c47dc9706 100644
--- a/third_party/xla/xla/service/gpu/gpu_compiler.cc
+++ b/third_party/xla/xla/service/gpu/gpu_compiler.cc
@@ -1521,6 +1521,7 @@ StatusOr<std::unique_ptr<Executable>> GpuCompiler::RunBackend(
       CompileToTargetBinary(
           module->config(), std::move(compile_module_results.llvm_module),
           GetGpuVersion(stream_exec), stream_exec, options, module.get()));
+  RecordXlaDeviceBinarySize(binary.size());
 
   if (DumpingEnabledForHloModule(*module) &&
       std::holds_alternative<GpuExecutable::OwnedThunkSequence>(
diff --git a/third_party/xla/xla/service/gpu/metrics.cc b/third_party/xla/xla/service/gpu/metrics.cc
index 770ae48ebdb4a0..44fff398f23acd 100644
--- a/third_party/xla/xla/service/gpu/metrics.cc
+++ b/third_party/xla/xla/service/gpu/metrics.cc
@@ -15,7 +15,10 @@ limitations under the License.
 
 #include "xla/service/gpu/metrics.h"
 
+#include <cstdint>
+
 #include "tsl/lib/monitoring/counter.h"
+#include "tsl/lib/monitoring/gauge.h"
 #include "tsl/lib/monitoring/sampler.h"
 
 namespace xla {
@@ -33,6 +36,10 @@ auto* compile_time_usecs_histogram = tsl::monitoring::Sampler<1>::New(
 auto* compiled_programs_count = tsl::monitoring::Counter<0>::New(
     "/xla/service/gpu/compiled_programs_count", "Number of compiled programs.");
 
+auto* xla_device_binary_size = tsl::monitoring::Gauge<int64_t, 0>::New(
+    "/xla/service/gpu/xla_device_binary_size",
+    "The size of the XLA binary loaded onto the GPU device.");
+
 }  // namespace
 
 void RecordHloPassesDuration(const uint64_t time_usecs) {
@@ -78,4 +85,8 @@ int64_t GetCompiledProgramsCount() {
   return compiled_programs_count->GetCell()->value();
 }
 
+void RecordXlaDeviceBinarySize(const int64_t size) {
+  xla_device_binary_size->GetCell()->Set(size);
+}
+
 }  // namespace xla
diff --git a/third_party/xla/xla/service/gpu/metrics.h b/third_party/xla/xla/service/gpu/metrics.h
index a18f2dcd302590..d660743838a26f 100644
--- a/third_party/xla/xla/service/gpu/metrics.h
+++ b/third_party/xla/xla/service/gpu/metrics.h
@@ -45,6 +45,9 @@ void IncrementCompiledProgramsCount();
 // Gets compiled programs numbers.
 int64_t GetCompiledProgramsCount();
 
+// Records the size of the XLA device binary in bytes.
+void RecordXlaDeviceBinarySize(int64_t size);
+
 }  // namespace xla
 
 #endif  // XLA_SERVICE_GPU_METRICS_H_

From aa3894147f6f3a8c304f658fd35179cefc8187a1 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 2 Oct 2023 11:54:57 -0700
Subject: [PATCH 477/567] Allow quantized tensor as the input for mhlo.reshape.
 This also includes changing the folding logic to allow constant folding on
 quantized tensors.

There are basically two changes proposed here. The first change, in the `reshape` method, is a fix that prevents folders from crashing on quantized types, so that translations like MHLO -> (X) can run smoothly for quantized types. The second change, in `materializeConstant`, allows constant folding for quantized types.

We note that the fixes are not ideal and there is a [ticket](https://github.com/openxla/stablehlo/issues/1691) to explore better solutions for the same.

PiperOrigin-RevId: 570133154
---
 .../xla/xla/mlir_hlo/mhlo/IR/hlo_ops.cc       | 26 +++++++++++-
 .../xla/xla/mlir_hlo/mhlo/IR/hlo_ops.td       |  2 +-
 .../Dialect/mhlo/canonicalize/reshape.mlir    | 41 +++++++++++++++++++
 3 files changed, 67 insertions(+), 2 deletions(-)

diff --git a/third_party/xla/xla/mlir_hlo/mhlo/IR/hlo_ops.cc b/third_party/xla/xla/mlir_hlo/mhlo/IR/hlo_ops.cc
index 5f3d77d213fe0e..a8f8eacd7252f8 100644
--- a/third_party/xla/xla/mlir_hlo/mhlo/IR/hlo_ops.cc
+++ b/third_party/xla/xla/mlir_hlo/mhlo/IR/hlo_ops.cc
@@ -259,6 +259,16 @@ DenseElementsAttr reshape(DenseElementsAttr attr, ShapedType newType) {
     auto splatValue = attr.getValues<bool>()[0];
     return DenseElementsAttr::get(newType, {splatValue});
   }
+  // Bypass the element type check for quantized tensor. For quantized tensors,
+  // we only require storage type and shape match the attribute type and shape.
+  if (auto quantElemTy =
+          newType.getElementType().dyn_cast<quant::QuantizedType>()) {
+    // Only shape and storage type information is needed to reshape the
+    // attribute.
+    auto quantShapedType =
+        RankedTensorType::get(newType.getShape(), quantElemTy.getStorageType());
+    return attr.reshape(quantShapedType);
+  }
   return attr.reshape(newType);
 }
 
@@ -7143,7 +7153,21 @@ Operation* MhloDialect::materializeConstant(OpBuilder& builder, Attribute value,
   // HLO dialect constants only support ElementsAttr unlike standard dialect
   // constant which supports all attributes.
   if (!elementsAttr) return nullptr;
-  // HLO dialect constants require the type of value and result to match.
+  auto resultShapedType = type.dyn_cast<ShapedType>();
+  auto attrShapedType = elementsAttr.getType().dyn_cast<ShapedType>();
+  if (resultShapedType && attrShapedType) {
+    if (auto quantElemTy = resultShapedType.getElementType()
+                               .dyn_cast<quant::QuantizedType>()) {
+      // Attribute type and shape should match storage type and shape for
+      // quantized tensors.
+      if ((attrShapedType.getElementType() != quantElemTy.getStorageType()) ||
+          (attrShapedType.getShape() != resultShapedType.getShape()))
+        return nullptr;
+    }
+    return builder.create<mhlo::ConstantOp>(loc, type, elementsAttr);
+  }
+  // HLO dialect constants require the type of value and result to match for
+  // non-quantized tensors.
   if (type != elementsAttr.getType()) return nullptr;
 
   return builder.create<mhlo::ConstantOp>(loc, type, elementsAttr);
diff --git a/third_party/xla/xla/mlir_hlo/mhlo/IR/hlo_ops.td b/third_party/xla/xla/mlir_hlo/mhlo/IR/hlo_ops.td
index af5b778427c93b..6ef32579142967 100644
--- a/third_party/xla/xla/mlir_hlo/mhlo/IR/hlo_ops.td
+++ b/third_party/xla/xla/mlir_hlo/mhlo/IR/hlo_ops.td
@@ -2718,7 +2718,7 @@ def MHLO_MapOp: MHLO_ShapedInterfaceOp<"map",
 }
 
 def MHLO_ReshapeOp: MHLO_Op<"reshape",
-      [Pure, SameOperandsAndResultElementType]> {
+      [Pure, HLO_CompatibleOperandsAndResultElementType]> {
   let summary = "Reshape operation";
   let description = [{
     Performs reshape of `operand` tensor to a `result` tensor.
diff --git a/third_party/xla/xla/mlir_hlo/tests/Dialect/mhlo/canonicalize/reshape.mlir b/third_party/xla/xla/mlir_hlo/tests/Dialect/mhlo/canonicalize/reshape.mlir
index f15829fd2add0a..d1e11a4bfb7ec5 100644
--- a/third_party/xla/xla/mlir_hlo/tests/Dialect/mhlo/canonicalize/reshape.mlir
+++ b/third_party/xla/xla/mlir_hlo/tests/Dialect/mhlo/canonicalize/reshape.mlir
@@ -147,3 +147,44 @@ func.func @non_const_many_chained_reshapes(%arg : tensor<2x3x4xi32>) -> tensor<1
   // CHECK-NEXT: return [[RES]]
   func.return %4 : tensor<1x2x4x3xi32>
 }
+
+// -----
+
+// CHECK-LABEL: func @fold_per_tensor_quantized_tensor
+func.func @fold_per_tensor_quantized_tensor() -> tensor<4x2x!quant.uniform<i8:f32, 2.000000e+0:16>> {
+  %cst = mhlo.constant() {value = dense<[[[1, 2],[3, 4]],[[5, 6],[7, 8]]]> : tensor<2x2x2xi8>} : () -> tensor<2x2x2x!quant.uniform<i8:f32, 2.000000e+0:16>>
+  %0 = "mhlo.reshape"(%cst) : (tensor<2x2x2x!quant.uniform<i8:f32, 2.000000e+0:16>>) -> tensor<4x2x!quant.uniform<i8:f32, 2.000000e+0:16>>
+  // CHECK-NEXT: [[CST:%.+]] = mhlo.constant()
+  // CHECK-SAME: [1, 2], [3, 4], [5, 6], [7, 8]
+  // CHECK-SAME: () -> tensor<4x2x!quant.uniform<i8:f32, 2.000000e+00:16>>
+  // CHECK-NEXT: return [[CST]]
+  func.return %0 : tensor<4x2x!quant.uniform<i8:f32, 2.000000e+0:16>>
+}
+
+// -----
+
+// CHECK-LABEL: func @fold_per_axis_quantized_tensor
+func.func @fold_per_axis_quantized_tensor() -> tensor<4x2x!quant.uniform<i8:f32:1, {2.000000e+0:16,3.000000e+0:32}>> {
+  %cst = mhlo.constant() {value = dense<[[[1, 2],[3, 4]],[[5, 6],[7, 8]]]> : tensor<2x2x2xi8>} : () -> tensor<2x2x2x!quant.uniform<i8:f32:2, {2.000000e+0:16,3.000000e+0:32}>>
+  %0 = "mhlo.reshape"(%cst) : (tensor<2x2x2x!quant.uniform<i8:f32:2, {2.000000e+0:16,3.000000e+0:32}>>) -> tensor<4x2x!quant.uniform<i8:f32:1, {2.000000e+0:16,3.000000e+0:32}>>
+  // CHECK-NEXT: [[CST:%.+]] = mhlo.constant()
+  // CHECK-SAME: [1, 2], [3, 4], [5, 6], [7, 8]
+  // CHECK-SAME: () -> tensor<4x2x!quant.uniform<i8:f32:1, {2.000000e+00:16,3.000000e+00:32}>>
+  // CHECK-NEXT: return [[CST]]
+  func.return %0 : tensor<4x2x!quant.uniform<i8:f32:1, {2.000000e+0:16,3.000000e+0:32}>>
+}
+
+// -----
+
+// CHECK-LABEL: func @non_const_many_chained_reshapes_quantized
+// CHECK-SAME: [[ARG:%[a-zA-Z0-9]+]]
+func.func @non_const_many_chained_reshapes_quantized(%arg : tensor<2x3x4x!quant.uniform<i8:f32, 2.000000e+0:16>>) -> tensor<1x2x4x3x!quant.uniform<i8:f32, 2.000000e+0:16>> {
+  %0 = "mhlo.reshape"(%arg) : (tensor<2x3x4x!quant.uniform<i8:f32, 2.000000e+0:16>>) -> tensor<4x3x2x!quant.uniform<i8:f32, 2.000000e+0:16>>
+  %1 = "mhlo.reshape"(%0) : (tensor<4x3x2x!quant.uniform<i8:f32, 2.000000e+0:16>>) -> tensor<12x2x!quant.uniform<i8:f32, 2.000000e+0:16>>
+  %2 = "mhlo.reshape"(%1) : (tensor<12x2x!quant.uniform<i8:f32, 2.000000e+0:16>>) -> tensor<2x12x!quant.uniform<i8:f32, 2.000000e+0:16>>
+  %3 = "mhlo.reshape"(%2) : (tensor<2x12x!quant.uniform<i8:f32, 2.000000e+0:16>>) -> tensor<24x!quant.uniform<i8:f32, 2.000000e+0:16>>
+  %4 = "mhlo.reshape"(%3) : (tensor<24x!quant.uniform<i8:f32, 2.000000e+0:16>>) -> tensor<1x2x4x3x!quant.uniform<i8:f32, 2.000000e+0:16>>
+  // CHECK-NEXT: [[RES:%.+]] = mhlo.reshape [[ARG]] : (tensor<2x3x4x!quant.uniform<i8:f32, 2.000000e+00:16>>) -> tensor<1x2x4x3x!quant.uniform<i8:f32, 2.000000e+00:16>>
+  // CHECK-NEXT: return [[RES]]
+  func.return %4 : tensor<1x2x4x3x!quant.uniform<i8:f32, 2.000000e+0:16>>
+}
\ No newline at end of file

From 8e452b87adea81158a7c76af7bfdee0ad7b038db Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 2 Oct 2023 12:11:53 -0700
Subject: [PATCH 478/567] Update TFRT dependency to use revision
 http://github.com/tensorflow/runtime/commit/fb9b279b454c71995f63d33a9c78d52c4bf3ca1c.

PiperOrigin-RevId: 570138454
---
 third_party/tf_runtime/workspace.bzl                          | 4 ++--
 third_party/xla/third_party/tf_runtime/workspace.bzl          | 4 ++--
 .../xla/third_party/tsl/third_party/tf_runtime/workspace.bzl  | 4 ++--
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/third_party/tf_runtime/workspace.bzl b/third_party/tf_runtime/workspace.bzl
index 3fc4047aa7903e..140d5adb7eb8e3 100644
--- a/third_party/tf_runtime/workspace.bzl
+++ b/third_party/tf_runtime/workspace.bzl
@@ -6,8 +6,8 @@ def repo():
     """Imports TFRT."""
 
     # Attention: tools parse and update these lines.
-    TFRT_COMMIT = "88e2ee6c6aaed74fc4201ad62e0b6dad0c40d56f"
-    TFRT_SHA256 = "c9e0cbaf6a39144f8bb1bc3a4789165d4af032ff5f2999724ad27c449de9b969"
+    TFRT_COMMIT = "fb9b279b454c71995f63d33a9c78d52c4bf3ca1c"
+    TFRT_SHA256 = "dcf6b24d18c59e149ae94b05b35e6e66403a28b3b0a0a11777e2ba85d16d3ff8"
 
     tf_http_archive(
         name = "tf_runtime",
diff --git a/third_party/xla/third_party/tf_runtime/workspace.bzl b/third_party/xla/third_party/tf_runtime/workspace.bzl
index 3fc4047aa7903e..140d5adb7eb8e3 100644
--- a/third_party/xla/third_party/tf_runtime/workspace.bzl
+++ b/third_party/xla/third_party/tf_runtime/workspace.bzl
@@ -6,8 +6,8 @@ def repo():
     """Imports TFRT."""
 
     # Attention: tools parse and update these lines.
-    TFRT_COMMIT = "88e2ee6c6aaed74fc4201ad62e0b6dad0c40d56f"
-    TFRT_SHA256 = "c9e0cbaf6a39144f8bb1bc3a4789165d4af032ff5f2999724ad27c449de9b969"
+    TFRT_COMMIT = "fb9b279b454c71995f63d33a9c78d52c4bf3ca1c"
+    TFRT_SHA256 = "dcf6b24d18c59e149ae94b05b35e6e66403a28b3b0a0a11777e2ba85d16d3ff8"
 
     tf_http_archive(
         name = "tf_runtime",
diff --git a/third_party/xla/third_party/tsl/third_party/tf_runtime/workspace.bzl b/third_party/xla/third_party/tsl/third_party/tf_runtime/workspace.bzl
index 3fc4047aa7903e..140d5adb7eb8e3 100644
--- a/third_party/xla/third_party/tsl/third_party/tf_runtime/workspace.bzl
+++ b/third_party/xla/third_party/tsl/third_party/tf_runtime/workspace.bzl
@@ -6,8 +6,8 @@ def repo():
     """Imports TFRT."""
 
     # Attention: tools parse and update these lines.
-    TFRT_COMMIT = "88e2ee6c6aaed74fc4201ad62e0b6dad0c40d56f"
-    TFRT_SHA256 = "c9e0cbaf6a39144f8bb1bc3a4789165d4af032ff5f2999724ad27c449de9b969"
+    TFRT_COMMIT = "fb9b279b454c71995f63d33a9c78d52c4bf3ca1c"
+    TFRT_SHA256 = "dcf6b24d18c59e149ae94b05b35e6e66403a28b3b0a0a11777e2ba85d16d3ff8"
 
     tf_http_archive(
         name = "tf_runtime",

From 66349e3a058164fe1eae97487c26e688a7fe3567 Mon Sep 17 00:00:00 2001
From: Michael Hudgins <michaelhudgins@google.com>
Date: Mon, 2 Oct 2023 19:18:00 +0000
Subject: [PATCH 479/567] py and cc do work now that i fixed labels

---
 .bazelrc | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/.bazelrc b/.bazelrc
index 3b4051a781b80d..66eb37866bc827 100644
--- a/.bazelrc
+++ b/.bazelrc
@@ -716,8 +716,7 @@ test:linux_cuda_pycpp_test --config=linux_cuda_pycpp_test_filters -- //tensorflo
 # ARM64 PYCPP
 test:linux_arm64_pycpp_test_filters --test_tag_filters=-no_oss,-no_aarch64,-oss_excluded,-oss_serial,-gpu,-tpu,-benchmark-test,-v1only
 test:linux_arm64_pycpp_test_filters --build_tag_filters=-no_oss,-no_aarch64,-oss_excluded,-oss_serial,-gpu,-tpu,-benchmark-test,-v1only
-# TODO(michaelhudgins): Why did prior ARM tests seperate out py and cc? 
-test:linux_arm64_pycpp_test_filters --test_lang_filters=cc --test_size_filters=small,medium --flaky_test_attempts=3
+test:linux_arm64_pycpp_test_filters --test_lang_filters=cc,py --test_size_filters=small,medium --flaky_test_attempts=3
 # TODO(michaelhudgins): Why do we need to specifically omit go and java here? 
 test:linux_arm64_pycpp_test --config=linux_arm64_pycpp_test_filters -- //tensorflow/... -//tensorflow/python/integration_testing/... -//tensorflow/compiler/tf2tensorrt/... -//tensorflow/compiler/xrt/... -//tensorflow/core/tpu/... -//tensorflow/lite/... -//tensorflow/tools/toolchains/... -//tensorflow/go/... -//tensorflow/java/... -//tensorflow/core/grappler/optimizers:auto_mixed_precision_test_cpu -//tensorflow/core/grappler/optimizers:remapper_test_cpu -//tensorflow/core/kernels/image:resize_bicubic_op_test
 # END TF TEST SUITE OPTIONS

From 67f519e727605f502c711cd340df46af89c4085e Mon Sep 17 00:00:00 2001
From: Michael Hudgins <michaelhudgins@google.com>
Date: Mon, 2 Oct 2023 12:16:03 -0700
Subject: [PATCH 480/567] Change ARM64 docker runtime base to be CUDA.  A
 separate non-cuda will be added later

PiperOrigin-RevId: 570139525
---
 ci/official/containers/linux_arm64/Dockerfile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ci/official/containers/linux_arm64/Dockerfile b/ci/official/containers/linux_arm64/Dockerfile
index f72a8b983e1b3d..617e67dbdaf9a1 100644
--- a/ci/official/containers/linux_arm64/Dockerfile
+++ b/ci/official/containers/linux_arm64/Dockerfile
@@ -23,7 +23,7 @@ COPY apt.conf /etc/apt/
 RUN /build_patchelf.sh
 
 ################################################################################
-FROM ubuntu:20.04 as devel
+FROM nvidia/cuda:12.2.0-devel-ubuntu20.04 as devel
 ################################################################################
 COPY --from=builder /dt10 /dt10
 COPY --from=builder /patchelf/patchelf_0.14.3-1_arm64.deb /patchelf/patchelf_0.14.3-1_arm64.deb

From ad1ca90c246e32cabbd3de92bee25220bc2a3a1d Mon Sep 17 00:00:00 2001
From: Clive Verghese <cliveverghese@google.com>
Date: Mon, 2 Oct 2023 12:16:26 -0700
Subject: [PATCH 481/567] Add host analysis to TPU timeline.

PiperOrigin-RevId: 570139622
---
 tensorflow/core/profiler/convert/BUILD        |   5 +
 .../convert/dcn_slack_analysis_combiner.cc    |   4 +
 .../convert/xplane_to_dcn_collective_stats.cc |   2 +-
 .../convert/xspace_to_dcn_slack_analysis.cc   | 181 ++++++++++++++++--
 .../convert/xspace_to_dcn_slack_analysis.h    |  55 +++++-
 .../protobuf/dcn_slack_analysis.proto         |   8 +-
 6 files changed, 228 insertions(+), 27 deletions(-)

diff --git a/tensorflow/core/profiler/convert/BUILD b/tensorflow/core/profiler/convert/BUILD
index dbb28725049d28..cf76fb7e2c01af 100644
--- a/tensorflow/core/profiler/convert/BUILD
+++ b/tensorflow/core/profiler/convert/BUILD
@@ -1012,16 +1012,21 @@ cc_library(
     hdrs = ["xspace_to_dcn_slack_analysis.h"],
     deps = [
         "//tensorflow/core/profiler/protobuf:dcn_slack_analysis_proto_cc",
+        "//tensorflow/core/profiler/protobuf:topology_proto_cc",
         "//tensorflow/core/profiler/utils:hlo_module_utils",
         "//tensorflow/core/profiler/utils:hlo_proto_map",
         "//tensorflow/core/profiler/utils:hlo_proto_to_module",
+        "//tensorflow/core/profiler/utils:xplane_utils",
         "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/log",
         "@com_google_absl//absl/strings",
+        "@local_tsl//tsl/platform:regexp",
         "@local_tsl//tsl/platform:statusor",
         "@local_tsl//tsl/profiler/protobuf:xplane_proto_cc",
         "@local_tsl//tsl/profiler/utils:math_utils",
         "@local_tsl//tsl/profiler/utils:tf_xplane_visitor",
+        "@local_tsl//tsl/profiler/utils:timespan",
+        "@local_tsl//tsl/profiler/utils:tpu_xplane_utils",
         "@local_tsl//tsl/profiler/utils:xplane_schema",
         "@local_tsl//tsl/profiler/utils:xplane_utils",
         "@local_tsl//tsl/profiler/utils:xplane_visitor",
diff --git a/tensorflow/core/profiler/convert/dcn_slack_analysis_combiner.cc b/tensorflow/core/profiler/convert/dcn_slack_analysis_combiner.cc
index 755b12db97b1fe..32a91c836c1497 100644
--- a/tensorflow/core/profiler/convert/dcn_slack_analysis_combiner.cc
+++ b/tensorflow/core/profiler/convert/dcn_slack_analysis_combiner.cc
@@ -44,6 +44,8 @@ void DcnSlackAnalysisCombiner::Combine(const DcnSlackAnalysis& slack_analysis) {
                                  slack.send_duration_us() * occurrences);
     summary.set_recv_duration_us(summary.recv_duration_us() +
                                  slack.recv_duration_us() * occurrences);
+    summary.set_host_stall_us(summary.host_stall_us() +
+                              slack.host_stall_us() * occurrences);
     summary.set_occurrences(summary.occurrences() + slack.occurrences());
     summary.set_bytes_transmitted_over_network(
         slack.bytes_transmitted_over_network());
@@ -74,6 +76,8 @@ DcnSlackAnalysis DcnSlackAnalysisCombiner::Finalize() {
         SafeDivide(summary.send_duration_us(), summary.occurrences()));
     slack->set_recv_duration_us(
         SafeDivide(summary.recv_duration_us(), summary.occurrences()));
+    slack->set_host_stall_us(
+        SafeDivide(summary.host_stall_us(), summary.occurrences()));
     slack->set_occurrences(summary.occurrences());
     slack->set_bytes_transmitted_over_network(
         summary.bytes_transmitted_over_network());
diff --git a/tensorflow/core/profiler/convert/xplane_to_dcn_collective_stats.cc b/tensorflow/core/profiler/convert/xplane_to_dcn_collective_stats.cc
index ad131ae672f8d7..6586356b60a294 100644
--- a/tensorflow/core/profiler/convert/xplane_to_dcn_collective_stats.cc
+++ b/tensorflow/core/profiler/convert/xplane_to_dcn_collective_stats.cc
@@ -67,7 +67,7 @@ StatusOr<bool> GetDcnCollectiveStatsFromMultiXSpaceAndSaveToFile(
     }
 
     DcnSlackAnalysis dcnSlackAnalysis =
-        ConvertXSpaceToDcnSlackAnalysis(*xspace);
+        ConvertXSpaceToDcnSlackAnalysis(*xspace, nullptr, nullptr);
 
     TF_RETURN_IF_ERROR(WriteBinaryProto(session_snapshot,
                                         StoredDataType::DCN_COLLECTIVE_STATS,
diff --git a/tensorflow/core/profiler/convert/xspace_to_dcn_slack_analysis.cc b/tensorflow/core/profiler/convert/xspace_to_dcn_slack_analysis.cc
index 854454dca2145e..a1478c6178170b 100644
--- a/tensorflow/core/profiler/convert/xspace_to_dcn_slack_analysis.cc
+++ b/tensorflow/core/profiler/convert/xspace_to_dcn_slack_analysis.cc
@@ -34,13 +34,18 @@ limitations under the License.
 #include "xla/side_effect_util.h"
 #include "xla/xla_data.pb.h"
 #include "tensorflow/core/profiler/protobuf/dcn_slack_analysis.pb.h"
+#include "tensorflow/core/profiler/protobuf/topology.pb.h"
 #include "tensorflow/core/profiler/utils/hlo_module_utils.h"
 #include "tensorflow/core/profiler/utils/hlo_proto_map.h"
 #include "tensorflow/core/profiler/utils/hlo_proto_to_module.h"
+#include "tensorflow/core/profiler/utils/xplane_utils.h"
+#include "tsl/platform/regexp.h"
 #include "tsl/platform/statusor.h"
 #include "tsl/profiler/protobuf/xplane.pb.h"
 #include "tsl/profiler/utils/math_utils.h"
 #include "tsl/profiler/utils/tf_xplane_visitor.h"
+#include "tsl/profiler/utils/timespan.h"
+#include "tsl/profiler/utils/tpu_xplane_utils.h"
 #include "tsl/profiler/utils/xplane_schema.h"
 #include "tsl/profiler/utils/xplane_utils.h"
 #include "tsl/profiler/utils/xplane_visitor.h"
@@ -50,14 +55,15 @@ namespace profiler {
 namespace {
 
 using tensorflow::profiler::DcnSlackSummary;
+using tensorflow::profiler::Topology;
 using tsl::profiler::CreateTfXPlaneVisitor;
 using tsl::profiler::FindLineWithName;
-using tsl::profiler::FindPlanesWithPrefix;
-using tsl::profiler::kTpuPlanePrefix;
 using tsl::profiler::kXlaOpLineName;
 using tsl::profiler::NanoToMicro;
+using tsl::profiler::PicoToMicro;
 using tsl::profiler::SafeDivide;
 using tsl::profiler::StatType;
+using tsl::profiler::Timespan;
 using tsl::profiler::XEventContextTracker;
 using tsl::profiler::XEventVisitor;
 using tsl::profiler::XLineVisitor;
@@ -65,6 +71,11 @@ using tsl::profiler::XPlaneVisitor;
 using tsl::profiler::XStatVisitor;
 using xla::HloOpcode;
 
+// TODO: Identify mechanism to maintain consistency between producer and
+// consumer here.
+const char kHostEventRegex[] = {
+    "device_[0-9][0-9][0-9]([0-9][0-9][0-9])_gid_(.*)"};
+
 std::optional<std::string> GetAttributeFromInstr(
     const xla::HloInstruction* instr, std::string_view attribute) {
   std::optional<std::string> attribute_value;
@@ -79,14 +90,56 @@ std::optional<std::string> GetRendezvous(const xla::HloInstruction* instr) {
   return GetAttributeFromInstr(instr, xla::kXlaHostTransferRendezvousNameAttr);
 }
 
+dcn_analysis_internal::DcnHostEvent ParseDcnHostEvent(
+    const XEventVisitor& visitor) {
+  dcn_analysis_internal::DcnHostEvent event;
+  static const LazyRE2 re = {kHostEventRegex};
+  RE2::FullMatch(visitor.Name(), *re, &event.multi_slice_device_id,
+                 &event.rendezvous_name);
+  // NOTE(review): FullMatch's result is ignored; a mismatch is not reported.
+  event.timespan = visitor.GetTimespan();
+  return event;
+}
+
 std::optional<std::string> GetTransferType(const xla::HloInstruction* instr) {
   return GetAttributeFromInstr(instr, "_xla_megascale_transfer_type");
 }
 
+std::string HostCollectiveKey(int index_on_host,
+                              std::string_view rendezvous_name) {
+  return absl::StrCat(index_on_host, "_", rendezvous_name);  // "<idx>_<name>"
+}
+
 }  // namespace
 
 namespace dcn_analysis_internal {
 
+void DcnHostEventList::insert(DcnHostEvent event) {
+  if (iter_ != events_.end() && event.timespan < iter_->timespan) {
+    // The event orders before the cursor (e.g. it comes from a new line), so
+    // restart the scan from the beginning of the list.
+    iter_ = events_.begin();
+  }
+  while (iter_ != events_.end() && iter_->timespan < event.timespan) {
+    iter_++;
+  }
+  iter_ = events_.insert(iter_, event);
+}
+
+std::optional<DcnHostEvent> DcnHostEventList::pop(const Timespan& timespan) {
+  // Discard every leading event that orders strictly before `timespan`.
+  while (!events_.empty() && events_.front().timespan < timespan) {
+    events_.pop_front();
+  }
+  // The new front, if any, is handed out only when `timespan` includes it.
+  if (events_.empty() || !timespan.Includes(events_.front().timespan)) {
+    return std::nullopt;
+  }
+  DcnHostEvent matched = events_.front();
+  events_.pop_front();
+  return matched;
+}
+
 absl::StatusOr<InstrMetadata> DcnTracker::GetInstrMetadataFromHloModule(
     std::string_view module_name, std::string_view instr_name) {
   if (!hlo_module_cache_.contains(module_name)) {
@@ -176,20 +229,20 @@ void DcnTracker::VisitOp(const InstrMetadata& instr,
       opState.overlapping_duration = 0;
       opState.stall_duration_ns = visitor.DurationNs();
       opState.send_op_name = visitor.DisplayName();
-      opState.send.set_duration_ns(visitor.DurationNs());
-      opState.send.set_start_time_ns(visitor.TimestampNs());
+      opState.send.set_duration_ps(visitor.DurationPs());
+      opState.send.set_start_time_ps(visitor.TimestampPs());
       break;
     case HloOpcode::kRecv:
-      opState.recv.set_duration_ns(visitor.DurationNs());
-      opState.recv.set_start_time_ns(visitor.TimestampNs());
+      opState.recv.set_duration_ps(visitor.DurationPs());
+      opState.recv.set_start_time_ps(visitor.TimestampPs());
       break;
     case HloOpcode::kSendDone:
-      opState.send_done.set_duration_ns(visitor.DurationNs());
-      opState.send_done.set_start_time_ns(visitor.TimestampNs());
+      opState.send_done.set_duration_ps(visitor.DurationPs());
+      opState.send_done.set_start_time_ps(visitor.TimestampPs());
       break;
     case HloOpcode::kRecvDone: {
-      opState.recv_done.set_duration_ns(visitor.DurationNs());
-      opState.recv_done.set_start_time_ns(visitor.TimestampNs());
+      opState.recv_done.set_duration_ps(visitor.DurationPs());
+      opState.recv_done.set_start_time_ps(visitor.TimestampPs());
       if (opState.start_time != 0) {
         DcnSlack* analysis = slack_analysis_.add_dcn_slack();
         analysis->set_rendezvous(rendezvous_name);
@@ -227,9 +280,17 @@ void DcnTracker::VisitOp(const InstrMetadata& instr,
   UpdateActiveOps(visitor.DurationNs());
 }
 
+std::optional<DcnHostEvent> DcnTracker::GetCollectiveHostEvent(
+    int core_id, std::string_view rendezvous, Timespan timespan) {
+  // Look up (creating if absent) the event list for this core/rendezvous pair.
+  const std::string key = HostCollectiveKey(core_id, rendezvous);
+  return core_id_to_host_event_map_[key].pop(timespan);
+}
+
 void DcnTracker::SummarizeDcnSlackAnalysis() {
   absl::flat_hash_map<std::string_view, DcnSlackSummary> summary;
-  for (const DcnSlack& analysis : slack_analysis_.dcn_slack()) {
+  // TODO(b/302596260) : Expand to process all cores.
+  int core_id = 0;
+  for (DcnSlack& analysis : *slack_analysis_.mutable_dcn_slack()) {
     DcnSlackSummary& s = summary[analysis.rendezvous()];
     s.set_slack_us(s.slack_us() + analysis.slack_us());
     s.set_occurrences(s.occurrences() + 1);
@@ -245,15 +306,32 @@ void DcnTracker::SummarizeDcnSlackAnalysis() {
     s.set_recv_op_name(analysis.recv_op_name());
     s.set_send_op_name(analysis.send_op_name());
     s.set_send_duration_us(s.send_duration_us() +
-                           NanoToMicro(analysis.send().duration_ns()));
+                           PicoToMicro(analysis.send().duration_ps()));
     s.set_recv_duration_us(s.recv_duration_us() +
-                           NanoToMicro(analysis.recv().duration_ns()));
+                           PicoToMicro(analysis.recv().duration_ps()));
     s.set_send_done_duration_us(
         s.send_done_duration_us() +
-        NanoToMicro(analysis.send_done().duration_ns()));
+        PicoToMicro(analysis.send_done().duration_ps()));
     s.set_recv_done_duration_us(
         s.recv_done_duration_us() +
-        NanoToMicro(analysis.recv_done().duration_ns()));
+        PicoToMicro(analysis.recv_done().duration_ps()));
+
+    // Populate Host summary to DcnSlackSummary
+    std::optional<DcnHostEvent> host_event = GetCollectiveHostEvent(
+        core_id, analysis.rendezvous(),
+        Timespan::FromEndPoints(analysis.send().start_time_ps(),
+                                analysis.recv_done().start_time_ps() +
+                                    analysis.recv_done().duration_ps()));
+    if (host_event.has_value()) {
+      OpInstance* host_graph_execution =
+          analysis.mutable_host_graph_execution();
+      host_graph_execution->set_start_time_ps(host_event->timespan.begin_ps());
+      host_graph_execution->set_duration_ps(host_event->timespan.duration_ps());
+      // Accumulate here; the mean over occurrences is computed after the loop.
+      const int64_t stall_ps = host_event->timespan.end_ps() -
+                               analysis.recv_done().start_time_ps();
+      s.set_host_stall_us(s.host_stall_us() + stall_ps / 1E6);
+    }
   }
 
   for (auto& [_, s] : summary) {
@@ -267,20 +345,73 @@ void DcnTracker::SummarizeDcnSlackAnalysis() {
         SafeDivide(s.recv_done_duration_us(), s.occurrences()));
     s.set_send_duration_us(SafeDivide(s.send_duration_us(), s.occurrences()));
     s.set_recv_duration_us(SafeDivide(s.recv_duration_us(), s.occurrences()));
+    s.set_host_stall_us(SafeDivide(s.host_stall_us(), s.occurrences()));
     *slack_analysis_.add_dcn_slack_summary() = s;
   }
 }
 
+void DcnTracker::ProcessTopology(const Topology& topology) {
+  // Record, for every mesh location, its global id -> index on host.
+  for (const auto& location : topology.mesh_location()) {
+    const auto local_index = location.index_on_host();
+    global_chip_id_to_local_index_map_[location.global_id()] = local_index;
+  }
+}
+
+int DcnTracker::GetLocalIndex(int dcn_device_id) {
+  /* Maps a DCN device id to the index on host. With megacore the id is the
+   * global chip id; otherwise it is 2*global_chip_id (+1), so the chip is
+   * looked up by id / 2 and id % 2 is added to its local index. Without a
+   * known mapping the id itself is returned.
+   * TODO(b/302145703): Identify if transformation can be obtained from the
+   * TpuTopology directly
+   */
+  int global_device_id = dcn_device_id;
+  if (!is_megacore_) {
+    if (global_chip_id_to_local_index_map_.contains(dcn_device_id / 2)) {
+      return global_chip_id_to_local_index_map_[dcn_device_id / 2] +
+             dcn_device_id % 2;
+    }
+  }
+  if (global_chip_id_to_local_index_map_.contains(global_device_id)) {
+    return global_chip_id_to_local_index_map_[global_device_id];
+  }
+  LOG(WARNING) << "Could not map dcn_device_id to Local index, Using "
+                  "dcn_device_id : "
+               << global_device_id;
+  return global_device_id;
+}
+
+void DcnTracker::VisitHostEvent(const DcnHostEvent& event) {
+  if (event.rendezvous_name.empty()) return;  // Skip before touching the id.
+  std::string key = HostCollectiveKey(
+      GetLocalIndex(event.multi_slice_device_id), event.rendezvous_name);
+  core_id_to_host_event_map_[key].insert(event);
+}
+
+void ProcessDcnTraces(const XPlane& xplane, DcnTracker& dcn_tracker) {
+  // Feed every event of every line on the given plane to the tracker.
+  XPlaneVisitor xplane_visitor = CreateTfXPlaneVisitor(&xplane);
+  xplane_visitor.ForEachLine([&](const XLineVisitor& line) {
+    line.ForEachEvent([&](const XEventVisitor& event) {
+      dcn_tracker.VisitHostEvent(ParseDcnHostEvent(event));
+    });
+  });
+}
+
 }  // namespace dcn_analysis_internal
 
-DcnSlackAnalysis ConvertXSpaceToDcnSlackAnalysis(const XSpace& xspace) {
-  const auto& xplanes = FindPlanesWithPrefix(xspace, kTpuPlanePrefix);
-  if (xplanes.empty()) return DcnSlackAnalysis();
-  const XPlane* xplane = xplanes.at(0);
+DcnSlackAnalysis ConvertXSpaceToDcnSlackAnalysis(const XSpace& xspace,
+                                                 const XPlane* dcn_host_plane,
+                                                 const Topology* topology,
+                                                 bool is_megacore) {
+  int num_cores = tsl::profiler::FindTensorCorePlanes(xspace).size();
+  if (num_cores == 0) return DcnSlackAnalysis();
+  const XPlane* xplane =
+      FindPlaneWithName(xspace, tsl::profiler::TpuPlaneName(0));
   XPlaneVisitor xplane_visitor = CreateTfXPlaneVisitor(xplane);
   HloProtoMap hlo_proto_map;
   hlo_proto_map.AddHloProtosFromXSpace(xspace);
-  dcn_analysis_internal::DcnTracker dcn_tracker(hlo_proto_map);
+  dcn_analysis_internal::DcnTracker dcn_tracker(hlo_proto_map, is_megacore);
   XEventContextTracker hlo_module_context(
       &xplane_visitor,
       FindLineWithName(*xplane, tsl::profiler::kXlaModuleLineName));
@@ -303,7 +434,7 @@ DcnSlackAnalysis ConvertXSpaceToDcnSlackAnalysis(const XSpace& xspace) {
         if (!module.has_value()) return;
         if (absl::StrContains(hlo_category, "host send") ||
             absl::StrContains(hlo_category, "host recv")) {
-          // All Megascale send/send-done/recv/recv-done ops.
+          // All Dcn send/send-done/recv/recv-done ops.
           auto instr = dcn_tracker.GetInstructionMetadata(module->Name(),
                                                           xevent.DisplayName());
           if (instr.ok()) {
@@ -313,6 +444,14 @@ DcnSlackAnalysis ConvertXSpaceToDcnSlackAnalysis(const XSpace& xspace) {
       });
     }
   });
+
+  if (dcn_host_plane != nullptr) {
+    VLOG(1) << "Processing host traces.";
+    if (topology != nullptr) {
+      dcn_tracker.ProcessTopology(*topology);
+    }
+    ProcessDcnTraces(*dcn_host_plane, dcn_tracker);
+  }
   return dcn_tracker.Finalize();
 }
 
diff --git a/tensorflow/core/profiler/convert/xspace_to_dcn_slack_analysis.h b/tensorflow/core/profiler/convert/xspace_to_dcn_slack_analysis.h
index 108eae3185ff0c..3f38da65460346 100644
--- a/tensorflow/core/profiler/convert/xspace_to_dcn_slack_analysis.h
+++ b/tensorflow/core/profiler/convert/xspace_to_dcn_slack_analysis.h
@@ -16,6 +16,8 @@ limitations under the License.
 #define TENSORFLOW_CORE_PROFILER_CONVERT_XSPACE_TO_DCN_SLACK_ANALYSIS_H_
 
 #include <cstdint>
+#include <deque>
+#include <list>
 #include <map>
 #include <memory>
 #include <optional>
@@ -23,11 +25,14 @@ limitations under the License.
 #include <string_view>
 #include <vector>
 
+#include "absl/container/flat_hash_map.h"
 #include "xla/hlo/ir/hlo_module.h"
 #include "xla/hlo/ir/hlo_opcode.h"
 #include "tensorflow/core/profiler/protobuf/dcn_slack_analysis.pb.h"
+#include "tensorflow/core/profiler/protobuf/topology.pb.h"
 #include "tensorflow/core/profiler/utils/hlo_proto_map.h"
 #include "tsl/profiler/protobuf/xplane.pb.h"
+#include "tsl/profiler/utils/timespan.h"
 #include "tsl/profiler/utils/xplane_visitor.h"
 
 namespace tensorflow {
@@ -55,6 +60,33 @@ struct DcnOpState {
   OpInstance recv_done;
 };
 
+// Structure to extract and store the DcnHostEvents.
+struct DcnHostEvent {
+  std::string rendezvous_name;
+  tsl::profiler::Timespan timespan;
+  int multi_slice_device_id = 0;  // Zero unless parsing assigns it.
+};
+
+// List of DcnHostEvents kept sorted by timespan; lets several time-sorted
+// lines of megascale host events be merged. NOTE(review): iter_ points into
+// events_, so confirm that copying/moving this object keeps it valid.
+class DcnHostEventList {
+ public:
+  // Insert the event into the sorted list.
+  void insert(DcnHostEvent event);
+
+  // Drops leading events ordered before `timespan`, then removes and returns
+  // the front event if `timespan` includes it; otherwise returns nullopt.
+  std::optional<DcnHostEvent> pop(const tsl::profiler::Timespan& timespan);
+
+  // Number of events.
+  int size() const { return events_.size(); }
+
+ private:
+  std::list<DcnHostEvent> events_;
+  std::list<DcnHostEvent>::iterator iter_ = events_.begin();  // insert cursor
+};
+
 struct InstrMetadata {
   xla::HloOpcode opcode;
   uint64_t channel_id;
@@ -65,8 +97,9 @@ struct InstrMetadata {
 
 class DcnTracker {
  public:
-  explicit DcnTracker(const tensorflow::profiler::HloProtoMap& hlo_proto_map)
-      : hlo_proto_map_(hlo_proto_map) {}
+  explicit DcnTracker(const tensorflow::profiler::HloProtoMap& hlo_proto_map,
+                      bool is_megacore)
+      : hlo_proto_map_(hlo_proto_map), is_megacore_(is_megacore) {}
 
   absl::StatusOr<InstrMetadata> GetInstructionMetadata(std::string_view module,
                                                        std::string_view instr);
@@ -78,14 +111,21 @@ class DcnTracker {
   void VisitOp(const InstrMetadata& instr,
                const tsl::profiler::XEventVisitor& visitor);
 
+  void VisitHostEvent(const DcnHostEvent& event);
+
+  void ProcessTopology(const tensorflow::profiler::Topology& topology);
+
  private:
   DcnSlackAnalysis slack_analysis_;
   absl::flat_hash_map<std::string, DcnOpState> rendezvous_to_op_map_;
   absl::flat_hash_map<uint64_t, std::string> channel_id_to_rendezvous_map_;
   absl::flat_hash_map<std::string, InstrMetadata> instruction_metadata_map_;
+  absl::flat_hash_map<std::string, DcnHostEventList> core_id_to_host_event_map_;
   const tensorflow::profiler::HloProtoMap& hlo_proto_map_;
+  absl::flat_hash_map<int, int> global_chip_id_to_local_index_map_;
   absl::flat_hash_map<std::string, std::unique_ptr<xla::HloModule>>
       hlo_module_cache_;
+  bool is_megacore_ = true;
 
   absl::StatusOr<InstrMetadata> GetInstrMetadataFromHloModule(
       std::string_view module, std::string_view instr);
@@ -93,13 +133,22 @@ class DcnTracker {
   void UpdateActiveOps(uint64_t duration);
 
   void SummarizeDcnSlackAnalysis();
+
+  std::optional<DcnHostEvent> GetCollectiveHostEvent(
+      int core_id, std::string_view rendezvous_name,
+      tsl::profiler::Timespan timespan);
+
+  // GetLocalIndex when available, else return the global_device_id itself.
+  int GetLocalIndex(int dcn_device_id);
 };
 
 }  // namespace dcn_analysis_internal
 
 // Convert Hlo Events in XSpace to Dcn Slack analysis.
 DcnSlackAnalysis ConvertXSpaceToDcnSlackAnalysis(
-    const tensorflow::profiler::XSpace& xspace);
+    const tensorflow::profiler::XSpace& xspace,
+    const tensorflow::profiler::XPlane* dcn_host_plane,
+    const tensorflow::profiler::Topology* topology, bool is_megacore = true);
 
 }  // namespace profiler
 }  // namespace tensorflow
diff --git a/tensorflow/core/profiler/protobuf/dcn_slack_analysis.proto b/tensorflow/core/profiler/protobuf/dcn_slack_analysis.proto
index ce444615061513..40cb04cffdfcc4 100644
--- a/tensorflow/core/profiler/protobuf/dcn_slack_analysis.proto
+++ b/tensorflow/core/profiler/protobuf/dcn_slack_analysis.proto
@@ -3,8 +3,8 @@ syntax = "proto3";
 package tensorflow.profiler;
 
 message OpInstance {
-  uint64 start_time_ns = 1;
-  uint64 duration_ns = 2;
+  uint64 start_time_ps = 1;
+  uint64 duration_ps = 2;
 }
 
 message DcnSlack {
@@ -52,6 +52,8 @@ message DcnSlack {
   OpInstance recv_done = 12;
 
   string transfer_type = 13;
+
+  OpInstance host_graph_execution = 14;
 }
 
 message DcnSlackSummary {
@@ -80,6 +82,8 @@ message DcnSlackSummary {
   uint64 recv_done_duration_us = 12;
 
   string transfer_type = 13;
+
+  int64 host_stall_us = 14;
 }
 
 message DcnSlackAnalysis {

From e6b58a8b231099b0207030fcad0e4e211ef8f116 Mon Sep 17 00:00:00 2001
From: Camilla Halsey <halseycamilla@google.com>
Date: Mon, 2 Oct 2023 12:23:35 -0700
Subject: [PATCH 482/567] Fix config typo in TF's bazelrc file.

PiperOrigin-RevId: 570141677
---
 .bazelrc                                 | 2 +-
 third_party/xla/.bazelrc                 | 2 +-
 third_party/xla/third_party/tsl/.bazelrc | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/.bazelrc b/.bazelrc
index 7cddd444e396fc..15dab09520ac8f 100644
--- a/.bazelrc
+++ b/.bazelrc
@@ -691,7 +691,7 @@ test:linux_cuda_wheel_test --config=linux_cuda_wheel_test_filters -- //tensorflo
 test:linux_cpu_pycpp_test_filters --test_tag_filters=-no_oss,-oss_excluded,-oss_serial,-gpu,-tpu,-benchmark-test,-v1only
 test:linux_cpu_pycpp_test_filters --build_tag_filters=-no_oss,-oss_excluded,-oss_serial,-gpu,-tpu,-benchmark-test,-v1only
 test:linux_cpu_pycpp_test_filters --test_lang_filters=cc,py --test_size_filters=small,medium
-test:linux_cpu_pycpp_test --config=linux_cpu_pycpp_filters -- //tensorflow/... -//tensorflow/python/integration_testing/... -//tensorflow/compiler/tf2tensorrt/... -//tensorflow/compiler/xrt/... -//tensorflow/core/tpu/... -//tensorflow/lite/... -//tensorflow/tools/toolchains/...
+test:linux_cpu_pycpp_test --config=linux_cpu_pycpp_test_filters -- //tensorflow/... -//tensorflow/python/integration_testing/... -//tensorflow/compiler/tf2tensorrt/... -//tensorflow/compiler/xrt/... -//tensorflow/core/tpu/... -//tensorflow/lite/... -//tensorflow/tools/toolchains/...
 # CUDA PYCPP:
 test:linux_cuda_pycpp_test_filters --test_tag_filters=-no_oss,-oss_excluded,-oss_serial,-benchmark-test,-v1only,gpu,-no_gpu,-no_gpu_presubmit,-no_cuda11
 test:linux_cuda_pycpp_test_filters --build_tag_filters=-no_oss,-oss_excluded,-oss_serial,-benchmark-test,-v1only,gpu,-no_gpu,-no_gpu_presubmit,-no_cuda11
diff --git a/third_party/xla/.bazelrc b/third_party/xla/.bazelrc
index 7cddd444e396fc..15dab09520ac8f 100644
--- a/third_party/xla/.bazelrc
+++ b/third_party/xla/.bazelrc
@@ -691,7 +691,7 @@ test:linux_cuda_wheel_test --config=linux_cuda_wheel_test_filters -- //tensorflo
 test:linux_cpu_pycpp_test_filters --test_tag_filters=-no_oss,-oss_excluded,-oss_serial,-gpu,-tpu,-benchmark-test,-v1only
 test:linux_cpu_pycpp_test_filters --build_tag_filters=-no_oss,-oss_excluded,-oss_serial,-gpu,-tpu,-benchmark-test,-v1only
 test:linux_cpu_pycpp_test_filters --test_lang_filters=cc,py --test_size_filters=small,medium
-test:linux_cpu_pycpp_test --config=linux_cpu_pycpp_filters -- //tensorflow/... -//tensorflow/python/integration_testing/... -//tensorflow/compiler/tf2tensorrt/... -//tensorflow/compiler/xrt/... -//tensorflow/core/tpu/... -//tensorflow/lite/... -//tensorflow/tools/toolchains/...
+test:linux_cpu_pycpp_test --config=linux_cpu_pycpp_test_filters -- //tensorflow/... -//tensorflow/python/integration_testing/... -//tensorflow/compiler/tf2tensorrt/... -//tensorflow/compiler/xrt/... -//tensorflow/core/tpu/... -//tensorflow/lite/... -//tensorflow/tools/toolchains/...
 # CUDA PYCPP:
 test:linux_cuda_pycpp_test_filters --test_tag_filters=-no_oss,-oss_excluded,-oss_serial,-benchmark-test,-v1only,gpu,-no_gpu,-no_gpu_presubmit,-no_cuda11
 test:linux_cuda_pycpp_test_filters --build_tag_filters=-no_oss,-oss_excluded,-oss_serial,-benchmark-test,-v1only,gpu,-no_gpu,-no_gpu_presubmit,-no_cuda11
diff --git a/third_party/xla/third_party/tsl/.bazelrc b/third_party/xla/third_party/tsl/.bazelrc
index 7cddd444e396fc..15dab09520ac8f 100644
--- a/third_party/xla/third_party/tsl/.bazelrc
+++ b/third_party/xla/third_party/tsl/.bazelrc
@@ -691,7 +691,7 @@ test:linux_cuda_wheel_test --config=linux_cuda_wheel_test_filters -- //tensorflo
 test:linux_cpu_pycpp_test_filters --test_tag_filters=-no_oss,-oss_excluded,-oss_serial,-gpu,-tpu,-benchmark-test,-v1only
 test:linux_cpu_pycpp_test_filters --build_tag_filters=-no_oss,-oss_excluded,-oss_serial,-gpu,-tpu,-benchmark-test,-v1only
 test:linux_cpu_pycpp_test_filters --test_lang_filters=cc,py --test_size_filters=small,medium
-test:linux_cpu_pycpp_test --config=linux_cpu_pycpp_filters -- //tensorflow/... -//tensorflow/python/integration_testing/... -//tensorflow/compiler/tf2tensorrt/... -//tensorflow/compiler/xrt/... -//tensorflow/core/tpu/... -//tensorflow/lite/... -//tensorflow/tools/toolchains/...
+test:linux_cpu_pycpp_test --config=linux_cpu_pycpp_test_filters -- //tensorflow/... -//tensorflow/python/integration_testing/... -//tensorflow/compiler/tf2tensorrt/... -//tensorflow/compiler/xrt/... -//tensorflow/core/tpu/... -//tensorflow/lite/... -//tensorflow/tools/toolchains/...
 # CUDA PYCPP:
 test:linux_cuda_pycpp_test_filters --test_tag_filters=-no_oss,-oss_excluded,-oss_serial,-benchmark-test,-v1only,gpu,-no_gpu,-no_gpu_presubmit,-no_cuda11
 test:linux_cuda_pycpp_test_filters --build_tag_filters=-no_oss,-oss_excluded,-oss_serial,-benchmark-test,-v1only,gpu,-no_gpu,-no_gpu_presubmit,-no_cuda11

From d397ccdb37e7fed88420e95d13e5ddf7bb183c84 Mon Sep 17 00:00:00 2001
From: Hye Soo Yang <hyey@google.com>
Date: Mon, 2 Oct 2023 12:58:42 -0700
Subject: [PATCH 483/567] Move `GetMaxIdsAndUniques()` function to
 sparse_core_ops_utils so that non-xla ops can call it.

PiperOrigin-RevId: 570152309
---
 tensorflow/core/tpu/kernels/BUILD             |  3 ++
 .../core/tpu/kernels/sparse_core_ops_utils.cc | 23 +++++++++++++
 .../core/tpu/kernels/sparse_core_ops_utils.h  | 15 +++++++++
 .../tpu/kernels/sparse_core_preprocess_ops.cc | 14 ++++----
 .../tpu/kernels/sparse_core_preprocess_ops.h  | 15 ---------
 .../core/tpu/kernels/sparse_core_xla_ops.cc   | 33 ++++---------------
 6 files changed, 55 insertions(+), 48 deletions(-)

diff --git a/tensorflow/core/tpu/kernels/BUILD b/tensorflow/core/tpu/kernels/BUILD
index 1fd1b1e7f7f709..ba3e889108c41e 100644
--- a/tensorflow/core/tpu/kernels/BUILD
+++ b/tensorflow/core/tpu/kernels/BUILD
@@ -1394,6 +1394,9 @@ cc_library(
         "@com_google_absl//absl/numeric:bits",
         "@com_google_absl//absl/status",
         "@com_google_absl//absl/strings",
+        "@local_xla//xla/stream_executor/tpu:status_helper",
+        "@local_xla//xla/stream_executor/tpu:tpu_api",
+        "@local_xla//xla/stream_executor/tpu:tpu_ops_c_api_hdrs",
     ],
 )
 
diff --git a/tensorflow/core/tpu/kernels/sparse_core_ops_utils.cc b/tensorflow/core/tpu/kernels/sparse_core_ops_utils.cc
index 2e3daafa93b80c..d106449800153a 100644
--- a/tensorflow/core/tpu/kernels/sparse_core_ops_utils.cc
+++ b/tensorflow/core/tpu/kernels/sparse_core_ops_utils.cc
@@ -29,6 +29,9 @@ limitations under the License.
 #include "absl/status/status.h"
 #include "absl/strings/string_view.h"
 #include "tensorflow/compiler/jit/flags.h"
+#include "xla/stream_executor/tpu/status_helper.h"
+#include "xla/stream_executor/tpu/tpu_api.h"
+#include "xla/stream_executor/tpu/tpu_ops_c_api.h"
 #include "tensorflow/core/platform/status.h"
 #include "tensorflow/core/platform/types.h"
 
@@ -98,6 +101,26 @@ std::function<float(float)> GetCombinerScaleTransformFunction(
   }
 }
 
+Status GetMaxIdsAndUniquesExternal(const std::string& program_key,
+                                   const std::string& table_name,
+                                   int64_t num_samples_per_sparse_core,
+                                   int64_t feature_width,
+                                   int64_t* max_ids_per_partition,
+                                   int64_t* max_unique_ids_per_partition) {
+  SparseCore_GetMaxIdsAndUniques_Params params;
+  params.program_key = program_key.c_str();
+  params.table_name = table_name.c_str();
+  params.num_samples_per_sparse_core = num_samples_per_sparse_core;
+  params.feature_width = feature_width;
+  StatusHelper status;
+  params.status = status.c_status;
+
+  stream_executor::tpu::OpsApiFn()->SparseCore_GetMaxIdsAndUniquesFn(&params);
+  *max_ids_per_partition = params.max_ids_per_partition;
+  *max_unique_ids_per_partition = params.max_unique_ids_per_partition;
+  return status.status();
+}
+
 std::vector<std::vector<std::string>> GetTableStacks(
     const std::vector<int64_t>& table_height,
     const std::vector<int64_t>& table_width,
diff --git a/tensorflow/core/tpu/kernels/sparse_core_ops_utils.h b/tensorflow/core/tpu/kernels/sparse_core_ops_utils.h
index 718676e4d81700..c62e4ebb4c6b6b 100644
--- a/tensorflow/core/tpu/kernels/sparse_core_ops_utils.h
+++ b/tensorflow/core/tpu/kernels/sparse_core_ops_utils.h
@@ -16,7 +16,9 @@ limitations under the License.
 #define TENSORFLOW_CORE_TPU_KERNELS_SPARSE_CORE_OPS_UTILS_H_
 
 #include <cstdint>
+#include <functional>
 #include <limits>
+#include <string>
 #include <vector>
 
 #include "absl/status/status.h"
@@ -63,6 +65,19 @@ int64_t GetXlaSparseCoreStackingMemLimit();
 
 int64_t GetXlaSparseCoreStackingTableShardLimit();
 
+bool GetDisableTableStacking();
+
+int64_t GetXlaSparseCoreStackingMemLimit();
+
+int64_t GetXlaSparseCoreStackingTableShardLimit();
+
+Status GetMaxIdsAndUniquesExternal(const std::string& program_key,
+                                   const std::string& table_name,
+                                   int64_t num_samples_per_sparse_core,
+                                   int64_t feature_width,
+                                   int64_t* max_ids_per_partition,
+                                   int64_t* max_unique_ids_per_partition);
+
 }  // namespace tensorflow
 
 #endif  // TENSORFLOW_CORE_TPU_KERNELS_SPARSE_CORE_OPS_UTILS_H_
diff --git a/tensorflow/core/tpu/kernels/sparse_core_preprocess_ops.cc b/tensorflow/core/tpu/kernels/sparse_core_preprocess_ops.cc
index 1dd490a6799229..286ef7c472a26f 100644
--- a/tensorflow/core/tpu/kernels/sparse_core_preprocess_ops.cc
+++ b/tensorflow/core/tpu/kernels/sparse_core_preprocess_ops.cc
@@ -349,9 +349,10 @@ void GetMinibatchesInCsrWithPhysicalReplicaOp::Compute(OpKernelContext* ctx) {
   int64_t max_ids_per_partition = -1;
   int64_t max_unique_ids_per_partition = -1;
 
-  GetMaxIdsAndUniques(ctx, program_key, table_name_, per_sparse_core_batch_size,
-                      feature_width_, &max_ids_per_partition,
-                      &max_unique_ids_per_partition);
+  OP_REQUIRES_OK(ctx, GetMaxIdsAndUniquesExternal(
+                          program_key, table_name_, per_sparse_core_batch_size,
+                          feature_width_, &max_ids_per_partition,
+                          &max_unique_ids_per_partition));
 
   const int32* row_ids_tensor_ptr = row_ids->flat<int32>().data();
   const int32* col_ids_tensor_ptr = col_ids->flat<int32>().data();
@@ -561,9 +562,10 @@ void GetMinibatchSplitsWithPhysicalReplicaOp::Compute(OpKernelContext* ctx) {
   int64_t max_ids_per_partition = -1;
   int64_t max_unique_ids_per_partition = -1;
 
-  GetMaxIdsAndUniques(ctx, program_key, table_name_, per_sc_sample_count,
-                      feature_width_, &max_ids_per_partition,
-                      &max_unique_ids_per_partition);
+  OP_REQUIRES_OK(
+      ctx, GetMaxIdsAndUniquesExternal(
+               program_key, table_name_, per_sc_sample_count, feature_width_,
+               &max_ids_per_partition, &max_unique_ids_per_partition));
 
   sprase_core_ops_stats_handler_->Record(StatsType::MAX_IDS_PER_PARTITION,
                                          max_ids_per_partition, device_name_,
diff --git a/tensorflow/core/tpu/kernels/sparse_core_preprocess_ops.h b/tensorflow/core/tpu/kernels/sparse_core_preprocess_ops.h
index 0ecf6214f1f4a5..67190cd1f35707 100644
--- a/tensorflow/core/tpu/kernels/sparse_core_preprocess_ops.h
+++ b/tensorflow/core/tpu/kernels/sparse_core_preprocess_ops.h
@@ -67,14 +67,6 @@ class GetMinibatchesInCsrWithPhysicalReplicaOp : public OpKernel {
   void Compute(OpKernelContext* ctx) override;
 
  protected:
-  virtual void GetMaxIdsAndUniques(OpKernelContext* ctx,
-                                   const std::string& program_key,
-                                   const std::string& table_name,
-                                   int64_t num_samples_per_sparse_core,
-                                   int64_t feature_width,
-                                   int64_t* max_ids_per_partition,
-                                   int64_t* max_unique_ids_per_partition) {}
-
   int sample_count_ = 1;
   int feature_width_ = 1;
   int64_t num_sc_per_chip_;
@@ -101,13 +93,6 @@ class GetMinibatchSplitsWithPhysicalReplicaOp : public OpKernel {
   void Compute(OpKernelContext* ctx) override;
 
  protected:
-  virtual void GetMaxIdsAndUniques(OpKernelContext* ctx,
-                                   const std::string& program_key,
-                                   const std::string& table_name,
-                                   int64_t num_samples_per_sparse_core,
-                                   int64_t feature_width,
-                                   int64_t* max_ids_per_partition,
-                                   int64_t* max_unique_ids_per_partition) {}
   virtual void CalculateHeadroom(int32 this_max_ids, int32 this_max_uniques,
                                  tstring program_key,
                                  int64_t max_ids_per_partition,
diff --git a/tensorflow/core/tpu/kernels/sparse_core_xla_ops.cc b/tensorflow/core/tpu/kernels/sparse_core_xla_ops.cc
index abe0871ac318ae..964e5de0dedb4d 100644
--- a/tensorflow/core/tpu/kernels/sparse_core_xla_ops.cc
+++ b/tensorflow/core/tpu/kernels/sparse_core_xla_ops.cc
@@ -55,27 +55,6 @@ xla::XlaOp PowHack(xla::XlaOp base, xla::XlaOp exp) {
   return xla::Exp(xla::Log(base) * exp);
 }
 
-// Gets the max ids and max unique ids for the given table.
-Status GetMaxIdsAndUniques(const std::string& program_key,
-                           const std::string& table_name,
-                           int64_t num_samples_per_sparse_core,
-                           int64_t feature_width,
-                           int64_t* max_ids_per_partition,
-                           int64_t* max_unique_ids_per_partition) {
-  SparseCore_GetMaxIdsAndUniques_Params params;
-  params.program_key = program_key.c_str();
-  params.table_name = table_name.c_str();
-  params.num_samples_per_sparse_core = num_samples_per_sparse_core;
-  params.feature_width = feature_width;
-  StatusHelper status;
-  params.status = status.c_status;
-
-  stream_executor::tpu::OpsApiFn()->SparseCore_GetMaxIdsAndUniquesFn(&params);
-  *max_ids_per_partition = params.max_ids_per_partition;
-  *max_unique_ids_per_partition = params.max_unique_ids_per_partition;
-  return status.status();
-}
-
 // Get the SparseCore logical replica count.
 // TODO(agagik): get it from the tpu topology.
 StatusOr<int64_t> GetSparseCoresPerChip() { return 4; }
@@ -264,9 +243,9 @@ class XlaSparseDenseMatmulWithCsrInputOp : public XlaOpKernel {
     const int32_t feature_width = embedding_table_shape.dimensions(1);
 
     OP_REQUIRES_OK(
-        ctx, GetMaxIdsAndUniques("", table_name_, per_sparse_core_batch_size,
-                                 feature_width, &max_ids_per_partition,
-                                 &max_unique_ids_per_partition));
+        ctx, GetMaxIdsAndUniquesExternal(
+                 "", table_name_, per_sparse_core_batch_size, feature_width,
+                 &max_ids_per_partition, &max_unique_ids_per_partition));
     VLOG(3) << "XlaSparseDenseMatmulWithCsrInputOp: "
             << "table_name = '" << table_name_
             << "', max_ids = " << max_ids_per_partition
@@ -406,9 +385,9 @@ class XlaSparseDenseMatmulGradWithCsrInputBase : public XlaOpKernel {
 
     const int32_t feature_width = embedding_table_shape.dimensions(1);
     OP_REQUIRES_OK(
-        ctx, GetMaxIdsAndUniques("", table_name_, per_sparse_core_batch_size,
-                                 feature_width, &max_ids_per_partition,
-                                 &max_unique_ids_per_partition));
+        ctx, GetMaxIdsAndUniquesExternal(
+                 "", table_name_, per_sparse_core_batch_size, feature_width,
+                 &max_ids_per_partition, &max_unique_ids_per_partition));
     VLOG(3) << "XlaSparseDenseMatmulWithCsrInputOp: "
             << "table_name = '" << table_name_
             << "', max_ids = " << max_ids_per_partition

From c791fe82581e5e1ad4a05a91281413832ff09289 Mon Sep 17 00:00:00 2001
From: Tao Wang <wangtao@google.com>
Date: Mon, 2 Oct 2023 13:04:03 -0700
Subject: [PATCH 484/567] Add more API setup for Mock GPU client. Also clean
 up previous mock GPU client API.

PiperOrigin-RevId: 570153877
---
 third_party/xla/xla/python/xla.cc             | 78 ++-----------------
 third_party/xla/xla/python/xla_client.py      | 17 +---
 third_party/xla/xla/python/xla_client.pyi     |  3 +-
 .../xla/xla/python/xla_extension/__init__.pyi |  3 +-
 4 files changed, 13 insertions(+), 88 deletions(-)

diff --git a/third_party/xla/xla/python/xla.cc b/third_party/xla/xla/python/xla.cc
index c1bef710507c0b..8f8e10abbedc10 100644
--- a/third_party/xla/xla/python/xla.cc
+++ b/third_party/xla/xla/python/xla.cc
@@ -527,8 +527,8 @@ static void Init(py::module_& m) {
          std::shared_ptr<DistributedRuntimeClient> distributed_client,
          int node_id, int num_nodes,
          std::optional<std::set<int>> allowed_devices,
-         std::optional<std::string> platform_name)
-          -> std::shared_ptr<PyClient> {
+         std::optional<std::string> platform_name,
+         std::optional<bool> mock = false) -> std::shared_ptr<PyClient> {
         py::gil_scoped_release gil_release;
         PjRtClient::KeyValueGetCallback kv_get = nullptr;
         PjRtClient::KeyValuePutCallback kv_put = nullptr;
@@ -552,76 +552,8 @@ static void Init(py::module_& m) {
             xla::ValueOrThrow(GetStreamExecutorGpuClient(
                 asynchronous, allocator_config, node_id, num_nodes,
                 allowed_devices, platform_name,
-                /*should_stage_host_to_device_transfers=*/true, kv_get,
-                kv_put));
-        return std::make_shared<PyClient>(
-            ifrt::PjRtClient::Create(std::move(client)));
-      },
-      py::arg("asynchronous") = true,
-      py::arg("allocator_config") = GpuAllocatorConfig(),
-      py::arg("distributed_client") = nullptr, py::arg("node_id") = 0,
-      py::arg("num_nodes") = 1, py::arg("allowed_devices") = std::nullopt,
-      py::arg("platform_name") = std::nullopt);
-  m.def(
-      "get_mock_gpu_client",
-      [](bool asynchronous, const GpuAllocatorConfig& allocator_config,
-         std::shared_ptr<DistributedRuntimeClient> distributed_client,
-         int node_id, int num_nodes,
-         std::optional<std::set<int>> allowed_devices,
-         std::optional<std::string> platform_name)
-          -> std::shared_ptr<PyClient> {
-        py::gil_scoped_release gil_release;
-        PjRtClient::KeyValueGetCallback kv_get = nullptr;
-        PjRtClient::KeyValuePutCallback kv_put = nullptr;
-        absl::flat_hash_map<std::string, std::string> device_maps;
-        absl::Mutex mu;
-        if (num_nodes > 1) {
-          kv_get = [&device_maps, &mu, &num_nodes](
-                       const std::string& k,
-                       absl::Duration timeout) -> xla::StatusOr<std::string> {
-            std::string result;
-            {
-              absl::MutexLock lock(&mu);
-              if (device_maps.find(k) != device_maps.end()) {
-                result = device_maps[k];
-              } else {
-                // TODO(wangtao): Move fake topology logic down to
-                // GetStreamExecutorGpuClient.
-                // Get device_id from key.
-                int device_id;
-                std::vector<std::string> tokens = absl::StrSplit(k, ':');
-                if (tokens.size() != 2 ||
-                    !absl::SimpleAtoi(tokens[1], &device_id)) {
-                  device_id = num_nodes - 1;
-                }
-                // Return fake local topology with device_id info back.
-                LocalTopologyProto local;
-                local.set_boot_id("fake_boot_id");
-                local.set_node_id(device_id);
-                DeviceProto* device = local.add_devices();
-                device->set_global_device_id(device_id);
-                device->set_name("fake_device");
-                device->set_vendor("fake_vendor");
-                result = local.SerializeAsString();
-              }
-            }
-            return result;
-          };
-          kv_put = [&device_maps, &mu](const std::string& k,
-                                       const std::string& v) -> xla::Status {
-            {
-              absl::MutexLock lock(&mu);
-              device_maps[k] = v;
-            }
-            return xla::OkStatus();
-          };
-        }
-        std::unique_ptr<PjRtClient> client =
-            xla::ValueOrThrow(GetStreamExecutorGpuClient(
-                asynchronous, allocator_config, node_id, num_nodes,
-                allowed_devices, platform_name,
-                /*should_stage_host_to_device_transfers=*/true, kv_get,
-                kv_put));
+                /*should_stage_host_to_device_transfers=*/true, kv_get, kv_put,
+                /*enable_mock_nccl=*/mock.value_or(false)));
         return std::make_shared<PyClient>(
             ifrt::PjRtClient::Create(std::move(client)));
       },
@@ -629,7 +561,7 @@ static void Init(py::module_& m) {
       py::arg("allocator_config") = GpuAllocatorConfig(),
       py::arg("distributed_client") = nullptr, py::arg("node_id") = 0,
       py::arg("num_nodes") = 1, py::arg("allowed_devices") = std::nullopt,
-      py::arg("platform_name") = std::nullopt);
+      py::arg("platform_name") = std::nullopt, py::arg("mock") = std::nullopt);
 #endif  // XLA_PYTHON_ENABLE_GPU
 
   m.def(
diff --git a/third_party/xla/xla/python/xla_client.py b/third_party/xla/xla/python/xla_client.py
index c8eafe9202ea55..1f7977e903abee 100644
--- a/third_party/xla/xla/python/xla_client.py
+++ b/third_party/xla/xla/python/xla_client.py
@@ -45,7 +45,7 @@
 
 # Just an internal arbitrary increasing number to help with backward-compatible
 # changes. In JAX, reference this via jax._src.lib.xla_extension_version.
-_version = 199
+_version = 200
 
 # Version number for MLIR:Python components.
 mlir_api_version = 54
@@ -102,17 +102,6 @@ def make_gpu_client(
       'ROCM', _xla.register_custom_call_target
   )
 
-  if mock:
-    return _xla.get_mock_gpu_client(
-        asynchronous=True,
-        allocator_config=config,
-        distributed_client=distributed_client,
-        node_id=node_id,
-        num_nodes=num_nodes,
-        platform_name=platform_name,
-        allowed_devices=allowed_devices,
-    )
-
   return _xla.get_gpu_client(
       asynchronous=True,
       allocator_config=config,
@@ -120,7 +109,9 @@ def make_gpu_client(
       node_id=node_id,
       num_nodes=num_nodes,
       platform_name=platform_name,
-      allowed_devices=allowed_devices)
+      allowed_devices=allowed_devices,
+      mock=mock,
+  )
 
 
 def make_tfrt_tpu_c_api_client(options: Optional[_NameValueMapping] = None):
diff --git a/third_party/xla/xla/python/xla_client.pyi b/third_party/xla/xla/python/xla_client.pyi
index 47786cc8026d3e..3110e7d611049f 100644
--- a/third_party/xla/xla/python/xla_client.pyi
+++ b/third_party/xla/xla/python/xla_client.pyi
@@ -90,7 +90,8 @@ def make_gpu_client(
     node_id: int = ...,
     num_nodes: int = ...,
     platform_name: Optional[str] = ...,
-    allowed_devices: Optional[Set[int]] = ...) -> Client:
+    allowed_devices: Optional[Set[int]] = ...,
+    mock: Optional[bool]=...) -> Client:
   ...
 
 
diff --git a/third_party/xla/xla/python/xla_extension/__init__.pyi b/third_party/xla/xla/python/xla_extension/__init__.pyi
index 31615b9af2cac5..71b9eccb1e1ace 100644
--- a/third_party/xla/xla/python/xla_extension/__init__.pyi
+++ b/third_party/xla/xla/python/xla_extension/__init__.pyi
@@ -475,7 +475,8 @@ def get_gpu_client(
     distributed_client: Optional[DistributedRuntimeClient] = ...,
     node_id: int = ...,
     allowed_devices: Optional[Any] = ...,
-    platform_name: Optional[str] = ...) -> Client:...
+    platform_name: Optional[str] = ...,
+    mock:Optional[bool]=...) -> Client:...
 def get_mock_gpu_client(
     asynchronous: bool = ...,
     allocator_config: GpuAllocatorConfig = ...,

From d6e25b069d9ddf4111d7def74c145d118c6f4f81 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 2 Oct 2023 13:24:25 -0700
Subject: [PATCH 485/567] Promote c_api_opaque.h to the 'mostly stable' API.

For compatibility reasons, the ':c_api_experimental*' targets in 'core/c' depend on the corresponding ':c_api_opaque*' targets.

PiperOrigin-RevId: 570159652
---
 tensorflow/lite/build_def.bzl         |  12 ++-
 tensorflow/lite/c/BUILD               |  68 ++++++++++++++-
 tensorflow/lite/core/c/BUILD          | 116 +++++++++++++++++++++++++-
 tensorflow/lite/core/c/c_api_opaque.h |   7 +-
 4 files changed, 193 insertions(+), 10 deletions(-)

diff --git a/tensorflow/lite/build_def.bzl b/tensorflow/lite/build_def.bzl
index 6a9f0a812feea4..3f8bab5a1c1a3f 100644
--- a/tensorflow/lite/build_def.bzl
+++ b/tensorflow/lite/build_def.bzl
@@ -686,15 +686,21 @@ def tflite_custom_c_library(
             "//tensorflow/lite/c:c_api_experimental.h",
             "//tensorflow/lite/c:c_api_opaque.h",
         ]
-        experimental_deps = [
+        deps = [
             "//tensorflow/lite/c:c_api_experimental_without_op_resolver_without_alwayslink",
             "//tensorflow/lite/core/c:private_c_api_experimental_without_op_resolver_without_alwayslink",
+            "//tensorflow/lite/c:c_api_opaque_without_op_resolver_without_alwayslink",
+            "//tensorflow/lite/core/c:private_c_api_opaque_without_op_resolver_without_alwayslink",
         ]
     else:
         hdrs = [
             "//tensorflow/lite/c:c_api.h",
+            "//tensorflow/lite/c:c_api_opaque.h",
+        ]
+        deps = [
+            "//tensorflow/lite/c:c_api_opaque_without_op_resolver_without_alwayslink",
+            "//tensorflow/lite/core/c:private_c_api_opaque_without_op_resolver_without_alwayslink",
         ]
-        experimental_deps = []
     native.cc_library(
         name = name,
         hdrs = hdrs,
@@ -708,7 +714,7 @@ def tflite_custom_c_library(
             "//tensorflow/lite/core/c:private_c_api_without_op_resolver_without_alwayslink",
             "//tensorflow/lite/core/c:private_common",
             "//tensorflow/lite/delegates/nnapi:nnapi_delegate",
-        ] + experimental_deps,
+        ] + deps,
         **kwargs
     )
 
diff --git a/tensorflow/lite/c/BUILD b/tensorflow/lite/c/BUILD
index 335dcfa671f5e8..849afed05b35c8 100644
--- a/tensorflow/lite/c/BUILD
+++ b/tensorflow/lite/c/BUILD
@@ -106,6 +106,11 @@ cc_library_with_tflite_with_c_headers_test(
     name = "c_api_experimental",
     hdrs = [
         "c_api_experimental.h",
+        # ':c_api_opaque' was promoted to the 'mostly stable' API.
+        # Please explicitly add a dependency on ':c_api_opaque' if your target
+        # needs both ':c_api_experimental' and ':c_api_opaque' dependencies.
+        # We plan to remove 'c_api_opaque.h' from the list of exposed headers and
+        # will remove the dependency from this target on ':c_api_opaque' in the future.
         "c_api_opaque.h",
     ],
     copts = tflite_copts() + tflite_copts_warnings(),
@@ -113,7 +118,10 @@ cc_library_with_tflite_with_c_headers_test(
     tflite_deps = [
         ":c_api",
     ],
-    deps = ["//tensorflow/lite/core/c:c_api_experimental"],
+    deps = [
+        "//tensorflow/lite/core/c:c_api_experimental",
+        "//tensorflow/lite/core/c:c_api_opaque",
+    ],
 )
 
 # Same as ":c_api_experimental", but without linking in the default CreateOpResolver implementation.
@@ -121,6 +129,12 @@ cc_library_with_tflite_with_c_headers_test(
     name = "c_api_experimental_without_op_resolver",
     hdrs = [
         "c_api_experimental.h",
+        # DEPRECATED
+        # ':c_api_opaque_without_op_resolver' was promoted to the 'mostly stable' API.
+        # Please explicitly add a dependency on ':c_api_opaque_without_op_resolver' if your target
+        # needs both ':c_api_experimental_without_op_resolver' and ':c_api_opaque_without_op_resolver' dependencies.
+        # We plan to remove 'c_api_opaque.h' from the list of exposed headers and
+        # will remove the dependency from this target on ':c_api_opaque_without_op_resolver' in the future.
         "c_api_opaque.h",
     ],
     copts = tflite_copts() + tflite_copts_warnings(),
@@ -128,7 +142,10 @@ cc_library_with_tflite_with_c_headers_test(
     tflite_deps = [
         ":c_api_without_op_resolver",
     ],
-    deps = ["//tensorflow/lite/core/c:c_api_experimental_without_op_resolver"],
+    deps = [
+        "//tensorflow/lite/core/c:c_api_experimental_without_op_resolver",
+        "//tensorflow/lite/core/c:c_api_opaque_without_op_resolver",
+    ],
 )
 
 # Same as ":c_api_experimental", but without linking in the default CreateOpResolver implementation,
@@ -137,6 +154,50 @@ cc_library_with_tflite_with_c_headers_test(
     name = "c_api_experimental_without_op_resolver_without_alwayslink",
     hdrs = [
         "c_api_experimental.h",
+        # DEPRECATED
+        # ':c_api_opaque_without_op_resolver_without_alwayslink' was promoted to the 'mostly stable' API.
+        # Please explicitly add a dependency on ':c_api_opaque_without_op_resolver_without_alwayslink'
+        # if your target needs both ':c_api_experimental_without_op_resolver_without_alwayslink' and
+        # ':c_api_opaque_without_op_resolver_without_alwayslink' dependencies.
+        # We plan to remove 'c_api_opaque.h' from the list of exposed headers and
+        # will remove the dependency from this target on
+        # ':c_api_opaque_without_op_resolver_without_alwayslink' in the future.
+        "c_api_opaque.h",
+    ],
+    copts = tflite_copts() + tflite_copts_warnings(),
+    tags = ["allow_undefined_symbols"],  # For tflite::CreateOpResolver().
+    tflite_deps = [":c_api_without_op_resolver_without_alwayslink"],
+    deps = [
+        "//tensorflow/lite/core/c:c_api_experimental_without_op_resolver_without_alwayslink",
+        "//tensorflow/lite/core/c:c_api_opaque_without_op_resolver_without_alwayslink",
+    ],
+)
+
+cc_library_with_tflite_with_c_headers_test(
+    name = "c_api_opaque",
+    hdrs = [
+        "c_api_opaque.h",
+    ],
+    copts = tflite_copts() + tflite_copts_warnings(),
+    generate_opaque_delegate_target = True,
+    tflite_deps = [":c_api"],
+    deps = ["//tensorflow/lite/core/c:c_api_opaque"],
+)
+
+cc_library_with_tflite_with_c_headers_test(
+    name = "c_api_opaque_without_op_resolver",
+    hdrs = [
+        "c_api_opaque.h",
+    ],
+    copts = tflite_copts() + tflite_copts_warnings(),
+    tags = ["allow_undefined_symbols"],  # For tflite::CreateOpResolver().
+    tflite_deps = [":c_api_without_op_resolver"],
+    deps = ["//tensorflow/lite/core/c:c_api_opaque_without_op_resolver"],
+)
+
+cc_library_with_tflite_with_c_headers_test(
+    name = "c_api_opaque_without_op_resolver_without_alwayslink",
+    hdrs = [
         "c_api_opaque.h",
     ],
     copts = tflite_copts() + tflite_copts_warnings(),
@@ -144,7 +205,7 @@ cc_library_with_tflite_with_c_headers_test(
     tflite_deps = [
         ":c_api_without_op_resolver_without_alwayslink",
     ],
-    deps = ["//tensorflow/lite/core/c:c_api_experimental_without_op_resolver_without_alwayslink"],
+    deps = ["//tensorflow/lite/core/c:c_api_opaque_without_op_resolver_without_alwayslink"],
 )
 
 cc_library_with_tflite_with_c_headers_test(
@@ -392,6 +453,7 @@ cc_test(
         ":c_api_experimental",
         "//tensorflow/lite/core/c:c_api",
         "//tensorflow/lite/core/c:c_api_experimental",
+        "//tensorflow/lite/core/c:c_api_opaque",
         "//tensorflow/lite/core/c:c_api_types",
         "//tensorflow/lite/core/c:common",
     ],
diff --git a/tensorflow/lite/core/c/BUILD b/tensorflow/lite/core/c/BUILD
index c1218edace4b0d..468e05b3346869 100644
--- a/tensorflow/lite/core/c/BUILD
+++ b/tensorflow/lite/core/c/BUILD
@@ -316,6 +316,12 @@ tflite_cc_library_with_c_headers_test(
     name = "c_api_experimental",
     hdrs = [
         "c_api_experimental.h",
+        # DEPRECATED
+        # ':c_api_opaque' was promoted to the 'mostly stable' API.
+        # Please explicitly add a dependency on ':c_api_opaque' if your target
+        # needs both ':c_api_experimental' and ':c_api_opaque' dependencies.
+        # We plan to remove 'c_api_opaque.h' from the list of exposed headers and
+        # will remove the dependency from this target on ':c_api_opaque' in the future.
         "c_api_opaque.h",
     ],
     copts = tflite_copts(),
@@ -325,6 +331,7 @@ tflite_cc_library_with_c_headers_test(
     deps = [
         ":c_api",
         ":c_api_experimental_without_op_resolver",
+        ":c_api_opaque",
         ":c_api_types",
         ":common",
         "//tensorflow/lite:builtin_ops",
@@ -343,15 +350,21 @@ tflite_cc_library_with_c_headers_test(
     name = "c_api_experimental_without_op_resolver",
     srcs = [
         "c_api_experimental.cc",
-        "c_api_opaque.cc",
     ],
     hdrs = [
         "c_api_experimental.h",
+        # DEPRECATED
+        # ':c_api_opaque_without_op_resolver' was promoted to the 'mostly stable' API.
+        # Please explicitly add a dependency on ':c_api_opaque_without_op_resolver' if your target
+        # needs both ':c_api_experimental_without_op_resolver' and ':c_api_opaque_without_op_resolver' dependencies.
+        # We plan to remove 'c_api_opaque.h' from the list of exposed headers and
+        # will remove the dependency from this target on ':c_api_opaque_without_op_resolver' in the future.
         "c_api_opaque.h",
     ],
     copts = tflite_copts(),
     tags = ["allow_undefined_symbols"],  # For tflite::CreateOpResolver().
     deps = [
+        ":c_api_opaque_without_op_resolver",
         ":c_api_types",
         ":c_api_without_op_resolver",
         ":common",
@@ -387,15 +400,23 @@ tflite_cc_library_with_c_headers_test(
     name = "c_api_experimental_without_op_resolver_without_alwayslink",
     srcs = [
         "c_api_experimental.cc",
-        "c_api_opaque.cc",
     ],
     hdrs = [
         "c_api_experimental.h",
+        # DEPRECATED
+        # ':c_api_opaque_without_op_resolver_without_alwayslink' was promoted to the 'mostly stable' API.
+        # Please explicitly add a dependency on ':c_api_opaque_without_op_resolver_without_alwayslink'
+        # if your target needs both ':c_api_experimental_without_op_resolver_without_alwayslink' and
+        # ':c_api_opaque_without_op_resolver_without_alwayslink' dependencies.
+        # We plan to remove 'c_api_opaque.h' from the list of exposed headers and
+        # will remove the dependency from this target on
+        # ':c_api_opaque_without_op_resolver_without_alwayslink' in the future.
         "c_api_opaque.h",
     ],
     copts = tflite_copts(),
     tags = ["allow_undefined_symbols"],  # For tflite::CreateOpResolver().
     deps = [
+        ":c_api_opaque_without_op_resolver_without_alwayslink",
         ":c_api_types",
         ":c_api_without_op_resolver_without_alwayslink",
         ":common",
@@ -424,6 +445,97 @@ alias(
     ],
 )
 
+tflite_cc_library_with_c_headers_test(
+    name = "c_api_opaque",
+    hdrs = [
+        "c_api_opaque.h",
+    ],
+    copts = tflite_copts(),
+    visibility = [
+        "//tensorflow/lite:__subpackages__",
+    ] + c_api_experimental_visibility_allowlist(),
+    deps = [
+        ":c_api",
+        ":c_api_opaque_without_op_resolver",
+        ":c_api_types",
+        ":common",
+        "//tensorflow/lite:string_util",
+        "//tensorflow/lite:util",
+        "//tensorflow/lite/c:c_api_opaque_internal",
+        "//tensorflow/lite/core:framework",
+        "//tensorflow/lite/kernels:kernel_util",
+    ],
+)
+
+tflite_cc_library_with_c_headers_test(
+    name = "c_api_opaque_without_op_resolver",
+    srcs = [
+        "c_api_opaque.cc",
+    ],
+    hdrs = [
+        "c_api_opaque.h",
+    ],
+    copts = tflite_copts(),
+    tags = ["allow_undefined_symbols"],  # For tflite::CreateOpResolver().
+    deps = [
+        ":c_api_types",
+        ":c_api_without_op_resolver",
+        ":common",
+        "//tensorflow/lite:string_util",
+        "//tensorflow/lite:util",
+        "//tensorflow/lite/c:c_api_opaque_internal",
+        "//tensorflow/lite/core:framework",
+        "//tensorflow/lite/kernels:kernel_util",
+    ],
+)
+
+# This is a private target, its visibility is set to public only to be
+# used by "custom_c_library_with_tflite".
+# Do not use this target directly and don't consider it as a part of the public API.
+alias(
+    name = "private_c_api_opaque_without_op_resolver",
+    actual = ":c_api_opaque_without_op_resolver",
+    tags = ["avoid_dep"],
+    visibility = [
+        "//visibility:public",
+    ],
+)
+
+# Same as ":c_api_opaque_without_op_resolver", but without alwayslink=1.
+tflite_cc_library_with_c_headers_test(
+    name = "c_api_opaque_without_op_resolver_without_alwayslink",
+    srcs = [
+        "c_api_opaque.cc",
+    ],
+    hdrs = [
+        "c_api_opaque.h",
+    ],
+    copts = tflite_copts(),
+    tags = ["allow_undefined_symbols"],  # For tflite::CreateOpResolver().
+    deps = [
+        ":c_api_types",
+        ":c_api_without_op_resolver_without_alwayslink",
+        ":common",
+        "//tensorflow/lite:string_util",
+        "//tensorflow/lite:util",
+        "//tensorflow/lite/c:c_api_opaque_internal_without_alwayslink",
+        "//tensorflow/lite/core:framework",
+        "//tensorflow/lite/kernels:kernel_util",
+    ],
+)
+
+# This is a private target, its visibility is set to public only to be
+# used by "tflite_custom_c_library".
+# Do not use this target directly and don't consider it as a part of the public API.
+alias(
+    name = "private_c_api_opaque_without_op_resolver_without_alwayslink",
+    actual = ":c_api_opaque_without_op_resolver_without_alwayslink",
+    tags = ["avoid_dep"],
+    visibility = [
+        "//visibility:public",
+    ],
+)
+
 cc_test(
     name = "c_api_experimental_test",
     size = "small",
diff --git a/tensorflow/lite/core/c/c_api_opaque.h b/tensorflow/lite/core/c/c_api_opaque.h
index b06709cd61d872..b3999ee3fd3f0a 100644
--- a/tensorflow/lite/core/c/c_api_opaque.h
+++ b/tensorflow/lite/core/c/c_api_opaque.h
@@ -31,7 +31,11 @@ extern "C" {
 /// These APIs are accessors for TFLite Opaque Types.  These APIs are primarily
 /// intended to be used by delegates and custom OP implementations.
 ///
-/// WARNING: This is an experimental API and subject to change.
+/// This API is part of the TensorFlow Lite Extension APIs.
+/// We reserve the right to make changes to this API in future releases,
+/// potentially including non-backwards-compatible changes, on a different
+/// schedule than for the other TensorFlow Lite APIs. See
+/// https://www.tensorflow.org/guide/versions#separate_version_number_for_tensorflow_lite_extension_apis.
 
 /** \addtogroup c_api_opaque tensorflow/lite/c/c_api_opaque.h
  *  @{
@@ -392,7 +396,6 @@ TFL_CAPI_EXPORT TfLiteStatus TfLiteOpaqueContextGetNodeAndRegistration(
     TfLiteOpaqueNode** node,
     TfLiteRegistrationExternal** registration_external);
 
-/// WARNING: This is an experimental API and subject to change.
 /// Entry point for C API ReplaceNodeSubsetsWithDelegateKernels
 ///
 /// Replaces the specified `nodes_to_replace` that are associated with the

From d7c159b8784e217c132e58df159c17f6e286eb85 Mon Sep 17 00:00:00 2001
From: Hye Soo Yang <hyey@google.com>
Date: Mon, 2 Oct 2023 13:34:48 -0700
Subject: [PATCH 486/567] Add xla flag for
 `xla_sparse_core_minibatch_max_division_level`.

PiperOrigin-RevId: 570162883
---
 tensorflow/compiler/jit/flags.cc                             | 5 +++++
 tensorflow/compiler/jit/flags.h                              | 3 +++
 tensorflow/core/tpu/kernels/sparse_core_ops_utils.cc         | 3 ++-
 tensorflow/core/tpu/kernels/sparse_core_ops_utils.h          | 2 --
 tensorflow/core/tpu/kernels/sparse_core_xla_flags_defaults.h | 1 +
 5 files changed, 11 insertions(+), 3 deletions(-)

diff --git a/tensorflow/compiler/jit/flags.cc b/tensorflow/compiler/jit/flags.cc
index 46d185f79a0438..276ab1786a8260 100644
--- a/tensorflow/compiler/jit/flags.cc
+++ b/tensorflow/compiler/jit/flags.cc
@@ -179,6 +179,9 @@ void AppendMarkForCompilationPassFlagsInternal(std::vector<Flag>* flag_list) {
            &sparse_core_flags->tf_xla_sparse_core_disable_table_stacking,
            "Disable table stacking for all the tables passed to the SparseCore"
            "mid level API."),
+      Flag("tf_xla_sparse_core_minibatch_max_division_level",
+           &sparse_core_flags->tf_xla_sparse_core_minibatch_max_division_level,
+           "Max level of division to split input data into minibatches."),
       Flag("tf_xla_sparse_core_stacking_mem_limit_bytes",
            &sparse_core_flags->tf_xla_sparse_core_stacking_mem_limit_bytes,
            "If non-zero, limits the size of the activations for a given table"
@@ -248,6 +251,8 @@ void AllocateAndParseFlags() {
   device_flags->tf_xla_enable_xla_devices = false;
 
   sparse_core_flags = new XlaSparseCoreFlags;
+  sparse_core_flags->tf_xla_sparse_core_minibatch_max_division_level =
+      kDefaultSparseCoreMinibatchMaxDivisionLevel;
   sparse_core_flags->tf_xla_sparse_core_disable_table_stacking =
       kDefaultDisableTableStacking;
   sparse_core_flags->tf_xla_sparse_core_stacking_mem_limit_bytes =
diff --git a/tensorflow/compiler/jit/flags.h b/tensorflow/compiler/jit/flags.h
index ccda739133aaa7..04a15136d43072 100644
--- a/tensorflow/compiler/jit/flags.h
+++ b/tensorflow/compiler/jit/flags.h
@@ -115,6 +115,9 @@ struct MarkForCompilationPassFlags {
 
 // Flags associated with XLA Sparse Core.
 struct XlaSparseCoreFlags {
+  // Max level of division to split input data into minibatches.
+  int tf_xla_sparse_core_minibatch_max_division_level;
+
   // Disable table stacking for all the tables passed to the SparseCore
   // mid level API.
   bool tf_xla_sparse_core_disable_table_stacking;
diff --git a/tensorflow/core/tpu/kernels/sparse_core_ops_utils.cc b/tensorflow/core/tpu/kernels/sparse_core_ops_utils.cc
index d106449800153a..82ff674d0b483d 100644
--- a/tensorflow/core/tpu/kernels/sparse_core_ops_utils.cc
+++ b/tensorflow/core/tpu/kernels/sparse_core_ops_utils.cc
@@ -203,7 +203,8 @@ std::vector<std::vector<std::string>> GetTableStacks(
 }
 
 ABSL_ATTRIBUTE_WEAK int GetMinibatchMaxDivisionLevel() {
-  return kMinibatchMaxDivisionLevel;
+  XlaSparseCoreFlags* sparse_core_flags = GetXlaSparseCoreFlags();
+  return sparse_core_flags->tf_xla_sparse_core_minibatch_max_division_level;
 }
 
 ABSL_ATTRIBUTE_WEAK bool GetDisableTableStacking() {
diff --git a/tensorflow/core/tpu/kernels/sparse_core_ops_utils.h b/tensorflow/core/tpu/kernels/sparse_core_ops_utils.h
index c62e4ebb4c6b6b..e6a51d417119ce 100644
--- a/tensorflow/core/tpu/kernels/sparse_core_ops_utils.h
+++ b/tensorflow/core/tpu/kernels/sparse_core_ops_utils.h
@@ -28,8 +28,6 @@ limitations under the License.
 
 namespace tensorflow {
 
-constexpr int kMinibatchMaxDivisionLevel = 6;
-
 // Pad value used for SparseCore mini batching logic.
 const int32_t kXlaPadValue = std::numeric_limits<int32_t>::max();
 
diff --git a/tensorflow/core/tpu/kernels/sparse_core_xla_flags_defaults.h b/tensorflow/core/tpu/kernels/sparse_core_xla_flags_defaults.h
index 5b212fa729fe5b..42a258363a0bc5 100644
--- a/tensorflow/core/tpu/kernels/sparse_core_xla_flags_defaults.h
+++ b/tensorflow/core/tpu/kernels/sparse_core_xla_flags_defaults.h
@@ -20,6 +20,7 @@ limitations under the License.
 
 namespace tensorflow {
 
+constexpr int kDefaultSparseCoreMinibatchMaxDivisionLevel = 6;
 constexpr bool kDefaultDisableTableStacking = false;
 constexpr int64_t kDefaultXlaSparseCoreStackingMemLimit = 2097152;
 constexpr int64_t kDefaultXlaSparseCoreStackingTableShardLimit = 2147483648;

From bd5427c612480d96c0fba35cd2384797d35b2aa0 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 2 Oct 2023 13:38:40 -0700
Subject: [PATCH 487/567] Creates a new method -- CreateInputSpec() -- that
 constructs the appropriate HloSharding objects.

PiperOrigin-RevId: 570164031
---
 .../auto_sharding_dot_handler.cc              | 174 +++++++++---------
 1 file changed, 82 insertions(+), 92 deletions(-)

diff --git a/third_party/xla/xla/hlo/experimental/auto_sharding/auto_sharding_dot_handler.cc b/third_party/xla/xla/hlo/experimental/auto_sharding/auto_sharding_dot_handler.cc
index 90e545892aef94..fa0584330a5b87 100644
--- a/third_party/xla/xla/hlo/experimental/auto_sharding/auto_sharding_dot_handler.cc
+++ b/third_party/xla/xla/hlo/experimental/auto_sharding/auto_sharding_dot_handler.cc
@@ -109,6 +109,16 @@ class HandlerBase {
     return true;
   }
 
+  HloSharding CreateInputSpec(const HloInstruction* ins,
+                              const DimMap& dim_map) const {
+    std::vector<int64_t> tensor_dims, mesh_dims;
+    for (const auto& [tensor_dim, mesh_dim] : dim_map) {
+      tensor_dims.push_back(tensor_dim);
+      mesh_dims.push_back(mesh_dim);
+    }
+    return Tile(ins->shape(), tensor_dims, mesh_dims, device_mesh_);
+  }
+
   std::unique_ptr<StrategyVector>& strategies_;
   StrategyMap& strategy_map_;
   const HloInstruction* ins_;
@@ -149,8 +159,9 @@ class DotHandler : public HandlerBase {
     DCHECK_EQ(mesh_dims.size(), 2);
     for (int64_t i = 0; i < lhs_space_dims_.size(); ++i) {
       for (int64_t j = 0; j < rhs_space_dims_.size(); ++j) {
-        if (!CheckDims(lhs_, {{lhs_space_dims_[i], mesh_dims[0]}}) ||
-            !CheckDims(rhs_, {{rhs_space_dims_[j], mesh_dims[1]}}))
+        const DimMap lhs_dim_map = {{lhs_space_dims_[i], mesh_dims[0]}};
+        const DimMap rhs_dim_map = {{rhs_space_dims_[j], mesh_dims[1]}};
+        if (!CheckDims(lhs_, lhs_dim_map) || !CheckDims(rhs_, rhs_dim_map))
           continue;
         std::string name = absl::StrFormat("SS = SR x RS @ {%s}",
                                            absl::StrJoin(mesh_dims, ","));
@@ -160,10 +171,8 @@ class DotHandler : public HandlerBase {
                   space_base_dim_ +
                       static_cast<int64_t>(lhs_space_dims_.size()) + j},
                  mesh_dims, device_mesh_);
-        HloSharding lhs_spec = Tile(lhs_->shape(), {lhs_space_dims_[i]},
-                                    {mesh_dims[0]}, device_mesh_);
-        HloSharding rhs_spec = Tile(rhs_->shape(), {rhs_space_dims_[j]},
-                                    {mesh_dims[1]}, device_mesh_);
+        HloSharding lhs_spec = CreateInputSpec(lhs_, lhs_dim_map);
+        HloSharding rhs_spec = CreateInputSpec(rhs_, rhs_dim_map);
 
         AppendNewStrategy(name, output_spec, {lhs_spec, rhs_spec}, 0, 0);
       }
@@ -174,17 +183,15 @@ class DotHandler : public HandlerBase {
     DCHECK_EQ(mesh_dims.size(), 2);
     for (int64_t i = 0; i < lhs_space_dims_.size(); ++i) {
       for (int64_t j = i + 1; j < lhs_space_dims_.size(); ++j) {
-        if (!CheckDims(lhs_, {{lhs_space_dims_[i], mesh_dims[0]},
-                              {lhs_space_dims_[j], mesh_dims[1]}}))
-          continue;
+        const DimMap lhs_dim_map = {{lhs_space_dims_[i], mesh_dims[0]},
+                                    {lhs_space_dims_[j], mesh_dims[1]}};
+        if (!CheckDims(lhs_, lhs_dim_map)) continue;
         std::string name = absl::StrFormat("SSR = SSR x RR @ {%s}",
                                            absl::StrJoin(mesh_dims, ","));
         HloSharding output_spec =
             Tile(ins_->shape(), {space_base_dim_ + i, space_base_dim_ + j},
                  mesh_dims, device_mesh_);
-        HloSharding lhs_spec =
-            Tile(lhs_->shape(), {lhs_space_dims_[i], lhs_space_dims_[j]},
-                 mesh_dims, device_mesh_);
+        HloSharding lhs_spec = CreateInputSpec(lhs_, lhs_dim_map);
         HloSharding rhs_spec = HloSharding::Replicate();
 
         AppendNewStrategy(name, output_spec, {lhs_spec, rhs_spec}, 0, 0);
@@ -196,9 +203,9 @@ class DotHandler : public HandlerBase {
     DCHECK_EQ(mesh_dims.size(), 2);
     for (int64_t i = 0; i < rhs_space_dims_.size(); ++i) {
       for (int64_t j = i + 1; j < rhs_space_dims_.size(); ++j) {
-        if (!CheckDims(rhs_, {{rhs_space_dims_[i], mesh_dims[0]},
-                              {rhs_space_dims_[j], mesh_dims[1]}}))
-          continue;
+        const DimMap rhs_dim_map = {{rhs_space_dims_[i], mesh_dims[0]},
+                                    {rhs_space_dims_[j], mesh_dims[1]}};
+        if (!CheckDims(rhs_, rhs_dim_map)) continue;
         std::string name = absl::StrFormat("RSS = RR x RSS @ {%s}",
                                            absl::StrJoin(mesh_dims, ","));
         HloSharding output_spec = Tile(
@@ -208,9 +215,7 @@ class DotHandler : public HandlerBase {
                  j},
             mesh_dims, device_mesh_);
         HloSharding lhs_spec = HloSharding::Replicate();
-        HloSharding rhs_spec =
-            Tile(rhs_->shape(), {rhs_space_dims_[i], rhs_space_dims_[j]},
-                 mesh_dims, device_mesh_);
+        HloSharding rhs_spec = CreateInputSpec(rhs_, rhs_dim_map);
 
         AppendNewStrategy(name, output_spec, {lhs_spec, rhs_spec}, 0, 0);
       }
@@ -226,17 +231,15 @@ class DotHandler : public HandlerBase {
                           absl::StrJoin(mesh_dims, ","), mesh_dims[1]);
       for (int64_t i = 0; i < lhs_space_dims_.size(); ++i) {
         for (int64_t j = 0; j < lhs_con_dims_.size(); ++j) {
-          if (!CheckDims(lhs_, {{lhs_space_dims_[i], mesh_dims[0]},
-                                {lhs_con_dims_[j], mesh_dims[1]}}))
-            continue;
+          const DimMap lhs_dim_map = {{lhs_space_dims_[i], mesh_dims[0]},
+                                      {lhs_con_dims_[j], mesh_dims[1]}};
+          const DimMap rhs_dim_map = {{rhs_con_dims_[j], mesh_dims[1]}};
+          if (!CheckDims(lhs_, lhs_dim_map)) continue;
 
           HloSharding output_spec = Tile(ins_->shape(), {space_base_dim_ + i},
                                          {mesh_dims[0]}, device_mesh_);
-          HloSharding lhs_spec =
-              Tile(lhs_->shape(), {lhs_space_dims_[i], lhs_con_dims_[j]},
-                   mesh_dims, device_mesh_);
-          HloSharding rhs_spec = Tile(rhs_->shape(), {rhs_con_dims_[j]},
-                                      {mesh_dims[1]}, device_mesh_);
+          HloSharding lhs_spec = CreateInputSpec(lhs_, lhs_dim_map);
+          HloSharding rhs_spec = CreateInputSpec(rhs_, rhs_dim_map);
 
           double memory_cost = GetBytes(ins_->shape()) / output_spec.NumTiles();
           double communication_cost =
@@ -256,19 +259,18 @@ class DotHandler : public HandlerBase {
                           absl::StrJoin(mesh_dims, ","), mesh_dims[0]);
       for (int64_t i = 0; i < rhs_space_dims_.size(); ++i) {
         for (int64_t j = 0; j < lhs_con_dims_.size(); ++j) {
-          if (!CheckDims(rhs_, {{rhs_space_dims_[i], mesh_dims[1]}}) ||
-              !CheckDims(lhs_, {{lhs_con_dims_[j], mesh_dims[0]}}))
+          const DimMap rhs_dim_map = {{rhs_space_dims_[i], mesh_dims[1]},
+                                      {rhs_con_dims_[j], mesh_dims[0]}};
+          const DimMap lhs_dim_map = {{lhs_con_dims_[j], mesh_dims[0]}};
+          if (!CheckDims(lhs_, lhs_dim_map) || !CheckDims(rhs_, rhs_dim_map))
             continue;
           HloSharding output_spec =
               Tile(ins_->shape(),
                    {space_base_dim_ +
                     static_cast<int64_t>(lhs_space_dims_.size()) + i},
                    {mesh_dims[1]}, device_mesh_);
-          HloSharding lhs_spec = Tile(lhs_->shape(), {lhs_con_dims_[j]},
-                                      {mesh_dims[0]}, device_mesh_);
-          HloSharding rhs_spec =
-              Tile(rhs_->shape(), {rhs_con_dims_[j], rhs_space_dims_[i]},
-                   mesh_dims, device_mesh_);
+          HloSharding lhs_spec = CreateInputSpec(lhs_, lhs_dim_map);
+          HloSharding rhs_spec = CreateInputSpec(rhs_, rhs_dim_map);
           double memory_cost = GetBytes(ins_->shape()) / output_spec.NumTiles();
           double communication_cost =
               cluster_env_.AllReduceCost(memory_cost, mesh_dims[0]);
@@ -285,13 +287,13 @@ class DotHandler : public HandlerBase {
                          [](int64_t size) { return size > 1; }) == 1) {
       for (int64_t i = 0; i < lhs_batch_dims_.size(); ++i) {
         for (int64_t j = 0; j < device_mesh_.num_dimensions(); ++j) {
-          if (!CheckDims(lhs_, {{lhs_batch_dims_[i], j}})) continue;
+          const DimMap lhs_dim_map = {{lhs_batch_dims_[i], j}};
+          const DimMap rhs_dim_map = {{rhs_batch_dims_[i], j}};
+          if (!CheckDims(lhs_, lhs_dim_map)) continue;
           std::string name = absl::StrFormat("Sb_%d = Sb x Sb @ {%d}", i, j);
           HloSharding output_spec = Tile(ins_->shape(), {i}, {j}, device_mesh_);
-          HloSharding lhs_spec =
-              Tile(lhs_->shape(), {lhs_batch_dims_[i]}, {j}, device_mesh_);
-          HloSharding rhs_spec =
-              Tile(rhs_->shape(), {rhs_batch_dims_[i]}, {j}, device_mesh_);
+          HloSharding lhs_spec = CreateInputSpec(lhs_, lhs_dim_map);
+          HloSharding rhs_spec = CreateInputSpec(rhs_, rhs_dim_map);
 
           AppendNewStrategy(name, output_spec, {lhs_spec, rhs_spec}, 0, 0);
         }
@@ -303,19 +305,17 @@ class DotHandler : public HandlerBase {
     DCHECK_EQ(mesh_dims.size(), 2);
     if (lhs_batch_dims_.size() == 2 && device_mesh_.dim(mesh_dims[0]) > 1 &&
         device_mesh_.dim(mesh_dims[1]) > 1) {
-      if (!CheckDims(lhs_, {{lhs_batch_dims_[0], mesh_dims[0]},
-                            {lhs_batch_dims_[1], mesh_dims[1]}}))
-        return;
+      const DimMap lhs_dim_map = {{lhs_batch_dims_[0], mesh_dims[0]},
+                                  {lhs_batch_dims_[1], mesh_dims[1]}};
+      const DimMap rhs_dim_map = {{rhs_batch_dims_[0], mesh_dims[0]},
+                                  {rhs_batch_dims_[1], mesh_dims[1]}};
+      if (!CheckDims(lhs_, lhs_dim_map)) return;
       std::string name =
           absl::StrFormat("Sb = Sb x Sb @ {%s}", absl::StrJoin(mesh_dims, ","));
       HloSharding output_spec =
           Tile(ins_->shape(), {0, 1}, mesh_dims, device_mesh_);
-      HloSharding lhs_spec =
-          Tile(lhs_->shape(), {lhs_batch_dims_[0], lhs_batch_dims_[1]},
-               mesh_dims, device_mesh_);
-      HloSharding rhs_spec =
-          Tile(rhs_->shape(), {rhs_batch_dims_[0], rhs_batch_dims_[1]},
-               mesh_dims, device_mesh_);
+      HloSharding lhs_spec = CreateInputSpec(lhs_, lhs_dim_map);
+      HloSharding rhs_spec = CreateInputSpec(rhs_, rhs_dim_map);
       AppendNewStrategy(name, output_spec, {lhs_spec, rhs_spec}, 0, 0);
     }
   }
@@ -328,16 +328,14 @@ class DotHandler : public HandlerBase {
                                          absl::StrJoin(mesh_dims, ","));
       for (int64_t i = 0; i < lhs_space_dims_.size(); ++i) {
         for (int64_t j = 0; j < lhs_batch_dims_.size(); ++j) {
-          if (!CheckDims(lhs_, {{lhs_space_dims_[i], mesh_dims[0]},
-                                {lhs_batch_dims_[j], mesh_dims[1]}}))
-            continue;
+          const DimMap lhs_dim_map = {{lhs_space_dims_[i], mesh_dims[1]},
+                                      {lhs_batch_dims_[j], mesh_dims[0]}};
+          const DimMap rhs_dim_map = {{rhs_batch_dims_[j], mesh_dims[0]}};
+          if (!CheckDims(lhs_, lhs_dim_map)) continue;
           HloSharding output_spec = Tile(
               ins_->shape(), {j, space_base_dim_ + i}, mesh_dims, device_mesh_);
-          HloSharding lhs_spec =
-              Tile(lhs_->shape(), {lhs_batch_dims_[j], lhs_space_dims_[i]},
-                   mesh_dims, device_mesh_);
-          HloSharding rhs_spec = Tile(rhs_->shape(), {rhs_batch_dims_[j]},
-                                      {mesh_dims[0]}, device_mesh_);
+          HloSharding lhs_spec = CreateInputSpec(lhs_, lhs_dim_map);
+          HloSharding rhs_spec = CreateInputSpec(rhs_, rhs_dim_map);
 
           AppendNewStrategy(name, output_spec, {lhs_spec, rhs_spec}, 0, 0);
         }
@@ -353,19 +351,18 @@ class DotHandler : public HandlerBase {
                                          absl::StrJoin(mesh_dims, ","));
       for (int64_t i = 0; i < rhs_space_dims_.size(); ++i) {
         for (int64_t j = 0; j < lhs_batch_dims_.size(); ++j) {
-          if (!CheckDims(rhs_, {{rhs_space_dims_[i], mesh_dims[1]}}) ||
-              !CheckDims(lhs_, {{lhs_batch_dims_[j], mesh_dims[0]}}))
+          const DimMap rhs_dim_map = {{rhs_space_dims_[i], mesh_dims[1]},
+                                      {rhs_batch_dims_[j], mesh_dims[0]}};
+          const DimMap lhs_dim_map = {{lhs_batch_dims_[j], mesh_dims[0]}};
+          if (!CheckDims(lhs_, lhs_dim_map) || !CheckDims(rhs_, rhs_dim_map))
             continue;
           HloSharding output_spec =
               Tile(ins_->shape(),
                    {j, space_base_dim_ +
                            static_cast<int64_t>(lhs_space_dims_.size()) + i},
                    mesh_dims, device_mesh_);
-          HloSharding lhs_spec = Tile(lhs_->shape(), {lhs_batch_dims_[j]},
-                                      {mesh_dims[0]}, device_mesh_);
-          HloSharding rhs_spec =
-              Tile(rhs_->shape(), {rhs_batch_dims_[j], rhs_space_dims_[i]},
-                   mesh_dims, device_mesh_);
+          HloSharding lhs_spec = CreateInputSpec(lhs_, lhs_dim_map);
+          HloSharding rhs_spec = CreateInputSpec(rhs_, rhs_dim_map);
 
           AppendNewStrategy(name, output_spec, {lhs_spec, rhs_spec}, 0, 0);
         }
@@ -382,17 +379,14 @@ class DotHandler : public HandlerBase {
                           absl::StrJoin(mesh_dims, ","), mesh_dims[1]);
       for (int64_t i = 0; i < lhs_con_dims_.size(); ++i) {
         for (int64_t j = 0; j < lhs_batch_dims_.size(); ++j) {
-          if (!CheckDims(lhs_, {{lhs_con_dims_[i], mesh_dims[1]},
-                                {lhs_batch_dims_[j], mesh_dims[0]}}))
-            continue;
+          const DimMap lhs_dim_map = {{lhs_con_dims_[i], mesh_dims[1]},
+                                      {lhs_batch_dims_[j], mesh_dims[0]}};
+          const DimMap rhs_dim_map = {{rhs_batch_dims_[j], mesh_dims[0]}};
+          if (!CheckDims(lhs_, lhs_dim_map)) continue;
           HloSharding output_spec =
               Tile(ins_->shape(), {j}, {mesh_dims[0]}, device_mesh_);
-          HloSharding lhs_spec =
-              Tile(lhs_->shape(), {lhs_batch_dims_[j], lhs_con_dims_[i]},
-                   mesh_dims, device_mesh_);
-          HloSharding rhs_spec =
-              Tile(rhs_->shape(), {rhs_batch_dims_[j], rhs_con_dims_[i]},
-                   mesh_dims, device_mesh_);
+          HloSharding lhs_spec = CreateInputSpec(lhs_, lhs_dim_map);
+          HloSharding rhs_spec = CreateInputSpec(rhs_, rhs_dim_map);
           double memory_cost = GetBytes(ins_->shape()) / output_spec.NumTiles();
           double communication_cost =
               cluster_env_.AllReduceCost(memory_cost, mesh_dims[1]);
@@ -415,18 +409,15 @@ class DotHandler : public HandlerBase {
           absl::StrJoin(mesh_dims, ","), absl::StrJoin(mesh_dims, ", "));
       for (int64_t i = 0; i < lhs_con_dims_.size(); ++i) {
         for (int64_t j = i + 1; j < lhs_con_dims_.size(); ++j) {
-          if (!CheckDims(lhs_, {{lhs_con_dims_[i], mesh_dims[0]},
-                                {lhs_con_dims_[j], mesh_dims[1]}}) ||
-              !CheckDims(rhs_, {{rhs_con_dims_[i], mesh_dims[0]},
-                                {rhs_con_dims_[j], mesh_dims[1]}}))
+          const DimMap lhs_dim_map = {{lhs_con_dims_[i], mesh_dims[0]},
+                                      {lhs_con_dims_[j], mesh_dims[1]}};
+          const DimMap rhs_dim_map = {{rhs_con_dims_[i], mesh_dims[0]},
+                                      {rhs_con_dims_[j], mesh_dims[1]}};
+          if (!CheckDims(lhs_, lhs_dim_map) || !CheckDims(rhs_, rhs_dim_map))
             continue;
           HloSharding output_spec = HloSharding::Replicate();
-          HloSharding lhs_spec =
-              Tile(lhs_->shape(), {lhs_con_dims_[i], lhs_con_dims_[j]},
-                   mesh_dims, device_mesh_);
-          HloSharding rhs_spec =
-              Tile(rhs_->shape(), {rhs_con_dims_[i], rhs_con_dims_[j]},
-                   mesh_dims, device_mesh_);
+          HloSharding lhs_spec = CreateInputSpec(lhs_, lhs_dim_map);
+          HloSharding rhs_spec = CreateInputSpec(rhs_, rhs_dim_map);
           double memory_cost = GetBytes(ins_->shape()) / output_spec.NumTiles();
           double communication_cost = cluster_env_.AllReduceCost(
               memory_cost, mesh_dims[0], mesh_dims[1]);
@@ -444,12 +435,12 @@ class DotHandler : public HandlerBase {
       std::string name = absl::StrFormat("RR = RS x SR @ {%d} (allreduce @ %d)",
                                          mesh_dims[0], mesh_dims[0]);
       for (int64_t i = 0; i < lhs_con_dims_.size(); ++i) {
-        if (!CheckDims(lhs_, {{lhs_con_dims_[i], mesh_dims[0]}})) continue;
+        const DimMap lhs_dim_map = {{lhs_con_dims_[i], mesh_dims[0]}};
+        const DimMap rhs_dim_map = {{rhs_con_dims_[i], mesh_dims[0]}};
+        if (!CheckDims(lhs_, lhs_dim_map)) continue;
         HloSharding output_spec = HloSharding::Replicate();
-        HloSharding lhs_spec = Tile(lhs_->shape(), {lhs_con_dims_[i]},
-                                    {mesh_dims[0]}, device_mesh_);
-        HloSharding rhs_spec = Tile(rhs_->shape(), {rhs_con_dims_[i]},
-                                    {mesh_dims[0]}, device_mesh_);
+        HloSharding lhs_spec = CreateInputSpec(lhs_, lhs_dim_map);
+        HloSharding rhs_spec = CreateInputSpec(rhs_, rhs_dim_map);
         double memory_cost = GetBytes(ins_->shape()) / output_spec.NumTiles();
         double compute_cost =
             cluster_env_.DotCost(lhs_->shape(), rhs_->shape(), dot_dnums_);
@@ -521,17 +512,19 @@ class DotHandler : public HandlerBase {
                          [](int64_t size) { return size > 1; }) > 1) {
       int mesh_dim = 0;
       for (int64_t i = 0; i < lhs_batch_dims_.size(); ++i) {
-          if (!CheckDims(lhs_, {{lhs_batch_dims_[i], mesh_dim}}) ||
-              !CheckDims(rhs_, {{rhs_batch_dims_[i], mesh_dim}}))
+          const DimMap lhs_dim_map = {{lhs_batch_dims_[i], mesh_dim}};
+          const DimMap rhs_dim_map = {{rhs_batch_dims_[i], mesh_dim}};
+          if (!CheckDims(lhs_, lhs_dim_map) || !CheckDims(rhs_, rhs_dim_map))
           continue;
           std::string name =
               absl::StrFormat("Sb_%d = Sb x Sb @ {%d} 1d", i, mesh_dim);
           HloSharding output_spec =
               Tile(ins_->shape(), {i}, {mesh_dim}, device_mesh_1d_);
+          // Tile operands on the 1D mesh, same as output_spec.
           HloSharding lhs_spec = Tile(lhs_->shape(), {lhs_batch_dims_[i]},
                                       {mesh_dim}, device_mesh_1d_);
           HloSharding rhs_spec = Tile(rhs_->shape(), {rhs_batch_dims_[i]},
                                       {mesh_dim}, device_mesh_1d_);
           AppendNewStrategy(name, output_spec, {lhs_spec, rhs_spec}, 0, 0);
       }
     }

From d9af31c0a6f3750eeeb57bc01e9736396fb87861 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tam=C3=A1s=20Danyluk?= <tdanyluk@google.com>
Date: Mon, 2 Oct 2023 13:59:00 -0700
Subject: [PATCH 488/567] [XLA:GPU] Add the flag for xla_gpu_mock_custom_calls

PiperOrigin-RevId: 570170696
---
 third_party/xla/xla/debug_options_flags.cc | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/third_party/xla/xla/debug_options_flags.cc b/third_party/xla/xla/debug_options_flags.cc
index ad67367e338306..1cbfc25daa9302 100644
--- a/third_party/xla/xla/debug_options_flags.cc
+++ b/third_party/xla/xla/debug_options_flags.cc
@@ -1276,6 +1276,11 @@ void MakeDebugOptionsFlags(std::vector<tsl::Flag>* flag_list,
                 debug_options->xla_gpu_cublas_fallback(),
                 "Allow Triton GEMM autotuning to fall back to cuBLAS when that "
                 "is faster."));
+  flag_list->push_back(
+      tsl::Flag("xla_gpu_mock_custom_calls",
+                bool_setter_for(&DebugOptions::set_xla_gpu_mock_custom_calls),
+                debug_options->xla_gpu_mock_custom_calls(),
+                "Replace custom calls with noop operations."));
   flag_list->push_back(tsl::Flag(
       "xla_gpu_enable_while_loop_double_buffering",
       bool_setter_for(

From 54b55af7a69d212d88cc43d2a4f73fe69fd3d569 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 2 Oct 2023 14:07:17 -0700
Subject: [PATCH 489/567] Added a new reference type.

PiperOrigin-RevId: 570173600
---
 tensorflow/compiler/mlir/tensorflow/ir/tf_op_base.td | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_op_base.td b/tensorflow/compiler/mlir/tensorflow/ir/tf_op_base.td
index b1805ce7167bb9..56df36b0ac1deb 100644
--- a/tensorflow/compiler/mlir/tensorflow/ir/tf_op_base.td
+++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_op_base.td
@@ -510,6 +510,7 @@ def TF_FpOrI32OrI64Tensor : TensorOf<[TF_Float, TF_I32OrI64]>;
 def TF_IntOrFpTensor : TensorOf<[TF_Int, TF_Float]>;
 def TF_SintOrFpTensor : TensorOf<[TF_SInt, TF_Float]>;
 def TF_FpOrComplexTensor : TensorOf<[TF_Float, TF_Complex]>;
+def TF_BoolOrFpTensor: TensorOf<[TF_Bool, TF_Float]>;
 
 def TF_Number : AnyTypeOf<
   [TF_Int, TF_Float, TF_Quantized, TF_Complex], "number">;

From 4184aa8e64b65fcec7cde71678b66f6c96bdb9ad Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 2 Oct 2023 14:10:18 -0700
Subject: [PATCH 490/567] Enable UniqueV2Op in MLIR

PiperOrigin-RevId: 570174493
---
 .../mlir/tensorflow/ir/tf_generated_ops.td    | 66 +++++++++++++++++++
 .../tests/legalize-tf-prefer-tf2xla.mlir      | 17 +++++
 .../transforms/legalization_op_config_test.cc |  2 +-
 3 files changed, 84 insertions(+), 1 deletion(-)

diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td b/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td
index ecd8c234eff40d..55baef4feaead8 100644
--- a/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td
+++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td
@@ -20710,6 +20710,72 @@ idx ==> [0, 1, 2, 3, 4, 4, 0, 1]
   TF_DerivedResultTypeAttr out_idx = TF_DerivedResultTypeAttr<1>;
 }
 
+def TF_UniqueV2Op : TF_Op<"UniqueV2", [Pure]> {
+  let summary = "Finds unique elements along an axis of a tensor.";
+
+  let description = [{
+This operation returns a tensor `y` containing unique elements
+along the `axis` of a tensor. The returned unique elements are sorted
+in the same order as they occur along `axis` in `x`.
+This operation also returns a tensor `idx` that is the same size as
+the number of the elements in `x` along the `axis` dimension. It
+contains the index in the unique output `y`.
+In other words, for a `1-D` tensor `x` with `axis = None`:
+
+`y[idx[i]] = x[i] for i in [0, 1,...,size(x) - 1]`
+
+For example:
+
+```
+# tensor 'x' is [1, 1, 2, 4, 4, 4, 7, 8, 8]
+y, idx = unique(x)
+y ==> [1, 2, 4, 7, 8]
+idx ==> [0, 0, 1, 2, 2, 2, 3, 4, 4]
+```
+
+For a `2-D` tensor `x` with `axis = 0`:
+
+```
+# tensor 'x' is [[1, 0, 0],
+#                [1, 0, 0],
+#                [2, 0, 0]]
+y, idx = unique(x, axis=0)
+y ==> [[1, 0, 0],
+       [2, 0, 0]]
+idx ==> [0, 0, 1]
+```
+
+For a `2-D` tensor `x` with `axis = 1`:
+
+```
+# tensor 'x' is [[1, 0, 0],
+#                [1, 0, 0],
+#                [2, 0, 0]]
+y, idx = unique(x, axis=1)
+y ==> [[1, 0],
+       [1, 0],
+       [2, 0]]
+idx ==> [0, 1, 1]
+```
+  }];
+
+  let arguments = (ins
+    Arg<TF_Tensor, [{A `Tensor`.}]>:$x,
+    Arg<TF_I32OrI64Tensor, [{A `Tensor` of type `int32` (default: None). The axis of the Tensor to
+find the unique elements.}]>:$axis
+  );
+
+  let results = (outs
+    Res<TF_Tensor, [{A `Tensor`. Unique elements along the `axis` of `Tensor` x.}]>:$y,
+    Res<TF_I32OrI64Tensor, [{A 1-D Tensor. Has the same type as x that contains the index of each
+value of x in the output y.}]>:$idx
+  );
+
+  TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>;
+  TF_DerivedOperandTypeAttr Taxis = TF_DerivedOperandTypeAttr<1>;
+  TF_DerivedResultTypeAttr out_idx = TF_DerivedResultTypeAttr<1>;
+}
+
 def TF_UnpackOp : TF_Op<"Unpack", [Pure]> {
   let summary = [{
 Unpacks a given dimension of a rank-`R` tensor into `num` rank-`(R-1)` tensors.
diff --git a/tensorflow/compiler/mlir/tf2xla/tests/legalize-tf-prefer-tf2xla.mlir b/tensorflow/compiler/mlir/tf2xla/tests/legalize-tf-prefer-tf2xla.mlir
index 4c0c000233ab91..c1bff70e2e4ff6 100644
--- a/tensorflow/compiler/mlir/tf2xla/tests/legalize-tf-prefer-tf2xla.mlir
+++ b/tensorflow/compiler/mlir/tf2xla/tests/legalize-tf-prefer-tf2xla.mlir
@@ -210,4 +210,21 @@ func.func @fused_conv2d(%input: tensor<1x300x300x40xi8>,
   func.return %conv2d : tensor<1x300x300x40xi8>
 }
 
+//===----------------------------------------------------------------------===//
+// tf.UniqueV2 legalization
+//===----------------------------------------------------------------------===//
+
+// -----
+
+// CHECK-LABEL: @doesnt_legalize_uniquev2
+func.func @doesnt_legalize_uniquev2(%arg0: tensor<3xf32>) -> (tensor<?xf32>, tensor<3xi32>) {
+  //CHECK:        %0 = mhlo.constant dense<0> : tensor<1xi32>
+  //CHECK-NEXT:   %y, %idx = "tf.UniqueV2"(%arg0, %0) {device = ""} : (tensor<3xf32>, tensor<1xi32>) -> (tensor<?xf32>, tensor<3xi32>)
+  //CHECK-NEXT:   return %y, %idx : tensor<?xf32>, tensor<3xi32>
+
+  %cst = "tf.Const"() {device = "", value = dense<0> : tensor<1xi32>} : () -> tensor<1xi32>
+  %y, %idx = "tf.UniqueV2"(%arg0, %cst) {device = ""} : (tensor<3xf32>, tensor<1xi32>) -> (tensor<?xf32>, tensor<3xi32>)
+  func.return %y, %idx : tensor<?xf32>, tensor<3xi32>
+}
+
 }
diff --git a/tensorflow/compiler/mlir/tf2xla/transforms/legalization_op_config_test.cc b/tensorflow/compiler/mlir/tf2xla/transforms/legalization_op_config_test.cc
index 7453cdb042e515..fa3350bb59a2f0 100644
--- a/tensorflow/compiler/mlir/tf2xla/transforms/legalization_op_config_test.cc
+++ b/tensorflow/compiler/mlir/tf2xla/transforms/legalization_op_config_test.cc
@@ -131,7 +131,7 @@ TEST_F(LegalizationOpConfigTest, CountLoweringsSet) {
   // a new op, we should expect these to change too.
   EXPECT_EQ(mlir_lowering_count, 68);
   EXPECT_EQ(tf2xla_fallback_count, 313);
-  EXPECT_EQ(non_categorized_count, 419);
+  EXPECT_EQ(non_categorized_count, 420);
 }
 
 // Just a counter test to see which ops have duplicate lowerings. This isn't a

From 61f4f2f9a0e4549309f112f7479c3b2d14a2d6b4 Mon Sep 17 00:00:00 2001
From: Haoyu Zhang <haoyuzhang@google.com>
Date: Mon, 2 Oct 2023 14:11:14 -0700
Subject: [PATCH 491/567] Fix error message in function attribute parser.

PiperOrigin-RevId: 570174801
---
 tensorflow/python/eager/polymorphic_function/attributes.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/python/eager/polymorphic_function/attributes.py b/tensorflow/python/eager/polymorphic_function/attributes.py
index f9eeab0c1df6f0..2e9be8a94c9c98 100644
--- a/tensorflow/python/eager/polymorphic_function/attributes.py
+++ b/tensorflow/python/eager/polymorphic_function/attributes.py
@@ -150,7 +150,7 @@ def _parse_func_attr_value(key, value):
     return attr_value_pb2.AttrValue(list=list_value)
   else:
     raise ValueError(
-        f"Attribute {key} must be bool, int, float, string, list, or"
+        f"Attribute {key} must be bool, int, float, string, list, or "
         f"AttrValue. Got {type(value)}."
     )
 

From b439a7f42ca8ea9e8c24be2945382918e126bd0c Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 2 Oct 2023 14:42:17 -0700
Subject: [PATCH 492/567] Allow quantized tensor as the input for
 mhlo.transpose.

PiperOrigin-RevId: 570183739
---
 .../xla/xla/mlir_hlo/mhlo/IR/hlo_ops.td       |  2 +-
 .../Dialect/mhlo/canonicalize/transpose.mlir  | 39 +++++++++++++++++++
 2 files changed, 40 insertions(+), 1 deletion(-)

diff --git a/third_party/xla/xla/mlir_hlo/mhlo/IR/hlo_ops.td b/third_party/xla/xla/mlir_hlo/mhlo/IR/hlo_ops.td
index 6ef32579142967..94d72f5b4c7121 100644
--- a/third_party/xla/xla/mlir_hlo/mhlo/IR/hlo_ops.td
+++ b/third_party/xla/xla/mlir_hlo/mhlo/IR/hlo_ops.td
@@ -3114,7 +3114,7 @@ def MHLO_TraceOp: MHLO_Op<"trace", []> {
 }
 
 def MHLO_TransposeOp: MHLO_ShapedInterfaceOp<"transpose",
-      [Pure, SameOperandsAndResultElementType,
+      [Pure, HLO_CompatibleOperandsAndResultElementType,
       DeclareOpInterfaceMethods<InferTypeOpInterface>]> {
   let summary = "Transpose operation";
   let description = [{
diff --git a/third_party/xla/xla/mlir_hlo/tests/Dialect/mhlo/canonicalize/transpose.mlir b/third_party/xla/xla/mlir_hlo/tests/Dialect/mhlo/canonicalize/transpose.mlir
index 38f2cce820d309..2b4b8cd9982a23 100644
--- a/third_party/xla/xla/mlir_hlo/tests/Dialect/mhlo/canonicalize/transpose.mlir
+++ b/third_party/xla/xla/mlir_hlo/tests/Dialect/mhlo/canonicalize/transpose.mlir
@@ -120,3 +120,42 @@ func.func @broadcast_transpose_multi_dim(%arg0 : tensor<95x64xf32>) -> tensor<5x
     // CHECK-NEXT: return [[RET]]
     func.return %1 : tensor<5x64x31x95xf32>
 }
+
+// -----
+
+// CHECK-LABEL: func @transpose_splat_constant_quantized_per_tensor
+// CHECK-NOT: mhlo.transpose
+func.func @transpose_splat_constant_quantized_per_tensor() -> tensor<5x10x!quant.uniform<i8:f32, 2.000000e+0:16>> {
+  %cst = mhlo.constant() {value = dense<42> : tensor<10x5xi8>} : () -> tensor<10x5x!quant.uniform<i8:f32, 2.000000e+0:16>>
+  %0 = "mhlo.transpose"(%cst) {permutation = dense<[1, 0]> : tensor<2xi64>} : (tensor<10x5x!quant.uniform<i8:f32, 2.000000e+0:16>>) -> tensor<5x10x!quant.uniform<i8:f32, 2.000000e+0:16>>
+  // CHECK-NEXT: [[CST:%.+]] = mhlo.constant
+  // CHECK-SAME: tensor<5x10x!quant.uniform<i8:f32, 2.000000e+00:16>>
+  // CHECK-NEXT: return [[CST]]
+  func.return %0 : tensor<5x10x!quant.uniform<i8:f32, 2.000000e+0:16>>
+}
+
+// -----
+
+// CHECK-LABEL: func @transpose_splat_constant_quantized_per_axis
+// CHECK-NOT: mhlo.transpose
+func.func @transpose_splat_constant_quantized_per_axis() -> tensor<2x10x!quant.uniform<i8:f32:0, {2.000000e+0:16,3.000000e+0:32}>> {
+  %cst = mhlo.constant() {value = dense<42> : tensor<10x2xi8>} : () -> tensor<10x2x!quant.uniform<i8:f32:1, {2.000000e+0:16,3.000000e+0:32}>>
+  %0 = "mhlo.transpose"(%cst) {permutation = dense<[1, 0]> : tensor<2xi64>} : (tensor<10x2x!quant.uniform<i8:f32:1, {2.000000e+0:16,3.000000e+0:32}>>) -> tensor<2x10x!quant.uniform<i8:f32:0, {2.000000e+0:16,3.000000e+0:32}>>
+  // CHECK-NEXT: [[CST:%.+]] = mhlo.constant
+  // CHECK-SAME: tensor<2x10x!quant.uniform<i8:f32:0, {2.000000e+00:16,3.000000e+00:32}>>
+  // CHECK-NEXT: return [[CST]]
+  func.return %0 : tensor<2x10x!quant.uniform<i8:f32:0, {2.000000e+0:16,3.000000e+0:32}>>
+}
+
+// -----
+
+// Cannot fold non-splat tensors (quantized or not)
+// CHECK-LABEL: func @nofold_nonsplat_quant_constant
+func.func @nofold_nonsplat_quant_constant() -> tensor<4x2x!quant.uniform<i8:f32, 2.000000e+0:16>> {
+  %cst = mhlo.constant() {value = dense<[[1, 2, 3, 4],[5, 6, 7, 8]]> : tensor<2x4xi8>} : () -> tensor<2x4x!quant.uniform<i8:f32, 2.000000e+0:16>>
+  %0 = "mhlo.transpose"(%cst) {permutation = dense<[1, 0]> : tensor<2xi64>} : (tensor<2x4x!quant.uniform<i8:f32, 2.000000e+0:16>>) -> tensor<4x2x!quant.uniform<i8:f32, 2.000000e+0:16>>
+  // CHECK: [[TRANSPOSED:%.+]] = "mhlo.transpose"
+  // CHECK-SAME: -> tensor<4x2x!quant.uniform<i8:f32, 2.000000e+00:16>>
+  // CHECK-NEXT: return [[TRANSPOSED]]
+  func.return %0 : tensor<4x2x!quant.uniform<i8:f32, 2.000000e+0:16>>
+}
\ No newline at end of file

From 79ac64f6a7bca73ecbdee6507439f3234cb1a6c2 Mon Sep 17 00:00:00 2001
From: Sandeep Dasgupta <sdasgup@google.com>
Date: Mon, 2 Oct 2023 14:47:04 -0700
Subject: [PATCH 493/567] Integrate StableHLO at openxla/stablehlo@6bf3f5d

PiperOrigin-RevId: 570185242
---
 third_party/stablehlo/workspace.bzl               |  4 ++--
 .../xla/third_party/stablehlo/workspace.bzl       |  4 ++--
 third_party/xla/xla/mlir_hlo/mhlo/IR/hlo_ops.cc   | 15 +++++++++++----
 .../Dialect/mhlo/hlo-legalize-to-stablehlo.mlir   |  8 ++++----
 .../mhlo/mhlo_infer_shape_type_methods.mlir       |  2 +-
 .../Dialect/mhlo/stablehlo-legalize-to-hlo.mlir   |  8 ++++----
 6 files changed, 24 insertions(+), 17 deletions(-)

diff --git a/third_party/stablehlo/workspace.bzl b/third_party/stablehlo/workspace.bzl
index ebf1c7041e4100..82c29f575b94d4 100644
--- a/third_party/stablehlo/workspace.bzl
+++ b/third_party/stablehlo/workspace.bzl
@@ -4,8 +4,8 @@ load("//third_party:repo.bzl", "tf_http_archive", "tf_mirror_urls")
 
 def repo():
     # LINT.IfChange
-    STABLEHLO_COMMIT = "2eec6dbd23dc5bd9d51f98c5768edf5b1a470ff7"
-    STABLEHLO_SHA256 = "a5e31ed87bb70005fa421abc355e692dc9d82c83c681a3d8d89ab15c607fc1f9"
+    STABLEHLO_COMMIT = "6bf3f5d7b8dd3fc67ee1236432606d9c414ffcbd"
+    STABLEHLO_SHA256 = "95e10a1f10cd6aaf66cbe5fe43a65dc588099f3ded179ef87af0748adf500fdb"
     # LINT.ThenChange(Google-internal path)
 
     tf_http_archive(
diff --git a/third_party/xla/third_party/stablehlo/workspace.bzl b/third_party/xla/third_party/stablehlo/workspace.bzl
index ebf1c7041e4100..82c29f575b94d4 100644
--- a/third_party/xla/third_party/stablehlo/workspace.bzl
+++ b/third_party/xla/third_party/stablehlo/workspace.bzl
@@ -4,8 +4,8 @@ load("//third_party:repo.bzl", "tf_http_archive", "tf_mirror_urls")
 
 def repo():
     # LINT.IfChange
-    STABLEHLO_COMMIT = "2eec6dbd23dc5bd9d51f98c5768edf5b1a470ff7"
-    STABLEHLO_SHA256 = "a5e31ed87bb70005fa421abc355e692dc9d82c83c681a3d8d89ab15c607fc1f9"
+    STABLEHLO_COMMIT = "6bf3f5d7b8dd3fc67ee1236432606d9c414ffcbd"
+    STABLEHLO_SHA256 = "95e10a1f10cd6aaf66cbe5fe43a65dc588099f3ded179ef87af0748adf500fdb"
     # LINT.ThenChange(Google-internal path)
 
     tf_http_archive(
diff --git a/third_party/xla/xla/mlir_hlo/mhlo/IR/hlo_ops.cc b/third_party/xla/xla/mlir_hlo/mhlo/IR/hlo_ops.cc
index a8f8eacd7252f8..99007914acbd9d 100644
--- a/third_party/xla/xla/mlir_hlo/mhlo/IR/hlo_ops.cc
+++ b/third_party/xla/xla/mlir_hlo/mhlo/IR/hlo_ops.cc
@@ -3183,10 +3183,14 @@ LogicalResult OutfeedOp::inferReturnTypes(
 //===----------------------------------------------------------------------===//
 
 LogicalResult SendOp::inferReturnTypes(
-    MLIRContext* context, std::optional<Location> location, ValueRange,
-    DictionaryAttr, OpaqueProperties, RegionRange,
+    MLIRContext* context, std::optional<Location> location, ValueRange operands,
+    DictionaryAttr attributes, OpaqueProperties properties, RegionRange regions,
     SmallVectorImpl<Type>& inferredReturnTypes) {
-  return hlo::inferSendOp(getMhloDialect(context), location,
+  SendOp::Adaptor adaptor(operands, attributes, properties, regions);
+  bool isDeviceToDevice = adaptor.getChannelHandle().getType() == 1;
+  bool isDeviceToHost = adaptor.getChannelHandle().getType() == 2;
+  return hlo::inferSendOp(getMhloDialect(context), location, isDeviceToDevice,
+                          isDeviceToHost, adaptor.getIsHostTransfer(),
                           inferredReturnTypes);
 }
 
@@ -3195,8 +3199,11 @@ LogicalResult SendOp::inferReturnTypes(
 //===----------------------------------------------------------------------===//
 
 LogicalResult RecvOp::verify() {
+  bool isDeviceToDevice = getChannelHandle().getType() == 1;
+  bool isHostToDevice = getChannelHandle().getType() == 3;
   return hlo::verifyRecvOp(getMhloDialect(getContext()), getLoc(),
-                           getResults());
+                           isDeviceToDevice, isHostToDevice,
+                           getIsHostTransfer(), getResults());
 }
 
 //===----------------------------------------------------------------------===//
diff --git a/third_party/xla/xla/mlir_hlo/tests/Dialect/mhlo/hlo-legalize-to-stablehlo.mlir b/third_party/xla/xla/mlir_hlo/tests/Dialect/mhlo/hlo-legalize-to-stablehlo.mlir
index 6fbd7ee730ae4f..9a767f505f409e 100644
--- a/third_party/xla/xla/mlir_hlo/tests/Dialect/mhlo/hlo-legalize-to-stablehlo.mlir
+++ b/third_party/xla/xla/mlir_hlo/tests/Dialect/mhlo/hlo-legalize-to-stablehlo.mlir
@@ -1157,11 +1157,11 @@ func.func @op_real(%arg0: tensor<complex<f32>>) -> tensor<f32> {
 // CHECK-LABEL: "op_recv"
 func.func @op_recv(%arg0: !mhlo.token) -> (tensor<f32>, !mhlo.token) {
   //      CHECK: "stablehlo.recv"(%arg0) {
-  // CHECK-SAME:   channel_handle = #stablehlo.channel_handle<handle = 0, type = 0>,
+  // CHECK-SAME:   channel_handle = #stablehlo.channel_handle<handle = 0, type = 3>,
   // CHECK-SAME:   is_host_transfer = true
   // CHECK-SAME: } : (!stablehlo.token) -> (tensor<f32>, !stablehlo.token)
   %0:2 = "mhlo.recv"(%arg0) {
-    channel_handle = #mhlo.channel_handle<handle = 0, type = 0>,
+    channel_handle = #mhlo.channel_handle<handle = 0, type = 3>,
     is_host_transfer = true
   } : (!mhlo.token) -> (tensor<f32>, !mhlo.token)
   func.return %0#0, %0#1 : tensor<f32>, !mhlo.token
@@ -1402,11 +1402,11 @@ func.func @op_select(%arg0: tensor<i1>, %arg1: tensor<f32>, %arg2: tensor<f32>)
 // CHECK-LABEL: "op_send"
 func.func @op_send(%arg0: tensor<f32>, %arg1: !mhlo.token) -> !mhlo.token {
   //      CHECK: "stablehlo.send"(%arg0, %arg1) {
-  // CHECK-SAME:   channel_handle = #stablehlo.channel_handle<handle = 0, type = 0>,
+  // CHECK-SAME:   channel_handle = #stablehlo.channel_handle<handle = 0, type = 2>,
   // CHECK-SAME:   is_host_transfer = true
   // CHECK-SAME: } : (tensor<f32>, !stablehlo.token) -> !stablehlo.token
   %0 = "mhlo.send"(%arg0, %arg1) {
-    channel_handle = #mhlo.channel_handle<handle = 0, type = 0>,
+    channel_handle = #mhlo.channel_handle<handle = 0, type = 2>,
     is_host_transfer = true
   } : (tensor<f32>, !mhlo.token) -> !mhlo.token
   func.return %0 : !mhlo.token
diff --git a/third_party/xla/xla/mlir_hlo/tests/Dialect/mhlo/mhlo_infer_shape_type_methods.mlir b/third_party/xla/xla/mlir_hlo/tests/Dialect/mhlo/mhlo_infer_shape_type_methods.mlir
index 3ebe9d38969e6a..358e33df945d0f 100644
--- a/third_party/xla/xla/mlir_hlo/tests/Dialect/mhlo/mhlo_infer_shape_type_methods.mlir
+++ b/third_party/xla/xla/mlir_hlo/tests/Dialect/mhlo/mhlo_infer_shape_type_methods.mlir
@@ -1177,7 +1177,7 @@ func.func @partition_id() -> tensor<*xindex> {
 // CHECK-LABEL: @send
 func.func @send(%arg0: !mhlo.token) -> !mhlo.token {
   %result = "mhlo.send"(%arg0) {
-    channel_handle = #mhlo.channel_handle<handle = 1, type = 2>
+    channel_handle = #mhlo.channel_handle<handle = 1, type = 1>
   } : (!mhlo.token) -> !mhlo.token
   // CHECK: types0 = !mhlo.token
   %1 = "mhlo_test.get_return_types"(%result) : (!mhlo.token) -> !mhlo.token
diff --git a/third_party/xla/xla/mlir_hlo/tests/Dialect/mhlo/stablehlo-legalize-to-hlo.mlir b/third_party/xla/xla/mlir_hlo/tests/Dialect/mhlo/stablehlo-legalize-to-hlo.mlir
index 753d36620ac176..0495dd279b8c86 100644
--- a/third_party/xla/xla/mlir_hlo/tests/Dialect/mhlo/stablehlo-legalize-to-hlo.mlir
+++ b/third_party/xla/xla/mlir_hlo/tests/Dialect/mhlo/stablehlo-legalize-to-hlo.mlir
@@ -1180,11 +1180,11 @@ func.func @op_real(%arg0: tensor<complex<f32>>) -> tensor<f32> {
 // CHECK-LABEL: "op_recv"
 func.func @op_recv(%arg0: !stablehlo.token) -> (tensor<f32>, !stablehlo.token) {
   //      CHECK: "mhlo.recv"(%arg0) {
-  // CHECK-SAME:   channel_handle = #mhlo.channel_handle<handle = 0, type = 0>,
+  // CHECK-SAME:   channel_handle = #mhlo.channel_handle<handle = 0, type = 3>,
   // CHECK-SAME:   is_host_transfer = true
   // CHECK-SAME: } : (!mhlo.token) -> (tensor<f32>, !mhlo.token)
   %0:2 = "stablehlo.recv"(%arg0) {
-    channel_handle = #stablehlo.channel_handle<handle = 0, type = 0>,
+    channel_handle = #stablehlo.channel_handle<handle = 0, type = 3>,
     is_host_transfer = true
   } : (!stablehlo.token) -> (tensor<f32>, !stablehlo.token)
   func.return %0#0, %0#1 : tensor<f32>, !stablehlo.token
@@ -1425,11 +1425,11 @@ func.func @op_select(%arg0: tensor<i1>, %arg1: tensor<f32>, %arg2: tensor<f32>)
 // CHECK-LABEL: "op_send"
 func.func @op_send(%arg0: tensor<f32>, %arg1: !stablehlo.token) -> !stablehlo.token {
   //      CHECK: "mhlo.send"(%arg0, %arg1) {
-  // CHECK-SAME:   channel_handle = #mhlo.channel_handle<handle = 0, type = 0>,
+  // CHECK-SAME:   channel_handle = #mhlo.channel_handle<handle = 0, type = 2>,
   // CHECK-SAME:   is_host_transfer = true
   // CHECK-SAME: } : (tensor<f32>, !mhlo.token) -> !mhlo.token
   %0 = "stablehlo.send"(%arg0, %arg1) {
-    channel_handle = #stablehlo.channel_handle<handle = 0, type = 0>,
+    channel_handle = #stablehlo.channel_handle<handle = 0, type = 2>,
     is_host_transfer = true
   } : (tensor<f32>, !stablehlo.token) -> !stablehlo.token
   func.return %0 : !stablehlo.token

From d18f72626b42c787eeead92b64278412e7e32dce Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 2 Oct 2023 14:56:12 -0700
Subject: [PATCH 494/567] Update TFRT dependency to use revision
 http://github.com/tensorflow/runtime/commit/09614836c83b716cfc7f6ed63208a0a122020799.

PiperOrigin-RevId: 570188293
---
 third_party/tf_runtime/workspace.bzl                          | 4 ++--
 third_party/xla/third_party/tf_runtime/workspace.bzl          | 4 ++--
 .../xla/third_party/tsl/third_party/tf_runtime/workspace.bzl  | 4 ++--
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/third_party/tf_runtime/workspace.bzl b/third_party/tf_runtime/workspace.bzl
index 140d5adb7eb8e3..2727639805158a 100644
--- a/third_party/tf_runtime/workspace.bzl
+++ b/third_party/tf_runtime/workspace.bzl
@@ -6,8 +6,8 @@ def repo():
     """Imports TFRT."""
 
     # Attention: tools parse and update these lines.
-    TFRT_COMMIT = "fb9b279b454c71995f63d33a9c78d52c4bf3ca1c"
-    TFRT_SHA256 = "dcf6b24d18c59e149ae94b05b35e6e66403a28b3b0a0a11777e2ba85d16d3ff8"
+    TFRT_COMMIT = "09614836c83b716cfc7f6ed63208a0a122020799"
+    TFRT_SHA256 = "a62a69706c814038605d67f5e6b9c68467aa51b9283b8fe3fec104a21874d8d8"
 
     tf_http_archive(
         name = "tf_runtime",
diff --git a/third_party/xla/third_party/tf_runtime/workspace.bzl b/third_party/xla/third_party/tf_runtime/workspace.bzl
index 140d5adb7eb8e3..2727639805158a 100644
--- a/third_party/xla/third_party/tf_runtime/workspace.bzl
+++ b/third_party/xla/third_party/tf_runtime/workspace.bzl
@@ -6,8 +6,8 @@ def repo():
     """Imports TFRT."""
 
     # Attention: tools parse and update these lines.
-    TFRT_COMMIT = "fb9b279b454c71995f63d33a9c78d52c4bf3ca1c"
-    TFRT_SHA256 = "dcf6b24d18c59e149ae94b05b35e6e66403a28b3b0a0a11777e2ba85d16d3ff8"
+    TFRT_COMMIT = "09614836c83b716cfc7f6ed63208a0a122020799"
+    TFRT_SHA256 = "a62a69706c814038605d67f5e6b9c68467aa51b9283b8fe3fec104a21874d8d8"
 
     tf_http_archive(
         name = "tf_runtime",
diff --git a/third_party/xla/third_party/tsl/third_party/tf_runtime/workspace.bzl b/third_party/xla/third_party/tsl/third_party/tf_runtime/workspace.bzl
index 140d5adb7eb8e3..2727639805158a 100644
--- a/third_party/xla/third_party/tsl/third_party/tf_runtime/workspace.bzl
+++ b/third_party/xla/third_party/tsl/third_party/tf_runtime/workspace.bzl
@@ -6,8 +6,8 @@ def repo():
     """Imports TFRT."""
 
     # Attention: tools parse and update these lines.
-    TFRT_COMMIT = "fb9b279b454c71995f63d33a9c78d52c4bf3ca1c"
-    TFRT_SHA256 = "dcf6b24d18c59e149ae94b05b35e6e66403a28b3b0a0a11777e2ba85d16d3ff8"
+    TFRT_COMMIT = "09614836c83b716cfc7f6ed63208a0a122020799"
+    TFRT_SHA256 = "a62a69706c814038605d67f5e6b9c68467aa51b9283b8fe3fec104a21874d8d8"
 
     tf_http_archive(
         name = "tf_runtime",

From 44aaffee483e90a1e60a9e263952a8dec40ceb98 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 2 Oct 2023 15:14:29 -0700
Subject: [PATCH 495/567] Creates a single entry point to handle dimension
 checking & input spec creation & strategy generation.

PiperOrigin-RevId: 570194157
---
 .../auto_sharding_dot_handler.cc              | 219 ++++++------------
 1 file changed, 77 insertions(+), 142 deletions(-)

diff --git a/third_party/xla/xla/hlo/experimental/auto_sharding/auto_sharding_dot_handler.cc b/third_party/xla/xla/hlo/experimental/auto_sharding/auto_sharding_dot_handler.cc
index fa0584330a5b87..99b96cb3b0a92b 100644
--- a/third_party/xla/xla/hlo/experimental/auto_sharding/auto_sharding_dot_handler.cc
+++ b/third_party/xla/xla/hlo/experimental/auto_sharding/auto_sharding_dot_handler.cc
@@ -109,14 +109,26 @@ class HandlerBase {
     return true;
   }
 
-  HloSharding CreateInputSpec(const HloInstruction* ins,
-                              const DimMap& dim_map) const {
+  HloSharding CreateInputSpec(const HloInstruction* ins, const DimMap& dim_map,
+                              const Array<int64_t>& device_mesh) const {
+    if (dim_map.empty()) return HloSharding::Replicate();
     std::vector<int64_t> tensor_dims, mesh_dims;
     for (const auto& [tensor_dim, mesh_dim] : dim_map) {
       tensor_dims.push_back(tensor_dim);
       mesh_dims.push_back(mesh_dim);
     }
-    return Tile(ins->shape(), tensor_dims, mesh_dims, device_mesh_);
+    return Tile(ins->shape(), tensor_dims, mesh_dims, device_mesh);
+  }
+
+  void MaybeAppend(const std::string& name, const HloSharding& output_spec,
+                   const DimMap& lhs_dim_map, const DimMap& rhs_dim_map,
+                   const Array<int64_t>& device_mesh, double compute_cost = 0,
+                   double communication_cost = 0) {
+    if (!CheckDims(lhs_, lhs_dim_map) || !CheckDims(rhs_, rhs_dim_map)) return;
+    HloSharding lhs_spec = CreateInputSpec(lhs_, lhs_dim_map, device_mesh);
+    HloSharding rhs_spec = CreateInputSpec(rhs_, rhs_dim_map, device_mesh);
+    AppendNewStrategy(name, output_spec, {lhs_spec, rhs_spec}, compute_cost,
+                      communication_cost);
   }
 
   std::unique_ptr<StrategyVector>& strategies_;
@@ -161,8 +173,6 @@ class DotHandler : public HandlerBase {
       for (int64_t j = 0; j < rhs_space_dims_.size(); ++j) {
         const DimMap lhs_dim_map = {{lhs_space_dims_[i], mesh_dims[0]}};
         const DimMap rhs_dim_map = {{rhs_space_dims_[j], mesh_dims[1]}};
-        if (!CheckDims(lhs_, lhs_dim_map) || !CheckDims(rhs_, rhs_dim_map))
-          continue;
         std::string name = absl::StrFormat("SS = SR x RS @ {%s}",
                                            absl::StrJoin(mesh_dims, ","));
         HloSharding output_spec =
@@ -171,10 +181,7 @@ class DotHandler : public HandlerBase {
                   space_base_dim_ +
                       static_cast<int64_t>(lhs_space_dims_.size()) + j},
                  mesh_dims, device_mesh_);
-        HloSharding lhs_spec = CreateInputSpec(lhs_, lhs_dim_map);
-        HloSharding rhs_spec = CreateInputSpec(rhs_, rhs_dim_map);
-
-        AppendNewStrategy(name, output_spec, {lhs_spec, rhs_spec}, 0, 0);
+        MaybeAppend(name, output_spec, lhs_dim_map, rhs_dim_map, device_mesh_);
       }
     }
   }
@@ -185,16 +192,12 @@ class DotHandler : public HandlerBase {
       for (int64_t j = i + 1; j < lhs_space_dims_.size(); ++j) {
         const DimMap lhs_dim_map = {{lhs_space_dims_[i], mesh_dims[0]},
                                     {lhs_space_dims_[j], mesh_dims[1]}};
-        if (!CheckDims(lhs_, lhs_dim_map)) continue;
         std::string name = absl::StrFormat("SSR = SSR x RR @ {%s}",
                                            absl::StrJoin(mesh_dims, ","));
         HloSharding output_spec =
             Tile(ins_->shape(), {space_base_dim_ + i, space_base_dim_ + j},
                  mesh_dims, device_mesh_);
-        HloSharding lhs_spec = CreateInputSpec(lhs_, lhs_dim_map);
-        HloSharding rhs_spec = HloSharding::Replicate();
-
-        AppendNewStrategy(name, output_spec, {lhs_spec, rhs_spec}, 0, 0);
+        MaybeAppend(name, output_spec, lhs_dim_map, {}, device_mesh_);
       }
     }
   }
@@ -205,7 +208,6 @@ class DotHandler : public HandlerBase {
       for (int64_t j = i + 1; j < rhs_space_dims_.size(); ++j) {
         const DimMap rhs_dim_map = {{rhs_space_dims_[i], mesh_dims[0]},
                                     {rhs_space_dims_[j], mesh_dims[1]}};
-        if (!CheckDims(rhs_, rhs_dim_map)) continue;
         std::string name = absl::StrFormat("RSS = RR x RSS @ {%s}",
                                            absl::StrJoin(mesh_dims, ","));
         HloSharding output_spec = Tile(
@@ -214,10 +216,7 @@ class DotHandler : public HandlerBase {
              space_base_dim_ + static_cast<int64_t>(lhs_space_dims_.size()) +
                  j},
             mesh_dims, device_mesh_);
-        HloSharding lhs_spec = HloSharding::Replicate();
-        HloSharding rhs_spec = CreateInputSpec(rhs_, rhs_dim_map);
-
-        AppendNewStrategy(name, output_spec, {lhs_spec, rhs_spec}, 0, 0);
+        MaybeAppend(name, output_spec, {}, rhs_dim_map, device_mesh_);
       }
     }
   }
@@ -234,18 +233,13 @@ class DotHandler : public HandlerBase {
           const DimMap lhs_dim_map = {{lhs_space_dims_[i], mesh_dims[0]},
                                       {lhs_con_dims_[j], mesh_dims[1]}};
           const DimMap rhs_dim_map = {{rhs_con_dims_[j], mesh_dims[1]}};
-          if (!CheckDims(lhs_, lhs_dim_map)) continue;
-
           HloSharding output_spec = Tile(ins_->shape(), {space_base_dim_ + i},
                                          {mesh_dims[0]}, device_mesh_);
-          HloSharding lhs_spec = CreateInputSpec(lhs_, lhs_dim_map);
-          HloSharding rhs_spec = CreateInputSpec(rhs_, rhs_dim_map);
-
           double memory_cost = GetBytes(ins_->shape()) / output_spec.NumTiles();
           double communication_cost =
               cluster_env_.AllReduceCost(memory_cost, mesh_dims[1]);
-          AppendNewStrategy(name, output_spec, {lhs_spec, rhs_spec}, 0,
-                            communication_cost);
+          MaybeAppend(name, output_spec, lhs_dim_map, rhs_dim_map, device_mesh_,
+                      0, communication_cost);
         }
       }
     }
@@ -262,21 +256,16 @@ class DotHandler : public HandlerBase {
           const DimMap rhs_dim_map = {{rhs_space_dims_[i], mesh_dims[1]},
                                       {rhs_con_dims_[j], mesh_dims[0]}};
           const DimMap lhs_dim_map = {{lhs_con_dims_[j], mesh_dims[0]}};
-          if (!CheckDims(lhs_, lhs_dim_map) || !CheckDims(rhs_, rhs_dim_map))
-            continue;
           HloSharding output_spec =
               Tile(ins_->shape(),
                    {space_base_dim_ +
                     static_cast<int64_t>(lhs_space_dims_.size()) + i},
                    {mesh_dims[1]}, device_mesh_);
-          HloSharding lhs_spec = CreateInputSpec(lhs_, lhs_dim_map);
-          HloSharding rhs_spec = CreateInputSpec(rhs_, rhs_dim_map);
           double memory_cost = GetBytes(ins_->shape()) / output_spec.NumTiles();
           double communication_cost =
               cluster_env_.AllReduceCost(memory_cost, mesh_dims[0]);
-
-          AppendNewStrategy(name, output_spec, {lhs_spec, rhs_spec}, 0,
-                            communication_cost);
+          MaybeAppend(name, output_spec, lhs_dim_map, rhs_dim_map, device_mesh_,
+                      0, communication_cost);
         }
       }
     }
@@ -289,13 +278,10 @@ class DotHandler : public HandlerBase {
         for (int64_t j = 0; j < device_mesh_.num_dimensions(); ++j) {
           const DimMap lhs_dim_map = {{lhs_batch_dims_[i], j}};
           const DimMap rhs_dim_map = {{rhs_batch_dims_[i], j}};
-          if (!CheckDims(lhs_, lhs_dim_map)) continue;
           std::string name = absl::StrFormat("Sb_%d = Sb x Sb @ {%d}", i, j);
           HloSharding output_spec = Tile(ins_->shape(), {i}, {j}, device_mesh_);
-          HloSharding lhs_spec = CreateInputSpec(lhs_, lhs_dim_map);
-          HloSharding rhs_spec = CreateInputSpec(rhs_, rhs_dim_map);
-
-          AppendNewStrategy(name, output_spec, {lhs_spec, rhs_spec}, 0, 0);
+          MaybeAppend(name, output_spec, lhs_dim_map, rhs_dim_map,
+                      device_mesh_);
         }
       }
     }
@@ -309,14 +295,11 @@ class DotHandler : public HandlerBase {
                                   {lhs_batch_dims_[1], mesh_dims[1]}};
       const DimMap rhs_dim_map = {{rhs_batch_dims_[0], mesh_dims[0]},
                                   {rhs_batch_dims_[1], mesh_dims[1]}};
-      if (!CheckDims(lhs_, lhs_dim_map)) return;
       std::string name =
           absl::StrFormat("Sb = Sb x Sb @ {%s}", absl::StrJoin(mesh_dims, ","));
       HloSharding output_spec =
           Tile(ins_->shape(), {0, 1}, mesh_dims, device_mesh_);
-      HloSharding lhs_spec = CreateInputSpec(lhs_, lhs_dim_map);
-      HloSharding rhs_spec = CreateInputSpec(rhs_, rhs_dim_map);
-      AppendNewStrategy(name, output_spec, {lhs_spec, rhs_spec}, 0, 0);
+      MaybeAppend(name, output_spec, lhs_dim_map, rhs_dim_map, device_mesh_);
     }
   }
 
@@ -331,13 +314,10 @@ class DotHandler : public HandlerBase {
           const DimMap lhs_dim_map = {{lhs_space_dims_[i], mesh_dims[1]},
                                       {lhs_batch_dims_[j], mesh_dims[0]}};
           const DimMap rhs_dim_map = {{rhs_batch_dims_[j], mesh_dims[0]}};
-          if (!CheckDims(lhs_, lhs_dim_map)) continue;
           HloSharding output_spec = Tile(
               ins_->shape(), {j, space_base_dim_ + i}, mesh_dims, device_mesh_);
-          HloSharding lhs_spec = CreateInputSpec(lhs_, lhs_dim_map);
-          HloSharding rhs_spec = CreateInputSpec(rhs_, rhs_dim_map);
-
-          AppendNewStrategy(name, output_spec, {lhs_spec, rhs_spec}, 0, 0);
+          MaybeAppend(name, output_spec, lhs_dim_map, rhs_dim_map,
+                      device_mesh_);
         }
       }
     }
@@ -354,17 +334,13 @@ class DotHandler : public HandlerBase {
           const DimMap rhs_dim_map = {{rhs_space_dims_[i], mesh_dims[1]},
                                       {rhs_batch_dims_[j], mesh_dims[0]}};
           const DimMap lhs_dim_map = {{lhs_batch_dims_[j], mesh_dims[0]}};
-          if (!CheckDims(lhs_, lhs_dim_map) || !CheckDims(rhs_, rhs_dim_map))
-            continue;
           HloSharding output_spec =
               Tile(ins_->shape(),
                    {j, space_base_dim_ +
                            static_cast<int64_t>(lhs_space_dims_.size()) + i},
                    mesh_dims, device_mesh_);
-          HloSharding lhs_spec = CreateInputSpec(lhs_, lhs_dim_map);
-          HloSharding rhs_spec = CreateInputSpec(rhs_, rhs_dim_map);
-
-          AppendNewStrategy(name, output_spec, {lhs_spec, rhs_spec}, 0, 0);
+          MaybeAppend(name, output_spec, lhs_dim_map, rhs_dim_map,
+                      device_mesh_);
         }
       }
     }
@@ -382,17 +358,13 @@ class DotHandler : public HandlerBase {
           const DimMap lhs_dim_map = {{lhs_con_dims_[i], mesh_dims[1]},
                                       {lhs_batch_dims_[j], mesh_dims[0]}};
           const DimMap rhs_dim_map = {{rhs_batch_dims_[j], mesh_dims[0]}};
-          if (!CheckDims(lhs_, lhs_dim_map)) continue;
           HloSharding output_spec =
               Tile(ins_->shape(), {j}, {mesh_dims[0]}, device_mesh_);
-          HloSharding lhs_spec = CreateInputSpec(lhs_, lhs_dim_map);
-          HloSharding rhs_spec = CreateInputSpec(rhs_, rhs_dim_map);
           double memory_cost = GetBytes(ins_->shape()) / output_spec.NumTiles();
           double communication_cost =
               cluster_env_.AllReduceCost(memory_cost, mesh_dims[1]);
-
-          AppendNewStrategy(name, output_spec, {lhs_spec, rhs_spec}, 0,
-                            communication_cost);
+          MaybeAppend(name, output_spec, lhs_dim_map, rhs_dim_map, device_mesh_,
+                      0, communication_cost);
         }
       }
     }
@@ -413,16 +385,12 @@ class DotHandler : public HandlerBase {
                                       {lhs_con_dims_[j], mesh_dims[1]}};
           const DimMap rhs_dim_map = {{rhs_con_dims_[i], mesh_dims[0]},
                                       {rhs_con_dims_[j], mesh_dims[1]}};
-          if (!CheckDims(lhs_, lhs_dim_map) || !CheckDims(rhs_, rhs_dim_map))
-            continue;
           HloSharding output_spec = HloSharding::Replicate();
-          HloSharding lhs_spec = CreateInputSpec(lhs_, lhs_dim_map);
-          HloSharding rhs_spec = CreateInputSpec(rhs_, rhs_dim_map);
           double memory_cost = GetBytes(ins_->shape()) / output_spec.NumTiles();
           double communication_cost = cluster_env_.AllReduceCost(
               memory_cost, mesh_dims[0], mesh_dims[1]);
-          AppendNewStrategy(name, output_spec, {lhs_spec, rhs_spec}, 0,
-                            communication_cost);
+          MaybeAppend(name, output_spec, lhs_dim_map, rhs_dim_map, device_mesh_,
+                      0, communication_cost);
         }
       }
     }
@@ -437,18 +405,14 @@ class DotHandler : public HandlerBase {
       for (int64_t i = 0; i < lhs_con_dims_.size(); ++i) {
         const DimMap lhs_dim_map = {{lhs_con_dims_[i], mesh_dims[0]}};
         const DimMap rhs_dim_map = {{rhs_con_dims_[i], mesh_dims[0]}};
-        if (!CheckDims(lhs_, lhs_dim_map)) continue;
         HloSharding output_spec = HloSharding::Replicate();
-        HloSharding lhs_spec = CreateInputSpec(lhs_, lhs_dim_map);
-        HloSharding rhs_spec = CreateInputSpec(rhs_, rhs_dim_map);
         double memory_cost = GetBytes(ins_->shape()) / output_spec.NumTiles();
         double compute_cost =
             cluster_env_.DotCost(lhs_->shape(), rhs_->shape(), dot_dnums_);
         double communication_cost =
             cluster_env_.AllReduceCost(memory_cost, mesh_dims[0]);
-
-        AppendNewStrategy(name, output_spec, {lhs_spec, rhs_spec}, compute_cost,
-                          communication_cost);
+        MaybeAppend(name, output_spec, lhs_dim_map, rhs_dim_map, device_mesh_,
+                    compute_cost, communication_cost);
       }
     }
   }
@@ -462,6 +426,7 @@ class DotHandler : public HandlerBase {
 
       // Si = Si x R @ 0
       for (int64_t i = 0; i < lhs_space_dims_.size(); ++i) {
+        const DimMap lhs_dim_map = {{lhs_space_dims_[i], mesh_dim}};
         if (lhs_->shape().dimensions(lhs_space_dims_[i]) < num_devices) {
           continue;
         }
@@ -473,14 +438,13 @@ class DotHandler : public HandlerBase {
           std::string name = absl::StrFormat("Si = Si x R @ %d", mesh_dim);
           HloSharding output_spec = Tile(ins_->shape(), {space_base_dim_ + i},
                                          {mesh_dim}, device_mesh_1d_);
-          HloSharding lhs_spec = Tile(lhs_->shape(), {lhs_space_dims_[i]},
-                                      {mesh_dim}, device_mesh_1d_);
-          HloSharding rhs_spec = HloSharding::Replicate();
-          AppendNewStrategy(name, output_spec, {lhs_spec, rhs_spec}, 0, 0);
+          MaybeAppend(name, output_spec, lhs_dim_map, {}, device_mesh_1d_);
       }
 
       // R = Sk x Sk @ (allreduce @ 0)
       for (int64_t i = 0; i < lhs_con_dims_.size(); ++i) {
+          const DimMap lhs_dim_map = {{lhs_con_dims_[i], mesh_dim}};
+          const DimMap rhs_dim_map = {{rhs_con_dims_[i], mesh_dim}};
           if (lhs_->shape().dimensions(lhs_con_dims_[i]) < num_devices) {
           continue;
           }
@@ -492,16 +456,11 @@ class DotHandler : public HandlerBase {
           std::string name = absl::StrFormat(
               "R = Sk x Sk @ %d (allreduce @ %d)", mesh_dim, mesh_dim);
           HloSharding output_spec = HloSharding::Replicate();
-          HloSharding lhs_spec = Tile(lhs_->shape(), {lhs_con_dims_[i]},
-                                      {mesh_dim}, device_mesh_1d_);
-          HloSharding rhs_spec = Tile(rhs_->shape(), {rhs_con_dims_[i]},
-                                      {mesh_dim}, device_mesh_1d_);
           double memory_cost = GetBytes(ins_->shape()) / output_spec.NumTiles();
           double communication_cost =
               cluster_env_.AllReduceCost(memory_cost, mesh_dim);
-
-          AppendNewStrategy(name, output_spec, {lhs_spec, rhs_spec}, 0,
-                            communication_cost);
+          MaybeAppend(name, output_spec, lhs_dim_map, rhs_dim_map,
+                      device_mesh_1d_, 0, communication_cost);
         }
     }
   }
@@ -514,15 +473,12 @@ class DotHandler : public HandlerBase {
       for (int64_t i = 0; i < lhs_batch_dims_.size(); ++i) {
           const DimMap lhs_dim_map = {{lhs_batch_dims_[i], mesh_dim}};
           const DimMap rhs_dim_map = {{rhs_batch_dims_[i], mesh_dim}};
-          if (!CheckDims(lhs_, lhs_dim_map) || !CheckDims(rhs_, rhs_dim_map))
-          continue;
           std::string name =
               absl::StrFormat("Sb_%d = Sb x Sb @ {%d} 1d", i, mesh_dim);
           HloSharding output_spec =
               Tile(ins_->shape(), {i}, {mesh_dim}, device_mesh_1d_);
-          HloSharding lhs_spec = CreateInputSpec(lhs_, lhs_dim_map);
-          HloSharding rhs_spec = CreateInputSpec(rhs_, rhs_dim_map);
-          AppendNewStrategy(name, output_spec, {lhs_spec, rhs_spec}, 0, 0);
+          MaybeAppend(name, output_spec, lhs_dim_map, rhs_dim_map,
+                      device_mesh_1d_);
       }
     }
   }
@@ -665,63 +621,52 @@ class ConvHandler : public HandlerBase {
 
   void SplitLhsBatchRhsOutchannel(MeshDims mesh_dims) {
     DCHECK_EQ(mesh_dims.size(), 2);
+    const DimMap lhs_dim_map = {{lhs_batch_dim_, mesh_dims[0]}};
+    const DimMap rhs_dim_map = {{rhs_out_channel_dim_, mesh_dims[1]}};
     std::string name =
         absl::StrFormat("SS = SR x RS @ {%s}", absl::StrJoin(mesh_dims, ","));
     HloSharding output_spec =
         Tile(ins_->shape(), {out_batch_dim_, out_out_channel_dim_}, mesh_dims,
              device_mesh_);
-    HloSharding lhs_spec =
-        Tile(lhs_->shape(), {lhs_batch_dim_}, {mesh_dims[0]}, device_mesh_);
-    HloSharding rhs_spec = Tile(rhs_->shape(), {rhs_out_channel_dim_},
-                                {mesh_dims[1]}, device_mesh_);
-
-    AppendNewStrategy(name, output_spec, {lhs_spec, rhs_spec}, 0, 0);
+    MaybeAppend(name, output_spec, lhs_dim_map, rhs_dim_map, device_mesh_);
   }
 
   void SplitLhsBatchBothInchannel(MeshDims mesh_dims) {
     DCHECK_EQ(mesh_dims.size(), 2);
     if (device_mesh_.dim(mesh_dims[0]) > 1 &&
         device_mesh_.dim(mesh_dims[1]) > 1) {
+      const DimMap lhs_dim_map = {{lhs_batch_dim_, mesh_dims[0]},
+                                  {lhs_in_channel_dim_, mesh_dims[1]}};
+      const DimMap rhs_dim_map = {{rhs_in_channel_dim_, mesh_dims[1]}};
       std::string name =
           absl::StrFormat("SR = SS x SR @ {%s} (allreduce @ %d)",
                           absl::StrJoin(mesh_dims, ","), mesh_dims[1]);
       HloSharding output_spec =
           Tile(ins_->shape(), {out_batch_dim_}, {mesh_dims[0]}, device_mesh_);
-      HloSharding lhs_spec =
-          Tile(lhs_->shape(), {lhs_batch_dim_, lhs_in_channel_dim_}, mesh_dims,
-               device_mesh_);
-      HloSharding rhs_spec = Tile(rhs_->shape(), {rhs_in_channel_dim_},
-                                  {mesh_dims[1]}, device_mesh_);
-
       double memory_cost = GetBytes(ins_->shape()) / output_spec.NumTiles();
       double communication_cost =
           cluster_env_.AllReduceCost(memory_cost, mesh_dims[1]);
-
-      AppendNewStrategy(name, output_spec, {lhs_spec, rhs_spec}, 0,
-                        communication_cost);
+      MaybeAppend(name, output_spec, lhs_dim_map, rhs_dim_map, device_mesh_, 0,
+                  communication_cost);
     }
   }
 
   void SplitRhsOutchannelBothInchannel(MeshDims mesh_dims) {
     DCHECK_EQ(mesh_dims.size(), 2);
     if (device_mesh_.dim(mesh_dims[0]) > 1) {
+      const DimMap lhs_dim_map = {{lhs_in_channel_dim_, mesh_dims[0]}};
+      const DimMap rhs_dim_map = {{rhs_in_channel_dim_, mesh_dims[0]},
+                                  {rhs_out_channel_dim_, mesh_dims[1]}};
       std::string name =
           absl::StrFormat("RS = RS x SS @ {%s} (allreduce @ %d)",
                           absl::StrJoin(mesh_dims, ","), mesh_dims[0]);
       HloSharding output_spec = Tile(ins_->shape(), {out_out_channel_dim_},
                                      {mesh_dims[1]}, device_mesh_);
-      HloSharding lhs_spec = Tile(lhs_->shape(), {lhs_in_channel_dim_},
-                                  {mesh_dims[0]}, device_mesh_);
-      HloSharding rhs_spec =
-          Tile(rhs_->shape(), {rhs_in_channel_dim_, rhs_out_channel_dim_},
-               mesh_dims, device_mesh_);
-
       double memory_cost = GetBytes(ins_->shape()) / output_spec.NumTiles();
       double communication_cost =
           cluster_env_.AllReduceCost(memory_cost, mesh_dims[0]);
-
-      AppendNewStrategy(name, output_spec, {lhs_spec, rhs_spec}, 0,
-                        communication_cost);
+      MaybeAppend(name, output_spec, lhs_dim_map, rhs_dim_map, device_mesh_, 0,
+                  communication_cost);
     }
   }
 
@@ -734,53 +679,43 @@ class ConvHandler : public HandlerBase {
 
       // Si = Si x R @ 0
       if (lhs_->shape().dimensions(lhs_batch_dim_) % num_devices == 0) {
-        std::string name = absl::StrFormat("Si = Si x R @ 0");
-        HloSharding output_spec =
-            Tile(ins_->shape(), {out_batch_dim_}, {mesh_dim}, device_mesh_1d_);
-        HloSharding lhs_spec =
-            Tile(lhs_->shape(), {lhs_batch_dim_}, {mesh_dim}, device_mesh_1d_);
-        HloSharding rhs_spec = HloSharding::Replicate();
-
-        AppendNewStrategy(name, output_spec, {lhs_spec, rhs_spec}, 0, 0);
+          const DimMap lhs_dim_map = {{lhs_batch_dim_, mesh_dim}};
+          std::string name = absl::StrFormat("Si = Si x R @ 0");
+          HloSharding output_spec = Tile(ins_->shape(), {out_batch_dim_},
+                                         {mesh_dim}, device_mesh_1d_);
+          MaybeAppend(name, output_spec, lhs_dim_map, {}, device_mesh_1d_);
       }
 
       // R = Sk x Sk @ (allreduce @ 0)
       if (lhs_->shape().dimensions(lhs_in_channel_dim_) % num_devices == 0 &&
           rhs_->shape().dimensions(rhs_in_channel_dim_) % num_devices == 0) {
-        std::string name = absl::StrFormat("R = Sk x Sk @ %d (allreduce @ %d)",
-                                           mesh_dim, mesh_dim);
-        HloSharding output_spec = HloSharding::Replicate();
-        HloSharding lhs_spec = Tile(lhs_->shape(), {lhs_in_channel_dim_},
-                                    {mesh_dim}, device_mesh_1d_);
-        HloSharding rhs_spec = Tile(rhs_->shape(), {rhs_in_channel_dim_},
-                                    {mesh_dim}, device_mesh_1d_);
-        double memory_cost = GetBytes(ins_->shape()) / output_spec.NumTiles();
-        double communication_cost = cluster_env_.AllReduceCost(memory_cost, 0) +
-                                    cluster_env_.AllReduceCost(memory_cost, 1);
-
-        AppendNewStrategy(name, output_spec, {lhs_spec, rhs_spec}, 0,
-                          communication_cost);
+          const DimMap lhs_dim_map = {{lhs_in_channel_dim_, mesh_dim}};
+          const DimMap rhs_dim_map = {{rhs_in_channel_dim_, mesh_dim}};
+          std::string name = absl::StrFormat(
+              "R = Sk x Sk @ %d (allreduce @ %d)", mesh_dim, mesh_dim);
+          HloSharding output_spec = HloSharding::Replicate();
+          double memory_cost = GetBytes(ins_->shape()) / output_spec.NumTiles();
+          double communication_cost =
+              cluster_env_.AllReduceCost(memory_cost, 0) +
+              cluster_env_.AllReduceCost(memory_cost, 1);
+          MaybeAppend(name, output_spec, lhs_dim_map, rhs_dim_map,
+                      device_mesh_1d_, 0, communication_cost);
       }
     }
   }
 
   void SplitDepthwise(MeshDims mesh_dims, bool forward) {
     DCHECK_EQ(mesh_dims.size(), 2);
+    const DimMap lhs_dim_map = {
+        {lhs_batch_dim_, mesh_dims[forward ? 0 : 1]},
+        {lhs_in_channel_dim_, mesh_dims[forward ? 1 : 0]}};
+    const DimMap rhs_dim_map = {{rhs_out_channel_dim_, mesh_dims[1]}};
     std::string name =
         absl::StrFormat("SS = SS x RS @ {%s}", absl::StrJoin(mesh_dims, ","));
     HloSharding output_spec =
         Tile(ins_->shape(), {out_batch_dim_, out_out_channel_dim_}, mesh_dims,
              device_mesh_);
-    HloSharding lhs_spec =
-        forward ? Tile(lhs_->shape(), {lhs_batch_dim_, lhs_in_channel_dim_},
-                       mesh_dims, device_mesh_)
-                : Tile(lhs_->shape(), {lhs_batch_dim_, lhs_in_channel_dim_},
-                       {mesh_dims[1], mesh_dims[0]}, device_mesh_);
-
-    HloSharding rhs_spec = Tile(rhs_->shape(), {rhs_out_channel_dim_},
-                                {mesh_dims[1]}, device_mesh_);
-
-    AppendNewStrategy(name, output_spec, {lhs_spec, rhs_spec}, 0, 0);
+    MaybeAppend(name, output_spec, lhs_dim_map, rhs_dim_map, device_mesh_);
   }
 
   Status RegisterStrategies() {

From 2a64b462701e35cae9f7e8c1bfad3363245c5267 Mon Sep 17 00:00:00 2001
From: Tongfei Guo <tongfei@google.com>
Date: Mon, 2 Oct 2023 15:20:21 -0700
Subject: [PATCH 496/567] [XLA:SPMD] Bug fix that empty tuple sharding
 shouldn't be a shard group.

PiperOrigin-RevId: 570195864
---
 third_party/xla/xla/hlo/ir/hlo_sharding.h        | 12 ++++++++----
 third_party/xla/xla/service/hlo_sharding_test.cc |  9 +++++++++
 2 files changed, 17 insertions(+), 4 deletions(-)

diff --git a/third_party/xla/xla/hlo/ir/hlo_sharding.h b/third_party/xla/xla/hlo/ir/hlo_sharding.h
index e30d2acc1ba3a2..430f95799a7bad 100644
--- a/third_party/xla/xla/hlo/ir/hlo_sharding.h
+++ b/third_party/xla/xla/hlo/ir/hlo_sharding.h
@@ -211,15 +211,18 @@ class HloSharding {
       return shard_group_.shard_group_id != -1 &&
              (shard_group_.shard_like || shard_group_.shard_as);
     }
-    return absl::c_all_of(
-        tuple_elements_, [](const HloSharding& s) { return s.IsShardGroup(); });
+    return !tuple_elements_.empty() &&
+           absl::c_all_of(tuple_elements_, [](const HloSharding& s) {
+             return s.IsShardGroup();
+           });
   }
 
   bool IsShardAs() const {
     if (!IsTuple()) {
       return shard_group_.shard_group_id != -1 && shard_group_.shard_as;
     }
-    return absl::c_all_of(tuple_elements_,
+    return !tuple_elements_.empty() &&
+           absl::c_all_of(tuple_elements_,
                           [](const HloSharding& s) { return s.IsShardAs(); });
   }
 
@@ -227,7 +230,8 @@ class HloSharding {
     if (!IsTuple()) {
       return shard_group_.shard_group_id != -1 && shard_group_.shard_like;
     }
-    return absl::c_all_of(tuple_elements_,
+    return !tuple_elements_.empty() &&
+           absl::c_all_of(tuple_elements_,
                           [](const HloSharding& s) { return s.IsShardLike(); });
   }
 
diff --git a/third_party/xla/xla/service/hlo_sharding_test.cc b/third_party/xla/xla/service/hlo_sharding_test.cc
index 96b4b1102004a4..b2c2535297709f 100644
--- a/third_party/xla/xla/service/hlo_sharding_test.cc
+++ b/third_party/xla/xla/service/hlo_sharding_test.cc
@@ -267,6 +267,15 @@ TEST_F(HloShardingTest, EmptySingleTuple) {
   EXPECT_TRUE(sharding.ExtractSingleSharding());
 }
 
+// Tests that empty tuple is not a shard group.
+TEST_F(HloShardingTest, EmptySingleTupleIsNotShardGroup) {
+  HloSharding sharding = HloSharding::SingleTuple(ShapeUtil::MakeTupleShape({}),
+                                                  HloSharding::AssignDevice(0));
+  EXPECT_FALSE(sharding.IsShardGroup());
+  EXPECT_FALSE(sharding.IsShardAs());
+  EXPECT_FALSE(sharding.IsShardLike());
+}
+
 TEST_F(HloShardingTest, NestedTuple) {
   // nested_tuple_shape = (f32[], (f32[3]), f32[4, 6])
   Shape nested_tuple_shape = ShapeUtil::MakeTupleShape({

From e6cb578e18e43a04702644a4fefb2169bc2e4b09 Mon Sep 17 00:00:00 2001
From: jameshollyer <j.c.hollyer@gmail.com>
Date: Mon, 2 Oct 2023 22:29:53 +0000
Subject: [PATCH 497/567] pin ml-dtypes version

---
 tensorflow/tools/pip_package/setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/tools/pip_package/setup.py b/tensorflow/tools/pip_package/setup.py
index a3f46675f70633..33dccdff756029 100644
--- a/tensorflow/tools/pip_package/setup.py
+++ b/tensorflow/tools/pip_package/setup.py
@@ -89,7 +89,7 @@ def standard_or_nightly(standard, nightly):
     'google_pasta >= 0.1.1',
     'h5py >= 2.9.0',
     'libclang >= 13.0.0',
-    'ml_dtypes >= 0.2.0',
+    'ml_dtypes ~= 0.2.0',
     'numpy >= 1.23.5',
     'opt_einsum >= 2.3.2',
     'packaging',

From c37628c576dca6a45bb0b13ca85204829b5893c4 Mon Sep 17 00:00:00 2001
From: David Majnemer <majnemer@google.com>
Date: Mon, 2 Oct 2023 15:40:07 -0700
Subject: [PATCH 498/567] [pjrt] Use a single implementation for Vec128
 transpose

We can generalize the transpose logic a bit further to make it so that it can handle rectangular Vec128 transposes.

PiperOrigin-RevId: 570201065
---
 third_party/xla/xla/pjrt/transpose.cc        |  33 +-
 third_party/xla/xla/pjrt/transpose_kernels.h | 479 ++++++++++---------
 2 files changed, 249 insertions(+), 263 deletions(-)

diff --git a/third_party/xla/xla/pjrt/transpose.cc b/third_party/xla/xla/pjrt/transpose.cc
index 1ebc15e22a8095..acab3a8991e974 100644
--- a/third_party/xla/xla/pjrt/transpose.cc
+++ b/third_party/xla/xla/pjrt/transpose.cc
@@ -429,7 +429,7 @@ void TransposePlan::ExecuteTyped(const char* a, char* b,
         handle_inner_block_elems(std::integral_constant<int, 16>{});
         break;
       default:
-        LOG(FATAL) << "Invalid inner_block_size " << inner_block_elems_;
+        LOG(FATAL) << "Invalid inner_block_elems_ " << inner_block_elems_;
     }
   }
 }
@@ -1083,6 +1083,7 @@ void TransposePlan::Initialize() {
     b_stride1_size = std::min(b_stride1_size, b_dims_.back());
   }
 
+  constexpr int kMaxOuterBlockElems = 16;
   if (inner_kernel_is_memcpy_) {
     inner_block_elems_ = -1;
     outer_block_elems_a_ = -1;
@@ -1092,45 +1093,22 @@ void TransposePlan::Initialize() {
     // vectorized kernel for this element size?
     int min_inner_block_elems;
     int max_inner_block_elems;
-#ifdef XLA_HAS_VEC128
     switch (elem_size_in_bytes_) {
       case 1:
-        min_inner_block_elems = 4;
-        max_inner_block_elems = sizeof(Vec128) / sizeof(uint8_t);
-        break;
       case 2:
-        min_inner_block_elems = 4;
-        max_inner_block_elems = kMaxInnerBlockSizeBytes / sizeof(uint16_t);
-        break;
       case 4:
-        min_inner_block_elems = sizeof(Vec128) / sizeof(uint32_t);
-        max_inner_block_elems = kMaxInnerBlockSizeBytes / sizeof(uint32_t);
-        break;
       case 8:
-        min_inner_block_elems = sizeof(Vec128) / sizeof(uint64_t);
-        max_inner_block_elems = kMaxInnerBlockSizeBytes / sizeof(uint64_t);
-        break;
-      case 16:
         min_inner_block_elems = 1;
-        max_inner_block_elems = 1;
+        max_inner_block_elems = std::min<int>(
+            kMaxOuterBlockElems, kMaxInnerBlockSizeBytes / elem_size_in_bytes_);
         break;
-      default:
-        LOG(FATAL) << "Unreachable: element size " << elem_size_in_bytes_;
-    }
-#else
-    switch (elem_size_in_bytes_) {
-      case 1:
-      case 2:
-      case 4:
-      case 8:
       case 16:
         min_inner_block_elems = 1;
-        max_inner_block_elems = kMaxInnerBlockSizeBytes / elem_size_in_bytes_;
+        max_inner_block_elems = 1;
         break;
       default:
         LOG(FATAL) << "Unreachable: element size " << elem_size_in_bytes_;
     }
-#endif
     inner_block_elems_ = max_inner_block_elems;
     while (inner_block_elems_ > std::min(a_stride1_size, b_stride1_size)) {
       inner_block_elems_ /= 2;
@@ -1140,7 +1118,6 @@ void TransposePlan::Initialize() {
       // path.
       inner_block_elems_ = 1;
     }
-    constexpr int kMaxOuterBlockElems = 16;
     outer_block_elems_a_ = FloorOfRatio<int64_t>(
         std::min<int64_t>(kMaxOuterBlockElems, a_stride1_size),
         inner_block_elems_);
diff --git a/third_party/xla/xla/pjrt/transpose_kernels.h b/third_party/xla/xla/pjrt/transpose_kernels.h
index 11e34b15e4fa74..faaa8fd099ee33 100644
--- a/third_party/xla/xla/pjrt/transpose_kernels.h
+++ b/third_party/xla/xla/pjrt/transpose_kernels.h
@@ -16,6 +16,12 @@ limitations under the License.
 #ifndef XLA_PJRT_TRANSPOSE_KERNELS_H_
 #define XLA_PJRT_TRANSPOSE_KERNELS_H_
 
+#include <array>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+#include <utility>
+
 #if (defined(__GNUC__) || defined(__clang__)) && defined(__SSE2__)
 #define XLA_HAS_SSE2
 #elif defined(_MSC_VER) && !defined(_M_ARM64EC) && defined(_M_X64)
@@ -43,12 +49,6 @@ limitations under the License.
 #define XLA_HAS_VEC128
 #endif
 
-#include <array>
-#include <cstddef>
-#include <cstdint>
-#include <cstring>
-#include <type_traits>
-
 namespace xla {
 
 // Generic transpose kernel.
@@ -299,38 +299,69 @@ inline __m128i Unpack<8, Extract::kHi>(__m128i a, __m128i b) {
 
 using Vec128 = __m128i;
 
-template <typename T>
+template <size_t bytes>
 __m128i LoadElementIntoVec128(const void* p);
 
 template <>
-inline __m128i LoadElementIntoVec128<uint32_t>(const void* p) {
+inline __m128i LoadElementIntoVec128</*bytes=*/sizeof(uint16_t)>(
+    const void* p) {
+  // Note: We would ideally use `_mm_loadu_si16` here but older compilers do
+  // not support it. However, we can replicate it using a sequence such that
+  // even older compilers will turn this into a single movd instruction.
+  // memcpy is used because `p` is not guaranteed to be aligned to a 2-byte
+  // address.
+  uint16_t load;
+  memcpy(&load, p, sizeof(load));
+  return _mm_cvtsi32_si128(load);
+}
+
+template <>
+inline __m128i LoadElementIntoVec128</*bytes=*/sizeof(uint32_t)>(
+    const void* p) {
   // Note: We would ideally use `_mm_loadu_si32` here but older compilers do
   // not support it. However, we can replicate it using a sequence such that
   // even older compilers will turn this into a single movd instruction.
   // memcpy is used because `p` is not guaranteed to be aligned to a 4-byte
   // address.
-  int load;
+  uint32_t load;
   memcpy(&load, p, sizeof(load));
   return _mm_cvtsi32_si128(load);
 }
 
 template <>
-inline __m128i LoadElementIntoVec128<uint64_t>(const void* p) {
-  return _mm_loadl_epi64(reinterpret_cast<const __m128i*>(p));
+inline __m128i LoadElementIntoVec128</*bytes=*/sizeof(uint64_t)>(
+    const void* p) {
+  // Note: We would ideally use `_mm_loadu_si64` here but older compilers do
+  // not support it. However, we can replicate it using a sequence such that
+  // even older compilers will turn this into a single movq instruction.
+  // memcpy is used because `p` is not guaranteed to be aligned to an 8-byte
+  // address.
+  uint64_t load;
+  memcpy(&load, p, sizeof(load));
+  return _mm_cvtsi64_si128(load);
 }
 
 template <>
-inline __m128i LoadElementIntoVec128<__m128i>(const void* p) {
+inline __m128i LoadElementIntoVec128</*bytes=*/sizeof(__m128i)>(const void* p) {
   return _mm_loadu_si128(reinterpret_cast<const __m128i*>(p));
 }
 
-template <typename T, int lane>
+template <size_t bytes, int lane>
 inline void StoreElementFromVec128(void* p, __m128i v) {
-  if constexpr (sizeof(T) * lane == sizeof(Vec128) / 2) {
-    return StoreElementFromVec128<T, 0>(p,
-                                        Unpack<sizeof(T), Extract::kHi>(v, v));
-  } else if constexpr (std::is_same_v<T, uint32_t>) {
-    if constexpr (lane != 0) {
+  static_assert(bytes * (lane + 1) <= sizeof(Vec128));
+  constexpr bool halfway = bytes * lane == sizeof(Vec128) / 2;
+  if constexpr (bytes == sizeof(uint16_t)) {
+    // Note: We would ideally use `_mm_storeu_si16` here but older compilers do
+    // not support it. However, we can replicate it using a sequence such that
+    // even older compilers will turn this into a single movd instruction.
+    // memcpy is used because `p` is not guaranteed to be aligned to a 2-byte
+    // address.
+    const uint16_t scalar = _mm_extract_epi16(v, lane);
+    memcpy(p, &scalar, bytes);
+  } else if constexpr (bytes == sizeof(uint32_t)) {
+    if constexpr (halfway) {
+      v = Unpack<bytes, Extract::kHi>(v, v);
+    } else if constexpr (lane != 0) {
       v = _mm_shuffle_epi32(v, _MM_SHUFFLE(lane, lane, lane, lane));
     }
     // Note: We would ideally use `_mm_storeu_si32` here but older compilers do
@@ -338,14 +369,26 @@ inline void StoreElementFromVec128(void* p, __m128i v) {
     // even older compilers will turn this into a single movd instruction.
     // memcpy is used because `p` is not guaranteed to be aligned to a 4-byte
     // address.
-    memcpy(p, &v, sizeof(uint32_t));
-  } else if constexpr (std::is_same_v<T, uint64_t>) {
+    const uint32_t scalar = _mm_cvtsi128_si32(v);
+    memcpy(p, &scalar, bytes);
+  } else if constexpr (bytes == sizeof(uint64_t)) {
+    if constexpr (halfway) {
+      v = Unpack<bytes, Extract::kHi>(v, v);
+    } else {
+      static_assert(lane == 0);
+    }
+    // Note: We would ideally use `_mm_storeu_si64` here but older compilers do
+    // not support it. However, we can replicate it using a sequence such that
+    // even older compilers will turn this into a single movq instruction.
+    // memcpy is used because `p` is not guaranteed to be aligned to an 8-byte
+    // address.
+    const uint64_t scalar = _mm_cvtsi128_si64(v);
+    memcpy(p, &scalar, bytes);
+  } else if constexpr (bytes == sizeof(__m128i)) {
     static_assert(lane == 0);
-    _mm_storel_epi64(reinterpret_cast<__m128i*>(p), v);
-  } else if constexpr (std::is_same_v<T, __m128i>) {
     _mm_storeu_si128(reinterpret_cast<__m128i*>(p), v);
   } else {
-    static_assert(sizeof(T) == 0);
+    static_assert(bytes == 0);
   }
 }
 #endif
@@ -402,11 +445,23 @@ inline uint64x2_t Unpack<8, Extract::kHi>(uint64x2_t a, uint64x2_t b) {
 
 using Vec128 = uint64x2_t;
 
-template <typename T>
+template <size_t>
 uint64x2_t LoadElementIntoVec128(const void* p);
 
 template <>
-inline uint64x2_t LoadElementIntoVec128<uint32_t>(const void* p) {
+inline uint64x2_t LoadElementIntoVec128</*bytes=*/sizeof(uint16_t)>(
+    const void* p) {
+  // Ideally, we would use `vld1q_lane_u16` but it assumes that its input is
+  // aligned to a 16-bit boundary. We can only promise 8-bit aligned. That said,
+  // this sequence will compile to `ldr Ht, [Xn]` but without an alignment hint.
+  uint16_t x;
+  memcpy(&x, p, sizeof(x));
+  return vreinterpretq_u64_u16(vsetq_lane_u16(x, vdupq_n_u16(0), 0));
+}
+
+template <>
+inline uint64x2_t LoadElementIntoVec128</*bytes=*/sizeof(uint32_t)>(
+    const void* p) {
   // Ideally, we would use `vld1q_lane_u32` but it assumes that its input is
   // aligned to a 32-bit boundary. We can only promise 8-bit aligned. That said,
   // this sequence will compile to `ldr St, [Xn]` but without an alignment hint.
@@ -416,7 +471,8 @@ inline uint64x2_t LoadElementIntoVec128<uint32_t>(const void* p) {
 }
 
 template <>
-inline uint64x2_t LoadElementIntoVec128<uint64_t>(const void* p) {
+inline uint64x2_t LoadElementIntoVec128</*bytes=*/sizeof(uint64_t)>(
+    const void* p) {
   // Ideally, we would use `vld1q_lane_u64` but it assumes that its input is
   // aligned to a 64-bit boundary. We can only promise 8-bit aligned. That said,
   // this sequence will compile to `ldr Dt, [Xn]` but without an alignment hint.
@@ -425,33 +481,42 @@ inline uint64x2_t LoadElementIntoVec128<uint64_t>(const void* p) {
 }
 
 template <>
-inline uint64x2_t LoadElementIntoVec128<uint64x2_t>(const void* p) {
+inline uint64x2_t LoadElementIntoVec128</*bytes=*/sizeof(uint64x2_t)>(
+    const void* p) {
   return vreinterpretq_u64_u8(vld1q_u8(reinterpret_cast<const uint8_t*>(p)));
 }
 
-template <typename T, int lane>
+template <size_t bytes, int lane>
 inline void StoreElementFromVec128(void* p, uint64x2_t v) {
-  static_assert(sizeof(T) * (lane + 1) <= sizeof(uint64x2_t));
-  if constexpr (std::is_same_v<T, uint64x2_t>) {
+  static_assert(bytes * (lane + 1) <= sizeof(uint64x2_t));
+  if constexpr (bytes == sizeof(uint64x2_t)) {
+    static_assert(lane == 0);
     vst1q_u8(reinterpret_cast<uint8_t*>(p), vreinterpretq_u8_u64(v));
   } else {
-    T extracted;
-    if constexpr (std::is_same_v<T, uint64_t>) {
+    if constexpr (bytes == sizeof(uint64_t)) {
       // Ideally, we would use `vst1q_lane_u64` but it assumes that its input is
       // aligned to a 64-bit boundary. We can only promise 8-bit aligned. That
       // said, this sequence will compile to `st1 {vt.d}[lane], [xn]` but
       // without an alignment hint.
-      extracted = vgetq_lane_u64(v, lane);
-    } else if constexpr (std::is_same_v<T, uint32_t>) {
+      uint64_t extracted = vgetq_lane_u64(v, lane);
+      memcpy(p, &extracted, sizeof(extracted));
+    } else if constexpr (bytes == sizeof(uint32_t)) {
       // Ideally, we would use `vst1q_lane_u32` but it assumes that its input is
       // aligned to a 32-bit boundary. We can only promise 8-bit aligned. That
       // said, this sequence will compile to `st1 {vt.s}[lane], [xn]` but
       // without an alignment hint.
-      extracted = vgetq_lane_u32(vreinterpretq_u32_u64(v), lane);
+      uint32_t extracted = vgetq_lane_u32(vreinterpretq_u32_u64(v), lane);
+      memcpy(p, &extracted, sizeof(extracted));
+    } else if constexpr (bytes == sizeof(uint16_t)) {
+      // Ideally, we would use `vst1q_lane_u16` but it assumes that its input is
+      // aligned to a 16-bit boundary. We can only promise 8-bit aligned. That
+      // said, this sequence will compile to `st1 {vt.h}[lane], [xn]` but
+      // without an alignment hint.
+      uint16_t extracted = vgetq_lane_u16(vreinterpretq_u16_u64(v), lane);
+      memcpy(p, &extracted, sizeof(extracted));
     } else {
-      static_assert(sizeof(T) == 0);
+      static_assert(bytes == 0);
     }
-    memcpy(p, &extracted, sizeof(extracted));
   }
 }
 #endif
@@ -474,41 +539,85 @@ inline std::array<T, N> UnpackStep(const std::array<T, N>& last_transpose) {
   return unpack;
 }
 
-template <size_t element_size, size_t step_size, size_t max_step_size,
+template <size_t element_size, size_t step_size, size_t unpack_limit,
           typename T, size_t N>
 inline std::array<T, N> UnpackSequence(const std::array<T, N>& last_transpose) {
-  if constexpr (element_size * step_size <= max_step_size) {
+  if constexpr (element_size * step_size < unpack_limit) {
     std::array<T, N> unpack =
         UnpackStep<element_size, step_size>(last_transpose);
-    return UnpackSequence<element_size, step_size * 2, max_step_size>(unpack);
+    return UnpackSequence<element_size, step_size * 2, unpack_limit>(unpack);
   }
   return last_transpose;
 }
 
+template <size_t element_size, size_t bs, typename T, size_t N>
+inline auto UnpackLowSequence(const std::array<T, N>& last_transpose) {
+  if constexpr (N > 1 && element_size * bs < sizeof(T)) {
+    static_assert(N % 2 == 0);
+    std::array<T, N / 2> unpack;
+    for (int i = 0; i < N; i += 2) {
+      unpack[i / 2] = Unpack<element_size, Extract::kLo>(last_transpose[i],
+                                                         last_transpose[i + 1]);
+    }
+    return UnpackLowSequence<element_size * 2, bs>(unpack);
+  } else {
+    return last_transpose;
+  }
+}
+
+template <size_t bytes, size_t... lane>
+inline void StoreElementsFromVec128(char* b, int64_t ldb, Vec128 x, size_t i,
+                                    std::index_sequence<lane...>) {
+  (StoreElementFromVec128</*bytes=*/bytes, lane>(b + ldb * (i + lane), x), ...);
+}
+
 template <typename T, int bs>
-struct Vec128SquareTransposeMicroKernelImpl {
+struct Vec128RectangularTransposeMicroKernelImpl {
   XLA_FLATTEN static void Apply(const char* __restrict a, int64_t lda,
                                 char* __restrict b, int64_t ldb) {
     constexpr size_t element_size = sizeof(T);
-    static_assert(element_size <= 16);
-    static_assert(16 % element_size == 0);
-    static_assert(bs * element_size == sizeof(Vec128));
-    std::array<Vec128, bs> last_transpose;
+    static_assert(sizeof(Vec128) % element_size == 0);
+    std::array<Vec128, bs> loads;
+    // `loads`:
+    // [  0,  1,  2,  3 ]
+    // [  4,  5,  6,  7 ]
+    // [  8,  9, 10, 11 ]
+    // [ 12, 13, 14, 15 ]
     XLA_UNROLL
     for (int i = 0; i < bs; ++i) {
-      last_transpose[i] = LoadElementIntoVec128<Vec128>(a + lda * i);
+      loads[i] = LoadElementIntoVec128<element_size * bs>(a + lda * i);
     }
 
-    last_transpose =
-        UnpackSequence<element_size, /*step_size=*/1, /*max_step_size=*/8>(
-            last_transpose);
+    // Each load may not have filled the entire vector. Combine rows to get a
+    // fully populated vector.
+    auto last_transpose = UnpackLowSequence<element_size, bs>(loads);
 
+    constexpr int kBytesInMatrix = element_size * bs * bs;
+    static_assert(kBytesInMatrix <= sizeof(Vec128) * last_transpose.size());
+    static_assert(bs % last_transpose.size() == 0);
+    constexpr int kStoresPerCombinedRow = bs / last_transpose.size();
+
+    // Each row of the matrix originally occupied some fraction of a vector.
+    // We may need to finish the transpose if `last_transpose.size() > 1`.
+    // `last_transpose`:
+    // [ 0,  4,  1,  5,  2,  6,  3,  7 ]
+    // [ 8, 12,  9, 13, 10, 14, 11, 15 ]
+    last_transpose =
+        UnpackSequence<kBytesInMatrix / (bs * last_transpose.size()),
+                       /*step_size=*/1,
+                       /*unpack_limit=*/kBytesInMatrix / bs>(last_transpose);
+    // `last_transpose`:
+    // [ 0,  4,  8, 12,  1,  5,  9, 13 ]
+    // [ 2,  6, 10, 14,  3,  7, 11, 15 ]
     XLA_UNROLL
-    for (int i = 0; i < bs; ++i) {
-      StoreElementFromVec128<Vec128, 0>(b + ldb * i, last_transpose[i]);
+    for (size_t i = 0; i < last_transpose.size(); ++i) {
+      StoreElementsFromVec128<element_size * bs>(
+          b, ldb, last_transpose[i], i * kStoresPerCombinedRow,
+          std::make_index_sequence<kStoresPerCombinedRow>{});
     }
   }
 };
+
 #endif
 
 #ifdef __AVX__
@@ -517,9 +626,9 @@ struct AvxSquareTransposeMicroKernelImpl {
   XLA_FLATTEN static void Apply(const char* __restrict a, int64_t lda,
                                 char* __restrict b, int64_t ldb) {
     constexpr size_t element_size = sizeof(T);
-    static_assert(element_size <= 16);
-    static_assert(16 % element_size == 0);
-    static_assert(bs * element_size == sizeof(__m256i));
+    static_assert(element_size <= sizeof(__m128i));
+    static_assert(sizeof(__m128i) % element_size == 0);
+    static_assert(element_size * bs == sizeof(__m256i));
     std::array<__m256i, bs> last_transpose;
     XLA_UNROLL
     for (int i = 0; i < bs / 2; ++i) {
@@ -535,8 +644,8 @@ struct AvxSquareTransposeMicroKernelImpl {
     }
 
     last_transpose =
-        UnpackSequence<element_size, /*step_size=*/1, /*max_step_size=*/8>(
-            last_transpose);
+        UnpackSequence<element_size, /*step_size=*/1,
+                       /*unpack_limit=*/sizeof(__m128i)>(last_transpose);
 
     XLA_UNROLL
     for (int i = 0; i < bs; ++i) {
@@ -553,9 +662,9 @@ struct AvxRectangularTransposeMicroKernelImpl {
   XLA_FLATTEN static void Apply(const char* __restrict a, int64_t lda,
                                 char* __restrict b, int64_t ldb) {
     constexpr size_t element_size = sizeof(T);
-    static_assert(element_size <= 16);
-    static_assert(16 % element_size == 0);
-    static_assert(bs * element_size * 2 == sizeof(__m256i));
+    static_assert(element_size <= sizeof(__m128i));
+    static_assert(sizeof(__m128i) % element_size == 0);
+    static_assert(element_size * bs * 2 == sizeof(__m256i));
     std::array<__m256i, bs / 2> last_transpose;
     XLA_UNROLL
     for (int i = 0; i < bs / 2; ++i) {
@@ -566,10 +675,10 @@ struct AvxRectangularTransposeMicroKernelImpl {
     }
 
     last_transpose =
-        UnpackSequence<element_size, /*step_size=*/1, /*max_step_size=*/4>(
-            last_transpose);
+        UnpackSequence<element_size, /*step_size=*/1,
+                       /*unpack_limit=*/sizeof(__m128i) / 2>(last_transpose);
 
-    if constexpr (element_size <= 8) {
+    if constexpr (element_size <= sizeof(__m128i) / 2) {
       XLA_UNROLL
       for (int i = 0; i < bs / 2; ++i) {
 #if defined(__AVX2__)
@@ -597,39 +706,26 @@ struct AvxRectangularTransposeMicroKernelImpl {
 };
 #endif
 
+#ifdef XLA_HAS_VEC128
+template <>
+struct TransposeMicroKernel<uint8_t, /*bs=*/2> {
+  XLA_FLATTEN static void Apply(const char* __restrict a, int64_t lda,
+                                char* __restrict b, int64_t ldb) {
+    using T = uint8_t;
+    constexpr int bs = 2;
+    Vec128RectangularTransposeMicroKernelImpl<T, bs>::Apply(a, lda, b, ldb);
+  }
+};
+#endif
+
 #ifdef XLA_HAS_VEC128
 template <>
 struct TransposeMicroKernel<uint8_t, /*bs=*/4> {
-  XLA_FLATTEN XLA_FLATTEN static void Apply(const char* __restrict a,
-                                            int64_t lda, char* __restrict b,
-                                            int64_t ldb) {
+  XLA_FLATTEN static void Apply(const char* __restrict a, int64_t lda,
+                                char* __restrict b, int64_t ldb) {
     using T = uint8_t;
     constexpr int bs = 4;
-    constexpr size_t element_size = sizeof(T);
-    std::array<Vec128, bs> loads;
-    // [  0,  1,  2,  3 ]
-    // [  4,  5,  6,  7 ]
-    // [  8,  9, 10, 11 ]
-    // [ 12, 13, 14, 15 ]
-    XLA_UNROLL
-    for (int i = 0; i < bs; ++i) {
-      loads[i] = LoadElementIntoVec128<uint32_t>(a + lda * i);
-    }
-    // [  0,  4,  1,  5,  2,  6,  3,  7 ]
-    Vec128 x_0_1 = Unpack<element_size, Extract::kLo>(loads[0], loads[1]);
-    // [  8, 12,  9, 13, 10, 14, 11, 15 ]
-    Vec128 x_2_3 = Unpack<element_size, Extract::kLo>(loads[2], loads[3]);
-    // [  0,  4,  8, 12,  1,  5,  9, 13,  2,  6, 10, 14,  3,  7, 11, 15 ]
-    Vec128 x = Unpack<element_size * 2, Extract::kLo>(x_0_1, x_2_3);
-
-    // [  0,  4,  8, 12 ]
-    StoreElementFromVec128<uint32_t, 0>(b + ldb * 0, x);
-    // [  1,  5,  9, 13 ]
-    StoreElementFromVec128<uint32_t, 1>(b + ldb * 1, x);
-    // [  2,  6, 10, 14 ]
-    StoreElementFromVec128<uint32_t, 2>(b + ldb * 2, x);
-    // [  3,  7, 11, 15 ]
-    StoreElementFromVec128<uint32_t, 3>(b + ldb * 3, x);
+    Vec128RectangularTransposeMicroKernelImpl<T, bs>::Apply(a, lda, b, ldb);
   }
 };
 #endif
@@ -637,75 +733,11 @@ struct TransposeMicroKernel<uint8_t, /*bs=*/4> {
 #ifdef XLA_HAS_VEC128
 template <>
 struct TransposeMicroKernel<uint8_t, /*bs=*/8> {
-  XLA_FLATTEN XLA_FLATTEN static void Apply(const char* __restrict a,
-                                            int64_t lda, char* __restrict b,
-                                            int64_t ldb) {
+  XLA_FLATTEN static void Apply(const char* __restrict a, int64_t lda,
+                                char* __restrict b, int64_t ldb) {
     using T = uint8_t;
     constexpr int bs = 8;
-    constexpr size_t element_size = sizeof(T);
-    // To help understand each step, let's show the contents of our SIMD
-    // vectors.
-    // The numbers shown are in octal and represent the source position from the
-    // input in (row, column) format.
-    //
-    // [00, 01, 02, 03, 04, 05, 06, 07],
-    // [10, 11, 12, 13, 14, 15, 16, 17],
-    // [20, 21, 22, 23, 24, 25, 26, 27],
-    // [30, 31, 32, 33, 34, 35, 36, 37],
-    // [40, 41, 42, 43, 44, 45, 46, 47],
-    // [50, 51, 52, 53, 54, 55, 56, 57],
-    // [60, 61, 62, 63, 64, 65, 66, 67],
-    // [70, 71, 72, 73, 74, 75, 76, 77],
-    std::array<Vec128, bs> loads;
-    XLA_UNROLL
-    for (int i = 0; i < bs; ++i) {
-      loads[i] = LoadElementIntoVec128<uint64_t>(a + lda * i);
-    }
-
-    // Pack adjacent loads together into one SIMD vector by interleaving the
-    // lanes.
-    //
-    // [00, 10, 01, 11, 02, 12, 03, 13, 04, 14, 05, 15, 06, 16, 07, 17],
-    // [20, 30, 21, 31, 22, 32, 23, 33, 24, 34, 25, 35, 26, 36, 27, 37],
-    // [40, 50, 41, 51, 42, 52, 43, 53, 44, 54, 45, 55, 46, 56, 47, 57],
-    // [60, 70, 61, 71, 62, 72, 63, 73, 64, 74, 65, 75, 66, 76, 67, 77],
-    // In effect, we are splitting each SIMD vector into two blocks of 8
-    // elements, then interleaving the elements.
-    std::array<Vec128, bs / 2> last_transpose;
-    XLA_UNROLL
-    for (int i = 0; i < bs / 2; ++i) {
-      // There is no need for `Unpack<1, Extract::kHi>` as the high half of the
-      // two vectors contains zeros.
-      last_transpose[i] =
-          Unpack<element_size, Extract::kLo>(loads[i * 2], loads[i * 2 + 1]);
-    }
-
-    // [00, 10, 20, 30, 40, 50, 60, 70, 01, 11, 21, 31, 41, 51, 61, 71],
-    // [02, 12, 22, 32, 42, 52, 62, 72, 03, 13, 23, 33, 43, 53, 63, 73],
-    // [04, 14, 24, 34, 44, 54, 64, 74, 05, 15, 25, 35, 45, 55, 65, 75],
-    // [06, 16, 26, 36, 46, 56, 66, 76, 07, 17, 27, 37, 47, 57, 67, 77],
-    last_transpose =
-        UnpackSequence<element_size * 2, /*step_size=*/1, /*max_step_size=*/4>(
-            last_transpose);
-
-    // We have two rows stored in our 128-bit SIMD vector but our block size
-    // is 64-bit, unpack and do two stores.
-    //
-    // [00, 10, 20, 30, 40, 50, 60, 70],
-    // [01, 11, 21, 31, 41, 51, 61, 71],
-    // [02, 12, 22, 32, 42, 52, 62, 72],
-    // [03, 13, 23, 33, 43, 53, 63, 73],
-    // [04, 14, 24, 34, 44, 54, 64, 74],
-    // [05, 15, 25, 35, 45, 55, 65, 75],
-    // [06, 16, 26, 36, 46, 56, 66, 76],
-    // [07, 17, 27, 37, 47, 57, 67, 77],
-    XLA_UNROLL
-    for (int i = 0; i < bs; i += 2) {
-      StoreElementFromVec128<uint64_t, 0>(b + ldb * (i + 0),
-                                          last_transpose[i / 2]);
-      StoreElementFromVec128<uint64_t, 1>(b + ldb * (i + 1),
-                                          last_transpose[i / 2]);
-    }
+    Vec128RectangularTransposeMicroKernelImpl<T, bs>::Apply(a, lda, b, ldb);
   }
 };
 #endif
@@ -713,20 +745,30 @@ struct TransposeMicroKernel<uint8_t, /*bs=*/8> {
 #ifdef __AVX__
 template <>
 struct TransposeMicroKernel<uint8_t, /*bs=*/16> {
-  XLA_FLATTEN XLA_FLATTEN static void Apply(const char* __restrict a,
-                                            int64_t lda, char* __restrict b,
-                                            int64_t ldb) {
+  XLA_FLATTEN static void Apply(const char* __restrict a, int64_t lda,
+                                char* __restrict b, int64_t ldb) {
     AvxRectangularTransposeMicroKernelImpl<uint8_t, 16>::Apply(a, lda, b, ldb);
   }
 };
 #elif defined(XLA_HAS_VEC128)
 template <>
 struct TransposeMicroKernel<uint8_t, /*bs=*/16> {
-  XLA_FLATTEN XLA_FLATTEN static void Apply(const char* __restrict a,
-                                            int64_t lda, char* __restrict b,
-                                            int64_t ldb) {
-    Vec128SquareTransposeMicroKernelImpl<uint8_t, /*bs=*/16>::Apply(a, lda, b,
-                                                                    ldb);
+  XLA_FLATTEN static void Apply(const char* __restrict a, int64_t lda,
+                                char* __restrict b, int64_t ldb) {
+    Vec128RectangularTransposeMicroKernelImpl<uint8_t, /*bs=*/16>::Apply(
+        a, lda, b, ldb);
+  }
+};
+#endif
+
+#ifdef XLA_HAS_VEC128
+template <>
+struct TransposeMicroKernel<uint16_t, /*bs=*/2> {
+  XLA_FLATTEN static void Apply(const char* __restrict a, int64_t lda,
+                                char* __restrict b, int64_t ldb) {
+    using T = uint16_t;
+    constexpr int bs = 2;
+    Vec128RectangularTransposeMicroKernelImpl<T, bs>::Apply(a, lda, b, ldb);
   }
 };
 #endif
@@ -734,48 +776,11 @@ struct TransposeMicroKernel<uint8_t, /*bs=*/16> {
 #ifdef XLA_HAS_VEC128
 template <>
 struct TransposeMicroKernel<uint16_t, /*bs=*/4> {
-  XLA_FLATTEN XLA_FLATTEN static void Apply(const char* __restrict a,
-                                            int64_t lda, char* __restrict b,
-                                            int64_t ldb) {
+  XLA_FLATTEN static void Apply(const char* __restrict a, int64_t lda,
+                                char* __restrict b, int64_t ldb) {
     using T = uint16_t;
     constexpr int bs = 4;
-    constexpr size_t element_size = sizeof(T);
-    // Note, Vec128 vectors can hold 8 uint16_t elements but our block size
-    // is 4. We need to issue 4 loads to properly handle strides.
-    //
-    // [ 0,  1,  2,  3],
-    // [ 4,  5,  6,  7],
-    // [ 8,  9, 10, 11],
-    // [12, 13, 14, 15],
-    std::array<Vec128, bs> loads;
-    XLA_UNROLL
-    for (int i = 0; i < bs; ++i) {
-      loads[i] = LoadElementIntoVec128<uint64_t>(a + lda * i);
-    }
-
-    // [ 0,  4,  1,  5,  2,  6,  3,  7 ]
-    auto unpack_16_0 = Unpack<element_size, Extract::kLo>(loads[0], loads[1]);
-    // [ 8, 12,  9, 13, 10, 14, 11, 15 ]
-    auto unpack_16_1 = Unpack<element_size, Extract::kLo>(loads[2], loads[3]);
-
-    // Note: there is no need for `Unpack<2, Extract::kHi>` as we only populate
-    // the bottom 4 lanes.
-
-    // [ 0,  4,  8, 12,  1,  5,  9, 13 ]
-    auto unpack_32_0 =
-        Unpack<element_size * 2, Extract::kLo>(unpack_16_0, unpack_16_1);
-    // [ 2,  6, 10, 14,  3,  7, 11, 15 ]
-    auto unpack_32_1 =
-        Unpack<element_size * 2, Extract::kHi>(unpack_16_0, unpack_16_1);
-
-    // [ 0,  4,  8, 12 ]
-    StoreElementFromVec128<uint64_t, 0>(b + ldb * 0, unpack_32_0);
-    // [ 1,  5,  9, 13 ]
-    StoreElementFromVec128<uint64_t, 1>(b + ldb * 1, unpack_32_0);
-    // [ 2,  6, 10, 14 ]
-    StoreElementFromVec128<uint64_t, 0>(b + ldb * 2, unpack_32_1);
-    // [ 3,  7, 11, 15 ]
-    StoreElementFromVec128<uint64_t, 1>(b + ldb * 3, unpack_32_1);
+    Vec128RectangularTransposeMicroKernelImpl<T, bs>::Apply(a, lda, b, ldb);
   }
 };
 #endif
@@ -783,20 +788,18 @@ struct TransposeMicroKernel<uint16_t, /*bs=*/4> {
 #if defined(__AVX__)
 template <>
 struct TransposeMicroKernel<uint16_t, /*bs=*/8> {
-  XLA_FLATTEN XLA_FLATTEN static void Apply(const char* __restrict a,
-                                            int64_t lda, char* __restrict b,
-                                            int64_t ldb) {
+  XLA_FLATTEN static void Apply(const char* __restrict a, int64_t lda,
+                                char* __restrict b, int64_t ldb) {
     AvxRectangularTransposeMicroKernelImpl<uint16_t, 8>::Apply(a, lda, b, ldb);
   }
 };
 #elif defined(XLA_HAS_VEC128)
 template <>
 struct TransposeMicroKernel<uint16_t, /*bs=*/8> {
-  XLA_FLATTEN XLA_FLATTEN static void Apply(const char* __restrict a,
-                                            int64_t lda, char* __restrict b,
-                                            int64_t ldb) {
-    Vec128SquareTransposeMicroKernelImpl<uint16_t, /*bs=*/8>::Apply(a, lda, b,
-                                                                    ldb);
+  XLA_FLATTEN static void Apply(const char* __restrict a, int64_t lda,
+                                char* __restrict b, int64_t ldb) {
+    Vec128RectangularTransposeMicroKernelImpl<uint16_t, /*bs=*/8>::Apply(
+        a, lda, b, ldb);
   }
 };
 #endif
@@ -804,32 +807,41 @@ struct TransposeMicroKernel<uint16_t, /*bs=*/8> {
 #ifdef __AVX__
 template <>
 struct TransposeMicroKernel<uint16_t, /*bs=*/16> {
-  XLA_FLATTEN XLA_FLATTEN static void Apply(const char* __restrict a,
-                                            int64_t lda, char* __restrict b,
-                                            int64_t ldb) {
+  XLA_FLATTEN static void Apply(const char* __restrict a, int64_t lda,
+                                char* __restrict b, int64_t ldb) {
     AvxSquareTransposeMicroKernelImpl<uint16_t, /*bs=*/16>::Apply(a, lda, b,
                                                                   ldb);
   }
 };
 #endif
 
+#ifdef XLA_HAS_VEC128
+template <>
+struct TransposeMicroKernel<uint32_t, /*bs=*/2> {
+  XLA_FLATTEN static void Apply(const char* __restrict a, int64_t lda,
+                                char* __restrict b, int64_t ldb) {
+    using T = uint32_t;
+    constexpr int bs = 2;
+    Vec128RectangularTransposeMicroKernelImpl<T, bs>::Apply(a, lda, b, ldb);
+  }
+};
+#endif
+
 #ifdef __AVX__
 template <>
 struct TransposeMicroKernel<uint32_t, /*bs=*/4> {
-  XLA_FLATTEN XLA_FLATTEN static void Apply(const char* __restrict a,
-                                            int64_t lda, char* __restrict b,
-                                            int64_t ldb) {
+  XLA_FLATTEN static void Apply(const char* __restrict a, int64_t lda,
+                                char* __restrict b, int64_t ldb) {
     AvxRectangularTransposeMicroKernelImpl<uint32_t, 4>::Apply(a, lda, b, ldb);
   }
 };
 #elif defined(XLA_HAS_VEC128)
 template <>
 struct TransposeMicroKernel<uint32_t, /*bs=*/4> {
-  XLA_FLATTEN XLA_FLATTEN static void Apply(const char* __restrict a,
-                                            int64_t lda, char* __restrict b,
-                                            int64_t ldb) {
-    Vec128SquareTransposeMicroKernelImpl<uint32_t, /*bs=*/4>::Apply(a, lda, b,
-                                                                    ldb);
+  XLA_FLATTEN static void Apply(const char* __restrict a, int64_t lda,
+                                char* __restrict b, int64_t ldb) {
+    Vec128RectangularTransposeMicroKernelImpl<uint32_t, /*bs=*/4>::Apply(
+        a, lda, b, ldb);
   }
 };
 #endif
@@ -837,9 +849,8 @@ struct TransposeMicroKernel<uint32_t, /*bs=*/4> {
 #ifdef __AVX__
 template <>
 struct TransposeMicroKernel<uint32_t, /*bs=*/8> {
-  XLA_FLATTEN XLA_FLATTEN static void Apply(const char* __restrict a,
-                                            int64_t lda, char* __restrict b,
-                                            int64_t ldb) {
+  XLA_FLATTEN static void Apply(const char* __restrict a, int64_t lda,
+                                char* __restrict b, int64_t ldb) {
     AvxSquareTransposeMicroKernelImpl<uint32_t, /*bs=*/8>::Apply(a, lda, b,
                                                                  ldb);
   }
@@ -849,11 +860,10 @@ struct TransposeMicroKernel<uint32_t, /*bs=*/8> {
 #ifdef XLA_HAS_VEC128
 template <>
 struct TransposeMicroKernel<uint64_t, /*bs=*/2> {
-  XLA_FLATTEN XLA_FLATTEN static void Apply(const char* __restrict a,
-                                            int64_t lda, char* __restrict b,
-                                            int64_t ldb) {
-    Vec128SquareTransposeMicroKernelImpl<uint64_t, /*bs=*/2>::Apply(a, lda, b,
-                                                                    ldb);
+  XLA_FLATTEN static void Apply(const char* __restrict a, int64_t lda,
+                                char* __restrict b, int64_t ldb) {
+    Vec128RectangularTransposeMicroKernelImpl<uint64_t, /*bs=*/2>::Apply(
+        a, lda, b, ldb);
   }
 };
 #endif
@@ -861,9 +871,8 @@ struct TransposeMicroKernel<uint64_t, /*bs=*/2> {
 #ifdef __AVX__
 template <>
 struct TransposeMicroKernel<uint64_t, /*bs=*/4> {
-  XLA_FLATTEN XLA_FLATTEN static void Apply(const char* __restrict a,
-                                            int64_t lda, char* __restrict b,
-                                            int64_t ldb) {
+  XLA_FLATTEN static void Apply(const char* __restrict a, int64_t lda,
+                                char* __restrict b, int64_t ldb) {
     AvxSquareTransposeMicroKernelImpl<uint64_t, /*bs=*/4>::Apply(a, lda, b,
                                                                  ldb);
   }

From 59d97aece0420ab1576ff9adc469e04279e68766 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 2 Oct 2023 15:50:50 -0700
Subject: [PATCH 499/567] Add support for detecting the AllGather->DynamicSlice
 pattern.

PiperOrigin-RevId: 570203891
---
 third_party/xla/xla/service/BUILD             |   7 +-
 ...atter_utils.cc => collective_opt_utils.cc} | 141 +++++++++++-------
 ...scatter_utils.h => collective_opt_utils.h} |  27 +++-
 third_party/xla/xla/service/gpu/BUILD         |   2 +-
 .../service/gpu/gpu_reduce_scatter_creator.cc |   2 +-
 5 files changed, 120 insertions(+), 59 deletions(-)
 rename third_party/xla/xla/service/{reduce_scatter_utils.cc => collective_opt_utils.cc} (77%)
 rename third_party/xla/xla/service/{reduce_scatter_utils.h => collective_opt_utils.h} (52%)

diff --git a/third_party/xla/xla/service/BUILD b/third_party/xla/xla/service/BUILD
index f66de01f79d546..201a95e2ce3a4b 100644
--- a/third_party/xla/xla/service/BUILD
+++ b/third_party/xla/xla/service/BUILD
@@ -100,12 +100,13 @@ filegroup(
 )
 
 cc_library(
-    name = "reduce_scatter_utils",
-    srcs = ["reduce_scatter_utils.cc"],
-    hdrs = ["reduce_scatter_utils.h"],
+    name = "collective_opt_utils",
+    srcs = ["collective_opt_utils.cc"],
+    hdrs = ["collective_opt_utils.h"],
     visibility = ["//visibility:public"],
     deps = [
         "//xla/hlo/ir:hlo",
+        "@com_google_absl//absl/algorithm:container",
     ],
 )
 
diff --git a/third_party/xla/xla/service/reduce_scatter_utils.cc b/third_party/xla/xla/service/collective_opt_utils.cc
similarity index 77%
rename from third_party/xla/xla/service/reduce_scatter_utils.cc
rename to third_party/xla/xla/service/collective_opt_utils.cc
index 87938e868bb395..c83f1d7b5f51fd 100644
--- a/third_party/xla/xla/service/reduce_scatter_utils.cc
+++ b/third_party/xla/xla/service/collective_opt_utils.cc
@@ -13,19 +13,21 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "xla/service/reduce_scatter_utils.h"
+#include "xla/service/collective_opt_utils.h"
 
 #include <functional>
 #include <optional>
 #include <utility>
 #include <vector>
 
+#include "absl/algorithm/container.h"
+#include "xla/hlo/ir/hlo_instruction.h"
 #include "xla/hlo/ir/hlo_instructions.h"
 #include "xla/hlo/ir/hlo_module.h"
+#include "xla/hlo/ir/hlo_opcode.h"
 
 namespace xla {
 namespace {
-
 bool IsTableLookup(const HloInstruction* hlo) {
   while (hlo->opcode() == HloOpcode::kBitcast ||
          hlo->opcode() == HloOpcode::kReshape ||
@@ -87,21 +89,22 @@ int64_t GetIndexForId(const HloInstruction* index, int64_t id,
 bool IsPerIdOffsets(absl::Span<const HloInstruction*> offsets,
                     int64_t shard_size, const MapIdToTableOffset& map_id,
                     std::vector<int64_t> slice_group_sizes,
-                    const HloAllReduceInstruction* ar) {
+                    const HloInstruction* instruction, bool is_cross_module,
+                    bool use_global_device_ids) {
   if (offsets.size() != slice_group_sizes.size()) {
     return false;
   }
-  if (!ar->IsCrossModuleAllReduce() || !ar->use_global_device_ids()) {
+  if (!is_cross_module || !use_global_device_ids) {
     return false;
   }
 
-  int num_groups = ar->replica_groups().size();
+  int num_groups = instruction->replica_groups().size();
   int num_split_dims = slice_group_sizes.size();
 
   for (int64_t i = 0; i < num_groups; ++i) {
     for (int64_t j = 0; j < Product(slice_group_sizes); ++j) {
       int64_t final_table_entry = 0;
-      int64_t id = ar->replica_groups()[i].replica_ids(j);
+      int64_t id = instruction->replica_groups()[i].replica_ids(j);
       int64_t slice_group_size = Product(slice_group_sizes);
       for (int dim = 0; dim < num_split_dims; dim++) {
         auto scalar_offset = offsets[dim];
@@ -141,10 +144,10 @@ bool IsPerIdOffsets(absl::Span<const HloInstruction*> offsets,
 // Returns if `offset` == shard_size * id.
 bool IsPerIdOffset(const HloInstruction* offset, int64_t shard_size,
                    const MapIdToTableOffset& map_id, int64_t group_size,
-                   const HloAllReduceInstruction* ar) {
-  const bool iota_group =
-      ar->replica_groups().empty() ||
-      (ar->IsCrossModuleAllReduce() && !ar->use_global_device_ids());
+                   const HloInstruction* instruction, bool is_cross_module,
+                   bool use_global_device_ids) {
+  const bool iota_group = instruction->replica_groups().empty() ||
+                          (is_cross_module && !use_global_device_ids);
 
   if (offset->opcode() == HloOpcode::kMultiply) {
     // Check if it's constant * IsPerIdOffset(..., shard_size / constant, ...)
@@ -168,7 +171,8 @@ bool IsPerIdOffset(const HloInstruction* offset, int64_t shard_size,
       return false;
     }
     return IsPerIdOffset(offset->operand(1 - const_operand),
-                         shard_size / *multiplier, map_id, group_size, ar);
+                         shard_size / *multiplier, map_id, group_size,
+                         instruction, is_cross_module, use_global_device_ids);
   }
   if (shard_size == 1 && iota_group) {
     bool id_mapping_is_identity = true;
@@ -187,7 +191,7 @@ bool IsPerIdOffset(const HloInstruction* offset, int64_t shard_size,
       offset->opcode() == HloOpcode::kReshape ||
       offset->opcode() == HloOpcode::kCopy) {
     return IsPerIdOffset(offset->operand(0), shard_size, map_id, group_size,
-                         ar);
+                         instruction, is_cross_module, use_global_device_ids);
   }
 
   if (offset->opcode() == HloOpcode::kConvert &&
@@ -195,7 +199,7 @@ bool IsPerIdOffset(const HloInstruction* offset, int64_t shard_size,
       primitive_util::BitWidth(offset->operand(0)->shape().element_type()) <=
           primitive_util::BitWidth(offset->shape().element_type())) {
     return IsPerIdOffset(offset->operand(0), shard_size, map_id, group_size,
-                         ar);
+                         instruction, is_cross_module, use_global_device_ids);
   }
 
   if (offset->opcode() == HloOpcode::kClamp) {
@@ -208,16 +212,18 @@ bool IsPerIdOffset(const HloInstruction* offset, int64_t shard_size,
       return false;
     }
     return IsPerIdOffset(offset->operand(1), shard_size, map_id, group_size,
-                         ar);
+                         instruction, is_cross_module, use_global_device_ids);
   }
 
-  const int64_t num_groups = iota_group ? 1 : ar->replica_groups().size();
+  const int64_t num_groups =
+      iota_group ? 1 : instruction->replica_groups().size();
   if (IsTableLookup(offset)) {
     // Check the values of the offset table, and see if they are shard_index *
     // shard_size.
     for (int64_t i = 0; i < num_groups; ++i) {
       for (int64_t j = 0; j < group_size; ++j) {
-        int64_t id = iota_group ? j : ar->replica_groups()[i].replica_ids(j);
+        int64_t id =
+            iota_group ? j : instruction->replica_groups()[i].replica_ids(j);
         int64_t table_index = GetIndexForId(offset->operand(1), id, map_id);
         if (table_index < 0) {
           VLOG(2) << "Failed to infer table index from "
@@ -246,7 +252,8 @@ bool IsPerIdOffset(const HloInstruction* offset, int64_t shard_size,
   // Check if the offset is the id itself and it has the right values.
   for (int64_t i = 0; i < num_groups; ++i) {
     for (int64_t j = 0; j < group_size; ++j) {
-      int64_t id = iota_group ? j : ar->replica_groups()[i].replica_ids(j);
+      int64_t id =
+          iota_group ? j : instruction->replica_groups()[i].replica_ids(j);
       int mapped_id = map_id(offset, id);
       if (mapped_id != shard_size * j) {
         VLOG(2) << "Mapping of " << id << " to " << mapped_id
@@ -267,37 +274,69 @@ std::optional<ReduceScatterSpec> MatchReduceScatter(
     int64_t num_replicas, bool allow_multiple_split_dims,
     bool allow_intervening_reshape, int64_t min_rank,
     HloPredicate match_partition_id, HloPredicate match_replica_id) {
-  if (!ar->shape().IsArray() || ar->constrain_layout() ||
-      (ar->IsCrossModuleAllReduce() &&
-       !ar->GetModule()->config().use_spmd_partitioning())) {
-    VLOG(2) << "Unsupported all-reduce: " << ar->ToString();
+  auto spec = MatchWithDynamicSlice(
+      ar, num_partitions, num_replicas, allow_multiple_split_dims,
+      allow_intervening_reshape, min_rank, match_partition_id, match_replica_id,
+      ar->constrain_layout(), ar->use_global_device_ids(),
+      ar->channel_id() && ar->opcode() == HloOpcode::kAllReduce);
+  return spec;
+}
+
+bool AllGatherDynamicSliceCancellation(
+    const HloAllGatherInstruction* ag, int64_t num_partitions,
+    int64_t num_replicas, bool allow_multiple_split_dims,
+    bool allow_intervening_reshape, int64_t min_rank,
+    HloPredicate match_partition_id, HloPredicate match_replica_id) {
+  auto spec = MatchWithDynamicSlice(
+      ag, num_partitions, num_replicas, allow_multiple_split_dims,
+      allow_intervening_reshape, min_rank, match_partition_id, match_replica_id,
+      ag->constrain_layout(), ag->use_global_device_ids(),
+      ag->channel_id() && ag->opcode() == HloOpcode::kAllGather);
+  if (spec.has_value()) {
+    return true;
+  }
+  return false;
+}
+
+std::optional<ReduceScatterSpec> MatchWithDynamicSlice(
+    const HloChannelInstruction* instruction, int64_t num_partitions,
+    int64_t num_replicas, bool allow_multiple_split_dims,
+    bool allow_intervening_reshape, int64_t min_rank,
+    HloPredicate match_partition_id, HloPredicate match_replica_id,
+    bool is_constrain_layout, bool use_global_device_ids,
+    bool is_cross_module) {
+  if (!instruction->shape().IsArray() || is_constrain_layout ||
+      (is_cross_module &&
+       !instruction->GetModule()->config().use_spmd_partitioning())) {
+    VLOG(2) << "Unsupported collective: " << instruction->ToString();
     return std::nullopt;
   }
-  if (ar->shape().rank() - absl::c_count(ar->shape().dimensions(), 1) <
+  if (instruction->shape().rank() -
+          absl::c_count(instruction->shape().dimensions(), 1) <
       min_rank) {
     VLOG(2) << " Should be at least rank-" << min_rank
-            << " excluding trivial dimensions " << ar->ToString();
+            << " excluding trivial dimensions " << instruction->ToString();
     return std::nullopt;
   }
-  if (ar->user_count() != 1) {
-    VLOG(2) << "All-reduce user_count > 1 " << ar->ToString();
+  if (instruction->user_count() != 1) {
+    VLOG(2) << "All-gather user_count > 1 " << instruction->ToString();
     return std::nullopt;
   }
-  if (ar->replica_groups().size() > 1) {
-    const int64_t size = ar->replica_groups()[0].replica_ids_size();
-    absl::Span<const ReplicaGroup> rgs = ar->replica_groups();
+  if (instruction->replica_groups().size() > 1) {
+    const int64_t size = instruction->replica_groups()[0].replica_ids_size();
+    absl::Span<const ReplicaGroup> rgs = instruction->replica_groups();
     const bool has_uniform_size = absl::c_all_of(
         rgs.subspan(1, size - 1), [size](const ReplicaGroup& group) {
           return group.replica_ids_size() == size;
         });
     if (!has_uniform_size) {
       VLOG(2) << "Unsupported non-uniform replica group size "
-              << ar->ToString();
+              << instruction->ToString();
       return std::nullopt;
     }
   }
 
-  HloInstruction* user = ar->users()[0];
+  HloInstruction* user = instruction->users()[0];
   HloInstruction* reshape = nullptr;
   if (allow_intervening_reshape && user->opcode() == HloOpcode::kReshape) {
     // Allow the intervening reshape if it reshapes just the non scattered
@@ -318,22 +357,22 @@ std::optional<ReduceScatterSpec> MatchReduceScatter(
   int64_t group_size;
   MapIdToTableOffset map_id;
   spec.dynamic_slice = user;
-  if (!ar->IsCrossModuleAllReduce()) {
+  if (!is_cross_module) {
     spec.sharded_replicas = num_replicas;
-    group_size = ar->replica_groups().empty()
+    group_size = instruction->replica_groups().empty()
                      ? num_replicas
-                     : ar->replica_groups()[0].replica_ids_size();
+                     : instruction->replica_groups()[0].replica_ids_size();
     map_id = [&](const HloInstruction* hlo, int64_t id) {
       return match_replica_id(hlo) ? id : -1;
     };
-  } else if (ar->use_global_device_ids()) {
+  } else if (use_global_device_ids) {
     spec.sharded_replicas = num_replicas;
     spec.sharded_partitions = num_partitions;
-    group_size = ar->replica_groups()[0].replica_ids_size();
+    group_size = instruction->replica_groups()[0].replica_ids_size();
     bool orthogonal_replicas = true;
     std::vector<int64_t> partition_id_to_index(num_partitions, -1);
-    for (int64_t g = 0; g < ar->replica_groups().size(); ++g) {
-      const auto& group = ar->replica_groups()[g];
+    for (int64_t g = 0; g < instruction->replica_groups().size(); ++g) {
+      const auto& group = instruction->replica_groups()[g];
       for (int64_t i = 0; i < group.replica_ids_size(); ++i) {
         int64_t global_id = group.replica_ids(i);
         int64_t partition_id = global_id % num_partitions;
@@ -380,11 +419,11 @@ std::optional<ReduceScatterSpec> MatchReduceScatter(
   } else {
     // Right now all cross-partition all-reduces' subgroups refer to replicas
     // unless they use use_global_device_ids.
-    if (ar->replica_groups().size() != num_replicas ||
-        ar->replica_groups()[0].replica_ids_size() != 1) {
+    if (instruction->replica_groups().size() != num_replicas ||
+        instruction->replica_groups()[0].replica_ids_size() != 1) {
       VLOG(2) << "Unsupported size > 1 replica groups for cross-partition, "
                  "non-global ID "
-              << ar->ToString();
+              << instruction->ToString();
       return std::nullopt;
     }
     spec.sharded_partitions = num_partitions;
@@ -394,7 +433,7 @@ std::optional<ReduceScatterSpec> MatchReduceScatter(
     };
   }
   if (group_size < 2) {
-    VLOG(2) << "Group_size < 2, nothing to do " << ar->ToString();
+    VLOG(2) << "Group_size < 2, nothing to do " << instruction->ToString();
     return std::nullopt;
   }
   spec.group_size = group_size;
@@ -403,8 +442,8 @@ std::optional<ReduceScatterSpec> MatchReduceScatter(
   // First find a single dimension where the input and output of dynamic slice
   // differ.
   int num_dims = 0;
-  for (int64_t dim = 0; dim < ar->shape().rank(); ++dim) {
-    if (ar->shape().dimensions(dim) == user->shape().dimensions(dim)) {
+  for (int64_t dim = 0; dim < instruction->shape().rank(); ++dim) {
+    if (instruction->shape().dimensions(dim) == user->shape().dimensions(dim)) {
       continue;
     }
     num_dims++;
@@ -441,19 +480,18 @@ std::optional<ReduceScatterSpec> MatchReduceScatter(
       spec.split_dim = dim;
     }
   }
-
   std::vector<int64_t> group_sizes;
   group_sizes.reserve(split_dims.size());
   for (auto dim : split_dims) {
     group_sizes.push_back(user->operand(0)->shape().dimensions(dim) /
                           user->dynamic_slice_sizes()[dim]);
   }
+
   if (Product(group_sizes) != group_size) {
     VLOG(2) << "Group size mismatch " << user->ToString() << " vs "
-            << ar->ToString();
+            << instruction->ToString();
     return std::nullopt;
   }
-
   if (split_dims.size() > 1) {
     std::vector<const HloInstruction*> offsets;
     int shard_size = 1;
@@ -461,17 +499,18 @@ std::optional<ReduceScatterSpec> MatchReduceScatter(
       offsets.push_back(user->operand(dim + 1));
       shard_size *= user->dynamic_slice_sizes()[dim];
     }
-
     if (!IsPerIdOffsets(absl::MakeSpan(offsets), shard_size, map_id,
-                        group_sizes, ar)) {
-      VLOG(2) << "IsPerIdOffsets() failed " << ar->ToString();
+                        group_sizes, instruction, is_cross_module,
+                        use_global_device_ids)) {
+      VLOG(2) << "IsPerIdOffsets() failed " << instruction->ToString();
       return std::nullopt;
     }
   } else {
     if (!IsPerIdOffset(user->operand(spec.split_dim + 1),
                        user->dynamic_slice_sizes()[spec.split_dim], map_id,
-                       group_size, ar)) {
-      VLOG(2) << "IsPerIdOffsets() failed " << ar->ToString();
+                       group_size, instruction, is_cross_module,
+                       use_global_device_ids)) {
+      VLOG(2) << "IsPerIdOffset() failed " << instruction->ToString();
       return std::nullopt;
     }
   }
diff --git a/third_party/xla/xla/service/reduce_scatter_utils.h b/third_party/xla/xla/service/collective_opt_utils.h
similarity index 52%
rename from third_party/xla/xla/service/reduce_scatter_utils.h
rename to third_party/xla/xla/service/collective_opt_utils.h
index 2ff610950e9f0b..d26c2ab1ad50dd 100644
--- a/third_party/xla/xla/service/reduce_scatter_utils.h
+++ b/third_party/xla/xla/service/collective_opt_utils.h
@@ -13,12 +13,14 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef XLA_SERVICE_REDUCE_SCATTER_UTILS_H_
-#define XLA_SERVICE_REDUCE_SCATTER_UTILS_H_
+#ifndef XLA_SERVICE_COLLECTIVE_OPT_UTILS_H_
+#define XLA_SERVICE_COLLECTIVE_OPT_UTILS_H_
 
+#include <cstdint>
 #include <optional>
 #include <vector>
 
+#include "xla/hlo/ir/hlo_instruction.h"
 #include "xla/hlo/ir/hlo_instructions.h"
 
 namespace xla {
@@ -40,6 +42,25 @@ std::optional<ReduceScatterSpec> MatchReduceScatter(
     HloPredicate match_partition_id = HloPredicateIsOp<HloOpcode::kPartitionId>,
     HloPredicate match_replica_id = HloPredicateIsOp<HloOpcode::kReplicaId>);
 
+// Check whether AG(ICI) and its single user DS(ICI) can be canceled out.
+bool AllGatherDynamicSliceCancellation(
+    const HloAllGatherInstruction* ag, int64_t num_partitions,
+    int64_t num_replicas, bool allow_multiple_split_dims = false,
+    bool allow_intervening_reshape = false, int64_t min_rank = 1,
+    HloPredicate match_partition_id = HloPredicateIsOp<HloOpcode::kPartitionId>,
+    HloPredicate match_replica_id = HloPredicateIsOp<HloOpcode::kReplicaId>);
+
+// Check if a given instruction (AllReduce or AllGather) matches a DynamicSlice;
+// the DynamicSlice has to be the only user of the given instruction.
+std::optional<ReduceScatterSpec> MatchWithDynamicSlice(
+    const HloChannelInstruction* instruction, int64_t num_partitions,
+    int64_t num_replicas, bool allow_multiple_split_dims = false,
+    bool allow_intervening_reshape = false, int64_t min_rank = 1,
+    HloPredicate match_partition_id = HloPredicateIsOp<HloOpcode::kPartitionId>,
+    HloPredicate match_replica_id = HloPredicateIsOp<HloOpcode::kReplicaId>,
+    bool is_constrain_layout = false, bool use_global_device_ids = false,
+    bool is_cross_module = false);
+
 }  // namespace xla
 
-#endif  // XLA_SERVICE_REDUCE_SCATTER_UTILS_H_
+#endif  // XLA_SERVICE_COLLECTIVE_OPT_UTILS_H_
diff --git a/third_party/xla/xla/service/gpu/BUILD b/third_party/xla/xla/service/gpu/BUILD
index a80f5514cb23af..d8812e9f76377b 100644
--- a/third_party/xla/xla/service/gpu/BUILD
+++ b/third_party/xla/xla/service/gpu/BUILD
@@ -2373,8 +2373,8 @@ cc_library(
     deps = [
         "//xla/hlo/ir:hlo",
         "//xla/hlo/utils:hlo_query",
+        "//xla/service:collective_opt_utils",
         "//xla/service:hlo_pass",
-        "//xla/service:reduce_scatter_utils",
     ],
 )
 
diff --git a/third_party/xla/xla/service/gpu/gpu_reduce_scatter_creator.cc b/third_party/xla/xla/service/gpu/gpu_reduce_scatter_creator.cc
index 9dfe3b23635693..9950581a10c182 100644
--- a/third_party/xla/xla/service/gpu/gpu_reduce_scatter_creator.cc
+++ b/third_party/xla/xla/service/gpu/gpu_reduce_scatter_creator.cc
@@ -21,7 +21,7 @@ limitations under the License.
 #include "xla/hlo/ir/hlo_module.h"
 #include "xla/hlo/ir/hlo_opcode.h"
 #include "xla/hlo/utils/hlo_query.h"
-#include "xla/service/reduce_scatter_utils.h"
+#include "xla/service/collective_opt_utils.h"
 
 namespace xla {
 namespace gpu {

From b39de08b5ee1c7795da483e6b57f5215832efed6 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 2 Oct 2023 16:06:31 -0700
Subject: [PATCH 500/567] Adds an experimental kwargs in TPUEmbeddingServing
 for restore.

PiperOrigin-RevId: 570207893
---
 tensorflow/python/tpu/tpu_embedding_for_serving.py          | 6 +++++-
 ...xperimental.embedding.-t-p-u-embedding-for-serving.pbtxt | 2 +-
 ...xperimental.embedding.-t-p-u-embedding-for-serving.pbtxt | 2 +-
 3 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/tensorflow/python/tpu/tpu_embedding_for_serving.py b/tensorflow/python/tpu/tpu_embedding_for_serving.py
index fb8a3205e1358c..2ce0afd231690d 100644
--- a/tensorflow/python/tpu/tpu_embedding_for_serving.py
+++ b/tensorflow/python/tpu/tpu_embedding_for_serving.py
@@ -88,7 +88,8 @@ class TPUEmbeddingForServing(tpu_embedding_base.TPUEmbeddingBase):
   def __init__(
       self,
       feature_config: Union[tpu_embedding_v2_utils.FeatureConfig, Iterable],  # pylint:disable=g-bare-generic
-      optimizer: Optional[tpu_embedding_v2_utils._Optimizer]):  # pylint:disable=protected-access
+      optimizer: Optional[tpu_embedding_v2_utils._Optimizer],  # pylint:disable=protected-access
+      experimental_sparsecore_restore_info: Optional[Dict[str, Any]] = None):
     """Creates the TPUEmbeddingForServing mid level API object.
 
     ```python
@@ -108,6 +109,9 @@ def __init__(
         may be set to None to avoid the creation of the optimizer slot
         variables, useful for optimizing memory consumption when exporting the
         model for serving where slot variables aren't needed.
+      experimental_sparsecore_restore_info: Information from the sparse core
+        training, required to restore from checkpoint for serving (e.g. the
+        number of TPU devices used, `num_tpu_devices`).
 
     Raises:
       RuntimeError: If created under TPUStrategy.
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.tpu.experimental.embedding.-t-p-u-embedding-for-serving.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.tpu.experimental.embedding.-t-p-u-embedding-for-serving.pbtxt
index 54e189cfe7c99f..33757deb1641a0 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.tpu.experimental.embedding.-t-p-u-embedding-for-serving.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.tpu.experimental.embedding.-t-p-u-embedding-for-serving.pbtxt
@@ -11,7 +11,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'feature_config\', \'optimizer\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'self\', \'feature_config\', \'optimizer\', \'experimental_sparsecore_restore_info\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "build"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.tpu.experimental.embedding.-t-p-u-embedding-for-serving.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.tpu.experimental.embedding.-t-p-u-embedding-for-serving.pbtxt
index 54e189cfe7c99f..33757deb1641a0 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.tpu.experimental.embedding.-t-p-u-embedding-for-serving.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.tpu.experimental.embedding.-t-p-u-embedding-for-serving.pbtxt
@@ -11,7 +11,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'feature_config\', \'optimizer\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'self\', \'feature_config\', \'optimizer\', \'experimental_sparsecore_restore_info\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "build"

From fa580a07c95756615439c0d9386321c226df3580 Mon Sep 17 00:00:00 2001
From: Sergey Kozub <sergeykozub@google.com>
Date: Mon, 2 Oct 2023 16:19:45 -0700
Subject: [PATCH 501/567] Add flag to allow printing more debug information in
 buffer allocation

PiperOrigin-RevId: 570210990
---
 third_party/xla/xla/debug_options_flags.cc       | 6 ++++++
 third_party/xla/xla/service/buffer_assignment.cc | 9 ++++-----
 third_party/xla/xla/service/buffer_assignment.h  | 2 +-
 third_party/xla/xla/service/gpu/gpu_compiler.cc  | 7 +++++--
 third_party/xla/xla/xla.proto                    | 5 ++++-
 5 files changed, 20 insertions(+), 9 deletions(-)

diff --git a/third_party/xla/xla/debug_options_flags.cc b/third_party/xla/xla/debug_options_flags.cc
index 1cbfc25daa9302..e2d27b6ff245bf 100644
--- a/third_party/xla/xla/debug_options_flags.cc
+++ b/third_party/xla/xla/debug_options_flags.cc
@@ -62,6 +62,7 @@ DebugOptions DefaultDebugOptionsIgnoringFlags() {
   opts.set_xla_dump_module_metadata(false);
   opts.set_xla_dump_hlo_as_long_text(false);
   opts.set_xla_dump_enable_mlir_pretty_form(true);
+  opts.set_xla_debug_buffer_assignment_show_max(15);
 #ifdef ENABLE_MKL
   opts.set_xla_cpu_use_mkl_dnn(true);
 #endif  // ENABLE_MKL
@@ -1301,6 +1302,11 @@ void MakeDebugOptionsFlags(std::vector<tsl::Flag>* flag_list,
               set_xla_gpu_filter_kernels_spilling_registers_on_autotuning),
       debug_options->xla_gpu_filter_kernels_spilling_registers_on_autotuning(),
       "Filter out kernels that spill registers during autotuning"));
+  flag_list->push_back(tsl::Flag(
+      "xla_debug_buffer_assignment_show_max",
+      int64_setter_for(&DebugOptions::set_xla_debug_buffer_assignment_show_max),
+      debug_options->xla_debug_buffer_assignment_show_max(),
+      "Number of buffers to display when debugging the buffer assignment"));
 }  // NOLINT(readability/fn_size)
 
 // Allocates flag_values and flag_objects; this function must not be called more
diff --git a/third_party/xla/xla/service/buffer_assignment.cc b/third_party/xla/xla/service/buffer_assignment.cc
index ae14bdf5f73d58..cf3e7e198bf891 100644
--- a/third_party/xla/xla/service/buffer_assignment.cc
+++ b/third_party/xla/xla/service/buffer_assignment.cc
@@ -839,16 +839,15 @@ std::vector<std::pair<int64_t, const HloValue*>> TopKPeakBuffers(
   return topk_descending;
 }
 
-std::string BufferAssignment::ToVerboseString() const {
-  // TODO(loreno): make this tunable via flag.
-  const size_t kMaxBuffersToShow = 15;
+std::string BufferAssignment::ToVerboseString(
+    size_t max_buffers_to_show) const {
   std::string output =
       absl::StrCat("BufferAssignment OOM Debugging.\n", stats_.ToString());
 
   std::vector<std::pair<int64_t, const HloValue*>> peak_buffers =
-      TopKPeakBuffers(kMaxBuffersToShow, allocations_);
+      TopKPeakBuffers(max_buffers_to_show, allocations_);
   std::vector<std::string> buf_strs;
-  for (size_t i = 0; i < std::min(kMaxBuffersToShow, peak_buffers.size());
+  for (size_t i = 0; i < std::min(max_buffers_to_show, peak_buffers.size());
        ++i) {
     const HloValue* value = peak_buffers[i].second;
     const HloInstruction* instr = value->instruction();
diff --git a/third_party/xla/xla/service/buffer_assignment.h b/third_party/xla/xla/service/buffer_assignment.h
index da46122da3b6f0..152035526c0228 100644
--- a/third_party/xla/xla/service/buffer_assignment.h
+++ b/third_party/xla/xla/service/buffer_assignment.h
@@ -472,7 +472,7 @@ class BufferAssignment {
   std::string ToString() const;
   // Verbose string tailored to debugging OOMs, includes the Hlo op metadata for
   // every buffer associated with each allocation.
-  std::string ToVerboseString() const;
+  std::string ToVerboseString(size_t max_buffers_to_show) const;
   std::string BufferInfoString() const;
 
   // Convert BufferAssignment to or from a proto.
diff --git a/third_party/xla/xla/service/gpu/gpu_compiler.cc b/third_party/xla/xla/service/gpu/gpu_compiler.cc
index ef883c47dc9706..3f5545c0294bf1 100644
--- a/third_party/xla/xla/service/gpu/gpu_compiler.cc
+++ b/third_party/xla/xla/service/gpu/gpu_compiler.cc
@@ -17,6 +17,7 @@ limitations under the License.
 
 #include <algorithm>
 #include <any>
+#include <cstddef>
 #include <cstdint>
 #include <functional>
 #include <memory>
@@ -1543,8 +1544,10 @@ StatusOr<std::unique_ptr<Executable>> GpuCompiler::RunBackend(
     buffer_assignment = std::move(compile_module_results.buffer_assignment);
     buffer_assignment_proto =
         std::make_unique<BufferAssignmentProto>(buffer_assignment->ToProto());
-    buffer_assignment_dumper = [buffer_assignment] {
-      return buffer_assignment->ToVerboseString();
+    size_t max_buffers_to_show =
+        module->config().debug_options().xla_debug_buffer_assignment_show_max();
+    buffer_assignment_dumper = [buffer_assignment, max_buffers_to_show] {
+      return buffer_assignment->ToVerboseString(max_buffers_to_show);
     };
   }
 
diff --git a/third_party/xla/xla/xla.proto b/third_party/xla/xla/xla.proto
index 69b00d3914bbe2..af4172ed2f73c2 100644
--- a/third_party/xla/xla/xla.proto
+++ b/third_party/xla/xla/xla.proto
@@ -630,7 +630,10 @@ message DebugOptions {
   // Filter out kernels that spill registers during autotuning.
   bool xla_gpu_filter_kernels_spilling_registers_on_autotuning = 250;
 
-  // Next id: 251
+  // Maximum number of buffers to print when debugging buffer assignment.
+  int64 xla_debug_buffer_assignment_show_max = 251;
+
+  // Next id: 252
 
   // Extra options to pass to the compilation backend (e.g. LLVM); specific
   // interpretation of these values is left to the backend.

From ba466831602c68ec066fb9257ebdfe294773f6cc Mon Sep 17 00:00:00 2001
From: Tongfei Guo <tongfei@google.com>
Date: Mon, 2 Oct 2023 16:51:08 -0700
Subject: [PATCH 502/567] [XLA:SPMD] Support sharding propagation/partitioning
 for variadic Scatter.

1. Make sharding prop work for variadic Scatter.
2. If variadic Scatter's operands do not share the same sharding, find a common sharding and do reshard.
3. Some code refactoring.
4. Tests.

PiperOrigin-RevId: 570218446
---
 third_party/xla/xla/hlo/utils/BUILD           |   3 +-
 .../xla/xla/hlo/utils/hlo_sharding_util.cc    |  55 +-
 .../xla/xla/hlo/utils/hlo_sharding_util.h     |   7 +-
 third_party/xla/xla/service/BUILD             |   1 +
 .../xla/xla/service/sharding_propagation.cc   | 313 +++++---
 .../xla/service/sharding_propagation_test.cc  | 748 ++++++++++++++++++
 .../service/spmd/gather_scatter_handler.cc    |  24 +-
 .../xla/service/spmd/spmd_partitioner_test.cc |  49 ++
 8 files changed, 1068 insertions(+), 132 deletions(-)

diff --git a/third_party/xla/xla/hlo/utils/BUILD b/third_party/xla/xla/hlo/utils/BUILD
index 5bbf24b2efdf9a..3d46d69bb498ae 100644
--- a/third_party/xla/xla/hlo/utils/BUILD
+++ b/third_party/xla/xla/hlo/utils/BUILD
@@ -1,11 +1,11 @@
 # Description:
 #   Implementation of XLA’s HLO utilities used for higher-level transformations.
 
-load("@local_tsl//tsl/platform:rules_cc.bzl", "cc_library")
 load(
     "//xla:xla.bzl",
     "xla_cc_test",
 )
+load("@local_tsl//tsl/platform:rules_cc.bzl", "cc_library")
 
 package(
     default_visibility = ["//visibility:public"],
@@ -109,6 +109,7 @@ cc_library(
         "@com_google_absl//absl/algorithm:container",
         "@com_google_absl//absl/container:flat_hash_set",
         "@com_google_absl//absl/container:inlined_vector",
+        "@com_google_absl//absl/log:check",
         "@com_google_absl//absl/status",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/types:span",
diff --git a/third_party/xla/xla/hlo/utils/hlo_sharding_util.cc b/third_party/xla/xla/hlo/utils/hlo_sharding_util.cc
index 912c353f0d60a4..c37efb945740e2 100644
--- a/third_party/xla/xla/hlo/utils/hlo_sharding_util.cc
+++ b/third_party/xla/xla/hlo/utils/hlo_sharding_util.cc
@@ -30,6 +30,7 @@ limitations under the License.
 #include "absl/algorithm/container.h"
 #include "absl/container/flat_hash_set.h"
 #include "absl/container/inlined_vector.h"
+#include "absl/log/check.h"
 #include "absl/status/status.h"
 #include "absl/strings/str_join.h"
 #include "absl/types/span.h"
@@ -184,36 +185,38 @@ bool IsShardingMoreSpecific(const HloSharding& lhs, const HloSharding& rhs) {
   }
 }
 
-bool MergeSharding(const HloSharding& old, HloSharding* to_merge,
+bool MergeSharding(const HloSharding& to_merge, HloSharding* dst,
                    bool may_combine_partial_sharding) {
-  if (old.IsTuple()) {
-    CHECK(to_merge->IsTuple());
+  if (to_merge.IsTuple()) {
+    CHECK(dst->IsTuple());
     bool changed = false;
-    for (int64_t i = 0; i < old.tuple_elements().size(); ++i) {
+    for (int64_t i = 0; i < to_merge.tuple_elements().size(); ++i) {
       changed |=
-          MergeSharding(old.tuple_elements()[i], &to_merge->tuple_elements()[i],
+          MergeSharding(to_merge.tuple_elements()[i], &dst->tuple_elements()[i],
                         may_combine_partial_sharding);
     }
     return changed;
   }
-  if (!may_combine_partial_sharding || !old.HasPartialReplication() ||
-      !to_merge->HasPartialReplication() ||
-      old.tile_assignment().num_elements() !=
-          to_merge->tile_assignment().num_elements()) {
-    return IsShardingMoreSpecific(*to_merge, old);
+  if (!may_combine_partial_sharding || !to_merge.HasPartialReplication() ||
+      !dst->HasPartialReplication() ||
+      to_merge.tile_assignment().num_elements() !=
+          dst->tile_assignment().num_elements()) {
+    return IsShardingMoreSpecific(*dst, to_merge);
   }
 
   if (MergeShardingIfCompatible(
-          old,
-          /*minimum_tiles=*/std::max(old.NumTiles(), to_merge->NumTiles()) + 1,
-          to_merge)) {
+          to_merge,
+          /*minimum_tiles=*/std::max(to_merge.NumTiles(), dst->NumTiles()) + 1,
+          dst)) {
     return true;
   }
-  return IsShardingMoreSpecific(*to_merge, old);
+  return IsShardingMoreSpecific(*dst, to_merge);
 }
 
 bool MergeShardingIfCompatible(const HloSharding& to_merge,
                                int64_t minimum_tiles, HloSharding* dst) {
+  CHECK(!to_merge.IsTuple() && !to_merge.IsManual() && !dst->IsTuple() &&
+        !dst->IsManual());
   if (to_merge.IsTileMaximal()) {
     return false;
   }
@@ -376,6 +379,26 @@ std::optional<int64_t> SelectDominantDevice(
   return count > 0 ? std::optional<int64_t>(device) : std::optional<int64_t>();
 }
 
+HloSharding FindCommonSharding(absl::Span<const HloSharding> shardings) {
+  CHECK(!shardings.empty());
+  bool all_compatible = true;
+  HloSharding common_sharding = shardings[0];
+  for (int i = 1; i != shardings.size(); ++i) {
+    if (!MergeShardingIfCompatible(shardings[i], common_sharding.NumTiles(),
+                                   &common_sharding)) {
+      all_compatible = false;
+      break;
+    }
+  }
+  if (all_compatible) {
+    return common_sharding;
+  }
+  // TODO(tongfei): instead of returning the first sharding when not all
+  // shardings are compatible, we should find the sharding that is compatible
+  // with the largest number of shardings.
+  return shardings[0];
+}
+
 void AssignComputationDevice(HloComputation* computation, int64_t device) {
   VLOG(4) << "Assigning device " << device << " to " << computation->name()
           << " computation";
@@ -1465,7 +1488,9 @@ std::optional<HloSharding> ScatterUpdateShardingFromOutputParallelDimensions(
     auto operand_aligned_update_parallel_dims = AlignSmallContainers(
         update_parallel_dims, index_aligned_operand_parallel_dims,
         operand_parallel_dims_sorted);
-    const Shape scatter_shape = scatter.shape();
+    const Shape scatter_shape = scatter.shape().IsTuple()
+                                    ? scatter.shape().tuple_shapes()[0]
+                                    : scatter.shape();
     CHECK_EQ(update_parallel_dims.size(),
              index_aligned_operand_parallel_dims.size());
     std::vector<int64_t> update_tile_assignment(
diff --git a/third_party/xla/xla/hlo/utils/hlo_sharding_util.h b/third_party/xla/xla/hlo/utils/hlo_sharding_util.h
index 23f56e66b7a27b..95dcad8347376f 100644
--- a/third_party/xla/xla/hlo/utils/hlo_sharding_util.h
+++ b/third_party/xla/xla/hlo/utils/hlo_sharding_util.h
@@ -59,7 +59,7 @@ bool IsShardingMoreSpecific(const HloSharding& lhs, const HloSharding& rhs);
 
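-// Tries to refine `to_merge` by combining with `old`. Returns if the final
-// `to_merge` is more specific than `old`.
-bool MergeSharding(const HloSharding& old, HloSharding* to_merge,
+// Tries to refine `dst` by combining it with `to_merge`. Returns true if the
+// final `dst` is more specific than `to_merge`.
+bool MergeSharding(const HloSharding& to_merge, HloSharding* dst,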
                    bool may_combine_partial_sharding);
 
 // Merges `to_merge` into `dst` only if they are compatible, and the merged
@@ -67,6 +67,11 @@ bool MergeSharding(const HloSharding& old, HloSharding* to_merge,
 bool MergeShardingIfCompatible(const HloSharding& to_merge,
                                int64_t minimum_tiles, HloSharding* dst);
 
+// Find a reasonable common sharding for a list of shardings. The reasonable
+// sharding should incur a small (ideally the least) total resharding cost when
+// resharding all the shardings to this common sharding.
+HloSharding FindCommonSharding(absl::Span<const HloSharding> shardings);
+
 // Given a map<device, occurrence_count>, selects the device with higher
 // occurrence count (if any). If top_count in not nullptr, it will receive the
 // count of the dominant device returned.
diff --git a/third_party/xla/xla/service/BUILD b/third_party/xla/xla/service/BUILD
index 201a95e2ce3a4b..25354ba2badc4f 100644
--- a/third_party/xla/xla/service/BUILD
+++ b/third_party/xla/xla/service/BUILD
@@ -662,6 +662,7 @@ cc_library(
         ":hlo_graph_dumper",
         ":hlo_pass",
         "//xla:protobuf_util",
+        "//xla:shape_tree",
         "//xla:shape_util",
         "//xla:sharding_op_util",
         "//xla:status_macros",
diff --git a/third_party/xla/xla/service/sharding_propagation.cc b/third_party/xla/xla/service/sharding_propagation.cc
index 5737ed0c576979..c675d55dba4634 100644
--- a/third_party/xla/xla/service/sharding_propagation.cc
+++ b/third_party/xla/xla/service/sharding_propagation.cc
@@ -43,6 +43,8 @@ limitations under the License.
 #include "xla/hlo/utils/hlo_sharding_util.h"
 #include "xla/protobuf_util.h"
 #include "xla/service/dot_as_convolution_util.h"
+#include "xla/shape.h"
+#include "xla/shape_tree.h"
 #include "xla/shape_util.h"
 #include "xla/sharding_op_util.h"
 #include "xla/status_macros.h"
@@ -96,75 +98,80 @@ bool IsShardingStrictlyBetter(const HloSharding& lhs, const HloSharding& rhs) {
   return false;
 }
 
-std::optional<HloSharding> ReturnImprovedSharding(
-    HloSharding sharding, HloInstruction* instruction,
-    bool may_combine_partial_sharding,
+// Implementation for returning an improved sharding from another sharding.
+std::optional<HloSharding> ReturnImprovedShardingImpl(
+    HloSharding from, std::optional<const HloSharding> to_improved,
+    const Shape& to_improved_shape, bool may_combine_partial_sharding,
     bool allow_aggressive_resharding = false) {
   // Always allow improve the sharding if it's straightly better.
-  if (instruction->has_sharding() &&
-      IsShardingStrictlyBetter(sharding, instruction->sharding())) {
-    return sharding;
-  }
-  // Allows improve from tile maximal shardings to manual shardings.
-  if (instruction->has_sharding()) {
-    bool no_worse = true;
-    bool changed = false;
-    const std::vector<HloSharding>& flattened_instruction_shardings =
-        instruction->sharding().tuple_elements();
-    const std::vector<HloSharding>& flatten_shardings =
-        sharding.tuple_elements();
-    CHECK_EQ(flattened_instruction_shardings.size(), flatten_shardings.size());
-    for (int i = 0; i != flattened_instruction_shardings.size(); ++i) {
-      if (flattened_instruction_shardings[i] != flatten_shardings[i]) {
-        changed = true;
-        if (!flattened_instruction_shardings[i].IsTileMaximal() ||
-            !flatten_shardings[i].IsManual()) {
-          no_worse = false;
-          break;
-        }
-      }
-    }
-    // Replace sharding if we are know that it strictly improves(i.e. the
-    // sharding is changed and no worse than before) from tile maximal
-    // (sub)shardings to manual shardings. Otherwise pass through.
-    if (no_worse && changed) {
-      return sharding;
-    }
+  if (to_improved.has_value() &&
+      IsShardingStrictlyBetter(from, to_improved.value())) {
+    return from;
   }
   // We don't want to propagate tile maximal shardings.
-  if (!IsSpatiallyPartitioned(sharding)) {
+  if (!IsSpatiallyPartitioned(from)) {
     return std::nullopt;
   }
   // Any sharding is better then no sharding.
-  if (!instruction->has_sharding()) {
-    return sharding;
+  if (!to_improved.has_value()) {
+    return from;
   }
   // We don't want to propagate manual shardings.
-  if (sharding.IsManual()) {
+  if (from.IsManual()) {
     return std::nullopt;
   }
-  int64_t sharding_tiles = sharding.NumTiles();
-  if (hlo_sharding_util::MergeSharding(instruction->sharding(), &sharding,
+  int64_t sharding_tiles = from.NumTiles();
+  if (hlo_sharding_util::MergeSharding(to_improved.value(), &from,
                                        may_combine_partial_sharding)) {
     // Override existing tiled sharding only when the new sharding is compatible
     // with the existing one. This avoids unexpected resharding when `sharding`
     // just has more tiles than existing sharding but they are not mergeable.
-    if (!allow_aggressive_resharding && instruction->shape().IsArray() &&
-        !instruction->sharding().IsTileMaximal() &&
-        sharding.NumTiles() == sharding_tiles) {
+    if (!allow_aggressive_resharding && to_improved_shape.IsArray() &&
+        !to_improved.value().IsTileMaximal() &&
+        from.NumTiles() == sharding_tiles) {
       if (!hlo_sharding_util::IsSubTilingOrEqualSharding(
-              instruction->shape(), sharding, instruction->sharding())) {
+              to_improved_shape, from, to_improved.value())) {
         VLOG(10) << "Not merging because of different device distribution";
-        VLOG(10) << "Instr sharding: " << instruction->sharding().ToString();
-        VLOG(10) << "New sharding " << sharding.ToString();
+        VLOG(10) << "Instr sharding: " << to_improved.value().ToString();
+        VLOG(10) << "New sharding " << from.ToString();
         return std::nullopt;
       }
     }
-    return sharding;
+    return from;
   }
   return std::nullopt;
 }
 
+// Returns the improved sharding of an instruction from some other sharding.
+std::optional<HloSharding> ReturnImprovedSharding(
+    HloSharding sharding, HloInstruction* instruction,
+    bool may_combine_partial_sharding,
+    bool allow_aggressive_resharding = false) {
+  return ReturnImprovedShardingImpl(
+      sharding,
+      instruction->has_sharding()
+          ? std::optional<HloSharding>(instruction->sharding())
+          : std::nullopt,
+      instruction->shape(), may_combine_partial_sharding,
+      allow_aggressive_resharding);
+}
+
+// Same as above, but returns the improved subsharding of a tuple-shaped
+// instruction at the given shape `index`.
+std::optional<HloSharding> ReturnImprovedSubSharding(
+    HloSharding sharding, HloInstruction* instruction, const ShapeIndex& index,
+    bool may_combine_partial_sharding,
+    bool allow_aggressive_resharding = false) {
+  return ReturnImprovedShardingImpl(
+      sharding,
+      instruction->has_sharding()
+          ? std::optional<HloSharding>(instruction->sharding().GetSubSharding(
+                instruction->shape(), index))
+          : std::nullopt,
+      ShapeUtil::GetSubshape(instruction->shape(), index),
+      may_combine_partial_sharding, allow_aggressive_resharding);
+}
+
 // Updates the sharding of the specified instruction with the specified sharding
 // if it is better than the current one and returns true if a new sharding have
 // been applied. If may_combine_partial_sharding is true, this may combine the
@@ -183,6 +190,36 @@ bool MaybeImproveInstructionSharding(HloSharding sharding,
   return false;
 }
 
+// Same as above, but improves the subsharding at `index` of a possibly
+// tuple-shaped instruction; a non-tuple instruction requires index == {0}.
+bool MaybeImproveInstructionSubSharding(
+    HloSharding sharding, HloInstruction* instruction, const ShapeIndex& index,
+    bool may_combine_partial_sharding,
+    bool allow_aggressive_resharding = false) {
+  if (instruction->shape().IsTuple()) {
+    if (auto new_sub_sharding = ReturnImprovedSubSharding(
+            sharding, instruction, index, may_combine_partial_sharding,
+            allow_aggressive_resharding)) {
+      HloSharding new_sharding =
+          instruction->has_sharding()
+              ? instruction->sharding()
+              : HloSharding::Single(instruction->shape(),
+                                    HloSharding::Replicate());
+      ShapeTree<HloSharding> sharding_shape_tree =
+          new_sharding.GetAsShapeTree(instruction->shape());
+      *sharding_shape_tree.mutable_element(index) = new_sub_sharding.value();
+      instruction->set_sharding(HloSharding::Tuple(sharding_shape_tree));
+      return true;  // The element at `index` was replaced.
+    } else {
+      return false;
+    }
+  }
+  CHECK(index.size() == 1 && index[0] == 0);  // Non-tuple: only {0} is valid.
+  return MaybeImproveInstructionSharding(sharding, instruction,
+                                         may_combine_partial_sharding,
+                                         allow_aggressive_resharding);
+}
+
 // We consider a convolution kernel to be small iff it is smaller along all
 // spatial dimensions then the output of the convolution. The rational is that
 // we can either shard the kernel or the output and we want to shard the larger
@@ -706,6 +743,10 @@ bool InferScatterParallelShardingFromOperands(
     bool may_combine_partial_sharding) {
   HloScatterInstruction* scatter = DynCast<HloScatterInstruction>(instruction);
   CHECK(scatter);
+  const int64_t operand_count = scatter->scatter_operand_count();
+  auto scatter_operands = scatter->scatter_operands();
+  auto scatter_indices = scatter->scatter_indices();
+  auto scatter_updates = scatter->scatter_updates();
   bool changed = false;
   auto aligned_operand_parallel_dims =
       hlo_sharding_util::IndexAlignedOperandParallelDims(parallel_dims);
@@ -713,31 +754,41 @@ bool InferScatterParallelShardingFromOperands(
       *instruction, parallel_dims);
   auto output_parallel_dims = aligned_operand_parallel_dims;
   // Infer output sharding from scatter operand sharding.
-  if (IsSpatiallyPartitioned(scatter->scatter_operands()[0])) {
-    changed |= MaybeImproveInstructionSharding(
-        InferParallelShardingFromOperand(
-            scatter->scatter_operands()[0], instruction->shape(),
-            absl::MakeConstSpan(aligned_operand_parallel_dims),
-            absl::MakeConstSpan(output_parallel_dims)),
-        instruction, may_combine_partial_sharding);
+  Shape shape = operand_count == 1
+                    ? instruction->shape()
+                    : ShapeUtil::GetSubshape(instruction->shape(), {0});
+  for (int64_t i = 0; i != operand_count; ++i) {
+    if (IsSpatiallyPartitioned(scatter_operands[i])) {
+      changed |= MaybeImproveInstructionSubSharding(
+          InferParallelShardingFromOperand(
+              scatter_operands[i], shape,
+              absl::MakeConstSpan(aligned_operand_parallel_dims),
+              absl::MakeConstSpan(output_parallel_dims)),
+          instruction, {i}, may_combine_partial_sharding);
+    }
   }
   // Infer output sharding from scatter indices sharding.
-  if (IsSpatiallyPartitioned(scatter->scatter_indices())) {
-    changed |= MaybeImproveInstructionSharding(
-        InferParallelShardingFromOperand(
-            scatter->scatter_indices(), instruction->shape(),
-            absl::MakeConstSpan(parallel_dims.indices_parallel_dims),
-            absl::MakeConstSpan(output_parallel_dims)),
-        instruction, may_combine_partial_sharding);
+  if (IsSpatiallyPartitioned(scatter_indices)) {
+    auto parallel_sharding_from_indices = InferParallelShardingFromOperand(
+        scatter_indices, shape,
+        absl::MakeConstSpan(parallel_dims.indices_parallel_dims),
+        absl::MakeConstSpan(output_parallel_dims));
+    for (int64_t i = 0; i != operand_count; ++i) {
+      changed |= MaybeImproveInstructionSubSharding(
+          parallel_sharding_from_indices, instruction, {i},
+          may_combine_partial_sharding);
+    }
   }
   // Infer output sharding from scatter update sharding.
-  if (IsSpatiallyPartitioned(scatter->scatter_updates()[0])) {
-    changed |= MaybeImproveInstructionSharding(
-        InferParallelShardingFromOperand(
-            scatter->scatter_updates()[0], instruction->shape(),
-            absl::MakeConstSpan(update_parallel_dims),
-            absl::MakeConstSpan(output_parallel_dims)),
-        instruction, may_combine_partial_sharding);
+  for (int64_t i = 0; i != operand_count; ++i) {
+    if (IsSpatiallyPartitioned(scatter_updates[i])) {
+      changed |= MaybeImproveInstructionSubSharding(
+          InferParallelShardingFromOperand(
+              scatter_updates[i], shape,
+              absl::MakeConstSpan(update_parallel_dims),
+              absl::MakeConstSpan(output_parallel_dims)),
+          instruction, {i}, may_combine_partial_sharding);
+    }
   }
   return changed;
 }
@@ -1922,29 +1973,56 @@ std::optional<HloSharding> ShardingPropagation::GetShardingFromUser(
     }
     case HloOpcode::kScatter: {
       auto& scatter_user = *Cast<HloScatterInstruction>(&user);
-      if (&instruction == scatter_user.operand(0)) {
-        return user.sharding();
-      }
-      if (&instruction == scatter_user.scatter_indices()) {
-        auto update = scatter_user.scatter_updates()[0];
-        if (!IsSpatiallyPartitioned(update)) {
+      const int64_t operand_count = scatter_user.scatter_operand_count();
+      auto scatter_operands = scatter_user.scatter_operands();
+      auto scatter_indices = scatter_user.scatter_indices();
+      auto scatter_updates = scatter_user.scatter_updates();
+      // Infer sharding for scatter operand.
+      const int64_t operand_index =
+          absl::c_find(scatter_operands, &instruction) -
+          scatter_operands.cbegin();
+      if (operand_index < operand_count) {
+        return user.sharding().IsTuple() ? user.sharding().GetSubSharding(
+                                               user.shape(), {operand_index})
+                                         : user.sharding();
+      }
+      // Infer sharding for scatter indices.
+      if (&instruction == scatter_indices) {
+        std::vector<const HloInstruction*> partitioned_updates;
+        for (const HloInstruction* update : scatter_updates) {
+          if (IsSpatiallyPartitioned(update)) {
+            partitioned_updates.push_back(update);
+          }
+        }
+        if (partitioned_updates.empty()) {
           return std::nullopt;
         }
-        return hlo_sharding_util::
-            ScatterIndexShardingFromUpdateIndexPassthroughDimensions(
-                update->sharding(), &scatter_user);
-      }
-      CHECK_EQ(&instruction, scatter_user.scatter_updates()[0]);
-      auto indices = scatter_user.scatter_indices();
+        std::vector<HloSharding> shardings;
+        absl::c_transform(
+            partitioned_updates, std::back_inserter(shardings),
+            [&scatter_user](const HloInstruction* update) {
+              return hlo_sharding_util::
+                  ScatterIndexShardingFromUpdateIndexPassthroughDimensions(
+                      update->sharding(), &scatter_user);
+            });
+        return hlo_sharding_util::FindCommonSharding(shardings);
+      }
+      // Infer sharding for scatter update; `instruction` must be one of them.
+      const int64_t update_index = absl::c_find(scatter_updates, &instruction) -
+                                   scatter_updates.cbegin();
+      CHECK_LT(update_index, operand_count);  // Equal means not found.
       auto from_indices =
-          IsSpatiallyPartitioned(indices)
+          IsSpatiallyPartitioned(scatter_indices)
               ? hlo_sharding_util::
                     ScatterUpdateShardingFromIndexIndexPassthroughDimensions(
-                        indices->sharding(), &scatter_user)
+                        scatter_indices->sharding(), &scatter_user)
               : HloSharding::Replicate();
       if (is_spmd) {
         auto from_output = hlo_sharding_util::ScatterUpdateShardingFromOutput(
-            user.sharding(), scatter_user, call_graph);
+            user.sharding().IsTuple()
+                ? user.sharding().GetSubSharding(user.shape(), {update_index})
+                : user.sharding(),
+            scatter_user, call_graph);
         if (from_output.has_value()) {
           // Use sharding from output as primary sharding since it prioritize
           // parallel sharding first as this is how it is in spmd_partitioner.
@@ -2524,18 +2602,26 @@ bool ShardingPropagation::InferShardingFromOperands(
       return changed;
     }
     case HloOpcode::kScatter: {
+      auto& scatter = *Cast<HloScatterInstruction>(instruction);
+      const int64_t operand_count = scatter.scatter_operand_count();
+      auto scatter_operands = scatter.scatter_operands();
+      auto scatter_indices = scatter.scatter_indices();
+      auto scatter_updates = scatter.scatter_updates();
       bool changed = false;
-      if (is_spmd_ && IsSpatiallyPartitioned(instruction->operand(0))) {
-        changed |= MaybeImproveInstructionSharding(
-            instruction->operand(0)->sharding(), instruction,
-            may_combine_partial_sharding);
-      }
-      auto* scatter = Cast<HloScatterInstruction>(instruction);
-      if (!IsSpatiallyPartitioned(scatter->scatter_indices()) &&
-          !IsSpatiallyPartitioned(scatter->scatter_updates()[0])) {
-        return changed;
-      }
       if (is_spmd_) {
+        for (int64_t i = 0; i != operand_count; ++i) {
+          if (IsSpatiallyPartitioned(scatter_operands[i])) {
+            changed |= MaybeImproveInstructionSubSharding(
+                scatter_operands[i]->sharding(), instruction, {i},
+                may_combine_partial_sharding);
+          }
+        }
+        if (!IsSpatiallyPartitioned(scatter_indices) &&
+            absl::c_none_of(scatter_updates, [](const HloInstruction* update) {
+              return IsSpatiallyPartitioned(update);
+            })) {
+          return changed;
+        }
         auto scatter_parallel_dims =
             hlo_sharding_util::GetScatterParallelBatchDims(*instruction,
                                                            call_graph);
@@ -2544,22 +2630,25 @@ bool ShardingPropagation::InferShardingFromOperands(
               instruction, *scatter_parallel_dims,
               may_combine_partial_sharding);
         }
-      }
-      if (is_spmd_ && IsSpatiallyPartitioned(scatter->scatter_updates()[0])) {
-        auto maybe_from_update =
-            hlo_sharding_util::ScatterOutputShardingFromUpdate(
-                scatter->scatter_updates()[0]->sharding(), *scatter);
-        if (maybe_from_update) {
-          changed |= MaybeImproveInstructionSharding(
-              std::move(*maybe_from_update), instruction,
+        for (int64_t i = 0; i != operand_count; ++i) {
+          if (IsSpatiallyPartitioned(scatter_updates[i])) {
+            auto maybe_from_update =
+                hlo_sharding_util::ScatterOutputShardingFromUpdate(
+                    scatter_updates[i]->sharding(), scatter);
+            if (maybe_from_update) {
+              changed |= MaybeImproveInstructionSubSharding(
+                  std::move(*maybe_from_update), instruction, {i},
+                  may_combine_partial_sharding);
+            }
+          }
+        }
+      } else {
+        for (int64_t i = 0; i != operand_count; ++i) {
+          changed |= MaybeImproveInstructionSubSharding(
+              HloSharding::Replicate(), instruction, {i},
               may_combine_partial_sharding);
         }
       }
-      if (!is_spmd_) {
-        changed |= MaybeImproveInstructionSharding(
-            HloSharding::Replicate(), instruction,
-            may_combine_partial_sharding);
-      }
       return changed;
     }
     case HloOpcode::kWhile: {
@@ -3178,15 +3267,17 @@ StatusOr<bool> ShardingPropagation::Run(
     // group are compatible with each other, then we will merge all of them to
     // get the most specific sharding. If some of them are not compatible, then
     // it will just choose the a random sharding among them(say the first one).
-    HloSharding base_sharding = (*shard_as_group.begin())->sharding();
-    for (HloInstruction* member : shard_as_group) {
-      hlo_sharding_util::MergeShardingIfCompatible(
-          member->sharding(), base_sharding.NumTiles() + 1, &base_sharding);
-    }
+    std::vector<HloSharding> shardings;
+    absl::c_transform(shard_as_group, std::back_inserter(shardings),
+                      [](const HloInstruction* instruction) {
+                        return instruction->sharding();
+                      });
+    HloSharding common_sharding =
+        hlo_sharding_util::FindCommonSharding(shardings);
     VLOG(2) << "Aligning shard group: " << shard_as_group_id
-            << " to sharding:" << base_sharding.ToString();
+            << " to sharding:" << common_sharding.ToString();
     for (HloInstruction* member : shard_as_group) {
-      member->set_sharding(base_sharding);
+      member->set_sharding(common_sharding);
     }
   }
 
diff --git a/third_party/xla/xla/service/sharding_propagation_test.cc b/third_party/xla/xla/service/sharding_propagation_test.cc
index 7bea7ac1e48441..226ef6e4152383 100644
--- a/third_party/xla/xla/service/sharding_propagation_test.cc
+++ b/third_party/xla/xla/service/sharding_propagation_test.cc
@@ -4446,6 +4446,65 @@ ENTRY entry {
   }
 }
 
+TEST_P(ParameterizedMetadataTest, DataOperandToScatter_Variadic) {
+  const char* const hlo_string = R"(
+HloModule module
+
+add (lhs.0: f32[], lhs.1: f32[], rhs.0: f32[], rhs.1: f32[]) -> (f32[], f32[]) {
+  lhs.0 = f32[] parameter(0)
+  lhs.1 = f32[] parameter(1)
+  rhs.0 = f32[] parameter(2)
+  rhs.1 = f32[] parameter(3)
+  sum.0 = f32[] add(lhs.0, rhs.0)
+  sum.1 = f32[] add(lhs.1, rhs.1)
+  ROOT tuple = tuple(sum.0, sum.1)
+}
+
+ENTRY entry {
+  %input.0 = f32[2,9] parameter(0),
+    sharding={devices=[1,4]0,1,2,3 metadata={op_name="a"}}
+  %input.1 = f32[2,9] parameter(1),
+    sharding={devices=[1,2,2]0,1,2,3 last_tile_dim_replicate metadata={op_name="b"}}
+  %indices = s32[3] parameter(2),
+    sharding={replicated metadata={op_name="c"}}
+  %updates.0 = f32[3,9] parameter(3),
+    sharding={replicated metadata={op_name="d"}}
+  %updates.1 = f32[3,9] parameter(4),
+    sharding={replicated metadata={op_name="e"}}
+  %scatter = (f32[2,9],f32[2,9]) scatter(%input.0, %input.1, %indices, %updates.0, %updates.1),
+      to_apply=add,
+      update_window_dims={1},
+      inserted_window_dims={0},
+      scatter_dims_to_operand_dims={0},
+      index_vector_dim=1
+  ROOT %copy = (f32[2,9],f32[2,9]) copy(%scatter)
+})";
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          ParseAndReturnVerifiedModule(hlo_string));
+  if (GetParam().clear_metadata) {
+    ClearMetadata(module.get());
+  }
+  TF_ASSERT_OK_AND_ASSIGN(
+      bool changed,
+      ShardingPropagation(/*is_spmd=*/true, GetParam().propagate_metadata)
+          .Run(module.get()));
+  XLA_VLOG_LINES(1, module->ToString());
+  EXPECT_TRUE(changed);
+  auto* instruction = FindInstruction(module.get(), "scatter");
+  ASSERT_NE(instruction, nullptr);
+  EXPECT_THAT(instruction,
+              op::Sharding("{{devices=[1,4]0,1,2,3}, {devices=[1,2,2]0,1,2,3 "
+                           "last_tile_dim_replicate}}"));
+  if (GetParam().propagate_metadata && !GetParam().clear_metadata) {
+    EXPECT_THAT(instruction->sharding().tuple_elements()[0],
+                ShardingMetadata({CreateMetadata("a")}));
+    EXPECT_THAT(instruction->sharding().tuple_elements()[1],
+                ShardingMetadata({CreateMetadata("b")}));
+  } else {
+    EXPECT_THAT(instruction->sharding(), ShardingMetadata({}));
+  }
+}
+
 TEST_P(ParameterizedMetadataTest, UpdateOperandToScatter) {
   const char* const hlo_string = R"(
 HloModule module
@@ -4541,6 +4600,65 @@ ENTRY entry {
   }
 }
 
+TEST_P(ParameterizedMetadataTest, UpdateOperandToScatter_Variadic) {
+  const char* const hlo_string = R"(
+HloModule module
+
+add (lhs.0: f32[], lhs.1: f32[], rhs.0: f32[], rhs.1: f32[]) -> (f32[], f32[]) {
+  lhs.0 = f32[] parameter(0)
+  lhs.1 = f32[] parameter(1)
+  rhs.0 = f32[] parameter(2)
+  rhs.1 = f32[] parameter(3)
+  sum.0 = f32[] add(lhs.0, rhs.0)
+  sum.1 = f32[] add(lhs.1, rhs.1)
+  ROOT tuple = tuple(sum.0, sum.1)
+}
+
+ENTRY entry {
+  %input.0 = f32[2,9] parameter(0),
+    sharding={replicated metadata={op_name="a"}}
+  %input.1 = f32[2,9] parameter(1),
+    sharding={replicated metadata={op_name="b"}}
+  %indices = s32[3] parameter(2),
+    sharding={replicated metadata={op_name="c"}}
+  %updates.0 = f32[3,9] parameter(3),
+    sharding={devices=[1,4]0,1,2,3 metadata={op_name="d"}}
+  %updates.1 = f32[3,9] parameter(4),
+    sharding={devices=[1,2,2]0,1,2,3 last_tile_dim_replicate metadata={op_name="e"}}
+  %scatter = (f32[2,9],f32[2,9]) scatter(%input.0, %input.1, %indices, %updates.0, %updates.1),
+      to_apply=add,
+      update_window_dims={1},
+      inserted_window_dims={0},
+      scatter_dims_to_operand_dims={0},
+      index_vector_dim=1
+  ROOT %copy = (f32[2,9],f32[2,9]) copy(%scatter)
+})";
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          ParseAndReturnVerifiedModule(hlo_string));
+  if (GetParam().clear_metadata) {
+    ClearMetadata(module.get());
+  }
+  TF_ASSERT_OK_AND_ASSIGN(
+      bool changed,
+      ShardingPropagation(/*is_spmd=*/true, GetParam().propagate_metadata)
+          .Run(module.get()));
+  XLA_VLOG_LINES(1, module->ToString());
+  EXPECT_TRUE(changed);
+  auto* instruction = FindInstruction(module.get(), "scatter");
+  ASSERT_NE(instruction, nullptr);
+  EXPECT_THAT(instruction,
+              op::Sharding("{{devices=[1,4] 0,1,2,3}, {devices=[1,2,2]0,1,2,3 "
+                           "last_tile_dim_replicate}}"));
+  if (GetParam().propagate_metadata && !GetParam().clear_metadata) {
+    EXPECT_THAT(instruction->sharding().tuple_elements()[0],
+                ShardingMetadata({CreateMetadata("d")}));
+    EXPECT_THAT(instruction->sharding().tuple_elements()[1],
+                ShardingMetadata({CreateMetadata("e")}));
+  } else {
+    EXPECT_THAT(instruction->sharding(), ShardingMetadata({}));
+  }
+}
+
 TEST_P(ParameterizedMetadataTest, ScatterToDataOperand_PartialReplicate) {
   const char* const hlo_string = R"(
 HloModule module
@@ -4636,6 +4754,72 @@ ENTRY entry {
   }
 }
 
+TEST_P(ParameterizedMetadataTest, ScatterToDataOperand_Variadic) {
+  const char* const hlo_string = R"(
+HloModule module
+
+add (lhs.0: f32[], lhs.1: f32[], rhs.0: f32[], rhs.1: f32[]) -> (f32[], f32[]) {
+  lhs.0 = f32[] parameter(0)
+  lhs.1 = f32[] parameter(1)
+  rhs.0 = f32[] parameter(2)
+  rhs.1 = f32[] parameter(3)
+  sum.0 = f32[] add(lhs.0, rhs.0)
+  sum.1 = f32[] add(lhs.1, rhs.1)
+  ROOT tuple = tuple(sum.0, sum.1)
+}
+
+ENTRY entry {
+  %p0 = f32[2,9] parameter(0)
+  %input.0 = f32[2,9] copy(%p0)
+  %p1 = f32[2,9] parameter(1)
+  %input.1 = f32[2,9] copy(%p1)
+  %indices = s32[3] parameter(2),
+    sharding={replicated metadata={op_name="a"}}
+  %updates.0 = f32[3,9] parameter(3),
+    sharding={replicated metadata={op_name="b"}}
+  %updates.1 = f32[3,9] parameter(4),
+    sharding={replicated metadata={op_name="c"}}
+  ROOT %scatter = (f32[2,9],f32[2,9]) scatter(%input.0, %input.1, %indices, %updates.0, %updates.1),
+      to_apply=add,
+      update_window_dims={1},
+      inserted_window_dims={0},
+      scatter_dims_to_operand_dims={0},
+      index_vector_dim=1,
+      sharding={{devices=[1,4]0,1,2,3 metadata={op_name="d"}}, {devices=[1,2,2]0,1,2,3 last_tile_dim_replicate metadata={op_name="e"}}}
+})";
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          ParseAndReturnVerifiedModule(hlo_string));
+  if (GetParam().clear_metadata) {
+    ClearMetadata(module.get());
+  }
+  TF_ASSERT_OK_AND_ASSIGN(
+      bool changed,
+      ShardingPropagation(/*is_spmd=*/false, GetParam().propagate_metadata)
+          .Run(module.get()));
+  XLA_VLOG_LINES(1, module->ToString());
+  EXPECT_TRUE(changed);
+  auto* instruction = FindInstruction(module.get(), "input.0");
+  ASSERT_NE(instruction, nullptr);
+  EXPECT_THAT(instruction, op::Sharding("{devices=[1,4]0,1,2,3}"));
+  if (GetParam().propagate_metadata && !GetParam().clear_metadata) {
+    EXPECT_THAT(instruction->sharding(),
+                ShardingMetadata({CreateMetadata("d")}));
+  } else {
+    EXPECT_THAT(instruction->sharding(), ShardingMetadata({}));
+  }
+
+  instruction = FindInstruction(module.get(), "input.1");
+  ASSERT_NE(instruction, nullptr);
+  EXPECT_THAT(instruction,
+              op::Sharding("{devices=[1,2,2]0,1,2,3 last_tile_dim_replicate}"));
+  if (GetParam().propagate_metadata && !GetParam().clear_metadata) {
+    EXPECT_THAT(instruction->sharding(),
+                ShardingMetadata({CreateMetadata("e")}));
+  } else {
+    EXPECT_THAT(instruction->sharding(), ShardingMetadata({}));
+  }
+}
+
 TEST_P(ParameterizedMetadataTest, ScatterToUpdateOperand_PartialReplicate) {
   const char* const hlo_string = R"(
 HloModule module
@@ -4729,6 +4913,70 @@ ENTRY entry {
   }
 }
 
+TEST_P(ParameterizedMetadataTest, ScatterToUpdateOperand_Variadic) {
+  const char* const hlo_string = R"(
+HloModule module
+
+add (lhs.0: f32[], lhs.1: f32[], rhs.0: f32[], rhs.1: f32[]) -> (f32[], f32[]) {
+  lhs.0 = f32[] parameter(0)
+  lhs.1 = f32[] parameter(1)
+  rhs.0 = f32[] parameter(2)
+  rhs.1 = f32[] parameter(3)
+  sum.0 = f32[] add(lhs.0, rhs.0)
+  sum.1 = f32[] add(lhs.1, rhs.1)
+  ROOT tuple = tuple(sum.0, sum.1)
+}
+
+ENTRY entry {
+  %input.0 = f32[2,9] parameter(0)
+  %input.1 = f32[2,9] parameter(1)
+  %indices = s32[3] parameter(2),
+    sharding={replicated metadata={op_name="a"}}
+  %p3 = f32[3,9] parameter(3)
+  %updates.0 = f32[3,9] copy(%p3)
+  %p4 = f32[3,9] parameter(4)
+  %updates.1 = f32[3,9] copy(%p4)
+  ROOT %scatter = (f32[2,9],f32[2,9]) scatter(%input.0, %input.1, %indices, %updates.0, %updates.1),
+      to_apply=add,
+      update_window_dims={1},
+      inserted_window_dims={0},
+      scatter_dims_to_operand_dims={0},
+      index_vector_dim=1,
+      sharding={{devices=[1,4]0,1,2,3 metadata={op_name="b"}}, {devices=[1,2,2]0,1,2,3 last_tile_dim_replicate metadata={op_name="c"}}}
+})";
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          ParseAndReturnVerifiedModule(hlo_string));
+  if (GetParam().clear_metadata) {
+    ClearMetadata(module.get());
+  }
+  TF_ASSERT_OK_AND_ASSIGN(
+      bool changed,
+      ShardingPropagation(/*is_spmd=*/true, GetParam().propagate_metadata)
+          .Run(module.get()));
+  XLA_VLOG_LINES(1, module->ToString());
+  EXPECT_TRUE(changed);
+  auto* instruction = FindInstruction(module.get(), "updates.0");
+  ASSERT_NE(instruction, nullptr);
+  EXPECT_THAT(instruction, op::Sharding("{devices=[1,4]0,1,2,3}"));
+  if (GetParam().propagate_metadata && !GetParam().clear_metadata) {
+    EXPECT_THAT(instruction->sharding(),
+                ShardingMetadata({CreateMetadata("b")}));
+  } else {
+    EXPECT_THAT(instruction->sharding(), ShardingMetadata({}));
+  }
+
+  instruction = FindInstruction(module.get(), "updates.1");
+  ASSERT_NE(instruction, nullptr);
+  EXPECT_THAT(instruction,
+              op::Sharding("{devices=[1,2,2]0,1,2,3 last_tile_dim_replicate}"));
+  if (GetParam().propagate_metadata && !GetParam().clear_metadata) {
+    EXPECT_THAT(instruction->sharding(),
+                ShardingMetadata({CreateMetadata("c")}));
+  } else {
+    EXPECT_THAT(instruction->sharding(), ShardingMetadata({}));
+  }
+}
+
 TEST_P(ParameterizedMetadataTest, ScatterUpdateToIndex) {
   const char* const hlo_string = R"(
 HloModule module
@@ -4924,6 +5172,62 @@ ENTRY entry {
   }
 }
 
+TEST_P(ParameterizedMetadataTest, ScatterUpdateToIndex_Variadic) {
+  const char* const hlo_string = R"(
+HloModule module
+
+add (lhs.0: f32[], lhs.1: f32[], rhs.0: f32[], rhs.1: f32[]) -> (f32[], f32[]) {
+  lhs.0 = f32[] parameter(0)
+  lhs.1 = f32[] parameter(1)
+  rhs.0 = f32[] parameter(2)
+  rhs.1 = f32[] parameter(3)
+  sum.0 = f32[] add(lhs.0, rhs.0)
+  sum.1 = f32[] add(lhs.1, rhs.1)
+  ROOT tuple = tuple(sum.0, sum.1)
+}
+
+ENTRY entry {
+  %input.0 = f32[2,9] parameter(0),
+    sharding={replicated metadata={op_name="a"}}
+  %input.1 = f32[2,9] parameter(1),
+    sharding={replicated metadata={op_name="b"}}
+  %p2 = s32[3,3] parameter(2),
+    sharding={replicated metadata={op_name="c"}}
+  %indices = s32[3,3] copy(%p2)
+  %updates.0 = f32[3,3,9] parameter(3),
+    sharding={devices=[2,1,1,2]0,1,2,3 last_tile_dim_replicate metadata={op_name="d"}}
+  %updates.1 = f32[3,3,9] parameter(4),
+    sharding={devices=[1,2,1,2]0,2,1,3 last_tile_dim_replicate metadata={op_name="e"}}
+  ROOT %scatter = (f32[2,9],f32[2,9]) scatter(%input.0, %input.1, %indices, %updates.0, %updates.1),
+      to_apply=add,
+      update_window_dims={2},
+      inserted_window_dims={0},
+      scatter_dims_to_operand_dims={0},
+      index_vector_dim=2,
+      sharding={{replicated metadata={op_name="d"}}, {replicated metadata={op_name="e"}}}
+})";
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          ParseAndReturnVerifiedModule(hlo_string));
+  if (GetParam().clear_metadata) {
+    ClearMetadata(module.get());
+  }
+  TF_ASSERT_OK_AND_ASSIGN(
+      bool changed,
+      ShardingPropagation(/*is_spmd=*/false, GetParam().propagate_metadata)
+          .Run(module.get()));
+  XLA_VLOG_LINES(1, module->ToString());
+  EXPECT_TRUE(changed);
+  auto* instruction = FindInstruction(module.get(), "indices");
+  ASSERT_NE(instruction, nullptr);
+  EXPECT_THAT(instruction, op::Sharding("{devices=[2,2]0,1,2,3}"));
+  if (GetParam().propagate_metadata && !GetParam().clear_metadata) {
+    EXPECT_THAT(instruction->sharding(),
+                ShardingMetadata({CreateMetadata("d"), CreateMetadata("e")}));
+  } else {
+    EXPECT_THAT(instruction->sharding(), ShardingMetadata({}));
+  }
+}
+
 TEST_P(ParameterizedMetadataTest, ScatterIndexToUpdate) {
   const char* const hlo_string = R"(
 HloModule module
@@ -5072,6 +5376,73 @@ ENTRY entry {
   }
 }
 
+TEST_P(ParameterizedMetadataTest, ScatterIndexToUpdate_Variadic) {
+  const char* const hlo_string = R"(
+HloModule module
+
+add (lhs.0: f32[], lhs.1: f32[], rhs.0: f32[], rhs.1: f32[]) -> (f32[], f32[]) {
+  lhs.0 = f32[] parameter(0)
+  lhs.1 = f32[] parameter(1)
+  rhs.0 = f32[] parameter(2)
+  rhs.1 = f32[] parameter(3)
+  sum.0 = f32[] add(lhs.0, rhs.0)
+  sum.1 = f32[] add(lhs.1, rhs.1)
+  ROOT tuple = tuple(sum.0, sum.1)
+}
+
+ENTRY entry {
+  %input.0 = f32[2,9] parameter(0),
+    sharding={replicated metadata={op_name="a"}}
+  %input.1 = f32[2,9] parameter(1),
+    sharding={replicated metadata={op_name="b"}}
+  %indices = s32[3,3] parameter(2),
+    sharding={devices=[2,2]0,1,2,3 metadata={op_name="c"}}
+  %p3 = f32[3,3,9] parameter(3),
+    sharding={replicated metadata={op_name="d"}}
+  %updates.0 = f32[3,3,9] copy(%p3)
+  %p4 = f32[3,3,9] parameter(4),
+    sharding={replicated metadata={op_name="e"}}
+  %updates.1 = f32[3,3,9] copy(%p4)
+  ROOT %scatter = (f32[2,9],f32[2,9])scatter(%input.0, %input.1, %indices, %updates.0, %updates.1),
+      to_apply=add,
+      update_window_dims={2},
+      inserted_window_dims={0},
+      scatter_dims_to_operand_dims={0},
+      index_vector_dim=2,
+      sharding={replicated metadata={op_name="d"}}
+})";
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          ParseAndReturnVerifiedModule(hlo_string));
+  if (GetParam().clear_metadata) {
+    ClearMetadata(module.get());
+  }
+  TF_ASSERT_OK_AND_ASSIGN(
+      bool changed,
+      ShardingPropagation(/*is_spmd=*/false, GetParam().propagate_metadata)
+          .Run(module.get()));
+  XLA_VLOG_LINES(1, module->ToString());
+  EXPECT_TRUE(changed);
+  auto* instruction = FindInstruction(module.get(), "updates.0");
+  ASSERT_NE(instruction, nullptr);
+  EXPECT_THAT(instruction, op::Sharding("{devices=[2,2,1]0,1,2,3}"));
+  if (GetParam().propagate_metadata && !GetParam().clear_metadata) {
+    EXPECT_THAT(instruction->sharding(),
+                ShardingMetadata({CreateMetadata("c")}));
+  } else {
+    EXPECT_THAT(instruction->sharding(), ShardingMetadata({}));
+  }
+
+  instruction = FindInstruction(module.get(), "updates.1");
+  ASSERT_NE(instruction, nullptr);
+  EXPECT_THAT(instruction, op::Sharding("{devices=[2,2,1]0,1,2,3}"));
+  if (GetParam().propagate_metadata && !GetParam().clear_metadata) {
+    EXPECT_THAT(instruction->sharding(),
+                ShardingMetadata({CreateMetadata("c")}));
+  } else {
+    EXPECT_THAT(instruction->sharding(), ShardingMetadata({}));
+  }
+}
+
 TEST_P(ParameterizedMetadataTest, PartialShardingOnElementwise) {
   const char* const hlo_string = R"(
 HloModule module
@@ -6178,6 +6549,383 @@ ENTRY %module {
   }
 }
 
+TEST_P(ParameterizedMetadataTest,
+       ParallelScatterFromOperandForwardPass_Variadic) {
+  const char* const hlo_string = R"(
+HloModule module
+
+add (lhs.0: s32[], lhs.1: s32[], rhs.0: s32[], rhs.1: s32[]) -> (s32[], s32[]) {
+  lhs.0 = s32[] parameter(0)
+  lhs.1 = s32[] parameter(1)
+  rhs.0 = s32[] parameter(2)
+  rhs.1 = s32[] parameter(3)
+  sum.0 = s32[] add(lhs.0, rhs.0)
+  sum.1 = s32[] add(lhs.1, rhs.1)
+  ROOT tuple = tuple(sum.0, sum.1)
+}
+
+ENTRY %module {
+  %parameter.0 = s32[8,4,2,2]{3,2,1,0} parameter(0),
+    sharding={devices=[8,1,1,1]0,1,4,5,2,3,6,7 metadata={op_name="a"}}
+  %parameter.1 = s32[8,4,2,2]{3,2,1,0} parameter(1),
+    sharding={devices=[4,1,1,1,2]0,1,4,5,2,3,6,7 last_tile_dim_replicate metadata={op_name="b"}}
+  %iota = s32[1,8,4]{2,1,0} iota(), iota_dimension=1
+  %iota2 = s32[1,8,4]{2,1,0} iota(), iota_dimension=2
+  %concatenate = s32[2,8,4]{2,1,0} concatenate(s32[1,8,4]{2,1,0} %iota,
+    s32[1,8,4]{2,1,0} %iota2), dimensions={0}
+  %parameter.2 = s32[8,4,2,2]{3,2,1,0} parameter(2)
+  %parameter.3 = s32[8,4,2,2]{3,2,1,0} parameter(3)
+  %scatter = (s32[8,4,2,2]{3,2,1,0},s32[8,4,2,2]{3,2,1,0}) scatter(
+    s32[8,4,2,2]{3,2,1,0} %parameter.0,
+    s32[8,4,2,2]{3,2,1,0} %parameter.1,
+    s32[2,8,4]{2,1,0} %concatenate,
+    s32[8,4,2,2]{3,2,1,0} %parameter.2,
+    s32[8,4,2,2]{3,2,1,0} %parameter.3),
+    to_apply=add,
+    update_window_dims={2,3},
+    inserted_window_dims={0,1},
+    scatter_dims_to_operand_dims={0,1},
+    index_vector_dim=0
+  ROOT %copy = (s32[8,4,2,2]{3,2,1,0},s32[8,4,2,2]{3,2,1,0}) copy(%scatter)
+})";
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          ParseAndReturnVerifiedModule(hlo_string));
+  if (GetParam().clear_metadata) {
+    ClearMetadata(module.get());
+  }
+  TF_ASSERT_OK_AND_ASSIGN(
+      bool changed,
+      ShardingPropagation(/*is_spmd=*/true, GetParam().propagate_metadata)
+          .Run(module.get()));
+  XLA_VLOG_LINES(1, module->ToString());
+  EXPECT_TRUE(changed);
+  auto* instruction = FindInstruction(module.get(), "scatter");
+  ASSERT_NE(instruction, nullptr);
+  EXPECT_THAT(instruction,
+              op::Sharding("{{devices=[8,1,1,1]0,1,4,5,2,3,6,7},{devices=[4,1,"
+                           "1,1,2]0,1,4,5,2,3,6,7 last_tile_dim_replicate}}"));
+  if (GetParam().propagate_metadata && !GetParam().clear_metadata) {
+    EXPECT_THAT(instruction->sharding().tuple_elements()[0],
+                ShardingMetadata({CreateMetadata("a")}));
+    EXPECT_THAT(instruction->sharding().tuple_elements()[1],
+                ShardingMetadata({CreateMetadata("b")}));
+  } else {
+    EXPECT_THAT(instruction->sharding(), ShardingMetadata({}));
+  }
+}
+
+TEST_P(ParameterizedMetadataTest,
+       ParallelScatterFromIndexForwardPass_Variadic) {
+  const char* const hlo_string = R"(
+HloModule module
+
+add (lhs.0: s32[], lhs.1: s32[], rhs.0: s32[], rhs.1: s32[]) -> (s32[], s32[]) {
+  lhs.0 = s32[] parameter(0)
+  lhs.1 = s32[] parameter(1)
+  rhs.0 = s32[] parameter(2)
+  rhs.1 = s32[] parameter(3)
+  sum.0 = s32[] add(lhs.0, rhs.0)
+  sum.1 = s32[] add(lhs.1, rhs.1)
+  ROOT tuple = tuple(sum.0, sum.1)
+}
+
+ENTRY %module {
+  %parameter.0 = s32[8,4,2,2]{3,2,1,0} parameter(0)
+  %parameter.1 = s32[8,4,2,2]{3,2,1,0} parameter(1)
+  %iota = s32[1,8,4]{2,1,0} iota(), iota_dimension=1,
+    sharding={devices=[1,4,1,2]0,1,4,5,2,3,6,7 last_tile_dim_replicate metadata={op_name="a"}}
+  %iota2 = s32[1,8,4]{2,1,0} iota(), iota_dimension=2
+  %concatenate = s32[2,8,4]{2,1,0} concatenate(s32[1,8,4]{2,1,0} %iota,
+    s32[1,8,4]{2,1,0} %iota2), dimensions={0}
+  %parameter.2 = s32[8,4,2,2]{3,2,1,0} parameter(2)
+  %parameter.3 = s32[8,4,2,2]{3,2,1,0} parameter(3)
+  %scatter = (s32[8,4,2,2]{3,2,1,0},s32[8,4,2,2]{3,2,1,0}) scatter(
+    s32[8,4,2,2]{3,2,1,0} %parameter.0,
+    s32[8,4,2,2]{3,2,1,0} %parameter.1,
+    s32[2,8,4]{2,1,0} %concatenate,
+    s32[8,4,2,2]{3,2,1,0} %parameter.2,
+    s32[8,4,2,2]{3,2,1,0} %parameter.3),
+    to_apply=add,
+    update_window_dims={2,3},
+    inserted_window_dims={0,1},
+    scatter_dims_to_operand_dims={0,1},
+    index_vector_dim=0
+  ROOT %copy = (s32[8,4,2,2]{3,2,1,0},s32[8,4,2,2]{3,2,1,0}) copy(%scatter)
+})";
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          ParseAndReturnVerifiedModule(hlo_string));
+  if (GetParam().clear_metadata) {
+    ClearMetadata(module.get());
+  }
+  TF_ASSERT_OK_AND_ASSIGN(
+      bool changed,
+      ShardingPropagation(/*is_spmd=*/true, GetParam().propagate_metadata)
+          .Run(module.get()));
+  XLA_VLOG_LINES(1, module->ToString());
+  EXPECT_TRUE(changed);
+  auto* instruction = FindInstruction(module.get(), "scatter");
+  ASSERT_NE(instruction, nullptr);
+  EXPECT_THAT(instruction,
+              op::Sharding("{{devices=[4,1,1,1,2]0,1,4,5,2,3,6,7 "
+                           "last_tile_dim_replicate},{devices=[4,1,1,1,2]0,1,4,"
+                           "5,2,3,6,7 last_tile_dim_replicate}}"));
+  if (GetParam().propagate_metadata && !GetParam().clear_metadata) {
+    EXPECT_THAT(instruction->sharding().tuple_elements()[0],
+                ShardingMetadata({CreateMetadata("a")}));
+    EXPECT_THAT(instruction->sharding().tuple_elements()[1],
+                ShardingMetadata({CreateMetadata("a")}));
+  } else {
+    EXPECT_THAT(instruction->sharding(), ShardingMetadata({}));
+  }
+}
+
+TEST_P(ParameterizedMetadataTest,
+       ParallelScatterFromUpdateForwardPass_Variadic) {
+  const char* const hlo_string = R"(
+HloModule module
+
+add (lhs.0: s32[], lhs.1: s32[], rhs.0: s32[], rhs.1: s32[]) -> (s32[], s32[]) {
+  lhs.0 = s32[] parameter(0)
+  lhs.1 = s32[] parameter(1)
+  rhs.0 = s32[] parameter(2)
+  rhs.1 = s32[] parameter(3)
+  sum.0 = s32[] add(lhs.0, rhs.0)
+  sum.1 = s32[] add(lhs.1, rhs.1)
+  ROOT tuple = tuple(sum.0, sum.1)
+}
+
+ENTRY %module {
+  %parameter.0 = s32[8,4,2,2]{3,2,1,0} parameter(0)
+  %parameter.1 = s32[8,4,2,2]{3,2,1,0} parameter(1)
+  %iota = s32[1,8,4]{2,1,0} iota(), iota_dimension=1
+  %iota2 = s32[1,8,4]{2,1,0} iota(), iota_dimension=2
+  %concatenate = s32[2,8,4]{2,1,0} concatenate(s32[1,8,4]{2,1,0} %iota,
+    s32[1,8,4]{2,1,0} %iota2), dimensions={0}
+  %parameter.2 = s32[8,4,2,2]{3,2,1,0} parameter(2),
+    sharding={devices=[1,8,1,1]0,1,4,5,2,3,6,7 metadata={op_name="a"}}
+  %parameter.3 = s32[8,4,2,2]{3,2,1,0} parameter(3),
+    sharding={devices=[4,1,1,1,2]0,1,4,5,2,3,6,7 last_tile_dim_replicate metadata={op_name="b"}}
+  %scatter = (s32[8,4,2,2]{3,2,1,0},s32[8,4,2,2]{3,2,1,0}) scatter(
+    s32[8,4,2,2]{3,2,1,0} %parameter.0,
+    s32[8,4,2,2]{3,2,1,0} %parameter.1,
+    s32[2,8,4]{2,1,0} %concatenate,
+    s32[8,4,2,2]{3,2,1,0} %parameter.2,
+    s32[8,4,2,2]{3,2,1,0} %parameter.3),
+    to_apply=add,
+    update_window_dims={2,3},
+    inserted_window_dims={0,1},
+    scatter_dims_to_operand_dims={0,1},
+    index_vector_dim=0
+  ROOT %copy = (s32[8,4,2,2]{3,2,1,0},s32[8,4,2,2]{3,2,1,0}) copy(%scatter)
+})";
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          ParseAndReturnVerifiedModule(hlo_string));
+  if (GetParam().clear_metadata) {
+    ClearMetadata(module.get());
+  }
+  TF_ASSERT_OK_AND_ASSIGN(
+      bool changed,
+      ShardingPropagation(/*is_spmd=*/true, GetParam().propagate_metadata)
+          .Run(module.get()));
+  XLA_VLOG_LINES(1, module->ToString());
+  EXPECT_TRUE(changed);
+  auto* instruction = FindInstruction(module.get(), "scatter");
+  ASSERT_NE(instruction, nullptr);
+  EXPECT_THAT(instruction,
+              op::Sharding("{{devices=[1,8,1,1]0,1,4,5,2,3,6,7},{devices=[4,1,"
+                           "1,1,2]0,1,4,5,2,3,6,7 last_tile_dim_replicate}}"));
+  if (GetParam().propagate_metadata && !GetParam().clear_metadata) {
+    EXPECT_THAT(instruction->sharding().tuple_elements()[0],
+                ShardingMetadata({CreateMetadata("a")}));
+    EXPECT_THAT(instruction->sharding().tuple_elements()[1],
+                ShardingMetadata({CreateMetadata("b")}));
+  } else {
+    EXPECT_THAT(instruction->sharding(), ShardingMetadata({}));
+  }
+}
+
+TEST_P(ParameterizedMetadataTest, ParallelScatterBackwardPass_Variadic) {
+  const char* const hlo_string = R"(
+HloModule module
+
+add (lhs.0: s32[], lhs.1: s32[], rhs.0: s32[], rhs.1: s32[]) -> (s32[], s32[]) {
+  lhs.0 = s32[] parameter(0)
+  lhs.1 = s32[] parameter(1)
+  rhs.0 = s32[] parameter(2)
+  rhs.1 = s32[] parameter(3)
+  sum.0 = s32[] add(lhs.0, rhs.0)
+  sum.1 = s32[] add(lhs.1, rhs.1)
+  ROOT tuple = tuple(sum.0, sum.1)
+}
+
+ENTRY %module {
+  %parameter.0 = s32[8,4,2,2]{3,2,1,0} parameter(0)
+  %copy.p0 = s32[8,4,2,2]{3,2,1,0} copy(%parameter.0)
+  %parameter.1 = s32[8,4,2,2]{3,2,1,0} parameter(1)
+  %copy.p1 = s32[8,4,2,2]{3,2,1,0} copy(%parameter.1)
+  %iota = s32[1,8,4]{2,1,0} iota(), iota_dimension=1
+  %iota2 = s32[1,8,4]{2,1,0} iota(), iota_dimension=2
+  %concatenate = s32[2,8,4]{2,1,0} concatenate(s32[1,8,4]{2,1,0} %iota,
+    s32[1,8,4]{2,1,0} %iota2), dimensions={0}
+  %parameter.2 = s32[8,4,2,2]{3,2,1,0} parameter(2)
+  %copy.p2 = s32[8,4,2,2]{3,2,1,0} copy(%parameter.2)
+  %parameter.3 = s32[8,4,2,2]{3,2,1,0} parameter(3)
+  %copy.p3 = s32[8,4,2,2]{3,2,1,0} copy(%parameter.3)
+  %scatter = (s32[8,4,2,2]{3,2,1,0},s32[8,4,2,2]{3,2,1,0}) scatter(
+    s32[8,4,2,2]{3,2,1,0} %copy.p0,
+    s32[8,4,2,2]{3,2,1,0} %copy.p1,
+    s32[2,8,4]{2,1,0} %concatenate,
+    s32[8,4,2,2]{3,2,1,0} %copy.p2,
+    s32[8,4,2,2]{3,2,1,0} %copy.p3),
+    to_apply=add,
+    update_window_dims={2,3},
+    inserted_window_dims={0,1},
+    scatter_dims_to_operand_dims={0,1},
+    index_vector_dim=0,
+    sharding={{devices=[8,1,1,1]0,1,4,5,2,3,6,7 metadata={op_name="a"}},
+              {devices=[4,1,1,1,2]0,1,4,5,2,3,6,7 last_tile_dim_replicate metadata={op_name="b"}}}
+  ROOT %copy = (s32[8,4,2,2]{3,2,1,0},s32[8,4,2,2]{3,2,1,0}) copy(%scatter)
+})";
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          ParseAndReturnVerifiedModule(hlo_string));
+  if (GetParam().clear_metadata) {
+    ClearMetadata(module.get());
+  }
+  TF_ASSERT_OK_AND_ASSIGN(
+      bool changed,
+      ShardingPropagation(/*is_spmd=*/true, GetParam().propagate_metadata)
+          .Run(module.get()));
+  XLA_VLOG_LINES(1, module->ToString());
+  EXPECT_TRUE(changed);
+  auto* concatenate = FindInstruction(module.get(), "concatenate");
+  ASSERT_NE(concatenate, nullptr);
+  EXPECT_THAT(concatenate, op::Sharding("{devices=[1,8,1]0,1,4,5,2,3,6,7}"));
+  auto* copy_p0 = FindInstruction(module.get(), "copy.p0");
+  ASSERT_NE(copy_p0, nullptr);
+  EXPECT_THAT(copy_p0, op::Sharding("{devices=[8,1,1,1]0,1,4,5,2,3,6,7}"));
+  auto* copy_p1 = FindInstruction(module.get(), "copy.p1");
+  ASSERT_NE(copy_p1, nullptr);
+  EXPECT_THAT(
+      copy_p1,
+      op::Sharding(
+          "{devices=[4,1,1,1,2]0,1,4,5,2,3,6,7 last_tile_dim_replicate}"));
+  auto* copy_p2 = FindInstruction(module.get(), "copy.p2");
+  ASSERT_NE(copy_p2, nullptr);
+  EXPECT_THAT(copy_p2, op::Sharding("{devices=[8,1,1,1]0,1,4,5,2,3,6,7}"));
+  auto* copy_p3 = FindInstruction(module.get(), "copy.p3");
+  ASSERT_NE(copy_p3, nullptr);
+  EXPECT_THAT(
+      copy_p3,
+      op::Sharding(
+          "{devices=[4,1,1,1,2]0,1,4,5,2,3,6,7 last_tile_dim_replicate}"));
+  for (HloInstruction* instruction : {concatenate, copy_p0, copy_p2}) {
+    if (GetParam().propagate_metadata && !GetParam().clear_metadata) {
+      EXPECT_THAT(instruction->sharding(),
+                  ShardingMetadata({CreateMetadata("a")}));
+    } else {
+      EXPECT_THAT(instruction->sharding(), ShardingMetadata({}));
+    }
+  }
+  for (HloInstruction* instruction : {copy_p1, copy_p3}) {
+    if (GetParam().propagate_metadata && !GetParam().clear_metadata) {
+      EXPECT_THAT(instruction->sharding(),
+                  ShardingMetadata({CreateMetadata("b")}));
+    } else {
+      EXPECT_THAT(instruction->sharding(), ShardingMetadata({}));
+    }
+  }
+}
+
+TEST_P(ParameterizedMetadataTest, ParallelScatterBackwardPass2_Variadic) {
+  const char* const hlo_string = R"(
+HloModule module
+
+add (lhs.0: s32[], lhs.1: s32[], rhs.0: s32[], rhs.1: s32[]) -> (s32[], s32[]) {
+  lhs.0 = s32[] parameter(0)
+  lhs.1 = s32[] parameter(1)
+  rhs.0 = s32[] parameter(2)
+  rhs.1 = s32[] parameter(3)
+  sum.0 = s32[] add(lhs.0, rhs.0)
+  sum.1 = s32[] add(lhs.1, rhs.1)
+  ROOT tuple = tuple(sum.0, sum.1)
+}
+
+ENTRY %module {
+  %parameter.0 = s32[4,8,2,2]{3,2,1,0} parameter(0)
+  %copy.p0 = s32[4,8,2,2]{3,2,1,0} copy(%parameter.0)
+  %parameter.1 = s32[4,8,2,2]{3,2,1,0} parameter(1)
+  %copy.p1 = s32[4,8,2,2]{3,2,1,0} copy(%parameter.1)
+  %iota = s32[1,8,4]{2,1,0} iota(), iota_dimension=1
+  %iota2 = s32[1,8,4]{2,1,0} iota(), iota_dimension=2
+  %concatenate = s32[2,8,4]{2,1,0} concatenate(s32[1,8,4]{2,1,0} %iota,
+    s32[1,8,4]{2,1,0} %iota2), dimensions={0}
+  %parameter.2 = s32[8,4,2,2]{3,2,1,0} parameter(2)
+  %copy.p2 = s32[8,4,2,2]{3,2,1,0} copy(%parameter.2)
+  %parameter.3 = s32[8,4,2,2]{3,2,1,0} parameter(3)
+  %copy.p3 = s32[8,4,2,2]{3,2,1,0} copy(%parameter.3)
+  %scatter = (s32[4,8,2,2]{3,2,1,0},s32[4,8,2,2]{3,2,1,0}) scatter(
+    s32[4,8,2,2]{3,2,1,0} %copy.p0,
+    s32[4,8,2,2]{3,2,1,0} %copy.p1,
+    s32[2,8,4]{2,1,0} %concatenate,
+    s32[8,4,2,2]{3,2,1,0} %copy.p2,
+    s32[8,4,2,2]{3,2,1,0} %copy.p3),
+    to_apply=add,
+    update_window_dims={2,3},
+    inserted_window_dims={0,1},
+    scatter_dims_to_operand_dims={1,0},
+    index_vector_dim=0,
+    sharding={{devices=[4,1,1,1]0,1,4,5 metadata={op_name="a"}},
+              {devices=[2,1,1,1,2]0,1,4,5 last_tile_dim_replicate metadata={op_name="b"}}}
+  ROOT %copy = (s32[4,8,2,2]{3,2,1,0},s32[4,8,2,2]{3,2,1,0}) copy(%scatter)
+})";
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          ParseAndReturnVerifiedModule(hlo_string));
+  if (GetParam().clear_metadata) {
+    ClearMetadata(module.get());
+  }
+  TF_ASSERT_OK_AND_ASSIGN(
+      bool changed,
+      ShardingPropagation(/*is_spmd=*/true, GetParam().propagate_metadata)
+          .Run(module.get()));
+  XLA_VLOG_LINES(1, module->ToString());
+  EXPECT_TRUE(changed);
+  auto* concatenate = FindInstruction(module.get(), "concatenate");
+  ASSERT_NE(concatenate, nullptr);
+  EXPECT_THAT(concatenate, op::Sharding("{devices=[1,1,4]0,1,4,5}"));
+  auto* copy_p0 = FindInstruction(module.get(), "copy.p0");
+  ASSERT_NE(copy_p0, nullptr);
+  EXPECT_THAT(copy_p0, op::Sharding("{devices=[4,1,1,1]0,1,4,5}"));
+  auto* copy_p1 = FindInstruction(module.get(), "copy.p1");
+  ASSERT_NE(copy_p1, nullptr);
+  EXPECT_THAT(
+      copy_p1,
+      op::Sharding("{devices=[2,1,1,1,2]0,1,4,5 last_tile_dim_replicate}"));
+  auto* copy_p2 = FindInstruction(module.get(), "copy.p2");
+  ASSERT_NE(copy_p2, nullptr);
+  EXPECT_THAT(copy_p2, op::Sharding("{devices=[1,4,1,1]0,1,4,5}"));
+  auto* copy_p3 = FindInstruction(module.get(), "copy.p3");
+  ASSERT_NE(copy_p3, nullptr);
+  EXPECT_THAT(
+      copy_p3,
+      op::Sharding("{devices=[1,2,1,1,2]0,1,4,5 last_tile_dim_replicate}"));
+  for (HloInstruction* instruction : {concatenate, copy_p0, copy_p2}) {
+    if (GetParam().propagate_metadata && !GetParam().clear_metadata) {
+      EXPECT_THAT(instruction->sharding(),
+                  ShardingMetadata({CreateMetadata("a")}));
+    } else {
+      EXPECT_THAT(instruction->sharding(), ShardingMetadata({}));
+    }
+  }
+  for (HloInstruction* instruction : {copy_p1, copy_p3}) {
+    if (GetParam().propagate_metadata && !GetParam().clear_metadata) {
+      EXPECT_THAT(instruction->sharding(),
+                  ShardingMetadata({CreateMetadata("b")}));
+    } else {
+      EXPECT_THAT(instruction->sharding(), ShardingMetadata({}));
+    }
+  }
+}
+
 TEST_P(ParameterizedMetadataTest,
        GatherMergedIndexParallelAndOperandPassthroughFromOperandForwardPass) {
   absl::string_view hlo_string = R"(
diff --git a/third_party/xla/xla/service/spmd/gather_scatter_handler.cc b/third_party/xla/xla/service/spmd/gather_scatter_handler.cc
index 82ef9f2bca02a3..41386a6d822d87 100644
--- a/third_party/xla/xla/service/spmd/gather_scatter_handler.cc
+++ b/third_party/xla/xla/service/spmd/gather_scatter_handler.cc
@@ -1498,8 +1498,16 @@ Status SpmdPartitioningVisitor::HandleScatter(HloInstruction* hlo) {
         return operand.sharding() == operands[0].sharding() &&
                operand.base_shape() == operands[0].base_shape();
       })) {
-    return FailedPrecondition(
-        "All scatter operands must have the same sharding.");
+    std::vector<HloSharding> shardings;
+    absl::c_transform(operands, std::back_inserter(shardings),
+                      [](const PartitionedHlo& instruction) {
+                        return instruction.sharding();
+                      });
+    HloSharding common_sharding =
+        hlo_sharding_util::FindCommonSharding(shardings);
+    absl::c_for_each(operands, [&](PartitionedHlo& operand) {
+      operand = operand.Reshard(common_sharding);
+    });
   }
   absl::c_transform(
       scatter->scatter_updates(), std::back_inserter(updates),
@@ -1508,8 +1516,16 @@ Status SpmdPartitioningVisitor::HandleScatter(HloInstruction* hlo) {
         return update.sharding() == updates[0].sharding() &&
                update.base_shape() == updates[0].base_shape();
       })) {
-    return FailedPrecondition(
-        "All scatter outputs must have the same sharding.");
+    // Reshard the updates themselves so they all share one common sharding.
+    std::vector<HloSharding> shardings;
+    absl::c_transform(
+        updates, std::back_inserter(shardings),
+        [](const PartitionedHlo& update) { return update.sharding(); });
+    HloSharding common_sharding =
+        hlo_sharding_util::FindCommonSharding(shardings);
+    absl::c_for_each(updates, [&](PartitionedHlo& update) {
+      update = update.Reshard(common_sharding);
+    });
   }
   CHECK_EQ(operands.size(), updates.size());
   CHECK_EQ(operands.size() * 2,
diff --git a/third_party/xla/xla/service/spmd/spmd_partitioner_test.cc b/third_party/xla/xla/service/spmd/spmd_partitioner_test.cc
index 14cc49434a8785..67330f976bcf87 100644
--- a/third_party/xla/xla/service/spmd/spmd_partitioner_test.cc
+++ b/third_party/xla/xla/service/spmd/spmd_partitioner_test.cc
@@ -7461,6 +7461,55 @@ ENTRY entry {
                     op::Shape("f32[2,5]")));
 }
 
+TEST_P(SpmdPartitioningTest, VariadicScatter) {
+  absl::string_view hlo_string = R"(
+HloModule module
+
+add (lhs.0: f32[], lhs.1: f32[], rhs.0: f32[], rhs.1: f32[]) -> (f32[], f32[]) {
+  lhs.0 = f32[] parameter(0)
+  lhs.1 = f32[] parameter(1)
+  rhs.0 = f32[] parameter(2)
+  rhs.1 = f32[] parameter(3)
+  sum.0 = f32[] add(lhs.0, rhs.0)
+  sum.1 = f32[] add(lhs.1, rhs.1)
+  ROOT tuple = tuple(sum.0, sum.1)
+}
+
+ENTRY entry {
+  %input.0 = f32[2,9] parameter(0), sharding={devices=[2,1,2]0,1,2,3 last_tile_dim_replicate}
+  %input.1 = f32[2,9] parameter(1), sharding={devices=[1,2,2]0,2,1,3 last_tile_dim_replicate}
+  %indices = s32[3] parameter(2), sharding={replicated}
+  %updates.0 = f32[3,9] parameter(3), sharding={devices=[2,1,2]0,1,2,3 last_tile_dim_replicate}
+  %updates.1 = f32[3,9] parameter(4), sharding={devices=[1,4]0,1,2,3}
+  ROOT %scatter = (f32[2,9],f32[2,9]) scatter(%input.0, %input.1, %indices, %updates.0, %updates.1),
+      to_apply=add,
+      update_window_dims={1},
+      inserted_window_dims={0},
+      scatter_dims_to_operand_dims={0},
+      index_vector_dim=1, sharding={devices=[1,4]0,1,2,3}
+})";
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          PartitionComputation(hlo_string, /*num_devices=*/4));
+  VLOG(1) << module->ToString();
+  HloInstruction* root = module->entry_computation()->root_instruction();
+  auto scatter = op::Scatter(op::Shape("f32[1,9]"), op::Shape("f32[1,9]"),
+                             op::Shape("s32[3]"), op::Shape("f32[3,9]"),
+                             op::Shape("f32[3,9]"));
+  EXPECT_THAT(
+      root,
+      AllOf(op::Tuple(op::DynamicSlice(
+                          op::Pad(op::AllReduce(op::DynamicUpdateSlice(
+                                      _, op::GetTupleElement(scatter), _, _)),
+                                  _),
+                          _, _),
+                      op::DynamicSlice(
+                          op::Pad(op::AllReduce(op::DynamicUpdateSlice(
+                                      _, op::GetTupleElement(scatter), _, _)),
+                                  _),
+                          _, _)),
+            op::Shape("(f32[2,3],f32[2,3])")));
+}
+
 TEST_P(SpmdPartitioningTest, PassthroughScatter) {
   absl::string_view hlo_string = R"(
 HloModule module

From 1e2ba432f94c54ffeaea535d91ff11a1f76b4eae Mon Sep 17 00:00:00 2001
From: Yash Katariya <yashkatariya@google.com>
Date: Mon, 2 Oct 2023 17:28:40 -0700
Subject: [PATCH 503/567] Remove rc3 from python 3.12 install now that stable
 release is out

PiperOrigin-RevId: 570226890
---
 ....rbe.cuda11.8-cudnn8.6-ubuntu20.04-manylinux2014-multipython | 2 +-
 ....rbe.cuda12.2-cudnn8.9-ubuntu20.04-manylinux2014-multipython | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/tensorflow/tools/ci_build/Dockerfile.rbe.cuda11.8-cudnn8.6-ubuntu20.04-manylinux2014-multipython b/tensorflow/tools/ci_build/Dockerfile.rbe.cuda11.8-cudnn8.6-ubuntu20.04-manylinux2014-multipython
index 8fcf2211bcd848..fa4748d996ee6b 100644
--- a/tensorflow/tools/ci_build/Dockerfile.rbe.cuda11.8-cudnn8.6-ubuntu20.04-manylinux2014-multipython
+++ b/tensorflow/tools/ci_build/Dockerfile.rbe.cuda11.8-cudnn8.6-ubuntu20.04-manylinux2014-multipython
@@ -35,7 +35,7 @@ COPY install/build_and_install_python.sh /install/
 RUN /install/build_and_install_python.sh "3.9.4"
 RUN /install/build_and_install_python.sh "3.10.0"
 RUN /install/build_and_install_python.sh "3.11.0"
-RUN /install/build_and_install_python.sh "3.12.0rc3"
+RUN /install/build_and_install_python.sh "3.12.0"
 
 COPY install/install_pip_packages_by_version.sh /install/
 # https://github.com/numpy/numpy/issues/22623 for `SETUPTOOLS_USE_DISTUTILS`.
diff --git a/tensorflow/tools/ci_build/Dockerfile.rbe.cuda12.2-cudnn8.9-ubuntu20.04-manylinux2014-multipython b/tensorflow/tools/ci_build/Dockerfile.rbe.cuda12.2-cudnn8.9-ubuntu20.04-manylinux2014-multipython
index 308b709bf51471..008568578c5460 100644
--- a/tensorflow/tools/ci_build/Dockerfile.rbe.cuda12.2-cudnn8.9-ubuntu20.04-manylinux2014-multipython
+++ b/tensorflow/tools/ci_build/Dockerfile.rbe.cuda12.2-cudnn8.9-ubuntu20.04-manylinux2014-multipython
@@ -34,7 +34,7 @@ COPY install/build_and_install_python.sh /install/
 RUN /install/build_and_install_python.sh "3.9.4"
 RUN /install/build_and_install_python.sh "3.10.0"
 RUN /install/build_and_install_python.sh "3.11.0"
-RUN /install/build_and_install_python.sh "3.12.0rc3"
+RUN /install/build_and_install_python.sh "3.12.0"
 
 COPY install/install_pip_packages_by_version.sh /install/
 # https://github.com/numpy/numpy/issues/22623 for `SETUPTOOLS_USE_DISTUTILS`.

From d1ba59bcc03112ba5351b8f50db206b40d63d643 Mon Sep 17 00:00:00 2001
From: Anlun Xu <anlunx@google.com>
Date: Mon, 2 Oct 2023 18:02:14 -0700
Subject: [PATCH 504/567] [xla:gpu] Deallocate all graphs with same device
 ordinal if a graph fallbacks

PiperOrigin-RevId: 570233968
---
 third_party/xla/xla/runtime/state.h           |  7 +--
 third_party/xla/xla/service/gpu/runtime/BUILD |  1 +
 .../xla/xla/service/gpu/runtime/executable.cc |  5 ++-
 .../xla/service/gpu/runtime/graph_launch.cc   | 44 +++++++++++++------
 .../xla/service/gpu/runtime/graph_launch.h    |  3 ++
 5 files changed, 41 insertions(+), 19 deletions(-)

diff --git a/third_party/xla/xla/runtime/state.h b/third_party/xla/xla/runtime/state.h
index c505dd96ffea72..bbfd6e9a8cb8f0 100644
--- a/third_party/xla/xla/runtime/state.h
+++ b/third_party/xla/xla/runtime/state.h
@@ -18,6 +18,7 @@ limitations under the License.
 
 #include <cstddef>
 #include <memory>
+#include <optional>
 #include <type_traits>
 #include <vector>
 
@@ -72,7 +73,7 @@ class StateVector {
     template <typename F>
     absl::StatusOr<T*> GetOrCreate(size_t id, F&& create);
 
-    absl::StatusOr<T*> Get(size_t id);
+    std::optional<T*> Get(size_t id);
 
     absl::Status Erase(size_t id);
 
@@ -183,7 +184,7 @@ absl::StatusOr<T*> StateVector<T>::Snapshot::GetOrCreate(size_t id,
 }
 
 template <typename T>
-absl::StatusOr<T*> StateVector<T>::Snapshot::Get(size_t id) {
+std::optional<T*> StateVector<T>::Snapshot::Get(size_t id) {
   // If snapshot already contains the entry, just return it.
   std::vector<T*>& snapshot = *maybe_obsolete_snapshot_;
   if (id < snapshot.size() && snapshot[id]) return snapshot[id];
@@ -196,7 +197,7 @@ absl::StatusOr<T*> StateVector<T>::Snapshot::Get(size_t id) {
   std::vector<std::unique_ptr<T>>& state = owning_state_.vector_;
   if (id < state.size() && state[id].get()) return state[id].get();
 
-  return absl::InternalError("Value not found in state vector");
+  return std::nullopt;
 }
 
 template <typename T>
diff --git a/third_party/xla/xla/service/gpu/runtime/BUILD b/third_party/xla/xla/service/gpu/runtime/BUILD
index 4cdd2dcbbd8464..c995fb2f175cba 100644
--- a/third_party/xla/xla/service/gpu/runtime/BUILD
+++ b/third_party/xla/xla/service/gpu/runtime/BUILD
@@ -431,6 +431,7 @@ cc_library(
         "@com_google_absl//absl/log:check",
         "@com_google_absl//absl/status",
         "@com_google_absl//absl/synchronization",
+        "@local_tsl//tsl/platform:errors",
         "@local_tsl//tsl/profiler/lib:profiler_lock",
         "@local_tsl//tsl/profiler/lib:traceme",
         "@local_tsl//tsl/profiler/lib:traceme_encode",
diff --git a/third_party/xla/xla/service/gpu/runtime/executable.cc b/third_party/xla/xla/service/gpu/runtime/executable.cc
index 8fe28a284f275f..13641b8fa1b510 100644
--- a/third_party/xla/xla/service/gpu/runtime/executable.cc
+++ b/third_party/xla/xla/service/gpu/runtime/executable.cc
@@ -416,7 +416,7 @@ Status GpuRuntimeExecutable::Execute(
   std::shared_ptr<StreamExecutorGraphInstances> executor_graphs =
       graph_instances_(executor);
 
-  StreamExecutorGraphInstances::Snapshot graph_instances =
+  StreamExecutorGraphInstances::Snapshot se_graph_instances =
       executor_graphs->snapshot();
   CapturedFunctionExecutionCount::Snapshot execution_count =
       captured_function_counts_(executor)->snapshot();
@@ -461,7 +461,8 @@ Status GpuRuntimeExecutable::Execute(
       // only.
       &fused_attention_runners, &fused_attention_backward_runners,
 #endif  // GOOGLE_CUDA
-      &graph_instances, &execution_count, &ordinal_to_fallback,
+      &se_graph_instances, &execution_count, &ordinal_to_fallback,
+      &graph_instances_,
 #endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
       &concurrent_region_status,
       // Null pointer will be interpreted as an absence of async collectives
diff --git a/third_party/xla/xla/service/gpu/runtime/graph_launch.cc b/third_party/xla/xla/service/gpu/runtime/graph_launch.cc
index fa42e1330050c4..b40e5ff7954d4f 100644
--- a/third_party/xla/xla/service/gpu/runtime/graph_launch.cc
+++ b/third_party/xla/xla/service/gpu/runtime/graph_launch.cc
@@ -41,6 +41,7 @@ limitations under the License.
 #include "xla/service/gpu/runtime/support.h"
 #include "xla/service/service_executable_run_options.h"
 #include "xla/statusor.h"
+#include "tsl/platform/errors.h"
 #include "tsl/profiler/lib/profiler_lock.h"
 #include "tsl/profiler/lib/traceme.h"
 #include "tsl/profiler/lib/traceme_encode.h"
@@ -303,8 +304,8 @@ Status GraphInstances::InstantiateAllGraphs(
                           "xla.gpu.graph.capture"))
       continue;
 
-    StatusOr<std::monostate*> fallback = ordinal_to_fallback->Get(ordinal);
-    if (fallback.ok()) continue;
+    std::optional<std::monostate*> fallback = ordinal_to_fallback->Get(ordinal);
+    if (fallback.has_value()) continue;
 
     VLOG(3) << "Instantiate Gpu graph defined by capture function @"
             << executable.function_name(ordinal) << " (ordinal = " << ordinal
@@ -375,6 +376,17 @@ Status GraphInstances::InstantiateAllGraphs(
   return OkStatus();
 }
 
+Status GraphInstances::RemoveGraphsByOrdinal(unsigned ordinal) {
+  absl::MutexLock lock(&impl_->mu);
+  for (auto& [stream_executor, state] : impl_->graphs) {
+    // Notify about a destroyed instance only if Erase reported success.
+    if (state.instances->snapshot().Erase(ordinal).ok()) {
+      NotifyGraphInstancesDestroyed(stream_executor, 1);
+    }
+  }
+  return OkStatus();
+}
+
 CapturedFunctionExecutionCount* CapturedFunctionExecutionCounts::operator()(
     se::StreamExecutor* executor) {
   absl::MutexLock lock(&mutex_);
@@ -555,11 +567,11 @@ static absl::Status LaunchGraph(
     const std::vector<uint8_t>* cubin, se::DeviceMemoryBase* temp_buffer,
     StreamExecutorKernels::Snapshot* kernels,
     StreamExecutorConvRunners::Snapshot* convs,
-    StreamExecutorGraphInstances::Snapshot* instances,
+    StreamExecutorGraphInstances::Snapshot* se_graph_instances,
     CapturedFunctionExecutionCount::Snapshot* counts,
     OrdinalToFallback::Snapshot* ordinal_to_fallback,
-    GemmConfigs::Snapshot* gemm_config, runtime::Executable* executable,
-    NonAtomicallyUpgradeableRWLock* gpu_lock,
+    GraphInstances* graph_instances, GemmConfigs::Snapshot* gemm_config,
+    runtime::Executable* executable, NonAtomicallyUpgradeableRWLock* gpu_lock,
     ConcurrentRegionStatus* region_status, CustomCall::RemainingArgs fwd_args,
     CustomCall::FunctionOrdinal capture) {
 #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
@@ -591,10 +603,10 @@ static absl::Status LaunchGraph(
   // work around disable graph execution and run everything in op-by-op mode.
   bool is_profiling = tsl::profiler::ProfilerLock::HasActiveSession();
 
-  StatusOr<std::monostate*> fallback =
+  std::optional<std::monostate*> fallback =
       ordinal_to_fallback->Get(capture.ordinal);
 
-  if (count < num_runs_to_instantiate || is_profiling || fallback.ok()) {
+  if (count < num_runs_to_instantiate || is_profiling || fallback.has_value()) {
     VLOG(3) << "Run gpu graph in op-by-op mode: ordinal = " << capture.ordinal;
     return RunGraphOpByOp(run_options, function_ref, fwd_args, user_data());
   }
@@ -617,16 +629,16 @@ static absl::Status LaunchGraph(
     // If num_runs_to_instantiate is less than 0, all graphs should be
     // instantiated ahead-of-time. If we fail to get the graph instance, then
     // graph instantiation failed due to OOM. So we run the graph op-by-op.
-    absl::StatusOr<GraphInstance*> try_get_instance =
-        instances->Get(capture.ordinal);
-    if (try_get_instance.ok()) {
+    std::optional<GraphInstance*> try_get_instance =
+        se_graph_instances->Get(capture.ordinal);
+    if (try_get_instance.has_value()) {
       instance = try_get_instance.value();
     } else {
       return RunGraphOpByOp(run_options, function_ref, fwd_args, user_data());
     }
   } else {
-    TF_ASSIGN_OR_RETURN(instance,
-                        instances->GetOrCreate(capture.ordinal, instantiate));
+    TF_ASSIGN_OR_RETURN(instance, se_graph_instances->GetOrCreate(
+                                      capture.ordinal, instantiate));
   }
 
   {
@@ -670,8 +682,6 @@ static absl::Status LaunchGraph(
     case se::gpu::OwnedGpuGraphExec::UpdateResult::kFallback: {
       LOG(WARNING) << "Fallback to op-by-op mode because memset node breaks "
                       "graph update";
-      // Deallocate instance.
-      TF_RETURN_IF_ERROR(instances->Erase(capture.ordinal));
       // Set ordinal_to_fallback to prevent future instantiation of this graph.
       TF_ASSIGN_OR_RETURN(
           std::monostate * fallback,
@@ -679,6 +689,11 @@ static absl::Status LaunchGraph(
               capture.ordinal,
               []() -> StatusOr<std::monostate> { return std::monostate{}; }));
       DCHECK(fallback);
+
+      // Deallocate graph instances with the ordinal.
+      TF_RETURN_IF_ERROR(
+          graph_instances->RemoveGraphsByOrdinal(capture.ordinal));
+
       return RunGraphOpByOp(run_options, function_ref, fwd_args, user_data());
     }
     case se::gpu::OwnedGpuGraphExec::UpdateResult::kSuccess:
@@ -716,6 +731,7 @@ XLA_RUNTIME_DEFINE_CUSTOM_CALL(
         .UserData<StreamExecutorGraphInstances::Snapshot*>()
         .UserData<CapturedFunctionExecutionCount::Snapshot*>()
         .UserData<OrdinalToFallback::Snapshot*>()
+        .UserData<GraphInstances*>()
         .UserData<GemmConfigs::Snapshot*>()
         .UserData<Executable*>()
         .UserData<NonAtomicallyUpgradeableRWLock*>()
diff --git a/third_party/xla/xla/service/gpu/runtime/graph_launch.h b/third_party/xla/xla/service/gpu/runtime/graph_launch.h
index e4841541bc781e..1553815acf8717 100644
--- a/third_party/xla/xla/service/gpu/runtime/graph_launch.h
+++ b/third_party/xla/xla/service/gpu/runtime/graph_launch.h
@@ -118,6 +118,9 @@ class GraphInstances {
   bool InstantiatedAllGraphs(const ServiceExecutableRunOptions* run_options,
                              const runtime::Executable& executable);
 
+  // Remove all instantiated graph instances that have a certain ordinal.
+  Status RemoveGraphsByOrdinal(unsigned ordinal);
+
  private:
   std::shared_ptr<Impl> impl_;
 };

From 80bfa43d06907ac9ce1df928eb74983f1ad10bac Mon Sep 17 00:00:00 2001
From: Reed Wanderman-Milne <reedwm@google.com>
Date: Mon, 2 Oct 2023 18:31:36 -0700
Subject: [PATCH 505/567] Support int4 in hlo_utils.cc functions.

In particular, support int4 in CreateDenseAttrFromLiteral and CopyDenseElementsDataToXlaFormat.

PiperOrigin-RevId: 570238807
---
 .../xla/xla/translate/hlo_to_mhlo/BUILD       |  5 +++
 .../xla/translate/hlo_to_mhlo/hlo_utils.cc    | 39 ++++++++++++++++--
 .../translate/hlo_to_mhlo/hlo_utils_test.cc   | 41 +++++++++++++++++++
 third_party/xla/xla/util.cc                   | 13 ++++++
 third_party/xla/xla/util.h                    |  6 +++
 5 files changed, 101 insertions(+), 3 deletions(-)

diff --git a/third_party/xla/xla/translate/hlo_to_mhlo/BUILD b/third_party/xla/xla/translate/hlo_to_mhlo/BUILD
index 53836e78cf4787..8645c6f381a423 100644
--- a/third_party/xla/xla/translate/hlo_to_mhlo/BUILD
+++ b/third_party/xla/xla/translate/hlo_to_mhlo/BUILD
@@ -97,6 +97,7 @@ cc_library(
     deps = [
         "//xla:literal",
         "//xla:shape_util",
+        "//xla:types",
         "//xla:util",
         "//xla:xla_data_proto_cc",
         "//xla/hlo/ir:hlo",
@@ -116,10 +117,14 @@ xla_cc_test(
     srcs = ["hlo_utils_test.cc"],
     deps = [
         ":hlo_utils",
+        "//xla:literal",
+        "//xla:literal_util",
         "//xla:shape_util",
         "//xla:test",
+        "//xla:types",
         "@llvm-project//mlir:IR",
         "@llvm-project//mlir:Support",
+        "@local_tsl//tsl/lib/core:status_test_util",
         "@local_tsl//tsl/platform:protobuf",
         "@local_tsl//tsl/platform:test_main",
     ],
diff --git a/third_party/xla/xla/translate/hlo_to_mhlo/hlo_utils.cc b/third_party/xla/xla/translate/hlo_to_mhlo/hlo_utils.cc
index 3c1decc42f3192..1574cca50ee837 100644
--- a/third_party/xla/xla/translate/hlo_to_mhlo/hlo_utils.cc
+++ b/third_party/xla/xla/translate/hlo_to_mhlo/hlo_utils.cc
@@ -17,6 +17,11 @@ limitations under the License.
 
 #include "xla/translate/hlo_to_mhlo/hlo_utils.h"
 
+#include <cstddef>
+#include <type_traits>
+#include <vector>
+
+#include "llvm/ADT/ArrayRef.h"
 #include "mlir/IR/AffineMap.h"  // from @llvm-project
 #include "mlir/IR/Attributes.h"  // from @llvm-project
 #include "mlir/IR/BuiltinTypes.h"  // from @llvm-project
@@ -25,6 +30,8 @@ limitations under the License.
 #include "xla/mlir_hlo/lhlo/IR/lhlo_ops.h"
 #include "xla/primitive_util.h"
 #include "xla/service/llvm_ir/llvm_util.h"
+#include "xla/types.h"
+#include "xla/util.h"
 #include "xla/xla_data.pb.h"
 
 namespace xla {
@@ -40,9 +47,22 @@ using xla::StatusOr;
 template <typename CppType>
 ::mlir::DenseElementsAttr CreateDenseAttrFromLiteral(
     const ShapedType& type, const LiteralBase& literal) {
-  auto data_span = literal.data<CppType>();
-  return ::mlir::DenseElementsAttr::get(
-      type, llvm::ArrayRef(data_span.data(), data_span.size()));
+  if constexpr (std::is_same_v<CppType, u4> || std::is_same_v<CppType, s4>) {
+    // DenseElementsAttr::get() does not support being passed an i4 array.
+    // Instead, create a buffer of padded i4 values (one per byte) and call
+    // DenseElementsAttr::getFromRawBuffer().
+    auto data_span = literal.data<CppType>();
+    std::vector<char> int4_padded_data;
+    int4_padded_data.reserve(literal.element_count());
+    for (size_t i = 0; i < literal.element_count(); i++) {
+      int4_padded_data.push_back(static_cast<char>(data_span[i]));
+    }
+    return ::mlir::DenseElementsAttr::getFromRawBuffer(type, int4_padded_data);
+  } else {
+    auto data_span = literal.data<CppType>();
+    return ::mlir::DenseElementsAttr::get(
+        type, llvm::ArrayRef(data_span.data(), data_span.size()));
+  }
 }
 
 StatusOr<AffineMap> GetPermutationIfAvailable(const Shape& shape,
@@ -85,6 +105,15 @@ void CopyDenseElementsBy(mlir::DenseElementsAttr data,
   }
 }
 
+template <>
+void CopyDenseElementsBy<u4>(mlir::DenseElementsAttr data,
+                             std::vector<uint8_t>* output) {
+  output->resize(CeilOfRatio(data.getNumElements(), int64_t{2}));
+  absl::Span<char> output_span =
+      absl::MakeSpan(reinterpret_cast<char*>(output->data()), output->size());
+  PackInt4(data.getRawData(), output_span);
+}
+
 }  // namespace
 
 StatusOr<mlir::MemRefType> ConvertTensorShapeToMemRefType(
@@ -132,6 +161,10 @@ Status CopyDenseElementsDataToXlaFormat(mlir::DenseElementsAttr data,
     CopyDenseElementsBy<bool>(data, output);
     return OkStatus();
   }
+  if (element_type.isInteger(4)) {
+    CopyDenseElementsBy<u4>(data, output);
+    return OkStatus();
+  }
   if (element_type.isInteger(8)) {
     CopyDenseElementsBy<uint8_t>(data, output);
     return OkStatus();
diff --git a/third_party/xla/xla/translate/hlo_to_mhlo/hlo_utils_test.cc b/third_party/xla/xla/translate/hlo_to_mhlo/hlo_utils_test.cc
index 08499aca2613ba..ef6ae79b2b5da3 100644
--- a/third_party/xla/xla/translate/hlo_to_mhlo/hlo_utils_test.cc
+++ b/third_party/xla/xla/translate/hlo_to_mhlo/hlo_utils_test.cc
@@ -15,12 +15,20 @@ limitations under the License.
 
 #include "xla/translate/hlo_to_mhlo/hlo_utils.h"
 
+#include <cstdint>
+#include <cstring>
+#include <vector>
+
 #include "mlir/IR/Builders.h"  // from @llvm-project
 #include "mlir/IR/BuiltinTypes.h"  // from @llvm-project
 #include "mlir/IR/MLIRContext.h"  // from @llvm-project
 #include "mlir/Support/DebugStringHelper.h"  // from @llvm-project
+#include "xla/literal.h"
+#include "xla/literal_util.h"
 #include "xla/shape_util.h"
 #include "xla/test.h"
+#include "xla/types.h"
+#include "tsl/lib/core/status_test_util.h"
 
 namespace xla {
 namespace {
@@ -61,5 +69,38 @@ TEST(ConvertTensorShapeToType, Simple) {
   }
 }
 
+TEST(LiteralToAttrToXlaFormat, Simple) {
+  mlir::MLIRContext context;
+  context.loadDialect<mlir::mhlo::MhloDialect>();
+  mlir::Builder builder(&context);
+
+  // int16
+  {
+    Literal x = LiteralUtil::CreateR2<int16_t>({{0, 1, 2}, {3, 4, 5}});
+    TF_ASSERT_OK_AND_ASSIGN(mlir::DenseElementsAttr attr,
+                            CreateDenseElementsAttrFromLiteral(x, builder));
+
+    std::vector<uint8_t> data;
+    TF_ASSERT_OK(CopyDenseElementsDataToXlaFormat(attr, &data));
+    for (int i = 0; i < 6; i++) {
+      int16_t x;
+      memcpy(&x, &data[i * 2], 2);
+      EXPECT_EQ(x, i);
+    }
+  }
+
+  // int4
+  {
+    Literal x = LiteralUtil::CreateR2<s4>(
+        {{s4(0), s4(1), s4(2)}, {s4(3), s4(4), s4(5)}});
+    TF_ASSERT_OK_AND_ASSIGN(mlir::DenseElementsAttr attr,
+                            CreateDenseElementsAttrFromLiteral(x, builder));
+
+    std::vector<uint8_t> data;
+    TF_ASSERT_OK(CopyDenseElementsDataToXlaFormat(attr, &data));
+    EXPECT_EQ(data, std::vector<uint8_t>({0x01, 0x23, 0x45}));
+  }
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/third_party/xla/xla/util.cc b/third_party/xla/xla/util.cc
index 5d1eae734c213c..739afaf131f6ed 100644
--- a/third_party/xla/xla/util.cc
+++ b/third_party/xla/xla/util.cc
@@ -498,4 +498,17 @@ std::pair<float, float> SplitF64ToF32(double x) {
   return std::make_pair(hi, lo);
 }
 
+void PackInt4(absl::Span<const char> input, absl::Span<char> output) {
+  CHECK_EQ(output.size(), CeilOfRatio(input.size(), size_t{2}));
+  for (size_t i = 0; i < input.size(); ++i) {
+    // Mask out the high-order 4 bits in case they have extraneous data.
+    char val = input[i] & 0xf;
+    if (i % 2 == 0) {
+      output[i / 2] = val << 4;
+    } else {
+      output[i / 2] |= val;
+    }
+  }
+}
+
 }  // namespace xla
diff --git a/third_party/xla/xla/util.h b/third_party/xla/xla/util.h
index 731db9967ef7d3..f54e57f0e37616 100644
--- a/third_party/xla/xla/util.h
+++ b/third_party/xla/xla/util.h
@@ -716,6 +716,12 @@ Status EraseElementFromVector(std::vector<T>* container, const T& value) {
 // range that is available in F32s (out of a total of 11 exponent bits in F64s).
 std::pair<float, float> SplitF64ToF32(double x);
 
+// Takes a sequence of unpacked int4 values, such that every byte stores one
+// int4 value, and packs them so every byte stores two int4 values. 'input'
+// should have num_elements bytes; 'output' should have (num_elements+1)/2
+// bytes.
+void PackInt4(absl::Span<const char> input, absl::Span<char> output);
+
 class HloInstruction;
 class HloModule;
 

From 832ec6f6edf3814382e6d6433ef3a3b02c12e41d Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 2 Oct 2023 18:48:45 -0700
Subject: [PATCH 506/567] Fix to restore from SC checkpoints

PiperOrigin-RevId: 570241756
---
 tensorflow/python/tpu/BUILD                   |   1 +
 .../python/tpu/tpu_embedding_for_serving.py   | 134 +++++++++++++++++-
 2 files changed, 129 insertions(+), 6 deletions(-)

diff --git a/tensorflow/python/tpu/BUILD b/tensorflow/python/tpu/BUILD
index 74fac03c20325e..3a40ff4e1b1896 100644
--- a/tensorflow/python/tpu/BUILD
+++ b/tensorflow/python/tpu/BUILD
@@ -832,6 +832,7 @@ pytype_strict_library(
         "//tensorflow/python/ops:sparse_ops",
         "//tensorflow/python/ops:variables",
         "//tensorflow/python/ops/ragged:ragged_tensor",
+        "//tensorflow/python/trackable:base",
         "//tensorflow/python/types:core",
         "//tensorflow/python/util:nest",
         "//tensorflow/python/util:tf_export",
diff --git a/tensorflow/python/tpu/tpu_embedding_for_serving.py b/tensorflow/python/tpu/tpu_embedding_for_serving.py
index 2ce0afd231690d..c35ac0958930ad 100644
--- a/tensorflow/python/tpu/tpu_embedding_for_serving.py
+++ b/tensorflow/python/tpu/tpu_embedding_for_serving.py
@@ -13,8 +13,9 @@
 # limitations under the License.
 # ==============================================================================
 """Mid level API for Serving TPU Embeddings."""
+import functools
+from typing import Any, Dict, Iterable, Optional, Union
 
-from typing import Any, Iterable, Optional, Text, Union, Dict
 from absl import logging
 
 from tensorflow.python.distribute import distribute_lib
@@ -32,6 +33,7 @@
 from tensorflow.python.ops.ragged import ragged_tensor
 from tensorflow.python.tpu import tpu_embedding_base
 from tensorflow.python.tpu import tpu_embedding_v2_utils
+from tensorflow.python.trackable import base as trackable_base
 from tensorflow.python.types import core
 from tensorflow.python.util import nest
 from tensorflow.python.util.tf_export import tf_export
@@ -118,6 +120,9 @@ def __init__(
     """
     super(TPUEmbeddingForServing, self).__init__(feature_config, optimizer)
     self._strategy = distribute_lib.get_strategy()
+    self._experimental_sparsecore_restore_info = (
+        experimental_sparsecore_restore_info
+    )
     if isinstance(self._strategy,
                   (tpu_strategy.TPUStrategy, tpu_strategy.TPUStrategyV2)):
       raise RuntimeError("Serving on TPU is not yet supported.")
@@ -143,8 +148,123 @@ def _maybe_build(self):
       with ops.init_scope():
         self.build()
 
+  def _unshuffle_from_sc_to_cpu(self, t: tensor.Tensor) -> tensor.Tensor:
+    if self._experimental_sparsecore_restore_info is None:
+      return t
+    if "num_tpu_devices" not in self._experimental_sparsecore_restore_info:
+      raise ValueError(
+          "Missing `num_tpu_devices` in `experimental_sparsecore_restore_info`"
+      )
+    if "num_sc_chips_per_tpu" not in self._experimental_sparsecore_restore_info:
+      raise ValueError(
+          "Missing `num_sc_chips_per_tpu` in"
+          " `experimental_sparsecore_restore_info`"
+      )
+    num_sc_devices = (
+        self._experimental_sparsecore_restore_info["num_tpu_devices"]
+        * self._experimental_sparsecore_restore_info["num_sc_chips_per_tpu"]
+    )
+    old_shape = t.shape
+    # The first dimension of the table must be a multiple of the number of SC
+    # devices. The TPU strategy does this rounding at training time, so we
+    # expect the checkpoint's value to meet this requirement.
+    assert t.shape[0] % num_sc_devices == 0
+    intermediate_tensor = array_ops.reshape(
+        t, (num_sc_devices, t.shape[0] // num_sc_devices, t.shape[1])
+    )
+    intermediate_tensor = array_ops.transpose(intermediate_tensor, (1, 0, 2))
+    return array_ops.reshape(intermediate_tensor, old_shape)
+
+  def _remove_padding_from_sc(
+      self, value_in_checkpoint: tensor.Tensor, variable_shape: tuple[int, int]
+  ) -> tensor.Tensor:
+    checkpoint_value_shape = value_in_checkpoint.shape.as_list()
+    # If the checkpoint shape is at least the size of the variable, we conclude
+    # that the extra rows and cols must be padding.
+    is_init_value_padded = all(
+        [i >= j for i, j in zip(checkpoint_value_shape, variable_shape)]
+    )
+    if not is_init_value_padded:
+      return value_in_checkpoint
+    # checkpoint has padding so we can remove it.
+    begin = [0] * len(checkpoint_value_shape)
+    return array_ops.slice(
+        value_in_checkpoint, begin=begin, size=variable_shape
+    )
+
+  def _create_variables(
+      self, table: tpu_embedding_v2_utils.TableConfig, trainable: bool
+  ) -> Dict[str, tf_variables.Variable]:
+    """Create all variables including table variables and slot variables."""
+    variable_shape = (table.vocabulary_size, table.dim)
+
+    def getter(name, shape, dtype, initializer, trainable):
+      # _add_variable_with_custom_getter clears the shape sometimes, so we
+      # take the global shape from outside the getter.
+      del shape
+      if isinstance(initializer, trackable_base.CheckpointInitialValueCallable):
+        checkpoint_init_value = initializer(variable_shape).wrapped_value
+        restore_uid = initializer.restore_uid
+        unshuffled = self._unshuffle_from_sc_to_cpu(checkpoint_init_value)
+        truncated = self._remove_padding_from_sc(unshuffled, variable_shape)
+        var = tf_variables.Variable(
+            name=name,
+            initial_value=truncated,
+            shape=variable_shape,
+            dtype=dtype,
+            trainable=trainable,
+        )
+        # Maybe initialize the variable
+        var._maybe_initialize_trackable()  # pylint:disable=protected-access
+        # Update the uid for this variable from the checkpoint init value.
+        # This lets the checkpoint deferred restoration code know that this
+        # variable was restored during creation, so there is no need to
+        # restore it from the checkpoint later.
+        if restore_uid is not None:
+          var._update_uid = initializer.restore_uid  # pylint:disable=protected-access
+        return var
+
+      initial_value = functools.partial(
+          initializer, variable_shape, dtype=dtype
+      )
+      return tf_variables.Variable(
+          name=name,
+          initial_value=initial_value,
+          shape=variable_shape,
+          dtype=dtype,
+          trainable=trainable,
+      )
+
+    def variable_creator(name, initializer, shape, trainable=True):
+      # Use add_variable_with_custom_getter here so that we take advantage of
+      # the checkpoint loading to allow restore before the variables get
+      # created which avoids double initialization.
+      return self._add_variable_with_custom_getter(
+          name=name,
+          initializer=initializer,
+          shape=shape,
+          dtype=dtypes.float32,
+          getter=getter,
+          trainable=trainable,
+      )
+
+    parameters = variable_creator(
+        table.name, table.initializer, variable_shape, trainable=trainable
+    )
+
+    def slot_creator(name, initializer):
+      return variable_creator(table.name + "/" + name, initializer, False)
+
+    if table.optimizer is not None:
+      slot_vars = table.optimizer._create_slots(parameters, slot_creator)  # pylint: disable=protected-access
+    else:
+      slot_vars = {}
+    slot_vars["parameters"] = parameters
+    return slot_vars
+
   def _create_variables_and_slots(
-      self) -> Dict[Text, Dict[Text, tf_variables.Variable]]:
+      self,
+  ) -> Dict[str, Dict[str, tf_variables.Variable]]:
     """Create variables for TPU embeddings.
 
     Returns:
@@ -180,10 +300,12 @@ def embedding_lookup(self,
                                 self._feature_config)
 
 
-def _ragged_embedding_lookup_with_reduce(table: tf_variables.Variable,
-                                         ragged: ragged_tensor.RaggedTensor,
-                                         weights: ragged_tensor.RaggedTensor,
-                                         combiner: Text) -> core.Tensor:
+def _ragged_embedding_lookup_with_reduce(
+    table: tf_variables.Variable,
+    ragged: ragged_tensor.RaggedTensor,
+    weights: ragged_tensor.RaggedTensor,
+    combiner: str,
+) -> core.Tensor:
   """Compute a ragged lookup followed by a reduce on axis 1.
 
   Args:

From 929ed5d15494349577ed70b65f289272f53e4673 Mon Sep 17 00:00:00 2001
From: Sandeep Dasgupta <sdasgup@google.com>
Date: Mon, 2 Oct 2023 18:54:21 -0700
Subject: [PATCH 507/567] [StableHLO to MHLO] Update element-type check for
 quantized types for BroadcastInDimOp.

Also adds tests for
BroadcastInDimOp/ReshapeOp/TransposeOp. The element type checks for the last
two ops were updated as part of cl/568657411 and cl/570133154.

PiperOrigin-RevId: 570242706
---
 third_party/xla/xla/mlir_hlo/mhlo/IR/hlo_ops.td          | 2 +-
 third_party/xla/xla/mlir_hlo/tests/Dialect/mhlo/ops.mlir | 8 ++++++++
 2 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/third_party/xla/xla/mlir_hlo/mhlo/IR/hlo_ops.td b/third_party/xla/xla/mlir_hlo/mhlo/IR/hlo_ops.td
index 94d72f5b4c7121..d480e6794e4bbb 100644
--- a/third_party/xla/xla/mlir_hlo/mhlo/IR/hlo_ops.td
+++ b/third_party/xla/xla/mlir_hlo/mhlo/IR/hlo_ops.td
@@ -2085,7 +2085,7 @@ def MHLO_BroadcastOp : MHLO_ShapedInterfaceOp<"broadcast",
 }
 
 def MHLO_BroadcastInDimOp : MHLO_Op<"broadcast_in_dim",
-      [Pure, SameOperandsAndResultElementType]> {
+      [Pure, HLO_CompatibleOperandsAndResultElementType]> {
   let summary = "BroadcastInDim operation";
   let description = [{
     Expands the dimensions and/or rank of an input tensor by duplicating the
diff --git a/third_party/xla/xla/mlir_hlo/tests/Dialect/mhlo/ops.mlir b/third_party/xla/xla/mlir_hlo/tests/Dialect/mhlo/ops.mlir
index 4ba4e4085afae6..983148bff77802 100644
--- a/third_party/xla/xla/mlir_hlo/tests/Dialect/mhlo/ops.mlir
+++ b/third_party/xla/xla/mlir_hlo/tests/Dialect/mhlo/ops.mlir
@@ -6096,6 +6096,14 @@ func.func @quantization_supported_ops(%arg0: tensor<1x2x2x!quant.uniform<i8:f32,
   func.return
 }
 
+func.func @per_axis_quantized_ops(%arg0: tensor<1x2x2x!quant.uniform<i8<-128:127>:f32:2, {0.1:-30, 0.5:-20}>>, %arg1: tensor<1x2x2x!quant.uniform<i8<-128:127>:f32:0, {0.1:-30}>>) {
+  %0 = "stablehlo.broadcast_in_dim"(%arg0) {broadcast_dimensions = dense<[0,1,3]> : tensor<3xi64>} : (tensor<1x2x2x!quant.uniform<i8<-128:127>:f32:2, {0.1:-30, 0.5:-20}>>) -> tensor<1x2x3x2x!quant.uniform<i8<-128:127>:f32:3, {0.1:-30, 0.5:-20}>>
+  %1 = "stablehlo.broadcast_in_dim"(%arg1) {broadcast_dimensions = dense<[0,1,2]> : tensor<3xi64>} : (tensor<1x2x2x!quant.uniform<i8<-128:127>:f32:0, {0.1:-30}>>) -> tensor<2x2x2x!quant.uniform<i8<-128:127>:f32:0, {0.1:-30, 0.1:-30}>>
+  %2 = stablehlo.reshape %arg0 : (tensor<1x2x2x!quant.uniform<i8<-128:127>:f32:2, {0.1:-30, 0.5:-20}>>) -> tensor<2x2x!quant.uniform<i8<-128:127>:f32:1, {0.1:-30, 0.5:-20}>>
+  %3 = "stablehlo.transpose"(%arg0) {permutation = dense<[0,2,1]> : tensor<3xi64>}: (tensor<1x2x2x!quant.uniform<i8<-128:127>:f32:2, {0.1:-30, 0.5:-20}>>) -> tensor<1x2x2x!quant.uniform<i8<-128:127>:f32:1, {0.1:-30, 0.5:-20}>>
+  func.return
+}
+
 // -----
 
 // CHECK-LABEL: scatter_update_scalar

From a965566d7b34a6ba41627a64ca1d1e659c038098 Mon Sep 17 00:00:00 2001
From: Thai Nguyen <thaink@google.com>
Date: Mon, 2 Oct 2023 19:32:30 -0700
Subject: [PATCH 508/567] Add QuantizationUnitLoc and enforce selective
 quantization during lifting

The QuantizationUnitLoc is added right after importing to MLIR, meaning it is the first pass to run.

PiperOrigin-RevId: 570248862
---
 .../mlir/quantization/tensorflow/BUILD        |   4 +
 .../mlir/quantization/tensorflow/cc/BUILD     |  14 ++
 .../tensorflow/cc/quantization_unit_loc.cc    | 104 +++++++++
 .../tensorflow/cc/quantization_unit_loc.h     |  54 +++++
 .../passes/add_quantization_unit_loc.cc       | 202 ++++++++++++++++++
 .../lift_quantizable_spots_as_functions.cc    | 199 ++++++++++++++---
 .../quantization/tensorflow/passes/passes.h   |   7 +-
 .../tensorflow/quantize_passes.cc             |   6 +-
 .../tensorflow/quantize_preprocess.cc         |   5 +
 .../tests/add_quantization_unit_loc.mlir      |  50 +++++
 ..._functions_xla_selective_quantization.mlir | 112 ++++++++++
 .../mlir/quantization/tensorflow/utils/BUILD  |   1 +
 .../utils/lift_as_function_call_utils.cc      |  14 +-
 13 files changed, 732 insertions(+), 40 deletions(-)
 create mode 100644 tensorflow/compiler/mlir/quantization/tensorflow/cc/quantization_unit_loc.cc
 create mode 100644 tensorflow/compiler/mlir/quantization/tensorflow/cc/quantization_unit_loc.h
 create mode 100644 tensorflow/compiler/mlir/quantization/tensorflow/passes/add_quantization_unit_loc.cc
 create mode 100644 tensorflow/compiler/mlir/quantization/tensorflow/tests/add_quantization_unit_loc.mlir
 create mode 100644 tensorflow/compiler/mlir/quantization/tensorflow/tests/lift_quantizable_spots_as_functions_xla_selective_quantization.mlir

diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/BUILD b/tensorflow/compiler/mlir/quantization/tensorflow/BUILD
index 256b106980ef6f..88d17db612cedc 100644
--- a/tensorflow/compiler/mlir/quantization/tensorflow/BUILD
+++ b/tensorflow/compiler/mlir/quantization/tensorflow/BUILD
@@ -351,6 +351,7 @@ cc_library(
     name = "passes",
     srcs = [
         "passes/add_dump_tensor_op.cc",
+        "passes/add_quantization_unit_loc.cc",
         "passes/cast_bf16_ops_to_f32.cc",
         "passes/cast_bf16_ops_to_f32.inc",
         "passes/convert_custom_aggregation_op_to_quant_stats.cc",
@@ -426,6 +427,7 @@ cc_library(
         "//tensorflow/compiler/mlir/lite/quantization/ir:QuantOps",
         "//tensorflow/compiler/mlir/quantization/tensorflow/cc:const_op_size",
         "//tensorflow/compiler/mlir/quantization/tensorflow/cc:constant_fold",
+        "//tensorflow/compiler/mlir/quantization/tensorflow/cc:quantization_unit_loc",
         "//tensorflow/compiler/mlir/quantization/tensorflow/cc:run_passes",
         "//tensorflow/compiler/mlir/quantization/tensorflow/ops:tf_op_quant_spec",
         "//tensorflow/compiler/mlir/quantization/tensorflow/ops:tf_quantize_op",
@@ -462,6 +464,7 @@ cc_library(
         "@com_google_absl//absl/status",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/strings:str_format",
+        "@com_googlesource_code_re2//:re2",
         "@llvm-project//llvm:Support",
         "@llvm-project//mlir:ArithDialect",
         "@llvm-project//mlir:FuncDialect",
@@ -513,6 +516,7 @@ cc_library(
         "@llvm-project//mlir:FuncDialect",
         "@llvm-project//mlir:IR",
         "@llvm-project//mlir:Pass",
+        "@llvm-project//mlir:Support",
         "@llvm-project//mlir:Transforms",
         "@local_xla//xla/mlir_hlo:all_passes",
         "@local_xla//xla/mlir_hlo:hlo_dialect_registration",
diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/cc/BUILD b/tensorflow/compiler/mlir/quantization/tensorflow/cc/BUILD
index a77b77527c8545..635f71ec59fb6e 100644
--- a/tensorflow/compiler/mlir/quantization/tensorflow/cc/BUILD
+++ b/tensorflow/compiler/mlir/quantization/tensorflow/cc/BUILD
@@ -199,3 +199,17 @@ tf_cc_test(
         "@llvm-project//mlir:Transforms",
     ],
 )
+
+cc_library(
+    name = "quantization_unit_loc",
+    srcs = ["quantization_unit_loc.cc"],
+    hdrs = ["quantization_unit_loc.h"],
+    compatible_with = get_compatible_with_portable(),
+    deps = [
+        "//tensorflow/compiler/mlir/quantization/tensorflow:quantization_options_proto_cc",
+        "@com_google_absl//absl/strings",
+        "@llvm-project//llvm:Support",
+        "@llvm-project//mlir:IR",
+        "@llvm-project//mlir:Support",
+    ],
+)
diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/cc/quantization_unit_loc.cc b/tensorflow/compiler/mlir/quantization/tensorflow/cc/quantization_unit_loc.cc
new file mode 100644
index 00000000000000..7be369e7947ced
--- /dev/null
+++ b/tensorflow/compiler/mlir/quantization/tensorflow/cc/quantization_unit_loc.cc
@@ -0,0 +1,104 @@
+/* Copyright 2023 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/compiler/mlir/quantization/tensorflow/cc/quantization_unit_loc.h"
+
+#include <cstddef>
+#include <optional>
+#include <string>
+#include <string_view>
+
+#include "absl/strings/str_cat.h"
+#include "llvm/Support/Casting.h"
+#include "mlir/IR/Attributes.h"  // from @llvm-project
+#include "mlir/IR/BuiltinAttributes.h"  // from @llvm-project
+#include "mlir/IR/Location.h"  // from @llvm-project
+#include "mlir/IR/MLIRContext.h"  // from @llvm-project
+#include "mlir/Support/LLVM.h"  // from @llvm-project
+
+namespace mlir {
+namespace quant {
+namespace {
+
+// Prefix and suffix to the QuantizationUnit string representation.
+constexpr std::string_view kQuantizationUnitPrefix = "QuantizationUnit(";
+constexpr std::string_view kQuantizationUnitSuffix = ")";
+
+// Concatenates node name and func name with a "@" separator.
+std::string ConcatNodeAndFuncName(std::string_view node_name,
+                                  std::string_view func_name) {
+  return absl::StrCat(node_name, "@", func_name);
+}
+
+// Generate a string to represent the QuantizationUnit.
+std::string GenerateQuantizationUnitString(
+    const QuantizationUnitLoc::QuantizationUnit& unit) {
+  return absl::StrCat(kQuantizationUnitPrefix, unit.SerializeAsString(),
+                      kQuantizationUnitSuffix);
+}
+
+}  // namespace
+
+QuantizationUnitLoc::QuantizationUnitLoc(MLIRContext* context,
+                                         const QuantizationUnit& unit)
+    : CallSiteLoc(CallSiteLoc::get(
+          /*callee=*/NameLoc::get(
+              StringAttr::get(context, ConcatNodeAndFuncName(unit.node_name(),
+                                                             unit.func_name())),
+              /*childLoc=*/NameLoc::get(
+                  StringAttr::get(context, unit.op_type()))),
+          /*caller=*/NameLoc::get(StringAttr::get(
+              context, GenerateQuantizationUnitString(unit))))) {}
+
+bool QuantizationUnitLoc::classof(Attribute attr) {
+  if (!llvm::isa<CallSiteLoc>(attr)) return false;
+  auto callsite_loc = llvm::dyn_cast<CallSiteLoc>(attr);
+
+  if (!callsite_loc.getCaller().isa<NameLoc>()) return false;
+  StringRef caller_name =
+      callsite_loc.getCaller().cast<NameLoc>().getName().strref();
+  return caller_name.starts_with(kQuantizationUnitPrefix) &&
+         caller_name.ends_with(kQuantizationUnitSuffix);
+}
+
+std::optional<QuantizationUnitLoc::QuantizationUnit>
+FindQuantizationUnitFromLoc(Location loc) {
+  if (isa<QuantizationUnitLoc>(loc)) {
+    Location caller = loc.cast<CallSiteLoc>().getCaller();
+    StringRef caller_name = caller.cast<NameLoc>().getName().strref();
+    const size_t start_index = kQuantizationUnitPrefix.size();
+    const size_t end_index = caller_name.rfind(kQuantizationUnitSuffix);
+    std::string serialized_proto =
+        caller_name.substr(start_index, end_index - start_index).str();
+    QuantizationUnitLoc::QuantizationUnit quant_unit;
+    if (quant_unit.ParseFromString(serialized_proto)) {
+      return quant_unit;
+    }
+  } else if (isa<FusedLoc>(loc)) {
+    // If the op is rewritten, FusedLoc can be created.
+    for (Location child_loc : loc.cast<FusedLoc>().getLocations()) {
+      std::optional<QuantizationUnitLoc::QuantizationUnit> found_unit =
+          FindQuantizationUnitFromLoc(child_loc);
+      if (found_unit.has_value()) return found_unit;
+    }
+  } else if (isa<CallSiteLoc>(loc)) {
+    // If the graph is inlined, CallSiteLoc can be created.
+    return FindQuantizationUnitFromLoc(loc.cast<CallSiteLoc>().getCallee());
+  }
+
+  return std::nullopt;
+}
+
+}  // namespace quant
+}  // namespace mlir
diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/cc/quantization_unit_loc.h b/tensorflow/compiler/mlir/quantization/tensorflow/cc/quantization_unit_loc.h
new file mode 100644
index 00000000000000..32fb6f892c0504
--- /dev/null
+++ b/tensorflow/compiler/mlir/quantization/tensorflow/cc/quantization_unit_loc.h
@@ -0,0 +1,54 @@
+/* Copyright 2023 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_COMPILER_MLIR_QUANTIZATION_TENSORFLOW_CC_QUANTIZATION_UNIT_LOC_H_
+#define TENSORFLOW_COMPILER_MLIR_QUANTIZATION_TENSORFLOW_CC_QUANTIZATION_UNIT_LOC_H_
+
+#include <optional>
+
+#include "mlir/IR/Attributes.h"  // from @llvm-project
+#include "mlir/IR/Location.h"  // from @llvm-project
+#include "mlir/IR/MLIRContext.h"  // from @llvm-project
+#include "mlir/Support/TypeID.h"  // from @llvm-project
+#include "tensorflow/compiler/mlir/quantization/tensorflow/quantization_options.pb.h"
+
+namespace mlir {
+namespace quant {
+
+// QuantizationUnitLoc uses CallSiteLoc as the base class so it can be printed
+// with AsmPrinter and used to set the node name in MLIR to GraphDef exporter.
+// The callee is named as `node_name@func_name` with child loc named as
+// `op_type` while the caller is the quantization unit.
+class QuantizationUnitLoc : public CallSiteLoc {
+ public:
+  using QuantizationUnit =
+      tensorflow::quantization::UnitWiseQuantizationSpec::QuantizationUnit;
+
+  MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(QuantizationUnitLoc)
+
+  QuantizationUnitLoc(MLIRContext* context, const QuantizationUnit& unit);
+
+  // Checks if the given location is QuantizationUnitLoc. Users could call
+  // `isa<QuantizationUnitLoc>(loc)` to check if the type matches.
+  static bool classof(Attribute attr);
+};
+
+// Finds the QuantizationUnit from location info.
+std::optional<QuantizationUnitLoc::QuantizationUnit>
+FindQuantizationUnitFromLoc(Location loc);
+
+}  // namespace quant
+}  // namespace mlir
+
+#endif  // TENSORFLOW_COMPILER_MLIR_QUANTIZATION_TENSORFLOW_CC_QUANTIZATION_UNIT_LOC_H_
diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/passes/add_quantization_unit_loc.cc b/tensorflow/compiler/mlir/quantization/tensorflow/passes/add_quantization_unit_loc.cc
new file mode 100644
index 00000000000000..d390ac6d548e78
--- /dev/null
+++ b/tensorflow/compiler/mlir/quantization/tensorflow/passes/add_quantization_unit_loc.cc
@@ -0,0 +1,202 @@
+/* Copyright 2023 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <memory>
+#include <optional>
+#include <string>
+#include <utility>
+
+#include "absl/algorithm/container.h"
+#include "absl/strings/match.h"
+#include "mlir/IR/BuiltinAttributes.h"  // from @llvm-project
+#include "mlir/IR/Location.h"  // from @llvm-project
+#include "mlir/IR/MLIRContext.h"  // from @llvm-project
+#include "mlir/IR/PatternMatch.h"  // from @llvm-project
+#include "mlir/Pass/Pass.h"  // from @llvm-project
+#include "mlir/Pass/PassRegistry.h"  // from @llvm-project
+#include "mlir/Support/LLVM.h"  // from @llvm-project
+#include "mlir/Support/LogicalResult.h"  // from @llvm-project
+#include "mlir/Support/TypeID.h"  // from @llvm-project
+#include "mlir/Transforms/GreedyPatternRewriteDriver.h"  // from @llvm-project
+#include "tensorflow/compiler/mlir/lite/transforms/passes.h"
+#include "tensorflow/compiler/mlir/quantization/tensorflow/cc/quantization_unit_loc.h"
+#include "tensorflow/compiler/mlir/quantization/tensorflow/ops/tf_op_quant_spec.h"
+#include "tensorflow/compiler/mlir/quantization/tensorflow/passes/passes.h"
+#include "tensorflow/compiler/mlir/quantization/tensorflow/quantization_options.pb.h"
+
+namespace mlir {
+namespace quant {
+namespace {
+
+using QuantizationUnit =
+    tensorflow::quantization::UnitWiseQuantizationSpec::QuantizationUnit;
+
+// Function-level pass that attaches a QuantizationUnitLoc to quantizable ops.
+class AddQuantizationUnitLocPass
+    : public PassWrapper<AddQuantizationUnitLocPass,
+                         OperationPass<func::FuncOp>> {
+ public:
+  MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(AddQuantizationUnitLocPass)
+  explicit AddQuantizationUnitLocPass() = default;
+
+  StringRef getArgument() const final {
+    // The name used to refer to this pass in the textual pass pipeline,
+    // e.g. `-quant-add-quantization-unit-loc` on the command line.
+    return "quant-add-quantization-unit-loc";
+  }
+  StringRef getDescription() const final {
+    return "Add QuantizationUnitLoc to quantizable layers.";
+  }
+
+ private:
+  void runOnOperation() override;
+};
+
+// Returns true if `loc` looks like a location produced when importing a TF
+// graph node, i.e. it follows one of these patterns:
+//   FusedLoc[NameLoc(op_type:), ..., NameLoc(node_name@func_name)] or
+//   FusedLoc[NameLoc(op_type:), ..., CallSiteLoc(node_name@func_name)]. See
+// tensorflow/compiler/mlir/tensorflow/translate/import_model.cc for details.
+bool IsImportLocPattern(FusedLoc loc) {
+  ArrayRef<Location> fused_locs = loc.getLocations();
+  if (fused_locs.size() < 2) return false;
+  // The leading location must be a NameLoc whose name ends with ":".
+  auto op_type_loc = dyn_cast<NameLoc>(fused_locs.front());
+  if (!op_type_loc) return false;
+  if (!op_type_loc.getName().strref().ends_with(":")) return false;
+
+  return absl::c_all_of(fused_locs, [](Location child) {
+    if (isa<NameLoc>(child)) return true;
+    auto call_site = dyn_cast<CallSiteLoc>(child);
+    return call_site && isa<NameLoc>(call_site.getCallee());
+  });
+}
+
+// Collects into `units` the QuantizationUnits described by `loc`. Only
+// FusedLocs are inspected: one matching the pattern created by
+// `ImporterBase::GetLocation` in
+// `tensorflow/compiler/mlir/tensorflow/translate/import_model.cc` yields a
+// unit, while any other FusedLoc is searched through its child locations.
+void FindQuantizationUnitsRecursively(Location loc,
+                                      SmallVector<QuantizationUnit>& units) {
+  auto fused_loc = dyn_cast<FusedLoc>(loc);
+  if (!fused_loc) return;
+
+  ArrayRef<Location> children = fused_loc.getLocations();
+  if (!IsImportLocPattern(fused_loc)) {
+    // Not an import location itself: look into the fused children instead.
+    for (Location child : children) {
+      FindQuantizationUnitsRecursively(child, units);
+    }
+    return;
+  }
+
+  QuantizationUnit unit;
+
+  // The first location is a NameLoc spelled `op_type:`; drop the ":" suffix.
+  StringRef op_type_id = cast<NameLoc>(children.front()).getName().strref();
+  unit.set_op_type(op_type_id.drop_back().str());
+
+  // The last location is either a NameLoc or a CallSiteLoc whose callee is a
+  // NameLoc; either way the name is `node_name` or `node_name@func_name`.
+  Location name_loc = children.back();
+  if (auto call_site = dyn_cast<CallSiteLoc>(name_loc)) {
+    name_loc = call_site.getCallee();
+  }
+  StringRef name_loc_id = cast<NameLoc>(name_loc).getName().strref();
+
+  // Only a name containing "@" provides the function name.
+  if (name_loc_id.contains("@")) {
+    unit.set_node_name(name_loc_id.split('@').first.str());
+    unit.set_func_name(name_loc_id.split('@').second.str());
+  } else {
+    unit.set_node_name(name_loc_id.str());
+  }
+  units.push_back(unit);
+}
+
+// Returns the QuantizationUnit described by the location of `op`, or
+// std::nullopt when no suitable unit is found.
+std::optional<QuantizationUnit> FindQuantizationUnit(Operation* op) {
+  SmallVector<QuantizationUnit> candidates;
+  FindQuantizationUnitsRecursively(op->getLoc(), candidates);
+
+  // A single candidate is accepted without comparing op types.
+  if (candidates.size() == 1) return candidates.front();
+
+  // Otherwise pick the first candidate whose op type is a case-insensitive
+  // substring of the name of `op`.
+  const std::string op_name = op->getName().getStringRef().lower();
+  for (const QuantizationUnit& candidate : candidates) {
+    const std::string unit_op_type = StringRef(candidate.op_type()).lower();
+    if (absl::StrContains(op_name, unit_op_type)) return candidate;
+  }
+
+  return std::nullopt;
+}
+
+// Rewrite pattern that replaces the location of a quantizable op with a
+// QuantizationUnitLoc built from the op's import location.
+class AddQuantizationUnitLoc : public RewritePattern {
+ public:
+  explicit AddQuantizationUnitLoc(MLIRContext* context)
+      : RewritePattern(MatchAnyOpTypeTag(), /*benefit=*/1, context) {}
+
+ private:
+  LogicalResult matchAndRewrite(Operation* op,
+                                PatternRewriter& rewriter) const override {
+    if (!IsOpWithQuantizableTrait(op)) return failure();
+    // Skip ops whose location already carries a quantization unit.
+    if (FindQuantizationUnitFromLoc(op->getLoc()).has_value()) return failure();
+
+    std::optional<QuantizationUnit> unit = FindQuantizationUnit(op);
+    if (!unit.has_value()) return failure();
+
+    // Use the enclosing function when the location gave no function name.
+    if (unit->func_name().empty()) {
+      auto parent_func = op->getParentOfType<func::FuncOp>();
+      unit->set_func_name(parent_func.getSymNameAttr().str());
+    }
+
+    QuantizationUnitLoc unit_loc(getContext(), *unit);
+    op->setLoc(unit_loc);
+    return success();
+  }
+};
+
+// Applies the AddQuantizationUnitLoc pattern greedily to the current function.
+void AddQuantizationUnitLocPass::runOnOperation() {
+  func::FuncOp func = getOperation();
+  RewritePatternSet patterns(&getContext());
+  patterns.add<AddQuantizationUnitLoc>(&getContext());
+  if (succeeded(applyPatternsAndFoldGreedily(func, std::move(patterns)))) {
+    return;
+  }
+  func.emitError() << "quant-add-quantization-unit-loc pattern "
+                      "conversion did not converge.";
+  signalPassFailure();
+}
+
+}  // namespace
+
+// Factory for `AddQuantizationUnitLocPass`; the caller owns the returned pass.
+std::unique_ptr<OperationPass<func::FuncOp>>
+CreateAddQuantizationUnitLocPass() {
+  return std::make_unique<AddQuantizationUnitLocPass>();
+}
+
+static PassRegistration<AddQuantizationUnitLocPass> pass;
+
+}  // namespace quant
+}  // namespace mlir
diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/passes/lift_quantizable_spots_as_functions.cc b/tensorflow/compiler/mlir/quantization/tensorflow/passes/lift_quantizable_spots_as_functions.cc
index 147fbf0bce5ed5..38eabad77a9052 100644
--- a/tensorflow/compiler/mlir/quantization/tensorflow/passes/lift_quantizable_spots_as_functions.cc
+++ b/tensorflow/compiler/mlir/quantization/tensorflow/passes/lift_quantizable_spots_as_functions.cc
@@ -12,36 +12,54 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include <algorithm>
 #include <memory>
+#include <optional>
 #include <string>
 #include <utility>
 
 #include "absl/status/status.h"
 #include "llvm/ADT/StringRef.h"
+#include "llvm/Support/CommandLine.h"
 #include "mlir/Dialect/Func/IR/FuncOps.h"  // from @llvm-project
-#include "mlir/IR/Attributes.h"  // from @llvm-project
 #include "mlir/IR/Builders.h"  // from @llvm-project
 #include "mlir/IR/BuiltinAttributes.h"  // from @llvm-project
 #include "mlir/IR/BuiltinOps.h"  // from @llvm-project
-#include "mlir/IR/BuiltinTypes.h"  // from @llvm-project
 #include "mlir/IR/MLIRContext.h"  // from @llvm-project
+#include "mlir/IR/OpDefinition.h"  // from @llvm-project
 #include "mlir/IR/PatternMatch.h"  // from @llvm-project
+#include "mlir/IR/SymbolTable.h"  // from @llvm-project
 #include "mlir/Pass/Pass.h"  // from @llvm-project
+#include "mlir/Pass/PassRegistry.h"  // from @llvm-project
+#include "mlir/Rewrite/FrozenRewritePatternSet.h"  // from @llvm-project
 #include "mlir/Support/LLVM.h"  // from @llvm-project
 #include "mlir/Support/LogicalResult.h"  // from @llvm-project
+#include "mlir/Support/TypeID.h"  // from @llvm-project
 #include "mlir/Transforms/GreedyPatternRewriteDriver.h"  // from @llvm-project
+#include "re2/re2.h"
 #include "tensorflow/compiler/mlir/lite/quantization/ir/QuantOps.h"
+#include "tensorflow/compiler/mlir/lite/quantization/quantization_utils.h"
+#include "tensorflow/compiler/mlir/quantization/tensorflow/cc/quantization_unit_loc.h"
 #include "tensorflow/compiler/mlir/quantization/tensorflow/ops/tf_op_quant_spec.h"
 #include "tensorflow/compiler/mlir/quantization/tensorflow/passes/passes.h"
 #include "tensorflow/compiler/mlir/quantization/tensorflow/passes/utils.h"
+#include "tensorflow/compiler/mlir/quantization/tensorflow/quantization_options.pb.h"
 #include "tensorflow/compiler/mlir/quantization/tensorflow/utils/lift_as_function_call_utils.h"
 #include "tensorflow/compiler/mlir/tensorflow/ir/tf_dialect.h"
+#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h"
 
 namespace mlir {
 namespace quant {
 namespace {
 
+using QuantizationOptions = tensorflow::quantization::QuantizationOptions;
+using QuantizationMethod = tensorflow::quantization::QuantizationMethod;
+using QuantizationComponentSpec =
+    tensorflow::quantization::QuantizationComponentSpec;
+using UnitWiseQuantizationSpec =
+    tensorflow::quantization::UnitWiseQuantizationSpec;
+using QuantizationUnit =
+    tensorflow::quantization::UnitWiseQuantizationSpec::QuantizationUnit;
+
 class LiftQuantizableSpotsAsFunctionsPass
     : public PassWrapper<LiftQuantizableSpotsAsFunctionsPass,
                          OperationPass<ModuleOp>> {
@@ -49,18 +67,20 @@ class LiftQuantizableSpotsAsFunctionsPass
   MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(
       LiftQuantizableSpotsAsFunctionsPass)
 
-  LiftQuantizableSpotsAsFunctionsPass() = default;
-
-  explicit LiftQuantizableSpotsAsFunctionsPass(OpSet op_set,
-                                               bool enable_two_input_tensors) {
-    op_set_ = op_set;
-    enable_two_input_tensors_ = enable_two_input_tensors;
+  LiftQuantizableSpotsAsFunctionsPass() : test_mode_(true) {
+    initializeForTest();
   }
 
+  explicit LiftQuantizableSpotsAsFunctionsPass(
+      const QuantizationOptions& quant_options)
+      : quant_options_(quant_options), test_mode_(false) {}
+
   LiftQuantizableSpotsAsFunctionsPass(
       const LiftQuantizableSpotsAsFunctionsPass& other) {
+    quant_options_ = other.quant_options_;
+    test_mode_ = other.test_mode_;
     op_set_ = other.op_set_;
-    enable_two_input_tensors_ = other.enable_two_input_tensors_;
+    initializeForTest();
   }
 
   StringRef getArgument() const final {
@@ -82,6 +102,8 @@ class LiftQuantizableSpotsAsFunctionsPass
   void runOnOperation() override;
 
  private:
+  QuantizationOptions quant_options_;
+  bool test_mode_;
   Option<OpSet> op_set_{
       *this, "target-opset", llvm::cl::init(OpSet::TF),
       llvm::cl::desc("Choose target opset."),
@@ -92,17 +114,61 @@ class LiftQuantizableSpotsAsFunctionsPass
           clEnumValN(OpSet::UNIFORM_QUANTIZED, "UNIFORM_QUANTIZED",
                      "Uses TF Uniform Quantized ops"))};
 
-  bool enable_two_input_tensors_{false};
+  // Fills `quant_options_` with defaults for tests; no-op unless `test_mode_`.
+  void initializeForTest() {
+    if (!test_mode_) return;
+
+    op_set_.setCallback([this](const OpSet& new_op_set) {
+      quant_options_.set_op_set(new_op_set);
+    });
+
+    // Default an unspecified preset method to static-range INT8.
+    if (quant_options_.quantization_method().preset_method() ==
+        QuantizationMethod::METHOD_UNSPECIFIED) {
+      quant_options_.mutable_quantization_method()->set_preset_method(
+          QuantizationMethod::METHOD_STATIC_RANGE_INT8);
+    }
+
+    if (quant_options_.quantization_method()
+            .quantization_component_specs()
+            .empty()) {
+      auto add_new_spec =
+          [this](QuantizationComponentSpec::QuantizationComponent component,
+                 QuantizationComponentSpec::TensorType type) {
+            QuantizationComponentSpec* new_spec =
+                quant_options_.mutable_quantization_method()
+                    ->add_quantization_component_specs();
+            new_spec->set_quantization_component(component);
+            new_spec->set_tensor_type(type);
+          };
+
+      add_new_spec(QuantizationComponentSpec::COMPONENT_ACTIVATION,
+                   QuantizationComponentSpec::TENSORTYPE_INT_8);
+      add_new_spec(QuantizationComponentSpec::COMPONENT_WEIGHT,
+                   QuantizationComponentSpec::TENSORTYPE_INT_8);
+      add_new_spec(QuantizationComponentSpec::COMPONENT_BIAS,
+                   QuantizationComponentSpec::TENSORTYPE_INT_32);
+    }
+
+    if (quant_options_.unit_wise_quantization_specs().empty()) {
+      // Opt out the node named `test_opt_out` by using METHOD_NO_QUANTIZE.
+      UnitWiseQuantizationSpec* new_spec =
+          quant_options_.add_unit_wise_quantization_specs();
+      QuantizationUnit* new_unit = new_spec->add_unit();
+      new_unit->set_node_name("test_opt_out");
+      new_spec->mutable_quantization_method()->set_preset_method(
+          QuantizationMethod::METHOD_NO_QUANTIZE);
+    }
+  }
 };
 
 class CheckQuantizableOps
     : public mlir::OpRewritePattern<TF::PartitionedCallOp> {
  public:
-  explicit CheckQuantizableOps(MLIRContext* context, OpSet op_set,
-                               bool enable_two_input_tensors)
+  explicit CheckQuantizableOps(MLIRContext* context,
+                               const QuantizationOptions& quant_options)
       : OpRewritePattern<TF::PartitionedCallOp>(context),
-        op_set_(op_set),
-        enable_two_input_tensors_(enable_two_input_tensors) {}
+        quant_options_(quant_options) {}
 
  private:
   LogicalResult matchAndRewrite(TF::PartitionedCallOp call_op,
@@ -117,14 +183,13 @@ class CheckQuantizableOps
     absl::Status check_status;
     // TODO(b/270906404): Support weight-only gather for uniform quantized opset
     // in PTQ mode
-    if (op_set_ == OpSet::UNIFORM_QUANTIZED &&
+    if (quant_options_.op_set() == OpSet::UNIFORM_QUANTIZED &&
         function_name.contains("gather")) {
       check_status.Update(absl::InternalError("Weight-only op is skipped."));
     }
 
-    if (op_set_ == OpSet::XLA) {
-      check_status.Update(checkQuantizableOpsForXla(call_op, function_name,
-                                                    enable_two_input_tensors_));
+    if (quant_options_.op_set() == OpSet::XLA) {
+      check_status.Update(checkQuantizableOpsForXla(call_op, function_name));
     }
 
     // Only the composite functions with f32 inputs are quantizable.
@@ -145,9 +210,47 @@ class CheckQuantizableOps
     return success();
   }
 
+  // Looks up the unit-wise quantization method for this composite function.
+  // Returns the method of the first spec containing a unit that matches the
+  // quantization unit found in the location of `call_op`, which is meant to
+  // override the default method. Returns std::nullopt when the location holds
+  // no quantization unit or when no spec matches. Empty fields of a spec unit
+  // are ignored.
+  std::optional<QuantizationMethod> getUnitWiseQuantizationMethod(
+      TF::PartitionedCallOp call_op) const {
+    const std::optional<QuantizationUnit> target =
+        FindQuantizationUnitFromLoc(call_op.getLoc());
+    if (!target.has_value()) return std::nullopt;
+
+    // An empty pattern is a wildcard; otherwise the whole text must match.
+    auto full_match = [](const std::string& text, const std::string& pattern) {
+      return pattern.empty() || RE2::FullMatch(text, RE2(pattern));
+    };
+
+    // Checks every non-empty field of `unit`: the op type must be equal, while
+    // the node and function names are matched as regular expressions.
+    auto matches_target = [&](const QuantizationUnit& unit) {
+      if (!unit.op_type().empty() && unit.op_type() != target->op_type()) {
+        return false;
+      }
+      return full_match(target->node_name(), unit.node_name()) &&
+             full_match(target->func_name(), unit.func_name());
+    };
+
+    for (const UnitWiseQuantizationSpec& spec :
+         quant_options_.unit_wise_quantization_specs()) {
+      for (const QuantizationUnit& unit : spec.unit()) {
+        // The first matching unit decides; its spec overrides the default.
+        if (matches_target(unit)) return spec.quantization_method();
+      }
+    }
+
+    // No unit-wise spec applies to this call.
+    return std::nullopt;
+  }
+
   absl::Status checkQuantizableOpsForXla(TF::PartitionedCallOp call_op,
-                                         StringRef function_name,
-                                         bool enable_two_input_tensors) const {
+                                         StringRef function_name) const {
     // Disable quantization for the DepthwiseConv since it has no benefits in
     // the XLA opset.
     if (function_name.contains("depthwise_conv2d")) {
@@ -168,14 +271,39 @@ class CheckQuantizableOps
             "The channel dimension of Conv3D is required to be static.");
       }
     } else if (function_name.contains("batch_matmul")) {
-      // For BatchMatMul, the input must be ranked.
-      auto shaped_type =
+      // For BatchMatMul, the input must be ranked to determine the batch
+      // dimensions.
+      ShapedType shaped_type =
           call_op->getOperand(0).getType().dyn_cast<ShapedType>();
       if (!shaped_type || !shaped_type.hasRank()) {
         return absl::InternalError("The input of BatchMatMul must have rank.");
       }
     }
 
+    // Disable quantization if the quantization method is NO_QUANTIZE.
+    QuantizationMethod quantization_method =
+        quant_options_.quantization_method();
+    if (quantization_method.quantization_component_specs().empty()) {
+      return absl::InternalError(
+          "The quantization method has been set to METHOD_NO_QUANTIZE.");
+    }
+
+    // The unit-wise quantization config should override the coarser-grained
+    // quantization config, such as `enable_two_input_tensors`.
+    bool is_unitwise_quantization_enabled = false;
+    std::optional<QuantizationMethod> unit_wise_quantization_method =
+        getUnitWiseQuantizationMethod(call_op);
+    if (unit_wise_quantization_method.has_value()) {
+      if (unit_wise_quantization_method.value()
+              .quantization_component_specs()
+              .empty()) {
+        return absl::InternalError(
+            "The unit-wise quantization method has been set to "
+            "METHOD_NO_QUANTIZE.");
+      }
+      is_unitwise_quantization_enabled = true;
+    }
+
     std::unique_ptr<OpQuantSpec> spec = GetTFOpQuantSpec(call_op);
     for (auto iter : spec->coeff_op_quant_dim) {
       Operation* preceding_op = call_op.getOperand(iter.first).getDefiningOp();
@@ -195,21 +323,28 @@ class CheckQuantizableOps
       }
 
       if (!is_weight_constant) {
-        if (!enable_two_input_tensors || (!function_name.contains("matmul") &&
-                                          !function_name.contains("einsum"))) {
+        if (!function_name.contains("matmul") &&
+            !function_name.contains("einsum")) {
           return absl::InternalError(
               "Non-constant weights are not supported at the moment,"
               " except matmul and einsum.");
+        } else if (!quant_options_.enable_two_input_tensors() &&
+                   !is_unitwise_quantization_enabled) {
+          return absl::InternalError(
+              "Quantization is disabled for this op due to the non-constant "
+              "weight. You can enable it by setting `enable_two_input_tensors` "
+              "to true or using unit-wise quantization config.");
         }
       }
     }
+
     return absl::OkStatus();
   }
 
   void removeAttrMapAttribute(TF::PartitionedCallOp call_op,
                               StringRef function_name,
                               StringRef error_message) const {
-    auto module = call_op->getParentOfType<ModuleOp>();
+    ModuleOp module = call_op->getParentOfType<ModuleOp>();
     SymbolTable symbol_table(module);
     mlir::func::FuncOp composite_func =
         dyn_cast<func::FuncOp>(symbol_table.lookup(function_name));
@@ -231,8 +366,7 @@ class CheckQuantizableOps
     });
   }
 
-  OpSet op_set_;
-  bool enable_two_input_tensors_;
+  const QuantizationOptions& quant_options_;
 };
 
 static PassRegistration<LiftQuantizableSpotsAsFunctionsPass> pass;
@@ -245,7 +379,7 @@ void LiftQuantizableSpotsAsFunctionsPass::runOnOperation() {
   ModuleOp module = getOperation();
 
   populateWithGenerated(patterns);
-  patterns.add<CheckQuantizableOps>(ctx, op_set_, enable_two_input_tensors_);
+  patterns.add<CheckQuantizableOps>(ctx, quant_options_);
   FrozenRewritePatternSet frozen_patterns(std::move(patterns));
   for (auto func : module.getOps<func::FuncOp>()) {
     if (failed(applyPatternsAndFoldGreedily(func, frozen_patterns))) {
@@ -258,10 +392,9 @@ void LiftQuantizableSpotsAsFunctionsPass::runOnOperation() {
 }  // namespace
 
 std::unique_ptr<OperationPass<ModuleOp>>
-CreateLiftQuantizableSpotsAsFunctionsPass(OpSet target_opset,
-                                          bool enable_two_input_tensors) {
-  return std::make_unique<LiftQuantizableSpotsAsFunctionsPass>(
-      target_opset, enable_two_input_tensors);
+CreateLiftQuantizableSpotsAsFunctionsPass(
+    const QuantizationOptions& quant_options) {
+  return std::make_unique<LiftQuantizableSpotsAsFunctionsPass>(quant_options);
 }
 
 }  // namespace quant
diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/passes/passes.h b/tensorflow/compiler/mlir/quantization/tensorflow/passes/passes.h
index 28fb1a0634d016..7deed9306fcf88 100644
--- a/tensorflow/compiler/mlir/quantization/tensorflow/passes/passes.h
+++ b/tensorflow/compiler/mlir/quantization/tensorflow/passes/passes.h
@@ -42,8 +42,8 @@ std::unique_ptr<OperationPass<func::FuncOp>> CreateConvertFakeQuantToQdqPass();
 
 // Lifts the quantizable spots as composite functions.
 std::unique_ptr<OperationPass<ModuleOp>>
-CreateLiftQuantizableSpotsAsFunctionsPass(OpSet target_opset,
-                                          bool enable_two_input_tensors);
+CreateLiftQuantizableSpotsAsFunctionsPass(
+    const tensorflow::quantization::QuantizationOptions& quant_options);
 
 // Apply graph optimizations such as fusing and constant folding to prepare
 // lifting.
@@ -241,6 +241,9 @@ std::unique_ptr<OperationPass<ModuleOp>> CreateAddDumpTensorOpPass(
     tensorflow::quantization::DebuggerOptions::DebuggerType debugger_type,
     std::string log_dir_path);
 
+// Creates a pass that adds QuantizationUnitLoc to quantizable layers.
+std::unique_ptr<OperationPass<func::FuncOp>> CreateAddQuantizationUnitLocPass();
+
 }  // namespace quant
 }  // namespace mlir
 
diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/quantize_passes.cc b/tensorflow/compiler/mlir/quantization/tensorflow/quantize_passes.cc
index 7d988f497474b3..8c1768e7de63a2 100644
--- a/tensorflow/compiler/mlir/quantization/tensorflow/quantize_passes.cc
+++ b/tensorflow/compiler/mlir/quantization/tensorflow/quantize_passes.cc
@@ -88,8 +88,7 @@ void AddQuantizeQatPasses(
       mlir::quant::CreatePrepareLiftingPass(quantization_options.op_set()));
 
   pm.addPass(mlir::quant::CreateLiftQuantizableSpotsAsFunctionsPass(
-      quantization_options.op_set(),
-      quantization_options.enable_two_input_tensors()));
+      quantization_options));
   pm.addPass(mlir::quant::CreateInsertQuantizedFunctionsPass(
       quantization_options.quantization_method().preset_method(),
       quantization_options.op_set()));
@@ -177,8 +176,7 @@ void AddQuantizePtqPreCalibrationPasses(
   pm.addNestedPass<mlir::func::FuncOp>(
       mlir::quant::CreatePrepareLiftingPass(quantization_options.op_set()));
   pm.addPass(mlir::quant::CreateLiftQuantizableSpotsAsFunctionsPass(
-      quantization_options.op_set(),
-      quantization_options.enable_two_input_tensors()));
+      quantization_options));
   // TODO: b/295140328 - Add debugger support for weight only
   if (quantization_options.has_debugger_options()) {
     pm.addPass(mlir::quant::CreateAddDumpTensorOpPass(
diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/quantize_preprocess.cc b/tensorflow/compiler/mlir/quantization/tensorflow/quantize_preprocess.cc
index c766a3c87ca75e..bc96e59407615e 100644
--- a/tensorflow/compiler/mlir/quantization/tensorflow/quantize_preprocess.cc
+++ b/tensorflow/compiler/mlir/quantization/tensorflow/quantize_preprocess.cc
@@ -20,11 +20,13 @@ limitations under the License.
 
 #include "absl/container/flat_hash_set.h"
 #include "absl/status/status.h"
+#include "absl/strings/str_cat.h"
 #include "absl/strings/string_view.h"
 #include "mlir/Dialect/Func/IR/FuncOps.h"  // from @llvm-project
 #include "mlir/IR/BuiltinOps.h"  // from @llvm-project
 #include "mlir/IR/MLIRContext.h"  // from @llvm-project
 #include "mlir/Pass/PassManager.h"  // from @llvm-project
+#include "mlir/Support/LogicalResult.h"  // from @llvm-project
 #include "mlir/Transforms/Passes.h"  // from @llvm-project
 #include "tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_tf_xla_call_module_to_stablehlo_pass.h"
 #include "tensorflow/compiler/mlir/lite/stablehlo/transforms/rename_entrypoint_to_main.h"
@@ -122,6 +124,9 @@ absl::Status PreprocessAndFreezeGraph(
   mlir::TF::CreateTFStandardPipeline(pm_before_freezing_variables,
                                      standard_pipeline_options);
 
+  // The AddQuantizationUnitLocPass should be added before any other passes.
+  pm_before_freezing_variables.addNestedPass<mlir::func::FuncOp>(
+      mlir::quant::CreateAddQuantizationUnitLocPass());
   pm_before_freezing_variables.addNestedPass<mlir::func::FuncOp>(
       mlir::TFDevice::CreateDecomposeResourceOpsPass());
 
diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/tests/add_quantization_unit_loc.mlir b/tensorflow/compiler/mlir/quantization/tensorflow/tests/add_quantization_unit_loc.mlir
new file mode 100644
index 00000000000000..b5088768bc39b1
--- /dev/null
+++ b/tensorflow/compiler/mlir/quantization/tensorflow/tests/add_quantization_unit_loc.mlir
@@ -0,0 +1,50 @@
+// RUN: tf-quant-opt %s -mlir-print-debuginfo -mlir-print-local-scope -quant-add-quantization-unit-loc | FileCheck %s
+
+func.func @conv2d_unmatching_loc_pattern(%arg0: tensor<1x3x4x3xf32>) -> (tensor<1x3x2x2xf32>) {
+  %cst = "tf.Const"() {device = "", value = dense_resource<__elided__> : tensor<2x3x3x2xbf16>} : () -> tensor<2x3x3x2xbf16>
+  %0 = "tf.Cast"(%arg0) {Truncate = false, device = ""} : (tensor<1x3x4x3xf32>) -> tensor<1x3x4x3xbf16>
+  %1 = "tf.Conv2D"(%0, %cst) {data_format = "NHWC", dilations = [1, 1, 1, 1], padding = "SAME", strides = [1, 1, 2, 1]}
+       : (tensor<1x3x4x3xbf16>, tensor<2x3x3x2xbf16>) -> tensor<1x3x2x2xbf16> loc("Model/conv2d")
+  %2 = "tf.Cast"(%1) {Truncate = false} : (tensor<1x3x2x2xbf16>) -> tensor<1x3x2x2xf32>
+  %3 = "tf.IdentityN"(%2) {device = ""} : (tensor<1x3x2x2xf32>) -> tensor<1x3x2x2xf32>
+  return %3 : tensor<1x3x2x2xf32>
+// CHECK: tf.Conv2D
+// CHECK-SAME: loc("Model/conv2d")
+}
+
+func.func @conv2d_with_valid_loc(%arg0: tensor<1x3x4x3xf32>) -> (tensor<1x3x2x2xf32>) {
+  %cst = "tf.Const"() {device = "", value = dense_resource<__elided__> : tensor<2x3x3x2xbf16>} : () -> tensor<2x3x3x2xbf16>
+  %0 = "tf.Cast"(%arg0) {Truncate = false, device = ""} : (tensor<1x3x4x3xf32>) -> tensor<1x3x4x3xbf16>
+  %1 = "tf.Conv2D"(%0, %cst) {data_format = "NHWC", dilations = [1, 1, 1, 1], padding = "SAME", strides = [1, 1, 2, 1]}
+       : (tensor<1x3x4x3xbf16>, tensor<2x3x3x2xbf16>) -> tensor<1x3x2x2xbf16> loc(fused["Conv2D:", "Model/conv2d"])
+  %2 = "tf.Cast"(%1) {Truncate = false} : (tensor<1x3x2x2xbf16>) -> tensor<1x3x2x2xf32>
+  %3 = "tf.IdentityN"(%2) {device = ""} : (tensor<1x3x2x2xf32>) -> tensor<1x3x2x2xf32>
+  return %3 : tensor<1x3x2x2xf32>
+// CHECK: tf.Conv2D
+// CHECK-SAME: loc(callsite("Model/conv2d@conv2d_with_valid_loc"("Conv2D") at "QuantizationUnit({{.*}})"))
+}
+
+func.func @conv2d_with_callsite_loc(%arg0: tensor<1x3x4x3xf32>) -> (tensor<1x3x2x2xf32>) {
+  %cst = "tf.Const"() {device = "", value = dense_resource<__elided__> : tensor<2x3x3x2xbf16>} : () -> tensor<2x3x3x2xbf16>
+  %0 = "tf.Cast"(%arg0) {Truncate = false, device = ""} : (tensor<1x3x4x3xf32>) -> tensor<1x3x4x3xbf16>
+  %1 = "tf.Conv2D"(%0, %cst) {data_format = "NHWC", dilations = [1, 1, 1, 1], padding = "SAME", strides = [1, 1, 2, 1]}
+       : (tensor<1x3x4x3xbf16>, tensor<2x3x3x2xbf16>) -> tensor<1x3x2x2xbf16> loc(fused["Conv2D:", callsite("Model/conv2d" at "model.py":10:8)])
+  %2 = "tf.Cast"(%1) {Truncate = false} : (tensor<1x3x2x2xbf16>) -> tensor<1x3x2x2xf32>
+  %3 = "tf.IdentityN"(%2) {device = ""} : (tensor<1x3x2x2xf32>) -> tensor<1x3x2x2xf32>
+  return %3 : tensor<1x3x2x2xf32>
+// CHECK: tf.Conv2D
+// CHECK-SAME: loc(callsite("Model/conv2d@conv2d_with_callsite_loc"("Conv2D") at "QuantizationUnit({{.*}})"))
+}
+
+func.func @conv2d_with_func_name(%arg0: tensor<1x3x4x3xf32>) -> (tensor<1x3x2x2xf32>) {
+  %cst = "tf.Const"() {device = "", value = dense_resource<__elided__> : tensor<2x3x3x2xbf16>} : () -> tensor<2x3x3x2xbf16>
+  %0 = "tf.Cast"(%arg0) {Truncate = false, device = ""} : (tensor<1x3x4x3xf32>) -> tensor<1x3x4x3xbf16>
+  %1 = "tf.Conv2D"(%0, %cst) {data_format = "NHWC", dilations = [1, 1, 1, 1], padding = "SAME", strides = [1, 1, 2, 1]}
+       : (tensor<1x3x4x3xbf16>, tensor<2x3x3x2xbf16>) -> tensor<1x3x2x2xbf16> loc(fused["Conv2D:", "Model/conv2d@original_func"])
+  %2 = "tf.Cast"(%1) {Truncate = false} : (tensor<1x3x2x2xbf16>) -> tensor<1x3x2x2xf32>
+  %3 = "tf.IdentityN"(%2) {device = ""} : (tensor<1x3x2x2xf32>) -> tensor<1x3x2x2xf32>
+  return %3 : tensor<1x3x2x2xf32>
+// CHECK: tf.Conv2D
+// CHECK-SAME: loc(callsite("Model/conv2d@original_func"("Conv2D") at "QuantizationUnit({{.*}})"))
+}
+
diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/tests/lift_quantizable_spots_as_functions_xla_selective_quantization.mlir b/tensorflow/compiler/mlir/quantization/tensorflow/tests/lift_quantizable_spots_as_functions_xla_selective_quantization.mlir
new file mode 100644
index 00000000000000..8dfd2815692fc6
--- /dev/null
+++ b/tensorflow/compiler/mlir/quantization/tensorflow/tests/lift_quantizable_spots_as_functions_xla_selective_quantization.mlir
@@ -0,0 +1,112 @@
+// RUN: tf-quant-opt %s -split-input-file -mlir-print-debuginfo -mlir-print-local-scope -quant-add-quantization-unit-loc -inline -quant-prepare-lifting -quant-lift-quantizable-spots-as-functions='target-opset=XLA' | FileCheck %s
+
+// This file tests the selective quantization feature in TF Quantizer. In the test
+// config, the op named "test_opt_out" will not be quantized.
+
+module attributes {tf.versions = {bad_consumers = [], min_consumer = 12 : i32, producer = 1269 : i32}} {
+  func.func @conv2d_unmatching_unit(%arg0: tensor<1x3x4x3xf32>) -> (tensor<1x3x2x2xf32>) {
+    %cst = "tf.Const"() {device = "", value = dense<1.0> : tensor<2x3x3x2xf32>} : () -> tensor<2x3x3x2xf32>
+    %0 = "tf.Cast"(%arg0) {Truncate = false, device = ""} : (tensor<1x3x4x3xf32>) -> tensor<1x3x4x3xf32>
+    %1 = "tf.Conv2D"(%0, %cst) {data_format = "NHWC", dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "SAME", strides = [1, 1, 2, 1], use_cudnn_on_gpu = true}
+        : (tensor<1x3x4x3xf32>, tensor<2x3x3x2xf32>) -> tensor<1x3x2x2xf32> loc(fused["Conv2D:", "Model/conv2d"])
+    %2 = "tf.IdentityN"(%1) {device = ""} : (tensor<1x3x2x2xf32>) -> tensor<1x3x2x2xf32>
+    return %2 : tensor<1x3x2x2xf32>
+  }
+}
+
+// CHECK-LABEL: func @conv2d_unmatching_unit
+// CHECK: "tf.PartitionedCall"
+// Check that the `_tfl_quant_trait` attribute exists since the unit is not in `unit_wise_quantization_specs`.
+// CHECK-SAME: _tfl_quant_trait = "fully_quantizable"
+// CHECK-SAME: f = @composite_conv2d_fn_1
+// CHECK-SAME: loc(callsite("Model/conv2d@conv2d_unmatching_unit"("Conv2D") at "QuantizationUnit({{.*}})"))
+
+// -----
+
+module attributes {tf.versions = {bad_consumers = [], min_consumer = 12 : i32, producer = 1269 : i32}} {
+  func.func @conv2d_disable_quantization(%arg0: tensor<1x3x4x3xf32>) -> (tensor<1x3x2x2xf32>) {
+    %cst = "tf.Const"() {device = "", value = dense<1.0> : tensor<2x3x3x2xf32>} : () -> tensor<2x3x3x2xf32>
+    %0 = "tf.Cast"(%arg0) {Truncate = false, device = ""} : (tensor<1x3x4x3xf32>) -> tensor<1x3x4x3xf32>
+    %1 = "tf.Conv2D"(%0, %cst) {data_format = "NHWC", dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "SAME", strides = [1, 1, 2, 1], use_cudnn_on_gpu = true}
+        : (tensor<1x3x4x3xf32>, tensor<2x3x3x2xf32>) -> tensor<1x3x2x2xf32> loc(fused["Conv2D:", "test_opt_out"])
+    %2 = "tf.IdentityN"(%1) {device = ""} : (tensor<1x3x2x2xf32>) -> tensor<1x3x2x2xf32>
+    return %2 : tensor<1x3x2x2xf32>
+  }
+}
+
+// CHECK-LABEL: func @conv2d_disable_quantization
+// CHECK: "tf.PartitionedCall"
+// Check that quantization is disabled for this unit.
+// CHECK-NOT: _tfl_quant_trait = "fully_quantizable"
+// CHECK-SAME: f = @composite_conv2d_fn_1
+// CHECK-SAME: loc(callsite("test_opt_out@conv2d_disable_quantization"("Conv2D") at "QuantizationUnit({{.*}})"))
+
+// -----
+
+module attributes {tf.versions = {bad_consumers = [], min_consumer = 12 : i32, producer = 1269 : i32}} {
+  func.func @conv2d_with_bias_disable_quantization(%arg0: tensor<1x3x4x3xf32>) -> (tensor<1x3x2x2xf32>) {
+    %cst = "tf.Const"() {device = "", value = dense<1.0> : tensor<2x3x3x2xf32>} : () -> tensor<2x3x3x2xf32>
+    %cst_0 = "tf.Const"() {value = dense<0.000000e+00> : tensor<2xf32>} : () -> tensor<2xf32>
+    %0 = "tf.Cast"(%arg0) {Truncate = false, device = ""} : (tensor<1x3x4x3xf32>) -> tensor<1x3x4x3xf32>
+    %1 = "tf.Conv2D"(%0, %cst) {data_format = "NHWC", dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "SAME", strides = [1, 1, 2, 1], use_cudnn_on_gpu = true}
+        : (tensor<1x3x4x3xf32>, tensor<2x3x3x2xf32>) -> tensor<1x3x2x2xf32> loc(fused["Conv2D:", "test_opt_out"])
+    %2 = "tf.BiasAdd"(%1, %cst_0) : (tensor<1x3x2x2xf32>, tensor<2xf32>) -> tensor<1x3x2x2xf32> loc(fused["BiasAdd:", "model/bias_add"])
+    %3 = "tf.IdentityN"(%2) {device = ""} : (tensor<1x3x2x2xf32>) -> tensor<1x3x2x2xf32>
+    return %3 : tensor<1x3x2x2xf32>
+  }
+}
+
+// CHECK-LABEL: func @conv2d_with_bias_disable_quantization
+// CHECK: "tf.PartitionedCall"
+// Check that quantization is disabled for this unit.
+// CHECK-NOT: _tfl_quant_trait = "fully_quantizable"
+// CHECK-SAME: f = @composite_conv2d_with_bias_fn_1
+// CHECK-SAME: loc(callsite("test_opt_out@conv2d_with_bias_disable_quantization"("Conv2D") at "QuantizationUnit({{.*}})"))
+
+// -----
+
+module attributes {tf.versions = {bad_consumers = [], min_consumer = 12 : i32, producer = 1269 : i32}} {
+  func.func @matmul_with_reshape_disable_quantization(%arg0: tensor<1x10xf32>, %arg1: tensor<10x10xf32>) -> (tensor<?x10xf32>) {
+    %cst = "tf.Const"() {value = dense<0.000000e+00> : tensor<10xf32>} : () -> tensor<10xf32>
+    %cst_0 = "tf.Const"() {value = dense<[-1, 10]> : tensor<2xi32>} : () -> tensor<2xi32>
+    %1 = "tf.MatMul"(%arg0, %arg1) {
+      transpose_a = false, transpose_b = false
+    } : (tensor<1x10xf32>, tensor<10x10xf32>) -> tensor<1x10xf32>  loc(fused["MatMul:", "test_opt_out"])
+    %2 = "tf.Reshape"(%1, %cst_0) : (tensor<1x10xf32>, tensor<2xi32>) -> tensor<?x10xf32> loc(fused["Reshape:", "model/reshape"])
+    %3 = "tf.AddV2"(%2, %cst) : (tensor<?x10xf32>, tensor<10xf32>) -> tensor<?x10xf32> loc(fused["AddV2:", "model/add"])
+    func.return %3 : tensor<?x10xf32>
+  }
+}
+
+// CHECK-LABEL: func @matmul_with_reshape_disable_quantization
+// CHECK: "tf.PartitionedCall"
+// Check that quantization is disabled for this unit.
+// CHECK-NOT: _tfl_quant_trait = "fully_quantizable"
+// CHECK-SAME: f = @composite_matmul_with_reshape_and_bias_fn_1
+// CHECK-SAME: loc(callsite("test_opt_out@matmul_with_reshape_disable_quantization"("MatMul") at "QuantizationUnit({{.*}})"))
+
+// -----
+
+module attributes {tf.versions = {bad_consumers = [], min_consumer = 12 : i32, producer = 1269 : i32}} {
+  func.func private @conv2d_with_inliner(%arg0: tensor<1x3x4x3xf32>) -> (tensor<1x3x2x2xf32>) {
+    %cst = "tf.Const"() {device = "", value = dense<1.0> : tensor<2x3x3x2xf32>} : () -> tensor<2x3x3x2xf32>
+    %0 = "tf.Cast"(%arg0) {Truncate = false, device = ""} : (tensor<1x3x4x3xf32>) -> tensor<1x3x4x3xf32>
+    %1 = "tf.Conv2D"(%0, %cst) {data_format = "NHWC", dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "SAME", strides = [1, 1, 2, 1], use_cudnn_on_gpu = true}
+        : (tensor<1x3x4x3xf32>, tensor<2x3x3x2xf32>) -> tensor<1x3x2x2xf32> loc(fused["Conv2D:", "test_opt_out"])
+    %2 = "tf.IdentityN"(%1) {device = ""} : (tensor<1x3x2x2xf32>) -> tensor<1x3x2x2xf32>
+    return %2 : tensor<1x3x2x2xf32>
+  }
+
+  func.func @serving_default(%arg0: tensor<1x3x4x3xf32>) -> (tensor<1x3x2x2xf32>) {
+    %0 = "tf.PartitionedCall"(%arg0) {config = "", config_proto = "", executor_type = "", f = @conv2d_with_inliner}
+        : (tensor<1x3x4x3xf32>) -> tensor<1x3x2x2xf32>
+    return %0 : tensor<1x3x2x2xf32>
+  }
+
+// CHECK-LABEL: func @serving_default
+// CHECK: "tf.PartitionedCall"
+// Check that quantization is disabled for this unit.
+// CHECK-NOT: _tfl_quant_trait = "fully_quantizable"
+// CHECK-SAME: f = @composite_conv2d_fn_1
+// CHECK-SAME: loc(callsite("test_opt_out@conv2d_with_inliner"("Conv2D") at "QuantizationUnit({{.*}})"))
+}
diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/utils/BUILD b/tensorflow/compiler/mlir/quantization/tensorflow/utils/BUILD
index 05bb19ffdf252a..e3b67f59a5e829 100644
--- a/tensorflow/compiler/mlir/quantization/tensorflow/utils/BUILD
+++ b/tensorflow/compiler/mlir/quantization/tensorflow/utils/BUILD
@@ -53,6 +53,7 @@ cc_library(
     deps = [
         "//tensorflow/compiler/mlir/lite/quantization:quantization_lib",
         "//tensorflow/compiler/mlir/quantization/tensorflow:pass_utils",
+        "//tensorflow/compiler/mlir/quantization/tensorflow/cc:quantization_unit_loc",
         "//tensorflow/compiler/mlir/tensorflow",
         "//tensorflow/compiler/mlir/tensorflow:tensorflow_ops",
         "//tensorflow/compiler/mlir/tensorflow:xla_call_module_attrs",
diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/utils/lift_as_function_call_utils.cc b/tensorflow/compiler/mlir/quantization/tensorflow/utils/lift_as_function_call_utils.cc
index 7be0a1539c6966..37d9b56ac1a7de 100644
--- a/tensorflow/compiler/mlir/quantization/tensorflow/utils/lift_as_function_call_utils.cc
+++ b/tensorflow/compiler/mlir/quantization/tensorflow/utils/lift_as_function_call_utils.cc
@@ -16,6 +16,7 @@ limitations under the License.
 
 #include <algorithm>
 #include <cstdint>
+#include <optional>
 #include <queue>
 #include <stack>
 #include <string>
@@ -35,6 +36,7 @@ limitations under the License.
 #include "mlir/IR/ValueRange.h"  // from @llvm-project
 #include "mlir/Support/LLVM.h"  // from @llvm-project
 #include "tensorflow/compiler/mlir/lite/quantization/quantization_utils.h"
+#include "tensorflow/compiler/mlir/quantization/tensorflow/cc/quantization_unit_loc.h"
 #include "tensorflow/compiler/mlir/quantization/tensorflow/passes/utils.h"
 #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h"
 #include "tensorflow/compiler/mlir/tensorflow/utils/xla_call_module_attrs.h"
@@ -294,6 +296,16 @@ llvm::SmallVector<Value, 4> LiftAsFunctionCall(
   }
 
   auto cloning_ops = FindOpsFromArgumentsToResults(arguments, results);
+  // Set the location of call op to QuantizationUnitLoc if found.
+  Location call_op_loc = location;
+  for (Operation *op : cloning_ops) {
+    std::optional<QuantizationUnitLoc::QuantizationUnit> unit =
+        FindQuantizationUnitFromLoc(op->getLoc());
+    if (unit.has_value()) {
+      call_op_loc = QuantizationUnitLoc(builder.getContext(), unit.value());
+    }
+  }
+
   if (failed(SetAttributeMap(context, attributes, cloning_ops))) {
     current_func.emitError() << "Some attributes couldn't be found.";
   }
@@ -312,7 +324,7 @@ llvm::SmallVector<Value, 4> LiftAsFunctionCall(
       InsertToSymbolTable(module, wrap_func, func_name.str());
   builder.setInsertionPointAfter(result_op);
   ValueRange new_results =
-      createFunctionCallOp(builder, location, call_op_type,
+      createFunctionCallOp(builder, call_op_loc, call_op_type,
                            new_func_name.getValue(), result_types, arguments);
   return llvm::SmallVector<Value, 4>(new_results.begin(), new_results.end());
 }

From fa46c8f848649e769c307d0cd4d68f397800e398 Mon Sep 17 00:00:00 2001
From: Thai Nguyen <thaink@google.com>
Date: Mon, 2 Oct 2023 20:24:06 -0700
Subject: [PATCH 509/567] Enable NO_QUANTIZE and selective quantization in TF
 Quantizer

PiperOrigin-RevId: 570257817
---
 .../integration_test/quantize_model_test.py   | 322 +++++++++++++++++-
 .../tensorflow/python/quantize_model.py       |  92 +++--
 2 files changed, 388 insertions(+), 26 deletions(-)

diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/python/integration_test/quantize_model_test.py b/tensorflow/compiler/mlir/quantization/tensorflow/python/integration_test/quantize_model_test.py
index 0b8c58690f912b..f69bb978ad1167 100644
--- a/tensorflow/compiler/mlir/quantization/tensorflow/python/integration_test/quantize_model_test.py
+++ b/tensorflow/compiler/mlir/quantization/tensorflow/python/integration_test/quantize_model_test.py
@@ -286,7 +286,9 @@ def test_predefined_method_component_spec(self):
             preset_method=_PresetMethod.METHOD_STATIC_RANGE_INT8
         )
     )
-    quantize_model._populate_quantization_component_spec(options)
+    quantize_model._populate_quantization_component_spec(
+        options.quantization_method
+    )
 
     # Quantize activation, weight and bias for static range quantization.
     self.assertLen(options.quantization_method.quantization_component_specs, 3)
@@ -307,7 +309,9 @@ def test_invalid_spec_raise_value_error(self):
 
     with self.assertRaises(ValueError):
       # Activation 4bit is not a valid configuration.
-      quantize_model._populate_quantization_component_spec(options)
+      quantize_model._populate_quantization_component_spec(
+          options.quantization_method
+      )
 
   def test_invalid_method_raises_value_error(self):
     model = self.SimpleModel()
@@ -6378,7 +6382,9 @@ def test_default_calibration_options(
         ),
         calibration_options=calibration_options,
     )
-    quantize_model._populate_quantization_component_spec(quant_opts)
+    quantize_model._populate_quantization_component_spec(
+        quant_opts.quantization_method
+    )
     quantize_model._populate_calibration_options(quant_opts)
 
     self.assertEqual(
@@ -6571,5 +6577,315 @@ def get_mean_square_error(x, y):
     self.assertLess(average_min_max_mse, min_max_mse)
 
 
+class SelectiveQuantizationTest(quantize_model_test_base.QuantizedModelTest):
+  """Test cases regarding selective quantization."""
+
+  def test_unitwise_spec_with_no_units(self):
+    _, y_shape, bias_shape, x_signature, y_signature = (
+        self._prepare_sample_einsum_datashapes('abc,acd->abd')
+    )
+    model = self._create_einsum_model(
+        'abc,acd->abd',
+        y_shape,
+        x_signature,
+        y_signature,
+        bias_shape,
+        is_qat_model=True,
+    )
+    saved_model_save.save(
+        model, self._input_saved_model_path, signatures=model.einsum_with_kernel
+    )
+
+    signature_key = signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY
+    tags = {tag_constants.SERVING}
+    quantization_options = quant_opts_pb2.QuantizationOptions(
+        quantization_method=quant_opts_pb2.QuantizationMethod(
+            preset_method=_PresetMethod.METHOD_STATIC_RANGE_INT8
+        ),
+        tags=tags,
+        signature_keys=[signature_key],
+        op_set=quant_opts_pb2.XLA,
+        unit_wise_quantization_specs=[
+            quant_opts_pb2.UnitWiseQuantizationSpec(
+                unit=[],
+                quantization_method=quant_opts_pb2.QuantizationMethod(
+                    preset_method=quant_opts_pb2.QuantizationMethod.METHOD_NO_QUANTIZE
+                ),
+            )
+        ],
+    )
+
+    with self.assertRaisesRegex(
+        ValueError, 'UnitWiseQuantizationSpec must contain at least one unit.'
+    ):
+      quantize_model.quantize(
+          self._input_saved_model_path,
+          self._output_saved_model_path,
+          quantization_options,
+      )
+
+  def test_unitwise_spec_missing_unit_info(self):
+    _, y_shape, bias_shape, x_signature, y_signature = (
+        self._prepare_sample_einsum_datashapes('abc,acd->abd')
+    )
+    model = self._create_einsum_model(
+        'abc,acd->abd',
+        y_shape,
+        x_signature,
+        y_signature,
+        bias_shape,
+        is_qat_model=True,
+    )
+    saved_model_save.save(
+        model, self._input_saved_model_path, signatures=model.einsum_with_kernel
+    )
+
+    signature_key = signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY
+    tags = {tag_constants.SERVING}
+    quantization_options = quant_opts_pb2.QuantizationOptions(
+        quantization_method=quant_opts_pb2.QuantizationMethod(
+            preset_method=_PresetMethod.METHOD_STATIC_RANGE_INT8
+        ),
+        tags=tags,
+        signature_keys=[signature_key],
+        op_set=quant_opts_pb2.XLA,
+        unit_wise_quantization_specs=[
+            quant_opts_pb2.UnitWiseQuantizationSpec(
+                unit=[
+                    quant_opts_pb2.UnitWiseQuantizationSpec.QuantizationUnit(),
+                ],
+                quantization_method=quant_opts_pb2.QuantizationMethod(
+                    preset_method=quant_opts_pb2.QuantizationMethod.METHOD_NO_QUANTIZE
+                ),
+            )
+        ],
+    )
+
+    with self.assertRaisesRegex(
+        ValueError, 'Either `op_type` or `node_name` must be specified.'
+    ):
+      quantize_model.quantize(
+          self._input_saved_model_path,
+          self._output_saved_model_path,
+          quantization_options,
+      )
+
+  def test_unitwise_spec_unsupported_method(self):
+    _, y_shape, bias_shape, x_signature, y_signature = (
+        self._prepare_sample_einsum_datashapes('abc,acd->abd')
+    )
+    model = self._create_einsum_model(
+        'abc,acd->abd',
+        y_shape,
+        x_signature,
+        y_signature,
+        bias_shape,
+        is_qat_model=True,
+    )
+    saved_model_save.save(
+        model, self._input_saved_model_path, signatures=model.einsum_with_kernel
+    )
+
+    signature_key = signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY
+    tags = {tag_constants.SERVING}
+    quantization_options = quant_opts_pb2.QuantizationOptions(
+        quantization_method=quant_opts_pb2.QuantizationMethod(
+            preset_method=_PresetMethod.METHOD_STATIC_RANGE_INT8
+        ),
+        tags=tags,
+        signature_keys=[signature_key],
+        op_set=quant_opts_pb2.XLA,
+        unit_wise_quantization_specs=[
+            quant_opts_pb2.UnitWiseQuantizationSpec(
+                unit=[
+                    quant_opts_pb2.UnitWiseQuantizationSpec.QuantizationUnit(
+                        op_type='Conv2D',
+                    ),
+                ],
+                quantization_method=quant_opts_pb2.QuantizationMethod(
+                    preset_method=quant_opts_pb2.QuantizationMethod.METHOD_STATIC_RANGE_WEIGHT_ONLY_INT8
+                ),
+            )
+        ],
+    )
+
+    with self.assertRaisesRegex(
+        ValueError,
+        'Currently unit-wise quantization spec only supports NO_QUANTIZE and'
+        ' same quantization method as the top-level',
+    ):
+      quantize_model.quantize(
+          self._input_saved_model_path,
+          self._output_saved_model_path,
+          quantization_options,
+      )
+
+  def test_selective_quantization_qat(self):
+    _, y_shape, bias_shape, x_signature, y_signature = (
+        self._prepare_sample_einsum_datashapes('abc,acd->abd')
+    )
+    model = self._create_einsum_model(
+        'abc,acd->abd',
+        y_shape,
+        x_signature,
+        y_signature,
+        bias_shape,
+        is_qat_model=True,
+    )
+    saved_model_save.save(
+        model, self._input_saved_model_path, signatures=model.einsum_with_kernel
+    )
+
+    signature_key = signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY
+    tags = {tag_constants.SERVING}
+    quantization_options = quant_opts_pb2.QuantizationOptions(
+        quantization_method=quant_opts_pb2.QuantizationMethod(
+            preset_method=_PresetMethod.METHOD_STATIC_RANGE_INT8
+        ),
+        tags=tags,
+        signature_keys=[signature_key],
+        op_set=quant_opts_pb2.XLA,
+        unit_wise_quantization_specs=[
+            quant_opts_pb2.UnitWiseQuantizationSpec(
+                unit=[
+                    quant_opts_pb2.UnitWiseQuantizationSpec.QuantizationUnit(
+                        op_type='Einsum',
+                    ),
+                ],
+                quantization_method=quant_opts_pb2.QuantizationMethod(
+                    preset_method=quant_opts_pb2.QuantizationMethod.METHOD_NO_QUANTIZE
+                ),
+            )
+        ],
+    )
+
+    converted_model = quantize_model.quantize(
+        self._input_saved_model_path,
+        self._output_saved_model_path,
+        quantization_options,
+    )
+    self.assertIsNotNone(converted_model)
+    self.assertCountEqual(
+        converted_model.signatures._signatures.keys(), {signature_key}
+    )
+    loader = saved_model_loader.SavedModelLoader(self._output_saved_model_path)
+    graphdef = loader.get_meta_graph_def_from_tags(tags).graph_def
+    # The Einsum ops shouldn't be quantized.
+    self.assertTrue(self._contains_op(graphdef, 'Einsum'))
+    self.assertFalse(self._contains_op(graphdef, 'XlaDotV2'))
+
+  def test_selective_quantization_ptq(self):
+    x_shape, y_shape, bias_shape, x_signature, y_signature = (
+        self._prepare_sample_einsum_datashapes('abc,acd->abd')
+    )
+    model = self._create_einsum_model(
+        'abc,acd->abd',
+        y_shape,
+        x_signature,
+        y_signature,
+        bias_shape,
+        is_qat_model=False,
+    )
+    saved_model_save.save(
+        model, self._input_saved_model_path, signatures=model.einsum_with_kernel
+    )
+
+    signature_key = signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY
+    tags = {tag_constants.SERVING}
+    quantization_options = quant_opts_pb2.QuantizationOptions(
+        quantization_method=quant_opts_pb2.QuantizationMethod(
+            preset_method=_PresetMethod.METHOD_STATIC_RANGE_INT8
+        ),
+        tags=tags,
+        signature_keys=[signature_key],
+        op_set=quant_opts_pb2.XLA,
+        unit_wise_quantization_specs=[
+            quant_opts_pb2.UnitWiseQuantizationSpec(
+                unit=[
+                    quant_opts_pb2.UnitWiseQuantizationSpec.QuantizationUnit(
+                        op_type='Einsum',
+                    ),
+                ],
+                quantization_method=quant_opts_pb2.QuantizationMethod(
+                    preset_method=quant_opts_pb2.QuantizationMethod.METHOD_NO_QUANTIZE
+                ),
+            )
+        ],
+    )
+
+    rng = np.random.default_rng(seed=1234)
+
+    def data_gen() -> repr_dataset.RepresentativeDataset:
+      for _ in range(10):
+        yield {
+            'x': rng.uniform(low=0.0, high=1.0, size=x_shape).astype(np.float32)
+        }
+
+    converted_model = quantize_model.quantize(
+        self._input_saved_model_path,
+        self._output_saved_model_path,
+        quantization_options,
+        representative_dataset=data_gen(),
+    )
+    self.assertIsNotNone(converted_model)
+    self.assertCountEqual(
+        converted_model.signatures._signatures.keys(), {signature_key}
+    )
+    loader = saved_model_loader.SavedModelLoader(self._output_saved_model_path)
+    graphdef = loader.get_meta_graph_def_from_tags(tags).graph_def
+    # The Einsum ops shouldn't be quantized.
+    self.assertTrue(self._contains_op(graphdef, 'Einsum'))
+    self.assertFalse(self._contains_op(graphdef, 'XlaDotV2'))
+
+  def test_selective_quantization_on_gather(
+      self,
+  ):
+    input_type = dtypes.int32
+    model = self._create_simple_gather_and_conv_model(
+        input_type,
+        filter_shape=(2, 3, 3, 1024),
+        is_qat_model=True,
+    )
+
+    saved_model_save.save(model, self._input_saved_model_path)
+
+    tags = {tag_constants.SERVING}
+    quantization_options = quant_opts_pb2.QuantizationOptions(
+        quantization_method=quant_opts_pb2.QuantizationMethod(
+            preset_method=_PresetMethod.METHOD_STATIC_RANGE_INT8
+        ),
+        tags=tags,
+        signature_keys=['serving_default'],
+        op_set=quant_opts_pb2.XLA,
+        unit_wise_quantization_specs=[
+            quant_opts_pb2.UnitWiseQuantizationSpec(
+                unit=[
+                    quant_opts_pb2.UnitWiseQuantizationSpec.QuantizationUnit(
+                        op_type='GatherV2',
+                    ),
+                ],
+                quantization_method=quant_opts_pb2.QuantizationMethod(
+                    preset_method=quant_opts_pb2.QuantizationMethod.METHOD_NO_QUANTIZE
+                ),
+            )
+        ],
+    )
+
+    converted_model = quantize_model.quantize(
+        self._input_saved_model_path,
+        self._output_saved_model_path,
+        quantization_options,
+    )
+    self.assertIsNotNone(converted_model)
+    loader = saved_model_loader.SavedModelLoader(self._output_saved_model_path)
+    graphdef = loader.get_meta_graph_def_from_tags(tags).graph_def
+    # The Conv2D op shouldn't be quantized as it has no FakeQuant on input.
+    self.assertTrue(self._contains_op(graphdef, 'Conv2D'))
+    # If the Gather op is quantized, input_model_size / output_model_size > 2.
+    self.assertSizeRatioLessThan(
+        self._input_saved_model_path, self._output_saved_model_path, 1.15
+    )
+
+
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/python/quantize_model.py b/tensorflow/compiler/mlir/quantization/tensorflow/python/quantize_model.py
index 1534764b0141ee..911c35987f6333 100644
--- a/tensorflow/compiler/mlir/quantization/tensorflow/python/quantize_model.py
+++ b/tensorflow/compiler/mlir/quantization/tensorflow/python/quantize_model.py
@@ -936,15 +936,23 @@ def _static_range_quantize(
   )
   logging.info('QuantizationOptions: \n%s', quantization_options)
 
-  is_qat_saved_model = _is_qat_saved_model(saved_model_path)
+  is_qat_saved_model_or_method_no_quantize = _is_qat_saved_model(
+      saved_model_path
+  ) or (
+      quantization_options.quantization_method.preset_method
+      == _QuantizationMethod.METHOD_NO_QUANTIZE
+  )
   signature_def_map = save_model.get_signatures_from_saved_model(
       saved_model_path,
       quantization_options.signature_keys,
       set(quantization_options.tags),
   )
 
-  # Checks if the model is from QAT
-  if representative_dataset is None and not is_qat_saved_model:
+  # Checks if the model is from QAT or method is METHOD_NO_QUANTIZE.
+  if (
+      representative_dataset is None
+      and not is_qat_saved_model_or_method_no_quantize
+  ):
     raise ValueError(
         'When `representative_dataset` is not provided, the model should be '
         'trained with quantization-aware training (QAT).'
@@ -956,7 +964,7 @@ def _static_range_quantize(
         'The flag is ignored.'
     )
 
-  if is_qat_saved_model:
+  if is_qat_saved_model_or_method_no_quantize:
     _run_static_range_qat(
         saved_model_path,
         output_directory,
@@ -1148,19 +1156,13 @@ def _verify_output_dir(output_dir: Optional[str], overwrite: bool) -> None:
 
 
 def _populate_quantization_component_spec(
-    quantization_options: _QuantizationOptions,
+    quant_method: _QuantizationMethod,
 ) -> None:
   """Populates default values for QuantizationComponentSpec.
 
   Args:
-    quantization_options: An instance of QuantizationOptions with a field
-      specifying QuantizationComponentSpec.
+    quant_method: The quantization method to be updated.
   """
-  quant_method: _QuantizationMethod = quantization_options.quantization_method
-
-  if quantization_options.unit_wise_quantization_specs:
-    raise ValueError('Selective quantization is not supported yet.')
-
   # Make sure creating one spec per component.
   updated_component_spec = dict()
 
@@ -1187,7 +1189,10 @@ def _populate_quantization_component_spec(
             tensor_type=_TensorType.TENSORTYPE_INT_32,
         )
     )
-  else:
+  elif (
+      quant_method.preset_method
+      == _PresetMethod.METHOD_STATIC_RANGE_WEIGHT_ONLY_INT8
+  ):
     updated_component_spec[_QuantizationComponent.COMPONENT_WEIGHT] = (
         _QuantizationComponentSpec(
             quantization_component=_QuantizationComponent.COMPONENT_WEIGHT,
@@ -1199,13 +1204,10 @@ def _populate_quantization_component_spec(
   if quant_method.quantization_component_specs:
     # Check if the component spec is supported configuration in TF-Quant.
     for component_spec in quant_method.quantization_component_specs:
-      if (
-          component_spec.quantization_component
-          == _QuantizationComponent.COMPONENT_WEIGHT
-      ) or (
-          component_spec.quantization_component
-          == _QuantizationComponent.COMPONENT_ACTIVATION
-      ):
+      if component_spec.quantization_component in [
+          _QuantizationComponent.COMPONENT_WEIGHT,
+          _QuantizationComponent.COMPONENT_ACTIVATION,
+      ]:
         if component_spec.tensor_type != _TensorType.TENSORTYPE_INT_8:
           raise ValueError(
               'Only int8 precision is supported for input operands.'
@@ -1236,6 +1238,43 @@ def _populate_quantization_component_spec(
     raise ValueError('At least one component spec needs to be specified.')
 
 
+def _populate_unitwise_quantization_specs(
+    quantization_options: _QuantizationOptions,
+) -> None:
+  """Verifies and populates unitwise quantization specs."""
+  if not quantization_options.unit_wise_quantization_specs:
+    return
+
+  sorted_top_level_component_specs = sorted(
+      quantization_options.quantization_method.quantization_component_specs,
+      key=lambda x: x.quantization_component,
+  )
+
+  for unitwise_spec in quantization_options.unit_wise_quantization_specs:
+    if not unitwise_spec.unit:
+      raise ValueError(
+          'UnitWiseQuantizationSpec must contain at least one unit.'
+      )
+
+    for unit in unitwise_spec.unit:
+      if not unit.op_type and not unit.node_name:
+        raise ValueError('Either `op_type` or `node_name` must be specified.')
+
+    _populate_quantization_component_spec(unitwise_spec.quantization_method)
+
+    component_specs = (
+        unitwise_spec.quantization_method.quantization_component_specs
+    )
+    if component_specs and (
+        sorted_top_level_component_specs
+        != sorted(component_specs, key=lambda x: x.quantization_component)
+    ):
+      raise ValueError(
+          'Currently unit-wise quantization spec only supports NO_QUANTIZE and'
+          ' same quantization method as the top-level `quantization_method`'
+      )
+
+
 def _populate_calibration_options(
     quantization_options: quant_opts_pb2.QuantizationOptions,
 ):
@@ -1405,8 +1444,12 @@ def _populate_quantization_options_default_values(
           ' unquantized_dump_model_path was not specified.'
       )
 
-  # Check and populate quantization component spec
-  _populate_quantization_component_spec(quantization_options)
+  # Check and populate quantization component spec.
+  _populate_quantization_component_spec(
+      quantization_options.quantization_method
+  )
+  # Verify and populate unit-wise quantization specs.
+  _populate_unitwise_quantization_specs(quantization_options)
 
   if (
       quantization_options.quantization_method.preset_method
@@ -1531,7 +1574,10 @@ def quantize(
     ).load()
 
   method: _QuantizationMethod = quantization_options.quantization_method
-  if method.preset_method == _PresetMethod.METHOD_STATIC_RANGE_INT8:
+  if (
+      method.preset_method == _PresetMethod.METHOD_STATIC_RANGE_INT8
+      or method.preset_method == _PresetMethod.METHOD_NO_QUANTIZE
+  ):
     return _static_range_quantize(
         saved_model_path,
         output_directory,

From 970e3d0f79af7be6c5afda0a3d6e96b21393ee40 Mon Sep 17 00:00:00 2001
From: Kanvi Khanna <kanvi.khanna@intel.com>
Date: Mon, 2 Oct 2023 20:35:25 -0700
Subject: [PATCH 510/567] fix build

---
 .../core/common_runtime/mkl_layout_pass.cc    |  5 +-
 tensorflow/core/graph/mkl_graph_util.h        |  5 +-
 .../grappler/optimizers/mkl_remapper_test.cc  | 24 ++++-----
 .../core/grappler/optimizers/remapper.cc      |  8 ++-
 .../core/grappler/optimizers/remapper_test.cc | 54 +++++++++----------
 tensorflow/core/util/util.cc                  | 11 ++++
 tensorflow/core/util/util.h                   |  4 ++
 7 files changed, 60 insertions(+), 51 deletions(-)

diff --git a/tensorflow/core/common_runtime/mkl_layout_pass.cc b/tensorflow/core/common_runtime/mkl_layout_pass.cc
index ca7ca96aa1164b..93371be2fc12ab 100644
--- a/tensorflow/core/common_runtime/mkl_layout_pass.cc
+++ b/tensorflow/core/common_runtime/mkl_layout_pass.cc
@@ -1074,11 +1074,10 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
     if (HasNodeAttr(n->def(), type_attr)) {
       const auto& attr = n->def().attr().at(type_attr);
       DataType dtype = attr.type();
-      if (dtype == DT_BFLOAT16 &&
-          !mkl_op_registry::IsBF16SupportedByOneDNNOnThisCPU()) {
+      if (dtype == DT_BFLOAT16 && !IsBF16SupportedByOneDNNOnThisCPU()) {
         mkl_op_registry::BF16UnsupportedWarning();
         result = false;
-        reason = "Intel oneDNN with bfloat16 support is not enabled.";
+        reason = "Intel oneDNN with bfloat16 is not supported.";
       }
     }
 
diff --git a/tensorflow/core/graph/mkl_graph_util.h b/tensorflow/core/graph/mkl_graph_util.h
index 192a500cb91248..993a66feca7a54 100644
--- a/tensorflow/core/graph/mkl_graph_util.h
+++ b/tensorflow/core/graph/mkl_graph_util.h
@@ -24,6 +24,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/platform/cpu_info.h"
 #include "tensorflow/core/util/env_var.h"
+#include "tensorflow/core/util/util.h"
 
 namespace tensorflow {
 // Since our ops are going to produce and also consume N addition tensors
@@ -172,10 +173,6 @@ inline string GetMklEagerOpName(const string& name) {
   return string(kMklEagerOpPrefix) + name;
 }
 
-static inline bool IsBF16SupportedByOneDNNOnThisCPU() {
-  return port::TestCPUFeature(port::CPUFeature::AVX512F);
-}
-
 static inline void BF16UnsupportedWarning() {
   static absl::once_flag cpu_bfloat16_warn_once_flag;
   absl::call_once(cpu_bfloat16_warn_once_flag, [] {
diff --git a/tensorflow/core/grappler/optimizers/mkl_remapper_test.cc b/tensorflow/core/grappler/optimizers/mkl_remapper_test.cc
index ed60e93bfd6a16..340fa8b460efbf 100644
--- a/tensorflow/core/grappler/optimizers/mkl_remapper_test.cc
+++ b/tensorflow/core/grappler/optimizers/mkl_remapper_test.cc
@@ -759,9 +759,9 @@ TEST_F(FusedMatMulBiasAddAndGeluTest, Float32GeluExact) {
   RunTest<DT_FLOAT, false>();
 }
 TEST_F(FusedMatMulBiasAddAndGeluTest, BFloat16GeluExact) {
-  if (!mkl_op_registry::IsBF16SupportedByOneDNNOnThisCPU())
+  if (!IsBF16SupportedByOneDNNOnThisCPU())
     GTEST_SKIP()
-        << "Intel oneDNN with bfloat16 support is not enabled, skipping "
+        << "Intel oneDNN with bfloat16 is not supported, skipping "
            "FusedMatMulBiasAddAndGelu with bfloat16.";
   RunTest<DT_BFLOAT16, false>();
 }
@@ -769,9 +769,9 @@ TEST_F(FusedMatMulBiasAddAndGeluTest, Float32GeluExact2) {
   RunTest<DT_FLOAT, true>();
 }
 TEST_F(FusedMatMulBiasAddAndGeluTest, BFloat16GeluExact2) {
-  if (!mkl_op_registry::IsBF16SupportedByOneDNNOnThisCPU())
+  if (!IsBF16SupportedByOneDNNOnThisCPU())
     GTEST_SKIP()
-        << "Intel oneDNN with bfloat16 support is not enabled, skipping "
+        << "Intel oneDNN with bfloat16 is not supported, skipping "
            "FusedMatMulBiasAddAndGelu with bfloat16.";
   RunTest<DT_BFLOAT16, true>();
 }
@@ -781,9 +781,9 @@ class MklFusedBatchMatMul : public MklRemapperTest {
   template <typename T>
   void VerifyFused(bool adjx, bool adjy) {
     if (DataTypeToEnum<T>::v() == DT_BFLOAT16 &&
-        !mkl_op_registry::IsBF16SupportedByOneDNNOnThisCPU())
+        !IsBF16SupportedByOneDNNOnThisCPU())
       GTEST_SKIP()
-          << "Intel oneDNN with bfloat16 support is not enabled, skipping "
+          << "Intel oneDNN with bfloat16 is not supported, skipping "
              "MklFusedBatchMatMul with bfloat16.";
     using ::tensorflow::ops::Placeholder;
     using normal_generator = Eigen::internal::NormalRandomGenerator<T>;
@@ -877,9 +877,9 @@ class MklFusedBatchMatMul : public MklRemapperTest {
   template <typename T>
   void VerifyPreceedingScalarMul(bool adjx, bool adjy) {
     if (DataTypeToEnum<T>::v() == DT_BFLOAT16 &&
-        !mkl_op_registry::IsBF16SupportedByOneDNNOnThisCPU())
+        !IsBF16SupportedByOneDNNOnThisCPU())
       GTEST_SKIP()
-          << "Intel oneDNN with bfloat16 support is not enabled, skipping "
+          << "Intel oneDNN with bfloat16 is not supported, skipping "
              "MklFusedBatchMatMul with bfloat16.";
     using ::tensorflow::ops::Placeholder;
     using normal_generator = Eigen::internal::NormalRandomGenerator<T>;
@@ -1075,9 +1075,9 @@ class MklRemapperSwishTest : public GrapplerTest {
 
 TEST_F(MklRemapperSwishTest, F32) { RunTest<DT_FLOAT>(); }
 TEST_F(MklRemapperSwishTest, BF16) {
-  if (!mkl_op_registry::IsBF16SupportedByOneDNNOnThisCPU())
+  if (!IsBF16SupportedByOneDNNOnThisCPU())
     GTEST_SKIP()
-        << "Intel oneDNN with bfloat16 support is not enabled, skipping "
+        << "Intel oneDNN with bfloat16 is not supported, skipping "
            "MklRemapperSwish with bfloat16.";
   RunTest<DT_BFLOAT16>();
 }
@@ -1161,9 +1161,9 @@ class MklRemapperConv2dBiasAddSwishTest : public GrapplerTest {
 
 TEST_F(MklRemapperConv2dBiasAddSwishTest, F32) { RunTest<DT_FLOAT>(); }
 TEST_F(MklRemapperConv2dBiasAddSwishTest, BF16) {
-  if (!mkl_op_registry::IsBF16SupportedByOneDNNOnThisCPU())
+  if (!IsBF16SupportedByOneDNNOnThisCPU())
     GTEST_SKIP()
-        << "Intel oneDNN with bfloat16 support is not enabled, skipping "
+        << "Intel oneDNN with bfloat16 is not supported, skipping "
            "MklRemapperConv2dBiasAddSwish with bfloat16.";
   RunTest<DT_BFLOAT16>();
 }
diff --git a/tensorflow/core/grappler/optimizers/remapper.cc b/tensorflow/core/grappler/optimizers/remapper.cc
index 128e86d423f2cd..d5238316d73c70 100644
--- a/tensorflow/core/grappler/optimizers/remapper.cc
+++ b/tensorflow/core/grappler/optimizers/remapper.cc
@@ -342,8 +342,7 @@ bool IsCpuCompatibleDataType(const NodeDef* contraction,
             IsConv3D(*contraction) || IsAnyBatchMatMul(*contraction) ||
             is_supported_matmul) &&
            (dtype == DT_FLOAT ||
-            (dtype == DT_BFLOAT16 &&
-             mkl_op_registry::IsBF16SupportedByOneDNNOnThisCPU()));
+            (dtype == DT_BFLOAT16 && IsBF16SupportedByOneDNNOnThisCPU()));
   }
   if (IsConv2D(*contraction)) {
     return dtype == DT_FLOAT || dtype == DT_DOUBLE;
@@ -4653,14 +4652,13 @@ Status Remapper::Optimize(Cluster* cluster, const GrapplerItem& item,
     }
 
     if (IsMKLEnabled()) {
-      // Check if BF16 datatype is supported by oneDNN on this CPU,
+      // Check if BF16 data type is supported by oneDNN,
       // skipping fusions if it is not supported.
       const auto* node_view = ctx.graph_view.GetNode(i);
       const auto* node_def = node_view->node();
       const string& type_attr = "T";
       DataType dtype = GetDataTypeFromAttr(*node_def, type_attr);
-      if (dtype == DT_BFLOAT16 &&
-          !mkl_op_registry::IsBF16SupportedByOneDNNOnThisCPU()) {
+      if (dtype == DT_BFLOAT16 && !IsBF16SupportedByOneDNNOnThisCPU()) {
         continue;
       }
 
diff --git a/tensorflow/core/grappler/optimizers/remapper_test.cc b/tensorflow/core/grappler/optimizers/remapper_test.cc
index 9a01d9825cc6f4..2f654dc25e85d3 100644
--- a/tensorflow/core/grappler/optimizers/remapper_test.cc
+++ b/tensorflow/core/grappler/optimizers/remapper_test.cc
@@ -722,16 +722,16 @@ class RemapperFuseConvWithBias : public RemapperTest {
 TEST_F(RemapperFuseConvWithBias, Conv2D_F32) { RunTest<2, DT_FLOAT>(); }
 TEST_F(RemapperFuseConvWithBias, Conv3D_F32) { RunTest<3, DT_FLOAT>(); }
 TEST_F(RemapperFuseConvWithBias, Conv2D_BF16) {
-  if (!IsMKLEnabled() || !mkl_op_registry::IsBF16SupportedByOneDNNOnThisCPU())
+  if (!IsMKLEnabled() || !IsBF16SupportedByOneDNNOnThisCPU())
     GTEST_SKIP()
-        << "Intel oneDNN with bfloat16 support is not enabled, skipping "
+        << "Intel oneDNN with bfloat16 is not supported, skipping "
            "FuseConv2DWithBias with bfloat16.";
   RunTest<2, DT_BFLOAT16>();
 }
 TEST_F(RemapperFuseConvWithBias, Conv3D_BF16) {
-  if (!IsMKLEnabled() || !mkl_op_registry::IsBF16SupportedByOneDNNOnThisCPU())
+  if (!IsMKLEnabled() || !IsBF16SupportedByOneDNNOnThisCPU())
     GTEST_SKIP()
-        << "Intel oneDNN with bfloat16 support is not enabled, skipping "
+        << "Intel oneDNN with bfloat16 is not supported, skipping "
            "FuseConv3DWithBias with bfloat16.";
   RunTest<3, DT_BFLOAT16>();
 }
@@ -879,16 +879,16 @@ TEST_F(RemapperFuseConvWithBiasAndActivation, Conv3D_F32) {
   RunTest<3, DT_FLOAT>();
 }
 TEST_F(RemapperFuseConvWithBiasAndActivation, Conv2D_BF16) {
-  if (!IsMKLEnabled() || !mkl_op_registry::IsBF16SupportedByOneDNNOnThisCPU())
+  if (!IsMKLEnabled() || !IsBF16SupportedByOneDNNOnThisCPU())
     GTEST_SKIP()
-        << "Intel oneDNN with bfloat16 support is not enabled, skipping "
+        << "Intel oneDNN with bfloat16 is not supported, skipping "
            "FuseConv2DWithBiasAndActivation with bfloat16.";
   RunTest<2, DT_BFLOAT16>();
 }
 TEST_F(RemapperFuseConvWithBiasAndActivation, Conv3D_BF16) {
-  if (!IsMKLEnabled() || !mkl_op_registry::IsBF16SupportedByOneDNNOnThisCPU())
+  if (!IsMKLEnabled() || !IsBF16SupportedByOneDNNOnThisCPU())
     GTEST_SKIP()
-        << "Intel oneDNN with bfloat16 support is not enabled, skipping "
+        << "Intel oneDNN with bfloat16 is not supported, skipping "
            "FuseConv3DWithBiasAndActivation with bfloat16.";
   RunTest<3, DT_BFLOAT16>();
 }
@@ -1005,16 +1005,16 @@ TEST_F(RemapperFuseConvWithSqueezeAndBias, Conv3D_FP32) {
   RunTest<3, DT_FLOAT>();
 }
 TEST_F(RemapperFuseConvWithSqueezeAndBias, Conv2D_BF16) {
-  if (!IsMKLEnabled() || !mkl_op_registry::IsBF16SupportedByOneDNNOnThisCPU())
+  if (!IsMKLEnabled() || !IsBF16SupportedByOneDNNOnThisCPU())
     GTEST_SKIP()
-        << "Intel oneDNN with bfloat16 support is not enabled, skipping "
+        << "Intel oneDNN with bfloat16 is not supported, skipping "
            "FuseConvWithSqueezeAndBias with bfloat16.";
   RunTest<2, DT_BFLOAT16>();
 }
 TEST_F(RemapperFuseConvWithSqueezeAndBias, Conv3D_BF16) {
-  if (!IsMKLEnabled() || !mkl_op_registry::IsBF16SupportedByOneDNNOnThisCPU())
+  if (!IsMKLEnabled() || !IsBF16SupportedByOneDNNOnThisCPU())
     GTEST_SKIP()
-        << "Intel oneDNN with bfloat16 support is not enabled, skipping "
+        << "Intel oneDNN with bfloat16 is not supported, skipping "
            "FuseConvWithSqueezeAndBias with bfloat16.";
   RunTest<3, DT_BFLOAT16>();
 }
@@ -1407,7 +1407,7 @@ TEST_F(RemapperFuseSoftplusTanhMul, FP32) {
   RunTest<DT_FLOAT>();
 }
 TEST_F(RemapperFuseSoftplusTanhMul, BF16) {
-  if (!IsMKLEnabled() || !mkl_op_registry::IsBF16SupportedByOneDNNOnThisCPU())
+  if (!IsMKLEnabled() || !IsBF16SupportedByOneDNNOnThisCPU())
     GTEST_SKIP() << "Test only applicable to oneDNN.";
   RunTest<DT_BFLOAT16>();
 }
@@ -1702,9 +1702,9 @@ TEST_F(RemapperFuseMatMulWithBiasTest, F16) {
 TEST_F(RemapperFuseMatMulWithBiasTest, F32) { RunTest<DT_FLOAT>(); }
 
 TEST_F(RemapperFuseMatMulWithBiasTest, Bf16) {
-  if (!IsMKLEnabled() || !mkl_op_registry::IsBF16SupportedByOneDNNOnThisCPU())
+  if (!IsMKLEnabled() || !IsBF16SupportedByOneDNNOnThisCPU())
     GTEST_SKIP()
-        << "Intel oneDNN with bfloat16 support is not enabled, skipping "
+        << "Intel oneDNN with bfloat16 is not supported, skipping "
            "FuseMatMulWithBias with bfloat16.";
   RunTest<DT_BFLOAT16>();  // NOLINT
 }
@@ -1912,9 +1912,9 @@ TEST_F(RemapperFuseMatMulWithBiasAndActivationTest, F32) {
 }
 
 TEST_F(RemapperFuseMatMulWithBiasAndActivationTest, Bf16) {
-  if (!IsMKLEnabled() || !mkl_op_registry::IsBF16SupportedByOneDNNOnThisCPU())
+  if (!IsMKLEnabled() || !IsBF16SupportedByOneDNNOnThisCPU())
     GTEST_SKIP()
-        << "Intel oneDNN with bfloat16 support is not enabled, skipping "
+        << "Intel oneDNN with bfloat16 is not supported, skipping "
            "FuseMatMulWithBiasAndActivation with bfloat16.";
   RunTest<DT_BFLOAT16>();  // NOLINT
 }
@@ -2511,9 +2511,9 @@ TEST_F(RemapperFusePadConv3D, Conv3D_FP32) {
   RunTest<DT_FLOAT>();
 }
 TEST_F(RemapperFusePadConv3D, Conv3D_BF16) {
-  if (!IsMKLEnabled() || !mkl_op_registry::IsBF16SupportedByOneDNNOnThisCPU())
+  if (!IsMKLEnabled() || !IsBF16SupportedByOneDNNOnThisCPU())
     GTEST_SKIP()
-        << "Intel oneDNN with bfloat16 support is not enabled, skipping "
+        << "Intel oneDNN with bfloat16 is not supported, skipping "
            "RemapperFusePadConv3D with bfloat16.";
   RunTest<DT_BFLOAT16>();
 }
@@ -2636,9 +2636,9 @@ TEST_F(RemapperFusePadWithFusedConv3D, FusedConv3D_FP32) {
   RunTest<DT_FLOAT>();
 }
 TEST_F(RemapperFusePadWithFusedConv3D, FusedConv3D_BF16) {
-  if (!IsMKLEnabled() || !mkl_op_registry::IsBF16SupportedByOneDNNOnThisCPU())
+  if (!IsMKLEnabled() || !IsBF16SupportedByOneDNNOnThisCPU())
     GTEST_SKIP()
-        << "Intel oneDNN with bfloat16 support is not enabled, skipping "
+        << "Intel oneDNN with bfloat16 is not supported, skipping "
            "RemapperFusePadWithFusedConv3D with bfloat16.";
   RunTest<DT_BFLOAT16>();
 }
@@ -2707,9 +2707,9 @@ class RemapperLeakyReluTest : public GrapplerTest {
 
 TEST_F(RemapperLeakyReluTest, F32) { RunTest<DT_FLOAT>(); }
 TEST_F(RemapperLeakyReluTest, BF16) {
-  if (!IsMKLEnabled() || !mkl_op_registry::IsBF16SupportedByOneDNNOnThisCPU())
+  if (!IsMKLEnabled() || !IsBF16SupportedByOneDNNOnThisCPU())
     GTEST_SKIP()
-        << "Intel oneDNN with bfloat16 support is not enabled, skipping "
+        << "Intel oneDNN with bfloat16 is not supported, skipping "
            "RemapperLeakyRelu with bfloat16.";
   RunTest<DT_BFLOAT16>();
 }
@@ -2857,9 +2857,9 @@ TEST_F(RemapperFuseFusedConvWithFusedActivation, Conv2D_F32) {
   RunTest<2, DT_FLOAT>();
 }
 TEST_F(RemapperFuseFusedConvWithFusedActivation, Conv2D_BF16) {
-  if (!IsMKLEnabled() || !mkl_op_registry::IsBF16SupportedByOneDNNOnThisCPU())
+  if (!IsMKLEnabled() || !IsBF16SupportedByOneDNNOnThisCPU())
     GTEST_SKIP()
-        << "Intel oneDNN with bfloat16 support is not enabled, skipping "
+        << "Intel oneDNN with bfloat16 is not supported, skipping "
            "RemapperFuseFusedConvWithFusedActivation with bfloat16.";
   RunTest<2, DT_BFLOAT16>();
 }
@@ -2867,9 +2867,9 @@ TEST_F(RemapperFuseFusedConvWithFusedActivation, Conv3D_F32) {
   RunTest<3, DT_FLOAT>();
 }
 TEST_F(RemapperFuseFusedConvWithFusedActivation, Conv3D_BF16) {
-  if (!IsMKLEnabled() || !mkl_op_registry::IsBF16SupportedByOneDNNOnThisCPU())
+  if (!IsMKLEnabled() || !IsBF16SupportedByOneDNNOnThisCPU())
     GTEST_SKIP()
-        << "Intel oneDNN with bfloat16 support is not enabled, skipping "
+        << "Intel oneDNN with bfloat16 is not supported, skipping "
            "RemapperFuseFusedConvWithFusedActivation with bfloat16.";
   RunTest<3, DT_BFLOAT16>();
 }
diff --git a/tensorflow/core/util/util.cc b/tensorflow/core/util/util.cc
index 1c12f552d7de7b..32fc0a3ed64353 100644
--- a/tensorflow/core/util/util.cc
+++ b/tensorflow/core/util/util.cc
@@ -124,4 +124,15 @@ string SliceDebugString(const TensorShape& shape, const int64_t flat) {
 // TODO(penporn): Remove this function from util.cc
 bool IsMKLEnabled() { return IsMklEnabled(); }
 
+// Returns true only in INTEL_MKL builds running on a CPU with AVX512F, which
+// this check uses as the signal that oneDNN can run bfloat16 on this CPU.
+bool IsBF16SupportedByOneDNNOnThisCPU() {
+#ifdef INTEL_MKL
+  return port::TestCPUFeature(port::CPUFeature::AVX512F);
+#else
+  // Built without INTEL_MKL: always report bfloat16 as unsupported.
+  return false;
+#endif  // INTEL_MKL
+}
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/util/util.h b/tensorflow/core/util/util.h
index 04a68cef18222c..8c388aea3edb66 100644
--- a/tensorflow/core/util/util.h
+++ b/tensorflow/core/util/util.h
@@ -20,6 +20,7 @@ limitations under the License.
 
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/lib/core/stringpiece.h"
+#include "tensorflow/core/platform/cpu_info.h"
 
 namespace tensorflow {
 
@@ -61,6 +62,9 @@ std::string SliceDebugString(const TensorShape& shape, int64_t flat);
 // Check if MKL is enabled in runtime
 bool IsMKLEnabled();
 
+// Check if BF16 is supported by oneDNN on this CPU.
+bool IsBF16SupportedByOneDNNOnThisCPU();
+
 }  // namespace tensorflow
 
 #endif  // TENSORFLOW_CORE_UTIL_UTIL_H_

From 4696642aac7916fde443f1a7e8499bf81f0736b3 Mon Sep 17 00:00:00 2001
From: Kuangyuan Chen <chky@google.com>
Date: Tue, 3 Oct 2023 00:01:23 -0700
Subject: [PATCH 511/567] Refactor stream implementation so that it does not
 create a thread for each streaming request.

The callbacks are now invoked immediately on the producer side, and the producer can customize the threading using ThreadPoolInterface.

PiperOrigin-RevId: 570293325
---
 tensorflow/core/tfrt/runtime/BUILD           |  25 +---
 tensorflow/core/tfrt/runtime/channel.h       |  79 -----------
 tensorflow/core/tfrt/runtime/channel_test.cc | 138 -------------------
 tensorflow/core/tfrt/runtime/stream.cc       |  96 ++++++++-----
 tensorflow/core/tfrt/runtime/stream.h        |  51 ++++++-
 tensorflow/core/tfrt/runtime/stream_test.cc  |  12 +-
 6 files changed, 115 insertions(+), 286 deletions(-)
 delete mode 100644 tensorflow/core/tfrt/runtime/channel.h
 delete mode 100644 tensorflow/core/tfrt/runtime/channel_test.cc

diff --git a/tensorflow/core/tfrt/runtime/BUILD b/tensorflow/core/tfrt/runtime/BUILD
index d37862f5edf999..41a204f801c5ac 100644
--- a/tensorflow/core/tfrt/runtime/BUILD
+++ b/tensorflow/core/tfrt/runtime/BUILD
@@ -110,7 +110,6 @@ cc_library(
     srcs = ["stream.cc"],
     hdrs = ["stream.h"],
     deps = [
-        ":channel",
         ":step_id",
         "//tensorflow/compiler/mlir/tensorflow",
         "//tensorflow/core/framework:tensor",
@@ -129,20 +128,11 @@ cc_library(
         "@llvm-project//mlir:IR",
         "@local_tsl//tsl/platform:env",
         "@local_tsl//tsl/platform:random",
+        "@local_tsl//tsl/platform:threadpool_interface",
         "@local_tsl//tsl/profiler/lib:traceme",
     ],
 )
 
-cc_library(
-    name = "channel",
-    hdrs = ["channel.h"],
-    deps = [
-        "@com_google_absl//absl/log:check",
-        "@com_google_absl//absl/status",
-        "@com_google_absl//absl/synchronization",
-    ],
-)
-
 cc_library(
     name = "step_id",
     srcs = ["step_id.cc"],
@@ -162,6 +152,7 @@ tf_cc_shared_test(
         "//tensorflow/core/framework:tensor",
         "//tensorflow/core/framework:tensor_testutil",
         "//tensorflow/core/tfrt/saved_model:saved_model_testutil",
+        "//tensorflow/core/tfrt/utils:thread_pool",
         "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/log",
         "@com_google_absl//absl/log:check",
@@ -192,15 +183,3 @@ tf_cc_test(
         "@tf_runtime//:support",
     ],
 )
-
-tf_cc_test(
-    name = "channel_test",
-    srcs = ["channel_test.cc"],
-    deps = [
-        ":channel",
-        "@com_google_absl//absl/synchronization",
-        "@com_google_googletest//:gtest_main",
-        "@local_tsl//tsl/platform:env",
-        "@local_tsl//tsl/platform:status_matchers",
-    ],
-)
diff --git a/tensorflow/core/tfrt/runtime/channel.h b/tensorflow/core/tfrt/runtime/channel.h
deleted file mode 100644
index 5a01e78677064f..00000000000000
--- a/tensorflow/core/tfrt/runtime/channel.h
+++ /dev/null
@@ -1,79 +0,0 @@
-/* Copyright 2023 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#ifndef TENSORFLOW_CORE_TFRT_RUNTIME_CHANNEL_H_
-#define TENSORFLOW_CORE_TFRT_RUNTIME_CHANNEL_H_
-
-#include <queue>
-
-#include "absl/log/check.h"
-#include "absl/status/status.h"
-#include "absl/synchronization/mutex.h"
-
-namespace tensorflow {
-namespace tfrt_stub {
-
-// An unbounded queue for communicating between threads. This class is
-// thread-safe.
-template <typename T>
-class UnboundedChannel {
- public:
-  absl::Status Write(T value) {
-    absl::MutexLock lock(&mu_);
-
-    if (closed_) {
-      return absl::InternalError(
-          "Failed to write to the UnboundedChannel that is closed.");
-    }
-
-    channel_.push(std::move(value));
-
-    return absl::OkStatus();
-  }
-
-  bool Read(T& value) {
-    absl::MutexLock lock(&mu_);
-
-    mu_.Await(absl::Condition(
-        +[](UnboundedChannel* channel) ABSL_SHARED_LOCKS_REQUIRED(mu_) {
-          return !channel->channel_.empty() || channel->closed_;
-        },
-        this));
-
-    if (!channel_.empty()) {
-      value = std::move(channel_.front());
-      channel_.pop();
-      return true;
-    }
-
-    // If channel_ is empty, then it must be closed at this point.
-    DCHECK(closed_);
-    return false;
-  }
-
-  void Close() {
-    absl::MutexLock lock(&mu_);
-    closed_ = true;
-  }
-
- private:
-  absl::Mutex mu_;
-  std::queue<T> channel_ ABSL_GUARDED_BY(mu_);
-  bool closed_ ABSL_GUARDED_BY(mu_) = false;
-};
-
-}  // namespace tfrt_stub
-}  // namespace tensorflow
-
-#endif  // TENSORFLOW_CORE_TFRT_RUNTIME_CHANNEL_H_
diff --git a/tensorflow/core/tfrt/runtime/channel_test.cc b/tensorflow/core/tfrt/runtime/channel_test.cc
deleted file mode 100644
index e802defed5177b..00000000000000
--- a/tensorflow/core/tfrt/runtime/channel_test.cc
+++ /dev/null
@@ -1,138 +0,0 @@
-/* Copyright 2023 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#include "tensorflow/core/tfrt/runtime/channel.h"
-
-#include <numeric>
-#include <vector>
-
-#include <gmock/gmock.h>
-#include <gtest/gtest.h>
-#include "absl/synchronization/blocking_counter.h"
-#include "tsl/platform/env.h"
-#include "tsl/platform/status_matchers.h"
-
-namespace tensorflow {
-namespace tfrt_stub {
-namespace {
-
-using ::testing::ElementsAreArray;
-using ::testing::UnorderedElementsAreArray;
-using ::tsl::testing::StatusIs;
-
-TEST(ChannelTest, Basic) {
-  UnboundedChannel<int> channel;
-
-  std::vector<int> expected(100);
-  std::iota(expected.begin(), expected.end(), 0);
-
-  tsl::Env::Default()->SchedClosure([&]() {
-    for (int v : expected) {
-      CHECK_OK(channel.Write(v));
-    }
-    channel.Close();
-  });
-
-  std::vector<int> outputs;
-  int v = -1;
-  while (channel.Read(v)) {
-    outputs.push_back(v);
-  }
-
-  EXPECT_THAT(outputs, ElementsAreArray(expected));
-
-  EXPECT_THAT(channel.Write(100), StatusIs(absl::StatusCode::kInternal));
-}
-
-TEST(ChannelTest, MultipleWriters) {
-  UnboundedChannel<int> channel;
-
-  std::vector<int> expected(100);
-  std::iota(expected.begin(), expected.end(), 0);
-
-  tsl::Env::Default()->SchedClosure([&]() {
-    absl::BlockingCounter bcount(expected.size());
-    for (int v : expected) {
-      tsl::Env::Default()->SchedClosure([&, v]() {
-        CHECK_OK(channel.Write(v));
-        bcount.DecrementCount();
-      });
-    }
-    bcount.Wait();
-    channel.Close();
-  });
-
-  std::vector<int> outputs;
-  int v = 0;
-  while (channel.Read(v)) {
-    outputs.push_back(v);
-  }
-
-  EXPECT_THAT(outputs, UnorderedElementsAreArray(expected));
-}
-
-TEST(ChannelTest, MultipleReaders) {
-  UnboundedChannel<int> channel;
-
-  std::vector<int> expected(100);
-  std::iota(expected.begin(), expected.end(), 0);
-
-  absl::Mutex mu;
-  std::vector<int> outputs;
-
-  int num_readers = 200;
-  absl::BlockingCounter bcount(num_readers);
-  for (int i = 0; i < num_readers; ++i) {
-    tsl::Env::Default()->SchedClosure([&]() {
-      int v = 0;
-      while (channel.Read(v)) {
-        absl::MutexLock lock(&mu);
-        outputs.push_back(v);
-      }
-      bcount.DecrementCount();
-    });
-  }
-
-  for (int v : expected) {
-    CHECK_OK(channel.Write(v));
-  }
-  channel.Close();
-
-  bcount.Wait();
-  EXPECT_THAT(outputs, UnorderedElementsAreArray(expected));
-}
-
-TEST(ChannelTest, FullyBuffered) {
-  UnboundedChannel<int> channel;
-
-  std::vector<int> expected(100);
-  std::iota(expected.begin(), expected.end(), 0);
-
-  for (int v : expected) {
-    CHECK_OK(channel.Write(v));
-  }
-  channel.Close();
-
-  std::vector<int> outputs;
-  int v = -1;
-  while (channel.Read(v)) {
-    outputs.push_back(v);
-  }
-
-  EXPECT_THAT(outputs, ElementsAreArray(expected));
-}
-
-}  // namespace
-}  // namespace tfrt_stub
-}  // namespace tensorflow
diff --git a/tensorflow/core/tfrt/runtime/stream.cc b/tensorflow/core/tfrt/runtime/stream.cc
index 80f961b5de776d..c833c267991d97 100644
--- a/tensorflow/core/tfrt/runtime/stream.cc
+++ b/tensorflow/core/tfrt/runtime/stream.cc
@@ -21,6 +21,7 @@ limitations under the License.
 #include <utility>
 #include <vector>
 
+#include "absl/base/thread_annotations.h"
 #include "absl/container/flat_hash_map.h"
 #include "absl/functional/any_invocable.h"
 #include "absl/log/check.h"
@@ -36,6 +37,7 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor.pb.h"
 #include "tsl/platform/random.h"
+#include "tsl/platform/threadpool_interface.h"
 #include "tsl/profiler/lib/traceme.h"
 
 namespace tensorflow {
@@ -77,6 +79,55 @@ absl::StatusOr<std::optional<StreamCallbackId>> CreateStreamCallbackId(
   return callback_id;
 }
 
+absl::Status StreamCallbackRegistry::CallbackState::Invoke(
+    tsl::thread::ThreadPoolInterface* thread_pool, StreamedResult result) {
+  {  // Hold mu_ only for the bookkeeping, not while scheduling.
+    absl::MutexLock lock(&mu_);
+    if (closed_) {
+      return absl::InternalError(
+          "Failed to invoke the callback that is closed.");
+    }
+    ++num_outstanding_;  // Close() waits until this drops back to zero.
+  }
+  thread_pool->Schedule([this, result = std::move(result)]() mutable {
+    InvokeCallback(std::move(result));
+    // Mark this invocation as finished so that a pending Close() can return.
+    absl::MutexLock lock(&mu_);
+    --num_outstanding_;
+  });
+  return absl::OkStatus();
+}
+
+void StreamCallbackRegistry::CallbackState::Close() {
+  {
+    absl::MutexLock lock(&mu_);
+    closed_ = true;  // Later Invoke() calls see this and return an error.
+    // Block until every invocation counted by Invoke() has finished.
+    auto not_running = [this]() ABSL_SHARED_LOCKS_REQUIRED(mu_) {
+      return num_outstanding_ == 0;
+    };
+    mu_.Await(absl::Condition(&not_running));
+  }
+}
+
+void StreamCallbackRegistry::CallbackState::InvokeCallback(
+    StreamedResult result) {
+  absl::Duration dequeue_latency = absl::Now() - result.enqueued_time;
+  interface().RecordDequeueLatency(model_name_, dequeue_latency);
+  // Tag the trace event with the ids of the callback being invoked.
+  tsl::profiler::TraceMe trace_me("StreamCallbackInvocation");
+  trace_me.AppendMetadata([&]() {
+    return tsl::profiler::TraceMeEncode({
+        {"callback_id", callback_id_.id},
+        {"step_id", step_id_.id},
+    });
+  });
+  // Time only the user callback; the latency above is measured separately.
+  absl::Time start_time = absl::Now();
+  callback_(std::move(result.tensors));
+  interface().RecordCallbackLatency(model_name_, absl::Now() - start_time);
+}
+
 absl::StatusOr<ScopedStreamCallback> StreamCallbackRegistry::Register(
     absl::string_view model_name, StreamCallbackId callback_id, StepId step_id,
     absl::AnyInvocable<
@@ -91,39 +142,14 @@ absl::StatusOr<ScopedStreamCallback> StreamCallbackRegistry::Register(
         "Stream callback ", callback_id, " @ ", step_id, " already exists"));
   }
 
-  it->second = std::make_unique<CallbackState>();
-  it->second->thread = absl::WrapUnique(tsl::Env::Default()->StartThread(
-      tensorflow::ThreadOptions(),
-      /*name=*/absl::StrCat("stream_handler_", callback_id, "_", step_id),
-      [model_name = std::string(model_name), callback_id, step_id,
-       callback = std::move(callback), state = it->second.get(),
-       this]() mutable {
-        StreamedResult result;
-        while (state->channel.Read(result)) {
-          absl::Duration dequeue_latency = absl::Now() - result.enqueued_time;
-          interface_->RecordDequeueLatency(model_name, dequeue_latency);
-
-          tsl::profiler::TraceMe trace_me("StreamCallbackInvocation");
-          trace_me.AppendMetadata([&]() {
-            return tsl::profiler::TraceMeEncode({
-                {"callback_id", callback_id.id},
-                {"step_id", step_id.id},
-            });
-          });
-
-          absl::Time start_time = absl::Now();
-          callback(std::move(result.tensors));
-          interface_->RecordCallbackLatency(model_name,
-                                            absl::Now() - start_time);
-        }
-      }));
-
+  it->second = std::make_unique<CallbackState>(this, model_name, callback_id,
+                                               step_id, std::move(callback));
   return ScopedStreamCallback(this, callback_id, step_id);
 }
 
-absl::Status StreamCallbackRegistry::Write(StreamCallbackId callback_id,
-                                           StepId step_id,
-                                           StreamedResult result) {
+absl::Status StreamCallbackRegistry::Invoke(
+    tsl::thread::ThreadPoolInterface* thread_pool, StreamCallbackId callback_id,
+    StepId step_id, StreamedResult result) {
   absl::MutexLock lock(&mu_);
   auto iter = stream_callbacks_.find({callback_id, step_id});
   if (iter == stream_callbacks_.end()) {
@@ -135,7 +161,7 @@ absl::Status StreamCallbackRegistry::Write(StreamCallbackId callback_id,
 
   auto* state = iter->second.get();
   DCHECK(state);
-  return state->channel.Write(std::move(result));
+  return state->Invoke(thread_pool, std::move(result));
 }
 
 std::unique_ptr<StreamCallbackRegistry::CallbackState>
@@ -189,11 +215,9 @@ void ScopedStreamCallback::Unregister() {
   auto state = registry_->Unregister(*callback_id_, step_id_);
   DCHECK(state);
 
-  // At this point, it is safe to close the channel.
-  state->channel.Close();
-
-  // Wait until the stream handler finishes.
-  state->thread.reset();
+  // At this point, it is safe to close the callback state. This also waits for
+  // the outstanding stream handlers to finish.
+  state->Close();
 
   callback_id_.reset();
 }
diff --git a/tensorflow/core/tfrt/runtime/stream.h b/tensorflow/core/tfrt/runtime/stream.h
index 8f6b6af5ab09bd..03b0784b4c8505 100644
--- a/tensorflow/core/tfrt/runtime/stream.h
+++ b/tensorflow/core/tfrt/runtime/stream.h
@@ -36,9 +36,9 @@ under the License.
 #include "absl/time/time.h"
 #include "mlir/IR/BuiltinOps.h"  // from @llvm-project
 #include "tensorflow/core/framework/tensor.h"
-#include "tensorflow/core/tfrt/runtime/channel.h"
 #include "tensorflow/core/tfrt/runtime/step_id.h"
 #include "tsl/platform/env.h"
+#include "tsl/platform/threadpool_interface.h"
 
 namespace tensorflow {
 namespace tfrt_stub {
@@ -179,17 +179,56 @@ class StreamCallbackRegistry {
           void(absl::flat_hash_map<std::string, tensorflow::Tensor>)>
           callback);
 
-  absl::Status Write(StreamCallbackId callback_id, StepId step_id,
-                     StreamedResult result);
+  absl::Status Invoke(tsl::thread::ThreadPoolInterface* thread_pool,
+                      StreamCallbackId callback_id, StepId step_id,
+                      StreamedResult result);
 
   StreamControllerInterface& stream_interface() const { return *interface_; }
 
  private:
   friend class ScopedStreamCallback;
 
-  struct CallbackState {
-    std::unique_ptr<tsl::Thread> thread;
-    UnboundedChannel<StreamedResult> channel;
+  class CallbackState {
+   public:
+    CallbackState(StreamCallbackRegistry* registry,
+                  absl::string_view model_name, StreamCallbackId callback_id,
+                  StepId step_id,
+                  absl::AnyInvocable<void(
+                      absl::flat_hash_map<std::string, tensorflow::Tensor>)>
+                      callback)
+        : registry_(registry),
+          model_name_(model_name),
+          callback_id_(callback_id),
+          step_id_(step_id),
+          callback_(std::move(callback)) {
+      DCHECK(registry_);
+    }
+
+    // Invokes the callback in `thread_pool` with `result`.
+    absl::Status Invoke(tsl::thread::ThreadPoolInterface* thread_pool,
+                        StreamedResult result);
+
+    // Closes the callback so that it can no longer be invoked. This method also
+    // waits for outstanding results to finish.
+    void Close();
+
+   private:
+    StreamControllerInterface& interface() {
+      return registry_->stream_interface();
+    }
+    void InvokeCallback(StreamedResult result);
+
+    StreamCallbackRegistry* registry_ = nullptr;
+    std::string model_name_;
+    StreamCallbackId callback_id_;
+    StepId step_id_;
+    absl::AnyInvocable<void(
+        absl::flat_hash_map<std::string, tensorflow::Tensor>)>
+        callback_;
+
+    absl::Mutex mu_;
+    bool closed_ ABSL_GUARDED_BY(mu_) = false;
+    int num_outstanding_ ABSL_GUARDED_BY(mu_) = 0;
   };
 
   std::unique_ptr<CallbackState> Unregister(StreamCallbackId callback_id,
diff --git a/tensorflow/core/tfrt/runtime/stream_test.cc b/tensorflow/core/tfrt/runtime/stream_test.cc
index 7699353c1639f9..cac9113053bfab 100644
--- a/tensorflow/core/tfrt/runtime/stream_test.cc
+++ b/tensorflow/core/tfrt/runtime/stream_test.cc
@@ -31,6 +31,7 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_testutil.h"
 #include "tensorflow/core/tfrt/saved_model/saved_model_testutil.h"
+#include "tensorflow/core/tfrt/utils/thread_pool.h"
 #include "tsl/platform/env.h"
 #include "tsl/platform/statusor.h"
 
@@ -76,8 +77,9 @@ TEST(StreamTest, Simple) {
     auto thread = absl::WrapUnique(tsl::Env::Default()->StartThread(
         tsl::ThreadOptions(), "fake_stream_client", [&]() {
           for (const auto& map : expected) {
-            CHECK_OK(GetGlobalStreamCallbackRegistry().Write(
-                callback_id, step_id, {map, absl::Now()}));
+            TfThreadPool thread_pool(/*name=*/"test", /*num_threads=*/4);
+            CHECK_OK(GetGlobalStreamCallbackRegistry().Invoke(
+                &thread_pool, callback_id, step_id, {map, absl::Now()}));
           }
         }));
   }
@@ -98,6 +100,7 @@ TEST(StreamTest, MultipleWriters) {
   std::vector<absl::flat_hash_map<std::string, std::vector<int32_t>>> outputs;
 
   {
+    TfThreadPool thread_pool(/*name=*/"test", /*num_threads=*/4);
     TF_ASSERT_OK_AND_ASSIGN(
         auto scoped_stream_callback,
         GetGlobalStreamCallbackRegistry().Register(
@@ -115,11 +118,12 @@ TEST(StreamTest, MultipleWriters) {
          {{"c", AsTensor<int32_t>({300})}}};
 
     for (const auto& p : expected) {
-      tsl::Env::Default()->SchedClosure([callback_id, step_id, p]() {
+      tsl::Env::Default()->SchedClosure([&, callback_id, step_id, p]() {
+        TfThreadPool thread_pool(/*name=*/"test", /*num_threads=*/4);
         // The stream callback may be dropped early, and in that case we ignore
         // the error.
         GetGlobalStreamCallbackRegistry()
-            .Write(callback_id, step_id, {p, absl::Now()})
+            .Invoke(&thread_pool, callback_id, step_id, {p, absl::Now()})
             .IgnoreError();
       });
     }

From 6a1f455f0097971e8799c62418c23ab2bf5c2f64 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 3 Oct 2023 00:13:19 -0700
Subject: [PATCH 512/567] Internal Code Change

PiperOrigin-RevId: 570296006
---
 third_party/xla/xla/service/gpu/nccl_utils.cc          | 2 +-
 third_party/xla/xla/service/gpu/precompiled_kernels.cc | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/third_party/xla/xla/service/gpu/nccl_utils.cc b/third_party/xla/xla/service/gpu/nccl_utils.cc
index 57cec8d49b23c7..82b160d849558b 100644
--- a/third_party/xla/xla/service/gpu/nccl_utils.cc
+++ b/third_party/xla/xla/service/gpu/nccl_utils.cc
@@ -216,7 +216,7 @@ void CheckNcclAsyncError(NcclComm& lockable_comm) {
     return XLA_CUDA_STATUS(async_err);
   }();
 
-  if (!status.ok()) LOG(ERROR) << status.ToString();
+  if (!status.ok()) LOG(ERROR) << status;
 }
 
 }  // namespace
diff --git a/third_party/xla/xla/service/gpu/precompiled_kernels.cc b/third_party/xla/xla/service/gpu/precompiled_kernels.cc
index 1233aab49c15bf..d380a496cf4aaf 100644
--- a/third_party/xla/xla/service/gpu/precompiled_kernels.cc
+++ b/third_party/xla/xla/service/gpu/precompiled_kernels.cc
@@ -119,7 +119,7 @@ class LazyKernel {
         static absl::once_flag logged_once;
         absl::call_once(logged_once, [&]() {
           LOG(WARNING)
-              << compiled_ptx_or.status().ToString()
+              << compiled_ptx_or.status()
               << "\nRelying on driver to perform ptx compilation. "
               << "\nSetting XLA_FLAGS=--xla_gpu_cuda_data_dir=/path/to/cuda "
               << " or modifying $PATH can be used to set the location of ptxas."

From 1d6d6ee9cb43656f7111648044daaaac4558418c Mon Sep 17 00:00:00 2001
From: Christian Sigg <csigg@google.com>
Date: Tue, 3 Oct 2023 00:17:20 -0700
Subject: [PATCH 513/567] NFC: replace uses of
 TF_DISALLOW_COPY_AND_ASSIGN/SE_DISALLOW_COPY_AND_ASSIGN macros with
 implementation.

PiperOrigin-RevId: 570296712
---
 .../tsl/tsl/distributed_runtime/call_options.h |  3 ++-
 .../coordination/coordination_service.cc       |  4 +++-
 .../coordination/coordination_service_agent.cc |  3 ++-
 .../grpc_coordination_service_impl.h           |  3 ++-
 .../distributed_runtime/rpc/grpc_channel.cc    |  3 ++-
 .../rpc/grpc_client_cq_tag.h                   |  3 ++-
 .../third_party/tsl/tsl/framework/allocator.h  |  3 ++-
 .../tsl/tsl/framework/allocator_registry.h     |  3 ++-
 .../tsl/tsl/framework/bfc_allocator.h          |  6 ++++--
 .../tsl/tsl/framework/cpu_allocator_impl.cc    |  3 ++-
 .../tsl/tsl/framework/device_id_manager.cc     |  3 ++-
 .../tsl/tsl/lib/histogram/histogram.h          |  3 ++-
 .../third_party/tsl/tsl/lib/io/buffered_file.h |  3 ++-
 .../tsl/tsl/lib/io/buffered_inputstream.h      |  3 ++-
 .../third_party/tsl/tsl/lib/io/inputbuffer.h   |  3 ++-
 .../third_party/tsl/tsl/lib/io/record_reader.h |  3 ++-
 .../third_party/tsl/tsl/lib/io/record_writer.h |  3 ++-
 .../tsl/tsl/lib/io/snappy/snappy_inputbuffer.h |  3 ++-
 .../tsl/tsl/lib/io/snappy/snappy_inputstream.h |  3 ++-
 .../tsl/lib/io/snappy/snappy_outputbuffer.h    |  3 ++-
 .../tsl/tsl/lib/io/zlib_inputstream.h          |  3 ++-
 .../tsl/tsl/lib/io/zlib_outputbuffer.h         |  3 ++-
 .../tsl/lib/monitoring/collection_registry.h   |  9 ++++++---
 .../tsl/tsl/lib/monitoring/counter.h           | 12 ++++++++----
 .../third_party/tsl/tsl/lib/monitoring/gauge.h | 18 ++++++++++++------
 .../tsl/lib/monitoring/percentile_sampler.h    |  9 ++++++---
 .../tsl/tsl/lib/monitoring/sampler.cc          |  6 ++++--
 .../tsl/tsl/lib/monitoring/sampler.h           | 15 ++++++++++-----
 .../tsl/tsl/lib/random/distribution_sampler.h  |  3 ++-
 .../tsl/tsl/lib/random/weighted_picker.h       |  3 ++-
 .../cloud/compute_engine_metadata_client.h     |  3 ++-
 .../cloud/compute_engine_zone_provider.h       |  3 ++-
 .../tsl/tsl/platform/cloud/curl_http_request.h |  3 ++-
 .../tsl/tsl/platform/cloud/gcs_file_system.h   |  3 ++-
 .../tsl/platform/cloud/google_auth_provider.h  |  3 ++-
 .../tsl/tsl/platform/cloud/http_request.h      |  3 ++-
 .../tsl/tsl/platform/cloud/oauth_client.h      |  3 ++-
 .../tsl/tsl/platform/default/subprocess.h      |  3 ++-
 .../third_party/tsl/tsl/platform/denormal.h    | 10 +++++++---
 .../xla/third_party/tsl/tsl/platform/env.h     |  6 ++++--
 .../third_party/tsl/tsl/platform/file_system.h |  6 ++++--
 .../xla/third_party/tsl/tsl/platform/macros.h  |  1 +
 .../android_armv7a_cpu_utils_helper.h          |  3 ++-
 .../profile_utils/clock_cycle_profiler.h       |  3 ++-
 .../tsl/tsl/platform/profile_utils/cpu_utils.h |  6 ++++--
 .../profile_utils/i_cpu_utils_helper.h         |  3 ++-
 .../tsl/tsl/platform/ram_file_system.h         |  3 ++-
 .../tsl/tsl/platform/retrying_file_system.h    |  3 ++-
 .../xla/third_party/tsl/tsl/platform/scanner.h |  3 ++-
 .../third_party/tsl/tsl/platform/setround.h    |  3 ++-
 .../tsl/tsl/platform/static_threadlocal.h      |  3 ++-
 .../tsl/tsl/platform/statusor_test.cc          |  6 ++++--
 .../xla/third_party/tsl/tsl/platform/strcat.h  |  3 ++-
 .../third_party/tsl/tsl/platform/threadpool.h  |  3 ++-
 .../xla/third_party/tsl/tsl/platform/tracing.h |  3 ++-
 .../tsl/tsl/platform/windows/subprocess.h      |  3 ++-
 .../profiler/backends/cpu/annotation_stack.h   |  3 ++-
 .../profiler/backends/cpu/traceme_recorder.h   |  3 ++-
 .../tsl/tsl/profiler/lib/scoped_annotation.h   |  3 ++-
 .../third_party/tsl/tsl/profiler/lib/traceme.h |  3 ++-
 .../tsl/tsl/profiler/lib/traceme_encode.h      |  3 ++-
 .../xla/third_party/tsl/tsl/util/reporter.h    |  6 ++++--
 62 files changed, 175 insertions(+), 86 deletions(-)

diff --git a/third_party/xla/third_party/tsl/tsl/distributed_runtime/call_options.h b/third_party/xla/third_party/tsl/tsl/distributed_runtime/call_options.h
index 560c5611a694db..6021ba8f1d24eb 100644
--- a/third_party/xla/third_party/tsl/tsl/distributed_runtime/call_options.h
+++ b/third_party/xla/third_party/tsl/tsl/distributed_runtime/call_options.h
@@ -73,7 +73,8 @@ class CallOptions {
   // RPC operation timeout in milliseconds.
   int64_t timeout_in_ms_ TF_GUARDED_BY(mu_) = 0;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(CallOptions);
+  CallOptions(const CallOptions&) = delete;
+  void operator=(const CallOptions&) = delete;
 };
 
 }  // namespace tsl
diff --git a/third_party/xla/third_party/tsl/tsl/distributed_runtime/coordination/coordination_service.cc b/third_party/xla/third_party/tsl/tsl/distributed_runtime/coordination/coordination_service.cc
index ca3273b9a3b4cc..0b916e65aaa208 100644
--- a/third_party/xla/third_party/tsl/tsl/distributed_runtime/coordination/coordination_service.cc
+++ b/third_party/xla/third_party/tsl/tsl/distributed_runtime/coordination/coordination_service.cc
@@ -269,7 +269,9 @@ class CoordinationServiceStandaloneImpl : public CoordinationServiceInterface {
 
   absl::flat_hash_set<std::string> recoverable_jobs_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(CoordinationServiceStandaloneImpl);
+  CoordinationServiceStandaloneImpl(const CoordinationServiceStandaloneImpl&) =
+      delete;
+  void operator=(const CoordinationServiceStandaloneImpl&) = delete;
 };
 
 void CoordinationServiceStandaloneImpl::TaskState::SetConnected(
diff --git a/third_party/xla/third_party/tsl/tsl/distributed_runtime/coordination/coordination_service_agent.cc b/third_party/xla/third_party/tsl/tsl/distributed_runtime/coordination/coordination_service_agent.cc
index 9c551b74325ed8..a45213d1817624 100644
--- a/third_party/xla/third_party/tsl/tsl/distributed_runtime/coordination/coordination_service_agent.cc
+++ b/third_party/xla/third_party/tsl/tsl/distributed_runtime/coordination/coordination_service_agent.cc
@@ -163,7 +163,8 @@ class CoordinationServiceAgentImpl : public CoordinationServiceAgent {
   CancellationManager cancellation_manager_;
   std::unique_ptr<CoordinationClient> leader_client_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(CoordinationServiceAgentImpl);
+  CoordinationServiceAgentImpl(const CoordinationServiceAgentImpl&) = delete;
+  void operator=(const CoordinationServiceAgentImpl&) = delete;
 };
 
 Status CoordinationServiceAgentImpl::Initialize(
diff --git a/third_party/xla/third_party/tsl/tsl/distributed_runtime/rpc/coordination/grpc_coordination_service_impl.h b/third_party/xla/third_party/tsl/tsl/distributed_runtime/rpc/coordination/grpc_coordination_service_impl.h
index a4cc3bbaa4b8fd..daa7e65225f660 100644
--- a/third_party/xla/third_party/tsl/tsl/distributed_runtime/rpc/coordination/grpc_coordination_service_impl.h
+++ b/third_party/xla/third_party/tsl/tsl/distributed_runtime/rpc/coordination/grpc_coordination_service_impl.h
@@ -108,7 +108,8 @@ class GrpcCoordinationServiceImpl : public AsyncServiceInterface {
   std::unique_ptr<::grpc::ServerCompletionQueue> cq_;
   tensorflow::grpc::CoordinationService::AsyncService service_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(GrpcCoordinationServiceImpl);
+  GrpcCoordinationServiceImpl(const GrpcCoordinationServiceImpl&) = delete;
+  void operator=(const GrpcCoordinationServiceImpl&) = delete;
 };
 
 }  // namespace tsl
diff --git a/third_party/xla/third_party/tsl/tsl/distributed_runtime/rpc/grpc_channel.cc b/third_party/xla/third_party/tsl/tsl/distributed_runtime/rpc/grpc_channel.cc
index 51f42fa01b81ce..ba886c1a7bf1d3 100644
--- a/third_party/xla/third_party/tsl/tsl/distributed_runtime/rpc/grpc_channel.cc
+++ b/third_party/xla/third_party/tsl/tsl/distributed_runtime/rpc/grpc_channel.cc
@@ -343,7 +343,8 @@ class SparseGrpcChannelCache : public CachingGrpcChannelCache {
   const string job_id_;
   const std::map<int, string> host_ports_;
   const ChannelCreationFunction channel_func_;
-  TF_DISALLOW_COPY_AND_ASSIGN(SparseGrpcChannelCache);
+  SparseGrpcChannelCache(const SparseGrpcChannelCache&) = delete;
+  void operator=(const SparseGrpcChannelCache&) = delete;
 };
 
 }  // namespace
diff --git a/third_party/xla/third_party/tsl/tsl/distributed_runtime/rpc/grpc_client_cq_tag.h b/third_party/xla/third_party/tsl/tsl/distributed_runtime/rpc/grpc_client_cq_tag.h
index a3a402603bace8..183ca52e8d2d0a 100644
--- a/third_party/xla/third_party/tsl/tsl/distributed_runtime/rpc/grpc_client_cq_tag.h
+++ b/third_party/xla/third_party/tsl/tsl/distributed_runtime/rpc/grpc_client_cq_tag.h
@@ -32,7 +32,8 @@ class GrpcClientCQTag {
   virtual void OnCompleted(bool ok) = 0;
 
  private:
-  TF_DISALLOW_COPY_AND_ASSIGN(GrpcClientCQTag);
+  GrpcClientCQTag(const GrpcClientCQTag&) = delete;
+  void operator=(const GrpcClientCQTag&) = delete;
 };
 
 }  // namespace tsl
diff --git a/third_party/xla/third_party/tsl/tsl/framework/allocator.h b/third_party/xla/third_party/tsl/tsl/framework/allocator.h
index 941df143b27fec..84aef0b1f99c1f 100644
--- a/third_party/xla/third_party/tsl/tsl/framework/allocator.h
+++ b/third_party/xla/third_party/tsl/tsl/framework/allocator.h
@@ -62,7 +62,8 @@ struct AllocationAttributes {
   // returned.
   std::function<uint64()>* freed_by_func = nullptr;  // Not owned.
 
-  TF_DISALLOW_COPY_AND_ASSIGN(AllocationAttributes);
+  AllocationAttributes(const AllocationAttributes&) = delete;
+  void operator=(const AllocationAttributes&) = delete;
 };
 
 // Runtime statistics collected by an allocator. Exactly the same as
diff --git a/third_party/xla/third_party/tsl/tsl/framework/allocator_registry.h b/third_party/xla/third_party/tsl/tsl/framework/allocator_registry.h
index a9099d6e67875c..b827ce42dc7ab4 100644
--- a/third_party/xla/third_party/tsl/tsl/framework/allocator_registry.h
+++ b/third_party/xla/third_party/tsl/tsl/framework/allocator_registry.h
@@ -124,7 +124,8 @@ class AllocatorFactoryRegistry {
   const FactoryEntry* FindEntry(const string& name, int priority) const
       ABSL_EXCLUSIVE_LOCKS_REQUIRED(mu_);
 
-  TF_DISALLOW_COPY_AND_ASSIGN(AllocatorFactoryRegistry);
+  AllocatorFactoryRegistry(const AllocatorFactoryRegistry&) = delete;
+  void operator=(const AllocatorFactoryRegistry&) = delete;
 };
 
 class AllocatorFactoryRegistration {
diff --git a/third_party/xla/third_party/tsl/tsl/framework/bfc_allocator.h b/third_party/xla/third_party/tsl/tsl/framework/bfc_allocator.h
index 3d3f4094169fb9..47619856abe8dd 100644
--- a/third_party/xla/third_party/tsl/tsl/framework/bfc_allocator.h
+++ b/third_party/xla/third_party/tsl/tsl/framework/bfc_allocator.h
@@ -352,7 +352,8 @@ class BFCAllocator : public Allocator {
     // for the memory allocation represented by "p"
     std::vector<ChunkHandle> handles_;
 
-    TF_DISALLOW_COPY_AND_ASSIGN(AllocationRegion);
+    AllocationRegion(const AllocationRegion&) = delete;
+    void operator=(const AllocationRegion&) = delete;
   };
 
   // RegionManager aggregates one or more "AllocationRegions" and provides
@@ -619,7 +620,8 @@ class BFCAllocator : public Allocator {
 
   friend class GPUBFCAllocatorPrivateMethodsTest;
   friend class GPUBFCAllocatorPrivateMethodsTest_SubAllocatorSpecific;
-  TF_DISALLOW_COPY_AND_ASSIGN(BFCAllocator);
+  BFCAllocator(const BFCAllocator&) = delete;
+  void operator=(const BFCAllocator&) = delete;
 };
 
 }  // namespace tsl
diff --git a/third_party/xla/third_party/tsl/tsl/framework/cpu_allocator_impl.cc b/third_party/xla/third_party/tsl/tsl/framework/cpu_allocator_impl.cc
index 37b4bf3dc482f4..d7c698f7f9d5c1 100644
--- a/third_party/xla/third_party/tsl/tsl/framework/cpu_allocator_impl.cc
+++ b/third_party/xla/third_party/tsl/tsl/framework/cpu_allocator_impl.cc
@@ -177,7 +177,8 @@ class CPUAllocator : public Allocator {
   std::atomic<int> single_allocation_warning_count_;
   int total_allocation_warning_count_ TF_GUARDED_BY(mu_);
 
-  TF_DISALLOW_COPY_AND_ASSIGN(CPUAllocator);
+  CPUAllocator(const CPUAllocator&) = delete;
+  void operator=(const CPUAllocator&) = delete;
 };
 
 class CPUAllocatorFactory : public AllocatorFactory {
diff --git a/third_party/xla/third_party/tsl/tsl/framework/device_id_manager.cc b/third_party/xla/third_party/tsl/tsl/framework/device_id_manager.cc
index 0cfb6939c15a90..4a2c3bd2ffe2bd 100644
--- a/third_party/xla/third_party/tsl/tsl/framework/device_id_manager.cc
+++ b/third_party/xla/third_party/tsl/tsl/framework/device_id_manager.cc
@@ -113,7 +113,8 @@ class TfToPlatformDeviceIdMap {
   TypeIdMapType id_map_ TF_GUARDED_BY(mu_);
 
   friend class ::tsl::DeviceIdManager;
-  TF_DISALLOW_COPY_AND_ASSIGN(TfToPlatformDeviceIdMap);
+  TfToPlatformDeviceIdMap(const TfToPlatformDeviceIdMap&) = delete;
+  void operator=(const TfToPlatformDeviceIdMap&) = delete;
 };
 }  // namespace
 
diff --git a/third_party/xla/third_party/tsl/tsl/lib/histogram/histogram.h b/third_party/xla/third_party/tsl/tsl/lib/histogram/histogram.h
index b7c96add7e567b..a024e2275b4d29 100644
--- a/third_party/xla/third_party/tsl/tsl/lib/histogram/histogram.h
+++ b/third_party/xla/third_party/tsl/tsl/lib/histogram/histogram.h
@@ -100,7 +100,8 @@ class Histogram {
 
   double Remap(double x, double x0, double x1, double y0, double y1) const;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(Histogram);
+  Histogram(const Histogram&) = delete;
+  void operator=(const Histogram&) = delete;
 };
 
 // Wrapper around a Histogram object that is thread safe.
diff --git a/third_party/xla/third_party/tsl/tsl/lib/io/buffered_file.h b/third_party/xla/third_party/tsl/tsl/lib/io/buffered_file.h
index ffcce6653922c5..5627a7228fb782 100644
--- a/third_party/xla/third_party/tsl/tsl/lib/io/buffered_file.h
+++ b/third_party/xla/third_party/tsl/tsl/lib/io/buffered_file.h
@@ -107,7 +107,8 @@ class BufferedWritableFile : public WritableFile {
   std::unique_ptr<WritableFile> file_;
   uint32_t crc32_ = 0;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(BufferedWritableFile);
+  BufferedWritableFile(const BufferedWritableFile&) = delete;
+  void operator=(const BufferedWritableFile&) = delete;
 };
 
 }  // namespace tsl
diff --git a/third_party/xla/third_party/tsl/tsl/lib/io/buffered_inputstream.h b/third_party/xla/third_party/tsl/tsl/lib/io/buffered_inputstream.h
index 7101730ed373be..5318434c63919c 100644
--- a/third_party/xla/third_party/tsl/tsl/lib/io/buffered_inputstream.h
+++ b/third_party/xla/third_party/tsl/tsl/lib/io/buffered_inputstream.h
@@ -110,7 +110,8 @@ class BufferedInputStream : public InputStreamInterface {
   // buffer allocations.
   Status file_status_ = OkStatus();
 
-  TF_DISALLOW_COPY_AND_ASSIGN(BufferedInputStream);
+  BufferedInputStream(const BufferedInputStream&) = delete;
+  void operator=(const BufferedInputStream&) = delete;
 };
 
 // Explicit instantiations defined in buffered_inputstream.cc.
diff --git a/third_party/xla/third_party/tsl/tsl/lib/io/inputbuffer.h b/third_party/xla/third_party/tsl/tsl/lib/io/inputbuffer.h
index 8213ec061bed20..e357efb5f75b53 100644
--- a/third_party/xla/third_party/tsl/tsl/lib/io/inputbuffer.h
+++ b/third_party/xla/third_party/tsl/tsl/lib/io/inputbuffer.h
@@ -107,7 +107,8 @@ class InputBuffer {
   char* pos_;    // Current position in "buf"
   char* limit_;  // Just past end of valid data in "buf"
 
-  TF_DISALLOW_COPY_AND_ASSIGN(InputBuffer);
+  InputBuffer(const InputBuffer&) = delete;
+  void operator=(const InputBuffer&) = delete;
 };
 
 // Implementation details.
diff --git a/third_party/xla/third_party/tsl/tsl/lib/io/record_reader.h b/third_party/xla/third_party/tsl/tsl/lib/io/record_reader.h
index 7981dedcdb5e0f..282c0daff2a5a8 100644
--- a/third_party/xla/third_party/tsl/tsl/lib/io/record_reader.h
+++ b/third_party/xla/third_party/tsl/tsl/lib/io/record_reader.h
@@ -124,7 +124,8 @@ class RecordReader {
 
   std::unique_ptr<Metadata> cached_metadata_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(RecordReader);
+  RecordReader(const RecordReader&) = delete;
+  void operator=(const RecordReader&) = delete;
 };
 
 // High-level interface to read TFRecord files.
diff --git a/third_party/xla/third_party/tsl/tsl/lib/io/record_writer.h b/third_party/xla/third_party/tsl/tsl/lib/io/record_writer.h
index 5347b0c542227c..b585cb9b52f70c 100644
--- a/third_party/xla/third_party/tsl/tsl/lib/io/record_writer.h
+++ b/third_party/xla/third_party/tsl/tsl/lib/io/record_writer.h
@@ -124,7 +124,8 @@ class RecordWriter {
   }
 #endif
 
-  TF_DISALLOW_COPY_AND_ASSIGN(RecordWriter);
+  RecordWriter(const RecordWriter&) = delete;
+  void operator=(const RecordWriter&) = delete;
 };
 
 void RecordWriter::PopulateHeader(char* header, const char* data, size_t n) {
diff --git a/third_party/xla/third_party/tsl/tsl/lib/io/snappy/snappy_inputbuffer.h b/third_party/xla/third_party/tsl/tsl/lib/io/snappy/snappy_inputbuffer.h
index 433b24d1852f36..e4f4144f0625dc 100644
--- a/third_party/xla/third_party/tsl/tsl/lib/io/snappy/snappy_inputbuffer.h
+++ b/third_party/xla/third_party/tsl/tsl/lib/io/snappy/snappy_inputbuffer.h
@@ -124,7 +124,8 @@ class SnappyInputBuffer : public InputStreamInterface {
   // Number of *uncompressed* bytes that have been read from this stream.
   int64_t bytes_read_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(SnappyInputBuffer);
+  SnappyInputBuffer(const SnappyInputBuffer&) = delete;
+  void operator=(const SnappyInputBuffer&) = delete;
 };
 
 }  // namespace io
diff --git a/third_party/xla/third_party/tsl/tsl/lib/io/snappy/snappy_inputstream.h b/third_party/xla/third_party/tsl/tsl/lib/io/snappy/snappy_inputstream.h
index 8df474da18c355..f9ffe5e95378dd 100644
--- a/third_party/xla/third_party/tsl/tsl/lib/io/snappy/snappy_inputstream.h
+++ b/third_party/xla/third_party/tsl/tsl/lib/io/snappy/snappy_inputstream.h
@@ -82,7 +82,8 @@ class SnappyInputStream : public InputStreamInterface {
   // is not yet read.
   size_t avail_out_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(SnappyInputStream);
+  SnappyInputStream(const SnappyInputStream&) = delete;
+  void operator=(const SnappyInputStream&) = delete;
 };
 
 }  // namespace io
diff --git a/third_party/xla/third_party/tsl/tsl/lib/io/snappy/snappy_outputbuffer.h b/third_party/xla/third_party/tsl/tsl/lib/io/snappy/snappy_outputbuffer.h
index 271317ed832365..59b34d9cb4d98e 100644
--- a/third_party/xla/third_party/tsl/tsl/lib/io/snappy/snappy_outputbuffer.h
+++ b/third_party/xla/third_party/tsl/tsl/lib/io/snappy/snappy_outputbuffer.h
@@ -148,7 +148,8 @@ class SnappyOutputBuffer : public WritableFile {
   char* next_out_;
   size_t avail_out_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(SnappyOutputBuffer);
+  SnappyOutputBuffer(const SnappyOutputBuffer&) = delete;
+  void operator=(const SnappyOutputBuffer&) = delete;
 };
 
 }  // namespace io
diff --git a/third_party/xla/third_party/tsl/tsl/lib/io/zlib_inputstream.h b/third_party/xla/third_party/tsl/tsl/lib/io/zlib_inputstream.h
index c22b1743f52047..e19843c812d350 100644
--- a/third_party/xla/third_party/tsl/tsl/lib/io/zlib_inputstream.h
+++ b/third_party/xla/third_party/tsl/tsl/lib/io/zlib_inputstream.h
@@ -132,7 +132,8 @@ class ZlibInputStream : public InputStreamInterface {
   // Number of *uncompressed* bytes that have been read from this stream.
   int64_t bytes_read_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(ZlibInputStream);
+  ZlibInputStream(const ZlibInputStream&) = delete;
+  void operator=(const ZlibInputStream&) = delete;
 };
 
 }  // namespace io
diff --git a/third_party/xla/third_party/tsl/tsl/lib/io/zlib_outputbuffer.h b/third_party/xla/third_party/tsl/tsl/lib/io/zlib_outputbuffer.h
index baf8825e7147cf..3e4236ac1e44fd 100644
--- a/third_party/xla/third_party/tsl/tsl/lib/io/zlib_outputbuffer.h
+++ b/third_party/xla/third_party/tsl/tsl/lib/io/zlib_outputbuffer.h
@@ -149,7 +149,8 @@ class ZlibOutputBuffer : public WritableFile {
     return flush_mode == Z_SYNC_FLUSH || flush_mode == Z_FULL_FLUSH;
   }
 
-  TF_DISALLOW_COPY_AND_ASSIGN(ZlibOutputBuffer);
+  ZlibOutputBuffer(const ZlibOutputBuffer&) = delete;
+  void operator=(const ZlibOutputBuffer&) = delete;
 };
 
 }  // namespace io
diff --git a/third_party/xla/third_party/tsl/tsl/lib/monitoring/collection_registry.h b/third_party/xla/third_party/tsl/tsl/lib/monitoring/collection_registry.h
index 72b398644894f0..d988d2f19f15ad 100644
--- a/third_party/xla/third_party/tsl/tsl/lib/monitoring/collection_registry.h
+++ b/third_party/xla/third_party/tsl/tsl/lib/monitoring/collection_registry.h
@@ -94,7 +94,8 @@ class CollectionRegistry {
  private:
   CollectionRegistry() {}
 
-  TF_DISALLOW_COPY_AND_ASSIGN(CollectionRegistry);
+  CollectionRegistry(const CollectionRegistry&) = delete;
+  void operator=(const CollectionRegistry&) = delete;
 };
 
 }  // namespace monitoring
@@ -273,7 +274,8 @@ class CollectionRegistry {
   };
   std::map<StringPiece, CollectionInfo> registry_ TF_GUARDED_BY(mu_);
 
-  TF_DISALLOW_COPY_AND_ASSIGN(CollectionRegistry);
+  CollectionRegistry(const CollectionRegistry&) = delete;
+  void operator=(const CollectionRegistry&) = delete;
 };
 
 ////
@@ -397,7 +399,8 @@ class Collector {
   std::unique_ptr<CollectedMetrics> collected_metrics_ TF_GUARDED_BY(mu_);
   const uint64 collection_time_millis_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(Collector);
+  Collector(const Collector&) = delete;
+  void operator=(const Collector&) = delete;
 };
 
 // Write the timestamps for the point based on the MetricKind.
diff --git a/third_party/xla/third_party/tsl/tsl/lib/monitoring/counter.h b/third_party/xla/third_party/tsl/tsl/lib/monitoring/counter.h
index 51749a97e0dbdb..ec7b04981f495a 100644
--- a/third_party/xla/third_party/tsl/tsl/lib/monitoring/counter.h
+++ b/third_party/xla/third_party/tsl/tsl/lib/monitoring/counter.h
@@ -42,7 +42,8 @@ class CounterCell {
   int64 value() const { return 0; }
 
  private:
-  TF_DISALLOW_COPY_AND_ASSIGN(CounterCell);
+  CounterCell(const CounterCell&) = delete;
+  void operator=(const CounterCell&) = delete;
 };
 
 // Counter which has a null implementation.
@@ -68,7 +69,8 @@ class Counter {
 
   CounterCell default_counter_cell_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(Counter);
+  Counter(const Counter&) = delete;
+  void operator=(const Counter&) = delete;
 };
 
 }  // namespace monitoring
@@ -117,7 +119,8 @@ class CounterCell {
  private:
   std::atomic<int64_t> value_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(CounterCell);
+  CounterCell(const CounterCell&) = delete;
+  void operator=(const CounterCell&) = delete;
 };
 
 // A stateful class for updating a cumulative integer metric.
@@ -189,7 +192,8 @@ class Counter {
 
   std::unique_ptr<CollectionRegistry::RegistrationHandle> registration_handle_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(Counter);
+  Counter(const Counter&) = delete;
+  void operator=(const Counter&) = delete;
 };
 
 ////
diff --git a/third_party/xla/third_party/tsl/tsl/lib/monitoring/gauge.h b/third_party/xla/third_party/tsl/tsl/lib/monitoring/gauge.h
index 8d71635d54fb7a..93cbe9aa928df0 100644
--- a/third_party/xla/third_party/tsl/tsl/lib/monitoring/gauge.h
+++ b/third_party/xla/third_party/tsl/tsl/lib/monitoring/gauge.h
@@ -47,7 +47,8 @@ class GaugeCell {
   T value() const { return T(); }
 
  private:
-  TF_DISALLOW_COPY_AND_ASSIGN(GaugeCell);
+  GaugeCell(const GaugeCell&) = delete;
+  void operator=(const GaugeCell&) = delete;
 };
 
 // Gauge which has a null implementation.
@@ -81,7 +82,8 @@ class Gauge {
 
   GaugeCell<ValueType> default_gauge_cell_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(Gauge);
+  Gauge(const Gauge&) = delete;
+  void operator=(const Gauge&) = delete;
 };
 
 }  // namespace monitoring
@@ -131,7 +133,8 @@ class GaugeCell {
   T value_ TF_GUARDED_BY(mu_);
   mutable mutex mu_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(GaugeCell);
+  GaugeCell(const GaugeCell&) = delete;
+  void operator=(const GaugeCell&) = delete;
 };
 
 // Explicit specialization of GaugeCell<int64_t>. Compared to the primary
@@ -152,7 +155,8 @@ class GaugeCell<int64_t> {
  private:
   std::atomic<int64_t> value_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(GaugeCell);
+  GaugeCell(const GaugeCell&) = delete;
+  void operator=(const GaugeCell&) = delete;
 };
 
 // Explicit specialization of GaugeCell<bool>. Compared to the primary
@@ -173,7 +177,8 @@ class GaugeCell<bool> {
  private:
   std::atomic<bool> value_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(GaugeCell);
+  GaugeCell(const GaugeCell&) = delete;
+  void operator=(const GaugeCell&) = delete;
 };
 
 // A stateful class for updating a gauge-like metric. Allowed ValueType are
@@ -254,7 +259,8 @@ class Gauge {
 
   std::unique_ptr<CollectionRegistry::RegistrationHandle> registration_handle_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(Gauge);
+  Gauge(const Gauge&) = delete;
+  void operator=(const Gauge&) = delete;
 };
 
 ////
diff --git a/third_party/xla/third_party/tsl/tsl/lib/monitoring/percentile_sampler.h b/third_party/xla/third_party/tsl/tsl/lib/monitoring/percentile_sampler.h
index 38df40a770ea46..5d14b5b2894bc6 100644
--- a/third_party/xla/third_party/tsl/tsl/lib/monitoring/percentile_sampler.h
+++ b/third_party/xla/third_party/tsl/tsl/lib/monitoring/percentile_sampler.h
@@ -62,7 +62,8 @@ class PercentileSampler {
 
   PercentileSampler() = default;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(PercentileSampler);
+  PercentileSampler(const PercentileSampler&) = delete;
+  void operator=(const PercentileSampler&) = delete;
 };
 
 template <int NumLabels>
@@ -134,7 +135,8 @@ class PercentileSamplerCell {
   size_t total_samples_ TF_GUARDED_BY(mu_);
   long double accumulator_ TF_GUARDED_BY(mu_);
 
-  TF_DISALLOW_COPY_AND_ASSIGN(PercentileSamplerCell);
+  PercentileSamplerCell(const PercentileSamplerCell&) = delete;
+  void operator=(const PercentileSamplerCell&) = delete;
 };
 
 // A stateful class for updating a cumulative percentile sampled metric.
@@ -242,7 +244,8 @@ class PercentileSampler {
   // Registration handle with the CollectionRegistry.
   std::unique_ptr<CollectionRegistry::RegistrationHandle> registration_handle_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(PercentileSampler);
+  PercentileSampler(const PercentileSampler&) = delete;
+  void operator=(const PercentileSampler&) = delete;
 };
 
 template <int NumLabels>
diff --git a/third_party/xla/third_party/tsl/tsl/lib/monitoring/sampler.cc b/third_party/xla/third_party/tsl/tsl/lib/monitoring/sampler.cc
index 28dce2301ba5ff..738f6f2968e95c 100644
--- a/third_party/xla/third_party/tsl/tsl/lib/monitoring/sampler.cc
+++ b/third_party/xla/third_party/tsl/tsl/lib/monitoring/sampler.cc
@@ -60,7 +60,8 @@ class ExplicitBuckets : public Buckets {
  private:
   std::vector<double> bucket_limits_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(ExplicitBuckets);
+  ExplicitBuckets(const ExplicitBuckets&) = delete;
+  void operator=(const ExplicitBuckets&) = delete;
 };
 
 class ExponentialBuckets : public Buckets {
@@ -91,7 +92,8 @@ class ExponentialBuckets : public Buckets {
 
   ExplicitBuckets explicit_buckets_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(ExponentialBuckets);
+  ExponentialBuckets(const ExponentialBuckets&) = delete;
+  void operator=(const ExponentialBuckets&) = delete;
 };
 
 }  // namespace
diff --git a/third_party/xla/third_party/tsl/tsl/lib/monitoring/sampler.h b/third_party/xla/third_party/tsl/tsl/lib/monitoring/sampler.h
index 175dbea77ecd47..eae275c265bfb9 100644
--- a/third_party/xla/third_party/tsl/tsl/lib/monitoring/sampler.h
+++ b/third_party/xla/third_party/tsl/tsl/lib/monitoring/sampler.h
@@ -46,7 +46,8 @@ class SamplerCell {
   HistogramProto value() const { return HistogramProto(); }
 
  private:
-  TF_DISALLOW_COPY_AND_ASSIGN(SamplerCell);
+  SamplerCell(const SamplerCell&) = delete;
+  void operator=(const SamplerCell&) = delete;
 };
 
 // Buckets which has a null implementation.
@@ -73,7 +74,8 @@ class Buckets {
  private:
   std::vector<double> explicit_bounds_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(Buckets);
+  Buckets(const Buckets&) = delete;
+  void operator=(const Buckets&) = delete;
 };
 
 // Sampler which has a null implementation.
@@ -102,7 +104,8 @@ class Sampler {
   SamplerCell default_sampler_cell_;
   std::unique_ptr<Buckets> buckets_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(Sampler);
+  Sampler(const Sampler&) = delete;
+  void operator=(const Sampler&) = delete;
 };
 
 }  // namespace monitoring
@@ -156,7 +159,8 @@ class SamplerCell {
  private:
   histogram::ThreadSafeHistogram histogram_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(SamplerCell);
+  SamplerCell(const SamplerCell&) = delete;
+  void operator=(const SamplerCell&) = delete;
 };
 
 // Bucketing strategies for the samplers.
@@ -275,7 +279,8 @@ class Sampler {
   // Registration handle with the CollectionRegistry.
   std::unique_ptr<CollectionRegistry::RegistrationHandle> registration_handle_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(Sampler);
+  Sampler(const Sampler&) = delete;
+  void operator=(const Sampler&) = delete;
 };
 
 ////
diff --git a/third_party/xla/third_party/tsl/tsl/lib/random/distribution_sampler.h b/third_party/xla/third_party/tsl/tsl/lib/random/distribution_sampler.h
index eabc4eac485161..877660c8532baa 100644
--- a/third_party/xla/third_party/tsl/tsl/lib/random/distribution_sampler.h
+++ b/third_party/xla/third_party/tsl/tsl/lib/random/distribution_sampler.h
@@ -85,7 +85,8 @@ class DistributionSampler {
   int num_;
   std::unique_ptr<std::pair<float, int>[]> data_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(DistributionSampler);
+  DistributionSampler(const DistributionSampler&) = delete;
+  void operator=(const DistributionSampler&) = delete;
 };
 
 }  // namespace random
diff --git a/third_party/xla/third_party/tsl/tsl/lib/random/weighted_picker.h b/third_party/xla/third_party/tsl/tsl/lib/random/weighted_picker.h
index e716e68cfaf283..05fabea852b5f7 100644
--- a/third_party/xla/third_party/tsl/tsl/lib/random/weighted_picker.h
+++ b/third_party/xla/third_party/tsl/tsl/lib/random/weighted_picker.h
@@ -114,7 +114,8 @@ class WeightedPicker {
   // Rebuild the tree weights using the leaf weights
   void RebuildTreeWeights();
 
-  TF_DISALLOW_COPY_AND_ASSIGN(WeightedPicker);
+  WeightedPicker(const WeightedPicker&) = delete;
+  void operator=(const WeightedPicker&) = delete;
 };
 
 inline int32 WeightedPicker::get_weight(int index) const {
diff --git a/third_party/xla/third_party/tsl/tsl/platform/cloud/compute_engine_metadata_client.h b/third_party/xla/third_party/tsl/tsl/platform/cloud/compute_engine_metadata_client.h
index d9f610e7e19100..fac94cdb1c9c8c 100644
--- a/third_party/xla/third_party/tsl/tsl/platform/cloud/compute_engine_metadata_client.h
+++ b/third_party/xla/third_party/tsl/tsl/platform/cloud/compute_engine_metadata_client.h
@@ -58,7 +58,8 @@ class ComputeEngineMetadataClient {
   std::shared_ptr<HttpRequest::Factory> http_request_factory_;
   const RetryConfig retry_config_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(ComputeEngineMetadataClient);
+  ComputeEngineMetadataClient(const ComputeEngineMetadataClient&) = delete;
+  void operator=(const ComputeEngineMetadataClient&) = delete;
 };
 
 }  // namespace tsl
diff --git a/third_party/xla/third_party/tsl/tsl/platform/cloud/compute_engine_zone_provider.h b/third_party/xla/third_party/tsl/tsl/platform/cloud/compute_engine_zone_provider.h
index fe4f0fc4f8ae60..a37b43c22a484f 100644
--- a/third_party/xla/third_party/tsl/tsl/platform/cloud/compute_engine_zone_provider.h
+++ b/third_party/xla/third_party/tsl/tsl/platform/cloud/compute_engine_zone_provider.h
@@ -32,7 +32,8 @@ class ComputeEngineZoneProvider : public ZoneProvider {
  private:
   std::shared_ptr<ComputeEngineMetadataClient> google_metadata_client_;
   string cached_zone;
-  TF_DISALLOW_COPY_AND_ASSIGN(ComputeEngineZoneProvider);
+  ComputeEngineZoneProvider(const ComputeEngineZoneProvider&) = delete;
+  void operator=(const ComputeEngineZoneProvider&) = delete;
 };
 
 }  // namespace tsl
diff --git a/third_party/xla/third_party/tsl/tsl/platform/cloud/curl_http_request.h b/third_party/xla/third_party/tsl/tsl/platform/cloud/curl_http_request.h
index 1680302ad4d2b0..490e762967f182 100644
--- a/third_party/xla/third_party/tsl/tsl/platform/cloud/curl_http_request.h
+++ b/third_party/xla/third_party/tsl/tsl/platform/cloud/curl_http_request.h
@@ -228,7 +228,8 @@ class CurlHttpRequest : public HttpRequest {
   // Limit the size of an http response that is copied into an error message.
   const size_t response_to_error_limit_ = 500;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(CurlHttpRequest);
+  CurlHttpRequest(const CurlHttpRequest&) = delete;
+  void operator=(const CurlHttpRequest&) = delete;
 };
 
 /// \brief A proxy to the libcurl C interface as a dependency injection measure.
diff --git a/third_party/xla/third_party/tsl/tsl/platform/cloud/gcs_file_system.h b/third_party/xla/third_party/tsl/tsl/platform/cloud/gcs_file_system.h
index c78f837eec6d6d..17725e8d5b01e6 100644
--- a/third_party/xla/third_party/tsl/tsl/platform/cloud/gcs_file_system.h
+++ b/third_party/xla/third_party/tsl/tsl/platform/cloud/gcs_file_system.h
@@ -441,7 +441,8 @@ class GcsFileSystem : public FileSystem {
   // Additional header material to be transmitted with all GCS requests
   std::unique_ptr<std::pair<const string, const string>> additional_header_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(GcsFileSystem);
+  GcsFileSystem(const GcsFileSystem&) = delete;
+  void operator=(const GcsFileSystem&) = delete;
 };
 
 /// Google Cloud Storage implementation of a file system with retry on failures.
diff --git a/third_party/xla/third_party/tsl/tsl/platform/cloud/google_auth_provider.h b/third_party/xla/third_party/tsl/tsl/platform/cloud/google_auth_provider.h
index 24e9f57aadfdc8..63b7ea63abf5f3 100644
--- a/third_party/xla/third_party/tsl/tsl/platform/cloud/google_auth_provider.h
+++ b/third_party/xla/third_party/tsl/tsl/platform/cloud/google_auth_provider.h
@@ -61,7 +61,8 @@ class GoogleAuthProvider : public AuthProvider {
   mutex mu_;
   string current_token_ TF_GUARDED_BY(mu_);
   uint64 expiration_timestamp_sec_ TF_GUARDED_BY(mu_) = 0;
-  TF_DISALLOW_COPY_AND_ASSIGN(GoogleAuthProvider);
+  GoogleAuthProvider(const GoogleAuthProvider&) = delete;
+  void operator=(const GoogleAuthProvider&) = delete;
 };
 
 }  // namespace tsl
diff --git a/third_party/xla/third_party/tsl/tsl/platform/cloud/http_request.h b/third_party/xla/third_party/tsl/tsl/platform/cloud/http_request.h
index eb849129e138d6..a3a3136d66e6f7 100644
--- a/third_party/xla/third_party/tsl/tsl/platform/cloud/http_request.h
+++ b/third_party/xla/third_party/tsl/tsl/platform/cloud/http_request.h
@@ -184,7 +184,8 @@ class HttpRequest {
   virtual void SetTimeouts(uint32 connection, uint32 inactivity,
                            uint32 total) = 0;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(HttpRequest);
+  HttpRequest(const HttpRequest&) = delete;
+  void operator=(const HttpRequest&) = delete;
 };
 
 }  // namespace tsl
diff --git a/third_party/xla/third_party/tsl/tsl/platform/cloud/oauth_client.h b/third_party/xla/third_party/tsl/tsl/platform/cloud/oauth_client.h
index e3bf3b1033897b..895c2d01cd5f8f 100644
--- a/third_party/xla/third_party/tsl/tsl/platform/cloud/oauth_client.h
+++ b/third_party/xla/third_party/tsl/tsl/platform/cloud/oauth_client.h
@@ -55,7 +55,8 @@ class OAuthClient {
  private:
   std::unique_ptr<HttpRequest::Factory> http_request_factory_;
   Env* env_;
-  TF_DISALLOW_COPY_AND_ASSIGN(OAuthClient);
+  OAuthClient(const OAuthClient&) = delete;
+  void operator=(const OAuthClient&) = delete;
 };
 
 }  // namespace tsl
diff --git a/third_party/xla/third_party/tsl/tsl/platform/default/subprocess.h b/third_party/xla/third_party/tsl/tsl/platform/default/subprocess.h
index b415d487c1938b..5b6e1f31065a17 100644
--- a/third_party/xla/third_party/tsl/tsl/platform/default/subprocess.h
+++ b/third_party/xla/third_party/tsl/tsl/platform/default/subprocess.h
@@ -123,7 +123,8 @@ class SubProcess {
   int parent_pipe_[kNFds] TF_GUARDED_BY(data_mu_);
   int child_pipe_[kNFds] TF_GUARDED_BY(data_mu_);
 
-  TF_DISALLOW_COPY_AND_ASSIGN(SubProcess);
+  SubProcess(const SubProcess&) = delete;
+  void operator=(const SubProcess&) = delete;
 };
 
 }  // namespace tsl
diff --git a/third_party/xla/third_party/tsl/tsl/platform/denormal.h b/third_party/xla/third_party/tsl/tsl/platform/denormal.h
index a90e5442ae0b85..5b13ab1b0d752c 100644
--- a/third_party/xla/third_party/tsl/tsl/platform/denormal.h
+++ b/third_party/xla/third_party/tsl/tsl/platform/denormal.h
@@ -59,7 +59,9 @@ class ScopedRestoreFlushDenormalState {
 
  private:
   DenormalState denormal_state_;
-  TF_DISALLOW_COPY_AND_ASSIGN(ScopedRestoreFlushDenormalState);
+  ScopedRestoreFlushDenormalState(const ScopedRestoreFlushDenormalState&) =
+      delete;
+  void operator=(const ScopedRestoreFlushDenormalState&) = delete;
 };
 
 // While this class is active, denormal floating point numbers are flushed
@@ -70,7 +72,8 @@ class ScopedFlushDenormal {
 
  private:
   ScopedRestoreFlushDenormalState restore_;
-  TF_DISALLOW_COPY_AND_ASSIGN(ScopedFlushDenormal);
+  ScopedFlushDenormal(const ScopedFlushDenormal&) = delete;
+  void operator=(const ScopedFlushDenormal&) = delete;
 };
 
 // While this class is active, denormal floating point numbers are not flushed
@@ -81,7 +84,8 @@ class ScopedDontFlushDenormal {
 
  private:
   ScopedRestoreFlushDenormalState restore_;
-  TF_DISALLOW_COPY_AND_ASSIGN(ScopedDontFlushDenormal);
+  ScopedDontFlushDenormal(const ScopedDontFlushDenormal&) = delete;
+  void operator=(const ScopedDontFlushDenormal&) = delete;
 };
 
 }  // namespace port
diff --git a/third_party/xla/third_party/tsl/tsl/platform/env.h b/third_party/xla/third_party/tsl/tsl/platform/env.h
index 92bab5138d790a..fe3354c765a06f 100644
--- a/third_party/xla/third_party/tsl/tsl/platform/env.h
+++ b/third_party/xla/third_party/tsl/tsl/platform/env.h
@@ -490,7 +490,8 @@ class Env {
 
  private:
   std::unique_ptr<FileSystemRegistry> file_system_registry_;
-  TF_DISALLOW_COPY_AND_ASSIGN(Env);
+  Env(const Env&) = delete;
+  void operator=(const Env&) = delete;
 };
 
 /// \brief An implementation of Env that forwards all calls to another Env.
@@ -576,7 +577,8 @@ class Thread {
   virtual ~Thread();
 
  private:
-  TF_DISALLOW_COPY_AND_ASSIGN(Thread);
+  Thread(const Thread&) = delete;
+  void operator=(const Thread&) = delete;
 };
 
 /// \brief Cross-platform setenv.
diff --git a/third_party/xla/third_party/tsl/tsl/platform/file_system.h b/third_party/xla/third_party/tsl/tsl/platform/file_system.h
index b33d07f7fe23a3..76fab57f4b64b5 100644
--- a/third_party/xla/third_party/tsl/tsl/platform/file_system.h
+++ b/third_party/xla/third_party/tsl/tsl/platform/file_system.h
@@ -772,7 +772,8 @@ class RandomAccessFile {
 #endif
 
  private:
-  TF_DISALLOW_COPY_AND_ASSIGN(RandomAccessFile);
+  RandomAccessFile(const RandomAccessFile&) = delete;
+  void operator=(const RandomAccessFile&) = delete;
 };
 
 /// \brief A file abstraction for sequential writing.
@@ -845,7 +846,8 @@ class WritableFile {
   }
 
  private:
-  TF_DISALLOW_COPY_AND_ASSIGN(WritableFile);
+  WritableFile(const WritableFile&) = delete;
+  void operator=(const WritableFile&) = delete;
 };
 
 /// \brief A readonly memmapped file abstraction.
diff --git a/third_party/xla/third_party/tsl/tsl/platform/macros.h b/third_party/xla/third_party/tsl/tsl/platform/macros.h
index d9e1b46576f51f..cb91c4ff64e847 100644
--- a/third_party/xla/third_party/tsl/tsl/platform/macros.h
+++ b/third_party/xla/third_party/tsl/tsl/platform/macros.h
@@ -115,6 +115,7 @@ limitations under the License.
 #define TF_PREDICT_TRUE(x) (x)
 #endif
 
+// DEPRECATED: declare the deleted copy constructor and operator= directly.
 // A macro to disallow the copy constructor and operator= functions
 // This is usually placed in the private: declarations for a class.
 #define TF_DISALLOW_COPY_AND_ASSIGN(TypeName) \
diff --git a/third_party/xla/third_party/tsl/tsl/platform/profile_utils/android_armv7a_cpu_utils_helper.h b/third_party/xla/third_party/tsl/tsl/platform/profile_utils/android_armv7a_cpu_utils_helper.h
index 64557f55e03e5d..b178af8f8648d0 100644
--- a/third_party/xla/third_party/tsl/tsl/platform/profile_utils/android_armv7a_cpu_utils_helper.h
+++ b/third_party/xla/third_party/tsl/tsl/platform/profile_utils/android_armv7a_cpu_utils_helper.h
@@ -56,7 +56,8 @@ class AndroidArmV7ACpuUtilsHelper : public ICpuUtilsHelper {
   bool is_initialized_{false};
   int fd_{INVALID_FD};
 
-  TF_DISALLOW_COPY_AND_ASSIGN(AndroidArmV7ACpuUtilsHelper);
+  AndroidArmV7ACpuUtilsHelper(const AndroidArmV7ACpuUtilsHelper &) = delete;
+  void operator=(const AndroidArmV7ACpuUtilsHelper &) = delete;
 };
 
 }  // namespace profile_utils
diff --git a/third_party/xla/third_party/tsl/tsl/platform/profile_utils/clock_cycle_profiler.h b/third_party/xla/third_party/tsl/tsl/platform/profile_utils/clock_cycle_profiler.h
index 05546667df6b7f..364921abc58021 100644
--- a/third_party/xla/third_party/tsl/tsl/platform/profile_utils/clock_cycle_profiler.h
+++ b/third_party/xla/third_party/tsl/tsl/platform/profile_utils/clock_cycle_profiler.h
@@ -98,7 +98,8 @@ class ClockCycleProfiler {
   double worst_clock_cycle_{0.0};
   bool valid_{true};
 
-  TF_DISALLOW_COPY_AND_ASSIGN(ClockCycleProfiler);
+  ClockCycleProfiler(const ClockCycleProfiler&) = delete;
+  void operator=(const ClockCycleProfiler&) = delete;
 };
 
 }  // namespace tsl
diff --git a/third_party/xla/third_party/tsl/tsl/platform/profile_utils/cpu_utils.h b/third_party/xla/third_party/tsl/tsl/platform/profile_utils/cpu_utils.h
index 9fc277fb9993a5..a6b153cfa37bbf 100644
--- a/third_party/xla/third_party/tsl/tsl/platform/profile_utils/cpu_utils.h
+++ b/third_party/xla/third_party/tsl/tsl/platform/profile_utils/cpu_utils.h
@@ -164,7 +164,8 @@ class CpuUtils {
     int64_t CalculateCpuFrequency() final { return INVALID_FREQUENCY; }
 
    private:
-    TF_DISALLOW_COPY_AND_ASSIGN(DefaultCpuUtilsHelper);
+    DefaultCpuUtilsHelper(const DefaultCpuUtilsHelper&) = delete;
+    void operator=(const DefaultCpuUtilsHelper&) = delete;
   };
 
   // Return cpu frequency.
@@ -181,7 +182,8 @@ class CpuUtils {
   // 2. Minimize the overhead of acquiring ICpuUtilsHelper
   static ICpuUtilsHelper& GetCpuUtilsHelperSingletonInstance();
 
-  TF_DISALLOW_COPY_AND_ASSIGN(CpuUtils);
+  CpuUtils(const CpuUtils&) = delete;
+  void operator=(const CpuUtils&) = delete;
 };
 
 }  // namespace profile_utils
diff --git a/third_party/xla/third_party/tsl/tsl/platform/profile_utils/i_cpu_utils_helper.h b/third_party/xla/third_party/tsl/tsl/platform/profile_utils/i_cpu_utils_helper.h
index b76976377b63c1..03b1dd20f497cb 100644
--- a/third_party/xla/third_party/tsl/tsl/platform/profile_utils/i_cpu_utils_helper.h
+++ b/third_party/xla/third_party/tsl/tsl/platform/profile_utils/i_cpu_utils_helper.h
@@ -45,7 +45,8 @@ class ICpuUtilsHelper {
   virtual int64_t CalculateCpuFrequency() = 0;
 
  private:
-  TF_DISALLOW_COPY_AND_ASSIGN(ICpuUtilsHelper);
+  ICpuUtilsHelper(const ICpuUtilsHelper&) = delete;
+  void operator=(const ICpuUtilsHelper&) = delete;
 };
 
 }  // namespace profile_utils
diff --git a/third_party/xla/third_party/tsl/tsl/platform/ram_file_system.h b/third_party/xla/third_party/tsl/tsl/platform/ram_file_system.h
index 218276f1ffeac5..1b51653b716c3e 100644
--- a/third_party/xla/third_party/tsl/tsl/platform/ram_file_system.h
+++ b/third_party/xla/third_party/tsl/tsl/platform/ram_file_system.h
@@ -97,7 +97,8 @@ class RamRandomAccessFile : public RandomAccessFile, public WritableFile {
   }
 
  private:
-  TF_DISALLOW_COPY_AND_ASSIGN(RamRandomAccessFile);
+  RamRandomAccessFile(const RamRandomAccessFile&) = delete;
+  void operator=(const RamRandomAccessFile&) = delete;
   std::string name_;
   std::shared_ptr<std::string> data_;
 };
diff --git a/third_party/xla/third_party/tsl/tsl/platform/retrying_file_system.h b/third_party/xla/third_party/tsl/tsl/platform/retrying_file_system.h
index 75e9812e72a93a..591423b4fe3ec7 100644
--- a/third_party/xla/third_party/tsl/tsl/platform/retrying_file_system.h
+++ b/third_party/xla/third_party/tsl/tsl/platform/retrying_file_system.h
@@ -165,7 +165,8 @@ class RetryingFileSystem : public FileSystem {
   std::unique_ptr<Underlying> base_file_system_;
   const RetryConfig retry_config_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(RetryingFileSystem);
+  RetryingFileSystem(const RetryingFileSystem&) = delete;
+  void operator=(const RetryingFileSystem&) = delete;
 };
 
 namespace retrying_internals {
diff --git a/third_party/xla/third_party/tsl/tsl/platform/scanner.h b/third_party/xla/third_party/tsl/tsl/platform/scanner.h
index 5b726f0d535eab..2a53d57320cbe5 100644
--- a/third_party/xla/third_party/tsl/tsl/platform/scanner.h
+++ b/third_party/xla/third_party/tsl/tsl/platform/scanner.h
@@ -237,7 +237,8 @@ class Scanner {
 
   friend class ScannerTest;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(Scanner);
+  Scanner(const Scanner&) = delete;
+  void operator=(const Scanner&) = delete;
 };
 
 }  // namespace strings
diff --git a/third_party/xla/third_party/tsl/tsl/platform/setround.h b/third_party/xla/third_party/tsl/tsl/platform/setround.h
index e9700ca6eff450..adfc3fd2ee29fa 100644
--- a/third_party/xla/third_party/tsl/tsl/platform/setround.h
+++ b/third_party/xla/third_party/tsl/tsl/platform/setround.h
@@ -45,7 +45,8 @@ class ScopedSetRound {
  private:
   int original_mode_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(ScopedSetRound);
+  ScopedSetRound(const ScopedSetRound&) = delete;
+  void operator=(const ScopedSetRound&) = delete;
 };
 
 }  // namespace port
diff --git a/third_party/xla/third_party/tsl/tsl/platform/static_threadlocal.h b/third_party/xla/third_party/tsl/tsl/platform/static_threadlocal.h
index 80364cd3b9a401..f8535f1a7234b9 100644
--- a/third_party/xla/third_party/tsl/tsl/platform/static_threadlocal.h
+++ b/third_party/xla/third_party/tsl/tsl/platform/static_threadlocal.h
@@ -34,7 +34,8 @@ limitations under the License.
     bool is_native_tls() const { return true; }                    \
                                                                    \
    private:                                                        \
-    SE_DISALLOW_COPY_AND_ASSIGN(ThreadLocal_##_var_);              \
+    ThreadLocal_##_var_(const ThreadLocal_##_var_ &) = delete;     \
+    void operator=(const ThreadLocal_##_var_ &) = delete;          \
   } _var_;                                                         \
   }
 
diff --git a/third_party/xla/third_party/tsl/tsl/platform/statusor_test.cc b/third_party/xla/third_party/tsl/tsl/platform/statusor_test.cc
index d85149f0b66401..9cfc44de7167ed 100644
--- a/third_party/xla/third_party/tsl/tsl/platform/statusor_test.cc
+++ b/third_party/xla/third_party/tsl/tsl/platform/statusor_test.cc
@@ -545,7 +545,8 @@ class BenchmarkFactory {
 
  private:
   T* volatile value_;
-  TF_DISALLOW_COPY_AND_ASSIGN(BenchmarkFactory);
+  BenchmarkFactory(const BenchmarkFactory&) = delete;
+  void operator=(const BenchmarkFactory&) = delete;
 };
 
 // A simple type we use with the factory.
@@ -556,7 +557,8 @@ class BenchmarkType {
   virtual void DoWork() TF_ATTRIBUTE_NOINLINE {}
 
  private:
-  TF_DISALLOW_COPY_AND_ASSIGN(BenchmarkType);
+  BenchmarkType(const BenchmarkType&) = delete;
+  void operator=(const BenchmarkType&) = delete;
 };
 
 // Calibrate the amount of time spent just calling DoWork, since each of our
diff --git a/third_party/xla/third_party/tsl/tsl/platform/strcat.h b/third_party/xla/third_party/tsl/tsl/platform/strcat.h
index 8caa0fce1e4c68..198465b4cc0515 100644
--- a/third_party/xla/third_party/tsl/tsl/platform/strcat.h
+++ b/third_party/xla/third_party/tsl/tsl/platform/strcat.h
@@ -143,7 +143,8 @@ class AlphaNum {
   AlphaNum(char c);  // NOLINT(runtime/explicit)
 
   // NOLINTEND(google-explicit-constructor)
-  TF_DISALLOW_COPY_AND_ASSIGN(AlphaNum);
+  AlphaNum(const AlphaNum &) = delete;
+  void operator=(const AlphaNum &) = delete;
 };
 
 // ----------------------------------------------------------------------
diff --git a/third_party/xla/third_party/tsl/tsl/platform/threadpool.h b/third_party/xla/third_party/tsl/tsl/platform/threadpool.h
index 722478f1f63970..df650f6eccfd4c 100644
--- a/third_party/xla/third_party/tsl/tsl/platform/threadpool.h
+++ b/third_party/xla/third_party/tsl/tsl/platform/threadpool.h
@@ -235,7 +235,8 @@ class ThreadPool {
   // user_threadpool is not in the constructor.
   std::unique_ptr<Eigen::ThreadPoolTempl<EigenEnvironment>> eigen_threadpool_;
   std::unique_ptr<Eigen::ThreadPoolDevice> threadpool_device_;
-  TF_DISALLOW_COPY_AND_ASSIGN(ThreadPool);
+  ThreadPool(const ThreadPool&) = delete;
+  void operator=(const ThreadPool&) = delete;
 };
 
 }  // namespace thread
diff --git a/third_party/xla/third_party/tsl/tsl/platform/tracing.h b/third_party/xla/third_party/tsl/tsl/platform/tracing.h
index e5ab8f281b8357..90678dc5f60ada 100644
--- a/third_party/xla/third_party/tsl/tsl/platform/tracing.h
+++ b/third_party/xla/third_party/tsl/tsl/platform/tracing.h
@@ -129,7 +129,8 @@ class ScopedRegion {
   bool IsEnabled() const { return collector_ != nullptr; }
 
  private:
-  TF_DISALLOW_COPY_AND_ASSIGN(ScopedRegion);
+  ScopedRegion(const ScopedRegion&) = delete;
+  void operator=(const ScopedRegion&) = delete;
 
   const EventCollector* collector_;
 };
diff --git a/third_party/xla/third_party/tsl/tsl/platform/windows/subprocess.h b/third_party/xla/third_party/tsl/tsl/platform/windows/subprocess.h
index 2356dd9b0c34fd..05148e465e94c8 100644
--- a/third_party/xla/third_party/tsl/tsl/platform/windows/subprocess.h
+++ b/third_party/xla/third_party/tsl/tsl/platform/windows/subprocess.h
@@ -117,7 +117,8 @@ class SubProcess {
   ChannelAction action_[kNFds] TF_GUARDED_BY(data_mu_);
   void* parent_pipe_[kNFds] TF_GUARDED_BY(data_mu_);
 
-  TF_DISALLOW_COPY_AND_ASSIGN(SubProcess);
+  SubProcess(const SubProcess&) = delete;
+  void operator=(const SubProcess&) = delete;
 };
 
 }  // namespace tsl
diff --git a/third_party/xla/third_party/tsl/tsl/profiler/backends/cpu/annotation_stack.h b/third_party/xla/third_party/tsl/tsl/profiler/backends/cpu/annotation_stack.h
index da6fde9b0ba481..23bd5236f185bf 100644
--- a/third_party/xla/third_party/tsl/tsl/profiler/backends/cpu/annotation_stack.h
+++ b/third_party/xla/third_party/tsl/tsl/profiler/backends/cpu/annotation_stack.h
@@ -84,7 +84,8 @@ class AnnotationStack {
  private:
   AnnotationStack() = default;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(AnnotationStack);
+  AnnotationStack(const AnnotationStack&) = delete;
+  void operator=(const AnnotationStack&) = delete;
 
   // Returns a reference to the annotation for the current thread.
   static string* ThreadAnnotationStack();
diff --git a/third_party/xla/third_party/tsl/tsl/profiler/backends/cpu/traceme_recorder.h b/third_party/xla/third_party/tsl/tsl/profiler/backends/cpu/traceme_recorder.h
index 872c040cd8f042..20499e33ca11b4 100644
--- a/third_party/xla/third_party/tsl/tsl/profiler/backends/cpu/traceme_recorder.h
+++ b/third_party/xla/third_party/tsl/tsl/profiler/backends/cpu/traceme_recorder.h
@@ -112,7 +112,8 @@ class TraceMeRecorder {
 
   TraceMeRecorder() = default;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(TraceMeRecorder);
+  TraceMeRecorder(const TraceMeRecorder&) = delete;
+  void operator=(const TraceMeRecorder&) = delete;
 
   void RegisterThread(uint32 tid, std::shared_ptr<ThreadLocalRecorder> thread);
   void UnregisterThread(uint32 tid);
diff --git a/third_party/xla/third_party/tsl/tsl/profiler/lib/scoped_annotation.h b/third_party/xla/third_party/tsl/tsl/profiler/lib/scoped_annotation.h
index c324be40677887..643d7045428605 100644
--- a/third_party/xla/third_party/tsl/tsl/profiler/lib/scoped_annotation.h
+++ b/third_party/xla/third_party/tsl/tsl/profiler/lib/scoped_annotation.h
@@ -153,7 +153,8 @@ class ScopedAnnotationT {
   // signals that annotation is disabled at the constructor.
   static constexpr size_t kInvalidLength = static_cast<size_t>(-1);
 
-  TF_DISALLOW_COPY_AND_ASSIGN(ScopedAnnotationT);
+  ScopedAnnotationT(const ScopedAnnotationT&) = delete;
+  void operator=(const ScopedAnnotationT&) = delete;
 
   size_t old_length_ = kInvalidLength;
 };
diff --git a/third_party/xla/third_party/tsl/tsl/profiler/lib/traceme.h b/third_party/xla/third_party/tsl/tsl/profiler/lib/traceme.h
index c9224612136357..323659eadb62fe 100644
--- a/third_party/xla/third_party/tsl/tsl/profiler/lib/traceme.h
+++ b/third_party/xla/third_party/tsl/tsl/profiler/lib/traceme.h
@@ -307,7 +307,8 @@ class TraceMe {
   // Start time used when tracing is disabled.
   constexpr static int64_t kUntracedActivity = 0;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(TraceMe);
+  TraceMe(const TraceMe&) = delete;
+  void operator=(const TraceMe&) = delete;
 
   // Wrap the name into a union so that we can avoid the cost of string
   // initialization when tracing is disabled.
diff --git a/third_party/xla/third_party/tsl/tsl/profiler/lib/traceme_encode.h b/third_party/xla/third_party/tsl/tsl/profiler/lib/traceme_encode.h
index 15d82b45ef33ea..76c5f301e7d703 100644
--- a/third_party/xla/third_party/tsl/tsl/profiler/lib/traceme_encode.h
+++ b/third_party/xla/third_party/tsl/tsl/profiler/lib/traceme_encode.h
@@ -43,7 +43,8 @@ struct TraceMeArg {
              const absl::AlphaNum& v ABSL_ATTRIBUTE_LIFETIME_BOUND)
       : key(k), value(v.Piece()) {}
 
-  TF_DISALLOW_COPY_AND_ASSIGN(TraceMeArg);
+  TraceMeArg(const TraceMeArg&) = delete;
+  void operator=(const TraceMeArg&) = delete;
 
   absl::string_view key;
   absl::string_view value;
diff --git a/third_party/xla/third_party/tsl/tsl/util/reporter.h b/third_party/xla/third_party/tsl/tsl/util/reporter.h
index 973c70c0247095..d020e94fae1276 100644
--- a/third_party/xla/third_party/tsl/tsl/util/reporter.h
+++ b/third_party/xla/third_party/tsl/tsl/util/reporter.h
@@ -54,7 +54,8 @@ class TestReportFile {
   string fname_;
   string test_name_;
   std::unique_ptr<WritableFile> log_file_;
-  TF_DISALLOW_COPY_AND_ASSIGN(TestReportFile);
+  TestReportFile(const TestReportFile&) = delete;
+  void operator=(const TestReportFile&) = delete;
 };
 
 // The TestReporter writes test / benchmark output to binary Protobuf files when
@@ -124,7 +125,8 @@ class TestReporter {
   }
   TestReportFile report_file_;
   tensorflow::BenchmarkEntry benchmark_entry_;
-  TF_DISALLOW_COPY_AND_ASSIGN(TestReporter);
+  TestReporter(const TestReporter&) = delete;
+  void operator=(const TestReporter&) = delete;
 };
 
 }  // namespace tsl

From 043c8fc649a4929a2ca76b92ecb9ab08fc6cebc2 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 3 Oct 2023 02:01:53 -0700
Subject: [PATCH 514/567] Update GraphDef version to 1638.

PiperOrigin-RevId: 570316698
---
 tensorflow/core/public/version.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/core/public/version.h b/tensorflow/core/public/version.h
index 23ccff90b8d429..9e08d23aa58aa7 100644
--- a/tensorflow/core/public/version.h
+++ b/tensorflow/core/public/version.h
@@ -108,7 +108,7 @@ limitations under the License.
 
 #define TF_GRAPH_DEF_VERSION_MIN_PRODUCER 0
 #define TF_GRAPH_DEF_VERSION_MIN_CONSUMER 0
-#define TF_GRAPH_DEF_VERSION 1637  // Updated: 2023/10/2
+#define TF_GRAPH_DEF_VERSION 1638  // Updated: 2023/10/3
 
 // Checkpoint compatibility versions (the versions field in SavedSliceMeta).
 //

From 56603b2d2063b887e07f4038645c81972ae3afc7 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 3 Oct 2023 02:01:56 -0700
Subject: [PATCH 515/567] compat: Update forward compatibility horizon to
 2023-10-03

PiperOrigin-RevId: 570316707
---
 tensorflow/python/compat/compat.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/python/compat/compat.py b/tensorflow/python/compat/compat.py
index 65383715827778..f67844baea7f14 100644
--- a/tensorflow/python/compat/compat.py
+++ b/tensorflow/python/compat/compat.py
@@ -29,7 +29,7 @@
 # This value changes every day with an automatic CL. It can be modified in code
 # via `forward_compatibility_horizon()` or with the environment variable
 # TF_FORWARD_COMPATIBILITY_DELTA_DAYS, which is added to the compatibility date.
-_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2023, 10, 2)
+_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2023, 10, 3)
 _FORWARD_COMPATIBILITY_DELTA_DAYS_VAR_NAME = "TF_FORWARD_COMPATIBILITY_DELTA_DAYS"
 _FORWARD_COMPATIBILITY_DATE_NUMBER = None
 

From b492435349495fbab5e2ed6095f03e81ff9be17a Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 3 Oct 2023 02:53:22 -0700
Subject: [PATCH 516/567] Internal Code Change

PiperOrigin-RevId: 570326337
---
 .../experimental/acceleration/mini_benchmark/BUILD   | 12 ++----------
 1 file changed, 2 insertions(+), 10 deletions(-)

diff --git a/tensorflow/lite/experimental/acceleration/mini_benchmark/BUILD b/tensorflow/lite/experimental/acceleration/mini_benchmark/BUILD
index 667588a7b40ce4..23cedaae8879a4 100644
--- a/tensorflow/lite/experimental/acceleration/mini_benchmark/BUILD
+++ b/tensorflow/lite/experimental/acceleration/mini_benchmark/BUILD
@@ -694,11 +694,7 @@ cc_library(
         "mini_benchmark.cc",
     ],
     hdrs = ["mini_benchmark.h"],
-    visibility = [
-        "//tensorflow/lite/experimental/acceleration/mini_benchmark:__subpackages__",
-        "//tensorflow/lite/tools/benchmark:__subpackages__",
-        "@org_tensorflow_lite_support//tensorflow_lite_support/cc:__subpackages__",
-    ] + minibenchmark_visibility_allowlist(),
+    visibility = ["@org_tensorflow_lite_support//tensorflow_lite_support/cc:__subpackages__"] + minibenchmark_visibility_allowlist(),
     deps = [
         "//tensorflow/lite/acceleration/configuration:configuration_fbs",
         "@com_google_absl//absl/status:statusor",
@@ -712,11 +708,7 @@ cc_library(
     srcs = [
         "mini_benchmark_implementation.cc",
     ],
-    visibility = [
-        "//tensorflow/lite/experimental/acceleration/mini_benchmark:__subpackages__",
-        "//tensorflow/lite/tools/benchmark:__subpackages__",
-        "@org_tensorflow_lite_support//tensorflow_lite_support/cc:__subpackages__",
-    ] + minibenchmark_visibility_allowlist(),
+    visibility = ["@org_tensorflow_lite_support//tensorflow_lite_support/cc:__subpackages__"] + minibenchmark_visibility_allowlist(),
     deps = [
         ":fb_storage",
         ":mini_benchmark",

From 8e7d0609c58bfd64422dd180f0bf86883dde19fe Mon Sep 17 00:00:00 2001
From: Peter Hawkins <phawkins@google.com>
Date: Tue, 3 Oct 2023 06:59:07 -0700
Subject: [PATCH 517/567] [XLA] Split --xla_detailed_logging_and_dumping debug
 flag into --xla_detailed_logging and --xla_enable_dumping.

We want to suppress detailed logging (notably on TPU, which has pretty verbose detailed logging) separately from disabling HLO dumps. Even if we don't print detailed log information, it's quite surprising if an HLO module doesn't show up in the set of modules dumped by XLA.

PiperOrigin-RevId: 570374492
---
 tensorflow/compiler/jit/device_compiler_client.cc  |  4 +++-
 .../compiler/jit/device_compiler_client_test.cc    |  4 ++--
 tensorflow/compiler/jit/get_compiler_ir.cc         |  4 +++-
 third_party/xla/xla/debug_options_flags.cc         |  3 ++-
 third_party/xla/xla/python/xla_client.py           |  2 +-
 third_party/xla/xla/python/xla_compiler.cc         |  7 ++++---
 .../xla/xla/python/xla_extension/__init__.pyi      |  2 ++
 third_party/xla/xla/service/dump.cc                |  2 +-
 third_party/xla/xla/xla.proto                      | 14 +++++++++-----
 9 files changed, 27 insertions(+), 15 deletions(-)

diff --git a/tensorflow/compiler/jit/device_compiler_client.cc b/tensorflow/compiler/jit/device_compiler_client.cc
index e84906126b5899..747aa7de090125 100644
--- a/tensorflow/compiler/jit/device_compiler_client.cc
+++ b/tensorflow/compiler/jit/device_compiler_client.cc
@@ -35,7 +35,9 @@ xla::ExecutableBuildOptions GetExecutableBuildOptions(
   build_options.set_result_layout(result.xla_output_shape);
   build_options.set_device_allocator(options.device_allocator.get());
   build_options.set_alias_passthrough_params(options.alias_passthrough_params);
-  build_options.mutable_debug_options()->set_xla_detailed_logging_and_dumping(
+  build_options.mutable_debug_options()->set_xla_detailed_logging(
+      options.detailed_logging);
+  build_options.mutable_debug_options()->set_xla_enable_dumping(
       options.detailed_logging);
   if (tensorflow::OpDeterminismRequired()) {
     build_options.mutable_debug_options()->set_xla_gpu_deterministic_ops(true);
diff --git a/tensorflow/compiler/jit/device_compiler_client_test.cc b/tensorflow/compiler/jit/device_compiler_client_test.cc
index 104b0b0f651b91..4ac2e7f1a205d1 100644
--- a/tensorflow/compiler/jit/device_compiler_client_test.cc
+++ b/tensorflow/compiler/jit/device_compiler_client_test.cc
@@ -36,8 +36,8 @@ TEST(GetExecutableOptionTest, Basic) {
   EXPECT_EQ(build_option.result_layout()->ToString(),
             xla_output_shape.ToString());
   EXPECT_EQ(build_option.alias_passthrough_params(), true);
-  EXPECT_EQ(build_option.debug_options().xla_detailed_logging_and_dumping(),
-            true);
+  EXPECT_EQ(build_option.debug_options().xla_detailed_logging(), true);
+  EXPECT_EQ(build_option.debug_options().xla_enable_dumping(), true);
 }
 
 TEST(GetExecutableOptionTest, DefaultDeviceOrdinal) {
diff --git a/tensorflow/compiler/jit/get_compiler_ir.cc b/tensorflow/compiler/jit/get_compiler_ir.cc
index 531a51a750fce6..37987cb55ecf38 100644
--- a/tensorflow/compiler/jit/get_compiler_ir.cc
+++ b/tensorflow/compiler/jit/get_compiler_ir.cc
@@ -72,7 +72,9 @@ static StatusOr<std::unique_ptr<xla::LocalExecutable>> BuildExecutable(
   build_options.set_result_layout(result.xla_output_shape);
   build_options.set_device_allocator(options.device_allocator.get());
   build_options.set_alias_passthrough_params(options.alias_passthrough_params);
-  build_options.mutable_debug_options()->set_xla_detailed_logging_and_dumping(
+  build_options.mutable_debug_options()->set_xla_detailed_logging(
+      options.detailed_logging);
+  build_options.mutable_debug_options()->set_xla_enable_dumping(
       options.detailed_logging);
   // If the embed_ir_in_executable is set, hlo_proto will be dumped in
   // executable. The hlo_proto contains HLO modules and buffer assignment.
diff --git a/third_party/xla/xla/debug_options_flags.cc b/third_party/xla/xla/debug_options_flags.cc
index e2d27b6ff245bf..cf9dcb1c6e835f 100644
--- a/third_party/xla/xla/debug_options_flags.cc
+++ b/third_party/xla/xla/debug_options_flags.cc
@@ -125,7 +125,8 @@ DebugOptions DefaultDebugOptionsIgnoringFlags() {
   opts.set_xla_cpu_enable_xprof_traceme(false);
   opts.set_xla_gpu_unsafe_fallback_to_driver_on_ptxas_not_found(false);
   opts.set_xla_multiheap_size_constraint_per_heap(-1);
-  opts.set_xla_detailed_logging_and_dumping(true);
+  opts.set_xla_detailed_logging(true);
+  opts.set_xla_enable_dumping(true);
 
   opts.set_xla_gpu_enable_xla_runtime_executable(true);
   opts.set_xla_gpu_nccl_termination_timeout_seconds(-1);
diff --git a/third_party/xla/xla/python/xla_client.py b/third_party/xla/xla/python/xla_client.py
index 1f7977e903abee..15749cc5bc6465 100644
--- a/third_party/xla/xla/python/xla_client.py
+++ b/third_party/xla/xla/python/xla_client.py
@@ -45,7 +45,7 @@
 
 # Just an internal arbitrary increasing number to help with backward-compatible
 # changes. In JAX, reference this via jax._src.lib.xla_extension_version.
-_version = 200
+_version = 201
 
 # Version number for MLIR:Python components.
 mlir_api_version = 54
diff --git a/third_party/xla/xla/python/xla_compiler.cc b/third_party/xla/xla/python/xla_compiler.cc
index 4b752c02698344..335795fa048cd3 100644
--- a/third_party/xla/xla/python/xla_compiler.cc
+++ b/third_party/xla/xla/python/xla_compiler.cc
@@ -831,9 +831,10 @@ void BuildXlaCompilerSubmodule(py::module& m) {
       .def_property("xla_cpu_fast_math_honor_functions",
                     &DebugOptions::xla_cpu_fast_math_honor_functions,
                     &DebugOptions::set_xla_cpu_fast_math_honor_functions)
-      .def_property("xla_detailed_logging_and_dumping",
-                    &DebugOptions::xla_detailed_logging_and_dumping,
-                    &DebugOptions::set_xla_detailed_logging_and_dumping)
+      .def_property("xla_detailed_logging", &DebugOptions::xla_detailed_logging,
+                    &DebugOptions::set_xla_detailed_logging)
+      .def_property("xla_enable_dumping", &DebugOptions::xla_enable_dumping,
+                    &DebugOptions::set_xla_enable_dumping)
       .def_property("xla_gpu_enable_fast_min_max",
                     &DebugOptions::xla_gpu_enable_fast_min_max,
                     &DebugOptions::set_xla_gpu_enable_fast_min_max)
diff --git a/third_party/xla/xla/python/xla_extension/__init__.pyi b/third_party/xla/xla/python/xla_extension/__init__.pyi
index 71b9eccb1e1ace..acc2b7656a35c2 100644
--- a/third_party/xla/xla/python/xla_extension/__init__.pyi
+++ b/third_party/xla/xla/python/xla_extension/__init__.pyi
@@ -277,6 +277,8 @@ class DebugOptions:
   xla_gpu_enable_async_collective_permute: bool
   xla_gpu_enable_async_all_to_all: bool
   xla_gpu_enable_async_reduce_scatter: bool
+  xla_detailed_logging: bool
+  xla_enable_dumping: bool
 
 class CompiledMemoryStats:
   generated_code_size_in_bytes: int
diff --git a/third_party/xla/xla/service/dump.cc b/third_party/xla/xla/service/dump.cc
index 83e67f1f7da65f..65a09e9fcbb165 100644
--- a/third_party/xla/xla/service/dump.cc
+++ b/third_party/xla/xla/service/dump.cc
@@ -85,7 +85,7 @@ struct CanonicalDebugOptions {
     }
 
     // Disable dumping if specified by the user.
-    if (!opts.xla_detailed_logging_and_dumping()) {
+    if (!opts.xla_enable_dumping()) {
       dump_to = "";
     }
 
diff --git a/third_party/xla/xla/xla.proto b/third_party/xla/xla/xla.proto
index af4172ed2f73c2..2304d4568b14b6 100644
--- a/third_party/xla/xla/xla.proto
+++ b/third_party/xla/xla/xla.proto
@@ -350,10 +350,14 @@ message DebugOptions {
   // reached.
   int32 xla_multiheap_size_constraint_per_heap = 142;
 
-  // Enable detailed logging into vlog and xla dumping. If this is disabled, no
-  // compilation summary will be printed in the end of computation and no hlo
-  // modules will be dumped.
-  bool xla_detailed_logging_and_dumping = 143;
+  reserved 143;  // Was xla_detailed_logging_and_dumping
+
+  // Enable detailed logging into vlog. If this is disabled, no
+  // compilation summary will be printed at the end of computation.
+  bool xla_detailed_logging = 252;
+
+  // Enable HLO dumping. If this is disabled, no HLO modules will be dumped.
+  bool xla_enable_dumping = 253;
 
   // Overrides normal multi-threaded compilation setting to use this many
   // threads. Setting to 0 (the default value) means no enforcement.
@@ -633,7 +637,7 @@ message DebugOptions {
   // Maximum number of buffers to print when debugging buffer assignment.
   int64 xla_debug_buffer_assignment_show_max = 251;
 
-  // Next id: 252
+  // Next id: 254
 
   // Extra options to pass to the compilation backend (e.g. LLVM); specific
   // interpretation of these values is left to the backend.

From 1a0f99181cab73830a254650fbc9df37c2b21300 Mon Sep 17 00:00:00 2001
From: Mason Chang <masonchang@google.com>
Date: Tue, 3 Oct 2023 07:41:44 -0700
Subject: [PATCH 518/567] Copy/paste AddGraphExportLoweringPasses into API/v1
 in preparation to call it from bridge.cc.

PiperOrigin-RevId: 570384568
---
 tensorflow/compiler/mlir/tf2xla/api/v1/BUILD  |  22 ++++
 .../tf2xla/api/v1/testdata/empty_func.mlir    |   5 +
 .../api/v1/testdata/invalid_executor.mlir     |  17 +++
 .../tf2xla/api/v1/tf_dialect_to_executor.cc   | 106 +++++++++++++++++-
 .../tf2xla/api/v1/tf_dialect_to_executor.h    |   4 +-
 .../api/v1/tf_dialect_to_executor_test.cc     |  59 +++++++++-
 6 files changed, 207 insertions(+), 6 deletions(-)
 create mode 100644 tensorflow/compiler/mlir/tf2xla/api/v1/testdata/empty_func.mlir
 create mode 100644 tensorflow/compiler/mlir/tf2xla/api/v1/testdata/invalid_executor.mlir

diff --git a/tensorflow/compiler/mlir/tf2xla/api/v1/BUILD b/tensorflow/compiler/mlir/tf2xla/api/v1/BUILD
index f9a5cac4e5ed29..b160d46a24b225 100644
--- a/tensorflow/compiler/mlir/tf2xla/api/v1/BUILD
+++ b/tensorflow/compiler/mlir/tf2xla/api/v1/BUILD
@@ -172,8 +172,20 @@ cc_library(
     srcs = ["tf_dialect_to_executor.cc"],
     hdrs = ["tf_dialect_to_executor.h"],
     deps = [
+        "//tensorflow/compiler/jit:flags_headers",
+        "//tensorflow/compiler/mlir/tensorflow:bridge_logger",
+        "//tensorflow/compiler/mlir/tensorflow:dump_mlir_util",
+        "//tensorflow/compiler/mlir/tensorflow:error_util",
+        "//tensorflow/compiler/mlir/tensorflow/transforms:verify_no_outside_compilation_markers_pass",
+        "//tensorflow/core:framework",
         "//tensorflow/core/platform:status",
+        "@com_google_absl//absl/status",
+        "@llvm-project//llvm:Support",
+        "@llvm-project//mlir:FuncDialect",
         "@llvm-project//mlir:IR",
+        "@llvm-project//mlir:Pass",
+        "@llvm-project//mlir:Support",
+        "@llvm-project//mlir:Transforms",
         "@local_tsl//tsl/platform:status",
     ],
 )
@@ -181,10 +193,20 @@ cc_library(
 tf_cc_test(
     name = "tf_dialect_to_executor_test",
     srcs = ["tf_dialect_to_executor_test.cc"],
+    data = [
+        "testdata/empty_func.mlir",
+        "testdata/invalid_executor.mlir",
+    ],
     deps = [
         ":tf_dialect_to_executor",
+        "//tensorflow/compiler/mlir:register_common_dialects",
+        "//tensorflow/core/platform:resource_loader",
+        "@com_google_absl//absl/status",
+        "@com_google_absl//absl/strings",
         "@com_google_googletest//:gtest_main",
         "@llvm-project//mlir:IR",
+        "@llvm-project//mlir:Parser",
         "@local_tsl//tsl/lib/core:status_test_util",
+        "@local_tsl//tsl/platform:status",
     ],
 )
diff --git a/tensorflow/compiler/mlir/tf2xla/api/v1/testdata/empty_func.mlir b/tensorflow/compiler/mlir/tf2xla/api/v1/testdata/empty_func.mlir
new file mode 100644
index 00000000000000..e0fd01c86d182f
--- /dev/null
+++ b/tensorflow/compiler/mlir/tf2xla/api/v1/testdata/empty_func.mlir
@@ -0,0 +1,5 @@
+module attributes {tf.versions = {bad_consumers = [], min_consumer = 0 : i32, producer = 268 : i32}} {
+  func.func @main() -> () {
+    func.return
+  }
+}
\ No newline at end of file
diff --git a/tensorflow/compiler/mlir/tf2xla/api/v1/testdata/invalid_executor.mlir b/tensorflow/compiler/mlir/tf2xla/api/v1/testdata/invalid_executor.mlir
new file mode 100644
index 00000000000000..fd493ecafb0436
--- /dev/null
+++ b/tensorflow/compiler/mlir/tf2xla/api/v1/testdata/invalid_executor.mlir
@@ -0,0 +1,17 @@
+module attributes {tf.versions = {bad_consumers = [], min_consumer = 0 : i32, producer = 268 : i32}} {
+  func.func @main() {
+    tf_executor.graph {
+      %control = tf_executor.island {
+        tf_executor.yield
+      }
+      tf_executor.fetch %control : !tf_executor.control
+    }
+    tf_executor.graph {
+      %control = tf_executor.island {
+        tf_executor.yield
+      }
+      tf_executor.fetch %control : !tf_executor.control
+    }
+    return
+  }
+}
\ No newline at end of file
diff --git a/tensorflow/compiler/mlir/tf2xla/api/v1/tf_dialect_to_executor.cc b/tensorflow/compiler/mlir/tf2xla/api/v1/tf_dialect_to_executor.cc
index dc6a190bd35d28..05fceb6fdc4ceb 100644
--- a/tensorflow/compiler/mlir/tf2xla/api/v1/tf_dialect_to_executor.cc
+++ b/tensorflow/compiler/mlir/tf2xla/api/v1/tf_dialect_to_executor.cc
@@ -15,18 +15,120 @@ limitations under the License.
 
 #include "tensorflow/compiler/mlir/tf2xla/api/v1/tf_dialect_to_executor.h"
 
+#include <memory>
+#include <string>
+#include <utility>
+
+#include "llvm/ADT/StringRef.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"  // from @llvm-project
 #include "mlir/IR/BuiltinOps.h"  // from @llvm-project
+#include "mlir/Pass/PassManager.h"  // from @llvm-project
+#include "mlir/Support/LogicalResult.h"  // from @llvm-project
+#include "mlir/Transforms/Passes.h"  // from @llvm-project
+#include "tensorflow/compiler/jit/flags.h"
+#include "tensorflow/compiler/mlir/tensorflow/transforms/passes.h"
+#include "tensorflow/compiler/mlir/tensorflow/utils/data_dumper_logger_config.h"
+#include "tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.h"
+#include "tensorflow/compiler/mlir/tensorflow/utils/error_util.h"
 #include "tensorflow/core/platform/status.h"
+#include "tensorflow/core/util/debug_data_dumper.h"
 #include "tsl/platform/status.h"
 
 namespace tensorflow {
 namespace tf2xla {
 namespace v1 {
 
+using mlir::LogicalResult;
 using mlir::ModuleOp;
+using mlir::OpPassManager;
+using mlir::Pass;
+using mlir::PassManager;
+using mlir::func::FuncOp;
+
+namespace {
+
+void AddTfDialectToExecutorPasses(OpPassManager &pm) {
+  auto add_pass = [&](std::unique_ptr<Pass> pass) {
+    pm.addNestedPass<FuncOp>(std::move(pass));
+    pm.addPass(mlir::CreateBreakUpIslandsPass());
+  };
+
+  pm.addPass(mlir::TF::CreateTFRegionControlFlowToFunctional());
+  add_pass(mlir::CreateFunctionalToExecutorDialectConversionPass());
+  add_pass(mlir::TFDevice::CreateReplicateToIslandPass(
+      /*legacy_graph_export=*/true));
+  add_pass(mlir::TFDevice::CreateReplicaIDToDeviceOrdinalPass());
+  add_pass(mlir::TFDevice::CreateParallelExecuteToIslandsPass(
+      /*legacy_graph_export=*/true));
+  add_pass(mlir::TFDevice::CreateLaunchToDeviceAttributePass(
+      /*legacy_graph_export=*/true));
+  pm.addNestedPass<FuncOp>(mlir::TFTPU::CreateTPUDevicePropagationPass());
+  pm.addNestedPass<FuncOp>(mlir::TFTPU::CreateTPUColocateSplitsPass());
+  pm.addPass(mlir::createSymbolDCEPass());
+  if (tensorflow::GetMlirCommonFlags()
+          ->tf_mlir_enable_convert_control_to_data_outputs_pass) {
+    pm.addPass(
+        mlir::tf_executor::CreateTFExecutorConvertControlToDataOutputsPass());
+  }
+  pm.addPass(mlir::TF::CreateVerifySuitableForExportPass());
+}
+
+// Attaches an IR-printing logger to the bridge pass manager. Multithreading is
+// disabled on the context first because whole-module printing requires it.
+void EnableDetailedLogging(PassManager *pm,
+                           llvm::StringRef module_name = llvm::StringRef()) {
+  // Print the whole module after each pass, which requires disabling
+  // multi-threading as well.
+  pm->getContext()->disableMultithreading();
+  pm->enableIRPrinting(std::make_unique<::tensorflow::DataDumperLoggerConfig>(
+      [module_name](const std::string &pass_tag_name, mlir::Operation *op) {
+        return DEBUG_DATA_DUMPER()->GetDumpFilename(
+            module_name.str(), kDebugGroupBridgePhase1, pass_tag_name);
+      },
+      "",
+      /*print_module_scope=*/true));
+}
+
+}  // namespace
+
+tensorflow::Status ExportFromTensorflowDialectToExecutor(
+    ModuleOp module, llvm::StringRef module_name) {
+  PassManager tf_to_executor(module.getContext());
+  ::tensorflow::applyTensorflowAndCLOptions(tf_to_executor);
+
+  AddTfDialectToExecutorPasses(tf_to_executor);
+
+  mlir::StatusScopedDiagnosticHandler diag_handler(
+      module.getContext(), /*propagate=*/false,
+      /*filter_stack=*/!VLOG_IS_ON(1));
+
+  if (VLOG_IS_ON(1) ||
+      DEBUG_DATA_DUMPER()->ShouldDump(module_name.str(), kDebugGroupMain)) {
+    ::tensorflow::DumpMlirOpToFile(
+        DEBUG_DATA_DUMPER()->GetDumpFilename(
+            module_name.str(), kDebugGroupMain,
+            "tfxla_bridge_v1_tfdialect_to_executor_before"),
+        module, llvm::StringRef(), &tf_to_executor);
+
+    if (VLOG_IS_ON(2) || DEBUG_DATA_DUMPER()->ShouldDump(
+                             module_name.str(), kDebugGroupBridgePhase1)) {
+      EnableDetailedLogging(&tf_to_executor, module_name);
+    }
+  }
+
+  LogicalResult result = tf_to_executor.run(module);
+  (void)result;  // Ignore the error since it's captured by the diag_handler.
+
+  if (VLOG_IS_ON(1) ||
+      DEBUG_DATA_DUMPER()->ShouldDump(module_name.str(), kDebugGroupMain)) {
+    ::tensorflow::DumpMlirOpToFile(
+        DEBUG_DATA_DUMPER()->GetDumpFilename(
+            module_name.str(), kDebugGroupMain,
+            "tfxla_bridge_v1_tfdialect_to_executor_after"),
+        module, llvm::StringRef(), &tf_to_executor);
+  }
 
-tensorflow::Status ExportFromTensorflowDialectToExecutor(ModuleOp module) {
-  return tsl::OkStatus();
+  return diag_handler.ConsumeStatus();
 }
 
 }  // namespace v1
diff --git a/tensorflow/compiler/mlir/tf2xla/api/v1/tf_dialect_to_executor.h b/tensorflow/compiler/mlir/tf2xla/api/v1/tf_dialect_to_executor.h
index 06ddc799bf75ad..5822674734b05a 100644
--- a/tensorflow/compiler/mlir/tf2xla/api/v1/tf_dialect_to_executor.h
+++ b/tensorflow/compiler/mlir/tf2xla/api/v1/tf_dialect_to_executor.h
@@ -16,6 +16,7 @@ limitations under the License.
 #ifndef TENSORFLOW_COMPILER_MLIR_TF2XLA_API_V1_TF_DIALECT_TO_EXECUTOR_H_
 #define TENSORFLOW_COMPILER_MLIR_TF2XLA_API_V1_TF_DIALECT_TO_EXECUTOR_H_
 
+#include "llvm/ADT/StringRef.h"
 #include "mlir/IR/BuiltinOps.h"  // from @llvm-project
 #include "tensorflow/core/platform/status.h"
 
@@ -35,7 +36,8 @@ namespace v1 {
 // Input: A MLIR Module in the Tensorflow Dialect with no
 // `tf_device.cluster_func` ops.
 // Output: A MLIR module in the Tensorflow Executor Dialect.
-tensorflow::Status ExportFromTensorflowDialectToExecutor(mlir::ModuleOp module);
+tensorflow::Status ExportFromTensorflowDialectToExecutor(
+    mlir::ModuleOp module, llvm::StringRef module_name = llvm::StringRef());
 
 }  // namespace v1
 }  // namespace tf2xla
diff --git a/tensorflow/compiler/mlir/tf2xla/api/v1/tf_dialect_to_executor_test.cc b/tensorflow/compiler/mlir/tf2xla/api/v1/tf_dialect_to_executor_test.cc
index eb87a7ae80444f..80a770169a88b2 100644
--- a/tensorflow/compiler/mlir/tf2xla/api/v1/tf_dialect_to_executor_test.cc
+++ b/tensorflow/compiler/mlir/tf2xla/api/v1/tf_dialect_to_executor_test.cc
@@ -15,9 +15,20 @@ limitations under the License.
 
 #include "tensorflow/compiler/mlir/tf2xla/api/v1/tf_dialect_to_executor.h"
 
+#include <string>
+
 #include <gtest/gtest.h>
+#include "absl/status/status.h"
+#include "absl/strings/str_cat.h"
 #include "mlir/IR/BuiltinOps.h"  // from @llvm-project
+#include "mlir/IR/DialectRegistry.h"  // from @llvm-project
+#include "mlir/IR/MLIRContext.h"  // from @llvm-project
+#include "mlir/IR/OwningOpRef.h"  // from @llvm-project
+#include "mlir/Parser/Parser.h"  // from @llvm-project
+#include "tensorflow/compiler/mlir/register_common_dialects.h"
+#include "tensorflow/core/platform/resource_loader.h"
 #include "tsl/lib/core/status_test_util.h"
+#include "tsl/platform/status.h"
 
 namespace tensorflow {
 namespace tf2xla {
@@ -26,9 +37,51 @@ namespace {
 
 using mlir::ModuleOp;
 
-TEST(TensorflowDialectToExecutor, ConvertsToExecutor) {
-  ModuleOp module;
-  TF_ASSERT_OK(ExportFromTensorflowDialectToExecutor(module));
+using mlir::DialectRegistry;
+using mlir::MLIRContext;
+using mlir::ModuleOp;
+using mlir::OwningOpRef;
+
+std::string TestDataPath() {
+  return tensorflow::GetDataDependencyFilepath(
+      "tensorflow/compiler/mlir/tf2xla/api/v1/testdata/");
+}
+
+class TensorflowDialectToExecutorTest : public ::testing::Test {
+ public:
+  TensorflowDialectToExecutorTest() {
+    mlir::RegisterCommonToolingDialects(registry_);
+    context_.appendDialectRegistry(registry_);
+    context_.loadAllAvailableDialects();
+  }
+
+  tsl::Status CreateMlirModule(std::string mlir_module_filename) {
+    std::string mlir_module_path = TestDataPath() + mlir_module_filename;
+    mlir_module_ =
+        mlir::parseSourceFile<mlir::ModuleOp>(mlir_module_path, &context_);
+    if (!mlir_module_) {
+      return tsl::Status(
+          absl::StatusCode::kNotFound,
+          absl::StrCat("Could not find MLIR module at ", mlir_module_path));
+    }
+    return tsl::OkStatus();
+  }
+
+  DialectRegistry registry_;
+  MLIRContext context_;
+  OwningOpRef<mlir::ModuleOp> mlir_module_;
+};
+
+TEST_F(TensorflowDialectToExecutorTest, ConvertsToExecutor) {
+  TF_ASSERT_OK(CreateMlirModule("empty_func.mlir"));
+
+  TF_EXPECT_OK(ExportFromTensorflowDialectToExecutor(*mlir_module_));
+}
+
+TEST_F(TensorflowDialectToExecutorTest, ErrorsWhenCannotConvert) {
+  TF_ASSERT_OK(CreateMlirModule("invalid_executor.mlir"));
+
+  EXPECT_FALSE(ExportFromTensorflowDialectToExecutor(*mlir_module_).ok());
 }
 
 }  // namespace

From 7f3422962bfc779a9c03fe0493678159aecab12b Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 3 Oct 2023 07:54:17 -0700
Subject: [PATCH 519/567] Propagate shape into functions by inferred type when
 input type is ranked

PiperOrigin-RevId: 570387390
---
 .../tensorflow/tests/shape_inference.mlir     | 14 ++++++++++++
 .../tensorflow/transforms/shape_inference.cc  | 22 +++++++++++++++++--
 2 files changed, 34 insertions(+), 2 deletions(-)

diff --git a/tensorflow/compiler/mlir/tensorflow/tests/shape_inference.mlir b/tensorflow/compiler/mlir/tensorflow/tests/shape_inference.mlir
index f84260faf8338b..4e6a2cd8bdbacf 100644
--- a/tensorflow/compiler/mlir/tensorflow/tests/shape_inference.mlir
+++ b/tensorflow/compiler/mlir/tensorflow/tests/shape_inference.mlir
@@ -2173,4 +2173,18 @@ module attributes {tf.versions = {bad_consumers = [], min_consumer = 0 : i32, pr
     %6 = "tf.TensorListSetItem"(%if49, %4, %5) {device = ""} : (!tf_variant, tensor<i32>, tensor<2x2xf32>)-> tensor<*x!tf_type.variant>
     func.return
   }
+
+  // Test that shapes are propagated to functions by inferred type.
+  // CHECK-LABEL: func @pcall_fnc
+  func.func @pcall_fnc(%arg0: tensor<?x?x8xf32>) {
+    // CHECK: "tf.StatefulPartitionedCall"
+    // CHECK-SAME: (tensor<?x?x8xf32>) -> tensor<4x4x8xf32>
+    %0 = "tf.StatefulPartitionedCall"(%arg0) {config = "", config_proto = "", executor_type = "", f = @pcall_callee_result_func} : (tensor<?x?x8xf32>) -> tensor<?x?x8xf32>
+    func.return
+  }
+  // CHECK-LABEL: func @pcall_callee_result_func
+  // CHECK-SAME: tensor<4x4x8xf32>
+  func.func @pcall_callee_result_func(%arg0: tensor<4x4x?xf32>) -> (tensor<4x4x?xf32>) {
+    func.return %arg0: tensor<4x4x?xf32>
+  }
 }
diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/shape_inference.cc b/tensorflow/compiler/mlir/tensorflow/transforms/shape_inference.cc
index 1bc881e7ebcac6..bfc0c23af6f3c5 100644
--- a/tensorflow/compiler/mlir/tensorflow/transforms/shape_inference.cc
+++ b/tensorflow/compiler/mlir/tensorflow/transforms/shape_inference.cc
@@ -2631,11 +2631,29 @@ FailureOr<bool> ShapeInference::PropagateShapeToFunctions(
       continue;
     }
     FunctionType func_type = func.getFunctionType();
-    func.setType(FunctionType::get(func.getContext(), input_types,
+    ArrayRef<Type> func_inputs = func_type.getInputs();
+    llvm::SmallVector<Type> infer_types;
+    for (auto it : llvm::zip(input_types, func_inputs)) {
+      auto input_type = std::get<0>(it);
+      auto func_input = std::get<1>(it);
+      Type infer_type = TypeMeet(input_type, func_input);
+      auto input_type_shape = input_type.dyn_cast<ShapedType>();
+      // TODO: Always use infer_type, even if input_type is unranked.
+      // We cannot do this currently because it will expose some bugs in
+      // multiple tests, either due to incorrect function signatures or
+      // unsupported type.
+      if (input_type_shape && input_type_shape.hasRank()) {
+        infer_types.push_back(infer_type);
+      } else {
+        infer_types.push_back(input_type);
+      }
+    }
+
+    func.setType(FunctionType::get(func.getContext(), infer_types,
                                    func_type.getResults()));
 
     FailureOr<bool> failure_or_converged =
-        PropagateShapeToRegions(input_types, {&func.getBody()}, max_iterations);
+        PropagateShapeToRegions(infer_types, {&func.getBody()}, max_iterations);
     if (failed(failure_or_converged)) {
       any_failure = true;
       continue;

From 079edd0a8affc7f5ad2fab46ee85c943179626d2 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 3 Oct 2023 08:00:41 -0700
Subject: [PATCH 520/567] Integrate LLVM at llvm/llvm-project@3661a48a8473

Updates LLVM usage to match
[3661a48a8473](https://github.com/llvm/llvm-project/commit/3661a48a8473)

PiperOrigin-RevId: 570388799
---
 third_party/llvm/generated.patch | 18 ++++++++++++++++++
 third_party/llvm/workspace.bzl   |  4 ++--
 2 files changed, 20 insertions(+), 2 deletions(-)

diff --git a/third_party/llvm/generated.patch b/third_party/llvm/generated.patch
index 509398da979e83..32ff7d1eefe70d 100644
--- a/third_party/llvm/generated.patch
+++ b/third_party/llvm/generated.patch
@@ -1 +1,19 @@
 Auto generated patch. Do not edit or delete it, even if empty.
+diff -ruN --strip-trailing-cr a/clang/lib/AST/Interp/Source.h b/clang/lib/AST/Interp/Source.h
+--- a/clang/lib/AST/Interp/Source.h
++++ b/clang/lib/AST/Interp/Source.h
+@@ -14,12 +14,12 @@
+ #define LLVM_CLANG_AST_INTERP_SOURCE_H
+ 
+ #include "PrimType.h"
++#include "clang/AST/DeclBase.h"
++#include "clang/AST/Stmt.h"
+ #include "llvm/ADT/PointerUnion.h"
+ #include "llvm/Support/Endian.h"
+ 
+ namespace clang {
+-class Stmt;
+-class Decl;
+ class Expr;
+ class SourceLocation;
+ class SourceRange;
diff --git a/third_party/llvm/workspace.bzl b/third_party/llvm/workspace.bzl
index 1a35cc3c8dd812..a8b7276da6f085 100644
--- a/third_party/llvm/workspace.bzl
+++ b/third_party/llvm/workspace.bzl
@@ -4,8 +4,8 @@ load("//third_party:repo.bzl", "tf_http_archive")
 
 def repo(name):
     """Imports LLVM."""
-    LLVM_COMMIT = "512739ebbb25a75a6298b783f97aea2fb9f6a8e8"
-    LLVM_SHA256 = "ebb856efcb8e4e2b6a45b313766df92ffcf430ab37cbc4ee5571afe8ee9e992d"
+    LLVM_COMMIT = "3661a48a84731ab5086bf1fca8f7e6b9f294225a"
+    LLVM_SHA256 = "458cf3a135143f34e9328d70009fe62dfeaa82bc1f813f3d48a0387e843331dc"
 
     tf_http_archive(
         name = name,

From 5c70558af8f7d6baa92a5f126c2b2ccf2aa29b58 Mon Sep 17 00:00:00 2001
From: Michael Hudgins <michaelhudgins@google.com>
Date: Tue, 3 Oct 2023 16:10:38 +0000
Subject: [PATCH 521/567] Fix typo in wheel test

---
 .bazelrc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.bazelrc b/.bazelrc
index 66eb37866bc827..05452855f5cd05 100644
--- a/.bazelrc
+++ b/.bazelrc
@@ -699,7 +699,7 @@ test:linux_cuda_wheel_test --config=linux_cuda_wheel_test_filters -- //tensorflo
 test:linux_arm64_wheel_test_filters --test_tag_filters=-no_oss,-no_aarch64,-oss_excluded,-oss_serial,-gpu,-tpu,-benchmark-test,-v1only,-no_oss_py38,-no_oss_py39,-no_oss_py310
 test:linux_arm64_wheel_test_filters --build_tag_filters=-no_oss,-no_aarch64,-oss_excluded,-oss_serial,-gpu,-tpu,-benchmark-test,-v1only,-no_oss_py38,-no_oss_py39,-no_oss_py310
 test:linux_arm64_wheel_test_filters --test_lang_filters=py --test_size_filters=small,medium
-test:linux_arm64_wheel_test --config=linux_arm64_wheel_test_filters -- //tensorflow/... -//tensorflow/wheel/integration_testing/... -//tensorflow/compiler/tf2tensorrt/... -//tensorflow/compiler/xrt/... -//tensorflow/core/tpu/... -//tensorflow/lite/... -//tensorflow/tools/toolchains/...  -//tensorflow/go/... -//tensorflow/java/... -//tensorflow/core/grappler/optimizers:auto_mixed_precision_test_cpu -//tensorflow/core/grappler/optimizers:remapper_test_cpu -//tensorflow/core/kernels/image:resize_bicubic_op_test
+test:linux_arm64_wheel_test --config=linux_arm64_wheel_test_filters -- //tensorflow/... -//tensorflow/python/integration_testing/... -//tensorflow/compiler/tf2tensorrt/... -//tensorflow/compiler/xrt/... -//tensorflow/core/tpu/... -//tensorflow/lite/... -//tensorflow/tools/toolchains/...  -//tensorflow/go/... -//tensorflow/java/... -//tensorflow/core/grappler/optimizers:auto_mixed_precision_test_cpu -//tensorflow/core/grappler/optimizers:remapper_test_cpu -//tensorflow/core/kernels/image:resize_bicubic_op_test
 
 # PYCPP TESTS run a suite of Python and C++ tests to verify general correctness over
 # the whole TF code base. These are usually run continuously or upon presubmit.

From 412923e9ea429cf60d6bf16d84bff777a1c0a984 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 3 Oct 2023 09:13:02 -0700
Subject: [PATCH 522/567] Update TFRT dependency to use revision
 http://github.com/tensorflow/runtime/commit/13996eb1c80de1197e434f4be6258a4565dc0851.

PiperOrigin-RevId: 570406676
---
 third_party/tf_runtime/workspace.bzl                          | 4 ++--
 third_party/xla/third_party/tf_runtime/workspace.bzl          | 4 ++--
 .../xla/third_party/tsl/third_party/tf_runtime/workspace.bzl  | 4 ++--
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/third_party/tf_runtime/workspace.bzl b/third_party/tf_runtime/workspace.bzl
index 2727639805158a..aa5ab76c6ce35b 100644
--- a/third_party/tf_runtime/workspace.bzl
+++ b/third_party/tf_runtime/workspace.bzl
@@ -6,8 +6,8 @@ def repo():
     """Imports TFRT."""
 
     # Attention: tools parse and update these lines.
-    TFRT_COMMIT = "09614836c83b716cfc7f6ed63208a0a122020799"
-    TFRT_SHA256 = "a62a69706c814038605d67f5e6b9c68467aa51b9283b8fe3fec104a21874d8d8"
+    TFRT_COMMIT = "13996eb1c80de1197e434f4be6258a4565dc0851"
+    TFRT_SHA256 = "8aa4a2015969a2e28e384d77be0982a371ede430d7e040bc15a761a0a21c5d08"
 
     tf_http_archive(
         name = "tf_runtime",
diff --git a/third_party/xla/third_party/tf_runtime/workspace.bzl b/third_party/xla/third_party/tf_runtime/workspace.bzl
index 2727639805158a..aa5ab76c6ce35b 100644
--- a/third_party/xla/third_party/tf_runtime/workspace.bzl
+++ b/third_party/xla/third_party/tf_runtime/workspace.bzl
@@ -6,8 +6,8 @@ def repo():
     """Imports TFRT."""
 
     # Attention: tools parse and update these lines.
-    TFRT_COMMIT = "09614836c83b716cfc7f6ed63208a0a122020799"
-    TFRT_SHA256 = "a62a69706c814038605d67f5e6b9c68467aa51b9283b8fe3fec104a21874d8d8"
+    TFRT_COMMIT = "13996eb1c80de1197e434f4be6258a4565dc0851"
+    TFRT_SHA256 = "8aa4a2015969a2e28e384d77be0982a371ede430d7e040bc15a761a0a21c5d08"
 
     tf_http_archive(
         name = "tf_runtime",
diff --git a/third_party/xla/third_party/tsl/third_party/tf_runtime/workspace.bzl b/third_party/xla/third_party/tsl/third_party/tf_runtime/workspace.bzl
index 2727639805158a..aa5ab76c6ce35b 100644
--- a/third_party/xla/third_party/tsl/third_party/tf_runtime/workspace.bzl
+++ b/third_party/xla/third_party/tsl/third_party/tf_runtime/workspace.bzl
@@ -6,8 +6,8 @@ def repo():
     """Imports TFRT."""
 
     # Attention: tools parse and update these lines.
-    TFRT_COMMIT = "09614836c83b716cfc7f6ed63208a0a122020799"
-    TFRT_SHA256 = "a62a69706c814038605d67f5e6b9c68467aa51b9283b8fe3fec104a21874d8d8"
+    TFRT_COMMIT = "13996eb1c80de1197e434f4be6258a4565dc0851"
+    TFRT_SHA256 = "8aa4a2015969a2e28e384d77be0982a371ede430d7e040bc15a761a0a21c5d08"
 
     tf_http_archive(
         name = "tf_runtime",

From 3761657625151d0836ecc9c8962df766bfa31d66 Mon Sep 17 00:00:00 2001
From: Kanvi Khanna <kanvi.khanna@intel.com>
Date: Tue, 3 Oct 2023 09:20:03 -0700
Subject: [PATCH 523/567] missed change - remove unnecessary includes

---
 tensorflow/core/grappler/optimizers/mkl_remapper_test.cc | 1 -
 tensorflow/core/grappler/optimizers/remapper.cc          | 1 -
 tensorflow/core/grappler/optimizers/remapper_test.cc     | 1 -
 3 files changed, 3 deletions(-)

diff --git a/tensorflow/core/grappler/optimizers/mkl_remapper_test.cc b/tensorflow/core/grappler/optimizers/mkl_remapper_test.cc
index 340fa8b460efbf..fe3b23d9837175 100644
--- a/tensorflow/core/grappler/optimizers/mkl_remapper_test.cc
+++ b/tensorflow/core/grappler/optimizers/mkl_remapper_test.cc
@@ -17,7 +17,6 @@ limitations under the License.
 #include "tensorflow/cc/ops/nn_ops_internal.h"
 #include "tensorflow/cc/ops/standard_ops.h"
 #include "tensorflow/core/framework/tensor_testutil.h"
-#include "tensorflow/core/graph/mkl_graph_util.h"
 #include "tensorflow/core/grappler/devices.h"
 #include "tensorflow/core/grappler/grappler_item.h"
 #include "tensorflow/core/grappler/optimizers/remapper.h"
diff --git a/tensorflow/core/grappler/optimizers/remapper.cc b/tensorflow/core/grappler/optimizers/remapper.cc
index d5238316d73c70..9c2453e77d2db5 100644
--- a/tensorflow/core/grappler/optimizers/remapper.cc
+++ b/tensorflow/core/grappler/optimizers/remapper.cc
@@ -27,7 +27,6 @@ limitations under the License.
 #include "absl/container/flat_hash_set.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/versions.pb.h"
-#include "tensorflow/core/graph/mkl_graph_util.h"
 #include "tensorflow/core/grappler/costs/graph_properties.h"
 #include "tensorflow/core/grappler/graph_view.h"
 #include "tensorflow/core/grappler/grappler_item.h"
diff --git a/tensorflow/core/grappler/optimizers/remapper_test.cc b/tensorflow/core/grappler/optimizers/remapper_test.cc
index 2f654dc25e85d3..39a75d69aa775e 100644
--- a/tensorflow/core/grappler/optimizers/remapper_test.cc
+++ b/tensorflow/core/grappler/optimizers/remapper_test.cc
@@ -20,7 +20,6 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor_testutil.h"
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/framework/types.pb.h"
-#include "tensorflow/core/graph/mkl_graph_util.h"
 #include "tensorflow/core/grappler/devices.h"
 #include "tensorflow/core/grappler/grappler_item.h"
 #include "tensorflow/core/grappler/utils/grappler_test.h"

From 52dbc1f2160be86834cf820eae5839b4ec25de12 Mon Sep 17 00:00:00 2001
From: Jie Sun <jiesun@google.com>
Date: Tue, 3 Oct 2023 10:15:13 -0700
Subject: [PATCH 524/567] clean up old legacy connections.

PiperOrigin-RevId: 570424028
---
 .../tsl/tsl/profiler/utils/preprocess_xplane.cc   | 15 ++-------------
 1 file changed, 2 insertions(+), 13 deletions(-)

diff --git a/third_party/xla/third_party/tsl/tsl/profiler/utils/preprocess_xplane.cc b/third_party/xla/third_party/tsl/tsl/profiler/utils/preprocess_xplane.cc
index 70f239dde506be..029e53b8abcdee 100644
--- a/third_party/xla/third_party/tsl/tsl/profiler/utils/preprocess_xplane.cc
+++ b/third_party/xla/third_party/tsl/tsl/profiler/utils/preprocess_xplane.cc
@@ -15,13 +15,13 @@ limitations under the License.
 
 #include "tsl/profiler/utils/preprocess_xplane.h"
 
+#include <cstdint>
 #include <memory>
 #include <utility>
 #include <vector>
 
 #include "absl/container/flat_hash_map.h"
-#include "absl/types/optional.h"
-#include "tsl/profiler/lib/connected_traceme.h"
+#include "tsl/profiler/lib/context_types.h"
 #include "tsl/profiler/protobuf/xplane.pb.h"
 #include "tsl/profiler/utils/xplane_builder.h"
 #include "tsl/profiler/utils/xplane_schema.h"
@@ -139,17 +139,6 @@ CreateMutatorFactories() {
           XContextStatsAccessor<uint64_t, StatType::kDeviceOrdinal>,
           XContextStatsAccessor<uint64_t, StatType::kQueueId>,
           XContextStatsAccessor<uint64_t, StatType::kRunId>>::CreateFactory());
-  // TODO(jiesun): remove kDoEnqueueContinuationProgram after 04/21/2023
-  // see cl/443548431.
-  mutator_factories.push_back(
-      XplaneConnectedEventMutatorFactory<
-          /*producer_event=*/HostEventType::kDoEnqueueContinuationProgram,
-          /*consumer_event=*/HostEventType::kCompleteCallbacks,
-          ContextType::kTpuLaunch,
-          /*unique_stats=*/true,
-          XContextStatsAccessor<uint64_t, StatType::kDeviceOrdinal>,
-          XContextStatsAccessor<uint64_t, StatType::kQueueId>,
-          XContextStatsAccessor<uint64_t, StatType::kRunId>>::CreateFactory());
 
   mutator_factories.push_back(TpuModuleLineMutatorFactory::CreateFactory());
   return mutator_factories;

From 185e2019ae9d5a085e7f0165264aff16c1cfa6ff Mon Sep 17 00:00:00 2001
From: David Dunleavy <ddunleavy@google.com>
Date: Tue, 3 Oct 2023 10:19:40 -0700
Subject: [PATCH 525/567] Add cuda-nvml-dev-12-2 to docker container packages.
 This will enable us to use nvml.h which can be used for XLA cost modeling.

PiperOrigin-RevId: 570425411
---
 tensorflow/tools/tf_sig_build_dockerfiles/devel.packages.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tensorflow/tools/tf_sig_build_dockerfiles/devel.packages.txt b/tensorflow/tools/tf_sig_build_dockerfiles/devel.packages.txt
index 311a4d905814a0..f0f05bef74639f 100644
--- a/tensorflow/tools/tf_sig_build_dockerfiles/devel.packages.txt
+++ b/tensorflow/tools/tf_sig_build_dockerfiles/devel.packages.txt
@@ -6,6 +6,7 @@ cuda-cupti-12-2
 cuda-nvprune-12-2
 cuda-libraries-12-2
 cuda-libraries-dev-12-2
+cuda-nvml-dev-12-2
 libcufft-12-2
 libcurand-12-2
 libcusolver-dev-12-2

From 8199719e72dafcd72ba2a15847f24603e3a6382f Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 3 Oct 2023 10:48:13 -0700
Subject: [PATCH 526/567] Check in generated pyi files for some py_extension
 targets.

PiperOrigin-RevId: 570434238
---
 .../lite/python/interpreter_wrapper/BUILD     |  7 +++
 ..._pywrap_tensorflow_interpreter_wrapper.pyi | 48 +++++++++++++++++++
 tensorflow/lite/python/optimize/BUILD         |  7 +++
 ...ap_tensorflow_lite_calibration_wrapper.pyi | 40 ++++++++++++++++
 tensorflow/lite/testing/BUILD                 |  7 +++
 .../lite/testing/_pywrap_string_util.pyi      | 16 +++++++
 6 files changed, 125 insertions(+)
 create mode 100644 tensorflow/lite/python/interpreter_wrapper/_pywrap_tensorflow_interpreter_wrapper.pyi
 create mode 100644 tensorflow/lite/python/optimize/_pywrap_tensorflow_lite_calibration_wrapper.pyi
 create mode 100644 tensorflow/lite/testing/_pywrap_string_util.pyi

diff --git a/tensorflow/lite/python/interpreter_wrapper/BUILD b/tensorflow/lite/python/interpreter_wrapper/BUILD
index ee2d710efd171f..fa0af673063325 100644
--- a/tensorflow/lite/python/interpreter_wrapper/BUILD
+++ b/tensorflow/lite/python/interpreter_wrapper/BUILD
@@ -82,8 +82,15 @@ pybind_extension(
         "interpreter_wrapper_pybind11.cc",
     ],
     hdrs = ["interpreter_wrapper.h"],
+    additional_stubgen_deps = [
+        "//third_party/py/numpy:numpy",
+    ],
     compatible_with = get_compatible_with_portable(),
+    enable_stub_generation = True,
     link_in_framework = True,
+    pytype_srcs = [
+        "_pywrap_tensorflow_interpreter_wrapper.pyi",
+    ],
     deps = [
         ":interpreter_wrapper_lib",
         "//tensorflow/lite:framework",
diff --git a/tensorflow/lite/python/interpreter_wrapper/_pywrap_tensorflow_interpreter_wrapper.pyi b/tensorflow/lite/python/interpreter_wrapper/_pywrap_tensorflow_interpreter_wrapper.pyi
new file mode 100644
index 00000000000000..123b909c4693c3
--- /dev/null
+++ b/tensorflow/lite/python/interpreter_wrapper/_pywrap_tensorflow_interpreter_wrapper.pyi
@@ -0,0 +1,48 @@
+# Copyright 2023 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+from typing import Any
+
+class InterpreterWrapper:
+    def __init__(self, *args, **kwargs) -> None: ...
+    def AllocateTensors(self, subgraph_index: int = ...) -> object: ...
+    def GetSignatureDefs(self) -> object: ...
+    def GetSubgraphIndexFromSignature(self, arg0: str) -> object: ...
+    def GetTensor(self, tensor_index: int, subgraph_index: int = ...) -> object: ...
+    def InputIndices(self) -> object: ...
+    def Invoke(self, subgraph_index: int = ...) -> object: ...
+    def ModifyGraphWithDelegate(self, arg0: int) -> object: ...
+    def NodeInputs(self, arg0: int) -> object: ...
+    def NodeName(self, arg0: int) -> str: ...
+    def NodeOutputs(self, arg0: int) -> object: ...
+    def NumNodes(self) -> int: ...
+    def NumTensors(self, arg0: int) -> int: ...
+    def OutputIndices(self) -> object: ...
+    def ResetVariableTensors(self) -> object: ...
+    def ResizeInputTensor(self, i: int, value, strict: bool, subgraph_index: int = ...) -> object: ...
+    def SetNumThreads(self, arg0: int) -> object: ...
+    def SetTensor(self, i: int, value, subgraph_index: int = ...) -> object: ...
+    def TensorName(self, arg0: int, arg1: int) -> str: ...
+    def TensorQuantization(self, arg0: int, arg1: int) -> object: ...
+    def TensorQuantizationParameters(self, arg0: int, arg1: int) -> object: ...
+    def TensorSize(self, arg0: int, arg1: int) -> object: ...
+    def TensorSizeSignature(self, arg0: int, arg1: int) -> object: ...
+    def TensorSparsityParameters(self, arg0: int, arg1: int) -> object: ...
+    def TensorType(self, arg0: int, arg1: int) -> object: ...
+    def interpreter(self) -> int: ...
+    def tensor(self, base_object, tensor_index: int, subgraph_index: int = ...) -> object: ...
+
+def CreateWrapperFromBuffer(*args, **kwargs) -> Any: ...
+def CreateWrapperFromFile(*args, **kwargs) -> Any: ...
diff --git a/tensorflow/lite/python/optimize/BUILD b/tensorflow/lite/python/optimize/BUILD
index 35b793fab732e9..e8bf37b60f77c2 100644
--- a/tensorflow/lite/python/optimize/BUILD
+++ b/tensorflow/lite/python/optimize/BUILD
@@ -41,7 +41,14 @@ pybind_extension(
         "calibration_wrapper_pybind11.cc",
     ],
     hdrs = ["calibration_wrapper.h"],
+    additional_stubgen_deps = [
+        "//third_party/py/numpy:numpy",
+    ],
+    enable_stub_generation = True,
     link_in_framework = True,
+    pytype_srcs = [
+        "_pywrap_tensorflow_lite_calibration_wrapper.pyi",
+    ],
     deps = [
         ":calibration_wrapper_lib",
         "//tensorflow/lite:framework",
diff --git a/tensorflow/lite/python/optimize/_pywrap_tensorflow_lite_calibration_wrapper.pyi b/tensorflow/lite/python/optimize/_pywrap_tensorflow_lite_calibration_wrapper.pyi
new file mode 100644
index 00000000000000..6dfc00d27101fe
--- /dev/null
+++ b/tensorflow/lite/python/optimize/_pywrap_tensorflow_lite_calibration_wrapper.pyi
@@ -0,0 +1,40 @@
+# Copyright 2023 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+from typing import Callable, List
+
+from typing import overload
+
+class CalibrationWrapper:
+    def __init__(self, arg0, arg1: List[str], arg2: List[Callable[[int],None]]) -> None: ...
+    def Calibrate(self) -> object: ...
+    @overload
+    def FeedTensor(self, arg0, arg1: str) -> object: ...
+    @overload
+    def FeedTensor(self, arg0) -> object: ...
+    @overload
+    def Prepare(self, arg0, arg1: str) -> object: ...
+    @overload
+    def Prepare(self, arg0) -> object: ...
+    @overload
+    def Prepare(self, arg0: str) -> object: ...
+    @overload
+    def Prepare(self) -> object: ...
+    @overload
+    def QuantizeModel(self, arg0: int, arg1: int, arg2: bool, arg3: int, arg4: int, arg5: bool) -> object: ...
+    @overload
+    def QuantizeModel(self, arg0: int, arg1: int, arg2: bool, arg3: str) -> object: ...
+
+def AddIntermediateTensors(arg0) -> object: ...
diff --git a/tensorflow/lite/testing/BUILD b/tensorflow/lite/testing/BUILD
index dd9a90367bc548..c054b978d9fd2d 100644
--- a/tensorflow/lite/testing/BUILD
+++ b/tensorflow/lite/testing/BUILD
@@ -651,7 +651,14 @@ pybind_extension(
         "string_util_wrapper.cc",
     ],
     hdrs = ["string_util.h"],
+    additional_stubgen_deps = [
+        "//third_party/py/numpy:numpy",
+    ],
+    enable_stub_generation = True,
     features = ["-use_header_modules"],
+    pytype_srcs = [
+        "_pywrap_string_util.pyi",
+    ],
     deps = [
         ":string_util_lib",
         "//tensorflow/lite/python/interpreter_wrapper:numpy",
diff --git a/tensorflow/lite/testing/_pywrap_string_util.pyi b/tensorflow/lite/testing/_pywrap_string_util.pyi
new file mode 100644
index 00000000000000..b46edabf2140e0
--- /dev/null
+++ b/tensorflow/lite/testing/_pywrap_string_util.pyi
@@ -0,0 +1,16 @@
+# Copyright 2023 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+def SerializeAsHexString(arg0) -> object: ...

From 3d541bd9958a8fb892bb30f987f959a9ecde3ce4 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 3 Oct 2023 11:05:13 -0700
Subject: [PATCH 527/567] Generate the XlaRecvFrom with the local shape

PiperOrigin-RevId: 570439971
---
 tensorflow/dtensor/mlir/BUILD                 |  2 ++
 tensorflow/dtensor/mlir/dtensor_send_recv.cc  | 33 +++++++++++++++++--
 .../dtensor/mlir/tests/lower_send_recv.mlir   |  4 +--
 3 files changed, 35 insertions(+), 4 deletions(-)

diff --git a/tensorflow/dtensor/mlir/BUILD b/tensorflow/dtensor/mlir/BUILD
index 8a0a5c60569216..26bc5a08f7d37d 100644
--- a/tensorflow/dtensor/mlir/BUILD
+++ b/tensorflow/dtensor/mlir/BUILD
@@ -280,11 +280,13 @@ cc_library(
         ":device_utils",
         ":layout_parsing",
         ":op_utils",
+        ":shape_utils",
         ":spmd_expander_common",
         ":tf_dtensor_dialect",
         ":value_utils",
         "//tensorflow/compiler/mlir/tensorflow",
         "//tensorflow/compiler/mlir/tensorflow:convert_tensor",
+        "//tensorflow/compiler/mlir/tensorflow:tensorflow_attributes",
         "//tensorflow/compiler/mlir/tensorflow/transforms:tensorflow_passes",
         "//tensorflow/core/platform:errors",
         "//tensorflow/dtensor/cc:constants",
diff --git a/tensorflow/dtensor/mlir/dtensor_send_recv.cc b/tensorflow/dtensor/mlir/dtensor_send_recv.cc
index 4a17b4db82dca6..ed4b636081fc93 100644
--- a/tensorflow/dtensor/mlir/dtensor_send_recv.cc
+++ b/tensorflow/dtensor/mlir/dtensor_send_recv.cc
@@ -16,22 +16,30 @@ limitations under the License.
 #include "tensorflow/dtensor/mlir/dtensor_send_recv.h"
 
 #include <string>
+#include <vector>
 
+#include "absl/status/status.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/Support/FormatVariadic.h"
 #include "mlir/IR/Attributes.h"  // from @llvm-project
 #include "mlir/IR/BuiltinAttributes.h"  // from @llvm-project
+#include "mlir/IR/BuiltinTypes.h"  // from @llvm-project
 #include "mlir/Support/DebugStringHelper.h"  // from @llvm-project
+#include "tensorflow/compiler/mlir/tensorflow/ir/tf_attributes.h"
 #include "tensorflow/compiler/mlir/tensorflow/ir/tf_device.h"
 #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h"
 #include "tensorflow/compiler/mlir/tensorflow/transforms/collection_ops_util.h"
 #include "tensorflow/compiler/mlir/tensorflow/utils/convert_tensor.h"
 #include "tensorflow/core/platform/errors.h"
 #include "tensorflow/dtensor/cc/constants.h"
+#include "tensorflow/dtensor/cc/dstatus.h"
+#include "tensorflow/dtensor/cc/tensor_layout.h"
 #include "tensorflow/dtensor/mlir/collectives.h"
 #include "tensorflow/dtensor/mlir/device_utils.h"
+#include "tensorflow/dtensor/mlir/ir/tf_dtensor.h"
 #include "tensorflow/dtensor/mlir/layout_parsing.h"
 #include "tensorflow/dtensor/mlir/op_utils.h"
+#include "tensorflow/dtensor/mlir/shape_utils.h"
 #include "tensorflow/dtensor/mlir/spmd_expander_common.h"
 #include "tensorflow/dtensor/mlir/value_utils.h"
 
@@ -181,6 +189,25 @@ StatusOr<mlir::Operation*> LowerDTensorSendToXlaOp(
   return lowered_send_op;
 }
 
+// Returns a ShapeAttr holding the local shape of `dtensor_recv`'s result, i.e.
+// the result's (global) shape mapped through the op's single required layout.
+// Fails with InvalidArgument unless the op has exactly one result and layout.
+StatusOr<mlir::TF::ShapeAttr> GetDTensorRecvLocalShapeAttr(
+    mlir::TF::DTensorRecv dtensor_recv) {
+  if (dtensor_recv->getNumResults() != 1) {
+    return absl::InvalidArgumentError("DTensorRecv needs exactly one result.");
+  }
+  TF_ASSIGN_OR_RETURN(std::vector<Layout> layouts,
+                      ExtractRequiredLayoutFromOp(dtensor_recv));
+  if (layouts.size() != 1) {
+    return absl::InvalidArgumentError("DTensorRecv needs exactly one layout.");
+  }
+  mlir::Value result = dtensor_recv->getResult(0);
+  TF_ASSIGN_OR_RETURN(auto result_shape, GetShapeOfValue(result));
+  auto local_shape = layouts[0].LocalShapeFromGlobalShape(result_shape);
+  return mlir::TF::ShapeAttr::get(result.getContext(), local_shape);
+}
+
 // Lowers DTensorRecv op to either one of XlaRecvAtHost or XlaRecvFromHost,
 // depending on src mesh cluster configuration. `output_type` can be set to the
 // specific local tensor type needed, if different from the Recv op output type.
@@ -216,10 +243,12 @@ StatusOr<mlir::Operation*> LowerDTensorRecvToXlaOp(
         dtensor_recv.getLoc(), output_types,
         /*dynamic_key=*/program_key, device_ordinal, dtensor_recv.getKeyAttr());
   } else {
+    TF_ASSIGN_OR_RETURN(auto local_shape_attr,
+                        GetDTensorRecvLocalShapeAttr(dtensor_recv));
+
     // Create XlaRecvFromHost op.
     recv_xla_op = builder.create<mlir::TF::XlaRecvFromHostOp>(
-        dtensor_recv.getLoc(), output_type,
-        ConvertTypeToTensorShapeAttr(dtensor_recv.getType()),
+        dtensor_recv.getLoc(), output_type, local_shape_attr,
         dtensor_recv.getKeyAttr());
   }
 
diff --git a/tensorflow/dtensor/mlir/tests/lower_send_recv.mlir b/tensorflow/dtensor/mlir/tests/lower_send_recv.mlir
index da12e597ac8c70..5a06d5d620117a 100644
--- a/tensorflow/dtensor/mlir/tests/lower_send_recv.mlir
+++ b/tensorflow/dtensor/mlir/tests/lower_send_recv.mlir
@@ -42,12 +42,12 @@ func.func @main(%arg0: tensor<i32>) {
     %1 = "tf.DTensorLayout"(%0) {global_shape = #tf_type.shape<1>, layout = #dtensor.layout<sharding_specs:unsharded, mesh:CPU|x=1|0|0|/job:localhost/task:0/device:CPU:0>} : (tensor<1xi32>) -> tensor<1xi32>
     "tf.DTensorSend"(%1) {key = "communication_key_TPU|x=2,y=2|0,1,2,3|0,1,2,3|/job:localhost/task:0/device:TPU:0,/job:localhost/task:0/device:TPU:1,/job:localhost/task:0/device:TPU:2,/job:localhost/task:0/device:TPU:3_0", target_mesh = #dtensor.mesh<TPU|x=2,y=2|0,1,2,3|0,1,2,3|/job:localhost/task:0/device:TPU:0,/job:localhost/task:0/device:TPU:1,/job:localhost/task:0/device:TPU:2,/job:localhost/task:0/device:TPU:3>} : (tensor<1xi32>) -> ()
 
-    %2 = "tf.DTensorRecv"() {key = "CPU|x=1|0|0|/job:localhost/task:0/device:CPU:0_2", mesh = #dtensor.mesh<CPU|x=1|0|0|/job:localhost/task:0/device:CPU:0>, shape = #tf_type.shape<>} : () -> (tensor<1xi32>)
+    %2 = "tf.DTensorRecv"() {key = "CPU|x=1|0|0|/job:localhost/task:0/device:CPU:0_2", mesh = #dtensor.mesh<CPU|x=1|0|0|/job:localhost/task:0/device:CPU:0>, shape = #tf_type.shape<>, _layout = ["sharding_specs:unsharded, mesh:CPU|x=1|0|0|/job:localhost/task:0/device:CPU:0"]} : () -> (tensor<1xi32>)
     "tf.B"(%2) : (tensor<1xi32>) -> ()
     tf_device.return
   }) {_mesh = "CPU|x=1|0|0|/job:localhost/task:0/device:CPU:0"} : () -> ()
   "tf_device.cluster"() ({
-    %0 = "tf.DTensorRecv"() {key = "communication_key_TPU|x=2,y=2|0,1,2,3|0,1,2,3|/job:localhost/task:0/device:TPU:0,/job:localhost/task:0/device:TPU:1,/job:localhost/task:0/device:TPU:2,/job:localhost/task:0/device:TPU:3_0", mesh = #dtensor.mesh<TPU|x=2,y=2|0,1,2,3|0,1,2,3|/job:localhost/task:0/device:TPU:0,/job:localhost/task:0/device:TPU:1,/job:localhost/task:0/device:TPU:2,/job:localhost/task:0/device:TPU:3>, shape = #tf_type.shape<>} : () -> tensor<1xi32>
+    %0 = "tf.DTensorRecv"() {key = "communication_key_TPU|x=2,y=2|0,1,2,3|0,1,2,3|/job:localhost/task:0/device:TPU:0,/job:localhost/task:0/device:TPU:1,/job:localhost/task:0/device:TPU:2,/job:localhost/task:0/device:TPU:3_0", mesh = #dtensor.mesh<TPU|x=2,y=2|0,1,2,3|0,1,2,3|/job:localhost/task:0/device:TPU:0,/job:localhost/task:0/device:TPU:1,/job:localhost/task:0/device:TPU:2,/job:localhost/task:0/device:TPU:3>, shape = #tf_type.shape<>, _layout = ["sharding_specs:unsharded, mesh:CPU|x=1|0|0|/job:localhost/task:0/device:CPU:0"]} : () -> tensor<1xi32>
     %1 = "tf.Relayout"(%0) {global_shape = #tf_type.shape<1>, layout = "sharding_specs:unsharded, mesh:TPU|x=2,y=2|0,1,2,3|0,1,2,3|/job:localhost/task:0/device:TPU:0,/job:localhost/task:0/device:TPU:1,/job:localhost/task:0/device:TPU:2,/job:localhost/task:0/device:TPU:3"} : (tensor<1xi32>) -> tensor<1xi32>
     %2 = "tf.A"(%1) : (tensor<1xi32>) -> tensor<1xi32>
     "tf.DTensorSend"(%2) {key = "CPU|x=1|0|0|/job:localhost/task:0/device:CPU:0_2", target_mesh = #dtensor.mesh<CPU|x=1|0|0|/job:localhost/task:0/device:CPU:0>} : (tensor<1xi32>) -> ()

From c50efc7e58a5a29e2cb3d2e59444029ccea1375f Mon Sep 17 00:00:00 2001
From: Edward Schwartz <schwartzedward@google.com>
Date: Tue, 3 Oct 2023 11:11:20 -0700
Subject: [PATCH 528/567] Add error handling for negative batch_dims in
 ResourceGather op constructor

PiperOrigin-RevId: 570441784
---
 .../core/kernels/resource_variable_ops.cc      |  5 +++++
 .../variables/resource_variable_ops_test.py    | 18 ++++++++++++++++++
 2 files changed, 23 insertions(+)

diff --git a/tensorflow/core/kernels/resource_variable_ops.cc b/tensorflow/core/kernels/resource_variable_ops.cc
index 203609915b1c19..8346cb4498d418 100644
--- a/tensorflow/core/kernels/resource_variable_ops.cc
+++ b/tensorflow/core/kernels/resource_variable_ops.cc
@@ -45,6 +45,8 @@ limitations under the License.
 //   (use_locking=false), we never copy even if the variable's
 //   reference count is >1.
 
+#include "absl/status/status.h"
+#include "absl/strings/str_cat.h"
 #include "tensorflow/core/framework/op_requires.h"
 #define EIGEN_USE_THREADS
 
@@ -709,6 +711,9 @@ class ResourceGatherOp : public OpKernel {
  public:
   explicit ResourceGatherOp(OpKernelConstruction* c) : OpKernel(c) {
     OP_REQUIRES_OK(c, c->GetAttr("batch_dims", &batch_dims_));
+    OP_REQUIRES(c, batch_dims_ >= 0,
+                absl::InvalidArgumentError(absl::StrCat(
+                    "batch_dims is negative (", batch_dims_, ")")));
   }
 
   void Compute(OpKernelContext* c) override {
diff --git a/tensorflow/python/kernel_tests/variables/resource_variable_ops_test.py b/tensorflow/python/kernel_tests/variables/resource_variable_ops_test.py
index f63a8cbf7806d4..da3058fd18934d 100644
--- a/tensorflow/python/kernel_tests/variables/resource_variable_ops_test.py
+++ b/tensorflow/python/kernel_tests/variables/resource_variable_ops_test.py
@@ -1871,5 +1871,23 @@ def testHandleNamePreservedThroughPackAndUnpack(self):
         expand_composites=True)
     self.assertEqual(reconstructed_v._handle_name, expected_handle_name)
 
+  @test_util.run_in_graph_and_eager_modes
+  def testGatherBatchDimsNeg(self):
+    var = resource_variable_ops.ResourceVariable(
+        [1], dtype=dtypes.int32, name="var0"
+    )
+    with ops.control_dependencies([var.initializer]):
+      with self.assertRaisesRegex(
+          (ValueError, errors.InvalidArgumentError),
+          "(batch_dims is negative)|(Expected batch_dims in the range)"
+      ):
+        result = resource_variable_ops.resource_gather(
+            var.handle,
+            indices=[1],
+            dtype=var.dtype,
+            batch_dims=-42,
+        )
+        self.evaluate(result)
+
 if __name__ == "__main__":
   test.main()

From 512927a86a8d518d070fbdf30e3fa2f3b170506d Mon Sep 17 00:00:00 2001
From: Mehdi Drissi <mdrissi@snapchat.com>
Date: Tue, 3 Oct 2023 11:18:53 -0700
Subject: [PATCH 529/567] Fix inconsistency in ragged and sparse embedding
 lookup.

---
 tensorflow/python/ops/ragged/ragged_embedding_ops.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/python/ops/ragged/ragged_embedding_ops.py b/tensorflow/python/ops/ragged/ragged_embedding_ops.py
index 9379c8a7352983..a9ea994bfa242d 100644
--- a/tensorflow/python/ops/ragged/ragged_embedding_ops.py
+++ b/tensorflow/python/ops/ragged/ragged_embedding_ops.py
@@ -327,7 +327,7 @@ def safe_embedding_lookup_sparse(
   embedding_weights = [
       w
       if (
-          isinstance(w, resource_variable_ops.ResourceVariable)
+          resource_variable_ops.is_resource_variable(w)
           and dtype in (None, w.dtype)
       )
       else ops.convert_to_tensor(w, dtype=dtype)

From 70f407107e3708e6d0fe13cd56c6f0e3261b5796 Mon Sep 17 00:00:00 2001
From: Majid Dadashi <majiddadashi@google.com>
Date: Tue, 3 Oct 2023 11:13:40 -0700
Subject: [PATCH 530/567] [tflite] split the stablehlo scatter re-usable
 functions

PiperOrigin-RevId: 570442495
---
 tensorflow/lite/kernels/BUILD                 |  28 ++++
 tensorflow/lite/kernels/stablehlo_scatter.cc  | 154 +++---------------
 tensorflow/lite/kernels/tensor_slice_util.h   | 153 +++++++++++++++++
 .../lite/kernels/tensor_slice_util_test.cc    | 134 +++++++++++++++
 4 files changed, 334 insertions(+), 135 deletions(-)
 create mode 100644 tensorflow/lite/kernels/tensor_slice_util.h
 create mode 100644 tensorflow/lite/kernels/tensor_slice_util_test.cc

diff --git a/tensorflow/lite/kernels/BUILD b/tensorflow/lite/kernels/BUILD
index ed39d82fb06f54..b52bd915b89700 100644
--- a/tensorflow/lite/kernels/BUILD
+++ b/tensorflow/lite/kernels/BUILD
@@ -518,6 +518,33 @@ cc_test(
     ],
 )
 
+cc_library(
+    name = "tensor_slice_util",
+    hdrs = [
+        "tensor_slice_util.h",
+    ],
+    compatible_with = get_compatible_with_portable(),
+    copts = tflite_copts(),
+    deps = [
+        "//tensorflow/lite/core/c:common",
+        "//tensorflow/lite/kernels/internal:runtime_shape",
+        "//tensorflow/lite/kernels/internal:tensor_ctypes",
+    ],
+)
+
+cc_test(
+    name = "tensor_slice_util_test",
+    size = "small",
+    srcs = ["tensor_slice_util_test.cc"],
+    deps = [
+        ":tensor_slice_util",
+        "//tensorflow/lite/core/c:c_api_types",
+        "//tensorflow/lite/core/c:common",
+        "//tensorflow/lite/kernels/internal:runtime_shape",
+        "@com_google_googletest//:gtest_main",
+    ],
+)
+
 cc_library(
     name = "kernel_util",
     srcs = [
@@ -753,6 +780,7 @@ BUILTIN_KERNEL_DEPS = [
     ":cpu_backend_threadpool",
     ":kernel_util",
     ":rng_util",
+    ":tensor_slice_util",
     ":lstm_eval",
     ":lstm_shared",
     ":op_macros",
diff --git a/tensorflow/lite/kernels/stablehlo_scatter.cc b/tensorflow/lite/kernels/stablehlo_scatter.cc
index 9331e7ac3eb2c0..7db81c88be51df 100644
--- a/tensorflow/lite/kernels/stablehlo_scatter.cc
+++ b/tensorflow/lite/kernels/stablehlo_scatter.cc
@@ -1,16 +1,17 @@
-// Copyright 2023 Google LLC
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
+/* Copyright 2023 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
 
 #include <algorithm>
 #include <cstddef>
@@ -28,6 +29,7 @@
 #include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
 #include "tensorflow/lite/kernels/internal/types.h"
 #include "tensorflow/lite/kernels/kernel_util.h"
+#include "tensorflow/lite/kernels/tensor_slice_util.h"
 
 namespace tflite {
 namespace ops {
@@ -57,22 +59,10 @@ struct OpData {
   ComputationType computation_type;
 };
 
-// Element i contains the index of an entry in the i-th dimension.
-template <typename IndexType>
-using Index = std::vector<IndexType>;
-
 // Contains a vector with each element being a dimension index
 // example: [1, 4] means the second and fifth dimensions of another vector.
 using DimVector = std::vector<int64_t>;
 
-// Returns true if `value` is found in `array`.
-static bool ArrayContains(const int64_t* array, int size, int64_t value) {
-  if (size == 0) {
-    return false;
-  }
-  return std::find(array, array + size, value) != array + size;
-}
-
 // Returns the update scatter dimension given the update window dimensions.
 // Example:
 // When updates_rank=5, update_window_dims=[2, 4]
@@ -168,56 +158,6 @@ static TfLiteStatus GetComputationType(const Subgraph* computation_subgraph,
   return kTfLiteOk;
 }
 
-// Creates a new Index based on the provided scatter_dims.
-// Example:
-// For index={s0, s1}, scatter_dims_to_operand_dims=[1, 0], returns {s2,s1,0}.
-// Result has same size as rank of input. All the result dimensions not in
-// scatter_dims_to_operand_dims get the value 0.
-template <typename IndexType>
-static TfLiteStatus ScatterIndex(const Index<IndexType>& index,
-                                 const int64_t* scatter_dims,
-                                 int num_scatter_dims, int64_t to_rank,
-                                 Index<IndexType>* result) {
-  if (result == nullptr) {
-    return kTfLiteError;
-  }
-
-  *result = Index<IndexType>(to_rank, 0);
-  for (int idx = 0; idx < num_scatter_dims; ++idx) {
-    if (scatter_dims[idx] >= result->size()) {
-      return kTfLiteError;
-    }
-    (*result)[scatter_dims[idx]] = index[idx];
-  }
-  return kTfLiteOk;
-}
-
-// A helper function that converts a tensor index into a flat array index.
-template <typename IndexType>
-static IndexType TensorIndexToFlat(const IndexType* index, const int64_t dims,
-                                   const RuntimeShape& shape) {
-  // If it's a scalar, just return the index of the first element.
-  if (dims == 0) {
-    return 0;
-  }
-  IndexType flat_index = index[0];
-  for (int64_t i = 1; i < dims; ++i) {
-    flat_index = flat_index * shape.Dims(i) + index[i];
-  }
-  return flat_index;
-}
-
-template <typename IndexType>
-static Index<IndexType> AddIndices(const Index<IndexType>& index1,
-                                   const Index<IndexType>& index2) {
-  Index<IndexType> result;
-  result.reserve(index1.size());
-  for (int64_t dim = 0; dim < index1.size(); ++dim) {
-    result.push_back(index1[dim] + index2[dim]);
-  }
-  return result;
-}
-
 // Applies the provided computation to `input_value` and `update_value` and
 // stores the result in `tensor[index]`.
 template <typename DataType, typename IndexType>
@@ -252,65 +192,8 @@ static TfLiteStatus ApplyComputation(TfLiteTensor* tensor,
   return kTfLiteOk;
 }
 
-// Creates a new Index with the number of dimensions increased with respect to
-// inserted_dims array.
-// Example: index=[i, j], inserted_window_dims=[1], the result is [i, 0, j]
-template <typename IndexType>
-static TfLiteStatus ExpandDims(const Index<IndexType>& index,
-                               const int64_t* inserted_dims,
-                               int num_inserted_dims,
-                               Index<IndexType>* result) {
-  std::vector<int64_t> scatter_dims;
-  scatter_dims.reserve(index.size());
-  int64_t ctr = 0;
-  for (int idx = 0; idx < index.size(); ++idx) {
-    while (ArrayContains(inserted_dims, num_inserted_dims, ctr)) {
-      ++ctr;
-    }
-    scatter_dims.push_back(ctr);
-    ++ctr;
-  }
-  TF_LITE_ENSURE_STATUS(ScatterIndex(index, scatter_dims.data(),
-                                     scatter_dims.size(),
-                                     index.size() + num_inserted_dims, result));
-  return kTfLiteOk;
-}
-
-// Reads one array from a given tensor.
-// Example: other_indices=[j, l, m], dim_to_read= 2
-// The resulting read array is: [j, l, :, m]
-template <typename IndexType>
-static Index<IndexType> ReadScatterIndicesAxis(
-    const TfLiteTensor* scatter_indices, const RuntimeShape& tensor_shape,
-    const Index<IndexType>& other_indices, int64_t dim_to_read) {
-  Index<IndexType> index;
-  index.reserve(tensor_shape.DimensionsCount());
-  int shift = 0;
-  for (int64_t dim = 0; dim < tensor_shape.DimensionsCount(); ++dim) {
-    if (dim == dim_to_read) {
-      index.push_back(0);
-      shift = 1;
-    } else {
-      index.push_back(other_indices[dim - shift]);
-    }
-  }
-  int64_t index_vector_size = tensor_shape.Dims(dim_to_read);
-  Index<IndexType> result;
-  result.reserve(index_vector_size);
-  for (IndexType index_vector_idx = 0; index_vector_idx < index_vector_size;
-       ++index_vector_idx) {
-    index[dim_to_read] = index_vector_idx;
-
-    IndexType flat_index = TensorIndexToFlat(
-        index.data(), tensor_shape.DimensionsCount(), tensor_shape);
-    const IndexType* tensor_data = GetTensorData<IndexType>(scatter_indices);
-    result.push_back(tensor_data[flat_index]);
-  }
-  return result;
-}
-
 // Evaluates this node given the type of the elements in the scatter_indices
-// and the type of the elements in the input/updates vector.
+// and the type of the elements in the input/updates tensors.
 template <typename IndexType, typename DataType>
 TfLiteStatus EvalWithTypes(TfLiteContext* context, TfLiteNode* node) {
   OpData* op_data = reinterpret_cast<OpData*>(node->user_data);
@@ -360,8 +243,8 @@ TfLiteStatus EvalWithTypes(TfLiteContext* context, TfLiteNode* node) {
     // Read the index_vector_dim dimension with the other dimension indices set.
     // Let's say the contents after reading are {s1, s2}
     Index<IndexType> start_index =
-        ReadScatterIndicesAxis(scatter_indices, scatter_indices_shape,
-                               update_scatter_index, data->index_vector_dim);
+        ReadIndexVector(scatter_indices, scatter_indices_shape,
+                        update_scatter_index, data->index_vector_dim);
 
     Index<IndexType> full_start_index;
     TF_LITE_ENSURE_STATUS(ScatterIndex(
@@ -460,6 +343,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   TF_LITE_ENSURE_OK(context,
                     GetOutputSafe(context, node, kOutputTensor, &output));
 
+  // Output is the same size as input. Scatter just updates some of the values.
   // Need the copy since ResizeTensor takes ownership of output_size
   TfLiteIntArray* output_size = TfLiteIntArrayCopy(input->dims);
   TF_LITE_ENSURE_STATUS(context->ResizeTensor(context, output, output_size));
diff --git a/tensorflow/lite/kernels/tensor_slice_util.h b/tensorflow/lite/kernels/tensor_slice_util.h
new file mode 100644
index 00000000000000..3b003646594c85
--- /dev/null
+++ b/tensorflow/lite/kernels/tensor_slice_util.h
@@ -0,0 +1,153 @@
+/* Copyright 2023 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_KERNELS_TENSOR_SLICE_UTIL_H_
+#define TENSORFLOW_LITE_KERNELS_TENSOR_SLICE_UTIL_H_
+
+#include <algorithm>
+#include <cstdint>
+#include <vector>
+
+#include "tensorflow/lite/core/c/common.h"
+#include "tensorflow/lite/kernels/internal/runtime_shape.h"
+#include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
+
+namespace tflite {
+namespace ops {
+namespace builtin {
+
+// Element i contains the index of an entry in the i-th dimension.
+template <typename IndexType>
+using Index = std::vector<IndexType>;
+
+// Returns true if the `array` contains `value`.
+// Always returns false if the array is empty.
+inline bool ArrayContains(const int64_t* array, int size, int64_t value) {
+  if (size == 0) {
+    return false;
+  }
+  return std::find(array, array + size, value) != array + size;
+}
+
+// Creates a new Index based on the provided `scatter_dims`.
+// Example:
+// For `index`={s0, s1}, `scatter_dims`=[1, 0], `to_rank`=3, returns {s1,s0,0}.
+// Result has `to_rank` elements. All the result dimensions not in
+// `scatter_dims` get the value 0.
+template <typename IndexType>
+TfLiteStatus ScatterIndex(const Index<IndexType>& index,
+                          const int64_t* scatter_dims, int num_scatter_dims,
+                          int64_t to_rank, Index<IndexType>* result) {
+  if (result == nullptr) {
+    return kTfLiteError;
+  }
+
+  *result = Index<IndexType>(to_rank, 0);
+  for (int idx = 0; idx < num_scatter_dims; ++idx) {
+    if (scatter_dims[idx] >= result->size()) {
+      return kTfLiteError;
+    }
+    (*result)[scatter_dims[idx]] = index[idx];
+  }
+  return kTfLiteOk;
+}
+
+// A helper function that converts a tensor index into a flat array index.
+template <typename IndexType>
+IndexType TensorIndexToFlat(const IndexType* index, const int64_t dims,
+                            const RuntimeShape& shape) {
+  // If it's a scalar, just return the index of the first element.
+  if (dims == 0) {
+    return 0;
+  }
+  IndexType flat_index = index[0];
+  for (int64_t i = 1; i < dims; ++i) {
+    flat_index = flat_index * shape.Dims(i) + index[i];
+  }
+  return flat_index;
+}
+
+template <typename IndexType>
+Index<IndexType> AddIndices(const Index<IndexType>& index1,
+                            const Index<IndexType>& index2) {
+  Index<IndexType> result;
+  result.reserve(index1.size());
+  for (int64_t dim = 0; dim < index1.size(); ++dim) {
+    result.push_back(index1[dim] + index2[dim]);
+  }
+  return result;
+}
+
+// Creates a new Index with the number of dimensions increased with respect to
+// `avoided_dims` array.
+// Example: `index`=[i, j], `avoided_dims`=[1], the result is [i, 0, j]
+template <typename IndexType>
+TfLiteStatus ExpandDims(const Index<IndexType>& index,
+                        const int64_t* avoided_dims, int num_avoided_dims,
+                        Index<IndexType>* result) {
+  std::vector<int64_t> scatter_dims;
+  scatter_dims.reserve(index.size());
+  int64_t ctr = 0;
+  for (int idx = 0; idx < index.size(); ++idx) {
+    while (ArrayContains(avoided_dims, num_avoided_dims, ctr)) {
+      ++ctr;
+    }
+    scatter_dims.push_back(ctr);
+    ++ctr;
+  }
+  TF_LITE_ENSURE_STATUS(ScatterIndex(index, scatter_dims.data(),
+                                     scatter_dims.size(),
+                                     index.size() + num_avoided_dims, result));
+  return kTfLiteOk;
+}
+
+// Reads one array from a given tensor.
+// Example: `other_indices`=[j, l, m], `dim_to_read`= 2
+// The resulting read array is: [j, l, :, m]
+template <typename IndexType>
+Index<IndexType> ReadIndexVector(const TfLiteTensor* indices_tensor,
+                                 const RuntimeShape& tensor_shape,
+                                 const Index<IndexType>& other_indices,
+                                 int64_t dim_to_read) {
+  Index<IndexType> index;
+  index.reserve(tensor_shape.DimensionsCount());
+  int shift = 0;
+  for (int64_t dim = 0; dim < tensor_shape.DimensionsCount(); ++dim) {
+    if (dim == dim_to_read) {
+      index.push_back(0);
+      shift = 1;
+    } else {
+      index.push_back(other_indices[dim - shift]);
+    }
+  }
+  int64_t index_vector_size = tensor_shape.Dims(dim_to_read);
+  Index<IndexType> result;
+  result.reserve(index_vector_size);
+  for (IndexType index_vector_idx = 0; index_vector_idx < index_vector_size;
+       ++index_vector_idx) {
+    index[dim_to_read] = index_vector_idx;
+
+    IndexType flat_index = TensorIndexToFlat(
+        index.data(), tensor_shape.DimensionsCount(), tensor_shape);
+    const IndexType* tensor_data = GetTensorData<IndexType>(indices_tensor);
+    result.push_back(tensor_data[flat_index]);
+  }
+  return result;
+}
+
+}  // namespace builtin
+}  // namespace ops
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_KERNELS_TENSOR_SLICE_UTIL_H_
diff --git a/tensorflow/lite/kernels/tensor_slice_util_test.cc b/tensorflow/lite/kernels/tensor_slice_util_test.cc
new file mode 100644
index 00000000000000..b3102c67683620
--- /dev/null
+++ b/tensorflow/lite/kernels/tensor_slice_util_test.cc
@@ -0,0 +1,134 @@
+/* Copyright 2023 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/kernels/tensor_slice_util.h"
+
+#include <cstdint>
+#include <vector>
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+#include "tensorflow/lite/core/c/c_api_types.h"
+#include "tensorflow/lite/core/c/common.h"
+#include "tensorflow/lite/kernels/internal/runtime_shape.h"
+
+namespace tflite {
+namespace ops {
+namespace builtin {
+namespace {
+
+using ::testing::ElementsAreArray;
+
+TEST(TensorSliceUtil, ArrayContains) {
+  std::vector<int64_t> array = {1, 2, 3};
+
+  EXPECT_TRUE(ArrayContains(array.data(), array.size(), 2));
+  EXPECT_FALSE(ArrayContains(array.data(), array.size(), 0));
+}
+
+TEST(TensorSliceUtil, ArrayContainsWorkOnEmptyArray) {
+  std::vector<int64_t> array = {};
+
+  EXPECT_FALSE(ArrayContains(array.data(), 0, 2));
+}
+
+TEST(TensorSliceUtil, ScatterIndexHandlesNullPtr) {
+  Index<int64_t> index = {3, 5};
+  std::vector<int64_t> scatter_dims = {1, 0};
+  Index<int64_t>* result = nullptr;
+  TfLiteStatus status =
+      ScatterIndex(index, scatter_dims.data(), scatter_dims.size(), 3, result);
+  EXPECT_THAT(status, kTfLiteError);
+}
+
+TEST(TensorSliceUtil, ScatterIndexHandlesOutOfBoundIndices) {
+  Index<int64_t> index = {3, 5};
+  std::vector<int64_t> scatter_dims = {4, 0};
+  Index<int64_t> result;
+  TfLiteStatus status =
+      ScatterIndex(index, scatter_dims.data(), scatter_dims.size(), 3, &result);
+
+  EXPECT_THAT(status, kTfLiteError);
+}
+
+TEST(TensorSliceUtil, ScatterIndex) {
+  Index<int64_t> index = {3, 5};
+  std::vector<int64_t> scatter_dims = {1, 0};
+  Index<int64_t> result;
+  ScatterIndex(index, scatter_dims.data(), scatter_dims.size(), 3, &result);
+
+  EXPECT_THAT(result, ElementsAreArray({5, 3, 0}));
+}
+
+TEST(TensorSliceUtil, TensorIndexToFlatWorksForScalars) {
+  Index<int64_t> index = {0};
+  RuntimeShape shape(0);
+
+  EXPECT_EQ(TensorIndexToFlat(index.data(), index.size(), shape), 0);
+}
+
+TEST(TensorSliceUtil, TensorIndexToFlat) {
+  Index<int64_t> index = {2, 4};
+  RuntimeShape shape({3, 5});
+
+  EXPECT_EQ(TensorIndexToFlat(index.data(), index.size(), shape), 14);
+}
+
+TEST(TensorSliceUtil, AddIndices) {
+  Index<int64_t> index1 = {1, 2, 3};
+  Index<int64_t> index2 = {2, 7, 5};
+  EXPECT_THAT(AddIndices(index1, index2), ElementsAreArray({3, 9, 8}));
+}
+
+TEST(TensorSliceUtil, ExpandDimsHandlesEmptyIndex) {
+  Index<int64_t> index = {};
+  std::vector<int64_t> avoided_dims = {0, 1};
+  Index<int64_t> result;
+
+  ExpandDims(index, avoided_dims.data(), avoided_dims.size(), &result);
+  EXPECT_THAT(result, ElementsAreArray({0, 0}));
+}
+
+TEST(TensorSliceUtil, ExpandDims) {
+  Index<int64_t> index = {2, 4};
+  std::vector<int64_t> avoided_dims = {0, 2};
+  Index<int64_t> result;
+
+  ExpandDims(index, avoided_dims.data(), avoided_dims.size(), &result);
+  EXPECT_THAT(result, ElementsAreArray({0, 2, 0, 4}));
+}
+
+TEST(TensorSliceUtil, ReadIndexVector) {
+  TfLiteTensor tensor;
+  tensor.type = kTfLiteInt64;
+  // [
+  //   [[0, 2], [1, 0], [2, 1]],
+  //   [[0, 1], [1, 0], [0, 9]]
+  // ]
+  std::vector<int64_t> tensor_data = {0, 2, 1, 0, 2, 1, 0, 1, 1, 0, 0, 9};
+  TfLitePtrUnion ptr_union;
+  ptr_union.i64 = tensor_data.data();
+  tensor.data = ptr_union;
+
+  RuntimeShape shape = {2, 3, 2};
+  Index<int64_t> other_indices = {1, 1};
+  int64_t dim_to_read = 1;
+  EXPECT_THAT(ReadIndexVector(&tensor, shape, other_indices, dim_to_read),
+              ElementsAreArray({1, 0, 9}));
+}
+
+}  // namespace
+}  // namespace builtin
+}  // namespace ops
+}  // namespace tflite

From 8c774b651cb3c8df280deffdcc0ac85563aea7eb Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 3 Oct 2023 11:23:58 -0700
Subject: [PATCH 531/567] Fix GetHostname in repository.cc for Tensorboard
 Profiler

PiperOrigin-RevId: 570445646
---
 tensorflow/core/profiler/convert/repository.cc      | 9 ++++++---
 tensorflow/core/profiler/convert/repository_test.cc | 9 +++++++++
 2 files changed, 15 insertions(+), 3 deletions(-)

diff --git a/tensorflow/core/profiler/convert/repository.cc b/tensorflow/core/profiler/convert/repository.cc
index c8e06f6c1ce6f2..4a7e6e614f482f 100644
--- a/tensorflow/core/profiler/convert/repository.cc
+++ b/tensorflow/core/profiler/convert/repository.cc
@@ -25,6 +25,8 @@ limitations under the License.
 #include "absl/status/status.h"
 #include "absl/strings/match.h"
 #include "absl/strings/str_cat.h"
+#include "absl/strings/string_view.h"
+#include "absl/strings/strip.h"
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/errors.h"
 #include "tensorflow/core/platform/path.h"
@@ -38,9 +40,10 @@ namespace tensorflow {
 namespace profiler {
 namespace {
 std::string GetHostnameByPath(absl::string_view xspace_path) {
-  std::string file_name = std::string(tensorflow::io::Basename(xspace_path));
-  std::vector<std::string> parts = absl::StrSplit(file_name, '.');
-  return parts[0];
+  std::string_view file_name = tensorflow::io::Basename(xspace_path);
+  // Remove suffix from file_name, preserving entire prefix.
+  absl::ConsumeSuffix(&file_name, ".xplane.pb");
+  return std::string(file_name);
 }
 }  // namespace
 
diff --git a/tensorflow/core/profiler/convert/repository_test.cc b/tensorflow/core/profiler/convert/repository_test.cc
index 97c3b403d9b90c..e885b21741f5f8 100644
--- a/tensorflow/core/profiler/convert/repository_test.cc
+++ b/tensorflow/core/profiler/convert/repository_test.cc
@@ -42,6 +42,15 @@ TEST(Repository, GetHostName) {
   EXPECT_TRUE(session_snapshot_or.value().HasAccessibleRunDir());
 }
 
+TEST(Repository, GetHostNameWithPeriods) {
+  auto session_snapshot_or =
+      SessionSnapshot::Create({"log/plugins/profile/127.0.0.1_6009.xplane.pb"},
+                              /*xspaces=*/std::nullopt);
+  TF_CHECK_OK(session_snapshot_or.status());
+  EXPECT_THAT(session_snapshot_or.value().GetHostname(0), Eq("127.0.0.1_6009"));
+  EXPECT_TRUE(session_snapshot_or.value().HasAccessibleRunDir());
+}
+
 TEST(Repository, GetSpaceByHostName) {
   std::vector<std::unique_ptr<XSpace>> xspaces;
   // prepare host 1.

From 451645fda59e68f69c9f91b0a0bbc05a6bac2585 Mon Sep 17 00:00:00 2001
From: Grant Jensen <grantjensen@google.com>
Date: Tue, 3 Oct 2023 11:40:22 -0700
Subject: [PATCH 532/567] Fix ReLUN1To1 fused activation for OpenCL and OpenGL.

PiperOrigin-RevId: 570450610
---
 tensorflow/lite/delegates/gpu/cl/BUILD        |   3 +-
 .../delegates/gpu/cl/cl_arguments_test.cc     |   5 +-
 .../delegates/gpu/cl/kernels/relu_test.cc     |  20 +++
 .../lite/delegates/gpu/cl/testing/BUILD       |   7 +
 .../delegates/gpu/common/model_builder.cc     |  23 +--
 .../gpu/common/model_builder_helper.cc        |   9 +-
 .../lite/delegates/gpu/common/operations.h    |  21 ++-
 .../lite/delegates/gpu/common/tasks/relu.cc   |  15 +-
 .../gpu/common/tasks/relu_test_util.cc        | 136 +++++++++++++++++-
 .../gpu/common/tasks/relu_test_util.h         |   4 +
 tensorflow/lite/delegates/gpu/gl/BUILD        |   6 +-
 .../lite/delegates/gpu/gl/kernels/BUILD       |  19 ++-
 .../lite/delegates/gpu/gl/kernels/relu.cc     |  11 +-
 .../delegates/gpu/gl/kernels/relu_test.cc     |  64 ++++++++-
 14 files changed, 296 insertions(+), 47 deletions(-)

diff --git a/tensorflow/lite/delegates/gpu/cl/BUILD b/tensorflow/lite/delegates/gpu/cl/BUILD
index d710059af90886..760f401bdc61ea 100644
--- a/tensorflow/lite/delegates/gpu/cl/BUILD
+++ b/tensorflow/lite/delegates/gpu/cl/BUILD
@@ -1,9 +1,9 @@
 load("@flatbuffers//:build_defs.bzl", "flatbuffer_cc_library")
+load("//tensorflow:tensorflow.bzl", "if_google", "workspace_root")
 load(
     "//tensorflow/core/platform:build_config_root.bzl",
     "tf_gpu_tests_tags",
 )
-load("//tensorflow:tensorflow.bzl", "if_google", "workspace_root")
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
@@ -126,6 +126,7 @@ cc_test(
     deps = [
         ":buffer",
         ":cl_arguments",
+        ":cl_test",
         ":gpu_object",
         "//tensorflow/lite/delegates/gpu/common:gpu_info",
         "@com_google_absl//absl/strings",
diff --git a/tensorflow/lite/delegates/gpu/cl/cl_arguments_test.cc b/tensorflow/lite/delegates/gpu/cl/cl_arguments_test.cc
index 2a46c202286dd5..c9da770940968f 100644
--- a/tensorflow/lite/delegates/gpu/cl/cl_arguments_test.cc
+++ b/tensorflow/lite/delegates/gpu/cl/cl_arguments_test.cc
@@ -21,6 +21,7 @@ limitations under the License.
 #include <gtest/gtest.h>
 #include "absl/strings/match.h"
 #include "tensorflow/lite/delegates/gpu/cl/buffer.h"
+#include "tensorflow/lite/delegates/gpu/cl/cl_test.h"
 #include "tensorflow/lite/delegates/gpu/cl/gpu_object.h"
 #include "tensorflow/lite/delegates/gpu/common/gpu_info.h"
 
@@ -45,7 +46,7 @@ __kernel void main_function($0) {
 
   CLArguments cl_args;
   GpuInfo gpu_info;
-  ASSERT_OK(cl_args.Init(gpu_info, {}, nullptr, &args, &sample_code));
+  ASSERT_OK(cl_args.Init(gpu_info, nullptr, &args, &sample_code));
   EXPECT_TRUE(absl::StrContains(sample_code, "value = weights_buffer[id];"));
   EXPECT_TRUE(
       absl::StrContains(sample_code, "__global float4* weights_buffer"));
@@ -67,7 +68,7 @@ TEST(CLArgumentsTest, TestNoSelector) {
 )";
   CLArguments cl_args;
   GpuInfo gpu_info;
-  EXPECT_FALSE(cl_args.Init(gpu_info, {}, nullptr, &args, &sample_code).ok());
+  EXPECT_FALSE(cl_args.Init(gpu_info, nullptr, &args, &sample_code).ok());
 }
 
 }  // namespace cl
diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/relu_test.cc b/tensorflow/lite/delegates/gpu/cl/kernels/relu_test.cc
index 10a5c565c2437f..fd76f2cbc48bf8 100644
--- a/tensorflow/lite/delegates/gpu/cl/kernels/relu_test.cc
+++ b/tensorflow/lite/delegates/gpu/cl/kernels/relu_test.cc
@@ -46,6 +46,26 @@ TEST_F(OpenCLOperationTest, ReLUAlphaClip) {
   ASSERT_TRUE(status.ok()) << status.message();
 }
 
+TEST_F(OpenCLOperationTest, ReLULN1NoClipNoAlpha) {
+  auto status = ReLUN1NoClipNoAlphaTest(&exec_env_);
+  ASSERT_TRUE(status.ok()) << status;
+}
+
+TEST_F(OpenCLOperationTest, ReLUN1Clip) {
+  auto status = ReLUN1ClipTest(&exec_env_);
+  ASSERT_TRUE(status.ok()) << status;
+}
+
+TEST_F(OpenCLOperationTest, ReLULN1Alpha) {
+  auto status = ReLUN1AlphaTest(&exec_env_);
+  ASSERT_TRUE(status.ok()) << status;
+}
+
+TEST_F(OpenCLOperationTest, ReLUN1AlphaClip) {
+  auto status = ReLUN1AlphaClipTest(&exec_env_);
+  ASSERT_TRUE(status.ok()) << status;
+}
+
 }  // namespace cl
 }  // namespace gpu
 }  // namespace tflite
diff --git a/tensorflow/lite/delegates/gpu/cl/testing/BUILD b/tensorflow/lite/delegates/gpu/cl/testing/BUILD
index f5c26496875348..75e36c0c9ca877 100644
--- a/tensorflow/lite/delegates/gpu/cl/testing/BUILD
+++ b/tensorflow/lite/delegates/gpu/cl/testing/BUILD
@@ -42,6 +42,13 @@ cc_test(
 cc_binary(
     name = "internal_api_samples",
     srcs = ["internal_api_samples.cc"],
+    linkopts = select({
+        "//tensorflow:android": [
+            "-lEGL",
+            "-lGLESv3",
+        ],
+        "//conditions:default": [],
+    }),
     tags = [
         "nobuilder",
         "notap",
diff --git a/tensorflow/lite/delegates/gpu/common/model_builder.cc b/tensorflow/lite/delegates/gpu/common/model_builder.cc
index a8aec7c4a505dc..548cbcba1afc80 100644
--- a/tensorflow/lite/delegates/gpu/common/model_builder.cc
+++ b/tensorflow/lite/delegates/gpu/common/model_builder.cc
@@ -433,8 +433,8 @@ class ClampOperationsParser : public TFLiteOperationParser {
     // We replace clamp(...) with sequence of elementwise ops:
     // substaction -> usual relu with alpha = 0.0 -> addition.
     // node_sub = v0 = v - a // add op (add -a)
-    // node_relu = v1 = clamp(v0, 0.0, clip); // relu op alpha = 0.0,
-    // clip = b - a;
+    // node_relu = v1 = clamp(v0, 0.0, activation_max); // relu op alpha = 0.0,
+    // activation_max = b - a;
     // node_add = v2 = v1 + a // add op (add a)
     Node* node_sub = graph->NewNode();
     Node* node_relu = graph->NewNode();
@@ -447,7 +447,7 @@ class ClampOperationsParser : public TFLiteOperationParser {
 
     ReLUAttributes relu_attr;
     relu_attr.alpha = 0.0f;
-    relu_attr.clip = clamp_b_ - clamp_a_;
+    relu_attr.activation_max = clamp_b_ - clamp_a_;
     node_relu->operation.type = ToString(OperationType::RELU);
     node_relu->operation.attributes = relu_attr;
 
@@ -1943,7 +1943,8 @@ class QuantizeOperationParser : public TFLiteOperationParser {
 
 class ReLUOperationParser : public TFLiteOperationParser {
  public:
-  explicit ReLUOperationParser(int clip) : clip_(clip) {}
+  explicit ReLUOperationParser(int activation_min, int activation_max)
+      : activation_min_(activation_min), activation_max_(activation_max) {}
 
   absl::Status IsSupported(const TfLiteContext* context,
                            const TfLiteNode* tflite_node,
@@ -1963,13 +1964,15 @@ class ReLUOperationParser : public TFLiteOperationParser {
     const TfLiteLeakyReluParams* tf_options;
     auto status = RetrieveBuiltinData(tflite_node, &tf_options);
     attr.alpha = status.ok() ? tf_options->alpha : 0;
-    attr.clip = clip_;
+    attr.activation_min = activation_min_;
+    attr.activation_max = activation_max_;
     node->operation.attributes = attr;
     return reader->AddOutputs(node);
   }
 
  private:
-  const int clip_;
+  const int activation_min_;
+  const int activation_max_;
 };
 
 class ResamplerOperationParser : public TFLiteOperationParser {
@@ -3207,13 +3210,13 @@ std::unique_ptr<TFLiteOperationParser> NewOperationParser(
       }
       break;
     case kTfLiteBuiltinRelu:
-      return std::make_unique<ReLUOperationParser>(0);
+      return std::make_unique<ReLUOperationParser>(0, 0);
     case kTfLiteBuiltinRelu6:
-      return std::make_unique<ReLUOperationParser>(6);
+      return std::make_unique<ReLUOperationParser>(0, 6);
     case kTfLiteBuiltinReluN1To1:
-      return std::make_unique<ClampOperationsParser>(-1.0, 1.0);
+      return std::make_unique<ReLUOperationParser>(-1.0, 1.0);
     case kTfLiteBuiltinLeakyRelu:
-      return std::make_unique<ReLUOperationParser>(0);
+      return std::make_unique<ReLUOperationParser>(0, 0);
     case kTfLiteBuiltinPrelu:
       return std::make_unique<PReLUOperationParser>();
     case kTfLiteBuiltinReshape:
diff --git a/tensorflow/lite/delegates/gpu/common/model_builder_helper.cc b/tensorflow/lite/delegates/gpu/common/model_builder_helper.cc
index c825d2be691d5f..b498916bfed447 100644
--- a/tensorflow/lite/delegates/gpu/common/model_builder_helper.cc
+++ b/tensorflow/lite/delegates/gpu/common/model_builder_helper.cc
@@ -397,9 +397,12 @@ absl::Status MaybeFuseActivation(TfLiteFusedActivation fused_activation,
     case kTfLiteActReluN1To1:
     case kTfLiteActRelu6: {
       ReLUAttributes attr;
-      attr.clip = fused_activation == kTfLiteActRelu
-                      ? 0.0f
-                      : (fused_activation == kTfLiteActReluN1To1 ? 1.0f : 6.0f);
+      attr.activation_max =
+          fused_activation == kTfLiteActRelu
+              ? 0.0f
+              : (fused_activation == kTfLiteActReluN1To1 ? 1.0f : 6.0f);
+      attr.activation_min =
+          fused_activation == kTfLiteActReluN1To1 ? -1.0f : 0.0f;
       Node* activation_node;
       RETURN_IF_ERROR(
           NewPassthroughNode(graph, node, outputs[0], &activation_node));
diff --git a/tensorflow/lite/delegates/gpu/common/operations.h b/tensorflow/lite/delegates/gpu/common/operations.h
index 37d8dfa0fcdc6f..ac384bbe801ea2 100644
--- a/tensorflow/lite/delegates/gpu/common/operations.h
+++ b/tensorflow/lite/delegates/gpu/common/operations.h
@@ -391,18 +391,25 @@ Padding3D CalculateSamePadding(const BHWDC& input,
                                const DepthwiseConvolution3DAttributes& attr);
 
 // f(x):= {
-//   if x < 0  : x -> alpha * x
-//   if x >= 0 : x -> min(clip, x)
+//   if alpha != 0: x -> min(activation_max, x)
+//   else
+//     if x < activation_min : x -> min(activation_min, alpha * x)
+//     if x >= activation_min : x -> min(activation_max, x)
 // }
 //
 // Examples:
-//   - ReLU: clip = 0, alpha = 0
-//   - ReLU6: clip = 6, alpha = 0
-//   - Leaky ReLU: clip = 0, alpha = a
+//   - ReLU: activation_min = 0, activation_max = 0, alpha = 0
+//   - ReLU6: activation_min = 0, activation_max = 6, alpha = 0
+//   - Leaky ReLU: activation_min = 0, activation_max = 0, alpha = a
+//   - ReLUN1To1: activation_min = -1, activation_max = 1, alpha = 0
 struct ReLUAttributes {
-  // clip <= 0 mean it is not set.
-  float clip = 0;
+  // activation_min must be < activation_max
+  float activation_min = 0;
 
+  // activation_max <= 0 means it is not set.
+  float activation_max = 0;
+
+  // alpha must be <= 1
   float alpha = 0;
 };
 
diff --git a/tensorflow/lite/delegates/gpu/common/tasks/relu.cc b/tensorflow/lite/delegates/gpu/common/tasks/relu.cc
index 9afc2c7508eabb..861ba496cc2b72 100644
--- a/tensorflow/lite/delegates/gpu/common/tasks/relu.cc
+++ b/tensorflow/lite/delegates/gpu/common/tasks/relu.cc
@@ -35,16 +35,21 @@ ElementwiseDescriptor CreateReLU(const ReLUAttributes& attr,
       result.args.AddHalf("alpha", half(attr.alpha));
     }
   } else {
-    min_func = "INIT_FLT4(0.0f)";
+    min_func = "INIT_FLT4(args.activation_min)";
+    if (precision == CalculationsPrecision::F32) {
+      result.args.AddFloat("activation_min", attr.activation_min);
+    } else {
+      result.args.AddHalf("activation_min", half(attr.activation_min));
+    }
   }
-  if (attr.clip != 0.0f) {
+  if (attr.activation_max != 0.0f) {
     if (precision == CalculationsPrecision::F32) {
-      result.args.AddFloat("clip", attr.clip);
+      result.args.AddFloat("activation_max", attr.activation_max);
     } else {
-      result.args.AddHalf("clip", half(attr.clip));
+      result.args.AddHalf("activation_max", half(attr.activation_max));
     }
     result.code = absl::StrCat("out_value = clamp(in_value, " + min_func +
-                               ", INIT_FLT4(args.clip));");
+                               ", INIT_FLT4(args.activation_max));");
   } else {
     result.code = absl::StrCat("out_value = max(in_value, ", min_func, ");");
   }
diff --git a/tensorflow/lite/delegates/gpu/common/tasks/relu_test_util.cc b/tensorflow/lite/delegates/gpu/common/tasks/relu_test_util.cc
index 7c87bbb7823c0b..09a8c2110c2935 100644
--- a/tensorflow/lite/delegates/gpu/common/tasks/relu_test_util.cc
+++ b/tensorflow/lite/delegates/gpu/common/tasks/relu_test_util.cc
@@ -33,7 +33,7 @@ absl::Status ReLUNoClipNoAlphaTest(TestExecutionEnvironment* env) {
 
   ReLUAttributes attr;
   attr.alpha = 0.0f;
-  attr.clip = 0.0f;
+  attr.activation_max = 0.0f;
 
   for (auto precision : env->GetSupportedPrecisions()) {
     auto data_type = DeduceDataTypeFromPrecision(precision);
@@ -62,7 +62,7 @@ absl::Status ReLUClipTest(TestExecutionEnvironment* env) {
 
   ReLUAttributes attr;
   attr.alpha = 0.0f;
-  attr.clip = 0.9f;
+  attr.activation_max = 0.9f;
 
   for (auto precision : env->GetSupportedPrecisions()) {
     auto data_type = DeduceDataTypeFromPrecision(precision);
@@ -91,7 +91,7 @@ absl::Status ReLUAlphaTest(TestExecutionEnvironment* env) {
 
   ReLUAttributes attr;
   attr.alpha = 0.5f;
-  attr.clip = 0.0f;
+  attr.activation_max = 0.0f;
 
   for (auto precision : env->GetSupportedPrecisions()) {
     auto data_type = DeduceDataTypeFromPrecision(precision);
@@ -120,7 +120,7 @@ absl::Status ReLUAlphaClipTest(TestExecutionEnvironment* env) {
 
   ReLUAttributes attr;
   attr.alpha = 0.5f;
-  attr.clip = 0.5f;
+  attr.activation_max = 0.5f;
 
   for (auto precision : env->GetSupportedPrecisions()) {
     auto data_type = DeduceDataTypeFromPrecision(precision);
@@ -142,5 +142,133 @@ absl::Status ReLUAlphaClipTest(TestExecutionEnvironment* env) {
   return absl::OkStatus();
 }
 
+// Checks ReLU with only activation_min = -1 set: expected output is max(x, -1).
+absl::Status ReLUN1NoClipNoAlphaTest(TestExecutionEnvironment* env) {
+  TensorFloat32 src_tensor;
+  src_tensor.shape = BHWC(1, 2, 1, 4);
+  src_tensor.data = {-12.0f, -1.0f, -0.5f, 0.0f, 0.8f, -0.6f, 1.0f, 3.2f};
+
+  ReLUAttributes attr;
+  attr.alpha = 0.0f;
+  attr.activation_min = -1.0f;
+  attr.activation_max = 0.0f;
+
+  for (auto precision : env->GetSupportedPrecisions()) {
+    auto data_type = DeduceDataTypeFromPrecision(precision);  // Reused below.
+    for (auto storage : env->GetSupportedStorages(data_type)) {
+      const float eps = precision == CalculationsPrecision::F32 ? 1e-6f : 1e-3f;
+      OperationDef op_def;
+      op_def.precision = precision;
+      op_def.src_tensors.push_back({data_type, storage, Layout::HWC});
+      op_def.dst_tensors.push_back({data_type, storage, Layout::HWC});
+      TensorFloat32 dst_tensor;
+      GPUOperation operation = CreateReLU(op_def, attr);
+      RETURN_IF_ERROR(env->ExecuteGPUOperation(
+          src_tensor, std::make_unique<GPUOperation>(std::move(operation)),
+          BHWC(1, 2, 1, 4), &dst_tensor));
+      RETURN_IF_ERROR(
+          PointWiseNear({-1.0f, -1.0f, -0.5f, 0.0f, 0.8f, -0.6f, 1.0f, 3.2f},
+                        dst_tensor.data, eps));
+    }
+  }
+  return absl::OkStatus();
+}
+
+// Checks ReLU with activation_min = -1 and activation_max = 1: clamp to [-1, 1].
+absl::Status ReLUN1ClipTest(TestExecutionEnvironment* env) {
+  TensorFloat32 src_tensor;
+  src_tensor.shape = BHWC(1, 2, 1, 4);
+  src_tensor.data = {-12.0f, -1.0f, -0.5f, 0.0f, 0.8f, -0.6f, 1.0f, 3.2f};
+
+  ReLUAttributes attr;
+  attr.alpha = 0.0f;
+  attr.activation_min = -1.0f;
+  attr.activation_max = 1.0f;
+
+  for (auto precision : env->GetSupportedPrecisions()) {
+    auto data_type = DeduceDataTypeFromPrecision(precision);  // Reused below.
+    for (auto storage : env->GetSupportedStorages(data_type)) {
+      const float eps = precision == CalculationsPrecision::F32 ? 1e-6f : 1e-3f;
+      OperationDef op_def;
+      op_def.precision = precision;
+      op_def.src_tensors.push_back({data_type, storage, Layout::HWC});
+      op_def.dst_tensors.push_back({data_type, storage, Layout::HWC});
+      TensorFloat32 dst_tensor;
+      GPUOperation operation = CreateReLU(op_def, attr);
+      RETURN_IF_ERROR(env->ExecuteGPUOperation(
+          src_tensor, std::make_unique<GPUOperation>(std::move(operation)),
+          BHWC(1, 2, 1, 4), &dst_tensor));
+      RETURN_IF_ERROR(
+          PointWiseNear({-1.0f, -1.0f, -0.5f, 0.0f, 0.8f, -0.6f, 1.0f, 1.0f},
+                        dst_tensor.data, eps));
+    }
+  }
+  return absl::OkStatus();
+}
+
+// Checks ReLU with alpha = 1 and no upper clip: the input must come back as is.
+absl::Status ReLUN1AlphaTest(TestExecutionEnvironment* env) {
+  TensorFloat32 src_tensor;
+  src_tensor.shape = BHWC(1, 2, 1, 4);
+  src_tensor.data = {-12.0f, -1.0f, -0.5f, 0.0f, 0.8f, -0.6f, 1.0f, 3.2f};
+
+  ReLUAttributes attr;
+  attr.alpha = 1.0f;
+  attr.activation_min = -1.0f;  // activation_min ignored if alpha != 0
+  attr.activation_max = 0.0f;
+
+  for (auto precision : env->GetSupportedPrecisions()) {
+    auto data_type = DeduceDataTypeFromPrecision(precision);  // Reused below.
+    for (auto storage : env->GetSupportedStorages(data_type)) {
+      const float eps = precision == CalculationsPrecision::F32 ? 1e-6f : 1e-3f;
+      OperationDef op_def;
+      op_def.precision = precision;
+      op_def.src_tensors.push_back({data_type, storage, Layout::HWC});
+      op_def.dst_tensors.push_back({data_type, storage, Layout::HWC});
+      TensorFloat32 dst_tensor;
+      GPUOperation operation = CreateReLU(op_def, attr);
+      RETURN_IF_ERROR(env->ExecuteGPUOperation(
+          src_tensor, std::make_unique<GPUOperation>(std::move(operation)),
+          BHWC(1, 2, 1, 4), &dst_tensor));
+      RETURN_IF_ERROR(
+          PointWiseNear({-12.0f, -1.0f, -0.5f, 0.0f, 0.8f, -0.6f, 1.0f, 3.2f},
+                        dst_tensor.data, eps));
+    }
+  }
+  return absl::OkStatus();
+}
+
+// Checks ReLU with alpha = 1 and activation_max = 3: only 3.2 is clipped to 3.
+absl::Status ReLUN1AlphaClipTest(TestExecutionEnvironment* env) {
+  TensorFloat32 src_tensor;
+  src_tensor.shape = BHWC(1, 2, 1, 4);
+  src_tensor.data = {-12.0f, -1.0f, -0.5f, 0.0f, 0.8f, -0.6f, 1.0f, 3.2f};
+
+  ReLUAttributes attr;
+  attr.alpha = 1.0f;
+  attr.activation_min = -1.0f;  // activation_min ignored if alpha != 0
+  attr.activation_max = 3.0f;
+
+  for (auto precision : env->GetSupportedPrecisions()) {
+    auto data_type = DeduceDataTypeFromPrecision(precision);  // Reused below.
+    for (auto storage : env->GetSupportedStorages(data_type)) {
+      const float eps = precision == CalculationsPrecision::F32 ? 1e-6f : 1e-3f;
+      OperationDef op_def;
+      op_def.precision = precision;
+      op_def.src_tensors.push_back({data_type, storage, Layout::HWC});
+      op_def.dst_tensors.push_back({data_type, storage, Layout::HWC});
+      TensorFloat32 dst_tensor;
+      GPUOperation operation = CreateReLU(op_def, attr);
+      RETURN_IF_ERROR(env->ExecuteGPUOperation(
+          src_tensor, std::make_unique<GPUOperation>(std::move(operation)),
+          BHWC(1, 2, 1, 4), &dst_tensor));
+      RETURN_IF_ERROR(
+          PointWiseNear({-12.0f, -1.0f, -0.5f, 0.0f, 0.8f, -0.6f, 1.0f, 3.0f},
+                        dst_tensor.data, eps));
+    }
+  }
+  return absl::OkStatus();
+}
+
 }  // namespace gpu
 }  // namespace tflite
diff --git a/tensorflow/lite/delegates/gpu/common/tasks/relu_test_util.h b/tensorflow/lite/delegates/gpu/common/tasks/relu_test_util.h
index 92ed2eea5cbcd8..8c0a7023048962 100644
--- a/tensorflow/lite/delegates/gpu/common/tasks/relu_test_util.h
+++ b/tensorflow/lite/delegates/gpu/common/tasks/relu_test_util.h
@@ -26,6 +26,10 @@ absl::Status ReLUNoClipNoAlphaTest(TestExecutionEnvironment* env);
 absl::Status ReLUClipTest(TestExecutionEnvironment* env);
 absl::Status ReLUAlphaTest(TestExecutionEnvironment* env);
 absl::Status ReLUAlphaClipTest(TestExecutionEnvironment* env);
+absl::Status ReLUN1NoClipNoAlphaTest(TestExecutionEnvironment* env);
+absl::Status ReLUN1ClipTest(TestExecutionEnvironment* env);
+absl::Status ReLUN1AlphaTest(TestExecutionEnvironment* env);
+absl::Status ReLUN1AlphaClipTest(TestExecutionEnvironment* env);
 
 }  // namespace gpu
 }  // namespace tflite
diff --git a/tensorflow/lite/delegates/gpu/gl/BUILD b/tensorflow/lite/delegates/gpu/gl/BUILD
index 4768126863d078..7bc5cd2357d11c 100644
--- a/tensorflow/lite/delegates/gpu/gl/BUILD
+++ b/tensorflow/lite/delegates/gpu/gl/BUILD
@@ -1,9 +1,9 @@
 load("@flatbuffers//:build_defs.bzl", "flatbuffer_cc_library")
-load("//tensorflow/lite:special_rules.bzl", "tflite_portable_test_suite")
 load(
     "//tensorflow/core/platform:build_config_root.bzl",
     "tf_gpu_tests_tags",
 )
+load("//tensorflow/lite:special_rules.bzl", "tflite_portable_test_suite")
 load("//tensorflow/lite/delegates/gpu:build_defs.bzl", "gpu_delegate_linkopts")
 
 package(
@@ -242,6 +242,7 @@ cc_test(
     linkopts = [
         "-lEGL",
         "-lGLESv2",
+        "-lm",
     ],
     tags = tf_gpu_tests_tags() + [
         "local",
@@ -475,6 +476,9 @@ cc_library(
 cc_test(
     name = "serialization_test",
     srcs = ["serialization_test.cc"],
+    linkopts = [
+        "-lm",
+    ],
     tags = [
         "local",
         "nobuilder",
diff --git a/tensorflow/lite/delegates/gpu/gl/kernels/BUILD b/tensorflow/lite/delegates/gpu/gl/kernels/BUILD
index 1c920b92b8dab7..169e0cd75c59f3 100644
--- a/tensorflow/lite/delegates/gpu/gl/kernels/BUILD
+++ b/tensorflow/lite/delegates/gpu/gl/kernels/BUILD
@@ -1,8 +1,8 @@
-load("//tensorflow/lite:special_rules.bzl", "tflite_portable_test_suite_combined")
 load(
     "//tensorflow/core/platform:build_config_root.bzl",
     "tf_gpu_tests_tags",
 )
+load("//tensorflow/lite:special_rules.bzl", "tflite_portable_test_suite_combined")
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
@@ -45,6 +45,7 @@ cc_test(
     ],
     deps = [
         ":converter",
+        ":test_util",
         "//tensorflow/lite/delegates/gpu/common:convert",
         "//tensorflow/lite/delegates/gpu/common:shape",
         "//tensorflow/lite/delegates/gpu/common:status",
@@ -721,10 +722,18 @@ cc_library(
     testonly = 1,
     srcs = ["test_util.cc"],
     hdrs = ["test_util.h"],
-    linkopts = [
-        "-lEGL",
-        "-lGLESv3",
-    ],
+    linkopts = select({
+        "//tensorflow:android": [
+            "-lEGL",
+            "-lGLESv3",
+            "-ldl",
+            "-lm",
+        ],
+        "//conditions:default": [
+            "-lEGL",
+            "-lGLESv3",
+        ],
+    }),
     deps = [
         "//tensorflow/lite/delegates/gpu/common:model",
         "//tensorflow/lite/delegates/gpu/common:operations",
diff --git a/tensorflow/lite/delegates/gpu/gl/kernels/relu.cc b/tensorflow/lite/delegates/gpu/gl/kernels/relu.cc
index 6d05ea894d4cfe..5bfd0517df68eb 100644
--- a/tensorflow/lite/delegates/gpu/gl/kernels/relu.cc
+++ b/tensorflow/lite/delegates/gpu/gl/kernels/relu.cc
@@ -39,21 +39,22 @@ class ReLU : public NodeShader {
   absl::Status GenerateCode(const GenerationContext& ctx,
                             GeneratedCode* generated_code) const final {
     const auto& attr = std::any_cast<const ReLUAttributes&>(ctx.op_attr);
-    // clamp(value, min(0, alpha * value), clip)
+    // clamp(value, min(0, alpha * value), activation_max)
     std::vector<Variable> params;
     std::string min;
     if (attr.alpha == 0) {
-      min = "vec4(0.0)";
+      min = "vec4($activation_min$)";
+      params.push_back({"activation_min", attr.activation_min});
     } else {
       min = "min($alpha$ * value_0, 0.0)";
       params.push_back({"alpha", attr.alpha});
     }
     std::string code;
-    if (attr.clip == 0) {
+    if (attr.activation_max == 0) {
       code = "value_0 = max(value_0, " + min + ");";
     } else {
-      code = "value_0 = clamp(value_0, " + min + ", vec4($clip$));";
-      params.push_back({"clip", attr.clip});
+      code = "value_0 = clamp(value_0, " + min + ", vec4($activation_max$));";
+      params.push_back({"activation_max", attr.activation_max});
     }
     *generated_code = {
         /*parameters=*/std::move(params),
diff --git a/tensorflow/lite/delegates/gpu/gl/kernels/relu_test.cc b/tensorflow/lite/delegates/gpu/gl/kernels/relu_test.cc
index 0c2d2536fd654e..035555656b2a3a 100644
--- a/tensorflow/lite/delegates/gpu/gl/kernels/relu_test.cc
+++ b/tensorflow/lite/delegates/gpu/gl/kernels/relu_test.cc
@@ -45,7 +45,7 @@ class ReluTest : public ::testing::Test {
 TEST_F(ReluTest, Smoke) {
   OperationType op_type = OperationType::RELU;
   ReLUAttributes attr;
-  attr.clip = 0;
+  attr.activation_max = 0;
   attr.alpha = 0;
   SingleOpModel model({ToString(op_type), attr}, {GetTensorRef(0)},
                       {GetTensorRef(1)});
@@ -58,7 +58,7 @@ TEST_F(ReluTest, Smoke) {
 TEST_F(ReluTest, ClipOnly) {
   OperationType op_type = OperationType::RELU;
   ReLUAttributes attr;
-  attr.clip = 6;
+  attr.activation_max = 6;
   attr.alpha = 0;
   SingleOpModel model({ToString(op_type), attr}, {GetTensorRef(0)},
                       {GetTensorRef(1)});
@@ -71,7 +71,7 @@ TEST_F(ReluTest, ClipOnly) {
 TEST_F(ReluTest, AlphaOnly) {
   OperationType op_type = OperationType::RELU;
   ReLUAttributes attr;
-  attr.clip = 0;
+  attr.activation_max = 0;
   attr.alpha = 0.5;
   SingleOpModel model({ToString(op_type), attr}, {GetTensorRef(0)},
                       {GetTensorRef(1)});
@@ -84,7 +84,63 @@ TEST_F(ReluTest, AlphaOnly) {
 TEST_F(ReluTest, ClipAndAlpha) {
   OperationType op_type = OperationType::RELU;
   ReLUAttributes attr;
-  attr.clip = 6;
+  attr.activation_max = 6;
+  attr.alpha = 0.5;
+  SingleOpModel model({ToString(op_type), attr}, {GetTensorRef(0)},
+                      {GetTensorRef(1)});
+  ASSERT_TRUE(model.PopulateTensor(0, {-6.0, 0.0, 2.0, 8.0}));
+  ASSERT_OK(model.Invoke(*NewReLUNodeShader()));
+  EXPECT_THAT(model.GetOutput(0),
+              Pointwise(FloatNear(1e-6), {-3.0, 0.0, 2.0, 6.0}));
+}
+
+TEST_F(ReluTest, ReLUN1Smoke) {  // activation_min only: expects max(x, -1).
+  OperationType op_type = OperationType::RELU;
+  ReLUAttributes attr;
+  attr.activation_min = -1;  // Lower bound; the shader uses it when alpha == 0.
+  attr.activation_max = 0;  // 0 selects the max()-only shader code path.
+  attr.alpha = 0;
+  SingleOpModel model({ToString(op_type), attr}, {GetTensorRef(0)},
+                      {GetTensorRef(1)});
+  ASSERT_TRUE(model.PopulateTensor(0, {-12.0f, -0.5f, 0.8f, 3.2f}));
+  ASSERT_OK(model.Invoke(*NewReLUNodeShader()));
+  EXPECT_THAT(model.GetOutput(0),  // Only -12 is raised to -1.
+              Pointwise(FloatNear(1e-6), {-1.0f, -0.5f, 0.8f, 3.2f}));
+}
+
+TEST_F(ReluTest, ReLUN1ClipOnly) {  // Both bounds set: expects clamp to [-1, 1].
+  OperationType op_type = OperationType::RELU;
+  ReLUAttributes attr;
+  attr.activation_min = -1;  // Lower bound; the shader uses it when alpha == 0.
+  attr.activation_max = 1;  // Non-zero selects the clamp() shader code path.
+  attr.alpha = 0;
+  SingleOpModel model({ToString(op_type), attr}, {GetTensorRef(0)},
+                      {GetTensorRef(1)});
+  ASSERT_TRUE(model.PopulateTensor(0, {-12.0f, -0.5f, 0.8f, 3.2f}));
+  ASSERT_OK(model.Invoke(*NewReLUNodeShader()));
+  EXPECT_THAT(model.GetOutput(0),  // -12 is raised to -1, 3.2 is cut to 1.
+              Pointwise(FloatNear(1e-6), {-1.0f, -0.5f, 0.8f, 1.0f}));
+}
+
+TEST_F(ReluTest, ReLUN1AlphaOnly) {  // alpha set: activation_min has no effect.
+  OperationType op_type = OperationType::RELU;
+  ReLUAttributes attr;
+  attr.activation_min = -1;  // activation_min ignored if alpha != 0
+  attr.activation_max = 0;  // 0 selects the max()-only shader code path.
+  attr.alpha = 0.5;  // Floor becomes min(alpha * x, 0) in the shader.
+  SingleOpModel model({ToString(op_type), attr}, {GetTensorRef(0)},
+                      {GetTensorRef(1)});
+  ASSERT_TRUE(model.PopulateTensor(0, {-6.0, 0.0, 2.0, 8.0}));
+  ASSERT_OK(model.Invoke(*NewReLUNodeShader()));
+  EXPECT_THAT(model.GetOutput(0),  // -6 is scaled by alpha to -3, not cut to -1.
+              Pointwise(FloatNear(1e-6), {-3.0, 0.0, 2.0, 8.0}));
+}
+
+TEST_F(ReluTest, ReLUN1ClipAndAlpha) {
+  OperationType op_type = OperationType::RELU;
+  ReLUAttributes attr;
+  attr.activation_min = -1;  // activation_min ignored if alpha != 0
+  attr.activation_max = 6;
   attr.alpha = 0.5;
   SingleOpModel model({ToString(op_type), attr}, {GetTensorRef(0)},
                       {GetTensorRef(1)});

From b321e8c40df25382c09824afa2571ea162670aba Mon Sep 17 00:00:00 2001
From: Antonio Sanchez <cantonios@google.com>
Date: Tue, 3 Oct 2023 11:42:46 -0700
Subject: [PATCH 533/567] Fix deadlock issue in ml_dtypes loading.

There's a race condition between the python GIL and static loading of
ml_dtypes type numbers from C++.  To resolve this, we need to
explicitly release and re-acquire the GIL.

PiperOrigin-RevId: 570451242
---
 .../third_party/tsl/tsl/python/lib/core/BUILD |   4 +-
 .../tsl/tsl/python/lib/core/ml_dtypes.cc      | 113 +++++++++++++++---
 .../tsl/tsl/python/lib/core/ml_dtypes.h       |   8 +-
 3 files changed, 99 insertions(+), 26 deletions(-)

diff --git a/third_party/xla/third_party/tsl/tsl/python/lib/core/BUILD b/third_party/xla/third_party/tsl/tsl/python/lib/core/BUILD
index 988d0978542f59..72dff69dc0023f 100644
--- a/third_party/xla/third_party/tsl/tsl/python/lib/core/BUILD
+++ b/third_party/xla/third_party/tsl/tsl/python/lib/core/BUILD
@@ -33,8 +33,10 @@ cc_library(
     ],
     visibility = ["//visibility:public"],
     deps = [
+        ":numpy",
+        "//third_party/py/numpy:headers",
         "//third_party/python_runtime:headers",  # build_cleaner: keep; DNR: b/35864863
-        "//tsl/python/lib/core:numpy",
+        "@com_google_absl//absl/base",
         "@pybind11",
     ],
 )
diff --git a/third_party/xla/third_party/tsl/tsl/python/lib/core/ml_dtypes.cc b/third_party/xla/third_party/tsl/tsl/python/lib/core/ml_dtypes.cc
index b9a09c6b9d1f45..7a171eac1ae34d 100644
--- a/third_party/xla/third_party/tsl/tsl/python/lib/core/ml_dtypes.cc
+++ b/third_party/xla/third_party/tsl/tsl/python/lib/core/ml_dtypes.cc
@@ -14,35 +14,110 @@ limitations under the License.
 ==============================================================================*/
 #include "tsl/python/lib/core/ml_dtypes.h"
 
+#include <exception>
+
+#include "numpy/ndarraytypes.h"
+#include "absl/base/call_once.h"
+#include "pybind11/gil.h"  // from @pybind11
 #include "pybind11/numpy.h"  // from @pybind11
 #include "pybind11/pybind11.h"  // from @pybind11
+#include "tsl/python/lib/core/numpy.h"  // IWYU pragma: keep
 
 namespace tsl {
 namespace ml_dtypes {
 
 namespace py = pybind11;
 
+namespace {
+
+class MlDtypesInitInfo {  // ml_dtypes numpy type numbers plus load status.
+ public:
+  MlDtypesInitInfo()
+      : np_dtypes_{/*bfloat16=*/NPY_NOTYPE,
+                   /*float8_e4m3fn=*/NPY_NOTYPE,
+                   /*float8_e4m3b11fnuz=*/NPY_NOTYPE,
+                   /*float8_e4m3fnuz=*/NPY_NOTYPE,
+                   /*float8_e5m2=*/NPY_NOTYPE,
+                   /*float8_e5m2fnuz=*/NPY_NOTYPE,
+                   /*int4=*/NPY_NOTYPE,
+                   /*uint4=*/NPY_NOTYPE},
+        valid_{false} {}
+
+  void Init() {  // Imports ml_dtypes and records each dtype's type number.
+    valid_ = true;
+    // Keep the GIL until return: the catch handler below calls py::print.
+    py::gil_scoped_acquire acquire;
+    try {  // Pybind11 might throw.
+      py::module ml_dtypes = py::module::import("ml_dtypes");
+      np_dtypes_.bfloat16 =
+          py::dtype::from_args(ml_dtypes.attr("bfloat16")).num();
+      np_dtypes_.float8_e4m3fn =
+          py::dtype::from_args(ml_dtypes.attr("float8_e4m3fn")).num();
+      np_dtypes_.float8_e5m2 =
+          py::dtype::from_args(ml_dtypes.attr("float8_e5m2")).num();
+      np_dtypes_.float8_e4m3b11fnuz =
+          py::dtype::from_args(ml_dtypes.attr("float8_e4m3b11fnuz")).num();
+      np_dtypes_.float8_e4m3fnuz =
+          py::dtype::from_args(ml_dtypes.attr("float8_e4m3fnuz")).num();
+      np_dtypes_.float8_e5m2fnuz =
+          py::dtype::from_args(ml_dtypes.attr("float8_e5m2fnuz")).num();
+      np_dtypes_.int4 = py::dtype::from_args(ml_dtypes.attr("int4")).num();
+      np_dtypes_.uint4 = py::dtype::from_args(ml_dtypes.attr("uint4")).num();
+    } catch (const std::exception& e) {
+      py::print(e.what());
+      valid_ = false;
+    }
+
+    // Verify all types were successfully loaded.
+    if (np_dtypes_.bfloat16 == NPY_NOTYPE ||
+        np_dtypes_.float8_e4m3fn == NPY_NOTYPE ||
+        np_dtypes_.float8_e4m3fnuz == NPY_NOTYPE ||
+        np_dtypes_.float8_e4m3b11fnuz == NPY_NOTYPE ||
+        np_dtypes_.float8_e5m2 == NPY_NOTYPE ||
+        np_dtypes_.float8_e5m2fnuz == NPY_NOTYPE ||
+        np_dtypes_.int4 == NPY_NOTYPE || np_dtypes_.uint4 == NPY_NOTYPE) {
+      valid_ = false;
+    }
+  }
+
+  bool IsValid() const { return valid_; }
+
+  const NumpyDtypes& GetNumpyDtypes() const { return np_dtypes_; }
+
+ private:
+  NumpyDtypes np_dtypes_;  // Numpy type numbers.
+  bool valid_;             // Stores whether type loading was valid.
+};
+
+// Safely initialize the ml_dtypes module and load the numpy dtype information.
+const MlDtypesInitInfo& GetMlDtypesInitInfo() {
+  static MlDtypesInitInfo info;
+
+  // We must take special care in initializing the ml_dtypes module
+  // since there is a potential race condition between the python
+  // GIL and any synchronization mechanism we attempt to use (b/302750630).
+  // We also want to avoid unnecessarily locking the GIL if possible.
+  static bool initialized = false;  // NOTE(review): plain bool, not atomic.
+  if (!initialized) {  // Fast path: skips the GIL release once the flag is set.
+    auto init = [&]() { info.Init(); };  // Init() re-acquires the GIL itself.
+    // NOTE(review): if callers can race here, `initialized` needs std::atomic.
+    // GIL must be released prior to attempting synchronization.
+    py::gil_scoped_release release;
+    static absl::once_flag init_flag;
+    absl::call_once(init_flag, init);  // Runs init at most once.
+    initialized = true;  // Written only after call_once has returned.
+  }
+
+  return info;
+}
+
+}  // namespace
+
 const NumpyDtypes& GetNumpyDtypes() {
-  static const NumpyDtypes* numpy_dtypes = []() {
-    py::module ml_dtypes = py::module::import("ml_dtypes");
-    NumpyDtypes* dtypes = new NumpyDtypes();
-    dtypes->bfloat16 = py::dtype::from_args(ml_dtypes.attr("bfloat16")).num();
-    dtypes->float8_e4m3fn =
-        py::dtype::from_args(ml_dtypes.attr("float8_e4m3fn")).num();
-    dtypes->float8_e5m2 =
-        py::dtype::from_args(ml_dtypes.attr("float8_e5m2")).num();
-    dtypes->float8_e4m3b11fnuz =
-        py::dtype::from_args(ml_dtypes.attr("float8_e4m3b11fnuz")).num();
-    dtypes->float8_e4m3fnuz =
-        py::dtype::from_args(ml_dtypes.attr("float8_e4m3fnuz")).num();
-    dtypes->float8_e5m2fnuz =
-        py::dtype::from_args(ml_dtypes.attr("float8_e5m2fnuz")).num();
-    dtypes->int4 = py::dtype::from_args(ml_dtypes.attr("int4")).num();
-    dtypes->uint4 = py::dtype::from_args(ml_dtypes.attr("uint4")).num();
-    return dtypes;
-  }();
-  return *numpy_dtypes;
+  return GetMlDtypesInitInfo().GetNumpyDtypes();
 }
 
+bool RegisterTypes() { return GetMlDtypesInitInfo().IsValid(); }
+
 }  // namespace ml_dtypes
 }  // namespace tsl
diff --git a/third_party/xla/third_party/tsl/tsl/python/lib/core/ml_dtypes.h b/third_party/xla/third_party/tsl/tsl/python/lib/core/ml_dtypes.h
index 6164901d0a7c4b..eec117d3caf378 100644
--- a/third_party/xla/third_party/tsl/tsl/python/lib/core/ml_dtypes.h
+++ b/third_party/xla/third_party/tsl/tsl/python/lib/core/ml_dtypes.h
@@ -33,13 +33,9 @@ struct NumpyDtypes {
   int uint4;
 };
 
-const NumpyDtypes& GetNumpyDtypes();
+bool RegisterTypes();
 
-// Deprecated: no longer required, but is currently heavily used.
-inline bool RegisterTypes() {
-  GetNumpyDtypes();
-  return true;
-}
+const NumpyDtypes& GetNumpyDtypes();
 
 inline int GetBfloat16TypeNum() { return GetNumpyDtypes().bfloat16; }
 

From 6dfbe1a5e0ae7d4863543d6b83a1f528fe130726 Mon Sep 17 00:00:00 2001
From: Christian Sigg <csigg@google.com>
Date: Tue, 3 Oct 2023 11:46:35 -0700
Subject: [PATCH 534/567] [XLA:GPU] Remove cuda_executor.h, rocm_gpu_executor.h
 and targets.

They don't provide anything useful.

PiperOrigin-RevId: 570452290
---
 .../xla/xla/stream_executor/cuda/BUILD        | 16 ---------
 .../xla/xla/stream_executor/cuda/cuda_blas.cc |  1 -
 .../xla/xla/stream_executor/cuda/cuda_dnn.cc  |  2 +-
 .../xla/stream_executor/cuda/cuda_event.cc    |  2 +-
 .../xla/stream_executor/cuda/cuda_executor.cc |  2 --
 .../xla/stream_executor/cuda/cuda_executor.h  | 35 -------------------
 .../xla/xla/stream_executor/cuda/cuda_fft.cc  |  2 +-
 .../xla/stream_executor/cuda/cuda_platform.cc |  2 +-
 .../xla/xla/stream_executor/rocm/BUILD        | 13 -------
 .../stream_executor/rocm/rocm_gpu_executor.cc |  2 --
 .../stream_executor/rocm/rocm_gpu_executor.h  | 35 -------------------
 11 files changed, 4 insertions(+), 108 deletions(-)
 delete mode 100644 third_party/xla/xla/stream_executor/cuda/cuda_executor.h
 delete mode 100644 third_party/xla/xla/stream_executor/rocm/rocm_gpu_executor.h

diff --git a/third_party/xla/xla/stream_executor/cuda/BUILD b/third_party/xla/xla/stream_executor/cuda/BUILD
index 2aa3e04098ab28..598bdd8329f278 100644
--- a/third_party/xla/xla/stream_executor/cuda/BUILD
+++ b/third_party/xla/xla/stream_executor/cuda/BUILD
@@ -212,18 +212,6 @@ cc_library(
     ]),
 )
 
-cc_library(
-    name = "cuda_executor_header",
-    textual_hdrs = if_cuda_is_configured(["cuda_executor.h"]),
-    visibility = ["//visibility:public"],
-    deps = if_cuda_is_configured([
-        ":cuda_kernel",
-        "//xla/stream_executor:stream_executor_headers",
-        "//xla/stream_executor/gpu:gpu_executor_header",
-        "//xla/stream_executor/platform",
-    ]),
-)
-
 cc_library(
     name = "cublas_lt_header",
     hdrs = if_cuda_is_configured([
@@ -305,7 +293,6 @@ cc_library(
     visibility = ["//visibility:public"],
     deps = if_cuda_is_configured([
         ":cuda_activation_header",
-        ":cuda_executor_header",
         ":cuda_platform_id",
         ":cuda_stream",
         ":cuda_helpers",
@@ -437,7 +424,6 @@ cc_library(
     visibility = ["//visibility:public"],
     deps = if_cuda_is_configured([
         ":cuda_driver",
-        ":cuda_executor_header",
         ":cuda_stream",
         "//xla/stream_executor:stream_executor_headers",
         "//xla/stream_executor/gpu:gpu_event",
@@ -452,7 +438,6 @@ cc_library(
     visibility = ["//visibility:public"],
     deps = if_cuda_is_configured([
         ":cuda_driver",
-        ":cuda_executor_header",
         "//xla/stream_executor:stream_executor_headers",
         "//xla/stream_executor/gpu:gpu_stream",
         "//xla/stream_executor/platform",
@@ -478,7 +463,6 @@ cc_library(
 cc_library(
     name = "cuda_executor",
     srcs = if_cuda_is_configured(["cuda_executor.cc"]),
-    hdrs = if_cuda_is_configured(["cuda_executor.h"]),
     visibility = ["//visibility:public"],
     deps = if_cuda_is_configured([
         ":cuda_activation",
diff --git a/third_party/xla/xla/stream_executor/cuda/cuda_blas.cc b/third_party/xla/xla/stream_executor/cuda/cuda_blas.cc
index 831798624e46d8..a790acadbe8971 100644
--- a/third_party/xla/xla/stream_executor/cuda/cuda_blas.cc
+++ b/third_party/xla/xla/stream_executor/cuda/cuda_blas.cc
@@ -25,7 +25,6 @@ limitations under the License.
 #include "third_party/gpus/cuda/include/cuda.h"
 #include "xla/stream_executor/cuda/cuda_activation.h"
 #include "xla/stream_executor/cuda/cuda_blas_utils.h"
-#include "xla/stream_executor/cuda/cuda_executor.h"
 #include "xla/stream_executor/cuda/cuda_helpers.h"
 #include "xla/stream_executor/cuda/cuda_platform_id.h"
 #include "xla/stream_executor/cuda/cuda_stream.h"
diff --git a/third_party/xla/xla/stream_executor/cuda/cuda_dnn.cc b/third_party/xla/xla/stream_executor/cuda/cuda_dnn.cc
index 6500658e26f735..a7519e660dcbe5 100644
--- a/third_party/xla/xla/stream_executor/cuda/cuda_dnn.cc
+++ b/third_party/xla/xla/stream_executor/cuda/cuda_dnn.cc
@@ -39,10 +39,10 @@ limitations under the License.
 #include "xla/stream_executor/cuda/cuda_activation.h"
 #include "xla/stream_executor/cuda/cuda_diagnostics.h"
 #include "xla/stream_executor/cuda/cuda_driver.h"
-#include "xla/stream_executor/cuda/cuda_executor.h"
 #include "xla/stream_executor/cuda/cuda_platform_id.h"
 #include "xla/stream_executor/cuda/cuda_stream.h"
 #include "xla/stream_executor/dnn.h"
+#include "xla/stream_executor/gpu/gpu_executor.h"
 #include "xla/stream_executor/gpu/gpu_timer.h"
 #include "xla/stream_executor/numeric_options.h"
 #include "xla/stream_executor/platform/initialize.h"
diff --git a/third_party/xla/xla/stream_executor/cuda/cuda_event.cc b/third_party/xla/xla/stream_executor/cuda/cuda_event.cc
index 8ca395e4c951ec..1a61f500af2859 100644
--- a/third_party/xla/xla/stream_executor/cuda/cuda_event.cc
+++ b/third_party/xla/xla/stream_executor/cuda/cuda_event.cc
@@ -15,8 +15,8 @@ limitations under the License.
 
 #include "xla/stream_executor/cuda/cuda_event.h"
 
-#include "xla/stream_executor/cuda/cuda_executor.h"
 #include "xla/stream_executor/cuda/cuda_stream.h"
+#include "xla/stream_executor/gpu/gpu_executor.h"
 #include "tsl/platform/statusor.h"
 
 namespace stream_executor {
diff --git a/third_party/xla/xla/stream_executor/cuda/cuda_executor.cc b/third_party/xla/xla/stream_executor/cuda/cuda_executor.cc
index a3e88e65f2f524..23d8a93b6bb135 100644
--- a/third_party/xla/xla/stream_executor/cuda/cuda_executor.cc
+++ b/third_party/xla/xla/stream_executor/cuda/cuda_executor.cc
@@ -13,8 +13,6 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "xla/stream_executor/cuda/cuda_executor.h"
-
 #include <cstdint>
 #include <memory>
 #include <utility>
diff --git a/third_party/xla/xla/stream_executor/cuda/cuda_executor.h b/third_party/xla/xla/stream_executor/cuda/cuda_executor.h
deleted file mode 100644
index 8d7ee5d9cd3a0b..00000000000000
--- a/third_party/xla/xla/stream_executor/cuda/cuda_executor.h
+++ /dev/null
@@ -1,35 +0,0 @@
-/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-// The CUDA implementation of the StreamExecutorInterface functionality.
-// CUDA inclusions are ideally confined to this implementation file.
-//
-// The notions from the StreamExecutor basically correspond to the CUDA streams
-// programming model provided by the libcuda.so driver APIs, so we don't have
-// to do much more than wrap the calls to the libraries appropriately.
-#ifndef XLA_STREAM_EXECUTOR_CUDA_CUDA_EXECUTOR_H_
-#define XLA_STREAM_EXECUTOR_CUDA_CUDA_EXECUTOR_H_
-
-#include "xla/stream_executor/gpu/gpu_executor.h"
-
-namespace stream_executor {
-namespace cuda {
-
-using CUDAExecutor = gpu::GpuExecutor;
-
-}  // namespace cuda
-}  // namespace stream_executor
-
-#endif  // XLA_STREAM_EXECUTOR_CUDA_CUDA_EXECUTOR_H_
diff --git a/third_party/xla/xla/stream_executor/cuda/cuda_fft.cc b/third_party/xla/xla/stream_executor/cuda/cuda_fft.cc
index 4aa37fd6713908..1d84b9a1819e15 100644
--- a/third_party/xla/xla/stream_executor/cuda/cuda_fft.cc
+++ b/third_party/xla/xla/stream_executor/cuda/cuda_fft.cc
@@ -21,11 +21,11 @@ limitations under the License.
 
 #include "absl/strings/str_cat.h"
 #include "xla/stream_executor/cuda/cuda_activation.h"
-#include "xla/stream_executor/cuda/cuda_executor.h"
 #include "xla/stream_executor/cuda/cuda_helpers.h"
 #include "xla/stream_executor/cuda/cuda_platform_id.h"
 #include "xla/stream_executor/cuda/cuda_stream.h"
 #include "xla/stream_executor/device_memory.h"
+#include "xla/stream_executor/gpu/gpu_executor.h"
 #include "xla/stream_executor/platform/initialize.h"
 #include "xla/stream_executor/platform/port.h"
 #include "xla/stream_executor/plugin_registry.h"
diff --git a/third_party/xla/xla/stream_executor/cuda/cuda_platform.cc b/third_party/xla/xla/stream_executor/cuda/cuda_platform.cc
index 9e81e0eb8c9eb2..c4153854b0b03e 100644
--- a/third_party/xla/xla/stream_executor/cuda/cuda_platform.cc
+++ b/third_party/xla/xla/stream_executor/cuda/cuda_platform.cc
@@ -21,8 +21,8 @@ limitations under the License.
 #include "absl/strings/str_cat.h"
 #include "absl/strings/str_format.h"
 #include "xla/stream_executor/cuda/cuda_driver.h"
-#include "xla/stream_executor/cuda/cuda_executor.h"
 #include "xla/stream_executor/cuda/cuda_platform_id.h"
+#include "xla/stream_executor/gpu/gpu_executor.h"
 #include "xla/stream_executor/platform/initialize.h"
 #include "tsl/platform/errors.h"
 #include "tsl/platform/status.h"
diff --git a/third_party/xla/xla/stream_executor/rocm/BUILD b/third_party/xla/xla/stream_executor/rocm/BUILD
index 361216524667a7..98ff3e58f5037e 100644
--- a/third_party/xla/xla/stream_executor/rocm/BUILD
+++ b/third_party/xla/xla/stream_executor/rocm/BUILD
@@ -98,7 +98,6 @@ cc_library(
 cc_library(
     name = "rocm_gpu_executor",
     srcs = if_rocm_is_configured(["rocm_gpu_executor.cc"]),
-    hdrs = if_rocm_is_configured(["rocm_gpu_executor.h"]),
     visibility = ["//visibility:public"],
     deps = if_rocm_is_configured([
         ":rocm_diagnostics",
@@ -123,18 +122,6 @@ cc_library(
     alwayslink = True,
 )
 
-cc_library(
-    name = "rocm_gpu_executor_header",
-    textual_hdrs = if_rocm_is_configured(["rocm_gpu_executor.h"]),
-    visibility = ["//visibility:public"],
-    deps = if_rocm_is_configured([
-        ":rocm_kernel",
-        "//xla/stream_executor",
-        "//xla/stream_executor/gpu:gpu_executor_header",
-        "//xla/stream_executor/platform",
-    ]),
-)
-
 cc_library(
     name = "rocm_kernel",
     srcs = if_rocm_is_configured(["rocm_kernel.cc"]),
diff --git a/third_party/xla/xla/stream_executor/rocm/rocm_gpu_executor.cc b/third_party/xla/xla/stream_executor/rocm/rocm_gpu_executor.cc
index 389b4ba6c390e0..989ae0cb7b0280 100644
--- a/third_party/xla/xla/stream_executor/rocm/rocm_gpu_executor.cc
+++ b/third_party/xla/xla/stream_executor/rocm/rocm_gpu_executor.cc
@@ -12,8 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "xla/stream_executor/rocm/rocm_gpu_executor.h"
-
 #include <unistd.h>
 
 #include <memory>
diff --git a/third_party/xla/xla/stream_executor/rocm/rocm_gpu_executor.h b/third_party/xla/xla/stream_executor/rocm/rocm_gpu_executor.h
deleted file mode 100644
index 43ab5254012392..00000000000000
--- a/third_party/xla/xla/stream_executor/rocm/rocm_gpu_executor.h
+++ /dev/null
@@ -1,35 +0,0 @@
-/* Copyright 2023 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-// The ROCm implementation of the StreamExecutorInterface functionality.
-// ROCm inclusions are ideally confined to this implementation file.
-//
-// The notions from the StreamExecutor basically correspond to the ROCm streams
-// programming model provided by the librocm.so driver APIs, so we don't have
-// to do much more than wrap the calls to the libraries appropriately.
-#ifndef XLA_STREAM_EXECUTOR_ROCM_ROCM_GPU_EXECUTOR_H_
-#define XLA_STREAM_EXECUTOR_ROCM_ROCM_GPU_EXECUTOR_H_
-
-#include "xla/stream_executor/gpu/gpu_executor.h"
-
-namespace stream_executor {
-namespace rocm {
-
-using ROCMExecutor = gpu::GpuExecutor;
-
-}  // namespace rocm
-}  // namespace stream_executor
-
-#endif  // XLA_STREAM_EXECUTOR_ROCM_ROCM_GPU_EXECUTOR_H_

From 444412889f8d48be6156abbce0cfb215d25ef81a Mon Sep 17 00:00:00 2001
From: Mason Chang <masonchang@google.com>
Date: Tue, 3 Oct 2023 12:23:56 -0700
Subject: [PATCH 535/567] Use TF Dialect to TF Executor Bridge API instead of
 adding graph passes. Should be a no-op since these passes were just pulled
 into their own API instead.

PiperOrigin-RevId: 570462504
---
 .../compiler/mlir/tensorflow/transforms/BUILD |  1 +
 .../mlir/tensorflow/transforms/bridge.cc      | 82 +++++++------------
 .../mlir/tensorflow/transforms/passes.h       |  4 -
 tensorflow/compiler/mlir/tf2xla/api/v2/BUILD  |  6 +-
 4 files changed, 37 insertions(+), 56 deletions(-)

diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/BUILD b/tensorflow/compiler/mlir/tensorflow/transforms/BUILD
index 117e475e431bdf..77a949471170bb 100644
--- a/tensorflow/compiler/mlir/tensorflow/transforms/BUILD
+++ b/tensorflow/compiler/mlir/tensorflow/transforms/BUILD
@@ -690,6 +690,7 @@ cc_library(
         "//tensorflow/compiler/mlir/tensorflow:dump_mlir_util",
         "//tensorflow/compiler/mlir/tensorflow:error_util",
         "//tensorflow/compiler/mlir/tensorflow:tensorflow_types",
+        "//tensorflow/compiler/mlir/tf2xla/api/v2:tf_dialect_to_executor",
         "//tensorflow/compiler/mlir/tf2xla/internal:clustering_bridge_passes",
         "//tensorflow/compiler/mlir/tf2xla/internal/inference:inference_metrics_pass",
         "//tensorflow/core:framework",
diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/bridge.cc b/tensorflow/compiler/mlir/tensorflow/transforms/bridge.cc
index 210ff858e84286..d5fd74479b592f 100644
--- a/tensorflow/compiler/mlir/tensorflow/transforms/bridge.cc
+++ b/tensorflow/compiler/mlir/tensorflow/transforms/bridge.cc
@@ -29,6 +29,7 @@ limitations under the License.
 #include "tensorflow/compiler/mlir/tensorflow/utils/data_dumper_logger_config.h"
 #include "tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.h"
 #include "tensorflow/compiler/mlir/tensorflow/utils/error_util.h"
+#include "tensorflow/compiler/mlir/tf2xla/api/v2/tf_dialect_to_executor.h"
 #include "tensorflow/compiler/mlir/tf2xla/internal/clustering_bridge_passes.h"
 #include "tensorflow/compiler/mlir/tf2xla/internal/inference/inference_passes.h"
 #include "tensorflow/core/framework/metrics.h"
@@ -177,32 +178,36 @@ tensorflow::Status TPUBridge(ModuleOp module, bool fallback_enabled,
       << "TPU Bridge called stack trace is "
       << "(NOTE: this is not an error; rather the stack trace for debugging) : "
       << tensorflow::CurrentStackTrace();
-  Status status = RunTFXLABridge(
+  Status bridge_status = RunTFXLABridge(
       module,
       [module_name](OpPassManager &pm) {
         CreateTPUBridgePipeline(pm, module_name);
-        // Add set of passes to lower back to graph
-        // (from tf_executor). Use graph export
-        // pipline V2 in TPU Bridge.
-        // TODO(hanxiong): Completely replace
-        // AddGraphExportLoweringPasses with
-        // AddGraphExortLoweringPassessV2 in all the
-        // code paths (V1 compat pipeline, CPU/GPU
-        // bridge, etc.)
-        TF::AddGraphExportLoweringPassesV2(pm);
       },
       module_name);
   tensorflow::metrics::UpdateTfMlirBridgeFirstPhaseCounter(
-      "tpu", "v2", fallback_enabled, status.ok() ? "success" : "failure");
+      "tpu", "v2", fallback_enabled,
+      bridge_status.ok() ? "success" : "failure");
   tsl::OkOrSetErrorCounterPayload(
       tensorflow::core::platform::ErrorSourceProto::MLIR_BRIDGE_PHASE_1,
-      status);
-  if (!status.ok()) {
+      bridge_status);
+  if (!bridge_status.ok()) {
     tsl::error_logging::Log(kBridgeComponent, "TFXLA_PHASE_ONE_MLIR_TPU_BRIDGE",
-                            status.ToString())
+                            bridge_status.ToString())
         .IgnoreError();
+    return bridge_status;
   }
-  return status;
+
+  Status export_status =
+      tensorflow::tf2xla::v2::ExportFromTensorflowDialectToExecutor(
+          module, module_name);
+  if (!export_status.ok()) {
+    tsl::error_logging::Log(kBridgeComponent,
+                            "TFXLA_PHASE_ONE_MLIR_TPU_BRIDGE_EXPORT",
+                            export_status.ToString())
+        .IgnoreError();
+  }
+
+  return export_status;
 }
 tensorflow::Status TPUBridgeV1Compat(ModuleOp module, bool fallback_enabled) {
   VLOG(2)
@@ -255,40 +260,6 @@ void AddGraphExportLoweringPasses(OpPassManager &pm) {
   pm.addPass(CreateVerifySuitableForExportPass());
 }
 
-void AddGraphExportLoweringPassesV2(OpPassManager &pm) {
-  pm.addPass(TF::CreateTFRegionControlFlowToFunctional());
-
-  // First, we need to convert from functional, to executor dialect.
-  pm.addNestedPass<func::FuncOp>(
-      CreateFunctionalToExecutorDialectConversionPass());
-
-  // Do a single pass to split the graph's single island op into an island per
-  // op as expected by the following passes.
-  pm.addNestedPass<func::FuncOp>(CreateSplitIntoIslandPerOpPass());
-
-  pm.addNestedPass<func::FuncOp>(TFDevice::CreateReplicateToIslandPass(
-      /*legacy_graph_export=*/false));
-  pm.addNestedPass<func::FuncOp>(
-      TFDevice::CreateReplicaIDToDeviceOrdinalPass());
-  pm.addNestedPass<func::FuncOp>(TFDevice::CreateParallelExecuteToIslandsPass(
-      /*legacy_graph_export=*/false));
-  pm.addNestedPass<func::FuncOp>(TFDevice::CreateLaunchToDeviceAttributePass(
-      /*legacy_graph_export=*/false));
-
-  // Do a single pass to encode necessary control deps in the IR according to
-  // the results of side effect analysis.
-  pm.addPass(tf_executor::CreateTFExecutorUpdateControlDependenciesPass());
-
-  pm.addNestedPass<func::FuncOp>(TFTPU::CreateTPUDevicePropagationPass());
-  pm.addNestedPass<func::FuncOp>(TFTPU::CreateTPUColocateSplitsPass());
-  pm.addPass(createSymbolDCEPass());
-  if (tensorflow::GetMlirCommonFlags()
-          ->tf_mlir_enable_convert_control_to_data_outputs_pass) {
-    pm.addPass(tf_executor::CreateTFExecutorConvertControlToDataOutputsPass());
-  }
-  pm.addPass(CreateVerifySuitableForExportPass());
-}
-
 tensorflow::Status RunBridgeWithStandardPipeline(ModuleOp module,
                                                  bool enable_logging,
                                                  bool enable_inliner) {
@@ -399,8 +370,6 @@ tensorflow::Status RunTFXLABridge(ModuleOp module,
       module,
       [](OpPassManager &pm) {
         CreateTFXLABridgePipeline(pm);
-        // Add set of passes to lower back to graph (from tf_executor).
-        TF::AddGraphExportLoweringPassesV2(pm);
       },
       module_name);
   tensorflow::metrics::UpdateTfMlirBridgeFirstPhaseCounter(
@@ -413,6 +382,19 @@ tensorflow::Status RunTFXLABridge(ModuleOp module,
                             status.ToString())
         .IgnoreError();
+    // Skip the export when the bridge itself failed, mirroring TPUBridge.
+    return status;
   }
+
+  Status export_status =
+      tensorflow::tf2xla::v2::ExportFromTensorflowDialectToExecutor(
+          module, module_name);
+  if (!export_status.ok()) {
+    tsl::error_logging::Log(kBridgeComponent,
+                            "TFXLA_PHASE_ONE_MLIR_CPU_BRIDGE_EXPORT",
+                            export_status.ToString())
+        .IgnoreError();
+  }
+
-  return status;
+  // Propagate export failures to the caller rather than dropping them.
+  return export_status;
 }
 
diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/passes.h b/tensorflow/compiler/mlir/tensorflow/transforms/passes.h
index 2d2a608729344d..5eddff00046b03 100644
--- a/tensorflow/compiler/mlir/tensorflow/transforms/passes.h
+++ b/tensorflow/compiler/mlir/tensorflow/transforms/passes.h
@@ -257,10 +257,6 @@ std::unique_ptr<OperationPass<mlir::ModuleOp>> CreateCrossHostTransferPass();
 // will replicate the tf.Const op once for each device.
 std::unique_ptr<OperationPass<ModuleOp>> CreateConstantOpDeviceAssignmentPass();
 
-// Populates the supplied passmanager with the passes required to export
-// to TensorFlow Graph.
-void AddGraphExportLoweringPassesV2(OpPassManager& pm);
-
 // Populates the supplied passmanager with the passes required to export
 // to TensorFlow Graph.
 // ***This is the legacy graph export pipeline, prefer
diff --git a/tensorflow/compiler/mlir/tf2xla/api/v2/BUILD b/tensorflow/compiler/mlir/tf2xla/api/v2/BUILD
index ce6b1dbd849bab..42fe6d00e5ee33 100644
--- a/tensorflow/compiler/mlir/tf2xla/api/v2/BUILD
+++ b/tensorflow/compiler/mlir/tf2xla/api/v2/BUILD
@@ -13,7 +13,11 @@ package(
 # Please reach out to tf-bridge-team@ before using the TF2XLA bridge.
 package_group(
     name = "tf2xla_users",
-    packages = ["//tensorflow/compiler/mlir/quantization/stablehlo/..."],
+    packages = [
+        "//tensorflow/compiler/mlir/quantization/stablehlo/...",
+        # Legacy due to where the bridge currently runs. This should go away.
+        "//tensorflow/compiler/mlir/tensorflow/transforms",
+    ],
 )
 
 cc_library(

From cde9fbbaa24cd4ec001b2cf27430adb8ef40c3b4 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 3 Oct 2023 12:46:21 -0700
Subject: [PATCH 536/567] Enables soft memory budgets by default (with a
 corresponding coefficient of 1e6).

PiperOrigin-RevId: 570468490
---
 .../xla/hlo/experimental/auto_sharding/auto_sharding_solver.h   | 2 +-
 .../hlo/experimental/auto_sharding/auto_sharding_solver_test.cc | 2 ++
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/third_party/xla/xla/hlo/experimental/auto_sharding/auto_sharding_solver.h b/third_party/xla/xla/hlo/experimental/auto_sharding/auto_sharding_solver.h
index 5c65a087874f91..28477c7b5e7971 100644
--- a/third_party/xla/xla/hlo/experimental/auto_sharding/auto_sharding_solver.h
+++ b/third_party/xla/xla/hlo/experimental/auto_sharding/auto_sharding_solver.h
@@ -52,7 +52,7 @@ struct AutoShardingSolverRequest {
   std::vector<std::vector<double>> v;
   std::vector<std::string> instruction_names;
   std::optional<int64_t> solver_timeout_in_seconds;
-  std::optional<double> overbudget_coeff;
+  std::optional<double> overbudget_coeff = 1e6;
   std::optional<double> makespan_coeff;
   std::optional<double> max_departures;
   bool crash_at_infinity_costs_check = false;
diff --git a/third_party/xla/xla/hlo/experimental/auto_sharding/auto_sharding_solver_test.cc b/third_party/xla/xla/hlo/experimental/auto_sharding/auto_sharding_solver_test.cc
index 3526f0b74e2fbe..41e1c450e93bdc 100644
--- a/third_party/xla/xla/hlo/experimental/auto_sharding/auto_sharding_solver_test.cc
+++ b/third_party/xla/xla/hlo/experimental/auto_sharding/auto_sharding_solver_test.cc
@@ -12,6 +12,7 @@ limitations under the License.
 
 #include "xla/hlo/experimental/auto_sharding/auto_sharding_solver.h"
 
+#include <optional>
 #include <string>
 #include <tuple>
 #include <utility>
@@ -78,6 +79,7 @@ AutoShardingSolverRequest DefaultAutoShardingSolverRequest() {
                 1, 0, 1,
                 1, 1, 0}};
   request.instruction_names = {"A", "B", "C", "D", "E"};
+  request.overbudget_coeff = std::nullopt;
   return request;
 }
 

From 006a8c0dbc9daf49a3109fd6377c92f179089e59 Mon Sep 17 00:00:00 2001
From: Christian Sigg <csigg@google.com>
Date: Tue, 3 Oct 2023 13:15:15 -0700
Subject: [PATCH 537/567] NFC: replace uses of
 TF_DISALLOW_COPY_AND_ASSIGN/SE_DISALLOW_COPY_AND_ASSIGN macros with
 implementation.

PiperOrigin-RevId: 570476254
---
 tensorflow/core/activity_watcher/activity.h   |  3 +-
 tensorflow/core/data/captured_function.cc     |  3 +-
 tensorflow/core/data/captured_function.h      |  6 ++-
 .../core/data/service/dispatcher_impl.h       |  3 +-
 .../core/data/service/grpc_dispatcher_impl.h  |  3 +-
 .../core/data/service/grpc_worker_impl.h      |  3 +-
 tensorflow/core/data/service/task_runner.h    |  7 ++-
 .../core/data/service/thread_safe_buffer.h    |  3 +-
 tensorflow/core/data/service/worker_impl.h    |  3 +-
 .../distributed_runtime/base_rendezvous_mgr.h |  9 ++--
 .../coordination_service_barrier_proxy.h      |  6 ++-
 .../eager/eager_service_impl.h                |  3 +-
 .../core/distributed_runtime/graph_mgr.h      |  3 +-
 .../core/distributed_runtime/local_master.h   |  3 +-
 tensorflow/core/distributed_runtime/master.cc |  3 +-
 tensorflow/core/distributed_runtime/master.h  |  3 +-
 .../distributed_runtime/master_session.cc     |  9 ++--
 .../core/distributed_runtime/master_session.h |  3 +-
 .../core/distributed_runtime/remote_device.cc |  3 +-
 .../rpc/eager/grpc_eager_service_impl.h       |  3 +-
 .../rpc/grpc_master_service.cc                |  3 +-
 .../rpc/grpc_remote_worker.cc                 |  3 +-
 .../distributed_runtime/rpc/grpc_session.h    |  3 +-
 .../distributed_runtime/rpc/grpc_testlib.h    |  3 +-
 .../rpc/grpc_worker_service.cc                |  6 ++-
 .../rpc/rpc_rendezvous_mgr.cc                 |  6 ++-
 .../rpc/rpc_rendezvous_mgr.h                  |  3 +-
 .../core/distributed_runtime/scheduler.h      |  9 ++--
 .../core/distributed_runtime/server_lib.h     |  3 +-
 tensorflow/core/distributed_runtime/worker.h  |  3 +-
 tensorflow/core/framework/collective.h        |  3 +-
 tensorflow/core/framework/dataset.h           |  6 ++-
 tensorflow/core/framework/device.h            |  3 +-
 tensorflow/core/framework/function.h          |  3 +-
 .../core/framework/kernel_def_builder.h       |  3 +-
 tensorflow/core/framework/local_rendezvous.h  |  3 +-
 tensorflow/core/framework/op_kernel.h         |  9 ++--
 tensorflow/core/framework/op_segment.h        |  3 +-
 tensorflow/core/framework/rendezvous.cc       |  3 +-
 tensorflow/core/framework/resource_mgr.h      |  6 ++-
 tensorflow/core/framework/resource_var.h      |  3 +-
 tensorflow/core/framework/shape_inference.h   |  9 ++--
 tensorflow/core/framework/tensor.cc           |  6 ++-
 tensorflow/core/graph/costmodel.h             |  3 +-
 tensorflow/core/graph/edgeset.h               |  3 +-
 tensorflow/core/graph/graph.h                 |  6 ++-
 .../core/graph/graph_debug_info_builder.h     |  3 +-
 .../grappler/optimizers/function_api_info.h   |  6 ++-
 .../grappler/optimizers/function_optimizer.cc |  3 +-
 .../optimizers/graph_optimizer_stage.h        |  3 +-
 .../optimizers/implementation_selector.h      |  3 +-
 tensorflow/core/kernels/argmax_op.cc          |  3 +-
 tensorflow/core/kernels/barrier_ops.cc        | 15 ++++--
 .../adaptive_shared_batch_scheduler.h         |  9 ++--
 .../adaptive_shared_batch_scheduler_test.cc   |  3 +-
 .../batching_util/basic_batch_scheduler.h     |  3 +-
 .../basic_batch_scheduler_benchmark_test.cc   |  3 +-
 .../basic_batch_scheduler_test.cc             |  3 +-
 .../kernels/batching_util/batch_scheduler.h   |  3 +-
 .../batching_util/batch_scheduler_test.cc     |  3 +-
 .../kernels/batching_util/bounded_executor.h  |  3 +-
 .../kernels/batching_util/fake_clock_env.h    |  3 +-
 .../kernels/batching_util/periodic_function.h |  3 +-
 .../serial_device_batch_scheduler.h           |  9 ++--
 .../serial_device_batch_scheduler_test.cc     |  3 +-
 .../batching_util/shared_batch_scheduler.h    |  9 ++--
 .../shared_batch_scheduler_test.cc            |  3 +-
 tensorflow/core/kernels/bcast_ops.cc          |  6 ++-
 tensorflow/core/kernels/cast_op.h             |  3 +-
 tensorflow/core/kernels/collective_ops.cc     | 12 +++--
 .../core/kernels/conditional_accumulator.h    |  3 +-
 .../conditional_accumulator_base_op.cc        | 14 ++++--
 .../kernels/conditional_accumulator_op.cc     | 21 ++++++---
 tensorflow/core/kernels/constant_op.h         |  3 +-
 tensorflow/core/kernels/control_flow_ops.cc   |  3 +-
 tensorflow/core/kernels/control_flow_ops.h    | 21 ++++++---
 .../core/kernels/conv_grad_filter_ops.cc      |  6 ++-
 .../core/kernels/conv_grad_filter_ops_3d.cc   |  6 ++-
 tensorflow/core/kernels/conv_grad_input_ops.h |  6 ++-
 .../core/kernels/conv_grad_input_ops_3d.cc    |  6 ++-
 .../kernels/conv_ops_fused_image_transform.cc |  3 +-
 tensorflow/core/kernels/conv_ops_fused_impl.h |  3 +-
 tensorflow/core/kernels/conv_ops_impl.h       |  6 ++-
 .../core/kernels/conv_ops_using_gemm.cc       |  3 +-
 tensorflow/core/kernels/ctc_decoder_ops.cc    |  9 ++--
 tensorflow/core/kernels/ctc_loss_op.cc        |  6 ++-
 tensorflow/core/kernels/cwise_op_select.cc    |  6 ++-
 .../kernels/data/experimental/lookup_ops.cc   |  3 +-
 .../set_stats_aggregator_dataset_op.cc        |  4 +-
 .../data/experimental/stats_aggregator_ops.cc |  6 ++-
 tensorflow/core/kernels/decode_proto_op.cc    |  3 +-
 .../core/kernels/depthwise_conv_grad_op.cc    |  8 +++-
 tensorflow/core/kernels/depthwise_conv_op.cc  |  3 +-
 tensorflow/core/kernels/edit_distance_op.cc   |  3 +-
 tensorflow/core/kernels/encode_proto_op.cc    |  3 +-
 tensorflow/core/kernels/fifo_queue.h          |  6 ++-
 tensorflow/core/kernels/function_ops.cc       |  3 +-
 tensorflow/core/kernels/function_ops.h        |  9 ++--
 tensorflow/core/kernels/functional_ops.cc     |  3 +-
 tensorflow/core/kernels/gpu_device_array.h    |  3 +-
 tensorflow/core/kernels/gpu_utils.h           |  3 +-
 tensorflow/core/kernels/host_constant_op.h    |  3 +-
 .../kernels/image/extract_image_patches_op.cc |  3 +-
 .../image/extract_volume_patches_op.cc        |  3 +-
 .../core/kernels/immutable_constant_op.cc     |  3 +-
 .../core/kernels/immutable_constant_op.h      |  3 +-
 .../core/kernels/initializable_lookup_table.h |  6 ++-
 .../kernels/linalg/matrix_band_part_op.cc     |  3 +-
 .../core/kernels/linalg/matrix_diag_op.cc     |  6 ++-
 .../kernels/linalg/matrix_exponential_op.cc   |  3 +-
 .../core/kernels/linalg/matrix_inverse_op.cc  |  6 ++-
 .../kernels/linalg/matrix_logarithm_op.cc     |  3 +-
 .../core/kernels/linalg/matrix_set_diag_op.cc |  3 +-
 .../core/kernels/linalg/matrix_solve_op.cc    |  3 +-
 .../kernels/linalg/matrix_square_root_op.cc   |  3 +-
 tensorflow/core/kernels/linalg/qr_op_impl.h   |  6 ++-
 .../linalg/self_adjoint_eig_v2_op_gpu.cc      |  3 +-
 tensorflow/core/kernels/linalg/svd_op_impl.h  |  3 +-
 .../kernels/linalg/tridiagonal_matmul_op.cc   |  3 +-
 .../kernels/linalg/tridiagonal_solve_op.cc    |  3 +-
 .../linalg/tridiagonal_solve_op_gpu.cu.cc     |  3 +-
 .../core/kernels/lookup_table_init_op.cc      |  3 +-
 tensorflow/core/kernels/lookup_table_op.h     |  6 ++-
 tensorflow/core/kernels/lookup_util.cc        |  3 +-
 tensorflow/core/kernels/matmul_op_fused.cc    |  3 +-
 tensorflow/core/kernels/mfcc.h                |  3 +-
 tensorflow/core/kernels/mfcc_dct.h            |  3 +-
 tensorflow/core/kernels/mfcc_mel_filterbank.h |  3 +-
 tensorflow/core/kernels/nccl_ops.cc           |  3 +-
 tensorflow/core/kernels/one_hot_op.cc         |  3 +-
 tensorflow/core/kernels/ops_testutil.h        |  3 +-
 tensorflow/core/kernels/padding_fifo_queue.h  |  3 +-
 .../core/kernels/padding_fifo_queue_op.cc     |  3 +-
 .../parameterized_truncated_normal_op.cc      |  8 +++-
 tensorflow/core/kernels/priority_queue.h      |  3 +-
 tensorflow/core/kernels/priority_queue_op.cc  |  3 +-
 tensorflow/core/kernels/quantization_utils.h  |  6 ++-
 .../kernels/quantized_resize_bilinear_op.cc   |  3 +-
 tensorflow/core/kernels/queue_base.h          |  3 +-
 tensorflow/core/kernels/queue_op.h            | 24 ++++++----
 tensorflow/core/kernels/random_binomial_op.cc |  6 ++-
 .../core/kernels/random_index_shuffle_ops.cc  |  3 +-
 tensorflow/core/kernels/random_op.cc          |  3 +-
 tensorflow/core/kernels/random_poisson_op.cc  |  3 +-
 .../core/kernels/random_shuffle_queue_op.cc   |  6 ++-
 .../core/kernels/regex_full_match_op.cc       |  3 +-
 tensorflow/core/kernels/regex_replace_op.cc   |  3 +-
 .../core/kernels/reverse_sequence_op.cc       |  3 +-
 tensorflow/core/kernels/sdca_internal.h       |  9 ++--
 tensorflow/core/kernels/sendrecv_ops.h        |  6 ++-
 tensorflow/core/kernels/session_ops.cc        |  9 ++--
 tensorflow/core/kernels/sparse/add_op.cc      |  3 +-
 .../kernels/sparse_conditional_accumulator.h  |  3 +-
 .../sparse_conditional_accumulator_op.cc      | 12 +++--
 tensorflow/core/kernels/sparse_matmul_op.cc   |  6 ++-
 tensorflow/core/kernels/spectrogram.h         |  3 +-
 tensorflow/core/kernels/stack.h               |  3 +-
 .../core/kernels/stateless_random_gamma_op.cc |  3 +-
 .../core/kernels/stateless_random_ops.cc      |  3 +-
 .../kernels/string_to_hash_bucket_fast_op.h   |  3 +-
 .../core/kernels/string_to_hash_bucket_op.cc  |  3 +-
 .../core/kernels/string_to_hash_bucket_op.h   |  3 +-
 tensorflow/core/kernels/sync_ops.cc           |  6 ++-
 tensorflow/core/kernels/tensor_array_ops.cc   |  6 ++-
 .../core/kernels/tensor_to_hash_bucket_op.cc  |  3 +-
 tensorflow/core/kernels/tile_ops.cc           |  6 ++-
 tensorflow/core/kernels/variable_ops.h        |  3 +-
 tensorflow/core/kernels/where_op.cc           |  6 ++-
 tensorflow/core/lib/core/arena.h              |  3 +-
 tensorflow/core/lib/db/sqlite.h               | 12 +++--
 tensorflow/core/lib/strings/ordered_code.h    |  3 +-
 tensorflow/core/lib/strings/proto_text_util.h |  3 +-
 tensorflow/core/nccl/nccl_manager.h           |  3 +-
 .../profiler/backends/gpu/cupti_wrapper.h     |  3 +-
 tensorflow/core/profiler/utils/cost_utils.h   |  3 +-
 .../runtime_fallback/test/test_opkernels.cc   | 10 ++--
 tensorflow/core/summary/summary_db_writer.cc  | 15 ++++--
 .../core/tfrt/mlrt/kernel/kernel_test.cc      |  3 +-
 .../core/tpu/kernels/cross_replica_ops.cc     |  9 ++--
 .../core/tpu/kernels/sparse_core_xla_ops.cc   | 47 ++++++++++++++-----
 .../core/tpu/kernels/sparse_core_xla_ops.h    |  4 +-
 .../tpu_compilation_cache_entry_unloader.h    |  4 +-
 .../tpu_embedding_configuration_ops.cc        | 22 ++++++---
 .../tpu/kernels/tpu_embedding_enqueue_ops.cc  |  4 +-
 .../kernels/tpu_embedding_load_retrieve_ops.h |  8 +++-
 .../core/tpu/kernels/tpu_embedding_ops.cc     | 19 +++++---
 tensorflow/core/tpu/kernels/tpu_execute_op.h  |  7 ++-
 .../core/tpu/kernels/tpu_program_group.h      |  3 +-
 .../core/tpu/kernels/xla/host_compute_ops.cc  |  9 ++--
 tensorflow/core/tpu/kernels/xla/infeed_op.cc  |  6 ++-
 .../core/tpu/kernels/xla/outfeed_ops.cc       |  6 ++-
 tensorflow/core/util/bcast.h                  |  6 ++-
 tensorflow/core/util/ctc/ctc_beam_entry.h     |  3 +-
 tensorflow/core/util/ctc/ctc_beam_search.h    |  3 +-
 tensorflow/core/util/cuda_sparse.cc           |  3 +-
 tensorflow/core/util/cuda_sparse.h            | 19 +++++---
 tensorflow/core/util/debug_events_writer.h    |  3 +-
 tensorflow/core/util/events_writer.h          |  3 +-
 tensorflow/core/util/fake_clock_env.h         |  3 +-
 tensorflow/core/util/gpu_solvers.h            |  3 +-
 tensorflow/core/util/guarded_philox_random.h  |  3 +-
 tensorflow/core/util/memmapped_file_system.h  |  3 +-
 .../core/util/memmapped_file_system_writer.h  |  3 +-
 tensorflow/core/util/mkl_util.h               |  3 +-
 tensorflow/core/util/presized_cuckoo_map.h    |  3 +-
 tensorflow/core/util/rocm_sparse.cc           |  3 +-
 .../core/util/tensor_bundle/tensor_bundle.h   |  6 ++-
 tensorflow/core/util/tensor_slice_reader.h    |  3 +-
 tensorflow/core/util/tensor_slice_writer.h    |  3 +-
 209 files changed, 724 insertions(+), 346 deletions(-)

diff --git a/tensorflow/core/activity_watcher/activity.h b/tensorflow/core/activity_watcher/activity.h
index b4b0f48912b100..334a58d45190ba 100644
--- a/tensorflow/core/activity_watcher/activity.h
+++ b/tensorflow/core/activity_watcher/activity.h
@@ -176,7 +176,8 @@ class ActivityScope {
 
  private:
   ActivityId activity_id_;
-  TF_DISALLOW_COPY_AND_ASSIGN(ActivityScope);
+  ActivityScope(const ActivityScope&) = delete;
+  void operator=(const ActivityScope&) = delete;
 };
 
 }  // namespace activity_watcher
diff --git a/tensorflow/core/data/captured_function.cc b/tensorflow/core/data/captured_function.cc
index 8336551138806e..d05afc40135f1b 100644
--- a/tensorflow/core/data/captured_function.cc
+++ b/tensorflow/core/data/captured_function.cc
@@ -301,7 +301,8 @@ class CallFrameBase : public CallFrameInterface {
  private:
   DataTypeSlice ret_types_;
   std::vector<std::optional<Tensor>> retvals_;
-  TF_DISALLOW_COPY_AND_ASSIGN(CallFrameBase);
+  CallFrameBase(const CallFrameBase&) = delete;
+  void operator=(const CallFrameBase&) = delete;
 };
 
 class OwnedArgsCallFrame : public CallFrameBase {
diff --git a/tensorflow/core/data/captured_function.h b/tensorflow/core/data/captured_function.h
index 947ad4c1503073..c3d489dd855263 100644
--- a/tensorflow/core/data/captured_function.h
+++ b/tensorflow/core/data/captured_function.h
@@ -217,7 +217,8 @@ class CapturedFunction {
   const std::shared_ptr<const FunctionMetadata> metadata_;
   const std::vector<Tensor> captured_inputs_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(CapturedFunction);
+  CapturedFunction(const CapturedFunction&) = delete;
+  void operator=(const CapturedFunction&) = delete;
 };
 
 // `InstantiatedCapturedFunction` encapsulates all the runtime support needed
@@ -311,7 +312,8 @@ class InstantiatedCapturedFunction {
   CapturedFunction* const captured_func_;  // Not owned.
   const bool is_multi_device_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(InstantiatedCapturedFunction);
+  InstantiatedCapturedFunction(const InstantiatedCapturedFunction&) = delete;
+  void operator=(const InstantiatedCapturedFunction&) = delete;
 };
 
 }  // namespace data
diff --git a/tensorflow/core/data/service/dispatcher_impl.h b/tensorflow/core/data/service/dispatcher_impl.h
index 6f992130638130..9c07e67d0da036 100644
--- a/tensorflow/core/data/service/dispatcher_impl.h
+++ b/tensorflow/core/data/service/dispatcher_impl.h
@@ -388,7 +388,8 @@ class DataServiceDispatcherImpl {
   std::unique_ptr<Thread> maintenance_thread_;
   MultipleIterationsAutoScaler auto_scaler_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(DataServiceDispatcherImpl);
+  DataServiceDispatcherImpl(const DataServiceDispatcherImpl&) = delete;
+  void operator=(const DataServiceDispatcherImpl&) = delete;
 };
 
 }  // namespace data
diff --git a/tensorflow/core/data/service/grpc_dispatcher_impl.h b/tensorflow/core/data/service/grpc_dispatcher_impl.h
index f8d12dcccdbc0c..6142811b9ed968 100644
--- a/tensorflow/core/data/service/grpc_dispatcher_impl.h
+++ b/tensorflow/core/data/service/grpc_dispatcher_impl.h
@@ -67,7 +67,8 @@ class GrpcDispatcherImpl : public DispatcherService::Service {
  private:
   DataServiceDispatcherImpl impl_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(GrpcDispatcherImpl);
+  GrpcDispatcherImpl(const GrpcDispatcherImpl&) = delete;
+  void operator=(const GrpcDispatcherImpl&) = delete;
 };
 
 }  // namespace data
diff --git a/tensorflow/core/data/service/grpc_worker_impl.h b/tensorflow/core/data/service/grpc_worker_impl.h
index 48774e6c56017c..9969aac48ed910 100644
--- a/tensorflow/core/data/service/grpc_worker_impl.h
+++ b/tensorflow/core/data/service/grpc_worker_impl.h
@@ -70,7 +70,8 @@ class GrpcWorkerImpl : public WorkerService::Service {
   // the servers' methods to avoid RPC calls and data copy.
   std::shared_ptr<DataServiceWorkerImpl> impl_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(GrpcWorkerImpl);
+  GrpcWorkerImpl(const GrpcWorkerImpl&) = delete;
+  void operator=(const GrpcWorkerImpl&) = delete;
 };
 
 }  // namespace data
diff --git a/tensorflow/core/data/service/task_runner.h b/tensorflow/core/data/service/task_runner.h
index e885471fb85e12..a1db121ac602ca 100644
--- a/tensorflow/core/data/service/task_runner.h
+++ b/tensorflow/core/data/service/task_runner.h
@@ -147,7 +147,9 @@ class FirstComeFirstServedTaskRunner : public TaskRunner {
   ThreadSafeBuffer<GetElementResult> buffer_;
   std::unique_ptr<Thread> prefetch_thread_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(FirstComeFirstServedTaskRunner);
+  FirstComeFirstServedTaskRunner(const FirstComeFirstServedTaskRunner&) =
+      delete;
+  void operator=(const FirstComeFirstServedTaskRunner&) = delete;
 };
 
 // A task runner which prefetches elements on a first-come first-served basis
@@ -191,7 +193,8 @@ class CachingTaskRunner : public TaskRunner {
   FirstComeFirstServedTaskRunner fcfs_task_runner_;
   CrossTrainerCache<GetElementResult> cache_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(CachingTaskRunner);
+  CachingTaskRunner(const CachingTaskRunner&) = delete;
+  void operator=(const CachingTaskRunner&) = delete;
 };
 
 // An element produced by a task.
diff --git a/tensorflow/core/data/service/thread_safe_buffer.h b/tensorflow/core/data/service/thread_safe_buffer.h
index 24c2fa9c2a0686..49b9200256e086 100644
--- a/tensorflow/core/data/service/thread_safe_buffer.h
+++ b/tensorflow/core/data/service/thread_safe_buffer.h
@@ -56,7 +56,8 @@ class ThreadSafeBuffer final {
   std::deque<StatusOr<T>> results_ TF_GUARDED_BY(mu_);
   Status status_ TF_GUARDED_BY(mu_) = OkStatus();
 
-  TF_DISALLOW_COPY_AND_ASSIGN(ThreadSafeBuffer);
+  ThreadSafeBuffer(const ThreadSafeBuffer&) = delete;
+  void operator=(const ThreadSafeBuffer&) = delete;
 };
 
 template <class T>
diff --git a/tensorflow/core/data/service/worker_impl.h b/tensorflow/core/data/service/worker_impl.h
index b175cc6f5a49f2..59a55ff8cddd31 100644
--- a/tensorflow/core/data/service/worker_impl.h
+++ b/tensorflow/core/data/service/worker_impl.h
@@ -210,7 +210,8 @@ class DataServiceWorkerImpl {
   // A thread for performing regular heartbeats to the dispatcher.
   std::unique_ptr<Thread> heartbeat_thread_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(DataServiceWorkerImpl);
+  DataServiceWorkerImpl(const DataServiceWorkerImpl&) = delete;
+  void operator=(const DataServiceWorkerImpl&) = delete;
 };
 
 // Local in-process workers shared among clients and servers. If clients and
diff --git a/tensorflow/core/distributed_runtime/base_rendezvous_mgr.h b/tensorflow/core/distributed_runtime/base_rendezvous_mgr.h
index a4a20219aca7c4..3b5c99b16988d3 100644
--- a/tensorflow/core/distributed_runtime/base_rendezvous_mgr.h
+++ b/tensorflow/core/distributed_runtime/base_rendezvous_mgr.h
@@ -110,7 +110,8 @@ class BaseRendezvousMgr : public RendezvousMgrInterface {
 
   tsl::core::RefCountPtr<BaseRemoteRendezvous> FindOrCreate(int64_t step_id);
 
-  TF_DISALLOW_COPY_AND_ASSIGN(BaseRendezvousMgr);
+  BaseRendezvousMgr(const BaseRendezvousMgr&) = delete;
+  void operator=(const BaseRendezvousMgr&) = delete;
 };
 
 // RemoteRendezvous is a Rendezvous which can handle either
@@ -270,7 +271,8 @@ class BaseRemoteRendezvous : public RemoteRendezvous {
   // Must be called only if fully initialized.
   void RecvLocalAsyncInternal(const ParsedKey& parsed, DoneCallback done);
 
-  TF_DISALLOW_COPY_AND_ASSIGN(BaseRemoteRendezvous);
+  BaseRemoteRendezvous(const BaseRemoteRendezvous&) = delete;
+  void operator=(const BaseRemoteRendezvous&) = delete;
 };
 
 class BaseRecvTensorCall {
@@ -285,7 +287,8 @@ class BaseRecvTensorCall {
   virtual Status status() const = 0;
 
  private:
-  TF_DISALLOW_COPY_AND_ASSIGN(BaseRecvTensorCall);
+  BaseRecvTensorCall(const BaseRecvTensorCall&) = delete;
+  void operator=(const BaseRecvTensorCall&) = delete;
 };
 
 }  // end namespace tensorflow
diff --git a/tensorflow/core/distributed_runtime/coordination/coordination_service_barrier_proxy.h b/tensorflow/core/distributed_runtime/coordination/coordination_service_barrier_proxy.h
index 25fdab00d72544..3254a5d3369f1e 100644
--- a/tensorflow/core/distributed_runtime/coordination/coordination_service_barrier_proxy.h
+++ b/tensorflow/core/distributed_runtime/coordination/coordination_service_barrier_proxy.h
@@ -50,7 +50,8 @@ namespace tensorflow {
 //   }
 class BarrierProxy {
  public:
-  TF_DISALLOW_COPY_AND_ASSIGN(BarrierProxy);
+  BarrierProxy(const BarrierProxy&) = delete;
+  void operator=(const BarrierProxy&) = delete;
   // Construct a BarrierProxy connected to the coordination service via `agent`.
   // `tasks` specifies all participating coordinated tasks and
   // `num_local_threads` specifies the number of threads in this task to
@@ -96,7 +97,8 @@ class BarrierProxy {
 //   Status s = barrier_mgr.Wait(agent, task, num_local_threads, key, timeout);
 class BarrierProxyManager {
  public:
-  TF_DISALLOW_COPY_AND_ASSIGN(BarrierProxyManager);
+  BarrierProxyManager(const BarrierProxyManager&) = delete;
+  void operator=(const BarrierProxyManager&) = delete;
   BarrierProxyManager() = default;
   ~BarrierProxyManager() = default;
 
diff --git a/tensorflow/core/distributed_runtime/eager/eager_service_impl.h b/tensorflow/core/distributed_runtime/eager/eager_service_impl.h
index 8092362b144fd9..8239b361b48ef5 100644
--- a/tensorflow/core/distributed_runtime/eager/eager_service_impl.h
+++ b/tensorflow/core/distributed_runtime/eager/eager_service_impl.h
@@ -232,7 +232,8 @@ class EagerServiceImpl {
   condition_variable gc_thread_cv_;
   bool shutting_down_ TF_GUARDED_BY(gc_thread_shutdown_mu_) = false;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(EagerServiceImpl);
+  EagerServiceImpl(const EagerServiceImpl&) = delete;
+  void operator=(const EagerServiceImpl&) = delete;
 };
 
 }  // namespace eager
diff --git a/tensorflow/core/distributed_runtime/graph_mgr.h b/tensorflow/core/distributed_runtime/graph_mgr.h
index c633eb9372172e..d394178f597554 100644
--- a/tensorflow/core/distributed_runtime/graph_mgr.h
+++ b/tensorflow/core/distributed_runtime/graph_mgr.h
@@ -194,7 +194,8 @@ class GraphMgr {
   Status DecorateAndPublishGraphForDebug(const DebugOptions& debug_options,
                                          Graph* graph, Device* device);
 
-  TF_DISALLOW_COPY_AND_ASSIGN(GraphMgr);
+  GraphMgr(const GraphMgr&) = delete;
+  void operator=(const GraphMgr&) = delete;
 };
 
 }  // end namespace tensorflow
diff --git a/tensorflow/core/distributed_runtime/local_master.h b/tensorflow/core/distributed_runtime/local_master.h
index 0fb3cea6c60af1..32b8e7cc065eb9 100644
--- a/tensorflow/core/distributed_runtime/local_master.h
+++ b/tensorflow/core/distributed_runtime/local_master.h
@@ -103,7 +103,8 @@ class LocalMaster : public MasterInterface {
   // objects of this type.
   LocalMaster(Master* master_impl, const int64_t default_timeout_in_ms);
 
-  TF_DISALLOW_COPY_AND_ASSIGN(LocalMaster);
+  LocalMaster(const LocalMaster&) = delete;
+  void operator=(const LocalMaster&) = delete;
 };
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/distributed_runtime/master.cc b/tensorflow/core/distributed_runtime/master.cc
index 94666e2577f099..3992a211ef68b6 100644
--- a/tensorflow/core/distributed_runtime/master.cc
+++ b/tensorflow/core/distributed_runtime/master.cc
@@ -356,7 +356,8 @@ class DeviceFinder {
     return false;
   }
 
-  TF_DISALLOW_COPY_AND_ASSIGN(DeviceFinder);
+  DeviceFinder(const DeviceFinder&) = delete;
+  void operator=(const DeviceFinder&) = delete;
 };
 
 void Master::CreateSession(const CreateSessionRequest* req,
diff --git a/tensorflow/core/distributed_runtime/master.h b/tensorflow/core/distributed_runtime/master.h
index ecb44ddee9cbb0..8cdbe6fccedd0a 100644
--- a/tensorflow/core/distributed_runtime/master.h
+++ b/tensorflow/core/distributed_runtime/master.h
@@ -109,7 +109,8 @@ class Master {
   // on the returned MasterSession if not null.
   MasterSession* FindMasterSession(const string& handle);
 
-  TF_DISALLOW_COPY_AND_ASSIGN(Master);
+  Master(const Master&) = delete;
+  void operator=(const Master&) = delete;
 };
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/distributed_runtime/master_session.cc b/tensorflow/core/distributed_runtime/master_session.cc
index a70816d188a421..e54c577bc012a5 100644
--- a/tensorflow/core/distributed_runtime/master_session.cc
+++ b/tensorflow/core/distributed_runtime/master_session.cc
@@ -336,7 +336,8 @@ class MasterSession::ReffedClientGraph : public core::RefCounted {
   // destructor and does not wait for the rpc completion.
   void DeregisterPartitions();
 
-  TF_DISALLOW_COPY_AND_ASSIGN(ReffedClientGraph);
+  ReffedClientGraph(const ReffedClientGraph&) = delete;
+  void operator=(const ReffedClientGraph&) = delete;
 };
 
 Status MasterSession::ReffedClientGraph::RegisterPartitions(
@@ -610,7 +611,8 @@ class RunManyGraphs {
     status_group_.Update(s);
   }
 
-  TF_DISALLOW_COPY_AND_ASSIGN(RunManyGraphs);
+  RunManyGraphs(const RunManyGraphs&) = delete;
+  void operator=(const RunManyGraphs&) = delete;
 };
 
 Status AddSendFromClientRequest(const RunStepRequestWrapper& client_req,
@@ -919,7 +921,8 @@ class CleanupBroadcastHelper {
   // Callback to be called when all operations complete.
   StatusCallback done_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(CleanupBroadcastHelper);
+  CleanupBroadcastHelper(const CleanupBroadcastHelper&) = delete;
+  void operator=(const CleanupBroadcastHelper&) = delete;
 };
 
 }  // namespace
diff --git a/tensorflow/core/distributed_runtime/master_session.h b/tensorflow/core/distributed_runtime/master_session.h
index 5a3d89d8b3a7a2..ab63dea86842e2 100644
--- a/tensorflow/core/distributed_runtime/master_session.h
+++ b/tensorflow/core/distributed_runtime/master_session.h
@@ -254,7 +254,8 @@ class MasterSession : public core::RefCounted {
       int64_t rcg_execution_count,
       std::unique_ptr<DebuggerStateInterface>* debugger_state);
 
-  TF_DISALLOW_COPY_AND_ASSIGN(MasterSession);
+  MasterSession(const MasterSession&) = delete;
+  void operator=(const MasterSession&) = delete;
 };
 
 }  // end namespace tensorflow
diff --git a/tensorflow/core/distributed_runtime/remote_device.cc b/tensorflow/core/distributed_runtime/remote_device.cc
index 458273de5da81c..a7a56435d829b7 100644
--- a/tensorflow/core/distributed_runtime/remote_device.cc
+++ b/tensorflow/core/distributed_runtime/remote_device.cc
@@ -55,7 +55,8 @@ class RemoteDevice : public Device {
  private:
   const string local_dev_name_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(RemoteDevice);
+  RemoteDevice(const RemoteDevice&) = delete;
+  void operator=(const RemoteDevice&) = delete;
 };
 
 void AsRemoteDevices(
diff --git a/tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_service_impl.h b/tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_service_impl.h
index ab399a365fa20a..d310424c92f588 100644
--- a/tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_service_impl.h
+++ b/tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_service_impl.h
@@ -160,7 +160,8 @@ class GrpcEagerServiceImpl : public tsl::AsyncServiceInterface {
   std::unique_ptr<::grpc::ServerCompletionQueue> cq_;
   grpc::EagerService::AsyncService service_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(GrpcEagerServiceImpl);
+  GrpcEagerServiceImpl(const GrpcEagerServiceImpl&) = delete;
+  void operator=(const GrpcEagerServiceImpl&) = delete;
 };
 
 }  // namespace eager
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_master_service.cc b/tensorflow/core/distributed_runtime/rpc/grpc_master_service.cc
index dfff1bad1dabfc..141dcff71b15e2 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_master_service.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_master_service.cc
@@ -303,7 +303,8 @@ class GrpcMasterService : public tsl::AsyncServiceInterface {
                                  profiler::TraceMeLevel::kInfo);
   }
 
-  TF_DISALLOW_COPY_AND_ASSIGN(GrpcMasterService);
+  GrpcMasterService(const GrpcMasterService&) = delete;
+  void operator=(const GrpcMasterService&) = delete;
 };
 
 tsl::AsyncServiceInterface* NewGrpcMasterService(
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_remote_worker.cc b/tensorflow/core/distributed_runtime/rpc/grpc_remote_worker.cc
index 26fab86cb4428b..05bf31918ef543 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_remote_worker.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_remote_worker.cc
@@ -338,7 +338,8 @@ class GrpcRemoteWorker : public WorkerInterface {
   WorkerCacheLogger* logger_;
   const string target_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(GrpcRemoteWorker);
+  GrpcRemoteWorker(const GrpcRemoteWorker&) = delete;
+  void operator=(const GrpcRemoteWorker&) = delete;
 };
 
 WorkerInterface* NewGrpcRemoteWorker(SharedGrpcChannelPtr channel,
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_session.h b/tensorflow/core/distributed_runtime/rpc/grpc_session.h
index adb78e70d97e52..b0278b855d3b7c 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_session.h
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_session.h
@@ -143,7 +143,8 @@ class GrpcSession : public Session {
   Status CreateImpl(CallOptions* call_options, GraphDef graph);
   Status ExtendImpl(CallOptions* call_options, GraphDef graph);
 
-  TF_DISALLOW_COPY_AND_ASSIGN(GrpcSession);
+  GrpcSession(const GrpcSession&) = delete;
+  void operator=(const GrpcSession&) = delete;
 };
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_testlib.h b/tensorflow/core/distributed_runtime/rpc/grpc_testlib.h
index ba7938b019c0a1..43aa2b38c53611 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_testlib.h
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_testlib.h
@@ -89,7 +89,8 @@ class TestCluster {
   absl::flat_hash_map<std::string, std::vector<std::string>> targets_;
   std::vector<DeviceAttributes> devices_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(TestCluster);
+  TestCluster(const TestCluster&) = delete;
+  void operator=(const TestCluster&) = delete;
 };
 
 }  // end namespace test
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_worker_service.cc b/tensorflow/core/distributed_runtime/rpc/grpc_worker_service.cc
index adc57523caba42..888d13a398a418 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_worker_service.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_worker_service.cc
@@ -361,7 +361,8 @@ class GrpcWorkerServiceThread {
 
   mutex shutdown_mu_;
   bool is_shutdown_ TF_GUARDED_BY(shutdown_mu_);
-  TF_DISALLOW_COPY_AND_ASSIGN(GrpcWorkerServiceThread);
+  GrpcWorkerServiceThread(const GrpcWorkerServiceThread&) = delete;
+  void operator=(const GrpcWorkerServiceThread&) = delete;
 };
 
 class GrpcWorkerService : public tsl::AsyncServiceInterface {
@@ -411,7 +412,8 @@ class GrpcWorkerService : public tsl::AsyncServiceInterface {
   mutex service_shutdown_mu_;
   bool is_shutdown_ TF_GUARDED_BY(service_shutdown_mu_);
 
-  TF_DISALLOW_COPY_AND_ASSIGN(GrpcWorkerService);
+  GrpcWorkerService(const GrpcWorkerService&) = delete;
+  void operator=(const GrpcWorkerService&) = delete;
 };
 
 }  // namespace
diff --git a/tensorflow/core/distributed_runtime/rpc/rpc_rendezvous_mgr.cc b/tensorflow/core/distributed_runtime/rpc/rpc_rendezvous_mgr.cc
index 0b2a974b1e4c60..1bf7183245a8a9 100644
--- a/tensorflow/core/distributed_runtime/rpc/rpc_rendezvous_mgr.cc
+++ b/tensorflow/core/distributed_runtime/rpc/rpc_rendezvous_mgr.cc
@@ -49,7 +49,8 @@ class RpcRemoteRendezvous : public BaseRemoteRendezvous {
  private:
   ~RpcRemoteRendezvous() override {}
 
-  TF_DISALLOW_COPY_AND_ASSIGN(RpcRemoteRendezvous);
+  RpcRemoteRendezvous(const RpcRemoteRendezvous&) = delete;
+  void operator=(const RpcRemoteRendezvous&) = delete;
 };
 
 // Used only to retrieve tensors from remote processes.
@@ -180,7 +181,8 @@ class RpcRecvTensorCall : public BaseRecvTensorCall {
   mutable mutex mu_;
   Status status_ TF_GUARDED_BY(mu_);
 
-  TF_DISALLOW_COPY_AND_ASSIGN(RpcRecvTensorCall);
+  RpcRecvTensorCall(const RpcRecvTensorCall&) = delete;
+  void operator=(const RpcRecvTensorCall&) = delete;
 };
 
 class RpcRecvTensorFreeList {
diff --git a/tensorflow/core/distributed_runtime/rpc/rpc_rendezvous_mgr.h b/tensorflow/core/distributed_runtime/rpc/rpc_rendezvous_mgr.h
index ffc9e7b2dea9fb..42eda4ea71aef7 100644
--- a/tensorflow/core/distributed_runtime/rpc/rpc_rendezvous_mgr.h
+++ b/tensorflow/core/distributed_runtime/rpc/rpc_rendezvous_mgr.h
@@ -51,7 +51,8 @@ class RpcRendezvousMgr : public BaseRendezvousMgr {
       int64_t step_id, const WorkerEnv* worker_env) override;
 
  private:
-  TF_DISALLOW_COPY_AND_ASSIGN(RpcRendezvousMgr);
+  RpcRendezvousMgr(const RpcRendezvousMgr&) = delete;
+  void operator=(const RpcRendezvousMgr&) = delete;
 };
 
 }  // end namespace tensorflow
diff --git a/tensorflow/core/distributed_runtime/scheduler.h b/tensorflow/core/distributed_runtime/scheduler.h
index ae27826392a667..4385db786ff38a 100644
--- a/tensorflow/core/distributed_runtime/scheduler.h
+++ b/tensorflow/core/distributed_runtime/scheduler.h
@@ -49,7 +49,8 @@ class SlackAnalysis {
   const Graph* graph_;
   const CostModel* cost_model_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(SlackAnalysis);
+  SlackAnalysis(const SlackAnalysis&) = delete;
+  void operator=(const SlackAnalysis&) = delete;
 };
 
 class GreedyScheduler {
@@ -87,7 +88,8 @@ class GreedyScheduler {
   std::vector<int64_t>* priority_;
   std::unordered_map<string, Sim*> device_states_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(GreedyScheduler);
+  GreedyScheduler(const GreedyScheduler&) = delete;
+  void operator=(const GreedyScheduler&) = delete;
 };
 
 class PriorityScheduler {
@@ -110,7 +112,8 @@ class PriorityScheduler {
   const CostModel* cost_model_;
   const Graph* graph_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(PriorityScheduler);
+  PriorityScheduler(const PriorityScheduler&) = delete;
+  void operator=(const PriorityScheduler&) = delete;
 };
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/distributed_runtime/server_lib.h b/tensorflow/core/distributed_runtime/server_lib.h
index 7865eb7846dc5c..3a5ca1853889bc 100644
--- a/tensorflow/core/distributed_runtime/server_lib.h
+++ b/tensorflow/core/distributed_runtime/server_lib.h
@@ -86,7 +86,8 @@ class ServerInterface {
   virtual Status StopCoordinationService() = 0;
 
  private:
-  TF_DISALLOW_COPY_AND_ASSIGN(ServerInterface);
+  ServerInterface(const ServerInterface&) = delete;
+  void operator=(const ServerInterface&) = delete;
 };
 
 class ServerFactory {
diff --git a/tensorflow/core/distributed_runtime/worker.h b/tensorflow/core/distributed_runtime/worker.h
index ba013365847d91..780976dcab3b19 100644
--- a/tensorflow/core/distributed_runtime/worker.h
+++ b/tensorflow/core/distributed_runtime/worker.h
@@ -134,7 +134,8 @@ class Worker : public WorkerInterface {
                          MutableRunGraphResponseWrapper* response,
                          StatusCallback done);
 
-  TF_DISALLOW_COPY_AND_ASSIGN(Worker);
+  Worker(const Worker&) = delete;
+  void operator=(const Worker&) = delete;
 };
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/framework/collective.h b/tensorflow/core/framework/collective.h
index 0a0dbf3c39ae84..4b43fb226af248 100644
--- a/tensorflow/core/framework/collective.h
+++ b/tensorflow/core/framework/collective.h
@@ -389,7 +389,8 @@ class CollectiveExecutor : public core::RefCounted {
   static OpKernelContext::Params* CtxParams(OpKernelContext* ctx);
   CollectiveExecutorMgrInterface* cem_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(CollectiveExecutor);
+  CollectiveExecutor(const CollectiveExecutor&) = delete;
+  void operator=(const CollectiveExecutor&) = delete;
 };
 
 struct CollectiveContext {
diff --git a/tensorflow/core/framework/dataset.h b/tensorflow/core/framework/dataset.h
index d8bcc26916f60e..a174234a86e483 100644
--- a/tensorflow/core/framework/dataset.h
+++ b/tensorflow/core/framework/dataset.h
@@ -514,7 +514,8 @@ class MemoryCheckpoint : public IteratorStateWriter {
  private:
   explicit MemoryCheckpoint(std::shared_ptr<IdRegistry> registry, bool is_root)
       : is_root_(is_root), id_registry_(registry) {}
-  TF_DISALLOW_COPY_AND_ASSIGN(MemoryCheckpoint);
+  MemoryCheckpoint(const MemoryCheckpoint&) = delete;
+  void operator=(const MemoryCheckpoint&) = delete;
 
   Status status_ = OkStatus();
   // Only set to true for the checkpoint in IteratorResource.
@@ -618,7 +619,8 @@ class SerializationContext {
  private:
   Params params_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(SerializationContext);
+  SerializationContext(const SerializationContext&) = delete;
+  void operator=(const SerializationContext&) = delete;
 };
 
 // Specifies the tf.data pipeline run mode.
diff --git a/tensorflow/core/framework/device.h b/tensorflow/core/framework/device.h
index 6c2f65bbf72863..a7d5bb260d5369 100644
--- a/tensorflow/core/framework/device.h
+++ b/tensorflow/core/framework/device.h
@@ -209,7 +209,8 @@ class Device : public DeviceBase {
   // Resources associated w/ this device. E.g., shared variables, etc.
   ResourceMgr* rmgr_ = nullptr;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(Device);
+  Device(const Device&) = delete;
+  void operator=(const Device&) = delete;
 };
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/framework/function.h b/tensorflow/core/framework/function.h
index f84cb34ec2265d..6502a413298de8 100644
--- a/tensorflow/core/framework/function.h
+++ b/tensorflow/core/framework/function.h
@@ -351,7 +351,8 @@ class FunctionCallFrame : public CallFrameInterface {
   };
   gtl::InlinedVector<Retval, 4> rets_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(FunctionCallFrame);
+  FunctionCallFrame(const FunctionCallFrame&) = delete;
+  void operator=(const FunctionCallFrame&) = delete;
 };
 
 // Map of function names to StackTracesMaps.
diff --git a/tensorflow/core/framework/kernel_def_builder.h b/tensorflow/core/framework/kernel_def_builder.h
index 4151070f148e20..f4d2220d90261f 100644
--- a/tensorflow/core/framework/kernel_def_builder.h
+++ b/tensorflow/core/framework/kernel_def_builder.h
@@ -86,7 +86,8 @@ class KernelDefBuilder {
  private:
   KernelDef* kernel_def_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(KernelDefBuilder);
+  KernelDefBuilder(const KernelDefBuilder&) = delete;
+  void operator=(const KernelDefBuilder&) = delete;
 };
 
 // IMPLEMENTATION
diff --git a/tensorflow/core/framework/local_rendezvous.h b/tensorflow/core/framework/local_rendezvous.h
index 39c4291c7dcc09..48affe938776f3 100644
--- a/tensorflow/core/framework/local_rendezvous.h
+++ b/tensorflow/core/framework/local_rendezvous.h
@@ -112,7 +112,8 @@ class LocalRendezvous {
   static std::vector<tsl::core::RefCountPtr<Rendezvous> >& aborted_rendezs_
       TF_GUARDED_BY(aborted_rendezs_mu_);
 
-  TF_DISALLOW_COPY_AND_ASSIGN(LocalRendezvous);
+  LocalRendezvous(const LocalRendezvous&) = delete;
+  void operator=(const LocalRendezvous&) = delete;
 };
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/framework/op_kernel.h b/tensorflow/core/framework/op_kernel.h
index 856078887dfcac..a70034b29b0d4b 100644
--- a/tensorflow/core/framework/op_kernel.h
+++ b/tensorflow/core/framework/op_kernel.h
@@ -219,7 +219,8 @@ class OpKernel {
   const bool is_deferred_;
   bool expensive_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(OpKernel);
+  OpKernel(const OpKernel&) = delete;
+  void operator=(const OpKernel&) = delete;
 };
 
 class AsyncOpKernel : public OpKernel {
@@ -365,7 +366,8 @@ class OpKernelConstruction {
   // Allow access from OpKernel ctor.
   friend class OpKernel;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(OpKernelConstruction);
+  OpKernelConstruction(const OpKernelConstruction&) = delete;
+  void operator=(const OpKernelConstruction&) = delete;
 };
 
 // TODO(mrry): Consider converting to a random_access_iterator, and upgrading
@@ -1318,7 +1320,8 @@ class OpKernelContext {
   friend void CheckNotInComputeAsync(OpKernelContext* ctx,
                                      const char* correct_macro_name);
 
-  TF_DISALLOW_COPY_AND_ASSIGN(OpKernelContext);
+  OpKernelContext(const OpKernelContext&) = delete;
+  void operator=(const OpKernelContext&) = delete;
 };
 
 template <>
diff --git a/tensorflow/core/framework/op_segment.h b/tensorflow/core/framework/op_segment.h
index b4ffd936bf3bbc..8cbca548cf02df 100644
--- a/tensorflow/core/framework/op_segment.h
+++ b/tensorflow/core/framework/op_segment.h
@@ -81,7 +81,8 @@ class OpSegment {
   mutable mutex mu_;
   SessionMap sessions_ TF_GUARDED_BY(mu_);
 
-  TF_DISALLOW_COPY_AND_ASSIGN(OpSegment);
+  OpSegment(const OpSegment&) = delete;
+  void operator=(const OpSegment&) = delete;
 };
 
 }  // end namespace tensorflow
diff --git a/tensorflow/core/framework/rendezvous.cc b/tensorflow/core/framework/rendezvous.cc
index 793d6408bcc2f0..efea3e2597c803 100644
--- a/tensorflow/core/framework/rendezvous.cc
+++ b/tensorflow/core/framework/rendezvous.cc
@@ -169,7 +169,8 @@ class LocalRendezvousWrapper : public Rendezvous {
  private:
   LocalRendezvous impl_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(LocalRendezvousWrapper);
+  LocalRendezvousWrapper(const LocalRendezvousWrapper&) = delete;
+  void operator=(const LocalRendezvousWrapper&) = delete;
 };
 }  // namespace
 
diff --git a/tensorflow/core/framework/resource_mgr.h b/tensorflow/core/framework/resource_mgr.h
index fc043dd84bad1d..7a4a1047bc59bb 100644
--- a/tensorflow/core/framework/resource_mgr.h
+++ b/tensorflow/core/framework/resource_mgr.h
@@ -266,7 +266,8 @@ class ResourceMgr {
     core::RefCountPtr<ResourceBase> GetResource() const;
 
    private:
-    TF_DISALLOW_COPY_AND_ASSIGN(ResourceAndName);
+    ResourceAndName(const ResourceAndName&) = delete;
+    void operator=(const ResourceAndName&) = delete;
   };
   typedef absl::flat_hash_map<Key, ResourceAndName, KeyHash, KeyEqual>
       Container;
@@ -322,7 +323,8 @@ class ResourceMgr {
   // Map from type hash_code to type name.
   std::unordered_map<uint64, string> debug_type_names_ TF_GUARDED_BY(mu_);
 
-  TF_DISALLOW_COPY_AND_ASSIGN(ResourceMgr);
+  ResourceMgr(const ResourceMgr&) = delete;
+  void operator=(const ResourceMgr&) = delete;
 };
 
 // Makes a resource handle with the specified type for a given container /
diff --git a/tensorflow/core/framework/resource_var.h b/tensorflow/core/framework/resource_var.h
index 97fe734c3240e7..d74435d33f5406 100644
--- a/tensorflow/core/framework/resource_var.h
+++ b/tensorflow/core/framework/resource_var.h
@@ -116,7 +116,8 @@ class Var : public ResourceBase {
   std::string debug_name_;
 
   ~Var() override {}
-  TF_DISALLOW_COPY_AND_ASSIGN(Var);
+  Var(const Var&) = delete;
+  void operator=(const Var&) = delete;
 };
 
 // Does unlock and unref automatically when going out of scope, and also
diff --git a/tensorflow/core/framework/shape_inference.h b/tensorflow/core/framework/shape_inference.h
index dd84e117ea9306..658bf494a6d56f 100644
--- a/tensorflow/core/framework/shape_inference.h
+++ b/tensorflow/core/framework/shape_inference.h
@@ -123,7 +123,8 @@ class Dimension {
 
   friend class InferenceContext;
   friend class ShapeManager;
-  TF_DISALLOW_COPY_AND_ASSIGN(Dimension);
+  Dimension(const Dimension&) = delete;
+  void operator=(const Dimension&) = delete;
 };
 
 class DimensionHandle {
@@ -163,7 +164,8 @@ class Shape {
   friend class InferenceContext;
   friend class ::tensorflow::grappler::SymbolicShapeManager;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(Shape);
+  Shape(const Shape&) = delete;
+  void operator=(const Shape&) = delete;
 };
 
 class ShapeHandle {
@@ -861,7 +863,8 @@ class InferenceContext {
   std::vector<std::pair<ShapeHandle, ShapeHandle>> merged_shapes_;
   std::vector<std::pair<DimensionHandle, DimensionHandle>> merged_dims_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(InferenceContext);
+  InferenceContext(const InferenceContext&) = delete;
+  void operator=(const InferenceContext&) = delete;
 };
 
 // -----------------------------------------------------------------------------
diff --git a/tensorflow/core/framework/tensor.cc b/tensorflow/core/framework/tensor.cc
index 9922ade0195d1d..010395a9a2c4bd 100644
--- a/tensorflow/core/framework/tensor.cc
+++ b/tensorflow/core/framework/tensor.cc
@@ -155,7 +155,8 @@ class Buffer : public BufferBase {
 
   ~Buffer() override;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(Buffer);
+  Buffer(const Buffer&) = delete;
+  void operator=(const Buffer&) = delete;
 };
 
 void LogUnexpectedSize(int64_t actual, int64_t expected) {
@@ -1065,7 +1066,8 @@ class SubBuffer : public TensorBuffer {
 
   ~SubBuffer() override { root_->Unref(); }
 
-  TF_DISALLOW_COPY_AND_ASSIGN(SubBuffer);
+  SubBuffer(const SubBuffer&) = delete;
+  void operator=(const SubBuffer&) = delete;
 };
 
 Tensor Tensor::Slice(int64_t start, int64_t limit) const {
diff --git a/tensorflow/core/graph/costmodel.h b/tensorflow/core/graph/costmodel.h
index 56d105fcc9f522..7778691701114d 100644
--- a/tensorflow/core/graph/costmodel.h
+++ b/tensorflow/core/graph/costmodel.h
@@ -232,7 +232,8 @@ class CostModel {
 
   TensorShapeProto unknown_shape_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(CostModel);
+  CostModel(const CostModel&) = delete;
+  void operator=(const CostModel&) = delete;
 };
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/graph/edgeset.h b/tensorflow/core/graph/edgeset.h
index f7360b3b7e0fa9..6d6cb3ff630591 100644
--- a/tensorflow/core/graph/edgeset.h
+++ b/tensorflow/core/graph/edgeset.h
@@ -86,7 +86,8 @@ class EdgeSet {
   void RegisterMutation() { mutations_++; }
 #endif
 
-  TF_DISALLOW_COPY_AND_ASSIGN(EdgeSet);
+  EdgeSet(const EdgeSet&) = delete;
+  void operator=(const EdgeSet&) = delete;
 };
 
 class EdgeSet::const_iterator {
diff --git a/tensorflow/core/graph/graph.h b/tensorflow/core/graph/graph.h
index 7a16673c13cedf..85d0be75346447 100644
--- a/tensorflow/core/graph/graph.h
+++ b/tensorflow/core/graph/graph.h
@@ -375,7 +375,8 @@ class Node {
   // this set.)
   WhileContext* while_ctx_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(Node);
+  Node(const Node&) = delete;
+  void operator=(const Node&) = delete;
 };
 
 // Stores debug information associated with the Node.
@@ -918,7 +919,8 @@ class Graph {
   // Indicates the context that this Graph instance is constructed.
   ConstructionContext construction_context_ = ConstructionContext::kNotTracked;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(Graph);
+  Graph(const Graph&) = delete;
+  void operator=(const Graph&) = delete;
 };
 
 // TODO(josh11b): We may want to support keeping an index on various
diff --git a/tensorflow/core/graph/graph_debug_info_builder.h b/tensorflow/core/graph/graph_debug_info_builder.h
index 66299fe91e14ca..3d3f423d5a2a9f 100644
--- a/tensorflow/core/graph/graph_debug_info_builder.h
+++ b/tensorflow/core/graph/graph_debug_info_builder.h
@@ -197,7 +197,8 @@ class GraphDebugInfoBuilder {
   absl::flat_hash_map<StackFrame, int> frame_to_index_;
   int new_name_index_ = 0;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(GraphDebugInfoBuilder);
+  GraphDebugInfoBuilder(const GraphDebugInfoBuilder&) = delete;
+  void operator=(const GraphDebugInfoBuilder&) = delete;
 };
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/grappler/optimizers/function_api_info.h b/tensorflow/core/grappler/optimizers/function_api_info.h
index ffa53a7d8d94e2..c32cd5c9ca4c04 100644
--- a/tensorflow/core/grappler/optimizers/function_api_info.h
+++ b/tensorflow/core/grappler/optimizers/function_api_info.h
@@ -60,7 +60,8 @@ class FunctionApiInfo {
   DataTypeVector input_arg_dtypes_;
   DataTypeVector output_arg_dtypes_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(FunctionApiInfo);
+  FunctionApiInfo(const FunctionApiInfo&) = delete;
+  void operator=(const FunctionApiInfo&) = delete;
 };
 
 // A collection of information for function and the interface it implements.
@@ -95,7 +96,8 @@ class FunctionLibraryApiInfo {
   absl::flat_hash_map<string, std::vector<string>> intf_to_forward_funcs_;
   absl::flat_hash_map<string, std::vector<string>> intf_to_backward_funcs_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(FunctionLibraryApiInfo);
+  FunctionLibraryApiInfo(const FunctionLibraryApiInfo&) = delete;
+  void operator=(const FunctionLibraryApiInfo&) = delete;
 };
 
 }  // end namespace grappler
diff --git a/tensorflow/core/grappler/optimizers/function_optimizer.cc b/tensorflow/core/grappler/optimizers/function_optimizer.cc
index bc6ccbe7bef193..b7fe106848c238 100644
--- a/tensorflow/core/grappler/optimizers/function_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/function_optimizer.cc
@@ -384,7 +384,8 @@ class FunctionOptimizerContext {
   // Use graph view to find active outputs of the function caller nodes.
   GraphView graph_view_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(FunctionOptimizerContext);
+  FunctionOptimizerContext(const FunctionOptimizerContext&) = delete;
+  void operator=(const FunctionOptimizerContext&) = delete;
 };
 
 // Returns a pointer to the called function definition iff the given node is
diff --git a/tensorflow/core/grappler/optimizers/graph_optimizer_stage.h b/tensorflow/core/grappler/optimizers/graph_optimizer_stage.h
index 062e4611011e77..00d4a2109a69e5 100644
--- a/tensorflow/core/grappler/optimizers/graph_optimizer_stage.h
+++ b/tensorflow/core/grappler/optimizers/graph_optimizer_stage.h
@@ -305,7 +305,8 @@ class GraphOptimizerStagePipeline {
   std::vector<std::unique_ptr<GraphOptimizerStage<Result>>> stages_;
   std::function<bool(const Result&)> break_predicate_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(GraphOptimizerStagePipeline);
+  GraphOptimizerStagePipeline(const GraphOptimizerStagePipeline&) = delete;
+  void operator=(const GraphOptimizerStagePipeline&) = delete;
 };
 
 }  // end namespace grappler
diff --git a/tensorflow/core/grappler/optimizers/implementation_selector.h b/tensorflow/core/grappler/optimizers/implementation_selector.h
index 6dbadcc0b17a05..029248ee0c695c 100644
--- a/tensorflow/core/grappler/optimizers/implementation_selector.h
+++ b/tensorflow/core/grappler/optimizers/implementation_selector.h
@@ -189,7 +189,8 @@ class ImplementationSelector : public CustomGraphOptimizer {
 
   std::unique_ptr<FunctionLibraryApiInfo> lib_info_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(ImplementationSelector);
+  ImplementationSelector(const ImplementationSelector&) = delete;
+  void operator=(const ImplementationSelector&) = delete;
 };
 
 }  // namespace grappler
diff --git a/tensorflow/core/kernels/argmax_op.cc b/tensorflow/core/kernels/argmax_op.cc
index 702f903a8e01b8..8c562587c7a591 100644
--- a/tensorflow/core/kernels/argmax_op.cc
+++ b/tensorflow/core/kernels/argmax_op.cc
@@ -111,7 +111,8 @@ class ArgOp : public OpKernel {
 #undef HANDLE_DIM
 
  private:
-  TF_DISALLOW_COPY_AND_ASSIGN(ArgOp);
+  ArgOp(const ArgOp&) = delete;
+  void operator=(const ArgOp&) = delete;
 };
 
 template <typename Device, typename T, typename Tout>
diff --git a/tensorflow/core/kernels/barrier_ops.cc b/tensorflow/core/kernels/barrier_ops.cc
index 2820338c3be77a..9c144faf997cb3 100644
--- a/tensorflow/core/kernels/barrier_ops.cc
+++ b/tensorflow/core/kernels/barrier_ops.cc
@@ -432,7 +432,8 @@ class Barrier : public ResourceBase {
   std::unordered_map<string, TensorTuple> incomplete_ TF_GUARDED_BY(mu_);
   PriorityQueue* ready_queue_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(Barrier);
+  Barrier(const Barrier&) = delete;
+  void operator=(const Barrier&) = delete;
 };
 
 class BarrierOp : public ResourceOpKernel<Barrier> {
@@ -490,7 +491,8 @@ class BarrierOp : public ResourceOpKernel<Barrier> {
   DataTypeVector value_component_types_;
   std::vector<TensorShape> value_component_shapes_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(BarrierOp);
+  BarrierOp(const BarrierOp&) = delete;
+  void operator=(const BarrierOp&) = delete;
 };
 
 REGISTER_KERNEL_BUILDER(Name("Barrier").Device(DEVICE_CPU), BarrierOp);
@@ -549,7 +551,8 @@ class InsertManyOp : public BarrierOpKernel {
 
  private:
   int component_index_;
-  TF_DISALLOW_COPY_AND_ASSIGN(InsertManyOp);
+  InsertManyOp(const InsertManyOp&) = delete;
+  void operator=(const InsertManyOp&) = delete;
 };
 
 #define REGISTER_INSERTMANY(T)                                             \
@@ -619,7 +622,8 @@ class TakeManyOp : public BarrierOpKernel {
  private:
   int64_t timeout_;
   bool allow_small_batch_;
-  TF_DISALLOW_COPY_AND_ASSIGN(TakeManyOp);
+  TakeManyOp(const TakeManyOp&) = delete;
+  void operator=(const TakeManyOp&) = delete;
 };
 
 REGISTER_KERNEL_BUILDER(Name("BarrierTakeMany").Device(DEVICE_CPU), TakeManyOp);
@@ -640,7 +644,8 @@ class BarrierCloseOp : public BarrierOpKernel {
 
  private:
   bool cancel_pending_enqueues_;
-  TF_DISALLOW_COPY_AND_ASSIGN(BarrierCloseOp);
+  BarrierCloseOp(const BarrierCloseOp&) = delete;
+  void operator=(const BarrierCloseOp&) = delete;
 };
 
 REGISTER_KERNEL_BUILDER(Name("BarrierClose").Device(DEVICE_CPU),
diff --git a/tensorflow/core/kernels/batching_util/adaptive_shared_batch_scheduler.h b/tensorflow/core/kernels/batching_util/adaptive_shared_batch_scheduler.h
index 06d4d34e4b9a5f..b9ca84b3bef2b6 100644
--- a/tensorflow/core/kernels/batching_util/adaptive_shared_batch_scheduler.h
+++ b/tensorflow/core/kernels/batching_util/adaptive_shared_batch_scheduler.h
@@ -284,7 +284,8 @@ class AdaptiveSharedBatchScheduler
   // Current adjustment size (as a fraction of in_flight_batches_limit_).
   double step_size_multiplier_ TF_GUARDED_BY(mu_) = kMaxStepSizeMultiplier;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(AdaptiveSharedBatchScheduler);
+  AdaptiveSharedBatchScheduler(const AdaptiveSharedBatchScheduler&) = delete;
+  void operator=(const AdaptiveSharedBatchScheduler&) = delete;
 };
 
 //////////////////////////////////////////////////////////
@@ -336,7 +337,8 @@ class ASBSQueue : public BatchScheduler<TaskType> {
   int64_t num_enqueued_batches_ TF_GUARDED_BY(mu_) = 0;
   int64_t num_enqueued_tasks_ TF_GUARDED_BY(mu_) = 0;
   mutable mutex mu_;
-  TF_DISALLOW_COPY_AND_ASSIGN(ASBSQueue);
+  ASBSQueue(const ASBSQueue&) = delete;
+  void operator=(const ASBSQueue&) = delete;
 };
 
 // Batch which remembers when and by whom it was created.
@@ -365,7 +367,8 @@ class ASBSBatch : public Batch<TaskType> {
   const int64_t creation_time_micros_;
   const int64_t schedulable_time_micros_;
   const uint64 traceme_context_id_;
-  TF_DISALLOW_COPY_AND_ASSIGN(ASBSBatch);
+  ASBSBatch(const ASBSBatch&) = delete;
+  void operator=(const ASBSBatch&) = delete;
 };
 }  // namespace internal
 
diff --git a/tensorflow/core/kernels/batching_util/adaptive_shared_batch_scheduler_test.cc b/tensorflow/core/kernels/batching_util/adaptive_shared_batch_scheduler_test.cc
index 53d5201223ab9b..1cfeb68b028de2 100644
--- a/tensorflow/core/kernels/batching_util/adaptive_shared_batch_scheduler_test.cc
+++ b/tensorflow/core/kernels/batching_util/adaptive_shared_batch_scheduler_test.cc
@@ -39,7 +39,8 @@ class FakeTask : public BatchTask {
  private:
   size_t size_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(FakeTask);
+  FakeTask(const FakeTask&) = delete;
+  void operator=(const FakeTask&) = delete;
 };
 
 // Creates a FakeTask of size 'task_size', and calls 'scheduler->Schedule()' on
diff --git a/tensorflow/core/kernels/batching_util/basic_batch_scheduler.h b/tensorflow/core/kernels/batching_util/basic_batch_scheduler.h
index 1069a911a90004..607a8212175662 100644
--- a/tensorflow/core/kernels/batching_util/basic_batch_scheduler.h
+++ b/tensorflow/core/kernels/batching_util/basic_batch_scheduler.h
@@ -293,7 +293,8 @@ class BasicBatchScheduler : public BatchScheduler<TaskType> {
   // single queue.
   std::unique_ptr<BatchScheduler<TaskType>> shared_scheduler_queue_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(BasicBatchScheduler);
+  BasicBatchScheduler(const BasicBatchScheduler&) = delete;
+  void operator=(const BasicBatchScheduler&) = delete;
 };
 
 //////////
diff --git a/tensorflow/core/kernels/batching_util/basic_batch_scheduler_benchmark_test.cc b/tensorflow/core/kernels/batching_util/basic_batch_scheduler_benchmark_test.cc
index 1d1c01a15bfda5..508cb794e6f5cb 100644
--- a/tensorflow/core/kernels/batching_util/basic_batch_scheduler_benchmark_test.cc
+++ b/tensorflow/core/kernels/batching_util/basic_batch_scheduler_benchmark_test.cc
@@ -57,7 +57,8 @@ class UniformLoadInjector : public LoadInjector {
                   int64_t average_injection_interval_micros) const override;
 
  private:
-  TF_DISALLOW_COPY_AND_ASSIGN(UniformLoadInjector);
+  UniformLoadInjector(const UniformLoadInjector&) = delete;
+  void operator=(const UniformLoadInjector&) = delete;
 };
 
 void UniformLoadInjector::InjectLoad(
diff --git a/tensorflow/core/kernels/batching_util/basic_batch_scheduler_test.cc b/tensorflow/core/kernels/batching_util/basic_batch_scheduler_test.cc
index 494ba0c74c3efc..0b8fac15894e43 100644
--- a/tensorflow/core/kernels/batching_util/basic_batch_scheduler_test.cc
+++ b/tensorflow/core/kernels/batching_util/basic_batch_scheduler_test.cc
@@ -38,7 +38,8 @@ class FakeTask : public BatchTask {
  private:
   const size_t size_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(FakeTask);
+  FakeTask(const FakeTask&) = delete;
+  void operator=(const FakeTask&) = delete;
 };
 
 // Creates a FakeTask of size 'task_size', and calls 'scheduler->Schedule()'
diff --git a/tensorflow/core/kernels/batching_util/batch_scheduler.h b/tensorflow/core/kernels/batching_util/batch_scheduler.h
index ab0924e48cb4e0..2baff4ec9cfa57 100644
--- a/tensorflow/core/kernels/batching_util/batch_scheduler.h
+++ b/tensorflow/core/kernels/batching_util/batch_scheduler.h
@@ -142,7 +142,8 @@ class Batch {
   // The TracMe context id.
   const uint64 traceme_context_id_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(Batch);
+  Batch(const Batch&) = delete;
+  void operator=(const Batch&) = delete;
 };
 
 // An abstract batch scheduler class. Collects individual tasks into batches,
diff --git a/tensorflow/core/kernels/batching_util/batch_scheduler_test.cc b/tensorflow/core/kernels/batching_util/batch_scheduler_test.cc
index 1eab1d3d9a3900..b8d8c1a779df8e 100644
--- a/tensorflow/core/kernels/batching_util/batch_scheduler_test.cc
+++ b/tensorflow/core/kernels/batching_util/batch_scheduler_test.cc
@@ -34,7 +34,8 @@ class FakeTask : public BatchTask {
  private:
   const size_t size_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(FakeTask);
+  FakeTask(const FakeTask&) = delete;
+  void operator=(const FakeTask&) = delete;
 };
 
 TEST(BatchTest, Basic) {
diff --git a/tensorflow/core/kernels/batching_util/bounded_executor.h b/tensorflow/core/kernels/batching_util/bounded_executor.h
index 568c0d24c7678a..7f36fe4d70960a 100644
--- a/tensorflow/core/kernels/batching_util/bounded_executor.h
+++ b/tensorflow/core/kernels/batching_util/bounded_executor.h
@@ -70,7 +70,8 @@ class BoundedExecutor : public thread::ThreadPoolInterface {
 
   // A fixed number of threads.
   std::vector<std::unique_ptr<Thread>> threads_;
-  TF_DISALLOW_COPY_AND_ASSIGN(BoundedExecutor);
+  BoundedExecutor(const BoundedExecutor&) = delete;
+  void operator=(const BoundedExecutor&) = delete;
 };
 
 }  // namespace serving
diff --git a/tensorflow/core/kernels/batching_util/fake_clock_env.h b/tensorflow/core/kernels/batching_util/fake_clock_env.h
index 4b8e97a03f9f20..6fc8d9e5cf40d8 100644
--- a/tensorflow/core/kernels/batching_util/fake_clock_env.h
+++ b/tensorflow/core/kernels/batching_util/fake_clock_env.h
@@ -66,7 +66,8 @@ class FakeClockEnv : public EnvWrapper {
   };
   std::vector<SleepingThread> sleeping_threads_ TF_GUARDED_BY(mu_);
 
-  TF_DISALLOW_COPY_AND_ASSIGN(FakeClockEnv);
+  FakeClockEnv(const FakeClockEnv&) = delete;
+  void operator=(const FakeClockEnv&) = delete;
 };
 
 }  // namespace test_util
diff --git a/tensorflow/core/kernels/batching_util/periodic_function.h b/tensorflow/core/kernels/batching_util/periodic_function.h
index 278bf7668aebf8..278cfac246495d 100644
--- a/tensorflow/core/kernels/batching_util/periodic_function.h
+++ b/tensorflow/core/kernels/batching_util/periodic_function.h
@@ -120,7 +120,8 @@ class PeriodicFunction {
   // Thread for running "function_"
   std::unique_ptr<Thread> thread_ = nullptr;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(PeriodicFunction);
+  PeriodicFunction(const PeriodicFunction&) = delete;
+  void operator=(const PeriodicFunction&) = delete;
 };
 
 }  // namespace serving
diff --git a/tensorflow/core/kernels/batching_util/serial_device_batch_scheduler.h b/tensorflow/core/kernels/batching_util/serial_device_batch_scheduler.h
index f176b7ebf70950..80d24faa794838 100644
--- a/tensorflow/core/kernels/batching_util/serial_device_batch_scheduler.h
+++ b/tensorflow/core/kernels/batching_util/serial_device_batch_scheduler.h
@@ -188,7 +188,8 @@ class SerialDeviceBatchScheduler : public std::enable_shared_from_this<
 
   mutex mu_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(SerialDeviceBatchScheduler);
+  SerialDeviceBatchScheduler(const SerialDeviceBatchScheduler&) = delete;
+  void operator=(const SerialDeviceBatchScheduler&) = delete;
 };
 
 //////////////////////////////////////////////////////////
@@ -233,7 +234,8 @@ class SDBSQueue : public BatchScheduler<TaskType> {
   int64_t num_enqueued_batches_ TF_GUARDED_BY(mu_) = 0;
   int64_t num_enqueued_tasks_ TF_GUARDED_BY(mu_) = 0;
   mutable mutex mu_;
-  TF_DISALLOW_COPY_AND_ASSIGN(SDBSQueue);
+  SDBSQueue(const SDBSQueue&) = delete;
+  void operator=(const SDBSQueue&) = delete;
 };
 
 // Batch which remembers when and by whom it was created.
@@ -252,7 +254,8 @@ class SDBSBatch : public Batch<TaskType> {
  private:
   SDBSQueue<TaskType>* queue_;
   const int64_t creation_time_micros_;
-  TF_DISALLOW_COPY_AND_ASSIGN(SDBSBatch);
+  SDBSBatch(const SDBSBatch&) = delete;
+  void operator=(const SDBSBatch&) = delete;
 };
 }  // namespace internal
 
diff --git a/tensorflow/core/kernels/batching_util/serial_device_batch_scheduler_test.cc b/tensorflow/core/kernels/batching_util/serial_device_batch_scheduler_test.cc
index a91356c0958528..4433f588357237 100644
--- a/tensorflow/core/kernels/batching_util/serial_device_batch_scheduler_test.cc
+++ b/tensorflow/core/kernels/batching_util/serial_device_batch_scheduler_test.cc
@@ -36,7 +36,8 @@ class FakeTask : public BatchTask {
  private:
   const size_t size_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(FakeTask);
+  FakeTask(const FakeTask&) = delete;
+  void operator=(const FakeTask&) = delete;
 };
 
 // Creates a FakeTask of size 'task_size', and calls 'scheduler->Schedule()' on
diff --git a/tensorflow/core/kernels/batching_util/shared_batch_scheduler.h b/tensorflow/core/kernels/batching_util/shared_batch_scheduler.h
index d3805d2a7f3862..2b95b91b8f103e 100644
--- a/tensorflow/core/kernels/batching_util/shared_batch_scheduler.h
+++ b/tensorflow/core/kernels/batching_util/shared_batch_scheduler.h
@@ -293,7 +293,8 @@ class SharedBatchScheduler
   // Threads that process batches obtained from the queues.
   std::vector<std::unique_ptr<PeriodicFunction>> batch_threads_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(SharedBatchScheduler);
+  SharedBatchScheduler(const SharedBatchScheduler&) = delete;
+  void operator=(const SharedBatchScheduler&) = delete;
 };
 
 //////////
@@ -524,7 +525,8 @@ class Queue {
   // 'empty_notification_->Notify()'.
   Notification* empty_notification_ TF_GUARDED_BY(mu_) = nullptr;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(Queue);
+  Queue(const Queue&) = delete;
+  void operator=(const Queue&) = delete;
 };
 
 // A RAII-style object that points to a Queue and implements
@@ -550,7 +552,8 @@ class QueueHandle : public BatchScheduler<TaskType> {
   // least until this class's destructor closes it.
   Queue<TaskType>* queue_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(QueueHandle);
+  QueueHandle(const QueueHandle&) = delete;
+  void operator=(const QueueHandle&) = delete;
 };
 
 }  // namespace internal
diff --git a/tensorflow/core/kernels/batching_util/shared_batch_scheduler_test.cc b/tensorflow/core/kernels/batching_util/shared_batch_scheduler_test.cc
index 4c7aa87a05d25c..04d0b78df1d3ab 100644
--- a/tensorflow/core/kernels/batching_util/shared_batch_scheduler_test.cc
+++ b/tensorflow/core/kernels/batching_util/shared_batch_scheduler_test.cc
@@ -53,7 +53,8 @@ class FakeTask : public BatchTask {
  private:
   const size_t size_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(FakeTask);
+  FakeTask(const FakeTask&) = delete;
+  void operator=(const FakeTask&) = delete;
 };
 
 using Queue = BatchScheduler<FakeTask>;
diff --git a/tensorflow/core/kernels/bcast_ops.cc b/tensorflow/core/kernels/bcast_ops.cc
index f6b300f1ef468f..b60c5dd763923b 100644
--- a/tensorflow/core/kernels/bcast_ops.cc
+++ b/tensorflow/core/kernels/bcast_ops.cc
@@ -63,7 +63,8 @@ class BCastArgsOp : public OpKernel {
     }
   }
 
-  TF_DISALLOW_COPY_AND_ASSIGN(BCastArgsOp);
+  BCastArgsOp(const BCastArgsOp&) = delete;
+  void operator=(const BCastArgsOp&) = delete;
 };
 
 // Given shapes of two tensors, computes the reduction indices for the
@@ -113,7 +114,8 @@ class BCastGradArgsOp : public OpKernel {
     }
   }
 
-  TF_DISALLOW_COPY_AND_ASSIGN(BCastGradArgsOp);
+  BCastGradArgsOp(const BCastGradArgsOp&) = delete;
+  void operator=(const BCastGradArgsOp&) = delete;
 };
 
 REGISTER_KERNEL_BUILDER(Name("BroadcastArgs")
diff --git a/tensorflow/core/kernels/cast_op.h b/tensorflow/core/kernels/cast_op.h
index bfaa93bfe113f6..c0b6eaf084089a 100644
--- a/tensorflow/core/kernels/cast_op.h
+++ b/tensorflow/core/kernels/cast_op.h
@@ -144,7 +144,8 @@ class CastOpBase : public OpKernel {
   CastFunctorType work_ = nullptr;
   Status Unimplemented();
 
-  TF_DISALLOW_COPY_AND_ASSIGN(CastOpBase);
+  CastOpBase(const CastOpBase&) = delete;
+  void operator=(const CastOpBase&) = delete;
 };
 
 // CPU implementation of Cast
diff --git a/tensorflow/core/kernels/collective_ops.cc b/tensorflow/core/kernels/collective_ops.cc
index 1fb1ab7b3a02e8..57815190f4073a 100644
--- a/tensorflow/core/kernels/collective_ops.cc
+++ b/tensorflow/core/kernels/collective_ops.cc
@@ -220,7 +220,8 @@ class CollectiveGatherOpKernel : public CollectiveOpV1Kernel {
   }
 
  private:
-  TF_DISALLOW_COPY_AND_ASSIGN(CollectiveGatherOpKernel);
+  CollectiveGatherOpKernel(const CollectiveGatherOpKernel&) = delete;
+  void operator=(const CollectiveGatherOpKernel&) = delete;
 };
 
 REGISTER_KERNEL_BUILDER(Name("CollectiveGather").Device(DEVICE_CPU),
@@ -330,7 +331,8 @@ class CollectiveReduceOpKernel : public CollectiveOpV1Kernel {
  private:
   std::unique_ptr<OpKernel> merge_op_;
   std::unique_ptr<OpKernel> final_op_;
-  TF_DISALLOW_COPY_AND_ASSIGN(CollectiveReduceOpKernel);
+  CollectiveReduceOpKernel(const CollectiveReduceOpKernel&) = delete;
+  void operator=(const CollectiveReduceOpKernel&) = delete;
 };
 
 REGISTER_KERNEL_BUILDER(Name("CollectiveReduce").Device(DEVICE_CPU),
@@ -407,7 +409,8 @@ class CollectiveBcastSendOpKernel : public CollectiveOpV1Kernel {
   }
 
  private:
-  TF_DISALLOW_COPY_AND_ASSIGN(CollectiveBcastSendOpKernel);
+  CollectiveBcastSendOpKernel(const CollectiveBcastSendOpKernel&) = delete;
+  void operator=(const CollectiveBcastSendOpKernel&) = delete;
 };
 
 REGISTER_KERNEL_BUILDER(Name("CollectiveBcastSend").Device(DEVICE_CPU),
@@ -477,7 +480,8 @@ class CollectiveBcastRecvOpKernel : public CollectiveOpV1Kernel {
   }
 
  private:
-  TF_DISALLOW_COPY_AND_ASSIGN(CollectiveBcastRecvOpKernel);
+  CollectiveBcastRecvOpKernel(const CollectiveBcastRecvOpKernel&) = delete;
+  void operator=(const CollectiveBcastRecvOpKernel&) = delete;
 };
 
 REGISTER_KERNEL_BUILDER(Name("CollectiveBcastRecv").Device(DEVICE_CPU),
diff --git a/tensorflow/core/kernels/conditional_accumulator.h b/tensorflow/core/kernels/conditional_accumulator.h
index 391033a00f9b91..5f901b12d43d58 100644
--- a/tensorflow/core/kernels/conditional_accumulator.h
+++ b/tensorflow/core/kernels/conditional_accumulator.h
@@ -127,7 +127,8 @@ class ConditionalAccumulator
     // do nothing
   }
 
-  TF_DISALLOW_COPY_AND_ASSIGN(ConditionalAccumulator);
+  ConditionalAccumulator(const ConditionalAccumulator&) = delete;
+  void operator=(const ConditionalAccumulator&) = delete;
 };
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/conditional_accumulator_base_op.cc b/tensorflow/core/kernels/conditional_accumulator_base_op.cc
index 9eff0d6ff1ad0b..3b6eb16c7ae02b 100644
--- a/tensorflow/core/kernels/conditional_accumulator_base_op.cc
+++ b/tensorflow/core/kernels/conditional_accumulator_base_op.cc
@@ -55,7 +55,8 @@ class AccumulatorSetGlobalStepOp
   }
 
  private:
-  TF_DISALLOW_COPY_AND_ASSIGN(AccumulatorSetGlobalStepOp);
+  AccumulatorSetGlobalStepOp(const AccumulatorSetGlobalStepOp&) = delete;
+  void operator=(const AccumulatorSetGlobalStepOp&) = delete;
 };
 
 REGISTER_KERNEL_BUILDER(Name("AccumulatorSetGlobalStep").Device(DEVICE_CPU),
@@ -72,7 +73,9 @@ class ResourceAccumulatorSetGlobalStepOp : public AccumulatorSetGlobalStepOp {
   }
 
  private:
-  TF_DISALLOW_COPY_AND_ASSIGN(ResourceAccumulatorSetGlobalStepOp);
+  ResourceAccumulatorSetGlobalStepOp(
+      const ResourceAccumulatorSetGlobalStepOp&) = delete;
+  void operator=(const ResourceAccumulatorSetGlobalStepOp&) = delete;
 };
 
 REGISTER_KERNEL_BUILDER(
@@ -116,7 +119,8 @@ class AccumulatorNumAccumulatedOp
   }
 
  private:
-  TF_DISALLOW_COPY_AND_ASSIGN(AccumulatorNumAccumulatedOp);
+  AccumulatorNumAccumulatedOp(const AccumulatorNumAccumulatedOp&) = delete;
+  void operator=(const AccumulatorNumAccumulatedOp&) = delete;
 };
 
 REGISTER_KERNEL_BUILDER(Name("AccumulatorNumAccumulated").Device(DEVICE_CPU),
@@ -133,7 +137,9 @@ class ResourceAccumulatorNumAccumulatedOp : public AccumulatorNumAccumulatedOp {
   }
 
  private:
-  TF_DISALLOW_COPY_AND_ASSIGN(ResourceAccumulatorNumAccumulatedOp);
+  ResourceAccumulatorNumAccumulatedOp(
+      const ResourceAccumulatorNumAccumulatedOp&) = delete;
+  void operator=(const ResourceAccumulatorNumAccumulatedOp&) = delete;
 };
 
 REGISTER_KERNEL_BUILDER(
diff --git a/tensorflow/core/kernels/conditional_accumulator_op.cc b/tensorflow/core/kernels/conditional_accumulator_op.cc
index e7e8b6202c4614..6df0b7f701e2fd 100644
--- a/tensorflow/core/kernels/conditional_accumulator_op.cc
+++ b/tensorflow/core/kernels/conditional_accumulator_op.cc
@@ -51,7 +51,8 @@ class ConditionalAccumulatorOp : public ConditionalAccumulatorBaseOp {
     ctx->set_output_ref(0, &mu_, &accumulator_);
   }
 
-  TF_DISALLOW_COPY_AND_ASSIGN(ConditionalAccumulatorOp);
+  ConditionalAccumulatorOp(const ConditionalAccumulatorOp&) = delete;
+  void operator=(const ConditionalAccumulatorOp&) = delete;
 };
 
 #define REGISTER_KERNELS(type, dev)                           \
@@ -93,7 +94,9 @@ class ResourceConditionalAccumulatorOp : public ConditionalAccumulatorBaseOp {
                             TypeIndex::Make<ConditionalAccumulatorBase>()));
   }
 
-  TF_DISALLOW_COPY_AND_ASSIGN(ResourceConditionalAccumulatorOp);
+  ResourceConditionalAccumulatorOp(const ResourceConditionalAccumulatorOp&) =
+      delete;
+  void operator=(const ResourceConditionalAccumulatorOp&) = delete;
 };
 
 #define REGISTER_RESOURCE_KERNELS(type, dev)                     \
@@ -141,7 +144,8 @@ class AccumulatorApplyGradientOp
   }
 
  private:
-  TF_DISALLOW_COPY_AND_ASSIGN(AccumulatorApplyGradientOp);
+  AccumulatorApplyGradientOp(const AccumulatorApplyGradientOp&) = delete;
+  void operator=(const AccumulatorApplyGradientOp&) = delete;
 };
 
 REGISTER_KERNEL_BUILDER(Name("AccumulatorApplyGradient").Device(DEVICE_CPU),
@@ -162,7 +166,9 @@ class ResourceAccumulatorApplyGradientOp
   }
 
  private:
-  TF_DISALLOW_COPY_AND_ASSIGN(ResourceAccumulatorApplyGradientOp);
+  ResourceAccumulatorApplyGradientOp(
+      const ResourceAccumulatorApplyGradientOp&) = delete;
+  void operator=(const ResourceAccumulatorApplyGradientOp&) = delete;
 };
 
 REGISTER_KERNEL_BUILDER(
@@ -185,7 +191,8 @@ class AccumulatorTakeGradientOp
   }
 
  private:
-  TF_DISALLOW_COPY_AND_ASSIGN(AccumulatorTakeGradientOp);
+  AccumulatorTakeGradientOp(const AccumulatorTakeGradientOp&) = delete;
+  void operator=(const AccumulatorTakeGradientOp&) = delete;
 };
 REGISTER_KERNEL_BUILDER(Name("AccumulatorTakeGradient").Device(DEVICE_CPU),
                         AccumulatorTakeGradientOp);
@@ -202,7 +209,9 @@ class ResourceAccumulatorTakeGradientOp
   }
 
  private:
-  TF_DISALLOW_COPY_AND_ASSIGN(ResourceAccumulatorTakeGradientOp);
+  ResourceAccumulatorTakeGradientOp(const ResourceAccumulatorTakeGradientOp&) =
+      delete;
+  void operator=(const ResourceAccumulatorTakeGradientOp&) = delete;
 };
 
 REGISTER_KERNEL_BUILDER(
diff --git a/tensorflow/core/kernels/constant_op.h b/tensorflow/core/kernels/constant_op.h
index 672560951f1cfc..32f1ddb7cfdba8 100644
--- a/tensorflow/core/kernels/constant_op.h
+++ b/tensorflow/core/kernels/constant_op.h
@@ -34,7 +34,8 @@ class ConstantOp : public OpKernel {
 
  private:
   Tensor tensor_;
-  TF_DISALLOW_COPY_AND_ASSIGN(ConstantOp);
+  ConstantOp(const ConstantOp&) = delete;
+  void operator=(const ConstantOp&) = delete;
 };
 
 class PlaceholderOp : public OpKernel {
diff --git a/tensorflow/core/kernels/control_flow_ops.cc b/tensorflow/core/kernels/control_flow_ops.cc
index eb574c981f8127..2a063846b64ba5 100644
--- a/tensorflow/core/kernels/control_flow_ops.cc
+++ b/tensorflow/core/kernels/control_flow_ops.cc
@@ -247,7 +247,8 @@ class RefSelectOp : public OpKernel {
 
   ~RefSelectOp() override {}
 
-  TF_DISALLOW_COPY_AND_ASSIGN(RefSelectOp);
+  RefSelectOp(const RefSelectOp&) = delete;
+  void operator=(const RefSelectOp&) = delete;
 
  private:
   int num_ref_inputs_;
diff --git a/tensorflow/core/kernels/control_flow_ops.h b/tensorflow/core/kernels/control_flow_ops.h
index 37d561ca98bff9..138693178cf756 100644
--- a/tensorflow/core/kernels/control_flow_ops.h
+++ b/tensorflow/core/kernels/control_flow_ops.h
@@ -43,7 +43,8 @@ class SwitchOp : public OpKernel {
   bool IsExpensive() override { return false; }
   ~SwitchOp() override {}
 
-  TF_DISALLOW_COPY_AND_ASSIGN(SwitchOp);
+  SwitchOp(const SwitchOp&) = delete;
+  void operator=(const SwitchOp&) = delete;
 };
 
 // An n-way switch op has two inputs and N outputs. It forwards the value of
@@ -58,7 +59,8 @@ class SwitchNOp : public OpKernel {
   bool IsExpensive() override { return false; }
   ~SwitchNOp() override {}
 
-  TF_DISALLOW_COPY_AND_ASSIGN(SwitchNOp);
+  SwitchNOp(const SwitchNOp&) = delete;
+  void operator=(const SwitchNOp&) = delete;
 };
 
 // A merge op has n inputs and two outputs. It forwards the value of the
@@ -71,7 +73,8 @@ class MergeOp : public OpKernel {
   bool IsExpensive() override { return false; }
   ~MergeOp() override {}
 
-  TF_DISALLOW_COPY_AND_ASSIGN(MergeOp);
+  MergeOp(const MergeOp&) = delete;
+  void operator=(const MergeOp&) = delete;
 };
 
 // An enter op has one input and one output. It creates or finds
@@ -84,7 +87,8 @@ class EnterOp : public OpKernel {
   bool IsExpensive() override { return false; }
   ~EnterOp() override {}
 
-  TF_DISALLOW_COPY_AND_ASSIGN(EnterOp);
+  EnterOp(const EnterOp&) = delete;
+  void operator=(const EnterOp&) = delete;
 };
 
 // An exit op has one input and one output. It exits the current
@@ -97,7 +101,8 @@ class ExitOp : public OpKernel {
   bool IsExpensive() override { return false; }
   ~ExitOp() override {}
 
-  TF_DISALLOW_COPY_AND_ASSIGN(ExitOp);
+  ExitOp(const ExitOp&) = delete;
+  void operator=(const ExitOp&) = delete;
 };
 
 // A next_iteration op has one input and one output. It makes its input
@@ -109,7 +114,8 @@ class NextIterationOp : public OpKernel {
   bool IsExpensive() override { return false; }
   ~NextIterationOp() override {}
 
-  TF_DISALLOW_COPY_AND_ASSIGN(NextIterationOp);
+  NextIterationOp(const NextIterationOp&) = delete;
+  void operator=(const NextIterationOp&) = delete;
 };
 
 // A LoopCond op has one input and one output. The input is a boolean
@@ -125,7 +131,8 @@ class LoopCondOp : public OpKernel {
 
   bool IsExpensive() override;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(LoopCondOp);
+  LoopCondOp(const LoopCondOp&) = delete;
+  void operator=(const LoopCondOp&) = delete;
 };
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/conv_grad_filter_ops.cc b/tensorflow/core/kernels/conv_grad_filter_ops.cc
index aa05bf99c7091f..439bd44abc8a49 100644
--- a/tensorflow/core/kernels/conv_grad_filter_ops.cc
+++ b/tensorflow/core/kernels/conv_grad_filter_ops.cc
@@ -220,7 +220,8 @@ class Conv2DBackpropFilterOp : public OpKernel {
   LaunchConv2DBackpropFilterOp<Device, T> launcher_;
   bool cudnn_use_autotune_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(Conv2DBackpropFilterOp);
+  Conv2DBackpropFilterOp(const Conv2DBackpropFilterOp&) = delete;
+  void operator=(const Conv2DBackpropFilterOp&) = delete;
 };
 
 // Based on implementation written by Yangqing Jia (jiayq).
@@ -449,7 +450,8 @@ class Conv2DCustomBackpropFilterOp : public OpKernel {
   std::vector<int64_t> explicit_paddings_;
   TensorFormat data_format_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(Conv2DCustomBackpropFilterOp);
+  Conv2DCustomBackpropFilterOp(const Conv2DCustomBackpropFilterOp&) = delete;
+  void operator=(const Conv2DCustomBackpropFilterOp&) = delete;
 };
 
 #define REGISTER_CPU_KERNELS(T)                                               \
diff --git a/tensorflow/core/kernels/conv_grad_filter_ops_3d.cc b/tensorflow/core/kernels/conv_grad_filter_ops_3d.cc
index 826af7e05f6744..d3cd0a9235ae9d 100644
--- a/tensorflow/core/kernels/conv_grad_filter_ops_3d.cc
+++ b/tensorflow/core/kernels/conv_grad_filter_ops_3d.cc
@@ -253,7 +253,8 @@ class Conv3DBackpropFilterOp : public OpKernel {
   TensorFormat data_format_;
   bool takes_shape_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(Conv3DBackpropFilterOp);
+  Conv3DBackpropFilterOp(const Conv3DBackpropFilterOp&) = delete;
+  void operator=(const Conv3DBackpropFilterOp&) = delete;
 };
 
 // Custom backprop for filter that explicitly does the work sharding and calls
@@ -565,7 +566,8 @@ class Conv3DCustomBackpropFilterOp : public OpKernel {
   TensorFormat data_format_;
   bool takes_shape_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(Conv3DCustomBackpropFilterOp);
+  Conv3DCustomBackpropFilterOp(const Conv3DCustomBackpropFilterOp&) = delete;
+  void operator=(const Conv3DCustomBackpropFilterOp&) = delete;
 };
 
 // Custom backrop input kernel is 30% - 4x faster when compiled with AVX2 than
diff --git a/tensorflow/core/kernels/conv_grad_input_ops.h b/tensorflow/core/kernels/conv_grad_input_ops.h
index a70722b8447b7a..60f35fe6111db6 100644
--- a/tensorflow/core/kernels/conv_grad_input_ops.h
+++ b/tensorflow/core/kernels/conv_grad_input_ops.h
@@ -411,7 +411,8 @@ class Conv2DBackpropInputOp : public OpKernel {
   bool use_cudnn_ = false;
   bool cudnn_use_autotune_ = false;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(Conv2DBackpropInputOp);
+  Conv2DBackpropInputOp(const Conv2DBackpropInputOp&) = delete;
+  void operator=(const Conv2DBackpropInputOp&) = delete;
 };
 
 // Based on implementation written by Yangqing Jia (jiayq).
@@ -690,7 +691,8 @@ class Conv2DCustomBackpropInputOp : public OpKernel {
   std::vector<int64_t> explicit_paddings_;
   TensorFormat data_format_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(Conv2DCustomBackpropInputOp);
+  Conv2DCustomBackpropInputOp(const Conv2DCustomBackpropInputOp&) = delete;
+  void operator=(const Conv2DCustomBackpropInputOp&) = delete;
 };
 
 // TODO(ezhulenev): Add a cost model to switch between custom/Eigen ops.
diff --git a/tensorflow/core/kernels/conv_grad_input_ops_3d.cc b/tensorflow/core/kernels/conv_grad_input_ops_3d.cc
index 007058916bccc9..e77ec97ba59aa6 100644
--- a/tensorflow/core/kernels/conv_grad_input_ops_3d.cc
+++ b/tensorflow/core/kernels/conv_grad_input_ops_3d.cc
@@ -247,7 +247,8 @@ class Conv3DBackpropInputOp : public OpKernel {
   TensorFormat data_format_;
   bool takes_shape_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(Conv3DBackpropInputOp);
+  Conv3DBackpropInputOp(const Conv3DBackpropInputOp&) = delete;
+  void operator=(const Conv3DBackpropInputOp&) = delete;
 };
 
 // Custom backprop for input that explicitly does the work sharding and calls
@@ -598,7 +599,8 @@ class Conv3DCustomBackpropInputOp : public OpKernel {
   TensorFormat data_format_;
   bool takes_shape_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(Conv3DCustomBackpropInputOp);
+  Conv3DCustomBackpropInputOp(const Conv3DCustomBackpropInputOp&) = delete;
+  void operator=(const Conv3DCustomBackpropInputOp&) = delete;
 };
 
 // Custom backrop input kernel is 30% - 4x faster when compiled with AVX2 than
diff --git a/tensorflow/core/kernels/conv_ops_fused_image_transform.cc b/tensorflow/core/kernels/conv_ops_fused_image_transform.cc
index b81c9936bd803a..042ee1dfc3cd93 100644
--- a/tensorflow/core/kernels/conv_ops_fused_image_transform.cc
+++ b/tensorflow/core/kernels/conv_ops_fused_image_transform.cc
@@ -872,7 +872,8 @@ class FusedResizeConv2DUsingGemmOp : public OpKernel {
   bool align_corners_;
   int offset_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(FusedResizeConv2DUsingGemmOp);
+  FusedResizeConv2DUsingGemmOp(const FusedResizeConv2DUsingGemmOp&) = delete;
+  void operator=(const FusedResizeConv2DUsingGemmOp&) = delete;
 };
 
 #define REGISTER_FUSED(T)                                                 \
diff --git a/tensorflow/core/kernels/conv_ops_fused_impl.h b/tensorflow/core/kernels/conv_ops_fused_impl.h
index becffff153160d..1b2f2a6e17d395 100644
--- a/tensorflow/core/kernels/conv_ops_fused_impl.h
+++ b/tensorflow/core/kernels/conv_ops_fused_impl.h
@@ -802,7 +802,8 @@ class FusedConv2DOp : public OpKernel {
   FusedComputationType fused_computation_ = FusedComputationType::kUndefined;
   FusedComputationArgs fused_computation_args_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(FusedConv2DOp);
+  FusedConv2DOp(const FusedConv2DOp&) = delete;
+  void operator=(const FusedConv2DOp&) = delete;
 };
 
 // Registration of the CPU implementations.
diff --git a/tensorflow/core/kernels/conv_ops_impl.h b/tensorflow/core/kernels/conv_ops_impl.h
index 7ef650e45365a9..3d28c6e67800e2 100644
--- a/tensorflow/core/kernels/conv_ops_impl.h
+++ b/tensorflow/core/kernels/conv_ops_impl.h
@@ -550,7 +550,8 @@ class ConvOp : public BinaryOp<T> {
 
   LaunchConvOp<Device, T> launcher_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(ConvOp);
+  ConvOp(const ConvOp&) = delete;
+  void operator=(const ConvOp&) = delete;
 };
 
 template <typename T>
@@ -728,7 +729,8 @@ class Conv2DOp : public BinaryOp<T> {
 
   LaunchConv2DOp<Device, T> launcher_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(Conv2DOp);
+  Conv2DOp(const Conv2DOp&) = delete;
+  void operator=(const Conv2DOp&) = delete;
 };
 extern template struct Conv2DOp<CPUDevice, Eigen::bfloat16>;
 extern template struct Conv2DOp<CPUDevice, Eigen::half>;
diff --git a/tensorflow/core/kernels/conv_ops_using_gemm.cc b/tensorflow/core/kernels/conv_ops_using_gemm.cc
index 7889fe96142bbe..8374935243fec7 100644
--- a/tensorflow/core/kernels/conv_ops_using_gemm.cc
+++ b/tensorflow/core/kernels/conv_ops_using_gemm.cc
@@ -561,7 +561,8 @@ class Conv2DUsingGemmOp : public BinaryOp<T> {
   Padding padding_;
   TensorFormat data_format_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(Conv2DUsingGemmOp);
+  Conv2DUsingGemmOp(const Conv2DUsingGemmOp&) = delete;
+  void operator=(const Conv2DUsingGemmOp&) = delete;
 };
 
 #define REGISTER_CPU(T)                                         \
diff --git a/tensorflow/core/kernels/ctc_decoder_ops.cc b/tensorflow/core/kernels/ctc_decoder_ops.cc
index e0ad0667065f8f..e89c9c8c7c4220 100644
--- a/tensorflow/core/kernels/ctc_decoder_ops.cc
+++ b/tensorflow/core/kernels/ctc_decoder_ops.cc
@@ -179,7 +179,8 @@ class CTCDecodeHelper {
 
  private:
   int top_paths_;
-  TF_DISALLOW_COPY_AND_ASSIGN(CTCDecodeHelper);
+  CTCDecodeHelper(const CTCDecodeHelper&) = delete;
+  void operator=(const CTCDecodeHelper&) = delete;
 };
 
 template <typename T>
@@ -270,7 +271,8 @@ class CTCGreedyDecoderOp : public OpKernel {
   bool merge_repeated_;
   int blank_index_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(CTCGreedyDecoderOp);
+  CTCGreedyDecoderOp(const CTCGreedyDecoderOp&) = delete;
+  void operator=(const CTCGreedyDecoderOp&) = delete;
 };
 
 #define REGISTER_CPU(T)                                                   \
@@ -370,7 +372,8 @@ class CTCBeamSearchDecoderOp : public OpKernel {
   typename ctc::CTCBeamSearchDecoder<T>::DefaultBeamScorer beam_scorer_;
   bool merge_repeated_;
   int beam_width_;
-  TF_DISALLOW_COPY_AND_ASSIGN(CTCBeamSearchDecoderOp<T>);
+  CTCBeamSearchDecoderOp<T>(const CTCBeamSearchDecoderOp<T>&) = delete;
+  void operator=(const CTCBeamSearchDecoderOp<T>&) = delete;
 };
 
 #define REGISTER_CPU(T)                                                       \
diff --git a/tensorflow/core/kernels/ctc_loss_op.cc b/tensorflow/core/kernels/ctc_loss_op.cc
index df934a7e26bc60..7e38ac5fcb0669 100644
--- a/tensorflow/core/kernels/ctc_loss_op.cc
+++ b/tensorflow/core/kernels/ctc_loss_op.cc
@@ -221,7 +221,8 @@ class CTCLossOp : public OpKernel {
   bool ctc_merge_repeated_;
   bool ignore_longer_outputs_than_inputs_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(CTCLossOp<T>);
+  CTCLossOp<T>(const CTCLossOp<T>&) = delete;
+  void operator=(const CTCLossOp<T>&) = delete;
 };
 
 #define REGISTER_CPU(T)                                          \
@@ -371,7 +372,8 @@ class CTCLossOpGPU : public OpKernel {
   }
 
  private:
-  TF_DISALLOW_COPY_AND_ASSIGN(CTCLossOpGPU);
+  CTCLossOpGPU(const CTCLossOpGPU&) = delete;
+  void operator=(const CTCLossOpGPU&) = delete;
 };
 
 REGISTER_KERNEL_BUILDER(Name("CTCLossV2")
diff --git a/tensorflow/core/kernels/cwise_op_select.cc b/tensorflow/core/kernels/cwise_op_select.cc
index c6d26d1477804c..2890409c5a903a 100644
--- a/tensorflow/core/kernels/cwise_op_select.cc
+++ b/tensorflow/core/kernels/cwise_op_select.cc
@@ -135,7 +135,8 @@ class SelectOp : public OpKernel {
   }
 
  private:
-  TF_DISALLOW_COPY_AND_ASSIGN(SelectOp);
+  SelectOp(const SelectOp&) = delete;
+  void operator=(const SelectOp&) = delete;
 };
 template <typename Device, typename T>
 class SelectV2Op : public OpKernel {
@@ -239,7 +240,8 @@ class SelectV2Op : public OpKernel {
   }
 
  private:
-  TF_DISALLOW_COPY_AND_ASSIGN(SelectV2Op);
+  SelectV2Op(const SelectV2Op&) = delete;
+  void operator=(const SelectV2Op&) = delete;
 };
 
 #define REGISTER_SELECT(type)                                        \
diff --git a/tensorflow/core/kernels/data/experimental/lookup_ops.cc b/tensorflow/core/kernels/data/experimental/lookup_ops.cc
index ebbb2dfdeca949..91d6b1dfe03266 100644
--- a/tensorflow/core/kernels/data/experimental/lookup_ops.cc
+++ b/tensorflow/core/kernels/data/experimental/lookup_ops.cc
@@ -199,7 +199,8 @@ class InitializeTableFromDatasetOp : public AsyncOpKernel {
   }
 
  private:
-  TF_DISALLOW_COPY_AND_ASSIGN(InitializeTableFromDatasetOp);
+  InitializeTableFromDatasetOp(const InitializeTableFromDatasetOp&) = delete;
+  void operator=(const InitializeTableFromDatasetOp&) = delete;
 
   data::BackgroundWorker background_worker_;
 };
diff --git a/tensorflow/core/kernels/data/experimental/set_stats_aggregator_dataset_op.cc b/tensorflow/core/kernels/data/experimental/set_stats_aggregator_dataset_op.cc
index 52c8ae601a5a3b..d1f05d8a3bcc2f 100644
--- a/tensorflow/core/kernels/data/experimental/set_stats_aggregator_dataset_op.cc
+++ b/tensorflow/core/kernels/data/experimental/set_stats_aggregator_dataset_op.cc
@@ -75,7 +75,9 @@ class StatsAggregatorWithTagAndPrefix : public StatsAggregator {
   std::shared_ptr<StatsAggregator> wrapped_;
   string tag_;
   string prefix_;
-  TF_DISALLOW_COPY_AND_ASSIGN(StatsAggregatorWithTagAndPrefix);
+  StatsAggregatorWithTagAndPrefix(const StatsAggregatorWithTagAndPrefix&) =
+      delete;
+  void operator=(const StatsAggregatorWithTagAndPrefix&) = delete;
 };
 
 class SetStatsAggregatorDatasetOp : public UnaryDatasetOpKernel {
diff --git a/tensorflow/core/kernels/data/experimental/stats_aggregator_ops.cc b/tensorflow/core/kernels/data/experimental/stats_aggregator_ops.cc
index b61d5b21677c9d..6404c0fc52a200 100644
--- a/tensorflow/core/kernels/data/experimental/stats_aggregator_ops.cc
+++ b/tensorflow/core/kernels/data/experimental/stats_aggregator_ops.cc
@@ -110,7 +110,8 @@ class StatsAggregatorImpl : public StatsAggregator {
   std::unordered_map<string, histogram::Histogram> histograms_
       TF_GUARDED_BY(mu_);
   std::unordered_map<string, float> scalars_ TF_GUARDED_BY(mu_);
-  TF_DISALLOW_COPY_AND_ASSIGN(StatsAggregatorImpl);
+  StatsAggregatorImpl(const StatsAggregatorImpl&) = delete;
+  void operator=(const StatsAggregatorImpl&) = delete;
 };
 
 class StatsAggregatorHandleOp
@@ -234,7 +235,8 @@ class StatsAggregatorImplV2 : public StatsAggregator {
   // context
   std::unordered_map<string, histogram::Histogram> histograms_
       TF_GUARDED_BY(mu_);
-  TF_DISALLOW_COPY_AND_ASSIGN(StatsAggregatorImplV2);
+  StatsAggregatorImplV2(const StatsAggregatorImplV2&) = delete;
+  void operator=(const StatsAggregatorImplV2&) = delete;
 };
 
 class StatsAggregatorHandleOpV2
diff --git a/tensorflow/core/kernels/decode_proto_op.cc b/tensorflow/core/kernels/decode_proto_op.cc
index 65cc5f1a0b3a04..6257fef46da3ff 100644
--- a/tensorflow/core/kernels/decode_proto_op.cc
+++ b/tensorflow/core/kernels/decode_proto_op.cc
@@ -1099,7 +1099,8 @@ class DecodeProtoOp : public OpKernel {
   // security review.
   bool sanitize_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(DecodeProtoOp);
+  DecodeProtoOp(const DecodeProtoOp&) = delete;
+  void operator=(const DecodeProtoOp&) = delete;
 };
 
 REGISTER_KERNEL_BUILDER(Name("DecodeProtoV2").Device(DEVICE_CPU),
diff --git a/tensorflow/core/kernels/depthwise_conv_grad_op.cc b/tensorflow/core/kernels/depthwise_conv_grad_op.cc
index b16458aa8052ca..78ca7948e55c0f 100644
--- a/tensorflow/core/kernels/depthwise_conv_grad_op.cc
+++ b/tensorflow/core/kernels/depthwise_conv_grad_op.cc
@@ -706,7 +706,9 @@ class DepthwiseConv2dNativeBackpropInputOp : public OpKernel {
   bool cudnn_use_autotune_;
   DataType dtype_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(DepthwiseConv2dNativeBackpropInputOp);
+  DepthwiseConv2dNativeBackpropInputOp(
+      const DepthwiseConv2dNativeBackpropInputOp&) = delete;
+  void operator=(const DepthwiseConv2dNativeBackpropInputOp&) = delete;
 };
 
 #define REGISTER_CPU_KERNEL(T)                                       \
@@ -1258,7 +1260,9 @@ class DepthwiseConv2dNativeBackpropFilterOp : public OpKernel {
   bool cudnn_use_autotune_;
   DataType dtype_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(DepthwiseConv2dNativeBackpropFilterOp);
+  DepthwiseConv2dNativeBackpropFilterOp(
+      const DepthwiseConv2dNativeBackpropFilterOp&) = delete;
+  void operator=(const DepthwiseConv2dNativeBackpropFilterOp&) = delete;
 };
 
 #define REGISTER_CPU_KERNEL(T)                    \
diff --git a/tensorflow/core/kernels/depthwise_conv_op.cc b/tensorflow/core/kernels/depthwise_conv_op.cc
index a636759602d92d..0708e70481c594 100644
--- a/tensorflow/core/kernels/depthwise_conv_op.cc
+++ b/tensorflow/core/kernels/depthwise_conv_op.cc
@@ -521,7 +521,8 @@ class DepthwiseConv2dNativeOp : public BinaryOp<T> {
   bool cudnn_use_autotune_;
   DataType dtype_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(DepthwiseConv2dNativeOp);
+  DepthwiseConv2dNativeOp(const DepthwiseConv2dNativeOp&) = delete;
+  void operator=(const DepthwiseConv2dNativeOp&) = delete;
 };
 
 #define REGISTER_CPU_KERNEL(T)                                                 \
diff --git a/tensorflow/core/kernels/edit_distance_op.cc b/tensorflow/core/kernels/edit_distance_op.cc
index e6f0d99c497ff1..9e2b430dbc03d6 100644
--- a/tensorflow/core/kernels/edit_distance_op.cc
+++ b/tensorflow/core/kernels/edit_distance_op.cc
@@ -280,7 +280,8 @@ class EditDistanceOp : public OpKernel {
  private:
   bool normalize_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(EditDistanceOp);
+  EditDistanceOp(const EditDistanceOp&) = delete;
+  void operator=(const EditDistanceOp&) = delete;
 };
 
 #define REGISTER_CPU_KERNEL(T)                                        \
diff --git a/tensorflow/core/kernels/encode_proto_op.cc b/tensorflow/core/kernels/encode_proto_op.cc
index 5696f7d17fc05b..d585dff1f9e90f 100644
--- a/tensorflow/core/kernels/encode_proto_op.cc
+++ b/tensorflow/core/kernels/encode_proto_op.cc
@@ -636,7 +636,8 @@ class EncodeProtoOp : public OpKernel {
   // order of writing.
   std::vector<int> sorted_field_index_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(EncodeProtoOp);
+  EncodeProtoOp(const EncodeProtoOp&) = delete;
+  void operator=(const EncodeProtoOp&) = delete;
 };
 
 REGISTER_KERNEL_BUILDER(Name("EncodeProto").Device(DEVICE_CPU), EncodeProtoOp);
diff --git a/tensorflow/core/kernels/fifo_queue.h b/tensorflow/core/kernels/fifo_queue.h
index cc4d4048296164..c6faff14962b31 100644
--- a/tensorflow/core/kernels/fifo_queue.h
+++ b/tensorflow/core/kernels/fifo_queue.h
@@ -67,7 +67,8 @@ class FIFOQueue : public TypedQueue<std::deque<Tensor> > {
                                              Tensor* out_tensor);
 
  private:
-  TF_DISALLOW_COPY_AND_ASSIGN(FIFOQueue);
+  FIFOQueue(const FIFOQueue&) = delete;
+  void operator=(const FIFOQueue&) = delete;
 };
 
 // Defines a FIFOQueueOp, which produces a Queue (specifically, one
@@ -83,7 +84,8 @@ class FIFOQueueOp : public TypedQueueOp {
       TF_EXCLUSIVE_LOCKS_REQUIRED(mu_);
 
   std::vector<TensorShape> component_shapes_;
-  TF_DISALLOW_COPY_AND_ASSIGN(FIFOQueueOp);
+  FIFOQueueOp(const FIFOQueueOp&) = delete;
+  void operator=(const FIFOQueueOp&) = delete;
 };
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/function_ops.cc b/tensorflow/core/kernels/function_ops.cc
index 62ddcb5bb16647..3ce1145493731b 100644
--- a/tensorflow/core/kernels/function_ops.cc
+++ b/tensorflow/core/kernels/function_ops.cc
@@ -275,7 +275,8 @@ class SymbolicGradientOp : public AsyncOpKernel {
   }
 
  private:
-  TF_DISALLOW_COPY_AND_ASSIGN(SymbolicGradientOp);
+  SymbolicGradientOp(const SymbolicGradientOp&) = delete;
+  void operator=(const SymbolicGradientOp&) = delete;
 };
 
 REGISTER_KERNEL_BUILDER(Name(kGradientOp).Device(DEVICE_CPU),
diff --git a/tensorflow/core/kernels/function_ops.h b/tensorflow/core/kernels/function_ops.h
index 5ed117cee916d6..552e1e6c515e3b 100644
--- a/tensorflow/core/kernels/function_ops.h
+++ b/tensorflow/core/kernels/function_ops.h
@@ -39,7 +39,8 @@ class ArgOp : public OpKernel {
   int index_;
   DataType dtype_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(ArgOp);
+  ArgOp(const ArgOp&) = delete;
+  void operator=(const ArgOp&) = delete;
 };
 
 class RetvalOp : public OpKernel {
@@ -54,7 +55,8 @@ class RetvalOp : public OpKernel {
   int index_;
   DataType dtype_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(RetvalOp);
+  RetvalOp(const RetvalOp&) = delete;
+  void operator=(const RetvalOp&) = delete;
 };
 
 class RemoteCallOp : public AsyncOpKernel {
@@ -81,7 +83,8 @@ class RemoteCallOp : public AsyncOpKernel {
   std::map<FunctionTarget, FunctionLibraryRuntime::Handle> handle_cache_
       TF_GUARDED_BY(mu_);
 
-  TF_DISALLOW_COPY_AND_ASSIGN(RemoteCallOp);
+  RemoteCallOp(const RemoteCallOp&) = delete;
+  void operator=(const RemoteCallOp&) = delete;
 };
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/functional_ops.cc b/tensorflow/core/kernels/functional_ops.cc
index dc507d8c3c70c4..dd098b01bc65d2 100644
--- a/tensorflow/core/kernels/functional_ops.cc
+++ b/tensorflow/core/kernels/functional_ops.cc
@@ -594,7 +594,8 @@ class WhileOp : public AsyncOpKernel {
     std::vector<Tensor>* const retvals_;  // Not owned.
     DataTypeSlice ret_types_;
 
-    TF_DISALLOW_COPY_AND_ASSIGN(BodyFuncCallFrame);
+    BodyFuncCallFrame(const BodyFuncCallFrame&) = delete;
+    void operator=(const BodyFuncCallFrame&) = delete;
   };
 
   class State {
diff --git a/tensorflow/core/kernels/gpu_device_array.h b/tensorflow/core/kernels/gpu_device_array.h
index 777b27f5921c8b..fe3e455d96717d 100644
--- a/tensorflow/core/kernels/gpu_device_array.h
+++ b/tensorflow/core/kernels/gpu_device_array.h
@@ -114,7 +114,8 @@ class GpuDeviceArrayOnHost {
   Tensor out_of_line_values_on_host_;
   Tensor out_of_line_values_on_gpu_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(GpuDeviceArrayOnHost);
+  GpuDeviceArrayOnHost(const GpuDeviceArrayOnHost&) = delete;
+  void operator=(const GpuDeviceArrayOnHost&) = delete;
 };
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/gpu_utils.h b/tensorflow/core/kernels/gpu_utils.h
index e50bea5f886f94..96af46697a859b 100644
--- a/tensorflow/core/kernels/gpu_utils.h
+++ b/tensorflow/core/kernels/gpu_utils.h
@@ -238,7 +238,8 @@ class AutotuneMap {
   int32 max_autotune_global_count_;
   int32 autotune_global_count_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(AutotuneMap);
+  AutotuneMap(const AutotuneMap&) = delete;
+  void operator=(const AutotuneMap&) = delete;
 };
 
 // A Singleton helper that manages the global autotune results by groups.
diff --git a/tensorflow/core/kernels/host_constant_op.h b/tensorflow/core/kernels/host_constant_op.h
index 6d02fb3949b65a..9ba151ba71b3de 100644
--- a/tensorflow/core/kernels/host_constant_op.h
+++ b/tensorflow/core/kernels/host_constant_op.h
@@ -35,7 +35,8 @@ class _HostConstantOp : public OpKernel {
 
  private:
   Tensor tensor_;
-  TF_DISALLOW_COPY_AND_ASSIGN(_HostConstantOp);
+  _HostConstantOp(const _HostConstantOp&) = delete;
+  void operator=(const _HostConstantOp&) = delete;
 };
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/image/extract_image_patches_op.cc b/tensorflow/core/kernels/image/extract_image_patches_op.cc
index bae07354f3bdec..a1dbcd9efa3650 100644
--- a/tensorflow/core/kernels/image/extract_image_patches_op.cc
+++ b/tensorflow/core/kernels/image/extract_image_patches_op.cc
@@ -120,7 +120,8 @@ class ExtractImagePatchesOp : public UnaryOp<T> {
 
   Padding padding_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(ExtractImagePatchesOp);
+  ExtractImagePatchesOp(const ExtractImagePatchesOp&) = delete;
+  void operator=(const ExtractImagePatchesOp&) = delete;
 };
 
 // Registration of the CPU implementations.
diff --git a/tensorflow/core/kernels/image/extract_volume_patches_op.cc b/tensorflow/core/kernels/image/extract_volume_patches_op.cc
index cfa2f534e49a5c..1edcf5cad22b2f 100644
--- a/tensorflow/core/kernels/image/extract_volume_patches_op.cc
+++ b/tensorflow/core/kernels/image/extract_volume_patches_op.cc
@@ -149,7 +149,8 @@ class ExtractVolumePatchesOp : public UnaryOp<T> {
 
   Padding padding_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(ExtractVolumePatchesOp);
+  ExtractVolumePatchesOp(const ExtractVolumePatchesOp&) = delete;
+  void operator=(const ExtractVolumePatchesOp&) = delete;
 };
 
 // Registration of the CPU implementations.
diff --git a/tensorflow/core/kernels/immutable_constant_op.cc b/tensorflow/core/kernels/immutable_constant_op.cc
index e4c2e8dbe64ba1..c496f8036de8da 100644
--- a/tensorflow/core/kernels/immutable_constant_op.cc
+++ b/tensorflow/core/kernels/immutable_constant_op.cc
@@ -79,7 +79,8 @@ class MemmappedTensorAllocator : public Allocator {
   // de-allocation.
   bool delete_on_deallocate_ = false;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(MemmappedTensorAllocator);
+  MemmappedTensorAllocator(const MemmappedTensorAllocator&) = delete;
+  void operator=(const MemmappedTensorAllocator&) = delete;
 };
 }  // namespace
 
diff --git a/tensorflow/core/kernels/immutable_constant_op.h b/tensorflow/core/kernels/immutable_constant_op.h
index f16d0e30a97251..264abc8401b3b4 100644
--- a/tensorflow/core/kernels/immutable_constant_op.h
+++ b/tensorflow/core/kernels/immutable_constant_op.h
@@ -41,7 +41,8 @@ class ImmutableConstantOp : public OpKernel {
   string region_name_;
   DataType dtype_;
   TensorShape shape_;
-  TF_DISALLOW_COPY_AND_ASSIGN(ImmutableConstantOp);
+  ImmutableConstantOp(const ImmutableConstantOp&) = delete;
+  void operator=(const ImmutableConstantOp&) = delete;
 };
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/initializable_lookup_table.h b/tensorflow/core/kernels/initializable_lookup_table.h
index e4805aa40a68e3..674352278ef568 100644
--- a/tensorflow/core/kernels/initializable_lookup_table.h
+++ b/tensorflow/core/kernels/initializable_lookup_table.h
@@ -134,7 +134,8 @@ class InitializableLookupTable : public LookupInterface {
     virtual int64_t total_size() const = 0;
 
    private:
-    TF_DISALLOW_COPY_AND_ASSIGN(InitTableIterator);
+    InitTableIterator(const InitTableIterator&) = delete;
+    void operator=(const InitTableIterator&) = delete;
   };
 
   InitializableLookupTable* GetInitializableLookupTable() override {
@@ -254,7 +255,8 @@ class KeyValueTensorIterator
   }
 
  private:
-  TF_DISALLOW_COPY_AND_ASSIGN(KeyValueTensorIterator);
+  KeyValueTensorIterator(const KeyValueTensorIterator&) = delete;
+  void operator=(const KeyValueTensorIterator&) = delete;
 
   const Tensor* keys_;    // Doesn't own it.
   const Tensor* values_;  // Doesn't own it.
diff --git a/tensorflow/core/kernels/linalg/matrix_band_part_op.cc b/tensorflow/core/kernels/linalg/matrix_band_part_op.cc
index 8911629010fe6f..97a390b869cd88 100644
--- a/tensorflow/core/kernels/linalg/matrix_band_part_op.cc
+++ b/tensorflow/core/kernels/linalg/matrix_band_part_op.cc
@@ -107,7 +107,8 @@ class MatrixBandPartOp : public OpKernel {
   }
 
  private:
-  TF_DISALLOW_COPY_AND_ASSIGN(MatrixBandPartOp);
+  MatrixBandPartOp(const MatrixBandPartOp&) = delete;
+  void operator=(const MatrixBandPartOp&) = delete;
 };
 
 #define REGISTER_MATRIX_BAND_PART(type)                                    \
diff --git a/tensorflow/core/kernels/linalg/matrix_diag_op.cc b/tensorflow/core/kernels/linalg/matrix_diag_op.cc
index c485044e56be60..89dc4f06ea111a 100644
--- a/tensorflow/core/kernels/linalg/matrix_diag_op.cc
+++ b/tensorflow/core/kernels/linalg/matrix_diag_op.cc
@@ -153,7 +153,8 @@ class MatrixDiagPartOp : public OpKernel {
   bool left_align_superdiagonal_ = true;
   bool left_align_subdiagonal_ = true;
   static constexpr int kNumV1Inputs = 1;
-  TF_DISALLOW_COPY_AND_ASSIGN(MatrixDiagPartOp);
+  MatrixDiagPartOp(const MatrixDiagPartOp&) = delete;
+  void operator=(const MatrixDiagPartOp&) = delete;
 };
 
 template <typename Device, typename T>
@@ -290,7 +291,8 @@ class MatrixDiagOp : public OpKernel {
   bool left_align_superdiagonal_ = true;
   bool left_align_subdiagonal_ = true;
   static constexpr int kNumV1Inputs = 1;
-  TF_DISALLOW_COPY_AND_ASSIGN(MatrixDiagOp);
+  MatrixDiagOp(const MatrixDiagOp&) = delete;
+  void operator=(const MatrixDiagOp&) = delete;
 };
 
 #define REGISTER_MATRIX_DIAG(type)                                           \
diff --git a/tensorflow/core/kernels/linalg/matrix_exponential_op.cc b/tensorflow/core/kernels/linalg/matrix_exponential_op.cc
index 3e28f0ad66dfc0..cbb356b41a90df 100644
--- a/tensorflow/core/kernels/linalg/matrix_exponential_op.cc
+++ b/tensorflow/core/kernels/linalg/matrix_exponential_op.cc
@@ -46,7 +46,8 @@ class MatrixExponentialOp : public LinearAlgebraOp<Scalar> {
   }
 
  private:
-  TF_DISALLOW_COPY_AND_ASSIGN(MatrixExponentialOp);
+  MatrixExponentialOp(const MatrixExponentialOp&) = delete;
+  void operator=(const MatrixExponentialOp&) = delete;
 };
 
 // Deprecated kernels (2018/08/21).
diff --git a/tensorflow/core/kernels/linalg/matrix_inverse_op.cc b/tensorflow/core/kernels/linalg/matrix_inverse_op.cc
index 9371d01353818d..e3e434cc3b6896 100644
--- a/tensorflow/core/kernels/linalg/matrix_inverse_op.cc
+++ b/tensorflow/core/kernels/linalg/matrix_inverse_op.cc
@@ -80,7 +80,8 @@ class MatrixInverseOp : public LinearAlgebraOp<Scalar> {
  private:
   bool adjoint_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(MatrixInverseOp);
+  MatrixInverseOp(const MatrixInverseOp&) = delete;
+  void operator=(const MatrixInverseOp&) = delete;
 };
 
 // For Eigen::half, compute inverse via float32 - otherwise precision is
@@ -122,7 +123,8 @@ class MatrixInverseOp<Eigen::half> : public LinearAlgebraOp<Eigen::half> {
  private:
   bool adjoint_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(MatrixInverseOp);
+  MatrixInverseOp(const MatrixInverseOp&) = delete;
+  void operator=(const MatrixInverseOp&) = delete;
 };
 
 #if GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/linalg/matrix_logarithm_op.cc b/tensorflow/core/kernels/linalg/matrix_logarithm_op.cc
index b1d09d48d205f5..9dd75f98eead0b 100644
--- a/tensorflow/core/kernels/linalg/matrix_logarithm_op.cc
+++ b/tensorflow/core/kernels/linalg/matrix_logarithm_op.cc
@@ -46,7 +46,8 @@ class MatrixLogarithmOp : public LinearAlgebraOp<Scalar> {
   }
 
  private:
-  TF_DISALLOW_COPY_AND_ASSIGN(MatrixLogarithmOp);
+  MatrixLogarithmOp(const MatrixLogarithmOp&) = delete;
+  void operator=(const MatrixLogarithmOp&) = delete;
 };
 
 // For real-valued matrices, this Op would return the real part of the matrix
diff --git a/tensorflow/core/kernels/linalg/matrix_set_diag_op.cc b/tensorflow/core/kernels/linalg/matrix_set_diag_op.cc
index 04310f228eca97..f3f9f4ee767b0d 100644
--- a/tensorflow/core/kernels/linalg/matrix_set_diag_op.cc
+++ b/tensorflow/core/kernels/linalg/matrix_set_diag_op.cc
@@ -175,7 +175,8 @@ class MatrixSetDiagOp : public OpKernel {
   bool left_align_superdiagonal_ = true;
   bool left_align_subdiagonal_ = true;
   static constexpr int kNumV1Inputs = 2;
-  TF_DISALLOW_COPY_AND_ASSIGN(MatrixSetDiagOp);
+  MatrixSetDiagOp(const MatrixSetDiagOp&) = delete;
+  void operator=(const MatrixSetDiagOp&) = delete;
 };
 
 #define REGISTER_MATRIX_SET_DIAG(type)                                      \
diff --git a/tensorflow/core/kernels/linalg/matrix_solve_op.cc b/tensorflow/core/kernels/linalg/matrix_solve_op.cc
index 49b36db6809c84..ba0e203a8ea724 100644
--- a/tensorflow/core/kernels/linalg/matrix_solve_op.cc
+++ b/tensorflow/core/kernels/linalg/matrix_solve_op.cc
@@ -111,7 +111,8 @@ class MatrixSolveOp : public LinearAlgebraOp<Scalar> {
  private:
   bool adjoint_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(MatrixSolveOp);
+  MatrixSolveOp(const MatrixSolveOp&) = delete;
+  void operator=(const MatrixSolveOp&) = delete;
 };
 
 #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
diff --git a/tensorflow/core/kernels/linalg/matrix_square_root_op.cc b/tensorflow/core/kernels/linalg/matrix_square_root_op.cc
index 1ee76ed9460f3e..b9fdf85387de52 100644
--- a/tensorflow/core/kernels/linalg/matrix_square_root_op.cc
+++ b/tensorflow/core/kernels/linalg/matrix_square_root_op.cc
@@ -46,7 +46,8 @@ class MatrixSquareRootOp : public LinearAlgebraOp<Scalar> {
   }
 
  private:
-  TF_DISALLOW_COPY_AND_ASSIGN(MatrixSquareRootOp);
+  MatrixSquareRootOp(const MatrixSquareRootOp&) = delete;
+  void operator=(const MatrixSquareRootOp&) = delete;
 };
 
 REGISTER_LINALG_OP("MatrixSquareRoot", (MatrixSquareRootOp<float>), float);
diff --git a/tensorflow/core/kernels/linalg/qr_op_impl.h b/tensorflow/core/kernels/linalg/qr_op_impl.h
index c46bb1ae36338a..c5a1823f2f69e5 100644
--- a/tensorflow/core/kernels/linalg/qr_op_impl.h
+++ b/tensorflow/core/kernels/linalg/qr_op_impl.h
@@ -122,7 +122,8 @@ class QrOp : public LinearAlgebraOp<Scalar> {
  private:
   bool full_matrices_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(QrOp);
+  QrOp(const QrOp&) = delete;
+  void operator=(const QrOp&) = delete;
 };
 
 #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
@@ -306,7 +307,8 @@ class QrOpGpu : public AsyncOpKernel {
  private:
   bool full_matrices_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(QrOpGpu);
+  QrOpGpu(const QrOpGpu&) = delete;
+  void operator=(const QrOpGpu&) = delete;
 };
 
 #endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/linalg/self_adjoint_eig_v2_op_gpu.cc b/tensorflow/core/kernels/linalg/self_adjoint_eig_v2_op_gpu.cc
index f3ca4d2f8d27e2..1f3d90d0d55073 100644
--- a/tensorflow/core/kernels/linalg/self_adjoint_eig_v2_op_gpu.cc
+++ b/tensorflow/core/kernels/linalg/self_adjoint_eig_v2_op_gpu.cc
@@ -167,7 +167,8 @@ class SelfAdjointEigV2OpGpu : public AsyncOpKernel {
  private:
   bool compute_v_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(SelfAdjointEigV2OpGpu);
+  SelfAdjointEigV2OpGpu(const SelfAdjointEigV2OpGpu&) = delete;
+  void operator=(const SelfAdjointEigV2OpGpu&) = delete;
 };
 
 #define REGISTER(Scalar)                                                       \
diff --git a/tensorflow/core/kernels/linalg/svd_op_impl.h b/tensorflow/core/kernels/linalg/svd_op_impl.h
index 602f7bae5edf40..4e6745855e47ff 100644
--- a/tensorflow/core/kernels/linalg/svd_op_impl.h
+++ b/tensorflow/core/kernels/linalg/svd_op_impl.h
@@ -126,7 +126,8 @@ class SvdOp : public LinearAlgebraOp<Scalar> {
   bool compute_uv_;
   bool full_matrices_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(SvdOp);
+  SvdOp(const SvdOp&) = delete;
+  void operator=(const SvdOp&) = delete;
 };
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/linalg/tridiagonal_matmul_op.cc b/tensorflow/core/kernels/linalg/tridiagonal_matmul_op.cc
index 5295ff007fe621..30c75a1d304e56 100644
--- a/tensorflow/core/kernels/linalg/tridiagonal_matmul_op.cc
+++ b/tensorflow/core/kernels/linalg/tridiagonal_matmul_op.cc
@@ -120,7 +120,8 @@ class TridiagonalMatMulOp : public LinearAlgebraOp<Scalar> {
   }
 
  private:
-  TF_DISALLOW_COPY_AND_ASSIGN(TridiagonalMatMulOp);
+  TridiagonalMatMulOp(const TridiagonalMatMulOp&) = delete;
+  void operator=(const TridiagonalMatMulOp&) = delete;
 };
 
 REGISTER_LINALG_OP_CPU("TridiagonalMatMul", (TridiagonalMatMulOp<float>),
diff --git a/tensorflow/core/kernels/linalg/tridiagonal_solve_op.cc b/tensorflow/core/kernels/linalg/tridiagonal_solve_op.cc
index c98a5a27442bf8..d307dd365d7ef0 100644
--- a/tensorflow/core/kernels/linalg/tridiagonal_solve_op.cc
+++ b/tensorflow/core/kernels/linalg/tridiagonal_solve_op.cc
@@ -152,7 +152,8 @@ class TridiagonalSolveOp : public LinearAlgebraOp<Scalar> {
   }
 
  private:
-  TF_DISALLOW_COPY_AND_ASSIGN(TridiagonalSolveOp);
+  TridiagonalSolveOp(const TridiagonalSolveOp&) = delete;
+  void operator=(const TridiagonalSolveOp&) = delete;
 
   // Adjust pivot such that neither 'rhs[i,:] / pivot' nor '1 / pivot' cause
   // overflow, where i numerates the multiple right-hand-sides. During the
diff --git a/tensorflow/core/kernels/linalg/tridiagonal_solve_op_gpu.cu.cc b/tensorflow/core/kernels/linalg/tridiagonal_solve_op_gpu.cu.cc
index 198c7713eebf41..beadb173231e4d 100644
--- a/tensorflow/core/kernels/linalg/tridiagonal_solve_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/linalg/tridiagonal_solve_op_gpu.cu.cc
@@ -278,7 +278,8 @@ class TridiagonalSolveOpGpu : public OpKernel {
   }
 
  private:
-  TF_DISALLOW_COPY_AND_ASSIGN(TridiagonalSolveOpGpu);
+  TridiagonalSolveOpGpu(const TridiagonalSolveOpGpu&) = delete;
+  void operator=(const TridiagonalSolveOpGpu&) = delete;
 
   void ComputeWithGtsvBatched(OpKernelContext* context, const Tensor& lhs,
                               const Tensor& rhs, const int batch_size) {
diff --git a/tensorflow/core/kernels/lookup_table_init_op.cc b/tensorflow/core/kernels/lookup_table_init_op.cc
index 1285b17db6ece7..1a80ba41958eef 100644
--- a/tensorflow/core/kernels/lookup_table_init_op.cc
+++ b/tensorflow/core/kernels/lookup_table_init_op.cc
@@ -189,7 +189,8 @@ class InitializeTableFromTextFileOp : public OpKernel {
   int64_t value_index_;
   int64_t offset_ = 0;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(InitializeTableFromTextFileOp);
+  InitializeTableFromTextFileOp(const InitializeTableFromTextFileOp&) = delete;
+  void operator=(const InitializeTableFromTextFileOp&) = delete;
 };
 
 REGISTER_KERNEL_BUILDER(Name("InitializeTableFromTextFile").Device(DEVICE_CPU),
diff --git a/tensorflow/core/kernels/lookup_table_op.h b/tensorflow/core/kernels/lookup_table_op.h
index 92fcf7361e5dd5..97d5f404d7c1df 100644
--- a/tensorflow/core/kernels/lookup_table_op.h
+++ b/tensorflow/core/kernels/lookup_table_op.h
@@ -130,7 +130,8 @@ class LookupTableOp : public OpKernel {
   ContainerInfo cinfo_;
   bool use_node_name_sharing_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(LookupTableOp);
+  LookupTableOp(const LookupTableOp&) = delete;
+  void operator=(const LookupTableOp&) = delete;
 };
 
 // An anonymous version of LookupTableOp, which creates a new table resource
@@ -164,7 +165,8 @@ class AnonymousLookupTableOp : public OpKernel {
   }
 
  private:
-  TF_DISALLOW_COPY_AND_ASSIGN(AnonymousLookupTableOp);
+  AnonymousLookupTableOp(const AnonymousLookupTableOp&) = delete;
+  void operator=(const AnonymousLookupTableOp&) = delete;
 };
 
 namespace lookup {
diff --git a/tensorflow/core/kernels/lookup_util.cc b/tensorflow/core/kernels/lookup_util.cc
index 018f8cdfea7a6c..2ea77327c7df3b 100644
--- a/tensorflow/core/kernels/lookup_util.cc
+++ b/tensorflow/core/kernels/lookup_util.cc
@@ -262,7 +262,8 @@ class TextFileLineIterator
     return OkStatus();
   }
 
-  TF_DISALLOW_COPY_AND_ASSIGN(TextFileLineIterator);
+  TextFileLineIterator(const TextFileLineIterator&) = delete;
+  void operator=(const TextFileLineIterator&) = delete;
 };
 
 Status GetTableHandle(StringPiece input_name, OpKernelContext* ctx,
diff --git a/tensorflow/core/kernels/matmul_op_fused.cc b/tensorflow/core/kernels/matmul_op_fused.cc
index 44d648480ab6c0..5f0460c0945284 100644
--- a/tensorflow/core/kernels/matmul_op_fused.cc
+++ b/tensorflow/core/kernels/matmul_op_fused.cc
@@ -676,7 +676,8 @@ class FusedMatMulOp : public OpKernel {
   FusedComputationType fused_computation_ = FusedComputationType::kUndefined;
   FusedComputationArgs fused_computation_args_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(FusedMatMulOp);
+  FusedMatMulOp(const FusedMatMulOp&) = delete;
+  void operator=(const FusedMatMulOp&) = delete;
 };
 
 // Registration of the CPU implementations.
diff --git a/tensorflow/core/kernels/mfcc.h b/tensorflow/core/kernels/mfcc.h
index 8eee76f7f0cada..790b5f7b1518bd 100644
--- a/tensorflow/core/kernels/mfcc.h
+++ b/tensorflow/core/kernels/mfcc.h
@@ -68,7 +68,8 @@ class Mfcc {
   double upper_frequency_limit_;
   int filterbank_channel_count_;
   int dct_coefficient_count_;
-  TF_DISALLOW_COPY_AND_ASSIGN(Mfcc);
+  Mfcc(const Mfcc&) = delete;
+  void operator=(const Mfcc&) = delete;
 };
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/mfcc_dct.h b/tensorflow/core/kernels/mfcc_dct.h
index 888b8e8df8c450..e7982d6ac88403 100644
--- a/tensorflow/core/kernels/mfcc_dct.h
+++ b/tensorflow/core/kernels/mfcc_dct.h
@@ -36,7 +36,8 @@ class MfccDct {
   int coefficient_count_;
   int input_length_;
   std::vector<std::vector<double> > cosines_;
-  TF_DISALLOW_COPY_AND_ASSIGN(MfccDct);
+  MfccDct(const MfccDct&) = delete;
+  void operator=(const MfccDct&) = delete;
 };
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/mfcc_mel_filterbank.h b/tensorflow/core/kernels/mfcc_mel_filterbank.h
index 37c3936e80d893..293d77457872cd 100644
--- a/tensorflow/core/kernels/mfcc_mel_filterbank.h
+++ b/tensorflow/core/kernels/mfcc_mel_filterbank.h
@@ -56,7 +56,8 @@ class MfccMelFilterbank {
   int start_index_;  // Lowest FFT bin used to calculate mel spectrum.
   int end_index_;    // Highest FFT bin used to calculate mel spectrum.
 
-  TF_DISALLOW_COPY_AND_ASSIGN(MfccMelFilterbank);
+  MfccMelFilterbank(const MfccMelFilterbank&) = delete;
+  void operator=(const MfccMelFilterbank&) = delete;
 };
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/nccl_ops.cc b/tensorflow/core/kernels/nccl_ops.cc
index b8ade086a8f854..7a6c85c9bd1f8d 100644
--- a/tensorflow/core/kernels/nccl_ops.cc
+++ b/tensorflow/core/kernels/nccl_ops.cc
@@ -63,7 +63,8 @@ class NcclAsyncOpBase : public AsyncOpKernel {
   int num_devices_;
   string collective_prefix_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(NcclAsyncOpBase);
+  NcclAsyncOpBase(const NcclAsyncOpBase&) = delete;
+  void operator=(const NcclAsyncOpBase&) = delete;
 };
 
 class NcclReduceOpBase : public NcclAsyncOpBase {
diff --git a/tensorflow/core/kernels/one_hot_op.cc b/tensorflow/core/kernels/one_hot_op.cc
index c588a80e8d4f09..1a7ef6a9a46d0f 100644
--- a/tensorflow/core/kernels/one_hot_op.cc
+++ b/tensorflow/core/kernels/one_hot_op.cc
@@ -124,7 +124,8 @@ class OneHotOp : public OpKernel {
  private:
   int32 axis_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(OneHotOp);
+  OneHotOp(const OneHotOp&) = delete;
+  void operator=(const OneHotOp&) = delete;
 };
 
 #define REGISTER_ONE_HOT_INDEX(type, index_type)                \
diff --git a/tensorflow/core/kernels/ops_testutil.h b/tensorflow/core/kernels/ops_testutil.h
index 12153c0bc15c73..0a94bc4291920c 100644
--- a/tensorflow/core/kernels/ops_testutil.h
+++ b/tensorflow/core/kernels/ops_testutil.h
@@ -203,7 +203,8 @@ class OpsTestBase : public ::testing::Test {
   SessionMetadata session_metadata_;
 
  private:
-  TF_DISALLOW_COPY_AND_ASSIGN(OpsTestBase);
+  OpsTestBase(const OpsTestBase&) = delete;
+  void operator=(const OpsTestBase&) = delete;
 };
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/padding_fifo_queue.h b/tensorflow/core/kernels/padding_fifo_queue.h
index ee24f7690031d3..939e04dd5e769b 100644
--- a/tensorflow/core/kernels/padding_fifo_queue.h
+++ b/tensorflow/core/kernels/padding_fifo_queue.h
@@ -81,7 +81,8 @@ class PaddingFIFOQueue : public FIFOQueue {
   static Status IsSameSizeExceptZerosInFirst(const TensorShape& first,
                                              const TensorShape& second);
 
-  TF_DISALLOW_COPY_AND_ASSIGN(PaddingFIFOQueue);
+  PaddingFIFOQueue(const PaddingFIFOQueue&) = delete;
+  void operator=(const PaddingFIFOQueue&) = delete;
 };
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/padding_fifo_queue_op.cc b/tensorflow/core/kernels/padding_fifo_queue_op.cc
index c92cd732d5b7f8..177aaf223d7764 100644
--- a/tensorflow/core/kernels/padding_fifo_queue_op.cc
+++ b/tensorflow/core/kernels/padding_fifo_queue_op.cc
@@ -62,7 +62,8 @@ class PaddingFIFOQueueOp : public TypedQueueOp {
 
   std::vector<PartialTensorShape> component_shapes_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(PaddingFIFOQueueOp);
+  PaddingFIFOQueueOp(const PaddingFIFOQueueOp&) = delete;
+  void operator=(const PaddingFIFOQueueOp&) = delete;
 };
 
 REGISTER_KERNEL_BUILDER(Name("PaddingFIFOQueue").Device(DEVICE_CPU),
diff --git a/tensorflow/core/kernels/parameterized_truncated_normal_op.cc b/tensorflow/core/kernels/parameterized_truncated_normal_op.cc
index a007d37c4e290a..782c22c5efd43c 100644
--- a/tensorflow/core/kernels/parameterized_truncated_normal_op.cc
+++ b/tensorflow/core/kernels/parameterized_truncated_normal_op.cc
@@ -726,7 +726,9 @@ class ParameterizedTruncatedNormalOp : public OpKernel {
  private:
   GuardedPhiloxRandom generator_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(ParameterizedTruncatedNormalOp);
+  ParameterizedTruncatedNormalOp(const ParameterizedTruncatedNormalOp&) =
+      delete;
+  void operator=(const ParameterizedTruncatedNormalOp&) = delete;
 };
 
 // Samples from a truncated normal distribution, using the given parameters.
@@ -815,7 +817,9 @@ class StatelessParameterizedTruncatedNormal : public OpKernel {
   }
 
  private:
-  TF_DISALLOW_COPY_AND_ASSIGN(StatelessParameterizedTruncatedNormal);
+  StatelessParameterizedTruncatedNormal(
+      const StatelessParameterizedTruncatedNormal&) = delete;
+  void operator=(const StatelessParameterizedTruncatedNormal&) = delete;
 };
 
 }  // namespace
diff --git a/tensorflow/core/kernels/priority_queue.h b/tensorflow/core/kernels/priority_queue.h
index 21cec9d4261198..c571846a37d118 100644
--- a/tensorflow/core/kernels/priority_queue.h
+++ b/tensorflow/core/kernels/priority_queue.h
@@ -85,7 +85,8 @@ class PriorityQueue
                                              OpKernelContext* ctx,
                                              Tensor* out_element);
 
-  TF_DISALLOW_COPY_AND_ASSIGN(PriorityQueue);
+  PriorityQueue(const PriorityQueue&) = delete;
+  void operator=(const PriorityQueue&) = delete;
 };
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/priority_queue_op.cc b/tensorflow/core/kernels/priority_queue_op.cc
index 3449621b62e422..2b304694870000 100644
--- a/tensorflow/core/kernels/priority_queue_op.cc
+++ b/tensorflow/core/kernels/priority_queue_op.cc
@@ -59,7 +59,8 @@ class PriorityQueueOp : public TypedQueueOp {
 
   std::vector<TensorShape> component_shapes_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(PriorityQueueOp);
+  PriorityQueueOp(const PriorityQueueOp&) = delete;
+  void operator=(const PriorityQueueOp&) = delete;
 };
 
 REGISTER_KERNEL_BUILDER(Name("PriorityQueue").Device(DEVICE_CPU),
diff --git a/tensorflow/core/kernels/quantization_utils.h b/tensorflow/core/kernels/quantization_utils.h
index 80c7a338bb26e2..88bee91121641a 100644
--- a/tensorflow/core/kernels/quantization_utils.h
+++ b/tensorflow/core/kernels/quantization_utils.h
@@ -943,7 +943,8 @@ class TensorflowGemmlowpWorkersPool {
   // The BlockingCounter used to wait for the workers.
   gemmlowp::BlockingCounter counter_to_decrement_when_ready_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(TensorflowGemmlowpWorkersPool);
+  TensorflowGemmlowpWorkersPool(const TensorflowGemmlowpWorkersPool&) = delete;
+  void operator=(const TensorflowGemmlowpWorkersPool&) = delete;
 };
 
 class TensorflowGemmContext : public gemmlowp::MultiThreadGemmContextBase {
@@ -958,7 +959,8 @@ class TensorflowGemmContext : public gemmlowp::MultiThreadGemmContextBase {
  private:
   TensorflowGemmlowpWorkersPool workers_pool_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(TensorflowGemmContext);
+  TensorflowGemmContext(const TensorflowGemmContext&) = delete;
+  void operator=(const TensorflowGemmContext&) = delete;
 };
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/quantized_resize_bilinear_op.cc b/tensorflow/core/kernels/quantized_resize_bilinear_op.cc
index 860c9e1cecada9..2efdd38dc6ef45 100644
--- a/tensorflow/core/kernels/quantized_resize_bilinear_op.cc
+++ b/tensorflow/core/kernels/quantized_resize_bilinear_op.cc
@@ -738,7 +738,8 @@ class QuantizedResizeBilinearOp : public OpKernel {
   bool align_corners_;
   bool half_pixel_centers_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(QuantizedResizeBilinearOp<T>);
+  QuantizedResizeBilinearOp(const QuantizedResizeBilinearOp&) = delete;
+  void operator=(const QuantizedResizeBilinearOp&) = delete;
 };
 
 #define REGISTER_CPU_KERNEL(type)                         \
diff --git a/tensorflow/core/kernels/queue_base.h b/tensorflow/core/kernels/queue_base.h
index 377966def9d2fb..71884a9dff0af9 100644
--- a/tensorflow/core/kernels/queue_base.h
+++ b/tensorflow/core/kernels/queue_base.h
@@ -178,7 +178,8 @@ class QueueBase : public QueueInterface {
   std::deque<Attempt> enqueue_attempts_ TF_GUARDED_BY(mu_);
   std::deque<Attempt> dequeue_attempts_ TF_GUARDED_BY(mu_);
 
-  TF_DISALLOW_COPY_AND_ASSIGN(QueueBase);
+  QueueBase(const QueueBase&) = delete;
+  void operator=(const QueueBase&) = delete;
 };
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/queue_op.h b/tensorflow/core/kernels/queue_op.h
index f30bfcf32ad1d9..5583b79c786e95 100644
--- a/tensorflow/core/kernels/queue_op.h
+++ b/tensorflow/core/kernels/queue_op.h
@@ -100,7 +100,8 @@ class EnqueueOp : public QueueAccessOpKernel {
                     DoneCallback callback) override;
 
  private:
-  TF_DISALLOW_COPY_AND_ASSIGN(EnqueueOp);
+  EnqueueOp(const EnqueueOp&) = delete;
+  void operator=(const EnqueueOp&) = delete;
 };
 
 // Defines an EnqueueManyOp, the execution of which slices each
@@ -127,7 +128,8 @@ class EnqueueManyOp : public QueueAccessOpKernel {
   ~EnqueueManyOp() override;
 
  private:
-  TF_DISALLOW_COPY_AND_ASSIGN(EnqueueManyOp);
+  EnqueueManyOp(const EnqueueManyOp&) = delete;
+  void operator=(const EnqueueManyOp&) = delete;
 };
 
 // Defines a DequeueOp, the execution of which dequeues a tuple of
@@ -148,7 +150,8 @@ class DequeueOp : public QueueAccessOpKernel {
   ~DequeueOp() override;
 
  private:
-  TF_DISALLOW_COPY_AND_ASSIGN(DequeueOp);
+  DequeueOp(const DequeueOp&) = delete;
+  void operator=(const DequeueOp&) = delete;
 };
 
 // Defines a DequeueManyOp, the execution of which concatenates the
@@ -173,7 +176,8 @@ class DequeueManyOp : public QueueAccessOpKernel {
   ~DequeueManyOp() override;
 
  private:
-  TF_DISALLOW_COPY_AND_ASSIGN(DequeueManyOp);
+  DequeueManyOp(const DequeueManyOp&) = delete;
+  void operator=(const DequeueManyOp&) = delete;
 };
 
 // Defines a DequeueUpToOp, the execution of which concatenates the
@@ -216,7 +220,8 @@ class DequeueUpToOp : public QueueAccessOpKernel {
   ~DequeueUpToOp() override;
 
  private:
-  TF_DISALLOW_COPY_AND_ASSIGN(DequeueUpToOp);
+  DequeueUpToOp(const DequeueUpToOp&) = delete;
+  void operator=(const DequeueUpToOp&) = delete;
 };
 
 // Defines a QueueCloseOp, which closes the given Queue. Closing a
@@ -233,7 +238,8 @@ class QueueCloseOp : public QueueOpKernel {
 
  private:
   bool cancel_pending_enqueues_;
-  TF_DISALLOW_COPY_AND_ASSIGN(QueueCloseOp);
+  QueueCloseOp(const QueueCloseOp&) = delete;
+  void operator=(const QueueCloseOp&) = delete;
 };
 
 // Defines a QueueSizeOp, which computes the number of elements in the
@@ -251,7 +257,8 @@ class QueueSizeOp : public QueueOpKernel {
                     DoneCallback callback) override;
 
  private:
-  TF_DISALLOW_COPY_AND_ASSIGN(QueueSizeOp);
+  QueueSizeOp(const QueueSizeOp&) = delete;
+  void operator=(const QueueSizeOp&) = delete;
 };
 
 class QueueIsClosedOp : public QueueOpKernel {
@@ -263,7 +270,8 @@ class QueueIsClosedOp : public QueueOpKernel {
                     DoneCallback callback) override;
 
  private:
-  TF_DISALLOW_COPY_AND_ASSIGN(QueueIsClosedOp);
+  QueueIsClosedOp(const QueueIsClosedOp&) = delete;
+  void operator=(const QueueIsClosedOp&) = delete;
 };
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/random_binomial_op.cc b/tensorflow/core/kernels/random_binomial_op.cc
index b72a70634a838b..8fceaf70c0dbbb 100644
--- a/tensorflow/core/kernels/random_binomial_op.cc
+++ b/tensorflow/core/kernels/random_binomial_op.cc
@@ -433,7 +433,8 @@ class RandomBinomialOp : public OpKernel {
   }
 
  private:
-  TF_DISALLOW_COPY_AND_ASSIGN(RandomBinomialOp);
+  RandomBinomialOp(const RandomBinomialOp&) = delete;
+  void operator=(const RandomBinomialOp&) = delete;
 };
 
 // Samples from a binomial distribution, using the given parameters.
@@ -523,7 +524,8 @@ class StatelessRandomBinomialOp : public OpKernel {
   }
 
  private:
-  TF_DISALLOW_COPY_AND_ASSIGN(StatelessRandomBinomialOp);
+  StatelessRandomBinomialOp(const StatelessRandomBinomialOp&) = delete;
+  void operator=(const StatelessRandomBinomialOp&) = delete;
 };
 
 }  // namespace
diff --git a/tensorflow/core/kernels/random_index_shuffle_ops.cc b/tensorflow/core/kernels/random_index_shuffle_ops.cc
index d75c48f5bb2b1c..6233f982381a81 100644
--- a/tensorflow/core/kernels/random_index_shuffle_ops.cc
+++ b/tensorflow/core/kernels/random_index_shuffle_ops.cc
@@ -134,7 +134,8 @@ class RandomIndexShuffleOp : public OpKernel {
  private:
   int32_t rounds_;  // Number of rounds for the block cipher.
 
-  TF_DISALLOW_COPY_AND_ASSIGN(RandomIndexShuffleOp);
+  RandomIndexShuffleOp(const RandomIndexShuffleOp&) = delete;
+  void operator=(const RandomIndexShuffleOp&) = delete;
 };
 
 #define REGISTER(TYPE)                                        \
diff --git a/tensorflow/core/kernels/random_op.cc b/tensorflow/core/kernels/random_op.cc
index 7764e2d60ea925..129d5fc4379909 100644
--- a/tensorflow/core/kernels/random_op.cc
+++ b/tensorflow/core/kernels/random_op.cc
@@ -321,7 +321,8 @@ class RandomGammaOp : public OpKernel {
  private:
   GuardedPhiloxRandom generator_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(RandomGammaOp);
+  RandomGammaOp(const RandomGammaOp&) = delete;
+  void operator=(const RandomGammaOp&) = delete;
 };
 
 }  // namespace
diff --git a/tensorflow/core/kernels/random_poisson_op.cc b/tensorflow/core/kernels/random_poisson_op.cc
index ccd52b81099c59..3c703f5d0ca0d7 100644
--- a/tensorflow/core/kernels/random_poisson_op.cc
+++ b/tensorflow/core/kernels/random_poisson_op.cc
@@ -317,7 +317,8 @@ class RandomPoissonOp : public OpKernel {
  private:
   GuardedPhiloxRandom generator_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(RandomPoissonOp);
+  RandomPoissonOp(const RandomPoissonOp&) = delete;
+  void operator=(const RandomPoissonOp&) = delete;
 };
 }  // namespace
 
diff --git a/tensorflow/core/kernels/random_shuffle_queue_op.cc b/tensorflow/core/kernels/random_shuffle_queue_op.cc
index 67914eb0f6c793..1f0352a153d7d5 100644
--- a/tensorflow/core/kernels/random_shuffle_queue_op.cc
+++ b/tensorflow/core/kernels/random_shuffle_queue_op.cc
@@ -85,7 +85,8 @@ class RandomShuffleQueue : public TypedQueue<std::vector<Tensor> > {
   random::SingleSampleAdapter<random::PhiloxRandom> generator_
       TF_GUARDED_BY(mu_);
 
-  TF_DISALLOW_COPY_AND_ASSIGN(RandomShuffleQueue);
+  RandomShuffleQueue(const RandomShuffleQueue&) = delete;
+  void operator=(const RandomShuffleQueue&) = delete;
 };
 
 RandomShuffleQueue::RandomShuffleQueue(
@@ -508,7 +509,8 @@ class RandomShuffleQueueOp : public TypedQueueOp {
   int64_t seed2_;
   std::vector<TensorShape> component_shapes_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(RandomShuffleQueueOp);
+  RandomShuffleQueueOp(const RandomShuffleQueueOp&) = delete;
+  void operator=(const RandomShuffleQueueOp&) = delete;
 };
 
 REGISTER_KERNEL_BUILDER(Name("RandomShuffleQueue").Device(DEVICE_CPU),
diff --git a/tensorflow/core/kernels/regex_full_match_op.cc b/tensorflow/core/kernels/regex_full_match_op.cc
index 19198ee5f911d8..ddcc165cf5fd18 100644
--- a/tensorflow/core/kernels/regex_full_match_op.cc
+++ b/tensorflow/core/kernels/regex_full_match_op.cc
@@ -78,7 +78,8 @@ class RegexFullMatchOp : public OpKernel {
   mutex mu_;
   std::shared_ptr<RE2> regex_ TF_GUARDED_BY(mu_);
 
-  TF_DISALLOW_COPY_AND_ASSIGN(RegexFullMatchOp);
+  RegexFullMatchOp(const RegexFullMatchOp&) = delete;
+  void operator=(const RegexFullMatchOp&) = delete;
 };
 
 REGISTER_KERNEL_BUILDER(Name("RegexFullMatch").Device(DEVICE_CPU),
diff --git a/tensorflow/core/kernels/regex_replace_op.cc b/tensorflow/core/kernels/regex_replace_op.cc
index a2a08d7b424d74..7b615aaf401fef 100644
--- a/tensorflow/core/kernels/regex_replace_op.cc
+++ b/tensorflow/core/kernels/regex_replace_op.cc
@@ -115,7 +115,8 @@ class RegexReplaceOp : public OpKernel {
   mutex mu_;
   std::shared_ptr<RE2> regex_ TF_GUARDED_BY(mu_);
 
-  TF_DISALLOW_COPY_AND_ASSIGN(RegexReplaceOp);
+  RegexReplaceOp(const RegexReplaceOp&) = delete;
+  void operator=(const RegexReplaceOp&) = delete;
 };
 
 REGISTER_KERNEL_BUILDER(Name("RegexReplace").Device(DEVICE_CPU),
diff --git a/tensorflow/core/kernels/reverse_sequence_op.cc b/tensorflow/core/kernels/reverse_sequence_op.cc
index 1e1676ab00c06f..139520ece5e2a0 100644
--- a/tensorflow/core/kernels/reverse_sequence_op.cc
+++ b/tensorflow/core/kernels/reverse_sequence_op.cc
@@ -167,7 +167,8 @@ class ReverseSequenceOp : public OpKernel {
   int32 batch_dim_;
   int32 seq_dim_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(ReverseSequenceOp);
+  ReverseSequenceOp(const ReverseSequenceOp&) = delete;
+  void operator=(const ReverseSequenceOp&) = delete;
 };
 
 #define REGISTER_REVERSE_SEQUENCE(type, len_type)                \
diff --git a/tensorflow/core/kernels/sdca_internal.h b/tensorflow/core/kernels/sdca_internal.h
index dd1742d876c050..5f5f8bacd15c1e 100644
--- a/tensorflow/core/kernels/sdca_internal.h
+++ b/tensorflow/core/kernels/sdca_internal.h
@@ -117,7 +117,8 @@ class Regularizations {
   // L1 divided by L2, pre-computed for use during weight shrinking.
   double shrinkage_ = 0;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(Regularizations);
+  Regularizations(const Regularizations&) = delete;
+  void operator=(const Regularizations&) = delete;
 };
 
 class ModelWeights;
@@ -307,7 +308,8 @@ class ModelWeights {
   std::vector<FeatureWeightsSparseStorage> sparse_weights_;
   std::vector<FeatureWeightsDenseStorage> dense_weights_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(ModelWeights);
+  ModelWeights(const ModelWeights&) = delete;
+  void operator=(const ModelWeights&) = delete;
 };
 
 // Examples contains all the training examples that SDCA uses for a mini-batch.
@@ -382,7 +384,8 @@ class Examples {
 
   int num_features_ = 0;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(Examples);
+  Examples(const Examples&) = delete;
+  void operator=(const Examples&) = delete;
 };
 
 }  // namespace sdca
diff --git a/tensorflow/core/kernels/sendrecv_ops.h b/tensorflow/core/kernels/sendrecv_ops.h
index 36bc22db1e7e53..34f27d10c0c7de 100644
--- a/tensorflow/core/kernels/sendrecv_ops.h
+++ b/tensorflow/core/kernels/sendrecv_ops.h
@@ -33,7 +33,8 @@ class SendOp : public OpKernel {
   Rendezvous::ParsedKey parsed_key_;
   bool hostmem_sendrecv_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(SendOp);
+  SendOp(const SendOp&) = delete;
+  void operator=(const SendOp&) = delete;
 };
 
 class RecvOp : public AsyncOpKernel {
@@ -48,7 +49,8 @@ class RecvOp : public AsyncOpKernel {
   Rendezvous::ParsedKey parsed_key_;
   bool hostmem_sendrecv_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(RecvOp);
+  RecvOp(const RecvOp&) = delete;
+  void operator=(const RecvOp&) = delete;
 };
 
 }  // end namespace tensorflow
diff --git a/tensorflow/core/kernels/session_ops.cc b/tensorflow/core/kernels/session_ops.cc
index 2feb79a24e329e..524d9fdce0bb64 100644
--- a/tensorflow/core/kernels/session_ops.cc
+++ b/tensorflow/core/kernels/session_ops.cc
@@ -67,7 +67,8 @@ class GetSessionHandleOp : public OpKernel {
     }
   }
 
-  TF_DISALLOW_COPY_AND_ASSIGN(GetSessionHandleOp);
+  GetSessionHandleOp(const GetSessionHandleOp&) = delete;
+  void operator=(const GetSessionHandleOp&) = delete;
 };
 
 REGISTER_KERNEL_BUILDER(Name("GetSessionHandle").Device(DEVICE_CPU),
@@ -110,7 +111,8 @@ class GetSessionTensorOp : public OpKernel {
     ctx->set_output(0, val);
   }
 
-  TF_DISALLOW_COPY_AND_ASSIGN(GetSessionTensorOp);
+  GetSessionTensorOp(const GetSessionTensorOp&) = delete;
+  void operator=(const GetSessionTensorOp&) = delete;
 };
 
 REGISTER_KERNEL_BUILDER(Name("GetSessionTensor").Device(DEVICE_CPU),
@@ -144,7 +146,8 @@ class DeleteSessionTensorOp : public OpKernel {
     OP_REQUIRES_OK(ctx, session_state->DeleteTensor(name));
   }
 
-  TF_DISALLOW_COPY_AND_ASSIGN(DeleteSessionTensorOp);
+  DeleteSessionTensorOp(const DeleteSessionTensorOp&) = delete;
+  void operator=(const DeleteSessionTensorOp&) = delete;
 };
 
 REGISTER_KERNEL_BUILDER(Name("DeleteSessionTensor").Device(DEVICE_CPU),
diff --git a/tensorflow/core/kernels/sparse/add_op.cc b/tensorflow/core/kernels/sparse/add_op.cc
index 8aa17686ef1669..6dd92e11320043 100644
--- a/tensorflow/core/kernels/sparse/add_op.cc
+++ b/tensorflow/core/kernels/sparse/add_op.cc
@@ -381,7 +381,8 @@ struct CSRSparseMatrixAdd<GPUDevice, T>
   const T beta_;
   bool initialized_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(CSRSparseMatrixAdd);
+  CSRSparseMatrixAdd(const CSRSparseMatrixAdd&) = delete;
+  void operator=(const CSRSparseMatrixAdd&) = delete;
 };
 
 }  // namespace functor
diff --git a/tensorflow/core/kernels/sparse_conditional_accumulator.h b/tensorflow/core/kernels/sparse_conditional_accumulator.h
index 69800cbd6f6831..3b809f03e95dc2 100644
--- a/tensorflow/core/kernels/sparse_conditional_accumulator.h
+++ b/tensorflow/core/kernels/sparse_conditional_accumulator.h
@@ -429,7 +429,8 @@ class SparseConditionalAccumulator
     return true;
   }
 
-  TF_DISALLOW_COPY_AND_ASSIGN(SparseConditionalAccumulator);
+  SparseConditionalAccumulator(const SparseConditionalAccumulator&) = delete;
+  void operator=(const SparseConditionalAccumulator&) = delete;
 };
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/sparse_conditional_accumulator_op.cc b/tensorflow/core/kernels/sparse_conditional_accumulator_op.cc
index d9b5de85819de2..46137639b1d1d7 100644
--- a/tensorflow/core/kernels/sparse_conditional_accumulator_op.cc
+++ b/tensorflow/core/kernels/sparse_conditional_accumulator_op.cc
@@ -53,7 +53,9 @@ class SparseConditionalAccumulatorOp : public ConditionalAccumulatorBaseOp {
     ctx->set_output_ref(0, &mu_, &accumulator_);
   }
 
-  TF_DISALLOW_COPY_AND_ASSIGN(SparseConditionalAccumulatorOp);
+  SparseConditionalAccumulatorOp(const SparseConditionalAccumulatorOp&) =
+      delete;
+  void operator=(const SparseConditionalAccumulatorOp&) = delete;
 };
 
 #define REGISTER_KERNELS(type, dev)                            \
@@ -91,7 +93,9 @@ class SparseAccumulatorApplyGradientOp
   }
 
  private:
-  TF_DISALLOW_COPY_AND_ASSIGN(SparseAccumulatorApplyGradientOp);
+  SparseAccumulatorApplyGradientOp(const SparseAccumulatorApplyGradientOp&) =
+      delete;
+  void operator=(const SparseAccumulatorApplyGradientOp&) = delete;
 };
 
 REGISTER_KERNEL_BUILDER(
@@ -126,7 +130,9 @@ class SparseAccumulatorTakeGradientOp
   }
 
  private:
-  TF_DISALLOW_COPY_AND_ASSIGN(SparseAccumulatorTakeGradientOp);
+  SparseAccumulatorTakeGradientOp(const SparseAccumulatorTakeGradientOp&) =
+      delete;
+  void operator=(const SparseAccumulatorTakeGradientOp&) = delete;
 };
 
 REGISTER_KERNEL_BUILDER(
diff --git a/tensorflow/core/kernels/sparse_matmul_op.cc b/tensorflow/core/kernels/sparse_matmul_op.cc
index 0b86e1a08c6116..6ec0bdd78fae11 100644
--- a/tensorflow/core/kernels/sparse_matmul_op.cc
+++ b/tensorflow/core/kernels/sparse_matmul_op.cc
@@ -839,7 +839,8 @@ class SparseMatMul {
                                        int* KR, int* NR, int* KL, int* JB,
                                        int* IB);
 
-  TF_DISALLOW_COPY_AND_ASSIGN(SparseMatMul);
+  SparseMatMul(const SparseMatMul&) = delete;
+  void operator=(const SparseMatMul&) = delete;
 };
 
 template <typename TL, typename TR,
@@ -979,7 +980,8 @@ class SparseMatMulOp : public OpKernel {
   bool a_is_sparse_;
   bool b_is_sparse_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(SparseMatMulOp);
+  SparseMatMulOp(const SparseMatMulOp&) = delete;
+  void operator=(const SparseMatMulOp&) = delete;
 };
 
 template <typename TL, typename TR>
diff --git a/tensorflow/core/kernels/spectrogram.h b/tensorflow/core/kernels/spectrogram.h
index 62544f575de8f4..4b6b9c8ba687d2 100644
--- a/tensorflow/core/kernels/spectrogram.h
+++ b/tensorflow/core/kernels/spectrogram.h
@@ -117,7 +117,8 @@ class Spectrogram {
   std::vector<int> fft_integer_working_area_;
   std::vector<double> fft_double_working_area_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(Spectrogram);
+  Spectrogram(const Spectrogram&) = delete;
+  void operator=(const Spectrogram&) = delete;
 };
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/stack.h b/tensorflow/core/kernels/stack.h
index e1927e1f28fa21..a9c6a6074059be 100644
--- a/tensorflow/core/kernels/stack.h
+++ b/tensorflow/core/kernels/stack.h
@@ -35,7 +35,8 @@ class StackOp : public OpKernel {
   DataType elem_type_;
   string stack_name_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(StackOp);
+  StackOp(const StackOp&) = delete;
+  void operator=(const StackOp&) = delete;
 };
 
 class StackPushOp : public AsyncOpKernel {
diff --git a/tensorflow/core/kernels/stateless_random_gamma_op.cc b/tensorflow/core/kernels/stateless_random_gamma_op.cc
index c6b92a1705f46d..6b4573d159fb4c 100644
--- a/tensorflow/core/kernels/stateless_random_gamma_op.cc
+++ b/tensorflow/core/kernels/stateless_random_gamma_op.cc
@@ -244,7 +244,8 @@ class StatelessRandomGammaOp : public StatelessRandomOpBase {
                  /*key=*/nullptr, /*counter=*/nullptr, random, samples_flat));
   }
 
-  TF_DISALLOW_COPY_AND_ASSIGN(StatelessRandomGammaOp);
+  StatelessRandomGammaOp(const StatelessRandomGammaOp&) = delete;
+  void operator=(const StatelessRandomGammaOp&) = delete;
 };
 
 // A stateless-random-gamma kernel that takes shape, key, counter and algorithm
diff --git a/tensorflow/core/kernels/stateless_random_ops.cc b/tensorflow/core/kernels/stateless_random_ops.cc
index 9d54c190728d2c..beb2391b03f7de 100644
--- a/tensorflow/core/kernels/stateless_random_ops.cc
+++ b/tensorflow/core/kernels/stateless_random_ops.cc
@@ -196,7 +196,8 @@ class StatelessRandomPoissonOp : public StatelessRandomOpBase {
   }
 
  private:
-  TF_DISALLOW_COPY_AND_ASSIGN(StatelessRandomPoissonOp);
+  StatelessRandomPoissonOp(const StatelessRandomPoissonOp&) = delete;
+  void operator=(const StatelessRandomPoissonOp&) = delete;
 };
 
 #define REGISTER(DEVICE, TYPE)                                              \
diff --git a/tensorflow/core/kernels/string_to_hash_bucket_fast_op.h b/tensorflow/core/kernels/string_to_hash_bucket_fast_op.h
index d22562a0a70546..73c55cf54dc8d5 100644
--- a/tensorflow/core/kernels/string_to_hash_bucket_fast_op.h
+++ b/tensorflow/core/kernels/string_to_hash_bucket_fast_op.h
@@ -58,7 +58,8 @@ class StringToHashBucketOp : public OpKernel {
  private:
   int64_t num_buckets_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(StringToHashBucketOp);
+  StringToHashBucketOp(const StringToHashBucketOp&) = delete;
+  void operator=(const StringToHashBucketOp&) = delete;
 };
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/string_to_hash_bucket_op.cc b/tensorflow/core/kernels/string_to_hash_bucket_op.cc
index 154769a590cae4..3231b6b87cd8d9 100644
--- a/tensorflow/core/kernels/string_to_hash_bucket_op.cc
+++ b/tensorflow/core/kernels/string_to_hash_bucket_op.cc
@@ -54,7 +54,8 @@ class LegacyStringToHashBucketOp : public OpKernel {
  private:
   int64_t num_buckets_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(LegacyStringToHashBucketOp);
+  LegacyStringToHashBucketOp(const LegacyStringToHashBucketOp&) = delete;
+  void operator=(const LegacyStringToHashBucketOp&) = delete;
 };
 
 // StringToHashBucket is deprecated in favor of StringToHashBucketFast/Strong.
diff --git a/tensorflow/core/kernels/string_to_hash_bucket_op.h b/tensorflow/core/kernels/string_to_hash_bucket_op.h
index c8fc90ffbea20a..71fba9b63ad6d1 100644
--- a/tensorflow/core/kernels/string_to_hash_bucket_op.h
+++ b/tensorflow/core/kernels/string_to_hash_bucket_op.h
@@ -66,7 +66,8 @@ class StringToKeyedHashBucketOp : public OpKernel {
   int64_t num_buckets_;
   uint64 key_[2];
 
-  TF_DISALLOW_COPY_AND_ASSIGN(StringToKeyedHashBucketOp);
+  StringToKeyedHashBucketOp(const StringToKeyedHashBucketOp&) = delete;
+  void operator=(const StringToKeyedHashBucketOp&) = delete;
 };
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/sync_ops.cc b/tensorflow/core/kernels/sync_ops.cc
index fe2650512ce398..0e252b8d71fb0f 100644
--- a/tensorflow/core/kernels/sync_ops.cc
+++ b/tensorflow/core/kernels/sync_ops.cc
@@ -29,7 +29,8 @@ class SyncDeviceOp : public OpKernel {
   void Compute(OpKernelContext* context) override {}
 
  private:
-  TF_DISALLOW_COPY_AND_ASSIGN(SyncDeviceOp);
+  SyncDeviceOp(const SyncDeviceOp&) = delete;
+  void operator=(const SyncDeviceOp&) = delete;
 };
 
 REGISTER_KERNEL_BUILDER(Name("SyncDevice").Device(DEVICE_DEFAULT),
@@ -49,7 +50,8 @@ class SyncDeviceGpuOp : public OpKernel {
   }
 
  private:
-  TF_DISALLOW_COPY_AND_ASSIGN(SyncDeviceGpuOp);
+  SyncDeviceGpuOp(const SyncDeviceGpuOp&) = delete;
+  void operator=(const SyncDeviceGpuOp&) = delete;
 };
 
 REGISTER_KERNEL_BUILDER(Name("SyncDevice").Device(DEVICE_GPU), SyncDeviceGpuOp);
diff --git a/tensorflow/core/kernels/tensor_array_ops.cc b/tensorflow/core/kernels/tensor_array_ops.cc
index 82673a4b600cf0..dc4f1ca9400bf2 100644
--- a/tensorflow/core/kernels/tensor_array_ops.cc
+++ b/tensorflow/core/kernels/tensor_array_ops.cc
@@ -231,7 +231,8 @@ class TensorArrayOp : public TensorArrayCreationOp {
   bool clear_after_read_;
   string tensor_array_name_;  // The name used to create the TensorArray.
 
-  TF_DISALLOW_COPY_AND_ASSIGN(TensorArrayOp);
+  TensorArrayOp(const TensorArrayOp&) = delete;
+  void operator=(const TensorArrayOp&) = delete;
 };
 
 REGISTER_KERNEL_BUILDER(Name("TensorArray").Device(DEVICE_CPU), TensorArrayOp);
@@ -407,7 +408,8 @@ class TensorArrayGradOp : public TensorArrayCreationOp {
   // call.  Typical values look like "gradients", "gradients_1", ...
   string source_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(TensorArrayGradOp);
+  TensorArrayGradOp(const TensorArrayGradOp&) = delete;
+  void operator=(const TensorArrayGradOp&) = delete;
 };
 
 REGISTER_KERNEL_BUILDER(Name("TensorArrayGrad").Device(DEVICE_CPU),
diff --git a/tensorflow/core/kernels/tensor_to_hash_bucket_op.cc b/tensorflow/core/kernels/tensor_to_hash_bucket_op.cc
index d3bf7b5808f8df..d031461318df3e 100644
--- a/tensorflow/core/kernels/tensor_to_hash_bucket_op.cc
+++ b/tensorflow/core/kernels/tensor_to_hash_bucket_op.cc
@@ -60,7 +60,8 @@ class TensorToHashBucketOp : public OpKernel {
  private:
   int64_t num_buckets_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(TensorToHashBucketOp);
+  TensorToHashBucketOp(const TensorToHashBucketOp&) = delete;
+  void operator=(const TensorToHashBucketOp&) = delete;
 };
 
 #define REGISTER_CPU_KERNELS(type)                        \
diff --git a/tensorflow/core/kernels/tile_ops.cc b/tensorflow/core/kernels/tile_ops.cc
index 055820c48227e5..550c54ed59b147 100644
--- a/tensorflow/core/kernels/tile_ops.cc
+++ b/tensorflow/core/kernels/tile_ops.cc
@@ -256,7 +256,8 @@ class TileOp : public OpKernel {
                   const gtl::ArraySlice<Tmultiples> multiples_array,
                   Tensor* result);
 
-  TF_DISALLOW_COPY_AND_ASSIGN(TileOp);
+  TileOp(const TileOp&) = delete;
+  void operator=(const TileOp&) = delete;
 };
 
 template <typename Device, typename Tmultiples>
@@ -508,7 +509,8 @@ class TileGradientOp : public OpKernel {
         context->input(0).tensor<T, NDIM>(), reduce_dim, reshape_dim);
   }
 
-  TF_DISALLOW_COPY_AND_ASSIGN(TileGradientOp);
+  TileGradientOp(const TileGradientOp&) = delete;
+  void operator=(const TileGradientOp&) = delete;
 };
 
 template <typename Device, typename Tmultiples>
diff --git a/tensorflow/core/kernels/variable_ops.h b/tensorflow/core/kernels/variable_ops.h
index 51252221f06314..035b583a5238e6 100644
--- a/tensorflow/core/kernels/variable_ops.h
+++ b/tensorflow/core/kernels/variable_ops.h
@@ -38,7 +38,8 @@ class VariableOp : public OpKernel {
   TensorShape shape_;
   ContainerInfo cinfo_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(VariableOp);
+  VariableOp(const VariableOp&) = delete;
+  void operator=(const VariableOp&) = delete;
 };
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/where_op.cc b/tensorflow/core/kernels/where_op.cc
index 3f7d90808ad12f..2478fed3465375 100644
--- a/tensorflow/core/kernels/where_op.cc
+++ b/tensorflow/core/kernels/where_op.cc
@@ -191,7 +191,8 @@ class WhereCPUOp : public OpKernel {
   }
 
  private:
-  TF_DISALLOW_COPY_AND_ASSIGN(WhereCPUOp);
+  WhereCPUOp(const WhereCPUOp&) = delete;
+  void operator=(const WhereCPUOp&) = delete;
 };
 
 #define REGISTER_WHERE_OP(T) \
@@ -357,7 +358,8 @@ class WhereGPUOp : public AsyncOpKernel {
   }
 
  private:
-  TF_DISALLOW_COPY_AND_ASSIGN(WhereGPUOp);
+  WhereGPUOp(const WhereGPUOp&) = delete;
+  void operator=(const WhereGPUOp&) = delete;
 };
 
 #define REGISTER_GPU_WHERE_OP(T) \
diff --git a/tensorflow/core/lib/core/arena.h b/tensorflow/core/lib/core/arena.h
index 4b791afa75f4e1..14d80422496bd2 100644
--- a/tensorflow/core/lib/core/arena.h
+++ b/tensorflow/core/lib/core/arena.h
@@ -101,7 +101,8 @@ class Arena {
 
   void FreeBlocks();  // Frees all except first block
 
-  TF_DISALLOW_COPY_AND_ASSIGN(Arena);
+  Arena(const Arena&) = delete;
+  void operator=(const Arena&) = delete;
 };
 
 }  // namespace core
diff --git a/tensorflow/core/lib/db/sqlite.h b/tensorflow/core/lib/db/sqlite.h
index f645a001745abb..5c286403aa687e 100644
--- a/tensorflow/core/lib/db/sqlite.h
+++ b/tensorflow/core/lib/db/sqlite.h
@@ -129,7 +129,8 @@ class TF_LOCKABLE Sqlite : public core::RefCounted {
   sqlite3_stmt* const rollback_;
   bool is_in_transaction_ = false;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(Sqlite);
+  Sqlite(const Sqlite&) = delete;
+  void operator=(const Sqlite&) = delete;
 };
 
 /// \brief SQLite prepared statement.
@@ -374,7 +375,8 @@ class SqliteStatement {
   int bind_error_parameter_ = 0;
   uint64 size_ = 0;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(SqliteStatement);
+  SqliteStatement(const SqliteStatement&) = delete;
+  void operator=(const SqliteStatement&) = delete;
 };
 
 /// \brief Reentrant SQLite connection object lock
@@ -400,7 +402,8 @@ class TF_SCOPED_LOCKABLE SqliteLock {
  private:
   sqlite3_mutex* const mutex_;
   bool is_locked_ = true;
-  TF_DISALLOW_COPY_AND_ASSIGN(SqliteLock);
+  SqliteLock(const SqliteLock&) = delete;
+  void operator=(const SqliteLock&) = delete;
 };
 #define SqliteLock(x) static_assert(0, "sqlite_lock_decl_missing_name");
 
@@ -431,7 +434,8 @@ class TF_SCOPED_LOCKABLE SqliteTransaction {
   void Begin();
   Sqlite* const db_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(SqliteTransaction);
+  SqliteTransaction(const SqliteTransaction&) = delete;
+  void operator=(const SqliteTransaction&) = delete;
 };
 
 #define SQLITE_EXCLUSIVE_TRANSACTIONS_REQUIRED(...) \
diff --git a/tensorflow/core/lib/strings/ordered_code.h b/tensorflow/core/lib/strings/ordered_code.h
index 2492440a99ed2d..bfccfc54938d7a 100644
--- a/tensorflow/core/lib/strings/ordered_code.h
+++ b/tensorflow/core/lib/strings/ordered_code.h
@@ -85,7 +85,8 @@ class OrderedCode {
  private:
   // This has only static methods, so disallow construction entirely
   OrderedCode();
-  TF_DISALLOW_COPY_AND_ASSIGN(OrderedCode);
+  OrderedCode(const OrderedCode&) = delete;
+  void operator=(const OrderedCode&) = delete;
 };
 
 }  // namespace strings
diff --git a/tensorflow/core/lib/strings/proto_text_util.h b/tensorflow/core/lib/strings/proto_text_util.h
index 7243a0b1e04c03..44559578da02e9 100644
--- a/tensorflow/core/lib/strings/proto_text_util.h
+++ b/tensorflow/core/lib/strings/proto_text_util.h
@@ -116,7 +116,8 @@ class ProtoTextOutput {
   // current deepest level of nesting.
   bool level_empty_ = true;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(ProtoTextOutput);
+  ProtoTextOutput(const ProtoTextOutput&) = delete;
+  void operator=(const ProtoTextOutput&) = delete;
 };
 
 inline void ProtoSpaceAndComments(Scanner* scanner) {
diff --git a/tensorflow/core/nccl/nccl_manager.h b/tensorflow/core/nccl/nccl_manager.h
index 671fc8151cf3cd..721ac3f6201143 100644
--- a/tensorflow/core/nccl/nccl_manager.h
+++ b/tensorflow/core/nccl/nccl_manager.h
@@ -271,7 +271,8 @@ class NcclManager {
 
   Status status_ TF_GUARDED_BY(mu_);
 
-  TF_DISALLOW_COPY_AND_ASSIGN(NcclManager);
+  NcclManager(const NcclManager&) = delete;
+  void operator=(const NcclManager&) = delete;
 };
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/profiler/backends/gpu/cupti_wrapper.h b/tensorflow/core/profiler/backends/gpu/cupti_wrapper.h
index 029dafe0dda0c7..331828f31bec31 100644
--- a/tensorflow/core/profiler/backends/gpu/cupti_wrapper.h
+++ b/tensorflow/core/profiler/backends/gpu/cupti_wrapper.h
@@ -176,7 +176,8 @@ class CuptiWrapper : public tensorflow::profiler::CuptiInterface {
   bool Disabled() const override { return false; }
 
  private:
-  TF_DISALLOW_COPY_AND_ASSIGN(CuptiWrapper);
+  CuptiWrapper(const CuptiWrapper&) = delete;
+  void operator=(const CuptiWrapper&) = delete;
 };
 
 }  // namespace profiler
diff --git a/tensorflow/core/profiler/utils/cost_utils.h b/tensorflow/core/profiler/utils/cost_utils.h
index a778bca533025e..7ea14fe907d096 100644
--- a/tensorflow/core/profiler/utils/cost_utils.h
+++ b/tensorflow/core/profiler/utils/cost_utils.h
@@ -50,7 +50,8 @@ class TfOpRoofLineCostEstimator
   absl::flat_hash_set<std::string>
       unsupported_ops_;  // summary for unsupported ops.
 
-  TF_DISALLOW_COPY_AND_ASSIGN(TfOpRoofLineCostEstimator);
+  TfOpRoofLineCostEstimator(const TfOpRoofLineCostEstimator&) = delete;
+  void operator=(const TfOpRoofLineCostEstimator&) = delete;
 };
 
 }  // namespace profiler
diff --git a/tensorflow/core/runtime_fallback/test/test_opkernels.cc b/tensorflow/core/runtime_fallback/test/test_opkernels.cc
index b795579d0a16ec..3616ab7a76ba09 100644
--- a/tensorflow/core/runtime_fallback/test/test_opkernels.cc
+++ b/tensorflow/core/runtime_fallback/test/test_opkernels.cc
@@ -48,7 +48,8 @@ class TestAsyncIdentityKernel : public AsyncOpKernel {
   }
 
  private:
-  TF_DISALLOW_COPY_AND_ASSIGN(TestAsyncIdentityKernel);
+  TestAsyncIdentityKernel(const TestAsyncIdentityKernel&) = delete;
+  void operator=(const TestAsyncIdentityKernel&) = delete;
 };
 
 REGISTER_KERNEL_BUILDER(Name("TestAsyncIdentity").Device(DEVICE_CPU),
@@ -89,7 +90,9 @@ class TestAsyncTfrtAsyncThreadKernel : public AsyncOpKernel {
  private:
   mutex mu_;
   std::unique_ptr<Thread> thread_ TF_GUARDED_BY(mu_);
-  TF_DISALLOW_COPY_AND_ASSIGN(TestAsyncTfrtAsyncThreadKernel);
+  TestAsyncTfrtAsyncThreadKernel(const TestAsyncTfrtAsyncThreadKernel&) =
+      delete;
+  void operator=(const TestAsyncTfrtAsyncThreadKernel&) = delete;
 };
 
 REGISTER_KERNEL_BUILDER(Name("TestAsyncTfrtAsyncThread").Device(DEVICE_CPU),
@@ -112,7 +115,8 @@ class TestPrintThreadNameKernel : public OpKernel {
   }
 
  private:
-  TF_DISALLOW_COPY_AND_ASSIGN(TestPrintThreadNameKernel);
+  TestPrintThreadNameKernel(const TestPrintThreadNameKernel&) = delete;
+  void operator=(const TestPrintThreadNameKernel&) = delete;
 };
 
 REGISTER_KERNEL_BUILDER(Name("TestPrintThreadName").Device(DEVICE_CPU),
diff --git a/tensorflow/core/summary/summary_db_writer.cc b/tensorflow/core/summary/summary_db_writer.cc
index e96a01751619ac..7f6c69c50681e5 100644
--- a/tensorflow/core/summary/summary_db_writer.cc
+++ b/tensorflow/core/summary/summary_db_writer.cc
@@ -210,7 +210,8 @@ class IdAllocator {
   Sqlite* const db_;
   int tier_ TF_GUARDED_BY(mu_) = 0;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(IdAllocator);
+  IdAllocator(const IdAllocator&) = delete;
+  void operator=(const IdAllocator&) = delete;
 };
 
 class GraphWriter {
@@ -372,7 +373,8 @@ class GraphWriter {
   std::vector<string> name_copies_;
   std::unordered_map<StringPiece, int64_t, StringPieceHasher> name_to_node_id_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(GraphWriter);
+  GraphWriter(const GraphWriter&) = delete;
+  void operator=(const GraphWriter&) = delete;
 };
 
 /// \brief Run metadata manager.
@@ -619,7 +621,8 @@ class RunMetadata {
   double run_started_time_ TF_GUARDED_BY(mu_) = 0.0;
   std::unordered_map<string, int64_t> tag_ids_ TF_GUARDED_BY(mu_);
 
-  TF_DISALLOW_COPY_AND_ASSIGN(RunMetadata);
+  RunMetadata(const RunMetadata&) = delete;
+  void operator=(const RunMetadata&) = delete;
 };
 
 /// \brief Tensor writer for a single series, e.g. Tag.
@@ -817,7 +820,8 @@ class SeriesWriter {
   std::deque<int64_t> rowids_ TF_GUARDED_BY(mu_);
   uint64 unflushed_bytes_ TF_GUARDED_BY(mu_) = 0;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(SeriesWriter);
+  SeriesWriter(const SeriesWriter&) = delete;
+  void operator=(const SeriesWriter&) = delete;
 };
 
 /// \brief Tensor writer for a single Run.
@@ -869,7 +873,8 @@ class RunWriter {
   std::unordered_map<int64_t, std::unique_ptr<SeriesWriter>> series_writers_
       TF_GUARDED_BY(mu_);
 
-  TF_DISALLOW_COPY_AND_ASSIGN(RunWriter);
+  RunWriter(const RunWriter&) = delete;
+  void operator=(const RunWriter&) = delete;
 };
 
 /// \brief SQLite implementation of SummaryWriterInterface.
diff --git a/tensorflow/core/tfrt/mlrt/kernel/kernel_test.cc b/tensorflow/core/tfrt/mlrt/kernel/kernel_test.cc
index a38d0af303358a..e38f2c8c4dab79 100644
--- a/tensorflow/core/tfrt/mlrt/kernel/kernel_test.cc
+++ b/tensorflow/core/tfrt/mlrt/kernel/kernel_test.cc
@@ -424,7 +424,8 @@ class TestAsyncIdentityKernel : public AsyncOpKernel {
   }
 
  private:
-  TF_DISALLOW_COPY_AND_ASSIGN(TestAsyncIdentityKernel);
+  TestAsyncIdentityKernel(const TestAsyncIdentityKernel&) = delete;
+  void operator=(const TestAsyncIdentityKernel&) = delete;
 };
 
 REGISTER_KERNEL_BUILDER(Name("TestAsyncIdentity").Device(DEVICE_CPU),
diff --git a/tensorflow/core/tpu/kernels/cross_replica_ops.cc b/tensorflow/core/tpu/kernels/cross_replica_ops.cc
index df08192c414d20..24a8ee467f63a7 100644
--- a/tensorflow/core/tpu/kernels/cross_replica_ops.cc
+++ b/tensorflow/core/tpu/kernels/cross_replica_ops.cc
@@ -62,7 +62,8 @@ class CrossReplicaSumOp : public XlaOpKernel {
   }
 
  private:
-  TF_DISALLOW_COPY_AND_ASSIGN(CrossReplicaSumOp);
+  CrossReplicaSumOp(const CrossReplicaSumOp&) = delete;
+  void operator=(const CrossReplicaSumOp&) = delete;
 };
 
 class AllToAllOp : public XlaOpKernel {
@@ -90,7 +91,8 @@ class AllToAllOp : public XlaOpKernel {
   int64_t concat_dimension_;
   int64_t split_count_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(AllToAllOp);
+  AllToAllOp(const AllToAllOp&) = delete;
+  void operator=(const AllToAllOp&) = delete;
 };
 
 class CollectivePermuteOp : public XlaOpKernel {
@@ -121,7 +123,8 @@ class CollectivePermuteOp : public XlaOpKernel {
   }
 
  private:
-  TF_DISALLOW_COPY_AND_ASSIGN(CollectivePermuteOp);
+  CollectivePermuteOp(const CollectivePermuteOp&) = delete;
+  void operator=(const CollectivePermuteOp&) = delete;
 };
 
 REGISTER_XLA_OP(Name("AllToAll").CompileTimeConstantInput("group_assignment"),
diff --git a/tensorflow/core/tpu/kernels/sparse_core_xla_ops.cc b/tensorflow/core/tpu/kernels/sparse_core_xla_ops.cc
index 964e5de0dedb4d..b83e33bbb43693 100644
--- a/tensorflow/core/tpu/kernels/sparse_core_xla_ops.cc
+++ b/tensorflow/core/tpu/kernels/sparse_core_xla_ops.cc
@@ -174,7 +174,8 @@ class XlaSparseDenseMatmulOp : public XlaOpKernel {
   int max_ids_per_partition_;
   int max_unique_ids_per_partition_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(XlaSparseDenseMatmulOp);
+  XlaSparseDenseMatmulOp(const XlaSparseDenseMatmulOp&) = delete;
+  void operator=(const XlaSparseDenseMatmulOp&) = delete;
 };
 
 REGISTER_XLA_OP(Name("XlaSparseDenseMatmul"), XlaSparseDenseMatmulOp);
@@ -321,7 +322,9 @@ class XlaSparseDenseMatmulWithCsrInputOp : public XlaOpKernel {
   std::optional<int> quantization_config_num_buckets_;
   std::string table_name_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(XlaSparseDenseMatmulWithCsrInputOp);
+  XlaSparseDenseMatmulWithCsrInputOp(
+      const XlaSparseDenseMatmulWithCsrInputOp&) = delete;
+  void operator=(const XlaSparseDenseMatmulWithCsrInputOp&) = delete;
 };
 
 REGISTER_XLA_OP(Name("XlaSparseDenseMatmulWithCsrInput"),
@@ -451,7 +454,9 @@ class XlaSparseDenseMatmulGradWithCsrInputBase : public XlaOpKernel {
  private:
   std::string table_name_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(XlaSparseDenseMatmulGradWithCsrInputBase);
+  XlaSparseDenseMatmulGradWithCsrInputBase(
+      const XlaSparseDenseMatmulGradWithCsrInputBase&) = delete;
+  void operator=(const XlaSparseDenseMatmulGradWithCsrInputBase&) = delete;
 };
 
 // This TensorFlow op calculates the gradients and performs SGD update on the
@@ -514,7 +519,9 @@ class XlaSparseDenseMatmulGradWithSgdAndCsrInputOp
   }
 
  private:
-  TF_DISALLOW_COPY_AND_ASSIGN(XlaSparseDenseMatmulGradWithSgdAndCsrInputOp);
+  XlaSparseDenseMatmulGradWithSgdAndCsrInputOp(
+      const XlaSparseDenseMatmulGradWithSgdAndCsrInputOp&) = delete;
+  void operator=(const XlaSparseDenseMatmulGradWithSgdAndCsrInputOp&) = delete;
 };
 
 REGISTER_XLA_OP(Name("XlaSparseDenseMatmulGradWithSgdAndCsrInput"),
@@ -591,7 +598,10 @@ class XlaSparseDenseMatmulGradWithAdagradAndCsrInputOp
   }
 
  private:
-  TF_DISALLOW_COPY_AND_ASSIGN(XlaSparseDenseMatmulGradWithAdagradAndCsrInputOp);
+  XlaSparseDenseMatmulGradWithAdagradAndCsrInputOp(
+      const XlaSparseDenseMatmulGradWithAdagradAndCsrInputOp&) = delete;
+  void operator=(const XlaSparseDenseMatmulGradWithAdagradAndCsrInputOp&) =
+      delete;
 };
 
 REGISTER_XLA_OP(Name("XlaSparseDenseMatmulGradWithAdagradAndCsrInput"),
@@ -834,7 +844,9 @@ class XlaSparseDenseMatmulGradWithAdamAndCsrInputOp
   float beta2_;
   float epsilon_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(XlaSparseDenseMatmulGradWithAdamAndCsrInputOp);
+  XlaSparseDenseMatmulGradWithAdamAndCsrInputOp(
+      const XlaSparseDenseMatmulGradWithAdamAndCsrInputOp&) = delete;
+  void operator=(const XlaSparseDenseMatmulGradWithAdamAndCsrInputOp&) = delete;
 };
 
 REGISTER_XLA_OP(Name("XlaSparseDenseMatmulGradWithAdamAndCsrInput"),
@@ -992,7 +1004,9 @@ class XlaSparseDenseMatmulGradWithFtrlAndCsrInputOp
   float l1_regularization_strength_;
   float l2_regularization_strength_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(XlaSparseDenseMatmulGradWithFtrlAndCsrInputOp);
+  XlaSparseDenseMatmulGradWithFtrlAndCsrInputOp(
+      const XlaSparseDenseMatmulGradWithFtrlAndCsrInputOp&) = delete;
+  void operator=(const XlaSparseDenseMatmulGradWithFtrlAndCsrInputOp&) = delete;
 };
 
 REGISTER_XLA_OP(Name("XlaSparseDenseMatmulGradWithFtrlAndCsrInput"),
@@ -1049,7 +1063,8 @@ class XlaSparseCoreOptimizerOpBase : public XlaOpKernel {
  protected:
   int feature_width_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(XlaSparseCoreOptimizerOpBase);
+  XlaSparseCoreOptimizerOpBase(const XlaSparseCoreOptimizerOpBase&) = delete;
+  void operator=(const XlaSparseCoreOptimizerOpBase&) = delete;
 };
 
 // This class uses the SGD optimizer to update the embedding table weights.
@@ -1079,7 +1094,8 @@ class XlaSparseCoreSgdOp : public XlaSparseCoreOptimizerOpBase {
     ctx->SetOutput(0, result);
   }
 
-  TF_DISALLOW_COPY_AND_ASSIGN(XlaSparseCoreSgdOp);
+  XlaSparseCoreSgdOp(const XlaSparseCoreSgdOp&) = delete;
+  void operator=(const XlaSparseCoreSgdOp&) = delete;
 };
 
 REGISTER_XLA_OP(Name("XlaSparseCoreSgd"), XlaSparseCoreSgdOp);
@@ -1118,7 +1134,8 @@ class XlaSparseCoreAdagradOp : public XlaSparseCoreOptimizerOpBase {
     ctx->SetOutput(1, accum_result);
   }
 
-  TF_DISALLOW_COPY_AND_ASSIGN(XlaSparseCoreAdagradOp);
+  XlaSparseCoreAdagradOp(const XlaSparseCoreAdagradOp&) = delete;
+  void operator=(const XlaSparseCoreAdagradOp&) = delete;
 };
 
 REGISTER_XLA_OP(Name("XlaSparseCoreAdagrad"), XlaSparseCoreAdagradOp);
@@ -1199,7 +1216,9 @@ class XlaSparseCoreAdagradMomentumOp : public XlaSparseCoreOptimizerOpBase {
     ctx->SetOutput(2, momen_result);
   }
 
-  TF_DISALLOW_COPY_AND_ASSIGN(XlaSparseCoreAdagradMomentumOp);
+  XlaSparseCoreAdagradMomentumOp(const XlaSparseCoreAdagradMomentumOp&) =
+      delete;
+  void operator=(const XlaSparseCoreAdagradMomentumOp&) = delete;
 
  private:
   bool use_nesterov_;
@@ -1276,7 +1295,8 @@ class XlaSparseCoreAdamOp : public XlaSparseCoreOptimizerOpBase {
     ctx->SetOutput(2, momen_result);
   }
 
-  TF_DISALLOW_COPY_AND_ASSIGN(XlaSparseCoreAdamOp);
+  XlaSparseCoreAdamOp(const XlaSparseCoreAdamOp&) = delete;
+  void operator=(const XlaSparseCoreAdamOp&) = delete;
 
  private:
   bool use_sum_inside_sqrt_;
@@ -1388,7 +1408,8 @@ class XlaSparseCoreFtrlOp : public XlaSparseCoreOptimizerOpBase {
     ctx->SetOutput(2, linear_result);
   }
 
-  TF_DISALLOW_COPY_AND_ASSIGN(XlaSparseCoreFtrlOp);
+  XlaSparseCoreFtrlOp(const XlaSparseCoreFtrlOp&) = delete;
+  void operator=(const XlaSparseCoreFtrlOp&) = delete;
 
  private:
   bool multiply_linear_by_learning_rate_;
diff --git a/tensorflow/core/tpu/kernels/sparse_core_xla_ops.h b/tensorflow/core/tpu/kernels/sparse_core_xla_ops.h
index c5eb0b09fb9e18..0a4b9183a04184 100644
--- a/tensorflow/core/tpu/kernels/sparse_core_xla_ops.h
+++ b/tensorflow/core/tpu/kernels/sparse_core_xla_ops.h
@@ -39,7 +39,9 @@ class UseSparseCoreFrontendAttributes {
   xla::XlaBuilder* builder_;
   const xla::FrontendAttributes original_frontend_attributes_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(UseSparseCoreFrontendAttributes);
+  UseSparseCoreFrontendAttributes(const UseSparseCoreFrontendAttributes&) =
+      delete;
+  void operator=(const UseSparseCoreFrontendAttributes&) = delete;
 };
 
 #endif  // TENSORFLOW_CORE_TPU_KERNELS_SPARSE_CORE_XLA_OPS_H_
diff --git a/tensorflow/core/tpu/kernels/tpu_compilation_cache_entry_unloader.h b/tensorflow/core/tpu/kernels/tpu_compilation_cache_entry_unloader.h
index 7b758b08ce1123..58372aa8101233 100644
--- a/tensorflow/core/tpu/kernels/tpu_compilation_cache_entry_unloader.h
+++ b/tensorflow/core/tpu/kernels/tpu_compilation_cache_entry_unloader.h
@@ -64,7 +64,9 @@ class TpuCompilationCacheEntryUnloader : public ResourceBase {
   }
 
  private:
-  TF_DISALLOW_COPY_AND_ASSIGN(TpuCompilationCacheEntryUnloader);
+  TpuCompilationCacheEntryUnloader(const TpuCompilationCacheEntryUnloader&) =
+      delete;
+  void operator=(const TpuCompilationCacheEntryUnloader&) = delete;
   mutable absl::Mutex mu_;
   TpuCompilationCacheInterface* cache_;  // Not owned.
   absl::flat_hash_set<int64_t> cache_entry_uids_ ABSL_GUARDED_BY(mu_);
diff --git a/tensorflow/core/tpu/kernels/tpu_embedding_configuration_ops.cc b/tensorflow/core/tpu/kernels/tpu_embedding_configuration_ops.cc
index 476e260f8d7a10..40bc59a6b82fb0 100644
--- a/tensorflow/core/tpu/kernels/tpu_embedding_configuration_ops.cc
+++ b/tensorflow/core/tpu/kernels/tpu_embedding_configuration_ops.cc
@@ -96,7 +96,9 @@ class ExecuteTPUEmbeddingPartitionerOp : public OpKernel {
   // The embedding layer configuration for the TPUEmbedding host software.
   std::string config_string_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(ExecuteTPUEmbeddingPartitionerOp);
+  ExecuteTPUEmbeddingPartitionerOp(const ExecuteTPUEmbeddingPartitionerOp&) =
+      delete;
+  void operator=(const ExecuteTPUEmbeddingPartitionerOp&) = delete;
 };
 
 // Initializes the HBM memory addresses and segments on each host.
@@ -147,7 +149,8 @@ class ConfigureTPUEmbeddingMemoryOp : public OpKernel {
   ~ConfigureTPUEmbeddingMemoryOp() override {}
 
  private:
-  TF_DISALLOW_COPY_AND_ASSIGN(ConfigureTPUEmbeddingMemoryOp);
+  ConfigureTPUEmbeddingMemoryOp(const ConfigureTPUEmbeddingMemoryOp&) = delete;
+  void operator=(const ConfigureTPUEmbeddingMemoryOp&) = delete;
 };
 
 // Merges the memory configurations of all hosts into one
@@ -208,7 +211,8 @@ class CollateTPUEmbeddingMemoryOp : public OpKernel {
   ~CollateTPUEmbeddingMemoryOp() override {}
 
  private:
-  TF_DISALLOW_COPY_AND_ASSIGN(CollateTPUEmbeddingMemoryOp);
+  CollateTPUEmbeddingMemoryOp(const CollateTPUEmbeddingMemoryOp&) = delete;
+  void operator=(const CollateTPUEmbeddingMemoryOp&) = delete;
 };
 
 // The ConfigureTpuEmbeddingHost op is used to set up the TPUEmbedding host
@@ -283,7 +287,8 @@ class ConfigureTPUEmbeddingHostOp : public OpKernel {
   // The embedding layer configuration for the TPUEmbedding host software.
   std::string config_string_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(ConfigureTPUEmbeddingHostOp);
+  ConfigureTPUEmbeddingHostOp(const ConfigureTPUEmbeddingHostOp&) = delete;
+  void operator=(const ConfigureTPUEmbeddingHostOp&) = delete;
 };
 
 // The ConnectTpuEmbeddingHosts op is used to set up gRPC connections
@@ -329,7 +334,8 @@ class ConnectTPUEmbeddingHostsOp : public OpKernel {
   ~ConnectTPUEmbeddingHostsOp() override {}
 
  private:
-  TF_DISALLOW_COPY_AND_ASSIGN(ConnectTPUEmbeddingHostsOp);
+  ConnectTPUEmbeddingHostsOp(const ConnectTPUEmbeddingHostsOp&) = delete;
+  void operator=(const ConnectTPUEmbeddingHostsOp&) = delete;
 };
 
 // The FinalizeTpuEmbeddingOp op is used to update TpuMeshCommonState and
@@ -378,7 +384,8 @@ class FinalizeTPUEmbeddingOp : public OpKernel {
   ~FinalizeTPUEmbeddingOp() override {}
 
  private:
-  TF_DISALLOW_COPY_AND_ASSIGN(FinalizeTPUEmbeddingOp);
+  FinalizeTPUEmbeddingOp(const FinalizeTPUEmbeddingOp&) = delete;
+  void operator=(const FinalizeTPUEmbeddingOp&) = delete;
 };
 
 // The IsTPUEmbeddingInitializedOp is used to check whether the TPU
@@ -419,7 +426,8 @@ class IsTPUEmbeddingInitializedOp : public OpKernel {
 
  private:
   std::string config_string_;
-  TF_DISALLOW_COPY_AND_ASSIGN(IsTPUEmbeddingInitializedOp);
+  IsTPUEmbeddingInitializedOp(const IsTPUEmbeddingInitializedOp&) = delete;
+  void operator=(const IsTPUEmbeddingInitializedOp&) = delete;
 };
 
 // These ops execute on the CPU devices of TPU worker tasks.
diff --git a/tensorflow/core/tpu/kernels/tpu_embedding_enqueue_ops.cc b/tensorflow/core/tpu/kernels/tpu_embedding_enqueue_ops.cc
index 31498d8b31347a..96ba38b12129df 100644
--- a/tensorflow/core/tpu/kernels/tpu_embedding_enqueue_ops.cc
+++ b/tensorflow/core/tpu/kernels/tpu_embedding_enqueue_ops.cc
@@ -200,7 +200,9 @@ class EnqueueTPUEmbeddingArbitraryTensorBatchOp : public OpKernel {
   bool device_ordinal_set_ = false;
   TpuEmbedding_TensorBatchFixedState* fixed_state_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(EnqueueTPUEmbeddingArbitraryTensorBatchOp);
+  EnqueueTPUEmbeddingArbitraryTensorBatchOp(
+      const EnqueueTPUEmbeddingArbitraryTensorBatchOp&) = delete;
+  void operator=(const EnqueueTPUEmbeddingArbitraryTensorBatchOp&) = delete;
 };
 
 #ifdef LIBTPU_ON_GCE
diff --git a/tensorflow/core/tpu/kernels/tpu_embedding_load_retrieve_ops.h b/tensorflow/core/tpu/kernels/tpu_embedding_load_retrieve_ops.h
index 08a532181ba752..51459c6a028341 100644
--- a/tensorflow/core/tpu/kernels/tpu_embedding_load_retrieve_ops.h
+++ b/tensorflow/core/tpu/kernels/tpu_embedding_load_retrieve_ops.h
@@ -54,7 +54,9 @@ class LoadAllTPUEmbeddingParametersOp : public OpKernel {
   tpu::TPUEmbeddingConfiguration config_;
   std::vector<TensorShape> table_shapes_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(LoadAllTPUEmbeddingParametersOp);
+  LoadAllTPUEmbeddingParametersOp(const LoadAllTPUEmbeddingParametersOp&) =
+      delete;
+  void operator=(const LoadAllTPUEmbeddingParametersOp&) = delete;
 };
 
 // The RetrieveAllTPUEmbeddingParameters op is used to retrieve updated
@@ -87,7 +89,9 @@ class RetrieveAllTPUEmbeddingParametersOp : public OpKernel {
   tpu::TPUEmbeddingConfiguration config_;
   std::vector<TensorShape> table_shapes_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(RetrieveAllTPUEmbeddingParametersOp);
+  RetrieveAllTPUEmbeddingParametersOp(
+      const RetrieveAllTPUEmbeddingParametersOp&) = delete;
+  void operator=(const RetrieveAllTPUEmbeddingParametersOp&) = delete;
 };
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/tpu/kernels/tpu_embedding_ops.cc b/tensorflow/core/tpu/kernels/tpu_embedding_ops.cc
index 7eddabd4ccb529..fc88d3b5698d04 100644
--- a/tensorflow/core/tpu/kernels/tpu_embedding_ops.cc
+++ b/tensorflow/core/tpu/kernels/tpu_embedding_ops.cc
@@ -130,7 +130,8 @@ class RecvTPUEmbeddingActivationsOp : public XlaOpKernel {
   tensorflow::tpu::TPUEmbeddingConfiguration tpu_embedding_config_;
   std::string config_string_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(RecvTPUEmbeddingActivationsOp);
+  RecvTPUEmbeddingActivationsOp(const RecvTPUEmbeddingActivationsOp&) = delete;
+  void operator=(const RecvTPUEmbeddingActivationsOp&) = delete;
 };
 
 REGISTER_XLA_OP(Name("XlaRecvTPUEmbeddingActivations").AllowVariantTypes(),
@@ -210,7 +211,9 @@ class RecvTPUEmbeddingDeduplicationDataOp : public XlaOpKernel {
   // TPU Embedding config string.
   std::string config_string_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(RecvTPUEmbeddingDeduplicationDataOp);
+  RecvTPUEmbeddingDeduplicationDataOp(
+      const RecvTPUEmbeddingDeduplicationDataOp&) = delete;
+  void operator=(const RecvTPUEmbeddingDeduplicationDataOp&) = delete;
 };
 
 REGISTER_XLA_OP(
@@ -326,7 +329,8 @@ class SendTPUEmbeddingGradientsOp : public XlaOpKernel {
   // TPU Embedding config string.
   std::string config_string_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(SendTPUEmbeddingGradientsOp);
+  SendTPUEmbeddingGradientsOp(const SendTPUEmbeddingGradientsOp&) = delete;
+  void operator=(const SendTPUEmbeddingGradientsOp&) = delete;
 };
 
 REGISTER_XLA_OP(Name("XlaSendTPUEmbeddingGradients").AllowVariantTypes(),
@@ -491,7 +495,8 @@ class SplitDedupDataOp : public XlaOpKernel {
   std::string tuple_mask_string_;
   tensorflow::TensorProto tuple_mask_tensor_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(SplitDedupDataOp);
+  SplitDedupDataOp(const SplitDedupDataOp&) = delete;
+  void operator=(const SplitDedupDataOp&) = delete;
 };
 
 REGISTER_XLA_OP(Name("SplitDedupData").AllowVariantTypes(), SplitDedupDataOp);
@@ -681,7 +686,8 @@ class MergeDedupDataOp : public XlaOpKernel {
   std::string tuple_mask_string_;
   tensorflow::TensorProto tuple_mask_tensor_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(MergeDedupDataOp);
+  MergeDedupDataOp(const MergeDedupDataOp&) = delete;
+  void operator=(const MergeDedupDataOp&) = delete;
 };
 
 REGISTER_XLA_OP(Name("MergeDedupData").AllowVariantTypes(), MergeDedupDataOp);
@@ -733,7 +739,8 @@ class ComputeDedupDataTupleMaskOp : public XlaOpKernel {
   // TPU Embedding config string.
   std::string config_string_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(ComputeDedupDataTupleMaskOp);
+  ComputeDedupDataTupleMaskOp(const ComputeDedupDataTupleMaskOp&) = delete;
+  void operator=(const ComputeDedupDataTupleMaskOp&) = delete;
 };
 
 REGISTER_XLA_OP(Name("ComputeDedupDataTupleMask").AllowVariantTypes(),
diff --git a/tensorflow/core/tpu/kernels/tpu_execute_op.h b/tensorflow/core/tpu/kernels/tpu_execute_op.h
index c66118ad45ea0b..87fbef1ecda0af 100644
--- a/tensorflow/core/tpu/kernels/tpu_execute_op.h
+++ b/tensorflow/core/tpu/kernels/tpu_execute_op.h
@@ -48,7 +48,8 @@ class TPUExecuteOp : public AsyncOpKernel {
  private:
   Status DoWork(OpKernelContext* context);
 
-  TF_DISALLOW_COPY_AND_ASSIGN(TPUExecuteOp);
+  TPUExecuteOp(const TPUExecuteOp&) = delete;
+  void operator=(const TPUExecuteOp&) = delete;
 };
 
 // A variant of TPUExecuteOp that contains fused device variable reads and
@@ -59,7 +60,9 @@ class TPUExecuteAndUpdateVariablesOp : public TPUExecuteOp {
   ~TPUExecuteAndUpdateVariablesOp() override = default;
 
  private:
-  TF_DISALLOW_COPY_AND_ASSIGN(TPUExecuteAndUpdateVariablesOp);
+  TPUExecuteAndUpdateVariablesOp(const TPUExecuteAndUpdateVariablesOp&) =
+      delete;
+  void operator=(const TPUExecuteAndUpdateVariablesOp&) = delete;
 };
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/tpu/kernels/tpu_program_group.h b/tensorflow/core/tpu/kernels/tpu_program_group.h
index 717e3875a0b28e..3d164c09725666 100644
--- a/tensorflow/core/tpu/kernels/tpu_program_group.h
+++ b/tensorflow/core/tpu/kernels/tpu_program_group.h
@@ -185,7 +185,8 @@ class TpuProgramGroup : public TpuProgramGroupInterface {
   std::vector<xla::HloProto> hlo_metadatas_;  // Owned.
   std::vector<const xla::HloProto*> hlo_metadatas_ptrs_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(TpuProgramGroup);
+  TpuProgramGroup(const TpuProgramGroup&) = delete;
+  void operator=(const TpuProgramGroup&) = delete;
 };
 
 }  // namespace tpu
diff --git a/tensorflow/core/tpu/kernels/xla/host_compute_ops.cc b/tensorflow/core/tpu/kernels/xla/host_compute_ops.cc
index 3efbeb009ed66d..cf76fa88f7ee3f 100644
--- a/tensorflow/core/tpu/kernels/xla/host_compute_ops.cc
+++ b/tensorflow/core/tpu/kernels/xla/host_compute_ops.cc
@@ -419,7 +419,8 @@ class HostComputeOp : public XlaOpKernel {
   int64_t tpu_core_;
   std::vector<string> token_input_nodes_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(HostComputeOp);
+  HostComputeOp(const HostComputeOp&) = delete;
+  void operator=(const HostComputeOp&) = delete;
 };
 
 class SendToHostOp : public XlaOpKernel {
@@ -472,7 +473,8 @@ class SendToHostOp : public XlaOpKernel {
   string key_;
   std::vector<string> token_input_nodes_;
   string original_node_name_;
-  TF_DISALLOW_COPY_AND_ASSIGN(SendToHostOp);
+  SendToHostOp(const SendToHostOp&) = delete;
+  void operator=(const SendToHostOp&) = delete;
 };
 
 class RecvFromHostOp : public XlaOpKernel {
@@ -529,7 +531,8 @@ class RecvFromHostOp : public XlaOpKernel {
   string key_;
   std::vector<string> token_input_nodes_;
   string original_node_name_;
-  TF_DISALLOW_COPY_AND_ASSIGN(RecvFromHostOp);
+  RecvFromHostOp(const RecvFromHostOp&) = delete;
+  void operator=(const RecvFromHostOp&) = delete;
 };
 
 REGISTER_XLA_OP(Name("XlaHostCompute"), HostComputeOp);
diff --git a/tensorflow/core/tpu/kernels/xla/infeed_op.cc b/tensorflow/core/tpu/kernels/xla/infeed_op.cc
index 7ce01c530a8db5..050829f1ecfd5c 100644
--- a/tensorflow/core/tpu/kernels/xla/infeed_op.cc
+++ b/tensorflow/core/tpu/kernels/xla/infeed_op.cc
@@ -107,7 +107,8 @@ class InfeedDequeueOp : public XlaOpKernel {
   DataType dtype_;
   xla::Shape xla_shape_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(InfeedDequeueOp);
+  InfeedDequeueOp(const InfeedDequeueOp&) = delete;
+  void operator=(const InfeedDequeueOp&) = delete;
 };
 
 REGISTER_XLA_OP(Name("InfeedDequeue"), InfeedDequeueOp);
@@ -156,7 +157,8 @@ class InfeedDequeueTupleOp : public XlaOpKernel {
   std::vector<xla::Shape> xla_shapes_;
   xla::Shape tuple_shape_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(InfeedDequeueTupleOp);
+  InfeedDequeueTupleOp(const InfeedDequeueTupleOp&) = delete;
+  void operator=(const InfeedDequeueTupleOp&) = delete;
 };
 
 REGISTER_XLA_OP(Name("InfeedDequeueTuple"), InfeedDequeueTupleOp);
diff --git a/tensorflow/core/tpu/kernels/xla/outfeed_ops.cc b/tensorflow/core/tpu/kernels/xla/outfeed_ops.cc
index c259dad1c443c2..aa2dacfb319b35 100644
--- a/tensorflow/core/tpu/kernels/xla/outfeed_ops.cc
+++ b/tensorflow/core/tpu/kernels/xla/outfeed_ops.cc
@@ -53,7 +53,8 @@ class OutfeedEnqueueOp : public XlaOpKernel {
  private:
   DataType dtype_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(OutfeedEnqueueOp);
+  OutfeedEnqueueOp(const OutfeedEnqueueOp&) = delete;
+  void operator=(const OutfeedEnqueueOp&) = delete;
 };
 
 REGISTER_XLA_OP(Name("OutfeedEnqueue"), OutfeedEnqueueOp);
@@ -90,7 +91,8 @@ class OutfeedEnqueueTupleOp : public XlaOpKernel {
  private:
   DataTypeVector dtypes_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(OutfeedEnqueueTupleOp);
+  OutfeedEnqueueTupleOp(const OutfeedEnqueueTupleOp&) = delete;
+  void operator=(const OutfeedEnqueueTupleOp&) = delete;
 };
 
 REGISTER_XLA_OP(Name("OutfeedEnqueueTuple"), OutfeedEnqueueTupleOp);
diff --git a/tensorflow/core/util/bcast.h b/tensorflow/core/util/bcast.h
index a9d8d19e6ebb83..e5fa5c1311482a 100644
--- a/tensorflow/core/util/bcast.h
+++ b/tensorflow/core/util/bcast.h
@@ -124,7 +124,8 @@ class BCastList {
     std::reverse(shape->begin(), shape->end());
   }
 
-  TF_DISALLOW_COPY_AND_ASSIGN(BCastList);
+  BCastList(const BCastList&) = delete;
+  void operator=(const BCastList&) = delete;
 };
 
 template <int N>
@@ -417,7 +418,8 @@ class BCast : public BCastList<2> {
   static TensorShape ToShape(const Vec& vec);
 
  private:
-  TF_DISALLOW_COPY_AND_ASSIGN(BCast);
+  BCast(const BCast&) = delete;
+  void operator=(const BCast&) = delete;
 };
 
 }  // end namespace tensorflow
diff --git a/tensorflow/core/util/ctc/ctc_beam_entry.h b/tensorflow/core/util/ctc/ctc_beam_entry.h
index 5ccf81fbd79e2d..c8a230369b7be2 100644
--- a/tensorflow/core/util/ctc/ctc_beam_entry.h
+++ b/tensorflow/core/util/ctc/ctc_beam_entry.h
@@ -107,7 +107,8 @@ struct BeamEntry {
   BeamEntry(BeamEntry* p, int l, BeamRoot<T, CTCBeamState>* beam_root)
       : parent(p), label(l), beam_root(beam_root) {}
   BeamRoot<T, CTCBeamState>* beam_root;
-  TF_DISALLOW_COPY_AND_ASSIGN(BeamEntry);
+  BeamEntry(const BeamEntry&) = delete;
+  void operator=(const BeamEntry&) = delete;
 };
 
 // This class owns all instances of BeamEntry.  This is used to avoid recursive
diff --git a/tensorflow/core/util/ctc/ctc_beam_search.h b/tensorflow/core/util/ctc/ctc_beam_search.h
index e60f6091847514..74ab5370d8e346 100644
--- a/tensorflow/core/util/ctc/ctc_beam_search.h
+++ b/tensorflow/core/util/ctc/ctc_beam_search.h
@@ -152,7 +152,8 @@ class CTCBeamSearchDecoder : public CTCDecoder<T> {
   std::unique_ptr<BeamRoot> beam_root_;
   BaseBeamScorer<T, CTCBeamState>* beam_scorer_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(CTCBeamSearchDecoder);
+  CTCBeamSearchDecoder(const CTCBeamSearchDecoder&) = delete;
+  void operator=(const CTCBeamSearchDecoder&) = delete;
 };
 
 template <typename T, typename CTCBeamState, typename CTCBeamComparer>
diff --git a/tensorflow/core/util/cuda_sparse.cc b/tensorflow/core/util/cuda_sparse.cc
index 3dac1e1ab59e1c..2aae0110c44487 100644
--- a/tensorflow/core/util/cuda_sparse.cc
+++ b/tensorflow/core/util/cuda_sparse.cc
@@ -127,7 +127,8 @@ class CudaSparseHandles {
   cudaStream_t stream_;
   cusparseHandle_t cusparse_handle_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(CudaSparseHandles);
+  CudaSparseHandles(const CudaSparseHandles&) = delete;
+  void operator=(const CudaSparseHandles&) = delete;
 };
 
 // TODO(ebrevdo): Replace global mutex guarding CudaSparseHandles
diff --git a/tensorflow/core/util/cuda_sparse.h b/tensorflow/core/util/cuda_sparse.h
index 896fb795216dc4..ca3ac8ffa76961 100644
--- a/tensorflow/core/util/cuda_sparse.h
+++ b/tensorflow/core/util/cuda_sparse.h
@@ -222,7 +222,8 @@ class GpuSparseSpGEMMDescr {
   bool initialized_;
   cusparseSpGEMMDescr_t descr_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(GpuSparseSpGEMMDescr);
+  GpuSparseSpGEMMDescr(const GpuSparseSpGEMMDescr&) = delete;
+  void operator=(const GpuSparseSpGEMMDescr&) = delete;
 };
 
 class GpuSparseSpMatDescr {
@@ -255,7 +256,8 @@ class GpuSparseSpMatDescr {
  private:
   bool initialized_;
   cusparseSpMatDescr_t descr_;
-  TF_DISALLOW_COPY_AND_ASSIGN(GpuSparseSpMatDescr);
+  GpuSparseSpMatDescr(const GpuSparseSpMatDescr&) = delete;
+  void operator=(const GpuSparseSpMatDescr&) = delete;
 };
 
 class GpuSparseConstSpMatDescr {
@@ -288,7 +290,8 @@ class GpuSparseConstSpMatDescr {
  private:
   bool initialized_;
   cusparseConstSpMatDescr_t descr_;
-  TF_DISALLOW_COPY_AND_ASSIGN(GpuSparseConstSpMatDescr);
+  GpuSparseConstSpMatDescr(const GpuSparseConstSpMatDescr&) = delete;
+  void operator=(const GpuSparseConstSpMatDescr&) = delete;
 };
 
 #endif
@@ -580,7 +583,8 @@ class GpuSparse {
   gpuStream_t gpu_stream_;
   gpusparseHandle_t* gpusparse_handle_;  // not owned.
 
-  TF_DISALLOW_COPY_AND_ASSIGN(GpuSparse);
+  GpuSparse(const GpuSparse&) = delete;
+  void operator=(const GpuSparse&) = delete;
 };
 
 // A wrapper class to ensure that a CUDA sparse matrix descriptor is initialized
@@ -644,7 +648,8 @@ class GpuSparseMatrixDescriptor {
   bool initialized_;
   gpusparseMatDescr_t descr_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(GpuSparseMatrixDescriptor);
+  GpuSparseMatrixDescriptor(const GpuSparseMatrixDescriptor&) = delete;
+  void operator=(const GpuSparseMatrixDescriptor&) = delete;
 };
 
 #if GOOGLE_CUDA
@@ -703,7 +708,9 @@ class GpuSparseCsrSortingConversionInfo {
   bool initialized_;
   csru2csrInfo_t info_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(GpuSparseCsrSortingConversionInfo);
+  GpuSparseCsrSortingConversionInfo(const GpuSparseCsrSortingConversionInfo&) =
+      delete;
+  void operator=(const GpuSparseCsrSortingConversionInfo&) = delete;
 };
 
 #endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/util/debug_events_writer.h b/tensorflow/core/util/debug_events_writer.h
index 570a40fc9c9665..79aad9488c9438 100644
--- a/tensorflow/core/util/debug_events_writer.h
+++ b/tensorflow/core/util/debug_events_writer.h
@@ -263,7 +263,8 @@ class DebugEventsWriter {
   std::unique_ptr<SingleDebugEventFileWriter> execution_writer_;
   std::unique_ptr<SingleDebugEventFileWriter> graph_execution_traces_writer_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(DebugEventsWriter);
+  DebugEventsWriter(const DebugEventsWriter&) = delete;
+  void operator=(const DebugEventsWriter&) = delete;
 
   friend class DebugEventsWriterTest;
 };
diff --git a/tensorflow/core/util/events_writer.h b/tensorflow/core/util/events_writer.h
index 05e62f04ce1cf2..c2fd0d9af3817c 100644
--- a/tensorflow/core/util/events_writer.h
+++ b/tensorflow/core/util/events_writer.h
@@ -92,7 +92,8 @@ class EventsWriter {
   std::unique_ptr<WritableFile> recordio_file_;
   std::unique_ptr<io::RecordWriter> recordio_writer_;
   int num_outstanding_events_;
-  TF_DISALLOW_COPY_AND_ASSIGN(EventsWriter);
+  EventsWriter(const EventsWriter&) = delete;
+  void operator=(const EventsWriter&) = delete;
 };
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/util/fake_clock_env.h b/tensorflow/core/util/fake_clock_env.h
index 0282e9655b367c..2ded1708aed79c 100644
--- a/tensorflow/core/util/fake_clock_env.h
+++ b/tensorflow/core/util/fake_clock_env.h
@@ -49,7 +49,8 @@ class FakeClockEnv : public EnvWrapper {
   mutable mutex mu_;
   uint64 current_time_ TF_GUARDED_BY(mu_) = 0;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(FakeClockEnv);
+  FakeClockEnv(const FakeClockEnv&) = delete;
+  void operator=(const FakeClockEnv&) = delete;
 };
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/util/gpu_solvers.h b/tensorflow/core/util/gpu_solvers.h
index 8f539e183b858a..32c2dba0d27baa 100644
--- a/tensorflow/core/util/gpu_solvers.h
+++ b/tensorflow/core/util/gpu_solvers.h
@@ -589,7 +589,8 @@ class GpuSolver {
 
   std::vector<TensorReference> scratch_tensor_refs_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(GpuSolver);
+  GpuSolver(const GpuSolver&) = delete;
+  void operator=(const GpuSolver&) = delete;
 };
 
 // Helper class to allocate scratch memory and keep track of debug info.
diff --git a/tensorflow/core/util/guarded_philox_random.h b/tensorflow/core/util/guarded_philox_random.h
index e599be1ea55cbd..b2f4fa86d35d49 100644
--- a/tensorflow/core/util/guarded_philox_random.h
+++ b/tensorflow/core/util/guarded_philox_random.h
@@ -74,7 +74,8 @@ class GuardedPhiloxRandom {
   random::PhiloxRandom generator_ TF_GUARDED_BY(mu_);
   bool initialized_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(GuardedPhiloxRandom);
+  GuardedPhiloxRandom(const GuardedPhiloxRandom&) = delete;
+  void operator=(const GuardedPhiloxRandom&) = delete;
 };
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/util/memmapped_file_system.h b/tensorflow/core/util/memmapped_file_system.h
index 80dc5869255691..6b5e098a2d7217 100644
--- a/tensorflow/core/util/memmapped_file_system.h
+++ b/tensorflow/core/util/memmapped_file_system.h
@@ -118,7 +118,8 @@ class MemmappedFileSystem : public FileSystem {
   std::unique_ptr<ReadOnlyMemoryRegion> mapped_memory_;
   DirectoryType directory_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(MemmappedFileSystem);
+  MemmappedFileSystem(const MemmappedFileSystem&) = delete;
+  void operator=(const MemmappedFileSystem&) = delete;
 };
 
 class MemmappedEnv : public EnvWrapper {
diff --git a/tensorflow/core/util/memmapped_file_system_writer.h b/tensorflow/core/util/memmapped_file_system_writer.h
index 884b4b8bc63418..2b15a7e476ff9f 100644
--- a/tensorflow/core/util/memmapped_file_system_writer.h
+++ b/tensorflow/core/util/memmapped_file_system_writer.h
@@ -45,7 +45,8 @@ class MemmappedFileSystemWriter {
   // The current offset in the file, to support alignment.
   uint64 output_file_offset_ = 0;
   std::unique_ptr<WritableFile> output_file_;
-  TF_DISALLOW_COPY_AND_ASSIGN(MemmappedFileSystemWriter);
+  MemmappedFileSystemWriter(const MemmappedFileSystemWriter&) = delete;
+  void operator=(const MemmappedFileSystemWriter&) = delete;
 };
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/util/mkl_util.h b/tensorflow/core/util/mkl_util.h
index fc0bf9ef54bd40..fc2ba56664da8d 100644
--- a/tensorflow/core/util/mkl_util.h
+++ b/tensorflow/core/util/mkl_util.h
@@ -332,7 +332,8 @@ class MklDnnShape {
   }
 
   ~MklDnnShape() {}
-  TF_DISALLOW_COPY_AND_ASSIGN(MklDnnShape);  // Cannot copy
+  MklDnnShape(const MklDnnShape&) = delete;
+  void operator=(const MklDnnShape&) = delete;  // Cannot copy
 
   /// Equality function for MklDnnShape objects
   /// @return true if both are equal; false otherwise.
diff --git a/tensorflow/core/util/presized_cuckoo_map.h b/tensorflow/core/util/presized_cuckoo_map.h
index acefad521470cf..bab9c951d6cd62 100644
--- a/tensorflow/core/util/presized_cuckoo_map.h
+++ b/tensorflow/core/util/presized_cuckoo_map.h
@@ -356,7 +356,8 @@ class PresizedCuckooMap {
   std::unique_ptr<CuckooPathQueue> cpq_;
   CuckooPathEntry visited_[kVisitedListSize];
 
-  TF_DISALLOW_COPY_AND_ASSIGN(PresizedCuckooMap);
+  PresizedCuckooMap(const PresizedCuckooMap&) = delete;
+  void operator=(const PresizedCuckooMap&) = delete;
 };
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/util/rocm_sparse.cc b/tensorflow/core/util/rocm_sparse.cc
index f2c1247cb751af..11e5f946bfdc03 100644
--- a/tensorflow/core/util/rocm_sparse.cc
+++ b/tensorflow/core/util/rocm_sparse.cc
@@ -99,7 +99,8 @@ class HipSparseHandles {
   hipStream_t stream_;
   hipsparseHandle_t hipsparse_handle_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(HipSparseHandles);
+  HipSparseHandles(const HipSparseHandles&) = delete;
+  void operator=(const HipSparseHandles&) = delete;
 };
 
 // TODO(ebrevdo): Replace global mutex guarding CudaSparseHandles
diff --git a/tensorflow/core/util/tensor_bundle/tensor_bundle.h b/tensorflow/core/util/tensor_bundle/tensor_bundle.h
index e4f4db48df579e..1c9a8a3e6b1c00 100644
--- a/tensorflow/core/util/tensor_bundle/tensor_bundle.h
+++ b/tensorflow/core/util/tensor_bundle/tensor_bundle.h
@@ -164,7 +164,8 @@ class BundleWriter {
   std::map<std::string, BundleEntryProto> entries_;
   Status status_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(BundleWriter);
+  BundleWriter(const BundleWriter&) = delete;
+  void operator=(const BundleWriter&) = delete;
 };
 
 // Merges a set of bundles (given their prefixes) into a single bundle with the
@@ -343,7 +344,8 @@ class BundleReader {
 
   bool enable_multi_threading_for_testing_ = false;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(BundleReader);
+  BundleReader(const BundleReader&) = delete;
+  void operator=(const BundleReader&) = delete;
 };
 
 template <class T>
diff --git a/tensorflow/core/util/tensor_slice_reader.h b/tensorflow/core/util/tensor_slice_reader.h
index aaf3ffab9dc9d4..91ee7cd7d76ab9 100644
--- a/tensorflow/core/util/tensor_slice_reader.h
+++ b/tensorflow/core/util/tensor_slice_reader.h
@@ -138,7 +138,8 @@ class TensorSliceReader {
   mutable std::unordered_map<string, TensorSliceSet*> tensors_;
   mutable Status status_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(TensorSliceReader);
+  TensorSliceReader(const TensorSliceReader&) = delete;
+  void operator=(const TensorSliceReader&) = delete;
 };
 
 Status OpenTableTensorSliceReader(const string& fname,
diff --git a/tensorflow/core/util/tensor_slice_writer.h b/tensorflow/core/util/tensor_slice_writer.h
index 0f69d5eb4dd398..1bd7b1c91202dc 100644
--- a/tensorflow/core/util/tensor_slice_writer.h
+++ b/tensorflow/core/util/tensor_slice_writer.h
@@ -96,7 +96,8 @@ class TensorSliceWriter {
   std::map<string, string> data_;
   // Total number of slices written
   int slices_;
-  TF_DISALLOW_COPY_AND_ASSIGN(TensorSliceWriter);
+  TensorSliceWriter(const TensorSliceWriter&) = delete;
+  void operator=(const TensorSliceWriter&) = delete;
 };
 
 template <typename T>

From a97e766ab301ea4c61641a30c3745ff6edaf357f Mon Sep 17 00:00:00 2001
From: Christian Sigg <csigg@google.com>
Date: Tue, 3 Oct 2023 13:27:00 -0700
Subject: [PATCH 538/567] [XLA:GPU] NFC: apply clang-tidy fixes.

PiperOrigin-RevId: 570479756
---
 .../gpu/auto_sharding_gpu_compiler_test.cc    |  1 -
 .../xla/service/gpu/autotuner_util_test.cc    |  4 ---
 .../xla/xla/service/gpu/buffer_comparator.cc  |  2 +-
 .../xla/service/gpu/elemental_ir_emitter.cc   |  3 --
 third_party/xla/xla/service/gpu/for_thunk.h   |  2 +-
 .../xla/xla/service/gpu/fusions/loop.h        |  2 +-
 .../gpu/gpu_conv_padding_legalization_test.cc |  1 -
 .../xla/xla/service/gpu/gpu_conv_runner.cc    |  3 --
 .../xla/xla/service/gpu/gpu_executable.h      |  2 +-
 .../xla/service/gpu/gpu_hlo_schedule_test.cc  |  2 +-
 .../xla/service/gpu/gpu_layout_assignment.h   |  2 +-
 .../xla/service/gpu/gpu_performance_model.h   |  3 +-
 .../service/gpu/horizontal_input_fusion.cc    |  2 +-
 .../xla/service/gpu/horizontal_loop_fusion.h  |  5 +--
 third_party/xla/xla/service/gpu/ir_emitter.cc |  2 --
 .../xla/xla/service/gpu/ir_emitter_nested.h   |  2 +-
 .../xla/xla/service/gpu/kernel_thunk.cc       |  2 +-
 .../service/gpu/runtime/topk_kernel_test.cc   |  5 ++-
 .../service/gpu/tests/gpu_fused_mha_test.cc   | 34 +++++++++----------
 third_party/xla/xla/service/gpu/xfeed_queue.h |  2 +-
 20 files changed, 33 insertions(+), 48 deletions(-)

diff --git a/third_party/xla/xla/service/gpu/auto_sharding_gpu_compiler_test.cc b/third_party/xla/xla/service/gpu/auto_sharding_gpu_compiler_test.cc
index c410370b9700cb..d125ea202631c8 100644
--- a/third_party/xla/xla/service/gpu/auto_sharding_gpu_compiler_test.cc
+++ b/third_party/xla/xla/service/gpu/auto_sharding_gpu_compiler_test.cc
@@ -24,7 +24,6 @@ namespace xla {
 namespace gpu {
 namespace {
 
-namespace op = xla::testing::opcode_matchers;
 namespace m = ::xla::match;
 
 class AutoShardingTest : public HloTestBase {
diff --git a/third_party/xla/xla/service/gpu/autotuner_util_test.cc b/third_party/xla/xla/service/gpu/autotuner_util_test.cc
index 1e287e5cc9f2d1..09887c01be7bea 100644
--- a/third_party/xla/xla/service/gpu/autotuner_util_test.cc
+++ b/third_party/xla/xla/service/gpu/autotuner_util_test.cc
@@ -31,12 +31,8 @@ namespace xla {
 namespace gpu {
 namespace {
 
-using ::absl::LogSeverity;
-using ::absl::ScopedMockLog;
-using ::testing::EndsWith;
 using ::testing::IsEmpty;
 using ::testing::Not;
-using ::testing::StartsWith;
 using ::testing::TempDir;
 
 class AutotunerUtilTest : public HloTestBase {
diff --git a/third_party/xla/xla/service/gpu/buffer_comparator.cc b/third_party/xla/xla/service/gpu/buffer_comparator.cc
index 40180e249b2ea9..b150f858cc136b 100644
--- a/third_party/xla/xla/service/gpu/buffer_comparator.cc
+++ b/third_party/xla/xla/service/gpu/buffer_comparator.cc
@@ -1052,7 +1052,7 @@ static StatusOr<bool> DeviceCompare(se::Stream* stream,
     static absl::once_flag ptxas_not_found_logged;
     absl::call_once(ptxas_not_found_logged, [&]() {
       LOG(WARNING)
-          << compiled_ptx_or.status().ToString()
+          << compiled_ptx_or.status()
           << "\nRelying on driver to perform ptx compilation. "
           << "\nSetting XLA_FLAGS=--xla_gpu_cuda_data_dir=/path/to/cuda "
           << " or modifying $PATH can be used to set the location of ptxas"
diff --git a/third_party/xla/xla/service/gpu/elemental_ir_emitter.cc b/third_party/xla/xla/service/gpu/elemental_ir_emitter.cc
index 6955d3050e869a..260287cc259521 100644
--- a/third_party/xla/xla/service/gpu/elemental_ir_emitter.cc
+++ b/third_party/xla/xla/service/gpu/elemental_ir_emitter.cc
@@ -47,9 +47,6 @@ namespace xla {
 namespace gpu {
 
 using absl::StrAppend;
-using llvm_ir::IrArray;
-using llvm_ir::IrName;
-using llvm_ir::SetToFirstInsertPoint;
 
 namespace {
 // Returns whether operand is a floating-point literal with the given value.
diff --git a/third_party/xla/xla/service/gpu/for_thunk.h b/third_party/xla/xla/service/gpu/for_thunk.h
index 7a82e7c9e455ac..13e44ede16fa12 100644
--- a/third_party/xla/xla/service/gpu/for_thunk.h
+++ b/third_party/xla/xla/service/gpu/for_thunk.h
@@ -30,7 +30,7 @@ namespace gpu {
 // ForThunk executes 'loop_limit' invocations of 'body_thunk_sequence'.
 class ForThunk : public Thunk {
  public:
-  ForThunk(ThunkInfo thunk_info, const int64_t loop_limit,
+  ForThunk(ThunkInfo thunk_info, int64_t loop_limit,
            std::unique_ptr<ThunkSequence> body_thunk_sequence);
   ForThunk(const ForThunk&) = delete;
   ForThunk& operator=(const ForThunk&) = delete;
diff --git a/third_party/xla/xla/service/gpu/fusions/loop.h b/third_party/xla/xla/service/gpu/fusions/loop.h
index 587cc3c9dcbdb9..6d539355c6a5d3 100644
--- a/third_party/xla/xla/service/gpu/fusions/loop.h
+++ b/third_party/xla/xla/service/gpu/fusions/loop.h
@@ -30,7 +30,7 @@ namespace gpu {
 // Generic loop fusion.
 class LoopFusion : public KernelFusionEmitterBase {
  public:
-  LoopFusion(HloFusionAnalysis& analysis) : analysis_(analysis) {}
+  explicit LoopFusion(HloFusionAnalysis& analysis) : analysis_(analysis) {}
   StatusOr<LaunchDimensions> launch_dimensions(
       IrEmitterContext& ir_emitter_context, int kernel_index) const override;
 
diff --git a/third_party/xla/xla/service/gpu/gpu_conv_padding_legalization_test.cc b/third_party/xla/xla/service/gpu/gpu_conv_padding_legalization_test.cc
index 472a476c326c52..147b3187c25e01 100644
--- a/third_party/xla/xla/service/gpu/gpu_conv_padding_legalization_test.cc
+++ b/third_party/xla/xla/service/gpu/gpu_conv_padding_legalization_test.cc
@@ -32,7 +32,6 @@ namespace gpu {
 namespace {
 
 namespace m = ::xla::match;
-using ::testing::_;
 
 using GpuConvPaddingLegalizationTest = HloTestBase;
 
diff --git a/third_party/xla/xla/service/gpu/gpu_conv_runner.cc b/third_party/xla/xla/service/gpu/gpu_conv_runner.cc
index aae548b5189d7a..5ca19f64c87133 100644
--- a/third_party/xla/xla/service/gpu/gpu_conv_runner.cc
+++ b/third_party/xla/xla/service/gpu/gpu_conv_runner.cc
@@ -33,15 +33,12 @@ namespace {
 
 using se::DeviceMemory;
 using se::DeviceMemoryBase;
-using se::Stream;
-using se::dnn::AlgorithmConfig;
 using se::dnn::BatchDescriptor;
 using se::dnn::ConvolutionDescriptor;
 using se::dnn::DataLayout;
 using se::dnn::DimIndex;
 using se::dnn::FilterDescriptor;
 using se::dnn::FilterLayout;
-using se::dnn::ProfileResult;
 
 template <typename ElementType, typename OutputType>
 Status RunGpuConvUnfused(const GpuConvParams& params, se::Stream* stream,
diff --git a/third_party/xla/xla/service/gpu/gpu_executable.h b/third_party/xla/xla/service/gpu/gpu_executable.h
index a8af42ccbc26b6..1d459baefcef68 100644
--- a/third_party/xla/xla/service/gpu/gpu_executable.h
+++ b/third_party/xla/xla/service/gpu/gpu_executable.h
@@ -260,7 +260,7 @@ class GpuExecutable : public Executable {
       VariantArguments arguments,
       const GpuExecutable::BufferAllocToDeviceMemoryMap* globals,
       const BufferAllocation& allocation,
-      se::DeviceMemoryAllocator* const memory_allocator, int device_ordinal,
+      se::DeviceMemoryAllocator* memory_allocator, int device_ordinal,
       int64_t arg_idx);
 
   // The LLVM IR, in string format, of the unoptimized module generated for
diff --git a/third_party/xla/xla/service/gpu/gpu_hlo_schedule_test.cc b/third_party/xla/xla/service/gpu/gpu_hlo_schedule_test.cc
index 985095286df5d2..7bac36b36f2a62 100644
--- a/third_party/xla/xla/service/gpu/gpu_hlo_schedule_test.cc
+++ b/third_party/xla/xla/service/gpu/gpu_hlo_schedule_test.cc
@@ -48,7 +48,7 @@ class GpuHloScheduleTest : public HloTestBase {
         test_backend.default_stream_executor()->GetDeviceDescription();
     TF_CHECK_OK(ScheduleGpuModule(
         module, /*pointer_size=*/8,
-        /*memory_size=*/gpu_device_info.device_memory_size() * 8 / 10));
+        /*memory_limit=*/gpu_device_info.device_memory_size() * 8 / 10));
     return SequentialHloOrdering{module->schedule()};
   }
 
diff --git a/third_party/xla/xla/service/gpu/gpu_layout_assignment.h b/third_party/xla/xla/service/gpu/gpu_layout_assignment.h
index 54893a982c2a7b..27ca9bcfb72be8 100644
--- a/third_party/xla/xla/service/gpu/gpu_layout_assignment.h
+++ b/third_party/xla/xla/service/gpu/gpu_layout_assignment.h
@@ -35,7 +35,7 @@ class GpuLayoutAssignment : public LayoutAssignment {
       ChannelLayoutConstraints* channel_constraints = nullptr)
       : LayoutAssignment(entry_computation_layout, channel_constraints),
         stream_executor_(stream_executor) {}
-  ~GpuLayoutAssignment() override {}
+  ~GpuLayoutAssignment() override = default;
 
  protected:
   Status AddBackendConstraints(LayoutConstraints* constraints) override;
diff --git a/third_party/xla/xla/service/gpu/gpu_performance_model.h b/third_party/xla/xla/service/gpu/gpu_performance_model.h
index 2e28087a7ba745..f35d0587caccdd 100644
--- a/third_party/xla/xla/service/gpu/gpu_performance_model.h
+++ b/third_party/xla/xla/service/gpu/gpu_performance_model.h
@@ -33,7 +33,8 @@ class GpuPerformanceModel {
   };
   static RunTimes EstimateRunTimes(
       const HloInstruction* producer, const GpuHloCostAnalysis* cost_analysis,
-      std::vector<HloInstruction*> fused_users = {}, bool multi_output = false);
+      std::vector<HloInstruction*> fused_consumers = {},
+      bool multi_output = false);
 
   // Writes estimated execution time to FusionBackendConfig.reification_cost.
   static void RecordEstimatedRunTime(HloInstruction* instruction,
diff --git a/third_party/xla/xla/service/gpu/horizontal_input_fusion.cc b/third_party/xla/xla/service/gpu/horizontal_input_fusion.cc
index 58170caf37e9a7..0e01d4964149d3 100644
--- a/third_party/xla/xla/service/gpu/horizontal_input_fusion.cc
+++ b/third_party/xla/xla/service/gpu/horizontal_input_fusion.cc
@@ -46,7 +46,7 @@ class HorizontalInputFusionImpl {
                                      const se::DeviceDescription& d)
       : computation_(computation), device_info_(d) {}
 
-  ~HorizontalInputFusionImpl() {}
+  ~HorizontalInputFusionImpl() = default;
 
   StatusOr<bool> Run();
 
diff --git a/third_party/xla/xla/service/gpu/horizontal_loop_fusion.h b/third_party/xla/xla/service/gpu/horizontal_loop_fusion.h
index 71814156a06ade..4d81ba65412f4d 100644
--- a/third_party/xla/xla/service/gpu/horizontal_loop_fusion.h
+++ b/third_party/xla/xla/service/gpu/horizontal_loop_fusion.h
@@ -118,8 +118,9 @@ namespace gpu {
 // Note, reshapes are added only if the tensors isn't already a vector.
 class GpuHorizontalLoopFusion : public HloModulePass {
  public:
-  GpuHorizontalLoopFusion() {}
-  GpuHorizontalLoopFusion(absl::string_view prefix) : prefix_(prefix) {}
+  GpuHorizontalLoopFusion() = default;
+  explicit GpuHorizontalLoopFusion(absl::string_view prefix)
+      : prefix_(prefix) {}
 
   absl::string_view name() const override {
     return "gpu_horizontal_loop_fusion";
diff --git a/third_party/xla/xla/service/gpu/ir_emitter.cc b/third_party/xla/xla/service/gpu/ir_emitter.cc
index 4033ac7a79afb5..8290787c69db0d 100644
--- a/third_party/xla/xla/service/gpu/ir_emitter.cc
+++ b/third_party/xla/xla/service/gpu/ir_emitter.cc
@@ -39,8 +39,6 @@ limitations under the License.
 
 namespace xla {
 
-using llvm_ir::SetToFirstInsertPoint;
-
 namespace gpu {
 
 IrEmitter::IrEmitter(IrEmitterContext* ir_emitter_context, bool is_nested)
diff --git a/third_party/xla/xla/service/gpu/ir_emitter_nested.h b/third_party/xla/xla/service/gpu/ir_emitter_nested.h
index cec80a5b1efc22..9f1b6e1baf9839 100644
--- a/third_party/xla/xla/service/gpu/ir_emitter_nested.h
+++ b/third_party/xla/xla/service/gpu/ir_emitter_nested.h
@@ -50,7 +50,7 @@ Status CallNestedComputation(llvm::IRBuilder<>* builder,
 StatusOr<std::vector<llvm::Value*>> CallNestedComputationWithScalars(
     llvm::IRBuilder<>* builder, IrEmitterContext& ir_emitter_context,
     const HloComputation& computation,
-    absl::Span<llvm::Value* const> parameter_scalars);
+    absl::Span<llvm::Value* const> parameter_elements);
 
 // Like CallNestedComputationWithScalars, but parameters are scalar addresses.
 StatusOr<std::vector<llvm::Value*>> CallNestedComputationWithScalarAddrs(
diff --git a/third_party/xla/xla/service/gpu/kernel_thunk.cc b/third_party/xla/xla/service/gpu/kernel_thunk.cc
index 3ac25290f35d75..459b6de5176c6d 100644
--- a/third_party/xla/xla/service/gpu/kernel_thunk.cc
+++ b/third_party/xla/xla/service/gpu/kernel_thunk.cc
@@ -100,7 +100,7 @@ static void PrintBufferContents(
   for (const se::DeviceMemoryBase& buf : buffer_args) {
     auto host_buffer = std::make_unique<char[]>(buf.size());
     CHECK(stream->ThenMemcpy(host_buffer.get(), buf, buf.size()).ok());
-    CHECK(stream->BlockHostUntilDone().ok());
+    CHECK_OK(stream->BlockHostUntilDone());
 
     std::string buffer_contents;
     for (int i = 0; i < buf.size(); i++) {
diff --git a/third_party/xla/xla/service/gpu/runtime/topk_kernel_test.cc b/third_party/xla/xla/service/gpu/runtime/topk_kernel_test.cc
index 71b9e4d15b38c0..da993cfd0204d0 100644
--- a/third_party/xla/xla/service/gpu/runtime/topk_kernel_test.cc
+++ b/third_party/xla/xla/service/gpu/runtime/topk_kernel_test.cc
@@ -182,9 +182,8 @@ void BM_SmallTopk(benchmark::State& state) {
     CUDA_CHECK(cudaEventCreate(&start));
     CUDA_CHECK(cudaEventCreate(&stop));
     CUDA_CHECK(cudaEventRecord(start, stream));
-    CHECK(RunTopk(stream, Get(T()), input_buffer, n, output_values,
-                  output_indices, k, batch_size)
-              .ok());
+    CHECK_OK(RunTopk(stream, Get(T()), input_buffer, n, output_values,
+                     output_indices, k, batch_size));
     CUDA_CHECK(cudaGetLastError());
     CUDA_CHECK(cudaEventRecord(stop, stream));
     CUDA_CHECK(cudaEventSynchronize(stop));
diff --git a/third_party/xla/xla/service/gpu/tests/gpu_fused_mha_test.cc b/third_party/xla/xla/service/gpu/tests/gpu_fused_mha_test.cc
index 10b03e874ad9a7..85ec435927e803 100644
--- a/third_party/xla/xla/service/gpu/tests/gpu_fused_mha_test.cc
+++ b/third_party/xla/xla/service/gpu/tests/gpu_fused_mha_test.cc
@@ -161,7 +161,7 @@ class MultiHeadedAttentionTest : public GpuCodegenTest {
 
 class MultiHeadedAttentionBMMBMM : public MultiHeadedAttentionTest {
  protected:
-  const std::string GetModuleFMHABMM_BMM_vanilla_HloString_F16() {
+  std::string GetModuleFMHABMM_BMM_vanilla_HloString_F16() {
     const std::string hlo_text = R"(
     HloModule jit__unnamed_wrapped_function_.10, entry_computation_layout={(f16[16,16,256,64]{3,2,1,0},f16[16,16,256,64]{3,2,1,0},f16[16,16,256,64]{3,2,1,0})->f16[16,16,256,64]{3,2,1,0}}
 
@@ -176,7 +176,7 @@ class MultiHeadedAttentionBMMBMM : public MultiHeadedAttentionTest {
     return hlo_text;
   }
 
-  const std::string GetModuleFMHABMM_BMM_vanilla_HloString_BF16() {
+  std::string GetModuleFMHABMM_BMM_vanilla_HloString_BF16() {
     const std::string hlo_text = R"(
     HloModule jit__unnamed_wrapped_function_.10, entry_computation_layout={(bf16[16,16,256,64]{3,2,1,0},bf16[16,16,256,64]{3,2,1,0},bf16[16,16,256,64]{3,2,1,0})->bf16[16,16,256,64]{3,2,1,0}}
 
@@ -223,7 +223,7 @@ class MultiHeadedAttentionBMMBMM : public MultiHeadedAttentionTest {
     return hlo_text;
   }
 
-  const std::string
+  std::string
   GetModuleFMHABMM_BMM_arg_layout_manipulation_arg_reversal_HloString_BF16() {  // NOLINT
     const std::string hlo_text = R"(
     HloModule jit__unnamed_wrapped_function_.10, entry_computation_layout={(bf16[16,256,16,64]{3,2,1,0},bf16[16,256,16,64]{3,2,1,0},bf16[16,256,16,64]{3,2,1,0})->bf16[16,16,64,256]{3,2,1,0}}
@@ -239,7 +239,7 @@ class MultiHeadedAttentionBMMBMM : public MultiHeadedAttentionTest {
     return hlo_text;
   }
 
-  const std::string
+  std::string
   GetModuleFMHABMM_BMM_arg_layout_manipulation_arg_reversal_HloString_F16() {  // NOLINT
     const std::string hlo_text = R"(
     HloModule jit__unnamed_wrapped_function_.10, entry_computation_layout={(f16[16,256,16,64]{3,2,1,0},f16[16,256,16,64]{3,2,1,0},f16[16,256,16,64]{3,2,1,0})->f16[16,16,64,256]{3,2,1,0}}
@@ -405,7 +405,7 @@ class MultiHeadedAttentionBMMBMM : public MultiHeadedAttentionTest {
     return hlo_text;
   }
 
-  const std::string
+  std::string
   GetModuleFMHABMM_BMM2_non_contracting_dim_stride_not_1_HloString_F16() {  // NOLINT
     const std::string hlo_text = R"(
     HloModule jit__unnamed_wrapped_function_.10, entry_computation_layout={(f16[16,16,256,64]{3,2,1,0},f16[16,16,256,64]{3,2,1,0},f16[16,16,256,64]{2,3,1,0})->f16[16,16,256,64]{3,2,1,0}}
@@ -729,8 +729,7 @@ class MultiHeadedAttentionBMMBMM : public MultiHeadedAttentionTest {
 class MultiHeadedAttentionBMMScaleBiasMaskSoftmaxBMM
     : public MultiHeadedAttentionTest {
  protected:
-  const std::string
-  GetModuleFMHABMM1_Scale_Bias_Mask_Softmax_BMM2_HloString_F16() {
+  std::string GetModuleFMHABMM1_Scale_Bias_Mask_Softmax_BMM2_HloString_F16() {
     const std::string hlo_text = R"(
     HloModule jit__unnamed_wrapped_function_, entry_computation_layout={(f16[16,16,256,64]{3,2,1,0},f16[16,16,256,64]{3,2,1,0},f16[16,16,256,64]{3,2,1,0},pred[16,16,256,256]{3,2,1,0})->f16[16,16,256,64]{3,2,1,0}}
 
@@ -987,8 +986,7 @@ class MultiHeadedAttentionBMMScaleBiasMaskSoftmaxBMM
     return hlo_text;
   }
 
-  const std::string
-  GetModuleFMHABMM1_Scale_Bias_Mask_Softmax_BMM2_HloString_BF16() {
+  std::string GetModuleFMHABMM1_Scale_Bias_Mask_Softmax_BMM2_HloString_BF16() {
     const std::string hlo_text = R"(
     HloModule jit__unnamed_wrapped_function_, entry_computation_layout={(bf16[16,16,256,64]{3,2,1,0},bf16[16,16,256,64]{3,2,1,0},bf16[16,16,256,64]{3,2,1,0},pred[16,16,256,256]{3,2,1,0})->bf16[16,16,256,64]{3,2,1,0}}
 
@@ -1043,7 +1041,7 @@ class MultiHeadedAttentionBMMScaleBiasMaskSoftmaxBMM
     return hlo_text;
   }
 
-  const std::string
+  std::string
   GetModuleFMHABMM1_Scale_Bias_Mask_Softmax_BMM2_HloString_BF16_smaller() {  // NOLINT
     const std::string hlo_text = R"(
     HloModule jit__unnamed_wrapped_function_, entry_computation_layout={(bf16[2,6,40,64]{3,2,1,0},bf16[2,6,64,40]{3,2,1,0},bf16[2,6,40,64]{3,2,1,0},pred[2,6,40,40]{3,2,1,0})->bf16[2,6,40,64]{3,2,1,0}}
@@ -1099,7 +1097,7 @@ class MultiHeadedAttentionBMMScaleBiasMaskSoftmaxBMM
     return hlo_text;
   }
 
-  const std::string
+  std::string
   GetModuleFMHABMM1_Scale_Bias_Mask_Softmax_BMM2_HloString_F16_smaller() {  // NOLINT
     const std::string hlo_text = R"(
   HloModule jit__unnamed_wrapped_function_, entry_computation_layout={(f16[2,6,40,64]{3,2,1,0},f16[2,6,64,40]{3,2,1,0},f16[2,6,40,64]{3,2,1,0},pred[2,6,40,40]{3,2,1,0})->f16[2,6,40,64]{3,2,1,0}}
@@ -1155,7 +1153,7 @@ class MultiHeadedAttentionBMMScaleBiasMaskSoftmaxBMM
     return hlo_text;
   }
 
-  const std::string
+  std::string
   GetModuleFMHABMM1_Scale_Bias_Mask_Softmax_BMM2_arg_reversal_HloString_F16() {  // NOLINT
     const std::string hlo_text = R"(
     HloModule jit__unnamed_wrapped_function_, entry_computation_layout={(f16[16,16,256,64]{3,2,1,0},f16[16,16,256,64]{3,2,1,0},f16[16,16,256,64]{3,2,1,0},pred[16,16,256,256]{3,2,1,0})->f16[16,16,64,256]{3,2,1,0}}
@@ -1211,7 +1209,7 @@ class MultiHeadedAttentionBMMScaleBiasMaskSoftmaxBMM
     return hlo_text;
   }
 
-  const std::string
+  std::string
   GetModuleFMHABMM1_Scale_Bias_Mask_Softmax_BMM2_arg_reversal_HloString_BF16() {  // NOLINT
     const std::string hlo_text = R"(
   HloModule jit__unnamed_wrapped_function_, entry_computation_layout={(bf16[16,16,256,64]{3,2,1,0},bf16[16,16,256,64]{3,2,1,0},bf16[16,16,256,64]{3,2,1,0},pred[16,16,256,256]{3,2,1,0})->bf16[16,16,64,256]{3,2,1,0}}
@@ -1503,7 +1501,7 @@ class MultiHeadedAttentionBMMScaleMaskSoftmaxBMM
     return hlo_text;
   }
 
-  const std::string
+  std::string
   GetModuleFMHABMM1_Scale_Mask_Softmax_BMM2_arg_reversal_HloString_F16() {  // NOLINT
     const std::string hlo_text = R"(
     HloModule jit__unnamed_wrapped_function_, entry_computation_layout={(f16[16,16,256,64]{3,2,1,0},f16[16,16,256,64]{3,2,1,0},f16[16,16,256,64]{3,2,1,0},pred[16,16,256,256]{3,2,1,0})->f16[16,16,64,256]{3,2,1,0}}
@@ -1556,7 +1554,7 @@ class MultiHeadedAttentionBMMScaleMaskSoftmaxBMM
     return hlo_text;
   }
 
-  const std::string
+  std::string
   GetModuleFMHABMM1_Scale_Mask_Softmax_BMM2_arg_reversal_HloString_BF16() {  // NOLINT
     const std::string hlo_text = R"(
     HloModule jit__unnamed_wrapped_function_, entry_computation_layout={(bf16[16,16,256,64]{3,2,1,0},bf16[16,16,256,64]{3,2,1,0},bf16[16,16,256,64]{3,2,1,0},pred[16,16,256,256]{3,2,1,0})->bf16[16,16,64,256]{3,2,1,0}}
@@ -1676,7 +1674,7 @@ class MultiHeadedAttentionBMMScaleMaskSoftmaxBMM
 class MultiHeadedAttentionBMMSoftmaxBMM : public MultiHeadedAttentionTest {
   // Bmm1 - Softmax - Bmm2
  protected:
-  const std::string GetModuleFMHABMM1_Softmax_BMM2_HloString_F16() {
+  std::string GetModuleFMHABMM1_Softmax_BMM2_HloString_F16() {
     const std::string hlo_text = R"(
     HloModule jit__unnamed_wrapped_function_, entry_computation_layout={(f16[16,16,256,64]{3,2,1,0},f16[16,16,256,64]{3,2,1,0},f16[16,16,256,64]{3,2,1,0})->f16[16,16,256,64]{3,2,1,0}}
 
@@ -1721,7 +1719,7 @@ class MultiHeadedAttentionBMMSoftmaxBMM : public MultiHeadedAttentionTest {
     return hlo_text;
   }
 
-  const std::string GetModuleFMHABMM1_Softmax_BMM2_HloString_BF16() {
+  std::string GetModuleFMHABMM1_Softmax_BMM2_HloString_BF16() {
     const std::string hlo_text = R"(
     HloModule jit__unnamed_wrapped_function_, entry_computation_layout={(bf16[16,16,256,64]{3,2,1,0},bf16[16,16,256,64]{3,2,1,0},bf16[16,16,256,64]{3,2,1,0})->bf16[16,16,256,64]{3,2,1,0}}
 
@@ -1801,7 +1799,7 @@ class MultiHeadedAttentionBMMScaleBiasSoftmaxBMM
     : public MultiHeadedAttentionTest {
  protected:
   // Bmm1 - Scale - Bias - Softmax - Bmm2
-  const std::string GetModuleFMHABMM1_Scale_Bias_Softmax_BMM2_HloString_F16() {
+  std::string GetModuleFMHABMM1_Scale_Bias_Softmax_BMM2_HloString_F16() {
     const std::string hlo_text = R"(
     HloModule jit__unnamed_wrapped_function_, entry_computation_layout={(f16[16,16,256,64]{3,2,1,0},f16[16,16,256,64]{3,2,1,0},f16[16,16,256,64]{3,2,1,0})->f16[16,16,256,64]{3,2,1,0}}
 
diff --git a/third_party/xla/xla/service/gpu/xfeed_queue.h b/third_party/xla/xla/service/gpu/xfeed_queue.h
index 09fb778d90eb07..1cb8dec6d5f91c 100644
--- a/third_party/xla/xla/service/gpu/xfeed_queue.h
+++ b/third_party/xla/xla/service/gpu/xfeed_queue.h
@@ -82,7 +82,7 @@ class XfeedQueue {
     before_get_next_dest_callbacks_.push_back(std::move(callback));
   }
 
-  virtual ~XfeedQueue() {}
+  virtual ~XfeedQueue() = default;
 
  protected:
   virtual void DequeueHook() ABSL_EXCLUSIVE_LOCKS_REQUIRED(this->mu_) {}

From 7d89e0b122075882b02e0887c8ef18f4f406dbc9 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 3 Oct 2023 14:24:14 -0700
Subject: [PATCH 539/567] [xla:gpu] Deallocate all graphs with same device
 ordinal if a graph falls back

PiperOrigin-RevId: 570496666
---
 third_party/xla/xla/runtime/state.h           |  7 ++-
 third_party/xla/xla/service/gpu/runtime/BUILD |  1 -
 .../xla/xla/service/gpu/runtime/executable.cc |  5 +--
 .../xla/service/gpu/runtime/graph_launch.cc   | 44 ++++++-------------
 .../xla/service/gpu/runtime/graph_launch.h    |  3 --
 5 files changed, 19 insertions(+), 41 deletions(-)

diff --git a/third_party/xla/xla/runtime/state.h b/third_party/xla/xla/runtime/state.h
index bbfd6e9a8cb8f0..c505dd96ffea72 100644
--- a/third_party/xla/xla/runtime/state.h
+++ b/third_party/xla/xla/runtime/state.h
@@ -18,7 +18,6 @@ limitations under the License.
 
 #include <cstddef>
 #include <memory>
-#include <optional>
 #include <type_traits>
 #include <vector>
 
@@ -73,7 +72,7 @@ class StateVector {
     template <typename F>
     absl::StatusOr<T*> GetOrCreate(size_t id, F&& create);
 
-    std::optional<T*> Get(size_t id);
+    absl::StatusOr<T*> Get(size_t id);
 
     absl::Status Erase(size_t id);
 
@@ -184,7 +183,7 @@ absl::StatusOr<T*> StateVector<T>::Snapshot::GetOrCreate(size_t id,
 }
 
 template <typename T>
-std::optional<T*> StateVector<T>::Snapshot::Get(size_t id) {
+absl::StatusOr<T*> StateVector<T>::Snapshot::Get(size_t id) {
   // If snapshot already contains the entry, just return it.
   std::vector<T*>& snapshot = *maybe_obsolete_snapshot_;
   if (id < snapshot.size() && snapshot[id]) return snapshot[id];
@@ -197,7 +196,7 @@ std::optional<T*> StateVector<T>::Snapshot::Get(size_t id) {
   std::vector<std::unique_ptr<T>>& state = owning_state_.vector_;
   if (id < state.size() && state[id].get()) return state[id].get();
 
-  return std::nullopt;
+  return absl::InternalError("Value not found in state vector");
 }
 
 template <typename T>
diff --git a/third_party/xla/xla/service/gpu/runtime/BUILD b/third_party/xla/xla/service/gpu/runtime/BUILD
index c995fb2f175cba..4cdd2dcbbd8464 100644
--- a/third_party/xla/xla/service/gpu/runtime/BUILD
+++ b/third_party/xla/xla/service/gpu/runtime/BUILD
@@ -431,7 +431,6 @@ cc_library(
         "@com_google_absl//absl/log:check",
         "@com_google_absl//absl/status",
         "@com_google_absl//absl/synchronization",
-        "@local_tsl//tsl/platform:errors",
         "@local_tsl//tsl/profiler/lib:profiler_lock",
         "@local_tsl//tsl/profiler/lib:traceme",
         "@local_tsl//tsl/profiler/lib:traceme_encode",
diff --git a/third_party/xla/xla/service/gpu/runtime/executable.cc b/third_party/xla/xla/service/gpu/runtime/executable.cc
index 13641b8fa1b510..8fe28a284f275f 100644
--- a/third_party/xla/xla/service/gpu/runtime/executable.cc
+++ b/third_party/xla/xla/service/gpu/runtime/executable.cc
@@ -416,7 +416,7 @@ Status GpuRuntimeExecutable::Execute(
   std::shared_ptr<StreamExecutorGraphInstances> executor_graphs =
       graph_instances_(executor);
 
-  StreamExecutorGraphInstances::Snapshot se_graph_instances =
+  StreamExecutorGraphInstances::Snapshot graph_instances =
       executor_graphs->snapshot();
   CapturedFunctionExecutionCount::Snapshot execution_count =
       captured_function_counts_(executor)->snapshot();
@@ -461,8 +461,7 @@ Status GpuRuntimeExecutable::Execute(
       // only.
       &fused_attention_runners, &fused_attention_backward_runners,
 #endif  // GOOGLE_CUDA
-      &se_graph_instances, &execution_count, &ordinal_to_fallback,
-      &graph_instances_,
+      &graph_instances, &execution_count, &ordinal_to_fallback,
 #endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
       &concurrent_region_status,
       // Null pointer will be interpreted as an absence of async collectives
diff --git a/third_party/xla/xla/service/gpu/runtime/graph_launch.cc b/third_party/xla/xla/service/gpu/runtime/graph_launch.cc
index b40e5ff7954d4f..fa42e1330050c4 100644
--- a/third_party/xla/xla/service/gpu/runtime/graph_launch.cc
+++ b/third_party/xla/xla/service/gpu/runtime/graph_launch.cc
@@ -41,7 +41,6 @@ limitations under the License.
 #include "xla/service/gpu/runtime/support.h"
 #include "xla/service/service_executable_run_options.h"
 #include "xla/statusor.h"
-#include "tsl/platform/errors.h"
 #include "tsl/profiler/lib/profiler_lock.h"
 #include "tsl/profiler/lib/traceme.h"
 #include "tsl/profiler/lib/traceme_encode.h"
@@ -304,8 +303,8 @@ Status GraphInstances::InstantiateAllGraphs(
                           "xla.gpu.graph.capture"))
       continue;
 
-    std::optional<std::monostate*> fallback = ordinal_to_fallback->Get(ordinal);
-    if (fallback.has_value()) continue;
+    StatusOr<std::monostate*> fallback = ordinal_to_fallback->Get(ordinal);
+    if (fallback.ok()) continue;
 
     VLOG(3) << "Instantiate Gpu graph defined by capture function @"
             << executable.function_name(ordinal) << " (ordinal = " << ordinal
@@ -376,17 +375,6 @@ Status GraphInstances::InstantiateAllGraphs(
   return OkStatus();
 }
 
-Status GraphInstances::RemoveGraphsByOrdinal(unsigned ordinal) {
-  absl::MutexLock lock(&impl_->mu);
-  for (auto& [stream_executor, state] : impl_->graphs) {
-    Status erased = state.instances->snapshot().Erase(ordinal);
-    if (erased.ok()) {
-      NotifyGraphInstancesDestroyed(stream_executor, 1);
-    }
-  }
-  return OkStatus();
-}
-
 CapturedFunctionExecutionCount* CapturedFunctionExecutionCounts::operator()(
     se::StreamExecutor* executor) {
   absl::MutexLock lock(&mutex_);
@@ -567,11 +555,11 @@ static absl::Status LaunchGraph(
     const std::vector<uint8_t>* cubin, se::DeviceMemoryBase* temp_buffer,
     StreamExecutorKernels::Snapshot* kernels,
     StreamExecutorConvRunners::Snapshot* convs,
-    StreamExecutorGraphInstances::Snapshot* se_graph_instances,
+    StreamExecutorGraphInstances::Snapshot* instances,
     CapturedFunctionExecutionCount::Snapshot* counts,
     OrdinalToFallback::Snapshot* ordinal_to_fallback,
-    GraphInstances* graph_instances, GemmConfigs::Snapshot* gemm_config,
-    runtime::Executable* executable, NonAtomicallyUpgradeableRWLock* gpu_lock,
+    GemmConfigs::Snapshot* gemm_config, runtime::Executable* executable,
+    NonAtomicallyUpgradeableRWLock* gpu_lock,
     ConcurrentRegionStatus* region_status, CustomCall::RemainingArgs fwd_args,
     CustomCall::FunctionOrdinal capture) {
 #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
@@ -603,10 +591,10 @@ static absl::Status LaunchGraph(
   // work around disable graph execution and run everything in op-by-op mode.
   bool is_profiling = tsl::profiler::ProfilerLock::HasActiveSession();
 
-  std::optional<std::monostate*> fallback =
+  StatusOr<std::monostate*> fallback =
       ordinal_to_fallback->Get(capture.ordinal);
 
-  if (count < num_runs_to_instantiate || is_profiling || fallback.has_value()) {
+  if (count < num_runs_to_instantiate || is_profiling || fallback.ok()) {
     VLOG(3) << "Run gpu graph in op-by-op mode: ordinal = " << capture.ordinal;
     return RunGraphOpByOp(run_options, function_ref, fwd_args, user_data());
   }
@@ -629,16 +617,16 @@ static absl::Status LaunchGraph(
     // If num_runs_to_instantiate is less than 0, all graphs should be
     // instantiated ahead-of-time. If we fail to get the graph instance, then
     // graph instantiation failed due to OOM. So we run the graph op-by-op.
-    std::optional<GraphInstance*> try_get_instance =
-        se_graph_instances->Get(capture.ordinal);
-    if (try_get_instance.has_value()) {
+    absl::StatusOr<GraphInstance*> try_get_instance =
+        instances->Get(capture.ordinal);
+    if (try_get_instance.ok()) {
       instance = try_get_instance.value();
     } else {
       return RunGraphOpByOp(run_options, function_ref, fwd_args, user_data());
     }
   } else {
-    TF_ASSIGN_OR_RETURN(instance, se_graph_instances->GetOrCreate(
-                                      capture.ordinal, instantiate));
+    TF_ASSIGN_OR_RETURN(instance,
+                        instances->GetOrCreate(capture.ordinal, instantiate));
   }
 
   {
@@ -682,6 +670,8 @@ static absl::Status LaunchGraph(
     case se::gpu::OwnedGpuGraphExec::UpdateResult::kFallback: {
       LOG(WARNING) << "Fallback to op-by-op mode because memset node breaks "
                       "graph update";
+      // Deallocate instance.
+      TF_RETURN_IF_ERROR(instances->Erase(capture.ordinal));
       // Set ordinal_to_fallback to prevent future instantiation of this graph.
       TF_ASSIGN_OR_RETURN(
           std::monostate * fallback,
@@ -689,11 +679,6 @@ static absl::Status LaunchGraph(
               capture.ordinal,
               []() -> StatusOr<std::monostate> { return std::monostate{}; }));
       DCHECK(fallback);
-
-      // Deallocate graph instances with the ordinal.
-      TF_RETURN_IF_ERROR(
-          graph_instances->RemoveGraphsByOrdinal(capture.ordinal));
-
       return RunGraphOpByOp(run_options, function_ref, fwd_args, user_data());
     }
     case se::gpu::OwnedGpuGraphExec::UpdateResult::kSuccess:
@@ -731,7 +716,6 @@ XLA_RUNTIME_DEFINE_CUSTOM_CALL(
         .UserData<StreamExecutorGraphInstances::Snapshot*>()
         .UserData<CapturedFunctionExecutionCount::Snapshot*>()
         .UserData<OrdinalToFallback::Snapshot*>()
-        .UserData<GraphInstances*>()
         .UserData<GemmConfigs::Snapshot*>()
         .UserData<Executable*>()
         .UserData<NonAtomicallyUpgradeableRWLock*>()
diff --git a/third_party/xla/xla/service/gpu/runtime/graph_launch.h b/third_party/xla/xla/service/gpu/runtime/graph_launch.h
index 1553815acf8717..e4841541bc781e 100644
--- a/third_party/xla/xla/service/gpu/runtime/graph_launch.h
+++ b/third_party/xla/xla/service/gpu/runtime/graph_launch.h
@@ -118,9 +118,6 @@ class GraphInstances {
   bool InstantiatedAllGraphs(const ServiceExecutableRunOptions* run_options,
                              const runtime::Executable& executable);
 
-  // Remove all instantiated graph instances that has a certain ordinal.
-  Status RemoveGraphsByOrdinal(unsigned ordinal);
-
  private:
   std::shared_ptr<Impl> impl_;
 };

From 51f1597034d0c46290ff843885df2c30c0aecf82 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 3 Oct 2023 14:37:08 -0700
Subject: [PATCH 540/567] Do not compress mesh shapes when mesh selection is on
 as this can lead to incorrect alpha/beta values being used.

PiperOrigin-RevId: 570500290
---
 .../auto_sharding/auto_sharding.cc            |  1 -
 .../auto_sharding/auto_sharding_option.h      | 36 +++++++++++--------
 2 files changed, 21 insertions(+), 16 deletions(-)

diff --git a/third_party/xla/xla/hlo/experimental/auto_sharding/auto_sharding.cc b/third_party/xla/xla/hlo/experimental/auto_sharding/auto_sharding.cc
index a35840689a8592..c4dd87226acab3 100644
--- a/third_party/xla/xla/hlo/experimental/auto_sharding/auto_sharding.cc
+++ b/third_party/xla/xla/hlo/experimental/auto_sharding/auto_sharding.cc
@@ -947,7 +947,6 @@ void EnumeratePartitionReshape(const HloInstruction* ins,
     batch_dim = iter->second;
   }
 
-
   // Split batch dim + another dim
   for (int64_t i = 0; i < ins->shape().rank(); ++i) {
     auto tensor_it = std::find(tensor_dims.begin(), tensor_dims.end(), i);
diff --git a/third_party/xla/xla/hlo/experimental/auto_sharding/auto_sharding_option.h b/third_party/xla/xla/hlo/experimental/auto_sharding/auto_sharding_option.h
index 465bfd1cdbb205..062a7b2fd1a12f 100644
--- a/third_party/xla/xla/hlo/experimental/auto_sharding/auto_sharding_option.h
+++ b/third_party/xla/xla/hlo/experimental/auto_sharding/auto_sharding_option.h
@@ -261,7 +261,10 @@ struct AutoShardingOption {
     std::vector<int64_t> mesh_dims_greater_than_one_indices =
         spmd::VectorGreaterThanOneElementIndices(device_mesh_shape);
 
-    if (mesh_dims_greater_than_one_indices.size() > 3) {
+    // TODO(pratikf) The device mesh shape handling in this function currently
+    // does not work when try_multiple_mesh_shapes is true. Fix it.
+    if (mesh_dims_greater_than_one_indices.size() > 3 ||
+        (device_mesh_shape.size() > 3 && try_multiple_mesh_shapes)) {
       return absl::OutOfRangeError(absl::StrCat(
           "Not supported: only device_mesh_shapes with 3 or less "
           "dimensions larger than 1 are supported. Instead we have ",
@@ -282,6 +285,7 @@ struct AutoShardingOption {
                        "more than two shardable dims: device_mesh_shape=",
                        absl::StrJoin(device_mesh_shape, ",")));
     }
+
     if (device_mesh_alpha.empty()) {
       // Generates simple device_mesh_alpha based on the size of
       // device_mesh_shape.
@@ -310,22 +314,24 @@ struct AutoShardingOption {
           "please leave them empty and default values will be used."));
     }
 
-    std::vector<int64_t> compressed_device_mesh_shape;
-    std::vector<double> compressed_device_mesh_alpha;
-    std::vector<double> compressed_device_mesh_beta;
-    int non_zero_counter = 0;
-    for (size_t i = 0; i < device_mesh_shape.size(); ++i) {
-      if (non_zero_counter < mesh_dims_greater_than_one_indices.size() &&
-          i == mesh_dims_greater_than_one_indices[non_zero_counter]) {
-        non_zero_counter++;
-        compressed_device_mesh_shape.push_back(device_mesh_shape[i]);
-        compressed_device_mesh_alpha.push_back(device_mesh_alpha[i]);
-        compressed_device_mesh_beta.push_back(device_mesh_beta[i]);
+    if (!try_multiple_mesh_shapes) {
+      std::vector<int64_t> compressed_device_mesh_shape;
+      std::vector<double> compressed_device_mesh_alpha;
+      std::vector<double> compressed_device_mesh_beta;
+      int non_zero_counter = 0;
+      for (size_t i = 0; i < device_mesh_shape.size(); ++i) {
+        if (non_zero_counter < mesh_dims_greater_than_one_indices.size() &&
+            i == mesh_dims_greater_than_one_indices[non_zero_counter]) {
+          non_zero_counter++;
+          compressed_device_mesh_shape.push_back(device_mesh_shape[i]);
+          compressed_device_mesh_alpha.push_back(device_mesh_alpha[i]);
+          compressed_device_mesh_beta.push_back(device_mesh_beta[i]);
+        }
       }
+      this->device_mesh_shape = compressed_device_mesh_shape;
+      this->device_mesh_alpha = compressed_device_mesh_alpha;
+      this->device_mesh_beta = compressed_device_mesh_beta;
     }
-    this->device_mesh_shape = compressed_device_mesh_shape;
-    this->device_mesh_alpha = compressed_device_mesh_alpha;
-    this->device_mesh_beta = compressed_device_mesh_beta;
 
     // If device_mesh_shape has only one value, append 1 to it
     if (device_mesh_shape.size() == 1) {

From bc853de7916ad7f70e704fdbd0fd54a2e8f48871 Mon Sep 17 00:00:00 2001
From: Yishuang Pang <ypang@google.com>
Date: Tue, 3 Oct 2023 14:38:32 -0700
Subject: [PATCH 541/567] Update CocoaPods specs for TFLite 2.14.0

PiperOrigin-RevId: 570500681
---
 tensorflow/lite/ios/TensorFlowLiteC.podspec           | 6 +++---
 tensorflow/lite/ios/TensorFlowLiteSelectTfOps.podspec | 4 ++--
 tensorflow/lite/objc/TensorFlowLiteObjC.podspec       | 6 +++---
 tensorflow/lite/swift/TensorFlowLiteSwift.podspec     | 6 +++---
 4 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/tensorflow/lite/ios/TensorFlowLiteC.podspec b/tensorflow/lite/ios/TensorFlowLiteC.podspec
index 4bb16d234cb3e4..d84abe0b58eb70 100644
--- a/tensorflow/lite/ios/TensorFlowLiteC.podspec
+++ b/tensorflow/lite/ios/TensorFlowLiteC.podspec
@@ -1,10 +1,10 @@
 Pod::Spec.new do |s|
   s.name             = 'TensorFlowLiteC'
-  s.version          = '2.13.0'
+  s.version          = '2.14.0'
   s.authors          = 'Google Inc.'
   s.license          = { :type => 'Apache' }
   s.homepage         = 'https://github.com/tensorflow/tensorflow'
-  s.source           = { :http => "https://dl.google.com/tflite-release/ios/prod/tensorflow/lite/release/ios/release/28/20230707-140347/TensorFlowLiteC/2.13.0/ee5f7d501376cea8/TensorFlowLiteC-2.13.0.tar.gz" }
+  s.source           = { :http => "https://dl.google.com/tflite-release/ios/prod/tensorflow/lite/release/ios/release/30/20231002-210715/TensorFlowLiteC/2.14.0/883c6fc838e0354b/TensorFlowLiteC-2.14.0.tar.gz" }
   s.summary          = 'TensorFlow Lite'
   s.description      = <<-DESC
 
@@ -16,7 +16,7 @@ Pod::Spec.new do |s|
                        DESC
 
   s.cocoapods_version = '>= 1.9.0'
-  s.ios.deployment_target = '11.0'
+  s.ios.deployment_target = '12.0'
 
   s.module_name = 'TensorFlowLiteC'
   s.library = 'c++'
diff --git a/tensorflow/lite/ios/TensorFlowLiteSelectTfOps.podspec b/tensorflow/lite/ios/TensorFlowLiteSelectTfOps.podspec
index fe1241cc5a1d7c..ada82b0d8e5751 100644
--- a/tensorflow/lite/ios/TensorFlowLiteSelectTfOps.podspec
+++ b/tensorflow/lite/ios/TensorFlowLiteSelectTfOps.podspec
@@ -1,10 +1,10 @@
 Pod::Spec.new do |s|
   s.name             = 'TensorFlowLiteSelectTfOps'
-  s.version          = '2.13.0'
+  s.version          = '2.14.0'
   s.authors          = 'Google Inc.'
   s.license          = { :type => 'Apache' }
   s.homepage         = 'https://github.com/tensorflow/tensorflow'
-  s.source           = { :http => "https://dl.google.com/tflite-release/ios/prod/tensorflow/lite/release/ios/release/28/20230707-140347/TensorFlowLiteSelectTfOps/2.13.0/f7ed69e4d922c040/TensorFlowLiteSelectTfOps-2.13.0.tar.gz" }
+  s.source           = { :http => "https://dl.google.com/tflite-release/ios/prod/tensorflow/lite/release/ios/release/30/20231002-210715/TensorFlowLiteSelectTfOps/2.14.0/e43d76a7247eec5c/TensorFlowLiteSelectTfOps-2.14.0.tar.gz" }
   s.summary          = 'TensorFlow Lite'
   s.description      = <<-DESC
 
diff --git a/tensorflow/lite/objc/TensorFlowLiteObjC.podspec b/tensorflow/lite/objc/TensorFlowLiteObjC.podspec
index 5a18a424b3f09c..b71f830d07eceb 100644
--- a/tensorflow/lite/objc/TensorFlowLiteObjC.podspec
+++ b/tensorflow/lite/objc/TensorFlowLiteObjC.podspec
@@ -1,10 +1,10 @@
 Pod::Spec.new do |s|
   s.name             = 'TensorFlowLiteObjC'
-  s.version          = '2.13.0'
+  s.version          = '2.14.0'
   s.authors          = 'Google Inc.'
   s.license          = { :type => 'Apache' }
   s.homepage         = 'https://github.com/tensorflow/tensorflow'
-  s.source           = { :git => 'https://github.com/tensorflow/tensorflow.git', :commit => '1cb1a030a62b169d90d34c747ab9b09f332bf905' }
+  s.source           = { :git => 'https://github.com/tensorflow/tensorflow.git', :commit => '4dacf3f368eb7965e9b5c3bbdd5193986081c3b2' }
   s.summary          = 'TensorFlow Lite for Objective-C'
   s.description      = <<-DESC
 
@@ -15,7 +15,7 @@ Pod::Spec.new do |s|
                        DESC
 
   s.cocoapods_version = '>= 1.9.0'
-  s.ios.deployment_target = '11.0'
+  s.ios.deployment_target = '12.0'
 
   s.module_name = 'TFLTensorFlowLite'
   s.static_framework = true
diff --git a/tensorflow/lite/swift/TensorFlowLiteSwift.podspec b/tensorflow/lite/swift/TensorFlowLiteSwift.podspec
index 2d145dafdf71ad..8a3d2a9e3ee493 100644
--- a/tensorflow/lite/swift/TensorFlowLiteSwift.podspec
+++ b/tensorflow/lite/swift/TensorFlowLiteSwift.podspec
@@ -1,10 +1,10 @@
 Pod::Spec.new do |s|
   s.name             = 'TensorFlowLiteSwift'
-  s.version          = '2.13.0'
+  s.version          = '2.14.0'
   s.authors          = 'Google Inc.'
   s.license          = { :type => 'Apache' }
   s.homepage         = 'https://github.com/tensorflow/tensorflow'
-  s.source           = { :git => 'https://github.com/tensorflow/tensorflow.git', :commit => '1cb1a030a62b169d90d34c747ab9b09f332bf905' }
+  s.source           = { :git => 'https://github.com/tensorflow/tensorflow.git', :commit => '4dacf3f368eb7965e9b5c3bbdd5193986081c3b2' }
   s.summary          = 'TensorFlow Lite for Swift'
   s.description      = <<-DESC
 
@@ -14,7 +14,7 @@ Pod::Spec.new do |s|
                        DESC
 
   s.cocoapods_version = '>= 1.9.0'
-  s.ios.deployment_target = '11.0'
+  s.ios.deployment_target = '12.0'
   s.swift_version = '5.0'
 
   s.module_name = 'TensorFlowLite'

From 85a1b2f83ce8075b9f1ca23ed6857784ede9965c Mon Sep 17 00:00:00 2001
From: Gunhyun Park <gunhyun@google.com>
Date: Tue, 3 Oct 2023 14:40:13 -0700
Subject: [PATCH 542/567] Integrate StableHLO at openxla/stablehlo@fe5fa269

PiperOrigin-RevId: 570501197
---
 third_party/stablehlo/workspace.bzl                 | 4 ++--
 third_party/xla/third_party/stablehlo/workspace.bzl | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/third_party/stablehlo/workspace.bzl b/third_party/stablehlo/workspace.bzl
index 82c29f575b94d4..6ddac7fffab70d 100644
--- a/third_party/stablehlo/workspace.bzl
+++ b/third_party/stablehlo/workspace.bzl
@@ -4,8 +4,8 @@ load("//third_party:repo.bzl", "tf_http_archive", "tf_mirror_urls")
 
 def repo():
     # LINT.IfChange
-    STABLEHLO_COMMIT = "6bf3f5d7b8dd3fc67ee1236432606d9c414ffcbd"
-    STABLEHLO_SHA256 = "95e10a1f10cd6aaf66cbe5fe43a65dc588099f3ded179ef87af0748adf500fdb"
+    STABLEHLO_COMMIT = "fe5fa2690e33241b51cefdb27a1fff0fd0626047"
+    STABLEHLO_SHA256 = "e4246b176a911ecc8fc9f42622ba1a3997f88633d47dd90ad641ee3d973b6792"
     # LINT.ThenChange(Google-internal path)
 
     tf_http_archive(
diff --git a/third_party/xla/third_party/stablehlo/workspace.bzl b/third_party/xla/third_party/stablehlo/workspace.bzl
index 82c29f575b94d4..6ddac7fffab70d 100644
--- a/third_party/xla/third_party/stablehlo/workspace.bzl
+++ b/third_party/xla/third_party/stablehlo/workspace.bzl
@@ -4,8 +4,8 @@ load("//third_party:repo.bzl", "tf_http_archive", "tf_mirror_urls")
 
 def repo():
     # LINT.IfChange
-    STABLEHLO_COMMIT = "6bf3f5d7b8dd3fc67ee1236432606d9c414ffcbd"
-    STABLEHLO_SHA256 = "95e10a1f10cd6aaf66cbe5fe43a65dc588099f3ded179ef87af0748adf500fdb"
+    STABLEHLO_COMMIT = "fe5fa2690e33241b51cefdb27a1fff0fd0626047"
+    STABLEHLO_SHA256 = "e4246b176a911ecc8fc9f42622ba1a3997f88633d47dd90ad641ee3d973b6792"
     # LINT.ThenChange(Google-internal path)
 
     tf_http_archive(

From 1bbd10c3bb8de86bfe126ae7ace123f7e85354cf Mon Sep 17 00:00:00 2001
From: Jian Cai <jiancai@google.com>
Date: Tue, 3 Oct 2023 14:51:35 -0700
Subject: [PATCH 543/567] Extend coverage of TF2XLA MLIR generic phase 1 bridge

The TF2XLA bridge currently only allows CPU/GPU graphs in parameter server jobs with resource variable arguments to go through the phase 1 bridge. This change removes that restriction to cover more graphs. It allows feature parity (outside compilation, distributed strategy, etc.) across CPU/GPU/TPU graphs, which provides long-term benefits (outside compilation, consistent clustering, debuggability, etc.), and it unblocks the unification of the CPU/GPU and TPU MLIR-based phase 1 pipelines.

PiperOrigin-RevId: 570504216
---
 .../compiler/tf2xla/mlir_bridge_pass.cc       | 47 +------------------
 1 file changed, 1 insertion(+), 46 deletions(-)

diff --git a/tensorflow/compiler/tf2xla/mlir_bridge_pass.cc b/tensorflow/compiler/tf2xla/mlir_bridge_pass.cc
index 8fb4a7ed191f7b..0e173d357a765c 100644
--- a/tensorflow/compiler/tf2xla/mlir_bridge_pass.cc
+++ b/tensorflow/compiler/tf2xla/mlir_bridge_pass.cc
@@ -94,36 +94,6 @@ bool HasTPUDevice(const DeviceSet& device_set) {
   return false;
 }
 
-// Check if the `graph` has parameter serverjobs and resource variable arguments
-// that are on parameter servers
-bool HasPsWithResourceVariable(const Graph& graph) {
-  // Check parameter serverjobs and resource variable arguments that are
-  // on parameter servers.
-  const std::string jobType = "ps";
-  const std::string nodeType = "_Arg";
-  const std::string attrKey = "T";
-  for (const Node* node : graph.nodes()) {
-    if (node->type_string() == nodeType) {
-      auto device_name = node->assigned_device_name();
-      DeviceNameUtils::ParsedName device;
-      if (DeviceNameUtils::ParseFullName(device_name, &device) &&
-          device.has_job && device.job == jobType) {
-        for (const auto& attr : node->attrs()) {
-          auto attr_key = attr.first;
-          auto attr_value = attr.second;
-          if (attr_key == attrKey &&
-              attr_value.value_case() == AttrValue::kType &&
-              attr_value.type() == DT_RESOURCE) {
-            return true;
-            break;
-          }
-        }
-      }
-    }
-  }
-  return false;
-}
-
 // Check that graph has tf.StatefulPartitionedCall op with _XlaMustCompile.
 bool HasQualifiedNonTPUOp(const Graph& graph) {
   const std::string kStatefulPartitionedCallOp = "StatefulPartitionedCall";
@@ -140,13 +110,6 @@ bool HasQualifiedNonTPUOp(const Graph& graph) {
   return false;
 }
 
-// Check if non TPU pipeline should be used
-bool EnableNonTpuBridge(const Graph& graph) {
-  // Remark that this is staging change. It will be expanded later for further
-  // check based on the requirement.
-  return HasPsWithResourceVariable(graph) && HasQualifiedNonTPUOp(graph);
-}
-
 }  // namespace
 
 // Analyzes the user requested policy as well as the contents of the graph and
@@ -164,15 +127,7 @@ MlirOptimizationPassState GetPassStateImpl(
     const FunctionLibraryDefinition& function_library) {
   // Skip MLIR TF/XLA Bridge if no TPU devices and no qualified CPU/GPU
   // graphs are found.
-  if (!run_tpu_bridge && !EnableNonTpuBridge(graph)) {
-    // Only record CPU/GPU graphs that are qualified but filtered out
-    if (HasQualifiedNonTPUOp(graph)) {
-      metrics::UpdateTfMlirBridgeFirstPhaseCounter(
-          /*device type*/ "cpu/gpu",
-          /*bridge version*/ "tfxla",
-          /*fallback_enabled*/ false,
-          /*result*/ "invalid_graph");
-    }
+  if (!run_tpu_bridge && !HasQualifiedNonTPUOp(graph)) {
     VLOG(3) << "Skipping MLIR CPU/GPU Bridge, "
                "graph is not qualified to run the bridge";
     return MlirOptimizationPassState::Disabled;

From 9fdacd483cbe1fded22e9e5ababd99170546d94d Mon Sep 17 00:00:00 2001
From: Skye Wanderman-Milne <skyewm@google.com>
Date: Tue, 3 Oct 2023 15:12:40 -0700
Subject: [PATCH 544/567] [jax] Add pathways_ifrt xla_client_test config

PiperOrigin-RevId: 570510365
---
 third_party/xla/xla/python/xla_client_test.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/third_party/xla/xla/python/xla_client_test.py b/third_party/xla/xla/python/xla_client_test.py
index 22d6ac43580ff2..0fd1661ca1131b 100644
--- a/third_party/xla/xla/python/xla_client_test.py
+++ b/third_party/xla/xla/python/xla_client_test.py
@@ -86,7 +86,8 @@ def TestFactory(xla_backend,
                 cloud_tpu=False,
                 tfrt_tpu=False,
                 pjrt_c_api=False,
-                pathways=False):
+                pathways=False,
+                pathways_ifrt=False):
   tests = []
 
   int_dtypes = [np.int32, np.int64, np.uint32, np.uint64]
@@ -586,6 +587,7 @@ def testBlockUntilReadyRaisesOnDeletedBuffer(self):
               "BlockHostUntilReady() called on deleted or donated buffer")):
         buffer.block_until_ready()
 
+    @unittest.skipIf(pathways_ifrt, "not implemented")
     def testOnDeviceSizeInBytes(self):
       if not isinstance(self.backend, xla_client.Client):
         self.skipTest("TPU Driver doesn't support OnDeviceSizeInBytes.")
@@ -663,6 +665,7 @@ def testStandardTypes(self):
         arr = np.asarray(arr)
         self.assertEqual(dtype, type(arr[0]))
 
+    @unittest.skipIf(pathways_ifrt, "not implemented")
     def testUnsafeBufferPointer(self):
       if not isinstance(self.backend, xla_client.Client):
         self.skipTest("TPU Driver doesn't support UnsafeBufferPointer().")
@@ -2142,6 +2145,7 @@ def testLocalHardwareId(self):
         if local_hardware_id is not None:
           self.assertGreaterEqual(local_hardware_id, 0)
 
+    @unittest.skipIf(pathways_ifrt, "not implemented")
     def testLocalDeviceFromLocalHardwareId(self):
       for device in self.backend.local_devices():
         if device.local_hardware_id is not None:

From aa4e77c7c26fb6e6b5ae581e4ffa7a3372966bca Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 3 Oct 2023 15:18:38 -0700
Subject: [PATCH 545/567] xProf Roofline model for EdgeTPU

PiperOrigin-RevId: 570511888
---
 tensorflow/core/profiler/convert/BUILD | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tensorflow/core/profiler/convert/BUILD b/tensorflow/core/profiler/convert/BUILD
index cf76fb7e2c01af..7dfbd5ea26c0f6 100644
--- a/tensorflow/core/profiler/convert/BUILD
+++ b/tensorflow/core/profiler/convert/BUILD
@@ -80,6 +80,7 @@ cc_library(
         "//tensorflow/core/profiler/protobuf:op_metrics_proto_cc",
         "//tensorflow/core/profiler/utils:math_utils",
         "@com_google_absl//absl/algorithm:container",
+        "@com_google_absl//absl/log",
     ],
 )
 

From 8ff90ae77b0f441cc941abf8eab11ef982fc9b8b Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 3 Oct 2023 15:48:29 -0700
Subject: [PATCH 546/567] Do not convert MLIR back to graph if module is
 unchanged in `MlirV1CompatGraphOptimizationPass`

PiperOrigin-RevId: 570519641
---
 tensorflow/compiler/mlir/BUILD                |   3 +
 .../mlir/mlir_graph_optimization_pass.cc      |  15 +++
 .../mlir/mlir_graph_optimization_pass.h       |   4 +
 .../mlir/mlir_graph_optimization_pass_test.cc | 103 ++++++++++++++++++
 4 files changed, 125 insertions(+)

diff --git a/tensorflow/compiler/mlir/BUILD b/tensorflow/compiler/mlir/BUILD
index 4f463104c70375..bb53043acb198e 100644
--- a/tensorflow/compiler/mlir/BUILD
+++ b/tensorflow/compiler/mlir/BUILD
@@ -247,11 +247,14 @@ tf_cc_test(
     srcs = ["mlir_graph_optimization_pass_test.cc"],
     deps = [
         ":mlir_graph_optimization_pass",
+        "//tensorflow/core:core_cpu",
         "//tensorflow/core:ops",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
+        "//tensorflow/core/common_runtime:optimization_registry",
         "//tensorflow/core/framework:tensor_testutil",
         "//tensorflow/core/lib/monitoring:cell_reader",
+        "@llvm-project//llvm:Support",
         "@llvm-project//mlir:IR",
     ],
 )
diff --git a/tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc b/tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc
index bd66a5f3132b13..b1e9e73acdaf0e 100644
--- a/tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc
+++ b/tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc
@@ -29,6 +29,7 @@ limitations under the License.
 #include "mlir/Dialect/Func/Extensions/AllExtensions.h"  // from @llvm-project
 #include "mlir/Dialect/Func/IR/FuncOps.h"  // from @llvm-project
 #include "mlir/Dialect/Shape/IR/Shape.h"  // from @llvm-project
+#include "mlir/IR/OperationSupport.h"  // from @llvm-project
 #include "tensorflow/compiler/mlir/tensorflow/ir/tf_device.h"
 #include "tensorflow/compiler/mlir/tensorflow/ir/tf_executor.h"
 #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h"
@@ -38,6 +39,7 @@ limitations under the License.
 #include "tensorflow/compiler/mlir/tensorflow/utils/device_util.h"
 #include "tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.h"
 #include "tensorflow/core/common_runtime/graph_constructor.h"
+#include "tensorflow/core/common_runtime/optimization_registry.h"
 #include "tensorflow/core/framework/metrics.h"
 #include "tensorflow/core/lib/monitoring/counter.h"
 #include "tensorflow/core/platform/status.h"
@@ -416,6 +418,7 @@ Status MlirV1CompatGraphOptimizationPass::Run(
       std::move(module_ref_status.value());
   AddDevicesToOp(*module_ref, options.device_set);
 
+  auto module_ref_clone = module_ref->clone();
   llvm::StringRef name = pass->name();
   VLOG(2) << "Run MLIR V1 graph optimization pass: " << StringRefToView(name);
 
@@ -424,6 +427,12 @@ Status MlirV1CompatGraphOptimizationPass::Run(
   }
   Status pass_status = pass->Run(options, *module_ref);
 
+  bool is_module_updated = !mlir::OperationEquivalence::isEquivalentTo(
+      module_ref_clone, *module_ref,
+      mlir::OperationEquivalence::Flags::IgnoreLocations);
+  // Destroy this cloned op to avoid memory leaks.
+  module_ref_clone->destroy();
+
   if (!pass_status.ok()) {
     if (pass_state == MlirOptimizationPassState::Enabled) return pass_status;
 
@@ -446,6 +455,12 @@ Status MlirV1CompatGraphOptimizationPass::Run(
     DumpModule(*module_ref, llvm::formatv("mlir_{0}_after_", name));
   }
 
+  if (!is_module_updated) {
+    VLOG(2) << "MLIR module is not updated. Using the original graph. "
+            << "Do not convert mlir module back to graph";
+    return OkStatus();
+  }
+
   GraphExportConfig export_config;
   TF_RETURN_WITH_CONTEXT_IF_ERROR(
       ConvertMlirToGraph(*module_ref, export_config, options.graph,
diff --git a/tensorflow/compiler/mlir/mlir_graph_optimization_pass.h b/tensorflow/compiler/mlir/mlir_graph_optimization_pass.h
index 059147d4ea9322..bf500be3b33510 100644
--- a/tensorflow/compiler/mlir/mlir_graph_optimization_pass.h
+++ b/tensorflow/compiler/mlir/mlir_graph_optimization_pass.h
@@ -173,6 +173,10 @@ class MlirV1CompatOptimizationPassRegistry {
     return pass_ ? pass_.get() : nullptr;
   }
 
+  // Resets `pass_`, destroying the registered pass (if any).
+  // This method is mostly used for testing.
+  void ClearPass() { pass_.reset(); }
+
  private:
   std::unique_ptr<MlirV1CompatOptimizationPass> pass_{};
 };
diff --git a/tensorflow/compiler/mlir/mlir_graph_optimization_pass_test.cc b/tensorflow/compiler/mlir/mlir_graph_optimization_pass_test.cc
index 95d669ff9bf3f8..e9548ab782c98c 100644
--- a/tensorflow/compiler/mlir/mlir_graph_optimization_pass_test.cc
+++ b/tensorflow/compiler/mlir/mlir_graph_optimization_pass_test.cc
@@ -22,9 +22,11 @@ limitations under the License.
 #include <vector>
 
 #include "mlir/IR/Builders.h"  // from @llvm-project
+#include "tensorflow/core/common_runtime/optimization_registry.h"
 #include "tensorflow/core/framework/tensor_testutil.h"
 #include "tensorflow/core/lib/monitoring/cell_reader.h"
 #include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/public/session_options.h"
 
 namespace tensorflow {
 
@@ -307,4 +309,105 @@ TEST(MlirV1CompatOptimizationPassRegistry, RegisterMultiplePassesFails) {
       "Only a single pass can be registered");
 }
 
+class MlirGraphOptimizationV1PassTest : public Test {  // V1 pass test fixture
+ public:
+  void Init(Status pass_run_result,
+            const std::vector<MlirOptimizationPassState>& pass_states) {
+    graph_ = std::make_unique<Graph>(OpRegistry::Global());
+    MlirV1CompatOptimizationPassRegistry::Global().ClearPass();
+    for (const MlirOptimizationPassState& pass_state : pass_states) {
+      auto optimization_pass =
+          std::make_unique<NiceMock<MockMlirV1CompatOptimizationPass>>();
+      ON_CALL(*optimization_pass, GetPassState(_, _, _, _))
+          .WillByDefault(Return(pass_state));
+      ON_CALL(*optimization_pass, Run(_, _))
+          .WillByDefault(Return(pass_run_result));
+      MlirV1CompatOptimizationPassRegistry::Global().Add(
+          std::move(optimization_pass));
+      pass_result_expected_[pass_state][pass_run_result.ok()]++;
+    }
+    flib_ = std::make_unique<FunctionLibraryDefinition>(graph_->flib_def());
+    InitGraphOptions();
+  }
+
+  void verifyGraph(const GraphDef& original_graph_def, bool changed = false) {
+// Proto matchers might be unavailable in OSS builds.
+#if defined(PLATFORM_GOOGLE)
+    GraphDef resulted_graph_def;
+    graph_->ToGraphDef(&resulted_graph_def);
+
+    if (changed)
+      EXPECT_THAT(resulted_graph_def,
+                  Not(::testing::proto::IgnoringRepeatedFieldOrdering(
+                      ::testing::EquivToProto(original_graph_def))));
+    else
+      EXPECT_THAT(resulted_graph_def,
+                  ::testing::proto::IgnoringRepeatedFieldOrdering(
+                      ::testing::EquivToProto(original_graph_def)));
+#endif
+  }
+
+  void InitGraphOptions() {
+    session_options_.config = config_proto_;
+    graph_optimization_pass_options_.device_set = &device_set_;
+    graph_optimization_pass_options_.session_options = &session_options_;
+    graph_optimization_pass_options_.graph = &graph_;
+    graph_optimization_pass_options_.flib_def = flib_.get();
+  }
+
+  void verifyCounters() {
+    EXPECT_EQ(mlir_function_pass_fallback_count_.Read(kSuccess),
+              pass_result_expected_[MlirOptimizationPassState::FallbackEnabled]
+                                   [false]);  // NOTE(review): not [true]?
+    EXPECT_EQ(mlir_function_pass_fallback_count_.Read(kFailure),
+              pass_result_expected_[MlirOptimizationPassState::FallbackEnabled]
+                                   [false]);
+    EXPECT_EQ(mlir_function_pass_graph_conversion_count_.Read(kOk), 0);
+  }
+
+  void TearDown() override {
+    MlirV1CompatOptimizationPassRegistry::Global().ClearPass();
+  }
+
+  ConfigProto config_proto_;
+  FunctionOptimizationPass::FunctionOptions function_options_;
+  MlirV1CompatGraphOptimizationPass function_optimization_pass_;
+  DeviceSet device_set_;
+  std::unique_ptr<Graph> graph_;
+  std::unique_ptr<FunctionLibraryDefinition> flib_;
+  std::vector<std::string> control_ret_node_names_;
+  bool control_rets_updated_{false};
+  SessionOptions session_options_;
+  tensorflow::GraphOptimizationPassOptions graph_optimization_pass_options_;
+  std::map<MlirOptimizationPassState, std::map<bool, int64_t>>
+      pass_result_expected_;
+  monitoring::testing::CellReader<int64_t> mlir_function_pass_fallback_count_ =
+      monitoring::testing::CellReader<int64_t>(
+          /* metric name */
+          "/tensorflow/core/mlir_function_pass_fallback_count");
+  monitoring::testing::CellReader<int64_t>
+      mlir_graph_optimization_pass_fallback_count_ =
+          monitoring::testing::CellReader<int64_t>(
+              /* metric name */
+              "/tensorflow/core/mlir_graph_optimization_pass_fallback_count");
+  monitoring::testing::CellReader<int64_t>
+      mlir_function_pass_graph_conversion_count_ =
+          monitoring::testing::CellReader<int64_t>(
+              /* metric name */
+              "/tensorflow/core/mlir_function_pass_graph_conversion_count");
+};
+
+TEST_F(MlirGraphOptimizationV1PassTest, OptimizationPassDoesNotFailFallback) {
+  Init(OkStatus(), {MlirOptimizationPassState::FallbackEnabled});
+
+  GraphDef original_graph_def;  // snapshot taken before the pass runs
+  graph_->ToGraphDef(&original_graph_def);
+
+  EXPECT_EQ(function_optimization_pass_.Run(graph_optimization_pass_options_),
+            OkStatus());
+
+  verifyGraph(original_graph_def, /*changed=*/false);
+  verifyCounters();
+}
+
 }  // namespace tensorflow

From 3af7d51dcdcc378065a5add52d7fde1d37797609 Mon Sep 17 00:00:00 2001
From: Anlun Xu <anlunx@google.com>
Date: Tue, 3 Oct 2023 16:01:39 -0700
Subject: [PATCH 547/567] [xla:gpu] NFC: Add a function to select instruction
 sequences to be constructed as command buffers

https://github.com/openxla/xla/issues/5756

PiperOrigin-RevId: 570522845
---
 third_party/xla/xla/service/gpu/BUILD         |  4 +
 .../service/gpu/command_buffer_scheduling.cc  | 85 +++++++++++++++++++
 .../service/gpu/command_buffer_scheduling.h   |  6 ++
 .../gpu/command_buffer_scheduling_test.cc     | 75 ++++++++++++++++
 4 files changed, 170 insertions(+)

diff --git a/third_party/xla/xla/service/gpu/BUILD b/third_party/xla/xla/service/gpu/BUILD
index d8812e9f76377b..63cba952ff6b7b 100644
--- a/third_party/xla/xla/service/gpu/BUILD
+++ b/third_party/xla/xla/service/gpu/BUILD
@@ -2476,6 +2476,7 @@ cc_library(
         "//xla:statusor",
         "//xla/hlo/ir:hlo",
         "//xla/service:hlo_pass",
+        "@com_google_absl//absl/algorithm:container",
         "@com_google_absl//absl/container:flat_hash_set",
         "@com_google_absl//absl/strings",
         "@local_tsl//tsl/platform:errors",
@@ -2488,8 +2489,11 @@ xla_cc_test(
     srcs = ["command_buffer_scheduling_test.cc"],
     deps = [
         ":command_buffer_scheduling",
+        "//xla/hlo/ir:hlo",
         "//xla/tests:hlo_test_base",
+        "//xla/tests:verified_hlo_module",
         "@com_google_googletest//:gtest_main",
+        "@local_tsl//tsl/platform:statusor",
     ],
 )
 
diff --git a/third_party/xla/xla/service/gpu/command_buffer_scheduling.cc b/third_party/xla/xla/service/gpu/command_buffer_scheduling.cc
index 25fe4efe6afd5b..c0442831fdcbcc 100644
--- a/third_party/xla/xla/service/gpu/command_buffer_scheduling.cc
+++ b/third_party/xla/xla/service/gpu/command_buffer_scheduling.cc
@@ -18,18 +18,103 @@ limitations under the License.
 #include <cstdint>
 #include <vector>
 
+#include "absl/algorithm/container.h"
 #include "absl/container/flat_hash_set.h"
 #include "absl/strings/string_view.h"
 #include "xla/hlo/ir/hlo_computation.h"
 #include "xla/hlo/ir/hlo_instruction.h"
 #include "xla/hlo/ir/hlo_instructions.h"
 #include "xla/hlo/ir/hlo_opcode.h"
+#include "xla/hlo/ir/hlo_schedule.h"
 #include "xla/statusor.h"
 #include "tsl/platform/errors.h"
 #include "tsl/platform/statusor.h"
 
 namespace xla::gpu {
 
+namespace {
+
+// We categorize HLO instructions into two types.
+// 1. Commands: Instructions that correspond to a command that will be
+// submitted to a GPU. Fused computations and library calls fall into this
+// category; IsCommand currently recognizes only fusion instructions.
+// 2. Intermediates: Instructions that produce intermediate values that are
+// used by commands.
+bool IsCommand(const HloInstruction* inst) {
+  // TODO(anlunx): Add support for conditionals and while loops.
+  return inst->opcode() == HloOpcode::kFusion;
+}
+
+bool IsIntermediate(const HloInstruction* inst) {
+  switch (inst->opcode()) {
+    case HloOpcode::kConstant:
+    case HloOpcode::kGetTupleElement:
+    case HloOpcode::kParameter:
+      return true;
+    default:
+      return false;
+  }
+}
+
+void RemoveTrailingIntermediates(HloInstructionSequence& seq) {
+  std::vector<HloInstruction*> instructions = seq.instructions();  // snapshot
+  for (int i = instructions.size() - 1; i >= 0; i--) {
+    HloInstruction* inst = instructions[i];
+    if (IsIntermediate(inst)) {
+      seq.remove_instruction(inst);
+    } else {
+      break;  // first non-intermediate from the end: stop trimming
+    }
+  }
+}
+
+constexpr int kMinNumCommands = 2;  // runs with fewer commands are dropped
+
+}  // namespace
+
+// The input is a scheduled sequence of instructions. This function collects
+// subsequences (>= kMinNumCommands commands each) to become command buffers.
+std::vector<HloInstructionSequence>
+CommandBufferScheduling::CollectCommandBufferSequences(
+    const HloInstructionSequence inst_sequence) {
+  struct Accumulator {
+    std::vector<HloInstructionSequence> sequences;
+    HloInstructionSequence current_seq;
+    int num_commands_in_current_seq = 0;
+  };
+
+  auto start_new_sequence = [](Accumulator* acc) -> Accumulator* {
+    if (acc->num_commands_in_current_seq >= kMinNumCommands) {
+      RemoveTrailingIntermediates(acc->current_seq);
+      acc->sequences.push_back(acc->current_seq);
+    }
+    acc->current_seq = HloInstructionSequence();
+    acc->num_commands_in_current_seq = 0;
+    return acc;
+  };
+
+  auto process_instruction = [&start_new_sequence](
+                                 Accumulator* acc,
+                                 HloInstruction* inst) -> Accumulator* {
+    if (IsCommand(inst)) {
+      acc->current_seq.push_back(inst);
+      acc->num_commands_in_current_seq += 1;
+      return acc;
+    } else if (IsIntermediate(inst)) {
+      if (acc->current_seq.size() > 0) {  // skip leading intermediates
+        acc->current_seq.push_back(inst);
+      }
+      return acc;
+    }
+    return start_new_sequence(acc);  // any other op ends the current run
+  };
+
+  std::vector<HloInstruction*> instructions = inst_sequence.instructions();
+  Accumulator acc;
+  absl::c_accumulate(instructions, &acc, process_instruction);
+  return start_new_sequence(&acc)->sequences;  // flush the final run
+}
+
 StatusOr<bool> CommandBufferScheduling::Run(
     HloModule* module,
     const absl::flat_hash_set<absl::string_view>& execution_threads) {
diff --git a/third_party/xla/xla/service/gpu/command_buffer_scheduling.h b/third_party/xla/xla/service/gpu/command_buffer_scheduling.h
index 69fd3b44114d7b..eed3ce84917986 100644
--- a/third_party/xla/xla/service/gpu/command_buffer_scheduling.h
+++ b/third_party/xla/xla/service/gpu/command_buffer_scheduling.h
@@ -15,9 +15,12 @@ limitations under the License.
 #ifndef XLA_SERVICE_GPU_COMMAND_BUFFER_SCHEDULING_H_
 #define XLA_SERVICE_GPU_COMMAND_BUFFER_SCHEDULING_H_
 
+#include <vector>
+
 #include "absl/container/flat_hash_set.h"
 #include "absl/strings/string_view.h"
 #include "xla/hlo/ir/hlo_module.h"
+#include "xla/hlo/ir/hlo_schedule.h"
 #include "xla/service/hlo_pass_interface.h"
 #include "xla/statusor.h"
 
@@ -68,6 +71,9 @@ class CommandBufferScheduling : public HloModulePass {
   StatusOr<bool> Run(
       HloModule* module,
       const absl::flat_hash_set<absl::string_view>& execution_threads) override;
+
+  static std::vector<HloInstructionSequence> CollectCommandBufferSequences(
+      HloInstructionSequence inst_sequence);
 };
 
 }  // namespace xla::gpu
diff --git a/third_party/xla/xla/service/gpu/command_buffer_scheduling_test.cc b/third_party/xla/xla/service/gpu/command_buffer_scheduling_test.cc
index 83b681dbbca135..86d95a21893d42 100644
--- a/third_party/xla/xla/service/gpu/command_buffer_scheduling_test.cc
+++ b/third_party/xla/xla/service/gpu/command_buffer_scheduling_test.cc
@@ -14,8 +14,16 @@ limitations under the License.
 ==============================================================================*/
 #include "xla/service/gpu/command_buffer_scheduling.h"
 
+#include <memory>
+#include <vector>
+
 #include <gtest/gtest.h>
+#include "xla/hlo/ir/hlo_instruction.h"
+#include "xla/hlo/ir/hlo_opcode.h"
+#include "xla/hlo/ir/hlo_schedule.h"
 #include "xla/tests/hlo_test_base.h"
+#include "xla/tests/verified_hlo_module.h"
+#include "tsl/platform/statusor.h"
 
 namespace xla::gpu {
 
@@ -53,6 +61,73 @@ TEST_F(CommandBufferSchedulingTest, SingleFusion) {
 // CHECK: })");
 }
 
+TEST_F(CommandBufferSchedulingTest, CollectCommandBufferSequence) {
+  const char* hlo = R"(
+      HloModule TestModule
+
+      %fused_computation(param_0: s32[], param_1: s32[]) -> s32[] {
+        %p0 = s32[] parameter(0)
+        %p1 = s32[] parameter(1)
+        ROOT %add = s32[] add(s32[] %p0, s32[] %p1)
+      }
+
+      %fused_computation.1(param_0: s32[], param_1: s32[]) -> s32[] {
+        %p0 = s32[] parameter(0)
+        %p1 = s32[] parameter(1)
+        ROOT %add = s32[] add(s32[] %p0, s32[] %p1)
+      }
+
+      %fused_computation.2(param_0: s32[], param_1: s32[]) -> s32[] {
+        %p0 = s32[] parameter(0)
+        %p1 = s32[] parameter(1)
+        ROOT %add = s32[] add(s32[] %p0, s32[] %p1)
+      }
+
+      %fused_computation.3(param_0: s32[], param_1: s32[]) -> s32[] {
+        %p0 = s32[] parameter(0)
+        %p1 = s32[] parameter(1)
+        ROOT %add = s32[] add(s32[] %p0, s32[] %p1)
+      }
+
+      ENTRY %main (a: s32[], b: s32[], c: s32[], d: s32[]) -> s32[] {
+        %a = s32[] parameter(0)
+        %b = s32[] parameter(1)
+        %fusion = s32[] fusion(s32[] %a, s32[] %b), kind=kLoop, calls=%fused_computation
+        %c = s32[] parameter(2)
+        %fusion.1 = s32[] fusion(s32[] %fusion, s32[] %c), kind=kLoop, calls=%fused_computation.1
+        %d = s32[] parameter(3)
+        %custom-call = s32[] custom-call(s32[] %fusion.1, s32[] %d), custom_call_target="some target"
+        %fusion.2 = s32[] fusion(s32[] %custom-call, s32[] %c), kind=kLoop, calls=%fused_computation.2
+        ROOT %fusion.3 = s32[] fusion(s32[] %custom-call, s32[] %fusion.2), kind=kLoop, calls=%fused_computation.3
+      })";
+
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<VerifiedHloModule> module,
+                          ParseAndReturnVerifiedModule(hlo));
+
+  HloInstructionSequence seq;
+  for (HloInstruction* x : module->entry_computation()->instructions()) {
+    seq.push_back(x);
+  }
+  EXPECT_EQ(seq.size(), 9);  // 4 parameters, 4 fusions, 1 custom-call
+
+  std::vector<HloInstructionSequence> command_buffer_sequences =
+      CommandBufferScheduling::CollectCommandBufferSequences(seq);
+  EXPECT_EQ(command_buffer_sequences.size(), 2);  // split at the custom-call
+
+  std::vector<HloInstruction*> seq_0 =
+      command_buffer_sequences[0].instructions();
+  EXPECT_EQ(seq_0.size(), 3);  // the trailing parameter is trimmed
+  EXPECT_EQ(seq_0[0]->opcode(), HloOpcode::kFusion);
+  EXPECT_EQ(seq_0[1]->opcode(), HloOpcode::kParameter);
+  EXPECT_EQ(seq_0[2]->opcode(), HloOpcode::kFusion);
+
+  std::vector<HloInstruction*> seq_1 =
+      command_buffer_sequences[1].instructions();
+  EXPECT_EQ(seq_1.size(), 2);
+  EXPECT_EQ(seq_1[0]->opcode(), HloOpcode::kFusion);
+  EXPECT_EQ(seq_1[1]->opcode(), HloOpcode::kFusion);
+}
+
 }  // namespace
 
 }  // namespace xla::gpu

From 643f2a033c121544a518d9c7f32b87767fabd6bc Mon Sep 17 00:00:00 2001
From: Matthias Kramm <kramm@google.com>
Date: Tue, 3 Oct 2023 16:02:54 -0700
Subject: [PATCH 548/567] Verify that row_ids are sorted.

PiperOrigin-RevId: 570523185
---
 .../core/tpu/kernels/sparse_core_preprocess_ops.cc   | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/tensorflow/core/tpu/kernels/sparse_core_preprocess_ops.cc b/tensorflow/core/tpu/kernels/sparse_core_preprocess_ops.cc
index 286ef7c472a26f..d02cf8e1fe0761 100644
--- a/tensorflow/core/tpu/kernels/sparse_core_preprocess_ops.cc
+++ b/tensorflow/core/tpu/kernels/sparse_core_preprocess_ops.cc
@@ -39,7 +39,6 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/types.h"
-#include "tensorflow/core/framework/types.pb.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/status.h"
 #include "tensorflow/core/platform/tstring.h"
@@ -587,6 +586,17 @@ void GetMinibatchSplitsWithPhysicalReplicaOp::Compute(OpKernelContext* ctx) {
   const int32* col_ids_ptr = col_ids->flat<int32>().data();
   const float* gains_ptr = gains->flat<float>().data();
 
+#ifndef NDEBUG
+  // row_ids are typically computed by ConvertToCooTensorOp, so we
+  // expect them to be sorted. (It doesn't really matter whether they're
+  // ascending or descending, but here, we check for the former.)
+  for (int i = 1; i < total_id_count; i++) {
+    OP_REQUIRES(ctx, row_ids_ptr[i - 1] <= row_ids_ptr[i],
+                absl::InvalidArgumentError(
+                    "row ids need to be sorted in ascending order."));
+  }
+#endif
+
   const int num_physical_replica = num_replica_ * num_sc_per_chip_;
 
   OP_REQUIRES(ctx, sample_count_ % num_sc_per_chip_ == 0,

From e9b0ded2c21cb0a43dde4b3c33f9a1f6ac43dafa Mon Sep 17 00:00:00 2001
From: Mason Chang <masonchang@google.com>
Date: Tue, 3 Oct 2023 16:07:45 -0700
Subject: [PATCH 549/567] Update comments that the bridge will try to transform
 to TF Executor that is suitable for graph export

PiperOrigin-RevId: 570524514
---
 .../compiler/mlir/tf2xla/api/v1/tf_dialect_to_executor.h    | 6 ++++++
 .../compiler/mlir/tf2xla/api/v2/tf_dialect_to_executor.h    | 6 ++++++
 2 files changed, 12 insertions(+)

diff --git a/tensorflow/compiler/mlir/tf2xla/api/v1/tf_dialect_to_executor.h b/tensorflow/compiler/mlir/tf2xla/api/v1/tf_dialect_to_executor.h
index 5822674734b05a..2c1386f69f1415 100644
--- a/tensorflow/compiler/mlir/tf2xla/api/v1/tf_dialect_to_executor.h
+++ b/tensorflow/compiler/mlir/tf2xla/api/v1/tf_dialect_to_executor.h
@@ -33,6 +33,12 @@ namespace v1 {
 // for replication and parallel execution ops, which may slow performance.
 // Prefer to use the v2 of this API.
 //
+// This also converts the Tensorflow Dialect MLIR into the Tensorflow Executor
+// dialect that is suitable to be exported to GraphDef. Graph -> MLIR -> Graph
+// is not perfectly round-trippable, so this API will attempt to make the module
+// exportable and verify some properties of the Tensorflow Executor MLIR that
+// are required by Graph Export. It will return an error if it cannot.
+//
 // Input: A MLIR Module in the Tensorflow Dialect with no
 // `tf_device.cluster_func` ops.
 // Output: A MLIR module in the Tensorflow Executor Dialect.
diff --git a/tensorflow/compiler/mlir/tf2xla/api/v2/tf_dialect_to_executor.h b/tensorflow/compiler/mlir/tf2xla/api/v2/tf_dialect_to_executor.h
index 661d2a3e221e09..c410d2227f17d9 100644
--- a/tensorflow/compiler/mlir/tf2xla/api/v2/tf_dialect_to_executor.h
+++ b/tensorflow/compiler/mlir/tf2xla/api/v2/tf_dialect_to_executor.h
@@ -31,6 +31,12 @@ namespace v2 {
 // Tensorflow GraphDef. This API will add control dependencies and verify that
 // the conversion was successful.
 //
+// This also converts the Tensorflow Dialect MLIR into the Tensorflow Executor
+// dialect that is suitable to be exported to GraphDef. Graph -> MLIR -> Graph
+// is not perfectly round-trippable, so this API will attempt to make the module
+// exportable and verify some properties of the Tensorflow Executor MLIR that
+// are required by Graph Export. It will return an error if it cannot.
+//
 // Input: A MLIR Module in the Tensorflow Dialect with no
 // `tf_device.cluster_func` ops.
 // Output: A MLIR module in the Tensorflow Executor Dialect.

From 7bc727b7b61d3f5e21bc8298632440c99d99be4c Mon Sep 17 00:00:00 2001
From: Jieying Luo <jieying@google.com>
Date: Tue, 3 Oct 2023 16:10:35 -0700
Subject: [PATCH 550/567] [PJRT C API] Add C API for CreateViewOfDeviceBuffer.

Need to keep the on delete callback data in PjRtCApiClient.

PiperOrigin-RevId: 570525228
---
 third_party/xla/xla/pjrt/BUILD                |  2 +
 third_party/xla/xla/pjrt/c/BUILD              |  5 ++
 third_party/xla/xla/pjrt/c/pjrt_c_api.h       | 43 ++++++++-
 .../xla/xla/pjrt/c/pjrt_c_api_gpu_test.cc     | 89 +++++++++++++++++++
 .../xla/xla/pjrt/c/pjrt_c_api_helpers.cc      | 32 +++++++
 .../xla/xla/pjrt/c/pjrt_c_api_helpers.h       |  5 ++
 .../xla/xla/pjrt/c/pjrt_c_api_wrapper_impl.cc | 26 ++++++
 .../xla/xla/pjrt/c/pjrt_c_api_wrapper_impl.h  |  4 +
 third_party/xla/xla/pjrt/pjrt_c_api_client.cc | 51 +++++++++++
 third_party/xla/xla/pjrt/pjrt_c_api_client.h  |  5 +-
 .../xla/xla/pjrt/pjrt_c_api_client_test.cc    | 30 +++++++
 11 files changed, 286 insertions(+), 6 deletions(-)

diff --git a/third_party/xla/xla/pjrt/BUILD b/third_party/xla/xla/pjrt/BUILD
index 24e97a8c479e0e..4e2b8a59caeef6 100644
--- a/third_party/xla/xla/pjrt/BUILD
+++ b/third_party/xla/xla/pjrt/BUILD
@@ -865,9 +865,11 @@ xla_cc_test(
         ":pjrt_client",
         ":pjrt_compiler",
         ":pjrt_executable",
+        "//xla:literal_util",
         "//xla:shape_util",
         "//xla/client:xla_builder",
         "//xla/pjrt/c:pjrt_c_api_cpu_internal",
+        "//xla/tests:literal_test_util",
         "@com_google_googletest//:gtest_main",
         "@local_tsl//tsl/lib/core:status_test_util",
         "@local_tsl//tsl/platform:statusor",
diff --git a/third_party/xla/xla/pjrt/c/BUILD b/third_party/xla/xla/pjrt/c/BUILD
index 3838890340f678..110fd33afecb3a 100644
--- a/third_party/xla/xla/pjrt/c/BUILD
+++ b/third_party/xla/xla/pjrt/c/BUILD
@@ -226,12 +226,17 @@ xla_cc_test(
         ":pjrt_c_api_test_base",
         ":pjrt_c_api_test_common",
         ":pjrt_c_api_wrapper_impl",
+        "//xla:literal",
+        "//xla:literal_util",
+        "//xla:shape_util",
         "//xla:status",
         "//xla:statusor",
         "//xla/pjrt:pjrt_client",
         "//xla/pjrt:pjrt_common",
+        "//xla/pjrt:pjrt_future",
         "//xla/service:custom_call_target_registry",
         "//xla/service:gpu_plugin",
+        "//xla/tests:literal_test_util",
         "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/status",
         "@com_google_absl//absl/status:statusor",
diff --git a/third_party/xla/xla/pjrt/c/pjrt_c_api.h b/third_party/xla/xla/pjrt/c/pjrt_c_api.h
index 489acc800f1892..3c10018c720bab 100644
--- a/third_party/xla/xla/pjrt/c/pjrt_c_api.h
+++ b/third_party/xla/xla/pjrt/c/pjrt_c_api.h
@@ -53,7 +53,7 @@ extern "C" {
 // Changes include:
 // * Adding a new field to the PJRT_Api or argument structs
 // * Renaming a method or argument (doesn't affect ABI)
-#define PJRT_API_MINOR 32
+#define PJRT_API_MINOR 33
 
 // The plugin should set the major_version and minor_version of
 // PJRT_Api.pjrt_api_version to be the `PJRT_API_MAJOR` and `PJRT_API_MINOR` in
@@ -718,6 +718,43 @@ PJRT_DEFINE_STRUCT_TRAITS(PJRT_Client_BufferFromHostBuffer_Args, buffer);
 typedef PJRT_Error* PJRT_Client_BufferFromHostBuffer(
     PJRT_Client_BufferFromHostBuffer_Args* args);
 
+struct PJRT_Client_CreateViewOfDeviceBuffer_Args {
+  size_t struct_size;
+  void* priv;
+  PJRT_Client* client;
+  // A pointer to a non-owned device buffer. A PJRT_Buffer that is a non-owned
+  // view of this device buffer will be created.
+  void* device_buffer_ptr;
+  const int64_t* dims;
+  size_t num_dims;
+  PJRT_Buffer_Type element_type;
+  PJRT_Buffer_MemoryLayout* layout;
+  // The device that `device_buffer_ptr` is on.
+  PJRT_Device* device;
+  // A callback to be performed when the PJRT_Buffer is done with the on-device
+  // buffer. This callback is optional and can be a nullptr.
+  void (*on_delete_callback)(void* device_buffer_ptr, void* user_arg);
+  // `on_delete_callback_arg` will be passed to `on_delete_callback` as
+  // `user_arg` argument.
+  void* on_delete_callback_arg;
+  // A platform-specific stream handle that should contain the work or events
+  // needed to materialize the on-device buffer. It is optional and can be
+  // cast from a nullptr. PJRT_Client_CreateViewOfDeviceBuffer will append an
+  // event to `stream` that indicates when the returned buffer is ready to
+  // use. This is intended to support dlpack on GPU and is not expected to be
+  // supported on all hardware platforms.
+  intptr_t stream;
+  PJRT_Buffer* buffer;  // out
+};
+PJRT_DEFINE_STRUCT_TRAITS(PJRT_Client_CreateViewOfDeviceBuffer_Args, buffer);
+
+// Creates a PJRT buffer that is a non-owned view of an on-device buffer
+// (typically allocated by another library). The buffer may be mutated,
+// for example, if the buffer is donated to an Execute operation. This method is
+// not required on all hardware platforms.
+typedef PJRT_Error* PJRT_Client_CreateViewOfDeviceBuffer(
+    PJRT_Client_CreateViewOfDeviceBuffer_Args* args);
+
 // -------------------------- Device Descriptions ------------------------------
 
 // Device descriptions may be associated with an actual device
@@ -2050,10 +2087,12 @@ typedef struct {
   _PJRT_API_STRUCT_FIELD(PJRT_Executable_OutputDimensions);
 
   _PJRT_API_STRUCT_FIELD(PJRT_Buffer_CopyToMemory);
+
+  _PJRT_API_STRUCT_FIELD(PJRT_Client_CreateViewOfDeviceBuffer);
 } PJRT_Api;
 
 const size_t PJRT_Api_STRUCT_SIZE =
-    PJRT_STRUCT_SIZE(PJRT_Api, PJRT_Buffer_CopyToMemory);
+    PJRT_STRUCT_SIZE(PJRT_Api, PJRT_Client_CreateViewOfDeviceBuffer);
 
 #undef _PJRT_API_STRUCT_FIELD
 
diff --git a/third_party/xla/xla/pjrt/c/pjrt_c_api_gpu_test.cc b/third_party/xla/xla/pjrt/c/pjrt_c_api_gpu_test.cc
index 6fd6115ca79905..ded7bf4e5c81f0 100644
--- a/third_party/xla/xla/pjrt/c/pjrt_c_api_gpu_test.cc
+++ b/third_party/xla/xla/pjrt/c/pjrt_c_api_gpu_test.cc
@@ -16,7 +16,9 @@ limitations under the License.
 #include "xla/pjrt/c/pjrt_c_api_gpu.h"
 
 #include <cstdint>
+#include <functional>
 #include <memory>
+#include <numeric>
 #include <string>
 #include <thread>  // NOLINT(build/c++11)
 #include <utility>
@@ -32,6 +34,8 @@ limitations under the License.
 #include "absl/synchronization/mutex.h"
 #include "absl/time/clock.h"
 #include "absl/time/time.h"
+#include "xla/literal.h"
+#include "xla/literal_util.h"
 #include "xla/pjrt/c/pjrt_c_api.h"
 #include "xla/pjrt/c/pjrt_c_api_gpu_extension.h"
 #include "xla/pjrt/c/pjrt_c_api_helpers.h"
@@ -40,9 +44,13 @@ limitations under the License.
 #include "xla/pjrt/c/pjrt_c_api_wrapper_impl.h"
 #include "xla/pjrt/pjrt_client.h"
 #include "xla/pjrt/pjrt_common.h"
+#include "xla/pjrt/pjrt_future.h"
 #include "xla/service/custom_call_target_registry.h"
+#include "xla/shape.h"
+#include "xla/shape_util.h"
 #include "xla/status.h"
 #include "xla/statusor.h"
+#include "xla/tests/literal_test_util.h"
 #include "tsl/platform/status.h"
 #include "tsl/platform/status_matchers.h"
 #include "tsl/platform/statusor.h"
@@ -65,6 +73,87 @@ class PjrtCApiGpuTest : public PjrtCApiTestBase {
   PjrtCApiGpuTest() : PjrtCApiTestBase(GetPjrtApi()) {}
 };
 
+TEST_F(PjrtCApiGpuTest, CreateViewOfDeviceBuffer) {
+  // Prepares a device memory ptr on GPU.
+  std::unique_ptr<PJRT_Buffer, ::pjrt::PJRT_BufferDeleter> buffer =
+      create_buffer().first;
+  PJRT_Buffer_OpaqueDeviceMemoryDataPointer_Args device_buffer_ptr_args;
+  device_buffer_ptr_args.struct_size =
+      PJRT_Buffer_OpaqueDeviceMemoryDataPointer_Args_STRUCT_SIZE;
+  device_buffer_ptr_args.priv = nullptr;
+  device_buffer_ptr_args.buffer = buffer.get();
+  PJRT_Error* device_buffer_ptr_error =
+      api_->PJRT_Buffer_OpaqueDeviceMemoryDataPointer(&device_buffer_ptr_args);
+  ASSERT_EQ(device_buffer_ptr_error, nullptr);
+  // Looks up a device.
+  PJRT_Buffer_Device_Args device_args = PJRT_Buffer_Device_Args{
+      /*struct_size=*/PJRT_Buffer_Device_Args_STRUCT_SIZE,
+      /*priv=*/nullptr,
+      /*buffer=*/buffer.get(),
+  };
+  PJRT_Error* device_error = api_->PJRT_Buffer_Device(&device_args);
+  ASSERT_EQ(device_error, nullptr);
+
+  // Prepares PJRT_Client_CreateViewOfDeviceBuffer_Args.
+  PJRT_Client_CreateViewOfDeviceBuffer_Args create_view_args;
+  create_view_args.struct_size =
+      PJRT_Client_CreateViewOfDeviceBuffer_Args_STRUCT_SIZE;
+  create_view_args.priv = nullptr;
+  create_view_args.client = client_;
+  create_view_args.device_buffer_ptr = device_buffer_ptr_args.device_memory_ptr;
+  xla::Shape shape = xla::ShapeUtil::MakeShape(xla::S32, {4});
+  create_view_args.dims = shape.dimensions().data();
+  create_view_args.num_dims = shape.dimensions().size();
+  create_view_args.element_type =
+      pjrt::ConvertToPjRtBufferType(shape.element_type());
+  pjrt::BufferMemoryLayoutData c_layout_data;
+  TF_ASSERT_OK_AND_ASSIGN(
+      c_layout_data, pjrt::ConvertToBufferMemoryLayoutData(shape.layout()));
+  create_view_args.layout = &(c_layout_data.c_layout);
+  create_view_args.device = device_args.device;
+  std::function<void()> on_delete_callback = []() mutable {};
+  create_view_args.on_delete_callback_arg =
+      new std::function(on_delete_callback);
+  create_view_args.on_delete_callback = [](void* device_buffer_ptr,
+                                           void* user_arg) {
+    auto c_func = reinterpret_cast<std::function<void()>*>(user_arg);
+    (*c_func)();
+    delete c_func;  // frees the std::function allocated with `new` above
+  };
+  create_view_args.stream = reinterpret_cast<intptr_t>(nullptr);
+
+  PJRT_Error* error =
+      api_->PJRT_Client_CreateViewOfDeviceBuffer(&create_view_args);
+
+  ASSERT_EQ(error, nullptr);
+  std::unique_ptr<PJRT_Buffer, ::pjrt::PJRT_BufferDeleter> view_buffer(
+      create_view_args.buffer, ::pjrt::MakeBufferDeleter(api_));
+
+  // Transfers view_buffer to host. NOTE(review): view is S32, host is F32.
+  PJRT_Buffer_ToHostBuffer_Args to_host_args;
+  to_host_args.struct_size = PJRT_Buffer_ToHostBuffer_Args_STRUCT_SIZE;
+  to_host_args.priv = nullptr;
+  to_host_args.src = view_buffer.get();
+  xla::Shape host_shape = xla::ShapeUtil::MakeShape(xla::F32, {4});
+  auto literal = std::make_shared<xla::Literal>(host_shape);
+  to_host_args.host_layout = nullptr;
+  to_host_args.dst = literal->untyped_data();
+  to_host_args.dst_size = xla::ShapeUtil::ByteSizeOfElements(host_shape);
+  to_host_args.event = nullptr;
+
+  PJRT_Error* to_host_error = api_->PJRT_Buffer_ToHostBuffer(&to_host_args);
+
+  ASSERT_EQ(to_host_error, nullptr);
+  xla::PjRtFuture<absl::Status> transfer_to_host =
+      ::pjrt::ConvertCEventToCppFuture(to_host_args.event, api_);
+  TF_CHECK_OK(transfer_to_host.Await());
+  ASSERT_EQ(literal->data<float>().size(), 4);
+  std::vector<float> float_data(4);
+  std::iota(float_data.begin(), float_data.end(), 41.0f);
+  EXPECT_TRUE(xla::LiteralTestUtil::Equal(
+      xla::LiteralUtil::CreateR1<float>(float_data), *literal));
+}
+
 std::unique_ptr<::pjrt::PJRT_KeyValueCallbackData> CreateTestCKVCallback(
     absl::flat_hash_map<std::string, std::string>* kv_store, absl::Mutex& mu) {
   xla::PjRtClient::KeyValueGetCallback kv_get =
diff --git a/third_party/xla/xla/pjrt/c/pjrt_c_api_helpers.cc b/third_party/xla/xla/pjrt/c/pjrt_c_api_helpers.cc
index 1de705ecaf302a..833ea6f71c02a9 100644
--- a/third_party/xla/xla/pjrt/c/pjrt_c_api_helpers.cc
+++ b/third_party/xla/xla/pjrt/c/pjrt_c_api_helpers.cc
@@ -38,6 +38,7 @@ limitations under the License.
 #include "xla/pjrt/pjrt_common.h"
 #include "xla/pjrt/pjrt_future.h"
 #include "xla/primitive_util.h"
+#include "xla/shape_util.h"
 #include "xla/status.h"
 #include "xla/statusor.h"
 #include "xla/util.h"
@@ -820,4 +821,35 @@ PJRT_Buffer_MemoryLayout GetMemoryLayout(const PJRT_Api* api,
   return args.layout;
 }
 
+// Builds an xla::Shape from the element type and the `num_dims` extents at
+// `dims`. A non-null `layout` must be of the tiled kind; it is converted and
+// installed on the shape. Any other layout kind is an InvalidArgument error.
+xla::StatusOr<xla::Shape> BuildXlaShapeFromC(PJRT_Buffer_Type element_type,
+                                             const int64_t* dims,
+                                             size_t num_dims,
+                                             PJRT_Buffer_MemoryLayout* layout) {
+  xla::Shape shape =
+      xla::ShapeUtil::MakeShape(ConvertFromPjRtBufferType(element_type),
+                                absl::Span<const int64_t>(dims, num_dims));
+  if (layout == nullptr) {
+    // No layout supplied: hand back the shape exactly as MakeShape built it.
+    return shape;
+  }
+  switch (layout->type) {
+    case PJRT_Buffer_MemoryLayout_Type::PJRT_Buffer_MemoryLayout_Type_Tiled: {
+      TF_ASSIGN_OR_RETURN(xla::Layout cpp_layout,
+                          ConvertToLayout(layout->tiled));
+      *shape.mutable_layout() = cpp_layout;
+      return shape;
+    }
+    case PJRT_Buffer_MemoryLayout_Type::PJRT_Buffer_MemoryLayout_Type_Strides:
+      return absl::InvalidArgumentError(
+          "PJRT_Buffer_MemoryLayout_Type_Strides is not supported to be "
+          "converted to a xla::Shape");
+    default:
+      return absl::InvalidArgumentError(absl::StrCat(
+          "Unexpected PJRT_Buffer_MemoryLayout_Type type: ", layout->type));
+  }
+}
+
 }  // namespace pjrt
diff --git a/third_party/xla/xla/pjrt/c/pjrt_c_api_helpers.h b/third_party/xla/xla/pjrt/c/pjrt_c_api_helpers.h
index 97b4be01fcd8ed..0c78f7aafe0ec2 100644
--- a/third_party/xla/xla/pjrt/c/pjrt_c_api_helpers.h
+++ b/third_party/xla/xla/pjrt/c/pjrt_c_api_helpers.h
@@ -229,6 +229,11 @@ absl::Span<const int64_t> GetDimensions(const PJRT_Api* api,
 PJRT_Buffer_MemoryLayout GetMemoryLayout(const PJRT_Api* api,
                                          PJRT_Buffer* buffer);
 
+xla::StatusOr<xla::Shape> BuildXlaShapeFromC(PJRT_Buffer_Type element_type,
+                                             const int64_t* dims,
+                                             size_t num_dims,
+                                             PJRT_Buffer_MemoryLayout* layout);
+
 }  // namespace pjrt
 
 #endif  // XLA_PJRT_C_PJRT_C_API_HELPERS_H_
diff --git a/third_party/xla/xla/pjrt/c/pjrt_c_api_wrapper_impl.cc b/third_party/xla/xla/pjrt/c/pjrt_c_api_wrapper_impl.cc
index 06a16a0aaeaee5..a624cccee3bb45 100644
--- a/third_party/xla/xla/pjrt/c/pjrt_c_api_wrapper_impl.cc
+++ b/third_party/xla/xla/pjrt/c/pjrt_c_api_wrapper_impl.cc
@@ -710,6 +710,32 @@ PJRT_Error* PJRT_Client_BufferFromHostBuffer(
   return nullptr;
 }
 
+// C API entry point: asks the wrapped C++ client for a buffer that views
+// `args->device_buffer_ptr` and returns it through `args->buffer`.
+PJRT_Error* PJRT_Client_CreateViewOfDeviceBuffer(
+    PJRT_Client_CreateViewOfDeviceBuffer_Args* args) {
+  PJRT_ASSIGN_OR_RETURN(xla::Shape shape,
+                        pjrt::BuildXlaShapeFromC(args->element_type, args->dims,
+                                                 args->num_dims, args->layout));
+  std::function<void()> on_delete_callback;
+  if (args->on_delete_callback != nullptr) {
+    on_delete_callback = [c_callback = args->on_delete_callback,
+                          c_callback_arg = args->on_delete_callback_arg,
+                          device_ptr = args->device_buffer_ptr]() {
+      c_callback(device_ptr, c_callback_arg);
+    };
+  }
+  std::optional<std::intptr_t> stream;  // Stays empty for a null stream.
+  if (reinterpret_cast<void*>(args->stream) != nullptr) stream = args->stream;
+  PJRT_ASSIGN_OR_RETURN(
+      std::unique_ptr<xla::PjRtBuffer> buffer,
+      args->client->client->CreateViewOfDeviceBuffer(
+          args->device_buffer_ptr, shape, args->device->device,
+          on_delete_callback, stream));
+  args->buffer = new PJRT_Buffer{std::move(buffer), args->client};
+  return nullptr;
+}
+
 // --------------------------------- Devices -----------------------------------
 
 PJRT_Error* PJRT_DeviceDescription_Id(PJRT_DeviceDescription_Id_Args* args) {
diff --git a/third_party/xla/xla/pjrt/c/pjrt_c_api_wrapper_impl.h b/third_party/xla/xla/pjrt/c/pjrt_c_api_wrapper_impl.h
index 9620aa70454b79..f26450cc027547 100644
--- a/third_party/xla/xla/pjrt/c/pjrt_c_api_wrapper_impl.h
+++ b/third_party/xla/xla/pjrt/c/pjrt_c_api_wrapper_impl.h
@@ -223,6 +223,8 @@ PJRT_Error* PJRT_Client_DefaultDeviceAssignment(
     PJRT_Client_DefaultDeviceAssignment_Args* args);
 PJRT_Error* PJRT_Client_BufferFromHostBuffer(
     PJRT_Client_BufferFromHostBuffer_Args* args);
+PJRT_Error* PJRT_Client_CreateViewOfDeviceBuffer(
+    PJRT_Client_CreateViewOfDeviceBuffer_Args* args);
 
 PJRT_Error* PJRT_DeviceDescription_Id(PJRT_DeviceDescription_Id_Args* args);
 PJRT_Error* PJRT_DeviceDescription_ProcessIndex(
@@ -559,6 +561,8 @@ constexpr PJRT_Api CreatePjrtApi(
       pjrt::PJRT_Executable_OutputDimensions,
       /*PJRT_Buffer_CopyToMemory=*/
       pjrt::PJRT_Buffer_CopyToMemory,
+      /*PJRT_Client_CreateViewOfDeviceBuffer=*/
+      pjrt::PJRT_Client_CreateViewOfDeviceBuffer,
   };
 }
 
diff --git a/third_party/xla/xla/pjrt/pjrt_c_api_client.cc b/third_party/xla/xla/pjrt/pjrt_c_api_client.cc
index 50177ffcaa0c9c..a8c82265fd17b0 100644
--- a/third_party/xla/xla/pjrt/pjrt_c_api_client.cc
+++ b/third_party/xla/xla/pjrt/pjrt_c_api_client.cc
@@ -583,6 +583,57 @@ StatusOr<std::unique_ptr<PjRtBuffer>> PjRtCApiClient::BufferFromHostBuffer(
       on_done_with_host_buffer, device, /*device_layout=*/nullptr);
 }
 
+// Creates a buffer that views existing device memory through the PJRT C API.
+// A non-null `on_delete_callback` is boxed on the heap for the C callback.
+StatusOr<std::unique_ptr<PjRtBuffer>> PjRtCApiClient::CreateViewOfDeviceBuffer(
+    void* device_ptr, const Shape& shape, PjRtDevice* device,
+    std::function<void()> on_delete_callback,
+    std::optional<std::intptr_t> stream) {
+  const PJRT_Api* c_api = pjrt_c_api();
+  // Reject unsupported plugins before allocating anything, so this early
+  // return cannot leak the boxed callback created further down.
+  // TODO(jieying): To be removed after 12/29/2023.
+  if (c_api->pjrt_api_version.minor_version < 33) {
+    return Unimplemented(
+        "The plugin does not support CreateViewOfDeviceBuffer");
+  }
+  PJRT_Client_CreateViewOfDeviceBuffer_Args args;
+  args.struct_size = PJRT_Client_CreateViewOfDeviceBuffer_Args_STRUCT_SIZE;
+  args.priv = nullptr;
+  args.client = c_client_.get();
+  args.device_buffer_ptr = device_ptr;
+  args.dims = shape.dimensions().data();
+  args.num_dims = shape.dimensions().size();
+  args.element_type = pjrt::ConvertToPjRtBufferType(shape.element_type());
+  // Must outlive the C call below because `args.layout` points into it.
+  pjrt::BufferMemoryLayoutData c_layout_data;
+  if (shape.has_layout()) {
+    TF_ASSIGN_OR_RETURN(c_layout_data,
+                        pjrt::ConvertToBufferMemoryLayoutData(shape.layout()));
+    args.layout = &(c_layout_data.c_layout);
+  } else {
+    args.layout = nullptr;
+  }
+  args.device = tensorflow::down_cast<PjRtCApiDevice*>(device)->c_device();
+  args.stream = stream.value_or(reinterpret_cast<intptr_t>(nullptr));
+  if (on_delete_callback != nullptr) {
+    args.on_delete_callback_arg =
+        new std::function(std::move(on_delete_callback));
+    args.on_delete_callback = [](void* device_buffer_ptr, void* user_arg) {
+      auto* c_func = reinterpret_cast<std::function<void()>*>(user_arg);
+      (*c_func)();
+      delete c_func;
+    };
+  } else {
+    args.on_delete_callback = nullptr;
+    args.on_delete_callback_arg = nullptr;
+  }
+  RETURN_STATUS_IF_PJRT_ERROR(
+      c_api->PJRT_Client_CreateViewOfDeviceBuffer(&args), c_api);
+  return std::unique_ptr<PjRtBuffer>(
+      std::make_unique<PjRtCApiBuffer>(this, args.buffer));
+}
+
 const PJRT_Api* PjRtCApiClient::pjrt_c_api() const { return c_api_; }
 
 // --------------------------------- Devices -----------------------------------
diff --git a/third_party/xla/xla/pjrt/pjrt_c_api_client.h b/third_party/xla/xla/pjrt/pjrt_c_api_client.h
index 4dbccb3faa0484..1b1eecf8e188b5 100644
--- a/third_party/xla/xla/pjrt/pjrt_c_api_client.h
+++ b/third_party/xla/xla/pjrt/pjrt_c_api_client.h
@@ -268,10 +268,7 @@ class PjRtCApiClient : public PjRtClient {
   StatusOr<std::unique_ptr<PjRtBuffer>> CreateViewOfDeviceBuffer(
       void* device_ptr, const Shape& shape, PjRtDevice* device,
       std::function<void()> on_delete_callback,
-      std::optional<std::intptr_t> stream) override {
-    return Unimplemented(
-        "PJRT C API does not support CreateViewOfDeviceBuffer");
-  }
+      std::optional<std::intptr_t> stream) override;
 
   StatusOr<std::uintptr_t> UnsafeBufferPointer(PjRtBuffer* buffer) override;
 
diff --git a/third_party/xla/xla/pjrt/pjrt_c_api_client_test.cc b/third_party/xla/xla/pjrt/pjrt_c_api_client_test.cc
index d4477696f18d7a..1ab9f0b06882f5 100644
--- a/third_party/xla/xla/pjrt/pjrt_c_api_client_test.cc
+++ b/third_party/xla/xla/pjrt/pjrt_c_api_client_test.cc
@@ -18,11 +18,13 @@ limitations under the License.
 #include <memory>
 #include <optional>
 #include <string>
+#include <utility>
 #include <vector>
 
 #include <gmock/gmock.h>
 #include <gtest/gtest.h>
 #include "xla/client/xla_builder.h"
+#include "xla/literal_util.h"
 #include "xla/pjrt/c/pjrt_c_api_cpu_internal.h"
 #include "xla/pjrt/pjrt_api.h"
 #include "xla/pjrt/pjrt_client.h"
@@ -30,6 +32,7 @@ limitations under the License.
 #include "xla/pjrt/pjrt_executable.h"
 #include "xla/shape.h"
 #include "xla/shape_util.h"
+#include "xla/tests/literal_test_util.h"
 #include "tsl/lib/core/status_test_util.h"
 #include "tsl/platform/statusor.h"
 #include "tsl/platform/test.h"
@@ -119,5 +122,32 @@ TEST(PjRtCApiClientTest, EmptyExecutableFingerprint) {
   EXPECT_FALSE(fingerprint.has_value());
 }
 
+// Views host memory on device 0, copies to device 1, expects four zeros.
+TEST(PjRtClientTest, CreateViewAndCopyToDeviceAsyncExternalCpuOnly) {
+  SetUpCpuPjRtApi();
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<PjRtClient> client,
+                          GetCApiClient("cpu"));
+  ASSERT_GT(client->addressable_devices().size(), 1);
+  PjRtDevice* const view_device = client->addressable_devices()[0];
+  PjRtDevice* const copy_device = client->addressable_devices()[1];
+  const Shape shape = ShapeUtil::MakeShape(S32, {4});
+  std::vector<int32_t> data(4, 0);
+  auto* data_ptr = data.data();
+  // `data` is moved into the deletion callback after `data_ptr` was taken.
+  TF_ASSERT_OK_AND_ASSIGN(
+      auto buffer,
+      client->CreateViewOfDeviceBuffer(
+          data_ptr, shape, view_device,
+          /*on_delete_callback=*/[data = std::move(data)]() mutable {}));
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<PjRtBuffer> result,
+                          buffer->CopyToDevice(copy_device));
+  buffer.reset();
+  ASSERT_TRUE(result);
+  TF_ASSERT_OK_AND_ASSIGN(auto literal, result->ToLiteralSync());
+  const std::vector<int32_t> zeros(4, 0);
+  EXPECT_TRUE(
+      LiteralTestUtil::Equal(LiteralUtil::CreateR1<int32_t>(zeros), *literal));
+}
+
 }  // namespace
 }  // namespace xla

From 005309184ab1a88f2f42c93570735f2f5a89567a Mon Sep 17 00:00:00 2001
From: Tongfei Guo <tongfei@google.com>
Date: Tue, 3 Oct 2023 16:18:02 -0700
Subject: [PATCH 551/567] [XLA:SPMD] Support splitting Gather/Scatter sharding
 dimensions so that a shard dimension can be used by more than one partitioning
 method. This allows us to reduce communication: instead of an all-gather that
 fully replicates some dimensions, we only need to partially replicate those
 dimensions.

PiperOrigin-RevId: 570527101
---
 .../xla/xla/hlo/utils/hlo_sharding_util.cc    | 141 +++++++++-
 .../xla/xla/hlo/utils/hlo_sharding_util.h     |  35 ++-
 .../service/spmd/gather_scatter_handler.cc    | 251 +++++++++++++++---
 .../xla/service/spmd/spmd_partitioner_test.cc | 162 ++++++++++-
 4 files changed, 529 insertions(+), 60 deletions(-)

diff --git a/third_party/xla/xla/hlo/utils/hlo_sharding_util.cc b/third_party/xla/xla/hlo/utils/hlo_sharding_util.cc
index c37efb945740e2..652ece8b2db240 100644
--- a/third_party/xla/xla/hlo/utils/hlo_sharding_util.cc
+++ b/third_party/xla/xla/hlo/utils/hlo_sharding_util.cc
@@ -16,6 +16,7 @@ limitations under the License.
 #include "xla/hlo/utils/hlo_sharding_util.h"
 
 #include <algorithm>
+#include <cmath>
 #include <cstdint>
 #include <iostream>
 #include <iterator>
@@ -1131,6 +1132,31 @@ absl::InlinedVector<int64_t, 1> GetGatherScatterOperandPassthroughOperandDims(
   return passthrough_dims;
 }
 
+// Maps each operand passthrough dimension to its gather-output (or scatter-
+// update) dimension as listed in `offset_or_window_dims`.
+absl::InlinedVector<int64_t, 1>
+GetGatherScatterOperandPassthroughOutputOrUpdateDims(
+    const int64_t output_or_update_rank, const Shape& operand_shape,
+    absl::Span<const int64_t> collapsed_or_inserted_dims,
+    absl::Span<const int64_t> index_map,
+    absl::Span<const int64_t> offset_or_window_dims,
+    absl::Span<const int64_t> slice_size) {
+  const auto operand_passthrough_dims =
+      GetGatherScatterOperandPassthroughOperandDims(
+          operand_shape, collapsed_or_inserted_dims, index_map,
+          offset_or_window_dims, slice_size);
+  absl::InlinedVector<int64_t, 1> result;
+  // Operand dims seen so far that have no slot in `offset_or_window_dims`.
+  int64_t num_skipped = 0;
+  for (int64_t dim = 0; dim < operand_shape.rank(); ++dim) {
+    num_skipped += absl::c_linear_search(collapsed_or_inserted_dims, dim);
+    if (absl::c_linear_search(operand_passthrough_dims, dim)) {
+      result.push_back(offset_or_window_dims[dim - num_skipped]);
+    }
+  }
+  return result;
+}
+
 // If partitioning in the operand only happens in dimensions in passthrough
 // dimensions (offset dimensions in the gather output (or scatter update) that
 // have the same size as the operand), returns the corresponding output (or
@@ -2124,6 +2150,39 @@ absl::InlinedVector<int64_t, 1> GetScatterOperandPassthroughOperandDims(
       update_window_dims, slice_sizes);
 }
 
+// Returns the gather output dimensions that correspond to operand passthrough
+// dimensions, using the dimension numbers of the gather `hlo`.
+// `operand_sharding` is not read by this function.
+absl::InlinedVector<int64_t, 1> GetGatherOperandPassthroughOutputDims(
+    const Shape& output_shape, const Shape& operand_shape,
+    const HloSharding& operand_sharding, const HloInstruction& hlo,
+    absl::Span<const int64_t> slice_sizes) {
+  const auto& dnums = hlo.gather_dimension_numbers();
+  // The dimension-number lists convert to spans directly; no copies needed.
+  return GetGatherScatterOperandPassthroughOutputOrUpdateDims(
+      output_shape.rank(), operand_shape,
+      /*collapsed_or_inserted_dims=*/dnums.collapsed_slice_dims(),
+      /*index_map=*/dnums.start_index_map(),
+      /*offset_or_window_dims=*/dnums.offset_dims(), slice_sizes);
+}
+
+// Returns the scatter update dimensions that correspond to operand passthrough
+// dimensions, using the dimension numbers of the scatter `hlo`.
+// `operand_sharding` is not read by this function.
+absl::InlinedVector<int64_t, 1> GetScatterOperandPassthroughUpdateDims(
+    const Shape& update_shape, const Shape& operand_shape,
+    const HloSharding& operand_sharding, const HloInstruction& hlo,
+    absl::Span<const int64_t> slice_sizes) {
+  const auto& dnums = hlo.scatter_dimension_numbers();
+  // The dimension-number lists convert to spans directly; no copies needed.
+  return GetGatherScatterOperandPassthroughOutputOrUpdateDims(
+      update_shape.rank(), operand_shape,
+      /*collapsed_or_inserted_dims=*/dnums.inserted_window_dims(),
+      /*index_map=*/dnums.scatter_dims_to_operand_dims(),
+      /*offset_or_window_dims=*/dnums.update_window_dims(),
+      /*slice_size=*/slice_sizes);
+}
+
 absl::InlinedVector<int64_t, 1> GetGatherScatterIndexPassthroughIndexDims(
     const int64_t indices_rank, const int64_t index_vector_dim) {
   absl::InlinedVector<int64_t, 1> passthrough_dims;
@@ -2271,12 +2330,29 @@ GroupedSharding GroupShardingOnDims(const HloSharding& sharding,
   return grouped;
 }
 
-// See if we can group sharding on partially replicated dimensions, otherwise
-// replicate it.
-GroupedSharding GroupShardingOnReplicatedDim(const HloSharding& sharding,
-                                             const int64_t num_groups,
-                                             const int64_t num_tiles,
-                                             const int64_t data_rank) {
+namespace {
+
+// Returns the prime factors of `num`, smallest first (empty if `num` < 2).
+std::vector<int64_t> PrimeFactorization(int64_t num) {
+  std::vector<int64_t> prime_factors;
+  // `i <= num / i` is an overflow-free, integer-only form of `i * i <= num`.
+  for (int64_t i = 2; num > 1 && i <= num / i; i += (i == 2 ? 1 : 2)) {
+    while (num % i == 0) {
+      prime_factors.push_back(i);
+      num /= i;
+    }
+  }
+  // Whatever remains above 1 is itself prime, e.g. the 3 left over from 6.
+  if (num > 1) prime_factors.push_back(num);
+  return prime_factors;
+}
+
+}  // namespace
+
+GroupedSharding GroupShardingOnReplicatedDim(
+    const HloSharding& sharding, int64_t num_groups, int64_t num_tiles,
+    int64_t data_rank, absl::Span<const int64_t> replicable_dims) {
+  // 1. Try group sharding on partially replicated dim.
   if (sharding.ReplicateOnLastTileDim() &&
       sharding.tile_assignment().dimensions().back() % num_groups == 0) {
     absl::InlinedVector<int64_t, 1> group_dim_shards = {
@@ -2285,7 +2361,58 @@ GroupedSharding GroupShardingOnReplicatedDim(const HloSharding& sharding,
         sharding, {sharding.tile_assignment().num_dimensions() - 1},
         group_dim_shards);
   }
-  // Otherwise return a grouped replicated sharding.
+
+  // 2. Try borrow dimensions from replicable_dims in order, and group sharding.
+  if (sharding.IsTiled()) {
+    int64_t max_replicable_dimensions =
+        sharding.ReplicateOnLastTileDim()
+            ? sharding.tile_assignment().dimensions().back()
+            : 1;
+    max_replicable_dimensions = absl::c_accumulate(
+        replicable_dims, max_replicable_dimensions,
+        [&](int64_t product, int64_t dim) {
+          return product * sharding.tile_assignment().dim(dim);
+        });
+    if (max_replicable_dimensions % num_groups == 0) {
+      auto tile_assignment = [&]() -> std::optional<TileAssignment> {
+        int dimensions_to_borrow =
+            num_groups / (sharding.ReplicateOnLastTileDim()
+                              ? sharding.tile_assignment().dimensions().back()
+                              : 1);
+        std::vector<int64_t> tile_dims(
+            sharding.tile_assignment().dimensions().begin(),
+            sharding.tile_assignment().dimensions().end());
+        if (!sharding.ReplicateOnLastTileDim()) {
+          tile_dims.push_back(1);
+        }
+        for (auto replicable_dim : replicable_dims) {
+          for (auto factor : PrimeFactorization(
+                   sharding.tile_assignment().dim(replicable_dim))) {
+            if (dimensions_to_borrow % factor == 0) {
+              tile_dims[replicable_dim] /= factor;
+              tile_dims.back() *= factor;
+              dimensions_to_borrow /= factor;
+              if (dimensions_to_borrow == 1) {
+                return TileAssignment(tile_dims);
+              }
+            }
+          }
+        }
+        return std::nullopt;
+      }();
+      if (tile_assignment.has_value()) {
+        HloSharding partial_sharding = HloSharding::PartialTile(
+            tile_assignment.value(), sharding.metadata());
+        if (!partial_sharding.IsReplicated()) {
+          return GroupShardingOnDims(
+              partial_sharding,
+              {partial_sharding.tile_assignment().num_dimensions() - 1});
+        }
+      }
+    }
+  }
+
+  // 3. Otherwise return a grouped replicated sharding.
   return GetGroupedReplicatedSharding(num_groups, num_tiles, data_rank);
 }
 
diff --git a/third_party/xla/xla/hlo/utils/hlo_sharding_util.h b/third_party/xla/xla/hlo/utils/hlo_sharding_util.h
index 95dcad8347376f..d953f0cc532334 100644
--- a/third_party/xla/xla/hlo/utils/hlo_sharding_util.h
+++ b/third_party/xla/xla/hlo/utils/hlo_sharding_util.h
@@ -326,6 +326,16 @@ absl::InlinedVector<int64_t, 1> GetScatterOperandPassthroughOperandDims(
     const Shape& operand_shape, const HloSharding& operand_sharding,
     const HloInstruction& hlo, absl::Span<const int64_t> slice_sizes);
 
+absl::InlinedVector<int64_t, 1> GetGatherOperandPassthroughOutputDims(
+    const Shape& output_shape, const Shape& operand_shape,
+    const HloSharding& operand_sharding, const HloInstruction& hlo,
+    absl::Span<const int64_t> slice_sizes);
+
+absl::InlinedVector<int64_t, 1> GetScatterOperandPassthroughUpdateDims(
+    const Shape& update_shape, const Shape& operand_shape,
+    const HloSharding& operand_sharding, const HloInstruction& hlo,
+    absl::Span<const int64_t> slice_sizes);
+
 // Returns the index pass-through dimensions for gather/scatter indices.
 absl::InlinedVector<int64_t, 1> GetGatherScatterIndexPassthroughIndexDims(
     const int64_t indices_rank, const int64_t index_vector_dim);
@@ -381,12 +391,25 @@ GroupedSharding GroupShardingOnAllDimsExcept(
     const HloSharding& sharding, absl::Span<const int64_t> non_group_dims,
     bool subgroup_manual = false);
 
-// Creates a GroupedSharding by trying to group on partially replicated
-// dimensions, otherwise replicate it.
-GroupedSharding GroupShardingOnReplicatedDim(const HloSharding& sharding,
-                                             const int64_t num_groups,
-                                             const int64_t num_tiles,
-                                             const int64_t data_rank);
+// Creates a GroupedSharding by trying to do the following in sequence:
+//
+// 1. Group on partially replicated dimensions, which preserves the existing
+// tiled sharding in the group.
+// 2. If option 1 doesn't have enough dimensions, try borrowing dimensions from
+// replicable_dims in order, until it has enough dimensions. This partly
+// preserves the existing tiled sharding in the group. (E.g. if we need 4
+// groups, while our sharding is {[4,8,2]<=[64] last_tile_dim_replicate}, and
+// we borrow a factor of 2 from the first dimension (i.e. the 4-way partition),
+// then combined with the partially replicated 2 we can group the sharding
+// into 4 groups, each with the grouped sub-sharding [2,8]<=[16].)
+// 3. Otherwise replicate the whole thing.
+//
+// This does not guarantee the consistency of the ordering of the tile
+// assignment, and should be used with AlignGroup where its tile assignment
+// doesn't matter and will always align to some other tile assignment.
+GroupedSharding GroupShardingOnReplicatedDim(
+    const HloSharding& sharding, int64_t num_groups, int64_t num_tiles,
+    int64_t data_rank, absl::Span<const int64_t> replicable_dims = {});
 
 // Get group sharding for replicated sharding.
 GroupedSharding GetGroupedReplicatedSharding(const int64_t num_groups,
diff --git a/third_party/xla/xla/service/spmd/gather_scatter_handler.cc b/third_party/xla/xla/service/spmd/gather_scatter_handler.cc
index 41386a6d822d87..16633a9cff2571 100644
--- a/third_party/xla/xla/service/spmd/gather_scatter_handler.cc
+++ b/third_party/xla/xla/service/spmd/gather_scatter_handler.cc
@@ -23,15 +23,18 @@ limitations under the License.
 #include "absl/cleanup/cleanup.h"
 #include "absl/container/flat_hash_map.h"
 #include "absl/container/inlined_vector.h"
+#include "absl/strings/string_view.h"
 #include "absl/types/span.h"
 #include "xla/hlo/ir/hlo_casting_utils.h"
 #include "xla/hlo/ir/hlo_instruction.h"
+#include "xla/hlo/ir/hlo_instructions.h"
 #include "xla/hlo/ir/hlo_sharding.h"
 #include "xla/hlo/utils/hlo_sharding_util.h"
 #include "xla/literal_util.h"
 #include "xla/service/shape_inference.h"
 #include "xla/service/spmd/spmd_partitioner.h"
 #include "xla/service/spmd/spmd_partitioner_util.h"
+#include "xla/shape.h"
 #include "xla/status.h"
 #include "xla/xla_data.pb.h"
 #include "tsl/platform/statusor.h"
@@ -79,7 +82,7 @@ absl::InlinedVector<PartitionedHlo, 1> PerGroupPartitionedHlos(
 // Returns whether partitioning in the operand only happens in dimensions with
 // gather/scatter slice size 1.
 std::optional<std::vector<int64_t>>
-GatherScatterOperandPartitionedOnlyOnTrivialSliceDims(
+GatherScatterOperandPartitionedOnTrivialSliceDims(
     const PartitionedHlo& operand, absl::Span<const int64_t> index_map,
     absl::Span<const int64_t> slice_size) {
   if (operand.sharding().IsTileMaximal()) {
@@ -100,6 +103,68 @@ GatherScatterOperandPartitionedOnlyOnTrivialSliceDims(
   return std::nullopt;
 }
 
+// Lists operand dimensions of a gather from lowest to highest priority; when a
+// dimension has to be replicated, candidates are taken from the front.
+std::vector<int64_t> GatherOperandDimsByPriority(
+    const PartitionedHlo& operand, const HloGatherInstruction* gather,
+    absl::Span<const int64_t> slice_sizes) {
+  const GatherDimensionNumbers& dnums = gather->gather_dimension_numbers();
+  // Lowest priority: the trivial-slice partitioned dimensions, when reported.
+  const std::optional<std::vector<int64_t>> trivial_slice_dims =
+      GatherScatterOperandPartitionedOnTrivialSliceDims(
+          operand, dnums.start_index_map(), slice_sizes);
+  std::vector<int64_t> ordered_dims;
+  if (trivial_slice_dims.has_value()) ordered_dims = *trivial_slice_dims;
+  // Next: the operand passthrough dimensions, appended in the order returned.
+  const auto passthrough_dims =
+      hlo_sharding_util::GetGatherOperandPassthroughOperandDims(
+          operand.base_shape(), operand.sharding(), *gather, slice_sizes);
+  ordered_dims.insert(ordered_dims.end(), passthrough_dims.begin(),
+                      passthrough_dims.end());
+  return ordered_dims;
+}
+
+// Lists index dimensions of a gather from lowest to highest priority; when a
+// dimension has to be replicated, candidates are taken from the front.
+std::vector<int64_t> GatherIndexDimsByPriority(
+    const PartitionedHlo& indices, const HloGatherInstruction* gather) {
+  const int64_t index_vector_dim =
+      gather->gather_dimension_numbers().index_vector_dim();
+  std::vector<int64_t> dims;
+  // An index_vector_dim equal to the rank names no real dimension of
+  // `indices`, so it must not be offered as a replicable dimension.
+  if (index_vector_dim < indices.rank()) dims.push_back(index_vector_dim);
+  absl::c_copy(hlo_sharding_util::GetGatherScatterIndexPassthroughIndexDims(
+                   indices.rank(), index_vector_dim),
+               std::back_inserter(dims));
+  return dims;
+}
+
+// Lists output dimensions of a gather from lowest to highest priority; when a
+// dimension has to be replicated, candidates are taken from the front.
+std::vector<int64_t> GatherOutputDimsByPriority(
+    const Shape& output_shape, const PartitionedHlo& operand,
+    const HloGatherInstruction* gather, absl::Span<const int64_t> slice_sizes) {
+  // Operand passthrough dims are ranked highest (placed last): the operand is
+  // typically larger than the indices, so keeping these dims sharded avoids
+  // the more costly reshard in the operand passthrough case.
+  // TODO(b/303295109): Use better cost-model based decision to determine the
+  // priority ordering for scatter update.
+  const auto passthrough_output_dims =
+      hlo_sharding_util::GetGatherOperandPassthroughOutputDims(
+          output_shape, operand.base_shape(), operand.sharding(), *gather,
+          slice_sizes);
+  std::vector<int64_t> ordered_dims;
+  for (int64_t dim = 0; dim < output_shape.rank(); ++dim) {
+    if (!absl::c_linear_search(passthrough_output_dims, dim)) {
+      ordered_dims.push_back(dim);
+    }
+  }
+  ordered_dims.insert(ordered_dims.end(), passthrough_output_dims.begin(),
+                      passthrough_output_dims.end());
+  return ordered_dims;
+}
+
 // Returns the min and max for the indices in a scatter/gather which has the
 // operand partitioned on trivial slice dimensions (slice size 1).
 std::pair<HloInstruction*, HloInstruction*>
@@ -224,7 +289,8 @@ StatusOr<HloInstruction*> PartitionGatherIndexPassthroughDimensions(
   // otherwise replicate it.
   const GroupedSharding operand_grouped = AlignGroupsWith(
       hlo_sharding_util::GroupShardingOnReplicatedDim(
-          operand.sharding(), num_groups, num_tiles, operand.rank()),
+          operand.sharding(), num_groups, num_tiles, operand.rank(),
+          GatherOperandDimsByPriority(operand, gather, slice_sizes)),
       output_grouped);
   PartitionedHlo per_group_operand =
       PerGroupPartitionedHlo(operand, operand_grouped, b, clean_ups);
@@ -302,7 +368,8 @@ StatusOr<HloInstruction*> PartitionGatherOperandPassthroughDimensions(
     // otherwise replicate it.
     const GroupedSharding indices_grouped = AlignGroupsWith(
         hlo_sharding_util::GroupShardingOnReplicatedDim(
-            indices.sharding(), num_groups, num_tiles, indices.rank()),
+            indices.sharding(), num_groups, num_tiles, indices.rank(),
+            GatherIndexDimsByPriority(indices, gather)),
         output_grouped);
     PartitionedHlo per_group_operand =
         PerGroupPartitionedHlo(operand, operand_grouped, b, clean_ups);
@@ -344,7 +411,7 @@ StatusOr<HloInstruction*> PartitionGatherTrivialSlicedOperandDimensions(
   std::vector<int64_t> start_index_map(dnums.start_index_map().begin(),
                                        dnums.start_index_map().end());
   if (std::optional<std::vector<int64_t>> trivial_slice_dims =
-          GatherScatterOperandPartitionedOnlyOnTrivialSliceDims(
+          GatherScatterOperandPartitionedOnTrivialSliceDims(
               operand, start_index_map, slice_sizes)) {
     const HloSharding original_operand_sharding = operand.sharding();
     const int64_t num_groups = operand.sharding().NumTiles(*trivial_slice_dims);
@@ -356,11 +423,14 @@ StatusOr<HloInstruction*> PartitionGatherTrivialSlicedOperandDimensions(
     // otherwise replicate it.
     GroupedSharding indices_grouped = AlignGroupsWith(
         hlo_sharding_util::GroupShardingOnReplicatedDim(
-            indices.sharding(), num_groups, num_tiles, indices.rank()),
+            indices.sharding(), num_groups, num_tiles, indices.rank(),
+            GatherIndexDimsByPriority(indices, gather)),
         operand_grouped);
     GroupedSharding output_grouped = AlignGroupsWith(
         hlo_sharding_util::GroupShardingOnReplicatedDim(
-            output_sharding, num_groups, num_tiles, output_shape.rank()),
+            output_sharding, num_groups, num_tiles, output_shape.rank(),
+            GatherOutputDimsByPriority(output_shape, operand, gather,
+                                       slice_sizes)),
         operand_grouped);
     // For index and output sharding, if one is grouped partially but the other
     // is replicated, pass through the partially grouped sharding to the other
@@ -372,7 +442,9 @@ StatusOr<HloInstruction*> PartitionGatherTrivialSlicedOperandDimensions(
               indices.sharding(), gather);
       output_grouped = AlignGroupsWith(
           hlo_sharding_util::GroupShardingOnReplicatedDim(
-              new_output_sharding, num_groups, num_tiles, output_shape.rank()),
+              new_output_sharding, num_groups, num_tiles, output_shape.rank(),
+              GatherOutputDimsByPriority(output_shape, operand, gather,
+                                         slice_sizes)),
           operand_grouped);
     }
     if (indices_grouped.sharding.IsTileMaximal() &&
@@ -382,7 +454,8 @@ StatusOr<HloInstruction*> PartitionGatherTrivialSlicedOperandDimensions(
               output_sharding, gather);
       indices_grouped = AlignGroupsWith(
           hlo_sharding_util::GroupShardingOnReplicatedDim(
-              new_indices_sharding, num_groups, num_tiles, indices.rank()),
+              new_indices_sharding, num_groups, num_tiles, indices.rank(),
+              GatherIndexDimsByPriority(indices, gather)),
           operand_grouped);
     }
     // Reshard indices to its intended sharding before clamping and adjusting.
@@ -603,11 +676,16 @@ StatusOr<HloInstruction*> PartitionGatherIndexParallelDimensions(
 }
 
 // Returns a full list of partitioning methods used for gather.
-std::vector<decltype(PartitionGather)*> GatherPartitionMethods() {
-  return {PartitionGatherIndexParallelDimensions,
-          PartitionGatherOperandPassthroughDimensions,
-          PartitionGatherTrivialSlicedOperandDimensions,
-          PartitionGatherIndexPassthroughDimensions};
+std::vector<std::pair<decltype(PartitionGather)*, absl::string_view>>
+GatherPartitionMethods() {
+  return {{PartitionGatherIndexParallelDimensions,
+           "PartitionGatherIndexParallelDimensions"},
+          {PartitionGatherOperandPassthroughDimensions,
+           "PartitionGatherOperandPassthroughDimensions"},
+          {PartitionGatherTrivialSlicedOperandDimensions,
+           "PartitionGatherTrivialSlicedOperandDimensions"},
+          {PartitionGatherIndexPassthroughDimensions,
+           "PartitionGatherIndexPassthroughDimensions"}};
 }
 
 // Estimates the cost for each partitioning methods for gather.
@@ -648,10 +726,9 @@ int64_t GatherPartitionMethodCostModel(
                     max_potential_output_shape_size);
   }
   if (partition_method == PartitionGatherTrivialSlicedOperandDimensions) {
-    auto trivial_slice_dims =
-        GatherScatterOperandPartitionedOnlyOnTrivialSliceDims(
-            operand, gather->gather_dimension_numbers().start_index_map(),
-            slice_sizes);
+    auto trivial_slice_dims = GatherScatterOperandPartitionedOnTrivialSliceDims(
+        operand, gather->gather_dimension_numbers().start_index_map(),
+        slice_sizes);
     return !trivial_slice_dims ? INT64_MAX
                                : ShapeSizeInBytes(operand.hlo()->shape()) +
                                      ShapeSizeInBytes(output_shape) +
@@ -694,7 +771,8 @@ std::vector<decltype(PartitionGather)*> GatherPartitionMethodsOrderedByCost(
     absl::Span<const int64_t> slice_sizes, SpmdPartitioningVisitor* visitor) {
   std::vector<decltype(PartitionGather)*> ordered_partition_methods;
   std::vector<int64_t> ordered_costs;
-  for (auto partition_method : GatherPartitionMethods()) {
+  auto gather_partition_methods = GatherPartitionMethods();
+  for (auto [partition_method, _] : gather_partition_methods) {
     const int64_t cost = GatherPartitionMethodCostModel(
         partition_method, gather, operand, indices, output_shape,
         output_sharding, batch_dims, slice_sizes, visitor);
@@ -704,7 +782,17 @@ std::vector<decltype(PartitionGather)*> GatherPartitionMethodsOrderedByCost(
     ordered_partition_methods.insert(ordered_partition_methods.begin() + offset,
                                      partition_method);
   }
-  CHECK_EQ(ordered_partition_methods.size(), GatherPartitionMethods().size());
+  CHECK_EQ(ordered_partition_methods.size(), gather_partition_methods.size());
+  VLOG(5) << "Gather partitioning methods(ordered by cost):";
+  for (auto partition_method : ordered_partition_methods) {
+    VLOG(5) << "  "
+            << absl::c_find_if(gather_partition_methods,
+                               [&](const std::pair<decltype(PartitionGather)*,
+                                                   absl::string_view>& p) {
+                                 return p.first == partition_method;
+                               })
+                   ->second;
+  }
   return ordered_partition_methods;
 }
 
@@ -839,6 +927,69 @@ std::optional<HloOpcode> ParseReductionComputation(
   return root->opcode();
 }
 
+// Priority for operand dimensions in scatter, from lowest to highest. When a
+// dimension needs to be replicated, start from the lowest-priority one.
+std::vector<int64_t> ScatterOperandDimsByPriority(
+    const PartitionedHlo& operand, const HloScatterInstruction* scatter,
+    absl::Span<const int64_t> slice_sizes) {
+  const ScatterDimensionNumbers& dnums = scatter->scatter_dimension_numbers();
+  std::vector<int64_t> priority_dims_for_operand;
+  if (const std::optional<std::vector<int64_t>> trivial_slice_dims =
+          GatherScatterOperandPartitionedOnTrivialSliceDims(
+              operand, dnums.scatter_dims_to_operand_dims(), slice_sizes)) {
+    absl::c_copy(trivial_slice_dims.value(),
+                 std::back_inserter(priority_dims_for_operand));
+  }
+  const auto operand_passthrough_dims =
+      hlo_sharding_util::GetScatterOperandPassthroughOperandDims(
+          operand.base_shape(), operand.sharding(), *scatter, slice_sizes);
+  absl::c_copy(operand_passthrough_dims,
+               std::back_inserter(priority_dims_for_operand));
+  return priority_dims_for_operand;
+}
+
+// Priority for index dimensions in scatter, from lowest to highest. When a
+// dimension needs to be replicated, start from the lowest-priority one.
+std::vector<int64_t> ScatterIndexDimsByPriority(
+    const PartitionedHlo& indices, const HloScatterInstruction* scatter) {
+  const ScatterDimensionNumbers& dnums = scatter->scatter_dimension_numbers();
+
+  std::vector<int64_t> priority_dims_for_indices;
+  priority_dims_for_indices.push_back(dnums.index_vector_dim());
+  absl::InlinedVector<int64_t, 1> index_passthrough_dims =
+      hlo_sharding_util::GetGatherScatterIndexPassthroughIndexDims(
+          indices.rank(), dnums.index_vector_dim());
+  absl::c_copy(index_passthrough_dims,
+               std::back_inserter(priority_dims_for_indices));
+  return priority_dims_for_indices;
+}
+
+// Priority for update dimensions in scatter, from lowest to highest. When a
+// dimension needs to be replicated, start from the lowest-priority one.
+std::vector<int64_t> ScatterUpdateDimsByPriority(
+    const Shape& update_shape, const PartitionedHlo& operand,
+    const HloScatterInstruction* scatter,
+    absl::Span<const int64_t> slice_sizes) {
+  // Try to preserve operand passthrough dims, as the operand is typically
+  // larger in size than the indices, and preserving operand passthrough dims
+  // avoids a more costly reshard in the operand passthrough case.
+  // TODO(b/303295109): Use better cost-model based decision to determine the
+  // priority ordering for scatter update.
+  std::vector<int64_t> priority_dims_for_output;
+  auto operand_passthrough_update_dims =
+      hlo_sharding_util::GetScatterOperandPassthroughUpdateDims(
+          update_shape, operand.base_shape(), operand.sharding(), *scatter,
+          slice_sizes);
+  for (int i = 0; i != update_shape.rank(); ++i) {
+    if (!absl::c_linear_search(operand_passthrough_update_dims, i)) {
+      priority_dims_for_output.push_back(i);
+    }
+  }
+  absl::c_copy(operand_passthrough_update_dims,
+               std::back_inserter(priority_dims_for_output));
+  return priority_dims_for_output;
+}
+
 StatusOr<HloInstruction*> PartitionScatter(
     const HloScatterInstruction* scatter, absl::Span<PartitionedHlo> operands,
     PartitionedHlo& indices, absl::Span<PartitionedHlo> updates,
@@ -1051,7 +1202,8 @@ StatusOr<HloInstruction*> PartitionScatterOperandPassthroughDimensions(
     // otherwise replicate it.
     const GroupedSharding indices_grouped = AlignGroupsWith(
         hlo_sharding_util::GroupShardingOnReplicatedDim(
-            indices.sharding(), num_groups, num_tiles, indices.rank()),
+            indices.sharding(), num_groups, num_tiles, indices.rank(),
+            ScatterIndexDimsByPriority(indices, scatter)),
         update_grouped);
     const GroupedSharding& output_grouped = operand_grouped;
     absl::InlinedVector<PartitionedHlo, 1> per_group_operands =
@@ -1118,7 +1270,8 @@ StatusOr<HloInstruction*> PartitionScatterIndexPassthroughDimensions(
   // otherwise replicate it.
   const GroupedSharding operand_grouped = AlignGroupsWith(
       hlo_sharding_util::GroupShardingOnReplicatedDim(
-          operands[0].sharding(), num_groups, num_tiles, operands[0].rank()),
+          operands[0].sharding(), num_groups, num_tiles, operands[0].rank(),
+          ScatterOperandDimsByPriority(operands[0], scatter, slice_sizes)),
       update_grouped);
   const GroupedSharding indices_grouped =
       AlignGroupsWith(hlo_sharding_util::GroupShardingOnDims(indices.sharding(),
@@ -1230,7 +1383,7 @@ StatusOr<HloInstruction*> PartitionScatterTrivialSlicedOperandDimensions(
     SpmdBuilder* b = visitor->builder();
     auto dnums = scatter->scatter_dimension_numbers();
     if (std::optional<std::vector<int64_t>> trivial_slice_dims =
-            GatherScatterOperandPartitionedOnlyOnTrivialSliceDims(
+            GatherScatterOperandPartitionedOnTrivialSliceDims(
                 operands[0], dnums.scatter_dims_to_operand_dims(),
                 slice_sizes)) {
       // Operand is sharded on trivial slice dims (update slice size 1). We can
@@ -1248,13 +1401,16 @@ StatusOr<HloInstruction*> PartitionScatterTrivialSlicedOperandDimensions(
       // otherwise replicate it.
       GroupedSharding indices_grouped = AlignGroupsWith(
           hlo_sharding_util::GroupShardingOnReplicatedDim(
-              indices.sharding(), num_groups, num_tiles, indices.rank()),
+              indices.sharding(), num_groups, num_tiles, indices.rank(),
+              ScatterIndexDimsByPriority(indices, scatter)),
           operand_grouped);
       // See if we can group partially replicated dimensions from the updates
       // otherwise replicate it.
       GroupedSharding update_grouped = AlignGroupsWith(
           hlo_sharding_util::GroupShardingOnReplicatedDim(
-              updates[0].sharding(), num_groups, num_tiles, updates[0].rank()),
+              updates[0].sharding(), num_groups, num_tiles, updates[0].rank(),
+              ScatterUpdateDimsByPriority(updates[0].base_shape(), operands[0],
+                                          scatter, slice_sizes)),
           operand_grouped);
       // For index and update sharding, if one is grouped partially but the
       // other is replicated, pass through the partially grouped sharding to the
@@ -1264,11 +1420,12 @@ StatusOr<HloInstruction*> PartitionScatterTrivialSlicedOperandDimensions(
         const HloSharding new_update_sharding = hlo_sharding_util::
             ScatterUpdateShardingFromIndexIndexPassthroughDimensions(
                 indices.sharding(), scatter);
-        update_grouped =
-            AlignGroupsWith(hlo_sharding_util::GroupShardingOnReplicatedDim(
-                                new_update_sharding, num_groups, num_tiles,
-                                output_shape.rank()),
-                            operand_grouped);
+        update_grouped = AlignGroupsWith(
+            hlo_sharding_util::GroupShardingOnReplicatedDim(
+                new_update_sharding, num_groups, num_tiles, output_shape.rank(),
+                ScatterUpdateDimsByPriority(updates[0].base_shape(),
+                                            operands[0], scatter, slice_sizes)),
+            operand_grouped);
       }
       if (indices_grouped.sharding.IsTileMaximal() &&
           !update_grouped.sharding.IsTileMaximal()) {
@@ -1277,7 +1434,8 @@ StatusOr<HloInstruction*> PartitionScatterTrivialSlicedOperandDimensions(
                 updates[0].sharding(), scatter);
         indices_grouped = AlignGroupsWith(
             hlo_sharding_util::GroupShardingOnReplicatedDim(
-                new_indices_sharding, num_groups, num_tiles, indices.rank()),
+                new_indices_sharding, num_groups, num_tiles, indices.rank(),
+                ScatterIndexDimsByPriority(indices, scatter)),
             operand_grouped);
       }
       const GroupedSharding& output_grouped = operand_grouped;
@@ -1322,11 +1480,16 @@ StatusOr<HloInstruction*> PartitionScatterTrivialSlicedOperandDimensions(
 }
 
 // Returns a full list of partitioning methods used for scatter.
-std::vector<decltype(PartitionScatter)*> ScatterPartitionMethods() {
-    return {PartitionScatterIndexParallelDimensions,
-            PartitionScatterOperandPassthroughDimensions,
-            PartitionScatterTrivialSlicedOperandDimensions,
-            PartitionScatterIndexPassthroughDimensions};
+std::vector<std::pair<decltype(PartitionScatter)*, absl::string_view>>
+ScatterPartitionMethods() {
+    return {{PartitionScatterIndexParallelDimensions,
+             "PartitionScatterIndexParallelDimensions"},
+            {PartitionScatterOperandPassthroughDimensions,
+             "PartitionScatterOperandPassthroughDimensions"},
+            {PartitionScatterTrivialSlicedOperandDimensions,
+             "PartitionScatterTrivialSlicedOperandDimensions"},
+            {PartitionScatterIndexPassthroughDimensions,
+             "PartitionScatterIndexPassthroughDimensions"}};
 }
 
 // Estimates the cost for each partitioning methods for scatter.
@@ -1373,7 +1536,7 @@ int64_t ScatterPartitionMethodCostModel(
     }
     if (partition_method == PartitionScatterTrivialSlicedOperandDimensions) {
       auto trivial_slice_dims =
-          GatherScatterOperandPartitionedOnlyOnTrivialSliceDims(
+          GatherScatterOperandPartitionedOnTrivialSliceDims(
               operands[0],
               scatter->scatter_dimension_numbers()
                   .scatter_dims_to_operand_dims(),
@@ -1425,7 +1588,8 @@ std::vector<decltype(PartitionScatter)*> ScatterPartitionMethodsOrderedByCost(
     absl::Span<const int64_t> slice_sizes, SpmdPartitioningVisitor* visitor) {
     std::vector<decltype(PartitionScatter)*> ordered_partition_methods;
     std::vector<int64_t> ordered_costs;
-    for (auto partition_method : ScatterPartitionMethods()) {
+    auto scatter_partition_methods = ScatterPartitionMethods();
+    for (auto [partition_method, _] : scatter_partition_methods) {
       const int64_t cost = ScatterPartitionMethodCostModel(
           partition_method, scatter, operands, indices, updates, output_shape,
           output_sharding, slice_sizes, visitor);
@@ -1436,7 +1600,18 @@ std::vector<decltype(PartitionScatter)*> ScatterPartitionMethodsOrderedByCost(
           ordered_partition_methods.begin() + offset, partition_method);
     }
     CHECK_EQ(ordered_partition_methods.size(),
-             ScatterPartitionMethods().size());
+             scatter_partition_methods.size());
+    VLOG(5) << "Scatter partitioning methods(ordered by cost):";
+    for (auto partition_method : ordered_partition_methods) {
+      VLOG(5) << "  "
+              << absl::c_find_if(
+                     scatter_partition_methods,
+                     [&](const std::pair<decltype(PartitionScatter)*,
+                                         absl::string_view>& p) {
+                       return p.first == partition_method;
+                     })
+                     ->second;
+    }
     return ordered_partition_methods;
 }
 
diff --git a/third_party/xla/xla/service/spmd/spmd_partitioner_test.cc b/third_party/xla/xla/service/spmd/spmd_partitioner_test.cc
index 67330f976bcf87..6c281ab748c407 100644
--- a/third_party/xla/xla/service/spmd/spmd_partitioner_test.cc
+++ b/third_party/xla/xla/service/spmd/spmd_partitioner_test.cc
@@ -10697,6 +10697,36 @@ ENTRY %module {
                 _, _, _, _)));
 }
 
+TEST_P(SpmdPartitioningTest,
+       GatherMergedOperandPassthroughAndIndexPassthrough_PartialGrouping) {
+  absl::string_view hlo_string = R"(
+HloModule module
+
+ENTRY %module {
+  %parameter.0 = s32[8,4,2,2]{3,2,1,0} parameter(0),
+    sharding={devices=[2,2,2,1]<=[8]}
+  %parameter.1 = s32[2,8,4]{2,1,0} parameter(1),
+    sharding={devices=[1,2,2,2]<=[8] last_tile_dim_replicate}
+  ROOT %gather.20 = s32[8,4,2,2]{3,2,1,0} gather(
+    s32[8,4,2,2]{3,2,1,0} %parameter.0,
+    s32[2,8,4]{2,1,0} %parameter.1), offset_dims={2,3},
+    collapsed_slice_dims={0,1}, start_index_map={0,1}, index_vector_dim=0,
+    slice_sizes={1,1,2,2}, sharding={replicated}
+})";
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          PartitionComputation(hlo_string, /*num_devices=*/8));
+  VLOG(1) << module->ToString();
+  const auto root = module->entry_computation()->root_instruction();
+  auto operand = AllOf(op::Shape("s32[8,4,1,2]"), op::AllReduce());
+  auto indices = AllOf(op::Shape("s32[2,4,2]"), op::Parameter());
+  auto gather = AllOf(op::Shape("s32[4,2,1,2]"), op::Gather(operand, indices));
+  EXPECT_THAT(root, op::AllReduce(op::DynamicUpdateSlice(
+                        _,
+                        op::AllReduce(op::AllReduce(
+                            op::DynamicUpdateSlice(_, gather, _, _, _, _))),
+                        _, _, _, _)));
+}
+
 TEST_P(SpmdPartitioningTest,
        GatherMergedTrivialSlicedOperandAndIndexPassthrough) {
   absl::string_view hlo_string = R"(
@@ -10726,6 +10756,34 @@ ENTRY %module {
                 _, _)));
 }
 
+TEST_P(SpmdPartitioningTest,
+       GatherMergedTrivialSlicedOperandAndIndexPassthrough_PartialGrouping) {
+  absl::string_view hlo_string = R"(
+HloModule module
+
+ENTRY %module {
+  %parameter.0 = s32[8,4,2,2]{3,2,1,0} parameter(0),
+    sharding={devices=[2,2,1,1,2]<=[8] last_tile_dim_replicate}
+  %parameter.1 = s32[2,8,4]{2,1,0} parameter(1),
+    sharding={devices=[1,2,2,2]<=[8] last_tile_dim_replicate}
+  ROOT %gather.20 = s32[8,4,2,2]{3,2,1,0} gather(
+    s32[8,4,2,2]{3,2,1,0} %parameter.0,
+    s32[2,8,4]{2,1,0} %parameter.1), offset_dims={2,3},
+    collapsed_slice_dims={0,1}, start_index_map={0,1}, index_vector_dim=0,
+    slice_sizes={1,1,2,2}, sharding={replicated}
+})";
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          PartitionComputation(hlo_string, /*num_devices=*/8));
+  VLOG(1) << module->ToString();
+  const auto root = module->entry_computation()->root_instruction();
+  auto operand = AllOf(op::Shape("s32[8,2,2,2]"), op::AllReduce());
+  auto indices = AllOf(op::Shape("s32[2,4,2]"), op::Subtract());
+  auto gather = AllOf(op::Shape("s32[4,2,2,2]"), op::Gather(operand, indices));
+  EXPECT_THAT(root,
+              op::AllReduce(op::AllReduce(op::DynamicUpdateSlice(
+                  _, op::AllReduce(op::Select(_, _, gather)), _, _, _, _))));
+}
+
 TEST_P(SpmdPartitioningTest, GatherTrivialSlicedOperandPartial) {
   absl::string_view hlo_string = R"(
 HloModule module
@@ -11546,6 +11604,51 @@ ENTRY %module {
                         _, op::AllReduce(scatter), _, _, _, _)));
 }
 
+TEST_P(SpmdPartitioningTest,
+       ScatterMergedOperandPassthroughAndIndexPassthrough_PartialGrouping) {
+  absl::string_view hlo_string = R"(
+HloModule module
+
+add (lhs: s32[], rhs: s32[]) -> s32[] {
+  lhs = s32[] parameter(0)
+  rhs = s32[] parameter(1)
+  ROOT sum = s32[] add(lhs, rhs)
+}
+
+ENTRY %module {
+  %parameter.0 = s32[8,4,2,2]{3,2,1,0} parameter(0),
+    sharding={devices=[2,2,2,1]<=[8]}
+  %parameter.1 = s32[2,8,4]{2,1,0} parameter(1),
+    sharding={devices=[1,2,2,2]<=[8] last_tile_dim_replicate}
+  %parameter.2 = s32[8,4,2,2]{3,2,1,0} parameter(2),
+    sharding={replicated}
+  ROOT %scatter.20 = s32[8,4,2,2]{3,2,1,0} scatter(
+    s32[8,4,2,2]{3,2,1,0} %parameter.0,
+    s32[2,8,4]{2,1,0} %parameter.1,
+    s32[8,4,2,2]{3,2,1,0} %parameter.2),
+    to_apply=add,
+    update_window_dims={2,3},
+    inserted_window_dims={0,1},
+    scatter_dims_to_operand_dims={0,1},
+    index_vector_dim=0, sharding={replicated}
+})";
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          PartitionComputation(hlo_string, /*num_devices=*/8));
+  VLOG(1) << module->ToString();
+  const auto root = module->entry_computation()->root_instruction();
+  auto operand = AllOf(op::Shape("s32[8,4,1,2]"), op::Select());
+  auto indices = AllOf(op::Shape("s32[2,4,2]"), op::Parameter());
+  auto update = AllOf(op::Shape("s32[4,2,1,2]"), op::DynamicSlice());
+  auto scatter =
+      AllOf(op::Shape("s32[8,4,1,2]"), op::Scatter(operand, indices, update));
+  EXPECT_THAT(
+      root,
+      op::AllReduce(op::AllReduce(op::AllReduce(op::DynamicUpdateSlice(
+          _,
+          op::DynamicSlice(op::AllReduce(op::AllReduce(scatter)), _, _, _, _),
+          _, _, _, _)))));
+}
+
 TEST_P(SpmdPartitioningTest,
        ScatterMergedTrivialSlicedOperandAndIndexPassthrough) {
   absl::string_view hlo_string = R"(
@@ -11587,6 +11690,47 @@ ENTRY %module {
                         _, op::AllReduce(scatter), _, _, _, _))));
 }
 
+TEST_P(SpmdPartitioningTest,
+       ScatterMergedTrivialSlicedOperandAndIndexPassthrough_PartialGrouping) {
+  absl::string_view hlo_string = R"(
+HloModule module
+
+add (lhs: s32[], rhs: s32[]) -> s32[] {
+  lhs = s32[] parameter(0)
+  rhs = s32[] parameter(1)
+  ROOT sum = s32[] add(lhs, rhs)
+}
+
+ENTRY %module {
+  %parameter.0 = s32[8,4,2,2]{3,2,1,0} parameter(0),
+    sharding={devices=[2,2,1,1,2]<=[8] last_tile_dim_replicate}
+  %parameter.1 = s32[2,8,4]{2,1,0} parameter(1),
+    sharding={devices=[1,2,2,2]<=[8] last_tile_dim_replicate}
+  %parameter.2 = s32[8,4,2,2]{3,2,1,0} parameter(2),
+    sharding={replicated}
+  ROOT %scatter.20 = s32[8,4,2,2]{3,2,1,0} scatter(
+    s32[8,4,2,2]{3,2,1,0} %parameter.0,
+    s32[2,8,4]{2,1,0} %parameter.1,
+    s32[8,4,2,2]{3,2,1,0} %parameter.2),
+    to_apply=add,
+    update_window_dims={2,3},
+    inserted_window_dims={0,1},
+    scatter_dims_to_operand_dims={0,1},
+    index_vector_dim=0, sharding={replicated}
+})";
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          PartitionComputation(hlo_string, /*num_devices=*/8));
+  VLOG(1) << module->ToString();
+  const auto root = module->entry_computation()->root_instruction();
+  auto operand = AllOf(op::Shape("s32[8,2,2,2]"), op::Select());
+  auto indices = AllOf(op::Shape("s32[2,4,2]"), op::Subtract());
+  auto update = AllOf(op::Shape("s32[4,2,2,2]"), op::DynamicSlice());
+  auto scatter =
+      AllOf(op::Shape("s32[8,2,2,2]"), op::Scatter(operand, indices, update));
+  EXPECT_THAT(root, op::AllReduce(op::DynamicUpdateSlice(
+                        _, op::AllReduce(op::AllReduce(scatter)), _, _, _, _)));
+}
+
 TEST_P(SpmdPartitioningTest, ScatterTrivialSlicedOperandPartial) {
   absl::string_view hlo_string = R"(
 HloModule module
@@ -12508,13 +12652,13 @@ ENTRY entry {
                           PartitionComputation(hlo_string, /*num_devices=*/8));
   VLOG(1) << module->ToString();
   const auto root = module->entry_computation()->root_instruction();
-  EXPECT_THAT(
-      root,
-      op::Copy(op::AllReduce(op::DynamicUpdateSlice(
-          _,
-          op::Scatter(op::Shape("bf16[32,128,50001]"),
-                      op::Shape("s32[32,512,3]"), op::Shape("bf16[32,512]")),
-          _, _, _))));
+  EXPECT_THAT(root,
+              op::Copy(op::AllReduce(op::DynamicUpdateSlice(
+                  _,
+                  op::AllReduce(op::Scatter(op::Shape("bf16[32,128,50001]"),
+                                            op::Shape("s32[32,256,3]"),
+                                            op::Shape("bf16[32,256]"))),
+                  _, _, _))));
 }
 
 TEST_P(SpmdPartitioningTest, GatherOperandPassthroughIndexPassthrough) {
@@ -13379,7 +13523,7 @@ ENTRY %main.21 {
   XLA_VLOG_LINES(1, module->ToString());
   auto* gather = FindInstruction(module.get(), HloOpcode::kGather);
   EXPECT_NE(gather, nullptr);
-  EXPECT_THAT(gather, op::Shape("bf16[2048,128]"));
+  EXPECT_THAT(gather, op::Shape("bf16[2048,64]"));
 }
 
 TEST_P(SpmdPartitioningTest, ScatterCostModelForUnmatchedSharding) {
@@ -13408,7 +13552,7 @@ ENTRY %main.21 {
   auto* scatter = FindInstruction(module.get(), HloOpcode::kScatter);
   EXPECT_NE(scatter, nullptr);
   auto* updates = scatter->operand(2);
-  EXPECT_THAT(updates, op::Shape("bf16[4096,128]"));
+  EXPECT_THAT(updates, op::Shape("bf16[4096,64]"));
 }
 
 TEST_P(SpmdPartitioningTest, ComplexReshardUnmerge) {

From 073ffc462e6545253d17f48f9f1e05f3b6f58c62 Mon Sep 17 00:00:00 2001
From: Mason Chang <masonchang@google.com>
Date: Tue, 3 Oct 2023 16:31:51 -0700
Subject: [PATCH 552/567] Add streamz metrics to record when tf2xla bridge
 fails to export to tf executor dialect from TF Dialect.

PiperOrigin-RevId: 570530479
---
 tensorflow/compiler/mlir/tf2xla/api/v2/BUILD  |  6 ++---
 .../tf2xla/api/v2/tf_dialect_to_executor.cc   | 23 +++++++++++++++----
 .../api/v2/tf_dialect_to_executor_test.cc     | 18 +++++++++++++++
 3 files changed, 40 insertions(+), 7 deletions(-)

diff --git a/tensorflow/compiler/mlir/tf2xla/api/v2/BUILD b/tensorflow/compiler/mlir/tf2xla/api/v2/BUILD
index 42fe6d00e5ee33..d2d0192232e6e0 100644
--- a/tensorflow/compiler/mlir/tf2xla/api/v2/BUILD
+++ b/tensorflow/compiler/mlir/tf2xla/api/v2/BUILD
@@ -129,8 +129,6 @@ cc_library(
         "//tensorflow/compiler/jit:flags_headers",
         "//tensorflow/compiler/mlir/tensorflow:bridge_logger",
         "//tensorflow/compiler/mlir/tensorflow:dump_mlir_util",
-        "//tensorflow/compiler/mlir/tensorflow:error_util",
-        "//tensorflow/compiler/mlir/tensorflow/transforms:tensorflow_passes",
         "//tensorflow/compiler/mlir/tensorflow/transforms:verify_no_outside_compilation_markers_pass",
         "//tensorflow/core:framework",
         "//tensorflow/core/platform:status",
@@ -141,7 +139,7 @@ cc_library(
         "@llvm-project//mlir:Pass",
         "@llvm-project//mlir:Support",
         "@llvm-project//mlir:Transforms",
-        "@local_tsl//tsl/platform:error_logging",
+        "@local_tsl//tsl/lib/monitoring:counter",
         "@local_tsl//tsl/platform:status",
     ],
 )
@@ -156,6 +154,7 @@ tf_cc_test(
     deps = [
         ":tf_dialect_to_executor",
         "//tensorflow/compiler/mlir:register_common_dialects",
+        "//tensorflow/core/lib/monitoring:cell_reader",
         "//tensorflow/core/platform:resource_loader",
         "@com_google_absl//absl/status",
         "@com_google_absl//absl/strings",
@@ -163,6 +162,7 @@ tf_cc_test(
         "@llvm-project//mlir:IR",
         "@llvm-project//mlir:Parser",
         "@local_tsl//tsl/lib/core:status_test_util",
+        "@local_tsl//tsl/lib/monitoring:test_utils",
         "@local_tsl//tsl/platform:status",
     ],
 )
diff --git a/tensorflow/compiler/mlir/tf2xla/api/v2/tf_dialect_to_executor.cc b/tensorflow/compiler/mlir/tf2xla/api/v2/tf_dialect_to_executor.cc
index e2d34400aaaf41..b14e5f3fc2d094 100644
--- a/tensorflow/compiler/mlir/tf2xla/api/v2/tf_dialect_to_executor.cc
+++ b/tensorflow/compiler/mlir/tf2xla/api/v2/tf_dialect_to_executor.cc
@@ -31,6 +31,7 @@ limitations under the License.
 #include "tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.h"
 #include "tensorflow/core/platform/status.h"
 #include "tensorflow/core/util/debug_data_dumper.h"
+#include "tsl/lib/monitoring/counter.h"
 #include "tsl/platform/status.h"
 
 namespace tensorflow {
@@ -43,6 +44,15 @@ using mlir::OpPassManager;
 using mlir::PassManager;
 using mlir::func::FuncOp;
 
+auto *tf_dialect_to_executor_dialect_status = tsl::monitoring::Counter<1>::New(
+    "/tensorflow/core/tf2xla/api/v2/tf_dialect_to_executor_dialect_status",
+    "Counts how often a successful export from TF Dialect to Executor Dialect "
+    "is",
+    "status");
+
+constexpr char kExportSuccess[] = "success";
+constexpr char kExportFailed[] = "failed";
+
 namespace {
 
 // Add logger to bridge passmanager.
@@ -131,12 +141,17 @@ tensorflow::Status ExportFromTensorflowDialectToExecutor(
         module, llvm::StringRef(), &tf_to_executor);
   }
 
-  if (result.succeeded()) {
-    return tsl::OkStatus();
+  if (!result.succeeded()) {
+    tf_dialect_to_executor_dialect_status->GetCell(kExportFailed)
+        ->IncrementBy(1);
+
+    return absl::InternalError(
+        "Failed to export from TF Dialect to TF Executor Dialect.");
   }
 
-  return absl::InternalError(
-      "Failed to export from TF Dialect to TF Executor Dialect.");
+  tf_dialect_to_executor_dialect_status->GetCell(kExportSuccess)
+      ->IncrementBy(1);
+  return tsl::OkStatus();
 }
 
 }  // namespace v2
diff --git a/tensorflow/compiler/mlir/tf2xla/api/v2/tf_dialect_to_executor_test.cc b/tensorflow/compiler/mlir/tf2xla/api/v2/tf_dialect_to_executor_test.cc
index 20583847b3f5c4..9940a8d52c18e8 100644
--- a/tensorflow/compiler/mlir/tf2xla/api/v2/tf_dialect_to_executor_test.cc
+++ b/tensorflow/compiler/mlir/tf2xla/api/v2/tf_dialect_to_executor_test.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include "tensorflow/compiler/mlir/tf2xla/api/v2/tf_dialect_to_executor.h"
 
+#include <cstdint>
 #include <string>
 
 #include <gtest/gtest.h>
@@ -26,6 +27,7 @@ limitations under the License.
 #include "mlir/IR/OwningOpRef.h"  // from @llvm-project
 #include "mlir/Parser/Parser.h"  // from @llvm-project
 #include "tensorflow/compiler/mlir/register_common_dialects.h"
+#include "tensorflow/core/lib/monitoring/cell_reader.h"
 #include "tensorflow/core/platform/resource_loader.h"
 #include "tsl/lib/core/status_test_util.h"
 #include "tsl/platform/status.h"
@@ -35,10 +37,16 @@ namespace tf2xla {
 namespace v2 {
 namespace {
 
+constexpr char kExportStreamzName[] =
+    "/tensorflow/core/tf2xla/api/v2/tf_dialect_to_executor_dialect_status";
+constexpr char kExportSuccess[] = "success";
+constexpr char kExportFailed[] = "failed";
+
 using mlir::DialectRegistry;
 using mlir::MLIRContext;
 using mlir::ModuleOp;
 using mlir::OwningOpRef;
+using ::tensorflow::monitoring::testing::CellReader;
 
 std::string TestDataPath() {
   return tensorflow::GetDataDependencyFilepath(
@@ -71,15 +79,25 @@ class TensorflowDialectToExecutorTest : public ::testing::Test {
 };
 
 TEST_F(TensorflowDialectToExecutorTest, ConvertsToExecutor) {
+  CellReader<int64_t> compilation_status(kExportStreamzName);
+
   TF_ASSERT_OK(CreateMlirModule("empty_func.mlir"));
 
   TF_EXPECT_OK(ExportFromTensorflowDialectToExecutor(*mlir_module_));
+
+  EXPECT_EQ(compilation_status.Delta(kExportSuccess), 1);
+  EXPECT_EQ(compilation_status.Delta(kExportFailed), 0);
 }
 
 TEST_F(TensorflowDialectToExecutorTest, ErrorsWhenCannotConvert) {
+  CellReader<int64_t> compilation_status(kExportStreamzName);
+
   TF_ASSERT_OK(CreateMlirModule("invalid_executor.mlir"));
 
   EXPECT_FALSE(ExportFromTensorflowDialectToExecutor(*mlir_module_).ok());
+
+  EXPECT_EQ(compilation_status.Delta(kExportSuccess), 0);
+  EXPECT_EQ(compilation_status.Delta(kExportFailed), 1);
 }
 
 }  // namespace

From 464e057ee4e755bdef03ede114929b3fc94f154e Mon Sep 17 00:00:00 2001
From: Matthias Kramm <kramm@google.com>
Date: Tue, 3 Oct 2023 16:43:07 -0700
Subject: [PATCH 553/567] Verify, in the bridge, that we were able to replace
 all program keys.

PiperOrigin-RevId: 570533232
---
 .../tensorflow/tests/embedding_program_key.mlir |  8 ++++----
 .../transforms/embedding_program_key.cc         | 17 +++++++++++++++++
 2 files changed, 21 insertions(+), 4 deletions(-)

diff --git a/tensorflow/compiler/mlir/tensorflow/tests/embedding_program_key.mlir b/tensorflow/compiler/mlir/tensorflow/tests/embedding_program_key.mlir
index 6cf7cc91e73f4a..060e585b5f69de 100644
--- a/tensorflow/compiler/mlir/tensorflow/tests/embedding_program_key.mlir
+++ b/tensorflow/compiler/mlir/tensorflow/tests/embedding_program_key.mlir
@@ -137,11 +137,11 @@ func.func @launch_intermediate_usage() {
 // CHECK-LABEL: func @compile_not_in_launch
 func.func @compile_not_in_launch() {
   // CHECK: TPUCompileMlir
-  // CHECK: %[[CONSTANT:[a-z0-9]*]] = "tf.Const"
+  // CHECK: %[[CONSTANT:[a-z0-9]*]] = builtin.unrealized
   // CHECK: "tf.OpA"(%[[CONSTANT]]
    %compilation_status, %program = "tf._TPUCompileMlir"() { metadata = "...", mlir_module = "..." } : () -> (tensor<!tf_type.string>, tensor<3x!tf_type.string>)
   "tf_device.launch"() ({
-    %cst_0 = "tf.Const"() {value = dense<""> : tensor<1x!tf_type.string>} : () -> tensor<1x!tf_type.string>
+    %cst_0 = builtin.unrealized_conversion_cast to tensor<1x!tf_type.string>
     "tf.OpA"(%cst_0) { mini_batch_splits = ""} : (tensor<1x!tf_type.string>) -> ()
     tf_device.return
   }) {device = "/job:localhost/replica:0/task:0/device:CPU:0"} : () -> ()
@@ -187,7 +187,7 @@ func.func @only_compile_under_replicate() {
   // CHECK-DAG: [[COMPILE_REPLICATE:%[0-9]*]]:4 = tf_device.replicate
   // CHECK-NEXT: [[COMPILE_LAUNCH:%[0-9]*]]:2 = "tf_device.launch"
   // CHECK-DAG: _TPUCompileMlir
-  // CHECK-DAG: %[[CONSTANT:[a-z0-9]*]] = "tf.Const"
+  // CHECK-DAG: %[[CONSTANT:[a-z0-9]*]] = builtin.unrealized
   // CHECK: "tf.OpA"
   %0:4 = "tf_device.replicate"() ({
     %0:2 = "tf_device.launch"() ({
@@ -201,7 +201,7 @@ func.func @only_compile_under_replicate() {
       tensor<3x!tf_type.string>,
       tensor<3x!tf_type.string>
       )
-  %cst_0 = "tf.Const"() {value = dense<""> : tensor<1x!tf_type.string>} : () -> tensor<1x!tf_type.string>
+  %cst_0 = builtin.unrealized_conversion_cast to tensor<1x!tf_type.string>
   "tf.OpA"(%cst_0) { mini_batch_splits = ""} : (tensor<1x!tf_type.string>) -> ()
   return
 }
diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/embedding_program_key.cc b/tensorflow/compiler/mlir/tensorflow/transforms/embedding_program_key.cc
index 73bc4d13f028c9..f592587b3f9ca9 100644
--- a/tensorflow/compiler/mlir/tensorflow/transforms/embedding_program_key.cc
+++ b/tensorflow/compiler/mlir/tensorflow/transforms/embedding_program_key.cc
@@ -367,6 +367,19 @@ void RewritePreprocessInputs(OpBuilder* builder, func::FuncOp func_op,
   }
 }
 
+LogicalResult VerifyAllProgramKeyOperandsReplaced(Operation* module) {
+  WalkResult result = module->walk([&](Operation* op) {
+    if (!op->hasAttr(kMiniBatchSplitsAttr) && !op->hasAttr(kMiniBatchCsrAttr))
+      return WalkResult::advance();
+    Operation* defining = op->getOperand(0).getDefiningOp();
+    if (llvm::dyn_cast_or_null<TF::ConstOp>(defining)) {
+      return WalkResult(op->emitError("program key operand was not replaced"));
+    }
+    return WalkResult::advance();
+  });
+  return result.wasInterrupted() ? failure() : success();
+}
+
 void EmbeddingProgramKeyPass::runOnOperation() {
   // Find all of the relevant post processing ops.
   llvm::SmallVector<Operation*, 6> preprocess_ops;
@@ -401,6 +414,10 @@ void EmbeddingProgramKeyPass::runOnOperation() {
   for (Operation* preprocess_op : preprocess_ops) {
     RewritePreprocessInputs(&builder, getOperation(), preprocess_op);
   }
+
+  if (failed(VerifyAllProgramKeyOperandsReplaced(getOperation()))) {
+    signalPassFailure();
+  }
 }
 
 }  // anonymous namespace

From c89bf7c672e040770d3221ab1efc5e08364ff548 Mon Sep 17 00:00:00 2001
From: Reed Wanderman-Milne <reedwm@google.com>
Date: Tue, 3 Oct 2023 17:06:23 -0700
Subject: [PATCH 554/567] Do not allow int4 on CPU/GPU in HloVerifier.

Int4 is unsupported and currently gives incorrect results so it should not be allowed by HloVerifier. Int4 support will be added shortly, at which point the verifier will allow it.

PiperOrigin-RevId: 570538654
---
 third_party/xla/xla/service/BUILD             | 29 ++++++++
 third_party/xla/xla/service/cpu/BUILD         | 12 +---
 .../xla/xla/service/cpu/cpu_compiler.cc       |  5 +-
 .../xla/xla/service/cpu/cpu_shape_verifier.cc | 41 ------------
 .../xla/xla/service/cpu_gpu_shape_verifier.cc | 67 +++++++++++++++++++
 ...pe_verifier.h => cpu_gpu_shape_verifier.h} | 22 +++---
 .../service/cpu_gpu_shape_verifier_test.cc    | 57 ++++++++++++++++
 third_party/xla/xla/service/gpu/BUILD         | 16 +----
 .../xla/xla/service/gpu/fusion_pipeline.cc    |  4 +-
 .../xla/xla/service/gpu/gpu_compiler.cc       |  4 +-
 .../xla/xla/service/gpu/gpu_shape_verifier.cc | 41 ------------
 .../xla/xla/service/gpu/gpu_shape_verifier.h  | 49 --------------
 .../prepare_hlo_for_ir_emitting_pipeline.cc   |  4 +-
 13 files changed, 177 insertions(+), 174 deletions(-)
 delete mode 100644 third_party/xla/xla/service/cpu/cpu_shape_verifier.cc
 create mode 100644 third_party/xla/xla/service/cpu_gpu_shape_verifier.cc
 rename third_party/xla/xla/service/{cpu/cpu_shape_verifier.h => cpu_gpu_shape_verifier.h} (59%)
 create mode 100644 third_party/xla/xla/service/cpu_gpu_shape_verifier_test.cc
 delete mode 100644 third_party/xla/xla/service/gpu/gpu_shape_verifier.cc
 delete mode 100644 third_party/xla/xla/service/gpu/gpu_shape_verifier.h

diff --git a/third_party/xla/xla/service/BUILD b/third_party/xla/xla/service/BUILD
index 25354ba2badc4f..71fc180b0fd514 100644
--- a/third_party/xla/xla/service/BUILD
+++ b/third_party/xla/xla/service/BUILD
@@ -4742,6 +4742,35 @@ xla_cc_test(
     ],
 )
 
+cc_library(
+    name = "cpu_gpu_shape_verifier",
+    srcs = ["cpu_gpu_shape_verifier.cc"],
+    hdrs = ["cpu_gpu_shape_verifier.h"],
+    visibility = ["//visibility:public"],
+    deps = [
+        ":hlo_verifier",
+        "//xla:shape_util",
+        "//xla:status",
+        "//xla:util",
+        "//xla/hlo/ir:hlo",
+        "@com_google_absl//absl/status",
+    ],
+)
+
+xla_cc_test(
+    name = "cpu_gpu_shape_verifier_test",
+    srcs = ["cpu_gpu_shape_verifier_test.cc"],
+    deps = [
+        ":cpu_gpu_shape_verifier",
+        ":hlo_parser",
+        ":hlo_verifier",
+        "//xla/tests:hlo_test_base",
+        "//xla/tests:xla_internal_test_main",
+        "@com_google_googletest//:gtest",
+        "@local_tsl//tsl/platform:statusor",
+    ],
+)
+
 cc_library(
     name = "hlo_rematerialization",
     srcs = ["hlo_rematerialization.cc"],
diff --git a/third_party/xla/xla/service/cpu/BUILD b/third_party/xla/xla/service/cpu/BUILD
index 95269b75edd022..6e0ea6134356e2 100644
--- a/third_party/xla/xla/service/cpu/BUILD
+++ b/third_party/xla/xla/service/cpu/BUILD
@@ -212,7 +212,6 @@ cc_library(
         ":cpu_instruction_fusion",
         ":cpu_layout_assignment",
         ":cpu_options",
-        ":cpu_shape_verifier",
         ":dot_op_emitter",
         ":executable_proto_cc",
         ":hlo_xla_runtime_pipeline",
@@ -273,6 +272,7 @@ cc_library(
         "//xla/service:conditional_to_select",
         "//xla/service:convolution_group_converter",
         "//xla/service:copy_insertion",
+        "//xla/service:cpu_gpu_shape_verifier",
         "//xla/service:dot_decomposer",
         "//xla/service:dump",
         "//xla/service:dynamic_dimension_inference",
@@ -1388,16 +1388,6 @@ cc_library(
     ],
 )
 
-cc_library(
-    name = "cpu_shape_verifier",
-    srcs = ["cpu_shape_verifier.cc"],
-    hdrs = ["cpu_shape_verifier.h"],
-    visibility = ["//visibility:public"],
-    deps = [
-        "//xla/service:hlo_verifier",
-    ],
-)
-
 xla_cc_test(
     name = "cpu_layout_assignment_test",
     size = "small",
diff --git a/third_party/xla/xla/service/cpu/cpu_compiler.cc b/third_party/xla/xla/service/cpu/cpu_compiler.cc
index f74baacb59c697..e519cf051f9c40 100644
--- a/third_party/xla/xla/service/cpu/cpu_compiler.cc
+++ b/third_party/xla/xla/service/cpu/cpu_compiler.cc
@@ -141,7 +141,6 @@ limitations under the License.
 #include "xla/service/cpu/cpu_instruction_fusion.h"
 #include "xla/service/cpu/cpu_layout_assignment.h"
 #include "xla/service/cpu/cpu_options.h"
-#include "xla/service/cpu/cpu_shape_verifier.h"
 #include "xla/service/cpu/dot_op_emitter.h"
 #include "xla/service/cpu/hlo_xla_runtime_pipeline.h"
 #include "xla/service/cpu/ir_emitter.h"
@@ -155,6 +154,7 @@ limitations under the License.
 #include "xla/service/cpu/simple_orc_jit.h"
 #include "xla/service/cpu/target_machine_features.h"
 #include "xla/service/cpu/xla_framework.h"
+#include "xla/service/cpu_gpu_shape_verifier.h"
 #include "xla/service/dot_decomposer.h"
 #include "xla/service/dump.h"
 #include "xla/service/dynamic_dimension_inference.h"
@@ -603,7 +603,8 @@ void AddHloVerifier(HloPassPipeline* pipeline, bool allow_sparse_shapes,
     verifier_metadata =
         std::make_unique<DefaultVerifierMetadata>(std::move(opts));
   } else {
-    verifier_metadata = std::make_unique<CpuVerifierMetadata>(std::move(opts));
+    verifier_metadata =
+        std::make_unique<CpuGpuVerifierMetadata>(std::move(opts));
   }
   if (debug_only) {
     pipeline->AddInvariantCheckerDebug<HloVerifier>(
diff --git a/third_party/xla/xla/service/cpu/cpu_shape_verifier.cc b/third_party/xla/xla/service/cpu/cpu_shape_verifier.cc
deleted file mode 100644
index 278d5ed09c2ab0..00000000000000
--- a/third_party/xla/xla/service/cpu/cpu_shape_verifier.cc
+++ /dev/null
@@ -1,41 +0,0 @@
-/* Copyright 2022 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "xla/service/cpu/cpu_shape_verifier.h"
-
-namespace xla {
-
-Status CpuShapeVerifier::Preprocess(HloInstruction* hlo) {
-  TF_RETURN_IF_ERROR(ShapeUtil::ForEachSubshapeWithStatus(
-      hlo->shape(), [&](const Shape& shape, const ShapeIndex&) {
-        if (shape.has_layout()) {
-          if (LayoutUtil::IsSparseArray(shape)) {
-            return InvalidArgument(
-                "The XLA CPU backend does not support sparse shapes: %s",
-                hlo->ToString());
-          }
-          if (shape.layout().element_size_in_bits() != 0) {
-            return InvalidArgument(
-                "The XLA CPU backend does not support custom element sizes: %s",
-                hlo->ToString());
-          }
-        }
-        return OkStatus();
-      }));
-
-  return ShapeVerifier::Preprocess(hlo);
-}
-
-}  // namespace xla
diff --git a/third_party/xla/xla/service/cpu_gpu_shape_verifier.cc b/third_party/xla/xla/service/cpu_gpu_shape_verifier.cc
new file mode 100644
index 00000000000000..82bf70df60eba3
--- /dev/null
+++ b/third_party/xla/xla/service/cpu_gpu_shape_verifier.cc
@@ -0,0 +1,67 @@
+/* Copyright 2023 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "xla/service/cpu_gpu_shape_verifier.h"
+
+#include "absl/status/status.h"
+#include "xla/hlo/ir/hlo_instruction.h"
+#include "xla/shape.h"
+#include "xla/shape_util.h"
+#include "xla/status.h"
+
+namespace xla {
+
+namespace {
+
+bool HasInt4(const Shape& shape) {
+  return ShapeUtil::HasPrimitiveType(shape, S4) ||
+         ShapeUtil::HasPrimitiveType(shape, U4);
+}
+
+Status VerifyS4U4Usage(HloInstruction* instruction) {
+  if (HasInt4(instruction->shape())) {
+    return absl::InvalidArgumentError(absl::StrFormat(
+        "S4/U4 is currently not support on XLA CPU/GPU, but got instruction "
+        "with S4/U4 output: %s",
+        instruction->ToString()));
+  }
+  return OkStatus();
+}
+}  // namespace
+
+Status CpuGpuShapeVerifier::Preprocess(HloInstruction* hlo) {
+  TF_RETURN_IF_ERROR(ShapeUtil::ForEachSubshapeWithStatus(
+      hlo->shape(), [&](const Shape& shape, const ShapeIndex&) {
+        if (shape.has_layout()) {
+          if (LayoutUtil::IsSparseArray(shape)) {
+            return absl::InvalidArgumentError(absl::StrFormat(
+                "The XLA CPU/GPU backend does not support sparse shapes: %s",
+                hlo->ToString()));
+          }
+          if (shape.layout().element_size_in_bits() != 0) {
+            return absl::InvalidArgumentError(absl::StrFormat(
+                "The XLA CPU/GPU backend does not support custom element "
+                "sizes: %s",
+                hlo->ToString()));
+          }
+        }
+        return OkStatus();
+      }));
+
+  TF_RETURN_IF_ERROR(VerifyS4U4Usage(hlo));
+  return ShapeVerifier::Preprocess(hlo);
+}
+
+}  // namespace xla
diff --git a/third_party/xla/xla/service/cpu/cpu_shape_verifier.h b/third_party/xla/xla/service/cpu_gpu_shape_verifier.h
similarity index 59%
rename from third_party/xla/xla/service/cpu/cpu_shape_verifier.h
rename to third_party/xla/xla/service/cpu_gpu_shape_verifier.h
index 4e6b6f4948f336..ef43fcc59670e2 100644
--- a/third_party/xla/xla/service/cpu/cpu_shape_verifier.h
+++ b/third_party/xla/xla/service/cpu_gpu_shape_verifier.h
@@ -1,4 +1,4 @@
-/* Copyright 2022 The TensorFlow Authors. All Rights Reserved.
+/* Copyright 2023 The TensorFlow Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef XLA_SERVICE_CPU_CPU_SHAPE_VERIFIER_H_
-#define XLA_SERVICE_CPU_CPU_SHAPE_VERIFIER_H_
+#ifndef XLA_SERVICE_CPU_GPU_SHAPE_VERIFIER_H_
+#define XLA_SERVICE_CPU_GPU_SHAPE_VERIFIER_H_
 
 #include <memory>
 #include <utility>
@@ -23,26 +23,26 @@ limitations under the License.
 
 namespace xla {
 
-// Verifies that HLO Shapes are supported by the XLA-CPU compiler.
-class CpuShapeVerifier : public ShapeVerifier {
+// Verifies that HLO Shapes are supported by the XLA-CPU and XLA-GPU compilers.
+class CpuGpuShapeVerifier : public ShapeVerifier {
  public:
-  explicit CpuShapeVerifier(const HloVerifierOpts& opts)
+  explicit CpuGpuShapeVerifier(const HloVerifierOpts& opts)
       : ShapeVerifier(opts) {}
 
   Status Preprocess(HloInstruction* hlo) override;
 };
 
-// A verifier metadata class that uses the CpuShapeVerifier.
-class CpuVerifierMetadata : public TargetVerifierMetadata {
+// A verifier metadata class that uses the CpuGpuShapeVerifier.
+class CpuGpuVerifierMetadata : public TargetVerifierMetadata {
  public:
-  explicit CpuVerifierMetadata(HloVerifierOpts&& opts)
+  explicit CpuGpuVerifierMetadata(HloVerifierOpts&& opts)
       : TargetVerifierMetadata(std::move(opts)) {}
 
   std::unique_ptr<ShapeVerifier> GetVerifier() const override {
-    return std::make_unique<CpuShapeVerifier>(GetVerifierOpts());
+    return std::make_unique<CpuGpuShapeVerifier>(GetVerifierOpts());
   }
 };
 
 }  // namespace xla
 
-#endif  // XLA_SERVICE_CPU_CPU_SHAPE_VERIFIER_H_
+#endif  // XLA_SERVICE_CPU_GPU_SHAPE_VERIFIER_H_
diff --git a/third_party/xla/xla/service/cpu_gpu_shape_verifier_test.cc b/third_party/xla/xla/service/cpu_gpu_shape_verifier_test.cc
new file mode 100644
index 00000000000000..8b5cd456d65fe1
--- /dev/null
+++ b/third_party/xla/xla/service/cpu_gpu_shape_verifier_test.cc
@@ -0,0 +1,57 @@
+/* Copyright 2023 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "xla/service/cpu_gpu_shape_verifier.h"
+
+#include <memory>
+#include <utility>
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+#include "xla/service/hlo_parser.h"
+#include "xla/service/hlo_verifier.h"
+#include "xla/tests/hlo_test_base.h"
+#include "tsl/platform/statusor.h"
+
+namespace xla {
+namespace {
+
+using CpuGpuShapeVerifierTest = HloTestBase;
+using ::testing::HasSubstr;
+
+TEST_F(CpuGpuShapeVerifierTest, Int4NotSupported) {
+  const char* const hlo_string = R"(
+  HloModule Module
+
+  ENTRY main {
+    p0 = u4[10] parameter(0)
+    ROOT out = u8[10] convert(p0)
+  }
+  )";
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          ParseAndReturnUnverifiedModule(hlo_string));
+
+  HloVerifierOpts opts;
+  std::unique_ptr<TargetVerifierMetadata> metadata =
+      std::make_unique<CpuGpuVerifierMetadata>(std::move(opts));
+  HloVerifier hlo_verifier(std::move(metadata));
+  auto status = hlo_verifier.Run(module.get()).status();
+  ASSERT_FALSE(status.ok());
+  EXPECT_THAT(status.message(),
+              HasSubstr("S4/U4 is currently not support on XLA CPU/GPU"));
+}
+
+}  // namespace
+}  // namespace xla
diff --git a/third_party/xla/xla/service/gpu/BUILD b/third_party/xla/xla/service/gpu/BUILD
index 63cba952ff6b7b..9d3c5d02a6cdb1 100644
--- a/third_party/xla/xla/service/gpu/BUILD
+++ b/third_party/xla/xla/service/gpu/BUILD
@@ -2505,7 +2505,6 @@ cc_library(
     deps = [
         ":fusion_merger",
         ":gpu_hlo_cost_analysis",
-        ":gpu_shape_verifier",
         ":horizontal_input_fusion",
         ":horizontal_loop_fusion",
         ":instruction_fusion",
@@ -2513,6 +2512,7 @@ cc_library(
         ":priority_fusion",
         ":variadic_op_splitter",
         "//xla:xla_proto_cc",
+        "//xla/service:cpu_gpu_shape_verifier",
         "//xla/service:hlo_cost_analysis",
         "//xla/service:hlo_cse",
         "//xla/service:hlo_dce",
@@ -2533,11 +2533,11 @@ cc_library(
         ":alias_passthrough_params",
         ":copy_fusion",
         ":gpu_sanitize_constant_names",
-        ":gpu_shape_verifier",
         ":horizontal_loop_fusion",
         "//xla:xla_proto_cc",
         "//xla/hlo/ir:hlo",
         "//xla/service:copy_insertion",
+        "//xla/service:cpu_gpu_shape_verifier",
         "//xla/service:hlo_dataflow_analysis",
         "//xla/service:hlo_dce",
         "//xla/service:hlo_pass_pipeline",
@@ -2602,7 +2602,6 @@ cc_library(
         ":gpu_all_gather_optimizer",
         ":gpu_sanitize_constant_names",
         ":gpu_scatter_expander",
-        ":gpu_shape_verifier",
         ":gpu_target_config",
         ":hlo_fusion_stats",
         ":horizontal_input_fusion",
@@ -2673,6 +2672,7 @@ cc_library(
         "//xla/service:conditional_canonicalizer",
         "//xla/service:conditional_simplifier",
         "//xla/service:convert_async_collectives_to_sync",
+        "//xla/service:cpu_gpu_shape_verifier",
         "//xla/service:convert_mover",
         "//xla/service:convolution_4d_expander",
         "//xla/service:convolution_pred_expander",
@@ -3077,16 +3077,6 @@ cc_library(
     ],
 )
 
-cc_library(
-    name = "gpu_shape_verifier",
-    srcs = ["gpu_shape_verifier.cc"],
-    hdrs = ["gpu_shape_verifier.h"],
-    visibility = ["//visibility:public"],
-    deps = [
-        "//xla/service:hlo_verifier",
-    ],
-)
-
 cc_library(
     name = "gpu_layout_assignment",
     srcs = ["gpu_layout_assignment.cc"],
diff --git a/third_party/xla/xla/service/gpu/fusion_pipeline.cc b/third_party/xla/xla/service/gpu/fusion_pipeline.cc
index 584d6776895011..8520db83adcb99 100644
--- a/third_party/xla/xla/service/gpu/fusion_pipeline.cc
+++ b/third_party/xla/xla/service/gpu/fusion_pipeline.cc
@@ -18,9 +18,9 @@ limitations under the License.
 #include <memory>
 #include <utility>
 
+#include "xla/service/cpu_gpu_shape_verifier.h"
 #include "xla/service/gpu/fusion_merger.h"
 #include "xla/service/gpu/gpu_hlo_cost_analysis.h"
-#include "xla/service/gpu/gpu_shape_verifier.h"
 #include "xla/service/gpu/horizontal_input_fusion.h"
 #include "xla/service/gpu/horizontal_loop_fusion.h"
 #include "xla/service/gpu/instruction_fusion.h"
@@ -49,7 +49,7 @@ HloPassPipeline FusionPipeline(
   // to avoid exceeding the parameter space.
   fusion.AddPass<VariadicOpSplitter>();
   fusion.AddInvariantCheckerDebug<HloVerifier>(
-      std::make_unique<GpuVerifierMetadata>(
+      std::make_unique<CpuGpuVerifierMetadata>(
           HloVerifierOpts()
               .MakeLayoutSensitive()
               .WithInstructionCanChangeLayout(
diff --git a/third_party/xla/xla/service/gpu/gpu_compiler.cc b/third_party/xla/xla/service/gpu/gpu_compiler.cc
index 3f5545c0294bf1..2051b4ccc526fa 100644
--- a/third_party/xla/xla/service/gpu/gpu_compiler.cc
+++ b/third_party/xla/xla/service/gpu/gpu_compiler.cc
@@ -75,6 +75,7 @@ limitations under the License.
 #include "xla/service/convolution_4d_expander.h"
 #include "xla/service/convolution_pred_expander.h"
 #include "xla/service/copy_insertion.h"
+#include "xla/service/cpu_gpu_shape_verifier.h"
 #include "xla/service/dot_decomposer.h"
 #include "xla/service/dot_merger.h"
 #include "xla/service/dump.h"
@@ -114,7 +115,6 @@ limitations under the License.
 #include "xla/service/gpu/gpu_reduce_scatter_creator.h"
 #include "xla/service/gpu/gpu_sanitize_constant_names.h"
 #include "xla/service/gpu/gpu_scatter_expander.h"
-#include "xla/service/gpu/gpu_shape_verifier.h"
 #include "xla/service/gpu/hlo_fusion_stats.h"
 #include "xla/service/gpu/horizontal_loop_fusion.h"
 #include "xla/service/gpu/ir_emission_utils.h"
@@ -274,7 +274,7 @@ namespace {
 void AddHloVerifier(HloPassPipeline* pipeline, HloVerifierOpts&& opts = {},
                     bool debug_only = false) {
   std::unique_ptr<TargetVerifierMetadata> verifier_metadata =
-      std::make_unique<GpuVerifierMetadata>(std::move(opts));
+      std::make_unique<CpuGpuVerifierMetadata>(std::move(opts));
   if (debug_only) {
     pipeline->AddInvariantCheckerDebug<HloVerifier>(
         std::move(verifier_metadata), "hlo verifier (debug)");
diff --git a/third_party/xla/xla/service/gpu/gpu_shape_verifier.cc b/third_party/xla/xla/service/gpu/gpu_shape_verifier.cc
deleted file mode 100644
index 41af09c059ee91..00000000000000
--- a/third_party/xla/xla/service/gpu/gpu_shape_verifier.cc
+++ /dev/null
@@ -1,41 +0,0 @@
-/* Copyright 2022 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "xla/service/gpu/gpu_shape_verifier.h"
-
-namespace xla {
-
-Status GpuShapeVerifier::Preprocess(HloInstruction* hlo) {
-  TF_RETURN_IF_ERROR(ShapeUtil::ForEachSubshapeWithStatus(
-      hlo->shape(), [&](const Shape& shape, const ShapeIndex&) {
-        if (shape.has_layout()) {
-          if (LayoutUtil::IsSparseArray(shape)) {
-            return InvalidArgument(
-                "The XLA GPU backend does not support sparse shapes: %s",
-                hlo->ToString());
-          }
-          if (shape.layout().element_size_in_bits() != 0) {
-            return InvalidArgument(
-                "The XLA GPU backend does not support custom element sizes: %s",
-                hlo->ToString());
-          }
-        }
-        return OkStatus();
-      }));
-
-  return ShapeVerifier::Preprocess(hlo);
-}
-
-}  // namespace xla
diff --git a/third_party/xla/xla/service/gpu/gpu_shape_verifier.h b/third_party/xla/xla/service/gpu/gpu_shape_verifier.h
deleted file mode 100644
index d2ff162041dd32..00000000000000
--- a/third_party/xla/xla/service/gpu/gpu_shape_verifier.h
+++ /dev/null
@@ -1,49 +0,0 @@
-/* Copyright 2022 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef XLA_SERVICE_GPU_GPU_SHAPE_VERIFIER_H_
-#define XLA_SERVICE_GPU_GPU_SHAPE_VERIFIER_H_
-
-#include <functional>
-#include <memory>
-#include <utility>
-
-#include "xla/service/hlo_verifier.h"
-
-namespace xla {
-
-// Verifies that HLO Shapes are supported by the XLA-GPU compiler.
-class GpuShapeVerifier : public ShapeVerifier {
- public:
-  explicit GpuShapeVerifier(const HloVerifierOpts& opts)
-      : ShapeVerifier(opts) {}
-
-  Status Preprocess(HloInstruction* hlo) override;
-};
-
-// A verifier metadata class that uses the GpuShapeVerifier.
-class GpuVerifierMetadata : public TargetVerifierMetadata {
- public:
-  explicit GpuVerifierMetadata(HloVerifierOpts&& opts)
-      : TargetVerifierMetadata(std::move(opts)) {}
-
-  std::unique_ptr<ShapeVerifier> GetVerifier() const override {
-    return std::make_unique<GpuShapeVerifier>(GetVerifierOpts());
-  }
-};
-
-}  // namespace xla
-
-#endif  // XLA_SERVICE_GPU_GPU_SHAPE_VERIFIER_H_
diff --git a/third_party/xla/xla/service/gpu/prepare_hlo_for_ir_emitting_pipeline.cc b/third_party/xla/xla/service/gpu/prepare_hlo_for_ir_emitting_pipeline.cc
index affdd2ffa1d91c..34cdfec28a0deb 100644
--- a/third_party/xla/xla/service/gpu/prepare_hlo_for_ir_emitting_pipeline.cc
+++ b/third_party/xla/xla/service/gpu/prepare_hlo_for_ir_emitting_pipeline.cc
@@ -20,10 +20,10 @@ limitations under the License.
 
 #include "xla/hlo/ir/hlo_module.h"
 #include "xla/service/copy_insertion.h"
+#include "xla/service/cpu_gpu_shape_verifier.h"
 #include "xla/service/gpu/alias_passthrough_params.h"
 #include "xla/service/gpu/copy_fusion.h"
 #include "xla/service/gpu/gpu_sanitize_constant_names.h"
-#include "xla/service/gpu/gpu_shape_verifier.h"
 #include "xla/service/gpu/horizontal_loop_fusion.h"
 #include "xla/service/hlo_dataflow_analysis.h"
 #include "xla/service/hlo_dce.h"
@@ -48,7 +48,7 @@ HloPassPipeline PrepareHloModuleForIrEmittingPipeline(
   // the parameter.
   HloPassPipeline pipeline("GPU-ir-emit-prepare");
   std::unique_ptr<TargetVerifierMetadata> verifier_metadata =
-      std::make_unique<GpuVerifierMetadata>(
+      std::make_unique<CpuGpuVerifierMetadata>(
           HloVerifierOpts{}
               .MakeLayoutSensitive()
               .WithInstructionCanChangeLayout(

From 055de2ca51395eb86aa3ef084b858c1e0e02c948 Mon Sep 17 00:00:00 2001
From: Michael Delorimier <mdel@google.com>
Date: Tue, 3 Oct 2023 17:14:17 -0700
Subject: [PATCH 555/567] Report an error when
 extract_tpu_copy_with_dynamic_shape gets an unexpected device.

PiperOrigin-RevId: 570540308
---
 ...xtract_tpu_copy_with_dynamic_shape_op.mlir | 16 +++++++++++++
 .../extract_tpu_copy_with_dynamic_shape_op.cc | 24 ++++++++++++++-----
 2 files changed, 34 insertions(+), 6 deletions(-)

diff --git a/tensorflow/compiler/mlir/tensorflow/tests/extract_tpu_copy_with_dynamic_shape_op.mlir b/tensorflow/compiler/mlir/tensorflow/tests/extract_tpu_copy_with_dynamic_shape_op.mlir
index 32dcea44445e53..e87b1602cac992 100644
--- a/tensorflow/compiler/mlir/tensorflow/tests/extract_tpu_copy_with_dynamic_shape_op.mlir
+++ b/tensorflow/compiler/mlir/tensorflow/tests/extract_tpu_copy_with_dynamic_shape_op.mlir
@@ -58,3 +58,19 @@ func.func @copy_and_send(%arg0: tensor<65536xi64>, %arg1: tensor<1x!tf_type.stri
     }) {device = "TPU_REPLICATED_HOST_0"} : () -> ()
   return
 }
+
+// -----
+
+func.func @bad_host0(
+  %arg0: tensor<2048xi64> {tf.device = "/job:localhost/replica:0/task:0/device:CPU:0"},
+  %arg1: tensor<2048xi64> {tf.device = "/job:localhost/replica:0/task:0/device:CPU:0"}) -> (tensor<2048xi32>, tensor<2048xi32>) {
+  %cst = "tf.Const"() {value = dense<1024> : tensor<i32>} : () -> tensor<i32>
+  %0:2 = "tf_device.launch"() ({
+    %1 = "tf.Cast"(%arg0) {Truncate = false} : (tensor<2048xi64>) -> tensor<2048xi32>
+    %2 = "tf.Cast"(%arg1) {Truncate = false} : (tensor<2048xi64>) -> tensor<2048xi32>
+    // expected-error @+1 {{device is not a recognized host 0}}
+    %3:2 = "tf.TPUCopyWithDynamicShape"(%1, %2, %cst, %cst) {operandSegmentSizes = array<i32: 2, 2>} : (tensor<2048xi32>, tensor<2048xi32>, tensor<i32>, tensor<i32>) -> (tensor<2048xi32>, tensor<2048xi32>)
+    tf_device.return %3#0, %3#1 : tensor<2048xi32>, tensor<2048xi32>
+  }) {device = "/job:localhost/replica:0/task:1/device:CPU:0"} : () -> (tensor<2048xi32>, tensor<2048xi32>)
+  return %0#0, %0#1: tensor<2048xi32>, tensor<2048xi32>
+}
diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/extract_tpu_copy_with_dynamic_shape_op.cc b/tensorflow/compiler/mlir/tensorflow/transforms/extract_tpu_copy_with_dynamic_shape_op.cc
index d41e83bf75aec7..fdcfcb3470059b 100644
--- a/tensorflow/compiler/mlir/tensorflow/transforms/extract_tpu_copy_with_dynamic_shape_op.cc
+++ b/tensorflow/compiler/mlir/tensorflow/transforms/extract_tpu_copy_with_dynamic_shape_op.cc
@@ -20,7 +20,9 @@ limitations under the License.
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/Support/Casting.h"
 #include "mlir/IR/Value.h"  // from @llvm-project
+#include "mlir/IR/Visitors.h"  // from @llvm-project
 #include "mlir/Pass/Pass.h"  // from @llvm-project
+#include "mlir/Support/LogicalResult.h"  // from @llvm-project
 #include "tensorflow/compiler/mlir/tensorflow/ir/tf_device.h"
 #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h"
 #include "tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util.h"
@@ -53,12 +55,20 @@ Operation* GetOpOfValue(Value value) {
 // Check if the TPUCopyWithDynamicShapeOp is valid.
 // 1. The op should be wrapped inside a launch op.
 // 2. The wrapped launch op should be placed on CPU.
-bool IsOpValid(Operation* op) {
+LogicalResult CheckOpIsValid(Operation* op) {
   auto launch_op = llvm::dyn_cast<tf_device::LaunchOp>(op->getParentOp());
-  if (!launch_op) return false;
+  if (!launch_op) {
+    return op->emitError() << "TPUCopyWithDynamicShapeOp is not in a launch";
+  }
   std::string device_str = launch_op.getDeviceAttr().getValue().str();
-  return device_str == tensorflow::GetDeviceAliasForHostOfLogicalCore(0) ||
-         device_str == "/job:localhost/replica:0/task:0/device:CPU:0";
+  if (device_str != tensorflow::GetDeviceAliasForHostOfLogicalCore(0) &&
+      device_str != "/job:localhost/replica:0/task:0/device:CPU:0") {
+    op->emitError()
+        << "TPUCopyWithDynamicShapeOp's device is not a recognized host 0: "
+        << device_str;
+    return failure();
+  }
+  return success();
 }
 
 // Check if we can move TPUCopyWithDynamicShapeOp out of a launch. This is the
@@ -167,14 +177,16 @@ void UpdateReturnOpResultWithLaunchOpResult(tf_device::LaunchOp* launch_op) {
 
 void ExtractTPUCopyWithDynamicShapeOpPass::runOnOperation() {
   llvm::SmallVector<Operation*, 4> tpu_copy_with_dynamic_shape_ops;
-  getOperation().walk([&](Operation* op) {
+  auto walk_result = getOperation().walk([&](Operation* op) {
     if (isa<TF::TPUCopyWithDynamicShapeOp>(op)) {
-      if (!IsOpValid(op)) return signalPassFailure();
+      if (failed(CheckOpIsValid(op))) return WalkResult::interrupt();
       if (CanMove(op)) {
         tpu_copy_with_dynamic_shape_ops.push_back(op);
       }
     }
+    return WalkResult::advance();
   });
+  if (walk_result.wasInterrupted()) return signalPassFailure();
 
   for (Operation* op : tpu_copy_with_dynamic_shape_ops) {
     OpBuilder builder(op);

From ad67d969389f5c1648e0b81002ed99fb3d9d44a4 Mon Sep 17 00:00:00 2001
From: Jie Sun <jiesun@google.com>
Date: Tue, 3 Oct 2023 17:17:33 -0700
Subject: [PATCH 556/567] add core type in schema

PiperOrigin-RevId: 570540973
---
 .../third_party/tsl/tsl/profiler/utils/BUILD  |  7 ++-
 .../tsl/profiler/utils/preprocess_xplane.cc   |  4 +-
 .../tsl/profiler/utils/preprocess_xplane.h    | 49 ++++++++++++++++---
 .../tsl/tsl/profiler/utils/xplane_schema.cc   |  1 +
 .../tsl/tsl/profiler/utils/xplane_schema.h    |  1 +
 5 files changed, 50 insertions(+), 12 deletions(-)

diff --git a/third_party/xla/third_party/tsl/tsl/profiler/utils/BUILD b/third_party/xla/third_party/tsl/tsl/profiler/utils/BUILD
index ecf2b00b7cdbd8..0c3e736a915fb3 100644
--- a/third_party/xla/third_party/tsl/tsl/profiler/utils/BUILD
+++ b/third_party/xla/third_party/tsl/tsl/profiler/utils/BUILD
@@ -1,8 +1,8 @@
+load("//tsl:tsl.bzl", "set_external_visibility")
+load("//tsl/platform:build_config.bzl", "tsl_cc_test")
 load("//tsl/platform:build_config_root.bzl", "if_static")
 load("//tsl/platform:rules_cc.bzl", "cc_library")
 load("//tsl/profiler/builds:build_config.bzl", "tf_profiler_copts")
-load("//tsl/platform:build_config.bzl", "tsl_cc_test")
-load("//tsl:tsl.bzl", "set_external_visibility")
 
 package(
     default_visibility = ["//visibility:public"],
@@ -421,11 +421,10 @@ cc_library(
         ":trace_utils",
         ":xplane_builder",
         ":xplane_schema",
-        "//tsl/profiler/lib:connected_traceme",
         "//tsl/profiler/lib:context_types",
         "//tsl/profiler/protobuf:xplane_proto_cc",
-        "@com_google_absl//absl/algorithm:container",
         "@com_google_absl//absl/container:flat_hash_map",
+        "@com_google_absl//absl/hash",
         "@com_google_absl//absl/memory",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/types:optional",
diff --git a/third_party/xla/third_party/tsl/tsl/profiler/utils/preprocess_xplane.cc b/third_party/xla/third_party/tsl/tsl/profiler/utils/preprocess_xplane.cc
index 029e53b8abcdee..15aedbc9d495b4 100644
--- a/third_party/xla/third_party/tsl/tsl/profiler/utils/preprocess_xplane.cc
+++ b/third_party/xla/third_party/tsl/tsl/profiler/utils/preprocess_xplane.cc
@@ -138,7 +138,9 @@ CreateMutatorFactories() {
           /*unique_stats=*/true,
           XContextStatsAccessor<uint64_t, StatType::kDeviceOrdinal>,
           XContextStatsAccessor<uint64_t, StatType::kQueueId>,
-          XContextStatsAccessor<uint64_t, StatType::kRunId>>::CreateFactory());
+          XContextStatsAccessor<uint64_t, StatType::kRunId>,
+          XContextStatsAccessorWithDefault<uint64_t, StatType::kCoreType,
+                                           0ULL>>::CreateFactory());
 
   mutator_factories.push_back(TpuModuleLineMutatorFactory::CreateFactory());
   return mutator_factories;
diff --git a/third_party/xla/third_party/tsl/tsl/profiler/utils/preprocess_xplane.h b/third_party/xla/third_party/tsl/tsl/profiler/utils/preprocess_xplane.h
index b3e2d4b09c3f66..6ccfadb61bb4d7 100644
--- a/third_party/xla/third_party/tsl/tsl/profiler/utils/preprocess_xplane.h
+++ b/third_party/xla/third_party/tsl/tsl/profiler/utils/preprocess_xplane.h
@@ -17,7 +17,6 @@ limitations under the License.
 #define TENSORFLOW_TSL_PROFILER_UTILS_PREPROCESS_XPLANE_H_
 
 #include <cstdint>
-#include <initializer_list>
 #include <memory>
 #include <optional>
 #include <tuple>
@@ -26,7 +25,7 @@ limitations under the License.
 #include <variant>
 #include <vector>
 
-#include "absl/algorithm/container.h"
+#include "absl/hash/hash.h"
 #include "absl/memory/memory.h"
 #include "absl/strings/string_view.h"
 #include "absl/types/optional.h"
@@ -50,7 +49,7 @@ class XplaneEventMutator {
  public:
   virtual ~XplaneEventMutator() = default;
 
-  // Mutate event by event specifiedd by the event_metadata.
+  // Mutate event by event specified by the event_metadata.
   virtual void Mutate(XEventBuilder* builder) = 0;
   // Mutate line by line if event_metadata() return nullptr.
   virtual void MutateEventsInLine(XLineBuilder* line) = 0;
@@ -154,6 +153,31 @@ class XContextStatsAccessor {
   XStatMetadata* stats_metadata_ = nullptr;
 };
 
+template <typename StatValueType, StatType kStatId, StatValueType kDefaultValue>
+class XContextStatsAccessorWithDefault {
+ public:
+  using value_type = StatValueType;
+
+  bool Initialize(XPlaneBuilder* xplane) {
+    stats_metadata_ = xplane->GetStatMetadata(GetStatTypeStr(kStatId));
+    return true;  // Always return true, even if stats metadata doesn't exist.
+  }
+
+  std::optional<StatValueType> GetStat(XEventBuilder* event_builder) {
+    if (stats_metadata_ == nullptr) return kDefaultValue;
+    auto* stat = event_builder->GetStat(*stats_metadata_);
+    if (stat == nullptr) return kDefaultValue;
+    if constexpr (std::is_integral_v<StatValueType>) {
+      return event_builder->IntOrUintValue(*stat);
+    } else {
+      return event_builder->StrOrRefValue(*stat);
+    }
+  }
+
+ private:
+  XStatMetadata* stats_metadata_ = nullptr;
+};
+
 // A template helper for tuple manipulation, although std::apply can achieve
 // similar result. However it requires C++ 17, TF windows bot is still C++ 14.
 template <std::size_t... Idx>
@@ -363,11 +387,15 @@ class TpuModuleLineMutatorFactory : public XplaneEventMutatorFactory {
         XContextStatsAccessor<uint64_t, StatType::kQueueId>
             queue_id_stats_accessor;
         XContextStatsAccessor<uint64_t, StatType::kRunId> run_id_stats_accessor;
+        XContextStatsAccessorWithDefault<uint64_t, StatType::kCoreType, 0ULL>
+            core_type_stats_accessor;
         queue_id_stats_accessor.Initialize(xplane);
         run_id_stats_accessor.Initialize(xplane);
+        core_type_stats_accessor.Initialize(xplane);
         mutators.emplace_back(std::make_unique<TpuModuleLineMutator>(
             *device_ordinal, context_type_metadata, context_id_metadata,
-            queue_id_stats_accessor, run_id_stats_accessor));
+            queue_id_stats_accessor, run_id_stats_accessor,
+            core_type_stats_accessor));
       }
     }
     return mutators;
@@ -383,13 +411,16 @@ class TpuModuleLineMutatorFactory : public XplaneEventMutatorFactory {
         XStatMetadata* context_id_metadata,
         XContextStatsAccessor<uint64_t, StatType::kQueueId>
             queue_id_stats_accessor,
-        XContextStatsAccessor<uint64_t, StatType::kRunId> run_id_stats_accessor)
+        XContextStatsAccessor<uint64_t, StatType::kRunId> run_id_stats_accessor,
+        XContextStatsAccessorWithDefault<uint64_t, StatType::kCoreType, 0ULL>
+            core_type_stats_accessor)
         : XplaneEventMutator(nullptr),
           device_ordinal_(device_ordinal),
           context_type_metadata_(context_type_metadata),
           context_id_metadata_(context_id_metadata),
           queue_id_stats_accessor_(queue_id_stats_accessor),
-          run_id_stats_accessor_(run_id_stats_accessor) {}
+          run_id_stats_accessor_(run_id_stats_accessor),
+          core_type_stats_accessor_(core_type_stats_accessor) {}
 
     void Mutate(XEventBuilder* event_builder) override {
       CHECK(false);  // Crash OK
@@ -400,14 +431,16 @@ class TpuModuleLineMutatorFactory : public XplaneEventMutatorFactory {
       line->ForEachEvent([&](XEventBuilder event) {
         auto run_id = run_id_stats_accessor_.GetStat(&event);
         auto queue_id = queue_id_stats_accessor_.GetStat(&event);
+        auto core_type = core_type_stats_accessor_.GetStat(&event);
         if (!run_id || !queue_id) return;
         // The order of tuple <device_ordinal, queue_id, run_id> need to be
         // consistent with other kTpuLaunch types.
         std::vector<std::variant<absl::string_view, uint64_t>> required_stats;
-        required_stats.reserve(3);
+        required_stats.reserve(4);
         required_stats.emplace_back(device_ordinal_);
         required_stats.emplace_back(*queue_id);
         required_stats.emplace_back(*run_id);
+        required_stats.emplace_back(static_cast<uint64_t>(*core_type));
         int64_t context_id = absl::HashOf(required_stats);
         event.SetOrAddStatValue(*context_type_metadata_,
                                 static_cast<int64_t>(ContextType::kTpuLaunch));
@@ -422,6 +455,8 @@ class TpuModuleLineMutatorFactory : public XplaneEventMutatorFactory {
     XContextStatsAccessor<uint64_t, StatType::kQueueId>
         queue_id_stats_accessor_;
     XContextStatsAccessor<uint64_t, StatType::kRunId> run_id_stats_accessor_;
+    XContextStatsAccessorWithDefault<uint64_t, StatType::kCoreType, 0ULL>
+        core_type_stats_accessor_;
   };
 };
 
diff --git a/third_party/xla/third_party/tsl/tsl/profiler/utils/xplane_schema.cc b/third_party/xla/third_party/tsl/tsl/profiler/utils/xplane_schema.cc
index ca70bb29aa4b21..7702cc9d557484 100644
--- a/third_party/xla/third_party/tsl/tsl/profiler/utils/xplane_schema.cc
+++ b/third_party/xla/third_party/tsl/tsl/profiler/utils/xplane_schema.cc
@@ -223,6 +223,7 @@ const StatTypeMap& GetStatTypeMap() {
       {"kpi_value", kKpiValue},
       {"element_id", kElementId},
       {"parent_id", kParentId},
+      {"core_type", kCoreType},
       // XPlane semantics related.
       {"_pt", kProducerType},
       {"_ct", kConsumerType},
diff --git a/third_party/xla/third_party/tsl/tsl/profiler/utils/xplane_schema.h b/third_party/xla/third_party/tsl/tsl/profiler/utils/xplane_schema.h
index a1da54f73deaaf..46427052cca8d2 100644
--- a/third_party/xla/third_party/tsl/tsl/profiler/utils/xplane_schema.h
+++ b/third_party/xla/third_party/tsl/tsl/profiler/utils/xplane_schema.h
@@ -214,6 +214,7 @@ enum StatType {
   kKpiValue,
   kElementId,
   kParentId,
+  kCoreType,
   // XPlane semantics related.
   kProducerType,
   kConsumerType,

From c2b61a1a6ad34561ec503f69b0894c71a2d833c2 Mon Sep 17 00:00:00 2001
From: Shixin Li <shixinli@google.com>
Date: Tue, 3 Oct 2023 17:25:32 -0700
Subject: [PATCH 557/567] Rename se executable names to align with
 PjrtExecutable/PjrtLoadedExecutable.

PiperOrigin-RevId: 570542627
---
 third_party/xla/xla/pjrt/BUILD                | 12 ++--
 third_party/xla/xla/pjrt/gpu/BUILD            |  6 +-
 .../xla/xla/pjrt/gpu/se_gpu_pjrt_client.cc    | 21 +++----
 .../xla/xla/pjrt/gpu/se_gpu_pjrt_compiler.cc  |  4 +-
 .../xla/xla/pjrt/gpu/se_gpu_pjrt_compiler.h   |  2 -
 .../xla/pjrt/pjrt_stream_executor_client.cc   | 62 ++++++++++---------
 .../xla/pjrt/pjrt_stream_executor_client.h    |  6 +-
 ...table.cc => stream_executor_executable.cc} |  9 ++-
 ...cutable.h => stream_executor_executable.h} | 13 ++--
 ...proto => stream_executor_executable.proto} |  2 +-
 10 files changed, 68 insertions(+), 69 deletions(-)
 rename third_party/xla/xla/pjrt/{stream_executor_unloaded_executable.cc => stream_executor_executable.cc} (83%)
 rename third_party/xla/xla/pjrt/{stream_executor_unloaded_executable.h => stream_executor_executable.h} (85%)
 rename third_party/xla/xla/pjrt/{stream_executor_unloaded_executable.proto => stream_executor_executable.proto} (94%)

diff --git a/third_party/xla/xla/pjrt/BUILD b/third_party/xla/xla/pjrt/BUILD
index 4e2b8a59caeef6..7d7ca6d2363925 100644
--- a/third_party/xla/xla/pjrt/BUILD
+++ b/third_party/xla/xla/pjrt/BUILD
@@ -377,8 +377,8 @@ cc_library(
 )
 
 tf_proto_library(
-    name = "stream_executor_unloaded_executable_proto",
-    srcs = ["stream_executor_unloaded_executable.proto"],
+    name = "stream_executor_executable_proto",
+    srcs = ["stream_executor_executable.proto"],
     cc_api_version = 2,
     protodeps = [
         ":compile_options_proto",
@@ -388,13 +388,13 @@ tf_proto_library(
 )
 
 cc_library(
-    name = "stream_executor_unloaded_executable",
-    srcs = ["stream_executor_unloaded_executable.cc"],
-    hdrs = ["stream_executor_unloaded_executable.h"],
+    name = "stream_executor_executable",
+    srcs = ["stream_executor_executable.cc"],
+    hdrs = ["stream_executor_executable.h"],
     visibility = ["//visibility:public"],
     deps = [
         ":pjrt_executable",
-        ":stream_executor_unloaded_executable_proto_cc",
+        ":stream_executor_executable_proto_cc",
         "//xla:statusor",
         "//xla/hlo/ir:hlo",
         "//xla/service:compiler",
diff --git a/third_party/xla/xla/pjrt/gpu/BUILD b/third_party/xla/xla/pjrt/gpu/BUILD
index e6a511e7142a45..e820a3806ca6e4 100644
--- a/third_party/xla/xla/pjrt/gpu/BUILD
+++ b/third_party/xla/xla/pjrt/gpu/BUILD
@@ -53,8 +53,8 @@ cc_library(
         "//xla/pjrt:pjrt_compiler",
         "//xla/pjrt:pjrt_executable",
         "//xla/pjrt:pjrt_stream_executor_client",
-        "//xla/pjrt:stream_executor_unloaded_executable",
-        "//xla/pjrt:stream_executor_unloaded_executable_proto_cc",
+        "//xla/pjrt:stream_executor_executable",
+        "//xla/pjrt:stream_executor_executable_proto_cc",
         "//xla/pjrt:tracked_device_buffer",
         "//xla/pjrt:utils",
         "//xla/pjrt/distributed:client",
@@ -225,7 +225,7 @@ cc_library(
         "//xla/pjrt:pjrt_compiler",
         "//xla/pjrt:pjrt_executable",
         "//xla/pjrt:pjrt_stream_executor_client",
-        "//xla/pjrt:stream_executor_unloaded_executable",
+        "//xla/pjrt:stream_executor_executable",
         "//xla/pjrt:utils",
         "//xla/service:compiler",
         "//xla/service:dump",
diff --git a/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_client.cc b/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_client.cc
index 9352b830152df5..8f5511ff9bfc6a 100644
--- a/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_client.cc
+++ b/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_client.cc
@@ -44,7 +44,7 @@ limitations under the License.
 #include "xla/pjrt/pjrt_compiler.h"
 #include "xla/pjrt/pjrt_executable.h"
 #include "xla/pjrt/pjrt_stream_executor_client.h"
-#include "xla/pjrt/stream_executor_unloaded_executable.h"
+#include "xla/pjrt/stream_executor_executable.h"
 #include "xla/pjrt/tracked_device_buffer.h"
 #include "xla/pjrt/utils.h"
 #include "xla/service/compiler.h"
@@ -66,7 +66,7 @@ limitations under the License.
 #include "third_party/gpus/cuda/include/cuda_runtime_api.h"
 #include "xla/pjrt/compile_options.pb.h"
 #include "xla/pjrt/gpu/nccl_id_store.h"
-#include "xla/pjrt/stream_executor_unloaded_executable.pb.h"
+#include "xla/pjrt/stream_executor_executable.pb.h"
 #include "xla/service/gpu/gpu_compiler.h"
 #include "xla/stream_executor/gpu/gpu_cudamallocasync_allocator.h"
 #include "xla/xla.pb.h"
@@ -76,7 +76,7 @@ limitations under the License.
 #include "rocm/rocm_config.h"
 #include "xla/pjrt/compile_options.pb.h"  // NOLINT(build/include)
 #include "xla/pjrt/gpu/nccl_id_store.h"  // NOLINT(build/include)
-#include "xla/pjrt/stream_executor_unloaded_executable.pb.h"  // NOLINT(build/include)
+#include "xla/pjrt/stream_executor_executable.pb.h"  // NOLINT(build/include)
 #include "xla/service/gpu/gpu_compiler.h"  // NOLINT(build/include)
 #include "xla/xla.pb.h"  // NOLINT(build/include)
 #endif  // TENSORFLOW_USE_ROCM
@@ -536,8 +536,8 @@ PjRtFuture<absl::Status> StreamExecutorGpuClient::CopyRawSubBufferToHost(
 
 namespace {
 #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
-StatusOr<std::unique_ptr<StreamExecutorUnloadedExecutable>> FromProto(
-    const StreamExecutorUnloadedExecutableProto& proto) {
+StatusOr<std::unique_ptr<StreamExecutorExecutable>> FromProto(
+    const StreamExecutorExecutableProto& proto) {
   TF_ASSIGN_OR_RETURN(CompileOptions compile_options,
                       CompileOptions::FromProto(proto.compile_options()));
   std::vector<std::unique_ptr<xla::AotCompilationResult>>
@@ -548,7 +548,7 @@ StatusOr<std::unique_ptr<StreamExecutorUnloadedExecutable>> FromProto(
         gpu::GpuXlaRuntimeAotCompilationResult::FromString(executable));
     deserialized_aot_executables.push_back(std::move(deserialized));
   }
-  return std::make_unique<StreamExecutorUnloadedExecutable>(
+  return std::make_unique<StreamExecutorExecutable>(
       compile_options, std::move(deserialized_aot_executables),
       proto.num_replicas(), proto.num_partitions(), proto.name());
 }
@@ -560,7 +560,7 @@ StreamExecutorGpuClient::LoadSerializedExecutable(
     absl::string_view serialized, std::optional<CompileOptions> options,
     const LoadOptions& load_options) {
 #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
-  StreamExecutorUnloadedExecutableProto proto;
+  StreamExecutorExecutableProto proto;
   if (serialized.size() > std::numeric_limits<int>::max()) {
     return Internal(
         "PjRtStreamExecutorClient::DeserializeExecutable proto too large "
@@ -595,9 +595,8 @@ std::vector<std::unique_ptr<PjRtStreamExecutorDevice>> BuildLocalDevices(
 StatusOr<std::unique_ptr<PjRtLoadedExecutable>> StreamExecutorGpuClient::Load(
     std::unique_ptr<PjRtExecutable> executable,
     const LoadOptions& load_options) {
-  auto se_executable =
-      absl::WrapUnique(tensorflow::down_cast<StreamExecutorUnloadedExecutable*>(
-          executable.release()));
+  auto se_executable = absl::WrapUnique(
+      tensorflow::down_cast<StreamExecutorExecutable*>(executable.release()));
 
   CompileOptions compile_options = se_executable->compile_options();
   CompileOptions input_options = compile_options;
@@ -619,7 +618,7 @@ StatusOr<std::unique_ptr<PjRtLoadedExecutable>> StreamExecutorGpuClient::Load(
   }
   bool parameter_is_tupled_arguments =
       compile_options.parameter_is_tupled_arguments;
-  auto ret = std::make_unique<PjRtStreamExecutorExecutable>(
+  auto ret = std::make_unique<PjRtStreamExecutorLoadedExecutable>(
       std::move(local_executables), parameter_is_tupled_arguments,
       std::move(extras.device_assignment), std::move(input_options),
       std::move(extras.addressable_device_logical_ids),
diff --git a/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_compiler.cc b/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_compiler.cc
index e3457f350c8b7f..eb3bf173e68a19 100644
--- a/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_compiler.cc
+++ b/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_compiler.cc
@@ -32,7 +32,7 @@ limitations under the License.
 #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 #include "xla/client/local_client.h"
 #include "xla/pjrt/mlir_to_hlo.h"
-#include "xla/pjrt/stream_executor_unloaded_executable.h"
+#include "xla/pjrt/stream_executor_executable.h"
 #include "xla/pjrt/utils.h"
 #include "xla/service/dump.h"
 #include "xla/service/gpu/executable.pb.h"
@@ -139,7 +139,7 @@ absl::StatusOr<std::unique_ptr<PjRtExecutable>> AotCompile(
       std::vector<std::unique_ptr<AotCompilationResult>> aot_results,
       gpu_compiler.CompileAheadOfTime(std::move(unique_module_group),
                                       aot_options));
-  return std::make_unique<StreamExecutorUnloadedExecutable>(
+  return std::make_unique<StreamExecutorExecutable>(
       std::move(input_options), std::move(aot_results), num_replicas,
       num_partitions, name);
 }
diff --git a/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_compiler.h b/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_compiler.h
index 981748429fa48e..a9cbc28f20b393 100644
--- a/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_compiler.h
+++ b/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_compiler.h
@@ -27,8 +27,6 @@ limitations under the License.
 
 namespace xla {
 // Implements the interfaces that are needed for the registered compiler.
-// TODO(b/285385306): current implementation purely relies on the `client`
-// Compile() functions and ignores the `topology` parameter.
 class StreamExecutorGpuCompiler : public PjRtCompiler {
  public:
   // If `gpu_target_config` is nullopt, the compiler has to compile with device,
diff --git a/third_party/xla/xla/pjrt/pjrt_stream_executor_client.cc b/third_party/xla/xla/pjrt/pjrt_stream_executor_client.cc
index c7e097fe25764d..338693e7c14b30 100644
--- a/third_party/xla/xla/pjrt/pjrt_stream_executor_client.cc
+++ b/third_party/xla/xla/pjrt/pjrt_stream_executor_client.cc
@@ -2046,7 +2046,7 @@ bool IsAllZeros(const DeviceAssignment& assignment) {
 
 }  // namespace
 
-PjRtStreamExecutorExecutable::PjRtStreamExecutorExecutable(
+PjRtStreamExecutorLoadedExecutable::PjRtStreamExecutorLoadedExecutable(
     std::vector<std::unique_ptr<LocalExecutable>> executables,
     bool parameter_is_tupled_arguments,
     std::shared_ptr<DeviceAssignment> device_assignment,
@@ -2081,12 +2081,12 @@ PjRtStreamExecutorExecutable::PjRtStreamExecutorExecutable(
   int num_partitions;
   if (device_assignment_ == nullptr) {
     // This must go after `executables_` is initialized.
-    VLOG(3) << "PjRtStreamExecutorExecutable portable single-core";
+    VLOG(3) << "PjRtStreamExecutorLoadedExecutable portable single-core";
     num_partitions = 1;
     CHECK(addressable_devices_.empty());
   } else {
     // This must go after `executables_` is initialized.
-    VLOG(3) << "PjRtStreamExecutorExecutable device_assignment:\n"
+    VLOG(3) << "PjRtStreamExecutorLoadedExecutable device_assignment:\n"
             << device_assignment_->ToString();
     CHECK_GE(addressable_devices_.size(), 1) << device_assignment_->ToString();
 
@@ -2117,7 +2117,7 @@ PjRtStreamExecutorExecutable::PjRtStreamExecutorExecutable(
   }
 }
 
-Status PjRtStreamExecutorExecutable::SetUpDonation(bool tuple_inputs) {
+Status PjRtStreamExecutorLoadedExecutable::SetUpDonation(bool tuple_inputs) {
   parameters_that_must_be_donated_.reserve(executables_.size());
   for (auto& executable : executables_) {
     TF_ASSIGN_OR_RETURN(std::vector<int> parameters_to_donate,
@@ -2129,7 +2129,7 @@ Status PjRtStreamExecutorExecutable::SetUpDonation(bool tuple_inputs) {
   return OkStatus();
 }
 
-absl::string_view PjRtStreamExecutorExecutable::name() const {
+absl::string_view PjRtStreamExecutorLoadedExecutable::name() const {
   Executable* executable = executables_[0]->executable();
   if (executable->has_module()) {
     return executable->module().name();
@@ -2138,13 +2138,14 @@ absl::string_view PjRtStreamExecutorExecutable::name() const {
   }
 }
 
-absl::Span<int const> PjRtStreamExecutorExecutable::ParametersThatMustBeDonated(
+absl::Span<int const>
+PjRtStreamExecutorLoadedExecutable::ParametersThatMustBeDonated(
     int executable_idx) const {
   return parameters_that_must_be_donated_[executable_idx];
 }
 
 StatusOr<std::vector<ExecutionInput>>
-PjRtStreamExecutorExecutable::MakeExecutionInputsAndWaitForEvents(
+PjRtStreamExecutorLoadedExecutable::MakeExecutionInputsAndWaitForEvents(
     int device_ordinal, const ExecuteOptions& options,
     absl::Span<const Shape> executable_parameter_shapes,
     absl::Span<PjRtBuffer* const> argument_handles,
@@ -2258,7 +2259,8 @@ static SendDeviceMemoryFunction ConvertSendCallbacksToSendFunction(
     thread_pool->Schedule([done_event, stream, src, channel_id, shape, send] {
       tsl::profiler::TraceMe trace([&] {
         return tsl::profiler::TraceMeEncode(
-            "PjRtStreamExecutorExecutable::Send", {{"channel_id", channel_id}});
+            "PjRtStreamExecutorLoadedExecutable::Send",
+            {{"channel_id", channel_id}});
       });
 
       // Allocate chunk on the host for copying data from device.
@@ -2395,8 +2397,9 @@ static RecvDeviceMemoryFunction ConvertRecvCallbacksToRecvFunction(
             << " (shape=" << shape.ToString() << ")";
 
     tsl::profiler::TraceMe trace([&] {
-      return tsl::profiler::TraceMeEncode("PjRtStreamExecutorExecutable::Recv",
-                                          {{"channel_id", channel_id}});
+      return tsl::profiler::TraceMeEncode(
+          "PjRtStreamExecutorLoadedExecutable::Recv",
+          {{"channel_id", channel_id}});
     });
 
     const RecvCallback* recv = FindCallback(channel_id, callbacks);
@@ -2426,7 +2429,8 @@ static RecvDeviceMemoryFunction ConvertRecvCallbacksToRecvFunction(
 // converted on success.
 // When `options` has non-zero `launch_id`, use `launch_id` instead of `run_id`
 // to initialize `run_options`.
-StatusOr<ScopedShapedBuffer> PjRtStreamExecutorExecutable::EnqueueExecution(
+StatusOr<ScopedShapedBuffer>
+PjRtStreamExecutorLoadedExecutable::EnqueueExecution(
     absl::Span<PjRtBuffer* const> argument_handles, int replica, int partition,
     int executable_idx, const RunId& run_id, const ExecuteOptions& options,
     PjRtDevice* device,
@@ -2438,7 +2442,7 @@ StatusOr<ScopedShapedBuffer> PjRtStreamExecutorExecutable::EnqueueExecution(
                            ->device_ordinal();
   LocalDeviceState* device_state = &(client_->device_state(device_ordinal));
   tsl::profiler::TraceMeConsumer activity(
-      "PjRtStreamExecutorExecutable::EnqueueExecution",
+      "PjRtStreamExecutorLoadedExecutable::EnqueueExecution",
       tsl::profiler::ContextType::kPjRt, run_id.ToInt());
   VLOG(3) << "Replica " << replica << ", partition " << partition
           << " mapped to device ordinal for execution: " << device_ordinal;
@@ -2630,7 +2634,7 @@ StatusOr<ScopedShapedBuffer> PjRtStreamExecutorExecutable::EnqueueExecution(
 }
 
 std::vector<std::unique_ptr<PjRtBuffer>>
-PjRtStreamExecutorExecutable::MakeOutputBuffers(
+PjRtStreamExecutorLoadedExecutable::MakeOutputBuffers(
     int device_ordinal, const ExecuteOptions& options,
     ScopedShapedBuffer result_buffer,
     std::shared_ptr<BufferSequencingEvent> definition_event, PjRtDevice* device,
@@ -2669,7 +2673,7 @@ PjRtStreamExecutorExecutable::MakeOutputBuffers(
 }
 
 StatusOr<PjRtLoadedExecutable::Result>
-PjRtStreamExecutorExecutable::ExecuteHelper(
+PjRtStreamExecutorLoadedExecutable::ExecuteHelper(
     absl::Span<PjRtBuffer* const> argument_handles, int replica, int partition,
     const RunId& run_id, const ExecuteOptions& options, bool fill_future,
     PjRtDevice* device) const {
@@ -2693,7 +2697,8 @@ PjRtStreamExecutorExecutable::ExecuteHelper(
   int device_ordinal = tensorflow::down_cast<PjRtStreamExecutorDevice*>(device)
                            ->local_device_state()
                            ->device_ordinal();
-  tsl::profiler::TraceMe traceme("PjRtStreamExecutorExecutable::ExecuteHelper");
+  tsl::profiler::TraceMe traceme(
+      "PjRtStreamExecutorLoadedExecutable::ExecuteHelper");
   VLOG(1) << "Replica " << replica << ", partition " << partition
           << " mapped to device ordinal for execution: " << device_ordinal;
 
@@ -2773,7 +2778,7 @@ PjRtStreamExecutorExecutable::ExecuteHelper(
 }
 
 StatusOr<std::vector<std::vector<std::unique_ptr<PjRtBuffer>>>>
-PjRtStreamExecutorExecutable::Execute(
+PjRtStreamExecutorLoadedExecutable::Execute(
     absl::Span<const std::vector<PjRtBuffer*>> argument_handles,
     const ExecuteOptions& options,
     std::optional<std::vector<PjRtFuture<Status>>>& returned_futures) {
@@ -2783,7 +2788,7 @@ PjRtStreamExecutorExecutable::Execute(
 
   RunId run_id;
   tsl::profiler::TraceMeProducer activity(
-      "PjRtStreamExecutorExecutable::Execute",
+      "PjRtStreamExecutorLoadedExecutable::Execute",
       tsl::profiler::ContextType::kPjRt, run_id.ToInt());
 
   const int num_addressable_devices = addressable_devices_.size();
@@ -2898,7 +2903,7 @@ PjRtStreamExecutorExecutable::Execute(
 }
 
 StatusOr<std::vector<std::unique_ptr<PjRtBuffer>>>
-PjRtStreamExecutorExecutable::ExecuteSharded(
+PjRtStreamExecutorLoadedExecutable::ExecuteSharded(
     absl::Span<PjRtBuffer* const> argument_handles, PjRtDevice* device,
     const ExecuteOptions& options,
     std::optional<PjRtFuture<Status>>& returned_future, bool fill_future) {
@@ -2927,7 +2932,7 @@ PjRtStreamExecutorExecutable::ExecuteSharded(
 }
 
 StatusOr<std::vector<std::unique_ptr<PjRtBuffer>>>
-PjRtStreamExecutorExecutable::ExecutePortable(
+PjRtStreamExecutorLoadedExecutable::ExecutePortable(
     absl::Span<PjRtBuffer* const> argument_handles, PjRtDevice* device,
     const ExecuteOptions& options,
     std::optional<PjRtFuture<Status>>& returned_future, bool fill_future) {
@@ -2954,7 +2959,7 @@ PjRtStreamExecutorExecutable::ExecutePortable(
 }
 
 StatusOr<std::vector<std::shared_ptr<HloModule>>>
-PjRtStreamExecutorExecutable::GetHloModules() const {
+PjRtStreamExecutorLoadedExecutable::GetHloModules() const {
   std::vector<std::shared_ptr<HloModule>> modules;
   modules.reserve(executables().size());
   for (const auto& local_exec : executables()) {
@@ -2967,7 +2972,7 @@ PjRtStreamExecutorExecutable::GetHloModules() const {
 }
 
 StatusOr<std::vector<std::vector<absl::string_view>>>
-PjRtStreamExecutorExecutable::GetOutputMemoryKinds() const {
+PjRtStreamExecutorLoadedExecutable::GetOutputMemoryKinds() const {
   return Unimplemented("GetOutputMemoryKinds is not supported.");
 }
 
@@ -2976,7 +2981,7 @@ PjRtStreamExecutorClient::GetExecutableExtras(CompileOptions* options) {
   ExecutableExtras extras;
   std::shared_ptr<DeviceAssignment>& device_assignment =
       extras.device_assignment;
-  std::vector<PjRtStreamExecutorExecutable::LogicalDeviceIds>&
+  std::vector<PjRtStreamExecutorLoadedExecutable::LogicalDeviceIds>&
       addressable_device_logical_ids = extras.addressable_device_logical_ids;
   std::vector<PjRtDevice*>& addressable_devices = extras.addressable_devices;
 
@@ -3062,7 +3067,7 @@ PjRtStreamExecutorClient::Compile(const XlaComputation& computation,
   TF_ASSIGN_OR_RETURN(ExecutableExtras extras, GetExecutableExtras(&options));
   std::shared_ptr<DeviceAssignment>& device_assignment =
       extras.device_assignment;
-  std::vector<PjRtStreamExecutorExecutable::LogicalDeviceIds>&
+  std::vector<PjRtStreamExecutorLoadedExecutable::LogicalDeviceIds>&
       addressable_device_logical_ids = extras.addressable_device_logical_ids;
   std::vector<PjRtDevice*>& addressable_devices = extras.addressable_devices;
 
@@ -3082,7 +3087,7 @@ PjRtStreamExecutorClient::Compile(const XlaComputation& computation,
       client()->Compile(computation, argument_layout_pointers,
                         options.executable_build_options));
 
-  auto executable = std::make_unique<PjRtStreamExecutorExecutable>(
+  auto executable = std::make_unique<PjRtStreamExecutorLoadedExecutable>(
       std::move(local_executables), options.parameter_is_tupled_arguments,
       std::move(device_assignment), std::move(input_options),
       std::move(addressable_device_logical_ids), std::move(addressable_devices),
@@ -3106,8 +3111,9 @@ PjRtStreamExecutorClient::Compile(mlir::ModuleOp module,
 
 StatusOr<std::string> PjRtStreamExecutorClient::SerializeExecutable(
     const PjRtLoadedExecutable& executable) const {
-  const PjRtStreamExecutorExecutable* se_executable =
-      tensorflow::down_cast<const PjRtStreamExecutorExecutable*>(&executable);
+  const PjRtStreamExecutorLoadedExecutable* se_executable =
+      tensorflow::down_cast<const PjRtStreamExecutorLoadedExecutable*>(
+          &executable);
 
   absl::Span<const std::shared_ptr<LocalExecutable>> local_executables =
       se_executable->executables();
@@ -3169,7 +3175,7 @@ PjRtStreamExecutorClient::DeserializeExecutable(
                       GetExecutableExtras(&compile_options));
   std::shared_ptr<DeviceAssignment>& device_assignment =
       extras.device_assignment;
-  std::vector<PjRtStreamExecutorExecutable::LogicalDeviceIds>&
+  std::vector<PjRtStreamExecutorLoadedExecutable::LogicalDeviceIds>&
       addressable_device_logical_ids = extras.addressable_device_logical_ids;
   std::vector<PjRtDevice*>& addressable_devices = extras.addressable_devices;
 
@@ -3181,7 +3187,7 @@ PjRtStreamExecutorClient::DeserializeExecutable(
   std::vector<std::unique_ptr<LocalExecutable>> local_executables;
   local_executables.push_back(std::move(loaded));
 
-  auto executable = std::make_unique<PjRtStreamExecutorExecutable>(
+  auto executable = std::make_unique<PjRtStreamExecutorLoadedExecutable>(
       std::move(local_executables),
       compile_options.parameter_is_tupled_arguments,
       std::move(device_assignment), std::move(input_options),
diff --git a/third_party/xla/xla/pjrt/pjrt_stream_executor_client.h b/third_party/xla/xla/pjrt/pjrt_stream_executor_client.h
index 14abf2e4851a41..b91269c3e3d8f9 100644
--- a/third_party/xla/xla/pjrt/pjrt_stream_executor_client.h
+++ b/third_party/xla/xla/pjrt/pjrt_stream_executor_client.h
@@ -771,9 +771,9 @@ class PjRtStreamExecutorBuffer : public PjRtBuffer {
 
 // Wraps one or more XLA LocalExecutables (one per partition, as specified by
 // the build options).
-class PjRtStreamExecutorExecutable : public PjRtLoadedExecutable {
+class PjRtStreamExecutorLoadedExecutable : public PjRtLoadedExecutable {
  public:
-  PjRtStreamExecutorExecutable(
+  PjRtStreamExecutorLoadedExecutable(
       std::vector<std::unique_ptr<LocalExecutable>> executables,
       bool parameter_is_tupled_arguments,
       std::shared_ptr<DeviceAssignment> device_assignment,
@@ -782,7 +782,7 @@ class PjRtStreamExecutorExecutable : public PjRtLoadedExecutable {
       std::vector<PjRtDevice*> addressable_devices,
       PjRtStreamExecutorClient* client);
 
-  ~PjRtStreamExecutorExecutable() override = default;
+  ~PjRtStreamExecutorLoadedExecutable() override = default;
 
   PjRtStreamExecutorClient* client() const override { return client_; }
 
diff --git a/third_party/xla/xla/pjrt/stream_executor_unloaded_executable.cc b/third_party/xla/xla/pjrt/stream_executor_executable.cc
similarity index 83%
rename from third_party/xla/xla/pjrt/stream_executor_unloaded_executable.cc
rename to third_party/xla/xla/pjrt/stream_executor_executable.cc
index e2c15ad314d58b..f80a1f26c5b069 100644
--- a/third_party/xla/xla/pjrt/stream_executor_unloaded_executable.cc
+++ b/third_party/xla/xla/pjrt/stream_executor_executable.cc
@@ -13,20 +13,19 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "xla/pjrt/stream_executor_unloaded_executable.h"
+#include "xla/pjrt/stream_executor_executable.h"
 
 #include <memory>
 #include <string>
 
-#include "xla/pjrt/stream_executor_unloaded_executable.pb.h"
+#include "xla/pjrt/stream_executor_executable.pb.h"
 #include "xla/service/compiler.h"
 #include "xla/statusor.h"
 #include "tsl/platform/statusor.h"
 
 namespace xla {
-StatusOr<std::string> StreamExecutorUnloadedExecutable::SerializeExecutable()
-    const {
-  StreamExecutorUnloadedExecutableProto proto;
+StatusOr<std::string> StreamExecutorExecutable::SerializeExecutable() const {
+  StreamExecutorExecutableProto proto;
   TF_ASSIGN_OR_RETURN(*proto.mutable_compile_options(),
                       compile_options_.ToProto());
   for (const std::unique_ptr<xla::AotCompilationResult>& aot_executable :
diff --git a/third_party/xla/xla/pjrt/stream_executor_unloaded_executable.h b/third_party/xla/xla/pjrt/stream_executor_executable.h
similarity index 85%
rename from third_party/xla/xla/pjrt/stream_executor_unloaded_executable.h
rename to third_party/xla/xla/pjrt/stream_executor_executable.h
index a44660e9579b68..e9fe0b319558ef 100644
--- a/third_party/xla/xla/pjrt/stream_executor_unloaded_executable.h
+++ b/third_party/xla/xla/pjrt/stream_executor_executable.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef XLA_PJRT_STREAM_EXECUTOR_UNLOADED_EXECUTABLE_H_
-#define XLA_PJRT_STREAM_EXECUTOR_UNLOADED_EXECUTABLE_H_
+#ifndef XLA_PJRT_STREAM_EXECUTOR_EXECUTABLE_H_
+#define XLA_PJRT_STREAM_EXECUTOR_EXECUTABLE_H_
 
 #include "absl/status/status.h"
 #include "xla/hlo/ir/hlo_module.h"
@@ -22,12 +22,9 @@ limitations under the License.
 #include "xla/service/compiler.h"
 
 namespace xla {
-// TODO(b/300657649): Rename existing PjRtStreamExecutorExecutable to
-// PjRtStreamExecutorLoadedExecutable, and this one to
-// PjRtStreamExecutorExecutable.
-class StreamExecutorUnloadedExecutable : public PjRtExecutable {
+class StreamExecutorExecutable : public PjRtExecutable {
  public:
-  StreamExecutorUnloadedExecutable(
+  StreamExecutorExecutable(
       const CompileOptions& compile_options,
       std::vector<std::unique_ptr<xla::AotCompilationResult>> executables,
       int num_replicas, int num_partitions, absl::string_view name)
@@ -75,4 +72,4 @@ class StreamExecutorUnloadedExecutable : public PjRtExecutable {
 };
 }  // namespace xla
 
-#endif  // XLA_PJRT_STREAM_EXECUTOR_UNLOADED_EXECUTABLE_H_
+#endif  // XLA_PJRT_STREAM_EXECUTOR_EXECUTABLE_H_
diff --git a/third_party/xla/xla/pjrt/stream_executor_unloaded_executable.proto b/third_party/xla/xla/pjrt/stream_executor_executable.proto
similarity index 94%
rename from third_party/xla/xla/pjrt/stream_executor_unloaded_executable.proto
rename to third_party/xla/xla/pjrt/stream_executor_executable.proto
index aeade5edc7cb7b..20cc49de21252a 100644
--- a/third_party/xla/xla/pjrt/stream_executor_unloaded_executable.proto
+++ b/third_party/xla/xla/pjrt/stream_executor_executable.proto
@@ -19,7 +19,7 @@ package xla;
 
 import "xla/pjrt/compile_options.proto";
 
-message StreamExecutorUnloadedExecutableProto {
+message StreamExecutorExecutableProto {
   CompileOptionsProto compile_options = 1;
   repeated bytes executables = 2;
   int32 num_replicas = 3;

From db6c71f6c7c7160674c25eca628a02242e830188 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 3 Oct 2023 19:25:04 -0700
Subject: [PATCH 558/567] Fix "SAME" padding check in Conv2DTranspose
 legalization

PiperOrigin-RevId: 570562097
---
 .../lite/stablehlo/tests/legalize_hlo.mlir    | 28 ++++++++++
 .../lite/stablehlo/transforms/legalize_hlo.cc | 51 ++++++++++++++-----
 2 files changed, 65 insertions(+), 14 deletions(-)

diff --git a/tensorflow/compiler/mlir/lite/stablehlo/tests/legalize_hlo.mlir b/tensorflow/compiler/mlir/lite/stablehlo/tests/legalize_hlo.mlir
index 195f539319f3e8..04bd639a4a5d5d 100644
--- a/tensorflow/compiler/mlir/lite/stablehlo/tests/legalize_hlo.mlir
+++ b/tensorflow/compiler/mlir/lite/stablehlo/tests/legalize_hlo.mlir
@@ -2743,6 +2743,34 @@ func.func @convert_conv2d_resize_perferred(%arg0: tensor<1x56x1248x16xf32>, %arg
   func.return %0 : tensor<1x111x1248x16xf32>
 }
 
+// CHECK-LABEL:   func @convert_conv2d_back_prop_input_same_pad(
+// CHECK-SAME:                         %[[VAL_0:.*]]: tensor<1x256x256x2xf32>,
+// CHECK-SAME:                         %[[VAL_1:.*]]: tensor<4x4x2x2xf32>) -> tensor<1x512x512x2xf32> {
+// CHECK:           %[[VAL_3:.*]] = "tf.Const"() {value = dense<[0, 1]> : tensor<2xi64>} : () -> tensor<2xi64>
+// CHECK:           %[[VAL_4:.*]] = "tf.ReverseV2"(%[[VAL_1]], %[[VAL_3]]) : (tensor<4x4x2x2xf32>, tensor<2xi64>) -> tensor<4x4x2x2xf32>
+// CHECK:           %[[VAL_2:.*]] = "tf.Const"() {value = dense<[1, 512, 512, 2]> : tensor<4xi32>} : () -> tensor<4xi32>
+// CHECK:           %[[VAL_5:.*]] = "tf.Conv2DBackpropInput"(%[[VAL_2]], %[[VAL_4]], %[[VAL_0]]) {data_format = "NHWC", dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "SAME", strides = [1, 2, 2, 1], use_cudnn_on_gpu = true} : (tensor<4xi32>, tensor<4x4x2x2xf32>, tensor<1x256x256x2xf32>) -> tensor<1x512x512x2xf32>
+// CHECK:           return %[[VAL_5]] : tensor<1x512x512x2xf32>
+// CHECK:         }
+func.func @convert_conv2d_back_prop_input_same_pad(%arg0: tensor<1x256x256x2xf32>, %arg1: tensor<4x4x2x2xf32>) -> tensor<1x512x512x2xf32> {
+  %0 = "mhlo.convolution"(%arg0, %arg1) {batch_group_count = 1 : i64,
+    dimension_numbers = #mhlo.conv<[b, 0, 1, f]x[0, 1, o, i]->[b, 0, 1, f]>, feature_group_count = 1 : i64, lhs_dilation = dense<2> : tensor<2xi64>, padding = dense<[[2, 2], [2, 2]]> : tensor<2x2xi64>,
+    precision_config = [#mhlo<precision DEFAULT>, #mhlo<precision DEFAULT>], rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>
+  } : (tensor<1x256x256x2xf32>, tensor<4x4x2x2xf32>) -> tensor<1x512x512x2xf32>
+  func.return %0 : tensor<1x512x512x2xf32>
+}
+
+// CHECK-LABEL:   func @convert_conv2d_back_prop_input_negative_pad(
+// CHECK-NOT:       "tf.Conv2DBackpropInput"
+func.func @convert_conv2d_back_prop_input_negative_pad(%arg0: tensor<1x256x256x2xf32>, %arg1: tensor<4x4x2x2xf32>) -> tensor<1x504x504x2xf32> {
+  %0 = "mhlo.convolution"(%arg0, %arg1) {batch_group_count = 1 : i64,
+    dimension_numbers = #mhlo.conv<[b, 0, 1, f]x[0, 1, o, i]->[b, 0, 1, f]>, feature_group_count = 1 : i64, lhs_dilation = dense<2> : tensor<2xi64>, padding = dense<[[-2, -2], [-2, -2]]> : tensor<2x2xi64>,
+    precision_config = [#mhlo<precision DEFAULT>, #mhlo<precision DEFAULT>], rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>
+  } : (tensor<1x256x256x2xf32>, tensor<4x4x2x2xf32>) -> tensor<1x504x504x2xf32>
+  func.return %0 : tensor<1x504x504x2xf32>
+}
+
+
 // CHECK-LABEL:   func @convert_conv2d_back_prop_input(
 // CHECK-SAME:                         %[[VAL_0:.*]]: tensor<8x4x4x32xf32>,
 // CHECK-SAME:                         %[[VAL_1:.*]]: tensor<3x3x64x32xf32>) -> tensor<8x8x8x64xf32> {
diff --git a/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo.cc b/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo.cc
index a0a58997f1ac6e..4f877c23262c08 100644
--- a/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo.cc
+++ b/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo.cc
@@ -802,24 +802,41 @@ class ConvertNonTrivialConvOp
   };
 
  private:
-  bool IsSamePadding(mhlo::ConvolutionOp conv_op, int num_spatial_dims,
+  bool IsSamePadding(mhlo::ConvolutionOp conv_op, size_t num_spatial_dims,
                      ArrayRef<int64_t> strides) const {
-    for (auto i : llvm::seq<int>(0, num_spatial_dims)) {
-      int stride_dim = i + 1;
-      int stride = strides[stride_dim];
-      int output_dim =
-          conv_op.getDimensionNumbers().getOutputSpatialDimensions().data()[i];
-      int input_dim =
-          conv_op.getDimensionNumbers().getInputSpatialDimensions().data()[i];
-      int output_size =
-          conv_op.getType().cast<ShapedType>().getDimSize(output_dim);
-      int input_size =
-          conv_op.getLhs().getType().cast<ShapedType>().getDimSize(input_dim);
-      if (input_size != (output_size + stride - 1) / stride) {
+    auto dnums = conv_op.getDimensionNumbers();
+    SmallVector<int64_t, 4> padding(
+        conv_op.getPadding().value().getValues<int64_t>().begin(),
+        conv_op.getPadding().value().getValues<int64_t>().end());
+    // The newly added spatial dimension requires zero left and right padding.
+    ArrayRef<int64_t> input_spatial_dims = dnums.getInputSpatialDimensions();
+    ArrayRef<int64_t> output_spatial_dims = dnums.getOutputSpatialDimensions();
+    for (size_t i = 0; i < num_spatial_dims; ++i) {
+      // In some cases the total padding is odd, so we have 1 leftover, which is
+      // why below we check pad_delta > 1.
+      int64_t pad_delta = std::abs(padding[2 * i] - padding[2 * i + 1]);
+      if (pad_delta > 1) {
+        return false;
+      }
+      int64_t stride = strides[i + 1];
+      int64_t input_size =
+          conv_op.getLhs().getType().cast<ShapedType>().getDimSize(
+              input_spatial_dims[i]);
+      int64_t output_size = conv_op.getType().cast<ShapedType>().getDimSize(
+          output_spatial_dims[i]);
+      // The reason for the below check is as follows:
+      // When computing the output, we have the following relation between
+      // o - output dim size, i - input dim size, s - stride, P - total pads
+      // o = (i-k+1) + (s-1)(i-1) + P
+      // Where the first term is the kernel applications on the input,
+      // the second term is the additional applications from the stride
+      // and P is a term that captures the total padding. After expanding we get
+      // o = si - k - s + 2 + P
+      // Here JAX sets P to cancel -k-s+2, leading to the expression below
+      if (output_size != input_size * stride) {
         return false;
       }
     }
-
     return true;
   }
 
@@ -873,6 +890,12 @@ class ConvertNonTrivialConvOp
       return rewriter.notifyMatchFailure(conv_op,
                                          "doesn't support more than 2D");
 
+    if (llvm::any_of(conv_op.getPadding().value().getValues<int64_t>(),
+                     [](int64_t v) { return v < 0; })) {
+      return rewriter.notifyMatchFailure(conv_op,
+                                         "doesn't support negative pads");
+    }
+
     // Checks kernel dimensions.
     if (!isKernelFormatHWIO(dnums) && !isKernelFormatHWOI(dnums))
       return rewriter.notifyMatchFailure(

From b86a33b596ecf16babbac9762adc8223b78238bc Mon Sep 17 00:00:00 2001
From: Eugene Zhulenev <ezhulenev@google.com>
Date: Tue, 3 Oct 2023 19:42:29 -0700
Subject: [PATCH 559/567] [stream_executor] Add initial support for command
 buffer mode

https://github.com/openxla/xla/issues/5857

PiperOrigin-RevId: 570564392
---
 .../xla/xla/stream_executor/command_buffer.cc | 18 ++++++----
 .../xla/xla/stream_executor/command_buffer.h  | 29 +++++++++------
 .../xla/stream_executor/cuda/cuda_executor.cc |  9 +++--
 .../stream_executor/gpu/gpu_command_buffer.cc | 35 ++++++++++++-------
 .../stream_executor/gpu/gpu_command_buffer.h  |  8 ++++-
 .../xla/stream_executor/gpu/gpu_executor.h    |  2 +-
 .../stream_executor_internal.h                |  5 ++-
 7 files changed, 71 insertions(+), 35 deletions(-)

diff --git a/third_party/xla/xla/stream_executor/command_buffer.cc b/third_party/xla/xla/stream_executor/command_buffer.cc
index 1a77ebb11c2faa..1a4c1451b93dc1 100644
--- a/third_party/xla/xla/stream_executor/command_buffer.cc
+++ b/third_party/xla/xla/stream_executor/command_buffer.cc
@@ -32,16 +32,18 @@ limitations under the License.
 namespace stream_executor {
 
 /*static*/ tsl::StatusOr<CommandBuffer> CommandBuffer::Create(
-    StreamExecutor* executor) {
+    StreamExecutor* executor, Mode mode) {
   TF_ASSIGN_OR_RETURN(
       std::unique_ptr<internal::CommandBufferInterface> command_buffer,
-      executor->implementation()->GetCommandBufferImplementation());
-  return tsl::StatusOr<CommandBuffer>(std::move(command_buffer));
+      executor->implementation()->GetCommandBufferImplementation(mode));
+
+  CommandBuffer cmd(std::move(command_buffer));
+  return cmd;
 }
 
 /*static*/ tsl::StatusOr<CommandBuffer> CommandBuffer::Trace(
-    StreamExecutor* executor,
-    absl::AnyInvocable<tsl::Status(Stream*)> function) {
+    StreamExecutor* executor, absl::AnyInvocable<tsl::Status(Stream*)> function,
+    Mode mode) {
   Stream stream(executor);
 
   // TODO(ezhulenev): Keep a dedicated stream for command buffer tracing in the
@@ -54,7 +56,7 @@ namespace stream_executor {
 
   // Prepare an empty command buffer instance.
   TF_ASSIGN_OR_RETURN(CommandBuffer command_buffer,
-                      CommandBuffer::Create(executor));
+                      CommandBuffer::Create(executor, mode));
 
   // Trace and finalize the command buffer.
   TF_RETURN_IF_ERROR(command_buffer.implementation()->Trace(
@@ -81,6 +83,10 @@ tsl::Status CommandBuffer::MemcpyDeviceToDevice(DeviceMemoryBase* dst,
   return implementation_->MemcpyDeviceToDevice(dst, src, size);
 }
 
+CommandBuffer::Mode CommandBuffer::mode() const {
+  return implementation_->mode();
+}
+
 tsl::Status CommandBuffer::Finalize() { return implementation_->Finalize(); }
 
 }  // namespace stream_executor
diff --git a/third_party/xla/xla/stream_executor/command_buffer.h b/third_party/xla/xla/stream_executor/command_buffer.h
index 31361638d52d4a..fe27156cae4178 100644
--- a/third_party/xla/xla/stream_executor/command_buffer.h
+++ b/third_party/xla/xla/stream_executor/command_buffer.h
@@ -47,20 +47,23 @@ class CommandBufferInterface;
 // buffers allow to amortize the cost of launching "work" on device by building
 // it on the host ahead of time without expensive interaction with underlying
 // device.
-//
-// TODO(ezhulenev): Add a concept of a "nested" command buffer which can't be
-// submitted on its own, but has to be recorded into the parent command buffer.
-// For CUDA backend nested command buffers will never be instantiated into
-// executable graphs, but instead will only have a regular graph instance. We
-// should almost always trace into nested command buffers.
 class CommandBuffer {
  public:
+  enum class Mode {
+    // Command buffer can be submitted for execution via StreamExecutor APIs.
+    kPrimary,
+
+    // Command buffer can be executed only within a primary command buffer.
+    kNested
+  };
+
   //===--------------------------------------------------------------------===//
   // Command buffer constructors
   //===--------------------------------------------------------------------===//
 
   // Creates a new empty command buffer on the given executor.
-  static tsl::StatusOr<CommandBuffer> Create(StreamExecutor* executor);
+  static tsl::StatusOr<CommandBuffer> Create(StreamExecutor* executor,
+                                             Mode mode = Mode::kPrimary);
 
   // Creates a new command buffer on the given executor by tracing `function`
   // invocation. All StreamExecutor operations on a Stream argument will be
@@ -71,7 +74,8 @@ class CommandBuffer {
   // explicit construction APIs, e.g. when calling external libraries.
   static tsl::StatusOr<CommandBuffer> Trace(
       StreamExecutor* executor,
-      absl::AnyInvocable<tsl::Status(Stream*)> function);
+      absl::AnyInvocable<tsl::Status(Stream*)> function,
+      Mode mode = Mode::kPrimary);
 
   //===--------------------------------------------------------------------===//
   // Command buffer API
@@ -96,6 +100,9 @@ class CommandBuffer {
                      const ThreadDim& threads, const BlockDim& blocks,
                      Args... args);
 
+  // Returns command buffer execution mode.
+  Mode mode() const;
+
   internal::CommandBufferInterface* implementation() {
     return implementation_.get();
   }
@@ -104,13 +111,13 @@ class CommandBuffer {
     return implementation_.get();
   }
 
-  explicit CommandBuffer(
-      std::unique_ptr<internal::CommandBufferInterface> implementation);
-
   CommandBuffer(CommandBuffer&&) = default;
   CommandBuffer& operator=(CommandBuffer&&) = default;
 
  private:
+  explicit CommandBuffer(
+      std::unique_ptr<internal::CommandBufferInterface> implementation);
+
   std::unique_ptr<internal::CommandBufferInterface> implementation_;
 
   SE_DISALLOW_COPY_AND_ASSIGN(CommandBuffer);
diff --git a/third_party/xla/xla/stream_executor/cuda/cuda_executor.cc b/third_party/xla/xla/stream_executor/cuda/cuda_executor.cc
index 23d8a93b6bb135..a22dd41e34b968 100644
--- a/third_party/xla/xla/stream_executor/cuda/cuda_executor.cc
+++ b/third_party/xla/xla/stream_executor/cuda/cuda_executor.cc
@@ -431,6 +431,11 @@ tsl::Status GpuExecutor::Launch(Stream* stream, const ThreadDim& thread_dims,
 
 tsl::Status GpuExecutor::Submit(Stream* stream,
                                 const CommandBuffer& command_buffer) {
+  if (command_buffer.mode() != CommandBuffer::Mode::kPrimary) {
+    return absl::InvalidArgumentError(
+        "Can't submit non-primary command buffer for execution");
+  }
+
   auto exec = GpuCommandBuffer::Cast(&command_buffer)->executable();
   VLOG(3) << "Launch command buffer execuable graph " << exec
           << " on a stream: " << stream->DebugStreamPointers();
@@ -853,11 +858,11 @@ GpuExecutor::GetStreamImplementation() {
 }
 
 tsl::StatusOr<std::unique_ptr<internal::CommandBufferInterface>>
-GpuExecutor::GetCommandBufferImplementation() {
+GpuExecutor::GetCommandBufferImplementation(CommandBuffer::Mode mode) {
   VLOG(2) << "Create CUDA command buffer (CUDA graph)";
   GpuGraphHandle graph = nullptr;
   TF_RETURN_IF_ERROR(GpuDriver::CreateGraph(&graph));
-  return std::make_unique<GpuCommandBuffer>(this, graph);
+  return std::make_unique<GpuCommandBuffer>(mode, /*parent=*/this, graph);
 }
 
 void* GpuExecutor::GpuContextHack() { return context_; }
diff --git a/third_party/xla/xla/stream_executor/gpu/gpu_command_buffer.cc b/third_party/xla/xla/stream_executor/gpu/gpu_command_buffer.cc
index 11080a80fcd0ff..fa1e381c078b54 100644
--- a/third_party/xla/xla/stream_executor/gpu/gpu_command_buffer.cc
+++ b/third_party/xla/xla/stream_executor/gpu/gpu_command_buffer.cc
@@ -23,6 +23,7 @@ limitations under the License.
 #include "absl/log/log.h"
 #include "absl/status/status.h"
 #include "absl/strings/str_cat.h"
+#include "xla/stream_executor/command_buffer.h"
 #include "xla/stream_executor/gpu/gpu_driver.h"
 #include "xla/stream_executor/gpu/gpu_executor.h"
 #include "xla/stream_executor/gpu/gpu_kernel.h"
@@ -65,8 +66,9 @@ static int64_t NotifyExecDestroyed() {
 // GpuCommandBuffer implementation
 //===----------------------------------------------------------------------===//
 
-GpuCommandBuffer::GpuCommandBuffer(GpuExecutor* parent, GpuGraphHandle graph)
-    : parent_(parent), graph_(graph), exec_(nullptr) {}
+GpuCommandBuffer::GpuCommandBuffer(CommandBuffer::Mode mode,
+                                   GpuExecutor* parent, GpuGraphHandle graph)
+    : mode_(mode), parent_(parent), graph_(graph), exec_(nullptr) {}
 
 GpuCommandBuffer::~GpuCommandBuffer() {
   if (exec_ != nullptr) {
@@ -119,10 +121,10 @@ tsl::Status GpuCommandBuffer::Trace(
 }
 
 tsl::Status GpuCommandBuffer::CheckNotFinalized() {
-  if (exec_ == nullptr) return tsl::OkStatus();
-
-  return absl::InternalError(
-      "Command buffer can't be updated after it was finalized");
+  if (finalized_)
+    return absl::InternalError(
+        "Command buffer can't be updated after it was finalized");
+  return tsl::OkStatus();
 }
 
 tsl::Status GpuCommandBuffer::Launch(const ThreadDim& threads,
@@ -161,16 +163,23 @@ tsl::Status GpuCommandBuffer::MemcpyDeviceToDevice(DeviceMemoryBase* dst,
 tsl::Status GpuCommandBuffer::Finalize() {
   TF_RETURN_IF_ERROR(CheckNotFinalized());
 
-  GpuDriver::GraphInstantiateFlags flags;
+  if (mode_ == CommandBuffer::Mode::kPrimary) {
+    GpuDriver::GraphInstantiateFlags flags;
 
-  uint64_t start_nanos = tsl::Env::Default()->NowNanos();
-  TF_RETURN_IF_ERROR(GpuDriver::GraphInstantiate(&exec_, graph_, flags));
-  uint64_t end_nanos = tsl::Env::Default()->NowNanos();
+    uint64_t start_nanos = tsl::Env::Default()->NowNanos();
+    TF_RETURN_IF_ERROR(GpuDriver::GraphInstantiate(&exec_, graph_, flags));
+    uint64_t end_nanos = tsl::Env::Default()->NowNanos();
+
+    VLOG(5) << "Instantiated executable graph #" << NotifyExecCreated()
+            << " in " << (end_nanos - start_nanos) / 1000
+            << " μs (alive executable graphs: " << AliveExecs() << ")";
 
-  VLOG(5) << "Instantiated executable graph #" << NotifyExecCreated() << " in "
-          << (end_nanos - start_nanos) / 1000
-          << " μs (alive executable graphs: " << AliveExecs() << ")";
+  } else {
+    VLOG(5) << "Finalize nested command buffer without instantiating "
+               "executable graph";
+  }
 
+  finalized_ = true;
   return tsl::OkStatus();
 }
 
diff --git a/third_party/xla/xla/stream_executor/gpu/gpu_command_buffer.h b/third_party/xla/xla/stream_executor/gpu/gpu_command_buffer.h
index 8680f0966348bf..1121dfe21eb80e 100644
--- a/third_party/xla/xla/stream_executor/gpu/gpu_command_buffer.h
+++ b/third_party/xla/xla/stream_executor/gpu/gpu_command_buffer.h
@@ -34,7 +34,8 @@ namespace stream_executor::gpu {
 // implementation (it's backed by CUDA or HIP graphs on NVIDIA and AMD devices).
 class GpuCommandBuffer : public internal::CommandBufferInterface {
  public:
-  GpuCommandBuffer(GpuExecutor* parent, GpuGraphHandle graph);
+  GpuCommandBuffer(CommandBuffer::Mode mode, GpuExecutor* parent,
+                   GpuGraphHandle graph);
   ~GpuCommandBuffer() override;
 
   tsl::Status Trace(Stream* stream,
@@ -48,6 +49,8 @@ class GpuCommandBuffer : public internal::CommandBufferInterface {
                                    const DeviceMemoryBase& src,
                                    uint64_t size) override;
 
+  CommandBuffer::Mode mode() const override { return mode_; }
+
   tsl::Status Finalize() override;
 
   GpuGraphExecHandle executable() const { return exec_; }
@@ -80,6 +83,9 @@ class GpuCommandBuffer : public internal::CommandBufferInterface {
   static_assert(std::is_pointer_v<GpuGraphExecHandle>,
                 "GpuGraphExecHandle must be a pointer");
 
+  CommandBuffer::Mode mode_;
+  bool finalized_ = false;
+
   GpuExecutor* parent_;                // not owned, must outlive *this
   GpuGraphHandle graph_ = nullptr;     // owned handle
   GpuGraphExecHandle exec_ = nullptr;  // owned handle
diff --git a/third_party/xla/xla/stream_executor/gpu/gpu_executor.h b/third_party/xla/xla/stream_executor/gpu/gpu_executor.h
index 88574f63db3c2c..6eedbfcc307089 100644
--- a/third_party/xla/xla/stream_executor/gpu/gpu_executor.h
+++ b/third_party/xla/xla/stream_executor/gpu/gpu_executor.h
@@ -259,7 +259,7 @@ class GpuExecutor : public internal::StreamExecutorInterface {
   std::unique_ptr<internal::StreamInterface> GetStreamImplementation() override;
 
   tsl::StatusOr<std::unique_ptr<internal::CommandBufferInterface>>
-  GetCommandBufferImplementation() override;
+  GetCommandBufferImplementation(CommandBuffer::Mode mode) override;
 
   void* GpuContextHack() override;
 
diff --git a/third_party/xla/xla/stream_executor/stream_executor_internal.h b/third_party/xla/xla/stream_executor/stream_executor_internal.h
index 1464348b3688fe..dceea1a64434dc 100644
--- a/third_party/xla/xla/stream_executor/stream_executor_internal.h
+++ b/third_party/xla/xla/stream_executor/stream_executor_internal.h
@@ -160,6 +160,9 @@ class CommandBufferInterface {
   // finalized no commands can be added to it.
   virtual tsl::Status Finalize() = 0;
 
+  // Returns command buffer execution mode.
+  virtual CommandBuffer::Mode mode() const = 0;
+
  private:
   SE_DISALLOW_COPY_AND_ASSIGN(CommandBufferInterface);
 };
@@ -401,7 +404,7 @@ class StreamExecutorInterface {
   virtual std::unique_ptr<StreamInterface> GetStreamImplementation() = 0;
 
   virtual tsl::StatusOr<std::unique_ptr<CommandBufferInterface>>
-  GetCommandBufferImplementation() {
+  GetCommandBufferImplementation(CommandBuffer::Mode mode) {
     return absl::UnimplementedError("Command buffers are not implemented");
   }
 

From 553d18dd5bff575143c0b8bba45bb8805dff0d87 Mon Sep 17 00:00:00 2001
From: Christian Sigg <csigg@google.com>
Date: Tue, 3 Oct 2023 22:26:56 -0700
Subject: [PATCH 560/567] [stream executor] Disallow moving
 `ScopedActivateExecutorContext`. This is too dangerous for a class that
 expects strictly nested lifetimes.

Note: `cudnnSetStream()` does not require an active CUDA context, so activating the context later (in the handle's c'tor) is fine.
PiperOrigin-RevId: 570589531
---
 third_party/xla/xla/stream_executor/cuda/cuda_dnn.cc | 12 +++++-------
 .../xla/xla/stream_executor/gpu/gpu_activation.cc    |  6 ------
 .../xla/xla/stream_executor/gpu/gpu_activation.h     |  2 --
 third_party/xla/xla/stream_executor/rocm/rocm_dnn.cc |  9 ++++-----
 4 files changed, 9 insertions(+), 20 deletions(-)

diff --git a/third_party/xla/xla/stream_executor/cuda/cuda_dnn.cc b/third_party/xla/xla/stream_executor/cuda/cuda_dnn.cc
index a7519e660dcbe5..c96ee57438496d 100644
--- a/third_party/xla/xla/stream_executor/cuda/cuda_dnn.cc
+++ b/third_party/xla/xla/stream_executor/cuda/cuda_dnn.cc
@@ -166,11 +166,10 @@ std::string CudnnStatusToString(cudnnStatus_t status) {
 // See CudnnAccess::GetHandle() for details.
 class CudnnHandle {
  public:
-  // Takes ownership of the executor context and the lock to access cuDNN
-  // using handle.
-  CudnnHandle(gpu::ScopedActivateExecutorContext context,
-              std::unique_ptr<absl::MutexLock> lock, cudnnHandle_t handle)
-      : context_(std::move(context)), lock_(std::move(lock)), handle_(handle) {}
+  // Takes ownership of the lock to access cuDNN using handle.
+  CudnnHandle(GpuExecutor* executor, std::unique_ptr<absl::MutexLock> lock,
+              cudnnHandle_t handle)
+      : context_(executor), lock_(std::move(lock)), handle_(handle) {}
 
   // Returns cuDNN handle. To be passed directly to cuDNN APIs, don't keep
   // a copy.
@@ -218,14 +217,13 @@ class CudnnAccess {
   CudnnHandle GetHandle(GpuExecutor* executor, Stream* stream) {
     auto lock = std::make_unique<absl::MutexLock>(&mutex_);
     mutex_.AssertHeld();
-    gpu::ScopedActivateExecutorContext context(executor);
     CUstream cu_stream = stream ? AsGpuStreamValue(stream) : cudaStreamLegacy;
     if (!current_stream_ || cu_stream != *current_stream_) {
       current_stream_ = cu_stream;
       const auto status = cudnnSetStream(handle_, cu_stream);
       CHECK_EQ(status, CUDNN_STATUS_SUCCESS) << "Failed to set cuDNN stream.";
     }
-    return CudnnHandle(std::move(context), std::move(lock), handle_);
+    return CudnnHandle(executor, std::move(lock), handle_);
   }
 
   void NotifyStreamDestroyed(Stream* stream) {
diff --git a/third_party/xla/xla/stream_executor/gpu/gpu_activation.cc b/third_party/xla/xla/stream_executor/gpu/gpu_activation.cc
index baa73e782b65d9..4a4704fb539b05 100644
--- a/third_party/xla/xla/stream_executor/gpu/gpu_activation.cc
+++ b/third_party/xla/xla/stream_executor/gpu/gpu_activation.cc
@@ -38,11 +38,5 @@ ScopedActivateExecutorContext::~ScopedActivateExecutorContext() {
   delete static_cast<ScopedActivateContext*>(driver_scoped_activate_context_);
 }
 
-ScopedActivateExecutorContext::ScopedActivateExecutorContext(
-    ScopedActivateExecutorContext&& other)
-    : driver_scoped_activate_context_(other.driver_scoped_activate_context_) {
-  other.driver_scoped_activate_context_ = nullptr;
-}
-
 }  // namespace gpu
 }  // namespace stream_executor
diff --git a/third_party/xla/xla/stream_executor/gpu/gpu_activation.h b/third_party/xla/xla/stream_executor/gpu/gpu_activation.h
index e5da9b5357255b..34456b729b59c2 100644
--- a/third_party/xla/xla/stream_executor/gpu/gpu_activation.h
+++ b/third_party/xla/xla/stream_executor/gpu/gpu_activation.h
@@ -44,8 +44,6 @@ class ScopedActivateExecutorContext {
   // fatal failure if it is not CUDA inside.
   explicit ScopedActivateExecutorContext(StreamExecutor* stream_exec);
 
-  ScopedActivateExecutorContext(ScopedActivateExecutorContext&& other);
-
   ~ScopedActivateExecutorContext();
 
  private:
diff --git a/third_party/xla/xla/stream_executor/rocm/rocm_dnn.cc b/third_party/xla/xla/stream_executor/rocm/rocm_dnn.cc
index 77efe5126c257c..329dbfbe1ee7e1 100644
--- a/third_party/xla/xla/stream_executor/rocm/rocm_dnn.cc
+++ b/third_party/xla/xla/stream_executor/rocm/rocm_dnn.cc
@@ -195,9 +195,9 @@ class MIOpenHandle {
  public:
   // Takes ownership of the executor context and the lock to access MIOpen
   // using handle.
-  MIOpenHandle(gpu::ScopedActivateExecutorContext context,
-               std::unique_ptr<absl::MutexLock> lock, miopenHandle_t handle)
-      : context_(std::move(context)), lock_(std::move(lock)), handle_(handle) {}
+  MIOpenHandle(GpuExecutor* executor, std::unique_ptr<absl::MutexLock> lock,
+               miopenHandle_t handle)
+      : context_(executor), lock_(std::move(lock)), handle_(handle) {}
 
   // Returns MIOpen handle. To be passed directly to MIOpen APIs, don't keep
   // a copy.
@@ -690,11 +690,10 @@ class MIOpenAccess {
   MIOpenHandle GetHandle(GpuExecutor* executor, Stream* stream) {
     auto lock = std::make_unique<absl::MutexLock>(&mutex_);
     mutex_.AssertHeld();
-    gpu::ScopedActivateExecutorContext context(executor);
     hipStream_t hip_stream = stream ? AsGpuStreamValue(stream) : nullptr;
     auto status = wrap::miopenSetStream(handle_, hip_stream);
     CHECK_EQ(status, miopenStatusSuccess) << "Failed to set MIOpen stream.";
-    return MIOpenHandle(std::move(context), std::move(lock), handle_);
+    return MIOpenHandle(executor, std::move(lock), handle_);
   }
 
  private:

From 1c7d739b88822079abf68a899c61eb01e5ccaacb Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 3 Oct 2023 22:29:46 -0700
Subject: [PATCH 561/567] Fixes build issue for Windows with Google build
 system.

PiperOrigin-RevId: 570589957
---
 third_party/xla/third_party/tsl/tsl/platform/regexp.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/third_party/xla/third_party/tsl/tsl/platform/regexp.h b/third_party/xla/third_party/tsl/tsl/platform/regexp.h
index 89551033dbc1ad..7a5e94e7048353 100644
--- a/third_party/xla/third_party/tsl/tsl/platform/regexp.h
+++ b/third_party/xla/third_party/tsl/tsl/platform/regexp.h
@@ -18,9 +18,9 @@ limitations under the License.
 
 #include "tsl/platform/platform.h"
 
-
-#if defined(PLATFORM_GOOGLE) || defined(PLATFORM_GOOGLE_ANDROID) || \
-    defined(PLATFORM_GOOGLE_IOS) || defined(PLATFORM_PORTABLE_GOOGLE)
+#if defined(PLATFORM_GOOGLE) || defined(PLATFORM_GOOGLE_ANDROID) ||      \
+    defined(PLATFORM_GOOGLE_IOS) || defined(PLATFORM_PORTABLE_GOOGLE) || \
+    (defined(__clang__) && defined(PLATFORM_WINDOWS))
 #include "third_party/re2/re2.h"
 #else
 #include "re2/re2.h"

From a6340f8c77623df0bed53ad6aee9fc193c8e6758 Mon Sep 17 00:00:00 2001
From: Francois Chollet <fchollet@google.com>
Date: Tue, 3 Oct 2023 23:13:05 -0700
Subject: [PATCH 562/567] Fix minor issue in which, if tf_keras is installed
 and TF_USE_LEGACY_KERAS is set, doing `from tensorflow.compat.v2.keras import
 x` would pull from Keras 2 rather than tf_keras.

PiperOrigin-RevId: 570596346
---
 tensorflow/compat_template.__init__.py | 7 ++++++-
 tensorflow/python/util/lazy_loader.py  | 2 +-
 2 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/tensorflow/compat_template.__init__.py b/tensorflow/compat_template.__init__.py
index 814b38d53584dd..9d2f954293eddc 100644
--- a/tensorflow/compat_template.__init__.py
+++ b/tensorflow/compat_template.__init__.py
@@ -50,8 +50,13 @@
 setattr(_current_module, "estimator", estimator)
 
 # Lazy load Keras v2
+_tf_uses_legacy_keras = (
+    _os.environ.get("TF_USE_LEGACY_KERAS", None) in ("true", "True", "1"))
 setattr(_current_module, "keras", _KerasLazyLoader(globals(), mode="v2"))
-_module_dir = _module_util.get_parent_dir_for_name("keras.api._v2.keras")
+if _tf_uses_legacy_keras:
+  _module_dir = _module_util.get_parent_dir_for_name("tf_keras.api._v2.keras")
+else:
+  _module_dir = _module_util.get_parent_dir_for_name("keras.api._v2.keras")
 _current_module.__path__ = [_module_dir] + _current_module.__path__
 
 
diff --git a/tensorflow/python/util/lazy_loader.py b/tensorflow/python/util/lazy_loader.py
index bfd0372bc99c85..08e0fb0609b4d5 100644
--- a/tensorflow/python/util/lazy_loader.py
+++ b/tensorflow/python/util/lazy_loader.py
@@ -173,7 +173,7 @@ def __getattr__(self, item):
   def __repr__(self):
     if self._initialized:
       return (f"<KerasLazyLoader ({self._keras_version}) "
-              f"{self.__name__} as {self._local_name}")
+              f"{self.__name__} as {self._local_name} mode={self._mode}>")
     return "<KerasLazyLoader>"
 
   def __dir__(self):

From b9db6fe2aafe4f9718ef4e5a40650b70c9a0b9d9 Mon Sep 17 00:00:00 2001
From: Christian Sigg <csigg@google.com>
Date: Tue, 3 Oct 2023 23:55:22 -0700
Subject: [PATCH 563/567] NFC: replace uses of
 TF_DISALLOW_COPY_AND_ASSIGN/SE_DISALLOW_COPY_AND_ASSIGN macros with
 implementation.

PiperOrigin-RevId: 570602800
---
 .../compiler/tf2xla/kernels/aggregate_ops.cc  |  3 +-
 .../compiler/tf2xla/kernels/all_reduce_op.cc  |  3 +-
 .../compiler/tf2xla/kernels/approx_topk_op.cc |  3 +-
 tensorflow/compiler/tf2xla/kernels/arg_op.cc  |  3 +-
 .../compiler/tf2xla/kernels/assert_op.cc      |  3 +-
 .../compiler/tf2xla/kernels/bcast_ops.cc      |  6 ++-
 tensorflow/compiler/tf2xla/kernels/case_op.h  |  3 +-
 tensorflow/compiler/tf2xla/kernels/cast_op.cc |  6 ++-
 .../compiler/tf2xla/kernels/categorical_op.cc |  6 ++-
 .../compiler/tf2xla/kernels/const_op.cc       |  3 +-
 .../compiler/tf2xla/kernels/conv_ops.cc       |  9 +++--
 .../compiler/tf2xla/kernels/cross_op.cc       |  3 +-
 .../tf2xla/kernels/data_format_ops.cc         |  6 ++-
 .../kernels/extract_image_patches_op.cc       |  3 +-
 .../compiler/tf2xla/kernels/fake_param_op.cc  |  3 +-
 tensorflow/compiler/tf2xla/kernels/fft_ops.cc |  3 +-
 .../compiler/tf2xla/kernels/gather_op.cc      |  3 +-
 .../compiler/tf2xla/kernels/identity_op.cc    |  3 +-
 tensorflow/compiler/tf2xla/kernels/if_op.h    |  3 +-
 .../compiler/tf2xla/kernels/in_topk_op.cc     |  3 +-
 .../tf2xla/kernels/matrix_diag_ops.cc         |  3 +-
 .../tf2xla/kernels/matrix_inverse_op.cc       |  3 +-
 .../tf2xla/kernels/matrix_solve_op.cc         |  3 +-
 .../compiler/tf2xla/kernels/mirror_pad_op.cc  |  6 ++-
 .../compiler/tf2xla/kernels/one_hot_op.cc     |  3 +-
 .../compiler/tf2xla/kernels/random_ops.cc     |  9 +++--
 .../compiler/tf2xla/kernels/retval_op.cc      |  3 +-
 tensorflow/compiler/tf2xla/kernels/roll_op.cc |  3 +-
 .../compiler/tf2xla/kernels/select_op.cc      |  6 ++-
 .../compiler/tf2xla/kernels/sendrecv_ops.cc   |  6 ++-
 .../compiler/tf2xla/kernels/sharding_op.cc    |  3 +-
 .../kernels/spmd_manual_sharding_ops.cc       |  6 ++-
 .../compiler/tf2xla/kernels/stack_ops.cc      | 12 ++++--
 .../tf2xla/kernels/stateful_random_ops.cc     | 18 ++++++---
 .../tf2xla/kernels/stateless_random_ops.cc    | 20 +++++++---
 .../tf2xla/kernels/stateless_random_ops_v2.cc | 25 ++++++++----
 .../tf2xla/kernels/tensor_array_ops.cc        | 30 +++++++++-----
 .../tf2xla/kernels/tensor_list_ops.cc         | 39 ++++++++++++-------
 .../compiler/tf2xla/kernels/tile_ops.cc       |  3 +-
 tensorflow/compiler/tf2xla/kernels/while_op.h |  3 +-
 .../tf2xla/kernels/xla_broadcast_helper_op.cc |  3 +-
 .../tf2xla/kernels/xla_dequantize_op.cc       |  3 +-
 .../compiler/tf2xla/kernels/xla_dot_op.cc     |  6 ++-
 .../compiler/tf2xla/kernels/xla_pad_op.cc     |  3 +-
 tensorflow/compiler/tf2xla/tf2xla_util.h      |  3 +-
 tensorflow/compiler/tf2xla/xla_compiler.h     |  3 +-
 tensorflow/compiler/tf2xla/xla_context.h      |  3 +-
 47 files changed, 205 insertions(+), 101 deletions(-)

diff --git a/tensorflow/compiler/tf2xla/kernels/aggregate_ops.cc b/tensorflow/compiler/tf2xla/kernels/aggregate_ops.cc
index 606f7c28e1db8b..0daa74bf112f29 100644
--- a/tensorflow/compiler/tf2xla/kernels/aggregate_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/aggregate_ops.cc
@@ -88,7 +88,8 @@ class AddNOp : public XlaOpKernel {
   }
 
  private:
-  TF_DISALLOW_COPY_AND_ASSIGN(AddNOp);
+  AddNOp(const AddNOp&) = delete;
+  void operator=(const AddNOp&) = delete;
 };
 
 REGISTER_XLA_OP(Name("AddN").AllowVariantTypes(), AddNOp);
diff --git a/tensorflow/compiler/tf2xla/kernels/all_reduce_op.cc b/tensorflow/compiler/tf2xla/kernels/all_reduce_op.cc
index 49269752a57695..eb0250db1feea9 100644
--- a/tensorflow/compiler/tf2xla/kernels/all_reduce_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/all_reduce_op.cc
@@ -85,7 +85,8 @@ class CollectiveReduceV2Op : public XlaOpKernel {
   string final_op_name_;
   string communication_hint_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(CollectiveReduceV2Op);
+  CollectiveReduceV2Op(const CollectiveReduceV2Op&) = delete;
+  void operator=(const CollectiveReduceV2Op&) = delete;
 };
 
 class CollectiveAssignGroupV2Op : public XlaOpKernel {
diff --git a/tensorflow/compiler/tf2xla/kernels/approx_topk_op.cc b/tensorflow/compiler/tf2xla/kernels/approx_topk_op.cc
index f217482df438c8..0b2711d7d0158e 100644
--- a/tensorflow/compiler/tf2xla/kernels/approx_topk_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/approx_topk_op.cc
@@ -109,7 +109,8 @@ class ApproxTopKOpBase : public XlaOpKernel {
   int64_t reduction_input_size_override_;
   bool aggregate_to_topk_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(ApproxTopKOpBase);
+  ApproxTopKOpBase(const ApproxTopKOpBase&) = delete;
+  void operator=(const ApproxTopKOpBase&) = delete;
 };
 
 class TpuApproxTopKOp : public ApproxTopKOpBase {
diff --git a/tensorflow/compiler/tf2xla/kernels/arg_op.cc b/tensorflow/compiler/tf2xla/kernels/arg_op.cc
index 7b74c19d0f6553..785db26c8f26d0 100644
--- a/tensorflow/compiler/tf2xla/kernels/arg_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/arg_op.cc
@@ -110,7 +110,8 @@ class XlaArgOp : public XlaOpKernel {
   int index_;
   DataType dtype_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(XlaArgOp);
+  XlaArgOp(const XlaArgOp&) = delete;
+  void operator=(const XlaArgOp&) = delete;
 };
 
 REGISTER_XLA_OP(
diff --git a/tensorflow/compiler/tf2xla/kernels/assert_op.cc b/tensorflow/compiler/tf2xla/kernels/assert_op.cc
index c1c14d7dcafb04..8a863ea978d4b6 100644
--- a/tensorflow/compiler/tf2xla/kernels/assert_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/assert_op.cc
@@ -40,7 +40,8 @@ class AssertOp : public XlaOpKernel {
   }
 
  private:
-  TF_DISALLOW_COPY_AND_ASSIGN(AssertOp);
+  AssertOp(const AssertOp&) = delete;
+  void operator=(const AssertOp&) = delete;
 };
 
 REGISTER_XLA_OP(Name("Assert").CompilationOnly(), AssertOp);
diff --git a/tensorflow/compiler/tf2xla/kernels/bcast_ops.cc b/tensorflow/compiler/tf2xla/kernels/bcast_ops.cc
index 7cfec82abadd73..25ca7b8c20688f 100644
--- a/tensorflow/compiler/tf2xla/kernels/bcast_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/bcast_ops.cc
@@ -72,7 +72,8 @@ class BCastArgsOp : public XlaOpKernel {
   }
 
  private:
-  TF_DISALLOW_COPY_AND_ASSIGN(BCastArgsOp);
+  BCastArgsOp(const BCastArgsOp&) = delete;
+  void operator=(const BCastArgsOp&) = delete;
 };
 REGISTER_XLA_OP(Name("BroadcastArgs")
                     .CompileTimeConstantInput("s0")
@@ -133,7 +134,8 @@ class BCastGradArgsOp : public XlaOpKernel {
     ctx->SetConstantOutput(idx, constant);
   }
 
-  TF_DISALLOW_COPY_AND_ASSIGN(BCastGradArgsOp);
+  BCastGradArgsOp(const BCastGradArgsOp&) = delete;
+  void operator=(const BCastGradArgsOp&) = delete;
 };
 
 REGISTER_XLA_OP(Name("BroadcastGradientArgs")
diff --git a/tensorflow/compiler/tf2xla/kernels/case_op.h b/tensorflow/compiler/tf2xla/kernels/case_op.h
index 1aa642285913cb..a4c01bea65a04d 100644
--- a/tensorflow/compiler/tf2xla/kernels/case_op.h
+++ b/tensorflow/compiler/tf2xla/kernels/case_op.h
@@ -49,7 +49,8 @@ class XlaCaseOp : public XlaOpKernel {
   void Compile(XlaOpKernelContext* ctx) override;
 
  private:
-  TF_DISALLOW_COPY_AND_ASSIGN(XlaCaseOp);
+  XlaCaseOp(const XlaCaseOp&) = delete;
+  void operator=(const XlaCaseOp&) = delete;
 
   // If the branch_index input is a constant: prunes out all but the branch
   // corrresponding to that constant branch index, and returns that branch and
diff --git a/tensorflow/compiler/tf2xla/kernels/cast_op.cc b/tensorflow/compiler/tf2xla/kernels/cast_op.cc
index 85aa7e438aab70..67452a81f42dfd 100644
--- a/tensorflow/compiler/tf2xla/kernels/cast_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/cast_op.cc
@@ -97,7 +97,8 @@ class CastOp : public XlaOpKernel {
   xla::PrimitiveType src_type_, dst_type_;
   bool use_truncation_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(CastOp);
+  CastOp(const CastOp&) = delete;
+  void operator=(const CastOp&) = delete;
 };
 
 REGISTER_XLA_OP(Name("Cast"), CastOp);
@@ -142,7 +143,8 @@ class BitcastOp : public XlaOpKernel {
   DataType src_dtype_, dst_dtype_;
   xla::PrimitiveType src_type_, dst_type_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(BitcastOp);
+  BitcastOp(const BitcastOp&) = delete;
+  void operator=(const BitcastOp&) = delete;
 };
 
 REGISTER_XLA_OP(Name("Bitcast"), BitcastOp);
diff --git a/tensorflow/compiler/tf2xla/kernels/categorical_op.cc b/tensorflow/compiler/tf2xla/kernels/categorical_op.cc
index bb16fd1dad8e6b..ec0d3d3b7d3fec 100644
--- a/tensorflow/compiler/tf2xla/kernels/categorical_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/categorical_op.cc
@@ -139,7 +139,8 @@ class CategoricalOp : public XlaOpKernel {
   }
 
  private:
-  TF_DISALLOW_COPY_AND_ASSIGN(CategoricalOp);
+  CategoricalOp(const CategoricalOp&) = delete;
+  void operator=(const CategoricalOp&) = delete;
 };
 
 // TODO(b/68769717): Rename this sampler to Categorical.
@@ -184,7 +185,8 @@ class StatelessCategoricalOp : public CategoricalOp {
   DataType dtype_;
   string device_type_string_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(StatelessCategoricalOp);
+  StatelessCategoricalOp(const StatelessCategoricalOp&) = delete;
+  void operator=(const StatelessCategoricalOp&) = delete;
 };
 
 REGISTER_XLA_OP(Name("StatelessMultinomial")
diff --git a/tensorflow/compiler/tf2xla/kernels/const_op.cc b/tensorflow/compiler/tf2xla/kernels/const_op.cc
index 2624d006926374..aec9a54fa610f0 100644
--- a/tensorflow/compiler/tf2xla/kernels/const_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/const_op.cc
@@ -134,7 +134,8 @@ class ConstOp : public XlaOpKernel {
 
  private:
   TensorProto proto_;
-  TF_DISALLOW_COPY_AND_ASSIGN(ConstOp);
+  ConstOp(const ConstOp&) = delete;
+  void operator=(const ConstOp&) = delete;
 };
 
 // XLA_* devices also register a "real" Const operator so we suppress the
diff --git a/tensorflow/compiler/tf2xla/kernels/conv_ops.cc b/tensorflow/compiler/tf2xla/kernels/conv_ops.cc
index 998d0b4a0b81f3..babc53d7f5bcb0 100644
--- a/tensorflow/compiler/tf2xla/kernels/conv_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/conv_ops.cc
@@ -66,7 +66,8 @@ class ConvOp : public XlaOpKernel {
   ConvOpAttrs attrs_;
 
  private:
-  TF_DISALLOW_COPY_AND_ASSIGN(ConvOp);
+  ConvOp(const ConvOp&) = delete;
+  void operator=(const ConvOp&) = delete;
 };
 
 class ConvNDOp : public XlaOpKernel {
@@ -202,7 +203,8 @@ class ConvBackpropInputOp : public XlaOpKernel {
   ConvOpAttrs attrs_;
 
  private:
-  TF_DISALLOW_COPY_AND_ASSIGN(ConvBackpropInputOp);
+  ConvBackpropInputOp(const ConvBackpropInputOp&) = delete;
+  void operator=(const ConvBackpropInputOp&) = delete;
 };
 
 class Conv2DBackpropInputOp : public ConvBackpropInputOp {
@@ -262,7 +264,8 @@ class ConvBackpropFilterOp : public XlaOpKernel {
   ConvOpAttrs attrs_;
 
  private:
-  TF_DISALLOW_COPY_AND_ASSIGN(ConvBackpropFilterOp);
+  ConvBackpropFilterOp(const ConvBackpropFilterOp&) = delete;
+  void operator=(const ConvBackpropFilterOp&) = delete;
 };
 
 class Conv2DBackpropFilterOp : public ConvBackpropFilterOp {
diff --git a/tensorflow/compiler/tf2xla/kernels/cross_op.cc b/tensorflow/compiler/tf2xla/kernels/cross_op.cc
index e5ccdd5f8c6f06..1dfd1b5f208647 100644
--- a/tensorflow/compiler/tf2xla/kernels/cross_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/cross_op.cc
@@ -83,7 +83,8 @@ class CrossOp : public XlaOpKernel {
   }
 
  private:
-  TF_DISALLOW_COPY_AND_ASSIGN(CrossOp);
+  CrossOp(const CrossOp&) = delete;
+  void operator=(const CrossOp&) = delete;
 };
 
 REGISTER_XLA_OP(Name("Cross"), CrossOp);
diff --git a/tensorflow/compiler/tf2xla/kernels/data_format_ops.cc b/tensorflow/compiler/tf2xla/kernels/data_format_ops.cc
index bfca2525e67053..992b8bb3387366 100644
--- a/tensorflow/compiler/tf2xla/kernels/data_format_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/data_format_ops.cc
@@ -79,7 +79,8 @@ class DataFormatDimMapOp : public XlaOpKernel {
  private:
   std::vector<int32> dst_idx_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(DataFormatDimMapOp);
+  DataFormatDimMapOp(const DataFormatDimMapOp&) = delete;
+  void operator=(const DataFormatDimMapOp&) = delete;
 };
 
 REGISTER_XLA_OP(
@@ -178,7 +179,8 @@ class DataFormatVecPermuteOp : public XlaOpKernel {
   string src_format_;
   string dst_format_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(DataFormatVecPermuteOp);
+  DataFormatVecPermuteOp(const DataFormatVecPermuteOp&) = delete;
+  void operator=(const DataFormatVecPermuteOp&) = delete;
 };
 
 REGISTER_XLA_OP(
diff --git a/tensorflow/compiler/tf2xla/kernels/extract_image_patches_op.cc b/tensorflow/compiler/tf2xla/kernels/extract_image_patches_op.cc
index 49761a3e48551a..069c430d348c84 100644
--- a/tensorflow/compiler/tf2xla/kernels/extract_image_patches_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/extract_image_patches_op.cc
@@ -178,7 +178,8 @@ class ExtractImagePatchesOp : public XlaOpKernel {
   Padding padding_;
 
  private:
-  TF_DISALLOW_COPY_AND_ASSIGN(ExtractImagePatchesOp);
+  ExtractImagePatchesOp(const ExtractImagePatchesOp&) = delete;
+  void operator=(const ExtractImagePatchesOp&) = delete;
 };
 
 // We don't support integers for the convolution for GPU used in the
diff --git a/tensorflow/compiler/tf2xla/kernels/fake_param_op.cc b/tensorflow/compiler/tf2xla/kernels/fake_param_op.cc
index 13abc70a27442b..b8f7b19a3d4b1f 100644
--- a/tensorflow/compiler/tf2xla/kernels/fake_param_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/fake_param_op.cc
@@ -45,7 +45,8 @@ class XlaFakeParamOp : public XlaOpKernel {
  private:
   xla::Shape shape_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(XlaFakeParamOp);
+  XlaFakeParamOp(const XlaFakeParamOp&) = delete;
+  void operator=(const XlaFakeParamOp&) = delete;
 };
 
 REGISTER_XLA_OP(Name("FakeParam"), XlaFakeParamOp);
diff --git a/tensorflow/compiler/tf2xla/kernels/fft_ops.cc b/tensorflow/compiler/tf2xla/kernels/fft_ops.cc
index 4021da5fbd2d04..561d2e89a245e9 100644
--- a/tensorflow/compiler/tf2xla/kernels/fft_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/fft_ops.cc
@@ -109,7 +109,8 @@ class GenericFftOp : public XlaOpKernel {
   const int fft_rank_;
 
  private:
-  TF_DISALLOW_COPY_AND_ASSIGN(GenericFftOp);
+  GenericFftOp(const GenericFftOp&) = delete;
+  void operator=(const GenericFftOp&) = delete;
 };
 
 template <int FFTRank>
diff --git a/tensorflow/compiler/tf2xla/kernels/gather_op.cc b/tensorflow/compiler/tf2xla/kernels/gather_op.cc
index 3d5b9067486d97..59d196c351c99b 100644
--- a/tensorflow/compiler/tf2xla/kernels/gather_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/gather_op.cc
@@ -261,7 +261,8 @@ class GatherOp : public XlaOpKernel {
   }
 
  private:
-  TF_DISALLOW_COPY_AND_ASSIGN(GatherOp);
+  GatherOp(const GatherOp&) = delete;
+  void operator=(const GatherOp&) = delete;
 
   // The number of batch dimensions, as passed in the batch_dims attribute.
   // It must be less than or equal to rank(indices).
diff --git a/tensorflow/compiler/tf2xla/kernels/identity_op.cc b/tensorflow/compiler/tf2xla/kernels/identity_op.cc
index fa2eaa844e5c3a..845c22dd4b2008 100644
--- a/tensorflow/compiler/tf2xla/kernels/identity_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/identity_op.cc
@@ -40,7 +40,8 @@ class IdentityOp : public XlaOpKernel {
   }
 
  private:
-  TF_DISALLOW_COPY_AND_ASSIGN(IdentityOp);
+  IdentityOp(const IdentityOp&) = delete;
+  void operator=(const IdentityOp&) = delete;
 };
 
 // XLA_* devices also register a "real" Identity operator so we suppress the
diff --git a/tensorflow/compiler/tf2xla/kernels/if_op.h b/tensorflow/compiler/tf2xla/kernels/if_op.h
index 11b196f939ec82..ac596a29b28ca9 100644
--- a/tensorflow/compiler/tf2xla/kernels/if_op.h
+++ b/tensorflow/compiler/tf2xla/kernels/if_op.h
@@ -47,7 +47,8 @@ class XlaIfOp : public XlaOpKernel {
   void Compile(XlaOpKernelContext* ctx) override;
 
  private:
-  TF_DISALLOW_COPY_AND_ASSIGN(XlaIfOp);
+  XlaIfOp(const XlaIfOp&) = delete;
+  void operator=(const XlaIfOp&) = delete;
 
   NameAttrList then_branch_;
   NameAttrList else_branch_;
diff --git a/tensorflow/compiler/tf2xla/kernels/in_topk_op.cc b/tensorflow/compiler/tf2xla/kernels/in_topk_op.cc
index 55dc39cd5a342c..50e22ed008deb7 100644
--- a/tensorflow/compiler/tf2xla/kernels/in_topk_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/in_topk_op.cc
@@ -104,7 +104,8 @@ class InTopKOp : public XlaOpKernel {
   DataType targets_dtype_;
   xla::PrimitiveType targets_type_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(InTopKOp);
+  InTopKOp(const InTopKOp&) = delete;
+  void operator=(const InTopKOp&) = delete;
 };
 
 REGISTER_XLA_OP(Name("InTopKV2")
diff --git a/tensorflow/compiler/tf2xla/kernels/matrix_diag_ops.cc b/tensorflow/compiler/tf2xla/kernels/matrix_diag_ops.cc
index bd65ed58010035..f7e8dab3d120ff 100644
--- a/tensorflow/compiler/tf2xla/kernels/matrix_diag_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/matrix_diag_ops.cc
@@ -540,7 +540,8 @@ class MatrixSetDiagOp : public XlaOpKernel {
   bool left_align_superdiagonal_ = true;
   bool left_align_subdiagonal_ = true;
   static constexpr int kNumV1Inputs = 2;
-  TF_DISALLOW_COPY_AND_ASSIGN(MatrixSetDiagOp);
+  MatrixSetDiagOp(const MatrixSetDiagOp&) = delete;
+  void operator=(const MatrixSetDiagOp&) = delete;
 };
 
 REGISTER_XLA_OP(Name("MatrixSetDiag"), MatrixSetDiagOp);
diff --git a/tensorflow/compiler/tf2xla/kernels/matrix_inverse_op.cc b/tensorflow/compiler/tf2xla/kernels/matrix_inverse_op.cc
index e029a5cae346d2..4f950b256ef695 100644
--- a/tensorflow/compiler/tf2xla/kernels/matrix_inverse_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/matrix_inverse_op.cc
@@ -57,7 +57,8 @@ class MatrixInverseOp : public XlaOpKernel {
  private:
   bool adjoint_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(MatrixInverseOp);
+  MatrixInverseOp(const MatrixInverseOp&) = delete;
+  void operator=(const MatrixInverseOp&) = delete;
 };
 
 // TODO(b/135640736): Allow this for integer and complex types.
diff --git a/tensorflow/compiler/tf2xla/kernels/matrix_solve_op.cc b/tensorflow/compiler/tf2xla/kernels/matrix_solve_op.cc
index 5fe6a17485f62b..eadd7ed8164ffe 100644
--- a/tensorflow/compiler/tf2xla/kernels/matrix_solve_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/matrix_solve_op.cc
@@ -65,7 +65,8 @@ class MatrixSolveOp : public XlaOpKernel {
  private:
   bool adjoint_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(MatrixSolveOp);
+  MatrixSolveOp(const MatrixSolveOp&) = delete;
+  void operator=(const MatrixSolveOp&) = delete;
 };
 
 // TODO(b/111271662): Support integer and complex types.
diff --git a/tensorflow/compiler/tf2xla/kernels/mirror_pad_op.cc b/tensorflow/compiler/tf2xla/kernels/mirror_pad_op.cc
index 3f8f7ad1e70c44..c4664da0940635 100644
--- a/tensorflow/compiler/tf2xla/kernels/mirror_pad_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/mirror_pad_op.cc
@@ -103,7 +103,8 @@ class MirrorPadOp : public XlaOpKernel {
   }
 
  private:
-  TF_DISALLOW_COPY_AND_ASSIGN(MirrorPadOp);
+  MirrorPadOp(const MirrorPadOp&) = delete;
+  void operator=(const MirrorPadOp&) = delete;
 };
 
 REGISTER_XLA_OP(Name("MirrorPad").CompileTimeConstantInput("paddings"),
@@ -203,7 +204,8 @@ class MirrorPadGradOp : public XlaOpKernel {
   }
 
  private:
-  TF_DISALLOW_COPY_AND_ASSIGN(MirrorPadGradOp);
+  MirrorPadGradOp(const MirrorPadGradOp&) = delete;
+  void operator=(const MirrorPadGradOp&) = delete;
 };
 
 REGISTER_XLA_OP(Name("MirrorPadGrad").CompileTimeConstantInput("paddings"),
diff --git a/tensorflow/compiler/tf2xla/kernels/one_hot_op.cc b/tensorflow/compiler/tf2xla/kernels/one_hot_op.cc
index e6ea22ebdcf608..37bbb505b47343 100644
--- a/tensorflow/compiler/tf2xla/kernels/one_hot_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/one_hot_op.cc
@@ -73,7 +73,8 @@ class OneHotOp : public XlaOpKernel {
  private:
   int32 axis_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(OneHotOp);
+  OneHotOp(const OneHotOp&) = delete;
+  void operator=(const OneHotOp&) = delete;
 };
 
 REGISTER_XLA_OP(Name("OneHot").CompileTimeConstantInput("depth"), OneHotOp);
diff --git a/tensorflow/compiler/tf2xla/kernels/random_ops.cc b/tensorflow/compiler/tf2xla/kernels/random_ops.cc
index 73b2ea76d16831..d419d2538aeac7 100644
--- a/tensorflow/compiler/tf2xla/kernels/random_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/random_ops.cc
@@ -71,7 +71,8 @@ class RandomUniformOp : public XlaOpKernel {
   }
 
  private:
-  TF_DISALLOW_COPY_AND_ASSIGN(RandomUniformOp);
+  RandomUniformOp(const RandomUniformOp&) = delete;
+  void operator=(const RandomUniformOp&) = delete;
 };
 
 REGISTER_XLA_OP(Name("RandomUniform").CompileTimeConstantInput("shape"),
@@ -110,7 +111,8 @@ class RandomUniformIntOp : public XlaOpKernel {
   }
 
  private:
-  TF_DISALLOW_COPY_AND_ASSIGN(RandomUniformIntOp);
+  RandomUniformIntOp(const RandomUniformIntOp&) = delete;
+  void operator=(const RandomUniformIntOp&) = delete;
 };
 
 REGISTER_XLA_OP(Name("RandomUniformInt").CompileTimeConstantInput("shape"),
@@ -143,7 +145,8 @@ class RandomStandardNormalOp : public XlaOpKernel {
   }
 
  private:
-  TF_DISALLOW_COPY_AND_ASSIGN(RandomStandardNormalOp);
+  RandomStandardNormalOp(const RandomStandardNormalOp&) = delete;
+  void operator=(const RandomStandardNormalOp&) = delete;
 };
 
 REGISTER_XLA_OP(Name("RandomStandardNormal").CompileTimeConstantInput("shape"),
diff --git a/tensorflow/compiler/tf2xla/kernels/retval_op.cc b/tensorflow/compiler/tf2xla/kernels/retval_op.cc
index ea805b141a3683..6fc4e184d5a510 100644
--- a/tensorflow/compiler/tf2xla/kernels/retval_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/retval_op.cc
@@ -62,7 +62,8 @@ class RetvalOp : public XlaOpKernel {
   int index_;
   DataType dtype_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(RetvalOp);
+  RetvalOp(const RetvalOp&) = delete;
+  void operator=(const RetvalOp&) = delete;
 };
 
 REGISTER_XLA_OP(
diff --git a/tensorflow/compiler/tf2xla/kernels/roll_op.cc b/tensorflow/compiler/tf2xla/kernels/roll_op.cc
index d13c200c2e4462..bedf4cfa0bb61b 100644
--- a/tensorflow/compiler/tf2xla/kernels/roll_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/roll_op.cc
@@ -92,7 +92,8 @@ class RollOp : public XlaOpKernel {
   }
 
  private:
-  TF_DISALLOW_COPY_AND_ASSIGN(RollOp);
+  RollOp(const RollOp&) = delete;
+  void operator=(const RollOp&) = delete;
 };
 
 REGISTER_XLA_OP(Name("Roll").CompileTimeConstantInput("axis"), RollOp);
diff --git a/tensorflow/compiler/tf2xla/kernels/select_op.cc b/tensorflow/compiler/tf2xla/kernels/select_op.cc
index 91f3bad03c6cb4..7a4d27bad6516a 100644
--- a/tensorflow/compiler/tf2xla/kernels/select_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/select_op.cc
@@ -76,7 +76,8 @@ class SelectOp : public XlaOpKernel {
   }
 
  private:
-  TF_DISALLOW_COPY_AND_ASSIGN(SelectOp);
+  SelectOp(const SelectOp&) = delete;
+  void operator=(const SelectOp&) = delete;
 };
 
 REGISTER_XLA_OP(Name("Select"), MlirXlaOpKernel);
@@ -129,7 +130,8 @@ class SelectOpV2 : public XlaOpKernel {
   }
 
  private:
-  TF_DISALLOW_COPY_AND_ASSIGN(SelectOpV2);
+  SelectOpV2(const SelectOpV2&) = delete;
+  void operator=(const SelectOpV2&) = delete;
 };
 
 REGISTER_XLA_OP(Name("SelectV2"), SelectOpV2);
diff --git a/tensorflow/compiler/tf2xla/kernels/sendrecv_ops.cc b/tensorflow/compiler/tf2xla/kernels/sendrecv_ops.cc
index ab698291ddb913..04c126646fc0db 100644
--- a/tensorflow/compiler/tf2xla/kernels/sendrecv_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/sendrecv_ops.cc
@@ -34,7 +34,8 @@ class SendOp : public XlaOpKernel {
  private:
   string tensor_name_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(SendOp);
+  SendOp(const SendOp&) = delete;
+  void operator=(const SendOp&) = delete;
 };
 
 SendOp::SendOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {
@@ -59,7 +60,8 @@ class RecvOp : public XlaOpKernel {
   string tensor_name_;
   xla::Shape shape_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(RecvOp);
+  RecvOp(const RecvOp&) = delete;
+  void operator=(const RecvOp&) = delete;
 };
 
 RecvOp::RecvOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {
diff --git a/tensorflow/compiler/tf2xla/kernels/sharding_op.cc b/tensorflow/compiler/tf2xla/kernels/sharding_op.cc
index d5c573e37511e6..086d2f2d7bdb8c 100644
--- a/tensorflow/compiler/tf2xla/kernels/sharding_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/sharding_op.cc
@@ -60,7 +60,8 @@ class ShardingOp : public XlaOpKernel {
   }
 
  private:
-  TF_DISALLOW_COPY_AND_ASSIGN(ShardingOp);
+  ShardingOp(const ShardingOp&) = delete;
+  void operator=(const ShardingOp&) = delete;
   std::vector<int64_t> unspecified_dims_;
 };
 
diff --git a/tensorflow/compiler/tf2xla/kernels/spmd_manual_sharding_ops.cc b/tensorflow/compiler/tf2xla/kernels/spmd_manual_sharding_ops.cc
index 91dfc17d40394a..1962b9e064fec4 100644
--- a/tensorflow/compiler/tf2xla/kernels/spmd_manual_sharding_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/spmd_manual_sharding_ops.cc
@@ -69,7 +69,8 @@ class XlaSpmdFullToShardShapeOp : public XlaOpKernel {
   string manual_sharding_str_;
   int32 single_dim_;
   std::vector<int64_t> unspecified_dims_;
-  TF_DISALLOW_COPY_AND_ASSIGN(XlaSpmdFullToShardShapeOp);
+  XlaSpmdFullToShardShapeOp(const XlaSpmdFullToShardShapeOp&) = delete;
+  void operator=(const XlaSpmdFullToShardShapeOp&) = delete;
 };
 
 class XlaSpmdShardToFullShapeOp : public XlaOpKernel {
@@ -119,7 +120,8 @@ class XlaSpmdShardToFullShapeOp : public XlaOpKernel {
   string manual_sharding_str_;
   int32 single_dim_;
   std::vector<int64_t> unspecified_dims_;
-  TF_DISALLOW_COPY_AND_ASSIGN(XlaSpmdShardToFullShapeOp);
+  XlaSpmdShardToFullShapeOp(const XlaSpmdShardToFullShapeOp&) = delete;
+  void operator=(const XlaSpmdShardToFullShapeOp&) = delete;
 };
 
 REGISTER_XLA_OP(Name("XlaSpmdFullToShardShape"), XlaSpmdFullToShardShapeOp);
diff --git a/tensorflow/compiler/tf2xla/kernels/stack_ops.cc b/tensorflow/compiler/tf2xla/kernels/stack_ops.cc
index fbf52a39b207b6..e74381aa6f24b4 100644
--- a/tensorflow/compiler/tf2xla/kernels/stack_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/stack_ops.cc
@@ -117,7 +117,8 @@ class StackOp : public XlaOpKernel {
   DataType dtype_;
   string stack_name_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(StackOp);
+  StackOp(const StackOp&) = delete;
+  void operator=(const StackOp&) = delete;
 };
 
 REGISTER_XLA_OP(
@@ -166,7 +167,8 @@ class StackPushOp : public XlaOpKernel {
  private:
   DataType dtype_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(StackPushOp);
+  StackPushOp(const StackPushOp&) = delete;
+  void operator=(const StackPushOp&) = delete;
 };
 
 REGISTER_XLA_OP(Name("StackPushV2").CompilationOnly(), StackPushOp);
@@ -221,7 +223,8 @@ class StackPopOp : public XlaOpKernel {
  private:
   DataType dtype_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(StackPopOp);
+  StackPopOp(const StackPopOp&) = delete;
+  void operator=(const StackPopOp&) = delete;
 };
 
 REGISTER_XLA_OP(Name("StackPopV2").CompilationOnly(), StackPopOp);
@@ -235,7 +238,8 @@ class StackCloseOp : public XlaOpKernel {
   }
 
  private:
-  TF_DISALLOW_COPY_AND_ASSIGN(StackCloseOp);
+  StackCloseOp(const StackCloseOp&) = delete;
+  void operator=(const StackCloseOp&) = delete;
 };
 
 REGISTER_XLA_OP(Name("StackCloseV2").CompilationOnly(), StackCloseOp);
diff --git a/tensorflow/compiler/tf2xla/kernels/stateful_random_ops.cc b/tensorflow/compiler/tf2xla/kernels/stateful_random_ops.cc
index dcc18bae4c7adc..6fbb413b46bc3b 100644
--- a/tensorflow/compiler/tf2xla/kernels/stateful_random_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/stateful_random_ops.cc
@@ -262,7 +262,8 @@ class StatefulUniformOp : public XlaOpKernel {
  private:
   DataType dtype_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(StatefulUniformOp);
+  StatefulUniformOp(const StatefulUniformOp&) = delete;
+  void operator=(const StatefulUniformOp&) = delete;
 };
 
 // TODO(wangpeng): Support plain float16 to get rid of the `TypeConstraint`.
@@ -301,7 +302,8 @@ class StatefulStandardNormalOp : public XlaOpKernel {
  private:
   DataType dtype_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(StatefulStandardNormalOp);
+  StatefulStandardNormalOp(const StatefulStandardNormalOp&) = delete;
+  void operator=(const StatefulStandardNormalOp&) = delete;
 };
 
 // TODO(wangpeng): Support plain float16 to get rid of the `TypeConstraint`.
@@ -348,7 +350,8 @@ class StatefulTruncatedNormalOp : public XlaOpKernel {
  private:
   DataType dtype_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(StatefulTruncatedNormalOp);
+  StatefulTruncatedNormalOp(const StatefulTruncatedNormalOp&) = delete;
+  void operator=(const StatefulTruncatedNormalOp&) = delete;
 };
 
 // TODO(wangpeng): Support plain float16 to get rid of the `TypeConstraint`.
@@ -384,7 +387,8 @@ class StatefulUniformIntOp : public XlaOpKernel {
  private:
   DataType dtype_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(StatefulUniformIntOp);
+  StatefulUniformIntOp(const StatefulUniformIntOp&) = delete;
+  void operator=(const StatefulUniformIntOp&) = delete;
 };
 
 REGISTER_XLA_OP(Name("StatefulUniformInt")
@@ -417,7 +421,8 @@ class StatefulUniformFullIntOp : public XlaOpKernel {
  private:
   DataType dtype_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(StatefulUniformFullIntOp);
+  StatefulUniformFullIntOp(const StatefulUniformFullIntOp&) = delete;
+  void operator=(const StatefulUniformFullIntOp&) = delete;
 };
 
 REGISTER_XLA_OP(Name("StatefulUniformFullInt")
@@ -491,7 +496,8 @@ class RngSkipOp : public XlaOpKernel {
   }
 
  private:
-  TF_DISALLOW_COPY_AND_ASSIGN(RngSkipOp);
+  RngSkipOp(const RngSkipOp&) = delete;
+  void operator=(const RngSkipOp&) = delete;
 };
 
 REGISTER_XLA_OP(Name("RngSkip").CompileTimeConstantInput("algorithm"),
diff --git a/tensorflow/compiler/tf2xla/kernels/stateless_random_ops.cc b/tensorflow/compiler/tf2xla/kernels/stateless_random_ops.cc
index a1847dd60d7c08..95d7f4df60c60f 100644
--- a/tensorflow/compiler/tf2xla/kernels/stateless_random_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/stateless_random_ops.cc
@@ -183,7 +183,8 @@ class StatelessRandomUniformOp : public XlaOpKernel {
   DataType dtype_;
   string device_type_string_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(StatelessRandomUniformOp);
+  StatelessRandomUniformOp(const StatelessRandomUniformOp&) = delete;
+  void operator=(const StatelessRandomUniformOp&) = delete;
 };
 
 // TODO(phawkins): generalize to non-float, non-int32 seed types.
@@ -238,7 +239,8 @@ class StatelessRandomUniformIntOp : public XlaOpKernel {
   DataType dtype_;
   string device_type_string_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(StatelessRandomUniformIntOp);
+  StatelessRandomUniformIntOp(const StatelessRandomUniformIntOp&) = delete;
+  void operator=(const StatelessRandomUniformIntOp&) = delete;
 };
 
 // TODO(phawkins): generalize to non-int32 seed types.
@@ -280,7 +282,9 @@ class StatelessRandomUniformFullIntOp : public XlaOpKernel {
   DataType dtype_;
   string device_type_string_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(StatelessRandomUniformFullIntOp);
+  StatelessRandomUniformFullIntOp(const StatelessRandomUniformFullIntOp&) =
+      delete;
+  void operator=(const StatelessRandomUniformFullIntOp&) = delete;
 };
 
 // TODO(phawkins): generalize to non-int32 seed types.
@@ -331,7 +335,8 @@ class StatelessRandomNormalOp : public XlaOpKernel {
   DataType dtype_;
   string device_type_string_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(StatelessRandomNormalOp);
+  StatelessRandomNormalOp(const StatelessRandomNormalOp&) = delete;
+  void operator=(const StatelessRandomNormalOp&) = delete;
 };
 
 // TODO(phawkins): generalize to non-float, non-int32 seed types.
@@ -378,7 +383,8 @@ class StatelessTruncatedNormalOp : public XlaOpKernel {
   DataType dtype_;
   string device_type_string_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(StatelessTruncatedNormalOp);
+  StatelessTruncatedNormalOp(const StatelessTruncatedNormalOp&) = delete;
+  void operator=(const StatelessTruncatedNormalOp&) = delete;
 };
 
 REGISTER_XLA_OP(Name("StatelessTruncatedNormal")
@@ -442,7 +448,9 @@ class StatelessParameterizedTruncatedNormalOp : public XlaOpKernel {
   DataType dtype_;
   string device_type_string_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(StatelessParameterizedTruncatedNormalOp);
+  StatelessParameterizedTruncatedNormalOp(
+      const StatelessParameterizedTruncatedNormalOp&) = delete;
+  void operator=(const StatelessParameterizedTruncatedNormalOp&) = delete;
 };
 
 REGISTER_XLA_OP(Name("StatelessParameterizedTruncatedNormal")
diff --git a/tensorflow/compiler/tf2xla/kernels/stateless_random_ops_v2.cc b/tensorflow/compiler/tf2xla/kernels/stateless_random_ops_v2.cc
index c3929e788fbf8b..098ecf39792e21 100644
--- a/tensorflow/compiler/tf2xla/kernels/stateless_random_ops_v2.cc
+++ b/tensorflow/compiler/tf2xla/kernels/stateless_random_ops_v2.cc
@@ -116,7 +116,8 @@ class StatelessRandomUniformOp : public XlaOpKernel {
   DataType dtype_;
   string device_type_string_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(StatelessRandomUniformOp);
+  StatelessRandomUniformOp(const StatelessRandomUniformOp&) = delete;
+  void operator=(const StatelessRandomUniformOp&) = delete;
 };
 
 REGISTER_XLA_OP(Name("StatelessRandomUniformV2")
@@ -164,7 +165,8 @@ class StatelessRandomUniformIntOp : public XlaOpKernel {
   DataType dtype_;
   string device_type_string_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(StatelessRandomUniformIntOp);
+  StatelessRandomUniformIntOp(const StatelessRandomUniformIntOp&) = delete;
+  void operator=(const StatelessRandomUniformIntOp&) = delete;
 };
 
 REGISTER_XLA_OP(Name("StatelessRandomUniformIntV2")
@@ -211,7 +213,9 @@ class StatelessRandomUniformFullIntOp : public XlaOpKernel {
   DataType dtype_;
   string device_type_string_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(StatelessRandomUniformFullIntOp);
+  StatelessRandomUniformFullIntOp(const StatelessRandomUniformFullIntOp&) =
+      delete;
+  void operator=(const StatelessRandomUniformFullIntOp&) = delete;
 };
 
 REGISTER_XLA_OP(Name("StatelessRandomUniformFullIntV2")
@@ -279,7 +283,8 @@ class StatelessRandomNormalOp : public XlaOpKernel {
   DataType dtype_;
   string device_type_string_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(StatelessRandomNormalOp);
+  StatelessRandomNormalOp(const StatelessRandomNormalOp&) = delete;
+  void operator=(const StatelessRandomNormalOp&) = delete;
 };
 
 REGISTER_XLA_OP(Name("StatelessRandomNormalV2")
@@ -313,7 +318,8 @@ class StatelessTruncatedNormalOp : public XlaOpKernel {
   DataType dtype_;
   string device_type_string_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(StatelessTruncatedNormalOp);
+  StatelessTruncatedNormalOp(const StatelessTruncatedNormalOp&) = delete;
+  void operator=(const StatelessTruncatedNormalOp&) = delete;
 };
 
 REGISTER_XLA_OP(Name("StatelessTruncatedNormalV2")
@@ -351,7 +357,8 @@ class GetKeyCounterOp : public XlaOpKernel {
  private:
   string device_type_string_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(GetKeyCounterOp);
+  GetKeyCounterOp(const GetKeyCounterOp&) = delete;
+  void operator=(const GetKeyCounterOp&) = delete;
 };
 
 // TODO(hinsu): Dis-allow unsupported int64 seed types.
@@ -373,7 +380,8 @@ class GetAlgOp : public XlaOpKernel {
  private:
   string device_type_string_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(GetAlgOp);
+  GetAlgOp(const GetAlgOp&) = delete;
+  void operator=(const GetAlgOp&) = delete;
 };
 
 REGISTER_XLA_OP(Name("StatelessRandomGetAlg"), GetAlgOp);
@@ -410,7 +418,8 @@ class GetKeyCounterAlgOp : public XlaOpKernel {
  private:
   string device_type_string_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(GetKeyCounterAlgOp);
+  GetKeyCounterAlgOp(const GetKeyCounterAlgOp&) = delete;
+  void operator=(const GetKeyCounterAlgOp&) = delete;
 };
 
 // TODO(hinsu): Dis-allow unsupported int64 seed types.
diff --git a/tensorflow/compiler/tf2xla/kernels/tensor_array_ops.cc b/tensorflow/compiler/tf2xla/kernels/tensor_array_ops.cc
index b334c13e3753c1..22614039459a90 100644
--- a/tensorflow/compiler/tf2xla/kernels/tensor_array_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/tensor_array_ops.cc
@@ -181,7 +181,8 @@ class TensorArrayOp : public XlaOpKernel {
   DataType dtype_;
   string tensor_array_name_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(TensorArrayOp);
+  TensorArrayOp(const TensorArrayOp&) = delete;
+  void operator=(const TensorArrayOp&) = delete;
 };
 
 REGISTER_XLA_OP(Name("TensorArrayV3").CompileTimeConstantInput("size"),
@@ -236,7 +237,8 @@ class TensorArrayWriteOp : public XlaOpKernel {
  private:
   DataType dtype_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(TensorArrayWriteOp);
+  TensorArrayWriteOp(const TensorArrayWriteOp&) = delete;
+  void operator=(const TensorArrayWriteOp&) = delete;
 };
 
 REGISTER_XLA_OP(Name("TensorArrayWriteV3"), TensorArrayWriteOp);
@@ -280,7 +282,8 @@ class TensorArrayReadOp : public XlaOpKernel {
  private:
   DataType dtype_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(TensorArrayReadOp);
+  TensorArrayReadOp(const TensorArrayReadOp&) = delete;
+  void operator=(const TensorArrayReadOp&) = delete;
 };
 
 REGISTER_XLA_OP(Name("TensorArrayReadV3"), TensorArrayReadOp);
@@ -347,7 +350,8 @@ class TensorArrayGatherOp : public XlaOpKernel {
  private:
   DataType dtype_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(TensorArrayGatherOp);
+  TensorArrayGatherOp(const TensorArrayGatherOp&) = delete;
+  void operator=(const TensorArrayGatherOp&) = delete;
 };
 
 REGISTER_XLA_OP(Name("TensorArrayGatherV3"), TensorArrayGatherOp);
@@ -434,7 +438,8 @@ class TensorArrayScatterOp : public XlaOpKernel {
  private:
   DataType dtype_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(TensorArrayScatterOp);
+  TensorArrayScatterOp(const TensorArrayScatterOp&) = delete;
+  void operator=(const TensorArrayScatterOp&) = delete;
 };
 
 REGISTER_XLA_OP(Name("TensorArrayScatterV3"), TensorArrayScatterOp);
@@ -474,7 +479,8 @@ class TensorArrayConcatOp : public XlaOpKernel {
  private:
   DataType dtype_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(TensorArrayConcatOp);
+  TensorArrayConcatOp(const TensorArrayConcatOp&) = delete;
+  void operator=(const TensorArrayConcatOp&) = delete;
 };
 
 REGISTER_XLA_OP(Name("TensorArrayConcatV3"), TensorArrayConcatOp);
@@ -544,7 +550,8 @@ class TensorArraySplitOp : public XlaOpKernel {
  private:
   DataType dtype_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(TensorArraySplitOp);
+  TensorArraySplitOp(const TensorArraySplitOp&) = delete;
+  void operator=(const TensorArraySplitOp&) = delete;
 };
 
 REGISTER_XLA_OP(Name("TensorArraySplitV3").CompileTimeConstantInput("lengths"),
@@ -563,7 +570,8 @@ class TensorArraySizeOp : public XlaOpKernel {
   }
 
  private:
-  TF_DISALLOW_COPY_AND_ASSIGN(TensorArraySizeOp);
+  TensorArraySizeOp(const TensorArraySizeOp&) = delete;
+  void operator=(const TensorArraySizeOp&) = delete;
 };
 
 REGISTER_XLA_OP(Name("TensorArraySizeV3"), TensorArraySizeOp);
@@ -598,7 +606,8 @@ class TensorArrayGradOp : public XlaOpKernel {
  private:
   string source_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(TensorArrayGradOp);
+  TensorArrayGradOp(const TensorArrayGradOp&) = delete;
+  void operator=(const TensorArrayGradOp&) = delete;
 };
 
 REGISTER_XLA_OP(Name("TensorArrayGradV3"), TensorArrayGradOp);
@@ -612,7 +621,8 @@ class TensorArrayCloseOp : public XlaOpKernel {
   }
 
  private:
-  TF_DISALLOW_COPY_AND_ASSIGN(TensorArrayCloseOp);
+  TensorArrayCloseOp(const TensorArrayCloseOp&) = delete;
+  void operator=(const TensorArrayCloseOp&) = delete;
 };
 
 REGISTER_XLA_OP(Name("TensorArrayCloseV3"), TensorArrayCloseOp);
diff --git a/tensorflow/compiler/tf2xla/kernels/tensor_list_ops.cc b/tensorflow/compiler/tf2xla/kernels/tensor_list_ops.cc
index 75fe8654eff4b3..7e4c16ca189de3 100644
--- a/tensorflow/compiler/tf2xla/kernels/tensor_list_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/tensor_list_ops.cc
@@ -101,7 +101,8 @@ class TensorListLengthOp : public XlaOpKernel {
   }
 
  private:
-  TF_DISALLOW_COPY_AND_ASSIGN(TensorListLengthOp);
+  TensorListLengthOp(const TensorListLengthOp&) = delete;
+  void operator=(const TensorListLengthOp&) = delete;
 };
 
 REGISTER_XLA_OP(Name("TensorListLength").IsMetadataOp(), TensorListLengthOp);
@@ -202,7 +203,8 @@ class TensorListReserveOp : public XlaOpKernel {
  private:
   DataType dtype_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(TensorListReserveOp);
+  TensorListReserveOp(const TensorListReserveOp&) = delete;
+  void operator=(const TensorListReserveOp&) = delete;
 };
 
 REGISTER_XLA_OP(Name("TensorListReserve")
@@ -276,7 +278,8 @@ class EmptyTensorListOp : public XlaOpKernel {
  private:
   DataType dtype_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(EmptyTensorListOp);
+  EmptyTensorListOp(const EmptyTensorListOp&) = delete;
+  void operator=(const EmptyTensorListOp&) = delete;
 };
 
 REGISTER_XLA_OP(Name("EmptyTensorList")
@@ -338,7 +341,8 @@ class TensorListElementShapeOp : public XlaOpKernel {
  private:
   DataType shape_type_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(TensorListElementShapeOp);
+  TensorListElementShapeOp(const TensorListElementShapeOp&) = delete;
+  void operator=(const TensorListElementShapeOp&) = delete;
 };
 
 REGISTER_XLA_OP(Name("TensorListElementShape").IsMetadataOp(),
@@ -377,7 +381,8 @@ class TensorListGetItemOp : public XlaOpKernel {
  private:
   DataType dtype_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(TensorListGetItemOp);
+  TensorListGetItemOp(const TensorListGetItemOp&) = delete;
+  void operator=(const TensorListGetItemOp&) = delete;
 };
 
 REGISTER_XLA_OP(Name("TensorListGetItem"), TensorListGetItemOp);
@@ -430,7 +435,8 @@ class TensorListGatherOp : public XlaOpKernel {
  private:
   DataType dtype_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(TensorListGatherOp);
+  TensorListGatherOp(const TensorListGatherOp&) = delete;
+  void operator=(const TensorListGatherOp&) = delete;
 };
 
 REGISTER_XLA_OP(Name("TensorListGather"), TensorListGatherOp);
@@ -460,7 +466,8 @@ class TensorListStackOp : public XlaOpKernel {
   }
 
  private:
-  TF_DISALLOW_COPY_AND_ASSIGN(TensorListStackOp);
+  TensorListStackOp(const TensorListStackOp&) = delete;
+  void operator=(const TensorListStackOp&) = delete;
 };
 
 REGISTER_XLA_OP(Name("TensorListStack"), TensorListStackOp);
@@ -515,7 +522,8 @@ class TensorListConcatOp : public XlaOpKernel {
   }
 
  private:
-  TF_DISALLOW_COPY_AND_ASSIGN(TensorListConcatOp);
+  TensorListConcatOp(const TensorListConcatOp&) = delete;
+  void operator=(const TensorListConcatOp&) = delete;
 };
 
 REGISTER_XLA_OP(Name("TensorListConcatV2"), TensorListConcatOp);
@@ -573,7 +581,8 @@ class TensorListSplitOp : public XlaOpKernel {
  private:
   DataType dtype_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(TensorListSplitOp);
+  TensorListSplitOp(const TensorListSplitOp&) = delete;
+  void operator=(const TensorListSplitOp&) = delete;
 };
 
 REGISTER_XLA_OP(Name("TensorListSplit")
@@ -598,7 +607,8 @@ class TensorListFromTensorOp : public XlaOpKernel {
   }
 
  private:
-  TF_DISALLOW_COPY_AND_ASSIGN(TensorListFromTensorOp);
+  TensorListFromTensorOp(const TensorListFromTensorOp&) = delete;
+  void operator=(const TensorListFromTensorOp&) = delete;
 };
 
 REGISTER_XLA_OP(
@@ -633,7 +643,8 @@ class TensorListSetItemOp : public XlaOpKernel {
   }
 
  private:
-  TF_DISALLOW_COPY_AND_ASSIGN(TensorListSetItemOp);
+  TensorListSetItemOp(const TensorListSetItemOp&) = delete;
+  void operator=(const TensorListSetItemOp&) = delete;
 };
 
 REGISTER_XLA_OP(Name("TensorListSetItem"), TensorListSetItemOp);
@@ -660,7 +671,8 @@ class TensorListPushBackOp : public XlaOpKernel {
   }
 
  private:
-  TF_DISALLOW_COPY_AND_ASSIGN(TensorListPushBackOp);
+  TensorListPushBackOp(const TensorListPushBackOp&) = delete;
+  void operator=(const TensorListPushBackOp&) = delete;
 };
 
 REGISTER_XLA_OP(Name("TensorListPushBack").AllowVariantTypes(),
@@ -696,7 +708,8 @@ class TensorListPopBackOp : public XlaOpKernel {
  private:
   DataType dtype_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(TensorListPopBackOp);
+  TensorListPopBackOp(const TensorListPopBackOp&) = delete;
+  void operator=(const TensorListPopBackOp&) = delete;
 };
 
 REGISTER_XLA_OP(Name("TensorListPopBack").AllowVariantTypes(),
diff --git a/tensorflow/compiler/tf2xla/kernels/tile_ops.cc b/tensorflow/compiler/tf2xla/kernels/tile_ops.cc
index 7133486392511f..4f1516cd7f09f2 100644
--- a/tensorflow/compiler/tf2xla/kernels/tile_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/tile_ops.cc
@@ -119,7 +119,8 @@ class TileOp : public XlaOpKernel {
   }
 
  private:
-  TF_DISALLOW_COPY_AND_ASSIGN(TileOp);
+  TileOp(const TileOp&) = delete;
+  void operator=(const TileOp&) = delete;
 };
 
 REGISTER_XLA_OP(Name("Tile").CompileTimeConstantInput("multiples"), TileOp);
diff --git a/tensorflow/compiler/tf2xla/kernels/while_op.h b/tensorflow/compiler/tf2xla/kernels/while_op.h
index 2f0b6c3a7f4cd3..3015e1862d041a 100644
--- a/tensorflow/compiler/tf2xla/kernels/while_op.h
+++ b/tensorflow/compiler/tf2xla/kernels/while_op.h
@@ -66,7 +66,8 @@ class XlaWhileOp : public XlaOpKernel {
   // overheads.
   bool propagate_compile_time_consts_ = false;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(XlaWhileOp);
+  XlaWhileOp(const XlaWhileOp&) = delete;
+  void operator=(const XlaWhileOp&) = delete;
 };
 
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/kernels/xla_broadcast_helper_op.cc b/tensorflow/compiler/tf2xla/kernels/xla_broadcast_helper_op.cc
index 8219368c680215..373d91db2dbb4a 100644
--- a/tensorflow/compiler/tf2xla/kernels/xla_broadcast_helper_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/xla_broadcast_helper_op.cc
@@ -104,7 +104,8 @@ class XlaBroadcastHelperOp : public XlaOpKernel {
  private:
   xla::DotDimensionNumbers dnums_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(XlaBroadcastHelperOp);
+  XlaBroadcastHelperOp(const XlaBroadcastHelperOp&) = delete;
+  void operator=(const XlaBroadcastHelperOp&) = delete;
 };
 
 REGISTER_XLA_OP(
diff --git a/tensorflow/compiler/tf2xla/kernels/xla_dequantize_op.cc b/tensorflow/compiler/tf2xla/kernels/xla_dequantize_op.cc
index cecf474613d918..cdc6a9c5a22bae 100644
--- a/tensorflow/compiler/tf2xla/kernels/xla_dequantize_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/xla_dequantize_op.cc
@@ -51,7 +51,8 @@ class XlaDequantizeOp : public XlaOpKernel {
   float max_range_;
   bool transpose_output_;
   string mode_;
-  TF_DISALLOW_COPY_AND_ASSIGN(XlaDequantizeOp);
+  XlaDequantizeOp(const XlaDequantizeOp&) = delete;
+  void operator=(const XlaDequantizeOp&) = delete;
 };
 
 REGISTER_XLA_OP(Name("XlaDequantize"), XlaDequantizeOp);
diff --git a/tensorflow/compiler/tf2xla/kernels/xla_dot_op.cc b/tensorflow/compiler/tf2xla/kernels/xla_dot_op.cc
index 5eed76e40e2243..140c8d368f7f5e 100644
--- a/tensorflow/compiler/tf2xla/kernels/xla_dot_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/xla_dot_op.cc
@@ -65,7 +65,8 @@ class XlaDotOp : public XlaOpKernel {
  private:
   xla::DotDimensionNumbers dnums_;
   xla::PrecisionConfig precision_config_;
-  TF_DISALLOW_COPY_AND_ASSIGN(XlaDotOp);
+  XlaDotOp(const XlaDotOp&) = delete;
+  void operator=(const XlaDotOp&) = delete;
 };
 
 REGISTER_XLA_OP(Name("XlaDot"), MlirXlaOpKernel);
@@ -83,7 +84,8 @@ class XlaDotV2Op : public XlaDotOp {
   }
 
  private:
-  TF_DISALLOW_COPY_AND_ASSIGN(XlaDotV2Op);
+  XlaDotV2Op(const XlaDotV2Op&) = delete;
+  void operator=(const XlaDotV2Op&) = delete;
 };
 
 REGISTER_XLA_OP(Name("XlaDotV2"), MlirXlaOpKernel);
diff --git a/tensorflow/compiler/tf2xla/kernels/xla_pad_op.cc b/tensorflow/compiler/tf2xla/kernels/xla_pad_op.cc
index e92bcafbb50b3e..cd10b44c9c955f 100644
--- a/tensorflow/compiler/tf2xla/kernels/xla_pad_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/xla_pad_op.cc
@@ -86,7 +86,8 @@ class XlaPadOp : public XlaOpKernel {
   }
 
  private:
-  TF_DISALLOW_COPY_AND_ASSIGN(XlaPadOp);
+  XlaPadOp(const XlaPadOp&) = delete;
+  void operator=(const XlaPadOp&) = delete;
 };
 
 REGISTER_XLA_OP(Name("XlaPad")
diff --git a/tensorflow/compiler/tf2xla/tf2xla_util.h b/tensorflow/compiler/tf2xla/tf2xla_util.h
index 94e186172abf5c..694d33f969ddfc 100644
--- a/tensorflow/compiler/tf2xla/tf2xla_util.h
+++ b/tensorflow/compiler/tf2xla/tf2xla_util.h
@@ -165,7 +165,8 @@ class CachedFunctionHandles {
   FunctionLibraryRuntime* flr_;
   std::map<string, FunctionLibraryRuntime::Handle> handles_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(CachedFunctionHandles);
+  CachedFunctionHandles(const CachedFunctionHandles&) = delete;
+  void operator=(const CachedFunctionHandles&) = delete;
 };
 
 // Struct for node's output edge info.
diff --git a/tensorflow/compiler/tf2xla/xla_compiler.h b/tensorflow/compiler/tf2xla/xla_compiler.h
index 1d07e3c86a2961..57df97b7d1e9b3 100644
--- a/tensorflow/compiler/tf2xla/xla_compiler.h
+++ b/tensorflow/compiler/tf2xla/xla_compiler.h
@@ -387,7 +387,8 @@ class XlaCompiler {
   // stack, and pop the mapping before returning.
   std::stack<std::map<string, xla::XlaOp>> node_token_mapping_stack_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(XlaCompiler);
+  XlaCompiler(const XlaCompiler&) = delete;
+  void operator=(const XlaCompiler&) = delete;
 };
 
 
diff --git a/tensorflow/compiler/tf2xla/xla_context.h b/tensorflow/compiler/tf2xla/xla_context.h
index 2aadce03752dcf..f3b46fe0e853c9 100644
--- a/tensorflow/compiler/tf2xla/xla_context.h
+++ b/tensorflow/compiler/tf2xla/xla_context.h
@@ -175,7 +175,8 @@ class XlaContext : public ResourceBase {
   // Cached computation to compute Sigmoid of an element, specialized by type.
   ComputationMap sigmoid_func_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(XlaContext);
+  XlaContext(const XlaContext&) = delete;
+  void operator=(const XlaContext&) = delete;
 };
 
 }  // namespace tensorflow

From e538241b03b2772ca65e04a321af9a9c0779691b Mon Sep 17 00:00:00 2001
From: Kyuyeun Kim <kyuyeunk@google.com>
Date: Wed, 4 Oct 2023 00:24:55 -0700
Subject: [PATCH 564/567] Modify quantization debugger to use quant unit.

PiperOrigin-RevId: 570608308
---
 .../quantization/tensorflow/debugging/BUILD   |  2 +
 .../tensorflow/debugging/dump_tensor_op.cc    | 44 ++++++++++++++-----
 .../tensorflow/passes/add_dump_tensor_op.cc   | 15 +++++++
 .../tensorflow/passes/tf_quant_ops.td         |  4 +-
 .../integration_test/quantize_model_test.py   | 23 ++++++++++
 .../tensorflow/tests/add_dump_tensor_op.mlir  | 26 +++++------
 .../tests/quantize_composite_functions.mlir   | 16 +++----
 .../quantize_composite_functions_xla.mlir     | 16 +++----
 8 files changed, 105 insertions(+), 41 deletions(-)

diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/debugging/BUILD b/tensorflow/compiler/mlir/quantization/tensorflow/debugging/BUILD
index b6a884d57479dd..fa55ce0ba1e391 100644
--- a/tensorflow/compiler/mlir/quantization/tensorflow/debugging/BUILD
+++ b/tensorflow/compiler/mlir/quantization/tensorflow/debugging/BUILD
@@ -55,12 +55,14 @@ tf_kernel_library(
     srcs = ["dump_tensor_op.cc"],
     compatible_with = get_compatible_with_portable(),
     deps = [
+        "//tensorflow/compiler/mlir/quantization/tensorflow:quantization_options_proto_cc",
         "//tensorflow/core:framework",
         "//tensorflow/core:portable_gif_internal",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core/platform:path",
         "@com_google_absl//absl/log",
         "@com_google_absl//absl/status",
+        "@com_google_absl//absl/strings",
         "@local_tsl//tsl/platform:env",
         "@local_tsl//tsl/platform:errors",
     ],
diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/debugging/dump_tensor_op.cc b/tensorflow/compiler/mlir/quantization/tensorflow/debugging/dump_tensor_op.cc
index 01d39f348148e2..ce52e488774f60 100644
--- a/tensorflow/compiler/mlir/quantization/tensorflow/debugging/dump_tensor_op.cc
+++ b/tensorflow/compiler/mlir/quantization/tensorflow/debugging/dump_tensor_op.cc
@@ -17,6 +17,8 @@ limitations under the License.
 
 #include "absl/log/log.h"
 #include "absl/status/status.h"
+#include "absl/strings/string_view.h"
+#include "tensorflow/compiler/mlir/quantization/tensorflow/quantization_options.pb.h"
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/op_requires.h"
@@ -31,28 +33,29 @@ limitations under the License.
 
 namespace tensorflow {
 
-absl::Status SaveTensorProtoToFile(const Tensor& tensor, std::string file_path,
-                                   tsl::Env* env) {
-  TensorProto tensor_proto;
-  tensor.AsProtoTensorContent(&tensor_proto);
-
+absl::Status SaveSerializedProtoToFile(const absl::string_view serialized_proto,
+                                       const absl::string_view file_path,
+                                       tsl::Env* env) {
   std::unique_ptr<tsl::WritableFile> file;
-  TF_RETURN_IF_ERROR(env->NewWritableFile(file_path, &file));
-  absl::Status append_result = file->Append(tensor_proto.SerializeAsString());
+  TF_RETURN_IF_ERROR(env->NewWritableFile(std::string(file_path), &file));
+  absl::Status append_result = file->Append(serialized_proto);
   absl::Status close_result = file->Close();
 
   return append_result.ok() ? close_result : append_result;
 }
 
-// DumpTensor op saves entire value of input to as a tensor proto into a
+// `DumpTensor` op saves entire value of input as a tensor proto into a
 // specified directory and filename. When enabled is set to false, op is
-// disabled and won't save any value.
+// disabled and won't save any value. It also creates `QuantizationUnit` proto
+// with `func_name` and `node_name` to identify the op.
 REGISTER_OP("DumpTensor")
     .Input("tensor_data: T")
     .Attr("log_dir_path: string")
     .Attr("file_name: string")
     .Attr("T: type")
     .Attr("enabled: bool")
+    .Attr("func_name: string")
+    .Attr("node_name: string")
     .SetIsStateful();
 
 class DumpTensorOp : public OpKernel {
@@ -60,20 +63,39 @@ class DumpTensorOp : public OpKernel {
   explicit DumpTensorOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
     string log_dir_path;
     string file_name;
+    string func_name;
+    string node_name;
     OP_REQUIRES_OK(ctx, ctx->GetAttr("log_dir_path", &log_dir_path));
     OP_REQUIRES_OK(ctx, ctx->GetAttr("enabled", &enabled_));
     OP_REQUIRES_OK(ctx, ctx->GetAttr("file_name", &file_name));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("func_name", &func_name));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("node_name", &node_name));
     OP_REQUIRES_OK(ctx, ctx->env()->RecursivelyCreateDir(log_dir_path));
 
     tensor_data_path_ = io::JoinPath(log_dir_path, file_name);
+
+    // Fetch func_name and node_name from attributes and save as proto.
+    quantization::UnitWiseQuantizationSpec::QuantizationUnit quant_unit_proto;
+    quant_unit_proto.set_func_name(func_name);
+    quant_unit_proto.set_node_name(node_name);
+
+    string quant_unit_path = io::JoinPath(log_dir_path, "quant_unit.pb");
+
+    OP_REQUIRES_OK(
+        ctx, SaveSerializedProtoToFile(quant_unit_proto.SerializeAsString(),
+                                       quant_unit_path, ctx->env()));
   }
 
   void Compute(OpKernelContext* ctx) override {
     if (enabled_) {
       const Tensor& tensor_data = ctx->input(0);
 
-      OP_REQUIRES_OK(ctx, SaveTensorProtoToFile(tensor_data, tensor_data_path_,
-                                                ctx->env()));
+      TensorProto tensor_proto;
+      tensor_data.AsProtoTensorContent(&tensor_proto);
+
+      OP_REQUIRES_OK(ctx,
+                     SaveSerializedProtoToFile(tensor_proto.SerializeAsString(),
+                                               tensor_data_path_, ctx->env()));
     }
   }
 
diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/passes/add_dump_tensor_op.cc b/tensorflow/compiler/mlir/quantization/tensorflow/passes/add_dump_tensor_op.cc
index 31b93fe272bb24..39c302f9bdafc0 100644
--- a/tensorflow/compiler/mlir/quantization/tensorflow/passes/add_dump_tensor_op.cc
+++ b/tensorflow/compiler/mlir/quantization/tensorflow/passes/add_dump_tensor_op.cc
@@ -14,6 +14,7 @@ limitations under the License.
 ==============================================================================*/
 
 #include <memory>
+#include <optional>
 #include <string>
 #include <utility>
 
@@ -38,6 +39,7 @@ limitations under the License.
 #include "mlir/Transforms/GreedyPatternRewriteDriver.h"  // from @llvm-project
 #include "tensorflow/compiler/mlir/lite/quantization/ir/QuantOps.h"
 #include "tensorflow/compiler/mlir/lite/quantization/quantization_utils.h"
+#include "tensorflow/compiler/mlir/quantization/tensorflow/cc/quantization_unit_loc.h"
 #include "tensorflow/compiler/mlir/quantization/tensorflow/passes/passes.h"
 #include "tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_quant_ops.h"
 #include "tensorflow/compiler/mlir/quantization/tensorflow/quantization_options.pb.h"
@@ -141,6 +143,11 @@ class AddDumpTensorOp : public OpRewritePattern<TF::PartitionedCallOp> {
 
     rewriter.setInsertionPointAfterValue(result);
 
+    std::optional<QuantizationUnitLoc::QuantizationUnit> quant_unit =
+        FindQuantizationUnitFromLoc(call_op->getLoc());
+
+    if (!quant_unit.has_value()) return failure();
+
     auto folder_name =
         tensorflow::io::JoinPath(log_dir_path_, f_attr.getValue());
     // In Whole model, we first need to set file_name as
@@ -162,6 +169,10 @@ class AddDumpTensorOp : public OpRewritePattern<TF::PartitionedCallOp> {
         // The op is disabled by default. Otherwise, values will be saved
         // during calibration.
         rewriter.getNamedAttr("enabled", rewriter.getBoolAttr(false)),
+        rewriter.getNamedAttr("func_name",
+                              rewriter.getStringAttr(quant_unit->func_name())),
+        rewriter.getNamedAttr("node_name",
+                              rewriter.getStringAttr(quant_unit->node_name())),
     };
 
     rewriter.create<TF::DumpTensorOp>(call_op->getLoc(), TypeRange{}, result,
@@ -193,6 +204,10 @@ class AddDumpTensorOp : public OpRewritePattern<TF::PartitionedCallOp> {
           rewriter.getNamedAttr("file_name", rewriter.getStringAttr(
                                                  "unquantized_tensor_data.pb")),
           rewriter.getNamedAttr("enabled", rewriter.getBoolAttr(false)),
+          rewriter.getNamedAttr(
+              "func_name", rewriter.getStringAttr(quant_unit->func_name())),
+          rewriter.getNamedAttr(
+              "node_name", rewriter.getStringAttr(quant_unit->node_name())),
       };
 
       rewriter.create<TF::DumpTensorOp>(call_op->getLoc(), TypeRange{},
diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_quant_ops.td b/tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_quant_ops.td
index 0dd926e61fd9eb..559c5e31a71f09 100644
--- a/tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_quant_ops.td
+++ b/tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_quant_ops.td
@@ -77,7 +77,9 @@ def TF_DumpTensorOp : TF_Op<"DumpTensor", []> {
 
     StrAttr:$log_dir_path,
     StrAttr:$file_name,
-    BoolAttr:$enabled
+    BoolAttr:$enabled,
+    StrAttr:$func_name,
+    StrAttr:$node_name
   );
 
   TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>;
diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/python/integration_test/quantize_model_test.py b/tensorflow/compiler/mlir/quantization/tensorflow/python/integration_test/quantize_model_test.py
index f69bb978ad1167..fc2b849940b342 100644
--- a/tensorflow/compiler/mlir/quantization/tensorflow/python/integration_test/quantize_model_test.py
+++ b/tensorflow/compiler/mlir/quantization/tensorflow/python/integration_test/quantize_model_test.py
@@ -5900,6 +5900,18 @@ def data_gen() -> repr_dataset.RepresentativeDataset:
       # should be the same.
       self.assertAllEqual(output_value, dump_file_numpy)
 
+      # Verify if quant_unit.pb file was created correctly.
+      quant_unit_file_path = os.path.join(log_dir_path, folder, 'quant_unit.pb')
+
+      quant_unit = (
+          quant_opts_pb2.UnitWiseQuantizationSpec.QuantizationUnit.FromString(
+              open(quant_unit_file_path, 'rb').read()
+          )
+      )
+
+      self.assertEqual(quant_unit.node_name, 'Conv2D')
+      self.assertRegex(quant_unit.func_name, r'^__inference_conv_\d+')
+
   @parameterized.named_parameters(
       {
           'testcase_name': 'none',
@@ -6013,6 +6025,17 @@ def data_gen() -> repr_dataset.RepresentativeDataset:
     self.assertAllEqual(quantized_output_value, quantized_dump_file_numpy)
     self.assertAllEqual(unquantized_output_value, unquantized_dump_file_numpy)
 
+    # Verify if quant_unit.pb file was created correctly.
+    quant_unit_file_path = os.path.join(log_dir_path, folder, 'quant_unit.pb')
+    quant_unit = (
+        quant_opts_pb2.UnitWiseQuantizationSpec.QuantizationUnit.FromString(
+            open(quant_unit_file_path, 'rb').read()
+        )
+    )
+
+    self.assertEqual(quant_unit.node_name, 'Conv2D')
+    self.assertRegex(quant_unit.func_name, r'^__inference_conv_\d+')
+
 
 @test_util.run_all_in_graph_and_eager_modes
 class CalibrationOptionsTest(quantize_model_test_base.QuantizedModelTest):
diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/tests/add_dump_tensor_op.mlir b/tensorflow/compiler/mlir/quantization/tensorflow/tests/add_dump_tensor_op.mlir
index bddfbf19d470ec..4ee9b746fff9f6 100644
--- a/tensorflow/compiler/mlir/quantization/tensorflow/tests/add_dump_tensor_op.mlir
+++ b/tensorflow/compiler/mlir/quantization/tensorflow/tests/add_dump_tensor_op.mlir
@@ -5,8 +5,8 @@ module {
   func.func @conv(%arg0: tensor<1x2x2x3xf32>) -> (tensor<*xf32>, tensor<*xf32>) {
     %cst = "tf.Const"() {value = dense<[[[[1.600000e-01, 1.000000e-01], [5.100000e-01, 5.400000e-01], [-5.000000e-01, 4.100000e-01]], [[-3.500000e-01, 5.000000e-02], [-0.00999999977, 1.600000e-01], [-4.800000e-01, -2.400000e-01]]], [[[-3.500000e-01, -2.100000e-01], [-1.400000e-01, -2.000000e-02], [4.800000e-01, 3.500000e-01]], [[-1.900000e-01, 3.200000e-01], [0.00999999977, -7.000000e-02], [2.000000e-01, -4.000000e-02]]]]> : tensor<2x2x3x2xf32>} : () -> tensor<2x2x3x2xf32>
     %cst_0 = "tf.Const"() {value = dense<[-2.000000e+00, 3.000000e+00]> : tensor<2xf32>} : () -> tensor<2xf32>
-    %0 = "tf.PartitionedCall"(%arg0, %cst, %cst_0) {config = "", config_proto = "", executor_type = "", f = @composite_conv2d_with_bias_and_relu6_fn_2} : (tensor<1x2x2x3xf32>, tensor<2x2x3x2xf32>, tensor<2xf32>) -> tensor<*xf32>
-    %1 = "tf.PartitionedCall"(%arg0, %cst, %cst_0) {_tfl_quant_trait = "fully_quantizable", config = "", config_proto = "", executor_type = "", f = @composite_conv2d_with_bias_and_relu6_fn_1} : (tensor<1x2x2x3xf32>, tensor<2x2x3x2xf32>, tensor<2xf32>) -> tensor<*xf32>
+    %0 = "tf.PartitionedCall"(%arg0, %cst, %cst_0) {config = "", config_proto = "", executor_type = "", f = @composite_conv2d_with_bias_and_relu6_fn_2} : (tensor<1x2x2x3xf32>, tensor<2x2x3x2xf32>, tensor<2xf32>) -> tensor<*xf32> loc(callsite("test@conv"("Conv2D") at "QuantizationUnit(\12\06Conv2D\1a\04conv)"))
+    %1 = "tf.PartitionedCall"(%arg0, %cst, %cst_0) {_tfl_quant_trait = "fully_quantizable", config = "", config_proto = "", executor_type = "", f = @composite_conv2d_with_bias_and_relu6_fn_1} : (tensor<1x2x2x3xf32>, tensor<2x2x3x2xf32>, tensor<2xf32>) -> tensor<*xf32> loc(callsite("test@conv"("Conv2D_1") at "QuantizationUnit(\12\08Conv2D_1\1a\04conv)"))
     func.return %0, %1 : tensor<*xf32>, tensor<*xf32>
   }
   func.func private @composite_conv2d_with_bias_and_relu6_fn_2(%arg0: tensor<1x2x2x3xf32>, %arg1: tensor<2x2x3x2xf32>, %arg2: tensor<2xf32>) -> tensor<*xf32> attributes {tf_quant.composite_function} {
@@ -27,7 +27,7 @@ module {
 // WholeModel-DAG: %[[b:.*]] = "tf.Const"() {value = dense<[-2.000000e+00, 3.000000e+00
 // WholeModel-DAG: %[[output0:.*]] = "tf.PartitionedCall"(%arg0, %[[w]], %[[b]]) {config = "", config_proto = "", executor_type = "", f = @composite_conv2d_with_bias_and_relu6_fn_2}
 // WholeModel-DAG: %[[output1:.*]] = "tf.PartitionedCall"(%arg0, %[[w]], %[[b]]) {_tfl_quant_trait = "fully_quantizable", config = "", config_proto = "", executor_type = "", f = @composite_conv2d_with_bias_and_relu6_fn_1}
-// WholeModel-DAG: "tf.DumpTensor"(%[[output1]]) {enabled = false, file_name = "unquantized_tensor_data.pb", log_dir_path = "/tmp/dumps/composite_conv2d_with_bias_and_relu6_fn_1"} : (tensor<*xf32>) -> ()
+// WholeModel-DAG: "tf.DumpTensor"(%[[output1]]) {enabled = false, file_name = "unquantized_tensor_data.pb", func_name = "conv", log_dir_path = "/tmp/dumps/composite_conv2d_with_bias_and_relu6_fn_1", node_name = "Conv2D_1"} : (tensor<*xf32>) -> ()
 // WholeModel-DAG: return %[[output0]], %[[output1]]
 
 // PerLayer-LABEL: func @conv
@@ -36,8 +36,8 @@ module {
 // PerLayer-DAG: %[[output0:.*]] = "tf.PartitionedCall"(%arg0, %[[w]], %[[b]]) {config = "", config_proto = "", executor_type = "", f = @composite_conv2d_with_bias_and_relu6_fn_2}
 // PerLayer-DAG: %[[output1_quantized:.*]] = "tf.PartitionedCall"(%arg0, %[[w]], %[[b]]) {_tfl_quant_trait = "fully_quantizable", config = "", config_proto = "", executor_type = "", f = @composite_conv2d_with_bias_and_relu6_fn_1}
 // PerLayer-DAG: %[[output1_unquantized:.*]] = "tf.PartitionedCall"(%arg0, %cst, %cst_0) {config = "", config_proto = "", executor_type = "", f = @composite_conv2d_with_bias_and_relu6_fn_1_0}
-// PerLayer-DAG: "tf.DumpTensor"(%[[output1_quantized]]) {enabled = false, file_name = "quantized_tensor_data.pb", log_dir_path = "/tmp/dumps/composite_conv2d_with_bias_and_relu6_fn_1"} : (tensor<*xf32>) -> ()
-// PerLayer-DAG: "tf.DumpTensor"(%[[output1_unquantized]]) {enabled = false, file_name = "unquantized_tensor_data.pb", log_dir_path = "/tmp/dumps/composite_conv2d_with_bias_and_relu6_fn_1"} : (tensor<*xf32>) -> ()
+// PerLayer-DAG: "tf.DumpTensor"(%[[output1_quantized]]) {enabled = false, file_name = "quantized_tensor_data.pb", func_name = "conv", log_dir_path = "/tmp/dumps/composite_conv2d_with_bias_and_relu6_fn_1", node_name = "Conv2D_1"} : (tensor<*xf32>) -> ()
+// PerLayer-DAG: "tf.DumpTensor"(%[[output1_unquantized]]) {enabled = false, file_name = "unquantized_tensor_data.pb", func_name = "conv", log_dir_path = "/tmp/dumps/composite_conv2d_with_bias_and_relu6_fn_1", node_name = "Conv2D_1"} : (tensor<*xf32>) -> ()
 // PerLayer-DAG: return %[[output0]], %[[output1_quantized]]
 }
 
@@ -49,8 +49,8 @@ module {
     %cst_0 = "tf.Const"() {value = dense<1.000000e+00> : tensor<2xf32>} : () -> tensor<2xf32>
     %cst_1 = "tf.Const"() {value = dense<[[[[0.193340182, 0.285152316], [0.41538316, -0.313452125]], [[0.188379049, 0.0693640113], [-0.199678659, -0.0629909635]]], [[[0.141592324, 0.554834187], [-0.224576354, 0.103607118]], [[0.134974658, -2.952230e-02], [-0.15929231, -0.538676262]]]]> : tensor<2x2x2x2xf32>} : () -> tensor<2x2x2x2xf32>
     %cst_2 = "tf.Const"() {value = dense<[[[[-0.174680978, -0.367524445], [-0.0481151938, -0.154707015]], [[-0.0463985205, 0.457213104], [-0.0713823438, 0.0317451358]]], [[[-0.335502505, 0.00602310896], [0.307939529, 0.49636358]], [[-0.223585874, -0.194682062], [0.0728010535, 0.43586427]]]]> : tensor<2x2x2x2xf32>} : () -> tensor<2x2x2x2xf32>
-    %0 = "tf.PartitionedCall"(%arg, %cst_1, %cst) {_tfl_quant_trait = "fully_quantizable", config = "", config_proto = "", executor_type = "", f = @composite_conv2d_with_bias_and_relu6_fn_2} : (tensor<?x2x2x2xf32>, tensor<2x2x2x2xf32>, tensor<2xf32>) -> tensor<?x2x2x2xf32>
-    %1 = "tf.PartitionedCall"(%0, %cst_2, %cst_0) {_tfl_quant_trait = "fully_quantizable", config = "", config_proto = "", executor_type = "", f = @composite_conv2d_with_bias_and_relu6_fn_1} : (tensor<?x2x2x2xf32>, tensor<2x2x2x2xf32>, tensor<2xf32>) -> tensor<?x2x2x2xf32>
+    %0 = "tf.PartitionedCall"(%arg, %cst_1, %cst) {_tfl_quant_trait = "fully_quantizable", config = "", config_proto = "", executor_type = "", f = @composite_conv2d_with_bias_and_relu6_fn_2} : (tensor<?x2x2x2xf32>, tensor<2x2x2x2xf32>, tensor<2xf32>) -> tensor<?x2x2x2xf32> loc(callsite("test@multiple_conv2d"("Conv2D") at "QuantizationUnit(\12\06Conv2D\1a\0fmultiple_conv2d)"))
+    %1 = "tf.PartitionedCall"(%0, %cst_2, %cst_0) {_tfl_quant_trait = "fully_quantizable", config = "", config_proto = "", executor_type = "", f = @composite_conv2d_with_bias_and_relu6_fn_1} : (tensor<?x2x2x2xf32>, tensor<2x2x2x2xf32>, tensor<2xf32>) -> tensor<?x2x2x2xf32> loc(callsite("test@multiple_conv2d"("Conv2D_1") at "QuantizationUnit(\12\08Conv2D_1\1a\0fmultiple_conv2d)"))
     return %1 : tensor<?x2x2x2xf32>
   }
 
@@ -74,9 +74,9 @@ module {
 // WholeModel-DAG: %[[w0:.*]] = "tf.Const"() {value = dense<{{\[\[\[\[}}0.193340182, 0.285152316
 // WholeModel-DAG: %[[w1:.*]] = "tf.Const"() {value = dense<{{\[\[\[\[}}-0.174680978, -0.367524445
 // WholeModel-DAG: %[[output0:.*]] = "tf.PartitionedCall"(%arg0, %[[w0]], %[[b0]]) {_tfl_quant_trait = "fully_quantizable", config = "", config_proto = "", executor_type = "", f = @composite_conv2d_with_bias_and_relu6_fn_2}
-// WholeModel-DAG: "tf.DumpTensor"(%[[output0]]) {enabled = false, file_name = "unquantized_tensor_data.pb", log_dir_path = "/tmp/dumps/composite_conv2d_with_bias_and_relu6_fn_2"}
+// WholeModel-DAG: "tf.DumpTensor"(%[[output0]]) {enabled = false, file_name = "unquantized_tensor_data.pb", func_name = "multiple_conv2d", log_dir_path = "/tmp/dumps/composite_conv2d_with_bias_and_relu6_fn_2", node_name = "Conv2D"}
 // WholeModel-DAG: %[[output1:.*]] = "tf.PartitionedCall"(%[[output0]], %[[w1]], %[[b1]]) {_tfl_quant_trait = "fully_quantizable", config = "", config_proto = "", executor_type = "", f = @composite_conv2d_with_bias_and_relu6_fn_1}
-// WholeModel-DAG: "tf.DumpTensor"(%[[output1]]) {enabled = false, file_name = "unquantized_tensor_data.pb", log_dir_path = "/tmp/dumps/composite_conv2d_with_bias_and_relu6_fn_1"}
+// WholeModel-DAG: "tf.DumpTensor"(%[[output1]]) {enabled = false, file_name = "unquantized_tensor_data.pb", func_name = "multiple_conv2d", log_dir_path = "/tmp/dumps/composite_conv2d_with_bias_and_relu6_fn_1", node_name = "Conv2D_1"}
 // WholeModel-DAG: return %[[output1]]
 
 // PerLayer-LABEL: func @multiple_conv2d
@@ -86,11 +86,11 @@ module {
 // PerLayer-DAG: %[[w1:.*]] = "tf.Const"() {value = dense<{{\[\[\[\[}}-0.174680978, -0.367524445
 // PerLayer-DAG: %[[output0_quantized:.*]] = "tf.PartitionedCall"(%arg0, %[[w0]], %[[b0]]) {_tfl_quant_trait = "fully_quantizable", config = "", config_proto = "", executor_type = "", f = @composite_conv2d_with_bias_and_relu6_fn_2}
 // PerLayer-DAG: %[[output0_unquantized:.*]] = "tf.PartitionedCall"(%arg0, %[[w0]], %[[b0]]) {config = "", config_proto = "", executor_type = "", f = @composite_conv2d_with_bias_and_relu6_fn_2_0}
-// PerLayer-DAG: "tf.DumpTensor"(%[[output0_quantized]]) {enabled = false, file_name = "quantized_tensor_data.pb", log_dir_path = "/tmp/dumps/composite_conv2d_with_bias_and_relu6_fn_2"}
-// PerLayer-DAG: "tf.DumpTensor"(%[[output0_unquantized]]) {enabled = false, file_name = "unquantized_tensor_data.pb", log_dir_path = "/tmp/dumps/composite_conv2d_with_bias_and_relu6_fn_2"}
+// PerLayer-DAG: "tf.DumpTensor"(%[[output0_quantized]]) {enabled = false, file_name = "quantized_tensor_data.pb", func_name = "multiple_conv2d", log_dir_path = "/tmp/dumps/composite_conv2d_with_bias_and_relu6_fn_2", node_name = "Conv2D"}
+// PerLayer-DAG: "tf.DumpTensor"(%[[output0_unquantized]]) {enabled = false, file_name = "unquantized_tensor_data.pb", func_name = "multiple_conv2d", log_dir_path = "/tmp/dumps/composite_conv2d_with_bias_and_relu6_fn_2", node_name = "Conv2D"}
 // PerLayer-DAG: %[[output1_quantized:.*]] = "tf.PartitionedCall"(%[[output0_quantized]], %[[w1]], %[[b1]]) {_tfl_quant_trait = "fully_quantizable", config = "", config_proto = "", executor_type = "", f = @composite_conv2d_with_bias_and_relu6_fn_1}
 // PerLayer-DAG: %[[output1_unquantized:.*]] = "tf.PartitionedCall"(%[[output0_quantized]], %[[w1]], %[[b1]]) {config = "", config_proto = "", executor_type = "", f = @composite_conv2d_with_bias_and_relu6_fn_1_0}
-// PerLayer-DAG: "tf.DumpTensor"(%[[output1_quantized]]) {enabled = false, file_name = "quantized_tensor_data.pb", log_dir_path = "/tmp/dumps/composite_conv2d_with_bias_and_relu6_fn_1"}
-// PerLayer-DAG: "tf.DumpTensor"(%[[output1_unquantized]]) {enabled = false, file_name = "unquantized_tensor_data.pb", log_dir_path = "/tmp/dumps/composite_conv2d_with_bias_and_relu6_fn_1"}
+// PerLayer-DAG: "tf.DumpTensor"(%[[output1_quantized]]) {enabled = false, file_name = "quantized_tensor_data.pb", func_name = "multiple_conv2d", log_dir_path = "/tmp/dumps/composite_conv2d_with_bias_and_relu6_fn_1", node_name = "Conv2D_1"}
+// PerLayer-DAG: "tf.DumpTensor"(%[[output1_unquantized]]) {enabled = false, file_name = "unquantized_tensor_data.pb", func_name = "multiple_conv2d", log_dir_path = "/tmp/dumps/composite_conv2d_with_bias_and_relu6_fn_1", node_name = "Conv2D_1"}
 // PerLayer-DAG: return %[[output1_quantized]]
 }
diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/tests/quantize_composite_functions.mlir b/tensorflow/compiler/mlir/quantization/tensorflow/tests/quantize_composite_functions.mlir
index 0a35859628a7a1..eab1244fb62b3d 100644
--- a/tensorflow/compiler/mlir/quantization/tensorflow/tests/quantize_composite_functions.mlir
+++ b/tensorflow/compiler/mlir/quantization/tensorflow/tests/quantize_composite_functions.mlir
@@ -211,7 +211,7 @@ module {
     %0 = "quantfork.stats"(%arg0) {layerStats = dense<[1.878980e-02, 0.988373816]> : tensor<2xf32>} : (tensor<*xf32>) -> tensor<*xf32>
     %1 = "tf.PartitionedCall"(%0, %cst_0, %cst) {_tfl_quant_trait = "fully_quantizable", config = "", config_proto = "", device = "", executor_type = "", f = @composite_conv2d_with_bias_and_relu6_fn_20} : (tensor<*xf32>, tensor<2x2x2x2xf32>, tensor<2xf32>) -> tensor<*xf32>
     %2 = "quantfork.stats"(%1) {layerStats = dense<[0.000000e+00, 0.36084348]> : tensor<2xf32>} : (tensor<*xf32>) -> tensor<*xf32>
-    "tf.DumpTensor"(%2) {device = "", enabled = true, file_name = "quantized_tensor_data.pb", log_dir_path = "/tmp/dumps/composite_conv2d_with_bias_and_relu6_fn_2"} : (tensor<*xf32>) -> ()
+    "tf.DumpTensor"(%2) {device = "", enabled = true, file_name = "quantized_tensor_data.pb", func_name = "conv_with_dump", log_dir_path = "/tmp/dumps/composite_conv2d_with_bias_and_relu6_fn_2", node_name = "Conv2D"} : (tensor<*xf32>) -> ()
     %3 = "tf.PartitionedCall"(%2, %cst_2, %cst_1) {_tfl_quant_trait = "fully_quantizable", config = "", config_proto = "", device = "", executor_type = "", f = @composite_conv2d_with_bias_and_relu6_fn_10} : (tensor<*xf32>, tensor<2x2x2x2xf32>, tensor<2xf32>) -> tensor<*xf32>
     %4 = "quantfork.stats"(%3) {layerStats = dense<[0.000000e+00, 0.18486841]> : tensor<2xf32>} : (tensor<*xf32>) -> tensor<*xf32>
     %5 = "tf.Identity"(%4) {device = ""} : (tensor<*xf32>) -> tensor<*xf32>
@@ -220,9 +220,9 @@ module {
     %8 = "quantfork.stats"(%7) {layerStats = dense<[0.000000e+00, 0.18486841]> : tensor<2xf32>} : (tensor<*xf32>) -> tensor<*xf32>
     %9 = "tf.PartitionedCall"(%0, %cst_0, %cst) {config = "", config_proto = "", device = "", executor_type = "", f = @composite_conv2d_with_bias_and_relu6_fn_2_00} : (tensor<*xf32>, tensor<2x2x2x2xf32>, tensor<2xf32>) -> tensor<*xf32>
     %10 = "quantfork.stats"(%9) {layerStats = dense<[0.000000e+00, 0.36084348]> : tensor<2xf32>} : (tensor<*xf32>) -> tensor<*xf32>
-    "tf.DumpTensor"(%10) {device = "", enabled = true, file_name = "unquantized_tensor_data.pb", log_dir_path = "/tmp/dumps/composite_conv2d_with_bias_and_relu6_fn_2"} : (tensor<*xf32>) -> ()
-    "tf.DumpTensor"(%4) {device = "", enabled = true, file_name = "quantized_tensor_data.pb", log_dir_path = "/tmp/dumps/composite_conv2d_with_bias_and_relu6_fn_1"} : (tensor<*xf32>) -> ()
-    "tf.DumpTensor"(%8) {device = "", enabled = true, file_name = "unquantized_tensor_data.pb", log_dir_path = "/tmp/dumps/composite_conv2d_with_bias_and_relu6_fn_1"} : (tensor<*xf32>) -> ()
+    "tf.DumpTensor"(%10) {device = "", enabled = true, file_name = "unquantized_tensor_data.pb", func_name = "conv_with_dump", log_dir_path = "/tmp/dumps/composite_conv2d_with_bias_and_relu6_fn_2", node_name = "Conv2D"} : (tensor<*xf32>) -> ()
+    "tf.DumpTensor"(%4) {device = "", enabled = true, file_name = "quantized_tensor_data.pb", func_name = "conv_with_dump", log_dir_path = "/tmp/dumps/composite_conv2d_with_bias_and_relu6_fn_1", node_name = "Conv2D_1"} : (tensor<*xf32>) -> ()
+    "tf.DumpTensor"(%8) {device = "", enabled = true, file_name = "unquantized_tensor_data.pb", func_name = "conv_with_dump", log_dir_path = "/tmp/dumps/composite_conv2d_with_bias_and_relu6_fn_1", node_name = "Conv2D_1"} : (tensor<*xf32>) -> ()
     func.return %6 : tensor<*xf32>
   }
 
@@ -278,9 +278,9 @@ module {
 // CHECK-DAG: %[[identity:.*]] = "tf.Identity"(%[[conv1_dequantized_1]])
 // CHECK-DAG: %[[conv0_float:.*]] = "tf.PartitionedCall"(%arg0, %[[w0_float]], %[[b0_float]]) {config = "", config_proto = "", device = "", executor_type = "", f = @composite_conv2d_with_bias_and_relu6_fn_2_00}
 // CHECK-DAG: %[[conv1_float:.*]] = "tf.PartitionedCall"(%[[conv0_dequantized]], %[[w1_float]], %[[b1_float]]) {config = "", config_proto = "", device = "", executor_type = "", f = @composite_conv2d_with_bias_and_relu6_fn_1_00}
-// CHECK-DAG: "tf.DumpTensor"(%[[conv0_dequantized]]) {device = "", enabled = true, file_name = "quantized_tensor_data.pb", log_dir_path = "/tmp/dumps/composite_conv2d_with_bias_and_relu6_fn_2"}
-// CHECK-DAG: "tf.DumpTensor"(%[[conv0_float]]) {device = "", enabled = true, file_name = "unquantized_tensor_data.pb", log_dir_path = "/tmp/dumps/composite_conv2d_with_bias_and_relu6_fn_2"}
-// CHECK-DAG: "tf.DumpTensor"(%[[conv1_dequantized_0]]) {device = "", enabled = true, file_name = "quantized_tensor_data.pb", log_dir_path = "/tmp/dumps/composite_conv2d_with_bias_and_relu6_fn_1"}
-// CHECK-DAG: "tf.DumpTensor"(%[[conv1_float]]) {device = "", enabled = true, file_name = "unquantized_tensor_data.pb", log_dir_path = "/tmp/dumps/composite_conv2d_with_bias_and_relu6_fn_1"}
+// CHECK-DAG: "tf.DumpTensor"(%[[conv0_dequantized]]) {device = "", enabled = true, file_name = "quantized_tensor_data.pb", func_name = "conv_with_dump", log_dir_path = "/tmp/dumps/composite_conv2d_with_bias_and_relu6_fn_2", node_name = "Conv2D"}
+// CHECK-DAG: "tf.DumpTensor"(%[[conv0_float]]) {device = "", enabled = true, file_name = "unquantized_tensor_data.pb", func_name = "conv_with_dump", log_dir_path = "/tmp/dumps/composite_conv2d_with_bias_and_relu6_fn_2", node_name = "Conv2D"}
+// CHECK-DAG: "tf.DumpTensor"(%[[conv1_dequantized_0]]) {device = "", enabled = true, file_name = "quantized_tensor_data.pb", func_name = "conv_with_dump", log_dir_path = "/tmp/dumps/composite_conv2d_with_bias_and_relu6_fn_1", node_name = "Conv2D_1"}
+// CHECK-DAG: "tf.DumpTensor"(%[[conv1_float]]) {device = "", enabled = true, file_name = "unquantized_tensor_data.pb", func_name = "conv_with_dump", log_dir_path = "/tmp/dumps/composite_conv2d_with_bias_and_relu6_fn_1", node_name = "Conv2D_1"}
 // CHECK-DAG: return %[[identity]]
 }
diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/tests/quantize_composite_functions_xla.mlir b/tensorflow/compiler/mlir/quantization/tensorflow/tests/quantize_composite_functions_xla.mlir
index 2a6ea22ddcc1d6..38f0662e097063 100644
--- a/tensorflow/compiler/mlir/quantization/tensorflow/tests/quantize_composite_functions_xla.mlir
+++ b/tensorflow/compiler/mlir/quantization/tensorflow/tests/quantize_composite_functions_xla.mlir
@@ -257,7 +257,7 @@ module {
     %0 = "quantfork.stats"(%arg0) {layerStats = dense<[1.878980e-02, 0.988373816]> : tensor<2xf32>} : (tensor<*xf32>) -> tensor<*xf32>
     %1 = "tf.PartitionedCall"(%0, %cst_0, %cst) {_tfl_quant_trait = "fully_quantizable", config = "", config_proto = "", device = "", executor_type = "", f = @composite_conv2d_with_bias_and_relu6_fn_20} : (tensor<*xf32>, tensor<2x2x2x2xf32>, tensor<2xf32>) -> tensor<*xf32>
     %2 = "quantfork.stats"(%1) {layerStats = dense<[0.000000e+00, 0.36084348]> : tensor<2xf32>} : (tensor<*xf32>) -> tensor<*xf32>
-    "tf.DumpTensor"(%2) {device = "", enabled = true, file_name = "quantized_tensor_data.pb", log_dir_path = "/tmp/dumps/composite_conv2d_with_bias_and_relu6_fn_2"} : (tensor<*xf32>) -> ()
+    "tf.DumpTensor"(%2) {device = "", enabled = true, file_name = "quantized_tensor_data.pb", func_name = "conv_with_dump", log_dir_path = "/tmp/dumps/composite_conv2d_with_bias_and_relu6_fn_2", node_name = "Conv2D"} : (tensor<*xf32>) -> ()
     %3 = "tf.PartitionedCall"(%2, %cst_2, %cst_1) {_tfl_quant_trait = "fully_quantizable", config = "", config_proto = "", device = "", executor_type = "", f = @composite_conv2d_with_bias_and_relu6_fn_10} : (tensor<*xf32>, tensor<2x2x2x2xf32>, tensor<2xf32>) -> tensor<*xf32>
     %4 = "quantfork.stats"(%3) {layerStats = dense<[0.000000e+00, 0.18486841]> : tensor<2xf32>} : (tensor<*xf32>) -> tensor<*xf32>
     %5 = "tf.Identity"(%4) {device = ""} : (tensor<*xf32>) -> tensor<*xf32>
@@ -266,9 +266,9 @@ module {
     %8 = "quantfork.stats"(%7) {layerStats = dense<[0.000000e+00, 0.18486841]> : tensor<2xf32>} : (tensor<*xf32>) -> tensor<*xf32>
     %9 = "tf.PartitionedCall"(%0, %cst_0, %cst) {config = "", config_proto = "", device = "", executor_type = "", f = @composite_conv2d_with_bias_and_relu6_fn_2_00} : (tensor<*xf32>, tensor<2x2x2x2xf32>, tensor<2xf32>) -> tensor<*xf32>
     %10 = "quantfork.stats"(%9) {layerStats = dense<[0.000000e+00, 0.36084348]> : tensor<2xf32>} : (tensor<*xf32>) -> tensor<*xf32>
-    "tf.DumpTensor"(%10) {device = "", enabled = true, file_name = "unquantized_tensor_data.pb", log_dir_path = "/tmp/dumps/composite_conv2d_with_bias_and_relu6_fn_2"} : (tensor<*xf32>) -> ()
-    "tf.DumpTensor"(%4) {device = "", enabled = true, file_name = "quantized_tensor_data.pb", log_dir_path = "/tmp/dumps/composite_conv2d_with_bias_and_relu6_fn_1"} : (tensor<*xf32>) -> ()
-    "tf.DumpTensor"(%8) {device = "", enabled = true, file_name = "unquantized_tensor_data.pb", log_dir_path = "/tmp/dumps/composite_conv2d_with_bias_and_relu6_fn_1"} : (tensor<*xf32>) -> ()
+    "tf.DumpTensor"(%10) {device = "", enabled = true, file_name = "unquantized_tensor_data.pb", func_name = "conv_with_dump", log_dir_path = "/tmp/dumps/composite_conv2d_with_bias_and_relu6_fn_2", node_name = "Conv2D"} : (tensor<*xf32>) -> ()
+    "tf.DumpTensor"(%4) {device = "", enabled = true, file_name = "quantized_tensor_data.pb", func_name = "conv_with_dump", log_dir_path = "/tmp/dumps/composite_conv2d_with_bias_and_relu6_fn_1", node_name = "Conv2D_1"} : (tensor<*xf32>) -> ()
+    "tf.DumpTensor"(%8) {device = "", enabled = true, file_name = "unquantized_tensor_data.pb", func_name = "conv_with_dump", log_dir_path = "/tmp/dumps/composite_conv2d_with_bias_and_relu6_fn_1", node_name = "Conv2D_1"} : (tensor<*xf32>) -> ()
     func.return %6 : tensor<*xf32>
   }
 
@@ -322,9 +322,9 @@ module {
 // CHECK-DAG: %[[identity:.*]] = "tf.Identity"(%[[conv1_dequantized]])
 // CHECK-DAG: %[[conv0_float:.*]] = "tf.PartitionedCall"(%arg0, %[[w0_float]], %[[b0_float]]) {config = "", config_proto = "", device = "", executor_type = "", f = @composite_conv2d_with_bias_and_relu6_fn_2_00}
 // CHECK-DAG: %[[conv1_float:.*]] = "tf.PartitionedCall"(%[[conv0_dequantized]], %[[w1_float]], %[[b1_float]]) {config = "", config_proto = "", device = "", executor_type = "", f = @composite_conv2d_with_bias_and_relu6_fn_1_00}
-// CHECK-DAG: "tf.DumpTensor"(%[[conv0_dequantized]]) {device = "", enabled = true, file_name = "quantized_tensor_data.pb", log_dir_path = "/tmp/dumps/composite_conv2d_with_bias_and_relu6_fn_2"}
-// CHECK-DAG: "tf.DumpTensor"(%[[conv0_float]]) {device = "", enabled = true, file_name = "unquantized_tensor_data.pb", log_dir_path = "/tmp/dumps/composite_conv2d_with_bias_and_relu6_fn_2"}
-// CHECK-DAG: "tf.DumpTensor"(%[[conv1_dequantized]]) {device = "", enabled = true, file_name = "quantized_tensor_data.pb", log_dir_path = "/tmp/dumps/composite_conv2d_with_bias_and_relu6_fn_1"}
-// CHECK-DAG: "tf.DumpTensor"(%[[conv1_float]]) {device = "", enabled = true, file_name = "unquantized_tensor_data.pb", log_dir_path = "/tmp/dumps/composite_conv2d_with_bias_and_relu6_fn_1"}
+// CHECK-DAG: "tf.DumpTensor"(%[[conv0_dequantized]]) {device = "", enabled = true, file_name = "quantized_tensor_data.pb", func_name = "conv_with_dump", log_dir_path = "/tmp/dumps/composite_conv2d_with_bias_and_relu6_fn_2", node_name = "Conv2D"}
+// CHECK-DAG: "tf.DumpTensor"(%[[conv0_float]]) {device = "", enabled = true, file_name = "unquantized_tensor_data.pb", func_name = "conv_with_dump", log_dir_path = "/tmp/dumps/composite_conv2d_with_bias_and_relu6_fn_2", node_name = "Conv2D"}
+// CHECK-DAG: "tf.DumpTensor"(%[[conv1_dequantized]]) {device = "", enabled = true, file_name = "quantized_tensor_data.pb", func_name = "conv_with_dump", log_dir_path = "/tmp/dumps/composite_conv2d_with_bias_and_relu6_fn_1", node_name = "Conv2D_1"}
+// CHECK-DAG: "tf.DumpTensor"(%[[conv1_float]]) {device = "", enabled = true, file_name = "unquantized_tensor_data.pb", func_name = "conv_with_dump", log_dir_path = "/tmp/dumps/composite_conv2d_with_bias_and_relu6_fn_1", node_name = "Conv2D_1"}
 // CHECK-DAG: return %[[identity]]
 }

From b3a190545353825b548456b63a24130fd08ec0f5 Mon Sep 17 00:00:00 2001
From: Christian Sigg <csigg@google.com>
Date: Wed, 4 Oct 2023 01:01:42 -0700
Subject: [PATCH 565/567] NFC: replace uses of
 TF_DISALLOW_COPY_AND_ASSIGN/SE_DISALLOW_COPY_AND_ASSIGN macros with
 implementation.

PiperOrigin-RevId: 570614942
---
 .../xla/xla/backends/interpreter/platform.h   |  3 +-
 .../profiler/cpu/metadata_collector.cc        |  3 +-
 .../backends/profiler/cpu/python_tracer.cc    |  3 +-
 .../backends/profiler/gpu/cupti_collector.cc  |  3 +-
 .../backends/profiler/gpu/cupti_collector.h   |  6 ++-
 .../profiler/gpu/cupti_error_manager.h        |  3 +-
 .../backends/profiler/gpu/cupti_interface.h   |  3 +-
 .../xla/backends/profiler/gpu/cupti_tracer.cc |  8 ++-
 .../xla/backends/profiler/gpu/cupti_wrapper.h |  3 +-
 .../xla/backends/profiler/gpu/nvtx_utils.h    |  3 +-
 .../xla/xla/service/gpu/ir_emitter_unnested.h |  1 -
 third_party/xla/xla/stream_executor/blas.h    |  3 +-
 .../xla/xla/stream_executor/command_buffer.h  |  3 +-
 .../xla/xla/stream_executor/cuda/cuda_blas.h  |  3 +-
 .../xla/xla/stream_executor/cuda/cuda_dnn.cc  | 25 ++++++---
 .../xla/xla/stream_executor/cuda/cuda_dnn.h   |  3 +-
 .../xla/xla/stream_executor/cuda/cuda_fft.h   |  3 +-
 .../xla/stream_executor/cuda/cuda_platform.h  |  3 +-
 .../xla/stream_executor/device_description.h  |  3 +-
 .../stream_executor/device_host_allocator.h   |  3 +-
 .../stream_executor/device_mem_allocator.h    |  3 +-
 .../stream_executor/device_memory_allocator.h |  3 +-
 third_party/xla/xla/stream_executor/dnn.h     |  3 +-
 third_party/xla/xla/stream_executor/event.h   |  3 +-
 .../xla/xla/stream_executor/executor_cache.h  |  3 +-
 third_party/xla/xla/stream_executor/fft.h     |  3 +-
 .../xla/stream_executor/gpu/gpu_activation.h  |  3 +-
 .../gpu/gpu_cudamallocasync_allocator.h       |  3 +-
 .../xla/stream_executor/gpu/gpu_diagnostics.h |  3 +-
 .../xla/stream_executor/gpu/gpu_executor.h    |  3 +-
 .../xla/xla/stream_executor/gpu/gpu_timer.h   |  3 +-
 .../xla/stream_executor/host/host_platform.h  |  3 +-
 third_party/xla/xla/stream_executor/kernel.h  |  6 ++-
 .../xla/xla/stream_executor/kernel_spec.h     | 18 ++++---
 .../xla/xla/stream_executor/platform.h        |  3 +-
 .../xla/xla/stream_executor/platform/port.h   |  2 +
 .../xla/xla/stream_executor/plugin_registry.h |  3 +-
 .../xla/xla/stream_executor/rocm/rocm_blas.h  |  3 +-
 .../xla/xla/stream_executor/rocm/rocm_dnn.cc  | 54 +++++++++++++------
 .../xla/xla/stream_executor/rocm/rocm_dnn.h   |  3 +-
 .../xla/xla/stream_executor/rocm/rocm_fft.h   |  3 +-
 .../xla/stream_executor/rocm/rocm_platform.h  |  3 +-
 .../xla/stream_executor/scratch_allocator.h   |  6 ++-
 third_party/xla/xla/stream_executor/stream.h  |  3 +-
 .../stream_executor_internal.h                | 15 ++++--
 .../stream_executor/stream_executor_pimpl.h   |  6 ++-
 .../temporary_memory_manager.h                |  3 +-
 .../stream_executor/tpu/tpu_node_context.h    |  3 +-
 .../stream_executor/tpu/tpu_op_executable.h   |  3 +-
 49 files changed, 177 insertions(+), 84 deletions(-)

diff --git a/third_party/xla/xla/backends/interpreter/platform.h b/third_party/xla/xla/backends/interpreter/platform.h
index dc96be16a5ea19..5bf5533ebce91a 100644
--- a/third_party/xla/xla/backends/interpreter/platform.h
+++ b/third_party/xla/xla/backends/interpreter/platform.h
@@ -60,7 +60,8 @@ class XlaInterpreterPlatform : public Platform {
   // Cache of created StreamExecutors.
   ExecutorCache executor_cache_;
 
-  SE_DISALLOW_COPY_AND_ASSIGN(XlaInterpreterPlatform);
+  XlaInterpreterPlatform(const XlaInterpreterPlatform&) = delete;
+  void operator=(const XlaInterpreterPlatform&) = delete;
 };
 
 }  // namespace interpreter
diff --git a/third_party/xla/xla/backends/profiler/cpu/metadata_collector.cc b/third_party/xla/xla/backends/profiler/cpu/metadata_collector.cc
index 27f25b25559a8a..19bc06d288fc19 100644
--- a/third_party/xla/xla/backends/profiler/cpu/metadata_collector.cc
+++ b/third_party/xla/xla/backends/profiler/cpu/metadata_collector.cc
@@ -77,7 +77,8 @@ class MetadataCollector : public tsl::profiler::ProfilerInterface {
   std::vector<std::unique_ptr<xla::HloProto>> debug_info_;
   bool trace_active_ = false;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(MetadataCollector);
+  MetadataCollector(const MetadataCollector&) = delete;
+  void operator=(const MetadataCollector&) = delete;
 };
 
 std::unique_ptr<tsl::profiler::ProfilerInterface> CreatMetadataCollector(
diff --git a/third_party/xla/xla/backends/profiler/cpu/python_tracer.cc b/third_party/xla/xla/backends/profiler/cpu/python_tracer.cc
index 303244339f1969..d45a221990f4d6 100644
--- a/third_party/xla/xla/backends/profiler/cpu/python_tracer.cc
+++ b/third_party/xla/xla/backends/profiler/cpu/python_tracer.cc
@@ -47,7 +47,8 @@ class PythonTracer : public tsl::profiler::ProfilerInterface {
   const PythonHooksOptions options_;
   std::unique_ptr<PythonHookContext> context_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(PythonTracer);
+  PythonTracer(const PythonTracer&) = delete;
+  void operator=(const PythonTracer&) = delete;
 };
 
 PythonTracer::~PythonTracer() { Stop().IgnoreError(); }  // NOLINT
diff --git a/third_party/xla/xla/backends/profiler/gpu/cupti_collector.cc b/third_party/xla/xla/backends/profiler/gpu/cupti_collector.cc
index 3d885d29663bdc..2e97339714a07d 100644
--- a/third_party/xla/xla/backends/profiler/gpu/cupti_collector.cc
+++ b/third_party/xla/xla/backends/profiler/gpu/cupti_collector.cc
@@ -606,7 +606,8 @@ class CuptiTraceCollectorImpl : public CuptiTraceCollector {
 
   absl::FixedArray<PerDeviceCollector> per_device_collector_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(CuptiTraceCollectorImpl);
+  CuptiTraceCollectorImpl(const CuptiTraceCollectorImpl&) = delete;
+  void operator=(const CuptiTraceCollectorImpl&) = delete;
 };
 
 std::unique_ptr<CuptiTraceCollector> CreateCuptiCollector(
diff --git a/third_party/xla/xla/backends/profiler/gpu/cupti_collector.h b/third_party/xla/xla/backends/profiler/gpu/cupti_collector.h
index 2a6af02581d0a2..8af5927269de52 100644
--- a/third_party/xla/xla/backends/profiler/gpu/cupti_collector.h
+++ b/third_party/xla/xla/backends/profiler/gpu/cupti_collector.h
@@ -251,7 +251,8 @@ class AnnotationMap {
   const tsl::uint64 max_size_;
   absl::FixedArray<PerDeviceAnnotationMap> per_device_map_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(AnnotationMap);
+  AnnotationMap(const AnnotationMap&) = delete;
+  void operator=(const AnnotationMap&) = delete;
 };
 
 class CuptiTraceCollector {
@@ -282,7 +283,8 @@ class CuptiTraceCollector {
  private:
   AnnotationMap annotation_map_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(CuptiTraceCollector);
+  CuptiTraceCollector(const CuptiTraceCollector&) = delete;
+  void operator=(const CuptiTraceCollector&) = delete;
 };
 
 std::unique_ptr<CuptiTraceCollector> CreateCuptiCollector(
diff --git a/third_party/xla/xla/backends/profiler/gpu/cupti_error_manager.h b/third_party/xla/xla/backends/profiler/gpu/cupti_error_manager.h
index 6d037ffc61ebf8..9ee29433b6a7db 100644
--- a/third_party/xla/xla/backends/profiler/gpu/cupti_error_manager.h
+++ b/third_party/xla/xla/backends/profiler/gpu/cupti_error_manager.h
@@ -268,7 +268,8 @@ class CuptiErrorManager : public xla::profiler::CuptiInterface {
   // Prevent recursive undo if an UndoFunction fails.
   bool undo_disabled_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(CuptiErrorManager);
+  CuptiErrorManager(const CuptiErrorManager&) = delete;
+  void operator=(const CuptiErrorManager&) = delete;
 };
 
 }  // namespace profiler
diff --git a/third_party/xla/xla/backends/profiler/gpu/cupti_interface.h b/third_party/xla/xla/backends/profiler/gpu/cupti_interface.h
index ad941ff2602f7d..631780e1792de7 100644
--- a/third_party/xla/xla/backends/profiler/gpu/cupti_interface.h
+++ b/third_party/xla/xla/backends/profiler/gpu/cupti_interface.h
@@ -193,7 +193,8 @@ class CuptiInterface {
   virtual bool Disabled() const = 0;
 
  private:
-  TF_DISALLOW_COPY_AND_ASSIGN(CuptiInterface);
+  CuptiInterface(const CuptiInterface&) = delete;
+  void operator=(const CuptiInterface&) = delete;
 };
 
 CuptiInterface* GetCuptiInterface();
diff --git a/third_party/xla/xla/backends/profiler/gpu/cupti_tracer.cc b/third_party/xla/xla/backends/profiler/gpu/cupti_tracer.cc
index d5573f116473ab..5c93d7872f633d 100644
--- a/third_party/xla/xla/backends/profiler/gpu/cupti_tracer.cc
+++ b/third_party/xla/xla/backends/profiler/gpu/cupti_tracer.cc
@@ -1077,7 +1077,9 @@ class CuptiDriverApiHookWithActivityApi : public CuptiDriverApiHook {
   absl::Mutex mutex_;
   absl::flat_hash_set<CUcontext> contexts_ TF_GUARDED_BY(mutex_);
 
-  TF_DISALLOW_COPY_AND_ASSIGN(CuptiDriverApiHookWithActivityApi);
+  CuptiDriverApiHookWithActivityApi(const CuptiDriverApiHookWithActivityApi &) =
+      delete;
+  void operator=(const CuptiDriverApiHookWithActivityApi &) = delete;
 };
 
 struct KernelRecord {
@@ -1699,7 +1701,9 @@ class CuptiDriverApiHookWithCudaEvent : public CuptiDriverApiHook {
   CuptiTraceCollector *collector_;
   absl::node_hash_set<CuptiApiCallbackContext *> callback_contexts_;
   std::vector<std::unique_ptr<CudaEventRecorder>> cuda_event_recorders_;
-  TF_DISALLOW_COPY_AND_ASSIGN(CuptiDriverApiHookWithCudaEvent);
+  CuptiDriverApiHookWithCudaEvent(const CuptiDriverApiHookWithCudaEvent &) =
+      delete;
+  void operator=(const CuptiDriverApiHookWithCudaEvent &) = delete;
 };
 
 /*static*/ std::string ErrorWithHostname(absl::string_view error_message) {
diff --git a/third_party/xla/xla/backends/profiler/gpu/cupti_wrapper.h b/third_party/xla/xla/backends/profiler/gpu/cupti_wrapper.h
index 206f98fb529a50..658148106d272f 100644
--- a/third_party/xla/xla/backends/profiler/gpu/cupti_wrapper.h
+++ b/third_party/xla/xla/backends/profiler/gpu/cupti_wrapper.h
@@ -176,7 +176,8 @@ class CuptiWrapper : public xla::profiler::CuptiInterface {
   bool Disabled() const override { return false; }
 
  private:
-  TF_DISALLOW_COPY_AND_ASSIGN(CuptiWrapper);
+  CuptiWrapper(const CuptiWrapper&) = delete;
+  void operator=(const CuptiWrapper&) = delete;
 };
 
 }  // namespace profiler
diff --git a/third_party/xla/xla/backends/profiler/gpu/nvtx_utils.h b/third_party/xla/xla/backends/profiler/gpu/nvtx_utils.h
index 897e6768b2866e..58be1980e2a293 100644
--- a/third_party/xla/xla/backends/profiler/gpu/nvtx_utils.h
+++ b/third_party/xla/xla/backends/profiler/gpu/nvtx_utils.h
@@ -49,7 +49,8 @@ class NVTXRangeTracker {
  private:
   static std::stack<std::string>& GetRangeStack();
 
-  TF_DISALLOW_COPY_AND_ASSIGN(NVTXRangeTracker);
+  NVTXRangeTracker(const NVTXRangeTracker&) = delete;
+  void operator=(const NVTXRangeTracker&) = delete;
 };
 
 }  // namespace profiler
diff --git a/third_party/xla/xla/service/gpu/ir_emitter_unnested.h b/third_party/xla/xla/service/gpu/ir_emitter_unnested.h
index deb24c986a5fcb..0a2830aae4c9d5 100644
--- a/third_party/xla/xla/service/gpu/ir_emitter_unnested.h
+++ b/third_party/xla/xla/service/gpu/ir_emitter_unnested.h
@@ -152,7 +152,6 @@ class IrEmitterUnnested : public IrEmitter {
       mlir::Operation* op,
       const absl::flat_hash_map<const mlir::Operation*, const HloInstruction*>&
           hlo_for_lmhlo);
-  Status EmitReduce(mlir::Operation* op);
   Status EmitSelectAndScatter(
       mlir::Operation* op,
       const absl::flat_hash_map<const mlir::Operation*, const HloInstruction*>&
diff --git a/third_party/xla/xla/stream_executor/blas.h b/third_party/xla/xla/stream_executor/blas.h
index 4c0ff8a74bdc89..a85ea3b460d6ed 100644
--- a/third_party/xla/xla/stream_executor/blas.h
+++ b/third_party/xla/xla/stream_executor/blas.h
@@ -490,7 +490,8 @@ class BlasSupport {
   BlasSupport() {}
 
  private:
-  SE_DISALLOW_COPY_AND_ASSIGN(BlasSupport);
+  BlasSupport(const BlasSupport &) = delete;
+  void operator=(const BlasSupport &) = delete;
 };
 
 // Macro used to quickly declare overrides for abstract virtuals in the
diff --git a/third_party/xla/xla/stream_executor/command_buffer.h b/third_party/xla/xla/stream_executor/command_buffer.h
index fe27156cae4178..8d544ea4214a74 100644
--- a/third_party/xla/xla/stream_executor/command_buffer.h
+++ b/third_party/xla/xla/stream_executor/command_buffer.h
@@ -120,7 +120,8 @@ class CommandBuffer {
 
   std::unique_ptr<internal::CommandBufferInterface> implementation_;
 
-  SE_DISALLOW_COPY_AND_ASSIGN(CommandBuffer);
+  CommandBuffer(const CommandBuffer&) = delete;
+  void operator=(const CommandBuffer&) = delete;
 };
 
 //===----------------------------------------------------------------------===//
diff --git a/third_party/xla/xla/stream_executor/cuda/cuda_blas.h b/third_party/xla/xla/stream_executor/cuda/cuda_blas.h
index 8869b284ce487a..a23fb616181e42 100644
--- a/third_party/xla/xla/stream_executor/cuda/cuda_blas.h
+++ b/third_party/xla/xla/stream_executor/cuda/cuda_blas.h
@@ -121,7 +121,8 @@ class CUDABlas : public blas::BlasSupport {
 
   BlasLt blas_lt_;
 
-  SE_DISALLOW_COPY_AND_ASSIGN(CUDABlas);
+  CUDABlas(const CUDABlas &) = delete;
+  void operator=(const CUDABlas &) = delete;
 };
 
 }  // namespace cuda
diff --git a/third_party/xla/xla/stream_executor/cuda/cuda_dnn.cc b/third_party/xla/xla/stream_executor/cuda/cuda_dnn.cc
index c96ee57438496d..060307a297597a 100644
--- a/third_party/xla/xla/stream_executor/cuda/cuda_dnn.cc
+++ b/third_party/xla/xla/stream_executor/cuda/cuda_dnn.cc
@@ -1019,7 +1019,8 @@ class CudnnPoolingDescriptor {
  private:
   PoolingDescriptor handle_;  // Owned.
 
-  SE_DISALLOW_COPY_AND_ASSIGN(CudnnPoolingDescriptor);
+  CudnnPoolingDescriptor(const CudnnPoolingDescriptor&) = delete;
+  void operator=(const CudnnPoolingDescriptor&) = delete;
 };
 
 // Turns a NormalizeDescriptor structure into a cudnn LRN descriptor handle.
@@ -1057,7 +1058,8 @@ class CudnnNormalizeDescriptor {
  private:
   LrnDescriptor handle_;  // Owned.
 
-  SE_DISALLOW_COPY_AND_ASSIGN(CudnnNormalizeDescriptor);
+  CudnnNormalizeDescriptor(const CudnnNormalizeDescriptor&) = delete;
+  void operator=(const CudnnNormalizeDescriptor&) = delete;
 };
 
 // Turns a ActivationDescriptor structure into a cudnn activation
@@ -1254,7 +1256,8 @@ class CudnnDropoutDescriptor {
 
  private:
   DropoutDescriptor handle_;  // Owned.
-  SE_DISALLOW_COPY_AND_ASSIGN(CudnnDropoutDescriptor);
+  CudnnDropoutDescriptor(const CudnnDropoutDescriptor&) = delete;
+  void operator=(const CudnnDropoutDescriptor&) = delete;
 };
 
 class CudnnRnnParamsDescriptor {
@@ -1286,7 +1289,8 @@ class CudnnRnnParamsDescriptor {
   int64_t params_size_in_bytes_;
   ParamsRegions weights_;
   ParamsRegions biases_;
-  SE_DISALLOW_COPY_AND_ASSIGN(CudnnRnnParamsDescriptor);
+  CudnnRnnParamsDescriptor(const CudnnRnnParamsDescriptor&) = delete;
+  void operator=(const CudnnRnnParamsDescriptor&) = delete;
 };
 
 }  // namespace
@@ -1488,7 +1492,8 @@ class CudnnRnnDescriptor : public dnn::RnnDescriptor {
   dnn::AlgorithmConfig algorithm_config_;
   CudnnDropoutDescriptor dropout_desc_;
   CudnnRnnParamsDescriptor params_desc_;
-  SE_DISALLOW_COPY_AND_ASSIGN(CudnnRnnDescriptor);
+  CudnnRnnDescriptor(const CudnnRnnDescriptor&) = delete;
+  void operator=(const CudnnRnnDescriptor&) = delete;
 };
 
 #if CUDNN_VERSION >= 7603
@@ -1508,7 +1513,8 @@ class CudnnCtcLossDescriptor {
  private:
   CtcLossDescriptor handle_;  // Owned
 
-  SE_DISALLOW_COPY_AND_ASSIGN(CudnnCtcLossDescriptor);
+  CudnnCtcLossDescriptor(const CudnnCtcLossDescriptor&) = delete;
+  void operator=(const CudnnCtcLossDescriptor&) = delete;
 };
 #else
 // dummy class
@@ -1783,7 +1789,9 @@ class CudnnRnnSequenceTensorDescriptor
   TensorDescriptor handle_;
   RNNDataDescriptor rnn_data_handle_;
   std::vector<cudnnTensorDescriptor_t> handles_;  // Copies of handle_.
-  SE_DISALLOW_COPY_AND_ASSIGN(CudnnRnnSequenceTensorDescriptor);
+  CudnnRnnSequenceTensorDescriptor(const CudnnRnnSequenceTensorDescriptor&) =
+      delete;
+  void operator=(const CudnnRnnSequenceTensorDescriptor&) = delete;
 };
 
 class CudnnRnnStateTensorDescriptor : public dnn::RnnStateTensorDescriptor {
@@ -1814,7 +1822,8 @@ class CudnnRnnStateTensorDescriptor : public dnn::RnnStateTensorDescriptor {
   int num_layers_;
   int batch_size_;
   int data_size_;
-  SE_DISALLOW_COPY_AND_ASSIGN(CudnnRnnStateTensorDescriptor);
+  CudnnRnnStateTensorDescriptor(const CudnnRnnStateTensorDescriptor&) = delete;
+  void operator=(const CudnnRnnStateTensorDescriptor&) = delete;
 };
 
 namespace {
diff --git a/third_party/xla/xla/stream_executor/cuda/cuda_dnn.h b/third_party/xla/xla/stream_executor/cuda/cuda_dnn.h
index 65eb462dd60f66..1157093b53bedc 100644
--- a/third_party/xla/xla/stream_executor/cuda/cuda_dnn.h
+++ b/third_party/xla/xla/stream_executor/cuda/cuda_dnn.h
@@ -767,7 +767,8 @@ class CudnnSupport : public dnn::DnnSupport {
       ScratchAllocator* scratch_allocator,
       DeviceMemory<uint8_t>* scratch_memory, int* ctc_loss_algo_id) override;
 
-  SE_DISALLOW_COPY_AND_ASSIGN(CudnnSupport);
+  CudnnSupport(const CudnnSupport&) = delete;
+  void operator=(const CudnnSupport&) = delete;
 };
 
 }  // namespace gpu
diff --git a/third_party/xla/xla/stream_executor/cuda/cuda_fft.h b/third_party/xla/xla/stream_executor/cuda/cuda_fft.h
index 0794fdd9aeb67d..e5ce88163e4bbc 100644
--- a/third_party/xla/xla/stream_executor/cuda/cuda_fft.h
+++ b/third_party/xla/xla/stream_executor/cuda/cuda_fft.h
@@ -131,7 +131,8 @@ class CUDAFft : public fft::FftSupport {
                      const DeviceMemory<InputT> &input,
                      DeviceMemory<OutputT> *output);
 
-  SE_DISALLOW_COPY_AND_ASSIGN(CUDAFft);
+  CUDAFft(const CUDAFft&) = delete;
+  void operator=(const CUDAFft&) = delete;
 };
 
 }  // namespace gpu
diff --git a/third_party/xla/xla/stream_executor/cuda/cuda_platform.h b/third_party/xla/xla/stream_executor/cuda/cuda_platform.h
index fdf21fa9fa227e..8b26b8c76cefba 100644
--- a/third_party/xla/xla/stream_executor/cuda/cuda_platform.h
+++ b/third_party/xla/xla/stream_executor/cuda/cuda_platform.h
@@ -94,7 +94,8 @@ class CudaPlatform : public Platform {
   // manager.
   int limit_numa_node_;
 
-  SE_DISALLOW_COPY_AND_ASSIGN(CudaPlatform);
+  CudaPlatform(const CudaPlatform&) = delete;
+  void operator=(const CudaPlatform&) = delete;
 };
 
 }  // namespace gpu
diff --git a/third_party/xla/xla/stream_executor/device_description.h b/third_party/xla/xla/stream_executor/device_description.h
index 0be3f4f0d8330e..84653c070ec56f 100644
--- a/third_party/xla/xla/stream_executor/device_description.h
+++ b/third_party/xla/xla/stream_executor/device_description.h
@@ -524,7 +524,8 @@ class DeviceDescriptionBuilder {
  private:
   DeviceDescription device_description_;
 
-  SE_DISALLOW_COPY_AND_ASSIGN(DeviceDescriptionBuilder);
+  DeviceDescriptionBuilder(const DeviceDescriptionBuilder &) = delete;
+  void operator=(const DeviceDescriptionBuilder &) = delete;
 };
 
 }  // namespace internal
diff --git a/third_party/xla/xla/stream_executor/device_host_allocator.h b/third_party/xla/xla/stream_executor/device_host_allocator.h
index 8ee4f89a79db43..b724f2e1d2292d 100644
--- a/third_party/xla/xla/stream_executor/device_host_allocator.h
+++ b/third_party/xla/xla/stream_executor/device_host_allocator.h
@@ -77,7 +77,8 @@ class DeviceHostAllocator : public tsl::SubAllocator {
   StreamExecutor* stream_exec_;  // not owned, non-null
   const int numa_node_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(DeviceHostAllocator);
+  DeviceHostAllocator(const DeviceHostAllocator&) = delete;
+  void operator=(const DeviceHostAllocator&) = delete;
 };
 
 }  // namespace stream_executor
diff --git a/third_party/xla/xla/stream_executor/device_mem_allocator.h b/third_party/xla/xla/stream_executor/device_mem_allocator.h
index 5a946dc876bbfd..fe02b44f5fd5bd 100644
--- a/third_party/xla/xla/stream_executor/device_mem_allocator.h
+++ b/third_party/xla/xla/stream_executor/device_mem_allocator.h
@@ -87,7 +87,8 @@ class DeviceMemAllocator : public tsl::SubAllocator {
   const tsl::PlatformDeviceId device_id_;
   const bool use_unified_memory_ = false;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(DeviceMemAllocator);
+  DeviceMemAllocator(const DeviceMemAllocator&) = delete;
+  void operator=(const DeviceMemAllocator&) = delete;
 };
 
 }  // namespace stream_executor
diff --git a/third_party/xla/xla/stream_executor/device_memory_allocator.h b/third_party/xla/xla/stream_executor/device_memory_allocator.h
index 7b3ee6eb162f6f..500428b0ed24b5 100644
--- a/third_party/xla/xla/stream_executor/device_memory_allocator.h
+++ b/third_party/xla/xla/stream_executor/device_memory_allocator.h
@@ -148,7 +148,8 @@ class ScopedDeviceMemory {
   int device_ordinal_;                // Negative one for inactive object.
   DeviceMemoryAllocator *allocator_;  // Null if this object is inactive.
 
-  SE_DISALLOW_COPY_AND_ASSIGN(ScopedDeviceMemory);
+  ScopedDeviceMemory(const ScopedDeviceMemory &) = delete;
+  void operator=(const ScopedDeviceMemory &) = delete;
 };
 
 // Type alias for compatibility with the previous managed memory implementation.
diff --git a/third_party/xla/xla/stream_executor/dnn.h b/third_party/xla/xla/stream_executor/dnn.h
index d56f3677f102aa..58e204c3f2fe09 100644
--- a/third_party/xla/xla/stream_executor/dnn.h
+++ b/third_party/xla/xla/stream_executor/dnn.h
@@ -2589,7 +2589,8 @@ class DnnSupport {
     return ::tsl::OkStatus();
   }
 
-  SE_DISALLOW_COPY_AND_ASSIGN(DnnSupport);
+  DnnSupport(const DnnSupport&) = delete;
+  void operator=(const DnnSupport&) = delete;
 };
 
 }  // namespace dnn
diff --git a/third_party/xla/xla/stream_executor/event.h b/third_party/xla/xla/stream_executor/event.h
index b2b2121c9c8123..7b2b706ad9622a 100644
--- a/third_party/xla/xla/stream_executor/event.h
+++ b/third_party/xla/xla/stream_executor/event.h
@@ -80,7 +80,8 @@ class Event {
   // the object. Owned.
   std::unique_ptr<internal::EventInterface> implementation_;
 
-  SE_DISALLOW_COPY_AND_ASSIGN(Event);
+  Event(const Event&) = delete;
+  void operator=(const Event&) = delete;
 };
 
 }  // namespace stream_executor
diff --git a/third_party/xla/xla/stream_executor/executor_cache.h b/third_party/xla/xla/stream_executor/executor_cache.h
index d845b8eabb5ce3..6e9668afdbf39d 100644
--- a/third_party/xla/xla/stream_executor/executor_cache.h
+++ b/third_party/xla/xla/stream_executor/executor_cache.h
@@ -81,7 +81,8 @@ class ExecutorCache {
   absl::Mutex mutex_;
   absl::node_hash_map<int, Entry> cache_ ABSL_GUARDED_BY(mutex_);
 
-  SE_DISALLOW_COPY_AND_ASSIGN(ExecutorCache);
+  ExecutorCache(const ExecutorCache&) = delete;
+  void operator=(const ExecutorCache&) = delete;
 };
 
 }  // namespace stream_executor
diff --git a/third_party/xla/xla/stream_executor/fft.h b/third_party/xla/xla/stream_executor/fft.h
index 683357ea63991e..88a383b84cb1e7 100644
--- a/third_party/xla/xla/stream_executor/fft.h
+++ b/third_party/xla/xla/stream_executor/fft.h
@@ -204,7 +204,8 @@ class FftSupport {
   FftSupport() {}
 
  private:
-  SE_DISALLOW_COPY_AND_ASSIGN(FftSupport);
+  FftSupport(const FftSupport &) = delete;
+  void operator=(const FftSupport &) = delete;
 };
 
 // Macro used to quickly declare overrides for abstract virtuals in the
diff --git a/third_party/xla/xla/stream_executor/gpu/gpu_activation.h b/third_party/xla/xla/stream_executor/gpu/gpu_activation.h
index 34456b729b59c2..385644d7ac1975 100644
--- a/third_party/xla/xla/stream_executor/gpu/gpu_activation.h
+++ b/third_party/xla/xla/stream_executor/gpu/gpu_activation.h
@@ -50,7 +50,8 @@ class ScopedActivateExecutorContext {
   // The cuda.h-using datatype that we wrap.
   ScopedActivateContext* driver_scoped_activate_context_;
 
-  SE_DISALLOW_COPY_AND_ASSIGN(ScopedActivateExecutorContext);
+  ScopedActivateExecutorContext(const ScopedActivateExecutorContext&) = delete;
+  void operator=(const ScopedActivateExecutorContext&) = delete;
 };
 
 }  // namespace gpu
diff --git a/third_party/xla/xla/stream_executor/gpu/gpu_cudamallocasync_allocator.h b/third_party/xla/xla/stream_executor/gpu/gpu_cudamallocasync_allocator.h
index 6ff645e56eb6b3..c428132d74aa93 100644
--- a/third_party/xla/xla/stream_executor/gpu/gpu_cudamallocasync_allocator.h
+++ b/third_party/xla/xla/stream_executor/gpu/gpu_cudamallocasync_allocator.h
@@ -127,7 +127,8 @@ class GpuCudaMallocAsyncAllocator : public tsl::Allocator {
 
   bool reserve_memory_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(GpuCudaMallocAsyncAllocator);
+  GpuCudaMallocAsyncAllocator(const GpuCudaMallocAsyncAllocator&) = delete;
+  void operator=(const GpuCudaMallocAsyncAllocator&) = delete;
 
   // Stats.
   // Structures mutable after construction
diff --git a/third_party/xla/xla/stream_executor/gpu/gpu_diagnostics.h b/third_party/xla/xla/stream_executor/gpu/gpu_diagnostics.h
index 704f780b540402..7892a3bcf4f508 100644
--- a/third_party/xla/xla/stream_executor/gpu/gpu_diagnostics.h
+++ b/third_party/xla/xla/stream_executor/gpu/gpu_diagnostics.h
@@ -89,7 +89,8 @@ class Diagnostician {
 
   static std::string GetDevNodePath(int dev_node_ordinal);
 
-  SE_DISALLOW_COPY_AND_ASSIGN(Diagnostician);
+  Diagnostician(const Diagnostician&) = delete;
+  void operator=(const Diagnostician&) = delete;
 };
 
 }  // namespace gpu
diff --git a/third_party/xla/xla/stream_executor/gpu/gpu_executor.h b/third_party/xla/xla/stream_executor/gpu/gpu_executor.h
index 6eedbfcc307089..c93073ffd9dafa 100644
--- a/third_party/xla/xla/stream_executor/gpu/gpu_executor.h
+++ b/third_party/xla/xla/stream_executor/gpu/gpu_executor.h
@@ -387,7 +387,8 @@ class GpuExecutor : public internal::StreamExecutorInterface {
   absl::flat_hash_map<void*, Stream*> alive_gpu_streams_
       ABSL_GUARDED_BY(alive_gpu_streams_mu_);
 
-  SE_DISALLOW_COPY_AND_ASSIGN(GpuExecutor);
+  GpuExecutor(const GpuExecutor&) = delete;
+  void operator=(const GpuExecutor&) = delete;
 };
 
 inline GpuExecutor* ExtractGpuExecutor(StreamExecutor* stream_exec) {
diff --git a/third_party/xla/xla/stream_executor/gpu/gpu_timer.h b/third_party/xla/xla/stream_executor/gpu/gpu_timer.h
index 91ef02a7c23016..23663cb1d978f1 100644
--- a/third_party/xla/xla/stream_executor/gpu/gpu_timer.h
+++ b/third_party/xla/xla/stream_executor/gpu/gpu_timer.h
@@ -66,7 +66,8 @@ class GpuTimer {
   GpuStream* stream_;
   bool is_stopped_ = false;
 
-  SE_DISALLOW_COPY_AND_ASSIGN(GpuTimer);
+  GpuTimer(const GpuTimer&) = delete;
+  void operator=(const GpuTimer&) = delete;
 };
 
 }  // namespace gpu
diff --git a/third_party/xla/xla/stream_executor/host/host_platform.h b/third_party/xla/xla/stream_executor/host/host_platform.h
index e08af9feab0c23..4461669b95bbd5 100644
--- a/third_party/xla/xla/stream_executor/host/host_platform.h
+++ b/third_party/xla/xla/stream_executor/host/host_platform.h
@@ -67,7 +67,8 @@ class HostPlatform : public Platform {
   // Cache of created StreamExecutors.
   ExecutorCache executor_cache_;
 
-  SE_DISALLOW_COPY_AND_ASSIGN(HostPlatform);
+  HostPlatform(const HostPlatform&) = delete;
+  void operator=(const HostPlatform&) = delete;
 };
 
 }  // namespace host
diff --git a/third_party/xla/xla/stream_executor/kernel.h b/third_party/xla/xla/stream_executor/kernel.h
index 7ad6994d3cf225..4a9f7fbbc95c66 100644
--- a/third_party/xla/xla/stream_executor/kernel.h
+++ b/third_party/xla/xla/stream_executor/kernel.h
@@ -196,7 +196,8 @@ class KernelBase {
 
   KernelMetadata metadata_;
 
-  SE_DISALLOW_COPY_AND_ASSIGN(KernelBase);
+  KernelBase(const KernelBase &) = delete;
+  void operator=(const KernelBase &) = delete;
 };
 
 // Whether T is a DeviceMemory-family pointer.
@@ -610,7 +611,8 @@ class TypedKernel : public KernelBase {
     args->add_shared_bytes(arg.size());
   }
 
-  SE_DISALLOW_COPY_AND_ASSIGN(TypedKernel);
+  TypedKernel(const TypedKernel &) = delete;
+  void operator=(const TypedKernel &) = delete;
 };
 
 // Template metaprogramming helper type that helps us produce better error
diff --git a/third_party/xla/xla/stream_executor/kernel_spec.h b/third_party/xla/xla/stream_executor/kernel_spec.h
index 4d28b5b6c72dca..68f45a0ee07f4d 100644
--- a/third_party/xla/xla/stream_executor/kernel_spec.h
+++ b/third_party/xla/xla/stream_executor/kernel_spec.h
@@ -84,7 +84,8 @@ class KernelLoaderSpec {
   // above.
   std::string kernel_name_;
 
-  SE_DISALLOW_COPY_AND_ASSIGN(KernelLoaderSpec);
+  KernelLoaderSpec(const KernelLoaderSpec &) = delete;
+  void operator=(const KernelLoaderSpec &) = delete;
 };
 
 // An abstract kernel loader spec that has an associated file path, where
@@ -108,7 +109,8 @@ class OnDiskKernelLoaderSpec : public KernelLoaderSpec {
   std::string filename_;
 
  private:
-  SE_DISALLOW_COPY_AND_ASSIGN(OnDiskKernelLoaderSpec);
+  OnDiskKernelLoaderSpec(const OnDiskKernelLoaderSpec &) = delete;
+  void operator=(const OnDiskKernelLoaderSpec &) = delete;
 };
 
 // Kernel loader specification for PTX text that resides on disk.
@@ -120,7 +122,8 @@ class CudaPtxOnDisk : public OnDiskKernelLoaderSpec {
   const char *CanonicalSuffix() const override { return ".ptx"; }
 
  private:
-  SE_DISALLOW_COPY_AND_ASSIGN(CudaPtxOnDisk);
+  CudaPtxOnDisk(const CudaPtxOnDisk &) = delete;
+  void operator=(const CudaPtxOnDisk &) = delete;
 };
 
 // Kernel loader specification for CUBIN binary that resides on disk.
@@ -136,7 +139,8 @@ class CudaCubinOnDisk : public OnDiskKernelLoaderSpec {
  private:
   std::string filename_;
 
-  SE_DISALLOW_COPY_AND_ASSIGN(CudaCubinOnDisk);
+  CudaCubinOnDisk(const CudaCubinOnDisk &) = delete;
+  void operator=(const CudaCubinOnDisk &) = delete;
 };
 
 // Kernel loader specification for PTX text that resides in memory.
@@ -213,7 +217,8 @@ class CudaPtxInMemory : public KernelLoaderSpec {
   // compute capability specified (in the single-PTX constructor).
   static const std::tuple<int, int> kMinimumCapability;
 
-  SE_DISALLOW_COPY_AND_ASSIGN(CudaPtxInMemory);
+  CudaPtxInMemory(const CudaPtxInMemory &) = delete;
+  void operator=(const CudaPtxInMemory &) = delete;
 };
 
 // Kernel loader specification for a CUBIN blob that resides in memory.
@@ -227,7 +232,8 @@ class CudaCubinInMemory : public KernelLoaderSpec {
  private:
   const char *bytes_;
 
-  SE_DISALLOW_COPY_AND_ASSIGN(CudaCubinInMemory);
+  CudaCubinInMemory(const CudaCubinInMemory &) = delete;
+  void operator=(const CudaCubinInMemory &) = delete;
 };
 
 // Describes how to load a kernel on any subset of a number of target platforms.
diff --git a/third_party/xla/xla/stream_executor/platform.h b/third_party/xla/xla/stream_executor/platform.h
index 6ce7263162ba25..b03d046220bbbc 100644
--- a/third_party/xla/xla/stream_executor/platform.h
+++ b/third_party/xla/xla/stream_executor/platform.h
@@ -144,7 +144,8 @@ class Platform {
   Platform() = default;
 
  private:
-  SE_DISALLOW_COPY_AND_ASSIGN(Platform);
+  Platform(const Platform&) = delete;
+  void operator=(const Platform&) = delete;
 };
 
 }  // namespace stream_executor
diff --git a/third_party/xla/xla/stream_executor/platform/port.h b/third_party/xla/xla/stream_executor/platform/port.h
index 08b9163753a1e0..dfb9c1997baf3a 100644
--- a/third_party/xla/xla/stream_executor/platform/port.h
+++ b/third_party/xla/xla/stream_executor/platform/port.h
@@ -40,7 +40,9 @@ using std::string;
 
 }  // namespace stream_executor
 
+// DEPRECATED: declare the deleted copy constructor and operator= directly.
 #define SE_DISALLOW_COPY_AND_ASSIGN TF_DISALLOW_COPY_AND_ASSIGN
+
 #define SE_MUST_USE_RESULT TF_MUST_USE_RESULT
 #define SE_PREDICT_TRUE TF_PREDICT_TRUE
 #define SE_PREDICT_FALSE TF_PREDICT_FALSE
diff --git a/third_party/xla/xla/stream_executor/plugin_registry.h b/third_party/xla/xla/stream_executor/plugin_registry.h
index ed2ebd7cd4d3a6..26ddf0bd60f080 100644
--- a/third_party/xla/xla/stream_executor/plugin_registry.h
+++ b/third_party/xla/xla/stream_executor/plugin_registry.h
@@ -98,7 +98,8 @@ class PluginRegistry {
   // The set of registered factories, keyed by platform ID.
   std::map<Platform::Id, Factories> factories_;
 
-  SE_DISALLOW_COPY_AND_ASSIGN(PluginRegistry);
+  PluginRegistry(const PluginRegistry&) = delete;
+  void operator=(const PluginRegistry&) = delete;
 };
 
 // Explicit specializations are defined in plugin_registry.cc.
diff --git a/third_party/xla/xla/stream_executor/rocm/rocm_blas.h b/third_party/xla/xla/stream_executor/rocm/rocm_blas.h
index 785c5026e32f0c..a6ec9e0ad6cdd0 100644
--- a/third_party/xla/xla/stream_executor/rocm/rocm_blas.h
+++ b/third_party/xla/xla/stream_executor/rocm/rocm_blas.h
@@ -201,7 +201,8 @@ class ROCMBlas : public blas::BlasSupport {
   rocm::BlasLt blas_lt_;
 #endif
 
-  SE_DISALLOW_COPY_AND_ASSIGN(ROCMBlas);
+  ROCMBlas(const ROCMBlas &) = delete;
+  void operator=(const ROCMBlas &) = delete;
 };
 
 }  // namespace gpu
diff --git a/third_party/xla/xla/stream_executor/rocm/rocm_dnn.cc b/third_party/xla/xla/stream_executor/rocm/rocm_dnn.cc
index 329dbfbe1ee7e1..3bda5c198b8b41 100644
--- a/third_party/xla/xla/stream_executor/rocm/rocm_dnn.cc
+++ b/third_party/xla/xla/stream_executor/rocm/rocm_dnn.cc
@@ -819,7 +819,8 @@ class ScopedTensorDescriptor {
  private:
   miopenTensorDescriptor_t handle_;  // Owned.
 
-  SE_DISALLOW_COPY_AND_ASSIGN(ScopedTensorDescriptor);
+  ScopedTensorDescriptor(const ScopedTensorDescriptor&) = delete;
+  void operator=(const ScopedTensorDescriptor&) = delete;
 };
 
 // Turns a FilterDescriptor structure into a miopen filter handle within a
@@ -916,7 +917,8 @@ class ScopedFilterDescriptor {
   // miopen filter descriptor this object creates. Owned.
   miopenTensorDescriptor_t handle_;
 
-  SE_DISALLOW_COPY_AND_ASSIGN(ScopedFilterDescriptor);
+  ScopedFilterDescriptor(const ScopedFilterDescriptor&) = delete;
+  void operator=(const ScopedFilterDescriptor&) = delete;
 };
 
 // Turns a ConvolutionDescriptor structure into a miopen convolution handle
@@ -993,7 +995,8 @@ class ScopedConvolutionDescriptor {
  private:
   miopenConvolutionDescriptor_t handle_;  // Owned.
 
-  SE_DISALLOW_COPY_AND_ASSIGN(ScopedConvolutionDescriptor);
+  ScopedConvolutionDescriptor(const ScopedConvolutionDescriptor&) = delete;
+  void operator=(const ScopedConvolutionDescriptor&) = delete;
 };
 
 // Turns a PoolingDescriptor structure into a miopen pooling descriptor handle
@@ -1054,7 +1057,8 @@ class ScopedPoolingDescriptor {
  private:
   miopenPoolingDescriptor_t handle_;  // Owned.
 
-  SE_DISALLOW_COPY_AND_ASSIGN(ScopedPoolingDescriptor);
+  ScopedPoolingDescriptor(const ScopedPoolingDescriptor&) = delete;
+  void operator=(const ScopedPoolingDescriptor&) = delete;
 };
 
 // Turns a NormalizeDescriptor structure into a miopen LRN descriptor handle.
@@ -1108,7 +1112,8 @@ class ScopedNormalizeDescriptor {
  private:
   miopenLRNDescriptor_t handle_;  // Owned.
 
-  SE_DISALLOW_COPY_AND_ASSIGN(ScopedNormalizeDescriptor);
+  ScopedNormalizeDescriptor(const ScopedNormalizeDescriptor&) = delete;
+  void operator=(const ScopedNormalizeDescriptor&) = delete;
 };
 
 // Turns a activation mode into a miopen activation mode descriptor with a scope
@@ -1186,7 +1191,8 @@ class ScopedActivationDescriptor {
  private:
   miopenActivationDescriptor_t handle_;  // Owned.
 
-  SE_DISALLOW_COPY_AND_ASSIGN(ScopedActivationDescriptor);
+  ScopedActivationDescriptor(const ScopedActivationDescriptor&) = delete;
+  void operator=(const ScopedActivationDescriptor&) = delete;
 
  public:
   // caching these values here to avoid calling miopenGetActivationDescriptor
@@ -1398,7 +1404,8 @@ class ScopedFusionPlanBase {
   miopenOperatorArgs_t fusion_args_;  // Owned.
   bool fusion_plan_compiled_;
 
-  SE_DISALLOW_COPY_AND_ASSIGN(ScopedFusionPlanBase);
+  ScopedFusionPlanBase(const ScopedFusionPlanBase&) = delete;
+  void operator=(const ScopedFusionPlanBase&) = delete;
 };
 
 // class to represent the Convolution+Bias+Activation fusion plan
@@ -1510,7 +1517,9 @@ class ScopedFusionPlanConvolutionBiasActivation : public ScopedFusionPlanBase {
   const int k_bias_op_idx = 1;
   const int k_actv_op_idx = 2;
 
-  SE_DISALLOW_COPY_AND_ASSIGN(ScopedFusionPlanConvolutionBiasActivation);
+  ScopedFusionPlanConvolutionBiasActivation(
+      const ScopedFusionPlanConvolutionBiasActivation&) = delete;
+  void operator=(const ScopedFusionPlanConvolutionBiasActivation&) = delete;
 };
 
 // class to represent the BatchNorm+Activation (inference) fusion plan
@@ -1610,7 +1619,9 @@ class ScopedFusionPlanBatchNormActivationInference
   const int k_batchnorm_op_idx = 0;
   const int k_actv_op_idx = 1;
 
-  SE_DISALLOW_COPY_AND_ASSIGN(ScopedFusionPlanBatchNormActivationInference);
+  ScopedFusionPlanBatchNormActivationInference(
+      const ScopedFusionPlanBatchNormActivationInference&) = delete;
+  void operator=(const ScopedFusionPlanBatchNormActivationInference&) = delete;
 };
 
 // class to represent the BatchNorm+Activation (training-forward) fusion plan
@@ -1709,7 +1720,9 @@ class ScopedFusionPlanBatchNormActivationForward : public ScopedFusionPlanBase {
   const int k_batchnorm_op_idx = 0;
   const int k_actv_op_idx = 1;
 
-  SE_DISALLOW_COPY_AND_ASSIGN(ScopedFusionPlanBatchNormActivationForward);
+  ScopedFusionPlanBatchNormActivationForward(
+      const ScopedFusionPlanBatchNormActivationForward&) = delete;
+  void operator=(const ScopedFusionPlanBatchNormActivationForward&) = delete;
 };
 
 // class to represent the BatchNorm+Activation (training-backward) fusion plan
@@ -1810,7 +1823,9 @@ class ScopedFusionPlanBatchNormActivationBackward
   const int k_batchnorm_op_idx = 0;
   const int k_actv_op_idx = 1;
 
-  SE_DISALLOW_COPY_AND_ASSIGN(ScopedFusionPlanBatchNormActivationBackward);
+  ScopedFusionPlanBatchNormActivationBackward(
+      const ScopedFusionPlanBatchNormActivationBackward&) = delete;
+  void operator=(const ScopedFusionPlanBatchNormActivationBackward&) = delete;
 };
 
 namespace {
@@ -1929,7 +1944,8 @@ class MIOpenRnnParamsDescriptor : public MIOpenDescriptorCommon<void> {
   ParamsRegions weights_;
   ParamsRegions biases_;
   tsl::Status status_;
-  SE_DISALLOW_COPY_AND_ASSIGN(MIOpenRnnParamsDescriptor);
+  MIOpenRnnParamsDescriptor(const MIOpenRnnParamsDescriptor&) = delete;
+  void operator=(const MIOpenRnnParamsDescriptor&) = delete;
 };
 
 class MIOpenRnnDescriptor : public MIOpenDescriptorCommon<dnn::RnnDescriptor> {
@@ -2013,7 +2029,8 @@ class MIOpenRnnDescriptor : public MIOpenDescriptorCommon<dnn::RnnDescriptor> {
   // no dropout in MIOpen.
   // std::unique_ptr<miopenDropoutDescriptor> miopen_dropout_desc_;
   std::unique_ptr<MIOpenRnnParamsDescriptor> miopen_params_desc_;
-  SE_DISALLOW_COPY_AND_ASSIGN(MIOpenRnnDescriptor);
+  MIOpenRnnDescriptor(const MIOpenRnnDescriptor&) = delete;
+  void operator=(const MIOpenRnnDescriptor&) = delete;
 };
 
 // Get ID of the internal parameter tensor.
@@ -2085,7 +2102,9 @@ class MIOpenRnnSequenceTensorDescriptor
   miopenDataType_t data_type_;
   std::vector<miopenTensorDescriptor_t> handles_;
   tsl::Status status_;
-  SE_DISALLOW_COPY_AND_ASSIGN(MIOpenRnnSequenceTensorDescriptor);
+  MIOpenRnnSequenceTensorDescriptor(const MIOpenRnnSequenceTensorDescriptor&) =
+      delete;
+  void operator=(const MIOpenRnnSequenceTensorDescriptor&) = delete;
 };
 
 class MIOpenRnnStateTensorDescriptor
@@ -2129,7 +2148,9 @@ class MIOpenRnnStateTensorDescriptor
   int data_size_;
   tsl::Status status_;
   miopenDataType_t data_type_;
-  SE_DISALLOW_COPY_AND_ASSIGN(MIOpenRnnStateTensorDescriptor);
+  MIOpenRnnStateTensorDescriptor(const MIOpenRnnStateTensorDescriptor&) =
+      delete;
+  void operator=(const MIOpenRnnStateTensorDescriptor&) = delete;
 };
 
 namespace {
@@ -2558,7 +2579,8 @@ class MIOpenCTCLossDescriptor {
  private:
   miopenCTCLossDescriptor_t handle_;  // Owned
 
-  SE_DISALLOW_COPY_AND_ASSIGN(MIOpenCTCLossDescriptor);
+  MIOpenCTCLossDescriptor(const MIOpenCTCLossDescriptor&) = delete;
+  void operator=(const MIOpenCTCLossDescriptor&) = delete;
 };
 
 tsl::Status MIOpenSupport::DoPrepareForCtcLoss(
diff --git a/third_party/xla/xla/stream_executor/rocm/rocm_dnn.h b/third_party/xla/xla/stream_executor/rocm/rocm_dnn.h
index f20727e7227078..e00b65a710aa7f 100644
--- a/third_party/xla/xla/stream_executor/rocm/rocm_dnn.h
+++ b/third_party/xla/xla/stream_executor/rocm/rocm_dnn.h
@@ -692,7 +692,8 @@ class MIOpenSupport : public dnn::DnnSupport {
       ScratchAllocator* scratch_allocator,
       std::vector<dnn::ProfileResult>* out_algorithms);
 
-  SE_DISALLOW_COPY_AND_ASSIGN(MIOpenSupport);
+  MIOpenSupport(const MIOpenSupport&) = delete;
+  void operator=(const MIOpenSupport&) = delete;
 };
 
 }  // namespace gpu
diff --git a/third_party/xla/xla/stream_executor/rocm/rocm_fft.h b/third_party/xla/xla/stream_executor/rocm/rocm_fft.h
index c98303b539eb27..330237656f2f7a 100644
--- a/third_party/xla/xla/stream_executor/rocm/rocm_fft.h
+++ b/third_party/xla/xla/stream_executor/rocm/rocm_fft.h
@@ -139,7 +139,8 @@ class ROCMFft : public fft::FftSupport {
                      const DeviceMemory<InputT> &input,
                      DeviceMemory<OutputT> *output);
 
-  SE_DISALLOW_COPY_AND_ASSIGN(ROCMFft);
+  ROCMFft(const ROCMFft &) = delete;
+  void operator=(const ROCMFft &) = delete;
 };
 
 }  // namespace gpu
diff --git a/third_party/xla/xla/stream_executor/rocm/rocm_platform.h b/third_party/xla/xla/stream_executor/rocm/rocm_platform.h
index 28fd9e7914acae..bd861cceb993c2 100644
--- a/third_party/xla/xla/stream_executor/rocm/rocm_platform.h
+++ b/third_party/xla/xla/stream_executor/rocm/rocm_platform.h
@@ -95,7 +95,8 @@ class ROCmPlatform : public Platform {
   // manager.
   int limit_numa_node_;
 
-  SE_DISALLOW_COPY_AND_ASSIGN(ROCmPlatform);
+  ROCmPlatform(const ROCmPlatform&) = delete;
+  void operator=(const ROCmPlatform&) = delete;
 };
 
 }  // namespace gpu
diff --git a/third_party/xla/xla/stream_executor/scratch_allocator.h b/third_party/xla/xla/stream_executor/scratch_allocator.h
index b9b9957e0d9365..c3a321e9e5e8a8 100644
--- a/third_party/xla/xla/stream_executor/scratch_allocator.h
+++ b/third_party/xla/xla/stream_executor/scratch_allocator.h
@@ -75,7 +75,8 @@ class OneTimeScratchAllocator : public ScratchAllocator {
   std::unique_ptr<TemporaryDeviceMemory<uint8_t>> temporary_;
   Stream* stream_;
 
-  SE_DISALLOW_COPY_AND_ASSIGN(OneTimeScratchAllocator);
+  OneTimeScratchAllocator(const OneTimeScratchAllocator&) = delete;
+  void operator=(const OneTimeScratchAllocator&) = delete;
 };
 
 // Can allocate several times -- this memory is deallocated when the scratch
@@ -105,7 +106,8 @@ class OwningScratchAllocator : public ScratchAllocator {
   DeviceMemoryAllocator* allocator_;
   absl::InlinedVector<OwningDeviceMemory, N> buffers_;
 
-  SE_DISALLOW_COPY_AND_ASSIGN(OwningScratchAllocator);
+  OwningScratchAllocator(const OwningScratchAllocator&) = delete;
+  void operator=(const OwningScratchAllocator&) = delete;
 };
 
 }  // namespace stream_executor
diff --git a/third_party/xla/xla/stream_executor/stream.h b/third_party/xla/xla/stream_executor/stream.h
index 219e91d4e5b0e2..a190c36e296c41 100644
--- a/third_party/xla/xla/stream_executor/stream.h
+++ b/third_party/xla/xla/stream_executor/stream.h
@@ -1492,7 +1492,8 @@ class Stream {
     }
   }
 
-  SE_DISALLOW_COPY_AND_ASSIGN(Stream);
+  Stream(const Stream &) = delete;
+  void operator=(const Stream &) = delete;
 };
 
 ////////////
diff --git a/third_party/xla/xla/stream_executor/stream_executor_internal.h b/third_party/xla/xla/stream_executor/stream_executor_internal.h
index dceea1a64434dc..604356669ea350 100644
--- a/third_party/xla/xla/stream_executor/stream_executor_internal.h
+++ b/third_party/xla/xla/stream_executor/stream_executor_internal.h
@@ -89,7 +89,8 @@ class EventInterface {
   virtual ~EventInterface() = default;
 
  private:
-  SE_DISALLOW_COPY_AND_ASSIGN(EventInterface);
+  EventInterface(const EventInterface&) = delete;
+  void operator=(const EventInterface&) = delete;
 };
 
 //===----------------------------------------------------------------------===//
@@ -118,7 +119,8 @@ class KernelInterface {
   virtual KernelCacheConfig GetPreferredCacheConfig() const = 0;
 
  private:
-  SE_DISALLOW_COPY_AND_ASSIGN(KernelInterface);
+  KernelInterface(const KernelInterface&) = delete;
+  void operator=(const KernelInterface&) = delete;
 };
 
 //===----------------------------------------------------------------------===//
@@ -164,7 +166,8 @@ class CommandBufferInterface {
   virtual CommandBuffer::Mode mode() const = 0;
 
  private:
-  SE_DISALLOW_COPY_AND_ASSIGN(CommandBufferInterface);
+  CommandBufferInterface(const CommandBufferInterface&) = delete;
+  void operator=(const CommandBufferInterface&) = delete;
 };
 
 //===----------------------------------------------------------------------===//
@@ -206,7 +209,8 @@ class StreamInterface {
   virtual void** GpuStreamMemberHack() { return nullptr; }
 
  private:
-  SE_DISALLOW_COPY_AND_ASSIGN(StreamInterface);
+  StreamInterface(const StreamInterface&) = delete;
+  void operator=(const StreamInterface&) = delete;
 };
 
 //===----------------------------------------------------------------------===//
@@ -439,7 +443,8 @@ class StreamExecutorInterface {
   virtual Stream* FindAllocatedStream(void* /*gpu_stream*/) { return nullptr; }
 
  private:
-  SE_DISALLOW_COPY_AND_ASSIGN(StreamExecutorInterface);
+  StreamExecutorInterface(const StreamExecutorInterface&) = delete;
+  void operator=(const StreamExecutorInterface&) = delete;
 };
 
 }  // namespace internal
diff --git a/third_party/xla/xla/stream_executor/stream_executor_pimpl.h b/third_party/xla/xla/stream_executor/stream_executor_pimpl.h
index 03c8f1aaefc642..bb76cef5eaa953 100644
--- a/third_party/xla/xla/stream_executor/stream_executor_pimpl.h
+++ b/third_party/xla/xla/stream_executor/stream_executor_pimpl.h
@@ -626,7 +626,8 @@ class StreamExecutor {
 
   StreamExecutorMemoryAllocator allocator_;
 
-  SE_DISALLOW_COPY_AND_ASSIGN(StreamExecutor);
+  StreamExecutor(const StreamExecutor&) = delete;
+  void operator=(const StreamExecutor&) = delete;
 };
 
 // A wrapper around ModuleHandle that uses RAII to manage its lifetime.
@@ -661,7 +662,8 @@ class ScopedModuleHandle {
   StreamExecutor* executor_;
   ModuleHandle module_handle_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(ScopedModuleHandle);
+  ScopedModuleHandle(const ScopedModuleHandle&) = delete;
+  void operator=(const ScopedModuleHandle&) = delete;
 };
 
 ////////////
diff --git a/third_party/xla/xla/stream_executor/temporary_memory_manager.h b/third_party/xla/xla/stream_executor/temporary_memory_manager.h
index 1e9c47b4b02823..6a82fe0667235a 100644
--- a/third_party/xla/xla/stream_executor/temporary_memory_manager.h
+++ b/third_party/xla/xla/stream_executor/temporary_memory_manager.h
@@ -127,7 +127,8 @@ class TemporaryMemoryManager {
   // are performed through this stream handle.
   Stream* stream_;
 
-  SE_DISALLOW_COPY_AND_ASSIGN(TemporaryMemoryManager);
+  TemporaryMemoryManager(const TemporaryMemoryManager&) = delete;
+  void operator=(const TemporaryMemoryManager&) = delete;
 };
 
 ////////////
diff --git a/third_party/xla/xla/stream_executor/tpu/tpu_node_context.h b/third_party/xla/xla/stream_executor/tpu/tpu_node_context.h
index 347803dd9c79eb..1af41c5b9cc208 100644
--- a/third_party/xla/xla/stream_executor/tpu/tpu_node_context.h
+++ b/third_party/xla/xla/stream_executor/tpu/tpu_node_context.h
@@ -66,7 +66,8 @@ class TpuNodeContext final {
   const int device_ordinal_;
   XLA_TpuNodeContext* const node_context_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(TpuNodeContext);
+  TpuNodeContext(const TpuNodeContext&) = delete;
+  void operator=(const TpuNodeContext&) = delete;
 };
 
 }  // namespace tpu
diff --git a/third_party/xla/xla/stream_executor/tpu/tpu_op_executable.h b/third_party/xla/xla/stream_executor/tpu/tpu_op_executable.h
index 62181d0dedcfc4..346e15453f0897 100644
--- a/third_party/xla/xla/stream_executor/tpu/tpu_op_executable.h
+++ b/third_party/xla/xla/stream_executor/tpu/tpu_op_executable.h
@@ -61,7 +61,8 @@ class TpuOpExecutable : public xla::TpuExecutableInterface {
 
   SE_OutsideCompilationParams* outside_compilation_params_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(TpuOpExecutable);
+  TpuOpExecutable(const TpuOpExecutable&) = delete;
+  void operator=(const TpuOpExecutable&) = delete;
 };
 
 }  // namespace tensorflow

From 3e8034c77e8940a397867779bfd20170e0c3a5f1 Mon Sep 17 00:00:00 2001
From: Songyi Han <syhan@google.com>
Date: Wed, 4 Oct 2023 01:28:40 -0700
Subject: [PATCH 566/567] Deprecate enable_legacy_weight_only support

Legacy weight-only support was added to support the TFLite float16
weight-only representation. However, now that the weight-only
quantization representation has been consolidated, it is no longer
needed, so deprecate the support.

PiperOrigin-RevId: 570620016
---
 .../integration_test/quantize_model_test.py   | 89 +++----------------
 .../tensorflow/python/quantize_model.py       | 11 +--
 .../tensorflow/quantization_options.proto     |  2 +-
 3 files changed, 19 insertions(+), 83 deletions(-)

diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/python/integration_test/quantize_model_test.py b/tensorflow/compiler/mlir/quantization/tensorflow/python/integration_test/quantize_model_test.py
index fc2b849940b342..b20bfac1b03cdb 100644
--- a/tensorflow/compiler/mlir/quantization/tensorflow/python/integration_test/quantize_model_test.py
+++ b/tensorflow/compiler/mlir/quantization/tensorflow/python/integration_test/quantize_model_test.py
@@ -348,61 +348,6 @@ def test_drq_per_channel_for_non_uniform_opset_raises_value_error(
           self._input_saved_model_path, quantization_options=options
       )
 
-  @parameterized.named_parameters(
-      ('weight_only_per_tensor', False),
-      ('legacy_weight_only_per_tensor', True),
-  )
-  @test_util.run_in_graph_and_eager_modes
-  def test_enable_legacy_weight_only(
-      self,
-      enable_legacy_weight_only: bool,
-  ):
-    input_shape = (1, 512)
-
-    self._create_matmul_model(
-        input_shape=input_shape,
-        weight_shape=(512, 2),
-        saved_model_path=self._input_saved_model_path,
-    )
-
-    tags = {tag_constants.SERVING}
-    quantization_options = quant_opts_pb2.QuantizationOptions(
-        quantization_method=quant_opts_pb2.QuantizationMethod(
-            preset_method=_PresetMethod.METHOD_STATIC_RANGE_WEIGHT_ONLY_INT8
-        ),
-        tags=tags,
-        signature_keys=['serving_default'],
-        op_set=quant_opts_pb2.XLA,
-        enable_legacy_weight_only=enable_legacy_weight_only,
-    )
-
-    converted_model = quantize_model.quantize(
-        self._input_saved_model_path,
-        self._output_saved_model_path,
-        quantization_options,
-    )
-    self.assertIsNotNone(converted_model)
-    self.assertCountEqual(
-        converted_model.signatures._signatures.keys(), {'serving_default'}
-    )
-
-    output_loader = saved_model_loader.SavedModelLoader(
-        self._output_saved_model_path
-    )
-    output_graphdef = output_loader.get_meta_graph_def_from_tags(tags).graph_def
-    # Legacy quantization uses float32 tf.Conv2D
-    if enable_legacy_weight_only:
-      self.assertFalse(self._contains_op(output_graphdef, 'XlaDotV2'))
-    else:
-      self.assertTrue(self._contains_op(output_graphdef, 'XlaDotV2'))
-
-    # Due to other meta data, the compression is not exactly 1/4.
-    self.assertSizeRatioLessThan(
-        self._output_saved_model_path,
-        self._input_saved_model_path,
-        threshold=0.3,
-    )
-
   @test_util.run_in_graph_and_eager_modes
   def test_force_graph_mode_calibration(self):
     input_type = dtypes.int32
@@ -5292,8 +5237,6 @@ def test_einsum_model(
     )
 
   @parameterized.named_parameters(
-      # TODO(b/269421880): Enable legacy weight-only scheme with the uniform
-      # quantized opset
       ('to_xla_per_tensor', quant_opts_pb2.XLA, False),
   )
   @test_util.run_in_graph_and_eager_modes
@@ -5345,18 +5288,18 @@ def test_matmul_model(
     )
 
   @parameterized.named_parameters(
-      # TODO(b/269421880): Enable legacy weight-only scheme with the uniform
-      # quantized opset
-      ('to_xla_per_tensor', quant_opts_pb2.XLA, False, False),
-      # ('to_xla_per_channel', quant_opts_pb2.XLA, True, False),
-      ('to_xla_per_channel_legacy', quant_opts_pb2.XLA, True, True),
+      ('to_xla_per_tensor', quant_opts_pb2.XLA, False),
+      # TODO: b/289761265 - [Converter Component][TF-Quantizer] Improve
+      # Weight-only Quantization
+      # Re-enable this case once the new weight-only quantizer supports
+      # per-channel quantization.
+      # ('to_xla_per_channel', quant_opts_pb2.XLA, True),
   )
   @test_util.run_in_graph_and_eager_modes
   def test_conv_model(
       self,
       target_opset: quant_opts_pb2.OpSet,
       enable_per_channel_quantization: bool,
-      enable_legacy_weight_only: bool,
   ):
     input_shape = (1, 3, 4, 512)
     filter_shape = (2, 3, 512, 2)
@@ -5379,7 +5322,6 @@ def test_conv_model(
         signature_keys=['serving_default'],
         op_set=target_opset,
         enable_per_channel_quantization=enable_per_channel_quantization,
-        enable_legacy_weight_only=enable_legacy_weight_only,
     )
 
     converted_model = quantize_model.quantize(
@@ -5398,9 +5340,6 @@ def test_conv_model(
     )
     output_graphdef = output_loader.get_meta_graph_def_from_tags(tags).graph_def
 
-    if not enable_legacy_weight_only:
-      self.assertTrue(self._contains_op(output_graphdef, 'XlaConvV2'))
-
     # Due to other meta data, the compression is not exactly 1/4.
     self.assertSizeRatioLessThan(
         self._output_saved_model_path,
@@ -5441,18 +5380,18 @@ def test_conv_model(
     self.assertAllClose(original_output, quantized_output, atol=threshold)
 
   @parameterized.named_parameters(
-      # TODO(b/269421880): Enable legacy weight-only scheme with the uniform
-      # quantized opset
-      ('to_xla_per_tensor', quant_opts_pb2.XLA, False, False),
-      # ('to_xla_per_channel', quant_opts_pb2.XLA, True, False),
-      ('to_xla_per_channel_legacy', quant_opts_pb2.XLA, True, True),
+      ('to_xla_per_tensor', quant_opts_pb2.XLA, False),
+      # TODO: b/289761265 - [Converter Component][TF-Quantizer] Improve
+      # Weight-only Quantization
+      # Re-enable this case once the new weight-only quantizer supports
+      # per-channel quantization.
+      # ('to_xla_per_channel', quant_opts_pb2.XLA, True),
   )
   @test_util.run_in_graph_and_eager_modes
   def test_depthwise_conv2d_model(
       self,
       target_opset: quant_opts_pb2.OpSet,
       enable_per_channel_quantization: bool,
-      enable_legacy_weight_only: bool,
   ):
     input_shape = (1, 3, 4, 512)
     filter_shape = (2, 3, 512, 2)
@@ -5474,7 +5413,6 @@ def test_depthwise_conv2d_model(
         signature_keys=['serving_default'],
         op_set=target_opset,
         enable_per_channel_quantization=enable_per_channel_quantization,
-        enable_legacy_weight_only=enable_legacy_weight_only,
     )
 
     converted_model = quantize_model.quantize(
@@ -5494,9 +5432,6 @@ def test_depthwise_conv2d_model(
     output_graphdef = output_loader.get_meta_graph_def_from_tags(tags).graph_def
 
     # Due to other meta data, the compression is not exactly 1/4.
-    if not enable_legacy_weight_only:
-      self.assertTrue(self._contains_op(output_graphdef, 'XlaConvV2'))
-
     size_threshold = 0.5 if enable_per_channel_quantization else 0.32
     self.assertSizeRatioLessThan(
         self._output_saved_model_path,
diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/python/quantize_model.py b/tensorflow/compiler/mlir/quantization/tensorflow/python/quantize_model.py
index 911c35987f6333..43dbd283a67fdf 100644
--- a/tensorflow/compiler/mlir/quantization/tensorflow/python/quantize_model.py
+++ b/tensorflow/compiler/mlir/quantization/tensorflow/python/quantize_model.py
@@ -1356,6 +1356,11 @@ def _populate_quantization_options_default_values(
   if not quantization_options.HasField('freeze_all_variables'):
     quantization_options.freeze_all_variables = True
 
+  if quantization_options.enable_legacy_weight_only:
+    raise ValueError(
+        'Legacy weight-only is deprecated. Use weight-only quantization method.'
+    )
+
   # Check default quantization option values for weight-only quantization.
   # TODO(b/242805842): Find good minimum_elements_for_weights number for server.
   # please also update default value in tflite converter:
@@ -1584,10 +1589,7 @@ def quantize(
         quantization_options,
         representative_dataset,
     )
-  elif (method.preset_method == _PresetMethod.METHOD_DYNAMIC_RANGE_INT8) or (
-      method.preset_method == _PresetMethod.METHOD_STATIC_RANGE_WEIGHT_ONLY_INT8
-      and quantization_options.enable_legacy_weight_only
-  ):
+  elif method.preset_method == _PresetMethod.METHOD_DYNAMIC_RANGE_INT8:
     return _dynamic_range_quantize(
         saved_model_path,
         output_directory,
@@ -1595,7 +1597,6 @@ def quantize(
     )
   elif (
       method.preset_method == _PresetMethod.METHOD_STATIC_RANGE_WEIGHT_ONLY_INT8
-      and not quantization_options.enable_legacy_weight_only
   ):
     return _weight_only_quantize(
         saved_model_path,
diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/quantization_options.proto b/tensorflow/compiler/mlir/quantization/tensorflow/quantization_options.proto
index 0a44a6e5b858e9..01cf3401f9a37a 100644
--- a/tensorflow/compiler/mlir/quantization/tensorflow/quantization_options.proto
+++ b/tensorflow/compiler/mlir/quantization/tensorflow/quantization_options.proto
@@ -305,7 +305,7 @@ message QuantizationOptions {
   bool experimental_enable_tpu_model_support = 12;
 
   // Produces legacy weight-only graph where the qconst op(containing quantized
-  // values) is followed by a dequantization op.
+  // values) is followed by a dequantization op. This flag is deprecated.
   bool enable_legacy_weight_only = 13;
 
   // If set to true, it forces calibration in graph model instead of eager mode

From c36e989fde07601b807dfee5b96897d7fd81e5c0 Mon Sep 17 00:00:00 2001
From: Chao <cchen104@amd.com>
Date: Wed, 4 Oct 2023 01:51:00 -0700
Subject: [PATCH 567/567] PR #6054: [ROCm] fixed rocm build at hlo_to_llvm and
 miopen

Imported from GitHub PR https://github.com/openxla/xla/pull/6054

1. Fixed the ROCm build broken by https://github.com/openxla/xla/commit/6f80714eb1c14b591442e8a5a03565f02e064c99
2. Fixed the MIOpen build broken by https://github.com/openxla/xla/commit/673d0c9af734792c1373c9c7ba7c98f6ba612bd0

@cheshire Thanks in advance!
Copybara import of the project:

--
3d6851faed1313c476576d96e6f3de6b60729652 by Chao Chen <cchen104@amd.com>:

fixed rocm se

--
d7f44518255810150e82f16834be0589142cbe42 by Chao Chen <cchen104@amd.com>:

fixed miopen build

Merging this change closes #6054

PiperOrigin-RevId: 570623937
---
 .../xla/service/gpu/tests/hlo_to_llvm_ir.cc   |  2 +-
 .../xla/xla/stream_executor/rocm/rocm_dnn.h   | 26 -------------------
 2 files changed, 1 insertion(+), 27 deletions(-)

diff --git a/third_party/xla/xla/service/gpu/tests/hlo_to_llvm_ir.cc b/third_party/xla/xla/service/gpu/tests/hlo_to_llvm_ir.cc
index 7aaefd9cc020f7..d31d9c3824a5eb 100644
--- a/third_party/xla/xla/service/gpu/tests/hlo_to_llvm_ir.cc
+++ b/third_party/xla/xla/service/gpu/tests/hlo_to_llvm_ir.cc
@@ -77,7 +77,7 @@ xla::Status CompileAndPrintLlvmIr(const std::string& hlo_text,
   stream_executor::Platform::Id platform_id =
       stream_executor::cuda::kCudaPlatformId;
 #elif TENSORFLOW_USE_ROCM
-  se::DeviceDescription gpu_device_info =
+  stream_executor::DeviceDescription gpu_device_info =
       xla::gpu::TestGpuDeviceInfo::AMDMI210DeviceInfo();
   std::string target_triple = xla::gpu::amdgpu::TargetTriple();
   std::string data_layout = xla::gpu::amdgpu::DataLayout();
diff --git a/third_party/xla/xla/stream_executor/rocm/rocm_dnn.h b/third_party/xla/xla/stream_executor/rocm/rocm_dnn.h
index e00b65a710aa7f..30cf2dfd56b086 100644
--- a/third_party/xla/xla/stream_executor/rocm/rocm_dnn.h
+++ b/third_party/xla/xla/stream_executor/rocm/rocm_dnn.h
@@ -514,18 +514,6 @@ class MIOpenSupport : public dnn::DnnSupport {
                          dnn::DataType output_type, float scale,
                          DeviceMemoryBase* output_data) override;
 
-  bool DoFusedConvolutionBiasActivation(
-      Stream* stream, const dnn::BatchDescriptor& conv_input_descriptor,
-      const DeviceMemory<float>& conv_input_data,
-      const dnn::FilterDescriptor& filter_descriptor,
-      const DeviceMemory<float>& filter_data,
-      const dnn::ConvolutionDescriptor& convolution_descriptor,
-      const dnn::BatchDescriptor& bias_descriptor,
-      const DeviceMemory<float>& bias_data, dnn::ActivationMode activation_mode,
-      const dnn::BatchDescriptor& output_descriptor,
-      DeviceMemory<float>* output_data,
-      dnn::ProfileResult* output_profile_result) override;
-
   GpuExecutor* GetParentExecutor() { return parent_; }
 
   tsl::Status DoCtcLoss(Stream* stream, dnn::DataType element_type,
@@ -624,20 +612,6 @@ class MIOpenSupport : public dnn::DnnSupport {
                          DeviceMemory<uint8>* reserve_space_data,
                          ScratchAllocator* workspace_allocator);
 
-  template <typename T>
-  bool DoFusedConvolutionBiasActivationImpl(
-      Stream* stream,
-      int miopen_type,  // Actually miopenDataType_t.
-      const dnn::BatchDescriptor& conv_input_descriptor,
-      const DeviceMemory<T>& conv_input_data,
-      const dnn::FilterDescriptor& filter_descriptor,
-      const DeviceMemory<T>& filter_data,
-      const dnn::ConvolutionDescriptor& convolution_descriptor,
-      const dnn::BatchDescriptor& bias_descriptor,
-      const DeviceMemory<T>& bias_data, dnn::ActivationMode activation_mode,
-      const dnn::BatchDescriptor& output_descriptor,
-      DeviceMemory<T>* output_data, dnn::ProfileResult* output_profile_result);
-
   tsl::Status DoPrepareForConvolution(
       dnn::ConvolutionKind kind, dnn::DataType element_type, Stream* stream,
       const dnn::BatchDescriptor& input_descriptor, DeviceMemoryBase input_data,